diff --git a/.gitignore b/.gitignore index f9fd432..d37f231 100644 --- a/.gitignore +++ b/.gitignore @@ -14,5 +14,6 @@ *.db *.files */src/ +src/* */pkg/ */linux-5*/ diff --git a/linux-tkg/PKGBUILD b/PKGBUILD similarity index 99% rename from linux-tkg/PKGBUILD rename to PKGBUILD index f56fcde..7b38182 100644 --- a/linux-tkg/PKGBUILD +++ b/PKGBUILD @@ -271,7 +271,7 @@ case $_basever in 'b9ebe0ae69bc2b2091d6bfcf6c7875a87ea7969fcfa4e306c48d47a60f9ef4d6' '7058e57fd68367b029adc77f2a82928f1433daaf02c8c279cb2d13556c8804d7' 'c605f638d74c61861ebdc36ebd4cb8b6475eae2f6273e1ccb2bbb3e10a2ec3fe' - 'bc69d6e5ee8172b0242c8fa72d13cfe2b8d2b6601468836908a7dfe8b78a3bbb' + '2bbbac963b6ca44ef3f8a71ec7c5cad7d66df860869a73059087ee236775970a' '45a9ab99215ab3313be6e66e073d29154aac55bc58975a4df2dad116c918d27c' 'fca63d15ca4502aebd73e76d7499b243d2c03db71ff5ab0bf5cf268b2e576320' '19661ec0d39f9663452b34433214c755179894528bf73a42f6ba52ccf572832a' diff --git a/linux-tkg/README.md b/README.md similarity index 100% rename from linux-tkg/README.md rename to README.md diff --git a/linux-tkg/customization.cfg b/customization.cfg similarity index 100% rename from linux-tkg/customization.cfg rename to customization.cfg diff --git a/linux-tkg/install.sh b/install.sh similarity index 100% rename from linux-tkg/install.sh rename to install.sh diff --git a/linux-tkg/linux-tkg-config/5.10/90-cleanup.hook b/linux-tkg-config/5.10/90-cleanup.hook similarity index 100% rename from linux-tkg/linux-tkg-config/5.10/90-cleanup.hook rename to linux-tkg-config/5.10/90-cleanup.hook diff --git a/linux-tkg/linux-tkg-config/5.10/cleanup b/linux-tkg-config/5.10/cleanup similarity index 100% rename from linux-tkg/linux-tkg-config/5.10/cleanup rename to linux-tkg-config/5.10/cleanup diff --git a/linux-tkg/linux-tkg-config/5.10/config.x86_64 b/linux-tkg-config/5.10/config.x86_64 similarity index 100% rename from linux-tkg/linux-tkg-config/5.10/config.x86_64 rename to linux-tkg-config/5.10/config.x86_64 diff --git 
a/linux-tkg/linux-tkg-config/5.4/90-cleanup.hook b/linux-tkg-config/5.4/90-cleanup.hook similarity index 100% rename from linux-tkg/linux-tkg-config/5.4/90-cleanup.hook rename to linux-tkg-config/5.4/90-cleanup.hook diff --git a/linux-tkg/linux-tkg-config/5.4/cleanup b/linux-tkg-config/5.4/cleanup similarity index 100% rename from linux-tkg/linux-tkg-config/5.4/cleanup rename to linux-tkg-config/5.4/cleanup diff --git a/linux-tkg/linux-tkg-config/5.4/config.x86_64 b/linux-tkg-config/5.4/config.x86_64 similarity index 100% rename from linux-tkg/linux-tkg-config/5.4/config.x86_64 rename to linux-tkg-config/5.4/config.x86_64 diff --git a/linux-tkg/linux-tkg-config/5.4/config_hardened.x86_64 b/linux-tkg-config/5.4/config_hardened.x86_64 similarity index 100% rename from linux-tkg/linux-tkg-config/5.4/config_hardened.x86_64 rename to linux-tkg-config/5.4/config_hardened.x86_64 diff --git a/linux-tkg/linux-tkg-config/5.7/90-cleanup.hook b/linux-tkg-config/5.7/90-cleanup.hook similarity index 100% rename from linux-tkg/linux-tkg-config/5.7/90-cleanup.hook rename to linux-tkg-config/5.7/90-cleanup.hook diff --git a/linux-tkg/linux-tkg-config/5.7/cleanup b/linux-tkg-config/5.7/cleanup similarity index 100% rename from linux-tkg/linux-tkg-config/5.7/cleanup rename to linux-tkg-config/5.7/cleanup diff --git a/linux-tkg/linux-tkg-config/5.7/config.x86_64 b/linux-tkg-config/5.7/config.x86_64 similarity index 100% rename from linux-tkg/linux-tkg-config/5.7/config.x86_64 rename to linux-tkg-config/5.7/config.x86_64 diff --git a/linux-tkg/linux-tkg-config/5.7/config_hardened.x86_64 b/linux-tkg-config/5.7/config_hardened.x86_64 similarity index 100% rename from linux-tkg/linux-tkg-config/5.7/config_hardened.x86_64 rename to linux-tkg-config/5.7/config_hardened.x86_64 diff --git a/linux-tkg/linux-tkg-config/5.8/90-cleanup.hook b/linux-tkg-config/5.8/90-cleanup.hook similarity index 100% rename from linux-tkg/linux-tkg-config/5.8/90-cleanup.hook rename to 
linux-tkg-config/5.8/90-cleanup.hook diff --git a/linux-tkg/linux-tkg-config/5.8/cleanup b/linux-tkg-config/5.8/cleanup similarity index 100% rename from linux-tkg/linux-tkg-config/5.8/cleanup rename to linux-tkg-config/5.8/cleanup diff --git a/linux-tkg/linux-tkg-config/5.8/config.x86_64 b/linux-tkg-config/5.8/config.x86_64 similarity index 100% rename from linux-tkg/linux-tkg-config/5.8/config.x86_64 rename to linux-tkg-config/5.8/config.x86_64 diff --git a/linux-tkg/linux-tkg-config/5.9/90-cleanup.hook b/linux-tkg-config/5.9/90-cleanup.hook similarity index 100% rename from linux-tkg/linux-tkg-config/5.9/90-cleanup.hook rename to linux-tkg-config/5.9/90-cleanup.hook diff --git a/linux-tkg/linux-tkg-config/5.9/cleanup b/linux-tkg-config/5.9/cleanup similarity index 100% rename from linux-tkg/linux-tkg-config/5.9/cleanup rename to linux-tkg-config/5.9/cleanup diff --git a/linux-tkg/linux-tkg-config/5.9/config.x86_64 b/linux-tkg-config/5.9/config.x86_64 similarity index 100% rename from linux-tkg/linux-tkg-config/5.9/config.x86_64 rename to linux-tkg-config/5.9/config.x86_64 diff --git a/linux-tkg/linux-tkg-config/generic-desktop-profile.cfg b/linux-tkg-config/generic-desktop-profile.cfg similarity index 100% rename from linux-tkg/linux-tkg-config/generic-desktop-profile.cfg rename to linux-tkg-config/generic-desktop-profile.cfg diff --git a/linux-tkg/linux-tkg-config/prepare b/linux-tkg-config/prepare similarity index 99% rename from linux-tkg/linux-tkg-config/prepare rename to linux-tkg-config/prepare index 6e8bace..e6d2ef2 100644 --- a/linux-tkg/linux-tkg-config/prepare +++ b/linux-tkg-config/prepare @@ -132,7 +132,7 @@ _tkg_initscript() { echo "_cpusched=\"MuQSS\"" > "${_path}"/cpuschedset elif [ "$_basever" = "58" ]; then echo "_cpusched=\"pds\"" > "${_path}"/cpuschedset - else [ "$_basever" = "59" ]; then + else echo "_cpusched=\"bmq\"" > "${_path}"/cpuschedset fi elif [ "$CONDITION" = "3" ]; then diff --git 
a/linux-tkg/linux-tkg-config/ryzen-desktop-profile.cfg b/linux-tkg-config/ryzen-desktop-profile.cfg similarity index 100% rename from linux-tkg/linux-tkg-config/ryzen-desktop-profile.cfg rename to linux-tkg-config/ryzen-desktop-profile.cfg diff --git a/linux-tkg/linux-tkg-patches/5.10/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch b/linux-tkg-patches/5.10/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.10/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch rename to linux-tkg-patches/5.10/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch diff --git a/linux-tkg/linux-tkg-patches/5.10/0002-clear-patches.patch b/linux-tkg-patches/5.10/0002-clear-patches.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.10/0002-clear-patches.patch rename to linux-tkg-patches/5.10/0002-clear-patches.patch diff --git a/linux-tkg/linux-tkg-patches/5.10/0003-glitched-base.patch b/linux-tkg-patches/5.10/0003-glitched-base.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.10/0003-glitched-base.patch rename to linux-tkg-patches/5.10/0003-glitched-base.patch diff --git a/linux-tkg/linux-tkg-patches/5.10/0003-glitched-cfs.patch b/linux-tkg-patches/5.10/0003-glitched-cfs.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.10/0003-glitched-cfs.patch rename to linux-tkg-patches/5.10/0003-glitched-cfs.patch diff --git a/linux-tkg/linux-tkg-patches/5.10/0004-5.10-ck1.patch b/linux-tkg-patches/5.10/0004-5.10-ck1.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.10/0004-5.10-ck1.patch rename to linux-tkg-patches/5.10/0004-5.10-ck1.patch diff --git a/linux-tkg/linux-tkg-patches/5.10/0004-glitched-muqss.patch b/linux-tkg-patches/5.10/0004-glitched-muqss.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.10/0004-glitched-muqss.patch rename to 
linux-tkg-patches/5.10/0004-glitched-muqss.patch diff --git a/linux-tkg/linux-tkg-patches/5.10/0004-glitched-ondemand-muqss.patch b/linux-tkg-patches/5.10/0004-glitched-ondemand-muqss.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.10/0004-glitched-ondemand-muqss.patch rename to linux-tkg-patches/5.10/0004-glitched-ondemand-muqss.patch diff --git a/linux-tkg/linux-tkg-patches/5.10/0005-glitched-pds.patch b/linux-tkg-patches/5.10/0005-glitched-pds.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.10/0005-glitched-pds.patch rename to linux-tkg-patches/5.10/0005-glitched-pds.patch diff --git a/linux-tkg/linux-tkg-patches/5.10/0006-add-acs-overrides_iommu.patch b/linux-tkg-patches/5.10/0006-add-acs-overrides_iommu.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.10/0006-add-acs-overrides_iommu.patch rename to linux-tkg-patches/5.10/0006-add-acs-overrides_iommu.patch diff --git a/linux-tkg/linux-tkg-patches/5.10/0007-v5.10-fsync.patch b/linux-tkg-patches/5.10/0007-v5.10-fsync.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.10/0007-v5.10-fsync.patch rename to linux-tkg-patches/5.10/0007-v5.10-fsync.patch diff --git a/linux-tkg/linux-tkg-patches/5.10/0009-glitched-bmq.patch b/linux-tkg-patches/5.10/0009-glitched-bmq.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.10/0009-glitched-bmq.patch rename to linux-tkg-patches/5.10/0009-glitched-bmq.patch diff --git a/linux-tkg/linux-tkg-patches/5.10/0009-glitched-ondemand-bmq.patch b/linux-tkg-patches/5.10/0009-glitched-ondemand-bmq.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.10/0009-glitched-ondemand-bmq.patch rename to linux-tkg-patches/5.10/0009-glitched-ondemand-bmq.patch diff --git a/linux-tkg/linux-tkg-patches/5.10/0009-prjc_v5.10-r0.patch b/linux-tkg-patches/5.10/0009-prjc_v5.10-r0.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.10/0009-prjc_v5.10-r0.patch rename to 
linux-tkg-patches/5.10/0009-prjc_v5.10-r0.patch diff --git a/linux-tkg/linux-tkg-patches/5.10/0011-ZFS-fix.patch b/linux-tkg-patches/5.10/0011-ZFS-fix.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.10/0011-ZFS-fix.patch rename to linux-tkg-patches/5.10/0011-ZFS-fix.patch diff --git a/linux-tkg/linux-tkg-patches/5.10/0012-misc-additions.patch b/linux-tkg-patches/5.10/0012-misc-additions.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.10/0012-misc-additions.patch rename to linux-tkg-patches/5.10/0012-misc-additions.patch diff --git a/linux-tkg/linux-tkg-patches/5.4/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch b/linux-tkg-patches/5.4/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.4/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch rename to linux-tkg-patches/5.4/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch diff --git a/linux-tkg/linux-tkg-patches/5.4/0002-clear-patches.patch b/linux-tkg-patches/5.4/0002-clear-patches.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.4/0002-clear-patches.patch rename to linux-tkg-patches/5.4/0002-clear-patches.patch diff --git a/linux-tkg/linux-tkg-patches/5.4/0003-glitched-base.patch b/linux-tkg-patches/5.4/0003-glitched-base.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.4/0003-glitched-base.patch rename to linux-tkg-patches/5.4/0003-glitched-base.patch diff --git a/linux-tkg/linux-tkg-patches/5.4/0003-glitched-cfs.patch b/linux-tkg-patches/5.4/0003-glitched-cfs.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.4/0003-glitched-cfs.patch rename to linux-tkg-patches/5.4/0003-glitched-cfs.patch diff --git a/linux-tkg/linux-tkg-patches/5.4/0004-5.4-ck1.patch b/linux-tkg-patches/5.4/0004-5.4-ck1.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.4/0004-5.4-ck1.patch rename to 
linux-tkg-patches/5.4/0004-5.4-ck1.patch diff --git a/linux-tkg/linux-tkg-patches/5.4/0004-glitched-muqss.patch b/linux-tkg-patches/5.4/0004-glitched-muqss.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.4/0004-glitched-muqss.patch rename to linux-tkg-patches/5.4/0004-glitched-muqss.patch diff --git a/linux-tkg/linux-tkg-patches/5.4/0004-glitched-ondemand-muqss.patch b/linux-tkg-patches/5.4/0004-glitched-ondemand-muqss.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.4/0004-glitched-ondemand-muqss.patch rename to linux-tkg-patches/5.4/0004-glitched-ondemand-muqss.patch diff --git a/linux-tkg/linux-tkg-patches/5.4/0005-glitched-ondemand-pds.patch b/linux-tkg-patches/5.4/0005-glitched-ondemand-pds.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.4/0005-glitched-ondemand-pds.patch rename to linux-tkg-patches/5.4/0005-glitched-ondemand-pds.patch diff --git a/linux-tkg/linux-tkg-patches/5.4/0005-glitched-pds.patch b/linux-tkg-patches/5.4/0005-glitched-pds.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.4/0005-glitched-pds.patch rename to linux-tkg-patches/5.4/0005-glitched-pds.patch diff --git a/linux-tkg/linux-tkg-patches/5.4/0005-v5.4_undead-pds099o.patch b/linux-tkg-patches/5.4/0005-v5.4_undead-pds099o.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.4/0005-v5.4_undead-pds099o.patch rename to linux-tkg-patches/5.4/0005-v5.4_undead-pds099o.patch diff --git a/linux-tkg/linux-tkg-patches/5.4/0006-add-acs-overrides_iommu.patch b/linux-tkg-patches/5.4/0006-add-acs-overrides_iommu.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.4/0006-add-acs-overrides_iommu.patch rename to linux-tkg-patches/5.4/0006-add-acs-overrides_iommu.patch diff --git a/linux-tkg/linux-tkg-patches/5.4/0007-v5.4-fsync.patch b/linux-tkg-patches/5.4/0007-v5.4-fsync.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.4/0007-v5.4-fsync.patch rename to 
linux-tkg-patches/5.4/0007-v5.4-fsync.patch diff --git a/linux-tkg/linux-tkg-patches/5.4/0009-bmq_v5.4-r2.patch b/linux-tkg-patches/5.4/0009-bmq_v5.4-r2.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.4/0009-bmq_v5.4-r2.patch rename to linux-tkg-patches/5.4/0009-bmq_v5.4-r2.patch diff --git a/linux-tkg/linux-tkg-patches/5.4/0009-glitched-bmq.patch b/linux-tkg-patches/5.4/0009-glitched-bmq.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.4/0009-glitched-bmq.patch rename to linux-tkg-patches/5.4/0009-glitched-bmq.patch diff --git a/linux-tkg/linux-tkg-patches/5.4/0011-ZFS-fix.patch b/linux-tkg-patches/5.4/0011-ZFS-fix.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.4/0011-ZFS-fix.patch rename to linux-tkg-patches/5.4/0011-ZFS-fix.patch diff --git a/linux-tkg/linux-tkg-patches/5.4/0012-linux-hardened.patch b/linux-tkg-patches/5.4/0012-linux-hardened.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.4/0012-linux-hardened.patch rename to linux-tkg-patches/5.4/0012-linux-hardened.patch diff --git a/linux-tkg/linux-tkg-patches/5.7/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch b/linux-tkg-patches/5.7/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.7/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch rename to linux-tkg-patches/5.7/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch diff --git a/linux-tkg/linux-tkg-patches/5.7/0002-clear-patches.patch b/linux-tkg-patches/5.7/0002-clear-patches.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.7/0002-clear-patches.patch rename to linux-tkg-patches/5.7/0002-clear-patches.patch diff --git a/linux-tkg/linux-tkg-patches/5.7/0003-glitched-base.patch b/linux-tkg-patches/5.7/0003-glitched-base.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.7/0003-glitched-base.patch rename to 
linux-tkg-patches/5.7/0003-glitched-base.patch diff --git a/linux-tkg/linux-tkg-patches/5.7/0003-glitched-cfs.patch b/linux-tkg-patches/5.7/0003-glitched-cfs.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.7/0003-glitched-cfs.patch rename to linux-tkg-patches/5.7/0003-glitched-cfs.patch diff --git a/linux-tkg/linux-tkg-patches/5.7/0004-5.7-ck1.patch b/linux-tkg-patches/5.7/0004-5.7-ck1.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.7/0004-5.7-ck1.patch rename to linux-tkg-patches/5.7/0004-5.7-ck1.patch diff --git a/linux-tkg/linux-tkg-patches/5.7/0004-glitched-muqss.patch b/linux-tkg-patches/5.7/0004-glitched-muqss.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.7/0004-glitched-muqss.patch rename to linux-tkg-patches/5.7/0004-glitched-muqss.patch diff --git a/linux-tkg/linux-tkg-patches/5.7/0004-glitched-ondemand-muqss.patch b/linux-tkg-patches/5.7/0004-glitched-ondemand-muqss.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.7/0004-glitched-ondemand-muqss.patch rename to linux-tkg-patches/5.7/0004-glitched-ondemand-muqss.patch diff --git a/linux-tkg/linux-tkg-patches/5.7/0005-glitched-ondemand-pds.patch b/linux-tkg-patches/5.7/0005-glitched-ondemand-pds.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.7/0005-glitched-ondemand-pds.patch rename to linux-tkg-patches/5.7/0005-glitched-ondemand-pds.patch diff --git a/linux-tkg/linux-tkg-patches/5.7/0005-glitched-pds.patch b/linux-tkg-patches/5.7/0005-glitched-pds.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.7/0005-glitched-pds.patch rename to linux-tkg-patches/5.7/0005-glitched-pds.patch diff --git a/linux-tkg/linux-tkg-patches/5.7/0005-v5.7_undead-pds099o.patch b/linux-tkg-patches/5.7/0005-v5.7_undead-pds099o.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.7/0005-v5.7_undead-pds099o.patch rename to linux-tkg-patches/5.7/0005-v5.7_undead-pds099o.patch diff --git 
a/linux-tkg/linux-tkg-patches/5.7/0006-add-acs-overrides_iommu.patch b/linux-tkg-patches/5.7/0006-add-acs-overrides_iommu.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.7/0006-add-acs-overrides_iommu.patch rename to linux-tkg-patches/5.7/0006-add-acs-overrides_iommu.patch diff --git a/linux-tkg/linux-tkg-patches/5.7/0007-v5.7-fsync.patch b/linux-tkg-patches/5.7/0007-v5.7-fsync.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.7/0007-v5.7-fsync.patch rename to linux-tkg-patches/5.7/0007-v5.7-fsync.patch diff --git a/linux-tkg/linux-tkg-patches/5.7/0008-5.7-bcachefs.patch b/linux-tkg-patches/5.7/0008-5.7-bcachefs.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.7/0008-5.7-bcachefs.patch rename to linux-tkg-patches/5.7/0008-5.7-bcachefs.patch diff --git a/linux-tkg/linux-tkg-patches/5.7/0009-glitched-bmq.patch b/linux-tkg-patches/5.7/0009-glitched-bmq.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.7/0009-glitched-bmq.patch rename to linux-tkg-patches/5.7/0009-glitched-bmq.patch diff --git a/linux-tkg/linux-tkg-patches/5.7/0009-glitched-ondemand-bmq.patch b/linux-tkg-patches/5.7/0009-glitched-ondemand-bmq.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.7/0009-glitched-ondemand-bmq.patch rename to linux-tkg-patches/5.7/0009-glitched-ondemand-bmq.patch diff --git a/linux-tkg/linux-tkg-patches/5.7/0009-prjc_v5.7-r3.patch b/linux-tkg-patches/5.7/0009-prjc_v5.7-r3.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.7/0009-prjc_v5.7-r3.patch rename to linux-tkg-patches/5.7/0009-prjc_v5.7-r3.patch diff --git a/linux-tkg/linux-tkg-patches/5.7/0010-5.7-glitched-cachy.patch b/linux-tkg-patches/5.7/0010-5.7-glitched-cachy.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.7/0010-5.7-glitched-cachy.patch rename to linux-tkg-patches/5.7/0010-5.7-glitched-cachy.patch diff --git a/linux-tkg/linux-tkg-patches/5.7/0011-ZFS-fix.patch 
b/linux-tkg-patches/5.7/0011-ZFS-fix.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.7/0011-ZFS-fix.patch rename to linux-tkg-patches/5.7/0011-ZFS-fix.patch diff --git a/linux-tkg/linux-tkg-patches/5.7/0012-linux-hardened.patch b/linux-tkg-patches/5.7/0012-linux-hardened.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.7/0012-linux-hardened.patch rename to linux-tkg-patches/5.7/0012-linux-hardened.patch diff --git a/linux-tkg/linux-tkg-patches/5.7/0012-misc-additions.patch b/linux-tkg-patches/5.7/0012-misc-additions.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.7/0012-misc-additions.patch rename to linux-tkg-patches/5.7/0012-misc-additions.patch diff --git a/linux-tkg/linux-tkg-patches/5.8/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch b/linux-tkg-patches/5.8/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.8/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch rename to linux-tkg-patches/5.8/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch diff --git a/linux-tkg/linux-tkg-patches/5.8/0002-clear-patches.patch b/linux-tkg-patches/5.8/0002-clear-patches.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.8/0002-clear-patches.patch rename to linux-tkg-patches/5.8/0002-clear-patches.patch diff --git a/linux-tkg/linux-tkg-patches/5.8/0003-glitched-base.patch b/linux-tkg-patches/5.8/0003-glitched-base.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.8/0003-glitched-base.patch rename to linux-tkg-patches/5.8/0003-glitched-base.patch diff --git a/linux-tkg/linux-tkg-patches/5.8/0003-glitched-cfs.patch b/linux-tkg-patches/5.8/0003-glitched-cfs.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.8/0003-glitched-cfs.patch rename to linux-tkg-patches/5.8/0003-glitched-cfs.patch diff --git 
a/linux-tkg/linux-tkg-patches/5.8/0005-glitched-pds.patch b/linux-tkg-patches/5.8/0005-glitched-pds.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.8/0005-glitched-pds.patch rename to linux-tkg-patches/5.8/0005-glitched-pds.patch diff --git a/linux-tkg/linux-tkg-patches/5.8/0005-undead-glitched-ondemand-pds.patch b/linux-tkg-patches/5.8/0005-undead-glitched-ondemand-pds.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.8/0005-undead-glitched-ondemand-pds.patch rename to linux-tkg-patches/5.8/0005-undead-glitched-ondemand-pds.patch diff --git a/linux-tkg/linux-tkg-patches/5.8/0005-undead-glitched-pds.patch b/linux-tkg-patches/5.8/0005-undead-glitched-pds.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.8/0005-undead-glitched-pds.patch rename to linux-tkg-patches/5.8/0005-undead-glitched-pds.patch diff --git a/linux-tkg/linux-tkg-patches/5.8/0005-v5.8_undead-pds099o.patch b/linux-tkg-patches/5.8/0005-v5.8_undead-pds099o.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.8/0005-v5.8_undead-pds099o.patch rename to linux-tkg-patches/5.8/0005-v5.8_undead-pds099o.patch diff --git a/linux-tkg/linux-tkg-patches/5.8/0006-add-acs-overrides_iommu.patch b/linux-tkg-patches/5.8/0006-add-acs-overrides_iommu.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.8/0006-add-acs-overrides_iommu.patch rename to linux-tkg-patches/5.8/0006-add-acs-overrides_iommu.patch diff --git a/linux-tkg/linux-tkg-patches/5.8/0007-v5.8-fsync.patch b/linux-tkg-patches/5.8/0007-v5.8-fsync.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.8/0007-v5.8-fsync.patch rename to linux-tkg-patches/5.8/0007-v5.8-fsync.patch diff --git a/linux-tkg/linux-tkg-patches/5.8/0008-5.8-bcachefs.patch b/linux-tkg-patches/5.8/0008-5.8-bcachefs.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.8/0008-5.8-bcachefs.patch rename to linux-tkg-patches/5.8/0008-5.8-bcachefs.patch diff --git 
a/linux-tkg/linux-tkg-patches/5.8/0009-glitched-bmq.patch b/linux-tkg-patches/5.8/0009-glitched-bmq.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.8/0009-glitched-bmq.patch rename to linux-tkg-patches/5.8/0009-glitched-bmq.patch diff --git a/linux-tkg/linux-tkg-patches/5.8/0009-glitched-ondemand-bmq.patch b/linux-tkg-patches/5.8/0009-glitched-ondemand-bmq.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.8/0009-glitched-ondemand-bmq.patch rename to linux-tkg-patches/5.8/0009-glitched-ondemand-bmq.patch diff --git a/linux-tkg/linux-tkg-patches/5.8/0009-prjc_v5.8-r3.patch b/linux-tkg-patches/5.8/0009-prjc_v5.8-r3.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.8/0009-prjc_v5.8-r3.patch rename to linux-tkg-patches/5.8/0009-prjc_v5.8-r3.patch diff --git a/linux-tkg/linux-tkg-patches/5.8/0011-ZFS-fix.patch b/linux-tkg-patches/5.8/0011-ZFS-fix.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.8/0011-ZFS-fix.patch rename to linux-tkg-patches/5.8/0011-ZFS-fix.patch diff --git a/linux-tkg/linux-tkg-patches/5.8/0012-misc-additions.patch b/linux-tkg-patches/5.8/0012-misc-additions.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.8/0012-misc-additions.patch rename to linux-tkg-patches/5.8/0012-misc-additions.patch diff --git a/linux-tkg/linux-tkg-patches/5.9/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch b/linux-tkg-patches/5.9/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.9/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch rename to linux-tkg-patches/5.9/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch diff --git a/linux-tkg/linux-tkg-patches/5.9/0002-clear-patches.patch b/linux-tkg-patches/5.9/0002-clear-patches.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.9/0002-clear-patches.patch rename to 
linux-tkg-patches/5.9/0002-clear-patches.patch diff --git a/linux-tkg/linux-tkg-patches/5.9/0003-glitched-base.patch b/linux-tkg-patches/5.9/0003-glitched-base.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.9/0003-glitched-base.patch rename to linux-tkg-patches/5.9/0003-glitched-base.patch diff --git a/linux-tkg/linux-tkg-patches/5.9/0003-glitched-cfs.patch b/linux-tkg-patches/5.9/0003-glitched-cfs.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.9/0003-glitched-cfs.patch rename to linux-tkg-patches/5.9/0003-glitched-cfs.patch diff --git a/linux-tkg/linux-tkg-patches/5.9/0004-5.9-ck1.patch b/linux-tkg-patches/5.9/0004-5.9-ck1.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.9/0004-5.9-ck1.patch rename to linux-tkg-patches/5.9/0004-5.9-ck1.patch diff --git a/linux510-rc-tkg/linux510-tkg-patches/0004-glitched-muqss.patch b/linux-tkg-patches/5.9/0004-glitched-muqss.patch similarity index 100% rename from linux510-rc-tkg/linux510-tkg-patches/0004-glitched-muqss.patch rename to linux-tkg-patches/5.9/0004-glitched-muqss.patch diff --git a/linux-tkg/linux-tkg-patches/5.9/0004-glitched-ondemand-muqss.patch b/linux-tkg-patches/5.9/0004-glitched-ondemand-muqss.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.9/0004-glitched-ondemand-muqss.patch rename to linux-tkg-patches/5.9/0004-glitched-ondemand-muqss.patch diff --git a/linux-tkg/linux-tkg-patches/5.9/0005-glitched-pds.patch b/linux-tkg-patches/5.9/0005-glitched-pds.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.9/0005-glitched-pds.patch rename to linux-tkg-patches/5.9/0005-glitched-pds.patch diff --git a/linux-tkg/linux-tkg-patches/5.9/0006-add-acs-overrides_iommu.patch b/linux-tkg-patches/5.9/0006-add-acs-overrides_iommu.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.9/0006-add-acs-overrides_iommu.patch rename to linux-tkg-patches/5.9/0006-add-acs-overrides_iommu.patch diff --git 
a/linux-tkg/linux-tkg-patches/5.9/0007-v5.9-fsync.patch b/linux-tkg-patches/5.9/0007-v5.9-fsync.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.9/0007-v5.9-fsync.patch rename to linux-tkg-patches/5.9/0007-v5.9-fsync.patch diff --git a/linux-tkg/linux-tkg-patches/5.9/0008-5.9-bcachefs.patch b/linux-tkg-patches/5.9/0008-5.9-bcachefs.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.9/0008-5.9-bcachefs.patch rename to linux-tkg-patches/5.9/0008-5.9-bcachefs.patch diff --git a/linux-tkg/linux-tkg-patches/5.9/0009-glitched-bmq.patch b/linux-tkg-patches/5.9/0009-glitched-bmq.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.9/0009-glitched-bmq.patch rename to linux-tkg-patches/5.9/0009-glitched-bmq.patch diff --git a/linux-tkg/linux-tkg-patches/5.9/0009-glitched-ondemand-bmq.patch b/linux-tkg-patches/5.9/0009-glitched-ondemand-bmq.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.9/0009-glitched-ondemand-bmq.patch rename to linux-tkg-patches/5.9/0009-glitched-ondemand-bmq.patch diff --git a/linux-tkg/linux-tkg-patches/5.9/0009-prjc_v5.9-r1.patch b/linux-tkg-patches/5.9/0009-prjc_v5.9-r1.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.9/0009-prjc_v5.9-r1.patch rename to linux-tkg-patches/5.9/0009-prjc_v5.9-r1.patch diff --git a/linux-tkg/linux-tkg-patches/5.9/0011-ZFS-fix.patch b/linux-tkg-patches/5.9/0011-ZFS-fix.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.9/0011-ZFS-fix.patch rename to linux-tkg-patches/5.9/0011-ZFS-fix.patch diff --git a/linux-tkg/linux-tkg-patches/5.9/0012-misc-additions.patch b/linux-tkg-patches/5.9/0012-misc-additions.patch similarity index 100% rename from linux-tkg/linux-tkg-patches/5.9/0012-misc-additions.patch rename to linux-tkg-patches/5.9/0012-misc-additions.patch diff --git a/linux-tkg/linux-tkg-patches/5.9/0013-remove-debian-deps-cross.patch b/linux-tkg-patches/5.9/0013-remove-debian-deps-cross.patch similarity 
index 100% rename from linux-tkg/linux-tkg-patches/5.9/0013-remove-debian-deps-cross.patch rename to linux-tkg-patches/5.9/0013-remove-debian-deps-cross.patch diff --git a/linux-tkg/linux-tkg-patches/5.9/0004-glitched-muqss.patch b/linux-tkg/linux-tkg-patches/5.9/0004-glitched-muqss.patch deleted file mode 100644 index 2c4837e..0000000 --- a/linux-tkg/linux-tkg-patches/5.9/0004-glitched-muqss.patch +++ /dev/null @@ -1,78 +0,0 @@ -From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 -From: Tk-Glitch -Date: Wed, 4 Jul 2018 04:30:08 +0200 -Subject: glitched - MuQSS - -diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c -index 84a1d08d68551..57c3036a68952 100644 ---- a/kernel/sched/MuQSS.c -+++ b/kernel/sched/MuQSS.c -@@ -163,7 +167,11 @@ int sched_interactive __read_mostly = 1; - * are allowed to run five seconds as real time tasks. This is the total over - * all online cpus. - */ -+#ifdef CONFIG_ZENIFY -+int sched_iso_cpu __read_mostly = 25; -+#else - int sched_iso_cpu __read_mostly = 70; -+#endif - - /* - * sched_yield_type - Choose what sort of yield sched_yield will perform. - -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 2a202a846757..1d9c7ed79b11 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -5,7 +5,7 @@ - choice - prompt "Timer frequency" - default HZ_100 if SCHED_MUQSS -- default HZ_250_NODEF if !SCHED_MUQSS -+ default HZ_500_NODEF if !SCHED_MUQSS - help - Allows the configuration of the timer frequency. It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -50,6 +50,20 @@ choice - on SMP and NUMA systems and exactly dividing by both PAL and - NTSC frame rates for video and multimedia work. - -+ config HZ_500_NODEF -+ bool "500 HZ" -+ help -+ 500 Hz is a good timer frequency for desktops. Provides fast -+ interactivity with great smoothness without sacrificing too -+ much throughput. 
-+ -+ config HZ_750_NODEF -+ bool "750 HZ" -+ help -+ 750 Hz is a good timer frequency for desktops. Provides fast -+ interactivity with great smoothness without sacrificing too -+ much throughput. -+ - config HZ_1000_NODEF - bool "1000 HZ" - help -@@ -63,6 +70,8 @@ config HZ - default 100 if HZ_100 - default 250 if HZ_250_NODEF - default 300 if HZ_300_NODEF -+ default 500 if HZ_500_NODEF -+ default 750 if HZ_750_NODEF - default 1000 if HZ_1000_NODEF - - config SCHED_HRTICK - -diff --git a/Makefile b/Makefile -index d4d36c61940b..4a9dfe471f1f 100644 ---- a/Makefile -+++ b/Makefile -@@ -15,7 +15,6 @@ NAME = Kleptomaniac Octopus - - CKVERSION = -ck1 - CKNAME = MuQSS Powered --EXTRAVERSION := $(EXTRAVERSION)$(CKVERSION) - - # We are using a recursive build, so we need to do a little thinking - # to get the ordering right. diff --git a/linux510-rc-tkg/PKGBUILD b/linux510-rc-tkg/PKGBUILD deleted file mode 100644 index b211543..0000000 --- a/linux510-rc-tkg/PKGBUILD +++ /dev/null @@ -1,284 +0,0 @@ -# Based on the file created for Arch Linux by: -# Tobias Powalowski -# Thomas Baechler - -# Contributor: Tk-Glitch - -plain ' .---.` `.---.' -plain ' `/syhhhyso- -osyhhhys/`' -plain ' .syNMdhNNhss/``.---.``/sshNNhdMNys.' -plain ' +sdMh.`+MNsssssssssssssssNM+`.hMds+' -plain ' :syNNdhNNhssssssssssssssshNNhdNNys:' -plain ' /ssyhhhysssssssssssssssssyhhhyss/' -plain ' .ossssssssssssssssssssssssssssso.' 
-plain ' :sssssssssssssssssssssssssssssssss:' -plain ' /sssssssssssssssssssssssssssssssssss/' -plain ' :sssssssssssssoosssssssoosssssssssssss:' -plain ' osssssssssssssoosssssssoossssssssssssso' -plain ' osssssssssssyyyyhhhhhhhyyyyssssssssssso' -plain ' /yyyyyyhhdmmmmNNNNNNNNNNNmmmmdhhyyyyyy/' -plain ' smmmNNNNNNNNNNNNNNNNNNNNNNNNNNNNNmmms' -plain ' /dNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNd/' -plain ' `:sdNNNNNNNNNNNNNNNNNNNNNNNNNds:`' -plain ' `-+shdNNNNNNNNNNNNNNNdhs+-`' -plain ' `.-:///////:-.`' - -_where="$PWD" # track basedir as different Arch based distros are moving srcdir around - -source "$_where"/customization.cfg # load default configuration from file -source "$_where"/linux*-tkg-config/prepare - -if [[ "$_sub" = rc* ]]; then - _srcpath="linux-${_basekernel}-${_sub}" -else - _srcpath="linux-${_basekernel}" -fi - -_tkg_initscript - -_distro="Arch" - -if [ -n "$_custom_pkgbase" ]; then - pkgbase="${_custom_pkgbase}" -else - pkgbase=linux"${_basever}"-tkg-"${_cpusched}" -fi -pkgname=("${pkgbase}" "${pkgbase}-headers") -pkgver="${_basekernel}"."${_sub}" -pkgrel=1 -pkgdesc='Linux-tkg' -arch=('x86_64') # no i686 in here -url="http://www.kernel.org/" -license=('GPL2') -makedepends=('xmlto' 'docbook-xsl' 'kmod' 'inetutils' 'bc' 'libelf' 'pahole' 'patchutils' 'flex' 'python-sphinx' 'python-sphinx_rtd_theme' 'graphviz' 'imagemagick' 'git') -optdepends=('schedtool') -options=('!strip' 'docs') -source=("https://git.kernel.org/torvalds/t/linux-${_basekernel}-${_sub}.tar.gz" - #"https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-${_basekernel}.tar.xz" - #"https://cdn.kernel.org/pub/linux/kernel/v5.x/patch-${pkgver}.xz" - "https://raw.githubusercontent.com/graysky2/kernel_gcc_patch/master/enable_additional_cpu_optimizations_for_gcc_v10.1%2B_kernel_v5.8%2B.patch" - 'config.x86_64' # stock Arch config - #'config_hardened.x86_64' # hardened Arch config - 90-cleanup.hook - cleanup - # ARCH Patches - 0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch - # TkG - 
0002-clear-patches.patch - 0003-glitched-base.patch - 0003-glitched-cfs.patch - 0004-glitched-ondemand-muqss.patch - 0004-glitched-muqss.patch - 0004-5.10-ck1.patch - #0005-undead-glitched-ondemand-pds.patch - #0005-undead-glitched-pds.patch - #0005-v5.8_undead-pds099o.patch - 0005-glitched-pds.patch - 0006-add-acs-overrides_iommu.patch - 0007-v5.10-fsync.patch - #0008-5.10-bcachefs.patch - 0009-glitched-ondemand-bmq.patch - 0009-glitched-bmq.patch - 0009-prjc_v5.10-r0.patch - 0011-ZFS-fix.patch - #0012-linux-hardened.patch - 0012-misc-additions.patch -) -sha256sums=('483d8b3945963ea375026c4dde019da36f5d2116241036b09493e63e92e39ee8' - '5ab29eb64e57df83b395a29a6a4f89030d142feffbfbf73b3afc6d97a2a7fd12' - '834247434877e4e76201ada7df35ebd4622116737e9650e0772f22d03083b426' - '1e15fc2ef3fa770217ecc63a220e5df2ddbcf3295eb4a021171e7edd4c6cc898' - '66a03c246037451a77b4d448565b1d7e9368270c7d02872fbd0b5d024ed0a997' - 'f6383abef027fd9a430fd33415355e0df492cdc3c90e9938bf2d98f4f63b32e6' - '35a7cde86fb94939c0f25a62b8c47f3de0dbd3c65f876f460b263181b3e92fc0' - '1ac97da07e72ec7e2b0923d32daacacfaa632a44c714d6942d9f143fe239e1b5' - '7058e57fd68367b029adc77f2a82928f1433daaf02c8c279cb2d13556c8804d7' - 'c605f638d74c61861ebdc36ebd4cb8b6475eae2f6273e1ccb2bbb3e10a2ec3fe' - '2bbbac963b6ca44ef3f8a71ec7c5cad7d66df860869a73059087ee236775970a' - '4231bd331289f5678b49d084698f0a80a3ae602eccb41d89e4f85ff4465eb971' - 'fca63d15ca4502aebd73e76d7499b243d2c03db71ff5ab0bf5cf268b2e576320' - '19661ec0d39f9663452b34433214c755179894528bf73a42f6ba52ccf572832a' - 'b302ba6c5bbe8ed19b20207505d513208fae1e678cf4d8e7ac0b154e5fe3f456' - '9fad4a40449e09522899955762c8928ae17f4cdaa16e01239fd12592e9d58177' - 'a557b342111849a5f920bbe1c129f3ff1fc1eff62c6bd6685e0972fc88e39911' - '88c7e308e474c845e0cc09e09bd223fc39876eca757abf6d6c3b8321f49ce1f1' - '49262ce4a8089fa70275aad742fc914baa28d9c384f710c9a62f64796d13e104' - '433b919e6a0be26784fb4304c43b1811a28f12ad3de9e26c0af827f64c0c316e') - -export KBUILD_BUILD_HOST=archlinux -export 
KBUILD_BUILD_USER=$pkgbase -export KBUILD_BUILD_TIMESTAMP="$(date -Ru${SOURCE_DATE_EPOCH:+d @$SOURCE_DATE_EPOCH})" - -prepare() { - rm -rf $pkgdir # Nuke the entire pkg folder so it'll get regenerated clean on next build - - ln -s "${_where}/customization.cfg" "${srcdir}" # workaround - - cd "${srcdir}/${_srcpath}" - - _tkg_srcprep -} - -build() { - cd "${srcdir}/${_srcpath}" - - # Use custom compiler paths if defined - if [ -n "${CUSTOM_GCC_PATH}" ]; then - PATH=${CUSTOM_GCC_PATH}/bin:${CUSTOM_GCC_PATH}/lib:${CUSTOM_GCC_PATH}/include:${PATH} - fi - - if [ "$_force_all_threads" = "true" ]; then - _force_all_threads="-j$((`nproc`*2))" - else - _force_all_threads="${MAKEFLAGS}" - fi - - # ccache - if [ "$_noccache" != "true" ] && pacman -Qq ccache &> /dev/null; then - export PATH="/usr/lib/ccache/bin/:$PATH" - export CCACHE_SLOPPINESS="file_macro,locale,time_macros" - export CCACHE_NOHASHDIR="true" - msg2 'ccache was found and will be used' - fi - - # document the TkG variables, excluding "_", "_EXT_CONFIG_PATH", and "_where". - declare -p | cut -d ' ' -f 3 | grep -P '^_(?!=|EXT_CONFIG_PATH|where)' > "${srcdir}/customization-full.cfg" - - # build! - _runtime=$( time ( schedtool -B -n 1 -e ionice -n 1 make ${_force_all_threads} LOCALVERSION= bzImage modules 2>&1 ) 3>&1 1>&2 2>&3 ) || _runtime=$( time ( make ${_force_all_threads} LOCALVERSION= bzImage modules 2>&1 ) 3>&1 1>&2 2>&3 ) -} - -hackbase() { - pkgdesc="The $pkgdesc kernel and modules" - depends=('coreutils' 'kmod' 'initramfs') - optdepends=('linux-docs: Kernel hackers manual - HTML documentation that comes with the Linux kernel.' - 'crda: to set the correct wireless channels of your country.' - 'linux-firmware: Firmware files for Linux' - 'modprobed-db: Keeps track of EVERY kernel module that has ever been probed. Useful for make localmodconfig.' - 'nvidia-tkg: NVIDIA drivers for all installed kernels - non-dkms version.' - 'nvidia-dkms-tkg: NVIDIA drivers for all installed kernels - dkms version.' 
- 'update-grub: Simple wrapper around grub-mkconfig.') - provides=("linux=${pkgver}" "${pkgbase}" VIRTUALBOX-GUEST-MODULES WIREGUARD-MODULE) - replaces=(virtualbox-guest-modules-arch wireguard-arch) - - cd "${srcdir}/${_srcpath}" - - # get kernel version - local _kernver="$(\033[1;0m \033[1;1m$1\033[1;0m" >&2 -} - -error() { - echo -e " \033[1;31m==> ERROR: $1\033[1;0m" >&2 -} - -warning() { - echo -e " \033[1;33m==> WARNING: $1\033[1;0m" >&2 -} - -plain() { - echo "$1" >&2 -} - -# Stop the script at any ecountered error -set -e - -_where=`pwd` -srcdir="$_where" - -source linux*-tkg-config/prepare - -_cpu_opt_patch_link="https://raw.githubusercontent.com/graysky2/kernel_gcc_patch/master/enable_additional_cpu_optimizations_for_gcc_v10.1%2B_kernel_v5.8%2B.patch" - -source customization.cfg - -if [ "$1" != "install" ] && [ "$1" != "config" ] && [ "$1" != "uninstall-help" ]; then - msg2 "Argument not recognised, options are: - - config : shallow clones the linux ${_basekernel}.x git tree into the folder linux-${_basekernel}, then applies on it the extra patches and prepares the .config file - by copying the one from the current linux system in /boot/config-`uname -r` and updates it. - - install : [RPM and DEB based distros only], does the config step, proceeds to compile, then prompts to install - - uninstall-help : [RPM and DEB based distros only], lists the installed kernels in this system, then gives a hint on how to uninstall them manually." - exit 0 -fi - -# Load external configuration file if present. Available variable values will overwrite customization.cfg ones. -if [ -e "$_EXT_CONFIG_PATH" ]; then - msg2 "External configuration file $_EXT_CONFIG_PATH will be used and will override customization.cfg values." 
- source "$_EXT_CONFIG_PATH" -fi - -_misc_adds="false" # We currently don't want this enabled on non-Arch - -if [ "$1" = "install" ] || [ "$1" = "config" ]; then - - if [ -z $_distro ] && [ "$1" = "install" ]; then - while true; do - echo "Which linux distribution are you running ?" - echo "if it's not on the list, chose the closest one to it: Fedora/Suse for RPM, Ubuntu/Debian for DEB" - echo " 1) Debian" - echo " 2) Fedora" - echo " 3) Suse" - echo " 4) Ubuntu" - read -p "[1-4]: " _distro_index - - if [ "$_distro_index" = "1" ]; then - _distro="Debian" - break - elif [ "$_distro_index" = "2" ]; then - _distro="Fedora" - break - elif [ "$_distro_index" = "3" ]; then - _distro="Suse" - break - elif [ "$_distro_index" = "4" ]; then - _distro="Ubuntu" - break - else - echo "Wrong index." - fi - done - fi - - if [[ $1 = "install" && "$_distro" != "Ubuntu" && "$_distro" != "Debian" && "$_distro" != "Fedora" && "$_distro" != "Suse" ]]; then - msg2 "Variable \"_distro\" in \"customization.cfg\" hasn't been set to \"Ubuntu\", \"Debian\", \"Fedora\" or \"Suse\"" - msg2 "This script can only install custom kernels for RPM and DEB based distros, though only those keywords are permitted. Exiting..." 
- exit 0 - fi - - if [ "$_distro" = "Ubuntu" ] || [ "$_distro" = "Debian" ]; then - msg2 "Installing dependencies" - sudo apt install git build-essential kernel-package fakeroot libncurses5-dev libssl-dev ccache bison flex qtbase5-dev -y - elif [ "$_distro" = "Fedora" ]; then - msg2 "Installing dependencies" - sudo dnf install fedpkg fedora-packager rpmdevtools ncurses-devel pesign grubby qt5-devel libXi-devel gcc-c++ git ccache flex bison elfutils-libelf-devel openssl-devel dwarves rpm-build -y - elif [ "$_distro" = "Suse" ]; then - msg2 "Installing dependencies" - sudo zypper install -y rpmdevtools ncurses-devel pesign libXi-devel gcc-c++ git ccache flex bison elfutils libelf-devel openssl-devel dwarves make patch bc rpm-build libqt5-qtbase-common-devel libqt5-qtbase-devel lz4 - fi - - # Force prepare script to avoid Arch specific commands if the user is using `config` - if [ "$1" = "config" ]; then - _distro="" - fi - - if [ -d linux-${_basekernel}.orig ]; then - rm -rf linux-${_basekernel}.orig - fi - - if [ -d linux-${_basekernel} ]; then - msg2 "Reseting files in linux-$_basekernel to their original state and getting latest updates" - cd "$_where"/linux-${_basekernel} - git checkout --force linux-$_basekernel.y - git clean -f -d -x - git pull - msg2 "Done" - cd "$_where" - else - msg2 "Shallow git cloning linux $_basekernel" - git clone --branch linux-$_basekernel.y --single-branch --depth=1 https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git linux-${_basekernel} - msg2 "Done" - fi - - # Define current kernel subversion - if [ -z $_kernel_subver ]; then - cd "$_where"/linux-${_basekernel} - _kernelverstr=`git describe` - _kernel_subver=${_kernelverstr:5} - cd "$_where" - fi - - - # Run init script that is also run in PKGBUILD, it will define some env vars that we will use - _tkg_initscript - - cd "$_where" - msg2 "Downloading Graysky2's CPU optimisations patch" - wget "$_cpu_opt_patch_link" - - # Follow Ubuntu install isntructions in 
https://wiki.ubuntu.com/KernelTeam/GitKernelBuild - - # cd in linux folder, copy Ubuntu's current config file, update with new params - cd "$_where"/linux-${_basekernel} - - msg2 "Copying current kernel's config and running make oldconfig..." - cp /boot/config-`uname -r` .config - if [ "$_distro" = "Debian" ]; then #Help Debian cert problem. - sed -i -e 's#CONFIG_SYSTEM_TRUSTED_KEYS="debian/certs/test-signing-certs.pem"#CONFIG_SYSTEM_TRUSTED_KEYS=""#g' .config - sed -i -e 's#CONFIG_SYSTEM_TRUSTED_KEYS="debian/certs/debian-uefi-certs.pem"#CONFIG_SYSTEM_TRUSTED_KEYS=""#g' .config - fi - yes '' | make oldconfig - msg2 "Done" - - # apply linux-tkg patching script - _tkg_srcprep - - msg2 "Configuration done." -fi - -if [ "$1" = "install" ]; then - - # Use custom compiler paths if defined - if [ -n "${CUSTOM_GCC_PATH}" ]; then - PATH=${CUSTOM_GCC_PATH}/bin:${CUSTOM_GCC_PATH}/lib:${CUSTOM_GCC_PATH}/include:${PATH} - fi - - if [ "$_force_all_threads" = "true" ]; then - _thread_num=`nproc` - else - _thread_num=`expr \`nproc\` / 4` - if [ "$_thread_num" = "0" ]; then - _thread_num=1 - fi - fi - - # ccache - if [ "$_noccache" != "true" ]; then - - if [ "$_distro" = "Ubuntu" ] || [ "$_distro" = "Debian" ]; then - export PATH="/usr/lib/ccache/bin/:$PATH" - elif [ "$_distro" = "Fedora" ] || [ "$_distro" = "Suse" ]; then - export PATH="/usr/lib64/ccache/:$PATH" - fi - - export CCACHE_SLOPPINESS="file_macro,locale,time_macros" - export CCACHE_NOHASHDIR="true" - msg2 'ccache was found and will be used' - - fi - - if [ -z $_kernel_localversion ]; then - _kernel_flavor="tkg-${_cpusched}" - else - _kernel_flavor="tkg-${_kernel_localversion}" - fi - - if [ "$_distro" = "Ubuntu" ] || [ "$_distro" = "Debian" ]; then - - if make -j ${_thread_num} deb-pkg LOCALVERSION=-${_kernel_flavor}; then - msg2 "Building successfully finished!" 
- - cd "$_where" - - # Create DEBS folder if it doesn't exist - mkdir -p DEBS - - # Move rpm files to RPMS folder inside the linux-tkg folder - mv "$_where"/*.deb "$_where"/DEBS/ - - read -p "Do you want to install the new Kernel ? y/[n]: " _install - if [[ $_install =~ [yY] ]] || [ $_install = "yes" ] || [ $_install = "Yes" ]; then - cd "$_where" - _kernelname=$_basekernel.$_kernel_subver-$_kernel_flavor - _headers_deb="linux-headers-${_kernelname}*.deb" - _image_deb="linux-image-${_kernelname}_*.deb" - _kernel_devel_deb="linux-libc-dev_${_kernelname}*.deb" - - cd DEBS - sudo dpkg -i $_headers_deb $_image_deb $_kernel_devel_deb - fi - fi - - elif [[ "$_distro" = "Fedora" || "$_distro" = "Suse" ]]; then - - # Replace dashes with underscores, it seems that it's being done by binrpm-pkg - # Se we can actually refer properly to the rpm files. - _kernel_flavor=${_kernel_flavor//-/_} - - if make -j ${_thread_num} rpm-pkg EXTRAVERSION="_${_kernel_flavor}"; then - msg2 "Building successfully finished!" - - cd "$_where" - - # Create RPMS folder if it doesn't exist - mkdir -p RPMS - - # Move rpm files to RPMS folder inside the linux-tkg folder - mv ~/rpmbuild/RPMS/x86_64/* "$_where"/RPMS/ - - #Clean up the original folder, unneeded and takes a lot of space - rm -rf ~/rpmbuild/ - - read -p "Do you want to install the new Kernel ? 
y/[n]: " _install - if [ "$_install" = "y" ] || [ "$_install" = "Y" ] || [ "$_install" = "yes" ] || [ "$_install" = "Yes" ]; then - - _kernelname=$_basekernel.${_kernel_subver}_$_kernel_flavor - _headers_rpm="kernel-headers-${_kernelname}*.rpm" - _kernel_rpm="kernel-${_kernelname}*.rpm" - _kernel_devel_rpm="kernel-devel-${_kernelname}*.rpm" - - cd RPMS - if [ "$_distro" = "Fedora" ]; then - sudo dnf install $_headers_rpm $_kernel_rpm $_kernel_devel_rpm - elif [ "$_distro" = "Suse" ]; then - msg2 "Some files from 'linux-glibc-devel' will be replaced by files from the custom kernel-hearders package" - msg2 "To revert back to the original kernel headers do 'sudo zypper install -f linux-glibc-devel'" - sudo zypper install --replacefiles --allow-unsigned-rpm $_headers_rpm $_kernel_rpm $_kernel_devel_rpm - fi - - msg2 "Install successful" - fi - fi - fi -fi - -if [ "$1" = "uninstall-help" ]; then - - cd "$_where" - msg2 "List of installed custom tkg kernels: " - - if [ "$_distro" = "Ubuntu" ]; then - dpkg -l "*tkg*" | grep "linux.*tkg" - dpkg -l "*linux-libc-dev*" | grep "linux.*tkg" - msg2 "To uninstall a version, you should remove the linux-image, linux-headers and linux-libc-dev associated to it (if installed), with: " - msg2 " sudo apt remove linux-image-VERSION linux-headers-VERSION linux-libc-dev-VERSION" - msg2 " where VERSION is displayed in the lists above, uninstall only versions that have \"tkg\" in its name" - elif [ "$_distro" = "Fedora" ]; then - dnf list --installed kernel* - msg2 "To uninstall a version, you should remove the kernel, kernel-headers and kernel-devel associated to it (if installed), with: " - msg2 " sudo dnf remove --noautoremove kernel-VERSION kernel-devel-VERSION kernel-headers-VERSION" - msg2 " where VERSION is displayed in the second column" - elif [ "$_distro" = "Suse" ]; then - zypper packages --installed-only | grep "kernel.*tkg" - msg2 "To uninstall a version, you should remove the kernel, kernel-headers and kernel-devel associated 
to it (if installed), with: " - msg2 " sudo zypper remove --no-clean-deps kernel-VERSION kernel-devel-VERSION kernel-headers-VERSION" - msg2 " where VERSION is displayed in the second to last column" - fi - -fi diff --git a/linux510-rc-tkg/linux510-tkg-config/90-cleanup.hook b/linux510-rc-tkg/linux510-tkg-config/90-cleanup.hook deleted file mode 100644 index 99f5221..0000000 --- a/linux510-rc-tkg/linux510-tkg-config/90-cleanup.hook +++ /dev/null @@ -1,14 +0,0 @@ -[Trigger] -Type = File -Operation = Install -Operation = Upgrade -Operation = Remove -Target = usr/lib/modules/*/ -Target = !usr/lib/modules/*/?* - -[Action] -Description = Cleaning up... -When = PostTransaction -Exec = /usr/share/libalpm/scripts/cleanup -NeedsTargets - diff --git a/linux510-rc-tkg/linux510-tkg-config/cleanup b/linux510-rc-tkg/linux510-tkg-config/cleanup deleted file mode 100755 index c00c08d..0000000 --- a/linux510-rc-tkg/linux510-tkg-config/cleanup +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -for _f in /usr/lib/modules/*tkg*; do - if [[ ! -e ${_f}/vmlinuz ]]; then - rm -rf "$_f" - fi -done - -# vim:set ft=sh sw=2 et: - diff --git a/linux510-rc-tkg/linux510-tkg-config/config.x86_64 b/linux510-rc-tkg/linux510-tkg-config/config.x86_64 deleted file mode 100644 index 5d15c18..0000000 --- a/linux510-rc-tkg/linux510-tkg-config/config.x86_64 +++ /dev/null @@ -1,11179 +0,0 @@ -# -# Automatically generated file; DO NOT EDIT. 
-# Linux/x86 5.10.0-rc1 Kernel Configuration -# -CONFIG_CC_VERSION_TEXT="gcc (GCC) 10.2.0" -CONFIG_CC_IS_GCC=y -CONFIG_GCC_VERSION=100200 -CONFIG_LD_VERSION=235010000 -CONFIG_CLANG_VERSION=0 -CONFIG_CC_CAN_LINK=y -CONFIG_CC_CAN_LINK_STATIC=y -CONFIG_CC_HAS_ASM_GOTO=y -CONFIG_CC_HAS_ASM_INLINE=y -CONFIG_IRQ_WORK=y -CONFIG_BUILDTIME_TABLE_SORT=y -CONFIG_THREAD_INFO_IN_TASK=y - -# -# General setup -# -CONFIG_INIT_ENV_ARG_LIMIT=32 -# CONFIG_COMPILE_TEST is not set -CONFIG_LOCALVERSION="" -CONFIG_LOCALVERSION_AUTO=y -CONFIG_BUILD_SALT="" -CONFIG_HAVE_KERNEL_GZIP=y -CONFIG_HAVE_KERNEL_BZIP2=y -CONFIG_HAVE_KERNEL_LZMA=y -CONFIG_HAVE_KERNEL_XZ=y -CONFIG_HAVE_KERNEL_LZO=y -CONFIG_HAVE_KERNEL_LZ4=y -CONFIG_HAVE_KERNEL_ZSTD=y -# CONFIG_KERNEL_GZIP is not set -# CONFIG_KERNEL_BZIP2 is not set -# CONFIG_KERNEL_LZMA is not set -CONFIG_KERNEL_XZ=y -# CONFIG_KERNEL_LZO is not set -# CONFIG_KERNEL_LZ4 is not set -# CONFIG_KERNEL_ZSTD is not set -CONFIG_DEFAULT_INIT="" -CONFIG_DEFAULT_HOSTNAME="archlinux" -CONFIG_SWAP=y -CONFIG_SYSVIPC=y -CONFIG_SYSVIPC_SYSCTL=y -CONFIG_POSIX_MQUEUE=y -CONFIG_POSIX_MQUEUE_SYSCTL=y -CONFIG_WATCH_QUEUE=y -CONFIG_CROSS_MEMORY_ATTACH=y -# CONFIG_USELIB is not set -CONFIG_AUDIT=y -CONFIG_HAVE_ARCH_AUDITSYSCALL=y -CONFIG_AUDITSYSCALL=y - -# -# IRQ subsystem -# -CONFIG_GENERIC_IRQ_PROBE=y -CONFIG_GENERIC_IRQ_SHOW=y -CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK=y -CONFIG_GENERIC_PENDING_IRQ=y -CONFIG_GENERIC_IRQ_MIGRATION=y -CONFIG_HARDIRQS_SW_RESEND=y -CONFIG_GENERIC_IRQ_CHIP=y -CONFIG_IRQ_DOMAIN=y -CONFIG_IRQ_SIM=y -CONFIG_IRQ_DOMAIN_HIERARCHY=y -CONFIG_GENERIC_MSI_IRQ=y -CONFIG_GENERIC_MSI_IRQ_DOMAIN=y -CONFIG_IRQ_MSI_IOMMU=y -CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR=y -CONFIG_GENERIC_IRQ_RESERVATION_MODE=y -CONFIG_IRQ_FORCED_THREADING=y -CONFIG_SPARSE_IRQ=y -# CONFIG_GENERIC_IRQ_DEBUGFS is not set -# end of IRQ subsystem - -CONFIG_CLOCKSOURCE_WATCHDOG=y -CONFIG_ARCH_CLOCKSOURCE_INIT=y -CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE=y -CONFIG_GENERIC_TIME_VSYSCALL=y 
-CONFIG_GENERIC_CLOCKEVENTS=y -CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y -CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST=y -CONFIG_GENERIC_CMOS_UPDATE=y -CONFIG_HAVE_POSIX_CPU_TIMERS_TASK_WORK=y -CONFIG_POSIX_CPU_TIMERS_TASK_WORK=y - -# -# Timers subsystem -# -CONFIG_TICK_ONESHOT=y -CONFIG_NO_HZ_COMMON=y -# CONFIG_HZ_PERIODIC is not set -CONFIG_NO_HZ_IDLE=y -# CONFIG_NO_HZ_FULL is not set -CONFIG_NO_HZ=y -CONFIG_HIGH_RES_TIMERS=y -# end of Timers subsystem - -# CONFIG_PREEMPT_NONE is not set -# CONFIG_PREEMPT_VOLUNTARY is not set -CONFIG_PREEMPT=y -CONFIG_PREEMPT_COUNT=y -CONFIG_PREEMPTION=y - -# -# CPU/Task time and stats accounting -# -CONFIG_TICK_CPU_ACCOUNTING=y -# CONFIG_VIRT_CPU_ACCOUNTING_GEN is not set -CONFIG_IRQ_TIME_ACCOUNTING=y -CONFIG_HAVE_SCHED_AVG_IRQ=y -CONFIG_BSD_PROCESS_ACCT=y -CONFIG_BSD_PROCESS_ACCT_V3=y -CONFIG_TASKSTATS=y -CONFIG_TASK_DELAY_ACCT=y -CONFIG_TASK_XACCT=y -CONFIG_TASK_IO_ACCOUNTING=y -CONFIG_PSI=y -# CONFIG_PSI_DEFAULT_DISABLED is not set -# end of CPU/Task time and stats accounting - -CONFIG_CPU_ISOLATION=y - -# -# RCU Subsystem -# -CONFIG_TREE_RCU=y -CONFIG_PREEMPT_RCU=y -CONFIG_RCU_EXPERT=y -CONFIG_SRCU=y -CONFIG_TREE_SRCU=y -CONFIG_TASKS_RCU_GENERIC=y -CONFIG_TASKS_RCU=y -CONFIG_TASKS_RUDE_RCU=y -CONFIG_TASKS_TRACE_RCU=y -CONFIG_RCU_STALL_COMMON=y -CONFIG_RCU_NEED_SEGCBLIST=y -CONFIG_RCU_FANOUT=64 -CONFIG_RCU_FANOUT_LEAF=16 -CONFIG_RCU_FAST_NO_HZ=y -CONFIG_RCU_BOOST=y -CONFIG_RCU_BOOST_DELAY=500 -# CONFIG_RCU_NOCB_CPU is not set -# CONFIG_TASKS_TRACE_RCU_READ_MB is not set -# end of RCU Subsystem - -CONFIG_BUILD_BIN2C=y -CONFIG_IKCONFIG=y -CONFIG_IKCONFIG_PROC=y -# CONFIG_IKHEADERS is not set -CONFIG_LOG_BUF_SHIFT=17 -CONFIG_LOG_CPU_MAX_BUF_SHIFT=12 -CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT=13 -CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y - -# -# Scheduler features -# -CONFIG_UCLAMP_TASK=y -CONFIG_UCLAMP_BUCKETS_COUNT=5 -# end of Scheduler features - -CONFIG_ARCH_SUPPORTS_NUMA_BALANCING=y -CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH=y -CONFIG_CC_HAS_INT128=y 
-CONFIG_ARCH_SUPPORTS_INT128=y -CONFIG_NUMA_BALANCING=y -CONFIG_NUMA_BALANCING_DEFAULT_ENABLED=y -CONFIG_CGROUPS=y -CONFIG_PAGE_COUNTER=y -CONFIG_MEMCG=y -CONFIG_MEMCG_SWAP=y -CONFIG_MEMCG_KMEM=y -CONFIG_BLK_CGROUP=y -CONFIG_CGROUP_WRITEBACK=y -CONFIG_CGROUP_SCHED=y -CONFIG_FAIR_GROUP_SCHED=y -CONFIG_CFS_BANDWIDTH=y -# CONFIG_RT_GROUP_SCHED is not set -CONFIG_UCLAMP_TASK_GROUP=y -CONFIG_CGROUP_PIDS=y -CONFIG_CGROUP_RDMA=y -CONFIG_CGROUP_FREEZER=y -CONFIG_CGROUP_HUGETLB=y -CONFIG_CPUSETS=y -CONFIG_PROC_PID_CPUSET=y -CONFIG_CGROUP_DEVICE=y -CONFIG_CGROUP_CPUACCT=y -CONFIG_CGROUP_PERF=y -CONFIG_CGROUP_BPF=y -# CONFIG_CGROUP_DEBUG is not set -CONFIG_SOCK_CGROUP_DATA=y -CONFIG_NAMESPACES=y -CONFIG_UTS_NS=y -CONFIG_TIME_NS=y -CONFIG_IPC_NS=y -CONFIG_USER_NS=y -CONFIG_USER_NS_UNPRIVILEGED=y -CONFIG_PID_NS=y -CONFIG_NET_NS=y -CONFIG_CHECKPOINT_RESTORE=y -CONFIG_SCHED_AUTOGROUP=y -# CONFIG_SYSFS_DEPRECATED is not set -CONFIG_RELAY=y -CONFIG_BLK_DEV_INITRD=y -CONFIG_INITRAMFS_SOURCE="" -CONFIG_RD_GZIP=y -CONFIG_RD_BZIP2=y -CONFIG_RD_LZMA=y -CONFIG_RD_XZ=y -CONFIG_RD_LZO=y -CONFIG_RD_LZ4=y -CONFIG_RD_ZSTD=y -CONFIG_BOOT_CONFIG=y -CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y -# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set -CONFIG_SYSCTL=y -CONFIG_HAVE_UID16=y -CONFIG_SYSCTL_EXCEPTION_TRACE=y -CONFIG_HAVE_PCSPKR_PLATFORM=y -CONFIG_BPF=y -CONFIG_EXPERT=y -# CONFIG_UID16 is not set -CONFIG_MULTIUSER=y -CONFIG_SGETMASK_SYSCALL=y -# CONFIG_SYSFS_SYSCALL is not set -CONFIG_FHANDLE=y -CONFIG_POSIX_TIMERS=y -CONFIG_PRINTK=y -CONFIG_PRINTK_NMI=y -CONFIG_BUG=y -CONFIG_ELF_CORE=y -CONFIG_PCSPKR_PLATFORM=y -CONFIG_BASE_FULL=y -CONFIG_FUTEX=y -CONFIG_FUTEX_PI=y -CONFIG_EPOLL=y -CONFIG_SIGNALFD=y -CONFIG_TIMERFD=y -CONFIG_EVENTFD=y -CONFIG_SHMEM=y -CONFIG_AIO=y -CONFIG_IO_URING=y -CONFIG_ADVISE_SYSCALLS=y -CONFIG_HAVE_ARCH_USERFAULTFD_WP=y -CONFIG_MEMBARRIER=y -CONFIG_KALLSYMS=y -CONFIG_KALLSYMS_ALL=y -CONFIG_KALLSYMS_ABSOLUTE_PERCPU=y -CONFIG_KALLSYMS_BASE_RELATIVE=y -CONFIG_BPF_LSM=y 
-CONFIG_BPF_SYSCALL=y -CONFIG_ARCH_WANT_DEFAULT_BPF_JIT=y -CONFIG_BPF_JIT_ALWAYS_ON=y -CONFIG_BPF_JIT_DEFAULT_ON=y -# CONFIG_BPF_PRELOAD is not set -CONFIG_USERFAULTFD=y -CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE=y -CONFIG_RSEQ=y -# CONFIG_DEBUG_RSEQ is not set -# CONFIG_EMBEDDED is not set -CONFIG_HAVE_PERF_EVENTS=y -# CONFIG_PC104 is not set - -# -# Kernel Performance Events And Counters -# -CONFIG_PERF_EVENTS=y -# CONFIG_DEBUG_PERF_USE_VMALLOC is not set -# end of Kernel Performance Events And Counters - -CONFIG_VM_EVENT_COUNTERS=y -CONFIG_SLUB_DEBUG=y -# CONFIG_SLUB_MEMCG_SYSFS_ON is not set -# CONFIG_COMPAT_BRK is not set -# CONFIG_SLAB is not set -CONFIG_SLUB=y -# CONFIG_SLOB is not set -CONFIG_SLAB_MERGE_DEFAULT=y -CONFIG_SLAB_FREELIST_RANDOM=y -CONFIG_SLAB_FREELIST_HARDENED=y -CONFIG_SHUFFLE_PAGE_ALLOCATOR=y -CONFIG_SLUB_CPU_PARTIAL=y -CONFIG_SYSTEM_DATA_VERIFICATION=y -CONFIG_PROFILING=y -CONFIG_TRACEPOINTS=y -# end of General setup - -CONFIG_64BIT=y -CONFIG_X86_64=y -CONFIG_X86=y -CONFIG_INSTRUCTION_DECODER=y -CONFIG_OUTPUT_FORMAT="elf64-x86-64" -CONFIG_LOCKDEP_SUPPORT=y -CONFIG_STACKTRACE_SUPPORT=y -CONFIG_MMU=y -CONFIG_ARCH_MMAP_RND_BITS_MIN=28 -CONFIG_ARCH_MMAP_RND_BITS_MAX=32 -CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN=8 -CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX=16 -CONFIG_GENERIC_ISA_DMA=y -CONFIG_GENERIC_BUG=y -CONFIG_GENERIC_BUG_RELATIVE_POINTERS=y -CONFIG_ARCH_MAY_HAVE_PC_FDC=y -CONFIG_GENERIC_CALIBRATE_DELAY=y -CONFIG_ARCH_HAS_CPU_RELAX=y -CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y -CONFIG_ARCH_HAS_FILTER_PGPROT=y -CONFIG_HAVE_SETUP_PER_CPU_AREA=y -CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y -CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y -CONFIG_ARCH_HIBERNATION_POSSIBLE=y -CONFIG_ARCH_SUSPEND_POSSIBLE=y -CONFIG_ARCH_WANT_GENERAL_HUGETLB=y -CONFIG_ZONE_DMA32=y -CONFIG_AUDIT_ARCH=y -CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y -CONFIG_HAVE_INTEL_TXT=y -CONFIG_X86_64_SMP=y -CONFIG_ARCH_SUPPORTS_UPROBES=y -CONFIG_FIX_EARLYCON_MEM=y -CONFIG_DYNAMIC_PHYSICAL_MASK=y -CONFIG_PGTABLE_LEVELS=5 
-CONFIG_CC_HAS_SANE_STACKPROTECTOR=y - -# -# Processor type and features -# -CONFIG_ZONE_DMA=y -CONFIG_SMP=y -CONFIG_X86_FEATURE_NAMES=y -CONFIG_X86_X2APIC=y -CONFIG_X86_MPPARSE=y -# CONFIG_GOLDFISH is not set -CONFIG_RETPOLINE=y -CONFIG_X86_CPU_RESCTRL=y -# CONFIG_X86_EXTENDED_PLATFORM is not set -CONFIG_X86_INTEL_LPSS=y -CONFIG_X86_AMD_PLATFORM_DEVICE=y -CONFIG_IOSF_MBI=y -# CONFIG_IOSF_MBI_DEBUG is not set -CONFIG_X86_SUPPORTS_MEMORY_FAILURE=y -CONFIG_SCHED_OMIT_FRAME_POINTER=y -CONFIG_HYPERVISOR_GUEST=y -CONFIG_PARAVIRT=y -CONFIG_PARAVIRT_XXL=y -# CONFIG_PARAVIRT_DEBUG is not set -CONFIG_PARAVIRT_SPINLOCKS=y -CONFIG_X86_HV_CALLBACK_VECTOR=y -CONFIG_XEN=y -CONFIG_XEN_PV=y -CONFIG_XEN_PV_SMP=y -CONFIG_XEN_DOM0=y -CONFIG_XEN_PVHVM=y -CONFIG_XEN_PVHVM_SMP=y -CONFIG_XEN_512GB=y -CONFIG_XEN_SAVE_RESTORE=y -# CONFIG_XEN_DEBUG_FS is not set -CONFIG_XEN_PVH=y -CONFIG_KVM_GUEST=y -CONFIG_ARCH_CPUIDLE_HALTPOLL=y -CONFIG_PVH=y -CONFIG_PARAVIRT_TIME_ACCOUNTING=y -CONFIG_PARAVIRT_CLOCK=y -CONFIG_JAILHOUSE_GUEST=y -CONFIG_ACRN_GUEST=y -# CONFIG_MK8 is not set -# CONFIG_MPSC is not set -# CONFIG_MCORE2 is not set -# CONFIG_MATOM is not set -CONFIG_GENERIC_CPU=y -CONFIG_X86_INTERNODE_CACHE_SHIFT=6 -CONFIG_X86_L1_CACHE_SHIFT=6 -CONFIG_X86_TSC=y -CONFIG_X86_CMPXCHG64=y -CONFIG_X86_CMOV=y -CONFIG_X86_MINIMUM_CPU_FAMILY=64 -CONFIG_X86_DEBUGCTLMSR=y -CONFIG_IA32_FEAT_CTL=y -CONFIG_X86_VMX_FEATURE_NAMES=y -CONFIG_PROCESSOR_SELECT=y -CONFIG_CPU_SUP_INTEL=y -CONFIG_CPU_SUP_AMD=y -CONFIG_CPU_SUP_HYGON=y -CONFIG_CPU_SUP_CENTAUR=y -CONFIG_CPU_SUP_ZHAOXIN=y -CONFIG_HPET_TIMER=y -CONFIG_HPET_EMULATE_RTC=y -CONFIG_DMI=y -CONFIG_GART_IOMMU=y -# CONFIG_MAXSMP is not set -CONFIG_NR_CPUS_RANGE_BEGIN=2 -CONFIG_NR_CPUS_RANGE_END=512 -CONFIG_NR_CPUS_DEFAULT=64 -CONFIG_NR_CPUS=320 -CONFIG_SCHED_SMT=y -CONFIG_SCHED_MC=y -CONFIG_SCHED_MC_PRIO=y -CONFIG_X86_LOCAL_APIC=y -CONFIG_X86_IO_APIC=y -CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y -CONFIG_X86_MCE=y -# CONFIG_X86_MCELOG_LEGACY is not set 
-CONFIG_X86_MCE_INTEL=y -CONFIG_X86_MCE_AMD=y -CONFIG_X86_MCE_THRESHOLD=y -CONFIG_X86_MCE_INJECT=m -CONFIG_X86_THERMAL_VECTOR=y - -# -# Performance monitoring -# -CONFIG_PERF_EVENTS_INTEL_UNCORE=m -CONFIG_PERF_EVENTS_INTEL_RAPL=m -CONFIG_PERF_EVENTS_INTEL_CSTATE=m -CONFIG_PERF_EVENTS_AMD_POWER=m -# end of Performance monitoring - -CONFIG_X86_16BIT=y -CONFIG_X86_ESPFIX64=y -CONFIG_X86_VSYSCALL_EMULATION=y -CONFIG_X86_IOPL_IOPERM=y -CONFIG_I8K=m -CONFIG_MICROCODE=y -CONFIG_MICROCODE_INTEL=y -CONFIG_MICROCODE_AMD=y -CONFIG_MICROCODE_OLD_INTERFACE=y -CONFIG_X86_MSR=m -CONFIG_X86_CPUID=m -CONFIG_X86_5LEVEL=y -CONFIG_X86_DIRECT_GBPAGES=y -# CONFIG_X86_CPA_STATISTICS is not set -CONFIG_AMD_MEM_ENCRYPT=y -# CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT is not set -CONFIG_NUMA=y -CONFIG_AMD_NUMA=y -CONFIG_X86_64_ACPI_NUMA=y -# CONFIG_NUMA_EMU is not set -CONFIG_NODES_SHIFT=5 -CONFIG_ARCH_SPARSEMEM_ENABLE=y -CONFIG_ARCH_SPARSEMEM_DEFAULT=y -CONFIG_ARCH_SELECT_MEMORY_MODEL=y -CONFIG_ARCH_MEMORY_PROBE=y -CONFIG_ARCH_PROC_KCORE_TEXT=y -CONFIG_ILLEGAL_POINTER_VALUE=0xdead000000000000 -CONFIG_X86_PMEM_LEGACY_DEVICE=y -CONFIG_X86_PMEM_LEGACY=m -CONFIG_X86_CHECK_BIOS_CORRUPTION=y -CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y -CONFIG_X86_RESERVE_LOW=64 -CONFIG_MTRR=y -CONFIG_MTRR_SANITIZER=y -CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT=1 -CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT=0 -CONFIG_X86_PAT=y -CONFIG_ARCH_USES_PG_UNCACHED=y -CONFIG_ARCH_RANDOM=y -CONFIG_X86_SMAP=y -CONFIG_X86_UMIP=y -CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS=y -# CONFIG_X86_INTEL_TSX_MODE_OFF is not set -# CONFIG_X86_INTEL_TSX_MODE_ON is not set -CONFIG_X86_INTEL_TSX_MODE_AUTO=y -CONFIG_EFI=y -CONFIG_EFI_STUB=y -CONFIG_EFI_MIXED=y -# CONFIG_HZ_100 is not set -# CONFIG_HZ_250 is not set -CONFIG_HZ_300=y -# CONFIG_HZ_1000 is not set -CONFIG_HZ=300 -CONFIG_SCHED_HRTICK=y -CONFIG_KEXEC=y -CONFIG_KEXEC_FILE=y -CONFIG_ARCH_HAS_KEXEC_PURGATORY=y -# CONFIG_KEXEC_SIG is not set -CONFIG_CRASH_DUMP=y -CONFIG_KEXEC_JUMP=y 
-CONFIG_PHYSICAL_START=0x1000000 -CONFIG_RELOCATABLE=y -CONFIG_RANDOMIZE_BASE=y -CONFIG_X86_NEED_RELOCS=y -CONFIG_PHYSICAL_ALIGN=0x200000 -CONFIG_DYNAMIC_MEMORY_LAYOUT=y -CONFIG_RANDOMIZE_MEMORY=y -CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING=0x1 -CONFIG_HOTPLUG_CPU=y -# CONFIG_BOOTPARAM_HOTPLUG_CPU0 is not set -# CONFIG_DEBUG_HOTPLUG_CPU0 is not set -# CONFIG_COMPAT_VDSO is not set -# CONFIG_LEGACY_VSYSCALL_EMULATE is not set -CONFIG_LEGACY_VSYSCALL_XONLY=y -# CONFIG_LEGACY_VSYSCALL_NONE is not set -# CONFIG_CMDLINE_BOOL is not set -CONFIG_MODIFY_LDT_SYSCALL=y -CONFIG_HAVE_LIVEPATCH=y -# CONFIG_LIVEPATCH is not set -# end of Processor type and features - -CONFIG_ARCH_HAS_ADD_PAGES=y -CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y -CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE=y -CONFIG_USE_PERCPU_NUMA_NODE_ID=y -CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK=y -CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION=y -CONFIG_ARCH_ENABLE_THP_MIGRATION=y - -# -# Power management and ACPI options -# -CONFIG_ARCH_HIBERNATION_HEADER=y -CONFIG_SUSPEND=y -CONFIG_SUSPEND_FREEZER=y -# CONFIG_SUSPEND_SKIP_SYNC is not set -CONFIG_HIBERNATE_CALLBACKS=y -CONFIG_HIBERNATION=y -CONFIG_HIBERNATION_SNAPSHOT_DEV=y -CONFIG_PM_STD_PARTITION="" -CONFIG_PM_SLEEP=y -CONFIG_PM_SLEEP_SMP=y -CONFIG_PM_AUTOSLEEP=y -CONFIG_PM_WAKELOCKS=y -CONFIG_PM_WAKELOCKS_LIMIT=100 -CONFIG_PM_WAKELOCKS_GC=y -CONFIG_PM=y -CONFIG_PM_DEBUG=y -CONFIG_PM_ADVANCED_DEBUG=y -# CONFIG_PM_TEST_SUSPEND is not set -CONFIG_PM_SLEEP_DEBUG=y -# CONFIG_DPM_WATCHDOG is not set -CONFIG_PM_TRACE=y -CONFIG_PM_TRACE_RTC=y -CONFIG_PM_CLK=y -CONFIG_PM_GENERIC_DOMAINS=y -CONFIG_WQ_POWER_EFFICIENT_DEFAULT=y -CONFIG_PM_GENERIC_DOMAINS_SLEEP=y -CONFIG_PM_GENERIC_DOMAINS_OF=y -CONFIG_ENERGY_MODEL=y -CONFIG_ARCH_SUPPORTS_ACPI=y -CONFIG_ACPI=y -CONFIG_ACPI_LEGACY_TABLES_LOOKUP=y -CONFIG_ARCH_MIGHT_HAVE_ACPI_PDC=y -CONFIG_ACPI_SYSTEM_POWER_STATES_SUPPORT=y -# CONFIG_ACPI_DEBUGGER is not set -CONFIG_ACPI_SPCR_TABLE=y -CONFIG_ACPI_LPIT=y -CONFIG_ACPI_SLEEP=y 
-CONFIG_ACPI_REV_OVERRIDE_POSSIBLE=y -CONFIG_ACPI_EC_DEBUGFS=y -CONFIG_ACPI_AC=m -CONFIG_ACPI_BATTERY=m -CONFIG_ACPI_BUTTON=y -CONFIG_ACPI_VIDEO=y -CONFIG_ACPI_FAN=y -CONFIG_ACPI_TAD=m -CONFIG_ACPI_DOCK=y -CONFIG_ACPI_CPU_FREQ_PSS=y -CONFIG_ACPI_PROCESSOR_CSTATE=y -CONFIG_ACPI_PROCESSOR_IDLE=y -CONFIG_ACPI_CPPC_LIB=y -CONFIG_ACPI_PROCESSOR=y -CONFIG_ACPI_IPMI=m -CONFIG_ACPI_HOTPLUG_CPU=y -CONFIG_ACPI_PROCESSOR_AGGREGATOR=y -CONFIG_ACPI_THERMAL=y -CONFIG_ARCH_HAS_ACPI_TABLE_UPGRADE=y -CONFIG_ACPI_TABLE_UPGRADE=y -CONFIG_ACPI_DEBUG=y -CONFIG_ACPI_PCI_SLOT=y -CONFIG_ACPI_CONTAINER=y -CONFIG_ACPI_HOTPLUG_MEMORY=y -CONFIG_ACPI_HOTPLUG_IOAPIC=y -CONFIG_ACPI_SBS=m -CONFIG_ACPI_HED=y -CONFIG_ACPI_CUSTOM_METHOD=m -CONFIG_ACPI_BGRT=y -# CONFIG_ACPI_REDUCED_HARDWARE_ONLY is not set -CONFIG_ACPI_NFIT=m -# CONFIG_NFIT_SECURITY_DEBUG is not set -CONFIG_ACPI_NUMA=y -CONFIG_ACPI_HMAT=y -CONFIG_HAVE_ACPI_APEI=y -CONFIG_HAVE_ACPI_APEI_NMI=y -CONFIG_ACPI_APEI=y -CONFIG_ACPI_APEI_GHES=y -CONFIG_ACPI_APEI_PCIEAER=y -CONFIG_ACPI_APEI_MEMORY_FAILURE=y -CONFIG_ACPI_APEI_EINJ=m -CONFIG_ACPI_APEI_ERST_DEBUG=m -# CONFIG_ACPI_DPTF is not set -CONFIG_ACPI_WATCHDOG=y -CONFIG_ACPI_EXTLOG=m -CONFIG_ACPI_ADXL=y -CONFIG_ACPI_CONFIGFS=m -CONFIG_PMIC_OPREGION=y -CONFIG_BYTCRC_PMIC_OPREGION=y -CONFIG_CHTCRC_PMIC_OPREGION=y -CONFIG_XPOWER_PMIC_OPREGION=y -CONFIG_BXT_WC_PMIC_OPREGION=y -CONFIG_CHT_WC_PMIC_OPREGION=y -CONFIG_CHT_DC_TI_PMIC_OPREGION=y -CONFIG_TPS68470_PMIC_OPREGION=y -CONFIG_X86_PM_TIMER=y -CONFIG_SFI=y - -# -# CPU Frequency scaling -# -CONFIG_CPU_FREQ=y -CONFIG_CPU_FREQ_GOV_ATTR_SET=y -CONFIG_CPU_FREQ_GOV_COMMON=y -CONFIG_CPU_FREQ_STAT=y -# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set -# CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set -# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set -# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set -# CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set -CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL=y -CONFIG_CPU_FREQ_GOV_PERFORMANCE=y 
-CONFIG_CPU_FREQ_GOV_POWERSAVE=m -CONFIG_CPU_FREQ_GOV_USERSPACE=m -CONFIG_CPU_FREQ_GOV_ONDEMAND=m -CONFIG_CPU_FREQ_GOV_CONSERVATIVE=m -CONFIG_CPU_FREQ_GOV_SCHEDUTIL=y - -# -# CPU frequency scaling drivers -# -CONFIG_CPUFREQ_DT=m -CONFIG_CPUFREQ_DT_PLATDEV=y -CONFIG_X86_INTEL_PSTATE=y -CONFIG_X86_PCC_CPUFREQ=m -CONFIG_X86_ACPI_CPUFREQ=m -CONFIG_X86_ACPI_CPUFREQ_CPB=y -CONFIG_X86_POWERNOW_K8=m -CONFIG_X86_AMD_FREQ_SENSITIVITY=m -# CONFIG_X86_SPEEDSTEP_CENTRINO is not set -CONFIG_X86_P4_CLOCKMOD=m - -# -# shared options -# -CONFIG_X86_SPEEDSTEP_LIB=m -# end of CPU Frequency scaling - -# -# CPU Idle -# -CONFIG_CPU_IDLE=y -CONFIG_CPU_IDLE_GOV_LADDER=y -CONFIG_CPU_IDLE_GOV_MENU=y -CONFIG_CPU_IDLE_GOV_TEO=y -CONFIG_CPU_IDLE_GOV_HALTPOLL=y -CONFIG_HALTPOLL_CPUIDLE=m -# end of CPU Idle - -CONFIG_INTEL_IDLE=y -# end of Power management and ACPI options - -# -# Bus options (PCI etc.) -# -CONFIG_PCI_DIRECT=y -CONFIG_PCI_MMCONFIG=y -CONFIG_PCI_XEN=y -CONFIG_MMCONF_FAM10H=y -# CONFIG_PCI_CNB20LE_QUIRK is not set -# CONFIG_ISA_BUS is not set -CONFIG_ISA_DMA_API=y -CONFIG_AMD_NB=y -# CONFIG_X86_SYSFB is not set -# end of Bus options (PCI etc.) 
- -# -# Binary Emulations -# -CONFIG_IA32_EMULATION=y -# CONFIG_X86_X32 is not set -CONFIG_COMPAT_32=y -CONFIG_COMPAT=y -CONFIG_COMPAT_FOR_U64_ALIGNMENT=y -CONFIG_SYSVIPC_COMPAT=y -# end of Binary Emulations - -# -# Firmware Drivers -# -CONFIG_EDD=m -# CONFIG_EDD_OFF is not set -CONFIG_FIRMWARE_MEMMAP=y -CONFIG_DMIID=y -CONFIG_DMI_SYSFS=m -CONFIG_DMI_SCAN_MACHINE_NON_EFI_FALLBACK=y -CONFIG_ISCSI_IBFT_FIND=y -CONFIG_ISCSI_IBFT=m -CONFIG_FW_CFG_SYSFS=m -# CONFIG_FW_CFG_SYSFS_CMDLINE is not set -CONFIG_GOOGLE_FIRMWARE=y -# CONFIG_GOOGLE_SMI is not set -CONFIG_GOOGLE_COREBOOT_TABLE=m -CONFIG_GOOGLE_MEMCONSOLE=m -# CONFIG_GOOGLE_MEMCONSOLE_X86_LEGACY is not set -CONFIG_GOOGLE_FRAMEBUFFER_COREBOOT=m -CONFIG_GOOGLE_MEMCONSOLE_COREBOOT=m -CONFIG_GOOGLE_VPD=m - -# -# EFI (Extensible Firmware Interface) Support -# -# CONFIG_EFI_VARS is not set -CONFIG_EFI_ESRT=y -CONFIG_EFI_VARS_PSTORE=y -# CONFIG_EFI_VARS_PSTORE_DEFAULT_DISABLE is not set -CONFIG_EFI_RUNTIME_MAP=y -# CONFIG_EFI_FAKE_MEMMAP is not set -CONFIG_EFI_SOFT_RESERVE=y -CONFIG_EFI_RUNTIME_WRAPPERS=y -CONFIG_EFI_GENERIC_STUB_INITRD_CMDLINE_LOADER=y -# CONFIG_EFI_BOOTLOADER_CONTROL is not set -CONFIG_EFI_CAPSULE_LOADER=m -# CONFIG_EFI_TEST is not set -CONFIG_APPLE_PROPERTIES=y -# CONFIG_RESET_ATTACK_MITIGATION is not set -CONFIG_EFI_RCI2_TABLE=y -# CONFIG_EFI_DISABLE_PCI_DMA is not set -# end of EFI (Extensible Firmware Interface) Support - -CONFIG_EFI_EMBEDDED_FIRMWARE=y -CONFIG_UEFI_CPER=y -CONFIG_UEFI_CPER_X86=y -CONFIG_EFI_DEV_PATH_PARSER=y -CONFIG_EFI_EARLYCON=y -CONFIG_EFI_CUSTOM_SSDT_OVERLAYS=y - -# -# Tegra firmware driver -# -# end of Tegra firmware driver -# end of Firmware Drivers - -CONFIG_HAVE_KVM=y -CONFIG_HAVE_KVM_IRQCHIP=y -CONFIG_HAVE_KVM_IRQFD=y -CONFIG_HAVE_KVM_IRQ_ROUTING=y -CONFIG_HAVE_KVM_EVENTFD=y -CONFIG_KVM_MMIO=y -CONFIG_KVM_ASYNC_PF=y -CONFIG_HAVE_KVM_MSI=y -CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT=y -CONFIG_KVM_VFIO=y -CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT=y -CONFIG_KVM_COMPAT=y 
-CONFIG_HAVE_KVM_IRQ_BYPASS=y -CONFIG_HAVE_KVM_NO_POLL=y -CONFIG_KVM_XFER_TO_GUEST_WORK=y -CONFIG_VIRTUALIZATION=y -CONFIG_KVM=m -CONFIG_KVM_WERROR=y -CONFIG_KVM_INTEL=m -CONFIG_KVM_AMD=m -CONFIG_KVM_AMD_SEV=y -CONFIG_KVM_MMU_AUDIT=y -CONFIG_AS_AVX512=y -CONFIG_AS_SHA1_NI=y -CONFIG_AS_SHA256_NI=y -CONFIG_AS_TPAUSE=y - -# -# General architecture-dependent options -# -CONFIG_CRASH_CORE=y -CONFIG_KEXEC_CORE=y -CONFIG_HOTPLUG_SMT=y -CONFIG_GENERIC_ENTRY=y -CONFIG_OPROFILE=m -# CONFIG_OPROFILE_EVENT_MULTIPLEX is not set -CONFIG_HAVE_OPROFILE=y -CONFIG_OPROFILE_NMI_TIMER=y -CONFIG_KPROBES=y -CONFIG_JUMP_LABEL=y -# CONFIG_STATIC_KEYS_SELFTEST is not set -# CONFIG_STATIC_CALL_SELFTEST is not set -CONFIG_OPTPROBES=y -CONFIG_KPROBES_ON_FTRACE=y -CONFIG_UPROBES=y -CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y -CONFIG_ARCH_USE_BUILTIN_BSWAP=y -CONFIG_KRETPROBES=y -CONFIG_USER_RETURN_NOTIFIER=y -CONFIG_HAVE_IOREMAP_PROT=y -CONFIG_HAVE_KPROBES=y -CONFIG_HAVE_KRETPROBES=y -CONFIG_HAVE_OPTPROBES=y -CONFIG_HAVE_KPROBES_ON_FTRACE=y -CONFIG_HAVE_FUNCTION_ERROR_INJECTION=y -CONFIG_HAVE_NMI=y -CONFIG_HAVE_ARCH_TRACEHOOK=y -CONFIG_HAVE_DMA_CONTIGUOUS=y -CONFIG_GENERIC_SMP_IDLE_THREAD=y -CONFIG_ARCH_HAS_FORTIFY_SOURCE=y -CONFIG_ARCH_HAS_SET_MEMORY=y -CONFIG_ARCH_HAS_SET_DIRECT_MAP=y -CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST=y -CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT=y -CONFIG_HAVE_ASM_MODVERSIONS=y -CONFIG_HAVE_REGS_AND_STACK_ACCESS_API=y -CONFIG_HAVE_RSEQ=y -CONFIG_HAVE_FUNCTION_ARG_ACCESS_API=y -CONFIG_HAVE_HW_BREAKPOINT=y -CONFIG_HAVE_MIXED_BREAKPOINTS_REGS=y -CONFIG_HAVE_USER_RETURN_NOTIFIER=y -CONFIG_HAVE_PERF_EVENTS_NMI=y -CONFIG_HAVE_HARDLOCKUP_DETECTOR_PERF=y -CONFIG_HAVE_PERF_REGS=y -CONFIG_HAVE_PERF_USER_STACK_DUMP=y -CONFIG_HAVE_ARCH_JUMP_LABEL=y -CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE=y -CONFIG_MMU_GATHER_TABLE_FREE=y -CONFIG_MMU_GATHER_RCU_TABLE_FREE=y -CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG=y -CONFIG_HAVE_ALIGNED_STRUCT_PAGE=y -CONFIG_HAVE_CMPXCHG_LOCAL=y -CONFIG_HAVE_CMPXCHG_DOUBLE=y 
-CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION=y -CONFIG_ARCH_WANT_OLD_COMPAT_IPC=y -CONFIG_HAVE_ARCH_SECCOMP=y -CONFIG_HAVE_ARCH_SECCOMP_FILTER=y -CONFIG_SECCOMP=y -CONFIG_SECCOMP_FILTER=y -CONFIG_HAVE_ARCH_STACKLEAK=y -CONFIG_HAVE_STACKPROTECTOR=y -CONFIG_STACKPROTECTOR=y -CONFIG_STACKPROTECTOR_STRONG=y -CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES=y -CONFIG_HAVE_CONTEXT_TRACKING=y -CONFIG_HAVE_VIRT_CPU_ACCOUNTING_GEN=y -CONFIG_HAVE_IRQ_TIME_ACCOUNTING=y -CONFIG_HAVE_MOVE_PMD=y -CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE=y -CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD=y -CONFIG_HAVE_ARCH_HUGE_VMAP=y -CONFIG_ARCH_WANT_HUGE_PMD_SHARE=y -CONFIG_HAVE_ARCH_SOFT_DIRTY=y -CONFIG_HAVE_MOD_ARCH_SPECIFIC=y -CONFIG_MODULES_USE_ELF_RELA=y -CONFIG_ARCH_HAS_ELF_RANDOMIZE=y -CONFIG_HAVE_ARCH_MMAP_RND_BITS=y -CONFIG_HAVE_EXIT_THREAD=y -CONFIG_ARCH_MMAP_RND_BITS=28 -CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS=y -CONFIG_ARCH_MMAP_RND_COMPAT_BITS=8 -CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES=y -CONFIG_HAVE_STACK_VALIDATION=y -CONFIG_HAVE_RELIABLE_STACKTRACE=y -CONFIG_ISA_BUS_API=y -CONFIG_OLD_SIGSUSPEND3=y -CONFIG_COMPAT_OLD_SIGACTION=y -CONFIG_COMPAT_32BIT_TIME=y -CONFIG_HAVE_ARCH_VMAP_STACK=y -CONFIG_VMAP_STACK=y -CONFIG_ARCH_HAS_STRICT_KERNEL_RWX=y -CONFIG_STRICT_KERNEL_RWX=y -CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y -CONFIG_STRICT_MODULE_RWX=y -CONFIG_HAVE_ARCH_PREL32_RELOCATIONS=y -CONFIG_ARCH_USE_MEMREMAP_PROT=y -CONFIG_LOCK_EVENT_COUNTS=y -CONFIG_ARCH_HAS_MEM_ENCRYPT=y -CONFIG_HAVE_STATIC_CALL=y -CONFIG_HAVE_STATIC_CALL_INLINE=y - -# -# GCOV-based kernel profiling -# -# CONFIG_GCOV_KERNEL is not set -CONFIG_ARCH_HAS_GCOV_PROFILE_ALL=y -# end of GCOV-based kernel profiling - -CONFIG_HAVE_GCC_PLUGINS=y -CONFIG_GCC_PLUGINS=y -# CONFIG_GCC_PLUGIN_CYC_COMPLEXITY is not set -# CONFIG_GCC_PLUGIN_LATENT_ENTROPY is not set -# CONFIG_GCC_PLUGIN_RANDSTRUCT is not set -# end of General architecture-dependent options - -CONFIG_RT_MUTEXES=y -CONFIG_BASE_SMALL=0 -CONFIG_MODULE_SIG_FORMAT=y -CONFIG_MODULES=y 
-CONFIG_MODULE_FORCE_LOAD=y -CONFIG_MODULE_UNLOAD=y -CONFIG_MODULE_FORCE_UNLOAD=y -# CONFIG_MODVERSIONS is not set -CONFIG_MODULE_SRCVERSION_ALL=y -CONFIG_MODULE_SIG=y -# CONFIG_MODULE_SIG_FORCE is not set -CONFIG_MODULE_SIG_ALL=y -# CONFIG_MODULE_SIG_SHA1 is not set -# CONFIG_MODULE_SIG_SHA224 is not set -# CONFIG_MODULE_SIG_SHA256 is not set -# CONFIG_MODULE_SIG_SHA384 is not set -CONFIG_MODULE_SIG_SHA512=y -CONFIG_MODULE_SIG_HASH="sha512" -CONFIG_MODULE_COMPRESS=y -# CONFIG_MODULE_COMPRESS_GZIP is not set -CONFIG_MODULE_COMPRESS_XZ=y -CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS=y -CONFIG_UNUSED_SYMBOLS=y -CONFIG_MODULES_TREE_LOOKUP=y -CONFIG_BLOCK=y -CONFIG_BLK_RQ_ALLOC_TIME=y -CONFIG_BLK_SCSI_REQUEST=y -CONFIG_BLK_CGROUP_RWSTAT=y -CONFIG_BLK_DEV_BSG=y -CONFIG_BLK_DEV_BSGLIB=y -CONFIG_BLK_DEV_INTEGRITY=y -CONFIG_BLK_DEV_INTEGRITY_T10=y -CONFIG_BLK_DEV_ZONED=y -CONFIG_BLK_DEV_THROTTLING=y -CONFIG_BLK_DEV_THROTTLING_LOW=y -# CONFIG_BLK_CMDLINE_PARSER is not set -CONFIG_BLK_WBT=y -CONFIG_BLK_CGROUP_IOLATENCY=y -CONFIG_BLK_CGROUP_IOCOST=y -CONFIG_BLK_WBT_MQ=y -CONFIG_BLK_DEBUG_FS=y -CONFIG_BLK_DEBUG_FS_ZONED=y -CONFIG_BLK_SED_OPAL=y -CONFIG_BLK_INLINE_ENCRYPTION=y -CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK=y - -# -# Partition Types -# -CONFIG_PARTITION_ADVANCED=y -# CONFIG_ACORN_PARTITION is not set -CONFIG_AIX_PARTITION=y -# CONFIG_OSF_PARTITION is not set -# CONFIG_AMIGA_PARTITION is not set -# CONFIG_ATARI_PARTITION is not set -CONFIG_MAC_PARTITION=y -CONFIG_MSDOS_PARTITION=y -CONFIG_BSD_DISKLABEL=y -CONFIG_MINIX_SUBPARTITION=y -CONFIG_SOLARIS_X86_PARTITION=y -# CONFIG_UNIXWARE_DISKLABEL is not set -CONFIG_LDM_PARTITION=y -# CONFIG_LDM_DEBUG is not set -# CONFIG_SGI_PARTITION is not set -# CONFIG_ULTRIX_PARTITION is not set -# CONFIG_SUN_PARTITION is not set -CONFIG_KARMA_PARTITION=y -CONFIG_EFI_PARTITION=y -# CONFIG_SYSV68_PARTITION is not set -# CONFIG_CMDLINE_PARTITION is not set -# end of Partition Types - -CONFIG_BLOCK_COMPAT=y -CONFIG_BLK_MQ_PCI=y 
-CONFIG_BLK_MQ_VIRTIO=y -CONFIG_BLK_MQ_RDMA=y -CONFIG_BLK_PM=y - -# -# IO Schedulers -# -CONFIG_MQ_IOSCHED_DEADLINE=y -CONFIG_MQ_IOSCHED_KYBER=y -CONFIG_IOSCHED_BFQ=y -CONFIG_BFQ_GROUP_IOSCHED=y -# CONFIG_BFQ_CGROUP_DEBUG is not set -# end of IO Schedulers - -CONFIG_PREEMPT_NOTIFIERS=y -CONFIG_PADATA=y -CONFIG_ASN1=y -CONFIG_UNINLINE_SPIN_UNLOCK=y -CONFIG_ARCH_SUPPORTS_ATOMIC_RMW=y -CONFIG_MUTEX_SPIN_ON_OWNER=y -CONFIG_RWSEM_SPIN_ON_OWNER=y -CONFIG_LOCK_SPIN_ON_OWNER=y -CONFIG_ARCH_USE_QUEUED_SPINLOCKS=y -CONFIG_QUEUED_SPINLOCKS=y -CONFIG_ARCH_USE_QUEUED_RWLOCKS=y -CONFIG_QUEUED_RWLOCKS=y -CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE=y -CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE=y -CONFIG_ARCH_HAS_SYSCALL_WRAPPER=y -CONFIG_FREEZER=y - -# -# Executable file formats -# -CONFIG_BINFMT_ELF=y -CONFIG_COMPAT_BINFMT_ELF=y -CONFIG_ELFCORE=y -CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y -CONFIG_BINFMT_SCRIPT=y -CONFIG_BINFMT_MISC=y -CONFIG_COREDUMP=y -# end of Executable file formats - -# -# Memory Management options -# -CONFIG_SELECT_MEMORY_MODEL=y -CONFIG_SPARSEMEM_MANUAL=y -CONFIG_SPARSEMEM=y -CONFIG_NEED_MULTIPLE_NODES=y -CONFIG_SPARSEMEM_EXTREME=y -CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y -CONFIG_SPARSEMEM_VMEMMAP=y -CONFIG_HAVE_FAST_GUP=y -CONFIG_NUMA_KEEP_MEMINFO=y -CONFIG_MEMORY_ISOLATION=y -CONFIG_HAVE_BOOTMEM_INFO_NODE=y -CONFIG_MEMORY_HOTPLUG=y -CONFIG_MEMORY_HOTPLUG_SPARSE=y -CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y -CONFIG_MEMORY_HOTREMOVE=y -CONFIG_SPLIT_PTLOCK_CPUS=4 -CONFIG_MEMORY_BALLOON=y -CONFIG_BALLOON_COMPACTION=y -CONFIG_COMPACTION=y -CONFIG_PAGE_REPORTING=y -CONFIG_MIGRATION=y -CONFIG_CONTIG_ALLOC=y -CONFIG_PHYS_ADDR_T_64BIT=y -CONFIG_BOUNCE=y -CONFIG_VIRT_TO_BUS=y -CONFIG_MMU_NOTIFIER=y -CONFIG_KSM=y -CONFIG_DEFAULT_MMAP_MIN_ADDR=65536 -CONFIG_ARCH_SUPPORTS_MEMORY_FAILURE=y -CONFIG_MEMORY_FAILURE=y -CONFIG_HWPOISON_INJECT=m -CONFIG_TRANSPARENT_HUGEPAGE=y -# CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS is not set -CONFIG_TRANSPARENT_HUGEPAGE_MADVISE=y 
-CONFIG_ARCH_WANTS_THP_SWAP=y -CONFIG_THP_SWAP=y -CONFIG_CLEANCACHE=y -CONFIG_FRONTSWAP=y -# CONFIG_CMA is not set -CONFIG_MEM_SOFT_DIRTY=y -CONFIG_ZSWAP=y -# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_DEFLATE is not set -# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZO is not set -# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_842 is not set -CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZ4=y -# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZ4HC is not set -# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_ZSTD is not set -CONFIG_ZSWAP_COMPRESSOR_DEFAULT="lz4" -# CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD is not set -CONFIG_ZSWAP_ZPOOL_DEFAULT_Z3FOLD=y -# CONFIG_ZSWAP_ZPOOL_DEFAULT_ZSMALLOC is not set -CONFIG_ZSWAP_ZPOOL_DEFAULT="z3fold" -CONFIG_ZSWAP_DEFAULT_ON=y -CONFIG_ZPOOL=y -CONFIG_ZBUD=y -CONFIG_Z3FOLD=y -CONFIG_ZSMALLOC=y -# CONFIG_ZSMALLOC_PGTABLE_MAPPING is not set -# CONFIG_ZSMALLOC_STAT is not set -CONFIG_GENERIC_EARLY_IOREMAP=y -# CONFIG_DEFERRED_STRUCT_PAGE_INIT is not set -# CONFIG_IDLE_PAGE_TRACKING is not set -CONFIG_ARCH_HAS_PTE_DEVMAP=y -CONFIG_ZONE_DEVICE=y -CONFIG_DEV_PAGEMAP_OPS=y -CONFIG_HMM_MIRROR=y -CONFIG_DEVICE_PRIVATE=y -CONFIG_VMAP_PFN=y -CONFIG_FRAME_VECTOR=y -CONFIG_ARCH_USES_HIGH_VMA_FLAGS=y -CONFIG_ARCH_HAS_PKEYS=y -# CONFIG_PERCPU_STATS is not set -# CONFIG_GUP_BENCHMARK is not set -CONFIG_READ_ONLY_THP_FOR_FS=y -CONFIG_ARCH_HAS_PTE_SPECIAL=y -CONFIG_MAPPING_DIRTY_HELPERS=y -# end of Memory Management options - -CONFIG_NET=y -CONFIG_COMPAT_NETLINK_MESSAGES=y -CONFIG_NET_INGRESS=y -CONFIG_NET_EGRESS=y -CONFIG_NET_REDIRECT=y -CONFIG_SKB_EXTENSIONS=y - -# -# Networking options -# -CONFIG_PACKET=y -CONFIG_PACKET_DIAG=y -CONFIG_UNIX=y -CONFIG_UNIX_SCM=y -CONFIG_UNIX_DIAG=y -CONFIG_TLS=m -CONFIG_TLS_DEVICE=y -# CONFIG_TLS_TOE is not set -CONFIG_XFRM=y -CONFIG_XFRM_OFFLOAD=y -CONFIG_XFRM_ALGO=m -CONFIG_XFRM_USER=m -# CONFIG_XFRM_USER_COMPAT is not set -CONFIG_XFRM_INTERFACE=m -CONFIG_XFRM_SUB_POLICY=y -CONFIG_XFRM_MIGRATE=y -CONFIG_XFRM_STATISTICS=y -CONFIG_XFRM_AH=m -CONFIG_XFRM_ESP=m -CONFIG_XFRM_IPCOMP=m 
-CONFIG_NET_KEY=m -CONFIG_NET_KEY_MIGRATE=y -CONFIG_XFRM_ESPINTCP=y -CONFIG_SMC=m -CONFIG_SMC_DIAG=m -CONFIG_XDP_SOCKETS=y -CONFIG_XDP_SOCKETS_DIAG=y -CONFIG_INET=y -CONFIG_IP_MULTICAST=y -CONFIG_IP_ADVANCED_ROUTER=y -# CONFIG_IP_FIB_TRIE_STATS is not set -CONFIG_IP_MULTIPLE_TABLES=y -CONFIG_IP_ROUTE_MULTIPATH=y -CONFIG_IP_ROUTE_VERBOSE=y -CONFIG_IP_ROUTE_CLASSID=y -# CONFIG_IP_PNP is not set -CONFIG_NET_IPIP=m -CONFIG_NET_IPGRE_DEMUX=m -CONFIG_NET_IP_TUNNEL=m -CONFIG_NET_IPGRE=m -# CONFIG_NET_IPGRE_BROADCAST is not set -CONFIG_IP_MROUTE_COMMON=y -CONFIG_IP_MROUTE=y -CONFIG_IP_MROUTE_MULTIPLE_TABLES=y -CONFIG_IP_PIMSM_V1=y -CONFIG_IP_PIMSM_V2=y -CONFIG_SYN_COOKIES=y -CONFIG_NET_IPVTI=m -CONFIG_NET_UDP_TUNNEL=m -CONFIG_NET_FOU=m -CONFIG_NET_FOU_IP_TUNNELS=y -CONFIG_INET_AH=m -CONFIG_INET_ESP=m -CONFIG_INET_ESP_OFFLOAD=m -CONFIG_INET_ESPINTCP=y -CONFIG_INET_IPCOMP=m -CONFIG_INET_XFRM_TUNNEL=m -CONFIG_INET_TUNNEL=m -CONFIG_INET_DIAG=m -CONFIG_INET_TCP_DIAG=m -CONFIG_INET_UDP_DIAG=m -CONFIG_INET_RAW_DIAG=m -CONFIG_INET_DIAG_DESTROY=y -CONFIG_TCP_CONG_ADVANCED=y -CONFIG_TCP_CONG_BIC=m -CONFIG_TCP_CONG_CUBIC=y -CONFIG_TCP_CONG_WESTWOOD=m -CONFIG_TCP_CONG_HTCP=m -CONFIG_TCP_CONG_HSTCP=m -CONFIG_TCP_CONG_HYBLA=m -CONFIG_TCP_CONG_VEGAS=m -CONFIG_TCP_CONG_NV=m -CONFIG_TCP_CONG_SCALABLE=m -CONFIG_TCP_CONG_LP=m -CONFIG_TCP_CONG_VENO=m -CONFIG_TCP_CONG_YEAH=m -CONFIG_TCP_CONG_ILLINOIS=m -CONFIG_TCP_CONG_DCTCP=m -CONFIG_TCP_CONG_CDG=m -CONFIG_TCP_CONG_BBR=m -CONFIG_DEFAULT_CUBIC=y -# CONFIG_DEFAULT_RENO is not set -CONFIG_DEFAULT_TCP_CONG="cubic" -CONFIG_TCP_MD5SIG=y -CONFIG_IPV6=y -CONFIG_IPV6_ROUTER_PREF=y -CONFIG_IPV6_ROUTE_INFO=y -CONFIG_IPV6_OPTIMISTIC_DAD=y -CONFIG_INET6_AH=m -CONFIG_INET6_ESP=m -CONFIG_INET6_ESP_OFFLOAD=m -CONFIG_INET6_ESPINTCP=y -CONFIG_INET6_IPCOMP=m -CONFIG_IPV6_MIP6=m -CONFIG_IPV6_ILA=m -CONFIG_INET6_XFRM_TUNNEL=m -CONFIG_INET6_TUNNEL=m -CONFIG_IPV6_VTI=m -CONFIG_IPV6_SIT=m -CONFIG_IPV6_SIT_6RD=y -CONFIG_IPV6_NDISC_NODETYPE=y -CONFIG_IPV6_TUNNEL=m 
-CONFIG_IPV6_GRE=m -CONFIG_IPV6_FOU=m -CONFIG_IPV6_FOU_TUNNEL=m -CONFIG_IPV6_MULTIPLE_TABLES=y -CONFIG_IPV6_SUBTREES=y -CONFIG_IPV6_MROUTE=y -CONFIG_IPV6_MROUTE_MULTIPLE_TABLES=y -CONFIG_IPV6_PIMSM_V2=y -CONFIG_IPV6_SEG6_LWTUNNEL=y -CONFIG_IPV6_SEG6_HMAC=y -CONFIG_IPV6_SEG6_BPF=y -CONFIG_IPV6_RPL_LWTUNNEL=y -CONFIG_NETLABEL=y -CONFIG_MPTCP=y -CONFIG_INET_MPTCP_DIAG=m -CONFIG_MPTCP_IPV6=y -CONFIG_NETWORK_SECMARK=y -CONFIG_NET_PTP_CLASSIFY=y -CONFIG_NETWORK_PHY_TIMESTAMPING=y -CONFIG_NETFILTER=y -CONFIG_NETFILTER_ADVANCED=y -CONFIG_BRIDGE_NETFILTER=m - -# -# Core Netfilter Configuration -# -CONFIG_NETFILTER_INGRESS=y -CONFIG_NETFILTER_NETLINK=m -CONFIG_NETFILTER_FAMILY_BRIDGE=y -CONFIG_NETFILTER_FAMILY_ARP=y -CONFIG_NETFILTER_NETLINK_ACCT=m -CONFIG_NETFILTER_NETLINK_QUEUE=m -CONFIG_NETFILTER_NETLINK_LOG=m -CONFIG_NETFILTER_NETLINK_OSF=m -CONFIG_NF_CONNTRACK=m -CONFIG_NF_LOG_COMMON=m -CONFIG_NF_LOG_NETDEV=m -CONFIG_NETFILTER_CONNCOUNT=m -CONFIG_NF_CONNTRACK_MARK=y -CONFIG_NF_CONNTRACK_SECMARK=y -CONFIG_NF_CONNTRACK_ZONES=y -CONFIG_NF_CONNTRACK_PROCFS=y -CONFIG_NF_CONNTRACK_EVENTS=y -CONFIG_NF_CONNTRACK_TIMEOUT=y -CONFIG_NF_CONNTRACK_TIMESTAMP=y -CONFIG_NF_CONNTRACK_LABELS=y -CONFIG_NF_CT_PROTO_DCCP=y -CONFIG_NF_CT_PROTO_GRE=y -CONFIG_NF_CT_PROTO_SCTP=y -CONFIG_NF_CT_PROTO_UDPLITE=y -CONFIG_NF_CONNTRACK_AMANDA=m -CONFIG_NF_CONNTRACK_FTP=m -CONFIG_NF_CONNTRACK_H323=m -CONFIG_NF_CONNTRACK_IRC=m -CONFIG_NF_CONNTRACK_BROADCAST=m -CONFIG_NF_CONNTRACK_NETBIOS_NS=m -CONFIG_NF_CONNTRACK_SNMP=m -CONFIG_NF_CONNTRACK_PPTP=m -CONFIG_NF_CONNTRACK_SANE=m -CONFIG_NF_CONNTRACK_SIP=m -CONFIG_NF_CONNTRACK_TFTP=m -CONFIG_NF_CT_NETLINK=m -CONFIG_NF_CT_NETLINK_TIMEOUT=m -CONFIG_NF_CT_NETLINK_HELPER=m -CONFIG_NETFILTER_NETLINK_GLUE_CT=y -CONFIG_NF_NAT=m -CONFIG_NF_NAT_AMANDA=m -CONFIG_NF_NAT_FTP=m -CONFIG_NF_NAT_IRC=m -CONFIG_NF_NAT_SIP=m -CONFIG_NF_NAT_TFTP=m -CONFIG_NF_NAT_REDIRECT=y -CONFIG_NF_NAT_MASQUERADE=y -CONFIG_NETFILTER_SYNPROXY=m -CONFIG_NF_TABLES=m -CONFIG_NF_TABLES_INET=y 
-CONFIG_NF_TABLES_NETDEV=y -CONFIG_NFT_NUMGEN=m -CONFIG_NFT_CT=m -CONFIG_NFT_FLOW_OFFLOAD=m -CONFIG_NFT_COUNTER=m -CONFIG_NFT_CONNLIMIT=m -CONFIG_NFT_LOG=m -CONFIG_NFT_LIMIT=m -CONFIG_NFT_MASQ=m -CONFIG_NFT_REDIR=m -CONFIG_NFT_NAT=m -CONFIG_NFT_TUNNEL=m -CONFIG_NFT_OBJREF=m -CONFIG_NFT_QUEUE=m -CONFIG_NFT_QUOTA=m -CONFIG_NFT_REJECT=m -CONFIG_NFT_REJECT_INET=m -CONFIG_NFT_COMPAT=m -CONFIG_NFT_HASH=m -CONFIG_NFT_FIB=m -CONFIG_NFT_FIB_INET=m -CONFIG_NFT_XFRM=m -CONFIG_NFT_SOCKET=m -CONFIG_NFT_OSF=m -CONFIG_NFT_TPROXY=m -CONFIG_NFT_SYNPROXY=m -CONFIG_NF_DUP_NETDEV=m -CONFIG_NFT_DUP_NETDEV=m -CONFIG_NFT_FWD_NETDEV=m -CONFIG_NFT_FIB_NETDEV=m -CONFIG_NF_FLOW_TABLE_INET=m -CONFIG_NF_FLOW_TABLE=m -CONFIG_NETFILTER_XTABLES=m - -# -# Xtables combined modules -# -CONFIG_NETFILTER_XT_MARK=m -CONFIG_NETFILTER_XT_CONNMARK=m -CONFIG_NETFILTER_XT_SET=m - -# -# Xtables targets -# -CONFIG_NETFILTER_XT_TARGET_AUDIT=m -CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m -CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m -CONFIG_NETFILTER_XT_TARGET_CONNMARK=m -CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m -CONFIG_NETFILTER_XT_TARGET_CT=m -CONFIG_NETFILTER_XT_TARGET_DSCP=m -CONFIG_NETFILTER_XT_TARGET_HL=m -CONFIG_NETFILTER_XT_TARGET_HMARK=m -CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m -CONFIG_NETFILTER_XT_TARGET_LED=m -CONFIG_NETFILTER_XT_TARGET_LOG=m -CONFIG_NETFILTER_XT_TARGET_MARK=m -CONFIG_NETFILTER_XT_NAT=m -CONFIG_NETFILTER_XT_TARGET_NETMAP=m -CONFIG_NETFILTER_XT_TARGET_NFLOG=m -CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m -CONFIG_NETFILTER_XT_TARGET_NOTRACK=m -CONFIG_NETFILTER_XT_TARGET_RATEEST=m -CONFIG_NETFILTER_XT_TARGET_REDIRECT=m -CONFIG_NETFILTER_XT_TARGET_MASQUERADE=m -CONFIG_NETFILTER_XT_TARGET_TEE=m -CONFIG_NETFILTER_XT_TARGET_TPROXY=m -CONFIG_NETFILTER_XT_TARGET_TRACE=m -CONFIG_NETFILTER_XT_TARGET_SECMARK=m -CONFIG_NETFILTER_XT_TARGET_TCPMSS=m -CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m - -# -# Xtables matches -# -CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m -CONFIG_NETFILTER_XT_MATCH_BPF=m 
-CONFIG_NETFILTER_XT_MATCH_CGROUP=m -CONFIG_NETFILTER_XT_MATCH_CLUSTER=m -CONFIG_NETFILTER_XT_MATCH_COMMENT=m -CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m -CONFIG_NETFILTER_XT_MATCH_CONNLABEL=m -CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m -CONFIG_NETFILTER_XT_MATCH_CONNMARK=m -CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m -CONFIG_NETFILTER_XT_MATCH_CPU=m -CONFIG_NETFILTER_XT_MATCH_DCCP=m -CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m -CONFIG_NETFILTER_XT_MATCH_DSCP=m -CONFIG_NETFILTER_XT_MATCH_ECN=m -CONFIG_NETFILTER_XT_MATCH_ESP=m -CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m -CONFIG_NETFILTER_XT_MATCH_HELPER=m -CONFIG_NETFILTER_XT_MATCH_HL=m -CONFIG_NETFILTER_XT_MATCH_IPCOMP=m -CONFIG_NETFILTER_XT_MATCH_IPRANGE=m -CONFIG_NETFILTER_XT_MATCH_IPVS=m -CONFIG_NETFILTER_XT_MATCH_L2TP=m -CONFIG_NETFILTER_XT_MATCH_LENGTH=m -CONFIG_NETFILTER_XT_MATCH_LIMIT=m -CONFIG_NETFILTER_XT_MATCH_MAC=m -CONFIG_NETFILTER_XT_MATCH_MARK=m -CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m -CONFIG_NETFILTER_XT_MATCH_NFACCT=m -CONFIG_NETFILTER_XT_MATCH_OSF=m -CONFIG_NETFILTER_XT_MATCH_OWNER=m -CONFIG_NETFILTER_XT_MATCH_POLICY=m -CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m -CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m -CONFIG_NETFILTER_XT_MATCH_QUOTA=m -CONFIG_NETFILTER_XT_MATCH_RATEEST=m -CONFIG_NETFILTER_XT_MATCH_REALM=m -CONFIG_NETFILTER_XT_MATCH_RECENT=m -CONFIG_NETFILTER_XT_MATCH_SCTP=m -CONFIG_NETFILTER_XT_MATCH_SOCKET=m -CONFIG_NETFILTER_XT_MATCH_STATE=m -CONFIG_NETFILTER_XT_MATCH_STATISTIC=m -CONFIG_NETFILTER_XT_MATCH_STRING=m -CONFIG_NETFILTER_XT_MATCH_TCPMSS=m -CONFIG_NETFILTER_XT_MATCH_TIME=m -CONFIG_NETFILTER_XT_MATCH_U32=m -# end of Core Netfilter Configuration - -CONFIG_IP_SET=m -CONFIG_IP_SET_MAX=256 -CONFIG_IP_SET_BITMAP_IP=m -CONFIG_IP_SET_BITMAP_IPMAC=m -CONFIG_IP_SET_BITMAP_PORT=m -CONFIG_IP_SET_HASH_IP=m -CONFIG_IP_SET_HASH_IPMARK=m -CONFIG_IP_SET_HASH_IPPORT=m -CONFIG_IP_SET_HASH_IPPORTIP=m -CONFIG_IP_SET_HASH_IPPORTNET=m -CONFIG_IP_SET_HASH_IPMAC=m -CONFIG_IP_SET_HASH_MAC=m -CONFIG_IP_SET_HASH_NETPORTNET=m -CONFIG_IP_SET_HASH_NET=m 
-CONFIG_IP_SET_HASH_NETNET=m -CONFIG_IP_SET_HASH_NETPORT=m -CONFIG_IP_SET_HASH_NETIFACE=m -CONFIG_IP_SET_LIST_SET=m -CONFIG_IP_VS=m -CONFIG_IP_VS_IPV6=y -# CONFIG_IP_VS_DEBUG is not set -CONFIG_IP_VS_TAB_BITS=15 - -# -# IPVS transport protocol load balancing support -# -CONFIG_IP_VS_PROTO_TCP=y -CONFIG_IP_VS_PROTO_UDP=y -CONFIG_IP_VS_PROTO_AH_ESP=y -CONFIG_IP_VS_PROTO_ESP=y -CONFIG_IP_VS_PROTO_AH=y -CONFIG_IP_VS_PROTO_SCTP=y - -# -# IPVS scheduler -# -CONFIG_IP_VS_RR=m -CONFIG_IP_VS_WRR=m -CONFIG_IP_VS_LC=m -CONFIG_IP_VS_WLC=m -CONFIG_IP_VS_FO=m -CONFIG_IP_VS_OVF=m -CONFIG_IP_VS_LBLC=m -CONFIG_IP_VS_LBLCR=m -CONFIG_IP_VS_DH=m -CONFIG_IP_VS_SH=m -CONFIG_IP_VS_MH=m -CONFIG_IP_VS_SED=m -CONFIG_IP_VS_NQ=m - -# -# IPVS SH scheduler -# -CONFIG_IP_VS_SH_TAB_BITS=8 - -# -# IPVS MH scheduler -# -CONFIG_IP_VS_MH_TAB_INDEX=12 - -# -# IPVS application helper -# -CONFIG_IP_VS_FTP=m -CONFIG_IP_VS_NFCT=y -CONFIG_IP_VS_PE_SIP=m - -# -# IP: Netfilter Configuration -# -CONFIG_NF_DEFRAG_IPV4=m -CONFIG_NF_SOCKET_IPV4=m -CONFIG_NF_TPROXY_IPV4=m -CONFIG_NF_TABLES_IPV4=y -CONFIG_NFT_REJECT_IPV4=m -CONFIG_NFT_DUP_IPV4=m -CONFIG_NFT_FIB_IPV4=m -CONFIG_NF_TABLES_ARP=y -CONFIG_NF_FLOW_TABLE_IPV4=m -CONFIG_NF_DUP_IPV4=m -CONFIG_NF_LOG_ARP=m -CONFIG_NF_LOG_IPV4=m -CONFIG_NF_REJECT_IPV4=m -CONFIG_NF_NAT_SNMP_BASIC=m -CONFIG_NF_NAT_PPTP=m -CONFIG_NF_NAT_H323=m -CONFIG_IP_NF_IPTABLES=m -CONFIG_IP_NF_MATCH_AH=m -CONFIG_IP_NF_MATCH_ECN=m -CONFIG_IP_NF_MATCH_RPFILTER=m -CONFIG_IP_NF_MATCH_TTL=m -CONFIG_IP_NF_FILTER=m -CONFIG_IP_NF_TARGET_REJECT=m -CONFIG_IP_NF_TARGET_SYNPROXY=m -CONFIG_IP_NF_NAT=m -CONFIG_IP_NF_TARGET_MASQUERADE=m -CONFIG_IP_NF_TARGET_NETMAP=m -CONFIG_IP_NF_TARGET_REDIRECT=m -CONFIG_IP_NF_MANGLE=m -CONFIG_IP_NF_TARGET_CLUSTERIP=m -CONFIG_IP_NF_TARGET_ECN=m -CONFIG_IP_NF_TARGET_TTL=m -CONFIG_IP_NF_RAW=m -CONFIG_IP_NF_SECURITY=m -CONFIG_IP_NF_ARPTABLES=m -CONFIG_IP_NF_ARPFILTER=m -CONFIG_IP_NF_ARP_MANGLE=m -# end of IP: Netfilter Configuration - -# -# IPv6: Netfilter Configuration -# 
-CONFIG_NF_SOCKET_IPV6=m -CONFIG_NF_TPROXY_IPV6=m -CONFIG_NF_TABLES_IPV6=y -CONFIG_NFT_REJECT_IPV6=m -CONFIG_NFT_DUP_IPV6=m -CONFIG_NFT_FIB_IPV6=m -CONFIG_NF_FLOW_TABLE_IPV6=m -CONFIG_NF_DUP_IPV6=m -CONFIG_NF_REJECT_IPV6=m -CONFIG_NF_LOG_IPV6=m -CONFIG_IP6_NF_IPTABLES=m -CONFIG_IP6_NF_MATCH_AH=m -CONFIG_IP6_NF_MATCH_EUI64=m -CONFIG_IP6_NF_MATCH_FRAG=m -CONFIG_IP6_NF_MATCH_OPTS=m -CONFIG_IP6_NF_MATCH_HL=m -CONFIG_IP6_NF_MATCH_IPV6HEADER=m -CONFIG_IP6_NF_MATCH_MH=m -CONFIG_IP6_NF_MATCH_RPFILTER=m -CONFIG_IP6_NF_MATCH_RT=m -CONFIG_IP6_NF_MATCH_SRH=m -CONFIG_IP6_NF_TARGET_HL=m -CONFIG_IP6_NF_FILTER=m -CONFIG_IP6_NF_TARGET_REJECT=m -CONFIG_IP6_NF_TARGET_SYNPROXY=m -CONFIG_IP6_NF_MANGLE=m -CONFIG_IP6_NF_RAW=m -CONFIG_IP6_NF_SECURITY=m -CONFIG_IP6_NF_NAT=m -CONFIG_IP6_NF_TARGET_MASQUERADE=m -CONFIG_IP6_NF_TARGET_NPT=m -# end of IPv6: Netfilter Configuration - -CONFIG_NF_DEFRAG_IPV6=m -CONFIG_NF_TABLES_BRIDGE=m -CONFIG_NFT_BRIDGE_META=m -CONFIG_NFT_BRIDGE_REJECT=m -CONFIG_NF_LOG_BRIDGE=m -CONFIG_NF_CONNTRACK_BRIDGE=m -CONFIG_BRIDGE_NF_EBTABLES=m -CONFIG_BRIDGE_EBT_BROUTE=m -CONFIG_BRIDGE_EBT_T_FILTER=m -CONFIG_BRIDGE_EBT_T_NAT=m -CONFIG_BRIDGE_EBT_802_3=m -CONFIG_BRIDGE_EBT_AMONG=m -CONFIG_BRIDGE_EBT_ARP=m -CONFIG_BRIDGE_EBT_IP=m -CONFIG_BRIDGE_EBT_IP6=m -CONFIG_BRIDGE_EBT_LIMIT=m -CONFIG_BRIDGE_EBT_MARK=m -CONFIG_BRIDGE_EBT_PKTTYPE=m -CONFIG_BRIDGE_EBT_STP=m -CONFIG_BRIDGE_EBT_VLAN=m -CONFIG_BRIDGE_EBT_ARPREPLY=m -CONFIG_BRIDGE_EBT_DNAT=m -CONFIG_BRIDGE_EBT_MARK_T=m -CONFIG_BRIDGE_EBT_REDIRECT=m -CONFIG_BRIDGE_EBT_SNAT=m -CONFIG_BRIDGE_EBT_LOG=m -CONFIG_BRIDGE_EBT_NFLOG=m -# CONFIG_BPFILTER is not set -CONFIG_IP_DCCP=m -CONFIG_INET_DCCP_DIAG=m - -# -# DCCP CCIDs Configuration -# -# CONFIG_IP_DCCP_CCID2_DEBUG is not set -CONFIG_IP_DCCP_CCID3=y -# CONFIG_IP_DCCP_CCID3_DEBUG is not set -CONFIG_IP_DCCP_TFRC_LIB=y -# end of DCCP CCIDs Configuration - -# -# DCCP Kernel Hacking -# -# CONFIG_IP_DCCP_DEBUG is not set -# end of DCCP Kernel Hacking - -CONFIG_IP_SCTP=m -# 
CONFIG_SCTP_DBG_OBJCNT is not set -# CONFIG_SCTP_DEFAULT_COOKIE_HMAC_MD5 is not set -CONFIG_SCTP_DEFAULT_COOKIE_HMAC_SHA1=y -# CONFIG_SCTP_DEFAULT_COOKIE_HMAC_NONE is not set -CONFIG_SCTP_COOKIE_HMAC_MD5=y -CONFIG_SCTP_COOKIE_HMAC_SHA1=y -CONFIG_INET_SCTP_DIAG=m -CONFIG_RDS=m -CONFIG_RDS_RDMA=m -CONFIG_RDS_TCP=m -# CONFIG_RDS_DEBUG is not set -CONFIG_TIPC=m -CONFIG_TIPC_MEDIA_IB=y -CONFIG_TIPC_MEDIA_UDP=y -CONFIG_TIPC_CRYPTO=y -CONFIG_TIPC_DIAG=m -CONFIG_ATM=m -CONFIG_ATM_CLIP=m -# CONFIG_ATM_CLIP_NO_ICMP is not set -CONFIG_ATM_LANE=m -CONFIG_ATM_MPOA=m -CONFIG_ATM_BR2684=m -# CONFIG_ATM_BR2684_IPFILTER is not set -CONFIG_L2TP=m -# CONFIG_L2TP_DEBUGFS is not set -CONFIG_L2TP_V3=y -CONFIG_L2TP_IP=m -CONFIG_L2TP_ETH=m -CONFIG_STP=m -CONFIG_GARP=m -CONFIG_MRP=m -CONFIG_BRIDGE=m -CONFIG_BRIDGE_IGMP_SNOOPING=y -CONFIG_BRIDGE_VLAN_FILTERING=y -CONFIG_BRIDGE_MRP=y -CONFIG_HAVE_NET_DSA=y -CONFIG_NET_DSA=m -CONFIG_NET_DSA_TAG_8021Q=m -CONFIG_NET_DSA_TAG_AR9331=m -CONFIG_NET_DSA_TAG_BRCM_COMMON=m -CONFIG_NET_DSA_TAG_BRCM=m -CONFIG_NET_DSA_TAG_BRCM_PREPEND=m -CONFIG_NET_DSA_TAG_GSWIP=m -CONFIG_NET_DSA_TAG_DSA=m -CONFIG_NET_DSA_TAG_EDSA=m -CONFIG_NET_DSA_TAG_MTK=m -CONFIG_NET_DSA_TAG_KSZ=m -CONFIG_NET_DSA_TAG_RTL4_A=m -CONFIG_NET_DSA_TAG_OCELOT=m -CONFIG_NET_DSA_TAG_QCA=m -CONFIG_NET_DSA_TAG_LAN9303=m -CONFIG_NET_DSA_TAG_SJA1105=m -CONFIG_NET_DSA_TAG_TRAILER=m -CONFIG_VLAN_8021Q=m -CONFIG_VLAN_8021Q_GVRP=y -CONFIG_VLAN_8021Q_MVRP=y -# CONFIG_DECNET is not set -CONFIG_LLC=m -CONFIG_LLC2=m -CONFIG_ATALK=m -CONFIG_DEV_APPLETALK=m -CONFIG_IPDDP=m -CONFIG_IPDDP_ENCAP=y -# CONFIG_X25 is not set -# CONFIG_LAPB is not set -CONFIG_PHONET=m -CONFIG_6LOWPAN=m -# CONFIG_6LOWPAN_DEBUGFS is not set -CONFIG_6LOWPAN_NHC=m -CONFIG_6LOWPAN_NHC_DEST=m -CONFIG_6LOWPAN_NHC_FRAGMENT=m -CONFIG_6LOWPAN_NHC_HOP=m -CONFIG_6LOWPAN_NHC_IPV6=m -CONFIG_6LOWPAN_NHC_MOBILITY=m -CONFIG_6LOWPAN_NHC_ROUTING=m -CONFIG_6LOWPAN_NHC_UDP=m -CONFIG_6LOWPAN_GHC_EXT_HDR_HOP=m -CONFIG_6LOWPAN_GHC_UDP=m 
-CONFIG_6LOWPAN_GHC_ICMPV6=m -CONFIG_6LOWPAN_GHC_EXT_HDR_DEST=m -CONFIG_6LOWPAN_GHC_EXT_HDR_FRAG=m -CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE=m -CONFIG_IEEE802154=m -CONFIG_IEEE802154_NL802154_EXPERIMENTAL=y -CONFIG_IEEE802154_SOCKET=m -CONFIG_IEEE802154_6LOWPAN=m -CONFIG_MAC802154=m -CONFIG_NET_SCHED=y - -# -# Queueing/Scheduling -# -CONFIG_NET_SCH_CBQ=m -CONFIG_NET_SCH_HTB=m -CONFIG_NET_SCH_HFSC=m -CONFIG_NET_SCH_ATM=m -CONFIG_NET_SCH_PRIO=m -CONFIG_NET_SCH_MULTIQ=m -CONFIG_NET_SCH_RED=m -CONFIG_NET_SCH_SFB=m -CONFIG_NET_SCH_SFQ=m -CONFIG_NET_SCH_TEQL=m -CONFIG_NET_SCH_TBF=m -CONFIG_NET_SCH_CBS=m -CONFIG_NET_SCH_ETF=m -CONFIG_NET_SCH_TAPRIO=m -CONFIG_NET_SCH_GRED=m -CONFIG_NET_SCH_DSMARK=m -CONFIG_NET_SCH_NETEM=m -CONFIG_NET_SCH_DRR=m -CONFIG_NET_SCH_MQPRIO=m -CONFIG_NET_SCH_SKBPRIO=m -CONFIG_NET_SCH_CHOKE=m -CONFIG_NET_SCH_QFQ=m -CONFIG_NET_SCH_CODEL=m -CONFIG_NET_SCH_FQ_CODEL=y -CONFIG_NET_SCH_CAKE=m -CONFIG_NET_SCH_FQ=m -CONFIG_NET_SCH_HHF=m -CONFIG_NET_SCH_PIE=m -CONFIG_NET_SCH_FQ_PIE=m -CONFIG_NET_SCH_INGRESS=m -CONFIG_NET_SCH_PLUG=m -CONFIG_NET_SCH_ETS=m -CONFIG_NET_SCH_DEFAULT=y -# CONFIG_DEFAULT_FQ is not set -# CONFIG_DEFAULT_CODEL is not set -CONFIG_DEFAULT_FQ_CODEL=y -# CONFIG_DEFAULT_FQ_PIE is not set -# CONFIG_DEFAULT_SFQ is not set -# CONFIG_DEFAULT_PFIFO_FAST is not set -CONFIG_DEFAULT_NET_SCH="fq_codel" - -# -# Classification -# -CONFIG_NET_CLS=y -CONFIG_NET_CLS_BASIC=m -CONFIG_NET_CLS_TCINDEX=m -CONFIG_NET_CLS_ROUTE4=m -CONFIG_NET_CLS_FW=m -CONFIG_NET_CLS_U32=m -CONFIG_CLS_U32_PERF=y -CONFIG_CLS_U32_MARK=y -CONFIG_NET_CLS_RSVP=m -CONFIG_NET_CLS_RSVP6=m -CONFIG_NET_CLS_FLOW=m -CONFIG_NET_CLS_CGROUP=m -CONFIG_NET_CLS_BPF=m -CONFIG_NET_CLS_FLOWER=m -CONFIG_NET_CLS_MATCHALL=m -CONFIG_NET_EMATCH=y -CONFIG_NET_EMATCH_STACK=32 -CONFIG_NET_EMATCH_CMP=m -CONFIG_NET_EMATCH_NBYTE=m -CONFIG_NET_EMATCH_U32=m -CONFIG_NET_EMATCH_META=m -CONFIG_NET_EMATCH_TEXT=m -CONFIG_NET_EMATCH_CANID=m -CONFIG_NET_EMATCH_IPSET=m -CONFIG_NET_EMATCH_IPT=m -CONFIG_NET_CLS_ACT=y 
-CONFIG_NET_ACT_POLICE=m -CONFIG_NET_ACT_GACT=m -CONFIG_GACT_PROB=y -CONFIG_NET_ACT_MIRRED=m -CONFIG_NET_ACT_SAMPLE=m -CONFIG_NET_ACT_IPT=m -CONFIG_NET_ACT_NAT=m -CONFIG_NET_ACT_PEDIT=m -CONFIG_NET_ACT_SIMP=m -CONFIG_NET_ACT_SKBEDIT=m -CONFIG_NET_ACT_CSUM=m -CONFIG_NET_ACT_MPLS=m -CONFIG_NET_ACT_VLAN=m -CONFIG_NET_ACT_BPF=m -CONFIG_NET_ACT_CONNMARK=m -CONFIG_NET_ACT_CTINFO=m -CONFIG_NET_ACT_SKBMOD=m -CONFIG_NET_ACT_IFE=m -CONFIG_NET_ACT_TUNNEL_KEY=m -CONFIG_NET_ACT_CT=m -CONFIG_NET_ACT_GATE=m -CONFIG_NET_IFE_SKBMARK=m -CONFIG_NET_IFE_SKBPRIO=m -CONFIG_NET_IFE_SKBTCINDEX=m -CONFIG_NET_TC_SKB_EXT=y -CONFIG_NET_SCH_FIFO=y -CONFIG_DCB=y -CONFIG_DNS_RESOLVER=m -CONFIG_BATMAN_ADV=m -CONFIG_BATMAN_ADV_BATMAN_V=y -CONFIG_BATMAN_ADV_BLA=y -CONFIG_BATMAN_ADV_DAT=y -CONFIG_BATMAN_ADV_NC=y -CONFIG_BATMAN_ADV_MCAST=y -CONFIG_BATMAN_ADV_DEBUGFS=y -# CONFIG_BATMAN_ADV_DEBUG is not set -CONFIG_BATMAN_ADV_SYSFS=y -# CONFIG_BATMAN_ADV_TRACING is not set -CONFIG_OPENVSWITCH=m -CONFIG_OPENVSWITCH_GRE=m -CONFIG_OPENVSWITCH_VXLAN=m -CONFIG_OPENVSWITCH_GENEVE=m -CONFIG_VSOCKETS=m -CONFIG_VSOCKETS_DIAG=m -CONFIG_VSOCKETS_LOOPBACK=m -CONFIG_VMWARE_VMCI_VSOCKETS=m -CONFIG_VIRTIO_VSOCKETS=m -CONFIG_VIRTIO_VSOCKETS_COMMON=m -CONFIG_HYPERV_VSOCKETS=m -CONFIG_NETLINK_DIAG=m -CONFIG_MPLS=y -CONFIG_NET_MPLS_GSO=m -CONFIG_MPLS_ROUTING=m -CONFIG_MPLS_IPTUNNEL=m -CONFIG_NET_NSH=m -CONFIG_HSR=m -CONFIG_NET_SWITCHDEV=y -CONFIG_NET_L3_MASTER_DEV=y -CONFIG_QRTR=m -CONFIG_QRTR_SMD=m -CONFIG_QRTR_TUN=m -CONFIG_QRTR_MHI=m -CONFIG_NET_NCSI=y -CONFIG_NCSI_OEM_CMD_GET_MAC=y -CONFIG_RPS=y -CONFIG_RFS_ACCEL=y -CONFIG_XPS=y -CONFIG_CGROUP_NET_PRIO=y -CONFIG_CGROUP_NET_CLASSID=y -CONFIG_NET_RX_BUSY_POLL=y -CONFIG_BQL=y -CONFIG_BPF_JIT=y -CONFIG_BPF_STREAM_PARSER=y -CONFIG_NET_FLOW_LIMIT=y - -# -# Network testing -# -CONFIG_NET_PKTGEN=m -CONFIG_NET_DROP_MONITOR=y -# end of Network testing -# end of Networking options - -CONFIG_HAMRADIO=y - -# -# Packet Radio protocols -# -CONFIG_AX25=m -CONFIG_AX25_DAMA_SLAVE=y 
-CONFIG_NETROM=m -CONFIG_ROSE=m - -# -# AX.25 network device drivers -# -CONFIG_MKISS=m -CONFIG_6PACK=m -CONFIG_BPQETHER=m -CONFIG_BAYCOM_SER_FDX=m -CONFIG_BAYCOM_SER_HDX=m -CONFIG_BAYCOM_PAR=m -CONFIG_YAM=m -# end of AX.25 network device drivers - -CONFIG_CAN=m -CONFIG_CAN_RAW=m -CONFIG_CAN_BCM=m -CONFIG_CAN_GW=m -CONFIG_CAN_J1939=m -# CONFIG_CAN_ISOTP is not set - -# -# CAN Device Drivers -# -CONFIG_CAN_VCAN=m -CONFIG_CAN_VXCAN=m -CONFIG_CAN_SLCAN=m -CONFIG_CAN_DEV=m -CONFIG_CAN_CALC_BITTIMING=y -CONFIG_CAN_FLEXCAN=m -CONFIG_CAN_GRCAN=m -CONFIG_CAN_JANZ_ICAN3=m -CONFIG_CAN_KVASER_PCIEFD=m -CONFIG_CAN_C_CAN=m -CONFIG_CAN_C_CAN_PLATFORM=m -CONFIG_CAN_C_CAN_PCI=m -CONFIG_CAN_CC770=m -# CONFIG_CAN_CC770_ISA is not set -CONFIG_CAN_CC770_PLATFORM=m -CONFIG_CAN_IFI_CANFD=m -CONFIG_CAN_M_CAN=m -CONFIG_CAN_M_CAN_PLATFORM=m -CONFIG_CAN_M_CAN_TCAN4X5X=m -CONFIG_CAN_PEAK_PCIEFD=m -CONFIG_CAN_SJA1000=m -CONFIG_CAN_EMS_PCI=m -# CONFIG_CAN_EMS_PCMCIA is not set -CONFIG_CAN_F81601=m -CONFIG_CAN_KVASER_PCI=m -CONFIG_CAN_PEAK_PCI=m -CONFIG_CAN_PEAK_PCIEC=y -CONFIG_CAN_PEAK_PCMCIA=m -CONFIG_CAN_PLX_PCI=m -# CONFIG_CAN_SJA1000_ISA is not set -CONFIG_CAN_SJA1000_PLATFORM=m -CONFIG_CAN_SOFTING=m -CONFIG_CAN_SOFTING_CS=m - -# -# CAN SPI interfaces -# -CONFIG_CAN_HI311X=m -CONFIG_CAN_MCP251X=m -# CONFIG_CAN_MCP251XFD is not set -# end of CAN SPI interfaces - -# -# CAN USB interfaces -# -CONFIG_CAN_8DEV_USB=m -CONFIG_CAN_EMS_USB=m -CONFIG_CAN_ESD_USB2=m -CONFIG_CAN_GS_USB=m -CONFIG_CAN_KVASER_USB=m -CONFIG_CAN_MCBA_USB=m -CONFIG_CAN_PEAK_USB=m -CONFIG_CAN_UCAN=m -# end of CAN USB interfaces - -# CONFIG_CAN_DEBUG_DEVICES is not set -# end of CAN Device Drivers - -CONFIG_BT=m -CONFIG_BT_BREDR=y -CONFIG_BT_RFCOMM=m -CONFIG_BT_RFCOMM_TTY=y -CONFIG_BT_BNEP=m -CONFIG_BT_BNEP_MC_FILTER=y -CONFIG_BT_BNEP_PROTO_FILTER=y -CONFIG_BT_CMTP=m -CONFIG_BT_HIDP=m -CONFIG_BT_HS=y -CONFIG_BT_LE=y -CONFIG_BT_6LOWPAN=m -CONFIG_BT_LEDS=y -CONFIG_BT_MSFTEXT=y -CONFIG_BT_DEBUGFS=y -# CONFIG_BT_SELFTEST is not 
set - -# -# Bluetooth device drivers -# -CONFIG_BT_INTEL=m -CONFIG_BT_BCM=m -CONFIG_BT_RTL=m -CONFIG_BT_QCA=m -CONFIG_BT_HCIBTUSB=m -CONFIG_BT_HCIBTUSB_AUTOSUSPEND=y -CONFIG_BT_HCIBTUSB_BCM=y -CONFIG_BT_HCIBTUSB_MTK=y -CONFIG_BT_HCIBTUSB_RTL=y -CONFIG_BT_HCIBTSDIO=m -CONFIG_BT_HCIUART=m -CONFIG_BT_HCIUART_SERDEV=y -CONFIG_BT_HCIUART_H4=y -CONFIG_BT_HCIUART_NOKIA=m -CONFIG_BT_HCIUART_BCSP=y -CONFIG_BT_HCIUART_ATH3K=y -CONFIG_BT_HCIUART_LL=y -CONFIG_BT_HCIUART_3WIRE=y -CONFIG_BT_HCIUART_INTEL=y -CONFIG_BT_HCIUART_BCM=y -CONFIG_BT_HCIUART_RTL=y -CONFIG_BT_HCIUART_QCA=y -CONFIG_BT_HCIUART_AG6XX=y -CONFIG_BT_HCIUART_MRVL=y -CONFIG_BT_HCIBCM203X=m -CONFIG_BT_HCIBPA10X=m -CONFIG_BT_HCIBFUSB=m -CONFIG_BT_HCIDTL1=m -CONFIG_BT_HCIBT3C=m -CONFIG_BT_HCIBLUECARD=m -CONFIG_BT_HCIVHCI=m -CONFIG_BT_MRVL=m -CONFIG_BT_MRVL_SDIO=m -CONFIG_BT_ATH3K=m -CONFIG_BT_MTKSDIO=m -CONFIG_BT_MTKUART=m -CONFIG_BT_HCIRSI=m -# end of Bluetooth device drivers - -CONFIG_AF_RXRPC=m -CONFIG_AF_RXRPC_IPV6=y -# CONFIG_AF_RXRPC_INJECT_LOSS is not set -CONFIG_AF_RXRPC_DEBUG=y -CONFIG_RXKAD=y -CONFIG_AF_KCM=m -CONFIG_STREAM_PARSER=y -CONFIG_FIB_RULES=y -CONFIG_WIRELESS=y -CONFIG_WIRELESS_EXT=y -CONFIG_WEXT_CORE=y -CONFIG_WEXT_PROC=y -CONFIG_WEXT_SPY=y -CONFIG_WEXT_PRIV=y -CONFIG_CFG80211=m -# CONFIG_NL80211_TESTMODE is not set -# CONFIG_CFG80211_DEVELOPER_WARNINGS is not set -# CONFIG_CFG80211_CERTIFICATION_ONUS is not set -CONFIG_CFG80211_REQUIRE_SIGNED_REGDB=y -CONFIG_CFG80211_USE_KERNEL_REGDB_KEYS=y -CONFIG_CFG80211_DEFAULT_PS=y -CONFIG_CFG80211_DEBUGFS=y -CONFIG_CFG80211_CRDA_SUPPORT=y -CONFIG_CFG80211_WEXT=y -CONFIG_CFG80211_WEXT_EXPORT=y -CONFIG_LIB80211=m -CONFIG_LIB80211_CRYPT_WEP=m -CONFIG_LIB80211_CRYPT_CCMP=m -CONFIG_LIB80211_CRYPT_TKIP=m -# CONFIG_LIB80211_DEBUG is not set -CONFIG_MAC80211=m -CONFIG_MAC80211_HAS_RC=y -CONFIG_MAC80211_RC_MINSTREL=y -CONFIG_MAC80211_RC_DEFAULT_MINSTREL=y -CONFIG_MAC80211_RC_DEFAULT="minstrel_ht" -CONFIG_MAC80211_MESH=y -CONFIG_MAC80211_LEDS=y 
-CONFIG_MAC80211_DEBUGFS=y -# CONFIG_MAC80211_MESSAGE_TRACING is not set -# CONFIG_MAC80211_DEBUG_MENU is not set -CONFIG_MAC80211_STA_HASH_MAX_SIZE=0 -CONFIG_WIMAX=m -CONFIG_WIMAX_DEBUG_LEVEL=8 -CONFIG_RFKILL=m -CONFIG_RFKILL_LEDS=y -CONFIG_RFKILL_INPUT=y -CONFIG_RFKILL_GPIO=m -CONFIG_NET_9P=m -CONFIG_NET_9P_VIRTIO=m -CONFIG_NET_9P_XEN=m -CONFIG_NET_9P_RDMA=m -# CONFIG_NET_9P_DEBUG is not set -CONFIG_CAIF=m -# CONFIG_CAIF_DEBUG is not set -CONFIG_CAIF_NETDEV=m -CONFIG_CAIF_USB=m -CONFIG_CEPH_LIB=m -CONFIG_CEPH_LIB_PRETTYDEBUG=y -CONFIG_CEPH_LIB_USE_DNS_RESOLVER=y -CONFIG_NFC=m -CONFIG_NFC_DIGITAL=m -CONFIG_NFC_NCI=m -CONFIG_NFC_NCI_SPI=m -CONFIG_NFC_NCI_UART=m -CONFIG_NFC_HCI=m -CONFIG_NFC_SHDLC=y - -# -# Near Field Communication (NFC) devices -# -CONFIG_NFC_TRF7970A=m -CONFIG_NFC_MEI_PHY=m -CONFIG_NFC_SIM=m -CONFIG_NFC_PORT100=m -CONFIG_NFC_FDP=m -CONFIG_NFC_FDP_I2C=m -CONFIG_NFC_PN544=m -CONFIG_NFC_PN544_I2C=m -CONFIG_NFC_PN544_MEI=m -CONFIG_NFC_PN533=m -CONFIG_NFC_PN533_USB=m -CONFIG_NFC_PN533_I2C=m -CONFIG_NFC_PN532_UART=m -CONFIG_NFC_MICROREAD=m -CONFIG_NFC_MICROREAD_I2C=m -CONFIG_NFC_MICROREAD_MEI=m -CONFIG_NFC_MRVL=m -CONFIG_NFC_MRVL_USB=m -CONFIG_NFC_MRVL_UART=m -CONFIG_NFC_MRVL_I2C=m -CONFIG_NFC_MRVL_SPI=m -CONFIG_NFC_ST21NFCA=m -CONFIG_NFC_ST21NFCA_I2C=m -CONFIG_NFC_ST_NCI=m -CONFIG_NFC_ST_NCI_I2C=m -CONFIG_NFC_ST_NCI_SPI=m -CONFIG_NFC_NXP_NCI=m -CONFIG_NFC_NXP_NCI_I2C=m -CONFIG_NFC_S3FWRN5=m -CONFIG_NFC_S3FWRN5_I2C=m -CONFIG_NFC_ST95HF=m -# end of Near Field Communication (NFC) devices - -CONFIG_PSAMPLE=m -CONFIG_NET_IFE=m -CONFIG_LWTUNNEL=y -CONFIG_LWTUNNEL_BPF=y -CONFIG_DST_CACHE=y -CONFIG_GRO_CELLS=y -CONFIG_SOCK_VALIDATE_XMIT=y -CONFIG_NET_SOCK_MSG=y -CONFIG_NET_DEVLINK=y -CONFIG_PAGE_POOL=y -CONFIG_FAILOVER=m -CONFIG_ETHTOOL_NETLINK=y -CONFIG_HAVE_EBPF_JIT=y - -# -# Device Drivers -# -CONFIG_HAVE_EISA=y -# CONFIG_EISA is not set -CONFIG_HAVE_PCI=y -CONFIG_PCI=y -CONFIG_PCI_DOMAINS=y -CONFIG_PCIEPORTBUS=y -CONFIG_HOTPLUG_PCI_PCIE=y -CONFIG_PCIEAER=y 
-# CONFIG_PCIEAER_INJECT is not set -CONFIG_PCIE_ECRC=y -CONFIG_PCIEASPM=y -CONFIG_PCIEASPM_DEFAULT=y -# CONFIG_PCIEASPM_POWERSAVE is not set -# CONFIG_PCIEASPM_POWER_SUPERSAVE is not set -# CONFIG_PCIEASPM_PERFORMANCE is not set -CONFIG_PCIE_PME=y -CONFIG_PCIE_DPC=y -CONFIG_PCIE_PTM=y -# CONFIG_PCIE_BW is not set -CONFIG_PCIE_EDR=y -CONFIG_PCI_MSI=y -CONFIG_PCI_MSI_IRQ_DOMAIN=y -CONFIG_PCI_MSI_ARCH_FALLBACKS=y -CONFIG_PCI_QUIRKS=y -# CONFIG_PCI_DEBUG is not set -CONFIG_PCI_REALLOC_ENABLE_AUTO=y -CONFIG_PCI_STUB=y -CONFIG_PCI_PF_STUB=m -CONFIG_XEN_PCIDEV_FRONTEND=m -CONFIG_PCI_ATS=y -CONFIG_PCI_ECAM=y -CONFIG_PCI_LOCKLESS_CONFIG=y -CONFIG_PCI_IOV=y -CONFIG_PCI_PRI=y -CONFIG_PCI_PASID=y -CONFIG_PCI_P2PDMA=y -CONFIG_PCI_LABEL=y -CONFIG_PCI_HYPERV=m -# CONFIG_PCIE_BUS_TUNE_OFF is not set -CONFIG_PCIE_BUS_DEFAULT=y -# CONFIG_PCIE_BUS_SAFE is not set -# CONFIG_PCIE_BUS_PERFORMANCE is not set -# CONFIG_PCIE_BUS_PEER2PEER is not set -CONFIG_HOTPLUG_PCI=y -CONFIG_HOTPLUG_PCI_ACPI=y -CONFIG_HOTPLUG_PCI_ACPI_IBM=m -CONFIG_HOTPLUG_PCI_CPCI=y -CONFIG_HOTPLUG_PCI_CPCI_ZT5550=m -CONFIG_HOTPLUG_PCI_CPCI_GENERIC=m -CONFIG_HOTPLUG_PCI_SHPC=y - -# -# PCI controller drivers -# -CONFIG_PCI_FTPCI100=y -CONFIG_PCI_HOST_COMMON=y -CONFIG_PCI_HOST_GENERIC=y -CONFIG_PCIE_XILINX=y -CONFIG_VMD=m -CONFIG_PCI_HYPERV_INTERFACE=m - -# -# DesignWare PCI Core Support -# -CONFIG_PCIE_DW=y -CONFIG_PCIE_DW_HOST=y -CONFIG_PCIE_DW_EP=y -CONFIG_PCIE_DW_PLAT=y -CONFIG_PCIE_DW_PLAT_HOST=y -CONFIG_PCIE_DW_PLAT_EP=y -CONFIG_PCIE_INTEL_GW=y -CONFIG_PCI_MESON=y -# end of DesignWare PCI Core Support - -# -# Mobiveil PCIe Core Support -# -# end of Mobiveil PCIe Core Support - -# -# Cadence PCIe controllers support -# -CONFIG_PCIE_CADENCE=y -CONFIG_PCIE_CADENCE_HOST=y -CONFIG_PCIE_CADENCE_EP=y -CONFIG_PCIE_CADENCE_PLAT=y -CONFIG_PCIE_CADENCE_PLAT_HOST=y -CONFIG_PCIE_CADENCE_PLAT_EP=y -# CONFIG_PCI_J721E_HOST is not set -# CONFIG_PCI_J721E_EP is not set -# end of Cadence PCIe controllers support -# end of PCI 
controller drivers - -# -# PCI Endpoint -# -CONFIG_PCI_ENDPOINT=y -CONFIG_PCI_ENDPOINT_CONFIGFS=y -# CONFIG_PCI_EPF_TEST is not set -# end of PCI Endpoint - -# -# PCI switch controller drivers -# -CONFIG_PCI_SW_SWITCHTEC=m -# end of PCI switch controller drivers - -CONFIG_PCCARD=m -CONFIG_PCMCIA=m -CONFIG_PCMCIA_LOAD_CIS=y -CONFIG_CARDBUS=y - -# -# PC-card bridges -# -CONFIG_YENTA=m -CONFIG_YENTA_O2=y -CONFIG_YENTA_RICOH=y -CONFIG_YENTA_TI=y -CONFIG_YENTA_ENE_TUNE=y -CONFIG_YENTA_TOSHIBA=y -CONFIG_PD6729=m -CONFIG_I82092=m -CONFIG_PCCARD_NONSTATIC=y -CONFIG_RAPIDIO=m -CONFIG_RAPIDIO_TSI721=m -CONFIG_RAPIDIO_DISC_TIMEOUT=30 -CONFIG_RAPIDIO_ENABLE_RX_TX_PORTS=y -CONFIG_RAPIDIO_DMA_ENGINE=y -# CONFIG_RAPIDIO_DEBUG is not set -CONFIG_RAPIDIO_ENUM_BASIC=m -CONFIG_RAPIDIO_CHMAN=m -CONFIG_RAPIDIO_MPORT_CDEV=m - -# -# RapidIO Switch drivers -# -CONFIG_RAPIDIO_TSI57X=m -CONFIG_RAPIDIO_CPS_XX=m -CONFIG_RAPIDIO_TSI568=m -CONFIG_RAPIDIO_CPS_GEN2=m -CONFIG_RAPIDIO_RXS_GEN3=m -# end of RapidIO Switch drivers - -# -# Generic Driver Options -# -# CONFIG_UEVENT_HELPER is not set -CONFIG_DEVTMPFS=y -CONFIG_DEVTMPFS_MOUNT=y -CONFIG_STANDALONE=y -CONFIG_PREVENT_FIRMWARE_BUILD=y - -# -# Firmware loader -# -CONFIG_FW_LOADER=y -CONFIG_FW_LOADER_PAGED_BUF=y -CONFIG_EXTRA_FIRMWARE="" -# CONFIG_FW_LOADER_USER_HELPER is not set -CONFIG_FW_LOADER_COMPRESS=y -CONFIG_FW_CACHE=y -# end of Firmware loader - -CONFIG_WANT_DEV_COREDUMP=y -CONFIG_ALLOW_DEV_COREDUMP=y -CONFIG_DEV_COREDUMP=y -# CONFIG_DEBUG_DRIVER is not set -# CONFIG_DEBUG_DEVRES is not set -# CONFIG_DEBUG_TEST_DRIVER_REMOVE is not set -CONFIG_HMEM_REPORTING=y -# CONFIG_TEST_ASYNC_DRIVER_PROBE is not set -CONFIG_SYS_HYPERVISOR=y -CONFIG_GENERIC_CPU_AUTOPROBE=y -CONFIG_GENERIC_CPU_VULNERABILITIES=y -CONFIG_REGMAP=y -CONFIG_REGMAP_I2C=y -CONFIG_REGMAP_SLIMBUS=m -CONFIG_REGMAP_SPI=y -CONFIG_REGMAP_SPMI=m -CONFIG_REGMAP_W1=m -CONFIG_REGMAP_MMIO=y -CONFIG_REGMAP_IRQ=y -CONFIG_REGMAP_SOUNDWIRE=m -CONFIG_REGMAP_SCCB=m -CONFIG_REGMAP_I3C=m 
-CONFIG_DMA_SHARED_BUFFER=y -# CONFIG_DMA_FENCE_TRACE is not set -# end of Generic Driver Options - -# -# Bus devices -# -CONFIG_MOXTET=m -CONFIG_SIMPLE_PM_BUS=y -CONFIG_MHI_BUS=m -# CONFIG_MHI_BUS_DEBUG is not set -# end of Bus devices - -CONFIG_CONNECTOR=y -CONFIG_PROC_EVENTS=y -CONFIG_GNSS=m -CONFIG_GNSS_SERIAL=m -CONFIG_GNSS_MTK_SERIAL=m -CONFIG_GNSS_SIRF_SERIAL=m -CONFIG_GNSS_UBX_SERIAL=m -CONFIG_MTD=m -CONFIG_MTD_TESTS=m - -# -# Partition parsers -# -CONFIG_MTD_AR7_PARTS=m -CONFIG_MTD_CMDLINE_PARTS=m -CONFIG_MTD_OF_PARTS=m -CONFIG_MTD_REDBOOT_PARTS=m -CONFIG_MTD_REDBOOT_DIRECTORY_BLOCK=-1 -# CONFIG_MTD_REDBOOT_PARTS_UNALLOCATED is not set -# CONFIG_MTD_REDBOOT_PARTS_READONLY is not set -# end of Partition parsers - -# -# User Modules And Translation Layers -# -CONFIG_MTD_BLKDEVS=m -CONFIG_MTD_BLOCK=m -CONFIG_MTD_BLOCK_RO=m -CONFIG_FTL=m -CONFIG_NFTL=m -CONFIG_NFTL_RW=y -CONFIG_INFTL=m -CONFIG_RFD_FTL=m -CONFIG_SSFDC=m -CONFIG_SM_FTL=m -CONFIG_MTD_OOPS=m -CONFIG_MTD_PSTORE=m -CONFIG_MTD_SWAP=m -CONFIG_MTD_PARTITIONED_MASTER=y - -# -# RAM/ROM/Flash chip drivers -# -CONFIG_MTD_CFI=m -CONFIG_MTD_JEDECPROBE=m -CONFIG_MTD_GEN_PROBE=m -# CONFIG_MTD_CFI_ADV_OPTIONS is not set -CONFIG_MTD_MAP_BANK_WIDTH_1=y -CONFIG_MTD_MAP_BANK_WIDTH_2=y -CONFIG_MTD_MAP_BANK_WIDTH_4=y -CONFIG_MTD_CFI_I1=y -CONFIG_MTD_CFI_I2=y -CONFIG_MTD_CFI_INTELEXT=m -CONFIG_MTD_CFI_AMDSTD=m -CONFIG_MTD_CFI_STAA=m -CONFIG_MTD_CFI_UTIL=m -CONFIG_MTD_RAM=m -CONFIG_MTD_ROM=m -CONFIG_MTD_ABSENT=m -# end of RAM/ROM/Flash chip drivers - -# -# Mapping drivers for chip access -# -CONFIG_MTD_COMPLEX_MAPPINGS=y -CONFIG_MTD_PHYSMAP=m -# CONFIG_MTD_PHYSMAP_COMPAT is not set -CONFIG_MTD_PHYSMAP_OF=y -CONFIG_MTD_PHYSMAP_VERSATILE=y -CONFIG_MTD_PHYSMAP_GEMINI=y -CONFIG_MTD_PHYSMAP_GPIO_ADDR=y -CONFIG_MTD_SBC_GXX=m -CONFIG_MTD_AMD76XROM=m -CONFIG_MTD_ICHXROM=m -CONFIG_MTD_ESB2ROM=m -CONFIG_MTD_CK804XROM=m -CONFIG_MTD_SCB2_FLASH=m -CONFIG_MTD_NETtel=m -CONFIG_MTD_L440GX=m -CONFIG_MTD_PCI=m -CONFIG_MTD_PCMCIA=m -# 
CONFIG_MTD_PCMCIA_ANONYMOUS is not set -CONFIG_MTD_INTEL_VR_NOR=m -CONFIG_MTD_PLATRAM=m -# end of Mapping drivers for chip access - -# -# Self-contained MTD device drivers -# -CONFIG_MTD_PMC551=m -# CONFIG_MTD_PMC551_BUGFIX is not set -# CONFIG_MTD_PMC551_DEBUG is not set -CONFIG_MTD_DATAFLASH=m -# CONFIG_MTD_DATAFLASH_WRITE_VERIFY is not set -CONFIG_MTD_DATAFLASH_OTP=y -CONFIG_MTD_MCHP23K256=m -CONFIG_MTD_SST25L=m -CONFIG_MTD_SLRAM=m -CONFIG_MTD_PHRAM=m -CONFIG_MTD_MTDRAM=m -CONFIG_MTDRAM_TOTAL_SIZE=4096 -CONFIG_MTDRAM_ERASE_SIZE=128 -CONFIG_MTD_BLOCK2MTD=m - -# -# Disk-On-Chip Device Drivers -# -CONFIG_MTD_DOCG3=m -CONFIG_BCH_CONST_M=14 -CONFIG_BCH_CONST_T=4 -# end of Self-contained MTD device drivers - -# -# NAND -# -CONFIG_MTD_NAND_CORE=m -CONFIG_MTD_ONENAND=m -# CONFIG_MTD_ONENAND_VERIFY_WRITE is not set -CONFIG_MTD_ONENAND_GENERIC=m -CONFIG_MTD_ONENAND_OTP=y -CONFIG_MTD_ONENAND_2X_PROGRAM=y -CONFIG_MTD_NAND_ECC_SW_HAMMING=m -CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC=y -CONFIG_MTD_RAW_NAND=m -CONFIG_MTD_NAND_ECC_SW_BCH=y - -# -# Raw/parallel NAND flash controllers -# -CONFIG_MTD_NAND_DENALI=m -CONFIG_MTD_NAND_DENALI_PCI=m -CONFIG_MTD_NAND_DENALI_DT=m -CONFIG_MTD_NAND_CAFE=m -CONFIG_MTD_NAND_MXIC=m -CONFIG_MTD_NAND_GPIO=m -CONFIG_MTD_NAND_PLATFORM=m -CONFIG_MTD_NAND_CADENCE=m -CONFIG_MTD_NAND_ARASAN=m - -# -# Misc -# -CONFIG_MTD_SM_COMMON=m -CONFIG_MTD_NAND_NANDSIM=m -CONFIG_MTD_NAND_RICOH=m -CONFIG_MTD_NAND_DISKONCHIP=m -# CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADVANCED is not set -CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADDRESS=0 -CONFIG_MTD_NAND_DISKONCHIP_BBTWRITE=y -CONFIG_MTD_SPI_NAND=m - -# -# ECC engine support -# -CONFIG_MTD_NAND_ECC=y -# end of ECC engine support -# end of NAND - -# -# LPDDR & LPDDR2 PCM memory drivers -# -CONFIG_MTD_LPDDR=m -CONFIG_MTD_QINFO_PROBE=m -# end of LPDDR & LPDDR2 PCM memory drivers - -CONFIG_MTD_SPI_NOR=m -CONFIG_MTD_SPI_NOR_USE_4K_SECTORS=y -CONFIG_SPI_INTEL_SPI=m -CONFIG_SPI_INTEL_SPI_PCI=m -CONFIG_SPI_INTEL_SPI_PLATFORM=m -CONFIG_MTD_UBI=m 
-CONFIG_MTD_UBI_WL_THRESHOLD=4096 -CONFIG_MTD_UBI_BEB_LIMIT=20 -CONFIG_MTD_UBI_FASTMAP=y -CONFIG_MTD_UBI_GLUEBI=m -CONFIG_MTD_UBI_BLOCK=y -CONFIG_MTD_HYPERBUS=m -CONFIG_DTC=y -CONFIG_OF=y -# CONFIG_OF_UNITTEST is not set -CONFIG_OF_FLATTREE=y -CONFIG_OF_EARLY_FLATTREE=y -CONFIG_OF_KOBJ=y -CONFIG_OF_DYNAMIC=y -CONFIG_OF_ADDRESS=y -CONFIG_OF_IRQ=y -CONFIG_OF_NET=y -CONFIG_OF_RESERVED_MEM=y -CONFIG_OF_RESOLVE=y -CONFIG_OF_OVERLAY=y -CONFIG_ARCH_MIGHT_HAVE_PC_PARPORT=y -CONFIG_PARPORT=m -CONFIG_PARPORT_PC=m -CONFIG_PARPORT_SERIAL=m -CONFIG_PARPORT_PC_FIFO=y -CONFIG_PARPORT_PC_SUPERIO=y -CONFIG_PARPORT_PC_PCMCIA=m -CONFIG_PARPORT_AX88796=m -CONFIG_PARPORT_1284=y -CONFIG_PARPORT_NOT_PC=y -CONFIG_PNP=y -CONFIG_PNP_DEBUG_MESSAGES=y - -# -# Protocols -# -CONFIG_PNPACPI=y -CONFIG_BLK_DEV=y -# CONFIG_BLK_DEV_NULL_BLK is not set -CONFIG_BLK_DEV_FD=m -CONFIG_CDROM=m -# CONFIG_PARIDE is not set -CONFIG_BLK_DEV_PCIESSD_MTIP32XX=m -CONFIG_ZRAM=m -CONFIG_ZRAM_WRITEBACK=y -# CONFIG_ZRAM_MEMORY_TRACKING is not set -CONFIG_BLK_DEV_UMEM=m -CONFIG_BLK_DEV_LOOP=m -CONFIG_BLK_DEV_LOOP_MIN_COUNT=8 -CONFIG_BLK_DEV_CRYPTOLOOP=m -CONFIG_BLK_DEV_DRBD=m -# CONFIG_DRBD_FAULT_INJECTION is not set -CONFIG_BLK_DEV_NBD=m -CONFIG_BLK_DEV_SKD=m -CONFIG_BLK_DEV_SX8=m -CONFIG_BLK_DEV_RAM=m -CONFIG_BLK_DEV_RAM_COUNT=16 -CONFIG_BLK_DEV_RAM_SIZE=16384 -CONFIG_CDROM_PKTCDVD=m -CONFIG_CDROM_PKTCDVD_BUFFERS=8 -# CONFIG_CDROM_PKTCDVD_WCACHE is not set -CONFIG_ATA_OVER_ETH=m -CONFIG_XEN_BLKDEV_FRONTEND=m -CONFIG_XEN_BLKDEV_BACKEND=m -CONFIG_VIRTIO_BLK=m -CONFIG_BLK_DEV_RBD=m -CONFIG_BLK_DEV_RSXX=m -CONFIG_BLK_DEV_RNBD=y -CONFIG_BLK_DEV_RNBD_CLIENT=m -CONFIG_BLK_DEV_RNBD_SERVER=m - -# -# NVME Support -# -CONFIG_NVME_CORE=y -CONFIG_BLK_DEV_NVME=y -CONFIG_NVME_MULTIPATH=y -CONFIG_NVME_HWMON=y -CONFIG_NVME_FABRICS=m -CONFIG_NVME_RDMA=m -CONFIG_NVME_FC=m -CONFIG_NVME_TCP=m -CONFIG_NVME_TARGET=m -# CONFIG_NVME_TARGET_PASSTHRU is not set -CONFIG_NVME_TARGET_LOOP=m -CONFIG_NVME_TARGET_RDMA=m -CONFIG_NVME_TARGET_FC=m 
-CONFIG_NVME_TARGET_FCLOOP=m -CONFIG_NVME_TARGET_TCP=m -# end of NVME Support - -# -# Misc devices -# -CONFIG_SENSORS_LIS3LV02D=m -CONFIG_AD525X_DPOT=m -CONFIG_AD525X_DPOT_I2C=m -CONFIG_AD525X_DPOT_SPI=m -# CONFIG_DUMMY_IRQ is not set -CONFIG_IBM_ASM=m -CONFIG_PHANTOM=m -CONFIG_TIFM_CORE=m -CONFIG_TIFM_7XX1=m -CONFIG_ICS932S401=m -CONFIG_ENCLOSURE_SERVICES=m -CONFIG_HP_ILO=m -CONFIG_APDS9802ALS=m -CONFIG_ISL29003=m -CONFIG_ISL29020=m -CONFIG_SENSORS_TSL2550=m -CONFIG_SENSORS_BH1770=m -CONFIG_SENSORS_APDS990X=m -CONFIG_HMC6352=m -CONFIG_DS1682=m -CONFIG_VMWARE_BALLOON=m -CONFIG_LATTICE_ECP3_CONFIG=m -# CONFIG_SRAM is not set -CONFIG_PCI_ENDPOINT_TEST=m -CONFIG_XILINX_SDFEC=m -CONFIG_MISC_RTSX=m -CONFIG_PVPANIC=m -# CONFIG_HISI_HIKEY_USB is not set -CONFIG_C2PORT=m -CONFIG_C2PORT_DURAMAR_2150=m - -# -# EEPROM support -# -CONFIG_EEPROM_AT24=m -# CONFIG_EEPROM_AT25 is not set -CONFIG_EEPROM_LEGACY=m -CONFIG_EEPROM_MAX6875=m -CONFIG_EEPROM_93CX6=m -# CONFIG_EEPROM_93XX46 is not set -CONFIG_EEPROM_IDT_89HPESX=m -CONFIG_EEPROM_EE1004=m -# end of EEPROM support - -CONFIG_CB710_CORE=m -# CONFIG_CB710_DEBUG is not set -CONFIG_CB710_DEBUG_ASSUMPTIONS=y - -# -# Texas Instruments shared transport line discipline -# -CONFIG_TI_ST=m -# end of Texas Instruments shared transport line discipline - -CONFIG_SENSORS_LIS3_I2C=m -CONFIG_ALTERA_STAPL=m -CONFIG_INTEL_MEI=m -CONFIG_INTEL_MEI_ME=m -CONFIG_INTEL_MEI_TXE=m -# CONFIG_INTEL_MEI_VIRTIO is not set -CONFIG_INTEL_MEI_HDCP=m -CONFIG_VMWARE_VMCI=m - -# -# Intel MIC & related support -# -CONFIG_INTEL_MIC_BUS=m -CONFIG_SCIF_BUS=m -CONFIG_VOP_BUS=m -CONFIG_INTEL_MIC_HOST=m -CONFIG_INTEL_MIC_CARD=m -CONFIG_SCIF=m -CONFIG_MIC_COSM=m -CONFIG_VOP=m -# end of Intel MIC & related support - -CONFIG_GENWQE=m -CONFIG_GENWQE_PLATFORM_ERROR_RECOVERY=0 -CONFIG_ECHO=m -CONFIG_MISC_ALCOR_PCI=m -CONFIG_MISC_RTSX_PCI=m -CONFIG_MISC_RTSX_USB=m -CONFIG_HABANA_AI=m -CONFIG_UACCE=m -# end of Misc devices - -CONFIG_HAVE_IDE=y -# CONFIG_IDE is not set - -# -# 
SCSI device support -# -CONFIG_SCSI_MOD=y -CONFIG_RAID_ATTRS=m -CONFIG_SCSI=y -CONFIG_SCSI_DMA=y -CONFIG_SCSI_NETLINK=y -CONFIG_SCSI_PROC_FS=y - -# -# SCSI support type (disk, tape, CD-ROM) -# -CONFIG_BLK_DEV_SD=y -CONFIG_CHR_DEV_ST=m -CONFIG_BLK_DEV_SR=m -CONFIG_CHR_DEV_SG=m -CONFIG_CHR_DEV_SCH=m -CONFIG_SCSI_ENCLOSURE=m -CONFIG_SCSI_CONSTANTS=y -CONFIG_SCSI_LOGGING=y -CONFIG_SCSI_SCAN_ASYNC=y - -# -# SCSI Transports -# -CONFIG_SCSI_SPI_ATTRS=m -CONFIG_SCSI_FC_ATTRS=m -CONFIG_SCSI_ISCSI_ATTRS=m -CONFIG_SCSI_SAS_ATTRS=m -CONFIG_SCSI_SAS_LIBSAS=m -CONFIG_SCSI_SAS_ATA=y -CONFIG_SCSI_SAS_HOST_SMP=y -CONFIG_SCSI_SRP_ATTRS=m -# end of SCSI Transports - -CONFIG_SCSI_LOWLEVEL=y -CONFIG_ISCSI_TCP=m -CONFIG_ISCSI_BOOT_SYSFS=m -CONFIG_SCSI_CXGB3_ISCSI=m -CONFIG_SCSI_CXGB4_ISCSI=m -CONFIG_SCSI_BNX2_ISCSI=m -CONFIG_SCSI_BNX2X_FCOE=m -CONFIG_BE2ISCSI=m -CONFIG_BLK_DEV_3W_XXXX_RAID=m -CONFIG_SCSI_HPSA=m -CONFIG_SCSI_3W_9XXX=m -CONFIG_SCSI_3W_SAS=m -CONFIG_SCSI_ACARD=m -CONFIG_SCSI_AACRAID=m -CONFIG_SCSI_AIC7XXX=m -CONFIG_AIC7XXX_CMDS_PER_DEVICE=32 -CONFIG_AIC7XXX_RESET_DELAY_MS=15000 -CONFIG_AIC7XXX_DEBUG_ENABLE=y -CONFIG_AIC7XXX_DEBUG_MASK=0 -CONFIG_AIC7XXX_REG_PRETTY_PRINT=y -CONFIG_SCSI_AIC79XX=m -CONFIG_AIC79XX_CMDS_PER_DEVICE=32 -CONFIG_AIC79XX_RESET_DELAY_MS=15000 -CONFIG_AIC79XX_DEBUG_ENABLE=y -CONFIG_AIC79XX_DEBUG_MASK=0 -CONFIG_AIC79XX_REG_PRETTY_PRINT=y -CONFIG_SCSI_AIC94XX=m -CONFIG_AIC94XX_DEBUG=y -CONFIG_SCSI_MVSAS=m -CONFIG_SCSI_MVSAS_DEBUG=y -CONFIG_SCSI_MVSAS_TASKLET=y -CONFIG_SCSI_MVUMI=m -CONFIG_SCSI_DPT_I2O=m -CONFIG_SCSI_ADVANSYS=m -CONFIG_SCSI_ARCMSR=m -CONFIG_SCSI_ESAS2R=m -CONFIG_MEGARAID_NEWGEN=y -CONFIG_MEGARAID_MM=m -CONFIG_MEGARAID_MAILBOX=m -CONFIG_MEGARAID_LEGACY=m -CONFIG_MEGARAID_SAS=m -CONFIG_SCSI_MPT3SAS=m -CONFIG_SCSI_MPT2SAS_MAX_SGE=128 -CONFIG_SCSI_MPT3SAS_MAX_SGE=128 -CONFIG_SCSI_MPT2SAS=m -CONFIG_SCSI_SMARTPQI=m -CONFIG_SCSI_UFSHCD=m -CONFIG_SCSI_UFSHCD_PCI=m -# CONFIG_SCSI_UFS_DWC_TC_PCI is not set -CONFIG_SCSI_UFSHCD_PLATFORM=m 
-CONFIG_SCSI_UFS_CDNS_PLATFORM=m -# CONFIG_SCSI_UFS_DWC_TC_PLATFORM is not set -CONFIG_SCSI_UFS_BSG=y -# CONFIG_SCSI_UFS_CRYPTO is not set -CONFIG_SCSI_HPTIOP=m -CONFIG_SCSI_BUSLOGIC=m -CONFIG_SCSI_FLASHPOINT=y -CONFIG_SCSI_MYRB=m -CONFIG_SCSI_MYRS=m -CONFIG_VMWARE_PVSCSI=m -CONFIG_XEN_SCSI_FRONTEND=m -CONFIG_HYPERV_STORAGE=m -CONFIG_LIBFC=m -CONFIG_LIBFCOE=m -CONFIG_FCOE=m -CONFIG_FCOE_FNIC=m -CONFIG_SCSI_SNIC=m -# CONFIG_SCSI_SNIC_DEBUG_FS is not set -CONFIG_SCSI_DMX3191D=m -CONFIG_SCSI_FDOMAIN=m -CONFIG_SCSI_FDOMAIN_PCI=m -CONFIG_SCSI_GDTH=m -CONFIG_SCSI_ISCI=m -CONFIG_SCSI_IPS=m -CONFIG_SCSI_INITIO=m -CONFIG_SCSI_INIA100=m -CONFIG_SCSI_PPA=m -CONFIG_SCSI_IMM=m -# CONFIG_SCSI_IZIP_EPP16 is not set -# CONFIG_SCSI_IZIP_SLOW_CTR is not set -CONFIG_SCSI_STEX=m -CONFIG_SCSI_SYM53C8XX_2=m -CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=1 -CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16 -CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64 -CONFIG_SCSI_SYM53C8XX_MMIO=y -CONFIG_SCSI_IPR=m -CONFIG_SCSI_IPR_TRACE=y -CONFIG_SCSI_IPR_DUMP=y -CONFIG_SCSI_QLOGIC_1280=m -CONFIG_SCSI_QLA_FC=m -CONFIG_TCM_QLA2XXX=m -# CONFIG_TCM_QLA2XXX_DEBUG is not set -CONFIG_SCSI_QLA_ISCSI=m -CONFIG_QEDI=m -CONFIG_QEDF=m -CONFIG_SCSI_LPFC=m -# CONFIG_SCSI_LPFC_DEBUG_FS is not set -CONFIG_SCSI_DC395x=m -CONFIG_SCSI_AM53C974=m -CONFIG_SCSI_WD719X=m -CONFIG_SCSI_DEBUG=m -CONFIG_SCSI_PMCRAID=m -CONFIG_SCSI_PM8001=m -CONFIG_SCSI_BFA_FC=m -CONFIG_SCSI_VIRTIO=m -CONFIG_SCSI_CHELSIO_FCOE=m -CONFIG_SCSI_LOWLEVEL_PCMCIA=y -CONFIG_PCMCIA_AHA152X=m -CONFIG_PCMCIA_FDOMAIN=m -CONFIG_PCMCIA_QLOGIC=m -CONFIG_PCMCIA_SYM53C500=m -CONFIG_SCSI_DH=y -CONFIG_SCSI_DH_RDAC=m -CONFIG_SCSI_DH_HP_SW=m -CONFIG_SCSI_DH_EMC=m -CONFIG_SCSI_DH_ALUA=m -# end of SCSI device support - -CONFIG_ATA=y -CONFIG_SATA_HOST=y -CONFIG_PATA_TIMINGS=y -CONFIG_ATA_VERBOSE_ERROR=y -CONFIG_ATA_FORCE=y -CONFIG_ATA_ACPI=y -CONFIG_SATA_ZPODD=y -CONFIG_SATA_PMP=y - -# -# Controllers with non-SFF native interface -# -CONFIG_SATA_AHCI=y -CONFIG_SATA_MOBILE_LPM_POLICY=3 
-CONFIG_SATA_AHCI_PLATFORM=m -CONFIG_AHCI_CEVA=m -CONFIG_AHCI_QORIQ=m -CONFIG_SATA_INIC162X=m -CONFIG_SATA_ACARD_AHCI=m -CONFIG_SATA_SIL24=m -CONFIG_ATA_SFF=y - -# -# SFF controllers with custom DMA interface -# -CONFIG_PDC_ADMA=m -CONFIG_SATA_QSTOR=m -CONFIG_SATA_SX4=m -CONFIG_ATA_BMDMA=y - -# -# SATA SFF controllers with BMDMA -# -CONFIG_ATA_PIIX=m -CONFIG_SATA_DWC=m -# CONFIG_SATA_DWC_OLD_DMA is not set -# CONFIG_SATA_DWC_DEBUG is not set -CONFIG_SATA_MV=m -CONFIG_SATA_NV=m -CONFIG_SATA_PROMISE=m -CONFIG_SATA_SIL=m -CONFIG_SATA_SIS=m -CONFIG_SATA_SVW=m -CONFIG_SATA_ULI=m -CONFIG_SATA_VIA=m -CONFIG_SATA_VITESSE=m - -# -# PATA SFF controllers with BMDMA -# -CONFIG_PATA_ALI=m -CONFIG_PATA_AMD=m -CONFIG_PATA_ARTOP=m -CONFIG_PATA_ATIIXP=m -CONFIG_PATA_ATP867X=m -CONFIG_PATA_CMD64X=m -CONFIG_PATA_CYPRESS=m -CONFIG_PATA_EFAR=m -CONFIG_PATA_HPT366=m -CONFIG_PATA_HPT37X=m -CONFIG_PATA_HPT3X2N=m -CONFIG_PATA_HPT3X3=m -CONFIG_PATA_HPT3X3_DMA=y -CONFIG_PATA_IT8213=m -CONFIG_PATA_IT821X=m -CONFIG_PATA_JMICRON=m -CONFIG_PATA_MARVELL=m -CONFIG_PATA_NETCELL=m -CONFIG_PATA_NINJA32=m -CONFIG_PATA_NS87415=m -CONFIG_PATA_OLDPIIX=m -CONFIG_PATA_OPTIDMA=m -CONFIG_PATA_PDC2027X=m -CONFIG_PATA_PDC_OLD=m -CONFIG_PATA_RADISYS=m -CONFIG_PATA_RDC=m -CONFIG_PATA_SCH=m -CONFIG_PATA_SERVERWORKS=m -CONFIG_PATA_SIL680=m -CONFIG_PATA_SIS=m -CONFIG_PATA_TOSHIBA=m -CONFIG_PATA_TRIFLEX=m -CONFIG_PATA_VIA=m -CONFIG_PATA_WINBOND=m - -# -# PIO-only SFF controllers -# -CONFIG_PATA_CMD640_PCI=m -CONFIG_PATA_MPIIX=m -CONFIG_PATA_NS87410=m -CONFIG_PATA_OPTI=m -CONFIG_PATA_PCMCIA=m -# CONFIG_PATA_PLATFORM is not set -CONFIG_PATA_RZ1000=m - -# -# Generic fallback / legacy drivers -# -CONFIG_PATA_ACPI=m -CONFIG_ATA_GENERIC=m -CONFIG_PATA_LEGACY=m -CONFIG_MD=y -CONFIG_BLK_DEV_MD=m -CONFIG_MD_LINEAR=m -CONFIG_MD_RAID0=m -CONFIG_MD_RAID1=m -CONFIG_MD_RAID10=m -CONFIG_MD_RAID456=m -CONFIG_MD_MULTIPATH=m -CONFIG_MD_FAULTY=m -CONFIG_MD_CLUSTER=m -CONFIG_BCACHE=m -# CONFIG_BCACHE_DEBUG is not set -# 
CONFIG_BCACHE_CLOSURES_DEBUG is not set -CONFIG_BCACHE_ASYNC_REGISTRATION=y -CONFIG_BLK_DEV_DM_BUILTIN=y -CONFIG_BLK_DEV_DM=m -CONFIG_DM_DEBUG=y -CONFIG_DM_BUFIO=m -# CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING is not set -CONFIG_DM_BIO_PRISON=m -CONFIG_DM_PERSISTENT_DATA=m -CONFIG_DM_UNSTRIPED=m -CONFIG_DM_CRYPT=m -CONFIG_DM_SNAPSHOT=m -CONFIG_DM_THIN_PROVISIONING=m -CONFIG_DM_CACHE=m -CONFIG_DM_CACHE_SMQ=m -CONFIG_DM_WRITECACHE=m -CONFIG_DM_EBS=m -CONFIG_DM_ERA=m -CONFIG_DM_CLONE=m -CONFIG_DM_MIRROR=m -CONFIG_DM_LOG_USERSPACE=m -CONFIG_DM_RAID=m -CONFIG_DM_ZERO=m -CONFIG_DM_MULTIPATH=m -CONFIG_DM_MULTIPATH_QL=m -CONFIG_DM_MULTIPATH_ST=m -CONFIG_DM_MULTIPATH_HST=m -CONFIG_DM_DELAY=m -CONFIG_DM_DUST=m -CONFIG_DM_UEVENT=y -CONFIG_DM_FLAKEY=m -CONFIG_DM_VERITY=m -CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG=y -CONFIG_DM_VERITY_FEC=y -CONFIG_DM_SWITCH=m -CONFIG_DM_LOG_WRITES=m -CONFIG_DM_INTEGRITY=m -CONFIG_DM_ZONED=m -CONFIG_TARGET_CORE=m -CONFIG_TCM_IBLOCK=m -CONFIG_TCM_FILEIO=m -CONFIG_TCM_PSCSI=m -CONFIG_TCM_USER2=m -CONFIG_LOOPBACK_TARGET=m -CONFIG_TCM_FC=m -CONFIG_ISCSI_TARGET=m -CONFIG_ISCSI_TARGET_CXGB4=m -CONFIG_SBP_TARGET=m -CONFIG_FUSION=y -CONFIG_FUSION_SPI=m -CONFIG_FUSION_FC=m -CONFIG_FUSION_SAS=m -CONFIG_FUSION_MAX_SGE=128 -CONFIG_FUSION_CTL=m -CONFIG_FUSION_LAN=m -# CONFIG_FUSION_LOGGING is not set - -# -# IEEE 1394 (FireWire) support -# -CONFIG_FIREWIRE=m -CONFIG_FIREWIRE_OHCI=m -CONFIG_FIREWIRE_SBP2=m -CONFIG_FIREWIRE_NET=m -CONFIG_FIREWIRE_NOSY=m -# end of IEEE 1394 (FireWire) support - -CONFIG_MACINTOSH_DRIVERS=y -CONFIG_MAC_EMUMOUSEBTN=m -CONFIG_NETDEVICES=y -CONFIG_MII=m -CONFIG_NET_CORE=y -CONFIG_BONDING=m -CONFIG_DUMMY=m -CONFIG_WIREGUARD=m -# CONFIG_WIREGUARD_DEBUG is not set -CONFIG_EQUALIZER=m -CONFIG_NET_FC=y -CONFIG_IFB=m -CONFIG_NET_TEAM=m -CONFIG_NET_TEAM_MODE_BROADCAST=m -CONFIG_NET_TEAM_MODE_ROUNDROBIN=m -CONFIG_NET_TEAM_MODE_RANDOM=m -CONFIG_NET_TEAM_MODE_ACTIVEBACKUP=m -CONFIG_NET_TEAM_MODE_LOADBALANCE=m -CONFIG_MACVLAN=m -CONFIG_MACVTAP=m 
-CONFIG_IPVLAN_L3S=y -CONFIG_IPVLAN=m -CONFIG_IPVTAP=m -CONFIG_VXLAN=m -CONFIG_GENEVE=m -CONFIG_BAREUDP=m -CONFIG_GTP=m -CONFIG_MACSEC=m -CONFIG_NETCONSOLE=m -CONFIG_NETCONSOLE_DYNAMIC=y -CONFIG_NETPOLL=y -CONFIG_NET_POLL_CONTROLLER=y -CONFIG_NTB_NETDEV=m -CONFIG_RIONET=m -CONFIG_RIONET_TX_SIZE=128 -CONFIG_RIONET_RX_SIZE=128 -CONFIG_TUN=m -CONFIG_TAP=m -# CONFIG_TUN_VNET_CROSS_LE is not set -CONFIG_VETH=m -CONFIG_VIRTIO_NET=m -CONFIG_NLMON=m -CONFIG_NET_VRF=m -CONFIG_VSOCKMON=m -CONFIG_SUNGEM_PHY=m -# CONFIG_ARCNET is not set -CONFIG_ATM_DRIVERS=y -# CONFIG_ATM_DUMMY is not set -CONFIG_ATM_TCP=m -CONFIG_ATM_LANAI=m -CONFIG_ATM_ENI=m -# CONFIG_ATM_ENI_DEBUG is not set -# CONFIG_ATM_ENI_TUNE_BURST is not set -CONFIG_ATM_FIRESTREAM=m -CONFIG_ATM_ZATM=m -# CONFIG_ATM_ZATM_DEBUG is not set -CONFIG_ATM_NICSTAR=m -# CONFIG_ATM_NICSTAR_USE_SUNI is not set -# CONFIG_ATM_NICSTAR_USE_IDT77105 is not set -CONFIG_ATM_IDT77252=m -# CONFIG_ATM_IDT77252_DEBUG is not set -# CONFIG_ATM_IDT77252_RCV_ALL is not set -CONFIG_ATM_IDT77252_USE_SUNI=y -CONFIG_ATM_AMBASSADOR=m -# CONFIG_ATM_AMBASSADOR_DEBUG is not set -CONFIG_ATM_HORIZON=m -# CONFIG_ATM_HORIZON_DEBUG is not set -CONFIG_ATM_IA=m -# CONFIG_ATM_IA_DEBUG is not set -CONFIG_ATM_FORE200E=m -CONFIG_ATM_FORE200E_USE_TASKLET=y -CONFIG_ATM_FORE200E_TX_RETRY=16 -CONFIG_ATM_FORE200E_DEBUG=0 -CONFIG_ATM_HE=m -CONFIG_ATM_HE_USE_SUNI=y -CONFIG_ATM_SOLOS=m -CONFIG_CAIF_DRIVERS=y -CONFIG_CAIF_TTY=m -CONFIG_CAIF_HSI=m -CONFIG_CAIF_VIRTIO=m - -# -# Distributed Switch Architecture drivers -# -CONFIG_B53=m -# CONFIG_B53_SPI_DRIVER is not set -CONFIG_B53_MDIO_DRIVER=m -CONFIG_B53_MMAP_DRIVER=m -CONFIG_B53_SRAB_DRIVER=m -CONFIG_B53_SERDES=m -CONFIG_NET_DSA_BCM_SF2=m -CONFIG_NET_DSA_LOOP=m -CONFIG_NET_DSA_LANTIQ_GSWIP=m -CONFIG_NET_DSA_MT7530=m -CONFIG_NET_DSA_MV88E6060=m -CONFIG_NET_DSA_MICROCHIP_KSZ_COMMON=m -CONFIG_NET_DSA_MICROCHIP_KSZ9477=m -CONFIG_NET_DSA_MICROCHIP_KSZ9477_I2C=m -CONFIG_NET_DSA_MICROCHIP_KSZ9477_SPI=m 
-CONFIG_NET_DSA_MICROCHIP_KSZ8795=m -CONFIG_NET_DSA_MICROCHIP_KSZ8795_SPI=m -CONFIG_NET_DSA_MV88E6XXX=m -CONFIG_NET_DSA_MV88E6XXX_GLOBAL2=y -CONFIG_NET_DSA_MV88E6XXX_PTP=y -# CONFIG_NET_DSA_MSCC_SEVILLE is not set -CONFIG_NET_DSA_AR9331=m -CONFIG_NET_DSA_SJA1105=m -CONFIG_NET_DSA_SJA1105_PTP=y -CONFIG_NET_DSA_SJA1105_TAS=y -CONFIG_NET_DSA_SJA1105_VL=y -CONFIG_NET_DSA_QCA8K=m -CONFIG_NET_DSA_REALTEK_SMI=m -CONFIG_NET_DSA_SMSC_LAN9303=m -CONFIG_NET_DSA_SMSC_LAN9303_I2C=m -CONFIG_NET_DSA_SMSC_LAN9303_MDIO=m -CONFIG_NET_DSA_VITESSE_VSC73XX=m -CONFIG_NET_DSA_VITESSE_VSC73XX_SPI=m -CONFIG_NET_DSA_VITESSE_VSC73XX_PLATFORM=m -# end of Distributed Switch Architecture drivers - -CONFIG_ETHERNET=y -CONFIG_MDIO=m -CONFIG_NET_VENDOR_3COM=y -CONFIG_PCMCIA_3C574=m -CONFIG_PCMCIA_3C589=m -CONFIG_VORTEX=m -CONFIG_TYPHOON=m -CONFIG_NET_VENDOR_ADAPTEC=y -CONFIG_ADAPTEC_STARFIRE=m -CONFIG_NET_VENDOR_AGERE=y -CONFIG_ET131X=m -CONFIG_NET_VENDOR_ALACRITECH=y -CONFIG_SLICOSS=m -CONFIG_NET_VENDOR_ALTEON=y -CONFIG_ACENIC=m -# CONFIG_ACENIC_OMIT_TIGON_I is not set -CONFIG_ALTERA_TSE=m -CONFIG_NET_VENDOR_AMAZON=y -CONFIG_ENA_ETHERNET=m -CONFIG_NET_VENDOR_AMD=y -CONFIG_AMD8111_ETH=m -CONFIG_PCNET32=m -CONFIG_PCMCIA_NMCLAN=m -CONFIG_AMD_XGBE=m -CONFIG_AMD_XGBE_DCB=y -CONFIG_AMD_XGBE_HAVE_ECC=y -CONFIG_NET_VENDOR_AQUANTIA=y -CONFIG_AQTION=m -CONFIG_NET_VENDOR_ARC=y -CONFIG_NET_VENDOR_ATHEROS=y -CONFIG_ATL2=m -CONFIG_ATL1=m -CONFIG_ATL1E=m -CONFIG_ATL1C=m -CONFIG_ALX=m -CONFIG_NET_VENDOR_AURORA=y -CONFIG_AURORA_NB8800=m -CONFIG_NET_VENDOR_BROADCOM=y -CONFIG_B44=m -CONFIG_B44_PCI_AUTOSELECT=y -CONFIG_B44_PCICORE_AUTOSELECT=y -CONFIG_B44_PCI=y -CONFIG_BCMGENET=m -CONFIG_BNX2=m -CONFIG_CNIC=m -CONFIG_TIGON3=m -CONFIG_TIGON3_HWMON=y -CONFIG_BNX2X=m -CONFIG_BNX2X_SRIOV=y -CONFIG_SYSTEMPORT=m -CONFIG_BNXT=m -CONFIG_BNXT_SRIOV=y -CONFIG_BNXT_FLOWER_OFFLOAD=y -CONFIG_BNXT_DCB=y -CONFIG_BNXT_HWMON=y -CONFIG_NET_VENDOR_BROCADE=y -CONFIG_BNA=m -CONFIG_NET_VENDOR_CADENCE=y -CONFIG_MACB=m 
-CONFIG_MACB_USE_HWSTAMP=y -CONFIG_MACB_PCI=m -CONFIG_NET_VENDOR_CAVIUM=y -CONFIG_THUNDER_NIC_PF=m -CONFIG_THUNDER_NIC_VF=m -CONFIG_THUNDER_NIC_BGX=m -CONFIG_THUNDER_NIC_RGX=m -CONFIG_CAVIUM_PTP=m -CONFIG_LIQUIDIO=m -CONFIG_LIQUIDIO_VF=m -CONFIG_NET_VENDOR_CHELSIO=y -CONFIG_CHELSIO_T1=m -CONFIG_CHELSIO_T1_1G=y -CONFIG_CHELSIO_T3=m -CONFIG_CHELSIO_T4=m -CONFIG_CHELSIO_T4_DCB=y -CONFIG_CHELSIO_T4_FCOE=y -CONFIG_CHELSIO_T4VF=m -CONFIG_CHELSIO_LIB=m -CONFIG_CHELSIO_INLINE_CRYPTO=y -CONFIG_CHELSIO_IPSEC_INLINE=m -CONFIG_CHELSIO_TLS_DEVICE=m -CONFIG_NET_VENDOR_CISCO=y -CONFIG_ENIC=m -CONFIG_NET_VENDOR_CORTINA=y -CONFIG_GEMINI_ETHERNET=m -CONFIG_CX_ECAT=m -CONFIG_DNET=m -CONFIG_NET_VENDOR_DEC=y -CONFIG_NET_TULIP=y -CONFIG_DE2104X=m -CONFIG_DE2104X_DSL=0 -CONFIG_TULIP=m -CONFIG_TULIP_MWI=y -CONFIG_TULIP_MMIO=y -CONFIG_TULIP_NAPI=y -CONFIG_TULIP_NAPI_HW_MITIGATION=y -CONFIG_DE4X5=m -CONFIG_WINBOND_840=m -CONFIG_DM9102=m -CONFIG_ULI526X=m -CONFIG_PCMCIA_XIRCOM=m -CONFIG_NET_VENDOR_DLINK=y -CONFIG_DL2K=m -CONFIG_SUNDANCE=m -# CONFIG_SUNDANCE_MMIO is not set -CONFIG_NET_VENDOR_EMULEX=y -CONFIG_BE2NET=m -CONFIG_BE2NET_HWMON=y -CONFIG_BE2NET_BE2=y -CONFIG_BE2NET_BE3=y -CONFIG_BE2NET_LANCER=y -CONFIG_BE2NET_SKYHAWK=y -CONFIG_NET_VENDOR_EZCHIP=y -CONFIG_EZCHIP_NPS_MANAGEMENT_ENET=m -CONFIG_NET_VENDOR_FUJITSU=y -CONFIG_PCMCIA_FMVJ18X=m -CONFIG_NET_VENDOR_GOOGLE=y -CONFIG_GVE=m -CONFIG_NET_VENDOR_HUAWEI=y -CONFIG_HINIC=m -CONFIG_NET_VENDOR_I825XX=y -CONFIG_NET_VENDOR_INTEL=y -CONFIG_E100=m -CONFIG_E1000=m -CONFIG_E1000E=m -CONFIG_E1000E_HWTS=y -CONFIG_IGB=m -CONFIG_IGB_HWMON=y -CONFIG_IGB_DCA=y -CONFIG_IGBVF=m -CONFIG_IXGB=m -CONFIG_IXGBE=m -CONFIG_IXGBE_HWMON=y -CONFIG_IXGBE_DCA=y -CONFIG_IXGBE_DCB=y -# CONFIG_IXGBE_IPSEC is not set -CONFIG_IXGBEVF=m -CONFIG_IXGBEVF_IPSEC=y -CONFIG_I40E=m -CONFIG_I40E_DCB=y -CONFIG_IAVF=m -CONFIG_I40EVF=m -CONFIG_ICE=m -CONFIG_FM10K=m -CONFIG_IGC=m -CONFIG_JME=m -CONFIG_NET_VENDOR_MARVELL=y -CONFIG_MVMDIO=m -CONFIG_SKGE=m -# CONFIG_SKGE_DEBUG is 
not set -CONFIG_SKGE_GENESIS=y -CONFIG_SKY2=m -# CONFIG_SKY2_DEBUG is not set -# CONFIG_PRESTERA is not set -CONFIG_NET_VENDOR_MELLANOX=y -CONFIG_MLX4_EN=m -CONFIG_MLX4_EN_DCB=y -CONFIG_MLX4_CORE=m -CONFIG_MLX4_DEBUG=y -CONFIG_MLX4_CORE_GEN2=y -CONFIG_MLX5_CORE=m -CONFIG_MLX5_ACCEL=y -CONFIG_MLX5_FPGA=y -CONFIG_MLX5_CORE_EN=y -CONFIG_MLX5_EN_ARFS=y -CONFIG_MLX5_EN_RXNFC=y -CONFIG_MLX5_MPFS=y -CONFIG_MLX5_ESWITCH=y -CONFIG_MLX5_CLS_ACT=y -CONFIG_MLX5_TC_CT=y -CONFIG_MLX5_CORE_EN_DCB=y -CONFIG_MLX5_CORE_IPOIB=y -CONFIG_MLX5_FPGA_IPSEC=y -# CONFIG_MLX5_IPSEC is not set -CONFIG_MLX5_EN_IPSEC=y -CONFIG_MLX5_FPGA_TLS=y -CONFIG_MLX5_TLS=y -CONFIG_MLX5_EN_TLS=y -CONFIG_MLX5_SW_STEERING=y -CONFIG_MLXSW_CORE=m -CONFIG_MLXSW_CORE_HWMON=y -CONFIG_MLXSW_CORE_THERMAL=y -CONFIG_MLXSW_PCI=m -CONFIG_MLXSW_I2C=m -CONFIG_MLXSW_SWITCHIB=m -CONFIG_MLXSW_SWITCHX2=m -CONFIG_MLXSW_SPECTRUM=m -CONFIG_MLXSW_SPECTRUM_DCB=y -CONFIG_MLXSW_MINIMAL=m -CONFIG_MLXFW=m -CONFIG_NET_VENDOR_MICREL=y -CONFIG_KS8842=m -CONFIG_KS8851=m -CONFIG_KS8851_MLL=m -CONFIG_KSZ884X_PCI=m -CONFIG_NET_VENDOR_MICROCHIP=y -CONFIG_ENC28J60=m -# CONFIG_ENC28J60_WRITEVERIFY is not set -CONFIG_ENCX24J600=m -CONFIG_LAN743X=m -CONFIG_NET_VENDOR_MICROSEMI=y -CONFIG_MSCC_OCELOT_SWITCH_LIB=m -CONFIG_MSCC_OCELOT_SWITCH=m -CONFIG_NET_VENDOR_MYRI=y -CONFIG_MYRI10GE=m -CONFIG_MYRI10GE_DCA=y -CONFIG_FEALNX=m -CONFIG_NET_VENDOR_NATSEMI=y -CONFIG_NATSEMI=m -CONFIG_NS83820=m -CONFIG_NET_VENDOR_NETERION=y -CONFIG_S2IO=m -CONFIG_VXGE=m -# CONFIG_VXGE_DEBUG_TRACE_ALL is not set -CONFIG_NET_VENDOR_NETRONOME=y -CONFIG_NFP=m -CONFIG_NFP_APP_FLOWER=y -CONFIG_NFP_APP_ABM_NIC=y -# CONFIG_NFP_DEBUG is not set -CONFIG_NET_VENDOR_NI=y -CONFIG_NI_XGE_MANAGEMENT_ENET=m -CONFIG_NET_VENDOR_8390=y -CONFIG_PCMCIA_AXNET=m -CONFIG_NE2K_PCI=m -CONFIG_PCMCIA_PCNET=m -CONFIG_NET_VENDOR_NVIDIA=y -CONFIG_FORCEDETH=m -CONFIG_NET_VENDOR_OKI=y -CONFIG_ETHOC=m -CONFIG_NET_VENDOR_PACKET_ENGINES=y -CONFIG_HAMACHI=m -CONFIG_YELLOWFIN=m -CONFIG_NET_VENDOR_PENSANDO=y 
-CONFIG_IONIC=m -CONFIG_NET_VENDOR_QLOGIC=y -CONFIG_QLA3XXX=m -CONFIG_QLCNIC=m -CONFIG_QLCNIC_SRIOV=y -CONFIG_QLCNIC_DCB=y -CONFIG_QLCNIC_HWMON=y -CONFIG_NETXEN_NIC=m -CONFIG_QED=m -CONFIG_QED_LL2=y -CONFIG_QED_SRIOV=y -CONFIG_QEDE=m -CONFIG_QED_RDMA=y -CONFIG_QED_ISCSI=y -CONFIG_QED_FCOE=y -CONFIG_QED_OOO=y -CONFIG_NET_VENDOR_QUALCOMM=y -CONFIG_QCA7000=m -CONFIG_QCA7000_SPI=m -CONFIG_QCA7000_UART=m -CONFIG_QCOM_EMAC=m -CONFIG_RMNET=m -CONFIG_NET_VENDOR_RDC=y -CONFIG_R6040=m -CONFIG_NET_VENDOR_REALTEK=y -CONFIG_ATP=m -CONFIG_8139CP=m -CONFIG_8139TOO=m -# CONFIG_8139TOO_PIO is not set -CONFIG_8139TOO_TUNE_TWISTER=y -CONFIG_8139TOO_8129=y -# CONFIG_8139_OLD_RX_RESET is not set -CONFIG_R8169=m -CONFIG_NET_VENDOR_RENESAS=y -CONFIG_NET_VENDOR_ROCKER=y -CONFIG_ROCKER=m -CONFIG_NET_VENDOR_SAMSUNG=y -CONFIG_SXGBE_ETH=m -CONFIG_NET_VENDOR_SEEQ=y -CONFIG_NET_VENDOR_SOLARFLARE=y -CONFIG_SFC=m -CONFIG_SFC_MTD=y -CONFIG_SFC_MCDI_MON=y -CONFIG_SFC_SRIOV=y -CONFIG_SFC_MCDI_LOGGING=y -CONFIG_SFC_FALCON=m -CONFIG_SFC_FALCON_MTD=y -CONFIG_NET_VENDOR_SILAN=y -CONFIG_SC92031=m -CONFIG_NET_VENDOR_SIS=y -CONFIG_SIS900=m -CONFIG_SIS190=m -CONFIG_NET_VENDOR_SMSC=y -CONFIG_PCMCIA_SMC91C92=m -CONFIG_EPIC100=m -CONFIG_SMSC911X=m -CONFIG_SMSC9420=m -CONFIG_NET_VENDOR_SOCIONEXT=y -CONFIG_NET_VENDOR_STMICRO=y -CONFIG_STMMAC_ETH=m -# CONFIG_STMMAC_SELFTESTS is not set -CONFIG_STMMAC_PLATFORM=m -CONFIG_DWMAC_DWC_QOS_ETH=m -CONFIG_DWMAC_GENERIC=m -# CONFIG_DWMAC_INTEL_PLAT is not set -CONFIG_DWMAC_INTEL=m -CONFIG_STMMAC_PCI=m -CONFIG_NET_VENDOR_SUN=y -CONFIG_HAPPYMEAL=m -CONFIG_SUNGEM=m -CONFIG_CASSINI=m -CONFIG_NIU=m -CONFIG_NET_VENDOR_SYNOPSYS=y -CONFIG_DWC_XLGMAC=m -CONFIG_DWC_XLGMAC_PCI=m -CONFIG_NET_VENDOR_TEHUTI=y -CONFIG_TEHUTI=m -CONFIG_NET_VENDOR_TI=y -# CONFIG_TI_CPSW_PHY_SEL is not set -CONFIG_TLAN=m -CONFIG_NET_VENDOR_VIA=y -CONFIG_VIA_RHINE=m -CONFIG_VIA_RHINE_MMIO=y -CONFIG_VIA_VELOCITY=m -CONFIG_NET_VENDOR_WIZNET=y -CONFIG_WIZNET_W5100=m -CONFIG_WIZNET_W5300=m -# 
CONFIG_WIZNET_BUS_DIRECT is not set -# CONFIG_WIZNET_BUS_INDIRECT is not set -CONFIG_WIZNET_BUS_ANY=y -CONFIG_WIZNET_W5100_SPI=m -CONFIG_NET_VENDOR_XILINX=y -CONFIG_XILINX_AXI_EMAC=m -CONFIG_XILINX_LL_TEMAC=m -CONFIG_NET_VENDOR_XIRCOM=y -CONFIG_PCMCIA_XIRC2PS=m -CONFIG_FDDI=m -CONFIG_DEFXX=m -CONFIG_DEFXX_MMIO=y -CONFIG_SKFP=m -# CONFIG_HIPPI is not set -CONFIG_NET_SB1000=m -CONFIG_PHYLINK=m -CONFIG_PHYLIB=m -CONFIG_SWPHY=y -CONFIG_LED_TRIGGER_PHY=y -CONFIG_FIXED_PHY=m -CONFIG_SFP=m - -# -# MII PHY device drivers -# -CONFIG_AMD_PHY=m -CONFIG_ADIN_PHY=m -CONFIG_AQUANTIA_PHY=m -CONFIG_AX88796B_PHY=m -CONFIG_BROADCOM_PHY=m -CONFIG_BCM54140_PHY=m -CONFIG_BCM7XXX_PHY=m -CONFIG_BCM84881_PHY=m -CONFIG_BCM87XX_PHY=m -CONFIG_BCM_NET_PHYLIB=m -CONFIG_CICADA_PHY=m -CONFIG_CORTINA_PHY=m -CONFIG_DAVICOM_PHY=m -CONFIG_ICPLUS_PHY=m -CONFIG_LXT_PHY=m -CONFIG_INTEL_XWAY_PHY=m -CONFIG_LSI_ET1011C_PHY=m -CONFIG_MARVELL_PHY=m -CONFIG_MARVELL_10G_PHY=m -CONFIG_MICREL_PHY=m -CONFIG_MICROCHIP_PHY=m -CONFIG_MICROCHIP_T1_PHY=m -CONFIG_MICROSEMI_PHY=m -CONFIG_NATIONAL_PHY=m -CONFIG_NXP_TJA11XX_PHY=m -CONFIG_AT803X_PHY=m -CONFIG_QSEMI_PHY=m -CONFIG_REALTEK_PHY=m -CONFIG_RENESAS_PHY=m -CONFIG_ROCKCHIP_PHY=m -CONFIG_SMSC_PHY=m -CONFIG_STE10XP=m -CONFIG_TERANETICS_PHY=m -CONFIG_DP83822_PHY=m -CONFIG_DP83TC811_PHY=m -CONFIG_DP83848_PHY=m -CONFIG_DP83867_PHY=m -CONFIG_DP83869_PHY=m -CONFIG_VITESSE_PHY=m -CONFIG_XILINX_GMII2RGMII=m -CONFIG_MICREL_KS8995MA=m -CONFIG_MDIO_DEVICE=m -CONFIG_MDIO_BUS=m -CONFIG_OF_MDIO=m -CONFIG_MDIO_DEVRES=m -CONFIG_MDIO_BITBANG=m -CONFIG_MDIO_BCM_UNIMAC=m -CONFIG_MDIO_CAVIUM=m -CONFIG_MDIO_GPIO=m -CONFIG_MDIO_HISI_FEMAC=m -CONFIG_MDIO_I2C=m -CONFIG_MDIO_MVUSB=m -CONFIG_MDIO_MSCC_MIIM=m -CONFIG_MDIO_OCTEON=m -CONFIG_MDIO_IPQ4019=m -CONFIG_MDIO_IPQ8064=m -CONFIG_MDIO_THUNDER=m - -# -# MDIO Multiplexers -# -CONFIG_MDIO_BUS_MUX=m -CONFIG_MDIO_BUS_MUX_GPIO=m -CONFIG_MDIO_BUS_MUX_MULTIPLEXER=m -CONFIG_MDIO_BUS_MUX_MMIOREG=m - -# -# PCS device drivers -# -CONFIG_PCS_XPCS=m 
-# end of PCS device drivers - -CONFIG_PLIP=m -CONFIG_PPP=m -CONFIG_PPP_BSDCOMP=m -CONFIG_PPP_DEFLATE=m -CONFIG_PPP_FILTER=y -CONFIG_PPP_MPPE=m -CONFIG_PPP_MULTILINK=y -CONFIG_PPPOATM=m -CONFIG_PPPOE=m -CONFIG_PPTP=m -CONFIG_PPPOL2TP=m -CONFIG_PPP_ASYNC=m -CONFIG_PPP_SYNC_TTY=m -CONFIG_SLIP=m -CONFIG_SLHC=m -CONFIG_SLIP_COMPRESSED=y -CONFIG_SLIP_SMART=y -CONFIG_SLIP_MODE_SLIP6=y -CONFIG_USB_NET_DRIVERS=m -CONFIG_USB_CATC=m -CONFIG_USB_KAWETH=m -CONFIG_USB_PEGASUS=m -CONFIG_USB_RTL8150=m -CONFIG_USB_RTL8152=m -CONFIG_USB_LAN78XX=m -CONFIG_USB_USBNET=m -CONFIG_USB_NET_AX8817X=m -CONFIG_USB_NET_AX88179_178A=m -CONFIG_USB_NET_CDCETHER=m -CONFIG_USB_NET_CDC_EEM=m -CONFIG_USB_NET_CDC_NCM=m -CONFIG_USB_NET_HUAWEI_CDC_NCM=m -CONFIG_USB_NET_CDC_MBIM=m -CONFIG_USB_NET_DM9601=m -CONFIG_USB_NET_SR9700=m -CONFIG_USB_NET_SR9800=m -CONFIG_USB_NET_SMSC75XX=m -CONFIG_USB_NET_SMSC95XX=m -CONFIG_USB_NET_GL620A=m -CONFIG_USB_NET_NET1080=m -CONFIG_USB_NET_PLUSB=m -CONFIG_USB_NET_MCS7830=m -CONFIG_USB_NET_RNDIS_HOST=m -CONFIG_USB_NET_CDC_SUBSET_ENABLE=m -CONFIG_USB_NET_CDC_SUBSET=m -CONFIG_USB_ALI_M5632=y -CONFIG_USB_AN2720=y -CONFIG_USB_BELKIN=y -CONFIG_USB_ARMLINUX=y -CONFIG_USB_EPSON2888=y -CONFIG_USB_KC2190=y -CONFIG_USB_NET_ZAURUS=m -CONFIG_USB_NET_CX82310_ETH=m -CONFIG_USB_NET_KALMIA=m -CONFIG_USB_NET_QMI_WWAN=m -CONFIG_USB_HSO=m -CONFIG_USB_NET_INT51X1=m -CONFIG_USB_CDC_PHONET=m -CONFIG_USB_IPHETH=m -CONFIG_USB_SIERRA_NET=m -CONFIG_USB_VL600=m -CONFIG_USB_NET_CH9200=m -CONFIG_USB_NET_AQC111=m -CONFIG_WLAN=y -# CONFIG_WIRELESS_WDS is not set -CONFIG_WLAN_VENDOR_ADMTEK=y -CONFIG_ADM8211=m -CONFIG_ATH_COMMON=m -CONFIG_WLAN_VENDOR_ATH=y -# CONFIG_ATH_DEBUG is not set -CONFIG_ATH5K=m -CONFIG_ATH5K_DEBUG=y -CONFIG_ATH5K_TRACER=y -CONFIG_ATH5K_PCI=y -CONFIG_ATH9K_HW=m -CONFIG_ATH9K_COMMON=m -CONFIG_ATH9K_COMMON_DEBUG=y -CONFIG_ATH9K_BTCOEX_SUPPORT=y -CONFIG_ATH9K=m -CONFIG_ATH9K_PCI=y -CONFIG_ATH9K_AHB=y -CONFIG_ATH9K_DEBUGFS=y -CONFIG_ATH9K_STATION_STATISTICS=y -CONFIG_ATH9K_DYNACK=y 
-CONFIG_ATH9K_WOW=y -CONFIG_ATH9K_RFKILL=y -CONFIG_ATH9K_CHANNEL_CONTEXT=y -CONFIG_ATH9K_PCOEM=y -CONFIG_ATH9K_PCI_NO_EEPROM=m -CONFIG_ATH9K_HTC=m -CONFIG_ATH9K_HTC_DEBUGFS=y -CONFIG_ATH9K_HWRNG=y -CONFIG_ATH9K_COMMON_SPECTRAL=y -CONFIG_CARL9170=m -CONFIG_CARL9170_LEDS=y -CONFIG_CARL9170_DEBUGFS=y -CONFIG_CARL9170_WPC=y -# CONFIG_CARL9170_HWRNG is not set -CONFIG_ATH6KL=m -CONFIG_ATH6KL_SDIO=m -CONFIG_ATH6KL_USB=m -CONFIG_ATH6KL_DEBUG=y -CONFIG_ATH6KL_TRACING=y -CONFIG_AR5523=m -CONFIG_WIL6210=m -CONFIG_WIL6210_ISR_COR=y -CONFIG_WIL6210_TRACING=y -CONFIG_WIL6210_DEBUGFS=y -CONFIG_ATH10K=m -CONFIG_ATH10K_CE=y -CONFIG_ATH10K_PCI=m -CONFIG_ATH10K_AHB=y -CONFIG_ATH10K_SDIO=m -CONFIG_ATH10K_USB=m -CONFIG_ATH10K_DEBUG=y -CONFIG_ATH10K_DEBUGFS=y -CONFIG_ATH10K_SPECTRAL=y -CONFIG_ATH10K_TRACING=y -CONFIG_WCN36XX=m -CONFIG_WCN36XX_DEBUGFS=y -# CONFIG_ATH11K is not set -CONFIG_WLAN_VENDOR_ATMEL=y -CONFIG_ATMEL=m -CONFIG_PCI_ATMEL=m -CONFIG_PCMCIA_ATMEL=m -CONFIG_AT76C50X_USB=m -CONFIG_WLAN_VENDOR_BROADCOM=y -CONFIG_B43=m -CONFIG_B43_BCMA=y -CONFIG_B43_SSB=y -CONFIG_B43_BUSES_BCMA_AND_SSB=y -# CONFIG_B43_BUSES_BCMA is not set -# CONFIG_B43_BUSES_SSB is not set -CONFIG_B43_PCI_AUTOSELECT=y -CONFIG_B43_PCICORE_AUTOSELECT=y -CONFIG_B43_SDIO=y -CONFIG_B43_BCMA_PIO=y -CONFIG_B43_PIO=y -CONFIG_B43_PHY_G=y -CONFIG_B43_PHY_N=y -CONFIG_B43_PHY_LP=y -CONFIG_B43_PHY_HT=y -CONFIG_B43_LEDS=y -CONFIG_B43_HWRNG=y -# CONFIG_B43_DEBUG is not set -CONFIG_B43LEGACY=m -CONFIG_B43LEGACY_PCI_AUTOSELECT=y -CONFIG_B43LEGACY_PCICORE_AUTOSELECT=y -CONFIG_B43LEGACY_LEDS=y -CONFIG_B43LEGACY_HWRNG=y -CONFIG_B43LEGACY_DEBUG=y -CONFIG_B43LEGACY_DMA=y -CONFIG_B43LEGACY_PIO=y -CONFIG_B43LEGACY_DMA_AND_PIO_MODE=y -# CONFIG_B43LEGACY_DMA_MODE is not set -# CONFIG_B43LEGACY_PIO_MODE is not set -CONFIG_BRCMUTIL=m -CONFIG_BRCMSMAC=m -CONFIG_BRCMFMAC=m -CONFIG_BRCMFMAC_PROTO_BCDC=y -CONFIG_BRCMFMAC_PROTO_MSGBUF=y -CONFIG_BRCMFMAC_SDIO=y -CONFIG_BRCMFMAC_USB=y -CONFIG_BRCMFMAC_PCIE=y -CONFIG_BRCM_TRACING=y 
-CONFIG_BRCMDBG=y -CONFIG_WLAN_VENDOR_CISCO=y -CONFIG_AIRO=m -CONFIG_AIRO_CS=m -CONFIG_WLAN_VENDOR_INTEL=y -CONFIG_IPW2100=m -CONFIG_IPW2100_MONITOR=y -# CONFIG_IPW2100_DEBUG is not set -CONFIG_IPW2200=m -CONFIG_IPW2200_MONITOR=y -CONFIG_IPW2200_RADIOTAP=y -CONFIG_IPW2200_PROMISCUOUS=y -CONFIG_IPW2200_QOS=y -# CONFIG_IPW2200_DEBUG is not set -CONFIG_LIBIPW=m -# CONFIG_LIBIPW_DEBUG is not set -CONFIG_IWLEGACY=m -CONFIG_IWL4965=m -CONFIG_IWL3945=m - -# -# iwl3945 / iwl4965 Debugging Options -# -CONFIG_IWLEGACY_DEBUG=y -CONFIG_IWLEGACY_DEBUGFS=y -# end of iwl3945 / iwl4965 Debugging Options - -CONFIG_IWLWIFI=m -CONFIG_IWLWIFI_LEDS=y -CONFIG_IWLDVM=m -CONFIG_IWLMVM=m -CONFIG_IWLWIFI_OPMODE_MODULAR=y -# CONFIG_IWLWIFI_BCAST_FILTERING is not set - -# -# Debugging Options -# -CONFIG_IWLWIFI_DEBUG=y -CONFIG_IWLWIFI_DEBUGFS=y -CONFIG_IWLWIFI_DEVICE_TRACING=y -# end of Debugging Options - -CONFIG_WLAN_VENDOR_INTERSIL=y -CONFIG_HOSTAP=m -CONFIG_HOSTAP_FIRMWARE=y -CONFIG_HOSTAP_FIRMWARE_NVRAM=y -CONFIG_HOSTAP_PLX=m -CONFIG_HOSTAP_PCI=m -CONFIG_HOSTAP_CS=m -CONFIG_HERMES=m -CONFIG_HERMES_PRISM=y -CONFIG_HERMES_CACHE_FW_ON_INIT=y -CONFIG_PLX_HERMES=m -CONFIG_TMD_HERMES=m -CONFIG_NORTEL_HERMES=m -CONFIG_PCI_HERMES=m -CONFIG_PCMCIA_HERMES=m -CONFIG_PCMCIA_SPECTRUM=m -CONFIG_ORINOCO_USB=m -CONFIG_P54_COMMON=m -CONFIG_P54_USB=m -CONFIG_P54_PCI=m -CONFIG_P54_SPI=m -# CONFIG_P54_SPI_DEFAULT_EEPROM is not set -CONFIG_P54_LEDS=y -CONFIG_PRISM54=m -CONFIG_WLAN_VENDOR_MARVELL=y -CONFIG_LIBERTAS=m -CONFIG_LIBERTAS_USB=m -CONFIG_LIBERTAS_CS=m -CONFIG_LIBERTAS_SDIO=m -CONFIG_LIBERTAS_SPI=m -# CONFIG_LIBERTAS_DEBUG is not set -CONFIG_LIBERTAS_MESH=y -CONFIG_LIBERTAS_THINFIRM=m -# CONFIG_LIBERTAS_THINFIRM_DEBUG is not set -CONFIG_LIBERTAS_THINFIRM_USB=m -CONFIG_MWIFIEX=m -CONFIG_MWIFIEX_SDIO=m -CONFIG_MWIFIEX_PCIE=m -CONFIG_MWIFIEX_USB=m -CONFIG_MWL8K=m -CONFIG_WLAN_VENDOR_MEDIATEK=y -CONFIG_MT7601U=m -CONFIG_MT76_CORE=m -CONFIG_MT76_LEDS=y -CONFIG_MT76_USB=m -CONFIG_MT76x02_LIB=m 
-CONFIG_MT76x02_USB=m -CONFIG_MT76x0_COMMON=m -CONFIG_MT76x0U=m -CONFIG_MT76x0E=m -CONFIG_MT76x2_COMMON=m -CONFIG_MT76x2E=m -CONFIG_MT76x2U=m -CONFIG_MT7603E=m -CONFIG_MT7615_COMMON=m -CONFIG_MT7615E=m -CONFIG_MT7663_USB_SDIO_COMMON=m -CONFIG_MT7663U=m -# CONFIG_MT7663S is not set -CONFIG_MT7915E=m -CONFIG_WLAN_VENDOR_MICROCHIP=y -CONFIG_WILC1000=m -CONFIG_WILC1000_SDIO=m -CONFIG_WILC1000_SPI=m -# CONFIG_WILC1000_HW_OOB_INTR is not set -CONFIG_WLAN_VENDOR_RALINK=y -CONFIG_RT2X00=m -CONFIG_RT2400PCI=m -CONFIG_RT2500PCI=m -CONFIG_RT61PCI=m -CONFIG_RT2800PCI=m -CONFIG_RT2800PCI_RT33XX=y -CONFIG_RT2800PCI_RT35XX=y -CONFIG_RT2800PCI_RT53XX=y -CONFIG_RT2800PCI_RT3290=y -CONFIG_RT2500USB=m -CONFIG_RT73USB=m -CONFIG_RT2800USB=m -CONFIG_RT2800USB_RT33XX=y -CONFIG_RT2800USB_RT35XX=y -CONFIG_RT2800USB_RT3573=y -CONFIG_RT2800USB_RT53XX=y -CONFIG_RT2800USB_RT55XX=y -CONFIG_RT2800USB_UNKNOWN=y -CONFIG_RT2800_LIB=m -CONFIG_RT2800_LIB_MMIO=m -CONFIG_RT2X00_LIB_MMIO=m -CONFIG_RT2X00_LIB_PCI=m -CONFIG_RT2X00_LIB_USB=m -CONFIG_RT2X00_LIB=m -CONFIG_RT2X00_LIB_FIRMWARE=y -CONFIG_RT2X00_LIB_CRYPTO=y -CONFIG_RT2X00_LIB_LEDS=y -CONFIG_RT2X00_LIB_DEBUGFS=y -# CONFIG_RT2X00_DEBUG is not set -CONFIG_WLAN_VENDOR_REALTEK=y -CONFIG_RTL8180=m -CONFIG_RTL8187=m -CONFIG_RTL8187_LEDS=y -CONFIG_RTL_CARDS=m -CONFIG_RTL8192CE=m -CONFIG_RTL8192SE=m -CONFIG_RTL8192DE=m -CONFIG_RTL8723AE=m -CONFIG_RTL8723BE=m -CONFIG_RTL8188EE=m -CONFIG_RTL8192EE=m -CONFIG_RTL8821AE=m -CONFIG_RTL8192CU=m -CONFIG_RTLWIFI=m -CONFIG_RTLWIFI_PCI=m -CONFIG_RTLWIFI_USB=m -CONFIG_RTLWIFI_DEBUG=y -CONFIG_RTL8192C_COMMON=m -CONFIG_RTL8723_COMMON=m -CONFIG_RTLBTCOEXIST=m -CONFIG_RTL8XXXU=m -CONFIG_RTL8XXXU_UNTESTED=y -CONFIG_RTW88=m -CONFIG_RTW88_CORE=m -CONFIG_RTW88_PCI=m -CONFIG_RTW88_8822B=m -CONFIG_RTW88_8822C=m -CONFIG_RTW88_8723D=m -CONFIG_RTW88_8821C=m -CONFIG_RTW88_8822BE=m -CONFIG_RTW88_8822CE=m -CONFIG_RTW88_8723DE=m -CONFIG_RTW88_8821CE=m -CONFIG_RTW88_DEBUG=y -CONFIG_RTW88_DEBUGFS=y -CONFIG_WLAN_VENDOR_RSI=y 
-CONFIG_RSI_91X=m -CONFIG_RSI_DEBUGFS=y -CONFIG_RSI_SDIO=m -CONFIG_RSI_USB=m -CONFIG_RSI_COEX=y -CONFIG_WLAN_VENDOR_ST=y -CONFIG_CW1200=m -CONFIG_CW1200_WLAN_SDIO=m -CONFIG_CW1200_WLAN_SPI=m -CONFIG_WLAN_VENDOR_TI=y -CONFIG_WL1251=m -CONFIG_WL1251_SPI=m -CONFIG_WL1251_SDIO=m -CONFIG_WL12XX=m -CONFIG_WL18XX=m -CONFIG_WLCORE=m -CONFIG_WLCORE_SPI=m -CONFIG_WLCORE_SDIO=m -CONFIG_WILINK_PLATFORM_DATA=y -CONFIG_WLAN_VENDOR_ZYDAS=y -CONFIG_USB_ZD1201=m -CONFIG_ZD1211RW=m -# CONFIG_ZD1211RW_DEBUG is not set -CONFIG_WLAN_VENDOR_QUANTENNA=y -CONFIG_QTNFMAC=m -CONFIG_QTNFMAC_PCIE=m -CONFIG_PCMCIA_RAYCS=m -CONFIG_PCMCIA_WL3501=m -CONFIG_MAC80211_HWSIM=m -CONFIG_USB_NET_RNDIS_WLAN=m -CONFIG_VIRT_WIFI=m - -# -# WiMAX Wireless Broadband devices -# -CONFIG_WIMAX_I2400M=m -CONFIG_WIMAX_I2400M_USB=m -CONFIG_WIMAX_I2400M_DEBUG_LEVEL=8 -# end of WiMAX Wireless Broadband devices - -# CONFIG_WAN is not set -CONFIG_IEEE802154_DRIVERS=m -CONFIG_IEEE802154_FAKELB=m -CONFIG_IEEE802154_AT86RF230=m -# CONFIG_IEEE802154_AT86RF230_DEBUGFS is not set -CONFIG_IEEE802154_MRF24J40=m -CONFIG_IEEE802154_CC2520=m -CONFIG_IEEE802154_ATUSB=m -CONFIG_IEEE802154_ADF7242=m -CONFIG_IEEE802154_CA8210=m -# CONFIG_IEEE802154_CA8210_DEBUGFS is not set -CONFIG_IEEE802154_MCR20A=m -CONFIG_IEEE802154_HWSIM=m -CONFIG_XEN_NETDEV_FRONTEND=m -CONFIG_XEN_NETDEV_BACKEND=m -CONFIG_VMXNET3=m -CONFIG_FUJITSU_ES=m -CONFIG_USB4_NET=m -CONFIG_HYPERV_NET=m -CONFIG_NETDEVSIM=m -CONFIG_NET_FAILOVER=m -CONFIG_ISDN=y -CONFIG_ISDN_CAPI=y -CONFIG_CAPI_TRACE=y -CONFIG_ISDN_CAPI_MIDDLEWARE=y -CONFIG_MISDN=m -CONFIG_MISDN_DSP=m -CONFIG_MISDN_L1OIP=m - -# -# mISDN hardware drivers -# -CONFIG_MISDN_HFCPCI=m -CONFIG_MISDN_HFCMULTI=m -CONFIG_MISDN_HFCUSB=m -CONFIG_MISDN_AVMFRITZ=m -CONFIG_MISDN_SPEEDFAX=m -CONFIG_MISDN_INFINEON=m -CONFIG_MISDN_W6692=m -CONFIG_MISDN_NETJET=m -CONFIG_MISDN_HDLC=m -CONFIG_MISDN_IPAC=m -CONFIG_MISDN_ISAR=m -CONFIG_NVM=y -CONFIG_NVM_PBLK=m -# CONFIG_NVM_PBLK_DEBUG is not set - -# -# Input device support -# 
-CONFIG_INPUT=y -CONFIG_INPUT_LEDS=m -CONFIG_INPUT_FF_MEMLESS=m -CONFIG_INPUT_POLLDEV=m -CONFIG_INPUT_SPARSEKMAP=m -CONFIG_INPUT_MATRIXKMAP=m - -# -# Userland interfaces -# -CONFIG_INPUT_MOUSEDEV=m -CONFIG_INPUT_MOUSEDEV_PSAUX=y -CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024 -CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768 -CONFIG_INPUT_JOYDEV=m -CONFIG_INPUT_EVDEV=m -# CONFIG_INPUT_EVBUG is not set - -# -# Input Device Drivers -# -CONFIG_INPUT_KEYBOARD=y -CONFIG_KEYBOARD_ADC=m -CONFIG_KEYBOARD_ADP5520=m -CONFIG_KEYBOARD_ADP5588=m -CONFIG_KEYBOARD_ADP5589=m -CONFIG_KEYBOARD_APPLESPI=m -CONFIG_KEYBOARD_ATKBD=m -CONFIG_KEYBOARD_QT1050=m -CONFIG_KEYBOARD_QT1070=m -CONFIG_KEYBOARD_QT2160=m -CONFIG_KEYBOARD_DLINK_DIR685=m -CONFIG_KEYBOARD_LKKBD=m -CONFIG_KEYBOARD_GPIO=m -CONFIG_KEYBOARD_GPIO_POLLED=m -CONFIG_KEYBOARD_TCA6416=m -CONFIG_KEYBOARD_TCA8418=m -CONFIG_KEYBOARD_MATRIX=m -CONFIG_KEYBOARD_LM8323=m -CONFIG_KEYBOARD_LM8333=m -CONFIG_KEYBOARD_MAX7359=m -CONFIG_KEYBOARD_MCS=m -CONFIG_KEYBOARD_MPR121=m -CONFIG_KEYBOARD_NEWTON=m -CONFIG_KEYBOARD_OPENCORES=m -CONFIG_KEYBOARD_SAMSUNG=m -CONFIG_KEYBOARD_STOWAWAY=m -CONFIG_KEYBOARD_SUNKBD=m -CONFIG_KEYBOARD_STMPE=m -CONFIG_KEYBOARD_IQS62X=m -CONFIG_KEYBOARD_OMAP4=m -CONFIG_KEYBOARD_TC3589X=m -CONFIG_KEYBOARD_TM2_TOUCHKEY=m -CONFIG_KEYBOARD_TWL4030=m -CONFIG_KEYBOARD_XTKBD=m -CONFIG_KEYBOARD_CROS_EC=m -CONFIG_KEYBOARD_CAP11XX=m -CONFIG_KEYBOARD_BCM=m -CONFIG_KEYBOARD_MTK_PMIC=m -CONFIG_INPUT_MOUSE=y -CONFIG_MOUSE_PS2=m -CONFIG_MOUSE_PS2_ALPS=y -CONFIG_MOUSE_PS2_BYD=y -CONFIG_MOUSE_PS2_LOGIPS2PP=y -CONFIG_MOUSE_PS2_SYNAPTICS=y -CONFIG_MOUSE_PS2_SYNAPTICS_SMBUS=y -CONFIG_MOUSE_PS2_CYPRESS=y -CONFIG_MOUSE_PS2_LIFEBOOK=y -CONFIG_MOUSE_PS2_TRACKPOINT=y -CONFIG_MOUSE_PS2_ELANTECH=y -CONFIG_MOUSE_PS2_ELANTECH_SMBUS=y -CONFIG_MOUSE_PS2_SENTELIC=y -CONFIG_MOUSE_PS2_TOUCHKIT=y -CONFIG_MOUSE_PS2_FOCALTECH=y -CONFIG_MOUSE_PS2_VMMOUSE=y -CONFIG_MOUSE_PS2_SMBUS=y -CONFIG_MOUSE_SERIAL=m -CONFIG_MOUSE_APPLETOUCH=m -CONFIG_MOUSE_BCM5974=m -CONFIG_MOUSE_CYAPA=m 
-CONFIG_MOUSE_ELAN_I2C=m -CONFIG_MOUSE_ELAN_I2C_I2C=y -CONFIG_MOUSE_ELAN_I2C_SMBUS=y -CONFIG_MOUSE_VSXXXAA=m -CONFIG_MOUSE_GPIO=m -CONFIG_MOUSE_SYNAPTICS_I2C=m -CONFIG_MOUSE_SYNAPTICS_USB=m -CONFIG_INPUT_JOYSTICK=y -CONFIG_JOYSTICK_ANALOG=m -CONFIG_JOYSTICK_A3D=m -# CONFIG_JOYSTICK_ADC is not set -CONFIG_JOYSTICK_ADI=m -CONFIG_JOYSTICK_COBRA=m -CONFIG_JOYSTICK_GF2K=m -CONFIG_JOYSTICK_GRIP=m -CONFIG_JOYSTICK_GRIP_MP=m -CONFIG_JOYSTICK_GUILLEMOT=m -CONFIG_JOYSTICK_INTERACT=m -CONFIG_JOYSTICK_SIDEWINDER=m -CONFIG_JOYSTICK_TMDC=m -CONFIG_JOYSTICK_IFORCE=m -CONFIG_JOYSTICK_IFORCE_USB=m -CONFIG_JOYSTICK_IFORCE_232=m -CONFIG_JOYSTICK_WARRIOR=m -CONFIG_JOYSTICK_MAGELLAN=m -CONFIG_JOYSTICK_SPACEORB=m -CONFIG_JOYSTICK_SPACEBALL=m -CONFIG_JOYSTICK_STINGER=m -CONFIG_JOYSTICK_TWIDJOY=m -CONFIG_JOYSTICK_ZHENHUA=m -CONFIG_JOYSTICK_DB9=m -CONFIG_JOYSTICK_GAMECON=m -CONFIG_JOYSTICK_TURBOGRAFX=m -CONFIG_JOYSTICK_AS5011=m -CONFIG_JOYSTICK_JOYDUMP=m -CONFIG_JOYSTICK_XPAD=m -CONFIG_JOYSTICK_XPAD_FF=y -CONFIG_JOYSTICK_XPAD_LEDS=y -CONFIG_JOYSTICK_WALKERA0701=m -CONFIG_JOYSTICK_PSXPAD_SPI=m -CONFIG_JOYSTICK_PSXPAD_SPI_FF=y -CONFIG_JOYSTICK_PXRC=m -CONFIG_JOYSTICK_FSIA6B=m -CONFIG_INPUT_TABLET=y -CONFIG_TABLET_USB_ACECAD=m -CONFIG_TABLET_USB_AIPTEK=m -CONFIG_TABLET_USB_GTCO=m -CONFIG_TABLET_USB_HANWANG=m -CONFIG_TABLET_USB_KBTAB=m -CONFIG_TABLET_USB_PEGASUS=m -CONFIG_TABLET_SERIAL_WACOM4=m -CONFIG_INPUT_TOUCHSCREEN=y -CONFIG_TOUCHSCREEN_PROPERTIES=y -CONFIG_TOUCHSCREEN_88PM860X=m -CONFIG_TOUCHSCREEN_ADS7846=m -CONFIG_TOUCHSCREEN_AD7877=m -CONFIG_TOUCHSCREEN_AD7879=m -CONFIG_TOUCHSCREEN_AD7879_I2C=m -CONFIG_TOUCHSCREEN_AD7879_SPI=m -CONFIG_TOUCHSCREEN_ADC=m -CONFIG_TOUCHSCREEN_AR1021_I2C=m -CONFIG_TOUCHSCREEN_ATMEL_MXT=m -CONFIG_TOUCHSCREEN_ATMEL_MXT_T37=y -CONFIG_TOUCHSCREEN_AUO_PIXCIR=m -CONFIG_TOUCHSCREEN_BU21013=m -CONFIG_TOUCHSCREEN_BU21029=m -CONFIG_TOUCHSCREEN_CHIPONE_ICN8318=m -CONFIG_TOUCHSCREEN_CHIPONE_ICN8505=m -CONFIG_TOUCHSCREEN_CY8CTMA140=m -CONFIG_TOUCHSCREEN_CY8CTMG110=m 
-CONFIG_TOUCHSCREEN_CYTTSP_CORE=m -CONFIG_TOUCHSCREEN_CYTTSP_I2C=m -CONFIG_TOUCHSCREEN_CYTTSP_SPI=m -CONFIG_TOUCHSCREEN_CYTTSP4_CORE=m -CONFIG_TOUCHSCREEN_CYTTSP4_I2C=m -CONFIG_TOUCHSCREEN_CYTTSP4_SPI=m -CONFIG_TOUCHSCREEN_DA9034=m -CONFIG_TOUCHSCREEN_DA9052=m -CONFIG_TOUCHSCREEN_DYNAPRO=m -CONFIG_TOUCHSCREEN_HAMPSHIRE=m -CONFIG_TOUCHSCREEN_EETI=m -CONFIG_TOUCHSCREEN_EGALAX=m -CONFIG_TOUCHSCREEN_EGALAX_SERIAL=m -CONFIG_TOUCHSCREEN_EXC3000=m -CONFIG_TOUCHSCREEN_FUJITSU=m -CONFIG_TOUCHSCREEN_GOODIX=m -CONFIG_TOUCHSCREEN_HIDEEP=m -CONFIG_TOUCHSCREEN_ILI210X=m -CONFIG_TOUCHSCREEN_S6SY761=m -CONFIG_TOUCHSCREEN_GUNZE=m -CONFIG_TOUCHSCREEN_EKTF2127=m -CONFIG_TOUCHSCREEN_ELAN=m -CONFIG_TOUCHSCREEN_ELO=m -CONFIG_TOUCHSCREEN_WACOM_W8001=m -CONFIG_TOUCHSCREEN_WACOM_I2C=m -CONFIG_TOUCHSCREEN_MAX11801=m -CONFIG_TOUCHSCREEN_MCS5000=m -CONFIG_TOUCHSCREEN_MMS114=m -CONFIG_TOUCHSCREEN_MELFAS_MIP4=m -CONFIG_TOUCHSCREEN_MTOUCH=m -CONFIG_TOUCHSCREEN_IMX6UL_TSC=m -CONFIG_TOUCHSCREEN_INEXIO=m -CONFIG_TOUCHSCREEN_MK712=m -CONFIG_TOUCHSCREEN_PENMOUNT=m -CONFIG_TOUCHSCREEN_EDT_FT5X06=m -CONFIG_TOUCHSCREEN_TOUCHRIGHT=m -CONFIG_TOUCHSCREEN_TOUCHWIN=m -CONFIG_TOUCHSCREEN_TI_AM335X_TSC=m -CONFIG_TOUCHSCREEN_UCB1400=m -CONFIG_TOUCHSCREEN_PIXCIR=m -CONFIG_TOUCHSCREEN_WDT87XX_I2C=m -CONFIG_TOUCHSCREEN_WM831X=m -CONFIG_TOUCHSCREEN_WM97XX=m -CONFIG_TOUCHSCREEN_WM9705=y -CONFIG_TOUCHSCREEN_WM9712=y -CONFIG_TOUCHSCREEN_WM9713=y -CONFIG_TOUCHSCREEN_USB_COMPOSITE=m -CONFIG_TOUCHSCREEN_MC13783=m -CONFIG_TOUCHSCREEN_USB_EGALAX=y -CONFIG_TOUCHSCREEN_USB_PANJIT=y -CONFIG_TOUCHSCREEN_USB_3M=y -CONFIG_TOUCHSCREEN_USB_ITM=y -CONFIG_TOUCHSCREEN_USB_ETURBO=y -CONFIG_TOUCHSCREEN_USB_GUNZE=y -CONFIG_TOUCHSCREEN_USB_DMC_TSC10=y -CONFIG_TOUCHSCREEN_USB_IRTOUCH=y -CONFIG_TOUCHSCREEN_USB_IDEALTEK=y -CONFIG_TOUCHSCREEN_USB_GENERAL_TOUCH=y -CONFIG_TOUCHSCREEN_USB_GOTOP=y -CONFIG_TOUCHSCREEN_USB_JASTEC=y -CONFIG_TOUCHSCREEN_USB_ELO=y -CONFIG_TOUCHSCREEN_USB_E2I=y -CONFIG_TOUCHSCREEN_USB_ZYTRONIC=y 
-CONFIG_TOUCHSCREEN_USB_ETT_TC45USB=y -CONFIG_TOUCHSCREEN_USB_NEXIO=y -CONFIG_TOUCHSCREEN_USB_EASYTOUCH=y -CONFIG_TOUCHSCREEN_TOUCHIT213=m -CONFIG_TOUCHSCREEN_TSC_SERIO=m -CONFIG_TOUCHSCREEN_TSC200X_CORE=m -CONFIG_TOUCHSCREEN_TSC2004=m -CONFIG_TOUCHSCREEN_TSC2005=m -CONFIG_TOUCHSCREEN_TSC2007=m -CONFIG_TOUCHSCREEN_TSC2007_IIO=y -CONFIG_TOUCHSCREEN_PCAP=m -CONFIG_TOUCHSCREEN_RM_TS=m -CONFIG_TOUCHSCREEN_SILEAD=m -CONFIG_TOUCHSCREEN_SIS_I2C=m -CONFIG_TOUCHSCREEN_ST1232=m -CONFIG_TOUCHSCREEN_STMFTS=m -CONFIG_TOUCHSCREEN_STMPE=m -CONFIG_TOUCHSCREEN_SUR40=m -CONFIG_TOUCHSCREEN_SURFACE3_SPI=m -CONFIG_TOUCHSCREEN_SX8654=m -CONFIG_TOUCHSCREEN_TPS6507X=m -CONFIG_TOUCHSCREEN_ZET6223=m -CONFIG_TOUCHSCREEN_ZFORCE=m -CONFIG_TOUCHSCREEN_COLIBRI_VF50=m -CONFIG_TOUCHSCREEN_ROHM_BU21023=m -CONFIG_TOUCHSCREEN_IQS5XX=m -# CONFIG_TOUCHSCREEN_ZINITIX is not set -CONFIG_INPUT_MISC=y -CONFIG_INPUT_88PM860X_ONKEY=m -CONFIG_INPUT_88PM80X_ONKEY=m -CONFIG_INPUT_AD714X=m -CONFIG_INPUT_AD714X_I2C=m -CONFIG_INPUT_AD714X_SPI=m -CONFIG_INPUT_ARIZONA_HAPTICS=m -CONFIG_INPUT_ATMEL_CAPTOUCH=m -CONFIG_INPUT_BMA150=m -CONFIG_INPUT_E3X0_BUTTON=m -CONFIG_INPUT_PCSPKR=m -CONFIG_INPUT_MAX77650_ONKEY=m -CONFIG_INPUT_MAX77693_HAPTIC=m -CONFIG_INPUT_MAX8925_ONKEY=m -CONFIG_INPUT_MAX8997_HAPTIC=m -CONFIG_INPUT_MC13783_PWRBUTTON=m -CONFIG_INPUT_MMA8450=m -CONFIG_INPUT_APANEL=m -CONFIG_INPUT_GPIO_BEEPER=m -CONFIG_INPUT_GPIO_DECODER=m -CONFIG_INPUT_GPIO_VIBRA=m -CONFIG_INPUT_CPCAP_PWRBUTTON=m -CONFIG_INPUT_ATLAS_BTNS=m -CONFIG_INPUT_ATI_REMOTE2=m -CONFIG_INPUT_KEYSPAN_REMOTE=m -CONFIG_INPUT_KXTJ9=m -CONFIG_INPUT_POWERMATE=m -CONFIG_INPUT_YEALINK=m -CONFIG_INPUT_CM109=m -CONFIG_INPUT_REGULATOR_HAPTIC=m -CONFIG_INPUT_RETU_PWRBUTTON=m -CONFIG_INPUT_TPS65218_PWRBUTTON=m -CONFIG_INPUT_AXP20X_PEK=m -CONFIG_INPUT_TWL4030_PWRBUTTON=m -CONFIG_INPUT_TWL4030_VIBRA=m -CONFIG_INPUT_TWL6040_VIBRA=m -CONFIG_INPUT_UINPUT=m -CONFIG_INPUT_PALMAS_PWRBUTTON=m -CONFIG_INPUT_PCF50633_PMU=m -CONFIG_INPUT_PCF8574=m 
-CONFIG_INPUT_PWM_BEEPER=m -CONFIG_INPUT_PWM_VIBRA=m -CONFIG_INPUT_RK805_PWRKEY=m -CONFIG_INPUT_GPIO_ROTARY_ENCODER=m -CONFIG_INPUT_DA9052_ONKEY=m -CONFIG_INPUT_DA9055_ONKEY=m -CONFIG_INPUT_DA9063_ONKEY=m -CONFIG_INPUT_WM831X_ON=m -CONFIG_INPUT_PCAP=m -CONFIG_INPUT_ADXL34X=m -CONFIG_INPUT_ADXL34X_I2C=m -CONFIG_INPUT_ADXL34X_SPI=m -CONFIG_INPUT_IMS_PCU=m -CONFIG_INPUT_IQS269A=m -CONFIG_INPUT_CMA3000=m -CONFIG_INPUT_CMA3000_I2C=m -CONFIG_INPUT_XEN_KBDDEV_FRONTEND=m -CONFIG_INPUT_IDEAPAD_SLIDEBAR=m -CONFIG_INPUT_SOC_BUTTON_ARRAY=m -CONFIG_INPUT_DRV260X_HAPTICS=m -CONFIG_INPUT_DRV2665_HAPTICS=m -CONFIG_INPUT_DRV2667_HAPTICS=m -CONFIG_INPUT_RAVE_SP_PWRBUTTON=m -CONFIG_INPUT_STPMIC1_ONKEY=m -CONFIG_RMI4_CORE=m -CONFIG_RMI4_I2C=m -CONFIG_RMI4_SPI=m -CONFIG_RMI4_SMB=m -CONFIG_RMI4_F03=y -CONFIG_RMI4_F03_SERIO=m -CONFIG_RMI4_2D_SENSOR=y -CONFIG_RMI4_F11=y -CONFIG_RMI4_F12=y -CONFIG_RMI4_F30=y -CONFIG_RMI4_F34=y -# CONFIG_RMI4_F3A is not set -# CONFIG_RMI4_F54 is not set -CONFIG_RMI4_F55=y - -# -# Hardware I/O ports -# -CONFIG_SERIO=m -CONFIG_ARCH_MIGHT_HAVE_PC_SERIO=y -CONFIG_SERIO_I8042=m -CONFIG_SERIO_SERPORT=m -CONFIG_SERIO_CT82C710=m -CONFIG_SERIO_PARKBD=m -CONFIG_SERIO_PCIPS2=m -CONFIG_SERIO_LIBPS2=m -CONFIG_SERIO_RAW=m -CONFIG_SERIO_ALTERA_PS2=m -CONFIG_SERIO_PS2MULT=m -CONFIG_SERIO_ARC_PS2=m -# CONFIG_SERIO_APBPS2 is not set -CONFIG_HYPERV_KEYBOARD=m -CONFIG_SERIO_GPIO_PS2=m -CONFIG_USERIO=m -CONFIG_GAMEPORT=m -CONFIG_GAMEPORT_NS558=m -CONFIG_GAMEPORT_L4=m -CONFIG_GAMEPORT_EMU10K1=m -CONFIG_GAMEPORT_FM801=m -# end of Hardware I/O ports -# end of Input device support - -# -# Character devices -# -CONFIG_TTY=y -CONFIG_VT=y -CONFIG_CONSOLE_TRANSLATIONS=y -CONFIG_VT_CONSOLE=y -CONFIG_VT_CONSOLE_SLEEP=y -CONFIG_HW_CONSOLE=y -CONFIG_VT_HW_CONSOLE_BINDING=y -CONFIG_UNIX98_PTYS=y -# CONFIG_LEGACY_PTYS is not set -CONFIG_LDISC_AUTOLOAD=y - -# -# Serial drivers -# -CONFIG_SERIAL_EARLYCON=y -CONFIG_SERIAL_8250=y -# CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set 
-CONFIG_SERIAL_8250_PNP=y -# CONFIG_SERIAL_8250_16550A_VARIANTS is not set -CONFIG_SERIAL_8250_FINTEK=y -CONFIG_SERIAL_8250_CONSOLE=y -CONFIG_SERIAL_8250_DMA=y -CONFIG_SERIAL_8250_PCI=y -CONFIG_SERIAL_8250_EXAR=m -CONFIG_SERIAL_8250_CS=m -CONFIG_SERIAL_8250_MEN_MCB=m -CONFIG_SERIAL_8250_NR_UARTS=32 -CONFIG_SERIAL_8250_RUNTIME_UARTS=4 -CONFIG_SERIAL_8250_EXTENDED=y -CONFIG_SERIAL_8250_MANY_PORTS=y -CONFIG_SERIAL_8250_ASPEED_VUART=m -CONFIG_SERIAL_8250_SHARE_IRQ=y -# CONFIG_SERIAL_8250_DETECT_IRQ is not set -CONFIG_SERIAL_8250_RSA=y -CONFIG_SERIAL_8250_DWLIB=y -CONFIG_SERIAL_8250_DW=m -CONFIG_SERIAL_8250_RT288X=y -CONFIG_SERIAL_8250_LPSS=y -CONFIG_SERIAL_8250_MID=y -CONFIG_SERIAL_OF_PLATFORM=m - -# -# Non-8250 serial port support -# -CONFIG_SERIAL_MAX3100=m -CONFIG_SERIAL_MAX310X=m -CONFIG_SERIAL_UARTLITE=m -CONFIG_SERIAL_UARTLITE_NR_UARTS=1 -CONFIG_SERIAL_CORE=y -CONFIG_SERIAL_CORE_CONSOLE=y -CONFIG_SERIAL_JSM=m -CONFIG_SERIAL_SIFIVE=m -CONFIG_SERIAL_LANTIQ=m -CONFIG_SERIAL_SCCNXP=m -CONFIG_SERIAL_SC16IS7XX_CORE=m -CONFIG_SERIAL_SC16IS7XX=m -CONFIG_SERIAL_SC16IS7XX_I2C=y -CONFIG_SERIAL_SC16IS7XX_SPI=y -CONFIG_SERIAL_ALTERA_JTAGUART=m -CONFIG_SERIAL_ALTERA_UART=m -CONFIG_SERIAL_ALTERA_UART_MAXPORTS=4 -CONFIG_SERIAL_ALTERA_UART_BAUDRATE=115200 -CONFIG_SERIAL_IFX6X60=m -CONFIG_SERIAL_XILINX_PS_UART=m -CONFIG_SERIAL_ARC=m -CONFIG_SERIAL_ARC_NR_PORTS=1 -CONFIG_SERIAL_RP2=m -CONFIG_SERIAL_RP2_NR_UARTS=32 -CONFIG_SERIAL_FSL_LPUART=m -CONFIG_SERIAL_FSL_LINFLEXUART=m -CONFIG_SERIAL_CONEXANT_DIGICOLOR=m -CONFIG_SERIAL_MEN_Z135=m -CONFIG_SERIAL_SPRD=m -# end of Serial drivers - -CONFIG_SERIAL_MCTRL_GPIO=y -CONFIG_SERIAL_NONSTANDARD=y -CONFIG_ROCKETPORT=m -CONFIG_CYCLADES=m -CONFIG_CYZ_INTR=y -CONFIG_MOXA_INTELLIO=m -CONFIG_MOXA_SMARTIO=m -CONFIG_SYNCLINK=m -CONFIG_SYNCLINKMP=m -CONFIG_SYNCLINK_GT=m -CONFIG_ISI=m -CONFIG_N_HDLC=m -CONFIG_N_GSM=m -CONFIG_NOZOMI=m -CONFIG_NULL_TTY=m -CONFIG_TRACE_ROUTER=m -CONFIG_TRACE_SINK=m -CONFIG_HVC_DRIVER=y -CONFIG_HVC_IRQ=y 
-CONFIG_HVC_XEN=y -CONFIG_HVC_XEN_FRONTEND=y -CONFIG_SERIAL_DEV_BUS=y -CONFIG_SERIAL_DEV_CTRL_TTYPORT=y -# CONFIG_TTY_PRINTK is not set -CONFIG_PRINTER=m -# CONFIG_LP_CONSOLE is not set -CONFIG_PPDEV=m -CONFIG_VIRTIO_CONSOLE=m -CONFIG_IPMI_HANDLER=m -CONFIG_IPMI_DMI_DECODE=y -CONFIG_IPMI_PLAT_DATA=y -# CONFIG_IPMI_PANIC_EVENT is not set -CONFIG_IPMI_DEVICE_INTERFACE=m -CONFIG_IPMI_SI=m -CONFIG_IPMI_SSIF=m -CONFIG_IPMI_WATCHDOG=m -CONFIG_IPMI_POWEROFF=m -CONFIG_IPMB_DEVICE_INTERFACE=m -CONFIG_HW_RANDOM=m -CONFIG_HW_RANDOM_TIMERIOMEM=m -CONFIG_HW_RANDOM_INTEL=m -CONFIG_HW_RANDOM_AMD=m -# CONFIG_HW_RANDOM_BA431 is not set -CONFIG_HW_RANDOM_VIA=m -CONFIG_HW_RANDOM_VIRTIO=m -CONFIG_HW_RANDOM_CCTRNG=m -# CONFIG_HW_RANDOM_XIPHERA is not set -CONFIG_APPLICOM=m - -# -# PCMCIA character devices -# -CONFIG_SYNCLINK_CS=m -CONFIG_CARDMAN_4000=m -CONFIG_CARDMAN_4040=m -CONFIG_SCR24X=m -CONFIG_IPWIRELESS=m -# end of PCMCIA character devices - -CONFIG_MWAVE=m -CONFIG_DEVMEM=y -# CONFIG_DEVKMEM is not set -CONFIG_NVRAM=m -CONFIG_RAW_DRIVER=m -CONFIG_MAX_RAW_DEVS=256 -CONFIG_DEVPORT=y -CONFIG_HPET=y -CONFIG_HPET_MMAP=y -CONFIG_HPET_MMAP_DEFAULT=y -CONFIG_HANGCHECK_TIMER=m -CONFIG_TCG_TPM=m -CONFIG_HW_RANDOM_TPM=y -CONFIG_TCG_TIS_CORE=m -CONFIG_TCG_TIS=m -CONFIG_TCG_TIS_SPI=m -CONFIG_TCG_TIS_SPI_CR50=y -CONFIG_TCG_TIS_I2C_ATMEL=m -CONFIG_TCG_TIS_I2C_INFINEON=m -CONFIG_TCG_TIS_I2C_NUVOTON=m -CONFIG_TCG_NSC=m -CONFIG_TCG_ATMEL=m -CONFIG_TCG_INFINEON=m -CONFIG_TCG_XEN=m -CONFIG_TCG_CRB=m -CONFIG_TCG_VTPM_PROXY=m -CONFIG_TCG_TIS_ST33ZP24=m -CONFIG_TCG_TIS_ST33ZP24_I2C=m -CONFIG_TCG_TIS_ST33ZP24_SPI=m -CONFIG_TELCLOCK=m -CONFIG_XILLYBUS=m -CONFIG_XILLYBUS_PCIE=m -CONFIG_XILLYBUS_OF=m -# end of Character devices - -# CONFIG_RANDOM_TRUST_CPU is not set -# CONFIG_RANDOM_TRUST_BOOTLOADER is not set - -# -# I2C support -# -CONFIG_I2C=y -CONFIG_ACPI_I2C_OPREGION=y -CONFIG_I2C_BOARDINFO=y -CONFIG_I2C_COMPAT=y -CONFIG_I2C_CHARDEV=m -CONFIG_I2C_MUX=m - -# -# Multiplexer I2C Chip support -# 
-CONFIG_I2C_ARB_GPIO_CHALLENGE=m -CONFIG_I2C_MUX_GPIO=m -CONFIG_I2C_MUX_GPMUX=m -CONFIG_I2C_MUX_LTC4306=m -CONFIG_I2C_MUX_PCA9541=m -CONFIG_I2C_MUX_PCA954x=m -CONFIG_I2C_MUX_PINCTRL=m -CONFIG_I2C_MUX_REG=m -CONFIG_I2C_DEMUX_PINCTRL=m -CONFIG_I2C_MUX_MLXCPLD=m -# end of Multiplexer I2C Chip support - -CONFIG_I2C_HELPER_AUTO=y -CONFIG_I2C_SMBUS=m -CONFIG_I2C_ALGOBIT=m -CONFIG_I2C_ALGOPCA=m - -# -# I2C Hardware Bus support -# - -# -# PC SMBus host controller drivers -# -CONFIG_I2C_ALI1535=m -CONFIG_I2C_ALI1563=m -CONFIG_I2C_ALI15X3=m -CONFIG_I2C_AMD756=m -CONFIG_I2C_AMD756_S4882=m -CONFIG_I2C_AMD8111=m -CONFIG_I2C_AMD_MP2=m -CONFIG_I2C_I801=m -CONFIG_I2C_ISCH=m -CONFIG_I2C_ISMT=m -CONFIG_I2C_PIIX4=m -CONFIG_I2C_CHT_WC=m -CONFIG_I2C_NFORCE2=m -CONFIG_I2C_NFORCE2_S4985=m -CONFIG_I2C_NVIDIA_GPU=m -CONFIG_I2C_SIS5595=m -CONFIG_I2C_SIS630=m -CONFIG_I2C_SIS96X=m -CONFIG_I2C_VIA=m -CONFIG_I2C_VIAPRO=m - -# -# ACPI drivers -# -CONFIG_I2C_SCMI=m - -# -# I2C system bus drivers (mostly embedded / system-on-chip) -# -CONFIG_I2C_CBUS_GPIO=m -CONFIG_I2C_DESIGNWARE_CORE=y -CONFIG_I2C_DESIGNWARE_SLAVE=y -CONFIG_I2C_DESIGNWARE_PLATFORM=y -CONFIG_I2C_DESIGNWARE_BAYTRAIL=y -CONFIG_I2C_DESIGNWARE_PCI=m -CONFIG_I2C_EMEV2=m -CONFIG_I2C_GPIO=m -# CONFIG_I2C_GPIO_FAULT_INJECTOR is not set -CONFIG_I2C_KEMPLD=m -CONFIG_I2C_OCORES=m -CONFIG_I2C_PCA_PLATFORM=m -CONFIG_I2C_RK3X=m -CONFIG_I2C_SIMTEC=m -CONFIG_I2C_XILINX=m - -# -# External I2C/SMBus adapter drivers -# -CONFIG_I2C_DIOLAN_U2C=m -CONFIG_I2C_DLN2=m -CONFIG_I2C_PARPORT=m -CONFIG_I2C_ROBOTFUZZ_OSIF=m -CONFIG_I2C_TAOS_EVM=m -CONFIG_I2C_TINY_USB=m -CONFIG_I2C_VIPERBOARD=m - -# -# Other I2C/SMBus bus drivers -# -CONFIG_I2C_MLXCPLD=m -CONFIG_I2C_CROS_EC_TUNNEL=m -CONFIG_I2C_FSI=m -# end of I2C Hardware Bus support - -CONFIG_I2C_STUB=m -CONFIG_I2C_SLAVE=y -CONFIG_I2C_SLAVE_EEPROM=m -# CONFIG_I2C_SLAVE_TESTUNIT is not set -# CONFIG_I2C_DEBUG_CORE is not set -# CONFIG_I2C_DEBUG_ALGO is not set -# CONFIG_I2C_DEBUG_BUS is not set -# end of I2C 
support - -CONFIG_I3C=m -CONFIG_CDNS_I3C_MASTER=m -CONFIG_DW_I3C_MASTER=m -CONFIG_SPI=y -# CONFIG_SPI_DEBUG is not set -CONFIG_SPI_MASTER=y -CONFIG_SPI_MEM=y - -# -# SPI Master Controller Drivers -# -CONFIG_SPI_ALTERA=m -CONFIG_SPI_AXI_SPI_ENGINE=m -CONFIG_SPI_BITBANG=m -CONFIG_SPI_BUTTERFLY=m -CONFIG_SPI_CADENCE=m -CONFIG_SPI_DESIGNWARE=m -CONFIG_SPI_DW_DMA=y -CONFIG_SPI_DW_PCI=m -CONFIG_SPI_DW_MMIO=m -CONFIG_SPI_DLN2=m -CONFIG_SPI_FSI=m -CONFIG_SPI_NXP_FLEXSPI=m -CONFIG_SPI_GPIO=m -CONFIG_SPI_LM70_LLP=m -CONFIG_SPI_FSL_LIB=m -CONFIG_SPI_FSL_SPI=m -# CONFIG_SPI_LANTIQ_SSC is not set -CONFIG_SPI_OC_TINY=m -CONFIG_SPI_PXA2XX=m -CONFIG_SPI_PXA2XX_PCI=m -CONFIG_SPI_ROCKCHIP=m -CONFIG_SPI_SC18IS602=m -CONFIG_SPI_SIFIVE=m -CONFIG_SPI_MXIC=m -CONFIG_SPI_XCOMM=m -CONFIG_SPI_XILINX=m -CONFIG_SPI_ZYNQMP_GQSPI=m -CONFIG_SPI_AMD=m - -# -# SPI Multiplexer support -# -CONFIG_SPI_MUX=m - -# -# SPI Protocol Masters -# -CONFIG_SPI_SPIDEV=m -CONFIG_SPI_LOOPBACK_TEST=m -CONFIG_SPI_TLE62X0=m -CONFIG_SPI_SLAVE=y -CONFIG_SPI_SLAVE_TIME=m -CONFIG_SPI_SLAVE_SYSTEM_CONTROL=m -CONFIG_SPI_DYNAMIC=y -CONFIG_SPMI=m -CONFIG_HSI=m -CONFIG_HSI_BOARDINFO=y - -# -# HSI controllers -# - -# -# HSI clients -# -CONFIG_HSI_CHAR=m -CONFIG_PPS=y -# CONFIG_PPS_DEBUG is not set - -# -# PPS clients support -# -CONFIG_PPS_CLIENT_KTIMER=m -CONFIG_PPS_CLIENT_LDISC=m -CONFIG_PPS_CLIENT_PARPORT=m -CONFIG_PPS_CLIENT_GPIO=m - -# -# PPS generators support -# - -# -# PTP clock support -# -CONFIG_PTP_1588_CLOCK=y -CONFIG_DP83640_PHY=m -CONFIG_PTP_1588_CLOCK_INES=m -CONFIG_PTP_1588_CLOCK_KVM=m -CONFIG_PTP_1588_CLOCK_IDT82P33=m -CONFIG_PTP_1588_CLOCK_IDTCM=m -CONFIG_PTP_1588_CLOCK_VMW=m -# end of PTP clock support - -CONFIG_PINCTRL=y -CONFIG_GENERIC_PINCTRL_GROUPS=y -CONFIG_PINMUX=y -CONFIG_GENERIC_PINMUX_FUNCTIONS=y -CONFIG_PINCONF=y -CONFIG_GENERIC_PINCONF=y -# CONFIG_DEBUG_PINCTRL is not set -CONFIG_PINCTRL_AS3722=m -CONFIG_PINCTRL_AXP209=m -CONFIG_PINCTRL_AMD=m -CONFIG_PINCTRL_DA9062=m 
-CONFIG_PINCTRL_MCP23S08_I2C=m -CONFIG_PINCTRL_MCP23S08_SPI=m -CONFIG_PINCTRL_MCP23S08=m -CONFIG_PINCTRL_SINGLE=m -CONFIG_PINCTRL_SX150X=y -CONFIG_PINCTRL_STMFX=m -CONFIG_PINCTRL_MAX77620=m -CONFIG_PINCTRL_PALMAS=m -CONFIG_PINCTRL_RK805=m -CONFIG_PINCTRL_OCELOT=y -CONFIG_PINCTRL_BAYTRAIL=y -CONFIG_PINCTRL_CHERRYVIEW=y -CONFIG_PINCTRL_LYNXPOINT=y -CONFIG_PINCTRL_INTEL=y -CONFIG_PINCTRL_BROXTON=y -CONFIG_PINCTRL_CANNONLAKE=y -CONFIG_PINCTRL_CEDARFORK=y -CONFIG_PINCTRL_DENVERTON=y -# CONFIG_PINCTRL_EMMITSBURG is not set -CONFIG_PINCTRL_GEMINILAKE=y -CONFIG_PINCTRL_ICELAKE=y -CONFIG_PINCTRL_JASPERLAKE=y -CONFIG_PINCTRL_LEWISBURG=y -CONFIG_PINCTRL_SUNRISEPOINT=y -CONFIG_PINCTRL_TIGERLAKE=y - -# -# Renesas pinctrl drivers -# -# end of Renesas pinctrl drivers - -CONFIG_PINCTRL_LOCHNAGAR=m -CONFIG_PINCTRL_MADERA=m -CONFIG_PINCTRL_CS47L15=y -CONFIG_PINCTRL_CS47L35=y -CONFIG_PINCTRL_CS47L85=y -CONFIG_PINCTRL_CS47L90=y -CONFIG_PINCTRL_CS47L92=y -CONFIG_PINCTRL_EQUILIBRIUM=m -CONFIG_GPIOLIB=y -CONFIG_GPIOLIB_FASTPATH_LIMIT=512 -CONFIG_OF_GPIO=y -CONFIG_GPIO_ACPI=y -CONFIG_GPIOLIB_IRQCHIP=y -# CONFIG_DEBUG_GPIO is not set -CONFIG_GPIO_SYSFS=y -CONFIG_GPIO_CDEV=y -CONFIG_GPIO_CDEV_V1=y -CONFIG_GPIO_GENERIC=y -CONFIG_GPIO_MAX730X=m - -# -# Memory mapped GPIO drivers -# -CONFIG_GPIO_74XX_MMIO=m -CONFIG_GPIO_ALTERA=m -CONFIG_GPIO_AMDPT=m -CONFIG_GPIO_CADENCE=m -CONFIG_GPIO_DWAPB=m -CONFIG_GPIO_EXAR=m -CONFIG_GPIO_FTGPIO010=y -CONFIG_GPIO_GENERIC_PLATFORM=m -CONFIG_GPIO_GRGPIO=m -CONFIG_GPIO_HLWD=m -CONFIG_GPIO_ICH=m -CONFIG_GPIO_LOGICVC=m -CONFIG_GPIO_MB86S7X=m -CONFIG_GPIO_MENZ127=m -CONFIG_GPIO_SAMA5D2_PIOBU=m -CONFIG_GPIO_SIFIVE=y -CONFIG_GPIO_SIOX=m -CONFIG_GPIO_SYSCON=m -CONFIG_GPIO_VX855=m -CONFIG_GPIO_WCD934X=m -CONFIG_GPIO_XILINX=m -CONFIG_GPIO_AMD_FCH=m -# end of Memory mapped GPIO drivers - -# -# Port-mapped I/O GPIO drivers -# -CONFIG_GPIO_F7188X=m -CONFIG_GPIO_IT87=m -CONFIG_GPIO_SCH=m -CONFIG_GPIO_SCH311X=m -CONFIG_GPIO_WINBOND=m -CONFIG_GPIO_WS16C48=m -# end of 
Port-mapped I/O GPIO drivers - -# -# I2C GPIO expanders -# -CONFIG_GPIO_ADP5588=m -CONFIG_GPIO_ADNP=m -CONFIG_GPIO_GW_PLD=m -CONFIG_GPIO_MAX7300=m -CONFIG_GPIO_MAX732X=m -CONFIG_GPIO_PCA953X=m -CONFIG_GPIO_PCA953X_IRQ=y -CONFIG_GPIO_PCA9570=m -CONFIG_GPIO_PCF857X=m -CONFIG_GPIO_TPIC2810=m -# end of I2C GPIO expanders - -# -# MFD GPIO expanders -# -CONFIG_GPIO_ADP5520=m -CONFIG_GPIO_ARIZONA=m -CONFIG_GPIO_BD70528=m -CONFIG_GPIO_BD71828=m -CONFIG_GPIO_BD9571MWV=m -CONFIG_GPIO_CRYSTAL_COVE=m -CONFIG_GPIO_DA9052=m -CONFIG_GPIO_DA9055=m -CONFIG_GPIO_DLN2=m -CONFIG_GPIO_JANZ_TTL=m -CONFIG_GPIO_KEMPLD=m -CONFIG_GPIO_LP3943=m -CONFIG_GPIO_LP873X=m -CONFIG_GPIO_LP87565=m -CONFIG_GPIO_MADERA=m -CONFIG_GPIO_MAX77620=m -CONFIG_GPIO_MAX77650=m -CONFIG_GPIO_MSIC=y -CONFIG_GPIO_PALMAS=y -CONFIG_GPIO_RC5T583=y -CONFIG_GPIO_STMPE=y -CONFIG_GPIO_TC3589X=y -CONFIG_GPIO_TPS65086=m -CONFIG_GPIO_TPS65218=m -CONFIG_GPIO_TPS6586X=y -CONFIG_GPIO_TPS65910=y -CONFIG_GPIO_TPS65912=m -CONFIG_GPIO_TPS68470=y -CONFIG_GPIO_TQMX86=m -CONFIG_GPIO_TWL4030=m -CONFIG_GPIO_TWL6040=m -CONFIG_GPIO_UCB1400=m -CONFIG_GPIO_WHISKEY_COVE=m -CONFIG_GPIO_WM831X=m -CONFIG_GPIO_WM8350=m -CONFIG_GPIO_WM8994=m -# end of MFD GPIO expanders - -# -# PCI GPIO expanders -# -CONFIG_GPIO_AMD8111=m -CONFIG_GPIO_ML_IOH=m -CONFIG_GPIO_PCI_IDIO_16=m -CONFIG_GPIO_PCIE_IDIO_24=m -CONFIG_GPIO_RDC321X=m -CONFIG_GPIO_SODAVILLE=y -# end of PCI GPIO expanders - -# -# SPI GPIO expanders -# -CONFIG_GPIO_74X164=m -CONFIG_GPIO_MAX3191X=m -CONFIG_GPIO_MAX7301=m -CONFIG_GPIO_MC33880=m -CONFIG_GPIO_PISOSR=m -CONFIG_GPIO_XRA1403=m -CONFIG_GPIO_MOXTET=m -# end of SPI GPIO expanders - -# -# USB GPIO expanders -# -CONFIG_GPIO_VIPERBOARD=m -# end of USB GPIO expanders - -CONFIG_GPIO_AGGREGATOR=m -CONFIG_GPIO_MOCKUP=m -CONFIG_W1=m -CONFIG_W1_CON=y - -# -# 1-wire Bus Masters -# -CONFIG_W1_MASTER_MATROX=m -CONFIG_W1_MASTER_DS2490=m -CONFIG_W1_MASTER_DS2482=m -CONFIG_W1_MASTER_DS1WM=m -CONFIG_W1_MASTER_GPIO=m -CONFIG_W1_MASTER_SGI=m -# end of 
1-wire Bus Masters - -# -# 1-wire Slaves -# -CONFIG_W1_SLAVE_THERM=m -CONFIG_W1_SLAVE_SMEM=m -CONFIG_W1_SLAVE_DS2405=m -CONFIG_W1_SLAVE_DS2408=m -# CONFIG_W1_SLAVE_DS2408_READBACK is not set -CONFIG_W1_SLAVE_DS2413=m -CONFIG_W1_SLAVE_DS2406=m -CONFIG_W1_SLAVE_DS2423=m -CONFIG_W1_SLAVE_DS2805=m -CONFIG_W1_SLAVE_DS2430=m -CONFIG_W1_SLAVE_DS2431=m -CONFIG_W1_SLAVE_DS2433=m -# CONFIG_W1_SLAVE_DS2433_CRC is not set -CONFIG_W1_SLAVE_DS2438=m -CONFIG_W1_SLAVE_DS250X=m -CONFIG_W1_SLAVE_DS2780=m -CONFIG_W1_SLAVE_DS2781=m -CONFIG_W1_SLAVE_DS28E04=m -CONFIG_W1_SLAVE_DS28E17=m -# end of 1-wire Slaves - -CONFIG_POWER_RESET=y -CONFIG_POWER_RESET_AS3722=y -CONFIG_POWER_RESET_GPIO=y -CONFIG_POWER_RESET_GPIO_RESTART=y -CONFIG_POWER_RESET_LTC2952=y -CONFIG_POWER_RESET_MT6323=y -CONFIG_POWER_RESET_RESTART=y -CONFIG_POWER_RESET_SYSCON=y -CONFIG_POWER_RESET_SYSCON_POWEROFF=y -CONFIG_REBOOT_MODE=m -CONFIG_SYSCON_REBOOT_MODE=m -CONFIG_NVMEM_REBOOT_MODE=m -CONFIG_POWER_SUPPLY=y -# CONFIG_POWER_SUPPLY_DEBUG is not set -CONFIG_POWER_SUPPLY_HWMON=y -CONFIG_PDA_POWER=m -CONFIG_GENERIC_ADC_BATTERY=m -CONFIG_MAX8925_POWER=m -CONFIG_WM831X_BACKUP=m -CONFIG_WM831X_POWER=m -CONFIG_WM8350_POWER=m -CONFIG_TEST_POWER=m -CONFIG_BATTERY_88PM860X=m -CONFIG_CHARGER_ADP5061=m -CONFIG_BATTERY_ACT8945A=m -CONFIG_BATTERY_CPCAP=m -CONFIG_BATTERY_CW2015=m -CONFIG_BATTERY_DS2760=m -CONFIG_BATTERY_DS2780=m -CONFIG_BATTERY_DS2781=m -CONFIG_BATTERY_DS2782=m -CONFIG_BATTERY_SBS=m -CONFIG_CHARGER_SBS=m -CONFIG_MANAGER_SBS=m -CONFIG_BATTERY_BQ27XXX=m -CONFIG_BATTERY_BQ27XXX_I2C=m -CONFIG_BATTERY_BQ27XXX_HDQ=m -# CONFIG_BATTERY_BQ27XXX_DT_UPDATES_NVM is not set -CONFIG_BATTERY_DA9030=m -CONFIG_BATTERY_DA9052=m -CONFIG_CHARGER_DA9150=m -CONFIG_BATTERY_DA9150=m -CONFIG_CHARGER_AXP20X=m -CONFIG_BATTERY_AXP20X=m -CONFIG_AXP20X_POWER=m -CONFIG_AXP288_CHARGER=m -CONFIG_AXP288_FUEL_GAUGE=m -CONFIG_BATTERY_MAX17040=m -CONFIG_BATTERY_MAX17042=m -CONFIG_BATTERY_MAX1721X=m -CONFIG_BATTERY_TWL4030_MADC=m 
-CONFIG_CHARGER_88PM860X=m -CONFIG_CHARGER_PCF50633=m -CONFIG_BATTERY_RX51=m -CONFIG_CHARGER_ISP1704=m -CONFIG_CHARGER_MAX8903=m -CONFIG_CHARGER_TWL4030=m -CONFIG_CHARGER_LP8727=m -CONFIG_CHARGER_LP8788=m -CONFIG_CHARGER_GPIO=m -CONFIG_CHARGER_MANAGER=y -CONFIG_CHARGER_LT3651=m -CONFIG_CHARGER_MAX14577=m -CONFIG_CHARGER_DETECTOR_MAX14656=m -CONFIG_CHARGER_MAX77650=m -CONFIG_CHARGER_MAX77693=m -CONFIG_CHARGER_MAX8997=m -CONFIG_CHARGER_MAX8998=m -CONFIG_CHARGER_MP2629=m -CONFIG_CHARGER_BQ2415X=m -CONFIG_CHARGER_BQ24190=m -CONFIG_CHARGER_BQ24257=m -CONFIG_CHARGER_BQ24735=m -# CONFIG_CHARGER_BQ2515X is not set -CONFIG_CHARGER_BQ25890=m -# CONFIG_CHARGER_BQ25980 is not set -CONFIG_CHARGER_SMB347=m -CONFIG_CHARGER_TPS65090=m -CONFIG_CHARGER_TPS65217=m -CONFIG_BATTERY_GAUGE_LTC2941=m -CONFIG_BATTERY_RT5033=m -CONFIG_CHARGER_RT9455=m -CONFIG_CHARGER_CROS_USBPD=m -CONFIG_CHARGER_UCS1002=m -CONFIG_CHARGER_BD70528=m -CONFIG_CHARGER_BD99954=m -CONFIG_CHARGER_WILCO=m -# CONFIG_RN5T618_POWER is not set -CONFIG_HWMON=y -CONFIG_HWMON_VID=m -# CONFIG_HWMON_DEBUG_CHIP is not set - -# -# Native drivers -# -CONFIG_SENSORS_ABITUGURU=m -CONFIG_SENSORS_ABITUGURU3=m -CONFIG_SENSORS_AD7314=m -CONFIG_SENSORS_AD7414=m -CONFIG_SENSORS_AD7418=m -CONFIG_SENSORS_ADM1021=m -CONFIG_SENSORS_ADM1025=m -CONFIG_SENSORS_ADM1026=m -CONFIG_SENSORS_ADM1029=m -CONFIG_SENSORS_ADM1031=m -CONFIG_SENSORS_ADM1177=m -CONFIG_SENSORS_ADM9240=m -CONFIG_SENSORS_ADT7X10=m -CONFIG_SENSORS_ADT7310=m -CONFIG_SENSORS_ADT7410=m -CONFIG_SENSORS_ADT7411=m -CONFIG_SENSORS_ADT7462=m -CONFIG_SENSORS_ADT7470=m -CONFIG_SENSORS_ADT7475=m -CONFIG_SENSORS_AS370=m -CONFIG_SENSORS_ASC7621=m -CONFIG_SENSORS_AXI_FAN_CONTROL=m -CONFIG_SENSORS_K8TEMP=m -CONFIG_SENSORS_K10TEMP=m -CONFIG_SENSORS_FAM15H_POWER=m -CONFIG_SENSORS_AMD_ENERGY=m -CONFIG_SENSORS_APPLESMC=m -CONFIG_SENSORS_ASB100=m -CONFIG_SENSORS_ASPEED=m -CONFIG_SENSORS_ATXP1=m -CONFIG_SENSORS_CORSAIR_CPRO=m -CONFIG_SENSORS_DRIVETEMP=m -CONFIG_SENSORS_DS620=m 
-CONFIG_SENSORS_DS1621=m -CONFIG_SENSORS_DELL_SMM=m -CONFIG_SENSORS_DA9052_ADC=m -CONFIG_SENSORS_DA9055=m -CONFIG_SENSORS_I5K_AMB=m -CONFIG_SENSORS_F71805F=m -CONFIG_SENSORS_F71882FG=m -CONFIG_SENSORS_F75375S=m -CONFIG_SENSORS_GSC=m -CONFIG_SENSORS_MC13783_ADC=m -CONFIG_SENSORS_FSCHMD=m -CONFIG_SENSORS_FTSTEUTATES=m -CONFIG_SENSORS_GL518SM=m -CONFIG_SENSORS_GL520SM=m -CONFIG_SENSORS_G760A=m -CONFIG_SENSORS_G762=m -CONFIG_SENSORS_GPIO_FAN=m -CONFIG_SENSORS_HIH6130=m -CONFIG_SENSORS_IBMAEM=m -CONFIG_SENSORS_IBMPEX=m -CONFIG_SENSORS_IIO_HWMON=m -CONFIG_SENSORS_I5500=m -CONFIG_SENSORS_CORETEMP=m -CONFIG_SENSORS_IT87=m -CONFIG_SENSORS_JC42=m -CONFIG_SENSORS_POWR1220=m -CONFIG_SENSORS_LINEAGE=m -CONFIG_SENSORS_LOCHNAGAR=m -CONFIG_SENSORS_LTC2945=m -CONFIG_SENSORS_LTC2947=m -CONFIG_SENSORS_LTC2947_I2C=m -CONFIG_SENSORS_LTC2947_SPI=m -CONFIG_SENSORS_LTC2990=m -CONFIG_SENSORS_LTC4151=m -CONFIG_SENSORS_LTC4215=m -CONFIG_SENSORS_LTC4222=m -CONFIG_SENSORS_LTC4245=m -CONFIG_SENSORS_LTC4260=m -CONFIG_SENSORS_LTC4261=m -CONFIG_SENSORS_MAX1111=m -CONFIG_SENSORS_MAX16065=m -CONFIG_SENSORS_MAX1619=m -CONFIG_SENSORS_MAX1668=m -CONFIG_SENSORS_MAX197=m -CONFIG_SENSORS_MAX31722=m -CONFIG_SENSORS_MAX31730=m -CONFIG_SENSORS_MAX6621=m -CONFIG_SENSORS_MAX6639=m -CONFIG_SENSORS_MAX6642=m -CONFIG_SENSORS_MAX6650=m -CONFIG_SENSORS_MAX6697=m -CONFIG_SENSORS_MAX31790=m -CONFIG_SENSORS_MCP3021=m -CONFIG_SENSORS_MLXREG_FAN=m -CONFIG_SENSORS_TC654=m -CONFIG_SENSORS_MENF21BMC_HWMON=m -# CONFIG_SENSORS_MR75203 is not set -CONFIG_SENSORS_ADCXX=m -CONFIG_SENSORS_LM63=m -CONFIG_SENSORS_LM70=m -CONFIG_SENSORS_LM73=m -CONFIG_SENSORS_LM75=m -CONFIG_SENSORS_LM77=m -CONFIG_SENSORS_LM78=m -CONFIG_SENSORS_LM80=m -CONFIG_SENSORS_LM83=m -CONFIG_SENSORS_LM85=m -CONFIG_SENSORS_LM87=m -CONFIG_SENSORS_LM90=m -CONFIG_SENSORS_LM92=m -CONFIG_SENSORS_LM93=m -CONFIG_SENSORS_LM95234=m -CONFIG_SENSORS_LM95241=m -CONFIG_SENSORS_LM95245=m -CONFIG_SENSORS_PC87360=m -CONFIG_SENSORS_PC87427=m -CONFIG_SENSORS_NTC_THERMISTOR=m 
-CONFIG_SENSORS_NCT6683=m -CONFIG_SENSORS_NCT6775=m -CONFIG_SENSORS_NCT7802=m -CONFIG_SENSORS_NCT7904=m -CONFIG_SENSORS_NPCM7XX=m -CONFIG_SENSORS_PCF8591=m -CONFIG_PMBUS=m -CONFIG_SENSORS_PMBUS=m -# CONFIG_SENSORS_ADM1266 is not set -CONFIG_SENSORS_ADM1275=m -CONFIG_SENSORS_BEL_PFE=m -CONFIG_SENSORS_IBM_CFFPS=m -CONFIG_SENSORS_INSPUR_IPSPS=m -CONFIG_SENSORS_IR35221=m -CONFIG_SENSORS_IR38064=m -CONFIG_SENSORS_IRPS5401=m -CONFIG_SENSORS_ISL68137=m -CONFIG_SENSORS_LM25066=m -CONFIG_SENSORS_LTC2978=m -# CONFIG_SENSORS_LTC2978_REGULATOR is not set -CONFIG_SENSORS_LTC3815=m -CONFIG_SENSORS_MAX16064=m -CONFIG_SENSORS_MAX16601=m -CONFIG_SENSORS_MAX20730=m -CONFIG_SENSORS_MAX20751=m -CONFIG_SENSORS_MAX31785=m -CONFIG_SENSORS_MAX34440=m -CONFIG_SENSORS_MAX8688=m -# CONFIG_SENSORS_MP2975 is not set -CONFIG_SENSORS_PXE1610=m -CONFIG_SENSORS_TPS40422=m -CONFIG_SENSORS_TPS53679=m -CONFIG_SENSORS_UCD9000=m -CONFIG_SENSORS_UCD9200=m -CONFIG_SENSORS_XDPE122=m -CONFIG_SENSORS_ZL6100=m -CONFIG_SENSORS_PWM_FAN=m -CONFIG_SENSORS_SHT15=m -CONFIG_SENSORS_SHT21=m -CONFIG_SENSORS_SHT3x=m -CONFIG_SENSORS_SHTC1=m -CONFIG_SENSORS_SIS5595=m -CONFIG_SENSORS_DME1737=m -CONFIG_SENSORS_EMC1403=m -CONFIG_SENSORS_EMC2103=m -CONFIG_SENSORS_EMC6W201=m -CONFIG_SENSORS_SMSC47M1=m -CONFIG_SENSORS_SMSC47M192=m -CONFIG_SENSORS_SMSC47B397=m -CONFIG_SENSORS_SCH56XX_COMMON=m -CONFIG_SENSORS_SCH5627=m -CONFIG_SENSORS_SCH5636=m -CONFIG_SENSORS_STTS751=m -CONFIG_SENSORS_SMM665=m -CONFIG_SENSORS_ADC128D818=m -CONFIG_SENSORS_ADS7828=m -CONFIG_SENSORS_ADS7871=m -CONFIG_SENSORS_AMC6821=m -CONFIG_SENSORS_INA209=m -CONFIG_SENSORS_INA2XX=m -CONFIG_SENSORS_INA3221=m -CONFIG_SENSORS_TC74=m -CONFIG_SENSORS_THMC50=m -CONFIG_SENSORS_TMP102=m -CONFIG_SENSORS_TMP103=m -CONFIG_SENSORS_TMP108=m -CONFIG_SENSORS_TMP401=m -CONFIG_SENSORS_TMP421=m -CONFIG_SENSORS_TMP513=m -CONFIG_SENSORS_VIA_CPUTEMP=m -CONFIG_SENSORS_VIA686A=m -CONFIG_SENSORS_VT1211=m -CONFIG_SENSORS_VT8231=m -CONFIG_SENSORS_W83773G=m -CONFIG_SENSORS_W83781D=m 
-CONFIG_SENSORS_W83791D=m -CONFIG_SENSORS_W83792D=m -CONFIG_SENSORS_W83793=m -CONFIG_SENSORS_W83795=m -# CONFIG_SENSORS_W83795_FANCTRL is not set -CONFIG_SENSORS_W83L785TS=m -CONFIG_SENSORS_W83L786NG=m -CONFIG_SENSORS_W83627HF=m -CONFIG_SENSORS_W83627EHF=m -CONFIG_SENSORS_WM831X=m -CONFIG_SENSORS_WM8350=m -CONFIG_SENSORS_XGENE=m - -# -# ACPI drivers -# -CONFIG_SENSORS_ACPI_POWER=m -CONFIG_SENSORS_ATK0110=m -CONFIG_THERMAL=y -# CONFIG_THERMAL_NETLINK is not set -# CONFIG_THERMAL_STATISTICS is not set -CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS=100 -CONFIG_THERMAL_HWMON=y -CONFIG_THERMAL_OF=y -CONFIG_THERMAL_WRITABLE_TRIPS=y -CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE=y -# CONFIG_THERMAL_DEFAULT_GOV_FAIR_SHARE is not set -# CONFIG_THERMAL_DEFAULT_GOV_USER_SPACE is not set -# CONFIG_THERMAL_DEFAULT_GOV_POWER_ALLOCATOR is not set -CONFIG_THERMAL_GOV_FAIR_SHARE=y -CONFIG_THERMAL_GOV_STEP_WISE=y -CONFIG_THERMAL_GOV_BANG_BANG=y -CONFIG_THERMAL_GOV_USER_SPACE=y -CONFIG_THERMAL_GOV_POWER_ALLOCATOR=y -CONFIG_CPU_THERMAL=y -CONFIG_CPU_FREQ_THERMAL=y -CONFIG_CPU_IDLE_THERMAL=y -CONFIG_DEVFREQ_THERMAL=y -# CONFIG_THERMAL_EMULATION is not set -CONFIG_THERMAL_MMIO=m -CONFIG_MAX77620_THERMAL=m -CONFIG_DA9062_THERMAL=m - -# -# Intel thermal drivers -# -CONFIG_INTEL_POWERCLAMP=m -CONFIG_X86_PKG_TEMP_THERMAL=m -CONFIG_INTEL_SOC_DTS_IOSF_CORE=m -CONFIG_INTEL_SOC_DTS_THERMAL=m - -# -# ACPI INT340X thermal drivers -# -CONFIG_INT340X_THERMAL=m -CONFIG_ACPI_THERMAL_REL=m -CONFIG_INT3406_THERMAL=m -CONFIG_PROC_THERMAL_MMIO_RAPL=y -# end of ACPI INT340X thermal drivers - -CONFIG_INTEL_BXT_PMIC_THERMAL=m -CONFIG_INTEL_PCH_THERMAL=m -# end of Intel thermal drivers - -# CONFIG_TI_SOC_THERMAL is not set -CONFIG_GENERIC_ADC_THERMAL=m -CONFIG_WATCHDOG=y -CONFIG_WATCHDOG_CORE=y -# CONFIG_WATCHDOG_NOWAYOUT is not set -CONFIG_WATCHDOG_HANDLE_BOOT_ENABLED=y -CONFIG_WATCHDOG_OPEN_TIMEOUT=0 -CONFIG_WATCHDOG_SYSFS=y - -# -# Watchdog Pretimeout Governors -# -CONFIG_WATCHDOG_PRETIMEOUT_GOV=y 
-CONFIG_WATCHDOG_PRETIMEOUT_GOV_SEL=m -CONFIG_WATCHDOG_PRETIMEOUT_GOV_NOOP=m -CONFIG_WATCHDOG_PRETIMEOUT_GOV_PANIC=y -# CONFIG_WATCHDOG_PRETIMEOUT_DEFAULT_GOV_NOOP is not set -CONFIG_WATCHDOG_PRETIMEOUT_DEFAULT_GOV_PANIC=y - -# -# Watchdog Device Drivers -# -CONFIG_SOFT_WATCHDOG=m -# CONFIG_SOFT_WATCHDOG_PRETIMEOUT is not set -CONFIG_BD70528_WATCHDOG=m -CONFIG_DA9052_WATCHDOG=m -CONFIG_DA9055_WATCHDOG=m -CONFIG_DA9063_WATCHDOG=m -CONFIG_DA9062_WATCHDOG=m -CONFIG_GPIO_WATCHDOG=m -CONFIG_MENF21BMC_WATCHDOG=m -CONFIG_MENZ069_WATCHDOG=m -CONFIG_WDAT_WDT=m -CONFIG_WM831X_WATCHDOG=m -CONFIG_WM8350_WATCHDOG=m -CONFIG_XILINX_WATCHDOG=m -CONFIG_ZIIRAVE_WATCHDOG=m -CONFIG_RAVE_SP_WATCHDOG=m -CONFIG_MLX_WDT=m -CONFIG_CADENCE_WATCHDOG=m -CONFIG_DW_WATCHDOG=m -CONFIG_RN5T618_WATCHDOG=m -CONFIG_TWL4030_WATCHDOG=m -CONFIG_MAX63XX_WATCHDOG=m -CONFIG_MAX77620_WATCHDOG=m -CONFIG_RETU_WATCHDOG=m -CONFIG_STPMIC1_WATCHDOG=m -CONFIG_ACQUIRE_WDT=m -CONFIG_ADVANTECH_WDT=m -CONFIG_ALIM1535_WDT=m -CONFIG_ALIM7101_WDT=m -CONFIG_EBC_C384_WDT=m -CONFIG_F71808E_WDT=m -CONFIG_SP5100_TCO=m -CONFIG_SBC_FITPC2_WATCHDOG=m -CONFIG_EUROTECH_WDT=m -CONFIG_IB700_WDT=m -CONFIG_IBMASR=m -CONFIG_WAFER_WDT=m -CONFIG_I6300ESB_WDT=m -CONFIG_IE6XX_WDT=m -CONFIG_ITCO_WDT=m -CONFIG_ITCO_VENDOR_SUPPORT=y -CONFIG_IT8712F_WDT=m -CONFIG_IT87_WDT=m -CONFIG_HP_WATCHDOG=m -CONFIG_HPWDT_NMI_DECODING=y -CONFIG_KEMPLD_WDT=m -CONFIG_SC1200_WDT=m -CONFIG_PC87413_WDT=m -CONFIG_NV_TCO=m -CONFIG_60XX_WDT=m -CONFIG_CPU5_WDT=m -CONFIG_SMSC_SCH311X_WDT=m -CONFIG_SMSC37B787_WDT=m -CONFIG_TQMX86_WDT=m -CONFIG_VIA_WDT=m -CONFIG_W83627HF_WDT=m -CONFIG_W83877F_WDT=m -CONFIG_W83977F_WDT=m -CONFIG_MACHZ_WDT=m -CONFIG_SBC_EPX_C3_WATCHDOG=m -CONFIG_INTEL_MEI_WDT=m -CONFIG_NI903X_WDT=m -CONFIG_NIC7018_WDT=m -CONFIG_MEN_A21_WDT=m -CONFIG_XEN_WDT=m - -# -# PCI-based Watchdog Cards -# -CONFIG_PCIPCWATCHDOG=m -CONFIG_WDTPCI=m - -# -# USB-based Watchdog Cards -# -CONFIG_USBPCWATCHDOG=m -CONFIG_SSB_POSSIBLE=y -CONFIG_SSB=m -CONFIG_SSB_SPROM=y 
-CONFIG_SSB_BLOCKIO=y -CONFIG_SSB_PCIHOST_POSSIBLE=y -CONFIG_SSB_PCIHOST=y -CONFIG_SSB_B43_PCI_BRIDGE=y -CONFIG_SSB_PCMCIAHOST_POSSIBLE=y -CONFIG_SSB_PCMCIAHOST=y -CONFIG_SSB_SDIOHOST_POSSIBLE=y -CONFIG_SSB_SDIOHOST=y -CONFIG_SSB_DRIVER_PCICORE_POSSIBLE=y -CONFIG_SSB_DRIVER_PCICORE=y -CONFIG_SSB_DRIVER_GPIO=y -CONFIG_BCMA_POSSIBLE=y -CONFIG_BCMA=m -CONFIG_BCMA_BLOCKIO=y -CONFIG_BCMA_HOST_PCI_POSSIBLE=y -CONFIG_BCMA_HOST_PCI=y -# CONFIG_BCMA_HOST_SOC is not set -CONFIG_BCMA_DRIVER_PCI=y -CONFIG_BCMA_DRIVER_GMAC_CMN=y -CONFIG_BCMA_DRIVER_GPIO=y -# CONFIG_BCMA_DEBUG is not set - -# -# Multifunction device drivers -# -CONFIG_MFD_CORE=y -CONFIG_MFD_ACT8945A=m -CONFIG_MFD_AS3711=y -CONFIG_MFD_AS3722=m -CONFIG_PMIC_ADP5520=y -CONFIG_MFD_AAT2870_CORE=y -CONFIG_MFD_ATMEL_FLEXCOM=m -CONFIG_MFD_ATMEL_HLCDC=m -CONFIG_MFD_BCM590XX=m -CONFIG_MFD_BD9571MWV=m -CONFIG_MFD_AXP20X=m -CONFIG_MFD_AXP20X_I2C=m -CONFIG_MFD_CROS_EC_DEV=m -CONFIG_MFD_MADERA=m -CONFIG_MFD_MADERA_I2C=m -CONFIG_MFD_MADERA_SPI=m -CONFIG_MFD_CS47L15=y -CONFIG_MFD_CS47L35=y -CONFIG_MFD_CS47L85=y -CONFIG_MFD_CS47L90=y -CONFIG_MFD_CS47L92=y -CONFIG_PMIC_DA903X=y -CONFIG_PMIC_DA9052=y -CONFIG_MFD_DA9052_SPI=y -CONFIG_MFD_DA9052_I2C=y -CONFIG_MFD_DA9055=y -CONFIG_MFD_DA9062=m -CONFIG_MFD_DA9063=m -CONFIG_MFD_DA9150=m -CONFIG_MFD_DLN2=m -CONFIG_MFD_GATEWORKS_GSC=m -CONFIG_MFD_MC13XXX=m -CONFIG_MFD_MC13XXX_SPI=m -CONFIG_MFD_MC13XXX_I2C=m -CONFIG_MFD_MP2629=m -CONFIG_MFD_HI6421_PMIC=m -CONFIG_HTC_PASIC3=m -CONFIG_HTC_I2CPLD=y -CONFIG_MFD_INTEL_QUARK_I2C_GPIO=m -CONFIG_LPC_ICH=m -CONFIG_LPC_SCH=m -CONFIG_INTEL_SOC_PMIC=y -CONFIG_INTEL_SOC_PMIC_BXTWC=m -CONFIG_INTEL_SOC_PMIC_CHTWC=y -CONFIG_INTEL_SOC_PMIC_CHTDC_TI=m -CONFIG_INTEL_SOC_PMIC_MRFLD=m -CONFIG_MFD_INTEL_LPSS=m -CONFIG_MFD_INTEL_LPSS_ACPI=m -CONFIG_MFD_INTEL_LPSS_PCI=m -CONFIG_MFD_INTEL_MSIC=y -CONFIG_MFD_INTEL_PMC_BXT=m -CONFIG_MFD_IQS62X=m -CONFIG_MFD_JANZ_CMODIO=m -CONFIG_MFD_KEMPLD=m -CONFIG_MFD_88PM800=m -CONFIG_MFD_88PM805=m -CONFIG_MFD_88PM860X=y 
-CONFIG_MFD_MAX14577=m -CONFIG_MFD_MAX77620=y -CONFIG_MFD_MAX77650=m -CONFIG_MFD_MAX77686=m -CONFIG_MFD_MAX77693=m -CONFIG_MFD_MAX77843=y -CONFIG_MFD_MAX8907=m -CONFIG_MFD_MAX8925=y -CONFIG_MFD_MAX8997=y -CONFIG_MFD_MAX8998=y -CONFIG_MFD_MT6360=m -CONFIG_MFD_MT6397=m -CONFIG_MFD_MENF21BMC=m -CONFIG_EZX_PCAP=y -CONFIG_MFD_CPCAP=m -CONFIG_MFD_VIPERBOARD=m -CONFIG_MFD_RETU=m -CONFIG_MFD_PCF50633=m -CONFIG_PCF50633_ADC=m -CONFIG_PCF50633_GPIO=m -CONFIG_UCB1400_CORE=m -CONFIG_MFD_RDC321X=m -CONFIG_MFD_RT5033=m -CONFIG_MFD_RC5T583=y -CONFIG_MFD_RK808=m -CONFIG_MFD_RN5T618=m -CONFIG_MFD_SEC_CORE=y -CONFIG_MFD_SI476X_CORE=m -# CONFIG_MFD_SL28CPLD is not set -CONFIG_MFD_SM501=m -CONFIG_MFD_SM501_GPIO=y -CONFIG_MFD_SKY81452=m -CONFIG_ABX500_CORE=y -CONFIG_AB3100_CORE=y -CONFIG_AB3100_OTP=y -CONFIG_MFD_STMPE=y - -# -# STMicroelectronics STMPE Interface Drivers -# -CONFIG_STMPE_I2C=y -CONFIG_STMPE_SPI=y -# end of STMicroelectronics STMPE Interface Drivers - -CONFIG_MFD_SYSCON=y -CONFIG_MFD_TI_AM335X_TSCADC=m -CONFIG_MFD_LP3943=m -CONFIG_MFD_LP8788=y -CONFIG_MFD_TI_LMU=m -CONFIG_MFD_PALMAS=y -CONFIG_TPS6105X=m -CONFIG_TPS65010=m -CONFIG_TPS6507X=m -CONFIG_MFD_TPS65086=m -CONFIG_MFD_TPS65090=y -CONFIG_MFD_TPS65217=m -CONFIG_MFD_TPS68470=y -CONFIG_MFD_TI_LP873X=m -CONFIG_MFD_TI_LP87565=m -CONFIG_MFD_TPS65218=m -CONFIG_MFD_TPS6586X=y -CONFIG_MFD_TPS65910=y -CONFIG_MFD_TPS65912=m -CONFIG_MFD_TPS65912_I2C=m -CONFIG_MFD_TPS65912_SPI=m -CONFIG_MFD_TPS80031=y -CONFIG_TWL4030_CORE=y -CONFIG_MFD_TWL4030_AUDIO=y -CONFIG_TWL6040_CORE=y -CONFIG_MFD_WL1273_CORE=m -CONFIG_MFD_LM3533=m -CONFIG_MFD_TC3589X=y -CONFIG_MFD_TQMX86=m -CONFIG_MFD_VX855=m -CONFIG_MFD_LOCHNAGAR=y -CONFIG_MFD_ARIZONA=y -CONFIG_MFD_ARIZONA_I2C=m -CONFIG_MFD_ARIZONA_SPI=m -CONFIG_MFD_CS47L24=y -CONFIG_MFD_WM5102=y -CONFIG_MFD_WM5110=y -CONFIG_MFD_WM8997=y -CONFIG_MFD_WM8998=y -CONFIG_MFD_WM8400=y -CONFIG_MFD_WM831X=y -CONFIG_MFD_WM831X_I2C=y -CONFIG_MFD_WM831X_SPI=y -CONFIG_MFD_WM8350=y -CONFIG_MFD_WM8350_I2C=y 
-CONFIG_MFD_WM8994=m -CONFIG_MFD_ROHM_BD718XX=m -CONFIG_MFD_ROHM_BD70528=m -CONFIG_MFD_ROHM_BD71828=m -CONFIG_MFD_STPMIC1=m -CONFIG_MFD_STMFX=m -CONFIG_MFD_WCD934X=m -CONFIG_RAVE_SP_CORE=m -# CONFIG_MFD_INTEL_M10_BMC is not set -# end of Multifunction device drivers - -CONFIG_REGULATOR=y -# CONFIG_REGULATOR_DEBUG is not set -CONFIG_REGULATOR_FIXED_VOLTAGE=m -CONFIG_REGULATOR_VIRTUAL_CONSUMER=m -CONFIG_REGULATOR_USERSPACE_CONSUMER=m -CONFIG_REGULATOR_88PG86X=m -CONFIG_REGULATOR_88PM800=m -CONFIG_REGULATOR_88PM8607=m -CONFIG_REGULATOR_ACT8865=m -CONFIG_REGULATOR_ACT8945A=m -CONFIG_REGULATOR_AD5398=m -CONFIG_REGULATOR_AAT2870=m -CONFIG_REGULATOR_AB3100=m -CONFIG_REGULATOR_ARIZONA_LDO1=m -CONFIG_REGULATOR_ARIZONA_MICSUPP=m -CONFIG_REGULATOR_AS3711=m -CONFIG_REGULATOR_AS3722=m -CONFIG_REGULATOR_AXP20X=m -CONFIG_REGULATOR_BCM590XX=m -CONFIG_REGULATOR_BD70528=m -CONFIG_REGULATOR_BD71828=m -CONFIG_REGULATOR_BD718XX=m -CONFIG_REGULATOR_BD9571MWV=m -CONFIG_REGULATOR_CPCAP=m -CONFIG_REGULATOR_CROS_EC=m -CONFIG_REGULATOR_DA903X=m -CONFIG_REGULATOR_DA9052=m -CONFIG_REGULATOR_DA9055=m -CONFIG_REGULATOR_DA9062=m -CONFIG_REGULATOR_DA9063=m -CONFIG_REGULATOR_DA9210=m -CONFIG_REGULATOR_DA9211=m -CONFIG_REGULATOR_FAN53555=m -CONFIG_REGULATOR_FAN53880=m -CONFIG_REGULATOR_GPIO=m -CONFIG_REGULATOR_HI6421=m -CONFIG_REGULATOR_HI6421V530=m -CONFIG_REGULATOR_ISL9305=m -CONFIG_REGULATOR_ISL6271A=m -CONFIG_REGULATOR_LM363X=m -CONFIG_REGULATOR_LOCHNAGAR=m -CONFIG_REGULATOR_LP3971=m -CONFIG_REGULATOR_LP3972=m -CONFIG_REGULATOR_LP872X=m -CONFIG_REGULATOR_LP873X=m -CONFIG_REGULATOR_LP8755=m -CONFIG_REGULATOR_LP87565=m -CONFIG_REGULATOR_LP8788=m -CONFIG_REGULATOR_LTC3589=m -CONFIG_REGULATOR_LTC3676=m -CONFIG_REGULATOR_MAX14577=m -CONFIG_REGULATOR_MAX1586=m -CONFIG_REGULATOR_MAX77620=m -CONFIG_REGULATOR_MAX77650=m -CONFIG_REGULATOR_MAX8649=m -CONFIG_REGULATOR_MAX8660=m -CONFIG_REGULATOR_MAX8907=m -CONFIG_REGULATOR_MAX8925=m -CONFIG_REGULATOR_MAX8952=m -CONFIG_REGULATOR_MAX8973=m 
-CONFIG_REGULATOR_MAX8997=m -CONFIG_REGULATOR_MAX8998=m -CONFIG_REGULATOR_MAX77686=m -CONFIG_REGULATOR_MAX77693=m -CONFIG_REGULATOR_MAX77802=m -CONFIG_REGULATOR_MAX77826=m -CONFIG_REGULATOR_MC13XXX_CORE=m -CONFIG_REGULATOR_MC13783=m -CONFIG_REGULATOR_MC13892=m -CONFIG_REGULATOR_MCP16502=m -CONFIG_REGULATOR_MP5416=m -CONFIG_REGULATOR_MP8859=m -CONFIG_REGULATOR_MP886X=m -CONFIG_REGULATOR_MPQ7920=m -CONFIG_REGULATOR_MT6311=m -CONFIG_REGULATOR_MT6323=m -CONFIG_REGULATOR_MT6358=m -# CONFIG_REGULATOR_MT6360 is not set -CONFIG_REGULATOR_MT6397=m -CONFIG_REGULATOR_PALMAS=m -CONFIG_REGULATOR_PCA9450=m -CONFIG_REGULATOR_PCAP=m -CONFIG_REGULATOR_PCF50633=m -CONFIG_REGULATOR_PFUZE100=m -CONFIG_REGULATOR_PV88060=m -CONFIG_REGULATOR_PV88080=m -CONFIG_REGULATOR_PV88090=m -CONFIG_REGULATOR_PWM=m -CONFIG_REGULATOR_QCOM_SPMI=m -CONFIG_REGULATOR_QCOM_USB_VBUS=m -# CONFIG_REGULATOR_RASPBERRYPI_TOUCHSCREEN_ATTINY is not set -CONFIG_REGULATOR_RC5T583=m -CONFIG_REGULATOR_RK808=m -CONFIG_REGULATOR_RN5T618=m -CONFIG_REGULATOR_ROHM=m -# CONFIG_REGULATOR_RT4801 is not set -CONFIG_REGULATOR_RT5033=m -# CONFIG_REGULATOR_RTMV20 is not set -CONFIG_REGULATOR_S2MPA01=m -CONFIG_REGULATOR_S2MPS11=m -CONFIG_REGULATOR_S5M8767=m -CONFIG_REGULATOR_SKY81452=m -CONFIG_REGULATOR_SLG51000=m -CONFIG_REGULATOR_STPMIC1=m -CONFIG_REGULATOR_SY8106A=m -CONFIG_REGULATOR_SY8824X=m -CONFIG_REGULATOR_SY8827N=m -CONFIG_REGULATOR_TPS51632=m -CONFIG_REGULATOR_TPS6105X=m -CONFIG_REGULATOR_TPS62360=m -CONFIG_REGULATOR_TPS65023=m -CONFIG_REGULATOR_TPS6507X=m -CONFIG_REGULATOR_TPS65086=m -CONFIG_REGULATOR_TPS65090=m -CONFIG_REGULATOR_TPS65132=m -CONFIG_REGULATOR_TPS65217=m -CONFIG_REGULATOR_TPS65218=m -CONFIG_REGULATOR_TPS6524X=m -CONFIG_REGULATOR_TPS6586X=m -CONFIG_REGULATOR_TPS65910=m -CONFIG_REGULATOR_TPS65912=m -CONFIG_REGULATOR_TPS80031=m -CONFIG_REGULATOR_TWL4030=m -CONFIG_REGULATOR_VCTRL=m -CONFIG_REGULATOR_WM831X=m -CONFIG_REGULATOR_WM8350=m -CONFIG_REGULATOR_WM8400=m -CONFIG_REGULATOR_WM8994=m 
-CONFIG_REGULATOR_QCOM_LABIBB=m -CONFIG_RC_CORE=m -CONFIG_RC_MAP=m -CONFIG_LIRC=y -CONFIG_RC_DECODERS=y -CONFIG_IR_NEC_DECODER=m -CONFIG_IR_RC5_DECODER=m -CONFIG_IR_RC6_DECODER=m -CONFIG_IR_JVC_DECODER=m -CONFIG_IR_SONY_DECODER=m -CONFIG_IR_SANYO_DECODER=m -CONFIG_IR_SHARP_DECODER=m -CONFIG_IR_MCE_KBD_DECODER=m -CONFIG_IR_XMP_DECODER=m -CONFIG_IR_IMON_DECODER=m -CONFIG_IR_RCMM_DECODER=m -CONFIG_RC_DEVICES=y -CONFIG_RC_ATI_REMOTE=m -CONFIG_IR_ENE=m -CONFIG_IR_HIX5HD2=m -CONFIG_IR_IMON=m -CONFIG_IR_IMON_RAW=m -CONFIG_IR_MCEUSB=m -CONFIG_IR_ITE_CIR=m -CONFIG_IR_FINTEK=m -CONFIG_IR_NUVOTON=m -CONFIG_IR_REDRAT3=m -CONFIG_IR_SPI=m -CONFIG_IR_STREAMZAP=m -CONFIG_IR_WINBOND_CIR=m -CONFIG_IR_IGORPLUGUSB=m -CONFIG_IR_IGUANA=m -CONFIG_IR_TTUSBIR=m -CONFIG_RC_LOOPBACK=m -CONFIG_IR_GPIO_CIR=m -CONFIG_IR_GPIO_TX=m -CONFIG_IR_PWM_TX=m -CONFIG_IR_SERIAL=m -CONFIG_IR_SERIAL_TRANSMITTER=y -CONFIG_IR_SIR=m -CONFIG_RC_XBOX_DVD=m -CONFIG_IR_TOY=m -CONFIG_CEC_CORE=m -CONFIG_CEC_NOTIFIER=y -CONFIG_CEC_PIN=y -CONFIG_MEDIA_CEC_RC=y -# CONFIG_CEC_PIN_ERROR_INJ is not set -CONFIG_MEDIA_CEC_SUPPORT=y -CONFIG_CEC_CH7322=m -CONFIG_CEC_CROS_EC=m -CONFIG_CEC_GPIO=m -CONFIG_CEC_SECO=m -CONFIG_CEC_SECO_RC=y -CONFIG_USB_PULSE8_CEC=m -CONFIG_USB_RAINSHADOW_CEC=m -CONFIG_MEDIA_SUPPORT=m -# CONFIG_MEDIA_SUPPORT_FILTER is not set -CONFIG_MEDIA_SUBDRV_AUTOSELECT=y - -# -# Media device types -# -CONFIG_MEDIA_CAMERA_SUPPORT=y -CONFIG_MEDIA_ANALOG_TV_SUPPORT=y -CONFIG_MEDIA_DIGITAL_TV_SUPPORT=y -CONFIG_MEDIA_RADIO_SUPPORT=y -CONFIG_MEDIA_SDR_SUPPORT=y -CONFIG_MEDIA_PLATFORM_SUPPORT=y -CONFIG_MEDIA_TEST_SUPPORT=y -# end of Media device types - -# -# Media core support -# -CONFIG_VIDEO_DEV=m -CONFIG_MEDIA_CONTROLLER=y -CONFIG_DVB_CORE=m -# end of Media core support - -# -# Video4Linux options -# -CONFIG_VIDEO_V4L2=m -CONFIG_VIDEO_V4L2_I2C=y -CONFIG_VIDEO_V4L2_SUBDEV_API=y -# CONFIG_VIDEO_ADV_DEBUG is not set -# CONFIG_VIDEO_FIXED_MINOR_RANGES is not set -CONFIG_VIDEO_TUNER=m -CONFIG_V4L2_MEM2MEM_DEV=m 
-CONFIG_V4L2_FLASH_LED_CLASS=m -CONFIG_V4L2_FWNODE=m -CONFIG_VIDEOBUF_GEN=m -CONFIG_VIDEOBUF_DMA_SG=m -CONFIG_VIDEOBUF_VMALLOC=m -# end of Video4Linux options - -# -# Media controller options -# -CONFIG_MEDIA_CONTROLLER_DVB=y -CONFIG_MEDIA_CONTROLLER_REQUEST_API=y - -# -# Please notice that the enabled Media controller Request API is EXPERIMENTAL -# -# end of Media controller options - -# -# Digital TV options -# -CONFIG_DVB_MMAP=y -CONFIG_DVB_NET=y -CONFIG_DVB_MAX_ADAPTERS=16 -# CONFIG_DVB_DYNAMIC_MINORS is not set -# CONFIG_DVB_DEMUX_SECTION_LOSS_LOG is not set -# CONFIG_DVB_ULE_DEBUG is not set -# end of Digital TV options - -# -# Media drivers -# -CONFIG_TTPCI_EEPROM=m -CONFIG_MEDIA_USB_SUPPORT=y - -# -# Webcam devices -# -CONFIG_USB_VIDEO_CLASS=m -CONFIG_USB_VIDEO_CLASS_INPUT_EVDEV=y -CONFIG_USB_GSPCA=m -CONFIG_USB_M5602=m -CONFIG_USB_STV06XX=m -CONFIG_USB_GL860=m -CONFIG_USB_GSPCA_BENQ=m -CONFIG_USB_GSPCA_CONEX=m -CONFIG_USB_GSPCA_CPIA1=m -CONFIG_USB_GSPCA_DTCS033=m -CONFIG_USB_GSPCA_ETOMS=m -CONFIG_USB_GSPCA_FINEPIX=m -CONFIG_USB_GSPCA_JEILINJ=m -CONFIG_USB_GSPCA_JL2005BCD=m -CONFIG_USB_GSPCA_KINECT=m -CONFIG_USB_GSPCA_KONICA=m -CONFIG_USB_GSPCA_MARS=m -CONFIG_USB_GSPCA_MR97310A=m -CONFIG_USB_GSPCA_NW80X=m -CONFIG_USB_GSPCA_OV519=m -CONFIG_USB_GSPCA_OV534=m -CONFIG_USB_GSPCA_OV534_9=m -CONFIG_USB_GSPCA_PAC207=m -CONFIG_USB_GSPCA_PAC7302=m -CONFIG_USB_GSPCA_PAC7311=m -CONFIG_USB_GSPCA_SE401=m -CONFIG_USB_GSPCA_SN9C2028=m -CONFIG_USB_GSPCA_SN9C20X=m -CONFIG_USB_GSPCA_SONIXB=m -CONFIG_USB_GSPCA_SONIXJ=m -CONFIG_USB_GSPCA_SPCA500=m -CONFIG_USB_GSPCA_SPCA501=m -CONFIG_USB_GSPCA_SPCA505=m -CONFIG_USB_GSPCA_SPCA506=m -CONFIG_USB_GSPCA_SPCA508=m -CONFIG_USB_GSPCA_SPCA561=m -CONFIG_USB_GSPCA_SPCA1528=m -CONFIG_USB_GSPCA_SQ905=m -CONFIG_USB_GSPCA_SQ905C=m -CONFIG_USB_GSPCA_SQ930X=m -CONFIG_USB_GSPCA_STK014=m -CONFIG_USB_GSPCA_STK1135=m -CONFIG_USB_GSPCA_STV0680=m -CONFIG_USB_GSPCA_SUNPLUS=m -CONFIG_USB_GSPCA_T613=m -CONFIG_USB_GSPCA_TOPRO=m -CONFIG_USB_GSPCA_TOUPTEK=m 
-CONFIG_USB_GSPCA_TV8532=m -CONFIG_USB_GSPCA_VC032X=m -CONFIG_USB_GSPCA_VICAM=m -CONFIG_USB_GSPCA_XIRLINK_CIT=m -CONFIG_USB_GSPCA_ZC3XX=m -CONFIG_USB_PWC=m -# CONFIG_USB_PWC_DEBUG is not set -CONFIG_USB_PWC_INPUT_EVDEV=y -CONFIG_VIDEO_CPIA2=m -CONFIG_USB_ZR364XX=m -CONFIG_USB_STKWEBCAM=m -CONFIG_USB_S2255=m -CONFIG_VIDEO_USBTV=m - -# -# Analog TV USB devices -# -CONFIG_VIDEO_PVRUSB2=m -CONFIG_VIDEO_PVRUSB2_SYSFS=y -CONFIG_VIDEO_PVRUSB2_DVB=y -# CONFIG_VIDEO_PVRUSB2_DEBUGIFC is not set -CONFIG_VIDEO_HDPVR=m -CONFIG_VIDEO_STK1160_COMMON=m -CONFIG_VIDEO_STK1160=m -CONFIG_VIDEO_GO7007=m -CONFIG_VIDEO_GO7007_USB=m -CONFIG_VIDEO_GO7007_LOADER=m -CONFIG_VIDEO_GO7007_USB_S2250_BOARD=m - -# -# Analog/digital TV USB devices -# -CONFIG_VIDEO_AU0828=m -CONFIG_VIDEO_AU0828_V4L2=y -CONFIG_VIDEO_AU0828_RC=y -CONFIG_VIDEO_CX231XX=m -CONFIG_VIDEO_CX231XX_RC=y -CONFIG_VIDEO_CX231XX_ALSA=m -CONFIG_VIDEO_CX231XX_DVB=m -CONFIG_VIDEO_TM6000=m -CONFIG_VIDEO_TM6000_ALSA=m -CONFIG_VIDEO_TM6000_DVB=m - -# -# Digital TV USB devices -# -CONFIG_DVB_USB=m -# CONFIG_DVB_USB_DEBUG is not set -CONFIG_DVB_USB_DIB3000MC=m -CONFIG_DVB_USB_A800=m -CONFIG_DVB_USB_DIBUSB_MB=m -CONFIG_DVB_USB_DIBUSB_MB_FAULTY=y -CONFIG_DVB_USB_DIBUSB_MC=m -CONFIG_DVB_USB_DIB0700=m -CONFIG_DVB_USB_UMT_010=m -CONFIG_DVB_USB_CXUSB=m -CONFIG_DVB_USB_CXUSB_ANALOG=y -CONFIG_DVB_USB_M920X=m -CONFIG_DVB_USB_DIGITV=m -CONFIG_DVB_USB_VP7045=m -CONFIG_DVB_USB_VP702X=m -CONFIG_DVB_USB_GP8PSK=m -CONFIG_DVB_USB_NOVA_T_USB2=m -CONFIG_DVB_USB_TTUSB2=m -CONFIG_DVB_USB_DTT200U=m -CONFIG_DVB_USB_OPERA1=m -CONFIG_DVB_USB_AF9005=m -CONFIG_DVB_USB_AF9005_REMOTE=m -CONFIG_DVB_USB_PCTV452E=m -CONFIG_DVB_USB_DW2102=m -CONFIG_DVB_USB_CINERGY_T2=m -CONFIG_DVB_USB_DTV5100=m -CONFIG_DVB_USB_AZ6027=m -CONFIG_DVB_USB_TECHNISAT_USB2=m -CONFIG_DVB_USB_V2=m -CONFIG_DVB_USB_AF9015=m -CONFIG_DVB_USB_AF9035=m -CONFIG_DVB_USB_ANYSEE=m -CONFIG_DVB_USB_AU6610=m -CONFIG_DVB_USB_AZ6007=m -CONFIG_DVB_USB_CE6230=m -CONFIG_DVB_USB_EC168=m -CONFIG_DVB_USB_GL861=m 
-CONFIG_DVB_USB_LME2510=m -CONFIG_DVB_USB_MXL111SF=m -CONFIG_DVB_USB_RTL28XXU=m -CONFIG_DVB_USB_DVBSKY=m -CONFIG_DVB_USB_ZD1301=m -CONFIG_DVB_TTUSB_BUDGET=m -CONFIG_DVB_TTUSB_DEC=m -CONFIG_SMS_USB_DRV=m -CONFIG_DVB_B2C2_FLEXCOP_USB=m -# CONFIG_DVB_B2C2_FLEXCOP_USB_DEBUG is not set -CONFIG_DVB_AS102=m - -# -# Webcam, TV (analog/digital) USB devices -# -CONFIG_VIDEO_EM28XX=m -CONFIG_VIDEO_EM28XX_V4L2=m -CONFIG_VIDEO_EM28XX_ALSA=m -CONFIG_VIDEO_EM28XX_DVB=m -CONFIG_VIDEO_EM28XX_RC=m - -# -# Software defined radio USB devices -# -CONFIG_USB_AIRSPY=m -CONFIG_USB_HACKRF=m -CONFIG_USB_MSI2500=m -CONFIG_MEDIA_PCI_SUPPORT=y - -# -# Media capture support -# -CONFIG_VIDEO_MEYE=m -CONFIG_VIDEO_SOLO6X10=m -CONFIG_VIDEO_TW5864=m -CONFIG_VIDEO_TW68=m -CONFIG_VIDEO_TW686X=m - -# -# Media capture/analog TV support -# -CONFIG_VIDEO_IVTV=m -# CONFIG_VIDEO_IVTV_DEPRECATED_IOCTLS is not set -CONFIG_VIDEO_IVTV_ALSA=m -CONFIG_VIDEO_FB_IVTV=m -# CONFIG_VIDEO_FB_IVTV_FORCE_PAT is not set -CONFIG_VIDEO_HEXIUM_GEMINI=m -CONFIG_VIDEO_HEXIUM_ORION=m -CONFIG_VIDEO_MXB=m -CONFIG_VIDEO_DT3155=m - -# -# Media capture/analog/hybrid TV support -# -CONFIG_VIDEO_CX18=m -CONFIG_VIDEO_CX18_ALSA=m -CONFIG_VIDEO_CX23885=m -CONFIG_MEDIA_ALTERA_CI=m -CONFIG_VIDEO_CX25821=m -CONFIG_VIDEO_CX25821_ALSA=m -CONFIG_VIDEO_CX88=m -CONFIG_VIDEO_CX88_ALSA=m -CONFIG_VIDEO_CX88_BLACKBIRD=m -CONFIG_VIDEO_CX88_DVB=m -CONFIG_VIDEO_CX88_ENABLE_VP3054=y -CONFIG_VIDEO_CX88_VP3054=m -CONFIG_VIDEO_CX88_MPEG=m -CONFIG_VIDEO_BT848=m -CONFIG_DVB_BT8XX=m -CONFIG_VIDEO_SAA7134=m -CONFIG_VIDEO_SAA7134_ALSA=m -CONFIG_VIDEO_SAA7134_RC=y -CONFIG_VIDEO_SAA7134_DVB=m -CONFIG_VIDEO_SAA7134_GO7007=m -CONFIG_VIDEO_SAA7164=m - -# -# Media digital TV PCI Adapters -# -CONFIG_DVB_AV7110_IR=y -CONFIG_DVB_AV7110=m -CONFIG_DVB_AV7110_OSD=y -CONFIG_DVB_BUDGET_CORE=m -CONFIG_DVB_BUDGET=m -CONFIG_DVB_BUDGET_CI=m -CONFIG_DVB_BUDGET_AV=m -CONFIG_DVB_BUDGET_PATCH=m -CONFIG_DVB_B2C2_FLEXCOP_PCI=m -# CONFIG_DVB_B2C2_FLEXCOP_PCI_DEBUG is not set 
-CONFIG_DVB_PLUTO2=m -CONFIG_DVB_DM1105=m -CONFIG_DVB_PT1=m -CONFIG_DVB_PT3=m -CONFIG_MANTIS_CORE=m -CONFIG_DVB_MANTIS=m -CONFIG_DVB_HOPPER=m -CONFIG_DVB_NGENE=m -CONFIG_DVB_DDBRIDGE=m -# CONFIG_DVB_DDBRIDGE_MSIENABLE is not set -CONFIG_DVB_SMIPCIE=m -CONFIG_DVB_NETUP_UNIDVB=m -CONFIG_VIDEO_IPU3_CIO2=m -CONFIG_RADIO_ADAPTERS=y -CONFIG_RADIO_TEA575X=m -CONFIG_RADIO_SI470X=m -CONFIG_USB_SI470X=m -CONFIG_I2C_SI470X=m -CONFIG_RADIO_SI4713=m -CONFIG_USB_SI4713=m -CONFIG_PLATFORM_SI4713=m -CONFIG_I2C_SI4713=m -CONFIG_RADIO_SI476X=m -CONFIG_USB_MR800=m -CONFIG_USB_DSBR=m -CONFIG_RADIO_MAXIRADIO=m -CONFIG_RADIO_SHARK=m -CONFIG_RADIO_SHARK2=m -CONFIG_USB_KEENE=m -CONFIG_USB_RAREMONO=m -CONFIG_USB_MA901=m -CONFIG_RADIO_TEA5764=m -CONFIG_RADIO_SAA7706H=m -CONFIG_RADIO_TEF6862=m -CONFIG_RADIO_WL1273=m -CONFIG_RADIO_WL128X=m -CONFIG_MEDIA_COMMON_OPTIONS=y - -# -# common driver options -# -CONFIG_VIDEO_CX2341X=m -CONFIG_VIDEO_TVEEPROM=m -CONFIG_CYPRESS_FIRMWARE=m -CONFIG_VIDEOBUF2_CORE=m -CONFIG_VIDEOBUF2_V4L2=m -CONFIG_VIDEOBUF2_MEMOPS=m -CONFIG_VIDEOBUF2_DMA_CONTIG=m -CONFIG_VIDEOBUF2_VMALLOC=m -CONFIG_VIDEOBUF2_DMA_SG=m -CONFIG_VIDEOBUF2_DVB=m -CONFIG_DVB_B2C2_FLEXCOP=m -CONFIG_VIDEO_SAA7146=m -CONFIG_VIDEO_SAA7146_VV=m -CONFIG_SMS_SIANO_MDTV=m -CONFIG_SMS_SIANO_RC=y -# CONFIG_SMS_SIANO_DEBUGFS is not set -CONFIG_VIDEO_V4L2_TPG=m -CONFIG_V4L_PLATFORM_DRIVERS=y -CONFIG_VIDEO_CAFE_CCIC=m -CONFIG_VIDEO_CADENCE=y -CONFIG_VIDEO_CADENCE_CSI2RX=m -CONFIG_VIDEO_CADENCE_CSI2TX=m -CONFIG_VIDEO_ASPEED=m -CONFIG_VIDEO_MUX=m -CONFIG_VIDEO_XILINX=m -# CONFIG_VIDEO_XILINX_CSI2RXSS is not set -CONFIG_VIDEO_XILINX_TPG=m -CONFIG_VIDEO_XILINX_VTC=m -CONFIG_V4L_MEM2MEM_DRIVERS=y -CONFIG_VIDEO_MEM2MEM_DEINTERLACE=m -CONFIG_DVB_PLATFORM_DRIVERS=y -CONFIG_SDR_PLATFORM_DRIVERS=y - -# -# MMC/SDIO DVB adapters -# -CONFIG_SMS_SDIO_DRV=m -CONFIG_V4L_TEST_DRIVERS=y -CONFIG_VIDEO_VIMC=m -CONFIG_VIDEO_VIVID=m -CONFIG_VIDEO_VIVID_CEC=y -CONFIG_VIDEO_VIVID_MAX_DEVS=64 -CONFIG_VIDEO_VIM2M=m 
-CONFIG_VIDEO_VICODEC=m -# CONFIG_DVB_TEST_DRIVERS is not set - -# -# FireWire (IEEE 1394) Adapters -# -CONFIG_DVB_FIREDTV=m -CONFIG_DVB_FIREDTV_INPUT=y -# end of Media drivers - -# -# Media ancillary drivers -# -CONFIG_MEDIA_ATTACH=y - -# -# IR I2C driver auto-selected by 'Autoselect ancillary drivers' -# -CONFIG_VIDEO_IR_I2C=m - -# -# Audio decoders, processors and mixers -# -CONFIG_VIDEO_TVAUDIO=m -CONFIG_VIDEO_TDA7432=m -CONFIG_VIDEO_TDA9840=m -CONFIG_VIDEO_TDA1997X=m -CONFIG_VIDEO_TEA6415C=m -CONFIG_VIDEO_TEA6420=m -CONFIG_VIDEO_MSP3400=m -CONFIG_VIDEO_CS3308=m -CONFIG_VIDEO_CS5345=m -CONFIG_VIDEO_CS53L32A=m -CONFIG_VIDEO_TLV320AIC23B=m -CONFIG_VIDEO_UDA1342=m -CONFIG_VIDEO_WM8775=m -CONFIG_VIDEO_WM8739=m -CONFIG_VIDEO_VP27SMPX=m -CONFIG_VIDEO_SONY_BTF_MPX=m -# end of Audio decoders, processors and mixers - -# -# RDS decoders -# -CONFIG_VIDEO_SAA6588=m -# end of RDS decoders - -# -# Video decoders -# -CONFIG_VIDEO_ADV7180=m -CONFIG_VIDEO_ADV7183=m -CONFIG_VIDEO_ADV748X=m -CONFIG_VIDEO_ADV7604=m -CONFIG_VIDEO_ADV7604_CEC=y -CONFIG_VIDEO_ADV7842=m -CONFIG_VIDEO_ADV7842_CEC=y -CONFIG_VIDEO_BT819=m -CONFIG_VIDEO_BT856=m -CONFIG_VIDEO_BT866=m -CONFIG_VIDEO_KS0127=m -CONFIG_VIDEO_ML86V7667=m -CONFIG_VIDEO_SAA7110=m -CONFIG_VIDEO_SAA711X=m -CONFIG_VIDEO_TC358743=m -CONFIG_VIDEO_TC358743_CEC=y -CONFIG_VIDEO_TVP514X=m -CONFIG_VIDEO_TVP5150=m -CONFIG_VIDEO_TVP7002=m -CONFIG_VIDEO_TW2804=m -CONFIG_VIDEO_TW9903=m -CONFIG_VIDEO_TW9906=m -CONFIG_VIDEO_TW9910=m -CONFIG_VIDEO_VPX3220=m -# CONFIG_VIDEO_MAX9286 is not set - -# -# Video and audio decoders -# -CONFIG_VIDEO_SAA717X=m -CONFIG_VIDEO_CX25840=m -# end of Video decoders - -# -# Video encoders -# -CONFIG_VIDEO_SAA7127=m -CONFIG_VIDEO_SAA7185=m -CONFIG_VIDEO_ADV7170=m -CONFIG_VIDEO_ADV7175=m -CONFIG_VIDEO_ADV7343=m -CONFIG_VIDEO_ADV7393=m -CONFIG_VIDEO_AD9389B=m -CONFIG_VIDEO_AK881X=m -CONFIG_VIDEO_THS8200=m -# end of Video encoders - -# -# Video improvement chips -# -CONFIG_VIDEO_UPD64031A=m -CONFIG_VIDEO_UPD64083=m -# 
end of Video improvement chips - -# -# Audio/Video compression chips -# -CONFIG_VIDEO_SAA6752HS=m -# end of Audio/Video compression chips - -# -# SDR tuner chips -# -CONFIG_SDR_MAX2175=m -# end of SDR tuner chips - -# -# Miscellaneous helper chips -# -CONFIG_VIDEO_THS7303=m -CONFIG_VIDEO_M52790=m -CONFIG_VIDEO_I2C=m -CONFIG_VIDEO_ST_MIPID02=m -# end of Miscellaneous helper chips - -# -# Camera sensor devices -# -CONFIG_VIDEO_APTINA_PLL=m -CONFIG_VIDEO_SMIAPP_PLL=m -CONFIG_VIDEO_HI556=m -CONFIG_VIDEO_IMX214=m -CONFIG_VIDEO_IMX219=m -CONFIG_VIDEO_IMX258=m -CONFIG_VIDEO_IMX274=m -CONFIG_VIDEO_IMX290=m -CONFIG_VIDEO_IMX319=m -CONFIG_VIDEO_IMX355=m -CONFIG_VIDEO_OV2640=m -CONFIG_VIDEO_OV2659=m -CONFIG_VIDEO_OV2680=m -CONFIG_VIDEO_OV2685=m -CONFIG_VIDEO_OV2740=m -CONFIG_VIDEO_OV5640=m -CONFIG_VIDEO_OV5645=m -CONFIG_VIDEO_OV5647=m -CONFIG_VIDEO_OV6650=m -CONFIG_VIDEO_OV5670=m -CONFIG_VIDEO_OV5675=m -CONFIG_VIDEO_OV5695=m -CONFIG_VIDEO_OV7251=m -CONFIG_VIDEO_OV772X=m -CONFIG_VIDEO_OV7640=m -CONFIG_VIDEO_OV7670=m -CONFIG_VIDEO_OV7740=m -CONFIG_VIDEO_OV8856=m -CONFIG_VIDEO_OV9640=m -CONFIG_VIDEO_OV9650=m -CONFIG_VIDEO_OV13858=m -CONFIG_VIDEO_VS6624=m -CONFIG_VIDEO_MT9M001=m -CONFIG_VIDEO_MT9M032=m -CONFIG_VIDEO_MT9M111=m -CONFIG_VIDEO_MT9P031=m -CONFIG_VIDEO_MT9T001=m -CONFIG_VIDEO_MT9T112=m -CONFIG_VIDEO_MT9V011=m -CONFIG_VIDEO_MT9V032=m -CONFIG_VIDEO_MT9V111=m -CONFIG_VIDEO_SR030PC30=m -CONFIG_VIDEO_NOON010PC30=m -CONFIG_VIDEO_M5MOLS=m -# CONFIG_VIDEO_RDACM20 is not set -CONFIG_VIDEO_RJ54N1=m -CONFIG_VIDEO_S5K6AA=m -CONFIG_VIDEO_S5K6A3=m -CONFIG_VIDEO_S5K4ECGX=m -CONFIG_VIDEO_S5K5BAF=m -CONFIG_VIDEO_SMIAPP=m -CONFIG_VIDEO_ET8EK8=m -CONFIG_VIDEO_S5C73M3=m -# end of Camera sensor devices - -# -# Lens drivers -# -CONFIG_VIDEO_AD5820=m -CONFIG_VIDEO_AK7375=m -CONFIG_VIDEO_DW9714=m -CONFIG_VIDEO_DW9768=m -CONFIG_VIDEO_DW9807_VCM=m -# end of Lens drivers - -# -# Flash devices -# -CONFIG_VIDEO_ADP1653=m -CONFIG_VIDEO_LM3560=m -CONFIG_VIDEO_LM3646=m -# end of Flash devices - -# -# 
SPI helper chips -# -CONFIG_VIDEO_GS1662=m -# end of SPI helper chips - -# -# Media SPI Adapters -# -CONFIG_CXD2880_SPI_DRV=m -# end of Media SPI Adapters - -CONFIG_MEDIA_TUNER=m - -# -# Customize TV tuners -# -CONFIG_MEDIA_TUNER_SIMPLE=m -CONFIG_MEDIA_TUNER_TDA18250=m -CONFIG_MEDIA_TUNER_TDA8290=m -CONFIG_MEDIA_TUNER_TDA827X=m -CONFIG_MEDIA_TUNER_TDA18271=m -CONFIG_MEDIA_TUNER_TDA9887=m -CONFIG_MEDIA_TUNER_TEA5761=m -CONFIG_MEDIA_TUNER_TEA5767=m -CONFIG_MEDIA_TUNER_MSI001=m -CONFIG_MEDIA_TUNER_MT20XX=m -CONFIG_MEDIA_TUNER_MT2060=m -CONFIG_MEDIA_TUNER_MT2063=m -CONFIG_MEDIA_TUNER_MT2266=m -CONFIG_MEDIA_TUNER_MT2131=m -CONFIG_MEDIA_TUNER_QT1010=m -CONFIG_MEDIA_TUNER_XC2028=m -CONFIG_MEDIA_TUNER_XC5000=m -CONFIG_MEDIA_TUNER_XC4000=m -CONFIG_MEDIA_TUNER_MXL5005S=m -CONFIG_MEDIA_TUNER_MXL5007T=m -CONFIG_MEDIA_TUNER_MC44S803=m -CONFIG_MEDIA_TUNER_MAX2165=m -CONFIG_MEDIA_TUNER_TDA18218=m -CONFIG_MEDIA_TUNER_FC0011=m -CONFIG_MEDIA_TUNER_FC0012=m -CONFIG_MEDIA_TUNER_FC0013=m -CONFIG_MEDIA_TUNER_TDA18212=m -CONFIG_MEDIA_TUNER_E4000=m -CONFIG_MEDIA_TUNER_FC2580=m -CONFIG_MEDIA_TUNER_M88RS6000T=m -CONFIG_MEDIA_TUNER_TUA9001=m -CONFIG_MEDIA_TUNER_SI2157=m -CONFIG_MEDIA_TUNER_IT913X=m -CONFIG_MEDIA_TUNER_R820T=m -CONFIG_MEDIA_TUNER_MXL301RF=m -CONFIG_MEDIA_TUNER_QM1D1C0042=m -CONFIG_MEDIA_TUNER_QM1D1B0004=m -# end of Customize TV tuners - -# -# Customise DVB Frontends -# - -# -# Multistandard (satellite) frontends -# -CONFIG_DVB_STB0899=m -CONFIG_DVB_STB6100=m -CONFIG_DVB_STV090x=m -CONFIG_DVB_STV0910=m -CONFIG_DVB_STV6110x=m -CONFIG_DVB_STV6111=m -CONFIG_DVB_MXL5XX=m -CONFIG_DVB_M88DS3103=m - -# -# Multistandard (cable + terrestrial) frontends -# -CONFIG_DVB_DRXK=m -CONFIG_DVB_TDA18271C2DD=m -CONFIG_DVB_SI2165=m -CONFIG_DVB_MN88472=m -CONFIG_DVB_MN88473=m - -# -# DVB-S (satellite) frontends -# -CONFIG_DVB_CX24110=m -CONFIG_DVB_CX24123=m -CONFIG_DVB_MT312=m -CONFIG_DVB_ZL10036=m -CONFIG_DVB_ZL10039=m -CONFIG_DVB_S5H1420=m -CONFIG_DVB_STV0288=m -CONFIG_DVB_STB6000=m 
-CONFIG_DVB_STV0299=m -CONFIG_DVB_STV6110=m -CONFIG_DVB_STV0900=m -CONFIG_DVB_TDA8083=m -CONFIG_DVB_TDA10086=m -CONFIG_DVB_TDA8261=m -CONFIG_DVB_VES1X93=m -CONFIG_DVB_TUNER_ITD1000=m -CONFIG_DVB_TUNER_CX24113=m -CONFIG_DVB_TDA826X=m -CONFIG_DVB_TUA6100=m -CONFIG_DVB_CX24116=m -CONFIG_DVB_CX24117=m -CONFIG_DVB_CX24120=m -CONFIG_DVB_SI21XX=m -CONFIG_DVB_TS2020=m -CONFIG_DVB_DS3000=m -CONFIG_DVB_MB86A16=m -CONFIG_DVB_TDA10071=m - -# -# DVB-T (terrestrial) frontends -# -CONFIG_DVB_SP8870=m -CONFIG_DVB_SP887X=m -CONFIG_DVB_CX22700=m -CONFIG_DVB_CX22702=m -CONFIG_DVB_S5H1432=m -CONFIG_DVB_DRXD=m -CONFIG_DVB_L64781=m -CONFIG_DVB_TDA1004X=m -CONFIG_DVB_NXT6000=m -CONFIG_DVB_MT352=m -CONFIG_DVB_ZL10353=m -CONFIG_DVB_DIB3000MB=m -CONFIG_DVB_DIB3000MC=m -CONFIG_DVB_DIB7000M=m -CONFIG_DVB_DIB7000P=m -CONFIG_DVB_DIB9000=m -CONFIG_DVB_TDA10048=m -CONFIG_DVB_AF9013=m -CONFIG_DVB_EC100=m -CONFIG_DVB_STV0367=m -CONFIG_DVB_CXD2820R=m -CONFIG_DVB_CXD2841ER=m -CONFIG_DVB_RTL2830=m -CONFIG_DVB_RTL2832=m -CONFIG_DVB_RTL2832_SDR=m -CONFIG_DVB_SI2168=m -CONFIG_DVB_AS102_FE=m -CONFIG_DVB_ZD1301_DEMOD=m -CONFIG_DVB_GP8PSK_FE=m -CONFIG_DVB_CXD2880=m - -# -# DVB-C (cable) frontends -# -CONFIG_DVB_VES1820=m -CONFIG_DVB_TDA10021=m -CONFIG_DVB_TDA10023=m -CONFIG_DVB_STV0297=m - -# -# ATSC (North American/Korean Terrestrial/Cable DTV) frontends -# -CONFIG_DVB_NXT200X=m -CONFIG_DVB_OR51211=m -CONFIG_DVB_OR51132=m -CONFIG_DVB_BCM3510=m -CONFIG_DVB_LGDT330X=m -CONFIG_DVB_LGDT3305=m -CONFIG_DVB_LGDT3306A=m -CONFIG_DVB_LG2160=m -CONFIG_DVB_S5H1409=m -CONFIG_DVB_AU8522=m -CONFIG_DVB_AU8522_DTV=m -CONFIG_DVB_AU8522_V4L=m -CONFIG_DVB_S5H1411=m - -# -# ISDB-T (terrestrial) frontends -# -CONFIG_DVB_S921=m -CONFIG_DVB_DIB8000=m -CONFIG_DVB_MB86A20S=m - -# -# ISDB-S (satellite) & ISDB-T (terrestrial) frontends -# -CONFIG_DVB_TC90522=m -CONFIG_DVB_MN88443X=m - -# -# Digital terrestrial only tuners/PLL -# -CONFIG_DVB_PLL=m -CONFIG_DVB_TUNER_DIB0070=m -CONFIG_DVB_TUNER_DIB0090=m - -# -# SEC control devices for 
DVB-S -# -CONFIG_DVB_DRX39XYJ=m -CONFIG_DVB_LNBH25=m -CONFIG_DVB_LNBH29=m -CONFIG_DVB_LNBP21=m -CONFIG_DVB_LNBP22=m -CONFIG_DVB_ISL6405=m -CONFIG_DVB_ISL6421=m -CONFIG_DVB_ISL6423=m -CONFIG_DVB_A8293=m -CONFIG_DVB_LGS8GL5=m -CONFIG_DVB_LGS8GXX=m -CONFIG_DVB_ATBM8830=m -CONFIG_DVB_TDA665x=m -CONFIG_DVB_IX2505V=m -CONFIG_DVB_M88RS2000=m -CONFIG_DVB_AF9033=m -CONFIG_DVB_HORUS3A=m -CONFIG_DVB_ASCOT2E=m -CONFIG_DVB_HELENE=m - -# -# Common Interface (EN50221) controller drivers -# -CONFIG_DVB_CXD2099=m -CONFIG_DVB_SP2=m -# end of Customise DVB Frontends - -# -# Tools to develop new frontends -# -CONFIG_DVB_DUMMY_FE=m -# end of Media ancillary drivers - -# -# Graphics support -# -CONFIG_AGP=m -CONFIG_AGP_AMD64=m -CONFIG_AGP_INTEL=m -CONFIG_AGP_SIS=m -CONFIG_AGP_VIA=m -CONFIG_INTEL_GTT=m -CONFIG_VGA_ARB=y -CONFIG_VGA_ARB_MAX_GPUS=10 -CONFIG_VGA_SWITCHEROO=y -CONFIG_DRM=m -CONFIG_DRM_MIPI_DBI=m -CONFIG_DRM_MIPI_DSI=y -CONFIG_DRM_DP_AUX_CHARDEV=y -# CONFIG_DRM_DEBUG_SELFTEST is not set -CONFIG_DRM_KMS_HELPER=m -CONFIG_DRM_KMS_FB_HELPER=y -# CONFIG_DRM_DEBUG_DP_MST_TOPOLOGY_REFS is not set -CONFIG_DRM_FBDEV_EMULATION=y -CONFIG_DRM_FBDEV_OVERALLOC=100 -# CONFIG_DRM_FBDEV_LEAK_PHYS_SMEM is not set -CONFIG_DRM_LOAD_EDID_FIRMWARE=y -CONFIG_DRM_DP_CEC=y -CONFIG_DRM_TTM=m -CONFIG_DRM_TTM_DMA_PAGE_POOL=y -CONFIG_DRM_VRAM_HELPER=m -CONFIG_DRM_TTM_HELPER=m -CONFIG_DRM_GEM_CMA_HELPER=y -CONFIG_DRM_KMS_CMA_HELPER=y -CONFIG_DRM_GEM_SHMEM_HELPER=y -CONFIG_DRM_SCHED=m - -# -# I2C encoder or helper chips -# -CONFIG_DRM_I2C_CH7006=m -CONFIG_DRM_I2C_SIL164=m -CONFIG_DRM_I2C_NXP_TDA998X=m -CONFIG_DRM_I2C_NXP_TDA9950=m -# end of I2C encoder or helper chips - -# -# ARM devices -# -CONFIG_DRM_KOMEDA=m -# end of ARM devices - -CONFIG_DRM_RADEON=m -CONFIG_DRM_RADEON_USERPTR=y -CONFIG_DRM_AMDGPU=m -CONFIG_DRM_AMDGPU_SI=y -CONFIG_DRM_AMDGPU_CIK=y -CONFIG_DRM_AMDGPU_USERPTR=y -# CONFIG_DRM_AMDGPU_GART_DEBUGFS is not set - -# -# ACP (Audio CoProcessor) Configuration -# -CONFIG_DRM_AMD_ACP=y -# end of 
ACP (Audio CoProcessor) Configuration - -# -# Display Engine Configuration -# -CONFIG_DRM_AMD_DC=y -CONFIG_DRM_AMD_DC_DCN=y -CONFIG_DRM_AMD_DC_DCN3_0=y -CONFIG_DRM_AMD_DC_HDCP=y -CONFIG_DRM_AMD_DC_SI=y -# CONFIG_DEBUG_KERNEL_DC is not set -# end of Display Engine Configuration - -CONFIG_HSA_AMD=y -CONFIG_DRM_NOUVEAU=m -# CONFIG_NOUVEAU_LEGACY_CTX_SUPPORT is not set -CONFIG_NOUVEAU_DEBUG=5 -CONFIG_NOUVEAU_DEBUG_DEFAULT=3 -# CONFIG_NOUVEAU_DEBUG_MMU is not set -# CONFIG_NOUVEAU_DEBUG_PUSH is not set -CONFIG_DRM_NOUVEAU_BACKLIGHT=y -CONFIG_DRM_NOUVEAU_SVM=y -CONFIG_DRM_I915=m -CONFIG_DRM_I915_FORCE_PROBE="*" -CONFIG_DRM_I915_CAPTURE_ERROR=y -CONFIG_DRM_I915_COMPRESS_ERROR=y -CONFIG_DRM_I915_USERPTR=y -CONFIG_DRM_I915_GVT=y -CONFIG_DRM_I915_GVT_KVMGT=m - -# -# drm/i915 Debugging -# -# CONFIG_DRM_I915_WERROR is not set -# CONFIG_DRM_I915_DEBUG is not set -# CONFIG_DRM_I915_DEBUG_MMIO is not set -# CONFIG_DRM_I915_SW_FENCE_DEBUG_OBJECTS is not set -# CONFIG_DRM_I915_SW_FENCE_CHECK_DAG is not set -# CONFIG_DRM_I915_DEBUG_GUC is not set -# CONFIG_DRM_I915_SELFTEST is not set -# CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS is not set -# CONFIG_DRM_I915_DEBUG_VBLANK_EVADE is not set -# CONFIG_DRM_I915_DEBUG_RUNTIME_PM is not set -# end of drm/i915 Debugging - -# -# drm/i915 Profile Guided Optimisation -# -CONFIG_DRM_I915_FENCE_TIMEOUT=10000 -CONFIG_DRM_I915_USERFAULT_AUTOSUSPEND=250 -CONFIG_DRM_I915_HEARTBEAT_INTERVAL=2500 -CONFIG_DRM_I915_PREEMPT_TIMEOUT=640 -CONFIG_DRM_I915_MAX_REQUEST_BUSYWAIT=8000 -CONFIG_DRM_I915_STOP_TIMEOUT=100 -CONFIG_DRM_I915_TIMESLICE_DURATION=1 -# end of drm/i915 Profile Guided Optimisation - -CONFIG_DRM_VGEM=m -CONFIG_DRM_VKMS=m -CONFIG_DRM_VMWGFX=m -CONFIG_DRM_VMWGFX_FBCON=y -CONFIG_DRM_GMA500=m -CONFIG_DRM_GMA600=y -CONFIG_DRM_GMA3600=y -CONFIG_DRM_UDL=m -CONFIG_DRM_AST=m -CONFIG_DRM_MGAG200=m -CONFIG_DRM_RCAR_DW_HDMI=m -CONFIG_DRM_RCAR_LVDS=m -CONFIG_DRM_QXL=m -CONFIG_DRM_BOCHS=m -CONFIG_DRM_VIRTIO_GPU=m -CONFIG_DRM_PANEL=y - -# -# Display Panels -# 
-CONFIG_DRM_PANEL_ARM_VERSATILE=m -CONFIG_DRM_PANEL_ASUS_Z00T_TM5P5_NT35596=m -CONFIG_DRM_PANEL_BOE_HIMAX8279D=m -CONFIG_DRM_PANEL_BOE_TV101WUM_NL6=m -CONFIG_DRM_PANEL_LVDS=m -CONFIG_DRM_PANEL_SIMPLE=m -CONFIG_DRM_PANEL_ELIDA_KD35T133=m -CONFIG_DRM_PANEL_FEIXIN_K101_IM2BA02=m -CONFIG_DRM_PANEL_FEIYANG_FY07024DI26A30D=m -CONFIG_DRM_PANEL_ILITEK_IL9322=m -CONFIG_DRM_PANEL_ILITEK_ILI9881C=m -CONFIG_DRM_PANEL_INNOLUX_P079ZCA=m -CONFIG_DRM_PANEL_JDI_LT070ME05000=m -CONFIG_DRM_PANEL_KINGDISPLAY_KD097D04=m -CONFIG_DRM_PANEL_LEADTEK_LTK050H3146W=m -CONFIG_DRM_PANEL_LEADTEK_LTK500HD1829=m -CONFIG_DRM_PANEL_SAMSUNG_LD9040=m -CONFIG_DRM_PANEL_LG_LB035Q02=m -CONFIG_DRM_PANEL_LG_LG4573=m -CONFIG_DRM_PANEL_NEC_NL8048HL11=m -CONFIG_DRM_PANEL_NOVATEK_NT35510=m -CONFIG_DRM_PANEL_NOVATEK_NT39016=m -# CONFIG_DRM_PANEL_MANTIX_MLAF057WE51 is not set -CONFIG_DRM_PANEL_OLIMEX_LCD_OLINUXINO=m -CONFIG_DRM_PANEL_ORISETECH_OTM8009A=m -CONFIG_DRM_PANEL_OSD_OSD101T2587_53TS=m -CONFIG_DRM_PANEL_PANASONIC_VVX10F034N00=m -CONFIG_DRM_PANEL_RASPBERRYPI_TOUCHSCREEN=m -CONFIG_DRM_PANEL_RAYDIUM_RM67191=m -CONFIG_DRM_PANEL_RAYDIUM_RM68200=m -CONFIG_DRM_PANEL_RONBO_RB070D30=m -CONFIG_DRM_PANEL_SAMSUNG_S6D16D0=m -CONFIG_DRM_PANEL_SAMSUNG_S6E3HA2=m -CONFIG_DRM_PANEL_SAMSUNG_S6E63J0X03=m -CONFIG_DRM_PANEL_SAMSUNG_S6E63M0=m -CONFIG_DRM_PANEL_SAMSUNG_S6E63M0_SPI=m -# CONFIG_DRM_PANEL_SAMSUNG_S6E63M0_DSI is not set -CONFIG_DRM_PANEL_SAMSUNG_S6E88A0_AMS452EF01=m -CONFIG_DRM_PANEL_SAMSUNG_S6E8AA0=m -CONFIG_DRM_PANEL_SEIKO_43WVF1G=m -CONFIG_DRM_PANEL_SHARP_LQ101R1SX01=m -CONFIG_DRM_PANEL_SHARP_LS037V7DW01=m -CONFIG_DRM_PANEL_SHARP_LS043T1LE01=m -CONFIG_DRM_PANEL_SITRONIX_ST7701=m -# CONFIG_DRM_PANEL_SITRONIX_ST7703 is not set -CONFIG_DRM_PANEL_SITRONIX_ST7789V=m -CONFIG_DRM_PANEL_SONY_ACX424AKP=m -CONFIG_DRM_PANEL_SONY_ACX565AKM=m -CONFIG_DRM_PANEL_TPO_TD028TTEC1=m -CONFIG_DRM_PANEL_TPO_TD043MTEA1=m -CONFIG_DRM_PANEL_TPO_TPG110=m -CONFIG_DRM_PANEL_TRULY_NT35597_WQXGA=m -CONFIG_DRM_PANEL_VISIONOX_RM69299=m 
-CONFIG_DRM_PANEL_XINPENG_XPP055C272=m -# end of Display Panels - -CONFIG_DRM_BRIDGE=y -CONFIG_DRM_PANEL_BRIDGE=y - -# -# Display Interface Bridges -# -CONFIG_DRM_CDNS_DSI=m -CONFIG_DRM_CHRONTEL_CH7033=m -CONFIG_DRM_DISPLAY_CONNECTOR=m -# CONFIG_DRM_LONTIUM_LT9611 is not set -CONFIG_DRM_LVDS_CODEC=m -CONFIG_DRM_MEGACHIPS_STDPXXXX_GE_B850V3_FW=m -CONFIG_DRM_NWL_MIPI_DSI=m -CONFIG_DRM_NXP_PTN3460=m -CONFIG_DRM_PARADE_PS8622=m -CONFIG_DRM_PARADE_PS8640=m -CONFIG_DRM_SIL_SII8620=m -CONFIG_DRM_SII902X=m -CONFIG_DRM_SII9234=m -CONFIG_DRM_SIMPLE_BRIDGE=m -CONFIG_DRM_THINE_THC63LVD1024=m -# CONFIG_DRM_TOSHIBA_TC358762 is not set -CONFIG_DRM_TOSHIBA_TC358764=m -CONFIG_DRM_TOSHIBA_TC358767=m -CONFIG_DRM_TOSHIBA_TC358768=m -# CONFIG_DRM_TOSHIBA_TC358775 is not set -CONFIG_DRM_TI_TFP410=m -CONFIG_DRM_TI_SN65DSI86=m -CONFIG_DRM_TI_TPD12S015=m -CONFIG_DRM_ANALOGIX_ANX6345=m -CONFIG_DRM_ANALOGIX_ANX78XX=m -CONFIG_DRM_ANALOGIX_DP=m -CONFIG_DRM_I2C_ADV7511=m -CONFIG_DRM_I2C_ADV7511_AUDIO=y -CONFIG_DRM_I2C_ADV7511_CEC=y -# CONFIG_DRM_CDNS_MHDP8546 is not set -CONFIG_DRM_DW_HDMI=m -CONFIG_DRM_DW_HDMI_AHB_AUDIO=m -CONFIG_DRM_DW_HDMI_I2S_AUDIO=m -CONFIG_DRM_DW_HDMI_CEC=m -# end of Display Interface Bridges - -# CONFIG_DRM_ETNAVIV is not set -CONFIG_DRM_ARCPGU=m -CONFIG_DRM_MXS=y -CONFIG_DRM_MXSFB=m -CONFIG_DRM_CIRRUS_QEMU=m -CONFIG_DRM_GM12U320=m -CONFIG_TINYDRM_HX8357D=m -CONFIG_TINYDRM_ILI9225=m -CONFIG_TINYDRM_ILI9341=m -CONFIG_TINYDRM_ILI9486=m -CONFIG_TINYDRM_MI0283QT=m -CONFIG_TINYDRM_REPAPER=m -CONFIG_TINYDRM_ST7586=m -CONFIG_TINYDRM_ST7735R=m -CONFIG_DRM_XEN=y -CONFIG_DRM_XEN_FRONTEND=m -CONFIG_DRM_VBOXVIDEO=m -# CONFIG_DRM_LEGACY is not set -CONFIG_DRM_PANEL_ORIENTATION_QUIRKS=y - -# -# Frame buffer Devices -# -CONFIG_FB_CMDLINE=y -CONFIG_FB_NOTIFY=y -CONFIG_FB=y -CONFIG_FIRMWARE_EDID=y -CONFIG_FB_BOOT_VESA_SUPPORT=y -CONFIG_FB_CFB_FILLRECT=y -CONFIG_FB_CFB_COPYAREA=y -CONFIG_FB_CFB_IMAGEBLIT=y -CONFIG_FB_SYS_FILLRECT=m -CONFIG_FB_SYS_COPYAREA=m -CONFIG_FB_SYS_IMAGEBLIT=m -# 
CONFIG_FB_FOREIGN_ENDIAN is not set -CONFIG_FB_SYS_FOPS=m -CONFIG_FB_DEFERRED_IO=y -CONFIG_FB_BACKLIGHT=m -CONFIG_FB_MODE_HELPERS=y -CONFIG_FB_TILEBLITTING=y - -# -# Frame buffer hardware drivers -# -# CONFIG_FB_CIRRUS is not set -# CONFIG_FB_PM2 is not set -# CONFIG_FB_CYBER2000 is not set -# CONFIG_FB_ARC is not set -# CONFIG_FB_ASILIANT is not set -# CONFIG_FB_IMSTT is not set -# CONFIG_FB_VGA16 is not set -# CONFIG_FB_UVESA is not set -CONFIG_FB_VESA=y -CONFIG_FB_EFI=y -# CONFIG_FB_N411 is not set -# CONFIG_FB_HGA is not set -# CONFIG_FB_OPENCORES is not set -# CONFIG_FB_S1D13XXX is not set -# CONFIG_FB_NVIDIA is not set -# CONFIG_FB_RIVA is not set -# CONFIG_FB_I740 is not set -# CONFIG_FB_LE80578 is not set -# CONFIG_FB_INTEL is not set -# CONFIG_FB_MATROX is not set -# CONFIG_FB_RADEON is not set -# CONFIG_FB_ATY128 is not set -# CONFIG_FB_ATY is not set -# CONFIG_FB_S3 is not set -# CONFIG_FB_SAVAGE is not set -# CONFIG_FB_SIS is not set -# CONFIG_FB_VIA is not set -# CONFIG_FB_NEOMAGIC is not set -# CONFIG_FB_KYRO is not set -# CONFIG_FB_3DFX is not set -# CONFIG_FB_VOODOO1 is not set -# CONFIG_FB_VT8623 is not set -# CONFIG_FB_TRIDENT is not set -# CONFIG_FB_ARK is not set -# CONFIG_FB_PM3 is not set -# CONFIG_FB_CARMINE is not set -# CONFIG_FB_SM501 is not set -# CONFIG_FB_SMSCUFX is not set -# CONFIG_FB_UDL is not set -# CONFIG_FB_IBM_GXT4500 is not set -# CONFIG_FB_VIRTUAL is not set -CONFIG_XEN_FBDEV_FRONTEND=m -# CONFIG_FB_METRONOME is not set -# CONFIG_FB_MB862XX is not set -CONFIG_FB_HYPERV=m -CONFIG_FB_SIMPLE=y -# CONFIG_FB_SSD1307 is not set -# CONFIG_FB_SM712 is not set -# end of Frame buffer Devices - -# -# Backlight & LCD device support -# -CONFIG_LCD_CLASS_DEVICE=m -CONFIG_LCD_L4F00242T03=m -CONFIG_LCD_LMS283GF05=m -CONFIG_LCD_LTV350QV=m -CONFIG_LCD_ILI922X=m -CONFIG_LCD_ILI9320=m -CONFIG_LCD_TDO24M=m -CONFIG_LCD_VGG2432A4=m -CONFIG_LCD_PLATFORM=m -CONFIG_LCD_AMS369FG06=m -CONFIG_LCD_LMS501KF03=m -CONFIG_LCD_HX8357=m -CONFIG_LCD_OTM3225A=m 
-CONFIG_BACKLIGHT_CLASS_DEVICE=y -# CONFIG_BACKLIGHT_KTD253 is not set -CONFIG_BACKLIGHT_LM3533=m -CONFIG_BACKLIGHT_PWM=m -CONFIG_BACKLIGHT_DA903X=m -CONFIG_BACKLIGHT_DA9052=m -CONFIG_BACKLIGHT_MAX8925=m -CONFIG_BACKLIGHT_APPLE=m -CONFIG_BACKLIGHT_QCOM_WLED=m -CONFIG_BACKLIGHT_SAHARA=m -CONFIG_BACKLIGHT_WM831X=m -CONFIG_BACKLIGHT_ADP5520=m -CONFIG_BACKLIGHT_ADP8860=m -CONFIG_BACKLIGHT_ADP8870=m -CONFIG_BACKLIGHT_88PM860X=m -CONFIG_BACKLIGHT_PCF50633=m -CONFIG_BACKLIGHT_AAT2870=m -CONFIG_BACKLIGHT_LM3630A=m -CONFIG_BACKLIGHT_LM3639=m -CONFIG_BACKLIGHT_LP855X=m -CONFIG_BACKLIGHT_LP8788=m -CONFIG_BACKLIGHT_PANDORA=m -CONFIG_BACKLIGHT_SKY81452=m -CONFIG_BACKLIGHT_TPS65217=m -CONFIG_BACKLIGHT_AS3711=m -CONFIG_BACKLIGHT_GPIO=m -CONFIG_BACKLIGHT_LV5207LP=m -CONFIG_BACKLIGHT_BD6107=m -CONFIG_BACKLIGHT_ARCXCNN=m -CONFIG_BACKLIGHT_RAVE_SP=m -CONFIG_BACKLIGHT_LED=m -# end of Backlight & LCD device support - -CONFIG_VIDEOMODE_HELPERS=y -CONFIG_HDMI=y - -# -# Console display driver support -# -CONFIG_VGA_CONSOLE=y -CONFIG_DUMMY_CONSOLE=y -CONFIG_DUMMY_CONSOLE_COLUMNS=80 -CONFIG_DUMMY_CONSOLE_ROWS=25 -CONFIG_FRAMEBUFFER_CONSOLE=y -CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y -CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y -CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER=y -# end of Console display driver support - -# CONFIG_LOGO is not set -# end of Graphics support - -CONFIG_SOUND=m -CONFIG_SOUND_OSS_CORE=y -# CONFIG_SOUND_OSS_CORE_PRECLAIM is not set -CONFIG_SND=m -CONFIG_SND_TIMER=m -CONFIG_SND_PCM=m -CONFIG_SND_PCM_ELD=y -CONFIG_SND_PCM_IEC958=y -CONFIG_SND_DMAENGINE_PCM=m -CONFIG_SND_HWDEP=m -CONFIG_SND_SEQ_DEVICE=m -CONFIG_SND_RAWMIDI=m -CONFIG_SND_COMPRESS_OFFLOAD=m -CONFIG_SND_JACK=y -CONFIG_SND_JACK_INPUT_DEV=y -CONFIG_SND_OSSEMUL=y -CONFIG_SND_MIXER_OSS=m -CONFIG_SND_PCM_OSS=m -CONFIG_SND_PCM_OSS_PLUGINS=y -CONFIG_SND_PCM_TIMER=y -CONFIG_SND_HRTIMER=m -CONFIG_SND_DYNAMIC_MINORS=y -CONFIG_SND_MAX_CARDS=32 -# CONFIG_SND_SUPPORT_OLD_API is not set -CONFIG_SND_PROC_FS=y 
-CONFIG_SND_VERBOSE_PROCFS=y -CONFIG_SND_VERBOSE_PRINTK=y -CONFIG_SND_DEBUG=y -# CONFIG_SND_DEBUG_VERBOSE is not set -# CONFIG_SND_PCM_XRUN_DEBUG is not set -# CONFIG_SND_CTL_VALIDATION is not set -CONFIG_SND_VMASTER=y -CONFIG_SND_DMA_SGBUF=y -CONFIG_SND_SEQUENCER=m -CONFIG_SND_SEQ_DUMMY=m -CONFIG_SND_SEQUENCER_OSS=m -CONFIG_SND_SEQ_HRTIMER_DEFAULT=y -CONFIG_SND_SEQ_MIDI_EVENT=m -CONFIG_SND_SEQ_MIDI=m -CONFIG_SND_SEQ_MIDI_EMUL=m -CONFIG_SND_SEQ_VIRMIDI=m -CONFIG_SND_MPU401_UART=m -CONFIG_SND_OPL3_LIB=m -CONFIG_SND_OPL3_LIB_SEQ=m -CONFIG_SND_VX_LIB=m -CONFIG_SND_AC97_CODEC=m -CONFIG_SND_DRIVERS=y -# CONFIG_SND_PCSP is not set -CONFIG_SND_DUMMY=m -CONFIG_SND_ALOOP=m -CONFIG_SND_VIRMIDI=m -CONFIG_SND_MTPAV=m -CONFIG_SND_MTS64=m -CONFIG_SND_SERIAL_U16550=m -CONFIG_SND_MPU401=m -CONFIG_SND_PORTMAN2X4=m -CONFIG_SND_AC97_POWER_SAVE=y -CONFIG_SND_AC97_POWER_SAVE_DEFAULT=0 -CONFIG_SND_SB_COMMON=m -CONFIG_SND_PCI=y -CONFIG_SND_AD1889=m -CONFIG_SND_ALS300=m -CONFIG_SND_ALS4000=m -CONFIG_SND_ALI5451=m -CONFIG_SND_ASIHPI=m -CONFIG_SND_ATIIXP=m -CONFIG_SND_ATIIXP_MODEM=m -CONFIG_SND_AU8810=m -CONFIG_SND_AU8820=m -CONFIG_SND_AU8830=m -CONFIG_SND_AW2=m -CONFIG_SND_AZT3328=m -CONFIG_SND_BT87X=m -# CONFIG_SND_BT87X_OVERCLOCK is not set -CONFIG_SND_CA0106=m -CONFIG_SND_CMIPCI=m -CONFIG_SND_OXYGEN_LIB=m -CONFIG_SND_OXYGEN=m -CONFIG_SND_CS4281=m -CONFIG_SND_CS46XX=m -CONFIG_SND_CS46XX_NEW_DSP=y -CONFIG_SND_CTXFI=m -CONFIG_SND_DARLA20=m -CONFIG_SND_GINA20=m -CONFIG_SND_LAYLA20=m -CONFIG_SND_DARLA24=m -CONFIG_SND_GINA24=m -CONFIG_SND_LAYLA24=m -CONFIG_SND_MONA=m -CONFIG_SND_MIA=m -CONFIG_SND_ECHO3G=m -CONFIG_SND_INDIGO=m -CONFIG_SND_INDIGOIO=m -CONFIG_SND_INDIGODJ=m -CONFIG_SND_INDIGOIOX=m -CONFIG_SND_INDIGODJX=m -CONFIG_SND_EMU10K1=m -CONFIG_SND_EMU10K1_SEQ=m -CONFIG_SND_EMU10K1X=m -CONFIG_SND_ENS1370=m -CONFIG_SND_ENS1371=m -CONFIG_SND_ES1938=m -CONFIG_SND_ES1968=m -CONFIG_SND_ES1968_INPUT=y -CONFIG_SND_ES1968_RADIO=y -CONFIG_SND_FM801=m -CONFIG_SND_FM801_TEA575X_BOOL=y 
-CONFIG_SND_HDSP=m -CONFIG_SND_HDSPM=m -CONFIG_SND_ICE1712=m -CONFIG_SND_ICE1724=m -CONFIG_SND_INTEL8X0=m -CONFIG_SND_INTEL8X0M=m -CONFIG_SND_KORG1212=m -CONFIG_SND_LOLA=m -CONFIG_SND_LX6464ES=m -CONFIG_SND_MAESTRO3=m -CONFIG_SND_MAESTRO3_INPUT=y -CONFIG_SND_MIXART=m -CONFIG_SND_NM256=m -CONFIG_SND_PCXHR=m -CONFIG_SND_RIPTIDE=m -CONFIG_SND_RME32=m -CONFIG_SND_RME96=m -CONFIG_SND_RME9652=m -CONFIG_SND_SONICVIBES=m -CONFIG_SND_TRIDENT=m -CONFIG_SND_VIA82XX=m -CONFIG_SND_VIA82XX_MODEM=m -CONFIG_SND_VIRTUOSO=m -CONFIG_SND_VX222=m -CONFIG_SND_YMFPCI=m - -# -# HD-Audio -# -CONFIG_SND_HDA=m -CONFIG_SND_HDA_GENERIC_LEDS=y -CONFIG_SND_HDA_INTEL=m -CONFIG_SND_HDA_HWDEP=y -CONFIG_SND_HDA_RECONFIG=y -CONFIG_SND_HDA_INPUT_BEEP=y -CONFIG_SND_HDA_INPUT_BEEP_MODE=1 -CONFIG_SND_HDA_PATCH_LOADER=y -CONFIG_SND_HDA_CODEC_REALTEK=m -CONFIG_SND_HDA_CODEC_ANALOG=m -CONFIG_SND_HDA_CODEC_SIGMATEL=m -CONFIG_SND_HDA_CODEC_VIA=m -CONFIG_SND_HDA_CODEC_HDMI=m -CONFIG_SND_HDA_CODEC_CIRRUS=m -CONFIG_SND_HDA_CODEC_CONEXANT=m -CONFIG_SND_HDA_CODEC_CA0110=m -CONFIG_SND_HDA_CODEC_CA0132=m -CONFIG_SND_HDA_CODEC_CA0132_DSP=y -CONFIG_SND_HDA_CODEC_CMEDIA=m -CONFIG_SND_HDA_CODEC_SI3054=m -CONFIG_SND_HDA_GENERIC=m -CONFIG_SND_HDA_POWER_SAVE_DEFAULT=0 -# CONFIG_SND_HDA_INTEL_HDMI_SILENT_STREAM is not set -# end of HD-Audio - -CONFIG_SND_HDA_CORE=m -CONFIG_SND_HDA_DSP_LOADER=y -CONFIG_SND_HDA_COMPONENT=y -CONFIG_SND_HDA_I915=y -CONFIG_SND_HDA_EXT_CORE=m -CONFIG_SND_HDA_PREALLOC_SIZE=0 -CONFIG_SND_INTEL_NHLT=y -CONFIG_SND_INTEL_DSP_CONFIG=m -CONFIG_SND_SPI=y -CONFIG_SND_USB=y -CONFIG_SND_USB_AUDIO=m -CONFIG_SND_USB_AUDIO_USE_MEDIA_CONTROLLER=y -CONFIG_SND_USB_UA101=m -CONFIG_SND_USB_USX2Y=m -CONFIG_SND_USB_CAIAQ=m -CONFIG_SND_USB_CAIAQ_INPUT=y -CONFIG_SND_USB_US122L=m -CONFIG_SND_USB_6FIRE=m -CONFIG_SND_USB_HIFACE=m -CONFIG_SND_BCD2000=m -CONFIG_SND_USB_LINE6=m -CONFIG_SND_USB_POD=m -CONFIG_SND_USB_PODHD=m -CONFIG_SND_USB_TONEPORT=m -CONFIG_SND_USB_VARIAX=m -CONFIG_SND_FIREWIRE=y -CONFIG_SND_FIREWIRE_LIB=m 
-CONFIG_SND_DICE=m -CONFIG_SND_OXFW=m -CONFIG_SND_ISIGHT=m -CONFIG_SND_FIREWORKS=m -CONFIG_SND_BEBOB=m -CONFIG_SND_FIREWIRE_DIGI00X=m -CONFIG_SND_FIREWIRE_TASCAM=m -CONFIG_SND_FIREWIRE_MOTU=m -CONFIG_SND_FIREFACE=m -CONFIG_SND_PCMCIA=y -CONFIG_SND_VXPOCKET=m -CONFIG_SND_PDAUDIOCF=m -CONFIG_SND_SOC=m -CONFIG_SND_SOC_AC97_BUS=y -CONFIG_SND_SOC_GENERIC_DMAENGINE_PCM=y -CONFIG_SND_SOC_COMPRESS=y -CONFIG_SND_SOC_TOPOLOGY=y -CONFIG_SND_SOC_ACPI=m -CONFIG_SND_SOC_AMD_ACP=m -CONFIG_SND_SOC_AMD_CZ_DA7219MX98357_MACH=m -CONFIG_SND_SOC_AMD_CZ_RT5645_MACH=m -CONFIG_SND_SOC_AMD_ACP3x=m -CONFIG_SND_SOC_AMD_RV_RT5682_MACH=m -CONFIG_SND_SOC_AMD_RENOIR=m -CONFIG_SND_SOC_AMD_RENOIR_MACH=m -CONFIG_SND_ATMEL_SOC=m -CONFIG_SND_SOC_MIKROE_PROTO=m -CONFIG_SND_BCM63XX_I2S_WHISTLER=m -CONFIG_SND_DESIGNWARE_I2S=m -CONFIG_SND_DESIGNWARE_PCM=y - -# -# SoC Audio for Freescale CPUs -# - -# -# Common SoC Audio options for Freescale CPUs: -# -# CONFIG_SND_SOC_FSL_ASRC is not set -# CONFIG_SND_SOC_FSL_SAI is not set -# CONFIG_SND_SOC_FSL_AUDMIX is not set -# CONFIG_SND_SOC_FSL_SSI is not set -# CONFIG_SND_SOC_FSL_SPDIF is not set -# CONFIG_SND_SOC_FSL_ESAI is not set -# CONFIG_SND_SOC_FSL_MICFIL is not set -# CONFIG_SND_SOC_IMX_AUDMUX is not set -# end of SoC Audio for Freescale CPUs - -CONFIG_SND_I2S_HI6210_I2S=m -CONFIG_SND_SOC_IMG=y -CONFIG_SND_SOC_IMG_I2S_IN=m -CONFIG_SND_SOC_IMG_I2S_OUT=m -CONFIG_SND_SOC_IMG_PARALLEL_OUT=m -CONFIG_SND_SOC_IMG_SPDIF_IN=m -CONFIG_SND_SOC_IMG_SPDIF_OUT=m -CONFIG_SND_SOC_IMG_PISTACHIO_INTERNAL_DAC=m -CONFIG_SND_SOC_INTEL_SST_TOPLEVEL=y -CONFIG_SND_SST_IPC=m -CONFIG_SND_SST_IPC_PCI=m -CONFIG_SND_SST_IPC_ACPI=m -CONFIG_SND_SOC_INTEL_SST=m -CONFIG_SND_SOC_INTEL_CATPT=m -CONFIG_SND_SST_ATOM_HIFI2_PLATFORM=m -CONFIG_SND_SST_ATOM_HIFI2_PLATFORM_PCI=m -CONFIG_SND_SST_ATOM_HIFI2_PLATFORM_ACPI=m -CONFIG_SND_SOC_INTEL_SKYLAKE=m -CONFIG_SND_SOC_INTEL_SKL=m -CONFIG_SND_SOC_INTEL_APL=m -CONFIG_SND_SOC_INTEL_KBL=m -CONFIG_SND_SOC_INTEL_GLK=m -CONFIG_SND_SOC_INTEL_CNL=m 
-CONFIG_SND_SOC_INTEL_CFL=m -CONFIG_SND_SOC_INTEL_CML_H=m -CONFIG_SND_SOC_INTEL_CML_LP=m -CONFIG_SND_SOC_INTEL_SKYLAKE_FAMILY=m -CONFIG_SND_SOC_INTEL_SKYLAKE_SSP_CLK=m -# CONFIG_SND_SOC_INTEL_SKYLAKE_HDAUDIO_CODEC is not set -CONFIG_SND_SOC_INTEL_SKYLAKE_COMMON=m -CONFIG_SND_SOC_ACPI_INTEL_MATCH=m -CONFIG_SND_SOC_INTEL_MACH=y -# CONFIG_SND_SOC_INTEL_USER_FRIENDLY_LONG_NAMES is not set -CONFIG_SND_SOC_INTEL_HASWELL_MACH=m -CONFIG_SND_SOC_INTEL_BDW_RT5650_MACH=m -CONFIG_SND_SOC_INTEL_BDW_RT5677_MACH=m -CONFIG_SND_SOC_INTEL_BROADWELL_MACH=m -CONFIG_SND_SOC_INTEL_BYTCR_RT5640_MACH=m -CONFIG_SND_SOC_INTEL_BYTCR_RT5651_MACH=m -CONFIG_SND_SOC_INTEL_CHT_BSW_RT5672_MACH=m -CONFIG_SND_SOC_INTEL_CHT_BSW_RT5645_MACH=m -CONFIG_SND_SOC_INTEL_CHT_BSW_MAX98090_TI_MACH=m -CONFIG_SND_SOC_INTEL_CHT_BSW_NAU8824_MACH=m -CONFIG_SND_SOC_INTEL_BYT_CHT_CX2072X_MACH=m -CONFIG_SND_SOC_INTEL_BYT_CHT_DA7213_MACH=m -CONFIG_SND_SOC_INTEL_BYT_CHT_ES8316_MACH=m -# CONFIG_SND_SOC_INTEL_BYT_CHT_NOCODEC_MACH is not set -CONFIG_SND_SOC_INTEL_SKL_RT286_MACH=m -CONFIG_SND_SOC_INTEL_SKL_NAU88L25_SSM4567_MACH=m -CONFIG_SND_SOC_INTEL_SKL_NAU88L25_MAX98357A_MACH=m -CONFIG_SND_SOC_INTEL_DA7219_MAX98357A_GENERIC=m -CONFIG_SND_SOC_INTEL_BXT_DA7219_MAX98357A_COMMON=m -CONFIG_SND_SOC_INTEL_BXT_DA7219_MAX98357A_MACH=m -CONFIG_SND_SOC_INTEL_BXT_RT298_MACH=m -CONFIG_SND_SOC_INTEL_SOF_WM8804_MACH=m -CONFIG_SND_SOC_INTEL_KBL_RT5663_MAX98927_MACH=m -CONFIG_SND_SOC_INTEL_KBL_RT5663_RT5514_MAX98927_MACH=m -CONFIG_SND_SOC_INTEL_KBL_DA7219_MAX98357A_MACH=m -CONFIG_SND_SOC_INTEL_KBL_DA7219_MAX98927_MACH=m -CONFIG_SND_SOC_INTEL_KBL_RT5660_MACH=m -CONFIG_SND_SOC_INTEL_GLK_DA7219_MAX98357A_MACH=m -CONFIG_SND_SOC_INTEL_GLK_RT5682_MAX98357A_MACH=m -CONFIG_SND_SOC_INTEL_SKL_HDA_DSP_GENERIC_MACH=m -CONFIG_SND_SOC_INTEL_SOF_RT5682_MACH=m -CONFIG_SND_SOC_INTEL_SOF_PCM512x_MACH=m -CONFIG_SND_SOC_INTEL_CML_LP_DA7219_MAX98357A_MACH=m -CONFIG_SND_SOC_INTEL_SOF_CML_RT1011_RT5682_MACH=m -CONFIG_SND_SOC_INTEL_SOF_DA7219_MAX98373_MACH=m 
-CONFIG_SND_SOC_INTEL_EHL_RT5660_MACH=m -CONFIG_SND_SOC_MTK_BTCVSD=m -CONFIG_SND_SOC_SOF_TOPLEVEL=y -CONFIG_SND_SOC_SOF_PCI=m -CONFIG_SND_SOC_SOF_ACPI=m -CONFIG_SND_SOC_SOF_OF=m -# CONFIG_SND_SOC_SOF_DEBUG_PROBES is not set -# CONFIG_SND_SOC_SOF_DEVELOPER_SUPPORT is not set -CONFIG_SND_SOC_SOF=m -CONFIG_SND_SOC_SOF_PROBE_WORK_QUEUE=y -CONFIG_SND_SOC_SOF_INTEL_TOPLEVEL=y -CONFIG_SND_SOC_SOF_INTEL_ACPI=m -CONFIG_SND_SOC_SOF_INTEL_PCI=m -CONFIG_SND_SOC_SOF_INTEL_HIFI_EP_IPC=m -CONFIG_SND_SOC_SOF_INTEL_ATOM_HIFI_EP=m -CONFIG_SND_SOC_SOF_INTEL_COMMON=m -CONFIG_SND_SOC_SOF_BROADWELL_SUPPORT=y -CONFIG_SND_SOC_SOF_BROADWELL=m -CONFIG_SND_SOC_SOF_MERRIFIELD_SUPPORT=y -CONFIG_SND_SOC_SOF_MERRIFIELD=m -CONFIG_SND_SOC_SOF_APOLLOLAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_APOLLOLAKE=m -CONFIG_SND_SOC_SOF_GEMINILAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_GEMINILAKE=m -CONFIG_SND_SOC_SOF_CANNONLAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_CANNONLAKE=m -CONFIG_SND_SOC_SOF_COFFEELAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_COFFEELAKE=m -CONFIG_SND_SOC_SOF_ICELAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_ICELAKE=m -CONFIG_SND_SOC_SOF_COMETLAKE=m -CONFIG_SND_SOC_SOF_COMETLAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_COMETLAKE_LP_SUPPORT=y -CONFIG_SND_SOC_SOF_TIGERLAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_TIGERLAKE=m -CONFIG_SND_SOC_SOF_ELKHARTLAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_ELKHARTLAKE=m -CONFIG_SND_SOC_SOF_JASPERLAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_JASPERLAKE=m -CONFIG_SND_SOC_SOF_HDA_COMMON=m -CONFIG_SND_SOC_SOF_HDA_LINK=y -CONFIG_SND_SOC_SOF_HDA_AUDIO_CODEC=y -# CONFIG_SND_SOC_SOF_HDA_ALWAYS_ENABLE_DMI_L1 is not set -CONFIG_SND_SOC_SOF_HDA_LINK_BASELINE=m -CONFIG_SND_SOC_SOF_HDA=m -CONFIG_SND_SOC_SOF_INTEL_SOUNDWIRE_LINK=y -CONFIG_SND_SOC_SOF_INTEL_SOUNDWIRE_LINK_BASELINE=m -CONFIG_SND_SOC_SOF_INTEL_SOUNDWIRE=m -CONFIG_SND_SOC_SOF_XTENSA=m - -# -# STMicroelectronics STM32 SOC audio support -# -# end of STMicroelectronics STM32 SOC audio support - -CONFIG_SND_SOC_XILINX_I2S=m -CONFIG_SND_SOC_XILINX_AUDIO_FORMATTER=m -CONFIG_SND_SOC_XILINX_SPDIF=m 
-CONFIG_SND_SOC_XTFPGA_I2S=m -CONFIG_ZX_TDM=m -CONFIG_SND_SOC_I2C_AND_SPI=m - -# -# CODEC drivers -# -CONFIG_SND_SOC_AC97_CODEC=m -CONFIG_SND_SOC_ADAU_UTILS=m -CONFIG_SND_SOC_ADAU1701=m -CONFIG_SND_SOC_ADAU17X1=m -CONFIG_SND_SOC_ADAU1761=m -CONFIG_SND_SOC_ADAU1761_I2C=m -CONFIG_SND_SOC_ADAU1761_SPI=m -CONFIG_SND_SOC_ADAU7002=m -CONFIG_SND_SOC_ADAU7118=m -CONFIG_SND_SOC_ADAU7118_HW=m -CONFIG_SND_SOC_ADAU7118_I2C=m -CONFIG_SND_SOC_AK4104=m -CONFIG_SND_SOC_AK4118=m -CONFIG_SND_SOC_AK4458=m -CONFIG_SND_SOC_AK4554=m -CONFIG_SND_SOC_AK4613=m -CONFIG_SND_SOC_AK4642=m -CONFIG_SND_SOC_AK5386=m -CONFIG_SND_SOC_AK5558=m -CONFIG_SND_SOC_ALC5623=m -CONFIG_SND_SOC_BD28623=m -# CONFIG_SND_SOC_BT_SCO is not set -CONFIG_SND_SOC_CPCAP=m -CONFIG_SND_SOC_CROS_EC_CODEC=m -CONFIG_SND_SOC_CS35L32=m -CONFIG_SND_SOC_CS35L33=m -CONFIG_SND_SOC_CS35L34=m -CONFIG_SND_SOC_CS35L35=m -CONFIG_SND_SOC_CS35L36=m -CONFIG_SND_SOC_CS42L42=m -CONFIG_SND_SOC_CS42L51=m -CONFIG_SND_SOC_CS42L51_I2C=m -CONFIG_SND_SOC_CS42L52=m -CONFIG_SND_SOC_CS42L56=m -CONFIG_SND_SOC_CS42L73=m -# CONFIG_SND_SOC_CS4234 is not set -CONFIG_SND_SOC_CS4265=m -CONFIG_SND_SOC_CS4270=m -CONFIG_SND_SOC_CS4271=m -CONFIG_SND_SOC_CS4271_I2C=m -CONFIG_SND_SOC_CS4271_SPI=m -CONFIG_SND_SOC_CS42XX8=m -CONFIG_SND_SOC_CS42XX8_I2C=m -CONFIG_SND_SOC_CS43130=m -CONFIG_SND_SOC_CS4341=m -CONFIG_SND_SOC_CS4349=m -CONFIG_SND_SOC_CS53L30=m -CONFIG_SND_SOC_CX2072X=m -CONFIG_SND_SOC_DA7213=m -CONFIG_SND_SOC_DA7219=m -CONFIG_SND_SOC_DMIC=m -CONFIG_SND_SOC_HDMI_CODEC=m -CONFIG_SND_SOC_ES7134=m -CONFIG_SND_SOC_ES7241=m -CONFIG_SND_SOC_ES8316=m -CONFIG_SND_SOC_ES8328=m -CONFIG_SND_SOC_ES8328_I2C=m -CONFIG_SND_SOC_ES8328_SPI=m -CONFIG_SND_SOC_GTM601=m -CONFIG_SND_SOC_HDAC_HDMI=m -CONFIG_SND_SOC_HDAC_HDA=m -CONFIG_SND_SOC_INNO_RK3036=m -CONFIG_SND_SOC_LOCHNAGAR_SC=m -CONFIG_SND_SOC_MAX98088=m -CONFIG_SND_SOC_MAX98090=m -CONFIG_SND_SOC_MAX98357A=m -CONFIG_SND_SOC_MAX98504=m -CONFIG_SND_SOC_MAX9867=m -CONFIG_SND_SOC_MAX98927=m -CONFIG_SND_SOC_MAX98373=m 
-CONFIG_SND_SOC_MAX98373_I2C=m -# CONFIG_SND_SOC_MAX98373_SDW is not set -CONFIG_SND_SOC_MAX98390=m -CONFIG_SND_SOC_MAX9860=m -CONFIG_SND_SOC_MSM8916_WCD_ANALOG=m -CONFIG_SND_SOC_MSM8916_WCD_DIGITAL=m -CONFIG_SND_SOC_PCM1681=m -CONFIG_SND_SOC_PCM1789=m -CONFIG_SND_SOC_PCM1789_I2C=m -CONFIG_SND_SOC_PCM179X=m -CONFIG_SND_SOC_PCM179X_I2C=m -CONFIG_SND_SOC_PCM179X_SPI=m -CONFIG_SND_SOC_PCM186X=m -CONFIG_SND_SOC_PCM186X_I2C=m -CONFIG_SND_SOC_PCM186X_SPI=m -CONFIG_SND_SOC_PCM3060=m -CONFIG_SND_SOC_PCM3060_I2C=m -CONFIG_SND_SOC_PCM3060_SPI=m -CONFIG_SND_SOC_PCM3168A=m -CONFIG_SND_SOC_PCM3168A_I2C=m -CONFIG_SND_SOC_PCM3168A_SPI=m -CONFIG_SND_SOC_PCM512x=m -CONFIG_SND_SOC_PCM512x_I2C=m -CONFIG_SND_SOC_PCM512x_SPI=m -CONFIG_SND_SOC_RK3328=m -CONFIG_SND_SOC_RL6231=m -CONFIG_SND_SOC_RL6347A=m -CONFIG_SND_SOC_RT286=m -CONFIG_SND_SOC_RT298=m -CONFIG_SND_SOC_RT1011=m -CONFIG_SND_SOC_RT1015=m -CONFIG_SND_SOC_RT1308_SDW=m -CONFIG_SND_SOC_RT5514=m -CONFIG_SND_SOC_RT5514_SPI=m -CONFIG_SND_SOC_RT5616=m -CONFIG_SND_SOC_RT5631=m -CONFIG_SND_SOC_RT5640=m -CONFIG_SND_SOC_RT5645=m -CONFIG_SND_SOC_RT5651=m -CONFIG_SND_SOC_RT5660=m -CONFIG_SND_SOC_RT5663=m -CONFIG_SND_SOC_RT5670=m -CONFIG_SND_SOC_RT5677=m -CONFIG_SND_SOC_RT5677_SPI=m -CONFIG_SND_SOC_RT5682=m -CONFIG_SND_SOC_RT5682_I2C=m -CONFIG_SND_SOC_RT5682_SDW=m -CONFIG_SND_SOC_RT700=m -CONFIG_SND_SOC_RT700_SDW=m -CONFIG_SND_SOC_RT711=m -CONFIG_SND_SOC_RT711_SDW=m -CONFIG_SND_SOC_RT715=m -CONFIG_SND_SOC_RT715_SDW=m -CONFIG_SND_SOC_SGTL5000=m -CONFIG_SND_SOC_SI476X=m -CONFIG_SND_SOC_SIGMADSP=m -CONFIG_SND_SOC_SIGMADSP_I2C=m -CONFIG_SND_SOC_SIGMADSP_REGMAP=m -CONFIG_SND_SOC_SIMPLE_AMPLIFIER=m -CONFIG_SND_SOC_SIRF_AUDIO_CODEC=m -CONFIG_SND_SOC_SPDIF=m -CONFIG_SND_SOC_SSM2305=m -CONFIG_SND_SOC_SSM2602=m -CONFIG_SND_SOC_SSM2602_SPI=m -CONFIG_SND_SOC_SSM2602_I2C=m -CONFIG_SND_SOC_SSM4567=m -CONFIG_SND_SOC_STA32X=m -CONFIG_SND_SOC_STA350=m -CONFIG_SND_SOC_STI_SAS=m -CONFIG_SND_SOC_TAS2552=m -CONFIG_SND_SOC_TAS2562=m -# CONFIG_SND_SOC_TAS2764 is 
not set -CONFIG_SND_SOC_TAS2770=m -CONFIG_SND_SOC_TAS5086=m -CONFIG_SND_SOC_TAS571X=m -CONFIG_SND_SOC_TAS5720=m -CONFIG_SND_SOC_TAS6424=m -CONFIG_SND_SOC_TDA7419=m -CONFIG_SND_SOC_TFA9879=m -CONFIG_SND_SOC_TLV320AIC23=m -CONFIG_SND_SOC_TLV320AIC23_I2C=m -CONFIG_SND_SOC_TLV320AIC23_SPI=m -CONFIG_SND_SOC_TLV320AIC31XX=m -CONFIG_SND_SOC_TLV320AIC32X4=m -CONFIG_SND_SOC_TLV320AIC32X4_I2C=m -CONFIG_SND_SOC_TLV320AIC32X4_SPI=m -CONFIG_SND_SOC_TLV320AIC3X=m -CONFIG_SND_SOC_TLV320ADCX140=m -CONFIG_SND_SOC_TS3A227E=m -CONFIG_SND_SOC_TSCS42XX=m -CONFIG_SND_SOC_TSCS454=m -CONFIG_SND_SOC_UDA1334=m -CONFIG_SND_SOC_WCD9335=m -CONFIG_SND_SOC_WCD934X=m -CONFIG_SND_SOC_WM8510=m -CONFIG_SND_SOC_WM8523=m -CONFIG_SND_SOC_WM8524=m -CONFIG_SND_SOC_WM8580=m -CONFIG_SND_SOC_WM8711=m -CONFIG_SND_SOC_WM8728=m -CONFIG_SND_SOC_WM8731=m -CONFIG_SND_SOC_WM8737=m -CONFIG_SND_SOC_WM8741=m -CONFIG_SND_SOC_WM8750=m -CONFIG_SND_SOC_WM8753=m -CONFIG_SND_SOC_WM8770=m -CONFIG_SND_SOC_WM8776=m -CONFIG_SND_SOC_WM8782=m -CONFIG_SND_SOC_WM8804=m -CONFIG_SND_SOC_WM8804_I2C=m -CONFIG_SND_SOC_WM8804_SPI=m -CONFIG_SND_SOC_WM8903=m -CONFIG_SND_SOC_WM8904=m -CONFIG_SND_SOC_WM8960=m -CONFIG_SND_SOC_WM8962=m -CONFIG_SND_SOC_WM8974=m -CONFIG_SND_SOC_WM8978=m -CONFIG_SND_SOC_WM8985=m -CONFIG_SND_SOC_WSA881X=m -CONFIG_SND_SOC_ZL38060=m -CONFIG_SND_SOC_ZX_AUD96P22=m -CONFIG_SND_SOC_MAX9759=m -CONFIG_SND_SOC_MT6351=m -CONFIG_SND_SOC_MT6358=m -CONFIG_SND_SOC_MT6660=m -CONFIG_SND_SOC_NAU8540=m -CONFIG_SND_SOC_NAU8810=m -CONFIG_SND_SOC_NAU8822=m -CONFIG_SND_SOC_NAU8824=m -CONFIG_SND_SOC_NAU8825=m -CONFIG_SND_SOC_TPA6130A2=m -# end of CODEC drivers - -CONFIG_SND_SIMPLE_CARD_UTILS=m -CONFIG_SND_SIMPLE_CARD=m -CONFIG_SND_AUDIO_GRAPH_CARD=m -CONFIG_SND_X86=y -CONFIG_HDMI_LPE_AUDIO=m -CONFIG_SND_SYNTH_EMUX=m -CONFIG_SND_XEN_FRONTEND=m -CONFIG_AC97_BUS=m - -# -# HID support -# -CONFIG_HID=m -CONFIG_HID_BATTERY_STRENGTH=y -CONFIG_HIDRAW=y -CONFIG_UHID=m -CONFIG_HID_GENERIC=m - -# -# Special HID drivers -# -CONFIG_HID_A4TECH=m 
-CONFIG_HID_ACCUTOUCH=m -CONFIG_HID_ACRUX=m -CONFIG_HID_ACRUX_FF=y -CONFIG_HID_APPLE=m -CONFIG_HID_APPLEIR=m -CONFIG_HID_ASUS=m -CONFIG_HID_AUREAL=m -CONFIG_HID_BELKIN=m -CONFIG_HID_BETOP_FF=m -CONFIG_HID_BIGBEN_FF=m -CONFIG_HID_CHERRY=m -CONFIG_HID_CHICONY=m -CONFIG_HID_CORSAIR=m -CONFIG_HID_COUGAR=m -CONFIG_HID_MACALLY=m -CONFIG_HID_PRODIKEYS=m -CONFIG_HID_CMEDIA=m -CONFIG_HID_CP2112=m -CONFIG_HID_CREATIVE_SB0540=m -CONFIG_HID_CYPRESS=m -CONFIG_HID_DRAGONRISE=m -CONFIG_DRAGONRISE_FF=y -CONFIG_HID_EMS_FF=m -CONFIG_HID_ELAN=m -CONFIG_HID_ELECOM=m -CONFIG_HID_ELO=m -CONFIG_HID_EZKEY=m -CONFIG_HID_GEMBIRD=m -CONFIG_HID_GFRM=m -CONFIG_HID_GLORIOUS=m -CONFIG_HID_HOLTEK=m -CONFIG_HOLTEK_FF=y -CONFIG_HID_GOOGLE_HAMMER=m -# CONFIG_HID_VIVALDI is not set -CONFIG_HID_GT683R=m -CONFIG_HID_KEYTOUCH=m -CONFIG_HID_KYE=m -CONFIG_HID_UCLOGIC=m -CONFIG_HID_WALTOP=m -CONFIG_HID_VIEWSONIC=m -CONFIG_HID_GYRATION=m -CONFIG_HID_ICADE=m -CONFIG_HID_ITE=m -CONFIG_HID_JABRA=m -CONFIG_HID_TWINHAN=m -CONFIG_HID_KENSINGTON=m -CONFIG_HID_LCPOWER=m -CONFIG_HID_LED=m -CONFIG_HID_LENOVO=m -CONFIG_HID_LOGITECH=m -CONFIG_HID_LOGITECH_DJ=m -CONFIG_HID_LOGITECH_HIDPP=m -CONFIG_LOGITECH_FF=y -CONFIG_LOGIRUMBLEPAD2_FF=y -CONFIG_LOGIG940_FF=y -CONFIG_LOGIWHEELS_FF=y -CONFIG_HID_MAGICMOUSE=m -CONFIG_HID_MALTRON=m -CONFIG_HID_MAYFLASH=m -CONFIG_HID_REDRAGON=m -CONFIG_HID_MICROSOFT=m -CONFIG_HID_MONTEREY=m -CONFIG_HID_MULTITOUCH=m -CONFIG_HID_NTI=m -CONFIG_HID_NTRIG=m -CONFIG_HID_ORTEK=m -CONFIG_HID_PANTHERLORD=m -CONFIG_PANTHERLORD_FF=y -CONFIG_HID_PENMOUNT=m -CONFIG_HID_PETALYNX=m -CONFIG_HID_PICOLCD=m -CONFIG_HID_PICOLCD_FB=y -CONFIG_HID_PICOLCD_BACKLIGHT=y -CONFIG_HID_PICOLCD_LCD=y -CONFIG_HID_PICOLCD_LEDS=y -CONFIG_HID_PICOLCD_CIR=y -CONFIG_HID_PLANTRONICS=m -CONFIG_HID_PRIMAX=m -CONFIG_HID_RETRODE=m -CONFIG_HID_ROCCAT=m -CONFIG_HID_SAITEK=m -CONFIG_HID_SAMSUNG=m -CONFIG_HID_SONY=m -CONFIG_SONY_FF=y -CONFIG_HID_SPEEDLINK=m -CONFIG_HID_STEAM=m -CONFIG_HID_STEELSERIES=m -CONFIG_HID_SUNPLUS=m 
-CONFIG_HID_RMI=m -CONFIG_HID_GREENASIA=m -CONFIG_GREENASIA_FF=y -CONFIG_HID_HYPERV_MOUSE=m -CONFIG_HID_SMARTJOYPLUS=m -CONFIG_SMARTJOYPLUS_FF=y -CONFIG_HID_TIVO=m -CONFIG_HID_TOPSEED=m -CONFIG_HID_THINGM=m -CONFIG_HID_THRUSTMASTER=m -CONFIG_THRUSTMASTER_FF=y -CONFIG_HID_UDRAW_PS3=m -CONFIG_HID_U2FZERO=m -CONFIG_HID_WACOM=m -CONFIG_HID_WIIMOTE=m -CONFIG_HID_XINMO=m -CONFIG_HID_ZEROPLUS=m -CONFIG_ZEROPLUS_FF=y -CONFIG_HID_ZYDACRON=m -CONFIG_HID_SENSOR_HUB=m -# CONFIG_HID_SENSOR_CUSTOM_SENSOR is not set -CONFIG_HID_ALPS=m -CONFIG_HID_MCP2221=m -# end of Special HID drivers - -# -# USB HID support -# -CONFIG_USB_HID=m -CONFIG_HID_PID=y -CONFIG_USB_HIDDEV=y - -# -# USB HID Boot Protocol drivers -# -# CONFIG_USB_KBD is not set -# CONFIG_USB_MOUSE is not set -# end of USB HID Boot Protocol drivers -# end of USB HID support - -# -# I2C HID support -# -CONFIG_I2C_HID=m -# end of I2C HID support - -# -# Intel ISH HID support -# -CONFIG_INTEL_ISH_HID=m -CONFIG_INTEL_ISH_FIRMWARE_DOWNLOADER=m -# end of Intel ISH HID support -# end of HID support - -CONFIG_USB_OHCI_LITTLE_ENDIAN=y -CONFIG_USB_SUPPORT=y -CONFIG_USB_COMMON=y -CONFIG_USB_LED_TRIG=y -CONFIG_USB_ULPI_BUS=m -CONFIG_USB_CONN_GPIO=m -CONFIG_USB_ARCH_HAS_HCD=y -CONFIG_USB=y -CONFIG_USB_PCI=y -CONFIG_USB_ANNOUNCE_NEW_DEVICES=y - -# -# Miscellaneous USB options -# -CONFIG_USB_DEFAULT_PERSIST=y -# CONFIG_USB_FEW_INIT_RETRIES is not set -CONFIG_USB_DYNAMIC_MINORS=y -# CONFIG_USB_OTG is not set -# CONFIG_USB_OTG_PRODUCTLIST is not set -# CONFIG_USB_OTG_DISABLE_EXTERNAL_HUB is not set -CONFIG_USB_LEDS_TRIGGER_USBPORT=m -CONFIG_USB_AUTOSUSPEND_DELAY=2 -CONFIG_USB_MON=m - -# -# USB Host Controller Drivers -# -CONFIG_USB_C67X00_HCD=m -CONFIG_USB_XHCI_HCD=m -# CONFIG_USB_XHCI_DBGCAP is not set -CONFIG_USB_XHCI_PCI=m -CONFIG_USB_XHCI_PCI_RENESAS=m -CONFIG_USB_XHCI_PLATFORM=m -CONFIG_USB_EHCI_HCD=m -CONFIG_USB_EHCI_ROOT_HUB_TT=y -CONFIG_USB_EHCI_TT_NEWSCHED=y -CONFIG_USB_EHCI_PCI=m -CONFIG_USB_EHCI_FSL=m 
-CONFIG_USB_EHCI_HCD_PLATFORM=m -CONFIG_USB_OXU210HP_HCD=m -CONFIG_USB_ISP116X_HCD=m -CONFIG_USB_FOTG210_HCD=m -CONFIG_USB_MAX3421_HCD=m -CONFIG_USB_OHCI_HCD=m -CONFIG_USB_OHCI_HCD_PCI=m -# CONFIG_USB_OHCI_HCD_SSB is not set -CONFIG_USB_OHCI_HCD_PLATFORM=m -CONFIG_USB_UHCI_HCD=m -CONFIG_USB_U132_HCD=m -CONFIG_USB_SL811_HCD=m -# CONFIG_USB_SL811_HCD_ISO is not set -CONFIG_USB_SL811_CS=m -CONFIG_USB_R8A66597_HCD=m -CONFIG_USB_HCD_BCMA=m -CONFIG_USB_HCD_SSB=m -# CONFIG_USB_HCD_TEST_MODE is not set - -# -# USB Device Class drivers -# -CONFIG_USB_ACM=m -CONFIG_USB_PRINTER=m -CONFIG_USB_WDM=m -CONFIG_USB_TMC=m - -# -# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may -# - -# -# also be needed; see USB_STORAGE Help for more info -# -CONFIG_USB_STORAGE=m -# CONFIG_USB_STORAGE_DEBUG is not set -CONFIG_USB_STORAGE_REALTEK=m -CONFIG_REALTEK_AUTOPM=y -CONFIG_USB_STORAGE_DATAFAB=m -CONFIG_USB_STORAGE_FREECOM=m -CONFIG_USB_STORAGE_ISD200=m -CONFIG_USB_STORAGE_USBAT=m -CONFIG_USB_STORAGE_SDDR09=m -CONFIG_USB_STORAGE_SDDR55=m -CONFIG_USB_STORAGE_JUMPSHOT=m -CONFIG_USB_STORAGE_ALAUDA=m -CONFIG_USB_STORAGE_ONETOUCH=m -CONFIG_USB_STORAGE_KARMA=m -CONFIG_USB_STORAGE_CYPRESS_ATACB=m -CONFIG_USB_STORAGE_ENE_UB6250=m -CONFIG_USB_UAS=m - -# -# USB Imaging devices -# -CONFIG_USB_MDC800=m -CONFIG_USB_MICROTEK=m -CONFIG_USBIP_CORE=m -CONFIG_USBIP_VHCI_HCD=m -CONFIG_USBIP_VHCI_HC_PORTS=8 -CONFIG_USBIP_VHCI_NR_HCS=1 -CONFIG_USBIP_HOST=m -CONFIG_USBIP_VUDC=m -# CONFIG_USBIP_DEBUG is not set -CONFIG_USB_CDNS3=m -CONFIG_USB_CDNS3_GADGET=y -CONFIG_USB_CDNS3_HOST=y -CONFIG_USB_CDNS3_PCI_WRAP=m -CONFIG_USB_MUSB_HDRC=m -# CONFIG_USB_MUSB_HOST is not set -# CONFIG_USB_MUSB_GADGET is not set -CONFIG_USB_MUSB_DUAL_ROLE=y - -# -# Platform Glue Layer -# - -# -# MUSB DMA mode -# -# CONFIG_MUSB_PIO_ONLY is not set -CONFIG_USB_DWC3=m -CONFIG_USB_DWC3_ULPI=y -# CONFIG_USB_DWC3_HOST is not set -# CONFIG_USB_DWC3_GADGET is not set -CONFIG_USB_DWC3_DUAL_ROLE=y - -# -# Platform Glue Driver Support -# 
-CONFIG_USB_DWC3_PCI=m -CONFIG_USB_DWC3_HAPS=m -CONFIG_USB_DWC3_OF_SIMPLE=m -CONFIG_USB_DWC2=m -# CONFIG_USB_DWC2_HOST is not set - -# -# Gadget/Dual-role mode requires USB Gadget support to be enabled -# -# CONFIG_USB_DWC2_PERIPHERAL is not set -CONFIG_USB_DWC2_DUAL_ROLE=y -CONFIG_USB_DWC2_PCI=m -# CONFIG_USB_DWC2_DEBUG is not set -# CONFIG_USB_DWC2_TRACK_MISSED_SOFS is not set -CONFIG_USB_CHIPIDEA=m -CONFIG_USB_CHIPIDEA_UDC=y -CONFIG_USB_CHIPIDEA_HOST=y -CONFIG_USB_CHIPIDEA_PCI=m -CONFIG_USB_CHIPIDEA_MSM=m -CONFIG_USB_CHIPIDEA_IMX=m -CONFIG_USB_CHIPIDEA_GENERIC=m -CONFIG_USB_CHIPIDEA_TEGRA=m -CONFIG_USB_ISP1760=m -CONFIG_USB_ISP1760_HCD=y -CONFIG_USB_ISP1761_UDC=y -# CONFIG_USB_ISP1760_HOST_ROLE is not set -# CONFIG_USB_ISP1760_GADGET_ROLE is not set -CONFIG_USB_ISP1760_DUAL_ROLE=y - -# -# USB port drivers -# -CONFIG_USB_USS720=m -CONFIG_USB_SERIAL=y -CONFIG_USB_SERIAL_CONSOLE=y -CONFIG_USB_SERIAL_GENERIC=y -CONFIG_USB_SERIAL_SIMPLE=m -CONFIG_USB_SERIAL_AIRCABLE=m -CONFIG_USB_SERIAL_ARK3116=m -CONFIG_USB_SERIAL_BELKIN=m -CONFIG_USB_SERIAL_CH341=m -CONFIG_USB_SERIAL_WHITEHEAT=m -CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m -CONFIG_USB_SERIAL_CP210X=m -CONFIG_USB_SERIAL_CYPRESS_M8=m -CONFIG_USB_SERIAL_EMPEG=m -CONFIG_USB_SERIAL_FTDI_SIO=m -CONFIG_USB_SERIAL_VISOR=m -CONFIG_USB_SERIAL_IPAQ=m -CONFIG_USB_SERIAL_IR=m -CONFIG_USB_SERIAL_EDGEPORT=m -CONFIG_USB_SERIAL_EDGEPORT_TI=m -CONFIG_USB_SERIAL_F81232=m -CONFIG_USB_SERIAL_F8153X=m -CONFIG_USB_SERIAL_GARMIN=m -CONFIG_USB_SERIAL_IPW=m -CONFIG_USB_SERIAL_IUU=m -CONFIG_USB_SERIAL_KEYSPAN_PDA=m -CONFIG_USB_SERIAL_KEYSPAN=m -CONFIG_USB_SERIAL_KLSI=m -CONFIG_USB_SERIAL_KOBIL_SCT=m -CONFIG_USB_SERIAL_MCT_U232=m -CONFIG_USB_SERIAL_METRO=m -CONFIG_USB_SERIAL_MOS7720=m -CONFIG_USB_SERIAL_MOS7715_PARPORT=y -CONFIG_USB_SERIAL_MOS7840=m -CONFIG_USB_SERIAL_MXUPORT=m -CONFIG_USB_SERIAL_NAVMAN=m -CONFIG_USB_SERIAL_PL2303=m -CONFIG_USB_SERIAL_OTI6858=m -CONFIG_USB_SERIAL_QCAUX=m -CONFIG_USB_SERIAL_QUALCOMM=m -CONFIG_USB_SERIAL_SPCP8X5=m 
-CONFIG_USB_SERIAL_SAFE=m -# CONFIG_USB_SERIAL_SAFE_PADDED is not set -CONFIG_USB_SERIAL_SIERRAWIRELESS=m -CONFIG_USB_SERIAL_SYMBOL=m -CONFIG_USB_SERIAL_TI=m -CONFIG_USB_SERIAL_CYBERJACK=m -CONFIG_USB_SERIAL_XIRCOM=m -CONFIG_USB_SERIAL_WWAN=m -CONFIG_USB_SERIAL_OPTION=m -CONFIG_USB_SERIAL_OMNINET=m -CONFIG_USB_SERIAL_OPTICON=m -CONFIG_USB_SERIAL_XSENS_MT=m -CONFIG_USB_SERIAL_WISHBONE=m -CONFIG_USB_SERIAL_SSU100=m -CONFIG_USB_SERIAL_QT2=m -CONFIG_USB_SERIAL_UPD78F0730=m -CONFIG_USB_SERIAL_DEBUG=m - -# -# USB Miscellaneous drivers -# -CONFIG_USB_EMI62=m -CONFIG_USB_EMI26=m -CONFIG_USB_ADUTUX=m -CONFIG_USB_SEVSEG=m -CONFIG_USB_LEGOTOWER=m -CONFIG_USB_LCD=m -CONFIG_USB_CYPRESS_CY7C63=m -CONFIG_USB_CYTHERM=m -CONFIG_USB_IDMOUSE=m -CONFIG_USB_FTDI_ELAN=m -CONFIG_USB_APPLEDISPLAY=m -CONFIG_APPLE_MFI_FASTCHARGE=m -CONFIG_USB_SISUSBVGA=m -CONFIG_USB_SISUSBVGA_CON=y -CONFIG_USB_LD=m -CONFIG_USB_TRANCEVIBRATOR=m -CONFIG_USB_IOWARRIOR=m -CONFIG_USB_TEST=m -CONFIG_USB_EHSET_TEST_FIXTURE=m -CONFIG_USB_ISIGHTFW=m -CONFIG_USB_YUREX=m -CONFIG_USB_EZUSB_FX2=m -CONFIG_USB_HUB_USB251XB=m -CONFIG_USB_HSIC_USB3503=m -CONFIG_USB_HSIC_USB4604=m -CONFIG_USB_LINK_LAYER_TEST=m -CONFIG_USB_CHAOSKEY=m -CONFIG_USB_ATM=m -CONFIG_USB_SPEEDTOUCH=m -CONFIG_USB_CXACRU=m -CONFIG_USB_UEAGLEATM=m -CONFIG_USB_XUSBATM=m - -# -# USB Physical Layer drivers -# -CONFIG_USB_PHY=y -CONFIG_NOP_USB_XCEIV=m -CONFIG_USB_GPIO_VBUS=m -CONFIG_TAHVO_USB=m -# CONFIG_TAHVO_USB_HOST_BY_DEFAULT is not set -CONFIG_USB_ISP1301=m -# end of USB Physical Layer drivers - -CONFIG_USB_GADGET=m -# CONFIG_USB_GADGET_DEBUG is not set -# CONFIG_USB_GADGET_DEBUG_FILES is not set -# CONFIG_USB_GADGET_DEBUG_FS is not set -CONFIG_USB_GADGET_VBUS_DRAW=2 -CONFIG_USB_GADGET_STORAGE_NUM_BUFFERS=2 -CONFIG_U_SERIAL_CONSOLE=y - -# -# USB Peripheral Controller -# -CONFIG_USB_FOTG210_UDC=m -CONFIG_USB_GR_UDC=m -CONFIG_USB_R8A66597=m -CONFIG_USB_PXA27X=m -CONFIG_USB_MV_UDC=m -CONFIG_USB_MV_U3D=m -CONFIG_USB_SNP_CORE=m -CONFIG_USB_SNP_UDC_PLAT=m 
-CONFIG_USB_M66592=m -CONFIG_USB_BDC_UDC=m - -# -# Platform Support -# -CONFIG_USB_BDC_PCI=m -CONFIG_USB_AMD5536UDC=m -CONFIG_USB_NET2272=m -CONFIG_USB_NET2272_DMA=y -CONFIG_USB_NET2280=m -CONFIG_USB_GOKU=m -CONFIG_USB_EG20T=m -CONFIG_USB_GADGET_XILINX=m -CONFIG_USB_MAX3420_UDC=m -CONFIG_USB_DUMMY_HCD=m -# end of USB Peripheral Controller - -CONFIG_USB_LIBCOMPOSITE=m -CONFIG_USB_F_ACM=m -CONFIG_USB_F_SS_LB=m -CONFIG_USB_U_SERIAL=m -CONFIG_USB_U_ETHER=m -CONFIG_USB_U_AUDIO=m -CONFIG_USB_F_SERIAL=m -CONFIG_USB_F_OBEX=m -CONFIG_USB_F_NCM=m -CONFIG_USB_F_ECM=m -CONFIG_USB_F_PHONET=m -CONFIG_USB_F_EEM=m -CONFIG_USB_F_SUBSET=m -CONFIG_USB_F_RNDIS=m -CONFIG_USB_F_MASS_STORAGE=m -CONFIG_USB_F_FS=m -CONFIG_USB_F_UAC1=m -CONFIG_USB_F_UAC1_LEGACY=m -CONFIG_USB_F_UAC2=m -CONFIG_USB_F_UVC=m -CONFIG_USB_F_MIDI=m -CONFIG_USB_F_HID=m -CONFIG_USB_F_PRINTER=m -CONFIG_USB_F_TCM=m -CONFIG_USB_CONFIGFS=m -CONFIG_USB_CONFIGFS_SERIAL=y -CONFIG_USB_CONFIGFS_ACM=y -CONFIG_USB_CONFIGFS_OBEX=y -CONFIG_USB_CONFIGFS_NCM=y -CONFIG_USB_CONFIGFS_ECM=y -CONFIG_USB_CONFIGFS_ECM_SUBSET=y -CONFIG_USB_CONFIGFS_RNDIS=y -CONFIG_USB_CONFIGFS_EEM=y -CONFIG_USB_CONFIGFS_PHONET=y -CONFIG_USB_CONFIGFS_MASS_STORAGE=y -CONFIG_USB_CONFIGFS_F_LB_SS=y -CONFIG_USB_CONFIGFS_F_FS=y -CONFIG_USB_CONFIGFS_F_UAC1=y -CONFIG_USB_CONFIGFS_F_UAC1_LEGACY=y -CONFIG_USB_CONFIGFS_F_UAC2=y -CONFIG_USB_CONFIGFS_F_MIDI=y -CONFIG_USB_CONFIGFS_F_HID=y -CONFIG_USB_CONFIGFS_F_UVC=y -CONFIG_USB_CONFIGFS_F_PRINTER=y -CONFIG_USB_CONFIGFS_F_TCM=y - -# -# USB Gadget precomposed configurations -# -CONFIG_USB_ZERO=m -CONFIG_USB_AUDIO=m -# CONFIG_GADGET_UAC1 is not set -CONFIG_USB_ETH=m -CONFIG_USB_ETH_RNDIS=y -CONFIG_USB_ETH_EEM=y -CONFIG_USB_G_NCM=m -CONFIG_USB_GADGETFS=m -CONFIG_USB_FUNCTIONFS=m -CONFIG_USB_FUNCTIONFS_ETH=y -CONFIG_USB_FUNCTIONFS_RNDIS=y -CONFIG_USB_FUNCTIONFS_GENERIC=y -CONFIG_USB_MASS_STORAGE=m -CONFIG_USB_GADGET_TARGET=m -CONFIG_USB_G_SERIAL=m -CONFIG_USB_MIDI_GADGET=m -CONFIG_USB_G_PRINTER=m -CONFIG_USB_CDC_COMPOSITE=m 
-CONFIG_USB_G_NOKIA=m -CONFIG_USB_G_ACM_MS=m -CONFIG_USB_G_MULTI=m -CONFIG_USB_G_MULTI_RNDIS=y -CONFIG_USB_G_MULTI_CDC=y -CONFIG_USB_G_HID=m -CONFIG_USB_G_DBGP=m -# CONFIG_USB_G_DBGP_PRINTK is not set -CONFIG_USB_G_DBGP_SERIAL=y -CONFIG_USB_G_WEBCAM=m -CONFIG_USB_RAW_GADGET=m -# end of USB Gadget precomposed configurations - -CONFIG_TYPEC=m -CONFIG_TYPEC_TCPM=m -CONFIG_TYPEC_TCPCI=m -CONFIG_TYPEC_RT1711H=m -# CONFIG_TYPEC_MT6360 is not set -# CONFIG_TYPEC_TCPCI_MAXIM is not set -CONFIG_TYPEC_FUSB302=m -CONFIG_TYPEC_WCOVE=m -CONFIG_TYPEC_UCSI=m -CONFIG_UCSI_CCG=m -CONFIG_UCSI_ACPI=m -CONFIG_TYPEC_HD3SS3220=m -CONFIG_TYPEC_TPS6598X=m -# CONFIG_TYPEC_STUSB160X is not set - -# -# USB Type-C Multiplexer/DeMultiplexer Switch support -# -CONFIG_TYPEC_MUX_PI3USB30532=m -CONFIG_TYPEC_MUX_INTEL_PMC=m -# end of USB Type-C Multiplexer/DeMultiplexer Switch support - -# -# USB Type-C Alternate Mode drivers -# -CONFIG_TYPEC_DP_ALTMODE=m -CONFIG_TYPEC_NVIDIA_ALTMODE=m -# end of USB Type-C Alternate Mode drivers - -CONFIG_USB_ROLE_SWITCH=m -CONFIG_USB_ROLES_INTEL_XHCI=m -CONFIG_MMC=m -CONFIG_PWRSEQ_EMMC=m -CONFIG_PWRSEQ_SD8787=m -CONFIG_PWRSEQ_SIMPLE=m -CONFIG_MMC_BLOCK=m -CONFIG_MMC_BLOCK_MINORS=8 -CONFIG_SDIO_UART=m -CONFIG_MMC_TEST=m - -# -# MMC/SD/SDIO Host Controller Drivers -# -# CONFIG_MMC_DEBUG is not set -CONFIG_MMC_SDHCI=m -CONFIG_MMC_SDHCI_IO_ACCESSORS=y -CONFIG_MMC_SDHCI_PCI=m -CONFIG_MMC_RICOH_MMC=y -CONFIG_MMC_SDHCI_ACPI=m -CONFIG_MMC_SDHCI_PLTFM=m -CONFIG_MMC_SDHCI_OF_ARASAN=m -CONFIG_MMC_SDHCI_OF_ASPEED=m -CONFIG_MMC_SDHCI_OF_AT91=m -CONFIG_MMC_SDHCI_OF_DWCMSHC=m -CONFIG_MMC_SDHCI_CADENCE=m -CONFIG_MMC_SDHCI_F_SDH30=m -CONFIG_MMC_SDHCI_MILBEAUT=m -CONFIG_MMC_WBSD=m -CONFIG_MMC_ALCOR=m -CONFIG_MMC_TIFM_SD=m -CONFIG_MMC_SPI=m -CONFIG_MMC_SDRICOH_CS=m -CONFIG_MMC_CB710=m -CONFIG_MMC_VIA_SDMMC=m -CONFIG_MMC_VUB300=m -CONFIG_MMC_USHC=m -CONFIG_MMC_USDHI6ROL0=m -CONFIG_MMC_REALTEK_PCI=m -CONFIG_MMC_REALTEK_USB=m -CONFIG_MMC_CQHCI=m -CONFIG_MMC_HSQ=m 
-CONFIG_MMC_TOSHIBA_PCI=m -CONFIG_MMC_MTK=m -CONFIG_MMC_SDHCI_XENON=m -CONFIG_MMC_SDHCI_OMAP=m -CONFIG_MMC_SDHCI_AM654=m -CONFIG_MMC_SDHCI_EXTERNAL_DMA=y -CONFIG_MEMSTICK=m -# CONFIG_MEMSTICK_DEBUG is not set - -# -# MemoryStick drivers -# -# CONFIG_MEMSTICK_UNSAFE_RESUME is not set -CONFIG_MSPRO_BLOCK=m -CONFIG_MS_BLOCK=m - -# -# MemoryStick Host Controller Drivers -# -CONFIG_MEMSTICK_TIFM_MS=m -CONFIG_MEMSTICK_JMICRON_38X=m -CONFIG_MEMSTICK_R592=m -CONFIG_MEMSTICK_REALTEK_PCI=m -CONFIG_MEMSTICK_REALTEK_USB=m -CONFIG_NEW_LEDS=y -CONFIG_LEDS_CLASS=y -CONFIG_LEDS_CLASS_FLASH=m -CONFIG_LEDS_CLASS_MULTICOLOR=m -CONFIG_LEDS_BRIGHTNESS_HW_CHANGED=y - -# -# LED drivers -# -CONFIG_LEDS_88PM860X=m -CONFIG_LEDS_AAT1290=m -CONFIG_LEDS_AN30259A=m -CONFIG_LEDS_APU=m -CONFIG_LEDS_AS3645A=m -CONFIG_LEDS_AW2013=m -CONFIG_LEDS_BCM6328=m -CONFIG_LEDS_BCM6358=m -CONFIG_LEDS_CPCAP=m -CONFIG_LEDS_CR0014114=m -CONFIG_LEDS_EL15203000=m -CONFIG_LEDS_LM3530=m -CONFIG_LEDS_LM3532=m -CONFIG_LEDS_LM3533=m -CONFIG_LEDS_LM3642=m -CONFIG_LEDS_LM3692X=m -CONFIG_LEDS_LM3601X=m -CONFIG_LEDS_MT6323=m -CONFIG_LEDS_PCA9532=m -CONFIG_LEDS_PCA9532_GPIO=y -CONFIG_LEDS_GPIO=m -CONFIG_LEDS_LP3944=m -CONFIG_LEDS_LP3952=m -# CONFIG_LEDS_LP50XX is not set -# CONFIG_LEDS_LP55XX_COMMON is not set -CONFIG_LEDS_LP8788=m -CONFIG_LEDS_LP8860=m -CONFIG_LEDS_CLEVO_MAIL=m -CONFIG_LEDS_PCA955X=m -CONFIG_LEDS_PCA955X_GPIO=y -CONFIG_LEDS_PCA963X=m -CONFIG_LEDS_WM831X_STATUS=m -CONFIG_LEDS_WM8350=m -CONFIG_LEDS_DA903X=m -CONFIG_LEDS_DA9052=m -CONFIG_LEDS_DAC124S085=m -CONFIG_LEDS_PWM=m -CONFIG_LEDS_REGULATOR=m -CONFIG_LEDS_BD2802=m -CONFIG_LEDS_INTEL_SS4200=m -CONFIG_LEDS_LT3593=m -CONFIG_LEDS_ADP5520=m -CONFIG_LEDS_MC13783=m -CONFIG_LEDS_TCA6507=m -CONFIG_LEDS_TLC591XX=m -CONFIG_LEDS_MAX77650=m -CONFIG_LEDS_MAX77693=m -CONFIG_LEDS_MAX8997=m -CONFIG_LEDS_LM355x=m -CONFIG_LEDS_MENF21BMC=m -CONFIG_LEDS_KTD2692=m -CONFIG_LEDS_IS31FL319X=m -CONFIG_LEDS_IS31FL32XX=m - -# -# LED driver for blink(1) USB RGB LED is under Special 
HID drivers (HID_THINGM) -# -CONFIG_LEDS_BLINKM=m -CONFIG_LEDS_SYSCON=y -CONFIG_LEDS_MLXCPLD=m -CONFIG_LEDS_MLXREG=m -CONFIG_LEDS_USER=m -CONFIG_LEDS_NIC78BX=m -CONFIG_LEDS_SPI_BYTE=m -CONFIG_LEDS_TI_LMU_COMMON=m -CONFIG_LEDS_LM3697=m -CONFIG_LEDS_LM36274=m -CONFIG_LEDS_TPS6105X=m -CONFIG_LEDS_SGM3140=m - -# -# LED Triggers -# -CONFIG_LEDS_TRIGGERS=y -CONFIG_LEDS_TRIGGER_TIMER=m -CONFIG_LEDS_TRIGGER_ONESHOT=m -CONFIG_LEDS_TRIGGER_DISK=y -CONFIG_LEDS_TRIGGER_MTD=y -CONFIG_LEDS_TRIGGER_HEARTBEAT=m -CONFIG_LEDS_TRIGGER_BACKLIGHT=m -CONFIG_LEDS_TRIGGER_CPU=y -CONFIG_LEDS_TRIGGER_ACTIVITY=m -CONFIG_LEDS_TRIGGER_GPIO=m -CONFIG_LEDS_TRIGGER_DEFAULT_ON=m - -# -# iptables trigger is under Netfilter config (LED target) -# -CONFIG_LEDS_TRIGGER_TRANSIENT=m -CONFIG_LEDS_TRIGGER_CAMERA=m -CONFIG_LEDS_TRIGGER_PANIC=y -CONFIG_LEDS_TRIGGER_NETDEV=m -CONFIG_LEDS_TRIGGER_PATTERN=m -CONFIG_LEDS_TRIGGER_AUDIO=m -CONFIG_ACCESSIBILITY=y -CONFIG_A11Y_BRAILLE_CONSOLE=y - -# -# Speakup console speech -# -CONFIG_SPEAKUP=m -CONFIG_SPEAKUP_SYNTH_ACNTSA=m -CONFIG_SPEAKUP_SYNTH_APOLLO=m -CONFIG_SPEAKUP_SYNTH_AUDPTR=m -CONFIG_SPEAKUP_SYNTH_BNS=m -CONFIG_SPEAKUP_SYNTH_DECTLK=m -CONFIG_SPEAKUP_SYNTH_DECEXT=m -CONFIG_SPEAKUP_SYNTH_LTLK=m -CONFIG_SPEAKUP_SYNTH_SOFT=m -CONFIG_SPEAKUP_SYNTH_SPKOUT=m -CONFIG_SPEAKUP_SYNTH_TXPRT=m -CONFIG_SPEAKUP_SYNTH_DUMMY=m -# end of Speakup console speech - -CONFIG_INFINIBAND=m -CONFIG_INFINIBAND_USER_MAD=m -CONFIG_INFINIBAND_USER_ACCESS=m -CONFIG_INFINIBAND_USER_MEM=y -CONFIG_INFINIBAND_ON_DEMAND_PAGING=y -CONFIG_INFINIBAND_ADDR_TRANS=y -CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS=y -CONFIG_INFINIBAND_MTHCA=m -CONFIG_INFINIBAND_MTHCA_DEBUG=y -CONFIG_INFINIBAND_QIB=m -CONFIG_INFINIBAND_QIB_DCA=y -CONFIG_INFINIBAND_CXGB4=m -CONFIG_INFINIBAND_EFA=m -CONFIG_INFINIBAND_I40IW=m -CONFIG_MLX4_INFINIBAND=m -CONFIG_MLX5_INFINIBAND=m -CONFIG_INFINIBAND_OCRDMA=m -CONFIG_INFINIBAND_VMWARE_PVRDMA=m -CONFIG_INFINIBAND_USNIC=m -CONFIG_INFINIBAND_BNXT_RE=m -CONFIG_INFINIBAND_HFI1=m -# 
CONFIG_HFI1_DEBUG_SDMA_ORDER is not set -# CONFIG_SDMA_VERBOSITY is not set -CONFIG_INFINIBAND_QEDR=m -CONFIG_INFINIBAND_RDMAVT=m -CONFIG_RDMA_RXE=m -CONFIG_RDMA_SIW=m -CONFIG_INFINIBAND_IPOIB=m -CONFIG_INFINIBAND_IPOIB_CM=y -CONFIG_INFINIBAND_IPOIB_DEBUG=y -# CONFIG_INFINIBAND_IPOIB_DEBUG_DATA is not set -CONFIG_INFINIBAND_SRP=m -CONFIG_INFINIBAND_SRPT=m -CONFIG_INFINIBAND_ISER=m -CONFIG_INFINIBAND_ISERT=m -CONFIG_INFINIBAND_RTRS=m -CONFIG_INFINIBAND_RTRS_CLIENT=m -CONFIG_INFINIBAND_RTRS_SERVER=m -CONFIG_INFINIBAND_OPA_VNIC=m -CONFIG_EDAC_ATOMIC_SCRUB=y -CONFIG_EDAC_SUPPORT=y -CONFIG_EDAC=y -CONFIG_EDAC_LEGACY_SYSFS=y -# CONFIG_EDAC_DEBUG is not set -CONFIG_EDAC_DECODE_MCE=m -CONFIG_EDAC_GHES=y -CONFIG_EDAC_AMD64=m -# CONFIG_EDAC_AMD64_ERROR_INJECTION is not set -CONFIG_EDAC_E752X=m -CONFIG_EDAC_I82975X=m -CONFIG_EDAC_I3000=m -CONFIG_EDAC_I3200=m -CONFIG_EDAC_IE31200=m -CONFIG_EDAC_X38=m -CONFIG_EDAC_I5400=m -CONFIG_EDAC_I7CORE=m -CONFIG_EDAC_I5000=m -CONFIG_EDAC_I5100=m -CONFIG_EDAC_I7300=m -CONFIG_EDAC_SBRIDGE=m -CONFIG_EDAC_SKX=m -CONFIG_EDAC_I10NM=m -CONFIG_EDAC_PND2=m -CONFIG_RTC_LIB=y -CONFIG_RTC_MC146818_LIB=y -CONFIG_RTC_CLASS=y -CONFIG_RTC_HCTOSYS=y -CONFIG_RTC_HCTOSYS_DEVICE="rtc0" -CONFIG_RTC_SYSTOHC=y -CONFIG_RTC_SYSTOHC_DEVICE="rtc0" -# CONFIG_RTC_DEBUG is not set -CONFIG_RTC_NVMEM=y - -# -# RTC interfaces -# -CONFIG_RTC_INTF_SYSFS=y -CONFIG_RTC_INTF_PROC=y -CONFIG_RTC_INTF_DEV=y -CONFIG_RTC_INTF_DEV_UIE_EMUL=y -# CONFIG_RTC_DRV_TEST is not set - -# -# I2C RTC drivers -# -CONFIG_RTC_DRV_88PM860X=m -CONFIG_RTC_DRV_88PM80X=m -CONFIG_RTC_DRV_ABB5ZES3=m -CONFIG_RTC_DRV_ABEOZ9=m -CONFIG_RTC_DRV_ABX80X=m -CONFIG_RTC_DRV_AS3722=m -CONFIG_RTC_DRV_DS1307=m -CONFIG_RTC_DRV_DS1307_CENTURY=y -CONFIG_RTC_DRV_DS1374=m -CONFIG_RTC_DRV_DS1374_WDT=y -CONFIG_RTC_DRV_DS1672=m -CONFIG_RTC_DRV_HYM8563=m -CONFIG_RTC_DRV_LP8788=m -CONFIG_RTC_DRV_MAX6900=m -CONFIG_RTC_DRV_MAX8907=m -CONFIG_RTC_DRV_MAX8925=m -CONFIG_RTC_DRV_MAX8998=m -CONFIG_RTC_DRV_MAX8997=m 
-CONFIG_RTC_DRV_MAX77686=m -CONFIG_RTC_DRV_RK808=m -CONFIG_RTC_DRV_RS5C372=m -CONFIG_RTC_DRV_ISL1208=m -CONFIG_RTC_DRV_ISL12022=m -CONFIG_RTC_DRV_ISL12026=m -CONFIG_RTC_DRV_X1205=m -CONFIG_RTC_DRV_PCF8523=m -CONFIG_RTC_DRV_PCF85063=m -CONFIG_RTC_DRV_PCF85363=m -CONFIG_RTC_DRV_PCF8563=m -CONFIG_RTC_DRV_PCF8583=m -CONFIG_RTC_DRV_M41T80=m -CONFIG_RTC_DRV_M41T80_WDT=y -CONFIG_RTC_DRV_BD70528=m -CONFIG_RTC_DRV_BQ32K=m -CONFIG_RTC_DRV_TWL4030=m -CONFIG_RTC_DRV_PALMAS=m -CONFIG_RTC_DRV_TPS6586X=m -CONFIG_RTC_DRV_TPS65910=m -CONFIG_RTC_DRV_TPS80031=m -CONFIG_RTC_DRV_RC5T583=m -CONFIG_RTC_DRV_RC5T619=m -CONFIG_RTC_DRV_S35390A=m -CONFIG_RTC_DRV_FM3130=m -CONFIG_RTC_DRV_RX8010=m -CONFIG_RTC_DRV_RX8581=m -CONFIG_RTC_DRV_RX8025=m -CONFIG_RTC_DRV_EM3027=m -CONFIG_RTC_DRV_RV3028=m -# CONFIG_RTC_DRV_RV3032 is not set -CONFIG_RTC_DRV_RV8803=m -CONFIG_RTC_DRV_S5M=m -CONFIG_RTC_DRV_SD3078=m - -# -# SPI RTC drivers -# -CONFIG_RTC_DRV_M41T93=m -CONFIG_RTC_DRV_M41T94=m -CONFIG_RTC_DRV_DS1302=m -CONFIG_RTC_DRV_DS1305=m -CONFIG_RTC_DRV_DS1343=m -CONFIG_RTC_DRV_DS1347=m -CONFIG_RTC_DRV_DS1390=m -CONFIG_RTC_DRV_MAX6916=m -CONFIG_RTC_DRV_R9701=m -CONFIG_RTC_DRV_RX4581=m -CONFIG_RTC_DRV_RX6110=m -CONFIG_RTC_DRV_RS5C348=m -CONFIG_RTC_DRV_MAX6902=m -CONFIG_RTC_DRV_PCF2123=m -CONFIG_RTC_DRV_MCP795=m -CONFIG_RTC_I2C_AND_SPI=y - -# -# SPI and I2C RTC drivers -# -CONFIG_RTC_DRV_DS3232=m -CONFIG_RTC_DRV_DS3232_HWMON=y -CONFIG_RTC_DRV_PCF2127=m -CONFIG_RTC_DRV_RV3029C2=m -CONFIG_RTC_DRV_RV3029_HWMON=y - -# -# Platform RTC drivers -# -CONFIG_RTC_DRV_CMOS=y -CONFIG_RTC_DRV_DS1286=m -CONFIG_RTC_DRV_DS1511=m -CONFIG_RTC_DRV_DS1553=m -CONFIG_RTC_DRV_DS1685_FAMILY=m -CONFIG_RTC_DRV_DS1685=y -# CONFIG_RTC_DRV_DS1689 is not set -# CONFIG_RTC_DRV_DS17285 is not set -# CONFIG_RTC_DRV_DS17485 is not set -# CONFIG_RTC_DRV_DS17885 is not set -CONFIG_RTC_DRV_DS1742=m -CONFIG_RTC_DRV_DS2404=m -CONFIG_RTC_DRV_DA9052=m -CONFIG_RTC_DRV_DA9055=m -CONFIG_RTC_DRV_DA9063=m -CONFIG_RTC_DRV_STK17TA8=m 
-CONFIG_RTC_DRV_M48T86=m -CONFIG_RTC_DRV_M48T35=m -CONFIG_RTC_DRV_M48T59=m -CONFIG_RTC_DRV_MSM6242=m -CONFIG_RTC_DRV_BQ4802=m -CONFIG_RTC_DRV_RP5C01=m -CONFIG_RTC_DRV_V3020=m -CONFIG_RTC_DRV_WM831X=m -CONFIG_RTC_DRV_WM8350=m -CONFIG_RTC_DRV_PCF50633=m -CONFIG_RTC_DRV_AB3100=m -CONFIG_RTC_DRV_ZYNQMP=m -CONFIG_RTC_DRV_CROS_EC=m - -# -# on-CPU RTC drivers -# -CONFIG_RTC_DRV_CADENCE=m -CONFIG_RTC_DRV_FTRTC010=m -CONFIG_RTC_DRV_PCAP=m -CONFIG_RTC_DRV_MC13XXX=m -CONFIG_RTC_DRV_MT6397=m -CONFIG_RTC_DRV_R7301=m -CONFIG_RTC_DRV_CPCAP=m - -# -# HID Sensor RTC drivers -# -CONFIG_RTC_DRV_HID_SENSOR_TIME=m -CONFIG_RTC_DRV_WILCO_EC=m -CONFIG_DMADEVICES=y -# CONFIG_DMADEVICES_DEBUG is not set - -# -# DMA Devices -# -CONFIG_DMA_ENGINE=y -CONFIG_DMA_VIRTUAL_CHANNELS=y -CONFIG_DMA_ACPI=y -CONFIG_DMA_OF=y -CONFIG_ALTERA_MSGDMA=m -CONFIG_DW_AXI_DMAC=m -CONFIG_FSL_EDMA=m -CONFIG_INTEL_IDMA64=m -CONFIG_INTEL_IDXD=m -CONFIG_INTEL_IOATDMA=m -CONFIG_INTEL_MIC_X100_DMA=m -CONFIG_PLX_DMA=m -# CONFIG_XILINX_ZYNQMP_DPDMA is not set -CONFIG_QCOM_HIDMA_MGMT=m -CONFIG_QCOM_HIDMA=m -CONFIG_DW_DMAC_CORE=y -CONFIG_DW_DMAC=y -CONFIG_DW_DMAC_PCI=y -CONFIG_DW_EDMA=m -CONFIG_DW_EDMA_PCIE=m -CONFIG_HSU_DMA=y -CONFIG_SF_PDMA=m - -# -# DMA Clients -# -CONFIG_ASYNC_TX_DMA=y -# CONFIG_DMATEST is not set -CONFIG_DMA_ENGINE_RAID=y - -# -# DMABUF options -# -CONFIG_SYNC_FILE=y -# CONFIG_SW_SYNC is not set -CONFIG_UDMABUF=y -# CONFIG_DMABUF_MOVE_NOTIFY is not set -# CONFIG_DMABUF_SELFTESTS is not set -CONFIG_DMABUF_HEAPS=y -CONFIG_DMABUF_HEAPS_SYSTEM=y -# end of DMABUF options - -CONFIG_DCA=m -CONFIG_AUXDISPLAY=y -CONFIG_HD44780=m -CONFIG_KS0108=m -CONFIG_KS0108_PORT=0x378 -CONFIG_KS0108_DELAY=2 -CONFIG_CFAG12864B=m -CONFIG_CFAG12864B_RATE=20 -CONFIG_IMG_ASCII_LCD=m -CONFIG_HT16K33=m -CONFIG_PARPORT_PANEL=m -CONFIG_PANEL_PARPORT=0 -CONFIG_PANEL_PROFILE=5 -# CONFIG_PANEL_CHANGE_MESSAGE is not set -# CONFIG_CHARLCD_BL_OFF is not set -# CONFIG_CHARLCD_BL_ON is not set -CONFIG_CHARLCD_BL_FLASH=y -CONFIG_PANEL=m 
-CONFIG_CHARLCD=m -CONFIG_UIO=m -CONFIG_UIO_CIF=m -CONFIG_UIO_PDRV_GENIRQ=m -CONFIG_UIO_DMEM_GENIRQ=m -CONFIG_UIO_AEC=m -CONFIG_UIO_SERCOS3=m -CONFIG_UIO_PCI_GENERIC=m -CONFIG_UIO_NETX=m -CONFIG_UIO_PRUSS=m -CONFIG_UIO_MF624=m -CONFIG_UIO_HV_GENERIC=m -CONFIG_VFIO_IOMMU_TYPE1=m -CONFIG_VFIO_VIRQFD=m -CONFIG_VFIO=m -# CONFIG_VFIO_NOIOMMU is not set -CONFIG_VFIO_PCI=m -CONFIG_VFIO_PCI_VGA=y -CONFIG_VFIO_PCI_MMAP=y -CONFIG_VFIO_PCI_INTX=y -CONFIG_VFIO_PCI_IGD=y -CONFIG_VFIO_MDEV=m -CONFIG_VFIO_MDEV_DEVICE=m -CONFIG_IRQ_BYPASS_MANAGER=m -CONFIG_VIRT_DRIVERS=y -CONFIG_VBOXGUEST=m -# CONFIG_NITRO_ENCLAVES is not set -CONFIG_VIRTIO=y -CONFIG_VIRTIO_MENU=y -CONFIG_VIRTIO_PCI=m -CONFIG_VIRTIO_PCI_LEGACY=y -CONFIG_VIRTIO_VDPA=m -CONFIG_VIRTIO_PMEM=m -CONFIG_VIRTIO_BALLOON=m -CONFIG_VIRTIO_MEM=m -CONFIG_VIRTIO_INPUT=m -CONFIG_VIRTIO_MMIO=m -CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES=y -CONFIG_VIRTIO_DMA_SHARED_BUFFER=m -CONFIG_VDPA=m -CONFIG_VDPA_SIM=m -CONFIG_IFCVF=m -CONFIG_MLX5_VDPA=y -CONFIG_MLX5_VDPA_NET=m -CONFIG_VHOST_IOTLB=m -CONFIG_VHOST_RING=m -CONFIG_VHOST=m -CONFIG_VHOST_MENU=y -CONFIG_VHOST_NET=m -CONFIG_VHOST_SCSI=m -CONFIG_VHOST_VSOCK=m -CONFIG_VHOST_VDPA=m -# CONFIG_VHOST_CROSS_ENDIAN_LEGACY is not set - -# -# Microsoft Hyper-V guest support -# -CONFIG_HYPERV=m -CONFIG_HYPERV_TIMER=y -CONFIG_HYPERV_UTILS=m -CONFIG_HYPERV_BALLOON=m -# end of Microsoft Hyper-V guest support - -# -# Xen driver support -# -CONFIG_XEN_BALLOON=y -CONFIG_XEN_BALLOON_MEMORY_HOTPLUG=y -CONFIG_XEN_BALLOON_MEMORY_HOTPLUG_LIMIT=512 -CONFIG_XEN_SCRUB_PAGES_DEFAULT=y -CONFIG_XEN_DEV_EVTCHN=m -CONFIG_XEN_BACKEND=y -CONFIG_XENFS=m -CONFIG_XEN_COMPAT_XENFS=y -CONFIG_XEN_SYS_HYPERVISOR=y -CONFIG_XEN_XENBUS_FRONTEND=y -CONFIG_XEN_GNTDEV=m -CONFIG_XEN_GNTDEV_DMABUF=y -CONFIG_XEN_GRANT_DEV_ALLOC=m -CONFIG_XEN_GRANT_DMA_ALLOC=y -CONFIG_SWIOTLB_XEN=y -CONFIG_XEN_PCIDEV_BACKEND=m -CONFIG_XEN_PVCALLS_FRONTEND=m -CONFIG_XEN_PVCALLS_BACKEND=y -CONFIG_XEN_SCSI_BACKEND=m -CONFIG_XEN_PRIVCMD=m 
-CONFIG_XEN_ACPI_PROCESSOR=m -CONFIG_XEN_MCE_LOG=y -CONFIG_XEN_HAVE_PVMMU=y -CONFIG_XEN_EFI=y -CONFIG_XEN_AUTO_XLATE=y -CONFIG_XEN_ACPI=y -CONFIG_XEN_SYMS=y -CONFIG_XEN_HAVE_VPMU=y -CONFIG_XEN_FRONT_PGDIR_SHBUF=m -CONFIG_XEN_UNPOPULATED_ALLOC=y -# end of Xen driver support - -# CONFIG_GREYBUS is not set -CONFIG_STAGING=y -CONFIG_PRISM2_USB=m -CONFIG_COMEDI=m -# CONFIG_COMEDI_DEBUG is not set -CONFIG_COMEDI_DEFAULT_BUF_SIZE_KB=2048 -CONFIG_COMEDI_DEFAULT_BUF_MAXSIZE_KB=20480 -CONFIG_COMEDI_MISC_DRIVERS=y -CONFIG_COMEDI_BOND=m -CONFIG_COMEDI_TEST=m -CONFIG_COMEDI_PARPORT=m -# CONFIG_COMEDI_ISA_DRIVERS is not set -CONFIG_COMEDI_PCI_DRIVERS=m -CONFIG_COMEDI_8255_PCI=m -CONFIG_COMEDI_ADDI_WATCHDOG=m -CONFIG_COMEDI_ADDI_APCI_1032=m -CONFIG_COMEDI_ADDI_APCI_1500=m -CONFIG_COMEDI_ADDI_APCI_1516=m -CONFIG_COMEDI_ADDI_APCI_1564=m -CONFIG_COMEDI_ADDI_APCI_16XX=m -CONFIG_COMEDI_ADDI_APCI_2032=m -CONFIG_COMEDI_ADDI_APCI_2200=m -CONFIG_COMEDI_ADDI_APCI_3120=m -CONFIG_COMEDI_ADDI_APCI_3501=m -CONFIG_COMEDI_ADDI_APCI_3XXX=m -CONFIG_COMEDI_ADL_PCI6208=m -CONFIG_COMEDI_ADL_PCI7X3X=m -CONFIG_COMEDI_ADL_PCI8164=m -CONFIG_COMEDI_ADL_PCI9111=m -CONFIG_COMEDI_ADL_PCI9118=m -CONFIG_COMEDI_ADV_PCI1710=m -CONFIG_COMEDI_ADV_PCI1720=m -CONFIG_COMEDI_ADV_PCI1723=m -CONFIG_COMEDI_ADV_PCI1724=m -CONFIG_COMEDI_ADV_PCI1760=m -CONFIG_COMEDI_ADV_PCI_DIO=m -CONFIG_COMEDI_AMPLC_DIO200_PCI=m -CONFIG_COMEDI_AMPLC_PC236_PCI=m -CONFIG_COMEDI_AMPLC_PC263_PCI=m -CONFIG_COMEDI_AMPLC_PCI224=m -CONFIG_COMEDI_AMPLC_PCI230=m -CONFIG_COMEDI_CONTEC_PCI_DIO=m -CONFIG_COMEDI_DAS08_PCI=m -CONFIG_COMEDI_DT3000=m -CONFIG_COMEDI_DYNA_PCI10XX=m -CONFIG_COMEDI_GSC_HPDI=m -CONFIG_COMEDI_MF6X4=m -CONFIG_COMEDI_ICP_MULTI=m -CONFIG_COMEDI_DAQBOARD2000=m -CONFIG_COMEDI_JR3_PCI=m -CONFIG_COMEDI_KE_COUNTER=m -CONFIG_COMEDI_CB_PCIDAS64=m -CONFIG_COMEDI_CB_PCIDAS=m -CONFIG_COMEDI_CB_PCIDDA=m -CONFIG_COMEDI_CB_PCIMDAS=m -CONFIG_COMEDI_CB_PCIMDDA=m -CONFIG_COMEDI_ME4000=m -CONFIG_COMEDI_ME_DAQ=m -CONFIG_COMEDI_NI_6527=m 
-CONFIG_COMEDI_NI_65XX=m -CONFIG_COMEDI_NI_660X=m -CONFIG_COMEDI_NI_670X=m -CONFIG_COMEDI_NI_LABPC_PCI=m -CONFIG_COMEDI_NI_PCIDIO=m -CONFIG_COMEDI_NI_PCIMIO=m -CONFIG_COMEDI_RTD520=m -CONFIG_COMEDI_S626=m -CONFIG_COMEDI_MITE=m -CONFIG_COMEDI_NI_TIOCMD=m -CONFIG_COMEDI_PCMCIA_DRIVERS=m -CONFIG_COMEDI_CB_DAS16_CS=m -CONFIG_COMEDI_DAS08_CS=m -CONFIG_COMEDI_NI_DAQ_700_CS=m -CONFIG_COMEDI_NI_DAQ_DIO24_CS=m -CONFIG_COMEDI_NI_LABPC_CS=m -CONFIG_COMEDI_NI_MIO_CS=m -CONFIG_COMEDI_QUATECH_DAQP_CS=m -CONFIG_COMEDI_USB_DRIVERS=m -CONFIG_COMEDI_DT9812=m -CONFIG_COMEDI_NI_USB6501=m -CONFIG_COMEDI_USBDUX=m -CONFIG_COMEDI_USBDUXFAST=m -CONFIG_COMEDI_USBDUXSIGMA=m -CONFIG_COMEDI_VMK80XX=m -CONFIG_COMEDI_8254=m -CONFIG_COMEDI_8255=m -CONFIG_COMEDI_8255_SA=m -CONFIG_COMEDI_KCOMEDILIB=m -CONFIG_COMEDI_AMPLC_DIO200=m -CONFIG_COMEDI_AMPLC_PC236=m -CONFIG_COMEDI_DAS08=m -CONFIG_COMEDI_NI_LABPC=m -CONFIG_COMEDI_NI_TIO=m -CONFIG_COMEDI_NI_ROUTING=m -CONFIG_RTL8192U=m -CONFIG_RTLLIB=m -CONFIG_RTLLIB_CRYPTO_CCMP=m -CONFIG_RTLLIB_CRYPTO_TKIP=m -CONFIG_RTLLIB_CRYPTO_WEP=m -CONFIG_RTL8192E=m -CONFIG_RTL8723BS=m -CONFIG_R8712U=m -CONFIG_R8188EU=m -CONFIG_88EU_AP_MODE=y -CONFIG_RTS5208=m -CONFIG_VT6655=m -CONFIG_VT6656=m - -# -# IIO staging drivers -# - -# -# Accelerometers -# -CONFIG_ADIS16203=m -CONFIG_ADIS16240=m -# end of Accelerometers - -# -# Analog to digital converters -# -CONFIG_AD7816=m -CONFIG_AD7280=m -# end of Analog to digital converters - -# -# Analog digital bi-direction converters -# -CONFIG_ADT7316=m -CONFIG_ADT7316_SPI=m -CONFIG_ADT7316_I2C=m -# end of Analog digital bi-direction converters - -# -# Capacitance to digital converters -# -CONFIG_AD7150=m -CONFIG_AD7746=m -# end of Capacitance to digital converters - -# -# Direct Digital Synthesis -# -CONFIG_AD9832=m -CONFIG_AD9834=m -# end of Direct Digital Synthesis - -# -# Network Analyzer, Impedance Converters -# -CONFIG_AD5933=m -# end of Network Analyzer, Impedance Converters - -# -# Active energy metering IC -# 
-CONFIG_ADE7854=m -CONFIG_ADE7854_I2C=m -CONFIG_ADE7854_SPI=m -# end of Active energy metering IC - -# -# Resolver to digital converters -# -CONFIG_AD2S1210=m -# end of Resolver to digital converters -# end of IIO staging drivers - -# CONFIG_FB_SM750 is not set -CONFIG_STAGING_MEDIA=y -CONFIG_INTEL_ATOMISP=y -CONFIG_VIDEO_ATOMISP=m -CONFIG_VIDEO_ATOMISP_ISP2401=y -CONFIG_VIDEO_ATOMISP_OV2722=m -CONFIG_VIDEO_ATOMISP_GC2235=m -CONFIG_VIDEO_ATOMISP_MSRLIST_HELPER=m -CONFIG_VIDEO_ATOMISP_MT9M114=m -CONFIG_VIDEO_ATOMISP_GC0310=m -CONFIG_VIDEO_ATOMISP_OV2680=m -CONFIG_VIDEO_ATOMISP_OV5693=m -CONFIG_VIDEO_ATOMISP_LM3554=m -# CONFIG_VIDEO_ZORAN is not set -CONFIG_VIDEO_IPU3_IMGU=m - -# -# Android -# -# end of Android - -CONFIG_STAGING_BOARD=y -CONFIG_LTE_GDM724X=m -CONFIG_FIREWIRE_SERIAL=m -CONFIG_FWTTY_MAX_TOTAL_PORTS=64 -CONFIG_FWTTY_MAX_CARD_PORTS=32 -CONFIG_GS_FPGABOOT=m -CONFIG_UNISYSSPAR=y -CONFIG_UNISYS_VISORNIC=m -CONFIG_UNISYS_VISORINPUT=m -CONFIG_UNISYS_VISORHBA=m -# CONFIG_FB_TFT is not set -CONFIG_MOST_COMPONENTS=m -CONFIG_MOST_NET=m -CONFIG_MOST_SOUND=m -CONFIG_MOST_VIDEO=m -CONFIG_MOST_DIM2=m -CONFIG_MOST_I2C=m -CONFIG_KS7010=m -CONFIG_PI433=m - -# -# Gasket devices -# -CONFIG_STAGING_GASKET_FRAMEWORK=m -CONFIG_STAGING_APEX_DRIVER=m -# end of Gasket devices - -CONFIG_XIL_AXIS_FIFO=m -CONFIG_FIELDBUS_DEV=m -CONFIG_HMS_ANYBUSS_BUS=m -CONFIG_ARCX_ANYBUS_CONTROLLER=m -CONFIG_HMS_PROFINET=m -CONFIG_KPC2000=y -CONFIG_KPC2000_CORE=m -CONFIG_KPC2000_SPI=m -CONFIG_KPC2000_I2C=m -CONFIG_KPC2000_DMA=m -CONFIG_QLGE=m -CONFIG_WFX=m -# CONFIG_SPMI_HISI3670 is not set -# CONFIG_MFD_HI6421_SPMI is not set -CONFIG_X86_PLATFORM_DEVICES=y -CONFIG_ACPI_WMI=m -CONFIG_WMI_BMOF=m -CONFIG_ALIENWARE_WMI=m -CONFIG_HUAWEI_WMI=m -CONFIG_INTEL_WMI_SBL_FW_UPDATE=m -CONFIG_INTEL_WMI_THUNDERBOLT=m -CONFIG_MXM_WMI=m -CONFIG_PEAQ_WMI=m -CONFIG_XIAOMI_WMI=m -CONFIG_ACERHDF=m -CONFIG_ACER_WIRELESS=m -CONFIG_ACER_WMI=m -CONFIG_APPLE_GMUX=m -CONFIG_ASUS_LAPTOP=m -CONFIG_ASUS_WIRELESS=m 
-CONFIG_ASUS_WMI=m -CONFIG_ASUS_NB_WMI=m -CONFIG_EEEPC_LAPTOP=m -CONFIG_EEEPC_WMI=m -CONFIG_DCDBAS=m -CONFIG_DELL_SMBIOS=m -CONFIG_DELL_SMBIOS_WMI=y -CONFIG_DELL_SMBIOS_SMM=y -CONFIG_DELL_LAPTOP=m -CONFIG_DELL_RBTN=m -# CONFIG_DELL_RBU is not set -CONFIG_DELL_SMO8800=m -CONFIG_DELL_WMI=m -CONFIG_DELL_WMI_DESCRIPTOR=m -CONFIG_DELL_WMI_AIO=m -CONFIG_DELL_WMI_LED=m -CONFIG_AMILO_RFKILL=m -CONFIG_FUJITSU_LAPTOP=m -CONFIG_FUJITSU_TABLET=m -CONFIG_GPD_POCKET_FAN=m -CONFIG_HP_ACCEL=m -CONFIG_HP_WIRELESS=m -CONFIG_HP_WMI=m -CONFIG_IBM_RTL=m -CONFIG_IDEAPAD_LAPTOP=m -CONFIG_SENSORS_HDAPS=m -CONFIG_THINKPAD_ACPI=m -CONFIG_THINKPAD_ACPI_ALSA_SUPPORT=y -# CONFIG_THINKPAD_ACPI_DEBUGFACILITIES is not set -# CONFIG_THINKPAD_ACPI_DEBUG is not set -# CONFIG_THINKPAD_ACPI_UNSAFE_LEDS is not set -CONFIG_THINKPAD_ACPI_VIDEO=y -CONFIG_THINKPAD_ACPI_HOTKEY_POLL=y -CONFIG_INTEL_ATOMISP2_LED=m -CONFIG_INTEL_CHT_INT33FE=m -CONFIG_INTEL_HID_EVENT=m -CONFIG_INTEL_INT0002_VGPIO=m -CONFIG_INTEL_MENLOW=m -CONFIG_INTEL_OAKTRAIL=m -CONFIG_INTEL_VBTN=m -CONFIG_SURFACE3_WMI=m -CONFIG_SURFACE_3_BUTTON=m -CONFIG_SURFACE_3_POWER_OPREGION=m -CONFIG_SURFACE_PRO3_BUTTON=m -CONFIG_MSI_LAPTOP=m -CONFIG_MSI_WMI=m -CONFIG_PCENGINES_APU2=m -CONFIG_SAMSUNG_LAPTOP=m -CONFIG_SAMSUNG_Q10=m -CONFIG_ACPI_TOSHIBA=m -CONFIG_TOSHIBA_BT_RFKILL=m -CONFIG_TOSHIBA_HAPS=m -CONFIG_TOSHIBA_WMI=m -CONFIG_ACPI_CMPC=m -CONFIG_COMPAL_LAPTOP=m -CONFIG_LG_LAPTOP=m -CONFIG_PANASONIC_LAPTOP=m -CONFIG_SONY_LAPTOP=m -CONFIG_SONYPI_COMPAT=y -CONFIG_SYSTEM76_ACPI=m -CONFIG_TOPSTAR_LAPTOP=m -CONFIG_I2C_MULTI_INSTANTIATE=m -CONFIG_MLX_PLATFORM=m -CONFIG_TOUCHSCREEN_DMI=y -CONFIG_INTEL_IPS=m -CONFIG_INTEL_RST=m -CONFIG_INTEL_SMARTCONNECT=m - -# -# Intel Speed Select Technology interface support -# -CONFIG_INTEL_SPEED_SELECT_INTERFACE=m -# end of Intel Speed Select Technology interface support - -CONFIG_INTEL_TURBO_MAX_3=y -CONFIG_INTEL_UNCORE_FREQ_CONTROL=m -CONFIG_INTEL_BXTWC_PMIC_TMU=m -CONFIG_INTEL_CHTDC_TI_PWRBTN=m 
-CONFIG_INTEL_MFLD_THERMAL=m -CONFIG_INTEL_MID_POWER_BUTTON=m -CONFIG_INTEL_MRFLD_PWRBTN=m -CONFIG_INTEL_PMC_CORE=y -CONFIG_INTEL_PUNIT_IPC=m -CONFIG_INTEL_SCU_IPC=y -CONFIG_INTEL_SCU=y -CONFIG_INTEL_SCU_PCI=y -CONFIG_INTEL_SCU_PLATFORM=m -CONFIG_INTEL_SCU_IPC_UTIL=m -CONFIG_INTEL_TELEMETRY=m -CONFIG_PMC_ATOM=y -CONFIG_CHROME_PLATFORMS=y -CONFIG_CHROMEOS_LAPTOP=m -CONFIG_CHROMEOS_PSTORE=m -CONFIG_CHROMEOS_TBMC=m -CONFIG_CROS_EC=m -CONFIG_CROS_EC_I2C=m -CONFIG_CROS_EC_RPMSG=m -CONFIG_CROS_EC_ISHTP=m -CONFIG_CROS_EC_SPI=m -CONFIG_CROS_EC_LPC=m -CONFIG_CROS_EC_PROTO=y -CONFIG_CROS_KBD_LED_BACKLIGHT=m -CONFIG_CROS_EC_CHARDEV=m -CONFIG_CROS_EC_LIGHTBAR=m -CONFIG_CROS_EC_VBC=m -CONFIG_CROS_EC_DEBUGFS=m -CONFIG_CROS_EC_SENSORHUB=m -CONFIG_CROS_EC_SYSFS=m -CONFIG_CROS_EC_TYPEC=m -CONFIG_CROS_USBPD_LOGGER=m -CONFIG_CROS_USBPD_NOTIFY=m -CONFIG_WILCO_EC=m -CONFIG_WILCO_EC_DEBUGFS=m -CONFIG_WILCO_EC_EVENTS=m -CONFIG_WILCO_EC_TELEMETRY=m -CONFIG_MELLANOX_PLATFORM=y -CONFIG_MLXREG_HOTPLUG=m -CONFIG_MLXREG_IO=m -CONFIG_HAVE_CLK=y -CONFIG_CLKDEV_LOOKUP=y -CONFIG_HAVE_CLK_PREPARE=y -CONFIG_COMMON_CLK=y -CONFIG_COMMON_CLK_WM831X=m -CONFIG_COMMON_CLK_MAX77686=m -CONFIG_COMMON_CLK_MAX9485=m -CONFIG_COMMON_CLK_RK808=m -CONFIG_COMMON_CLK_SI5341=m -CONFIG_COMMON_CLK_SI5351=m -CONFIG_COMMON_CLK_SI514=m -CONFIG_COMMON_CLK_SI544=m -CONFIG_COMMON_CLK_SI570=m -CONFIG_COMMON_CLK_CDCE706=m -CONFIG_COMMON_CLK_CDCE925=m -CONFIG_COMMON_CLK_CS2000_CP=m -CONFIG_COMMON_CLK_S2MPS11=m -CONFIG_CLK_TWL6040=m -CONFIG_COMMON_CLK_LOCHNAGAR=m -CONFIG_COMMON_CLK_PALMAS=m -CONFIG_COMMON_CLK_PWM=m -CONFIG_COMMON_CLK_VC5=m -CONFIG_COMMON_CLK_BD718XX=m -CONFIG_COMMON_CLK_FIXED_MMIO=y -CONFIG_CLK_LGM_CGU=y -CONFIG_HWSPINLOCK=y - -# -# Clock Source drivers -# -CONFIG_TIMER_OF=y -CONFIG_TIMER_PROBE=y -CONFIG_CLKEVT_I8253=y -CONFIG_I8253_LOCK=y -CONFIG_CLKBLD_I8253=y -CONFIG_CLKSRC_MMIO=y -CONFIG_MICROCHIP_PIT64B=y -# end of Clock Source drivers - -CONFIG_MAILBOX=y -CONFIG_PLATFORM_MHU=m -CONFIG_PCC=y 
-CONFIG_ALTERA_MBOX=m -CONFIG_MAILBOX_TEST=m -CONFIG_IOMMU_IOVA=y -CONFIG_IOASID=y -CONFIG_IOMMU_API=y -CONFIG_IOMMU_SUPPORT=y - -# -# Generic IOMMU Pagetable Support -# -# end of Generic IOMMU Pagetable Support - -# CONFIG_IOMMU_DEBUGFS is not set -# CONFIG_IOMMU_DEFAULT_PASSTHROUGH is not set -CONFIG_OF_IOMMU=y -CONFIG_IOMMU_DMA=y -CONFIG_AMD_IOMMU=y -CONFIG_AMD_IOMMU_V2=y -CONFIG_DMAR_TABLE=y -CONFIG_INTEL_IOMMU=y -CONFIG_INTEL_IOMMU_SVM=y -# CONFIG_INTEL_IOMMU_DEFAULT_ON is not set -CONFIG_INTEL_IOMMU_FLOPPY_WA=y -# CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON is not set -CONFIG_IRQ_REMAP=y -CONFIG_HYPERV_IOMMU=y - -# -# Remoteproc drivers -# -CONFIG_REMOTEPROC=y -# CONFIG_REMOTEPROC_CDEV is not set -# end of Remoteproc drivers - -# -# Rpmsg drivers -# -CONFIG_RPMSG=m -CONFIG_RPMSG_CHAR=m -CONFIG_RPMSG_QCOM_GLINK=m -CONFIG_RPMSG_QCOM_GLINK_RPM=m -CONFIG_RPMSG_VIRTIO=m -# end of Rpmsg drivers - -CONFIG_SOUNDWIRE=m - -# -# SoundWire Devices -# -CONFIG_SOUNDWIRE_CADENCE=m -CONFIG_SOUNDWIRE_INTEL=m -CONFIG_SOUNDWIRE_QCOM=m -CONFIG_SOUNDWIRE_GENERIC_ALLOCATION=m - -# -# SOC (System On Chip) specific Drivers -# - -# -# Amlogic SoC drivers -# -# end of Amlogic SoC drivers - -# -# Aspeed SoC drivers -# -# end of Aspeed SoC drivers - -# -# Broadcom SoC drivers -# -# end of Broadcom SoC drivers - -# -# NXP/Freescale QorIQ SoC drivers -# -# end of NXP/Freescale QorIQ SoC drivers - -# -# i.MX SoC drivers -# -# end of i.MX SoC drivers - -# -# Qualcomm SoC drivers -# -# end of Qualcomm SoC drivers - -CONFIG_SOC_TI=y - -# -# Xilinx SoC drivers -# -CONFIG_XILINX_VCU=m -# end of Xilinx SoC drivers -# end of SOC (System On Chip) specific Drivers - -CONFIG_PM_DEVFREQ=y - -# -# DEVFREQ Governors -# -CONFIG_DEVFREQ_GOV_SIMPLE_ONDEMAND=m -CONFIG_DEVFREQ_GOV_PERFORMANCE=m -CONFIG_DEVFREQ_GOV_POWERSAVE=m -CONFIG_DEVFREQ_GOV_USERSPACE=m -CONFIG_DEVFREQ_GOV_PASSIVE=m - -# -# DEVFREQ Drivers -# -CONFIG_PM_DEVFREQ_EVENT=y -CONFIG_EXTCON=y - -# -# Extcon Device Drivers -# 
-CONFIG_EXTCON_ADC_JACK=m -CONFIG_EXTCON_ARIZONA=m -CONFIG_EXTCON_AXP288=m -CONFIG_EXTCON_FSA9480=m -CONFIG_EXTCON_GPIO=m -CONFIG_EXTCON_INTEL_INT3496=m -CONFIG_EXTCON_INTEL_CHT_WC=m -CONFIG_EXTCON_INTEL_MRFLD=m -CONFIG_EXTCON_MAX14577=m -CONFIG_EXTCON_MAX3355=m -CONFIG_EXTCON_MAX77693=m -CONFIG_EXTCON_MAX77843=m -CONFIG_EXTCON_MAX8997=m -CONFIG_EXTCON_PALMAS=m -CONFIG_EXTCON_PTN5150=m -CONFIG_EXTCON_RT8973A=m -CONFIG_EXTCON_SM5502=m -CONFIG_EXTCON_USB_GPIO=m -CONFIG_EXTCON_USBC_CROS_EC=m -CONFIG_MEMORY=y -CONFIG_IIO=m -CONFIG_IIO_BUFFER=y -CONFIG_IIO_BUFFER_CB=m -CONFIG_IIO_BUFFER_DMA=m -CONFIG_IIO_BUFFER_DMAENGINE=m -CONFIG_IIO_BUFFER_HW_CONSUMER=m -CONFIG_IIO_KFIFO_BUF=m -CONFIG_IIO_TRIGGERED_BUFFER=m -CONFIG_IIO_CONFIGFS=m -CONFIG_IIO_TRIGGER=y -CONFIG_IIO_CONSUMERS_PER_TRIGGER=2 -CONFIG_IIO_SW_DEVICE=m -CONFIG_IIO_SW_TRIGGER=m -CONFIG_IIO_TRIGGERED_EVENT=m - -# -# Accelerometers -# -CONFIG_ADIS16201=m -CONFIG_ADIS16209=m -CONFIG_ADXL372=m -CONFIG_ADXL372_SPI=m -CONFIG_ADXL372_I2C=m -CONFIG_BMA220=m -CONFIG_BMA400=m -CONFIG_BMA400_I2C=m -CONFIG_BMA400_SPI=m -CONFIG_BMC150_ACCEL=m -CONFIG_BMC150_ACCEL_I2C=m -CONFIG_BMC150_ACCEL_SPI=m -CONFIG_DA280=m -CONFIG_DA311=m -CONFIG_DMARD06=m -CONFIG_DMARD09=m -CONFIG_DMARD10=m -CONFIG_HID_SENSOR_ACCEL_3D=m -CONFIG_IIO_CROS_EC_ACCEL_LEGACY=m -CONFIG_IIO_ST_ACCEL_3AXIS=m -CONFIG_IIO_ST_ACCEL_I2C_3AXIS=m -CONFIG_IIO_ST_ACCEL_SPI_3AXIS=m -CONFIG_KXSD9=m -CONFIG_KXSD9_SPI=m -CONFIG_KXSD9_I2C=m -CONFIG_KXCJK1013=m -CONFIG_MC3230=m -CONFIG_MMA7455=m -CONFIG_MMA7455_I2C=m -CONFIG_MMA7455_SPI=m -CONFIG_MMA7660=m -CONFIG_MMA8452=m -CONFIG_MMA9551_CORE=m -CONFIG_MMA9551=m -CONFIG_MMA9553=m -CONFIG_MXC4005=m -CONFIG_MXC6255=m -CONFIG_SCA3000=m -CONFIG_STK8312=m -CONFIG_STK8BA50=m -# end of Accelerometers - -# -# Analog to digital converters -# -CONFIG_AD_SIGMA_DELTA=m -CONFIG_AD7091R5=m -CONFIG_AD7124=m -CONFIG_AD7192=m -CONFIG_AD7266=m -CONFIG_AD7291=m -CONFIG_AD7292=m -CONFIG_AD7298=m -CONFIG_AD7476=m -CONFIG_AD7606=m 
-CONFIG_AD7606_IFACE_PARALLEL=m -CONFIG_AD7606_IFACE_SPI=m -CONFIG_AD7766=m -CONFIG_AD7768_1=m -CONFIG_AD7780=m -CONFIG_AD7791=m -CONFIG_AD7793=m -CONFIG_AD7887=m -CONFIG_AD7923=m -CONFIG_AD7949=m -CONFIG_AD799X=m -CONFIG_AD9467=m -CONFIG_ADI_AXI_ADC=m -CONFIG_AXP20X_ADC=m -CONFIG_AXP288_ADC=m -CONFIG_CC10001_ADC=m -CONFIG_CPCAP_ADC=m -CONFIG_DA9150_GPADC=m -CONFIG_DLN2_ADC=m -CONFIG_ENVELOPE_DETECTOR=m -CONFIG_HI8435=m -CONFIG_HX711=m -CONFIG_INA2XX_ADC=m -CONFIG_INTEL_MRFLD_ADC=m -CONFIG_LP8788_ADC=m -CONFIG_LTC2471=m -CONFIG_LTC2485=m -CONFIG_LTC2496=m -CONFIG_LTC2497=m -CONFIG_MAX1027=m -CONFIG_MAX11100=m -CONFIG_MAX1118=m -CONFIG_MAX1241=m -CONFIG_MAX1363=m -CONFIG_MAX9611=m -CONFIG_MCP320X=m -CONFIG_MCP3422=m -CONFIG_MCP3911=m -CONFIG_MEN_Z188_ADC=m -CONFIG_MP2629_ADC=m -CONFIG_NAU7802=m -CONFIG_PALMAS_GPADC=m -CONFIG_QCOM_VADC_COMMON=m -CONFIG_QCOM_SPMI_IADC=m -CONFIG_QCOM_SPMI_VADC=m -CONFIG_QCOM_SPMI_ADC5=m -CONFIG_RN5T618_ADC=m -CONFIG_SD_ADC_MODULATOR=m -CONFIG_STMPE_ADC=m -CONFIG_TI_ADC081C=m -CONFIG_TI_ADC0832=m -CONFIG_TI_ADC084S021=m -CONFIG_TI_ADC12138=m -CONFIG_TI_ADC108S102=m -CONFIG_TI_ADC128S052=m -CONFIG_TI_ADC161S626=m -CONFIG_TI_ADS1015=m -CONFIG_TI_ADS7950=m -CONFIG_TI_ADS8344=m -CONFIG_TI_ADS8688=m -CONFIG_TI_ADS124S08=m -CONFIG_TI_AM335X_ADC=m -CONFIG_TI_TLC4541=m -CONFIG_TWL4030_MADC=m -CONFIG_TWL6030_GPADC=m -CONFIG_VF610_ADC=m -CONFIG_VIPERBOARD_ADC=m -CONFIG_XILINX_XADC=m -# end of Analog to digital converters - -# -# Analog Front Ends -# -CONFIG_IIO_RESCALE=m -# end of Analog Front Ends - -# -# Amplifiers -# -CONFIG_AD8366=m -CONFIG_HMC425=m -# end of Amplifiers - -# -# Chemical Sensors -# -CONFIG_ATLAS_PH_SENSOR=m -CONFIG_ATLAS_EZO_SENSOR=m -CONFIG_BME680=m -CONFIG_BME680_I2C=m -CONFIG_BME680_SPI=m -CONFIG_CCS811=m -CONFIG_IAQCORE=m -CONFIG_PMS7003=m -# CONFIG_SCD30_CORE is not set -CONFIG_SENSIRION_SGP30=m -CONFIG_SPS30=m -CONFIG_VZ89X=m -# end of Chemical Sensors - -CONFIG_IIO_CROS_EC_SENSORS_CORE=m -CONFIG_IIO_CROS_EC_SENSORS=m 
-CONFIG_IIO_CROS_EC_SENSORS_LID_ANGLE=m - -# -# Hid Sensor IIO Common -# -CONFIG_HID_SENSOR_IIO_COMMON=m -CONFIG_HID_SENSOR_IIO_TRIGGER=m -# end of Hid Sensor IIO Common - -CONFIG_IIO_MS_SENSORS_I2C=m - -# -# SSP Sensor Common -# -CONFIG_IIO_SSP_SENSORS_COMMONS=m -CONFIG_IIO_SSP_SENSORHUB=m -# end of SSP Sensor Common - -CONFIG_IIO_ST_SENSORS_I2C=m -CONFIG_IIO_ST_SENSORS_SPI=m -CONFIG_IIO_ST_SENSORS_CORE=m - -# -# Digital to analog converters -# -CONFIG_AD5064=m -CONFIG_AD5360=m -CONFIG_AD5380=m -CONFIG_AD5421=m -CONFIG_AD5446=m -CONFIG_AD5449=m -CONFIG_AD5592R_BASE=m -CONFIG_AD5592R=m -CONFIG_AD5593R=m -CONFIG_AD5504=m -CONFIG_AD5624R_SPI=m -CONFIG_AD5686=m -CONFIG_AD5686_SPI=m -CONFIG_AD5696_I2C=m -CONFIG_AD5755=m -CONFIG_AD5758=m -CONFIG_AD5761=m -CONFIG_AD5764=m -CONFIG_AD5770R=m -CONFIG_AD5791=m -CONFIG_AD7303=m -CONFIG_AD8801=m -CONFIG_DPOT_DAC=m -CONFIG_DS4424=m -CONFIG_LTC1660=m -CONFIG_LTC2632=m -CONFIG_M62332=m -CONFIG_MAX517=m -CONFIG_MAX5821=m -CONFIG_MCP4725=m -CONFIG_MCP4922=m -CONFIG_TI_DAC082S085=m -CONFIG_TI_DAC5571=m -CONFIG_TI_DAC7311=m -CONFIG_TI_DAC7612=m -CONFIG_VF610_DAC=m -# end of Digital to analog converters - -# -# IIO dummy driver -# -# CONFIG_IIO_SIMPLE_DUMMY is not set -# end of IIO dummy driver - -# -# Frequency Synthesizers DDS/PLL -# - -# -# Clock Generator/Distribution -# -CONFIG_AD9523=m -# end of Clock Generator/Distribution - -# -# Phase-Locked Loop (PLL) frequency synthesizers -# -CONFIG_ADF4350=m -CONFIG_ADF4371=m -# end of Phase-Locked Loop (PLL) frequency synthesizers -# end of Frequency Synthesizers DDS/PLL - -# -# Digital gyroscope sensors -# -CONFIG_ADIS16080=m -CONFIG_ADIS16130=m -CONFIG_ADIS16136=m -CONFIG_ADIS16260=m -# CONFIG_ADXRS290 is not set -CONFIG_ADXRS450=m -CONFIG_BMG160=m -CONFIG_BMG160_I2C=m -CONFIG_BMG160_SPI=m -CONFIG_FXAS21002C=m -CONFIG_FXAS21002C_I2C=m -CONFIG_FXAS21002C_SPI=m -CONFIG_HID_SENSOR_GYRO_3D=m -CONFIG_MPU3050=m -CONFIG_MPU3050_I2C=m -CONFIG_IIO_ST_GYRO_3AXIS=m -CONFIG_IIO_ST_GYRO_I2C_3AXIS=m 
-CONFIG_IIO_ST_GYRO_SPI_3AXIS=m -CONFIG_ITG3200=m -# end of Digital gyroscope sensors - -# -# Health Sensors -# - -# -# Heart Rate Monitors -# -CONFIG_AFE4403=m -CONFIG_AFE4404=m -CONFIG_MAX30100=m -CONFIG_MAX30102=m -# end of Heart Rate Monitors -# end of Health Sensors - -# -# Humidity sensors -# -CONFIG_AM2315=m -CONFIG_DHT11=m -CONFIG_HDC100X=m -# CONFIG_HDC2010 is not set -CONFIG_HID_SENSOR_HUMIDITY=m -CONFIG_HTS221=m -CONFIG_HTS221_I2C=m -CONFIG_HTS221_SPI=m -CONFIG_HTU21=m -CONFIG_SI7005=m -CONFIG_SI7020=m -# end of Humidity sensors - -# -# Inertial measurement units -# -CONFIG_ADIS16400=m -CONFIG_ADIS16460=m -CONFIG_ADIS16475=m -CONFIG_ADIS16480=m -CONFIG_BMI160=m -CONFIG_BMI160_I2C=m -CONFIG_BMI160_SPI=m -CONFIG_FXOS8700=m -CONFIG_FXOS8700_I2C=m -CONFIG_FXOS8700_SPI=m -CONFIG_KMX61=m -CONFIG_INV_ICM42600=m -CONFIG_INV_ICM42600_I2C=m -CONFIG_INV_ICM42600_SPI=m -CONFIG_INV_MPU6050_IIO=m -CONFIG_INV_MPU6050_I2C=m -CONFIG_INV_MPU6050_SPI=m -CONFIG_IIO_ST_LSM6DSX=m -CONFIG_IIO_ST_LSM6DSX_I2C=m -CONFIG_IIO_ST_LSM6DSX_SPI=m -CONFIG_IIO_ST_LSM6DSX_I3C=m -# end of Inertial measurement units - -CONFIG_IIO_ADIS_LIB=m -CONFIG_IIO_ADIS_LIB_BUFFER=y - -# -# Light sensors -# -CONFIG_ACPI_ALS=m -CONFIG_ADJD_S311=m -CONFIG_ADUX1020=m -CONFIG_AL3010=m -CONFIG_AL3320A=m -CONFIG_APDS9300=m -CONFIG_APDS9960=m -# CONFIG_AS73211 is not set -CONFIG_BH1750=m -CONFIG_BH1780=m -CONFIG_CM32181=m -CONFIG_CM3232=m -CONFIG_CM3323=m -CONFIG_CM3605=m -CONFIG_CM36651=m -CONFIG_IIO_CROS_EC_LIGHT_PROX=m -CONFIG_GP2AP002=m -CONFIG_GP2AP020A00F=m -CONFIG_IQS621_ALS=m -CONFIG_SENSORS_ISL29018=m -CONFIG_SENSORS_ISL29028=m -CONFIG_ISL29125=m -CONFIG_HID_SENSOR_ALS=m -CONFIG_HID_SENSOR_PROX=m -CONFIG_JSA1212=m -CONFIG_RPR0521=m -CONFIG_SENSORS_LM3533=m -CONFIG_LTR501=m -CONFIG_LV0104CS=m -CONFIG_MAX44000=m -CONFIG_MAX44009=m -CONFIG_NOA1305=m -CONFIG_OPT3001=m -CONFIG_PA12203001=m -CONFIG_SI1133=m -CONFIG_SI1145=m -CONFIG_STK3310=m -CONFIG_ST_UVIS25=m -CONFIG_ST_UVIS25_I2C=m 
-CONFIG_ST_UVIS25_SPI=m -CONFIG_TCS3414=m -CONFIG_TCS3472=m -CONFIG_SENSORS_TSL2563=m -CONFIG_TSL2583=m -CONFIG_TSL2772=m -CONFIG_TSL4531=m -CONFIG_US5182D=m -CONFIG_VCNL4000=m -CONFIG_VCNL4035=m -CONFIG_VEML6030=m -CONFIG_VEML6070=m -CONFIG_VL6180=m -CONFIG_ZOPT2201=m -# end of Light sensors - -# -# Magnetometer sensors -# -CONFIG_AK8974=m -CONFIG_AK8975=m -CONFIG_AK09911=m -CONFIG_BMC150_MAGN=m -CONFIG_BMC150_MAGN_I2C=m -CONFIG_BMC150_MAGN_SPI=m -CONFIG_MAG3110=m -CONFIG_HID_SENSOR_MAGNETOMETER_3D=m -CONFIG_MMC35240=m -CONFIG_IIO_ST_MAGN_3AXIS=m -CONFIG_IIO_ST_MAGN_I2C_3AXIS=m -CONFIG_IIO_ST_MAGN_SPI_3AXIS=m -CONFIG_SENSORS_HMC5843=m -CONFIG_SENSORS_HMC5843_I2C=m -CONFIG_SENSORS_HMC5843_SPI=m -CONFIG_SENSORS_RM3100=m -CONFIG_SENSORS_RM3100_I2C=m -CONFIG_SENSORS_RM3100_SPI=m -# end of Magnetometer sensors - -# -# Multiplexers -# -CONFIG_IIO_MUX=m -# end of Multiplexers - -# -# Inclinometer sensors -# -CONFIG_HID_SENSOR_INCLINOMETER_3D=m -CONFIG_HID_SENSOR_DEVICE_ROTATION=m -# end of Inclinometer sensors - -# -# Triggers - standalone -# -CONFIG_IIO_HRTIMER_TRIGGER=m -CONFIG_IIO_INTERRUPT_TRIGGER=m -CONFIG_IIO_TIGHTLOOP_TRIGGER=m -CONFIG_IIO_SYSFS_TRIGGER=m -# end of Triggers - standalone - -# -# Linear and angular position sensors -# -CONFIG_IQS624_POS=m -# end of Linear and angular position sensors - -# -# Digital potentiometers -# -CONFIG_AD5272=m -CONFIG_DS1803=m -CONFIG_MAX5432=m -CONFIG_MAX5481=m -CONFIG_MAX5487=m -CONFIG_MCP4018=m -CONFIG_MCP4131=m -CONFIG_MCP4531=m -CONFIG_MCP41010=m -CONFIG_TPL0102=m -# end of Digital potentiometers - -# -# Digital potentiostats -# -CONFIG_LMP91000=m -# end of Digital potentiostats - -# -# Pressure sensors -# -CONFIG_ABP060MG=m -CONFIG_BMP280=m -CONFIG_BMP280_I2C=m -CONFIG_BMP280_SPI=m -CONFIG_IIO_CROS_EC_BARO=m -CONFIG_DLHL60D=m -CONFIG_DPS310=m -CONFIG_HID_SENSOR_PRESS=m -CONFIG_HP03=m -CONFIG_ICP10100=m -CONFIG_MPL115=m -CONFIG_MPL115_I2C=m -CONFIG_MPL115_SPI=m -CONFIG_MPL3115=m -CONFIG_MS5611=m -CONFIG_MS5611_I2C=m 
-CONFIG_MS5611_SPI=m -CONFIG_MS5637=m -CONFIG_IIO_ST_PRESS=m -CONFIG_IIO_ST_PRESS_I2C=m -CONFIG_IIO_ST_PRESS_SPI=m -CONFIG_T5403=m -CONFIG_HP206C=m -CONFIG_ZPA2326=m -CONFIG_ZPA2326_I2C=m -CONFIG_ZPA2326_SPI=m -# end of Pressure sensors - -# -# Lightning sensors -# -CONFIG_AS3935=m -# end of Lightning sensors - -# -# Proximity and distance sensors -# -CONFIG_ISL29501=m -CONFIG_LIDAR_LITE_V2=m -CONFIG_MB1232=m -CONFIG_PING=m -CONFIG_RFD77402=m -CONFIG_SRF04=m -CONFIG_SX9310=m -CONFIG_SX9500=m -CONFIG_SRF08=m -CONFIG_VCNL3020=m -CONFIG_VL53L0X_I2C=m -# end of Proximity and distance sensors - -# -# Resolver to digital converters -# -CONFIG_AD2S90=m -CONFIG_AD2S1200=m -# end of Resolver to digital converters - -# -# Temperature sensors -# -CONFIG_IQS620AT_TEMP=m -CONFIG_LTC2983=m -CONFIG_MAXIM_THERMOCOUPLE=m -CONFIG_HID_SENSOR_TEMP=m -CONFIG_MLX90614=m -CONFIG_MLX90632=m -CONFIG_TMP006=m -CONFIG_TMP007=m -CONFIG_TSYS01=m -CONFIG_TSYS02D=m -CONFIG_MAX31856=m -# end of Temperature sensors - -CONFIG_NTB=m -CONFIG_NTB_MSI=y -CONFIG_NTB_AMD=m -CONFIG_NTB_IDT=m -CONFIG_NTB_INTEL=m -CONFIG_NTB_SWITCHTEC=m -# CONFIG_NTB_PINGPONG is not set -# CONFIG_NTB_TOOL is not set -# CONFIG_NTB_PERF is not set -# CONFIG_NTB_MSI_TEST is not set -CONFIG_NTB_TRANSPORT=m -CONFIG_VME_BUS=y - -# -# VME Bridge Drivers -# -CONFIG_VME_CA91CX42=m -CONFIG_VME_TSI148=m -# CONFIG_VME_FAKE is not set - -# -# VME Board Drivers -# -CONFIG_VMIVME_7805=m - -# -# VME Device Drivers -# -CONFIG_VME_USER=m -CONFIG_PWM=y -CONFIG_PWM_SYSFS=y -# CONFIG_PWM_DEBUG is not set -CONFIG_PWM_ATMEL_HLCDC_PWM=m -CONFIG_PWM_CRC=y -CONFIG_PWM_CROS_EC=m -CONFIG_PWM_FSL_FTM=m -CONFIG_PWM_IQS620A=m -CONFIG_PWM_LP3943=m -CONFIG_PWM_LPSS=m -CONFIG_PWM_LPSS_PCI=m -CONFIG_PWM_LPSS_PLATFORM=m -CONFIG_PWM_PCA9685=m -CONFIG_PWM_STMPE=y -CONFIG_PWM_TWL=m -CONFIG_PWM_TWL_LED=m - -# -# IRQ chip support -# -CONFIG_IRQCHIP=y -CONFIG_AL_FIC=y -CONFIG_MADERA_IRQ=m -# CONFIG_MST_IRQ is not set -# end of IRQ chip support - -CONFIG_IPACK_BUS=m 
-CONFIG_BOARD_TPCI200=m -CONFIG_SERIAL_IPOCTAL=m -CONFIG_RESET_CONTROLLER=y -CONFIG_RESET_BRCMSTB_RESCAL=y -CONFIG_RESET_INTEL_GW=y -CONFIG_RESET_TI_SYSCON=m - -# -# PHY Subsystem -# -CONFIG_GENERIC_PHY=y -CONFIG_GENERIC_PHY_MIPI_DPHY=y -# CONFIG_USB_LGM_PHY is not set -CONFIG_BCM_KONA_USB2_PHY=m -CONFIG_PHY_CADENCE_TORRENT=m -CONFIG_PHY_CADENCE_DPHY=m -CONFIG_PHY_CADENCE_SIERRA=m -CONFIG_PHY_CADENCE_SALVO=m -CONFIG_PHY_FSL_IMX8MQ_USB=m -CONFIG_PHY_MIXEL_MIPI_DPHY=m -CONFIG_PHY_PXA_28NM_HSIC=m -CONFIG_PHY_PXA_28NM_USB2=m -CONFIG_PHY_CPCAP_USB=m -CONFIG_PHY_MAPPHONE_MDM6600=m -CONFIG_PHY_OCELOT_SERDES=m -CONFIG_PHY_QCOM_USB_HS=m -CONFIG_PHY_QCOM_USB_HSIC=m -CONFIG_PHY_SAMSUNG_USB2=m -CONFIG_PHY_TUSB1210=m -# CONFIG_PHY_INTEL_LGM_COMBO is not set -# CONFIG_PHY_INTEL_LGM_EMMC is not set -# end of PHY Subsystem - -CONFIG_POWERCAP=y -CONFIG_INTEL_RAPL_CORE=m -CONFIG_INTEL_RAPL=m -CONFIG_IDLE_INJECT=y -CONFIG_MCB=m -CONFIG_MCB_PCI=m -CONFIG_MCB_LPC=m - -# -# Performance monitor support -# -# end of Performance monitor support - -CONFIG_RAS=y -CONFIG_RAS_CEC=y -# CONFIG_RAS_CEC_DEBUG is not set -CONFIG_USB4=m -# CONFIG_USB4_DEBUGFS_WRITE is not set - -# -# Android -# -# CONFIG_ANDROID is not set -# end of Android - -CONFIG_LIBNVDIMM=y -CONFIG_BLK_DEV_PMEM=m -CONFIG_ND_BLK=m -CONFIG_ND_CLAIM=y -CONFIG_ND_BTT=m -CONFIG_BTT=y -CONFIG_ND_PFN=m -CONFIG_NVDIMM_PFN=y -CONFIG_NVDIMM_DAX=y -CONFIG_OF_PMEM=m -CONFIG_DAX_DRIVER=y -CONFIG_DAX=y -CONFIG_DEV_DAX=m -CONFIG_DEV_DAX_PMEM=m -CONFIG_DEV_DAX_HMEM=m -CONFIG_DEV_DAX_HMEM_DEVICES=y -CONFIG_DEV_DAX_KMEM=m -CONFIG_DEV_DAX_PMEM_COMPAT=m -CONFIG_NVMEM=y -CONFIG_NVMEM_SYSFS=y -CONFIG_NVMEM_SPMI_SDAM=m -CONFIG_RAVE_SP_EEPROM=m - -# -# HW tracing support -# -CONFIG_STM=m -CONFIG_STM_PROTO_BASIC=m -CONFIG_STM_PROTO_SYS_T=m -# CONFIG_STM_DUMMY is not set -CONFIG_STM_SOURCE_CONSOLE=m -CONFIG_STM_SOURCE_HEARTBEAT=m -CONFIG_STM_SOURCE_FTRACE=m -CONFIG_INTEL_TH=m -CONFIG_INTEL_TH_PCI=m -CONFIG_INTEL_TH_ACPI=m -CONFIG_INTEL_TH_GTH=m 
-CONFIG_INTEL_TH_STH=m -CONFIG_INTEL_TH_MSU=m -CONFIG_INTEL_TH_PTI=m -# CONFIG_INTEL_TH_DEBUG is not set -# end of HW tracing support - -CONFIG_FPGA=m -CONFIG_ALTERA_PR_IP_CORE=m -CONFIG_ALTERA_PR_IP_CORE_PLAT=m -CONFIG_FPGA_MGR_ALTERA_PS_SPI=m -CONFIG_FPGA_MGR_ALTERA_CVP=m -CONFIG_FPGA_MGR_XILINX_SPI=m -CONFIG_FPGA_MGR_ICE40_SPI=m -CONFIG_FPGA_MGR_MACHXO2_SPI=m -CONFIG_FPGA_BRIDGE=m -CONFIG_ALTERA_FREEZE_BRIDGE=m -CONFIG_XILINX_PR_DECOUPLER=m -CONFIG_FPGA_REGION=m -CONFIG_OF_FPGA_REGION=m -CONFIG_FPGA_DFL=m -CONFIG_FPGA_DFL_FME=m -CONFIG_FPGA_DFL_FME_MGR=m -CONFIG_FPGA_DFL_FME_BRIDGE=m -CONFIG_FPGA_DFL_FME_REGION=m -CONFIG_FPGA_DFL_AFU=m -CONFIG_FPGA_DFL_PCI=m -CONFIG_FSI=m -CONFIG_FSI_NEW_DEV_NODE=y -CONFIG_FSI_MASTER_GPIO=m -CONFIG_FSI_MASTER_HUB=m -CONFIG_FSI_MASTER_ASPEED=m -CONFIG_FSI_SCOM=m -CONFIG_FSI_SBEFIFO=m -CONFIG_FSI_OCC=m -CONFIG_TEE=m - -# -# TEE drivers -# -CONFIG_AMDTEE=m -# end of TEE drivers - -CONFIG_MULTIPLEXER=m - -# -# Multiplexer drivers -# -CONFIG_MUX_ADG792A=m -CONFIG_MUX_ADGS1408=m -CONFIG_MUX_GPIO=m -CONFIG_MUX_MMIO=m -# end of Multiplexer drivers - -CONFIG_PM_OPP=y -CONFIG_UNISYS_VISORBUS=m -CONFIG_SIOX=m -CONFIG_SIOX_BUS_GPIO=m -CONFIG_SLIMBUS=m -CONFIG_SLIM_QCOM_CTRL=m -CONFIG_INTERCONNECT=y -CONFIG_COUNTER=m -CONFIG_FTM_QUADDEC=m -CONFIG_MICROCHIP_TCB_CAPTURE=m -CONFIG_MOST=m -# CONFIG_MOST_USB_HDM is not set -CONFIG_MOST_CDEV=m -# end of Device Drivers - -# -# File systems -# -CONFIG_DCACHE_WORD_ACCESS=y -CONFIG_VALIDATE_FS_PARSER=y -CONFIG_FS_IOMAP=y -# CONFIG_EXT2_FS is not set -# CONFIG_EXT3_FS is not set -CONFIG_EXT4_FS=m -CONFIG_EXT4_USE_FOR_EXT2=y -CONFIG_EXT4_FS_POSIX_ACL=y -CONFIG_EXT4_FS_SECURITY=y -# CONFIG_EXT4_DEBUG is not set -CONFIG_JBD2=m -# CONFIG_JBD2_DEBUG is not set -CONFIG_FS_MBCACHE=m -CONFIG_REISERFS_FS=m -# CONFIG_REISERFS_CHECK is not set -CONFIG_REISERFS_PROC_INFO=y -CONFIG_REISERFS_FS_XATTR=y -CONFIG_REISERFS_FS_POSIX_ACL=y -CONFIG_REISERFS_FS_SECURITY=y -CONFIG_JFS_FS=m -CONFIG_JFS_POSIX_ACL=y 
-CONFIG_JFS_SECURITY=y -# CONFIG_JFS_DEBUG is not set -CONFIG_JFS_STATISTICS=y -CONFIG_XFS_FS=m -CONFIG_XFS_SUPPORT_V4=y -CONFIG_XFS_QUOTA=y -CONFIG_XFS_POSIX_ACL=y -CONFIG_XFS_RT=y -CONFIG_XFS_ONLINE_SCRUB=y -CONFIG_XFS_ONLINE_REPAIR=y -# CONFIG_XFS_WARN is not set -# CONFIG_XFS_DEBUG is not set -CONFIG_GFS2_FS=m -CONFIG_GFS2_FS_LOCKING_DLM=y -CONFIG_OCFS2_FS=m -CONFIG_OCFS2_FS_O2CB=m -CONFIG_OCFS2_FS_USERSPACE_CLUSTER=m -CONFIG_OCFS2_FS_STATS=y -CONFIG_OCFS2_DEBUG_MASKLOG=y -# CONFIG_OCFS2_DEBUG_FS is not set -CONFIG_BTRFS_FS=m -CONFIG_BTRFS_FS_POSIX_ACL=y -# CONFIG_BTRFS_FS_CHECK_INTEGRITY is not set -# CONFIG_BTRFS_FS_RUN_SANITY_TESTS is not set -# CONFIG_BTRFS_DEBUG is not set -# CONFIG_BTRFS_ASSERT is not set -# CONFIG_BTRFS_FS_REF_VERIFY is not set -CONFIG_NILFS2_FS=m -CONFIG_F2FS_FS=m -CONFIG_F2FS_STAT_FS=y -CONFIG_F2FS_FS_XATTR=y -CONFIG_F2FS_FS_POSIX_ACL=y -CONFIG_F2FS_FS_SECURITY=y -CONFIG_F2FS_CHECK_FS=y -# CONFIG_F2FS_IO_TRACE is not set -# CONFIG_F2FS_FAULT_INJECTION is not set -CONFIG_F2FS_FS_COMPRESSION=y -CONFIG_F2FS_FS_LZO=y -CONFIG_F2FS_FS_LZ4=y -CONFIG_F2FS_FS_ZSTD=y -CONFIG_F2FS_FS_LZORLE=y -CONFIG_ZONEFS_FS=m -CONFIG_FS_DAX=y -CONFIG_FS_DAX_PMD=y -CONFIG_FS_POSIX_ACL=y -CONFIG_EXPORTFS=y -CONFIG_EXPORTFS_BLOCK_OPS=y -CONFIG_FILE_LOCKING=y -# CONFIG_MANDATORY_FILE_LOCKING is not set -CONFIG_FS_ENCRYPTION=y -CONFIG_FS_ENCRYPTION_ALGS=m -# CONFIG_FS_ENCRYPTION_INLINE_CRYPT is not set -CONFIG_FS_VERITY=y -# CONFIG_FS_VERITY_DEBUG is not set -CONFIG_FS_VERITY_BUILTIN_SIGNATURES=y -CONFIG_FSNOTIFY=y -CONFIG_DNOTIFY=y -CONFIG_INOTIFY_USER=y -CONFIG_FANOTIFY=y -CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y -CONFIG_QUOTA=y -CONFIG_QUOTA_NETLINK_INTERFACE=y -# CONFIG_PRINT_QUOTA_WARNING is not set -# CONFIG_QUOTA_DEBUG is not set -CONFIG_QUOTA_TREE=m -CONFIG_QFMT_V1=m -CONFIG_QFMT_V2=m -CONFIG_QUOTACTL=y -CONFIG_AUTOFS4_FS=y -CONFIG_AUTOFS_FS=y -CONFIG_FUSE_FS=m -CONFIG_CUSE=m -CONFIG_VIRTIO_FS=m -CONFIG_FUSE_DAX=y -CONFIG_OVERLAY_FS=m 
-CONFIG_OVERLAY_FS_REDIRECT_DIR=y -# CONFIG_OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW is not set -CONFIG_OVERLAY_FS_INDEX=y -CONFIG_OVERLAY_FS_XINO_AUTO=y -CONFIG_OVERLAY_FS_METACOPY=y - -# -# Caches -# -CONFIG_FSCACHE=m -CONFIG_FSCACHE_STATS=y -CONFIG_FSCACHE_HISTOGRAM=y -# CONFIG_FSCACHE_DEBUG is not set -# CONFIG_FSCACHE_OBJECT_LIST is not set -CONFIG_CACHEFILES=m -# CONFIG_CACHEFILES_DEBUG is not set -# CONFIG_CACHEFILES_HISTOGRAM is not set -# end of Caches - -# -# CD-ROM/DVD Filesystems -# -CONFIG_ISO9660_FS=m -CONFIG_JOLIET=y -CONFIG_ZISOFS=y -CONFIG_UDF_FS=m -# end of CD-ROM/DVD Filesystems - -# -# DOS/FAT/EXFAT/NT Filesystems -# -CONFIG_FAT_FS=m -CONFIG_MSDOS_FS=m -CONFIG_VFAT_FS=m -CONFIG_FAT_DEFAULT_CODEPAGE=437 -CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1" -CONFIG_FAT_DEFAULT_UTF8=y -CONFIG_EXFAT_FS=m -CONFIG_EXFAT_DEFAULT_IOCHARSET="utf8" -CONFIG_NTFS_FS=m -# CONFIG_NTFS_DEBUG is not set -CONFIG_NTFS_RW=y -# end of DOS/FAT/EXFAT/NT Filesystems - -# -# Pseudo filesystems -# -CONFIG_PROC_FS=y -CONFIG_PROC_KCORE=y -CONFIG_PROC_VMCORE=y -CONFIG_PROC_VMCORE_DEVICE_DUMP=y -CONFIG_PROC_SYSCTL=y -CONFIG_PROC_PAGE_MONITOR=y -CONFIG_PROC_CHILDREN=y -CONFIG_PROC_PID_ARCH_STATUS=y -CONFIG_PROC_CPU_RESCTRL=y -CONFIG_KERNFS=y -CONFIG_SYSFS=y -CONFIG_TMPFS=y -CONFIG_TMPFS_POSIX_ACL=y -CONFIG_TMPFS_XATTR=y -# CONFIG_TMPFS_INODE64 is not set -CONFIG_HUGETLBFS=y -CONFIG_HUGETLB_PAGE=y -CONFIG_MEMFD_CREATE=y -CONFIG_ARCH_HAS_GIGANTIC_PAGE=y -CONFIG_CONFIGFS_FS=y -CONFIG_EFIVAR_FS=y -# end of Pseudo filesystems - -CONFIG_MISC_FILESYSTEMS=y -CONFIG_ORANGEFS_FS=m -# CONFIG_ADFS_FS is not set -CONFIG_AFFS_FS=m -CONFIG_ECRYPT_FS=m -# CONFIG_ECRYPT_FS_MESSAGING is not set -CONFIG_HFS_FS=m -CONFIG_HFSPLUS_FS=m -CONFIG_BEFS_FS=m -# CONFIG_BEFS_DEBUG is not set -# CONFIG_BFS_FS is not set -# CONFIG_EFS_FS is not set -CONFIG_JFFS2_FS=m -CONFIG_JFFS2_FS_DEBUG=0 -CONFIG_JFFS2_FS_WRITEBUFFER=y -# CONFIG_JFFS2_FS_WBUF_VERIFY is not set -CONFIG_JFFS2_SUMMARY=y -CONFIG_JFFS2_FS_XATTR=y 
-CONFIG_JFFS2_FS_POSIX_ACL=y -CONFIG_JFFS2_FS_SECURITY=y -# CONFIG_JFFS2_COMPRESSION_OPTIONS is not set -CONFIG_JFFS2_ZLIB=y -CONFIG_JFFS2_RTIME=y -CONFIG_UBIFS_FS=m -# CONFIG_UBIFS_FS_ADVANCED_COMPR is not set -CONFIG_UBIFS_FS_LZO=y -CONFIG_UBIFS_FS_ZLIB=y -CONFIG_UBIFS_FS_ZSTD=y -CONFIG_UBIFS_ATIME_SUPPORT=y -CONFIG_UBIFS_FS_XATTR=y -CONFIG_UBIFS_FS_SECURITY=y -CONFIG_UBIFS_FS_AUTHENTICATION=y -CONFIG_CRAMFS=m -CONFIG_CRAMFS_BLOCKDEV=y -CONFIG_CRAMFS_MTD=y -CONFIG_SQUASHFS=m -# CONFIG_SQUASHFS_FILE_CACHE is not set -CONFIG_SQUASHFS_FILE_DIRECT=y -# CONFIG_SQUASHFS_DECOMP_SINGLE is not set -CONFIG_SQUASHFS_DECOMP_MULTI=y -# CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU is not set -CONFIG_SQUASHFS_XATTR=y -CONFIG_SQUASHFS_ZLIB=y -CONFIG_SQUASHFS_LZ4=y -CONFIG_SQUASHFS_LZO=y -CONFIG_SQUASHFS_XZ=y -CONFIG_SQUASHFS_ZSTD=y -# CONFIG_SQUASHFS_4K_DEVBLK_SIZE is not set -# CONFIG_SQUASHFS_EMBEDDED is not set -CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE=3 -# CONFIG_VXFS_FS is not set -CONFIG_MINIX_FS=m -CONFIG_OMFS_FS=m -# CONFIG_HPFS_FS is not set -# CONFIG_QNX4FS_FS is not set -# CONFIG_QNX6FS_FS is not set -CONFIG_ROMFS_FS=m -CONFIG_ROMFS_BACKED_BY_BLOCK=y -# CONFIG_ROMFS_BACKED_BY_MTD is not set -# CONFIG_ROMFS_BACKED_BY_BOTH is not set -CONFIG_ROMFS_ON_BLOCK=y -CONFIG_PSTORE=y -CONFIG_PSTORE_DEFLATE_COMPRESS=m -CONFIG_PSTORE_LZO_COMPRESS=m -CONFIG_PSTORE_LZ4_COMPRESS=m -CONFIG_PSTORE_LZ4HC_COMPRESS=m -# CONFIG_PSTORE_842_COMPRESS is not set -CONFIG_PSTORE_ZSTD_COMPRESS=y -CONFIG_PSTORE_COMPRESS=y -# CONFIG_PSTORE_DEFLATE_COMPRESS_DEFAULT is not set -# CONFIG_PSTORE_LZO_COMPRESS_DEFAULT is not set -# CONFIG_PSTORE_LZ4_COMPRESS_DEFAULT is not set -# CONFIG_PSTORE_LZ4HC_COMPRESS_DEFAULT is not set -CONFIG_PSTORE_ZSTD_COMPRESS_DEFAULT=y -CONFIG_PSTORE_COMPRESS_DEFAULT="zstd" -# CONFIG_PSTORE_CONSOLE is not set -# CONFIG_PSTORE_PMSG is not set -# CONFIG_PSTORE_FTRACE is not set -CONFIG_PSTORE_RAM=y -CONFIG_PSTORE_ZONE=m -CONFIG_PSTORE_BLK=m -CONFIG_PSTORE_BLK_BLKDEV="" 
-CONFIG_PSTORE_BLK_KMSG_SIZE=64 -CONFIG_PSTORE_BLK_MAX_REASON=2 -# CONFIG_SYSV_FS is not set -CONFIG_UFS_FS=m -# CONFIG_UFS_FS_WRITE is not set -# CONFIG_UFS_DEBUG is not set -CONFIG_EROFS_FS=m -# CONFIG_EROFS_FS_DEBUG is not set -CONFIG_EROFS_FS_XATTR=y -CONFIG_EROFS_FS_POSIX_ACL=y -CONFIG_EROFS_FS_SECURITY=y -CONFIG_EROFS_FS_ZIP=y -CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT=2 -CONFIG_VBOXSF_FS=m -CONFIG_NETWORK_FILESYSTEMS=y -CONFIG_NFS_FS=m -CONFIG_NFS_V2=m -CONFIG_NFS_V3=m -CONFIG_NFS_V3_ACL=y -CONFIG_NFS_V4=m -CONFIG_NFS_SWAP=y -CONFIG_NFS_V4_1=y -CONFIG_NFS_V4_2=y -CONFIG_PNFS_FILE_LAYOUT=m -CONFIG_PNFS_BLOCK=m -CONFIG_PNFS_FLEXFILE_LAYOUT=m -CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN="kernel.org" -CONFIG_NFS_V4_1_MIGRATION=y -CONFIG_NFS_V4_SECURITY_LABEL=y -CONFIG_NFS_FSCACHE=y -# CONFIG_NFS_USE_LEGACY_DNS is not set -CONFIG_NFS_USE_KERNEL_DNS=y -CONFIG_NFS_DEBUG=y -# CONFIG_NFS_DISABLE_UDP_SUPPORT is not set -CONFIG_NFSD=m -CONFIG_NFSD_V2_ACL=y -CONFIG_NFSD_V3=y -CONFIG_NFSD_V3_ACL=y -CONFIG_NFSD_V4=y -CONFIG_NFSD_PNFS=y -CONFIG_NFSD_BLOCKLAYOUT=y -CONFIG_NFSD_SCSILAYOUT=y -# CONFIG_NFSD_FLEXFILELAYOUT is not set -# CONFIG_NFSD_V4_2_INTER_SSC is not set -CONFIG_NFSD_V4_SECURITY_LABEL=y -CONFIG_GRACE_PERIOD=m -CONFIG_LOCKD=m -CONFIG_LOCKD_V4=y -CONFIG_NFS_ACL_SUPPORT=m -CONFIG_NFS_COMMON=y -CONFIG_SUNRPC=m -CONFIG_SUNRPC_GSS=m -CONFIG_SUNRPC_BACKCHANNEL=y -CONFIG_SUNRPC_SWAP=y -CONFIG_RPCSEC_GSS_KRB5=m -CONFIG_SUNRPC_DISABLE_INSECURE_ENCTYPES=y -CONFIG_SUNRPC_DEBUG=y -CONFIG_SUNRPC_XPRT_RDMA=m -CONFIG_CEPH_FS=m -CONFIG_CEPH_FSCACHE=y -CONFIG_CEPH_FS_POSIX_ACL=y -CONFIG_CEPH_FS_SECURITY_LABEL=y -CONFIG_CIFS=m -# CONFIG_CIFS_STATS2 is not set -CONFIG_CIFS_ALLOW_INSECURE_LEGACY=y -# CONFIG_CIFS_WEAK_PW_HASH is not set -CONFIG_CIFS_UPCALL=y -CONFIG_CIFS_XATTR=y -CONFIG_CIFS_POSIX=y -CONFIG_CIFS_DEBUG=y -# CONFIG_CIFS_DEBUG2 is not set -# CONFIG_CIFS_DEBUG_DUMP_KEYS is not set -CONFIG_CIFS_DFS_UPCALL=y -# CONFIG_CIFS_SMB_DIRECT is not set -CONFIG_CIFS_FSCACHE=y 
-CONFIG_CODA_FS=m -CONFIG_AFS_FS=m -# CONFIG_AFS_DEBUG is not set -CONFIG_AFS_FSCACHE=y -# CONFIG_AFS_DEBUG_CURSOR is not set -CONFIG_9P_FS=m -CONFIG_9P_FSCACHE=y -CONFIG_9P_FS_POSIX_ACL=y -CONFIG_9P_FS_SECURITY=y -CONFIG_NLS=y -CONFIG_NLS_DEFAULT="utf8" -CONFIG_NLS_CODEPAGE_437=m -CONFIG_NLS_CODEPAGE_737=m -CONFIG_NLS_CODEPAGE_775=m -CONFIG_NLS_CODEPAGE_850=m -CONFIG_NLS_CODEPAGE_852=m -CONFIG_NLS_CODEPAGE_855=m -CONFIG_NLS_CODEPAGE_857=m -CONFIG_NLS_CODEPAGE_860=m -CONFIG_NLS_CODEPAGE_861=m -CONFIG_NLS_CODEPAGE_862=m -CONFIG_NLS_CODEPAGE_863=m -CONFIG_NLS_CODEPAGE_864=m -CONFIG_NLS_CODEPAGE_865=m -CONFIG_NLS_CODEPAGE_866=m -CONFIG_NLS_CODEPAGE_869=m -CONFIG_NLS_CODEPAGE_936=m -CONFIG_NLS_CODEPAGE_950=m -CONFIG_NLS_CODEPAGE_932=m -CONFIG_NLS_CODEPAGE_949=m -CONFIG_NLS_CODEPAGE_874=m -CONFIG_NLS_ISO8859_8=m -CONFIG_NLS_CODEPAGE_1250=m -CONFIG_NLS_CODEPAGE_1251=m -CONFIG_NLS_ASCII=m -CONFIG_NLS_ISO8859_1=m -CONFIG_NLS_ISO8859_2=m -CONFIG_NLS_ISO8859_3=m -CONFIG_NLS_ISO8859_4=m -CONFIG_NLS_ISO8859_5=m -CONFIG_NLS_ISO8859_6=m -CONFIG_NLS_ISO8859_7=m -CONFIG_NLS_ISO8859_9=m -CONFIG_NLS_ISO8859_13=m -CONFIG_NLS_ISO8859_14=m -CONFIG_NLS_ISO8859_15=m -CONFIG_NLS_KOI8_R=m -CONFIG_NLS_KOI8_U=m -CONFIG_NLS_MAC_ROMAN=m -CONFIG_NLS_MAC_CELTIC=m -CONFIG_NLS_MAC_CENTEURO=m -CONFIG_NLS_MAC_CROATIAN=m -CONFIG_NLS_MAC_CYRILLIC=m -CONFIG_NLS_MAC_GAELIC=m -CONFIG_NLS_MAC_GREEK=m -CONFIG_NLS_MAC_ICELAND=m -CONFIG_NLS_MAC_INUIT=m -CONFIG_NLS_MAC_ROMANIAN=m -CONFIG_NLS_MAC_TURKISH=m -CONFIG_NLS_UTF8=m -CONFIG_DLM=m -# CONFIG_DLM_DEBUG is not set -CONFIG_UNICODE=y -# CONFIG_UNICODE_NORMALIZATION_SELFTEST is not set -CONFIG_IO_WQ=y -# end of File systems - -# -# Security options -# -CONFIG_KEYS=y -CONFIG_KEYS_REQUEST_CACHE=y -CONFIG_PERSISTENT_KEYRINGS=y -CONFIG_TRUSTED_KEYS=m -CONFIG_ENCRYPTED_KEYS=m -CONFIG_KEY_DH_OPERATIONS=y -CONFIG_KEY_NOTIFICATIONS=y -# CONFIG_SECURITY_DMESG_RESTRICT is not set -CONFIG_SECURITY=y -CONFIG_SECURITYFS=y -CONFIG_SECURITY_NETWORK=y 
-CONFIG_PAGE_TABLE_ISOLATION=y -CONFIG_SECURITY_INFINIBAND=y -CONFIG_SECURITY_NETWORK_XFRM=y -CONFIG_SECURITY_PATH=y -# CONFIG_INTEL_TXT is not set -CONFIG_LSM_MMAP_MIN_ADDR=65536 -CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR=y -CONFIG_HARDENED_USERCOPY=y -CONFIG_HARDENED_USERCOPY_FALLBACK=y -# CONFIG_HARDENED_USERCOPY_PAGESPAN is not set -CONFIG_FORTIFY_SOURCE=y -# CONFIG_STATIC_USERMODEHELPER is not set -CONFIG_SECURITY_SELINUX=y -CONFIG_SECURITY_SELINUX_BOOTPARAM=y -# CONFIG_SECURITY_SELINUX_DISABLE is not set -CONFIG_SECURITY_SELINUX_DEVELOP=y -CONFIG_SECURITY_SELINUX_AVC_STATS=y -CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=0 -CONFIG_SECURITY_SELINUX_SIDTAB_HASH_BITS=9 -CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE=256 -CONFIG_SECURITY_SMACK=y -CONFIG_SECURITY_SMACK_BRINGUP=y -CONFIG_SECURITY_SMACK_NETFILTER=y -CONFIG_SECURITY_SMACK_APPEND_SIGNALS=y -CONFIG_SECURITY_TOMOYO=y -CONFIG_SECURITY_TOMOYO_MAX_ACCEPT_ENTRY=2048 -CONFIG_SECURITY_TOMOYO_MAX_AUDIT_LOG=1024 -# CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER is not set -CONFIG_SECURITY_TOMOYO_POLICY_LOADER="/sbin/tomoyo-init" -CONFIG_SECURITY_TOMOYO_ACTIVATION_TRIGGER="/sbin/init" -# CONFIG_SECURITY_TOMOYO_INSECURE_BUILTIN_SETTING is not set -CONFIG_SECURITY_APPARMOR=y -CONFIG_SECURITY_APPARMOR_HASH=y -CONFIG_SECURITY_APPARMOR_HASH_DEFAULT=y -# CONFIG_SECURITY_APPARMOR_DEBUG is not set -# CONFIG_SECURITY_LOADPIN is not set -CONFIG_SECURITY_YAMA=y -CONFIG_SECURITY_SAFESETID=y -CONFIG_SECURITY_LOCKDOWN_LSM=y -# CONFIG_SECURITY_LOCKDOWN_LSM_EARLY is not set -CONFIG_LOCK_DOWN_KERNEL_FORCE_NONE=y -# CONFIG_LOCK_DOWN_KERNEL_FORCE_INTEGRITY is not set -# CONFIG_LOCK_DOWN_KERNEL_FORCE_CONFIDENTIALITY is not set -# CONFIG_INTEGRITY is not set -# CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT is not set -# CONFIG_DEFAULT_SECURITY_SELINUX is not set -# CONFIG_DEFAULT_SECURITY_SMACK is not set -# CONFIG_DEFAULT_SECURITY_TOMOYO is not set -# CONFIG_DEFAULT_SECURITY_APPARMOR is not set -CONFIG_DEFAULT_SECURITY_DAC=y 
-CONFIG_LSM="lockdown,yama" - -# -# Kernel hardening options -# -CONFIG_GCC_PLUGIN_STRUCTLEAK=y - -# -# Memory initialization -# -# CONFIG_INIT_STACK_NONE is not set -# CONFIG_GCC_PLUGIN_STRUCTLEAK_USER is not set -# CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF is not set -CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL=y -# CONFIG_GCC_PLUGIN_STRUCTLEAK_VERBOSE is not set -# CONFIG_GCC_PLUGIN_STACKLEAK is not set -CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y -# CONFIG_INIT_ON_FREE_DEFAULT_ON is not set -# end of Memory initialization -# end of Kernel hardening options -# end of Security options - -CONFIG_XOR_BLOCKS=m -CONFIG_ASYNC_CORE=m -CONFIG_ASYNC_MEMCPY=m -CONFIG_ASYNC_XOR=m -CONFIG_ASYNC_PQ=m -CONFIG_ASYNC_RAID6_RECOV=m -CONFIG_CRYPTO=y - -# -# Crypto core or helper -# -CONFIG_CRYPTO_ALGAPI=y -CONFIG_CRYPTO_ALGAPI2=y -CONFIG_CRYPTO_AEAD=y -CONFIG_CRYPTO_AEAD2=y -CONFIG_CRYPTO_SKCIPHER=y -CONFIG_CRYPTO_SKCIPHER2=y -CONFIG_CRYPTO_HASH=y -CONFIG_CRYPTO_HASH2=y -CONFIG_CRYPTO_RNG=y -CONFIG_CRYPTO_RNG2=y -CONFIG_CRYPTO_RNG_DEFAULT=y -CONFIG_CRYPTO_AKCIPHER2=y -CONFIG_CRYPTO_AKCIPHER=y -CONFIG_CRYPTO_KPP2=y -CONFIG_CRYPTO_KPP=y -CONFIG_CRYPTO_ACOMP2=y -CONFIG_CRYPTO_MANAGER=y -CONFIG_CRYPTO_MANAGER2=y -CONFIG_CRYPTO_USER=m -CONFIG_CRYPTO_MANAGER_DISABLE_TESTS=y -CONFIG_CRYPTO_GF128MUL=y -CONFIG_CRYPTO_NULL=y -CONFIG_CRYPTO_NULL2=y -CONFIG_CRYPTO_PCRYPT=m -CONFIG_CRYPTO_CRYPTD=m -CONFIG_CRYPTO_AUTHENC=m -CONFIG_CRYPTO_TEST=m -CONFIG_CRYPTO_SIMD=m -CONFIG_CRYPTO_GLUE_HELPER_X86=m -CONFIG_CRYPTO_ENGINE=m - -# -# Public-key cryptography -# -CONFIG_CRYPTO_RSA=y -CONFIG_CRYPTO_DH=y -CONFIG_CRYPTO_ECC=m -CONFIG_CRYPTO_ECDH=m -CONFIG_CRYPTO_ECRDSA=m -# CONFIG_CRYPTO_SM2 is not set -CONFIG_CRYPTO_CURVE25519=m -CONFIG_CRYPTO_CURVE25519_X86=m - -# -# Authenticated Encryption with Associated Data -# -CONFIG_CRYPTO_CCM=m -CONFIG_CRYPTO_GCM=y -CONFIG_CRYPTO_CHACHA20POLY1305=m -CONFIG_CRYPTO_AEGIS128=m -CONFIG_CRYPTO_AEGIS128_AESNI_SSE2=m -CONFIG_CRYPTO_SEQIV=y -CONFIG_CRYPTO_ECHAINIV=m - -# -# Block modes -# 
-CONFIG_CRYPTO_CBC=m -CONFIG_CRYPTO_CFB=m -CONFIG_CRYPTO_CTR=y -CONFIG_CRYPTO_CTS=m -CONFIG_CRYPTO_ECB=m -CONFIG_CRYPTO_LRW=m -CONFIG_CRYPTO_OFB=m -CONFIG_CRYPTO_PCBC=m -CONFIG_CRYPTO_XTS=m -CONFIG_CRYPTO_KEYWRAP=m -CONFIG_CRYPTO_NHPOLY1305=m -CONFIG_CRYPTO_NHPOLY1305_SSE2=m -CONFIG_CRYPTO_NHPOLY1305_AVX2=m -CONFIG_CRYPTO_ADIANTUM=m -CONFIG_CRYPTO_ESSIV=m - -# -# Hash modes -# -CONFIG_CRYPTO_CMAC=m -CONFIG_CRYPTO_HMAC=y -CONFIG_CRYPTO_XCBC=m -CONFIG_CRYPTO_VMAC=m - -# -# Digest -# -CONFIG_CRYPTO_CRC32C=m -CONFIG_CRYPTO_CRC32C_INTEL=m -CONFIG_CRYPTO_CRC32=m -CONFIG_CRYPTO_CRC32_PCLMUL=m -CONFIG_CRYPTO_XXHASH=m -CONFIG_CRYPTO_BLAKE2B=m -CONFIG_CRYPTO_BLAKE2S=m -CONFIG_CRYPTO_BLAKE2S_X86=m -CONFIG_CRYPTO_CRCT10DIF=y -CONFIG_CRYPTO_CRCT10DIF_PCLMUL=m -CONFIG_CRYPTO_GHASH=y -CONFIG_CRYPTO_POLY1305=m -CONFIG_CRYPTO_POLY1305_X86_64=m -CONFIG_CRYPTO_MD4=m -CONFIG_CRYPTO_MD5=y -CONFIG_CRYPTO_MICHAEL_MIC=m -CONFIG_CRYPTO_RMD128=m -CONFIG_CRYPTO_RMD160=m -CONFIG_CRYPTO_RMD256=m -CONFIG_CRYPTO_RMD320=m -CONFIG_CRYPTO_SHA1=y -CONFIG_CRYPTO_SHA1_SSSE3=m -CONFIG_CRYPTO_SHA256_SSSE3=m -CONFIG_CRYPTO_SHA512_SSSE3=m -CONFIG_CRYPTO_SHA256=y -CONFIG_CRYPTO_SHA512=y -CONFIG_CRYPTO_SHA3=m -CONFIG_CRYPTO_SM3=m -CONFIG_CRYPTO_STREEBOG=m -CONFIG_CRYPTO_TGR192=m -CONFIG_CRYPTO_WP512=m -CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL=m - -# -# Ciphers -# -CONFIG_CRYPTO_AES=y -CONFIG_CRYPTO_AES_TI=m -CONFIG_CRYPTO_AES_NI_INTEL=m -CONFIG_CRYPTO_ANUBIS=m -CONFIG_CRYPTO_ARC4=m -CONFIG_CRYPTO_BLOWFISH=m -CONFIG_CRYPTO_BLOWFISH_COMMON=m -CONFIG_CRYPTO_BLOWFISH_X86_64=m -CONFIG_CRYPTO_CAMELLIA=m -CONFIG_CRYPTO_CAMELLIA_X86_64=m -CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64=m -CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64=m -CONFIG_CRYPTO_CAST_COMMON=m -CONFIG_CRYPTO_CAST5=m -CONFIG_CRYPTO_CAST5_AVX_X86_64=m -CONFIG_CRYPTO_CAST6=m -CONFIG_CRYPTO_CAST6_AVX_X86_64=m -CONFIG_CRYPTO_DES=m -CONFIG_CRYPTO_DES3_EDE_X86_64=m -CONFIG_CRYPTO_FCRYPT=m -CONFIG_CRYPTO_KHAZAD=m -CONFIG_CRYPTO_SALSA20=m -CONFIG_CRYPTO_CHACHA20=m 
-CONFIG_CRYPTO_CHACHA20_X86_64=m -CONFIG_CRYPTO_SEED=m -CONFIG_CRYPTO_SERPENT=m -CONFIG_CRYPTO_SERPENT_SSE2_X86_64=m -CONFIG_CRYPTO_SERPENT_AVX_X86_64=m -CONFIG_CRYPTO_SERPENT_AVX2_X86_64=m -CONFIG_CRYPTO_SM4=m -CONFIG_CRYPTO_TEA=m -CONFIG_CRYPTO_TWOFISH=m -CONFIG_CRYPTO_TWOFISH_COMMON=m -CONFIG_CRYPTO_TWOFISH_X86_64=m -CONFIG_CRYPTO_TWOFISH_X86_64_3WAY=m -CONFIG_CRYPTO_TWOFISH_AVX_X86_64=m - -# -# Compression -# -CONFIG_CRYPTO_DEFLATE=m -CONFIG_CRYPTO_LZO=m -CONFIG_CRYPTO_842=m -CONFIG_CRYPTO_LZ4=y -CONFIG_CRYPTO_LZ4HC=m -CONFIG_CRYPTO_ZSTD=y - -# -# Random Number Generation -# -CONFIG_CRYPTO_ANSI_CPRNG=m -CONFIG_CRYPTO_DRBG_MENU=y -CONFIG_CRYPTO_DRBG_HMAC=y -CONFIG_CRYPTO_DRBG_HASH=y -CONFIG_CRYPTO_DRBG_CTR=y -CONFIG_CRYPTO_DRBG=y -CONFIG_CRYPTO_JITTERENTROPY=y -CONFIG_CRYPTO_USER_API=m -CONFIG_CRYPTO_USER_API_HASH=m -CONFIG_CRYPTO_USER_API_SKCIPHER=m -CONFIG_CRYPTO_USER_API_RNG=m -# CONFIG_CRYPTO_USER_API_RNG_CAVP is not set -CONFIG_CRYPTO_USER_API_AEAD=m -CONFIG_CRYPTO_USER_API_ENABLE_OBSOLETE=y -# CONFIG_CRYPTO_STATS is not set -CONFIG_CRYPTO_HASH_INFO=y - -# -# Crypto library routines -# -CONFIG_CRYPTO_LIB_AES=y -CONFIG_CRYPTO_LIB_ARC4=m -CONFIG_CRYPTO_ARCH_HAVE_LIB_BLAKE2S=m -CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC=m -CONFIG_CRYPTO_LIB_BLAKE2S=m -CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA=m -CONFIG_CRYPTO_LIB_CHACHA_GENERIC=m -CONFIG_CRYPTO_LIB_CHACHA=m -CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519=m -CONFIG_CRYPTO_LIB_CURVE25519_GENERIC=m -CONFIG_CRYPTO_LIB_CURVE25519=m -CONFIG_CRYPTO_LIB_DES=m -CONFIG_CRYPTO_LIB_POLY1305_RSIZE=11 -CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305=m -CONFIG_CRYPTO_LIB_POLY1305_GENERIC=m -CONFIG_CRYPTO_LIB_POLY1305=m -CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m -CONFIG_CRYPTO_LIB_SHA256=y -CONFIG_CRYPTO_HW=y -CONFIG_CRYPTO_DEV_PADLOCK=m -CONFIG_CRYPTO_DEV_PADLOCK_AES=m -CONFIG_CRYPTO_DEV_PADLOCK_SHA=m -CONFIG_CRYPTO_DEV_ATMEL_I2C=m -CONFIG_CRYPTO_DEV_ATMEL_ECC=m -CONFIG_CRYPTO_DEV_ATMEL_SHA204A=m -CONFIG_CRYPTO_DEV_CCP=y -CONFIG_CRYPTO_DEV_CCP_DD=m 
-CONFIG_CRYPTO_DEV_SP_CCP=y -CONFIG_CRYPTO_DEV_CCP_CRYPTO=m -CONFIG_CRYPTO_DEV_SP_PSP=y -CONFIG_CRYPTO_DEV_CCP_DEBUGFS=y -CONFIG_CRYPTO_DEV_QAT=m -CONFIG_CRYPTO_DEV_QAT_DH895xCC=m -CONFIG_CRYPTO_DEV_QAT_C3XXX=m -CONFIG_CRYPTO_DEV_QAT_C62X=m -CONFIG_CRYPTO_DEV_QAT_DH895xCCVF=m -CONFIG_CRYPTO_DEV_QAT_C3XXXVF=m -CONFIG_CRYPTO_DEV_QAT_C62XVF=m -CONFIG_CRYPTO_DEV_NITROX=m -CONFIG_CRYPTO_DEV_NITROX_CNN55XX=m -CONFIG_CRYPTO_DEV_CHELSIO=m -CONFIG_CRYPTO_DEV_VIRTIO=m -CONFIG_CRYPTO_DEV_SAFEXCEL=m -CONFIG_CRYPTO_DEV_CCREE=m -CONFIG_CRYPTO_DEV_AMLOGIC_GXL=m -CONFIG_CRYPTO_DEV_AMLOGIC_GXL_DEBUG=y -CONFIG_ASYMMETRIC_KEY_TYPE=y -CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y -CONFIG_ASYMMETRIC_TPM_KEY_SUBTYPE=m -CONFIG_X509_CERTIFICATE_PARSER=y -CONFIG_PKCS8_PRIVATE_KEY_PARSER=m -CONFIG_TPM_KEY_PARSER=m -CONFIG_PKCS7_MESSAGE_PARSER=y -# CONFIG_PKCS7_TEST_KEY is not set -CONFIG_SIGNED_PE_FILE_VERIFICATION=y - -# -# Certificates for signature checking -# -CONFIG_MODULE_SIG_KEY="certs/signing_key.pem" -CONFIG_SYSTEM_TRUSTED_KEYRING=y -CONFIG_SYSTEM_TRUSTED_KEYS="" -# CONFIG_SYSTEM_EXTRA_CERTIFICATE is not set -CONFIG_SECONDARY_TRUSTED_KEYRING=y -CONFIG_SYSTEM_BLACKLIST_KEYRING=y -CONFIG_SYSTEM_BLACKLIST_HASH_LIST="" -# end of Certificates for signature checking - -CONFIG_BINARY_PRINTF=y - -# -# Library routines -# -CONFIG_RAID6_PQ=m -CONFIG_RAID6_PQ_BENCHMARK=y -CONFIG_LINEAR_RANGES=y -CONFIG_PACKING=y -CONFIG_BITREVERSE=y -CONFIG_GENERIC_STRNCPY_FROM_USER=y -CONFIG_GENERIC_STRNLEN_USER=y -CONFIG_GENERIC_NET_UTILS=y -CONFIG_GENERIC_FIND_FIRST_BIT=y -CONFIG_CORDIC=m -# CONFIG_PRIME_NUMBERS is not set -CONFIG_RATIONAL=y -CONFIG_GENERIC_PCI_IOMAP=y -CONFIG_GENERIC_IOMAP=y -CONFIG_ARCH_USE_CMPXCHG_LOCKREF=y -CONFIG_ARCH_HAS_FAST_MULTIPLIER=y -CONFIG_ARCH_USE_SYM_ANNOTATIONS=y -CONFIG_CRC_CCITT=y -CONFIG_CRC16=m -CONFIG_CRC_T10DIF=y -CONFIG_CRC_ITU_T=m -CONFIG_CRC32=y -# CONFIG_CRC32_SELFTEST is not set -CONFIG_CRC32_SLICEBY8=y -# CONFIG_CRC32_SLICEBY4 is not set -# CONFIG_CRC32_SARWATE is not 
set -# CONFIG_CRC32_BIT is not set -CONFIG_CRC64=m -CONFIG_CRC4=m -CONFIG_CRC7=m -CONFIG_LIBCRC32C=m -CONFIG_CRC8=m -CONFIG_XXHASH=y -# CONFIG_RANDOM32_SELFTEST is not set -CONFIG_842_COMPRESS=m -CONFIG_842_DECOMPRESS=m -CONFIG_ZLIB_INFLATE=y -CONFIG_ZLIB_DEFLATE=y -CONFIG_LZO_COMPRESS=y -CONFIG_LZO_DECOMPRESS=y -CONFIG_LZ4_COMPRESS=y -CONFIG_LZ4HC_COMPRESS=m -CONFIG_LZ4_DECOMPRESS=y -CONFIG_ZSTD_COMPRESS=y -CONFIG_ZSTD_DECOMPRESS=y -CONFIG_XZ_DEC=y -CONFIG_XZ_DEC_X86=y -CONFIG_XZ_DEC_POWERPC=y -CONFIG_XZ_DEC_IA64=y -CONFIG_XZ_DEC_ARM=y -CONFIG_XZ_DEC_ARMTHUMB=y -CONFIG_XZ_DEC_SPARC=y -CONFIG_XZ_DEC_BCJ=y -# CONFIG_XZ_DEC_TEST is not set -CONFIG_DECOMPRESS_GZIP=y -CONFIG_DECOMPRESS_BZIP2=y -CONFIG_DECOMPRESS_LZMA=y -CONFIG_DECOMPRESS_XZ=y -CONFIG_DECOMPRESS_LZO=y -CONFIG_DECOMPRESS_LZ4=y -CONFIG_DECOMPRESS_ZSTD=y -CONFIG_GENERIC_ALLOCATOR=y -CONFIG_REED_SOLOMON=y -CONFIG_REED_SOLOMON_ENC8=y -CONFIG_REED_SOLOMON_DEC8=y -CONFIG_REED_SOLOMON_DEC16=y -CONFIG_BCH=m -CONFIG_TEXTSEARCH=y -CONFIG_TEXTSEARCH_KMP=m -CONFIG_TEXTSEARCH_BM=m -CONFIG_TEXTSEARCH_FSM=m -CONFIG_BTREE=y -CONFIG_INTERVAL_TREE=y -CONFIG_XARRAY_MULTI=y -CONFIG_ASSOCIATIVE_ARRAY=y -CONFIG_HAS_IOMEM=y -CONFIG_HAS_IOPORT_MAP=y -CONFIG_HAS_DMA=y -CONFIG_DMA_OPS=y -CONFIG_NEED_SG_DMA_LENGTH=y -CONFIG_NEED_DMA_MAP_STATE=y -CONFIG_ARCH_DMA_ADDR_T_64BIT=y -CONFIG_DMA_DECLARE_COHERENT=y -CONFIG_ARCH_HAS_FORCE_DMA_UNENCRYPTED=y -CONFIG_DMA_VIRT_OPS=y -CONFIG_SWIOTLB=y -CONFIG_DMA_COHERENT_POOL=y -# CONFIG_DMA_API_DEBUG is not set -CONFIG_SGL_ALLOC=y -CONFIG_IOMMU_HELPER=y -CONFIG_CHECK_SIGNATURE=y -CONFIG_CPU_RMAP=y -CONFIG_DQL=y -CONFIG_GLOB=y -# CONFIG_GLOB_SELFTEST is not set -CONFIG_NLATTR=y -CONFIG_LRU_CACHE=m -CONFIG_CLZ_TAB=y -CONFIG_IRQ_POLL=y -CONFIG_MPILIB=y -CONFIG_DIMLIB=y -CONFIG_LIBFDT=y -CONFIG_OID_REGISTRY=y -CONFIG_UCS2_STRING=y -CONFIG_HAVE_GENERIC_VDSO=y -CONFIG_GENERIC_GETTIMEOFDAY=y -CONFIG_GENERIC_VDSO_TIME_NS=y -CONFIG_FONT_SUPPORT=y -CONFIG_FONTS=y -# CONFIG_FONT_8x8 is not set 
-CONFIG_FONT_8x16=y -# CONFIG_FONT_6x11 is not set -# CONFIG_FONT_7x14 is not set -# CONFIG_FONT_PEARL_8x8 is not set -# CONFIG_FONT_ACORN_8x8 is not set -# CONFIG_FONT_MINI_4x6 is not set -# CONFIG_FONT_6x10 is not set -# CONFIG_FONT_10x18 is not set -# CONFIG_FONT_SUN8x16 is not set -# CONFIG_FONT_SUN12x22 is not set -CONFIG_FONT_TER16x32=y -# CONFIG_FONT_6x8 is not set -CONFIG_SG_POOL=y -CONFIG_ARCH_HAS_PMEM_API=y -CONFIG_MEMREGION=y -CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE=y -CONFIG_ARCH_HAS_COPY_MC=y -CONFIG_ARCH_STACKWALK=y -CONFIG_SBITMAP=y -CONFIG_PARMAN=m -CONFIG_OBJAGG=m -# CONFIG_STRING_SELFTEST is not set -# end of Library routines - -CONFIG_PLDMFW=y - -# -# Kernel hacking -# - -# -# printk and dmesg options -# -CONFIG_PRINTK_TIME=y -# CONFIG_PRINTK_CALLER is not set -CONFIG_CONSOLE_LOGLEVEL_DEFAULT=4 -CONFIG_CONSOLE_LOGLEVEL_QUIET=1 -CONFIG_MESSAGE_LOGLEVEL_DEFAULT=4 -# CONFIG_BOOT_PRINTK_DELAY is not set -CONFIG_DYNAMIC_DEBUG=y -CONFIG_DYNAMIC_DEBUG_CORE=y -CONFIG_SYMBOLIC_ERRNAME=y -CONFIG_DEBUG_BUGVERBOSE=y -# end of printk and dmesg options - -# -# Compile-time checks and compiler options -# -CONFIG_DEBUG_INFO=y -# CONFIG_DEBUG_INFO_REDUCED is not set -# CONFIG_DEBUG_INFO_COMPRESSED is not set -# CONFIG_DEBUG_INFO_SPLIT is not set -CONFIG_DEBUG_INFO_DWARF4=y -CONFIG_DEBUG_INFO_BTF=y -# CONFIG_GDB_SCRIPTS is not set -# CONFIG_ENABLE_MUST_CHECK is not set -CONFIG_FRAME_WARN=2048 -CONFIG_STRIP_ASM_SYMS=y -# CONFIG_READABLE_ASM is not set -# CONFIG_HEADERS_INSTALL is not set -# CONFIG_DEBUG_SECTION_MISMATCH is not set -CONFIG_SECTION_MISMATCH_WARN_ONLY=y -# CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_32B is not set -CONFIG_STACK_VALIDATION=y -# CONFIG_DEBUG_FORCE_WEAK_PER_CPU is not set -# end of Compile-time checks and compiler options - -# -# Generic Kernel Debugging Instruments -# -CONFIG_MAGIC_SYSRQ=y -CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE=0x0 -CONFIG_MAGIC_SYSRQ_SERIAL=y -CONFIG_MAGIC_SYSRQ_SERIAL_SEQUENCE="" -CONFIG_DEBUG_FS=y -CONFIG_DEBUG_FS_ALLOW_ALL=y -# 
CONFIG_DEBUG_FS_DISALLOW_MOUNT is not set -# CONFIG_DEBUG_FS_ALLOW_NONE is not set -CONFIG_HAVE_ARCH_KGDB=y -# CONFIG_KGDB is not set -CONFIG_ARCH_HAS_UBSAN_SANITIZE_ALL=y -# CONFIG_UBSAN is not set -CONFIG_HAVE_ARCH_KCSAN=y -# end of Generic Kernel Debugging Instruments - -CONFIG_DEBUG_KERNEL=y -CONFIG_DEBUG_MISC=y - -# -# Memory Debugging -# -# CONFIG_PAGE_EXTENSION is not set -# CONFIG_DEBUG_PAGEALLOC is not set -# CONFIG_PAGE_OWNER is not set -CONFIG_PAGE_POISONING=y -CONFIG_PAGE_POISONING_NO_SANITY=y -CONFIG_PAGE_POISONING_ZERO=y -# CONFIG_DEBUG_PAGE_REF is not set -# CONFIG_DEBUG_RODATA_TEST is not set -CONFIG_ARCH_HAS_DEBUG_WX=y -CONFIG_DEBUG_WX=y -CONFIG_GENERIC_PTDUMP=y -CONFIG_PTDUMP_CORE=y -# CONFIG_PTDUMP_DEBUGFS is not set -# CONFIG_DEBUG_OBJECTS is not set -# CONFIG_SLUB_DEBUG_ON is not set -# CONFIG_SLUB_STATS is not set -CONFIG_HAVE_DEBUG_KMEMLEAK=y -# CONFIG_DEBUG_KMEMLEAK is not set -# CONFIG_DEBUG_STACK_USAGE is not set -CONFIG_SCHED_STACK_END_CHECK=y -CONFIG_ARCH_HAS_DEBUG_VM_PGTABLE=y -# CONFIG_DEBUG_VM is not set -# CONFIG_DEBUG_VM_PGTABLE is not set -CONFIG_ARCH_HAS_DEBUG_VIRTUAL=y -# CONFIG_DEBUG_VIRTUAL is not set -CONFIG_DEBUG_MEMORY_INIT=y -# CONFIG_DEBUG_PER_CPU_MAPS is not set -CONFIG_HAVE_ARCH_KASAN=y -CONFIG_HAVE_ARCH_KASAN_VMALLOC=y -CONFIG_CC_HAS_KASAN_GENERIC=y -CONFIG_CC_HAS_WORKING_NOSANITIZE_ADDRESS=y -# CONFIG_KASAN is not set -# end of Memory Debugging - -# CONFIG_DEBUG_SHIRQ is not set - -# -# Debug Oops, Lockups and Hangs -# -# CONFIG_PANIC_ON_OOPS is not set -CONFIG_PANIC_ON_OOPS_VALUE=0 -CONFIG_PANIC_TIMEOUT=0 -CONFIG_LOCKUP_DETECTOR=y -CONFIG_SOFTLOCKUP_DETECTOR=y -# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set -CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0 -CONFIG_HARDLOCKUP_DETECTOR_PERF=y -CONFIG_HARDLOCKUP_CHECK_TIMESTAMP=y -CONFIG_HARDLOCKUP_DETECTOR=y -# CONFIG_BOOTPARAM_HARDLOCKUP_PANIC is not set -CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE=0 -CONFIG_DETECT_HUNG_TASK=y -CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=120 -# 
CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set -CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0 -# CONFIG_WQ_WATCHDOG is not set -# CONFIG_TEST_LOCKUP is not set -# end of Debug Oops, Lockups and Hangs - -# -# Scheduler Debugging -# -CONFIG_SCHED_DEBUG=y -CONFIG_SCHED_INFO=y -CONFIG_SCHEDSTATS=y -# end of Scheduler Debugging - -# CONFIG_DEBUG_TIMEKEEPING is not set -CONFIG_DEBUG_PREEMPT=y - -# -# Lock Debugging (spinlocks, mutexes, etc...) -# -CONFIG_LOCK_DEBUGGING_SUPPORT=y -# CONFIG_PROVE_LOCKING is not set -# CONFIG_LOCK_STAT is not set -# CONFIG_DEBUG_RT_MUTEXES is not set -# CONFIG_DEBUG_SPINLOCK is not set -# CONFIG_DEBUG_MUTEXES is not set -# CONFIG_DEBUG_WW_MUTEX_SLOWPATH is not set -# CONFIG_DEBUG_RWSEMS is not set -# CONFIG_DEBUG_LOCK_ALLOC is not set -# CONFIG_DEBUG_ATOMIC_SLEEP is not set -# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set -# CONFIG_LOCK_TORTURE_TEST is not set -# CONFIG_WW_MUTEX_SELFTEST is not set -# CONFIG_SCF_TORTURE_TEST is not set -# CONFIG_CSD_LOCK_WAIT_DEBUG is not set -# end of Lock Debugging (spinlocks, mutexes, etc...) 
- -CONFIG_STACKTRACE=y -# CONFIG_WARN_ALL_UNSEEDED_RANDOM is not set -# CONFIG_DEBUG_KOBJECT is not set - -# -# Debug kernel data structures -# -# CONFIG_DEBUG_LIST is not set -# CONFIG_DEBUG_PLIST is not set -# CONFIG_DEBUG_SG is not set -# CONFIG_DEBUG_NOTIFIERS is not set -# CONFIG_BUG_ON_DATA_CORRUPTION is not set -# end of Debug kernel data structures - -# CONFIG_DEBUG_CREDENTIALS is not set - -# -# RCU Debugging -# -# CONFIG_RCU_SCALE_TEST is not set -# CONFIG_RCU_TORTURE_TEST is not set -# CONFIG_RCU_REF_SCALE_TEST is not set -CONFIG_RCU_CPU_STALL_TIMEOUT=60 -# CONFIG_RCU_TRACE is not set -# CONFIG_RCU_EQS_DEBUG is not set -# CONFIG_RCU_STRICT_GRACE_PERIOD is not set -# end of RCU Debugging - -# CONFIG_DEBUG_WQ_FORCE_RR_CPU is not set -# CONFIG_DEBUG_BLOCK_EXT_DEVT is not set -# CONFIG_CPU_HOTPLUG_STATE_CONTROL is not set -CONFIG_LATENCYTOP=y -CONFIG_USER_STACKTRACE_SUPPORT=y -CONFIG_NOP_TRACER=y -CONFIG_HAVE_FUNCTION_TRACER=y -CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y -CONFIG_HAVE_DYNAMIC_FTRACE=y -CONFIG_HAVE_DYNAMIC_FTRACE_WITH_REGS=y -CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS=y -CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y -CONFIG_HAVE_SYSCALL_TRACEPOINTS=y -CONFIG_HAVE_FENTRY=y -CONFIG_HAVE_C_RECORDMCOUNT=y -CONFIG_TRACER_MAX_TRACE=y -CONFIG_TRACE_CLOCK=y -CONFIG_RING_BUFFER=y -CONFIG_EVENT_TRACING=y -CONFIG_CONTEXT_SWITCH_TRACER=y -CONFIG_RING_BUFFER_ALLOW_SWAP=y -CONFIG_TRACING=y -CONFIG_GENERIC_TRACER=y -CONFIG_TRACING_SUPPORT=y -CONFIG_FTRACE=y -# CONFIG_BOOTTIME_TRACING is not set -CONFIG_FUNCTION_TRACER=y -CONFIG_FUNCTION_GRAPH_TRACER=y -CONFIG_DYNAMIC_FTRACE=y -CONFIG_DYNAMIC_FTRACE_WITH_REGS=y -CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS=y -CONFIG_FUNCTION_PROFILER=y -CONFIG_STACK_TRACER=y -# CONFIG_IRQSOFF_TRACER is not set -# CONFIG_PREEMPT_TRACER is not set -CONFIG_SCHED_TRACER=y -CONFIG_HWLAT_TRACER=y -CONFIG_MMIOTRACE=y -CONFIG_FTRACE_SYSCALLS=y -CONFIG_TRACER_SNAPSHOT=y -# CONFIG_TRACER_SNAPSHOT_PER_CPU_SWAP is not set -CONFIG_BRANCH_PROFILE_NONE=y -# 
CONFIG_PROFILE_ANNOTATED_BRANCHES is not set -CONFIG_BLK_DEV_IO_TRACE=y -CONFIG_KPROBE_EVENTS=y -# CONFIG_KPROBE_EVENTS_ON_NOTRACE is not set -CONFIG_UPROBE_EVENTS=y -CONFIG_BPF_EVENTS=y -CONFIG_DYNAMIC_EVENTS=y -CONFIG_PROBE_EVENTS=y -CONFIG_BPF_KPROBE_OVERRIDE=y -CONFIG_FTRACE_MCOUNT_RECORD=y -CONFIG_TRACING_MAP=y -CONFIG_SYNTH_EVENTS=y -CONFIG_HIST_TRIGGERS=y -# CONFIG_TRACE_EVENT_INJECT is not set -# CONFIG_TRACEPOINT_BENCHMARK is not set -# CONFIG_RING_BUFFER_BENCHMARK is not set -# CONFIG_TRACE_EVAL_MAP_FILE is not set -# CONFIG_FTRACE_STARTUP_TEST is not set -# CONFIG_RING_BUFFER_STARTUP_TEST is not set -# CONFIG_MMIOTRACE_TEST is not set -# CONFIG_PREEMPTIRQ_DELAY_TEST is not set -# CONFIG_SYNTH_EVENT_GEN_TEST is not set -# CONFIG_KPROBE_EVENT_GEN_TEST is not set -# CONFIG_HIST_TRIGGERS_DEBUG is not set -# CONFIG_PROVIDE_OHCI1394_DMA_INIT is not set -# CONFIG_SAMPLES is not set -CONFIG_ARCH_HAS_DEVMEM_IS_ALLOWED=y -CONFIG_STRICT_DEVMEM=y -CONFIG_IO_STRICT_DEVMEM=y - -# -# x86 Debugging -# -CONFIG_TRACE_IRQFLAGS_SUPPORT=y -CONFIG_TRACE_IRQFLAGS_NMI_SUPPORT=y -# CONFIG_X86_VERBOSE_BOOTUP is not set -CONFIG_EARLY_PRINTK=y -# CONFIG_EARLY_PRINTK_DBGP is not set -# CONFIG_EARLY_PRINTK_USB_XDBC is not set -# CONFIG_EFI_PGT_DUMP is not set -# CONFIG_DEBUG_TLBFLUSH is not set -# CONFIG_IOMMU_DEBUG is not set -CONFIG_HAVE_MMIOTRACE_SUPPORT=y -# CONFIG_X86_DECODER_SELFTEST is not set -CONFIG_IO_DELAY_0X80=y -# CONFIG_IO_DELAY_0XED is not set -# CONFIG_IO_DELAY_UDELAY is not set -# CONFIG_IO_DELAY_NONE is not set -CONFIG_DEBUG_BOOT_PARAMS=y -# CONFIG_CPA_DEBUG is not set -# CONFIG_DEBUG_ENTRY is not set -# CONFIG_DEBUG_NMI_SELFTEST is not set -# CONFIG_X86_DEBUG_FPU is not set -# CONFIG_PUNIT_ATOM_DEBUG is not set -CONFIG_UNWINDER_ORC=y -# CONFIG_UNWINDER_FRAME_POINTER is not set -# CONFIG_UNWINDER_GUESS is not set -# end of x86 Debugging - -# -# Kernel Testing and Coverage -# -# CONFIG_KUNIT is not set -# CONFIG_NOTIFIER_ERROR_INJECTION is not set 
-CONFIG_FUNCTION_ERROR_INJECTION=y -# CONFIG_FAULT_INJECTION is not set -CONFIG_ARCH_HAS_KCOV=y -CONFIG_CC_HAS_SANCOV_TRACE_PC=y -# CONFIG_KCOV is not set -CONFIG_RUNTIME_TESTING_MENU=y -CONFIG_LKDTM=m -# CONFIG_TEST_LIST_SORT is not set -# CONFIG_TEST_MIN_HEAP is not set -# CONFIG_TEST_SORT is not set -# CONFIG_KPROBES_SANITY_TEST is not set -# CONFIG_BACKTRACE_SELF_TEST is not set -# CONFIG_RBTREE_TEST is not set -# CONFIG_REED_SOLOMON_TEST is not set -# CONFIG_INTERVAL_TREE_TEST is not set -# CONFIG_PERCPU_TEST is not set -# CONFIG_ATOMIC64_SELFTEST is not set -# CONFIG_ASYNC_RAID6_TEST is not set -# CONFIG_TEST_HEXDUMP is not set -# CONFIG_TEST_STRING_HELPERS is not set -# CONFIG_TEST_STRSCPY is not set -# CONFIG_TEST_KSTRTOX is not set -# CONFIG_TEST_PRINTF is not set -# CONFIG_TEST_BITMAP is not set -# CONFIG_TEST_UUID is not set -# CONFIG_TEST_XARRAY is not set -# CONFIG_TEST_OVERFLOW is not set -# CONFIG_TEST_RHASHTABLE is not set -# CONFIG_TEST_HASH is not set -# CONFIG_TEST_IDA is not set -# CONFIG_TEST_PARMAN is not set -# CONFIG_TEST_LKM is not set -# CONFIG_TEST_BITOPS is not set -# CONFIG_TEST_VMALLOC is not set -# CONFIG_TEST_USER_COPY is not set -# CONFIG_TEST_BPF is not set -# CONFIG_TEST_BLACKHOLE_DEV is not set -# CONFIG_FIND_BIT_BENCHMARK is not set -# CONFIG_TEST_FIRMWARE is not set -# CONFIG_TEST_SYSCTL is not set -# CONFIG_TEST_UDELAY is not set -# CONFIG_TEST_STATIC_KEYS is not set -# CONFIG_TEST_KMOD is not set -# CONFIG_TEST_MEMCAT_P is not set -# CONFIG_TEST_OBJAGG is not set -# CONFIG_TEST_STACKINIT is not set -# CONFIG_TEST_MEMINIT is not set -# CONFIG_TEST_HMM is not set -# CONFIG_TEST_FREE_PAGES is not set -# CONFIG_TEST_FPU is not set -# CONFIG_MEMTEST is not set -# CONFIG_HYPERV_TESTING is not set -# end of Kernel Testing and Coverage -# end of Kernel hacking diff --git a/linux510-rc-tkg/linux510-tkg-config/generic-desktop-profile.cfg b/linux510-rc-tkg/linux510-tkg-config/generic-desktop-profile.cfg deleted file mode 100644 index 
4e0af37..0000000 --- a/linux510-rc-tkg/linux510-tkg-config/generic-desktop-profile.cfg +++ /dev/null @@ -1,35 +0,0 @@ -# linux510-TkG config file -# Generic Desktop - - -#### KERNEL OPTIONS #### - -# Disable some non-module debugging - See PKGBUILD for the list -_debugdisable="false" - -# LEAVE AN EMPTY VALUE TO BE PROMPTED ABOUT FOLLOWING OPTIONS AT BUILD TIME - -# Set to "true" to disable FUNCTION_TRACER/GRAPH_TRACER, lowering overhead but limiting debugging and analyzing of kernel functions - Kernel default is "false" -_ftracedisable="false" - -# Set to "true" to disable NUMA, lowering overhead, but breaking CUDA/NvEnc on Nvidia equipped systems - Kernel default is "false" -_numadisable="false" - -# Set to "true" to use explicit preemption points to lower latency at the cost of a small throughput loss - Can give a nice perf boost in VMs - Kernel default is "false" -_voluntary_preempt="false" - -# A selection of patches from Zen/Liquorix kernel and additional tweaks for a better gaming experience (ZENIFY) - Default is "true" -_zenify="true" - -# compiler optimization level - 1. Optimize for performance (-O2); 2. Optimize harder (-O3); 3. 
Optimize for size (-Os) - Kernel default is "2" -_compileroptlevel="1" - -# Trust the CPU manufacturer to initialize Linux's CRNG (RANDOM_TRUST_CPU) - Kernel default is "false" -_random_trust_cpu="false" - -# CPU scheduler runqueue sharing - No sharing (RQ_NONE), SMT (hyperthread) siblings (RQ_SMT), Multicore siblings (RQ_MC), Symmetric Multi-Processing (RQ_SMP), NUMA (RQ_ALL) -# Valid values are "none", "smt", "mc", "mc-llc"(for zen), "smp", "all" - Kernel default is "mc" -_runqueue_sharing="mc" - -# Timer frequency - "500", "750" or "1000" - More options available in kernel config prompt when left empty depending on selected cpusched - Kernel default is "750" -_timer_freq="500" diff --git a/linux510-rc-tkg/linux510-tkg-config/prepare b/linux510-rc-tkg/linux510-tkg-config/prepare deleted file mode 100644 index 5153c7a..0000000 --- a/linux510-rc-tkg/linux510-tkg-config/prepare +++ /dev/null @@ -1,991 +0,0 @@ -#!/bin/bash - -_basever=510 -_basekernel=5.10 -_sub=rc1 - -_tkg_initscript() { - - cp "$_where"/linux"$_basever"-tkg-patches/* "$_where" # copy patches inside the PKGBUILD's dir to preserve makepkg sourcing and md5sum checking - cp "$_where"/linux"$_basever"-tkg-config/* "$_where" # copy config files and hooks inside the PKGBUILD's dir to preserve makepkg sourcing and md5sum checking - - # Load external configuration file if present. Available variable values will overwrite customization.cfg ones. - if [ -e "$_EXT_CONFIG_PATH" ]; then - source "$_EXT_CONFIG_PATH" && msg2 "External configuration file $_EXT_CONFIG_PATH will be used to override customization.cfg values." && msg2 "" - fi - - if [ -z "$_OPTIPROFILE" ] && [ ! -e "$_where"/cpuschedset ]; then - # Prompt about optimized configurations. Available variable values will overwrite customization.cfg/external config ones. - plain "Do you want to use a predefined optimized profile?" 
- read -rp "`echo $' > 1.Custom\n 2.Ryzen Desktop (Performance)\n 3.Other Desktop (Performance)\nchoice[1-3?]: '`" _OPTIPROFILE; - fi - if [ "$_OPTIPROFILE" = "2" ]; then - source "$_where"/ryzen-desktop-profile.cfg && msg2 "Ryzen Desktop (Performance) profile will be used." && msg2 "" - elif [ "$_OPTIPROFILE" = "3" ]; then - source "$_where"/generic-desktop-profile.cfg && msg2 "Generic Desktop (Performance) profile will be used." && msg2 "" - fi - - # source cpuschedset early if present - if [ -e "$_where"/cpuschedset ]; then - source "$_where"/cpuschedset - fi - - # CPU SCHED selector - if [ -z "$_cpusched" ] && [ ! -e "$_where"/cpuschedset ]; then - plain "What CPU sched variant do you want to build/install?" - read -rp "`echo $' > 1.Project C / PDS\n 2.Project C / BMQ\n 3.MuQSS\n 4.CFS\nchoice[1-4?]: '`" CONDITION; - if [ "$CONDITION" = "2" ]; then - echo "_cpusched=\"bmq\"" > "$_where"/cpuschedset - elif [ "$CONDITION" = "3" ]; then - echo "_cpusched=\"MuQSS\"" > "$_where"/cpuschedset - elif [ "$CONDITION" = "4" ]; then - echo "_cpusched=\"cfs\"" > "$_where"/cpuschedset - else - echo "_cpusched=\"pds\"" > "$_where"/cpuschedset - fi - if [ -n "$_custom_pkgbase" ]; then - echo "_custom_pkgbase=\"${_custom_pkgbase}\"" >> "$_where"/cpuschedset - fi - elif [ "$_cpusched" = "muqss" ] || [ "$_cpusched" = "MuQSS" ]; then - echo "_cpusched=\"MuQSS\"" > "$_where"/cpuschedset - elif [ "$_cpusched" = "pds" ]; then - echo "_cpusched=\"pds\"" > "$_where"/cpuschedset - elif [ "$_cpusched" = "cfs" ]; then - echo "_cpusched=\"cfs\"" > "$_where"/cpuschedset - elif [ "$_cpusched" = "bmq" ]; then - echo "_cpusched=\"bmq\"" > "$_where"/cpuschedset - else - if [ "$_nofallback" != "true" ]; then - warning "Something is wrong with your cpusched selection. Do you want to fallback to CFS (default)?" 
- read -rp "`echo $' > N/y : '`" _fallback; - fi - if [[ "$_fallback" =~ [yY] ]] || [ "$_nofallback" = "true" ]; then - echo "_cpusched=\"cfs\"" > "$_where"/cpuschedset - else - error "Exiting..." - exit 1 - fi - fi - - source "$_where"/cpuschedset -} - -user_patcher() { - # To patch the user because all your base are belong to us - local _patches=("$_where"/*."${_userpatch_ext}revert") - if [ ${#_patches[@]} -ge 2 ] || [ -e "${_patches}" ]; then - if [ "$_user_patches_no_confirm" != "true" ]; then - msg2 "Found ${#_patches[@]} 'to revert' userpatches for ${_userpatch_target}:" - printf '%s\n' "${_patches[@]}" - read -rp "Do you want to install it/them? - Be careful with that ;)"$'\n> N/y : ' _CONDITION; - fi - if [[ "$_CONDITION" =~ [yY] ]] || [ "$_user_patches_no_confirm" = "true" ]; then - for _f in "${_patches[@]}"; do - if [ -e "${_f}" ]; then - msg2 "######################################################" - msg2 "" - msg2 "Reverting your own ${_userpatch_target} patch ${_f}" - msg2 "" - msg2 "######################################################" - patch -Np1 -R < "${_f}" - echo "Reverted your own patch ${_f}" >> "$_where"/last_build_config.log - fi - done - fi - fi - - _patches=("$_where"/*."${_userpatch_ext}patch") - if [ ${#_patches[@]} -ge 2 ] || [ -e "${_patches}" ]; then - if [ "$_user_patches_no_confirm" != "true" ]; then - msg2 "Found ${#_patches[@]} userpatches for ${_userpatch_target}:" - printf '%s\n' "${_patches[@]}" - read -rp "Do you want to install it/them? 
- Be careful with that ;)"$'\n> N/y : ' _CONDITION; - fi - if [[ "$_CONDITION" =~ [yY] ]] || [ "$_user_patches_no_confirm" = "true" ]; then - for _f in "${_patches[@]}"; do - if [ -e "${_f}" ]; then - msg2 "######################################################" - msg2 "" - msg2 "Applying your own ${_userpatch_target} patch ${_f}" - msg2 "" - msg2 "######################################################" - patch -Np1 < "${_f}" - echo "Applied your own patch ${_f}" >> "$_where"/last_build_config.log - fi - done - fi - fi -} - -_tkg_srcprep() { - - if [ "${_distro}" = "Arch" ]; then - msg2 "Setting version..." - scripts/setlocalversion --save-scmversion - echo "-$pkgrel-tkg-${_cpusched}" > localversion.10-pkgrel - echo "" > localversion.20-pkgname - - # add upstream patch - #msg2 "Patching from $_basekernel to $pkgver" - #patch -p1 -i "$srcdir"/patch-"${pkgver}" - - # ARCH Patches - if [ "${_configfile}" = "config_hardened.x86_64" ] && [ "${_cpusched}" = "cfs" ]; then - msg2 "Using linux hardened patchset" - patch -Np1 -i "$srcdir"/0012-linux-hardened.patch - else - patch -Np1 -i "$srcdir"/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch - fi - fi - - # graysky's cpu opts - https://github.com/graysky2/kernel_gcc_patch - msg2 "Applying graysky's cpu opts patch" - if [ "${_distro}" = "Arch" ]; then - patch -Np1 -i "$srcdir"/enable_additional_cpu_optimizations_for_gcc_v10.1%2B_kernel_v5.8%2B.patch - else - patch -Np1 -i "$srcdir"/enable_additional_cpu_optimizations_for_gcc_v10.1+_kernel_v5.8+.patch - fi - - # TkG - msg2 "Applying clear linux patches" - patch -Np1 -i "$srcdir"/0002-clear-patches.patch - - msg2 "Applying glitched base patch" - patch -Np1 -i "$srcdir"/0003-glitched-base.patch - - if [ -z $_misc_adds ]; then - plain "Enable misc additions ? May contain temporary fixes pending upstream or changes that can break on non-Arch. 
" - read -rp "`echo $' > [Y]/n : '`" _interactive_misc_adds; - if [ "$_interactive_misc_adds" != "n" ] && [ "$_interactive_misc_adds" != "N" ]; then - _misc_adds="true" - fi - fi - - if [ "$_misc_adds" = "true" ]; then - msg2 "Applying misc additions patch" - patch -Np1 -i "$srcdir"/0012-misc-additions.patch - fi - - if [ "${_cpusched}" = "MuQSS" ]; then - # MuQSS - msg2 "Applying MuQSS base patch" - patch -Np1 -i "$srcdir"/0004-5.10-ck1.patch - - if [ "${_aggressive_ondemand}" = "true" ]; then - msg2 "Applying MuQSS agressive ondemand governor patch" - patch -Np1 -i "$srcdir"/0004-glitched-ondemand-muqss.patch - fi - - msg2 "Applying Glitched MuQSS patch" - patch -Np1 -i "$srcdir"/0004-glitched-muqss.patch - - elif [ "${_cpusched}" = "pds" ]; then - # PDS-mq - msg2 "Applying PDS base patch" - patch -Np1 -i "$srcdir"/0009-prjc_v5.10-r0.patch - - if [ "${_aggressive_ondemand}" = "true" ]; then - msg2 "Applying PDS agressive ondemand governor patch" - patch -Np1 -i "$srcdir"/0009-glitched-ondemand-bmq.patch - fi - - msg2 "Applying Glitched PDS patch" - patch -Np1 -i "$srcdir"/0005-glitched-pds.patch - - elif [ "${_cpusched}" = "bmq" ]; then - # Project C / BMQ - msg2 "Applying Project C / BMQ base patch" - - patch -Np1 -i "$srcdir"/0009-prjc_v5.10-r0.patch - - if [ "${_aggressive_ondemand}" = "true" ]; then - msg2 "Applying BMQ agressive ondemand governor patch" - patch -Np1 -i "$srcdir"/0009-glitched-ondemand-bmq.patch - fi - - msg2 "Applying Glitched BMQ patch" - patch -Np1 -i "$srcdir"/0009-glitched-bmq.patch - - elif [ "${_cpusched}" = "cfs" ]; then - msg2 "Applying Glitched CFS patch" - patch -Np1 -i "$srcdir"/0003-glitched-cfs.patch - fi - - if [ "${_distro}" = "Arch" ]; then - if [ -z "${_configfile}" ]; then - _configfile="config.x86_64" - fi - - cat "${srcdir}/${_configfile}" > ./.config - fi - - - # Set some -tkg defaults - echo "# CONFIG_DYNAMIC_FAULT is not set" >> ./.config - sed -i -e 's/CONFIG_DEFAULT_FQ_CODEL=y/# CONFIG_DEFAULT_FQ_CODEL is not set/' 
./.config - echo "CONFIG_DEFAULT_CAKE=y" >> ./.config - echo "CONFIG_NR_TTY_DEVICES=63" >> ./.config - echo "# CONFIG_NTP_PPS is not set" >> ./.config - sed -i -e 's/CONFIG_CRYPTO_LZ4=m/CONFIG_CRYPTO_LZ4=y/' ./.config - sed -i -e 's/CONFIG_CRYPTO_LZ4HC=m/CONFIG_CRYPTO_LZ4HC=y/' ./.config - sed -i -e 's/CONFIG_LZ4_COMPRESS=m/CONFIG_LZ4_COMPRESS=y/' ./.config - sed -i -e 's/CONFIG_LZ4HC_COMPRESS=m/CONFIG_LZ4HC_COMPRESS=y/' ./.config - sed -i -e 's/CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZO=y/# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZO is not set/' ./.config - sed -i -e 's/# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZ4 is not set/CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZ4=y/' ./.config - sed -i -e 's/CONFIG_ZSWAP_COMPRESSOR_DEFAULT="lzo"/CONFIG_ZSWAP_COMPRESSOR_DEFAULT="lz4"/' ./.config - sed -i -e 's/CONFIG_RCU_BOOST_DELAY=500/CONFIG_RCU_BOOST_DELAY=0/' ./.config - sed -i -e 's/# CONFIG_CMDLINE_BOOL is not set/CONFIG_CMDLINE_BOOL=y/' ./.config - echo "CONFIG_CMDLINE=\"${_custom_commandline}\"" >> ./.config - echo "# CONFIG_CMDLINE_OVERRIDE is not set" >> ./.config - echo "# CONFIG_X86_P6_NOP is not set" >> ./.config - - # openrgb - echo "CONFIG_I2C_NCT6775=m" >> ./.config - - # ccache fix - if [ "$_noccache" != "true" ]; then - if { [ "$_distro" = "Arch" ] && pacman -Qq ccache &> /dev/null; } || { [ "$_distro" = "Ubuntu" ] && dpkg -l ccache > /dev/null; }; then - sed -i -e 's/CONFIG_GCC_PLUGINS=y/# CONFIG_GCC_PLUGINS is not set/' ./.config - fi - fi - # Skip dbg package creation on non-Arch - if [ "$_distro" != "Arch" ]; then - sed -i -e 's/CONFIG_DEBUG_INFO.*/CONFIG_DEBUG_INFO=n/' ./.config - fi - - if [ "$_font_autoselect" != "false" ]; then - sed -i -e 's/CONFIG_FONT_TER16x32=y/# CONFIG_FONT_TER16x32 is not set\nCONFIG_FONT_AUTOSELECT=y/' ./.config - fi - - # Inject cpuopts options - echo "# CONFIG_MK8SSE3 is not set" >> ./.config - echo "# CONFIG_MK10 is not set" >> ./.config - echo "# CONFIG_MBARCELONA is not set" >> ./.config - echo "# CONFIG_MBOBCAT is not set" >> ./.config - echo "# 
CONFIG_MJAGUAR is not set" >> ./.config - echo "# CONFIG_MBULLDOZER is not set" >> ./.config - echo "# CONFIG_MPILEDRIVER is not set" >> ./.config - echo "# CONFIG_MSTEAMROLLER is not set" >> ./.config - echo "# CONFIG_MEXCAVATOR is not set" >> ./.config - echo "# CONFIG_MZEN is not set" >> ./.config - echo "# CONFIG_MZEN2 is not set" >> ./.config - echo "# CONFIG_MATOM is not set" >> ./.config - echo "# CONFIG_MNEHALEM is not set" >> ./.config - echo "# CONFIG_MWESTMERE is not set" >> ./.config - echo "# CONFIG_MSILVERMONT is not set" >> ./.config - echo "# CONFIG_MSANDYBRIDGE is not set" >> ./.config - echo "# CONFIG_MIVYBRIDGE is not set" >> ./.config - echo "# CONFIG_MHASWELL is not set" >> ./.config - echo "# CONFIG_MBROADWELL is not set" >> ./.config - echo "# CONFIG_MSKYLAKE is not set" >> ./.config - echo "# CONFIG_MSKYLAKEX is not set" >> ./.config - echo "# CONFIG_MCANNONLAKE is not set" >> ./.config - echo "# CONFIG_MICELAKE is not set" >> ./.config - echo "# CONFIG_MGOLDMONT is not set" >> ./.config - echo "# CONFIG_MGOLDMONTPLUS is not set" >> ./.config - echo "# CONFIG_MCASCADELAKE is not set" >> ./.config - echo "# CONFIG_MCOOPERLAKE is not set" >> ./.config - echo "# CONFIG_MTIGERLAKE is not set" >> ./.config - - # Disable some debugging - if [ "${_debugdisable}" = "true" ]; then - sed -i -e 's/CONFIG_SLUB_DEBUG=y/# CONFIG_SLUB_DEBUG is not set/' ./.config - sed -i -e 's/CONFIG_PM_DEBUG=y/# CONFIG_PM_DEBUG is not set/' ./.config - sed -i -e 's/CONFIG_PM_ADVANCED_DEBUG=y/# CONFIG_PM_ADVANCED_DEBUG is not set/' ./.config - sed -i -e 's/CONFIG_PM_SLEEP_DEBUG=y/# CONFIG_PM_SLEEP_DEBUG is not set/' ./.config - sed -i -e 's/CONFIG_ACPI_DEBUG=y/# CONFIG_ACPI_DEBUG is not set/' ./.config - sed -i -e 's/CONFIG_SCHED_DEBUG=y/# CONFIG_SCHED_DEBUG is not set/' ./.config - sed -i -e 's/CONFIG_LATENCYTOP=y/# CONFIG_LATENCYTOP is not set/' ./.config - sed -i -e 's/CONFIG_DEBUG_PREEMPT=y/# CONFIG_DEBUG_PREEMPT is not set/' ./.config - fi - - if [ "${_cpusched}" = 
"MuQSS" ]; then - # MuQSS default config - echo "CONFIG_SCHED_MUQSS=y" >> ./.config - elif [ "${_cpusched}" = "pds" ]; then - # PDS default config - echo "CONFIG_SCHED_ALT=y" >> ./.config - echo "CONFIG_SCHED_PDS=y" >> ./.config - echo "# CONFIG_SCHED_BMQ is not set" >> ./.config - elif [ "${_cpusched}" = "bmq" ]; then - # BMQ default config - echo "CONFIG_SCHED_ALT=y" >> ./.config - echo "CONFIG_SCHED_BMQ=y" >> ./.config - echo "# CONFIG_SCHED_PDS is not set" >> ./.config - fi - - if [ "${_cpusched}" = "MuQSS" ] || [ "${_cpusched}" = "pds" ] || [ "${_cpusched}" = "bmq" ]; then - # Disable CFS - sed -i -e 's/CONFIG_FAIR_GROUP_SCHED=y/# CONFIG_FAIR_GROUP_SCHED is not set/' ./.config - sed -i -e 's/CONFIG_CFS_BANDWIDTH=y/# CONFIG_CFS_BANDWIDTH is not set/' ./.config - # sched yield type - if [ -n "$_sched_yield_type" ]; then - CONDITION0="$_sched_yield_type" - else - plain "" - plain "CPU sched_yield_type - Choose what sort of yield sched_yield will perform." - plain "" - plain "For PDS and MuQSS:" - plain "0: No yield." - plain "1: Yield only to better priority/deadline tasks." - plain "2: Expire timeslice and recalculate deadline." - plain "" - plain "For BMQ (experimental) - No recommended value yet, so try for yourself x) :" - plain "0: No yield." - plain "1: Deboost and requeue task. (default)" - plain "2: Set rq skip task." - if [ "${_cpusched}" = "MuQSS" ]; then - read -rp "`echo $'\n 0. Supposedly best option for gaming performance - could lead to stability issues on some (AMD) platforms when combined with MuQSS\n > 1. Default and recommended option for MuQSS - could lead to stability issues on some (Intel) platforms\n 2. Can be a good option with low rr_interval on MuQSS\n [0-2?]: '`" CONDITION0; - else - read -rp "`echo $'\n > 0. Recommended option for gaming on PDS - "tkg" default\n 1. Default, but can lead to stability issues on some platforms\n 2. 
Can be a good option with low rr_interval on MuQSS\n [0-2?]: '`" CONDITION0; - fi - fi - if [ "$CONDITION0" = "0" ]; then - if [ "${_cpusched}" = "bmq" ] || [ "${_cpusched}" = "pds" ]; then - sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 0;/' ./kernel/sched/alt_core.c - else - sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 0;/' ./kernel/sched/"${_cpusched}".c - fi - elif [ "$CONDITION0" = "1" ]; then - msg2 "Using default CPU sched yield type (1)" - elif [ "$CONDITION0" = "2" ]; then - if [ "${_cpusched}" = "bmq" ] || [ "${_cpusched}" = "pds" ]; then - sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 2;/' ./kernel/sched/alt_core.c - else - sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 2;/' ./kernel/sched/"${_cpusched}".c - fi - else - if [ "${_cpusched}" = "MuQSS" ]; then - msg2 "Using default CPU sched yield type (1)" - elif [ "${_cpusched}" = "bmq" ] || [ "${_cpusched}" = "pds" ]; then - sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 0;/' ./kernel/sched/alt_core.c - else - sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 0;/' ./kernel/sched/"${_cpusched}".c - fi - fi - fi - - # Round Robin interval - if [ "${_cpusched}" = "MuQSS" ] || [ "${_cpusched}" = "pds" ] || [ "${_cpusched}" = "bmq" ]; then - if [ -n "$_rr_interval" ]; then - CONDITION1="$_rr_interval" - else - plain "" - plain "Round Robin interval is the longest duration two tasks with the same nice level will" - plain "be delayed for. When CPU time is requested by a task, it receives a time slice equal" - plain "to the rr_interval in addition to a virtual deadline. When using yield_type 2, a low" - plain "value can help offset the disadvantages of rescheduling a process that has yielded." 
- plain "" - plain "MuQSS default: 6ms" - plain "PDS default: 4ms" - plain "BMQ default: 2ms" - read -rp "`echo $'\n > 0.Keep defaults\n 1.2ms\n 2.4ms\n 3.6ms\n 4.8ms\n [0-4?]: '`" CONDITION1; - fi - if [ "$CONDITION1" = "1" ]; then - msg2 "Using 2ms rr_interval" - _rrvalue="2" - elif [ "$CONDITION1" = "2" ]; then - msg2 "Using 4ms rr_interval" - _rrvalue="4" - elif [ "$CONDITION1" = "3" ]; then - msg2 "Using 6ms rr_interval" - _rrvalue="6" - elif [ "$CONDITION1" = "4" ]; then - msg2 "Using 8ms rr_interval" - _rrvalue="8" - else - msg2 "Using default rr_interval" - _rrvalue="default" - fi - if [ "$_rrvalue" != "default" ]; then - if [ "${_cpusched}" = "MuQSS" ]; then - sed -i -e "s/int rr_interval __read_mostly = 6;/int rr_interval __read_mostly = ${_rrvalue};/" ./kernel/sched/"${_cpusched}".c - elif [ "${_cpusched}" = "bmq" ] || [ "${_cpusched}" = "pds" ]; then - sed -i -e "s/u64 sched_timeslice_ns __read_mostly = (4 * 1000 * 1000);/u64 sched_timeslice_ns __read_mostly = (${_rrvalue} * 1000 * 1000);/" ./kernel/sched/alt_core.c - fi - else - if [ "${_cpusched}" = "bmq" ] || [ "${_cpusched}" = "pds" ]; then - sed -i -e "s/u64 sched_timeslice_ns __read_mostly = (4 * 1000 * 1000);/u64 sched_timeslice_ns __read_mostly = (2 * 1000 * 1000);/" ./kernel/sched/alt_core.c - fi - fi - fi - - # zenify - if [ "$_zenify" = "true" ]; then - echo "CONFIG_ZENIFY=y" >> ./.config - elif [ "$_zenify" = "false" ]; then - echo "# CONFIG_ZENIFY is not set" >> ./.config - fi - - # compiler optimization level - if [ "$_compileroptlevel" = "1" ]; then - echo "# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 is not set" >> ./.config - elif [ "$_compileroptlevel" = "2" ]; then - sed -i -e 's/CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y/# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE is not set/' ./.config - echo "CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y" >> ./.config - elif [ "$_compileroptlevel" = "3" ]; then - sed -i -e 's/CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y/# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE is not set/' ./.config 
- sed -i -e 's/# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set/CONFIG_CC_OPTIMIZE_FOR_SIZE=y/' ./.config - echo "# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 is not set" >> ./.config - fi - - # cpu opt - if [ -n "$_processor_opt" ] && [ "$_processor_opt" != "native" ]; then - echo "# CONFIG_MNATIVE is not set" >> ./.config - fi - - if [ -n "$_processor_opt" ] && [ "$_processor_opt" != "generic" ]; then - sed -i -e 's/CONFIG_GENERIC_CPU=y/# CONFIG_GENERIC_CPU is not set/' ./.config - fi - - if [ "$_processor_opt" = "native" ]; then - echo "CONFIG_MNATIVE=y" >> ./.config - elif [ "$_processor_opt" = "k8" ]; then - sed -i -e 's/# CONFIG_MK8 is not set/CONFIG_MK8=y/' ./.config - elif [ "$_processor_opt" = "k8sse3" ]; then - sed -i -e 's/# CONFIG_MK8SSE3 is not set/CONFIG_MK8SSE3=y/' ./.config - elif [ "$_processor_opt" = "k10" ]; then - sed -i -e 's/# CONFIG_MK10 is not set/CONFIG_MK10=y/' ./.config - elif [ "$_processor_opt" = "barcelona" ]; then - sed -i -e 's/# CONFIG_MBARCELONA is not set/CONFIG_MBARCELONA=y/' ./.config - elif [ "$_processor_opt" = "bobcat" ]; then - sed -i -e 's/# CONFIG_MBOBCAT is not set/CONFIG_MBOBCAT=y/' ./.config - elif [ "$_processor_opt" = "jaguar" ]; then - sed -i -e 's/# CONFIG_MJAGUAR is not set/CONFIG_MJAGUAR=y/' ./.config - elif [ "$_processor_opt" = "bulldozer" ]; then - sed -i -e 's/# CONFIG_MBULLDOZER is not set/CONFIG_MBULLDOZER=y/' ./.config - elif [ "$_processor_opt" = "piledriver" ]; then - sed -i -e 's/# CONFIG_MPILEDRIVER is not set/CONFIG_MPILEDRIVER=y/' ./.config - elif [ "$_processor_opt" = "steamroller" ]; then - sed -i -e 's/# CONFIG_MSTEAMROLLER is not set/CONFIG_MSTEAMROLLER=y/' ./.config - elif [ "$_processor_opt" = "excavator" ]; then - sed -i -e 's/# CONFIG_MEXCAVATOR is not set/CONFIG_MEXCAVATOR=y/' ./.config - elif [ "$_processor_opt" = "zen" ]; then - sed -i -e 's/# CONFIG_MZEN is not set/CONFIG_MZEN=y/' ./.config - elif [ "$_processor_opt" = "zen2" ]; then - sed -i -e 's/# CONFIG_MZEN2 is not set/CONFIG_MZEN2=y/' ./.config - 
elif [ "$_processor_opt" = "mpsc" ]; then - sed -i -e 's/# CONFIG_MPSC is not set/CONFIG_MPSC=y/' ./.config - elif [ "$_processor_opt" = "atom" ]; then - sed -i -e 's/# CONFIG_MATOM is not set/CONFIG_MATOM=y/' ./.config - elif [ "$_processor_opt" = "core2" ]; then - sed -i -e 's/# CONFIG_MCORE2 is not set/CONFIG_MCORE2=y/' ./.config - elif [ "$_processor_opt" = "nehalem" ]; then - sed -i -e 's/# CONFIG_MNEHALEM is not set/CONFIG_MNEHALEM=y/' ./.config - elif [ "$_processor_opt" = "westmere" ]; then - sed -i -e 's/# CONFIG_MWESTMERE is not set/CONFIG_MWESTMERE=y/' ./.config - elif [ "$_processor_opt" = "silvermont" ]; then - sed -i -e 's/# CONFIG_MSILVERMONT is not set/CONFIG_MSILVERMONT=y/' ./.config - elif [ "$_processor_opt" = "sandybridge" ]; then - sed -i -e 's/# CONFIG_MSANDYBRIDGE is not set/CONFIG_MSANDYBRIDGE=y/' ./.config - elif [ "$_processor_opt" = "ivybridge" ]; then - sed -i -e 's/# CONFIG_MIVYBRIDGE is not set/CONFIG_MIVYBRIDGE=y/' ./.config - elif [ "$_processor_opt" = "haswell" ]; then - sed -i -e 's/# CONFIG_MHASWELL is not set/CONFIG_MHASWELL=y/' ./.config - elif [ "$_processor_opt" = "broadwell" ]; then - sed -i -e 's/# CONFIG_MBROADWELL is not set/CONFIG_MBROADWELL=y/' ./.config - elif [ "$_processor_opt" = "skylake" ]; then - sed -i -e 's/# CONFIG_MSKYLAKE is not set/CONFIG_MSKYLAKE=y/' ./.config - elif [ "$_processor_opt" = "skylakex" ]; then - sed -i -e 's/# CONFIG_MSKYLAKEX is not set/CONFIG_MSKYLAKEX=y/' ./.config - elif [ "$_processor_opt" = "cannonlake" ]; then - sed -i -e 's/# CONFIG_MCANNONLAKE is not set/CONFIG_MCANNONLAKE=y/' ./.config - elif [ "$_processor_opt" = "icelake" ]; then - sed -i -e 's/# CONFIG_MICELAKE is not set/CONFIG_MICELAKE=y/' ./.config - elif [ "$_processor_opt" = "goldmont" ]; then - sed -i -e 's/# CONFIG_MGOLDMONT is not set/CONFIG_MGOLDMONT=y/' ./.config - elif [ "$_processor_opt" = "goldmontplus" ]; then - sed -i -e 's/# CONFIG_MGOLDMONTPLUS is not set/CONFIG_MGOLDMONTPLUS=y/' ./.config - elif [ 
"$_processor_opt" = "cascadelake" ]; then - sed -i -e 's/# CONFIG_MCASCADELAKE is not set/CONFIG_MCASCADELAKE=y/' ./.config - elif [ "$_processor_opt" = "cooperlake" ]; then - sed -i -e 's/# CONFIG_MCOOPERLAKE is not set/CONFIG_MCOOPERLAKE=y/' ./.config - elif [ "$_processor_opt" = "tigerlake" ]; then - sed -i -e 's/# CONFIG_MTIGERLAKE is not set/CONFIG_MTIGERLAKE=y/' ./.config - fi - - # irq threading - if [ "$_irq_threading" = "true" ]; then - echo "CONFIG_FORCE_IRQ_THREADING=y" >> ./.config - elif [ "$_irq_threading" = "false" ]; then - echo "# CONFIG_FORCE_IRQ_THREADING is not set" >> ./.config - fi - - # smt nice - if [ "$_smt_nice" = "true" ]; then - echo "CONFIG_SMT_NICE=y" >> ./.config - elif [ "$_smt_nice" = "false" ]; then - echo "# CONFIG_SMT_NICE is not set" >> ./.config - fi - - # random trust cpu - if [ "$_random_trust_cpu" = "true" ]; then - sed -i -e 's/# CONFIG_RANDOM_TRUST_CPU is not set/CONFIG_RANDOM_TRUST_CPU=y/' ./.config - fi - - # rq sharing - if [ "$_runqueue_sharing" = "none" ]; then - echo -e "CONFIG_RQ_NONE=y\n# CONFIG_RQ_SMT is not set\n# CONFIG_RQ_MC is not set\n# CONFIG_RQ_MC_LLC is not set\n# CONFIG_RQ_SMP is not set\n# CONFIG_RQ_ALL is not set" >> ./.config - elif [ -z "$_runqueue_sharing" ] || [ "$_runqueue_sharing" = "smt" ]; then - echo -e "# CONFIG_RQ_NONE is not set\nCONFIG_RQ_SMT=y\n# CONFIG_RQ_MC is not set\n# CONFIG_RQ_MC_LLC is not set\n# CONFIG_RQ_SMP is not set\n# CONFIG_RQ_ALL is not set" >> ./.config - elif [ "$_runqueue_sharing" = "mc" ]; then - echo -e "# CONFIG_RQ_NONE is not set\n# CONFIG_RQ_SMT is not set\nCONFIG_RQ_MC=y\n# CONFIG_RQ_MC_LLC is not set\n# CONFIG_RQ_SMP is not set\n# CONFIG_RQ_ALL is not set" >> ./.config - elif [ "$_runqueue_sharing" = "smp" ]; then - echo -e "# CONFIG_RQ_NONE is not set\n# CONFIG_RQ_SMT is not set\n# CONFIG_RQ_MC is not set\n# CONFIG_RQ_MC_LLC is not set\nCONFIG_RQ_SMP=y\n# CONFIG_RQ_ALL is not set" >> ./.config - elif [ "$_runqueue_sharing" = "all" ]; then - echo -e "# 
CONFIG_RQ_NONE is not set\n# CONFIG_RQ_SMT is not set\n# CONFIG_RQ_MC is not set\n# CONFIG_RQ_MC_LLC is not set\n# CONFIG_RQ_SMP is not set\nCONFIG_RQ_ALL=y" >> ./.config - elif [ "$_runqueue_sharing" = "mc-llc" ]; then - echo -e "# CONFIG_RQ_NONE is not set\n# CONFIG_RQ_SMT is not set\n# CONFIG_RQ_MC is not set\nCONFIG_RQ_MC_LLC=y\n# CONFIG_RQ_SMP is not set\n# CONFIG_RQ_ALL is not set" >> ./.config - fi - - # timer freq - if [ -n "$_timer_freq" ] && [ "$_timer_freq" != "300" ]; then - sed -i -e 's/CONFIG_HZ_300=y/# CONFIG_HZ_300 is not set/' ./.config - sed -i -e 's/CONFIG_HZ_300_NODEF=y/# CONFIG_HZ_300_NODEF is not set/' ./.config - if [ "$_timer_freq" = "1000" ]; then - sed -i -e 's/# CONFIG_HZ_1000 is not set/CONFIG_HZ_1000=y/' ./.config - sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=1000/' ./.config - echo "# CONFIG_HZ_500 is not set" >> ./.config - echo "# CONFIG_HZ_500_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_750 is not set" >> ./.config - echo "# CONFIG_HZ_750_NODEF is not set" >> ./.config - echo "CONFIG_HZ_1000_NODEF=y" >> ./.config - echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config - elif [ "$_timer_freq" = "750" ]; then - sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=750/' ./.config - echo "# CONFIG_HZ_500 is not set" >> ./.config - echo "# CONFIG_HZ_500_NODEF is not set" >> ./.config - echo "CONFIG_HZ_750=y" >> ./.config - echo "CONFIG_HZ_750_NODEF=y" >> ./.config - echo "# CONFIG_HZ_1000_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config - elif [ "$_timer_freq" = "500" ]; then - sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=500/' ./.config - echo "CONFIG_HZ_500=y" >> ./.config - echo "CONFIG_HZ_500_NODEF=y" >> ./.config - echo "# CONFIG_HZ_750 is not set" >> ./.config - echo "# CONFIG_HZ_750_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_1000_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_250_NODEF is not set" 
>> ./.config - echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config - elif [ "$_timer_freq" = "100" ]; then - sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=100/' ./.config - echo "# CONFIG_HZ_500 is not set" >> ./.config - echo "# CONFIG_HZ_750 is not set" >> ./.config - echo "# CONFIG_HZ_1000_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_750_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_500_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config - echo "CONFIG_HZ_100=y" >> ./.config - echo "CONFIG_HZ_100_NODEF=y" >> ./.config - fi - elif [ "${_cpusched}" = "MuQSS" ] && [ -z "$_timer_freq" ]; then - sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=100/' ./.config - echo "# CONFIG_HZ_500 is not set" >> ./.config - echo "# CONFIG_HZ_750 is not set" >> ./.config - echo "# CONFIG_HZ_1000_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_750_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_500_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config - echo "CONFIG_HZ_100=y" >> ./.config - echo "CONFIG_HZ_100_NODEF=y" >> ./.config - else - sed -i -e 's/CONFIG_HZ_300=y/# CONFIG_HZ_300 is not set/' ./.config - sed -i -e 's/CONFIG_HZ_300_NODEF=y/# CONFIG_HZ_300_NODEF is not set/' ./.config - sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=500/' ./.config - echo "CONFIG_HZ_500=y" >> ./.config - echo "CONFIG_HZ_500_NODEF=y" >> ./.config - echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config - fi - - # default cpu gov - if [ "$_default_cpu_gov" = "performance" ]; then - sed -i -e 's/CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL=y/# CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL is not set/' ./.config - sed -i -e 's/# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set/CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y/' ./.config - elif [ "$_default_cpu_gov" = "ondemand" ]; then - sed -i -e 
's/CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL=y/# CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL is not set/' ./.config - sed -i -e 's/# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set/CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y/' ./.config - fi - - # ACPI_CPUFREQ disablement - if [ "$_disable_acpi_cpufreq" = "true" ]; then - sed -i -e 's/CONFIG_X86_ACPI_CPUFREQ=m/# CONFIG_X86_ACPI_CPUFREQ is not set/' ./.config - fi - - # ftrace - if [ -z "$_ftracedisable" ]; then - plain "" - plain "Disable FUNCTION_TRACER/GRAPH_TRACER? Lowers overhead but limits debugging" - plain "and analyzing of kernel functions." - read -rp "`echo $' > N/y : '`" CONDITION2; - fi - if [[ "$CONDITION2" =~ [yY] ]] || [ "$_ftracedisable" = "true" ]; then - sed -i -e 's/CONFIG_FUNCTION_TRACER=y/# CONFIG_FUNCTION_TRACER is not set/' ./.config - sed -i -e 's/CONFIG_FUNCTION_GRAPH_TRACER=y/# CONFIG_FUNCTION_GRAPH_TRACER is not set/' ./.config - fi - - # disable numa - if [ -z "$_numadisable" ]; then - plain "" - plain "Disable NUMA? Lowers overhead, but breaks CUDA/NvEnc on Nvidia if disabled." - plain "https://bbs.archlinux.org/viewtopic.php?id=239174" - read -rp "`echo $' > N/y : '`" CONDITION3; - fi - if [[ "$CONDITION3" =~ [yY] ]] || [ "$_numadisable" = "true" ]; then - # disable NUMA since 99.9% of users do not have multiple CPUs but do have multiple cores in one CPU - sed -i -e 's/CONFIG_NUMA=y/# CONFIG_NUMA is not set/' \ - -i -e '/CONFIG_AMD_NUMA=y/d' \ - -i -e '/CONFIG_X86_64_ACPI_NUMA=y/d' \ - -i -e '/CONFIG_NODES_SPAN_OTHER_NODES=y/d' \ - -i -e '/# CONFIG_NUMA_EMU is not set/d' \ - -i -e '/CONFIG_NODES_SHIFT=6/d' \ - -i -e '/CONFIG_NEED_MULTIPLE_NODES=y/d' \ - -i -e '/CONFIG_USE_PERCPU_NUMA_NODE_ID=y/d' \ - -i -e '/CONFIG_ACPI_NUMA=y/d' ./.config - fi - - # tickless - if [ -z "$_tickless" ]; then - plain "" - plain "Use CattaRappa mode (Tickless/Dynticks) ?" - plain "Can give higher performances in many cases but lower consistency on some hardware." 
- plain "Just tickless idle can perform better with some platforms (mostly AMD) or CPU schedulers (mostly MuQSS)." - if [ "${_cpusched}" = "MuQSS" ]; then - read -rp "`echo $'\n 0.No, use periodic ticks\n 1.Yes, full tickless baby!\n > 2.Just tickless idle plz\n [0-2?]: '`" CONDITION4; - else - read -rp "`echo $'\n 0.No, use periodic ticks\n > 1.Yes, full tickless baby!\n 2.Just tickless idle plz\n [0-2?]: '`" CONDITION4; - fi - fi - if [ "$CONDITION4" = "0" ] || [ "$_tickless" = "0" ]; then - echo "# CONFIG_NO_HZ_FULL_NODEF is not set" >> ./.config - sed -i -e 's/# CONFIG_HZ_PERIODIC is not set/CONFIG_HZ_PERIODIC=y/' ./.config - sed -i -e 's/CONFIG_NO_HZ_IDLE=y/# CONFIG_NO_HZ_IDLE is not set/' ./.config - sed -i -e 's/CONFIG_NO_HZ_FULL=y/# CONFIG_NO_HZ_FULL is not set/' ./.config - sed -i -e 's/CONFIG_NO_HZ=y/# CONFIG_NO_HZ is not set/' ./.config - sed -i -e 's/CONFIG_NO_HZ_COMMON=y/# CONFIG_NO_HZ_COMMON is not set/' ./.config - elif [ "$CONDITION4" = "2" ] || [ "$_tickless" = "2" ]; then - echo "# CONFIG_NO_HZ_FULL_NODEF is not set" >> ./.config - sed -i -e 's/CONFIG_HZ_PERIODIC=y/# CONFIG_HZ_PERIODIC is not set/' ./.config - sed -i -e 's/# CONFIG_NO_HZ_IDLE is not set/CONFIG_NO_HZ_IDLE=y/' ./.config - sed -i -e 's/CONFIG_NO_HZ_FULL=y/# CONFIG_NO_HZ_FULL is not set/' ./.config - sed -i -e 's/# CONFIG_NO_HZ is not set/CONFIG_NO_HZ=y/' ./.config - sed -i -e 's/# CONFIG_NO_HZ_COMMON is not set/CONFIG_NO_HZ_COMMON=y/' ./.config - else - if [ "${_cpusched}" = "MuQSS" ]; then - echo "# CONFIG_NO_HZ_FULL_NODEF is not set" >> ./.config - sed -i -e 's/CONFIG_HZ_PERIODIC=y/# CONFIG_HZ_PERIODIC is not set/' ./.config - sed -i -e 's/# CONFIG_NO_HZ_IDLE is not set/CONFIG_NO_HZ_IDLE=y/' ./.config - sed -i -e 's/CONFIG_NO_HZ_FULL=y/# CONFIG_NO_HZ_FULL is not set/' ./.config - sed -i -e 's/# CONFIG_NO_HZ is not set/CONFIG_NO_HZ=y/' ./.config - sed -i -e 's/# CONFIG_NO_HZ_COMMON is not set/CONFIG_NO_HZ_COMMON=y/' ./.config - else - echo "CONFIG_NO_HZ_FULL_NODEF=y" >> ./.config - 
sed -i -e 's/CONFIG_HZ_PERIODIC=y/# CONFIG_HZ_PERIODIC is not set/' ./.config - sed -i -e 's/CONFIG_NO_HZ_IDLE=y/# CONFIG_NO_HZ_IDLE is not set/' ./.config - sed -i -e 's/# CONFIG_NO_HZ_FULL is not set/CONFIG_NO_HZ_FULL=y/' ./.config - sed -i -e 's/# CONFIG_NO_HZ is not set/CONFIG_NO_HZ=y/' ./.config - sed -i -e 's/# CONFIG_NO_HZ_COMMON is not set/CONFIG_NO_HZ_COMMON=y/' ./.config - echo "CONFIG_CONTEXT_TRACKING=y" >> ./.config - echo "# CONFIG_CONTEXT_TRACKING_FORCE is not set" >> ./.config - fi - fi - - # voluntary preempt - if [ -z "$_voluntary_preempt" ]; then - plain "" - plain "Use explicit preemption points?" - plain "It can improve latency on PDS (at the cost of throughput)" - plain "and improve throughput on other schedulers (at the cost of latency)" - read -rp "`echo $' > N/y : '`" CONDITION5; - fi - if [[ "$CONDITION5" =~ [yY] ]] || [ "$_voluntary_preempt" = "true" ]; then - sed -i -e 's/CONFIG_PREEMPT=y/# CONFIG_PREEMPT is not set/' ./.config - sed -i -e 's/CONFIG_PREEMPT_LL=y/# CONFIG_PREEMPT_LL is not set/' ./.config - sed -i -e 's/# CONFIG_PREEMPT_VOLUNTARY is not set/CONFIG_PREEMPT_VOLUNTARY=y/' ./.config - fi - - # Open Firmware support - if [ -z "$_OFenable" ]; then - plain "" - plain "Enable Device Tree and Open Firmware support?" - read -rp "`echo $' > N/y : '`" CONDITION6; - fi - if [[ "$CONDITION6" =~ [yY] ]] || [ "$_OFenable" = "true" ]; then - sed -i -e 's/# CONFIG_OF is not set/CONFIG_OF=y/' ./.config - fi - - # acs override - if [ -z "$_acs_override" ]; then - plain "" - plain "Use ACS override patch?" - plain "https://wiki.archlinux.org/index.php/PCI_passthrough_via_OVMF#Bypassing_the_IOMMU_groups_.28ACS_override_patch.29" - read -rp "`echo $' > N/y : '`" CONDITION7; - fi - if [[ "$CONDITION7" =~ [yY] ]] || [ "$_acs_override" = "true" ]; then - msg2 "Patching ACS override" - patch -Np1 -i "$srcdir"/0006-add-acs-overrides_iommu.patch - fi - - # bcachefs - #if [ -z "$_bcachefs" ]; then - # plain "" - # plain "Add Bcache filesystem support? 
You'll have to install bcachefs-tools-git from AUR for utilities." - # plain "https://bcachefs.org/" - # read -rp "`echo $' > N/y : '`" CONDITION8; - #fi - #if [[ "$CONDITION8" =~ [yY] ]] || [ "$_bcachefs" = "true" ]; then - # msg2 "Patching Bcache filesystem support override" - # patch -Np1 -i "$srcdir"/0008-5.10-bcachefs.patch - # echo "CONFIG_BCACHEFS_FS=m" >> ./.config - # echo "CONFIG_BCACHEFS_QUOTA=y" >> ./.config - # echo "CONFIG_BCACHEFS_POSIX_ACL=y" >> ./.config - # echo "# CONFIG_BCACHEFS_DEBUG is not set" >> ./.config - # echo "# CONFIG_BCACHEFS_TESTS is not set" >> ./.config - # echo "# CONFIG_DEBUG_CLOSURES is not set" >> ./.config - #fi - - # fsync support - if [ -z "$_fsync" ]; then - plain "" - plain "Enable support for fsync, an experimental replacement for esync in Valve Proton 4.11+" - plain "https://steamcommunity.com/games/221410/announcements/detail/2957094910196249305" - read -rp "`echo $' > N/y : '`" CONDITION9; - fi - if [[ "$CONDITION9" =~ [yY] ]] || [ "$_fsync" = "true" ]; then - msg2 "Patching Fsync support" - patch -Np1 -i "$srcdir"/0007-v5.10-fsync.patch - fi - - # ZFS fix - if [ -z "$_zfsfix" ]; then - plain "" - plain "Add back missing symbol for AES-NI/AVX support on ZFS" - plain "https://github.com/NixOS/nixpkgs/blob/master/pkgs/os-specific/linux/kernel/export_kernel_fpu_functions_5_3.patch" - read -rp "`echo $' > N/y : '`" CONDITION11; - fi - if [[ "$CONDITION11" =~ [yY] ]] || [ "$_zfsfix" = "true" ]; then - msg2 "Patching missing symbol for AES-NI/AVX support on ZFS" - patch -Np1 -i "$srcdir"/0011-ZFS-fix.patch - fi - - # Community patches - if [ -n "$_community_patches" ]; then - if [ ! -d "$_where/../../community-patches" ]; then - cd "$_where/../.." 
&& git clone https://github.com/Frogging-Family/community-patches.git && cd "${srcdir}/${_srcpath}" - fi - _community_patches=($_community_patches) - for _p in ${_community_patches[@]}; do - ln -s "$_where"/../../community-patches/linux"$_basever"-tkg/$_p "$_where"/ - done - fi - - # userpatches - if [ "$_user_patches" = "true" ]; then - _userpatch_target="linux-${_basekernel}" - _userpatch_ext="my" - user_patcher - fi - - # Community patches removal - for _p in ${_community_patches[@]}; do - rm -f "$_where"/$_p - done - - if [ "$_distro" = "Arch" ]; then - # don't run depmod on 'make install'. We'll do this ourselves in packaging - sed -i '2iexit 0' scripts/depmod.sh - - # get kernel version - make prepare - fi - - # modprobed-db - if [ -z "$_modprobeddb" ]; then - plain "" - plain "Use modprobed db to clean config from unneeded modules?" - plain "Speeds up compilation considerably. Requires root." - plain "https://wiki.archlinux.org/index.php/Modprobed-db" - plain "!!!! Make sure to have a well populated db !!!!" - read -rp "`echo $' > N/y : '`" CONDITIONMPDB; - fi - if [[ "$CONDITIONMPDB" =~ [yY] ]] || [ "$_modprobeddb" = "true" ]; then - sudo modprobed-db recall - yes "" | make localmodconfig - fi - - if [ true = "$_config_fragments" ]; then - local fragments=() - mapfile -d '' -t fragments < <(find "$_where"/ -type f -name "*.myfrag" -print0) - - if [ true = "$_config_fragments_no_confirm" ]; then - printf 'Using config fragment %s\n' "${fragments[@]#$_where/}" - else - for i in "${!fragments[@]}"; do - while true; do - read -r -p 'Found config fragment '"${fragments[$i]#$_where/}"', apply it? 
[y/N] ' CONDITIONMPDB - CONDITIONMPDB="$(printf '%s' "$CONDITIONMPDB" | tr '[:upper:]' '[:lower:]')" - case "$CONDITIONMPDB" in - y|yes) - break;; - n|no|'') - unset fragments[$i] - break;; - *) - echo 'Please answer with yes or no' - esac - done - done - fi - - if [ 0 -lt "${#fragments[@]}" ]; then - scripts/kconfig/merge_config.sh -m .config "${fragments[@]}" - fi - fi - - # menuconfig / nconfig - if [ -z "$_menunconfig" ]; then - plain "" - plain "*Optional* For advanced users - Do you want to use make menuconfig or nconfig" - plain "to configure the kernel before building it?" - plain "If you do, make sure your terminal is currently" - plain "at least 19 lines by 80 columns large or you'll get an error :D" - read -rp "`echo $' > 0. nope\n 1. menuconfig\n 2. nconfig\n 3. xconfig\n choice[0-3?]: '`" CONDITIONMNC; - _menunconfig="$CONDITIONMNC" - fi - if [ 1 = "$_menunconfig" ]; then - cp .config .config.orig - make menuconfig - elif [ 2 = "$_menunconfig" ]; then - cp .config .config.orig - make nconfig - elif [ 3 = "$_menunconfig" ]; then - cp .config .config.orig - make xconfig - else - # rewrite configuration - yes "" | make config >/dev/null - fi - if [ 1 = "$_menunconfig" ] || [ 2 = "$_menunconfig" ] || [ 3 = "$_menunconfig" ]; then - if [ -z "${_diffconfig}" ]; then - while true; do - read -r -p 'Generate a config fragment from your changes? [y/N] ' CONDITIONF - CONDITIONF="$(printf '%s' "$CONDITIONF" | tr '[:upper:]' '[:lower:]')" - case "$CONDITIONF" in - y|yes) - _diffconfig=true - break;; - n|no|'') - _diffconfig=false - break;; - *) - echo 'Please answer with yes or no' - esac - done - fi - if [ true = "$_diffconfig" ]; then - if [ -z "$_diffconfig_name" ]; then - IFS= read -r -p 'Filename for the config fragment [leave empty to not generate fragment]: ' _diffconfig_name - fi - if [ -z "$_diffconfig_name" ]; then - echo 'No file name given, not generating config fragment.' 
- else ( - prev_pwd="${PWD:-$(pwd)}" - cd "$_where" - "${prev_pwd}/scripts/diffconfig" -m "${prev_pwd}/.config.orig" "${prev_pwd}/.config" > "$_diffconfig_name" - ) fi - fi - rm .config.orig - fi - - if [ "$_distro" = "Arch" ]; then - make -s kernelrelease > version - msg2 "Prepared %s version %s" "$pkgbase" "$( -From: Serge Hallyn -Date: Fri, 31 May 2013 19:12:12 +0100 -Subject: [PATCH] add sysctl to disallow unprivileged CLONE_NEWUSER by default - -Signed-off-by: Serge Hallyn -[bwh: Remove unneeded binary sysctl bits] -Signed-off-by: Daniel Micay ---- - kernel/fork.c | 15 +++++++++++++++ - kernel/sysctl.c | 12 ++++++++++++ - kernel/user_namespace.c | 3 +++ - 3 files changed, 30 insertions(+) - -diff --git a/kernel/fork.c b/kernel/fork.c -index 07cc743698d3668e..4011d68a8ff9305c 100644 ---- a/kernel/fork.c -+++ b/kernel/fork.c -@@ -102,6 +102,11 @@ - - #define CREATE_TRACE_POINTS - #include -+#ifdef CONFIG_USER_NS -+extern int unprivileged_userns_clone; -+#else -+#define unprivileged_userns_clone 0 -+#endif - - /* - * Minimum number of threads to boot the kernel -@@ -1555,6 +1560,10 @@ static __latent_entropy struct task_struct *copy_process( - if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) - return ERR_PTR(-EINVAL); - -+ if ((clone_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) -+ if (!capable(CAP_SYS_ADMIN)) -+ return ERR_PTR(-EPERM); -+ - /* - * Thread groups must share signals as well, and detached threads - * can only be started up within the thread group. 
-@@ -2348,6 +2357,12 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) - if (unshare_flags & CLONE_NEWNS) - unshare_flags |= CLONE_FS; - -+ if ((unshare_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) { -+ err = -EPERM; -+ if (!capable(CAP_SYS_ADMIN)) -+ goto bad_unshare_out; -+ } -+ - err = check_unshare_flags(unshare_flags); - if (err) - goto bad_unshare_out; -diff --git a/kernel/sysctl.c b/kernel/sysctl.c -index b86520ed3fb60fbf..f7dab3760839f1a1 100644 ---- a/kernel/sysctl.c -+++ b/kernel/sysctl.c -@@ -105,6 +105,9 @@ extern int core_uses_pid; - - #if defined(CONFIG_SYSCTL) - -+#ifdef CONFIG_USER_NS -+extern int unprivileged_userns_clone; -+#endif - /* Constants used for minimum and maximum */ - #ifdef CONFIG_LOCKUP_DETECTOR - static int sixty = 60; -@@ -513,6 +516,15 @@ static struct ctl_table kern_table[] = { - .proc_handler = proc_dointvec, - }, - #endif -+#ifdef CONFIG_USER_NS -+ { -+ .procname = "unprivileged_userns_clone", -+ .data = &unprivileged_userns_clone, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = proc_dointvec, -+ }, -+#endif - #ifdef CONFIG_PROC_SYSCTL - { - .procname = "tainted", -diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c -index c490f1e4313b998a..dd03bd39d7bf194d 100644 ---- a/kernel/user_namespace.c -+++ b/kernel/user_namespace.c -@@ -24,6 +24,9 @@ - #include - #include - -+/* sysctl */ -+int unprivileged_userns_clone; -+ - static struct kmem_cache *user_ns_cachep __read_mostly; - static DEFINE_MUTEX(userns_state_mutex); - --- -2.15.1 - -From b5202296055dd333db4425120d3f93ef4e6a0573 Mon Sep 17 00:00:00 2001 -From: "Jan Alexander Steffens (heftig)" -Date: Thu, 7 Dec 2017 13:50:48 +0100 -Subject: ZEN: Add CONFIG for unprivileged_userns_clone - -This way our default behavior continues to match the vanilla kernel. 
---- - init/Kconfig | 16 ++++++++++++++++ - kernel/user_namespace.c | 4 ++++ - 2 files changed, 20 insertions(+) - -diff --git a/init/Kconfig b/init/Kconfig -index 4592bf7997c0..f3df02990aff 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -1004,6 +1004,22 @@ config USER_NS - - If unsure, say N. - -+config USER_NS_UNPRIVILEGED -+ bool "Allow unprivileged users to create namespaces" -+ default y -+ depends on USER_NS -+ help -+ When disabled, unprivileged users will not be able to create -+ new namespaces. Allowing users to create their own namespaces -+ has been part of several recent local privilege escalation -+ exploits, so if you need user namespaces but are -+ paranoid^Wsecurity-conscious you want to disable this. -+ -+ This setting can be overridden at runtime via the -+ kernel.unprivileged_userns_clone sysctl. -+ -+ If unsure, say Y. -+ - config PID_NS - bool "PID Namespaces" - default y -diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c -index 6b9dbc257e34..107b17f0d528 100644 ---- a/kernel/user_namespace.c -+++ b/kernel/user_namespace.c -@@ -27,7 +27,11 @@ - #include - - /* sysctl */ -+#ifdef CONFIG_USER_NS_UNPRIVILEGED -+int unprivileged_userns_clone = 1; -+#else - int unprivileged_userns_clone; -+#endif - - static struct kmem_cache *user_ns_cachep __read_mostly; - static DEFINE_MUTEX(userns_state_mutex); diff --git a/linux510-rc-tkg/linux510-tkg-patches/0002-clear-patches.patch b/linux510-rc-tkg/linux510-tkg-patches/0002-clear-patches.patch deleted file mode 100644 index 22a32f5..0000000 --- a/linux510-rc-tkg/linux510-tkg-patches/0002-clear-patches.patch +++ /dev/null @@ -1,360 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Arjan van de Ven -Date: Mon, 14 Mar 2016 11:10:58 -0600 -Subject: [PATCH] pci pme wakeups - -Reduce wakeups for PME checks, which are a workaround for miswired -boards (sadly, too many of them) in laptops. 
---- - drivers/pci/pci.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c -index c9338f9..6974fbf 100644 ---- a/drivers/pci/pci.c -+++ b/drivers/pci/pci.c -@@ -62,7 +62,7 @@ struct pci_pme_device { - struct pci_dev *dev; - }; - --#define PME_TIMEOUT 1000 /* How long between PME checks */ -+#define PME_TIMEOUT 4000 /* How long between PME checks */ - - static void pci_dev_d3_sleep(struct pci_dev *dev) - { --- -https://clearlinux.org - -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Arjan van de Ven -Date: Sat, 19 Mar 2016 21:32:19 -0400 -Subject: [PATCH] intel_idle: tweak cpuidle cstates - -Increase target_residency in cpuidle cstate - -Tune intel_idle to be a bit less agressive; -Clear linux is cleaner in hygiene (wakupes) than the average linux, -so we can afford changing these in a way that increases -performance while keeping power efficiency ---- - drivers/idle/intel_idle.c | 44 +++++++++++++++++++-------------------- - 1 file changed, 22 insertions(+), 22 deletions(-) - -diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c -index f449584..c994d24 100644 ---- a/drivers/idle/intel_idle.c -+++ b/drivers/idle/intel_idle.c -@@ -531,7 +531,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { - .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, - .exit_latency = 10, -- .target_residency = 20, -+ .target_residency = 120, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -539,7 +539,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { - .desc = "MWAIT 0x10", - .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 33, -- .target_residency = 100, -+ .target_residency = 900, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -547,7 +547,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { - .desc = "MWAIT 0x20", - .flags = MWAIT2flg(0x20) | 
CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 133, -- .target_residency = 400, -+ .target_residency = 1000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -555,7 +555,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { - .desc = "MWAIT 0x32", - .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 166, -- .target_residency = 500, -+ .target_residency = 1500, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -563,7 +563,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { - .desc = "MWAIT 0x40", - .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 300, -- .target_residency = 900, -+ .target_residency = 2000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -571,7 +571,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { - .desc = "MWAIT 0x50", - .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 600, -- .target_residency = 1800, -+ .target_residency = 5000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -579,7 +579,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { - .desc = "MWAIT 0x60", - .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 2600, -- .target_residency = 7700, -+ .target_residency = 9000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -599,7 +599,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { - .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, - .exit_latency = 10, -- .target_residency = 20, -+ .target_residency = 120, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -607,7 +607,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { - .desc = "MWAIT 0x10", - .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 40, -- .target_residency = 100, -+ .target_residency = 1000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, 
}, - { -@@ -615,7 +615,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { - .desc = "MWAIT 0x20", - .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 133, -- .target_residency = 400, -+ .target_residency = 1000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -623,7 +623,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { - .desc = "MWAIT 0x32", - .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 166, -- .target_residency = 500, -+ .target_residency = 2000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -631,7 +631,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { - .desc = "MWAIT 0x40", - .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 300, -- .target_residency = 900, -+ .target_residency = 4000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -639,7 +639,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { - .desc = "MWAIT 0x50", - .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 600, -- .target_residency = 1800, -+ .target_residency = 7000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -647,7 +647,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { - .desc = "MWAIT 0x60", - .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 2600, -- .target_residency = 7700, -+ .target_residency = 9000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -668,7 +668,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { - .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, - .exit_latency = 10, -- .target_residency = 20, -+ .target_residency = 120, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -676,7 +676,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { - .desc = "MWAIT 0x10", - .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, - 
.exit_latency = 70, -- .target_residency = 100, -+ .target_residency = 1000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -684,7 +684,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { - .desc = "MWAIT 0x20", - .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 85, -- .target_residency = 200, -+ .target_residency = 600, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -692,7 +692,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { - .desc = "MWAIT 0x33", - .flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 124, -- .target_residency = 800, -+ .target_residency = 3000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -700,7 +700,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { - .desc = "MWAIT 0x40", - .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 200, -- .target_residency = 800, -+ .target_residency = 3200, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -708,7 +708,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { - .desc = "MWAIT 0x50", - .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 480, -- .target_residency = 5000, -+ .target_residency = 9000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -716,7 +716,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { - .desc = "MWAIT 0x60", - .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 890, -- .target_residency = 5000, -+ .target_residency = 9000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -737,7 +737,7 @@ static struct cpuidle_state skx_cstates[] __initdata = { - .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, - .exit_latency = 10, -- .target_residency = 20, -+ .target_residency = 300, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { --- 
-https://clearlinux.org - -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Arjan van de Ven -Date: Fri, 6 Jan 2017 15:34:09 +0000 -Subject: [PATCH] ipv4/tcp: allow the memory tuning for tcp to go a little - bigger than default - ---- - net/ipv4/tcp.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c -index 30c1142..4345075 100644 ---- a/net/ipv4/tcp.c -+++ b/net/ipv4/tcp.c -@@ -4201,8 +4201,8 @@ void __init tcp_init(void) - tcp_init_mem(); - /* Set per-socket limits to no more than 1/128 the pressure threshold */ - limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); -- max_wshare = min(4UL*1024*1024, limit); -- max_rshare = min(6UL*1024*1024, limit); -+ max_wshare = min(16UL*1024*1024, limit); -+ max_rshare = min(16UL*1024*1024, limit); - - init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; - init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; --- -https://clearlinux.org - -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Arjan van de Ven -Date: Sun, 18 Feb 2018 23:35:41 +0000 -Subject: [PATCH] locking: rwsem: spin faster - -tweak rwsem owner spinning a bit ---- - kernel/locking/rwsem.c | 4 +++- - 1 file changed, 3 insertions(+), 1 deletion(-) - -diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c -index f11b9bd..1bbfcc1 100644 ---- a/kernel/locking/rwsem.c -+++ b/kernel/locking/rwsem.c -@@ -717,6 +717,7 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) - struct task_struct *new, *owner; - unsigned long flags, new_flags; - enum owner_state state; -+ int i = 0; - - owner = rwsem_owner_flags(sem, &flags); - state = rwsem_owner_state(owner, flags, nonspinnable); -@@ -750,7 +751,8 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) - break; - } - -- cpu_relax(); -+ if (i++ > 1000) -+ cpu_relax(); - } - rcu_read_unlock(); - --- -https://clearlinux.org - -From 0000000000000000000000000000000000000000 Mon 
Sep 17 00:00:00 2001 -From: Arjan van de Ven -Date: Thu, 2 Jun 2016 23:36:32 -0500 -Subject: [PATCH] initialize ata before graphics - -ATA init is the long pole in the boot process, and its asynchronous. -move the graphics init after it so that ata and graphics initialize -in parallel ---- - drivers/Makefile | 15 ++++++++------- - 1 file changed, 8 insertions(+), 7 deletions(-) - -diff --git a/drivers/Makefile b/drivers/Makefile -index c0cd1b9..af1e2fb 100644 ---- a/drivers/Makefile -+++ b/drivers/Makefile -@@ -59,15 +59,8 @@ obj-y += char/ - # iommu/ comes before gpu as gpu are using iommu controllers - obj-y += iommu/ - --# gpu/ comes after char for AGP vs DRM startup and after iommu --obj-y += gpu/ -- - obj-$(CONFIG_CONNECTOR) += connector/ - --# i810fb and intelfb depend on char/agp/ --obj-$(CONFIG_FB_I810) += video/fbdev/i810/ --obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ -- - obj-$(CONFIG_PARPORT) += parport/ - obj-$(CONFIG_NVM) += lightnvm/ - obj-y += base/ block/ misc/ mfd/ nfc/ -@@ -80,6 +73,14 @@ obj-$(CONFIG_IDE) += ide/ - obj-y += scsi/ - obj-y += nvme/ - obj-$(CONFIG_ATA) += ata/ -+ -+# gpu/ comes after char for AGP vs DRM startup and after iommu -+obj-y += gpu/ -+ -+# i810fb and intelfb depend on char/agp/ -+obj-$(CONFIG_FB_I810) += video/fbdev/i810/ -+obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ -+ - obj-$(CONFIG_TARGET_CORE) += target/ - obj-$(CONFIG_MTD) += mtd/ - obj-$(CONFIG_SPI) += spi/ --- -https://clearlinux.org - diff --git a/linux510-rc-tkg/linux510-tkg-patches/0003-glitched-base.patch b/linux510-rc-tkg/linux510-tkg-patches/0003-glitched-base.patch deleted file mode 100644 index d0bb7d3..0000000 --- a/linux510-rc-tkg/linux510-tkg-patches/0003-glitched-base.patch +++ /dev/null @@ -1,678 +0,0 @@ -From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 -From: Tk-Glitch -Date: Wed, 4 Jul 2018 04:30:08 +0200 -Subject: [PATCH 01/17] glitched - ---- - scripts/mkcompile_h | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) 
- -diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h -index baf3ab8d9d49..854e32e6aec7 100755 ---- a/scripts/mkcompile_h -+++ b/scripts/mkcompile_h -@@ -41,8 +41,8 @@ else - fi - - UTS_VERSION="#$VERSION" --CONFIG_FLAGS="" --if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi -+CONFIG_FLAGS="TKG" -+if [ -n "$SMP" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS SMP"; fi - if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi - if [ -n "$PREEMPT_RT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT_RT"; fi - --- -2.28.0 - - -From c304f43d14e98d4bf1215fc10bc5012f554bdd8a Mon Sep 17 00:00:00 2001 -From: Alexandre Frade -Date: Mon, 29 Jan 2018 16:59:22 +0000 -Subject: [PATCH 02/17] dcache: cache_pressure = 50 decreases the rate at which - VFS caches are reclaimed - -Signed-off-by: Alexandre Frade ---- - fs/dcache.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/fs/dcache.c b/fs/dcache.c -index 361ea7ab30ea..0c5cf69b241a 100644 ---- a/fs/dcache.c -+++ b/fs/dcache.c -@@ -71,7 +71,7 @@ - * If no ancestor relationship: - * arbitrary, since it's serialized on rename_lock - */ --int sysctl_vfs_cache_pressure __read_mostly = 100; -+int sysctl_vfs_cache_pressure __read_mostly = 50; - EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); - - __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); --- -2.28.0 - - -From 28f32f59d9d55ac7ec3a20b79bdd02d2a0a5f7e1 Mon Sep 17 00:00:00 2001 -From: Alexandre Frade -Date: Mon, 29 Jan 2018 18:29:13 +0000 -Subject: [PATCH 03/17] sched/core: nr_migrate = 128 increases number of tasks - to iterate in a single balance run. - -Signed-off-by: Alexandre Frade ---- - kernel/sched/core.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index f788cd61df21..2bfbb4213707 100644 ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c -@@ -59,7 +59,7 @@ const_debug unsigned int sysctl_sched_features = - * Number of tasks to iterate in a single balance run. 
- * Limited because this is done with IRQs disabled. - */ --const_debug unsigned int sysctl_sched_nr_migrate = 32; -+const_debug unsigned int sysctl_sched_nr_migrate = 128; - - /* - * period over which we measure -rt task CPU usage in us. -@@ -71,9 +71,9 @@ __read_mostly int scheduler_running; - - /* - * part of the period that we allow rt tasks to run in us. -- * default: 0.95s -+ * XanMod default: 0.98s - */ --int sysctl_sched_rt_runtime = 950000; -+int sysctl_sched_rt_runtime = 980000; - - /* - * __task_rq_lock - lock the rq @p resides on. --- -2.28.0 - - -From acc49f33a10f61dc66c423888cbb883ba46710e4 Mon Sep 17 00:00:00 2001 -From: Alexandre Frade -Date: Mon, 29 Jan 2018 17:41:29 +0000 -Subject: [PATCH 04/17] scripts: disable the localversion "+" tag of a git repo - -Signed-off-by: Alexandre Frade ---- - scripts/setlocalversion | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/scripts/setlocalversion b/scripts/setlocalversion -index 20f2efd57b11..0552d8b9f582 100755 ---- a/scripts/setlocalversion -+++ b/scripts/setlocalversion -@@ -54,7 +54,7 @@ scm_version() - # If only the short version is requested, don't bother - # running further git commands - if $short; then -- echo "+" -+ # echo "+" - return - fi - # If we are past a tagged commit (like --- -2.28.0 - - -From 61fcb33fb0de8bc0f060e0a1ada38ed149217f4d Mon Sep 17 00:00:00 2001 -From: Oleksandr Natalenko -Date: Wed, 11 Dec 2019 11:46:19 +0100 -Subject: [PATCH 05/17] init/Kconfig: enable -O3 for all arches - -Building a kernel with -O3 may help in hunting bugs like [1] and thus -using this switch should not be restricted to one specific arch only. - -With that, lets expose it for everyone. 
- -[1] https://lore.kernel.org/lkml/673b885183fb64f1cbb3ed2387524077@natalenko.name/ - -Signed-off-by: Oleksandr Natalenko ---- - init/Kconfig | 1 - - 1 file changed, 1 deletion(-) - -diff --git a/init/Kconfig b/init/Kconfig -index 0498af567f70..3ae8678e1145 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -1278,7 +1278,6 @@ config CC_OPTIMIZE_FOR_PERFORMANCE - - config CC_OPTIMIZE_FOR_PERFORMANCE_O3 - bool "Optimize more for performance (-O3)" -- depends on ARC - help - Choosing this option will pass "-O3" to your compiler to optimize - the kernel yet more for performance. --- -2.28.0 - - -From 360c6833e07cc9fdef5746f6bc45bdbc7212288d Mon Sep 17 00:00:00 2001 -From: "Jan Alexander Steffens (heftig)" -Date: Fri, 26 Oct 2018 11:22:33 +0100 -Subject: [PATCH 06/17] infiniband: Fix __read_overflow2 error with -O3 - inlining - ---- - drivers/infiniband/core/addr.c | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c -index 3a98439bba83..6efc4f907f58 100644 ---- a/drivers/infiniband/core/addr.c -+++ b/drivers/infiniband/core/addr.c -@@ -820,6 +820,7 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, - union { - struct sockaddr_in _sockaddr_in; - struct sockaddr_in6 _sockaddr_in6; -+ struct sockaddr_ib _sockaddr_ib; - } sgid_addr, dgid_addr; - int ret; - --- -2.28.0 - - -From f85ed068b4d0e6c31edce8574a95757a60e58b87 Mon Sep 17 00:00:00 2001 -From: Etienne Juvigny -Date: Mon, 3 Sep 2018 17:36:25 +0200 -Subject: [PATCH 07/17] Zenify & stuff - ---- - init/Kconfig | 32 ++++++++++++++++++++++++++++++++ - kernel/sched/fair.c | 25 +++++++++++++++++++++++++ - mm/page-writeback.c | 8 ++++++++ - 3 files changed, 65 insertions(+) - -diff --git a/init/Kconfig b/init/Kconfig -index 3ae8678e1145..da708eed0f1e 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -92,6 +92,38 @@ config THREAD_INFO_IN_TASK - - menu "General setup" - -+config ZENIFY -+ bool "A selection of patches from Zen/Liquorix kernel and 
additional tweaks for a better gaming experience" -+ default y -+ help -+ Tunes the kernel for responsiveness at the cost of throughput and power usage. -+ -+ --- Virtual Memory Subsystem --------------------------- -+ -+ Mem dirty before bg writeback..: 10 % -> 20 % -+ Mem dirty before sync writeback: 20 % -> 50 % -+ -+ --- Block Layer ---------------------------------------- -+ -+ Queue depth...............: 128 -> 512 -+ Default MQ scheduler......: mq-deadline -> bfq -+ -+ --- CFS CPU Scheduler ---------------------------------- -+ -+ Scheduling latency.............: 6 -> 3 ms -+ Minimal granularity............: 0.75 -> 0.3 ms -+ Wakeup granularity.............: 1 -> 0.5 ms -+ CPU migration cost.............: 0.5 -> 0.25 ms -+ Bandwidth slice size...........: 5 -> 3 ms -+ Ondemand fine upscaling limit..: 95 % -> 85 % -+ -+ --- MuQSS CPU Scheduler -------------------------------- -+ -+ Scheduling interval............: 6 -> 3 ms -+ ISO task max realtime use......: 70 % -> 25 % -+ Ondemand coarse upscaling limit: 80 % -> 45 % -+ Ondemand fine upscaling limit..: 95 % -> 45 % -+ - config BROKEN - bool - -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 6b3b59cc51d6..2a0072192c3d 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -37,8 +37,13 @@ - * - * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) - */ -+#ifdef CONFIG_ZENIFY -+unsigned int sysctl_sched_latency = 3000000ULL; -+static unsigned int normalized_sysctl_sched_latency = 3000000ULL; -+#else - unsigned int sysctl_sched_latency = 6000000ULL; - static unsigned int normalized_sysctl_sched_latency = 6000000ULL; -+#endif - - /* - * The initial- and re-scaling of tunables is configurable -@@ -58,13 +63,22 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_L - * - * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) - */ -+#ifdef CONFIG_ZENIFY -+unsigned int sysctl_sched_min_granularity = 300000ULL; -+static unsigned int 
normalized_sysctl_sched_min_granularity = 300000ULL; -+#else - unsigned int sysctl_sched_min_granularity = 750000ULL; - static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; -+#endif - - /* - * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity - */ -+#ifdef CONFIG_ZENIFY -+static unsigned int sched_nr_latency = 10; -+#else - static unsigned int sched_nr_latency = 8; -+#endif - - /* - * After fork, child runs first. If set to 0 (default) then -@@ -81,10 +95,17 @@ unsigned int sysctl_sched_child_runs_first __read_mostly; - * - * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) - */ -+#ifdef CONFIG_ZENIFY -+unsigned int sysctl_sched_wakeup_granularity = 500000UL; -+static unsigned int normalized_sysctl_sched_wakeup_granularity = 500000UL; -+ -+const_debug unsigned int sysctl_sched_migration_cost = 50000UL; -+#else - unsigned int sysctl_sched_wakeup_granularity = 1000000UL; - static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; - - const_debug unsigned int sysctl_sched_migration_cost = 500000UL; -+#endif - - int sched_thermal_decay_shift; - static int __init setup_sched_thermal_decay_shift(char *str) -@@ -128,8 +149,12 @@ int __weak arch_asym_cpu_priority(int cpu) - * - * (default: 5 msec, units: microseconds) - */ -+#ifdef CONFIG_ZENIFY -+unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL; -+#else - unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; - #endif -+#endif - - static inline void update_load_add(struct load_weight *lw, unsigned long inc) - { -diff --git a/mm/page-writeback.c b/mm/page-writeback.c -index 28b3e7a67565..01a1aef2b9b1 100644 ---- a/mm/page-writeback.c -+++ b/mm/page-writeback.c -@@ -71,7 +71,11 @@ static long ratelimit_pages = 32; - /* - * Start background writeback (via writeback threads) at this percentage - */ -+#ifdef CONFIG_ZENIFY -+int dirty_background_ratio = 20; -+#else - int dirty_background_ratio = 10; -+#endif - - /* - * dirty_background_bytes starts at 0 
(disabled) so that it is a function of -@@ -88,7 +92,11 @@ int vm_highmem_is_dirtyable; - /* - * The generator of dirty data starts writeback at this percentage - */ -+#ifdef CONFIG_ZENIFY -+int vm_dirty_ratio = 50; -+#else - int vm_dirty_ratio = 20; -+#endif - - /* - * vm_dirty_bytes starts at 0 (disabled) so that it is a function of --- -2.28.0 - - -From e92e67143385cf285851e12aa8b7f083dd38dd24 Mon Sep 17 00:00:00 2001 -From: Steven Barrett -Date: Sun, 16 Jan 2011 18:57:32 -0600 -Subject: [PATCH 08/17] ZEN: Allow TCP YeAH as default congestion control - -4.4: In my tests YeAH dramatically slowed down transfers over a WLAN, - reducing throughput from ~65Mbps (CUBIC) to ~7MBps (YeAH) over 10 - seconds (netperf TCP_STREAM) including long stalls. - - Be careful when choosing this. ~heftig ---- - net/ipv4/Kconfig | 4 ++++ - 1 file changed, 4 insertions(+) - -diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig -index e64e59b536d3..bfb55ef7ebbe 100644 ---- a/net/ipv4/Kconfig -+++ b/net/ipv4/Kconfig -@@ -691,6 +691,9 @@ choice - config DEFAULT_VEGAS - bool "Vegas" if TCP_CONG_VEGAS=y - -+ config DEFAULT_YEAH -+ bool "YeAH" if TCP_CONG_YEAH=y -+ - config DEFAULT_VENO - bool "Veno" if TCP_CONG_VENO=y - -@@ -724,6 +727,7 @@ config DEFAULT_TCP_CONG - default "htcp" if DEFAULT_HTCP - default "hybla" if DEFAULT_HYBLA - default "vegas" if DEFAULT_VEGAS -+ default "yeah" if DEFAULT_YEAH - default "westwood" if DEFAULT_WESTWOOD - default "veno" if DEFAULT_VENO - default "reno" if DEFAULT_RENO --- -2.28.0 - - -From 76dbe7477bfde1b5e8bf29a71b5af7ab2be9b98e Mon Sep 17 00:00:00 2001 -From: Steven Barrett -Date: Wed, 28 Nov 2018 19:01:27 -0600 -Subject: [PATCH 09/17] zen: Use [defer+madvise] as default khugepaged defrag - strategy - -For some reason, the default strategy to respond to THP fault fallbacks -is still just madvise, meaning stall if the program wants transparent -hugepages, but don't trigger a background reclaim / compaction if THP -begins to fail allocations. 
This creates a snowball affect where we -still use the THP code paths, but we almost always fail once a system -has been active and busy for a while. - -The option "defer" was created for interactive systems where THP can -still improve performance. If we have to fallback to a regular page due -to an allocation failure or anything else, we will trigger a background -reclaim and compaction so future THP attempts succeed and previous -attempts eventually have their smaller pages combined without stalling -running applications. - -We still want madvise to stall applications that explicitely want THP, -so defer+madvise _does_ make a ton of sense. Make it the default for -interactive systems, especially if the kernel maintainer left -transparent hugepages on "always". - -Reasoning and details in the original patch: https://lwn.net/Articles/711248/ ---- - mm/huge_memory.c | 4 ++++ - 1 file changed, 4 insertions(+) - -diff --git a/mm/huge_memory.c b/mm/huge_memory.c -index 74300e337c3c..9277f22c10a7 100644 ---- a/mm/huge_memory.c -+++ b/mm/huge_memory.c -@@ -53,7 +53,11 @@ unsigned long transparent_hugepage_flags __read_mostly = - #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE - (1< -Date: Wed, 24 Oct 2018 16:58:52 -0300 -Subject: [PATCH 10/17] net/sched: allow configuring cake qdisc as default - -Signed-off-by: Alexandre Frade ---- - net/sched/Kconfig | 4 ++++ - 1 file changed, 4 insertions(+) - -diff --git a/net/sched/Kconfig b/net/sched/Kconfig -index 84badf00647e..6a922bca9f39 100644 ---- a/net/sched/Kconfig -+++ b/net/sched/Kconfig -@@ -471,6 +471,9 @@ choice - config DEFAULT_SFQ - bool "Stochastic Fair Queue" if NET_SCH_SFQ - -+ config DEFAULT_CAKE -+ bool "Common Applications Kept Enhanced" if NET_SCH_CAKE -+ - config DEFAULT_PFIFO_FAST - bool "Priority FIFO Fast" - endchoice -@@ -481,6 +484,7 @@ config DEFAULT_NET_SCH - default "fq" if DEFAULT_FQ - default "fq_codel" if DEFAULT_FQ_CODEL - default "sfq" if DEFAULT_SFQ -+ default "cake" if DEFAULT_CAKE - default 
"pfifo_fast" - endif - --- -2.28.0 - - -From 816ee502759e954304693813bd03d94986b28dba Mon Sep 17 00:00:00 2001 -From: Tk-Glitch -Date: Mon, 18 Feb 2019 17:40:57 +0100 -Subject: [PATCH 11/17] mm: Set watermark_scale_factor to 200 (from 10) - -Multiple users have reported it's helping reducing/eliminating stuttering -with DXVK. ---- - mm/page_alloc.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/mm/page_alloc.c b/mm/page_alloc.c -index 898ff44f2c7b..e72074034793 100644 ---- a/mm/page_alloc.c -+++ b/mm/page_alloc.c -@@ -330,7 +330,7 @@ int watermark_boost_factor __read_mostly; - #else - int watermark_boost_factor __read_mostly = 15000; - #endif --int watermark_scale_factor = 10; -+int watermark_scale_factor = 200; - - static unsigned long nr_kernel_pages __initdata; - static unsigned long nr_all_pages __initdata; --- -2.28.0 - - -From 90240bcd90a568878738e66c0d45bed3e38e347b Mon Sep 17 00:00:00 2001 -From: Tk-Glitch -Date: Fri, 19 Apr 2019 12:33:38 +0200 -Subject: [PATCH 12/17] Set vm.max_map_count to 262144 by default - -The value is still pretty low, and AMD64-ABI and ELF extended numbering -supports that, so we should be fine on modern x86 systems. - -This fixes crashes in some applications using more than 65535 vmas (also -affects some windows games running in wine, such as Star Citizen). ---- - include/linux/mm.h | 3 +-- - 1 file changed, 1 insertion(+), 2 deletions(-) - -diff --git a/include/linux/mm.h b/include/linux/mm.h -index bc05c3588aa3..b0cefe94920d 100644 ---- a/include/linux/mm.h -+++ b/include/linux/mm.h -@@ -190,8 +190,7 @@ static inline void __mm_zero_struct_page(struct page *page) - * not a hard limit any more. Although some userspace tools can be surprised by - * that. 
- */ --#define MAPCOUNT_ELF_CORE_MARGIN (5) --#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) -+#define DEFAULT_MAX_MAP_COUNT (262144) - - extern int sysctl_max_map_count; - --- -2.28.0 - - -From 3a34034dba5efe91bcec491efe8c66e8087f509b Mon Sep 17 00:00:00 2001 -From: Tk-Glitch -Date: Mon, 27 Jul 2020 00:19:18 +0200 -Subject: [PATCH 13/17] mm: bump DEFAULT_MAX_MAP_COUNT - -Some games such as Detroit: Become Human tend to be very crash prone with -lower values. ---- - include/linux/mm.h | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/include/linux/mm.h b/include/linux/mm.h -index b0cefe94920d..890165099b07 100644 ---- a/include/linux/mm.h -+++ b/include/linux/mm.h -@@ -190,7 +190,7 @@ static inline void __mm_zero_struct_page(struct page *page) - * not a hard limit any more. Although some userspace tools can be surprised by - * that. - */ --#define DEFAULT_MAX_MAP_COUNT (262144) -+#define DEFAULT_MAX_MAP_COUNT (524288) - - extern int sysctl_max_map_count; - --- -2.28.0 - - -From 977812938da7c7226415778c340832141d9278b7 Mon Sep 17 00:00:00 2001 -From: Alexandre Frade -Date: Mon, 25 Nov 2019 15:13:06 -0300 -Subject: [PATCH 14/17] elevator: set default scheduler to bfq for blk-mq - -Signed-off-by: Alexandre Frade ---- - block/elevator.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/block/elevator.c b/block/elevator.c -index 4eab3d70e880..79669aa39d79 100644 ---- a/block/elevator.c -+++ b/block/elevator.c -@@ -623,15 +623,15 @@ static inline bool elv_support_iosched(struct request_queue *q) - } - - /* -- * For single queue devices, default to using mq-deadline. If we have multiple -- * queues or mq-deadline is not available, default to "none". -+ * For single queue devices, default to using bfq. If we have multiple -+ * queues or bfq is not available, default to "none". 
- */ - static struct elevator_type *elevator_get_default(struct request_queue *q) - { - if (q->nr_hw_queues != 1) - return NULL; - -- return elevator_get(q, "mq-deadline", false); -+ return elevator_get(q, "bfq", false); - } - - /* --- -2.28.0 - -From 3c229f434aca65c4ca61772bc03c3e0370817b92 Mon Sep 17 00:00:00 2001 -From: Alexandre Frade -Date: Mon, 3 Aug 2020 17:05:04 +0000 -Subject: [PATCH 16/17] mm: set 2 megabytes for address_space-level file - read-ahead pages size - -Signed-off-by: Alexandre Frade ---- - include/linux/pagemap.h | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h -index cf2468da68e9..007dea784451 100644 ---- a/include/linux/pagemap.h -+++ b/include/linux/pagemap.h -@@ -655,7 +655,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); - void delete_from_page_cache_batch(struct address_space *mapping, - struct pagevec *pvec); - --#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) -+#define VM_READAHEAD_PAGES (SZ_2M / PAGE_SIZE) - - void page_cache_sync_readahead(struct address_space *, struct file_ra_state *, - struct file *, pgoff_t index, unsigned long req_count); --- -2.28.0 - - -From 716f41cf6631f3a85834dcb67b4ce99185b6387f Mon Sep 17 00:00:00 2001 -From: Steven Barrett -Date: Wed, 15 Jan 2020 20:43:56 -0600 -Subject: [PATCH 17/17] ZEN: intel-pstate: Implement "enable" parameter - -If intel-pstate is compiled into the kernel, it will preempt the loading -of acpi-cpufreq so you can take advantage of hardware p-states without -any friction. - -However, intel-pstate is not completely superior to cpufreq's ondemand -for one reason. There's no concept of an up_threshold property. - -In ondemand, up_threshold essentially reduces the maximum utilization to -compare against, allowing you to hit max frequencies and turbo boost -from a much lower core utilization. 
- -With intel-pstate, you have the concept of minimum and maximum -performance, but no tunable that lets you define, maximum frequency -means 50% core utilization. For just this oversight, there's reasons -you may want ondemand. - -Lets support setting "enable" in kernel boot parameters. This lets -kernel maintainers include "intel_pstate=disable" statically in the -static boot parameters, but let users of the kernel override this -selection. ---- - Documentation/admin-guide/kernel-parameters.txt | 3 +++ - drivers/cpufreq/intel_pstate.c | 2 ++ - 2 files changed, 5 insertions(+) - -diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index fb95fad81c79..3e92fee81e33 100644 ---- a/Documentation/admin-guide/kernel-parameters.txt -+++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -1857,6 +1857,9 @@ - disable - Do not enable intel_pstate as the default - scaling driver for the supported processors -+ enable -+ Enable intel_pstate in-case "disable" was passed -+ previously in the kernel boot parameters - passive - Use intel_pstate as a scaling driver, but configure it - to work with generic cpufreq governors (instead of -diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c -index 36a469150ff9..aee891c9b78a 100644 ---- a/drivers/cpufreq/intel_pstate.c -+++ b/drivers/cpufreq/intel_pstate.c -@@ -2845,6 +2845,8 @@ static int __init intel_pstate_setup(char *str) - pr_info("HWP disabled\n"); - no_hwp = 1; - } -+ if (!strcmp(str, "enable")) -+ no_load = 0; - if (!strcmp(str, "force")) - force_load = 1; - if (!strcmp(str, "hwp_only")) --- -2.28.0 - diff --git a/linux510-rc-tkg/linux510-tkg-patches/0003-glitched-cfs.patch b/linux510-rc-tkg/linux510-tkg-patches/0003-glitched-cfs.patch deleted file mode 100644 index 06b7f02..0000000 --- a/linux510-rc-tkg/linux510-tkg-patches/0003-glitched-cfs.patch +++ /dev/null @@ -1,72 +0,0 @@ -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 
2a202a846757..1d9c7ed79b11 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -4,7 +4,7 @@ - - choice - prompt "Timer frequency" -- default HZ_250 -+ default HZ_500 - help - Allows the configuration of the timer frequency. It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -39,6 +39,13 @@ choice - on SMP and NUMA systems and exactly dividing by both PAL and - NTSC frame rates for video and multimedia work. - -+ config HZ_500 -+ bool "500 HZ" -+ help -+ 500 Hz is a balanced timer frequency. Provides fast interactivity -+ on desktops with great smoothness without increasing CPU power -+ consumption and sacrificing the battery life on laptops. -+ - config HZ_1000 - bool "1000 HZ" - help -@@ -52,6 +59,7 @@ config HZ - default 100 if HZ_100 - default 250 if HZ_250 - default 300 if HZ_300 -+ default 500 if HZ_500 - default 1000 if HZ_1000 - - config SCHED_HRTICK - -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 2a202a846757..1d9c7ed79b11 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -4,7 +4,7 @@ - - choice - prompt "Timer frequency" -- default HZ_500 -+ default HZ_750 - help - Allows the configuration of the timer frequency. It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -46,6 +46,13 @@ choice - on desktops with great smoothness without increasing CPU power - consumption and sacrificing the battery life on laptops. - -+ config HZ_750 -+ bool "750 HZ" -+ help -+ 750 Hz is a good timer frequency for desktops. Provides fast -+ interactivity with great smoothness without sacrificing too -+ much throughput. 
-+ - config HZ_1000 - bool "1000 HZ" - help -@@ -60,6 +67,7 @@ config HZ - default 250 if HZ_250 - default 300 if HZ_300 - default 500 if HZ_500 -+ default 750 if HZ_750 - default 1000 if HZ_1000 - - config SCHED_HRTICK - diff --git a/linux510-rc-tkg/linux510-tkg-patches/0004-5.10-ck1.patch b/linux510-rc-tkg/linux510-tkg-patches/0004-5.10-ck1.patch deleted file mode 100644 index 00e7c4d..0000000 --- a/linux510-rc-tkg/linux510-tkg-patches/0004-5.10-ck1.patch +++ /dev/null @@ -1,13369 +0,0 @@ -diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index a1068742a6df..d2a8f1c637d2 100644 ---- a/Documentation/admin-guide/kernel-parameters.txt -+++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -4595,6 +4595,14 @@ - Memory area to be used by remote processor image, - managed by CMA. - -+ rqshare= [X86] Select the MuQSS scheduler runqueue sharing type. -+ Format: -+ smt -- Share SMT (hyperthread) sibling runqueues -+ mc -- Share MC (multicore) sibling runqueues -+ smp -- Share SMP runqueues -+ none -- Do not share any runqueues -+ Default value is mc -+ - rw [KNL] Mount root device read-write on boot - - S [KNL] Run init in single mode -diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst -index d4b32cc32bb7..9e1e71fc66d0 100644 ---- a/Documentation/admin-guide/sysctl/kernel.rst -+++ b/Documentation/admin-guide/sysctl/kernel.rst -@@ -436,6 +436,16 @@ this allows system administrators to override the - ``IA64_THREAD_UAC_NOPRINT`` ``prctl`` and avoid logs being flooded. - - -+iso_cpu: (MuQSS CPU scheduler only) -+=================================== -+ -+This sets the percentage cpu that the unprivileged SCHED_ISO tasks can -+run effectively at realtime priority, averaged over a rolling five -+seconds over the -whole- system, meaning all cpus. -+ -+Set to 70 (percent) by default.
-+ -+ - kexec_load_disabled - =================== - -@@ -1077,6 +1087,20 @@ ROM/Flash boot loader. Maybe to tell it what to do after - rebooting. ??? - - -+rr_interval: (MuQSS CPU scheduler only) -+======================================= -+ -+This is the smallest duration that any cpu process scheduling unit -+will run for. Increasing this value can increase throughput of cpu -+bound tasks substantially but at the expense of increased latencies -+overall. Conversely decreasing it will decrease average and maximum -+latencies but at the expense of throughput. This value is in -+milliseconds and the default value chosen depends on the number of -+cpus available at scheduler initialisation with a minimum of 6. -+ -+Valid values are from 1-1000. -+ -+ - sched_energy_aware - ================== - -@@ -1515,3 +1539,13 @@ is 10 seconds. - - The softlockup threshold is (``2 * watchdog_thresh``). Setting this - tunable to zero will disable lockup detection altogether. -+ -+ -+yield_type: (MuQSS CPU scheduler only) -+====================================== -+ -+This determines what type of yield calls to sched_yield will perform. -+ -+ 0: No yield. -+ 1: Yield only to better priority/deadline tasks. (default) -+ 2: Expire timeslice and recalculate deadline. -diff --git a/Documentation/scheduler/sched-BFS.txt b/Documentation/scheduler/sched-BFS.txt -new file mode 100644 -index 000000000000..c0282002a079 ---- /dev/null -+++ b/Documentation/scheduler/sched-BFS.txt -@@ -0,0 +1,351 @@ -+BFS - The Brain Fuck Scheduler by Con Kolivas. -+ -+Goals. -+ -+The goal of the Brain Fuck Scheduler, referred to as BFS from here on, is to -+completely do away with the complex designs of the past for the cpu process -+scheduler and instead implement one that is very simple in basic design. 
-+The main focus of BFS is to achieve excellent desktop interactivity and -+responsiveness without heuristics and tuning knobs that are difficult to -+understand, impossible to model and predict the effect of, and when tuned to -+one workload cause massive detriment to another. -+ -+ -+Design summary. -+ -+BFS is best described as a single runqueue, O(n) lookup, earliest effective -+virtual deadline first design, loosely based on EEVDF (earliest eligible virtual -+deadline first) and my previous Staircase Deadline scheduler. Each component -+shall be described in order to understand the significance of, and reasoning for -+it. The codebase when the first stable version was released was approximately -+9000 lines less code than the existing mainline linux kernel scheduler (in -+2.6.31). This does not even take into account the removal of documentation and -+the cgroups code that is not used. -+ -+Design reasoning. -+ -+The single runqueue refers to the queued but not running processes for the -+entire system, regardless of the number of CPUs. The reason for going back to -+a single runqueue design is that once multiple runqueues are introduced, -+per-CPU or otherwise, there will be complex interactions as each runqueue will -+be responsible for the scheduling latency and fairness of the tasks only on its -+own runqueue, and to achieve fairness and low latency across multiple CPUs, any -+advantage in throughput of having CPU local tasks causes other disadvantages. -+This is due to requiring a very complex balancing system to at best achieve some -+semblance of fairness across CPUs and can only maintain relatively low latency -+for tasks bound to the same CPUs, not across them. To increase said fairness -+and latency across CPUs, the advantage of local runqueue locking, which makes -+for better scalability, is lost due to having to grab multiple locks. 
-+ -+A significant feature of BFS is that all accounting is done purely based on CPU -+used and nowhere is sleep time used in any way to determine entitlement or -+interactivity. Interactivity "estimators" that use some kind of sleep/run -+algorithm are doomed to fail to detect all interactive tasks, and to falsely tag -+tasks that aren't interactive as being so. The reason for this is that it is -+close to impossible to determine that when a task is sleeping, whether it is -+doing it voluntarily, as in a userspace application waiting for input in the -+form of a mouse click or otherwise, or involuntarily, because it is waiting for -+another thread, process, I/O, kernel activity or whatever. Thus, such an -+estimator will introduce corner cases, and more heuristics will be required to -+cope with those corner cases, introducing more corner cases and failed -+interactivity detection and so on. Interactivity in BFS is built into the design -+by virtue of the fact that tasks that are waking up have not used up their quota -+of CPU time, and have earlier effective deadlines, thereby making it very likely -+they will preempt any CPU bound task of equivalent nice level. See below for -+more information on the virtual deadline mechanism. Even if they do not preempt -+a running task, because the rr interval is guaranteed to have a bound upper -+limit on how long a task will wait for, it will be scheduled within a timeframe -+that will not cause visible interface jitter. -+ -+ -+Design details. -+ -+Task insertion. -+ -+BFS inserts tasks into each relevant queue as an O(1) insertion into a double -+linked list. On insertion, *every* running queue is checked to see if the newly -+queued task can run on any idle queue, or preempt the lowest running task on the -+system. This is how the cross-CPU scheduling of BFS achieves significantly lower -+latency per extra CPU the system has. 
In this case the lookup is, in the worst -+case scenario, O(n) where n is the number of CPUs on the system. -+ -+Data protection. -+ -+BFS has one single lock protecting the process local data of every task in the -+global queue. Thus every insertion, removal and modification of task data in the -+global runqueue needs to grab the global lock. However, once a task is taken by -+a CPU, the CPU has its own local data copy of the running process' accounting -+information which only that CPU accesses and modifies (such as during a -+timer tick) thus allowing the accounting data to be updated lockless. Once a -+CPU has taken a task to run, it removes it from the global queue. Thus the -+global queue only ever has, at most, -+ -+ (number of tasks requesting cpu time) - (number of logical CPUs) + 1 -+ -+tasks in the global queue. This value is relevant for the time taken to look up -+tasks during scheduling. This will increase if many tasks with CPU affinity set -+in their policy to limit which CPUs they're allowed to run on if they outnumber -+the number of CPUs. The +1 is because when rescheduling a task, the CPU's -+currently running task is put back on the queue. Lookup will be described after -+the virtual deadline mechanism is explained. -+ -+Virtual deadline. -+ -+The key to achieving low latency, scheduling fairness, and "nice level" -+distribution in BFS is entirely in the virtual deadline mechanism. The one -+tunable in BFS is the rr_interval, or "round robin interval". This is the -+maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy) -+tasks of the same nice level will be running for, or looking at it the other -+way around, the longest duration two tasks of the same nice level will be -+delayed for. When a task requests cpu time, it is given a quota (time_slice) -+equal to the rr_interval and a virtual deadline. 
The virtual deadline is -+offset from the current time in jiffies by this equation: -+ -+ jiffies + (prio_ratio * rr_interval) -+ -+The prio_ratio is determined as a ratio compared to the baseline of nice -20 -+and increases by 10% per nice level. The deadline is a virtual one only in that -+no guarantee is placed that a task will actually be scheduled by this time, but -+it is used to compare which task should go next. There are three components to -+how a task is next chosen. First is time_slice expiration. If a task runs out -+of its time_slice, it is descheduled, the time_slice is refilled, and the -+deadline reset to that formula above. Second is sleep, where a task no longer -+is requesting CPU for whatever reason. The time_slice and deadline are _not_ -+adjusted in this case and are just carried over for when the task is next -+scheduled. Third is preemption, and that is when a newly waking task is deemed -+higher priority than a currently running task on any cpu by virtue of the fact -+that it has an earlier virtual deadline than the currently running task. The -+earlier deadline is the key to which task is next chosen for the first and -+second cases. Once a task is descheduled, it is put back on the queue, and an -+O(n) lookup of all queued-but-not-running tasks is done to determine which has -+the earliest deadline and that task is chosen to receive CPU next. -+ -+The CPU proportion of different nice tasks works out to be approximately the -+ -+ (prio_ratio difference)^2 -+ -+The reason it is squared is that a task's deadline does not change while it is -+running unless it runs out of time_slice. Thus, even if the time actually -+passes the deadline of another task that is queued, it will not get CPU time -+unless the current running task deschedules, and the time "base" (jiffies) is -+constantly moving. -+ -+Task lookup. -+ -+BFS has 103 priority queues. 
100 of these are dedicated to the static priority -+of realtime tasks, and the remaining 3 are, in order of best to worst priority, -+SCHED_ISO (isochronous), SCHED_NORMAL, and SCHED_IDLEPRIO (idle priority -+scheduling). When a task of these priorities is queued, a bitmap of running -+priorities is set showing which of these priorities has tasks waiting for CPU -+time. When a CPU is made to reschedule, the lookup for the next task to get -+CPU time is performed in the following way: -+ -+First the bitmap is checked to see what static priority tasks are queued. If -+any realtime priorities are found, the corresponding queue is checked and the -+first task listed there is taken (provided CPU affinity is suitable) and lookup -+is complete. If the priority corresponds to a SCHED_ISO task, they are also -+taken in FIFO order (as they behave like SCHED_RR). If the priority corresponds -+to either SCHED_NORMAL or SCHED_IDLEPRIO, then the lookup becomes O(n). At this -+stage, every task in the runlist that corresponds to that priority is checked -+to see which has the earliest set deadline, and (provided it has suitable CPU -+affinity) it is taken off the runqueue and given the CPU. If a task has an -+expired deadline, it is taken and the rest of the lookup aborted (as they are -+chosen in FIFO order). -+ -+Thus, the lookup is O(n) in the worst case only, where n is as described -+earlier, as tasks may be chosen before the whole task list is looked over. -+ -+ -+Scalability. -+ -+The major limitations of BFS will be that of scalability, as the separate -+runqueue designs will have less lock contention as the number of CPUs rises. 
-+However they do not scale linearly even with separate runqueues as multiple -+runqueues will need to be locked concurrently on such designs to be able to -+achieve fair CPU balancing, to try and achieve some sort of nice-level fairness -+across CPUs, and to achieve low enough latency for tasks on a busy CPU when -+other CPUs would be more suited. BFS has the advantage that it requires no -+balancing algorithm whatsoever, as balancing occurs by proxy simply because -+all CPUs draw off the global runqueue, in priority and deadline order. Despite -+the fact that scalability is _not_ the prime concern of BFS, it both shows very -+good scalability to smaller numbers of CPUs and is likely a more scalable design -+at these numbers of CPUs. -+ -+It also has some very low overhead scalability features built into the design -+when it has been deemed their overhead is so marginal that they're worth adding. -+The first is the local copy of the running process' data to the CPU it's running -+on to allow that data to be updated lockless where possible. Then there is -+deference paid to the last CPU a task was running on, by trying that CPU first -+when looking for an idle CPU to use the next time it's scheduled. Finally there -+is the notion of cache locality beyond the last running CPU. The sched_domains -+information is used to determine the relative virtual "cache distance" that -+other CPUs have from the last CPU a task was running on. CPUs with shared -+caches, such as SMT siblings, or multicore CPUs with shared caches, are treated -+as cache local. CPUs without shared caches are treated as not cache local, and -+CPUs on different NUMA nodes are treated as very distant. This "relative cache -+distance" is used by modifying the virtual deadline value when doing lookups. -+Effectively, the deadline is unaltered between "cache local" CPUs, doubled for -+"cache distant" CPUs, and quadrupled for "very distant" CPUs. 
The reasoning -+behind the doubling of deadlines is as follows. The real cost of migrating a -+task from one CPU to another is entirely dependent on the cache footprint of -+the task, how cache intensive the task is, how long it's been running on that -+CPU to take up the bulk of its cache, how big the CPU cache is, how fast and -+how layered the CPU cache is, how fast a context switch is... and so on. In -+other words, it's close to random in the real world where we do more than just -+one sole workload. The only thing we can be sure of is that it's not free. So -+BFS uses the principle that an idle CPU is a wasted CPU and utilising idle CPUs -+is more important than cache locality, and cache locality only plays a part -+after that. Doubling the effective deadline is based on the premise that the -+"cache local" CPUs will tend to work on the same tasks up to double the number -+of cache local CPUs, and once the workload is beyond that amount, it is likely -+that none of the tasks are cache warm anywhere anyway. The quadrupling for NUMA -+is a value I pulled out of my arse. -+ -+When choosing an idle CPU for a waking task, the cache locality is determined -+according to where the task last ran and then idle CPUs are ranked from best -+to worst to choose the most suitable idle CPU based on cache locality, NUMA -+node locality and hyperthread sibling business. They are chosen in the -+following preference (if idle): -+ -+* Same core, idle or busy cache, idle threads -+* Other core, same cache, idle or busy cache, idle threads. -+* Same node, other CPU, idle cache, idle threads. -+* Same node, other CPU, busy cache, idle threads. -+* Same core, busy threads. -+* Other core, same cache, busy threads. -+* Same node, other CPU, busy threads. -+* Other node, other CPU, idle cache, idle threads. -+* Other node, other CPU, busy cache, idle threads. -+* Other node, other CPU, busy threads.
-+ -+This shows the SMT or "hyperthread" awareness in the design as well which will -+choose a real idle core first before a logical SMT sibling which already has -+tasks on the physical CPU. -+ -+Early benchmarking of BFS suggested scalability dropped off at the 16 CPU mark. -+However this benchmarking was performed on an earlier design that was far less -+scalable than the current one so it's hard to know how scalable it is in terms -+of both CPUs (due to the global runqueue) and heavily loaded machines (due to -+O(n) lookup) at this stage. Note that in terms of scalability, the number of -+_logical_ CPUs matters, not the number of _physical_ CPUs. Thus, a dual (2x) -+quad core (4X) hyperthreaded (2X) machine is effectively a 16X. Newer benchmark -+results are very promising indeed, without needing to tweak any knobs, features -+or options. Benchmark contributions are most welcome. -+ -+ -+Features -+ -+As the initial prime target audience for BFS was the average desktop user, it -+was designed to not need tweaking, tuning or have features set to obtain benefit -+from it. Thus the number of knobs and features has been kept to an absolute -+minimum and should not require extra user input for the vast majority of cases. -+There are precisely 2 tunables, and 2 extra scheduling policies. The rr_interval -+and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO policies. In addition -+to this, BFS also uses sub-tick accounting. What BFS does _not_ now feature is -+support for CGROUPS. The average user should neither need to know what these -+are, nor should they need to be using them to have good desktop behaviour. -+ -+rr_interval -+ -+There is only one "scheduler" tunable, the round robin interval. This can be -+accessed in -+ -+ /proc/sys/kernel/rr_interval -+ -+The value is in milliseconds, and the default value is set to 6 on a -+uniprocessor machine, and automatically set to a progressively higher value on -+multiprocessor machines. 
The reasoning behind increasing the value on more CPUs -+is that the effective latency is decreased by virtue of there being more CPUs on -+BFS (for reasons explained above), and increasing the value allows for less -+cache contention and more throughput. Valid values are from 1 to 1000. -+Decreasing the value will decrease latencies at the cost of decreasing -+throughput, while increasing it will improve throughput, but at the cost of -+worsening latencies. The accuracy of the rr interval is limited by HZ resolution -+of the kernel configuration. Thus, the worst case latencies are usually slightly -+higher than this actual value. The default value of 6 is not an arbitrary one. -+It is based on the fact that humans can detect jitter at approximately 7ms, so -+aiming for much lower latencies is pointless under most circumstances. It is -+worth noting this fact when comparing the latency performance of BFS to other -+schedulers. Worst case latencies being higher than 7ms are far worse than -+average latencies not being in the microsecond range. -+ -+Isochronous scheduling. -+ -+Isochronous scheduling is a unique scheduling policy designed to provide -+near-real-time performance to unprivileged (ie non-root) users without the -+ability to starve the machine indefinitely. Isochronous tasks (which means -+"same time") are set using, for example, the schedtool application like so: -+ -+ schedtool -I -e amarok -+ -+This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works -+is that it has a priority level between true realtime tasks and SCHED_NORMAL -+which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie, -+if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval -+rate). However if ISO tasks run for more than a tunable finite amount of time, -+they are then demoted back to SCHED_NORMAL scheduling.
This finite amount of -+time is the percentage of _total CPU_ available across the machine, configurable -+as a percentage in the following "resource handling" tunable (as opposed to a -+scheduler tunable): -+ -+ /proc/sys/kernel/iso_cpu -+ -+and is set to 70% by default. It is calculated over a rolling 5 second average. -+Because it is the total CPU available, it means that on a multi CPU machine, it -+is possible to have an ISO task running as realtime scheduling indefinitely on -+just one CPU, as the other CPUs will be available. Setting this to 100 is the -+equivalent of giving all users SCHED_RR access and setting it to 0 removes the -+ability to run any pseudo-realtime tasks. -+ -+A feature of BFS is that it detects when an application tries to obtain a -+realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the -+appropriate privileges to use those policies. When it detects this, it will -+give the task SCHED_ISO policy instead. Thus it is transparent to the user. -+Because some applications constantly set their policy as well as their nice -+level, there is potential for them to undo the override specified by the user -+on the command line of setting the policy to SCHED_ISO. To counter this, once -+a task has been set to SCHED_ISO policy, it needs superuser privileges to set -+it back to SCHED_NORMAL. This will ensure the task remains ISO and all child -+processes and threads will also inherit the ISO policy. -+ -+Idleprio scheduling. -+ -+Idleprio scheduling is a scheduling policy designed to give out CPU to a task -+_only_ when the CPU would be otherwise idle. The idea behind this is to allow -+ultra low priority tasks to be run in the background that have virtually no -+effect on the foreground tasks. This is ideally suited to distributed computing -+clients (like setiathome, folding, mprime etc) but can also be used to start -+a video encode or so on without any slowdown of other tasks.
To avoid this -+policy from grabbing shared resources and holding them indefinitely, if it -+detects a state where the task is waiting on I/O, the machine is about to -+suspend to ram and so on, it will transiently schedule them as SCHED_NORMAL. As -+per the Isochronous task management, once a task has been scheduled as IDLEPRIO, -+it cannot be put back to SCHED_NORMAL without superuser privileges. Tasks can -+be set to start as SCHED_IDLEPRIO with the schedtool command like so: -+ -+ schedtool -D -e ./mprime -+ -+Subtick accounting. -+ -+It is surprisingly difficult to get accurate CPU accounting, and in many cases, -+the accounting is done by simply determining what is happening at the precise -+moment a timer tick fires off. This becomes increasingly inaccurate as the -+timer tick frequency (HZ) is lowered. It is possible to create an application -+which uses almost 100% CPU, yet by being descheduled at the right time, records -+zero CPU usage. While the main problem with this is that there are possible -+security implications, it is also difficult to determine how much CPU a task -+really does use. BFS tries to use the sub-tick accounting from the TSC clock, -+where possible, to determine real CPU usage. This is not entirely reliable, but -+is far more likely to produce accurate CPU usage data than the existing designs -+and will not show tasks as consuming no CPU usage when they actually are. Thus, -+the amount of CPU reported as being used by BFS will more accurately represent -+how much CPU the task itself is using (as is shown for example by the 'time' -+application), so the reported values may be quite different to other schedulers. -+Values reported as the 'load' are more prone to problems with this design, but -+per process values are closer to real usage. 
When comparing throughput of BFS -+to other designs, it is important to compare the actual completed work in terms -+of total wall clock time taken and total work done, rather than the reported -+"cpu usage". -+ -+ -+Con Kolivas Fri Aug 27 2010 -diff --git a/Documentation/scheduler/sched-MuQSS.txt b/Documentation/scheduler/sched-MuQSS.txt -new file mode 100644 -index 000000000000..ae28b85c9995 ---- /dev/null -+++ b/Documentation/scheduler/sched-MuQSS.txt -@@ -0,0 +1,373 @@ -+MuQSS - The Multiple Queue Skiplist Scheduler by Con Kolivas. -+ -+MuQSS is a per-cpu runqueue variant of the original BFS scheduler with -+one 8 level skiplist per runqueue, and fine grained locking for much more -+scalability. -+ -+ -+Goals. -+ -+The goal of the Multiple Queue Skiplist Scheduler, referred to as MuQSS from -+here on (pronounced mux) is to completely do away with the complex designs of -+the past for the cpu process scheduler and instead implement one that is very -+simple in basic design. The main focus of MuQSS is to achieve excellent desktop -+interactivity and responsiveness without heuristics and tuning knobs that are -+difficult to understand, impossible to model and predict the effect of, and when -+tuned to one workload cause massive detriment to another, while still being -+scalable to many CPUs and processes. -+ -+ -+Design summary. -+ -+MuQSS is best described as per-cpu multiple runqueue, O(log n) insertion, O(1) -+lookup, earliest effective virtual deadline first tickless design, loosely based -+on EEVDF (earliest eligible virtual deadline first) and my previous Staircase -+Deadline scheduler, and evolved from the single runqueue O(n) BFS scheduler. -+Each component shall be described in order to understand the significance of, -+and reasoning for it. -+ -+ -+Design reasoning. 
-+ -+In BFS, the use of a single runqueue across all CPUs meant that each CPU would -+need to scan the entire runqueue looking for the process with the earliest -+deadline and schedule that next, regardless of which CPU it originally came -+from. This made BFS deterministic with respect to latency and provided -+guaranteed latencies dependent on number of processes and CPUs. The single -+runqueue, however, meant that all CPUs would compete for the single lock -+protecting it, which would lead to increasing lock contention as the number of -+CPUs rose and appeared to limit scalability of common workloads beyond 16 -+logical CPUs. Additionally, the O(n) lookup of the runqueue list obviously -+increased overhead proportionate to the number of queued processes and led to -+cache thrashing while iterating over the linked list. -+ -+MuQSS is an evolution of BFS, designed to maintain the same scheduling -+decision mechanism and be virtually deterministic without relying on the -+constrained design of the single runqueue by splitting out the single runqueue -+to be per-CPU and use skiplists instead of linked lists. -+ -+The original reason for going back to a single runqueue design for BFS was that -+once multiple runqueues are introduced, per-CPU or otherwise, there will be -+complex interactions as each runqueue will be responsible for the scheduling -+latency and fairness of the tasks only on its own runqueue, and to achieve -+fairness and low latency across multiple CPUs, any advantage in throughput of -+having CPU local tasks causes other disadvantages. This is due to requiring a -+very complex balancing system to at best achieve some semblance of fairness -+across CPUs and can only maintain relatively low latency for tasks bound to the -+same CPUs, not across them. To increase said fairness and latency across CPUs, -+the advantage of local runqueue locking, which makes for better scalability, is -+lost due to having to grab multiple locks.
-+ -+MuQSS works around the problems inherent in multiple runqueue designs by -+making its skip lists priority ordered and through novel use of lockless -+examination of each other runqueue it can decide if it should take the earliest -+deadline task from another runqueue for latency reasons, or for CPU balancing -+reasons. It still does not have a balancing system, choosing to allow the -+next task scheduling decision and task wakeup CPU choice to allow balancing to -+happen by virtue of its choices. -+ -+As a further evolution of the design, MuQSS normally configures sharing of -+runqueues in a logical fashion for when CPU resources are shared for improved -+latency and throughput. By default it shares runqueues and locks between -+multicore siblings. Optionally it can be configured to run with sharing of -+SMT siblings only, all SMP packages or no sharing at all. Additionally it can -+be selected at boot time. -+ -+ -+Design details. -+ -+Custom skip list implementation: -+ -+To avoid the overhead of building up and tearing down skip list structures, -+the variant used by MuQSS has a number of optimisations making it specific for -+its use case in the scheduler. It uses static arrays of 8 'levels' instead of -+building up and tearing down structures dynamically. This makes each runqueue -+only scale O(log N) up to 64k tasks. However as there is one runqueue per CPU -+it means that it scales O(log N) up to 64k x number of logical CPUs which is -+far beyond the realistic task limits each CPU could handle. By being 8 levels -+it also makes the array exactly one cacheline in size. Additionally, each -+skip list node is bidirectional making insertion and removal amortised O(1), -+being O(k) where k is 1-8. Uniquely, we are only ever interested in the very -+first entry in each list at all times with MuQSS, so there is never a need to -+do a search and thus look up is always O(1). 
In interactive mode, the queues -+will be searched beyond their first entry if the first task is not suitable -+for affinity or SMT nice reasons. -+ -+Task insertion: -+ -+MuQSS inserts tasks into a per CPU runqueue as an O(log N) insertion into -+a custom skip list as described above (based on the original design by William -+Pugh). Insertion is ordered in such a way that there is never a need to do a -+search by ordering tasks according to static priority primarily, and then -+virtual deadline at the time of insertion. -+ -+Niffies: -+ -+Niffies are a monotonic forward moving timer not unlike the "jiffies" but are -+of nanosecond resolution. Niffies are calculated per-runqueue from the high -+resolution TSC timers, and in order to maintain fairness are synchronised -+between CPUs whenever both runqueues are locked concurrently. -+ -+Virtual deadline: -+ -+The key to achieving low latency, scheduling fairness, and "nice level" -+distribution in MuQSS is entirely in the virtual deadline mechanism. The one -+tunable in MuQSS is the rr_interval, or "round robin interval". This is the -+maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy) -+tasks of the same nice level will be running for, or looking at it the other -+way around, the longest duration two tasks of the same nice level will be -+delayed for. When a task requests cpu time, it is given a quota (time_slice) -+equal to the rr_interval and a virtual deadline. The virtual deadline is -+offset from the current time in niffies by this equation: -+ -+ niffies + (prio_ratio * rr_interval) -+ -+The prio_ratio is determined as a ratio compared to the baseline of nice -20 -+and increases by 10% per nice level. The deadline is a virtual one only in that -+no guarantee is placed that a task will actually be scheduled by this time, but -+it is used to compare which task should go next. There are three components to -+how a task is next chosen. First is time_slice expiration. 
If a task runs out -+of its time_slice, it is descheduled, the time_slice is refilled, and the -+deadline reset to that formula above. Second is sleep, where a task no longer -+is requesting CPU for whatever reason. The time_slice and deadline are _not_ -+adjusted in this case and are just carried over for when the task is next -+scheduled. Third is preemption, and that is when a newly waking task is deemed -+higher priority than a currently running task on any cpu by virtue of the fact -+that it has an earlier virtual deadline than the currently running task. The -+earlier deadline is the key to which task is next chosen for the first and -+second cases. -+ -+The CPU proportion of different nice tasks works out to be approximately the -+ -+ (prio_ratio difference)^2 -+ -+The reason it is squared is that a task's deadline does not change while it is -+running unless it runs out of time_slice. Thus, even if the time actually -+passes the deadline of another task that is queued, it will not get CPU time -+unless the current running task deschedules, and the time "base" (niffies) is -+constantly moving. -+ -+Task lookup: -+ -+As tasks are already pre-ordered according to anticipated scheduling order in -+the skip lists, lookup for the next suitable task per-runqueue is always a -+matter of simply selecting the first task in the 0th level skip list entry. -+In order to maintain optimal latency and fairness across CPUs, MuQSS does a -+novel examination of every other runqueue in cache locality order, choosing the -+best task across all runqueues. This provides near-determinism of how long any -+task across the entire system may wait before receiving CPU time. The other -+runqueues are first examined lockless and then trylocked to minimise the -+potential lock contention if they are likely to have a suitable better task. -+Each other runqueue lock is only held for as long as it takes to examine the -+entry for suitability. 
In "interactive" mode, the default setting, MuQSS will -+look for the best deadline task across all CPUs, while in !interactive mode, -+it will only select a better deadline task from another CPU if it is more -+heavily laden than the current one. -+ -+Lookup is therefore O(k) where k is number of CPUs. -+ -+ -+Latency. -+ -+Through the use of virtual deadlines to govern the scheduling order of normal -+tasks, queue-to-activation latency per runqueue is guaranteed to be bound by -+the rr_interval tunable which is set to 6ms by default. This means that the -+longest a CPU bound task will wait for more CPU is proportional to the number -+of running tasks and in the common case of 0-2 running tasks per CPU, will be -+under the 7ms threshold for human perception of jitter. Additionally, as newly -+woken tasks will have an early deadline from their previous runtime, the very -+tasks that are usually latency sensitive will have the shortest interval for -+activation, usually preempting any existing CPU bound tasks. -+ -+Tickless expiry: -+ -+A feature of MuQSS is that it is not tied to the resolution of the chosen tick -+rate in Hz, instead depending entirely on the high resolution timers where -+possible for sub-millisecond accuracy on timeouts regardless of the underlying -+tick rate. This allows MuQSS to be run with the low overhead of low Hz rates -+such as 100 by default, benefiting from the improved throughput and lower -+power usage it provides. Another advantage of this approach is that in -+combination with the Full No HZ option, which disables ticks on running task -+CPUs instead of just idle CPUs, the tick can be disabled at all times -+regardless of how many tasks are running instead of being limited to just one -+running task. Note that this option is NOT recommended for regular desktop -+users. -+ -+ -+Scalability and balancing. 
-+ -+Unlike traditional approaches where balancing is a combination of CPU selection -+at task wakeup and intermittent balancing based on a vast array of rules set -+according to architecture, busyness calculations and special case management, -+MuQSS indirectly balances on the fly at task wakeup and next task selection. -+During initialisation, MuQSS creates a cache coherency ordered list of CPUs for -+each logical CPU and uses this to aid task/CPU selection when CPUs are busy. -+Additionally it selects any idle CPUs, if they are available, at any time over -+busy CPUs according to the following preference: -+ -+ * Same thread, idle or busy cache, idle or busy threads -+ * Other core, same cache, idle or busy cache, idle threads. -+ * Same node, other CPU, idle cache, idle threads. -+ * Same node, other CPU, busy cache, idle threads. -+ * Other core, same cache, busy threads. -+ * Same node, other CPU, busy threads. -+ * Other node, other CPU, idle cache, idle threads. -+ * Other node, other CPU, busy cache, idle threads. -+ * Other node, other CPU, busy threads. -+ -+Mux is therefore SMT, MC and Numa aware without the need for extra -+intermittent balancing to maintain CPUs busy and make the most of cache -+coherency. -+ -+ -+Features -+ -+As the initial prime target audience for MuQSS was the average desktop user, it -+was designed to not need tweaking, tuning or have features set to obtain benefit -+from it. Thus the number of knobs and features has been kept to an absolute -+minimum and should not require extra user input for the vast majority of cases. -+There are 3 optional tunables, and 2 extra scheduling policies. The rr_interval, -+interactive, and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO -+policies. In addition to this, MuQSS also uses sub-tick accounting. What MuQSS -+does _not_ now feature is support for CGROUPS. 
The average user should neither -+need to know what these are, nor should they need to be using them to have good -+desktop behaviour. However since some applications refuse to work without -+cgroups, one can enable them with MuQSS as a stub and the filesystem will be -+created which will allow the applications to work. -+ -+rr_interval: -+ -+ /proc/sys/kernel/rr_interval -+ -+The value is in milliseconds, and the default value is set to 6. Valid values -+are from 1 to 1000. Decreasing the value will decrease latencies at the cost of -+decreasing throughput, while increasing it will improve throughput, but at the -+cost of worsening latencies. It is based on the fact that humans can detect -+jitter at approximately 7ms, so aiming for much lower latencies is pointless -+under most circumstances. It is worth noting this fact when comparing the -+latency performance of MuQSS to other schedulers. Worst case latencies being -+higher than 7ms are far worse than average latencies not being in the -+microsecond range. -+ -+interactive: -+ -+ /proc/sys/kernel/interactive -+ -+The value is a simple boolean of 1 for on and 0 for off and is set to on by -+default. Disabling this will disable the near-determinism of MuQSS when -+selecting the next task by not examining all CPUs for the earliest deadline -+task, or which CPU to wake to, instead prioritising CPU balancing for improved -+throughput. Latency will still be bound by rr_interval, but on a per-CPU basis -+instead of across the whole system. -+ -+Runqueue sharing. -+ -+By default MuQSS chooses to share runqueue resources (specifically the skip -+list and locking) between multicore siblings. It is configurable at build time -+to select between None, SMT, MC and SMP, corresponding to no sharing, sharing -+only between simultaneous multithreading siblings, multicore siblings, or -+symmetric multiprocessing physical packages. Additionally it can be set at -+boot time with the use of the rqshare parameter. 
The reason for configurability -+is that some architectures have CPUs with many multicore siblings (>= 16) -+where it may be detrimental to throughput to share runqueues and another -+sharing option may be desirable. Additionally, more sharing than usual can -+improve latency on a system-wide level at the expense of throughput if desired. -+ -+The options are: -+none, smt, mc, smp -+ -+eg: -+ rqshare=mc -+ -+Isochronous scheduling: -+ -+Isochronous scheduling is a unique scheduling policy designed to provide -+near-real-time performance to unprivileged (ie non-root) users without the -+ability to starve the machine indefinitely. Isochronous tasks (which means -+"same time") are set using, for example, the schedtool application like so: -+ -+ schedtool -I -e amarok -+ -+This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works -+is that it has a priority level between true realtime tasks and SCHED_NORMAL -+which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie, -+if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval -+rate). However if ISO tasks run for more than a tunable finite amount of time, -+they are then demoted back to SCHED_NORMAL scheduling. This finite amount of -+time is the percentage of CPU available per CPU, configurable as a percentage in -+the following "resource handling" tunable (as opposed to a scheduler tunable): -+ -+iso_cpu: -+ -+ /proc/sys/kernel/iso_cpu -+ -+and is set to 70% by default. It is calculated over a rolling 5 second average. -+Because it is the total CPU available, it means that on a multi CPU machine, it -+is possible to have an ISO task running as realtime scheduling indefinitely on -+just one CPU, as the other CPUs will be available. Setting this to 100 is the -+equivalent of giving all users SCHED_RR access and setting it to 0 removes the -+ability to run any pseudo-realtime tasks. 
-+ -+A feature of MuQSS is that it detects when an application tries to obtain a -+realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the -+appropriate privileges to use those policies. When it detects this, it will -+give the task SCHED_ISO policy instead. Thus it is transparent to the user. -+ -+ -+Idleprio scheduling: -+ -+Idleprio scheduling is a scheduling policy designed to give out CPU to a task -+_only_ when the CPU would be otherwise idle. The idea behind this is to allow -+ultra low priority tasks to be run in the background that have virtually no -+effect on the foreground tasks. This is ideally suited to distributed computing -+clients (like setiathome, folding, mprime etc) but can also be used to start a -+video encode or so on without any slowdown of other tasks. To avoid this policy -+from grabbing shared resources and holding them indefinitely, if it detects a -+state where the task is waiting on I/O, the machine is about to suspend to ram -+and so on, it will transiently schedule them as SCHED_NORMAL. Once a task has -+been scheduled as IDLEPRIO, it cannot be put back to SCHED_NORMAL without -+superuser privileges since it is effectively a lower scheduling policy. Tasks -+can be set to start as SCHED_IDLEPRIO with the schedtool command like so: -+ -+schedtool -D -e ./mprime -+ -+Subtick accounting: -+ -+It is surprisingly difficult to get accurate CPU accounting, and in many cases, -+the accounting is done by simply determining what is happening at the precise -+moment a timer tick fires off. This becomes increasingly inaccurate as the timer -+tick frequency (HZ) is lowered. It is possible to create an application which -+uses almost 100% CPU, yet by being descheduled at the right time, records zero -+CPU usage. While the main problem with this is that there are possible security -+implications, it is also difficult to determine how much CPU a task really does -+use. 
Mux uses sub-tick accounting from the TSC clock to determine real CPU -+usage. Thus, the amount of CPU reported as being used by MuQSS will more -+accurately represent how much CPU the task itself is using (as is shown for -+example by the 'time' application), so the reported values may be quite -+different to other schedulers. When comparing throughput of MuQSS to other -+designs, it is important to compare the actual completed work in terms of total -+wall clock time taken and total work done, rather than the reported "cpu usage". -+ -+Symmetric MultiThreading (SMT) aware nice: -+ -+SMT, a.k.a. hyperthreading, is a very common feature on modern CPUs. While the -+logical CPU count rises by adding thread units to each CPU core, allowing more -+than one task to be run simultaneously on the same core, the disadvantage of it -+is that the CPU power is shared between the tasks, not summating to the power -+of two CPUs. The practical upshot of this is that two tasks running on -+separate threads of the same core run significantly slower than if they had one -+core each to run on. While smart CPU selection allows each task to have a core -+to itself whenever available (as is done on MuQSS), it cannot offset the -+slowdown that occurs when the cores are all loaded and only a thread is left. -+Most of the time this is harmless as the CPU is effectively overloaded at this -+point and the extra thread is of benefit. However when running a niced task in -+the presence of an un-niced task (say nice 19 v nice 0), the nice task gets -+precisely the same amount of CPU power as the unniced one. MuQSS has an -+optional configuration feature known as SMT-NICE which selectively idles the -+secondary niced thread for a period proportional to the nice difference, -+allowing CPU distribution according to nice level to be maintained, at the -+expense of a small amount of extra overhead. If this is configured in on a -+machine without SMT threads, the overhead is minimal. 
-+ -+ -+Con Kolivas Sat, 29th October 2016 -diff --git a/Makefile b/Makefile -index 51540b291738..ab8c480660a6 100644 ---- a/Makefile -+++ b/Makefile -@@ -18,6 +18,10 @@ $(if $(filter __%, $(MAKECMDGOALS)), \ - PHONY := __all - __all: - -+CKVERSION = -ck1 -+CKNAME = MuQSS Powered -+EXTRAVERSION := $(EXTRAVERSION)$(CKVERSION) -+ - # We are using a recursive build, so we need to do a little thinking - # to get the ordering right. - # -diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig -index 9c5f06e8eb9b..0d1069eee09c 100644 ---- a/arch/alpha/Kconfig -+++ b/arch/alpha/Kconfig -@@ -666,6 +666,8 @@ config HZ - default 1200 if HZ_1200 - default 1024 - -+source "kernel/Kconfig.MuQSS" -+ - config SRM_ENV - tristate "SRM environment through procfs" - depends on PROC_FS -diff --git a/arch/arc/configs/tb10x_defconfig b/arch/arc/configs/tb10x_defconfig -index a12656ec0072..b46b6ddc7636 100644 ---- a/arch/arc/configs/tb10x_defconfig -+++ b/arch/arc/configs/tb10x_defconfig -@@ -29,7 +29,7 @@ CONFIG_ARC_PLAT_TB10X=y - CONFIG_ARC_CACHE_LINE_SHIFT=5 - CONFIG_HZ=250 - CONFIG_ARC_BUILTIN_DTB_NAME="abilis_tb100_dvk" --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - # CONFIG_COMPACTION is not set - CONFIG_NET=y - CONFIG_PACKET=y -diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig -index e00d94b16658..efabbd09475a 100644 ---- a/arch/arm/Kconfig -+++ b/arch/arm/Kconfig -@@ -1236,6 +1236,8 @@ config SCHED_SMT - MultiThreading at a cost of slightly increased overhead in some - places. If unsure say N here. 
- -+source "kernel/Kconfig.MuQSS" -+ - config HAVE_ARM_SCU - bool - help -diff --git a/arch/arm/configs/bcm2835_defconfig b/arch/arm/configs/bcm2835_defconfig -index 44ff9cd88d81..9c639c998015 100644 ---- a/arch/arm/configs/bcm2835_defconfig -+++ b/arch/arm/configs/bcm2835_defconfig -@@ -29,7 +29,7 @@ CONFIG_MODULE_UNLOAD=y - CONFIG_ARCH_MULTI_V6=y - CONFIG_ARCH_BCM=y - CONFIG_ARCH_BCM2835=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_AEABI=y - CONFIG_KSM=y - CONFIG_CLEANCACHE=y -diff --git a/arch/arm/configs/imx_v6_v7_defconfig b/arch/arm/configs/imx_v6_v7_defconfig -index 82d3ffb18e70..bb05667427a6 100644 ---- a/arch/arm/configs/imx_v6_v7_defconfig -+++ b/arch/arm/configs/imx_v6_v7_defconfig -@@ -45,6 +45,7 @@ CONFIG_PCI_MSI=y - CONFIG_PCI_IMX6=y - CONFIG_SMP=y - CONFIG_ARM_PSCI=y -+CONFIG_PREEMPT=y - CONFIG_HIGHMEM=y - CONFIG_FORCE_MAX_ZONEORDER=14 - CONFIG_CMDLINE="noinitrd console=ttymxc0,115200" -diff --git a/arch/arm/configs/mps2_defconfig b/arch/arm/configs/mps2_defconfig -index 1d923dbb9928..9c1931f1fafd 100644 ---- a/arch/arm/configs/mps2_defconfig -+++ b/arch/arm/configs/mps2_defconfig -@@ -18,7 +18,7 @@ CONFIG_ARCH_MPS2=y - CONFIG_SET_MEM_PARAM=y - CONFIG_DRAM_BASE=0x21000000 - CONFIG_DRAM_SIZE=0x1000000 --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - # CONFIG_ATAGS is not set - CONFIG_ZBOOT_ROM_TEXT=0x0 - CONFIG_ZBOOT_ROM_BSS=0x0 -diff --git a/arch/arm/configs/mxs_defconfig b/arch/arm/configs/mxs_defconfig -index a9c6f32a9b1c..870866aaa39d 100644 ---- a/arch/arm/configs/mxs_defconfig -+++ b/arch/arm/configs/mxs_defconfig -@@ -1,7 +1,7 @@ - CONFIG_SYSVIPC=y - CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT_VOLUNTARY=n - CONFIG_TASKSTATS=y - CONFIG_TASK_DELAY_ACCT=y - CONFIG_TASK_XACCT=y -@@ -25,6 +25,13 @@ CONFIG_MODULE_UNLOAD=y - CONFIG_MODULE_FORCE_UNLOAD=y - CONFIG_MODVERSIONS=y - CONFIG_BLK_DEV_INTEGRITY=y -+# CONFIG_IOSCHED_DEADLINE is not set -+# CONFIG_IOSCHED_CFQ is not set -+# 
CONFIG_ARCH_MULTI_V7 is not set -+CONFIG_ARCH_MXS=y -+# CONFIG_ARM_THUMB is not set -+CONFIG_PREEMPT=y -+CONFIG_AEABI=y - CONFIG_NET=y - CONFIG_PACKET=y - CONFIG_UNIX=y -diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig -index 6d232837cbee..052cae73d674 100644 ---- a/arch/arm64/Kconfig -+++ b/arch/arm64/Kconfig -@@ -945,6 +945,8 @@ config SCHED_SMT - MultiThreading at a cost of slightly increased overhead in some - places. If unsure say N here. - -+source "kernel/Kconfig.MuQSS" -+ - config NR_CPUS - int "Maximum number of CPUs (2-4096)" - range 2 4096 -diff --git a/arch/mips/configs/fuloong2e_defconfig b/arch/mips/configs/fuloong2e_defconfig -index 023b4e644b1c..013e630b96a6 100644 ---- a/arch/mips/configs/fuloong2e_defconfig -+++ b/arch/mips/configs/fuloong2e_defconfig -@@ -4,7 +4,7 @@ CONFIG_SYSVIPC=y - CONFIG_POSIX_MQUEUE=y - CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_BSD_PROCESS_ACCT=y - CONFIG_IKCONFIG=y - CONFIG_IKCONFIG_PROC=y -diff --git a/arch/mips/configs/gpr_defconfig b/arch/mips/configs/gpr_defconfig -index 9085f4d6c698..fb23111d45f6 100644 ---- a/arch/mips/configs/gpr_defconfig -+++ b/arch/mips/configs/gpr_defconfig -@@ -1,8 +1,8 @@ -+CONFIG_PREEMPT=y - # CONFIG_LOCALVERSION_AUTO is not set - CONFIG_SYSVIPC=y - CONFIG_POSIX_MQUEUE=y - CONFIG_HIGH_RES_TIMERS=y --CONFIG_PREEMPT_VOLUNTARY=y - CONFIG_BSD_PROCESS_ACCT=y - CONFIG_BSD_PROCESS_ACCT_V3=y - CONFIG_RELAY=y -diff --git a/arch/mips/configs/ip22_defconfig b/arch/mips/configs/ip22_defconfig -index 21a1168ae301..529a1b1007cf 100644 ---- a/arch/mips/configs/ip22_defconfig -+++ b/arch/mips/configs/ip22_defconfig -@@ -1,7 +1,7 @@ - CONFIG_SYSVIPC=y - CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_IKCONFIG=y - CONFIG_IKCONFIG_PROC=y - CONFIG_LOG_BUF_SHIFT=14 -diff --git a/arch/mips/configs/ip28_defconfig b/arch/mips/configs/ip28_defconfig -index 0921ef38e9fb..6da05cef46f8 100644 ---- 
a/arch/mips/configs/ip28_defconfig -+++ b/arch/mips/configs/ip28_defconfig -@@ -1,5 +1,5 @@ - CONFIG_SYSVIPC=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_IKCONFIG=y - CONFIG_IKCONFIG_PROC=y - CONFIG_LOG_BUF_SHIFT=14 -diff --git a/arch/mips/configs/jazz_defconfig b/arch/mips/configs/jazz_defconfig -index 8c223035921f..a3bf87450343 100644 ---- a/arch/mips/configs/jazz_defconfig -+++ b/arch/mips/configs/jazz_defconfig -@@ -1,8 +1,8 @@ -+CONFIG_PREEMPT=y - CONFIG_SYSVIPC=y - CONFIG_POSIX_MQUEUE=y - CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y --CONFIG_PREEMPT_VOLUNTARY=y - CONFIG_BSD_PROCESS_ACCT=y - CONFIG_LOG_BUF_SHIFT=14 - CONFIG_RELAY=y -diff --git a/arch/mips/configs/mtx1_defconfig b/arch/mips/configs/mtx1_defconfig -index 914af125a7fa..76a64290373f 100644 ---- a/arch/mips/configs/mtx1_defconfig -+++ b/arch/mips/configs/mtx1_defconfig -@@ -1,8 +1,8 @@ -+CONFIG_PREEMPT=y - # CONFIG_LOCALVERSION_AUTO is not set - CONFIG_SYSVIPC=y - CONFIG_POSIX_MQUEUE=y - CONFIG_AUDIT=y --CONFIG_PREEMPT_VOLUNTARY=y - CONFIG_BSD_PROCESS_ACCT=y - CONFIG_BSD_PROCESS_ACCT_V3=y - CONFIG_RELAY=y -diff --git a/arch/mips/configs/nlm_xlr_defconfig b/arch/mips/configs/nlm_xlr_defconfig -index 4ecb157e56d4..ea7309283b01 100644 ---- a/arch/mips/configs/nlm_xlr_defconfig -+++ b/arch/mips/configs/nlm_xlr_defconfig -@@ -1,10 +1,10 @@ -+CONFIG_PREEMPT=y - # CONFIG_LOCALVERSION_AUTO is not set - CONFIG_SYSVIPC=y - CONFIG_POSIX_MQUEUE=y - CONFIG_AUDIT=y - CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y --CONFIG_PREEMPT_VOLUNTARY=y - CONFIG_BSD_PROCESS_ACCT=y - CONFIG_BSD_PROCESS_ACCT_V3=y - CONFIG_TASKSTATS=y -diff --git a/arch/mips/configs/pic32mzda_defconfig b/arch/mips/configs/pic32mzda_defconfig -index 63fe2da1b37f..7f08ee237345 100644 ---- a/arch/mips/configs/pic32mzda_defconfig -+++ b/arch/mips/configs/pic32mzda_defconfig -@@ -1,7 +1,7 @@ -+CONFIG_PREEMPT=y - CONFIG_SYSVIPC=y - CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y --CONFIG_PREEMPT_VOLUNTARY=y - CONFIG_IKCONFIG=y - 
CONFIG_IKCONFIG_PROC=y - CONFIG_LOG_BUF_SHIFT=14 -diff --git a/arch/mips/configs/pistachio_defconfig b/arch/mips/configs/pistachio_defconfig -index b9adf15ebbec..0025b56dc300 100644 ---- a/arch/mips/configs/pistachio_defconfig -+++ b/arch/mips/configs/pistachio_defconfig -@@ -1,9 +1,9 @@ -+CONFIG_PREEMPT=y - # CONFIG_LOCALVERSION_AUTO is not set - CONFIG_DEFAULT_HOSTNAME="localhost" - CONFIG_SYSVIPC=y - CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y --CONFIG_PREEMPT_VOLUNTARY=y - CONFIG_IKCONFIG=m - CONFIG_IKCONFIG_PROC=y - CONFIG_LOG_BUF_SHIFT=18 -diff --git a/arch/mips/configs/rm200_defconfig b/arch/mips/configs/rm200_defconfig -index 30d7c3db884e..9e68acfa0d0e 100644 ---- a/arch/mips/configs/rm200_defconfig -+++ b/arch/mips/configs/rm200_defconfig -@@ -1,6 +1,6 @@ -+CONFIG_PREEMPT=y - CONFIG_SYSVIPC=y - CONFIG_POSIX_MQUEUE=y --CONFIG_PREEMPT_VOLUNTARY=y - CONFIG_BSD_PROCESS_ACCT=y - CONFIG_IKCONFIG=y - CONFIG_IKCONFIG_PROC=y -diff --git a/arch/parisc/configs/712_defconfig b/arch/parisc/configs/712_defconfig -new file mode 100644 -index 000000000000..578524f80cc4 ---- /dev/null -+++ b/arch/parisc/configs/712_defconfig -@@ -0,0 +1,181 @@ -+# CONFIG_LOCALVERSION_AUTO is not set -+CONFIG_SYSVIPC=y -+CONFIG_POSIX_MQUEUE=y -+CONFIG_IKCONFIG=y -+CONFIG_IKCONFIG_PROC=y -+CONFIG_LOG_BUF_SHIFT=16 -+CONFIG_BLK_DEV_INITRD=y -+CONFIG_KALLSYMS_ALL=y -+CONFIG_SLAB=y -+CONFIG_PROFILING=y -+CONFIG_OPROFILE=m -+CONFIG_MODULES=y -+CONFIG_MODULE_UNLOAD=y -+CONFIG_MODULE_FORCE_UNLOAD=y -+CONFIG_PA7100LC=y -+CONFIG_PREEMPT=y -+CONFIG_GSC_LASI=y -+# CONFIG_PDC_CHASSIS is not set -+CONFIG_BINFMT_MISC=m -+CONFIG_NET=y -+CONFIG_PACKET=y -+CONFIG_UNIX=y -+CONFIG_XFRM_USER=m -+CONFIG_NET_KEY=m -+CONFIG_INET=y -+CONFIG_IP_MULTICAST=y -+CONFIG_IP_PNP=y -+CONFIG_IP_PNP_DHCP=y -+CONFIG_IP_PNP_BOOTP=y -+CONFIG_INET_AH=m -+CONFIG_INET_ESP=m -+CONFIG_INET_DIAG=m -+# CONFIG_IPV6 is not set -+CONFIG_NETFILTER=y -+CONFIG_LLC2=m -+CONFIG_NET_PKTGEN=m -+CONFIG_DEVTMPFS=y -+CONFIG_DEVTMPFS_MOUNT=y -+# 
CONFIG_STANDALONE is not set -+# CONFIG_PREVENT_FIRMWARE_BUILD is not set -+CONFIG_PARPORT=y -+CONFIG_PARPORT_PC=m -+CONFIG_BLK_DEV_LOOP=y -+CONFIG_BLK_DEV_CRYPTOLOOP=y -+CONFIG_BLK_DEV_RAM=y -+CONFIG_BLK_DEV_RAM_SIZE=6144 -+CONFIG_ATA_OVER_ETH=m -+CONFIG_SCSI=y -+CONFIG_BLK_DEV_SD=y -+CONFIG_CHR_DEV_ST=y -+CONFIG_BLK_DEV_SR=y -+CONFIG_CHR_DEV_SG=y -+CONFIG_SCSI_ISCSI_ATTRS=m -+CONFIG_SCSI_LASI700=y -+CONFIG_SCSI_DEBUG=m -+CONFIG_MD=y -+CONFIG_BLK_DEV_MD=m -+CONFIG_MD_LINEAR=m -+CONFIG_MD_RAID0=m -+CONFIG_MD_RAID1=m -+CONFIG_NETDEVICES=y -+CONFIG_BONDING=m -+CONFIG_DUMMY=m -+CONFIG_TUN=m -+CONFIG_LASI_82596=y -+CONFIG_PPP=m -+CONFIG_PPP_BSDCOMP=m -+CONFIG_PPP_DEFLATE=m -+CONFIG_PPP_MPPE=m -+CONFIG_PPPOE=m -+CONFIG_PPP_ASYNC=m -+CONFIG_PPP_SYNC_TTY=m -+# CONFIG_KEYBOARD_HIL_OLD is not set -+CONFIG_MOUSE_SERIAL=m -+CONFIG_LEGACY_PTY_COUNT=64 -+CONFIG_SERIAL_8250=y -+CONFIG_SERIAL_8250_CONSOLE=y -+CONFIG_SERIAL_8250_NR_UARTS=17 -+CONFIG_SERIAL_8250_EXTENDED=y -+CONFIG_SERIAL_8250_MANY_PORTS=y -+CONFIG_SERIAL_8250_SHARE_IRQ=y -+# CONFIG_SERIAL_MUX is not set -+CONFIG_PDC_CONSOLE=y -+CONFIG_PRINTER=m -+CONFIG_PPDEV=m -+# CONFIG_HW_RANDOM is not set -+CONFIG_RAW_DRIVER=y -+# CONFIG_HWMON is not set -+CONFIG_FB=y -+CONFIG_FB_MODE_HELPERS=y -+CONFIG_FB_TILEBLITTING=y -+CONFIG_DUMMY_CONSOLE_COLUMNS=128 -+CONFIG_DUMMY_CONSOLE_ROWS=48 -+CONFIG_FRAMEBUFFER_CONSOLE=y -+CONFIG_LOGO=y -+# CONFIG_LOGO_LINUX_MONO is not set -+# CONFIG_LOGO_LINUX_VGA16 is not set -+# CONFIG_LOGO_LINUX_CLUT224 is not set -+CONFIG_SOUND=y -+CONFIG_SND=y -+CONFIG_SND_SEQUENCER=y -+CONFIG_SND_HARMONY=y -+CONFIG_EXT2_FS=y -+CONFIG_EXT3_FS=y -+CONFIG_JFS_FS=m -+CONFIG_XFS_FS=m -+CONFIG_AUTOFS4_FS=y -+CONFIG_ISO9660_FS=y -+CONFIG_JOLIET=y -+CONFIG_UDF_FS=m -+CONFIG_MSDOS_FS=m -+CONFIG_VFAT_FS=m -+CONFIG_PROC_KCORE=y -+CONFIG_TMPFS=y -+CONFIG_UFS_FS=m -+CONFIG_NFS_FS=y -+CONFIG_NFS_V4=y -+CONFIG_ROOT_NFS=y -+CONFIG_NFSD=m -+CONFIG_NFSD_V4=y -+CONFIG_CIFS=m -+CONFIG_NLS_CODEPAGE_437=m 
-+CONFIG_NLS_CODEPAGE_737=m -+CONFIG_NLS_CODEPAGE_775=m -+CONFIG_NLS_CODEPAGE_850=m -+CONFIG_NLS_CODEPAGE_852=m -+CONFIG_NLS_CODEPAGE_855=m -+CONFIG_NLS_CODEPAGE_857=m -+CONFIG_NLS_CODEPAGE_860=m -+CONFIG_NLS_CODEPAGE_861=m -+CONFIG_NLS_CODEPAGE_862=m -+CONFIG_NLS_CODEPAGE_863=m -+CONFIG_NLS_CODEPAGE_864=m -+CONFIG_NLS_CODEPAGE_865=m -+CONFIG_NLS_CODEPAGE_866=m -+CONFIG_NLS_CODEPAGE_869=m -+CONFIG_NLS_CODEPAGE_936=m -+CONFIG_NLS_CODEPAGE_950=m -+CONFIG_NLS_CODEPAGE_932=m -+CONFIG_NLS_CODEPAGE_949=m -+CONFIG_NLS_CODEPAGE_874=m -+CONFIG_NLS_ISO8859_8=m -+CONFIG_NLS_CODEPAGE_1250=m -+CONFIG_NLS_CODEPAGE_1251=m -+CONFIG_NLS_ASCII=m -+CONFIG_NLS_ISO8859_1=m -+CONFIG_NLS_ISO8859_2=m -+CONFIG_NLS_ISO8859_3=m -+CONFIG_NLS_ISO8859_4=m -+CONFIG_NLS_ISO8859_5=m -+CONFIG_NLS_ISO8859_6=m -+CONFIG_NLS_ISO8859_7=m -+CONFIG_NLS_ISO8859_9=m -+CONFIG_NLS_ISO8859_13=m -+CONFIG_NLS_ISO8859_14=m -+CONFIG_NLS_ISO8859_15=m -+CONFIG_NLS_KOI8_R=m -+CONFIG_NLS_KOI8_U=m -+CONFIG_NLS_UTF8=m -+CONFIG_DEBUG_FS=y -+CONFIG_MAGIC_SYSRQ=y -+CONFIG_DEBUG_KERNEL=y -+CONFIG_DEBUG_MUTEXES=y -+CONFIG_CRYPTO_TEST=m -+CONFIG_CRYPTO_HMAC=y -+CONFIG_CRYPTO_MICHAEL_MIC=m -+CONFIG_CRYPTO_SHA512=m -+CONFIG_CRYPTO_TGR192=m -+CONFIG_CRYPTO_WP512=m -+CONFIG_CRYPTO_ANUBIS=m -+CONFIG_CRYPTO_BLOWFISH=m -+CONFIG_CRYPTO_CAST6=m -+CONFIG_CRYPTO_KHAZAD=m -+CONFIG_CRYPTO_SERPENT=m -+CONFIG_CRYPTO_TEA=m -+CONFIG_CRYPTO_TWOFISH=m -+CONFIG_CRYPTO_DEFLATE=m -+# CONFIG_CRYPTO_HW is not set -+CONFIG_FONTS=y -+CONFIG_FONT_8x8=y -+CONFIG_FONT_8x16=y -diff --git a/arch/parisc/configs/c3000_defconfig b/arch/parisc/configs/c3000_defconfig -new file mode 100644 -index 000000000000..d1bdfad94048 ---- /dev/null -+++ b/arch/parisc/configs/c3000_defconfig -@@ -0,0 +1,151 @@ -+# CONFIG_LOCALVERSION_AUTO is not set -+CONFIG_SYSVIPC=y -+CONFIG_IKCONFIG=y -+CONFIG_IKCONFIG_PROC=y -+CONFIG_LOG_BUF_SHIFT=16 -+CONFIG_BLK_DEV_INITRD=y -+CONFIG_EXPERT=y -+CONFIG_KALLSYMS_ALL=y -+CONFIG_SLAB=y -+CONFIG_PROFILING=y -+CONFIG_OPROFILE=m 
-+CONFIG_MODULES=y -+CONFIG_MODULE_UNLOAD=y -+CONFIG_MODULE_FORCE_UNLOAD=y -+CONFIG_PA8X00=y -+CONFIG_PREEMPT=y -+# CONFIG_GSC is not set -+CONFIG_PCI=y -+CONFIG_PCI_LBA=y -+# CONFIG_PDC_CHASSIS is not set -+CONFIG_NET=y -+CONFIG_PACKET=y -+CONFIG_UNIX=y -+CONFIG_XFRM_USER=m -+CONFIG_NET_KEY=m -+CONFIG_INET=y -+CONFIG_IP_MULTICAST=y -+CONFIG_IP_PNP=y -+CONFIG_IP_PNP_BOOTP=y -+# CONFIG_INET_DIAG is not set -+CONFIG_INET6_IPCOMP=m -+CONFIG_IPV6_TUNNEL=m -+CONFIG_NETFILTER=y -+CONFIG_NET_PKTGEN=m -+CONFIG_DEVTMPFS=y -+CONFIG_DEVTMPFS_MOUNT=y -+# CONFIG_STANDALONE is not set -+# CONFIG_PREVENT_FIRMWARE_BUILD is not set -+CONFIG_BLK_DEV_UMEM=m -+CONFIG_BLK_DEV_LOOP=y -+CONFIG_BLK_DEV_CRYPTOLOOP=m -+CONFIG_IDE=y -+CONFIG_BLK_DEV_IDECD=y -+CONFIG_BLK_DEV_NS87415=y -+CONFIG_SCSI=y -+CONFIG_BLK_DEV_SD=y -+CONFIG_CHR_DEV_ST=y -+CONFIG_BLK_DEV_SR=y -+CONFIG_CHR_DEV_SG=y -+CONFIG_SCSI_ISCSI_ATTRS=m -+CONFIG_SCSI_SYM53C8XX_2=y -+CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=0 -+CONFIG_SCSI_DEBUG=m -+CONFIG_MD=y -+CONFIG_BLK_DEV_MD=y -+CONFIG_MD_LINEAR=y -+CONFIG_MD_RAID0=y -+CONFIG_MD_RAID1=y -+CONFIG_BLK_DEV_DM=m -+CONFIG_DM_CRYPT=m -+CONFIG_DM_SNAPSHOT=m -+CONFIG_DM_MIRROR=m -+CONFIG_DM_ZERO=m -+CONFIG_DM_MULTIPATH=m -+CONFIG_FUSION=y -+CONFIG_FUSION_SPI=m -+CONFIG_FUSION_CTL=m -+CONFIG_NETDEVICES=y -+CONFIG_BONDING=m -+CONFIG_DUMMY=m -+CONFIG_TUN=m -+CONFIG_ACENIC=m -+CONFIG_TIGON3=m -+CONFIG_NET_TULIP=y -+CONFIG_DE2104X=m -+CONFIG_TULIP=y -+CONFIG_TULIP_MMIO=y -+CONFIG_E100=m -+CONFIG_E1000=m -+CONFIG_PPP=m -+CONFIG_PPP_BSDCOMP=m -+CONFIG_PPP_DEFLATE=m -+CONFIG_PPPOE=m -+CONFIG_PPP_ASYNC=m -+CONFIG_PPP_SYNC_TTY=m -+# CONFIG_KEYBOARD_ATKBD is not set -+# CONFIG_MOUSE_PS2 is not set -+CONFIG_SERIO=m -+CONFIG_SERIO_LIBPS2=m -+CONFIG_SERIAL_8250=y -+CONFIG_SERIAL_8250_CONSOLE=y -+CONFIG_SERIAL_8250_NR_UARTS=13 -+CONFIG_SERIAL_8250_EXTENDED=y -+CONFIG_SERIAL_8250_MANY_PORTS=y -+CONFIG_SERIAL_8250_SHARE_IRQ=y -+# CONFIG_HW_RANDOM is not set -+CONFIG_RAW_DRIVER=y -+# CONFIG_HWMON is 
not set -+CONFIG_FB=y -+CONFIG_FRAMEBUFFER_CONSOLE=y -+CONFIG_LOGO=y -+# CONFIG_LOGO_LINUX_MONO is not set -+# CONFIG_LOGO_LINUX_VGA16 is not set -+# CONFIG_LOGO_LINUX_CLUT224 is not set -+CONFIG_SOUND=y -+CONFIG_SND=y -+CONFIG_SND_SEQUENCER=y -+CONFIG_SND_AD1889=y -+CONFIG_USB_HIDDEV=y -+CONFIG_USB=y -+CONFIG_USB_OHCI_HCD=y -+CONFIG_USB_PRINTER=m -+CONFIG_USB_STORAGE=m -+CONFIG_USB_STORAGE_USBAT=m -+CONFIG_USB_STORAGE_SDDR09=m -+CONFIG_USB_STORAGE_SDDR55=m -+CONFIG_USB_STORAGE_JUMPSHOT=m -+CONFIG_USB_MDC800=m -+CONFIG_USB_MICROTEK=m -+CONFIG_USB_LEGOTOWER=m -+CONFIG_EXT2_FS=y -+CONFIG_EXT3_FS=y -+CONFIG_XFS_FS=m -+CONFIG_AUTOFS4_FS=y -+CONFIG_ISO9660_FS=y -+CONFIG_JOLIET=y -+CONFIG_MSDOS_FS=m -+CONFIG_VFAT_FS=m -+CONFIG_PROC_KCORE=y -+CONFIG_TMPFS=y -+CONFIG_NFS_FS=y -+CONFIG_ROOT_NFS=y -+CONFIG_NFSD=y -+CONFIG_NFSD_V3=y -+CONFIG_NLS_CODEPAGE_437=m -+CONFIG_NLS_CODEPAGE_850=m -+CONFIG_NLS_ASCII=m -+CONFIG_NLS_ISO8859_1=m -+CONFIG_NLS_ISO8859_15=m -+CONFIG_NLS_UTF8=m -+CONFIG_DEBUG_FS=y -+CONFIG_HEADERS_INSTALL=y -+CONFIG_HEADERS_CHECK=y -+CONFIG_MAGIC_SYSRQ=y -+CONFIG_DEBUG_MUTEXES=y -+# CONFIG_DEBUG_BUGVERBOSE is not set -+CONFIG_CRYPTO_TEST=m -+CONFIG_CRYPTO_MD5=m -+CONFIG_CRYPTO_BLOWFISH=m -+CONFIG_CRYPTO_DES=m -+# CONFIG_CRYPTO_HW is not set -diff --git a/arch/parisc/configs/defconfig b/arch/parisc/configs/defconfig -new file mode 100644 -index 000000000000..0d976614934c ---- /dev/null -+++ b/arch/parisc/configs/defconfig -@@ -0,0 +1,206 @@ -+# CONFIG_LOCALVERSION_AUTO is not set -+CONFIG_SYSVIPC=y -+CONFIG_POSIX_MQUEUE=y -+CONFIG_IKCONFIG=y -+CONFIG_IKCONFIG_PROC=y -+CONFIG_LOG_BUF_SHIFT=16 -+CONFIG_BLK_DEV_INITRD=y -+CONFIG_KALLSYMS_ALL=y -+CONFIG_SLAB=y -+CONFIG_PROFILING=y -+CONFIG_OPROFILE=m -+CONFIG_MODULES=y -+CONFIG_MODULE_UNLOAD=y -+CONFIG_MODULE_FORCE_UNLOAD=y -+# CONFIG_BLK_DEV_BSG is not set -+CONFIG_PA7100LC=y -+CONFIG_PREEMPT=y -+CONFIG_IOMMU_CCIO=y -+CONFIG_GSC_LASI=y -+CONFIG_GSC_WAX=y -+CONFIG_EISA=y -+CONFIG_PCI=y -+CONFIG_GSC_DINO=y 
-+CONFIG_PCI_LBA=y -+CONFIG_PCCARD=y -+CONFIG_YENTA=y -+CONFIG_PD6729=y -+CONFIG_I82092=y -+CONFIG_BINFMT_MISC=m -+CONFIG_NET=y -+CONFIG_PACKET=y -+CONFIG_UNIX=y -+CONFIG_XFRM_USER=m -+CONFIG_NET_KEY=m -+CONFIG_INET=y -+CONFIG_IP_MULTICAST=y -+CONFIG_IP_PNP=y -+CONFIG_IP_PNP_DHCP=y -+CONFIG_IP_PNP_BOOTP=y -+CONFIG_INET_AH=m -+CONFIG_INET_ESP=m -+CONFIG_INET_DIAG=m -+CONFIG_INET6_AH=y -+CONFIG_INET6_ESP=y -+CONFIG_INET6_IPCOMP=y -+CONFIG_LLC2=m -+CONFIG_DEVTMPFS=y -+CONFIG_DEVTMPFS_MOUNT=y -+# CONFIG_STANDALONE is not set -+# CONFIG_PREVENT_FIRMWARE_BUILD is not set -+CONFIG_PARPORT=y -+CONFIG_PARPORT_PC=m -+CONFIG_PARPORT_PC_PCMCIA=m -+CONFIG_PARPORT_1284=y -+CONFIG_BLK_DEV_LOOP=y -+CONFIG_BLK_DEV_CRYPTOLOOP=y -+CONFIG_BLK_DEV_RAM=y -+CONFIG_BLK_DEV_RAM_SIZE=6144 -+CONFIG_IDE=y -+CONFIG_BLK_DEV_IDECS=y -+CONFIG_BLK_DEV_IDECD=y -+CONFIG_BLK_DEV_GENERIC=y -+CONFIG_BLK_DEV_NS87415=y -+CONFIG_SCSI=y -+CONFIG_BLK_DEV_SD=y -+CONFIG_CHR_DEV_ST=y -+CONFIG_BLK_DEV_SR=y -+CONFIG_CHR_DEV_SG=y -+CONFIG_SCSI_LASI700=y -+CONFIG_SCSI_SYM53C8XX_2=y -+CONFIG_SCSI_ZALON=y -+CONFIG_MD=y -+CONFIG_BLK_DEV_MD=y -+CONFIG_MD_LINEAR=y -+CONFIG_MD_RAID0=y -+CONFIG_MD_RAID1=y -+CONFIG_MD_RAID10=y -+CONFIG_BLK_DEV_DM=y -+CONFIG_NETDEVICES=y -+CONFIG_BONDING=m -+CONFIG_DUMMY=m -+CONFIG_TUN=m -+CONFIG_ACENIC=y -+CONFIG_TIGON3=y -+CONFIG_NET_TULIP=y -+CONFIG_TULIP=y -+CONFIG_LASI_82596=y -+CONFIG_PPP=m -+CONFIG_PPP_BSDCOMP=m -+CONFIG_PPP_DEFLATE=m -+CONFIG_PPPOE=m -+CONFIG_PPP_ASYNC=m -+CONFIG_PPP_SYNC_TTY=m -+# CONFIG_KEYBOARD_HIL_OLD is not set -+CONFIG_MOUSE_SERIAL=y -+CONFIG_LEGACY_PTY_COUNT=64 -+CONFIG_SERIAL_8250=y -+CONFIG_SERIAL_8250_CONSOLE=y -+CONFIG_SERIAL_8250_CS=y -+CONFIG_SERIAL_8250_NR_UARTS=17 -+CONFIG_SERIAL_8250_EXTENDED=y -+CONFIG_SERIAL_8250_MANY_PORTS=y -+CONFIG_SERIAL_8250_SHARE_IRQ=y -+CONFIG_PRINTER=m -+CONFIG_PPDEV=m -+# CONFIG_HW_RANDOM is not set -+# CONFIG_HWMON is not set -+CONFIG_FB=y -+CONFIG_FB_MODE_HELPERS=y -+CONFIG_FB_TILEBLITTING=y 
-+CONFIG_DUMMY_CONSOLE_COLUMNS=128 -+CONFIG_DUMMY_CONSOLE_ROWS=48 -+CONFIG_FRAMEBUFFER_CONSOLE=y -+CONFIG_LOGO=y -+# CONFIG_LOGO_LINUX_MONO is not set -+# CONFIG_LOGO_LINUX_VGA16 is not set -+# CONFIG_LOGO_LINUX_CLUT224 is not set -+CONFIG_SOUND=y -+CONFIG_SND=y -+CONFIG_SND_DYNAMIC_MINORS=y -+CONFIG_SND_SEQUENCER=y -+CONFIG_SND_AD1889=y -+CONFIG_SND_HARMONY=y -+CONFIG_HID_GYRATION=y -+CONFIG_HID_NTRIG=y -+CONFIG_HID_PANTHERLORD=y -+CONFIG_HID_PETALYNX=y -+CONFIG_HID_SAMSUNG=y -+CONFIG_HID_SUNPLUS=y -+CONFIG_HID_TOPSEED=y -+CONFIG_USB=y -+CONFIG_USB_MON=y -+CONFIG_USB_OHCI_HCD=y -+CONFIG_USB_UHCI_HCD=y -+CONFIG_EXT2_FS=y -+CONFIG_EXT3_FS=y -+CONFIG_ISO9660_FS=y -+CONFIG_JOLIET=y -+CONFIG_VFAT_FS=y -+CONFIG_PROC_KCORE=y -+CONFIG_TMPFS=y -+CONFIG_NFS_FS=y -+CONFIG_ROOT_NFS=y -+CONFIG_NFSD=y -+CONFIG_NFSD_V4=y -+CONFIG_CIFS=m -+CONFIG_NLS_CODEPAGE_437=y -+CONFIG_NLS_CODEPAGE_737=m -+CONFIG_NLS_CODEPAGE_775=m -+CONFIG_NLS_CODEPAGE_850=m -+CONFIG_NLS_CODEPAGE_852=m -+CONFIG_NLS_CODEPAGE_855=m -+CONFIG_NLS_CODEPAGE_857=m -+CONFIG_NLS_CODEPAGE_860=m -+CONFIG_NLS_CODEPAGE_861=m -+CONFIG_NLS_CODEPAGE_862=m -+CONFIG_NLS_CODEPAGE_863=m -+CONFIG_NLS_CODEPAGE_864=m -+CONFIG_NLS_CODEPAGE_865=m -+CONFIG_NLS_CODEPAGE_866=m -+CONFIG_NLS_CODEPAGE_869=m -+CONFIG_NLS_CODEPAGE_936=m -+CONFIG_NLS_CODEPAGE_950=m -+CONFIG_NLS_CODEPAGE_932=m -+CONFIG_NLS_CODEPAGE_949=m -+CONFIG_NLS_CODEPAGE_874=m -+CONFIG_NLS_ISO8859_8=m -+CONFIG_NLS_CODEPAGE_1250=y -+CONFIG_NLS_CODEPAGE_1251=m -+CONFIG_NLS_ASCII=m -+CONFIG_NLS_ISO8859_1=y -+CONFIG_NLS_ISO8859_2=m -+CONFIG_NLS_ISO8859_3=m -+CONFIG_NLS_ISO8859_4=m -+CONFIG_NLS_ISO8859_5=m -+CONFIG_NLS_ISO8859_6=m -+CONFIG_NLS_ISO8859_7=m -+CONFIG_NLS_ISO8859_9=m -+CONFIG_NLS_ISO8859_13=m -+CONFIG_NLS_ISO8859_14=m -+CONFIG_NLS_ISO8859_15=m -+CONFIG_NLS_KOI8_R=m -+CONFIG_NLS_KOI8_U=m -+CONFIG_NLS_UTF8=y -+CONFIG_DEBUG_FS=y -+CONFIG_HEADERS_INSTALL=y -+CONFIG_HEADERS_CHECK=y -+CONFIG_MAGIC_SYSRQ=y -+CONFIG_DEBUG_KERNEL=y -+CONFIG_DEBUG_MUTEXES=y 
-+CONFIG_KEYS=y -+CONFIG_CRYPTO_TEST=m -+CONFIG_CRYPTO_MICHAEL_MIC=m -+CONFIG_CRYPTO_SHA512=m -+CONFIG_CRYPTO_TGR192=m -+CONFIG_CRYPTO_WP512=m -+CONFIG_CRYPTO_ANUBIS=m -+CONFIG_CRYPTO_BLOWFISH=m -+CONFIG_CRYPTO_CAST6=m -+CONFIG_CRYPTO_KHAZAD=m -+CONFIG_CRYPTO_SERPENT=m -+CONFIG_CRYPTO_TEA=m -+CONFIG_CRYPTO_TWOFISH=m -+# CONFIG_CRYPTO_HW is not set -+CONFIG_LIBCRC32C=m -+CONFIG_FONTS=y -diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig -index 787e829b6f25..22914bbb4caa 100644 ---- a/arch/powerpc/Kconfig -+++ b/arch/powerpc/Kconfig -@@ -882,6 +882,8 @@ config SCHED_SMT - when dealing with POWER5 cpus at a cost of slightly increased - overhead in some places. If unsure say N here. - -+source "kernel/Kconfig.MuQSS" -+ - config PPC_DENORMALISATION - bool "PowerPC denormalisation exception handling" - depends on PPC_BOOK3S_64 -diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig -index 66e9a0fd64ff..c8531232efb7 100644 ---- a/arch/powerpc/configs/ppc6xx_defconfig -+++ b/arch/powerpc/configs/ppc6xx_defconfig -@@ -73,7 +73,7 @@ CONFIG_QE_GPIO=y - CONFIG_MCU_MPC8349EMITX=y - CONFIG_HIGHMEM=y - CONFIG_HZ_1000=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_BINFMT_MISC=y - CONFIG_HIBERNATION=y - CONFIG_PM_DEBUG=y -diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c -index f18d5067cd0f..fe489fc01c73 100644 ---- a/arch/powerpc/platforms/cell/spufs/sched.c -+++ b/arch/powerpc/platforms/cell/spufs/sched.c -@@ -51,11 +51,6 @@ static struct task_struct *spusched_task; - static struct timer_list spusched_timer; - static struct timer_list spuloadavg_timer; - --/* -- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0). -- */ --#define NORMAL_PRIO 120 -- - /* - * Frequency of the spu scheduler tick. By default we do one SPU scheduler - * tick for every 10 CPU scheduler ticks. 
-diff --git a/arch/sh/configs/se7712_defconfig b/arch/sh/configs/se7712_defconfig -index ee6d28ae08de..827e4693c5b2 100644 ---- a/arch/sh/configs/se7712_defconfig -+++ b/arch/sh/configs/se7712_defconfig -@@ -21,7 +21,7 @@ CONFIG_FLATMEM_MANUAL=y - CONFIG_SH_SOLUTION_ENGINE=y - CONFIG_SH_PCLK_FREQ=66666666 - CONFIG_HEARTBEAT=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_CMDLINE_OVERWRITE=y - CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda1" - CONFIG_NET=y -diff --git a/arch/sh/configs/se7721_defconfig b/arch/sh/configs/se7721_defconfig -index bad921bc10f8..e8f42bc0d370 100644 ---- a/arch/sh/configs/se7721_defconfig -+++ b/arch/sh/configs/se7721_defconfig -@@ -21,7 +21,7 @@ CONFIG_FLATMEM_MANUAL=y - CONFIG_SH_7721_SOLUTION_ENGINE=y - CONFIG_SH_PCLK_FREQ=33333333 - CONFIG_HEARTBEAT=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_CMDLINE_OVERWRITE=y - CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda2" - CONFIG_NET=y -diff --git a/arch/sh/configs/titan_defconfig b/arch/sh/configs/titan_defconfig -index ba887f1351be..4434e93b70bc 100644 ---- a/arch/sh/configs/titan_defconfig -+++ b/arch/sh/configs/titan_defconfig -@@ -19,7 +19,7 @@ CONFIG_SH_TITAN=y - CONFIG_SH_PCLK_FREQ=30000000 - CONFIG_SH_DMA=y - CONFIG_SH_DMA_API=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_CMDLINE_OVERWRITE=y - CONFIG_CMDLINE="console=ttySC1,38400N81 root=/dev/nfs ip=:::::eth1:autoconf rw" - CONFIG_PCI=y -diff --git a/arch/sparc/configs/sparc64_defconfig b/arch/sparc/configs/sparc64_defconfig -index bde4d21a8ac8..c054ec82d91b 100644 ---- a/arch/sparc/configs/sparc64_defconfig -+++ b/arch/sparc/configs/sparc64_defconfig -@@ -22,7 +22,7 @@ CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y - CONFIG_NUMA=y - CONFIG_DEFAULT_MMAP_MIN_ADDR=8192 --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_SUN_LDOMS=y - CONFIG_PCI=y - CONFIG_PCI_MSI=y -diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig -index 7101ac64bb20..6f56ad1894d1 100644 ---- a/arch/x86/Kconfig -+++ 
b/arch/x86/Kconfig -@@ -1005,6 +1005,22 @@ config NR_CPUS - config SCHED_SMT - def_bool y if SMP - -+config SMT_NICE -+ bool "SMT (Hyperthreading) aware nice priority and policy support" -+ depends on SCHED_MUQSS && SCHED_SMT -+ default y -+ help -+ Enabling Hyperthreading on Intel CPUs decreases the effectiveness -+ of the use of 'nice' levels and different scheduling policies -+ (e.g. realtime) due to sharing of CPU power between hyperthreads. -+ SMT nice support makes each logical CPU aware of what is running on -+ its hyperthread siblings, maintaining appropriate distribution of -+ CPU according to nice levels and scheduling policies at the expense -+ of slightly increased overhead. -+ -+ If unsure say Y here. -+ -+ - config SCHED_MC - def_bool y - prompt "Multi-core scheduler support" -@@ -1035,6 +1051,8 @@ config SCHED_MC_PRIO - - If unsure say Y here. - -+source "kernel/Kconfig.MuQSS" -+ - config UP_LATE_INIT - def_bool y - depends on !SMP && X86_LOCAL_APIC -@@ -1419,7 +1437,7 @@ config HIGHMEM64G - endchoice - - choice -- prompt "Memory split" if EXPERT -+ prompt "Memory split" - default VMSPLIT_3G - depends on X86_32 - help -@@ -1439,17 +1457,17 @@ choice - option alone! 
- - config VMSPLIT_3G -- bool "3G/1G user/kernel split" -+ bool "Default 896MB lowmem (3G/1G user/kernel split)" - config VMSPLIT_3G_OPT - depends on !X86_PAE -- bool "3G/1G user/kernel split (for full 1G low memory)" -+ bool "1GB lowmem (3G/1G user/kernel split)" - config VMSPLIT_2G -- bool "2G/2G user/kernel split" -+ bool "2GB lowmem (2G/2G user/kernel split)" - config VMSPLIT_2G_OPT - depends on !X86_PAE -- bool "2G/2G user/kernel split (for full 2G low memory)" -+ bool "2GB lowmem (2G/2G user/kernel split)" - config VMSPLIT_1G -- bool "1G/3G user/kernel split" -+ bool "3GB lowmem (1G/3G user/kernel split)" - endchoice - - config PAGE_OFFSET -diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig -index 78210793d357..0c4415b23002 100644 ---- a/arch/x86/configs/i386_defconfig -+++ b/arch/x86/configs/i386_defconfig -@@ -23,6 +23,8 @@ CONFIG_PROFILING=y - CONFIG_SMP=y - CONFIG_X86_GENERIC=y - CONFIG_HPET_TIMER=y -+CONFIG_SCHED_SMT=y -+CONFIG_PREEMPT=y - CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y - CONFIG_X86_REBOOTFIXUPS=y - CONFIG_MICROCODE_AMD=y -diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig -index 9936528e1939..328c7d0a38a1 100644 ---- a/arch/x86/configs/x86_64_defconfig -+++ b/arch/x86/configs/x86_64_defconfig -@@ -20,6 +20,9 @@ CONFIG_BLK_DEV_INITRD=y - # CONFIG_COMPAT_BRK is not set - CONFIG_PROFILING=y - CONFIG_SMP=y -+CONFIG_NR_CPUS=64 -+CONFIG_SCHED_SMT=y -+CONFIG_PREEMPT=y - CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y - CONFIG_MICROCODE_AMD=y - CONFIG_X86_MSR=y -diff --git a/drivers/accessibility/speakup/speakup_acntpc.c b/drivers/accessibility/speakup/speakup_acntpc.c -index c94328a5bd4a..6e7d4671aa69 100644 ---- a/drivers/accessibility/speakup/speakup_acntpc.c -+++ b/drivers/accessibility/speakup/speakup_acntpc.c -@@ -198,7 +198,7 @@ static void do_catch_up(struct spk_synth *synth) - full_time_val = full_time->u.n.value; - spin_unlock_irqrestore(&speakup_info.spinlock, flags); - if 
(synth_full()) { -- schedule_timeout(msecs_to_jiffies(full_time_val)); -+ schedule_msec_hrtimeout((full_time_val)); - continue; - } - set_current_state(TASK_RUNNING); -@@ -226,7 +226,7 @@ static void do_catch_up(struct spk_synth *synth) - jiffy_delta_val = jiffy_delta->u.n.value; - delay_time_val = delay_time->u.n.value; - spin_unlock_irqrestore(&speakup_info.spinlock, flags); -- schedule_timeout(msecs_to_jiffies(delay_time_val)); -+ schedule_msec_hrtimeout(delay_time_val); - jiff_max = jiffies + jiffy_delta_val; - } - } -diff --git a/drivers/accessibility/speakup/speakup_apollo.c b/drivers/accessibility/speakup/speakup_apollo.c -index 0877b4044c28..627102d048c1 100644 ---- a/drivers/accessibility/speakup/speakup_apollo.c -+++ b/drivers/accessibility/speakup/speakup_apollo.c -@@ -165,7 +165,7 @@ static void do_catch_up(struct spk_synth *synth) - if (!synth->io_ops->synth_out(synth, ch)) { - synth->io_ops->tiocmset(0, UART_MCR_RTS); - synth->io_ops->tiocmset(UART_MCR_RTS, 0); -- schedule_timeout(msecs_to_jiffies(full_time_val)); -+ schedule_msec_hrtimeout(full_time_val); - continue; - } - if (time_after_eq(jiffies, jiff_max) && (ch == SPACE)) { -diff --git a/drivers/accessibility/speakup/speakup_decext.c b/drivers/accessibility/speakup/speakup_decext.c -index 7408eb29cf38..938a0c35968f 100644 ---- a/drivers/accessibility/speakup/speakup_decext.c -+++ b/drivers/accessibility/speakup/speakup_decext.c -@@ -180,7 +180,7 @@ static void do_catch_up(struct spk_synth *synth) - if (ch == '\n') - ch = 0x0D; - if (synth_full() || !synth->io_ops->synth_out(synth, ch)) { -- schedule_timeout(msecs_to_jiffies(delay_time_val)); -+ schedule_msec_hrtimeout(delay_time_val); - continue; - } - set_current_state(TASK_RUNNING); -diff --git a/drivers/accessibility/speakup/speakup_decpc.c b/drivers/accessibility/speakup/speakup_decpc.c -index 96f24c848cc5..1130dfe4da6c 100644 ---- a/drivers/accessibility/speakup/speakup_decpc.c -+++ b/drivers/accessibility/speakup/speakup_decpc.c -@@ -398,7 
+398,7 @@ static void do_catch_up(struct spk_synth *synth) - if (ch == '\n') - ch = 0x0D; - if (dt_sendchar(ch)) { -- schedule_timeout(msecs_to_jiffies(delay_time_val)); -+ schedule_msec_hrtimeout((delay_time_val)); - continue; - } - set_current_state(TASK_RUNNING); -diff --git a/drivers/accessibility/speakup/speakup_dectlk.c b/drivers/accessibility/speakup/speakup_dectlk.c -index 780214b5ca16..7b91594c57aa 100644 ---- a/drivers/accessibility/speakup/speakup_dectlk.c -+++ b/drivers/accessibility/speakup/speakup_dectlk.c -@@ -247,7 +247,7 @@ static void do_catch_up(struct spk_synth *synth) - if (ch == '\n') - ch = 0x0D; - if (synth_full_val || !synth->io_ops->synth_out(synth, ch)) { -- schedule_timeout(msecs_to_jiffies(delay_time_val)); -+ schedule_msec_hrtimeout(delay_time_val); - continue; - } - set_current_state(TASK_RUNNING); -diff --git a/drivers/accessibility/speakup/speakup_dtlk.c b/drivers/accessibility/speakup/speakup_dtlk.c -index dbebed0eeeec..6d83c13ca4a6 100644 ---- a/drivers/accessibility/speakup/speakup_dtlk.c -+++ b/drivers/accessibility/speakup/speakup_dtlk.c -@@ -211,7 +211,7 @@ static void do_catch_up(struct spk_synth *synth) - delay_time_val = delay_time->u.n.value; - spin_unlock_irqrestore(&speakup_info.spinlock, flags); - if (synth_full()) { -- schedule_timeout(msecs_to_jiffies(delay_time_val)); -+ schedule_msec_hrtimeout((delay_time_val)); - continue; - } - set_current_state(TASK_RUNNING); -@@ -227,7 +227,7 @@ static void do_catch_up(struct spk_synth *synth) - delay_time_val = delay_time->u.n.value; - jiffy_delta_val = jiffy_delta->u.n.value; - spin_unlock_irqrestore(&speakup_info.spinlock, flags); -- schedule_timeout(msecs_to_jiffies(delay_time_val)); -+ schedule_msec_hrtimeout((delay_time_val)); - jiff_max = jiffies + jiffy_delta_val; - } - } -diff --git a/drivers/accessibility/speakup/speakup_keypc.c b/drivers/accessibility/speakup/speakup_keypc.c -index 414827e888fc..cb31c9176daa 100644 ---- a/drivers/accessibility/speakup/speakup_keypc.c 
-+++ b/drivers/accessibility/speakup/speakup_keypc.c -@@ -199,7 +199,7 @@ static void do_catch_up(struct spk_synth *synth) - full_time_val = full_time->u.n.value; - spin_unlock_irqrestore(&speakup_info.spinlock, flags); - if (synth_full()) { -- schedule_timeout(msecs_to_jiffies(full_time_val)); -+ schedule_msec_hrtimeout((full_time_val)); - continue; - } - set_current_state(TASK_RUNNING); -@@ -232,7 +232,7 @@ static void do_catch_up(struct spk_synth *synth) - jiffy_delta_val = jiffy_delta->u.n.value; - delay_time_val = delay_time->u.n.value; - spin_unlock_irqrestore(&speakup_info.spinlock, flags); -- schedule_timeout(msecs_to_jiffies(delay_time_val)); -+ schedule_msec_hrtimeout(delay_time_val); - jiff_max = jiffies + jiffy_delta_val; - } - } -diff --git a/drivers/accessibility/speakup/synth.c b/drivers/accessibility/speakup/synth.c -index ac47dbac7207..09f6ba829dfd 100644 ---- a/drivers/accessibility/speakup/synth.c -+++ b/drivers/accessibility/speakup/synth.c -@@ -93,12 +93,8 @@ static void _spk_do_catch_up(struct spk_synth *synth, int unicode) - spin_unlock_irqrestore(&speakup_info.spinlock, flags); - if (ch == '\n') - ch = synth->procspeech; -- if (unicode) -- ret = synth->io_ops->synth_out_unicode(synth, ch); -- else -- ret = synth->io_ops->synth_out(synth, ch); -- if (!ret) { -- schedule_timeout(msecs_to_jiffies(full_time_val)); -+ if (!synth->io_ops->synth_out(synth, ch)) { -+ schedule_msec_hrtimeout(full_time_val); - continue; - } - if (time_after_eq(jiffies, jiff_max) && (ch == SPACE)) { -@@ -108,11 +104,9 @@ static void _spk_do_catch_up(struct spk_synth *synth, int unicode) - full_time_val = full_time->u.n.value; - spin_unlock_irqrestore(&speakup_info.spinlock, flags); - if (synth->io_ops->synth_out(synth, synth->procspeech)) -- schedule_timeout( -- msecs_to_jiffies(delay_time_val)); -+ schedule_msec_hrtimeout(delay_time_val); - else -- schedule_timeout( -- msecs_to_jiffies(full_time_val)); -+ schedule_msec_hrtimeout(full_time_val); - jiff_max = jiffies + 
jiffy_delta_val; - } - set_current_state(TASK_RUNNING); -diff --git a/drivers/block/swim.c b/drivers/block/swim.c -index dd34504382e5..0caa1c7e9223 100644 ---- a/drivers/block/swim.c -+++ b/drivers/block/swim.c -@@ -328,7 +328,7 @@ static inline void swim_motor(struct swim __iomem *base, - if (swim_readbit(base, MOTOR_ON)) - break; - set_current_state(TASK_INTERRUPTIBLE); -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - } - } else if (action == OFF) { - swim_action(base, MOTOR_OFF); -@@ -347,7 +347,7 @@ static inline void swim_eject(struct swim __iomem *base) - if (!swim_readbit(base, DISK_IN)) - break; - set_current_state(TASK_INTERRUPTIBLE); -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - } - swim_select(base, RELAX); - } -@@ -372,6 +372,7 @@ static inline int swim_step(struct swim __iomem *base) - - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(1); -+ schedule_min_hrtimeout(); - - swim_select(base, RELAX); - if (!swim_readbit(base, STEP)) -diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c -index 737c0b6b24ea..a3db1f42bb3b 100644 ---- a/drivers/char/ipmi/ipmi_msghandler.c -+++ b/drivers/char/ipmi/ipmi_msghandler.c -@@ -3542,7 +3542,7 @@ static void cleanup_smi_msgs(struct ipmi_smi *intf) - /* Current message first, to preserve order */ - while (intf->curr_msg && !list_empty(&intf->waiting_rcv_msgs)) { - /* Wait for the message to clear out. */ -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - } - - /* No need for locks, the interface is down. */ -diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c -index 0416b9c9d410..9ce5fae0f1cf 100644 ---- a/drivers/char/ipmi/ipmi_ssif.c -+++ b/drivers/char/ipmi/ipmi_ssif.c -@@ -1288,7 +1288,7 @@ static void shutdown_ssif(void *send_info) - - /* make sure the driver is not looking for flags any more. 
*/ - while (ssif_info->ssif_state != SSIF_NORMAL) -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - - ssif_info->stopping = true; - del_timer_sync(&ssif_info->watch_timer); -diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c -index a95156fc5db7..8f07c8900184 100644 ---- a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c -+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c -@@ -235,7 +235,7 @@ static int vmw_fifo_wait_noirq(struct vmw_private *dev_priv, - DRM_ERROR("SVGA device lockup.\n"); - break; - } -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - if (interruptible && signal_pending(current)) { - ret = -ERESTARTSYS; - break; -diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c b/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c -index 75f3efee21a4..09b1932ce85b 100644 ---- a/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c -+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c -@@ -203,7 +203,7 @@ int vmw_fallback_wait(struct vmw_private *dev_priv, - break; - } - if (lazy) -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - else if ((++count & 0x0F) == 0) { - /** - * FIXME: Use schedule_hr_timeout here for -diff --git a/drivers/hwmon/fam15h_power.c b/drivers/hwmon/fam15h_power.c -index 29f5fed28c2a..974cb08c7aa7 100644 ---- a/drivers/hwmon/fam15h_power.c -+++ b/drivers/hwmon/fam15h_power.c -@@ -221,7 +221,7 @@ static ssize_t power1_average_show(struct device *dev, - prev_ptsc[cu] = data->cpu_sw_pwr_ptsc[cu]; - } - -- leftover = schedule_timeout_interruptible(msecs_to_jiffies(data->power_period)); -+ leftover = schedule_msec_hrtimeout_interruptible((data->power_period)); - if (leftover) - return 0; - -diff --git a/drivers/iio/light/tsl2563.c b/drivers/iio/light/tsl2563.c -index abc8d7db8dc1..baa9d6338a52 100644 ---- a/drivers/iio/light/tsl2563.c -+++ b/drivers/iio/light/tsl2563.c -@@ -269,11 +269,7 @@ static void tsl2563_wait_adc(struct tsl2563_chip *chip) - default: - delay = 402; - } -- /* -- * TODO: Make sure that we wait at least required delay but why we -- * have to 
extend it one tick more? -- */ -- schedule_timeout_interruptible(msecs_to_jiffies(delay) + 2); -+ schedule_msec_hrtimeout_interruptible(delay + 1); - } - - static int tsl2563_adjust_gainlevel(struct tsl2563_chip *chip, u16 adc) -diff --git a/drivers/media/i2c/msp3400-driver.c b/drivers/media/i2c/msp3400-driver.c -index 39530d43590e..a7caf2eb5771 100644 ---- a/drivers/media/i2c/msp3400-driver.c -+++ b/drivers/media/i2c/msp3400-driver.c -@@ -170,7 +170,7 @@ static int msp_read(struct i2c_client *client, int dev, int addr) - break; - dev_warn(&client->dev, "I/O error #%d (read 0x%02x/0x%02x)\n", err, - dev, addr); -- schedule_timeout_interruptible(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout_interruptible((10)); - } - if (err == 3) { - dev_warn(&client->dev, "resetting chip, sound will go off.\n"); -@@ -211,7 +211,7 @@ static int msp_write(struct i2c_client *client, int dev, int addr, int val) - break; - dev_warn(&client->dev, "I/O error #%d (write 0x%02x/0x%02x)\n", err, - dev, addr); -- schedule_timeout_interruptible(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout_interruptible((10)); - } - if (err == 3) { - dev_warn(&client->dev, "resetting chip, sound will go off.\n"); -diff --git a/drivers/media/pci/cx18/cx18-gpio.c b/drivers/media/pci/cx18/cx18-gpio.c -index cf7cfda94107..f63e17489547 100644 ---- a/drivers/media/pci/cx18/cx18-gpio.c -+++ b/drivers/media/pci/cx18/cx18-gpio.c -@@ -81,11 +81,11 @@ static void gpio_reset_seq(struct cx18 *cx, u32 active_lo, u32 active_hi, - - /* Assert */ - gpio_update(cx, mask, ~active_lo); -- schedule_timeout_uninterruptible(msecs_to_jiffies(assert_msecs)); -+ schedule_msec_hrtimeout_uninterruptible((assert_msecs)); - - /* Deassert */ - gpio_update(cx, mask, ~active_hi); -- schedule_timeout_uninterruptible(msecs_to_jiffies(recovery_msecs)); -+ schedule_msec_hrtimeout_uninterruptible((recovery_msecs)); - } - - /* -diff --git a/drivers/media/pci/ivtv/ivtv-gpio.c b/drivers/media/pci/ivtv/ivtv-gpio.c -index 
856e7ab7f33e..766a26251337 100644 ---- a/drivers/media/pci/ivtv/ivtv-gpio.c -+++ b/drivers/media/pci/ivtv/ivtv-gpio.c -@@ -105,7 +105,7 @@ void ivtv_reset_ir_gpio(struct ivtv *itv) - curout = (curout & ~0xF) | 1; - write_reg(curout, IVTV_REG_GPIO_OUT); - /* We could use something else for smaller time */ -- schedule_timeout_interruptible(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout_interruptible((1)); - curout |= 2; - write_reg(curout, IVTV_REG_GPIO_OUT); - curdir &= ~0x80; -@@ -125,11 +125,11 @@ int ivtv_reset_tuner_gpio(void *dev, int component, int cmd, int value) - curout = read_reg(IVTV_REG_GPIO_OUT); - curout &= ~(1 << itv->card->xceive_pin); - write_reg(curout, IVTV_REG_GPIO_OUT); -- schedule_timeout_interruptible(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout_interruptible((1)); - - curout |= 1 << itv->card->xceive_pin; - write_reg(curout, IVTV_REG_GPIO_OUT); -- schedule_timeout_interruptible(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout_interruptible((1)); - return 0; - } - -diff --git a/drivers/media/pci/ivtv/ivtv-ioctl.c b/drivers/media/pci/ivtv/ivtv-ioctl.c -index 35dccb31174c..8181cd65e876 100644 ---- a/drivers/media/pci/ivtv/ivtv-ioctl.c -+++ b/drivers/media/pci/ivtv/ivtv-ioctl.c -@@ -1139,7 +1139,7 @@ void ivtv_s_std_dec(struct ivtv *itv, v4l2_std_id std) - TASK_UNINTERRUPTIBLE); - if ((read_reg(IVTV_REG_DEC_LINE_FIELD) >> 16) < 100) - break; -- schedule_timeout(msecs_to_jiffies(25)); -+ schedule_msec_hrtimeout((25)); - } - finish_wait(&itv->vsync_waitq, &wait); - mutex_lock(&itv->serialize_lock); -diff --git a/drivers/media/pci/ivtv/ivtv-streams.c b/drivers/media/pci/ivtv/ivtv-streams.c -index f04ee84bab5f..c4469b4b8f99 100644 ---- a/drivers/media/pci/ivtv/ivtv-streams.c -+++ b/drivers/media/pci/ivtv/ivtv-streams.c -@@ -849,7 +849,7 @@ int ivtv_stop_v4l2_encode_stream(struct ivtv_stream *s, int gop_end) - while (!test_bit(IVTV_F_I_EOS, &itv->i_flags) && - time_before(jiffies, - then + msecs_to_jiffies(2000))) { -- 
schedule_timeout(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout((10)); - } - - /* To convert jiffies to ms, we must multiply by 1000 -diff --git a/drivers/media/radio/radio-mr800.c b/drivers/media/radio/radio-mr800.c -index cb0437b4c331..163fffc0e1d4 100644 ---- a/drivers/media/radio/radio-mr800.c -+++ b/drivers/media/radio/radio-mr800.c -@@ -366,7 +366,7 @@ static int vidioc_s_hw_freq_seek(struct file *file, void *priv, - retval = -ENODATA; - break; - } -- if (schedule_timeout_interruptible(msecs_to_jiffies(10))) { -+ if (schedule_msec_hrtimeout_interruptible((10))) { - retval = -ERESTARTSYS; - break; - } -diff --git a/drivers/media/radio/radio-tea5777.c b/drivers/media/radio/radio-tea5777.c -index fb9de7bbcd19..e53cf45e7f3f 100644 ---- a/drivers/media/radio/radio-tea5777.c -+++ b/drivers/media/radio/radio-tea5777.c -@@ -235,7 +235,7 @@ static int radio_tea5777_update_read_reg(struct radio_tea5777 *tea, int wait) - } - - if (wait) { -- if (schedule_timeout_interruptible(msecs_to_jiffies(wait))) -+ if (schedule_msec_hrtimeout_interruptible((wait))) - return -ERESTARTSYS; - } - -diff --git a/drivers/media/radio/tea575x.c b/drivers/media/radio/tea575x.c -index c37315226c42..e73e6393403c 100644 ---- a/drivers/media/radio/tea575x.c -+++ b/drivers/media/radio/tea575x.c -@@ -401,7 +401,7 @@ int snd_tea575x_s_hw_freq_seek(struct file *file, struct snd_tea575x *tea, - for (;;) { - if (time_after(jiffies, timeout)) - break; -- if (schedule_timeout_interruptible(msecs_to_jiffies(10))) { -+ if (schedule_msec_hrtimeout_interruptible((10))) { - /* some signal arrived, stop search */ - tea->val &= ~TEA575X_BIT_SEARCH; - snd_tea575x_set_freq(tea); -diff --git a/drivers/mfd/ucb1x00-core.c b/drivers/mfd/ucb1x00-core.c -index b690796d24d4..448b13da62b4 100644 ---- a/drivers/mfd/ucb1x00-core.c -+++ b/drivers/mfd/ucb1x00-core.c -@@ -250,7 +250,7 @@ unsigned int ucb1x00_adc_read(struct ucb1x00 *ucb, int adc_channel, int sync) - break; - /* yield to other processes */ - 
set_current_state(TASK_INTERRUPTIBLE); -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - } - - return UCB_ADC_DAT(val); -diff --git a/drivers/misc/sgi-xp/xpc_channel.c b/drivers/misc/sgi-xp/xpc_channel.c -index 8e6607fc8a67..b9ab770bbdb5 100644 ---- a/drivers/misc/sgi-xp/xpc_channel.c -+++ b/drivers/misc/sgi-xp/xpc_channel.c -@@ -834,7 +834,7 @@ xpc_allocate_msg_wait(struct xpc_channel *ch) - - atomic_inc(&ch->n_on_msg_allocate_wq); - prepare_to_wait(&ch->msg_allocate_wq, &wait, TASK_INTERRUPTIBLE); -- ret = schedule_timeout(1); -+ ret = schedule_min_hrtimeout(); - finish_wait(&ch->msg_allocate_wq, &wait); - atomic_dec(&ch->n_on_msg_allocate_wq); - -diff --git a/drivers/net/caif/caif_hsi.c b/drivers/net/caif/caif_hsi.c -index 4a33ec4fc089..da85f847ebb4 100644 ---- a/drivers/net/caif/caif_hsi.c -+++ b/drivers/net/caif/caif_hsi.c -@@ -939,7 +939,7 @@ static void cfhsi_wake_down(struct work_struct *work) - break; - - set_current_state(TASK_INTERRUPTIBLE); -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - retry--; - } - -diff --git a/drivers/net/can/usb/peak_usb/pcan_usb.c b/drivers/net/can/usb/peak_usb/pcan_usb.c -index 66d0198e7834..ce1c7bf9be87 100644 ---- a/drivers/net/can/usb/peak_usb/pcan_usb.c -+++ b/drivers/net/can/usb/peak_usb/pcan_usb.c -@@ -242,7 +242,7 @@ static int pcan_usb_write_mode(struct peak_usb_device *dev, u8 onoff) - } else { - /* the PCAN-USB needs time to init */ - set_current_state(TASK_INTERRUPTIBLE); -- schedule_timeout(msecs_to_jiffies(PCAN_USB_STARTUP_TIMEOUT)); -+ schedule_msec_hrtimeout((PCAN_USB_STARTUP_TIMEOUT)); - } - - return err; -diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c -index 65b315bc60ab..2b3f71086f5f 100644 ---- a/drivers/net/usb/lan78xx.c -+++ b/drivers/net/usb/lan78xx.c -@@ -2666,7 +2666,7 @@ static void lan78xx_terminate_urbs(struct lan78xx_net *dev) - while (!skb_queue_empty(&dev->rxq) && - !skb_queue_empty(&dev->txq) && - !skb_queue_empty(&dev->done)) { -- 
schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS)); -+ schedule_msec_hrtimeout((UNLINK_TIMEOUT_MS)); - set_current_state(TASK_UNINTERRUPTIBLE); - netif_dbg(dev, ifdown, dev->net, - "waited for %d urb completions\n", temp); -diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c -index 2b2a841cd938..1a4d27179db1 100644 ---- a/drivers/net/usb/usbnet.c -+++ b/drivers/net/usb/usbnet.c -@@ -767,7 +767,7 @@ static void wait_skb_queue_empty(struct sk_buff_head *q) - spin_lock_irqsave(&q->lock, flags); - while (!skb_queue_empty(q)) { - spin_unlock_irqrestore(&q->lock, flags); -- schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS)); -+ schedule_msec_hrtimeout((UNLINK_TIMEOUT_MS)); - set_current_state(TASK_UNINTERRUPTIBLE); - spin_lock_irqsave(&q->lock, flags); - } -diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2100.c b/drivers/net/wireless/intel/ipw2x00/ipw2100.c -index 461e955aa259..5ab8e7396ea4 100644 ---- a/drivers/net/wireless/intel/ipw2x00/ipw2100.c -+++ b/drivers/net/wireless/intel/ipw2x00/ipw2100.c -@@ -816,7 +816,7 @@ static int ipw2100_hw_send_command(struct ipw2100_priv *priv, - * doesn't seem to have as many firmware restart cycles... - * - * As a test, we're sticking in a 1/100s delay here */ -- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout_uninterruptible((10)); - - return 0; - -@@ -1267,7 +1267,7 @@ static int ipw2100_start_adapter(struct ipw2100_priv *priv) - IPW_DEBUG_FW("Waiting for f/w initialization to complete...\n"); - i = 5000; - do { -- schedule_timeout_uninterruptible(msecs_to_jiffies(40)); -+ schedule_msec_hrtimeout_uninterruptible((40)); - /* Todo... wait for sync command ... 
*/ - - read_register(priv->net_dev, IPW_REG_INTA, &inta); -diff --git a/drivers/parport/ieee1284.c b/drivers/parport/ieee1284.c -index 4547ac44c8d4..8fa1a7fdf12c 100644 ---- a/drivers/parport/ieee1284.c -+++ b/drivers/parport/ieee1284.c -@@ -202,7 +202,7 @@ int parport_wait_peripheral(struct parport *port, - /* parport_wait_event didn't time out, but the - * peripheral wasn't actually ready either. - * Wait for another 10ms. */ -- schedule_timeout_interruptible(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout_interruptible((10)); - } - } - -diff --git a/drivers/parport/ieee1284_ops.c b/drivers/parport/ieee1284_ops.c -index 2c11bd3fe1fd..8cb6b61c0880 100644 ---- a/drivers/parport/ieee1284_ops.c -+++ b/drivers/parport/ieee1284_ops.c -@@ -520,7 +520,7 @@ size_t parport_ieee1284_ecp_read_data (struct parport *port, - /* Yield the port for a while. */ - if (count && dev->port->irq != PARPORT_IRQ_NONE) { - parport_release (dev); -- schedule_timeout_interruptible(msecs_to_jiffies(40)); -+ schedule_msec_hrtimeout_interruptible((40)); - parport_claim_or_block (dev); - } - else -diff --git a/drivers/platform/x86/intel_ips.c b/drivers/platform/x86/intel_ips.c -index bffe548187ee..c2918ee3e100 100644 ---- a/drivers/platform/x86/intel_ips.c -+++ b/drivers/platform/x86/intel_ips.c -@@ -798,7 +798,7 @@ static int ips_adjust(void *data) - ips_gpu_lower(ips); - - sleep: -- schedule_timeout_interruptible(msecs_to_jiffies(IPS_ADJUST_PERIOD)); -+ schedule_msec_hrtimeout_interruptible((IPS_ADJUST_PERIOD)); - } while (!kthread_should_stop()); - - dev_dbg(ips->dev, "ips-adjust thread stopped\n"); -@@ -974,7 +974,7 @@ static int ips_monitor(void *data) - seqno_timestamp = get_jiffies_64(); - - old_cpu_power = thm_readl(THM_CEC); -- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); -+ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); - - /* Collect an initial average */ - for (i = 0; i < IPS_SAMPLE_COUNT; i++) { -@@ -1001,7 +1001,7 @@ static int 
ips_monitor(void *data) - mchp_samples[i] = mchp; - } - -- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); -+ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); - if (kthread_should_stop()) - break; - } -@@ -1028,7 +1028,7 @@ static int ips_monitor(void *data) - * us to reduce the sample frequency if the CPU and GPU are idle. - */ - old_cpu_power = thm_readl(THM_CEC); -- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); -+ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); - last_sample_period = IPS_SAMPLE_PERIOD; - - timer_setup(&ips->timer, monitor_timeout, TIMER_DEFERRABLE); -diff --git a/drivers/rtc/rtc-wm8350.c b/drivers/rtc/rtc-wm8350.c -index 2018614f258f..fc19b312c345 100644 ---- a/drivers/rtc/rtc-wm8350.c -+++ b/drivers/rtc/rtc-wm8350.c -@@ -114,7 +114,7 @@ static int wm8350_rtc_settime(struct device *dev, struct rtc_time *tm) - /* Wait until confirmation of stopping */ - do { - rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); -- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout_uninterruptible((1)); - } while (--retries && !(rtc_ctrl & WM8350_RTC_STS)); - - if (!retries) { -@@ -197,7 +197,7 @@ static int wm8350_rtc_stop_alarm(struct wm8350 *wm8350) - /* Wait until confirmation of stopping */ - do { - rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); -- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout_uninterruptible((1)); - } while (retries-- && !(rtc_ctrl & WM8350_RTC_ALMSTS)); - - if (!(rtc_ctrl & WM8350_RTC_ALMSTS)) -@@ -220,7 +220,7 @@ static int wm8350_rtc_start_alarm(struct wm8350 *wm8350) - /* Wait until confirmation */ - do { - rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); -- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout_uninterruptible((1)); - } while (retries-- && rtc_ctrl & WM8350_RTC_ALMSTS); - - if (rtc_ctrl & WM8350_RTC_ALMSTS) -diff --git 
a/drivers/scsi/fnic/fnic_scsi.c b/drivers/scsi/fnic/fnic_scsi.c -index 03b1805b106c..41ee54ff304a 100644 ---- a/drivers/scsi/fnic/fnic_scsi.c -+++ b/drivers/scsi/fnic/fnic_scsi.c -@@ -217,7 +217,7 @@ int fnic_fw_reset_handler(struct fnic *fnic) - - /* wait for io cmpl */ - while (atomic_read(&fnic->in_flight)) -- schedule_timeout(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout((1)); - - spin_lock_irqsave(&fnic->wq_copy_lock[0], flags); - -@@ -2278,7 +2278,7 @@ static int fnic_clean_pending_aborts(struct fnic *fnic, - } - } - -- schedule_timeout(msecs_to_jiffies(2 * fnic->config.ed_tov)); -+ schedule_msec_hrtimeout((2 * fnic->config.ed_tov)); - - /* walk again to check, if IOs are still pending in fw */ - if (fnic_is_abts_pending(fnic, lr_sc)) -diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c -index 983eeb0e3d07..007966930f94 100644 ---- a/drivers/scsi/lpfc/lpfc_scsi.c -+++ b/drivers/scsi/lpfc/lpfc_scsi.c -@@ -5194,7 +5194,7 @@ lpfc_reset_flush_io_context(struct lpfc_vport *vport, uint16_t tgt_id, - tgt_id, lun_id, context); - later = msecs_to_jiffies(2 * vport->cfg_devloss_tmo * 1000) + jiffies; - while (time_after(later, jiffies) && cnt) { -- schedule_timeout_uninterruptible(msecs_to_jiffies(20)); -+ schedule_msec_hrtimeout_uninterruptible((20)); - cnt = lpfc_sli_sum_iocb(vport, tgt_id, lun_id, context); - } - if (cnt) { -diff --git a/drivers/scsi/snic/snic_scsi.c b/drivers/scsi/snic/snic_scsi.c -index b3650c989ed4..7ed1fb285754 100644 ---- a/drivers/scsi/snic/snic_scsi.c -+++ b/drivers/scsi/snic/snic_scsi.c -@@ -2353,7 +2353,7 @@ snic_reset(struct Scsi_Host *shost, struct scsi_cmnd *sc) - - /* Wait for all the IOs that are entered in Qcmd */ - while (atomic_read(&snic->ios_inflight)) -- schedule_timeout(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout((1)); - - ret = snic_issue_hba_reset(snic, sc); - if (ret) { -diff --git a/drivers/staging/comedi/drivers/ni_mio_common.c b/drivers/staging/comedi/drivers/ni_mio_common.c -index 
9266e13f6271..df5c53216d78 100644 ---- a/drivers/staging/comedi/drivers/ni_mio_common.c -+++ b/drivers/staging/comedi/drivers/ni_mio_common.c -@@ -4748,7 +4748,7 @@ static int cs5529_wait_for_idle(struct comedi_device *dev) - if ((status & NI67XX_CAL_STATUS_BUSY) == 0) - break; - set_current_state(TASK_INTERRUPTIBLE); -- if (schedule_timeout(1)) -+ if (schedule_min_hrtimeout()) - return -EIO; - } - if (i == timeout) { -diff --git a/drivers/staging/rts5208/rtsx.c b/drivers/staging/rts5208/rtsx.c -index 898add4d1fc8..0aa9dd467349 100644 ---- a/drivers/staging/rts5208/rtsx.c -+++ b/drivers/staging/rts5208/rtsx.c -@@ -477,7 +477,7 @@ static int rtsx_polling_thread(void *__dev) - - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); -- schedule_timeout(msecs_to_jiffies(POLLING_INTERVAL)); -+ schedule_msec_hrtimeout((POLLING_INTERVAL)); - - /* lock the device pointers */ - mutex_lock(&dev->dev_mutex); -diff --git a/drivers/staging/unisys/visornic/visornic_main.c b/drivers/staging/unisys/visornic/visornic_main.c -index 0433536930a9..d8726f28843f 100644 ---- a/drivers/staging/unisys/visornic/visornic_main.c -+++ b/drivers/staging/unisys/visornic/visornic_main.c -@@ -549,7 +549,7 @@ static int visornic_disable_with_timeout(struct net_device *netdev, - } - set_current_state(TASK_INTERRUPTIBLE); - spin_unlock_irqrestore(&devdata->priv_lock, flags); -- wait += schedule_timeout(msecs_to_jiffies(10)); -+ wait += schedule_msec_hrtimeout((10)); - spin_lock_irqsave(&devdata->priv_lock, flags); - } - -@@ -560,7 +560,7 @@ static int visornic_disable_with_timeout(struct net_device *netdev, - while (1) { - set_current_state(TASK_INTERRUPTIBLE); - spin_unlock_irqrestore(&devdata->priv_lock, flags); -- schedule_timeout(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout((10)); - spin_lock_irqsave(&devdata->priv_lock, flags); - if (atomic_read(&devdata->usage)) - break; -@@ -714,7 +714,7 @@ static int visornic_enable_with_timeout(struct net_device *netdev, - } - 
set_current_state(TASK_INTERRUPTIBLE); - spin_unlock_irqrestore(&devdata->priv_lock, flags); -- wait += schedule_timeout(msecs_to_jiffies(10)); -+ wait += schedule_msec_hrtimeout((10)); - spin_lock_irqsave(&devdata->priv_lock, flags); - } - -diff --git a/drivers/video/fbdev/omap/hwa742.c b/drivers/video/fbdev/omap/hwa742.c -index cfe63932f825..71c00ef772a3 100644 ---- a/drivers/video/fbdev/omap/hwa742.c -+++ b/drivers/video/fbdev/omap/hwa742.c -@@ -913,7 +913,7 @@ static void hwa742_resume(void) - if (hwa742_read_reg(HWA742_PLL_DIV_REG) & (1 << 7)) - break; - set_current_state(TASK_UNINTERRUPTIBLE); -- schedule_timeout(msecs_to_jiffies(5)); -+ schedule_msec_hrtimeout((5)); - } - hwa742_set_update_mode(hwa742.update_mode_before_suspend); - } -diff --git a/drivers/video/fbdev/pxafb.c b/drivers/video/fbdev/pxafb.c -index f1551e00eb12..f0f651e92504 100644 ---- a/drivers/video/fbdev/pxafb.c -+++ b/drivers/video/fbdev/pxafb.c -@@ -1287,7 +1287,7 @@ static int pxafb_smart_thread(void *arg) - mutex_unlock(&fbi->ctrlr_lock); - - set_current_state(TASK_INTERRUPTIBLE); -- schedule_timeout(msecs_to_jiffies(30)); -+ schedule_msec_hrtimeout((30)); - } - - pr_debug("%s(): task ending\n", __func__); -diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c -index 76d2e43817ea..6ba0604e2162 100644 ---- a/fs/btrfs/inode-map.c -+++ b/fs/btrfs/inode-map.c -@@ -91,7 +91,7 @@ static int caching_kthread(void *data) - btrfs_release_path(path); - root->ino_cache_progress = last; - up_read(&fs_info->commit_root_sem); -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - goto again; - } else - continue; -diff --git a/fs/proc/base.c b/fs/proc/base.c -index 617db4e0faa0..f85926764f9a 100644 ---- a/fs/proc/base.c -+++ b/fs/proc/base.c -@@ -479,7 +479,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, - seq_puts(m, "0 0 0\n"); - else - seq_printf(m, "%llu %llu %lu\n", -- (unsigned long long)task->se.sum_exec_runtime, -+ (unsigned long long)tsk_seruntime(task), - 
(unsigned long long)task->sched_info.run_delay, - task->sched_info.pcount); - -diff --git a/include/linux/freezer.h b/include/linux/freezer.h -index 27828145ca09..504cc97bf475 100644 ---- a/include/linux/freezer.h -+++ b/include/linux/freezer.h -@@ -311,6 +311,7 @@ static inline void set_freezable(void) {} - #define wait_event_freezekillable_unsafe(wq, condition) \ - wait_event_killable(wq, condition) - -+#define pm_freezing (false) - #endif /* !CONFIG_FREEZER */ - - #endif /* FREEZER_H_INCLUDED */ -diff --git a/include/linux/init_task.h b/include/linux/init_task.h -index 2c620d7ac432..73417df5daa2 100644 ---- a/include/linux/init_task.h -+++ b/include/linux/init_task.h -@@ -36,7 +36,11 @@ extern struct cred init_cred; - #define INIT_PREV_CPUTIME(x) - #endif - -+#ifdef CONFIG_SCHED_MUQSS -+#define INIT_TASK_COMM "MuQSS" -+#else - #define INIT_TASK_COMM "swapper" -+#endif - - /* Attach to the init_task data structure for proper alignment */ - #ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK -diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h -index e9bfe6972aed..16ba1c7e5bde 100644 ---- a/include/linux/ioprio.h -+++ b/include/linux/ioprio.h -@@ -53,6 +53,8 @@ enum { - */ - static inline int task_nice_ioprio(struct task_struct *task) - { -+ if (iso_task(task)) -+ return 0; - return (task_nice(task) + 20) / 5; - } - -diff --git a/include/linux/sched.h b/include/linux/sched.h -index afe01e232935..139e4535fcc6 100644 ---- a/include/linux/sched.h -+++ b/include/linux/sched.h -@@ -35,6 +35,10 @@ - #include - #include - -+#ifdef CONFIG_SCHED_MUQSS -+#include -+#endif -+ - /* task_struct member predeclarations (sorted alphabetically): */ - struct audit_context; - struct backing_dev_info; -@@ -213,13 +217,40 @@ struct task_group; - - extern void scheduler_tick(void); - --#define MAX_SCHEDULE_TIMEOUT LONG_MAX -- -+#define MAX_SCHEDULE_TIMEOUT LONG_MAX - extern long schedule_timeout(long timeout); - extern long schedule_timeout_interruptible(long timeout); - extern long 
schedule_timeout_killable(long timeout); - extern long schedule_timeout_uninterruptible(long timeout); - extern long schedule_timeout_idle(long timeout); -+ -+#ifdef CONFIG_HIGH_RES_TIMERS -+extern long schedule_msec_hrtimeout(long timeout); -+extern long schedule_min_hrtimeout(void); -+extern long schedule_msec_hrtimeout_interruptible(long timeout); -+extern long schedule_msec_hrtimeout_uninterruptible(long timeout); -+#else -+static inline long schedule_msec_hrtimeout(long timeout) -+{ -+ return schedule_timeout(msecs_to_jiffies(timeout)); -+} -+ -+static inline long schedule_min_hrtimeout(void) -+{ -+ return schedule_timeout(1); -+} -+ -+static inline long schedule_msec_hrtimeout_interruptible(long timeout) -+{ -+ return schedule_timeout_interruptible(msecs_to_jiffies(timeout)); -+} -+ -+static inline long schedule_msec_hrtimeout_uninterruptible(long timeout) -+{ -+ return schedule_timeout_uninterruptible(msecs_to_jiffies(timeout)); -+} -+#endif -+ - asmlinkage void schedule(void); - extern void schedule_preempt_disabled(void); - asmlinkage void preempt_schedule_irq(void); -@@ -651,8 +682,10 @@ struct task_struct { - unsigned int flags; - unsigned int ptrace; - --#ifdef CONFIG_SMP -+#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_MUQSS) - int on_cpu; -+#endif -+#ifdef CONFIG_SMP - struct __call_single_node wake_entry; - #ifdef CONFIG_THREAD_INFO_IN_TASK - /* Current CPU: */ -@@ -678,10 +711,25 @@ struct task_struct { - int static_prio; - int normal_prio; - unsigned int rt_priority; -+#ifdef CONFIG_SCHED_MUQSS -+ int time_slice; -+ u64 deadline; -+ skiplist_node node; /* Skip list node */ -+ u64 last_ran; -+ u64 sched_time; /* sched_clock time spent running */ -+#ifdef CONFIG_SMT_NICE -+ int smt_bias; /* Policy/nice level bias across smt siblings */ -+#endif -+#ifdef CONFIG_HOTPLUG_CPU -+ bool zerobound; /* Bound to CPU0 for hotplug */ -+#endif -+ unsigned long rt_timeout; -+#else /* CONFIG_SCHED_MUQSS */ - - const struct sched_class *sched_class; - struct 
sched_entity se; - struct sched_rt_entity rt; -+#endif - #ifdef CONFIG_CGROUP_SCHED - struct task_group *sched_task_group; - #endif -@@ -863,6 +911,10 @@ struct task_struct { - #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME - u64 utimescaled; - u64 stimescaled; -+#endif -+#ifdef CONFIG_SCHED_MUQSS -+ /* Unbanked cpu time */ -+ unsigned long utime_ns, stime_ns; - #endif - u64 gtime; - struct prev_cputime prev_cputime; -@@ -1332,6 +1384,40 @@ struct task_struct { - */ - }; - -+#ifdef CONFIG_SCHED_MUQSS -+#define tsk_seruntime(t) ((t)->sched_time) -+#define tsk_rttimeout(t) ((t)->rt_timeout) -+ -+static inline void tsk_cpus_current(struct task_struct *p) -+{ -+} -+ -+void print_scheduler_version(void); -+ -+static inline bool iso_task(struct task_struct *p) -+{ -+ return (p->policy == SCHED_ISO); -+} -+#else /* CFS */ -+#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) -+#define tsk_rttimeout(t) ((t)->rt.timeout) -+ -+static inline void tsk_cpus_current(struct task_struct *p) -+{ -+ p->nr_cpus_allowed = current->nr_cpus_allowed; -+} -+ -+static inline void print_scheduler_version(void) -+{ -+ printk(KERN_INFO "CFS CPU scheduler.\n"); -+} -+ -+static inline bool iso_task(struct task_struct *p) -+{ -+ return false; -+} -+#endif /* CONFIG_SCHED_MUQSS */ -+ - static inline struct pid *task_pid(struct task_struct *task) - { - return task->thread_pid; -diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h -index 1aff00b65f3c..73d6319a856a 100644 ---- a/include/linux/sched/deadline.h -+++ b/include/linux/sched/deadline.h -@@ -28,7 +28,16 @@ static inline bool dl_time_before(u64 a, u64 b) - #ifdef CONFIG_SMP - - struct root_domain; -+#ifdef CONFIG_SCHED_MUQSS -+static inline void dl_clear_root_domain(struct root_domain *rd) -+{ -+} -+static inline void dl_add_task_root_domain(struct task_struct *p) -+{ -+} -+#else /* CONFIG_SCHED_MUQSS */ - extern void dl_add_task_root_domain(struct task_struct *p); - extern void dl_clear_root_domain(struct root_domain *rd); 
-+#endif /* CONFIG_SCHED_MUQSS */ - - #endif /* CONFIG_SMP */ -diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h -index 6d67e9a5af6b..101fe470aa8f 100644 ---- a/include/linux/sched/nohz.h -+++ b/include/linux/sched/nohz.h -@@ -13,7 +13,7 @@ extern int get_nohz_timer_target(void); - static inline void nohz_balance_enter_idle(int cpu) { } - #endif - --#ifdef CONFIG_NO_HZ_COMMON -+#if defined(CONFIG_NO_HZ_COMMON) && !defined(CONFIG_SCHED_MUQSS) - void calc_load_nohz_start(void); - void calc_load_nohz_remote(struct rq *rq); - void calc_load_nohz_stop(void); -diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h -index 7d64feafc408..43c9d9e50c09 100644 ---- a/include/linux/sched/prio.h -+++ b/include/linux/sched/prio.h -@@ -20,8 +20,20 @@ - */ - - #define MAX_USER_RT_PRIO 100 -+ -+#ifdef CONFIG_SCHED_MUQSS -+/* Note different MAX_RT_PRIO */ -+#define MAX_RT_PRIO (MAX_USER_RT_PRIO + 1) -+ -+#define ISO_PRIO (MAX_RT_PRIO) -+#define NORMAL_PRIO (MAX_RT_PRIO + 1) -+#define IDLE_PRIO (MAX_RT_PRIO + 2) -+#define PRIO_LIMIT ((IDLE_PRIO) + 1) -+#else /* CONFIG_SCHED_MUQSS */ - #define MAX_RT_PRIO MAX_USER_RT_PRIO - -+#endif /* CONFIG_SCHED_MUQSS */ -+ - #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) - #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) - -diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h -index e5af028c08b4..010b2244e0b6 100644 ---- a/include/linux/sched/rt.h -+++ b/include/linux/sched/rt.h -@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) - - if (policy == SCHED_FIFO || policy == SCHED_RR) - return true; -+#ifndef CONFIG_SCHED_MUQSS - if (policy == SCHED_DEADLINE) - return true; -+#endif - return false; - } - -diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h -index a98965007eef..743f67fd012e 100644 ---- a/include/linux/sched/task.h -+++ b/include/linux/sched/task.h -@@ -93,7 +93,7 @@ int kernel_wait(pid_t pid, int *stat); - extern void free_task(struct task_struct 
*tsk); - - /* sched_exec is called by processes performing an exec */ --#ifdef CONFIG_SMP -+#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_MUQSS) - extern void sched_exec(void); - #else - #define sched_exec() {} -diff --git a/include/linux/skip_list.h b/include/linux/skip_list.h -new file mode 100644 -index 000000000000..d4be84ba273b ---- /dev/null -+++ b/include/linux/skip_list.h -@@ -0,0 +1,33 @@ -+#ifndef _LINUX_SKIP_LISTS_H -+#define _LINUX_SKIP_LISTS_H -+typedef u64 keyType; -+typedef void *valueType; -+ -+typedef struct nodeStructure skiplist_node; -+ -+struct nodeStructure { -+ int level; /* Levels in this structure */ -+ keyType key; -+ valueType value; -+ skiplist_node *next[8]; -+ skiplist_node *prev[8]; -+}; -+ -+typedef struct listStructure { -+ int entries; -+ int level; /* Maximum level of the list -+ (1 more than the number of levels in the list) */ -+ skiplist_node *header; /* pointer to header */ -+} skiplist; -+ -+void skiplist_init(skiplist_node *slnode); -+skiplist *new_skiplist(skiplist_node *slnode); -+void free_skiplist(skiplist *l); -+void skiplist_node_init(skiplist_node *node); -+void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed); -+void skiplist_delete(skiplist *l, skiplist_node *node); -+ -+static inline bool skiplist_node_empty(skiplist_node *node) { -+ return (!node->next[0]); -+} -+#endif /* _LINUX_SKIP_LISTS_H */ -diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h -index 3bac0a8ceab2..f48c5c5da651 100644 ---- a/include/uapi/linux/sched.h -+++ b/include/uapi/linux/sched.h -@@ -115,9 +115,16 @@ struct clone_args { - #define SCHED_FIFO 1 - #define SCHED_RR 2 - #define SCHED_BATCH 3 --/* SCHED_ISO: reserved but not implemented yet */ -+/* SCHED_ISO: Implemented on MuQSS only */ - #define SCHED_IDLE 5 -+#ifdef CONFIG_SCHED_MUQSS -+#define SCHED_ISO 4 -+#define SCHED_IDLEPRIO SCHED_IDLE -+#define SCHED_MAX (SCHED_IDLEPRIO) -+#define SCHED_RANGE(policy) ((policy) 
<= SCHED_MAX) -+#else /* CONFIG_SCHED_MUQSS */ - #define SCHED_DEADLINE 6 -+#endif /* CONFIG_SCHED_MUQSS */ - - /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ - #define SCHED_RESET_ON_FORK 0x40000000 -diff --git a/init/Kconfig b/init/Kconfig -index d6a0b31b13dc..7e0eb99bd607 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -92,6 +92,18 @@ config THREAD_INFO_IN_TASK - - menu "General setup" - -+config SCHED_MUQSS -+ bool "MuQSS cpu scheduler" -+ select HIGH_RES_TIMERS -+ help -+ The Multiple Queue Skiplist Scheduler for excellent interactivity and -+ responsiveness on the desktop and highly scalable deterministic -+ low latency on any hardware. -+ -+ Say Y here. -+ default y -+ -+ - config BROKEN - bool - -@@ -510,6 +522,7 @@ config SCHED_THERMAL_PRESSURE - default y if ARM64 - depends on SMP - depends on CPU_FREQ_THERMAL -+ depends on !SCHED_MUQSS - help - Select this option to enable thermal pressure accounting in the - scheduler. Thermal pressure is the value conveyed to the scheduler -@@ -858,6 +871,7 @@ config NUMA_BALANCING - depends on ARCH_SUPPORTS_NUMA_BALANCING - depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY - depends on SMP && NUMA && MIGRATION -+ depends on !SCHED_MUQSS - help - This option adds support for automatic NUMA aware memory/task placement. - The mechanism is quite primitive and is based on migrating memory when -@@ -942,9 +956,13 @@ menuconfig CGROUP_SCHED - help - This feature lets CPU scheduler recognize task groups and control CPU - bandwidth allocation to such task groups. It uses cgroups to group -- tasks. -+ tasks. In combination with MuQSS this is purely a STUB to create the -+ files associated with the CPU controller cgroup but most of the -+ controls do nothing. This is useful for working in environments and -+ with applications that will only work if this control group is -+ present. 
- --if CGROUP_SCHED -+if CGROUP_SCHED && !SCHED_MUQSS - config FAIR_GROUP_SCHED - bool "Group scheduling for SCHED_OTHER" - depends on CGROUP_SCHED -@@ -1073,6 +1091,7 @@ config CGROUP_DEVICE - - config CGROUP_CPUACCT - bool "Simple CPU accounting controller" -+ depends on !SCHED_MUQSS - help - Provides a simple controller for monitoring the - total CPU consumed by the tasks in a cgroup. -@@ -1200,6 +1219,7 @@ config CHECKPOINT_RESTORE - - config SCHED_AUTOGROUP - bool "Automatic process group scheduling" -+ depends on !SCHED_MUQSS - select CGROUPS - select CGROUP_SCHED - select FAIR_GROUP_SCHED -diff --git a/init/init_task.c b/init/init_task.c -index f6889fce64af..2557beb609c0 100644 ---- a/init/init_task.c -+++ b/init/init_task.c -@@ -75,9 +75,17 @@ struct task_struct init_task - .stack = init_stack, - .usage = REFCOUNT_INIT(2), - .flags = PF_KTHREAD, -+#ifdef CONFIG_SCHED_MUQSS -+ .prio = NORMAL_PRIO, -+ .static_prio = MAX_PRIO - 20, -+ .normal_prio = NORMAL_PRIO, -+ .deadline = 0, -+ .time_slice = 1000000, -+#else - .prio = MAX_PRIO - 20, - .static_prio = MAX_PRIO - 20, - .normal_prio = MAX_PRIO - 20, -+#endif - .policy = SCHED_NORMAL, - .cpus_ptr = &init_task.cpus_mask, - .cpus_mask = CPU_MASK_ALL, -@@ -87,6 +95,7 @@ struct task_struct init_task - .restart_block = { - .fn = do_no_restart_syscall, - }, -+#ifndef CONFIG_SCHED_MUQSS - .se = { - .group_node = LIST_HEAD_INIT(init_task.se.group_node), - }, -@@ -94,6 +103,7 @@ struct task_struct init_task - .run_list = LIST_HEAD_INIT(init_task.rt.run_list), - .time_slice = RR_TIMESLICE, - }, -+#endif - .tasks = LIST_HEAD_INIT(init_task.tasks), - #ifdef CONFIG_SMP - .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), -diff --git a/init/main.c b/init/main.c -index e880b4ecb314..fe0a705e83f2 100644 ---- a/init/main.c -+++ b/init/main.c -@@ -1421,6 +1421,8 @@ static int __ref kernel_init(void *unused) - - do_sysctl_args(); - -+ print_scheduler_version(); -+ - if (ramdisk_execute_command) { - ret = 
run_init_process(ramdisk_execute_command); - if (!ret) -diff --git a/kernel/Kconfig.MuQSS b/kernel/Kconfig.MuQSS -new file mode 100644 -index 000000000000..a6a58781ef91 ---- /dev/null -+++ b/kernel/Kconfig.MuQSS -@@ -0,0 +1,105 @@ -+choice -+ prompt "CPU scheduler runqueue sharing" -+ default RQ_MC if SCHED_MUQSS -+ default RQ_NONE -+ -+config RQ_NONE -+ bool "No sharing" -+ help -+ This is the default behaviour where the CPU scheduler has one runqueue -+ per CPU, whether it is a physical or logical CPU (hyperthread). -+ -+ This can still be enabled runtime with the boot parameter -+ rqshare=none -+ -+ If unsure, say N. -+ -+config RQ_SMT -+ bool "SMT (hyperthread) siblings" -+ depends on SCHED_SMT && SCHED_MUQSS -+ -+ help -+ With this option enabled, the CPU scheduler will have one runqueue -+ shared by SMT (hyperthread) siblings. As these logical cores share -+ one physical core, sharing the runqueue resource can lead to decreased -+ overhead, lower latency and higher throughput. -+ -+ This can still be enabled runtime with the boot parameter -+ rqshare=smt -+ -+ If unsure, say N. -+ -+config RQ_MC -+ bool "Multicore siblings" -+ depends on SCHED_MC && SCHED_MUQSS -+ help -+ With this option enabled, the CPU scheduler will have one runqueue -+ shared by multicore siblings in addition to any SMT siblings. -+ As these physical cores share caches, sharing the runqueue resource -+ will lead to lower latency, but its effects on overhead and throughput -+ are less predictable. As a general rule, 6 or fewer cores will likely -+ benefit from this, while larger CPUs will only derive a latency -+ benefit. If your workloads are primarily single threaded, this will -+ possibly worsen throughput. If you are only concerned about latency -+ then enable this regardless of how many cores you have. -+ -+ This can still be enabled runtime with the boot parameter -+ rqshare=mc -+ -+ If unsure, say Y. 
-+ -+config RQ_MC_LLC -+ bool "Multicore siblings (LLC)" -+ depends on SCHED_MC && SCHED_MUQSS -+ help -+ With this option enabled, the CPU scheduler will behave similarly as -+ with "Multicore siblings". -+ This option takes LLC cache into account when scheduling tasks. -+ Option may benefit CPUs with multiple LLC caches, such as Ryzen -+ and Xeon CPUs. -+ -+ This can still be enabled runtime with the boot parameter -+ rqshare=llc -+ -+ If unsure, say N. -+ -+config RQ_SMP -+ bool "Symmetric Multi-Processing" -+ depends on SMP && SCHED_MUQSS -+ help -+ With this option enabled, the CPU scheduler will have one runqueue -+ shared by all physical CPUs unless they are on separate NUMA nodes. -+ As physical CPUs usually do not share resources, sharing the runqueue -+ will normally worsen throughput but improve latency. If you only -+ care about latency enable this. -+ -+ This can still be enabled runtime with the boot parameter -+ rqshare=smp -+ -+ If unsure, say N. -+ -+config RQ_ALL -+ bool "NUMA" -+ depends on SMP && SCHED_MUQSS -+ help -+ With this option enabled, the CPU scheduler will have one runqueue -+ regardless of the architecture configuration, including across NUMA -+ nodes. This can substantially decrease throughput in NUMA -+ configurations, but light NUMA designs will not be dramatically -+ affected. This option should only be chosen if latency is the prime -+ concern. -+ -+ This can still be enabled runtime with the boot parameter -+ rqshare=all -+ -+ If unsure, say N. 
-+endchoice -+ -+config SHARERQ -+ int -+ default 0 if RQ_NONE -+ default 1 if RQ_SMT -+ default 2 if RQ_MC -+ default 3 if RQ_MC_LLC -+ default 4 if RQ_SMP -+ default 5 if RQ_ALL -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 38ef6d06888e..89ed751ac4e4 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -5,7 +5,8 @@ - - choice - prompt "Timer frequency" -- default HZ_250 -+ default HZ_100 if SCHED_MUQSS -+ default HZ_250_NODEF if !SCHED_MUQSS - help - Allows the configuration of the timer frequency. It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -20,11 +21,18 @@ choice - config HZ_100 - bool "100 HZ" - help -+ 100 Hz is a suitable choice in combination with MuQSS which does -+ not rely on ticks for rescheduling interrupts, and is not Hz limited -+ for timeouts and sleeps from both the kernel and userspace. -+ This allows us to benefit from the lower overhead and higher -+ throughput of fewer timer ticks. -+ -+ Non-MuQSS kernels: - 100 Hz is a typical choice for servers, SMP and NUMA systems - with lots of processors that may show reduced performance if - too many timer interrupts are occurring. - -- config HZ_250 -+ config HZ_250_NODEF - bool "250 HZ" - help - 250 Hz is a good compromise choice allowing server performance -@@ -32,7 +40,10 @@ choice - on SMP and NUMA systems. If you are going to be using NTSC video - or multimedia, selected 300Hz instead. - -- config HZ_300 -+ 250 Hz is the default choice for the mainline scheduler but not -+ advantageous in combination with MuQSS. -+ -+ config HZ_300_NODEF - bool "300 HZ" - help - 300 Hz is a good compromise choice allowing server performance -@@ -40,7 +51,7 @@ choice - on SMP and NUMA systems and exactly dividing by both PAL and - NTSC frame rates for video and multimedia work. 
- -- config HZ_1000 -+ config HZ_1000_NODEF - bool "1000 HZ" - help - 1000 Hz is the preferred choice for desktop systems and other -@@ -51,9 +62,9 @@ endchoice - config HZ - int - default 100 if HZ_100 -- default 250 if HZ_250 -- default 300 if HZ_300 -- default 1000 if HZ_1000 -+ default 250 if HZ_250_NODEF -+ default 300 if HZ_300_NODEF -+ default 1000 if HZ_1000_NODEF - - config SCHED_HRTICK - def_bool HIGH_RES_TIMERS -diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt -index bf82259cff96..d9438eb6f91c 100644 ---- a/kernel/Kconfig.preempt -+++ b/kernel/Kconfig.preempt -@@ -2,7 +2,7 @@ - - choice - prompt "Preemption Model" -- default PREEMPT_NONE -+ default PREEMPT - - config PREEMPT_NONE - bool "No Forced Preemption (Server)" -@@ -18,7 +18,7 @@ config PREEMPT_NONE - latencies. - - config PREEMPT_VOLUNTARY -- bool "Voluntary Kernel Preemption (Desktop)" -+ bool "Voluntary Kernel Preemption (Nothing)" - depends on !ARCH_NO_PREEMPT - help - This option reduces the latency of the kernel by adding more -@@ -33,7 +33,8 @@ config PREEMPT_VOLUNTARY - applications to run more 'smoothly' even when the system is - under load. - -- Select this if you are building a kernel for a desktop system. -+ Select this for no system in particular (choose Preemptible -+ instead on a desktop if you know what's good for you). 
- - config PREEMPT - bool "Preemptible Kernel (Low-Latency Desktop)" -diff --git a/kernel/Makefile b/kernel/Makefile -index 9a20016d4900..a2640d78eadb 100644 ---- a/kernel/Makefile -+++ b/kernel/Makefile -@@ -10,7 +10,8 @@ obj-y = fork.o exec_domain.o panic.o \ - extable.o params.o \ - kthread.o sys_ni.o nsproxy.o \ - notifier.o ksysfs.o cred.o reboot.o \ -- async.o range.o smpboot.o ucount.o regset.o -+ async.o range.o smpboot.o ucount.o regset.o \ -+ skip_list.o - - obj-$(CONFIG_BPFILTER) += usermode_driver.o - obj-$(CONFIG_MODULES) += kmod.o -diff --git a/kernel/delayacct.c b/kernel/delayacct.c -index 27725754ac99..769d773c7182 100644 ---- a/kernel/delayacct.c -+++ b/kernel/delayacct.c -@@ -106,7 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) - */ - t1 = tsk->sched_info.pcount; - t2 = tsk->sched_info.run_delay; -- t3 = tsk->se.sum_exec_runtime; -+ t3 = tsk_seruntime(tsk); - - d->cpu_count += t1; - -diff --git a/kernel/exit.c b/kernel/exit.c -index 733e80f334e7..3f3506c851fd 100644 ---- a/kernel/exit.c -+++ b/kernel/exit.c -@@ -121,7 +121,7 @@ static void __exit_signal(struct task_struct *tsk) - sig->curr_target = next_thread(tsk); - } - -- add_device_randomness((const void*) &tsk->se.sum_exec_runtime, -+ add_device_randomness((const void*) &tsk_seruntime(tsk), - sizeof(unsigned long long)); - - /* -@@ -142,7 +142,7 @@ static void __exit_signal(struct task_struct *tsk) - sig->inblock += task_io_get_inblock(tsk); - sig->oublock += task_io_get_oublock(tsk); - task_io_accounting_add(&sig->ioac, &tsk->ioac); -- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; -+ sig->sum_sched_runtime += tsk_seruntime(tsk); - sig->nr_threads--; - __unhash_process(tsk, group_dead); - write_sequnlock(&sig->stats_lock); -diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig -index 10a5aff4eecc..ce3bcc66b48d 100644 ---- a/kernel/irq/Kconfig -+++ b/kernel/irq/Kconfig -@@ -112,6 +112,23 @@ config GENERIC_IRQ_RESERVATION_MODE - config IRQ_FORCED_THREADING 
- bool - -+config FORCE_IRQ_THREADING -+ bool "Make IRQ threading compulsory" -+ depends on IRQ_FORCED_THREADING -+ default n -+ help -+ -+ Make IRQ threading mandatory for any IRQ handlers that support it -+ instead of being optional and requiring the threadirqs kernel -+ parameter. Instead they can be optionally disabled with the -+ nothreadirqs kernel parameter. -+ -+ Enabling this may make some architectures not boot with runqueue -+ sharing and MuQSS. -+ -+ Enable if you are building for a desktop or low latency system, -+ otherwise say N. -+ - config SPARSE_IRQ - bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ - help -diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c -index 5df903fccb60..17a0dd194582 100644 ---- a/kernel/irq/manage.c -+++ b/kernel/irq/manage.c -@@ -25,9 +25,20 @@ - #include "internals.h" - - #if defined(CONFIG_IRQ_FORCED_THREADING) && !defined(CONFIG_PREEMPT_RT) -+#ifdef CONFIG_FORCE_IRQ_THREADING -+__read_mostly bool force_irqthreads = true; -+#else - __read_mostly bool force_irqthreads; -+#endif - EXPORT_SYMBOL_GPL(force_irqthreads); - -+static int __init setup_noforced_irqthreads(char *arg) -+{ -+ force_irqthreads = false; -+ return 0; -+} -+early_param("nothreadirqs", setup_noforced_irqthreads); -+ - static int __init setup_forced_irqthreads(char *arg) - { - force_irqthreads = true; -diff --git a/kernel/kthread.c b/kernel/kthread.c -index 3edaa380dc7b..a1712699726b 100644 ---- a/kernel/kthread.c -+++ b/kernel/kthread.c -@@ -471,6 +471,34 @@ void kthread_bind(struct task_struct *p, unsigned int cpu) - } - EXPORT_SYMBOL(kthread_bind); - -+#if defined(CONFIG_SCHED_MUQSS) && defined(CONFIG_SMP) -+extern void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); -+ -+/* -+ * new_kthread_bind is a special variant of __kthread_bind_mask. 
-+ * For new threads to work on muqss we want to call do_set_cpus_allowed -+ * without the task_cpu being set and the task rescheduled until they're -+ * rescheduled on their own so we call __do_set_cpus_allowed directly which -+ * only changes the cpumask. This is particularly important for smpboot threads -+ * to work. -+ */ -+static void new_kthread_bind(struct task_struct *p, unsigned int cpu) -+{ -+ unsigned long flags; -+ -+ if (WARN_ON(!wait_task_inactive(p, TASK_UNINTERRUPTIBLE))) -+ return; -+ -+ /* It's safe because the task is inactive. */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ __do_set_cpus_allowed(p, cpumask_of(cpu)); -+ p->flags |= PF_NO_SETAFFINITY; -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+} -+#else -+#define new_kthread_bind(p, cpu) kthread_bind(p, cpu) -+#endif -+ - /** - * kthread_create_on_cpu - Create a cpu bound kthread - * @threadfn: the function to run until signal_pending(current). -@@ -491,7 +519,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), - cpu); - if (IS_ERR(p)) - return p; -- kthread_bind(p, cpu); -+ new_kthread_bind(p, cpu); - /* CPU hotplug need to bind once again when unparking the thread. */ - set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags); - to_kthread(p)->cpu = cpu; -diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c -index f6310f848f34..825f9b8e228f 100644 ---- a/kernel/livepatch/transition.c -+++ b/kernel/livepatch/transition.c -@@ -282,7 +282,7 @@ static bool klp_try_switch_task(struct task_struct *task) - { - static char err_buf[STACK_ERR_BUF_SIZE]; - struct rq *rq; -- struct rq_flags flags; -+ struct rq_flags rf; - int ret; - bool success = false; - -@@ -304,7 +304,7 @@ static bool klp_try_switch_task(struct task_struct *task) - * functions. If all goes well, switch the task to the target patch - * state. 
- */ -- rq = task_rq_lock(task, &flags); -+ rq = task_rq_lock(task, &rf); - - if (task_running(rq, task) && task != current) { - snprintf(err_buf, STACK_ERR_BUF_SIZE, -@@ -323,7 +323,7 @@ static bool klp_try_switch_task(struct task_struct *task) - task->patch_state = klp_target_state; - - done: -- task_rq_unlock(rq, task, &flags); -+ task_rq_unlock(rq, task, &rf); - - /* - * Due to console deadlock issues, pr_debug() can't be used while -diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile -index 5fc9c9b70862..1ff14a21193d 100644 ---- a/kernel/sched/Makefile -+++ b/kernel/sched/Makefile -@@ -22,15 +22,23 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) - CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer - endif - -+ifdef CONFIG_SCHED_MUQSS -+obj-y += MuQSS.o clock.o cputime.o -+obj-y += idle.o -+obj-y += wait.o wait_bit.o swait.o completion.o -+ -+obj-$(CONFIG_SMP) += topology.o -+else - obj-y += core.o loadavg.o clock.o cputime.o - obj-y += idle.o fair.o rt.o deadline.o - obj-y += wait.o wait_bit.o swait.o completion.o - - obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o - obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o --obj-$(CONFIG_SCHEDSTATS) += stats.o - obj-$(CONFIG_SCHED_DEBUG) += debug.o - obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o -+endif -+obj-$(CONFIG_SCHEDSTATS) += stats.o - obj-$(CONFIG_CPU_FREQ) += cpufreq.o - obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o - obj-$(CONFIG_MEMBARRIER) += membarrier.o -diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c -new file mode 100644 -index 000000000000..8da537d5226c ---- /dev/null -+++ b/kernel/sched/MuQSS.c -@@ -0,0 +1,7855 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * kernel/sched/MuQSS.c, was kernel/sched.c -+ * -+ * Kernel scheduler and related syscalls -+ * -+ * Copyright (C) 1991-2002 Linus Torvalds -+ * -+ * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and -+ * make semaphores SMP safe -+ * 1998-11-19 Implemented schedule_timeout() and 
related stuff -+ * by Andrea Arcangeli -+ * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: -+ * hybrid priority-list and round-robin design with -+ * an array-switch method of distributing timeslices -+ * and per-CPU runqueues. Cleanups and useful suggestions -+ * by Davide Libenzi, preemptible kernel bits by Robert Love. -+ * 2003-09-03 Interactivity tuning by Con Kolivas. -+ * 2004-04-02 Scheduler domains code by Nick Piggin -+ * 2007-04-15 Work begun on replacing all interactivity tuning with a -+ * fair scheduling design by Con Kolivas. -+ * 2007-05-05 Load balancing (smp-nice) and other improvements -+ * by Peter Williams -+ * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith -+ * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri -+ * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, -+ * Thomas Gleixner, Mike Kravetz -+ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes -+ * a whole lot of those previous things. -+ * 2016-10-01 Multiple Queue Skiplist Scheduler scalable evolution of BFS -+ * scheduler by Con Kolivas. 
-+ * 2019-08-31 LLC bits by Eduards Bezverhijs -+ */ -+#define CREATE_TRACE_POINTS -+#include -+#undef CREATE_TRACE_POINTS -+ -+#include -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+#include -+ -+#include "../workqueue_internal.h" -+#include "../../fs/io-wq.h" -+#include "../smpboot.h" -+ -+#include "MuQSS.h" -+#include "smp.h" -+ -+#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) -+#define rt_task(p) rt_prio((p)->prio) -+#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) -+#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \ -+ (policy) == SCHED_RR) -+#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) -+ -+#define is_idle_policy(policy) ((policy) == SCHED_IDLEPRIO) -+#define idleprio_task(p) unlikely(is_idle_policy((p)->policy)) -+#define task_running_idle(p) unlikely((p)->prio == IDLE_PRIO) -+ -+#define is_iso_policy(policy) ((policy) == SCHED_ISO) -+#define iso_task(p) unlikely(is_iso_policy((p)->policy)) -+#define task_running_iso(p) unlikely((p)->prio == ISO_PRIO) -+ -+#define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT) -+ -+#define ISO_PERIOD (5 * HZ) -+ -+#define STOP_PRIO (MAX_RT_PRIO - 1) -+ -+/* -+ * Some helpers for converting to/from various scales. Use shifts to get -+ * approximate multiples of ten for less overhead. 
-+ */ -+#define APPROX_NS_PS (1073741824) /* Approximate ns per second */ -+#define JIFFIES_TO_NS(TIME) ((TIME) * (APPROX_NS_PS / HZ)) -+#define JIFFY_NS (APPROX_NS_PS / HZ) -+#define JIFFY_US (1048576 / HZ) -+#define NS_TO_JIFFIES(TIME) ((TIME) / JIFFY_NS) -+#define HALF_JIFFY_NS (APPROX_NS_PS / HZ / 2) -+#define HALF_JIFFY_US (1048576 / HZ / 2) -+#define MS_TO_NS(TIME) ((TIME) << 20) -+#define MS_TO_US(TIME) ((TIME) << 10) -+#define NS_TO_MS(TIME) ((TIME) >> 20) -+#define NS_TO_US(TIME) ((TIME) >> 10) -+#define US_TO_NS(TIME) ((TIME) << 10) -+#define TICK_APPROX_NS ((APPROX_NS_PS+HZ/2)/HZ) -+ -+#define RESCHED_US (100) /* Reschedule if less than this many μs left */ -+ -+void print_scheduler_version(void) -+{ -+ printk(KERN_INFO "MuQSS CPU scheduler v0.204 by Con Kolivas.\n"); -+} -+ -+/* Define RQ share levels */ -+#define RQSHARE_NONE 0 -+#define RQSHARE_SMT 1 -+#define RQSHARE_MC 2 -+#define RQSHARE_MC_LLC 3 -+#define RQSHARE_SMP 4 -+#define RQSHARE_ALL 5 -+ -+/* Define locality levels */ -+#define LOCALITY_SAME 0 -+#define LOCALITY_SMT 1 -+#define LOCALITY_MC_LLC 2 -+#define LOCALITY_MC 3 -+#define LOCALITY_SMP 4 -+#define LOCALITY_DISTANT 5 -+ -+/* -+ * This determines what level of runqueue sharing will be done and is -+ * configurable at boot time with the bootparam rqshare = -+ */ -+static int rqshare __read_mostly = CONFIG_SHARERQ; /* Default RQSHARE_MC */ -+ -+static int __init set_rqshare(char *str) -+{ -+ if (!strncmp(str, "none", 4)) { -+ rqshare = RQSHARE_NONE; -+ return 0; -+ } -+ if (!strncmp(str, "smt", 3)) { -+ rqshare = RQSHARE_SMT; -+ return 0; -+ } -+ if (!strncmp(str, "mc", 2)) { -+ rqshare = RQSHARE_MC; -+ return 0; -+ } -+ if (!strncmp(str, "llc", 3)) { -+ rqshare = RQSHARE_MC_LLC; -+ return 0; -+ } -+ if (!strncmp(str, "smp", 3)) { -+ rqshare = RQSHARE_SMP; -+ return 0; -+ } -+ if (!strncmp(str, "all", 3)) { -+ rqshare = RQSHARE_ALL; -+ return 0; -+ } -+ return 1; -+} -+__setup("rqshare=", set_rqshare); -+ -+/* -+ * This is the time all 
tasks within the same priority round robin. -+ * Value is in ms and set to a minimum of 6ms. -+ * Tunable via /proc interface. -+ */ -+int rr_interval __read_mostly = 6; -+ -+/* -+ * Tunable to choose whether to prioritise latency or throughput, simple -+ * binary yes or no -+ */ -+int sched_interactive __read_mostly = 1; -+ -+/* -+ * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks -+ * are allowed to run five seconds as real time tasks. This is the total over -+ * all online cpus. -+ */ -+int sched_iso_cpu __read_mostly = 70; -+ -+/* -+ * sched_yield_type - Choose what sort of yield sched_yield will perform. -+ * 0: No yield. -+ * 1: Yield only to better priority/deadline tasks. (default) -+ * 2: Expire timeslice and recalculate deadline. -+ */ -+int sched_yield_type __read_mostly = 1; -+ -+/* -+ * The relative length of deadline for each priority(nice) level. -+ */ -+static int prio_ratios[NICE_WIDTH] __read_mostly; -+ -+ -+/* -+ * The quota handed out to tasks of all priority levels when refilling their -+ * time_slice. -+ */ -+static inline int timeslice(void) -+{ -+ return MS_TO_US(rr_interval); -+} -+ -+DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -+ -+#ifdef CONFIG_SMP -+/* -+ * Total number of runqueues. Equals number of CPUs when there is no runqueue -+ * sharing but is usually less with SMT/MC sharing of runqueues. -+ */ -+static int total_runqueues __read_mostly = 1; -+ -+static cpumask_t cpu_idle_map ____cacheline_aligned_in_smp; -+ -+struct rq *cpu_rq(int cpu) -+{ -+ return &per_cpu(runqueues, (cpu)); -+} -+#define cpu_curr(cpu) (cpu_rq(cpu)->curr) -+ -+/* -+ * For asym packing, by default the lower numbered cpu has higher priority. 
-+ */ -+int __weak arch_asym_cpu_priority(int cpu) -+{ -+ return -cpu; -+} -+ -+int __weak arch_sd_sibling_asym_packing(void) -+{ -+ return 0*SD_ASYM_PACKING; -+} -+ -+#ifdef CONFIG_SCHED_SMT -+DEFINE_STATIC_KEY_FALSE(sched_smt_present); -+EXPORT_SYMBOL_GPL(sched_smt_present); -+#endif -+ -+#else -+struct rq *uprq; -+#endif /* CONFIG_SMP */ -+ -+#include "stats.h" -+ -+/* -+ * All common locking functions performed on rq->lock. rq->clock is local to -+ * the CPU accessing it so it can be modified just with interrupts disabled -+ * when we're not updating niffies. -+ * Looking up task_rq must be done under rq->lock to be safe. -+ */ -+ -+/* -+ * RQ-clock updating methods: -+ */ -+ -+#ifdef HAVE_SCHED_AVG_IRQ -+static void update_irq_load_avg(struct rq *rq, long delta); -+#else -+static inline void update_irq_load_avg(struct rq *rq, long delta) {} -+#endif -+ -+static void update_rq_clock_task(struct rq *rq, s64 delta) -+{ -+/* -+ * In theory, the compile should just see 0 here, and optimize out the call -+ * to sched_rt_avg_update. But I don't trust it... -+ */ -+ s64 __maybe_unused steal = 0, irq_delta = 0; -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; -+ -+ /* -+ * Since irq_time is only updated on {soft,}irq_exit, we might run into -+ * this case when a previous update_rq_clock() happened inside a -+ * {soft,}irq region. -+ * -+ * When this happens, we stop ->clock_task and only update the -+ * prev_irq_time stamp to account for the part that fit, so that a next -+ * update will consume the rest. This ensures ->clock_task is -+ * monotonic. -+ * -+ * It does however cause some slight miss-attribution of {soft,}irq -+ * time, a more accurate solution would be to update the irq_time using -+ * the current rq->clock timestamp, except that would require using -+ * atomic ops. 
-+ */ -+ if (irq_delta > delta) -+ irq_delta = delta; -+ -+ rq->prev_irq_time += irq_delta; -+ delta -= irq_delta; -+#endif -+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING -+ if (static_key_false((&paravirt_steal_rq_enabled))) { -+ steal = paravirt_steal_clock(cpu_of(rq)); -+ steal -= rq->prev_steal_time_rq; -+ -+ if (unlikely(steal > delta)) -+ steal = delta; -+ -+ rq->prev_steal_time_rq += steal; -+ delta -= steal; -+ } -+#endif -+ rq->clock_task += delta; -+ -+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ -+ if (irq_delta + steal) -+ update_irq_load_avg(rq, irq_delta + steal); -+#endif -+} -+ -+static inline void update_rq_clock(struct rq *rq) -+{ -+ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; -+ -+ if (unlikely(delta < 0)) -+ return; -+ rq->clock += delta; -+ update_rq_clock_task(rq, delta); -+} -+ -+/* -+ * Niffies are a globally increasing nanosecond counter. They're only used by -+ * update_load_avg and time_slice_expired, however deadlines are based on them -+ * across CPUs. Update them whenever we will call one of those functions, and -+ * synchronise them across CPUs whenever we hold both runqueue locks. -+ */ -+static inline void update_clocks(struct rq *rq) -+{ -+ s64 ndiff, minndiff; -+ long jdiff; -+ -+ update_rq_clock(rq); -+ ndiff = rq->clock - rq->old_clock; -+ rq->old_clock = rq->clock; -+ jdiff = jiffies - rq->last_jiffy; -+ -+ /* Subtract any niffies added by balancing with other rqs */ -+ ndiff -= rq->niffies - rq->last_niffy; -+ minndiff = JIFFIES_TO_NS(jdiff) - rq->niffies + rq->last_jiffy_niffies; -+ if (minndiff < 0) -+ minndiff = 0; -+ ndiff = max(ndiff, minndiff); -+ rq->niffies += ndiff; -+ rq->last_niffy = rq->niffies; -+ if (jdiff) { -+ rq->last_jiffy += jdiff; -+ rq->last_jiffy_niffies = rq->niffies; -+ } -+} -+ -+/* -+ * Any time we have two runqueues locked we use that as an opportunity to -+ * synchronise niffies to the highest value as idle ticks may have artificially -+ * kept niffies low on one CPU and the truth can only be later.
-+ */ -+static inline void synchronise_niffies(struct rq *rq1, struct rq *rq2) -+{ -+ if (rq1->niffies > rq2->niffies) -+ rq2->niffies = rq1->niffies; -+ else -+ rq1->niffies = rq2->niffies; -+} -+ -+/* -+ * double_rq_lock - safely lock two runqueues -+ * -+ * Note this does not disable interrupts like task_rq_lock, -+ * you need to do so manually before calling. -+ */ -+ -+/* For when we know rq1 != rq2 */ -+static inline void __double_rq_lock(struct rq *rq1, struct rq *rq2) -+ __acquires(rq1->lock) -+ __acquires(rq2->lock) -+{ -+ if (rq1 < rq2) { -+ raw_spin_lock(rq1->lock); -+ raw_spin_lock_nested(rq2->lock, SINGLE_DEPTH_NESTING); -+ } else { -+ raw_spin_lock(rq2->lock); -+ raw_spin_lock_nested(rq1->lock, SINGLE_DEPTH_NESTING); -+ } -+} -+ -+static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) -+ __acquires(rq1->lock) -+ __acquires(rq2->lock) -+{ -+ BUG_ON(!irqs_disabled()); -+ if (rq1->lock == rq2->lock) { -+ raw_spin_lock(rq1->lock); -+ __acquire(rq2->lock); /* Fake it out ;) */ -+ } else -+ __double_rq_lock(rq1, rq2); -+ synchronise_niffies(rq1, rq2); -+} -+ -+/* -+ * double_rq_unlock - safely unlock two runqueues -+ * -+ * Note this does not restore interrupts like task_rq_unlock, -+ * you need to do so manually after calling. 
-+ */ -+static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) -+ __releases(rq1->lock) -+ __releases(rq2->lock) -+{ -+ raw_spin_unlock(rq1->lock); -+ if (rq1->lock != rq2->lock) -+ raw_spin_unlock(rq2->lock); -+ else -+ __release(rq2->lock); -+} -+ -+static inline void lock_all_rqs(void) -+{ -+ int cpu; -+ -+ preempt_disable(); -+ for_each_possible_cpu(cpu) { -+ struct rq *rq = cpu_rq(cpu); -+ -+ do_raw_spin_lock(rq->lock); -+ } -+} -+ -+static inline void unlock_all_rqs(void) -+{ -+ int cpu; -+ -+ for_each_possible_cpu(cpu) { -+ struct rq *rq = cpu_rq(cpu); -+ -+ do_raw_spin_unlock(rq->lock); -+ } -+ preempt_enable(); -+} -+ -+/* Specially nest trylock an rq */ -+static inline bool trylock_rq(struct rq *this_rq, struct rq *rq) -+{ -+ if (unlikely(!do_raw_spin_trylock(rq->lock))) -+ return false; -+ spin_acquire(&rq->lock->dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); -+ synchronise_niffies(this_rq, rq); -+ return true; -+} -+ -+/* Unlock a specially nested trylocked rq */ -+static inline void unlock_rq(struct rq *rq) -+{ -+ spin_release(&rq->lock->dep_map, _RET_IP_); -+ do_raw_spin_unlock(rq->lock); -+} -+ -+/* -+ * cmpxchg based fetch_or, macro so it works for different integer types -+ */ -+#define fetch_or(ptr, mask) \ -+ ({ \ -+ typeof(ptr) _ptr = (ptr); \ -+ typeof(mask) _mask = (mask); \ -+ typeof(*_ptr) _old, _val = *_ptr; \ -+ \ -+ for (;;) { \ -+ _old = cmpxchg(_ptr, _val, _val | _mask); \ -+ if (_old == _val) \ -+ break; \ -+ _val = _old; \ -+ } \ -+ _old; \ -+}) -+ -+#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) -+/* -+ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, -+ * this avoids any races wrt polling state changes and thereby avoids -+ * spurious IPIs. 
-+ */ -+static bool set_nr_and_not_polling(struct task_struct *p) -+{ -+ struct thread_info *ti = task_thread_info(p); -+ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); -+} -+ -+/* -+ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. -+ * -+ * If this returns true, then the idle task promises to call -+ * sched_ttwu_pending() and reschedule soon. -+ */ -+static bool set_nr_if_polling(struct task_struct *p) -+{ -+ struct thread_info *ti = task_thread_info(p); -+ typeof(ti->flags) old, val = READ_ONCE(ti->flags); -+ -+ for (;;) { -+ if (!(val & _TIF_POLLING_NRFLAG)) -+ return false; -+ if (val & _TIF_NEED_RESCHED) -+ return true; -+ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); -+ if (old == val) -+ break; -+ val = old; -+ } -+ return true; -+} -+ -+#else -+static bool set_nr_and_not_polling(struct task_struct *p) -+{ -+ set_tsk_need_resched(p); -+ return true; -+} -+ -+#ifdef CONFIG_SMP -+static bool set_nr_if_polling(struct task_struct *p) -+{ -+ return false; -+} -+#endif -+#endif -+ -+static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) -+{ -+ struct wake_q_node *node = &task->wake_q; -+ -+ /* -+ * Atomically grab the task, if ->wake_q is !nil already it means -+ * its already queued (either by us or someone else) and will get the -+ * wakeup due to that. -+ * -+ * In order to ensure that a pending wakeup will observe our pending -+ * state, even in the failed case, an explicit smp_mb() must be used. -+ */ -+ smp_mb__before_atomic(); -+ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) -+ return false; -+ -+ /* -+ * The head is context local, there can be no concurrency. -+ */ -+ *head->lastp = node; -+ head->lastp = &node->next; -+ return true; -+} -+ -+/** -+ * wake_q_add() - queue a wakeup for 'later' waking. 
-+ * @head: the wake_q_head to add @task to -+ * @task: the task to queue for 'later' wakeup -+ * -+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the -+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come -+ * instantly. -+ * -+ * This function must be used as-if it were wake_up_process(); IOW the task -+ * must be ready to be woken at this location. -+ */ -+void wake_q_add(struct wake_q_head *head, struct task_struct *task) -+{ -+ if (__wake_q_add(head, task)) -+ get_task_struct(task); -+} -+ -+/** -+ * wake_q_add_safe() - safely queue a wakeup for 'later' waking. -+ * @head: the wake_q_head to add @task to -+ * @task: the task to queue for 'later' wakeup -+ * -+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the -+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come -+ * instantly. -+ * -+ * This function must be used as-if it were wake_up_process(); IOW the task -+ * must be ready to be woken at this location. -+ * -+ * This function is essentially a task-safe equivalent to wake_q_add(). Callers -+ * that already hold reference to @task can call the 'safe' version and trust -+ * wake_q to do the right thing depending whether or not the @task is already -+ * queued for wakeup. -+ */ -+void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) -+{ -+ if (!__wake_q_add(head, task)) -+ put_task_struct(task); -+} -+ -+void wake_up_q(struct wake_q_head *head) -+{ -+ struct wake_q_node *node = head->first; -+ -+ while (node != WAKE_Q_TAIL) { -+ struct task_struct *task; -+ -+ task = container_of(node, struct task_struct, wake_q); -+ BUG_ON(!task); -+ /* Task can safely be re-inserted now */ -+ node = node->next; -+ task->wake_q.next = NULL; -+ -+ /* -+ * wake_up_process() executes a full barrier, which pairs with -+ * the queueing in wake_q_add() so as not to miss wakeups. 
-+ */ -+ wake_up_process(task); -+ put_task_struct(task); -+ } -+} -+ -+static inline void smp_sched_reschedule(int cpu) -+{ -+ if (likely(cpu_online(cpu))) -+ smp_send_reschedule(cpu); -+} -+ -+/* -+ * resched_task - mark a task 'to be rescheduled now'. -+ * -+ * On UP this means the setting of the need_resched flag, on SMP it -+ * might also involve a cross-CPU call to trigger the scheduler on -+ * the target CPU. -+ */ -+void resched_task(struct task_struct *p) -+{ -+ int cpu; -+#ifdef CONFIG_LOCKDEP -+ /* Kernel threads call this when creating workqueues while still -+ * inactive from __kthread_bind_mask, holding only the pi_lock */ -+ if (!(p->flags & PF_KTHREAD)) { -+ struct rq *rq = task_rq(p); -+ -+ lockdep_assert_held(rq->lock); -+ } -+#endif -+ if (test_tsk_need_resched(p)) -+ return; -+ -+ cpu = task_cpu(p); -+ if (cpu == smp_processor_id()) { -+ set_tsk_need_resched(p); -+ set_preempt_need_resched(); -+ return; -+ } -+ -+ if (set_nr_and_not_polling(p)) -+ smp_sched_reschedule(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); -+} -+ -+/* -+ * A task that is not running or queued will not have a node set. -+ * A task that is queued but not running will have a node set. -+ * A task that is currently running will have ->on_cpu set but no node set. -+ */ -+static inline bool task_queued(struct task_struct *p) -+{ -+ return !skiplist_node_empty(&p->node); -+} -+ -+static void enqueue_task(struct rq *rq, struct task_struct *p, int flags); -+static inline void resched_if_idle(struct rq *rq); -+ -+static inline bool deadline_before(u64 deadline, u64 time) -+{ -+ return (deadline < time); -+} -+ -+/* -+ * Deadline is "now" in niffies + (offset by priority). Setting the deadline -+ * is the key to everything. 
It distributes cpu fairly amongst tasks of the -+ * same nice value, it proportions cpu according to nice level, it means the -+ * task that last woke up the longest ago has the earliest deadline, thus -+ * ensuring that interactive tasks get low latency on wake up. The CPU -+ * proportion works out to the square of the virtual deadline difference, so -+ * this equation will give nice 19 3% CPU compared to nice 0. -+ */ -+static inline u64 prio_deadline_diff(int user_prio) -+{ -+ return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128)); -+} -+ -+static inline u64 task_deadline_diff(struct task_struct *p) -+{ -+ return prio_deadline_diff(TASK_USER_PRIO(p)); -+} -+ -+static inline u64 static_deadline_diff(int static_prio) -+{ -+ return prio_deadline_diff(USER_PRIO(static_prio)); -+} -+ -+static inline int longest_deadline_diff(void) -+{ -+ return prio_deadline_diff(39); -+} -+ -+static inline int ms_longest_deadline_diff(void) -+{ -+ return NS_TO_MS(longest_deadline_diff()); -+} -+ -+static inline bool rq_local(struct rq *rq); -+ -+#ifndef SCHED_CAPACITY_SCALE -+#define SCHED_CAPACITY_SCALE 1024 -+#endif -+ -+static inline int rq_load(struct rq *rq) -+{ -+ return rq->nr_running; -+} -+ -+/* -+ * Update the load average for feeding into cpu frequency governors. Use a -+ * rough estimate of a rolling average with ~ time constant of 32ms. -+ * 80/128 ~ 0.63. * 80 / 32768 / 128 == * 5 / 262144 -+ * Make sure a call to update_clocks has been made before calling this to get -+ * an updated rq->niffies. 
-+ */ -+static void update_load_avg(struct rq *rq, unsigned int flags) -+{ -+ long us_interval, load; -+ -+ us_interval = NS_TO_US(rq->niffies - rq->load_update); -+ if (unlikely(us_interval <= 0)) -+ return; -+ -+ load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144); -+ if (unlikely(load < 0)) -+ load = 0; -+ load += rq_load(rq) * SCHED_CAPACITY_SCALE * us_interval * 5 / 262144; -+ rq->load_avg = load; -+ -+ rq->load_update = rq->niffies; -+ update_irq_load_avg(rq, 0); -+ if (likely(rq_local(rq))) -+ cpufreq_trigger(rq, flags); -+} -+ -+#ifdef HAVE_SCHED_AVG_IRQ -+/* -+ * IRQ variant of update_load_avg below. delta is actually time in nanoseconds -+ * here so we scale curload to how long it's been since the last update. -+ */ -+static void update_irq_load_avg(struct rq *rq, long delta) -+{ -+ long us_interval, load; -+ -+ us_interval = NS_TO_US(rq->niffies - rq->irq_load_update); -+ if (unlikely(us_interval <= 0)) -+ return; -+ -+ load = rq->irq_load_avg - (rq->irq_load_avg * us_interval * 5 / 262144); -+ if (unlikely(load < 0)) -+ load = 0; -+ load += NS_TO_US(delta) * SCHED_CAPACITY_SCALE * 5 / 262144; -+ rq->irq_load_avg = load; -+ -+ rq->irq_load_update = rq->niffies; -+} -+#endif -+ -+/* -+ * Removing from the runqueue. Enter with rq locked. Deleting a task -+ * from the skip list is done via the stored node reference in the task struct -+ * and does not require a full look up. Thus it occurs in O(k) time where k -+ * is the "level" of the list the task was stored at - usually < 4, max 8. 
-+ */ -+static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) -+{ -+ skiplist_delete(rq->sl, &p->node); -+ rq->best_key = rq->node->next[0]->key; -+ update_clocks(rq); -+ -+ if (!(flags & DEQUEUE_SAVE)) { -+ sched_info_dequeued(rq, p); -+ psi_dequeue(p, flags & DEQUEUE_SLEEP); -+ } -+ rq->nr_running--; -+ if (rt_task(p)) -+ rq->rt_nr_running--; -+ update_load_avg(rq, flags); -+} -+ -+#ifdef CONFIG_PREEMPT_RCU -+static bool rcu_read_critical(struct task_struct *p) -+{ -+ return p->rcu_read_unlock_special.b.blocked; -+} -+#else /* CONFIG_PREEMPT_RCU */ -+#define rcu_read_critical(p) (false) -+#endif /* CONFIG_PREEMPT_RCU */ -+ -+/* -+ * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as -+ * an idle task, we ensure none of the following conditions are met. -+ */ -+static bool idleprio_suitable(struct task_struct *p) -+{ -+ return (!(p->sched_contributes_to_load) && !(p->flags & (PF_EXITING)) && -+ !signal_pending(p) && !rcu_read_critical(p) && !freezing(p)); -+} -+ -+/* -+ * To determine if a task of SCHED_ISO can run in pseudo-realtime, we check -+ * that the iso_refractory flag is not set. -+ */ -+static inline bool isoprio_suitable(struct rq *rq) -+{ -+ return !rq->iso_refractory; -+} -+ -+static inline void inc_nr_running(struct rq *rq) -+{ -+ rq->nr_running++; -+ if (trace_sched_update_nr_running_tp_enabled()) { -+ call_trace_sched_update_nr_running(rq, 1); -+ } -+} -+ -+static inline void dec_nr_running(struct rq *rq) -+{ -+ rq->nr_running--; -+ if (trace_sched_update_nr_running_tp_enabled()) { -+ call_trace_sched_update_nr_running(rq, -1); -+ } -+} -+ -+/* -+ * Adding to the runqueue. Enter with rq locked. 
-+ */ -+static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) -+{ -+ unsigned int randseed, cflags = 0; -+ u64 sl_id; -+ -+ if (!rt_task(p)) { -+ /* Check it hasn't gotten rt from PI */ -+ if ((idleprio_task(p) && idleprio_suitable(p)) || -+ (iso_task(p) && isoprio_suitable(rq))) -+ p->prio = p->normal_prio; -+ else -+ p->prio = NORMAL_PRIO; -+ } else -+ rq->rt_nr_running++; -+ /* -+ * The sl_id key passed to the skiplist generates a sorted list. -+ * Realtime and sched iso tasks run FIFO so they only need be sorted -+ * according to priority. The skiplist will put tasks of the same -+ * key inserted later in FIFO order. Tasks of sched normal, batch -+ * and idleprio are sorted according to their deadlines. Idleprio -+ * tasks are offset by an impossibly large deadline value ensuring -+ * they get sorted into last positions, but still according to their -+ * own deadlines. This creates a "landscape" of skiplists running -+ * from priority 0 realtime in first place to the lowest priority -+ * idleprio tasks last. Skiplist insertion is an O(log n) process. -+ */ -+ if (p->prio <= ISO_PRIO) { -+ sl_id = p->prio; -+ } else { -+ sl_id = p->deadline; -+ if (idleprio_task(p)) { -+ if (p->prio == IDLE_PRIO) -+ sl_id |= 0xF000000000000000; -+ else -+ sl_id += longest_deadline_diff(); -+ } -+ } -+ /* -+ * Some architectures don't have better than microsecond resolution -+ * so mask out ~microseconds as the random seed for skiplist insertion. 
-+ */ -+ update_clocks(rq); -+ if (!(flags & ENQUEUE_RESTORE)) { -+ sched_info_queued(rq, p); -+ psi_enqueue(p, flags & ENQUEUE_WAKEUP); -+ } -+ -+ randseed = (rq->niffies >> 10) & 0xFFFFFFFF; -+ skiplist_insert(rq->sl, &p->node, sl_id, p, randseed); -+ rq->best_key = rq->node->next[0]->key; -+ if (p->in_iowait) -+ cflags |= SCHED_CPUFREQ_IOWAIT; -+ inc_nr_running(rq); -+ update_load_avg(rq, cflags); -+} -+ -+/* -+ * Returns the relative length of deadline all compared to the shortest -+ * deadline which is that of nice -20. -+ */ -+static inline int task_prio_ratio(struct task_struct *p) -+{ -+ return prio_ratios[TASK_USER_PRIO(p)]; -+} -+ -+/* -+ * task_timeslice - all tasks of all priorities get the exact same timeslice -+ * length. CPU distribution is handled by giving different deadlines to -+ * tasks of different priorities. Use 128 as the base value for fast shifts. -+ */ -+static inline int task_timeslice(struct task_struct *p) -+{ -+ return (rr_interval * task_prio_ratio(p) / 128); -+} -+ -+#ifdef CONFIG_SMP -+/* Entered with rq locked */ -+static inline void resched_if_idle(struct rq *rq) -+{ -+ if (rq_idle(rq)) -+ resched_task(rq->curr); -+} -+ -+static inline bool rq_local(struct rq *rq) -+{ -+ return (rq->cpu == smp_processor_id()); -+} -+#ifdef CONFIG_SMT_NICE -+static const cpumask_t *thread_cpumask(int cpu); -+ -+/* Find the best real time priority running on any SMT siblings of cpu and if -+ * none are running, the static priority of the best deadline task running. -+ * The lookups to the other runqueues is done lockless as the occasional wrong -+ * value would be harmless. 
*/ -+static int best_smt_bias(struct rq *this_rq) -+{ -+ int other_cpu, best_bias = 0; -+ -+ for_each_cpu(other_cpu, &this_rq->thread_mask) { -+ struct rq *rq = cpu_rq(other_cpu); -+ -+ if (rq_idle(rq)) -+ continue; -+ if (unlikely(!rq->online)) -+ continue; -+ if (!rq->rq_mm) -+ continue; -+ if (likely(rq->rq_smt_bias > best_bias)) -+ best_bias = rq->rq_smt_bias; -+ } -+ return best_bias; -+} -+ -+static int task_prio_bias(struct task_struct *p) -+{ -+ if (rt_task(p)) -+ return 1 << 30; -+ else if (task_running_iso(p)) -+ return 1 << 29; -+ else if (task_running_idle(p)) -+ return 0; -+ return MAX_PRIO - p->static_prio; -+} -+ -+static bool smt_always_schedule(struct task_struct __maybe_unused *p, struct rq __maybe_unused *this_rq) -+{ -+ return true; -+} -+ -+static bool (*smt_schedule)(struct task_struct *p, struct rq *this_rq) = &smt_always_schedule; -+ -+/* We've already decided p can run on CPU, now test if it shouldn't for SMT -+ * nice reasons. */ -+static bool smt_should_schedule(struct task_struct *p, struct rq *this_rq) -+{ -+ int best_bias, task_bias; -+ -+ /* Kernel threads always run */ -+ if (unlikely(!p->mm)) -+ return true; -+ if (rt_task(p)) -+ return true; -+ if (!idleprio_suitable(p)) -+ return true; -+ best_bias = best_smt_bias(this_rq); -+ /* The smt siblings are all idle or running IDLEPRIO */ -+ if (best_bias < 1) -+ return true; -+ task_bias = task_prio_bias(p); -+ if (task_bias < 1) -+ return false; -+ if (task_bias >= best_bias) -+ return true; -+ /* Dither 25% cpu of normal tasks regardless of nice difference */ -+ if (best_bias % 4 == 1) -+ return true; -+ /* Sorry, you lose */ -+ return false; -+} -+#else /* CONFIG_SMT_NICE */ -+#define smt_schedule(p, this_rq) (true) -+#endif /* CONFIG_SMT_NICE */ -+ -+static inline void atomic_set_cpu(int cpu, cpumask_t *cpumask) -+{ -+ set_bit(cpu, (volatile unsigned long *)cpumask); -+} -+ -+/* -+ * The cpu_idle_map stores a bitmap of all the CPUs currently idle to -+ * allow easy lookup of whether 
any suitable idle CPUs are available. -+ * It's cheaper to maintain a binary yes/no if there are any idle CPUs on the -+ * idle_cpus variable than to do a full bitmask check when we are busy. The -+ * bits are set atomically but read locklessly as occasional false positive / -+ * negative is harmless. -+ */ -+static inline void set_cpuidle_map(int cpu) -+{ -+ if (likely(cpu_online(cpu))) -+ atomic_set_cpu(cpu, &cpu_idle_map); -+} -+ -+static inline void atomic_clear_cpu(int cpu, cpumask_t *cpumask) -+{ -+ clear_bit(cpu, (volatile unsigned long *)cpumask); -+} -+ -+static inline void clear_cpuidle_map(int cpu) -+{ -+ atomic_clear_cpu(cpu, &cpu_idle_map); -+} -+ -+static bool suitable_idle_cpus(struct task_struct *p) -+{ -+ return (cpumask_intersects(p->cpus_ptr, &cpu_idle_map)); -+} -+ -+/* -+ * Resched current on rq. We don't know if rq is local to this CPU nor if it -+ * is locked so we do not use an intermediate variable for the task to avoid -+ * having it dereferenced. -+ */ -+static void resched_curr(struct rq *rq) -+{ -+ int cpu; -+ -+ if (test_tsk_need_resched(rq->curr)) -+ return; -+ -+ rq->preempt = rq->curr; -+ cpu = rq->cpu; -+ -+ /* We're doing this without holding the rq lock if it's not task_rq */ -+ -+ if (cpu == smp_processor_id()) { -+ set_tsk_need_resched(rq->curr); -+ set_preempt_need_resched(); -+ return; -+ } -+ -+ if (set_nr_and_not_polling(rq->curr)) -+ smp_sched_reschedule(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); -+} -+ -+#define CPUIDLE_DIFF_THREAD (1) -+#define CPUIDLE_DIFF_CORE_LLC (2) -+#define CPUIDLE_DIFF_CORE (4) -+#define CPUIDLE_CACHE_BUSY (8) -+#define CPUIDLE_DIFF_CPU (16) -+#define CPUIDLE_THREAD_BUSY (32) -+#define CPUIDLE_DIFF_NODE (64) -+ -+/* -+ * The best idle CPU is chosen according to the CPUIDLE ranking above where the -+ * lowest value would give the most suitable CPU to schedule p onto next. 
The -+ * order works out to be the following: -+ * -+ * Same thread, idle or busy cache, idle or busy threads -+ * Other core, same cache, idle or busy cache, idle threads. -+ * Same node, other CPU, idle cache, idle threads. -+ * Same node, other CPU, busy cache, idle threads. -+ * Other core, same cache, busy threads. -+ * Same node, other CPU, busy threads. -+ * Other node, other CPU, idle cache, idle threads. -+ * Other node, other CPU, busy cache, idle threads. -+ * Other node, other CPU, busy threads. -+ */ -+static int best_mask_cpu(int best_cpu, struct rq *rq, cpumask_t *tmpmask) -+{ -+ int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THREAD_BUSY | -+ CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY | CPUIDLE_DIFF_CORE | -+ CPUIDLE_DIFF_CORE_LLC | CPUIDLE_DIFF_THREAD; -+ int cpu_tmp; -+ -+ if (cpumask_test_cpu(best_cpu, tmpmask)) -+ goto out; -+ -+ for_each_cpu(cpu_tmp, tmpmask) { -+ int ranking, locality; -+ struct rq *tmp_rq; -+ -+ ranking = 0; -+ tmp_rq = cpu_rq(cpu_tmp); -+ -+ locality = rq->cpu_locality[cpu_tmp]; -+#ifdef CONFIG_NUMA -+ if (locality > LOCALITY_SMP) -+ ranking |= CPUIDLE_DIFF_NODE; -+ else -+#endif -+ if (locality > LOCALITY_MC) -+ ranking |= CPUIDLE_DIFF_CPU; -+#ifdef CONFIG_SCHED_MC -+ else if (locality == LOCALITY_MC_LLC) -+ ranking |= CPUIDLE_DIFF_CORE_LLC; -+ else if (locality == LOCALITY_MC) -+ ranking |= CPUIDLE_DIFF_CORE; -+ if (!(tmp_rq->cache_idle(tmp_rq))) -+ ranking |= CPUIDLE_CACHE_BUSY; -+#endif -+#ifdef CONFIG_SCHED_SMT -+ if (locality == LOCALITY_SMT) -+ ranking |= CPUIDLE_DIFF_THREAD; -+#endif -+ if (ranking < best_ranking -+#ifdef CONFIG_SCHED_SMT -+ || (ranking == best_ranking && (tmp_rq->siblings_idle(tmp_rq))) -+#endif -+ ) { -+ best_cpu = cpu_tmp; -+ best_ranking = ranking; -+ } -+ } -+out: -+ return best_cpu; -+} -+ -+bool cpus_share_cache(int this_cpu, int that_cpu) -+{ -+ struct rq *this_rq = cpu_rq(this_cpu); -+ -+ return (this_rq->cpu_locality[that_cpu] < LOCALITY_SMP); -+} -+ -+/* As per resched_curr but only will resched 
idle task */ -+static inline void resched_idle(struct rq *rq) -+{ -+ if (test_tsk_need_resched(rq->idle)) -+ return; -+ -+ rq->preempt = rq->idle; -+ -+ set_tsk_need_resched(rq->idle); -+ -+ if (rq_local(rq)) { -+ set_preempt_need_resched(); -+ return; -+ } -+ -+ smp_sched_reschedule(rq->cpu); -+} -+ -+DEFINE_PER_CPU(cpumask_t, idlemask); -+ -+static struct rq *resched_best_idle(struct task_struct *p, int cpu) -+{ -+ cpumask_t *tmpmask = &(per_cpu(idlemask, cpu)); -+ struct rq *rq; -+ int best_cpu; -+ -+ cpumask_and(tmpmask, p->cpus_ptr, &cpu_idle_map); -+ best_cpu = best_mask_cpu(cpu, task_rq(p), tmpmask); -+ rq = cpu_rq(best_cpu); -+ if (!smt_schedule(p, rq)) -+ return NULL; -+ rq->preempt = p; -+ resched_idle(rq); -+ return rq; -+} -+ -+static inline void resched_suitable_idle(struct task_struct *p) -+{ -+ if (suitable_idle_cpus(p)) -+ resched_best_idle(p, task_cpu(p)); -+} -+ -+static inline struct rq *rq_order(struct rq *rq, int cpu) -+{ -+ return rq->rq_order[cpu]; -+} -+#else /* CONFIG_SMP */ -+static inline void set_cpuidle_map(int cpu) -+{ -+} -+ -+static inline void clear_cpuidle_map(int cpu) -+{ -+} -+ -+static inline bool suitable_idle_cpus(struct task_struct *p) -+{ -+ return uprq->curr == uprq->idle; -+} -+ -+static inline void resched_suitable_idle(struct task_struct *p) -+{ -+} -+ -+static inline void resched_curr(struct rq *rq) -+{ -+ resched_task(rq->curr); -+} -+ -+static inline void resched_if_idle(struct rq *rq) -+{ -+} -+ -+static inline bool rq_local(struct rq *rq) -+{ -+ return true; -+} -+ -+static inline struct rq *rq_order(struct rq *rq, int cpu) -+{ -+ return rq; -+} -+ -+static inline bool smt_schedule(struct task_struct *p, struct rq *rq) -+{ -+ return true; -+} -+#endif /* CONFIG_SMP */ -+ -+static inline int normal_prio(struct task_struct *p) -+{ -+ if (has_rt_policy(p)) -+ return MAX_RT_PRIO - 1 - p->rt_priority; -+ if (idleprio_task(p)) -+ return IDLE_PRIO; -+ if (iso_task(p)) -+ return ISO_PRIO; -+ return NORMAL_PRIO; -+} -+ -+/* 
-+ * Calculate the current priority, i.e. the priority -+ * taken into account by the scheduler. This value might -+ * be boosted by RT tasks as it will be RT if the task got -+ * RT-boosted. If not then it returns p->normal_prio. -+ */ -+static int effective_prio(struct task_struct *p) -+{ -+ p->normal_prio = normal_prio(p); -+ /* -+ * If we are RT tasks or we were boosted to RT priority, -+ * keep the priority unchanged. Otherwise, update priority -+ * to the normal priority: -+ */ -+ if (!rt_prio(p->prio)) -+ return p->normal_prio; -+ return p->prio; -+} -+ -+/* -+ * activate_task - move a task to the runqueue. Enter with rq locked. -+ */ -+static void activate_task(struct rq *rq, struct task_struct *p, int flags) -+{ -+ resched_if_idle(rq); -+ -+ /* -+ * Sleep time is in units of nanosecs, so shift by 20 to get a -+ * milliseconds-range estimation of the amount of time that the task -+ * spent sleeping: -+ */ -+ if (unlikely(prof_on == SLEEP_PROFILING)) { -+ if (p->state == TASK_UNINTERRUPTIBLE) -+ profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), -+ (rq->niffies - p->last_ran) >> 20); -+ } -+ -+ p->prio = effective_prio(p); -+ enqueue_task(rq, p, flags); -+ p->on_rq = TASK_ON_RQ_QUEUED; -+} -+ -+/* -+ * deactivate_task - If it's running, it's not on the runqueue and we can just -+ * decrement the nr_running. Enter with rq locked. -+ */ -+static inline void deactivate_task(struct task_struct *p, struct rq *rq) -+{ -+ p->on_rq = 0; -+ sched_info_dequeued(rq, p); -+ /* deactivate_task is always DEQUEUE_SLEEP in muqss */ -+ psi_dequeue(p, DEQUEUE_SLEEP); -+} -+ -+#ifdef CONFIG_SMP -+void set_task_cpu(struct task_struct *p, unsigned int new_cpu) -+{ -+ struct rq *rq; -+ -+ if (task_cpu(p) == new_cpu) -+ return; -+ -+ /* Do NOT call set_task_cpu on a currently queued task as we will not -+ * be reliably holding the rq lock after changing CPU. 
*/ -+ BUG_ON(task_queued(p)); -+ rq = task_rq(p); -+ -+#ifdef CONFIG_LOCKDEP -+ /* -+ * The caller should hold either p->pi_lock or rq->lock, when changing -+ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. -+ * -+ * Furthermore, all task_rq users should acquire both locks, see -+ * task_rq_lock(). -+ */ -+ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || -+ lockdep_is_held(rq->lock))); -+#endif -+ -+ trace_sched_migrate_task(p, new_cpu); -+ rseq_migrate(p); -+ perf_event_task_migrate(p); -+ -+ /* -+ * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be -+ * successfully executed on another CPU. We must ensure that updates of -+ * per-task data have been completed by this moment. -+ */ -+ smp_wmb(); -+ -+ p->wake_cpu = new_cpu; -+ -+ if (task_running(rq, p)) { -+ /* -+ * We should only be calling this on a running task if we're -+ * holding rq lock. -+ */ -+ lockdep_assert_held(rq->lock); -+ -+ /* -+ * We can't change the task_thread_info CPU on a running task -+ * as p will still be protected by the rq lock of the CPU it -+ * is still running on so we only set the wake_cpu for it to be -+ * lazily updated once off the CPU. -+ */ -+ return; -+ } -+ -+#ifdef CONFIG_THREAD_INFO_IN_TASK -+ WRITE_ONCE(p->cpu, new_cpu); -+#else -+ WRITE_ONCE(task_thread_info(p)->cpu, new_cpu); -+#endif -+ /* We're no longer protecting p after this point since we're holding -+ * the wrong runqueue lock. */ -+} -+#endif /* CONFIG_SMP */ -+ -+/* -+ * Move a task off the runqueue and take it to a cpu for it will -+ * become the running task. -+ */ -+static inline void take_task(struct rq *rq, int cpu, struct task_struct *p) -+{ -+ struct rq *p_rq = task_rq(p); -+ -+ dequeue_task(p_rq, p, DEQUEUE_SAVE); -+ if (p_rq != rq) { -+ sched_info_dequeued(p_rq, p); -+ sched_info_queued(rq, p); -+ } -+ set_task_cpu(p, cpu); -+} -+ -+/* -+ * Returns a descheduling task to the runqueue unless it is being -+ * deactivated. 
-+ */ -+static inline void return_task(struct task_struct *p, struct rq *rq, -+ int cpu, bool deactivate) -+{ -+ if (deactivate) -+ deactivate_task(p, rq); -+ else { -+#ifdef CONFIG_SMP -+ /* -+ * set_task_cpu was called on the running task that doesn't -+ * want to deactivate so it has to be enqueued to a different -+ * CPU and we need its lock. Tag it to be moved with as the -+ * lock is dropped in finish_lock_switch. -+ */ -+ if (unlikely(p->wake_cpu != cpu)) -+ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); -+ else -+#endif -+ enqueue_task(rq, p, ENQUEUE_RESTORE); -+ } -+} -+ -+/* Enter with rq lock held. We know p is on the local cpu */ -+static inline void __set_tsk_resched(struct task_struct *p) -+{ -+ set_tsk_need_resched(p); -+ set_preempt_need_resched(); -+} -+ -+/** -+ * task_curr - is this task currently executing on a CPU? -+ * @p: the task in question. -+ * -+ * Return: 1 if the task is currently executing. 0 otherwise. -+ */ -+inline int task_curr(const struct task_struct *p) -+{ -+ return cpu_curr(task_cpu(p)) == p; -+} -+ -+#ifdef CONFIG_SMP -+/* -+ * wait_task_inactive - wait for a thread to unschedule. -+ * -+ * If @match_state is nonzero, it's the @p->state value just checked and -+ * not expected to change. If it changes, i.e. @p might have woken up, -+ * then return zero. When we succeed in waiting for @p to be off its CPU, -+ * we return a positive number (its total switch count). If a second call -+ * a short while later returns the same number, the caller can be sure that -+ * @p has remained unscheduled the whole time. -+ * -+ * The caller must ensure that the task *will* unschedule sometime soon, -+ * else this function might spin for a *long* time. This function can't -+ * be called with interrupts off, or it may introduce deadlock with -+ * smp_call_function() if an IPI is sent by the same process we are -+ * waiting to become inactive. 
-+ */ -+unsigned long wait_task_inactive(struct task_struct *p, long match_state) -+{ -+ int running, queued; -+ struct rq_flags rf; -+ unsigned long ncsw; -+ struct rq *rq; -+ -+ for (;;) { -+ rq = task_rq(p); -+ -+ /* -+ * If the task is actively running on another CPU -+ * still, just relax and busy-wait without holding -+ * any locks. -+ * -+ * NOTE! Since we don't hold any locks, it's not -+ * even sure that "rq" stays as the right runqueue! -+ * But we don't care, since this will return false -+ * if the runqueue has changed and p is actually now -+ * running somewhere else! -+ */ -+ while (task_running(rq, p)) { -+ if (match_state && unlikely(p->state != match_state)) -+ return 0; -+ cpu_relax(); -+ } -+ -+ /* -+ * Ok, time to look more closely! We need the rq -+ * lock now, to be *sure*. If we're wrong, we'll -+ * just go back and repeat. -+ */ -+ rq = task_rq_lock(p, &rf); -+ trace_sched_wait_task(p); -+ running = task_running(rq, p); -+ queued = task_on_rq_queued(p); -+ ncsw = 0; -+ if (!match_state || p->state == match_state) -+ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ -+ task_rq_unlock(rq, p, &rf); -+ -+ /* -+ * If it changed from the expected state, bail out now. -+ */ -+ if (unlikely(!ncsw)) -+ break; -+ -+ /* -+ * Was it really running after all now that we -+ * checked with the proper locks actually held? -+ * -+ * Oops. Go back and try again.. -+ */ -+ if (unlikely(running)) { -+ cpu_relax(); -+ continue; -+ } -+ -+ /* -+ * It's not enough that it's not actively running, -+ * it must be off the runqueue _entirely_, and not -+ * preempted! -+ * -+ * So if it was still runnable (but just not actively -+ * running right now), it's preempted, and we should -+ * yield - it could be a while. -+ */ -+ if (unlikely(queued)) { -+ ktime_t to = NSEC_PER_SEC / HZ; -+ -+ set_current_state(TASK_UNINTERRUPTIBLE); -+ schedule_hrtimeout(&to, HRTIMER_MODE_REL); -+ continue; -+ } -+ -+ /* -+ * Ahh, all good. 
It wasn't running, and it wasn't -+ * runnable, which means that it will never become -+ * running in the future either. We're all done! -+ */ -+ break; -+ } -+ -+ return ncsw; -+} -+ -+/*** -+ * kick_process - kick a running thread to enter/exit the kernel -+ * @p: the to-be-kicked thread -+ * -+ * Cause a process which is running on another CPU to enter -+ * kernel-mode, without any delay. (to get signals handled.) -+ * -+ * NOTE: this function doesn't have to take the runqueue lock, -+ * because all it wants to ensure is that the remote task enters -+ * the kernel. If the IPI races and the task has been migrated -+ * to another CPU then no harm is done and the purpose has been -+ * achieved as well. -+ */ -+void kick_process(struct task_struct *p) -+{ -+ int cpu; -+ -+ preempt_disable(); -+ cpu = task_cpu(p); -+ if ((cpu != smp_processor_id()) && task_curr(p)) -+ smp_sched_reschedule(cpu); -+ preempt_enable(); -+} -+EXPORT_SYMBOL_GPL(kick_process); -+#endif -+ -+/* -+ * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the -+ * basis of earlier deadlines. SCHED_IDLEPRIO don't preempt anything else or -+ * between themselves, they cooperatively multitask. An idle rq scores as -+ * prio PRIO_LIMIT so it is always preempted. -+ */ -+static inline bool -+can_preempt(struct task_struct *p, int prio, u64 deadline) -+{ -+ /* Better static priority RT task or better policy preemption */ -+ if (p->prio < prio) -+ return true; -+ if (p->prio > prio) -+ return false; -+ if (p->policy == SCHED_BATCH) -+ return false; -+ /* SCHED_NORMAL and ISO will preempt based on deadline */ -+ if (!deadline_before(p->deadline, deadline)) -+ return false; -+ return true; -+} -+ -+#ifdef CONFIG_SMP -+ -+/* -+ * Per-CPU kthreads are allowed to run on !active && online CPUs, see -+ * __set_cpus_allowed_ptr(). 
-+ */ -+static inline bool is_cpu_allowed(struct task_struct *p, int cpu) -+{ -+ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) -+ return false; -+ -+ if (is_per_cpu_kthread(p)) -+ return cpu_online(cpu); -+ -+ return cpu_active(cpu); -+} -+ -+/* -+ * Check to see if p can run on cpu, and if not, whether there are any online -+ * CPUs it can run on instead. This only happens with the hotplug threads that -+ * bring up the CPUs. -+ */ -+static inline bool sched_other_cpu(struct task_struct *p, int cpu) -+{ -+ if (likely(cpumask_test_cpu(cpu, p->cpus_ptr))) -+ return false; -+ if (p->nr_cpus_allowed == 1) { -+ cpumask_t valid_mask; -+ -+ cpumask_and(&valid_mask, p->cpus_ptr, cpu_online_mask); -+ if (unlikely(cpumask_empty(&valid_mask))) -+ return false; -+ } -+ return true; -+} -+ -+static inline bool needs_other_cpu(struct task_struct *p, int cpu) -+{ -+ if (cpumask_test_cpu(cpu, p->cpus_ptr)) -+ return false; -+ return true; -+} -+ -+#define cpu_online_map (*(cpumask_t *)cpu_online_mask) -+ -+static void try_preempt(struct task_struct *p, struct rq *this_rq) -+{ -+ int i, this_entries = rq_load(this_rq); -+ cpumask_t tmp; -+ -+ if (suitable_idle_cpus(p) && resched_best_idle(p, task_cpu(p))) -+ return; -+ -+ /* IDLEPRIO tasks never preempt anything but idle */ -+ if (p->policy == SCHED_IDLEPRIO) -+ return; -+ -+ cpumask_and(&tmp, &cpu_online_map, p->cpus_ptr); -+ -+ for (i = 0; i < num_online_cpus(); i++) { -+ struct rq *rq = this_rq->cpu_order[i]; -+ -+ if (!cpumask_test_cpu(rq->cpu, &tmp)) -+ continue; -+ -+ if (!sched_interactive && rq != this_rq && rq_load(rq) <= this_entries) -+ continue; -+ if (smt_schedule(p, rq) && can_preempt(p, rq->rq_prio, rq->rq_deadline)) { -+ /* We set rq->preempting lockless, it's a hint only */ -+ rq->preempting = p; -+ resched_curr(rq); -+ return; -+ } -+ } -+} -+ -+static int __set_cpus_allowed_ptr(struct task_struct *p, -+ const struct cpumask *new_mask, bool check); -+#else /* CONFIG_SMP */ -+static inline bool needs_other_cpu(struct 
task_struct *p, int cpu) -+{ -+ return false; -+} -+ -+static void try_preempt(struct task_struct *p, struct rq *this_rq) -+{ -+ if (p->policy == SCHED_IDLEPRIO) -+ return; -+ if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline)) -+ resched_curr(uprq); -+} -+ -+static inline int __set_cpus_allowed_ptr(struct task_struct *p, -+ const struct cpumask *new_mask, bool check) -+{ -+ return set_cpus_allowed_ptr(p, new_mask); -+} -+#endif /* CONFIG_SMP */ -+ -+static void -+ttwu_stat(struct task_struct *p, int cpu, int wake_flags) -+{ -+ struct rq *rq; -+ -+ if (!schedstat_enabled()) -+ return; -+ -+ rq = this_rq(); -+ -+#ifdef CONFIG_SMP -+ if (cpu == rq->cpu) { -+ __schedstat_inc(rq->ttwu_local); -+ } else { -+ struct sched_domain *sd; -+ -+ rcu_read_lock(); -+ for_each_domain(rq->cpu, sd) { -+ if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { -+ __schedstat_inc(sd->ttwu_wake_remote); -+ break; -+ } -+ } -+ rcu_read_unlock(); -+ } -+ -+#endif /* CONFIG_SMP */ -+ -+ __schedstat_inc(rq->ttwu_count); -+} -+ -+/* -+ * Mark the task runnable and perform wakeup-preemption. -+ */ -+static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) -+{ -+ /* -+ * Sync wakeups (i.e. those types of wakeups where the waker -+ * has indicated that it will leave the CPU in short order) -+ * don't trigger a preemption if there are no idle cpus, -+ * instead waiting for current to deschedule. 
-+ */ -+ if (wake_flags & WF_SYNC) -+ resched_suitable_idle(p); -+ else -+ try_preempt(p, rq); -+ p->state = TASK_RUNNING; -+ trace_sched_wakeup(p); -+} -+ -+static void -+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) -+{ -+ int en_flags = ENQUEUE_WAKEUP; -+ -+ lockdep_assert_held(rq->lock); -+ -+ if (p->sched_contributes_to_load) -+ rq->nr_uninterruptible--; -+ -+#ifdef CONFIG_SMP -+ if (wake_flags & WF_MIGRATED) -+ en_flags |= ENQUEUE_MIGRATED; -+#endif -+ -+ activate_task(rq, p, en_flags); -+ ttwu_do_wakeup(rq, p, wake_flags); -+} -+ -+/* -+ * Consider @p being inside a wait loop: -+ * -+ * for (;;) { -+ * set_current_state(TASK_UNINTERRUPTIBLE); -+ * -+ * if (CONDITION) -+ * break; -+ * -+ * schedule(); -+ * } -+ * __set_current_state(TASK_RUNNING); -+ * -+ * between set_current_state() and schedule(). In this case @p is still -+ * runnable, so all that needs doing is change p->state back to TASK_RUNNING in -+ * an atomic manner. -+ * -+ * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq -+ * then schedule() must still happen and p->state can be changed to -+ * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we -+ * need to do a full wakeup with enqueue. -+ * -+ * Returns: %true when the wakeup is done, -+ * %false otherwise. -+ */ -+static int ttwu_runnable(struct task_struct *p, int wake_flags) -+{ -+ struct rq *rq; -+ int ret = 0; -+ -+ rq = __task_rq_lock(p, NULL); -+ if (likely(task_on_rq_queued(p))) { -+ ttwu_do_wakeup(rq, p, wake_flags); -+ ret = 1; -+ } -+ __task_rq_unlock(rq, NULL); -+ -+ return ret; -+} -+ -+#ifdef CONFIG_SMP -+void sched_ttwu_pending(void *arg) -+{ -+ struct llist_node *llist = arg; -+ struct rq *rq = this_rq(); -+ struct task_struct *p, *t; -+ struct rq_flags rf; -+ -+ if (!llist) -+ return; -+ -+ /* -+ * rq::ttwu_pending racy indication of out-standing wakeups. 
-+ * Races such that false-negatives are possible, since they -+ * are shorter lived that false-positives would be. -+ */ -+ WRITE_ONCE(rq->ttwu_pending, 0); -+ -+ rq_lock_irqsave(rq, &rf); -+ -+ llist_for_each_entry_safe(p, t, llist, wake_entry.llist) { -+ if (WARN_ON_ONCE(p->on_cpu)) -+ smp_cond_load_acquire(&p->on_cpu, !VAL); -+ -+ if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq))) -+ set_task_cpu(p, cpu_of(rq)); -+ -+ ttwu_do_activate(rq, p, 0); -+ } -+ -+ rq_unlock_irqrestore(rq, &rf); -+} -+ -+void send_call_function_single_ipi(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ if (!set_nr_if_polling(rq->idle)) -+ arch_send_call_function_single_ipi(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); -+} -+ -+/* -+ * Queue a task on the target CPUs wake_list and wake the CPU via IPI if -+ * necessary. The wakee CPU on receipt of the IPI will queue the task -+ * via sched_ttwu_wakeup() for activation so the wakee incurs the cost -+ * of the wakeup instead of the waker. -+ */ -+static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ WRITE_ONCE(rq->ttwu_pending, 1); -+ __smp_call_single_queue(cpu, &p->wake_entry.llist); -+} -+ -+void wake_up_if_idle(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ struct rq_flags rf; -+ -+ rcu_read_lock(); -+ -+ if (!is_idle_task(rcu_dereference(rq->curr))) -+ goto out; -+ -+ if (set_nr_if_polling(rq->idle)) { -+ trace_sched_wake_idle_without_ipi(cpu); -+ } else { -+ rq_lock_irqsave(rq, &rf); -+ if (likely(is_idle_task(rq->curr))) -+ smp_sched_reschedule(cpu); -+ /* Else cpu is not in idle, do nothing here */ -+ rq_unlock_irqrestore(rq, &rf); -+ } -+ -+out: -+ rcu_read_unlock(); -+} -+ -+static inline bool ttwu_queue_cond(int cpu, int wake_flags) -+{ -+ /* -+ * If the CPU does not share cache, then queue the task on the -+ * remote rqs wakelist to avoid accessing remote data. 
-+ */ -+ if (!cpus_share_cache(smp_processor_id(), cpu)) -+ return true; -+ -+ /* -+ * If the task is descheduling and the only running task on the -+ * CPU then use the wakelist to offload the task activation to -+ * the soon-to-be-idle CPU as the current CPU is likely busy. -+ * nr_running is checked to avoid unnecessary task stacking. -+ */ -+ if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1) -+ return true; -+ -+ return false; -+} -+ -+static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) -+{ -+ if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) { -+ if (WARN_ON_ONCE(cpu == smp_processor_id())) -+ return false; -+ -+ sched_clock_cpu(cpu); /* Sync clocks across CPUs */ -+ __ttwu_queue_wakelist(p, cpu, wake_flags); -+ return true; -+ } -+ -+ return false; -+} -+ -+static int valid_task_cpu(struct task_struct *p) -+{ -+ cpumask_t valid_mask; -+ -+ if (p->flags & PF_KTHREAD) -+ cpumask_and(&valid_mask, p->cpus_ptr, cpu_all_mask); -+ else -+ cpumask_and(&valid_mask, p->cpus_ptr, cpu_active_mask); -+ -+ if (unlikely(!cpumask_weight(&valid_mask))) { -+ /* We shouldn't be hitting this any more */ -+ printk(KERN_WARNING "SCHED: No cpumask for %s/%d weight %d\n", p->comm, -+ p->pid, cpumask_weight(p->cpus_ptr)); -+ return cpumask_any(p->cpus_ptr); -+ } -+ return cpumask_any(&valid_mask); -+} -+ -+/* -+ * For a task that's just being woken up we have a valuable balancing -+ * opportunity so choose the nearest cache most lightly loaded runqueue. -+ * Entered with rq locked and returns with the chosen runqueue locked. 
-+ */ -+static inline int select_best_cpu(struct task_struct *p) -+{ -+ unsigned int idlest = ~0U; -+ struct rq *rq = NULL; -+ int i; -+ -+ if (suitable_idle_cpus(p)) { -+ int cpu = task_cpu(p); -+ -+ if (unlikely(needs_other_cpu(p, cpu))) -+ cpu = valid_task_cpu(p); -+ rq = resched_best_idle(p, cpu); -+ if (likely(rq)) -+ return rq->cpu; -+ } -+ -+ for (i = 0; i < num_online_cpus(); i++) { -+ struct rq *other_rq = task_rq(p)->cpu_order[i]; -+ int entries; -+ -+ if (!other_rq->online) -+ continue; -+ if (needs_other_cpu(p, other_rq->cpu)) -+ continue; -+ entries = rq_load(other_rq); -+ if (entries >= idlest) -+ continue; -+ idlest = entries; -+ rq = other_rq; -+ } -+ if (unlikely(!rq)) -+ return task_cpu(p); -+ return rq->cpu; -+} -+#else /* CONFIG_SMP */ -+ -+static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) -+{ -+ return false; -+} -+ -+static int valid_task_cpu(struct task_struct *p) -+{ -+ return 0; -+} -+ -+static inline int select_best_cpu(struct task_struct *p) -+{ -+ return 0; -+} -+ -+static struct rq *resched_best_idle(struct task_struct *p, int cpu) -+{ -+ return NULL; -+} -+#endif /* CONFIG_SMP */ -+ -+static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ if (ttwu_queue_wakelist(p, cpu, wake_flags)) -+ return; -+ -+ rq_lock(rq); -+ update_rq_clock(rq); -+ ttwu_do_activate(rq, p, wake_flags); -+ rq_unlock(rq); -+} -+ -+/*** -+ * try_to_wake_up - wake up a thread -+ * @p: the thread to be awakened -+ * @state: the mask of task states that can be woken -+ * @wake_flags: wake modifier flags (WF_*) -+ * -+ * Put it on the run-queue if it's not already there. The "current" -+ * thread is always on the run-queue (except when the actual -+ * re-schedule is in progress), and as such you're allowed to do -+ * the simpler "current->state = TASK_RUNNING" to mark yourself -+ * runnable without the overhead of this. 
-+ * -+ * Return: %true if @p was woken up, %false if it was already running. -+ * or @state didn't match @p's state. -+ */ -+static int -+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) -+{ -+ unsigned long flags; -+ int cpu, success = 0; -+ -+ preempt_disable(); -+ if (p == current) { -+ /* -+ * We're waking current, this means 'p->on_rq' and 'task_cpu(p) -+ * == smp_processor_id()'. Together this means we can special -+ * case the whole 'p->on_rq && ttwu_runnable()' case below -+ * without taking any locks. -+ * -+ * In particular: -+ * - we rely on Program-Order guarantees for all the ordering, -+ * - we're serialized against set_special_state() by virtue of -+ * it disabling IRQs (this allows not taking ->pi_lock). -+ */ -+ if (!(p->state & state)) -+ goto out; -+ -+ success = 1; -+ trace_sched_waking(p); -+ p->state = TASK_RUNNING; -+ trace_sched_wakeup(p); -+ goto out; -+ } -+ -+ /* -+ * If we are going to wake up a thread waiting for CONDITION we -+ * need to ensure that CONDITION=1 done by the caller can not be -+ * reordered with p->state check below. This pairs with smp_store_mb() -+ * in set_current_state() that the waiting thread does. -+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ smp_mb__after_spinlock(); -+ if (!(p->state & state)) -+ goto unlock; -+ -+ trace_sched_waking(p); -+ -+ /* We're going to change ->state: */ -+ success = 1; -+ -+ /* -+ * Ensure we load p->on_rq _after_ p->state, otherwise it would -+ * be possible to, falsely, observe p->on_rq == 0 and get stuck -+ * in smp_cond_load_acquire() below. -+ * -+ * sched_ttwu_pending() try_to_wake_up() -+ * STORE p->on_rq = 1 LOAD p->state -+ * UNLOCK rq->lock -+ * -+ * __schedule() (switch to task 'p') -+ * LOCK rq->lock smp_rmb(); -+ * smp_mb__after_spinlock(); -+ * UNLOCK rq->lock -+ * -+ * [task p] -+ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq -+ * -+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in -+ * __schedule(). 
See the comment for smp_mb__after_spinlock(). -+ */ -+ smp_rmb(); -+ if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) -+ goto unlock; -+ -+ if (p->in_iowait) { -+ delayacct_blkio_end(p); -+ atomic_dec(&task_rq(p)->nr_iowait); -+ } -+ -+#ifdef CONFIG_SMP -+ /* -+ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be -+ * possible to, falsely, observe p->on_cpu == 0. -+ * -+ * One must be running (->on_cpu == 1) in order to remove oneself -+ * from the runqueue. -+ * -+ * __schedule() (switch to task 'p') try_to_wake_up() -+ * STORE p->on_cpu = 1 LOAD p->on_rq -+ * UNLOCK rq->lock -+ * -+ * __schedule() (put 'p' to sleep) -+ * LOCK rq->lock smp_rmb(); -+ * smp_mb__after_spinlock(); -+ * STORE p->on_rq = 0 LOAD p->on_cpu -+ * -+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in -+ * __schedule(). See the comment for smp_mb__after_spinlock(). -+ * -+ * Form a control-dep-acquire with p->on_rq == 0 above, to ensure -+ * schedule()'s deactivate_task() has 'happened' and p will no longer -+ * care about it's own p->state. See the comment in __schedule(). -+ */ -+ smp_acquire__after_ctrl_dep(); -+ -+ /* -+ * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq -+ * == 0), which means we need to do an enqueue, change p->state to -+ * TASK_WAKING such that we can unlock p->pi_lock before doing the -+ * enqueue, such as ttwu_queue_wakelist(). -+ */ -+ p->state = TASK_WAKING; -+ -+ /* -+ * If the owning (remote) CPU is still in the middle of schedule() with -+ * this task as prev, considering queueing p on the remote CPUs wake_list -+ * which potentially sends an IPI instead of spinning on p->on_cpu to -+ * let the waker make forward progress. This is safe because IRQs are -+ * disabled and the IPI will deliver after on_cpu is cleared. 
-+ * -+ * Ensure we load task_cpu(p) after p->on_cpu: -+ * -+ * set_task_cpu(p, cpu); -+ * STORE p->cpu = @cpu -+ * __schedule() (switch to task 'p') -+ * LOCK rq->lock -+ * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu) -+ * STORE p->on_cpu = 1 LOAD p->cpu -+ * -+ * to ensure we observe the correct CPU on which the task is currently -+ * scheduling. -+ */ -+ if (smp_load_acquire(&p->on_cpu) && -+ ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU)) -+ goto unlock; -+ -+ /* -+ * If the owning (remote) CPU is still in the middle of schedule() with -+ * this task as prev, wait until its done referencing the task. -+ * -+ * Pairs with the smp_store_release() in finish_task(). -+ * -+ * This ensures that tasks getting woken will be fully ordered against -+ * their previous state and preserve Program Order. -+ */ -+ smp_cond_load_acquire(&p->on_cpu, !VAL); -+ -+ cpu = select_best_cpu(p); -+ if (task_cpu(p) != cpu) { -+ wake_flags |= WF_MIGRATED; -+ psi_ttwu_dequeue(p); -+ set_task_cpu(p, cpu); -+ } -+ -+#else -+ cpu = task_cpu(p); -+#endif /* CONFIG_SMP */ -+ -+ ttwu_queue(p, cpu, wake_flags); -+unlock: -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+out: -+ if (success) -+ ttwu_stat(p, task_cpu(p), wake_flags); -+ preempt_enable(); -+ -+ return success; -+} -+ -+/** -+ * try_invoke_on_locked_down_task - Invoke a function on task in fixed state -+ * @p: Process for which the function is to be invoked. -+ * @func: Function to invoke. -+ * @arg: Argument to function. -+ * -+ * If the specified task can be quickly locked into a definite state -+ * (either sleeping or on a given runqueue), arrange to keep it in that -+ * state while invoking @func(@arg). This function can use ->on_rq and -+ * task_curr() to work out what the state is, if required. Given that -+ * @func can be invoked with a runqueue lock held, it had better be quite -+ * lightweight. -+ * -+ * Returns: -+ * @false if the task slipped out from under the locks. 
-+ * @true if the task was locked onto a runqueue or is sleeping. -+ * However, @func can override this by returning @false. -+ */ -+bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg) -+{ -+ bool ret = false; -+ struct rq *rq; -+ -+ lockdep_assert_irqs_enabled(); -+ raw_spin_lock_irq(&p->pi_lock); -+ if (p->on_rq) { -+ rq = __task_rq_lock(p, NULL); -+ if (task_rq(p) == rq) -+ ret = func(p, arg); -+ rq_unlock(rq); -+ } else { -+ switch (p->state) { -+ case TASK_RUNNING: -+ case TASK_WAKING: -+ break; -+ default: -+ smp_rmb(); // See smp_rmb() comment in try_to_wake_up(). -+ if (!p->on_rq) -+ ret = func(p, arg); -+ } -+ } -+ raw_spin_unlock_irq(&p->pi_lock); -+ return ret; -+} -+ -+/** -+ * wake_up_process - Wake up a specific process -+ * @p: The process to be woken up. -+ * -+ * Attempt to wake up the nominated process and move it to the set of runnable -+ * processes. -+ * -+ * Return: 1 if the process was woken up, 0 if it was already running. -+ * -+ * This function executes a full memory barrier before accessing the task state. -+ */ -+int wake_up_process(struct task_struct *p) -+{ -+ return try_to_wake_up(p, TASK_NORMAL, 0); -+} -+EXPORT_SYMBOL(wake_up_process); -+ -+int wake_up_state(struct task_struct *p, unsigned int state) -+{ -+ return try_to_wake_up(p, state, 0); -+} -+ -+static void time_slice_expired(struct task_struct *p, struct rq *rq); -+ -+/* -+ * Perform scheduler related setup for a newly forked process p. -+ * p is forked by current. -+ */ -+int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) -+{ -+ unsigned long flags; -+ -+#ifdef CONFIG_PREEMPT_NOTIFIERS -+ INIT_HLIST_HEAD(&p->preempt_notifiers); -+#endif -+ -+#ifdef CONFIG_COMPACTION -+ p->capture_control = NULL; -+#endif -+ -+#ifdef CONFIG_SMP -+ p->wake_entry.u_flags = CSD_TYPE_TTWU; -+#endif -+ /* -+ * We mark the process as NEW here. 
This guarantees that -+ * nobody will actually run it, and a signal or other external -+ * event cannot wake it up and insert it on the runqueue either. -+ */ -+ p->state = TASK_NEW; -+ -+ /* -+ * The process state is set to the same value of the process executing -+ * do_fork() code. That is running. This guarantees that nobody will -+ * actually run it, and a signal or other external event cannot wake -+ * it up and insert it on the runqueue either. -+ */ -+ -+ /* Should be reset in fork.c but done here for ease of MuQSS patching */ -+ p->on_cpu = -+ p->on_rq = -+ p->utime = -+ p->stime = -+ p->sched_time = -+ p->stime_ns = -+ p->utime_ns = 0; -+ skiplist_node_init(&p->node); -+ -+ /* -+ * Revert to default priority/policy on fork if requested. -+ */ -+ if (unlikely(p->sched_reset_on_fork)) { -+ if (p->policy == SCHED_FIFO || p->policy == SCHED_RR || p-> policy == SCHED_ISO) { -+ p->policy = SCHED_NORMAL; -+ p->normal_prio = normal_prio(p); -+ } -+ -+ if (PRIO_TO_NICE(p->static_prio) < 0) { -+ p->static_prio = NICE_TO_PRIO(0); -+ p->normal_prio = p->static_prio; -+ } -+ -+ /* -+ * We don't need the reset flag anymore after the fork. It has -+ * fulfilled its duty: -+ */ -+ p->sched_reset_on_fork = 0; -+ } -+ -+ /* -+ * Silence PROVE_RCU. 
-+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ rseq_migrate(p); -+ set_task_cpu(p, smp_processor_id()); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+#ifdef CONFIG_SCHED_INFO -+ if (unlikely(sched_info_on())) -+ memset(&p->sched_info, 0, sizeof(p->sched_info)); -+#endif -+ init_task_preempt_count(p); -+ -+ return 0; -+} -+ -+void sched_post_fork(struct task_struct *p) -+{ -+} -+ -+#ifdef CONFIG_SCHEDSTATS -+ -+DEFINE_STATIC_KEY_FALSE(sched_schedstats); -+static bool __initdata __sched_schedstats = false; -+ -+static void set_schedstats(bool enabled) -+{ -+ if (enabled) -+ static_branch_enable(&sched_schedstats); -+ else -+ static_branch_disable(&sched_schedstats); -+} -+ -+void force_schedstat_enabled(void) -+{ -+ if (!schedstat_enabled()) { -+ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); -+ static_branch_enable(&sched_schedstats); -+ } -+} -+ -+static int __init setup_schedstats(char *str) -+{ -+ int ret = 0; -+ if (!str) -+ goto out; -+ -+ /* -+ * This code is called before jump labels have been set up, so we can't -+ * change the static branch directly just yet. Instead set a temporary -+ * variable so init_schedstats() can do it later. 
-+ */ -+ if (!strcmp(str, "enable")) { -+ __sched_schedstats = true; -+ ret = 1; -+ } else if (!strcmp(str, "disable")) { -+ __sched_schedstats = false; -+ ret = 1; -+ } -+out: -+ if (!ret) -+ pr_warn("Unable to parse schedstats=\n"); -+ -+ return ret; -+} -+__setup("schedstats=", setup_schedstats); -+ -+static void __init init_schedstats(void) -+{ -+ set_schedstats(__sched_schedstats); -+} -+ -+#ifdef CONFIG_PROC_SYSCTL -+int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, -+ size_t *lenp, loff_t *ppos) -+{ -+ struct ctl_table t; -+ int err; -+ int state = static_branch_likely(&sched_schedstats); -+ -+ if (write && !capable(CAP_SYS_ADMIN)) -+ return -EPERM; -+ -+ t = *table; -+ t.data = &state; -+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); -+ if (err < 0) -+ return err; -+ if (write) -+ set_schedstats(state); -+ return err; -+} -+#endif /* CONFIG_PROC_SYSCTL */ -+#else /* !CONFIG_SCHEDSTATS */ -+static inline void init_schedstats(void) {} -+#endif /* CONFIG_SCHEDSTATS */ -+ -+static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p); -+ -+static void account_task_cpu(struct rq *rq, struct task_struct *p) -+{ -+ update_clocks(rq); -+ /* This isn't really a context switch but accounting is the same */ -+ update_cpu_clock_switch(rq, p); -+ p->last_ran = rq->niffies; -+} -+ -+bool sched_smp_initialized __read_mostly; -+ -+static inline int hrexpiry_enabled(struct rq *rq) -+{ -+ if (unlikely(!cpu_active(cpu_of(rq)) || !sched_smp_initialized)) -+ return 0; -+ return hrtimer_is_hres_active(&rq->hrexpiry_timer); -+} -+ -+/* -+ * Use HR-timers to deliver accurate preemption points. -+ */ -+static inline void hrexpiry_clear(struct rq *rq) -+{ -+ if (!hrexpiry_enabled(rq)) -+ return; -+ if (hrtimer_active(&rq->hrexpiry_timer)) -+ hrtimer_cancel(&rq->hrexpiry_timer); -+} -+ -+/* -+ * High-resolution time_slice expiry. -+ * Runs from hardirq context with interrupts disabled. 
-+ */ -+static enum hrtimer_restart hrexpiry(struct hrtimer *timer) -+{ -+ struct rq *rq = container_of(timer, struct rq, hrexpiry_timer); -+ struct task_struct *p; -+ -+ /* This can happen during CPU hotplug / resume */ -+ if (unlikely(cpu_of(rq) != smp_processor_id())) -+ goto out; -+ -+ /* -+ * We're doing this without the runqueue lock but this should always -+ * be run on the local CPU. Time slice should run out in __schedule -+ * but we set it to zero here in case niffies is slightly less. -+ */ -+ p = rq->curr; -+ p->time_slice = 0; -+ __set_tsk_resched(p); -+out: -+ return HRTIMER_NORESTART; -+} -+ -+/* -+ * Called to set the hrexpiry timer state. -+ * -+ * called with irqs disabled from the local CPU only -+ */ -+static void hrexpiry_start(struct rq *rq, u64 delay) -+{ -+ if (!hrexpiry_enabled(rq)) -+ return; -+ -+ hrtimer_start(&rq->hrexpiry_timer, ns_to_ktime(delay), -+ HRTIMER_MODE_REL_PINNED); -+} -+ -+static void init_rq_hrexpiry(struct rq *rq) -+{ -+ hrtimer_init(&rq->hrexpiry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); -+ rq->hrexpiry_timer.function = hrexpiry; -+} -+ -+static inline int rq_dither(struct rq *rq) -+{ -+ if (!hrexpiry_enabled(rq)) -+ return HALF_JIFFY_US; -+ return 0; -+} -+ -+/* -+ * wake_up_new_task - wake up a newly created task for the first time. -+ * -+ * This function will do some initial scheduler statistics housekeeping -+ * that must be done for every newly created context, then puts the task -+ * on the runqueue and wakes it. 
-+ */ -+void wake_up_new_task(struct task_struct *p) -+{ -+ struct task_struct *parent, *rq_curr; -+ struct rq *rq, *new_rq; -+ unsigned long flags; -+ -+ parent = p->parent; -+ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ p->state = TASK_RUNNING; -+ /* Task_rq can't change yet on a new task */ -+ new_rq = rq = task_rq(p); -+ if (unlikely(needs_other_cpu(p, task_cpu(p)))) { -+ set_task_cpu(p, valid_task_cpu(p)); -+ new_rq = task_rq(p); -+ } -+ -+ double_rq_lock(rq, new_rq); -+ rq_curr = rq->curr; -+ -+ /* -+ * Make sure we do not leak PI boosting priority to the child. -+ */ -+ p->prio = rq_curr->normal_prio; -+ -+ trace_sched_wakeup_new(p); -+ -+ /* -+ * Share the timeslice between parent and child, thus the -+ * total amount of pending timeslices in the system doesn't change, -+ * resulting in more scheduling fairness. If it's negative, it won't -+ * matter since that's the same as being 0. rq->rq_deadline is only -+ * modified within schedule() so it is always equal to -+ * current->deadline. -+ */ -+ account_task_cpu(rq, rq_curr); -+ p->last_ran = rq_curr->last_ran; -+ if (likely(rq_curr->policy != SCHED_FIFO)) { -+ rq_curr->time_slice /= 2; -+ if (rq_curr->time_slice < RESCHED_US) { -+ /* -+ * Forking task has run out of timeslice. Reschedule it and -+ * start its child with a new time slice and deadline. The -+ * child will end up running first because its deadline will -+ * be slightly earlier. -+ */ -+ __set_tsk_resched(rq_curr); -+ time_slice_expired(p, new_rq); -+ if (suitable_idle_cpus(p)) -+ resched_best_idle(p, task_cpu(p)); -+ else if (unlikely(rq != new_rq)) -+ try_preempt(p, new_rq); -+ } else { -+ p->time_slice = rq_curr->time_slice; -+ if (rq_curr == parent && rq == new_rq && !suitable_idle_cpus(p)) { -+ /* -+ * The VM isn't cloned, so we're in a good position to -+ * do child-runs-first in anticipation of an exec. This -+ * usually avoids a lot of COW overhead. 
-+ */ -+ __set_tsk_resched(rq_curr); -+ } else { -+ /* -+ * Adjust the hrexpiry since rq_curr will keep -+ * running and its timeslice has been shortened. -+ */ -+ hrexpiry_start(rq, US_TO_NS(rq_curr->time_slice)); -+ try_preempt(p, new_rq); -+ } -+ } -+ } else { -+ time_slice_expired(p, new_rq); -+ try_preempt(p, new_rq); -+ } -+ activate_task(new_rq, p, 0); -+ double_rq_unlock(rq, new_rq); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+} -+ -+#ifdef CONFIG_PREEMPT_NOTIFIERS -+ -+static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); -+ -+void preempt_notifier_inc(void) -+{ -+ static_branch_inc(&preempt_notifier_key); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_inc); -+ -+void preempt_notifier_dec(void) -+{ -+ static_branch_dec(&preempt_notifier_key); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_dec); -+ -+/** -+ * preempt_notifier_register - tell me when current is being preempted & rescheduled -+ * @notifier: notifier struct to register -+ */ -+void preempt_notifier_register(struct preempt_notifier *notifier) -+{ -+ if (!static_branch_unlikely(&preempt_notifier_key)) -+ WARN(1, "registering preempt_notifier while notifiers disabled\n"); -+ -+ hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_register); -+ -+/** -+ * preempt_notifier_unregister - no longer interested in preemption notifications -+ * @notifier: notifier struct to unregister -+ * -+ * This is *not* safe to call from within a preemption notifier. 
-+ */ -+void preempt_notifier_unregister(struct preempt_notifier *notifier) -+{ -+ hlist_del(¬ifier->link); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_unregister); -+ -+static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+ struct preempt_notifier *notifier; -+ -+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) -+ notifier->ops->sched_in(notifier, raw_smp_processor_id()); -+} -+ -+static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+ if (static_branch_unlikely(&preempt_notifier_key)) -+ __fire_sched_in_preempt_notifiers(curr); -+} -+ -+static void -+__fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+ struct preempt_notifier *notifier; -+ -+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) -+ notifier->ops->sched_out(notifier, next); -+} -+ -+static __always_inline void -+fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+ if (static_branch_unlikely(&preempt_notifier_key)) -+ __fire_sched_out_preempt_notifiers(curr, next); -+} -+ -+#else /* !CONFIG_PREEMPT_NOTIFIERS */ -+ -+static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+} -+ -+static inline void -+fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+} -+ -+#endif /* CONFIG_PREEMPT_NOTIFIERS */ -+ -+static inline void prepare_task(struct task_struct *next) -+{ -+ /* -+ * Claim the task as running, we do this before switching to it -+ * such that any running task will have this set. -+ */ -+ next->on_cpu = 1; -+} -+ -+static inline void finish_task(struct task_struct *prev) -+{ -+#ifdef CONFIG_SMP -+ /* -+ * This must be the very last reference to @prev from this CPU. After -+ * p->on_cpu is cleared, the task can be moved to a different CPU. We -+ * must ensure this doesn't happen until the switch is completely -+ * finished. 
-+ * -+ * In particular, the load of prev->state in finish_task_switch() must -+ * happen before this. -+ * -+ * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). -+ */ -+ smp_store_release(&prev->on_cpu, 0); -+#endif -+} -+ -+static inline void -+prepare_lock_switch(struct rq *rq, struct task_struct *next) -+{ -+ /* -+ * Since the runqueue lock will be released by the next -+ * task (which is an invalid locking op but in the case -+ * of the scheduler it's an obvious special-case), so we -+ * do an early lockdep release here: -+ */ -+ spin_release(&rq->lock->dep_map, _THIS_IP_); -+#ifdef CONFIG_DEBUG_SPINLOCK -+ /* this is a valid case when another task releases the spinlock */ -+ rq->lock->owner = next; -+#endif -+} -+ -+static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) -+{ -+ /* -+ * If we are tracking spinlock dependencies then we have to -+ * fix up the runqueue lock - which gets 'carried over' from -+ * prev into current: -+ */ -+ spin_acquire(&rq->lock->dep_map, 0, 0, _THIS_IP_); -+ -+#ifdef CONFIG_SMP -+ /* -+ * If prev was marked as migrating to another CPU in return_task, drop -+ * the local runqueue lock but leave interrupts disabled and grab the -+ * remote lock we're migrating it to before enabling them. -+ */ -+ if (unlikely(task_on_rq_migrating(prev))) { -+ sched_info_dequeued(rq, prev); -+ /* -+ * We move the ownership of prev to the new cpu now. ttwu can't -+ * activate prev to the wrong cpu since it has to grab this -+ * runqueue in ttwu_remote. 
-+ */ -+#ifdef CONFIG_THREAD_INFO_IN_TASK -+ prev->cpu = prev->wake_cpu; -+#else -+ task_thread_info(prev)->cpu = prev->wake_cpu; -+#endif -+ raw_spin_unlock(rq->lock); -+ -+ raw_spin_lock(&prev->pi_lock); -+ rq = __task_rq_lock(prev, NULL); -+ /* Check that someone else hasn't already queued prev */ -+ if (likely(!task_queued(prev))) { -+ enqueue_task(rq, prev, 0); -+ prev->on_rq = TASK_ON_RQ_QUEUED; -+ /* Wake up the CPU if it's not already running */ -+ resched_if_idle(rq); -+ } -+ raw_spin_unlock(&prev->pi_lock); -+ } -+#endif -+ rq_unlock(rq); -+ local_irq_enable(); -+} -+ -+#ifndef prepare_arch_switch -+# define prepare_arch_switch(next) do { } while (0) -+#endif -+#ifndef finish_arch_switch -+# define finish_arch_switch(prev) do { } while (0) -+#endif -+#ifndef finish_arch_post_lock_switch -+# define finish_arch_post_lock_switch() do { } while (0) -+#endif -+ -+/** -+ * prepare_task_switch - prepare to switch tasks -+ * @rq: the runqueue preparing to switch -+ * @next: the task we are going to switch to. -+ * -+ * This is called with the rq lock held and interrupts off. It must -+ * be paired with a subsequent finish_task_switch after the context -+ * switch. -+ * -+ * prepare_task_switch sets up locking and calls architecture specific -+ * hooks. -+ */ -+static inline void -+prepare_task_switch(struct rq *rq, struct task_struct *prev, -+ struct task_struct *next) -+{ -+ kcov_prepare_switch(prev); -+ sched_info_switch(rq, prev, next); -+ perf_event_task_sched_out(prev, next); -+ rseq_preempt(prev); -+ fire_sched_out_preempt_notifiers(prev, next); -+ prepare_task(next); -+ prepare_arch_switch(next); -+} -+ -+/** -+ * finish_task_switch - clean up after a task-switch -+ * @rq: runqueue associated with task-switch -+ * @prev: the thread we just switched away from. -+ * -+ * finish_task_switch must be called after the context switch, paired -+ * with a prepare_task_switch call before the context switch. 
-+ * finish_task_switch will reconcile locking set up by prepare_task_switch, -+ * and do any other architecture-specific cleanup actions. -+ * -+ * Note that we may have delayed dropping an mm in context_switch(). If -+ * so, we finish that here outside of the runqueue lock. (Doing it -+ * with the lock held can cause deadlocks; see schedule() for -+ * details.) -+ * -+ * The context switch have flipped the stack from under us and restored the -+ * local variables which were saved when this task called schedule() in the -+ * past. prev == current is still correct but we need to recalculate this_rq -+ * because prev may have moved to another CPU. -+ */ -+static void finish_task_switch(struct task_struct *prev) -+ __releases(rq->lock) -+{ -+ struct rq *rq = this_rq(); -+ struct mm_struct *mm = rq->prev_mm; -+ long prev_state; -+ -+ /* -+ * The previous task will have left us with a preempt_count of 2 -+ * because it left us after: -+ * -+ * schedule() -+ * preempt_disable(); // 1 -+ * __schedule() -+ * raw_spin_lock_irq(rq->lock) // 2 -+ * -+ * Also, see FORK_PREEMPT_COUNT. -+ */ -+ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, -+ "corrupted preempt_count: %s/%d/0x%x\n", -+ current->comm, current->pid, preempt_count())) -+ preempt_count_set(FORK_PREEMPT_COUNT); -+ -+ rq->prev_mm = NULL; -+ -+ /* -+ * A task struct has one reference for the use as "current". -+ * If a task dies, then it sets TASK_DEAD in tsk->state and calls -+ * schedule one last time. The schedule call will never return, and -+ * the scheduled task must drop that reference. -+ * -+ * We must observe prev->state before clearing prev->on_cpu (in -+ * finish_task), otherwise a concurrent wakeup can get prev -+ * running on another CPU and we could rave with its RUNNING -> DEAD -+ * transition, resulting in a double drop. 
-+ */ -+ prev_state = prev->state; -+ vtime_task_switch(prev); -+ perf_event_task_sched_in(prev, current); -+ finish_task(prev); -+ finish_lock_switch(rq, prev); -+ finish_arch_post_lock_switch(); -+ kcov_finish_switch(current); -+ -+ fire_sched_in_preempt_notifiers(current); -+ /* -+ * When switching through a kernel thread, the loop in -+ * membarrier_{private,global}_expedited() may have observed that -+ * kernel thread and not issued an IPI. It is therefore possible to -+ * schedule between user->kernel->user threads without passing though -+ * switch_mm(). Membarrier requires a barrier after storing to -+ * rq->curr, before returning to userspace, so provide them here: -+ * -+ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly -+ * provided by mmdrop(), -+ * - a sync_core for SYNC_CORE. -+ */ -+ if (mm) { -+ membarrier_mm_sync_core_before_usermode(mm); -+ mmdrop(mm); -+ } -+ if (unlikely(prev_state == TASK_DEAD)) { -+ /* -+ * Remove function-return probe instances associated with this -+ * task and put them back on the free list. -+ */ -+ kprobe_flush_task(prev); -+ -+ /* Task is done with its stack. */ -+ put_task_stack(prev); -+ -+ put_task_struct_rcu_user(prev); -+ } -+} -+ -+/** -+ * schedule_tail - first thing a freshly forked thread must call. -+ * @prev: the thread we just switched away from. -+ */ -+asmlinkage __visible void schedule_tail(struct task_struct *prev) -+{ -+ /* -+ * New tasks start with FORK_PREEMPT_COUNT, see there and -+ * finish_task_switch() for details. -+ * -+ * finish_task_switch() will drop rq->lock() and lower preempt_count -+ * and the preempt_enable() will end up enabling preemption (on -+ * PREEMPT_COUNT kernels). -+ */ -+ -+ finish_task_switch(prev); -+ preempt_enable(); -+ -+ if (current->set_child_tid) -+ put_user(task_pid_vnr(current), current->set_child_tid); -+ -+ calculate_sigpending(); -+} -+ -+/* -+ * context_switch - switch to the new MM and the new thread's register state. 
-+ */ -+static __always_inline void -+context_switch(struct rq *rq, struct task_struct *prev, -+ struct task_struct *next) -+{ -+ prepare_task_switch(rq, prev, next); -+ -+ /* -+ * For paravirt, this is coupled with an exit in switch_to to -+ * combine the page table reload and the switch backend into -+ * one hypercall. -+ */ -+ arch_start_context_switch(prev); -+ -+ /* -+ * kernel -> kernel lazy + transfer active -+ * user -> kernel lazy + mmgrab() active -+ * -+ * kernel -> user switch + mmdrop() active -+ * user -> user switch -+ */ -+ if (!next->mm) { // to kernel -+ enter_lazy_tlb(prev->active_mm, next); -+ -+ next->active_mm = prev->active_mm; -+ if (prev->mm) // from user -+ mmgrab(prev->active_mm); -+ else -+ prev->active_mm = NULL; -+ } else { // to user -+ membarrier_switch_mm(rq, prev->active_mm, next->mm); -+ /* -+ * sys_membarrier() requires an smp_mb() between setting -+ * rq->curr / membarrier_switch_mm() and returning to userspace. -+ * -+ * The below provides this either through switch_mm(), or in -+ * case 'prev->active_mm == next->mm' through -+ * finish_task_switch()'s mmdrop(). -+ */ -+ switch_mm_irqs_off(prev->active_mm, next->mm, next); -+ -+ if (!prev->mm) { // from kernel -+ /* will mmdrop() in finish_task_switch(). */ -+ rq->prev_mm = prev->active_mm; -+ prev->active_mm = NULL; -+ } -+ } -+ prepare_lock_switch(rq, next); -+ -+ /* Here we just switch the register state and the stack. */ -+ switch_to(prev, next, prev); -+ barrier(); -+ -+ finish_task_switch(prev); -+} -+ -+/* -+ * nr_running, nr_uninterruptible and nr_context_switches: -+ * -+ * externally visible scheduler statistics: current number of runnable -+ * threads, total number of context switches performed since bootup. 
-+ */ -+unsigned long nr_running(void) -+{ -+ unsigned long i, sum = 0; -+ -+ for_each_online_cpu(i) -+ sum += cpu_rq(i)->nr_running; -+ -+ return sum; -+} -+ -+static unsigned long nr_uninterruptible(void) -+{ -+ unsigned long i, sum = 0; -+ -+ for_each_online_cpu(i) -+ sum += cpu_rq(i)->nr_uninterruptible; -+ -+ return sum; -+} -+ -+/* -+ * Check if only the current task is running on the CPU. -+ * -+ * Caution: this function does not check that the caller has disabled -+ * preemption, thus the result might have a time-of-check-to-time-of-use -+ * race. The caller is responsible to use it correctly, for example: -+ * -+ * - from a non-preemptible section (of course) -+ * -+ * - from a thread that is bound to a single CPU -+ * -+ * - in a loop with very short iterations (e.g. a polling loop) -+ */ -+bool single_task_running(void) -+{ -+ if (rq_load(raw_rq()) == 1) -+ return true; -+ else -+ return false; -+} -+EXPORT_SYMBOL(single_task_running); -+ -+unsigned long long nr_context_switches(void) -+{ -+ int cpu; -+ unsigned long long sum = 0; -+ -+ for_each_possible_cpu(cpu) -+ sum += cpu_rq(cpu)->nr_switches; -+ -+ return sum; -+} -+ -+/* -+ * Consumers of these two interfaces, like for example the cpufreq menu -+ * governor are using nonsensical data. Boosting frequency for a CPU that has -+ * IO-wait which might not even end up running the task when it does become -+ * runnable. -+ */ -+ -+unsigned long nr_iowait_cpu(int cpu) -+{ -+ return atomic_read(&cpu_rq(cpu)->nr_iowait); -+} -+ -+/* -+ * IO-wait accounting, and how its mostly bollocks (on SMP). -+ * -+ * The idea behind IO-wait account is to account the idle time that we could -+ * have spend running if it were not for IO. That is, if we were to improve the -+ * storage performance, we'd have a proportional reduction in IO-wait time. 
-+ * -+ * This all works nicely on UP, where, when a task blocks on IO, we account -+ * idle time as IO-wait, because if the storage were faster, it could've been -+ * running and we'd not be idle. -+ * -+ * This has been extended to SMP, by doing the same for each CPU. This however -+ * is broken. -+ * -+ * Imagine for instance the case where two tasks block on one CPU, only the one -+ * CPU will have IO-wait accounted, while the other has regular idle. Even -+ * though, if the storage were faster, both could've ran at the same time, -+ * utilising both CPUs. -+ * -+ * This means, that when looking globally, the current IO-wait accounting on -+ * SMP is a lower bound, by reason of under accounting. -+ * -+ * Worse, since the numbers are provided per CPU, they are sometimes -+ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly -+ * associated with any one particular CPU, it can wake to another CPU than it -+ * blocked on. This means the per CPU IO-wait number is meaningless. -+ * -+ * Task CPU affinities can make all that even more 'interesting'. -+ */ -+ -+unsigned long nr_iowait(void) -+{ -+ unsigned long cpu, sum = 0; -+ -+ for_each_possible_cpu(cpu) -+ sum += nr_iowait_cpu(cpu); -+ -+ return sum; -+} -+ -+unsigned long nr_active(void) -+{ -+ return nr_running() + nr_uninterruptible(); -+} -+ -+/* Variables and functions for calc_load */ -+static unsigned long calc_load_update; -+unsigned long avenrun[3]; -+EXPORT_SYMBOL(avenrun); -+ -+/** -+ * get_avenrun - get the load average array -+ * @loads: pointer to dest load array -+ * @offset: offset to add -+ * @shift: shift count to shift the result left -+ * -+ * These values are estimates at best, so no need for locking. 
-+ */ -+void get_avenrun(unsigned long *loads, unsigned long offset, int shift) -+{ -+ loads[0] = (avenrun[0] + offset) << shift; -+ loads[1] = (avenrun[1] + offset) << shift; -+ loads[2] = (avenrun[2] + offset) << shift; -+} -+ -+/* -+ * calc_load - update the avenrun load estimates every LOAD_FREQ seconds. -+ */ -+void calc_global_load(void) -+{ -+ long active; -+ -+ if (time_before(jiffies, READ_ONCE(calc_load_update))) -+ return; -+ active = nr_active() * FIXED_1; -+ -+ avenrun[0] = calc_load(avenrun[0], EXP_1, active); -+ avenrun[1] = calc_load(avenrun[1], EXP_5, active); -+ avenrun[2] = calc_load(avenrun[2], EXP_15, active); -+ -+ calc_load_update = jiffies + LOAD_FREQ; -+} -+ -+/** -+ * fixed_power_int - compute: x^n, in O(log n) time -+ * -+ * @x: base of the power -+ * @frac_bits: fractional bits of @x -+ * @n: power to raise @x to. -+ * -+ * By exploiting the relation between the definition of the natural power -+ * function: x^n := x*x*...*x (x multiplied by itself for n times), and -+ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, -+ * (where: n_i \elem {0, 1}, the binary vector representing n), -+ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is -+ * of course trivially computable in O(log_2 n), the length of our binary -+ * vector. 
-+ */ -+static unsigned long -+fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) -+{ -+ unsigned long result = 1UL << frac_bits; -+ -+ if (n) { -+ for (;;) { -+ if (n & 1) { -+ result *= x; -+ result += 1UL << (frac_bits - 1); -+ result >>= frac_bits; -+ } -+ n >>= 1; -+ if (!n) -+ break; -+ x *= x; -+ x += 1UL << (frac_bits - 1); -+ x >>= frac_bits; -+ } -+ } -+ -+ return result; -+} -+ -+/* -+ * a1 = a0 * e + a * (1 - e) -+ * -+ * a2 = a1 * e + a * (1 - e) -+ * = (a0 * e + a * (1 - e)) * e + a * (1 - e) -+ * = a0 * e^2 + a * (1 - e) * (1 + e) -+ * -+ * a3 = a2 * e + a * (1 - e) -+ * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) -+ * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) -+ * -+ * ... -+ * -+ * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] -+ * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) -+ * = a0 * e^n + a * (1 - e^n) -+ * -+ * [1] application of the geometric series: -+ * -+ * n 1 - x^(n+1) -+ * S_n := \Sum x^i = ------------- -+ * i=0 1 - x -+ */ -+unsigned long -+calc_load_n(unsigned long load, unsigned long exp, -+ unsigned long active, unsigned int n) -+{ -+ return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); -+} -+ -+DEFINE_PER_CPU(struct kernel_stat, kstat); -+DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); -+ -+EXPORT_PER_CPU_SYMBOL(kstat); -+EXPORT_PER_CPU_SYMBOL(kernel_cpustat); -+ -+#ifdef CONFIG_PARAVIRT -+static inline u64 steal_ticks(u64 steal) -+{ -+ if (unlikely(steal > NSEC_PER_SEC)) -+ return div_u64(steal, TICK_NSEC); -+ -+ return __iter_div_u64_rem(steal, TICK_NSEC, &steal); -+} -+#endif -+ -+#ifndef nsecs_to_cputime -+# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) -+#endif -+ -+/* -+ * On each tick, add the number of nanoseconds to the unbanked variables and -+ * once one tick's worth has accumulated, account it allowing for accurate -+ * sub-tick accounting and totals. Use the TICK_APPROX_NS to match the way we -+ * deduct nanoseconds. 
-+ */ -+static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long ns) -+{ -+ u64 *cpustat = kcpustat_this_cpu->cpustat; -+ unsigned long ticks; -+ -+ if (atomic_read(&rq->nr_iowait) > 0) { -+ rq->iowait_ns += ns; -+ if (rq->iowait_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(rq->iowait_ns); -+ cpustat[CPUTIME_IOWAIT] += (__force u64)TICK_APPROX_NS * ticks; -+ rq->iowait_ns %= JIFFY_NS; -+ } -+ } else { -+ rq->idle_ns += ns; -+ if (rq->idle_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(rq->idle_ns); -+ cpustat[CPUTIME_IDLE] += (__force u64)TICK_APPROX_NS * ticks; -+ rq->idle_ns %= JIFFY_NS; -+ } -+ } -+ acct_update_integrals(idle); -+} -+ -+static void pc_system_time(struct rq *rq, struct task_struct *p, -+ int hardirq_offset, unsigned long ns) -+{ -+ u64 *cpustat = kcpustat_this_cpu->cpustat; -+ unsigned long ticks; -+ -+ p->stime_ns += ns; -+ if (p->stime_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(p->stime_ns); -+ p->stime_ns %= JIFFY_NS; -+ p->stime += (__force u64)TICK_APPROX_NS * ticks; -+ account_group_system_time(p, TICK_APPROX_NS * ticks); -+ } -+ p->sched_time += ns; -+ account_group_exec_runtime(p, ns); -+ -+ if (hardirq_count() - hardirq_offset) { -+ rq->irq_ns += ns; -+ if (rq->irq_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(rq->irq_ns); -+ cpustat[CPUTIME_IRQ] += (__force u64)TICK_APPROX_NS * ticks; -+ rq->irq_ns %= JIFFY_NS; -+ } -+ } else if (in_serving_softirq()) { -+ rq->softirq_ns += ns; -+ if (rq->softirq_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(rq->softirq_ns); -+ cpustat[CPUTIME_SOFTIRQ] += (__force u64)TICK_APPROX_NS * ticks; -+ rq->softirq_ns %= JIFFY_NS; -+ } -+ } else { -+ rq->system_ns += ns; -+ if (rq->system_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(rq->system_ns); -+ cpustat[CPUTIME_SYSTEM] += (__force u64)TICK_APPROX_NS * ticks; -+ rq->system_ns %= JIFFY_NS; -+ } -+ } -+ acct_update_integrals(p); -+} -+ -+static void pc_user_time(struct rq *rq, struct task_struct *p, unsigned long ns) -+{ -+ u64 *cpustat = 
kcpustat_this_cpu->cpustat; -+ unsigned long ticks; -+ -+ p->utime_ns += ns; -+ if (p->utime_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(p->utime_ns); -+ p->utime_ns %= JIFFY_NS; -+ p->utime += (__force u64)TICK_APPROX_NS * ticks; -+ account_group_user_time(p, TICK_APPROX_NS * ticks); -+ } -+ p->sched_time += ns; -+ account_group_exec_runtime(p, ns); -+ -+ if (this_cpu_ksoftirqd() == p) { -+ /* -+ * ksoftirqd time do not get accounted in cpu_softirq_time. -+ * So, we have to handle it separately here. -+ */ -+ rq->softirq_ns += ns; -+ if (rq->softirq_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(rq->softirq_ns); -+ cpustat[CPUTIME_SOFTIRQ] += (__force u64)TICK_APPROX_NS * ticks; -+ rq->softirq_ns %= JIFFY_NS; -+ } -+ } -+ -+ if (task_nice(p) > 0 || idleprio_task(p)) { -+ rq->nice_ns += ns; -+ if (rq->nice_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(rq->nice_ns); -+ cpustat[CPUTIME_NICE] += (__force u64)TICK_APPROX_NS * ticks; -+ rq->nice_ns %= JIFFY_NS; -+ } -+ } else { -+ rq->user_ns += ns; -+ if (rq->user_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(rq->user_ns); -+ cpustat[CPUTIME_USER] += (__force u64)TICK_APPROX_NS * ticks; -+ rq->user_ns %= JIFFY_NS; -+ } -+ } -+ acct_update_integrals(p); -+} -+ -+/* -+ * This is called on clock ticks. -+ * Bank in p->sched_time the ns elapsed since the last tick or switch. -+ * CPU scheduler quota accounting is also performed here in microseconds. 
-+ */ -+static void update_cpu_clock_tick(struct rq *rq, struct task_struct *p) -+{ -+ s64 account_ns = rq->niffies - p->last_ran; -+ struct task_struct *idle = rq->idle; -+ -+ /* Accurate tick timekeeping */ -+ if (user_mode(get_irq_regs())) -+ pc_user_time(rq, p, account_ns); -+ else if (p != idle || (irq_count() != HARDIRQ_OFFSET)) { -+ pc_system_time(rq, p, HARDIRQ_OFFSET, account_ns); -+ } else -+ pc_idle_time(rq, idle, account_ns); -+ -+ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ -+ if (p->policy != SCHED_FIFO && p != idle) -+ p->time_slice -= NS_TO_US(account_ns); -+ -+ p->last_ran = rq->niffies; -+} -+ -+/* -+ * This is called on context switches. -+ * Bank in p->sched_time the ns elapsed since the last tick or switch. -+ * CPU scheduler quota accounting is also performed here in microseconds. -+ */ -+static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p) -+{ -+ s64 account_ns = rq->niffies - p->last_ran; -+ struct task_struct *idle = rq->idle; -+ -+ /* Accurate subtick timekeeping */ -+ if (p != idle) -+ pc_user_time(rq, p, account_ns); -+ else -+ pc_idle_time(rq, idle, account_ns); -+ -+ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ -+ if (p->policy != SCHED_FIFO && p != idle) -+ p->time_slice -= NS_TO_US(account_ns); -+} -+ -+/* -+ * Return any ns on the sched_clock that have not yet been accounted in -+ * @p in case that task is currently running. -+ * -+ * Called with task_rq_lock(p) held. -+ */ -+static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) -+{ -+ u64 ns = 0; -+ -+ /* -+ * Must be ->curr _and_ ->on_rq. If dequeued, we would -+ * project cycles that may never be accounted to this -+ * thread, breaking clock_gettime(). -+ */ -+ if (p == rq->curr && task_on_rq_queued(p)) { -+ update_clocks(rq); -+ ns = rq->niffies - p->last_ran; -+ } -+ -+ return ns; -+} -+ -+/* -+ * Return accounted runtime for the task. 
-+ * Return separately the current's pending runtime that have not been -+ * accounted yet. -+ */ -+unsigned long long task_sched_runtime(struct task_struct *p) -+{ -+ struct rq_flags rf; -+ struct rq *rq; -+ u64 ns; -+ -+#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) -+ /* -+ * 64-bit doesn't need locks to atomically read a 64-bit value. -+ * So we have a optimisation chance when the task's delta_exec is 0. -+ * Reading ->on_cpu is racy, but this is ok. -+ * -+ * If we race with it leaving CPU, we'll take a lock. So we're correct. -+ * If we race with it entering CPU, unaccounted time is 0. This is -+ * indistinguishable from the read occurring a few cycles earlier. -+ * If we see ->on_cpu without ->on_rq, the task is leaving, and has -+ * been accounted, so we're correct here as well. -+ */ -+ if (!p->on_cpu || !task_on_rq_queued(p)) -+ return tsk_seruntime(p); -+#endif -+ -+ rq = task_rq_lock(p, &rf); -+ ns = p->sched_time + do_task_delta_exec(p, rq); -+ task_rq_unlock(rq, p, &rf); -+ -+ return ns; -+} -+ -+/* -+ * Functions to test for when SCHED_ISO tasks have used their allocated -+ * quota as real time scheduling and convert them back to SCHED_NORMAL. All -+ * data is modified only by the local runqueue during scheduler_tick with -+ * interrupts disabled. -+ */ -+ -+/* -+ * Test if SCHED_ISO tasks have run longer than their alloted period as RT -+ * tasks and set the refractory flag if necessary. There is 10% hysteresis -+ * for unsetting the flag. 115/128 is ~90/100 as a fast shift instead of a -+ * slow division. 
-+ */ -+static inline void iso_tick(struct rq *rq) -+{ -+ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD; -+ rq->iso_ticks += 100; -+ if (rq->iso_ticks > ISO_PERIOD * sched_iso_cpu) { -+ rq->iso_refractory = true; -+ if (unlikely(rq->iso_ticks > ISO_PERIOD * 100)) -+ rq->iso_ticks = ISO_PERIOD * 100; -+ } -+} -+ -+/* No SCHED_ISO task was running so decrease rq->iso_ticks */ -+static inline void no_iso_tick(struct rq *rq, int ticks) -+{ -+ if (rq->iso_ticks > 0 || rq->iso_refractory) { -+ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - ticks) / ISO_PERIOD; -+ if (rq->iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128)) { -+ rq->iso_refractory = false; -+ if (unlikely(rq->iso_ticks < 0)) -+ rq->iso_ticks = 0; -+ } -+ } -+} -+ -+/* This manages tasks that have run out of timeslice during a scheduler_tick */ -+static void task_running_tick(struct rq *rq) -+{ -+ struct task_struct *p = rq->curr; -+ -+ /* -+ * If a SCHED_ISO task is running we increment the iso_ticks. In -+ * order to prevent SCHED_ISO tasks from causing starvation in the -+ * presence of true RT tasks we account those as iso_ticks as well. -+ */ -+ if (rt_task(p) || task_running_iso(p)) -+ iso_tick(rq); -+ else -+ no_iso_tick(rq, 1); -+ -+ /* SCHED_FIFO tasks never run out of timeslice. */ -+ if (p->policy == SCHED_FIFO) -+ return; -+ -+ if (iso_task(p)) { -+ if (task_running_iso(p)) { -+ if (rq->iso_refractory) { -+ /* -+ * SCHED_ISO task is running as RT and limit -+ * has been hit. Force it to reschedule as -+ * SCHED_NORMAL by zeroing its time_slice -+ */ -+ p->time_slice = 0; -+ } -+ } else if (!rq->iso_refractory) { -+ /* Can now run again ISO. Reschedule to pick up prio */ -+ goto out_resched; -+ } -+ } -+ -+ /* -+ * Tasks that were scheduled in the first half of a tick are not -+ * allowed to run into the 2nd half of the next tick if they will -+ * run out of time slice in the interim. 
Otherwise, if they have -+ * less than RESCHED_US μs of time slice left they will be rescheduled. -+ * Dither is used as a backup for when hrexpiry is disabled or high res -+ * timers not configured in. -+ */ -+ if (p->time_slice - rq->dither >= RESCHED_US) -+ return; -+out_resched: -+ rq_lock(rq); -+ __set_tsk_resched(p); -+ rq_unlock(rq); -+} -+ -+static inline void task_tick(struct rq *rq) -+{ -+ if (!rq_idle(rq)) -+ task_running_tick(rq); -+ else if (rq->last_jiffy > rq->last_scheduler_tick) -+ no_iso_tick(rq, rq->last_jiffy - rq->last_scheduler_tick); -+} -+ -+#ifdef CONFIG_NO_HZ_FULL -+/* -+ * We can stop the timer tick any time highres timers are active since -+ * we rely entirely on highres timeouts for task expiry rescheduling. -+ */ -+static void sched_stop_tick(struct rq *rq, int cpu) -+{ -+ if (!hrexpiry_enabled(rq)) -+ return; -+ if (!tick_nohz_full_enabled()) -+ return; -+ if (!tick_nohz_full_cpu(cpu)) -+ return; -+ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); -+} -+ -+static inline void sched_start_tick(struct rq *rq, int cpu) -+{ -+ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); -+} -+ -+struct tick_work { -+ int cpu; -+ atomic_t state; -+ struct delayed_work work; -+}; -+/* Values for ->state, see diagram below. */ -+#define TICK_SCHED_REMOTE_OFFLINE 0 -+#define TICK_SCHED_REMOTE_OFFLINING 1 -+#define TICK_SCHED_REMOTE_RUNNING 2 -+ -+/* -+ * State diagram for ->state: -+ * -+ * -+ * TICK_SCHED_REMOTE_OFFLINE -+ * | ^ -+ * | | -+ * | | sched_tick_remote() -+ * | | -+ * | | -+ * +--TICK_SCHED_REMOTE_OFFLINING -+ * | ^ -+ * | | -+ * sched_tick_start() | | sched_tick_stop() -+ * | | -+ * V | -+ * TICK_SCHED_REMOTE_RUNNING -+ * -+ * -+ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() -+ * and sched_tick_start() are happy to leave the state in RUNNING. 
-+ */ -+ -+static struct tick_work __percpu *tick_work_cpu; -+ -+static void sched_tick_remote(struct work_struct *work) -+{ -+ struct delayed_work *dwork = to_delayed_work(work); -+ struct tick_work *twork = container_of(dwork, struct tick_work, work); -+ int cpu = twork->cpu; -+ struct rq *rq = cpu_rq(cpu); -+ struct task_struct *curr; -+ u64 delta; -+ int os; -+ -+ /* -+ * Handle the tick only if it appears the remote CPU is running in full -+ * dynticks mode. The check is racy by nature, but missing a tick or -+ * having one too much is no big deal because the scheduler tick updates -+ * statistics and checks timeslices in a time-independent way, regardless -+ * of when exactly it is running. -+ */ -+ if (!tick_nohz_tick_stopped_cpu(cpu)) -+ goto out_requeue; -+ -+ rq_lock_irq(rq); -+ if (cpu_is_offline(cpu)) -+ goto out_unlock; -+ -+ curr = rq->curr; -+ update_rq_clock(rq); -+ -+ if (!is_idle_task(curr)) { -+ /* -+ * Make sure the next tick runs within a reasonable -+ * amount of time. -+ */ -+ delta = rq_clock_task(rq) - curr->last_ran; -+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); -+ } -+ task_tick(rq); -+ -+out_unlock: -+ rq_unlock_irq(rq, NULL); -+ -+out_requeue: -+ -+ /* -+ * Run the remote tick once per second (1Hz). This arbitrary -+ * frequency is large enough to avoid overload but short enough -+ * to keep scheduler internal stats reasonably up to date. But -+ * first update state to reflect hotplug activity if required. 
-+ */ -+ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); -+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); -+ if (os == TICK_SCHED_REMOTE_RUNNING) -+ queue_delayed_work(system_unbound_wq, dwork, HZ); -+} -+ -+static void sched_tick_start(int cpu) -+{ -+ struct tick_work *twork; -+ int os; -+ -+ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) -+ return; -+ -+ WARN_ON_ONCE(!tick_work_cpu); -+ -+ twork = per_cpu_ptr(tick_work_cpu, cpu); -+ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); -+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); -+ if (os == TICK_SCHED_REMOTE_OFFLINE) { -+ twork->cpu = cpu; -+ INIT_DELAYED_WORK(&twork->work, sched_tick_remote); -+ queue_delayed_work(system_unbound_wq, &twork->work, HZ); -+ } -+} -+ -+#ifdef CONFIG_HOTPLUG_CPU -+static void sched_tick_stop(int cpu) -+{ -+ struct tick_work *twork; -+ int os; -+ -+ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) -+ return; -+ -+ WARN_ON_ONCE(!tick_work_cpu); -+ -+ twork = per_cpu_ptr(tick_work_cpu, cpu); -+ /* There cannot be competing actions, but don't rely on stop-machine. */ -+ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING); -+ WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING); -+ /* Don't cancel, as this would mess up the state machine. */ -+} -+#endif /* CONFIG_HOTPLUG_CPU */ -+ -+int __init sched_tick_offload_init(void) -+{ -+ tick_work_cpu = alloc_percpu(struct tick_work); -+ BUG_ON(!tick_work_cpu); -+ return 0; -+} -+ -+#else /* !CONFIG_NO_HZ_FULL */ -+static inline void sched_stop_tick(struct rq *rq, int cpu) {} -+static inline void sched_start_tick(struct rq *rq, int cpu) {} -+static inline void sched_tick_start(int cpu) { } -+static inline void sched_tick_stop(int cpu) { } -+#endif -+ -+/* -+ * This function gets called by the timer code, with HZ frequency. -+ * We call it with interrupts disabled. 
-+ */ -+void scheduler_tick(void) -+{ -+ int cpu __maybe_unused = smp_processor_id(); -+ struct rq *rq = cpu_rq(cpu); -+ -+ arch_scale_freq_tick(); -+ sched_clock_tick(); -+ update_clocks(rq); -+ update_load_avg(rq, 0); -+ update_cpu_clock_tick(rq, rq->curr); -+ task_tick(rq); -+ rq->last_scheduler_tick = rq->last_jiffy; -+ rq->last_tick = rq->clock; -+ psi_task_tick(rq); -+ perf_event_task_tick(); -+ sched_stop_tick(rq, cpu); -+} -+ -+#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ -+ defined(CONFIG_TRACE_PREEMPT_TOGGLE)) -+/* -+ * If the value passed in is equal to the current preempt count -+ * then we just disabled preemption. Start timing the latency. -+ */ -+static inline void preempt_latency_start(int val) -+{ -+ if (preempt_count() == val) { -+ unsigned long ip = get_lock_parent_ip(); -+#ifdef CONFIG_DEBUG_PREEMPT -+ current->preempt_disable_ip = ip; -+#endif -+ trace_preempt_off(CALLER_ADDR0, ip); -+ } -+} -+ -+void preempt_count_add(int val) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Underflow? -+ */ -+ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) -+ return; -+#endif -+ __preempt_count_add(val); -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Spinlock count overflowing soon? -+ */ -+ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= -+ PREEMPT_MASK - 10); -+#endif -+ preempt_latency_start(val); -+} -+EXPORT_SYMBOL(preempt_count_add); -+NOKPROBE_SYMBOL(preempt_count_add); -+ -+/* -+ * If the value passed in equals to the current preempt count -+ * then we just enabled preemption. Stop timing the latency. -+ */ -+static inline void preempt_latency_stop(int val) -+{ -+ if (preempt_count() == val) -+ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); -+} -+ -+void preempt_count_sub(int val) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Underflow? -+ */ -+ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) -+ return; -+ /* -+ * Is the spinlock portion underflowing? 
-+ */ -+ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && -+ !(preempt_count() & PREEMPT_MASK))) -+ return; -+#endif -+ -+ preempt_latency_stop(val); -+ __preempt_count_sub(val); -+} -+EXPORT_SYMBOL(preempt_count_sub); -+NOKPROBE_SYMBOL(preempt_count_sub); -+ -+#else -+static inline void preempt_latency_start(int val) { } -+static inline void preempt_latency_stop(int val) { } -+#endif -+ -+static inline unsigned long get_preempt_disable_ip(struct task_struct *p) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ return p->preempt_disable_ip; -+#else -+ return 0; -+#endif -+} -+ -+/* -+ * The time_slice is only refilled when it is empty and that is when we set a -+ * new deadline. Make sure update_clocks has been called recently to update -+ * rq->niffies. -+ */ -+static void time_slice_expired(struct task_struct *p, struct rq *rq) -+{ -+ p->time_slice = timeslice(); -+ p->deadline = rq->niffies + task_deadline_diff(p); -+#ifdef CONFIG_SMT_NICE -+ if (!p->mm) -+ p->smt_bias = 0; -+ else if (rt_task(p)) -+ p->smt_bias = 1 << 30; -+ else if (task_running_iso(p)) -+ p->smt_bias = 1 << 29; -+ else if (idleprio_task(p)) { -+ if (task_running_idle(p)) -+ p->smt_bias = 0; -+ else -+ p->smt_bias = 1; -+ } else if (--p->smt_bias < 1) -+ p->smt_bias = MAX_PRIO - p->static_prio; -+#endif -+} -+ -+/* -+ * Timeslices below RESCHED_US are considered as good as expired as there's no -+ * point rescheduling when there's so little time left. SCHED_BATCH tasks -+ * have been flagged be not latency sensitive and likely to be fully CPU -+ * bound so every time they're rescheduled they have their time_slice -+ * refilled, but get a new later deadline to have little effect on -+ * SCHED_NORMAL tasks. 
-+ -+ */ -+static inline void check_deadline(struct task_struct *p, struct rq *rq) -+{ -+ if (p->time_slice < RESCHED_US || batch_task(p)) -+ time_slice_expired(p, rq); -+} -+ -+/* -+ * Task selection with skiplists is a simple matter of picking off the first -+ * task in the sorted list, an O(1) operation. The lookup is amortised O(1) -+ * being bound to the number of processors. -+ * -+ * Runqueues are selectively locked based on their unlocked data and then -+ * unlocked if not needed. At most 3 locks will be held at any time and are -+ * released as soon as they're no longer needed. All balancing between CPUs -+ * is thus done here in an extremely simple first come best fit manner. -+ * -+ * This iterates over runqueues in cache locality order. In interactive mode -+ * it iterates over all CPUs and finds the task with the best key/deadline. -+ * In non-interactive mode it will only take a task if it's from the current -+ * runqueue or a runqueue with more tasks than the current one with a better -+ * key/deadline. -+ */ -+#ifdef CONFIG_SMP -+static inline struct task_struct -+*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) -+{ -+ struct rq *locked = NULL, *chosen = NULL; -+ struct task_struct *edt = idle; -+ int i, best_entries = 0; -+ u64 best_key = ~0ULL; -+ -+ for (i = 0; i < total_runqueues; i++) { -+ struct rq *other_rq = rq_order(rq, i); -+ skiplist_node *next; -+ int entries; -+ -+ entries = other_rq->sl->entries; -+ /* -+ * Check for queued entres lockless first. The local runqueue -+ * is locked so entries will always be accurate. -+ */ -+ if (!sched_interactive) { -+ /* -+ * Don't reschedule balance across nodes unless the CPU -+ * is idle. 
-+ */ -+ if (edt != idle && rq->cpu_locality[other_rq->cpu] > LOCALITY_SMP) -+ break; -+ if (entries <= best_entries) -+ continue; -+ } else if (!entries) -+ continue; -+ -+ /* if (i) implies other_rq != rq */ -+ if (i) { -+ /* Check for best id queued lockless first */ -+ if (other_rq->best_key >= best_key) -+ continue; -+ -+ if (unlikely(!trylock_rq(rq, other_rq))) -+ continue; -+ -+ /* Need to reevaluate entries after locking */ -+ entries = other_rq->sl->entries; -+ if (unlikely(!entries)) { -+ unlock_rq(other_rq); -+ continue; -+ } -+ } -+ -+ next = other_rq->node; -+ /* -+ * In interactive mode we check beyond the best entry on other -+ * runqueues if we can't get the best for smt or affinity -+ * reasons. -+ */ -+ while ((next = next->next[0]) != other_rq->node) { -+ struct task_struct *p; -+ u64 key = next->key; -+ -+ /* Reevaluate key after locking */ -+ if (key >= best_key) -+ break; -+ -+ p = next->value; -+ if (!smt_schedule(p, rq)) { -+ if (i && !sched_interactive) -+ break; -+ continue; -+ } -+ -+ if (sched_other_cpu(p, cpu)) { -+ if (sched_interactive || !i) -+ continue; -+ break; -+ } -+ /* Make sure affinity is ok */ -+ if (i) { -+ /* From this point on p is the best so far */ -+ if (locked) -+ unlock_rq(locked); -+ chosen = locked = other_rq; -+ } -+ best_entries = entries; -+ best_key = key; -+ edt = p; -+ break; -+ } -+ /* rq->preempting is a hint only as the state may have changed -+ * since it was set with the resched call but if we have met -+ * the condition we can break out here. 
*/ -+ if (edt == rq->preempting) -+ break; -+ if (i && other_rq != chosen) -+ unlock_rq(other_rq); -+ } -+ -+ if (likely(edt != idle)) -+ take_task(rq, cpu, edt); -+ -+ if (locked) -+ unlock_rq(locked); -+ -+ rq->preempting = NULL; -+ -+ return edt; -+} -+#else /* CONFIG_SMP */ -+static inline struct task_struct -+*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) -+{ -+ struct task_struct *edt; -+ -+ if (unlikely(!rq->sl->entries)) -+ return idle; -+ edt = rq->node->next[0]->value; -+ take_task(rq, cpu, edt); -+ return edt; -+} -+#endif /* CONFIG_SMP */ -+ -+/* -+ * Print scheduling while atomic bug: -+ */ -+static noinline void __schedule_bug(struct task_struct *prev) -+{ -+ /* Save this before calling printk(), since that will clobber it */ -+ unsigned long preempt_disable_ip = get_preempt_disable_ip(current); -+ -+ if (oops_in_progress) -+ return; -+ -+ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", -+ prev->comm, prev->pid, preempt_count()); -+ -+ debug_show_held_locks(prev); -+ print_modules(); -+ if (irqs_disabled()) -+ print_irqtrace_events(prev); -+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) -+ && in_atomic_preempt_off()) { -+ pr_err("Preemption disabled at:"); -+ print_ip_sym(KERN_ERR, preempt_disable_ip); -+ } -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+ -+/* -+ * Various schedule()-time debugging checks and statistics: -+ */ -+static inline void schedule_debug(struct task_struct *prev, bool preempt) -+{ -+#ifdef CONFIG_SCHED_STACK_END_CHECK -+ if (task_stack_end_corrupted(prev)) -+ panic("corrupted stack end detected inside scheduler\n"); -+ -+ if (task_scs_end_corrupted(prev)) -+ panic("corrupted shadow stack detected inside scheduler\n"); -+#endif -+ -+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP -+ if (!preempt && prev->state && prev->non_block_count) { -+ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", -+ prev->comm, prev->pid, prev->non_block_count); -+ dump_stack(); -+ 
add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+ } -+#endif -+ -+ if (unlikely(in_atomic_preempt_off())) { -+ __schedule_bug(prev); -+ preempt_count_set(PREEMPT_DISABLED); -+ } -+ rcu_sleep_check(); -+ -+ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); -+ -+ schedstat_inc(this_rq()->sched_count); -+} -+ -+/* -+ * The currently running task's information is all stored in rq local data -+ * which is only modified by the local CPU. -+ */ -+static inline void set_rq_task(struct rq *rq, struct task_struct *p) -+{ -+ if (p == rq->idle || p->policy == SCHED_FIFO) -+ hrexpiry_clear(rq); -+ else -+ hrexpiry_start(rq, US_TO_NS(p->time_slice)); -+ if (rq->clock - rq->last_tick > HALF_JIFFY_NS) -+ rq->dither = 0; -+ else -+ rq->dither = rq_dither(rq); -+ -+ rq->rq_deadline = p->deadline; -+ rq->rq_prio = p->prio; -+#ifdef CONFIG_SMT_NICE -+ rq->rq_mm = p->mm; -+ rq->rq_smt_bias = p->smt_bias; -+#endif -+} -+ -+#ifdef CONFIG_SMT_NICE -+static void check_no_siblings(struct rq __maybe_unused *this_rq) {} -+static void wake_no_siblings(struct rq __maybe_unused *this_rq) {} -+static void (*check_siblings)(struct rq *this_rq) = &check_no_siblings; -+static void (*wake_siblings)(struct rq *this_rq) = &wake_no_siblings; -+ -+/* Iterate over smt siblings when we've scheduled a process on cpu and decide -+ * whether they should continue running or be descheduled. 
*/ -+static void check_smt_siblings(struct rq *this_rq) -+{ -+ int other_cpu; -+ -+ for_each_cpu(other_cpu, &this_rq->thread_mask) { -+ struct task_struct *p; -+ struct rq *rq; -+ -+ rq = cpu_rq(other_cpu); -+ if (rq_idle(rq)) -+ continue; -+ p = rq->curr; -+ if (!smt_schedule(p, this_rq)) -+ resched_curr(rq); -+ } -+} -+ -+static void wake_smt_siblings(struct rq *this_rq) -+{ -+ int other_cpu; -+ -+ for_each_cpu(other_cpu, &this_rq->thread_mask) { -+ struct rq *rq; -+ -+ rq = cpu_rq(other_cpu); -+ if (rq_idle(rq)) -+ resched_idle(rq); -+ } -+} -+#else -+static void check_siblings(struct rq __maybe_unused *this_rq) {} -+static void wake_siblings(struct rq __maybe_unused *this_rq) {} -+#endif -+ -+/* -+ * schedule() is the main scheduler function. -+ * -+ * The main means of driving the scheduler and thus entering this function are: -+ * -+ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. -+ * -+ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return -+ * paths. For example, see arch/x86/entry_64.S. -+ * -+ * To drive preemption between tasks, the scheduler sets the flag in timer -+ * interrupt handler scheduler_tick(). -+ * -+ * 3. Wakeups don't really cause entry into schedule(). They add a -+ * task to the run-queue and that's it. -+ * -+ * Now, if the new task added to the run-queue preempts the current -+ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets -+ * called on the nearest possible occasion: -+ * -+ * - If the kernel is preemptible (CONFIG_PREEMPTION=y): -+ * -+ * - in syscall or exception context, at the next outmost -+ * preempt_enable(). (this might be as soon as the wake_up()'s -+ * spin_unlock()!) 
-+ * -+ * - in IRQ context, return from interrupt-handler to -+ * preemptible context -+ * -+ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set) -+ * then at the next: -+ * -+ * - cond_resched() call -+ * - explicit schedule() call -+ * - return from syscall or exception to user-space -+ * - return from interrupt-handler to user-space -+ * -+ * WARNING: must be called with preemption disabled! -+ */ -+static void __sched notrace __schedule(bool preempt) -+{ -+ struct task_struct *prev, *next, *idle; -+ unsigned long *switch_count; -+ unsigned long prev_state; -+ bool deactivate = false; -+ struct rq *rq; -+ u64 niffies; -+ int cpu; -+ -+ cpu = smp_processor_id(); -+ rq = cpu_rq(cpu); -+ prev = rq->curr; -+ idle = rq->idle; -+ -+ schedule_debug(prev, preempt); -+ -+ local_irq_disable(); -+ rcu_note_context_switch(preempt); -+ -+ /* -+ * Make sure that signal_pending_state()->signal_pending() below -+ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) -+ * done by the caller to avoid the race with signal_wake_up(): -+ * -+ * __set_current_state(@state) signal_wake_up() -+ * schedule() set_tsk_thread_flag(p, TIF_SIGPENDING) -+ * wake_up_state(p, state) -+ * LOCK rq->lock LOCK p->pi_state -+ * smp_mb__after_spinlock() smp_mb__after_spinlock() -+ * if (signal_pending_state()) if (p->state & @state) -+ * -+ * Also, the membarrier system call requires a full memory barrier -+ * after coming from user-space, before storing to rq->curr. -+ */ -+ rq_lock(rq); -+ smp_mb__after_spinlock(); -+#ifdef CONFIG_SMP -+ if (rq->preempt) { -+ /* -+ * Make sure resched_curr hasn't triggered a preemption -+ * locklessly on a task that has since scheduled away. Spurious -+ * wakeup of idle is okay though. 
-+ */ -+ if (unlikely(preempt && prev != idle && !test_tsk_need_resched(prev))) { -+ rq->preempt = NULL; -+ clear_preempt_need_resched(); -+ rq_unlock_irq(rq, NULL); -+ return; -+ } -+ rq->preempt = NULL; -+ } -+#endif -+ -+ switch_count = &prev->nivcsw; -+ -+ /* -+ * We must load prev->state once (task_struct::state is volatile), such -+ * that: -+ * -+ * - we form a control dependency vs deactivate_task() below. -+ * - ptrace_{,un}freeze_traced() can change ->state underneath us. -+ */ -+ prev_state = prev->state; -+ if (!preempt && prev_state) { -+ if (signal_pending_state(prev_state, prev)) { -+ prev->state = TASK_RUNNING; -+ } else { -+ prev->sched_contributes_to_load = -+ (prev_state & TASK_UNINTERRUPTIBLE) && -+ !(prev_state & TASK_NOLOAD) && -+ !(prev->flags & PF_FROZEN); -+ -+ if (prev->sched_contributes_to_load) -+ rq->nr_uninterruptible++; -+ -+ /* -+ * __schedule() ttwu() -+ * prev_state = prev->state; if (p->on_rq && ...) -+ * if (prev_state) goto out; -+ * p->on_rq = 0; smp_acquire__after_ctrl_dep(); -+ * p->state = TASK_WAKING -+ * -+ * Where __schedule() and ttwu() have matching control dependencies. -+ * -+ * After this, schedule() must not care about p->state any more. -+ */ -+ deactivate = true; -+ -+ if (prev->in_iowait) { -+ atomic_inc(&rq->nr_iowait); -+ delayacct_blkio_start(); -+ } -+ } -+ switch_count = &prev->nvcsw; -+ } -+ -+ /* -+ * Store the niffy value here for use by the next task's last_ran -+ * below to avoid losing niffies due to update_clocks being called -+ * again after this point. 
-+ */ -+ update_clocks(rq); -+ niffies = rq->niffies; -+ update_cpu_clock_switch(rq, prev); -+ -+ clear_tsk_need_resched(prev); -+ clear_preempt_need_resched(); -+ -+ if (idle != prev) { -+ check_deadline(prev, rq); -+ return_task(prev, rq, cpu, deactivate); -+ } -+ -+ next = earliest_deadline_task(rq, cpu, idle); -+ if (likely(next->prio != PRIO_LIMIT)) -+ clear_cpuidle_map(cpu); -+ else { -+ set_cpuidle_map(cpu); -+ update_load_avg(rq, 0); -+ } -+ -+ set_rq_task(rq, next); -+ next->last_ran = niffies; -+ -+ if (likely(prev != next)) { -+ /* -+ * Don't reschedule an idle task or deactivated tasks -+ */ -+ if (prev == idle) { -+ inc_nr_running(rq); -+ if (rt_task(next)) -+ rq->rt_nr_running++; -+ } else if (!deactivate) -+ resched_suitable_idle(prev); -+ if (unlikely(next == idle)) { -+ dec_nr_running(rq); -+ if (rt_task(prev)) -+ rq->rt_nr_running--; -+ wake_siblings(rq); -+ } else -+ check_siblings(rq); -+ rq->nr_switches++; -+ /* -+ * RCU users of rcu_dereference(rq->curr) may not see -+ * changes to task_struct made by pick_next_task(). -+ */ -+ RCU_INIT_POINTER(rq->curr, next); -+ /* -+ * The membarrier system call requires each architecture -+ * to have a full memory barrier after updating -+ * rq->curr, before returning to user-space. -+ * -+ * Here are the schemes providing that barrier on the -+ * various architectures: -+ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. -+ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. 
-+ * - finish_lock_switch() for weakly-ordered -+ * architectures where spin_unlock is a full barrier, -+ * - switch_to() for arm64 (weakly-ordered, spin_unlock -+ * is a RELEASE barrier), -+ */ -+ ++*switch_count; -+ -+ psi_sched_switch(prev, next, !task_on_rq_queued(prev)); -+ -+ trace_sched_switch(preempt, prev, next); -+ context_switch(rq, prev, next); /* unlocks the rq */ -+ } else { -+ check_siblings(rq); -+ rq_unlock(rq); -+ local_irq_enable(); -+ } -+} -+ -+void __noreturn do_task_dead(void) -+{ -+ /* Causes final put_task_struct in finish_task_switch(). */ -+ set_special_state(TASK_DEAD); -+ -+ /* Tell freezer to ignore us: */ -+ current->flags |= PF_NOFREEZE; -+ __schedule(false); -+ BUG(); -+ -+ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ -+ for (;;) -+ cpu_relax(); -+} -+ -+static inline void sched_submit_work(struct task_struct *tsk) -+{ -+ if (!tsk->state) -+ return; -+ -+ /* -+ * If a worker went to sleep, notify and ask workqueue whether -+ * it wants to wake up a task to maintain concurrency. -+ * As this function is called inside the schedule() context, -+ * we disable preemption to avoid it calling schedule() again -+ * in the possible wakeup of a kworker and because wq_worker_sleeping() -+ * requires it. -+ */ -+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { -+ preempt_disable(); -+ if (tsk->flags & PF_WQ_WORKER) -+ wq_worker_sleeping(tsk); -+ else -+ io_wq_worker_sleeping(tsk); -+ preempt_enable_no_resched(); -+ } -+ -+ if (tsk_is_pi_blocked(tsk)) -+ return; -+ -+ /* -+ * If we are going to sleep and we have plugged IO queued, -+ * make sure to submit it to avoid deadlocks. 
-+ */ -+ if (blk_needs_flush_plug(tsk)) -+ blk_schedule_flush_plug(tsk); -+} -+ -+static inline void sched_update_worker(struct task_struct *tsk) -+{ -+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { -+ if (tsk->flags & PF_WQ_WORKER) -+ wq_worker_running(tsk); -+ else -+ io_wq_worker_running(tsk); -+ } -+} -+ -+asmlinkage __visible void __sched schedule(void) -+{ -+ struct task_struct *tsk = current; -+ -+ sched_submit_work(tsk); -+ do { -+ preempt_disable(); -+ __schedule(false); -+ sched_preempt_enable_no_resched(); -+ } while (need_resched()); -+ sched_update_worker(tsk); -+} -+ -+EXPORT_SYMBOL(schedule); -+ -+/* -+ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted -+ * state (have scheduled out non-voluntarily) by making sure that all -+ * tasks have either left the run queue or have gone into user space. -+ * As idle tasks do not do either, they must not ever be preempted -+ * (schedule out non-voluntarily). -+ * -+ * schedule_idle() is similar to schedule_preempt_disable() except that it -+ * never enables preemption because it does not call sched_submit_work(). -+ */ -+void __sched schedule_idle(void) -+{ -+ /* -+ * As this skips calling sched_submit_work(), which the idle task does -+ * regardless because that function is a nop when the task is in a -+ * TASK_RUNNING state, make sure this isn't used someplace that the -+ * current task can be in any other state. Note, idle is always in the -+ * TASK_RUNNING state. -+ */ -+ WARN_ON_ONCE(current->state); -+ do { -+ __schedule(false); -+ } while (need_resched()); -+} -+ -+#ifdef CONFIG_CONTEXT_TRACKING -+asmlinkage __visible void __sched schedule_user(void) -+{ -+ /* -+ * If we come here after a random call to set_need_resched(), -+ * or we have been woken up remotely but the IPI has not yet arrived, -+ * we haven't yet exited the RCU idle mode. Do it here manually until -+ * we find a better solution. -+ * -+ * NB: There are buggy callers of this function. 
Ideally we -+ * should warn if prev_state != IN_USER, but that will trigger -+ * too frequently to make sense yet. -+ */ -+ enum ctx_state prev_state = exception_enter(); -+ schedule(); -+ exception_exit(prev_state); -+} -+#endif -+ -+/** -+ * schedule_preempt_disabled - called with preemption disabled -+ * -+ * Returns with preemption disabled. Note: preempt_count must be 1 -+ */ -+void __sched schedule_preempt_disabled(void) -+{ -+ sched_preempt_enable_no_resched(); -+ schedule(); -+ preempt_disable(); -+} -+ -+static void __sched notrace preempt_schedule_common(void) -+{ -+ do { -+ /* -+ * Because the function tracer can trace preempt_count_sub() -+ * and it also uses preempt_enable/disable_notrace(), if -+ * NEED_RESCHED is set, the preempt_enable_notrace() called -+ * by the function tracer will call this function again and -+ * cause infinite recursion. -+ * -+ * Preemption must be disabled here before the function -+ * tracer can trace. Break up preempt_disable() into two -+ * calls. One to disable preemption without fear of being -+ * traced. The other to still record the preemption latency, -+ * which can also be traced by the function tracer. -+ */ -+ preempt_disable_notrace(); -+ preempt_latency_start(1); -+ __schedule(true); -+ preempt_latency_stop(1); -+ preempt_enable_no_resched_notrace(); -+ -+ /* -+ * Check again in case we missed a preemption opportunity -+ * between schedule and now. -+ */ -+ } while (need_resched()); -+} -+ -+#ifdef CONFIG_PREEMPTION -+/* -+ * This is the entry point to schedule() from in-kernel preemption -+ * off of preempt_enable. -+ */ -+asmlinkage __visible void __sched notrace preempt_schedule(void) -+{ -+ /* -+ * If there is a non-zero preempt_count or interrupts are disabled, -+ * we do not want to preempt the current task. Just return.. 
-+ */ -+ if (likely(!preemptible())) -+ return; -+ -+ preempt_schedule_common(); -+} -+NOKPROBE_SYMBOL(preempt_schedule); -+EXPORT_SYMBOL(preempt_schedule); -+ -+/** -+ * preempt_schedule_notrace - preempt_schedule called by tracing -+ * -+ * The tracing infrastructure uses preempt_enable_notrace to prevent -+ * recursion and tracing preempt enabling caused by the tracing -+ * infrastructure itself. But as tracing can happen in areas coming -+ * from userspace or just about to enter userspace, a preempt enable -+ * can occur before user_exit() is called. This will cause the scheduler -+ * to be called when the system is still in usermode. -+ * -+ * To prevent this, the preempt_enable_notrace will use this function -+ * instead of preempt_schedule() to exit user context if needed before -+ * calling the scheduler. -+ */ -+asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) -+{ -+ enum ctx_state prev_ctx; -+ -+ if (likely(!preemptible())) -+ return; -+ -+ do { -+ /* -+ * Because the function tracer can trace preempt_count_sub() -+ * and it also uses preempt_enable/disable_notrace(), if -+ * NEED_RESCHED is set, the preempt_enable_notrace() called -+ * by the function tracer will call this function again and -+ * cause infinite recursion. -+ * -+ * Preemption must be disabled here before the function -+ * tracer can trace. Break up preempt_disable() into two -+ * calls. One to disable preemption without fear of being -+ * traced. The other to still record the preemption latency, -+ * which can also be traced by the function tracer. -+ */ -+ preempt_disable_notrace(); -+ preempt_latency_start(1); -+ /* -+ * Needs preempt disabled in case user_exit() is traced -+ * and the tracer calls preempt_enable_notrace() causing -+ * an infinite recursion. 
-+ */ -+ prev_ctx = exception_enter(); -+ __schedule(true); -+ exception_exit(prev_ctx); -+ -+ preempt_latency_stop(1); -+ preempt_enable_no_resched_notrace(); -+ } while (need_resched()); -+} -+EXPORT_SYMBOL_GPL(preempt_schedule_notrace); -+ -+#endif /* CONFIG_PREEMPTION */ -+ -+/* -+ * This is the entry point to schedule() from kernel preemption -+ * off of irq context. -+ * Note, that this is called and return with irqs disabled. This will -+ * protect us against recursive calling from irq. -+ */ -+asmlinkage __visible void __sched preempt_schedule_irq(void) -+{ -+ enum ctx_state prev_state; -+ -+ /* Catch callers which need to be fixed */ -+ BUG_ON(preempt_count() || !irqs_disabled()); -+ -+ prev_state = exception_enter(); -+ -+ do { -+ preempt_disable(); -+ local_irq_enable(); -+ __schedule(true); -+ local_irq_disable(); -+ sched_preempt_enable_no_resched(); -+ } while (need_resched()); -+ -+ exception_exit(prev_state); -+} -+ -+int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, -+ void *key) -+{ -+ WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC); -+ return try_to_wake_up(curr->private, mode, wake_flags); -+} -+EXPORT_SYMBOL(default_wake_function); -+ -+#ifdef CONFIG_RT_MUTEXES -+ -+static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) -+{ -+ if (pi_task) -+ prio = min(prio, pi_task->prio); -+ -+ return prio; -+} -+ -+static inline int rt_effective_prio(struct task_struct *p, int prio) -+{ -+ struct task_struct *pi_task = rt_mutex_get_top_task(p); -+ -+ return __rt_effective_prio(pi_task, prio); -+} -+ -+/* -+ * rt_mutex_setprio - set the current priority of a task -+ * @p: task to boost -+ * @pi_task: donor task -+ * -+ * This function changes the 'effective' priority of a task. It does -+ * not touch ->normal_prio like __setscheduler(). -+ * -+ * Used by the rt_mutex code to implement priority inheritance -+ * logic. Call site only calls if the priority of the task changed. 
-+ */ -+void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) -+{ -+ int prio, oldprio; -+ struct rq *rq; -+ -+ /* XXX used to be waiter->prio, not waiter->task->prio */ -+ prio = __rt_effective_prio(pi_task, p->normal_prio); -+ -+ /* -+ * If nothing changed; bail early. -+ */ -+ if (p->pi_top_task == pi_task && prio == p->prio) -+ return; -+ -+ rq = __task_rq_lock(p, NULL); -+ update_rq_clock(rq); -+ /* -+ * Set under pi_lock && rq->lock, such that the value can be used under -+ * either lock. -+ * -+ * Note that there is loads of tricky to make this pointer cache work -+ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to -+ * ensure a task is de-boosted (pi_task is set to NULL) before the -+ * task is allowed to run again (and can exit). This ensures the pointer -+ * points to a blocked task -- which guaratees the task is present. -+ */ -+ p->pi_top_task = pi_task; -+ -+ /* -+ * For FIFO/RR we only need to set prio, if that matches we're done. -+ */ -+ if (prio == p->prio) -+ goto out_unlock; -+ -+ /* -+ * Idle task boosting is a nono in general. There is one -+ * exception, when PREEMPT_RT and NOHZ is active: -+ * -+ * The idle task calls get_next_timer_interrupt() and holds -+ * the timer wheel base->lock on the CPU and another CPU wants -+ * to access the timer (probably to cancel it). We can safely -+ * ignore the boosting request, as the idle CPU runs this code -+ * with interrupts disabled and will complete the lock -+ * protected section without being interrupted. So there is no -+ * real need to boost. 
-+ */ -+ if (unlikely(p == rq->idle)) { -+ WARN_ON(p != rq->curr); -+ WARN_ON(p->pi_blocked_on); -+ goto out_unlock; -+ } -+ -+ trace_sched_pi_setprio(p, pi_task); -+ oldprio = p->prio; -+ p->prio = prio; -+ if (task_running(rq, p)){ -+ if (prio > oldprio) -+ resched_task(p); -+ } else if (task_queued(p)) { -+ dequeue_task(rq, p, DEQUEUE_SAVE); -+ enqueue_task(rq, p, ENQUEUE_RESTORE); -+ if (prio < oldprio) -+ try_preempt(p, rq); -+ } -+out_unlock: -+ __task_rq_unlock(rq, NULL); -+} -+#else -+static inline int rt_effective_prio(struct task_struct *p, int prio) -+{ -+ return prio; -+} -+#endif -+ -+/* -+ * Adjust the deadline for when the priority is to change, before it's -+ * changed. -+ */ -+static inline void adjust_deadline(struct task_struct *p, int new_prio) -+{ -+ p->deadline += static_deadline_diff(new_prio) - task_deadline_diff(p); -+} -+ -+void set_user_nice(struct task_struct *p, long nice) -+{ -+ int new_static, old_static; -+ struct rq_flags rf; -+ struct rq *rq; -+ -+ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) -+ return; -+ new_static = NICE_TO_PRIO(nice); -+ /* -+ * We have to be careful, if called from sys_setpriority(), -+ * the task might be in the middle of scheduling on another CPU. 
-+ */ -+ rq = task_rq_lock(p, &rf); -+ update_rq_clock(rq); -+ -+ /* -+ * The RT priorities are set via sched_setscheduler(), but we still -+ * allow the 'normal' nice value to be set - but as expected -+ * it wont have any effect on scheduling until the task is -+ * not SCHED_NORMAL/SCHED_BATCH: -+ */ -+ if (has_rt_policy(p)) { -+ p->static_prio = new_static; -+ goto out_unlock; -+ } -+ -+ adjust_deadline(p, new_static); -+ old_static = p->static_prio; -+ p->static_prio = new_static; -+ p->prio = effective_prio(p); -+ -+ if (task_queued(p)) { -+ dequeue_task(rq, p, DEQUEUE_SAVE); -+ enqueue_task(rq, p, ENQUEUE_RESTORE); -+ if (new_static < old_static) -+ try_preempt(p, rq); -+ } else if (task_running(rq, p)) { -+ set_rq_task(rq, p); -+ if (old_static < new_static) -+ resched_task(p); -+ } -+out_unlock: -+ task_rq_unlock(rq, p, &rf); -+} -+EXPORT_SYMBOL(set_user_nice); -+ -+/* -+ * can_nice - check if a task can reduce its nice value -+ * @p: task -+ * @nice: nice value -+ */ -+int can_nice(const struct task_struct *p, const int nice) -+{ -+ /* Convert nice value [19,-20] to rlimit style value [1,40] */ -+ int nice_rlim = nice_to_rlimit(nice); -+ -+ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || -+ capable(CAP_SYS_NICE)); -+} -+ -+#ifdef __ARCH_WANT_SYS_NICE -+ -+/* -+ * sys_nice - change the priority of the current process. -+ * @increment: priority increment -+ * -+ * sys_setpriority is a more generic, but much slower function that -+ * does similar things. -+ */ -+SYSCALL_DEFINE1(nice, int, increment) -+{ -+ long nice, retval; -+ -+ /* -+ * Setpriority might change our priority at the same moment. -+ * We don't have to worry. Conceptually one call occurs first -+ * and we have a single winner. 
-+ */ -+ -+ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); -+ nice = task_nice(current) + increment; -+ -+ nice = clamp_val(nice, MIN_NICE, MAX_NICE); -+ if (increment < 0 && !can_nice(current, nice)) -+ return -EPERM; -+ -+ retval = security_task_setnice(current, nice); -+ if (retval) -+ return retval; -+ -+ set_user_nice(current, nice); -+ return 0; -+} -+ -+#endif -+ -+/** -+ * task_prio - return the priority value of a given task. -+ * @p: the task in question. -+ * -+ * Return: The priority value as seen by users in /proc. -+ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes -+ * from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO). -+ */ -+int task_prio(const struct task_struct *p) -+{ -+ int delta, prio = p->prio - MAX_RT_PRIO; -+ -+ /* rt tasks and iso tasks */ -+ if (prio <= 0) -+ goto out; -+ -+ /* Convert to ms to avoid overflows */ -+ delta = NS_TO_MS(p->deadline - task_rq(p)->niffies); -+ if (unlikely(delta < 0)) -+ delta = 0; -+ delta = delta * 40 / ms_longest_deadline_diff(); -+ if (delta <= 80) -+ prio += delta; -+ if (idleprio_task(p)) -+ prio += 40; -+out: -+ return prio; -+} -+ -+/** -+ * idle_cpu - is a given CPU idle currently? -+ * @cpu: the processor in question. -+ * -+ * Return: 1 if the CPU is currently idle. 0 otherwise. -+ */ -+int idle_cpu(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ if (rq->curr != rq->idle) -+ return 0; -+ -+ if (rq->nr_running) -+ return 0; -+ -+#ifdef CONFIG_SMP -+ if (rq->ttwu_pending) -+ return 0; -+#endif -+ -+ return 1; -+} -+ -+/** -+ * available_idle_cpu - is a given CPU idle for enqueuing work. -+ * @cpu: the CPU in question. -+ * -+ * Return: 1 if the CPU is currently idle. 0 otherwise. -+ */ -+int available_idle_cpu(int cpu) -+{ -+ if (!idle_cpu(cpu)) -+ return 0; -+ -+ if (vcpu_is_preempted(cpu)) -+ return 0; -+ -+ return 1; -+} -+ -+/** -+ * idle_task - return the idle task for a given CPU. -+ * @cpu: the processor in question. 
-+ * -+ * Return: The idle task for the CPU @cpu. -+ */ -+struct task_struct *idle_task(int cpu) -+{ -+ return cpu_rq(cpu)->idle; -+} -+ -+/** -+ * find_process_by_pid - find a process with a matching PID value. -+ * @pid: the pid in question. -+ * -+ * The task of @pid, if found. %NULL otherwise. -+ */ -+static inline struct task_struct *find_process_by_pid(pid_t pid) -+{ -+ return pid ? find_task_by_vpid(pid) : current; -+} -+ -+/* Actually do priority change: must hold rq lock. */ -+static void __setscheduler(struct task_struct *p, struct rq *rq, int policy, -+ int prio, const struct sched_attr *attr, -+ bool keep_boost) -+{ -+ int oldrtprio, oldprio; -+ -+ /* -+ * If params can't change scheduling class changes aren't allowed -+ * either. -+ */ -+ if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS) -+ return; -+ -+ p->policy = policy; -+ oldrtprio = p->rt_priority; -+ p->rt_priority = prio; -+ p->normal_prio = normal_prio(p); -+ oldprio = p->prio; -+ /* -+ * Keep a potential priority boosting if called from -+ * sched_setscheduler(). 
-+ */ -+ p->prio = normal_prio(p); -+ if (keep_boost) -+ p->prio = rt_effective_prio(p, p->prio); -+ -+ if (task_running(rq, p)) { -+ set_rq_task(rq, p); -+ resched_task(p); -+ } else if (task_queued(p)) { -+ dequeue_task(rq, p, DEQUEUE_SAVE); -+ enqueue_task(rq, p, ENQUEUE_RESTORE); -+ if (p->prio < oldprio || p->rt_priority > oldrtprio) -+ try_preempt(p, rq); -+ } -+} -+ -+/* -+ * Check the target process has a UID that matches the current process's -+ */ -+static bool check_same_owner(struct task_struct *p) -+{ -+ const struct cred *cred = current_cred(), *pcred; -+ bool match; -+ -+ rcu_read_lock(); -+ pcred = __task_cred(p); -+ match = (uid_eq(cred->euid, pcred->euid) || -+ uid_eq(cred->euid, pcred->uid)); -+ rcu_read_unlock(); -+ return match; -+} -+ -+static int __sched_setscheduler(struct task_struct *p, -+ const struct sched_attr *attr, -+ bool user, bool pi) -+{ -+ int retval, policy = attr->sched_policy, oldpolicy = -1, priority = attr->sched_priority; -+ unsigned long rlim_rtprio = 0; -+ struct rq_flags rf; -+ int reset_on_fork; -+ struct rq *rq; -+ -+ /* The pi code expects interrupts enabled */ -+ BUG_ON(pi && in_interrupt()); -+ -+ if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) { -+ unsigned long lflags; -+ -+ if (!lock_task_sighand(p, &lflags)) -+ return -ESRCH; -+ rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); -+ unlock_task_sighand(p, &lflags); -+ if (rlim_rtprio) -+ goto recheck; -+ /* -+ * If the caller requested an RT policy without having the -+ * necessary rights, we downgrade the policy to SCHED_ISO. -+ * We also set the parameter to zero to pass the checks. 
-+ */ -+ policy = SCHED_ISO; -+ priority = 0; -+ } -+recheck: -+ /* Double check policy once rq lock held */ -+ if (policy < 0) { -+ reset_on_fork = p->sched_reset_on_fork; -+ policy = oldpolicy = p->policy; -+ } else { -+ reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); -+ policy &= ~SCHED_RESET_ON_FORK; -+ -+ if (!SCHED_RANGE(policy)) -+ return -EINVAL; -+ } -+ -+ if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV)) -+ return -EINVAL; -+ -+ /* -+ * Valid priorities for SCHED_FIFO and SCHED_RR are -+ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and -+ * SCHED_BATCH is 0. -+ */ -+ if (priority < 0 || -+ (p->mm && priority > MAX_USER_RT_PRIO - 1) || -+ (!p->mm && priority > MAX_RT_PRIO - 1)) -+ return -EINVAL; -+ if (is_rt_policy(policy) != (priority != 0)) -+ return -EINVAL; -+ -+ /* -+ * Allow unprivileged RT tasks to decrease priority: -+ */ -+ if (user && !capable(CAP_SYS_NICE)) { -+ if (is_rt_policy(policy)) { -+ unsigned long rlim_rtprio = -+ task_rlimit(p, RLIMIT_RTPRIO); -+ -+ /* Can't set/change the rt policy */ -+ if (policy != p->policy && !rlim_rtprio) -+ return -EPERM; -+ -+ /* Can't increase priority */ -+ if (priority > p->rt_priority && -+ priority > rlim_rtprio) -+ return -EPERM; -+ } else { -+ switch (p->policy) { -+ /* -+ * Can only downgrade policies but not back to -+ * SCHED_NORMAL -+ */ -+ case SCHED_ISO: -+ if (policy == SCHED_ISO) -+ goto out; -+ if (policy != SCHED_NORMAL) -+ return -EPERM; -+ break; -+ case SCHED_BATCH: -+ if (policy == SCHED_BATCH) -+ goto out; -+ if (policy != SCHED_IDLEPRIO) -+ return -EPERM; -+ break; -+ case SCHED_IDLEPRIO: -+ if (policy == SCHED_IDLEPRIO) -+ goto out; -+ return -EPERM; -+ default: -+ break; -+ } -+ } -+ -+ /* Can't change other user's priorities */ -+ if (!check_same_owner(p)) -+ return -EPERM; -+ -+ /* Normal users shall not reset the sched_reset_on_fork flag: */ -+ if (p->sched_reset_on_fork && !reset_on_fork) -+ return -EPERM; -+ } -+ -+ if (user) { -+ retval = 
security_task_setscheduler(p); -+ if (retval) -+ return retval; -+ } -+ -+ if (pi) -+ cpuset_read_lock(); -+ -+ /* -+ * Make sure no PI-waiters arrive (or leave) while we are -+ * changing the priority of the task: -+ * -+ * To be able to change p->policy safely, the runqueue lock must be -+ * held. -+ */ -+ rq = task_rq_lock(p, &rf); -+ update_rq_clock(rq); -+ -+ /* -+ * Changing the policy of the stop threads its a very bad idea: -+ */ -+ if (p == rq->stop) { -+ retval = -EINVAL; -+ goto unlock; -+ } -+ -+ /* -+ * If not changing anything there's no need to proceed further: -+ */ -+ if (unlikely(policy == p->policy && (!is_rt_policy(policy) || -+ priority == p->rt_priority))) { -+ retval = 0; -+ goto unlock; -+ } -+ -+ /* Re-check policy now with rq lock held */ -+ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { -+ policy = oldpolicy = -1; -+ task_rq_unlock(rq, p, &rf); -+ if (pi) -+ cpuset_read_unlock(); -+ goto recheck; -+ } -+ p->sched_reset_on_fork = reset_on_fork; -+ -+ __setscheduler(p, rq, policy, priority, attr, pi); -+ -+ /* Avoid rq from going away on us: */ -+ preempt_disable(); -+ task_rq_unlock(rq, p, &rf); -+ -+ if (pi) { -+ cpuset_read_unlock(); -+ rt_mutex_adjust_pi(p); -+ } -+ preempt_enable(); -+out: -+ return 0; -+ -+unlock: -+ task_rq_unlock(rq, p, &rf); -+ if (pi) -+ cpuset_read_unlock(); -+ return retval; -+} -+ -+static int _sched_setscheduler(struct task_struct *p, int policy, -+ const struct sched_param *param, bool check) -+{ -+ struct sched_attr attr = { -+ .sched_policy = policy, -+ .sched_priority = param->sched_priority, -+ .sched_nice = PRIO_TO_NICE(p->static_prio), -+ }; -+ -+ return __sched_setscheduler(p, &attr, check, true); -+} -+/** -+ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. -+ * @p: the task in question. -+ * @policy: new policy. -+ * @param: structure containing the new RT priority. -+ * -+ * Use sched_set_fifo(), read its comment. -+ * -+ * Return: 0 on success. 
An error code otherwise. -+ * -+ * NOTE that the task may be already dead. -+ */ -+int sched_setscheduler(struct task_struct *p, int policy, -+ const struct sched_param *param) -+{ -+ return _sched_setscheduler(p, policy, param, true); -+} -+ -+ -+int sched_setattr(struct task_struct *p, const struct sched_attr *attr) -+{ -+ return __sched_setscheduler(p, attr, true, true); -+} -+ -+int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) -+{ -+ return __sched_setscheduler(p, attr, false, true); -+} -+ -+/** -+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. -+ * @p: the task in question. -+ * @policy: new policy. -+ * @param: structure containing the new RT priority. -+ * -+ * Just like sched_setscheduler, only don't bother checking if the -+ * current context has permission. For example, this is needed in -+ * stop_machine(): we create temporary high priority worker threads, -+ * but our caller might not have that capability. -+ * -+ * Return: 0 on success. An error code otherwise. -+ */ -+int sched_setscheduler_nocheck(struct task_struct *p, int policy, -+ const struct sched_param *param) -+{ -+ return _sched_setscheduler(p, policy, param, false); -+} -+ -+/* -+ * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally -+ * incapable of resource management, which is the one thing an OS really should -+ * be doing. -+ * -+ * This is of course the reason it is limited to privileged users only. -+ * -+ * Worse still; it is fundamentally impossible to compose static priority -+ * workloads. You cannot take two correctly working static prio workloads -+ * and smash them together and still expect them to work. -+ * -+ * For this reason 'all' FIFO tasks the kernel creates are basically at: -+ * -+ * MAX_RT_PRIO / 2 -+ * -+ * The administrator _MUST_ configure the system, the kernel simply doesn't -+ * know enough information to make a sensible choice. 
-+ */ -+void sched_set_fifo(struct task_struct *p) -+{ -+ struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 }; -+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); -+} -+EXPORT_SYMBOL_GPL(sched_set_fifo); -+ -+/* -+ * For when you don't much care about FIFO, but want to be above SCHED_NORMAL. -+ */ -+void sched_set_fifo_low(struct task_struct *p) -+{ -+ struct sched_param sp = { .sched_priority = 1 }; -+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); -+} -+EXPORT_SYMBOL_GPL(sched_set_fifo_low); -+ -+void sched_set_normal(struct task_struct *p, int nice) -+{ -+ struct sched_attr attr = { -+ .sched_policy = SCHED_NORMAL, -+ .sched_nice = nice, -+ }; -+ WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0); -+} -+EXPORT_SYMBOL_GPL(sched_set_normal); -+ -+static int -+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) -+{ -+ struct sched_param lparam; -+ struct task_struct *p; -+ int retval; -+ -+ if (!param || pid < 0) -+ return -EINVAL; -+ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) -+ return -EFAULT; -+ -+ rcu_read_lock(); -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (likely(p)) -+ get_task_struct(p); -+ rcu_read_unlock(); -+ -+ if (likely(p)) { -+ retval = sched_setscheduler(p, policy, &lparam); -+ put_task_struct(p); -+ } -+ -+ return retval; -+} -+ -+/* -+ * Mimics kernel/events/core.c perf_copy_attr(). 
-+ */ -+static int sched_copy_attr(struct sched_attr __user *uattr, -+ struct sched_attr *attr) -+{ -+ u32 size; -+ int ret; -+ -+ /* Zero the full structure, so that a short copy will be nice: */ -+ memset(attr, 0, sizeof(*attr)); -+ -+ ret = get_user(size, &uattr->size); -+ if (ret) -+ return ret; -+ -+ /* ABI compatibility quirk: */ -+ if (!size) -+ size = SCHED_ATTR_SIZE_VER0; -+ -+ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) -+ goto err_size; -+ -+ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); -+ if (ret) { -+ if (ret == -E2BIG) -+ goto err_size; -+ return ret; -+ } -+ -+ /* -+ * XXX: Do we want to be lenient like existing syscalls; or do we want -+ * to be strict and return an error on out-of-bounds values? -+ */ -+ attr->sched_nice = clamp(attr->sched_nice, -20, 19); -+ -+ /* sched/core.c uses zero here but we already know ret is zero */ -+ return 0; -+ -+err_size: -+ put_user(sizeof(*attr), &uattr->size); -+ return -E2BIG; -+} -+ -+/* -+ * sched_setparam() passes in -1 for its policy, to let the functions -+ * it calls know not to change it. -+ */ -+#define SETPARAM_POLICY -1 -+ -+/** -+ * sys_sched_setscheduler - set/change the scheduler policy and RT priority -+ * @pid: the pid in question. -+ * @policy: new policy. -+ * @param: structure containing the new RT priority. -+ * -+ * Return: 0 on success. An error code otherwise. -+ */ -+SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) -+{ -+ if (policy < 0) -+ return -EINVAL; -+ -+ return do_sched_setscheduler(pid, policy, param); -+} -+ -+/** -+ * sys_sched_setparam - set/change the RT priority of a thread -+ * @pid: the pid in question. -+ * @param: structure containing the new RT priority. -+ * -+ * Return: 0 on success. An error code otherwise. 
-+ */ -+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) -+{ -+ return do_sched_setscheduler(pid, SETPARAM_POLICY, param); -+} -+ -+/** -+ * sys_sched_setattr - same as above, but with extended sched_attr -+ * @pid: the pid in question. -+ * @uattr: structure containing the extended parameters. -+ */ -+SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, -+ unsigned int, flags) -+{ -+ struct sched_attr attr; -+ struct task_struct *p; -+ int retval; -+ -+ if (!uattr || pid < 0 || flags) -+ return -EINVAL; -+ -+ retval = sched_copy_attr(uattr, &attr); -+ if (retval) -+ return retval; -+ -+ if ((int)attr.sched_policy < 0) -+ return -EINVAL; -+ if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY) -+ attr.sched_policy = SETPARAM_POLICY; -+ -+ rcu_read_lock(); -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (likely(p)) -+ get_task_struct(p); -+ rcu_read_unlock(); -+ -+ if (likely(p)) { -+ retval = sched_setattr(p, &attr); -+ put_task_struct(p); -+ } -+ -+ return retval; -+} -+ -+/** -+ * sys_sched_getscheduler - get the policy (scheduling class) of a thread -+ * @pid: the pid in question. -+ * -+ * Return: On success, the policy of the thread. Otherwise, a negative error -+ * code. -+ */ -+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) -+{ -+ struct task_struct *p; -+ int retval = -EINVAL; -+ -+ if (pid < 0) -+ goto out_nounlock; -+ -+ retval = -ESRCH; -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ if (p) { -+ retval = security_task_getscheduler(p); -+ if (!retval) -+ retval = p->policy; -+ } -+ rcu_read_unlock(); -+ -+out_nounlock: -+ return retval; -+} -+ -+/** -+ * sys_sched_getscheduler - get the RT priority of a thread -+ * @pid: the pid in question. -+ * @param: structure containing the RT priority. -+ * -+ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error -+ * code. 
-+ */ -+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) -+{ -+ struct sched_param lp = { .sched_priority = 0 }; -+ struct task_struct *p; -+ int retval = -EINVAL; -+ -+ if (!param || pid < 0) -+ goto out_nounlock; -+ -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ retval = -ESRCH; -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ if (has_rt_policy(p)) -+ lp.sched_priority = p->rt_priority; -+ rcu_read_unlock(); -+ -+ /* -+ * This one might sleep, we cannot do it with a spinlock held ... -+ */ -+ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; -+ -+out_nounlock: -+ return retval; -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+/* -+ * Copy the kernel size attribute structure (which might be larger -+ * than what user-space knows about) to user-space. -+ * -+ * Note that all cases are valid: user-space buffer can be larger or -+ * smaller than the kernel-space buffer. The usual case is that both -+ * have the same size. -+ */ -+static int -+sched_attr_copy_to_user(struct sched_attr __user *uattr, -+ struct sched_attr *kattr, -+ unsigned int usize) -+{ -+ unsigned int ksize = sizeof(*kattr); -+ -+ if (!access_ok(uattr, usize)) -+ return -EFAULT; -+ -+ /* -+ * sched_getattr() ABI forwards and backwards compatibility: -+ * -+ * If usize == ksize then we just copy everything to user-space and all is good. -+ * -+ * If usize < ksize then we only copy as much as user-space has space for, -+ * this keeps ABI compatibility as well. We skip the rest. -+ * -+ * If usize > ksize then user-space is using a newer version of the ABI, -+ * which part the kernel doesn't know about. Just ignore it - tooling can -+ * detect the kernel's knowledge of attributes from the attr->size value -+ * which is set to ksize in this case. 
-+ */ -+ kattr->size = min(usize, ksize); -+ -+ if (copy_to_user(uattr, kattr, kattr->size)) -+ return -EFAULT; -+ -+ return 0; -+} -+ -+/** -+ * sys_sched_getattr - similar to sched_getparam, but with sched_attr -+ * @pid: the pid in question. -+ * @uattr: structure containing the extended parameters. -+ * @usize: sizeof(attr) for fwd/bwd comp. -+ * @flags: for future extension. -+ */ -+SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, -+ unsigned int, usize, unsigned int, flags) -+{ -+ struct sched_attr kattr = { }; -+ struct task_struct *p; -+ int retval; -+ -+ if (!uattr || pid < 0 || usize > PAGE_SIZE || -+ usize < SCHED_ATTR_SIZE_VER0 || flags) -+ return -EINVAL; -+ -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ retval = -ESRCH; -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ kattr.sched_policy = p->policy; -+ if (rt_task(p)) -+ kattr.sched_priority = p->rt_priority; -+ else -+ kattr.sched_nice = task_nice(p); -+ -+ rcu_read_unlock(); -+ -+ return sched_attr_copy_to_user(uattr, &kattr, usize); -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) -+{ -+ cpumask_var_t cpus_allowed, new_mask; -+ struct task_struct *p; -+ int retval; -+ -+ rcu_read_lock(); -+ -+ p = find_process_by_pid(pid); -+ if (!p) { -+ rcu_read_unlock(); -+ return -ESRCH; -+ } -+ -+ /* Prevent p going away */ -+ get_task_struct(p); -+ rcu_read_unlock(); -+ -+ if (p->flags & PF_NO_SETAFFINITY) { -+ retval = -EINVAL; -+ goto out_put_task; -+ } -+ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { -+ retval = -ENOMEM; -+ goto out_put_task; -+ } -+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { -+ retval = -ENOMEM; -+ goto out_free_cpus_allowed; -+ } -+ retval = -EPERM; -+ if (!check_same_owner(p)) { -+ rcu_read_lock(); -+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { -+ rcu_read_unlock(); -+ goto 
out_unlock; -+ } -+ rcu_read_unlock(); -+ } -+ -+ retval = security_task_setscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ cpuset_cpus_allowed(p, cpus_allowed); -+ cpumask_and(new_mask, in_mask, cpus_allowed); -+again: -+ retval = __set_cpus_allowed_ptr(p, new_mask, true); -+ -+ if (!retval) { -+ cpuset_cpus_allowed(p, cpus_allowed); -+ if (!cpumask_subset(new_mask, cpus_allowed)) { -+ /* -+ * We must have raced with a concurrent cpuset -+ * update. Just reset the cpus_allowed to the -+ * cpuset's cpus_allowed -+ */ -+ cpumask_copy(new_mask, cpus_allowed); -+ goto again; -+ } -+ } -+out_unlock: -+ free_cpumask_var(new_mask); -+out_free_cpus_allowed: -+ free_cpumask_var(cpus_allowed); -+out_put_task: -+ put_task_struct(p); -+ return retval; -+} -+ -+static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, -+ cpumask_t *new_mask) -+{ -+ if (len < cpumask_size()) -+ cpumask_clear(new_mask); -+ else if (len > cpumask_size()) -+ len = cpumask_size(); -+ -+ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; -+} -+ -+ -+/** -+ * sys_sched_setaffinity - set the CPU affinity of a process -+ * @pid: pid of the process -+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr -+ * @user_mask_ptr: user-space pointer to the new CPU mask -+ * -+ * Return: 0 on success. An error code otherwise. 
-+ */ -+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, -+ unsigned long __user *, user_mask_ptr) -+{ -+ cpumask_var_t new_mask; -+ int retval; -+ -+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) -+ return -ENOMEM; -+ -+ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); -+ if (retval == 0) -+ retval = sched_setaffinity(pid, new_mask); -+ free_cpumask_var(new_mask); -+ return retval; -+} -+ -+long sched_getaffinity(pid_t pid, cpumask_t *mask) -+{ -+ struct task_struct *p; -+ unsigned long flags; -+ int retval; -+ -+ get_online_cpus(); -+ rcu_read_lock(); -+ -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ cpumask_and(mask, &p->cpus_mask, cpu_active_mask); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+out_unlock: -+ rcu_read_unlock(); -+ put_online_cpus(); -+ -+ return retval; -+} -+ -+/** -+ * sys_sched_getaffinity - get the CPU affinity of a process -+ * @pid: pid of the process -+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr -+ * @user_mask_ptr: user-space pointer to hold the current CPU mask -+ * -+ * Return: 0 on success. An error code otherwise. -+ */ -+SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, -+ unsigned long __user *, user_mask_ptr) -+{ -+ int ret; -+ cpumask_var_t mask; -+ -+ if ((len * BITS_PER_BYTE) < nr_cpu_ids) -+ return -EINVAL; -+ if (len & (sizeof(unsigned long)-1)) -+ return -EINVAL; -+ -+ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) -+ return -ENOMEM; -+ -+ ret = sched_getaffinity(pid, mask); -+ if (ret == 0) { -+ unsigned int retlen = min(len, cpumask_size()); -+ -+ if (copy_to_user(user_mask_ptr, mask, retlen)) -+ ret = -EFAULT; -+ else -+ ret = retlen; -+ } -+ free_cpumask_var(mask); -+ -+ return ret; -+} -+ -+/** -+ * sys_sched_yield - yield the current processor to other threads. 
-+ * -+ * This function yields the current CPU to other tasks. It does this by -+ * scheduling away the current task. If it still has the earliest deadline -+ * it will be scheduled again as the next task. -+ * -+ * Return: 0. -+ */ -+static void do_sched_yield(void) -+{ -+ struct rq *rq; -+ -+ if (!sched_yield_type) -+ return; -+ -+ local_irq_disable(); -+ rq = this_rq(); -+ rq_lock(rq); -+ -+ if (sched_yield_type > 1) -+ time_slice_expired(current, rq); -+ schedstat_inc(rq->yld_count); -+ -+ /* -+ * Since we are going to call schedule() anyway, there's -+ * no need to preempt or enable interrupts: -+ */ -+ preempt_disable(); -+ rq_unlock(rq); -+ sched_preempt_enable_no_resched(); -+ -+ schedule(); -+} -+ -+SYSCALL_DEFINE0(sched_yield) -+{ -+ do_sched_yield(); -+ return 0; -+} -+ -+#ifndef CONFIG_PREEMPTION -+int __sched _cond_resched(void) -+{ -+ if (should_resched(0)) { -+ preempt_schedule_common(); -+ return 1; -+ } -+ rcu_all_qs(); -+ return 0; -+} -+EXPORT_SYMBOL(_cond_resched); -+#endif -+ -+/* -+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, -+ * call schedule, and on return reacquire the lock. -+ * -+ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level -+ * operations here to prevent schedule() from being called twice (once via -+ * spin_unlock(), once by hand). -+ */ -+int __cond_resched_lock(spinlock_t *lock) -+{ -+ int resched = should_resched(PREEMPT_LOCK_OFFSET); -+ int ret = 0; -+ -+ lockdep_assert_held(lock); -+ -+ if (spin_needbreak(lock) || resched) { -+ spin_unlock(lock); -+ if (resched) -+ preempt_schedule_common(); -+ else -+ cpu_relax(); -+ ret = 1; -+ spin_lock(lock); -+ } -+ return ret; -+} -+EXPORT_SYMBOL(__cond_resched_lock); -+ -+/** -+ * yield - yield the current processor to other threads. -+ * -+ * Do not ever use this function, there's a 99% chance you're doing it wrong. 
-+ * -+ * The scheduler is at all times free to pick the calling task as the most -+ * eligible task to run, if removing the yield() call from your code breaks -+ * it, its already broken. -+ * -+ * Typical broken usage is: -+ * -+ * while (!event) -+ * yield(); -+ * -+ * where one assumes that yield() will let 'the other' process run that will -+ * make event true. If the current task is a SCHED_FIFO task that will never -+ * happen. Never use yield() as a progress guarantee!! -+ * -+ * If you want to use yield() to wait for something, use wait_event(). -+ * If you want to use yield() to be 'nice' for others, use cond_resched(). -+ * If you still want to use yield(), do not! -+ */ -+void __sched yield(void) -+{ -+ set_current_state(TASK_RUNNING); -+ do_sched_yield(); -+} -+EXPORT_SYMBOL(yield); -+ -+/** -+ * yield_to - yield the current processor to another thread in -+ * your thread group, or accelerate that thread toward the -+ * processor it's on. -+ * @p: target task -+ * @preempt: whether task preemption is allowed or not -+ * -+ * It's the caller's job to ensure that the target task struct -+ * can't go away on us before we can do any checks. -+ * -+ * Return: -+ * true (>0) if we indeed boosted the target task. -+ * false (0) if we failed to boost the target. -+ * -ESRCH if there's no task to yield to. -+ */ -+int __sched yield_to(struct task_struct *p, bool preempt) -+{ -+ struct task_struct *rq_p; -+ struct rq *rq, *p_rq; -+ unsigned long flags; -+ int yielded = 0; -+ -+ local_irq_save(flags); -+ rq = this_rq(); -+ -+again: -+ p_rq = task_rq(p); -+ /* -+ * If we're the only runnable task on the rq and target rq also -+ * has only one task, there's absolutely no point in yielding. 
-+ */ -+ if (task_running(p_rq, p) || p->state) { -+ yielded = -ESRCH; -+ goto out_irq; -+ } -+ -+ double_rq_lock(rq, p_rq); -+ if (unlikely(task_rq(p) != p_rq)) { -+ double_rq_unlock(rq, p_rq); -+ goto again; -+ } -+ -+ yielded = 1; -+ schedstat_inc(rq->yld_count); -+ rq_p = rq->curr; -+ if (p->deadline > rq_p->deadline) -+ p->deadline = rq_p->deadline; -+ p->time_slice += rq_p->time_slice; -+ if (p->time_slice > timeslice()) -+ p->time_slice = timeslice(); -+ time_slice_expired(rq_p, rq); -+ if (preempt && rq != p_rq) -+ resched_task(p_rq->curr); -+ double_rq_unlock(rq, p_rq); -+out_irq: -+ local_irq_restore(flags); -+ -+ if (yielded > 0) -+ schedule(); -+ return yielded; -+} -+EXPORT_SYMBOL_GPL(yield_to); -+ -+int io_schedule_prepare(void) -+{ -+ int old_iowait = current->in_iowait; -+ -+ current->in_iowait = 1; -+ blk_schedule_flush_plug(current); -+ -+ return old_iowait; -+} -+ -+void io_schedule_finish(int token) -+{ -+ current->in_iowait = token; -+} -+ -+/* -+ * This task is about to go to sleep on IO. Increment rq->nr_iowait so -+ * that process accounting knows that this is a task in IO wait state. -+ * -+ * But don't do that if it is a deliberate, throttling IO wait (this task -+ * has set its backing_dev_info: the queue against which it should throttle) -+ */ -+ -+long __sched io_schedule_timeout(long timeout) -+{ -+ int token; -+ long ret; -+ -+ token = io_schedule_prepare(); -+ ret = schedule_timeout(timeout); -+ io_schedule_finish(token); -+ -+ return ret; -+} -+EXPORT_SYMBOL(io_schedule_timeout); -+ -+void __sched io_schedule(void) -+{ -+ int token; -+ -+ token = io_schedule_prepare(); -+ schedule(); -+ io_schedule_finish(token); -+} -+EXPORT_SYMBOL(io_schedule); -+ -+/** -+ * sys_sched_get_priority_max - return maximum RT priority. -+ * @policy: scheduling class. -+ * -+ * Return: On success, this syscall returns the maximum -+ * rt_priority that can be used by a given scheduling class. -+ * On failure, a negative error code is returned. 
-+ */ -+SYSCALL_DEFINE1(sched_get_priority_max, int, policy) -+{ -+ int ret = -EINVAL; -+ -+ switch (policy) { -+ case SCHED_FIFO: -+ case SCHED_RR: -+ ret = MAX_USER_RT_PRIO-1; -+ break; -+ case SCHED_NORMAL: -+ case SCHED_BATCH: -+ case SCHED_ISO: -+ case SCHED_IDLEPRIO: -+ ret = 0; -+ break; -+ } -+ return ret; -+} -+ -+/** -+ * sys_sched_get_priority_min - return minimum RT priority. -+ * @policy: scheduling class. -+ * -+ * Return: On success, this syscall returns the minimum -+ * rt_priority that can be used by a given scheduling class. -+ * On failure, a negative error code is returned. -+ */ -+SYSCALL_DEFINE1(sched_get_priority_min, int, policy) -+{ -+ int ret = -EINVAL; -+ -+ switch (policy) { -+ case SCHED_FIFO: -+ case SCHED_RR: -+ ret = 1; -+ break; -+ case SCHED_NORMAL: -+ case SCHED_BATCH: -+ case SCHED_ISO: -+ case SCHED_IDLEPRIO: -+ ret = 0; -+ break; -+ } -+ return ret; -+} -+ -+static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) -+{ -+ struct task_struct *p; -+ unsigned int time_slice; -+ struct rq_flags rf; -+ struct rq *rq; -+ int retval; -+ -+ if (pid < 0) -+ return -EINVAL; -+ -+ retval = -ESRCH; -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ rq = task_rq_lock(p, &rf); -+ time_slice = p->policy == SCHED_FIFO ? 0 : MS_TO_NS(task_timeslice(p)); -+ task_rq_unlock(rq, p, &rf); -+ -+ rcu_read_unlock(); -+ *t = ns_to_timespec64(time_slice); -+ return 0; -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+/** -+ * sys_sched_rr_get_interval - return the default timeslice of a process. -+ * @pid: pid of the process. -+ * @interval: userspace pointer to the timeslice value. -+ * -+ * this syscall writes the default timeslice value of a given process -+ * into the user-space timespec buffer. A value of '0' means infinity. -+ * -+ * Return: On success, 0 and the timeslice is in @interval. 
Otherwise, -+ * an error code. -+ */ -+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, -+ struct __kernel_timespec __user *, interval) -+{ -+ struct timespec64 t; -+ int retval = sched_rr_get_interval(pid, &t); -+ -+ if (retval == 0) -+ retval = put_timespec64(&t, interval); -+ -+ return retval; -+} -+ -+#ifdef CONFIG_COMPAT_32BIT_TIME -+SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, -+ struct old_timespec32 __user *, interval) -+{ -+ struct timespec64 t; -+ int retval = sched_rr_get_interval(pid, &t); -+ -+ if (retval == 0) -+ retval = put_old_timespec32(&t, interval); -+ return retval; -+} -+#endif -+ -+void sched_show_task(struct task_struct *p) -+{ -+ unsigned long free = 0; -+ int ppid; -+ -+ if (!try_get_task_stack(p)) -+ return; -+ -+ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); -+ -+ if (p->state == TASK_RUNNING) -+ printk(KERN_CONT " running task "); -+#ifdef CONFIG_DEBUG_STACK_USAGE -+ free = stack_not_used(p); -+#endif -+ ppid = 0; -+ rcu_read_lock(); -+ if (pid_alive(p)) -+ ppid = task_pid_nr(rcu_dereference(p->real_parent)); -+ rcu_read_unlock(); -+ pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n", -+ free, task_pid_nr(p), ppid, -+ (unsigned long)task_thread_info(p)->flags); -+ -+ print_worker_info(KERN_INFO, p); -+ show_stack(p, NULL, KERN_INFO); -+ put_task_stack(p); -+} -+EXPORT_SYMBOL_GPL(sched_show_task); -+ -+static inline bool -+state_filter_match(unsigned long state_filter, struct task_struct *p) -+{ -+ /* no filter, everything matches */ -+ if (!state_filter) -+ return true; -+ -+ /* filter, but doesn't match */ -+ if (!(p->state & state_filter)) -+ return false; -+ -+ /* -+ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows -+ * TASK_KILLABLE). 
-+ */ -+ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) -+ return false; -+ -+ return true; -+} -+ -+void show_state_filter(unsigned long state_filter) -+{ -+ struct task_struct *g, *p; -+ -+ rcu_read_lock(); -+ for_each_process_thread(g, p) { -+ /* -+ * reset the NMI-timeout, listing all files on a slow -+ * console might take a lot of time: -+ * Also, reset softlockup watchdogs on all CPUs, because -+ * another CPU might be blocked waiting for us to process -+ * an IPI. -+ */ -+ touch_nmi_watchdog(); -+ touch_all_softlockup_watchdogs(); -+ if (state_filter_match(state_filter, p)) -+ sched_show_task(p); -+ } -+ -+ rcu_read_unlock(); -+ /* -+ * Only show locks if all tasks are dumped: -+ */ -+ if (!state_filter) -+ debug_show_all_locks(); -+} -+ -+void dump_cpu_task(int cpu) -+{ -+ pr_info("Task dump for CPU %d:\n", cpu); -+ sched_show_task(cpu_curr(cpu)); -+} -+ -+#ifdef CONFIG_SMP -+void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ cpumask_copy(&p->cpus_mask, new_mask); -+ p->nr_cpus_allowed = cpumask_weight(new_mask); -+} -+ -+void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ struct rq *rq = task_rq(p); -+ -+ lockdep_assert_held(&p->pi_lock); -+ -+ cpumask_copy(&p->cpus_mask, new_mask); -+ -+ if (task_queued(p)) { -+ /* -+ * Because __kthread_bind() calls this on blocked tasks without -+ * holding rq->lock. -+ */ -+ lockdep_assert_held(rq->lock); -+ } -+} -+ -+/* -+ * Calling do_set_cpus_allowed from outside the scheduler code should not be -+ * called on a running or queued task. We should be holding pi_lock. 
-+ */ -+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ __do_set_cpus_allowed(p, new_mask); -+ if (needs_other_cpu(p, task_cpu(p))) { -+ struct rq *rq; -+ -+ rq = __task_rq_lock(p, NULL); -+ set_task_cpu(p, valid_task_cpu(p)); -+ resched_task(p); -+ __task_rq_unlock(rq, NULL); -+ } -+} -+#endif -+ -+/** -+ * init_idle - set up an idle thread for a given CPU -+ * @idle: task in question -+ * @cpu: cpu the idle task belongs to -+ * -+ * NOTE: this function does not set the idle thread's NEED_RESCHED -+ * flag, to make booting more robust. -+ */ -+void init_idle(struct task_struct *idle, int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ raw_spin_lock_irqsave(&idle->pi_lock, flags); -+ raw_spin_lock(rq->lock); -+ idle->last_ran = rq->niffies; -+ time_slice_expired(idle, rq); -+ idle->state = TASK_RUNNING; -+ /* Setting prio to illegal value shouldn't matter when never queued */ -+ idle->prio = PRIO_LIMIT; -+ idle->flags |= PF_IDLE; -+ -+ scs_task_reset(idle); -+ kasan_unpoison_task_stack(idle); -+ -+#ifdef CONFIG_SMP -+ /* -+ * It's possible that init_idle() gets called multiple times on a task, -+ * in that case do_set_cpus_allowed() will not do the right thing. -+ * -+ * And since this is boot we can forgo the serialisation. -+ */ -+ set_cpus_allowed_common(idle, cpumask_of(cpu)); -+#ifdef CONFIG_SMT_NICE -+ idle->smt_bias = 0; -+#endif -+#endif -+ set_rq_task(rq, idle); -+ -+ /* Silence PROVE_RCU */ -+ rcu_read_lock(); -+ set_task_cpu(idle, cpu); -+ rcu_read_unlock(); -+ -+ rq->idle = idle; -+ rcu_assign_pointer(rq->curr, idle); -+ idle->on_rq = TASK_ON_RQ_QUEUED; -+ raw_spin_unlock(rq->lock); -+ raw_spin_unlock_irqrestore(&idle->pi_lock, flags); -+ -+ /* Set the preempt count _outside_ the spinlocks! 
*/ -+ init_idle_preempt_count(idle, cpu); -+ -+ ftrace_graph_init_idle_task(idle, cpu); -+ vtime_init_idle(idle, cpu); -+#ifdef CONFIG_SMP -+ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); -+#endif -+} -+ -+int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, -+ const struct cpumask __maybe_unused *trial) -+{ -+ return 1; -+} -+ -+int task_can_attach(struct task_struct *p, -+ const struct cpumask *cs_cpus_allowed) -+{ -+ int ret = 0; -+ -+ /* -+ * Kthreads which disallow setaffinity shouldn't be moved -+ * to a new cpuset; we don't want to change their CPU -+ * affinity and isolating such threads by their set of -+ * allowed nodes is unnecessary. Thus, cpusets are not -+ * applicable for such threads. This prevents checking for -+ * success of set_cpus_allowed_ptr() on all attached tasks -+ * before cpus_mask may be changed. -+ */ -+ if (p->flags & PF_NO_SETAFFINITY) -+ ret = -EINVAL; -+ -+ return ret; -+} -+ -+void resched_cpu(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ struct rq_flags rf; -+ -+ rq_lock_irqsave(rq, &rf); -+ if (cpu_online(cpu) || cpu == smp_processor_id()) -+ resched_curr(rq); -+ rq_unlock_irqrestore(rq, &rf); -+} -+ -+#ifdef CONFIG_SMP -+#ifdef CONFIG_NO_HZ_COMMON -+void select_nohz_load_balancer(int stop_tick) -+{ -+} -+ -+void set_cpu_sd_state_idle(void) {} -+void nohz_balance_enter_idle(int cpu) {} -+ -+/* -+ * In the semi idle case, use the nearest busy CPU for migrating timers -+ * from an idle CPU. This is good for power-savings. -+ * -+ * We don't do similar optimization for completely idle system, as -+ * selecting an idle CPU will add more delays to the timers than intended -+ * (as that CPU's timer base may not be uptodate wrt jiffies etc). 
-+ */ -+int get_nohz_timer_target(void) -+{ -+ int i, cpu = smp_processor_id(), default_cpu = -1; -+ struct sched_domain *sd; -+ -+ if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) { -+ if (!idle_cpu(cpu)) -+ return cpu; -+ default_cpu = cpu; -+ } -+ -+ rcu_read_lock(); -+ for_each_domain(cpu, sd) { -+ for_each_cpu_and(i, sched_domain_span(sd), -+ housekeeping_cpumask(HK_FLAG_TIMER)) { -+ if (cpu == i) -+ continue; -+ -+ if (!idle_cpu(i)) { -+ cpu = i; -+ goto unlock; -+ } -+ } -+ } -+ -+ if (default_cpu == -1) -+ default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER); -+ cpu = default_cpu; -+unlock: -+ rcu_read_unlock(); -+ return cpu; -+} -+ -+/* -+ * When add_timer_on() enqueues a timer into the timer wheel of an -+ * idle CPU then this timer might expire before the next timer event -+ * which is scheduled to wake up that CPU. In case of a completely -+ * idle system the next event might even be infinite time into the -+ * future. wake_up_idle_cpu() ensures that the CPU is woken up and -+ * leaves the inner idle loop so the newly added timer is taken into -+ * account when the CPU goes back to idle and evaluates the timer -+ * wheel for the next timer event. -+ */ -+void wake_up_idle_cpu(int cpu) -+{ -+ if (cpu == smp_processor_id()) -+ return; -+ -+ if (set_nr_and_not_polling(cpu_rq(cpu)->idle)) -+ smp_sched_reschedule(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); -+} -+ -+static bool wake_up_full_nohz_cpu(int cpu) -+{ -+ /* -+ * We just need the target to call irq_exit() and re-evaluate -+ * the next tick. The nohz full kick at least implies that. -+ * If needed we can still optimize that later with an -+ * empty IRQ. -+ */ -+ if (cpu_is_offline(cpu)) -+ return true; /* Don't try to wake offline CPUs. */ -+ if (tick_nohz_full_cpu(cpu)) { -+ if (cpu != smp_processor_id() || -+ tick_nohz_tick_stopped()) -+ tick_nohz_full_kick_cpu(cpu); -+ return true; -+ } -+ -+ return false; -+} -+ -+/* -+ * Wake up the specified CPU. 
If the CPU is going offline, it is the -+ * caller's responsibility to deal with the lost wakeup, for example, -+ * by hooking into the CPU_DEAD notifier like timers and hrtimers do. -+ */ -+void wake_up_nohz_cpu(int cpu) -+{ -+ if (!wake_up_full_nohz_cpu(cpu)) -+ wake_up_idle_cpu(cpu); -+} -+#endif /* CONFIG_NO_HZ_COMMON */ -+ -+/* -+ * Change a given task's CPU affinity. Migrate the thread to a -+ * proper CPU and schedule it away if the CPU it's executing on -+ * is removed from the allowed bitmask. -+ * -+ * NOTE: the caller must have a valid reference to the task, the -+ * task must not exit() & deallocate itself prematurely. The -+ * call is not atomic; no spinlocks may be held. -+ */ -+static int __set_cpus_allowed_ptr(struct task_struct *p, -+ const struct cpumask *new_mask, bool check) -+{ -+ const struct cpumask *cpu_valid_mask = cpu_active_mask; -+ bool queued = false, running_wrong = false, kthread; -+ unsigned int dest_cpu; -+ struct rq_flags rf; -+ struct rq *rq; -+ int ret = 0; -+ -+ rq = task_rq_lock(p, &rf); -+ update_rq_clock(rq); -+ -+ kthread = !!(p->flags & PF_KTHREAD); -+ if (kthread) { -+ /* -+ * Kernel threads are allowed on online && !active CPUs -+ */ -+ cpu_valid_mask = cpu_online_mask; -+ } -+ -+ /* -+ * Must re-check here, to close a race against __kthread_bind(), -+ * sched_setaffinity() is not guaranteed to observe the flag. -+ */ -+ if (check && (p->flags & PF_NO_SETAFFINITY)) { -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ if (cpumask_equal(&p->cpus_mask, new_mask)) -+ goto out; -+ -+ /* -+ * Picking a ~random cpu helps in cases where we are changing affinity -+ * for groups of tasks (ie. cpuset), so that load balancing is not -+ * immediately required to distribute the tasks within their new mask. 
-+ */ -+ dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask); -+ if (dest_cpu >= nr_cpu_ids) { -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ queued = task_queued(p); -+ __do_set_cpus_allowed(p, new_mask); -+ -+ if (kthread) { -+ /* -+ * For kernel threads that do indeed end up on online && -+ * !active we want to ensure they are strict per-CPU threads. -+ */ -+ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && -+ !cpumask_intersects(new_mask, cpu_active_mask) && -+ p->nr_cpus_allowed != 1); -+ } -+ -+ /* Can the task run on the task's current CPU? If so, we're done */ -+ if (cpumask_test_cpu(task_cpu(p), new_mask)) -+ goto out; -+ -+ if (task_running(rq, p)) { -+ /* Task is running on the wrong cpu now, reschedule it. */ -+ if (rq == this_rq()) { -+ set_task_cpu(p, dest_cpu); -+ set_tsk_need_resched(p); -+ running_wrong = true; -+ } else -+ resched_task(p); -+ } else { -+ if (queued) { -+ /* -+ * Switch runqueue locks after dequeueing the task -+ * here while still holding the pi_lock to be holding -+ * the correct lock for enqueueing. -+ */ -+ dequeue_task(rq, p, 0); -+ rq_unlock(rq); -+ -+ rq = cpu_rq(dest_cpu); -+ rq_lock(rq); -+ } -+ set_task_cpu(p, dest_cpu); -+ if (queued) -+ enqueue_task(rq, p, 0); -+ } -+ if (queued) -+ try_preempt(p, rq); -+ if (running_wrong) -+ preempt_disable(); -+out: -+ task_rq_unlock(rq, p, &rf); -+ -+ if (running_wrong) { -+ __schedule(true); -+ preempt_enable(); -+ } -+ -+ return ret; -+} -+ -+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ return __set_cpus_allowed_ptr(p, new_mask, false); -+} -+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); -+ -+#ifdef CONFIG_HOTPLUG_CPU -+/* -+ * Run through task list and find tasks affined to the dead cpu, then remove -+ * that cpu from the list, enable cpu0 and set the zerobound flag. Must hold -+ * cpu 0 and src_cpu's runqueue locks. We should be holding both rq lock and -+ * pi_lock to change cpus_mask but it's not going to matter here. 
-+ */ -+static void bind_zero(int src_cpu) -+{ -+ struct task_struct *p, *t; -+ struct rq *rq0; -+ int bound = 0; -+ -+ if (src_cpu == 0) -+ return; -+ -+ rq0 = cpu_rq(0); -+ -+ do_each_thread(t, p) { -+ if (cpumask_test_cpu(src_cpu, p->cpus_ptr)) { -+ bool local = (task_cpu(p) == src_cpu); -+ struct rq *rq = task_rq(p); -+ -+ /* task_running is the cpu stopper thread */ -+ if (local && task_running(rq, p)) -+ continue; -+ atomic_clear_cpu(src_cpu, &p->cpus_mask); -+ atomic_set_cpu(0, &p->cpus_mask); -+ p->zerobound = true; -+ bound++; -+ if (local) { -+ bool queued = task_queued(p); -+ -+ if (queued) -+ dequeue_task(rq, p, 0); -+ set_task_cpu(p, 0); -+ if (queued) -+ enqueue_task(rq0, p, 0); -+ } -+ } -+ } while_each_thread(t, p); -+ -+ if (bound) { -+ printk(KERN_INFO "MuQSS removed affinity for %d processes to cpu %d\n", -+ bound, src_cpu); -+ } -+} -+ -+/* Find processes with the zerobound flag and reenable their affinity for the -+ * CPU coming alive. */ -+static void unbind_zero(int src_cpu) -+{ -+ int unbound = 0, zerobound = 0; -+ struct task_struct *p, *t; -+ -+ if (src_cpu == 0) -+ return; -+ -+ do_each_thread(t, p) { -+ if (!p->mm) -+ p->zerobound = false; -+ if (p->zerobound) { -+ unbound++; -+ cpumask_set_cpu(src_cpu, &p->cpus_mask); -+ /* Once every CPU affinity has been re-enabled, remove -+ * the zerobound flag */ -+ if (cpumask_subset(cpu_possible_mask, p->cpus_ptr)) { -+ p->zerobound = false; -+ zerobound++; -+ } -+ } -+ } while_each_thread(t, p); -+ -+ if (unbound) { -+ printk(KERN_INFO "MuQSS added affinity for %d processes to cpu %d\n", -+ unbound, src_cpu); -+ } -+ if (zerobound) { -+ printk(KERN_INFO "MuQSS released forced binding to cpu0 for %d processes\n", -+ zerobound); -+ } -+} -+ -+/* -+ * Ensure that the idle task is using init_mm right before its cpu goes -+ * offline. 
-+ */ -+void idle_task_exit(void) -+{ -+ struct mm_struct *mm = current->active_mm; -+ -+ BUG_ON(cpu_online(smp_processor_id())); -+ BUG_ON(current != this_rq()->idle); -+ -+ if (mm != &init_mm) { -+ switch_mm(mm, &init_mm, current); -+ finish_arch_post_lock_switch(); -+ } -+ -+ /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ -+} -+#else /* CONFIG_HOTPLUG_CPU */ -+static void unbind_zero(int src_cpu) {} -+#endif /* CONFIG_HOTPLUG_CPU */ -+ -+void sched_set_stop_task(int cpu, struct task_struct *stop) -+{ -+ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; -+ struct sched_param start_param = { .sched_priority = 0 }; -+ struct task_struct *old_stop = cpu_rq(cpu)->stop; -+ -+ if (stop) { -+ /* -+ * Make it appear like a SCHED_FIFO task, its something -+ * userspace knows about and won't get confused about. -+ * -+ * Also, it will make PI more or less work without too -+ * much confusion -- but then, stop work should not -+ * rely on PI working anyway. -+ */ -+ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); -+ } -+ -+ cpu_rq(cpu)->stop = stop; -+ -+ if (old_stop) { -+ /* -+ * Reset it back to a normal scheduling policy so that -+ * it can die in pieces. 
-+ */ -+ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); -+ } -+} -+ -+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) -+ -+static struct ctl_table sd_ctl_dir[] = { -+ { -+ .procname = "sched_domain", -+ .mode = 0555, -+ }, -+ {} -+}; -+ -+static struct ctl_table sd_ctl_root[] = { -+ { -+ .procname = "kernel", -+ .mode = 0555, -+ .child = sd_ctl_dir, -+ }, -+ {} -+}; -+ -+static struct ctl_table *sd_alloc_ctl_entry(int n) -+{ -+ struct ctl_table *entry = -+ kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); -+ -+ return entry; -+} -+ -+static void sd_free_ctl_entry(struct ctl_table **tablep) -+{ -+ struct ctl_table *entry; -+ -+ /* -+ * In the intermediate directories, both the child directory and -+ * procname are dynamically allocated and could fail but the mode -+ * will always be set. In the lowest directory the names are -+ * static strings and all have proc handlers. -+ */ -+ for (entry = *tablep; entry->mode; entry++) { -+ if (entry->child) -+ sd_free_ctl_entry(&entry->child); -+ if (entry->proc_handler == NULL) -+ kfree(entry->procname); -+ } -+ -+ kfree(*tablep); -+ *tablep = NULL; -+} -+ -+static void -+set_table_entry(struct ctl_table *entry, -+ const char *procname, void *data, int maxlen, -+ umode_t mode, proc_handler *proc_handler) -+{ -+ entry->procname = procname; -+ entry->data = data; -+ entry->maxlen = maxlen; -+ entry->mode = mode; -+ entry->proc_handler = proc_handler; -+} -+ -+static struct ctl_table * -+sd_alloc_ctl_domain_table(struct sched_domain *sd) -+{ -+ struct ctl_table *table = sd_alloc_ctl_entry(9); -+ -+ if (table == NULL) -+ return NULL; -+ -+ set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax); -+ set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax); -+ set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); -+ set_table_entry(&table[3], "imbalance_pct", 
&sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); -+ set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); -+ set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); -+ set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax); -+ set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring); -+ /* &table[8] is terminator */ -+ -+ return table; -+} -+ -+static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) -+{ -+ struct ctl_table *entry, *table; -+ struct sched_domain *sd; -+ int domain_num = 0, i; -+ char buf[32]; -+ -+ for_each_domain(cpu, sd) -+ domain_num++; -+ entry = table = sd_alloc_ctl_entry(domain_num + 1); -+ if (table == NULL) -+ return NULL; -+ -+ i = 0; -+ for_each_domain(cpu, sd) { -+ snprintf(buf, 32, "domain%d", i); -+ entry->procname = kstrdup(buf, GFP_KERNEL); -+ entry->mode = 0555; -+ entry->child = sd_alloc_ctl_domain_table(sd); -+ entry++; -+ i++; -+ } -+ return table; -+} -+ -+static cpumask_var_t sd_sysctl_cpus; -+static struct ctl_table_header *sd_sysctl_header; -+ -+void register_sched_domain_sysctl(void) -+{ -+ static struct ctl_table *cpu_entries; -+ static struct ctl_table **cpu_idx; -+ char buf[32]; -+ int i; -+ -+ if (!cpu_entries) { -+ cpu_entries = sd_alloc_ctl_entry(num_possible_cpus() + 1); -+ if (!cpu_entries) -+ return; -+ -+ WARN_ON(sd_ctl_dir[0].child); -+ sd_ctl_dir[0].child = cpu_entries; -+ } -+ -+ if (!cpu_idx) { -+ struct ctl_table *e = cpu_entries; -+ -+ cpu_idx = kcalloc(nr_cpu_ids, sizeof(struct ctl_table*), GFP_KERNEL); -+ if (!cpu_idx) -+ return; -+ -+ /* deal with sparse possible map */ -+ for_each_possible_cpu(i) { -+ cpu_idx[i] = e; -+ e++; -+ } -+ } -+ -+ if (!cpumask_available(sd_sysctl_cpus)) { -+ if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL)) -+ return; -+ -+ /* init to possible to not have holes in @cpu_entries */ 
-+ cpumask_copy(sd_sysctl_cpus, cpu_possible_mask); -+ } -+ -+ for_each_cpu(i, sd_sysctl_cpus) { -+ struct ctl_table *e = cpu_idx[i]; -+ -+ if (e->child) -+ sd_free_ctl_entry(&e->child); -+ -+ if (!e->procname) { -+ snprintf(buf, 32, "cpu%d", i); -+ e->procname = kstrdup(buf, GFP_KERNEL); -+ } -+ e->mode = 0555; -+ e->child = sd_alloc_ctl_cpu_table(i); -+ -+ __cpumask_clear_cpu(i, sd_sysctl_cpus); -+ } -+ -+ WARN_ON(sd_sysctl_header); -+ sd_sysctl_header = register_sysctl_table(sd_ctl_root); -+} -+ -+void dirty_sched_domain_sysctl(int cpu) -+{ -+ if (cpumask_available(sd_sysctl_cpus)) -+ __cpumask_set_cpu(cpu, sd_sysctl_cpus); -+} -+ -+/* may be called multiple times per register */ -+void unregister_sched_domain_sysctl(void) -+{ -+ unregister_sysctl_table(sd_sysctl_header); -+ sd_sysctl_header = NULL; -+} -+#endif /* CONFIG_SYSCTL */ -+ -+void set_rq_online(struct rq *rq) -+{ -+ if (!rq->online) { -+ cpumask_set_cpu(cpu_of(rq), rq->rd->online); -+ rq->online = true; -+ } -+} -+ -+void set_rq_offline(struct rq *rq) -+{ -+ if (rq->online) { -+ int cpu = cpu_of(rq); -+ -+ cpumask_clear_cpu(cpu, rq->rd->online); -+ rq->online = false; -+ clear_cpuidle_map(cpu); -+ } -+} -+ -+/* -+ * used to mark begin/end of suspend/resume: -+ */ -+static int num_cpus_frozen; -+ -+/* -+ * Update cpusets according to cpu_active mask. If cpusets are -+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper -+ * around partition_sched_domains(). -+ * -+ * If we come here as part of a suspend/resume, don't touch cpusets because we -+ * want to restore it back to its original state upon resume anyway. -+ */ -+static void cpuset_cpu_active(void) -+{ -+ if (cpuhp_tasks_frozen) { -+ /* -+ * num_cpus_frozen tracks how many CPUs are involved in suspend -+ * resume sequence. As long as this is not the last online -+ * operation in the resume sequence, just build a single sched -+ * domain, ignoring cpusets. 
-+ */ -+ partition_sched_domains(1, NULL, NULL); -+ if (--num_cpus_frozen) -+ return; -+ /* -+ * This is the last CPU online operation. So fall through and -+ * restore the original sched domains by considering the -+ * cpuset configurations. -+ */ -+ cpuset_force_rebuild(); -+ } -+ -+ cpuset_update_active_cpus(); -+} -+ -+static int cpuset_cpu_inactive(unsigned int cpu) -+{ -+ if (!cpuhp_tasks_frozen) { -+ cpuset_update_active_cpus(); -+ } else { -+ num_cpus_frozen++; -+ partition_sched_domains(1, NULL, NULL); -+ } -+ return 0; -+} -+ -+int sched_cpu_activate(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ struct rq_flags rf; -+ -+#ifdef CONFIG_SCHED_SMT -+ /* -+ * When going up, increment the number of cores with SMT present. -+ */ -+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) -+ static_branch_inc_cpuslocked(&sched_smt_present); -+#endif -+ set_cpu_active(cpu, true); -+ -+ if (sched_smp_initialized) { -+ sched_domains_numa_masks_set(cpu); -+ cpuset_cpu_active(); -+ } -+ -+ /* -+ * Put the rq online, if not already. This happens: -+ * -+ * 1) In the early boot process, because we build the real domains -+ * after all CPUs have been brought up. -+ * -+ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the -+ * domains. -+ */ -+ rq_lock_irqsave(rq, &rf); -+ if (rq->rd) { -+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); -+ set_rq_online(rq); -+ } -+ unbind_zero(cpu); -+ rq_unlock_irqrestore(rq, &rf); -+ -+ return 0; -+} -+ -+int sched_cpu_deactivate(unsigned int cpu) -+{ -+ int ret; -+ -+ set_cpu_active(cpu, false); -+ /* -+ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU -+ * users of this state to go away such that all new such users will -+ * observe it. -+ * -+ * Do sync before park smpboot threads to take care the rcu boost case. -+ */ -+ synchronize_rcu(); -+ -+#ifdef CONFIG_SCHED_SMT -+ /* -+ * When going down, decrement the number of cores with SMT present. 
-+ */ -+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) -+ static_branch_dec_cpuslocked(&sched_smt_present); -+#endif -+ -+ if (!sched_smp_initialized) -+ return 0; -+ -+ ret = cpuset_cpu_inactive(cpu); -+ if (ret) { -+ set_cpu_active(cpu, true); -+ return ret; -+ } -+ sched_domains_numa_masks_clear(cpu); -+ return 0; -+} -+ -+int sched_cpu_starting(unsigned int cpu) -+{ -+ sched_tick_start(cpu); -+ return 0; -+} -+ -+#ifdef CONFIG_HOTPLUG_CPU -+int sched_cpu_dying(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ /* Handle pending wakeups and then migrate everything off */ -+ sched_tick_stop(cpu); -+ -+ local_irq_save(flags); -+ double_rq_lock(rq, cpu_rq(0)); -+ if (rq->rd) { -+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); -+ set_rq_offline(rq); -+ } -+ bind_zero(cpu); -+ double_rq_unlock(rq, cpu_rq(0)); -+ sched_start_tick(rq, cpu); -+ hrexpiry_clear(rq); -+ local_irq_restore(flags); -+ -+ return 0; -+} -+#endif -+ -+#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) -+/* -+ * Cheaper version of the below functions in case support for SMT and MC is -+ * compiled in but CPUs have no siblings. 
-+ */ -+static bool sole_cpu_idle(struct rq *rq) -+{ -+ return rq_idle(rq); -+} -+#endif -+#ifdef CONFIG_SCHED_SMT -+static const cpumask_t *thread_cpumask(int cpu) -+{ -+ return topology_sibling_cpumask(cpu); -+} -+/* All this CPU's SMT siblings are idle */ -+static bool siblings_cpu_idle(struct rq *rq) -+{ -+ return cpumask_subset(&rq->thread_mask, &cpu_idle_map); -+} -+#endif -+#ifdef CONFIG_SCHED_MC -+static const cpumask_t *core_cpumask(int cpu) -+{ -+ return topology_core_cpumask(cpu); -+} -+/* All this CPU's shared cache siblings are idle */ -+static bool cache_cpu_idle(struct rq *rq) -+{ -+ return cpumask_subset(&rq->core_mask, &cpu_idle_map); -+} -+/* MC siblings CPU mask which share the same LLC */ -+static const cpumask_t *llc_core_cpumask(int cpu) -+{ -+#ifdef CONFIG_X86 -+ return per_cpu(cpu_llc_shared_map, cpu); -+#else -+ return topology_core_cpumask(cpu); -+#endif -+} -+#endif -+ -+enum sched_domain_level { -+ SD_LV_NONE = 0, -+ SD_LV_SIBLING, -+ SD_LV_MC, -+ SD_LV_BOOK, -+ SD_LV_CPU, -+ SD_LV_NODE, -+ SD_LV_ALLNODES, -+ SD_LV_MAX -+}; -+ -+/* -+ * Set up the relative cache distance of each online cpu from each -+ * other in a simple array for quick lookup. Locality is determined -+ * by the closest sched_domain that CPUs are separated by. CPUs with -+ * shared cache in SMT and MC are treated as local. Separate CPUs -+ * (within the same package or physically) within the same node are -+ * treated as not local. CPUs not even in the same domain (different -+ * nodes) are treated as very distant. 
-+ */ -+static void __init select_leaders(void) -+{ -+ struct rq *rq, *other_rq, *leader; -+ struct sched_domain *sd; -+ int cpu, other_cpu; -+#ifdef CONFIG_SCHED_SMT -+ bool smt_threads = false; -+#endif -+ -+ for (cpu = 0; cpu < num_online_cpus(); cpu++) { -+ rq = cpu_rq(cpu); -+ leader = NULL; -+ /* First check if this cpu is in the same node */ -+ for_each_domain(cpu, sd) { -+ if (sd->level > SD_LV_MC) -+ continue; -+ if (rqshare != RQSHARE_ALL) -+ leader = NULL; -+ /* Set locality to local node if not already found lower */ -+ for_each_cpu(other_cpu, sched_domain_span(sd)) { -+ if (rqshare >= RQSHARE_SMP) { -+ other_rq = cpu_rq(other_cpu); -+ -+ /* Set the smp_leader to the first CPU */ -+ if (!leader) -+ leader = rq; -+ if (!other_rq->smp_leader) -+ other_rq->smp_leader = leader; -+ } -+ if (rq->cpu_locality[other_cpu] > LOCALITY_SMP) -+ rq->cpu_locality[other_cpu] = LOCALITY_SMP; -+ } -+ } -+ -+ /* -+ * Each runqueue has its own function in case it doesn't have -+ * siblings of its own allowing mixed topologies. 
-+ */ -+#ifdef CONFIG_SCHED_MC -+ leader = NULL; -+ if (cpumask_weight(core_cpumask(cpu)) > 1) { -+ cpumask_copy(&rq->core_mask, llc_core_cpumask(cpu)); -+ cpumask_clear_cpu(cpu, &rq->core_mask); -+ for_each_cpu(other_cpu, core_cpumask(cpu)) { -+ if (rqshare == RQSHARE_MC || -+ (rqshare == RQSHARE_MC_LLC && cpumask_test_cpu(other_cpu, llc_core_cpumask(cpu)))) { -+ other_rq = cpu_rq(other_cpu); -+ -+ /* Set the mc_leader to the first CPU */ -+ if (!leader) -+ leader = rq; -+ if (!other_rq->mc_leader) -+ other_rq->mc_leader = leader; -+ } -+ if (rq->cpu_locality[other_cpu] > LOCALITY_MC) { -+ /* this is to get LLC into play even in case LLC sharing is not used */ -+ if (cpumask_test_cpu(other_cpu, llc_core_cpumask(cpu))) -+ rq->cpu_locality[other_cpu] = LOCALITY_MC_LLC; -+ else -+ rq->cpu_locality[other_cpu] = LOCALITY_MC; -+ } -+ } -+ rq->cache_idle = cache_cpu_idle; -+ } -+#endif -+#ifdef CONFIG_SCHED_SMT -+ leader = NULL; -+ if (cpumask_weight(thread_cpumask(cpu)) > 1) { -+ cpumask_copy(&rq->thread_mask, thread_cpumask(cpu)); -+ cpumask_clear_cpu(cpu, &rq->thread_mask); -+ for_each_cpu(other_cpu, thread_cpumask(cpu)) { -+ if (rqshare == RQSHARE_SMT) { -+ other_rq = cpu_rq(other_cpu); -+ -+ /* Set the smt_leader to the first CPU */ -+ if (!leader) -+ leader = rq; -+ if (!other_rq->smt_leader) -+ other_rq->smt_leader = leader; -+ } -+ if (rq->cpu_locality[other_cpu] > LOCALITY_SMT) -+ rq->cpu_locality[other_cpu] = LOCALITY_SMT; -+ } -+ rq->siblings_idle = siblings_cpu_idle; -+ smt_threads = true; -+ } -+#endif -+ } -+ -+#ifdef CONFIG_SMT_NICE -+ if (smt_threads) { -+ check_siblings = &check_smt_siblings; -+ wake_siblings = &wake_smt_siblings; -+ smt_schedule = &smt_should_schedule; -+ } -+#endif -+ -+ for_each_online_cpu(cpu) { -+ rq = cpu_rq(cpu); -+ for_each_online_cpu(other_cpu) { -+ printk(KERN_DEBUG "MuQSS locality CPU %d to %d: %d\n", cpu, other_cpu, rq->cpu_locality[other_cpu]); -+ } -+ } -+} -+ -+/* FIXME freeing locked spinlock */ -+static void __init 
share_and_free_rq(struct rq *leader, struct rq *rq) -+{ -+ WARN_ON(rq->nr_running > 0); -+ -+ kfree(rq->node); -+ kfree(rq->sl); -+ kfree(rq->lock); -+ rq->node = leader->node; -+ rq->sl = leader->sl; -+ rq->lock = leader->lock; -+ rq->is_leader = false; -+ barrier(); -+ /* To make up for not unlocking the freed runlock */ -+ preempt_enable(); -+} -+ -+static void __init share_rqs(void) -+{ -+ struct rq *rq, *leader; -+ int cpu; -+ -+ for_each_online_cpu(cpu) { -+ rq = cpu_rq(cpu); -+ leader = rq->smp_leader; -+ -+ rq_lock(rq); -+ if (leader && rq != leader) { -+ printk(KERN_INFO "MuQSS sharing SMP runqueue from CPU %d to CPU %d\n", -+ leader->cpu, rq->cpu); -+ share_and_free_rq(leader, rq); -+ } else -+ rq_unlock(rq); -+ } -+ -+#ifdef CONFIG_SCHED_MC -+ for_each_online_cpu(cpu) { -+ rq = cpu_rq(cpu); -+ leader = rq->mc_leader; -+ -+ rq_lock(rq); -+ if (leader && rq != leader) { -+ printk(KERN_INFO "MuQSS sharing MC runqueue from CPU %d to CPU %d\n", -+ leader->cpu, rq->cpu); -+ share_and_free_rq(leader, rq); -+ } else -+ rq_unlock(rq); -+ } -+#endif /* CONFIG_SCHED_MC */ -+ -+#ifdef CONFIG_SCHED_SMT -+ for_each_online_cpu(cpu) { -+ rq = cpu_rq(cpu); -+ leader = rq->smt_leader; -+ -+ rq_lock(rq); -+ if (leader && rq != leader) { -+ printk(KERN_INFO "MuQSS sharing SMT runqueue from CPU %d to CPU %d\n", -+ leader->cpu, rq->cpu); -+ share_and_free_rq(leader, rq); -+ } else -+ rq_unlock(rq); -+ } -+#endif /* CONFIG_SCHED_SMT */ -+} -+ -+static void __init setup_rq_orders(void) -+{ -+ int *selected_cpus, *ordered_cpus; -+ struct rq *rq, *other_rq; -+ int cpu, other_cpu, i; -+ -+ selected_cpus = kmalloc(sizeof(int) * NR_CPUS, GFP_ATOMIC); -+ ordered_cpus = kmalloc(sizeof(int) * NR_CPUS, GFP_ATOMIC); -+ -+ total_runqueues = 0; -+ for_each_online_cpu(cpu) { -+ int locality, total_rqs = 0, total_cpus = 0; -+ -+ rq = cpu_rq(cpu); -+ if (rq->is_leader) -+ total_runqueues++; -+ -+ for (locality = LOCALITY_SAME; locality <= LOCALITY_DISTANT; locality++) { -+ int 
selected_cpu_cnt, selected_cpu_idx, test_cpu_idx, cpu_idx, best_locality, test_cpu; -+ int ordered_cpus_idx; -+ -+ ordered_cpus_idx = -1; -+ selected_cpu_cnt = 0; -+ -+ for_each_online_cpu(test_cpu) { -+ if (cpu < num_online_cpus() / 2) -+ other_cpu = cpu + test_cpu; -+ else -+ other_cpu = cpu - test_cpu; -+ if (other_cpu < 0) -+ other_cpu += num_online_cpus(); -+ else -+ other_cpu %= num_online_cpus(); -+ /* gather CPUs of the same locality */ -+ if (rq->cpu_locality[other_cpu] == locality) { -+ selected_cpus[selected_cpu_cnt] = other_cpu; -+ selected_cpu_cnt++; -+ } -+ } -+ -+ /* reserve first CPU as starting point */ -+ if (selected_cpu_cnt > 0) { -+ ordered_cpus_idx++; -+ ordered_cpus[ordered_cpus_idx] = selected_cpus[ordered_cpus_idx]; -+ selected_cpus[ordered_cpus_idx] = -1; -+ } -+ -+ /* take each CPU and sort it within the same locality based on each inter-CPU localities */ -+ for (test_cpu_idx = 1; test_cpu_idx < selected_cpu_cnt; test_cpu_idx++) { -+ /* starting point with worst locality and current CPU */ -+ best_locality = LOCALITY_DISTANT; -+ selected_cpu_idx = test_cpu_idx; -+ -+ /* try to find the best locality within group */ -+ for (cpu_idx = 1; cpu_idx < selected_cpu_cnt; cpu_idx++) { -+ /* if CPU has not been used and locality is better */ -+ if (selected_cpus[cpu_idx] > -1) { -+ other_rq = cpu_rq(ordered_cpus[ordered_cpus_idx]); -+ if (best_locality > other_rq->cpu_locality[selected_cpus[cpu_idx]]) { -+ /* assign best locality and best CPU idx in array */ -+ best_locality = other_rq->cpu_locality[selected_cpus[cpu_idx]]; -+ selected_cpu_idx = cpu_idx; -+ } -+ } -+ } -+ -+ /* add our next best CPU to ordered list */ -+ ordered_cpus_idx++; -+ ordered_cpus[ordered_cpus_idx] = selected_cpus[selected_cpu_idx]; -+ /* mark this CPU as used */ -+ selected_cpus[selected_cpu_idx] = -1; -+ } -+ -+ /* set up RQ and CPU orders */ -+ for (test_cpu = 0; test_cpu <= ordered_cpus_idx; test_cpu++) { -+ other_rq = cpu_rq(ordered_cpus[test_cpu]); -+ /* set up cpu 
orders */ -+ rq->cpu_order[total_cpus++] = other_rq; -+ if (other_rq->is_leader) { -+ /* set up RQ orders */ -+ rq->rq_order[total_rqs++] = other_rq; -+ } -+ } -+ } -+ } -+ -+ kfree(selected_cpus); -+ kfree(ordered_cpus); -+ -+#ifdef CONFIG_X86 -+ for_each_online_cpu(cpu) { -+ rq = cpu_rq(cpu); -+ for (i = 0; i < total_runqueues; i++) { -+ printk(KERN_DEBUG "MuQSS CPU %d llc %d RQ order %d RQ %d llc %d\n", cpu, per_cpu(cpu_llc_id, cpu), i, -+ rq->rq_order[i]->cpu, per_cpu(cpu_llc_id, rq->rq_order[i]->cpu)); -+ } -+ } -+ -+ for_each_online_cpu(cpu) { -+ rq = cpu_rq(cpu); -+ for (i = 0; i < num_online_cpus(); i++) { -+ printk(KERN_DEBUG "MuQSS CPU %d llc %d CPU order %d RQ %d llc %d\n", cpu, per_cpu(cpu_llc_id, cpu), i, -+ rq->cpu_order[i]->cpu, per_cpu(cpu_llc_id, rq->cpu_order[i]->cpu)); -+ } -+ } -+#endif -+} -+ -+void __init sched_init_smp(void) -+{ -+ sched_init_numa(); -+ -+ /* -+ * There's no userspace yet to cause hotplug operations; hence all the -+ * cpu masks are stable and all blatant races in the below code cannot -+ * happen. 
-+ */ -+ mutex_lock(&sched_domains_mutex); -+ sched_init_domains(cpu_active_mask); -+ mutex_unlock(&sched_domains_mutex); -+ -+ /* Move init over to a non-isolated CPU */ -+ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) -+ BUG(); -+ -+ local_irq_disable(); -+ mutex_lock(&sched_domains_mutex); -+ lock_all_rqs(); -+ -+ printk(KERN_INFO "MuQSS possible/present/online CPUs: %d/%d/%d\n", -+ num_possible_cpus(), num_present_cpus(), num_online_cpus()); -+ -+ select_leaders(); -+ -+ unlock_all_rqs(); -+ mutex_unlock(&sched_domains_mutex); -+ -+ share_rqs(); -+ -+ local_irq_enable(); -+ -+ setup_rq_orders(); -+ -+ switch (rqshare) { -+ case RQSHARE_ALL: -+ /* This should only ever read 1 */ -+ printk(KERN_INFO "MuQSS runqueue share type ALL total runqueues: %d\n", -+ total_runqueues); -+ break; -+ case RQSHARE_SMP: -+ printk(KERN_INFO "MuQSS runqueue share type SMP total runqueues: %d\n", -+ total_runqueues); -+ break; -+ case RQSHARE_MC: -+ printk(KERN_INFO "MuQSS runqueue share type MC total runqueues: %d\n", -+ total_runqueues); -+ break; -+ case RQSHARE_MC_LLC: -+ printk(KERN_INFO "MuQSS runqueue share type LLC total runqueues: %d\n", -+ total_runqueues); -+ break; -+ case RQSHARE_SMT: -+ printk(KERN_INFO "MuQSS runqueue share type SMT total runqueues: %d\n", -+ total_runqueues); -+ break; -+ case RQSHARE_NONE: -+ printk(KERN_INFO "MuQSS runqueue share type NONE total runqueues: %d\n", -+ total_runqueues); -+ break; -+ } -+ -+ sched_smp_initialized = true; -+} -+#else -+void __init sched_init_smp(void) -+{ -+ sched_smp_initialized = true; -+} -+#endif /* CONFIG_SMP */ -+ -+int in_sched_functions(unsigned long addr) -+{ -+ return in_lock_functions(addr) || -+ (addr >= (unsigned long)__sched_text_start -+ && addr < (unsigned long)__sched_text_end); -+} -+ -+#ifdef CONFIG_CGROUP_SCHED -+/* task group related information */ -+struct task_group { -+ struct cgroup_subsys_state css; -+ -+ struct rcu_head rcu; -+ struct list_head list; -+ -+ 
struct task_group *parent; -+ struct list_head siblings; -+ struct list_head children; -+}; -+ -+/* -+ * Default task group. -+ * Every task in system belongs to this group at bootup. -+ */ -+struct task_group root_task_group; -+LIST_HEAD(task_groups); -+ -+/* Cacheline aligned slab cache for task_group */ -+static struct kmem_cache *task_group_cache __read_mostly; -+#endif /* CONFIG_CGROUP_SCHED */ -+ -+void __init sched_init(void) -+{ -+#ifdef CONFIG_SMP -+ int cpu_ids; -+#endif -+ int i; -+ struct rq *rq; -+ -+ wait_bit_init(); -+ -+ prio_ratios[0] = 128; -+ for (i = 1 ; i < NICE_WIDTH ; i++) -+ prio_ratios[i] = prio_ratios[i - 1] * 11 / 10; -+ -+ skiplist_node_init(&init_task.node); -+ -+#ifdef CONFIG_SMP -+ init_defrootdomain(); -+ cpumask_clear(&cpu_idle_map); -+#else -+ uprq = &per_cpu(runqueues, 0); -+#endif -+ -+#ifdef CONFIG_CGROUP_SCHED -+ task_group_cache = KMEM_CACHE(task_group, 0); -+ -+ list_add(&root_task_group.list, &task_groups); -+ INIT_LIST_HEAD(&root_task_group.children); -+ INIT_LIST_HEAD(&root_task_group.siblings); -+#endif /* CONFIG_CGROUP_SCHED */ -+ for_each_possible_cpu(i) { -+ rq = cpu_rq(i); -+ rq->node = kmalloc(sizeof(skiplist_node), GFP_ATOMIC); -+ skiplist_init(rq->node); -+ rq->sl = new_skiplist(rq->node); -+ rq->lock = kmalloc(sizeof(raw_spinlock_t), GFP_ATOMIC); -+ raw_spin_lock_init(rq->lock); -+ rq->nr_running = 0; -+ rq->nr_uninterruptible = 0; -+ rq->nr_switches = 0; -+ rq->clock = rq->old_clock = rq->last_niffy = rq->niffies = 0; -+ rq->last_jiffy = jiffies; -+ rq->user_ns = rq->nice_ns = rq->softirq_ns = rq->system_ns = -+ rq->iowait_ns = rq->idle_ns = 0; -+ rq->dither = 0; -+ set_rq_task(rq, &init_task); -+ rq->iso_ticks = 0; -+ rq->iso_refractory = false; -+#ifdef CONFIG_SMP -+ rq->is_leader = true; -+ rq->smp_leader = NULL; -+#ifdef CONFIG_SCHED_MC -+ rq->mc_leader = NULL; -+#endif -+#ifdef CONFIG_SCHED_SMT -+ rq->smt_leader = NULL; -+#endif -+ rq->sd = NULL; -+ rq->rd = NULL; -+ rq->online = false; -+ rq->cpu = i; -+ 
rq_attach_root(rq, &def_root_domain); -+#endif -+ init_rq_hrexpiry(rq); -+ atomic_set(&rq->nr_iowait, 0); -+ } -+ -+#ifdef CONFIG_SMP -+ cpu_ids = i; -+ /* -+ * Set the base locality for cpu cache distance calculation to -+ * "distant" (3). Make sure the distance from a CPU to itself is 0. -+ */ -+ for_each_possible_cpu(i) { -+ int j; -+ -+ rq = cpu_rq(i); -+#ifdef CONFIG_SCHED_SMT -+ rq->siblings_idle = sole_cpu_idle; -+#endif -+#ifdef CONFIG_SCHED_MC -+ rq->cache_idle = sole_cpu_idle; -+#endif -+ rq->cpu_locality = kmalloc(cpu_ids * sizeof(int *), GFP_ATOMIC); -+ for_each_possible_cpu(j) { -+ if (i == j) -+ rq->cpu_locality[j] = LOCALITY_SAME; -+ else -+ rq->cpu_locality[j] = LOCALITY_DISTANT; -+ } -+ rq->rq_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC); -+ rq->cpu_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC); -+ rq->rq_order[0] = rq->cpu_order[0] = rq; -+ for (j = 1; j < cpu_ids; j++) -+ rq->rq_order[j] = rq->cpu_order[j] = cpu_rq(j); -+ } -+#endif -+ -+ /* -+ * The boot idle thread does lazy MMU switching as well: -+ */ -+ mmgrab(&init_mm); -+ enter_lazy_tlb(&init_mm, current); -+ -+ /* -+ * Make us the idle thread. Technically, schedule() should not be -+ * called from this thread, however somewhere below it might be, -+ * but because we are the idle thread, we just pick up running again -+ * when this runqueue becomes "idle". 
-+ */ -+ init_idle(current, smp_processor_id()); -+ -+#ifdef CONFIG_SMP -+ idle_thread_set_boot_cpu(); -+#endif /* SMP */ -+ -+ init_schedstats(); -+ -+ psi_init(); -+} -+ -+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP -+static inline int preempt_count_equals(int preempt_offset) -+{ -+ int nested = preempt_count() + rcu_preempt_depth(); -+ -+ return (nested == preempt_offset); -+} -+ -+void __might_sleep(const char *file, int line, int preempt_offset) -+{ -+ /* -+ * Blocking primitives will set (and therefore destroy) current->state, -+ * since we will exit with TASK_RUNNING make sure we enter with it, -+ * otherwise we will destroy state. -+ */ -+ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, -+ "do not call blocking ops when !TASK_RUNNING; " -+ "state=%lx set at [<%p>] %pS\n", -+ current->state, -+ (void *)current->task_state_change, -+ (void *)current->task_state_change); -+ -+ ___might_sleep(file, line, preempt_offset); -+} -+EXPORT_SYMBOL(__might_sleep); -+ -+void __cant_sleep(const char *file, int line, int preempt_offset) -+{ -+ static unsigned long prev_jiffy; -+ -+ if (irqs_disabled()) -+ return; -+ -+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) -+ return; -+ -+ if (preempt_count() > preempt_offset) -+ return; -+ -+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) -+ return; -+ prev_jiffy = jiffies; -+ -+ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); -+ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", -+ in_atomic(), irqs_disabled(), -+ current->pid, current->comm); -+ -+ debug_show_held_locks(current); -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+EXPORT_SYMBOL_GPL(__cant_sleep); -+ -+void ___might_sleep(const char *file, int line, int preempt_offset) -+{ -+ /* Ratelimiting timestamp: */ -+ static unsigned long prev_jiffy; -+ -+ unsigned long preempt_disable_ip; -+ -+ /* WARN_ON_ONCE() by default, no rate limit required: */ -+ rcu_sleep_check(); -+ -+ if 
((preempt_count_equals(preempt_offset) && !irqs_disabled() && -+ !is_idle_task(current) && !current->non_block_count) || -+ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || -+ oops_in_progress) -+ return; -+ -+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) -+ return; -+ prev_jiffy = jiffies; -+ -+ /* Save this before calling printk(), since that will clobber it: */ -+ preempt_disable_ip = get_preempt_disable_ip(current); -+ -+ printk(KERN_ERR -+ "BUG: sleeping function called from invalid context at %s:%d\n", -+ file, line); -+ printk(KERN_ERR -+ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", -+ in_atomic(), irqs_disabled(), current->non_block_count, -+ current->pid, current->comm); -+ -+ if (task_stack_end_corrupted(current)) -+ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); -+ -+ debug_show_held_locks(current); -+ if (irqs_disabled()) -+ print_irqtrace_events(current); -+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) -+ && !preempt_count_equals(preempt_offset)) { -+ pr_err("Preemption disabled at:"); -+ print_ip_sym(KERN_ERR, preempt_disable_ip); -+ } -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+EXPORT_SYMBOL(___might_sleep); -+#endif -+ -+#ifdef CONFIG_MAGIC_SYSRQ -+static inline void normalise_rt_tasks(void) -+{ -+ struct sched_attr attr = {}; -+ struct task_struct *g, *p; -+ struct rq_flags rf; -+ struct rq *rq; -+ -+ read_lock(&tasklist_lock); -+ for_each_process_thread(g, p) { -+ /* -+ * Only normalize user tasks: -+ */ -+ if (p->flags & PF_KTHREAD) -+ continue; -+ -+ if (!rt_task(p) && !iso_task(p)) -+ continue; -+ -+ rq = task_rq_lock(p, &rf); -+ __setscheduler(p, rq, SCHED_NORMAL, 0, &attr, false); -+ task_rq_unlock(rq, p, &rf); -+ } -+ read_unlock(&tasklist_lock); -+} -+ -+void normalize_rt_tasks(void) -+{ -+ normalise_rt_tasks(); -+} -+#endif /* CONFIG_MAGIC_SYSRQ */ -+ -+#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) -+/* -+ * These functions are only useful 
for the IA64 MCA handling, or kdb. -+ * -+ * They can only be called when the whole system has been -+ * stopped - every CPU needs to be quiescent, and no scheduling -+ * activity can take place. Using them for anything else would -+ * be a serious bug, and as a result, they aren't even visible -+ * under any other configuration. -+ */ -+ -+/** -+ * curr_task - return the current task for a given CPU. -+ * @cpu: the processor in question. -+ * -+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! -+ * -+ * Return: The current task for @cpu. -+ */ -+struct task_struct *curr_task(int cpu) -+{ -+ return cpu_curr(cpu); -+} -+ -+#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ -+ -+#ifdef CONFIG_IA64 -+/** -+ * ia64_set_curr_task - set the current task for a given CPU. -+ * @cpu: the processor in question. -+ * @p: the task pointer to set. -+ * -+ * Description: This function must only be used when non-maskable interrupts -+ * are serviced on a separate stack. It allows the architecture to switch the -+ * notion of the current task on a CPU in a non-blocking manner. This function -+ * must be called with all CPU's synchronised, and interrupts disabled, the -+ * and caller must save the original value of the current task (see -+ * curr_task() above) and restore that value before reenabling interrupts and -+ * re-starting the system. -+ * -+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 
-+ */ -+void ia64_set_curr_task(int cpu, struct task_struct *p) -+{ -+ cpu_curr(cpu) = p; -+} -+ -+#endif -+ -+void init_idle_bootup_task(struct task_struct *idle) -+{} -+ -+#ifdef CONFIG_SCHED_DEBUG -+__read_mostly bool sched_debug_enabled; -+ -+void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, -+ struct seq_file *m) -+{ -+ seq_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), -+ get_nr_threads(p)); -+} -+ -+void proc_sched_set_task(struct task_struct *p) -+{} -+#endif -+ -+#ifdef CONFIG_CGROUP_SCHED -+static void sched_free_group(struct task_group *tg) -+{ -+ kmem_cache_free(task_group_cache, tg); -+} -+ -+/* allocate runqueue etc for a new task group */ -+struct task_group *sched_create_group(struct task_group *parent) -+{ -+ struct task_group *tg; -+ -+ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); -+ if (!tg) -+ return ERR_PTR(-ENOMEM); -+ -+ return tg; -+} -+ -+void sched_online_group(struct task_group *tg, struct task_group *parent) -+{ -+} -+ -+/* rcu callback to free various structures associated with a task group */ -+static void sched_free_group_rcu(struct rcu_head *rhp) -+{ -+ /* Now it should be safe to free those cfs_rqs */ -+ sched_free_group(container_of(rhp, struct task_group, rcu)); -+} -+ -+void sched_destroy_group(struct task_group *tg) -+{ -+ /* Wait for possible concurrent references to cfs_rqs complete */ -+ call_rcu(&tg->rcu, sched_free_group_rcu); -+} -+ -+void sched_offline_group(struct task_group *tg) -+{ -+} -+ -+static inline struct task_group *css_tg(struct cgroup_subsys_state *css) -+{ -+ return css ? 
container_of(css, struct task_group, css) : NULL; -+} -+ -+static struct cgroup_subsys_state * -+cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) -+{ -+ struct task_group *parent = css_tg(parent_css); -+ struct task_group *tg; -+ -+ if (!parent) { -+ /* This is early initialization for the top cgroup */ -+ return &root_task_group.css; -+ } -+ -+ tg = sched_create_group(parent); -+ if (IS_ERR(tg)) -+ return ERR_PTR(-ENOMEM); -+ return &tg->css; -+} -+ -+/* Expose task group only after completing cgroup initialization */ -+static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ struct task_group *parent = css_tg(css->parent); -+ -+ if (parent) -+ sched_online_group(tg, parent); -+ return 0; -+} -+ -+static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ -+ sched_offline_group(tg); -+} -+ -+static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ -+ /* -+ * Relies on the RCU grace period between css_released() and this. 
-+ */ -+ sched_free_group(tg); -+} -+ -+static void cpu_cgroup_fork(struct task_struct *task) -+{ -+} -+ -+static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) -+{ -+ return 0; -+} -+ -+static void cpu_cgroup_attach(struct cgroup_taskset *tset) -+{ -+} -+ -+static struct cftype cpu_legacy_files[] = { -+ { } /* Terminate */ -+}; -+ -+static struct cftype cpu_files[] = { -+ { } /* terminate */ -+}; -+ -+static int cpu_extra_stat_show(struct seq_file *sf, -+ struct cgroup_subsys_state *css) -+{ -+ return 0; -+} -+ -+struct cgroup_subsys cpu_cgrp_subsys = { -+ .css_alloc = cpu_cgroup_css_alloc, -+ .css_online = cpu_cgroup_css_online, -+ .css_released = cpu_cgroup_css_released, -+ .css_free = cpu_cgroup_css_free, -+ .css_extra_stat_show = cpu_extra_stat_show, -+ .fork = cpu_cgroup_fork, -+ .can_attach = cpu_cgroup_can_attach, -+ .attach = cpu_cgroup_attach, -+ .legacy_cftypes = cpu_files, -+ .legacy_cftypes = cpu_legacy_files, -+ .dfl_cftypes = cpu_files, -+ .early_init = true, -+ .threaded = true, -+}; -+#endif /* CONFIG_CGROUP_SCHED */ -+ -+void call_trace_sched_update_nr_running(struct rq *rq, int count) -+{ -+ trace_sched_update_nr_running_tp(rq, count); -+} -+ -+/* CFS Compat */ -+#ifdef CONFIG_RCU_TORTURE_TEST -+int sysctl_sched_rt_runtime; -+#endif -diff --git a/kernel/sched/MuQSS.h b/kernel/sched/MuQSS.h -new file mode 100644 -index 000000000000..09a1f2fe64ba ---- /dev/null -+++ b/kernel/sched/MuQSS.h -@@ -0,0 +1,1070 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef MUQSS_SCHED_H -+#define MUQSS_SCHED_H -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include 
-+#include -+ -+#ifdef CONFIG_PARAVIRT -+#include -+#endif -+ -+#include "cpupri.h" -+ -+#include -+ -+#ifdef CONFIG_SCHED_DEBUG -+# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) -+#else -+# define SCHED_WARN_ON(x) ((void)(x)) -+#endif -+ -+/* -+ * wake flags -+ */ -+#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ -+#define WF_FORK 0x02 /* child wakeup after fork */ -+#define WF_MIGRATED 0x04 /* internal use, task got migrated */ -+#define WF_ON_CPU 0x08 /* Wakee is on_cpu */ -+ -+/* task_struct::on_rq states: */ -+#define TASK_ON_RQ_QUEUED 1 -+#define TASK_ON_RQ_MIGRATING 2 -+ -+extern void call_trace_sched_update_nr_running(struct rq *rq, int count); -+ -+struct rq; -+ -+#ifdef CONFIG_SMP -+ -+static inline bool sched_asym_prefer(int a, int b) -+{ -+ return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b); -+} -+ -+struct perf_domain { -+ struct em_perf_domain *em_pd; -+ struct perf_domain *next; -+ struct rcu_head rcu; -+}; -+ -+/* Scheduling group status flags */ -+#define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */ -+#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */ -+ -+/* -+ * We add the notion of a root-domain which will be used to define per-domain -+ * variables. Each exclusive cpuset essentially defines an island domain by -+ * fully partitioning the member cpus from any other cpuset. Whenever a new -+ * exclusive cpuset is created, we also create and attach a new root-domain -+ * object. 
-+ * -+ */ -+struct root_domain { -+ atomic_t refcount; -+ atomic_t rto_count; -+ struct rcu_head rcu; -+ cpumask_var_t span; -+ cpumask_var_t online; -+ -+ /* -+ * Indicate pullable load on at least one CPU, e.g: -+ * - More than one runnable task -+ * - Running task is misfit -+ */ -+ int overload; -+ -+ /* Indicate one or more cpus over-utilized (tipping point) */ -+ int overutilized; -+ -+ /* -+ * The bit corresponding to a CPU gets set here if such CPU has more -+ * than one runnable -deadline task (as it is below for RT tasks). -+ */ -+ cpumask_var_t dlo_mask; -+ atomic_t dlo_count; -+ /* Replace unused CFS structures with void */ -+ //struct dl_bw dl_bw; -+ //struct cpudl cpudl; -+ void *dl_bw; -+ void *cpudl; -+ -+ /* -+ * The "RT overload" flag: it gets set if a CPU has more than -+ * one runnable RT task. -+ */ -+ cpumask_var_t rto_mask; -+ //struct cpupri cpupri; -+ void *cpupri; -+ -+ unsigned long max_cpu_capacity; -+ -+ /* -+ * NULL-terminated list of performance domains intersecting with the -+ * CPUs of the rd. Protected by RCU. -+ */ -+ struct perf_domain *pd; -+}; -+ -+extern void init_defrootdomain(void); -+extern int sched_init_domains(const struct cpumask *cpu_map); -+extern void rq_attach_root(struct rq *rq, struct root_domain *rd); -+ -+static inline void cpupri_cleanup(void __maybe_unused *cpupri) -+{ -+} -+ -+static inline void cpudl_cleanup(void __maybe_unused *cpudl) -+{ -+} -+ -+static inline void init_dl_bw(void __maybe_unused *dl_bw) -+{ -+} -+ -+static inline int cpudl_init(void __maybe_unused *dl_bw) -+{ -+ return 0; -+} -+ -+static inline int cpupri_init(void __maybe_unused *cpupri) -+{ -+ return 0; -+} -+#endif /* CONFIG_SMP */ -+ -+/* -+ * This is the main, per-CPU runqueue data structure. -+ * This data should only be modified by the local cpu. 
-+ */ -+struct rq { -+ raw_spinlock_t *lock; -+ raw_spinlock_t *orig_lock; -+ -+ struct task_struct __rcu *curr; -+ struct task_struct *idle; -+ struct task_struct *stop; -+ struct mm_struct *prev_mm; -+ -+ unsigned int nr_running; -+ /* -+ * This is part of a global counter where only the total sum -+ * over all CPUs matters. A task can increase this counter on -+ * one CPU and if it got migrated afterwards it may decrease -+ * it on another CPU. Always updated under the runqueue lock: -+ */ -+ unsigned long nr_uninterruptible; -+#ifdef CONFIG_SMP -+ unsigned int ttwu_pending; -+#endif -+ u64 nr_switches; -+ -+ /* Stored data about rq->curr to work outside rq lock */ -+ u64 rq_deadline; -+ int rq_prio; -+ -+ /* Best queued id for use outside lock */ -+ u64 best_key; -+ -+ unsigned long last_scheduler_tick; /* Last jiffy this RQ ticked */ -+ unsigned long last_jiffy; /* Last jiffy this RQ updated rq clock */ -+ u64 niffies; /* Last time this RQ updated rq clock */ -+ u64 last_niffy; /* Last niffies as updated by local clock */ -+ u64 last_jiffy_niffies; /* Niffies @ last_jiffy */ -+ -+ u64 load_update; /* When we last updated load */ -+ unsigned long load_avg; /* Rolling load average */ -+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ -+ u64 irq_load_update; /* When we last updated IRQ load */ -+ unsigned long irq_load_avg; /* Rolling IRQ load average */ -+#endif -+#ifdef CONFIG_SMT_NICE -+ struct mm_struct *rq_mm; -+ int rq_smt_bias; /* Policy/nice level bias across smt siblings */ -+#endif -+ /* Accurate timekeeping data */ -+ unsigned long user_ns, nice_ns, irq_ns, softirq_ns, system_ns, -+ iowait_ns, idle_ns; -+ atomic_t nr_iowait; -+ -+#ifdef CONFIG_MEMBARRIER -+ int membarrier_state; -+#endif -+ -+ skiplist_node *node; -+ skiplist *sl; -+#ifdef CONFIG_SMP -+ struct task_struct *preempt; /* Preempt triggered on this task */ -+ struct task_struct *preempting; /* Hint only, what task is preempting */ -+ -+ int cpu; /* cpu of this runqueue */ -+ bool online; -+ -+ struct 
root_domain *rd; -+ struct sched_domain *sd; -+ -+ unsigned long cpu_capacity_orig; -+ -+ int *cpu_locality; /* CPU relative cache distance */ -+ struct rq **rq_order; /* Shared RQs ordered by relative cache distance */ -+ struct rq **cpu_order; /* RQs of discrete CPUs ordered by distance */ -+ -+ bool is_leader; -+ struct rq *smp_leader; /* First physical CPU per node */ -+#ifdef CONFIG_SCHED_THERMAL_PRESSURE -+ struct sched_avg avg_thermal; -+#endif /* CONFIG_SCHED_THERMAL_PRESSURE */ -+#ifdef CONFIG_SCHED_SMT -+ struct rq *smt_leader; /* First logical CPU in SMT siblings */ -+ cpumask_t thread_mask; -+ bool (*siblings_idle)(struct rq *rq); -+ /* See if all smt siblings are idle */ -+#endif /* CONFIG_SCHED_SMT */ -+#ifdef CONFIG_SCHED_MC -+ struct rq *mc_leader; /* First logical CPU in MC siblings */ -+ cpumask_t core_mask; -+ bool (*cache_idle)(struct rq *rq); -+ /* See if all cache siblings are idle */ -+#endif /* CONFIG_SCHED_MC */ -+#endif /* CONFIG_SMP */ -+ -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+ u64 prev_irq_time; -+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ -+#ifdef CONFIG_PARAVIRT -+ u64 prev_steal_time; -+#endif /* CONFIG_PARAVIRT */ -+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING -+ u64 prev_steal_time_rq; -+#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ -+ -+ u64 clock, old_clock, last_tick; -+ /* Ensure that all clocks are in the same cache line */ -+ u64 clock_task ____cacheline_aligned; -+ int dither; -+ -+ int iso_ticks; -+ bool iso_refractory; -+ -+#ifdef CONFIG_HIGH_RES_TIMERS -+ struct hrtimer hrexpiry_timer; -+#endif -+ -+ int rt_nr_running; /* Number real time tasks running */ -+#ifdef CONFIG_SCHEDSTATS -+ -+ /* latency stats */ -+ struct sched_info rq_sched_info; -+ unsigned long long rq_cpu_time; -+ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? 
*/ -+ -+ /* sys_sched_yield() stats */ -+ unsigned int yld_count; -+ -+ /* schedule() stats */ -+ unsigned int sched_switch; -+ unsigned int sched_count; -+ unsigned int sched_goidle; -+ -+ /* try_to_wake_up() stats */ -+ unsigned int ttwu_count; -+ unsigned int ttwu_local; -+#endif /* CONFIG_SCHEDSTATS */ -+ -+#ifdef CONFIG_CPU_IDLE -+ /* Must be inspected within a rcu lock section */ -+ struct cpuidle_state *idle_state; -+#endif -+}; -+ -+static inline u64 __rq_clock_broken(struct rq *rq) -+{ -+ return READ_ONCE(rq->clock); -+} -+ -+static inline u64 rq_clock(struct rq *rq) -+{ -+ lockdep_assert_held(rq->lock); -+ -+ return rq->clock; -+} -+ -+static inline u64 rq_clock_task(struct rq *rq) -+{ -+ lockdep_assert_held(rq->lock); -+ -+ return rq->clock_task; -+} -+ -+/** -+ * By default the decay is the default pelt decay period. -+ * The decay shift can change the decay period in -+ * multiples of 32. -+ * Decay shift Decay period(ms) -+ * 0 32 -+ * 1 64 -+ * 2 128 -+ * 3 256 -+ * 4 512 -+ */ -+extern int sched_thermal_decay_shift; -+ -+static inline u64 rq_clock_thermal(struct rq *rq) -+{ -+ return rq_clock_task(rq) >> sched_thermal_decay_shift; -+} -+ -+struct rq_flags { -+ unsigned long flags; -+}; -+ -+#ifdef CONFIG_SMP -+struct rq *cpu_rq(int cpu); -+#endif -+ -+#ifndef CONFIG_SMP -+extern struct rq *uprq; -+#define cpu_rq(cpu) (uprq) -+#define this_rq() (uprq) -+#define raw_rq() (uprq) -+#define task_rq(p) (uprq) -+#define cpu_curr(cpu) ((uprq)->curr) -+#else /* CONFIG_SMP */ -+DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -+#define this_rq() this_cpu_ptr(&runqueues) -+#define raw_rq() raw_cpu_ptr(&runqueues) -+#define task_rq(p) cpu_rq(task_cpu(p)) -+#endif /* CONFIG_SMP */ -+ -+static inline int task_current(struct rq *rq, struct task_struct *p) -+{ -+ return rq->curr == p; -+} -+ -+static inline int task_running(struct rq *rq, struct task_struct *p) -+{ -+#ifdef CONFIG_SMP -+ return p->on_cpu; -+#else -+ return task_current(rq, p); -+#endif -+} -+ 
-+static inline int task_on_rq_queued(struct task_struct *p) -+{ -+ return p->on_rq == TASK_ON_RQ_QUEUED; -+} -+ -+static inline int task_on_rq_migrating(struct task_struct *p) -+{ -+ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; -+} -+ -+static inline void rq_lock(struct rq *rq) -+ __acquires(rq->lock) -+{ -+ raw_spin_lock(rq->lock); -+} -+ -+static inline void rq_unlock(struct rq *rq) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock(rq->lock); -+} -+ -+static inline void rq_lock_irq(struct rq *rq) -+ __acquires(rq->lock) -+{ -+ raw_spin_lock_irq(rq->lock); -+} -+ -+static inline void rq_unlock_irq(struct rq *rq, struct rq_flags __always_unused *rf) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock_irq(rq->lock); -+} -+ -+static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) -+ __acquires(rq->lock) -+{ -+ raw_spin_lock_irqsave(rq->lock, rf->flags); -+} -+ -+static inline void rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock_irqrestore(rq->lock, rf->flags); -+} -+ -+static inline struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(p->pi_lock) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ while (42) { -+ raw_spin_lock_irqsave(&p->pi_lock, rf->flags); -+ rq = task_rq(p); -+ raw_spin_lock(rq->lock); -+ if (likely(rq == task_rq(p))) -+ break; -+ raw_spin_unlock(rq->lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); -+ } -+ return rq; -+} -+ -+static inline void task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) -+ __releases(rq->lock) -+ __releases(p->pi_lock) -+{ -+ rq_unlock(rq); -+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); -+} -+ -+static inline struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags __always_unused *rf) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ lockdep_assert_held(&p->pi_lock); -+ -+ while (42) { -+ rq = task_rq(p); -+ raw_spin_lock(rq->lock); -+ if (likely(rq == task_rq(p))) -+ 
break; -+ raw_spin_unlock(rq->lock); -+ } -+ return rq; -+} -+ -+static inline void __task_rq_unlock(struct rq *rq, struct rq_flags __always_unused *rf) -+{ -+ rq_unlock(rq); -+} -+ -+static inline struct rq * -+this_rq_lock_irq(struct rq_flags *rf) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ local_irq_disable(); -+ rq = this_rq(); -+ rq_lock(rq); -+ return rq; -+} -+ -+/* -+ * {de,en}queue flags: Most not used on MuQSS. -+ * -+ * DEQUEUE_SLEEP - task is no longer runnable -+ * ENQUEUE_WAKEUP - task just became runnable -+ * -+ * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks -+ * are in a known state which allows modification. Such pairs -+ * should preserve as much state as possible. -+ * -+ * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location -+ * in the runqueue. -+ * -+ * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) -+ * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) -+ * ENQUEUE_MIGRATED - the task was migrated during wakeup -+ * -+ */ -+ -+#define DEQUEUE_SLEEP 0x01 -+#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ -+ -+#define ENQUEUE_WAKEUP 0x01 -+#define ENQUEUE_RESTORE 0x02 -+ -+#ifdef CONFIG_SMP -+#define ENQUEUE_MIGRATED 0x40 -+#else -+#define ENQUEUE_MIGRATED 0x00 -+#endif -+ -+#ifdef CONFIG_NUMA -+enum numa_topology_type { -+ NUMA_DIRECT, -+ NUMA_GLUELESS_MESH, -+ NUMA_BACKPLANE, -+}; -+extern enum numa_topology_type sched_numa_topology_type; -+extern int sched_max_numa_distance; -+extern bool find_numa_distance(int distance); -+extern void sched_init_numa(void); -+extern void sched_domains_numa_masks_set(unsigned int cpu); -+extern void sched_domains_numa_masks_clear(unsigned int cpu); -+extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); -+#else -+static inline void sched_init_numa(void) { } -+static inline void sched_domains_numa_masks_set(unsigned int cpu) { } -+static inline void sched_domains_numa_masks_clear(unsigned 
int cpu) { } -+static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) -+{ -+ return nr_cpu_ids; -+} -+#endif -+ -+extern struct mutex sched_domains_mutex; -+extern struct static_key_false sched_schedstats; -+ -+#define rcu_dereference_check_sched_domain(p) \ -+ rcu_dereference_check((p), \ -+ lockdep_is_held(&sched_domains_mutex)) -+ -+#ifdef CONFIG_SMP -+ -+/* -+ * The domain tree (rq->sd) is protected by RCU's quiescent state transition. -+ * See destroy_sched_domains: call_rcu for details. -+ * -+ * The domain tree of any CPU may only be accessed from within -+ * preempt-disabled sections. -+ */ -+#define for_each_domain(cpu, __sd) \ -+ for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ -+ __sd; __sd = __sd->parent) -+ -+/** -+ * highest_flag_domain - Return highest sched_domain containing flag. -+ * @cpu: The cpu whose highest level of sched domain is to -+ * be returned. -+ * @flag: The flag to check for the highest sched_domain -+ * for the given cpu. -+ * -+ * Returns the highest sched_domain of a cpu which contains the given flag. 
-+ */ -+static inline struct sched_domain *highest_flag_domain(int cpu, int flag) -+{ -+ struct sched_domain *sd, *hsd = NULL; -+ -+ for_each_domain(cpu, sd) { -+ if (!(sd->flags & flag)) -+ break; -+ hsd = sd; -+ } -+ -+ return hsd; -+} -+ -+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) -+{ -+ struct sched_domain *sd; -+ -+ for_each_domain(cpu, sd) { -+ if (sd->flags & flag) -+ break; -+ } -+ -+ return sd; -+} -+ -+DECLARE_PER_CPU(struct sched_domain *, sd_llc); -+DECLARE_PER_CPU(int, sd_llc_size); -+DECLARE_PER_CPU(int, sd_llc_id); -+DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); -+DECLARE_PER_CPU(struct sched_domain *, sd_numa); -+DECLARE_PER_CPU(struct sched_domain *, sd_asym_packing); -+DECLARE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity); -+ -+struct sched_group_capacity { -+ atomic_t ref; -+ /* -+ * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity -+ * for a single CPU. -+ */ -+ unsigned long capacity; -+ unsigned long min_capacity; /* Min per-CPU capacity in group */ -+ unsigned long max_capacity; /* Max per-CPU capacity in group */ -+ unsigned long next_update; -+ int imbalance; /* XXX unrelated to capacity but shared group state */ -+ -+#ifdef CONFIG_SCHED_DEBUG -+ int id; -+#endif -+ -+ unsigned long cpumask[]; /* balance mask */ -+}; -+ -+struct sched_group { -+ struct sched_group *next; /* Must be a circular list */ -+ atomic_t ref; -+ -+ unsigned int group_weight; -+ struct sched_group_capacity *sgc; -+ int asym_prefer_cpu; /* cpu of highest priority in group */ -+ -+ /* -+ * The CPUs this group covers. -+ * -+ * NOTE: this field is variable length. 
(Allocated dynamically -+ * by attaching extra space to the end of the structure, -+ * depending on how many CPUs the kernel has booted up with) -+ */ -+ unsigned long cpumask[0]; -+}; -+ -+static inline struct cpumask *sched_group_span(struct sched_group *sg) -+{ -+ return to_cpumask(sg->cpumask); -+} -+ -+/* -+ * See build_balance_mask(). -+ */ -+static inline struct cpumask *group_balance_mask(struct sched_group *sg) -+{ -+ return to_cpumask(sg->sgc->cpumask); -+} -+ -+/** -+ * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. -+ * @group: The group whose first cpu is to be returned. -+ */ -+static inline unsigned int group_first_cpu(struct sched_group *group) -+{ -+ return cpumask_first(sched_group_span(group)); -+} -+ -+ -+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) -+void register_sched_domain_sysctl(void); -+void dirty_sched_domain_sysctl(int cpu); -+void unregister_sched_domain_sysctl(void); -+#else -+static inline void register_sched_domain_sysctl(void) -+{ -+} -+static inline void dirty_sched_domain_sysctl(int cpu) -+{ -+} -+static inline void unregister_sched_domain_sysctl(void) -+{ -+} -+#endif -+ -+extern void flush_smp_call_function_from_idle(void); -+ -+extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); -+extern void set_rq_online (struct rq *rq); -+extern void set_rq_offline(struct rq *rq); -+extern bool sched_smp_initialized; -+ -+static inline void update_group_capacity(struct sched_domain *sd, int cpu) -+{ -+} -+ -+static inline void trigger_load_balance(struct rq *rq) -+{ -+} -+ -+#define sched_feat(x) 0 -+ -+#else /* CONFIG_SMP */ -+ -+static inline void flush_smp_call_function_from_idle(void) { } -+ -+#endif /* CONFIG_SMP */ -+ -+#ifdef CONFIG_CPU_IDLE -+static inline void idle_set_state(struct rq *rq, -+ struct cpuidle_state *idle_state) -+{ -+ rq->idle_state = idle_state; -+} -+ -+static inline struct cpuidle_state *idle_get_state(struct rq *rq) -+{ -+ 
SCHED_WARN_ON(!rcu_read_lock_held()); -+ return rq->idle_state; -+} -+#else -+static inline void idle_set_state(struct rq *rq, -+ struct cpuidle_state *idle_state) -+{ -+} -+ -+static inline struct cpuidle_state *idle_get_state(struct rq *rq) -+{ -+ return NULL; -+} -+#endif -+ -+#ifdef CONFIG_SCHED_DEBUG -+extern bool sched_debug_enabled; -+#endif -+ -+extern void schedule_idle(void); -+ -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+struct irqtime { -+ u64 total; -+ u64 tick_delta; -+ u64 irq_start_time; -+ struct u64_stats_sync sync; -+}; -+ -+DECLARE_PER_CPU(struct irqtime, cpu_irqtime); -+ -+/* -+ * Returns the irqtime minus the softirq time computed by ksoftirqd. -+ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime -+ * and never move forward. -+ */ -+static inline u64 irq_time_read(int cpu) -+{ -+ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); -+ unsigned int seq; -+ u64 total; -+ -+ do { -+ seq = __u64_stats_fetch_begin(&irqtime->sync); -+ total = irqtime->total; -+ } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); -+ -+ return total; -+} -+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ -+ -+static inline bool sched_stop_runnable(struct rq *rq) -+{ -+ return rq->stop && task_on_rq_queued(rq->stop); -+} -+ -+#ifdef CONFIG_SMP -+static inline int cpu_of(struct rq *rq) -+{ -+ return rq->cpu; -+} -+#else /* CONFIG_SMP */ -+static inline int cpu_of(struct rq *rq) -+{ -+ return 0; -+} -+#endif -+ -+#ifdef CONFIG_CPU_FREQ -+DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); -+ -+static inline void cpufreq_trigger(struct rq *rq, unsigned int flags) -+{ -+ struct update_util_data *data; -+ -+ data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data, -+ cpu_of(rq))); -+ -+ if (data) -+ data->func(data, rq->niffies, flags); -+} -+#else -+static inline void cpufreq_trigger(struct rq *rq, unsigned int flag) -+{ -+} -+#endif /* CONFIG_CPU_FREQ */ -+ -+static __always_inline -+unsigned int uclamp_rq_util_with(struct rq 
__maybe_unused *rq, unsigned int util, -+ struct task_struct __maybe_unused *p) -+{ -+ return util; -+} -+ -+static inline bool uclamp_is_used(void) -+{ -+ return false; -+} -+ -+#ifndef arch_scale_freq_tick -+static __always_inline -+void arch_scale_freq_tick(void) -+{ -+} -+#endif -+ -+#ifdef arch_scale_freq_capacity -+#ifndef arch_scale_freq_invariant -+#define arch_scale_freq_invariant() (true) -+#endif -+#else /* arch_scale_freq_capacity */ -+#define arch_scale_freq_invariant() (false) -+#endif -+ -+#ifdef CONFIG_64BIT -+static inline u64 read_sum_exec_runtime(struct task_struct *t) -+{ -+ return tsk_seruntime(t); -+} -+#else -+static inline u64 read_sum_exec_runtime(struct task_struct *t) -+{ -+ struct rq_flags rf; -+ u64 ns; -+ struct rq *rq; -+ -+ rq = task_rq_lock(t, &rf); -+ ns = tsk_seruntime(t); -+ task_rq_unlock(rq, t, &rf); -+ -+ return ns; -+} -+#endif -+ -+#ifndef arch_scale_freq_capacity -+/** -+ * arch_scale_freq_capacity - get the frequency scale factor of a given CPU. -+ * @cpu: the CPU in question. -+ * -+ * Return: the frequency scale factor normalized against SCHED_CAPACITY_SCALE, i.e. -+ * -+ * f_curr -+ * ------ * SCHED_CAPACITY_SCALE -+ * f_max -+ */ -+static __always_inline -+unsigned long arch_scale_freq_capacity(int cpu) -+{ -+ return SCHED_CAPACITY_SCALE; -+} -+#endif -+ -+#ifdef CONFIG_NO_HZ_FULL -+extern bool sched_can_stop_tick(struct rq *rq); -+extern int __init sched_tick_offload_init(void); -+ -+/* -+ * Tick may be needed by tasks in the runqueue depending on their policy and -+ * requirements. If tick is needed, lets send the target an IPI to kick it out of -+ * nohz mode if necessary. 
-+ */ -+static inline void sched_update_tick_dependency(struct rq *rq) -+{ -+ int cpu = cpu_of(rq); -+ -+ if (!tick_nohz_full_cpu(cpu)) -+ return; -+ -+ if (sched_can_stop_tick(rq)) -+ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); -+ else -+ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); -+} -+#else -+static inline int sched_tick_offload_init(void) { return 0; } -+static inline void sched_update_tick_dependency(struct rq *rq) { } -+#endif -+ -+#define SCHED_FLAG_SUGOV 0x10000000 -+ -+static inline bool rt_rq_is_runnable(struct rq *rt_rq) -+{ -+ return rt_rq->rt_nr_running; -+} -+ -+/** -+ * enum schedutil_type - CPU utilization type -+ * @FREQUENCY_UTIL: Utilization used to select frequency -+ * @ENERGY_UTIL: Utilization used during energy calculation -+ * -+ * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time -+ * need to be aggregated differently depending on the usage made of them. This -+ * enum is used within schedutil_freq_util() to differentiate the types of -+ * utilization expected by the callers, and adjust the aggregation accordingly. 
-+ */ -+enum schedutil_type { -+ FREQUENCY_UTIL, -+ ENERGY_UTIL, -+}; -+ -+#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL -+ -+unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, -+ unsigned long max, enum schedutil_type type, -+ struct task_struct *p); -+ -+static inline unsigned long cpu_bw_dl(struct rq *rq) -+{ -+ return 0; -+} -+ -+static inline unsigned long cpu_util_dl(struct rq *rq) -+{ -+ return 0; -+} -+ -+static inline unsigned long cpu_util_cfs(struct rq *rq) -+{ -+ unsigned long ret = READ_ONCE(rq->load_avg); -+ -+ if (ret > SCHED_CAPACITY_SCALE) -+ ret = SCHED_CAPACITY_SCALE; -+ return ret; -+} -+ -+static inline unsigned long cpu_util_rt(struct rq *rq) -+{ -+ unsigned long ret = READ_ONCE(rq->rt_nr_running); -+ -+ if (ret > SCHED_CAPACITY_SCALE) -+ ret = SCHED_CAPACITY_SCALE; -+ return ret; -+} -+ -+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ -+static inline unsigned long cpu_util_irq(struct rq *rq) -+{ -+ unsigned long ret = READ_ONCE(rq->irq_load_avg); -+ -+ if (ret > SCHED_CAPACITY_SCALE) -+ ret = SCHED_CAPACITY_SCALE; -+ return ret; -+} -+ -+static inline -+unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) -+{ -+ util *= (max - irq); -+ util /= max; -+ -+ return util; -+ -+} -+#else -+static inline unsigned long cpu_util_irq(struct rq *rq) -+{ -+ return 0; -+} -+ -+static inline -+unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) -+{ -+ return util; -+} -+#endif -+#endif -+ -+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) -+#define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus))) -+ -+DECLARE_STATIC_KEY_FALSE(sched_energy_present); -+ -+static inline bool sched_energy_enabled(void) -+{ -+ return static_branch_unlikely(&sched_energy_present); -+} -+ -+#else /* ! 
(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ -+ -+#define perf_domain_span(pd) NULL -+static inline bool sched_energy_enabled(void) { return false; } -+ -+#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ -+ -+#ifdef CONFIG_MEMBARRIER -+/* -+ * The scheduler provides memory barriers required by membarrier between: -+ * - prior user-space memory accesses and store to rq->membarrier_state, -+ * - store to rq->membarrier_state and following user-space memory accesses. -+ * In the same way it provides those guarantees around store to rq->curr. -+ */ -+static inline void membarrier_switch_mm(struct rq *rq, -+ struct mm_struct *prev_mm, -+ struct mm_struct *next_mm) -+{ -+ int membarrier_state; -+ -+ if (prev_mm == next_mm) -+ return; -+ -+ membarrier_state = atomic_read(&next_mm->membarrier_state); -+ if (READ_ONCE(rq->membarrier_state) == membarrier_state) -+ return; -+ -+ WRITE_ONCE(rq->membarrier_state, membarrier_state); -+} -+#else -+static inline void membarrier_switch_mm(struct rq *rq, -+ struct mm_struct *prev_mm, -+ struct mm_struct *next_mm) -+{ -+} -+#endif -+ -+#ifdef CONFIG_SMP -+static inline bool is_per_cpu_kthread(struct task_struct *p) -+{ -+ if (!(p->flags & PF_KTHREAD)) -+ return false; -+ -+ if (p->nr_cpus_allowed != 1) -+ return false; -+ -+ return true; -+} -+#endif -+ -+void swake_up_all_locked(struct swait_queue_head *q); -+void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); -+ -+/* pelt.h compat CONFIG_SCHED_THERMAL_PRESSURE impossible with MUQSS */ -+static inline int -+update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) -+{ -+ return 0; -+} -+ -+static inline u64 thermal_load_avg(struct rq *rq) -+{ -+ return 0; -+} -+ -+#ifdef CONFIG_RCU_TORTURE_TEST -+extern int sysctl_sched_rt_runtime; -+#endif -+ -+#endif /* MUQSS_SCHED_H */ -diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c -index e39008242cf4..146a3dfe626f 100644 ---- 
a/kernel/sched/cpufreq_schedutil.c -+++ b/kernel/sched/cpufreq_schedutil.c -@@ -183,6 +183,12 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, - return cpufreq_driver_resolve_freq(policy, freq); - } - -+#ifdef CONFIG_SCHED_MUQSS -+#define rt_rq_runnable(rq_rt) rt_rq_is_runnable(rq) -+#else -+#define rt_rq_runnable(rq_rt) rt_rq_is_runnable(&rq->rt) -+#endif -+ - /* - * This function computes an effective utilization for the given CPU, to be - * used for frequency selection given the linear relation: f = u * f_max. -@@ -211,7 +217,7 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, - struct rq *rq = cpu_rq(cpu); - - if (!uclamp_is_used() && -- type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { -+ type == FREQUENCY_UTIL && rt_rq_runnable(rq)) { - return max; - } - -@@ -656,7 +662,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) - struct task_struct *thread; - struct sched_attr attr = { - .size = sizeof(struct sched_attr), -+#ifdef CONFIG_SCHED_MUQSS -+ .sched_policy = SCHED_RR, -+#else - .sched_policy = SCHED_DEADLINE, -+#endif - .sched_flags = SCHED_FLAG_SUGOV, - .sched_nice = 0, - .sched_priority = 0, -diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h -index efbb492bb94c..f0288c32ab17 100644 ---- a/kernel/sched/cpupri.h -+++ b/kernel/sched/cpupri.h -@@ -17,6 +17,7 @@ struct cpupri { - int *cpu_to_pri; - }; - -+#ifndef CONFIG_SCHED_MUQSS - #ifdef CONFIG_SMP - int cpupri_find(struct cpupri *cp, struct task_struct *p, - struct cpumask *lowest_mask); -@@ -27,3 +28,4 @@ void cpupri_set(struct cpupri *cp, int cpu, int pri); - int cpupri_init(struct cpupri *cp); - void cpupri_cleanup(struct cpupri *cp); - #endif -+#endif -diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c -index 5a55d2300452..283a580754a7 100644 ---- a/kernel/sched/cputime.c -+++ b/kernel/sched/cputime.c -@@ -266,26 +266,6 @@ static inline u64 account_other_time(u64 max) - return accounted; - } - --#ifdef CONFIG_64BIT 
--static inline u64 read_sum_exec_runtime(struct task_struct *t) --{ -- return t->se.sum_exec_runtime; --} --#else --static u64 read_sum_exec_runtime(struct task_struct *t) --{ -- u64 ns; -- struct rq_flags rf; -- struct rq *rq; -- -- rq = task_rq_lock(t, &rf); -- ns = t->se.sum_exec_runtime; -- task_rq_unlock(rq, t, &rf); -- -- return ns; --} --#endif -- - /* - * Accumulate raw cputime values of dead tasks (sig->[us]time) and live - * tasks (sum on group iteration) belonging to @tsk's group. -@@ -614,7 +594,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, - void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) - { - struct task_cputime cputime = { -- .sum_exec_runtime = p->se.sum_exec_runtime, -+ .sum_exec_runtime = tsk_seruntime(p), - }; - - task_cputime(p, &cputime.utime, &cputime.stime); -diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c -index f324dc36fc43..43ca13ed9ab0 100644 ---- a/kernel/sched/idle.c -+++ b/kernel/sched/idle.c -@@ -369,6 +369,7 @@ void cpu_startup_entry(enum cpuhp_state state) - do_idle(); - } - -+#ifndef CONFIG_SCHED_MUQSS - /* - * idle-task scheduling class. 
- */ -@@ -482,3 +483,4 @@ const struct sched_class idle_sched_class - .switched_to = switched_to_idle, - .update_curr = update_curr_idle, - }; -+#endif /* CONFIG_SCHED_MUQSS */ -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index 28709f6b0975..4478c11cb51a 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -2,6 +2,19 @@ - /* - * Scheduler internal types and methods: - */ -+#ifdef CONFIG_SCHED_MUQSS -+#include "MuQSS.h" -+ -+/* Begin compatibility wrappers for MuQSS/CFS differences */ -+#define rq_rt_nr_running(rq) ((rq)->rt_nr_running) -+#define rq_h_nr_running(rq) ((rq)->nr_running) -+ -+#else /* CONFIG_SCHED_MUQSS */ -+ -+#define rq_rt_nr_running(rq) ((rq)->rt.rt_nr_running) -+#define rq_h_nr_running(rq) ((rq)->cfs.h_nr_running) -+ -+ - #include - - #include -@@ -2626,3 +2639,25 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) - - void swake_up_all_locked(struct swait_queue_head *q); - void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); -+ -+/* MuQSS compatibility functions */ -+#ifdef CONFIG_64BIT -+static inline u64 read_sum_exec_runtime(struct task_struct *t) -+{ -+ return t->se.sum_exec_runtime; -+} -+#else -+static inline u64 read_sum_exec_runtime(struct task_struct *t) -+{ -+ u64 ns; -+ struct rq_flags rf; -+ struct rq *rq; -+ -+ rq = task_rq_lock(t, &rf); -+ ns = t->se.sum_exec_runtime; -+ task_rq_unlock(rq, t, &rf); -+ -+ return ns; -+} -+#endif -+#endif /* CONFIG_SCHED_MUQSS */ -diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c -index 1bd7e3af904f..a1dc490c15e4 100644 ---- a/kernel/sched/topology.c -+++ b/kernel/sched/topology.c -@@ -440,7 +440,11 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) - struct root_domain *old_rd = NULL; - unsigned long flags; - -+#ifdef CONFIG_SCHED_MUQSS -+ raw_spin_lock_irqsave(rq->lock, flags); -+#else - raw_spin_lock_irqsave(&rq->lock, flags); -+#endif - - if (rq->rd) { - old_rd = rq->rd; -@@ -466,7 +470,11 @@ void 
rq_attach_root(struct rq *rq, struct root_domain *rd) - if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) - set_rq_online(rq); - -+#ifdef CONFIG_SCHED_MUQSS -+ raw_spin_unlock_irqrestore(rq->lock, flags); -+#else - raw_spin_unlock_irqrestore(&rq->lock, flags); -+#endif - - if (old_rd) - call_rcu(&old_rd->rcu, free_rootdomain); -diff --git a/kernel/skip_list.c b/kernel/skip_list.c -new file mode 100644 -index 000000000000..bf5c6e97e139 ---- /dev/null -+++ b/kernel/skip_list.c -@@ -0,0 +1,148 @@ -+/* -+ Copyright (C) 2011,2016 Con Kolivas. -+ -+ Code based on example originally by William Pugh. -+ -+Skip Lists are a probabilistic alternative to balanced trees, as -+described in the June 1990 issue of CACM and were invented by -+William Pugh in 1987. -+ -+A couple of comments about this implementation: -+The routine randomLevel has been hard-coded to generate random -+levels using p=0.25. It can be easily changed. -+ -+The insertion routine has been implemented so as to use the -+dirty hack described in the CACM paper: if a random level is -+generated that is more than the current maximum level, the -+current maximum level plus one is used instead. -+ -+Levels start at zero and go up to MaxLevel (which is equal to -+MaxNumberOfLevels-1). -+ -+The routines defined in this file are: -+ -+init: defines slnode -+ -+new_skiplist: returns a new, empty list -+ -+randomLevel: Returns a random level based on a u64 random seed passed to it. -+In MuQSS, the "niffy" time is used for this purpose. -+ -+insert(l,key, value): inserts the binding (key, value) into l. This operation -+occurs in O(log n) time. -+ -+delnode(slnode, l, node): deletes any binding of key from the l based on the -+actual node value. This operation occurs in O(k) time where k is the -+number of levels of the node in question (max 8). The original delete -+function occurred in O(log n) time and involved a search. 
-+ -+MuQSS Notes: In this implementation of skiplists, there are bidirectional -+next/prev pointers and the insert function returns a pointer to the actual -+node the value is stored. The key here is chosen by the scheduler so as to -+sort tasks according to the priority list requirements and is no longer used -+by the scheduler after insertion. The scheduler lookup, however, occurs in -+O(1) time because it is always the first item in the level 0 linked list. -+Since the task struct stores a copy of the node pointer upon skiplist_insert, -+it can also remove it much faster than the original implementation with the -+aid of prev<->next pointer manipulation and no searching. -+ -+*/ -+ -+#include -+#include -+ -+#define MaxNumberOfLevels 8 -+#define MaxLevel (MaxNumberOfLevels - 1) -+ -+void skiplist_init(skiplist_node *slnode) -+{ -+ int i; -+ -+ slnode->key = 0xFFFFFFFFFFFFFFFF; -+ slnode->level = 0; -+ slnode->value = NULL; -+ for (i = 0; i < MaxNumberOfLevels; i++) -+ slnode->next[i] = slnode->prev[i] = slnode; -+} -+ -+skiplist *new_skiplist(skiplist_node *slnode) -+{ -+ skiplist *l = kzalloc(sizeof(skiplist), GFP_ATOMIC); -+ -+ BUG_ON(!l); -+ l->header = slnode; -+ return l; -+} -+ -+void free_skiplist(skiplist *l) -+{ -+ skiplist_node *p, *q; -+ -+ p = l->header; -+ do { -+ q = p->next[0]; -+ p->next[0]->prev[0] = q->prev[0]; -+ skiplist_node_init(p); -+ p = q; -+ } while (p != l->header); -+ kfree(l); -+} -+ -+void skiplist_node_init(skiplist_node *node) -+{ -+ memset(node, 0, sizeof(skiplist_node)); -+} -+ -+static inline unsigned int randomLevel(const long unsigned int randseed) -+{ -+ return find_first_bit(&randseed, MaxLevel) / 2; -+} -+ -+void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed) -+{ -+ skiplist_node *update[MaxNumberOfLevels]; -+ skiplist_node *p, *q; -+ int k = l->level; -+ -+ p = l->header; -+ do { -+ while (q = p->next[k], q->key <= key) -+ p = q; -+ update[k] = p; -+ } while (--k >= 
0); -+ -+ ++l->entries; -+ k = randomLevel(randseed); -+ if (k > l->level) { -+ k = ++l->level; -+ update[k] = l->header; -+ } -+ -+ node->level = k; -+ node->key = key; -+ node->value = value; -+ do { -+ p = update[k]; -+ node->next[k] = p->next[k]; -+ p->next[k] = node; -+ node->prev[k] = p; -+ node->next[k]->prev[k] = node; -+ } while (--k >= 0); -+} -+ -+void skiplist_delete(skiplist *l, skiplist_node *node) -+{ -+ int k, m = node->level; -+ -+ for (k = 0; k <= m; k++) { -+ node->prev[k]->next[k] = node->next[k]; -+ node->next[k]->prev[k] = node->prev[k]; -+ } -+ skiplist_node_init(node); -+ if (m == l->level) { -+ while (l->header->next[m] == l->header && l->header->prev[m] == l->header && m > 0) -+ m--; -+ l->level = m; -+ } -+ l->entries--; -+} -diff --git a/kernel/sysctl.c b/kernel/sysctl.c -index afad085960b8..d2e35cd54f94 100644 ---- a/kernel/sysctl.c -+++ b/kernel/sysctl.c -@@ -120,7 +120,17 @@ static unsigned long long_max = LONG_MAX; - static int one_hundred = 100; - static int two_hundred = 200; - static int one_thousand = 1000; --#ifdef CONFIG_PRINTK -+static int zero = 0; -+static int one = 1; -+#ifdef CONFIG_SCHED_MUQSS -+extern int rr_interval; -+extern int sched_interactive; -+extern int sched_iso_cpu; -+extern int sched_yield_type; -+#endif -+extern int hrtimer_granularity_us; -+extern int hrtimeout_min_us; -+#if defined(CONFIG_PRINTK) || defined(CONFIG_SCHED_MUQSS) - static int ten_thousand = 10000; - #endif - #ifdef CONFIG_PERF_EVENTS -@@ -184,7 +194,7 @@ static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT; - int sysctl_legacy_va_layout; - #endif - --#ifdef CONFIG_SCHED_DEBUG -+#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_MUQSS) - static int min_sched_granularity_ns = 100000; /* 100 usecs */ - static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ - static int min_wakeup_granularity_ns; /* 0 usecs */ -@@ -193,7 +203,7 @@ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ - static 
int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; - static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; - #endif /* CONFIG_SMP */ --#endif /* CONFIG_SCHED_DEBUG */ -+#endif /* CONFIG_SCHED_DEBUG && !CONFIG_SCHED_MUQSS */ - - #ifdef CONFIG_COMPACTION - static int min_extfrag_threshold; -@@ -1652,6 +1662,7 @@ int proc_do_static_key(struct ctl_table *table, int write, - } - - static struct ctl_table kern_table[] = { -+#ifndef CONFIG_SCHED_MUQSS - { - .procname = "sched_child_runs_first", - .data = &sysctl_sched_child_runs_first, -@@ -1843,6 +1854,73 @@ static struct ctl_table kern_table[] = { - .extra1 = SYSCTL_ONE, - }, - #endif -+#elif defined(CONFIG_SCHED_MUQSS) -+ { -+ .procname = "rr_interval", -+ .data = &rr_interval, -+ .maxlen = sizeof (int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = &one, -+ .extra2 = &one_thousand, -+ }, -+ { -+ .procname = "interactive", -+ .data = &sched_interactive, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = &zero, -+ .extra2 = &one, -+ }, -+ { -+ .procname = "iso_cpu", -+ .data = &sched_iso_cpu, -+ .maxlen = sizeof (int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = &zero, -+ .extra2 = &one_hundred, -+ }, -+ { -+ .procname = "yield_type", -+ .data = &sched_yield_type, -+ .maxlen = sizeof (int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = &zero, -+ .extra2 = &two, -+ }, -+#if defined(CONFIG_SMP) && defined(CONFIG_SCHEDSTATS) -+ { -+ .procname = "sched_schedstats", -+ .data = NULL, -+ .maxlen = sizeof(unsigned int), -+ .mode = 0644, -+ .proc_handler = sysctl_schedstats, -+ .extra1 = SYSCTL_ZERO, -+ .extra2 = SYSCTL_ONE, -+ }, -+#endif /* CONFIG_SMP && CONFIG_SCHEDSTATS */ -+#endif /* CONFIG_SCHED_MUQSS */ -+ { -+ .procname = "hrtimer_granularity_us", -+ .data = &hrtimer_granularity_us, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ 
.extra1 = &one, -+ .extra2 = &ten_thousand, -+ }, -+ { -+ .procname = "hrtimeout_min_us", -+ .data = &hrtimeout_min_us, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = &one, -+ .extra2 = &ten_thousand, -+ }, - #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) - { - .procname = "sched_energy_aware", -diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig -index a09b1d61df6a..e7662101fcc3 100644 ---- a/kernel/time/Kconfig -+++ b/kernel/time/Kconfig -@@ -75,6 +75,9 @@ config NO_HZ_COMMON - depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS - select TICK_ONESHOT - -+config NO_HZ_FULL -+ bool -+ - choice - prompt "Timer tick handling" - default NO_HZ_IDLE if NO_HZ -@@ -96,8 +99,9 @@ config NO_HZ_IDLE - - Most of the time you want to say Y here. - --config NO_HZ_FULL -+config NO_HZ_FULL_NODEF - bool "Full dynticks system (tickless)" -+ select NO_HZ_FULL - # NO_HZ_COMMON dependency - depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS - # We need at least one periodic CPU for timekeeping -@@ -123,6 +127,8 @@ config NO_HZ_FULL - transitions: syscalls, exceptions and interrupts. Even when it's - dynamically off. - -+ Not recommended for desktops,laptops, or mobile devices. -+ - Say N. 
- - endchoice -@@ -132,7 +138,7 @@ config CONTEXT_TRACKING - - config CONTEXT_TRACKING_FORCE - bool "Force context tracking" -- depends on CONTEXT_TRACKING -+ depends on CONTEXT_TRACKING && !SCHED_MUQSS - default y if !NO_HZ_FULL - help - The major pre-requirement for full dynticks to work is to -diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c -index f5490222e134..544c58c29267 100644 ---- a/kernel/time/clockevents.c -+++ b/kernel/time/clockevents.c -@@ -190,8 +190,9 @@ int clockevents_tick_resume(struct clock_event_device *dev) - - #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST - --/* Limit min_delta to a jiffie */ --#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) -+int __read_mostly hrtimer_granularity_us = 100; -+/* Limit min_delta to 100us */ -+#define MIN_DELTA_LIMIT (hrtimer_granularity_us * NSEC_PER_USEC) - - /** - * clockevents_increase_min_delta - raise minimum delta of a clock event device -diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c -index 95b6a708b040..19918cf649b0 100644 ---- a/kernel/time/hrtimer.c -+++ b/kernel/time/hrtimer.c -@@ -2223,3 +2223,113 @@ int __sched schedule_hrtimeout(ktime_t *expires, - return schedule_hrtimeout_range(expires, 0, mode); - } - EXPORT_SYMBOL_GPL(schedule_hrtimeout); -+ -+/* -+ * As per schedule_hrtimeout but taskes a millisecond value and returns how -+ * many milliseconds are left. -+ */ -+long __sched schedule_msec_hrtimeout(long timeout) -+{ -+ struct hrtimer_sleeper t; -+ int delta, jiffs; -+ ktime_t expires; -+ -+ if (!timeout) { -+ __set_current_state(TASK_RUNNING); -+ return 0; -+ } -+ -+ jiffs = msecs_to_jiffies(timeout); -+ /* -+ * If regular timer resolution is adequate or hrtimer resolution is not -+ * (yet) better than Hz, as would occur during startup, use regular -+ * timers. 
-+ */ -+ if (jiffs > 4 || hrtimer_resolution >= NSEC_PER_SEC / HZ || pm_freezing) -+ return schedule_timeout(jiffs); -+ -+ delta = (timeout % 1000) * NSEC_PER_MSEC; -+ expires = ktime_set(0, delta); -+ -+ hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); -+ hrtimer_set_expires_range_ns(&t.timer, expires, delta); -+ -+ hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_REL); -+ -+ if (likely(t.task)) -+ schedule(); -+ -+ hrtimer_cancel(&t.timer); -+ destroy_hrtimer_on_stack(&t.timer); -+ -+ __set_current_state(TASK_RUNNING); -+ -+ expires = hrtimer_expires_remaining(&t.timer); -+ timeout = ktime_to_ms(expires); -+ return timeout < 0 ? 0 : timeout; -+} -+ -+EXPORT_SYMBOL(schedule_msec_hrtimeout); -+ -+#define USECS_PER_SEC 1000000 -+extern int hrtimer_granularity_us; -+ -+static inline long schedule_usec_hrtimeout(long timeout) -+{ -+ struct hrtimer_sleeper t; -+ ktime_t expires; -+ int delta; -+ -+ if (!timeout) { -+ __set_current_state(TASK_RUNNING); -+ return 0; -+ } -+ -+ if (hrtimer_resolution >= NSEC_PER_SEC / HZ) -+ return schedule_timeout(usecs_to_jiffies(timeout)); -+ -+ if (timeout < hrtimer_granularity_us) -+ timeout = hrtimer_granularity_us; -+ delta = (timeout % USECS_PER_SEC) * NSEC_PER_USEC; -+ expires = ktime_set(0, delta); -+ -+ hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); -+ hrtimer_set_expires_range_ns(&t.timer, expires, delta); -+ -+ hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_REL); -+ -+ if (likely(t.task)) -+ schedule(); -+ -+ hrtimer_cancel(&t.timer); -+ destroy_hrtimer_on_stack(&t.timer); -+ -+ __set_current_state(TASK_RUNNING); -+ -+ expires = hrtimer_expires_remaining(&t.timer); -+ timeout = ktime_to_us(expires); -+ return timeout < 0 ? 
0 : timeout; -+} -+ -+int __read_mostly hrtimeout_min_us = 500; -+ -+long __sched schedule_min_hrtimeout(void) -+{ -+ return usecs_to_jiffies(schedule_usec_hrtimeout(hrtimeout_min_us)); -+} -+ -+EXPORT_SYMBOL(schedule_min_hrtimeout); -+ -+long __sched schedule_msec_hrtimeout_interruptible(long timeout) -+{ -+ __set_current_state(TASK_INTERRUPTIBLE); -+ return schedule_msec_hrtimeout(timeout); -+} -+EXPORT_SYMBOL(schedule_msec_hrtimeout_interruptible); -+ -+long __sched schedule_msec_hrtimeout_uninterruptible(long timeout) -+{ -+ __set_current_state(TASK_UNINTERRUPTIBLE); -+ return schedule_msec_hrtimeout(timeout); -+} -+EXPORT_SYMBOL(schedule_msec_hrtimeout_uninterruptible); -diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c -index a71758e34e45..ebb84a65d928 100644 ---- a/kernel/time/posix-cpu-timers.c -+++ b/kernel/time/posix-cpu-timers.c -@@ -216,7 +216,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples) - u64 stime, utime; - - task_cputime(p, &utime, &stime); -- store_samples(samples, stime, utime, p->se.sum_exec_runtime); -+ store_samples(samples, stime, utime, tsk_seruntime(p)); - } - - static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, -@@ -850,7 +850,7 @@ static void check_thread_timers(struct task_struct *tsk, - soft = task_rlimit(tsk, RLIMIT_RTTIME); - if (soft != RLIM_INFINITY) { - /* Task RT timeout is accounted in jiffies. RTTIME is usec */ -- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); -+ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ); - unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); - - /* At the hard limit, send SIGKILL. No further action. 
*/ -diff --git a/kernel/time/timer.c b/kernel/time/timer.c -index a50364df1054..a86e4530e530 100644 ---- a/kernel/time/timer.c -+++ b/kernel/time/timer.c -@@ -44,6 +44,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -1587,7 +1588,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) - * Check, if the next hrtimer event is before the next timer wheel - * event: - */ --static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) -+static u64 cmp_next_hrtimer_event(struct timer_base *base, u64 basem, u64 expires) - { - u64 nextevt = hrtimer_get_next_event(); - -@@ -1605,6 +1606,9 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) - if (nextevt <= basem) - return basem; - -+ if (nextevt < expires && nextevt - basem <= TICK_NSEC) -+ base->is_idle = false; -+ - /* - * Round up to the next jiffie. High resolution timers are - * off, so the hrtimers are expired in the tick and we need to -@@ -1674,7 +1678,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) - } - raw_spin_unlock(&base->lock); - -- return cmp_next_hrtimer_event(basem, expires); -+ return cmp_next_hrtimer_event(base, basem, expires); - } - - /** -@@ -1873,6 +1877,18 @@ signed long __sched schedule_timeout(signed long timeout) - - expire = timeout + jiffies; - -+#ifdef CONFIG_HIGH_RES_TIMERS -+ if (timeout == 1 && hrtimer_resolution < NSEC_PER_SEC / HZ) { -+ /* -+ * Special case 1 as being a request for the minimum timeout -+ * and use highres timers to timeout after 1ms to workaround -+ * the granularity of low Hz tick timers. 
-+ */ -+ if (!schedule_min_hrtimeout()) -+ return 0; -+ goto out_timeout; -+ } -+#endif - timer.task = current; - timer_setup_on_stack(&timer.timer, process_timeout, 0); - __mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING); -@@ -1881,10 +1897,10 @@ signed long __sched schedule_timeout(signed long timeout) - - /* Remove the timer from the object tracker */ - destroy_timer_on_stack(&timer.timer); -- -+out_timeout: - timeout = expire - jiffies; - -- out: -+out: - return timeout < 0 ? 0 : timeout; - } - EXPORT_SYMBOL(schedule_timeout); -@@ -2027,7 +2043,19 @@ void __init init_timers(void) - */ - void msleep(unsigned int msecs) - { -- unsigned long timeout = msecs_to_jiffies(msecs) + 1; -+ int jiffs = msecs_to_jiffies(msecs); -+ unsigned long timeout; -+ -+ /* -+ * Use high resolution timers where the resolution of tick based -+ * timers is inadequate. -+ */ -+ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ && !pm_freezing) { -+ while (msecs) -+ msecs = schedule_msec_hrtimeout_uninterruptible(msecs); -+ return; -+ } -+ timeout = jiffs + 1; - - while (timeout) - timeout = schedule_timeout_uninterruptible(timeout); -@@ -2041,7 +2069,15 @@ EXPORT_SYMBOL(msleep); - */ - unsigned long msleep_interruptible(unsigned int msecs) - { -- unsigned long timeout = msecs_to_jiffies(msecs) + 1; -+ int jiffs = msecs_to_jiffies(msecs); -+ unsigned long timeout; -+ -+ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ && !pm_freezing) { -+ while (msecs && !signal_pending(current)) -+ msecs = schedule_msec_hrtimeout_interruptible(msecs); -+ return msecs; -+ } -+ timeout = jiffs + 1; - - while (timeout && !signal_pending(current)) - timeout = schedule_timeout_interruptible(timeout); -diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c -index b5e3496cf803..68930e7f4d28 100644 ---- a/kernel/trace/trace_selftest.c -+++ b/kernel/trace/trace_selftest.c -@@ -1048,10 +1048,15 @@ static int trace_wakeup_test_thread(void *data) - { - /* Make this a -deadline 
thread */ - static const struct sched_attr attr = { -+#ifdef CONFIG_SCHED_MUQSS -+ /* No deadline on MuQSS, use RR */ -+ .sched_policy = SCHED_RR, -+#else - .sched_policy = SCHED_DEADLINE, - .sched_runtime = 100000ULL, - .sched_deadline = 10000000ULL, - .sched_period = 10000000ULL -+#endif - }; - struct wakeup_test_data *x = data; - -diff --git a/mm/vmscan.c b/mm/vmscan.c -index 466fc3144fff..27224c2d7674 100644 ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -169,7 +169,7 @@ struct scan_control { - /* - * From 0 .. 200. Higher means more swappy. - */ --int vm_swappiness = 60; -+int vm_swappiness = 33; - - static void set_task_reclaim_state(struct task_struct *task, - struct reclaim_state *rs) -diff --git a/net/core/pktgen.c b/net/core/pktgen.c -index 44fdbb9c6e53..ae0adfc677c2 100644 ---- a/net/core/pktgen.c -+++ b/net/core/pktgen.c -@@ -1894,7 +1894,7 @@ static void pktgen_mark_device(const struct pktgen_net *pn, const char *ifname) - mutex_unlock(&pktgen_thread_lock); - pr_debug("%s: waiting for %s to disappear....\n", - __func__, ifname); -- schedule_timeout_interruptible(msecs_to_jiffies(msec_per_try)); -+ schedule_msec_hrtimeout_interruptible((msec_per_try)); - mutex_lock(&pktgen_thread_lock); - - if (++i >= max_tries) { -diff --git a/sound/pci/maestro3.c b/sound/pci/maestro3.c -index 40232a278b1a..d87fae1113aa 100644 ---- a/sound/pci/maestro3.c -+++ b/sound/pci/maestro3.c -@@ -1995,7 +1995,7 @@ static void snd_m3_ac97_reset(struct snd_m3 *chip) - outw(0, io + GPIO_DATA); - outw(dir | GPO_PRIMARY_AC97, io + GPIO_DIRECTION); - -- schedule_timeout_uninterruptible(msecs_to_jiffies(delay1)); -+ schedule_msec_hrtimeout_uninterruptible((delay1)); - - outw(GPO_PRIMARY_AC97, io + GPIO_DATA); - udelay(5); -@@ -2003,7 +2003,7 @@ static void snd_m3_ac97_reset(struct snd_m3 *chip) - outw(IO_SRAM_ENABLE | SERIAL_AC_LINK_ENABLE, io + RING_BUS_CTRL_A); - outw(~0, io + GPIO_MASK); - -- schedule_timeout_uninterruptible(msecs_to_jiffies(delay2)); -+ 
schedule_msec_hrtimeout_uninterruptible((delay2)); - - if (! snd_m3_try_read_vendor(chip)) - break; -diff --git a/sound/soc/codecs/rt5631.c b/sound/soc/codecs/rt5631.c -index 653da3eaf355..d77d12902594 100644 ---- a/sound/soc/codecs/rt5631.c -+++ b/sound/soc/codecs/rt5631.c -@@ -417,7 +417,7 @@ static void onebit_depop_mute_stage(struct snd_soc_component *component, int ena - hp_zc = snd_soc_component_read(component, RT5631_INT_ST_IRQ_CTRL_2); - snd_soc_component_write(component, RT5631_INT_ST_IRQ_CTRL_2, hp_zc & 0xf7ff); - if (enable) { -- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout_uninterruptible((10)); - /* config one-bit depop parameter */ - rt5631_write_index(component, RT5631_SPK_INTL_CTRL, 0x307f); - snd_soc_component_update_bits(component, RT5631_HP_OUT_VOL, -@@ -529,7 +529,7 @@ static void depop_seq_mute_stage(struct snd_soc_component *component, int enable - hp_zc = snd_soc_component_read(component, RT5631_INT_ST_IRQ_CTRL_2); - snd_soc_component_write(component, RT5631_INT_ST_IRQ_CTRL_2, hp_zc & 0xf7ff); - if (enable) { -- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout_uninterruptible((10)); - - /* config depop sequence parameter */ - rt5631_write_index(component, RT5631_SPK_INTL_CTRL, 0x302f); -diff --git a/sound/soc/codecs/wm8350.c b/sound/soc/codecs/wm8350.c -index a6aa212fa0c8..8bfa549b38db 100644 ---- a/sound/soc/codecs/wm8350.c -+++ b/sound/soc/codecs/wm8350.c -@@ -233,10 +233,10 @@ static void wm8350_pga_work(struct work_struct *work) - out2->ramp == WM8350_RAMP_UP) { - /* delay is longer over 0dB as increases are larger */ - if (i >= WM8350_OUTn_0dB) -- schedule_timeout_interruptible(msecs_to_jiffies -+ schedule_msec_hrtimeout_interruptible( - (2)); - else -- schedule_timeout_interruptible(msecs_to_jiffies -+ schedule_msec_hrtimeout_interruptible( - (1)); - } else - udelay(50); /* doesn't matter if we delay longer */ -@@ -1120,7 +1120,7 @@ static int 
wm8350_set_bias_level(struct snd_soc_component *component, - (platform->dis_out4 << 6)); - - /* wait for discharge */ -- schedule_timeout_interruptible(msecs_to_jiffies -+ schedule_msec_hrtimeout_interruptible( - (platform-> - cap_discharge_msecs)); - -@@ -1136,7 +1136,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, - WM8350_VBUFEN); - - /* wait for vmid */ -- schedule_timeout_interruptible(msecs_to_jiffies -+ schedule_msec_hrtimeout_interruptible( - (platform-> - vmid_charge_msecs)); - -@@ -1187,7 +1187,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, - wm8350_reg_write(wm8350, WM8350_POWER_MGMT_1, pm1); - - /* wait */ -- schedule_timeout_interruptible(msecs_to_jiffies -+ schedule_msec_hrtimeout_interruptible( - (platform-> - vmid_discharge_msecs)); - -@@ -1205,7 +1205,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, - pm1 | WM8350_OUTPUT_DRAIN_EN); - - /* wait */ -- schedule_timeout_interruptible(msecs_to_jiffies -+ schedule_msec_hrtimeout_interruptible( - (platform->drain_msecs)); - - pm1 &= ~WM8350_BIASEN; -diff --git a/sound/soc/codecs/wm8900.c b/sound/soc/codecs/wm8900.c -index a9a6d766a176..45bf31de6282 100644 ---- a/sound/soc/codecs/wm8900.c -+++ b/sound/soc/codecs/wm8900.c -@@ -1104,7 +1104,7 @@ static int wm8900_set_bias_level(struct snd_soc_component *component, - /* Need to let things settle before stopping the clock - * to ensure that restart works, see "Stopping the - * master clock" in the datasheet. 
*/ -- schedule_timeout_interruptible(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout_interruptible(1); - snd_soc_component_write(component, WM8900_REG_POWER2, - WM8900_REG_POWER2_SYSCLK_ENA); - break; -diff --git a/sound/soc/codecs/wm9713.c b/sound/soc/codecs/wm9713.c -index 7072ffacbdfd..e8414ec4759c 100644 ---- a/sound/soc/codecs/wm9713.c -+++ b/sound/soc/codecs/wm9713.c -@@ -199,7 +199,7 @@ static int wm9713_voice_shutdown(struct snd_soc_dapm_widget *w, - - /* Gracefully shut down the voice interface. */ - snd_soc_component_update_bits(component, AC97_HANDSET_RATE, 0x0f00, 0x0200); -- schedule_timeout_interruptible(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout_interruptible(1); - snd_soc_component_update_bits(component, AC97_HANDSET_RATE, 0x0f00, 0x0f00); - snd_soc_component_update_bits(component, AC97_EXTENDED_MID, 0x1000, 0x1000); - -@@ -868,7 +868,7 @@ static int wm9713_set_pll(struct snd_soc_component *component, - wm9713->pll_in = freq_in; - - /* wait 10ms AC97 link frames for the link to stabilise */ -- schedule_timeout_interruptible(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout_interruptible((10)); - return 0; - } - -diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c -index 3273161e2787..7fb9b4c6dd7b 100644 ---- a/sound/soc/soc-dapm.c -+++ b/sound/soc/soc-dapm.c -@@ -154,7 +154,7 @@ static void dapm_assert_locked(struct snd_soc_dapm_context *dapm) - static void pop_wait(u32 pop_time) - { - if (pop_time) -- schedule_timeout_uninterruptible(msecs_to_jiffies(pop_time)); -+ schedule_msec_hrtimeout_uninterruptible((pop_time)); - } - - __printf(3, 4) -diff --git a/sound/usb/line6/pcm.c b/sound/usb/line6/pcm.c -index fdbdfb7bce92..fa8e8faf3eb3 100644 ---- a/sound/usb/line6/pcm.c -+++ b/sound/usb/line6/pcm.c -@@ -127,7 +127,7 @@ static void line6_wait_clear_audio_urbs(struct snd_line6_pcm *line6pcm, - if (!alive) - break; - set_current_state(TASK_UNINTERRUPTIBLE); -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - } while (--timeout > 0); - if 
(alive) - dev_err(line6pcm->line6->ifcdev, diff --git a/linux510-rc-tkg/linux510-tkg-patches/0004-glitched-ondemand-muqss.patch b/linux510-rc-tkg/linux510-tkg-patches/0004-glitched-ondemand-muqss.patch deleted file mode 100644 index 02933e4..0000000 --- a/linux510-rc-tkg/linux510-tkg-patches/0004-glitched-ondemand-muqss.patch +++ /dev/null @@ -1,18 +0,0 @@ -diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c -index 6b423eebfd5d..61e3271675d6 100644 ---- a/drivers/cpufreq/cpufreq_ondemand.c -+++ b/drivers/cpufreq/cpufreq_ondemand.c -@@ -21,10 +21,10 @@ - #include "cpufreq_ondemand.h" - - /* On-demand governor macros */ --#define DEF_FREQUENCY_UP_THRESHOLD (80) --#define DEF_SAMPLING_DOWN_FACTOR (1) -+#define DEF_FREQUENCY_UP_THRESHOLD (45) -+#define DEF_SAMPLING_DOWN_FACTOR (5) - #define MAX_SAMPLING_DOWN_FACTOR (100000) --#define MICRO_FREQUENCY_UP_THRESHOLD (95) -+#define MICRO_FREQUENCY_UP_THRESHOLD (45) - #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) - #define MIN_FREQUENCY_UP_THRESHOLD (1) - #define MAX_FREQUENCY_UP_THRESHOLD (100) diff --git a/linux510-rc-tkg/linux510-tkg-patches/0005-glitched-pds.patch b/linux510-rc-tkg/linux510-tkg-patches/0005-glitched-pds.patch deleted file mode 100644 index 08c9ef3..0000000 --- a/linux510-rc-tkg/linux510-tkg-patches/0005-glitched-pds.patch +++ /dev/null @@ -1,90 +0,0 @@ -From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 -From: Tk-Glitch -Date: Wed, 4 Jul 2018 04:30:08 +0200 -Subject: glitched - PDS - -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 2a202a846757..1d9c7ed79b11 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -4,7 +4,7 @@ - - choice - prompt "Timer frequency" -- default HZ_250 -+ default HZ_500 - help - Allows the configuration of the timer frequency. 
It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -39,6 +39,13 @@ choice - on SMP and NUMA systems and exactly dividing by both PAL and - NTSC frame rates for video and multimedia work. - -+ config HZ_500 -+ bool "500 HZ" -+ help -+ 500 Hz is a balanced timer frequency. Provides fast interactivity -+ on desktops with great smoothness without increasing CPU power -+ consumption and sacrificing the battery life on laptops. -+ - config HZ_1000 - bool "1000 HZ" - help -@@ -52,6 +59,7 @@ config HZ - default 100 if HZ_100 - default 250 if HZ_250 - default 300 if HZ_300 -+ default 500 if HZ_500 - default 1000 if HZ_1000 - - config SCHED_HRTICK - -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 2a202a846757..1d9c7ed79b11 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -4,7 +4,7 @@ - - choice - prompt "Timer frequency" -- default HZ_500 -+ default HZ_750 - help - Allows the configuration of the timer frequency. It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -46,6 +46,13 @@ choice - on desktops with great smoothness without increasing CPU power - consumption and sacrificing the battery life on laptops. - -+ config HZ_750 -+ bool "750 HZ" -+ help -+ 750 Hz is a good timer frequency for desktops. Provides fast -+ interactivity with great smoothness without sacrificing too -+ much throughput. -+ - config HZ_1000 - bool "1000 HZ" - help -@@ -60,6 +67,7 @@ config HZ - default 250 if HZ_250 - default 300 if HZ_300 - default 500 if HZ_500 -+ default 750 if HZ_750 - default 1000 if HZ_1000 - - config SCHED_HRTICK - -diff --git a/mm/vmscan.c b/mm/vmscan.c -index 9270a4370d54..30d01e647417 100644 ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -169,7 +169,7 @@ - /* - * From 0 .. 200. Higher means more swappy. 
- */ --int vm_swappiness = 60; -+int vm_swappiness = 20; - - static void set_task_reclaim_state(struct task_struct *task, - struct reclaim_state *rs) diff --git a/linux510-rc-tkg/linux510-tkg-patches/0006-add-acs-overrides_iommu.patch b/linux510-rc-tkg/linux510-tkg-patches/0006-add-acs-overrides_iommu.patch deleted file mode 100644 index d1303a5..0000000 --- a/linux510-rc-tkg/linux510-tkg-patches/0006-add-acs-overrides_iommu.patch +++ /dev/null @@ -1,193 +0,0 @@ -From cdeab384f48dd9c88e2dff2e9ad8d57dca1a1b1c Mon Sep 17 00:00:00 2001 -From: Mark Weiman -Date: Sun, 12 Aug 2018 11:36:21 -0400 -Subject: [PATCH] pci: Enable overrides for missing ACS capabilities - -This an updated version of Alex Williamson's patch from: -https://lkml.org/lkml/2013/5/30/513 - -Original commit message follows: - -PCIe ACS (Access Control Services) is the PCIe 2.0+ feature that -allows us to control whether transactions are allowed to be redirected -in various subnodes of a PCIe topology. For instance, if two -endpoints are below a root port or downsteam switch port, the -downstream port may optionally redirect transactions between the -devices, bypassing upstream devices. The same can happen internally -on multifunction devices. The transaction may never be visible to the -upstream devices. - -One upstream device that we particularly care about is the IOMMU. If -a redirection occurs in the topology below the IOMMU, then the IOMMU -cannot provide isolation between devices. This is why the PCIe spec -encourages topologies to include ACS support. Without it, we have to -assume peer-to-peer DMA within a hierarchy can bypass IOMMU isolation. - -Unfortunately, far too many topologies do not support ACS to make this -a steadfast requirement. Even the latest chipsets from Intel are only -sporadically supporting ACS. We have trouble getting interconnect -vendors to include the PCIe spec required PCIe capability, let alone -suggested features. - -Therefore, we need to add some flexibility. 
The pcie_acs_override= -boot option lets users opt-in specific devices or sets of devices to -assume ACS support. The "downstream" option assumes full ACS support -on root ports and downstream switch ports. The "multifunction" -option assumes the subset of ACS features available on multifunction -endpoints and upstream switch ports are supported. The "id:nnnn:nnnn" -option enables ACS support on devices matching the provided vendor -and device IDs, allowing more strategic ACS overrides. These options -may be combined in any order. A maximum of 16 id specific overrides -are available. It's suggested to use the most limited set of options -necessary to avoid completely disabling ACS across the topology. -Note to hardware vendors, we have facilities to permanently quirk -specific devices which enforce isolation but not provide an ACS -capability. Please contact me to have your devices added and save -your customers the hassle of this boot option. - -Signed-off-by: Mark Weiman ---- - .../admin-guide/kernel-parameters.txt | 9 ++ - drivers/pci/quirks.c | 101 ++++++++++++++++++ - 2 files changed, 110 insertions(+) - -diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index aefd358a5ca3..173b3596fd9e 100644 ---- a/Documentation/admin-guide/kernel-parameters.txt -+++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -3190,6 +3190,15 @@ - nomsi [MSI] If the PCI_MSI kernel config parameter is - enabled, this kernel boot option can be used to - disable the use of MSI interrupts system-wide. -+ pcie_acs_override = -+ [PCIE] Override missing PCIe ACS support for: -+ downstream -+ All downstream ports - full ACS capabilities -+ multifunction -+ All multifunction devices - multifunction ACS subset -+ id:nnnn:nnnn -+ Specific device - full ACS capabilities -+ Specified as vid:did (vendor/device ID) in hex - noioapicquirk [APIC] Disable all boot interrupt quirks. - Safety option to keep boot IRQs enabled. 
This - should never be necessary. -diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c -index 4700d24e5d55..8f7a3d7fd9c1 100644 ---- a/drivers/pci/quirks.c -+++ b/drivers/pci/quirks.c -@@ -3372,6 +3372,106 @@ static void quirk_no_bus_reset(struct pci_dev *dev) - dev->dev_flags |= PCI_DEV_FLAGS_NO_BUS_RESET; - } - -+static bool acs_on_downstream; -+static bool acs_on_multifunction; -+ -+#define NUM_ACS_IDS 16 -+struct acs_on_id { -+ unsigned short vendor; -+ unsigned short device; -+}; -+static struct acs_on_id acs_on_ids[NUM_ACS_IDS]; -+static u8 max_acs_id; -+ -+static __init int pcie_acs_override_setup(char *p) -+{ -+ if (!p) -+ return -EINVAL; -+ -+ while (*p) { -+ if (!strncmp(p, "downstream", 10)) -+ acs_on_downstream = true; -+ if (!strncmp(p, "multifunction", 13)) -+ acs_on_multifunction = true; -+ if (!strncmp(p, "id:", 3)) { -+ char opt[5]; -+ int ret; -+ long val; -+ -+ if (max_acs_id >= NUM_ACS_IDS - 1) { -+ pr_warn("Out of PCIe ACS override slots (%d)\n", -+ NUM_ACS_IDS); -+ goto next; -+ } -+ -+ p += 3; -+ snprintf(opt, 5, "%s", p); -+ ret = kstrtol(opt, 16, &val); -+ if (ret) { -+ pr_warn("PCIe ACS ID parse error %d\n", ret); -+ goto next; -+ } -+ acs_on_ids[max_acs_id].vendor = val; -+ -+ p += strcspn(p, ":"); -+ if (*p != ':') { -+ pr_warn("PCIe ACS invalid ID\n"); -+ goto next; -+ } -+ -+ p++; -+ snprintf(opt, 5, "%s", p); -+ ret = kstrtol(opt, 16, &val); -+ if (ret) { -+ pr_warn("PCIe ACS ID parse error %d\n", ret); -+ goto next; -+ } -+ acs_on_ids[max_acs_id].device = val; -+ max_acs_id++; -+ } -+next: -+ p += strcspn(p, ","); -+ if (*p == ',') -+ p++; -+ } -+ -+ if (acs_on_downstream || acs_on_multifunction || max_acs_id) -+ pr_warn("Warning: PCIe ACS overrides enabled; This may allow non-IOMMU protected peer-to-peer DMA\n"); -+ -+ return 0; -+} -+early_param("pcie_acs_override", pcie_acs_override_setup); -+ -+static int pcie_acs_overrides(struct pci_dev *dev, u16 acs_flags) -+{ -+ int i; -+ -+ /* Never override ACS for legacy devices or 
devices with ACS caps */ -+ if (!pci_is_pcie(dev) || -+ pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ACS)) -+ return -ENOTTY; -+ -+ for (i = 0; i < max_acs_id; i++) -+ if (acs_on_ids[i].vendor == dev->vendor && -+ acs_on_ids[i].device == dev->device) -+ return 1; -+ -+ switch (pci_pcie_type(dev)) { -+ case PCI_EXP_TYPE_DOWNSTREAM: -+ case PCI_EXP_TYPE_ROOT_PORT: -+ if (acs_on_downstream) -+ return 1; -+ break; -+ case PCI_EXP_TYPE_ENDPOINT: -+ case PCI_EXP_TYPE_UPSTREAM: -+ case PCI_EXP_TYPE_LEG_END: -+ case PCI_EXP_TYPE_RC_END: -+ if (acs_on_multifunction && dev->multifunction) -+ return 1; -+ } -+ -+ return -ENOTTY; -+} - /* - * Some Atheros AR9xxx and QCA988x chips do not behave after a bus reset. - * The device will throw a Link Down error on AER-capable systems and -@@ -4513,6 +4613,7 @@ static const struct pci_dev_acs_enabled { - { PCI_VENDOR_ID_ZHAOXIN, 0x9083, pci_quirk_mf_endpoint_acs }, - /* Zhaoxin Root/Downstream Ports */ - { PCI_VENDOR_ID_ZHAOXIN, PCI_ANY_ID, pci_quirk_zhaoxin_pcie_ports_acs }, -+ { PCI_ANY_ID, PCI_ANY_ID, pcie_acs_overrides }, - { 0 } - }; - - diff --git a/linux510-rc-tkg/linux510-tkg-patches/0007-v5.10-fsync.patch b/linux510-rc-tkg/linux510-tkg-patches/0007-v5.10-fsync.patch deleted file mode 100644 index 47badbb..0000000 --- a/linux510-rc-tkg/linux510-tkg-patches/0007-v5.10-fsync.patch +++ /dev/null @@ -1,597 +0,0 @@ -From 7b5df0248ce255ef5b7204d65a7b3783ebb76a3d Mon Sep 17 00:00:00 2001 -From: Gabriel Krisman Bertazi -Date: Fri, 13 Dec 2019 11:08:02 -0300 -Subject: [PATCH 1/2] futex: Implement mechanism to wait on any of several - futexes -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -This is a new futex operation, called FUTEX_WAIT_MULTIPLE, which allows -a thread to wait on several futexes at the same time, and be awoken by -any of them. In a sense, it implements one of the features that was -supported by pooling on the old FUTEX_FD interface. 
- -The use case lies in the Wine implementation of the Windows NT interface -WaitMultipleObjects. This Windows API function allows a thread to sleep -waiting on the first of a set of event sources (mutexes, timers, signal, -console input, etc) to signal. Considering this is a primitive -synchronization operation for Windows applications, being able to quickly -signal events on the producer side, and quickly go to sleep on the -consumer side is essential for good performance of those running over Wine. - -Wine developers have an implementation that uses eventfd, but it suffers -from FD exhaustion (there is applications that go to the order of -multi-milion FDs), and higher CPU utilization than this new operation. - -The futex list is passed as an array of `struct futex_wait_block` -(pointer, value, bitset) to the kernel, which will enqueue all of them -and sleep if none was already triggered. It returns a hint of which -futex caused the wake up event to userspace, but the hint doesn't -guarantee that is the only futex triggered. Before calling the syscall -again, userspace should traverse the list, trying to re-acquire any of -the other futexes, to prevent an immediate -EWOULDBLOCK return code from -the kernel. - -This was tested using three mechanisms: - -1) By reimplementing FUTEX_WAIT in terms of FUTEX_WAIT_MULTIPLE and -running the unmodified tools/testing/selftests/futex and a full linux -distro on top of this kernel. - -2) By an example code that exercises the FUTEX_WAIT_MULTIPLE path on a -multi-threaded, event-handling setup. - -3) By running the Wine fsync implementation and executing multi-threaded -applications, in particular modern games, on top of this implementation. - -Changes were tested for the following ABIs: x86_64, i386 and x32. 
-Support for x32 applications is not implemented since it would -take a major rework adding a new entry point and splitting the current -futex 64 entry point in two and we can't change the current x32 syscall -number without breaking user space compatibility. - -CC: Steven Rostedt -Cc: Richard Yao -Cc: Thomas Gleixner -Cc: Peter Zijlstra -Co-developed-by: Zebediah Figura -Signed-off-by: Zebediah Figura -Co-developed-by: Steven Noonan -Signed-off-by: Steven Noonan -Co-developed-by: Pierre-Loup A. Griffais -Signed-off-by: Pierre-Loup A. Griffais -Signed-off-by: Gabriel Krisman Bertazi -[Added compatibility code] -Co-developed-by: André Almeida -Signed-off-by: André Almeida - -Adjusted for v5.9: Removed `put_futex_key` calls. ---- - include/uapi/linux/futex.h | 20 +++ - kernel/futex.c | 352 ++++++++++++++++++++++++++++++++++++- - 2 files changed, 370 insertions(+), 2 deletions(-) - -diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h -index a89eb0accd5e2..580001e89c6ca 100644 ---- a/include/uapi/linux/futex.h -+++ b/include/uapi/linux/futex.h -@@ -21,6 +21,7 @@ - #define FUTEX_WAKE_BITSET 10 - #define FUTEX_WAIT_REQUEUE_PI 11 - #define FUTEX_CMP_REQUEUE_PI 12 -+#define FUTEX_WAIT_MULTIPLE 13 - - #define FUTEX_PRIVATE_FLAG 128 - #define FUTEX_CLOCK_REALTIME 256 -@@ -40,6 +41,8 @@ - FUTEX_PRIVATE_FLAG) - #define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \ - FUTEX_PRIVATE_FLAG) -+#define FUTEX_WAIT_MULTIPLE_PRIVATE (FUTEX_WAIT_MULTIPLE | \ -+ FUTEX_PRIVATE_FLAG) - - /* - * Support for robust futexes: the kernel cleans up held futexes at -@@ -150,4 +153,21 @@ struct robust_list_head { - (((op & 0xf) << 28) | ((cmp & 0xf) << 24) \ - | ((oparg & 0xfff) << 12) | (cmparg & 0xfff)) - -+/* -+ * Maximum number of multiple futexes to wait for -+ */ -+#define FUTEX_MULTIPLE_MAX_COUNT 128 -+ -+/** -+ * struct futex_wait_block - Block of futexes to be waited for -+ * @uaddr: User address of the futex -+ * @val: Futex value expected by userspace -+ * 
@bitset: Bitset for the optional bitmasked wakeup -+ */ -+struct futex_wait_block { -+ __u32 __user *uaddr; -+ __u32 val; -+ __u32 bitset; -+}; -+ - #endif /* _UAPI_LINUX_FUTEX_H */ -diff --git a/kernel/futex.c b/kernel/futex.c -index a5876694a60eb..6f4bea76df460 100644 ---- a/kernel/futex.c -+++ b/kernel/futex.c -@@ -197,6 +197,8 @@ struct futex_pi_state { - * @rt_waiter: rt_waiter storage for use with requeue_pi - * @requeue_pi_key: the requeue_pi target futex key - * @bitset: bitset for the optional bitmasked wakeup -+ * @uaddr: userspace address of futex -+ * @uval: expected futex's value - * - * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so - * we can wake only the relevant ones (hashed queues may be shared). -@@ -219,6 +221,8 @@ struct futex_q { - struct rt_mutex_waiter *rt_waiter; - union futex_key *requeue_pi_key; - u32 bitset; -+ u32 __user *uaddr; -+ u32 uval; - } __randomize_layout; - - static const struct futex_q futex_q_init = { -@@ -2304,6 +2308,29 @@ static int unqueue_me(struct futex_q *q) - return ret; - } - -+/** -+ * unqueue_multiple() - Remove several futexes from their futex_hash_bucket -+ * @q: The list of futexes to unqueue -+ * @count: Number of futexes in the list -+ * -+ * Helper to unqueue a list of futexes. This can't fail. -+ * -+ * Return: -+ * - >=0 - Index of the last futex that was awoken; -+ * - -1 - If no futex was awoken -+ */ -+static int unqueue_multiple(struct futex_q *q, int count) -+{ -+ int ret = -1; -+ int i; -+ -+ for (i = 0; i < count; i++) { -+ if (!unqueue_me(&q[i])) -+ ret = i; -+ } -+ return ret; -+} -+ - /* - * PI futexes can not be requeued and must remove themself from the - * hash bucket. The hash bucket lock (i.e. 
lock_ptr) is held on entry -@@ -2662,6 +2689,205 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, - return ret; - } - -+/** -+ * futex_wait_multiple_setup() - Prepare to wait and enqueue multiple futexes -+ * @qs: The corresponding futex list -+ * @count: The size of the lists -+ * @flags: Futex flags (FLAGS_SHARED, etc.) -+ * @awaken: Index of the last awoken futex -+ * -+ * Prepare multiple futexes in a single step and enqueue them. This may fail if -+ * the futex list is invalid or if any futex was already awoken. On success the -+ * task is ready to interruptible sleep. -+ * -+ * Return: -+ * - 1 - One of the futexes was awaken by another thread -+ * - 0 - Success -+ * - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL -+ */ -+static int futex_wait_multiple_setup(struct futex_q *qs, int count, -+ unsigned int flags, int *awaken) -+{ -+ struct futex_hash_bucket *hb; -+ int ret, i; -+ u32 uval; -+ -+ /* -+ * Enqueuing multiple futexes is tricky, because we need to -+ * enqueue each futex in the list before dealing with the next -+ * one to avoid deadlocking on the hash bucket. But, before -+ * enqueuing, we need to make sure that current->state is -+ * TASK_INTERRUPTIBLE, so we don't absorb any awake events, which -+ * cannot be done before the get_futex_key of the next key, -+ * because it calls get_user_pages, which can sleep. Thus, we -+ * fetch the list of futexes keys in two steps, by first pinning -+ * all the memory keys in the futex key, and only then we read -+ * each key and queue the corresponding futex. 
-+ */ -+retry: -+ for (i = 0; i < count; i++) { -+ qs[i].key = FUTEX_KEY_INIT; -+ ret = get_futex_key(qs[i].uaddr, flags & FLAGS_SHARED, -+ &qs[i].key, FUTEX_READ); -+ if (unlikely(ret)) { -+ return ret; -+ } -+ } -+ -+ set_current_state(TASK_INTERRUPTIBLE); -+ -+ for (i = 0; i < count; i++) { -+ struct futex_q *q = &qs[i]; -+ -+ hb = queue_lock(q); -+ -+ ret = get_futex_value_locked(&uval, q->uaddr); -+ if (ret) { -+ /* -+ * We need to try to handle the fault, which -+ * cannot be done without sleep, so we need to -+ * undo all the work already done, to make sure -+ * we don't miss any wake ups. Therefore, clean -+ * up, handle the fault and retry from the -+ * beginning. -+ */ -+ queue_unlock(hb); -+ -+ /* -+ * Keys 0..(i-1) are implicitly put -+ * on unqueue_multiple. -+ */ -+ *awaken = unqueue_multiple(qs, i); -+ -+ __set_current_state(TASK_RUNNING); -+ -+ /* -+ * On a real fault, prioritize the error even if -+ * some other futex was awoken. Userspace gave -+ * us a bad address, -EFAULT them. -+ */ -+ ret = get_user(uval, q->uaddr); -+ if (ret) -+ return ret; -+ -+ /* -+ * Even if the page fault was handled, If -+ * something was already awaken, we can safely -+ * give up and succeed to give a hint for userspace to -+ * acquire the right futex faster. -+ */ -+ if (*awaken >= 0) -+ return 1; -+ -+ goto retry; -+ } -+ -+ if (uval != q->uval) { -+ queue_unlock(hb); -+ -+ /* -+ * If something was already awaken, we can -+ * safely ignore the error and succeed. -+ */ -+ *awaken = unqueue_multiple(qs, i); -+ __set_current_state(TASK_RUNNING); -+ if (*awaken >= 0) -+ return 1; -+ -+ return -EWOULDBLOCK; -+ } -+ -+ /* -+ * The bucket lock can't be held while dealing with the -+ * next futex. Queue each futex at this moment so hb can -+ * be unlocked. 
-+ */ -+ queue_me(&qs[i], hb); -+ } -+ return 0; -+} -+ -+/** -+ * futex_wait_multiple() - Prepare to wait on and enqueue several futexes -+ * @qs: The list of futexes to wait on -+ * @op: Operation code from futex's syscall -+ * @count: The number of objects -+ * @abs_time: Timeout before giving up and returning to userspace -+ * -+ * Entry point for the FUTEX_WAIT_MULTIPLE futex operation, this function -+ * sleeps on a group of futexes and returns on the first futex that -+ * triggered, or after the timeout has elapsed. -+ * -+ * Return: -+ * - >=0 - Hint to the futex that was awoken -+ * - <0 - On error -+ */ -+static int futex_wait_multiple(struct futex_q *qs, int op, -+ u32 count, ktime_t *abs_time) -+{ -+ struct hrtimer_sleeper timeout, *to; -+ int ret, flags = 0, hint = 0; -+ unsigned int i; -+ -+ if (!(op & FUTEX_PRIVATE_FLAG)) -+ flags |= FLAGS_SHARED; -+ -+ if (op & FUTEX_CLOCK_REALTIME) -+ flags |= FLAGS_CLOCKRT; -+ -+ to = futex_setup_timer(abs_time, &timeout, flags, 0); -+ while (1) { -+ ret = futex_wait_multiple_setup(qs, count, flags, &hint); -+ if (ret) { -+ if (ret > 0) { -+ /* A futex was awaken during setup */ -+ ret = hint; -+ } -+ break; -+ } -+ -+ if (to) -+ hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); -+ -+ /* -+ * Avoid sleeping if another thread already tried to -+ * wake us. -+ */ -+ for (i = 0; i < count; i++) { -+ if (plist_node_empty(&qs[i].list)) -+ break; -+ } -+ -+ if (i == count && (!to || to->task)) -+ freezable_schedule(); -+ -+ ret = unqueue_multiple(qs, count); -+ -+ __set_current_state(TASK_RUNNING); -+ -+ if (ret >= 0) -+ break; -+ if (to && !to->task) { -+ ret = -ETIMEDOUT; -+ break; -+ } else if (signal_pending(current)) { -+ ret = -ERESTARTSYS; -+ break; -+ } -+ /* -+ * The final case is a spurious wakeup, for -+ * which just retry. 
-+ */ -+ } -+ -+ if (to) { -+ hrtimer_cancel(&to->timer); -+ destroy_hrtimer_on_stack(&to->timer); -+ } -+ -+ return ret; -+} -+ - static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, - ktime_t *abs_time, u32 bitset) - { -@@ -3774,6 +4000,43 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, - return -ENOSYS; - } - -+/** -+ * futex_read_wait_block - Read an array of futex_wait_block from userspace -+ * @uaddr: Userspace address of the block -+ * @count: Number of blocks to be read -+ * -+ * This function creates and allocate an array of futex_q (we zero it to -+ * initialize the fields) and then, for each futex_wait_block element from -+ * userspace, fill a futex_q element with proper values. -+ */ -+inline struct futex_q *futex_read_wait_block(u32 __user *uaddr, u32 count) -+{ -+ unsigned int i; -+ struct futex_q *qs; -+ struct futex_wait_block fwb; -+ struct futex_wait_block __user *entry = -+ (struct futex_wait_block __user *)uaddr; -+ -+ if (!count || count > FUTEX_MULTIPLE_MAX_COUNT) -+ return ERR_PTR(-EINVAL); -+ -+ qs = kcalloc(count, sizeof(*qs), GFP_KERNEL); -+ if (!qs) -+ return ERR_PTR(-ENOMEM); -+ -+ for (i = 0; i < count; i++) { -+ if (copy_from_user(&fwb, &entry[i], sizeof(fwb))) { -+ kfree(qs); -+ return ERR_PTR(-EFAULT); -+ } -+ -+ qs[i].uaddr = fwb.uaddr; -+ qs[i].uval = fwb.val; -+ qs[i].bitset = fwb.bitset; -+ } -+ -+ return qs; -+} - - SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, - struct __kernel_timespec __user *, utime, u32 __user *, uaddr2, -@@ -3786,7 +4049,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, - - if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || - cmd == FUTEX_WAIT_BITSET || -- cmd == FUTEX_WAIT_REQUEUE_PI)) { -+ cmd == FUTEX_WAIT_REQUEUE_PI || -+ cmd == FUTEX_WAIT_MULTIPLE)) { - if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) - return -EFAULT; - if (get_timespec64(&ts, utime)) -@@ -3807,6 +4071,25 @@ SYSCALL_DEFINE6(futex, u32 
__user *, uaddr, int, op, u32, val, - cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) - val2 = (u32) (unsigned long) utime; - -+ if (cmd == FUTEX_WAIT_MULTIPLE) { -+ int ret; -+ struct futex_q *qs; -+ -+#ifdef CONFIG_X86_X32 -+ if (unlikely(in_x32_syscall())) -+ return -ENOSYS; -+#endif -+ qs = futex_read_wait_block(uaddr, val); -+ -+ if (IS_ERR(qs)) -+ return PTR_ERR(qs); -+ -+ ret = futex_wait_multiple(qs, op, val, tp); -+ kfree(qs); -+ -+ return ret; -+ } -+ - return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); - } - -@@ -3969,6 +4252,57 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, - #endif /* CONFIG_COMPAT */ - - #ifdef CONFIG_COMPAT_32BIT_TIME -+/** -+ * struct compat_futex_wait_block - Block of futexes to be waited for -+ * @uaddr: User address of the futex (compatible pointer) -+ * @val: Futex value expected by userspace -+ * @bitset: Bitset for the optional bitmasked wakeup -+ */ -+struct compat_futex_wait_block { -+ compat_uptr_t uaddr; -+ __u32 val; -+ __u32 bitset; -+}; -+ -+/** -+ * compat_futex_read_wait_block - Read an array of futex_wait_block from -+ * userspace -+ * @uaddr: Userspace address of the block -+ * @count: Number of blocks to be read -+ * -+ * This function does the same as futex_read_wait_block(), except that it -+ * converts the pointer to the futex from the compat version to the regular one. 
-+ */ -+inline struct futex_q *compat_futex_read_wait_block(u32 __user *uaddr, -+ u32 count) -+{ -+ unsigned int i; -+ struct futex_q *qs; -+ struct compat_futex_wait_block fwb; -+ struct compat_futex_wait_block __user *entry = -+ (struct compat_futex_wait_block __user *)uaddr; -+ -+ if (!count || count > FUTEX_MULTIPLE_MAX_COUNT) -+ return ERR_PTR(-EINVAL); -+ -+ qs = kcalloc(count, sizeof(*qs), GFP_KERNEL); -+ if (!qs) -+ return ERR_PTR(-ENOMEM); -+ -+ for (i = 0; i < count; i++) { -+ if (copy_from_user(&fwb, &entry[i], sizeof(fwb))) { -+ kfree(qs); -+ return ERR_PTR(-EFAULT); -+ } -+ -+ qs[i].uaddr = compat_ptr(fwb.uaddr); -+ qs[i].uval = fwb.val; -+ qs[i].bitset = fwb.bitset; -+ } -+ -+ return qs; -+} -+ - SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, - struct old_timespec32 __user *, utime, u32 __user *, uaddr2, - u32, val3) -@@ -3980,7 +4314,8 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, - - if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || - cmd == FUTEX_WAIT_BITSET || -- cmd == FUTEX_WAIT_REQUEUE_PI)) { -+ cmd == FUTEX_WAIT_REQUEUE_PI || -+ cmd == FUTEX_WAIT_MULTIPLE)) { - if (get_old_timespec32(&ts, utime)) - return -EFAULT; - if (!timespec64_valid(&ts)) -@@ -3995,6 +4330,19 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, - cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) - val2 = (int) (unsigned long) utime; - -+ if (cmd == FUTEX_WAIT_MULTIPLE) { -+ int ret; -+ struct futex_q *qs = compat_futex_read_wait_block(uaddr, val); -+ -+ if (IS_ERR(qs)) -+ return PTR_ERR(qs); -+ -+ ret = futex_wait_multiple(qs, op, val, tp); -+ kfree(qs); -+ -+ return ret; -+ } -+ - return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); - } - #endif /* CONFIG_COMPAT_32BIT_TIME */ - -From ccdddb50d330d2ee1a4d2cbfdd27bdd7fb10eec3 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Andr=C3=A9=20Almeida?= -Date: Fri, 7 Feb 2020 23:28:02 -0300 -Subject: [PATCH 2/2] futex: Add Proton compatibility code - 
---- - include/uapi/linux/futex.h | 2 +- - kernel/futex.c | 5 +++-- - 2 files changed, 4 insertions(+), 3 deletions(-) - -diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h -index 580001e89c6ca..a3e760886b8e7 100644 ---- a/include/uapi/linux/futex.h -+++ b/include/uapi/linux/futex.h -@@ -21,7 +21,7 @@ - #define FUTEX_WAKE_BITSET 10 - #define FUTEX_WAIT_REQUEUE_PI 11 - #define FUTEX_CMP_REQUEUE_PI 12 --#define FUTEX_WAIT_MULTIPLE 13 -+#define FUTEX_WAIT_MULTIPLE 31 - - #define FUTEX_PRIVATE_FLAG 128 - #define FUTEX_CLOCK_REALTIME 256 -diff --git a/kernel/futex.c b/kernel/futex.c -index 6f4bea76df460..03d89fe7b8392 100644 ---- a/kernel/futex.c -+++ b/kernel/futex.c -@@ -4059,7 +4059,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, - return -EINVAL; - - t = timespec64_to_ktime(ts); -- if (cmd == FUTEX_WAIT) -+ if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) - t = ktime_add_safe(ktime_get(), t); - tp = &t; - } -@@ -4260,6 +4260,7 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, - */ - struct compat_futex_wait_block { - compat_uptr_t uaddr; -+ __u32 pad; - __u32 val; - __u32 bitset; - }; -@@ -4322,7 +4323,7 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, - return -EINVAL; - - t = timespec64_to_ktime(ts); -- if (cmd == FUTEX_WAIT) -+ if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) - t = ktime_add_safe(ktime_get(), t); - tp = &t; - } diff --git a/linux510-rc-tkg/linux510-tkg-patches/0009-glitched-bmq.patch b/linux510-rc-tkg/linux510-tkg-patches/0009-glitched-bmq.patch deleted file mode 100644 index e42e522..0000000 --- a/linux510-rc-tkg/linux510-tkg-patches/0009-glitched-bmq.patch +++ /dev/null @@ -1,90 +0,0 @@ -From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 -From: Tk-Glitch -Date: Wed, 4 Jul 2018 04:30:08 +0200 -Subject: glitched - BMQ - -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 2a202a846757..1d9c7ed79b11 100644 ---- a/kernel/Kconfig.hz -+++ 
b/kernel/Kconfig.hz -@@ -4,7 +4,7 @@ - - choice - prompt "Timer frequency" -- default HZ_250 -+ default HZ_500 - help - Allows the configuration of the timer frequency. It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -39,6 +39,13 @@ choice - on SMP and NUMA systems and exactly dividing by both PAL and - NTSC frame rates for video and multimedia work. - -+ config HZ_500 -+ bool "500 HZ" -+ help -+ 500 Hz is a balanced timer frequency. Provides fast interactivity -+ on desktops with great smoothness without increasing CPU power -+ consumption and sacrificing the battery life on laptops. -+ - config HZ_1000 - bool "1000 HZ" - help -@@ -52,6 +59,7 @@ config HZ - default 100 if HZ_100 - default 250 if HZ_250 - default 300 if HZ_300 -+ default 500 if HZ_500 - default 1000 if HZ_1000 - - config SCHED_HRTICK - -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 2a202a846757..1d9c7ed79b11 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -4,7 +4,7 @@ - - choice - prompt "Timer frequency" -- default HZ_500 -+ default HZ_750 - help - Allows the configuration of the timer frequency. It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -46,6 +46,13 @@ choice - on desktops with great smoothness without increasing CPU power - consumption and sacrificing the battery life on laptops. - -+ config HZ_750 -+ bool "750 HZ" -+ help -+ 750 Hz is a good timer frequency for desktops. Provides fast -+ interactivity with great smoothness without sacrificing too -+ much throughput. -+ - config HZ_1000 - bool "1000 HZ" - help -@@ -60,6 +67,7 @@ config HZ - default 250 if HZ_250 - default 300 if HZ_300 - default 500 if HZ_500 -+ default 750 if HZ_750 - default 1000 if HZ_1000 - - config SCHED_HRTICK - -diff --git a/mm/vmscan.c b/mm/vmscan.c -index 9270a4370d54..30d01e647417 100644 ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -169,7 +169,7 @@ - /* - * From 0 .. 200. Higher means more swappy. 
- */ --int vm_swappiness = 60; -+int vm_swappiness = 20; - - static void set_task_reclaim_state(struct task_struct *task, - struct reclaim_state *rs) diff --git a/linux510-rc-tkg/linux510-tkg-patches/0009-glitched-ondemand-bmq.patch b/linux510-rc-tkg/linux510-tkg-patches/0009-glitched-ondemand-bmq.patch deleted file mode 100644 index a926040..0000000 --- a/linux510-rc-tkg/linux510-tkg-patches/0009-glitched-ondemand-bmq.patch +++ /dev/null @@ -1,18 +0,0 @@ -diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c -index 6b423eebfd5d..61e3271675d6 100644 ---- a/drivers/cpufreq/cpufreq_ondemand.c -+++ b/drivers/cpufreq/cpufreq_ondemand.c -@@ -21,10 +21,10 @@ - #include "cpufreq_ondemand.h" - - /* On-demand governor macros */ --#define DEF_FREQUENCY_UP_THRESHOLD (80) --#define DEF_SAMPLING_DOWN_FACTOR (1) -+#define DEF_FREQUENCY_UP_THRESHOLD (55) -+#define DEF_SAMPLING_DOWN_FACTOR (5) - #define MAX_SAMPLING_DOWN_FACTOR (100000) --#define MICRO_FREQUENCY_UP_THRESHOLD (95) -+#define MICRO_FREQUENCY_UP_THRESHOLD (63) - #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) - #define MIN_FREQUENCY_UP_THRESHOLD (1) - #define MAX_FREQUENCY_UP_THRESHOLD (100) diff --git a/linux510-rc-tkg/linux510-tkg-patches/0009-prjc_v5.10-r0.patch b/linux510-rc-tkg/linux510-tkg-patches/0009-prjc_v5.10-r0.patch deleted file mode 100644 index 550d29c..0000000 --- a/linux510-rc-tkg/linux510-tkg-patches/0009-prjc_v5.10-r0.patch +++ /dev/null @@ -1,8809 +0,0 @@ -diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index a1068742a6df..b97a9697fde4 100644 ---- a/Documentation/admin-guide/kernel-parameters.txt -+++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -4611,6 +4611,12 @@ - - sbni= [NET] Granch SBNI12 leased line adapter - -+ sched_timeslice= -+ [KNL] Time slice in us for BMQ/PDS scheduler. 
-+ Format: (must be >= 1000) -+ Default: 4000 -+ See Documentation/scheduler/sched-BMQ.txt -+ - sched_debug [KNL] Enables verbose scheduler debug messages. - - schedstats= [KNL,X86] Enable or disable scheduled statistics. -diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst -index d4b32cc32bb7..14118e5168ef 100644 ---- a/Documentation/admin-guide/sysctl/kernel.rst -+++ b/Documentation/admin-guide/sysctl/kernel.rst -@@ -1515,3 +1515,13 @@ is 10 seconds. - - The softlockup threshold is (``2 * watchdog_thresh``). Setting this - tunable to zero will disable lockup detection altogether. -+ -+yield_type: -+=========== -+ -+BMQ/PDS CPU scheduler only. This determines what type of yield calls -+to sched_yield will perform. -+ -+ 0 - No yield. -+ 1 - Deboost and requeue task. (default) -+ 2 - Set run queue skip task. -diff --git a/Documentation/scheduler/sched-BMQ.txt b/Documentation/scheduler/sched-BMQ.txt -new file mode 100644 -index 000000000000..05c84eec0f31 ---- /dev/null -+++ b/Documentation/scheduler/sched-BMQ.txt -@@ -0,0 +1,110 @@ -+ BitMap queue CPU Scheduler -+ -------------------------- -+ -+CONTENT -+======== -+ -+ Background -+ Design -+ Overview -+ Task policy -+ Priority management -+ BitMap Queue -+ CPU Assignment and Migration -+ -+ -+Background -+========== -+ -+BitMap Queue CPU scheduler, referred to as BMQ from here on, is an evolution -+of previous Priority and Deadline based Skiplist multiple queue scheduler(PDS), -+and inspired by Zircon scheduler. The goal of it is to keep the scheduler code -+simple, while efficiency and scalable for interactive tasks, such as desktop, -+movie playback and gaming etc. -+ -+Design -+====== -+ -+Overview -+-------- -+ -+BMQ use per CPU run queue design, each CPU(logical) has it's own run queue, -+each CPU is responsible for scheduling the tasks that are putting into it's -+run queue. -+ -+The run queue is a set of priority queues. 
Note that these queues are fifo -+queue for non-rt tasks or priority queue for rt tasks in data structure. See -+BitMap Queue below for details. BMQ is optimized for non-rt tasks in the fact -+that most applications are non-rt tasks. No matter the queue is fifo or -+priority, In each queue is an ordered list of runnable tasks awaiting execution -+and the data structures are the same. When it is time for a new task to run, -+the scheduler simply looks the lowest numbered queueue that contains a task, -+and runs the first task from the head of that queue. And per CPU idle task is -+also in the run queue, so the scheduler can always find a task to run on from -+its run queue. -+ -+Each task will assigned the same timeslice(default 4ms) when it is picked to -+start running. Task will be reinserted at the end of the appropriate priority -+queue when it uses its whole timeslice. When the scheduler selects a new task -+from the priority queue it sets the CPU's preemption timer for the remainder of -+the previous timeslice. When that timer fires the scheduler will stop execution -+on that task, select another task and start over again. -+ -+If a task blocks waiting for a shared resource then it's taken out of its -+priority queue and is placed in a wait queue for the shared resource. When it -+is unblocked it will be reinserted in the appropriate priority queue of an -+eligible CPU. -+ -+Task policy -+----------- -+ -+BMQ supports DEADLINE, FIFO, RR, NORMAL, BATCH and IDLE task policy like the -+mainline CFS scheduler. But BMQ is heavy optimized for non-rt task, that's -+NORMAL/BATCH/IDLE policy tasks. Below is the implementation detail of each -+policy. -+ -+DEADLINE -+ It is squashed as priority 0 FIFO task. -+ -+FIFO/RR -+ All RT tasks share one single priority queue in BMQ run queue designed. The -+complexity of insert operation is O(n). BMQ is not designed for system runs -+with major rt policy tasks. 
-+ -+NORMAL/BATCH/IDLE -+ BATCH and IDLE tasks are treated as the same policy. They compete CPU with -+NORMAL policy tasks, but they just don't boost. To control the priority of -+NORMAL/BATCH/IDLE tasks, simply use nice level. -+ -+ISO -+ ISO policy is not supported in BMQ. Please use nice level -20 NORMAL policy -+task instead. -+ -+Priority management -+------------------- -+ -+RT tasks have priority from 0-99. For non-rt tasks, there are three different -+factors used to determine the effective priority of a task. The effective -+priority being what is used to determine which queue it will be in. -+ -+The first factor is simply the task’s static priority. Which is assigned from -+task's nice level, within [-20, 19] in userland's point of view and [0, 39] -+internally. -+ -+The second factor is the priority boost. This is a value bounded between -+[-MAX_PRIORITY_ADJ, MAX_PRIORITY_ADJ] used to offset the base priority, it is -+modified by the following cases: -+ -+*When a thread has used up its entire timeslice, always deboost its boost by -+increasing by one. -+*When a thread gives up cpu control(voluntary or non-voluntary) to reschedule, -+and its switch-in time(time after last switch and run) below the thredhold -+based on its priority boost, will boost its boost by decreasing by one buti is -+capped at 0 (won’t go negative). -+ -+The intent in this system is to ensure that interactive threads are serviced -+quickly. These are usually the threads that interact directly with the user -+and cause user-perceivable latency. These threads usually do little work and -+spend most of their time blocked awaiting another user event. So they get the -+priority boost from unblocking while background threads that do most of the -+processing receive the priority penalty for using their entire timeslice. 
-diff --git a/fs/proc/base.c b/fs/proc/base.c -index 617db4e0faa0..f85926764f9a 100644 ---- a/fs/proc/base.c -+++ b/fs/proc/base.c -@@ -479,7 +479,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, - seq_puts(m, "0 0 0\n"); - else - seq_printf(m, "%llu %llu %lu\n", -- (unsigned long long)task->se.sum_exec_runtime, -+ (unsigned long long)tsk_seruntime(task), - (unsigned long long)task->sched_info.run_delay, - task->sched_info.pcount); - -diff --git a/include/asm-generic/resource.h b/include/asm-generic/resource.h -index 8874f681b056..59eb72bf7d5f 100644 ---- a/include/asm-generic/resource.h -+++ b/include/asm-generic/resource.h -@@ -23,7 +23,7 @@ - [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY }, \ - [RLIMIT_SIGPENDING] = { 0, 0 }, \ - [RLIMIT_MSGQUEUE] = { MQ_BYTES_MAX, MQ_BYTES_MAX }, \ -- [RLIMIT_NICE] = { 0, 0 }, \ -+ [RLIMIT_NICE] = { 30, 30 }, \ - [RLIMIT_RTPRIO] = { 0, 0 }, \ - [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY }, \ - } -diff --git a/include/linux/sched.h b/include/linux/sched.h -index afe01e232935..8918609cb9f0 100644 ---- a/include/linux/sched.h -+++ b/include/linux/sched.h -@@ -34,6 +34,7 @@ - #include - #include - #include -+#include - - /* task_struct member predeclarations (sorted alphabetically): */ - struct audit_context; -@@ -652,12 +653,18 @@ struct task_struct { - unsigned int ptrace; - - #ifdef CONFIG_SMP -- int on_cpu; - struct __call_single_node wake_entry; -+#endif -+#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_ALT) -+ int on_cpu; -+#endif -+ -+#ifdef CONFIG_SMP - #ifdef CONFIG_THREAD_INFO_IN_TASK - /* Current CPU: */ - unsigned int cpu; - #endif -+#ifndef CONFIG_SCHED_ALT - unsigned int wakee_flips; - unsigned long wakee_flip_decay_ts; - struct task_struct *last_wakee; -@@ -671,6 +678,7 @@ struct task_struct { - */ - int recent_used_cpu; - int wake_cpu; -+#endif /* !CONFIG_SCHED_ALT */ - #endif - int on_rq; - -@@ -679,13 +687,33 @@ struct task_struct { - int normal_prio; - unsigned int 
rt_priority; - -+#ifdef CONFIG_SCHED_ALT -+ u64 last_ran; -+ s64 time_slice; -+#ifdef CONFIG_SCHED_BMQ -+ int boost_prio; -+ int bmq_idx; -+ struct list_head bmq_node; -+#endif /* CONFIG_SCHED_BMQ */ -+#ifdef CONFIG_SCHED_PDS -+ u64 deadline; -+ u64 priodl; -+ /* skip list level */ -+ int sl_level; -+ /* skip list node */ -+ struct skiplist_node sl_node; -+#endif /* CONFIG_SCHED_PDS */ -+ /* sched_clock time spent running */ -+ u64 sched_time; -+#else /* !CONFIG_SCHED_ALT */ - const struct sched_class *sched_class; - struct sched_entity se; - struct sched_rt_entity rt; -+ struct sched_dl_entity dl; -+#endif - #ifdef CONFIG_CGROUP_SCHED - struct task_group *sched_task_group; - #endif -- struct sched_dl_entity dl; - - #ifdef CONFIG_UCLAMP_TASK - /* -@@ -1332,6 +1360,15 @@ struct task_struct { - */ - }; - -+#ifdef CONFIG_SCHED_ALT -+#define tsk_seruntime(t) ((t)->sched_time) -+/* replace the uncertian rt_timeout with 0UL */ -+#define tsk_rttimeout(t) (0UL) -+#else /* CFS */ -+#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) -+#define tsk_rttimeout(t) ((t)->rt.timeout) -+#endif /* !CONFIG_SCHED_ALT */ -+ - static inline struct pid *task_pid(struct task_struct *task) - { - return task->thread_pid; -diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h -index 1aff00b65f3c..179d77c8360e 100644 ---- a/include/linux/sched/deadline.h -+++ b/include/linux/sched/deadline.h -@@ -1,5 +1,24 @@ - /* SPDX-License-Identifier: GPL-2.0 */ - -+#ifdef CONFIG_SCHED_ALT -+ -+static inline int dl_task(struct task_struct *p) -+{ -+ return 0; -+} -+ -+#ifdef CONFIG_SCHED_BMQ -+#define __tsk_deadline(p) (0UL) -+#endif -+ -+#ifdef CONFIG_SCHED_PDS -+#define __tsk_deadline(p) ((p)->priodl) -+#endif -+ -+#else -+ -+#define __tsk_deadline(p) ((p)->dl.deadline) -+ - /* - * SCHED_DEADLINE tasks has negative priorities, reflecting - * the fact that any of them has higher prio than RT and -@@ -19,6 +38,7 @@ static inline int dl_task(struct task_struct *p) - { - return 
dl_prio(p->prio); - } -+#endif /* CONFIG_SCHED_ALT */ - - static inline bool dl_time_before(u64 a, u64 b) - { -diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h -index 7d64feafc408..42730d27ceb5 100644 ---- a/include/linux/sched/prio.h -+++ b/include/linux/sched/prio.h -@@ -20,11 +20,20 @@ - */ - - #define MAX_USER_RT_PRIO 100 -+ - #define MAX_RT_PRIO MAX_USER_RT_PRIO - - #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) - #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) - -+/* +/- priority levels from the base priority */ -+#ifdef CONFIG_SCHED_BMQ -+#define MAX_PRIORITY_ADJ 7 -+#endif -+#ifdef CONFIG_SCHED_PDS -+#define MAX_PRIORITY_ADJ 0 -+#endif -+ - /* - * Convert user-nice values [ -20 ... 0 ... 19 ] - * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], -diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h -index e5af028c08b4..0a7565d0d3cf 100644 ---- a/include/linux/sched/rt.h -+++ b/include/linux/sched/rt.h -@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) - - if (policy == SCHED_FIFO || policy == SCHED_RR) - return true; -+#ifndef CONFIG_SCHED_ALT - if (policy == SCHED_DEADLINE) - return true; -+#endif - return false; - } - -diff --git a/include/linux/skip_list.h b/include/linux/skip_list.h -new file mode 100644 -index 000000000000..47ca955a451d ---- /dev/null -+++ b/include/linux/skip_list.h -@@ -0,0 +1,177 @@ -+/* -+ * Copyright (C) 2016 Alfred Chen. -+ * -+ * Code based on Con Kolivas's skip list implementation for BFS, and -+ * which is based on example originally by William Pugh. -+ * -+ * Skip Lists are a probabilistic alternative to balanced trees, as -+ * described in the June 1990 issue of CACM and were invented by -+ * William Pugh in 1987. -+ * -+ * A couple of comments about this implementation: -+ * -+ * This file only provides a infrastructure of skip list. 
-+ * -+ * skiplist_node is embedded into container data structure, to get rid -+ * the dependency of kmalloc/kfree operation in scheduler code. -+ * -+ * A customized search function should be defined using DEFINE_SKIPLIST_INSERT -+ * macro and be used for skip list insert operation. -+ * -+ * Random Level is also not defined in this file, instead, it should be -+ * customized implemented and set to node->level then pass to the customized -+ * skiplist_insert function. -+ * -+ * Levels start at zero and go up to (NUM_SKIPLIST_LEVEL -1) -+ * -+ * NUM_SKIPLIST_LEVEL in this implementation is 8 instead of origin 16, -+ * considering that there will be 256 entries to enable the top level when using -+ * random level p=0.5, and that number is more than enough for a run queue usage -+ * in a scheduler usage. And it also help to reduce the memory usage of the -+ * embedded skip list node in task_struct to about 50%. -+ * -+ * The insertion routine has been implemented so as to use the -+ * dirty hack described in the CACM paper: if a random level is -+ * generated that is more than the current maximum level, the -+ * current maximum level plus one is used instead. -+ * -+ * BFS Notes: In this implementation of skiplists, there are bidirectional -+ * next/prev pointers and the insert function returns a pointer to the actual -+ * node the value is stored. The key here is chosen by the scheduler so as to -+ * sort tasks according to the priority list requirements and is no longer used -+ * by the scheduler after insertion. The scheduler lookup, however, occurs in -+ * O(1) time because it is always the first item in the level 0 linked list. -+ * Since the task struct stores a copy of the node pointer upon skiplist_insert, -+ * it can also remove it much faster than the original implementation with the -+ * aid of prev<->next pointer manipulation and no searching. 
-+ */ -+#ifndef _LINUX_SKIP_LIST_H -+#define _LINUX_SKIP_LIST_H -+ -+#include -+ -+#define NUM_SKIPLIST_LEVEL (8) -+ -+struct skiplist_node { -+ int level; /* Levels in this node */ -+ struct skiplist_node *next[NUM_SKIPLIST_LEVEL]; -+ struct skiplist_node *prev[NUM_SKIPLIST_LEVEL]; -+}; -+ -+#define SKIPLIST_NODE_INIT(name) { 0,\ -+ {&name, &name, &name, &name,\ -+ &name, &name, &name, &name},\ -+ {&name, &name, &name, &name,\ -+ &name, &name, &name, &name},\ -+ } -+ -+static inline void INIT_SKIPLIST_NODE(struct skiplist_node *node) -+{ -+ /* only level 0 ->next matters in skiplist_empty() */ -+ WRITE_ONCE(node->next[0], node); -+} -+ -+/** -+ * FULL_INIT_SKIPLIST_NODE -- fully init a skiplist_node, expecially for header -+ * @node: the skip list node to be inited. -+ */ -+static inline void FULL_INIT_SKIPLIST_NODE(struct skiplist_node *node) -+{ -+ int i; -+ -+ node->level = 0; -+ for (i = 0; i < NUM_SKIPLIST_LEVEL; i++) { -+ WRITE_ONCE(node->next[i], node); -+ node->prev[i] = node; -+ } -+} -+ -+/** -+ * skiplist_empty - test whether a skip list is empty -+ * @head: the skip list to test. -+ */ -+static inline int skiplist_empty(const struct skiplist_node *head) -+{ -+ return READ_ONCE(head->next[0]) == head; -+} -+ -+/** -+ * skiplist_entry - get the struct for this entry -+ * @ptr: the &struct skiplist_node pointer. -+ * @type: the type of the struct this is embedded in. -+ * @member: the name of the skiplist_node within the struct. 
-+ */ -+#define skiplist_entry(ptr, type, member) \ -+ container_of(ptr, type, member) -+ -+/** -+ * DEFINE_SKIPLIST_INSERT_FUNC -- macro to define a customized skip list insert -+ * function, which takes two parameters, first one is the header node of the -+ * skip list, second one is the skip list node to be inserted -+ * @func_name: the customized skip list insert function name -+ * @search_func: the search function to be used, which takes two parameters, -+ * 1st one is the itrator of skiplist_node in the list, the 2nd is the skip list -+ * node to be inserted, the function should return true if search should be -+ * continued, otherwise return false. -+ * Returns 1 if @node is inserted as the first item of skip list at level zero, -+ * otherwise 0 -+ */ -+#define DEFINE_SKIPLIST_INSERT_FUNC(func_name, search_func)\ -+static inline int func_name(struct skiplist_node *head, struct skiplist_node *node)\ -+{\ -+ struct skiplist_node *update[NUM_SKIPLIST_LEVEL];\ -+ struct skiplist_node *p, *q;\ -+ int k = head->level;\ -+\ -+ p = head;\ -+ do {\ -+ while (q = p->next[k], q != head && search_func(q, node))\ -+ p = q;\ -+ update[k] = p;\ -+ } while (--k >= 0);\ -+\ -+ k = node->level;\ -+ if (unlikely(k > head->level)) {\ -+ node->level = k = ++head->level;\ -+ update[k] = head;\ -+ }\ -+\ -+ do {\ -+ p = update[k];\ -+ q = p->next[k];\ -+ node->next[k] = q;\ -+ p->next[k] = node;\ -+ node->prev[k] = p;\ -+ q->prev[k] = node;\ -+ } while (--k >= 0);\ -+\ -+ return (p == head);\ -+} -+ -+/** -+ * skiplist_del_init -- delete skip list node from a skip list and reset it's -+ * init state -+ * @head: the header node of the skip list to be deleted from. -+ * @node: the skip list node to be deleted, the caller need to ensure @node is -+ * in skip list which @head represent. 
-+ * Returns 1 if @node is the first item of skip level at level zero, otherwise 0 -+ */ -+static inline int -+skiplist_del_init(struct skiplist_node *head, struct skiplist_node *node) -+{ -+ int l, m = node->level; -+ -+ for (l = 0; l <= m; l++) { -+ node->prev[l]->next[l] = node->next[l]; -+ node->next[l]->prev[l] = node->prev[l]; -+ } -+ if (m == head->level && m > 0) { -+ while (head->next[m] == head && m > 0) -+ m--; -+ head->level = m; -+ } -+ INIT_SKIPLIST_NODE(node); -+ -+ return (node->prev[0] == head); -+} -+#endif /* _LINUX_SKIP_LIST_H */ -diff --git a/init/Kconfig b/init/Kconfig -index d6a0b31b13dc..2122dba5596f 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -770,9 +770,39 @@ config GENERIC_SCHED_CLOCK - - menu "Scheduler features" - -+menuconfig SCHED_ALT -+ bool "Alternative CPU Schedulers" -+ default y -+ help -+ This feature enable alternative CPU scheduler" -+ -+if SCHED_ALT -+ -+choice -+ prompt "Alternative CPU Scheduler" -+ default SCHED_BMQ -+ -+config SCHED_BMQ -+ bool "BMQ CPU scheduler" -+ help -+ The BitMap Queue CPU scheduler for excellent interactivity and -+ responsiveness on the desktop and solid scalability on normal -+ hardware and commodity servers. -+ -+config SCHED_PDS -+ bool "PDS CPU scheduler" -+ help -+ The Priority and Deadline based Skip list multiple queue CPU -+ Scheduler. -+ -+endchoice -+ -+endif -+ - config UCLAMP_TASK - bool "Enable utilization clamping for RT/FAIR tasks" - depends on CPU_FREQ_GOV_SCHEDUTIL -+ depends on !SCHED_ALT - help - This feature enables the scheduler to track the clamped utilization - of each CPU based on RUNNABLE tasks scheduled on that CPU. -@@ -858,6 +888,7 @@ config NUMA_BALANCING - depends on ARCH_SUPPORTS_NUMA_BALANCING - depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY - depends on SMP && NUMA && MIGRATION -+ depends on !SCHED_ALT - help - This option adds support for automatic NUMA aware memory/task placement. 
- The mechanism is quite primitive and is based on migrating memory when -@@ -944,7 +975,7 @@ menuconfig CGROUP_SCHED - bandwidth allocation to such task groups. It uses cgroups to group - tasks. - --if CGROUP_SCHED -+if CGROUP_SCHED && !SCHED_ALT - config FAIR_GROUP_SCHED - bool "Group scheduling for SCHED_OTHER" - depends on CGROUP_SCHED -@@ -1200,6 +1231,7 @@ config CHECKPOINT_RESTORE - - config SCHED_AUTOGROUP - bool "Automatic process group scheduling" -+ depends on !SCHED_ALT - select CGROUPS - select CGROUP_SCHED - select FAIR_GROUP_SCHED -diff --git a/init/init_task.c b/init/init_task.c -index f6889fce64af..5a23122f3d2c 100644 ---- a/init/init_task.c -+++ b/init/init_task.c -@@ -75,9 +75,15 @@ struct task_struct init_task - .stack = init_stack, - .usage = REFCOUNT_INIT(2), - .flags = PF_KTHREAD, -+#ifdef CONFIG_SCHED_ALT -+ .prio = DEFAULT_PRIO + MAX_PRIORITY_ADJ, -+ .static_prio = DEFAULT_PRIO, -+ .normal_prio = DEFAULT_PRIO + MAX_PRIORITY_ADJ, -+#else - .prio = MAX_PRIO - 20, - .static_prio = MAX_PRIO - 20, - .normal_prio = MAX_PRIO - 20, -+#endif - .policy = SCHED_NORMAL, - .cpus_ptr = &init_task.cpus_mask, - .cpus_mask = CPU_MASK_ALL, -@@ -87,6 +93,19 @@ struct task_struct init_task - .restart_block = { - .fn = do_no_restart_syscall, - }, -+#ifdef CONFIG_SCHED_ALT -+#ifdef CONFIG_SCHED_BMQ -+ .boost_prio = 0, -+ .bmq_idx = 15, -+ .bmq_node = LIST_HEAD_INIT(init_task.bmq_node), -+#endif -+#ifdef CONFIG_SCHED_PDS -+ .deadline = 0, -+ .sl_level = 0, -+ .sl_node = SKIPLIST_NODE_INIT(init_task.sl_node), -+#endif -+ .time_slice = HZ, -+#else - .se = { - .group_node = LIST_HEAD_INIT(init_task.se.group_node), - }, -@@ -94,6 +113,7 @@ struct task_struct init_task - .run_list = LIST_HEAD_INIT(init_task.rt.run_list), - .time_slice = RR_TIMESLICE, - }, -+#endif - .tasks = LIST_HEAD_INIT(init_task.tasks), - #ifdef CONFIG_SMP - .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), -diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c -index 
642415b8c3c9..7e0e1fe18035 100644 ---- a/kernel/cgroup/cpuset.c -+++ b/kernel/cgroup/cpuset.c -@@ -636,7 +636,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) - return ret; - } - --#ifdef CONFIG_SMP -+#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_ALT) - /* - * Helper routine for generate_sched_domains(). - * Do cpusets a, b have overlapping effective cpus_allowed masks? -@@ -1009,7 +1009,7 @@ static void rebuild_sched_domains_locked(void) - /* Have scheduler rebuild the domains */ - partition_and_rebuild_sched_domains(ndoms, doms, attr); - } --#else /* !CONFIG_SMP */ -+#else /* !CONFIG_SMP || CONFIG_SCHED_ALT */ - static void rebuild_sched_domains_locked(void) - { - } -diff --git a/kernel/delayacct.c b/kernel/delayacct.c -index 27725754ac99..769d773c7182 100644 ---- a/kernel/delayacct.c -+++ b/kernel/delayacct.c -@@ -106,7 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) - */ - t1 = tsk->sched_info.pcount; - t2 = tsk->sched_info.run_delay; -- t3 = tsk->se.sum_exec_runtime; -+ t3 = tsk_seruntime(tsk); - - d->cpu_count += t1; - -diff --git a/kernel/exit.c b/kernel/exit.c -index 733e80f334e7..3f3506c851fd 100644 ---- a/kernel/exit.c -+++ b/kernel/exit.c -@@ -121,7 +121,7 @@ static void __exit_signal(struct task_struct *tsk) - sig->curr_target = next_thread(tsk); - } - -- add_device_randomness((const void*) &tsk->se.sum_exec_runtime, -+ add_device_randomness((const void*) &tsk_seruntime(tsk), - sizeof(unsigned long long)); - - /* -@@ -142,7 +142,7 @@ static void __exit_signal(struct task_struct *tsk) - sig->inblock += task_io_get_inblock(tsk); - sig->oublock += task_io_get_oublock(tsk); - task_io_accounting_add(&sig->ioac, &tsk->ioac); -- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; -+ sig->sum_sched_runtime += tsk_seruntime(tsk); - sig->nr_threads--; - __unhash_process(tsk, group_dead); - write_sequnlock(&sig->stats_lock); -diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c 
-index f6310f848f34..4176ad070bc9 100644 ---- a/kernel/livepatch/transition.c -+++ b/kernel/livepatch/transition.c -@@ -306,7 +306,11 @@ static bool klp_try_switch_task(struct task_struct *task) - */ - rq = task_rq_lock(task, &flags); - -+#ifdef CONFIG_SCHED_ALT -+ if (task_running(task) && task != current) { -+#else - if (task_running(rq, task) && task != current) { -+#endif - snprintf(err_buf, STACK_ERR_BUF_SIZE, - "%s: %s:%d is running\n", __func__, task->comm, - task->pid); -diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c -index cfdd5b93264d..84c284eb544a 100644 ---- a/kernel/locking/rtmutex.c -+++ b/kernel/locking/rtmutex.c -@@ -227,15 +227,19 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, - * Only use with rt_mutex_waiter_{less,equal}() - */ - #define task_to_waiter(p) \ -- &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline } -+ &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = __tsk_deadline(p) } - - static inline int - rt_mutex_waiter_less(struct rt_mutex_waiter *left, - struct rt_mutex_waiter *right) - { -+#ifdef CONFIG_SCHED_PDS -+ return (left->deadline < right->deadline); -+#else - if (left->prio < right->prio) - return 1; - -+#ifndef CONFIG_SCHED_BMQ - /* - * If both waiters have dl_prio(), we check the deadlines of the - * associated tasks. -@@ -244,17 +248,23 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left, - */ - if (dl_prio(left->prio)) - return dl_time_before(left->deadline, right->deadline); -+#endif - - return 0; -+#endif - } - - static inline int - rt_mutex_waiter_equal(struct rt_mutex_waiter *left, - struct rt_mutex_waiter *right) - { -+#ifdef CONFIG_SCHED_PDS -+ return (left->deadline == right->deadline); -+#else - if (left->prio != right->prio) - return 0; - -+#ifndef CONFIG_SCHED_BMQ - /* - * If both waiters have dl_prio(), we check the deadlines of the - * associated tasks. 
-@@ -263,8 +273,10 @@ rt_mutex_waiter_equal(struct rt_mutex_waiter *left, - */ - if (dl_prio(left->prio)) - return left->deadline == right->deadline; -+#endif - - return 1; -+#endif - } - - static void -@@ -678,7 +690,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, - * the values of the node being removed. - */ - waiter->prio = task->prio; -- waiter->deadline = task->dl.deadline; -+ waiter->deadline = __tsk_deadline(task); - - rt_mutex_enqueue(lock, waiter); - -@@ -951,7 +963,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, - waiter->task = task; - waiter->lock = lock; - waiter->prio = task->prio; -- waiter->deadline = task->dl.deadline; -+ waiter->deadline = __tsk_deadline(task); - - /* Get the top priority waiter on the lock */ - if (rt_mutex_has_waiters(lock)) -diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile -index 5fc9c9b70862..eb6d7d87779f 100644 ---- a/kernel/sched/Makefile -+++ b/kernel/sched/Makefile -@@ -22,14 +22,20 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) - CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer - endif - --obj-y += core.o loadavg.o clock.o cputime.o --obj-y += idle.o fair.o rt.o deadline.o --obj-y += wait.o wait_bit.o swait.o completion.o -- --obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o -+ifdef CONFIG_SCHED_ALT -+obj-y += alt_core.o alt_debug.o -+else -+obj-y += core.o -+obj-y += fair.o rt.o deadline.o -+obj-$(CONFIG_SMP) += cpudeadline.o stop_task.o - obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o --obj-$(CONFIG_SCHEDSTATS) += stats.o - obj-$(CONFIG_SCHED_DEBUG) += debug.o -+endif -+obj-y += loadavg.o clock.o cputime.o -+obj-y += idle.o -+obj-y += wait.o wait_bit.o swait.o completion.o -+obj-$(CONFIG_SMP) += cpupri.o pelt.o topology.o -+obj-$(CONFIG_SCHEDSTATS) += stats.o - obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o - obj-$(CONFIG_CPU_FREQ) += cpufreq.o - obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o -diff --git a/kernel/sched/alt_core.c 
b/kernel/sched/alt_core.c -new file mode 100644 -index 000000000000..f36264fea75c ---- /dev/null -+++ b/kernel/sched/alt_core.c -@@ -0,0 +1,6360 @@ -+/* -+ * kernel/sched/alt_core.c -+ * -+ * Core alternative kernel scheduler code and related syscalls -+ * -+ * Copyright (C) 1991-2002 Linus Torvalds -+ * -+ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes -+ * a whole lot of those previous things. -+ * 2017-09-06 Priority and Deadline based Skip list multiple queue kernel -+ * scheduler by Alfred Chen. -+ * 2019-02-20 BMQ(BitMap Queue) kernel scheduler by Alfred Chen. -+ */ -+#include "sched.h" -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+ -+#include -+ -+#include "../workqueue_internal.h" -+#include "../../fs/io-wq.h" -+#include "../smpboot.h" -+ -+#include "pelt.h" -+#include "smp.h" -+ -+#define CREATE_TRACE_POINTS -+#include -+ -+#define ALT_SCHED_VERSION "v5.9-r0" -+ -+/* rt_prio(prio) defined in include/linux/sched/rt.h */ -+#define rt_task(p) rt_prio((p)->prio) -+#define rt_policy(policy) ((policy) == SCHED_FIFO || (policy) == SCHED_RR) -+#define task_has_rt_policy(p) (rt_policy((p)->policy)) -+ -+#define STOP_PRIO (MAX_RT_PRIO - 1) -+ -+/* Default time slice is 4 in ms, can be set via kernel parameter "sched_timeslice" */ -+u64 sched_timeslice_ns __read_mostly = (4 * 1000 * 1000); -+ -+static int __init sched_timeslice(char *str) -+{ -+ int timeslice_us; -+ -+ get_option(&str, &timeslice_us); -+ if (timeslice_us >= 1000) -+ sched_timeslice_ns = timeslice_us * 1000; -+ -+ return 0; -+} -+early_param("sched_timeslice", sched_timeslice); -+ -+/* Reschedule if less than this many μs left */ -+#define RESCHED_NS (100 * 1000) -+ -+/** -+ * sched_yield_type - Choose what sort of yield sched_yield will perform. -+ * 0: No yield. -+ * 1: Deboost and requeue task.
(default) -+ * 2: Set rq skip task. -+ */ -+int sched_yield_type __read_mostly = 1; -+ -+#ifdef CONFIG_SMP -+static cpumask_t sched_rq_pending_mask ____cacheline_aligned_in_smp; -+ -+DEFINE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_CHK_LEVEL], sched_cpu_affinity_masks); -+DEFINE_PER_CPU(cpumask_t *, sched_cpu_affinity_end_mask); -+DEFINE_PER_CPU(cpumask_t *, sched_cpu_llc_mask); -+ -+#ifdef CONFIG_SCHED_SMT -+DEFINE_STATIC_KEY_FALSE(sched_smt_present); -+EXPORT_SYMBOL_GPL(sched_smt_present); -+#endif -+ -+/* -+ * Keep a unique ID per domain (we use the first CPUs number in the cpumask of -+ * the domain), this allows us to quickly tell if two cpus are in the same cache -+ * domain, see cpus_share_cache(). -+ */ -+DEFINE_PER_CPU(int, sd_llc_id); -+#endif /* CONFIG_SMP */ -+ -+static DEFINE_MUTEX(sched_hotcpu_mutex); -+ -+DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -+ -+#ifndef prepare_arch_switch -+# define prepare_arch_switch(next) do { } while (0) -+#endif -+#ifndef finish_arch_post_lock_switch -+# define finish_arch_post_lock_switch() do { } while (0) -+#endif -+ -+#define IDLE_WM (IDLE_TASK_SCHED_PRIO) -+ -+#ifdef CONFIG_SCHED_SMT -+static cpumask_t sched_sg_idle_mask ____cacheline_aligned_in_smp; -+#endif -+static cpumask_t sched_rq_watermark[SCHED_BITS] ____cacheline_aligned_in_smp; -+ -+#ifdef CONFIG_SCHED_BMQ -+#include "bmq_imp.h" -+#endif -+#ifdef CONFIG_SCHED_PDS -+#include "pds_imp.h" -+#endif -+ -+static inline void update_sched_rq_watermark(struct rq *rq) -+{ -+ unsigned long watermark = sched_queue_watermark(rq); -+ unsigned long last_wm = rq->watermark; -+ unsigned long i; -+ int cpu; -+ -+ /*printk(KERN_INFO "sched: watermark(%d) %d, last %d\n", -+ cpu_of(rq), watermark, last_wm);*/ -+ if (watermark == last_wm) -+ return; -+ -+ rq->watermark = watermark; -+ cpu = cpu_of(rq); -+ if (watermark < last_wm) { -+ for (i = watermark + 1; i <= last_wm; i++) -+ cpumask_andnot(&sched_rq_watermark[i], -+ &sched_rq_watermark[i], cpumask_of(cpu)); -+#ifdef 
CONFIG_SCHED_SMT -+ if (!static_branch_likely(&sched_smt_present)) -+ return; -+ if (IDLE_WM == last_wm) -+ cpumask_andnot(&sched_sg_idle_mask, -+ &sched_sg_idle_mask, cpu_smt_mask(cpu)); -+#endif -+ return; -+ } -+ /* last_wm < watermark */ -+ for (i = last_wm + 1; i <= watermark; i++) -+ cpumask_set_cpu(cpu, &sched_rq_watermark[i]); -+#ifdef CONFIG_SCHED_SMT -+ if (!static_branch_likely(&sched_smt_present)) -+ return; -+ if (IDLE_WM == watermark) { -+ cpumask_t tmp; -+ cpumask_and(&tmp, cpu_smt_mask(cpu), &sched_rq_watermark[IDLE_WM]); -+ if (cpumask_equal(&tmp, cpu_smt_mask(cpu))) -+ cpumask_or(&sched_sg_idle_mask, cpu_smt_mask(cpu), -+ &sched_sg_idle_mask); -+ } -+#endif -+} -+ -+static inline struct task_struct *rq_runnable_task(struct rq *rq) -+{ -+ struct task_struct *next = sched_rq_first_task(rq); -+ -+ if (unlikely(next == rq->skip)) -+ next = sched_rq_next_task(next, rq); -+ -+ return next; -+} -+ -+/* -+ * Serialization rules: -+ * -+ * Lock order: -+ * -+ * p->pi_lock -+ * rq->lock -+ * hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls) -+ * -+ * rq1->lock -+ * rq2->lock where: rq1 < rq2 -+ * -+ * Regular state: -+ * -+ * Normal scheduling state is serialized by rq->lock. __schedule() takes the -+ * local CPU's rq->lock, it optionally removes the task from the runqueue and -+ * always looks at the local rq data structures to find the most elegible task -+ * to run next. -+ * -+ * Task enqueue is also under rq->lock, possibly taken from another CPU. -+ * Wakeups from another LLC domain might use an IPI to transfer the enqueue to -+ * the local CPU to avoid bouncing the runqueue state around [ see -+ * ttwu_queue_wakelist() ] -+ * -+ * Task wakeup, specifically wakeups that involve migration, are horribly -+ * complicated to avoid having to take two rq->locks. -+ * -+ * Special state: -+ * -+ * System-calls and anything external will use task_rq_lock() which acquires -+ * both p->pi_lock and rq->lock. 
As a consequence the state they change is -+ * stable while holding either lock: -+ * -+ * - sched_setaffinity()/ -+ * set_cpus_allowed_ptr(): p->cpus_ptr, p->nr_cpus_allowed -+ * - set_user_nice(): p->se.load, p->*prio -+ * - __sched_setscheduler(): p->sched_class, p->policy, p->*prio, -+ * p->se.load, p->rt_priority, -+ * p->dl.dl_{runtime, deadline, period, flags, bw, density} -+ * - sched_setnuma(): p->numa_preferred_nid -+ * - sched_move_task()/ -+ * cpu_cgroup_fork(): p->sched_task_group -+ * - uclamp_update_active() p->uclamp* -+ * -+ * p->state <- TASK_*: -+ * -+ * is changed locklessly using set_current_state(), __set_current_state() or -+ * set_special_state(), see their respective comments, or by -+ * try_to_wake_up(). This latter uses p->pi_lock to serialize against -+ * concurrent self. -+ * -+ * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }: -+ * -+ * is set by activate_task() and cleared by deactivate_task(), under -+ * rq->lock. Non-zero indicates the task is runnable, the special -+ * ON_RQ_MIGRATING state is used for migration without holding both -+ * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock(). -+ * -+ * p->on_cpu <- { 0, 1 }: -+ * -+ * is set by prepare_task() and cleared by finish_task() such that it will be -+ * set before p is scheduled-in and cleared after p is scheduled-out, both -+ * under rq->lock. Non-zero indicates the task is running on its CPU. -+ * -+ * [ The astute reader will observe that it is possible for two tasks on one -+ * CPU to have ->on_cpu = 1 at the same time. ] -+ * -+ * task_cpu(p): is changed by set_task_cpu(), the rules are: -+ * -+ * - Don't call set_task_cpu() on a blocked task: -+ * -+ * We don't care what CPU we're not running on, this simplifies hotplug, -+ * the CPU assignment of blocked tasks isn't required to be valid. -+ * -+ * - for try_to_wake_up(), called under p->pi_lock: -+ * -+ * This allows try_to_wake_up() to only take one rq->lock, see its comment. 
-+ * -+ * - for migration called under rq->lock: -+ * [ see task_on_rq_migrating() in task_rq_lock() ] -+ * -+ * o move_queued_task() -+ * o detach_task() -+ * -+ * - for migration called under double_rq_lock(): -+ * -+ * o __migrate_swap_task() -+ * o push_rt_task() / pull_rt_task() -+ * o push_dl_task() / pull_dl_task() -+ * o dl_task_offline_migration() -+ * -+ */ -+ -+/* -+ * Context: p->pi_lock -+ */ -+static inline struct rq -+*__task_access_lock(struct task_struct *p, raw_spinlock_t **plock) -+{ -+ struct rq *rq; -+ for (;;) { -+ rq = task_rq(p); -+ if (p->on_cpu || task_on_rq_queued(p)) { -+ raw_spin_lock(&rq->lock); -+ if (likely((p->on_cpu || task_on_rq_queued(p)) -+ && rq == task_rq(p))) { -+ *plock = &rq->lock; -+ return rq; -+ } -+ raw_spin_unlock(&rq->lock); -+ } else if (task_on_rq_migrating(p)) { -+ do { -+ cpu_relax(); -+ } while (unlikely(task_on_rq_migrating(p))); -+ } else { -+ *plock = NULL; -+ return rq; -+ } -+ } -+} -+ -+static inline void -+__task_access_unlock(struct task_struct *p, raw_spinlock_t *lock) -+{ -+ if (NULL != lock) -+ raw_spin_unlock(lock); -+} -+ -+static inline struct rq -+*task_access_lock_irqsave(struct task_struct *p, raw_spinlock_t **plock, -+ unsigned long *flags) -+{ -+ struct rq *rq; -+ for (;;) { -+ rq = task_rq(p); -+ if (p->on_cpu || task_on_rq_queued(p)) { -+ raw_spin_lock_irqsave(&rq->lock, *flags); -+ if (likely((p->on_cpu || task_on_rq_queued(p)) -+ && rq == task_rq(p))) { -+ *plock = &rq->lock; -+ return rq; -+ } -+ raw_spin_unlock_irqrestore(&rq->lock, *flags); -+ } else if (task_on_rq_migrating(p)) { -+ do { -+ cpu_relax(); -+ } while (unlikely(task_on_rq_migrating(p))); -+ } else { -+ raw_spin_lock_irqsave(&p->pi_lock, *flags); -+ if (likely(!p->on_cpu && !p->on_rq && -+ rq == task_rq(p))) { -+ *plock = &p->pi_lock; -+ return rq; -+ } -+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags); -+ } -+ } -+} -+ -+static inline void -+task_access_unlock_irqrestore(struct task_struct *p, raw_spinlock_t *lock, -+ 
unsigned long *flags) -+{ -+ raw_spin_unlock_irqrestore(lock, *flags); -+} -+ -+/* -+ * __task_rq_lock - lock the rq @p resides on. -+ */ -+struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ lockdep_assert_held(&p->pi_lock); -+ -+ for (;;) { -+ rq = task_rq(p); -+ raw_spin_lock(&rq->lock); -+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) -+ return rq; -+ raw_spin_unlock(&rq->lock); -+ -+ while (unlikely(task_on_rq_migrating(p))) -+ cpu_relax(); -+ } -+} -+ -+/* -+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. -+ */ -+struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(p->pi_lock) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ for (;;) { -+ raw_spin_lock_irqsave(&p->pi_lock, rf->flags); -+ rq = task_rq(p); -+ raw_spin_lock(&rq->lock); -+ /* -+ * move_queued_task() task_rq_lock() -+ * -+ * ACQUIRE (rq->lock) -+ * [S] ->on_rq = MIGRATING [L] rq = task_rq() -+ * WMB (__set_task_cpu()) ACQUIRE (rq->lock); -+ * [S] ->cpu = new_cpu [L] task_rq() -+ * [L] ->on_rq -+ * RELEASE (rq->lock) -+ * -+ * If we observe the old CPU in task_rq_lock(), the acquire of -+ * the old rq->lock will fully serialize against the stores. -+ * -+ * If we observe the new CPU in task_rq_lock(), the address -+ * dependency headed by '[L] rq = task_rq()' and the acquire -+ * will pair with the WMB to ensure we then also see migrating. 
-+ */ -+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { -+ return rq; -+ } -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); -+ -+ while (unlikely(task_on_rq_migrating(p))) -+ cpu_relax(); -+ } -+} -+ -+static inline void -+rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) -+ __acquires(rq->lock) -+{ -+ raw_spin_lock_irqsave(&rq->lock, rf->flags); -+} -+ -+static inline void -+rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock_irqrestore(&rq->lock, rf->flags); -+} -+ -+/* -+ * RQ-clock updating methods: -+ */ -+ -+static void update_rq_clock_task(struct rq *rq, s64 delta) -+{ -+/* -+ * In theory, the compile should just see 0 here, and optimize out the call -+ * to sched_rt_avg_update. But I don't trust it... -+ */ -+ s64 __maybe_unused steal = 0, irq_delta = 0; -+ -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; -+ -+ /* -+ * Since irq_time is only updated on {soft,}irq_exit, we might run into -+ * this case when a previous update_rq_clock() happened inside a -+ * {soft,}irq region. -+ * -+ * When this happens, we stop ->clock_task and only update the -+ * prev_irq_time stamp to account for the part that fit, so that a next -+ * update will consume the rest. This ensures ->clock_task is -+ * monotonic. -+ * -+ * It does however cause some slight miss-attribution of {soft,}irq -+ * time, a more accurate solution would be to update the irq_time using -+ * the current rq->clock timestamp, except that would require using -+ * atomic ops. 
-+ */ -+ if (irq_delta > delta) -+ irq_delta = delta; -+ -+ rq->prev_irq_time += irq_delta; -+ delta -= irq_delta; -+#endif -+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING -+ if (static_key_false((&paravirt_steal_rq_enabled))) { -+ steal = paravirt_steal_clock(cpu_of(rq)); -+ steal -= rq->prev_steal_time_rq; -+ -+ if (unlikely(steal > delta)) -+ steal = delta; -+ -+ rq->prev_steal_time_rq += steal; -+ delta -= steal; -+ } -+#endif -+ -+ rq->clock_task += delta; -+ -+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ -+ if ((irq_delta + steal)) -+ update_irq_load_avg(rq, irq_delta + steal); -+#endif -+} -+ -+static inline void update_rq_clock(struct rq *rq) -+{ -+ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; -+ -+ if (unlikely(delta <= 0)) -+ return; -+ rq->clock += delta; -+ update_rq_clock_task(rq, delta); -+} -+ -+#ifdef CONFIG_NO_HZ_FULL -+/* -+ * Tick may be needed by tasks in the runqueue depending on their policy and -+ * requirements. If tick is needed, lets send the target an IPI to kick it out -+ * of nohz mode if necessary.
-+ */ -+static inline void sched_update_tick_dependency(struct rq *rq) -+{ -+ int cpu = cpu_of(rq); -+ -+ if (!tick_nohz_full_cpu(cpu)) -+ return; -+ -+ if (rq->nr_running < 2) -+ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); -+ else -+ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); -+} -+#else /* !CONFIG_NO_HZ_FULL */ -+static inline void sched_update_tick_dependency(struct rq *rq) { } -+#endif -+ -+/* -+ * Add/Remove/Requeue task to/from the runqueue routines -+ * Context: rq->lock -+ */ -+static inline void dequeue_task(struct task_struct *p, struct rq *rq, int flags) -+{ -+ lockdep_assert_held(&rq->lock); -+ -+ /*printk(KERN_INFO "sched: dequeue(%d) %px %016llx\n", cpu_of(rq), p, p->priodl);*/ -+ WARN_ONCE(task_rq(p) != rq, "sched: dequeue task reside on cpu%d from cpu%d\n", -+ task_cpu(p), cpu_of(rq)); -+ -+ __SCHED_DEQUEUE_TASK(p, rq, flags, update_sched_rq_watermark(rq)); -+ --rq->nr_running; -+#ifdef CONFIG_SMP -+ if (1 == rq->nr_running) -+ cpumask_clear_cpu(cpu_of(rq), &sched_rq_pending_mask); -+#endif -+ -+ sched_update_tick_dependency(rq); -+} -+ -+static inline void enqueue_task(struct task_struct *p, struct rq *rq, int flags) -+{ -+ lockdep_assert_held(&rq->lock); -+ -+ /*printk(KERN_INFO "sched: enqueue(%d) %px %016llx\n", cpu_of(rq), p, p->priodl);*/ -+ WARN_ONCE(task_rq(p) != rq, "sched: enqueue task reside on cpu%d to cpu%d\n", -+ task_cpu(p), cpu_of(rq)); -+ -+ __SCHED_ENQUEUE_TASK(p, rq, flags); -+ update_sched_rq_watermark(rq); -+ ++rq->nr_running; -+#ifdef CONFIG_SMP -+ if (2 == rq->nr_running) -+ cpumask_set_cpu(cpu_of(rq), &sched_rq_pending_mask); -+#endif -+ -+ sched_update_tick_dependency(rq); -+ -+ /* -+ * If in_iowait is set, the code below may not trigger any cpufreq -+ * utilization updates, so do it here explicitly with the IOWAIT flag -+ * passed. 
-+ */ -+ if (p->in_iowait) -+ cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); -+} -+ -+static inline void requeue_task(struct task_struct *p, struct rq *rq) -+{ -+ lockdep_assert_held(&rq->lock); -+ /*printk(KERN_INFO "sched: requeue(%d) %px %016llx\n", cpu_of(rq), p, p->priodl);*/ -+ WARN_ONCE(task_rq(p) != rq, "sched: cpu[%d] requeue task reside on cpu%d\n", -+ cpu_of(rq), task_cpu(p)); -+ -+ __SCHED_REQUEUE_TASK(p, rq, update_sched_rq_watermark(rq)); -+} -+ -+/* -+ * cmpxchg based fetch_or, macro so it works for different integer types -+ */ -+#define fetch_or(ptr, mask) \ -+ ({ \ -+ typeof(ptr) _ptr = (ptr); \ -+ typeof(mask) _mask = (mask); \ -+ typeof(*_ptr) _old, _val = *_ptr; \ -+ \ -+ for (;;) { \ -+ _old = cmpxchg(_ptr, _val, _val | _mask); \ -+ if (_old == _val) \ -+ break; \ -+ _val = _old; \ -+ } \ -+ _old; \ -+}) -+ -+#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) -+/* -+ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, -+ * this avoids any races wrt polling state changes and thereby avoids -+ * spurious IPIs. -+ */ -+static bool set_nr_and_not_polling(struct task_struct *p) -+{ -+ struct thread_info *ti = task_thread_info(p); -+ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); -+} -+ -+/* -+ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. -+ * -+ * If this returns true, then the idle task promises to call -+ * sched_ttwu_pending() and reschedule soon. 
-+ */ -+static bool set_nr_if_polling(struct task_struct *p) -+{ -+ struct thread_info *ti = task_thread_info(p); -+ typeof(ti->flags) old, val = READ_ONCE(ti->flags); -+ -+ for (;;) { -+ if (!(val & _TIF_POLLING_NRFLAG)) -+ return false; -+ if (val & _TIF_NEED_RESCHED) -+ return true; -+ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); -+ if (old == val) -+ break; -+ val = old; -+ } -+ return true; -+} -+ -+#else -+static bool set_nr_and_not_polling(struct task_struct *p) -+{ -+ set_tsk_need_resched(p); -+ return true; -+} -+ -+#ifdef CONFIG_SMP -+static bool set_nr_if_polling(struct task_struct *p) -+{ -+ return false; -+} -+#endif -+#endif -+ -+static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) -+{ -+ struct wake_q_node *node = &task->wake_q; -+ -+ /* -+ * Atomically grab the task, if ->wake_q is !nil already it means -+ * its already queued (either by us or someone else) and will get the -+ * wakeup due to that. -+ * -+ * In order to ensure that a pending wakeup will observe our pending -+ * state, even in the failed case, an explicit smp_mb() must be used. -+ */ -+ smp_mb__before_atomic(); -+ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) -+ return false; -+ -+ /* -+ * The head is context local, there can be no concurrency. -+ */ -+ *head->lastp = node; -+ head->lastp = &node->next; -+ return true; -+} -+ -+/** -+ * wake_q_add() - queue a wakeup for 'later' waking. -+ * @head: the wake_q_head to add @task to -+ * @task: the task to queue for 'later' wakeup -+ * -+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the -+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come -+ * instantly. -+ * -+ * This function must be used as-if it were wake_up_process(); IOW the task -+ * must be ready to be woken at this location. 
-+ */ -+void wake_q_add(struct wake_q_head *head, struct task_struct *task) -+{ -+ if (__wake_q_add(head, task)) -+ get_task_struct(task); -+} -+ -+/** -+ * wake_q_add_safe() - safely queue a wakeup for 'later' waking. -+ * @head: the wake_q_head to add @task to -+ * @task: the task to queue for 'later' wakeup -+ * -+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the -+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come -+ * instantly. -+ * -+ * This function must be used as-if it were wake_up_process(); IOW the task -+ * must be ready to be woken at this location. -+ * -+ * This function is essentially a task-safe equivalent to wake_q_add(). Callers -+ * that already hold reference to @task can call the 'safe' version and trust -+ * wake_q to do the right thing depending whether or not the @task is already -+ * queued for wakeup. -+ */ -+void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) -+{ -+ if (!__wake_q_add(head, task)) -+ put_task_struct(task); -+} -+ -+void wake_up_q(struct wake_q_head *head) -+{ -+ struct wake_q_node *node = head->first; -+ -+ while (node != WAKE_Q_TAIL) { -+ struct task_struct *task; -+ -+ task = container_of(node, struct task_struct, wake_q); -+ BUG_ON(!task); -+ /* task can safely be re-inserted now: */ -+ node = node->next; -+ task->wake_q.next = NULL; -+ -+ /* -+ * wake_up_process() executes a full barrier, which pairs with -+ * the queueing in wake_q_add() so as not to miss wakeups. -+ */ -+ wake_up_process(task); -+ put_task_struct(task); -+ } -+} -+ -+/* -+ * resched_curr - mark rq's current task 'to be rescheduled now'. -+ * -+ * On UP this means the setting of the need_resched flag, on SMP it -+ * might also involve a cross-CPU call to trigger the scheduler on -+ * the target CPU. 
-+ */ -+void resched_curr(struct rq *rq) -+{ -+ struct task_struct *curr = rq->curr; -+ int cpu; -+ -+ lockdep_assert_held(&rq->lock); -+ -+ if (test_tsk_need_resched(curr)) -+ return; -+ -+ cpu = cpu_of(rq); -+ if (cpu == smp_processor_id()) { -+ set_tsk_need_resched(curr); -+ set_preempt_need_resched(); -+ return; -+ } -+ -+ if (set_nr_and_not_polling(curr)) -+ smp_send_reschedule(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); -+} -+ -+void resched_cpu(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ if (cpu_online(cpu) || cpu == smp_processor_id()) -+ resched_curr(cpu_rq(cpu)); -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+} -+ -+#ifdef CONFIG_SMP -+#ifdef CONFIG_NO_HZ_COMMON -+void nohz_balance_enter_idle(int cpu) {} -+ -+void select_nohz_load_balancer(int stop_tick) {} -+ -+void set_cpu_sd_state_idle(void) {} -+ -+/* -+ * In the semi idle case, use the nearest busy CPU for migrating timers -+ * from an idle CPU. This is good for power-savings. -+ * -+ * We don't do similar optimization for completely idle system, as -+ * selecting an idle CPU will add more delays to the timers than intended -+ * (as that CPU's timer base may not be uptodate wrt jiffies etc). 
-+ */ -+int get_nohz_timer_target(void) -+{ -+ int i, cpu = smp_processor_id(), default_cpu = -1; -+ struct cpumask *mask; -+ -+ if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) { -+ if (!idle_cpu(cpu)) -+ return cpu; -+ default_cpu = cpu; -+ } -+ -+ for (mask = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); -+ mask < per_cpu(sched_cpu_affinity_end_mask, cpu); mask++) -+ for_each_cpu_and(i, mask, housekeeping_cpumask(HK_FLAG_TIMER)) -+ if (!idle_cpu(i)) -+ return i; -+ -+ if (default_cpu == -1) -+ default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER); -+ cpu = default_cpu; -+ -+ return cpu; -+} -+ -+/* -+ * When add_timer_on() enqueues a timer into the timer wheel of an -+ * idle CPU then this timer might expire before the next timer event -+ * which is scheduled to wake up that CPU. In case of a completely -+ * idle system the next event might even be infinite time into the -+ * future. wake_up_idle_cpu() ensures that the CPU is woken up and -+ * leaves the inner idle loop so the newly added timer is taken into -+ * account when the CPU goes back to idle and evaluates the timer -+ * wheel for the next timer event. -+ */ -+static inline void wake_up_idle_cpu(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ if (cpu == smp_processor_id()) -+ return; -+ -+ if (set_nr_and_not_polling(rq->idle)) -+ smp_send_reschedule(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); -+} -+ -+static inline bool wake_up_full_nohz_cpu(int cpu) -+{ -+ /* -+ * We just need the target to call irq_exit() and re-evaluate -+ * the next tick. The nohz full kick at least implies that. -+ * If needed we can still optimize that later with an -+ * empty IRQ. -+ */ -+ if (cpu_is_offline(cpu)) -+ return true; /* Don't try to wake offline CPUs. 
*/ -+ if (tick_nohz_full_cpu(cpu)) { -+ if (cpu != smp_processor_id() || -+ tick_nohz_tick_stopped()) -+ tick_nohz_full_kick_cpu(cpu); -+ return true; -+ } -+ -+ return false; -+} -+ -+void wake_up_nohz_cpu(int cpu) -+{ -+ if (!wake_up_full_nohz_cpu(cpu)) -+ wake_up_idle_cpu(cpu); -+} -+ -+static void nohz_csd_func(void *info) -+{ -+ struct rq *rq = info; -+ int cpu = cpu_of(rq); -+ unsigned int flags; -+ -+ /* -+ * Release the rq::nohz_csd. -+ */ -+ flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu)); -+ WARN_ON(!(flags & NOHZ_KICK_MASK)); -+ -+ rq->idle_balance = idle_cpu(cpu); -+ if (rq->idle_balance && !need_resched()) { -+ rq->nohz_idle_balance = flags; -+ raise_softirq_irqoff(SCHED_SOFTIRQ); -+ } -+} -+ -+#endif /* CONFIG_NO_HZ_COMMON */ -+#endif /* CONFIG_SMP */ -+ -+static inline void check_preempt_curr(struct rq *rq) -+{ -+ if (sched_rq_first_task(rq) != rq->curr) -+ resched_curr(rq); -+} -+ -+static inline void -+rq_csd_init(struct rq *rq, call_single_data_t *csd, smp_call_func_t func) -+{ -+ csd->flags = 0; -+ csd->func = func; -+ csd->info = rq; -+} -+ -+#ifdef CONFIG_SCHED_HRTICK -+/* -+ * Use HR-timers to deliver accurate preemption points. -+ */ -+ -+static void hrtick_clear(struct rq *rq) -+{ -+ if (hrtimer_active(&rq->hrtick_timer)) -+ hrtimer_cancel(&rq->hrtick_timer); -+} -+ -+/* -+ * High-resolution timer tick. -+ * Runs from hardirq context with interrupts disabled. 
-+ */ -+static enum hrtimer_restart hrtick(struct hrtimer *timer) -+{ -+ struct rq *rq = container_of(timer, struct rq, hrtick_timer); -+ struct task_struct *p; -+ -+ WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); -+ -+ raw_spin_lock(&rq->lock); -+ p = rq->curr; -+ p->time_slice = 0; -+ resched_curr(rq); -+ raw_spin_unlock(&rq->lock); -+ -+ return HRTIMER_NORESTART; -+} -+ -+/* -+ * Use hrtick when: -+ * - enabled by features -+ * - hrtimer is actually high res -+ */ -+static inline int hrtick_enabled(struct rq *rq) -+{ -+ /** -+ * Alt schedule FW doesn't support sched_feat yet -+ if (!sched_feat(HRTICK)) -+ return 0; -+ */ -+ if (!cpu_active(cpu_of(rq))) -+ return 0; -+ return hrtimer_is_hres_active(&rq->hrtick_timer); -+} -+ -+#ifdef CONFIG_SMP -+ -+static void __hrtick_restart(struct rq *rq) -+{ -+ struct hrtimer *timer = &rq->hrtick_timer; -+ -+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); -+} -+ -+/* -+ * called from hardirq (IPI) context -+ */ -+static void __hrtick_start(void *arg) -+{ -+ struct rq *rq = arg; -+ -+ raw_spin_lock(&rq->lock); -+ __hrtick_restart(rq); -+ raw_spin_unlock(&rq->lock); -+} -+ -+/* -+ * Called to set the hrtick timer state. -+ * -+ * called with rq->lock held and irqs disabled -+ */ -+void hrtick_start(struct rq *rq, u64 delay) -+{ -+ struct hrtimer *timer = &rq->hrtick_timer; -+ ktime_t time; -+ s64 delta; -+ -+ /* -+ * Don't schedule slices shorter than 10000ns, that just -+ * doesn't make sense and can cause timer DoS. -+ */ -+ delta = max_t(s64, delay, 10000LL); -+ time = ktime_add_ns(timer->base->get_time(), delta); -+ -+ hrtimer_set_expires(timer, time); -+ -+ if (rq == this_rq()) -+ __hrtick_restart(rq); -+ else -+ smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); -+} -+ -+#else -+/* -+ * Called to set the hrtick timer state. 
-+ * -+ * called with rq->lock held and irqs disabled -+ */ -+void hrtick_start(struct rq *rq, u64 delay) -+{ -+ /* -+ * Don't schedule slices shorter than 10000ns, that just -+ * doesn't make sense. Rely on vruntime for fairness. -+ */ -+ delay = max_t(u64, delay, 10000LL); -+ hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), -+ HRTIMER_MODE_REL_PINNED_HARD); -+} -+#endif /* CONFIG_SMP */ -+ -+static void hrtick_rq_init(struct rq *rq) -+{ -+#ifdef CONFIG_SMP -+ rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start); -+#endif -+ -+ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); -+ rq->hrtick_timer.function = hrtick; -+} -+#else /* CONFIG_SCHED_HRTICK */ -+static inline int hrtick_enabled(struct rq *rq) -+{ -+ return 0; -+} -+ -+static inline void hrtick_clear(struct rq *rq) -+{ -+} -+ -+static inline void hrtick_rq_init(struct rq *rq) -+{ -+} -+#endif /* CONFIG_SCHED_HRTICK */ -+ -+static inline int normal_prio(struct task_struct *p) -+{ -+ if (task_has_rt_policy(p)) -+ return MAX_RT_PRIO - 1 - p->rt_priority; -+ -+ return p->static_prio + MAX_PRIORITY_ADJ; -+} -+ -+/* -+ * Calculate the current priority, i.e. the priority -+ * taken into account by the scheduler. This value might -+ * be boosted by RT tasks as it will be RT if the task got -+ * RT-boosted. If not then it returns p->normal_prio. -+ */ -+static int effective_prio(struct task_struct *p) -+{ -+ p->normal_prio = normal_prio(p); -+ /* -+ * If we are RT tasks or we were boosted to RT priority, -+ * keep the priority unchanged. Otherwise, update priority -+ * to the normal priority: -+ */ -+ if (!rt_prio(p->prio)) -+ return p->normal_prio; -+ return p->prio; -+} -+ -+/* -+ * activate_task - move a task to the runqueue. 
-+ * -+ * Context: rq->lock -+ */ -+static void activate_task(struct task_struct *p, struct rq *rq) -+{ -+ enqueue_task(p, rq, ENQUEUE_WAKEUP); -+ p->on_rq = TASK_ON_RQ_QUEUED; -+ cpufreq_update_util(rq, 0); -+} -+ -+/* -+ * deactivate_task - remove a task from the runqueue. -+ * -+ * Context: rq->lock -+ */ -+static inline void deactivate_task(struct task_struct *p, struct rq *rq) -+{ -+ dequeue_task(p, rq, DEQUEUE_SLEEP); -+ p->on_rq = 0; -+ cpufreq_update_util(rq, 0); -+} -+ -+static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) -+{ -+#ifdef CONFIG_SMP -+ /* -+ * After ->cpu is set up to a new value, task_access_lock(p, ...) can be -+ * successfully executed on another CPU. We must ensure that updates of -+ * per-task data have been completed by this moment. -+ */ -+ smp_wmb(); -+ -+#ifdef CONFIG_THREAD_INFO_IN_TASK -+ WRITE_ONCE(p->cpu, cpu); -+#else -+ WRITE_ONCE(task_thread_info(p)->cpu, cpu); -+#endif -+#endif -+} -+ -+#ifdef CONFIG_SMP -+void set_task_cpu(struct task_struct *p, unsigned int new_cpu) -+{ -+#ifdef CONFIG_SCHED_DEBUG -+ /* -+ * We should never call set_task_cpu() on a blocked task, -+ * ttwu() will sort out the placement. -+ */ -+ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && -+ !p->on_rq); -+#ifdef CONFIG_LOCKDEP -+ /* -+ * The caller should hold either p->pi_lock or rq->lock, when changing -+ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. -+ * -+ * sched_move_task() holds both and thus holding either pins the cgroup, -+ * see task_group(). -+ */ -+ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || -+ lockdep_is_held(&task_rq(p)->lock))); -+#endif -+ /* -+ * Clearly, migrating tasks to offline CPUs is a fairly daft thing. 
-+ */ -+ WARN_ON_ONCE(!cpu_online(new_cpu)); -+#endif -+ if (task_cpu(p) == new_cpu) -+ return; -+ trace_sched_migrate_task(p, new_cpu); -+ rseq_migrate(p); -+ perf_event_task_migrate(p); -+ -+ __set_task_cpu(p, new_cpu); -+} -+ -+static inline bool is_per_cpu_kthread(struct task_struct *p) -+{ -+ return ((p->flags & PF_KTHREAD) && (1 == p->nr_cpus_allowed)); -+} -+ -+/* -+ * Per-CPU kthreads are allowed to run on !active && online CPUs, see -+ * __set_cpus_allowed_ptr() and select_fallback_rq(). -+ */ -+static inline bool is_cpu_allowed(struct task_struct *p, int cpu) -+{ -+ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) -+ return false; -+ -+ if (is_per_cpu_kthread(p)) -+ return cpu_online(cpu); -+ -+ return cpu_active(cpu); -+} -+ -+/* -+ * This is how migration works: -+ * -+ * 1) we invoke migration_cpu_stop() on the target CPU using -+ * stop_one_cpu(). -+ * 2) stopper starts to run (implicitly forcing the migrated thread -+ * off the CPU) -+ * 3) it checks whether the migrated task is still in the wrong runqueue. -+ * 4) if it's in the wrong runqueue then the migration thread removes -+ * it and puts it into the right queue. -+ * 5) stopper completes and stop_one_cpu() returns and the migration -+ * is done. -+ */ -+ -+/* -+ * move_queued_task - move a queued task to new rq. -+ * -+ * Returns (locked) new rq. Old rq's lock is released. -+ */ -+static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int -+ new_cpu) -+{ -+ lockdep_assert_held(&rq->lock); -+ -+ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); -+ dequeue_task(p, rq, 0); -+ set_task_cpu(p, new_cpu); -+ raw_spin_unlock(&rq->lock); -+ -+ rq = cpu_rq(new_cpu); -+ -+ raw_spin_lock(&rq->lock); -+ BUG_ON(task_cpu(p) != new_cpu); -+ enqueue_task(p, rq, 0); -+ p->on_rq = TASK_ON_RQ_QUEUED; -+ check_preempt_curr(rq); -+ -+ return rq; -+} -+ -+struct migration_arg { -+ struct task_struct *task; -+ int dest_cpu; -+}; -+ -+/* -+ * Move (not current) task off this CPU, onto the destination CPU. 
We're doing -+ * this because either it can't run here any more (set_cpus_allowed() -+ * away from this CPU, or CPU going down), or because we're -+ * attempting to rebalance this task on exec (sched_exec). -+ * -+ * So we race with normal scheduler movements, but that's OK, as long -+ * as the task is no longer on this CPU. -+ */ -+static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int -+ dest_cpu) -+{ -+ /* Affinity changed (again). */ -+ if (!is_cpu_allowed(p, dest_cpu)) -+ return rq; -+ -+ update_rq_clock(rq); -+ return move_queued_task(rq, p, dest_cpu); -+} -+ -+/* -+ * migration_cpu_stop - this will be executed by a highprio stopper thread -+ * and performs thread migration by bumping thread off CPU then -+ * 'pushing' onto another runqueue. -+ */ -+static int migration_cpu_stop(void *data) -+{ -+ struct migration_arg *arg = data; -+ struct task_struct *p = arg->task; -+ struct rq *rq = this_rq(); -+ -+ /* -+ * The original target CPU might have gone down and we might -+ * be on another CPU but it doesn't matter. -+ */ -+ local_irq_disable(); -+ /* -+ * We need to explicitly wake pending tasks before running -+ * __migrate_task() such that we will not miss enforcing cpus_ptr -+ * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. -+ */ -+ flush_smp_call_function_from_idle(); -+ -+ raw_spin_lock(&p->pi_lock); -+ raw_spin_lock(&rq->lock); -+ /* -+ * If task_rq(p) != rq, it cannot be migrated here, because we're -+ * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because -+ * we're holding p->pi_lock. 
-+ */ -+ if (task_rq(p) == rq && task_on_rq_queued(p)) -+ rq = __migrate_task(rq, p, arg->dest_cpu); -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock(&p->pi_lock); -+ -+ local_irq_enable(); -+ return 0; -+} -+ -+static inline void -+set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ cpumask_copy(&p->cpus_mask, new_mask); -+ p->nr_cpus_allowed = cpumask_weight(new_mask); -+} -+ -+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ set_cpus_allowed_common(p, new_mask); -+} -+#endif -+ -+/** -+ * task_curr - is this task currently executing on a CPU? -+ * @p: the task in question. -+ * -+ * Return: 1 if the task is currently executing. 0 otherwise. -+ */ -+inline int task_curr(const struct task_struct *p) -+{ -+ return cpu_curr(task_cpu(p)) == p; -+} -+ -+#ifdef CONFIG_SMP -+/* -+ * wait_task_inactive - wait for a thread to unschedule. -+ * -+ * If @match_state is nonzero, it's the @p->state value just checked and -+ * not expected to change. If it changes, i.e. @p might have woken up, -+ * then return zero. When we succeed in waiting for @p to be off its CPU, -+ * we return a positive number (its total switch count). If a second call -+ * a short while later returns the same number, the caller can be sure that -+ * @p has remained unscheduled the whole time. -+ * -+ * The caller must ensure that the task *will* unschedule sometime soon, -+ * else this function might spin for a *long* time. This function can't -+ * be called with interrupts off, or it may introduce deadlock with -+ * smp_call_function() if an IPI is sent by the same process we are -+ * waiting to become inactive. 
-+ */ -+unsigned long wait_task_inactive(struct task_struct *p, long match_state) -+{ -+ unsigned long flags; -+ bool running, on_rq; -+ unsigned long ncsw; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ -+ for (;;) { -+ rq = task_rq(p); -+ -+ /* -+ * If the task is actively running on another CPU -+ * still, just relax and busy-wait without holding -+ * any locks. -+ * -+ * NOTE! Since we don't hold any locks, it's not -+ * even sure that "rq" stays as the right runqueue! -+ * But we don't care, since this will return false -+ * if the runqueue has changed and p is actually now -+ * running somewhere else! -+ */ -+ while (task_running(p) && p == rq->curr) { -+ if (match_state && unlikely(p->state != match_state)) -+ return 0; -+ cpu_relax(); -+ } -+ -+ /* -+ * Ok, time to look more closely! We need the rq -+ * lock now, to be *sure*. If we're wrong, we'll -+ * just go back and repeat. -+ */ -+ task_access_lock_irqsave(p, &lock, &flags); -+ trace_sched_wait_task(p); -+ running = task_running(p); -+ on_rq = p->on_rq; -+ ncsw = 0; -+ if (!match_state || p->state == match_state) -+ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ -+ task_access_unlock_irqrestore(p, lock, &flags); -+ -+ /* -+ * If it changed from the expected state, bail out now. -+ */ -+ if (unlikely(!ncsw)) -+ break; -+ -+ /* -+ * Was it really running after all now that we -+ * checked with the proper locks actually held? -+ * -+ * Oops. Go back and try again.. -+ */ -+ if (unlikely(running)) { -+ cpu_relax(); -+ continue; -+ } -+ -+ /* -+ * It's not enough that it's not actively running, -+ * it must be off the runqueue _entirely_, and not -+ * preempted! -+ * -+ * So if it was still runnable (but just not actively -+ * running right now), it's preempted, and we should -+ * yield - it could be a while. -+ */ -+ if (unlikely(on_rq)) { -+ ktime_t to = NSEC_PER_SEC / HZ; -+ -+ set_current_state(TASK_UNINTERRUPTIBLE); -+ schedule_hrtimeout(&to, HRTIMER_MODE_REL); -+ continue; -+ } -+ -+ /* -+ * Ahh, all good. 
It wasn't running, and it wasn't -+ * runnable, which means that it will never become -+ * running in the future either. We're all done! -+ */ -+ break; -+ } -+ -+ return ncsw; -+} -+ -+/*** -+ * kick_process - kick a running thread to enter/exit the kernel -+ * @p: the to-be-kicked thread -+ * -+ * Cause a process which is running on another CPU to enter -+ * kernel-mode, without any delay. (to get signals handled.) -+ * -+ * NOTE: this function doesn't have to take the runqueue lock, -+ * because all it wants to ensure is that the remote task enters -+ * the kernel. If the IPI races and the task has been migrated -+ * to another CPU then no harm is done and the purpose has been -+ * achieved as well. -+ */ -+void kick_process(struct task_struct *p) -+{ -+ int cpu; -+ -+ preempt_disable(); -+ cpu = task_cpu(p); -+ if ((cpu != smp_processor_id()) && task_curr(p)) -+ smp_send_reschedule(cpu); -+ preempt_enable(); -+} -+EXPORT_SYMBOL_GPL(kick_process); -+ -+/* -+ * ->cpus_ptr is protected by both rq->lock and p->pi_lock -+ * -+ * A few notes on cpu_active vs cpu_online: -+ * -+ * - cpu_active must be a subset of cpu_online -+ * -+ * - on CPU-up we allow per-CPU kthreads on the online && !active CPU, -+ * see __set_cpus_allowed_ptr(). At this point the newly online -+ * CPU isn't yet part of the sched domains, and balancing will not -+ * see it. -+ * -+ * - on cpu-down we clear cpu_active() to mask the sched domains and -+ * avoid the load balancer to place new tasks on the to be removed -+ * CPU. Existing tasks will remain running there and will be taken -+ * off. -+ * -+ * This means that fallback selection must not select !active CPUs. -+ * And can assume that any active CPU must be online. Conversely -+ * select_task_rq() below may allow selection of !active CPUs in order -+ * to satisfy the above rules. 
-+ */ -+static int select_fallback_rq(int cpu, struct task_struct *p) -+{ -+ int nid = cpu_to_node(cpu); -+ const struct cpumask *nodemask = NULL; -+ enum { cpuset, possible, fail } state = cpuset; -+ int dest_cpu; -+ -+ /* -+ * If the node that the CPU is on has been offlined, cpu_to_node() -+ * will return -1. There is no CPU on the node, and we should -+ * select the CPU on the other node. -+ */ -+ if (nid != -1) { -+ nodemask = cpumask_of_node(nid); -+ -+ /* Look for allowed, online CPU in same node. */ -+ for_each_cpu(dest_cpu, nodemask) { -+ if (!cpu_active(dest_cpu)) -+ continue; -+ if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) -+ return dest_cpu; -+ } -+ } -+ -+ for (;;) { -+ /* Any allowed, online CPU? */ -+ for_each_cpu(dest_cpu, p->cpus_ptr) { -+ if (!is_cpu_allowed(p, dest_cpu)) -+ continue; -+ goto out; -+ } -+ -+ /* No more Mr. Nice Guy. */ -+ switch (state) { -+ case cpuset: -+ if (IS_ENABLED(CONFIG_CPUSETS)) { -+ cpuset_cpus_allowed_fallback(p); -+ state = possible; -+ break; -+ } -+ fallthrough; -+ case possible: -+ do_set_cpus_allowed(p, cpu_possible_mask); -+ state = fail; -+ break; -+ -+ case fail: -+ BUG(); -+ break; -+ } -+ } -+ -+out: -+ if (state != cpuset) { -+ /* -+ * Don't tell them about moving exiting tasks or -+ * kernel threads (both mm NULL), since they never -+ * leave kernel. 
-+ */ -+ if (p->mm && printk_ratelimit()) { -+ printk_deferred("process %d (%s) no longer affine to cpu%d\n", -+ task_pid_nr(p), p->comm, cpu); -+ } -+ } -+ -+ return dest_cpu; -+} -+ -+static inline int select_task_rq(struct task_struct *p, struct rq *rq) -+{ -+ cpumask_t chk_mask, tmp; -+ -+ if (unlikely(!cpumask_and(&chk_mask, p->cpus_ptr, cpu_online_mask))) -+ return select_fallback_rq(task_cpu(p), p); -+ -+ if ( -+#ifdef CONFIG_SCHED_SMT -+ cpumask_and(&tmp, &chk_mask, &sched_sg_idle_mask) || -+#endif -+ cpumask_and(&tmp, &chk_mask, &sched_rq_watermark[IDLE_WM]) || -+ cpumask_and(&tmp, &chk_mask, -+ &sched_rq_watermark[task_sched_prio(p, rq) + 1])) -+ return best_mask_cpu(task_cpu(p), &tmp); -+ -+ return best_mask_cpu(task_cpu(p), &chk_mask); -+} -+ -+void sched_set_stop_task(int cpu, struct task_struct *stop) -+{ -+ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; -+ struct sched_param start_param = { .sched_priority = 0 }; -+ struct task_struct *old_stop = cpu_rq(cpu)->stop; -+ -+ if (stop) { -+ /* -+ * Make it appear like a SCHED_FIFO task, its something -+ * userspace knows about and won't get confused about. -+ * -+ * Also, it will make PI more or less work without too -+ * much confusion -- but then, stop work should not -+ * rely on PI working anyway. -+ */ -+ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); -+ } -+ -+ cpu_rq(cpu)->stop = stop; -+ -+ if (old_stop) { -+ /* -+ * Reset it back to a normal scheduling policy so that -+ * it can die in pieces. -+ */ -+ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); -+ } -+} -+ -+/* -+ * Change a given task's CPU affinity. Migrate the thread to a -+ * proper CPU and schedule it away if the CPU it's executing on -+ * is removed from the allowed bitmask. -+ * -+ * NOTE: the caller must have a valid reference to the task, the -+ * task must not exit() & deallocate itself prematurely. The -+ * call is not atomic; no spinlocks may be held. 
-+ */ -+static int __set_cpus_allowed_ptr(struct task_struct *p, -+ const struct cpumask *new_mask, bool check) -+{ -+ const struct cpumask *cpu_valid_mask = cpu_active_mask; -+ int dest_cpu; -+ unsigned long flags; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ int ret = 0; -+ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ rq = __task_access_lock(p, &lock); -+ -+ if (p->flags & PF_KTHREAD) { -+ /* -+ * Kernel threads are allowed on online && !active CPUs -+ */ -+ cpu_valid_mask = cpu_online_mask; -+ } -+ -+ /* -+ * Must re-check here, to close a race against __kthread_bind(), -+ * sched_setaffinity() is not guaranteed to observe the flag. -+ */ -+ if (check && (p->flags & PF_NO_SETAFFINITY)) { -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ if (cpumask_equal(&p->cpus_mask, new_mask)) -+ goto out; -+ -+ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); -+ if (dest_cpu >= nr_cpu_ids) { -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ do_set_cpus_allowed(p, new_mask); -+ -+ if (p->flags & PF_KTHREAD) { -+ /* -+ * For kernel threads that do indeed end up on online && -+ * !active we want to ensure they are strict per-CPU threads. -+ */ -+ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && -+ !cpumask_intersects(new_mask, cpu_active_mask) && -+ p->nr_cpus_allowed != 1); -+ } -+ -+ /* Can the task run on the task's current CPU? If so, we're done */ -+ if (cpumask_test_cpu(task_cpu(p), new_mask)) -+ goto out; -+ -+ if (task_running(p) || p->state == TASK_WAKING) { -+ struct migration_arg arg = { p, dest_cpu }; -+ -+ /* Need help from migration thread: drop lock and wait. */ -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); -+ return 0; -+ } -+ if (task_on_rq_queued(p)) { -+ /* -+ * OK, since we're going to drop the lock immediately -+ * afterwards anyway. 
-+ */ -+ update_rq_clock(rq); -+ rq = move_queued_task(rq, p, dest_cpu); -+ lock = &rq->lock; -+ } -+ -+out: -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+ return ret; -+} -+ -+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ return __set_cpus_allowed_ptr(p, new_mask, false); -+} -+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); -+ -+#else /* CONFIG_SMP */ -+ -+static inline int select_task_rq(struct task_struct *p, struct rq *rq) -+{ -+ return 0; -+} -+ -+static inline int -+__set_cpus_allowed_ptr(struct task_struct *p, -+ const struct cpumask *new_mask, bool check) -+{ -+ return set_cpus_allowed_ptr(p, new_mask); -+} -+ -+#endif /* CONFIG_SMP */ -+ -+static void -+ttwu_stat(struct task_struct *p, int cpu, int wake_flags) -+{ -+ struct rq *rq; -+ -+ if (!schedstat_enabled()) -+ return; -+ -+ rq= this_rq(); -+ -+#ifdef CONFIG_SMP -+ if (cpu == rq->cpu) -+ __schedstat_inc(rq->ttwu_local); -+ else { -+ /** Alt schedule FW ToDo: -+ * How to do ttwu_wake_remote -+ */ -+ } -+#endif /* CONFIG_SMP */ -+ -+ __schedstat_inc(rq->ttwu_count); -+} -+ -+/* -+ * Mark the task runnable and perform wakeup-preemption. -+ */ -+static inline void -+ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) -+{ -+ check_preempt_curr(rq); -+ p->state = TASK_RUNNING; -+ trace_sched_wakeup(p); -+} -+ -+static inline void -+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) -+{ -+ if (p->sched_contributes_to_load) -+ rq->nr_uninterruptible--; -+ -+ activate_task(p, rq); -+ ttwu_do_wakeup(rq, p, 0); -+} -+ -+/* -+ * Consider @p being inside a wait loop: -+ * -+ * for (;;) { -+ * set_current_state(TASK_UNINTERRUPTIBLE); -+ * -+ * if (CONDITION) -+ * break; -+ * -+ * schedule(); -+ * } -+ * __set_current_state(TASK_RUNNING); -+ * -+ * between set_current_state() and schedule(). 
In this case @p is still -+ * runnable, so all that needs doing is change p->state back to TASK_RUNNING in -+ * an atomic manner. -+ * -+ * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq -+ * then schedule() must still happen and p->state can be changed to -+ * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we -+ * need to do a full wakeup with enqueue. -+ * -+ * Returns: %true when the wakeup is done, -+ * %false otherwise. -+ */ -+static int ttwu_runnable(struct task_struct *p, int wake_flags) -+{ -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ int ret = 0; -+ -+ rq = __task_access_lock(p, &lock); -+ if (task_on_rq_queued(p)) { -+ /* check_preempt_curr() may use rq clock */ -+ update_rq_clock(rq); -+ ttwu_do_wakeup(rq, p, wake_flags); -+ ret = 1; -+ } -+ __task_access_unlock(p, lock); -+ -+ return ret; -+} -+ -+#ifdef CONFIG_SMP -+void sched_ttwu_pending(void *arg) -+{ -+ struct llist_node *llist = arg; -+ struct rq *rq = this_rq(); -+ struct task_struct *p, *t; -+ struct rq_flags rf; -+ -+ if (!llist) -+ return; -+ -+ /* -+ * rq::ttwu_pending racy indication of out-standing wakeups. -+ * Races such that false-negatives are possible, since they -+ * are shorter lived that false-positives would be. -+ */ -+ WRITE_ONCE(rq->ttwu_pending, 0); -+ -+ rq_lock_irqsave(rq, &rf); -+ update_rq_clock(rq); -+ -+ llist_for_each_entry_safe(p, t, llist, wake_entry.llist) { -+ if (WARN_ON_ONCE(p->on_cpu)) -+ smp_cond_load_acquire(&p->on_cpu, !VAL); -+ -+ if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq))) -+ set_task_cpu(p, cpu_of(rq)); -+ -+ ttwu_do_activate(rq, p, p->sched_remote_wakeup ? 
WF_MIGRATED : 0); -+ } -+ -+ rq_unlock_irqrestore(rq, &rf); -+} -+ -+void send_call_function_single_ipi(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ if (!set_nr_if_polling(rq->idle)) -+ arch_send_call_function_single_ipi(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); -+} -+ -+/* -+ * Queue a task on the target CPUs wake_list and wake the CPU via IPI if -+ * necessary. The wakee CPU on receipt of the IPI will queue the task -+ * via sched_ttwu_wakeup() for activation so the wakee incurs the cost -+ * of the wakeup instead of the waker. -+ */ -+static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); -+ -+ WRITE_ONCE(rq->ttwu_pending, 1); -+ __smp_call_single_queue(cpu, &p->wake_entry.llist); -+} -+ -+static inline bool ttwu_queue_cond(int cpu, int wake_flags) -+{ -+ /* -+ * If the CPU does not share cache, then queue the task on the -+ * remote rqs wakelist to avoid accessing remote data. -+ */ -+ if (!cpus_share_cache(smp_processor_id(), cpu)) -+ return true; -+ -+ /* -+ * If the task is descheduling and the only running task on the -+ * CPU then use the wakelist to offload the task activation to -+ * the soon-to-be-idle CPU as the current CPU is likely busy. -+ * nr_running is checked to avoid unnecessary task stacking. 
-+ */ -+ if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1) -+ return true; -+ -+ return false; -+} -+ -+static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) -+{ -+ if (__is_defined(ALT_SCHED_TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) { -+ if (WARN_ON_ONCE(cpu == smp_processor_id())) -+ return false; -+ -+ sched_clock_cpu(cpu); /* Sync clocks across CPUs */ -+ __ttwu_queue_wakelist(p, cpu, wake_flags); -+ return true; -+ } -+ -+ return false; -+} -+ -+void wake_up_if_idle(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ rcu_read_lock(); -+ -+ if (!is_idle_task(rcu_dereference(rq->curr))) -+ goto out; -+ -+ if (set_nr_if_polling(rq->idle)) { -+ trace_sched_wake_idle_without_ipi(cpu); -+ } else { -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ if (is_idle_task(rq->curr)) -+ smp_send_reschedule(cpu); -+ /* Else CPU is not idle, do nothing here */ -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ } -+ -+out: -+ rcu_read_unlock(); -+} -+ -+bool cpus_share_cache(int this_cpu, int that_cpu) -+{ -+ return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); -+} -+#else /* !CONFIG_SMP */ -+ -+static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) -+{ -+ return false; -+} -+ -+#endif /* CONFIG_SMP */ -+ -+static inline void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ if (ttwu_queue_wakelist(p, cpu, wake_flags)) -+ return; -+ -+ raw_spin_lock(&rq->lock); -+ update_rq_clock(rq); -+ ttwu_do_activate(rq, p, wake_flags); -+ raw_spin_unlock(&rq->lock); -+} -+ -+/* -+ * Notes on Program-Order guarantees on SMP systems. -+ * -+ * MIGRATION -+ * -+ * The basic program-order guarantee on SMP systems is that when a task [t] -+ * migrates, all its activity on its old CPU [c0] happens-before any subsequent -+ * execution on its new CPU [c1]. 
-+ * -+ * For migration (of runnable tasks) this is provided by the following means: -+ * -+ * A) UNLOCK of the rq(c0)->lock scheduling out task t -+ * B) migration for t is required to synchronize *both* rq(c0)->lock and -+ * rq(c1)->lock (if not at the same time, then in that order). -+ * C) LOCK of the rq(c1)->lock scheduling in task -+ * -+ * Transitivity guarantees that B happens after A and C after B. -+ * Note: we only require RCpc transitivity. -+ * Note: the CPU doing B need not be c0 or c1 -+ * -+ * Example: -+ * -+ * CPU0 CPU1 CPU2 -+ * -+ * LOCK rq(0)->lock -+ * sched-out X -+ * sched-in Y -+ * UNLOCK rq(0)->lock -+ * -+ * LOCK rq(0)->lock // orders against CPU0 -+ * dequeue X -+ * UNLOCK rq(0)->lock -+ * -+ * LOCK rq(1)->lock -+ * enqueue X -+ * UNLOCK rq(1)->lock -+ * -+ * LOCK rq(1)->lock // orders against CPU2 -+ * sched-out Z -+ * sched-in X -+ * UNLOCK rq(1)->lock -+ * -+ * -+ * BLOCKING -- aka. SLEEP + WAKEUP -+ * -+ * For blocking we (obviously) need to provide the same guarantee as for -+ * migration. However the means are completely different as there is no lock -+ * chain to provide order. Instead we do: -+ * -+ * 1) smp_store_release(X->on_cpu, 0) -- finish_task() -+ * 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up() -+ * -+ * Example: -+ * -+ * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule) -+ * -+ * LOCK rq(0)->lock LOCK X->pi_lock -+ * dequeue X -+ * sched-out X -+ * smp_store_release(X->on_cpu, 0); -+ * -+ * smp_cond_load_acquire(&X->on_cpu, !VAL); -+ * X->state = WAKING -+ * set_task_cpu(X,2) -+ * -+ * LOCK rq(2)->lock -+ * enqueue X -+ * X->state = RUNNING -+ * UNLOCK rq(2)->lock -+ * -+ * LOCK rq(2)->lock // orders against CPU1 -+ * sched-out Z -+ * sched-in X -+ * UNLOCK rq(2)->lock -+ * -+ * UNLOCK X->pi_lock -+ * UNLOCK rq(0)->lock -+ * -+ * -+ * However; for wakeups there is a second guarantee we must provide, namely we -+ * must observe the state that lead to our wakeup. 
That is, not only must our -+ * task observe its own prior state, it must also observe the stores prior to -+ * its wakeup. -+ * -+ * This means that any means of doing remote wakeups must order the CPU doing -+ * the wakeup against the CPU the task is going to end up running on. This, -+ * however, is already required for the regular Program-Order guarantee above, -+ * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire). -+ * -+ */ -+ -+/** -+ * try_to_wake_up - wake up a thread -+ * @p: the thread to be awakened -+ * @state: the mask of task states that can be woken -+ * @wake_flags: wake modifier flags (WF_*) -+ * -+ * Conceptually does: -+ * -+ * If (@state & @p->state) @p->state = TASK_RUNNING. -+ * -+ * If the task was not queued/runnable, also place it back on a runqueue. -+ * -+ * This function is atomic against schedule() which would dequeue the task. -+ * -+ * It issues a full memory barrier before accessing @p->state, see the comment -+ * with set_current_state(). -+ * -+ * Uses p->pi_lock to serialize against concurrent wake-ups. -+ * -+ * Relies on p->pi_lock stabilizing: -+ * - p->sched_class -+ * - p->cpus_ptr -+ * - p->sched_task_group -+ * in order to do migration, see its use of select_task_rq()/set_task_cpu(). -+ * -+ * Tries really hard to only take one task_rq(p)->lock for performance. -+ * Takes rq->lock in: -+ * - ttwu_runnable() -- old rq, unavoidable, see comment there; -+ * - ttwu_queue() -- new rq, for enqueue of the task; -+ * - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us. -+ * -+ * As a consequence we race really badly with just about everything. See the -+ * many memory barriers and their comments for details. -+ * -+ * Return: %true if @p->state changes (an actual wakeup was done), -+ * %false otherwise. 
-+ */ -+static int try_to_wake_up(struct task_struct *p, unsigned int state, -+ int wake_flags) -+{ -+ unsigned long flags; -+ int cpu, success = 0; -+ -+ preempt_disable(); -+ if (p == current) { -+ /* -+ * We're waking current, this means 'p->on_rq' and 'task_cpu(p) -+ * == smp_processor_id()'. Together this means we can special -+ * case the whole 'p->on_rq && ttwu_runnable()' case below -+ * without taking any locks. -+ * -+ * In particular: -+ * - we rely on Program-Order guarantees for all the ordering, -+ * - we're serialized against set_special_state() by virtue of -+ * it disabling IRQs (this allows not taking ->pi_lock). -+ */ -+ if (!(p->state & state)) -+ goto out; -+ -+ success = 1; -+ trace_sched_waking(p); -+ p->state = TASK_RUNNING; -+ trace_sched_wakeup(p); -+ goto out; -+ } -+ -+ /* -+ * If we are going to wake up a thread waiting for CONDITION we -+ * need to ensure that CONDITION=1 done by the caller can not be -+ * reordered with p->state check below. This pairs with smp_store_mb() -+ * in set_current_state() that the waiting thread does. -+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ smp_mb__after_spinlock(); -+ if (!(p->state & state)) -+ goto unlock; -+ -+ trace_sched_waking(p); -+ -+ /* We're going to change ->state: */ -+ success = 1; -+ -+ /* -+ * Ensure we load p->on_rq _after_ p->state, otherwise it would -+ * be possible to, falsely, observe p->on_rq == 0 and get stuck -+ * in smp_cond_load_acquire() below. -+ * -+ * sched_ttwu_pending() try_to_wake_up() -+ * STORE p->on_rq = 1 LOAD p->state -+ * UNLOCK rq->lock -+ * -+ * __schedule() (switch to task 'p') -+ * LOCK rq->lock smp_rmb(); -+ * smp_mb__after_spinlock(); -+ * UNLOCK rq->lock -+ * -+ * [task p] -+ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq -+ * -+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in -+ * __schedule(). See the comment for smp_mb__after_spinlock(). -+ * -+ * A similar smb_rmb() lives in try_invoke_on_locked_down_task(). 
-+ */ -+ smp_rmb(); -+ if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) -+ goto unlock; -+ -+ if (p->in_iowait) { -+ delayacct_blkio_end(p); -+ atomic_dec(&task_rq(p)->nr_iowait); -+ } -+ -+#ifdef CONFIG_SMP -+ /* -+ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be -+ * possible to, falsely, observe p->on_cpu == 0. -+ * -+ * One must be running (->on_cpu == 1) in order to remove oneself -+ * from the runqueue. -+ * -+ * __schedule() (switch to task 'p') try_to_wake_up() -+ * STORE p->on_cpu = 1 LOAD p->on_rq -+ * UNLOCK rq->lock -+ * -+ * __schedule() (put 'p' to sleep) -+ * LOCK rq->lock smp_rmb(); -+ * smp_mb__after_spinlock(); -+ * STORE p->on_rq = 0 LOAD p->on_cpu -+ * -+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in -+ * __schedule(). See the comment for smp_mb__after_spinlock(). -+ * -+ * Form a control-dep-acquire with p->on_rq == 0 above, to ensure -+ * schedule()'s deactivate_task() has 'happened' and p will no longer -+ * care about it's own p->state. See the comment in __schedule(). -+ */ -+ smp_acquire__after_ctrl_dep(); -+ -+ /* -+ * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq -+ * == 0), which means we need to do an enqueue, change p->state to -+ * TASK_WAKING such that we can unlock p->pi_lock before doing the -+ * enqueue, such as ttwu_queue_wakelist(). -+ */ -+ p->state = TASK_WAKING; -+ -+ /* -+ * If the owning (remote) CPU is still in the middle of schedule() with -+ * this task as prev, considering queueing p on the remote CPUs wake_list -+ * which potentially sends an IPI instead of spinning on p->on_cpu to -+ * let the waker make forward progress. This is safe because IRQs are -+ * disabled and the IPI will deliver after on_cpu is cleared. 
-+ * -+ * Ensure we load task_cpu(p) after p->on_cpu: -+ * -+ * set_task_cpu(p, cpu); -+ * STORE p->cpu = @cpu -+ * __schedule() (switch to task 'p') -+ * LOCK rq->lock -+ * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu) -+ * STORE p->on_cpu = 1 LOAD p->cpu -+ * -+ * to ensure we observe the correct CPU on which the task is currently -+ * scheduling. -+ */ -+ if (smp_load_acquire(&p->on_cpu) && -+ ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU)) -+ goto unlock; -+ -+ /* -+ * If the owning (remote) CPU is still in the middle of schedule() with -+ * this task as prev, wait until its done referencing the task. -+ * -+ * Pairs with the smp_store_release() in finish_task(). -+ * -+ * This ensures that tasks getting woken will be fully ordered against -+ * their previous state and preserve Program Order. -+ */ -+ smp_cond_load_acquire(&p->on_cpu, !VAL); -+ -+ sched_task_ttwu(p); -+ -+ cpu = select_task_rq(p, this_rq()); -+ -+ if (cpu != task_cpu(p)) { -+ wake_flags |= WF_MIGRATED; -+ psi_ttwu_dequeue(p); -+ set_task_cpu(p, cpu); -+ } -+#else -+ cpu = task_cpu(p); -+#endif /* CONFIG_SMP */ -+ -+ ttwu_queue(p, cpu, wake_flags); -+unlock: -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+out: -+ if (success) -+ ttwu_stat(p, task_cpu(p), wake_flags); -+ preempt_enable(); -+ -+ return success; -+} -+ -+/** -+ * try_invoke_on_locked_down_task - Invoke a function on task in fixed state -+ * @p: Process for which the function is to be invoked. -+ * @func: Function to invoke. -+ * @arg: Argument to function. -+ * -+ * If the specified task can be quickly locked into a definite state -+ * (either sleeping or on a given runqueue), arrange to keep it in that -+ * state while invoking @func(@arg). This function can use ->on_rq and -+ * task_curr() to work out what the state is, if required. Given that -+ * @func can be invoked with a runqueue lock held, it had better be quite -+ * lightweight. 
-+ * -+ * Returns: -+ * @false if the task slipped out from under the locks. -+ * @true if the task was locked onto a runqueue or is sleeping. -+ * However, @func can override this by returning @false. -+ */ -+bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg) -+{ -+ bool ret = false; -+ struct rq_flags rf; -+ struct rq *rq; -+ -+ lockdep_assert_irqs_enabled(); -+ raw_spin_lock_irq(&p->pi_lock); -+ if (p->on_rq) { -+ rq = __task_rq_lock(p, &rf); -+ if (task_rq(p) == rq) -+ ret = func(p, arg); -+ __task_rq_unlock(rq, &rf); -+ } else { -+ switch (p->state) { -+ case TASK_RUNNING: -+ case TASK_WAKING: -+ break; -+ default: -+ smp_rmb(); // See smp_rmb() comment in try_to_wake_up(). -+ if (!p->on_rq) -+ ret = func(p, arg); -+ } -+ } -+ raw_spin_unlock_irq(&p->pi_lock); -+ return ret; -+} -+ -+/** -+ * wake_up_process - Wake up a specific process -+ * @p: The process to be woken up. -+ * -+ * Attempt to wake up the nominated process and move it to the set of runnable -+ * processes. -+ * -+ * Return: 1 if the process was woken up, 0 if it was already running. -+ * -+ * This function executes a full memory barrier before accessing the task state. -+ */ -+int wake_up_process(struct task_struct *p) -+{ -+ return try_to_wake_up(p, TASK_NORMAL, 0); -+} -+EXPORT_SYMBOL(wake_up_process); -+ -+int wake_up_state(struct task_struct *p, unsigned int state) -+{ -+ return try_to_wake_up(p, state, 0); -+} -+ -+/* -+ * Perform scheduler related setup for a newly forked process p. -+ * p is forked by current. 
-+ * -+ * __sched_fork() is basic setup used by init_idle() too: -+ */ -+static inline void __sched_fork(unsigned long clone_flags, struct task_struct *p) -+{ -+ p->on_rq = 0; -+ p->on_cpu = 0; -+ p->utime = 0; -+ p->stime = 0; -+ p->sched_time = 0; -+ -+#ifdef CONFIG_PREEMPT_NOTIFIERS -+ INIT_HLIST_HEAD(&p->preempt_notifiers); -+#endif -+ -+#ifdef CONFIG_COMPACTION -+ p->capture_control = NULL; -+#endif -+#ifdef CONFIG_SMP -+ p->wake_entry.u_flags = CSD_TYPE_TTWU; -+#endif -+} -+ -+/* -+ * fork()/clone()-time setup: -+ */ -+int sched_fork(unsigned long clone_flags, struct task_struct *p) -+{ -+ unsigned long flags; -+ struct rq *rq; -+ -+ __sched_fork(clone_flags, p); -+ /* -+ * We mark the process as NEW here. This guarantees that -+ * nobody will actually run it, and a signal or other external -+ * event cannot wake it up and insert it on the runqueue either. -+ */ -+ p->state = TASK_NEW; -+ -+ /* -+ * Make sure we do not leak PI boosting priority to the child. -+ */ -+ p->prio = current->normal_prio; -+ -+ /* -+ * Revert to default priority/policy on fork if requested. -+ */ -+ if (unlikely(p->sched_reset_on_fork)) { -+ if (task_has_rt_policy(p)) { -+ p->policy = SCHED_NORMAL; -+ p->static_prio = NICE_TO_PRIO(0); -+ p->rt_priority = 0; -+ } else if (PRIO_TO_NICE(p->static_prio) < 0) -+ p->static_prio = NICE_TO_PRIO(0); -+ -+ p->prio = p->normal_prio = normal_prio(p); -+ -+ /* -+ * We don't need the reset flag anymore after the fork. It has -+ * fulfilled its duty: -+ */ -+ p->sched_reset_on_fork = 0; -+ } -+ -+ /* -+ * The child is not yet in the pid-hash so no cgroup attach races, -+ * and the cgroup is pinned to this child due to cgroup_fork() -+ * is ran before sched_fork(). -+ * -+ * Silence PROVE_RCU. -+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ /* -+ * Share the timeslice between parent and child, thus the -+ * total amount of pending timeslices in the system doesn't change, -+ * resulting in more scheduling fairness. 
-+ */ -+ rq = this_rq(); -+ raw_spin_lock(&rq->lock); -+ -+ rq->curr->time_slice /= 2; -+ p->time_slice = rq->curr->time_slice; -+#ifdef CONFIG_SCHED_HRTICK -+ hrtick_start(rq, rq->curr->time_slice); -+#endif -+ -+ if (p->time_slice < RESCHED_NS) { -+ p->time_slice = sched_timeslice_ns; -+ resched_curr(rq); -+ } -+ sched_task_fork(p, rq); -+ raw_spin_unlock(&rq->lock); -+ -+ rseq_migrate(p); -+ /* -+ * We're setting the CPU for the first time, we don't migrate, -+ * so use __set_task_cpu(). -+ */ -+ __set_task_cpu(p, cpu_of(rq)); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+#ifdef CONFIG_SCHED_INFO -+ if (unlikely(sched_info_on())) -+ memset(&p->sched_info, 0, sizeof(p->sched_info)); -+#endif -+ init_task_preempt_count(p); -+ -+ return 0; -+} -+ -+void sched_post_fork(struct task_struct *p) {} -+ -+#ifdef CONFIG_SCHEDSTATS -+ -+DEFINE_STATIC_KEY_FALSE(sched_schedstats); -+static bool __initdata __sched_schedstats = false; -+ -+static void set_schedstats(bool enabled) -+{ -+ if (enabled) -+ static_branch_enable(&sched_schedstats); -+ else -+ static_branch_disable(&sched_schedstats); -+} -+ -+void force_schedstat_enabled(void) -+{ -+ if (!schedstat_enabled()) { -+ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); -+ static_branch_enable(&sched_schedstats); -+ } -+} -+ -+static int __init setup_schedstats(char *str) -+{ -+ int ret = 0; -+ if (!str) -+ goto out; -+ -+ /* -+ * This code is called before jump labels have been set up, so we can't -+ * change the static branch directly just yet. Instead set a temporary -+ * variable so init_schedstats() can do it later. 
-+ */ -+ if (!strcmp(str, "enable")) { -+ __sched_schedstats = true; -+ ret = 1; -+ } else if (!strcmp(str, "disable")) { -+ __sched_schedstats = false; -+ ret = 1; -+ } -+out: -+ if (!ret) -+ pr_warn("Unable to parse schedstats=\n"); -+ -+ return ret; -+} -+__setup("schedstats=", setup_schedstats); -+ -+static void __init init_schedstats(void) -+{ -+ set_schedstats(__sched_schedstats); -+} -+ -+#ifdef CONFIG_PROC_SYSCTL -+int sysctl_schedstats(struct ctl_table *table, int write, -+ void __user *buffer, size_t *lenp, loff_t *ppos) -+{ -+ struct ctl_table t; -+ int err; -+ int state = static_branch_likely(&sched_schedstats); -+ -+ if (write && !capable(CAP_SYS_ADMIN)) -+ return -EPERM; -+ -+ t = *table; -+ t.data = &state; -+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); -+ if (err < 0) -+ return err; -+ if (write) -+ set_schedstats(state); -+ return err; -+} -+#endif /* CONFIG_PROC_SYSCTL */ -+#else /* !CONFIG_SCHEDSTATS */ -+static inline void init_schedstats(void) {} -+#endif /* CONFIG_SCHEDSTATS */ -+ -+/* -+ * wake_up_new_task - wake up a newly created task for the first time. -+ * -+ * This function will do some initial scheduler statistics housekeeping -+ * that must be done for every newly created context, then puts the task -+ * on the runqueue and wakes it. -+ */ -+void wake_up_new_task(struct task_struct *p) -+{ -+ unsigned long flags; -+ struct rq *rq; -+ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ -+ p->state = TASK_RUNNING; -+ -+ rq = cpu_rq(select_task_rq(p, this_rq())); -+#ifdef CONFIG_SMP -+ rseq_migrate(p); -+ /* -+ * Fork balancing, do it here and not earlier because: -+ * - cpus_ptr can change in the fork path -+ * - any previously selected CPU might disappear through hotplug -+ * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, -+ * as we're not fully set-up yet. 
-+ */ -+ __set_task_cpu(p, cpu_of(rq)); -+#endif -+ -+ raw_spin_lock(&rq->lock); -+ -+ update_rq_clock(rq); -+ activate_task(p, rq); -+ trace_sched_wakeup_new(p); -+ check_preempt_curr(rq); -+ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+} -+ -+#ifdef CONFIG_PREEMPT_NOTIFIERS -+ -+static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); -+ -+void preempt_notifier_inc(void) -+{ -+ static_branch_inc(&preempt_notifier_key); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_inc); -+ -+void preempt_notifier_dec(void) -+{ -+ static_branch_dec(&preempt_notifier_key); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_dec); -+ -+/** -+ * preempt_notifier_register - tell me when current is being preempted & rescheduled -+ * @notifier: notifier struct to register -+ */ -+void preempt_notifier_register(struct preempt_notifier *notifier) -+{ -+ if (!static_branch_unlikely(&preempt_notifier_key)) -+ WARN(1, "registering preempt_notifier while notifiers disabled\n"); -+ -+ hlist_add_head(&notifier->link, &current->preempt_notifiers); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_register); -+ -+/** -+ * preempt_notifier_unregister - no longer interested in preemption notifications -+ * @notifier: notifier struct to unregister -+ * -+ * This is *not* safe to call from within a preemption notifier.
-+ */ -+void preempt_notifier_unregister(struct preempt_notifier *notifier) -+{ -+ hlist_del(&notifier->link); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_unregister); -+ -+static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+ struct preempt_notifier *notifier; -+ -+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) -+ notifier->ops->sched_in(notifier, raw_smp_processor_id()); -+} -+ -+static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+ if (static_branch_unlikely(&preempt_notifier_key)) -+ __fire_sched_in_preempt_notifiers(curr); -+} -+ -+static void -+__fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+ struct preempt_notifier *notifier; -+ -+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) -+ notifier->ops->sched_out(notifier, next); -+} -+ -+static __always_inline void -+fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+ if (static_branch_unlikely(&preempt_notifier_key)) -+ __fire_sched_out_preempt_notifiers(curr, next); -+} -+ -+#else /* !CONFIG_PREEMPT_NOTIFIERS */ -+ -+static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+} -+ -+static inline void -+fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+} -+ -+#endif /* CONFIG_PREEMPT_NOTIFIERS */ -+ -+static inline void prepare_task(struct task_struct *next) -+{ -+ /* -+ * Claim the task as running, we do this before switching to it -+ * such that any running task will have this set. -+ * -+ * See the ttwu() WF_ON_CPU case and its ordering comment. -+ */ -+ WRITE_ONCE(next->on_cpu, 1); -+} -+ -+static inline void finish_task(struct task_struct *prev) -+{ -+#ifdef CONFIG_SMP -+ /* -+ * This must be the very last reference to @prev from this CPU. After -+ * p->on_cpu is cleared, the task can be moved to a different CPU.
We -+ * must ensure this doesn't happen until the switch is completely -+ * finished. -+ * -+ * In particular, the load of prev->state in finish_task_switch() must -+ * happen before this. -+ * -+ * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). -+ */ -+ smp_store_release(&prev->on_cpu, 0); -+#else -+ prev->on_cpu = 0; -+#endif -+} -+ -+static inline void -+prepare_lock_switch(struct rq *rq, struct task_struct *next) -+{ -+ /* -+ * Since the runqueue lock will be released by the next -+ * task (which is an invalid locking op but in the case -+ * of the scheduler it's an obvious special-case), so we -+ * do an early lockdep release here: -+ */ -+ spin_release(&rq->lock.dep_map, _THIS_IP_); -+#ifdef CONFIG_DEBUG_SPINLOCK -+ /* this is a valid case when another task releases the spinlock */ -+ rq->lock.owner = next; -+#endif -+} -+ -+static inline void finish_lock_switch(struct rq *rq) -+{ -+ /* -+ * If we are tracking spinlock dependencies then we have to -+ * fix up the runqueue lock - which gets 'carried over' from -+ * prev into current: -+ */ -+ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); -+ raw_spin_unlock_irq(&rq->lock); -+} -+ -+/** -+ * prepare_task_switch - prepare to switch tasks -+ * @rq: the runqueue preparing to switch -+ * @next: the task we are going to switch to. -+ * -+ * This is called with the rq lock held and interrupts off. It must -+ * be paired with a subsequent finish_task_switch after the context -+ * switch. -+ * -+ * prepare_task_switch sets up locking and calls architecture specific -+ * hooks. 
-+ */ -+static inline void -+prepare_task_switch(struct rq *rq, struct task_struct *prev, -+ struct task_struct *next) -+{ -+ kcov_prepare_switch(prev); -+ sched_info_switch(rq, prev, next); -+ perf_event_task_sched_out(prev, next); -+ rseq_preempt(prev); -+ fire_sched_out_preempt_notifiers(prev, next); -+ prepare_task(next); -+ prepare_arch_switch(next); -+} -+ -+/** -+ * finish_task_switch - clean up after a task-switch -+ * @rq: runqueue associated with task-switch -+ * @prev: the thread we just switched away from. -+ * -+ * finish_task_switch must be called after the context switch, paired -+ * with a prepare_task_switch call before the context switch. -+ * finish_task_switch will reconcile locking set up by prepare_task_switch, -+ * and do any other architecture-specific cleanup actions. -+ * -+ * Note that we may have delayed dropping an mm in context_switch(). If -+ * so, we finish that here outside of the runqueue lock. (Doing it -+ * with the lock held can cause deadlocks; see schedule() for -+ * details.) -+ * -+ * The context switch have flipped the stack from under us and restored the -+ * local variables which were saved when this task called schedule() in the -+ * past. prev == current is still correct but we need to recalculate this_rq -+ * because prev may have moved to another CPU. -+ */ -+static struct rq *finish_task_switch(struct task_struct *prev) -+ __releases(rq->lock) -+{ -+ struct rq *rq = this_rq(); -+ struct mm_struct *mm = rq->prev_mm; -+ long prev_state; -+ -+ /* -+ * The previous task will have left us with a preempt_count of 2 -+ * because it left us after: -+ * -+ * schedule() -+ * preempt_disable(); // 1 -+ * __schedule() -+ * raw_spin_lock_irq(&rq->lock) // 2 -+ * -+ * Also, see FORK_PREEMPT_COUNT. 
-+ */ -+ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, -+ "corrupted preempt_count: %s/%d/0x%x\n", -+ current->comm, current->pid, preempt_count())) -+ preempt_count_set(FORK_PREEMPT_COUNT); -+ -+ rq->prev_mm = NULL; -+ -+ /* -+ * A task struct has one reference for the use as "current". -+ * If a task dies, then it sets TASK_DEAD in tsk->state and calls -+ * schedule one last time. The schedule call will never return, and -+ * the scheduled task must drop that reference. -+ * -+ * We must observe prev->state before clearing prev->on_cpu (in -+ * finish_task), otherwise a concurrent wakeup can get prev -+ * running on another CPU and we could rave with its RUNNING -> DEAD -+ * transition, resulting in a double drop. -+ */ -+ prev_state = prev->state; -+ vtime_task_switch(prev); -+ perf_event_task_sched_in(prev, current); -+ finish_task(prev); -+ finish_lock_switch(rq); -+ finish_arch_post_lock_switch(); -+ kcov_finish_switch(current); -+ -+ fire_sched_in_preempt_notifiers(current); -+ /* -+ * When switching through a kernel thread, the loop in -+ * membarrier_{private,global}_expedited() may have observed that -+ * kernel thread and not issued an IPI. It is therefore possible to -+ * schedule between user->kernel->user threads without passing though -+ * switch_mm(). Membarrier requires a barrier after storing to -+ * rq->curr, before returning to userspace, so provide them here: -+ * -+ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly -+ * provided by mmdrop(), -+ * - a sync_core for SYNC_CORE. -+ */ -+ if (mm) { -+ membarrier_mm_sync_core_before_usermode(mm); -+ mmdrop(mm); -+ } -+ if (unlikely(prev_state == TASK_DEAD)) { -+ /* -+ * Remove function-return probe instances associated with this -+ * task and put them back on the free list. -+ */ -+ kprobe_flush_task(prev); -+ -+ /* Task is done with its stack. 
*/ -+ put_task_stack(prev); -+ -+ put_task_struct_rcu_user(prev); -+ } -+ -+ tick_nohz_task_switch(); -+ return rq; -+} -+ -+/** -+ * schedule_tail - first thing a freshly forked thread must call. -+ * @prev: the thread we just switched away from. -+ */ -+asmlinkage __visible void schedule_tail(struct task_struct *prev) -+ __releases(rq->lock) -+{ -+ struct rq *rq; -+ -+ /* -+ * New tasks start with FORK_PREEMPT_COUNT, see there and -+ * finish_task_switch() for details. -+ * -+ * finish_task_switch() will drop rq->lock() and lower preempt_count -+ * and the preempt_enable() will end up enabling preemption (on -+ * PREEMPT_COUNT kernels). -+ */ -+ -+ rq = finish_task_switch(prev); -+ preempt_enable(); -+ -+ if (current->set_child_tid) -+ put_user(task_pid_vnr(current), current->set_child_tid); -+ -+ calculate_sigpending(); -+} -+ -+/* -+ * context_switch - switch to the new MM and the new thread's register state. -+ */ -+static __always_inline struct rq * -+context_switch(struct rq *rq, struct task_struct *prev, -+ struct task_struct *next) -+{ -+ prepare_task_switch(rq, prev, next); -+ -+ /* -+ * For paravirt, this is coupled with an exit in switch_to to -+ * combine the page table reload and the switch backend into -+ * one hypercall. -+ */ -+ arch_start_context_switch(prev); -+ -+ /* -+ * kernel -> kernel lazy + transfer active -+ * user -> kernel lazy + mmgrab() active -+ * -+ * kernel -> user switch + mmdrop() active -+ * user -> user switch -+ */ -+ if (!next->mm) { // to kernel -+ enter_lazy_tlb(prev->active_mm, next); -+ -+ next->active_mm = prev->active_mm; -+ if (prev->mm) // from user -+ mmgrab(prev->active_mm); -+ else -+ prev->active_mm = NULL; -+ } else { // to user -+ membarrier_switch_mm(rq, prev->active_mm, next->mm); -+ /* -+ * sys_membarrier() requires an smp_mb() between setting -+ * rq->curr / membarrier_switch_mm() and returning to userspace. 
-+ * -+ * The below provides this either through switch_mm(), or in -+ * case 'prev->active_mm == next->mm' through -+ * finish_task_switch()'s mmdrop(). -+ */ -+ switch_mm_irqs_off(prev->active_mm, next->mm, next); -+ -+ if (!prev->mm) { // from kernel -+ /* will mmdrop() in finish_task_switch(). */ -+ rq->prev_mm = prev->active_mm; -+ prev->active_mm = NULL; -+ } -+ } -+ -+ prepare_lock_switch(rq, next); -+ -+ /* Here we just switch the register state and the stack. */ -+ switch_to(prev, next, prev); -+ barrier(); -+ -+ return finish_task_switch(prev); -+} -+ -+/* -+ * nr_running, nr_uninterruptible and nr_context_switches: -+ * -+ * externally visible scheduler statistics: current number of runnable -+ * threads, total number of context switches performed since bootup. -+ */ -+unsigned long nr_running(void) -+{ -+ unsigned long i, sum = 0; -+ -+ for_each_online_cpu(i) -+ sum += cpu_rq(i)->nr_running; -+ -+ return sum; -+} -+ -+/* -+ * Check if only the current task is running on the CPU. -+ * -+ * Caution: this function does not check that the caller has disabled -+ * preemption, thus the result might have a time-of-check-to-time-of-use -+ * race. The caller is responsible to use it correctly, for example: -+ * -+ * - from a non-preemptible section (of course) -+ * -+ * - from a thread that is bound to a single CPU -+ * -+ * - in a loop with very short iterations (e.g. a polling loop) -+ */ -+bool single_task_running(void) -+{ -+ return raw_rq()->nr_running == 1; -+} -+EXPORT_SYMBOL(single_task_running); -+ -+unsigned long long nr_context_switches(void) -+{ -+ int i; -+ unsigned long long sum = 0; -+ -+ for_each_possible_cpu(i) -+ sum += cpu_rq(i)->nr_switches; -+ -+ return sum; -+} -+ -+/* -+ * Consumers of these two interfaces, like for example the cpuidle menu -+ * governor, are using nonsensical data. Preferring shallow idle state selection -+ * for a CPU that has IO-wait which might not even end up running the task when -+ * it does become runnable. 
-+ */ -+ -+unsigned long nr_iowait_cpu(int cpu) -+{ -+ return atomic_read(&cpu_rq(cpu)->nr_iowait); -+} -+ -+/* -+ * IO-wait accounting, and how its mostly bollocks (on SMP). -+ * -+ * The idea behind IO-wait account is to account the idle time that we could -+ * have spend running if it were not for IO. That is, if we were to improve the -+ * storage performance, we'd have a proportional reduction in IO-wait time. -+ * -+ * This all works nicely on UP, where, when a task blocks on IO, we account -+ * idle time as IO-wait, because if the storage were faster, it could've been -+ * running and we'd not be idle. -+ * -+ * This has been extended to SMP, by doing the same for each CPU. This however -+ * is broken. -+ * -+ * Imagine for instance the case where two tasks block on one CPU, only the one -+ * CPU will have IO-wait accounted, while the other has regular idle. Even -+ * though, if the storage were faster, both could've ran at the same time, -+ * utilising both CPUs. -+ * -+ * This means, that when looking globally, the current IO-wait accounting on -+ * SMP is a lower bound, by reason of under accounting. -+ * -+ * Worse, since the numbers are provided per CPU, they are sometimes -+ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly -+ * associated with any one particular CPU, it can wake to another CPU than it -+ * blocked on. This means the per CPU IO-wait number is meaningless. -+ * -+ * Task CPU affinities can make all that even more 'interesting'. -+ */ -+ -+unsigned long nr_iowait(void) -+{ -+ unsigned long i, sum = 0; -+ -+ for_each_possible_cpu(i) -+ sum += nr_iowait_cpu(i); -+ -+ return sum; -+} -+ -+#ifdef CONFIG_SMP -+ -+/* -+ * sched_exec - execve() is a valuable balancing opportunity, because at -+ * this point the task has the smallest effective memory and cache -+ * footprint. 
-+ */ -+void sched_exec(void) -+{ -+ struct task_struct *p = current; -+ unsigned long flags; -+ int dest_cpu; -+ struct rq *rq; -+ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ rq = this_rq(); -+ -+ if (rq != task_rq(p) || rq->nr_running < 2) -+ goto unlock; -+ -+ dest_cpu = select_task_rq(p, task_rq(p)); -+ if (dest_cpu == smp_processor_id()) -+ goto unlock; -+ -+ if (likely(cpu_active(dest_cpu))) { -+ struct migration_arg arg = { p, dest_cpu }; -+ -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); -+ return; -+ } -+unlock: -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+} -+ -+#endif -+ -+DEFINE_PER_CPU(struct kernel_stat, kstat); -+DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); -+ -+EXPORT_PER_CPU_SYMBOL(kstat); -+EXPORT_PER_CPU_SYMBOL(kernel_cpustat); -+ -+static inline void update_curr(struct rq *rq, struct task_struct *p) -+{ -+ s64 ns = rq->clock_task - p->last_ran; -+ -+ p->sched_time += ns; -+ account_group_exec_runtime(p, ns); -+ -+ p->time_slice -= ns; -+ p->last_ran = rq->clock_task; -+} -+ -+/* -+ * Return accounted runtime for the task. -+ * Return separately the current's pending runtime that have not been -+ * accounted yet. -+ */ -+unsigned long long task_sched_runtime(struct task_struct *p) -+{ -+ unsigned long flags; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ u64 ns; -+ -+#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) -+ /* -+ * 64-bit doesn't need locks to atomically read a 64-bit value. -+ * So we have a optimization chance when the task's delta_exec is 0. -+ * Reading ->on_cpu is racy, but this is ok. -+ * -+ * If we race with it leaving CPU, we'll take a lock. So we're correct. -+ * If we race with it entering CPU, unaccounted time is 0. This is -+ * indistinguishable from the read occurring a few cycles earlier. -+ * If we see ->on_cpu without ->on_rq, the task is leaving, and has -+ * been accounted, so we're correct here as well. 
-+ */ -+ if (!p->on_cpu || !task_on_rq_queued(p)) -+ return tsk_seruntime(p); -+#endif -+ -+ rq = task_access_lock_irqsave(p, &lock, &flags); -+ /* -+ * Must be ->curr _and_ ->on_rq. If dequeued, we would -+ * project cycles that may never be accounted to this -+ * thread, breaking clock_gettime(). -+ */ -+ if (p == rq->curr && task_on_rq_queued(p)) { -+ update_rq_clock(rq); -+ update_curr(rq, p); -+ } -+ ns = tsk_seruntime(p); -+ task_access_unlock_irqrestore(p, lock, &flags); -+ -+ return ns; -+} -+ -+/* This manages tasks that have run out of timeslice during a scheduler_tick */ -+static inline void scheduler_task_tick(struct rq *rq) -+{ -+ struct task_struct *p = rq->curr; -+ -+ if (is_idle_task(p)) -+ return; -+ -+ update_curr(rq, p); -+ cpufreq_update_util(rq, 0); -+ -+ /* -+ * Tasks have less than RESCHED_NS of time slice left they will be -+ * rescheduled. -+ */ -+ if (p->time_slice >= RESCHED_NS) -+ return; -+ set_tsk_need_resched(p); -+ set_preempt_need_resched(); -+} -+ -+/* -+ * This function gets called by the timer code, with HZ frequency. -+ * We call it with interrupts disabled. 
-+ */ -+void scheduler_tick(void) -+{ -+ int cpu __maybe_unused = smp_processor_id(); -+ struct rq *rq = cpu_rq(cpu); -+ -+ arch_scale_freq_tick(); -+ sched_clock_tick(); -+ -+ raw_spin_lock(&rq->lock); -+ update_rq_clock(rq); -+ -+ scheduler_task_tick(rq); -+ calc_global_load_tick(rq); -+ psi_task_tick(rq); -+ -+ rq->last_tick = rq->clock; -+ raw_spin_unlock(&rq->lock); -+ -+ perf_event_task_tick(); -+} -+ -+#ifdef CONFIG_SCHED_SMT -+static inline int active_load_balance_cpu_stop(void *data) -+{ -+ struct rq *rq = this_rq(); -+ struct task_struct *p = data; -+ cpumask_t tmp; -+ unsigned long flags; -+ -+ local_irq_save(flags); -+ -+ raw_spin_lock(&p->pi_lock); -+ raw_spin_lock(&rq->lock); -+ -+ rq->active_balance = 0; -+ /* _something_ may have changed the task, double check again */ -+ if (task_on_rq_queued(p) && task_rq(p) == rq && -+ cpumask_and(&tmp, p->cpus_ptr, &sched_sg_idle_mask)) { -+ int cpu = cpu_of(rq); -+ int dcpu = __best_mask_cpu(cpu, &tmp, -+ per_cpu(sched_cpu_llc_mask, cpu)); -+ rq = move_queued_task(rq, p, dcpu); -+ } -+ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock(&p->pi_lock); -+ -+ local_irq_restore(flags); -+ -+ return 0; -+} -+ -+/* sg_balance_trigger - trigger slibing group balance for @cpu */ -+static inline int sg_balance_trigger(const int cpu) -+{ -+ struct rq *rq= cpu_rq(cpu); -+ unsigned long flags; -+ struct task_struct *curr; -+ int res; -+ -+ if (!raw_spin_trylock_irqsave(&rq->lock, flags)) -+ return 0; -+ curr = rq->curr; -+ res = (!is_idle_task(curr)) && (1 == rq->nr_running) &&\ -+ cpumask_intersects(curr->cpus_ptr, &sched_sg_idle_mask) &&\ -+ (!rq->active_balance); -+ -+ if (res) -+ rq->active_balance = 1; -+ -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+ if (res) -+ stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, -+ curr, &rq->active_balance_work); -+ return res; -+} -+ -+/* -+ * sg_balance_check - slibing group balance check for run queue @rq -+ */ -+static inline void sg_balance_check(struct rq *rq) -+{ -+ 
cpumask_t chk; -+ int cpu; -+ -+ /* exit when no sg in idle */ -+ if (cpumask_empty(&sched_sg_idle_mask)) -+ return; -+ -+ cpu = cpu_of(rq); -+ /* -+ * Only cpu in slibing idle group will do the checking and then -+ * find potential cpus which can migrate the current running task -+ */ -+ if (cpumask_test_cpu(cpu, &sched_sg_idle_mask) && -+ cpumask_andnot(&chk, cpu_online_mask, &sched_rq_pending_mask) && -+ cpumask_andnot(&chk, &chk, &sched_rq_watermark[IDLE_WM])) { -+ int i, tried = 0; -+ -+ for_each_cpu_wrap(i, &chk, cpu) { -+ if (cpumask_subset(cpu_smt_mask(i), &chk)) { -+ if (sg_balance_trigger(i)) -+ return; -+ if (tried) -+ return; -+ tried++; -+ } -+ } -+ } -+} -+#endif /* CONFIG_SCHED_SMT */ -+ -+#ifdef CONFIG_NO_HZ_FULL -+ -+struct tick_work { -+ int cpu; -+ atomic_t state; -+ struct delayed_work work; -+}; -+/* Values for ->state, see diagram below. */ -+#define TICK_SCHED_REMOTE_OFFLINE 0 -+#define TICK_SCHED_REMOTE_OFFLINING 1 -+#define TICK_SCHED_REMOTE_RUNNING 2 -+ -+/* -+ * State diagram for ->state: -+ * -+ * -+ * TICK_SCHED_REMOTE_OFFLINE -+ * | ^ -+ * | | -+ * | | sched_tick_remote() -+ * | | -+ * | | -+ * +--TICK_SCHED_REMOTE_OFFLINING -+ * | ^ -+ * | | -+ * sched_tick_start() | | sched_tick_stop() -+ * | | -+ * V | -+ * TICK_SCHED_REMOTE_RUNNING -+ * -+ * -+ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() -+ * and sched_tick_start() are happy to leave the state in RUNNING. -+ */ -+ -+static struct tick_work __percpu *tick_work_cpu; -+ -+static void sched_tick_remote(struct work_struct *work) -+{ -+ struct delayed_work *dwork = to_delayed_work(work); -+ struct tick_work *twork = container_of(dwork, struct tick_work, work); -+ int cpu = twork->cpu; -+ struct rq *rq = cpu_rq(cpu); -+ struct task_struct *curr; -+ unsigned long flags; -+ u64 delta; -+ int os; -+ -+ /* -+ * Handle the tick only if it appears the remote CPU is running in full -+ * dynticks mode. 
The check is racy by nature, but missing a tick or -+ * having one too much is no big deal because the scheduler tick updates -+ * statistics and checks timeslices in a time-independent way, regardless -+ * of when exactly it is running. -+ */ -+ if (!tick_nohz_tick_stopped_cpu(cpu)) -+ goto out_requeue; -+ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ curr = rq->curr; -+ if (cpu_is_offline(cpu)) -+ goto out_unlock; -+ -+ update_rq_clock(rq); -+ if (!is_idle_task(curr)) { -+ /* -+ * Make sure the next tick runs within a reasonable -+ * amount of time. -+ */ -+ delta = rq_clock_task(rq) - curr->last_ran; -+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); -+ } -+ scheduler_task_tick(rq); -+ -+ calc_load_nohz_remote(rq); -+out_unlock: -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+out_requeue: -+ /* -+ * Run the remote tick once per second (1Hz). This arbitrary -+ * frequency is large enough to avoid overload but short enough -+ * to keep scheduler internal stats reasonably up to date. But -+ * first update state to reflect hotplug activity if required. 
-+ */ -+ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); -+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); -+ if (os == TICK_SCHED_REMOTE_RUNNING) -+ queue_delayed_work(system_unbound_wq, dwork, HZ); -+} -+ -+static void sched_tick_start(int cpu) -+{ -+ int os; -+ struct tick_work *twork; -+ -+ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) -+ return; -+ -+ WARN_ON_ONCE(!tick_work_cpu); -+ -+ twork = per_cpu_ptr(tick_work_cpu, cpu); -+ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); -+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); -+ if (os == TICK_SCHED_REMOTE_OFFLINE) { -+ twork->cpu = cpu; -+ INIT_DELAYED_WORK(&twork->work, sched_tick_remote); -+ queue_delayed_work(system_unbound_wq, &twork->work, HZ); -+ } -+} -+ -+#ifdef CONFIG_HOTPLUG_CPU -+static void sched_tick_stop(int cpu) -+{ -+ struct tick_work *twork; -+ -+ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) -+ return; -+ -+ WARN_ON_ONCE(!tick_work_cpu); -+ -+ twork = per_cpu_ptr(tick_work_cpu, cpu); -+ cancel_delayed_work_sync(&twork->work); -+} -+#endif /* CONFIG_HOTPLUG_CPU */ -+ -+int __init sched_tick_offload_init(void) -+{ -+ tick_work_cpu = alloc_percpu(struct tick_work); -+ BUG_ON(!tick_work_cpu); -+ return 0; -+} -+ -+#else /* !CONFIG_NO_HZ_FULL */ -+static inline void sched_tick_start(int cpu) { } -+static inline void sched_tick_stop(int cpu) { } -+#endif -+ -+#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ -+ defined(CONFIG_PREEMPT_TRACER)) -+/* -+ * If the value passed in is equal to the current preempt count -+ * then we just disabled preemption. Start timing the latency. -+ */ -+static inline void preempt_latency_start(int val) -+{ -+ if (preempt_count() == val) { -+ unsigned long ip = get_lock_parent_ip(); -+#ifdef CONFIG_DEBUG_PREEMPT -+ current->preempt_disable_ip = ip; -+#endif -+ trace_preempt_off(CALLER_ADDR0, ip); -+ } -+} -+ -+void preempt_count_add(int val) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Underflow? 
-+ */ -+ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) -+ return; -+#endif -+ __preempt_count_add(val); -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Spinlock count overflowing soon? -+ */ -+ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= -+ PREEMPT_MASK - 10); -+#endif -+ preempt_latency_start(val); -+} -+EXPORT_SYMBOL(preempt_count_add); -+NOKPROBE_SYMBOL(preempt_count_add); -+ -+/* -+ * If the value passed in equals to the current preempt count -+ * then we just enabled preemption. Stop timing the latency. -+ */ -+static inline void preempt_latency_stop(int val) -+{ -+ if (preempt_count() == val) -+ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); -+} -+ -+void preempt_count_sub(int val) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Underflow? -+ */ -+ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) -+ return; -+ /* -+ * Is the spinlock portion underflowing? -+ */ -+ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && -+ !(preempt_count() & PREEMPT_MASK))) -+ return; -+#endif -+ -+ preempt_latency_stop(val); -+ __preempt_count_sub(val); -+} -+EXPORT_SYMBOL(preempt_count_sub); -+NOKPROBE_SYMBOL(preempt_count_sub); -+ -+#else -+static inline void preempt_latency_start(int val) { } -+static inline void preempt_latency_stop(int val) { } -+#endif -+ -+static inline unsigned long get_preempt_disable_ip(struct task_struct *p) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ return p->preempt_disable_ip; -+#else -+ return 0; -+#endif -+} -+ -+/* -+ * Print scheduling while atomic bug: -+ */ -+static noinline void __schedule_bug(struct task_struct *prev) -+{ -+ /* Save this before calling printk(), since that will clobber it */ -+ unsigned long preempt_disable_ip = get_preempt_disable_ip(current); -+ -+ if (oops_in_progress) -+ return; -+ -+ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", -+ prev->comm, prev->pid, preempt_count()); -+ -+ debug_show_held_locks(prev); -+ print_modules(); -+ if (irqs_disabled()) -+ print_irqtrace_events(prev); -+ if 
(IS_ENABLED(CONFIG_DEBUG_PREEMPT) -+ && in_atomic_preempt_off()) { -+ pr_err("Preemption disabled at:"); -+ print_ip_sym(KERN_ERR, preempt_disable_ip); -+ } -+ if (panic_on_warn) -+ panic("scheduling while atomic\n"); -+ -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+ -+/* -+ * Various schedule()-time debugging checks and statistics: -+ */ -+static inline void schedule_debug(struct task_struct *prev, bool preempt) -+{ -+#ifdef CONFIG_SCHED_STACK_END_CHECK -+ if (task_stack_end_corrupted(prev)) -+ panic("corrupted stack end detected inside scheduler\n"); -+ -+ if (task_scs_end_corrupted(prev)) -+ panic("corrupted shadow stack detected inside scheduler\n"); -+#endif -+ -+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP -+ if (!preempt && prev->state && prev->non_block_count) { -+ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", -+ prev->comm, prev->pid, prev->non_block_count); -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+ } -+#endif -+ -+ if (unlikely(in_atomic_preempt_off())) { -+ __schedule_bug(prev); -+ preempt_count_set(PREEMPT_DISABLED); -+ } -+ rcu_sleep_check(); -+ -+ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); -+ -+ schedstat_inc(this_rq()->sched_count); -+} -+ -+/* -+ * Compile time debug macro -+ * #define ALT_SCHED_DEBUG -+ */ -+ -+#ifdef ALT_SCHED_DEBUG -+void alt_sched_debug(void) -+{ -+ printk(KERN_INFO "sched: pending: 0x%04lx, idle: 0x%04lx, sg_idle: 0x%04lx\n", -+ sched_rq_pending_mask.bits[0], -+ sched_rq_watermark[IDLE_WM].bits[0], -+ sched_sg_idle_mask.bits[0]); -+} -+#else -+inline void alt_sched_debug(void) {} -+#endif -+ -+#ifdef CONFIG_SMP -+ -+#define SCHED_RQ_NR_MIGRATION (32UL) -+/* -+ * Migrate pending tasks in @rq to @dest_cpu -+ * Will try to migrate mininal of half of @rq nr_running tasks and -+ * SCHED_RQ_NR_MIGRATION to @dest_cpu -+ */ -+static inline int -+migrate_pending_tasks(struct rq *rq, struct rq *dest_rq, const int dest_cpu) -+{ -+ struct task_struct *p, *skip = 
rq->curr; -+ int nr_migrated = 0; -+ int nr_tries = min(rq->nr_running / 2, SCHED_RQ_NR_MIGRATION); -+ -+ while (skip != rq->idle && nr_tries && -+ (p = sched_rq_next_task(skip, rq)) != rq->idle) { -+ skip = sched_rq_next_task(p, rq); -+ if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) { -+ __SCHED_DEQUEUE_TASK(p, rq, 0, ); -+ set_task_cpu(p, dest_cpu); -+ __SCHED_ENQUEUE_TASK(p, dest_rq, 0); -+ nr_migrated++; -+ } -+ nr_tries--; -+ } -+ -+ return nr_migrated; -+} -+ -+static inline int take_other_rq_tasks(struct rq *rq, int cpu) -+{ -+ struct cpumask *affinity_mask, *end_mask; -+ -+ if (unlikely(!rq->online)) -+ return 0; -+ -+ if (cpumask_empty(&sched_rq_pending_mask)) -+ return 0; -+ -+ affinity_mask = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); -+ end_mask = per_cpu(sched_cpu_affinity_end_mask, cpu); -+ do { -+ int i; -+ for_each_cpu_and(i, &sched_rq_pending_mask, affinity_mask) { -+ int nr_migrated; -+ struct rq *src_rq; -+ -+ src_rq = cpu_rq(i); -+ if (!do_raw_spin_trylock(&src_rq->lock)) -+ continue; -+ spin_acquire(&src_rq->lock.dep_map, -+ SINGLE_DEPTH_NESTING, 1, _RET_IP_); -+ -+ if ((nr_migrated = migrate_pending_tasks(src_rq, rq, cpu))) { -+ src_rq->nr_running -= nr_migrated; -+#ifdef CONFIG_SMP -+ if (src_rq->nr_running < 2) -+ cpumask_clear_cpu(i, &sched_rq_pending_mask); -+#endif -+ rq->nr_running += nr_migrated; -+#ifdef CONFIG_SMP -+ if (rq->nr_running > 1) -+ cpumask_set_cpu(cpu, &sched_rq_pending_mask); -+#endif -+ update_sched_rq_watermark(rq); -+ cpufreq_update_util(rq, 0); -+ -+ spin_release(&src_rq->lock.dep_map, _RET_IP_); -+ do_raw_spin_unlock(&src_rq->lock); -+ -+ return 1; -+ } -+ -+ spin_release(&src_rq->lock.dep_map, _RET_IP_); -+ do_raw_spin_unlock(&src_rq->lock); -+ } -+ } while (++affinity_mask < end_mask); -+ -+ return 0; -+} -+#endif -+ -+/* -+ * Timeslices below RESCHED_NS are considered as good as expired as there's no -+ * point rescheduling when there's so little time left. 
-+ */ -+static inline void check_curr(struct task_struct *p, struct rq *rq) -+{ -+ if (unlikely(rq->idle == p)) -+ return; -+ -+ update_curr(rq, p); -+ -+ if (p->time_slice < RESCHED_NS) -+ time_slice_expired(p, rq); -+} -+ -+static inline struct task_struct * -+choose_next_task(struct rq *rq, int cpu, struct task_struct *prev) -+{ -+ struct task_struct *next; -+ -+ if (unlikely(rq->skip)) { -+ next = rq_runnable_task(rq); -+ if (next == rq->idle) { -+#ifdef CONFIG_SMP -+ if (!take_other_rq_tasks(rq, cpu)) { -+#endif -+ rq->skip = NULL; -+ schedstat_inc(rq->sched_goidle); -+ return next; -+#ifdef CONFIG_SMP -+ } -+ next = rq_runnable_task(rq); -+#endif -+ } -+ rq->skip = NULL; -+#ifdef CONFIG_HIGH_RES_TIMERS -+ hrtick_start(rq, next->time_slice); -+#endif -+ return next; -+ } -+ -+ next = sched_rq_first_task(rq); -+ if (next == rq->idle) { -+#ifdef CONFIG_SMP -+ if (!take_other_rq_tasks(rq, cpu)) { -+#endif -+ schedstat_inc(rq->sched_goidle); -+ /*printk(KERN_INFO "sched: choose_next_task(%d) idle %px\n", cpu, next);*/ -+ return next; -+#ifdef CONFIG_SMP -+ } -+ next = sched_rq_first_task(rq); -+#endif -+ } -+#ifdef CONFIG_HIGH_RES_TIMERS -+ hrtick_start(rq, next->time_slice); -+#endif -+ /*printk(KERN_INFO "sched: choose_next_task(%d) next %px\n", cpu, -+ * next);*/ -+ return next; -+} -+ -+/* -+ * schedule() is the main scheduler function. -+ * -+ * The main means of driving the scheduler and thus entering this function are: -+ * -+ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. -+ * -+ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return -+ * paths. For example, see arch/x86/entry_64.S. -+ * -+ * To drive preemption between tasks, the scheduler sets the flag in timer -+ * interrupt handler scheduler_tick(). -+ * -+ * 3. Wakeups don't really cause entry into schedule(). They add a -+ * task to the run-queue and that's it. 
-+ * -+ * Now, if the new task added to the run-queue preempts the current -+ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets -+ * called on the nearest possible occasion: -+ * -+ * - If the kernel is preemptible (CONFIG_PREEMPTION=y): -+ * -+ * - in syscall or exception context, at the next outmost -+ * preempt_enable(). (this might be as soon as the wake_up()'s -+ * spin_unlock()!) -+ * -+ * - in IRQ context, return from interrupt-handler to -+ * preemptible context -+ * -+ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set) -+ * then at the next: -+ * -+ * - cond_resched() call -+ * - explicit schedule() call -+ * - return from syscall or exception to user-space -+ * - return from interrupt-handler to user-space -+ * -+ * WARNING: must be called with preemption disabled! -+ */ -+static void __sched notrace __schedule(bool preempt) -+{ -+ struct task_struct *prev, *next; -+ unsigned long *switch_count; -+ unsigned long prev_state; -+ struct rq *rq; -+ int cpu; -+ -+ cpu = smp_processor_id(); -+ rq = cpu_rq(cpu); -+ prev = rq->curr; -+ -+ schedule_debug(prev, preempt); -+ -+ /* by passing sched_feat(HRTICK) checking which Alt schedule FW doesn't support */ -+ hrtick_clear(rq); -+ -+ local_irq_disable(); -+ rcu_note_context_switch(preempt); -+ -+ /* -+ * Make sure that signal_pending_state()->signal_pending() below -+ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) -+ * done by the caller to avoid the race with signal_wake_up(): -+ * -+ * __set_current_state(@state) signal_wake_up() -+ * schedule() set_tsk_thread_flag(p, TIF_SIGPENDING) -+ * wake_up_state(p, state) -+ * LOCK rq->lock LOCK p->pi_state -+ * smp_mb__after_spinlock() smp_mb__after_spinlock() -+ * if (signal_pending_state()) if (p->state & @state) -+ * -+ * Also, the membarrier system call requires a full memory barrier -+ * after coming from user-space, before storing to rq->curr. 
-+ */ -+ raw_spin_lock(&rq->lock); -+ smp_mb__after_spinlock(); -+ -+ update_rq_clock(rq); -+ -+ switch_count = &prev->nivcsw; -+ /* -+ * We must load prev->state once (task_struct::state is volatile), such -+ * that: -+ * -+ * - we form a control dependency vs deactivate_task() below. -+ * - ptrace_{,un}freeze_traced() can change ->state underneath us. -+ */ -+ prev_state = prev->state; -+ if (!preempt && prev_state && prev_state == prev->state) { -+ if (signal_pending_state(prev_state, prev)) { -+ prev->state = TASK_RUNNING; -+ } else { -+ prev->sched_contributes_to_load = -+ (prev_state & TASK_UNINTERRUPTIBLE) && -+ !(prev_state & TASK_NOLOAD) && -+ !(prev->flags & PF_FROZEN); -+ -+ if (prev->sched_contributes_to_load) -+ rq->nr_uninterruptible++; -+ -+ /* -+ * __schedule() ttwu() -+ * prev_state = prev->state; if (p->on_rq && ...) -+ * if (prev_state) goto out; -+ * p->on_rq = 0; smp_acquire__after_ctrl_dep(); -+ * p->state = TASK_WAKING -+ * -+ * Where __schedule() and ttwu() have matching control dependencies. -+ * -+ * After this, schedule() must not care about p->state any more. -+ */ -+ sched_task_deactivate(prev, rq); -+ deactivate_task(prev, rq); -+ -+ if (prev->in_iowait) { -+ atomic_inc(&rq->nr_iowait); -+ delayacct_blkio_start(); -+ } -+ } -+ switch_count = &prev->nvcsw; -+ } -+ -+ check_curr(prev, rq); -+ -+ next = choose_next_task(rq, cpu, prev); -+ clear_tsk_need_resched(prev); -+ clear_preempt_need_resched(); -+ -+ -+ if (likely(prev != next)) { -+ next->last_ran = rq->clock_task; -+ rq->last_ts_switch = rq->clock; -+ -+ rq->nr_switches++; -+ /* -+ * RCU users of rcu_dereference(rq->curr) may not see -+ * changes to task_struct made by pick_next_task(). -+ */ -+ RCU_INIT_POINTER(rq->curr, next); -+ /* -+ * The membarrier system call requires each architecture -+ * to have a full memory barrier after updating -+ * rq->curr, before returning to user-space. 
-+ * -+ * Here are the schemes providing that barrier on the -+ * various architectures: -+ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. -+ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. -+ * - finish_lock_switch() for weakly-ordered -+ * architectures where spin_unlock is a full barrier, -+ * - switch_to() for arm64 (weakly-ordered, spin_unlock -+ * is a RELEASE barrier), -+ */ -+ ++*switch_count; -+ -+ psi_sched_switch(prev, next, !task_on_rq_queued(prev)); -+ -+ trace_sched_switch(preempt, prev, next); -+ -+ /* Also unlocks the rq: */ -+ rq = context_switch(rq, prev, next); -+ } else -+ raw_spin_unlock_irq(&rq->lock); -+ -+#ifdef CONFIG_SCHED_SMT -+ sg_balance_check(rq); -+#endif -+} -+ -+void __noreturn do_task_dead(void) -+{ -+ /* Causes final put_task_struct in finish_task_switch(): */ -+ set_special_state(TASK_DEAD); -+ -+ /* Tell freezer to ignore us: */ -+ current->flags |= PF_NOFREEZE; -+ -+ __schedule(false); -+ BUG(); -+ -+ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ -+ for (;;) -+ cpu_relax(); -+} -+ -+static inline void sched_submit_work(struct task_struct *tsk) -+{ -+ if (!tsk->state) -+ return; -+ -+ /* -+ * If a worker went to sleep, notify and ask workqueue whether -+ * it wants to wake up a task to maintain concurrency. -+ * As this function is called inside the schedule() context, -+ * we disable preemption to avoid it calling schedule() again -+ * in the possible wakeup of a kworker and because wq_worker_sleeping() -+ * requires it. -+ */ -+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { -+ preempt_disable(); -+ if (tsk->flags & PF_WQ_WORKER) -+ wq_worker_sleeping(tsk); -+ else -+ io_wq_worker_sleeping(tsk); -+ preempt_enable_no_resched(); -+ } -+ -+ if (tsk_is_pi_blocked(tsk)) -+ return; -+ -+ /* -+ * If we are going to sleep and we have plugged IO queued, -+ * make sure to submit it to avoid deadlocks. 
-+ */ -+ if (blk_needs_flush_plug(tsk)) -+ blk_schedule_flush_plug(tsk); -+} -+ -+static void sched_update_worker(struct task_struct *tsk) -+{ -+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { -+ if (tsk->flags & PF_WQ_WORKER) -+ wq_worker_running(tsk); -+ else -+ io_wq_worker_running(tsk); -+ } -+} -+ -+asmlinkage __visible void __sched schedule(void) -+{ -+ struct task_struct *tsk = current; -+ -+ sched_submit_work(tsk); -+ do { -+ preempt_disable(); -+ __schedule(false); -+ sched_preempt_enable_no_resched(); -+ } while (need_resched()); -+ sched_update_worker(tsk); -+} -+EXPORT_SYMBOL(schedule); -+ -+/* -+ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted -+ * state (have scheduled out non-voluntarily) by making sure that all -+ * tasks have either left the run queue or have gone into user space. -+ * As idle tasks do not do either, they must not ever be preempted -+ * (schedule out non-voluntarily). -+ * -+ * schedule_idle() is similar to schedule_preempt_disable() except that it -+ * never enables preemption because it does not call sched_submit_work(). -+ */ -+void __sched schedule_idle(void) -+{ -+ /* -+ * As this skips calling sched_submit_work(), which the idle task does -+ * regardless because that function is a nop when the task is in a -+ * TASK_RUNNING state, make sure this isn't used someplace that the -+ * current task can be in any other state. Note, idle is always in the -+ * TASK_RUNNING state. -+ */ -+ WARN_ON_ONCE(current->state); -+ do { -+ __schedule(false); -+ } while (need_resched()); -+} -+ -+#ifdef CONFIG_CONTEXT_TRACKING -+asmlinkage __visible void __sched schedule_user(void) -+{ -+ /* -+ * If we come here after a random call to set_need_resched(), -+ * or we have been woken up remotely but the IPI has not yet arrived, -+ * we haven't yet exited the RCU idle mode. Do it here manually until -+ * we find a better solution. -+ * -+ * NB: There are buggy callers of this function. 
Ideally we -+ * should warn if prev_state != CONTEXT_USER, but that will trigger -+ * too frequently to make sense yet. -+ */ -+ enum ctx_state prev_state = exception_enter(); -+ schedule(); -+ exception_exit(prev_state); -+} -+#endif -+ -+/** -+ * schedule_preempt_disabled - called with preemption disabled -+ * -+ * Returns with preemption disabled. Note: preempt_count must be 1 -+ */ -+void __sched schedule_preempt_disabled(void) -+{ -+ sched_preempt_enable_no_resched(); -+ schedule(); -+ preempt_disable(); -+} -+ -+static void __sched notrace preempt_schedule_common(void) -+{ -+ do { -+ /* -+ * Because the function tracer can trace preempt_count_sub() -+ * and it also uses preempt_enable/disable_notrace(), if -+ * NEED_RESCHED is set, the preempt_enable_notrace() called -+ * by the function tracer will call this function again and -+ * cause infinite recursion. -+ * -+ * Preemption must be disabled here before the function -+ * tracer can trace. Break up preempt_disable() into two -+ * calls. One to disable preemption without fear of being -+ * traced. The other to still record the preemption latency, -+ * which can also be traced by the function tracer. -+ */ -+ preempt_disable_notrace(); -+ preempt_latency_start(1); -+ __schedule(true); -+ preempt_latency_stop(1); -+ preempt_enable_no_resched_notrace(); -+ -+ /* -+ * Check again in case we missed a preemption opportunity -+ * between schedule and now. -+ */ -+ } while (need_resched()); -+} -+ -+#ifdef CONFIG_PREEMPTION -+/* -+ * This is the entry point to schedule() from in-kernel preemption -+ * off of preempt_enable. -+ */ -+asmlinkage __visible void __sched notrace preempt_schedule(void) -+{ -+ /* -+ * If there is a non-zero preempt_count or interrupts are disabled, -+ * we do not want to preempt the current task. Just return.. 
-+ */ -+ if (likely(!preemptible())) -+ return; -+ -+ preempt_schedule_common(); -+} -+NOKPROBE_SYMBOL(preempt_schedule); -+EXPORT_SYMBOL(preempt_schedule); -+ -+/** -+ * preempt_schedule_notrace - preempt_schedule called by tracing -+ * -+ * The tracing infrastructure uses preempt_enable_notrace to prevent -+ * recursion and tracing preempt enabling caused by the tracing -+ * infrastructure itself. But as tracing can happen in areas coming -+ * from userspace or just about to enter userspace, a preempt enable -+ * can occur before user_exit() is called. This will cause the scheduler -+ * to be called when the system is still in usermode. -+ * -+ * To prevent this, the preempt_enable_notrace will use this function -+ * instead of preempt_schedule() to exit user context if needed before -+ * calling the scheduler. -+ */ -+asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) -+{ -+ enum ctx_state prev_ctx; -+ -+ if (likely(!preemptible())) -+ return; -+ -+ do { -+ /* -+ * Because the function tracer can trace preempt_count_sub() -+ * and it also uses preempt_enable/disable_notrace(), if -+ * NEED_RESCHED is set, the preempt_enable_notrace() called -+ * by the function tracer will call this function again and -+ * cause infinite recursion. -+ * -+ * Preemption must be disabled here before the function -+ * tracer can trace. Break up preempt_disable() into two -+ * calls. One to disable preemption without fear of being -+ * traced. The other to still record the preemption latency, -+ * which can also be traced by the function tracer. -+ */ -+ preempt_disable_notrace(); -+ preempt_latency_start(1); -+ /* -+ * Needs preempt disabled in case user_exit() is traced -+ * and the tracer calls preempt_enable_notrace() causing -+ * an infinite recursion. 
-+ */ -+ prev_ctx = exception_enter(); -+ __schedule(true); -+ exception_exit(prev_ctx); -+ -+ preempt_latency_stop(1); -+ preempt_enable_no_resched_notrace(); -+ } while (need_resched()); -+} -+EXPORT_SYMBOL_GPL(preempt_schedule_notrace); -+ -+#endif /* CONFIG_PREEMPTION */ -+ -+/* -+ * This is the entry point to schedule() from kernel preemption -+ * off of irq context. -+ * Note, that this is called and return with irqs disabled. This will -+ * protect us against recursive calling from irq. -+ */ -+asmlinkage __visible void __sched preempt_schedule_irq(void) -+{ -+ enum ctx_state prev_state; -+ -+ /* Catch callers which need to be fixed */ -+ BUG_ON(preempt_count() || !irqs_disabled()); -+ -+ prev_state = exception_enter(); -+ -+ do { -+ preempt_disable(); -+ local_irq_enable(); -+ __schedule(true); -+ local_irq_disable(); -+ sched_preempt_enable_no_resched(); -+ } while (need_resched()); -+ -+ exception_exit(prev_state); -+} -+ -+int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, -+ void *key) -+{ -+ WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC); -+ return try_to_wake_up(curr->private, mode, wake_flags); -+} -+EXPORT_SYMBOL(default_wake_function); -+ -+static inline void check_task_changed(struct rq *rq, struct task_struct *p) -+{ -+ /* Trigger resched if task sched_prio has been modified. 
*/ -+ if (task_on_rq_queued(p) && sched_task_need_requeue(p, rq)) { -+ requeue_task(p, rq); -+ check_preempt_curr(rq); -+ } -+} -+ -+#ifdef CONFIG_RT_MUTEXES -+ -+static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) -+{ -+ if (pi_task) -+ prio = min(prio, pi_task->prio); -+ -+ return prio; -+} -+ -+static inline int rt_effective_prio(struct task_struct *p, int prio) -+{ -+ struct task_struct *pi_task = rt_mutex_get_top_task(p); -+ -+ return __rt_effective_prio(pi_task, prio); -+} -+ -+/* -+ * rt_mutex_setprio - set the current priority of a task -+ * @p: task to boost -+ * @pi_task: donor task -+ * -+ * This function changes the 'effective' priority of a task. It does -+ * not touch ->normal_prio like __setscheduler(). -+ * -+ * Used by the rt_mutex code to implement priority inheritance -+ * logic. Call site only calls if the priority of the task changed. -+ */ -+void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) -+{ -+ int prio; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ -+ /* XXX used to be waiter->prio, not waiter->task->prio */ -+ prio = __rt_effective_prio(pi_task, p->normal_prio); -+ -+ /* -+ * If nothing changed; bail early. -+ */ -+ if (p->pi_top_task == pi_task && prio == p->prio) -+ return; -+ -+ rq = __task_access_lock(p, &lock); -+ /* -+ * Set under pi_lock && rq->lock, such that the value can be used under -+ * either lock. -+ * -+ * Note that there is loads of tricky to make this pointer cache work -+ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to -+ * ensure a task is de-boosted (pi_task is set to NULL) before the -+ * task is allowed to run again (and can exit). This ensures the pointer -+ * points to a blocked task -- which guaratees the task is present. -+ */ -+ p->pi_top_task = pi_task; -+ -+ /* -+ * For FIFO/RR we only need to set prio, if that matches we're done. -+ */ -+ if (prio == p->prio) -+ goto out_unlock; -+ -+ /* -+ * Idle task boosting is a nono in general. 
There is one -+ * exception, when PREEMPT_RT and NOHZ is active: -+ * -+ * The idle task calls get_next_timer_interrupt() and holds -+ * the timer wheel base->lock on the CPU and another CPU wants -+ * to access the timer (probably to cancel it). We can safely -+ * ignore the boosting request, as the idle CPU runs this code -+ * with interrupts disabled and will complete the lock -+ * protected section without being interrupted. So there is no -+ * real need to boost. -+ */ -+ if (unlikely(p == rq->idle)) { -+ WARN_ON(p != rq->curr); -+ WARN_ON(p->pi_blocked_on); -+ goto out_unlock; -+ } -+ -+ trace_sched_pi_setprio(p, pi_task); -+ p->prio = prio; -+ update_task_priodl(p); -+ -+ check_task_changed(rq, p); -+out_unlock: -+ __task_access_unlock(p, lock); -+} -+#else -+static inline int rt_effective_prio(struct task_struct *p, int prio) -+{ -+ return prio; -+} -+#endif -+ -+void set_user_nice(struct task_struct *p, long nice) -+{ -+ unsigned long flags; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ -+ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) -+ return; -+ /* -+ * We have to be careful, if called from sys_setpriority(), -+ * the task might be in the middle of scheduling on another CPU. 
-+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ rq = __task_access_lock(p, &lock); -+ -+ p->static_prio = NICE_TO_PRIO(nice); -+ /* -+ * The RT priorities are set via sched_setscheduler(), but we still -+ * allow the 'normal' nice value to be set - but as expected -+ * it wont have any effect on scheduling until the task is -+ * not SCHED_NORMAL/SCHED_BATCH: -+ */ -+ if (task_has_rt_policy(p)) -+ goto out_unlock; -+ -+ p->prio = effective_prio(p); -+ update_task_priodl(p); -+ -+ check_task_changed(rq, p); -+out_unlock: -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+} -+EXPORT_SYMBOL(set_user_nice); -+ -+/* -+ * can_nice - check if a task can reduce its nice value -+ * @p: task -+ * @nice: nice value -+ */ -+int can_nice(const struct task_struct *p, const int nice) -+{ -+ /* Convert nice value [19,-20] to rlimit style value [1,40] */ -+ int nice_rlim = nice_to_rlimit(nice); -+ -+ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || -+ capable(CAP_SYS_NICE)); -+} -+ -+#ifdef __ARCH_WANT_SYS_NICE -+ -+/* -+ * sys_nice - change the priority of the current process. -+ * @increment: priority increment -+ * -+ * sys_setpriority is a more generic, but much slower function that -+ * does similar things. -+ */ -+SYSCALL_DEFINE1(nice, int, increment) -+{ -+ long nice, retval; -+ -+ /* -+ * Setpriority might change our priority at the same moment. -+ * We don't have to worry. Conceptually one call occurs first -+ * and we have a single winner. -+ */ -+ -+ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); -+ nice = task_nice(current) + increment; -+ -+ nice = clamp_val(nice, MIN_NICE, MAX_NICE); -+ if (increment < 0 && !can_nice(current, nice)) -+ return -EPERM; -+ -+ retval = security_task_setnice(current, nice); -+ if (retval) -+ return retval; -+ -+ set_user_nice(current, nice); -+ return 0; -+} -+ -+#endif -+ -+/** -+ * idle_cpu - is a given CPU idle currently? -+ * @cpu: the processor in question. 
-+ * -+ * Return: 1 if the CPU is currently idle. 0 otherwise. -+ */ -+int idle_cpu(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ if (rq->curr != rq->idle) -+ return 0; -+ -+ if (rq->nr_running) -+ return 0; -+ -+#ifdef CONFIG_SMP -+ if (rq->ttwu_pending) -+ return 0; -+#endif -+ -+ return 1; -+} -+ -+/** -+ * idle_task - return the idle task for a given CPU. -+ * @cpu: the processor in question. -+ * -+ * Return: The idle task for the cpu @cpu. -+ */ -+struct task_struct *idle_task(int cpu) -+{ -+ return cpu_rq(cpu)->idle; -+} -+ -+/** -+ * find_process_by_pid - find a process with a matching PID value. -+ * @pid: the pid in question. -+ * -+ * The task of @pid, if found. %NULL otherwise. -+ */ -+static inline struct task_struct *find_process_by_pid(pid_t pid) -+{ -+ return pid ? find_task_by_vpid(pid) : current; -+} -+ -+/* -+ * sched_setparam() passes in -1 for its policy, to let the functions -+ * it calls know not to change it. -+ */ -+#define SETPARAM_POLICY -1 -+ -+static void __setscheduler_params(struct task_struct *p, -+ const struct sched_attr *attr) -+{ -+ int policy = attr->sched_policy; -+ -+ if (policy == SETPARAM_POLICY) -+ policy = p->policy; -+ -+ p->policy = policy; -+ -+ /* -+ * allow normal nice value to be set, but will not have any -+ * effect on scheduling until the task not SCHED_NORMAL/ -+ * SCHED_BATCH -+ */ -+ p->static_prio = NICE_TO_PRIO(attr->sched_nice); -+ -+ /* -+ * __sched_setscheduler() ensures attr->sched_priority == 0 when -+ * !rt_policy. Always setting this ensures that things like -+ * getparam()/getattr() don't report silly values for !rt tasks. -+ */ -+ p->rt_priority = attr->sched_priority; -+ p->normal_prio = normal_prio(p); -+} -+ -+/* Actually do priority change: must hold rq lock. 
*/ -+static void __setscheduler(struct rq *rq, struct task_struct *p, -+ const struct sched_attr *attr, bool keep_boost) -+{ -+ __setscheduler_params(p, attr); -+ -+ /* -+ * Keep a potential priority boosting if called from -+ * sched_setscheduler(). -+ */ -+ p->prio = normal_prio(p); -+ if (keep_boost) -+ p->prio = rt_effective_prio(p, p->prio); -+ update_task_priodl(p); -+} -+ -+/* -+ * check the target process has a UID that matches the current process's -+ */ -+static bool check_same_owner(struct task_struct *p) -+{ -+ const struct cred *cred = current_cred(), *pcred; -+ bool match; -+ -+ rcu_read_lock(); -+ pcred = __task_cred(p); -+ match = (uid_eq(cred->euid, pcred->euid) || -+ uid_eq(cred->euid, pcred->uid)); -+ rcu_read_unlock(); -+ return match; -+} -+ -+static int __sched_setscheduler(struct task_struct *p, -+ const struct sched_attr *attr, -+ bool user, bool pi) -+{ -+ const struct sched_attr dl_squash_attr = { -+ .size = sizeof(struct sched_attr), -+ .sched_policy = SCHED_FIFO, -+ .sched_nice = 0, -+ .sched_priority = 99, -+ }; -+ int newprio = MAX_RT_PRIO - 1 - attr->sched_priority; -+ int retval, oldpolicy = -1; -+ int policy = attr->sched_policy; -+ unsigned long flags; -+ struct rq *rq; -+ int reset_on_fork; -+ raw_spinlock_t *lock; -+ -+ /* The pi code expects interrupts enabled */ -+ BUG_ON(pi && in_interrupt()); -+ -+ /* -+ * Alt schedule FW supports SCHED_DEADLINE by squash it as prio 0 SCHED_FIFO -+ */ -+ if (unlikely(SCHED_DEADLINE == policy)) { -+ attr = &dl_squash_attr; -+ policy = attr->sched_policy; -+ newprio = MAX_RT_PRIO - 1 - attr->sched_priority; -+ } -+recheck: -+ /* Double check policy once rq lock held */ -+ if (policy < 0) { -+ reset_on_fork = p->sched_reset_on_fork; -+ policy = oldpolicy = p->policy; -+ } else { -+ reset_on_fork = !!(attr->sched_flags & SCHED_RESET_ON_FORK); -+ -+ if (policy > SCHED_IDLE) -+ return -EINVAL; -+ } -+ -+ if (attr->sched_flags & ~(SCHED_FLAG_ALL)) -+ return -EINVAL; -+ -+ /* -+ * Valid priorities 
for SCHED_FIFO and SCHED_RR are -+ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and -+ * SCHED_BATCH and SCHED_IDLE is 0. -+ */ -+ if (attr->sched_priority < 0 || -+ (p->mm && attr->sched_priority > MAX_USER_RT_PRIO - 1) || -+ (!p->mm && attr->sched_priority > MAX_RT_PRIO - 1)) -+ return -EINVAL; -+ if ((SCHED_RR == policy || SCHED_FIFO == policy) != -+ (attr->sched_priority != 0)) -+ return -EINVAL; -+ -+ /* -+ * Allow unprivileged RT tasks to decrease priority: -+ */ -+ if (user && !capable(CAP_SYS_NICE)) { -+ if (SCHED_FIFO == policy || SCHED_RR == policy) { -+ unsigned long rlim_rtprio = -+ task_rlimit(p, RLIMIT_RTPRIO); -+ -+ /* Can't set/change the rt policy */ -+ if (policy != p->policy && !rlim_rtprio) -+ return -EPERM; -+ -+ /* Can't increase priority */ -+ if (attr->sched_priority > p->rt_priority && -+ attr->sched_priority > rlim_rtprio) -+ return -EPERM; -+ } -+ -+ /* Can't change other user's priorities */ -+ if (!check_same_owner(p)) -+ return -EPERM; -+ -+ /* Normal users shall not reset the sched_reset_on_fork flag */ -+ if (p->sched_reset_on_fork && !reset_on_fork) -+ return -EPERM; -+ } -+ -+ if (user) { -+ retval = security_task_setscheduler(p); -+ if (retval) -+ return retval; -+ } -+ -+ if (pi) -+ cpuset_read_lock(); -+ -+ /* -+ * Make sure no PI-waiters arrive (or leave) while we are -+ * changing the priority of the task: -+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ -+ /* -+ * To be able to change p->policy safely, task_access_lock() -+ * must be called. -+ * IF use task_access_lock() here: -+ * For the task p which is not running, reading rq->stop is -+ * racy but acceptable as ->stop doesn't change much. -+ * An enhancemnet can be made to read rq->stop saftly. 
-+ */ -+ rq = __task_access_lock(p, &lock); -+ -+ /* -+ * Changing the policy of the stop threads its a very bad idea -+ */ -+ if (p == rq->stop) { -+ retval = -EINVAL; -+ goto unlock; -+ } -+ -+ /* -+ * If not changing anything there's no need to proceed further: -+ */ -+ if (unlikely(policy == p->policy)) { -+ if (rt_policy(policy) && attr->sched_priority != p->rt_priority) -+ goto change; -+ if (!rt_policy(policy) && -+ NICE_TO_PRIO(attr->sched_nice) != p->static_prio) -+ goto change; -+ -+ p->sched_reset_on_fork = reset_on_fork; -+ retval = 0; -+ goto unlock; -+ } -+change: -+ -+ /* Re-check policy now with rq lock held */ -+ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { -+ policy = oldpolicy = -1; -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ if (pi) -+ cpuset_read_unlock(); -+ goto recheck; -+ } -+ -+ p->sched_reset_on_fork = reset_on_fork; -+ -+ if (pi) { -+ /* -+ * Take priority boosted tasks into account. If the new -+ * effective priority is unchanged, we just store the new -+ * normal parameters and do not touch the scheduler class and -+ * the runqueue. This will be done when the task deboost -+ * itself. 
-+ */ -+ if (rt_effective_prio(p, newprio) == p->prio) { -+ __setscheduler_params(p, attr); -+ retval = 0; -+ goto unlock; -+ } -+ } -+ -+ __setscheduler(rq, p, attr, pi); -+ -+ check_task_changed(rq, p); -+ -+ /* Avoid rq from going away on us: */ -+ preempt_disable(); -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+ if (pi) { -+ cpuset_read_unlock(); -+ rt_mutex_adjust_pi(p); -+ } -+ -+ preempt_enable(); -+ -+ return 0; -+ -+unlock: -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ if (pi) -+ cpuset_read_unlock(); -+ return retval; -+} -+ -+static int _sched_setscheduler(struct task_struct *p, int policy, -+ const struct sched_param *param, bool check) -+{ -+ struct sched_attr attr = { -+ .sched_policy = policy, -+ .sched_priority = param->sched_priority, -+ .sched_nice = PRIO_TO_NICE(p->static_prio), -+ }; -+ -+ /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ -+ if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { -+ attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; -+ policy &= ~SCHED_RESET_ON_FORK; -+ attr.sched_policy = policy; -+ } -+ -+ return __sched_setscheduler(p, &attr, check, true); -+} -+ -+/** -+ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. -+ * @p: the task in question. -+ * @policy: new policy. -+ * @param: structure containing the new RT priority. -+ * -+ * Use sched_set_fifo(), read its comment. -+ * -+ * Return: 0 on success. An error code otherwise. -+ * -+ * NOTE that the task may be already dead. 
-+ */ -+int sched_setscheduler(struct task_struct *p, int policy, -+ const struct sched_param *param) -+{ -+ return _sched_setscheduler(p, policy, param, true); -+} -+ -+int sched_setattr(struct task_struct *p, const struct sched_attr *attr) -+{ -+ return __sched_setscheduler(p, attr, true, true); -+} -+ -+int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) -+{ -+ return __sched_setscheduler(p, attr, false, true); -+} -+ -+/** -+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. -+ * @p: the task in question. -+ * @policy: new policy. -+ * @param: structure containing the new RT priority. -+ * -+ * Just like sched_setscheduler, only don't bother checking if the -+ * current context has permission. For example, this is needed in -+ * stop_machine(): we create temporary high priority worker threads, -+ * but our caller might not have that capability. -+ * -+ * Return: 0 on success. An error code otherwise. -+ */ -+int sched_setscheduler_nocheck(struct task_struct *p, int policy, -+ const struct sched_param *param) -+{ -+ return _sched_setscheduler(p, policy, param, false); -+} -+ -+/* -+ * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally -+ * incapable of resource management, which is the one thing an OS really should -+ * be doing. -+ * -+ * This is of course the reason it is limited to privileged users only. -+ * -+ * Worse still; it is fundamentally impossible to compose static priority -+ * workloads. You cannot take two correctly working static prio workloads -+ * and smash them together and still expect them to work. -+ * -+ * For this reason 'all' FIFO tasks the kernel creates are basically at: -+ * -+ * MAX_RT_PRIO / 2 -+ * -+ * The administrator _MUST_ configure the system, the kernel simply doesn't -+ * know enough information to make a sensible choice. 
-+ */ -+void sched_set_fifo(struct task_struct *p) -+{ -+ struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 }; -+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); -+} -+EXPORT_SYMBOL_GPL(sched_set_fifo); -+ -+/* -+ * For when you don't much care about FIFO, but want to be above SCHED_NORMAL. -+ */ -+void sched_set_fifo_low(struct task_struct *p) -+{ -+ struct sched_param sp = { .sched_priority = 1 }; -+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); -+} -+EXPORT_SYMBOL_GPL(sched_set_fifo_low); -+ -+void sched_set_normal(struct task_struct *p, int nice) -+{ -+ struct sched_attr attr = { -+ .sched_policy = SCHED_NORMAL, -+ .sched_nice = nice, -+ }; -+ WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0); -+} -+EXPORT_SYMBOL_GPL(sched_set_normal); -+ -+static int -+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) -+{ -+ struct sched_param lparam; -+ struct task_struct *p; -+ int retval; -+ -+ if (!param || pid < 0) -+ return -EINVAL; -+ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) -+ return -EFAULT; -+ -+ rcu_read_lock(); -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (likely(p)) -+ get_task_struct(p); -+ rcu_read_unlock(); -+ -+ if (likely(p)) { -+ retval = sched_setscheduler(p, policy, &lparam); -+ put_task_struct(p); -+ } -+ -+ return retval; -+} -+ -+/* -+ * Mimics kernel/events/core.c perf_copy_attr(). 
-+ */ -+static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr) -+{ -+ u32 size; -+ int ret; -+ -+ /* Zero the full structure, so that a short copy will be nice: */ -+ memset(attr, 0, sizeof(*attr)); -+ -+ ret = get_user(size, &uattr->size); -+ if (ret) -+ return ret; -+ -+ /* ABI compatibility quirk: */ -+ if (!size) -+ size = SCHED_ATTR_SIZE_VER0; -+ -+ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) -+ goto err_size; -+ -+ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); -+ if (ret) { -+ if (ret == -E2BIG) -+ goto err_size; -+ return ret; -+ } -+ -+ /* -+ * XXX: Do we want to be lenient like existing syscalls; or do we want -+ * to be strict and return an error on out-of-bounds values? -+ */ -+ attr->sched_nice = clamp(attr->sched_nice, -20, 19); -+ -+ /* sched/core.c uses zero here but we already know ret is zero */ -+ return 0; -+ -+err_size: -+ put_user(sizeof(*attr), &uattr->size); -+ return -E2BIG; -+} -+ -+/** -+ * sys_sched_setscheduler - set/change the scheduler policy and RT priority -+ * @pid: the pid in question. -+ * @policy: new policy. -+ * -+ * Return: 0 on success. An error code otherwise. -+ * @param: structure containing the new RT priority. -+ */ -+SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) -+{ -+ if (policy < 0) -+ return -EINVAL; -+ -+ return do_sched_setscheduler(pid, policy, param); -+} -+ -+/** -+ * sys_sched_setparam - set/change the RT priority of a thread -+ * @pid: the pid in question. -+ * @param: structure containing the new RT priority. -+ * -+ * Return: 0 on success. An error code otherwise. -+ */ -+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) -+{ -+ return do_sched_setscheduler(pid, SETPARAM_POLICY, param); -+} -+ -+/** -+ * sys_sched_setattr - same as above, but with extended sched_attr -+ * @pid: the pid in question. -+ * @uattr: structure containing the extended parameters. 
-+ */ -+SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, -+ unsigned int, flags) -+{ -+ struct sched_attr attr; -+ struct task_struct *p; -+ int retval; -+ -+ if (!uattr || pid < 0 || flags) -+ return -EINVAL; -+ -+ retval = sched_copy_attr(uattr, &attr); -+ if (retval) -+ return retval; -+ -+ if ((int)attr.sched_policy < 0) -+ return -EINVAL; -+ -+ rcu_read_lock(); -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (p != NULL) -+ retval = sched_setattr(p, &attr); -+ rcu_read_unlock(); -+ -+ return retval; -+} -+ -+/** -+ * sys_sched_getscheduler - get the policy (scheduling class) of a thread -+ * @pid: the pid in question. -+ * -+ * Return: On success, the policy of the thread. Otherwise, a negative error -+ * code. -+ */ -+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) -+{ -+ struct task_struct *p; -+ int retval = -EINVAL; -+ -+ if (pid < 0) -+ goto out_nounlock; -+ -+ retval = -ESRCH; -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ if (p) { -+ retval = security_task_getscheduler(p); -+ if (!retval) -+ retval = p->policy; -+ } -+ rcu_read_unlock(); -+ -+out_nounlock: -+ return retval; -+} -+ -+/** -+ * sys_sched_getscheduler - get the RT priority of a thread -+ * @pid: the pid in question. -+ * @param: structure containing the RT priority. -+ * -+ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error -+ * code. 
-+ */ -+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) -+{ -+ struct sched_param lp = { .sched_priority = 0 }; -+ struct task_struct *p; -+ int retval = -EINVAL; -+ -+ if (!param || pid < 0) -+ goto out_nounlock; -+ -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ retval = -ESRCH; -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ if (task_has_rt_policy(p)) -+ lp.sched_priority = p->rt_priority; -+ rcu_read_unlock(); -+ -+ /* -+ * This one might sleep, we cannot do it with a spinlock held ... -+ */ -+ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; -+ -+out_nounlock: -+ return retval; -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+/* -+ * Copy the kernel size attribute structure (which might be larger -+ * than what user-space knows about) to user-space. -+ * -+ * Note that all cases are valid: user-space buffer can be larger or -+ * smaller than the kernel-space buffer. The usual case is that both -+ * have the same size. -+ */ -+static int -+sched_attr_copy_to_user(struct sched_attr __user *uattr, -+ struct sched_attr *kattr, -+ unsigned int usize) -+{ -+ unsigned int ksize = sizeof(*kattr); -+ -+ if (!access_ok(uattr, usize)) -+ return -EFAULT; -+ -+ /* -+ * sched_getattr() ABI forwards and backwards compatibility: -+ * -+ * If usize == ksize then we just copy everything to user-space and all is good. -+ * -+ * If usize < ksize then we only copy as much as user-space has space for, -+ * this keeps ABI compatibility as well. We skip the rest. -+ * -+ * If usize > ksize then user-space is using a newer version of the ABI, -+ * which part the kernel doesn't know about. Just ignore it - tooling can -+ * detect the kernel's knowledge of attributes from the attr->size value -+ * which is set to ksize in this case. 
-+ */ -+ kattr->size = min(usize, ksize); -+ -+ if (copy_to_user(uattr, kattr, kattr->size)) -+ return -EFAULT; -+ -+ return 0; -+} -+ -+/** -+ * sys_sched_getattr - similar to sched_getparam, but with sched_attr -+ * @pid: the pid in question. -+ * @uattr: structure containing the extended parameters. -+ * @usize: sizeof(attr) for fwd/bwd comp. -+ * @flags: for future extension. -+ */ -+SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, -+ unsigned int, usize, unsigned int, flags) -+{ -+ struct sched_attr kattr = { }; -+ struct task_struct *p; -+ int retval; -+ -+ if (!uattr || pid < 0 || usize > PAGE_SIZE || -+ usize < SCHED_ATTR_SIZE_VER0 || flags) -+ return -EINVAL; -+ -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ retval = -ESRCH; -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ kattr.sched_policy = p->policy; -+ if (p->sched_reset_on_fork) -+ kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; -+ if (task_has_rt_policy(p)) -+ kattr.sched_priority = p->rt_priority; -+ else -+ kattr.sched_nice = task_nice(p); -+ -+#ifdef CONFIG_UCLAMP_TASK -+ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; -+ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; -+#endif -+ -+ rcu_read_unlock(); -+ -+ return sched_attr_copy_to_user(uattr, &kattr, usize); -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) -+{ -+ cpumask_var_t cpus_allowed, new_mask; -+ struct task_struct *p; -+ int retval; -+ -+ get_online_cpus(); -+ rcu_read_lock(); -+ -+ p = find_process_by_pid(pid); -+ if (!p) { -+ rcu_read_unlock(); -+ put_online_cpus(); -+ return -ESRCH; -+ } -+ -+ /* Prevent p going away */ -+ get_task_struct(p); -+ rcu_read_unlock(); -+ -+ if (p->flags & PF_NO_SETAFFINITY) { -+ retval = -EINVAL; -+ goto out_put_task; -+ } -+ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { -+ retval = -ENOMEM; -+ 
goto out_put_task; -+ } -+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { -+ retval = -ENOMEM; -+ goto out_free_cpus_allowed; -+ } -+ retval = -EPERM; -+ if (!check_same_owner(p)) { -+ rcu_read_lock(); -+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { -+ rcu_read_unlock(); -+ goto out_unlock; -+ } -+ rcu_read_unlock(); -+ } -+ -+ retval = security_task_setscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ cpuset_cpus_allowed(p, cpus_allowed); -+ cpumask_and(new_mask, in_mask, cpus_allowed); -+again: -+ retval = __set_cpus_allowed_ptr(p, new_mask, true); -+ -+ if (!retval) { -+ cpuset_cpus_allowed(p, cpus_allowed); -+ if (!cpumask_subset(new_mask, cpus_allowed)) { -+ /* -+ * We must have raced with a concurrent cpuset -+ * update. Just reset the cpus_allowed to the -+ * cpuset's cpus_allowed -+ */ -+ cpumask_copy(new_mask, cpus_allowed); -+ goto again; -+ } -+ } -+out_unlock: -+ free_cpumask_var(new_mask); -+out_free_cpus_allowed: -+ free_cpumask_var(cpus_allowed); -+out_put_task: -+ put_task_struct(p); -+ put_online_cpus(); -+ return retval; -+} -+ -+static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, -+ struct cpumask *new_mask) -+{ -+ if (len < cpumask_size()) -+ cpumask_clear(new_mask); -+ else if (len > cpumask_size()) -+ len = cpumask_size(); -+ -+ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; -+} -+ -+/** -+ * sys_sched_setaffinity - set the CPU affinity of a process -+ * @pid: pid of the process -+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr -+ * @user_mask_ptr: user-space pointer to the new CPU mask -+ * -+ * Return: 0 on success. An error code otherwise. 
-+ */ -+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, -+ unsigned long __user *, user_mask_ptr) -+{ -+ cpumask_var_t new_mask; -+ int retval; -+ -+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) -+ return -ENOMEM; -+ -+ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); -+ if (retval == 0) -+ retval = sched_setaffinity(pid, new_mask); -+ free_cpumask_var(new_mask); -+ return retval; -+} -+ -+long sched_getaffinity(pid_t pid, cpumask_t *mask) -+{ -+ struct task_struct *p; -+ raw_spinlock_t *lock; -+ unsigned long flags; -+ int retval; -+ -+ rcu_read_lock(); -+ -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ task_access_lock_irqsave(p, &lock, &flags); -+ cpumask_and(mask, &p->cpus_mask, cpu_active_mask); -+ task_access_unlock_irqrestore(p, lock, &flags); -+ -+out_unlock: -+ rcu_read_unlock(); -+ -+ return retval; -+} -+ -+/** -+ * sys_sched_getaffinity - get the CPU affinity of a process -+ * @pid: pid of the process -+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr -+ * @user_mask_ptr: user-space pointer to hold the current CPU mask -+ * -+ * Return: size of CPU mask copied to user_mask_ptr on success. An -+ * error code otherwise. 
-+ */ -+SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, -+ unsigned long __user *, user_mask_ptr) -+{ -+ int ret; -+ cpumask_var_t mask; -+ -+ if ((len * BITS_PER_BYTE) < nr_cpu_ids) -+ return -EINVAL; -+ if (len & (sizeof(unsigned long)-1)) -+ return -EINVAL; -+ -+ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) -+ return -ENOMEM; -+ -+ ret = sched_getaffinity(pid, mask); -+ if (ret == 0) { -+ unsigned int retlen = min_t(size_t, len, cpumask_size()); -+ -+ if (copy_to_user(user_mask_ptr, mask, retlen)) -+ ret = -EFAULT; -+ else -+ ret = retlen; -+ } -+ free_cpumask_var(mask); -+ -+ return ret; -+} -+ -+/** -+ * sys_sched_yield - yield the current processor to other threads. -+ * -+ * This function yields the current CPU to other tasks. It does this by -+ * scheduling away the current task. If it still has the earliest deadline -+ * it will be scheduled again as the next task. -+ * -+ * Return: 0. -+ */ -+static void do_sched_yield(void) -+{ -+ struct rq *rq; -+ struct rq_flags rf; -+ -+ if (!sched_yield_type) -+ return; -+ -+ rq = this_rq_lock_irq(&rf); -+ -+ schedstat_inc(rq->yld_count); -+ -+ if (1 == sched_yield_type) { -+ if (!rt_task(current)) -+ do_sched_yield_type_1(current, rq); -+ } else if (2 == sched_yield_type) { -+ if (rq->nr_running > 1) -+ rq->skip = current; -+ } -+ -+ /* -+ * Since we are going to call schedule() anyway, there's -+ * no need to preempt or enable interrupts: -+ */ -+ preempt_disable(); -+ raw_spin_unlock(&rq->lock); -+ sched_preempt_enable_no_resched(); -+ -+ schedule(); -+} -+ -+SYSCALL_DEFINE0(sched_yield) -+{ -+ do_sched_yield(); -+ return 0; -+} -+ -+#ifndef CONFIG_PREEMPTION -+int __sched _cond_resched(void) -+{ -+ if (should_resched(0)) { -+ preempt_schedule_common(); -+ return 1; -+ } -+ rcu_all_qs(); -+ return 0; -+} -+EXPORT_SYMBOL(_cond_resched); -+#endif -+ -+/* -+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, -+ * call schedule, and on return reacquire the lock. 
-+ * -+ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level -+ * operations here to prevent schedule() from being called twice (once via -+ * spin_unlock(), once by hand). -+ */ -+int __cond_resched_lock(spinlock_t *lock) -+{ -+ int resched = should_resched(PREEMPT_LOCK_OFFSET); -+ int ret = 0; -+ -+ lockdep_assert_held(lock); -+ -+ if (spin_needbreak(lock) || resched) { -+ spin_unlock(lock); -+ if (resched) -+ preempt_schedule_common(); -+ else -+ cpu_relax(); -+ ret = 1; -+ spin_lock(lock); -+ } -+ return ret; -+} -+EXPORT_SYMBOL(__cond_resched_lock); -+ -+/** -+ * yield - yield the current processor to other threads. -+ * -+ * Do not ever use this function, there's a 99% chance you're doing it wrong. -+ * -+ * The scheduler is at all times free to pick the calling task as the most -+ * eligible task to run, if removing the yield() call from your code breaks -+ * it, its already broken. -+ * -+ * Typical broken usage is: -+ * -+ * while (!event) -+ * yield(); -+ * -+ * where one assumes that yield() will let 'the other' process run that will -+ * make event true. If the current task is a SCHED_FIFO task that will never -+ * happen. Never use yield() as a progress guarantee!! -+ * -+ * If you want to use yield() to wait for something, use wait_event(). -+ * If you want to use yield() to be 'nice' for others, use cond_resched(). -+ * If you still want to use yield(), do not! -+ */ -+void __sched yield(void) -+{ -+ set_current_state(TASK_RUNNING); -+ do_sched_yield(); -+} -+EXPORT_SYMBOL(yield); -+ -+/** -+ * yield_to - yield the current processor to another thread in -+ * your thread group, or accelerate that thread toward the -+ * processor it's on. -+ * @p: target task -+ * @preempt: whether task preemption is allowed or not -+ * -+ * It's the caller's job to ensure that the target task struct -+ * can't go away on us before we can do any checks. -+ * -+ * In Alt schedule FW, yield_to is not supported. 
-+ * -+ * Return: -+ * true (>0) if we indeed boosted the target task. -+ * false (0) if we failed to boost the target. -+ * -ESRCH if there's no task to yield to. -+ */ -+int __sched yield_to(struct task_struct *p, bool preempt) -+{ -+ return 0; -+} -+EXPORT_SYMBOL_GPL(yield_to); -+ -+int io_schedule_prepare(void) -+{ -+ int old_iowait = current->in_iowait; -+ -+ current->in_iowait = 1; -+ blk_schedule_flush_plug(current); -+ -+ return old_iowait; -+} -+ -+void io_schedule_finish(int token) -+{ -+ current->in_iowait = token; -+} -+ -+/* -+ * This task is about to go to sleep on IO. Increment rq->nr_iowait so -+ * that process accounting knows that this is a task in IO wait state. -+ * -+ * But don't do that if it is a deliberate, throttling IO wait (this task -+ * has set its backing_dev_info: the queue against which it should throttle) -+ */ -+ -+long __sched io_schedule_timeout(long timeout) -+{ -+ int token; -+ long ret; -+ -+ token = io_schedule_prepare(); -+ ret = schedule_timeout(timeout); -+ io_schedule_finish(token); -+ -+ return ret; -+} -+EXPORT_SYMBOL(io_schedule_timeout); -+ -+void __sched io_schedule(void) -+{ -+ int token; -+ -+ token = io_schedule_prepare(); -+ schedule(); -+ io_schedule_finish(token); -+} -+EXPORT_SYMBOL(io_schedule); -+ -+/** -+ * sys_sched_get_priority_max - return maximum RT priority. -+ * @policy: scheduling class. -+ * -+ * Return: On success, this syscall returns the maximum -+ * rt_priority that can be used by a given scheduling class. -+ * On failure, a negative error code is returned. -+ */ -+SYSCALL_DEFINE1(sched_get_priority_max, int, policy) -+{ -+ int ret = -EINVAL; -+ -+ switch (policy) { -+ case SCHED_FIFO: -+ case SCHED_RR: -+ ret = MAX_USER_RT_PRIO-1; -+ break; -+ case SCHED_NORMAL: -+ case SCHED_BATCH: -+ case SCHED_IDLE: -+ ret = 0; -+ break; -+ } -+ return ret; -+} -+ -+/** -+ * sys_sched_get_priority_min - return minimum RT priority. -+ * @policy: scheduling class. 
-+ * -+ * Return: On success, this syscall returns the minimum -+ * rt_priority that can be used by a given scheduling class. -+ * On failure, a negative error code is returned. -+ */ -+SYSCALL_DEFINE1(sched_get_priority_min, int, policy) -+{ -+ int ret = -EINVAL; -+ -+ switch (policy) { -+ case SCHED_FIFO: -+ case SCHED_RR: -+ ret = 1; -+ break; -+ case SCHED_NORMAL: -+ case SCHED_BATCH: -+ case SCHED_IDLE: -+ ret = 0; -+ break; -+ } -+ return ret; -+} -+ -+static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) -+{ -+ struct task_struct *p; -+ int retval; -+ -+ alt_sched_debug(); -+ -+ if (pid < 0) -+ return -EINVAL; -+ -+ retval = -ESRCH; -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ rcu_read_unlock(); -+ -+ *t = ns_to_timespec64(sched_timeslice_ns); -+ return 0; -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+/** -+ * sys_sched_rr_get_interval - return the default timeslice of a process. -+ * @pid: pid of the process. -+ * @interval: userspace pointer to the timeslice value. -+ * -+ * -+ * Return: On success, 0 and the timeslice is in @interval. Otherwise, -+ * an error code. 
-+ */ -+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, -+ struct __kernel_timespec __user *, interval) -+{ -+ struct timespec64 t; -+ int retval = sched_rr_get_interval(pid, &t); -+ -+ if (retval == 0) -+ retval = put_timespec64(&t, interval); -+ -+ return retval; -+} -+ -+#ifdef CONFIG_COMPAT_32BIT_TIME -+SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, -+ struct old_timespec32 __user *, interval) -+{ -+ struct timespec64 t; -+ int retval = sched_rr_get_interval(pid, &t); -+ -+ if (retval == 0) -+ retval = put_old_timespec32(&t, interval); -+ return retval; -+} -+#endif -+ -+void sched_show_task(struct task_struct *p) -+{ -+ unsigned long free = 0; -+ int ppid; -+ -+ if (!try_get_task_stack(p)) -+ return; -+ -+ pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p)); -+ -+ if (p->state == TASK_RUNNING) -+ pr_cont(" running task "); -+#ifdef CONFIG_DEBUG_STACK_USAGE -+ free = stack_not_used(p); -+#endif -+ ppid = 0; -+ rcu_read_lock(); -+ if (pid_alive(p)) -+ ppid = task_pid_nr(rcu_dereference(p->real_parent)); -+ rcu_read_unlock(); -+ pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n", -+ free, task_pid_nr(p), ppid, -+ (unsigned long)task_thread_info(p)->flags); -+ -+ print_worker_info(KERN_INFO, p); -+ show_stack(p, NULL, KERN_INFO); -+ put_task_stack(p); -+} -+EXPORT_SYMBOL_GPL(sched_show_task); -+ -+static inline bool -+state_filter_match(unsigned long state_filter, struct task_struct *p) -+{ -+ /* no filter, everything matches */ -+ if (!state_filter) -+ return true; -+ -+ /* filter, but doesn't match */ -+ if (!(p->state & state_filter)) -+ return false; -+ -+ /* -+ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows -+ * TASK_KILLABLE). 
-+ */ -+ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) -+ return false; -+ -+ return true; -+} -+ -+ -+void show_state_filter(unsigned long state_filter) -+{ -+ struct task_struct *g, *p; -+ -+ rcu_read_lock(); -+ for_each_process_thread(g, p) { -+ /* -+ * reset the NMI-timeout, listing all files on a slow -+ * console might take a lot of time: -+ * Also, reset softlockup watchdogs on all CPUs, because -+ * another CPU might be blocked waiting for us to process -+ * an IPI. -+ */ -+ touch_nmi_watchdog(); -+ touch_all_softlockup_watchdogs(); -+ if (state_filter_match(state_filter, p)) -+ sched_show_task(p); -+ } -+ -+#ifdef CONFIG_SCHED_DEBUG -+ /* TODO: Alt schedule FW should support this -+ if (!state_filter) -+ sysrq_sched_debug_show(); -+ */ -+#endif -+ rcu_read_unlock(); -+ /* -+ * Only show locks if all tasks are dumped: -+ */ -+ if (!state_filter) -+ debug_show_all_locks(); -+} -+ -+void dump_cpu_task(int cpu) -+{ -+ pr_info("Task dump for CPU %d:\n", cpu); -+ sched_show_task(cpu_curr(cpu)); -+} -+ -+/** -+ * init_idle - set up an idle thread for a given CPU -+ * @idle: task in question -+ * @cpu: CPU the idle task belongs to -+ * -+ * NOTE: this function does not set the idle thread's NEED_RESCHED -+ * flag, to make booting more robust. -+ */ -+void init_idle(struct task_struct *idle, int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ __sched_fork(0, idle); -+ -+ raw_spin_lock_irqsave(&idle->pi_lock, flags); -+ raw_spin_lock(&rq->lock); -+ update_rq_clock(rq); -+ -+ idle->last_ran = rq->clock_task; -+ idle->state = TASK_RUNNING; -+ idle->flags |= PF_IDLE; -+ sched_queue_init_idle(rq, idle); -+ -+ scs_task_reset(idle); -+ kasan_unpoison_task_stack(idle); -+ -+#ifdef CONFIG_SMP -+ /* -+ * It's possible that init_idle() gets called multiple times on a task, -+ * in that case do_set_cpus_allowed() will not do the right thing. -+ * -+ * And since this is boot we can forgo the serialisation. 
-+ */ -+ set_cpus_allowed_common(idle, cpumask_of(cpu)); -+#endif -+ -+ /* Silence PROVE_RCU */ -+ rcu_read_lock(); -+ __set_task_cpu(idle, cpu); -+ rcu_read_unlock(); -+ -+ rq->idle = idle; -+ rcu_assign_pointer(rq->curr, idle); -+ idle->on_cpu = 1; -+ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&idle->pi_lock, flags); -+ -+ /* Set the preempt count _outside_ the spinlocks! */ -+ init_idle_preempt_count(idle, cpu); -+ -+ ftrace_graph_init_idle_task(idle, cpu); -+ vtime_init_idle(idle, cpu); -+#ifdef CONFIG_SMP -+ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); -+#endif -+} -+ -+#ifdef CONFIG_SMP -+ -+int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, -+ const struct cpumask __maybe_unused *trial) -+{ -+ return 1; -+} -+ -+int task_can_attach(struct task_struct *p, -+ const struct cpumask *cs_cpus_allowed) -+{ -+ int ret = 0; -+ -+ /* -+ * Kthreads which disallow setaffinity shouldn't be moved -+ * to a new cpuset; we don't want to change their CPU -+ * affinity and isolating such threads by their set of -+ * allowed nodes is unnecessary. Thus, cpusets are not -+ * applicable for such threads. This prevents checking for -+ * success of set_cpus_allowed_ptr() on all attached tasks -+ * before cpus_mask may be changed. -+ */ -+ if (p->flags & PF_NO_SETAFFINITY) -+ ret = -EINVAL; -+ -+ return ret; -+} -+ -+bool sched_smp_initialized __read_mostly; -+ -+#ifdef CONFIG_HOTPLUG_CPU -+/* -+ * Ensures that the idle task is using init_mm right before its CPU goes -+ * offline. -+ */ -+void idle_task_exit(void) -+{ -+ struct mm_struct *mm = current->active_mm; -+ -+ BUG_ON(current != this_rq()->idle); -+ -+ if (mm != &init_mm) { -+ switch_mm(mm, &init_mm, current); -+ finish_arch_post_lock_switch(); -+ } -+ -+ /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ -+} -+ -+/* -+ * Migrate all tasks from the rq, sleeping tasks will be migrated by -+ * try_to_wake_up()->select_task_rq(). 
-+ * -+ * Called with rq->lock held even though we'er in stop_machine() and -+ * there's no concurrency possible, we hold the required locks anyway -+ * because of lock validation efforts. -+ */ -+static void migrate_tasks(struct rq *dead_rq) -+{ -+ struct rq *rq = dead_rq; -+ struct task_struct *p, *stop = rq->stop; -+ int count = 0; -+ -+ /* -+ * Fudge the rq selection such that the below task selection loop -+ * doesn't get stuck on the currently eligible stop task. -+ * -+ * We're currently inside stop_machine() and the rq is either stuck -+ * in the stop_machine_cpu_stop() loop, or we're executing this code, -+ * either way we should never end up calling schedule() until we're -+ * done here. -+ */ -+ rq->stop = NULL; -+ -+ p = sched_rq_first_task(rq); -+ while (p != rq->idle) { -+ int dest_cpu; -+ -+ /* skip the running task */ -+ if (task_running(p) || 1 == p->nr_cpus_allowed) { -+ p = sched_rq_next_task(p, rq); -+ continue; -+ } -+ -+ /* -+ * Rules for changing task_struct::cpus_allowed are holding -+ * both pi_lock and rq->lock, such that holding either -+ * stabilizes the mask. -+ * -+ * Drop rq->lock is not quite as disastrous as it usually is -+ * because !cpu_active at this point, which means load-balance -+ * will not interfere. Also, stop-machine. -+ */ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_lock(&p->pi_lock); -+ raw_spin_lock(&rq->lock); -+ -+ /* -+ * Since we're inside stop-machine, _nothing_ should have -+ * changed the task, WARN if weird stuff happened, because in -+ * that case the above rq->lock drop is a fail too. -+ */ -+ if (WARN_ON(task_rq(p) != rq || !task_on_rq_queued(p))) { -+ raw_spin_unlock(&p->pi_lock); -+ p = sched_rq_next_task(p, rq); -+ continue; -+ } -+ -+ count++; -+ /* Find suitable destination for @next, with force if needed. 
*/ -+ dest_cpu = select_fallback_rq(dead_rq->cpu, p); -+ rq = __migrate_task(rq, p, dest_cpu); -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock(&p->pi_lock); -+ -+ rq = dead_rq; -+ raw_spin_lock(&rq->lock); -+ /* Check queued task all over from the header again */ -+ p = sched_rq_first_task(rq); -+ } -+ -+ rq->stop = stop; -+} -+ -+static void set_rq_offline(struct rq *rq) -+{ -+ if (rq->online) -+ rq->online = false; -+} -+#endif /* CONFIG_HOTPLUG_CPU */ -+ -+static void set_rq_online(struct rq *rq) -+{ -+ if (!rq->online) -+ rq->online = true; -+} -+ -+/* -+ * used to mark begin/end of suspend/resume: -+ */ -+static int num_cpus_frozen; -+ -+/* -+ * Update cpusets according to cpu_active mask. If cpusets are -+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper -+ * around partition_sched_domains(). -+ * -+ * If we come here as part of a suspend/resume, don't touch cpusets because we -+ * want to restore it back to its original state upon resume anyway. -+ */ -+static void cpuset_cpu_active(void) -+{ -+ if (cpuhp_tasks_frozen) { -+ /* -+ * num_cpus_frozen tracks how many CPUs are involved in suspend -+ * resume sequence. As long as this is not the last online -+ * operation in the resume sequence, just build a single sched -+ * domain, ignoring cpusets. -+ */ -+ partition_sched_domains(1, NULL, NULL); -+ if (--num_cpus_frozen) -+ return; -+ /* -+ * This is the last CPU online operation. So fall through and -+ * restore the original sched domains by considering the -+ * cpuset configurations. 
-+ */ -+ cpuset_force_rebuild(); -+ } -+ -+ cpuset_update_active_cpus(); -+} -+ -+static int cpuset_cpu_inactive(unsigned int cpu) -+{ -+ if (!cpuhp_tasks_frozen) { -+ cpuset_update_active_cpus(); -+ } else { -+ num_cpus_frozen++; -+ partition_sched_domains(1, NULL, NULL); -+ } -+ return 0; -+} -+ -+int sched_cpu_activate(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+#ifdef CONFIG_SCHED_SMT -+ /* -+ * When going up, increment the number of cores with SMT present. -+ */ -+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) -+ static_branch_inc_cpuslocked(&sched_smt_present); -+#endif -+ set_cpu_active(cpu, true); -+ -+ if (sched_smp_initialized) -+ cpuset_cpu_active(); -+ -+ /* -+ * Put the rq online, if not already. This happens: -+ * -+ * 1) In the early boot process, because we build the real domains -+ * after all cpus have been brought up. -+ * -+ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the -+ * domains. -+ */ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ set_rq_online(rq); -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+ return 0; -+} -+ -+int sched_cpu_deactivate(unsigned int cpu) -+{ -+ int ret; -+ -+ set_cpu_active(cpu, false); -+ /* -+ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU -+ * users of this state to go away such that all new such users will -+ * observe it. -+ * -+ * Do sync before park smpboot threads to take care the rcu boost case. -+ */ -+ synchronize_rcu(); -+ -+#ifdef CONFIG_SCHED_SMT -+ /* -+ * When going down, decrement the number of cores with SMT present. 
-+ */ -+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) { -+ static_branch_dec_cpuslocked(&sched_smt_present); -+ if (!static_branch_likely(&sched_smt_present)) -+ cpumask_clear(&sched_sg_idle_mask); -+ } -+#endif -+ -+ if (!sched_smp_initialized) -+ return 0; -+ -+ ret = cpuset_cpu_inactive(cpu); -+ if (ret) { -+ set_cpu_active(cpu, true); -+ return ret; -+ } -+ return 0; -+} -+ -+static void sched_rq_cpu_starting(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ rq->calc_load_update = calc_load_update; -+} -+ -+int sched_cpu_starting(unsigned int cpu) -+{ -+ sched_rq_cpu_starting(cpu); -+ sched_tick_start(cpu); -+ return 0; -+} -+ -+#ifdef CONFIG_HOTPLUG_CPU -+int sched_cpu_dying(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ /* Handle pending wakeups and then migrate everything off */ -+ sched_tick_stop(cpu); -+ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ set_rq_offline(rq); -+ migrate_tasks(rq); -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+ hrtick_clear(rq); -+ return 0; -+} -+#endif -+ -+#ifdef CONFIG_SMP -+static void sched_init_topology_cpumask_early(void) -+{ -+ int cpu, level; -+ cpumask_t *tmp; -+ -+ for_each_possible_cpu(cpu) { -+ for (level = 0; level < NR_CPU_AFFINITY_CHK_LEVEL; level++) { -+ tmp = &(per_cpu(sched_cpu_affinity_masks, cpu)[level]); -+ cpumask_copy(tmp, cpu_possible_mask); -+ cpumask_clear_cpu(cpu, tmp); -+ } -+ per_cpu(sched_cpu_llc_mask, cpu) = -+ &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); -+ per_cpu(sched_cpu_affinity_end_mask, cpu) = -+ &(per_cpu(sched_cpu_affinity_masks, cpu)[1]); -+ /*per_cpu(sd_llc_id, cpu) = cpu;*/ -+ } -+} -+ -+#define TOPOLOGY_CPUMASK(name, mask, last) \ -+ if (cpumask_and(chk, chk, mask)) \ -+ printk(KERN_INFO "sched: cpu#%02d affinity mask: 0x%08lx - "#name,\ -+ cpu, (chk++)->bits[0]); \ -+ if (!last) \ -+ cpumask_complement(chk, mask) -+ -+static void sched_init_topology_cpumask(void) -+{ -+ int cpu; -+ cpumask_t *chk; -+ -+ for_each_online_cpu(cpu) 
{ -+ /* take chance to reset time slice for idle tasks */ -+ cpu_rq(cpu)->idle->time_slice = sched_timeslice_ns; -+ -+ chk = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); -+ -+ cpumask_complement(chk, cpumask_of(cpu)); -+#ifdef CONFIG_SCHED_SMT -+ TOPOLOGY_CPUMASK(smt, topology_sibling_cpumask(cpu), false); -+#endif -+ per_cpu(sd_llc_id, cpu) = cpumask_first(cpu_coregroup_mask(cpu)); -+ per_cpu(sched_cpu_llc_mask, cpu) = chk; -+ TOPOLOGY_CPUMASK(coregroup, cpu_coregroup_mask(cpu), false); -+ -+ TOPOLOGY_CPUMASK(core, topology_core_cpumask(cpu), false); -+ -+ TOPOLOGY_CPUMASK(others, cpu_online_mask, true); -+ -+ per_cpu(sched_cpu_affinity_end_mask, cpu) = chk; -+ printk(KERN_INFO "sched: cpu#%02d llc_id = %d, llc_mask idx = %d\n", -+ cpu, per_cpu(sd_llc_id, cpu), -+ (int) (per_cpu(sched_cpu_llc_mask, cpu) - -+ &(per_cpu(sched_cpu_affinity_masks, cpu)[0]))); -+ } -+} -+#endif -+ -+void __init sched_init_smp(void) -+{ -+ /* Move init over to a non-isolated CPU */ -+ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) -+ BUG(); -+ -+ sched_init_topology_cpumask(); -+ -+ sched_smp_initialized = true; -+} -+#else -+void __init sched_init_smp(void) -+{ -+ cpu_rq(0)->idle->time_slice = sched_timeslice_ns; -+} -+#endif /* CONFIG_SMP */ -+ -+int in_sched_functions(unsigned long addr) -+{ -+ return in_lock_functions(addr) || -+ (addr >= (unsigned long)__sched_text_start -+ && addr < (unsigned long)__sched_text_end); -+} -+ -+#ifdef CONFIG_CGROUP_SCHED -+/* task group related information */ -+struct task_group { -+ struct cgroup_subsys_state css; -+ -+ struct rcu_head rcu; -+ struct list_head list; -+ -+ struct task_group *parent; -+ struct list_head siblings; -+ struct list_head children; -+}; -+ -+/* -+ * Default task group. -+ * Every task in system belongs to this group at bootup. 
-+ */ -+struct task_group root_task_group; -+LIST_HEAD(task_groups); -+ -+/* Cacheline aligned slab cache for task_group */ -+static struct kmem_cache *task_group_cache __read_mostly; -+#endif /* CONFIG_CGROUP_SCHED */ -+ -+void __init sched_init(void) -+{ -+ int i; -+ struct rq *rq; -+ -+ printk(KERN_INFO ALT_SCHED_VERSION_MSG); -+ -+ wait_bit_init(); -+ -+#ifdef CONFIG_SMP -+ for (i = 0; i < SCHED_BITS; i++) -+ cpumask_copy(&sched_rq_watermark[i], cpu_present_mask); -+#endif -+ -+#ifdef CONFIG_CGROUP_SCHED -+ task_group_cache = KMEM_CACHE(task_group, 0); -+ -+ list_add(&root_task_group.list, &task_groups); -+ INIT_LIST_HEAD(&root_task_group.children); -+ INIT_LIST_HEAD(&root_task_group.siblings); -+#endif /* CONFIG_CGROUP_SCHED */ -+ for_each_possible_cpu(i) { -+ rq = cpu_rq(i); -+ -+ sched_queue_init(rq); -+ rq->watermark = IDLE_WM; -+ rq->skip = NULL; -+ -+ raw_spin_lock_init(&rq->lock); -+ rq->nr_running = rq->nr_uninterruptible = 0; -+ rq->calc_load_active = 0; -+ rq->calc_load_update = jiffies + LOAD_FREQ; -+#ifdef CONFIG_SMP -+ rq->online = false; -+ rq->cpu = i; -+ -+#ifdef CONFIG_SCHED_SMT -+ rq->active_balance = 0; -+#endif -+ -+#ifdef CONFIG_NO_HZ_COMMON -+ rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func); -+#endif -+#endif /* CONFIG_SMP */ -+ rq->nr_switches = 0; -+ -+ hrtick_rq_init(rq); -+ atomic_set(&rq->nr_iowait, 0); -+ } -+#ifdef CONFIG_SMP -+ /* Set rq->online for cpu 0 */ -+ cpu_rq(0)->online = true; -+#endif -+ /* -+ * The boot idle thread does lazy MMU switching as well: -+ */ -+ mmgrab(&init_mm); -+ enter_lazy_tlb(&init_mm, current); -+ -+ /* -+ * Make us the idle thread. Technically, schedule() should not be -+ * called from this thread, however somewhere below it might be, -+ * but because we are the idle thread, we just pick up running again -+ * when this runqueue becomes "idle". 
-+ */ -+ init_idle(current, smp_processor_id()); -+ -+ calc_load_update = jiffies + LOAD_FREQ; -+ -+#ifdef CONFIG_SMP -+ idle_thread_set_boot_cpu(); -+ -+ sched_init_topology_cpumask_early(); -+#endif /* SMP */ -+ -+ init_schedstats(); -+ -+ psi_init(); -+} -+ -+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP -+static inline int preempt_count_equals(int preempt_offset) -+{ -+ int nested = preempt_count() + rcu_preempt_depth(); -+ -+ return (nested == preempt_offset); -+} -+ -+void __might_sleep(const char *file, int line, int preempt_offset) -+{ -+ /* -+ * Blocking primitives will set (and therefore destroy) current->state, -+ * since we will exit with TASK_RUNNING make sure we enter with it, -+ * otherwise we will destroy state. -+ */ -+ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, -+ "do not call blocking ops when !TASK_RUNNING; " -+ "state=%lx set at [<%p>] %pS\n", -+ current->state, -+ (void *)current->task_state_change, -+ (void *)current->task_state_change); -+ -+ ___might_sleep(file, line, preempt_offset); -+} -+EXPORT_SYMBOL(__might_sleep); -+ -+void ___might_sleep(const char *file, int line, int preempt_offset) -+{ -+ /* Ratelimiting timestamp: */ -+ static unsigned long prev_jiffy; -+ -+ unsigned long preempt_disable_ip; -+ -+ /* WARN_ON_ONCE() by default, no rate limit required: */ -+ rcu_sleep_check(); -+ -+ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && -+ !is_idle_task(current) && !current->non_block_count) || -+ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || -+ oops_in_progress) -+ return; -+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) -+ return; -+ prev_jiffy = jiffies; -+ -+ /* Save this before calling printk(), since that will clobber it: */ -+ preempt_disable_ip = get_preempt_disable_ip(current); -+ -+ printk(KERN_ERR -+ "BUG: sleeping function called from invalid context at %s:%d\n", -+ file, line); -+ printk(KERN_ERR -+ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, 
name: %s\n", -+ in_atomic(), irqs_disabled(), current->non_block_count, -+ current->pid, current->comm); -+ -+ if (task_stack_end_corrupted(current)) -+ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); -+ -+ debug_show_held_locks(current); -+ if (irqs_disabled()) -+ print_irqtrace_events(current); -+#ifdef CONFIG_DEBUG_PREEMPT -+ if (!preempt_count_equals(preempt_offset)) { -+ pr_err("Preemption disabled at:"); -+ print_ip_sym(KERN_ERR, preempt_disable_ip); -+ } -+#endif -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+EXPORT_SYMBOL(___might_sleep); -+ -+void __cant_sleep(const char *file, int line, int preempt_offset) -+{ -+ static unsigned long prev_jiffy; -+ -+ if (irqs_disabled()) -+ return; -+ -+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) -+ return; -+ -+ if (preempt_count() > preempt_offset) -+ return; -+ -+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) -+ return; -+ prev_jiffy = jiffies; -+ -+ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); -+ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", -+ in_atomic(), irqs_disabled(), -+ current->pid, current->comm); -+ -+ debug_show_held_locks(current); -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+EXPORT_SYMBOL_GPL(__cant_sleep); -+#endif -+ -+#ifdef CONFIG_MAGIC_SYSRQ -+void normalize_rt_tasks(void) -+{ -+ struct task_struct *g, *p; -+ struct sched_attr attr = { -+ .sched_policy = SCHED_NORMAL, -+ }; -+ -+ read_lock(&tasklist_lock); -+ for_each_process_thread(g, p) { -+ /* -+ * Only normalize user tasks: -+ */ -+ if (p->flags & PF_KTHREAD) -+ continue; -+ -+ if (!rt_task(p)) { -+ /* -+ * Renice negative nice level userspace -+ * tasks back to 0: -+ */ -+ if (task_nice(p) < 0) -+ set_user_nice(p, 0); -+ continue; -+ } -+ -+ __sched_setscheduler(p, &attr, false, false); -+ } -+ read_unlock(&tasklist_lock); -+} -+#endif /* CONFIG_MAGIC_SYSRQ */ -+ -+#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) -+/* 
-+ * These functions are only useful for the IA64 MCA handling, or kdb. -+ * -+ * They can only be called when the whole system has been -+ * stopped - every CPU needs to be quiescent, and no scheduling -+ * activity can take place. Using them for anything else would -+ * be a serious bug, and as a result, they aren't even visible -+ * under any other configuration. -+ */ -+ -+/** -+ * curr_task - return the current task for a given CPU. -+ * @cpu: the processor in question. -+ * -+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! -+ * -+ * Return: The current task for @cpu. -+ */ -+struct task_struct *curr_task(int cpu) -+{ -+ return cpu_curr(cpu); -+} -+ -+#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ -+ -+#ifdef CONFIG_IA64 -+/** -+ * ia64_set_curr_task - set the current task for a given CPU. -+ * @cpu: the processor in question. -+ * @p: the task pointer to set. -+ * -+ * Description: This function must only be used when non-maskable interrupts -+ * are serviced on a separate stack. It allows the architecture to switch the -+ * notion of the current task on a CPU in a non-blocking manner. This function -+ * must be called with all CPU's synchronised, and interrupts disabled, the -+ * and caller must save the original value of the current task (see -+ * curr_task() above) and restore that value before reenabling interrupts and -+ * re-starting the system. -+ * -+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 
-+ */ -+void ia64_set_curr_task(int cpu, struct task_struct *p) -+{ -+ cpu_curr(cpu) = p; -+} -+ -+#endif -+ -+#ifdef CONFIG_CGROUP_SCHED -+static void sched_free_group(struct task_group *tg) -+{ -+ kmem_cache_free(task_group_cache, tg); -+} -+ -+/* allocate runqueue etc for a new task group */ -+struct task_group *sched_create_group(struct task_group *parent) -+{ -+ struct task_group *tg; -+ -+ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); -+ if (!tg) -+ return ERR_PTR(-ENOMEM); -+ -+ return tg; -+} -+ -+void sched_online_group(struct task_group *tg, struct task_group *parent) -+{ -+} -+ -+/* rcu callback to free various structures associated with a task group */ -+static void sched_free_group_rcu(struct rcu_head *rhp) -+{ -+ /* Now it should be safe to free those cfs_rqs */ -+ sched_free_group(container_of(rhp, struct task_group, rcu)); -+} -+ -+void sched_destroy_group(struct task_group *tg) -+{ -+ /* Wait for possible concurrent references to cfs_rqs complete */ -+ call_rcu(&tg->rcu, sched_free_group_rcu); -+} -+ -+void sched_offline_group(struct task_group *tg) -+{ -+} -+ -+static inline struct task_group *css_tg(struct cgroup_subsys_state *css) -+{ -+ return css ? 
container_of(css, struct task_group, css) : NULL; -+} -+ -+static struct cgroup_subsys_state * -+cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) -+{ -+ struct task_group *parent = css_tg(parent_css); -+ struct task_group *tg; -+ -+ if (!parent) { -+ /* This is early initialization for the top cgroup */ -+ return &root_task_group.css; -+ } -+ -+ tg = sched_create_group(parent); -+ if (IS_ERR(tg)) -+ return ERR_PTR(-ENOMEM); -+ return &tg->css; -+} -+ -+/* Expose task group only after completing cgroup initialization */ -+static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ struct task_group *parent = css_tg(css->parent); -+ -+ if (parent) -+ sched_online_group(tg, parent); -+ return 0; -+} -+ -+static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ -+ sched_offline_group(tg); -+} -+ -+static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ -+ /* -+ * Relies on the RCU grace period between css_released() and this. 
-+ */ -+ sched_free_group(tg); -+} -+ -+static void cpu_cgroup_fork(struct task_struct *task) -+{ -+} -+ -+static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) -+{ -+ return 0; -+} -+ -+static void cpu_cgroup_attach(struct cgroup_taskset *tset) -+{ -+} -+ -+static struct cftype cpu_legacy_files[] = { -+ { } /* Terminate */ -+}; -+ -+ -+static struct cftype cpu_files[] = { -+ { } /* terminate */ -+}; -+ -+static int cpu_extra_stat_show(struct seq_file *sf, -+ struct cgroup_subsys_state *css) -+{ -+ return 0; -+} -+ -+struct cgroup_subsys cpu_cgrp_subsys = { -+ .css_alloc = cpu_cgroup_css_alloc, -+ .css_online = cpu_cgroup_css_online, -+ .css_released = cpu_cgroup_css_released, -+ .css_free = cpu_cgroup_css_free, -+ .css_extra_stat_show = cpu_extra_stat_show, -+ .fork = cpu_cgroup_fork, -+ .can_attach = cpu_cgroup_can_attach, -+ .attach = cpu_cgroup_attach, -+ .legacy_cftypes = cpu_files, -+ .legacy_cftypes = cpu_legacy_files, -+ .dfl_cftypes = cpu_files, -+ .early_init = true, -+ .threaded = true, -+}; -+#endif /* CONFIG_CGROUP_SCHED */ -+ -+#undef CREATE_TRACE_POINTS -diff --git a/kernel/sched/alt_debug.c b/kernel/sched/alt_debug.c -new file mode 100644 -index 000000000000..1212a031700e ---- /dev/null -+++ b/kernel/sched/alt_debug.c -@@ -0,0 +1,31 @@ -+/* -+ * kernel/sched/alt_debug.c -+ * -+ * Print the alt scheduler debugging details -+ * -+ * Author: Alfred Chen -+ * Date : 2020 -+ */ -+#include "sched.h" -+ -+/* -+ * This allows printing both to /proc/sched_debug and -+ * to the console -+ */ -+#define SEQ_printf(m, x...) 
\ -+ do { \ -+ if (m) \ -+ seq_printf(m, x); \ -+ else \ -+ pr_cont(x); \ -+ } while (0) -+ -+void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, -+ struct seq_file *m) -+{ -+ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), -+ get_nr_threads(p)); -+} -+ -+void proc_sched_set_task(struct task_struct *p) -+{} -diff --git a/kernel/sched/alt_sched.h b/kernel/sched/alt_sched.h -new file mode 100644 -index 000000000000..99be2c51c88d ---- /dev/null -+++ b/kernel/sched/alt_sched.h -@@ -0,0 +1,555 @@ -+#ifndef ALT_SCHED_H -+#define ALT_SCHED_H -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#ifdef CONFIG_PARAVIRT -+# include -+#endif -+ -+#include "cpupri.h" -+ -+#ifdef CONFIG_SCHED_BMQ -+#include "bmq.h" -+#endif -+#ifdef CONFIG_SCHED_PDS -+#include "pds.h" -+#endif -+ -+/* task_struct::on_rq states: */ -+#define TASK_ON_RQ_QUEUED 1 -+#define TASK_ON_RQ_MIGRATING 2 -+ -+static inline int task_on_rq_queued(struct task_struct *p) -+{ -+ return p->on_rq == TASK_ON_RQ_QUEUED; -+} -+ -+static inline int task_on_rq_migrating(struct task_struct *p) -+{ -+ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; -+} -+ -+/* -+ * wake flags -+ */ -+#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ -+#define WF_FORK 0x02 /* child wakeup after fork */ -+#define WF_MIGRATED 0x04 /* internal use, task got migrated */ -+#define WF_ON_CPU 0x08 /* Wakee is on_rq */ -+ -+/* -+ * This is the main, per-CPU runqueue data structure. -+ * This data should only be modified by the local cpu. 
-+ */ -+struct rq { -+ /* runqueue lock: */ -+ raw_spinlock_t lock; -+ -+ struct task_struct __rcu *curr; -+ struct task_struct *idle, *stop, *skip; -+ struct mm_struct *prev_mm; -+ -+#ifdef CONFIG_SCHED_BMQ -+ struct bmq queue; -+#endif -+#ifdef CONFIG_SCHED_PDS -+ struct skiplist_node sl_header; -+#endif -+ unsigned long watermark; -+ -+ /* switch count */ -+ u64 nr_switches; -+ -+ atomic_t nr_iowait; -+ -+#ifdef CONFIG_MEMBARRIER -+ int membarrier_state; -+#endif -+ -+#ifdef CONFIG_SMP -+ int cpu; /* cpu of this runqueue */ -+ bool online; -+ -+ unsigned int ttwu_pending; -+ unsigned char nohz_idle_balance; -+ unsigned char idle_balance; -+ -+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ -+ struct sched_avg avg_irq; -+#endif -+ -+#ifdef CONFIG_SCHED_SMT -+ int active_balance; -+ struct cpu_stop_work active_balance_work; -+#endif -+#endif /* CONFIG_SMP */ -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+ u64 prev_irq_time; -+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ -+#ifdef CONFIG_PARAVIRT -+ u64 prev_steal_time; -+#endif /* CONFIG_PARAVIRT */ -+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING -+ u64 prev_steal_time_rq; -+#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ -+ -+ /* calc_load related fields */ -+ unsigned long calc_load_update; -+ long calc_load_active; -+ -+ u64 clock, last_tick; -+ u64 last_ts_switch; -+ u64 clock_task; -+ -+ unsigned long nr_running; -+ unsigned long nr_uninterruptible; -+ -+#ifdef CONFIG_SCHED_HRTICK -+#ifdef CONFIG_SMP -+ call_single_data_t hrtick_csd; -+#endif -+ struct hrtimer hrtick_timer; -+#endif -+ -+#ifdef CONFIG_SCHEDSTATS -+ -+ /* latency stats */ -+ struct sched_info rq_sched_info; -+ unsigned long long rq_cpu_time; -+ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? 
*/ -+ -+ /* sys_sched_yield() stats */ -+ unsigned int yld_count; -+ -+ /* schedule() stats */ -+ unsigned int sched_switch; -+ unsigned int sched_count; -+ unsigned int sched_goidle; -+ -+ /* try_to_wake_up() stats */ -+ unsigned int ttwu_count; -+ unsigned int ttwu_local; -+#endif /* CONFIG_SCHEDSTATS */ -+ -+#ifdef CONFIG_CPU_IDLE -+ /* Must be inspected within a rcu lock section */ -+ struct cpuidle_state *idle_state; -+#endif -+ -+#ifdef CONFIG_NO_HZ_COMMON -+#ifdef CONFIG_SMP -+ call_single_data_t nohz_csd; -+#endif -+ atomic_t nohz_flags; -+#endif /* CONFIG_NO_HZ_COMMON */ -+}; -+ -+extern unsigned long calc_load_update; -+extern atomic_long_t calc_load_tasks; -+ -+extern void calc_global_load_tick(struct rq *this_rq); -+extern long calc_load_fold_active(struct rq *this_rq, long adjust); -+ -+DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -+#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) -+#define this_rq() this_cpu_ptr(&runqueues) -+#define task_rq(p) cpu_rq(task_cpu(p)) -+#define cpu_curr(cpu) (cpu_rq(cpu)->curr) -+#define raw_rq() raw_cpu_ptr(&runqueues) -+ -+#ifdef CONFIG_SMP -+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) -+void register_sched_domain_sysctl(void); -+void unregister_sched_domain_sysctl(void); -+#else -+static inline void register_sched_domain_sysctl(void) -+{ -+} -+static inline void unregister_sched_domain_sysctl(void) -+{ -+} -+#endif -+ -+extern bool sched_smp_initialized; -+ -+enum { -+ BASE_CPU_AFFINITY_CHK_LEVEL = 1, -+#ifdef CONFIG_SCHED_SMT -+ SMT_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, -+#endif -+#ifdef CONFIG_SCHED_MC -+ MC_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, -+#endif -+ NR_CPU_AFFINITY_CHK_LEVEL -+}; -+ -+DECLARE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_CHK_LEVEL], sched_cpu_affinity_masks); -+ -+static inline int __best_mask_cpu(int cpu, const cpumask_t *cpumask, -+ const cpumask_t *mask) -+{ -+ while ((cpu = cpumask_any_and(cpumask, mask)) >= nr_cpu_ids) -+ mask++; -+ return cpu; -+} -+ -+static inline int 
best_mask_cpu(int cpu, const cpumask_t *cpumask) -+{ -+ return cpumask_test_cpu(cpu, cpumask)? cpu : -+ __best_mask_cpu(cpu, cpumask, &(per_cpu(sched_cpu_affinity_masks, cpu)[0])); -+} -+ -+extern void flush_smp_call_function_from_idle(void); -+ -+#else /* !CONFIG_SMP */ -+static inline void flush_smp_call_function_from_idle(void) { } -+#endif -+ -+#ifndef arch_scale_freq_tick -+static __always_inline -+void arch_scale_freq_tick(void) -+{ -+} -+#endif -+ -+#ifndef arch_scale_freq_capacity -+static __always_inline -+unsigned long arch_scale_freq_capacity(int cpu) -+{ -+ return SCHED_CAPACITY_SCALE; -+} -+#endif -+ -+static inline u64 __rq_clock_broken(struct rq *rq) -+{ -+ return READ_ONCE(rq->clock); -+} -+ -+static inline u64 rq_clock(struct rq *rq) -+{ -+ /* -+ * Relax lockdep_assert_held() checking as in VRQ, call to -+ * sched_info_xxxx() may not held rq->lock -+ * lockdep_assert_held(&rq->lock); -+ */ -+ return rq->clock; -+} -+ -+static inline u64 rq_clock_task(struct rq *rq) -+{ -+ /* -+ * Relax lockdep_assert_held() checking as in VRQ, call to -+ * sched_info_xxxx() may not held rq->lock -+ * lockdep_assert_held(&rq->lock); -+ */ -+ return rq->clock_task; -+} -+ -+/* -+ * {de,en}queue flags: -+ * -+ * DEQUEUE_SLEEP - task is no longer runnable -+ * ENQUEUE_WAKEUP - task just became runnable -+ * -+ */ -+ -+#define DEQUEUE_SLEEP 0x01 -+ -+#define ENQUEUE_WAKEUP 0x01 -+ -+ -+/* -+ * Below are scheduler API which using in other kernel code -+ * It use the dummy rq_flags -+ * ToDo : BMQ need to support these APIs for compatibility with mainline -+ * scheduler code. 
-+ */ -+struct rq_flags { -+ unsigned long flags; -+}; -+ -+struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(rq->lock); -+ -+struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(p->pi_lock) -+ __acquires(rq->lock); -+ -+static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock(&rq->lock); -+} -+ -+static inline void -+task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) -+ __releases(rq->lock) -+ __releases(p->pi_lock) -+{ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); -+} -+ -+static inline void -+rq_unlock_irq(struct rq *rq, struct rq_flags *rf) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock_irq(&rq->lock); -+} -+ -+static inline struct rq * -+this_rq_lock_irq(struct rq_flags *rf) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ local_irq_disable(); -+ rq = this_rq(); -+ raw_spin_lock(&rq->lock); -+ -+ return rq; -+} -+ -+static inline int task_current(struct rq *rq, struct task_struct *p) -+{ -+ return rq->curr == p; -+} -+ -+static inline bool task_running(struct task_struct *p) -+{ -+ return p->on_cpu; -+} -+ -+extern struct static_key_false sched_schedstats; -+ -+#ifdef CONFIG_CPU_IDLE -+static inline void idle_set_state(struct rq *rq, -+ struct cpuidle_state *idle_state) -+{ -+ rq->idle_state = idle_state; -+} -+ -+static inline struct cpuidle_state *idle_get_state(struct rq *rq) -+{ -+ WARN_ON(!rcu_read_lock_held()); -+ return rq->idle_state; -+} -+#else -+static inline void idle_set_state(struct rq *rq, -+ struct cpuidle_state *idle_state) -+{ -+} -+ -+static inline struct cpuidle_state *idle_get_state(struct rq *rq) -+{ -+ return NULL; -+} -+#endif -+ -+static inline int cpu_of(const struct rq *rq) -+{ -+#ifdef CONFIG_SMP -+ return rq->cpu; -+#else -+ return 0; -+#endif -+} -+ -+#include "stats.h" -+ -+#ifdef CONFIG_NO_HZ_COMMON -+#define NOHZ_BALANCE_KICK_BIT 0 
-+#define NOHZ_STATS_KICK_BIT 1 -+ -+#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT) -+#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT) -+ -+#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK) -+ -+#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) -+ -+/* TODO: needed? -+extern void nohz_balance_exit_idle(struct rq *rq); -+#else -+static inline void nohz_balance_exit_idle(struct rq *rq) { } -+*/ -+#endif -+ -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+struct irqtime { -+ u64 total; -+ u64 tick_delta; -+ u64 irq_start_time; -+ struct u64_stats_sync sync; -+}; -+ -+DECLARE_PER_CPU(struct irqtime, cpu_irqtime); -+ -+/* -+ * Returns the irqtime minus the softirq time computed by ksoftirqd. -+ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime -+ * and never move forward. -+ */ -+static inline u64 irq_time_read(int cpu) -+{ -+ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); -+ unsigned int seq; -+ u64 total; -+ -+ do { -+ seq = __u64_stats_fetch_begin(&irqtime->sync); -+ total = irqtime->total; -+ } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); -+ -+ return total; -+} -+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ -+ -+#ifdef CONFIG_CPU_FREQ -+DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); -+ -+/** -+ * cpufreq_update_util - Take a note about CPU utilization changes. -+ * @rq: Runqueue to carry out the update for. -+ * @flags: Update reason flags. -+ * -+ * This function is called by the scheduler on the CPU whose utilization is -+ * being updated. -+ * -+ * It can only be called from RCU-sched read-side critical sections. -+ * -+ * The way cpufreq is currently arranged requires it to evaluate the CPU -+ * performance state (frequency/voltage) on a regular basis to prevent it from -+ * being stuck in a completely inadequate performance level for too long. 
-+ * That is not guaranteed to happen if the updates are only triggered from CFS -+ * and DL, though, because they may not be coming in if only RT tasks are -+ * active all the time (or there are RT tasks only). -+ * -+ * As a workaround for that issue, this function is called periodically by the -+ * RT sched class to trigger extra cpufreq updates to prevent it from stalling, -+ * but that really is a band-aid. Going forward it should be replaced with -+ * solutions targeted more specifically at RT tasks. -+ */ -+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) -+{ -+ struct update_util_data *data; -+ -+ data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); -+ if (data) -+ data->func(data, rq_clock(rq), flags); -+} -+#else -+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} -+#endif /* CONFIG_CPU_FREQ */ -+ -+#ifdef CONFIG_NO_HZ_FULL -+extern int __init sched_tick_offload_init(void); -+#else -+static inline int sched_tick_offload_init(void) { return 0; } -+#endif -+ -+#ifdef arch_scale_freq_capacity -+#ifndef arch_scale_freq_invariant -+#define arch_scale_freq_invariant() (true) -+#endif -+#else /* arch_scale_freq_capacity */ -+#define arch_scale_freq_invariant() (false) -+#endif -+ -+extern void schedule_idle(void); -+ -+/* -+ * !! For sched_setattr_nocheck() (kernel) only !! -+ * -+ * This is actually gross. :( -+ * -+ * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE -+ * tasks, but still be able to sleep. We need this on platforms that cannot -+ * atomically change clock frequency. Remove once fast switching will be -+ * available on such platforms. -+ * -+ * SUGOV stands for SchedUtil GOVernor. 
-+ */ -+#define SCHED_FLAG_SUGOV 0x10000000 -+ -+#ifdef CONFIG_MEMBARRIER -+/* -+ * The scheduler provides memory barriers required by membarrier between: -+ * - prior user-space memory accesses and store to rq->membarrier_state, -+ * - store to rq->membarrier_state and following user-space memory accesses. -+ * In the same way it provides those guarantees around store to rq->curr. -+ */ -+static inline void membarrier_switch_mm(struct rq *rq, -+ struct mm_struct *prev_mm, -+ struct mm_struct *next_mm) -+{ -+ int membarrier_state; -+ -+ if (prev_mm == next_mm) -+ return; -+ -+ membarrier_state = atomic_read(&next_mm->membarrier_state); -+ if (READ_ONCE(rq->membarrier_state) == membarrier_state) -+ return; -+ -+ WRITE_ONCE(rq->membarrier_state, membarrier_state); -+} -+#else -+static inline void membarrier_switch_mm(struct rq *rq, -+ struct mm_struct *prev_mm, -+ struct mm_struct *next_mm) -+{ -+} -+#endif -+ -+#ifdef CONFIG_NUMA -+extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); -+#else -+static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) -+{ -+ return nr_cpu_ids; -+} -+#endif -+ -+void swake_up_all_locked(struct swait_queue_head *q); -+void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); -+ -+#endif /* ALT_SCHED_H */ -diff --git a/kernel/sched/bmq.h b/kernel/sched/bmq.h -new file mode 100644 -index 000000000000..aff0bb30a884 ---- /dev/null -+++ b/kernel/sched/bmq.h -@@ -0,0 +1,20 @@ -+#ifndef BMQ_H -+#define BMQ_H -+ -+/* bits: -+ * RT(0-99), (Low prio adj range, nice width, high prio adj range) / 2, cpu idle task */ -+#define SCHED_BITS (MAX_RT_PRIO + NICE_WIDTH / 2 + MAX_PRIORITY_ADJ + 1) -+#define IDLE_TASK_SCHED_PRIO (SCHED_BITS - 1) -+ -+struct bmq { -+ DECLARE_BITMAP(bitmap, SCHED_BITS); -+ struct list_head heads[SCHED_BITS]; -+}; -+ -+ -+static inline int task_running_nice(struct task_struct *p) -+{ -+ return (p->prio + p->boost_prio > DEFAULT_PRIO + MAX_PRIORITY_ADJ); -+} -+ 
-+#endif -diff --git a/kernel/sched/bmq_imp.h b/kernel/sched/bmq_imp.h -new file mode 100644 -index 000000000000..ad9a7c448da7 ---- /dev/null -+++ b/kernel/sched/bmq_imp.h -@@ -0,0 +1,185 @@ -+#define ALT_SCHED_VERSION_MSG "sched/bmq: BMQ CPU Scheduler "ALT_SCHED_VERSION" by Alfred Chen.\n" -+ -+/* -+ * BMQ only routines -+ */ -+#define rq_switch_time(rq) ((rq)->clock - (rq)->last_ts_switch) -+#define boost_threshold(p) (sched_timeslice_ns >>\ -+ (15 - MAX_PRIORITY_ADJ - (p)->boost_prio)) -+ -+static inline void boost_task(struct task_struct *p) -+{ -+ int limit; -+ -+ switch (p->policy) { -+ case SCHED_NORMAL: -+ limit = -MAX_PRIORITY_ADJ; -+ break; -+ case SCHED_BATCH: -+ case SCHED_IDLE: -+ limit = 0; -+ break; -+ default: -+ return; -+ } -+ -+ if (p->boost_prio > limit) -+ p->boost_prio--; -+} -+ -+static inline void deboost_task(struct task_struct *p) -+{ -+ if (p->boost_prio < MAX_PRIORITY_ADJ) -+ p->boost_prio++; -+} -+ -+/* -+ * Common interfaces -+ */ -+static inline int task_sched_prio(struct task_struct *p, struct rq *rq) -+{ -+ return (p->prio < MAX_RT_PRIO)? 
p->prio : MAX_RT_PRIO / 2 + (p->prio + p->boost_prio) / 2; -+} -+ -+static inline void requeue_task(struct task_struct *p, struct rq *rq); -+ -+static inline void time_slice_expired(struct task_struct *p, struct rq *rq) -+{ -+ p->time_slice = sched_timeslice_ns; -+ -+ if (SCHED_FIFO != p->policy && task_on_rq_queued(p)) { -+ if (SCHED_RR != p->policy) -+ deboost_task(p); -+ requeue_task(p, rq); -+ } -+} -+ -+static inline void update_task_priodl(struct task_struct *p) {} -+ -+static inline unsigned long sched_queue_watermark(struct rq *rq) -+{ -+ return find_first_bit(rq->queue.bitmap, SCHED_BITS); -+} -+ -+static inline void sched_queue_init(struct rq *rq) -+{ -+ struct bmq *q = &rq->queue; -+ int i; -+ -+ bitmap_zero(q->bitmap, SCHED_BITS); -+ for(i = 0; i < SCHED_BITS; i++) -+ INIT_LIST_HEAD(&q->heads[i]); -+} -+ -+static inline void sched_queue_init_idle(struct rq *rq, struct task_struct *idle) -+{ -+ struct bmq *q = &rq->queue; -+ -+ idle->bmq_idx = IDLE_TASK_SCHED_PRIO; -+ INIT_LIST_HEAD(&q->heads[idle->bmq_idx]); -+ list_add(&idle->bmq_node, &q->heads[idle->bmq_idx]); -+ set_bit(idle->bmq_idx, q->bitmap); -+} -+ -+/* -+ * This routine used in bmq scheduler only which assume the idle task in the bmq -+ */ -+static inline struct task_struct *sched_rq_first_task(struct rq *rq) -+{ -+ unsigned long idx = find_first_bit(rq->queue.bitmap, SCHED_BITS); -+ const struct list_head *head = &rq->queue.heads[idx]; -+ -+ return list_first_entry(head, struct task_struct, bmq_node); -+} -+ -+static inline struct task_struct * -+sched_rq_next_task(struct task_struct *p, struct rq *rq) -+{ -+ unsigned long idx = p->bmq_idx; -+ struct list_head *head = &rq->queue.heads[idx]; -+ -+ if (list_is_last(&p->bmq_node, head)) { -+ idx = find_next_bit(rq->queue.bitmap, SCHED_BITS, idx + 1); -+ head = &rq->queue.heads[idx]; -+ -+ return list_first_entry(head, struct task_struct, bmq_node); -+ } -+ -+ return list_next_entry(p, bmq_node); -+} -+ -+#define __SCHED_DEQUEUE_TASK(p, rq, 
flags, func) \ -+ psi_dequeue(p, flags & DEQUEUE_SLEEP); \ -+ sched_info_dequeued(rq, p); \ -+ \ -+ list_del(&p->bmq_node); \ -+ if (list_empty(&rq->queue.heads[p->bmq_idx])) { \ -+ clear_bit(p->bmq_idx, rq->queue.bitmap);\ -+ func; \ -+ } -+ -+#define __SCHED_ENQUEUE_TASK(p, rq, flags) \ -+ sched_info_queued(rq, p); \ -+ psi_enqueue(p, flags); \ -+ \ -+ p->bmq_idx = task_sched_prio(p, rq); \ -+ list_add_tail(&p->bmq_node, &rq->queue.heads[p->bmq_idx]); \ -+ set_bit(p->bmq_idx, rq->queue.bitmap) -+ -+#define __SCHED_REQUEUE_TASK(p, rq, func) \ -+{ \ -+ int idx = task_sched_prio(p, rq); \ -+\ -+ list_del(&p->bmq_node); \ -+ list_add_tail(&p->bmq_node, &rq->queue.heads[idx]); \ -+ if (idx != p->bmq_idx) { \ -+ if (list_empty(&rq->queue.heads[p->bmq_idx])) \ -+ clear_bit(p->bmq_idx, rq->queue.bitmap); \ -+ p->bmq_idx = idx; \ -+ set_bit(p->bmq_idx, rq->queue.bitmap); \ -+ func; \ -+ } \ -+} -+ -+static inline bool sched_task_need_requeue(struct task_struct *p, struct rq *rq) -+{ -+ return (task_sched_prio(p, rq) != p->bmq_idx); -+} -+ -+static void sched_task_fork(struct task_struct *p, struct rq *rq) -+{ -+ p->boost_prio = (p->boost_prio < 0) ? -+ p->boost_prio + MAX_PRIORITY_ADJ : MAX_PRIORITY_ADJ; -+} -+ -+/** -+ * task_prio - return the priority value of a given task. -+ * @p: the task in question. -+ * -+ * Return: The priority value as seen by users in /proc. -+ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes -+ * from 0(SCHED_ISO) up to 82 (nice +19 SCHED_IDLE). 
-+ */ -+int task_prio(const struct task_struct *p) -+{ -+ if (p->prio < MAX_RT_PRIO) -+ return (p->prio - MAX_RT_PRIO); -+ return (p->prio - MAX_RT_PRIO + p->boost_prio); -+} -+ -+static void do_sched_yield_type_1(struct task_struct *p, struct rq *rq) -+{ -+ p->boost_prio = MAX_PRIORITY_ADJ; -+} -+ -+static void sched_task_ttwu(struct task_struct *p) -+{ -+ if(this_rq()->clock_task - p->last_ran > sched_timeslice_ns) -+ boost_task(p); -+} -+ -+static void sched_task_deactivate(struct task_struct *p, struct rq *rq) -+{ -+ if (rq_switch_time(rq) < boost_threshold(p)) -+ boost_task(p); -+} -diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c -index e39008242cf4..5963716fe391 100644 ---- a/kernel/sched/cpufreq_schedutil.c -+++ b/kernel/sched/cpufreq_schedutil.c -@@ -183,6 +183,7 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, - return cpufreq_driver_resolve_freq(policy, freq); - } - -+#ifndef CONFIG_SCHED_ALT - /* - * This function computes an effective utilization for the given CPU, to be - * used for frequency selection given the linear relation: f = u * f_max. -@@ -300,6 +301,13 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) - - return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL); - } -+#else /* CONFIG_SCHED_ALT */ -+static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) -+{ -+ sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu); -+ return sg_cpu->max; -+} -+#endif - - /** - * sugov_iowait_reset() - Reset the IO boost status of a CPU. 
-@@ -443,7 +451,9 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } - */ - static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) - { -+#ifndef CONFIG_SCHED_ALT - if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) -+#endif - sg_policy->limits_changed = true; - } - -@@ -686,6 +696,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) - } - - ret = sched_setattr_nocheck(thread, &attr); -+ - if (ret) { - kthread_stop(thread); - pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__); -@@ -912,6 +923,7 @@ struct cpufreq_governor *cpufreq_default_governor(void) - cpufreq_governor_init(schedutil_gov); - - #ifdef CONFIG_ENERGY_MODEL -+#ifndef CONFIG_SCHED_ALT - extern bool sched_energy_update; - extern struct mutex sched_energy_mutex; - -@@ -942,4 +954,10 @@ void sched_cpufreq_governor_change(struct cpufreq_policy *policy, - } - - } -+#else /* CONFIG_SCHED_ALT */ -+void sched_cpufreq_governor_change(struct cpufreq_policy *policy, -+ struct cpufreq_governor *old_gov) -+{ -+} -+#endif - #endif -diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c -index 5a55d2300452..66a0ab7165f0 100644 ---- a/kernel/sched/cputime.c -+++ b/kernel/sched/cputime.c -@@ -122,7 +122,7 @@ void account_user_time(struct task_struct *p, u64 cputime) - p->utime += cputime; - account_group_user_time(p, cputime); - -- index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; -+ index = task_running_nice(p) ? CPUTIME_NICE : CPUTIME_USER; - - /* Add user time to cpustat. */ - task_group_account_field(p, index, cputime); -@@ -146,7 +146,7 @@ void account_guest_time(struct task_struct *p, u64 cputime) - p->gtime += cputime; - - /* Add guest time to cpustat. 
*/ -- if (task_nice(p) > 0) { -+ if (task_running_nice(p)) { - cpustat[CPUTIME_NICE] += cputime; - cpustat[CPUTIME_GUEST_NICE] += cputime; - } else { -@@ -269,7 +269,7 @@ static inline u64 account_other_time(u64 max) - #ifdef CONFIG_64BIT - static inline u64 read_sum_exec_runtime(struct task_struct *t) - { -- return t->se.sum_exec_runtime; -+ return tsk_seruntime(t); - } - #else - static u64 read_sum_exec_runtime(struct task_struct *t) -@@ -279,7 +279,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t) - struct rq *rq; - - rq = task_rq_lock(t, &rf); -- ns = t->se.sum_exec_runtime; -+ ns = tsk_seruntime(t); - task_rq_unlock(rq, t, &rf); - - return ns; -@@ -614,7 +614,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, - void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) - { - struct task_cputime cputime = { -- .sum_exec_runtime = p->se.sum_exec_runtime, -+ .sum_exec_runtime = tsk_seruntime(p), - }; - - task_cputime(p, &cputime.utime, &cputime.stime); -diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c -index f324dc36fc43..a6b566bda65b 100644 ---- a/kernel/sched/idle.c -+++ b/kernel/sched/idle.c -@@ -369,6 +369,7 @@ void cpu_startup_entry(enum cpuhp_state state) - do_idle(); - } - -+#ifndef CONFIG_SCHED_ALT - /* - * idle-task scheduling class. 
- */ -@@ -482,3 +483,4 @@ const struct sched_class idle_sched_class - .switched_to = switched_to_idle, - .update_curr = update_curr_idle, - }; -+#endif -diff --git a/kernel/sched/pds.h b/kernel/sched/pds.h -new file mode 100644 -index 000000000000..7fdeace7e8a5 ---- /dev/null -+++ b/kernel/sched/pds.h -@@ -0,0 +1,14 @@ -+#ifndef PDS_H -+#define PDS_H -+ -+/* bits: -+ * RT(0-99), (Low prio adj range, nice width, high prio adj range) / 2, cpu idle task */ -+#define SCHED_BITS (MAX_RT_PRIO + 20 + 1) -+#define IDLE_TASK_SCHED_PRIO (SCHED_BITS - 1) -+ -+static inline int task_running_nice(struct task_struct *p) -+{ -+ return (p->prio > DEFAULT_PRIO); -+} -+ -+#endif -diff --git a/kernel/sched/pds_imp.h b/kernel/sched/pds_imp.h -new file mode 100644 -index 000000000000..6baee5e961b9 ---- /dev/null -+++ b/kernel/sched/pds_imp.h -@@ -0,0 +1,257 @@ -+#define ALT_SCHED_VERSION_MSG "sched/pds: PDS CPU Scheduler "ALT_SCHED_VERSION" by Alfred Chen.\n" -+ -+static const u64 user_prio2deadline[NICE_WIDTH] = { -+/* -20 */ 4194304, 4613734, 5075107, 5582617, 6140878, -+/* -15 */ 6754965, 7430461, 8173507, 8990857, 9889942, -+/* -10 */ 10878936, 11966829, 13163511, 14479862, 15927848, -+/* -5 */ 17520632, 19272695, 21199964, 23319960, 25651956, -+/* 0 */ 28217151, 31038866, 34142752, 37557027, 41312729, -+/* 5 */ 45444001, 49988401, 54987241, 60485965, 66534561, -+/* 10 */ 73188017, 80506818, 88557499, 97413248, 107154572, -+/* 15 */ 117870029, 129657031, 142622734, 156885007, 172573507 -+}; -+ -+static const unsigned char dl_level_map[] = { -+/* 0 4 8 12 */ -+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, -+/* 16 20 24 28 */ -+ 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, -+/* 32 36 40 44 */ -+ 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, -+/* 48 52 56 60 */ -+ 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, -+/* 64 68 72 76 */ -+ 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 7, 6, 5, 4, 3, 2, -+/* 80 84 88 92 */ -+ 1, 
0 -+}; -+ -+static inline int -+task_sched_prio(const struct task_struct *p, const struct rq *rq) -+{ -+ size_t delta; -+ -+ if (p == rq->idle) -+ return IDLE_TASK_SCHED_PRIO; -+ -+ if (p->prio < MAX_RT_PRIO) -+ return p->prio; -+ -+ delta = (rq->clock + user_prio2deadline[39] - p->deadline) >> 21; -+ delta = min((size_t)delta, ARRAY_SIZE(dl_level_map) - 1); -+ -+ return MAX_RT_PRIO + dl_level_map[delta]; -+} -+ -+static inline void update_task_priodl(struct task_struct *p) -+{ -+ p->priodl = (((u64) (p->prio))<<56) | ((p->deadline)>>8); -+} -+ -+static inline void requeue_task(struct task_struct *p, struct rq *rq); -+ -+static inline void time_slice_expired(struct task_struct *p, struct rq *rq) -+{ -+ /*printk(KERN_INFO "sched: time_slice_expired(%d) - %px\n", cpu_of(rq), p);*/ -+ p->time_slice = sched_timeslice_ns; -+ -+ if (p->prio >= MAX_RT_PRIO) -+ p->deadline = rq->clock + user_prio2deadline[TASK_USER_PRIO(p)]; -+ update_task_priodl(p); -+ -+ if (SCHED_FIFO != p->policy && task_on_rq_queued(p)) -+ requeue_task(p, rq); -+} -+ -+/* -+ * pds_skiplist_task_search -- search function used in PDS run queue skip list -+ * node insert operation. -+ * @it: iterator pointer to the node in the skip list -+ * @node: pointer to the skiplist_node to be inserted -+ * -+ * Returns true if key of @it is less or equal to key value of @node, otherwise -+ * false. 
-+ */ -+static inline bool -+pds_skiplist_task_search(struct skiplist_node *it, struct skiplist_node *node) -+{ -+ return (skiplist_entry(it, struct task_struct, sl_node)->priodl <= -+ skiplist_entry(node, struct task_struct, sl_node)->priodl); -+} -+ -+/* -+ * Define the skip list insert function for PDS -+ */ -+DEFINE_SKIPLIST_INSERT_FUNC(pds_skiplist_insert, pds_skiplist_task_search); -+ -+/* -+ * Init the queue structure in rq -+ */ -+static inline void sched_queue_init(struct rq *rq) -+{ -+ FULL_INIT_SKIPLIST_NODE(&rq->sl_header); -+} -+ -+/* -+ * Init idle task and put into queue structure of rq -+ * IMPORTANT: may be called multiple times for a single cpu -+ */ -+static inline void sched_queue_init_idle(struct rq *rq, struct task_struct *idle) -+{ -+ /*printk(KERN_INFO "sched: init(%d) - %px\n", cpu_of(rq), idle);*/ -+ int default_prio = idle->prio; -+ -+ idle->prio = MAX_PRIO; -+ idle->deadline = 0ULL; -+ update_task_priodl(idle); -+ -+ FULL_INIT_SKIPLIST_NODE(&rq->sl_header); -+ -+ idle->sl_node.level = idle->sl_level; -+ pds_skiplist_insert(&rq->sl_header, &idle->sl_node); -+ -+ idle->prio = default_prio; -+} -+ -+/* -+ * This routine assume that the idle task always in queue -+ */ -+static inline struct task_struct *sched_rq_first_task(struct rq *rq) -+{ -+ struct skiplist_node *node = rq->sl_header.next[0]; -+ -+ BUG_ON(node == &rq->sl_header); -+ return skiplist_entry(node, struct task_struct, sl_node); -+} -+ -+static inline struct task_struct * -+sched_rq_next_task(struct task_struct *p, struct rq *rq) -+{ -+ struct skiplist_node *next = p->sl_node.next[0]; -+ -+ BUG_ON(next == &rq->sl_header); -+ return skiplist_entry(next, struct task_struct, sl_node); -+} -+ -+static inline unsigned long sched_queue_watermark(struct rq *rq) -+{ -+ return task_sched_prio(sched_rq_first_task(rq), rq); -+} -+ -+#define __SCHED_DEQUEUE_TASK(p, rq, flags, func) \ -+ psi_dequeue(p, flags & DEQUEUE_SLEEP); \ -+ sched_info_dequeued(rq, p); \ -+ \ -+ if 
(skiplist_del_init(&rq->sl_header, &p->sl_node)) { \ -+ func; \ -+ } -+ -+#define __SCHED_ENQUEUE_TASK(p, rq, flags) \ -+ sched_info_queued(rq, p); \ -+ psi_enqueue(p, flags); \ -+ \ -+ p->sl_node.level = p->sl_level; \ -+ pds_skiplist_insert(&rq->sl_header, &p->sl_node) -+ -+/* -+ * Requeue a task @p to @rq -+ */ -+#define __SCHED_REQUEUE_TASK(p, rq, func) \ -+{\ -+ bool b_first = skiplist_del_init(&rq->sl_header, &p->sl_node); \ -+\ -+ p->sl_node.level = p->sl_level; \ -+ if (pds_skiplist_insert(&rq->sl_header, &p->sl_node) || b_first) { \ -+ func; \ -+ } \ -+} -+ -+static inline bool sched_task_need_requeue(struct task_struct *p, struct rq *rq) -+{ -+ struct skiplist_node *node = p->sl_node.prev[0]; -+ -+ if (node != &rq->sl_header) { -+ struct task_struct *t = skiplist_entry(node, struct task_struct, sl_node); -+ -+ if (t->priodl > p->priodl) -+ return true; -+ } -+ -+ node = p->sl_node.next[0]; -+ if (node != &rq->sl_header) { -+ struct task_struct *t = skiplist_entry(node, struct task_struct, sl_node); -+ -+ if (t->priodl < p->priodl) -+ return true; -+ } -+ -+ return false; -+} -+ -+/* -+ * pds_skiplist_random_level -- Returns a pseudo-random level number for skip -+ * list node which is used in PDS run queue. -+ * -+ * In current implementation, based on testing, the first 8 bits in microseconds -+ * of niffies are suitable for random level population. -+ * find_first_bit() is used to satisfy p = 0.5 between each levels, and there -+ * should be platform hardware supported instruction(known as ctz/clz) to speed -+ * up this function. -+ * The skiplist level for a task is populated when task is created and doesn't -+ * change in task's life time. When task is being inserted into run queue, this -+ * skiplist level is set to task's sl_node->level, the skiplist insert function -+ * may change it based on current level of the skip lsit. 
-+ */ -+static inline int pds_skiplist_random_level(const struct task_struct *p) -+{ -+ long unsigned int randseed; -+ -+ /* -+ * 1. Some architectures don't have better than microsecond resolution -+ * so mask out ~microseconds as a factor of the random seed for skiplist -+ * insertion. -+ * 2. Use address of task structure pointer as another factor of the -+ * random seed for task burst forking scenario. -+ */ -+ randseed = (task_rq(p)->clock ^ (long unsigned int)p) >> 10; -+ -+ return find_first_bit(&randseed, NUM_SKIPLIST_LEVEL - 1); -+} -+ -+static void sched_task_fork(struct task_struct *p, struct rq *rq) -+{ -+ p->sl_level = pds_skiplist_random_level(p); -+ if (p->prio >= MAX_RT_PRIO) -+ p->deadline = rq->clock + user_prio2deadline[TASK_USER_PRIO(p)]; -+ update_task_priodl(p); -+} -+ -+/** -+ * task_prio - return the priority value of a given task. -+ * @p: the task in question. -+ * -+ * Return: The priority value as seen by users in /proc. -+ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes -+ * from 0(SCHED_ISO) up to 82 (nice +19 SCHED_IDLE). 
-+ */ -+int task_prio(const struct task_struct *p) -+{ -+ int ret; -+ -+ if (p->prio < MAX_RT_PRIO) -+ return (p->prio - MAX_RT_PRIO); -+ -+ preempt_disable(); -+ ret = task_sched_prio(p, this_rq()) - MAX_RT_PRIO; -+ preempt_enable(); -+ -+ return ret; -+} -+ -+static void do_sched_yield_type_1(struct task_struct *p, struct rq *rq) -+{ -+ time_slice_expired(p, rq); -+} -+ -+static void sched_task_ttwu(struct task_struct *p) {} -+static void sched_task_deactivate(struct task_struct *p, struct rq *rq) {} -diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c -index 2c613e1cff3a..0103b2a7201d 100644 ---- a/kernel/sched/pelt.c -+++ b/kernel/sched/pelt.c -@@ -270,6 +270,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load) - WRITE_ONCE(sa->util_avg, sa->util_sum / divider); - } - -+#ifndef CONFIG_SCHED_ALT - /* - * sched_entity: - * -@@ -387,8 +388,9 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) - - return 0; - } -+#endif - --#ifdef CONFIG_SCHED_THERMAL_PRESSURE -+#if defined(CONFIG_SCHED_THERMAL_PRESSURE) && !defined(CONFIG_SCHED_ALT) - /* - * thermal: - * -diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h -index 795e43e02afc..856163dac896 100644 ---- a/kernel/sched/pelt.h -+++ b/kernel/sched/pelt.h -@@ -1,13 +1,15 @@ - #ifdef CONFIG_SMP - #include "sched-pelt.h" - -+#ifndef CONFIG_SCHED_ALT - int __update_load_avg_blocked_se(u64 now, struct sched_entity *se); - int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se); - int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); - int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); - int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); -+#endif - --#ifdef CONFIG_SCHED_THERMAL_PRESSURE -+#if defined(CONFIG_SCHED_THERMAL_PRESSURE) && !defined(CONFIG_SCHED_ALT) - int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity); - - static inline u64 thermal_load_avg(struct rq *rq) -@@ -42,6 +44,7 @@ static inline u32 
get_pelt_divider(struct sched_avg *avg) - return LOAD_AVG_MAX - 1024 + avg->period_contrib; - } - -+#ifndef CONFIG_SCHED_ALT - /* - * When a task is dequeued, its estimated utilization should not be update if - * its util_avg has not been updated at least once. -@@ -162,9 +165,11 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) - return rq_clock_pelt(rq_of(cfs_rq)); - } - #endif -+#endif /* CONFIG_SCHED_ALT */ - - #else - -+#ifndef CONFIG_SCHED_ALT - static inline int - update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) - { -@@ -182,6 +187,7 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running) - { - return 0; - } -+#endif - - static inline int - update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index 28709f6b0975..6bc68bacbac8 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -2,6 +2,10 @@ - /* - * Scheduler internal types and methods: - */ -+#ifdef CONFIG_SCHED_ALT -+#include "alt_sched.h" -+#else -+ - #include - - #include -@@ -2626,3 +2630,9 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) - - void swake_up_all_locked(struct swait_queue_head *q); - void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); -+ -+static inline int task_running_nice(struct task_struct *p) -+{ -+ return (task_nice(p) > 0); -+} -+#endif /* !CONFIG_SCHED_ALT */ -diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c -index 750fb3c67eed..108422ebc7bf 100644 ---- a/kernel/sched/stats.c -+++ b/kernel/sched/stats.c -@@ -22,8 +22,10 @@ static int show_schedstat(struct seq_file *seq, void *v) - } else { - struct rq *rq; - #ifdef CONFIG_SMP -+#ifndef CONFIG_SCHED_ALT - struct sched_domain *sd; - int dcount = 0; -+#endif - #endif - cpu = (unsigned long)(v - 2); - rq = cpu_rq(cpu); -@@ -40,6 +42,7 @@ static int show_schedstat(struct seq_file *seq, void *v) - seq_printf(seq, "\n"); - - #ifdef CONFIG_SMP -+#ifndef CONFIG_SCHED_ALT - /* 
domain-specific stats */ - rcu_read_lock(); - for_each_domain(cpu, sd) { -@@ -68,6 +71,7 @@ static int show_schedstat(struct seq_file *seq, void *v) - sd->ttwu_move_balance); - } - rcu_read_unlock(); -+#endif - #endif - } - return 0; -diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c -index 1bd7e3af904f..cc946a9bd550 100644 ---- a/kernel/sched/topology.c -+++ b/kernel/sched/topology.c -@@ -4,6 +4,7 @@ - */ - #include "sched.h" - -+#ifndef CONFIG_SCHED_ALT - DEFINE_MUTEX(sched_domains_mutex); - - /* Protected by sched_domains_mutex: */ -@@ -1180,8 +1181,10 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) - */ - - static int default_relax_domain_level = -1; -+#endif /* CONFIG_SCHED_ALT */ - int sched_domain_level_max; - -+#ifndef CONFIG_SCHED_ALT - static int __init setup_relax_domain_level(char *str) - { - if (kstrtoint(str, 0, &default_relax_domain_level)) -@@ -1413,6 +1416,7 @@ sd_init(struct sched_domain_topology_level *tl, - - return sd; - } -+#endif /* CONFIG_SCHED_ALT */ - - /* - * Topology list, bottom-up. 
-@@ -1442,6 +1446,7 @@ void set_sched_topology(struct sched_domain_topology_level *tl) - sched_domain_topology = tl; - } - -+#ifndef CONFIG_SCHED_ALT - #ifdef CONFIG_NUMA - - static const struct cpumask *sd_numa_mask(int cpu) -@@ -2316,3 +2321,17 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], - partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); - mutex_unlock(&sched_domains_mutex); - } -+#else /* CONFIG_SCHED_ALT */ -+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], -+ struct sched_domain_attr *dattr_new) -+{} -+ -+#ifdef CONFIG_NUMA -+int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; -+ -+int sched_numa_find_closest(const struct cpumask *cpus, int cpu) -+{ -+ return best_mask_cpu(cpu, cpus); -+} -+#endif /* CONFIG_NUMA */ -+#endif -diff --git a/kernel/sysctl.c b/kernel/sysctl.c -index afad085960b8..e91b4cb3042b 100644 ---- a/kernel/sysctl.c -+++ b/kernel/sysctl.c -@@ -120,6 +120,10 @@ static unsigned long long_max = LONG_MAX; - static int one_hundred = 100; - static int two_hundred = 200; - static int one_thousand = 1000; -+#ifdef CONFIG_SCHED_ALT -+static int __maybe_unused zero = 0; -+extern int sched_yield_type; -+#endif - #ifdef CONFIG_PRINTK - static int ten_thousand = 10000; - #endif -@@ -184,7 +188,7 @@ static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT; - int sysctl_legacy_va_layout; - #endif - --#ifdef CONFIG_SCHED_DEBUG -+#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_ALT) - static int min_sched_granularity_ns = 100000; /* 100 usecs */ - static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ - static int min_wakeup_granularity_ns; /* 0 usecs */ -@@ -1652,6 +1656,7 @@ int proc_do_static_key(struct ctl_table *table, int write, - } - - static struct ctl_table kern_table[] = { -+#ifndef CONFIG_SCHED_ALT - { - .procname = "sched_child_runs_first", - .data = &sysctl_sched_child_runs_first, -@@ -1854,6 +1859,7 @@ static struct ctl_table 
kern_table[] = { - .extra2 = SYSCTL_ONE, - }, - #endif -+#endif /* !CONFIG_SCHED_ALT */ - #ifdef CONFIG_PROVE_LOCKING - { - .procname = "prove_locking", -@@ -2430,6 +2436,17 @@ static struct ctl_table kern_table[] = { - .proc_handler = proc_dointvec, - }, - #endif -+#ifdef CONFIG_SCHED_ALT -+ { -+ .procname = "yield_type", -+ .data = &sched_yield_type, -+ .maxlen = sizeof (int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = &zero, -+ .extra2 = &two, -+ }, -+#endif - #if defined(CONFIG_S390) && defined(CONFIG_SMP) - { - .procname = "spin_retry", -diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c -index 95b6a708b040..81f2ee62c807 100644 ---- a/kernel/time/hrtimer.c -+++ b/kernel/time/hrtimer.c -@@ -1927,8 +1927,10 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, - int ret = 0; - u64 slack; - -+#ifndef CONFIG_SCHED_ALT - slack = current->timer_slack_ns; - if (dl_task(current) || rt_task(current)) -+#endif - slack = 0; - - hrtimer_init_sleeper_on_stack(&t, clockid, mode); -diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c -index a71758e34e45..d20c347df861 100644 ---- a/kernel/time/posix-cpu-timers.c -+++ b/kernel/time/posix-cpu-timers.c -@@ -216,7 +216,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples) - u64 stime, utime; - - task_cputime(p, &utime, &stime); -- store_samples(samples, stime, utime, p->se.sum_exec_runtime); -+ store_samples(samples, stime, utime, tsk_seruntime(p)); - } - - static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, -@@ -801,6 +801,7 @@ static void collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples, - } - } - -+#ifndef CONFIG_SCHED_ALT - static inline void check_dl_overrun(struct task_struct *tsk) - { - if (tsk->dl.dl_overrun) { -@@ -808,6 +809,7 @@ static inline void check_dl_overrun(struct task_struct *tsk) - __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); - } - } -+#endif - - static bool 
check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard) - { -@@ -835,8 +837,10 @@ static void check_thread_timers(struct task_struct *tsk, - u64 samples[CPUCLOCK_MAX]; - unsigned long soft; - -+#ifndef CONFIG_SCHED_ALT - if (dl_task(tsk)) - check_dl_overrun(tsk); -+#endif - - if (expiry_cache_is_inactive(pct)) - return; -@@ -850,7 +854,7 @@ static void check_thread_timers(struct task_struct *tsk, - soft = task_rlimit(tsk, RLIMIT_RTTIME); - if (soft != RLIM_INFINITY) { - /* Task RT timeout is accounted in jiffies. RTTIME is usec */ -- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); -+ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ); - unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); - - /* At the hard limit, send SIGKILL. No further action. */ -@@ -1086,8 +1090,10 @@ static inline bool fastpath_timer_check(struct task_struct *tsk) - return true; - } - -+#ifndef CONFIG_SCHED_ALT - if (dl_task(tsk) && tsk->dl.dl_overrun) - return true; -+#endif - - return false; - } -diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c -index b5e3496cf803..65f60c77bc50 100644 ---- a/kernel/trace/trace_selftest.c -+++ b/kernel/trace/trace_selftest.c -@@ -1048,10 +1048,15 @@ static int trace_wakeup_test_thread(void *data) - { - /* Make this a -deadline thread */ - static const struct sched_attr attr = { -+#ifdef CONFIG_SCHED_ALT -+ /* No deadline on BMQ/PDS, use RR */ -+ .sched_policy = SCHED_RR, -+#else - .sched_policy = SCHED_DEADLINE, - .sched_runtime = 100000ULL, - .sched_deadline = 10000000ULL, - .sched_period = 10000000ULL -+#endif - }; - struct wakeup_test_data *x = data; - -diff --git a/kernel/sched/alt_core.c b/kernel/sched/alt_core.c -index f36264fea75c6ca7c34eaa259c0bff829cbf6ac0..d43ca62fd00fe442bda9b4ad548fae432a7436de 100644 ---- a/kernel/sched/alt_core.c -+++ b/kernel/sched/alt_core.c -@@ -11,6 +11,10 @@ - * scheduler by Alfred Chen. - * 2019-02-20 BMQ(BitMap Queue) kernel scheduler by Alfred Chen. 
- */ -+#define CREATE_TRACE_POINTS -+#include -+#undef CREATE_TRACE_POINTS -+ - #include "sched.h" - - #include -@@ -42,8 +46,11 @@ - #include "pelt.h" - #include "smp.h" - --#define CREATE_TRACE_POINTS --#include -+/* -+ * Export tracepoints that act as a bare tracehook (ie: have no trace event -+ * associated with them) to allow external modules to probe them. -+ */ -+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp); - - #define ALT_SCHED_VERSION "v5.9-r0" - -diff --git a/kernel/sched/alt_sched.h b/kernel/sched/alt_sched.h -index 99be2c51c88d0406cced20b36d7230da12930a5c..03f8b8b1aa27eeb15989af25b4050c767da12aad 100644 ---- a/kernel/sched/alt_sched.h -+++ b/kernel/sched/alt_sched.h -@@ -46,6 +46,8 @@ - - #include "cpupri.h" - -+#include -+ - #ifdef CONFIG_SCHED_BMQ - #include "bmq.h" - #endif -@@ -496,6 +498,8 @@ static inline int sched_tick_offload_init(void) { return 0; } - - extern void schedule_idle(void); - -+#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) -+ - /* - * !! For sched_setattr_nocheck() (kernel) only !! 
- * diff --git a/linux510-rc-tkg/linux510-tkg-patches/0011-ZFS-fix.patch b/linux510-rc-tkg/linux510-tkg-patches/0011-ZFS-fix.patch deleted file mode 100644 index af71d04..0000000 --- a/linux510-rc-tkg/linux510-tkg-patches/0011-ZFS-fix.patch +++ /dev/null @@ -1,43 +0,0 @@ -From 1e010beda2896bdf3082fb37a3e49f8ce20e04d8 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= -Date: Thu, 2 May 2019 05:28:08 +0100 -Subject: [PATCH] x86/fpu: Export kernel_fpu_{begin,end}() with - EXPORT_SYMBOL_GPL -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -We need these symbols in zfs as the fpu implementation breaks userspace: - -https://github.com/zfsonlinux/zfs/issues/9346 -Signed-off-by: Jörg Thalheim ---- - arch/x86/kernel/fpu/core.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c -index 12c70840980e..352538b3bb5d 100644 ---- a/arch/x86/kernel/fpu/core.c -+++ b/arch/x86/kernel/fpu/core.c -@@ -102,7 +102,7 @@ void kernel_fpu_begin(void) - } - __cpu_invalidate_fpregs_state(); - } --EXPORT_SYMBOL_GPL(kernel_fpu_begin); -+EXPORT_SYMBOL(kernel_fpu_begin); - - void kernel_fpu_end(void) - { -@@ -111,7 +111,7 @@ void kernel_fpu_end(void) - this_cpu_write(in_kernel_fpu, false); - preempt_enable(); - } --EXPORT_SYMBOL_GPL(kernel_fpu_end); -+EXPORT_SYMBOL(kernel_fpu_end); - - /* - * Save the FPU state (mark it for reload if necessary): --- -2.23.0 - - diff --git a/linux510-rc-tkg/linux510-tkg-patches/0012-misc-additions.patch b/linux510-rc-tkg/linux510-tkg-patches/0012-misc-additions.patch deleted file mode 100644 index a4efaef..0000000 --- a/linux510-rc-tkg/linux510-tkg-patches/0012-misc-additions.patch +++ /dev/null @@ -1,54 +0,0 @@ -diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig -index 0840d27381ea..73aba9a31064 100644 ---- a/drivers/tty/Kconfig -+++ b/drivers/tty/Kconfig -@@ -75,6 +75,19 @@ config VT_CONSOLE_SLEEP - def_bool y - depends on 
VT_CONSOLE && PM_SLEEP - -+config NR_TTY_DEVICES -+ int "Maximum tty device number" -+ depends on VT -+ range 12 63 -+ default 63 -+ help -+ This option is used to change the number of tty devices in /dev. -+ The default value is 63. The lowest number you can set is 12, -+ 63 is also the upper limit so we don't overrun the serial -+ consoles. -+ -+ If unsure, say 63. -+ - config HW_CONSOLE - bool - depends on VT && !UML -diff --git a/include/uapi/linux/vt.h b/include/uapi/linux/vt.h -index e9d39c48520a..3bceead8da40 100644 ---- a/include/uapi/linux/vt.h -+++ b/include/uapi/linux/vt.h -@@ -3,12 +3,25 @@ - #define _UAPI_LINUX_VT_H - - -+/* -+ * We will make this definition solely for the purpose of making packages -+ * such as splashutils build, because they can not understand that -+ * NR_TTY_DEVICES is defined in the kernel configuration. -+ */ -+#ifndef CONFIG_NR_TTY_DEVICES -+#define CONFIG_NR_TTY_DEVICES 63 -+#endif -+ - /* - * These constants are also useful for user-level apps (e.g., VC - * resizing). - */ - #define MIN_NR_CONSOLES 1 /* must be at least 1 */ --#define MAX_NR_CONSOLES 63 /* serial lines start at 64 */ -+/* -+ * NR_TTY_DEVICES: -+ * Value MUST be at least 12 and must never be higher then 63 -+ */ -+#define MAX_NR_CONSOLES CONFIG_NR_TTY_DEVICES /* serial lines start above this */ - /* Note: the ioctl VT_GETSTATE does not work for - consoles 16 and higher (since it returns a short) */ \ No newline at end of file diff --git a/linux54-tkg/PKGBUILD b/linux54-tkg/PKGBUILD deleted file mode 100644 index 867a107..0000000 --- a/linux54-tkg/PKGBUILD +++ /dev/null @@ -1,423 +0,0 @@ -# Based on the file created for Arch Linux by: -# Tobias Powalowski -# Thomas Baechler - -# Contributor: Tk-Glitch - -plain ' .---.` `.---.' -plain ' `/syhhhyso- -osyhhhys/`' -plain ' .syNMdhNNhss/``.---.``/sshNNhdMNys.' 
-plain ' +sdMh.`+MNsssssssssssssssNM+`.hMds+' -plain ' :syNNdhNNhssssssssssssssshNNhdNNys:' -plain ' /ssyhhhysssssssssssssssssyhhhyss/' -plain ' .ossssssssssssssssssssssssssssso.' -plain ' :sssssssssssssssssssssssssssssssss:' -plain ' /sssssssssssssssssssssssssssssssssss/' -plain ' :sssssssssssssoosssssssoosssssssssssss:' -plain ' osssssssssssssoosssssssoossssssssssssso' -plain ' osssssssssssyyyyhhhhhhhyyyyssssssssssso' -plain ' /yyyyyyhhdmmmmNNNNNNNNNNNmmmmdhhyyyyyy/' -plain ' smmmNNNNNNNNNNNNNNNNNNNNNNNNNNNNNmmms' -plain ' /dNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNd/' -plain ' `:sdNNNNNNNNNNNNNNNNNNNNNNNNNds:`' -plain ' `-+shdNNNNNNNNNNNNNNNdhs+-`' -plain ' `.-:///////:-.`' - -_where="$PWD" # track basedir as different Arch based distros are moving srcdir around - -cp "$_where"/linux54-tkg-patches/* "$_where" # copy patches inside the PKGBUILD's dir to preserve makepkg sourcing and md5sum checking -cp "$_where"/linux54-tkg-config/* "$_where" # copy config files and hooks inside the PKGBUILD's dir to preserve makepkg sourcing and md5sum checking - -source "$_where"/customization.cfg # load default configuration from file - -# Load external configuration file if present. Available variable values will overwrite customization.cfg ones. -if [ -e "$_EXT_CONFIG_PATH" ]; then - source "$_EXT_CONFIG_PATH" && msg2 "External configuration file $_EXT_CONFIG_PATH will be used to override customization.cfg values." && msg2 "" -fi - -if [ -z "$_OPTIPROFILE" ] && [ ! -e "$_where"/cpuschedset ]; then - # Prompt about optimized configurations. Available variable values will overwrite customization.cfg/external config ones. - plain "Do you want to use a predefined optimized profile?" - read -rp "`echo $' > 1.Custom\n 2.Ryzen Desktop (Performance)\n 3.Other Desktop (Performance)\nchoice[1-3?]: '`" _OPTIPROFILE; -fi -if [ "$_OPTIPROFILE" == "2" ]; then - source "$_where"/ryzen-desktop-profile.cfg && msg2 "Ryzen Desktop (Performance) profile will be used." 
&& msg2 "" -elif [ "$_OPTIPROFILE" == "3" ]; then - source "$_where"/generic-desktop-profile.cfg && msg2 "Generic Desktop (Performance) profile will be used." && msg2 "" -fi - -# source cpuschedset early if present -if [ -e "$_where"/cpuschedset ]; then - source "$_where"/cpuschedset -fi - -# CPU SCHED selector -if [ -z "$_cpusched" ] && [ ! -e "$_where"/cpuschedset ]; then - plain "What CPU sched variant do you want to build/install?" - read -rp "`echo $' > 1.PDS\n 2.MuQSS\n 3.BMQ\n 4.CFS\nchoice[1-4?]: '`" CONDITION; - if [ "$CONDITION" == "2" ]; then - echo "_cpusched=\"MuQSS\"" > "$_where"/cpuschedset - elif [ "$CONDITION" == "3" ]; then - echo "_cpusched=\"bmq\"" > "$_where"/cpuschedset - elif [ "$CONDITION" == "4" ]; then - echo "_cpusched=\"cfs\"" > "$_where"/cpuschedset - else - echo "_cpusched=\"pds\"" > "$_where"/cpuschedset - fi - if [ -n "$_custom_pkgbase" ]; then - echo "_custom_pkgbase=\"${_custom_pkgbase}\"" >> "$_where"/cpuschedset - fi -elif [ "$_cpusched" == "muqss" ] || [ "$_cpusched" == "MuQSS" ]; then - echo "_cpusched=\"MuQSS\"" > "$_where"/cpuschedset -elif [ "$_cpusched" == "pds" ]; then - echo "_cpusched=\"pds\"" > "$_where"/cpuschedset -elif [ "$_cpusched" == "bmq" ]; then - echo "_cpusched=\"bmq\"" > "$_where"/cpuschedset -else - if [ "$_nofallback" != "true" ]; then - warning "Something is wrong with your cpusched selection. Do you want to fallback to CFS (default)?" - read -rp "`echo $' > N/y : '`" _fallback; - fi - if [[ "$_fallback" =~ [yY] ]] || [ "$_nofallback" == "true" ]; then - echo "_cpusched=\"cfs\"" > "$_where"/cpuschedset - else - error "Exiting..." 
- exit 1 - fi -fi - -source "$_where"/cpuschedset - -_basever=54 -if [ -n "$_custom_pkgbase" ]; then - pkgbase="${_custom_pkgbase}" -else - pkgbase=linux"${_basever}"-tkg-"${_cpusched}" -fi -pkgname=("${pkgbase}" "${pkgbase}-headers") -_basekernel=5.4 -_sub=72 -pkgver="${_basekernel}"."${_sub}" -pkgrel=91 -pkgdesc='Linux-tkg' -arch=('x86_64') # no i686 in here -url="http://www.kernel.org/" -license=('GPL2') -makedepends=('xmlto' 'docbook-xsl' 'kmod' 'inetutils' 'bc' 'libelf' 'patchutils' 'flex' 'python-sphinx' 'python-sphinx_rtd_theme' 'graphviz' 'imagemagick' 'git') -optdepends=('schedtool') -options=('!strip' 'docs') -source=("https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-${_basekernel}.tar.xz" - "https://cdn.kernel.org/pub/linux/kernel/v5.x/patch-${pkgver}.xz" - "https://raw.githubusercontent.com/graysky2/kernel_gcc_patch/master/enable_additional_cpu_optimizations_for_gcc_v10.1%2B_kernel_v4.19-v5.4.patch" - 'config.x86_64' # stock Arch config - 'config_hardened.x86_64' # hardened Arch config - 90-cleanup.hook - cleanup - # ARCH Patches - 0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch - # TkG - 0002-clear-patches.patch - 0003-glitched-base.patch - 0003-glitched-cfs.patch - 0004-glitched-ondemand-muqss.patch - 0004-glitched-muqss.patch - 0004-5.4-ck1.patch - 0005-glitched-ondemand-pds.patch - 0005-glitched-pds.patch - 0005-v5.4_undead-pds099o.patch - 0006-add-acs-overrides_iommu.patch - 0007-v5.4-fsync.patch - #0008-5.4-bcachefs.patch - 0009-glitched-bmq.patch - 0009-bmq_v5.4-r2.patch - 0011-ZFS-fix.patch - 0012-linux-hardened.patch -) -sha256sums=('bf338980b1670bca287f9994b7441c2361907635879169c64ae78364efc5f491' - 'bce941bcb6c8148ac19cd2fa4f1e19c6c75f699a3bcdfd452df7484cff2a2353' - '27b7fc535ade94b636c3ec4e809e141831e9465a0ef55215a9852b87048629e2' - '55dd5117c1da17c9ec38d7bc995958958bcc8b7ebcfd81de1d4c7650b85537ab' - '1f4a20d6eaaa0d969af93152a65191492400c6aa838fc1c290b0dd29bb6019d8' - 
'1e15fc2ef3fa770217ecc63a220e5df2ddbcf3295eb4a021171e7edd4c6cc898' - '66a03c246037451a77b4d448565b1d7e9368270c7d02872fbd0b5d024ed0a997' - '31dc68e84aecfb7d069efb1305049122c65694676be8b955634abcf0675922a2' - 'd02bf5ca08fd610394b9d3a0c3b176d74af206f897dee826e5cbaec97bb4a4aa' - '156a2c75fd228920e3c3da5e04a110afa403951bdfbb85772c2fd4b82fd24d61' - '7058e57fd68367b029adc77f2a82928f1433daaf02c8c279cb2d13556c8804d7' - 'c605f638d74c61861ebdc36ebd4cb8b6475eae2f6273e1ccb2bbb3e10a2ec3fe' - 'bc69d6e5ee8172b0242c8fa72d13cfe2b8d2b6601468836908a7dfe8b78a3bbb' - '815974c65f47301d2a5d1577bf95e8a4b54cad7d77f226e0065f83e763837c48' - '62496f9ca788996181ef145f96ad26291282fcc3fb95cdc04080dcf84365be33' - 'eac7e5d6201528e64f4bdf5e286c842511e1afc52e1518dc8e7d11932bbe0a99' - 'db03fbd179ec78941eefe1c0edde4c19071bc603511d0b5c06c04e412994b62e' - '19661ec0d39f9663452b34433214c755179894528bf73a42f6ba52ccf572832a' - '2d9260b80b43bbd605cf420d6bd53aa7262103dfd77196ba590ece5600b6dc0d' - '3832f828a9f402b153fc9a6829c5a4eaf6091804bcda3a0423c8e1b57e26420d' - '6a6a736cf1b3513d108bfd36f60baf50bb36b33aec21ab0d0ffad13602b7ff75' - '49262ce4a8089fa70275aad742fc914baa28d9c384f710c9a62f64796d13e104' - 'aeb31404c26ee898d007b1f66cb9572c9884ad8eca14edc4587d68f6cba6de46') - -export KBUILD_BUILD_HOST=archlinux -export KBUILD_BUILD_USER=$pkgbase -export KBUILD_BUILD_TIMESTAMP="$(date -Ru${SOURCE_DATE_EPOCH:+d @$SOURCE_DATE_EPOCH})" - -prepare() { - rm -rf $pkgdir # Nuke the entire pkg folder so it'll get regenerated clean on next build - - ln -s "${_where}/customization.cfg" "${srcdir}" # workaround - - cd "${srcdir}/linux-${_basekernel}" - - source "$_where/linux$_basever-tkg-config/prepare" - _tkg_srcprep -} - -build() { - cd "${srcdir}/linux-${_basekernel}" - - # Use custom compiler paths if defined - if [ -n "${CUSTOM_GCC_PATH}" ]; then - PATH=${CUSTOM_GCC_PATH}/bin:${CUSTOM_GCC_PATH}/lib:${CUSTOM_GCC_PATH}/include:${PATH} - fi - - if [ "$_force_all_threads" == "true" ]; then - _force_all_threads=-j$(nproc) - else 
- _force_all_threads=${MAKEFLAGS} - fi - - # ccache - if [ "$_noccache" != "true" ] && pacman -Qq ccache &> /dev/null; then - export PATH="/usr/lib/ccache/bin/:$PATH" - export CCACHE_SLOPPINESS="file_macro,locale,time_macros" - export CCACHE_NOHASHDIR="true" - msg2 'ccache was found and will be used' - fi - - # document the TkG variables, excluding "_", "_EXT_CONFIG_PATH", and "_where". - declare -p | cut -d ' ' -f 3 | grep -P '^_(?!=|EXT_CONFIG_PATH|where)' > "${srcdir}/customization-full.cfg" - - # build! - _runtime=$( time ( schedtool -B -n 1 -e ionice -n 1 make $_force_all_threads LOCALVERSION= bzImage modules 2>&1 ) 3>&1 1>&2 2>&3 ) || _runtime=$( time ( make $_force_all_threads LOCALVERSION= bzImage modules 2>&1 ) 3>&1 1>&2 2>&3 ) -} - -hackbase() { - pkgdesc="The $pkgdesc kernel and modules" - depends=('coreutils' 'kmod' 'initramfs') - optdepends=('linux-docs: Kernel hackers manual - HTML documentation that comes with the Linux kernel.' - 'crda: to set the correct wireless channels of your country.' - 'linux-firmware: Firmware files for Linux' - 'modprobed-db: Keeps track of EVERY kernel module that has ever been probed. Useful for make localmodconfig.' - 'nvidia-tkg: NVIDIA drivers for all installed kernels - non-dkms version.' - 'nvidia-dkms-tkg: NVIDIA drivers for all installed kernels - dkms version.' 
- 'update-grub: Simple wrapper around grub-mkconfig.') - provides=("linux=${pkgver}" "${pkgbase}") - - cd "${srcdir}/linux-${_basekernel}" - - # get kernel version - local _kernver="$( N/y : ' _CONDITION; - fi - if [[ "$_CONDITION" =~ [yY] ]] || [ "$_user_patches_no_confirm" == "true" ]; then - for _f in "${_patches[@]}"; do - if [ -e "${_f}" ]; then - msg2 "######################################################" - msg2 "" - msg2 "Reverting your own ${_userpatch_target} patch ${_f}" - msg2 "" - msg2 "######################################################" - patch -Np1 -R < "${_f}" - echo "Reverted your own patch ${_f}" >> "$_where"/last_build_config.log - fi - done - fi - fi - - _patches=("$_where"/*."${_userpatch_ext}patch") - if [ ${#_patches[@]} -ge 2 ] || [ -e "${_patches}" ]; then - if [ "$_user_patches_no_confirm" != "true" ]; then - msg2 "Found ${#_patches[@]} userpatches for ${_userpatch_target}:" - printf '%s\n' "${_patches[@]}" - read -rp "Do you want to install it/them? - Be careful with that ;)"$'\n> N/y : ' _CONDITION; - fi - if [[ "$_CONDITION" =~ [yY] ]] || [ "$_user_patches_no_confirm" == "true" ]; then - for _f in "${_patches[@]}"; do - if [ -e "${_f}" ]; then - msg2 "######################################################" - msg2 "" - msg2 "Applying your own ${_userpatch_target} patch ${_f}" - msg2 "" - msg2 "######################################################" - patch -Np1 < "${_f}" - echo "Applied your own patch ${_f}" >> "$_where"/last_build_config.log - fi - done - fi - fi -} - -_tkg_srcprep() { - msg2 "Setting version..." 
- scripts/setlocalversion --save-scmversion - echo "-$pkgrel-tkg-${_cpusched}" > localversion.10-pkgrel - echo "" > localversion.20-pkgname - - # add upstream patch - patch -p1 -i ../patch-"${pkgver}" - - # ARCH Patches - if [ "${_configfile}" == "config_hardened.x86_64" ] && [ "${_cpusched}" == "cfs" ]; then - msg2 "Using linux hardened patchset" - patch -Np1 -i ../0012-linux-hardened.patch - else - patch -Np1 -i ../0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch - fi - - # graysky's cpu opts - https://github.com/graysky2/kernel_gcc_patch - msg2 "Applying graysky's cpu opts patch" - patch -Np1 -i ../enable_additional_cpu_optimizations_for_gcc_v10.1%2B_kernel_v4.19-v5.4.patch - - # TkG - msg2 "Applying clear linux patches" - patch -Np1 -i ../0002-clear-patches.patch - - msg2 "Applying glitched base patch" - patch -Np1 -i ../0003-glitched-base.patch - - if [ "${_cpusched}" == "MuQSS" ]; then - # MuQSS - patch -Np1 -i ../0004-5.4-ck1.patch - if [ "${_aggressive_ondemand}" == "true" ]; then - patch -Np1 -i ../0004-glitched-ondemand-muqss.patch - fi - patch -Np1 -i ../0004-glitched-muqss.patch - elif [ "${_cpusched}" == "pds" ]; then - # PDS-mq - patch -Np1 -i ../0005-v5.4_undead-pds099o.patch - if [ "${_aggressive_ondemand}" == "true" ]; then - patch -Np1 -i ../0005-glitched-ondemand-pds.patch - fi - patch -Np1 -i ../0005-glitched-pds.patch - elif [ "${_cpusched}" == "bmq" ]; then - # BMQ - patch -Np1 -i ../0009-bmq_v5.4-r2.patch - patch -Np1 -i ../0009-glitched-bmq.patch - elif [ "${_cpusched}" == "cfs" ]; then - patch -Np1 -i ../0003-glitched-cfs.patch - fi - - if [ -z "${_configfile}" ]; then - _configfile="config.x86_64" - fi - - cat "${srcdir}/${_configfile}" > ./.config - - # Set some -tkg defaults - echo "# CONFIG_DYNAMIC_FAULT is not set" >> ./.config - sed -i -e 's/CONFIG_DEFAULT_FQ_CODEL=y/# CONFIG_DEFAULT_FQ_CODEL is not set/' ./.config - echo "CONFIG_DEFAULT_CAKE=y" >> ./.config - echo "CONFIG_NR_TTY_DEVICES=63" >> ./.config - echo 
"CONFIG_TP_SMAPI=m" >> ./.config - echo "CONFIG_RAID6_USE_PREFER_GEN=y" >> ./.config - echo "# CONFIG_NTP_PPS is not set" >> ./.config - echo "# CONFIG_X86_P6_NOP is not set" >> ./.config - sed -i -e 's/CONFIG_CRYPTO_LZ4=m/CONFIG_CRYPTO_LZ4=y/' ./.config - sed -i -e 's/CONFIG_CRYPTO_LZ4HC=m/CONFIG_CRYPTO_LZ4HC=y/' ./.config - sed -i -e 's/CONFIG_LZ4_COMPRESS=m/CONFIG_LZ4_COMPRESS=y/' ./.config - sed -i -e 's/CONFIG_LZ4HC_COMPRESS=m/CONFIG_LZ4HC_COMPRESS=y/' ./.config - sed -i -e 's/CONFIG_RCU_BOOST_DELAY=500/CONFIG_RCU_BOOST_DELAY=0/' ./.config - if [ "$_noccache" != "true" ] && pacman -Qq ccache &> /dev/null; then - sed -i -e 's/CONFIG_GCC_PLUGINS=y/# CONFIG_GCC_PLUGINS is not set/' ./.config - fi - # Skip dbg package creation on non-Arch - #if [ "$_distro" != "Arch" ]; then - # sed -i -e 's/CONFIG_DEBUG_INFO.*/CONFIG_DEBUG_INFO=n/' ./.config - #fi - - # Inject cpuopts options - echo "# CONFIG_MK8SSE3 is not set" >> ./.config - echo "# CONFIG_MK10 is not set" >> ./.config - echo "# CONFIG_MBARCELONA is not set" >> ./.config - echo "# CONFIG_MBOBCAT is not set" >> ./.config - echo "# CONFIG_MJAGUAR is not set" >> ./.config - echo "# CONFIG_MBULLDOZER is not set" >> ./.config - echo "# CONFIG_MPILEDRIVER is not set" >> ./.config - echo "# CONFIG_MSTEAMROLLER is not set" >> ./.config - echo "# CONFIG_MEXCAVATOR is not set" >> ./.config - echo "# CONFIG_MZEN is not set" >> ./.config - echo "# CONFIG_MZEN2 is not set" >> ./.config - echo "# CONFIG_MATOM is not set" >> ./.config - echo "# CONFIG_MNEHALEM is not set" >> ./.config - echo "# CONFIG_MWESTMERE is not set" >> ./.config - echo "# CONFIG_MSILVERMONT is not set" >> ./.config - echo "# CONFIG_MSANDYBRIDGE is not set" >> ./.config - echo "# CONFIG_MIVYBRIDGE is not set" >> ./.config - echo "# CONFIG_MHASWELL is not set" >> ./.config - echo "# CONFIG_MBROADWELL is not set" >> ./.config - echo "# CONFIG_MSKYLAKE is not set" >> ./.config - echo "# CONFIG_MSKYLAKEX is not set" >> ./.config - echo "# CONFIG_MCANNONLAKE 
is not set" >> ./.config - echo "# CONFIG_MICELAKE is not set" >> ./.config - echo "# CONFIG_MGOLDMONT is not set" >> ./.config - echo "# CONFIG_MGOLDMONTPLUS is not set" >> ./.config - echo "# CONFIG_MCASCADELAKE is not set" >> ./.config - echo "# CONFIG_MCOOPERLAKE is not set" >> ./.config - echo "# CONFIG_MTIGERLAKE is not set" >> ./.config - - # Disable some debugging - if [ "${_debugdisable}" == "true" ]; then - sed -i -e 's/CONFIG_SLUB_DEBUG=y/# CONFIG_SLUB_DEBUG is not set/' ./.config - sed -i -e 's/CONFIG_PM_DEBUG=y/# CONFIG_PM_DEBUG is not set/' ./.config - sed -i -e 's/CONFIG_PM_ADVANCED_DEBUG=y/# CONFIG_PM_ADVANCED_DEBUG is not set/' ./.config - sed -i -e 's/CONFIG_PM_SLEEP_DEBUG=y/# CONFIG_PM_SLEEP_DEBUG is not set/' ./.config - sed -i -e 's/CONFIG_ACPI_DEBUG=y/# CONFIG_ACPI_DEBUG is not set/' ./.config - sed -i -e 's/CONFIG_SCHED_DEBUG=y/# CONFIG_SCHED_DEBUG is not set/' ./.config - sed -i -e 's/CONFIG_LATENCYTOP=y/# CONFIG_LATENCYTOP is not set/' ./.config - sed -i -e 's/CONFIG_DEBUG_PREEMPT=y/# CONFIG_DEBUG_PREEMPT is not set/' ./.config - fi - - if [ "${_cpusched}" == "MuQSS" ]; then - # MuQSS default config - echo "CONFIG_SCHED_MUQSS=y" >> ./.config - elif [ "${_cpusched}" == "pds" ]; then - # PDS default config - echo "CONFIG_SCHED_PDS=y" >> ./.config - elif [ "${_cpusched}" == "bmq" ]; then - # BMQ default config - echo "CONFIG_SCHED_BMQ=y" >> ./.config - fi - - if [ "${_cpusched}" == "MuQSS" ] || [ "${_cpusched}" == "pds" ] || [ "${_cpusched}" == "bmq" ]; then - # Disable CFS - sed -i -e 's/CONFIG_FAIR_GROUP_SCHED=y/# CONFIG_FAIR_GROUP_SCHED is not set/' ./.config - sed -i -e 's/CONFIG_CFS_BANDWIDTH=y/# CONFIG_CFS_BANDWIDTH is not set/' ./.config - sed -i -e 's/CONFIG_CGROUP_CPUACCT=y/# CONFIG_CGROUP_CPUACCT is not set/' ./.config - sed -i -e 's/CONFIG_SCHED_AUTOGROUP=y/# CONFIG_SCHED_AUTOGROUP is not set/' ./.config - # sched yield type - if [ -n "$_sched_yield_type" ]; then - CONDITION0="$_sched_yield_type" - else - plain "" - plain "CPU 
sched_yield_type - Choose what sort of yield sched_yield will perform." - plain "" - plain "For PDS and MuQSS:" - plain "0: No yield." - plain "1: Yield only to better priority/deadline tasks." - plain "2: Expire timeslice and recalculate deadline." - plain "" - plain "For BMQ (experimental) - No recommended value yet, so try for yourself x) :" - plain "0: No yield." - plain "1: Deboost and requeue task. (default)" - plain "2: Set rq skip task." - read -rp "`echo $'\n > 0. Recommended option for gaming on PDS and MuQSS - "tkg" default\n 1. Default, but can lead to stability issues on some platforms\n 2. Can be a good option with low rr_interval on MuQSS\n [0-2?]: '`" CONDITION0; - fi - if [ "$CONDITION0" == "1" ]; then - msg2 "Using default CPU sched yield type (1)" - elif [ "$CONDITION0" == "2" ]; then - sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 2;/' ./kernel/sched/"${_cpusched}".c - else - sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 0;/' ./kernel/sched/"${_cpusched}".c - fi - fi - - # Round Robin interval - if [ "${_cpusched}" == "MuQSS" ] || [ "${_cpusched}" == "pds" ] || [ "${_cpusched}" == "bmq" ]; then - if [ -n "$_rr_interval" ]; then - CONDITION1="$_rr_interval" - else - plain "" - plain "Round Robin interval is the longest duration two tasks with the same nice level will" - plain "be delayed for. When CPU time is requested by a task, it receives a time slice equal" - plain "to the rr_interval in addition to a virtual deadline. When using yield_type 2, a low" - plain "value can help offset the disadvantages of rescheduling a process that has yielded." 
- plain "" - plain "MuQSS default: 6ms" - plain "PDS default: 4ms" - plain "BMQ default: 2ms" - read -rp "`echo $'\n > 0.Keep defaults\n 1.2ms (worth a shot with MuQSS + yield_type 2)\n 2.4ms\n 3.6ms\n 4.8ms\n [0-4?]: '`" CONDITION1; - fi - if [ "$CONDITION1" == "1" ]; then - msg2 "Using 2ms rr_interval" - _rrvalue="2" - elif [ "$CONDITION1" == "2" ]; then - msg2 "Using 4ms rr_interval" - _rrvalue="4" - elif [ "$CONDITION1" == "3" ]; then - msg2 "Using 6ms rr_interval" - _rrvalue="6" - elif [ "$CONDITION1" == "4" ]; then - msg2 "Using 8ms rr_interval" - _rrvalue="8" - else - msg2 "Using default rr_interval" - _rrvalue="default" - fi - if [ "$_rrvalue" != "default" ]; then - if [ "${_cpusched}" == "MuQSS" ]; then - sed -i -e "s/int rr_interval __read_mostly = 6;/int rr_interval __read_mostly = ${_rrvalue};/" ./kernel/sched/"${_cpusched}".c - elif [ "${_cpusched}" == "pds" ]; then - sed -i -e "s/#define SCHED_DEFAULT_RR (4)/#define SCHED_DEFAULT_RR (${_rrvalue})/" ./kernel/sched/"${_cpusched}".c - elif [ "${_cpusched}" == "bmq" ]; then - echo "CONFIG_SCHED_TIMESLICE=${_rrvalue}" >> ./.config - fi - else - if [ "${_cpusched}" == "bmq" ]; then - echo "CONFIG_SCHED_TIMESLICE=2" >> ./.config - fi - fi - fi - - # zenify - if [ "$_zenify" == "true" ]; then - echo "CONFIG_ZENIFY=y" >> ./.config - elif [ "$_zenify" == "false" ]; then - echo "# CONFIG_ZENIFY is not set" >> ./.config - fi - - # compiler optimization level - if [ "$_compileroptlevel" == "1" ]; then - echo "# CONFIG_CC_OPTIMIZE_HARDER is not set" >> ./.config - elif [ "$_compileroptlevel" == "2" ]; then - sed -i -e 's/CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y/# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE is not set/' ./.config - echo "CONFIG_CC_OPTIMIZE_HARDER=y" >> ./.config - elif [ "$_compileroptlevel" == "3" ]; then - sed -i -e 's/CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y/# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE is not set/' ./.config - sed -i -e 's/# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set/CONFIG_CC_OPTIMIZE_FOR_SIZE=y/' ./.config 
- echo "# CONFIG_CC_OPTIMIZE_HARDER is not set" >> ./.config - fi - - # cpu opt - if [ -n "$_processor_opt" ] && [ "$_processor_opt" != "native" ]; then - echo "# CONFIG_MNATIVE is not set" >> ./.config - fi - - if [ -n "$_processor_opt" ] && [ "$_processor_opt" != "generic" ]; then - sed -i -e 's/CONFIG_GENERIC_CPU=y/# CONFIG_GENERIC_CPU is not set/' ./.config - fi - - if [ "$_processor_opt" == "native" ]; then - echo "CONFIG_MNATIVE=y" >> ./.config - elif [ "$_processor_opt" == "k8" ]; then - sed -i -e 's/# CONFIG_MK8 is not set/CONFIG_MK8=y/' ./.config - elif [ "$_processor_opt" == "k8sse3" ]; then - sed -i -e 's/# CONFIG_MK8SSE3 is not set/CONFIG_MK8SSE3=y/' ./.config - elif [ "$_processor_opt" == "k10" ]; then - sed -i -e 's/# CONFIG_MK10 is not set/CONFIG_MK10=y/' ./.config - elif [ "$_processor_opt" == "barcelona" ]; then - sed -i -e 's/# CONFIG_MBARCELONA is not set/CONFIG_MBARCELONA=y/' ./.config - elif [ "$_processor_opt" == "bobcat" ]; then - sed -i -e 's/# CONFIG_MBOBCAT is not set/CONFIG_MBOBCAT=y/' ./.config - elif [ "$_processor_opt" == "jaguar" ]; then - sed -i -e 's/# CONFIG_MJAGUAR is not set/CONFIG_MJAGUAR=y/' ./.config - elif [ "$_processor_opt" == "bulldozer" ]; then - sed -i -e 's/# CONFIG_MBULLDOZER is not set/CONFIG_MBULLDOZER=y/' ./.config - elif [ "$_processor_opt" == "piledriver" ]; then - sed -i -e 's/# CONFIG_MPILEDRIVER is not set/CONFIG_MPILEDRIVER=y/' ./.config - elif [ "$_processor_opt" == "steamroller" ]; then - sed -i -e 's/# CONFIG_MSTEAMROLLER is not set/CONFIG_MSTEAMROLLER=y/' ./.config - elif [ "$_processor_opt" == "excavator" ]; then - sed -i -e 's/# CONFIG_MEXCAVATOR is not set/CONFIG_MEXCAVATOR=y/' ./.config - elif [ "$_processor_opt" == "zen" ]; then - sed -i -e 's/# CONFIG_MZEN is not set/CONFIG_MZEN=y/' ./.config - elif [ "$_processor_opt" == "zen2" ]; then - sed -i -e 's/# CONFIG_MZEN2 is not set/CONFIG_MZEN2=y/' ./.config - elif [ "$_processor_opt" == "mpsc" ]; then - sed -i -e 's/# CONFIG_MPSC is not 
set/CONFIG_MPSC=y/' ./.config - elif [ "$_processor_opt" == "atom" ]; then - sed -i -e 's/# CONFIG_MATOM is not set/CONFIG_MATOM=y/' ./.config - elif [ "$_processor_opt" == "core2" ]; then - sed -i -e 's/# CONFIG_MCORE2 is not set/CONFIG_MCORE2=y/' ./.config - elif [ "$_processor_opt" == "nehalem" ]; then - sed -i -e 's/# CONFIG_MNEHALEM is not set/CONFIG_MNEHALEM=y/' ./.config - elif [ "$_processor_opt" == "westmere" ]; then - sed -i -e 's/# CONFIG_MWESTMERE is not set/CONFIG_MWESTMERE=y/' ./.config - elif [ "$_processor_opt" == "silvermont" ]; then - sed -i -e 's/# CONFIG_MSILVERMONT is not set/CONFIG_MSILVERMONT=y/' ./.config - elif [ "$_processor_opt" == "sandybridge" ]; then - sed -i -e 's/# CONFIG_MSANDYBRIDGE is not set/CONFIG_MSANDYBRIDGE=y/' ./.config - elif [ "$_processor_opt" == "ivybridge" ]; then - sed -i -e 's/# CONFIG_MIVYBRIDGE is not set/CONFIG_MIVYBRIDGE=y/' ./.config - elif [ "$_processor_opt" == "haswell" ]; then - sed -i -e 's/# CONFIG_MHASWELL is not set/CONFIG_MHASWELL=y/' ./.config - elif [ "$_processor_opt" == "broadwell" ]; then - sed -i -e 's/# CONFIG_MBROADWELL is not set/CONFIG_MBROADWELL=y/' ./.config - elif [ "$_processor_opt" == "skylake" ]; then - sed -i -e 's/# CONFIG_MSKYLAKE is not set/CONFIG_MSKYLAKE=y/' ./.config - elif [ "$_processor_opt" == "skylakex" ]; then - sed -i -e 's/# CONFIG_MSKYLAKEX is not set/CONFIG_MSKYLAKEX=y/' ./.config - elif [ "$_processor_opt" == "cannonlake" ]; then - sed -i -e 's/# CONFIG_MCANNONLAKE is not set/CONFIG_MCANNONLAKE=y/' ./.config - elif [ "$_processor_opt" == "icelake" ]; then - sed -i -e 's/# CONFIG_MICELAKE is not set/CONFIG_MICELAKE=y/' ./.config - elif [ "$_processor_opt" == "goldmont" ]; then - sed -i -e 's/# CONFIG_MGOLDMONT is not set/CONFIG_MGOLDMONT=y/' ./.config - elif [ "$_processor_opt" == "goldmontplus" ]; then - sed -i -e 's/# CONFIG_MGOLDMONTPLUS is not set/CONFIG_MGOLDMONTPLUS=y/' ./.config - elif [ "$_processor_opt" == "cascadelake" ]; then - sed -i -e 's/# CONFIG_MCASCADELAKE 
is not set/CONFIG_MCASCADELAKE=y/' ./.config - elif [ "$_processor_opt" == "cooperlake" ]; then - sed -i -e 's/# CONFIG_MCOOPERLAKE is not set/CONFIG_MCOOPERLAKE=y/' ./.config - elif [ "$_processor_opt" == "tigerlake" ]; then - sed -i -e 's/# CONFIG_MTIGERLAKE is not set/CONFIG_MTIGERLAKE=y/' ./.config - fi - - # irq threading - if [ "$_irq_threading" == "true" ]; then - echo "CONFIG_FORCE_IRQ_THREADING=y" >> ./.config - elif [ "$_irq_threading" == "false" ]; then - echo "# CONFIG_FORCE_IRQ_THREADING is not set" >> ./.config - fi - - # smt nice - if [ "$_smt_nice" == "true" ]; then - echo "CONFIG_SMT_NICE=y" >> ./.config - elif [ "$_smt_nice" == "false" ]; then - echo "# CONFIG_SMT_NICE is not set" >> ./.config - fi - - # random trust cpu - if [ "$_random_trust_cpu" == "true" ]; then - sed -i -e 's/# CONFIG_RANDOM_TRUST_CPU is not set/CONFIG_RANDOM_TRUST_CPU=y/' ./.config - fi - - # rq sharing - if [ "$_runqueue_sharing" == "none" ]; then - echo -e "CONFIG_RQ_NONE=y\n# CONFIG_RQ_SMT is not set\n# CONFIG_RQ_MC is not set\n# CONFIG_RQ_MC_LLC is not set\n# CONFIG_RQ_SMP is not set\n# CONFIG_RQ_ALL is not set" >> ./.config - elif [ "$_runqueue_sharing" == "smt" ]; then - echo -e "# CONFIG_RQ_NONE is not set\nCONFIG_RQ_SMT=y\n# CONFIG_RQ_MC is not set\n# CONFIG_RQ_MC_LLC is not set\n# CONFIG_RQ_SMP is not set\n# CONFIG_RQ_ALL is not set" >> ./.config - elif [ "$_runqueue_sharing" == "mc" ]; then - echo -e "# CONFIG_RQ_NONE is not set\n# CONFIG_RQ_SMT is not set\nCONFIG_RQ_MC=y\n# CONFIG_RQ_MC_LLC is not set\n# CONFIG_RQ_SMP is not set\n# CONFIG_RQ_ALL is not set" >> ./.config - elif [ "$_runqueue_sharing" == "smp" ]; then - echo -e "# CONFIG_RQ_NONE is not set\n# CONFIG_RQ_SMT is not set\n# CONFIG_RQ_MC is not set\n# CONFIG_RQ_MC_LLC is not set\nCONFIG_RQ_SMP=y\n# CONFIG_RQ_ALL is not set" >> ./.config - elif [ "$_runqueue_sharing" == "all" ]; then - echo -e "# CONFIG_RQ_NONE is not set\n# CONFIG_RQ_SMT is not set\n# CONFIG_RQ_MC is not set\n# CONFIG_RQ_MC_LLC is not 
set\n# CONFIG_RQ_SMP is not set\nCONFIG_RQ_ALL=y" >> ./.config - elif [ "$_runqueue_sharing" == "mc-llc" ]; then - echo -e "# CONFIG_RQ_NONE is not set\n# CONFIG_RQ_SMT is not set\n# CONFIG_RQ_MC is not set\nCONFIG_RQ_MC_LLC=y\n# CONFIG_RQ_SMP is not set\n# CONFIG_RQ_ALL is not set" >> ./.config - fi - - # timer freq - if [ -n "$_timer_freq" ] && [ "$_timer_freq" != "300" ]; then - sed -i -e 's/CONFIG_HZ_300=y/# CONFIG_HZ_300 is not set/' ./.config - sed -i -e 's/CONFIG_HZ_300_NODEF=y/# CONFIG_HZ_300_NODEF is not set/' ./.config - if [ "$_timer_freq" == "1000" ]; then - sed -i -e 's/# CONFIG_HZ_1000 is not set/CONFIG_HZ_1000=y/' ./.config - sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=1000/' ./.config - echo "# CONFIG_HZ_500 is not set" >> ./.config - echo "# CONFIG_HZ_500_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_750 is not set" >> ./.config - echo "# CONFIG_HZ_750_NODEF is not set" >> ./.config - echo "CONFIG_HZ_1000_NODEF=y" >> ./.config - echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config - elif [ "$_timer_freq" == "750" ]; then - sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=750/' ./.config - echo "# CONFIG_HZ_500 is not set" >> ./.config - echo "# CONFIG_HZ_500_NODEF is not set" >> ./.config - echo "CONFIG_HZ_750=y" >> ./.config - echo "CONFIG_HZ_750_NODEF=y" >> ./.config - echo "# CONFIG_HZ_1000_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config - elif [ "$_timer_freq" == "500" ]; then - sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=500/' ./.config - echo "CONFIG_HZ_500=y" >> ./.config - echo "CONFIG_HZ_500_NODEF=y" >> ./.config - echo "# CONFIG_HZ_750 is not set" >> ./.config - echo "# CONFIG_HZ_750_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_1000_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config - elif [ "$_timer_freq" == "100" ]; 
then - sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=100/' ./.config - echo "# CONFIG_HZ_500 is not set" >> ./.config - echo "# CONFIG_HZ_750 is not set" >> ./.config - echo "# CONFIG_HZ_1000_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_750_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_500_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config - echo "CONFIG_HZ_100=y" >> ./.config - echo "CONFIG_HZ_100_NODEF=y" >> ./.config - fi - elif [ "${_cpusched}" == "MuQSS" ] && [ -z "$_timer_freq" ]; then - sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=100/' ./.config - echo "# CONFIG_HZ_500 is not set" >> ./.config - echo "# CONFIG_HZ_750 is not set" >> ./.config - echo "# CONFIG_HZ_1000_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_750_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_500_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config - echo "CONFIG_HZ_100=y" >> ./.config - echo "CONFIG_HZ_100_NODEF=y" >> ./.config - else - sed -i -e 's/CONFIG_HZ_300=y/# CONFIG_HZ_300 is not set/' ./.config - sed -i -e 's/CONFIG_HZ_300_NODEF=y/# CONFIG_HZ_300_NODEF is not set/' ./.config - sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=500/' ./.config - echo "CONFIG_HZ_500=y" >> ./.config - echo "CONFIG_HZ_500_NODEF=y" >> ./.config - echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config - fi - - # default cpu gov - if [ "$_default_cpu_gov" == "performance" ]; then - sed -i -e 's/CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL=y/# CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL is not set/' ./.config - sed -i -e 's/# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set/CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y/' ./.config - elif [ "$_default_cpu_gov" == "ondemand" ]; then - sed -i -e 's/CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL=y/# CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL is not set/' ./.config - sed 
-i -e 's/# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set/CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y/' ./.config - fi - - # ACPI_CPUFREQ disablement - if [ "$_disable_acpi_cpufreq" == "true" ]; then - sed -i -e 's/CONFIG_X86_ACPI_CPUFREQ=m/# CONFIG_X86_ACPI_CPUFREQ is not set/' ./.config - fi - - # ftrace - if [ -z "$_ftracedisable" ]; then - plain "" - plain "Disable FUNCTION_TRACER/GRAPH_TRACER? Lowers overhead but limits debugging" - plain "and analyzing of kernel functions." - read -rp "`echo $' > N/y : '`" CONDITION2; - fi - if [[ "$CONDITION2" =~ [yY] ]] || [ "$_ftracedisable" == "true" ]; then - sed -i -e 's/CONFIG_FUNCTION_TRACER=y/# CONFIG_FUNCTION_TRACER is not set/' ./.config - sed -i -e 's/CONFIG_FUNCTION_GRAPH_TRACER=y/# CONFIG_FUNCTION_GRAPH_TRACER is not set/' ./.config - fi - - # disable numa - if [ -z "$_numadisable" ]; then - plain "" - plain "Disable NUMA? Lowers overhead, but breaks CUDA/NvEnc on Nvidia if disabled." - plain "https://bbs.archlinux.org/viewtopic.php?id=239174" - read -rp "`echo $' > N/y : '`" CONDITION3; - fi - if [[ "$CONDITION3" =~ [yY] ]] || [ "$_numadisable" == "true" ]; then - # disable NUMA since 99.9% of users do not have multiple CPUs but do have multiple cores in one CPU - sed -i -e 's/CONFIG_NUMA=y/# CONFIG_NUMA is not set/' \ - -i -e '/CONFIG_AMD_NUMA=y/d' \ - -i -e '/CONFIG_X86_64_ACPI_NUMA=y/d' \ - -i -e '/CONFIG_NODES_SPAN_OTHER_NODES=y/d' \ - -i -e '/# CONFIG_NUMA_EMU is not set/d' \ - -i -e '/CONFIG_NODES_SHIFT=6/d' \ - -i -e '/CONFIG_NEED_MULTIPLE_NODES=y/d' \ - -i -e '/CONFIG_USE_PERCPU_NUMA_NODE_ID=y/d' \ - -i -e '/CONFIG_ACPI_NUMA=y/d' ./.config - fi - - # tickless - if [ -z "$_tickless" ]; then - plain "" - plain "Use CattaRappa mode (Tickless/Dynticks) ?" - plain "Can give higher performances in many cases but lower consistency on some hardware." - plain "Just tickless idle can perform better on some platforms (mostly AMD based)." 
- read -rp "`echo $'\n 0.No, use periodic ticks\n > 1.Yes, full tickless baby!\n 2.Just tickless idle plz\n [0-2?]: '`" CONDITION4; - fi - if [ "$CONDITION4" == "0" ] || [ "$_tickless" == "0" ]; then - echo "# CONFIG_NO_HZ_FULL_NODEF is not set" >> ./.config - sed -i -e 's/# CONFIG_HZ_PERIODIC is not set/CONFIG_HZ_PERIODIC=y/' ./.config - sed -i -e 's/CONFIG_NO_HZ_IDLE=y/# CONFIG_NO_HZ_IDLE is not set/' ./.config - sed -i -e 's/CONFIG_NO_HZ_FULL=y/# CONFIG_NO_HZ_FULL is not set/' ./.config - sed -i -e 's/CONFIG_NO_HZ=y/# CONFIG_NO_HZ is not set/' ./.config - sed -i -e 's/CONFIG_NO_HZ_COMMON=y/# CONFIG_NO_HZ_COMMON is not set/' ./.config - elif [ "$CONDITION4" == "2" ] || [ "$_tickless" == "2" ]; then - echo "# CONFIG_NO_HZ_FULL_NODEF is not set" >> ./.config - sed -i -e 's/CONFIG_HZ_PERIODIC=y/# CONFIG_HZ_PERIODIC is not set/' ./.config - sed -i -e 's/# CONFIG_NO_HZ_IDLE is not set/CONFIG_NO_HZ_IDLE=y/' ./.config - sed -i -e 's/CONFIG_NO_HZ_FULL=y/# CONFIG_NO_HZ_FULL is not set/' ./.config - sed -i -e 's/# CONFIG_NO_HZ is not set/CONFIG_NO_HZ=y/' ./.config - sed -i -e 's/# CONFIG_NO_HZ_COMMON is not set/CONFIG_NO_HZ_COMMON=y/' ./.config - else - echo "CONFIG_NO_HZ_FULL_NODEF=y" >> ./.config - sed -i -e 's/CONFIG_HZ_PERIODIC=y/# CONFIG_HZ_PERIODIC is not set/' ./.config - sed -i -e 's/CONFIG_NO_HZ_IDLE=y/# CONFIG_NO_HZ_IDLE is not set/' ./.config - sed -i -e 's/# CONFIG_NO_HZ_FULL is not set/CONFIG_NO_HZ_FULL=y/' ./.config - sed -i -e 's/# CONFIG_NO_HZ is not set/CONFIG_NO_HZ=y/' ./.config - sed -i -e 's/# CONFIG_NO_HZ_COMMON is not set/CONFIG_NO_HZ_COMMON=y/' ./.config - echo "CONFIG_CONTEXT_TRACKING=y" >> ./.config - echo "# CONFIG_CONTEXT_TRACKING_FORCE is not set" >> ./.config - fi - - # voluntary preempt - if [ -z "$_voluntary_preempt" ]; then - plain "" - plain "Use explicit preemption points?" 
- plain "It can improve latency on PDS (at the cost of throughput)" - plain "and improve throughput on other schedulers (at the cost of latency)" - read -rp "`echo $' > N/y : '`" CONDITION5; - fi - if [[ "$CONDITION5" =~ [yY] ]] || [ "$_voluntary_preempt" == "true" ]; then - sed -i -e 's/CONFIG_PREEMPT=y/# CONFIG_PREEMPT is not set/' ./.config - sed -i -e 's/CONFIG_PREEMPT_LL=y/# CONFIG_PREEMPT_LL is not set/' ./.config - sed -i -e 's/# CONFIG_PREEMPT_VOLUNTARY is not set/CONFIG_PREEMPT_VOLUNTARY=y/' ./.config - fi - - # Open Firmware support - if [ -z "$_OFenable" ]; then - plain "" - plain "Enable Device Tree and Open Firmware support?" - read -rp "`echo $' > N/y : '`" CONDITION6; - fi - if [[ "$CONDITION6" =~ [yY] ]] || [ "$_OFenable" == "true" ]; then - sed -i -e 's/# CONFIG_OF is not set/CONFIG_OF=y/' ./.config - fi - - # acs override - if [ -z "$_acs_override" ]; then - plain "" - plain "Use ACS override patch?" - plain "https://wiki.archlinux.org/index.php/PCI_passthrough_via_OVMF#Bypassing_the_IOMMU_groups_.28ACS_override_patch.29" - read -rp "`echo $' > N/y : '`" CONDITION7; - fi - if [[ "$CONDITION7" =~ [yY] ]] || [ "$_acs_override" == "true" ]; then - patch -Np1 -i ../0006-add-acs-overrides_iommu.patch - fi - - # bcachefs -# if [ -z "$_bcachefs" ]; then -# plain "" -# plain "Add Bcache filesystem support? You'll have to install bcachefs-tools-git from AUR for utilities." 
-# plain "https://bcachefs.org/" -# read -rp "`echo $' > N/y : '`" CONDITION8; -# fi -# if [[ "$CONDITION8" =~ [yY] ]] || [ "$_bcachefs" == "true" ]; then -# patch -Np1 -i ../0008-5.4-bcachefs.patch -# echo "CONFIG_BCACHEFS_FS=m" >> ./.config -# echo "CONFIG_BCACHEFS_QUOTA=y" >> ./.config -# echo "CONFIG_BCACHEFS_POSIX_ACL=y" >> ./.config -# echo "# CONFIG_BCACHEFS_DEBUG is not set" >> ./.config -# echo "# CONFIG_BCACHEFS_TESTS is not set" >> ./.config -# echo "# CONFIG_DEBUG_CLOSURES is not set" >> ./.config -# fi - - # fsync support - if [ -z "$_fsync" ]; then - plain "" - plain "Enable support for fsync, an experimental replacement for esync in Valve Proton 4.11+" - plain "https://steamcommunity.com/games/221410/announcements/detail/2957094910196249305" - read -rp "`echo $' > N/y : '`" CONDITION9; - fi - if [[ "$CONDITION9" =~ [yY] ]] || [ "$_fsync" == "true" ]; then - patch -Np1 -i ../0007-v5.4-fsync.patch - fi - - # ZFS fix - if [ -z "$_zfsfix" ]; then - plain "" - plain "Add back missing symbol for AES-NI/AVX support on ZFS" - plain "https://github.com/NixOS/nixpkgs/blob/master/pkgs/os-specific/linux/kernel/export_kernel_fpu_functions_5_3.patch" - read -rp "`echo $' > N/y : '`" CONDITION11; - fi - if [[ "$CONDITION11" =~ [yY] ]] || [ "$_zfsfix" == "true" ]; then - patch -Np1 -i ../0011-ZFS-fix.patch - fi - - # Community patches - if [ -n "$_community_patches" ]; then - if [ ! -d "$_where/../../community-patches" ]; then - cd "$_where/../.." 
&& git clone https://github.com/Frogging-Family/community-patches.git && cd "${srcdir}/linux-${_basekernel}" - fi - _community_patches=($_community_patches) - for _p in ${_community_patches[@]}; do - ln -s "$_where"/../../community-patches/linux54-tkg/$_p "$_where"/ - done - fi - - # userpatches - if [ "$_user_patches" == "true" ]; then - _userpatch_target="linux-${_basekernel}" - _userpatch_ext="my" - user_patcher - fi - - # Community patches removal - for _p in ${_community_patches[@]}; do - rm -f "$_where"/$_p - done - - # don't run depmod on 'make install'. We'll do this ourselves in packaging - sed -i '2iexit 0' scripts/depmod.sh - - # get kernel version - make prepare - - # modprobed-db - if [ -z "$_modprobeddb" ]; then - plain "" - plain "Use modprobed db to clean config from unneeded modules?" - plain "Speeds up compilation considerably. Requires root." - plain "https://wiki.archlinux.org/index.php/Modprobed-db" - plain "!!!! Make sure to have a well populated db !!!!" - read -rp "`echo $' > N/y : '`" CONDITIONMPDB; - fi - if [[ "$CONDITIONMPDB" =~ [yY] ]] || [ "$_modprobeddb" == "true" ]; then - sudo modprobed-db recall - yes "" | make localmodconfig - fi - - if [ true = "$_config_fragments" ]; then - local fragments=() - mapfile -d '' -t fragments < <(find "$_where"/ -type f -name "*.myfrag" -print0) - - if [ true = "$_config_fragments_no_confirm" ]; then - printf 'Using config fragment %s\n' "${fragments[@]#$_where/}" - else - for i in "${!fragments[@]}"; do - while true; do - read -r -p 'Found config fragment '"${fragments[$i]#$_where/}"', apply it? 
[y/N] ' CONDITIONMPDB - CONDITIONMPDB="$(printf '%s' "$CONDITIONMPDB" | tr '[:upper:]' '[:lower:]')" - case "$CONDITIONMPDB" in - y|yes) - break;; - n|no|'') - unset fragments[$i] - break;; - *) - echo 'Please answer with yes or no' - esac - done - done - fi - - if [ 0 -lt "${#fragments[@]}" ]; then - scripts/kconfig/merge_config.sh -m .config "${fragments[@]}" - fi - fi - - # menuconfig / nconfig - if [ -z "$_menunconfig" ]; then - plain "" - plain "*Optional* For advanced users - Do you want to use make menuconfig or nconfig" - plain "to configure the kernel before building it?" - plain "If you do, make sure your terminal is currently" - plain "at least 19 lines by 80 columns large or you'll get an error :D" - read -rp "`echo $' > 0. nope\n 1. menuconfig\n 2. nconfig\n choice[0-2?]: '`" CONDITIONMNC; - _menunconfig="$CONDITIONMNC" - fi - if [ 1 = "$_menunconfig" ]; then - cp .config .config.orig - make menuconfig - elif [ 2 = "$_menunconfig" ]; then - cp .config .config.orig - make nconfig - else - # rewrite configuration - yes "" | make config >/dev/null - fi - if [ 1 = "$_menunconfig" ] || [ 2 = "$_menunconfig" ]; then - if [ -z "${_diffconfig}" ]; then - while true; do - read -r -p 'Generate a config fragment from your changes? [y/N] ' CONDITIONF - CONDITIONF="$(printf '%s' "$CONDITIONF" | tr '[:upper:]' '[:lower:]')" - case "$CONDITIONF" in - y|yes) - _diffconfig=true - break;; - n|no|'') - _diffconfig=false - break;; - *) - echo 'Please answer with yes or no' - esac - done - fi - if [ true = "$_diffconfig" ]; then - if [ -z "$_diffconfig_name" ]; then - IFS= read -r -p 'Filename for the config fragment [leave empty to not generate fragment]: ' _diffconfig_name - fi - if [ -z "$_diffconfig_name" ]; then - echo 'No file name given, not generating config fragment.' 
- else ( - prev_pwd="${PWD:-$(pwd)}" - cd "$_where" - "${prev_pwd}/scripts/diffconfig" -m "${prev_pwd}/.config.orig" "${prev_pwd}/.config" > "$_diffconfig_name" - ) fi - fi - rm .config.orig - fi - - make -s kernelrelease > version - msg2 "Prepared %s version %s" "$pkgbase" "$( -From: Serge Hallyn -Date: Fri, 31 May 2013 19:12:12 +0100 -Subject: [PATCH] add sysctl to disallow unprivileged CLONE_NEWUSER by default - -Signed-off-by: Serge Hallyn -[bwh: Remove unneeded binary sysctl bits] -Signed-off-by: Daniel Micay ---- - kernel/fork.c | 15 +++++++++++++++ - kernel/sysctl.c | 12 ++++++++++++ - kernel/user_namespace.c | 3 +++ - 3 files changed, 30 insertions(+) - -diff --git a/kernel/fork.c b/kernel/fork.c -index 07cc743698d3668e..4011d68a8ff9305c 100644 ---- a/kernel/fork.c -+++ b/kernel/fork.c -@@ -102,6 +102,11 @@ - - #define CREATE_TRACE_POINTS - #include -+#ifdef CONFIG_USER_NS -+extern int unprivileged_userns_clone; -+#else -+#define unprivileged_userns_clone 0 -+#endif - - /* - * Minimum number of threads to boot the kernel -@@ -1555,6 +1560,10 @@ static __latent_entropy struct task_struct *copy_process( - if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) - return ERR_PTR(-EINVAL); - -+ if ((clone_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) -+ if (!capable(CAP_SYS_ADMIN)) -+ return ERR_PTR(-EPERM); -+ - /* - * Thread groups must share signals as well, and detached threads - * can only be started up within the thread group. 
-@@ -2348,6 +2357,12 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) - if (unshare_flags & CLONE_NEWNS) - unshare_flags |= CLONE_FS; - -+ if ((unshare_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) { -+ err = -EPERM; -+ if (!capable(CAP_SYS_ADMIN)) -+ goto bad_unshare_out; -+ } -+ - err = check_unshare_flags(unshare_flags); - if (err) - goto bad_unshare_out; -diff --git a/kernel/sysctl.c b/kernel/sysctl.c -index b86520ed3fb60fbf..f7dab3760839f1a1 100644 ---- a/kernel/sysctl.c -+++ b/kernel/sysctl.c -@@ -105,6 +105,9 @@ extern int core_uses_pid; - extern char core_pattern[]; - extern unsigned int core_pipe_limit; - #endif -+#ifdef CONFIG_USER_NS -+extern int unprivileged_userns_clone; -+#endif - extern int pid_max; - extern int pid_max_min, pid_max_max; - extern int percpu_pagelist_fraction; -@@ -513,6 +516,15 @@ static struct ctl_table kern_table[] = { - .proc_handler = proc_dointvec, - }, - #endif -+#ifdef CONFIG_USER_NS -+ { -+ .procname = "unprivileged_userns_clone", -+ .data = &unprivileged_userns_clone, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = proc_dointvec, -+ }, -+#endif - #ifdef CONFIG_PROC_SYSCTL - { - .procname = "tainted", -diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c -index c490f1e4313b998a..dd03bd39d7bf194d 100644 ---- a/kernel/user_namespace.c -+++ b/kernel/user_namespace.c -@@ -24,6 +24,9 @@ - #include - #include - -+/* sysctl */ -+int unprivileged_userns_clone; -+ - static struct kmem_cache *user_ns_cachep __read_mostly; - static DEFINE_MUTEX(userns_state_mutex); - --- -2.15.1 - -From b5202296055dd333db4425120d3f93ef4e6a0573 Mon Sep 17 00:00:00 2001 -From: "Jan Alexander Steffens (heftig)" -Date: Thu, 7 Dec 2017 13:50:48 +0100 -Subject: ZEN: Add CONFIG for unprivileged_userns_clone - -This way our default behavior continues to match the vanilla kernel. 
---- - init/Kconfig | 16 ++++++++++++++++ - kernel/user_namespace.c | 4 ++++ - 2 files changed, 20 insertions(+) - -diff --git a/init/Kconfig b/init/Kconfig -index 4592bf7997c0..f3df02990aff 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -1004,6 +1004,22 @@ config USER_NS - - If unsure, say N. - -+config USER_NS_UNPRIVILEGED -+ bool "Allow unprivileged users to create namespaces" -+ default y -+ depends on USER_NS -+ help -+ When disabled, unprivileged users will not be able to create -+ new namespaces. Allowing users to create their own namespaces -+ has been part of several recent local privilege escalation -+ exploits, so if you need user namespaces but are -+ paranoid^Wsecurity-conscious you want to disable this. -+ -+ This setting can be overridden at runtime via the -+ kernel.unprivileged_userns_clone sysctl. -+ -+ If unsure, say Y. -+ - config PID_NS - bool "PID Namespaces" - default y -diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c -index 6b9dbc257e34..107b17f0d528 100644 ---- a/kernel/user_namespace.c -+++ b/kernel/user_namespace.c -@@ -27,7 +27,11 @@ - #include - - /* sysctl */ -+#ifdef CONFIG_USER_NS_UNPRIVILEGED -+int unprivileged_userns_clone = 1; -+#else - int unprivileged_userns_clone; -+#endif - - static struct kmem_cache *user_ns_cachep __read_mostly; - static DEFINE_MUTEX(userns_state_mutex); diff --git a/linux54-tkg/linux54-tkg-patches/0002-clear-patches.patch b/linux54-tkg/linux54-tkg-patches/0002-clear-patches.patch deleted file mode 100644 index a7c9d4a..0000000 --- a/linux54-tkg/linux54-tkg-patches/0002-clear-patches.patch +++ /dev/null @@ -1,354 +0,0 @@ -From 2ac70785613ef4c6b16414986bb18bd7b60d2a13 Mon Sep 17 00:00:00 2001 -From: Arjan van de Ven -Date: Mon, 14 Mar 2016 11:10:58 -0600 -Subject: [PATCH] pci pme wakeups - -Reduce wakeups for PME checks, which are a workaround for miswired -boards (sadly, too many of them) in laptops. 
---- - drivers/pci/pci.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c -index c25acace7d91..0ddebdad9f5b 100644 ---- a/drivers/pci/pci.c -+++ b/drivers/pci/pci.c -@@ -61,7 +61,7 @@ struct pci_pme_device { - struct pci_dev *dev; - }; - --#define PME_TIMEOUT 1000 /* How long between PME checks */ -+#define PME_TIMEOUT 4000 /* How long between PME checks */ - - static void pci_dev_d3_sleep(struct pci_dev *dev) - { --- -2.20.1 - -From 7e7e36c67aa71d6a1ec5676d99d37c1fea389ceb Mon Sep 17 00:00:00 2001 -From: Arjan van de Ven -Date: Sat, 19 Mar 2016 21:32:19 -0400 -Subject: [PATCH] intel_idle: tweak cpuidle cstates - -Increase target_residency in cpuidle cstate - -Tune intel_idle to be a bit less agressive; -Clear linux is cleaner in hygiene (wakupes) than the average linux, -so we can afford changing these in a way that increases -performance while keeping power efficiency ---- - drivers/idle/intel_idle.c | 44 +++++++++++++++++++-------------------- - 1 file changed, 22 insertions(+), 22 deletions(-) - -diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c -index 8b5d85c91e9d..5e2d813a048d 100644 ---- a/drivers/idle/intel_idle.c -+++ b/drivers/idle/intel_idle.c -@@ -466,7 +466,7 @@ static struct cpuidle_state hsw_cstates[] = { - .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01), - .exit_latency = 10, -- .target_residency = 20, -+ .target_residency = 120, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -474,7 +474,7 @@ static struct cpuidle_state hsw_cstates[] = { - .desc = "MWAIT 0x10", - .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 33, -- .target_residency = 100, -+ .target_residency = 900, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -482,7 +482,7 @@ static struct cpuidle_state hsw_cstates[] = { - .desc = "MWAIT 0x20", - .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 133, -- .target_residency = 
400, -+ .target_residency = 1000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -490,7 +490,7 @@ static struct cpuidle_state hsw_cstates[] = { - .desc = "MWAIT 0x32", - .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 166, -- .target_residency = 500, -+ .target_residency = 1500, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -498,7 +498,7 @@ static struct cpuidle_state hsw_cstates[] = { - .desc = "MWAIT 0x40", - .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 300, -- .target_residency = 900, -+ .target_residency = 2000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -506,7 +506,7 @@ static struct cpuidle_state hsw_cstates[] = { - .desc = "MWAIT 0x50", - .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 600, -- .target_residency = 1800, -+ .target_residency = 5000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -514,7 +514,7 @@ static struct cpuidle_state hsw_cstates[] = { - .desc = "MWAIT 0x60", - .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 2600, -- .target_residency = 7700, -+ .target_residency = 9000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -534,7 +534,7 @@ static struct cpuidle_state bdw_cstates[] = { - .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01), - .exit_latency = 10, -- .target_residency = 20, -+ .target_residency = 120, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -542,7 +542,7 @@ static struct cpuidle_state bdw_cstates[] = { - .desc = "MWAIT 0x10", - .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 40, -- .target_residency = 100, -+ .target_residency = 1000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -550,7 +550,7 @@ static struct cpuidle_state bdw_cstates[] = { - .desc = "MWAIT 0x20", - .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency 
= 133, -- .target_residency = 400, -+ .target_residency = 1000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -558,7 +558,7 @@ static struct cpuidle_state bdw_cstates[] = { - .desc = "MWAIT 0x32", - .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 166, -- .target_residency = 500, -+ .target_residency = 2000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -566,7 +566,7 @@ static struct cpuidle_state bdw_cstates[] = { - .desc = "MWAIT 0x40", - .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 300, -- .target_residency = 900, -+ .target_residency = 4000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -574,7 +574,7 @@ static struct cpuidle_state bdw_cstates[] = { - .desc = "MWAIT 0x50", - .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 600, -- .target_residency = 1800, -+ .target_residency = 7000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -582,7 +582,7 @@ static struct cpuidle_state bdw_cstates[] = { - .desc = "MWAIT 0x60", - .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 2600, -- .target_residency = 7700, -+ .target_residency = 9000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -603,7 +603,7 @@ static struct cpuidle_state skl_cstates[] = { - .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01), - .exit_latency = 10, -- .target_residency = 20, -+ .target_residency = 120, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -611,7 +611,7 @@ static struct cpuidle_state skl_cstates[] = { - .desc = "MWAIT 0x10", - .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 70, -- .target_residency = 100, -+ .target_residency = 1000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -619,7 +619,7 @@ static struct cpuidle_state skl_cstates[] = { - .desc = "MWAIT 0x20", - .flags = MWAIT2flg(0x20) | 
CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 85, -- .target_residency = 200, -+ .target_residency = 600, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -627,7 +627,7 @@ static struct cpuidle_state skl_cstates[] = { - .desc = "MWAIT 0x33", - .flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 124, -- .target_residency = 800, -+ .target_residency = 3000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -635,7 +635,7 @@ static struct cpuidle_state skl_cstates[] = { - .desc = "MWAIT 0x40", - .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 200, -- .target_residency = 800, -+ .target_residency = 3200, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -643,7 +643,7 @@ static struct cpuidle_state skl_cstates[] = { - .desc = "MWAIT 0x50", - .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 480, -- .target_residency = 5000, -+ .target_residency = 9000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -651,7 +651,7 @@ static struct cpuidle_state skl_cstates[] = { - .desc = "MWAIT 0x60", - .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 890, -- .target_residency = 5000, -+ .target_residency = 9000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -672,7 +672,7 @@ static struct cpuidle_state skx_cstates[] = { - .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01), - .exit_latency = 10, -- .target_residency = 20, -+ .target_residency = 300, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { --- -2.20.1 - -From b8211d4f79dd88dfc2d4bd52be46103ea0b70e3e Mon Sep 17 00:00:00 2001 -From: Arjan van de Ven -Date: Fri, 6 Jan 2017 15:34:09 +0000 -Subject: [PATCH] ipv4/tcp: allow the memory tuning for tcp to go a little - bigger than default - ---- - net/ipv4/tcp.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c -index 
cf3c5095c10e..b30d51837b2d 100644 ---- a/net/ipv4/tcp.c -+++ b/net/ipv4/tcp.c -@@ -3897,8 +3897,8 @@ void __init tcp_init(void) - tcp_init_mem(); - /* Set per-socket limits to no more than 1/128 the pressure threshold */ - limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); -- max_wshare = min(4UL*1024*1024, limit); -- max_rshare = min(6UL*1024*1024, limit); -+ max_wshare = min(16UL*1024*1024, limit); -+ max_rshare = min(16UL*1024*1024, limit); - - init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; - init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; --- -2.20.1 - -From 050223869257b87e22636158a80da38d877248ed Mon Sep 17 00:00:00 2001 -From: Arjan van de Ven -Date: Sun, 18 Feb 2018 23:35:41 +0000 -Subject: [PATCH] locking: rwsem: spin faster - -tweak rwsem owner spinning a bit ---- - kernel/locking/rwsem.c | 4 +++- - 1 file changed, 3 insertions(+), 1 deletion(-) - -diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c -index eef04551eae7..1ec5ab4c8ff7 100644 ---- a/kernel/locking/rwsem.c -+++ b/kernel/locking/rwsem.c -@@ -720,6 +720,7 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) - struct task_struct *new, *owner; - unsigned long flags, new_flags; - enum owner_state state; -+ int i = 0; - - owner = rwsem_owner_flags(sem, &flags); - state = rwsem_owner_state(owner, flags, nonspinnable); -@@ -753,7 +754,8 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) - break; - } - -- cpu_relax(); -+ if (i++ > 1000) -+ cpu_relax(); - } - rcu_read_unlock(); - -From b836ea320114643d4354b43acb6ec8bb06ada487 Mon Sep 17 00:00:00 2001 -From: Arjan van de Ven -Date: Thu, 2 Jun 2016 23:36:32 -0500 -Subject: [PATCH] drivers: Initialize ata before graphics - -ATA init is the long pole in the boot process, and its asynchronous. 
-move the graphics init after it so that ata and graphics initialize -in parallel ---- - drivers/Makefile | 15 ++++++++------- - 1 file changed, 8 insertions(+), 7 deletions(-) - -diff --git a/drivers/Makefile b/drivers/Makefile -index aaef17cc6512..d08f3a394929 100644 ---- a/drivers/Makefile -+++ b/drivers/Makefile -@@ -58,15 +58,8 @@ obj-y += char/ - # iommu/ comes before gpu as gpu are using iommu controllers - obj-y += iommu/ - --# gpu/ comes after char for AGP vs DRM startup and after iommu --obj-y += gpu/ -- - obj-$(CONFIG_CONNECTOR) += connector/ - --# i810fb and intelfb depend on char/agp/ --obj-$(CONFIG_FB_I810) += video/fbdev/i810/ --obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ -- - obj-$(CONFIG_PARPORT) += parport/ - obj-$(CONFIG_NVM) += lightnvm/ - obj-y += base/ block/ misc/ mfd/ nfc/ -@@ -79,6 +72,14 @@ obj-$(CONFIG_IDE) += ide/ - obj-y += scsi/ - obj-y += nvme/ - obj-$(CONFIG_ATA) += ata/ -+ -+# gpu/ comes after char for AGP vs DRM startup and after iommu -+obj-y += gpu/ -+ -+# i810fb and intelfb depend on char/agp/ -+obj-$(CONFIG_FB_I810) += video/fbdev/i810/ -+obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ -+ - obj-$(CONFIG_TARGET_CORE) += target/ - obj-$(CONFIG_MTD) += mtd/ - obj-$(CONFIG_SPI) += spi/ diff --git a/linux54-tkg/linux54-tkg-patches/0003-glitched-base.patch b/linux54-tkg/linux54-tkg-patches/0003-glitched-base.patch deleted file mode 100644 index 4cbf12d..0000000 --- a/linux54-tkg/linux54-tkg-patches/0003-glitched-base.patch +++ /dev/null @@ -1,4612 +0,0 @@ -From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 -From: Tk-Glitch -Date: Wed, 4 Jul 2018 04:30:08 +0200 -Subject: glitched - -diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h -index 87f1fc9..b3be470 100755 ---- a/scripts/mkcompile_h -+++ b/scripts/mkcompile_h -@@ -50,8 +50,8 @@ else - fi - - UTS_VERSION="#$VERSION" --CONFIG_FLAGS="" --if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi -+CONFIG_FLAGS="TKG" -+if [ -n "$SMP" ] ; then 
CONFIG_FLAGS="$CONFIG_FLAGS SMP"; fi - if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi - UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP" - -diff --git a/fs/dcache.c b/fs/dcache.c -index 2acfc69878f5..3f1131431e06 100644 ---- a/fs/dcache.c -+++ b/fs/dcache.c -@@ -69,7 +69,7 @@ - * If no ancestor relationship: - * arbitrary, since it's serialized on rename_lock - */ --int sysctl_vfs_cache_pressure __read_mostly = 100; -+int sysctl_vfs_cache_pressure __read_mostly = 50; - EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); - - __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); -diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index 211890edf37e..37121563407d 100644 ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c -@@ -41,7 +41,7 @@ const_debug unsigned int sysctl_sched_features = - * Number of tasks to iterate in a single balance run. - * Limited because this is done with IRQs disabled. - */ --const_debug unsigned int sysctl_sched_nr_migrate = 32; -+const_debug unsigned int sysctl_sched_nr_migrate = 128; - - /* - * period over which we average the RT time consumption, measured -@@ -61,9 +61,9 @@ __read_mostly int scheduler_running; - - /* - * part of the period that we allow rt tasks to run in us. -- * default: 0.95s -+ * XanMod default: 0.98s - */ --int sysctl_sched_rt_runtime = 950000; -+int sysctl_sched_rt_runtime = 980000; - - /* - * __task_rq_lock - lock the rq @p resides on. -diff --git a/lib/Kconfig b/lib/Kconfig -index 5fe577673b98..c44c27cd6e05 100644 ---- a/lib/Kconfig -+++ b/lib/Kconfig -@@ -10,6 +10,16 @@ menu "Library routines" - config RAID6_PQ - tristate - -+config RAID6_USE_PREFER_GEN -+ bool "Use prefered raid6 gen function." -+ default n -+ depends on RAID6_PQ -+ help -+ This option is provided for using prefered raid6 gen function -+ directly instead of calculating the best durning boot-up. -+ The prefered function should be the same as the best one from -+ calculating. 
-+ - config BITREVERSE - tristate - -diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c -index 5065b1e7e327..1bf3c712a4ca 100644 ---- a/lib/raid6/algos.c -+++ b/lib/raid6/algos.c -@@ -150,6 +150,29 @@ static inline const struct raid6_recov_calls *raid6_choose_recov(void) - return best; - } - -+#ifdef CONFIG_RAID6_USE_PREFER_GEN -+static inline const struct raid6_calls *raid6_choose_prefer_gen(void) -+{ -+ const struct raid6_calls *const *algo; -+ const struct raid6_calls *best; -+ -+ for (best = NULL, algo = raid6_algos; *algo; algo++) { -+ if (!best || (*algo)->prefer >= best->prefer) { -+ if ((*algo)->valid && !(*algo)->valid()) -+ continue; -+ best = *algo; -+ } -+ } -+ -+ if (best) { -+ printk("raid6: using algorithm %s\n", best->name); -+ raid6_call = *best; -+ } else -+ printk("raid6: Yikes! No algorithm found!\n"); -+ -+ return best; -+} -+#else - static inline const struct raid6_calls *raid6_choose_gen( - void *(*const dptrs)[(65536/PAGE_SIZE)+2], const int disks) - { -@@ -221,6 +244,7 @@ static inline const struct raid6_calls *raid6_choose_gen( - - return best; - } -+#endif - - - /* Try to pick the best algorithm */ -@@ -228,10 +252,11 @@ static inline const struct raid6_calls *raid6_choose_gen( - - int __init raid6_select_algo(void) - { -- const int disks = (65536/PAGE_SIZE)+2; -- - const struct raid6_calls *gen_best; - const struct raid6_recov_calls *rec_best; -+#ifndef CONFIG_RAID6_USE_PREFER_GEN -+ const int disks = (65536/PAGE_SIZE)+2; -+ - char *syndromes; - void *dptrs[(65536/PAGE_SIZE)+2]; - int i; -@@ -252,11 +277,16 @@ int __init raid6_select_algo(void) - - /* select raid gen_syndrome function */ - gen_best = raid6_choose_gen(&dptrs, disks); -+#else -+ gen_best = raid6_choose_prefer_gen(); -+#endif - - /* select raid recover functions */ - rec_best = raid6_choose_recov(); - -+#ifndef CONFIG_RAID6_USE_PREFER_GEN - free_pages((unsigned long)syndromes, 1); -+#endif - - return gen_best && rec_best ? 
0 : -EINVAL; - } -diff --git a/mm/zswap.c b/mm/zswap.c -index 61a5c41972db..2674c2806130 100644 ---- a/mm/zswap.c -+++ b/mm/zswap.c -@@ -91,7 +91,7 @@ static struct kernel_param_ops zswap_enabled_param_ops = { - module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644); - - /* Crypto compressor to use */ --#define ZSWAP_COMPRESSOR_DEFAULT "lzo" -+#define ZSWAP_COMPRESSOR_DEFAULT "lz4" - static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; - static int zswap_compressor_param_set(const char *, - const struct kernel_param *); -diff --git a/scripts/setlocalversion b/scripts/setlocalversion -index 71f39410691b..288f9679e883 100755 ---- a/scripts/setlocalversion -+++ b/scripts/setlocalversion -@@ -54,7 +54,7 @@ scm_version() - # If only the short version is requested, don't bother - # running further git commands - if $short; then -- echo "+" -+ # echo "+" - return - fi - # If we are past a tagged commit (like - -From f85ed068b4d0e6c31edce8574a95757a60e58b87 Mon Sep 17 00:00:00 2001 -From: Etienne Juvigny -Date: Mon, 3 Sep 2018 17:36:25 +0200 -Subject: Zenify & stuff - - -diff --git a/Documentation/tp_smapi.txt b/Documentation/tp_smapi.txt -new file mode 100644 -index 000000000000..a249678a8866 ---- /dev/null -+++ b/Documentation/tp_smapi.txt -@@ -0,0 +1,275 @@ -+tp_smapi version 0.42 -+IBM ThinkPad hardware functions driver -+ -+Author: Shem Multinymous -+Project: http://sourceforge.net/projects/tpctl -+Wiki: http://thinkwiki.org/wiki/tp_smapi -+List: linux-thinkpad@linux-thinkpad.org -+ (http://mailman.linux-thinkpad.org/mailman/listinfo/linux-thinkpad) -+ -+Description -+----------- -+ -+ThinkPad laptops include a proprietary interface called SMAPI BIOS -+(System Management Application Program Interface) which provides some -+hardware control functionality that is not accessible by other means. -+ -+This driver exposes some features of the SMAPI BIOS through a sysfs -+interface. 
It is suitable for newer models, on which SMAPI is invoked -+through IO port writes. Older models use a different SMAPI interface; -+for those, try the "thinkpad" module from the "tpctl" package. -+ -+WARNING: -+This driver uses undocumented features and direct hardware access. -+It thus cannot be guaranteed to work, and may cause arbitrary damage -+(especially on models it wasn't tested on). -+ -+ -+Module parameters -+----------------- -+ -+thinkpad_ec module: -+ force_io=1 lets thinkpad_ec load on some recent ThinkPad models -+ (e.g., T400 and T500) whose BIOS's ACPI DSDT reserves the ports we need. -+tp_smapi module: -+ debug=1 enables verbose dmesg output. -+ -+ -+Usage -+----- -+ -+Control of battery charging thresholds (in percents of current full charge -+capacity): -+ -+# echo 40 > /sys/devices/platform/smapi/BAT0/start_charge_thresh -+# echo 70 > /sys/devices/platform/smapi/BAT0/stop_charge_thresh -+# cat /sys/devices/platform/smapi/BAT0/*_charge_thresh -+ -+ (This is useful since Li-Ion batteries wear out much faster at very -+ high or low charge levels. The driver will also keeps the thresholds -+ across suspend-to-disk with AC disconnected; this isn't done -+ automatically by the hardware.) -+ -+Inhibiting battery charging for 17 minutes (overrides thresholds): -+ -+# echo 17 > /sys/devices/platform/smapi/BAT0/inhibit_charge_minutes -+# echo 0 > /sys/devices/platform/smapi/BAT0/inhibit_charge_minutes # stop -+# cat /sys/devices/platform/smapi/BAT0/inhibit_charge_minutes -+ -+ (This can be used to control which battery is charged when using an -+ Ultrabay battery.) 
-+ -+Forcing battery discharging even if AC power available: -+ -+# echo 1 > /sys/devices/platform/smapi/BAT0/force_discharge # start discharge -+# echo 0 > /sys/devices/platform/smapi/BAT0/force_discharge # stop discharge -+# cat /sys/devices/platform/smapi/BAT0/force_discharge -+ -+ (When AC is connected, forced discharging will automatically stop -+ when battery is fully depleted -- this is useful for calibration. -+ Also, this attribute can be used to control which battery is discharged -+ when both a system battery and an Ultrabay battery are connected.) -+ -+Misc read-only battery status attributes (see note about HDAPS below): -+ -+/sys/devices/platform/smapi/BAT0/installed # 0 or 1 -+/sys/devices/platform/smapi/BAT0/state # idle/charging/discharging -+/sys/devices/platform/smapi/BAT0/cycle_count # integer counter -+/sys/devices/platform/smapi/BAT0/current_now # instantaneous current -+/sys/devices/platform/smapi/BAT0/current_avg # last minute average -+/sys/devices/platform/smapi/BAT0/power_now # instantaneous power -+/sys/devices/platform/smapi/BAT0/power_avg # last minute average -+/sys/devices/platform/smapi/BAT0/last_full_capacity # in mWh -+/sys/devices/platform/smapi/BAT0/remaining_percent # remaining percent of energy (set by calibration) -+/sys/devices/platform/smapi/BAT0/remaining_percent_error # error range of remaing_percent (not reset by calibration) -+/sys/devices/platform/smapi/BAT0/remaining_running_time # in minutes, by last minute average power -+/sys/devices/platform/smapi/BAT0/remaining_running_time_now # in minutes, by instantenous power -+/sys/devices/platform/smapi/BAT0/remaining_charging_time # in minutes -+/sys/devices/platform/smapi/BAT0/remaining_capacity # in mWh -+/sys/devices/platform/smapi/BAT0/design_capacity # in mWh -+/sys/devices/platform/smapi/BAT0/voltage # in mV -+/sys/devices/platform/smapi/BAT0/design_voltage # in mV -+/sys/devices/platform/smapi/BAT0/charging_max_current # max charging current 
-+/sys/devices/platform/smapi/BAT0/charging_max_voltage # max charging voltage -+/sys/devices/platform/smapi/BAT0/group{0,1,2,3}_voltage # see below -+/sys/devices/platform/smapi/BAT0/manufacturer # string -+/sys/devices/platform/smapi/BAT0/model # string -+/sys/devices/platform/smapi/BAT0/barcoding # string -+/sys/devices/platform/smapi/BAT0/chemistry # string -+/sys/devices/platform/smapi/BAT0/serial # integer -+/sys/devices/platform/smapi/BAT0/manufacture_date # YYYY-MM-DD -+/sys/devices/platform/smapi/BAT0/first_use_date # YYYY-MM-DD -+/sys/devices/platform/smapi/BAT0/temperature # in milli-Celsius -+/sys/devices/platform/smapi/BAT0/dump # see below -+/sys/devices/platform/smapi/ac_connected # 0 or 1 -+ -+The BAT0/group{0,1,2,3}_voltage attribute refers to the separate cell groups -+in each battery. For example, on the ThinkPad 600, X3x, T4x and R5x models, -+the battery contains 3 cell groups in series, where each group consisting of 2 -+or 3 cells connected in parallel. The voltage of each group is given by these -+attributes, and their sum (roughly) equals the "voltage" attribute. -+(The effective performance of the battery is determined by the weakest group, -+i.e., the one those voltage changes most rapidly during dis/charging.) -+ -+The "BAT0/dump" attribute gives a a hex dump of the raw status data, which -+contains additional data now in the above (if you can figure it out). Some -+unused values are autodetected and replaced by "--": -+ -+In all of the above, replace BAT0 with BAT1 to address the 2nd battery (e.g. -+in the UltraBay). -+ -+ -+Raw SMAPI calls: -+ -+/sys/devices/platform/smapi/smapi_request -+This performs raw SMAPI calls. It uses a bad interface that cannot handle -+multiple simultaneous access. Don't touch it, it's for development only. 
-+If you did touch it, you would so something like -+# echo '211a 100 0 0' > /sys/devices/platform/smapi/smapi_request -+# cat /sys/devices/platform/smapi/smapi_request -+and notice that in the output "211a 34b b2 0 0 0 'OK'", the "4b" in the 2nd -+value, converted to decimal is 75: the current charge stop threshold. -+ -+ -+Model-specific status -+--------------------- -+ -+Works (at least partially) on the following ThinkPad model: -+* A30 -+* G41 -+* R40, R50p, R51, R52 -+* T23, T40, T40p, T41, T41p, T42, T42p, T43, T43p, T60, T61, T400, T410, T420 (partially) -+* X24, X31, X32, X40, X41, X60, X61, X200, X201, X220 (partially) -+* Z60t, Z61m -+ -+Does not work on: -+* X230 and newer -+* T430 and newer -+* Any ThinkPad Edge -+* Any ThinkPad Yoga -+* Any ThinkPad L series -+* Any ThinkPad P series -+ -+Not all functions are available on all models; for detailed status, see: -+ http://thinkwiki.org/wiki/tp_smapi -+ -+Please report success/failure by e-mail or on the Wiki. -+If you get a "not implemented" or "not supported" message, your laptop -+probably just can't do that (at least not via the SMAPI BIOS). -+For negative reports, follow the bug reporting guidelines below. -+If you send me the necessary technical data (i.e., SMAPI function -+interfaces), I will support additional models. -+ -+ -+Additional HDAPS features -+------------------------- -+ -+The modified hdaps driver has several improvements on the one in mainline -+(beyond resolving the conflict with thinkpad_ec and tp_smapi): -+ -+- Fixes reliability and improves support for recent ThinkPad models -+ (especially *60 and newer). Unlike the mainline driver, the modified hdaps -+ correctly follows the Embedded Controller communication protocol. -+ -+- Extends the "invert" parameter to cover all possible axis orientations. -+ The possible values are as follows. -+ Let X,Y denote the hardware readouts. -+ Let R denote the laptop's roll (tilt left/right). 
-+ Let P denote the laptop's pitch (tilt forward/backward). -+ invert=0: R= X P= Y (same as mainline) -+ invert=1: R=-X P=-Y (same as mainline) -+ invert=2: R=-X P= Y (new) -+ invert=3: R= X P=-Y (new) -+ invert=4: R= Y P= X (new) -+ invert=5: R=-Y P=-X (new) -+ invert=6: R=-Y P= X (new) -+ invert=7: R= Y P=-X (new) -+ It's probably easiest to just try all 8 possibilities and see which yields -+ correct results (e.g., in the hdaps-gl visualisation). -+ -+- Adds a whitelist which automatically sets the correct axis orientation for -+ some models. If the value for your model is wrong or missing, you can override -+ it using the "invert" parameter. Please also update the tables at -+ http://www.thinkwiki.org/wiki/tp_smapi and -+ http://www.thinkwiki.org/wiki/List_of_DMI_IDs -+ and submit a patch for the whitelist in hdaps.c. -+ -+- Provides new attributes: -+ /sys/devices/platform/hdaps/sampling_rate: -+ This determines the frequency at which the host queries the embedded -+ controller for accelerometer data (and informs the hdaps input devices). -+ Default=50. -+ /sys/devices/platform/hdaps/oversampling_ratio: -+ When set to X, the embedded controller is told to do physical accelerometer -+ measurements at a rate that is X times higher than the rate at which -+ the driver reads those measurements (i.e., X*sampling_rate). This -+ makes the readouts from the embedded controller more fresh, and is also -+ useful for the running average filter (see next). Default=5 -+ /sys/devices/platform/hdaps/running_avg_filter_order: -+ When set to X, reported readouts will be the average of the last X physical -+ accelerometer measurements. Current firmware allows 1<=X<=8. Setting to a -+ high value decreases readout fluctuations. The averaging is handled by the -+ embedded controller, so no CPU resources are used. Higher values make the -+ readouts smoother, since it averages out both sensor noise (good) and abrupt -+ changes (bad). Default=2. 
-+ -+- Provides a second input device, which publishes the raw accelerometer -+ measurements (without the fuzzing needed for joystick emulation). This input -+ device can be matched by a udev rule such as the following (all on one line): -+ KERNEL=="event[0-9]*", ATTRS{phys}=="hdaps/input1", -+ ATTRS{modalias}=="input:b0019v1014p5054e4801-*", -+ SYMLINK+="input/hdaps/accelerometer-event -+ -+A new version of the hdapsd userspace daemon, which uses the input device -+interface instead of polling sysfs, is available seprately. Using this reduces -+the total interrupts per second generated by hdaps+hdapsd (on tickless kernels) -+to 50, down from a value that fluctuates between 50 and 100. Set the -+sampling_rate sysfs attribute to a lower value to further reduce interrupts, -+at the expense of response latency. -+ -+Licensing note: all my changes to the HDAPS driver are licensed under the -+GPL version 2 or, at your option and to the extent allowed by derivation from -+prior works, any later version. My version of hdaps is derived work from the -+mainline version, which at the time of writing is available only under -+GPL version 2. -+ -+Bug reporting -+------------- -+ -+Mail . Please include: -+* Details about your model, -+* Relevant "dmesg" output. Make sure thinkpad_ec and tp_smapi are loaded with -+ the "debug=1" parameter (e.g., use "make load HDAPS=1 DEBUG=1"). -+* Output of "dmidecode | grep -C5 Product" -+* Does the failed functionality works under Windows? -+ -+ -+More about SMAPI -+---------------- -+ -+For hints about what may be possible via the SMAPI BIOS and how, see: -+ -+* IBM Technical Reference Manual for the ThinkPad 770 -+ (http://www-307.ibm.com/pc/support/site.wss/document.do?lndocid=PFAN-3TUQQD) -+* Exported symbols in PWRMGRIF.DLL or TPPWRW32.DLL (e.g., use "objdump -x"). -+* drivers/char/mwave/smapi.c in the Linux kernel tree.* -+* The "thinkpad" SMAPI module (http://tpctl.sourceforge.net). -+* The SMAPI_* constants in tp_smapi.c. 
-+ -+Note that in the above Technical Reference and in the "thinkpad" module, -+SMAPI is invoked through a function call to some physical address. However, -+the interface used by tp_smapi and the above mwave drive, and apparently -+required by newer ThinkPad, is different: you set the parameters up in the -+CPU's registers and write to ports 0xB2 (the APM control port) and 0x4F; this -+triggers an SMI (System Management Interrupt), causing the CPU to enter -+SMM (System Management Mode) and run the BIOS firmware; the results are -+returned in the CPU's registers. It is not clear what is the relation between -+the two variants of SMAPI, though the assignment of error codes seems to be -+similar. -+ -+In addition, the embedded controller on ThinkPad laptops has a non-standard -+interface at IO ports 0x1600-0x161F (mapped to LCP channel 3 of the H8S chip). -+The interface provides various system management services (currently known: -+battery information and accelerometer readouts). For more information see the -+thinkpad_ec module and the H8S hardware documentation: -+http://documentation.renesas.com/eng/products/mpumcu/rej09b0300_2140bhm.pdf -diff --git a/Makefile b/Makefile -index 863f58503bee..f33cf760af6d 100644 ---- a/Makefile -+++ b/Makefile -@@ -682,12 +682,16 @@ ifdef CONFIG_FUNCTION_TRACER - KBUILD_CFLAGS += $(call cc-disable-warning, format-overflow) - KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) - -+ifdef CONFIG_CC_OPTIMIZE_HARDER -+KBUILD_CFLAGS += -O3 $(call cc-disable-warning,maybe-uninitialized,) -+else - ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE - KBUILD_CFLAGS += -O2 - else ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 - KBUILD_CFLAGS += -O3 - else ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE - KBUILD_CFLAGS += -Os - endif -+endif - - ifdef CONFIG_CC_DISABLE_WARN_MAYBE_UNINITIALIZED - KBUILD_CFLAGS += -Wno-maybe-uninitialized - -diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c -index 4f32c4062fb6..c0bf039e1b40 
100644 ---- a/drivers/infiniband/core/addr.c -+++ b/drivers/infiniband/core/addr.c -@@ -721,6 +721,7 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, - struct sockaddr _sockaddr; - struct sockaddr_in _sockaddr_in; - struct sockaddr_in6 _sockaddr_in6; -+ struct sockaddr_ib _sockaddr_ib; - } sgid_addr, dgid_addr; - int ret; - -diff --git a/drivers/input/mouse/synaptics.c b/drivers/input/mouse/synaptics.c -index 55d33500d55e..744e84228a1f 100644 ---- a/drivers/input/mouse/synaptics.c -+++ b/drivers/input/mouse/synaptics.c -@@ -1338,7 +1338,9 @@ static int set_input_params(struct psmouse *psmouse, - if (psmouse_matches_pnp_id(psmouse, topbuttonpad_pnp_ids) && - !SYN_CAP_EXT_BUTTONS_STICK(info->ext_cap_10)) - __set_bit(INPUT_PROP_TOPBUTTONPAD, dev->propbit); -- } -+ } else if (SYN_CAP_CLICKPAD2BTN(info->ext_cap_0c) || -+ SYN_CAP_CLICKPAD2BTN2(info->ext_cap_0c)) -+ __set_bit(INPUT_PROP_BUTTONPAD, dev->propbit); - - return 0; - } -diff --git a/drivers/input/mouse/synaptics.h b/drivers/input/mouse/synaptics.h -index fc00e005c611..4cfbeec3ae4c 100644 ---- a/drivers/input/mouse/synaptics.h -+++ b/drivers/input/mouse/synaptics.h -@@ -86,6 +86,7 @@ - */ - #define SYN_CAP_CLICKPAD(ex0c) ((ex0c) & BIT(20)) /* 1-button ClickPad */ - #define SYN_CAP_CLICKPAD2BTN(ex0c) ((ex0c) & BIT(8)) /* 2-button ClickPad */ -+#define SYN_CAP_CLICKPAD2BTN2(ex0c) ((ex0c) & BIT(21)) /* 2-button ClickPad */ - #define SYN_CAP_MAX_DIMENSIONS(ex0c) ((ex0c) & BIT(17)) - #define SYN_CAP_MIN_DIMENSIONS(ex0c) ((ex0c) & BIT(13)) - #define SYN_CAP_ADV_GESTURE(ex0c) ((ex0c) & BIT(19)) -diff --git a/drivers/macintosh/Kconfig b/drivers/macintosh/Kconfig -index 97a420c11eed..c8621e9b2e4a 100644 ---- a/drivers/macintosh/Kconfig -+++ b/drivers/macintosh/Kconfig -@@ -159,6 +159,13 @@ config INPUT_ADBHID - - If unsure, say Y. - -+config ADB_TRACKPAD_ABSOLUTE -+ bool "Enable absolute mode for adb trackpads" -+ depends on INPUT_ADBHID -+ help -+ Enable absolute mode in adb-base trackpads. 
This feature adds -+ compatibility with synaptics Xorg / Xfree drivers. -+ - config MAC_EMUMOUSEBTN - tristate "Support for mouse button 2+3 emulation" - depends on SYSCTL && INPUT -diff --git a/drivers/macintosh/adbhid.c b/drivers/macintosh/adbhid.c -index a261892c03b3..a85192de840c 100644 ---- a/drivers/macintosh/adbhid.c -+++ b/drivers/macintosh/adbhid.c -@@ -262,6 +262,15 @@ static struct adb_ids buttons_ids; - #define ADBMOUSE_MS_A3 8 /* Mouse systems A3 trackball (handler 3) */ - #define ADBMOUSE_MACALLY2 9 /* MacAlly 2-button mouse */ - -+#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE -+#define ABS_XMIN 310 -+#define ABS_XMAX 1700 -+#define ABS_YMIN 200 -+#define ABS_YMAX 1000 -+#define ABS_ZMIN 0 -+#define ABS_ZMAX 55 -+#endif -+ - static void - adbhid_keyboard_input(unsigned char *data, int nb, int apoll) - { -@@ -405,6 +414,9 @@ static void - adbhid_mouse_input(unsigned char *data, int nb, int autopoll) - { - int id = (data[0] >> 4) & 0x0f; -+#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE -+ int btn = 0; int x_axis = 0; int y_axis = 0; int z_axis = 0; -+#endif - - if (!adbhid[id]) { - pr_err("ADB HID on ID %d not yet registered\n", id); -@@ -436,6 +448,17 @@ adbhid_mouse_input(unsigned char *data, int nb, int autopoll) - high bits of y-axis motion. XY is additional - high bits of x-axis motion. - -+ For ADB Absolute motion protocol the data array will contain the -+ following values: -+ -+ BITS COMMENTS -+ data[0] = dddd 1100 ADB command: Talk, register 0, for device dddd. -+ data[1] = byyy yyyy Left button and y-axis motion. -+ data[2] = bxxx xxxx Second button and x-axis motion. -+ data[3] = 1yyy 1xxx Half bits of y-axis and x-axis motion. -+ data[4] = 1yyy 1xxx Higher bits of y-axis and x-axis motion. -+ data[5] = 1zzz 1zzz Higher and lower bits of z-pressure. -+ - MacAlly 2-button mouse protocol. 
- - For MacAlly 2-button mouse protocol the data array will contain the -@@ -458,8 +481,17 @@ adbhid_mouse_input(unsigned char *data, int nb, int autopoll) - switch (adbhid[id]->mouse_kind) - { - case ADBMOUSE_TRACKPAD: -+#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE -+ x_axis = (data[2] & 0x7f) | ((data[3] & 0x07) << 7) | -+ ((data[4] & 0x07) << 10); -+ y_axis = (data[1] & 0x7f) | ((data[3] & 0x70) << 3) | -+ ((data[4] & 0x70) << 6); -+ z_axis = (data[5] & 0x07) | ((data[5] & 0x70) >> 1); -+ btn = (!(data[1] >> 7)) & 1; -+#else - data[1] = (data[1] & 0x7f) | ((data[1] & data[2]) & 0x80); - data[2] = data[2] | 0x80; -+#endif - break; - case ADBMOUSE_MICROSPEED: - data[1] = (data[1] & 0x7f) | ((data[3] & 0x01) << 7); -@@ -485,17 +517,39 @@ adbhid_mouse_input(unsigned char *data, int nb, int autopoll) - break; - } - -- input_report_key(adbhid[id]->input, BTN_LEFT, !((data[1] >> 7) & 1)); -- input_report_key(adbhid[id]->input, BTN_MIDDLE, !((data[2] >> 7) & 1)); -+#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE -+ if ( adbhid[id]->mouse_kind == ADBMOUSE_TRACKPAD ) { - -- if (nb >= 4 && adbhid[id]->mouse_kind != ADBMOUSE_TRACKPAD) -- input_report_key(adbhid[id]->input, BTN_RIGHT, !((data[3] >> 7) & 1)); -+ if(z_axis > 30) input_report_key(adbhid[id]->input, BTN_TOUCH, 1); -+ if(z_axis < 25) input_report_key(adbhid[id]->input, BTN_TOUCH, 0); - -- input_report_rel(adbhid[id]->input, REL_X, -- ((data[2]&0x7f) < 64 ? (data[2]&0x7f) : (data[2]&0x7f)-128 )); -- input_report_rel(adbhid[id]->input, REL_Y, -- ((data[1]&0x7f) < 64 ? 
(data[1]&0x7f) : (data[1]&0x7f)-128 )); -+ if(z_axis > 0){ -+ input_report_abs(adbhid[id]->input, ABS_X, x_axis); -+ input_report_abs(adbhid[id]->input, ABS_Y, y_axis); -+ input_report_key(adbhid[id]->input, BTN_TOOL_FINGER, 1); -+ input_report_key(adbhid[id]->input, ABS_TOOL_WIDTH, 5); -+ } else { -+ input_report_key(adbhid[id]->input, BTN_TOOL_FINGER, 0); -+ input_report_key(adbhid[id]->input, ABS_TOOL_WIDTH, 0); -+ } -+ -+ input_report_abs(adbhid[id]->input, ABS_PRESSURE, z_axis); -+ input_report_key(adbhid[id]->input, BTN_LEFT, btn); -+ } else { -+#endif -+ input_report_key(adbhid[id]->input, BTN_LEFT, !((data[1] >> 7) & 1)); -+ input_report_key(adbhid[id]->input, BTN_MIDDLE, !((data[2] >> 7) & 1)); -+ -+ if (nb >= 4 && adbhid[id]->mouse_kind != ADBMOUSE_TRACKPAD) -+ input_report_key(adbhid[id]->input, BTN_RIGHT, !((data[3] >> 7) & 1)); - -+ input_report_rel(adbhid[id]->input, REL_X, -+ ((data[2]&0x7f) < 64 ? (data[2]&0x7f) : (data[2]&0x7f)-128 )); -+ input_report_rel(adbhid[id]->input, REL_Y, -+ ((data[1]&0x7f) < 64 ? 
(data[1]&0x7f) : (data[1]&0x7f)-128 )); -+#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE -+ } -+#endif - input_sync(adbhid[id]->input); - } - -@@ -849,6 +903,15 @@ adbhid_input_register(int id, int default_id, int original_handler_id, - input_dev->keybit[BIT_WORD(BTN_MOUSE)] = BIT_MASK(BTN_LEFT) | - BIT_MASK(BTN_MIDDLE) | BIT_MASK(BTN_RIGHT); - input_dev->relbit[0] = BIT_MASK(REL_X) | BIT_MASK(REL_Y); -+#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE -+ set_bit(EV_ABS, input_dev->evbit); -+ input_set_abs_params(input_dev, ABS_X, ABS_XMIN, ABS_XMAX, 0, 0); -+ input_set_abs_params(input_dev, ABS_Y, ABS_YMIN, ABS_YMAX, 0, 0); -+ input_set_abs_params(input_dev, ABS_PRESSURE, ABS_ZMIN, ABS_ZMAX, 0, 0); -+ set_bit(BTN_TOUCH, input_dev->keybit); -+ set_bit(BTN_TOOL_FINGER, input_dev->keybit); -+ set_bit(ABS_TOOL_WIDTH, input_dev->absbit); -+#endif - break; - - case ADB_MISC: -@@ -1132,7 +1195,11 @@ init_trackpad(int id) - r1_buffer[3], - r1_buffer[4], - r1_buffer[5], -+#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE -+ 0x00, /* Enable absolute mode */ -+#else - 0x03, /*r1_buffer[6],*/ -+#endif - r1_buffer[7]); - - /* Without this flush, the trackpad may be locked up */ -diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig -index ac4d48830415..b272132ac742 100644 ---- a/drivers/platform/x86/Kconfig -+++ b/drivers/platform/x86/Kconfig -@@ -573,9 +573,28 @@ config THINKPAD_ACPI_HOTKEY_POLL - If you are not sure, say Y here. The driver enables polling only if - it is strictly necessary to do so. - -+config THINKPAD_EC -+ tristate -+ ---help--- -+ This is a low-level driver for accessing the ThinkPad H8S embedded -+ controller over the LPC bus (not to be confused with the ACPI Embedded -+ Controller interface). -+ -+config TP_SMAPI -+ tristate "ThinkPad SMAPI Support" -+ select THINKPAD_EC -+ default n -+ help -+ This adds SMAPI support on Lenovo/IBM ThinkPads, for features such -+ as battery charging control. For more information about this driver -+ see . 
-+ -+ If you have a Lenovo/IBM ThinkPad laptop, say Y or M here. -+ - config SENSORS_HDAPS - tristate "Thinkpad Hard Drive Active Protection System (hdaps)" - depends on INPUT -+ select THINKPAD_EC - select INPUT_POLLDEV - help - This driver provides support for the IBM Hard Drive Active Protection -diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile -index 2ba6cb795338..399f8b88646f 100644 ---- a/drivers/platform/x86/Makefile -+++ b/drivers/platform/x86/Makefile -@@ -35,6 +35,8 @@ obj-$(CONFIG_TC1100_WMI) += tc1100-wmi.o - obj-$(CONFIG_SONY_LAPTOP) += sony-laptop.o - obj-$(CONFIG_IDEAPAD_LAPTOP) += ideapad-laptop.o - obj-$(CONFIG_THINKPAD_ACPI) += thinkpad_acpi.o -+obj-$(CONFIG_THINKPAD_EC) += thinkpad_ec.o -+obj-$(CONFIG_TP_SMAPI) += tp_smapi.o - obj-$(CONFIG_SENSORS_HDAPS) += hdaps.o - obj-$(CONFIG_FUJITSU_LAPTOP) += fujitsu-laptop.o - obj-$(CONFIG_FUJITSU_TABLET) += fujitsu-tablet.o -diff --git a/drivers/platform/x86/hdaps.c b/drivers/platform/x86/hdaps.c -index c26baf77938e..1814614f240c 100644 ---- a/drivers/platform/x86/hdaps.c -+++ b/drivers/platform/x86/hdaps.c -@@ -2,7 +2,7 @@ - * hdaps.c - driver for IBM's Hard Drive Active Protection System - * - * Copyright (C) 2005 Robert Love -- * Copyright (C) 2005 Jesper Juhl -+ * Copyright (C) 2005 Jesper Juhl - * - * The HardDisk Active Protection System (hdaps) is present in IBM ThinkPads - * starting with the R40, T41, and X40. 
It provides a basic two-axis -@@ -30,266 +30,384 @@ - - #include - #include --#include -+#include - #include --#include - #include - #include - #include - #include --#include -- --#define HDAPS_LOW_PORT 0x1600 /* first port used by hdaps */ --#define HDAPS_NR_PORTS 0x30 /* number of ports: 0x1600 - 0x162f */ -- --#define HDAPS_PORT_STATE 0x1611 /* device state */ --#define HDAPS_PORT_YPOS 0x1612 /* y-axis position */ --#define HDAPS_PORT_XPOS 0x1614 /* x-axis position */ --#define HDAPS_PORT_TEMP1 0x1616 /* device temperature, in Celsius */ --#define HDAPS_PORT_YVAR 0x1617 /* y-axis variance (what is this?) */ --#define HDAPS_PORT_XVAR 0x1619 /* x-axis variance (what is this?) */ --#define HDAPS_PORT_TEMP2 0x161b /* device temperature (again?) */ --#define HDAPS_PORT_UNKNOWN 0x161c /* what is this? */ --#define HDAPS_PORT_KMACT 0x161d /* keyboard or mouse activity */ -- --#define STATE_FRESH 0x50 /* accelerometer data is fresh */ -+#include -+#include -+#include -+ -+/* Embedded controller accelerometer read command and its result: */ -+static const struct thinkpad_ec_row ec_accel_args = -+ { .mask = 0x0001, .val = {0x11} }; -+#define EC_ACCEL_IDX_READOUTS 0x1 /* readouts included in this read */ -+ /* First readout, if READOUTS>=1: */ -+#define EC_ACCEL_IDX_YPOS1 0x2 /* y-axis position word */ -+#define EC_ACCEL_IDX_XPOS1 0x4 /* x-axis position word */ -+#define EC_ACCEL_IDX_TEMP1 0x6 /* device temperature in Celsius */ -+ /* Second readout, if READOUTS>=2: */ -+#define EC_ACCEL_IDX_XPOS2 0x7 /* y-axis position word */ -+#define EC_ACCEL_IDX_YPOS2 0x9 /* x-axis position word */ -+#define EC_ACCEL_IDX_TEMP2 0xb /* device temperature in Celsius */ -+#define EC_ACCEL_IDX_QUEUED 0xc /* Number of queued readouts left */ -+#define EC_ACCEL_IDX_KMACT 0xd /* keyboard or mouse activity */ -+#define EC_ACCEL_IDX_RETVAL 0xf /* command return value, good=0x00 */ - - #define KEYBD_MASK 0x20 /* set if keyboard activity */ - #define MOUSE_MASK 0x40 /* set if mouse activity */ 
--#define KEYBD_ISSET(n) (!! (n & KEYBD_MASK)) /* keyboard used? */ --#define MOUSE_ISSET(n) (!! (n & MOUSE_MASK)) /* mouse used? */ - --#define INIT_TIMEOUT_MSECS 4000 /* wait up to 4s for device init ... */ --#define INIT_WAIT_MSECS 200 /* ... in 200ms increments */ -+#define READ_TIMEOUT_MSECS 100 /* wait this long for device read */ -+#define RETRY_MSECS 3 /* retry delay */ - --#define HDAPS_POLL_INTERVAL 50 /* poll for input every 1/20s (50 ms)*/ - #define HDAPS_INPUT_FUZZ 4 /* input event threshold */ - #define HDAPS_INPUT_FLAT 4 -- --#define HDAPS_X_AXIS (1 << 0) --#define HDAPS_Y_AXIS (1 << 1) --#define HDAPS_BOTH_AXES (HDAPS_X_AXIS | HDAPS_Y_AXIS) -- -+#define KMACT_REMEMBER_PERIOD (HZ/10) /* keyboard/mouse persistence */ -+ -+/* Input IDs */ -+#define HDAPS_INPUT_VENDOR PCI_VENDOR_ID_IBM -+#define HDAPS_INPUT_PRODUCT 0x5054 /* "TP", shared with thinkpad_acpi */ -+#define HDAPS_INPUT_JS_VERSION 0x6801 /* Joystick emulation input device */ -+#define HDAPS_INPUT_RAW_VERSION 0x4801 /* Raw accelerometer input device */ -+ -+/* Axis orientation. */ -+/* The unnatural bit-representation of inversions is for backward -+ * compatibility with the"invert=1" module parameter. */ -+#define HDAPS_ORIENT_INVERT_XY 0x01 /* Invert both X and Y axes. */ -+#define HDAPS_ORIENT_INVERT_X 0x02 /* Invert the X axis (uninvert if -+ * already inverted by INVERT_XY). */ -+#define HDAPS_ORIENT_SWAP 0x04 /* Swap the axes. The swap occurs -+ * before inverting X or Y. */ -+#define HDAPS_ORIENT_MAX 0x07 -+#define HDAPS_ORIENT_UNDEFINED 0xFF /* Placeholder during initialization */ -+#define HDAPS_ORIENT_INVERT_Y (HDAPS_ORIENT_INVERT_XY | HDAPS_ORIENT_INVERT_X) -+ -+static struct timer_list hdaps_timer; - static struct platform_device *pdev; --static struct input_polled_dev *hdaps_idev; --static unsigned int hdaps_invert; --static u8 km_activity; --static int rest_x; --static int rest_y; -- --static DEFINE_MUTEX(hdaps_mtx); -- --/* -- * __get_latch - Get the value from a given port. 
Callers must hold hdaps_mtx. -- */ --static inline u8 __get_latch(u16 port) -+static struct input_dev *hdaps_idev; /* joystick-like device with fuzz */ -+static struct input_dev *hdaps_idev_raw; /* raw hdaps sensor readouts */ -+static unsigned int hdaps_invert = HDAPS_ORIENT_UNDEFINED; -+static int needs_calibration; -+ -+/* Configuration: */ -+static int sampling_rate = 50; /* Sampling rate */ -+static int oversampling_ratio = 5; /* Ratio between our sampling rate and -+ * EC accelerometer sampling rate */ -+static int running_avg_filter_order = 2; /* EC running average filter order */ -+ -+/* Latest state readout: */ -+static int pos_x, pos_y; /* position */ -+static int temperature; /* temperature */ -+static int stale_readout = 1; /* last read invalid */ -+static int rest_x, rest_y; /* calibrated rest position */ -+ -+/* Last time we saw keyboard and mouse activity: */ -+static u64 last_keyboard_jiffies = INITIAL_JIFFIES; -+static u64 last_mouse_jiffies = INITIAL_JIFFIES; -+static u64 last_update_jiffies = INITIAL_JIFFIES; -+ -+/* input device use count */ -+static int hdaps_users; -+static DEFINE_MUTEX(hdaps_users_mtx); -+ -+/* Some models require an axis transformation to the standard representation */ -+static void transform_axes(int *x, int *y) - { -- return inb(port) & 0xff; -+ if (hdaps_invert & HDAPS_ORIENT_SWAP) { -+ int z; -+ z = *x; -+ *x = *y; -+ *y = z; -+ } -+ if (hdaps_invert & HDAPS_ORIENT_INVERT_XY) { -+ *x = -*x; -+ *y = -*y; -+ } -+ if (hdaps_invert & HDAPS_ORIENT_INVERT_X) -+ *x = -*x; - } - --/* -- * __check_latch - Check a port latch for a given value. Returns zero if the -- * port contains the given value. Callers must hold hdaps_mtx. -+/** -+ * __hdaps_update - query current state, with locks already acquired -+ * @fast: if nonzero, do one quick attempt without retries. -+ * -+ * Query current accelerometer state and update global state variables. -+ * Also prefetches the next query. Caller must hold controller lock. 
- */ --static inline int __check_latch(u16 port, u8 val) -+static int __hdaps_update(int fast) - { -- if (__get_latch(port) == val) -- return 0; -- return -EINVAL; --} -+ /* Read data: */ -+ struct thinkpad_ec_row data; -+ int ret; - --/* -- * __wait_latch - Wait up to 100us for a port latch to get a certain value, -- * returning zero if the value is obtained. Callers must hold hdaps_mtx. -- */ --static int __wait_latch(u16 port, u8 val) --{ -- unsigned int i; -+ data.mask = (1 << EC_ACCEL_IDX_READOUTS) | (1 << EC_ACCEL_IDX_KMACT) | -+ (3 << EC_ACCEL_IDX_YPOS1) | (3 << EC_ACCEL_IDX_XPOS1) | -+ (1 << EC_ACCEL_IDX_TEMP1) | (1 << EC_ACCEL_IDX_RETVAL); -+ if (fast) -+ ret = thinkpad_ec_try_read_row(&ec_accel_args, &data); -+ else -+ ret = thinkpad_ec_read_row(&ec_accel_args, &data); -+ thinkpad_ec_prefetch_row(&ec_accel_args); /* Prefetch even if error */ -+ if (ret) -+ return ret; - -- for (i = 0; i < 20; i++) { -- if (!__check_latch(port, val)) -- return 0; -- udelay(5); -+ /* Check status: */ -+ if (data.val[EC_ACCEL_IDX_RETVAL] != 0x00) { -+ pr_warn("read RETVAL=0x%02x\n", -+ data.val[EC_ACCEL_IDX_RETVAL]); -+ return -EIO; - } - -- return -EIO; -+ if (data.val[EC_ACCEL_IDX_READOUTS] < 1) -+ return -EBUSY; /* no pending readout, try again later */ -+ -+ /* Parse position data: */ -+ pos_x = *(s16 *)(data.val+EC_ACCEL_IDX_XPOS1); -+ pos_y = *(s16 *)(data.val+EC_ACCEL_IDX_YPOS1); -+ transform_axes(&pos_x, &pos_y); -+ -+ /* Keyboard and mouse activity status is cleared as soon as it's read, -+ * so applications will eat each other's events. Thus we remember any -+ * event for KMACT_REMEMBER_PERIOD jiffies. 
-+ */ -+ if (data.val[EC_ACCEL_IDX_KMACT] & KEYBD_MASK) -+ last_keyboard_jiffies = get_jiffies_64(); -+ if (data.val[EC_ACCEL_IDX_KMACT] & MOUSE_MASK) -+ last_mouse_jiffies = get_jiffies_64(); -+ -+ temperature = data.val[EC_ACCEL_IDX_TEMP1]; -+ -+ last_update_jiffies = get_jiffies_64(); -+ stale_readout = 0; -+ if (needs_calibration) { -+ rest_x = pos_x; -+ rest_y = pos_y; -+ needs_calibration = 0; -+ } -+ -+ return 0; - } - --/* -- * __device_refresh - request a refresh from the accelerometer. Does not wait -- * for refresh to complete. Callers must hold hdaps_mtx. -+/** -+ * hdaps_update - acquire locks and query current state -+ * -+ * Query current accelerometer state and update global state variables. -+ * Also prefetches the next query. -+ * Retries until timeout if the accelerometer is not in ready status (common). -+ * Does its own locking. - */ --static void __device_refresh(void) -+static int hdaps_update(void) - { -- udelay(200); -- if (inb(0x1604) != STATE_FRESH) { -- outb(0x11, 0x1610); -- outb(0x01, 0x161f); -+ u64 age = get_jiffies_64() - last_update_jiffies; -+ int total, ret; -+ -+ if (!stale_readout && age < (9*HZ)/(10*sampling_rate)) -+ return 0; /* already updated recently */ -+ for (total = 0; total < READ_TIMEOUT_MSECS; total += RETRY_MSECS) { -+ ret = thinkpad_ec_lock(); -+ if (ret) -+ return ret; -+ ret = __hdaps_update(0); -+ thinkpad_ec_unlock(); -+ -+ if (!ret) -+ return 0; -+ if (ret != -EBUSY) -+ break; -+ msleep(RETRY_MSECS); - } -+ return ret; - } - --/* -- * __device_refresh_sync - request a synchronous refresh from the -- * accelerometer. We wait for the refresh to complete. Returns zero if -- * successful and nonzero on error. Callers must hold hdaps_mtx. -+/** -+ * hdaps_set_power - enable or disable power to the accelerometer. -+ * Returns zero on success and negative error code on failure. Can sleep. 
- */ --static int __device_refresh_sync(void) -+static int hdaps_set_power(int on) - { -- __device_refresh(); -- return __wait_latch(0x1604, STATE_FRESH); -+ struct thinkpad_ec_row args = -+ { .mask = 0x0003, .val = {0x14, on?0x01:0x00} }; -+ struct thinkpad_ec_row data = { .mask = 0x8000 }; -+ int ret = thinkpad_ec_read_row(&args, &data); -+ if (ret) -+ return ret; -+ if (data.val[0xF] != 0x00) -+ return -EIO; -+ return 0; - } - --/* -- * __device_complete - indicate to the accelerometer that we are done reading -- * data, and then initiate an async refresh. Callers must hold hdaps_mtx. -+/** -+ * hdaps_set_ec_config - set accelerometer parameters. -+ * @ec_rate: embedded controller sampling rate -+ * @order: embedded controller running average filter order -+ * (Normally we have @ec_rate = sampling_rate * oversampling_ratio.) -+ * Returns zero on success and negative error code on failure. Can sleep. - */ --static inline void __device_complete(void) -+static int hdaps_set_ec_config(int ec_rate, int order) - { -- inb(0x161f); -- inb(0x1604); -- __device_refresh(); -+ struct thinkpad_ec_row args = { .mask = 0x000F, -+ .val = {0x10, (u8)ec_rate, (u8)(ec_rate>>8), order} }; -+ struct thinkpad_ec_row data = { .mask = 0x8000 }; -+ int ret = thinkpad_ec_read_row(&args, &data); -+ pr_debug("setting ec_rate=%d, filter_order=%d\n", ec_rate, order); -+ if (ret) -+ return ret; -+ if (data.val[0xF] == 0x03) { -+ pr_warn("config param out of range\n"); -+ return -EINVAL; -+ } -+ if (data.val[0xF] == 0x06) { -+ pr_warn("config change already pending\n"); -+ return -EBUSY; -+ } -+ if (data.val[0xF] != 0x00) { -+ pr_warn("config change error, ret=%d\n", -+ data.val[0xF]); -+ return -EIO; -+ } -+ return 0; - } - --/* -- * hdaps_readb_one - reads a byte from a single I/O port, placing the value in -- * the given pointer. Returns zero on success or a negative error on failure. -- * Can sleep. -+/** -+ * hdaps_get_ec_config - get accelerometer parameters. 
-+ * @ec_rate: embedded controller sampling rate -+ * @order: embedded controller running average filter order -+ * Returns zero on success and negative error code on failure. Can sleep. - */ --static int hdaps_readb_one(unsigned int port, u8 *val) -+static int hdaps_get_ec_config(int *ec_rate, int *order) - { -- int ret; -- -- mutex_lock(&hdaps_mtx); -- -- /* do a sync refresh -- we need to be sure that we read fresh data */ -- ret = __device_refresh_sync(); -+ const struct thinkpad_ec_row args = -+ { .mask = 0x0003, .val = {0x17, 0x82} }; -+ struct thinkpad_ec_row data = { .mask = 0x801F }; -+ int ret = thinkpad_ec_read_row(&args, &data); - if (ret) -- goto out; -- -- *val = inb(port); -- __device_complete(); -- --out: -- mutex_unlock(&hdaps_mtx); -- return ret; -+ return ret; -+ if (data.val[0xF] != 0x00) -+ return -EIO; -+ if (!(data.val[0x1] & 0x01)) -+ return -ENXIO; /* accelerometer polling not enabled */ -+ if (data.val[0x1] & 0x02) -+ return -EBUSY; /* config change in progress, retry later */ -+ *ec_rate = data.val[0x2] | ((int)(data.val[0x3]) << 8); -+ *order = data.val[0x4]; -+ return 0; - } - --/* __hdaps_read_pair - internal lockless helper for hdaps_read_pair(). */ --static int __hdaps_read_pair(unsigned int port1, unsigned int port2, -- int *x, int *y) -+/** -+ * hdaps_get_ec_mode - get EC accelerometer mode -+ * Returns zero on success and negative error code on failure. Can sleep. 
-+ */ -+static int hdaps_get_ec_mode(u8 *mode) - { -- /* do a sync refresh -- we need to be sure that we read fresh data */ -- if (__device_refresh_sync()) -+ const struct thinkpad_ec_row args = -+ { .mask = 0x0001, .val = {0x13} }; -+ struct thinkpad_ec_row data = { .mask = 0x8002 }; -+ int ret = thinkpad_ec_read_row(&args, &data); -+ if (ret) -+ return ret; -+ if (data.val[0xF] != 0x00) { -+ pr_warn("accelerometer not implemented (0x%02x)\n", -+ data.val[0xF]); - return -EIO; -- -- *y = inw(port2); -- *x = inw(port1); -- km_activity = inb(HDAPS_PORT_KMACT); -- __device_complete(); -- -- /* hdaps_invert is a bitvector to negate the axes */ -- if (hdaps_invert & HDAPS_X_AXIS) -- *x = -*x; -- if (hdaps_invert & HDAPS_Y_AXIS) -- *y = -*y; -- -+ } -+ *mode = data.val[0x1]; - return 0; - } - --/* -- * hdaps_read_pair - reads the values from a pair of ports, placing the values -- * in the given pointers. Returns zero on success. Can sleep. -+/** -+ * hdaps_check_ec - checks something about the EC. -+ * Follows the clean-room spec for HDAPS; we don't know what it means. -+ * Returns zero on success and negative error code on failure. Can sleep. 
- */ --static int hdaps_read_pair(unsigned int port1, unsigned int port2, -- int *val1, int *val2) -+static int hdaps_check_ec(void) - { -- int ret; -- -- mutex_lock(&hdaps_mtx); -- ret = __hdaps_read_pair(port1, port2, val1, val2); -- mutex_unlock(&hdaps_mtx); -- -- return ret; -+ const struct thinkpad_ec_row args = -+ { .mask = 0x0003, .val = {0x17, 0x81} }; -+ struct thinkpad_ec_row data = { .mask = 0x800E }; -+ int ret = thinkpad_ec_read_row(&args, &data); -+ if (ret) -+ return ret; -+ if (!((data.val[0x1] == 0x00 && data.val[0x2] == 0x60) || /* cleanroom spec */ -+ (data.val[0x1] == 0x01 && data.val[0x2] == 0x00)) || /* seen on T61 */ -+ data.val[0x3] != 0x00 || data.val[0xF] != 0x00) { -+ pr_warn("hdaps_check_ec: bad response (0x%x,0x%x,0x%x,0x%x)\n", -+ data.val[0x1], data.val[0x2], -+ data.val[0x3], data.val[0xF]); -+ return -EIO; -+ } -+ return 0; - } - --/* -- * hdaps_device_init - initialize the accelerometer. Returns zero on success -- * and negative error code on failure. Can sleep. -+/** -+ * hdaps_device_init - initialize the accelerometer. -+ * -+ * Call several embedded controller functions to test and initialize the -+ * accelerometer. -+ * Returns zero on success and negative error code on failure. Can sleep. - */ -+#define FAILED_INIT(msg) pr_err("init failed at: %s\n", msg) - static int hdaps_device_init(void) - { -- int total, ret = -ENXIO; -+ int ret; -+ u8 mode; - -- mutex_lock(&hdaps_mtx); -+ ret = thinkpad_ec_lock(); -+ if (ret) -+ return ret; - -- outb(0x13, 0x1610); -- outb(0x01, 0x161f); -- if (__wait_latch(0x161f, 0x00)) -- goto out; -+ if (hdaps_get_ec_mode(&mode)) -+ { FAILED_INIT("hdaps_get_ec_mode failed"); goto bad; } - -- /* -- * Most ThinkPads return 0x01. -- * -- * Others--namely the R50p, T41p, and T42p--return 0x03. These laptops -- * have "inverted" axises. -- * -- * The 0x02 value occurs when the chip has been previously initialized. 
-- */ -- if (__check_latch(0x1611, 0x03) && -- __check_latch(0x1611, 0x02) && -- __check_latch(0x1611, 0x01)) -- goto out; -+ pr_debug("initial mode latch is 0x%02x\n", mode); -+ if (mode == 0x00) -+ { FAILED_INIT("accelerometer not available"); goto bad; } - -- printk(KERN_DEBUG "hdaps: initial latch check good (0x%02x)\n", -- __get_latch(0x1611)); -+ if (hdaps_check_ec()) -+ { FAILED_INIT("hdaps_check_ec failed"); goto bad; } - -- outb(0x17, 0x1610); -- outb(0x81, 0x1611); -- outb(0x01, 0x161f); -- if (__wait_latch(0x161f, 0x00)) -- goto out; -- if (__wait_latch(0x1611, 0x00)) -- goto out; -- if (__wait_latch(0x1612, 0x60)) -- goto out; -- if (__wait_latch(0x1613, 0x00)) -- goto out; -- outb(0x14, 0x1610); -- outb(0x01, 0x1611); -- outb(0x01, 0x161f); -- if (__wait_latch(0x161f, 0x00)) -- goto out; -- outb(0x10, 0x1610); -- outb(0xc8, 0x1611); -- outb(0x00, 0x1612); -- outb(0x02, 0x1613); -- outb(0x01, 0x161f); -- if (__wait_latch(0x161f, 0x00)) -- goto out; -- if (__device_refresh_sync()) -- goto out; -- if (__wait_latch(0x1611, 0x00)) -- goto out; -- -- /* we have done our dance, now let's wait for the applause */ -- for (total = INIT_TIMEOUT_MSECS; total > 0; total -= INIT_WAIT_MSECS) { -- int x, y; -+ if (hdaps_set_power(1)) -+ { FAILED_INIT("hdaps_set_power failed"); goto bad; } - -- /* a read of the device helps push it into action */ -- __hdaps_read_pair(HDAPS_PORT_XPOS, HDAPS_PORT_YPOS, &x, &y); -- if (!__wait_latch(0x1611, 0x02)) { -- ret = 0; -- break; -- } -+ if (hdaps_set_ec_config(sampling_rate*oversampling_ratio, -+ running_avg_filter_order)) -+ { FAILED_INIT("hdaps_set_ec_config failed"); goto bad; } - -- msleep(INIT_WAIT_MSECS); -- } -+ thinkpad_ec_invalidate(); -+ udelay(200); - --out: -- mutex_unlock(&hdaps_mtx); -+ /* Just prefetch instead of reading, to avoid ~1sec delay on load */ -+ ret = thinkpad_ec_prefetch_row(&ec_accel_args); -+ if (ret) -+ { FAILED_INIT("initial prefetch failed"); goto bad; } -+ goto good; -+bad: -+ 
thinkpad_ec_invalidate(); -+ ret = -ENXIO; -+good: -+ stale_readout = 1; -+ thinkpad_ec_unlock(); - return ret; - } - -+/** -+ * hdaps_device_shutdown - power off the accelerometer -+ * Returns nonzero on failure. Can sleep. -+ */ -+static int hdaps_device_shutdown(void) -+{ -+ int ret; -+ ret = hdaps_set_power(0); -+ if (ret) { -+ pr_warn("cannot power off\n"); -+ return ret; -+ } -+ ret = hdaps_set_ec_config(0, 1); -+ if (ret) -+ pr_warn("cannot stop EC sampling\n"); -+ return ret; -+} - - /* Device model stuff */ - -@@ -306,13 +424,29 @@ static int hdaps_probe(struct platform_device *dev) - } - - #ifdef CONFIG_PM_SLEEP -+static int hdaps_suspend(struct device *dev) -+{ -+ /* Don't do hdaps polls until resume re-initializes the sensor. */ -+ del_timer_sync(&hdaps_timer); -+ hdaps_device_shutdown(); /* ignore errors, effect is negligible */ -+ return 0; -+} -+ - static int hdaps_resume(struct device *dev) - { -- return hdaps_device_init(); -+ int ret = hdaps_device_init(); -+ if (ret) -+ return ret; -+ -+ mutex_lock(&hdaps_users_mtx); -+ if (hdaps_users) -+ mod_timer(&hdaps_timer, jiffies + HZ/sampling_rate); -+ mutex_unlock(&hdaps_users_mtx); -+ return 0; - } - #endif - --static SIMPLE_DEV_PM_OPS(hdaps_pm, NULL, hdaps_resume); -+static SIMPLE_DEV_PM_OPS(hdaps_pm, hdaps_suspend, hdaps_resume); - - static struct platform_driver hdaps_driver = { - .probe = hdaps_probe, -@@ -322,30 +456,51 @@ static struct platform_driver hdaps_driver = { - }, - }; - --/* -- * hdaps_calibrate - Set our "resting" values. Callers must hold hdaps_mtx. -+/** -+ * hdaps_calibrate - set our "resting" values. -+ * Does its own locking. - */ - static void hdaps_calibrate(void) - { -- __hdaps_read_pair(HDAPS_PORT_XPOS, HDAPS_PORT_YPOS, &rest_x, &rest_y); -+ needs_calibration = 1; -+ hdaps_update(); -+ /* If that fails, the mousedev poll will take care of things later. */ - } - --static void hdaps_mousedev_poll(struct input_polled_dev *dev) -+/* Timer handler for updating the input device. 
Runs in softirq context, -+ * so avoid lenghty or blocking operations. -+ */ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4,15,0) -+static void hdaps_mousedev_poll(unsigned long unused) -+#else -+static void hdaps_mousedev_poll(struct timer_list *unused) -+#endif - { -- struct input_dev *input_dev = dev->input; -- int x, y; -+ int ret; - -- mutex_lock(&hdaps_mtx); -+ stale_readout = 1; - -- if (__hdaps_read_pair(HDAPS_PORT_XPOS, HDAPS_PORT_YPOS, &x, &y)) -- goto out; -+ /* Cannot sleep. Try nonblockingly. If we fail, try again later. */ -+ if (thinkpad_ec_try_lock()) -+ goto keep_active; - -- input_report_abs(input_dev, ABS_X, x - rest_x); -- input_report_abs(input_dev, ABS_Y, y - rest_y); -- input_sync(input_dev); -+ ret = __hdaps_update(1); /* fast update, we're in softirq context */ -+ thinkpad_ec_unlock(); -+ /* Any of "successful", "not yet ready" and "not prefetched"? */ -+ if (ret != 0 && ret != -EBUSY && ret != -ENODATA) { -+ pr_err("poll failed, disabling updates\n"); -+ return; -+ } - --out: -- mutex_unlock(&hdaps_mtx); -+keep_active: -+ /* Even if we failed now, pos_x,y may have been updated earlier: */ -+ input_report_abs(hdaps_idev, ABS_X, pos_x - rest_x); -+ input_report_abs(hdaps_idev, ABS_Y, pos_y - rest_y); -+ input_sync(hdaps_idev); -+ input_report_abs(hdaps_idev_raw, ABS_X, pos_x); -+ input_report_abs(hdaps_idev_raw, ABS_Y, pos_y); -+ input_sync(hdaps_idev_raw); -+ mod_timer(&hdaps_timer, jiffies + HZ/sampling_rate); - } - - -@@ -354,65 +509,41 @@ static void hdaps_mousedev_poll(struct input_polled_dev *dev) - static ssize_t hdaps_position_show(struct device *dev, - struct device_attribute *attr, char *buf) - { -- int ret, x, y; -- -- ret = hdaps_read_pair(HDAPS_PORT_XPOS, HDAPS_PORT_YPOS, &x, &y); -+ int ret = hdaps_update(); - if (ret) - return ret; -- -- return sprintf(buf, "(%d,%d)\n", x, y); --} -- --static ssize_t hdaps_variance_show(struct device *dev, -- struct device_attribute *attr, char *buf) --{ -- int ret, x, y; -- -- ret = 
hdaps_read_pair(HDAPS_PORT_XVAR, HDAPS_PORT_YVAR, &x, &y); -- if (ret) -- return ret; -- -- return sprintf(buf, "(%d,%d)\n", x, y); -+ return sprintf(buf, "(%d,%d)\n", pos_x, pos_y); - } - - static ssize_t hdaps_temp1_show(struct device *dev, - struct device_attribute *attr, char *buf) - { -- u8 uninitialized_var(temp); -- int ret; -- -- ret = hdaps_readb_one(HDAPS_PORT_TEMP1, &temp); -- if (ret) -- return ret; -- -- return sprintf(buf, "%u\n", temp); --} -- --static ssize_t hdaps_temp2_show(struct device *dev, -- struct device_attribute *attr, char *buf) --{ -- u8 uninitialized_var(temp); -- int ret; -- -- ret = hdaps_readb_one(HDAPS_PORT_TEMP2, &temp); -+ int ret = hdaps_update(); - if (ret) - return ret; -- -- return sprintf(buf, "%u\n", temp); -+ return sprintf(buf, "%d\n", temperature); - } - - static ssize_t hdaps_keyboard_activity_show(struct device *dev, - struct device_attribute *attr, - char *buf) - { -- return sprintf(buf, "%u\n", KEYBD_ISSET(km_activity)); -+ int ret = hdaps_update(); -+ if (ret) -+ return ret; -+ return sprintf(buf, "%u\n", -+ get_jiffies_64() < last_keyboard_jiffies + KMACT_REMEMBER_PERIOD); - } - - static ssize_t hdaps_mouse_activity_show(struct device *dev, - struct device_attribute *attr, - char *buf) - { -- return sprintf(buf, "%u\n", MOUSE_ISSET(km_activity)); -+ int ret = hdaps_update(); -+ if (ret) -+ return ret; -+ return sprintf(buf, "%u\n", -+ get_jiffies_64() < last_mouse_jiffies + KMACT_REMEMBER_PERIOD); - } - - static ssize_t hdaps_calibrate_show(struct device *dev, -@@ -425,10 +556,7 @@ static ssize_t hdaps_calibrate_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) - { -- mutex_lock(&hdaps_mtx); - hdaps_calibrate(); -- mutex_unlock(&hdaps_mtx); -- - return count; - } - -@@ -445,7 +573,7 @@ static ssize_t hdaps_invert_store(struct device *dev, - int invert; - - if (sscanf(buf, "%d", &invert) != 1 || -- invert < 0 || invert > HDAPS_BOTH_AXES) -+ invert < 0 || invert > 
HDAPS_ORIENT_MAX) - return -EINVAL; - - hdaps_invert = invert; -@@ -454,24 +582,128 @@ static ssize_t hdaps_invert_store(struct device *dev, - return count; - } - -+static ssize_t hdaps_sampling_rate_show( -+ struct device *dev, struct device_attribute *attr, char *buf) -+{ -+ return sprintf(buf, "%d\n", sampling_rate); -+} -+ -+static ssize_t hdaps_sampling_rate_store( -+ struct device *dev, struct device_attribute *attr, -+ const char *buf, size_t count) -+{ -+ int rate, ret; -+ if (sscanf(buf, "%d", &rate) != 1 || rate > HZ || rate <= 0) { -+ pr_warn("must have 0ident); -- return 1; --} -- - /* hdaps_dmi_match_invert - found an inverted match. */ - static int __init hdaps_dmi_match_invert(const struct dmi_system_id *id) - { -- hdaps_invert = (unsigned long)id->driver_data; -- pr_info("inverting axis (%u) readings\n", hdaps_invert); -- return hdaps_dmi_match(id); -+ unsigned int orient = (kernel_ulong_t) id->driver_data; -+ hdaps_invert = orient; -+ pr_info("%s detected, setting orientation %u\n", id->ident, orient); -+ return 1; /* stop enumeration */ - } - --#define HDAPS_DMI_MATCH_INVERT(vendor, model, axes) { \ -+#define HDAPS_DMI_MATCH_INVERT(vendor, model, orient) { \ - .ident = vendor " " model, \ - .callback = hdaps_dmi_match_invert, \ -- .driver_data = (void *)axes, \ -+ .driver_data = (void *)(orient), \ - .matches = { \ - DMI_MATCH(DMI_BOARD_VENDOR, vendor), \ - DMI_MATCH(DMI_PRODUCT_VERSION, model) \ - } \ - } - --#define HDAPS_DMI_MATCH_NORMAL(vendor, model) \ -- HDAPS_DMI_MATCH_INVERT(vendor, model, 0) -- --/* Note that HDAPS_DMI_MATCH_NORMAL("ThinkPad T42") would match -- "ThinkPad T42p", so the order of the entries matters. -- If your ThinkPad is not recognized, please update to latest -- BIOS. This is especially the case for some R52 ThinkPads. 
*/ --static const struct dmi_system_id hdaps_whitelist[] __initconst = { -- HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad R50p", HDAPS_BOTH_AXES), -- HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad R50"), -- HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad R51"), -- HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad R52"), -- HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R61i", HDAPS_BOTH_AXES), -- HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R61", HDAPS_BOTH_AXES), -- HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad T41p", HDAPS_BOTH_AXES), -- HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad T41"), -- HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad T42p", HDAPS_BOTH_AXES), -- HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad T42"), -- HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad T43"), -- HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T400", HDAPS_BOTH_AXES), -- HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T60", HDAPS_BOTH_AXES), -- HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T61p", HDAPS_BOTH_AXES), -- HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T61", HDAPS_BOTH_AXES), -- HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad X40"), -- HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad X41", HDAPS_Y_AXIS), -- HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X60", HDAPS_BOTH_AXES), -- HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X61s", HDAPS_BOTH_AXES), -- HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X61", HDAPS_BOTH_AXES), -- HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad Z60m"), -- HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad Z61m", HDAPS_BOTH_AXES), -- HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad Z61p", HDAPS_BOTH_AXES), -+/* List of models with abnormal axis configuration. -+ Note that HDAPS_DMI_MATCH_NORMAL("ThinkPad T42") would match -+ "ThinkPad T42p", and enumeration stops after first match, -+ so the order of the entries matters. 
*/ -+const struct dmi_system_id hdaps_whitelist[] __initconst = { -+ HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad R50p", HDAPS_ORIENT_INVERT_XY), -+ HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad R60", HDAPS_ORIENT_INVERT_XY), -+ HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad T41p", HDAPS_ORIENT_INVERT_XY), -+ HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad T42p", HDAPS_ORIENT_INVERT_XY), -+ HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad X40", HDAPS_ORIENT_INVERT_Y), -+ HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad X41", HDAPS_ORIENT_INVERT_Y), -+ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R60", HDAPS_ORIENT_INVERT_XY), -+ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R61", HDAPS_ORIENT_INVERT_XY), -+ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R400", HDAPS_ORIENT_INVERT_XY), -+ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R500", HDAPS_ORIENT_INVERT_XY), -+ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T60", HDAPS_ORIENT_INVERT_XY), -+ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T61", HDAPS_ORIENT_INVERT_XY), -+ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X60 Tablet", HDAPS_ORIENT_INVERT_Y), -+ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X60s", HDAPS_ORIENT_INVERT_Y), -+ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X60", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_X), -+ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X61", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_X), -+ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T400s", HDAPS_ORIENT_INVERT_X), -+ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T400", HDAPS_ORIENT_INVERT_XY), -+ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T410s", HDAPS_ORIENT_SWAP), -+ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T410", HDAPS_ORIENT_INVERT_XY), -+ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T500", HDAPS_ORIENT_INVERT_XY), -+ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T510", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_X | HDAPS_ORIENT_INVERT_Y), -+ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad W510", HDAPS_ORIENT_MAX), -+ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad W520", HDAPS_ORIENT_MAX), -+ 
HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X200s", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_XY), -+ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X200", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_X | HDAPS_ORIENT_INVERT_Y), -+ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X201 Tablet", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_XY), -+ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X201s", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_XY), -+ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X201", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_X), -+ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X220", HDAPS_ORIENT_SWAP), - { .ident = NULL } - }; - - static int __init hdaps_init(void) - { -- struct input_dev *idev; - int ret; - -- if (!dmi_check_system(hdaps_whitelist)) { -- pr_warn("supported laptop not found!\n"); -- ret = -ENODEV; -- goto out; -- } -- -- if (!request_region(HDAPS_LOW_PORT, HDAPS_NR_PORTS, "hdaps")) { -- ret = -ENXIO; -- goto out; -- } -- -+ /* Determine axis orientation orientation */ -+ if (hdaps_invert == HDAPS_ORIENT_UNDEFINED) /* set by module param? */ -+ if (dmi_check_system(hdaps_whitelist) < 1) /* in whitelist? 
*/ -+ hdaps_invert = 0; /* default */ -+ -+ /* Init timer before platform_driver_register, in case of suspend */ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4,15,0) -+ init_timer(&hdaps_timer); -+ hdaps_timer.function = hdaps_mousedev_poll; -+#else -+ timer_setup(&hdaps_timer, hdaps_mousedev_poll, 0); -+#endif - ret = platform_driver_register(&hdaps_driver); - if (ret) -- goto out_region; -+ goto out; - - pdev = platform_device_register_simple("hdaps", -1, NULL, 0); - if (IS_ERR(pdev)) { -@@ -571,47 +801,79 @@ static int __init hdaps_init(void) - if (ret) - goto out_device; - -- hdaps_idev = input_allocate_polled_device(); -+ hdaps_idev = input_allocate_device(); - if (!hdaps_idev) { - ret = -ENOMEM; - goto out_group; - } - -- hdaps_idev->poll = hdaps_mousedev_poll; -- hdaps_idev->poll_interval = HDAPS_POLL_INTERVAL; -- -- /* initial calibrate for the input device */ -- hdaps_calibrate(); -+ hdaps_idev_raw = input_allocate_device(); -+ if (!hdaps_idev_raw) { -+ ret = -ENOMEM; -+ goto out_idev_first; -+ } - -- /* initialize the input class */ -- idev = hdaps_idev->input; -- idev->name = "hdaps"; -- idev->phys = "isa1600/input0"; -- idev->id.bustype = BUS_ISA; -- idev->dev.parent = &pdev->dev; -- idev->evbit[0] = BIT_MASK(EV_ABS); -- input_set_abs_params(idev, ABS_X, -+ /* calibration for the input device (deferred to avoid delay) */ -+ needs_calibration = 1; -+ -+ /* initialize the joystick-like fuzzed input device */ -+ hdaps_idev->name = "ThinkPad HDAPS joystick emulation"; -+ hdaps_idev->phys = "hdaps/input0"; -+ hdaps_idev->id.bustype = BUS_HOST; -+ hdaps_idev->id.vendor = HDAPS_INPUT_VENDOR; -+ hdaps_idev->id.product = HDAPS_INPUT_PRODUCT; -+ hdaps_idev->id.version = HDAPS_INPUT_JS_VERSION; -+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25) -+ hdaps_idev->cdev.dev = &pdev->dev; -+#endif -+ hdaps_idev->evbit[0] = BIT(EV_ABS); -+ hdaps_idev->open = hdaps_mousedev_open; -+ hdaps_idev->close = hdaps_mousedev_close; -+ input_set_abs_params(hdaps_idev, ABS_X, - -256, 
256, HDAPS_INPUT_FUZZ, HDAPS_INPUT_FLAT); -- input_set_abs_params(idev, ABS_Y, -+ input_set_abs_params(hdaps_idev, ABS_Y, - -256, 256, HDAPS_INPUT_FUZZ, HDAPS_INPUT_FLAT); - -- ret = input_register_polled_device(hdaps_idev); -+ ret = input_register_device(hdaps_idev); - if (ret) - goto out_idev; - -- pr_info("driver successfully loaded\n"); -+ /* initialize the raw data input device */ -+ hdaps_idev_raw->name = "ThinkPad HDAPS accelerometer data"; -+ hdaps_idev_raw->phys = "hdaps/input1"; -+ hdaps_idev_raw->id.bustype = BUS_HOST; -+ hdaps_idev_raw->id.vendor = HDAPS_INPUT_VENDOR; -+ hdaps_idev_raw->id.product = HDAPS_INPUT_PRODUCT; -+ hdaps_idev_raw->id.version = HDAPS_INPUT_RAW_VERSION; -+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25) -+ hdaps_idev_raw->cdev.dev = &pdev->dev; -+#endif -+ hdaps_idev_raw->evbit[0] = BIT(EV_ABS); -+ hdaps_idev_raw->open = hdaps_mousedev_open; -+ hdaps_idev_raw->close = hdaps_mousedev_close; -+ input_set_abs_params(hdaps_idev_raw, ABS_X, -32768, 32767, 0, 0); -+ input_set_abs_params(hdaps_idev_raw, ABS_Y, -32768, 32767, 0, 0); -+ -+ ret = input_register_device(hdaps_idev_raw); -+ if (ret) -+ goto out_idev_reg_first; -+ -+ pr_info("driver successfully loaded.\n"); - return 0; - -+out_idev_reg_first: -+ input_unregister_device(hdaps_idev); - out_idev: -- input_free_polled_device(hdaps_idev); -+ input_free_device(hdaps_idev_raw); -+out_idev_first: -+ input_free_device(hdaps_idev); - out_group: - sysfs_remove_group(&pdev->dev.kobj, &hdaps_attribute_group); - out_device: - platform_device_unregister(pdev); - out_driver: - platform_driver_unregister(&hdaps_driver); --out_region: -- release_region(HDAPS_LOW_PORT, HDAPS_NR_PORTS); -+ hdaps_device_shutdown(); - out: - pr_warn("driver init failed (ret=%d)!\n", ret); - return ret; -@@ -619,12 +881,12 @@ static int __init hdaps_init(void) - - static void __exit hdaps_exit(void) - { -- input_unregister_polled_device(hdaps_idev); -- input_free_polled_device(hdaps_idev); -+ 
input_unregister_device(hdaps_idev_raw); -+ input_unregister_device(hdaps_idev); -+ hdaps_device_shutdown(); /* ignore errors, effect is negligible */ - sysfs_remove_group(&pdev->dev.kobj, &hdaps_attribute_group); - platform_device_unregister(pdev); - platform_driver_unregister(&hdaps_driver); -- release_region(HDAPS_LOW_PORT, HDAPS_NR_PORTS); - - pr_info("driver unloaded\n"); - } -@@ -632,9 +894,8 @@ static void __exit hdaps_exit(void) - module_init(hdaps_init); - module_exit(hdaps_exit); - --module_param_named(invert, hdaps_invert, int, 0); --MODULE_PARM_DESC(invert, "invert data along each axis. 1 invert x-axis, " -- "2 invert y-axis, 3 invert both axes."); -+module_param_named(invert, hdaps_invert, uint, 0); -+MODULE_PARM_DESC(invert, "axis orientation code"); - - MODULE_AUTHOR("Robert Love"); - MODULE_DESCRIPTION("IBM Hard Drive Active Protection System (HDAPS) driver"); -diff --git a/drivers/platform/x86/thinkpad_ec.c b/drivers/platform/x86/thinkpad_ec.c -new file mode 100644 -index 000000000000..597614bc17e6 ---- /dev/null -+++ b/drivers/platform/x86/thinkpad_ec.c -@@ -0,0 +1,513 @@ -+/* -+ * thinkpad_ec.c - ThinkPad embedded controller LPC3 functions -+ * -+ * The embedded controller on ThinkPad laptops has a non-standard interface, -+ * where LPC channel 3 of the H8S EC chip is hooked up to IO ports -+ * 0x1600-0x161F and implements (a special case of) the H8S LPC protocol. -+ * The EC LPC interface provides various system management services (currently -+ * known: battery information and accelerometer readouts). This driver -+ * provides access and mutual exclusion for the EC interface. 
-+* -+ * The LPC protocol and terminology are documented here: -+ * "H8S/2104B Group Hardware Manual", -+ * http://documentation.renesas.com/eng/products/mpumcu/rej09b0300_2140bhm.pdf -+ * -+ * Copyright (C) 2006-2007 Shem Multinymous -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) -+ #include -+#else -+ #include -+#endif -+ -+#define TP_VERSION "0.42" -+ -+MODULE_AUTHOR("Shem Multinymous"); -+MODULE_DESCRIPTION("ThinkPad embedded controller hardware access"); -+MODULE_VERSION(TP_VERSION); -+MODULE_LICENSE("GPL"); -+ -+/* IO ports used by embedded controller LPC channel 3: */ -+#define TPC_BASE_PORT 0x1600 -+#define TPC_NUM_PORTS 0x20 -+#define TPC_STR3_PORT 0x1604 /* Reads H8S EC register STR3 */ -+#define TPC_TWR0_PORT 0x1610 /* Mapped to H8S EC register TWR0MW/SW */ -+#define TPC_TWR15_PORT 0x161F /* Mapped to H8S EC register TWR15. 
*/ -+ /* (and port TPC_TWR0_PORT+i is mapped to H8S reg TWRi for 00x%02x", \ -+ msg, args->val[0x0], args->val[0xF], code) -+ -+/* State of request prefetching: */ -+static u8 prefetch_arg0, prefetch_argF; /* Args of last prefetch */ -+static u64 prefetch_jiffies; /* time of prefetch, or: */ -+#define TPC_PREFETCH_NONE INITIAL_JIFFIES /* No prefetch */ -+#define TPC_PREFETCH_JUNK (INITIAL_JIFFIES+1) /* Ignore prefetch */ -+ -+/* Locking: */ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37) -+static DECLARE_MUTEX(thinkpad_ec_mutex); -+#else -+static DEFINE_SEMAPHORE(thinkpad_ec_mutex); -+#endif -+ -+/* Kludge in case the ACPI DSDT reserves the ports we need. */ -+static bool force_io; /* Willing to do IO to ports we couldn't reserve? */ -+static int reserved_io; /* Successfully reserved the ports? */ -+module_param_named(force_io, force_io, bool, 0600); -+MODULE_PARM_DESC(force_io, "Force IO even if region already reserved (0=off, 1=on)"); -+ -+/** -+ * thinkpad_ec_lock - get lock on the ThinkPad EC -+ * -+ * Get exclusive lock for accesing the ThinkPad embedded controller LPC3 -+ * interface. Returns 0 iff lock acquired. -+ */ -+int thinkpad_ec_lock(void) -+{ -+ int ret; -+ ret = down_interruptible(&thinkpad_ec_mutex); -+ return ret; -+} -+EXPORT_SYMBOL_GPL(thinkpad_ec_lock); -+ -+/** -+ * thinkpad_ec_try_lock - try getting lock on the ThinkPad EC -+ * -+ * Try getting an exclusive lock for accesing the ThinkPad embedded -+ * controller LPC3. Returns immediately if lock is not available; neither -+ * blocks nor sleeps. Returns 0 iff lock acquired . -+ */ -+int thinkpad_ec_try_lock(void) -+{ -+ return down_trylock(&thinkpad_ec_mutex); -+} -+EXPORT_SYMBOL_GPL(thinkpad_ec_try_lock); -+ -+/** -+ * thinkpad_ec_unlock - release lock on ThinkPad EC -+ * -+ * Release a previously acquired exclusive lock on the ThinkPad ebmedded -+ * controller LPC3 interface. 
-+ */ -+void thinkpad_ec_unlock(void) -+{ -+ up(&thinkpad_ec_mutex); -+} -+EXPORT_SYMBOL_GPL(thinkpad_ec_unlock); -+ -+/** -+ * thinkpad_ec_request_row - tell embedded controller to prepare a row -+ * @args Input register arguments -+ * -+ * Requests a data row by writing to H8S LPC registers TRW0 through TWR15 (or -+ * a subset thereof) following the protocol prescribed by the "H8S/2104B Group -+ * Hardware Manual". Does sanity checks via status register STR3. -+ */ -+static int thinkpad_ec_request_row(const struct thinkpad_ec_row *args) -+{ -+ u8 str3; -+ int i; -+ -+ /* EC protocol requires write to TWR0 (function code): */ -+ if (!(args->mask & 0x0001)) { -+ printk(KERN_ERR MSG_FMT("bad args->mask=0x%02x", args->mask)); -+ return -EINVAL; -+ } -+ -+ /* Check initial STR3 status: */ -+ str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK; -+ if (str3 & H8S_STR3_OBF3B) { /* data already pending */ -+ inb(TPC_TWR15_PORT); /* marks end of previous transaction */ -+ if (prefetch_jiffies == TPC_PREFETCH_NONE) -+ printk(KERN_WARNING REQ_FMT( -+ "EC has result from unrequested transaction", -+ str3)); -+ return -EBUSY; /* EC will be ready in a few usecs */ -+ } else if (str3 == H8S_STR3_SWMF) { /* busy with previous request */ -+ if (prefetch_jiffies == TPC_PREFETCH_NONE) -+ printk(KERN_WARNING REQ_FMT( -+ "EC is busy with unrequested transaction", -+ str3)); -+ return -EBUSY; /* data will be pending in a few usecs */ -+ } else if (str3 != 0x00) { /* unexpected status? */ -+ printk(KERN_WARNING REQ_FMT("unexpected initial STR3", str3)); -+ return -EIO; -+ } -+ -+ /* Send TWR0MW: */ -+ outb(args->val[0], TPC_TWR0_PORT); -+ str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK; -+ if (str3 != H8S_STR3_MWMF) { /* not accepted? */ -+ printk(KERN_WARNING REQ_FMT("arg0 rejected", str3)); -+ return -EIO; -+ } -+ -+ /* Send TWR1 through TWR14: */ -+ for (i = 1; i < TP_CONTROLLER_ROW_LEN-1; i++) -+ if ((args->mask>>i)&1) -+ outb(args->val[i], TPC_TWR0_PORT+i); -+ -+ /* Send TWR15 (default to 0x01). 
This marks end of command. */ -+ outb((args->mask & 0x8000) ? args->val[0xF] : 0x01, TPC_TWR15_PORT); -+ -+ /* Wait until EC starts writing its reply (~60ns on average). -+ * Releasing locks before this happens may cause an EC hang -+ * due to firmware bug! -+ */ -+ for (i = 0; i < TPC_REQUEST_RETRIES; i++) { -+ str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK; -+ if (str3 & H8S_STR3_SWMF) /* EC started replying */ -+ return 0; -+ else if (!(str3 & ~(H8S_STR3_IBF3B|H8S_STR3_MWMF))) -+ /* Normal progress (the EC hasn't seen the request -+ * yet, or is processing it). Wait it out. */ -+ ndelay(TPC_REQUEST_NDELAY); -+ else { /* weird EC status */ -+ printk(KERN_WARNING -+ REQ_FMT("bad end STR3", str3)); -+ return -EIO; -+ } -+ } -+ printk(KERN_WARNING REQ_FMT("EC is mysteriously silent", str3)); -+ return -EIO; -+} -+ -+/** -+ * thinkpad_ec_read_data - read pre-requested row-data from EC -+ * @args Input register arguments of pre-requested rows -+ * @data Output register values -+ * -+ * Reads current row data from the controller, assuming it's already -+ * requested. Follows the H8S spec for register access and status checks. -+ */ -+static int thinkpad_ec_read_data(const struct thinkpad_ec_row *args, -+ struct thinkpad_ec_row *data) -+{ -+ int i; -+ u8 str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK; -+ /* Once we make a request, STR3 assumes the sequence of values listed -+ * in the following 'if' as it reads the request and writes its data. -+ * It takes about a few dozen nanosecs total, with very high variance. -+ */ -+ if (str3 == (H8S_STR3_IBF3B|H8S_STR3_MWMF) || -+ str3 == 0x00 || /* the 0x00 is indistinguishable from idle EC! 
*/ -+ str3 == H8S_STR3_SWMF) -+ return -EBUSY; /* not ready yet */ -+ /* Finally, the EC signals output buffer full: */ -+ if (str3 != (H8S_STR3_OBF3B|H8S_STR3_SWMF)) { -+ printk(KERN_WARNING -+ REQ_FMT("bad initial STR3", str3)); -+ return -EIO; -+ } -+ -+ /* Read first byte (signals start of read transactions): */ -+ data->val[0] = inb(TPC_TWR0_PORT); -+ /* Optionally read 14 more bytes: */ -+ for (i = 1; i < TP_CONTROLLER_ROW_LEN-1; i++) -+ if ((data->mask >> i)&1) -+ data->val[i] = inb(TPC_TWR0_PORT+i); -+ /* Read last byte from 0x161F (signals end of read transaction): */ -+ data->val[0xF] = inb(TPC_TWR15_PORT); -+ -+ /* Readout still pending? */ -+ str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK; -+ if (str3 & H8S_STR3_OBF3B) -+ printk(KERN_WARNING -+ REQ_FMT("OBF3B=1 after read", str3)); -+ /* If port 0x161F returns 0x80 too often, the EC may lock up. Warn: */ -+ if (data->val[0xF] == 0x80) -+ printk(KERN_WARNING -+ REQ_FMT("0x161F reports error", data->val[0xF])); -+ return 0; -+} -+ -+/** -+ * thinkpad_ec_is_row_fetched - is the given row currently prefetched? -+ * -+ * To keep things simple we compare only the first and last args; -+ * this suffices for all known cases. -+ */ -+static int thinkpad_ec_is_row_fetched(const struct thinkpad_ec_row *args) -+{ -+ return (prefetch_jiffies != TPC_PREFETCH_NONE) && -+ (prefetch_jiffies != TPC_PREFETCH_JUNK) && -+ (prefetch_arg0 == args->val[0]) && -+ (prefetch_argF == args->val[0xF]) && -+ (get_jiffies_64() < prefetch_jiffies + TPC_PREFETCH_TIMEOUT); -+} -+ -+/** -+ * thinkpad_ec_read_row - request and read data from ThinkPad EC -+ * @args Input register arguments -+ * @data Output register values -+ * -+ * Read a data row from the ThinkPad embedded controller LPC3 interface. -+ * Does fetching and retrying if needed. The row is specified by an -+ * array of 16 bytes, some of which may be undefined (but the first is -+ * mandatory). 
These bytes are given in @args->val[], where @args->val[i] is -+ * used iff (@args->mask>>i)&1). The resulting row data is stored in -+ * @data->val[], but is only guaranteed to be valid for indices corresponding -+ * to set bit in @data->mask. That is, if @data->mask&(1<val[i] is undefined. -+ * -+ * Returns -EBUSY on transient error and -EIO on abnormal condition. -+ * Caller must hold controller lock. -+ */ -+int thinkpad_ec_read_row(const struct thinkpad_ec_row *args, -+ struct thinkpad_ec_row *data) -+{ -+ int retries, ret; -+ -+ if (thinkpad_ec_is_row_fetched(args)) -+ goto read_row; /* already requested */ -+ -+ /* Request the row */ -+ for (retries = 0; retries < TPC_READ_RETRIES; ++retries) { -+ ret = thinkpad_ec_request_row(args); -+ if (!ret) -+ goto read_row; -+ if (ret != -EBUSY) -+ break; -+ ndelay(TPC_READ_NDELAY); -+ } -+ printk(KERN_ERR REQ_FMT("failed requesting row", ret)); -+ goto out; -+ -+read_row: -+ /* Read the row's data */ -+ for (retries = 0; retries < TPC_READ_RETRIES; ++retries) { -+ ret = thinkpad_ec_read_data(args, data); -+ if (!ret) -+ goto out; -+ if (ret != -EBUSY) -+ break; -+ ndelay(TPC_READ_NDELAY); -+ } -+ -+ printk(KERN_ERR REQ_FMT("failed waiting for data", ret)); -+ -+out: -+ prefetch_jiffies = TPC_PREFETCH_JUNK; -+ return ret; -+} -+EXPORT_SYMBOL_GPL(thinkpad_ec_read_row); -+ -+/** -+ * thinkpad_ec_try_read_row - try reading prefetched data from ThinkPad EC -+ * @args Input register arguments -+ * @data Output register values -+ * -+ * Try reading a data row from the ThinkPad embedded controller LPC3 -+ * interface, if this raw was recently prefetched using -+ * thinkpad_ec_prefetch_row(). Does not fetch, retry or block. -+ * The parameters have the same meaning as in thinkpad_ec_read_row(). -+ * -+ * Returns -EBUSY is data not ready and -ENODATA if row not prefetched. -+ * Caller must hold controller lock. 
-+ */ -+int thinkpad_ec_try_read_row(const struct thinkpad_ec_row *args, -+ struct thinkpad_ec_row *data) -+{ -+ int ret; -+ if (!thinkpad_ec_is_row_fetched(args)) { -+ ret = -ENODATA; -+ } else { -+ ret = thinkpad_ec_read_data(args, data); -+ if (!ret) -+ prefetch_jiffies = TPC_PREFETCH_NONE; /* eaten up */ -+ } -+ return ret; -+} -+EXPORT_SYMBOL_GPL(thinkpad_ec_try_read_row); -+ -+/** -+ * thinkpad_ec_prefetch_row - prefetch data from ThinkPad EC -+ * @args Input register arguments -+ * -+ * Prefetch a data row from the ThinkPad embedded controller LCP3 -+ * interface. A subsequent call to thinkpad_ec_read_row() with the -+ * same arguments will be faster, and a subsequent call to -+ * thinkpad_ec_try_read_row() stands a good chance of succeeding if -+ * done neither too soon nor too late. See -+ * thinkpad_ec_read_row() for the meaning of @args. -+ * -+ * Returns -EBUSY on transient error and -EIO on abnormal condition. -+ * Caller must hold controller lock. -+ */ -+int thinkpad_ec_prefetch_row(const struct thinkpad_ec_row *args) -+{ -+ int ret; -+ ret = thinkpad_ec_request_row(args); -+ if (ret) { -+ prefetch_jiffies = TPC_PREFETCH_JUNK; -+ } else { -+ prefetch_jiffies = get_jiffies_64(); -+ prefetch_arg0 = args->val[0x0]; -+ prefetch_argF = args->val[0xF]; -+ } -+ return ret; -+} -+EXPORT_SYMBOL_GPL(thinkpad_ec_prefetch_row); -+ -+/** -+ * thinkpad_ec_invalidate - invalidate prefetched ThinkPad EC data -+ * -+ * Invalidate the data prefetched via thinkpad_ec_prefetch_row() from the -+ * ThinkPad embedded controller LPC3 interface. -+ * Must be called before unlocking by any code that accesses the controller -+ * ports directly. 
-+ */ -+void thinkpad_ec_invalidate(void) -+{ -+ prefetch_jiffies = TPC_PREFETCH_JUNK; -+} -+EXPORT_SYMBOL_GPL(thinkpad_ec_invalidate); -+ -+ -+/*** Checking for EC hardware ***/ -+ -+/** -+ * thinkpad_ec_test - verify the EC is present and follows protocol -+ * -+ * Ensure the EC LPC3 channel really works on this machine by making -+ * an EC request and seeing if the EC follows the documented H8S protocol. -+ * The requested row just reads battery status, so it should be harmless to -+ * access it (on a correct EC). -+ * This test writes to IO ports, so execute only after checking DMI. -+ */ -+static int __init thinkpad_ec_test(void) -+{ -+ int ret; -+ const struct thinkpad_ec_row args = /* battery 0 basic status */ -+ { .mask = 0x8001, .val = {0x01,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0x00} }; -+ struct thinkpad_ec_row data = { .mask = 0x0000 }; -+ ret = thinkpad_ec_lock(); -+ if (ret) -+ return ret; -+ ret = thinkpad_ec_read_row(&args, &data); -+ thinkpad_ec_unlock(); -+ return ret; -+} -+ -+/* Search all DMI device names of a given type for a substring */ -+static int __init dmi_find_substring(int type, const char *substr) -+{ -+ const struct dmi_device *dev = NULL; -+ while ((dev = dmi_find_device(type, NULL, dev))) { -+ if (strstr(dev->name, substr)) -+ return 1; -+ } -+ return 0; -+} -+ -+#define TP_DMI_MATCH(vendor,model) { \ -+ .ident = vendor " " model, \ -+ .matches = { \ -+ DMI_MATCH(DMI_BOARD_VENDOR, vendor), \ -+ DMI_MATCH(DMI_PRODUCT_VERSION, model) \ -+ } \ -+} -+ -+/* Check DMI for existence of ThinkPad embedded controller */ -+static int __init check_dmi_for_ec(void) -+{ -+ /* A few old models that have a good EC but don't report it in DMI */ -+ struct dmi_system_id tp_whitelist[] = { -+ TP_DMI_MATCH("IBM", "ThinkPad A30"), -+ TP_DMI_MATCH("IBM", "ThinkPad T23"), -+ TP_DMI_MATCH("IBM", "ThinkPad X24"), -+ TP_DMI_MATCH("LENOVO", "ThinkPad"), -+ { .ident = NULL } -+ }; -+ return dmi_find_substring(DMI_DEV_TYPE_OEM_STRING, -+ "IBM ThinkPad Embedded 
Controller") || -+ dmi_check_system(tp_whitelist); -+} -+ -+/*** Init and cleanup ***/ -+ -+static int __init thinkpad_ec_init(void) -+{ -+ if (!check_dmi_for_ec()) { -+ printk(KERN_WARNING -+ "thinkpad_ec: no ThinkPad embedded controller!\n"); -+ return -ENODEV; -+ } -+ -+ if (request_region(TPC_BASE_PORT, TPC_NUM_PORTS, "thinkpad_ec")) { -+ reserved_io = 1; -+ } else { -+ printk(KERN_ERR "thinkpad_ec: cannot claim IO ports %#x-%#x... ", -+ TPC_BASE_PORT, -+ TPC_BASE_PORT + TPC_NUM_PORTS - 1); -+ if (force_io) { -+ printk("forcing use of unreserved IO ports.\n"); -+ } else { -+ printk("consider using force_io=1.\n"); -+ return -ENXIO; -+ } -+ } -+ prefetch_jiffies = TPC_PREFETCH_JUNK; -+ if (thinkpad_ec_test()) { -+ printk(KERN_ERR "thinkpad_ec: initial ec test failed\n"); -+ if (reserved_io) -+ release_region(TPC_BASE_PORT, TPC_NUM_PORTS); -+ return -ENXIO; -+ } -+ printk(KERN_INFO "thinkpad_ec: thinkpad_ec " TP_VERSION " loaded.\n"); -+ return 0; -+} -+ -+static void __exit thinkpad_ec_exit(void) -+{ -+ if (reserved_io) -+ release_region(TPC_BASE_PORT, TPC_NUM_PORTS); -+ printk(KERN_INFO "thinkpad_ec: unloaded.\n"); -+} -+ -+module_init(thinkpad_ec_init); -+module_exit(thinkpad_ec_exit); -diff --git a/drivers/platform/x86/tp_smapi.c b/drivers/platform/x86/tp_smapi.c -new file mode 100644 -index 000000000000..209cb6487e24 ---- /dev/null -+++ b/drivers/platform/x86/tp_smapi.c -@@ -0,0 +1,1493 @@ -+/* -+ * tp_smapi.c - ThinkPad SMAPI support -+ * -+ * This driver exposes some features of the System Management Application -+ * Program Interface (SMAPI) BIOS found on ThinkPad laptops. It works on -+ * models in which the SMAPI BIOS runs in SMM and is invoked by writing -+ * to the APM control port 0xB2. -+ * It also exposes battery status information, obtained from the ThinkPad -+ * embedded controller (via the thinkpad_ec module). -+ * Ancient ThinkPad models use a different interface, supported by the -+ * "thinkpad" module from "tpctl". 
-+ * -+ * Many of the battery status values obtained from the EC simply mirror -+ * values provided by the battery's Smart Battery System (SBS) interface, so -+ * their meaning is defined by the Smart Battery Data Specification (see -+ * http://sbs-forum.org/specs/sbdat110.pdf). References to this SBS spec -+ * are given in the code where relevant. -+ * -+ * Copyright (C) 2006 Shem Multinymous . -+ * SMAPI access code based on the mwave driver by Mike Sullivan. -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include /* CMOS defines */ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#define TP_VERSION "0.42" -+#define TP_DESC "ThinkPad SMAPI Support" -+#define TP_DIR "smapi" -+ -+MODULE_AUTHOR("Shem Multinymous"); -+MODULE_DESCRIPTION(TP_DESC); -+MODULE_VERSION(TP_VERSION); -+MODULE_LICENSE("GPL"); -+ -+static struct platform_device *pdev; -+ -+static int tp_debug; -+module_param_named(debug, tp_debug, int, 0600); -+MODULE_PARM_DESC(debug, "Debug level (0=off, 1=on)"); -+ -+/* A few macros for printk()ing: */ -+#define TPRINTK(level, fmt, args...) \ -+ dev_printk(level, &(pdev->dev), "%s: " fmt "\n", __func__, ## args) -+#define DPRINTK(fmt, args...) 
\ -+ do { if (tp_debug) TPRINTK(KERN_DEBUG, fmt, ## args); } while (0) -+ -+/********************************************************************* -+ * SMAPI interface -+ */ -+ -+/* SMAPI functions (register BX when making the SMM call). */ -+#define SMAPI_GET_INHIBIT_CHARGE 0x2114 -+#define SMAPI_SET_INHIBIT_CHARGE 0x2115 -+#define SMAPI_GET_THRESH_START 0x2116 -+#define SMAPI_SET_THRESH_START 0x2117 -+#define SMAPI_GET_FORCE_DISCHARGE 0x2118 -+#define SMAPI_SET_FORCE_DISCHARGE 0x2119 -+#define SMAPI_GET_THRESH_STOP 0x211a -+#define SMAPI_SET_THRESH_STOP 0x211b -+ -+/* SMAPI error codes (see ThinkPad 770 Technical Reference Manual p.83 at -+ http://www-307.ibm.com/pc/support/site.wss/document.do?lndocid=PFAN-3TUQQD */ -+#define SMAPI_RETCODE_EOF 0xff -+static struct { u8 rc; char *msg; int ret; } smapi_retcode[] = -+{ -+ {0x00, "OK", 0}, -+ {0x53, "SMAPI function is not available", -ENXIO}, -+ {0x81, "Invalid parameter", -EINVAL}, -+ {0x86, "Function is not supported by SMAPI BIOS", -EOPNOTSUPP}, -+ {0x90, "System error", -EIO}, -+ {0x91, "System is invalid", -EIO}, -+ {0x92, "System is busy, -EBUSY"}, -+ {0xa0, "Device error (disk read error)", -EIO}, -+ {0xa1, "Device is busy", -EBUSY}, -+ {0xa2, "Device is not attached", -ENXIO}, -+ {0xa3, "Device is disbled", -EIO}, -+ {0xa4, "Request parameter is out of range", -EINVAL}, -+ {0xa5, "Request parameter is not accepted", -EINVAL}, -+ {0xa6, "Transient error", -EBUSY}, /* ? 
*/ -+ {SMAPI_RETCODE_EOF, "Unknown error code", -EIO} -+}; -+ -+ -+#define SMAPI_MAX_RETRIES 10 -+#define SMAPI_PORT2 0x4F /* fixed port, meaning unclear */ -+static unsigned short smapi_port; /* APM control port, normally 0xB2 */ -+ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37) -+static DECLARE_MUTEX(smapi_mutex); -+#else -+static DEFINE_SEMAPHORE(smapi_mutex); -+#endif -+ -+/** -+ * find_smapi_port - read SMAPI port from NVRAM -+ */ -+static int __init find_smapi_port(void) -+{ -+ u16 smapi_id = 0; -+ unsigned short port = 0; -+ unsigned long flags; -+ -+ spin_lock_irqsave(&rtc_lock, flags); -+ smapi_id = CMOS_READ(0x7C); -+ smapi_id |= (CMOS_READ(0x7D) << 8); -+ spin_unlock_irqrestore(&rtc_lock, flags); -+ -+ if (smapi_id != 0x5349) { -+ printk(KERN_ERR "SMAPI not supported (ID=0x%x)\n", smapi_id); -+ return -ENXIO; -+ } -+ spin_lock_irqsave(&rtc_lock, flags); -+ port = CMOS_READ(0x7E); -+ port |= (CMOS_READ(0x7F) << 8); -+ spin_unlock_irqrestore(&rtc_lock, flags); -+ if (port == 0) { -+ printk(KERN_ERR "unable to read SMAPI port number\n"); -+ return -ENXIO; -+ } -+ return port; -+} -+ -+/** -+ * smapi_request - make a SMAPI call -+ * @inEBX, @inECX, @inEDI, @inESI: input registers -+ * @outEBX, @outECX, @outEDX, @outEDI, @outESI: outputs registers -+ * @msg: textual error message -+ * Invokes the SMAPI SMBIOS with the given input and outpu args. -+ * All outputs are optional (can be %NULL). -+ * Returns 0 when successful, and a negative errno constant -+ * (see smapi_retcode above) upon failure. -+ */ -+static int smapi_request(u32 inEBX, u32 inECX, -+ u32 inEDI, u32 inESI, -+ u32 *outEBX, u32 *outECX, u32 *outEDX, -+ u32 *outEDI, u32 *outESI, const char **msg) -+{ -+ int ret = 0; -+ int i; -+ int retries; -+ u8 rc; -+ /* Must use local vars for output regs, due to reg pressure. 
*/ -+ u32 tmpEAX, tmpEBX, tmpECX, tmpEDX, tmpEDI, tmpESI; -+ -+ for (retries = 0; retries < SMAPI_MAX_RETRIES; ++retries) { -+ DPRINTK("req_in: BX=%x CX=%x DI=%x SI=%x", -+ inEBX, inECX, inEDI, inESI); -+ -+ /* SMAPI's SMBIOS call and thinkpad_ec end up using use -+ * different interfaces to the same chip, so play it safe. */ -+ ret = thinkpad_ec_lock(); -+ if (ret) -+ return ret; -+ -+ __asm__ __volatile__( -+ "movl $0x00005380,%%eax\n\t" -+ "movl %6,%%ebx\n\t" -+ "movl %7,%%ecx\n\t" -+ "movl %8,%%edi\n\t" -+ "movl %9,%%esi\n\t" -+ "xorl %%edx,%%edx\n\t" -+ "movw %10,%%dx\n\t" -+ "out %%al,%%dx\n\t" /* trigger SMI to SMBIOS */ -+ "out %%al,$0x4F\n\t" -+ "movl %%eax,%0\n\t" -+ "movl %%ebx,%1\n\t" -+ "movl %%ecx,%2\n\t" -+ "movl %%edx,%3\n\t" -+ "movl %%edi,%4\n\t" -+ "movl %%esi,%5\n\t" -+ :"=m"(tmpEAX), -+ "=m"(tmpEBX), -+ "=m"(tmpECX), -+ "=m"(tmpEDX), -+ "=m"(tmpEDI), -+ "=m"(tmpESI) -+ :"m"(inEBX), "m"(inECX), "m"(inEDI), "m"(inESI), -+ "m"((u16)smapi_port) -+ :"%eax", "%ebx", "%ecx", "%edx", "%edi", -+ "%esi"); -+ -+ thinkpad_ec_invalidate(); -+ thinkpad_ec_unlock(); -+ -+ /* Don't let the next SMAPI access happen too quickly, -+ * may case problems. (We're hold smapi_mutex). 
*/ -+ msleep(50); -+ -+ if (outEBX) *outEBX = tmpEBX; -+ if (outECX) *outECX = tmpECX; -+ if (outEDX) *outEDX = tmpEDX; -+ if (outESI) *outESI = tmpESI; -+ if (outEDI) *outEDI = tmpEDI; -+ -+ /* Look up error code */ -+ rc = (tmpEAX>>8)&0xFF; -+ for (i = 0; smapi_retcode[i].rc != SMAPI_RETCODE_EOF && -+ smapi_retcode[i].rc != rc; ++i) {} -+ ret = smapi_retcode[i].ret; -+ if (msg) -+ *msg = smapi_retcode[i].msg; -+ -+ DPRINTK("req_out: AX=%x BX=%x CX=%x DX=%x DI=%x SI=%x r=%d", -+ tmpEAX, tmpEBX, tmpECX, tmpEDX, tmpEDI, tmpESI, ret); -+ if (ret) -+ TPRINTK(KERN_NOTICE, "SMAPI error: %s (func=%x)", -+ smapi_retcode[i].msg, inEBX); -+ -+ if (ret != -EBUSY) -+ return ret; -+ } -+ return ret; -+} -+ -+/* Convenience wrapper: discard output arguments */ -+static int smapi_write(u32 inEBX, u32 inECX, -+ u32 inEDI, u32 inESI, const char **msg) -+{ -+ return smapi_request(inEBX, inECX, inEDI, inESI, -+ NULL, NULL, NULL, NULL, NULL, msg); -+} -+ -+ -+/********************************************************************* -+ * Specific SMAPI services -+ * All of these functions return 0 upon success, and a negative errno -+ * constant (see smapi_retcode) on failure. -+ */ -+ -+enum thresh_type { -+ THRESH_STOP = 0, /* the code assumes this is 0 for brevity */ -+ THRESH_START -+}; -+#define THRESH_NAME(which) ((which == THRESH_START) ? "start" : "stop") -+ -+/** -+ * __get_real_thresh - read battery charge start/stop threshold from SMAPI -+ * @bat: battery number (0 or 1) -+ * @which: THRESH_START or THRESH_STOP -+ * @thresh: 1..99, 0=default 1..99, 0=default (pass this as-is to SMAPI) -+ * @outEDI: some additional state that needs to be preserved, meaning unknown -+ * @outESI: some additional state that needs to be preserved, meaning unknown -+ */ -+static int __get_real_thresh(int bat, enum thresh_type which, int *thresh, -+ u32 *outEDI, u32 *outESI) -+{ -+ u32 ebx = (which == THRESH_START) ? 
SMAPI_GET_THRESH_START -+ : SMAPI_GET_THRESH_STOP; -+ u32 ecx = (bat+1)<<8; -+ const char *msg; -+ int ret = smapi_request(ebx, ecx, 0, 0, NULL, -+ &ecx, NULL, outEDI, outESI, &msg); -+ if (ret) { -+ TPRINTK(KERN_NOTICE, "cannot get %s_thresh of bat=%d: %s", -+ THRESH_NAME(which), bat, msg); -+ return ret; -+ } -+ if (!(ecx&0x00000100)) { -+ TPRINTK(KERN_NOTICE, "cannot get %s_thresh of bat=%d: ecx=0%x", -+ THRESH_NAME(which), bat, ecx); -+ return -EIO; -+ } -+ if (thresh) -+ *thresh = ecx&0xFF; -+ return 0; -+} -+ -+/** -+ * get_real_thresh - read battery charge start/stop threshold from SMAPI -+ * @bat: battery number (0 or 1) -+ * @which: THRESH_START or THRESH_STOP -+ * @thresh: 1..99, 0=default (passes as-is to SMAPI) -+ */ -+static int get_real_thresh(int bat, enum thresh_type which, int *thresh) -+{ -+ return __get_real_thresh(bat, which, thresh, NULL, NULL); -+} -+ -+/** -+ * set_real_thresh - write battery start/top charge threshold to SMAPI -+ * @bat: battery number (0 or 1) -+ * @which: THRESH_START or THRESH_STOP -+ * @thresh: 1..99, 0=default (passes as-is to SMAPI) -+ */ -+static int set_real_thresh(int bat, enum thresh_type which, int thresh) -+{ -+ u32 ebx = (which == THRESH_START) ? 
SMAPI_SET_THRESH_START -+ : SMAPI_SET_THRESH_STOP; -+ u32 ecx = ((bat+1)<<8) + thresh; -+ u32 getDI, getSI; -+ const char *msg; -+ int ret; -+ -+ /* verify read before writing */ -+ ret = __get_real_thresh(bat, which, NULL, &getDI, &getSI); -+ if (ret) -+ return ret; -+ -+ ret = smapi_write(ebx, ecx, getDI, getSI, &msg); -+ if (ret) -+ TPRINTK(KERN_NOTICE, "set %s to %d for bat=%d failed: %s", -+ THRESH_NAME(which), thresh, bat, msg); -+ else -+ TPRINTK(KERN_INFO, "set %s to %d for bat=%d", -+ THRESH_NAME(which), thresh, bat); -+ return ret; -+} -+ -+/** -+ * __get_inhibit_charge_minutes - get inhibit charge period from SMAPI -+ * @bat: battery number (0 or 1) -+ * @minutes: period in minutes (1..65535 minutes, 0=disabled) -+ * @outECX: some additional state that needs to be preserved, meaning unknown -+ * Note that @minutes is the originally set value, it does not count down. -+ */ -+static int __get_inhibit_charge_minutes(int bat, int *minutes, u32 *outECX) -+{ -+ u32 ecx = (bat+1)<<8; -+ u32 esi; -+ const char *msg; -+ int ret = smapi_request(SMAPI_GET_INHIBIT_CHARGE, ecx, 0, 0, -+ NULL, &ecx, NULL, NULL, &esi, &msg); -+ if (ret) { -+ TPRINTK(KERN_NOTICE, "failed for bat=%d: %s", bat, msg); -+ return ret; -+ } -+ if (!(ecx&0x0100)) { -+ TPRINTK(KERN_NOTICE, "bad ecx=0x%x for bat=%d", ecx, bat); -+ return -EIO; -+ } -+ if (minutes) -+ *minutes = (ecx&0x0001)?esi:0; -+ if (outECX) -+ *outECX = ecx; -+ return 0; -+} -+ -+/** -+ * get_inhibit_charge_minutes - get inhibit charge period from SMAPI -+ * @bat: battery number (0 or 1) -+ * @minutes: period in minutes (1..65535 minutes, 0=disabled) -+ * Note that @minutes is the originally set value, it does not count down. 
-+ */ -+static int get_inhibit_charge_minutes(int bat, int *minutes) -+{ -+ return __get_inhibit_charge_minutes(bat, minutes, NULL); -+} -+ -+/** -+ * set_inhibit_charge_minutes - write inhibit charge period to SMAPI -+ * @bat: battery number (0 or 1) -+ * @minutes: period in minutes (1..65535 minutes, 0=disabled) -+ */ -+static int set_inhibit_charge_minutes(int bat, int minutes) -+{ -+ u32 ecx; -+ const char *msg; -+ int ret; -+ -+ /* verify read before writing */ -+ ret = __get_inhibit_charge_minutes(bat, NULL, &ecx); -+ if (ret) -+ return ret; -+ -+ ecx = ((bat+1)<<8) | (ecx&0x00FE) | (minutes > 0 ? 0x0001 : 0x0000); -+ if (minutes > 0xFFFF) -+ minutes = 0xFFFF; -+ ret = smapi_write(SMAPI_SET_INHIBIT_CHARGE, ecx, 0, minutes, &msg); -+ if (ret) -+ TPRINTK(KERN_NOTICE, -+ "set to %d failed for bat=%d: %s", minutes, bat, msg); -+ else -+ TPRINTK(KERN_INFO, "set to %d for bat=%d\n", minutes, bat); -+ return ret; -+} -+ -+ -+/** -+ * get_force_discharge - get status of forced discharging from SMAPI -+ * @bat: battery number (0 or 1) -+ * @enabled: 1 if forced discharged is enabled, 0 if not -+ */ -+static int get_force_discharge(int bat, int *enabled) -+{ -+ u32 ecx = (bat+1)<<8; -+ const char *msg; -+ int ret = smapi_request(SMAPI_GET_FORCE_DISCHARGE, ecx, 0, 0, -+ NULL, &ecx, NULL, NULL, NULL, &msg); -+ if (ret) { -+ TPRINTK(KERN_NOTICE, "failed for bat=%d: %s", bat, msg); -+ return ret; -+ } -+ *enabled = (!(ecx&0x00000100) && (ecx&0x00000001))?1:0; -+ return 0; -+} -+ -+/** -+ * set_force_discharge - write status of forced discharging to SMAPI -+ * @bat: battery number (0 or 1) -+ * @enabled: 1 if forced discharged is enabled, 0 if not -+ */ -+static int set_force_discharge(int bat, int enabled) -+{ -+ u32 ecx = (bat+1)<<8; -+ const char *msg; -+ int ret = smapi_request(SMAPI_GET_FORCE_DISCHARGE, ecx, 0, 0, -+ NULL, &ecx, NULL, NULL, NULL, &msg); -+ if (ret) { -+ TPRINTK(KERN_NOTICE, "get failed for bat=%d: %s", bat, msg); -+ return ret; -+ } -+ if 
(ecx&0x00000100) { -+ TPRINTK(KERN_NOTICE, "cannot force discharge bat=%d", bat); -+ return -EIO; -+ } -+ -+ ecx = ((bat+1)<<8) | (ecx&0x000000FA) | (enabled?0x00000001:0); -+ ret = smapi_write(SMAPI_SET_FORCE_DISCHARGE, ecx, 0, 0, &msg); -+ if (ret) -+ TPRINTK(KERN_NOTICE, "set to %d failed for bat=%d: %s", -+ enabled, bat, msg); -+ else -+ TPRINTK(KERN_INFO, "set to %d for bat=%d", enabled, bat); -+ return ret; -+} -+ -+ -+/********************************************************************* -+ * Wrappers to threshold-related SMAPI functions, which handle default -+ * thresholds and related quirks. -+ */ -+ -+/* Minimum, default and minimum difference for battery charging thresholds: */ -+#define MIN_THRESH_DELTA 4 /* Min delta between start and stop thresh */ -+#define MIN_THRESH_START 2 -+#define MAX_THRESH_START (100-MIN_THRESH_DELTA) -+#define MIN_THRESH_STOP (MIN_THRESH_START + MIN_THRESH_DELTA) -+#define MAX_THRESH_STOP 100 -+#define DEFAULT_THRESH_START MAX_THRESH_START -+#define DEFAULT_THRESH_STOP MAX_THRESH_STOP -+ -+/* The GUI of IBM's Battery Maximizer seems to show a start threshold that -+ * is 1 more than the value we set/get via SMAPI. Since the threshold is -+ * maintained across reboot, this can be confusing. So we kludge our -+ * interface for interoperability: */ -+#define BATMAX_FIX 1 -+ -+/* Get charge start/stop threshold (1..100), -+ * substituting default values if needed and applying BATMAT_FIX. */ -+static int get_thresh(int bat, enum thresh_type which, int *thresh) -+{ -+ int ret = get_real_thresh(bat, which, thresh); -+ if (ret) -+ return ret; -+ if (*thresh == 0) -+ *thresh = (which == THRESH_START) ? DEFAULT_THRESH_START -+ : DEFAULT_THRESH_STOP; -+ else if (which == THRESH_START) -+ *thresh += BATMAX_FIX; -+ return 0; -+} -+ -+ -+/* Set charge start/stop threshold (1..100), -+ * substituting default values if needed and applying BATMAT_FIX. 
*/ -+static int set_thresh(int bat, enum thresh_type which, int thresh) -+{ -+ if (which == THRESH_STOP && thresh == DEFAULT_THRESH_STOP) -+ thresh = 0; /* 100 is out of range, but default means 100 */ -+ if (which == THRESH_START) -+ thresh -= BATMAX_FIX; -+ return set_real_thresh(bat, which, thresh); -+} -+ -+/********************************************************************* -+ * ThinkPad embedded controller readout and basic functions -+ */ -+ -+/** -+ * read_tp_ec_row - read data row from the ThinkPad embedded controller -+ * @arg0: EC command code -+ * @bat: battery number, 0 or 1 -+ * @j: the byte value to be used for "junk" (unused) input/outputs -+ * @dataval: result vector -+ */ -+static int read_tp_ec_row(u8 arg0, int bat, u8 j, u8 *dataval) -+{ -+ int ret; -+ const struct thinkpad_ec_row args = { .mask = 0xFFFF, -+ .val = {arg0, j,j,j,j,j,j,j,j,j,j,j,j,j,j, (u8)bat} }; -+ struct thinkpad_ec_row data = { .mask = 0xFFFF }; -+ -+ ret = thinkpad_ec_lock(); -+ if (ret) -+ return ret; -+ ret = thinkpad_ec_read_row(&args, &data); -+ thinkpad_ec_unlock(); -+ memcpy(dataval, &data.val, TP_CONTROLLER_ROW_LEN); -+ return ret; -+} -+ -+/** -+ * power_device_present - check for presence of battery or AC power -+ * @bat: 0 for battery 0, 1 for battery 1, otherwise AC power -+ * Returns 1 if present, 0 if not present, negative if error. -+ */ -+static int power_device_present(int bat) -+{ -+ u8 row[TP_CONTROLLER_ROW_LEN]; -+ u8 test; -+ int ret = read_tp_ec_row(1, bat, 0, row); -+ if (ret) -+ return ret; -+ switch (bat) { -+ case 0: test = 0x40; break; /* battery 0 */ -+ case 1: test = 0x20; break; /* battery 1 */ -+ default: test = 0x80; /* AC power */ -+ } -+ return (row[0] & test) ? 1 : 0; -+} -+ -+/** -+ * bat_has_status - check if battery can report detailed status -+ * @bat: 0 for battery 0, 1 for battery 1 -+ * Returns 1 if yes, 0 if no, negative if error. 
-+ */ -+static int bat_has_status(int bat) -+{ -+ u8 row[TP_CONTROLLER_ROW_LEN]; -+ int ret = read_tp_ec_row(1, bat, 0, row); -+ if (ret) -+ return ret; -+ if ((row[0] & (bat?0x20:0x40)) == 0) /* no battery */ -+ return 0; -+ if ((row[1] & (0x60)) == 0) /* no status */ -+ return 0; -+ return 1; -+} -+ -+/** -+ * get_tp_ec_bat_16 - read a 16-bit value from EC battery status data -+ * @arg0: first argument to EC -+ * @off: offset in row returned from EC -+ * @bat: battery (0 or 1) -+ * @val: the 16-bit value obtained -+ * Returns nonzero on error. -+ */ -+static int get_tp_ec_bat_16(u8 arg0, int offset, int bat, u16 *val) -+{ -+ u8 row[TP_CONTROLLER_ROW_LEN]; -+ int ret; -+ if (bat_has_status(bat) != 1) -+ return -ENXIO; -+ ret = read_tp_ec_row(arg0, bat, 0, row); -+ if (ret) -+ return ret; -+ *val = *(u16 *)(row+offset); -+ return 0; -+} -+ -+/********************************************************************* -+ * sysfs attributes for batteries - -+ * definitions and helper functions -+ */ -+ -+/* A custom device attribute struct which holds a battery number */ -+struct bat_device_attribute { -+ struct device_attribute dev_attr; -+ int bat; -+}; -+ -+/** -+ * attr_get_bat - get the battery to which the attribute belongs -+ */ -+static int attr_get_bat(struct device_attribute *attr) -+{ -+ return container_of(attr, struct bat_device_attribute, dev_attr)->bat; -+} -+ -+/** -+ * show_tp_ec_bat_u16 - show an unsigned 16-bit battery attribute -+ * @arg0: specified 1st argument of EC raw to read -+ * @offset: byte offset in EC raw data -+ * @mul: correction factor to multiply by -+ * @na_msg: string to output is value not available (0xFFFFFFFF) -+ * @attr: battery attribute -+ * @buf: output buffer -+ * The 16-bit value is read from the EC, treated as unsigned, -+ * transformed as x->mul*x, and printed to the buffer. -+ * If the value is 0xFFFFFFFF and na_msg!=%NULL, na_msg is printed instead. 
-+ */ -+static ssize_t show_tp_ec_bat_u16(u8 arg0, int offset, int mul, -+ const char *na_msg, -+ struct device_attribute *attr, char *buf) -+{ -+ u16 val; -+ int ret = get_tp_ec_bat_16(arg0, offset, attr_get_bat(attr), &val); -+ if (ret) -+ return ret; -+ if (na_msg && val == 0xFFFF) -+ return sprintf(buf, "%s\n", na_msg); -+ else -+ return sprintf(buf, "%u\n", mul*(unsigned int)val); -+} -+ -+/** -+ * show_tp_ec_bat_s16 - show an signed 16-bit battery attribute -+ * @arg0: specified 1st argument of EC raw to read -+ * @offset: byte offset in EC raw data -+ * @mul: correction factor to multiply by -+ * @add: correction term to add after multiplication -+ * @attr: battery attribute -+ * @buf: output buffer -+ * The 16-bit value is read from the EC, treated as signed, -+ * transformed as x->mul*x+add, and printed to the buffer. -+ */ -+static ssize_t show_tp_ec_bat_s16(u8 arg0, int offset, int mul, int add, -+ struct device_attribute *attr, char *buf) -+{ -+ u16 val; -+ int ret = get_tp_ec_bat_16(arg0, offset, attr_get_bat(attr), &val); -+ if (ret) -+ return ret; -+ return sprintf(buf, "%d\n", mul*(s16)val+add); -+} -+ -+/** -+ * show_tp_ec_bat_str - show a string from EC battery status data -+ * @arg0: specified 1st argument of EC raw to read -+ * @offset: byte offset in EC raw data -+ * @maxlen: maximum string length -+ * @attr: battery attribute -+ * @buf: output buffer -+ */ -+static ssize_t show_tp_ec_bat_str(u8 arg0, int offset, int maxlen, -+ struct device_attribute *attr, char *buf) -+{ -+ int bat = attr_get_bat(attr); -+ u8 row[TP_CONTROLLER_ROW_LEN]; -+ int ret; -+ if (bat_has_status(bat) != 1) -+ return -ENXIO; -+ ret = read_tp_ec_row(arg0, bat, 0, row); -+ if (ret) -+ return ret; -+ strncpy(buf, (char *)row+offset, maxlen); -+ buf[maxlen] = 0; -+ strcat(buf, "\n"); -+ return strlen(buf); -+} -+ -+/** -+ * show_tp_ec_bat_power - show a power readout from EC battery status data -+ * @arg0: specified 1st argument of EC raw to read -+ * @offV: byte offset of 
voltage in EC raw data -+ * @offI: byte offset of current in EC raw data -+ * @attr: battery attribute -+ * @buf: output buffer -+ * Computes the power as current*voltage from the two given readout offsets. -+ */ -+static ssize_t show_tp_ec_bat_power(u8 arg0, int offV, int offI, -+ struct device_attribute *attr, char *buf) -+{ -+ u8 row[TP_CONTROLLER_ROW_LEN]; -+ int milliamp, millivolt, ret; -+ int bat = attr_get_bat(attr); -+ if (bat_has_status(bat) != 1) -+ return -ENXIO; -+ ret = read_tp_ec_row(1, bat, 0, row); -+ if (ret) -+ return ret; -+ millivolt = *(u16 *)(row+offV); -+ milliamp = *(s16 *)(row+offI); -+ return sprintf(buf, "%d\n", milliamp*millivolt/1000); /* units: mW */ -+} -+ -+/** -+ * show_tp_ec_bat_date - decode and show a date from EC battery status data -+ * @arg0: specified 1st argument of EC raw to read -+ * @offset: byte offset in EC raw data -+ * @attr: battery attribute -+ * @buf: output buffer -+ */ -+static ssize_t show_tp_ec_bat_date(u8 arg0, int offset, -+ struct device_attribute *attr, char *buf) -+{ -+ u8 row[TP_CONTROLLER_ROW_LEN]; -+ u16 v; -+ int ret; -+ int day, month, year; -+ int bat = attr_get_bat(attr); -+ if (bat_has_status(bat) != 1) -+ return -ENXIO; -+ ret = read_tp_ec_row(arg0, bat, 0, row); -+ if (ret) -+ return ret; -+ -+ /* Decode bit-packed: v = day | (month<<5) | ((year-1980)<<9) */ -+ v = *(u16 *)(row+offset); -+ day = v & 0x1F; -+ month = (v >> 5) & 0xF; -+ year = (v >> 9) + 1980; -+ -+ return sprintf(buf, "%04d-%02d-%02d\n", year, month, day); -+} -+ -+ -+/********************************************************************* -+ * sysfs attribute I/O for batteries - -+ * the actual attribute show/store functions -+ */ -+ -+static ssize_t show_battery_start_charge_thresh(struct device *dev, -+ struct device_attribute *attr, char *buf) -+{ -+ int thresh; -+ int bat = attr_get_bat(attr); -+ int ret = get_thresh(bat, THRESH_START, &thresh); -+ if (ret) -+ return ret; -+ return sprintf(buf, "%d\n", thresh); /* units: 
percent */ -+} -+ -+static ssize_t show_battery_stop_charge_thresh(struct device *dev, -+ struct device_attribute *attr, char *buf) -+{ -+ int thresh; -+ int bat = attr_get_bat(attr); -+ int ret = get_thresh(bat, THRESH_STOP, &thresh); -+ if (ret) -+ return ret; -+ return sprintf(buf, "%d\n", thresh); /* units: percent */ -+} -+ -+/** -+ * store_battery_start_charge_thresh - store battery_start_charge_thresh attr -+ * Since this is a kernel<->user interface, we ensure a valid state for -+ * the hardware. We do this by clamping the requested threshold to the -+ * valid range and, if necessary, moving the other threshold so that -+ * it's MIN_THRESH_DELTA away from this one. -+ */ -+static ssize_t store_battery_start_charge_thresh(struct device *dev, -+ struct device_attribute *attr, const char *buf, size_t count) -+{ -+ int thresh, other_thresh, ret; -+ int bat = attr_get_bat(attr); -+ -+ if (sscanf(buf, "%d", &thresh) != 1 || thresh < 1 || thresh > 100) -+ return -EINVAL; -+ -+ if (thresh < MIN_THRESH_START) /* clamp up to MIN_THRESH_START */ -+ thresh = MIN_THRESH_START; -+ if (thresh > MAX_THRESH_START) /* clamp down to MAX_THRESH_START */ -+ thresh = MAX_THRESH_START; -+ -+ down(&smapi_mutex); -+ ret = get_thresh(bat, THRESH_STOP, &other_thresh); -+ if (ret != -EOPNOTSUPP && ret != -ENXIO) { -+ if (ret) /* other threshold is set? */ -+ goto out; -+ ret = get_real_thresh(bat, THRESH_START, NULL); -+ if (ret) /* this threshold is set? */ -+ goto out; -+ if (other_thresh < thresh+MIN_THRESH_DELTA) { -+ /* move other thresh to keep it above this one */ -+ ret = set_thresh(bat, THRESH_STOP, -+ thresh+MIN_THRESH_DELTA); -+ if (ret) -+ goto out; -+ } -+ } -+ ret = set_thresh(bat, THRESH_START, thresh); -+out: -+ up(&smapi_mutex); -+ return count; -+ -+} -+ -+/** -+ * store_battery_stop_charge_thresh - store battery_stop_charge_thresh attr -+ * Since this is a kernel<->user interface, we ensure a valid state for -+ * the hardware. 
We do this by clamping the requested threshold to the -+ * valid range and, if necessary, moving the other threshold so that -+ * it's MIN_THRESH_DELTA away from this one. -+ */ -+static ssize_t store_battery_stop_charge_thresh(struct device *dev, -+ struct device_attribute *attr, const char *buf, size_t count) -+{ -+ int thresh, other_thresh, ret; -+ int bat = attr_get_bat(attr); -+ -+ if (sscanf(buf, "%d", &thresh) != 1 || thresh < 1 || thresh > 100) -+ return -EINVAL; -+ -+ if (thresh < MIN_THRESH_STOP) /* clamp up to MIN_THRESH_STOP */ -+ thresh = MIN_THRESH_STOP; -+ -+ down(&smapi_mutex); -+ ret = get_thresh(bat, THRESH_START, &other_thresh); -+ if (ret != -EOPNOTSUPP && ret != -ENXIO) { /* other threshold exists? */ -+ if (ret) -+ goto out; -+ /* this threshold exists? */ -+ ret = get_real_thresh(bat, THRESH_STOP, NULL); -+ if (ret) -+ goto out; -+ if (other_thresh >= thresh-MIN_THRESH_DELTA) { -+ /* move other thresh to be below this one */ -+ ret = set_thresh(bat, THRESH_START, -+ thresh-MIN_THRESH_DELTA); -+ if (ret) -+ goto out; -+ } -+ } -+ ret = set_thresh(bat, THRESH_STOP, thresh); -+out: -+ up(&smapi_mutex); -+ return count; -+} -+ -+static ssize_t show_battery_inhibit_charge_minutes(struct device *dev, -+ struct device_attribute *attr, char *buf) -+{ -+ int minutes; -+ int bat = attr_get_bat(attr); -+ int ret = get_inhibit_charge_minutes(bat, &minutes); -+ if (ret) -+ return ret; -+ return sprintf(buf, "%d\n", minutes); /* units: minutes */ -+} -+ -+static ssize_t store_battery_inhibit_charge_minutes(struct device *dev, -+ struct device_attribute *attr, -+ const char *buf, size_t count) -+{ -+ int ret; -+ int minutes; -+ int bat = attr_get_bat(attr); -+ if (sscanf(buf, "%d", &minutes) != 1 || minutes < 0) { -+ TPRINTK(KERN_ERR, "inhibit_charge_minutes: " -+ "must be a non-negative integer"); -+ return -EINVAL; -+ } -+ ret = set_inhibit_charge_minutes(bat, minutes); -+ if (ret) -+ return ret; -+ return count; -+} -+ -+static ssize_t 
show_battery_force_discharge(struct device *dev, -+ struct device_attribute *attr, char *buf) -+{ -+ int enabled; -+ int bat = attr_get_bat(attr); -+ int ret = get_force_discharge(bat, &enabled); -+ if (ret) -+ return ret; -+ return sprintf(buf, "%d\n", enabled); /* type: boolean */ -+} -+ -+static ssize_t store_battery_force_discharge(struct device *dev, -+ struct device_attribute *attr, const char *buf, size_t count) -+{ -+ int ret; -+ int enabled; -+ int bat = attr_get_bat(attr); -+ if (sscanf(buf, "%d", &enabled) != 1 || enabled < 0 || enabled > 1) -+ return -EINVAL; -+ ret = set_force_discharge(bat, enabled); -+ if (ret) -+ return ret; -+ return count; -+} -+ -+static ssize_t show_battery_installed( -+ struct device *dev, struct device_attribute *attr, char *buf) -+{ -+ int bat = attr_get_bat(attr); -+ int ret = power_device_present(bat); -+ if (ret < 0) -+ return ret; -+ return sprintf(buf, "%d\n", ret); /* type: boolean */ -+} -+ -+static ssize_t show_battery_state( -+ struct device *dev, struct device_attribute *attr, char *buf) -+{ -+ u8 row[TP_CONTROLLER_ROW_LEN]; -+ const char *txt; -+ int ret; -+ int bat = attr_get_bat(attr); -+ if (bat_has_status(bat) != 1) -+ return sprintf(buf, "none\n"); -+ ret = read_tp_ec_row(1, bat, 0, row); -+ if (ret) -+ return ret; -+ switch (row[1] & 0xf0) { -+ case 0xc0: txt = "idle"; break; -+ case 0xd0: txt = "discharging"; break; -+ case 0xe0: txt = "charging"; break; -+ default: return sprintf(buf, "unknown (0x%x)\n", row[1]); -+ } -+ return sprintf(buf, "%s\n", txt); /* type: string from fixed set */ -+} -+ -+static ssize_t show_battery_manufacturer( -+ struct device *dev, struct device_attribute *attr, char *buf) -+{ -+ /* type: string. SBS spec v1.1 p34: ManufacturerName() */ -+ return show_tp_ec_bat_str(4, 2, TP_CONTROLLER_ROW_LEN-2, attr, buf); -+} -+ -+static ssize_t show_battery_model( -+ struct device *dev, struct device_attribute *attr, char *buf) -+{ -+ /* type: string. 
SBS spec v1.1 p34: DeviceName() */ -+ return show_tp_ec_bat_str(5, 2, TP_CONTROLLER_ROW_LEN-2, attr, buf); -+} -+ -+static ssize_t show_battery_barcoding( -+ struct device *dev, struct device_attribute *attr, char *buf) -+{ -+ /* type: string */ -+ return show_tp_ec_bat_str(7, 2, TP_CONTROLLER_ROW_LEN-2, attr, buf); -+} -+ -+static ssize_t show_battery_chemistry( -+ struct device *dev, struct device_attribute *attr, char *buf) -+{ -+ /* type: string. SBS spec v1.1 p34-35: DeviceChemistry() */ -+ return show_tp_ec_bat_str(6, 2, 5, attr, buf); -+} -+ -+static ssize_t show_battery_voltage( -+ struct device *dev, struct device_attribute *attr, char *buf) -+{ -+ /* units: mV. SBS spec v1.1 p24: Voltage() */ -+ return show_tp_ec_bat_u16(1, 6, 1, NULL, attr, buf); -+} -+ -+static ssize_t show_battery_design_voltage( -+ struct device *dev, struct device_attribute *attr, char *buf) -+{ -+ /* units: mV. SBS spec v1.1 p32: DesignVoltage() */ -+ return show_tp_ec_bat_u16(3, 4, 1, NULL, attr, buf); -+} -+ -+static ssize_t show_battery_charging_max_voltage( -+ struct device *dev, struct device_attribute *attr, char *buf) -+{ -+ /* units: mV. 
SBS spec v1.1 p37,39: ChargingVoltage() */ -+ return show_tp_ec_bat_u16(9, 8, 1, NULL, attr, buf); -+} -+ -+static ssize_t show_battery_group0_voltage( -+ struct device *dev, struct device_attribute *attr, char *buf) -+{ -+ /* units: mV */ -+ return show_tp_ec_bat_u16(0xA, 12, 1, NULL, attr, buf); -+} -+ -+static ssize_t show_battery_group1_voltage( -+ struct device *dev, struct device_attribute *attr, char *buf) -+{ -+ /* units: mV */ -+ return show_tp_ec_bat_u16(0xA, 10, 1, NULL, attr, buf); -+} -+ -+static ssize_t show_battery_group2_voltage( -+ struct device *dev, struct device_attribute *attr, char *buf) -+{ -+ /* units: mV */ -+ return show_tp_ec_bat_u16(0xA, 8, 1, NULL, attr, buf); -+} -+ -+static ssize_t show_battery_group3_voltage( -+ struct device *dev, struct device_attribute *attr, char *buf) -+{ -+ /* units: mV */ -+ return show_tp_ec_bat_u16(0xA, 6, 1, NULL, attr, buf); -+} -+ -+static ssize_t show_battery_current_now( -+ struct device *dev, struct device_attribute *attr, char *buf) -+{ -+ /* units: mA. SBS spec v1.1 p24: Current() */ -+ return show_tp_ec_bat_s16(1, 8, 1, 0, attr, buf); -+} -+ -+static ssize_t show_battery_current_avg( -+ struct device *dev, struct device_attribute *attr, char *buf) -+{ -+ /* units: mA. SBS spec v1.1 p24: AverageCurrent() */ -+ return show_tp_ec_bat_s16(1, 10, 1, 0, attr, buf); -+} -+ -+static ssize_t show_battery_charging_max_current( -+ struct device *dev, struct device_attribute *attr, char *buf) -+{ -+ /* units: mA. SBS spec v1.1 p36,38: ChargingCurrent() */ -+ return show_tp_ec_bat_s16(9, 6, 1, 0, attr, buf); -+} -+ -+static ssize_t show_battery_power_now( -+ struct device *dev, struct device_attribute *attr, char *buf) -+{ -+ /* units: mW. SBS spec v1.1: Voltage()*Current() */ -+ return show_tp_ec_bat_power(1, 6, 8, attr, buf); -+} -+ -+static ssize_t show_battery_power_avg( -+ struct device *dev, struct device_attribute *attr, char *buf) -+{ -+ /* units: mW. 
SBS spec v1.1: Voltage()*AverageCurrent() */ -+ return show_tp_ec_bat_power(1, 6, 10, attr, buf); -+} -+ -+static ssize_t show_battery_remaining_percent( -+ struct device *dev, struct device_attribute *attr, char *buf) -+{ -+ /* units: percent. SBS spec v1.1 p25: RelativeStateOfCharge() */ -+ return show_tp_ec_bat_u16(1, 12, 1, NULL, attr, buf); -+} -+ -+static ssize_t show_battery_remaining_percent_error( -+ struct device *dev, struct device_attribute *attr, char *buf) -+{ -+ /* units: percent. SBS spec v1.1 p25: MaxError() */ -+ return show_tp_ec_bat_u16(9, 4, 1, NULL, attr, buf); -+} -+ -+static ssize_t show_battery_remaining_charging_time( -+ struct device *dev, struct device_attribute *attr, char *buf) -+{ -+ /* units: minutes. SBS spec v1.1 p27: AverageTimeToFull() */ -+ return show_tp_ec_bat_u16(2, 8, 1, "not_charging", attr, buf); -+} -+ -+static ssize_t show_battery_remaining_running_time( -+ struct device *dev, struct device_attribute *attr, char *buf) -+{ -+ /* units: minutes. SBS spec v1.1 p27: RunTimeToEmpty() */ -+ return show_tp_ec_bat_u16(2, 6, 1, "not_discharging", attr, buf); -+} -+ -+static ssize_t show_battery_remaining_running_time_now( -+ struct device *dev, struct device_attribute *attr, char *buf) -+{ -+ /* units: minutes. SBS spec v1.1 p27: RunTimeToEmpty() */ -+ return show_tp_ec_bat_u16(2, 4, 1, "not_discharging", attr, buf); -+} -+ -+static ssize_t show_battery_remaining_capacity( -+ struct device *dev, struct device_attribute *attr, char *buf) -+{ -+ /* units: mWh. SBS spec v1.1 p26. */ -+ return show_tp_ec_bat_u16(1, 14, 10, "", attr, buf); -+} -+ -+static ssize_t show_battery_last_full_capacity( -+ struct device *dev, struct device_attribute *attr, char *buf) -+{ -+ /* units: mWh. SBS spec v1.1 p26: FullChargeCapacity() */ -+ return show_tp_ec_bat_u16(2, 2, 10, "", attr, buf); -+} -+ -+static ssize_t show_battery_design_capacity( -+ struct device *dev, struct device_attribute *attr, char *buf) -+{ -+ /* units: mWh. 
SBS spec v1.1 p32: DesignCapacity() */ -+ return show_tp_ec_bat_u16(3, 2, 10, "", attr, buf); -+} -+ -+static ssize_t show_battery_cycle_count( -+ struct device *dev, struct device_attribute *attr, char *buf) -+{ -+ /* units: ordinal. SBS spec v1.1 p32: CycleCount() */ -+ return show_tp_ec_bat_u16(2, 12, 1, "", attr, buf); -+} -+ -+static ssize_t show_battery_temperature( -+ struct device *dev, struct device_attribute *attr, char *buf) -+{ -+ /* units: millicelsius. SBS spec v1.1: Temperature()*10 */ -+ return show_tp_ec_bat_s16(1, 4, 100, -273100, attr, buf); -+} -+ -+static ssize_t show_battery_serial( -+ struct device *dev, struct device_attribute *attr, char *buf) -+{ -+ /* type: int. SBS spec v1.1 p34: SerialNumber() */ -+ return show_tp_ec_bat_u16(3, 10, 1, "", attr, buf); -+} -+ -+static ssize_t show_battery_manufacture_date( -+ struct device *dev, struct device_attribute *attr, char *buf) -+{ -+ /* type: YYYY-MM-DD. SBS spec v1.1 p34: ManufactureDate() */ -+ return show_tp_ec_bat_date(3, 8, attr, buf); -+} -+ -+static ssize_t show_battery_first_use_date( -+ struct device *dev, struct device_attribute *attr, char *buf) -+{ -+ /* type: YYYY-MM-DD */ -+ return show_tp_ec_bat_date(8, 2, attr, buf); -+} -+ -+/** -+ * show_battery_dump - show the battery's dump attribute -+ * The dump attribute gives a hex dump of all EC readouts related to a -+ * battery. Some of the enumerated values don't really exist (i.e., the -+ * EC function just leaves them untouched); we use a kludge to detect and -+ * denote these. 
-+ */ -+#define MIN_DUMP_ARG0 0x00 -+#define MAX_DUMP_ARG0 0x0a /* 0x0b is useful too but hangs old EC firmware */ -+static ssize_t show_battery_dump( -+ struct device *dev, struct device_attribute *attr, char *buf) -+{ -+ int i; -+ char *p = buf; -+ int bat = attr_get_bat(attr); -+ u8 arg0; /* first argument to EC */ -+ u8 rowa[TP_CONTROLLER_ROW_LEN], -+ rowb[TP_CONTROLLER_ROW_LEN]; -+ const u8 junka = 0xAA, -+ junkb = 0x55; /* junk values for testing changes */ -+ int ret; -+ -+ for (arg0 = MIN_DUMP_ARG0; arg0 <= MAX_DUMP_ARG0; ++arg0) { -+ if ((p-buf) > PAGE_SIZE-TP_CONTROLLER_ROW_LEN*5) -+ return -ENOMEM; /* don't overflow sysfs buf */ -+ /* Read raw twice with different junk values, -+ * to detect unused output bytes which are left unchaged: */ -+ ret = read_tp_ec_row(arg0, bat, junka, rowa); -+ if (ret) -+ return ret; -+ ret = read_tp_ec_row(arg0, bat, junkb, rowb); -+ if (ret) -+ return ret; -+ for (i = 0; i < TP_CONTROLLER_ROW_LEN; i++) { -+ if (rowa[i] == junka && rowb[i] == junkb) -+ p += sprintf(p, "-- "); /* unused by EC */ -+ else -+ p += sprintf(p, "%02x ", rowa[i]); -+ } -+ p += sprintf(p, "\n"); -+ } -+ return p-buf; -+} -+ -+ -+/********************************************************************* -+ * sysfs attribute I/O, other than batteries -+ */ -+ -+static ssize_t show_ac_connected( -+ struct device *dev, struct device_attribute *attr, char *buf) -+{ -+ int ret = power_device_present(0xFF); -+ if (ret < 0) -+ return ret; -+ return sprintf(buf, "%d\n", ret); /* type: boolean */ -+} -+ -+/********************************************************************* -+ * The the "smapi_request" sysfs attribute executes a raw SMAPI call. -+ * You write to make a request and read to get the result. The state -+ * is saved globally rather than per fd (sysfs limitation), so -+ * simultaenous requests may get each other's results! So this is for -+ * development and debugging only. 
-+ */ -+#define MAX_SMAPI_ATTR_ANSWER_LEN 128 -+static char smapi_attr_answer[MAX_SMAPI_ATTR_ANSWER_LEN] = ""; -+ -+static ssize_t show_smapi_request(struct device *dev, -+ struct device_attribute *attr, char *buf) -+{ -+ int ret = snprintf(buf, PAGE_SIZE, "%s", smapi_attr_answer); -+ smapi_attr_answer[0] = '\0'; -+ return ret; -+} -+ -+static ssize_t store_smapi_request(struct device *dev, -+ struct device_attribute *attr, -+ const char *buf, size_t count) -+{ -+ unsigned int inEBX, inECX, inEDI, inESI; -+ u32 outEBX, outECX, outEDX, outEDI, outESI; -+ const char *msg; -+ int ret; -+ if (sscanf(buf, "%x %x %x %x", &inEBX, &inECX, &inEDI, &inESI) != 4) { -+ smapi_attr_answer[0] = '\0'; -+ return -EINVAL; -+ } -+ ret = smapi_request( -+ inEBX, inECX, inEDI, inESI, -+ &outEBX, &outECX, &outEDX, &outEDI, &outESI, &msg); -+ snprintf(smapi_attr_answer, MAX_SMAPI_ATTR_ANSWER_LEN, -+ "%x %x %x %x %x %d '%s'\n", -+ (unsigned int)outEBX, (unsigned int)outECX, -+ (unsigned int)outEDX, (unsigned int)outEDI, -+ (unsigned int)outESI, ret, msg); -+ if (ret) -+ return ret; -+ else -+ return count; -+} -+ -+/********************************************************************* -+ * Power management: the embedded controller forgets the battery -+ * thresholds when the system is suspended to disk and unplugged from -+ * AC and battery, so we restore it upon resume. 
-+ */ -+ -+static int saved_threshs[4] = {-1, -1, -1, -1}; /* -1 = don't know */ -+ -+static int tp_suspend(struct platform_device *dev, pm_message_t state) -+{ -+ int restore = (state.event == PM_EVENT_HIBERNATE || -+ state.event == PM_EVENT_FREEZE); -+ if (!restore || get_real_thresh(0, THRESH_STOP , &saved_threshs[0])) -+ saved_threshs[0] = -1; -+ if (!restore || get_real_thresh(0, THRESH_START, &saved_threshs[1])) -+ saved_threshs[1] = -1; -+ if (!restore || get_real_thresh(1, THRESH_STOP , &saved_threshs[2])) -+ saved_threshs[2] = -1; -+ if (!restore || get_real_thresh(1, THRESH_START, &saved_threshs[3])) -+ saved_threshs[3] = -1; -+ DPRINTK("suspend saved: %d %d %d %d", saved_threshs[0], -+ saved_threshs[1], saved_threshs[2], saved_threshs[3]); -+ return 0; -+} -+ -+static int tp_resume(struct platform_device *dev) -+{ -+ DPRINTK("resume restoring: %d %d %d %d", saved_threshs[0], -+ saved_threshs[1], saved_threshs[2], saved_threshs[3]); -+ if (saved_threshs[0] >= 0) -+ set_real_thresh(0, THRESH_STOP , saved_threshs[0]); -+ if (saved_threshs[1] >= 0) -+ set_real_thresh(0, THRESH_START, saved_threshs[1]); -+ if (saved_threshs[2] >= 0) -+ set_real_thresh(1, THRESH_STOP , saved_threshs[2]); -+ if (saved_threshs[3] >= 0) -+ set_real_thresh(1, THRESH_START, saved_threshs[3]); -+ return 0; -+} -+ -+ -+/********************************************************************* -+ * Driver model -+ */ -+ -+static struct platform_driver tp_driver = { -+ .suspend = tp_suspend, -+ .resume = tp_resume, -+ .driver = { -+ .name = "smapi", -+ .owner = THIS_MODULE -+ }, -+}; -+ -+ -+/********************************************************************* -+ * Sysfs device model -+ */ -+ -+/* Attributes in /sys/devices/platform/smapi/ */ -+ -+static DEVICE_ATTR(ac_connected, 0444, show_ac_connected, NULL); -+static DEVICE_ATTR(smapi_request, 0600, show_smapi_request, -+ store_smapi_request); -+ -+static struct attribute *tp_root_attributes[] = { -+ &dev_attr_ac_connected.attr, -+ 
&dev_attr_smapi_request.attr, -+ NULL -+}; -+static struct attribute_group tp_root_attribute_group = { -+ .attrs = tp_root_attributes -+}; -+ -+/* Attributes under /sys/devices/platform/smapi/BAT{0,1}/ : -+ * Every attribute needs to be defined (i.e., statically allocated) for -+ * each battery, and then referenced in the attribute list of each battery. -+ * We use preprocessor voodoo to avoid duplicating the list of attributes 4 -+ * times. The preprocessor output is just normal sysfs attributes code. -+ */ -+ -+/** -+ * FOREACH_BAT_ATTR - invoke the given macros on all our battery attributes -+ * @_BAT: battery number (0 or 1) -+ * @_ATTR_RW: macro to invoke for each read/write attribute -+ * @_ATTR_R: macro to invoke for each read-only attribute -+ */ -+#define FOREACH_BAT_ATTR(_BAT, _ATTR_RW, _ATTR_R) \ -+ _ATTR_RW(_BAT, start_charge_thresh) \ -+ _ATTR_RW(_BAT, stop_charge_thresh) \ -+ _ATTR_RW(_BAT, inhibit_charge_minutes) \ -+ _ATTR_RW(_BAT, force_discharge) \ -+ _ATTR_R(_BAT, installed) \ -+ _ATTR_R(_BAT, state) \ -+ _ATTR_R(_BAT, manufacturer) \ -+ _ATTR_R(_BAT, model) \ -+ _ATTR_R(_BAT, barcoding) \ -+ _ATTR_R(_BAT, chemistry) \ -+ _ATTR_R(_BAT, voltage) \ -+ _ATTR_R(_BAT, group0_voltage) \ -+ _ATTR_R(_BAT, group1_voltage) \ -+ _ATTR_R(_BAT, group2_voltage) \ -+ _ATTR_R(_BAT, group3_voltage) \ -+ _ATTR_R(_BAT, current_now) \ -+ _ATTR_R(_BAT, current_avg) \ -+ _ATTR_R(_BAT, charging_max_current) \ -+ _ATTR_R(_BAT, power_now) \ -+ _ATTR_R(_BAT, power_avg) \ -+ _ATTR_R(_BAT, remaining_percent) \ -+ _ATTR_R(_BAT, remaining_percent_error) \ -+ _ATTR_R(_BAT, remaining_charging_time) \ -+ _ATTR_R(_BAT, remaining_running_time) \ -+ _ATTR_R(_BAT, remaining_running_time_now) \ -+ _ATTR_R(_BAT, remaining_capacity) \ -+ _ATTR_R(_BAT, last_full_capacity) \ -+ _ATTR_R(_BAT, design_voltage) \ -+ _ATTR_R(_BAT, charging_max_voltage) \ -+ _ATTR_R(_BAT, design_capacity) \ -+ _ATTR_R(_BAT, cycle_count) \ -+ _ATTR_R(_BAT, temperature) \ -+ _ATTR_R(_BAT, serial) \ -+ 
_ATTR_R(_BAT, manufacture_date) \ -+ _ATTR_R(_BAT, first_use_date) \ -+ _ATTR_R(_BAT, dump) -+ -+/* Define several macros we will feed into FOREACH_BAT_ATTR: */ -+ -+#define DEFINE_BAT_ATTR_RW(_BAT,_NAME) \ -+ static struct bat_device_attribute dev_attr_##_NAME##_##_BAT = { \ -+ .dev_attr = __ATTR(_NAME, 0644, show_battery_##_NAME, \ -+ store_battery_##_NAME), \ -+ .bat = _BAT \ -+ }; -+ -+#define DEFINE_BAT_ATTR_R(_BAT,_NAME) \ -+ static struct bat_device_attribute dev_attr_##_NAME##_##_BAT = { \ -+ .dev_attr = __ATTR(_NAME, 0644, show_battery_##_NAME, 0), \ -+ .bat = _BAT \ -+ }; -+ -+#define REF_BAT_ATTR(_BAT,_NAME) \ -+ &dev_attr_##_NAME##_##_BAT.dev_attr.attr, -+ -+/* This provide all attributes for one battery: */ -+ -+#define PROVIDE_BAT_ATTRS(_BAT) \ -+ FOREACH_BAT_ATTR(_BAT, DEFINE_BAT_ATTR_RW, DEFINE_BAT_ATTR_R) \ -+ static struct attribute *tp_bat##_BAT##_attributes[] = { \ -+ FOREACH_BAT_ATTR(_BAT, REF_BAT_ATTR, REF_BAT_ATTR) \ -+ NULL \ -+ }; \ -+ static struct attribute_group tp_bat##_BAT##_attribute_group = { \ -+ .name = "BAT" #_BAT, \ -+ .attrs = tp_bat##_BAT##_attributes \ -+ }; -+ -+/* Finally genereate the attributes: */ -+ -+PROVIDE_BAT_ATTRS(0) -+PROVIDE_BAT_ATTRS(1) -+ -+/* List of attribute groups */ -+ -+static struct attribute_group *attr_groups[] = { -+ &tp_root_attribute_group, -+ &tp_bat0_attribute_group, -+ &tp_bat1_attribute_group, -+ NULL -+}; -+ -+ -+/********************************************************************* -+ * Init and cleanup -+ */ -+ -+static struct attribute_group **next_attr_group; /* next to register */ -+ -+static int __init tp_init(void) -+{ -+ int ret; -+ printk(KERN_INFO "tp_smapi " TP_VERSION " loading...\n"); -+ -+ ret = find_smapi_port(); -+ if (ret < 0) -+ goto err; -+ else -+ smapi_port = ret; -+ -+ if (!request_region(smapi_port, 1, "smapi")) { -+ printk(KERN_ERR "tp_smapi cannot claim port 0x%x\n", -+ smapi_port); -+ ret = -ENXIO; -+ goto err; -+ } -+ -+ if (!request_region(SMAPI_PORT2, 1, "smapi")) { 
-+ printk(KERN_ERR "tp_smapi cannot claim port 0x%x\n", -+ SMAPI_PORT2); -+ ret = -ENXIO; -+ goto err_port1; -+ } -+ -+ ret = platform_driver_register(&tp_driver); -+ if (ret) -+ goto err_port2; -+ -+ pdev = platform_device_alloc("smapi", -1); -+ if (!pdev) { -+ ret = -ENOMEM; -+ goto err_driver; -+ } -+ -+ ret = platform_device_add(pdev); -+ if (ret) -+ goto err_device_free; -+ -+ for (next_attr_group = attr_groups; *next_attr_group; -+ ++next_attr_group) { -+ ret = sysfs_create_group(&pdev->dev.kobj, *next_attr_group); -+ if (ret) -+ goto err_attr; -+ } -+ -+ printk(KERN_INFO "tp_smapi successfully loaded (smapi_port=0x%x).\n", -+ smapi_port); -+ return 0; -+ -+err_attr: -+ while (--next_attr_group >= attr_groups) -+ sysfs_remove_group(&pdev->dev.kobj, *next_attr_group); -+ platform_device_unregister(pdev); -+err_device_free: -+ platform_device_put(pdev); -+err_driver: -+ platform_driver_unregister(&tp_driver); -+err_port2: -+ release_region(SMAPI_PORT2, 1); -+err_port1: -+ release_region(smapi_port, 1); -+err: -+ printk(KERN_ERR "tp_smapi init failed (ret=%d)!\n", ret); -+ return ret; -+} -+ -+static void __exit tp_exit(void) -+{ -+ while (next_attr_group && --next_attr_group >= attr_groups) -+ sysfs_remove_group(&pdev->dev.kobj, *next_attr_group); -+ platform_device_unregister(pdev); -+ platform_driver_unregister(&tp_driver); -+ release_region(SMAPI_PORT2, 1); -+ if (smapi_port) -+ release_region(smapi_port, 1); -+ -+ printk(KERN_INFO "tp_smapi unloaded.\n"); -+} -+ -+module_init(tp_init); -+module_exit(tp_exit); -diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig -index 0840d27381ea..73aba9a31064 100644 ---- a/drivers/tty/Kconfig -+++ b/drivers/tty/Kconfig -@@ -75,6 +75,19 @@ config VT_CONSOLE_SLEEP - def_bool y - depends on VT_CONSOLE && PM_SLEEP - -+config NR_TTY_DEVICES -+ int "Maximum tty device number" -+ depends on VT -+ range 12 63 -+ default 63 -+ ---help--- -+ This option is used to change the number of tty devices in /dev. 
-+ The default value is 63. The lowest number you can set is 12, -+ 63 is also the upper limit so we don't overrun the serial -+ consoles. -+ -+ If unsure, say 63. -+ - config HW_CONSOLE - bool - depends on VT && !UML -diff --git a/fs/exec.c b/fs/exec.c -index 65eaacaba4f4..1d3b310bd5f0 100644 ---- a/fs/exec.c -+++ b/fs/exec.c -@@ -63,6 +63,8 @@ - #include - #include - -+#include -+ - #include - #include - #include -@@ -866,9 +868,12 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags) - if (err) - goto exit; - -- if (name->name[0] != '\0') -+ if (name->name[0] != '\0') { - fsnotify_open(file); - -+ trace_open_exec(name->name); -+ } -+ - out: - return file; - -diff --git a/fs/open.c b/fs/open.c -index cb81623a8b09..a92b0f6061ac 100644 ---- a/fs/open.c -+++ b/fs/open.c -@@ -34,6 +34,9 @@ - - #include "internal.h" - -+#define CREATE_TRACE_POINTS -+#include -+ - int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, - struct file *filp) - { -@@ -1068,6 +1071,7 @@ long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) - } else { - fsnotify_open(f); - fd_install(fd, f); -+ trace_do_sys_open(tmp->name, flags, mode); - } - } - putname(tmp); -diff --git a/include/trace/events/fs.h b/include/trace/events/fs.h -new file mode 100644 -index 000000000000..fb634b74adf3 ---- /dev/null -+++ b/include/trace/events/fs.h -@@ -0,0 +1,53 @@ -+#undef TRACE_SYSTEM -+#define TRACE_SYSTEM fs -+ -+#if !defined(_TRACE_FS_H) || defined(TRACE_HEADER_MULTI_READ) -+#define _TRACE_FS_H -+ -+#include -+#include -+ -+TRACE_EVENT(do_sys_open, -+ -+ TP_PROTO(const char *filename, int flags, int mode), -+ -+ TP_ARGS(filename, flags, mode), -+ -+ TP_STRUCT__entry( -+ __string( filename, filename ) -+ __field( int, flags ) -+ __field( int, mode ) -+ ), -+ -+ TP_fast_assign( -+ __assign_str(filename, filename); -+ __entry->flags = flags; -+ __entry->mode = mode; -+ ), -+ -+ TP_printk("\"%s\" %x %o", -+ __get_str(filename), 
__entry->flags, __entry->mode) -+); -+ -+TRACE_EVENT(open_exec, -+ -+ TP_PROTO(const char *filename), -+ -+ TP_ARGS(filename), -+ -+ TP_STRUCT__entry( -+ __string( filename, filename ) -+ ), -+ -+ TP_fast_assign( -+ __assign_str(filename, filename); -+ ), -+ -+ TP_printk("\"%s\"", -+ __get_str(filename)) -+); -+ -+#endif /* _TRACE_FS_H */ -+ -+/* This part must be outside protection */ -+#include -diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h -index 79226ca8f80f..2a30060e7e1d 100644 ---- a/include/linux/blkdev.h -+++ b/include/linux/blkdev.h -@@ -47,7 +47,11 @@ struct blk_queue_stats; - struct blk_stat_callback; - - #define BLKDEV_MIN_RQ 4 -+#ifdef CONFIG_ZENIFY -+#define BLKDEV_MAX_RQ 512 -+#else - #define BLKDEV_MAX_RQ 128 /* Default maximum */ -+#endif - - /* Must be consistent with blk_mq_poll_stats_bkt() */ - #define BLK_MQ_POLL_STATS_BKTS 16 -diff --git a/include/linux/thinkpad_ec.h b/include/linux/thinkpad_ec.h -new file mode 100644 -index 000000000000..1b80d7ee5493 ---- /dev/null -+++ b/include/linux/thinkpad_ec.h -@@ -0,0 +1,47 @@ -+/* -+ * thinkpad_ec.h - interface to ThinkPad embedded controller LPC3 functions -+ * -+ * Copyright (C) 2005 Shem Multinymous -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. 
-+ * -+ * You should have received a copy of the GNU General Public License -+ * along with this program; if not, write to the Free Software -+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -+ */ -+ -+#ifndef _THINKPAD_EC_H -+#define _THINKPAD_EC_H -+ -+#ifdef __KERNEL__ -+ -+#define TP_CONTROLLER_ROW_LEN 16 -+ -+/* EC transactions input and output (possibly partial) vectors of 16 bytes. */ -+struct thinkpad_ec_row { -+ u16 mask; /* bitmap of which entries of val[] are meaningful */ -+ u8 val[TP_CONTROLLER_ROW_LEN]; -+}; -+ -+extern int __must_check thinkpad_ec_lock(void); -+extern int __must_check thinkpad_ec_try_lock(void); -+extern void thinkpad_ec_unlock(void); -+ -+extern int thinkpad_ec_read_row(const struct thinkpad_ec_row *args, -+ struct thinkpad_ec_row *data); -+extern int thinkpad_ec_try_read_row(const struct thinkpad_ec_row *args, -+ struct thinkpad_ec_row *mask); -+extern int thinkpad_ec_prefetch_row(const struct thinkpad_ec_row *args); -+extern void thinkpad_ec_invalidate(void); -+ -+ -+#endif /* __KERNEL */ -+#endif /* _THINKPAD_EC_H */ -diff --git a/include/uapi/linux/vt.h b/include/uapi/linux/vt.h -index e9d39c48520a..3bceead8da40 100644 ---- a/include/uapi/linux/vt.h -+++ b/include/uapi/linux/vt.h -@@ -3,12 +3,25 @@ - #define _UAPI_LINUX_VT_H - - -+/* -+ * We will make this definition solely for the purpose of making packages -+ * such as splashutils build, because they can not understand that -+ * NR_TTY_DEVICES is defined in the kernel configuration. -+ */ -+#ifndef CONFIG_NR_TTY_DEVICES -+#define CONFIG_NR_TTY_DEVICES 63 -+#endif -+ - /* - * These constants are also useful for user-level apps (e.g., VC - * resizing). 
- */ - #define MIN_NR_CONSOLES 1 /* must be at least 1 */ --#define MAX_NR_CONSOLES 63 /* serial lines start at 64 */ -+/* -+ * NR_TTY_DEVICES: -+ * Value MUST be at least 12 and must never be higher then 63 -+ */ -+#define MAX_NR_CONSOLES CONFIG_NR_TTY_DEVICES /* serial lines start above this */ - /* Note: the ioctl VT_GETSTATE does not work for - consoles 16 and higher (since it returns a short) */ - -diff --git a/init/Kconfig b/init/Kconfig -index 041f3a022122..5ed70eb1ad3a 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -45,6 +45,38 @@ config THREAD_INFO_IN_TASK - - menu "General setup" - -+config ZENIFY -+ bool "A selection of patches from Zen/Liquorix kernel and additional tweaks for a better gaming experience" -+ default y -+ help -+ Tunes the kernel for responsiveness at the cost of throughput and power usage. -+ -+ --- Virtual Memory Subsystem --------------------------- -+ -+ Mem dirty before bg writeback..: 10 % -> 20 % -+ Mem dirty before sync writeback: 20 % -> 50 % -+ -+ --- Block Layer ---------------------------------------- -+ -+ Queue depth...............: 128 -> 512 -+ Default MQ scheduler......: mq-deadline -> bfq -+ -+ --- CFS CPU Scheduler ---------------------------------- -+ -+ Scheduling latency.............: 6 -> 3 ms -+ Minimal granularity............: 0.75 -> 0.3 ms -+ Wakeup granularity.............: 1 -> 0.5 ms -+ CPU migration cost.............: 0.5 -> 0.25 ms -+ Bandwidth slice size...........: 5 -> 3 ms -+ Ondemand fine upscaling limit..: 95 % -> 85 % -+ -+ --- MuQSS CPU Scheduler -------------------------------- -+ -+ Scheduling interval............: 6 -> 3 ms -+ ISO task max realtime use......: 70 % -> 25 % -+ Ondemand coarse upscaling limit: 80 % -> 45 % -+ Ondemand fine upscaling limit..: 95 % -> 45 % -+ - config BROKEN - bool - -@@ -1026,6 +1058,13 @@ config CC_OPTIMIZE_FOR_PERFORMANCE - with the "-O2" compiler flag for best performance and most - helpful compile-time warnings. 
- -+config CC_OPTIMIZE_HARDER -+ bool "Optimize harder" -+ help -+ This option will pass "-O3" to your compiler resulting in a -+ larger and faster kernel. The more complex optimizations also -+ increase compilation time and may affect stability. -+ - config CC_OPTIMIZE_FOR_SIZE - bool "Optimize for size" - help -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 2f0a0be4d344..bada807c7e59 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -37,8 +37,13 @@ - * - * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) - */ -+#ifdef CONFIG_ZENIFY -+unsigned int sysctl_sched_latency = 3000000ULL; -+static unsigned int normalized_sysctl_sched_latency = 3000000ULL; -+#else - unsigned int sysctl_sched_latency = 6000000ULL; - static unsigned int normalized_sysctl_sched_latency = 6000000ULL; -+#endif - - /* - * The initial- and re-scaling of tunables is configurable -@@ -58,13 +63,22 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_L - * - * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) - */ -+#ifdef CONFIG_ZENIFY -+unsigned int sysctl_sched_min_granularity = 300000ULL; -+static unsigned int normalized_sysctl_sched_min_granularity = 300000ULL; -+#else - unsigned int sysctl_sched_min_granularity = 750000ULL; - static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; -+#endif - - /* - * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity - */ -+#ifdef CONFIG_ZENIFY -+static unsigned int sched_nr_latency = 10; -+#else - static unsigned int sched_nr_latency = 8; -+#endif - - /* - * After fork, child runs first. 
If set to 0 (default) then -@@ -81,10 +95,17 @@ unsigned int sysctl_sched_child_runs_first __read_mostly; - * - * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) - */ -+#ifdef CONFIG_ZENIFY -+unsigned int sysctl_sched_wakeup_granularity = 500000UL; -+static unsigned int normalized_sysctl_sched_wakeup_granularity = 500000UL; -+ -+const_debug unsigned int sysctl_sched_migration_cost = 50000UL; -+#else - unsigned int sysctl_sched_wakeup_granularity = 1000000UL; - static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; - - const_debug unsigned int sysctl_sched_migration_cost = 500000UL; -+#endif - - #ifdef CONFIG_SMP - /* -@@ -107,8 +128,12 @@ int __weak arch_asym_cpu_priority(int cpu) - * - * (default: 5 msec, units: microseconds) - */ -+#ifdef CONFIG_ZENIFY -+unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL; -+#else - unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; - #endif -+#endif - - /* - * The margin used when comparing utilization with CPU capacity: -diff --git a/mm/page-writeback.c b/mm/page-writeback.c -index 337c6afb3345..9315e358f292 100644 ---- a/mm/page-writeback.c -+++ b/mm/page-writeback.c -@@ -71,7 +71,11 @@ static long ratelimit_pages = 32; - /* - * Start background writeback (via writeback threads) at this percentage - */ -+#ifdef CONFIG_ZENIFY -+int dirty_background_ratio = 20; -+#else - int dirty_background_ratio = 10; -+#endif - - /* - * dirty_background_bytes starts at 0 (disabled) so that it is a function of -@@ -88,7 +92,11 @@ int vm_highmem_is_dirtyable; - /* - * The generator of dirty data starts writeback at this percentage - */ -+#ifdef CONFIG_ZENIFY -+int vm_dirty_ratio = 50; -+#else - int vm_dirty_ratio = 20; -+#endif - - /* - * vm_dirty_bytes starts at 0 (disabled) so that it is a function of -diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig -index 80dad301361d..42b7fa7d01f8 100644 ---- a/net/ipv4/Kconfig -+++ b/net/ipv4/Kconfig -@@ -702,6 +702,9 @@ choice - config DEFAULT_VEGAS - bool "Vegas" 
if TCP_CONG_VEGAS=y - -+ config DEFAULT_YEAH -+ bool "YeAH" if TCP_CONG_YEAH=y -+ - config DEFAULT_VENO - bool "Veno" if TCP_CONG_VENO=y - -@@ -735,6 +738,7 @@ config DEFAULT_TCP_CONG - default "htcp" if DEFAULT_HTCP - default "hybla" if DEFAULT_HYBLA - default "vegas" if DEFAULT_VEGAS -+ default "yeah" if DEFAULT_YEAH - default "westwood" if DEFAULT_WESTWOOD - default "veno" if DEFAULT_VENO - default "reno" if DEFAULT_RENO - -From: Nick Desaulniers -Date: Mon, 24 Dec 2018 13:37:41 +0200 -Subject: include/linux/compiler*.h: define asm_volatile_goto - -asm_volatile_goto should also be defined for other compilers that -support asm goto. - -Fixes commit 815f0dd ("include/linux/compiler*.h: make compiler-*.h -mutually exclusive"). - -Signed-off-by: Nick Desaulniers -Signed-off-by: Miguel Ojeda - -diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h -index ba814f1..e77eeb0 100644 ---- a/include/linux/compiler_types.h -+++ b/include/linux/compiler_types.h -@@ -188,6 +188,10 @@ struct ftrace_likely_data { - #define asm_volatile_goto(x...) asm goto(x) - #endif - -+#ifndef asm_volatile_goto -+#define asm_volatile_goto(x...) asm goto(x) -+#endif -+ - /* Are two types/vars the same type (ignoring qualifiers)? */ - #define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) - -From: Andy Lavr -Date: Mon, 24 Dec 2018 14:57:47 +0200 -Subject: avl: Use [defer+madvise] as default khugepaged defrag strategy - -For some reason, the default strategy to respond to THP fault fallbacks -is still just madvise, meaning stall if the program wants transparent -hugepages, but don't trigger a background reclaim / compaction if THP -begins to fail allocations. This creates a snowball affect where we -still use the THP code paths, but we almost always fail once a system -has been active and busy for a while. - -The option "defer" was created for interactive systems where THP can -still improve performance. 
If we have to fallback to a regular page due -to an allocation failure or anything else, we will trigger a background -reclaim and compaction so future THP attempts succeed and previous -attempts eventually have their smaller pages combined without stalling -running applications. - -We still want madvise to stall applications that explicitely want THP, -so defer+madvise _does_ make a ton of sense. Make it the default for -interactive systems, especially if the kernel maintainer left -transparent hugepages on "always". - -Reasoning and details in the original patch: -https://lwn.net/Articles/711248/ - -Signed-off-by: Andy Lavr - -diff --git a/mm/huge_memory.c b/mm/huge_memory.c -index e84a10b..21d62b7 100644 ---- a/mm/huge_memory.c -+++ b/mm/huge_memory.c -@@ -53,7 +53,11 @@ unsigned long transparent_hugepage_flags __read_mostly = - #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE - (1<hw.mac; - struct e1000_phy_info *phy = &adapter->hw.phy; - struct e1000_ring *tx_ring = adapter->tx_ring; -- u32 dmoff_exit_timeout = 100, tries = 0; - struct e1000_hw *hw = &adapter->hw; -+ u32 link, tctl; -- u32 link, tctl, pcim_state; - - if (test_bit(__E1000_DOWN, &adapter->state)) - return; -@@ -5188,21 +5187,6 @@ static void e1000_watchdog_task(struct work_struct *work) - /* Cancel scheduled suspend requests. 
*/ - pm_runtime_resume(netdev->dev.parent); - -- /* Checking if MAC is in DMoff state*/ -- pcim_state = er32(STATUS); -- while (pcim_state & E1000_STATUS_PCIM_STATE) { -- if (tries++ == dmoff_exit_timeout) { -- e_dbg("Error in exiting dmoff\n"); -- break; -- } -- usleep_range(10000, 20000); -- pcim_state = er32(STATUS); -- -- /* Checking if MAC exited DMoff state */ -- if (!(pcim_state & E1000_STATUS_PCIM_STATE)) -- e1000_phy_hw_reset(&adapter->hw); -- } -- - /* update snapshot of PHY registers on LSC */ - e1000_phy_read_status(adapter); - mac->ops.get_link_up_info(&adapter->hw, -From adb1f9df27f08e6488bcd80b1607987c6114a77a Mon Sep 17 00:00:00 2001 -From: Alexandre Frade -Date: Mon, 25 Nov 2019 15:13:06 -0300 -Subject: [PATCH] elevator: set default scheduler to bfq for blk-mq - -Signed-off-by: Alexandre Frade ---- - block/elevator.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/block/elevator.c b/block/elevator.c -index 076ba7308e65..81f89095aa77 100644 ---- a/block/elevator.c -+++ b/block/elevator.c -@@ -623,15 +623,15 @@ static inline bool elv_support_iosched(struct request_queue *q) - } - - /* -- * For single queue devices, default to using mq-deadline. If we have multiple -- * queues or mq-deadline is not available, default to "none". -+ * For single queue devices, default to using bfq. If we have multiple -+ * queues or bfq is not available, default to "none". 
- */ - static struct elevator_type *elevator_get_default(struct request_queue *q) - { - if (q->nr_hw_queues != 1) - return NULL; - -- return elevator_get(q, "mq-deadline", false); -+ return elevator_get(q, "bfq", false); - } - - /* -From c3ec05777c46e19a8a26d0fc4ca0c0db8a19de97 Mon Sep 17 00:00:00 2001 -From: Alexandre Frade -Date: Fri, 10 May 2019 16:45:59 -0300 -Subject: [PATCH] block: set rq_affinity = 2 for full multithreading I/O - requests - -Signed-off-by: Alexandre Frade ---- - include/linux/blkdev.h | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - -diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h -index f3ea78b0c91c..4dbacc6b073b 100644 ---- a/include/linux/blkdev.h -+++ b/include/linux/blkdev.h -@@ -621,7 +621,8 @@ struct request_queue { - #define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */ - - #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ -- (1 << QUEUE_FLAG_SAME_COMP)) -+ (1 << QUEUE_FLAG_SAME_COMP) | \ -+ (1 << QUEUE_FLAG_SAME_FORCE)) - - void blk_queue_flag_set(unsigned int flag, struct request_queue *q); - void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); -From 8171d33d0b84a953649863538fdbe4c26c035e4f Mon Sep 17 00:00:00 2001 -From: Alexandre Frade -Date: Fri, 10 May 2019 14:32:50 -0300 -Subject: [PATCH] mm: set 2 megabytes for address_space-level file read-ahead - pages size - -Signed-off-by: Alexandre Frade ---- - include/linux/mm.h | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/include/linux/mm.h b/include/linux/mm.h -index a2adf95b3f9c..e804d9f7583a 100644 ---- a/include/linux/mm.h -+++ b/include/linux/mm.h -@@ -2416,7 +2416,7 @@ int __must_check write_one_page(struct page *page); - void task_dirty_inc(struct task_struct *tsk); - - /* readahead.c */ --#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) -+#define VM_READAHEAD_PAGES (SZ_2M / PAGE_SIZE) - - int force_page_cache_readahead(struct address_space *mapping, struct file *filp, - pgoff_t offset, 
unsigned long nr_to_read); diff --git a/linux54-tkg/linux54-tkg-patches/0003-glitched-cfs.patch b/linux54-tkg/linux54-tkg-patches/0003-glitched-cfs.patch deleted file mode 100644 index 06b7f02..0000000 --- a/linux54-tkg/linux54-tkg-patches/0003-glitched-cfs.patch +++ /dev/null @@ -1,72 +0,0 @@ -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 2a202a846757..1d9c7ed79b11 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -4,7 +4,7 @@ - - choice - prompt "Timer frequency" -- default HZ_250 -+ default HZ_500 - help - Allows the configuration of the timer frequency. It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -39,6 +39,13 @@ choice - on SMP and NUMA systems and exactly dividing by both PAL and - NTSC frame rates for video and multimedia work. - -+ config HZ_500 -+ bool "500 HZ" -+ help -+ 500 Hz is a balanced timer frequency. Provides fast interactivity -+ on desktops with great smoothness without increasing CPU power -+ consumption and sacrificing the battery life on laptops. -+ - config HZ_1000 - bool "1000 HZ" - help -@@ -52,6 +59,7 @@ config HZ - default 100 if HZ_100 - default 250 if HZ_250 - default 300 if HZ_300 -+ default 500 if HZ_500 - default 1000 if HZ_1000 - - config SCHED_HRTICK - -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 2a202a846757..1d9c7ed79b11 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -4,7 +4,7 @@ - - choice - prompt "Timer frequency" -- default HZ_500 -+ default HZ_750 - help - Allows the configuration of the timer frequency. It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -46,6 +46,13 @@ choice - on desktops with great smoothness without increasing CPU power - consumption and sacrificing the battery life on laptops. - -+ config HZ_750 -+ bool "750 HZ" -+ help -+ 750 Hz is a good timer frequency for desktops. Provides fast -+ interactivity with great smoothness without sacrificing too -+ much throughput. 
-+ - config HZ_1000 - bool "1000 HZ" - help -@@ -60,6 +67,7 @@ config HZ - default 250 if HZ_250 - default 300 if HZ_300 - default 500 if HZ_500 -+ default 750 if HZ_750 - default 1000 if HZ_1000 - - config SCHED_HRTICK - diff --git a/linux54-tkg/linux54-tkg-patches/0004-5.4-ck1.patch b/linux54-tkg/linux54-tkg-patches/0004-5.4-ck1.patch deleted file mode 100644 index f3fbde8..0000000 --- a/linux54-tkg/linux54-tkg-patches/0004-5.4-ck1.patch +++ /dev/null @@ -1,17684 +0,0 @@ -diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index 8dee8f68fe15..e56fb275f607 100644 ---- a/Documentation/admin-guide/kernel-parameters.txt -+++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -4277,6 +4277,14 @@ - Memory area to be used by remote processor image, - managed by CMA. - -+ rqshare= [X86] Select the MuQSS scheduler runqueue sharing type. -+ Format: -+ smt -- Share SMT (hyperthread) sibling runqueues -+ mc -- Share MC (multicore) sibling runqueues -+ smp -- Share SMP runqueues -+ none -- So not share any runqueues -+ Default value is mc -+ - rw [KNL] Mount root device read-write on boot - - S [KNL] Run init in single mode -diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst -index 032c7cd3cede..ff41dfacb34b 100644 ---- a/Documentation/admin-guide/sysctl/kernel.rst -+++ b/Documentation/admin-guide/sysctl/kernel.rst -@@ -46,6 +46,7 @@ show up in /proc/sys/kernel: - - hung_task_check_interval_secs - - hung_task_warnings - - hyperv_record_panic_msg -+- iso_cpu - - kexec_load_disabled - - kptr_restrict - - l2cr [ PPC only ] -@@ -82,6 +83,7 @@ show up in /proc/sys/kernel: - - randomize_va_space - - real-root-dev ==> Documentation/admin-guide/initrd.rst - - reboot-cmd [ SPARC only ] -+- rr_interval - - rtsig-max - - rtsig-nr - - sched_energy_aware -@@ -105,6 +107,7 @@ show up in /proc/sys/kernel: - - unknown_nmi_panic - - watchdog - - watchdog_thresh -+- yield_type - - 
version - - -@@ -438,6 +441,16 @@ When kptr_restrict is set to (2), kernel pointers printed using - %pK will be replaced with 0's regardless of privileges. - - -+iso_cpu: (MuQSS CPU scheduler only) -+=================================== -+ -+This sets the percentage cpu that the unprivileged SCHED_ISO tasks can -+run effectively at realtime priority, averaged over a rolling five -+seconds over the -whole- system, meaning all cpus. -+ -+Set to 70 (percent) by default. -+ -+ - l2cr: (PPC only) - ================ - -@@ -905,6 +918,20 @@ ROM/Flash boot loader. Maybe to tell it what to do after - rebooting. ??? - - -+rr_interval: (MuQSS CPU scheduler only) -+======================================= -+ -+This is the smallest duration that any cpu process scheduling unit -+will run for. Increasing this value can increase throughput of cpu -+bound tasks substantially but at the expense of increased latencies -+overall. Conversely decreasing it will decrease average and maximum -+latencies but at the expense of throughput. This value is in -+milliseconds and the default value chosen depends on the number of -+cpus available at scheduler initialisation with a minimum of 6. -+ -+Valid values are from 1-1000. -+ -+ - rtsig-max & rtsig-nr: - ===================== - -@@ -1175,3 +1202,13 @@ is 10 seconds. - - The softlockup threshold is (2 * watchdog_thresh). Setting this - tunable to zero will disable lockup detection altogether. -+ -+ -+yield_type: (MuQSS CPU scheduler only) -+====================================== -+ -+This determines what type of yield calls to sched_yield will perform. -+ -+ 0: No yield. -+ 1: Yield only to better priority/deadline tasks. (default) -+ 2: Expire timeslice and recalculate deadline. 
-diff --git a/Documentation/scheduler/sched-BFS.txt b/Documentation/scheduler/sched-BFS.txt -new file mode 100644 -index 000000000000..c0282002a079 ---- /dev/null -+++ b/Documentation/scheduler/sched-BFS.txt -@@ -0,0 +1,351 @@ -+BFS - The Brain Fuck Scheduler by Con Kolivas. -+ -+Goals. -+ -+The goal of the Brain Fuck Scheduler, referred to as BFS from here on, is to -+completely do away with the complex designs of the past for the cpu process -+scheduler and instead implement one that is very simple in basic design. -+The main focus of BFS is to achieve excellent desktop interactivity and -+responsiveness without heuristics and tuning knobs that are difficult to -+understand, impossible to model and predict the effect of, and when tuned to -+one workload cause massive detriment to another. -+ -+ -+Design summary. -+ -+BFS is best described as a single runqueue, O(n) lookup, earliest effective -+virtual deadline first design, loosely based on EEVDF (earliest eligible virtual -+deadline first) and my previous Staircase Deadline scheduler. Each component -+shall be described in order to understand the significance of, and reasoning for -+it. The codebase when the first stable version was released was approximately -+9000 lines less code than the existing mainline linux kernel scheduler (in -+2.6.31). This does not even take into account the removal of documentation and -+the cgroups code that is not used. -+ -+Design reasoning. -+ -+The single runqueue refers to the queued but not running processes for the -+entire system, regardless of the number of CPUs. 
The reason for going back to -+a single runqueue design is that once multiple runqueues are introduced, -+per-CPU or otherwise, there will be complex interactions as each runqueue will -+be responsible for the scheduling latency and fairness of the tasks only on its -+own runqueue, and to achieve fairness and low latency across multiple CPUs, any -+advantage in throughput of having CPU local tasks causes other disadvantages. -+This is due to requiring a very complex balancing system to at best achieve some -+semblance of fairness across CPUs and can only maintain relatively low latency -+for tasks bound to the same CPUs, not across them. To increase said fairness -+and latency across CPUs, the advantage of local runqueue locking, which makes -+for better scalability, is lost due to having to grab multiple locks. -+ -+A significant feature of BFS is that all accounting is done purely based on CPU -+used and nowhere is sleep time used in any way to determine entitlement or -+interactivity. Interactivity "estimators" that use some kind of sleep/run -+algorithm are doomed to fail to detect all interactive tasks, and to falsely tag -+tasks that aren't interactive as being so. The reason for this is that it is -+close to impossible to determine that when a task is sleeping, whether it is -+doing it voluntarily, as in a userspace application waiting for input in the -+form of a mouse click or otherwise, or involuntarily, because it is waiting for -+another thread, process, I/O, kernel activity or whatever. Thus, such an -+estimator will introduce corner cases, and more heuristics will be required to -+cope with those corner cases, introducing more corner cases and failed -+interactivity detection and so on. 
Interactivity in BFS is built into the design -+by virtue of the fact that tasks that are waking up have not used up their quota -+of CPU time, and have earlier effective deadlines, thereby making it very likely -+they will preempt any CPU bound task of equivalent nice level. See below for -+more information on the virtual deadline mechanism. Even if they do not preempt -+a running task, because the rr interval is guaranteed to have a bound upper -+limit on how long a task will wait for, it will be scheduled within a timeframe -+that will not cause visible interface jitter. -+ -+ -+Design details. -+ -+Task insertion. -+ -+BFS inserts tasks into each relevant queue as an O(1) insertion into a double -+linked list. On insertion, *every* running queue is checked to see if the newly -+queued task can run on any idle queue, or preempt the lowest running task on the -+system. This is how the cross-CPU scheduling of BFS achieves significantly lower -+latency per extra CPU the system has. In this case the lookup is, in the worst -+case scenario, O(n) where n is the number of CPUs on the system. -+ -+Data protection. -+ -+BFS has one single lock protecting the process local data of every task in the -+global queue. Thus every insertion, removal and modification of task data in the -+global runqueue needs to grab the global lock. However, once a task is taken by -+a CPU, the CPU has its own local data copy of the running process' accounting -+information which only that CPU accesses and modifies (such as during a -+timer tick) thus allowing the accounting data to be updated lockless. Once a -+CPU has taken a task to run, it removes it from the global queue. Thus the -+global queue only ever has, at most, -+ -+ (number of tasks requesting cpu time) - (number of logical CPUs) + 1 -+ -+tasks in the global queue. This value is relevant for the time taken to look up -+tasks during scheduling. 
This will increase if many tasks with CPU affinity set -+in their policy to limit which CPUs they're allowed to run on if they outnumber -+the number of CPUs. The +1 is because when rescheduling a task, the CPU's -+currently running task is put back on the queue. Lookup will be described after -+the virtual deadline mechanism is explained. -+ -+Virtual deadline. -+ -+The key to achieving low latency, scheduling fairness, and "nice level" -+distribution in BFS is entirely in the virtual deadline mechanism. The one -+tunable in BFS is the rr_interval, or "round robin interval". This is the -+maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy) -+tasks of the same nice level will be running for, or looking at it the other -+way around, the longest duration two tasks of the same nice level will be -+delayed for. When a task requests cpu time, it is given a quota (time_slice) -+equal to the rr_interval and a virtual deadline. The virtual deadline is -+offset from the current time in jiffies by this equation: -+ -+ jiffies + (prio_ratio * rr_interval) -+ -+The prio_ratio is determined as a ratio compared to the baseline of nice -20 -+and increases by 10% per nice level. The deadline is a virtual one only in that -+no guarantee is placed that a task will actually be scheduled by this time, but -+it is used to compare which task should go next. There are three components to -+how a task is next chosen. First is time_slice expiration. If a task runs out -+of its time_slice, it is descheduled, the time_slice is refilled, and the -+deadline reset to that formula above. Second is sleep, where a task no longer -+is requesting CPU for whatever reason. The time_slice and deadline are _not_ -+adjusted in this case and are just carried over for when the task is next -+scheduled. 
Third is preemption, and that is when a newly waking task is deemed -+higher priority than a currently running task on any cpu by virtue of the fact -+that it has an earlier virtual deadline than the currently running task. The -+earlier deadline is the key to which task is next chosen for the first and -+second cases. Once a task is descheduled, it is put back on the queue, and an -+O(n) lookup of all queued-but-not-running tasks is done to determine which has -+the earliest deadline and that task is chosen to receive CPU next. -+ -+The CPU proportion of different nice tasks works out to be approximately the -+ -+ (prio_ratio difference)^2 -+ -+The reason it is squared is that a task's deadline does not change while it is -+running unless it runs out of time_slice. Thus, even if the time actually -+passes the deadline of another task that is queued, it will not get CPU time -+unless the current running task deschedules, and the time "base" (jiffies) is -+constantly moving. -+ -+Task lookup. -+ -+BFS has 103 priority queues. 100 of these are dedicated to the static priority -+of realtime tasks, and the remaining 3 are, in order of best to worst priority, -+SCHED_ISO (isochronous), SCHED_NORMAL, and SCHED_IDLEPRIO (idle priority -+scheduling). When a task of these priorities is queued, a bitmap of running -+priorities is set showing which of these priorities has tasks waiting for CPU -+time. When a CPU is made to reschedule, the lookup for the next task to get -+CPU time is performed in the following way: -+ -+First the bitmap is checked to see what static priority tasks are queued. If -+any realtime priorities are found, the corresponding queue is checked and the -+first task listed there is taken (provided CPU affinity is suitable) and lookup -+is complete. If the priority corresponds to a SCHED_ISO task, they are also -+taken in FIFO order (as they behave like SCHED_RR). 
If the priority corresponds -+to either SCHED_NORMAL or SCHED_IDLEPRIO, then the lookup becomes O(n). At this -+stage, every task in the runlist that corresponds to that priority is checked -+to see which has the earliest set deadline, and (provided it has suitable CPU -+affinity) it is taken off the runqueue and given the CPU. If a task has an -+expired deadline, it is taken and the rest of the lookup aborted (as they are -+chosen in FIFO order). -+ -+Thus, the lookup is O(n) in the worst case only, where n is as described -+earlier, as tasks may be chosen before the whole task list is looked over. -+ -+ -+Scalability. -+ -+The major limitations of BFS will be that of scalability, as the separate -+runqueue designs will have less lock contention as the number of CPUs rises. -+However they do not scale linearly even with separate runqueues as multiple -+runqueues will need to be locked concurrently on such designs to be able to -+achieve fair CPU balancing, to try and achieve some sort of nice-level fairness -+across CPUs, and to achieve low enough latency for tasks on a busy CPU when -+other CPUs would be more suited. BFS has the advantage that it requires no -+balancing algorithm whatsoever, as balancing occurs by proxy simply because -+all CPUs draw off the global runqueue, in priority and deadline order. Despite -+the fact that scalability is _not_ the prime concern of BFS, it both shows very -+good scalability to smaller numbers of CPUs and is likely a more scalable design -+at these numbers of CPUs. -+ -+It also has some very low overhead scalability features built into the design -+when it has been deemed their overhead is so marginal that they're worth adding. -+The first is the local copy of the running process' data to the CPU it's running -+on to allow that data to be updated lockless where possible. 
Then there is -+deference paid to the last CPU a task was running on, by trying that CPU first -+when looking for an idle CPU to use the next time it's scheduled. Finally there -+is the notion of cache locality beyond the last running CPU. The sched_domains -+information is used to determine the relative virtual "cache distance" that -+other CPUs have from the last CPU a task was running on. CPUs with shared -+caches, such as SMT siblings, or multicore CPUs with shared caches, are treated -+as cache local. CPUs without shared caches are treated as not cache local, and -+CPUs on different NUMA nodes are treated as very distant. This "relative cache -+distance" is used by modifying the virtual deadline value when doing lookups. -+Effectively, the deadline is unaltered between "cache local" CPUs, doubled for -+"cache distant" CPUs, and quadrupled for "very distant" CPUs. The reasoning -+behind the doubling of deadlines is as follows. The real cost of migrating a -+task from one CPU to another is entirely dependant on the cache footprint of -+the task, how cache intensive the task is, how long it's been running on that -+CPU to take up the bulk of its cache, how big the CPU cache is, how fast and -+how layered the CPU cache is, how fast a context switch is... and so on. In -+other words, it's close to random in the real world where we do more than just -+one sole workload. The only thing we can be sure of is that it's not free. So -+BFS uses the principle that an idle CPU is a wasted CPU and utilising idle CPUs -+is more important than cache locality, and cache locality only plays a part -+after that. Doubling the effective deadline is based on the premise that the -+"cache local" CPUs will tend to work on the same tasks up to double the number -+of cache local CPUs, and once the workload is beyond that amount, it is likely -+that none of the tasks are cache warm anywhere anyway. The quadrupling for NUMA -+is a value I pulled out of my arse. 
-+ -+When choosing an idle CPU for a waking task, the cache locality is determined -+according to where the task last ran and then idle CPUs are ranked from best -+to worst to choose the most suitable idle CPU based on cache locality, NUMA -+node locality and hyperthread sibling business. They are chosen in the -+following preference (if idle): -+ -+* Same core, idle or busy cache, idle threads -+* Other core, same cache, idle or busy cache, idle threads. -+* Same node, other CPU, idle cache, idle threads. -+* Same node, other CPU, busy cache, idle threads. -+* Same core, busy threads. -+* Other core, same cache, busy threads. -+* Same node, other CPU, busy threads. -+* Other node, other CPU, idle cache, idle threads. -+* Other node, other CPU, busy cache, idle threads. -+* Other node, other CPU, busy threads. -+ -+This shows the SMT or "hyperthread" awareness in the design as well which will -+choose a real idle core first before a logical SMT sibling which already has -+tasks on the physical CPU. -+ -+Early benchmarking of BFS suggested scalability dropped off at the 16 CPU mark. -+However this benchmarking was performed on an earlier design that was far less -+scalable than the current one so it's hard to know how scalable it is in terms -+of both CPUs (due to the global runqueue) and heavily loaded machines (due to -+O(n) lookup) at this stage. Note that in terms of scalability, the number of -+_logical_ CPUs matters, not the number of _physical_ CPUs. Thus, a dual (2x) -+quad core (4X) hyperthreaded (2X) machine is effectively a 16X. Newer benchmark -+results are very promising indeed, without needing to tweak any knobs, features -+or options. Benchmark contributions are most welcome. -+ -+ -+Features -+ -+As the initial prime target audience for BFS was the average desktop user, it -+was designed to not need tweaking, tuning or have features set to obtain benefit -+from it. 
Thus the number of knobs and features has been kept to an absolute -+minimum and should not require extra user input for the vast majority of cases. -+There are precisely 2 tunables, and 2 extra scheduling policies. The rr_interval -+and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO policies. In addition -+to this, BFS also uses sub-tick accounting. What BFS does _not_ now feature is -+support for CGROUPS. The average user should neither need to know what these -+are, nor should they need to be using them to have good desktop behaviour. -+ -+rr_interval -+ -+There is only one "scheduler" tunable, the round robin interval. This can be -+accessed in -+ -+ /proc/sys/kernel/rr_interval -+ -+The value is in milliseconds, and the default value is set to 6 on a -+uniprocessor machine, and automatically set to a progressively higher value on -+multiprocessor machines. The reasoning behind increasing the value on more CPUs -+is that the effective latency is decreased by virtue of there being more CPUs on -+BFS (for reasons explained above), and increasing the value allows for less -+cache contention and more throughput. Valid values are from 1 to 1000. -+Decreasing the value will decrease latencies at the cost of decreasing -+throughput, while increasing it will improve throughput, but at the cost of -+worsening latencies. The accuracy of the rr interval is limited by HZ resolution -+of the kernel configuration. Thus, the worst case latencies are usually slightly -+higher than this actual value. The default value of 6 is not an arbitrary one. -+It is based on the fact that humans can detect jitter at approximately 7ms, so -+aiming for much lower latencies is pointless under most circumstances. It is -+worth noting this fact when comparing the latency performance of BFS to other -+schedulers. Worst case latencies being higher than 7ms are far worse than -+average latencies not being in the microsecond range. -+ -+Isochronous scheduling. 
-+ -+Isochronous scheduling is a unique scheduling policy designed to provide -+near-real-time performance to unprivileged (ie non-root) users without the -+ability to starve the machine indefinitely. Isochronous tasks (which means -+"same time") are set using, for example, the schedtool application like so: -+ -+ schedtool -I -e amarok -+ -+This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works -+is that it has a priority level between true realtime tasks and SCHED_NORMAL -+which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie, -+if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval -+rate). However if ISO tasks run for more than a tunable finite amount of time, -+they are then demoted back to SCHED_NORMAL scheduling. This finite amount of -+time is the percentage of _total CPU_ available across the machine, configurable -+as a percentage in the following "resource handling" tunable (as opposed to a -+scheduler tunable): -+ -+ /proc/sys/kernel/iso_cpu -+ -+and is set to 70% by default. It is calculated over a rolling 5 second average. -+Because it is the total CPU available, it means that on a multi CPU machine, it -+is possible to have an ISO task running as realtime scheduling indefinitely on -+just one CPU, as the other CPUs will be available. Setting this to 100 is the -+equivalent of giving all users SCHED_RR access and setting it to 0 removes the -+ability to run any pseudo-realtime tasks. -+ -+A feature of BFS is that it detects when an application tries to obtain a -+realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the -+appropriate privileges to use those policies. When it detects this, it will -+give the task SCHED_ISO policy instead. Thus it is transparent to the user. 
-+Because some applications constantly set their policy as well as their nice -+level, there is potential for them to undo the override specified by the user -+on the command line of setting the policy to SCHED_ISO. To counter this, once -+a task has been set to SCHED_ISO policy, it needs superuser privileges to set -+it back to SCHED_NORMAL. This will ensure the task remains ISO and all child -+processes and threads will also inherit the ISO policy. -+ -+Idleprio scheduling. -+ -+Idleprio scheduling is a scheduling policy designed to give out CPU to a task -+_only_ when the CPU would be otherwise idle. The idea behind this is to allow -+ultra low priority tasks to be run in the background that have virtually no -+effect on the foreground tasks. This is ideally suited to distributed computing -+clients (like setiathome, folding, mprime etc) but can also be used to start -+a video encode or so on without any slowdown of other tasks. To avoid this -+policy from grabbing shared resources and holding them indefinitely, if it -+detects a state where the task is waiting on I/O, the machine is about to -+suspend to ram and so on, it will transiently schedule them as SCHED_NORMAL. As -+per the Isochronous task management, once a task has been scheduled as IDLEPRIO, -+it cannot be put back to SCHED_NORMAL without superuser privileges. Tasks can -+be set to start as SCHED_IDLEPRIO with the schedtool command like so: -+ -+ schedtool -D -e ./mprime -+ -+Subtick accounting. -+ -+It is surprisingly difficult to get accurate CPU accounting, and in many cases, -+the accounting is done by simply determining what is happening at the precise -+moment a timer tick fires off. This becomes increasingly inaccurate as the -+timer tick frequency (HZ) is lowered. It is possible to create an application -+which uses almost 100% CPU, yet by being descheduled at the right time, records -+zero CPU usage. 
While the main problem with this is that there are possible -+security implications, it is also difficult to determine how much CPU a task -+really does use. BFS tries to use the sub-tick accounting from the TSC clock, -+where possible, to determine real CPU usage. This is not entirely reliable, but -+is far more likely to produce accurate CPU usage data than the existing designs -+and will not show tasks as consuming no CPU usage when they actually are. Thus, -+the amount of CPU reported as being used by BFS will more accurately represent -+how much CPU the task itself is using (as is shown for example by the 'time' -+application), so the reported values may be quite different to other schedulers. -+Values reported as the 'load' are more prone to problems with this design, but -+per process values are closer to real usage. When comparing throughput of BFS -+to other designs, it is important to compare the actual completed work in terms -+of total wall clock time taken and total work done, rather than the reported -+"cpu usage". -+ -+ -+Con Kolivas Fri Aug 27 2010 -diff --git a/Documentation/scheduler/sched-MuQSS.txt b/Documentation/scheduler/sched-MuQSS.txt -new file mode 100644 -index 000000000000..ae28b85c9995 ---- /dev/null -+++ b/Documentation/scheduler/sched-MuQSS.txt -@@ -0,0 +1,373 @@ -+MuQSS - The Multiple Queue Skiplist Scheduler by Con Kolivas. -+ -+MuQSS is a per-cpu runqueue variant of the original BFS scheduler with -+one 8 level skiplist per runqueue, and fine grained locking for much more -+scalability. -+ -+ -+Goals. -+ -+The goal of the Multiple Queue Skiplist Scheduler, referred to as MuQSS from -+here on (pronounced mux) is to completely do away with the complex designs of -+the past for the cpu process scheduler and instead implement one that is very -+simple in basic design. 
The main focus of MuQSS is to achieve excellent desktop -+interactivity and responsiveness without heuristics and tuning knobs that are -+difficult to understand, impossible to model and predict the effect of, and when -+tuned to one workload cause massive detriment to another, while still being -+scalable to many CPUs and processes. -+ -+ -+Design summary. -+ -+MuQSS is best described as per-cpu multiple runqueue, O(log n) insertion, O(1) -+lookup, earliest effective virtual deadline first tickless design, loosely based -+on EEVDF (earliest eligible virtual deadline first) and my previous Staircase -+Deadline scheduler, and evolved from the single runqueue O(n) BFS scheduler. -+Each component shall be described in order to understand the significance of, -+and reasoning for it. -+ -+ -+Design reasoning. -+ -+In BFS, the use of a single runqueue across all CPUs meant that each CPU would -+need to scan the entire runqueue looking for the process with the earliest -+deadline and schedule that next, regardless of which CPU it originally came -+from. This made BFS deterministic with respect to latency and provided -+guaranteed latencies dependent on number of processes and CPUs. The single -+runqueue, however, meant that all CPUs would compete for the single lock -+protecting it, which would lead to increasing lock contention as the number of -+CPUs rose and appeared to limit scalability of common workloads beyond 16 -+logical CPUs. Additionally, the O(n) lookup of the runqueue list obviously -+increased overhead proportionate to the number of queued processes and led to -+cache thrashing while iterating over the linked list. -+ -+MuQSS is an evolution of BFS, designed to maintain the same scheduling -+decision mechanism and be virtually deterministic without relying on the -+constrained design of the single runqueue by splitting out the single runqueue -+to be per-CPU and use skiplists instead of linked lists. 
-+ -+The original reason for going back to a single runqueue design for BFS was that -+once multiple runqueues are introduced, per-CPU or otherwise, there will be -+complex interactions as each runqueue will be responsible for the scheduling -+latency and fairness of the tasks only on its own runqueue, and to achieve -+fairness and low latency across multiple CPUs, any advantage in throughput of -+having CPU local tasks causes other disadvantages. This is due to requiring a -+very complex balancing system to at best achieve some semblance of fairness -+across CPUs and can only maintain relatively low latency for tasks bound to the -+same CPUs, not across them. To increase said fairness and latency across CPUs, -+the advantage of local runqueue locking, which makes for better scalability, is -+lost due to having to grab multiple locks. -+ -+MuQSS works around the problems inherent in multiple runqueue designs by -+making its skip lists priority ordered and through novel use of lockless -+examination of each other runqueue it can decide if it should take the earliest -+deadline task from another runqueue for latency reasons, or for CPU balancing -+reasons. It still does not have a balancing system, choosing to allow the -+next task scheduling decision and task wakeup CPU choice to allow balancing to -+happen by virtue of its choices. -+ -+As a further evolution of the design, MuQSS normally configures sharing of -+runqueues in a logical fashion for when CPU resources are shared for improved -+latency and throughput. By default it shares runqueues and locks between -+multicore siblings. Optionally it can be configured to run with sharing of -+SMT siblings only, all SMP packages or no sharing at all. Additionally it can -+be selected at boot time. -+ -+ -+Design details. 
-+ -+Custom skip list implementation: -+ -+To avoid the overhead of building up and tearing down skip list structures, -+the variant used by MuQSS has a number of optimisations making it specific for -+its use case in the scheduler. It uses static arrays of 8 'levels' instead of -+building up and tearing down structures dynamically. This makes each runqueue -+only scale O(log N) up to 64k tasks. However as there is one runqueue per CPU -+it means that it scales O(log N) up to 64k x number of logical CPUs which is -+far beyond the realistic task limits each CPU could handle. By being 8 levels -+it also makes the array exactly one cacheline in size. Additionally, each -+skip list node is bidirectional making insertion and removal amortised O(1), -+being O(k) where k is 1-8. Uniquely, we are only ever interested in the very -+first entry in each list at all times with MuQSS, so there is never a need to -+do a search and thus look up is always O(1). In interactive mode, the queues -+will be searched beyond their first entry if the first task is not suitable -+for affinity or SMT nice reasons. -+ -+Task insertion: -+ -+MuQSS inserts tasks into a per CPU runqueue as an O(log N) insertion into -+a custom skip list as described above (based on the original design by William -+Pugh). Insertion is ordered in such a way that there is never a need to do a -+search by ordering tasks according to static priority primarily, and then -+virtual deadline at the time of insertion. -+ -+Niffies: -+ -+Niffies are a monotonic forward moving timer not unlike the "jiffies" but are -+of nanosecond resolution. Niffies are calculated per-runqueue from the high -+resolution TSC timers, and in order to maintain fairness are synchronised -+between CPUs whenever both runqueues are locked concurrently. -+ -+Virtual deadline: -+ -+The key to achieving low latency, scheduling fairness, and "nice level" -+distribution in MuQSS is entirely in the virtual deadline mechanism. 
The one -+tunable in MuQSS is the rr_interval, or "round robin interval". This is the -+maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy) -+tasks of the same nice level will be running for, or looking at it the other -+way around, the longest duration two tasks of the same nice level will be -+delayed for. When a task requests cpu time, it is given a quota (time_slice) -+equal to the rr_interval and a virtual deadline. The virtual deadline is -+offset from the current time in niffies by this equation: -+ -+ niffies + (prio_ratio * rr_interval) -+ -+The prio_ratio is determined as a ratio compared to the baseline of nice -20 -+and increases by 10% per nice level. The deadline is a virtual one only in that -+no guarantee is placed that a task will actually be scheduled by this time, but -+it is used to compare which task should go next. There are three components to -+how a task is next chosen. First is time_slice expiration. If a task runs out -+of its time_slice, it is descheduled, the time_slice is refilled, and the -+deadline reset to that formula above. Second is sleep, where a task no longer -+is requesting CPU for whatever reason. The time_slice and deadline are _not_ -+adjusted in this case and are just carried over for when the task is next -+scheduled. Third is preemption, and that is when a newly waking task is deemed -+higher priority than a currently running task on any cpu by virtue of the fact -+that it has an earlier virtual deadline than the currently running task. The -+earlier deadline is the key to which task is next chosen for the first and -+second cases. -+ -+The CPU proportion of different nice tasks works out to be approximately the -+ -+ (prio_ratio difference)^2 -+ -+The reason it is squared is that a task's deadline does not change while it is -+running unless it runs out of time_slice. 
Thus, even if the time actually -+passes the deadline of another task that is queued, it will not get CPU time -+unless the current running task deschedules, and the time "base" (niffies) is -+constantly moving. -+ -+Task lookup: -+ -+As tasks are already pre-ordered according to anticipated scheduling order in -+the skip lists, lookup for the next suitable task per-runqueue is always a -+matter of simply selecting the first task in the 0th level skip list entry. -+In order to maintain optimal latency and fairness across CPUs, MuQSS does a -+novel examination of every other runqueue in cache locality order, choosing the -+best task across all runqueues. This provides near-determinism of how long any -+task across the entire system may wait before receiving CPU time. The other -+runqueues are first examined locklessly and then trylocked to minimise the -+potential lock contention if they are likely to have a suitable better task. -+Each other runqueue lock is only held for as long as it takes to examine the -+entry for suitability. In "interactive" mode, the default setting, MuQSS will -+look for the best deadline task across all CPUs, while in !interactive mode, -+it will only select a better deadline task from another CPU if it is more -+heavily laden than the current one. -+ -+Lookup is therefore O(k) where k is number of CPUs. -+ -+ -+Latency. -+ -+Through the use of virtual deadlines to govern the scheduling order of normal -+tasks, queue-to-activation latency per runqueue is guaranteed to be bound by -+the rr_interval tunable which is set to 6ms by default. This means that the -+longest a CPU bound task will wait for more CPU is proportional to the number -+of running tasks and in the common case of 0-2 running tasks per CPU, will be -+under the 7ms threshold for human perception of jitter.
Additionally, as newly -+woken tasks will have an early deadline from their previous runtime, the very -+tasks that are usually latency sensitive will have the shortest interval for -+activation, usually preempting any existing CPU bound tasks. -+ -+Tickless expiry: -+ -+A feature of MuQSS is that it is not tied to the resolution of the chosen tick -+rate in Hz, instead depending entirely on the high resolution timers where -+possible for sub-millisecond accuracy on timeouts regardless of the underlying -+tick rate. This allows MuQSS to be run with the low overhead of low Hz rates -+such as 100 by default, benefiting from the improved throughput and lower -+power usage it provides. Another advantage of this approach is that in -+combination with the Full No HZ option, which disables ticks on running task -+CPUs instead of just idle CPUs, the tick can be disabled at all times -+regardless of how many tasks are running instead of being limited to just one -+running task. Note that this option is NOT recommended for regular desktop -+users. -+ -+ -+Scalability and balancing. -+ -+Unlike traditional approaches where balancing is a combination of CPU selection -+at task wakeup and intermittent balancing based on a vast array of rules set -+according to architecture, busyness calculations and special case management, -+MuQSS indirectly balances on the fly at task wakeup and next task selection. -+During initialisation, MuQSS creates a cache coherency ordered list of CPUs for -+each logical CPU and uses this to aid task/CPU selection when CPUs are busy. -+Additionally it selects any idle CPUs, if they are available, at any time over -+busy CPUs according to the following preference: -+ -+ * Same thread, idle or busy cache, idle or busy threads -+ * Other core, same cache, idle or busy cache, idle threads. -+ * Same node, other CPU, idle cache, idle threads. -+ * Same node, other CPU, busy cache, idle threads. -+ * Other core, same cache, busy threads.
-+ * Same node, other CPU, busy threads. -+ * Other node, other CPU, idle cache, idle threads. -+ * Other node, other CPU, busy cache, idle threads. -+ * Other node, other CPU, busy threads. -+ -+Mux is therefore SMT, MC and Numa aware without the need for extra -+intermittent balancing to maintain CPUs busy and make the most of cache -+coherency. -+ -+ -+Features -+ -+As the initial prime target audience for MuQSS was the average desktop user, it -+was designed to not need tweaking, tuning or have features set to obtain benefit -+from it. Thus the number of knobs and features has been kept to an absolute -+minimum and should not require extra user input for the vast majority of cases. -+There are 3 optional tunables, and 2 extra scheduling policies. The rr_interval, -+interactive, and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO -+policies. In addition to this, MuQSS also uses sub-tick accounting. What MuQSS -+does _not_ now feature is support for CGROUPS. The average user should neither -+need to know what these are, nor should they need to be using them to have good -+desktop behaviour. However since some applications refuse to work without -+cgroups, one can enable them with MuQSS as a stub and the filesystem will be -+created which will allow the applications to work. -+ -+rr_interval: -+ -+ /proc/sys/kernel/rr_interval -+ -+The value is in milliseconds, and the default value is set to 6. Valid values -+are from 1 to 1000. Decreasing the value will decrease latencies at the cost of -+decreasing throughput, while increasing it will improve throughput, but at the -+cost of worsening latencies. It is based on the fact that humans can detect -+jitter at approximately 7ms, so aiming for much lower latencies is pointless -+under most circumstances. It is worth noting this fact when comparing the -+latency performance of MuQSS to other schedulers.
Worst case latencies being -+higher than 7ms are far worse than average latencies not being in the -+microsecond range. -+ -+interactive: -+ -+ /proc/sys/kernel/interactive -+ -+The value is a simple boolean of 1 for on and 0 for off and is set to on by -+default. Disabling this will disable the near-determinism of MuQSS when -+selecting the next task by not examining all CPUs for the earliest deadline -+task, or which CPU to wake to, instead prioritising CPU balancing for improved -+throughput. Latency will still be bound by rr_interval, but on a per-CPU basis -+instead of across the whole system. -+ -+Runqueue sharing. -+ -+By default MuQSS chooses to share runqueue resources (specifically the skip -+list and locking) between multicore siblings. It is configurable at build time -+to select between None, SMT, MC and SMP, corresponding to no sharing, sharing -+only between simultaneous multithreading siblings, multicore siblings, or -+symmetric multiprocessing physical packages. Additionally it can be set at -+boot time with the use of the rqshare parameter. The reason for configurability -+is that some architectures have CPUs with many multicore siblings (>= 16) -+where it may be detrimental to throughput to share runqueues and another -+sharing option may be desirable. Additionally, more sharing than usual can -+improve latency on a system-wide level at the expense of throughput if desired. -+ -+The options are: -+none, smt, mc, smp -+ -+eg: -+ rqshare=mc -+ -+Isochronous scheduling: -+ -+Isochronous scheduling is a unique scheduling policy designed to provide -+near-real-time performance to unprivileged (ie non-root) users without the -+ability to starve the machine indefinitely. Isochronous tasks (which means -+"same time") are set using, for example, the schedtool application like so: -+ -+ schedtool -I -e amarok -+ -+This will start the audio application "amarok" as SCHED_ISO.
How SCHED_ISO works -+is that it has a priority level between true realtime tasks and SCHED_NORMAL -+which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie, -+if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval -+rate). However if ISO tasks run for more than a tunable finite amount of time, -+they are then demoted back to SCHED_NORMAL scheduling. This finite amount of -+time is the percentage of CPU available per CPU, configurable as a percentage in -+the following "resource handling" tunable (as opposed to a scheduler tunable): -+ -+iso_cpu: -+ -+ /proc/sys/kernel/iso_cpu -+ -+and is set to 70% by default. It is calculated over a rolling 5 second average. -+Because it is the total CPU available, it means that on a multi CPU machine, it -+is possible to have an ISO task running as realtime scheduling indefinitely on -+just one CPU, as the other CPUs will be available. Setting this to 100 is the -+equivalent of giving all users SCHED_RR access and setting it to 0 removes the -+ability to run any pseudo-realtime tasks. -+ -+A feature of MuQSS is that it detects when an application tries to obtain a -+realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the -+appropriate privileges to use those policies. When it detects this, it will -+give the task SCHED_ISO policy instead. Thus it is transparent to the user. -+ -+ -+Idleprio scheduling: -+ -+Idleprio scheduling is a scheduling policy designed to give out CPU to a task -+_only_ when the CPU would be otherwise idle. The idea behind this is to allow -+ultra low priority tasks to be run in the background that have virtually no -+effect on the foreground tasks. This is ideally suited to distributed computing -+clients (like setiathome, folding, mprime etc) but can also be used to start a -+video encode or so on without any slowdown of other tasks.
To avoid this policy -+from grabbing shared resources and holding them indefinitely, if it detects a -+state where the task is waiting on I/O, the machine is about to suspend to ram -+and so on, it will transiently schedule them as SCHED_NORMAL. Once a task has -+been scheduled as IDLEPRIO, it cannot be put back to SCHED_NORMAL without -+superuser privileges since it is effectively a lower scheduling policy. Tasks -+can be set to start as SCHED_IDLEPRIO with the schedtool command like so: -+ -+schedtool -D -e ./mprime -+ -+Subtick accounting: -+ -+It is surprisingly difficult to get accurate CPU accounting, and in many cases, -+the accounting is done by simply determining what is happening at the precise -+moment a timer tick fires off. This becomes increasingly inaccurate as the timer -+tick frequency (HZ) is lowered. It is possible to create an application which -+uses almost 100% CPU, yet by being descheduled at the right time, records zero -+CPU usage. While the main problem with this is that there are possible security -+implications, it is also difficult to determine how much CPU a task really does -+use. Mux uses sub-tick accounting from the TSC clock to determine real CPU -+usage. Thus, the amount of CPU reported as being used by MuQSS will more -+accurately represent how much CPU the task itself is using (as is shown for -+example by the 'time' application), so the reported values may be quite -+different to other schedulers. When comparing throughput of MuQSS to other -+designs, it is important to compare the actual completed work in terms of total -+wall clock time taken and total work done, rather than the reported "cpu usage". -+ -+Symmetric MultiThreading (SMT) aware nice: -+ -+SMT, a.k.a. hyperthreading, is a very common feature on modern CPUs. 
While the -+logical CPU count rises by adding thread units to each CPU core, allowing more -+than one task to be run simultaneously on the same core, the disadvantage of it -+is that the CPU power is shared between the tasks, not summating to the power -+of two CPUs. The practical upshot of this is that two tasks running on -+separate threads of the same core run significantly slower than if they had one -+core each to run on. While smart CPU selection allows each task to have a core -+to itself whenever available (as is done on MuQSS), it cannot offset the -+slowdown that occurs when the cores are all loaded and only a thread is left. -+Most of the time this is harmless as the CPU is effectively overloaded at this -+point and the extra thread is of benefit. However when running a niced task in -+the presence of an un-niced task (say nice 19 v nice 0), the nice task gets -+precisely the same amount of CPU power as the unniced one. MuQSS has an -+optional configuration feature known as SMT-NICE which selectively idles the -+secondary niced thread for a period proportional to the nice difference, -+allowing CPU distribution according to nice level to be maintained, at the -+expense of a small amount of extra overhead. If this is configured in on a -+machine without SMT threads, the overhead is minimal. -+ -+ -+Con Kolivas Sat, 29th October 2016 -diff --git a/Makefile b/Makefile -index d4d36c61940b..4a9dfe471f1f 100644 ---- a/Makefile -+++ b/Makefile -@@ -15,6 +15,10 @@ NAME = Kleptomaniac Octopus - PHONY := _all - _all: - -+CKVERSION = -ck1 -+CKNAME = MuQSS Powered -+EXTRAVERSION := $(EXTRAVERSION)$(CKVERSION) -+ - # We are using a recursive build, so we need to do a little thinking - # to get the ordering right. 
- # -diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig -index ef179033a7c2..14b576a531ad 100644 ---- a/arch/alpha/Kconfig -+++ b/arch/alpha/Kconfig -@@ -665,6 +665,8 @@ config HZ - default 1200 if HZ_1200 - default 1024 - -+source "kernel/Kconfig.MuQSS" -+ - config SRM_ENV - tristate "SRM environment through procfs" - depends on PROC_FS -diff --git a/arch/arc/configs/tb10x_defconfig b/arch/arc/configs/tb10x_defconfig -index 3a138f8c7299..65f44e309a08 100644 ---- a/arch/arc/configs/tb10x_defconfig -+++ b/arch/arc/configs/tb10x_defconfig -@@ -30,7 +30,7 @@ CONFIG_ARC_PLAT_TB10X=y - CONFIG_ARC_CACHE_LINE_SHIFT=5 - CONFIG_HZ=250 - CONFIG_ARC_BUILTIN_DTB_NAME="abilis_tb100_dvk" --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - # CONFIG_COMPACTION is not set - CONFIG_NET=y - CONFIG_PACKET=y -diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig -index 8a50efb559f3..d8507d20c258 100644 ---- a/arch/arm/Kconfig -+++ b/arch/arm/Kconfig -@@ -1238,6 +1238,8 @@ config SCHED_SMT - MultiThreading at a cost of slightly increased overhead in some - places. If unsure say N here. 
- -+source "kernel/Kconfig.MuQSS" -+ - config HAVE_ARM_SCU - bool - help -diff --git a/arch/arm/configs/bcm2835_defconfig b/arch/arm/configs/bcm2835_defconfig -index 519ff58e67b3..b2a05b6f7d80 100644 ---- a/arch/arm/configs/bcm2835_defconfig -+++ b/arch/arm/configs/bcm2835_defconfig -@@ -29,7 +29,7 @@ CONFIG_MODULE_UNLOAD=y - CONFIG_ARCH_MULTI_V6=y - CONFIG_ARCH_BCM=y - CONFIG_ARCH_BCM2835=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_AEABI=y - CONFIG_KSM=y - CONFIG_CLEANCACHE=y -diff --git a/arch/arm/configs/imx_v6_v7_defconfig b/arch/arm/configs/imx_v6_v7_defconfig -index 0f7381ee0c37..3d747237bfed 100644 ---- a/arch/arm/configs/imx_v6_v7_defconfig -+++ b/arch/arm/configs/imx_v6_v7_defconfig -@@ -45,6 +45,7 @@ CONFIG_PCI_MSI=y - CONFIG_PCI_IMX6=y - CONFIG_SMP=y - CONFIG_ARM_PSCI=y -+CONFIG_PREEMPT=y - CONFIG_HIGHMEM=y - CONFIG_FORCE_MAX_ZONEORDER=14 - CONFIG_CMDLINE="noinitrd console=ttymxc0,115200" -diff --git a/arch/arm/configs/mps2_defconfig b/arch/arm/configs/mps2_defconfig -index 1d923dbb9928..9c1931f1fafd 100644 ---- a/arch/arm/configs/mps2_defconfig -+++ b/arch/arm/configs/mps2_defconfig -@@ -18,7 +18,7 @@ CONFIG_ARCH_MPS2=y - CONFIG_SET_MEM_PARAM=y - CONFIG_DRAM_BASE=0x21000000 - CONFIG_DRAM_SIZE=0x1000000 --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - # CONFIG_ATAGS is not set - CONFIG_ZBOOT_ROM_TEXT=0x0 - CONFIG_ZBOOT_ROM_BSS=0x0 -diff --git a/arch/arm/configs/mxs_defconfig b/arch/arm/configs/mxs_defconfig -index 2773899c21b3..870866aaa39d 100644 ---- a/arch/arm/configs/mxs_defconfig -+++ b/arch/arm/configs/mxs_defconfig -@@ -1,7 +1,7 @@ - CONFIG_SYSVIPC=y - CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT_VOLUNTARY=n - CONFIG_TASKSTATS=y - CONFIG_TASK_DELAY_ACCT=y - CONFIG_TASK_XACCT=y -@@ -27,6 +27,11 @@ CONFIG_MODVERSIONS=y - CONFIG_BLK_DEV_INTEGRITY=y - # CONFIG_IOSCHED_DEADLINE is not set - # CONFIG_IOSCHED_CFQ is not set -+# CONFIG_ARCH_MULTI_V7 is not set -+CONFIG_ARCH_MXS=y -+# 
CONFIG_ARM_THUMB is not set -+CONFIG_PREEMPT=y -+CONFIG_AEABI=y - CONFIG_NET=y - CONFIG_PACKET=y - CONFIG_UNIX=y -diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig -index 3f047afb982c..d35eae0a5c7d 100644 ---- a/arch/arm64/Kconfig -+++ b/arch/arm64/Kconfig -@@ -864,6 +864,8 @@ config SCHED_SMT - MultiThreading at a cost of slightly increased overhead in some - places. If unsure say N here. - -+source "kernel/Kconfig.MuQSS" -+ - config NR_CPUS - int "Maximum number of CPUs (2-4096)" - range 2 4096 -diff --git a/arch/blackfin/configs/BF518F-EZBRD_defconfig b/arch/blackfin/configs/BF518F-EZBRD_defconfig -new file mode 100644 -index 000000000000..39b91dfa55b5 ---- /dev/null -+++ b/arch/blackfin/configs/BF518F-EZBRD_defconfig -@@ -0,0 +1,121 @@ -+CONFIG_EXPERIMENTAL=y -+CONFIG_SYSVIPC=y -+CONFIG_IKCONFIG=y -+CONFIG_IKCONFIG_PROC=y -+CONFIG_LOG_BUF_SHIFT=14 -+CONFIG_BLK_DEV_INITRD=y -+CONFIG_EXPERT=y -+# CONFIG_ELF_CORE is not set -+# CONFIG_FUTEX is not set -+# CONFIG_SIGNALFD is not set -+# CONFIG_TIMERFD is not set -+# CONFIG_EVENTFD is not set -+# CONFIG_AIO is not set -+CONFIG_SLAB=y -+CONFIG_MMAP_ALLOW_UNINITIALIZED=y -+CONFIG_MODULES=y -+CONFIG_MODULE_UNLOAD=y -+# CONFIG_LBDAF is not set -+# CONFIG_BLK_DEV_BSG is not set -+# CONFIG_IOSCHED_DEADLINE is not set -+# CONFIG_IOSCHED_CFQ is not set -+CONFIG_PREEMPT=y -+CONFIG_BF518=y -+CONFIG_IRQ_TIMER0=12 -+# CONFIG_CYCLES_CLOCKSOURCE is not set -+# CONFIG_SCHEDULE_L1 is not set -+# CONFIG_MEMSET_L1 is not set -+# CONFIG_MEMCPY_L1 is not set -+# CONFIG_SYS_BFIN_SPINLOCK_L1 is not set -+CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 -+CONFIG_BFIN_GPTIMERS=m -+CONFIG_C_CDPRIO=y -+CONFIG_BANK_3=0x99B2 -+CONFIG_BINFMT_FLAT=y -+CONFIG_BINFMT_ZFLAT=y -+CONFIG_NET=y -+CONFIG_PACKET=y -+CONFIG_UNIX=y -+CONFIG_INET=y -+CONFIG_IP_PNP=y -+# CONFIG_INET_XFRM_MODE_TRANSPORT is not set -+# CONFIG_INET_XFRM_MODE_TUNNEL is not set -+# CONFIG_INET_XFRM_MODE_BEET is not set -+# CONFIG_INET_LRO is not set -+# CONFIG_INET_DIAG is not set -+# 
CONFIG_IPV6 is not set -+# CONFIG_WIRELESS is not set -+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" -+# CONFIG_FW_LOADER is not set -+CONFIG_MTD=y -+CONFIG_MTD_BLOCK=y -+CONFIG_MTD_JEDECPROBE=m -+CONFIG_MTD_RAM=y -+CONFIG_MTD_ROM=m -+CONFIG_MTD_COMPLEX_MAPPINGS=y -+CONFIG_BLK_DEV_RAM=y -+CONFIG_NETDEVICES=y -+CONFIG_NET_BFIN=y -+CONFIG_BFIN_MAC=y -+# CONFIG_NET_VENDOR_BROADCOM is not set -+# CONFIG_NET_VENDOR_CHELSIO is not set -+# CONFIG_NET_VENDOR_INTEL is not set -+# CONFIG_NET_VENDOR_MARVELL is not set -+# CONFIG_NET_VENDOR_MICREL is not set -+# CONFIG_NET_VENDOR_MICROCHIP is not set -+# CONFIG_NET_VENDOR_NATSEMI is not set -+# CONFIG_NET_VENDOR_SEEQ is not set -+# CONFIG_NET_VENDOR_SMSC is not set -+# CONFIG_NET_VENDOR_STMICRO is not set -+# CONFIG_WLAN is not set -+# CONFIG_INPUT is not set -+# CONFIG_SERIO is not set -+# CONFIG_VT is not set -+# CONFIG_LEGACY_PTYS is not set -+CONFIG_BFIN_JTAG_COMM=m -+# CONFIG_DEVKMEM is not set -+CONFIG_SERIAL_BFIN=y -+CONFIG_SERIAL_BFIN_CONSOLE=y -+CONFIG_SERIAL_BFIN_UART0=y -+# CONFIG_HW_RANDOM is not set -+CONFIG_I2C=y -+CONFIG_I2C_CHARDEV=y -+CONFIG_I2C_BLACKFIN_TWI=y -+CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 -+CONFIG_SPI=y -+CONFIG_SPI_BFIN5XX=y -+CONFIG_GPIOLIB=y -+CONFIG_GPIO_SYSFS=y -+# CONFIG_HWMON is not set -+CONFIG_WATCHDOG=y -+CONFIG_BFIN_WDT=y -+# CONFIG_USB_SUPPORT is not set -+CONFIG_MMC=y -+CONFIG_SDH_BFIN=y -+CONFIG_SDH_BFIN_MISSING_CMD_PULLUP_WORKAROUND=y -+CONFIG_RTC_CLASS=y -+CONFIG_RTC_DRV_BFIN=y -+CONFIG_EXT2_FS=m -+# CONFIG_DNOTIFY is not set -+CONFIG_VFAT_FS=m -+CONFIG_NFS_FS=m -+CONFIG_NFS_V3=y -+CONFIG_NLS_CODEPAGE_437=m -+CONFIG_NLS_CODEPAGE_936=m -+CONFIG_NLS_ISO8859_1=m -+CONFIG_NLS_UTF8=m -+CONFIG_DEBUG_SHIRQ=y -+CONFIG_DETECT_HUNG_TASK=y -+CONFIG_DEBUG_INFO=y -+# CONFIG_FTRACE is not set -+CONFIG_DEBUG_MMRS=y -+CONFIG_DEBUG_HWERR=y -+CONFIG_EXACT_HWERR=y -+CONFIG_DEBUG_DOUBLEFAULT=y -+CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y -+CONFIG_EARLY_PRINTK=y -+CONFIG_CPLB_INFO=y 
-+CONFIG_BFIN_PSEUDODBG_INSNS=y -+CONFIG_CRYPTO=y -+# CONFIG_CRYPTO_ANSI_CPRNG is not set -+CONFIG_CRC_CCITT=m -diff --git a/arch/blackfin/configs/BF526-EZBRD_defconfig b/arch/blackfin/configs/BF526-EZBRD_defconfig -new file mode 100644 -index 000000000000..675cadb3a0c4 ---- /dev/null -+++ b/arch/blackfin/configs/BF526-EZBRD_defconfig -@@ -0,0 +1,158 @@ -+CONFIG_EXPERIMENTAL=y -+CONFIG_SYSVIPC=y -+CONFIG_IKCONFIG=y -+CONFIG_IKCONFIG_PROC=y -+CONFIG_LOG_BUF_SHIFT=14 -+CONFIG_BLK_DEV_INITRD=y -+CONFIG_EXPERT=y -+# CONFIG_ELF_CORE is not set -+# CONFIG_FUTEX is not set -+# CONFIG_SIGNALFD is not set -+# CONFIG_TIMERFD is not set -+# CONFIG_EVENTFD is not set -+# CONFIG_AIO is not set -+CONFIG_SLAB=y -+CONFIG_MMAP_ALLOW_UNINITIALIZED=y -+CONFIG_MODULES=y -+CONFIG_MODULE_UNLOAD=y -+# CONFIG_LBDAF is not set -+# CONFIG_BLK_DEV_BSG is not set -+# CONFIG_IOSCHED_DEADLINE is not set -+# CONFIG_IOSCHED_CFQ is not set -+CONFIG_PREEMPT=y -+CONFIG_BF526=y -+CONFIG_IRQ_TIMER0=12 -+CONFIG_BFIN526_EZBRD=y -+CONFIG_IRQ_USB_INT0=11 -+CONFIG_IRQ_USB_INT1=11 -+CONFIG_IRQ_USB_INT2=11 -+CONFIG_IRQ_USB_DMA=11 -+# CONFIG_CYCLES_CLOCKSOURCE is not set -+# CONFIG_SCHEDULE_L1 is not set -+# CONFIG_MEMSET_L1 is not set -+# CONFIG_MEMCPY_L1 is not set -+# CONFIG_SYS_BFIN_SPINLOCK_L1 is not set -+CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 -+CONFIG_BFIN_GPTIMERS=m -+CONFIG_C_CDPRIO=y -+CONFIG_BANK_3=0x99B2 -+CONFIG_BINFMT_FLAT=y -+CONFIG_BINFMT_ZFLAT=y -+CONFIG_NET=y -+CONFIG_PACKET=y -+CONFIG_UNIX=y -+CONFIG_INET=y -+CONFIG_IP_PNP=y -+# CONFIG_INET_XFRM_MODE_TRANSPORT is not set -+# CONFIG_INET_XFRM_MODE_TUNNEL is not set -+# CONFIG_INET_XFRM_MODE_BEET is not set -+# CONFIG_INET_LRO is not set -+# CONFIG_INET_DIAG is not set -+# CONFIG_IPV6 is not set -+# CONFIG_WIRELESS is not set -+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" -+# CONFIG_FW_LOADER is not set -+CONFIG_MTD=y -+CONFIG_MTD_BLOCK=y -+CONFIG_MTD_CFI=y -+CONFIG_MTD_CFI_INTELEXT=y -+CONFIG_MTD_RAM=y -+CONFIG_MTD_ROM=m 
-+CONFIG_MTD_COMPLEX_MAPPINGS=y -+CONFIG_MTD_PHYSMAP=y -+CONFIG_MTD_M25P80=y -+CONFIG_MTD_NAND=m -+CONFIG_MTD_SPI_NOR=y -+CONFIG_BLK_DEV_RAM=y -+CONFIG_SCSI=y -+# CONFIG_SCSI_PROC_FS is not set -+CONFIG_BLK_DEV_SD=y -+CONFIG_BLK_DEV_SR=m -+# CONFIG_SCSI_LOWLEVEL is not set -+CONFIG_NETDEVICES=y -+CONFIG_NET_BFIN=y -+CONFIG_BFIN_MAC=y -+# CONFIG_NET_VENDOR_BROADCOM is not set -+# CONFIG_NET_VENDOR_CHELSIO is not set -+# CONFIG_NET_VENDOR_INTEL is not set -+# CONFIG_NET_VENDOR_MARVELL is not set -+# CONFIG_NET_VENDOR_MICREL is not set -+# CONFIG_NET_VENDOR_MICROCHIP is not set -+# CONFIG_NET_VENDOR_NATSEMI is not set -+# CONFIG_NET_VENDOR_SEEQ is not set -+# CONFIG_NET_VENDOR_SMSC is not set -+# CONFIG_NET_VENDOR_STMICRO is not set -+# CONFIG_WLAN is not set -+CONFIG_INPUT_FF_MEMLESS=m -+# CONFIG_INPUT_MOUSEDEV is not set -+# CONFIG_INPUT_KEYBOARD is not set -+# CONFIG_INPUT_MOUSE is not set -+CONFIG_INPUT_MISC=y -+# CONFIG_SERIO is not set -+# CONFIG_LEGACY_PTYS is not set -+CONFIG_BFIN_JTAG_COMM=m -+# CONFIG_DEVKMEM is not set -+CONFIG_SERIAL_BFIN=y -+CONFIG_SERIAL_BFIN_CONSOLE=y -+CONFIG_SERIAL_BFIN_UART1=y -+# CONFIG_HW_RANDOM is not set -+CONFIG_I2C=y -+CONFIG_I2C_CHARDEV=m -+CONFIG_I2C_BLACKFIN_TWI=y -+CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 -+CONFIG_SPI=y -+CONFIG_SPI_BFIN5XX=y -+CONFIG_GPIOLIB=y -+CONFIG_GPIO_SYSFS=y -+CONFIG_WATCHDOG=y -+CONFIG_BFIN_WDT=y -+CONFIG_HID_A4TECH=y -+CONFIG_HID_APPLE=y -+CONFIG_HID_BELKIN=y -+CONFIG_HID_CHERRY=y -+CONFIG_HID_CHICONY=y -+CONFIG_HID_CYPRESS=y -+CONFIG_HID_EZKEY=y -+CONFIG_HID_GYRATION=y -+CONFIG_HID_LOGITECH=y -+CONFIG_HID_MICROSOFT=y -+CONFIG_HID_MONTEREY=y -+CONFIG_HID_PANTHERLORD=y -+CONFIG_HID_PETALYNX=y -+CONFIG_HID_SAMSUNG=y -+CONFIG_HID_SONY=y -+CONFIG_HID_SUNPLUS=y -+CONFIG_USB=y -+# CONFIG_USB_DEVICE_CLASS is not set -+CONFIG_USB_OTG_BLACKLIST_HUB=y -+CONFIG_USB_MON=y -+CONFIG_USB_STORAGE=y -+CONFIG_RTC_CLASS=y -+CONFIG_RTC_DRV_BFIN=y -+CONFIG_EXT2_FS=m -+# CONFIG_DNOTIFY is not set -+CONFIG_ISO9660_FS=m 
-+CONFIG_JOLIET=y -+CONFIG_VFAT_FS=m -+CONFIG_JFFS2_FS=m -+CONFIG_NFS_FS=m -+CONFIG_NFS_V3=y -+CONFIG_NLS_CODEPAGE_437=m -+CONFIG_NLS_CODEPAGE_936=m -+CONFIG_NLS_ISO8859_1=m -+CONFIG_NLS_UTF8=m -+CONFIG_DEBUG_SHIRQ=y -+CONFIG_DETECT_HUNG_TASK=y -+CONFIG_DEBUG_INFO=y -+# CONFIG_FTRACE is not set -+CONFIG_DEBUG_MMRS=y -+CONFIG_DEBUG_HWERR=y -+CONFIG_EXACT_HWERR=y -+CONFIG_DEBUG_DOUBLEFAULT=y -+CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y -+CONFIG_EARLY_PRINTK=y -+CONFIG_CPLB_INFO=y -+CONFIG_BFIN_PSEUDODBG_INSNS=y -+CONFIG_CRYPTO=y -+# CONFIG_CRYPTO_ANSI_CPRNG is not set -+CONFIG_CRC_CCITT=m -diff --git a/arch/blackfin/configs/BF527-EZKIT-V2_defconfig b/arch/blackfin/configs/BF527-EZKIT-V2_defconfig -new file mode 100644 -index 000000000000..4c517c443af5 ---- /dev/null -+++ b/arch/blackfin/configs/BF527-EZKIT-V2_defconfig -@@ -0,0 +1,188 @@ -+CONFIG_EXPERIMENTAL=y -+CONFIG_SYSVIPC=y -+CONFIG_IKCONFIG=y -+CONFIG_IKCONFIG_PROC=y -+CONFIG_LOG_BUF_SHIFT=14 -+CONFIG_BLK_DEV_INITRD=y -+CONFIG_EXPERT=y -+# CONFIG_ELF_CORE is not set -+# CONFIG_FUTEX is not set -+# CONFIG_SIGNALFD is not set -+# CONFIG_TIMERFD is not set -+# CONFIG_EVENTFD is not set -+# CONFIG_AIO is not set -+CONFIG_SLAB=y -+CONFIG_MMAP_ALLOW_UNINITIALIZED=y -+CONFIG_MODULES=y -+CONFIG_MODULE_UNLOAD=y -+# CONFIG_LBDAF is not set -+# CONFIG_BLK_DEV_BSG is not set -+# CONFIG_IOSCHED_DEADLINE is not set -+# CONFIG_IOSCHED_CFQ is not set -+CONFIG_PREEMPT=y -+CONFIG_BF527=y -+CONFIG_BF_REV_0_2=y -+CONFIG_BFIN527_EZKIT_V2=y -+CONFIG_IRQ_USB_INT0=11 -+CONFIG_IRQ_USB_INT1=11 -+CONFIG_IRQ_USB_INT2=11 -+CONFIG_IRQ_USB_DMA=11 -+# CONFIG_CYCLES_CLOCKSOURCE is not set -+# CONFIG_SCHEDULE_L1 is not set -+# CONFIG_MEMSET_L1 is not set -+# CONFIG_MEMCPY_L1 is not set -+# CONFIG_SYS_BFIN_SPINLOCK_L1 is not set -+CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 -+CONFIG_C_CDPRIO=y -+CONFIG_BANK_3=0x99B2 -+CONFIG_BINFMT_FLAT=y -+CONFIG_BINFMT_ZFLAT=y -+CONFIG_NET=y -+CONFIG_PACKET=y -+CONFIG_UNIX=y -+CONFIG_INET=y -+CONFIG_IP_PNP=y -+# 
CONFIG_INET_XFRM_MODE_TRANSPORT is not set -+# CONFIG_INET_XFRM_MODE_TUNNEL is not set -+# CONFIG_INET_XFRM_MODE_BEET is not set -+# CONFIG_INET_LRO is not set -+# CONFIG_INET_DIAG is not set -+# CONFIG_IPV6 is not set -+CONFIG_IRDA=m -+CONFIG_IRLAN=m -+CONFIG_IRCOMM=m -+CONFIG_IRTTY_SIR=m -+CONFIG_BFIN_SIR=m -+CONFIG_BFIN_SIR0=y -+# CONFIG_WIRELESS is not set -+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" -+# CONFIG_FW_LOADER is not set -+CONFIG_MTD=y -+CONFIG_MTD_BLOCK=y -+CONFIG_MTD_JEDECPROBE=m -+CONFIG_MTD_RAM=y -+CONFIG_MTD_ROM=m -+CONFIG_MTD_COMPLEX_MAPPINGS=y -+CONFIG_MTD_M25P80=y -+CONFIG_MTD_NAND=m -+CONFIG_MTD_SPI_NOR=y -+CONFIG_BLK_DEV_RAM=y -+CONFIG_SCSI=y -+# CONFIG_SCSI_PROC_FS is not set -+CONFIG_BLK_DEV_SD=y -+CONFIG_BLK_DEV_SR=m -+# CONFIG_SCSI_LOWLEVEL is not set -+CONFIG_NETDEVICES=y -+CONFIG_NET_BFIN=y -+CONFIG_BFIN_MAC=y -+# CONFIG_NET_VENDOR_BROADCOM is not set -+# CONFIG_NET_VENDOR_CHELSIO is not set -+# CONFIG_NET_VENDOR_INTEL is not set -+# CONFIG_NET_VENDOR_MARVELL is not set -+# CONFIG_NET_VENDOR_MICREL is not set -+# CONFIG_NET_VENDOR_MICROCHIP is not set -+# CONFIG_NET_VENDOR_NATSEMI is not set -+# CONFIG_NET_VENDOR_SEEQ is not set -+# CONFIG_NET_VENDOR_SMSC is not set -+# CONFIG_NET_VENDOR_STMICRO is not set -+# CONFIG_WLAN is not set -+CONFIG_INPUT_FF_MEMLESS=m -+# CONFIG_INPUT_MOUSEDEV is not set -+CONFIG_INPUT_EVDEV=y -+CONFIG_KEYBOARD_ADP5520=y -+# CONFIG_KEYBOARD_ATKBD is not set -+# CONFIG_INPUT_MOUSE is not set -+CONFIG_INPUT_TOUCHSCREEN=y -+CONFIG_TOUCHSCREEN_AD7879=y -+CONFIG_TOUCHSCREEN_AD7879_I2C=y -+CONFIG_INPUT_MISC=y -+# CONFIG_SERIO is not set -+# CONFIG_LEGACY_PTYS is not set -+CONFIG_BFIN_JTAG_COMM=m -+# CONFIG_DEVKMEM is not set -+CONFIG_SERIAL_BFIN=y -+CONFIG_SERIAL_BFIN_CONSOLE=y -+CONFIG_SERIAL_BFIN_UART1=y -+# CONFIG_HW_RANDOM is not set -+CONFIG_I2C=y -+CONFIG_I2C_CHARDEV=m -+CONFIG_I2C_BLACKFIN_TWI=y -+CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 -+CONFIG_SPI=y -+CONFIG_SPI_BFIN5XX=y -+CONFIG_GPIOLIB=y 
-+CONFIG_GPIO_SYSFS=y -+# CONFIG_HWMON is not set -+CONFIG_WATCHDOG=y -+CONFIG_BFIN_WDT=y -+CONFIG_PMIC_ADP5520=y -+CONFIG_FB=y -+CONFIG_FB_BFIN_LQ035Q1=y -+CONFIG_BACKLIGHT_LCD_SUPPORT=y -+CONFIG_FRAMEBUFFER_CONSOLE=y -+CONFIG_LOGO=y -+# CONFIG_LOGO_LINUX_MONO is not set -+# CONFIG_LOGO_LINUX_VGA16 is not set -+# CONFIG_LOGO_LINUX_CLUT224 is not set -+# CONFIG_LOGO_BLACKFIN_VGA16 is not set -+CONFIG_SOUND=y -+CONFIG_SND=y -+CONFIG_SND_SOC=y -+CONFIG_SND_BF5XX_I2S=y -+CONFIG_SND_BF5XX_SOC_SSM2602=y -+CONFIG_HID_A4TECH=y -+CONFIG_HID_APPLE=y -+CONFIG_HID_BELKIN=y -+CONFIG_HID_CHERRY=y -+CONFIG_HID_CHICONY=y -+CONFIG_HID_CYPRESS=y -+CONFIG_HID_EZKEY=y -+CONFIG_HID_GYRATION=y -+CONFIG_HID_LOGITECH=y -+CONFIG_HID_MICROSOFT=y -+CONFIG_HID_MONTEREY=y -+CONFIG_HID_PANTHERLORD=y -+CONFIG_HID_PETALYNX=y -+CONFIG_HID_SAMSUNG=y -+CONFIG_HID_SONY=y -+CONFIG_HID_SUNPLUS=y -+CONFIG_USB=y -+# CONFIG_USB_DEVICE_CLASS is not set -+CONFIG_USB_OTG_BLACKLIST_HUB=y -+CONFIG_USB_MON=y -+CONFIG_USB_MUSB_HDRC=y -+CONFIG_USB_MUSB_BLACKFIN=y -+CONFIG_USB_STORAGE=y -+CONFIG_USB_GADGET=y -+CONFIG_NEW_LEDS=y -+CONFIG_LEDS_CLASS=y -+CONFIG_LEDS_ADP5520=y -+CONFIG_RTC_CLASS=y -+CONFIG_RTC_DRV_BFIN=y -+CONFIG_EXT2_FS=m -+# CONFIG_DNOTIFY is not set -+CONFIG_ISO9660_FS=m -+CONFIG_JOLIET=y -+CONFIG_UDF_FS=m -+CONFIG_VFAT_FS=m -+CONFIG_JFFS2_FS=m -+CONFIG_NFS_FS=m -+CONFIG_NFS_V3=y -+CONFIG_NLS_CODEPAGE_437=m -+CONFIG_NLS_CODEPAGE_936=m -+CONFIG_NLS_ISO8859_1=m -+CONFIG_NLS_UTF8=m -+CONFIG_DEBUG_SHIRQ=y -+CONFIG_DETECT_HUNG_TASK=y -+CONFIG_DEBUG_INFO=y -+# CONFIG_FTRACE is not set -+CONFIG_DEBUG_MMRS=y -+CONFIG_DEBUG_HWERR=y -+CONFIG_EXACT_HWERR=y -+CONFIG_DEBUG_DOUBLEFAULT=y -+CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y -+CONFIG_EARLY_PRINTK=y -+CONFIG_CPLB_INFO=y -+CONFIG_BFIN_PSEUDODBG_INSNS=y -+CONFIG_CRYPTO=y -+# CONFIG_CRYPTO_ANSI_CPRNG is not set -diff --git a/arch/blackfin/configs/BF527-EZKIT_defconfig b/arch/blackfin/configs/BF527-EZKIT_defconfig -new file mode 100644 -index 
000000000000..bf8df3e6cf02 ---- /dev/null -+++ b/arch/blackfin/configs/BF527-EZKIT_defconfig -@@ -0,0 +1,181 @@ -+CONFIG_EXPERIMENTAL=y -+CONFIG_SYSVIPC=y -+CONFIG_IKCONFIG=y -+CONFIG_IKCONFIG_PROC=y -+CONFIG_LOG_BUF_SHIFT=14 -+CONFIG_BLK_DEV_INITRD=y -+CONFIG_EXPERT=y -+# CONFIG_ELF_CORE is not set -+# CONFIG_FUTEX is not set -+# CONFIG_SIGNALFD is not set -+# CONFIG_TIMERFD is not set -+# CONFIG_EVENTFD is not set -+# CONFIG_AIO is not set -+CONFIG_SLAB=y -+CONFIG_MMAP_ALLOW_UNINITIALIZED=y -+CONFIG_MODULES=y -+CONFIG_MODULE_UNLOAD=y -+# CONFIG_LBDAF is not set -+# CONFIG_BLK_DEV_BSG is not set -+# CONFIG_IOSCHED_DEADLINE is not set -+# CONFIG_IOSCHED_CFQ is not set -+CONFIG_PREEMPT=y -+CONFIG_BF527=y -+CONFIG_BF_REV_0_1=y -+CONFIG_IRQ_USB_INT0=11 -+CONFIG_IRQ_USB_INT1=11 -+CONFIG_IRQ_USB_INT2=11 -+CONFIG_IRQ_USB_DMA=11 -+# CONFIG_CYCLES_CLOCKSOURCE is not set -+# CONFIG_SCHEDULE_L1 is not set -+# CONFIG_MEMSET_L1 is not set -+# CONFIG_MEMCPY_L1 is not set -+# CONFIG_SYS_BFIN_SPINLOCK_L1 is not set -+CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 -+CONFIG_C_CDPRIO=y -+CONFIG_BANK_3=0x99B2 -+CONFIG_BINFMT_FLAT=y -+CONFIG_BINFMT_ZFLAT=y -+CONFIG_NET=y -+CONFIG_PACKET=y -+CONFIG_UNIX=y -+CONFIG_INET=y -+CONFIG_IP_PNP=y -+# CONFIG_INET_XFRM_MODE_TRANSPORT is not set -+# CONFIG_INET_XFRM_MODE_TUNNEL is not set -+# CONFIG_INET_XFRM_MODE_BEET is not set -+# CONFIG_INET_LRO is not set -+# CONFIG_INET_DIAG is not set -+# CONFIG_IPV6 is not set -+CONFIG_IRDA=m -+CONFIG_IRLAN=m -+CONFIG_IRCOMM=m -+CONFIG_IRTTY_SIR=m -+CONFIG_BFIN_SIR=m -+CONFIG_BFIN_SIR0=y -+# CONFIG_WIRELESS is not set -+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" -+# CONFIG_FW_LOADER is not set -+CONFIG_MTD=y -+CONFIG_MTD_BLOCK=y -+CONFIG_MTD_JEDECPROBE=m -+CONFIG_MTD_RAM=y -+CONFIG_MTD_ROM=m -+CONFIG_MTD_COMPLEX_MAPPINGS=y -+CONFIG_MTD_M25P80=y -+CONFIG_MTD_NAND=m -+CONFIG_MTD_SPI_NOR=y -+CONFIG_BLK_DEV_RAM=y -+CONFIG_SCSI=y -+# CONFIG_SCSI_PROC_FS is not set -+CONFIG_BLK_DEV_SD=y -+CONFIG_BLK_DEV_SR=m -+# 
CONFIG_SCSI_LOWLEVEL is not set -+CONFIG_NETDEVICES=y -+CONFIG_NET_BFIN=y -+CONFIG_BFIN_MAC=y -+# CONFIG_NET_VENDOR_BROADCOM is not set -+# CONFIG_NET_VENDOR_CHELSIO is not set -+# CONFIG_NET_VENDOR_INTEL is not set -+# CONFIG_NET_VENDOR_MARVELL is not set -+# CONFIG_NET_VENDOR_MICREL is not set -+# CONFIG_NET_VENDOR_MICROCHIP is not set -+# CONFIG_NET_VENDOR_NATSEMI is not set -+# CONFIG_NET_VENDOR_SEEQ is not set -+# CONFIG_NET_VENDOR_SMSC is not set -+# CONFIG_NET_VENDOR_STMICRO is not set -+# CONFIG_WLAN is not set -+CONFIG_INPUT_FF_MEMLESS=m -+# CONFIG_INPUT_MOUSEDEV is not set -+# CONFIG_INPUT_KEYBOARD is not set -+# CONFIG_INPUT_MOUSE is not set -+CONFIG_INPUT_MISC=y -+# CONFIG_SERIO is not set -+# CONFIG_LEGACY_PTYS is not set -+CONFIG_BFIN_JTAG_COMM=m -+# CONFIG_DEVKMEM is not set -+CONFIG_SERIAL_BFIN=y -+CONFIG_SERIAL_BFIN_CONSOLE=y -+CONFIG_SERIAL_BFIN_UART1=y -+# CONFIG_HW_RANDOM is not set -+CONFIG_I2C=y -+CONFIG_I2C_CHARDEV=m -+CONFIG_I2C_BLACKFIN_TWI=y -+CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 -+CONFIG_SPI=y -+CONFIG_SPI_BFIN5XX=y -+CONFIG_GPIOLIB=y -+CONFIG_GPIO_SYSFS=y -+# CONFIG_HWMON is not set -+CONFIG_WATCHDOG=y -+CONFIG_BFIN_WDT=y -+CONFIG_FB=y -+CONFIG_FB_BFIN_T350MCQB=y -+CONFIG_BACKLIGHT_LCD_SUPPORT=y -+CONFIG_LCD_LTV350QV=m -+CONFIG_FRAMEBUFFER_CONSOLE=y -+CONFIG_LOGO=y -+# CONFIG_LOGO_LINUX_MONO is not set -+# CONFIG_LOGO_LINUX_VGA16 is not set -+# CONFIG_LOGO_LINUX_CLUT224 is not set -+# CONFIG_LOGO_BLACKFIN_VGA16 is not set -+CONFIG_SOUND=y -+CONFIG_SND=y -+CONFIG_SND_SOC=y -+CONFIG_SND_BF5XX_I2S=y -+CONFIG_SND_BF5XX_SOC_SSM2602=y -+CONFIG_HID_A4TECH=y -+CONFIG_HID_APPLE=y -+CONFIG_HID_BELKIN=y -+CONFIG_HID_CHERRY=y -+CONFIG_HID_CHICONY=y -+CONFIG_HID_CYPRESS=y -+CONFIG_HID_EZKEY=y -+CONFIG_HID_GYRATION=y -+CONFIG_HID_LOGITECH=y -+CONFIG_HID_MICROSOFT=y -+CONFIG_HID_MONTEREY=y -+CONFIG_HID_PANTHERLORD=y -+CONFIG_HID_PETALYNX=y -+CONFIG_HID_SAMSUNG=y -+CONFIG_HID_SONY=y -+CONFIG_HID_SUNPLUS=y -+CONFIG_USB=y -+# CONFIG_USB_DEVICE_CLASS is not 
set -+CONFIG_USB_OTG_BLACKLIST_HUB=y -+CONFIG_USB_MON=y -+CONFIG_USB_MUSB_HDRC=y -+CONFIG_MUSB_PIO_ONLY=y -+CONFIG_USB_MUSB_BLACKFIN=y -+CONFIG_MUSB_PIO_ONLY=y -+CONFIG_USB_STORAGE=y -+CONFIG_USB_GADGET=y -+CONFIG_RTC_CLASS=y -+CONFIG_RTC_DRV_BFIN=y -+CONFIG_EXT2_FS=m -+# CONFIG_DNOTIFY is not set -+CONFIG_ISO9660_FS=m -+CONFIG_JOLIET=y -+CONFIG_UDF_FS=m -+CONFIG_VFAT_FS=m -+CONFIG_JFFS2_FS=m -+CONFIG_NFS_FS=m -+CONFIG_NFS_V3=y -+CONFIG_NLS_CODEPAGE_437=m -+CONFIG_NLS_CODEPAGE_936=m -+CONFIG_NLS_ISO8859_1=m -+CONFIG_NLS_UTF8=m -+CONFIG_DEBUG_SHIRQ=y -+CONFIG_DETECT_HUNG_TASK=y -+CONFIG_DEBUG_INFO=y -+# CONFIG_FTRACE is not set -+CONFIG_DEBUG_MMRS=y -+CONFIG_DEBUG_HWERR=y -+CONFIG_EXACT_HWERR=y -+CONFIG_DEBUG_DOUBLEFAULT=y -+CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y -+CONFIG_EARLY_PRINTK=y -+CONFIG_CPLB_INFO=y -+CONFIG_BFIN_PSEUDODBG_INSNS=y -+CONFIG_CRYPTO=y -+# CONFIG_CRYPTO_ANSI_CPRNG is not set -diff --git a/arch/blackfin/configs/BF527-TLL6527M_defconfig b/arch/blackfin/configs/BF527-TLL6527M_defconfig -new file mode 100644 -index 000000000000..0220b3b15c53 ---- /dev/null -+++ b/arch/blackfin/configs/BF527-TLL6527M_defconfig -@@ -0,0 +1,178 @@ -+CONFIG_EXPERIMENTAL=y -+CONFIG_LOCALVERSION="DEV_0-1_pre2010" -+CONFIG_SYSVIPC=y -+CONFIG_IKCONFIG=y -+CONFIG_IKCONFIG_PROC=y -+CONFIG_LOG_BUF_SHIFT=14 -+CONFIG_BLK_DEV_INITRD=y -+# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set -+CONFIG_EXPERT=y -+# CONFIG_SYSCTL_SYSCALL is not set -+# CONFIG_ELF_CORE is not set -+# CONFIG_FUTEX is not set -+# CONFIG_SIGNALFD is not set -+# CONFIG_TIMERFD is not set -+# CONFIG_EVENTFD is not set -+# CONFIG_AIO is not set -+CONFIG_SLAB=y -+CONFIG_MMAP_ALLOW_UNINITIALIZED=y -+CONFIG_MODULES=y -+CONFIG_MODULE_UNLOAD=y -+# CONFIG_LBDAF is not set -+# CONFIG_BLK_DEV_BSG is not set -+# CONFIG_IOSCHED_DEADLINE is not set -+CONFIG_PREEMPT=y -+CONFIG_BF527=y -+CONFIG_BF_REV_0_2=y -+CONFIG_BFIN527_TLL6527M=y -+CONFIG_BF527_UART1_PORTG=y -+CONFIG_IRQ_USB_INT0=11 -+CONFIG_IRQ_USB_INT1=11 
-+CONFIG_IRQ_USB_INT2=11 -+CONFIG_IRQ_USB_DMA=11 -+CONFIG_BOOT_LOAD=0x400000 -+# CONFIG_CYCLES_CLOCKSOURCE is not set -+# CONFIG_SCHEDULE_L1 is not set -+# CONFIG_MEMSET_L1 is not set -+# CONFIG_MEMCPY_L1 is not set -+# CONFIG_SYS_BFIN_SPINLOCK_L1 is not set -+CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 -+CONFIG_BFIN_GPTIMERS=y -+CONFIG_DMA_UNCACHED_2M=y -+CONFIG_C_CDPRIO=y -+CONFIG_BANK_0=0xFFC2 -+CONFIG_BANK_1=0xFFC2 -+CONFIG_BANK_2=0xFFC2 -+CONFIG_BANK_3=0xFFC2 -+CONFIG_BINFMT_FLAT=y -+CONFIG_BINFMT_ZFLAT=y -+CONFIG_NET=y -+CONFIG_PACKET=y -+CONFIG_UNIX=y -+CONFIG_INET=y -+CONFIG_IP_PNP=y -+# CONFIG_INET_XFRM_MODE_TRANSPORT is not set -+# CONFIG_INET_XFRM_MODE_TUNNEL is not set -+# CONFIG_INET_XFRM_MODE_BEET is not set -+# CONFIG_INET_LRO is not set -+# CONFIG_INET_DIAG is not set -+# CONFIG_IPV6 is not set -+CONFIG_IRDA=m -+CONFIG_IRLAN=m -+CONFIG_IRCOMM=m -+CONFIG_IRTTY_SIR=m -+CONFIG_BFIN_SIR=m -+CONFIG_BFIN_SIR0=y -+# CONFIG_WIRELESS is not set -+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" -+# CONFIG_FW_LOADER is not set -+CONFIG_MTD=y -+CONFIG_MTD_BLOCK=y -+CONFIG_MTD_CFI=y -+CONFIG_MTD_CFI_INTELEXT=y -+CONFIG_MTD_RAM=y -+CONFIG_MTD_ROM=y -+CONFIG_MTD_COMPLEX_MAPPINGS=y -+CONFIG_MTD_GPIO_ADDR=y -+CONFIG_BLK_DEV_RAM=y -+CONFIG_SCSI=y -+# CONFIG_SCSI_PROC_FS is not set -+CONFIG_BLK_DEV_SD=y -+CONFIG_BLK_DEV_SR=m -+# CONFIG_SCSI_LOWLEVEL is not set -+CONFIG_NETDEVICES=y -+CONFIG_NET_ETHERNET=y -+CONFIG_BFIN_MAC=y -+# CONFIG_NETDEV_1000 is not set -+# CONFIG_NETDEV_10000 is not set -+# CONFIG_WLAN is not set -+# CONFIG_INPUT_MOUSEDEV is not set -+CONFIG_INPUT_EVDEV=y -+# CONFIG_INPUT_KEYBOARD is not set -+# CONFIG_INPUT_MOUSE is not set -+CONFIG_INPUT_TOUCHSCREEN=y -+CONFIG_TOUCHSCREEN_AD7879=m -+CONFIG_INPUT_MISC=y -+CONFIG_INPUT_AD714X=y -+CONFIG_INPUT_ADXL34X=y -+# CONFIG_SERIO is not set -+CONFIG_BFIN_PPI=m -+CONFIG_BFIN_SIMPLE_TIMER=m -+CONFIG_BFIN_SPORT=m -+# CONFIG_CONSOLE_TRANSLATIONS is not set -+# CONFIG_DEVKMEM is not set -+CONFIG_BFIN_JTAG_COMM=m 
-+CONFIG_SERIAL_BFIN=y -+CONFIG_SERIAL_BFIN_CONSOLE=y -+CONFIG_SERIAL_BFIN_UART1=y -+# CONFIG_LEGACY_PTYS is not set -+# CONFIG_HW_RANDOM is not set -+CONFIG_I2C_CHARDEV=y -+# CONFIG_I2C_HELPER_AUTO is not set -+CONFIG_I2C_SMBUS=y -+CONFIG_I2C_BLACKFIN_TWI=y -+CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 -+CONFIG_GPIOLIB=y -+CONFIG_GPIO_SYSFS=y -+# CONFIG_HWMON is not set -+CONFIG_WATCHDOG=y -+CONFIG_BFIN_WDT=y -+CONFIG_MEDIA_SUPPORT=y -+CONFIG_VIDEO_DEV=y -+# CONFIG_MEDIA_TUNER_CUSTOMISE is not set -+CONFIG_VIDEO_HELPER_CHIPS_AUTO=y -+CONFIG_VIDEO_BLACKFIN_CAM=m -+CONFIG_OV9655=y -+CONFIG_FB=y -+CONFIG_BACKLIGHT_LCD_SUPPORT=y -+CONFIG_FRAMEBUFFER_CONSOLE=y -+CONFIG_FONTS=y -+CONFIG_FONT_6x11=y -+CONFIG_LOGO=y -+# CONFIG_LOGO_LINUX_MONO is not set -+# CONFIG_LOGO_LINUX_VGA16 is not set -+# CONFIG_LOGO_LINUX_CLUT224 is not set -+# CONFIG_LOGO_BLACKFIN_VGA16 is not set -+CONFIG_SOUND=y -+CONFIG_SND=y -+CONFIG_SND_MIXER_OSS=y -+CONFIG_SND_PCM_OSS=y -+CONFIG_SND_SOC=y -+CONFIG_SND_BF5XX_I2S=y -+CONFIG_SND_BF5XX_SOC_SSM2602=y -+# CONFIG_HID_SUPPORT is not set -+# CONFIG_USB_SUPPORT is not set -+CONFIG_MMC=m -+CONFIG_RTC_CLASS=y -+CONFIG_RTC_DRV_BFIN=y -+CONFIG_EXT2_FS=y -+# CONFIG_DNOTIFY is not set -+CONFIG_ISO9660_FS=m -+CONFIG_JOLIET=y -+CONFIG_UDF_FS=m -+CONFIG_MSDOS_FS=y -+CONFIG_VFAT_FS=y -+CONFIG_JFFS2_FS=y -+CONFIG_NFS_FS=m -+CONFIG_NFS_V3=y -+# CONFIG_RPCSEC_GSS_KRB5 is not set -+CONFIG_NLS_CODEPAGE_437=m -+CONFIG_NLS_CODEPAGE_936=m -+CONFIG_NLS_ISO8859_1=m -+CONFIG_NLS_UTF8=m -+CONFIG_DEBUG_KERNEL=y -+CONFIG_DEBUG_SHIRQ=y -+CONFIG_DETECT_HUNG_TASK=y -+CONFIG_DEBUG_INFO=y -+# CONFIG_RCU_CPU_STALL_DETECTOR is not set -+# CONFIG_FTRACE is not set -+CONFIG_DEBUG_MMRS=y -+CONFIG_DEBUG_HWERR=y -+CONFIG_EXACT_HWERR=y -+CONFIG_DEBUG_DOUBLEFAULT=y -+CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y -+CONFIG_EARLY_PRINTK=y -+CONFIG_CPLB_INFO=y -+CONFIG_CRYPTO=y -+# CONFIG_CRYPTO_ANSI_CPRNG is not set -+CONFIG_CRC7=m -diff --git a/arch/blackfin/configs/BF533-EZKIT_defconfig 
b/arch/blackfin/configs/BF533-EZKIT_defconfig -new file mode 100644 -index 000000000000..6023e3fd2c48 ---- /dev/null -+++ b/arch/blackfin/configs/BF533-EZKIT_defconfig -@@ -0,0 +1,114 @@ -+CONFIG_EXPERIMENTAL=y -+CONFIG_SYSVIPC=y -+CONFIG_IKCONFIG=y -+CONFIG_IKCONFIG_PROC=y -+CONFIG_LOG_BUF_SHIFT=14 -+CONFIG_BLK_DEV_INITRD=y -+CONFIG_EXPERT=y -+# CONFIG_ELF_CORE is not set -+# CONFIG_FUTEX is not set -+# CONFIG_SIGNALFD is not set -+# CONFIG_TIMERFD is not set -+# CONFIG_EVENTFD is not set -+# CONFIG_AIO is not set -+CONFIG_SLAB=y -+CONFIG_MMAP_ALLOW_UNINITIALIZED=y -+CONFIG_MODULES=y -+CONFIG_MODULE_UNLOAD=y -+# CONFIG_LBDAF is not set -+# CONFIG_BLK_DEV_BSG is not set -+# CONFIG_IOSCHED_DEADLINE is not set -+# CONFIG_IOSCHED_CFQ is not set -+CONFIG_PREEMPT=y -+CONFIG_BFIN533_EZKIT=y -+CONFIG_TIMER0=11 -+CONFIG_CLKIN_HZ=27000000 -+CONFIG_HIGH_RES_TIMERS=y -+CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 -+CONFIG_BFIN_GPTIMERS=m -+CONFIG_C_CDPRIO=y -+CONFIG_BANK_3=0xAAC2 -+CONFIG_BINFMT_FLAT=y -+CONFIG_BINFMT_ZFLAT=y -+CONFIG_NET=y -+CONFIG_PACKET=y -+CONFIG_UNIX=y -+CONFIG_INET=y -+CONFIG_IP_PNP=y -+# CONFIG_INET_XFRM_MODE_TRANSPORT is not set -+# CONFIG_INET_XFRM_MODE_TUNNEL is not set -+# CONFIG_INET_XFRM_MODE_BEET is not set -+# CONFIG_INET_LRO is not set -+# CONFIG_INET_DIAG is not set -+# CONFIG_IPV6 is not set -+CONFIG_IRDA=m -+CONFIG_IRLAN=m -+CONFIG_IRCOMM=m -+CONFIG_IRDA_CACHE_LAST_LSAP=y -+CONFIG_IRTTY_SIR=m -+# CONFIG_WIRELESS is not set -+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" -+# CONFIG_FW_LOADER is not set -+CONFIG_MTD=y -+CONFIG_MTD_BLOCK=y -+CONFIG_MTD_JEDECPROBE=y -+CONFIG_MTD_CFI_AMDSTD=y -+CONFIG_MTD_RAM=y -+CONFIG_MTD_ROM=y -+CONFIG_MTD_COMPLEX_MAPPINGS=y -+CONFIG_MTD_PHYSMAP=y -+CONFIG_MTD_PLATRAM=y -+CONFIG_BLK_DEV_RAM=y -+CONFIG_NETDEVICES=y -+# CONFIG_NET_VENDOR_BROADCOM is not set -+# CONFIG_NET_VENDOR_CHELSIO is not set -+# CONFIG_NET_VENDOR_INTEL is not set -+# CONFIG_NET_VENDOR_MARVELL is not set -+# CONFIG_NET_VENDOR_MICREL is not set -+# 
CONFIG_NET_VENDOR_MICROCHIP is not set -+# CONFIG_NET_VENDOR_NATSEMI is not set -+# CONFIG_NET_VENDOR_SEEQ is not set -+CONFIG_SMC91X=y -+# CONFIG_NET_VENDOR_STMICRO is not set -+# CONFIG_WLAN is not set -+CONFIG_INPUT=m -+# CONFIG_INPUT_MOUSEDEV is not set -+CONFIG_INPUT_EVDEV=m -+# CONFIG_INPUT_KEYBOARD is not set -+# CONFIG_INPUT_MOUSE is not set -+# CONFIG_SERIO is not set -+# CONFIG_VT is not set -+# CONFIG_LEGACY_PTYS is not set -+CONFIG_BFIN_JTAG_COMM=m -+# CONFIG_DEVKMEM is not set -+CONFIG_SERIAL_BFIN=y -+CONFIG_SERIAL_BFIN_CONSOLE=y -+# CONFIG_HW_RANDOM is not set -+CONFIG_SPI=y -+CONFIG_SPI_BFIN5XX=y -+CONFIG_GPIOLIB=y -+CONFIG_GPIO_SYSFS=y -+# CONFIG_HWMON is not set -+CONFIG_WATCHDOG=y -+CONFIG_BFIN_WDT=y -+# CONFIG_USB_SUPPORT is not set -+CONFIG_RTC_CLASS=y -+CONFIG_RTC_DRV_BFIN=y -+# CONFIG_DNOTIFY is not set -+CONFIG_JFFS2_FS=m -+CONFIG_NFS_FS=m -+CONFIG_NFS_V3=y -+CONFIG_DEBUG_SHIRQ=y -+CONFIG_DETECT_HUNG_TASK=y -+CONFIG_DEBUG_INFO=y -+# CONFIG_FTRACE is not set -+CONFIG_DEBUG_MMRS=y -+CONFIG_DEBUG_HWERR=y -+CONFIG_EXACT_HWERR=y -+CONFIG_DEBUG_DOUBLEFAULT=y -+CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y -+CONFIG_EARLY_PRINTK=y -+CONFIG_CPLB_INFO=y -+CONFIG_BFIN_PSEUDODBG_INSNS=y -+CONFIG_CRYPTO=y -+# CONFIG_CRYPTO_ANSI_CPRNG is not set -diff --git a/arch/blackfin/configs/BF533-STAMP_defconfig b/arch/blackfin/configs/BF533-STAMP_defconfig -new file mode 100644 -index 000000000000..f5cd0f18b711 ---- /dev/null -+++ b/arch/blackfin/configs/BF533-STAMP_defconfig -@@ -0,0 +1,124 @@ -+CONFIG_EXPERIMENTAL=y -+CONFIG_SYSVIPC=y -+CONFIG_IKCONFIG=y -+CONFIG_IKCONFIG_PROC=y -+CONFIG_LOG_BUF_SHIFT=14 -+CONFIG_BLK_DEV_INITRD=y -+CONFIG_EXPERT=y -+# CONFIG_ELF_CORE is not set -+# CONFIG_FUTEX is not set -+# CONFIG_SIGNALFD is not set -+# CONFIG_TIMERFD is not set -+# CONFIG_EVENTFD is not set -+# CONFIG_AIO is not set -+CONFIG_SLAB=y -+CONFIG_MMAP_ALLOW_UNINITIALIZED=y -+CONFIG_MODULES=y -+CONFIG_MODULE_UNLOAD=y -+# CONFIG_LBDAF is not set -+# CONFIG_BLK_DEV_BSG 
is not set -+# CONFIG_IOSCHED_DEADLINE is not set -+# CONFIG_IOSCHED_CFQ is not set -+CONFIG_PREEMPT=y -+CONFIG_TIMER0=11 -+CONFIG_HIGH_RES_TIMERS=y -+CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 -+CONFIG_BFIN_GPTIMERS=m -+CONFIG_C_CDPRIO=y -+CONFIG_BANK_3=0xAAC2 -+CONFIG_BINFMT_FLAT=y -+CONFIG_BINFMT_ZFLAT=y -+CONFIG_NET=y -+CONFIG_PACKET=y -+CONFIG_UNIX=y -+CONFIG_INET=y -+CONFIG_IP_PNP=y -+# CONFIG_INET_XFRM_MODE_TRANSPORT is not set -+# CONFIG_INET_XFRM_MODE_TUNNEL is not set -+# CONFIG_INET_XFRM_MODE_BEET is not set -+# CONFIG_INET_LRO is not set -+# CONFIG_INET_DIAG is not set -+# CONFIG_IPV6 is not set -+CONFIG_IRDA=m -+CONFIG_IRLAN=m -+CONFIG_IRCOMM=m -+CONFIG_IRDA_CACHE_LAST_LSAP=y -+CONFIG_IRTTY_SIR=m -+CONFIG_BFIN_SIR=m -+# CONFIG_WIRELESS is not set -+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" -+# CONFIG_FW_LOADER is not set -+CONFIG_MTD=y -+CONFIG_MTD_CMDLINE_PARTS=y -+CONFIG_MTD_BLOCK=y -+CONFIG_MTD_CFI=m -+CONFIG_MTD_CFI_AMDSTD=m -+CONFIG_MTD_RAM=y -+CONFIG_MTD_ROM=m -+CONFIG_MTD_COMPLEX_MAPPINGS=y -+CONFIG_BLK_DEV_RAM=y -+CONFIG_NETDEVICES=y -+# CONFIG_NET_VENDOR_BROADCOM is not set -+# CONFIG_NET_VENDOR_CHELSIO is not set -+# CONFIG_NET_VENDOR_INTEL is not set -+# CONFIG_NET_VENDOR_MARVELL is not set -+# CONFIG_NET_VENDOR_MICREL is not set -+# CONFIG_NET_VENDOR_MICROCHIP is not set -+# CONFIG_NET_VENDOR_NATSEMI is not set -+# CONFIG_NET_VENDOR_SEEQ is not set -+CONFIG_SMC91X=y -+# CONFIG_NET_VENDOR_STMICRO is not set -+# CONFIG_WLAN is not set -+# CONFIG_INPUT_MOUSEDEV is not set -+CONFIG_INPUT_EVDEV=m -+# CONFIG_INPUT_KEYBOARD is not set -+# CONFIG_INPUT_MOUSE is not set -+CONFIG_INPUT_MISC=y -+# CONFIG_SERIO is not set -+# CONFIG_VT is not set -+# CONFIG_LEGACY_PTYS is not set -+CONFIG_BFIN_JTAG_COMM=m -+# CONFIG_DEVKMEM is not set -+CONFIG_SERIAL_BFIN=y -+CONFIG_SERIAL_BFIN_CONSOLE=y -+# CONFIG_HW_RANDOM is not set -+CONFIG_I2C=m -+CONFIG_I2C_CHARDEV=m -+CONFIG_I2C_GPIO=m -+CONFIG_SPI=y -+CONFIG_SPI_BFIN5XX=y -+CONFIG_GPIOLIB=y -+CONFIG_GPIO_SYSFS=y -+# 
CONFIG_HWMON is not set -+CONFIG_WATCHDOG=y -+CONFIG_BFIN_WDT=y -+CONFIG_FB=m -+CONFIG_FIRMWARE_EDID=y -+CONFIG_SOUND=m -+CONFIG_SND=m -+CONFIG_SND_MIXER_OSS=m -+CONFIG_SND_PCM_OSS=m -+CONFIG_SND_SOC=m -+CONFIG_SND_BF5XX_I2S=m -+CONFIG_SND_BF5XX_SOC_AD73311=m -+# CONFIG_USB_SUPPORT is not set -+CONFIG_RTC_CLASS=y -+CONFIG_RTC_DRV_BFIN=y -+# CONFIG_DNOTIFY is not set -+CONFIG_JFFS2_FS=m -+CONFIG_NFS_FS=m -+CONFIG_NFS_V3=y -+CONFIG_DEBUG_SHIRQ=y -+CONFIG_DETECT_HUNG_TASK=y -+CONFIG_DEBUG_INFO=y -+# CONFIG_FTRACE is not set -+CONFIG_DEBUG_MMRS=y -+CONFIG_DEBUG_HWERR=y -+CONFIG_EXACT_HWERR=y -+CONFIG_DEBUG_DOUBLEFAULT=y -+CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y -+CONFIG_EARLY_PRINTK=y -+CONFIG_CPLB_INFO=y -+CONFIG_BFIN_PSEUDODBG_INSNS=y -+CONFIG_CRYPTO=y -+# CONFIG_CRYPTO_ANSI_CPRNG is not set -diff --git a/arch/blackfin/configs/BF537-STAMP_defconfig b/arch/blackfin/configs/BF537-STAMP_defconfig -new file mode 100644 -index 000000000000..48085fde7f9e ---- /dev/null -+++ b/arch/blackfin/configs/BF537-STAMP_defconfig -@@ -0,0 +1,136 @@ -+CONFIG_EXPERIMENTAL=y -+CONFIG_SYSVIPC=y -+CONFIG_IKCONFIG=y -+CONFIG_IKCONFIG_PROC=y -+CONFIG_LOG_BUF_SHIFT=14 -+CONFIG_BLK_DEV_INITRD=y -+CONFIG_EXPERT=y -+# CONFIG_ELF_CORE is not set -+# CONFIG_FUTEX is not set -+# CONFIG_SIGNALFD is not set -+# CONFIG_TIMERFD is not set -+# CONFIG_EVENTFD is not set -+# CONFIG_AIO is not set -+CONFIG_SLAB=y -+CONFIG_MMAP_ALLOW_UNINITIALIZED=y -+CONFIG_MODULES=y -+CONFIG_MODULE_UNLOAD=y -+# CONFIG_LBDAF is not set -+# CONFIG_BLK_DEV_BSG is not set -+# CONFIG_IOSCHED_DEADLINE is not set -+# CONFIG_IOSCHED_CFQ is not set -+CONFIG_PREEMPT=y -+CONFIG_BF537=y -+CONFIG_HIGH_RES_TIMERS=y -+CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 -+CONFIG_BFIN_GPTIMERS=m -+CONFIG_C_CDPRIO=y -+CONFIG_BANK_3=0x99B2 -+CONFIG_BINFMT_FLAT=y -+CONFIG_BINFMT_ZFLAT=y -+CONFIG_NET=y -+CONFIG_PACKET=y -+CONFIG_UNIX=y -+CONFIG_INET=y -+CONFIG_IP_PNP=y -+# CONFIG_INET_XFRM_MODE_TRANSPORT is not set -+# CONFIG_INET_XFRM_MODE_TUNNEL is not 
set -+# CONFIG_INET_XFRM_MODE_BEET is not set -+# CONFIG_INET_LRO is not set -+# CONFIG_INET_DIAG is not set -+# CONFIG_IPV6 is not set -+CONFIG_CAN=m -+CONFIG_CAN_RAW=m -+CONFIG_CAN_BCM=m -+CONFIG_CAN_BFIN=m -+CONFIG_IRDA=m -+CONFIG_IRLAN=m -+CONFIG_IRCOMM=m -+CONFIG_IRDA_CACHE_LAST_LSAP=y -+CONFIG_IRTTY_SIR=m -+CONFIG_BFIN_SIR=m -+CONFIG_BFIN_SIR1=y -+# CONFIG_WIRELESS is not set -+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" -+# CONFIG_FW_LOADER is not set -+CONFIG_MTD=y -+CONFIG_MTD_CMDLINE_PARTS=y -+CONFIG_MTD_BLOCK=y -+CONFIG_MTD_CFI=m -+CONFIG_MTD_CFI_AMDSTD=m -+CONFIG_MTD_RAM=y -+CONFIG_MTD_ROM=m -+CONFIG_MTD_PHYSMAP=m -+CONFIG_MTD_M25P80=y -+CONFIG_MTD_SPI_NOR=y -+CONFIG_BLK_DEV_RAM=y -+CONFIG_NETDEVICES=y -+CONFIG_NET_BFIN=y -+CONFIG_BFIN_MAC=y -+# CONFIG_NET_VENDOR_BROADCOM is not set -+# CONFIG_NET_VENDOR_CHELSIO is not set -+# CONFIG_NET_VENDOR_INTEL is not set -+# CONFIG_NET_VENDOR_MARVELL is not set -+# CONFIG_NET_VENDOR_MICREL is not set -+# CONFIG_NET_VENDOR_MICROCHIP is not set -+# CONFIG_NET_VENDOR_NATSEMI is not set -+# CONFIG_NET_VENDOR_SEEQ is not set -+# CONFIG_NET_VENDOR_SMSC is not set -+# CONFIG_NET_VENDOR_STMICRO is not set -+# CONFIG_WLAN is not set -+# CONFIG_INPUT_MOUSEDEV is not set -+CONFIG_INPUT_EVDEV=m -+# CONFIG_INPUT_KEYBOARD is not set -+# CONFIG_INPUT_MOUSE is not set -+CONFIG_INPUT_MISC=y -+# CONFIG_SERIO is not set -+# CONFIG_VT is not set -+# CONFIG_LEGACY_PTYS is not set -+CONFIG_BFIN_JTAG_COMM=m -+# CONFIG_DEVKMEM is not set -+CONFIG_SERIAL_BFIN=y -+CONFIG_SERIAL_BFIN_CONSOLE=y -+CONFIG_SERIAL_BFIN_UART0=y -+# CONFIG_HW_RANDOM is not set -+CONFIG_I2C=m -+CONFIG_I2C_CHARDEV=m -+CONFIG_I2C_BLACKFIN_TWI=m -+CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 -+CONFIG_SPI=y -+CONFIG_SPI_BFIN5XX=y -+CONFIG_GPIOLIB=y -+CONFIG_GPIO_SYSFS=y -+# CONFIG_HWMON is not set -+CONFIG_WATCHDOG=y -+CONFIG_BFIN_WDT=y -+CONFIG_FB=m -+CONFIG_FIRMWARE_EDID=y -+CONFIG_BACKLIGHT_LCD_SUPPORT=y -+CONFIG_SOUND=m -+CONFIG_SND=m -+CONFIG_SND_MIXER_OSS=m 
-+CONFIG_SND_PCM_OSS=m -+CONFIG_SND_SOC=m -+CONFIG_SND_BF5XX_I2S=m -+CONFIG_SND_BF5XX_SOC_AD73311=m -+# CONFIG_USB_SUPPORT is not set -+CONFIG_RTC_CLASS=y -+CONFIG_RTC_DRV_BFIN=y -+# CONFIG_DNOTIFY is not set -+CONFIG_JFFS2_FS=m -+CONFIG_NFS_FS=m -+CONFIG_NFS_V3=y -+CONFIG_DEBUG_SHIRQ=y -+CONFIG_DETECT_HUNG_TASK=y -+CONFIG_DEBUG_INFO=y -+# CONFIG_FTRACE is not set -+CONFIG_DEBUG_MMRS=y -+CONFIG_DEBUG_HWERR=y -+CONFIG_EXACT_HWERR=y -+CONFIG_DEBUG_DOUBLEFAULT=y -+CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y -+CONFIG_EARLY_PRINTK=y -+CONFIG_CPLB_INFO=y -+CONFIG_BFIN_PSEUDODBG_INSNS=y -+CONFIG_CRYPTO=y -+# CONFIG_CRYPTO_ANSI_CPRNG is not set -diff --git a/arch/blackfin/configs/BF538-EZKIT_defconfig b/arch/blackfin/configs/BF538-EZKIT_defconfig -new file mode 100644 -index 000000000000..12deeaaef3cb ---- /dev/null -+++ b/arch/blackfin/configs/BF538-EZKIT_defconfig -@@ -0,0 +1,133 @@ -+CONFIG_EXPERIMENTAL=y -+CONFIG_SYSVIPC=y -+CONFIG_IKCONFIG=y -+CONFIG_IKCONFIG_PROC=y -+CONFIG_LOG_BUF_SHIFT=14 -+CONFIG_BLK_DEV_INITRD=y -+# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set -+CONFIG_EXPERT=y -+# CONFIG_SYSCTL_SYSCALL is not set -+# CONFIG_ELF_CORE is not set -+# CONFIG_FUTEX is not set -+# CONFIG_SIGNALFD is not set -+# CONFIG_TIMERFD is not set -+# CONFIG_EVENTFD is not set -+# CONFIG_AIO is not set -+CONFIG_SLAB=y -+CONFIG_MMAP_ALLOW_UNINITIALIZED=y -+CONFIG_MODULES=y -+CONFIG_MODULE_UNLOAD=y -+# CONFIG_LBDAF is not set -+# CONFIG_BLK_DEV_BSG is not set -+# CONFIG_IOSCHED_DEADLINE is not set -+# CONFIG_IOSCHED_CFQ is not set -+CONFIG_PREEMPT=y -+CONFIG_BF538=y -+CONFIG_IRQ_TIMER0=12 -+CONFIG_IRQ_TIMER1=12 -+CONFIG_IRQ_TIMER2=12 -+CONFIG_HIGH_RES_TIMERS=y -+CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 -+CONFIG_C_CDPRIO=y -+CONFIG_BANK_3=0x99B2 -+CONFIG_BINFMT_FLAT=y -+CONFIG_BINFMT_ZFLAT=y -+CONFIG_PM=y -+CONFIG_NET=y -+CONFIG_PACKET=y -+CONFIG_UNIX=y -+CONFIG_INET=y -+CONFIG_IP_PNP=y -+# CONFIG_INET_XFRM_MODE_TRANSPORT is not set -+# CONFIG_INET_XFRM_MODE_TUNNEL is not set -+# 
CONFIG_INET_XFRM_MODE_BEET is not set -+# CONFIG_INET_LRO is not set -+# CONFIG_INET_DIAG is not set -+# CONFIG_IPV6 is not set -+CONFIG_CAN=m -+CONFIG_CAN_RAW=m -+CONFIG_CAN_BCM=m -+CONFIG_CAN_DEV=m -+CONFIG_CAN_BFIN=m -+CONFIG_IRDA=m -+CONFIG_IRLAN=m -+CONFIG_IRCOMM=m -+CONFIG_IRDA_CACHE_LAST_LSAP=y -+CONFIG_IRTTY_SIR=m -+CONFIG_BFIN_SIR=m -+# CONFIG_WIRELESS is not set -+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" -+# CONFIG_FW_LOADER is not set -+CONFIG_MTD=y -+CONFIG_MTD_CMDLINE_PARTS=y -+CONFIG_MTD_BLOCK=y -+CONFIG_MTD_CFI=m -+CONFIG_MTD_CFI_AMDSTD=m -+CONFIG_MTD_RAM=y -+CONFIG_MTD_ROM=m -+CONFIG_MTD_PHYSMAP=m -+CONFIG_MTD_NAND=m -+CONFIG_BLK_DEV_RAM=y -+CONFIG_NETDEVICES=y -+CONFIG_PHYLIB=y -+CONFIG_SMSC_PHY=y -+CONFIG_NET_ETHERNET=y -+CONFIG_SMC91X=y -+# CONFIG_NETDEV_1000 is not set -+# CONFIG_NETDEV_10000 is not set -+# CONFIG_WLAN is not set -+# CONFIG_INPUT_MOUSEDEV is not set -+CONFIG_INPUT_EVDEV=m -+# CONFIG_INPUT_KEYBOARD is not set -+# CONFIG_INPUT_MOUSE is not set -+CONFIG_INPUT_TOUCHSCREEN=y -+CONFIG_TOUCHSCREEN_AD7879=y -+CONFIG_TOUCHSCREEN_AD7879_SPI=y -+CONFIG_INPUT_MISC=y -+# CONFIG_SERIO is not set -+# CONFIG_VT is not set -+# CONFIG_DEVKMEM is not set -+CONFIG_BFIN_JTAG_COMM=m -+CONFIG_SERIAL_BFIN=y -+CONFIG_SERIAL_BFIN_CONSOLE=y -+CONFIG_SERIAL_BFIN_UART0=y -+CONFIG_SERIAL_BFIN_UART1=y -+CONFIG_SERIAL_BFIN_UART2=y -+# CONFIG_LEGACY_PTYS is not set -+# CONFIG_HW_RANDOM is not set -+CONFIG_I2C=m -+CONFIG_I2C_BLACKFIN_TWI=m -+CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 -+CONFIG_SPI=y -+CONFIG_SPI_BFIN5XX=y -+CONFIG_GPIOLIB=y -+CONFIG_GPIO_SYSFS=y -+# CONFIG_HWMON is not set -+CONFIG_WATCHDOG=y -+CONFIG_BFIN_WDT=y -+CONFIG_FB=m -+CONFIG_FB_BFIN_LQ035Q1=m -+# CONFIG_USB_SUPPORT is not set -+CONFIG_RTC_CLASS=y -+CONFIG_RTC_DRV_BFIN=y -+# CONFIG_DNOTIFY is not set -+CONFIG_JFFS2_FS=m -+CONFIG_NFS_FS=m -+CONFIG_NFS_V3=y -+CONFIG_SMB_FS=m -+CONFIG_DEBUG_KERNEL=y -+CONFIG_DEBUG_SHIRQ=y -+CONFIG_DETECT_HUNG_TASK=y -+CONFIG_DEBUG_INFO=y -+# 
CONFIG_RCU_CPU_STALL_DETECTOR is not set -+# CONFIG_FTRACE is not set -+CONFIG_DEBUG_MMRS=y -+CONFIG_DEBUG_HWERR=y -+CONFIG_EXACT_HWERR=y -+CONFIG_DEBUG_DOUBLEFAULT=y -+CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y -+CONFIG_EARLY_PRINTK=y -+CONFIG_CPLB_INFO=y -+CONFIG_BFIN_PSEUDODBG_INSNS=y -+CONFIG_CRYPTO=y -+# CONFIG_CRYPTO_ANSI_CPRNG is not set -diff --git a/arch/blackfin/configs/BF548-EZKIT_defconfig b/arch/blackfin/configs/BF548-EZKIT_defconfig -new file mode 100644 -index 000000000000..6a68ffc55b5a ---- /dev/null -+++ b/arch/blackfin/configs/BF548-EZKIT_defconfig -@@ -0,0 +1,207 @@ -+CONFIG_EXPERIMENTAL=y -+CONFIG_SYSVIPC=y -+CONFIG_IKCONFIG=y -+CONFIG_IKCONFIG_PROC=y -+CONFIG_LOG_BUF_SHIFT=14 -+CONFIG_BLK_DEV_INITRD=y -+CONFIG_EXPERT=y -+# CONFIG_ELF_CORE is not set -+# CONFIG_FUTEX is not set -+# CONFIG_SIGNALFD is not set -+# CONFIG_TIMERFD is not set -+# CONFIG_EVENTFD is not set -+# CONFIG_AIO is not set -+CONFIG_SLAB=y -+CONFIG_MMAP_ALLOW_UNINITIALIZED=y -+CONFIG_MODULES=y -+CONFIG_MODULE_UNLOAD=y -+# CONFIG_LBDAF is not set -+# CONFIG_BLK_DEV_BSG is not set -+# CONFIG_IOSCHED_DEADLINE is not set -+# CONFIG_IOSCHED_CFQ is not set -+CONFIG_PREEMPT=y -+CONFIG_BF548_std=y -+CONFIG_IRQ_TIMER0=11 -+# CONFIG_CYCLES_CLOCKSOURCE is not set -+# CONFIG_SCHEDULE_L1 is not set -+# CONFIG_MEMSET_L1 is not set -+# CONFIG_MEMCPY_L1 is not set -+# CONFIG_SYS_BFIN_SPINLOCK_L1 is not set -+CONFIG_CACHELINE_ALIGNED_L1=y -+CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 -+CONFIG_BFIN_GPTIMERS=m -+CONFIG_DMA_UNCACHED_2M=y -+CONFIG_BFIN_EXTMEM_WRITETHROUGH=y -+CONFIG_BANK_3=0x99B2 -+CONFIG_EBIU_MBSCTLVAL=0x0 -+CONFIG_EBIU_MODEVAL=0x1 -+CONFIG_EBIU_FCTLVAL=0x6 -+CONFIG_BINFMT_FLAT=y -+CONFIG_BINFMT_ZFLAT=y -+CONFIG_NET=y -+CONFIG_PACKET=y -+CONFIG_UNIX=y -+CONFIG_INET=y -+CONFIG_IP_PNP=y -+# CONFIG_INET_XFRM_MODE_TRANSPORT is not set -+# CONFIG_INET_XFRM_MODE_TUNNEL is not set -+# CONFIG_INET_XFRM_MODE_BEET is not set -+# CONFIG_INET_LRO is not set -+# CONFIG_INET_DIAG is not set -+# 
CONFIG_IPV6 is not set -+CONFIG_CAN=m -+CONFIG_CAN_RAW=m -+CONFIG_CAN_BCM=m -+CONFIG_CAN_BFIN=m -+CONFIG_IRDA=m -+CONFIG_IRLAN=m -+CONFIG_IRCOMM=m -+CONFIG_IRTTY_SIR=m -+CONFIG_BFIN_SIR=m -+CONFIG_BFIN_SIR3=y -+# CONFIG_WIRELESS is not set -+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" -+CONFIG_FW_LOADER=m -+CONFIG_MTD=y -+CONFIG_MTD_CMDLINE_PARTS=y -+CONFIG_MTD_BLOCK=y -+CONFIG_MTD_CFI=y -+CONFIG_MTD_CFI_INTELEXT=y -+CONFIG_MTD_RAM=y -+CONFIG_MTD_COMPLEX_MAPPINGS=y -+CONFIG_MTD_PHYSMAP=y -+CONFIG_MTD_M25P80=y -+CONFIG_MTD_NAND=y -+CONFIG_MTD_NAND_BF5XX=y -+# CONFIG_MTD_NAND_BF5XX_HWECC is not set -+CONFIG_MTD_SPI_NOR=y -+CONFIG_BLK_DEV_RAM=y -+# CONFIG_SCSI_PROC_FS is not set -+CONFIG_BLK_DEV_SD=y -+CONFIG_BLK_DEV_SR=m -+# CONFIG_SCSI_LOWLEVEL is not set -+CONFIG_ATA=y -+# CONFIG_SATA_PMP is not set -+CONFIG_PATA_BF54X=y -+CONFIG_NETDEVICES=y -+# CONFIG_NET_VENDOR_BROADCOM is not set -+# CONFIG_NET_VENDOR_CHELSIO is not set -+# CONFIG_NET_VENDOR_INTEL is not set -+# CONFIG_NET_VENDOR_MARVELL is not set -+# CONFIG_NET_VENDOR_MICREL is not set -+# CONFIG_NET_VENDOR_MICROCHIP is not set -+# CONFIG_NET_VENDOR_NATSEMI is not set -+# CONFIG_NET_VENDOR_SEEQ is not set -+CONFIG_SMSC911X=y -+# CONFIG_NET_VENDOR_STMICRO is not set -+# CONFIG_WLAN is not set -+CONFIG_INPUT_FF_MEMLESS=m -+# CONFIG_INPUT_MOUSEDEV is not set -+CONFIG_INPUT_EVDEV=m -+CONFIG_INPUT_EVBUG=m -+# CONFIG_KEYBOARD_ATKBD is not set -+CONFIG_KEYBOARD_BFIN=y -+# CONFIG_INPUT_MOUSE is not set -+CONFIG_INPUT_TOUCHSCREEN=y -+CONFIG_TOUCHSCREEN_AD7877=m -+CONFIG_INPUT_MISC=y -+# CONFIG_SERIO is not set -+# CONFIG_LEGACY_PTYS is not set -+CONFIG_BFIN_JTAG_COMM=m -+# CONFIG_DEVKMEM is not set -+CONFIG_SERIAL_BFIN=y -+CONFIG_SERIAL_BFIN_CONSOLE=y -+CONFIG_SERIAL_BFIN_UART1=y -+# CONFIG_HW_RANDOM is not set -+CONFIG_I2C=y -+CONFIG_I2C_CHARDEV=y -+CONFIG_I2C_BLACKFIN_TWI=y -+CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 -+CONFIG_SPI=y -+CONFIG_SPI_BFIN5XX=y -+CONFIG_GPIOLIB=y -+CONFIG_GPIO_SYSFS=y -+# CONFIG_HWMON is not set 
-+CONFIG_WATCHDOG=y -+CONFIG_BFIN_WDT=y -+CONFIG_FB=y -+CONFIG_FIRMWARE_EDID=y -+CONFIG_FB_BF54X_LQ043=y -+CONFIG_FRAMEBUFFER_CONSOLE=y -+CONFIG_FONTS=y -+CONFIG_FONT_6x11=y -+CONFIG_LOGO=y -+# CONFIG_LOGO_LINUX_MONO is not set -+# CONFIG_LOGO_LINUX_VGA16 is not set -+# CONFIG_LOGO_LINUX_CLUT224 is not set -+# CONFIG_LOGO_BLACKFIN_VGA16 is not set -+CONFIG_SOUND=y -+CONFIG_SND=y -+CONFIG_SND_MIXER_OSS=y -+CONFIG_SND_PCM_OSS=y -+CONFIG_SND_SOC=y -+CONFIG_SND_BF5XX_AC97=y -+CONFIG_SND_BF5XX_SOC_AD1980=y -+CONFIG_HID_A4TECH=y -+CONFIG_HID_APPLE=y -+CONFIG_HID_BELKIN=y -+CONFIG_HID_CHERRY=y -+CONFIG_HID_CHICONY=y -+CONFIG_HID_CYPRESS=y -+CONFIG_HID_EZKEY=y -+CONFIG_HID_GYRATION=y -+CONFIG_HID_LOGITECH=y -+CONFIG_HID_MICROSOFT=y -+CONFIG_HID_MONTEREY=y -+CONFIG_HID_PANTHERLORD=y -+CONFIG_HID_PETALYNX=y -+CONFIG_HID_SAMSUNG=y -+CONFIG_HID_SONY=y -+CONFIG_HID_SUNPLUS=y -+CONFIG_USB=y -+# CONFIG_USB_DEVICE_CLASS is not set -+CONFIG_USB_OTG_BLACKLIST_HUB=y -+CONFIG_USB_MON=y -+CONFIG_USB_MUSB_HDRC=y -+CONFIG_USB_MUSB_BLACKFIN=y -+CONFIG_USB_STORAGE=y -+CONFIG_USB_GADGET=y -+CONFIG_MMC=y -+CONFIG_MMC_BLOCK=m -+CONFIG_SDH_BFIN=y -+CONFIG_SDH_BFIN_MISSING_CMD_PULLUP_WORKAROUND=y -+CONFIG_RTC_CLASS=y -+CONFIG_RTC_DRV_BFIN=y -+CONFIG_EXT2_FS=y -+CONFIG_EXT2_FS_XATTR=y -+# CONFIG_DNOTIFY is not set -+CONFIG_ISO9660_FS=m -+CONFIG_JOLIET=y -+CONFIG_ZISOFS=y -+CONFIG_MSDOS_FS=m -+CONFIG_VFAT_FS=m -+CONFIG_NTFS_FS=m -+CONFIG_NTFS_RW=y -+CONFIG_JFFS2_FS=m -+CONFIG_NFS_FS=m -+CONFIG_NFS_V3=y -+CONFIG_NFSD=m -+CONFIG_NFSD_V3=y -+CONFIG_CIFS=y -+CONFIG_NLS_CODEPAGE_437=m -+CONFIG_NLS_CODEPAGE_936=m -+CONFIG_NLS_ISO8859_1=m -+CONFIG_NLS_UTF8=m -+CONFIG_DEBUG_SHIRQ=y -+CONFIG_DETECT_HUNG_TASK=y -+CONFIG_DEBUG_INFO=y -+# CONFIG_FTRACE is not set -+CONFIG_DEBUG_MMRS=y -+CONFIG_DEBUG_HWERR=y -+CONFIG_EXACT_HWERR=y -+CONFIG_DEBUG_DOUBLEFAULT=y -+CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y -+CONFIG_EARLY_PRINTK=y -+CONFIG_CPLB_INFO=y -+CONFIG_BFIN_PSEUDODBG_INSNS=y -+# CONFIG_CRYPTO_ANSI_CPRNG 
is not set -diff --git a/arch/blackfin/configs/BF561-ACVILON_defconfig b/arch/blackfin/configs/BF561-ACVILON_defconfig -new file mode 100644 -index 000000000000..e9f3ba783a4e ---- /dev/null -+++ b/arch/blackfin/configs/BF561-ACVILON_defconfig -@@ -0,0 +1,149 @@ -+CONFIG_EXPERIMENTAL=y -+CONFIG_SYSVIPC=y -+CONFIG_IKCONFIG=y -+CONFIG_IKCONFIG_PROC=y -+CONFIG_LOG_BUF_SHIFT=14 -+CONFIG_SYSFS_DEPRECATED_V2=y -+# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set -+CONFIG_EXPERT=y -+# CONFIG_SYSCTL_SYSCALL is not set -+# CONFIG_ELF_CORE is not set -+# CONFIG_FUTEX is not set -+# CONFIG_SIGNALFD is not set -+# CONFIG_TIMERFD is not set -+# CONFIG_EVENTFD is not set -+# CONFIG_AIO is not set -+CONFIG_SLAB=y -+CONFIG_MMAP_ALLOW_UNINITIALIZED=y -+CONFIG_MODULES=y -+CONFIG_MODULE_UNLOAD=y -+# CONFIG_LBDAF is not set -+# CONFIG_BLK_DEV_BSG is not set -+# CONFIG_IOSCHED_DEADLINE is not set -+CONFIG_PREEMPT=y -+CONFIG_BF561=y -+CONFIG_BF_REV_0_5=y -+CONFIG_IRQ_TIMER0=10 -+CONFIG_BFIN561_ACVILON=y -+# CONFIG_BF561_COREB is not set -+CONFIG_CLKIN_HZ=12000000 -+CONFIG_HIGH_RES_TIMERS=y -+CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 -+CONFIG_BFIN_GPTIMERS=y -+CONFIG_DMA_UNCACHED_4M=y -+CONFIG_C_CDPRIO=y -+CONFIG_BANK_0=0x99b2 -+CONFIG_BANK_1=0x3350 -+CONFIG_BANK_3=0xAAC2 -+CONFIG_BINFMT_FLAT=y -+CONFIG_BINFMT_ZFLAT=y -+CONFIG_NET=y -+CONFIG_PACKET=y -+CONFIG_UNIX=y -+CONFIG_INET=y -+CONFIG_IP_PNP=y -+CONFIG_SYN_COOKIES=y -+# CONFIG_INET_LRO is not set -+# CONFIG_IPV6 is not set -+# CONFIG_WIRELESS is not set -+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" -+# CONFIG_FW_LOADER is not set -+CONFIG_MTD=y -+CONFIG_MTD_CMDLINE_PARTS=y -+CONFIG_MTD_BLOCK=y -+CONFIG_MTD_PLATRAM=y -+CONFIG_MTD_PHRAM=y -+CONFIG_MTD_BLOCK2MTD=y -+CONFIG_MTD_NAND=y -+CONFIG_MTD_NAND_PLATFORM=y -+CONFIG_BLK_DEV_LOOP=y -+CONFIG_BLK_DEV_RAM=y -+CONFIG_BLK_DEV_RAM_COUNT=2 -+CONFIG_BLK_DEV_RAM_SIZE=16384 -+CONFIG_SCSI=y -+# CONFIG_SCSI_PROC_FS is not set -+CONFIG_BLK_DEV_SD=y -+# CONFIG_SCSI_LOWLEVEL is not set -+CONFIG_NETDEVICES=y 
-+CONFIG_NET_ETHERNET=y -+CONFIG_SMSC911X=y -+# CONFIG_NETDEV_1000 is not set -+# CONFIG_NETDEV_10000 is not set -+# CONFIG_WLAN is not set -+# CONFIG_INPUT is not set -+# CONFIG_SERIO is not set -+# CONFIG_VT is not set -+# CONFIG_DEVKMEM is not set -+CONFIG_SERIAL_BFIN=y -+CONFIG_SERIAL_BFIN_CONSOLE=y -+CONFIG_SERIAL_BFIN_PIO=y -+# CONFIG_HW_RANDOM is not set -+CONFIG_I2C=y -+CONFIG_I2C_CHARDEV=y -+CONFIG_I2C_PCA_PLATFORM=y -+CONFIG_SPI=y -+CONFIG_SPI_BFIN5XX=y -+CONFIG_SPI_SPIDEV=y -+CONFIG_GPIOLIB=y -+CONFIG_GPIO_SYSFS=y -+CONFIG_GPIO_PCF857X=y -+CONFIG_SENSORS_LM75=y -+CONFIG_WATCHDOG=y -+CONFIG_BFIN_WDT=y -+CONFIG_SOUND=y -+CONFIG_SND=y -+CONFIG_SND_MIXER_OSS=y -+CONFIG_SND_PCM_OSS=y -+# CONFIG_SND_DRIVERS is not set -+# CONFIG_SND_USB is not set -+CONFIG_SND_SOC=y -+CONFIG_SND_BF5XX_I2S=y -+CONFIG_SND_BF5XX_SPORT_NUM=1 -+CONFIG_USB=y -+CONFIG_USB_ANNOUNCE_NEW_DEVICES=y -+# CONFIG_USB_DEVICE_CLASS is not set -+CONFIG_USB_MON=y -+CONFIG_USB_STORAGE=y -+CONFIG_USB_SERIAL=y -+CONFIG_USB_SERIAL_FTDI_SIO=y -+CONFIG_USB_SERIAL_PL2303=y -+CONFIG_RTC_CLASS=y -+CONFIG_RTC_DRV_DS1307=y -+CONFIG_EXT2_FS=y -+CONFIG_EXT2_FS_XATTR=y -+CONFIG_EXT2_FS_POSIX_ACL=y -+CONFIG_EXT2_FS_SECURITY=y -+# CONFIG_DNOTIFY is not set -+CONFIG_MSDOS_FS=y -+CONFIG_VFAT_FS=y -+CONFIG_FAT_DEFAULT_CODEPAGE=866 -+CONFIG_FAT_DEFAULT_IOCHARSET="cp1251" -+CONFIG_NTFS_FS=y -+CONFIG_CONFIGFS_FS=y -+CONFIG_JFFS2_FS=y -+CONFIG_JFFS2_COMPRESSION_OPTIONS=y -+# CONFIG_JFFS2_ZLIB is not set -+CONFIG_JFFS2_LZO=y -+# CONFIG_JFFS2_RTIME is not set -+CONFIG_JFFS2_CMODE_FAVOURLZO=y -+CONFIG_CRAMFS=y -+CONFIG_MINIX_FS=y -+CONFIG_NFS_FS=y -+CONFIG_NFS_V3=y -+CONFIG_ROOT_NFS=y -+CONFIG_NLS_DEFAULT="cp1251" -+CONFIG_NLS_CODEPAGE_866=y -+CONFIG_NLS_CODEPAGE_1251=y -+CONFIG_NLS_KOI8_R=y -+CONFIG_NLS_UTF8=y -+CONFIG_DEBUG_KERNEL=y -+CONFIG_DEBUG_SHIRQ=y -+CONFIG_DETECT_HUNG_TASK=y -+# CONFIG_DEBUG_BUGVERBOSE is not set -+CONFIG_DEBUG_INFO=y -+# CONFIG_RCU_CPU_STALL_DETECTOR is not set -+# CONFIG_FTRACE is not set 
-+CONFIG_DEBUG_MMRS=y -+# CONFIG_DEBUG_BFIN_NO_KERN_HWTRACE is not set -+CONFIG_CPLB_INFO=y -+# CONFIG_CRYPTO_ANSI_CPRNG is not set -diff --git a/arch/blackfin/configs/BF561-EZKIT-SMP_defconfig b/arch/blackfin/configs/BF561-EZKIT-SMP_defconfig -new file mode 100644 -index 000000000000..89b75a6c3fab ---- /dev/null -+++ b/arch/blackfin/configs/BF561-EZKIT-SMP_defconfig -@@ -0,0 +1,112 @@ -+CONFIG_EXPERIMENTAL=y -+CONFIG_SYSVIPC=y -+CONFIG_IKCONFIG=y -+CONFIG_IKCONFIG_PROC=y -+CONFIG_LOG_BUF_SHIFT=14 -+CONFIG_BLK_DEV_INITRD=y -+CONFIG_EXPERT=y -+# CONFIG_ELF_CORE is not set -+# CONFIG_FUTEX is not set -+# CONFIG_SIGNALFD is not set -+# CONFIG_TIMERFD is not set -+# CONFIG_EVENTFD is not set -+# CONFIG_AIO is not set -+CONFIG_SLAB=y -+CONFIG_MMAP_ALLOW_UNINITIALIZED=y -+CONFIG_MODULES=y -+CONFIG_MODULE_UNLOAD=y -+# CONFIG_LBDAF is not set -+# CONFIG_BLK_DEV_BSG is not set -+# CONFIG_IOSCHED_DEADLINE is not set -+# CONFIG_IOSCHED_CFQ is not set -+CONFIG_PREEMPT=y -+CONFIG_BF561=y -+CONFIG_SMP=y -+CONFIG_IRQ_TIMER0=10 -+CONFIG_CLKIN_HZ=30000000 -+CONFIG_HIGH_RES_TIMERS=y -+CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 -+CONFIG_BFIN_GPTIMERS=m -+CONFIG_C_CDPRIO=y -+CONFIG_BANK_3=0xAAC2 -+CONFIG_BINFMT_FLAT=y -+CONFIG_BINFMT_ZFLAT=y -+CONFIG_NET=y -+CONFIG_PACKET=y -+CONFIG_UNIX=y -+CONFIG_INET=y -+CONFIG_IP_PNP=y -+# CONFIG_INET_XFRM_MODE_TRANSPORT is not set -+# CONFIG_INET_XFRM_MODE_TUNNEL is not set -+# CONFIG_INET_XFRM_MODE_BEET is not set -+# CONFIG_INET_LRO is not set -+# CONFIG_INET_DIAG is not set -+# CONFIG_IPV6 is not set -+CONFIG_IRDA=m -+CONFIG_IRLAN=m -+CONFIG_IRCOMM=m -+CONFIG_IRDA_CACHE_LAST_LSAP=y -+CONFIG_IRTTY_SIR=m -+# CONFIG_WIRELESS is not set -+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" -+# CONFIG_FW_LOADER is not set -+CONFIG_MTD=y -+CONFIG_MTD_CMDLINE_PARTS=y -+CONFIG_MTD_BLOCK=y -+CONFIG_MTD_CFI=y -+CONFIG_MTD_CFI_AMDSTD=y -+CONFIG_MTD_RAM=y -+CONFIG_MTD_ROM=m -+CONFIG_MTD_PHYSMAP=y -+CONFIG_BLK_DEV_RAM=y -+CONFIG_NETDEVICES=y -+# CONFIG_NET_VENDOR_BROADCOM 
is not set -+# CONFIG_NET_VENDOR_CHELSIO is not set -+# CONFIG_NET_VENDOR_INTEL is not set -+# CONFIG_NET_VENDOR_MARVELL is not set -+# CONFIG_NET_VENDOR_MICREL is not set -+# CONFIG_NET_VENDOR_MICROCHIP is not set -+# CONFIG_NET_VENDOR_NATSEMI is not set -+# CONFIG_NET_VENDOR_SEEQ is not set -+CONFIG_SMC91X=y -+# CONFIG_NET_VENDOR_STMICRO is not set -+# CONFIG_WLAN is not set -+CONFIG_INPUT=m -+# CONFIG_INPUT_MOUSEDEV is not set -+CONFIG_INPUT_EVDEV=m -+# CONFIG_INPUT_KEYBOARD is not set -+# CONFIG_INPUT_MOUSE is not set -+# CONFIG_SERIO is not set -+# CONFIG_VT is not set -+# CONFIG_LEGACY_PTYS is not set -+CONFIG_BFIN_JTAG_COMM=m -+# CONFIG_DEVKMEM is not set -+CONFIG_SERIAL_BFIN=y -+CONFIG_SERIAL_BFIN_CONSOLE=y -+# CONFIG_HW_RANDOM is not set -+CONFIG_SPI=y -+CONFIG_SPI_BFIN5XX=y -+CONFIG_GPIOLIB=y -+CONFIG_GPIO_SYSFS=y -+# CONFIG_HWMON is not set -+CONFIG_WATCHDOG=y -+CONFIG_BFIN_WDT=y -+# CONFIG_USB_SUPPORT is not set -+# CONFIG_DNOTIFY is not set -+CONFIG_JFFS2_FS=m -+CONFIG_NFS_FS=m -+CONFIG_NFS_V3=y -+CONFIG_DEBUG_SHIRQ=y -+CONFIG_DETECT_HUNG_TASK=y -+CONFIG_DEBUG_INFO=y -+# CONFIG_FTRACE is not set -+CONFIG_DEBUG_MMRS=y -+CONFIG_DEBUG_HWERR=y -+CONFIG_EXACT_HWERR=y -+CONFIG_DEBUG_DOUBLEFAULT=y -+CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y -+CONFIG_EARLY_PRINTK=y -+CONFIG_CPLB_INFO=y -+CONFIG_BFIN_PSEUDODBG_INSNS=y -+CONFIG_CRYPTO=y -+# CONFIG_CRYPTO_ANSI_CPRNG is not set -diff --git a/arch/blackfin/configs/BF561-EZKIT_defconfig b/arch/blackfin/configs/BF561-EZKIT_defconfig -new file mode 100644 -index 000000000000..67b3d2f419ba ---- /dev/null -+++ b/arch/blackfin/configs/BF561-EZKIT_defconfig -@@ -0,0 +1,114 @@ -+CONFIG_EXPERIMENTAL=y -+CONFIG_SYSVIPC=y -+CONFIG_IKCONFIG=y -+CONFIG_IKCONFIG_PROC=y -+CONFIG_LOG_BUF_SHIFT=14 -+CONFIG_BLK_DEV_INITRD=y -+CONFIG_EXPERT=y -+# CONFIG_ELF_CORE is not set -+# CONFIG_FUTEX is not set -+# CONFIG_SIGNALFD is not set -+# CONFIG_TIMERFD is not set -+# CONFIG_EVENTFD is not set -+# CONFIG_AIO is not set -+CONFIG_SLAB=y 
-+CONFIG_MMAP_ALLOW_UNINITIALIZED=y -+CONFIG_MODULES=y -+CONFIG_MODULE_UNLOAD=y -+# CONFIG_LBDAF is not set -+# CONFIG_BLK_DEV_BSG is not set -+# CONFIG_IOSCHED_DEADLINE is not set -+# CONFIG_IOSCHED_CFQ is not set -+CONFIG_PREEMPT=y -+CONFIG_BF561=y -+CONFIG_IRQ_TIMER0=10 -+CONFIG_CLKIN_HZ=30000000 -+CONFIG_HIGH_RES_TIMERS=y -+CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 -+CONFIG_BFIN_GPTIMERS=m -+CONFIG_BFIN_EXTMEM_WRITETHROUGH=y -+CONFIG_BFIN_L2_DCACHEABLE=y -+CONFIG_BFIN_L2_WRITETHROUGH=y -+CONFIG_C_CDPRIO=y -+CONFIG_BANK_3=0xAAC2 -+CONFIG_BINFMT_FLAT=y -+CONFIG_BINFMT_ZFLAT=y -+CONFIG_NET=y -+CONFIG_PACKET=y -+CONFIG_UNIX=y -+CONFIG_INET=y -+CONFIG_IP_PNP=y -+# CONFIG_INET_XFRM_MODE_TRANSPORT is not set -+# CONFIG_INET_XFRM_MODE_TUNNEL is not set -+# CONFIG_INET_XFRM_MODE_BEET is not set -+# CONFIG_INET_LRO is not set -+# CONFIG_INET_DIAG is not set -+# CONFIG_IPV6 is not set -+CONFIG_IRDA=m -+CONFIG_IRLAN=m -+CONFIG_IRCOMM=m -+CONFIG_IRDA_CACHE_LAST_LSAP=y -+CONFIG_IRTTY_SIR=m -+# CONFIG_WIRELESS is not set -+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" -+# CONFIG_FW_LOADER is not set -+CONFIG_MTD=y -+CONFIG_MTD_CMDLINE_PARTS=y -+CONFIG_MTD_BLOCK=y -+CONFIG_MTD_CFI=y -+CONFIG_MTD_CFI_AMDSTD=y -+CONFIG_MTD_RAM=y -+CONFIG_MTD_ROM=m -+CONFIG_MTD_PHYSMAP=y -+CONFIG_BLK_DEV_RAM=y -+CONFIG_NETDEVICES=y -+# CONFIG_NET_VENDOR_BROADCOM is not set -+# CONFIG_NET_VENDOR_CHELSIO is not set -+# CONFIG_NET_VENDOR_INTEL is not set -+# CONFIG_NET_VENDOR_MARVELL is not set -+# CONFIG_NET_VENDOR_MICREL is not set -+# CONFIG_NET_VENDOR_MICROCHIP is not set -+# CONFIG_NET_VENDOR_NATSEMI is not set -+# CONFIG_NET_VENDOR_SEEQ is not set -+CONFIG_SMC91X=y -+# CONFIG_NET_VENDOR_STMICRO is not set -+# CONFIG_WLAN is not set -+CONFIG_INPUT=m -+# CONFIG_INPUT_MOUSEDEV is not set -+CONFIG_INPUT_EVDEV=m -+# CONFIG_INPUT_KEYBOARD is not set -+# CONFIG_INPUT_MOUSE is not set -+# CONFIG_SERIO is not set -+# CONFIG_VT is not set -+# CONFIG_LEGACY_PTYS is not set -+CONFIG_BFIN_JTAG_COMM=m -+# 
CONFIG_DEVKMEM is not set -+CONFIG_SERIAL_BFIN=y -+CONFIG_SERIAL_BFIN_CONSOLE=y -+# CONFIG_HW_RANDOM is not set -+CONFIG_SPI=y -+CONFIG_SPI_BFIN5XX=y -+CONFIG_GPIOLIB=y -+CONFIG_GPIO_SYSFS=y -+# CONFIG_HWMON is not set -+CONFIG_WATCHDOG=y -+CONFIG_BFIN_WDT=y -+# CONFIG_USB_SUPPORT is not set -+# CONFIG_DNOTIFY is not set -+CONFIG_JFFS2_FS=m -+CONFIG_NFS_FS=m -+CONFIG_NFS_V3=y -+CONFIG_DEBUG_SHIRQ=y -+CONFIG_DETECT_HUNG_TASK=y -+CONFIG_DEBUG_INFO=y -+# CONFIG_FTRACE is not set -+CONFIG_DEBUG_MMRS=y -+CONFIG_DEBUG_HWERR=y -+CONFIG_EXACT_HWERR=y -+CONFIG_DEBUG_DOUBLEFAULT=y -+CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y -+CONFIG_EARLY_PRINTK=y -+CONFIG_CPLB_INFO=y -+CONFIG_BFIN_PSEUDODBG_INSNS=y -+CONFIG_CRYPTO=y -+# CONFIG_CRYPTO_ANSI_CPRNG is not set -diff --git a/arch/blackfin/configs/BF609-EZKIT_defconfig b/arch/blackfin/configs/BF609-EZKIT_defconfig -new file mode 100644 -index 000000000000..8cc75d4218fb ---- /dev/null -+++ b/arch/blackfin/configs/BF609-EZKIT_defconfig -@@ -0,0 +1,154 @@ -+CONFIG_EXPERIMENTAL=y -+CONFIG_SYSVIPC=y -+CONFIG_HIGH_RES_TIMERS=y -+CONFIG_IKCONFIG=y -+CONFIG_IKCONFIG_PROC=y -+CONFIG_LOG_BUF_SHIFT=14 -+CONFIG_BLK_DEV_INITRD=y -+CONFIG_EXPERT=y -+# CONFIG_ELF_CORE is not set -+# CONFIG_FUTEX is not set -+# CONFIG_SIGNALFD is not set -+# CONFIG_TIMERFD is not set -+# CONFIG_EVENTFD is not set -+# CONFIG_AIO is not set -+CONFIG_SLAB=y -+CONFIG_MMAP_ALLOW_UNINITIALIZED=y -+CONFIG_MODULES=y -+CONFIG_MODULE_UNLOAD=y -+# CONFIG_LBDAF is not set -+# CONFIG_BLK_DEV_BSG is not set -+# CONFIG_IOSCHED_DEADLINE is not set -+# CONFIG_IOSCHED_CFQ is not set -+CONFIG_PREEMPT=y -+CONFIG_BF609=y -+CONFIG_PINT1_ASSIGN=0x01010000 -+CONFIG_PINT2_ASSIGN=0x07000101 -+CONFIG_PINT3_ASSIGN=0x02020303 -+CONFIG_IP_CHECKSUM_L1=y -+CONFIG_SYSCALL_TAB_L1=y -+CONFIG_CPLB_SWITCH_TAB_L1=y -+# CONFIG_APP_STACK_L1 is not set -+# CONFIG_BFIN_INS_LOWOVERHEAD is not set -+CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 -+CONFIG_BINFMT_FLAT=y -+CONFIG_BINFMT_ZFLAT=y 
-+CONFIG_PM_BFIN_WAKE_PE12=y -+CONFIG_PM_BFIN_WAKE_PE12_POL=1 -+CONFIG_CPU_FREQ=y -+CONFIG_CPU_FREQ_GOV_POWERSAVE=y -+CONFIG_CPU_FREQ_GOV_ONDEMAND=y -+CONFIG_NET=y -+CONFIG_PACKET=y -+CONFIG_UNIX=y -+CONFIG_INET=y -+CONFIG_IP_PNP=y -+CONFIG_IP_PNP_DHCP=y -+CONFIG_IP_PNP_BOOTP=y -+CONFIG_IP_PNP_RARP=y -+# CONFIG_IPV6 is not set -+CONFIG_NETFILTER=y -+CONFIG_CAN=y -+CONFIG_CAN_BFIN=y -+CONFIG_IRDA=y -+CONFIG_IRTTY_SIR=y -+# CONFIG_WIRELESS is not set -+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" -+CONFIG_FW_LOADER=m -+CONFIG_MTD=y -+CONFIG_MTD_CMDLINE_PARTS=y -+CONFIG_MTD_BLOCK=y -+CONFIG_MTD_CFI=y -+CONFIG_MTD_CFI_INTELEXT=y -+CONFIG_MTD_CFI_STAA=y -+CONFIG_MTD_COMPLEX_MAPPINGS=y -+CONFIG_MTD_PHYSMAP=y -+CONFIG_MTD_M25P80=y -+CONFIG_MTD_SPI_NOR=y -+CONFIG_MTD_UBI=m -+CONFIG_SCSI=y -+CONFIG_BLK_DEV_SD=y -+CONFIG_NETDEVICES=y -+# CONFIG_NET_VENDOR_BROADCOM is not set -+# CONFIG_NET_VENDOR_CHELSIO is not set -+# CONFIG_NET_VENDOR_INTEL is not set -+# CONFIG_NET_VENDOR_MARVELL is not set -+# CONFIG_NET_VENDOR_MICREL is not set -+# CONFIG_NET_VENDOR_MICROCHIP is not set -+# CONFIG_NET_VENDOR_NATSEMI is not set -+# CONFIG_NET_VENDOR_SEEQ is not set -+# CONFIG_NET_VENDOR_SMSC is not set -+CONFIG_STMMAC_ETH=y -+CONFIG_STMMAC_IEEE1588=y -+# CONFIG_WLAN is not set -+# CONFIG_INPUT_MOUSEDEV is not set -+CONFIG_INPUT_EVDEV=y -+# CONFIG_INPUT_KEYBOARD is not set -+# CONFIG_INPUT_MOUSE is not set -+CONFIG_INPUT_MISC=y -+CONFIG_INPUT_BFIN_ROTARY=y -+# CONFIG_SERIO is not set -+# CONFIG_LEGACY_PTYS is not set -+CONFIG_BFIN_SIMPLE_TIMER=m -+# CONFIG_BFIN_CRC is not set -+CONFIG_BFIN_LINKPORT=y -+# CONFIG_DEVKMEM is not set -+CONFIG_SERIAL_BFIN=y -+CONFIG_SERIAL_BFIN_CONSOLE=y -+CONFIG_SERIAL_BFIN_UART0=y -+# CONFIG_HW_RANDOM is not set -+CONFIG_I2C=y -+CONFIG_I2C_CHARDEV=y -+CONFIG_I2C_BLACKFIN_TWI=y -+CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 -+CONFIG_SPI=y -+CONFIG_SPI_ADI_V3=y -+CONFIG_GPIOLIB=y -+CONFIG_GPIO_SYSFS=y -+CONFIG_PINCTRL_MCP23S08=y -+# CONFIG_HWMON is not set 
-+CONFIG_WATCHDOG=y -+CONFIG_BFIN_WDT=y -+CONFIG_SOUND=m -+CONFIG_SND=m -+CONFIG_SND_MIXER_OSS=m -+CONFIG_SND_PCM_OSS=m -+# CONFIG_SND_DRIVERS is not set -+# CONFIG_SND_SPI is not set -+# CONFIG_SND_USB is not set -+CONFIG_SND_SOC=m -+CONFIG_USB=y -+CONFIG_USB_MUSB_HDRC=y -+CONFIG_USB_MUSB_BLACKFIN=m -+CONFIG_USB_STORAGE=y -+CONFIG_USB_GADGET=y -+CONFIG_USB_GADGET_MUSB_HDRC=y -+CONFIG_USB_ZERO=y -+CONFIG_MMC=y -+CONFIG_SDH_BFIN=y -+# CONFIG_IOMMU_SUPPORT is not set -+CONFIG_EXT2_FS=y -+# CONFIG_DNOTIFY is not set -+CONFIG_MSDOS_FS=y -+CONFIG_VFAT_FS=y -+CONFIG_JFFS2_FS=m -+CONFIG_UBIFS_FS=m -+CONFIG_NFS_FS=m -+CONFIG_NLS_CODEPAGE_437=y -+CONFIG_NLS_ISO8859_1=y -+CONFIG_DEBUG_FS=y -+CONFIG_DEBUG_SHIRQ=y -+CONFIG_DETECT_HUNG_TASK=y -+CONFIG_DEBUG_INFO=y -+CONFIG_FRAME_POINTER=y -+# CONFIG_FTRACE is not set -+CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y -+CONFIG_EARLY_PRINTK=y -+CONFIG_CPLB_INFO=y -+CONFIG_BFIN_PSEUDODBG_INSNS=y -+CONFIG_CRYPTO_HMAC=m -+CONFIG_CRYPTO_MD4=m -+CONFIG_CRYPTO_MD5=m -+CONFIG_CRYPTO_ARC4=m -+# CONFIG_CRYPTO_ANSI_CPRNG is not set -+CONFIG_CRYPTO_DEV_BFIN_CRC=m -diff --git a/arch/blackfin/configs/BlackStamp_defconfig b/arch/blackfin/configs/BlackStamp_defconfig -new file mode 100644 -index 000000000000..9faf0ec7007f ---- /dev/null -+++ b/arch/blackfin/configs/BlackStamp_defconfig -@@ -0,0 +1,108 @@ -+CONFIG_EXPERIMENTAL=y -+CONFIG_SYSVIPC=y -+CONFIG_IKCONFIG=y -+CONFIG_IKCONFIG_PROC=y -+CONFIG_LOG_BUF_SHIFT=14 -+CONFIG_SYSFS_DEPRECATED_V2=y -+CONFIG_BLK_DEV_INITRD=y -+# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set -+CONFIG_EXPERT=y -+# CONFIG_SYSCTL_SYSCALL is not set -+# CONFIG_ELF_CORE is not set -+# CONFIG_FUTEX is not set -+CONFIG_SLAB=y -+CONFIG_MMAP_ALLOW_UNINITIALIZED=y -+CONFIG_MODULES=y -+CONFIG_MODULE_UNLOAD=y -+CONFIG_MODULE_FORCE_UNLOAD=y -+# CONFIG_BLK_DEV_BSG is not set -+# CONFIG_IOSCHED_DEADLINE is not set -+CONFIG_PREEMPT=y -+CONFIG_BF532=y -+CONFIG_BF_REV_0_5=y -+CONFIG_BLACKSTAMP=y -+CONFIG_TIMER0=11 -+# CONFIG_CYCLES_CLOCKSOURCE 
is not set -+CONFIG_HIGH_RES_TIMERS=y -+CONFIG_ROMKERNEL=y -+CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 -+CONFIG_BFIN_GPTIMERS=y -+CONFIG_C_CDPRIO=y -+CONFIG_BANK_3=0xAAC2 -+CONFIG_BINFMT_FLAT=y -+CONFIG_BINFMT_ZFLAT=y -+CONFIG_BINFMT_SHARED_FLAT=y -+CONFIG_PM=y -+CONFIG_NET=y -+CONFIG_PACKET=y -+CONFIG_UNIX=y -+CONFIG_INET=y -+CONFIG_IP_PNP=y -+# CONFIG_INET_LRO is not set -+# CONFIG_IPV6 is not set -+# CONFIG_WIRELESS is not set -+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" -+# CONFIG_FW_LOADER is not set -+CONFIG_MTD=y -+CONFIG_MTD_CMDLINE_PARTS=y -+CONFIG_MTD_BLOCK=y -+CONFIG_MTD_CFI=m -+CONFIG_MTD_CFI_AMDSTD=m -+CONFIG_MTD_RAM=y -+CONFIG_MTD_ROM=m -+CONFIG_MTD_COMPLEX_MAPPINGS=y -+CONFIG_MTD_M25P80=y -+CONFIG_MTD_SPI_NOR=y -+CONFIG_BLK_DEV_LOOP=y -+CONFIG_BLK_DEV_NBD=y -+CONFIG_BLK_DEV_RAM=y -+CONFIG_MISC_DEVICES=y -+CONFIG_EEPROM_AT25=y -+CONFIG_NETDEVICES=y -+CONFIG_NET_ETHERNET=y -+CONFIG_SMC91X=y -+# CONFIG_NETDEV_1000 is not set -+# CONFIG_NETDEV_10000 is not set -+# CONFIG_WLAN is not set -+# CONFIG_INPUT_MOUSEDEV is not set -+CONFIG_INPUT_EVDEV=m -+# CONFIG_INPUT_KEYBOARD is not set -+# CONFIG_INPUT_MOUSE is not set -+# CONFIG_SERIO is not set -+# CONFIG_VT is not set -+CONFIG_SERIAL_BFIN=y -+CONFIG_SERIAL_BFIN_CONSOLE=y -+# CONFIG_LEGACY_PTYS is not set -+CONFIG_HW_RANDOM=y -+CONFIG_I2C=m -+CONFIG_I2C_CHARDEV=m -+CONFIG_I2C_GPIO=m -+CONFIG_SPI=y -+CONFIG_SPI_BFIN5XX=y -+CONFIG_SPI_SPIDEV=m -+# CONFIG_HWMON is not set -+CONFIG_WATCHDOG=y -+CONFIG_BFIN_WDT=y -+# CONFIG_USB_SUPPORT is not set -+CONFIG_MMC=y -+CONFIG_MMC_SPI=y -+CONFIG_RTC_CLASS=y -+CONFIG_RTC_DRV_BFIN=y -+# CONFIG_DNOTIFY is not set -+CONFIG_MSDOS_FS=y -+CONFIG_VFAT_FS=y -+CONFIG_JFFS2_FS=y -+CONFIG_NFS_FS=y -+CONFIG_NFS_V3=y -+CONFIG_NFS_V4=y -+CONFIG_SMB_FS=y -+CONFIG_CIFS=y -+CONFIG_NLS_CODEPAGE_437=y -+CONFIG_NLS_ASCII=y -+CONFIG_NLS_UTF8=y -+CONFIG_SYSCTL_SYSCALL_CHECK=y -+CONFIG_DEBUG_MMRS=y -+# CONFIG_DEBUG_BFIN_NO_KERN_HWTRACE is not set -+CONFIG_EARLY_PRINTK=y -+CONFIG_CPLB_INFO=y 
-+CONFIG_CRC_CCITT=m -diff --git a/arch/blackfin/configs/CM-BF527_defconfig b/arch/blackfin/configs/CM-BF527_defconfig -new file mode 100644 -index 000000000000..4a1ad4fd7bb2 ---- /dev/null -+++ b/arch/blackfin/configs/CM-BF527_defconfig -@@ -0,0 +1,129 @@ -+CONFIG_EXPERIMENTAL=y -+CONFIG_KERNEL_LZMA=y -+CONFIG_SYSVIPC=y -+CONFIG_IKCONFIG=y -+CONFIG_IKCONFIG_PROC=y -+CONFIG_LOG_BUF_SHIFT=14 -+CONFIG_BLK_DEV_INITRD=y -+# CONFIG_RD_GZIP is not set -+CONFIG_RD_LZMA=y -+# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set -+CONFIG_EXPERT=y -+# CONFIG_SYSCTL_SYSCALL is not set -+# CONFIG_ELF_CORE is not set -+# CONFIG_FUTEX is not set -+# CONFIG_AIO is not set -+CONFIG_SLAB=y -+CONFIG_MMAP_ALLOW_UNINITIALIZED=y -+CONFIG_MODULES=y -+CONFIG_MODULE_UNLOAD=y -+# CONFIG_BLK_DEV_BSG is not set -+# CONFIG_IOSCHED_DEADLINE is not set -+CONFIG_PREEMPT=y -+CONFIG_BF527=y -+CONFIG_BF_REV_0_1=y -+CONFIG_IRQ_TIMER0=12 -+CONFIG_BFIN527_BLUETECHNIX_CM=y -+CONFIG_IRQ_USB_INT0=11 -+CONFIG_IRQ_USB_INT1=11 -+CONFIG_IRQ_USB_INT2=11 -+CONFIG_IRQ_USB_DMA=11 -+# CONFIG_CYCLES_CLOCKSOURCE is not set -+# CONFIG_SCHEDULE_L1 is not set -+# CONFIG_MEMSET_L1 is not set -+# CONFIG_MEMCPY_L1 is not set -+# CONFIG_SYS_BFIN_SPINLOCK_L1 is not set -+CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 -+CONFIG_BFIN_GPTIMERS=y -+CONFIG_C_CDPRIO=y -+CONFIG_BANK_3=0xFFC0 -+CONFIG_BINFMT_FLAT=y -+CONFIG_BINFMT_ZFLAT=y -+CONFIG_NET=y -+CONFIG_PACKET=y -+CONFIG_UNIX=y -+CONFIG_INET=y -+CONFIG_IP_PNP=y -+# CONFIG_INET_XFRM_MODE_TRANSPORT is not set -+# CONFIG_INET_XFRM_MODE_TUNNEL is not set -+# CONFIG_INET_XFRM_MODE_BEET is not set -+# CONFIG_INET_LRO is not set -+# CONFIG_INET_DIAG is not set -+# CONFIG_IPV6 is not set -+# CONFIG_WIRELESS is not set -+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" -+# CONFIG_FW_LOADER is not set -+CONFIG_MTD=y -+CONFIG_MTD_CMDLINE_PARTS=y -+CONFIG_MTD_BLOCK=y -+CONFIG_MTD_CFI=y -+CONFIG_MTD_CFI_INTELEXT=y -+CONFIG_MTD_RAM=y -+CONFIG_MTD_ROM=m -+CONFIG_MTD_COMPLEX_MAPPINGS=y -+CONFIG_MTD_GPIO_ADDR=y 
-+CONFIG_BLK_DEV_RAM=y -+CONFIG_SCSI=y -+CONFIG_BLK_DEV_SD=y -+# CONFIG_SCSI_LOWLEVEL is not set -+CONFIG_NETDEVICES=y -+CONFIG_NET_ETHERNET=y -+CONFIG_BFIN_MAC=y -+# CONFIG_NETDEV_1000 is not set -+# CONFIG_NETDEV_10000 is not set -+# CONFIG_WLAN is not set -+# CONFIG_INPUT is not set -+# CONFIG_SERIO is not set -+# CONFIG_VT is not set -+# CONFIG_DEVKMEM is not set -+CONFIG_SERIAL_BFIN=y -+CONFIG_SERIAL_BFIN_CONSOLE=y -+CONFIG_SERIAL_BFIN_UART0=y -+CONFIG_SERIAL_BFIN_UART1=y -+# CONFIG_LEGACY_PTYS is not set -+# CONFIG_HW_RANDOM is not set -+CONFIG_I2C=y -+CONFIG_I2C_CHARDEV=m -+CONFIG_I2C_BLACKFIN_TWI=m -+CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 -+CONFIG_SPI=y -+CONFIG_SPI_BFIN5XX=y -+CONFIG_GPIOLIB=y -+CONFIG_GPIO_SYSFS=y -+CONFIG_WATCHDOG=y -+CONFIG_BFIN_WDT=y -+CONFIG_USB=m -+CONFIG_USB_ANNOUNCE_NEW_DEVICES=y -+# CONFIG_USB_DEVICE_CLASS is not set -+CONFIG_USB_OTG_BLACKLIST_HUB=y -+CONFIG_USB_MON=m -+CONFIG_USB_MUSB_HDRC=m -+CONFIG_USB_MUSB_PERIPHERAL=y -+CONFIG_USB_GADGET_MUSB_HDRC=y -+CONFIG_MUSB_PIO_ONLY=y -+CONFIG_USB_STORAGE=m -+CONFIG_USB_GADGET=m -+CONFIG_USB_ETH=m -+CONFIG_USB_MASS_STORAGE=m -+CONFIG_USB_G_SERIAL=m -+CONFIG_USB_G_PRINTER=m -+CONFIG_RTC_CLASS=y -+CONFIG_RTC_DRV_BFIN=y -+# CONFIG_DNOTIFY is not set -+CONFIG_MSDOS_FS=y -+CONFIG_VFAT_FS=y -+CONFIG_JFFS2_FS=y -+CONFIG_NFS_FS=m -+CONFIG_NFS_V3=y -+CONFIG_SMB_FS=m -+CONFIG_NLS_CODEPAGE_437=y -+CONFIG_NLS_ISO8859_1=y -+CONFIG_DEBUG_FS=y -+# CONFIG_RCU_CPU_STALL_DETECTOR is not set -+# CONFIG_DEBUG_BFIN_NO_KERN_HWTRACE is not set -+CONFIG_EARLY_PRINTK=y -+CONFIG_CRYPTO=y -+# CONFIG_CRYPTO_ANSI_CPRNG is not set -+CONFIG_CRC_CCITT=m -+CONFIG_CRC_ITU_T=y -+CONFIG_CRC7=y -diff --git a/arch/blackfin/configs/PNAV-10_defconfig b/arch/blackfin/configs/PNAV-10_defconfig -new file mode 100644 -index 000000000000..9d787e28bbe8 ---- /dev/null -+++ b/arch/blackfin/configs/PNAV-10_defconfig -@@ -0,0 +1,111 @@ -+CONFIG_EXPERIMENTAL=y -+CONFIG_SYSVIPC=y -+CONFIG_LOG_BUF_SHIFT=14 -+# CONFIG_CC_OPTIMIZE_FOR_SIZE is 
not set -+CONFIG_EXPERT=y -+# CONFIG_SYSCTL_SYSCALL is not set -+# CONFIG_ELF_CORE is not set -+# CONFIG_FUTEX is not set -+# CONFIG_AIO is not set -+CONFIG_SLAB=y -+CONFIG_MMAP_ALLOW_UNINITIALIZED=y -+CONFIG_MODULES=y -+CONFIG_MODULE_UNLOAD=y -+# CONFIG_LBDAF is not set -+# CONFIG_BLK_DEV_BSG is not set -+# CONFIG_IOSCHED_DEADLINE is not set -+# CONFIG_IOSCHED_CFQ is not set -+CONFIG_PREEMPT=y -+CONFIG_BF537=y -+CONFIG_IRQ_TIMER0=12 -+CONFIG_PNAV10=y -+# CONFIG_CYCLES_CLOCKSOURCE is not set -+CONFIG_IP_CHECKSUM_L1=y -+CONFIG_SYSCALL_TAB_L1=y -+CONFIG_CPLB_SWITCH_TAB_L1=y -+CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 -+CONFIG_BFIN_GPTIMERS=y -+CONFIG_C_CDPRIO=y -+CONFIG_BANK_1=0x33B0 -+CONFIG_BANK_2=0x33B0 -+CONFIG_BANK_3=0x99B2 -+CONFIG_BINFMT_FLAT=y -+CONFIG_BINFMT_ZFLAT=y -+CONFIG_NET=y -+CONFIG_PACKET=y -+CONFIG_UNIX=y -+CONFIG_INET=y -+CONFIG_IP_PNP=y -+# CONFIG_INET_XFRM_MODE_TRANSPORT is not set -+# CONFIG_INET_XFRM_MODE_TUNNEL is not set -+# CONFIG_INET_XFRM_MODE_BEET is not set -+# CONFIG_INET_LRO is not set -+# CONFIG_INET_DIAG is not set -+# CONFIG_IPV6 is not set -+# CONFIG_WIRELESS is not set -+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" -+# CONFIG_FW_LOADER is not set -+CONFIG_MTD=y -+CONFIG_MTD_BLOCK=y -+CONFIG_MTD_RAM=y -+CONFIG_MTD_COMPLEX_MAPPINGS=y -+CONFIG_MTD_UCLINUX=y -+CONFIG_MTD_NAND=y -+CONFIG_BLK_DEV_RAM=y -+CONFIG_NETDEVICES=y -+CONFIG_NET_ETHERNET=y -+CONFIG_BFIN_MAC=y -+# CONFIG_BFIN_MAC_USE_L1 is not set -+CONFIG_BFIN_TX_DESC_NUM=100 -+CONFIG_BFIN_RX_DESC_NUM=100 -+# CONFIG_NETDEV_1000 is not set -+# CONFIG_NETDEV_10000 is not set -+# CONFIG_WLAN is not set -+# CONFIG_INPUT_MOUSEDEV is not set -+CONFIG_INPUT_EVDEV=y -+# CONFIG_INPUT_KEYBOARD is not set -+# CONFIG_INPUT_MOUSE is not set -+CONFIG_INPUT_TOUCHSCREEN=y -+CONFIG_TOUCHSCREEN_AD7877=y -+CONFIG_INPUT_MISC=y -+CONFIG_INPUT_UINPUT=y -+# CONFIG_SERIO is not set -+# CONFIG_VT is not set -+CONFIG_SERIAL_BFIN=y -+CONFIG_SERIAL_BFIN_CONSOLE=y -+CONFIG_SERIAL_BFIN_UART0=y 
-+CONFIG_SERIAL_BFIN_UART1=y -+# CONFIG_LEGACY_PTYS is not set -+CONFIG_HW_RANDOM=y -+CONFIG_I2C=y -+CONFIG_I2C_CHARDEV=y -+CONFIG_I2C_BLACKFIN_TWI=y -+CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 -+CONFIG_SPI=y -+CONFIG_SPI_BFIN5XX=y -+CONFIG_FB=y -+CONFIG_FIRMWARE_EDID=y -+CONFIG_BACKLIGHT_LCD_SUPPORT=y -+CONFIG_LCD_CLASS_DEVICE=y -+CONFIG_BACKLIGHT_CLASS_DEVICE=y -+CONFIG_SOUND=y -+CONFIG_SND=m -+# CONFIG_SND_SUPPORT_OLD_API is not set -+# CONFIG_SND_VERBOSE_PROCFS is not set -+CONFIG_SOUND_PRIME=y -+# CONFIG_HID is not set -+CONFIG_RTC_CLASS=y -+CONFIG_RTC_DRV_BFIN=y -+CONFIG_EXT2_FS=y -+CONFIG_EXT2_FS_XATTR=y -+# CONFIG_DNOTIFY is not set -+CONFIG_NFS_FS=m -+CONFIG_NFS_V3=y -+CONFIG_SMB_FS=m -+# CONFIG_RCU_CPU_STALL_DETECTOR is not set -+# CONFIG_DEBUG_HUNT_FOR_ZERO is not set -+# CONFIG_DEBUG_BFIN_NO_KERN_HWTRACE is not set -+# CONFIG_ACCESS_CHECK is not set -+CONFIG_CRYPTO=y -+# CONFIG_CRYPTO_ANSI_CPRNG is not set -+CONFIG_CRC_CCITT=m -diff --git a/arch/blackfin/configs/SRV1_defconfig b/arch/blackfin/configs/SRV1_defconfig -new file mode 100644 -index 000000000000..225df32dc9a8 ---- /dev/null -+++ b/arch/blackfin/configs/SRV1_defconfig -@@ -0,0 +1,88 @@ -+CONFIG_EXPERIMENTAL=y -+CONFIG_SYSVIPC=y -+CONFIG_LOG_BUF_SHIFT=14 -+CONFIG_BLK_DEV_INITRD=y -+# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set -+CONFIG_EXPERT=y -+# CONFIG_SYSCTL_SYSCALL is not set -+CONFIG_KALLSYMS_ALL=y -+# CONFIG_ELF_CORE is not set -+# CONFIG_FUTEX is not set -+CONFIG_SLAB=y -+CONFIG_MMAP_ALLOW_UNINITIALIZED=y -+CONFIG_MODULES=y -+CONFIG_MODULE_UNLOAD=y -+# CONFIG_IOSCHED_DEADLINE is not set -+CONFIG_PREEMPT=y -+CONFIG_BF537=y -+CONFIG_IRQ_TIMER0=12 -+CONFIG_BOOT_LOAD=0x400000 -+CONFIG_CLKIN_HZ=22118400 -+CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 -+CONFIG_DMA_UNCACHED_2M=y -+CONFIG_C_CDPRIO=y -+CONFIG_BINFMT_FLAT=y -+CONFIG_BINFMT_ZFLAT=y -+CONFIG_PM=y -+CONFIG_NET=y -+CONFIG_PACKET=y -+CONFIG_UNIX=y -+CONFIG_INET=y -+CONFIG_IP_PNP=y -+# CONFIG_IPV6 is not set -+CONFIG_IRDA=m -+CONFIG_IRLAN=m 
-+CONFIG_IRCOMM=m -+CONFIG_IRDA_CACHE_LAST_LSAP=y -+CONFIG_IRTTY_SIR=m -+# CONFIG_WIRELESS is not set -+# CONFIG_FW_LOADER is not set -+CONFIG_MTD=y -+CONFIG_MTD_BLOCK=y -+CONFIG_MTD_JEDECPROBE=m -+CONFIG_MTD_RAM=y -+CONFIG_MTD_ROM=m -+CONFIG_MTD_COMPLEX_MAPPINGS=y -+CONFIG_MTD_UCLINUX=y -+CONFIG_MTD_NAND=m -+CONFIG_BLK_DEV_RAM=y -+CONFIG_MISC_DEVICES=y -+CONFIG_EEPROM_AT25=m -+CONFIG_NETDEVICES=y -+# CONFIG_NETDEV_1000 is not set -+# CONFIG_NETDEV_10000 is not set -+# CONFIG_WLAN is not set -+# CONFIG_INPUT_MOUSEDEV is not set -+CONFIG_INPUT_EVDEV=m -+# CONFIG_INPUT_KEYBOARD is not set -+# CONFIG_INPUT_MOUSE is not set -+CONFIG_INPUT_MISC=y -+CONFIG_INPUT_UINPUT=y -+# CONFIG_SERIO is not set -+# CONFIG_VT is not set -+CONFIG_SERIAL_BFIN=y -+CONFIG_SERIAL_BFIN_CONSOLE=y -+CONFIG_SERIAL_BFIN_UART0=y -+# CONFIG_LEGACY_PTYS is not set -+CONFIG_I2C=y -+CONFIG_I2C_CHARDEV=y -+CONFIG_I2C_BLACKFIN_TWI=y -+CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 -+CONFIG_SPI=y -+CONFIG_SPI_BFIN5XX=y -+CONFIG_HWMON=m -+CONFIG_WATCHDOG=y -+CONFIG_BFIN_WDT=y -+# CONFIG_HID is not set -+CONFIG_EXT2_FS=y -+CONFIG_EXT2_FS_XATTR=y -+# CONFIG_DNOTIFY is not set -+CONFIG_JFFS2_FS=m -+CONFIG_NFS_FS=m -+CONFIG_NFS_V3=y -+CONFIG_SMB_FS=m -+CONFIG_DEBUG_KERNEL=y -+# CONFIG_DEBUG_BUGVERBOSE is not set -+CONFIG_DEBUG_INFO=y -+# CONFIG_DEBUG_BFIN_NO_KERN_HWTRACE is not set -+CONFIG_CPLB_INFO=y -diff --git a/arch/blackfin/configs/TCM-BF518_defconfig b/arch/blackfin/configs/TCM-BF518_defconfig -new file mode 100644 -index 000000000000..425c24e43c34 ---- /dev/null -+++ b/arch/blackfin/configs/TCM-BF518_defconfig -@@ -0,0 +1,131 @@ -+CONFIG_EXPERIMENTAL=y -+CONFIG_KERNEL_LZMA=y -+CONFIG_SYSVIPC=y -+CONFIG_IKCONFIG=y -+CONFIG_IKCONFIG_PROC=y -+CONFIG_LOG_BUF_SHIFT=14 -+CONFIG_BLK_DEV_INITRD=y -+# CONFIG_RD_GZIP is not set -+CONFIG_RD_LZMA=y -+CONFIG_EXPERT=y -+# CONFIG_SYSCTL_SYSCALL is not set -+# CONFIG_ELF_CORE is not set -+# CONFIG_FUTEX is not set -+# CONFIG_SIGNALFD is not set -+# CONFIG_TIMERFD is not set 
-+# CONFIG_EVENTFD is not set -+# CONFIG_AIO is not set -+CONFIG_SLAB=y -+CONFIG_MMAP_ALLOW_UNINITIALIZED=y -+CONFIG_MODULES=y -+CONFIG_MODULE_UNLOAD=y -+# CONFIG_LBDAF is not set -+# CONFIG_BLK_DEV_BSG is not set -+# CONFIG_IOSCHED_DEADLINE is not set -+# CONFIG_IOSCHED_CFQ is not set -+CONFIG_PREEMPT=y -+CONFIG_BF518=y -+CONFIG_BF_REV_0_1=y -+CONFIG_BFIN518F_TCM=y -+CONFIG_IRQ_TIMER0=12 -+# CONFIG_CYCLES_CLOCKSOURCE is not set -+# CONFIG_SCHEDULE_L1 is not set -+# CONFIG_MEMSET_L1 is not set -+# CONFIG_MEMCPY_L1 is not set -+# CONFIG_SYS_BFIN_SPINLOCK_L1 is not set -+CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 -+CONFIG_BFIN_GPTIMERS=m -+CONFIG_C_CDPRIO=y -+CONFIG_BANK_3=0x99B2 -+CONFIG_BINFMT_FLAT=y -+CONFIG_BINFMT_ZFLAT=y -+CONFIG_NET=y -+CONFIG_PACKET=y -+CONFIG_UNIX=y -+CONFIG_INET=y -+CONFIG_IP_PNP=y -+# CONFIG_INET_XFRM_MODE_TRANSPORT is not set -+# CONFIG_INET_XFRM_MODE_TUNNEL is not set -+# CONFIG_INET_XFRM_MODE_BEET is not set -+# CONFIG_INET_LRO is not set -+# CONFIG_INET_DIAG is not set -+# CONFIG_IPV6 is not set -+# CONFIG_WIRELESS is not set -+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" -+# CONFIG_FW_LOADER is not set -+CONFIG_MTD=y -+CONFIG_MTD_CMDLINE_PARTS=y -+CONFIG_MTD_BLOCK=y -+CONFIG_MTD_CFI=y -+CONFIG_MTD_CFI_ADV_OPTIONS=y -+CONFIG_MTD_CFI_GEOMETRY=y -+# CONFIG_MTD_MAP_BANK_WIDTH_1 is not set -+# CONFIG_MTD_MAP_BANK_WIDTH_4 is not set -+# CONFIG_MTD_CFI_I2 is not set -+CONFIG_MTD_CFI_INTELEXT=y -+CONFIG_MTD_RAM=y -+CONFIG_MTD_ROM=m -+CONFIG_MTD_PHYSMAP=y -+CONFIG_BLK_DEV_RAM=y -+CONFIG_NETDEVICES=y -+CONFIG_NET_ETHERNET=y -+CONFIG_BFIN_MAC=y -+# CONFIG_NETDEV_1000 is not set -+# CONFIG_NETDEV_10000 is not set -+# CONFIG_WLAN is not set -+# CONFIG_INPUT_MOUSEDEV is not set -+# CONFIG_INPUT_KEYBOARD is not set -+# CONFIG_INPUT_MOUSE is not set -+CONFIG_INPUT_MISC=y -+# CONFIG_SERIO is not set -+# CONFIG_DEVKMEM is not set -+CONFIG_BFIN_JTAG_COMM=m -+CONFIG_SERIAL_BFIN=y -+CONFIG_SERIAL_BFIN_CONSOLE=y -+CONFIG_SERIAL_BFIN_UART0=y -+# CONFIG_LEGACY_PTYS is 
not set -+# CONFIG_HW_RANDOM is not set -+CONFIG_I2C=y -+CONFIG_I2C_CHARDEV=y -+CONFIG_I2C_BLACKFIN_TWI=y -+CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 -+CONFIG_SPI=y -+CONFIG_SPI_BFIN5XX=y -+CONFIG_GPIOLIB=y -+CONFIG_GPIO_SYSFS=y -+# CONFIG_HWMON is not set -+CONFIG_WATCHDOG=y -+CONFIG_BFIN_WDT=y -+# CONFIG_HID_SUPPORT is not set -+# CONFIG_USB_SUPPORT is not set -+CONFIG_MMC=y -+CONFIG_MMC_DEBUG=y -+CONFIG_MMC_SPI=y -+CONFIG_RTC_CLASS=y -+CONFIG_RTC_DRV_BFIN=y -+CONFIG_EXT2_FS=y -+# CONFIG_DNOTIFY is not set -+CONFIG_VFAT_FS=m -+# CONFIG_MISC_FILESYSTEMS is not set -+CONFIG_NFS_FS=y -+CONFIG_NFS_V3=y -+CONFIG_ROOT_NFS=y -+CONFIG_NLS_CODEPAGE_437=m -+CONFIG_NLS_ISO8859_1=m -+CONFIG_NLS_UTF8=m -+CONFIG_DEBUG_KERNEL=y -+CONFIG_DEBUG_SHIRQ=y -+CONFIG_DETECT_HUNG_TASK=y -+CONFIG_DEBUG_INFO=y -+# CONFIG_RCU_CPU_STALL_DETECTOR is not set -+# CONFIG_FTRACE is not set -+CONFIG_DEBUG_MMRS=y -+CONFIG_DEBUG_HWERR=y -+CONFIG_EXACT_HWERR=y -+CONFIG_DEBUG_DOUBLEFAULT=y -+CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y -+CONFIG_EARLY_PRINTK=y -+CONFIG_CPLB_INFO=y -+CONFIG_CRYPTO=y -+# CONFIG_CRYPTO_ANSI_CPRNG is not set -+CONFIG_CRC_CCITT=m -diff --git a/arch/mips/configs/fuloong2e_defconfig b/arch/mips/configs/fuloong2e_defconfig -index 7a7af706e898..be19bf122fde 100644 ---- a/arch/mips/configs/fuloong2e_defconfig -+++ b/arch/mips/configs/fuloong2e_defconfig -@@ -4,7 +4,7 @@ CONFIG_SYSVIPC=y - CONFIG_POSIX_MQUEUE=y - CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_BSD_PROCESS_ACCT=y - CONFIG_IKCONFIG=y - CONFIG_IKCONFIG_PROC=y -diff --git a/arch/mips/configs/gpr_defconfig b/arch/mips/configs/gpr_defconfig -index 9085f4d6c698..fb23111d45f6 100644 ---- a/arch/mips/configs/gpr_defconfig -+++ b/arch/mips/configs/gpr_defconfig -@@ -1,8 +1,8 @@ -+CONFIG_PREEMPT=y - # CONFIG_LOCALVERSION_AUTO is not set - CONFIG_SYSVIPC=y - CONFIG_POSIX_MQUEUE=y - CONFIG_HIGH_RES_TIMERS=y --CONFIG_PREEMPT_VOLUNTARY=y - CONFIG_BSD_PROCESS_ACCT=y - 
CONFIG_BSD_PROCESS_ACCT_V3=y - CONFIG_RELAY=y -diff --git a/arch/mips/configs/ip22_defconfig b/arch/mips/configs/ip22_defconfig -index 21a1168ae301..529a1b1007cf 100644 ---- a/arch/mips/configs/ip22_defconfig -+++ b/arch/mips/configs/ip22_defconfig -@@ -1,7 +1,7 @@ - CONFIG_SYSVIPC=y - CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_IKCONFIG=y - CONFIG_IKCONFIG_PROC=y - CONFIG_LOG_BUF_SHIFT=14 -diff --git a/arch/mips/configs/ip28_defconfig b/arch/mips/configs/ip28_defconfig -index 0921ef38e9fb..6da05cef46f8 100644 ---- a/arch/mips/configs/ip28_defconfig -+++ b/arch/mips/configs/ip28_defconfig -@@ -1,5 +1,5 @@ - CONFIG_SYSVIPC=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_IKCONFIG=y - CONFIG_IKCONFIG_PROC=y - CONFIG_LOG_BUF_SHIFT=14 -diff --git a/arch/mips/configs/jazz_defconfig b/arch/mips/configs/jazz_defconfig -index 328d4dfeb4cb..e17cb23173ea 100644 ---- a/arch/mips/configs/jazz_defconfig -+++ b/arch/mips/configs/jazz_defconfig -@@ -1,6 +1,6 @@ -+CONFIG_PREEMPT=y - CONFIG_SYSVIPC=y - CONFIG_POSIX_MQUEUE=y --CONFIG_PREEMPT_VOLUNTARY=y - CONFIG_BSD_PROCESS_ACCT=y - CONFIG_IKCONFIG=y - CONFIG_IKCONFIG_PROC=y -diff --git a/arch/mips/configs/mtx1_defconfig b/arch/mips/configs/mtx1_defconfig -index 914af125a7fa..76a64290373f 100644 ---- a/arch/mips/configs/mtx1_defconfig -+++ b/arch/mips/configs/mtx1_defconfig -@@ -1,8 +1,8 @@ -+CONFIG_PREEMPT=y - # CONFIG_LOCALVERSION_AUTO is not set - CONFIG_SYSVIPC=y - CONFIG_POSIX_MQUEUE=y - CONFIG_AUDIT=y --CONFIG_PREEMPT_VOLUNTARY=y - CONFIG_BSD_PROCESS_ACCT=y - CONFIG_BSD_PROCESS_ACCT_V3=y - CONFIG_RELAY=y -diff --git a/arch/mips/configs/nlm_xlr_defconfig b/arch/mips/configs/nlm_xlr_defconfig -index 4ecb157e56d4..ea7309283b01 100644 ---- a/arch/mips/configs/nlm_xlr_defconfig -+++ b/arch/mips/configs/nlm_xlr_defconfig -@@ -1,10 +1,10 @@ -+CONFIG_PREEMPT=y - # CONFIG_LOCALVERSION_AUTO is not set - CONFIG_SYSVIPC=y - CONFIG_POSIX_MQUEUE=y - CONFIG_AUDIT=y - 
CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y --CONFIG_PREEMPT_VOLUNTARY=y - CONFIG_BSD_PROCESS_ACCT=y - CONFIG_BSD_PROCESS_ACCT_V3=y - CONFIG_TASKSTATS=y -diff --git a/arch/mips/configs/pic32mzda_defconfig b/arch/mips/configs/pic32mzda_defconfig -index 63fe2da1b37f..7f08ee237345 100644 ---- a/arch/mips/configs/pic32mzda_defconfig -+++ b/arch/mips/configs/pic32mzda_defconfig -@@ -1,7 +1,7 @@ -+CONFIG_PREEMPT=y - CONFIG_SYSVIPC=y - CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y --CONFIG_PREEMPT_VOLUNTARY=y - CONFIG_IKCONFIG=y - CONFIG_IKCONFIG_PROC=y - CONFIG_LOG_BUF_SHIFT=14 -diff --git a/arch/mips/configs/pistachio_defconfig b/arch/mips/configs/pistachio_defconfig -index 24e07180c57d..38582e8f71c4 100644 ---- a/arch/mips/configs/pistachio_defconfig -+++ b/arch/mips/configs/pistachio_defconfig -@@ -1,9 +1,9 @@ -+CONFIG_PREEMPT=y - # CONFIG_LOCALVERSION_AUTO is not set - CONFIG_DEFAULT_HOSTNAME="localhost" - CONFIG_SYSVIPC=y - CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y --CONFIG_PREEMPT_VOLUNTARY=y - CONFIG_IKCONFIG=m - CONFIG_IKCONFIG_PROC=y - CONFIG_LOG_BUF_SHIFT=18 -diff --git a/arch/mips/configs/pnx8335_stb225_defconfig b/arch/mips/configs/pnx8335_stb225_defconfig -index 738ba3b1374b..6a3267e8aa0d 100644 ---- a/arch/mips/configs/pnx8335_stb225_defconfig -+++ b/arch/mips/configs/pnx8335_stb225_defconfig -@@ -1,9 +1,9 @@ -+CONFIG_PREEMPT=y - # CONFIG_LOCALVERSION_AUTO is not set - # CONFIG_SWAP is not set - CONFIG_SYSVIPC=y - CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y --CONFIG_PREEMPT_VOLUNTARY=y - CONFIG_LOG_BUF_SHIFT=14 - CONFIG_EXPERT=y - CONFIG_SLAB=y -diff --git a/arch/mips/configs/rm200_defconfig b/arch/mips/configs/rm200_defconfig -index 2c7adea7638f..1c82d62bee72 100644 ---- a/arch/mips/configs/rm200_defconfig -+++ b/arch/mips/configs/rm200_defconfig -@@ -1,6 +1,6 @@ -+CONFIG_PREEMPT=y - CONFIG_SYSVIPC=y - CONFIG_POSIX_MQUEUE=y --CONFIG_PREEMPT_VOLUNTARY=y - CONFIG_BSD_PROCESS_ACCT=y - CONFIG_IKCONFIG=y - CONFIG_IKCONFIG_PROC=y -diff --git 
a/arch/parisc/configs/712_defconfig b/arch/parisc/configs/712_defconfig -index d3e3d94e90c3..578524f80cc4 100644 ---- a/arch/parisc/configs/712_defconfig -+++ b/arch/parisc/configs/712_defconfig -@@ -13,7 +13,7 @@ CONFIG_MODULES=y - CONFIG_MODULE_UNLOAD=y - CONFIG_MODULE_FORCE_UNLOAD=y - CONFIG_PA7100LC=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_GSC_LASI=y - # CONFIG_PDC_CHASSIS is not set - CONFIG_BINFMT_MISC=m -diff --git a/arch/parisc/configs/c3000_defconfig b/arch/parisc/configs/c3000_defconfig -index 64d45a8b6ca0..d1bdfad94048 100644 ---- a/arch/parisc/configs/c3000_defconfig -+++ b/arch/parisc/configs/c3000_defconfig -@@ -13,7 +13,7 @@ CONFIG_MODULES=y - CONFIG_MODULE_UNLOAD=y - CONFIG_MODULE_FORCE_UNLOAD=y - CONFIG_PA8X00=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - # CONFIG_GSC is not set - CONFIG_PCI=y - CONFIG_PCI_LBA=y -diff --git a/arch/parisc/configs/defconfig b/arch/parisc/configs/defconfig -index 5b877ca34ebf..0d976614934c 100644 ---- a/arch/parisc/configs/defconfig -+++ b/arch/parisc/configs/defconfig -@@ -14,7 +14,7 @@ CONFIG_MODULE_UNLOAD=y - CONFIG_MODULE_FORCE_UNLOAD=y - # CONFIG_BLK_DEV_BSG is not set - CONFIG_PA7100LC=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_IOMMU_CCIO=y - CONFIG_GSC_LASI=y - CONFIG_GSC_WAX=y -diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig -index 3e56c9c2f16e..ecee9c2a0062 100644 ---- a/arch/powerpc/Kconfig -+++ b/arch/powerpc/Kconfig -@@ -853,6 +853,8 @@ config SCHED_SMT - when dealing with POWER5 cpus at a cost of slightly increased - overhead in some places. If unsure say N here. 
- -+source "kernel/Kconfig.MuQSS" -+ - config PPC_DENORMALISATION - bool "PowerPC denormalisation exception handling" - depends on PPC_BOOK3S_64 -diff --git a/arch/powerpc/configs/c2k_defconfig b/arch/powerpc/configs/c2k_defconfig -new file mode 100644 -index 000000000000..04fee07ea6c5 ---- /dev/null -+++ b/arch/powerpc/configs/c2k_defconfig -@@ -0,0 +1,389 @@ -+CONFIG_SYSVIPC=y -+CONFIG_POSIX_MQUEUE=y -+CONFIG_AUDIT=y -+CONFIG_BSD_PROCESS_ACCT=y -+CONFIG_BLK_DEV_INITRD=y -+CONFIG_PROFILING=y -+CONFIG_OPROFILE=m -+CONFIG_KPROBES=y -+CONFIG_MODULES=y -+CONFIG_MODULE_UNLOAD=y -+CONFIG_MODVERSIONS=y -+CONFIG_PARTITION_ADVANCED=y -+CONFIG_OSF_PARTITION=y -+CONFIG_MAC_PARTITION=y -+CONFIG_BSD_DISKLABEL=y -+CONFIG_MINIX_SUBPARTITION=y -+CONFIG_SOLARIS_X86_PARTITION=y -+CONFIG_UNIXWARE_DISKLABEL=y -+CONFIG_SGI_PARTITION=y -+CONFIG_SUN_PARTITION=y -+# CONFIG_PPC_CHRP is not set -+# CONFIG_PPC_PMAC is not set -+CONFIG_EMBEDDED6xx=y -+CONFIG_PPC_C2K=y -+CONFIG_CPU_FREQ=y -+CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE=y -+CONFIG_CPU_FREQ_GOV_PERFORMANCE=y -+CONFIG_CPU_FREQ_GOV_POWERSAVE=m -+CONFIG_CPU_FREQ_GOV_ONDEMAND=m -+CONFIG_GEN_RTC=y -+CONFIG_HIGHMEM=y -+CONFIG_PREEMPT=y -+CONFIG_BINFMT_MISC=y -+CONFIG_PM=y -+CONFIG_PCI_MSI=y -+CONFIG_HOTPLUG_PCI=y -+CONFIG_HOTPLUG_PCI_SHPC=m -+CONFIG_NET=y -+CONFIG_PACKET=y -+CONFIG_UNIX=y -+CONFIG_XFRM_USER=y -+CONFIG_NET_KEY=m -+CONFIG_INET=y -+CONFIG_IP_MULTICAST=y -+CONFIG_IP_ADVANCED_ROUTER=y -+CONFIG_IP_MULTIPLE_TABLES=y -+CONFIG_IP_ROUTE_MULTIPATH=y -+CONFIG_IP_ROUTE_VERBOSE=y -+CONFIG_IP_PNP=y -+CONFIG_IP_PNP_DHCP=y -+CONFIG_NET_IPIP=m -+CONFIG_IP_MROUTE=y -+CONFIG_IP_PIMSM_V1=y -+CONFIG_IP_PIMSM_V2=y -+CONFIG_SYN_COOKIES=y -+CONFIG_INET_AH=m -+CONFIG_INET_ESP=m -+CONFIG_INET_IPCOMP=m -+CONFIG_INET6_AH=m -+CONFIG_INET6_ESP=m -+CONFIG_INET6_IPCOMP=m -+CONFIG_IPV6_TUNNEL=m -+CONFIG_NETFILTER=y -+# CONFIG_NETFILTER_XT_MATCH_SCTP is not set -+CONFIG_IP_NF_IPTABLES=m -+CONFIG_IP_NF_MATCH_ECN=m -+CONFIG_IP_NF_MATCH_TTL=m 
-+CONFIG_IP_NF_FILTER=m -+CONFIG_IP_NF_TARGET_REJECT=m -+CONFIG_IP_NF_MANGLE=m -+CONFIG_IP_NF_TARGET_ECN=m -+CONFIG_IP_NF_RAW=m -+CONFIG_IP_NF_ARPTABLES=m -+CONFIG_IP_NF_ARPFILTER=m -+CONFIG_IP_NF_ARP_MANGLE=m -+CONFIG_IP6_NF_IPTABLES=m -+CONFIG_IP6_NF_MATCH_EUI64=m -+CONFIG_IP6_NF_MATCH_FRAG=m -+CONFIG_IP6_NF_MATCH_OPTS=m -+CONFIG_IP6_NF_MATCH_HL=m -+CONFIG_IP6_NF_MATCH_IPV6HEADER=m -+CONFIG_IP6_NF_MATCH_RT=m -+CONFIG_IP6_NF_FILTER=m -+CONFIG_IP6_NF_MANGLE=m -+CONFIG_IP6_NF_RAW=m -+CONFIG_BRIDGE_NF_EBTABLES=m -+CONFIG_BRIDGE_EBT_BROUTE=m -+CONFIG_BRIDGE_EBT_T_FILTER=m -+CONFIG_BRIDGE_EBT_T_NAT=m -+CONFIG_BRIDGE_EBT_802_3=m -+CONFIG_BRIDGE_EBT_AMONG=m -+CONFIG_BRIDGE_EBT_ARP=m -+CONFIG_BRIDGE_EBT_IP=m -+CONFIG_BRIDGE_EBT_LIMIT=m -+CONFIG_BRIDGE_EBT_MARK=m -+CONFIG_BRIDGE_EBT_PKTTYPE=m -+CONFIG_BRIDGE_EBT_STP=m -+CONFIG_BRIDGE_EBT_VLAN=m -+CONFIG_BRIDGE_EBT_ARPREPLY=m -+CONFIG_BRIDGE_EBT_DNAT=m -+CONFIG_BRIDGE_EBT_MARK_T=m -+CONFIG_BRIDGE_EBT_REDIRECT=m -+CONFIG_BRIDGE_EBT_SNAT=m -+CONFIG_BRIDGE_EBT_LOG=m -+CONFIG_IP_SCTP=m -+CONFIG_ATM=m -+CONFIG_ATM_CLIP=m -+CONFIG_ATM_LANE=m -+CONFIG_ATM_BR2684=m -+CONFIG_BRIDGE=m -+CONFIG_VLAN_8021Q=m -+CONFIG_NET_SCHED=y -+CONFIG_NET_SCH_CBQ=m -+CONFIG_NET_SCH_HTB=m -+CONFIG_NET_SCH_HFSC=m -+CONFIG_NET_SCH_ATM=m -+CONFIG_NET_SCH_PRIO=m -+CONFIG_NET_SCH_RED=m -+CONFIG_NET_SCH_SFQ=m -+CONFIG_NET_SCH_TEQL=m -+CONFIG_NET_SCH_TBF=m -+CONFIG_NET_SCH_GRED=m -+CONFIG_NET_SCH_DSMARK=m -+CONFIG_NET_SCH_NETEM=m -+CONFIG_NET_CLS_TCINDEX=m -+CONFIG_NET_CLS_ROUTE4=m -+CONFIG_NET_CLS_FW=m -+CONFIG_NET_CLS_U32=m -+CONFIG_CLS_U32_PERF=y -+CONFIG_NET_CLS_RSVP=m -+CONFIG_NET_CLS_RSVP6=m -+CONFIG_NET_CLS_IND=y -+CONFIG_BT=m -+CONFIG_BT_RFCOMM=m -+CONFIG_BT_RFCOMM_TTY=y -+CONFIG_BT_BNEP=m -+CONFIG_BT_BNEP_MC_FILTER=y -+CONFIG_BT_BNEP_PROTO_FILTER=y -+CONFIG_BT_HIDP=m -+CONFIG_BT_HCIUART=m -+CONFIG_BT_HCIUART_H4=y -+CONFIG_BT_HCIUART_BCSP=y -+CONFIG_BT_HCIBCM203X=m -+CONFIG_BT_HCIBFUSB=m -+CONFIG_BT_HCIVHCI=m 
-+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" -+CONFIG_MTD=y -+CONFIG_MTD_BLOCK=y -+CONFIG_MTD_CFI=y -+CONFIG_MTD_CFI_AMDSTD=y -+CONFIG_MTD_COMPLEX_MAPPINGS=y -+CONFIG_MTD_PHYSMAP_OF=y -+CONFIG_BLK_DEV_LOOP=m -+CONFIG_BLK_DEV_CRYPTOLOOP=m -+CONFIG_BLK_DEV_NBD=m -+CONFIG_BLK_DEV_RAM=y -+CONFIG_BLK_DEV_RAM_SIZE=16384 -+CONFIG_SCSI=m -+CONFIG_BLK_DEV_SD=m -+CONFIG_CHR_DEV_ST=m -+CONFIG_CHR_DEV_OSST=m -+CONFIG_BLK_DEV_SR=m -+CONFIG_BLK_DEV_SR_VENDOR=y -+CONFIG_CHR_DEV_SG=m -+CONFIG_SCSI_CONSTANTS=y -+CONFIG_SCSI_LOGGING=y -+CONFIG_SCSI_ISCSI_ATTRS=m -+CONFIG_BLK_DEV_3W_XXXX_RAID=m -+CONFIG_SCSI_3W_9XXX=m -+CONFIG_SCSI_ACARD=m -+CONFIG_SCSI_AACRAID=m -+CONFIG_SCSI_AIC7XXX=m -+CONFIG_AIC7XXX_CMDS_PER_DEVICE=4 -+CONFIG_AIC7XXX_RESET_DELAY_MS=15000 -+# CONFIG_AIC7XXX_DEBUG_ENABLE is not set -+# CONFIG_AIC7XXX_REG_PRETTY_PRINT is not set -+CONFIG_SCSI_AIC79XX=m -+CONFIG_AIC79XX_CMDS_PER_DEVICE=4 -+CONFIG_AIC79XX_RESET_DELAY_MS=15000 -+# CONFIG_AIC79XX_DEBUG_ENABLE is not set -+# CONFIG_AIC79XX_REG_PRETTY_PRINT is not set -+CONFIG_SCSI_ARCMSR=m -+CONFIG_MEGARAID_NEWGEN=y -+CONFIG_MEGARAID_MM=m -+CONFIG_MEGARAID_MAILBOX=m -+CONFIG_MEGARAID_SAS=m -+CONFIG_SCSI_GDTH=m -+CONFIG_SCSI_IPS=m -+CONFIG_SCSI_INITIO=m -+CONFIG_SCSI_SYM53C8XX_2=m -+CONFIG_SCSI_QLOGIC_1280=m -+CONFIG_NETDEVICES=y -+CONFIG_BONDING=m -+CONFIG_DUMMY=m -+CONFIG_NETCONSOLE=m -+CONFIG_TUN=m -+# CONFIG_ATM_DRIVERS is not set -+CONFIG_MV643XX_ETH=y -+CONFIG_VITESSE_PHY=y -+CONFIG_INPUT_EVDEV=y -+# CONFIG_INPUT_KEYBOARD is not set -+# CONFIG_INPUT_MOUSE is not set -+CONFIG_INPUT_MISC=y -+CONFIG_INPUT_UINPUT=m -+# CONFIG_SERIO is not set -+# CONFIG_LEGACY_PTYS is not set -+CONFIG_SERIAL_NONSTANDARD=y -+CONFIG_SERIAL_MPSC=y -+CONFIG_SERIAL_MPSC_CONSOLE=y -+CONFIG_NVRAM=m -+CONFIG_RAW_DRIVER=y -+CONFIG_MAX_RAW_DEVS=8192 -+CONFIG_I2C=m -+CONFIG_I2C_CHARDEV=m -+CONFIG_I2C_MV64XXX=m -+CONFIG_HWMON=m -+CONFIG_SENSORS_ADM1021=m -+CONFIG_SENSORS_ADM1025=m -+CONFIG_SENSORS_ADM1026=m -+CONFIG_SENSORS_ADM1031=m 
-+CONFIG_SENSORS_DS1621=m -+CONFIG_SENSORS_GL518SM=m -+CONFIG_SENSORS_MAX1619=m -+CONFIG_SENSORS_LM75=m -+CONFIG_SENSORS_LM77=m -+CONFIG_SENSORS_LM78=m -+CONFIG_SENSORS_LM80=m -+CONFIG_SENSORS_LM83=m -+CONFIG_SENSORS_LM85=m -+CONFIG_SENSORS_LM87=m -+CONFIG_SENSORS_LM90=m -+CONFIG_SENSORS_PCF8591=m -+CONFIG_SENSORS_VIA686A=m -+CONFIG_SENSORS_W83781D=m -+CONFIG_SENSORS_W83L785TS=m -+CONFIG_WATCHDOG=y -+CONFIG_SOFT_WATCHDOG=m -+CONFIG_PCIPCWATCHDOG=m -+CONFIG_WDTPCI=m -+CONFIG_USBPCWATCHDOG=m -+# CONFIG_VGA_CONSOLE is not set -+CONFIG_USB=m -+CONFIG_USB_MON=m -+CONFIG_USB_EHCI_HCD=m -+CONFIG_USB_EHCI_ROOT_HUB_TT=y -+CONFIG_USB_OHCI_HCD=m -+CONFIG_USB_OHCI_HCD_PPC_OF_BE=y -+CONFIG_USB_UHCI_HCD=m -+CONFIG_USB_ACM=m -+CONFIG_USB_PRINTER=m -+CONFIG_USB_STORAGE=m -+CONFIG_USB_STORAGE_DATAFAB=m -+CONFIG_USB_STORAGE_FREECOM=m -+CONFIG_USB_STORAGE_ISD200=m -+CONFIG_USB_STORAGE_SDDR09=m -+CONFIG_USB_STORAGE_SDDR55=m -+CONFIG_USB_STORAGE_JUMPSHOT=m -+CONFIG_USB_MDC800=m -+CONFIG_USB_MICROTEK=m -+CONFIG_USB_SERIAL=m -+CONFIG_USB_SERIAL_GENERIC=y -+CONFIG_USB_SERIAL_BELKIN=m -+CONFIG_USB_SERIAL_WHITEHEAT=m -+CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m -+CONFIG_USB_SERIAL_EMPEG=m -+CONFIG_USB_SERIAL_FTDI_SIO=m -+CONFIG_USB_SERIAL_VISOR=m -+CONFIG_USB_SERIAL_IPAQ=m -+CONFIG_USB_SERIAL_IR=m -+CONFIG_USB_SERIAL_EDGEPORT=m -+CONFIG_USB_SERIAL_EDGEPORT_TI=m -+CONFIG_USB_SERIAL_KEYSPAN_PDA=m -+CONFIG_USB_SERIAL_KEYSPAN=m -+CONFIG_USB_SERIAL_KLSI=m -+CONFIG_USB_SERIAL_KOBIL_SCT=m -+CONFIG_USB_SERIAL_MCT_U232=m -+CONFIG_USB_SERIAL_PL2303=m -+CONFIG_USB_SERIAL_SAFE=m -+CONFIG_USB_SERIAL_SAFE_PADDED=y -+CONFIG_USB_SERIAL_CYBERJACK=m -+CONFIG_USB_SERIAL_XIRCOM=m -+CONFIG_USB_SERIAL_OMNINET=m -+CONFIG_USB_EMI62=m -+CONFIG_USB_RIO500=m -+CONFIG_USB_LEGOTOWER=m -+CONFIG_USB_LCD=m -+CONFIG_USB_LED=m -+CONFIG_USB_TEST=m -+CONFIG_USB_ATM=m -+CONFIG_USB_SPEEDTOUCH=m -+CONFIG_INFINIBAND=m -+CONFIG_INFINIBAND_USER_MAD=m -+CONFIG_INFINIBAND_USER_ACCESS=m -+CONFIG_INFINIBAND_MTHCA=m -+CONFIG_INFINIBAND_IPOIB=m 
-+CONFIG_INFINIBAND_IPOIB_CM=y -+CONFIG_INFINIBAND_SRP=m -+CONFIG_DMADEVICES=y -+CONFIG_EXT4_FS=m -+CONFIG_EXT4_FS_POSIX_ACL=y -+CONFIG_EXT4_FS_SECURITY=y -+CONFIG_QUOTA=y -+CONFIG_QFMT_V2=y -+CONFIG_AUTOFS4_FS=m -+CONFIG_UDF_FS=m -+CONFIG_MSDOS_FS=m -+CONFIG_VFAT_FS=m -+CONFIG_FAT_DEFAULT_IOCHARSET="ascii" -+CONFIG_PROC_KCORE=y -+CONFIG_TMPFS=y -+CONFIG_HFS_FS=m -+CONFIG_HFSPLUS_FS=m -+CONFIG_JFFS2_FS=y -+CONFIG_CRAMFS=m -+CONFIG_VXFS_FS=m -+CONFIG_NFS_FS=y -+CONFIG_NFS_V3_ACL=y -+CONFIG_NFS_V4=y -+CONFIG_ROOT_NFS=y -+CONFIG_CIFS=m -+CONFIG_CIFS_XATTR=y -+CONFIG_CIFS_POSIX=y -+CONFIG_NLS=y -+CONFIG_NLS_DEFAULT="utf8" -+CONFIG_NLS_CODEPAGE_437=y -+CONFIG_NLS_CODEPAGE_737=m -+CONFIG_NLS_CODEPAGE_775=m -+CONFIG_NLS_CODEPAGE_850=m -+CONFIG_NLS_CODEPAGE_852=m -+CONFIG_NLS_CODEPAGE_855=m -+CONFIG_NLS_CODEPAGE_857=m -+CONFIG_NLS_CODEPAGE_860=m -+CONFIG_NLS_CODEPAGE_861=m -+CONFIG_NLS_CODEPAGE_862=m -+CONFIG_NLS_CODEPAGE_863=m -+CONFIG_NLS_CODEPAGE_864=m -+CONFIG_NLS_CODEPAGE_865=m -+CONFIG_NLS_CODEPAGE_866=m -+CONFIG_NLS_CODEPAGE_869=m -+CONFIG_NLS_CODEPAGE_936=m -+CONFIG_NLS_CODEPAGE_950=m -+CONFIG_NLS_CODEPAGE_932=m -+CONFIG_NLS_CODEPAGE_949=m -+CONFIG_NLS_CODEPAGE_874=m -+CONFIG_NLS_ISO8859_8=m -+CONFIG_NLS_CODEPAGE_1250=m -+CONFIG_NLS_CODEPAGE_1251=m -+CONFIG_NLS_ASCII=y -+CONFIG_NLS_ISO8859_1=m -+CONFIG_NLS_ISO8859_2=m -+CONFIG_NLS_ISO8859_3=m -+CONFIG_NLS_ISO8859_4=m -+CONFIG_NLS_ISO8859_5=m -+CONFIG_NLS_ISO8859_6=m -+CONFIG_NLS_ISO8859_7=m -+CONFIG_NLS_ISO8859_9=m -+CONFIG_NLS_ISO8859_13=m -+CONFIG_NLS_ISO8859_14=m -+CONFIG_NLS_ISO8859_15=m -+CONFIG_NLS_KOI8_R=m -+CONFIG_NLS_KOI8_U=m -+CONFIG_CRC_CCITT=m -+CONFIG_CRC_T10DIF=m -+CONFIG_DEBUG_INFO=y -+CONFIG_MAGIC_SYSRQ=y -+CONFIG_DEBUG_KERNEL=y -+CONFIG_DEBUG_STACK_USAGE=y -+CONFIG_DEBUG_HIGHMEM=y -+CONFIG_DEBUG_STACKOVERFLOW=y -+CONFIG_DETECT_HUNG_TASK=y -+CONFIG_DEBUG_SPINLOCK=y -+CONFIG_BOOTX_TEXT=y -+CONFIG_PPC_EARLY_DEBUG=y -+CONFIG_SECURITY=y -+CONFIG_SECURITY_NETWORK=y -+CONFIG_SECURITY_SELINUX=y 
-+CONFIG_SECURITY_SELINUX_BOOTPARAM=y -+CONFIG_SECURITY_SELINUX_DISABLE=y -+CONFIG_CRYPTO_HMAC=y -+CONFIG_CRYPTO_MICHAEL_MIC=m -+CONFIG_CRYPTO_SHA1=y -+CONFIG_CRYPTO_SHA512=m -+CONFIG_CRYPTO_WP512=m -+CONFIG_CRYPTO_BLOWFISH=m -+CONFIG_CRYPTO_CAST6=m -+CONFIG_CRYPTO_KHAZAD=m -+CONFIG_CRYPTO_SERPENT=m -+CONFIG_CRYPTO_TEA=m -+CONFIG_CRYPTO_TWOFISH=m -diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig -index 9dca4cffa623..09d38c3e59a5 100644 ---- a/arch/powerpc/configs/ppc6xx_defconfig -+++ b/arch/powerpc/configs/ppc6xx_defconfig -@@ -74,7 +74,7 @@ CONFIG_QE_GPIO=y - CONFIG_MCU_MPC8349EMITX=y - CONFIG_HIGHMEM=y - CONFIG_HZ_1000=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_BINFMT_MISC=y - CONFIG_HIBERNATION=y - CONFIG_PM_DEBUG=y -diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c -index f18d5067cd0f..fe489fc01c73 100644 ---- a/arch/powerpc/platforms/cell/spufs/sched.c -+++ b/arch/powerpc/platforms/cell/spufs/sched.c -@@ -51,11 +51,6 @@ static struct task_struct *spusched_task; - static struct timer_list spusched_timer; - static struct timer_list spuloadavg_timer; - --/* -- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0). -- */ --#define NORMAL_PRIO 120 -- - /* - * Frequency of the spu scheduler tick. By default we do one SPU scheduler - * tick for every 10 CPU scheduler ticks. 
-diff --git a/arch/score/configs/spct6600_defconfig b/arch/score/configs/spct6600_defconfig -new file mode 100644 -index 000000000000..46434ca1fa10 ---- /dev/null -+++ b/arch/score/configs/spct6600_defconfig -@@ -0,0 +1,84 @@ -+CONFIG_HZ_100=y -+CONFIG_PREEMPT=y -+CONFIG_EXPERIMENTAL=y -+# CONFIG_LOCALVERSION_AUTO is not set -+CONFIG_SYSVIPC=y -+CONFIG_POSIX_MQUEUE=y -+CONFIG_BSD_PROCESS_ACCT=y -+CONFIG_LOG_BUF_SHIFT=12 -+CONFIG_SYSFS_DEPRECATED_V2=y -+CONFIG_BLK_DEV_INITRD=y -+# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set -+CONFIG_EXPERT=y -+# CONFIG_KALLSYMS is not set -+# CONFIG_HOTPLUG is not set -+CONFIG_SLAB=y -+CONFIG_MODULES=y -+CONFIG_MODULE_FORCE_LOAD=y -+CONFIG_MODULE_UNLOAD=y -+CONFIG_MODULE_FORCE_UNLOAD=y -+# CONFIG_BLK_DEV_BSG is not set -+CONFIG_BINFMT_MISC=y -+CONFIG_NET=y -+CONFIG_UNIX=y -+CONFIG_NET_KEY=y -+CONFIG_INET=y -+CONFIG_IP_MULTICAST=y -+CONFIG_ARPD=y -+# CONFIG_INET_LRO is not set -+# CONFIG_IPV6 is not set -+# CONFIG_STANDALONE is not set -+# CONFIG_PREVENT_FIRMWARE_BUILD is not set -+CONFIG_BLK_DEV_LOOP=y -+CONFIG_BLK_DEV_CRYPTOLOOP=y -+CONFIG_BLK_DEV_RAM=y -+CONFIG_BLK_DEV_RAM_COUNT=1 -+# CONFIG_MISC_DEVICES is not set -+CONFIG_NETDEVICES=y -+# CONFIG_NETDEV_1000 is not set -+# CONFIG_NETDEV_10000 is not set -+# CONFIG_INPUT_MOUSEDEV is not set -+# CONFIG_INPUT_KEYBOARD is not set -+# CONFIG_INPUT_MOUSE is not set -+# CONFIG_SERIO is not set -+CONFIG_SERIAL_NONSTANDARD=y -+CONFIG_STALDRV=y -+# CONFIG_HW_RANDOM is not set -+CONFIG_RAW_DRIVER=y -+CONFIG_MAX_RAW_DEVS=8192 -+# CONFIG_HWMON is not set -+# CONFIG_VGA_CONSOLE is not set -+# CONFIG_HID_SUPPORT is not set -+# CONFIG_USB_SUPPORT is not set -+CONFIG_EXT2_FS=y -+CONFIG_EXT2_FS_XATTR=y -+CONFIG_EXT2_FS_POSIX_ACL=y -+CONFIG_EXT3_FS=y -+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set -+CONFIG_EXT3_FS_POSIX_ACL=y -+CONFIG_AUTOFS_FS=y -+CONFIG_AUTOFS4_FS=y -+CONFIG_PROC_KCORE=y -+# CONFIG_PROC_PAGE_MONITOR is not set -+CONFIG_TMPFS=y -+CONFIG_TMPFS_POSIX_ACL=y -+CONFIG_NFS_FS=y 
-+CONFIG_NFS_V3=y -+CONFIG_NFS_V3_ACL=y -+CONFIG_NFS_V4=y -+CONFIG_NFSD=y -+CONFIG_NFSD_V3_ACL=y -+CONFIG_NFSD_V4=y -+# CONFIG_RCU_CPU_STALL_DETECTOR is not set -+CONFIG_SECURITY=y -+CONFIG_SECURITY_NETWORK=y -+CONFIG_CRYPTO_NULL=y -+CONFIG_CRYPTO_CRYPTD=y -+CONFIG_CRYPTO_SEQIV=y -+CONFIG_CRYPTO_MD4=y -+CONFIG_CRYPTO_MICHAEL_MIC=y -+# CONFIG_CRYPTO_ANSI_CPRNG is not set -+# CONFIG_CRYPTO_HW is not set -+CONFIG_CRC_CCITT=y -+CONFIG_CRC16=y -+CONFIG_LIBCRC32C=y -diff --git a/arch/sh/configs/se7712_defconfig b/arch/sh/configs/se7712_defconfig -index 9a527f978106..5895f2cc726e 100644 ---- a/arch/sh/configs/se7712_defconfig -+++ b/arch/sh/configs/se7712_defconfig -@@ -23,7 +23,7 @@ CONFIG_FLATMEM_MANUAL=y - CONFIG_SH_SOLUTION_ENGINE=y - CONFIG_SH_PCLK_FREQ=66666666 - CONFIG_HEARTBEAT=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_CMDLINE_OVERWRITE=y - CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda1" - CONFIG_NET=y -diff --git a/arch/sh/configs/se7721_defconfig b/arch/sh/configs/se7721_defconfig -index 3b0e1eb6e874..e296a2cd9903 100644 ---- a/arch/sh/configs/se7721_defconfig -+++ b/arch/sh/configs/se7721_defconfig -@@ -23,7 +23,7 @@ CONFIG_FLATMEM_MANUAL=y - CONFIG_SH_7721_SOLUTION_ENGINE=y - CONFIG_SH_PCLK_FREQ=33333333 - CONFIG_HEARTBEAT=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_CMDLINE_OVERWRITE=y - CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda2" - CONFIG_NET=y -diff --git a/arch/sh/configs/titan_defconfig b/arch/sh/configs/titan_defconfig -index 4ec961ace688..a03a1ad670a0 100644 ---- a/arch/sh/configs/titan_defconfig -+++ b/arch/sh/configs/titan_defconfig -@@ -20,7 +20,7 @@ CONFIG_SH_TITAN=y - CONFIG_SH_PCLK_FREQ=30000000 - CONFIG_SH_DMA=y - CONFIG_SH_DMA_API=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_CMDLINE_OVERWRITE=y - CONFIG_CMDLINE="console=ttySC1,38400N81 root=/dev/nfs ip=:::::eth1:autoconf rw" - CONFIG_PCI=y -diff --git a/arch/sparc/configs/sparc64_defconfig b/arch/sparc/configs/sparc64_defconfig -index 
6c325d53a20a..98d4ef3d76cf 100644 ---- a/arch/sparc/configs/sparc64_defconfig -+++ b/arch/sparc/configs/sparc64_defconfig -@@ -22,7 +22,7 @@ CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y - CONFIG_NUMA=y - CONFIG_DEFAULT_MMAP_MIN_ADDR=8192 --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_SUN_LDOMS=y - CONFIG_PCI=y - CONFIG_PCI_MSI=y -diff --git a/arch/tile/configs/tilegx_defconfig b/arch/tile/configs/tilegx_defconfig -new file mode 100644 -index 000000000000..939c63ba7e6e ---- /dev/null -+++ b/arch/tile/configs/tilegx_defconfig -@@ -0,0 +1,411 @@ -+CONFIG_TILEGX=y -+CONFIG_SYSVIPC=y -+CONFIG_POSIX_MQUEUE=y -+CONFIG_FHANDLE=y -+CONFIG_AUDIT=y -+CONFIG_NO_HZ=y -+CONFIG_BSD_PROCESS_ACCT=y -+CONFIG_BSD_PROCESS_ACCT_V3=y -+CONFIG_TASKSTATS=y -+CONFIG_TASK_DELAY_ACCT=y -+CONFIG_TASK_XACCT=y -+CONFIG_TASK_IO_ACCOUNTING=y -+CONFIG_LOG_BUF_SHIFT=19 -+CONFIG_CGROUPS=y -+CONFIG_CGROUP_DEBUG=y -+CONFIG_CGROUP_DEVICE=y -+CONFIG_CPUSETS=y -+CONFIG_CGROUP_CPUACCT=y -+CONFIG_CGROUP_SCHED=y -+CONFIG_RT_GROUP_SCHED=y -+CONFIG_BLK_CGROUP=y -+CONFIG_NAMESPACES=y -+CONFIG_RELAY=y -+CONFIG_BLK_DEV_INITRD=y -+CONFIG_RD_XZ=y -+CONFIG_SYSCTL_SYSCALL=y -+CONFIG_EMBEDDED=y -+# CONFIG_COMPAT_BRK is not set -+CONFIG_PROFILING=y -+CONFIG_KPROBES=y -+CONFIG_MODULES=y -+CONFIG_MODULE_FORCE_LOAD=y -+CONFIG_MODULE_UNLOAD=y -+CONFIG_BLK_DEV_INTEGRITY=y -+CONFIG_PARTITION_ADVANCED=y -+CONFIG_OSF_PARTITION=y -+CONFIG_AMIGA_PARTITION=y -+CONFIG_MAC_PARTITION=y -+CONFIG_BSD_DISKLABEL=y -+CONFIG_MINIX_SUBPARTITION=y -+CONFIG_SOLARIS_X86_PARTITION=y -+CONFIG_UNIXWARE_DISKLABEL=y -+CONFIG_SGI_PARTITION=y -+CONFIG_SUN_PARTITION=y -+CONFIG_KARMA_PARTITION=y -+CONFIG_CFQ_GROUP_IOSCHED=y -+CONFIG_NR_CPUS=100 -+CONFIG_HZ_100=y -+# CONFIG_COMPACTION is not set -+CONFIG_PREEMPT=y -+CONFIG_TILE_PCI_IO=y -+CONFIG_PCI_DEBUG=y -+# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set -+CONFIG_BINFMT_MISC=y -+CONFIG_NET=y -+CONFIG_PACKET=y -+CONFIG_UNIX=y -+CONFIG_XFRM_USER=y -+CONFIG_XFRM_SUB_POLICY=y 
-+CONFIG_XFRM_STATISTICS=y -+CONFIG_NET_KEY=m -+CONFIG_NET_KEY_MIGRATE=y -+CONFIG_INET=y -+CONFIG_IP_MULTICAST=y -+CONFIG_IP_ADVANCED_ROUTER=y -+CONFIG_IP_MULTIPLE_TABLES=y -+CONFIG_IP_ROUTE_MULTIPATH=y -+CONFIG_IP_ROUTE_VERBOSE=y -+CONFIG_NET_IPIP=m -+CONFIG_IP_MROUTE=y -+CONFIG_IP_PIMSM_V1=y -+CONFIG_IP_PIMSM_V2=y -+CONFIG_SYN_COOKIES=y -+CONFIG_INET_AH=m -+CONFIG_INET_ESP=m -+CONFIG_INET_IPCOMP=m -+CONFIG_INET_XFRM_MODE_TRANSPORT=m -+CONFIG_INET_XFRM_MODE_TUNNEL=m -+CONFIG_INET_XFRM_MODE_BEET=m -+CONFIG_INET_DIAG=m -+CONFIG_TCP_CONG_ADVANCED=y -+CONFIG_TCP_CONG_HSTCP=m -+CONFIG_TCP_CONG_HYBLA=m -+CONFIG_TCP_CONG_SCALABLE=m -+CONFIG_TCP_CONG_LP=m -+CONFIG_TCP_CONG_VENO=m -+CONFIG_TCP_CONG_YEAH=m -+CONFIG_TCP_CONG_ILLINOIS=m -+CONFIG_TCP_MD5SIG=y -+CONFIG_IPV6=y -+CONFIG_IPV6_ROUTER_PREF=y -+CONFIG_IPV6_ROUTE_INFO=y -+CONFIG_IPV6_OPTIMISTIC_DAD=y -+CONFIG_INET6_AH=m -+CONFIG_INET6_ESP=m -+CONFIG_INET6_IPCOMP=m -+CONFIG_IPV6_MIP6=m -+CONFIG_INET6_XFRM_MODE_TRANSPORT=m -+CONFIG_INET6_XFRM_MODE_TUNNEL=m -+CONFIG_INET6_XFRM_MODE_BEET=m -+CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION=m -+CONFIG_IPV6_SIT=m -+CONFIG_IPV6_TUNNEL=m -+CONFIG_IPV6_MULTIPLE_TABLES=y -+CONFIG_IPV6_MROUTE=y -+CONFIG_IPV6_PIMSM_V2=y -+CONFIG_NETLABEL=y -+CONFIG_RDS=m -+CONFIG_RDS_TCP=m -+CONFIG_BRIDGE=m -+CONFIG_VLAN_8021Q=m -+CONFIG_VLAN_8021Q_GVRP=y -+CONFIG_PHONET=m -+CONFIG_NET_SCHED=y -+CONFIG_NET_SCH_CBQ=m -+CONFIG_NET_SCH_HTB=m -+CONFIG_NET_SCH_HFSC=m -+CONFIG_NET_SCH_PRIO=m -+CONFIG_NET_SCH_MULTIQ=m -+CONFIG_NET_SCH_RED=m -+CONFIG_NET_SCH_SFQ=m -+CONFIG_NET_SCH_TEQL=m -+CONFIG_NET_SCH_TBF=m -+CONFIG_NET_SCH_GRED=m -+CONFIG_NET_SCH_DSMARK=m -+CONFIG_NET_SCH_NETEM=m -+CONFIG_NET_SCH_DRR=m -+CONFIG_NET_SCH_INGRESS=m -+CONFIG_NET_CLS_BASIC=m -+CONFIG_NET_CLS_TCINDEX=m -+CONFIG_NET_CLS_ROUTE4=m -+CONFIG_NET_CLS_FW=m -+CONFIG_NET_CLS_U32=m -+CONFIG_CLS_U32_PERF=y -+CONFIG_CLS_U32_MARK=y -+CONFIG_NET_CLS_RSVP=m -+CONFIG_NET_CLS_RSVP6=m -+CONFIG_NET_CLS_FLOW=m -+CONFIG_NET_CLS_CGROUP=y 
-+CONFIG_NET_EMATCH=y -+CONFIG_NET_EMATCH_CMP=m -+CONFIG_NET_EMATCH_NBYTE=m -+CONFIG_NET_EMATCH_U32=m -+CONFIG_NET_EMATCH_META=m -+CONFIG_NET_EMATCH_TEXT=m -+CONFIG_NET_CLS_ACT=y -+CONFIG_NET_ACT_POLICE=m -+CONFIG_NET_ACT_GACT=m -+CONFIG_GACT_PROB=y -+CONFIG_NET_ACT_MIRRED=m -+CONFIG_NET_ACT_NAT=m -+CONFIG_NET_ACT_PEDIT=m -+CONFIG_NET_ACT_SIMP=m -+CONFIG_NET_ACT_SKBEDIT=m -+CONFIG_NET_CLS_IND=y -+CONFIG_DCB=y -+CONFIG_DNS_RESOLVER=y -+# CONFIG_WIRELESS is not set -+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" -+CONFIG_DEVTMPFS=y -+CONFIG_DEVTMPFS_MOUNT=y -+CONFIG_CONNECTOR=y -+CONFIG_BLK_DEV_LOOP=y -+CONFIG_BLK_DEV_CRYPTOLOOP=m -+CONFIG_BLK_DEV_SX8=m -+CONFIG_BLK_DEV_RAM=y -+CONFIG_BLK_DEV_RAM_SIZE=16384 -+CONFIG_ATA_OVER_ETH=m -+CONFIG_RAID_ATTRS=m -+CONFIG_BLK_DEV_SD=y -+CONFIG_SCSI_CONSTANTS=y -+CONFIG_SCSI_LOGGING=y -+CONFIG_SCSI_SAS_ATA=y -+CONFIG_ISCSI_TCP=m -+CONFIG_SCSI_MVSAS=y -+# CONFIG_SCSI_MVSAS_DEBUG is not set -+CONFIG_SCSI_MVSAS_TASKLET=y -+CONFIG_ATA=y -+CONFIG_SATA_AHCI=y -+CONFIG_SATA_SIL24=y -+# CONFIG_ATA_SFF is not set -+CONFIG_MD=y -+CONFIG_BLK_DEV_MD=y -+CONFIG_MD_LINEAR=m -+CONFIG_MD_RAID0=m -+CONFIG_MD_RAID1=m -+CONFIG_MD_RAID10=m -+CONFIG_MD_RAID456=m -+CONFIG_MD_FAULTY=m -+CONFIG_BLK_DEV_DM=m -+CONFIG_DM_DEBUG=y -+CONFIG_DM_CRYPT=m -+CONFIG_DM_SNAPSHOT=m -+CONFIG_DM_MIRROR=m -+CONFIG_DM_LOG_USERSPACE=m -+CONFIG_DM_ZERO=m -+CONFIG_DM_MULTIPATH=m -+CONFIG_DM_MULTIPATH_QL=m -+CONFIG_DM_MULTIPATH_ST=m -+CONFIG_DM_DELAY=m -+CONFIG_DM_UEVENT=y -+CONFIG_TARGET_CORE=m -+CONFIG_TCM_IBLOCK=m -+CONFIG_TCM_FILEIO=m -+CONFIG_TCM_PSCSI=m -+CONFIG_LOOPBACK_TARGET=m -+CONFIG_ISCSI_TARGET=m -+CONFIG_FUSION=y -+CONFIG_FUSION_SAS=y -+CONFIG_NETDEVICES=y -+CONFIG_BONDING=m -+CONFIG_DUMMY=m -+CONFIG_IFB=m -+CONFIG_MACVLAN=m -+CONFIG_MACVTAP=m -+CONFIG_NETCONSOLE=m -+CONFIG_NETCONSOLE_DYNAMIC=y -+CONFIG_TUN=y -+CONFIG_VETH=m -+CONFIG_NET_DSA_MV88E6060=y -+CONFIG_NET_DSA_MV88E6XXX=y -+CONFIG_SKY2=y -+CONFIG_PTP_1588_CLOCK_TILEGX=y -+# CONFIG_WLAN is not set -+# 
CONFIG_INPUT_MOUSEDEV is not set -+# CONFIG_INPUT_KEYBOARD is not set -+# CONFIG_INPUT_MOUSE is not set -+# CONFIG_SERIO is not set -+# CONFIG_VT is not set -+# CONFIG_LEGACY_PTYS is not set -+CONFIG_SERIAL_TILEGX=y -+CONFIG_HW_RANDOM=y -+CONFIG_HW_RANDOM_TIMERIOMEM=m -+CONFIG_I2C=y -+CONFIG_I2C_CHARDEV=y -+# CONFIG_HWMON is not set -+CONFIG_WATCHDOG=y -+CONFIG_WATCHDOG_NOWAYOUT=y -+# CONFIG_VGA_ARB is not set -+CONFIG_DRM=m -+CONFIG_DRM_TDFX=m -+CONFIG_DRM_R128=m -+CONFIG_DRM_MGA=m -+CONFIG_DRM_VIA=m -+CONFIG_DRM_SAVAGE=m -+CONFIG_USB=y -+CONFIG_USB_EHCI_HCD=y -+CONFIG_USB_OHCI_HCD=y -+CONFIG_USB_STORAGE=y -+CONFIG_EDAC=y -+CONFIG_RTC_CLASS=y -+CONFIG_RTC_DRV_TILE=y -+CONFIG_EXT2_FS=y -+CONFIG_EXT2_FS_XATTR=y -+CONFIG_EXT2_FS_POSIX_ACL=y -+CONFIG_EXT2_FS_SECURITY=y -+CONFIG_EXT2_FS_XIP=y -+CONFIG_EXT3_FS=y -+CONFIG_EXT3_FS_POSIX_ACL=y -+CONFIG_EXT3_FS_SECURITY=y -+CONFIG_EXT4_FS=y -+CONFIG_EXT4_FS_POSIX_ACL=y -+CONFIG_EXT4_FS_SECURITY=y -+CONFIG_XFS_FS=y -+CONFIG_XFS_QUOTA=y -+CONFIG_XFS_POSIX_ACL=y -+CONFIG_GFS2_FS=m -+CONFIG_GFS2_FS_LOCKING_DLM=y -+CONFIG_BTRFS_FS=m -+CONFIG_BTRFS_FS_POSIX_ACL=y -+CONFIG_QUOTA=y -+CONFIG_QUOTA_NETLINK_INTERFACE=y -+# CONFIG_PRINT_QUOTA_WARNING is not set -+CONFIG_QFMT_V2=y -+CONFIG_AUTOFS4_FS=m -+CONFIG_FUSE_FS=y -+CONFIG_CUSE=m -+CONFIG_FSCACHE=m -+CONFIG_FSCACHE_STATS=y -+CONFIG_CACHEFILES=m -+CONFIG_ISO9660_FS=m -+CONFIG_JOLIET=y -+CONFIG_ZISOFS=y -+CONFIG_UDF_FS=m -+CONFIG_MSDOS_FS=m -+CONFIG_VFAT_FS=m -+CONFIG_FAT_DEFAULT_IOCHARSET="ascii" -+CONFIG_PROC_KCORE=y -+CONFIG_TMPFS=y -+CONFIG_TMPFS_POSIX_ACL=y -+CONFIG_HUGETLBFS=y -+CONFIG_ECRYPT_FS=m -+CONFIG_CRAMFS=m -+CONFIG_SQUASHFS=m -+CONFIG_NFS_FS=m -+CONFIG_NFS_V3_ACL=y -+CONFIG_NFS_V4=m -+CONFIG_NFS_V4_1=y -+CONFIG_NFS_FSCACHE=y -+CONFIG_NFSD=m -+CONFIG_NFSD_V3_ACL=y -+CONFIG_NFSD_V4=y -+CONFIG_CIFS=m -+CONFIG_CIFS_STATS=y -+CONFIG_CIFS_WEAK_PW_HASH=y -+CONFIG_CIFS_UPCALL=y -+CONFIG_CIFS_XATTR=y -+CONFIG_CIFS_POSIX=y -+CONFIG_CIFS_DFS_UPCALL=y -+CONFIG_CIFS_FSCACHE=y 
-+CONFIG_NLS_DEFAULT="utf8" -+CONFIG_NLS_CODEPAGE_437=y -+CONFIG_NLS_CODEPAGE_737=m -+CONFIG_NLS_CODEPAGE_775=m -+CONFIG_NLS_CODEPAGE_850=m -+CONFIG_NLS_CODEPAGE_852=m -+CONFIG_NLS_CODEPAGE_855=m -+CONFIG_NLS_CODEPAGE_857=m -+CONFIG_NLS_CODEPAGE_860=m -+CONFIG_NLS_CODEPAGE_861=m -+CONFIG_NLS_CODEPAGE_862=m -+CONFIG_NLS_CODEPAGE_863=m -+CONFIG_NLS_CODEPAGE_864=m -+CONFIG_NLS_CODEPAGE_865=m -+CONFIG_NLS_CODEPAGE_866=m -+CONFIG_NLS_CODEPAGE_869=m -+CONFIG_NLS_CODEPAGE_936=m -+CONFIG_NLS_CODEPAGE_950=m -+CONFIG_NLS_CODEPAGE_932=m -+CONFIG_NLS_CODEPAGE_949=m -+CONFIG_NLS_CODEPAGE_874=m -+CONFIG_NLS_ISO8859_8=m -+CONFIG_NLS_CODEPAGE_1250=m -+CONFIG_NLS_CODEPAGE_1251=m -+CONFIG_NLS_ASCII=y -+CONFIG_NLS_ISO8859_1=m -+CONFIG_NLS_ISO8859_2=m -+CONFIG_NLS_ISO8859_3=m -+CONFIG_NLS_ISO8859_4=m -+CONFIG_NLS_ISO8859_5=m -+CONFIG_NLS_ISO8859_6=m -+CONFIG_NLS_ISO8859_7=m -+CONFIG_NLS_ISO8859_9=m -+CONFIG_NLS_ISO8859_13=m -+CONFIG_NLS_ISO8859_14=m -+CONFIG_NLS_ISO8859_15=m -+CONFIG_NLS_KOI8_R=m -+CONFIG_NLS_KOI8_U=m -+CONFIG_NLS_UTF8=m -+CONFIG_DLM=m -+CONFIG_DLM_DEBUG=y -+CONFIG_DYNAMIC_DEBUG=y -+CONFIG_DEBUG_INFO=y -+CONFIG_DEBUG_INFO_REDUCED=y -+# CONFIG_ENABLE_WARN_DEPRECATED is not set -+CONFIG_STRIP_ASM_SYMS=y -+CONFIG_DEBUG_FS=y -+CONFIG_HEADERS_CHECK=y -+# CONFIG_FRAME_POINTER is not set -+CONFIG_DEBUG_FORCE_WEAK_PER_CPU=y -+CONFIG_DEBUG_VM=y -+CONFIG_DEBUG_MEMORY_INIT=y -+CONFIG_DEBUG_STACKOVERFLOW=y -+CONFIG_LOCKUP_DETECTOR=y -+CONFIG_SCHEDSTATS=y -+CONFIG_TIMER_STATS=y -+CONFIG_DEBUG_LIST=y -+CONFIG_DEBUG_CREDENTIALS=y -+CONFIG_RCU_CPU_STALL_TIMEOUT=60 -+CONFIG_ASYNC_RAID6_TEST=m -+CONFIG_KGDB=y -+CONFIG_SECURITY=y -+CONFIG_SECURITYFS=y -+CONFIG_SECURITY_NETWORK=y -+CONFIG_SECURITY_NETWORK_XFRM=y -+CONFIG_SECURITY_SELINUX=y -+CONFIG_SECURITY_SELINUX_BOOTPARAM=y -+CONFIG_SECURITY_SELINUX_DISABLE=y -+CONFIG_CRYPTO_PCRYPT=m -+CONFIG_CRYPTO_CRYPTD=m -+CONFIG_CRYPTO_TEST=m -+CONFIG_CRYPTO_CCM=m -+CONFIG_CRYPTO_GCM=m -+CONFIG_CRYPTO_CTS=m -+CONFIG_CRYPTO_LRW=m 
-+CONFIG_CRYPTO_PCBC=m -+CONFIG_CRYPTO_XTS=m -+CONFIG_CRYPTO_HMAC=y -+CONFIG_CRYPTO_XCBC=m -+CONFIG_CRYPTO_VMAC=m -+CONFIG_CRYPTO_MICHAEL_MIC=m -+CONFIG_CRYPTO_RMD128=m -+CONFIG_CRYPTO_RMD160=m -+CONFIG_CRYPTO_RMD256=m -+CONFIG_CRYPTO_RMD320=m -+CONFIG_CRYPTO_SHA1=y -+CONFIG_CRYPTO_SHA512=m -+CONFIG_CRYPTO_TGR192=m -+CONFIG_CRYPTO_WP512=m -+CONFIG_CRYPTO_ANUBIS=m -+CONFIG_CRYPTO_BLOWFISH=m -+CONFIG_CRYPTO_CAMELLIA=m -+CONFIG_CRYPTO_CAST5=m -+CONFIG_CRYPTO_CAST6=m -+CONFIG_CRYPTO_FCRYPT=m -+CONFIG_CRYPTO_KHAZAD=m -+CONFIG_CRYPTO_SEED=m -+CONFIG_CRYPTO_SERPENT=m -+CONFIG_CRYPTO_TEA=m -+CONFIG_CRYPTO_TWOFISH=m -+CONFIG_CRYPTO_LZO=m -diff --git a/arch/tile/configs/tilepro_defconfig b/arch/tile/configs/tilepro_defconfig -new file mode 100644 -index 000000000000..e8c4003cbd81 ---- /dev/null -+++ b/arch/tile/configs/tilepro_defconfig -@@ -0,0 +1,524 @@ -+CONFIG_SYSVIPC=y -+CONFIG_POSIX_MQUEUE=y -+CONFIG_AUDIT=y -+CONFIG_NO_HZ=y -+CONFIG_HIGH_RES_TIMERS=y -+CONFIG_BSD_PROCESS_ACCT=y -+CONFIG_BSD_PROCESS_ACCT_V3=y -+CONFIG_TASKSTATS=y -+CONFIG_TASK_DELAY_ACCT=y -+CONFIG_TASK_XACCT=y -+CONFIG_TASK_IO_ACCOUNTING=y -+CONFIG_LOG_BUF_SHIFT=19 -+CONFIG_CGROUPS=y -+CONFIG_CGROUP_DEBUG=y -+CONFIG_CGROUP_DEVICE=y -+CONFIG_CPUSETS=y -+CONFIG_CGROUP_CPUACCT=y -+CONFIG_CGROUP_SCHED=y -+CONFIG_RT_GROUP_SCHED=y -+CONFIG_BLK_CGROUP=y -+CONFIG_NAMESPACES=y -+CONFIG_RELAY=y -+CONFIG_BLK_DEV_INITRD=y -+CONFIG_RD_XZ=y -+CONFIG_SYSCTL_SYSCALL=y -+CONFIG_EMBEDDED=y -+# CONFIG_COMPAT_BRK is not set -+CONFIG_PROFILING=y -+CONFIG_MODULES=y -+CONFIG_MODULE_FORCE_LOAD=y -+CONFIG_MODULE_UNLOAD=y -+CONFIG_BLK_DEV_INTEGRITY=y -+CONFIG_PARTITION_ADVANCED=y -+CONFIG_OSF_PARTITION=y -+CONFIG_AMIGA_PARTITION=y -+CONFIG_MAC_PARTITION=y -+CONFIG_BSD_DISKLABEL=y -+CONFIG_MINIX_SUBPARTITION=y -+CONFIG_SOLARIS_X86_PARTITION=y -+CONFIG_UNIXWARE_DISKLABEL=y -+CONFIG_SGI_PARTITION=y -+CONFIG_SUN_PARTITION=y -+CONFIG_KARMA_PARTITION=y -+CONFIG_CFQ_GROUP_IOSCHED=y -+CONFIG_HZ_100=y -+# CONFIG_COMPACTION is not set 
-+CONFIG_PREEMPT=y -+CONFIG_PCI_DEBUG=y -+# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set -+CONFIG_BINFMT_MISC=y -+CONFIG_NET=y -+CONFIG_PACKET=y -+CONFIG_UNIX=y -+CONFIG_XFRM_USER=y -+CONFIG_XFRM_SUB_POLICY=y -+CONFIG_XFRM_STATISTICS=y -+CONFIG_NET_KEY=m -+CONFIG_NET_KEY_MIGRATE=y -+CONFIG_INET=y -+CONFIG_IP_MULTICAST=y -+CONFIG_IP_ADVANCED_ROUTER=y -+CONFIG_IP_MULTIPLE_TABLES=y -+CONFIG_IP_ROUTE_MULTIPATH=y -+CONFIG_IP_ROUTE_VERBOSE=y -+CONFIG_NET_IPIP=m -+CONFIG_IP_MROUTE=y -+CONFIG_IP_PIMSM_V1=y -+CONFIG_IP_PIMSM_V2=y -+CONFIG_SYN_COOKIES=y -+CONFIG_INET_AH=m -+CONFIG_INET_ESP=m -+CONFIG_INET_IPCOMP=m -+CONFIG_INET_XFRM_MODE_TRANSPORT=m -+CONFIG_INET_XFRM_MODE_TUNNEL=m -+CONFIG_INET_XFRM_MODE_BEET=m -+CONFIG_INET_DIAG=m -+CONFIG_TCP_CONG_ADVANCED=y -+CONFIG_TCP_CONG_HSTCP=m -+CONFIG_TCP_CONG_HYBLA=m -+CONFIG_TCP_CONG_SCALABLE=m -+CONFIG_TCP_CONG_LP=m -+CONFIG_TCP_CONG_VENO=m -+CONFIG_TCP_CONG_YEAH=m -+CONFIG_TCP_CONG_ILLINOIS=m -+CONFIG_TCP_MD5SIG=y -+CONFIG_IPV6=y -+CONFIG_IPV6_ROUTER_PREF=y -+CONFIG_IPV6_ROUTE_INFO=y -+CONFIG_IPV6_OPTIMISTIC_DAD=y -+CONFIG_INET6_AH=m -+CONFIG_INET6_ESP=m -+CONFIG_INET6_IPCOMP=m -+CONFIG_IPV6_MIP6=m -+CONFIG_INET6_XFRM_MODE_TRANSPORT=m -+CONFIG_INET6_XFRM_MODE_TUNNEL=m -+CONFIG_INET6_XFRM_MODE_BEET=m -+CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION=m -+CONFIG_IPV6_SIT=m -+CONFIG_IPV6_TUNNEL=m -+CONFIG_IPV6_MULTIPLE_TABLES=y -+CONFIG_IPV6_MROUTE=y -+CONFIG_IPV6_PIMSM_V2=y -+CONFIG_NETLABEL=y -+CONFIG_NETFILTER=y -+CONFIG_NF_CONNTRACK=m -+CONFIG_NF_CONNTRACK_SECMARK=y -+CONFIG_NF_CONNTRACK_ZONES=y -+CONFIG_NF_CONNTRACK_EVENTS=y -+CONFIG_NF_CT_PROTO_DCCP=m -+CONFIG_NF_CT_PROTO_UDPLITE=m -+CONFIG_NF_CONNTRACK_AMANDA=m -+CONFIG_NF_CONNTRACK_FTP=m -+CONFIG_NF_CONNTRACK_H323=m -+CONFIG_NF_CONNTRACK_IRC=m -+CONFIG_NF_CONNTRACK_NETBIOS_NS=m -+CONFIG_NF_CONNTRACK_PPTP=m -+CONFIG_NF_CONNTRACK_SANE=m -+CONFIG_NF_CONNTRACK_SIP=m -+CONFIG_NF_CONNTRACK_TFTP=m -+CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m -+CONFIG_NETFILTER_XT_TARGET_CONNMARK=m 
-+CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m -+CONFIG_NETFILTER_XT_TARGET_DSCP=m -+CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m -+CONFIG_NETFILTER_XT_TARGET_MARK=m -+CONFIG_NETFILTER_XT_TARGET_NFLOG=m -+CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m -+CONFIG_NETFILTER_XT_TARGET_NOTRACK=m -+CONFIG_NETFILTER_XT_TARGET_TEE=m -+CONFIG_NETFILTER_XT_TARGET_TPROXY=m -+CONFIG_NETFILTER_XT_TARGET_TRACE=m -+CONFIG_NETFILTER_XT_TARGET_SECMARK=m -+CONFIG_NETFILTER_XT_TARGET_TCPMSS=m -+CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m -+CONFIG_NETFILTER_XT_MATCH_CLUSTER=m -+CONFIG_NETFILTER_XT_MATCH_COMMENT=m -+CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m -+CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m -+CONFIG_NETFILTER_XT_MATCH_CONNMARK=m -+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m -+CONFIG_NETFILTER_XT_MATCH_DCCP=m -+CONFIG_NETFILTER_XT_MATCH_DSCP=m -+CONFIG_NETFILTER_XT_MATCH_ESP=m -+CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m -+CONFIG_NETFILTER_XT_MATCH_HELPER=m -+CONFIG_NETFILTER_XT_MATCH_IPRANGE=m -+CONFIG_NETFILTER_XT_MATCH_IPVS=m -+CONFIG_NETFILTER_XT_MATCH_LENGTH=m -+CONFIG_NETFILTER_XT_MATCH_LIMIT=m -+CONFIG_NETFILTER_XT_MATCH_MAC=m -+CONFIG_NETFILTER_XT_MATCH_MARK=m -+CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m -+CONFIG_NETFILTER_XT_MATCH_OSF=m -+CONFIG_NETFILTER_XT_MATCH_OWNER=m -+CONFIG_NETFILTER_XT_MATCH_POLICY=m -+CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m -+CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m -+CONFIG_NETFILTER_XT_MATCH_QUOTA=m -+CONFIG_NETFILTER_XT_MATCH_RATEEST=m -+CONFIG_NETFILTER_XT_MATCH_REALM=m -+CONFIG_NETFILTER_XT_MATCH_RECENT=m -+CONFIG_NETFILTER_XT_MATCH_SOCKET=m -+CONFIG_NETFILTER_XT_MATCH_STATE=m -+CONFIG_NETFILTER_XT_MATCH_STATISTIC=m -+CONFIG_NETFILTER_XT_MATCH_STRING=m -+CONFIG_NETFILTER_XT_MATCH_TCPMSS=m -+CONFIG_NETFILTER_XT_MATCH_TIME=m -+CONFIG_NETFILTER_XT_MATCH_U32=m -+CONFIG_IP_VS=m -+CONFIG_IP_VS_IPV6=y -+CONFIG_IP_VS_PROTO_TCP=y -+CONFIG_IP_VS_PROTO_UDP=y -+CONFIG_IP_VS_PROTO_ESP=y -+CONFIG_IP_VS_PROTO_AH=y -+CONFIG_IP_VS_PROTO_SCTP=y -+CONFIG_IP_VS_RR=m -+CONFIG_IP_VS_WRR=m -+CONFIG_IP_VS_LC=m 
-+CONFIG_IP_VS_WLC=m -+CONFIG_IP_VS_LBLC=m -+CONFIG_IP_VS_LBLCR=m -+CONFIG_IP_VS_SED=m -+CONFIG_IP_VS_NQ=m -+CONFIG_NF_CONNTRACK_IPV4=m -+# CONFIG_NF_CONNTRACK_PROC_COMPAT is not set -+CONFIG_IP_NF_IPTABLES=y -+CONFIG_IP_NF_MATCH_AH=m -+CONFIG_IP_NF_MATCH_ECN=m -+CONFIG_IP_NF_MATCH_TTL=m -+CONFIG_IP_NF_FILTER=y -+CONFIG_IP_NF_TARGET_REJECT=y -+CONFIG_IP_NF_MANGLE=m -+CONFIG_IP_NF_TARGET_ECN=m -+CONFIG_IP_NF_TARGET_TTL=m -+CONFIG_IP_NF_RAW=m -+CONFIG_IP_NF_SECURITY=m -+CONFIG_IP_NF_ARPTABLES=m -+CONFIG_IP_NF_ARPFILTER=m -+CONFIG_IP_NF_ARP_MANGLE=m -+CONFIG_NF_CONNTRACK_IPV6=m -+CONFIG_IP6_NF_MATCH_AH=m -+CONFIG_IP6_NF_MATCH_EUI64=m -+CONFIG_IP6_NF_MATCH_FRAG=m -+CONFIG_IP6_NF_MATCH_OPTS=m -+CONFIG_IP6_NF_MATCH_HL=m -+CONFIG_IP6_NF_MATCH_IPV6HEADER=m -+CONFIG_IP6_NF_MATCH_MH=m -+CONFIG_IP6_NF_MATCH_RT=m -+CONFIG_IP6_NF_TARGET_HL=m -+CONFIG_IP6_NF_FILTER=m -+CONFIG_IP6_NF_TARGET_REJECT=m -+CONFIG_IP6_NF_MANGLE=m -+CONFIG_IP6_NF_RAW=m -+CONFIG_IP6_NF_SECURITY=m -+CONFIG_BRIDGE_NF_EBTABLES=m -+CONFIG_BRIDGE_EBT_BROUTE=m -+CONFIG_BRIDGE_EBT_T_FILTER=m -+CONFIG_BRIDGE_EBT_T_NAT=m -+CONFIG_BRIDGE_EBT_802_3=m -+CONFIG_BRIDGE_EBT_AMONG=m -+CONFIG_BRIDGE_EBT_ARP=m -+CONFIG_BRIDGE_EBT_IP=m -+CONFIG_BRIDGE_EBT_IP6=m -+CONFIG_BRIDGE_EBT_LIMIT=m -+CONFIG_BRIDGE_EBT_MARK=m -+CONFIG_BRIDGE_EBT_PKTTYPE=m -+CONFIG_BRIDGE_EBT_STP=m -+CONFIG_BRIDGE_EBT_VLAN=m -+CONFIG_BRIDGE_EBT_ARPREPLY=m -+CONFIG_BRIDGE_EBT_DNAT=m -+CONFIG_BRIDGE_EBT_MARK_T=m -+CONFIG_BRIDGE_EBT_REDIRECT=m -+CONFIG_BRIDGE_EBT_SNAT=m -+CONFIG_BRIDGE_EBT_LOG=m -+CONFIG_BRIDGE_EBT_ULOG=m -+CONFIG_BRIDGE_EBT_NFLOG=m -+CONFIG_RDS=m -+CONFIG_RDS_TCP=m -+CONFIG_BRIDGE=m -+CONFIG_VLAN_8021Q=m -+CONFIG_VLAN_8021Q_GVRP=y -+CONFIG_PHONET=m -+CONFIG_NET_SCHED=y -+CONFIG_NET_SCH_CBQ=m -+CONFIG_NET_SCH_HTB=m -+CONFIG_NET_SCH_HFSC=m -+CONFIG_NET_SCH_PRIO=m -+CONFIG_NET_SCH_MULTIQ=m -+CONFIG_NET_SCH_RED=m -+CONFIG_NET_SCH_SFQ=m -+CONFIG_NET_SCH_TEQL=m -+CONFIG_NET_SCH_TBF=m -+CONFIG_NET_SCH_GRED=m -+CONFIG_NET_SCH_DSMARK=m 
-+CONFIG_NET_SCH_NETEM=m -+CONFIG_NET_SCH_DRR=m -+CONFIG_NET_SCH_INGRESS=m -+CONFIG_NET_CLS_BASIC=m -+CONFIG_NET_CLS_TCINDEX=m -+CONFIG_NET_CLS_ROUTE4=m -+CONFIG_NET_CLS_FW=m -+CONFIG_NET_CLS_U32=m -+CONFIG_CLS_U32_PERF=y -+CONFIG_CLS_U32_MARK=y -+CONFIG_NET_CLS_RSVP=m -+CONFIG_NET_CLS_RSVP6=m -+CONFIG_NET_CLS_FLOW=m -+CONFIG_NET_CLS_CGROUP=y -+CONFIG_NET_EMATCH=y -+CONFIG_NET_EMATCH_CMP=m -+CONFIG_NET_EMATCH_NBYTE=m -+CONFIG_NET_EMATCH_U32=m -+CONFIG_NET_EMATCH_META=m -+CONFIG_NET_EMATCH_TEXT=m -+CONFIG_NET_CLS_ACT=y -+CONFIG_NET_ACT_POLICE=m -+CONFIG_NET_ACT_GACT=m -+CONFIG_GACT_PROB=y -+CONFIG_NET_ACT_MIRRED=m -+CONFIG_NET_ACT_IPT=m -+CONFIG_NET_ACT_NAT=m -+CONFIG_NET_ACT_PEDIT=m -+CONFIG_NET_ACT_SIMP=m -+CONFIG_NET_ACT_SKBEDIT=m -+CONFIG_NET_CLS_IND=y -+CONFIG_DCB=y -+CONFIG_DNS_RESOLVER=y -+# CONFIG_WIRELESS is not set -+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" -+CONFIG_DEVTMPFS=y -+CONFIG_DEVTMPFS_MOUNT=y -+CONFIG_CONNECTOR=y -+CONFIG_BLK_DEV_LOOP=y -+CONFIG_BLK_DEV_CRYPTOLOOP=m -+CONFIG_BLK_DEV_SX8=m -+CONFIG_BLK_DEV_RAM=y -+CONFIG_BLK_DEV_RAM_SIZE=16384 -+CONFIG_ATA_OVER_ETH=m -+CONFIG_RAID_ATTRS=m -+CONFIG_BLK_DEV_SD=y -+CONFIG_SCSI_CONSTANTS=y -+CONFIG_SCSI_LOGGING=y -+CONFIG_ATA=y -+CONFIG_SATA_SIL24=y -+# CONFIG_ATA_SFF is not set -+CONFIG_MD=y -+CONFIG_BLK_DEV_MD=y -+CONFIG_MD_LINEAR=m -+CONFIG_MD_RAID0=m -+CONFIG_MD_RAID1=m -+CONFIG_MD_RAID10=m -+CONFIG_MD_RAID456=m -+CONFIG_MD_FAULTY=m -+CONFIG_BLK_DEV_DM=m -+CONFIG_DM_DEBUG=y -+CONFIG_DM_CRYPT=m -+CONFIG_DM_SNAPSHOT=m -+CONFIG_DM_MIRROR=m -+CONFIG_DM_LOG_USERSPACE=m -+CONFIG_DM_ZERO=m -+CONFIG_DM_MULTIPATH=m -+CONFIG_DM_MULTIPATH_QL=m -+CONFIG_DM_MULTIPATH_ST=m -+CONFIG_DM_DELAY=m -+CONFIG_DM_UEVENT=y -+CONFIG_FUSION=y -+CONFIG_FUSION_SAS=y -+CONFIG_NETDEVICES=y -+CONFIG_BONDING=m -+CONFIG_DUMMY=m -+CONFIG_IFB=m -+CONFIG_MACVLAN=m -+CONFIG_MACVTAP=m -+CONFIG_NETCONSOLE=m -+CONFIG_NETCONSOLE_DYNAMIC=y -+CONFIG_TUN=y -+CONFIG_VETH=m -+CONFIG_NET_DSA_MV88E6060=y -+CONFIG_NET_DSA_MV88E6XXX=y -+# 
CONFIG_NET_VENDOR_3COM is not set -+CONFIG_E1000E=y -+# CONFIG_WLAN is not set -+# CONFIG_INPUT_MOUSEDEV is not set -+# CONFIG_INPUT_KEYBOARD is not set -+# CONFIG_INPUT_MOUSE is not set -+# CONFIG_SERIO is not set -+# CONFIG_VT is not set -+# CONFIG_LEGACY_PTYS is not set -+CONFIG_HW_RANDOM=y -+CONFIG_HW_RANDOM_TIMERIOMEM=m -+CONFIG_I2C=y -+CONFIG_I2C_CHARDEV=y -+# CONFIG_HWMON is not set -+CONFIG_WATCHDOG=y -+CONFIG_WATCHDOG_NOWAYOUT=y -+# CONFIG_VGA_ARB is not set -+# CONFIG_USB_SUPPORT is not set -+CONFIG_EDAC=y -+CONFIG_RTC_CLASS=y -+CONFIG_RTC_DRV_TILE=y -+CONFIG_EXT2_FS=y -+CONFIG_EXT2_FS_XATTR=y -+CONFIG_EXT2_FS_POSIX_ACL=y -+CONFIG_EXT2_FS_SECURITY=y -+CONFIG_EXT2_FS_XIP=y -+CONFIG_EXT3_FS=y -+CONFIG_EXT3_FS_POSIX_ACL=y -+CONFIG_EXT3_FS_SECURITY=y -+CONFIG_EXT4_FS=y -+CONFIG_EXT4_FS_POSIX_ACL=y -+CONFIG_EXT4_FS_SECURITY=y -+CONFIG_XFS_FS=y -+CONFIG_XFS_QUOTA=y -+CONFIG_XFS_POSIX_ACL=y -+CONFIG_GFS2_FS=m -+CONFIG_GFS2_FS_LOCKING_DLM=y -+CONFIG_BTRFS_FS=m -+CONFIG_BTRFS_FS_POSIX_ACL=y -+CONFIG_QUOTA=y -+CONFIG_QUOTA_NETLINK_INTERFACE=y -+# CONFIG_PRINT_QUOTA_WARNING is not set -+CONFIG_QFMT_V2=y -+CONFIG_AUTOFS4_FS=m -+CONFIG_FUSE_FS=y -+CONFIG_CUSE=m -+CONFIG_FSCACHE=m -+CONFIG_FSCACHE_STATS=y -+CONFIG_CACHEFILES=m -+CONFIG_ISO9660_FS=m -+CONFIG_JOLIET=y -+CONFIG_ZISOFS=y -+CONFIG_UDF_FS=m -+CONFIG_MSDOS_FS=m -+CONFIG_VFAT_FS=m -+CONFIG_FAT_DEFAULT_IOCHARSET="ascii" -+CONFIG_PROC_KCORE=y -+CONFIG_TMPFS=y -+CONFIG_TMPFS_POSIX_ACL=y -+CONFIG_HUGETLBFS=y -+CONFIG_CONFIGFS_FS=m -+CONFIG_ECRYPT_FS=m -+CONFIG_CRAMFS=m -+CONFIG_SQUASHFS=m -+CONFIG_NFS_FS=m -+CONFIG_NFS_V3_ACL=y -+CONFIG_NFS_V4=m -+CONFIG_NFS_V4_1=y -+CONFIG_NFS_FSCACHE=y -+CONFIG_NFSD=m -+CONFIG_NFSD_V3_ACL=y -+CONFIG_NFSD_V4=y -+CONFIG_CIFS=m -+CONFIG_CIFS_STATS=y -+CONFIG_CIFS_WEAK_PW_HASH=y -+CONFIG_CIFS_UPCALL=y -+CONFIG_CIFS_XATTR=y -+CONFIG_CIFS_POSIX=y -+CONFIG_CIFS_DFS_UPCALL=y -+CONFIG_CIFS_FSCACHE=y -+CONFIG_NLS=y -+CONFIG_NLS_DEFAULT="utf8" -+CONFIG_NLS_CODEPAGE_437=y 
-+CONFIG_NLS_CODEPAGE_737=m -+CONFIG_NLS_CODEPAGE_775=m -+CONFIG_NLS_CODEPAGE_850=m -+CONFIG_NLS_CODEPAGE_852=m -+CONFIG_NLS_CODEPAGE_855=m -+CONFIG_NLS_CODEPAGE_857=m -+CONFIG_NLS_CODEPAGE_860=m -+CONFIG_NLS_CODEPAGE_861=m -+CONFIG_NLS_CODEPAGE_862=m -+CONFIG_NLS_CODEPAGE_863=m -+CONFIG_NLS_CODEPAGE_864=m -+CONFIG_NLS_CODEPAGE_865=m -+CONFIG_NLS_CODEPAGE_866=m -+CONFIG_NLS_CODEPAGE_869=m -+CONFIG_NLS_CODEPAGE_936=m -+CONFIG_NLS_CODEPAGE_950=m -+CONFIG_NLS_CODEPAGE_932=m -+CONFIG_NLS_CODEPAGE_949=m -+CONFIG_NLS_CODEPAGE_874=m -+CONFIG_NLS_ISO8859_8=m -+CONFIG_NLS_CODEPAGE_1250=m -+CONFIG_NLS_CODEPAGE_1251=m -+CONFIG_NLS_ASCII=y -+CONFIG_NLS_ISO8859_1=m -+CONFIG_NLS_ISO8859_2=m -+CONFIG_NLS_ISO8859_3=m -+CONFIG_NLS_ISO8859_4=m -+CONFIG_NLS_ISO8859_5=m -+CONFIG_NLS_ISO8859_6=m -+CONFIG_NLS_ISO8859_7=m -+CONFIG_NLS_ISO8859_9=m -+CONFIG_NLS_ISO8859_13=m -+CONFIG_NLS_ISO8859_14=m -+CONFIG_NLS_ISO8859_15=m -+CONFIG_NLS_KOI8_R=m -+CONFIG_NLS_KOI8_U=m -+CONFIG_NLS_UTF8=m -+CONFIG_DLM=m -+CONFIG_DLM_DEBUG=y -+CONFIG_DYNAMIC_DEBUG=y -+CONFIG_DEBUG_INFO=y -+CONFIG_DEBUG_INFO_REDUCED=y -+# CONFIG_ENABLE_WARN_DEPRECATED is not set -+CONFIG_FRAME_WARN=2048 -+CONFIG_STRIP_ASM_SYMS=y -+CONFIG_DEBUG_FS=y -+CONFIG_HEADERS_CHECK=y -+# CONFIG_FRAME_POINTER is not set -+CONFIG_DEBUG_FORCE_WEAK_PER_CPU=y -+CONFIG_MAGIC_SYSRQ=y -+CONFIG_DEBUG_VM=y -+CONFIG_DEBUG_MEMORY_INIT=y -+CONFIG_DEBUG_STACKOVERFLOW=y -+CONFIG_LOCKUP_DETECTOR=y -+CONFIG_SCHEDSTATS=y -+CONFIG_TIMER_STATS=y -+CONFIG_DEBUG_LIST=y -+CONFIG_DEBUG_CREDENTIALS=y -+CONFIG_RCU_CPU_STALL_TIMEOUT=60 -+CONFIG_ASYNC_RAID6_TEST=m -+CONFIG_SECURITY=y -+CONFIG_SECURITYFS=y -+CONFIG_SECURITY_NETWORK=y -+CONFIG_SECURITY_NETWORK_XFRM=y -+CONFIG_SECURITY_SELINUX=y -+CONFIG_SECURITY_SELINUX_BOOTPARAM=y -+CONFIG_SECURITY_SELINUX_DISABLE=y -+CONFIG_CRYPTO_PCRYPT=m -+CONFIG_CRYPTO_CRYPTD=m -+CONFIG_CRYPTO_TEST=m -+CONFIG_CRYPTO_CCM=m -+CONFIG_CRYPTO_GCM=m -+CONFIG_CRYPTO_CTS=m -+CONFIG_CRYPTO_LRW=m -+CONFIG_CRYPTO_PCBC=m 
-+CONFIG_CRYPTO_XTS=m -+CONFIG_CRYPTO_HMAC=y -+CONFIG_CRYPTO_XCBC=m -+CONFIG_CRYPTO_VMAC=m -+CONFIG_CRYPTO_MICHAEL_MIC=m -+CONFIG_CRYPTO_RMD128=m -+CONFIG_CRYPTO_RMD160=m -+CONFIG_CRYPTO_RMD256=m -+CONFIG_CRYPTO_RMD320=m -+CONFIG_CRYPTO_SHA1=y -+CONFIG_CRYPTO_SHA512=m -+CONFIG_CRYPTO_TGR192=m -+CONFIG_CRYPTO_WP512=m -+CONFIG_CRYPTO_ANUBIS=m -+CONFIG_CRYPTO_BLOWFISH=m -+CONFIG_CRYPTO_CAMELLIA=m -+CONFIG_CRYPTO_CAST5=m -+CONFIG_CRYPTO_CAST6=m -+CONFIG_CRYPTO_FCRYPT=m -+CONFIG_CRYPTO_KHAZAD=m -+CONFIG_CRYPTO_SEED=m -+CONFIG_CRYPTO_SERPENT=m -+CONFIG_CRYPTO_TEA=m -+CONFIG_CRYPTO_TWOFISH=m -+CONFIG_CRYPTO_LZO=m -+CONFIG_CRC_CCITT=m -+CONFIG_CRC7=m -diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig -index 8ef85139553f..6f6ecda60d5b 100644 ---- a/arch/x86/Kconfig -+++ b/arch/x86/Kconfig -@@ -1034,6 +1034,22 @@ config NR_CPUS - config SCHED_SMT - def_bool y if SMP - -+config SMT_NICE -+ bool "SMT (Hyperthreading) aware nice priority and policy support" -+ depends on SCHED_MUQSS && SCHED_SMT -+ default y -+ ---help--- -+ Enabling Hyperthreading on Intel CPUs decreases the effectiveness -+ of the use of 'nice' levels and different scheduling policies -+ (e.g. realtime) due to sharing of CPU power between hyperthreads. -+ SMT nice support makes each logical CPU aware of what is running on -+ its hyperthread siblings, maintaining appropriate distribution of -+ CPU according to nice levels and scheduling policies at the expense -+ of slightly increased overhead. -+ -+ If unsure say Y here. -+ -+ - config SCHED_MC - def_bool y - prompt "Multi-core scheduler support" -@@ -1064,6 +1080,8 @@ config SCHED_MC_PRIO - - If unsure say Y here. - -+source "kernel/Kconfig.MuQSS" -+ - config UP_LATE_INIT - def_bool y - depends on !SMP && X86_LOCAL_APIC -@@ -1433,7 +1451,7 @@ config HIGHMEM64G - endchoice - - choice -- prompt "Memory split" if EXPERT -+ prompt "Memory split" - default VMSPLIT_3G - depends on X86_32 - ---help--- -@@ -1453,17 +1471,17 @@ choice - option alone! 
- - config VMSPLIT_3G -- bool "3G/1G user/kernel split" -+ bool "Default 896MB lowmem (3G/1G user/kernel split)" - config VMSPLIT_3G_OPT - depends on !X86_PAE -- bool "3G/1G user/kernel split (for full 1G low memory)" -+ bool "1GB lowmem (3G/1G user/kernel split)" - config VMSPLIT_2G -- bool "2G/2G user/kernel split" -+ bool "2GB lowmem (2G/2G user/kernel split)" - config VMSPLIT_2G_OPT - depends on !X86_PAE -- bool "2G/2G user/kernel split (for full 2G low memory)" -+ bool "2GB lowmem (2G/2G user/kernel split)" - config VMSPLIT_1G -- bool "1G/3G user/kernel split" -+ bool "3GB lowmem (1G/3G user/kernel split)" - endchoice - - config PAGE_OFFSET -diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig -index 59ce9ed58430..f19741b0f43d 100644 ---- a/arch/x86/configs/i386_defconfig -+++ b/arch/x86/configs/i386_defconfig -@@ -29,7 +29,7 @@ CONFIG_SMP=y - CONFIG_X86_GENERIC=y - CONFIG_HPET_TIMER=y - CONFIG_SCHED_SMT=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y - CONFIG_X86_MCE=y - CONFIG_X86_REBOOTFIXUPS=y -diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig -index d0a5ffeae8df..63f1fb92590c 100644 ---- a/arch/x86/configs/x86_64_defconfig -+++ b/arch/x86/configs/x86_64_defconfig -@@ -28,7 +28,7 @@ CONFIG_SMP=y - CONFIG_CALGARY_IOMMU=y - CONFIG_NR_CPUS=64 - CONFIG_SCHED_SMT=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y - CONFIG_X86_MCE=y - CONFIG_MICROCODE=y -diff --git a/drivers/block/swim.c b/drivers/block/swim.c -index 4c297f69171d..5bc4f1be2617 100644 ---- a/drivers/block/swim.c -+++ b/drivers/block/swim.c -@@ -328,7 +328,7 @@ static inline void swim_motor(struct swim __iomem *base, - if (swim_readbit(base, MOTOR_ON)) - break; - current->state = TASK_INTERRUPTIBLE; -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - } - } else if (action == OFF) { - swim_action(base, MOTOR_OFF); -@@ -347,7 +347,7 @@ static inline 
void swim_eject(struct swim __iomem *base) - if (!swim_readbit(base, DISK_IN)) - break; - current->state = TASK_INTERRUPTIBLE; -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - } - swim_select(base, RELAX); - } -@@ -371,7 +371,7 @@ static inline int swim_step(struct swim __iomem *base) - for (wait = 0; wait < HZ; wait++) { - - current->state = TASK_INTERRUPTIBLE; -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - - swim_select(base, RELAX); - if (!swim_readbit(base, STEP)) -diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c -index 2aab80e19ae0..6200dbb3b5ef 100644 ---- a/drivers/char/ipmi/ipmi_msghandler.c -+++ b/drivers/char/ipmi/ipmi_msghandler.c -@@ -3544,7 +3544,7 @@ static void cleanup_smi_msgs(struct ipmi_smi *intf) - /* Current message first, to preserve order */ - while (intf->curr_msg && !list_empty(&intf->waiting_rcv_msgs)) { - /* Wait for the message to clear out. */ -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - } - - /* No need for locks, the interface is down. */ -diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c -index 22c6a2e61236..c4bccd444cbf 100644 ---- a/drivers/char/ipmi/ipmi_ssif.c -+++ b/drivers/char/ipmi/ipmi_ssif.c -@@ -1289,7 +1289,7 @@ static void shutdown_ssif(void *send_info) - - /* make sure the driver is not looking for flags any more. */ - while (ssif_info->ssif_state != SSIF_NORMAL) -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - - ssif_info->stopping = true; - del_timer_sync(&ssif_info->watch_timer); -diff --git a/drivers/char/snsc.c b/drivers/char/snsc.c -new file mode 100644 -index 000000000000..5228e78df804 ---- /dev/null -+++ b/drivers/char/snsc.c -@@ -0,0 +1,469 @@ -+/* -+ * SN Platform system controller communication support -+ * -+ * This file is subject to the terms and conditions of the GNU General Public -+ * License. See the file "COPYING" in the main directory of this archive -+ * for more details. 
-+ * -+ * Copyright (C) 2004, 2006 Silicon Graphics, Inc. All rights reserved. -+ */ -+ -+/* -+ * System controller communication driver -+ * -+ * This driver allows a user process to communicate with the system -+ * controller (a.k.a. "IRouter") network in an SGI SN system. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "snsc.h" -+ -+#define SYSCTL_BASENAME "snsc" -+ -+#define SCDRV_BUFSZ 2048 -+#define SCDRV_TIMEOUT 1000 -+ -+static DEFINE_MUTEX(scdrv_mutex); -+static irqreturn_t -+scdrv_interrupt(int irq, void *subch_data) -+{ -+ struct subch_data_s *sd = subch_data; -+ unsigned long flags; -+ int status; -+ -+ spin_lock_irqsave(&sd->sd_rlock, flags); -+ spin_lock(&sd->sd_wlock); -+ status = ia64_sn_irtr_intr(sd->sd_nasid, sd->sd_subch); -+ -+ if (status > 0) { -+ if (status & SAL_IROUTER_INTR_RECV) { -+ wake_up(&sd->sd_rq); -+ } -+ if (status & SAL_IROUTER_INTR_XMIT) { -+ ia64_sn_irtr_intr_disable -+ (sd->sd_nasid, sd->sd_subch, -+ SAL_IROUTER_INTR_XMIT); -+ wake_up(&sd->sd_wq); -+ } -+ } -+ spin_unlock(&sd->sd_wlock); -+ spin_unlock_irqrestore(&sd->sd_rlock, flags); -+ return IRQ_HANDLED; -+} -+ -+/* -+ * scdrv_open -+ * -+ * Reserve a subchannel for system controller communication. 
-+ */ -+ -+static int -+scdrv_open(struct inode *inode, struct file *file) -+{ -+ struct sysctl_data_s *scd; -+ struct subch_data_s *sd; -+ int rv; -+ -+ /* look up device info for this device file */ -+ scd = container_of(inode->i_cdev, struct sysctl_data_s, scd_cdev); -+ -+ /* allocate memory for subchannel data */ -+ sd = kzalloc(sizeof (struct subch_data_s), GFP_KERNEL); -+ if (sd == NULL) { -+ printk("%s: couldn't allocate subchannel data\n", -+ __func__); -+ return -ENOMEM; -+ } -+ -+ /* initialize subch_data_s fields */ -+ sd->sd_nasid = scd->scd_nasid; -+ sd->sd_subch = ia64_sn_irtr_open(scd->scd_nasid); -+ -+ if (sd->sd_subch < 0) { -+ kfree(sd); -+ printk("%s: couldn't allocate subchannel\n", __func__); -+ return -EBUSY; -+ } -+ -+ spin_lock_init(&sd->sd_rlock); -+ spin_lock_init(&sd->sd_wlock); -+ init_waitqueue_head(&sd->sd_rq); -+ init_waitqueue_head(&sd->sd_wq); -+ sema_init(&sd->sd_rbs, 1); -+ sema_init(&sd->sd_wbs, 1); -+ -+ file->private_data = sd; -+ -+ /* hook this subchannel up to the system controller interrupt */ -+ mutex_lock(&scdrv_mutex); -+ rv = request_irq(SGI_UART_VECTOR, scdrv_interrupt, -+ IRQF_SHARED, SYSCTL_BASENAME, sd); -+ if (rv) { -+ ia64_sn_irtr_close(sd->sd_nasid, sd->sd_subch); -+ kfree(sd); -+ printk("%s: irq request failed (%d)\n", __func__, rv); -+ mutex_unlock(&scdrv_mutex); -+ return -EBUSY; -+ } -+ mutex_unlock(&scdrv_mutex); -+ return 0; -+} -+ -+/* -+ * scdrv_release -+ * -+ * Release a previously-reserved subchannel. -+ */ -+ -+static int -+scdrv_release(struct inode *inode, struct file *file) -+{ -+ struct subch_data_s *sd = (struct subch_data_s *) file->private_data; -+ int rv; -+ -+ /* free the interrupt */ -+ free_irq(SGI_UART_VECTOR, sd); -+ -+ /* ask SAL to close the subchannel */ -+ rv = ia64_sn_irtr_close(sd->sd_nasid, sd->sd_subch); -+ -+ kfree(sd); -+ return rv; -+} -+ -+/* -+ * scdrv_read -+ * -+ * Called to read bytes from the open IRouter pipe. 
-+ * -+ */ -+ -+static inline int -+read_status_check(struct subch_data_s *sd, int *len) -+{ -+ return ia64_sn_irtr_recv(sd->sd_nasid, sd->sd_subch, sd->sd_rb, len); -+} -+ -+static ssize_t -+scdrv_read(struct file *file, char __user *buf, size_t count, loff_t *f_pos) -+{ -+ int status; -+ int len; -+ unsigned long flags; -+ struct subch_data_s *sd = (struct subch_data_s *) file->private_data; -+ -+ /* try to get control of the read buffer */ -+ if (down_trylock(&sd->sd_rbs)) { -+ /* somebody else has it now; -+ * if we're non-blocking, then exit... -+ */ -+ if (file->f_flags & O_NONBLOCK) { -+ return -EAGAIN; -+ } -+ /* ...or if we want to block, then do so here */ -+ if (down_interruptible(&sd->sd_rbs)) { -+ /* something went wrong with wait */ -+ return -ERESTARTSYS; -+ } -+ } -+ -+ /* anything to read? */ -+ len = CHUNKSIZE; -+ spin_lock_irqsave(&sd->sd_rlock, flags); -+ status = read_status_check(sd, &len); -+ -+ /* if not, and we're blocking I/O, loop */ -+ while (status < 0) { -+ DECLARE_WAITQUEUE(wait, current); -+ -+ if (file->f_flags & O_NONBLOCK) { -+ spin_unlock_irqrestore(&sd->sd_rlock, flags); -+ up(&sd->sd_rbs); -+ return -EAGAIN; -+ } -+ -+ len = CHUNKSIZE; -+ set_current_state(TASK_INTERRUPTIBLE); -+ add_wait_queue(&sd->sd_rq, &wait); -+ spin_unlock_irqrestore(&sd->sd_rlock, flags); -+ -+ schedule_msec_hrtimeout((SCDRV_TIMEOUT)); -+ -+ remove_wait_queue(&sd->sd_rq, &wait); -+ if (signal_pending(current)) { -+ /* wait was interrupted */ -+ up(&sd->sd_rbs); -+ return -ERESTARTSYS; -+ } -+ -+ spin_lock_irqsave(&sd->sd_rlock, flags); -+ status = read_status_check(sd, &len); -+ } -+ spin_unlock_irqrestore(&sd->sd_rlock, flags); -+ -+ if (len > 0) { -+ /* we read something in the last read_status_check(); copy -+ * it out to user space -+ */ -+ if (count < len) { -+ pr_debug("%s: only accepting %d of %d bytes\n", -+ __func__, (int) count, len); -+ } -+ len = min((int) count, len); -+ if (copy_to_user(buf, sd->sd_rb, len)) -+ len = -EFAULT; -+ } -+ -+ /* 
release the read buffer and wake anyone who might be -+ * waiting for it -+ */ -+ up(&sd->sd_rbs); -+ -+ /* return the number of characters read in */ -+ return len; -+} -+ -+/* -+ * scdrv_write -+ * -+ * Writes a chunk of an IRouter packet (or other system controller data) -+ * to the system controller. -+ * -+ */ -+static inline int -+write_status_check(struct subch_data_s *sd, int count) -+{ -+ return ia64_sn_irtr_send(sd->sd_nasid, sd->sd_subch, sd->sd_wb, count); -+} -+ -+static ssize_t -+scdrv_write(struct file *file, const char __user *buf, -+ size_t count, loff_t *f_pos) -+{ -+ unsigned long flags; -+ int status; -+ struct subch_data_s *sd = (struct subch_data_s *) file->private_data; -+ -+ /* try to get control of the write buffer */ -+ if (down_trylock(&sd->sd_wbs)) { -+ /* somebody else has it now; -+ * if we're non-blocking, then exit... -+ */ -+ if (file->f_flags & O_NONBLOCK) { -+ return -EAGAIN; -+ } -+ /* ...or if we want to block, then do so here */ -+ if (down_interruptible(&sd->sd_wbs)) { -+ /* something went wrong with wait */ -+ return -ERESTARTSYS; -+ } -+ } -+ -+ count = min((int) count, CHUNKSIZE); -+ if (copy_from_user(sd->sd_wb, buf, count)) { -+ up(&sd->sd_wbs); -+ return -EFAULT; -+ } -+ -+ /* try to send the buffer */ -+ spin_lock_irqsave(&sd->sd_wlock, flags); -+ status = write_status_check(sd, count); -+ -+ /* if we failed, and we want to block, then loop */ -+ while (status <= 0) { -+ DECLARE_WAITQUEUE(wait, current); -+ -+ if (file->f_flags & O_NONBLOCK) { -+ spin_unlock_irqrestore(&sd->sd_wlock, flags); -+ up(&sd->sd_wbs); -+ return -EAGAIN; -+ } -+ -+ set_current_state(TASK_INTERRUPTIBLE); -+ add_wait_queue(&sd->sd_wq, &wait); -+ spin_unlock_irqrestore(&sd->sd_wlock, flags); -+ -+ schedule_msec_hrtimeout((SCDRV_TIMEOUT)); -+ -+ remove_wait_queue(&sd->sd_wq, &wait); -+ if (signal_pending(current)) { -+ /* wait was interrupted */ -+ up(&sd->sd_wbs); -+ return -ERESTARTSYS; -+ } -+ -+ spin_lock_irqsave(&sd->sd_wlock, flags); -+ 
status = write_status_check(sd, count); -+ } -+ spin_unlock_irqrestore(&sd->sd_wlock, flags); -+ -+ /* release the write buffer and wake anyone who's waiting for it */ -+ up(&sd->sd_wbs); -+ -+ /* return the number of characters accepted (should be the complete -+ * "chunk" as requested) -+ */ -+ if ((status >= 0) && (status < count)) { -+ pr_debug("Didn't accept the full chunk; %d of %d\n", -+ status, (int) count); -+ } -+ return status; -+} -+ -+static __poll_t -+scdrv_poll(struct file *file, struct poll_table_struct *wait) -+{ -+ __poll_t mask = 0; -+ int status = 0; -+ struct subch_data_s *sd = (struct subch_data_s *) file->private_data; -+ unsigned long flags; -+ -+ poll_wait(file, &sd->sd_rq, wait); -+ poll_wait(file, &sd->sd_wq, wait); -+ -+ spin_lock_irqsave(&sd->sd_rlock, flags); -+ spin_lock(&sd->sd_wlock); -+ status = ia64_sn_irtr_intr(sd->sd_nasid, sd->sd_subch); -+ spin_unlock(&sd->sd_wlock); -+ spin_unlock_irqrestore(&sd->sd_rlock, flags); -+ -+ if (status > 0) { -+ if (status & SAL_IROUTER_INTR_RECV) { -+ mask |= EPOLLIN | EPOLLRDNORM; -+ } -+ if (status & SAL_IROUTER_INTR_XMIT) { -+ mask |= EPOLLOUT | EPOLLWRNORM; -+ } -+ } -+ -+ return mask; -+} -+ -+static const struct file_operations scdrv_fops = { -+ .owner = THIS_MODULE, -+ .read = scdrv_read, -+ .write = scdrv_write, -+ .poll = scdrv_poll, -+ .open = scdrv_open, -+ .release = scdrv_release, -+ .llseek = noop_llseek, -+}; -+ -+static struct class *snsc_class; -+ -+/* -+ * scdrv_init -+ * -+ * Called at boot time to initialize the system controller communication -+ * facility. 
-+ */ -+int __init -+scdrv_init(void) -+{ -+ geoid_t geoid; -+ cnodeid_t cnode; -+ char devname[32]; -+ char *devnamep; -+ struct sysctl_data_s *scd; -+ void *salbuf; -+ dev_t first_dev, dev; -+ nasid_t event_nasid; -+ -+ if (!ia64_platform_is("sn2")) -+ return -ENODEV; -+ -+ event_nasid = ia64_sn_get_console_nasid(); -+ -+ snsc_class = class_create(THIS_MODULE, SYSCTL_BASENAME); -+ if (IS_ERR(snsc_class)) { -+ printk("%s: failed to allocate class\n", __func__); -+ return PTR_ERR(snsc_class); -+ } -+ -+ if (alloc_chrdev_region(&first_dev, 0, num_cnodes, -+ SYSCTL_BASENAME) < 0) { -+ printk("%s: failed to register SN system controller device\n", -+ __func__); -+ return -ENODEV; -+ } -+ -+ for (cnode = 0; cnode < num_cnodes; cnode++) { -+ geoid = cnodeid_get_geoid(cnode); -+ devnamep = devname; -+ format_module_id(devnamep, geo_module(geoid), -+ MODULE_FORMAT_BRIEF); -+ devnamep = devname + strlen(devname); -+ sprintf(devnamep, "^%d#%d", geo_slot(geoid), -+ geo_slab(geoid)); -+ -+ /* allocate sysctl device data */ -+ scd = kzalloc(sizeof (struct sysctl_data_s), -+ GFP_KERNEL); -+ if (!scd) { -+ printk("%s: failed to allocate device info" -+ "for %s/%s\n", __func__, -+ SYSCTL_BASENAME, devname); -+ continue; -+ } -+ -+ /* initialize sysctl device data fields */ -+ scd->scd_nasid = cnodeid_to_nasid(cnode); -+ if (!(salbuf = kmalloc(SCDRV_BUFSZ, GFP_KERNEL))) { -+ printk("%s: failed to allocate driver buffer" -+ "(%s%s)\n", __func__, -+ SYSCTL_BASENAME, devname); -+ kfree(scd); -+ continue; -+ } -+ -+ if (ia64_sn_irtr_init(scd->scd_nasid, salbuf, -+ SCDRV_BUFSZ) < 0) { -+ printk -+ ("%s: failed to initialize SAL for" -+ " system controller communication" -+ " (%s/%s): outdated PROM?\n", -+ __func__, SYSCTL_BASENAME, devname); -+ kfree(scd); -+ kfree(salbuf); -+ continue; -+ } -+ -+ dev = first_dev + cnode; -+ cdev_init(&scd->scd_cdev, &scdrv_fops); -+ if (cdev_add(&scd->scd_cdev, dev, 1)) { -+ printk("%s: failed to register system" -+ " controller device (%s%s)\n", -+ 
__func__, SYSCTL_BASENAME, devname); -+ kfree(scd); -+ kfree(salbuf); -+ continue; -+ } -+ -+ device_create(snsc_class, NULL, dev, NULL, -+ "%s", devname); -+ -+ ia64_sn_irtr_intr_enable(scd->scd_nasid, -+ 0 /*ignored */ , -+ SAL_IROUTER_INTR_RECV); -+ -+ /* on the console nasid, prepare to receive -+ * system controller environmental events -+ */ -+ if(scd->scd_nasid == event_nasid) { -+ scdrv_event_init(scd); -+ } -+ } -+ return 0; -+} -+device_initcall(scdrv_init); -diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c -index e5252ef3812f..6ae6241185ea 100644 ---- a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c -+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c -@@ -237,7 +237,7 @@ static int vmw_fifo_wait_noirq(struct vmw_private *dev_priv, - DRM_ERROR("SVGA device lockup.\n"); - break; - } -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - if (interruptible && signal_pending(current)) { - ret = -ERESTARTSYS; - break; -diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c b/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c -index 75f3efee21a4..09b1932ce85b 100644 ---- a/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c -+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c -@@ -203,7 +203,7 @@ int vmw_fallback_wait(struct vmw_private *dev_priv, - break; - } - if (lazy) -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - else if ((++count & 0x0F) == 0) { - /** - * FIXME: Use schedule_hr_timeout here for -diff --git a/drivers/hwmon/fam15h_power.c b/drivers/hwmon/fam15h_power.c -index 267eac00a3fb..352af68c6cd7 100644 ---- a/drivers/hwmon/fam15h_power.c -+++ b/drivers/hwmon/fam15h_power.c -@@ -225,7 +225,7 @@ static ssize_t power1_average_show(struct device *dev, - prev_ptsc[cu] = data->cpu_sw_pwr_ptsc[cu]; - } - -- leftover = schedule_timeout_interruptible(msecs_to_jiffies(data->power_period)); -+ leftover = schedule_msec_hrtimeout_interruptible((data->power_period)); - if (leftover) - return 0; - -diff --git a/drivers/iio/light/tsl2563.c b/drivers/iio/light/tsl2563.c -index 
d8c40a83097d..8332baf4961c 100644 ---- a/drivers/iio/light/tsl2563.c -+++ b/drivers/iio/light/tsl2563.c -@@ -269,11 +269,7 @@ static void tsl2563_wait_adc(struct tsl2563_chip *chip) - default: - delay = 402; - } -- /* -- * TODO: Make sure that we wait at least required delay but why we -- * have to extend it one tick more? -- */ -- schedule_timeout_interruptible(msecs_to_jiffies(delay) + 2); -+ schedule_msec_hrtimeout_interruptible(delay + 1); - } - - static int tsl2563_adjust_gainlevel(struct tsl2563_chip *chip, u16 adc) -diff --git a/drivers/media/i2c/msp3400-driver.c b/drivers/media/i2c/msp3400-driver.c -index 39530d43590e..a7caf2eb5771 100644 ---- a/drivers/media/i2c/msp3400-driver.c -+++ b/drivers/media/i2c/msp3400-driver.c -@@ -170,7 +170,7 @@ static int msp_read(struct i2c_client *client, int dev, int addr) - break; - dev_warn(&client->dev, "I/O error #%d (read 0x%02x/0x%02x)\n", err, - dev, addr); -- schedule_timeout_interruptible(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout_interruptible((10)); - } - if (err == 3) { - dev_warn(&client->dev, "resetting chip, sound will go off.\n"); -@@ -211,7 +211,7 @@ static int msp_write(struct i2c_client *client, int dev, int addr, int val) - break; - dev_warn(&client->dev, "I/O error #%d (write 0x%02x/0x%02x)\n", err, - dev, addr); -- schedule_timeout_interruptible(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout_interruptible((10)); - } - if (err == 3) { - dev_warn(&client->dev, "resetting chip, sound will go off.\n"); -diff --git a/drivers/media/pci/cx18/cx18-gpio.c b/drivers/media/pci/cx18/cx18-gpio.c -index cf7cfda94107..f63e17489547 100644 ---- a/drivers/media/pci/cx18/cx18-gpio.c -+++ b/drivers/media/pci/cx18/cx18-gpio.c -@@ -81,11 +81,11 @@ static void gpio_reset_seq(struct cx18 *cx, u32 active_lo, u32 active_hi, - - /* Assert */ - gpio_update(cx, mask, ~active_lo); -- schedule_timeout_uninterruptible(msecs_to_jiffies(assert_msecs)); -+ schedule_msec_hrtimeout_uninterruptible((assert_msecs)); - - /* 
Deassert */ - gpio_update(cx, mask, ~active_hi); -- schedule_timeout_uninterruptible(msecs_to_jiffies(recovery_msecs)); -+ schedule_msec_hrtimeout_uninterruptible((recovery_msecs)); - } - - /* -diff --git a/drivers/media/pci/ivtv/ivtv-gpio.c b/drivers/media/pci/ivtv/ivtv-gpio.c -index 856e7ab7f33e..766a26251337 100644 ---- a/drivers/media/pci/ivtv/ivtv-gpio.c -+++ b/drivers/media/pci/ivtv/ivtv-gpio.c -@@ -105,7 +105,7 @@ void ivtv_reset_ir_gpio(struct ivtv *itv) - curout = (curout & ~0xF) | 1; - write_reg(curout, IVTV_REG_GPIO_OUT); - /* We could use something else for smaller time */ -- schedule_timeout_interruptible(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout_interruptible((1)); - curout |= 2; - write_reg(curout, IVTV_REG_GPIO_OUT); - curdir &= ~0x80; -@@ -125,11 +125,11 @@ int ivtv_reset_tuner_gpio(void *dev, int component, int cmd, int value) - curout = read_reg(IVTV_REG_GPIO_OUT); - curout &= ~(1 << itv->card->xceive_pin); - write_reg(curout, IVTV_REG_GPIO_OUT); -- schedule_timeout_interruptible(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout_interruptible((1)); - - curout |= 1 << itv->card->xceive_pin; - write_reg(curout, IVTV_REG_GPIO_OUT); -- schedule_timeout_interruptible(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout_interruptible((1)); - return 0; - } - -diff --git a/drivers/media/pci/ivtv/ivtv-ioctl.c b/drivers/media/pci/ivtv/ivtv-ioctl.c -index 137853944e46..76830892f373 100644 ---- a/drivers/media/pci/ivtv/ivtv-ioctl.c -+++ b/drivers/media/pci/ivtv/ivtv-ioctl.c -@@ -1137,7 +1137,7 @@ void ivtv_s_std_dec(struct ivtv *itv, v4l2_std_id std) - TASK_UNINTERRUPTIBLE); - if ((read_reg(IVTV_REG_DEC_LINE_FIELD) >> 16) < 100) - break; -- schedule_timeout(msecs_to_jiffies(25)); -+ schedule_msec_hrtimeout((25)); - } - finish_wait(&itv->vsync_waitq, &wait); - mutex_lock(&itv->serialize_lock); -diff --git a/drivers/media/pci/ivtv/ivtv-streams.c b/drivers/media/pci/ivtv/ivtv-streams.c -index f7de9118f609..f39ad2952c0f 100644 ---- 
a/drivers/media/pci/ivtv/ivtv-streams.c -+++ b/drivers/media/pci/ivtv/ivtv-streams.c -@@ -849,7 +849,7 @@ int ivtv_stop_v4l2_encode_stream(struct ivtv_stream *s, int gop_end) - while (!test_bit(IVTV_F_I_EOS, &itv->i_flags) && - time_before(jiffies, - then + msecs_to_jiffies(2000))) { -- schedule_timeout(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout((10)); - } - - /* To convert jiffies to ms, we must multiply by 1000 -diff --git a/drivers/media/radio/radio-mr800.c b/drivers/media/radio/radio-mr800.c -index cb0437b4c331..163fffc0e1d4 100644 ---- a/drivers/media/radio/radio-mr800.c -+++ b/drivers/media/radio/radio-mr800.c -@@ -366,7 +366,7 @@ static int vidioc_s_hw_freq_seek(struct file *file, void *priv, - retval = -ENODATA; - break; - } -- if (schedule_timeout_interruptible(msecs_to_jiffies(10))) { -+ if (schedule_msec_hrtimeout_interruptible((10))) { - retval = -ERESTARTSYS; - break; - } -diff --git a/drivers/media/radio/radio-tea5777.c b/drivers/media/radio/radio-tea5777.c -index fb9de7bbcd19..e53cf45e7f3f 100644 ---- a/drivers/media/radio/radio-tea5777.c -+++ b/drivers/media/radio/radio-tea5777.c -@@ -235,7 +235,7 @@ static int radio_tea5777_update_read_reg(struct radio_tea5777 *tea, int wait) - } - - if (wait) { -- if (schedule_timeout_interruptible(msecs_to_jiffies(wait))) -+ if (schedule_msec_hrtimeout_interruptible((wait))) - return -ERESTARTSYS; - } - -diff --git a/drivers/media/radio/tea575x.c b/drivers/media/radio/tea575x.c -index b0303cf00387..0925b5065147 100644 ---- a/drivers/media/radio/tea575x.c -+++ b/drivers/media/radio/tea575x.c -@@ -401,7 +401,7 @@ int snd_tea575x_s_hw_freq_seek(struct file *file, struct snd_tea575x *tea, - for (;;) { - if (time_after(jiffies, timeout)) - break; -- if (schedule_timeout_interruptible(msecs_to_jiffies(10))) { -+ if (schedule_msec_hrtimeout_interruptible((10))) { - /* some signal arrived, stop search */ - tea->val &= ~TEA575X_BIT_SEARCH; - snd_tea575x_set_freq(tea); -diff --git a/drivers/mfd/ucb1x00-core.c 
b/drivers/mfd/ucb1x00-core.c -index b690796d24d4..448b13da62b4 100644 ---- a/drivers/mfd/ucb1x00-core.c -+++ b/drivers/mfd/ucb1x00-core.c -@@ -250,7 +250,7 @@ unsigned int ucb1x00_adc_read(struct ucb1x00 *ucb, int adc_channel, int sync) - break; - /* yield to other processes */ - set_current_state(TASK_INTERRUPTIBLE); -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - } - - return UCB_ADC_DAT(val); -diff --git a/drivers/misc/sgi-xp/xpc_channel.c b/drivers/misc/sgi-xp/xpc_channel.c -index 8e6607fc8a67..b9ab770bbdb5 100644 ---- a/drivers/misc/sgi-xp/xpc_channel.c -+++ b/drivers/misc/sgi-xp/xpc_channel.c -@@ -834,7 +834,7 @@ xpc_allocate_msg_wait(struct xpc_channel *ch) - - atomic_inc(&ch->n_on_msg_allocate_wq); - prepare_to_wait(&ch->msg_allocate_wq, &wait, TASK_INTERRUPTIBLE); -- ret = schedule_timeout(1); -+ ret = schedule_min_hrtimeout(); - finish_wait(&ch->msg_allocate_wq, &wait); - atomic_dec(&ch->n_on_msg_allocate_wq); - -diff --git a/drivers/net/caif/caif_hsi.c b/drivers/net/caif/caif_hsi.c -index bbb2575d4728..637757144221 100644 ---- a/drivers/net/caif/caif_hsi.c -+++ b/drivers/net/caif/caif_hsi.c -@@ -939,7 +939,7 @@ static void cfhsi_wake_down(struct work_struct *work) - break; - - set_current_state(TASK_INTERRUPTIBLE); -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - retry--; - } - -diff --git a/drivers/net/can/usb/peak_usb/pcan_usb.c b/drivers/net/can/usb/peak_usb/pcan_usb.c -index d2539c95adb6..0c2f31a03ce9 100644 ---- a/drivers/net/can/usb/peak_usb/pcan_usb.c -+++ b/drivers/net/can/usb/peak_usb/pcan_usb.c -@@ -242,7 +242,7 @@ static int pcan_usb_write_mode(struct peak_usb_device *dev, u8 onoff) - } else { - /* the PCAN-USB needs time to init */ - set_current_state(TASK_INTERRUPTIBLE); -- schedule_timeout(msecs_to_jiffies(PCAN_USB_STARTUP_TIMEOUT)); -+ schedule_msec_hrtimeout((PCAN_USB_STARTUP_TIMEOUT)); - } - - return err; -diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c -index f24a1b0b801f..972313b92b0a 100644 ---- 
a/drivers/net/usb/lan78xx.c -+++ b/drivers/net/usb/lan78xx.c -@@ -2676,7 +2676,7 @@ static void lan78xx_terminate_urbs(struct lan78xx_net *dev) - while (!skb_queue_empty(&dev->rxq) && - !skb_queue_empty(&dev->txq) && - !skb_queue_empty(&dev->done)) { -- schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS)); -+ schedule_msec_hrtimeout((UNLINK_TIMEOUT_MS)); - set_current_state(TASK_UNINTERRUPTIBLE); - netif_dbg(dev, ifdown, dev->net, - "waited for %d urb completions\n", temp); -diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c -index dde05e2fdc3e..fa6c1581136e 100644 ---- a/drivers/net/usb/usbnet.c -+++ b/drivers/net/usb/usbnet.c -@@ -767,7 +767,7 @@ static void wait_skb_queue_empty(struct sk_buff_head *q) - spin_lock_irqsave(&q->lock, flags); - while (!skb_queue_empty(q)) { - spin_unlock_irqrestore(&q->lock, flags); -- schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS)); -+ schedule_msec_hrtimeout((UNLINK_TIMEOUT_MS)); - set_current_state(TASK_UNINTERRUPTIBLE); - spin_lock_irqsave(&q->lock, flags); - } -diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2100.c b/drivers/net/wireless/intel/ipw2x00/ipw2100.c -index 8dfbaff2d1fe..d1d6b9777f47 100644 ---- a/drivers/net/wireless/intel/ipw2x00/ipw2100.c -+++ b/drivers/net/wireless/intel/ipw2x00/ipw2100.c -@@ -816,7 +816,7 @@ static int ipw2100_hw_send_command(struct ipw2100_priv *priv, - * doesn't seem to have as many firmware restart cycles... - * - * As a test, we're sticking in a 1/100s delay here */ -- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout_uninterruptible((10)); - - return 0; - -@@ -1267,7 +1267,7 @@ static int ipw2100_start_adapter(struct ipw2100_priv *priv) - IPW_DEBUG_FW("Waiting for f/w initialization to complete...\n"); - i = 5000; - do { -- schedule_timeout_uninterruptible(msecs_to_jiffies(40)); -+ schedule_msec_hrtimeout_uninterruptible((40)); - /* Todo... wait for sync command ... 
*/ - - read_register(priv->net_dev, IPW_REG_INTA, &inta); -diff --git a/drivers/parport/ieee1284.c b/drivers/parport/ieee1284.c -index 90fb73575495..c94048b048a5 100644 ---- a/drivers/parport/ieee1284.c -+++ b/drivers/parport/ieee1284.c -@@ -208,7 +208,7 @@ int parport_wait_peripheral(struct parport *port, - /* parport_wait_event didn't time out, but the - * peripheral wasn't actually ready either. - * Wait for another 10ms. */ -- schedule_timeout_interruptible(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout_interruptible((10)); - } - } - -diff --git a/drivers/parport/ieee1284_ops.c b/drivers/parport/ieee1284_ops.c -index 5d41dda6da4e..34705f6b423f 100644 ---- a/drivers/parport/ieee1284_ops.c -+++ b/drivers/parport/ieee1284_ops.c -@@ -537,7 +537,7 @@ size_t parport_ieee1284_ecp_read_data (struct parport *port, - /* Yield the port for a while. */ - if (count && dev->port->irq != PARPORT_IRQ_NONE) { - parport_release (dev); -- schedule_timeout_interruptible(msecs_to_jiffies(40)); -+ schedule_msec_hrtimeout_interruptible((40)); - parport_claim_or_block (dev); - } - else -diff --git a/drivers/platform/x86/intel_ips.c b/drivers/platform/x86/intel_ips.c -index bffe548187ee..c2918ee3e100 100644 ---- a/drivers/platform/x86/intel_ips.c -+++ b/drivers/platform/x86/intel_ips.c -@@ -798,7 +798,7 @@ static int ips_adjust(void *data) - ips_gpu_lower(ips); - - sleep: -- schedule_timeout_interruptible(msecs_to_jiffies(IPS_ADJUST_PERIOD)); -+ schedule_msec_hrtimeout_interruptible((IPS_ADJUST_PERIOD)); - } while (!kthread_should_stop()); - - dev_dbg(ips->dev, "ips-adjust thread stopped\n"); -@@ -974,7 +974,7 @@ static int ips_monitor(void *data) - seqno_timestamp = get_jiffies_64(); - - old_cpu_power = thm_readl(THM_CEC); -- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); -+ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); - - /* Collect an initial average */ - for (i = 0; i < IPS_SAMPLE_COUNT; i++) { -@@ -1001,7 +1001,7 @@ static int 
ips_monitor(void *data) - mchp_samples[i] = mchp; - } - -- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); -+ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); - if (kthread_should_stop()) - break; - } -@@ -1028,7 +1028,7 @@ static int ips_monitor(void *data) - * us to reduce the sample frequency if the CPU and GPU are idle. - */ - old_cpu_power = thm_readl(THM_CEC); -- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); -+ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); - last_sample_period = IPS_SAMPLE_PERIOD; - - timer_setup(&ips->timer, monitor_timeout, TIMER_DEFERRABLE); -diff --git a/drivers/rtc/rtc-wm8350.c b/drivers/rtc/rtc-wm8350.c -index 2018614f258f..fc19b312c345 100644 ---- a/drivers/rtc/rtc-wm8350.c -+++ b/drivers/rtc/rtc-wm8350.c -@@ -114,7 +114,7 @@ static int wm8350_rtc_settime(struct device *dev, struct rtc_time *tm) - /* Wait until confirmation of stopping */ - do { - rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); -- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout_uninterruptible((1)); - } while (--retries && !(rtc_ctrl & WM8350_RTC_STS)); - - if (!retries) { -@@ -197,7 +197,7 @@ static int wm8350_rtc_stop_alarm(struct wm8350 *wm8350) - /* Wait until confirmation of stopping */ - do { - rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); -- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout_uninterruptible((1)); - } while (retries-- && !(rtc_ctrl & WM8350_RTC_ALMSTS)); - - if (!(rtc_ctrl & WM8350_RTC_ALMSTS)) -@@ -220,7 +220,7 @@ static int wm8350_rtc_start_alarm(struct wm8350 *wm8350) - /* Wait until confirmation */ - do { - rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); -- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout_uninterruptible((1)); - } while (retries-- && rtc_ctrl & WM8350_RTC_ALMSTS); - - if (rtc_ctrl & WM8350_RTC_ALMSTS) -diff --git 
a/drivers/scsi/fnic/fnic_scsi.c b/drivers/scsi/fnic/fnic_scsi.c -index 80608b53897b..84051b538fa8 100644 ---- a/drivers/scsi/fnic/fnic_scsi.c -+++ b/drivers/scsi/fnic/fnic_scsi.c -@@ -216,7 +216,7 @@ int fnic_fw_reset_handler(struct fnic *fnic) - - /* wait for io cmpl */ - while (atomic_read(&fnic->in_flight)) -- schedule_timeout(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout((1)); - - spin_lock_irqsave(&fnic->wq_copy_lock[0], flags); - -@@ -2273,7 +2273,7 @@ static int fnic_clean_pending_aborts(struct fnic *fnic, - } - } - -- schedule_timeout(msecs_to_jiffies(2 * fnic->config.ed_tov)); -+ schedule_msec_hrtimeout((2 * fnic->config.ed_tov)); - - /* walk again to check, if IOs are still pending in fw */ - if (fnic_is_abts_pending(fnic, lr_sc)) -diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c -index 6822cd9ff8f1..ac3ad534be1a 100644 ---- a/drivers/scsi/lpfc/lpfc_scsi.c -+++ b/drivers/scsi/lpfc/lpfc_scsi.c -@@ -5176,7 +5176,7 @@ lpfc_reset_flush_io_context(struct lpfc_vport *vport, uint16_t tgt_id, - tgt_id, lun_id, context); - later = msecs_to_jiffies(2 * vport->cfg_devloss_tmo * 1000) + jiffies; - while (time_after(later, jiffies) && cnt) { -- schedule_timeout_uninterruptible(msecs_to_jiffies(20)); -+ schedule_msec_hrtimeout_uninterruptible((20)); - cnt = lpfc_sli_sum_iocb(vport, tgt_id, lun_id, context); - } - if (cnt) { -diff --git a/drivers/scsi/snic/snic_scsi.c b/drivers/scsi/snic/snic_scsi.c -index b3650c989ed4..7ed1fb285754 100644 ---- a/drivers/scsi/snic/snic_scsi.c -+++ b/drivers/scsi/snic/snic_scsi.c -@@ -2353,7 +2353,7 @@ snic_reset(struct Scsi_Host *shost, struct scsi_cmnd *sc) - - /* Wait for all the IOs that are entered in Qcmd */ - while (atomic_read(&snic->ios_inflight)) -- schedule_timeout(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout((1)); - - ret = snic_issue_hba_reset(snic, sc); - if (ret) { -diff --git a/drivers/staging/comedi/drivers/ni_mio_common.c b/drivers/staging/comedi/drivers/ni_mio_common.c -index 
f98e3ae27bff..0741c8352a6d 100644 ---- a/drivers/staging/comedi/drivers/ni_mio_common.c -+++ b/drivers/staging/comedi/drivers/ni_mio_common.c -@@ -4742,7 +4742,7 @@ static int cs5529_wait_for_idle(struct comedi_device *dev) - if ((status & NI67XX_CAL_STATUS_BUSY) == 0) - break; - set_current_state(TASK_INTERRUPTIBLE); -- if (schedule_timeout(1)) -+ if (schedule_min_hrtimeout()) - return -EIO; - } - if (i == timeout) { -diff --git a/drivers/staging/lustre/lnet/lnet/lib-eq.c b/drivers/staging/lustre/lnet/lnet/lib-eq.c -new file mode 100644 -index 000000000000..8cca151741b2 ---- /dev/null -+++ b/drivers/staging/lustre/lnet/lnet/lib-eq.c -@@ -0,0 +1,426 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * GPL HEADER START -+ * -+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License version 2 only, -+ * as published by the Free Software Foundation. -+ * -+ * This program is distributed in the hope that it will be useful, but -+ * WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ * General Public License version 2 for more details (a copy is included -+ * in the LICENSE file that accompanied this code). -+ * -+ * You should have received a copy of the GNU General Public License -+ * version 2 along with this program; If not, see -+ * http://www.gnu.org/licenses/gpl-2.0.html -+ * -+ * GPL HEADER END -+ */ -+/* -+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. -+ * Use is subject to license terms. -+ * -+ * Copyright (c) 2012, Intel Corporation. -+ */ -+/* -+ * This file is part of Lustre, http://www.lustre.org/ -+ * Lustre is a trademark of Sun Microsystems, Inc. 
-+ * -+ * lnet/lnet/lib-eq.c -+ * -+ * Library level Event queue management routines -+ */ -+ -+#define DEBUG_SUBSYSTEM S_LNET -+ -+#include -+ -+/** -+ * Create an event queue that has room for \a count number of events. -+ * -+ * The event queue is circular and older events will be overwritten by new -+ * ones if they are not removed in time by the user using the functions -+ * LNetEQGet(), LNetEQWait(), or LNetEQPoll(). It is up to the user to -+ * determine the appropriate size of the event queue to prevent this loss -+ * of events. Note that when EQ handler is specified in \a callback, no -+ * event loss can happen, since the handler is run for each event deposited -+ * into the EQ. -+ * -+ * \param count The number of events to be stored in the event queue. It -+ * will be rounded up to the next power of two. -+ * \param callback A handler function that runs when an event is deposited -+ * into the EQ. The constant value LNET_EQ_HANDLER_NONE can be used to -+ * indicate that no event handler is desired. -+ * \param handle On successful return, this location will hold a handle for -+ * the newly created EQ. -+ * -+ * \retval 0 On success. -+ * \retval -EINVAL If an parameter is not valid. -+ * \retval -ENOMEM If memory for the EQ can't be allocated. -+ * -+ * \see lnet_eq_handler_t for the discussion on EQ handler semantics. -+ */ -+int -+LNetEQAlloc(unsigned int count, lnet_eq_handler_t callback, -+ struct lnet_handle_eq *handle) -+{ -+ struct lnet_eq *eq; -+ -+ LASSERT(the_lnet.ln_refcount > 0); -+ -+ /* -+ * We need count to be a power of 2 so that when eq_{enq,deq}_seq -+ * overflow, they don't skip entries, so the queue has the same -+ * apparent capacity at all times -+ */ -+ if (count) -+ count = roundup_pow_of_two(count); -+ -+ if (callback != LNET_EQ_HANDLER_NONE && count) -+ CWARN("EQ callback is guaranteed to get every event, do you still want to set eqcount %d for polling event which will have locking overhead? 
Please contact with developer to confirm\n", count); -+ -+ /* -+ * count can be 0 if only need callback, we can eliminate -+ * overhead of enqueue event -+ */ -+ if (!count && callback == LNET_EQ_HANDLER_NONE) -+ return -EINVAL; -+ -+ eq = kzalloc(sizeof(*eq), GFP_NOFS); -+ if (!eq) -+ return -ENOMEM; -+ -+ if (count) { -+ eq->eq_events = kvmalloc_array(count, sizeof(struct lnet_event), -+ GFP_KERNEL | __GFP_ZERO); -+ if (!eq->eq_events) -+ goto failed; -+ /* -+ * NB allocator has set all event sequence numbers to 0, -+ * so all them should be earlier than eq_deq_seq -+ */ -+ } -+ -+ eq->eq_deq_seq = 1; -+ eq->eq_enq_seq = 1; -+ eq->eq_size = count; -+ eq->eq_callback = callback; -+ -+ eq->eq_refs = cfs_percpt_alloc(lnet_cpt_table(), -+ sizeof(*eq->eq_refs[0])); -+ if (!eq->eq_refs) -+ goto failed; -+ -+ /* MUST hold both exclusive lnet_res_lock */ -+ lnet_res_lock(LNET_LOCK_EX); -+ /* -+ * NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do -+ * both EQ lookup and poll event with only lnet_eq_wait_lock -+ */ -+ lnet_eq_wait_lock(); -+ -+ lnet_res_lh_initialize(&the_lnet.ln_eq_container, &eq->eq_lh); -+ list_add(&eq->eq_list, &the_lnet.ln_eq_container.rec_active); -+ -+ lnet_eq_wait_unlock(); -+ lnet_res_unlock(LNET_LOCK_EX); -+ -+ lnet_eq2handle(handle, eq); -+ return 0; -+ -+failed: -+ kvfree(eq->eq_events); -+ -+ if (eq->eq_refs) -+ cfs_percpt_free(eq->eq_refs); -+ -+ kfree(eq); -+ return -ENOMEM; -+} -+EXPORT_SYMBOL(LNetEQAlloc); -+ -+/** -+ * Release the resources associated with an event queue if it's idle; -+ * otherwise do nothing and it's up to the user to try again. -+ * -+ * \param eqh A handle for the event queue to be released. -+ * -+ * \retval 0 If the EQ is not in use and freed. -+ * \retval -ENOENT If \a eqh does not point to a valid EQ. -+ * \retval -EBUSY If the EQ is still in use by some MDs. 
-+ */ -+int -+LNetEQFree(struct lnet_handle_eq eqh) -+{ -+ struct lnet_eq *eq; -+ struct lnet_event *events = NULL; -+ int **refs = NULL; -+ int *ref; -+ int rc = 0; -+ int size = 0; -+ int i; -+ -+ LASSERT(the_lnet.ln_refcount > 0); -+ -+ lnet_res_lock(LNET_LOCK_EX); -+ /* -+ * NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do -+ * both EQ lookup and poll event with only lnet_eq_wait_lock -+ */ -+ lnet_eq_wait_lock(); -+ -+ eq = lnet_handle2eq(&eqh); -+ if (!eq) { -+ rc = -ENOENT; -+ goto out; -+ } -+ -+ cfs_percpt_for_each(ref, i, eq->eq_refs) { -+ LASSERT(*ref >= 0); -+ if (!*ref) -+ continue; -+ -+ CDEBUG(D_NET, "Event equeue (%d: %d) busy on destroy.\n", -+ i, *ref); -+ rc = -EBUSY; -+ goto out; -+ } -+ -+ /* stash for free after lock dropped */ -+ events = eq->eq_events; -+ size = eq->eq_size; -+ refs = eq->eq_refs; -+ -+ lnet_res_lh_invalidate(&eq->eq_lh); -+ list_del(&eq->eq_list); -+ kfree(eq); -+ out: -+ lnet_eq_wait_unlock(); -+ lnet_res_unlock(LNET_LOCK_EX); -+ -+ kvfree(events); -+ if (refs) -+ cfs_percpt_free(refs); -+ -+ return rc; -+} -+EXPORT_SYMBOL(LNetEQFree); -+ -+void -+lnet_eq_enqueue_event(struct lnet_eq *eq, struct lnet_event *ev) -+{ -+ /* MUST called with resource lock hold but w/o lnet_eq_wait_lock */ -+ int index; -+ -+ if (!eq->eq_size) { -+ LASSERT(eq->eq_callback != LNET_EQ_HANDLER_NONE); -+ eq->eq_callback(ev); -+ return; -+ } -+ -+ lnet_eq_wait_lock(); -+ ev->sequence = eq->eq_enq_seq++; -+ -+ LASSERT(eq->eq_size == LOWEST_BIT_SET(eq->eq_size)); -+ index = ev->sequence & (eq->eq_size - 1); -+ -+ eq->eq_events[index] = *ev; -+ -+ if (eq->eq_callback != LNET_EQ_HANDLER_NONE) -+ eq->eq_callback(ev); -+ -+ /* Wake anyone waiting in LNetEQPoll() */ -+ if (waitqueue_active(&the_lnet.ln_eq_waitq)) -+ wake_up_all(&the_lnet.ln_eq_waitq); -+ lnet_eq_wait_unlock(); -+} -+ -+static int -+lnet_eq_dequeue_event(struct lnet_eq *eq, struct lnet_event *ev) -+{ -+ int new_index = eq->eq_deq_seq & (eq->eq_size - 1); -+ struct lnet_event 
*new_event = &eq->eq_events[new_index]; -+ int rc; -+ -+ /* must called with lnet_eq_wait_lock hold */ -+ if (LNET_SEQ_GT(eq->eq_deq_seq, new_event->sequence)) -+ return 0; -+ -+ /* We've got a new event... */ -+ *ev = *new_event; -+ -+ CDEBUG(D_INFO, "event: %p, sequence: %lu, eq->size: %u\n", -+ new_event, eq->eq_deq_seq, eq->eq_size); -+ -+ /* ...but did it overwrite an event we've not seen yet? */ -+ if (eq->eq_deq_seq == new_event->sequence) { -+ rc = 1; -+ } else { -+ /* -+ * don't complain with CERROR: some EQs are sized small -+ * anyway; if it's important, the caller should complain -+ */ -+ CDEBUG(D_NET, "Event Queue Overflow: eq seq %lu ev seq %lu\n", -+ eq->eq_deq_seq, new_event->sequence); -+ rc = -EOVERFLOW; -+ } -+ -+ eq->eq_deq_seq = new_event->sequence + 1; -+ return rc; -+} -+ -+/** -+ * A nonblocking function that can be used to get the next event in an EQ. -+ * If an event handler is associated with the EQ, the handler will run before -+ * this function returns successfully. The event is removed from the queue. -+ * -+ * \param eventq A handle for the event queue. -+ * \param event On successful return (1 or -EOVERFLOW), this location will -+ * hold the next event in the EQ. -+ * -+ * \retval 0 No pending event in the EQ. -+ * \retval 1 Indicates success. -+ * \retval -ENOENT If \a eventq does not point to a valid EQ. -+ * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that -+ * at least one event between this event and the last event obtained from the -+ * EQ has been dropped due to limited space in the EQ. -+ */ -+ -+/** -+ * Block the calling process until there is an event in the EQ. -+ * If an event handler is associated with the EQ, the handler will run before -+ * this function returns successfully. This function returns the next event -+ * in the EQ and removes it from the EQ. -+ * -+ * \param eventq A handle for the event queue. 
-+ * \param event On successful return (1 or -EOVERFLOW), this location will -+ * hold the next event in the EQ. -+ * -+ * \retval 1 Indicates success. -+ * \retval -ENOENT If \a eventq does not point to a valid EQ. -+ * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that -+ * at least one event between this event and the last event obtained from the -+ * EQ has been dropped due to limited space in the EQ. -+ */ -+ -+static int -+lnet_eq_wait_locked(int *timeout_ms, long state) -+__must_hold(&the_lnet.ln_eq_wait_lock) -+{ -+ int tms = *timeout_ms; -+ int wait; -+ wait_queue_entry_t wl; -+ unsigned long now; -+ -+ if (!tms) -+ return -ENXIO; /* don't want to wait and no new event */ -+ -+ init_waitqueue_entry(&wl, current); -+ set_current_state(state); -+ add_wait_queue(&the_lnet.ln_eq_waitq, &wl); -+ -+ lnet_eq_wait_unlock(); -+ -+ if (tms < 0) { -+ schedule(); -+ } else { -+ now = jiffies; -+ schedule_msec_hrtimeout((tms)); -+ tms -= jiffies_to_msecs(jiffies - now); -+ if (tms < 0) /* no more wait but may have new event */ -+ tms = 0; -+ } -+ -+ wait = tms; /* might need to call here again */ -+ *timeout_ms = tms; -+ -+ lnet_eq_wait_lock(); -+ remove_wait_queue(&the_lnet.ln_eq_waitq, &wl); -+ -+ return wait; -+} -+ -+/** -+ * Block the calling process until there's an event from a set of EQs or -+ * timeout happens. -+ * -+ * If an event handler is associated with the EQ, the handler will run before -+ * this function returns successfully, in which case the corresponding event -+ * is consumed. -+ * -+ * LNetEQPoll() provides a timeout to allow applications to poll, block for a -+ * fixed period, or block indefinitely. -+ * -+ * \param eventqs,neq An array of EQ handles, and size of the array. -+ * \param timeout_ms Time in milliseconds to wait for an event to occur on -+ * one of the EQs. The constant LNET_TIME_FOREVER can be used to indicate an -+ * infinite timeout. 
-+ * \param interruptible, if true, use TASK_INTERRUPTIBLE, else TASK_NOLOAD -+ * \param event,which On successful return (1 or -EOVERFLOW), \a event will -+ * hold the next event in the EQs, and \a which will contain the index of the -+ * EQ from which the event was taken. -+ * -+ * \retval 0 No pending event in the EQs after timeout. -+ * \retval 1 Indicates success. -+ * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that -+ * at least one event between this event and the last event obtained from the -+ * EQ indicated by \a which has been dropped due to limited space in the EQ. -+ * \retval -ENOENT If there's an invalid handle in \a eventqs. -+ */ -+int -+LNetEQPoll(struct lnet_handle_eq *eventqs, int neq, int timeout_ms, -+ int interruptible, -+ struct lnet_event *event, int *which) -+{ -+ int wait = 1; -+ int rc; -+ int i; -+ -+ LASSERT(the_lnet.ln_refcount > 0); -+ -+ if (neq < 1) -+ return -ENOENT; -+ -+ lnet_eq_wait_lock(); -+ -+ for (;;) { -+ for (i = 0; i < neq; i++) { -+ struct lnet_eq *eq = lnet_handle2eq(&eventqs[i]); -+ -+ if (!eq) { -+ lnet_eq_wait_unlock(); -+ return -ENOENT; -+ } -+ -+ rc = lnet_eq_dequeue_event(eq, event); -+ if (rc) { -+ lnet_eq_wait_unlock(); -+ *which = i; -+ return rc; -+ } -+ } -+ -+ if (!wait) -+ break; -+ -+ /* -+ * return value of lnet_eq_wait_locked: -+ * -1 : did nothing and it's sure no new event -+ * 1 : sleep inside and wait until new event -+ * 0 : don't want to wait anymore, but might have new event -+ * so need to call dequeue again -+ */ -+ wait = lnet_eq_wait_locked(&timeout_ms, -+ interruptible ? 
TASK_INTERRUPTIBLE -+ : TASK_NOLOAD); -+ if (wait < 0) /* no new event */ -+ break; -+ } -+ -+ lnet_eq_wait_unlock(); -+ return 0; -+} -diff --git a/drivers/staging/rts5208/rtsx.c b/drivers/staging/rts5208/rtsx.c -index fa597953e9a0..685cf842badc 100644 ---- a/drivers/staging/rts5208/rtsx.c -+++ b/drivers/staging/rts5208/rtsx.c -@@ -490,7 +490,7 @@ static int rtsx_polling_thread(void *__dev) - - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); -- schedule_timeout(msecs_to_jiffies(POLLING_INTERVAL)); -+ schedule_msec_hrtimeout((POLLING_INTERVAL)); - - /* lock the device pointers */ - mutex_lock(&dev->dev_mutex); -diff --git a/drivers/staging/speakup/speakup_acntpc.c b/drivers/staging/speakup/speakup_acntpc.c -index c94328a5bd4a..6e7d4671aa69 100644 ---- a/drivers/staging/speakup/speakup_acntpc.c -+++ b/drivers/staging/speakup/speakup_acntpc.c -@@ -198,7 +198,7 @@ static void do_catch_up(struct spk_synth *synth) - full_time_val = full_time->u.n.value; - spin_unlock_irqrestore(&speakup_info.spinlock, flags); - if (synth_full()) { -- schedule_timeout(msecs_to_jiffies(full_time_val)); -+ schedule_msec_hrtimeout((full_time_val)); - continue; - } - set_current_state(TASK_RUNNING); -@@ -226,7 +226,7 @@ static void do_catch_up(struct spk_synth *synth) - jiffy_delta_val = jiffy_delta->u.n.value; - delay_time_val = delay_time->u.n.value; - spin_unlock_irqrestore(&speakup_info.spinlock, flags); -- schedule_timeout(msecs_to_jiffies(delay_time_val)); -+ schedule_msec_hrtimeout(delay_time_val); - jiff_max = jiffies + jiffy_delta_val; - } - } -diff --git a/drivers/staging/speakup/speakup_apollo.c b/drivers/staging/speakup/speakup_apollo.c -index 0877b4044c28..627102d048c1 100644 ---- a/drivers/staging/speakup/speakup_apollo.c -+++ b/drivers/staging/speakup/speakup_apollo.c -@@ -165,7 +165,7 @@ static void do_catch_up(struct spk_synth *synth) - if (!synth->io_ops->synth_out(synth, ch)) { - synth->io_ops->tiocmset(0, UART_MCR_RTS); - synth->io_ops->tiocmset(UART_MCR_RTS, 0); -- 
schedule_timeout(msecs_to_jiffies(full_time_val)); -+ schedule_msec_hrtimeout(full_time_val); - continue; - } - if (time_after_eq(jiffies, jiff_max) && (ch == SPACE)) { -diff --git a/drivers/staging/speakup/speakup_decext.c b/drivers/staging/speakup/speakup_decext.c -index ddbb7e97d118..f9502addc765 100644 ---- a/drivers/staging/speakup/speakup_decext.c -+++ b/drivers/staging/speakup/speakup_decext.c -@@ -176,7 +176,7 @@ static void do_catch_up(struct spk_synth *synth) - if (ch == '\n') - ch = 0x0D; - if (synth_full() || !synth->io_ops->synth_out(synth, ch)) { -- schedule_timeout(msecs_to_jiffies(delay_time_val)); -+ schedule_msec_hrtimeout(delay_time_val); - continue; - } - set_current_state(TASK_RUNNING); -diff --git a/drivers/staging/speakup/speakup_decpc.c b/drivers/staging/speakup/speakup_decpc.c -index 798c42dfa16c..d85b41db67a3 100644 ---- a/drivers/staging/speakup/speakup_decpc.c -+++ b/drivers/staging/speakup/speakup_decpc.c -@@ -394,7 +394,7 @@ static void do_catch_up(struct spk_synth *synth) - if (ch == '\n') - ch = 0x0D; - if (dt_sendchar(ch)) { -- schedule_timeout(msecs_to_jiffies(delay_time_val)); -+ schedule_msec_hrtimeout((delay_time_val)); - continue; - } - set_current_state(TASK_RUNNING); -diff --git a/drivers/staging/speakup/speakup_dectlk.c b/drivers/staging/speakup/speakup_dectlk.c -index dccb4ea29d37..8ecead307d04 100644 ---- a/drivers/staging/speakup/speakup_dectlk.c -+++ b/drivers/staging/speakup/speakup_dectlk.c -@@ -244,7 +244,7 @@ static void do_catch_up(struct spk_synth *synth) - if (ch == '\n') - ch = 0x0D; - if (synth_full_val || !synth->io_ops->synth_out(synth, ch)) { -- schedule_timeout(msecs_to_jiffies(delay_time_val)); -+ schedule_msec_hrtimeout(delay_time_val); - continue; - } - set_current_state(TASK_RUNNING); -diff --git a/drivers/staging/speakup/speakup_dtlk.c b/drivers/staging/speakup/speakup_dtlk.c -index dbebed0eeeec..6d83c13ca4a6 100644 ---- a/drivers/staging/speakup/speakup_dtlk.c -+++ 
b/drivers/staging/speakup/speakup_dtlk.c -@@ -211,7 +211,7 @@ static void do_catch_up(struct spk_synth *synth) - delay_time_val = delay_time->u.n.value; - spin_unlock_irqrestore(&speakup_info.spinlock, flags); - if (synth_full()) { -- schedule_timeout(msecs_to_jiffies(delay_time_val)); -+ schedule_msec_hrtimeout((delay_time_val)); - continue; - } - set_current_state(TASK_RUNNING); -@@ -227,7 +227,7 @@ static void do_catch_up(struct spk_synth *synth) - delay_time_val = delay_time->u.n.value; - jiffy_delta_val = jiffy_delta->u.n.value; - spin_unlock_irqrestore(&speakup_info.spinlock, flags); -- schedule_timeout(msecs_to_jiffies(delay_time_val)); -+ schedule_msec_hrtimeout((delay_time_val)); - jiff_max = jiffies + jiffy_delta_val; - } - } -diff --git a/drivers/staging/speakup/speakup_keypc.c b/drivers/staging/speakup/speakup_keypc.c -index 414827e888fc..cb31c9176daa 100644 ---- a/drivers/staging/speakup/speakup_keypc.c -+++ b/drivers/staging/speakup/speakup_keypc.c -@@ -199,7 +199,7 @@ static void do_catch_up(struct spk_synth *synth) - full_time_val = full_time->u.n.value; - spin_unlock_irqrestore(&speakup_info.spinlock, flags); - if (synth_full()) { -- schedule_timeout(msecs_to_jiffies(full_time_val)); -+ schedule_msec_hrtimeout((full_time_val)); - continue; - } - set_current_state(TASK_RUNNING); -@@ -232,7 +232,7 @@ static void do_catch_up(struct spk_synth *synth) - jiffy_delta_val = jiffy_delta->u.n.value; - delay_time_val = delay_time->u.n.value; - spin_unlock_irqrestore(&speakup_info.spinlock, flags); -- schedule_timeout(msecs_to_jiffies(delay_time_val)); -+ schedule_msec_hrtimeout(delay_time_val); - jiff_max = jiffies + jiffy_delta_val; - } - } -diff --git a/drivers/staging/speakup/synth.c b/drivers/staging/speakup/synth.c -index 3568bfb89912..0a80b3b098b2 100644 ---- a/drivers/staging/speakup/synth.c -+++ b/drivers/staging/speakup/synth.c -@@ -93,12 +93,8 @@ static void _spk_do_catch_up(struct spk_synth *synth, int unicode) - 
spin_unlock_irqrestore(&speakup_info.spinlock, flags); - if (ch == '\n') - ch = synth->procspeech; -- if (unicode) -- ret = synth->io_ops->synth_out_unicode(synth, ch); -- else -- ret = synth->io_ops->synth_out(synth, ch); -- if (!ret) { -- schedule_timeout(msecs_to_jiffies(full_time_val)); -+ if (!synth->io_ops->synth_out(synth, ch)) { -+ schedule_msec_hrtimeout(full_time_val); - continue; - } - if (time_after_eq(jiffies, jiff_max) && (ch == SPACE)) { -@@ -108,11 +104,9 @@ static void _spk_do_catch_up(struct spk_synth *synth, int unicode) - full_time_val = full_time->u.n.value; - spin_unlock_irqrestore(&speakup_info.spinlock, flags); - if (synth->io_ops->synth_out(synth, synth->procspeech)) -- schedule_timeout( -- msecs_to_jiffies(delay_time_val)); -+ schedule_msec_hrtimeout(delay_time_val); - else -- schedule_timeout( -- msecs_to_jiffies(full_time_val)); -+ schedule_msec_hrtimeout(full_time_val); - jiff_max = jiffies + jiffy_delta_val; - } - set_current_state(TASK_RUNNING); -diff --git a/drivers/staging/unisys/visornic/visornic_main.c b/drivers/staging/unisys/visornic/visornic_main.c -index 1d1440d43002..52fe89ae1d9d 100644 ---- a/drivers/staging/unisys/visornic/visornic_main.c -+++ b/drivers/staging/unisys/visornic/visornic_main.c -@@ -549,7 +549,7 @@ static int visornic_disable_with_timeout(struct net_device *netdev, - } - set_current_state(TASK_INTERRUPTIBLE); - spin_unlock_irqrestore(&devdata->priv_lock, flags); -- wait += schedule_timeout(msecs_to_jiffies(10)); -+ wait += schedule_msec_hrtimeout((10)); - spin_lock_irqsave(&devdata->priv_lock, flags); - } - -@@ -560,7 +560,7 @@ static int visornic_disable_with_timeout(struct net_device *netdev, - while (1) { - set_current_state(TASK_INTERRUPTIBLE); - spin_unlock_irqrestore(&devdata->priv_lock, flags); -- schedule_timeout(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout((10)); - spin_lock_irqsave(&devdata->priv_lock, flags); - if (atomic_read(&devdata->usage)) - break; -@@ -714,7 +714,7 @@ static int 
visornic_enable_with_timeout(struct net_device *netdev, - } - set_current_state(TASK_INTERRUPTIBLE); - spin_unlock_irqrestore(&devdata->priv_lock, flags); -- wait += schedule_timeout(msecs_to_jiffies(10)); -+ wait += schedule_msec_hrtimeout((10)); - spin_lock_irqsave(&devdata->priv_lock, flags); - } - -diff --git a/drivers/video/fbdev/omap/hwa742.c b/drivers/video/fbdev/omap/hwa742.c -index cfe63932f825..71c00ef772a3 100644 ---- a/drivers/video/fbdev/omap/hwa742.c -+++ b/drivers/video/fbdev/omap/hwa742.c -@@ -913,7 +913,7 @@ static void hwa742_resume(void) - if (hwa742_read_reg(HWA742_PLL_DIV_REG) & (1 << 7)) - break; - set_current_state(TASK_UNINTERRUPTIBLE); -- schedule_timeout(msecs_to_jiffies(5)); -+ schedule_msec_hrtimeout((5)); - } - hwa742_set_update_mode(hwa742.update_mode_before_suspend); - } -diff --git a/drivers/video/fbdev/pxafb.c b/drivers/video/fbdev/pxafb.c -index f70c9f79622e..0b363eaee24f 100644 ---- a/drivers/video/fbdev/pxafb.c -+++ b/drivers/video/fbdev/pxafb.c -@@ -1287,7 +1287,7 @@ static int pxafb_smart_thread(void *arg) - mutex_unlock(&fbi->ctrlr_lock); - - set_current_state(TASK_INTERRUPTIBLE); -- schedule_timeout(msecs_to_jiffies(30)); -+ schedule_msec_hrtimeout((30)); - } - - pr_debug("%s(): task ending\n", __func__); -diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c -index 37345fb6191d..3874c17d1bc5 100644 ---- a/fs/btrfs/inode-map.c -+++ b/fs/btrfs/inode-map.c -@@ -91,7 +91,7 @@ static int caching_kthread(void *data) - btrfs_release_path(path); - root->ino_cache_progress = last; - up_read(&fs_info->commit_root_sem); -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - goto again; - } else - continue; -diff --git a/fs/proc/base.c b/fs/proc/base.c -index ebea9501afb8..51c9346a69fe 100644 ---- a/fs/proc/base.c -+++ b/fs/proc/base.c -@@ -477,7 +477,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, - seq_puts(m, "0 0 0\n"); - else - seq_printf(m, "%llu %llu %lu\n", -- (unsigned long 
long)task->se.sum_exec_runtime, -+ (unsigned long long)tsk_seruntime(task), - (unsigned long long)task->sched_info.run_delay, - task->sched_info.pcount); - -diff --git a/include/linux/freezer.h b/include/linux/freezer.h -index 21f5aa0b217f..ee9b46394fdf 100644 ---- a/include/linux/freezer.h -+++ b/include/linux/freezer.h -@@ -297,6 +297,7 @@ static inline void set_freezable(void) {} - #define wait_event_freezekillable_unsafe(wq, condition) \ - wait_event_killable(wq, condition) - -+#define pm_freezing (false) - #endif /* !CONFIG_FREEZER */ - - #endif /* FREEZER_H_INCLUDED */ -diff --git a/include/linux/init_task.h b/include/linux/init_task.h -index 2c620d7ac432..73417df5daa2 100644 ---- a/include/linux/init_task.h -+++ b/include/linux/init_task.h -@@ -36,7 +36,11 @@ extern struct cred init_cred; - #define INIT_PREV_CPUTIME(x) - #endif - -+#ifdef CONFIG_SCHED_MUQSS -+#define INIT_TASK_COMM "MuQSS" -+#else - #define INIT_TASK_COMM "swapper" -+#endif - - /* Attach to the init_task data structure for proper alignment */ - #ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK -diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h -index e9bfe6972aed..16ba1c7e5bde 100644 ---- a/include/linux/ioprio.h -+++ b/include/linux/ioprio.h -@@ -53,6 +53,8 @@ enum { - */ - static inline int task_nice_ioprio(struct task_struct *task) - { -+ if (iso_task(task)) -+ return 0; - return (task_nice(task) + 20) / 5; - } - -diff --git a/include/linux/sched.h b/include/linux/sched.h -index 67a1d86981a9..95b427fdbb2e 100644 ---- a/include/linux/sched.h -+++ b/include/linux/sched.h -@@ -31,6 +31,9 @@ - #include - #include - #include -+#ifdef CONFIG_SCHED_MUQSS -+#include -+#endif - - /* task_struct member predeclarations (sorted alphabetically): */ - struct audit_context; -@@ -214,13 +217,40 @@ struct task_group; - - extern void scheduler_tick(void); - --#define MAX_SCHEDULE_TIMEOUT LONG_MAX -- -+#define MAX_SCHEDULE_TIMEOUT LONG_MAX - extern long schedule_timeout(long timeout); - extern long 
schedule_timeout_interruptible(long timeout); - extern long schedule_timeout_killable(long timeout); - extern long schedule_timeout_uninterruptible(long timeout); - extern long schedule_timeout_idle(long timeout); -+ -+#ifdef CONFIG_HIGH_RES_TIMERS -+extern long schedule_msec_hrtimeout(long timeout); -+extern long schedule_min_hrtimeout(void); -+extern long schedule_msec_hrtimeout_interruptible(long timeout); -+extern long schedule_msec_hrtimeout_uninterruptible(long timeout); -+#else -+static inline long schedule_msec_hrtimeout(long timeout) -+{ -+ return schedule_timeout(msecs_to_jiffies(timeout)); -+} -+ -+static inline long schedule_min_hrtimeout(void) -+{ -+ return schedule_timeout(1); -+} -+ -+static inline long schedule_msec_hrtimeout_interruptible(long timeout) -+{ -+ return schedule_timeout_interruptible(msecs_to_jiffies(timeout)); -+} -+ -+static inline long schedule_msec_hrtimeout_uninterruptible(long timeout) -+{ -+ return schedule_timeout_uninterruptible(msecs_to_jiffies(timeout)); -+} -+#endif -+ - asmlinkage void schedule(void); - extern void schedule_preempt_disabled(void); - asmlinkage void preempt_schedule_irq(void); -@@ -644,9 +674,11 @@ struct task_struct { - unsigned int flags; - unsigned int ptrace; - -+#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_MUQSS) -+ int on_cpu; -+#endif - #ifdef CONFIG_SMP - struct llist_node wake_entry; -- int on_cpu; - #ifdef CONFIG_THREAD_INFO_IN_TASK - /* Current CPU: */ - unsigned int cpu; -@@ -671,10 +703,25 @@ struct task_struct { - int static_prio; - int normal_prio; - unsigned int rt_priority; -+#ifdef CONFIG_SCHED_MUQSS -+ int time_slice; -+ u64 deadline; -+ skiplist_node node; /* Skip list node */ -+ u64 last_ran; -+ u64 sched_time; /* sched_clock time spent running */ -+#ifdef CONFIG_SMT_NICE -+ int smt_bias; /* Policy/nice level bias across smt siblings */ -+#endif -+#ifdef CONFIG_HOTPLUG_CPU -+ bool zerobound; /* Bound to CPU0 for hotplug */ -+#endif -+ unsigned long rt_timeout; -+#else /* 
CONFIG_SCHED_MUQSS */ - - const struct sched_class *sched_class; - struct sched_entity se; - struct sched_rt_entity rt; -+#endif - #ifdef CONFIG_CGROUP_SCHED - struct task_group *sched_task_group; - #endif -@@ -839,6 +886,10 @@ struct task_struct { - #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME - u64 utimescaled; - u64 stimescaled; -+#endif -+#ifdef CONFIG_SCHED_MUQSS -+ /* Unbanked cpu time */ -+ unsigned long utime_ns, stime_ns; - #endif - u64 gtime; - struct prev_cputime prev_cputime; -@@ -1283,6 +1334,40 @@ struct task_struct { - */ - }; - -+#ifdef CONFIG_SCHED_MUQSS -+#define tsk_seruntime(t) ((t)->sched_time) -+#define tsk_rttimeout(t) ((t)->rt_timeout) -+ -+static inline void tsk_cpus_current(struct task_struct *p) -+{ -+} -+ -+void print_scheduler_version(void); -+ -+static inline bool iso_task(struct task_struct *p) -+{ -+ return (p->policy == SCHED_ISO); -+} -+#else /* CFS */ -+#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) -+#define tsk_rttimeout(t) ((t)->rt.timeout) -+ -+static inline void tsk_cpus_current(struct task_struct *p) -+{ -+ p->nr_cpus_allowed = current->nr_cpus_allowed; -+} -+ -+static inline void print_scheduler_version(void) -+{ -+ printk(KERN_INFO "CFS CPU scheduler.\n"); -+} -+ -+static inline bool iso_task(struct task_struct *p) -+{ -+ return false; -+} -+#endif /* CONFIG_SCHED_MUQSS */ -+ - static inline struct pid *task_pid(struct task_struct *task) - { - return task->thread_pid; -diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h -index 1aff00b65f3c..73d6319a856a 100644 ---- a/include/linux/sched/deadline.h -+++ b/include/linux/sched/deadline.h -@@ -28,7 +28,16 @@ static inline bool dl_time_before(u64 a, u64 b) - #ifdef CONFIG_SMP - - struct root_domain; -+#ifdef CONFIG_SCHED_MUQSS -+static inline void dl_clear_root_domain(struct root_domain *rd) -+{ -+} -+static inline void dl_add_task_root_domain(struct task_struct *p) -+{ -+} -+#else /* CONFIG_SCHED_MUQSS */ - extern void dl_add_task_root_domain(struct 
task_struct *p); - extern void dl_clear_root_domain(struct root_domain *rd); -+#endif /* CONFIG_SCHED_MUQSS */ - - #endif /* CONFIG_SMP */ -diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h -index 1abe91ff6e4a..20ba383562b0 100644 ---- a/include/linux/sched/nohz.h -+++ b/include/linux/sched/nohz.h -@@ -13,7 +13,7 @@ extern int get_nohz_timer_target(void); - static inline void nohz_balance_enter_idle(int cpu) { } - #endif - --#ifdef CONFIG_NO_HZ_COMMON -+#if defined(CONFIG_NO_HZ_COMMON) && !defined(CONFIG_SCHED_MUQSS) - void calc_load_nohz_start(void); - void calc_load_nohz_stop(void); - #else -diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h -index 7d64feafc408..43c9d9e50c09 100644 ---- a/include/linux/sched/prio.h -+++ b/include/linux/sched/prio.h -@@ -20,8 +20,20 @@ - */ - - #define MAX_USER_RT_PRIO 100 -+ -+#ifdef CONFIG_SCHED_MUQSS -+/* Note different MAX_RT_PRIO */ -+#define MAX_RT_PRIO (MAX_USER_RT_PRIO + 1) -+ -+#define ISO_PRIO (MAX_RT_PRIO) -+#define NORMAL_PRIO (MAX_RT_PRIO + 1) -+#define IDLE_PRIO (MAX_RT_PRIO + 2) -+#define PRIO_LIMIT ((IDLE_PRIO) + 1) -+#else /* CONFIG_SCHED_MUQSS */ - #define MAX_RT_PRIO MAX_USER_RT_PRIO - -+#endif /* CONFIG_SCHED_MUQSS */ -+ - #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) - #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) - -diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h -index e5af028c08b4..010b2244e0b6 100644 ---- a/include/linux/sched/rt.h -+++ b/include/linux/sched/rt.h -@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) - - if (policy == SCHED_FIFO || policy == SCHED_RR) - return true; -+#ifndef CONFIG_SCHED_MUQSS - if (policy == SCHED_DEADLINE) - return true; -+#endif - return false; - } - -diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h -index 4b1c3b664f51..a9671b48799c 100644 ---- a/include/linux/sched/task.h -+++ b/include/linux/sched/task.h -@@ -99,7 +99,7 @@ extern long kernel_wait4(pid_t, int __user 
*, int, struct rusage *); - extern void free_task(struct task_struct *tsk); - - /* sched_exec is called by processes performing an exec */ --#ifdef CONFIG_SMP -+#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_MUQSS) - extern void sched_exec(void); - #else - #define sched_exec() {} -diff --git a/include/linux/skip_list.h b/include/linux/skip_list.h -new file mode 100644 -index 000000000000..d4be84ba273b ---- /dev/null -+++ b/include/linux/skip_list.h -@@ -0,0 +1,33 @@ -+#ifndef _LINUX_SKIP_LISTS_H -+#define _LINUX_SKIP_LISTS_H -+typedef u64 keyType; -+typedef void *valueType; -+ -+typedef struct nodeStructure skiplist_node; -+ -+struct nodeStructure { -+ int level; /* Levels in this structure */ -+ keyType key; -+ valueType value; -+ skiplist_node *next[8]; -+ skiplist_node *prev[8]; -+}; -+ -+typedef struct listStructure { -+ int entries; -+ int level; /* Maximum level of the list -+ (1 more than the number of levels in the list) */ -+ skiplist_node *header; /* pointer to header */ -+} skiplist; -+ -+void skiplist_init(skiplist_node *slnode); -+skiplist *new_skiplist(skiplist_node *slnode); -+void free_skiplist(skiplist *l); -+void skiplist_node_init(skiplist_node *node); -+void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed); -+void skiplist_delete(skiplist *l, skiplist_node *node); -+ -+static inline bool skiplist_node_empty(skiplist_node *node) { -+ return (!node->next[0]); -+} -+#endif /* _LINUX_SKIP_LISTS_H */ -diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h -index 25b4fa00bad1..c2503cd28025 100644 ---- a/include/uapi/linux/sched.h -+++ b/include/uapi/linux/sched.h -@@ -84,9 +84,16 @@ struct clone_args { - #define SCHED_FIFO 1 - #define SCHED_RR 2 - #define SCHED_BATCH 3 --/* SCHED_ISO: reserved but not implemented yet */ -+/* SCHED_ISO: Implemented on MuQSS only */ - #define SCHED_IDLE 5 -+#ifdef CONFIG_SCHED_MUQSS -+#define SCHED_ISO 4 -+#define SCHED_IDLEPRIO SCHED_IDLE -+#define 
SCHED_MAX (SCHED_IDLEPRIO) -+#define SCHED_RANGE(policy) ((policy) <= SCHED_MAX) -+#else /* CONFIG_SCHED_MUQSS */ - #define SCHED_DEADLINE 6 -+#endif /* CONFIG_SCHED_MUQSS */ - - /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ - #define SCHED_RESET_ON_FORK 0x40000000 -diff --git a/init/Kconfig b/init/Kconfig -index b4daad2bac23..da90d33ba4b3 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -73,6 +73,18 @@ config THREAD_INFO_IN_TASK - - menu "General setup" - -+config SCHED_MUQSS -+ bool "MuQSS cpu scheduler" -+ select HIGH_RES_TIMERS -+ ---help--- -+ The Multiple Queue Skiplist Scheduler for excellent interactivity and -+ responsiveness on the desktop and highly scalable deterministic -+ low latency on any hardware. -+ -+ Say Y here. -+ default y -+ -+ - config BROKEN - bool - -@@ -802,6 +814,7 @@ config NUMA_BALANCING - depends on ARCH_SUPPORTS_NUMA_BALANCING - depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY - depends on SMP && NUMA && MIGRATION -+ depends on !SCHED_MUQSS - help - This option adds support for automatic NUMA aware memory/task placement. - The mechanism is quite primitive and is based on migrating memory when -@@ -901,9 +914,13 @@ menuconfig CGROUP_SCHED - help - This feature lets CPU scheduler recognize task groups and control CPU - bandwidth allocation to such task groups. It uses cgroups to group -- tasks. -+ tasks. In combination with MuQSS this is purely a STUB to create the -+ files associated with the CPU controller cgroup but most of the -+ controls do nothing. This is useful for working in environments and -+ with applications that will only work if this control group is -+ present. 
- --if CGROUP_SCHED -+if CGROUP_SCHED && !SCHED_MUQSS - config FAIR_GROUP_SCHED - bool "Group scheduling for SCHED_OTHER" - depends on CGROUP_SCHED -@@ -1032,6 +1049,7 @@ config CGROUP_DEVICE - - config CGROUP_CPUACCT - bool "Simple CPU accounting controller" -+ depends on !SCHED_MUQSS - help - Provides a simple controller for monitoring the - total CPU consumed by the tasks in a cgroup. -@@ -1150,6 +1168,7 @@ config CHECKPOINT_RESTORE - - config SCHED_AUTOGROUP - bool "Automatic process group scheduling" -+ depends on !SCHED_MUQSS - select CGROUPS - select CGROUP_SCHED - select FAIR_GROUP_SCHED -diff --git a/init/init_task.c b/init/init_task.c -index 9e5cbe5eab7b..5c2bcbf25add 100644 ---- a/init/init_task.c -+++ b/init/init_task.c -@@ -66,9 +66,17 @@ struct task_struct init_task - .stack = init_stack, - .usage = REFCOUNT_INIT(2), - .flags = PF_KTHREAD, -+#ifdef CONFIG_SCHED_MUQSS -+ .prio = NORMAL_PRIO, -+ .static_prio = MAX_PRIO - 20, -+ .normal_prio = NORMAL_PRIO, -+ .deadline = 0, -+ .time_slice = 1000000, -+#else - .prio = MAX_PRIO - 20, - .static_prio = MAX_PRIO - 20, - .normal_prio = MAX_PRIO - 20, -+#endif - .policy = SCHED_NORMAL, - .cpus_ptr = &init_task.cpus_mask, - .cpus_mask = CPU_MASK_ALL, -@@ -78,6 +86,7 @@ struct task_struct init_task - .restart_block = { - .fn = do_no_restart_syscall, - }, -+#ifndef CONFIG_SCHED_MUQSS - .se = { - .group_node = LIST_HEAD_INIT(init_task.se.group_node), - }, -@@ -85,6 +94,7 @@ struct task_struct init_task - .run_list = LIST_HEAD_INIT(init_task.rt.run_list), - .time_slice = RR_TIMESLICE, - }, -+#endif - .tasks = LIST_HEAD_INIT(init_task.tasks), - #ifdef CONFIG_SMP - .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), -diff --git a/init/main.c b/init/main.c -index 91f6ebb30ef0..22792032de64 100644 ---- a/init/main.c -+++ b/init/main.c -@@ -1124,6 +1124,8 @@ static int __ref kernel_init(void *unused) - - rcu_end_inkernel_boot(); - -+ print_scheduler_version(); -+ - if (ramdisk_execute_command) { - ret 
= run_init_process(ramdisk_execute_command); - if (!ret) -diff --git a/kernel/Kconfig.MuQSS b/kernel/Kconfig.MuQSS -new file mode 100644 -index 000000000000..a6a58781ef91 ---- /dev/null -+++ b/kernel/Kconfig.MuQSS -@@ -0,0 +1,105 @@ -+choice -+ prompt "CPU scheduler runqueue sharing" -+ default RQ_MC if SCHED_MUQSS -+ default RQ_NONE -+ -+config RQ_NONE -+ bool "No sharing" -+ help -+ This is the default behaviour where the CPU scheduler has one runqueue -+ per CPU, whether it is a physical or logical CPU (hyperthread). -+ -+ This can still be enabled runtime with the boot parameter -+ rqshare=none -+ -+ If unsure, say N. -+ -+config RQ_SMT -+ bool "SMT (hyperthread) siblings" -+ depends on SCHED_SMT && SCHED_MUQSS -+ -+ help -+ With this option enabled, the CPU scheduler will have one runqueue -+ shared by SMT (hyperthread) siblings. As these logical cores share -+ one physical core, sharing the runqueue resource can lead to decreased -+ overhead, lower latency and higher throughput. -+ -+ This can still be enabled runtime with the boot parameter -+ rqshare=smt -+ -+ If unsure, say N. -+ -+config RQ_MC -+ bool "Multicore siblings" -+ depends on SCHED_MC && SCHED_MUQSS -+ help -+ With this option enabled, the CPU scheduler will have one runqueue -+ shared by multicore siblings in addition to any SMT siblings. -+ As these physical cores share caches, sharing the runqueue resource -+ will lead to lower latency, but its effects on overhead and throughput -+ are less predictable. As a general rule, 6 or fewer cores will likely -+ benefit from this, while larger CPUs will only derive a latency -+ benefit. If your workloads are primarily single threaded, this will -+ possibly worsen throughput. If you are only concerned about latency -+ then enable this regardless of how many cores you have. -+ -+ This can still be enabled runtime with the boot parameter -+ rqshare=mc -+ -+ If unsure, say Y. 
-+ -+config RQ_MC_LLC -+ bool "Multicore siblings (LLC)" -+ depends on SCHED_MC && SCHED_MUQSS -+ help -+ With this option enabled, the CPU scheduler will behave similarly as -+ with "Multicore siblings". -+ This option takes LLC cache into account when scheduling tasks. -+ Option may benefit CPUs with multiple LLC caches, such as Ryzen -+ and Xeon CPUs. -+ -+ This can still be enabled runtime with the boot parameter -+ rqshare=llc -+ -+ If unsure, say N. -+ -+config RQ_SMP -+ bool "Symmetric Multi-Processing" -+ depends on SMP && SCHED_MUQSS -+ help -+ With this option enabled, the CPU scheduler will have one runqueue -+ shared by all physical CPUs unless they are on separate NUMA nodes. -+ As physical CPUs usually do not share resources, sharing the runqueue -+ will normally worsen throughput but improve latency. If you only -+ care about latency enable this. -+ -+ This can still be enabled runtime with the boot parameter -+ rqshare=smp -+ -+ If unsure, say N. -+ -+config RQ_ALL -+ bool "NUMA" -+ depends on SMP && SCHED_MUQSS -+ help -+ With this option enabled, the CPU scheduler will have one runqueue -+ regardless of the architecture configuration, including across NUMA -+ nodes. This can substantially decrease throughput in NUMA -+ configurations, but light NUMA designs will not be dramatically -+ affected. This option should only be chosen if latency is the prime -+ concern. -+ -+ This can still be enabled runtime with the boot parameter -+ rqshare=all -+ -+ If unsure, say N. 
-+endchoice -+ -+config SHARERQ -+ int -+ default 0 if RQ_NONE -+ default 1 if RQ_SMT -+ default 2 if RQ_MC -+ default 3 if RQ_MC_LLC -+ default 4 if RQ_SMP -+ default 5 if RQ_ALL -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 38ef6d06888e..89ed751ac4e4 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -5,7 +5,8 @@ - - choice - prompt "Timer frequency" -- default HZ_250 -+ default HZ_100 if SCHED_MUQSS -+ default HZ_250_NODEF if !SCHED_MUQSS - help - Allows the configuration of the timer frequency. It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -20,11 +21,18 @@ choice - config HZ_100 - bool "100 HZ" - help -+ 100 Hz is a suitable choice in combination with MuQSS which does -+ not rely on ticks for rescheduling interrupts, and is not Hz limited -+ for timeouts and sleeps from both the kernel and userspace. -+ This allows us to benefit from the lower overhead and higher -+ throughput of fewer timer ticks. -+ -+ Non-MuQSS kernels: - 100 Hz is a typical choice for servers, SMP and NUMA systems - with lots of processors that may show reduced performance if - too many timer interrupts are occurring. - -- config HZ_250 -+ config HZ_250_NODEF - bool "250 HZ" - help - 250 Hz is a good compromise choice allowing server performance -@@ -32,7 +40,10 @@ choice - on SMP and NUMA systems. If you are going to be using NTSC video - or multimedia, selected 300Hz instead. - -- config HZ_300 -+ 250 Hz is the default choice for the mainline scheduler but not -+ advantageous in combination with MuQSS. -+ -+ config HZ_300_NODEF - bool "300 HZ" - help - 300 Hz is a good compromise choice allowing server performance -@@ -40,7 +51,7 @@ choice - on SMP and NUMA systems and exactly dividing by both PAL and - NTSC frame rates for video and multimedia work. 
- -- config HZ_1000 -+ config HZ_1000_NODEF - bool "1000 HZ" - help - 1000 Hz is the preferred choice for desktop systems and other -@@ -51,9 +62,9 @@ endchoice - config HZ - int - default 100 if HZ_100 -- default 250 if HZ_250 -- default 300 if HZ_300 -- default 1000 if HZ_1000 -+ default 250 if HZ_250_NODEF -+ default 300 if HZ_300_NODEF -+ default 1000 if HZ_1000_NODEF - - config SCHED_HRTICK - def_bool HIGH_RES_TIMERS -diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt -index deff97217496..883998dd0437 100644 ---- a/kernel/Kconfig.preempt -+++ b/kernel/Kconfig.preempt -@@ -2,7 +2,7 @@ - - choice - prompt "Preemption Model" -- default PREEMPT_NONE -+ default PREEMPT - - config PREEMPT_NONE - bool "No Forced Preemption (Server)" -@@ -18,7 +18,7 @@ config PREEMPT_NONE - latencies. - - config PREEMPT_VOLUNTARY -- bool "Voluntary Kernel Preemption (Desktop)" -+ bool "Voluntary Kernel Preemption (Nothing)" - depends on !ARCH_NO_PREEMPT - help - This option reduces the latency of the kernel by adding more -@@ -33,7 +33,8 @@ config PREEMPT_VOLUNTARY - applications to run more 'smoothly' even when the system is - under load. - -- Select this if you are building a kernel for a desktop system. -+ Select this for no system in particular (choose Preemptible -+ instead on a desktop if you know what's good for you). 
- - config PREEMPT - bool "Preemptible Kernel (Low-Latency Desktop)" -diff --git a/kernel/Makefile b/kernel/Makefile -index daad787fb795..9bb44fc4ef5b 100644 ---- a/kernel/Makefile -+++ b/kernel/Makefile -@@ -10,7 +10,7 @@ obj-y = fork.o exec_domain.o panic.o \ - extable.o params.o \ - kthread.o sys_ni.o nsproxy.o \ - notifier.o ksysfs.o cred.o reboot.o \ -- async.o range.o smpboot.o ucount.o -+ async.o range.o smpboot.o ucount.o skip_list.o - - obj-$(CONFIG_MODULES) += kmod.o - obj-$(CONFIG_MULTIUSER) += groups.o -diff --git a/kernel/delayacct.c b/kernel/delayacct.c -index 27725754ac99..769d773c7182 100644 ---- a/kernel/delayacct.c -+++ b/kernel/delayacct.c -@@ -106,7 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) - */ - t1 = tsk->sched_info.pcount; - t2 = tsk->sched_info.run_delay; -- t3 = tsk->se.sum_exec_runtime; -+ t3 = tsk_seruntime(tsk); - - d->cpu_count += t1; - -diff --git a/kernel/exit.c b/kernel/exit.c -index a46a50d67002..58043176b285 100644 ---- a/kernel/exit.c -+++ b/kernel/exit.c -@@ -131,7 +131,7 @@ static void __exit_signal(struct task_struct *tsk) - sig->curr_target = next_thread(tsk); - } - -- add_device_randomness((const void*) &tsk->se.sum_exec_runtime, -+ add_device_randomness((const void*) &tsk_seruntime(tsk), - sizeof(unsigned long long)); - - /* -@@ -152,7 +152,7 @@ static void __exit_signal(struct task_struct *tsk) - sig->inblock += task_io_get_inblock(tsk); - sig->oublock += task_io_get_oublock(tsk); - task_io_accounting_add(&sig->ioac, &tsk->ioac); -- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; -+ sig->sum_sched_runtime += tsk_seruntime(tsk); - sig->nr_threads--; - __unhash_process(tsk, group_dead); - write_sequnlock(&sig->stats_lock); -diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig -index f92d9a687372..d17db0ff775f 100644 ---- a/kernel/irq/Kconfig -+++ b/kernel/irq/Kconfig -@@ -111,6 +111,23 @@ config GENERIC_IRQ_RESERVATION_MODE - config IRQ_FORCED_THREADING - bool - -+config 
FORCE_IRQ_THREADING -+ bool "Make IRQ threading compulsory" -+ depends on IRQ_FORCED_THREADING -+ default n -+ ---help--- -+ -+ Make IRQ threading mandatory for any IRQ handlers that support it -+ instead of being optional and requiring the threadirqs kernel -+ parameter. Instead they can be optionally disabled with the -+ nothreadirqs kernel parameter. -+ -+ Enabling this may make some architectures not boot with runqueue -+ sharing and MuQSS. -+ -+ Enable if you are building for a desktop or low latency system, -+ otherwise say N. -+ - config SPARSE_IRQ - bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ - ---help--- -diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c -index 1753486b440c..f43423737493 100644 ---- a/kernel/irq/manage.c -+++ b/kernel/irq/manage.c -@@ -24,9 +24,20 @@ - #include "internals.h" - - #if defined(CONFIG_IRQ_FORCED_THREADING) && !defined(CONFIG_PREEMPT_RT) -+#ifdef CONFIG_FORCE_IRQ_THREADING -+__read_mostly bool force_irqthreads = true; -+#else - __read_mostly bool force_irqthreads; -+#endif - EXPORT_SYMBOL_GPL(force_irqthreads); - -+static int __init setup_noforced_irqthreads(char *arg) -+{ -+ force_irqthreads = false; -+ return 0; -+} -+early_param("nothreadirqs", setup_noforced_irqthreads); -+ - static int __init setup_forced_irqthreads(char *arg) - { - force_irqthreads = true; -diff --git a/kernel/kthread.c b/kernel/kthread.c -index b262f47046ca..9797ad652268 100644 ---- a/kernel/kthread.c -+++ b/kernel/kthread.c -@@ -433,6 +433,34 @@ void kthread_bind(struct task_struct *p, unsigned int cpu) - } - EXPORT_SYMBOL(kthread_bind); - -+#if defined(CONFIG_SCHED_MUQSS) && defined(CONFIG_SMP) -+extern void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); -+ -+/* -+ * new_kthread_bind is a special variant of __kthread_bind_mask. 
-+ * For new threads to work on muqss we want to call do_set_cpus_allowed -+ * without the task_cpu being set and the task rescheduled until they're -+ * rescheduled on their own so we call __do_set_cpus_allowed directly which -+ * only changes the cpumask. This is particularly important for smpboot threads -+ * to work. -+ */ -+static void new_kthread_bind(struct task_struct *p, unsigned int cpu) -+{ -+ unsigned long flags; -+ -+ if (WARN_ON(!wait_task_inactive(p, TASK_UNINTERRUPTIBLE))) -+ return; -+ -+ /* It's safe because the task is inactive. */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ __do_set_cpus_allowed(p, cpumask_of(cpu)); -+ p->flags |= PF_NO_SETAFFINITY; -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+} -+#else -+#define new_kthread_bind(p, cpu) kthread_bind(p, cpu) -+#endif -+ - /** - * kthread_create_on_cpu - Create a cpu bound kthread - * @threadfn: the function to run until signal_pending(current). -@@ -454,7 +482,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), - cpu); - if (IS_ERR(p)) - return p; -- kthread_bind(p, cpu); -+ new_kthread_bind(p, cpu); - /* CPU hotplug need to bind once again when unparking the thread. */ - set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags); - to_kthread(p)->cpu = cpu; -diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c -index cdf318d86dd6..304c0c8c2bea 100644 ---- a/kernel/livepatch/transition.c -+++ b/kernel/livepatch/transition.c -@@ -282,7 +282,7 @@ static bool klp_try_switch_task(struct task_struct *task) - { - static char err_buf[STACK_ERR_BUF_SIZE]; - struct rq *rq; -- struct rq_flags flags; -+ struct rq_flags rf; - int ret; - bool success = false; - -@@ -304,7 +304,7 @@ static bool klp_try_switch_task(struct task_struct *task) - * functions. If all goes well, switch the task to the target patch - * state. 
- */ -- rq = task_rq_lock(task, &flags); -+ rq = task_rq_lock(task, &rf); - - if (task_running(rq, task) && task != current) { - snprintf(err_buf, STACK_ERR_BUF_SIZE, -@@ -323,7 +323,7 @@ static bool klp_try_switch_task(struct task_struct *task) - task->patch_state = klp_target_state; - - done: -- task_rq_unlock(rq, task, &flags); -+ task_rq_unlock(rq, task, &rf); - - /* - * Due to console deadlock issues, pr_debug() can't be used while -diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile -index 21fb5a5662b5..a04ffebc6b7a 100644 ---- a/kernel/sched/Makefile -+++ b/kernel/sched/Makefile -@@ -16,15 +16,23 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) - CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer - endif - -+ifdef CONFIG_SCHED_MUQSS -+obj-y += MuQSS.o clock.o cputime.o -+obj-y += idle.o -+obj-y += wait.o wait_bit.o swait.o completion.o -+ -+obj-$(CONFIG_SMP) += topology.o -+else - obj-y += core.o loadavg.o clock.o cputime.o - obj-y += idle.o fair.o rt.o deadline.o - obj-y += wait.o wait_bit.o swait.o completion.o - - obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o - obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o --obj-$(CONFIG_SCHEDSTATS) += stats.o - obj-$(CONFIG_SCHED_DEBUG) += debug.o - obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o -+endif -+obj-$(CONFIG_SCHEDSTATS) += stats.o - obj-$(CONFIG_CPU_FREQ) += cpufreq.o - obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o - obj-$(CONFIG_MEMBARRIER) += membarrier.o -diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c -new file mode 100644 -index 000000000000..fafb5a790cf1 ---- /dev/null -+++ b/kernel/sched/MuQSS.c -@@ -0,0 +1,7606 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * kernel/sched/MuQSS.c, was kernel/sched.c -+ * -+ * Kernel scheduler and related syscalls -+ * -+ * Copyright (C) 1991-2002 Linus Torvalds -+ * -+ * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and -+ * make semaphores SMP safe -+ * 1998-11-19 Implemented schedule_timeout() and 
related stuff -+ * by Andrea Arcangeli -+ * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: -+ * hybrid priority-list and round-robin design with -+ * an array-switch method of distributing timeslices -+ * and per-CPU runqueues. Cleanups and useful suggestions -+ * by Davide Libenzi, preemptible kernel bits by Robert Love. -+ * 2003-09-03 Interactivity tuning by Con Kolivas. -+ * 2004-04-02 Scheduler domains code by Nick Piggin -+ * 2007-04-15 Work begun on replacing all interactivity tuning with a -+ * fair scheduling design by Con Kolivas. -+ * 2007-05-05 Load balancing (smp-nice) and other improvements -+ * by Peter Williams -+ * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith -+ * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri -+ * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, -+ * Thomas Gleixner, Mike Kravetz -+ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes -+ * a whole lot of those previous things. -+ * 2016-10-01 Multiple Queue Skiplist Scheduler scalable evolution of BFS -+ * scheduler by Con Kolivas. 
-+ * 2019-08-31 LLC bits by Eduards Bezverhijs -+ */ -+ -+#include -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+#include -+ -+#include "../workqueue_internal.h" -+#include "../smpboot.h" -+ -+#define CREATE_TRACE_POINTS -+#include -+ -+#include "MuQSS.h" -+ -+#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) -+#define rt_task(p) rt_prio((p)->prio) -+#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) -+#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \ -+ (policy) == SCHED_RR) -+#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) -+ -+#define is_idle_policy(policy) ((policy) == SCHED_IDLEPRIO) -+#define idleprio_task(p) unlikely(is_idle_policy((p)->policy)) -+#define task_running_idle(p) unlikely((p)->prio == IDLE_PRIO) -+ -+#define is_iso_policy(policy) ((policy) == SCHED_ISO) -+#define iso_task(p) unlikely(is_iso_policy((p)->policy)) -+#define task_running_iso(p) unlikely((p)->prio == ISO_PRIO) -+ -+#define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT) -+ -+#define ISO_PERIOD (5 * HZ) -+ -+#define STOP_PRIO (MAX_RT_PRIO - 1) -+ -+/* -+ * Some helpers for converting to/from various scales. Use shifts to get -+ * approximate multiples of ten for less overhead. 
-+ */ -+#define APPROX_NS_PS (1073741824) /* Approximate ns per second */ -+#define JIFFIES_TO_NS(TIME) ((TIME) * (APPROX_NS_PS / HZ)) -+#define JIFFY_NS (APPROX_NS_PS / HZ) -+#define JIFFY_US (1048576 / HZ) -+#define NS_TO_JIFFIES(TIME) ((TIME) / JIFFY_NS) -+#define HALF_JIFFY_NS (APPROX_NS_PS / HZ / 2) -+#define HALF_JIFFY_US (1048576 / HZ / 2) -+#define MS_TO_NS(TIME) ((TIME) << 20) -+#define MS_TO_US(TIME) ((TIME) << 10) -+#define NS_TO_MS(TIME) ((TIME) >> 20) -+#define NS_TO_US(TIME) ((TIME) >> 10) -+#define US_TO_NS(TIME) ((TIME) << 10) -+#define TICK_APPROX_NS ((APPROX_NS_PS+HZ/2)/HZ) -+ -+#define RESCHED_US (100) /* Reschedule if less than this many μs left */ -+ -+void print_scheduler_version(void) -+{ -+ printk(KERN_INFO "MuQSS CPU scheduler v0.196 by Con Kolivas.\n"); -+} -+ -+/* Define RQ share levels */ -+#define RQSHARE_NONE 0 -+#define RQSHARE_SMT 1 -+#define RQSHARE_MC 2 -+#define RQSHARE_MC_LLC 3 -+#define RQSHARE_SMP 4 -+#define RQSHARE_ALL 5 -+ -+/* Define locality levels */ -+#define LOCALITY_SAME 0 -+#define LOCALITY_SMT 1 -+#define LOCALITY_MC_LLC 2 -+#define LOCALITY_MC 3 -+#define LOCALITY_SMP 4 -+#define LOCALITY_DISTANT 5 -+ -+/* -+ * This determines what level of runqueue sharing will be done and is -+ * configurable at boot time with the bootparam rqshare = -+ */ -+static int rqshare __read_mostly = CONFIG_SHARERQ; /* Default RQSHARE_MC */ -+ -+static int __init set_rqshare(char *str) -+{ -+ if (!strncmp(str, "none", 4)) { -+ rqshare = RQSHARE_NONE; -+ return 0; -+ } -+ if (!strncmp(str, "smt", 3)) { -+ rqshare = RQSHARE_SMT; -+ return 0; -+ } -+ if (!strncmp(str, "mc", 2)) { -+ rqshare = RQSHARE_MC; -+ return 0; -+ } -+ if (!strncmp(str, "llc", 3)) { -+ rqshare = RQSHARE_MC_LLC; -+ return 0; -+ } -+ if (!strncmp(str, "smp", 3)) { -+ rqshare = RQSHARE_SMP; -+ return 0; -+ } -+ if (!strncmp(str, "all", 3)) { -+ rqshare = RQSHARE_ALL; -+ return 0; -+ } -+ return 1; -+} -+__setup("rqshare=", set_rqshare); -+ -+/* -+ * This is the time all 
tasks within the same priority round robin. -+ * Value is in ms and set to a minimum of 6ms. -+ * Tunable via /proc interface. -+ */ -+int rr_interval __read_mostly = 6; -+ -+/* -+ * Tunable to choose whether to prioritise latency or throughput, simple -+ * binary yes or no -+ */ -+int sched_interactive __read_mostly = 1; -+ -+/* -+ * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks -+ * are allowed to run five seconds as real time tasks. This is the total over -+ * all online cpus. -+ */ -+int sched_iso_cpu __read_mostly = 70; -+ -+/* -+ * sched_yield_type - Choose what sort of yield sched_yield will perform. -+ * 0: No yield. -+ * 1: Yield only to better priority/deadline tasks. (default) -+ * 2: Expire timeslice and recalculate deadline. -+ */ -+int sched_yield_type __read_mostly = 1; -+ -+/* -+ * The relative length of deadline for each priority(nice) level. -+ */ -+static int prio_ratios[NICE_WIDTH] __read_mostly; -+ -+ -+/* -+ * The quota handed out to tasks of all priority levels when refilling their -+ * time_slice. -+ */ -+static inline int timeslice(void) -+{ -+ return MS_TO_US(rr_interval); -+} -+ -+DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -+ -+#ifdef CONFIG_SMP -+/* -+ * Total number of runqueues. Equals number of CPUs when there is no runqueue -+ * sharing but is usually less with SMT/MC sharing of runqueues. -+ */ -+static int total_runqueues __read_mostly = 1; -+ -+static cpumask_t cpu_idle_map ____cacheline_aligned_in_smp; -+ -+struct rq *cpu_rq(int cpu) -+{ -+ return &per_cpu(runqueues, (cpu)); -+} -+#define cpu_curr(cpu) (cpu_rq(cpu)->curr) -+ -+/* -+ * For asym packing, by default the lower numbered cpu has higher priority. 
-+ */ -+int __weak arch_asym_cpu_priority(int cpu) -+{ -+ return -cpu; -+} -+ -+int __weak arch_sd_sibling_asym_packing(void) -+{ -+ return 0*SD_ASYM_PACKING; -+} -+ -+#ifdef CONFIG_SCHED_SMT -+DEFINE_STATIC_KEY_FALSE(sched_smt_present); -+EXPORT_SYMBOL_GPL(sched_smt_present); -+#endif -+ -+#else -+struct rq *uprq; -+#endif /* CONFIG_SMP */ -+ -+#include "stats.h" -+ -+/* -+ * All common locking functions performed on rq->lock. rq->clock is local to -+ * the CPU accessing it so it can be modified just with interrupts disabled -+ * when we're not updating niffies. -+ * Looking up task_rq must be done under rq->lock to be safe. -+ */ -+ -+/* -+ * RQ-clock updating methods: -+ */ -+ -+#ifdef HAVE_SCHED_AVG_IRQ -+static void update_irq_load_avg(struct rq *rq, long delta); -+#else -+static inline void update_irq_load_avg(struct rq *rq, long delta) {} -+#endif -+ -+static void update_rq_clock_task(struct rq *rq, s64 delta) -+{ -+/* -+ * In theory, the compile should just see 0 here, and optimize out the call -+ * to sched_rt_avg_update. But I don't trust it... -+ */ -+ s64 __maybe_unused steal = 0, irq_delta = 0; -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; -+ -+ /* -+ * Since irq_time is only updated on {soft,}irq_exit, we might run into -+ * this case when a previous update_rq_clock() happened inside a -+ * {soft,}irq region. -+ * -+ * When this happens, we stop ->clock_task and only update the -+ * prev_irq_time stamp to account for the part that fit, so that a next -+ * update will consume the rest. This ensures ->clock_task is -+ * monotonic. -+ * -+ * It does however cause some slight miss-attribution of {soft,}irq -+ * time, a more accurate solution would be to update the irq_time using -+ * the current rq->clock timestamp, except that would require using -+ * atomic ops. 
-+ */ -+ if (irq_delta > delta) -+ irq_delta = delta; -+ -+ rq->prev_irq_time += irq_delta; -+ delta -= irq_delta; -+#endif -+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING -+ if (static_key_false((¶virt_steal_rq_enabled))) { -+ steal = paravirt_steal_clock(cpu_of(rq)); -+ steal -= rq->prev_steal_time_rq; -+ -+ if (unlikely(steal > delta)) -+ steal = delta; -+ -+ rq->prev_steal_time_rq += steal; -+ delta -= steal; -+ } -+#endif -+ rq->clock_task += delta; -+ -+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ -+ if (irq_delta + steal) -+ update_irq_load_avg(rq, irq_delta + steal); -+#endif -+} -+ -+static inline void update_rq_clock(struct rq *rq) -+{ -+ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; -+ -+ if (unlikely(delta < 0)) -+ return; -+ rq->clock += delta; -+ update_rq_clock_task(rq, delta); -+} -+ -+/* -+ * Niffies are a globally increasing nanosecond counter. They're only used by -+ * update_load_avg and time_slice_expired, however deadlines are based on them -+ * across CPUs. Update them whenever we will call one of those functions, and -+ * synchronise them across CPUs whenever we hold both runqueue locks. -+ */ -+static inline void update_clocks(struct rq *rq) -+{ -+ s64 ndiff, minndiff; -+ long jdiff; -+ -+ update_rq_clock(rq); -+ ndiff = rq->clock - rq->old_clock; -+ rq->old_clock = rq->clock; -+ jdiff = jiffies - rq->last_jiffy; -+ -+ /* Subtract any niffies added by balancing with other rqs */ -+ ndiff -= rq->niffies - rq->last_niffy; -+ minndiff = JIFFIES_TO_NS(jdiff) - rq->niffies + rq->last_jiffy_niffies; -+ if (minndiff < 0) -+ minndiff = 0; -+ ndiff = max(ndiff, minndiff); -+ rq->niffies += ndiff; -+ rq->last_niffy = rq->niffies; -+ if (jdiff) { -+ rq->last_jiffy += jdiff; -+ rq->last_jiffy_niffies = rq->niffies; -+ } -+} -+ -+/* -+ * Any time we have two runqueues locked we use that as an opportunity to -+ * synchronise niffies to the highest value as idle ticks may have artificially -+ * kept niffies low on one CPU and the truth can only be later. 
-+ */ -+static inline void synchronise_niffies(struct rq *rq1, struct rq *rq2) -+{ -+ if (rq1->niffies > rq2->niffies) -+ rq2->niffies = rq1->niffies; -+ else -+ rq1->niffies = rq2->niffies; -+} -+ -+/* -+ * double_rq_lock - safely lock two runqueues -+ * -+ * Note this does not disable interrupts like task_rq_lock, -+ * you need to do so manually before calling. -+ */ -+ -+/* For when we know rq1 != rq2 */ -+static inline void __double_rq_lock(struct rq *rq1, struct rq *rq2) -+ __acquires(rq1->lock) -+ __acquires(rq2->lock) -+{ -+ if (rq1 < rq2) { -+ raw_spin_lock(rq1->lock); -+ raw_spin_lock_nested(rq2->lock, SINGLE_DEPTH_NESTING); -+ } else { -+ raw_spin_lock(rq2->lock); -+ raw_spin_lock_nested(rq1->lock, SINGLE_DEPTH_NESTING); -+ } -+} -+ -+static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) -+ __acquires(rq1->lock) -+ __acquires(rq2->lock) -+{ -+ BUG_ON(!irqs_disabled()); -+ if (rq1->lock == rq2->lock) { -+ raw_spin_lock(rq1->lock); -+ __acquire(rq2->lock); /* Fake it out ;) */ -+ } else -+ __double_rq_lock(rq1, rq2); -+ synchronise_niffies(rq1, rq2); -+} -+ -+/* -+ * double_rq_unlock - safely unlock two runqueues -+ * -+ * Note this does not restore interrupts like task_rq_unlock, -+ * you need to do so manually after calling. 
-+ */ -+static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) -+ __releases(rq1->lock) -+ __releases(rq2->lock) -+{ -+ raw_spin_unlock(rq1->lock); -+ if (rq1->lock != rq2->lock) -+ raw_spin_unlock(rq2->lock); -+ else -+ __release(rq2->lock); -+} -+ -+static inline void lock_all_rqs(void) -+{ -+ int cpu; -+ -+ preempt_disable(); -+ for_each_possible_cpu(cpu) { -+ struct rq *rq = cpu_rq(cpu); -+ -+ do_raw_spin_lock(rq->lock); -+ } -+} -+ -+static inline void unlock_all_rqs(void) -+{ -+ int cpu; -+ -+ for_each_possible_cpu(cpu) { -+ struct rq *rq = cpu_rq(cpu); -+ -+ do_raw_spin_unlock(rq->lock); -+ } -+ preempt_enable(); -+} -+ -+/* Specially nest trylock an rq */ -+static inline bool trylock_rq(struct rq *this_rq, struct rq *rq) -+{ -+ if (unlikely(!do_raw_spin_trylock(rq->lock))) -+ return false; -+ spin_acquire(&rq->lock->dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); -+ synchronise_niffies(this_rq, rq); -+ return true; -+} -+ -+/* Unlock a specially nested trylocked rq */ -+static inline void unlock_rq(struct rq *rq) -+{ -+ spin_release(&rq->lock->dep_map, 1, _RET_IP_); -+ do_raw_spin_unlock(rq->lock); -+} -+ -+/* -+ * cmpxchg based fetch_or, macro so it works for different integer types -+ */ -+#define fetch_or(ptr, mask) \ -+ ({ \ -+ typeof(ptr) _ptr = (ptr); \ -+ typeof(mask) _mask = (mask); \ -+ typeof(*_ptr) _old, _val = *_ptr; \ -+ \ -+ for (;;) { \ -+ _old = cmpxchg(_ptr, _val, _val | _mask); \ -+ if (_old == _val) \ -+ break; \ -+ _val = _old; \ -+ } \ -+ _old; \ -+}) -+ -+#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) -+/* -+ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, -+ * this avoids any races wrt polling state changes and thereby avoids -+ * spurious IPIs. 
-+ */ -+static bool set_nr_and_not_polling(struct task_struct *p) -+{ -+ struct thread_info *ti = task_thread_info(p); -+ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); -+} -+ -+/* -+ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. -+ * -+ * If this returns true, then the idle task promises to call -+ * sched_ttwu_pending() and reschedule soon. -+ */ -+static bool set_nr_if_polling(struct task_struct *p) -+{ -+ struct thread_info *ti = task_thread_info(p); -+ typeof(ti->flags) old, val = READ_ONCE(ti->flags); -+ -+ for (;;) { -+ if (!(val & _TIF_POLLING_NRFLAG)) -+ return false; -+ if (val & _TIF_NEED_RESCHED) -+ return true; -+ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); -+ if (old == val) -+ break; -+ val = old; -+ } -+ return true; -+} -+ -+#else -+static bool set_nr_and_not_polling(struct task_struct *p) -+{ -+ set_tsk_need_resched(p); -+ return true; -+} -+ -+#ifdef CONFIG_SMP -+static bool set_nr_if_polling(struct task_struct *p) -+{ -+ return false; -+} -+#endif -+#endif -+ -+static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) -+{ -+ struct wake_q_node *node = &task->wake_q; -+ -+ /* -+ * Atomically grab the task, if ->wake_q is !nil already it means -+ * its already queued (either by us or someone else) and will get the -+ * wakeup due to that. -+ * -+ * In order to ensure that a pending wakeup will observe our pending -+ * state, even in the failed case, an explicit smp_mb() must be used. -+ */ -+ smp_mb__before_atomic(); -+ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) -+ return false; -+ -+ /* -+ * The head is context local, there can be no concurrency. -+ */ -+ *head->lastp = node; -+ head->lastp = &node->next; -+ return true; -+} -+ -+/** -+ * wake_q_add() - queue a wakeup for 'later' waking. 
-+ * @head: the wake_q_head to add @task to -+ * @task: the task to queue for 'later' wakeup -+ * -+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the -+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come -+ * instantly. -+ * -+ * This function must be used as-if it were wake_up_process(); IOW the task -+ * must be ready to be woken at this location. -+ */ -+void wake_q_add(struct wake_q_head *head, struct task_struct *task) -+{ -+ if (__wake_q_add(head, task)) -+ get_task_struct(task); -+} -+ -+/** -+ * wake_q_add_safe() - safely queue a wakeup for 'later' waking. -+ * @head: the wake_q_head to add @task to -+ * @task: the task to queue for 'later' wakeup -+ * -+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the -+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come -+ * instantly. -+ * -+ * This function must be used as-if it were wake_up_process(); IOW the task -+ * must be ready to be woken at this location. -+ * -+ * This function is essentially a task-safe equivalent to wake_q_add(). Callers -+ * that already hold reference to @task can call the 'safe' version and trust -+ * wake_q to do the right thing depending whether or not the @task is already -+ * queued for wakeup. -+ */ -+void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) -+{ -+ if (!__wake_q_add(head, task)) -+ put_task_struct(task); -+} -+ -+void wake_up_q(struct wake_q_head *head) -+{ -+ struct wake_q_node *node = head->first; -+ -+ while (node != WAKE_Q_TAIL) { -+ struct task_struct *task; -+ -+ task = container_of(node, struct task_struct, wake_q); -+ BUG_ON(!task); -+ /* Task can safely be re-inserted now */ -+ node = node->next; -+ task->wake_q.next = NULL; -+ -+ /* -+ * wake_up_process() executes a full barrier, which pairs with -+ * the queueing in wake_q_add() so as not to miss wakeups. 
-+ */ -+ wake_up_process(task); -+ put_task_struct(task); -+ } -+} -+ -+static inline void smp_sched_reschedule(int cpu) -+{ -+ if (likely(cpu_online(cpu))) -+ smp_send_reschedule(cpu); -+} -+ -+/* -+ * resched_task - mark a task 'to be rescheduled now'. -+ * -+ * On UP this means the setting of the need_resched flag, on SMP it -+ * might also involve a cross-CPU call to trigger the scheduler on -+ * the target CPU. -+ */ -+void resched_task(struct task_struct *p) -+{ -+ int cpu; -+#ifdef CONFIG_LOCKDEP -+ /* Kernel threads call this when creating workqueues while still -+ * inactive from __kthread_bind_mask, holding only the pi_lock */ -+ if (!(p->flags & PF_KTHREAD)) { -+ struct rq *rq = task_rq(p); -+ -+ lockdep_assert_held(rq->lock); -+ } -+#endif -+ if (test_tsk_need_resched(p)) -+ return; -+ -+ cpu = task_cpu(p); -+ if (cpu == smp_processor_id()) { -+ set_tsk_need_resched(p); -+ set_preempt_need_resched(); -+ return; -+ } -+ -+ if (set_nr_and_not_polling(p)) -+ smp_sched_reschedule(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); -+} -+ -+/* -+ * A task that is not running or queued will not have a node set. -+ * A task that is queued but not running will have a node set. -+ * A task that is currently running will have ->on_cpu set but no node set. -+ */ -+static inline bool task_queued(struct task_struct *p) -+{ -+ return !skiplist_node_empty(&p->node); -+} -+ -+static void enqueue_task(struct rq *rq, struct task_struct *p, int flags); -+static inline void resched_if_idle(struct rq *rq); -+ -+/* Dodgy workaround till we figure out where the softirqs are going */ -+static inline void do_pending_softirq(struct rq *rq, struct task_struct *next) -+{ -+ if (unlikely(next == rq->idle && local_softirq_pending() && !in_interrupt())) -+ do_softirq_own_stack(); -+} -+ -+static inline bool deadline_before(u64 deadline, u64 time) -+{ -+ return (deadline < time); -+} -+ -+/* -+ * Deadline is "now" in niffies + (offset by priority). 
Setting the deadline -+ * is the key to everything. It distributes cpu fairly amongst tasks of the -+ * same nice value, it proportions cpu according to nice level, it means the -+ * task that last woke up the longest ago has the earliest deadline, thus -+ * ensuring that interactive tasks get low latency on wake up. The CPU -+ * proportion works out to the square of the virtual deadline difference, so -+ * this equation will give nice 19 3% CPU compared to nice 0. -+ */ -+static inline u64 prio_deadline_diff(int user_prio) -+{ -+ return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128)); -+} -+ -+static inline u64 task_deadline_diff(struct task_struct *p) -+{ -+ return prio_deadline_diff(TASK_USER_PRIO(p)); -+} -+ -+static inline u64 static_deadline_diff(int static_prio) -+{ -+ return prio_deadline_diff(USER_PRIO(static_prio)); -+} -+ -+static inline int longest_deadline_diff(void) -+{ -+ return prio_deadline_diff(39); -+} -+ -+static inline int ms_longest_deadline_diff(void) -+{ -+ return NS_TO_MS(longest_deadline_diff()); -+} -+ -+static inline bool rq_local(struct rq *rq); -+ -+#ifndef SCHED_CAPACITY_SCALE -+#define SCHED_CAPACITY_SCALE 1024 -+#endif -+ -+static inline int rq_load(struct rq *rq) -+{ -+ return rq->nr_running; -+} -+ -+/* -+ * Update the load average for feeding into cpu frequency governors. Use a -+ * rough estimate of a rolling average with ~ time constant of 32ms. -+ * 80/128 ~ 0.63. * 80 / 32768 / 128 == * 5 / 262144 -+ * Make sure a call to update_clocks has been made before calling this to get -+ * an updated rq->niffies. 
-+ */ -+static void update_load_avg(struct rq *rq, unsigned int flags) -+{ -+ long us_interval, load; -+ unsigned long curload; -+ -+ us_interval = NS_TO_US(rq->niffies - rq->load_update); -+ if (unlikely(us_interval <= 0)) -+ return; -+ -+ curload = rq_load(rq); -+ load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144); -+ if (unlikely(load < 0)) -+ load = 0; -+ load += curload * curload * SCHED_CAPACITY_SCALE * us_interval * 5 / 262144; -+ rq->load_avg = load; -+ -+ rq->load_update = rq->niffies; -+ update_irq_load_avg(rq, 0); -+ if (likely(rq_local(rq))) -+ cpufreq_trigger(rq, flags); -+} -+ -+#ifdef HAVE_SCHED_AVG_IRQ -+/* -+ * IRQ variant of update_load_avg below. delta is actually time in nanoseconds -+ * here so we scale curload to how long it's been since the last update. -+ */ -+static void update_irq_load_avg(struct rq *rq, long delta) -+{ -+ long us_interval, load; -+ unsigned long curload; -+ -+ us_interval = NS_TO_US(rq->niffies - rq->irq_load_update); -+ if (unlikely(us_interval <= 0)) -+ return; -+ -+ curload = NS_TO_US(delta) / us_interval; -+ load = rq->irq_load_avg - (rq->irq_load_avg * us_interval * 5 / 262144); -+ if (unlikely(load < 0)) -+ load = 0; -+ load += curload * curload * SCHED_CAPACITY_SCALE * us_interval * 5 / 262144; -+ rq->irq_load_avg = load; -+ -+ rq->irq_load_update = rq->niffies; -+} -+#endif -+ -+/* -+ * Removing from the runqueue. Enter with rq locked. Deleting a task -+ * from the skip list is done via the stored node reference in the task struct -+ * and does not require a full look up. Thus it occurs in O(k) time where k -+ * is the "level" of the list the task was stored at - usually < 4, max 8. 
-+ */ -+static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) -+{ -+ skiplist_delete(rq->sl, &p->node); -+ rq->best_key = rq->node->next[0]->key; -+ update_clocks(rq); -+ -+ if (!(flags & DEQUEUE_SAVE)) { -+ sched_info_dequeued(rq, p); -+ psi_dequeue(p, flags & DEQUEUE_SLEEP); -+ } -+ rq->nr_running--; -+ if (rt_task(p)) -+ rq->rt_nr_running--; -+ update_load_avg(rq, flags); -+} -+ -+#ifdef CONFIG_PREEMPT_RCU -+static bool rcu_read_critical(struct task_struct *p) -+{ -+ return p->rcu_read_unlock_special.b.blocked; -+} -+#else /* CONFIG_PREEMPT_RCU */ -+#define rcu_read_critical(p) (false) -+#endif /* CONFIG_PREEMPT_RCU */ -+ -+/* -+ * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as -+ * an idle task, we ensure none of the following conditions are met. -+ */ -+static bool idleprio_suitable(struct task_struct *p) -+{ -+ return (!(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING)) && -+ !signal_pending(p) && !rcu_read_critical(p) && !freezing(p)); -+} -+ -+/* -+ * To determine if a task of SCHED_ISO can run in pseudo-realtime, we check -+ * that the iso_refractory flag is not set. -+ */ -+static inline bool isoprio_suitable(struct rq *rq) -+{ -+ return !rq->iso_refractory; -+} -+ -+/* -+ * Adding to the runqueue. Enter with rq locked. -+ */ -+static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) -+{ -+ unsigned int randseed, cflags = 0; -+ u64 sl_id; -+ -+ if (!rt_task(p)) { -+ /* Check it hasn't gotten rt from PI */ -+ if ((idleprio_task(p) && idleprio_suitable(p)) || -+ (iso_task(p) && isoprio_suitable(rq))) -+ p->prio = p->normal_prio; -+ else -+ p->prio = NORMAL_PRIO; -+ } else -+ rq->rt_nr_running++; -+ /* -+ * The sl_id key passed to the skiplist generates a sorted list. -+ * Realtime and sched iso tasks run FIFO so they only need be sorted -+ * according to priority. The skiplist will put tasks of the same -+ * key inserted later in FIFO order. 
Tasks of sched normal, batch -+ * and idleprio are sorted according to their deadlines. Idleprio -+ * tasks are offset by an impossibly large deadline value ensuring -+ * they get sorted into last positions, but still according to their -+ * own deadlines. This creates a "landscape" of skiplists running -+ * from priority 0 realtime in first place to the lowest priority -+ * idleprio tasks last. Skiplist insertion is an O(log n) process. -+ */ -+ if (p->prio <= ISO_PRIO) { -+ sl_id = p->prio; -+ } else { -+ sl_id = p->deadline; -+ if (idleprio_task(p)) { -+ if (p->prio == IDLE_PRIO) -+ sl_id |= 0xF000000000000000; -+ else -+ sl_id += longest_deadline_diff(); -+ } -+ } -+ /* -+ * Some architectures don't have better than microsecond resolution -+ * so mask out ~microseconds as the random seed for skiplist insertion. -+ */ -+ update_clocks(rq); -+ if (!(flags & ENQUEUE_RESTORE)) { -+ sched_info_queued(rq, p); -+ psi_enqueue(p, flags & ENQUEUE_WAKEUP); -+ } -+ -+ randseed = (rq->niffies >> 10) & 0xFFFFFFFF; -+ skiplist_insert(rq->sl, &p->node, sl_id, p, randseed); -+ rq->best_key = rq->node->next[0]->key; -+ if (p->in_iowait) -+ cflags |= SCHED_CPUFREQ_IOWAIT; -+ rq->nr_running++; -+ update_load_avg(rq, cflags); -+} -+ -+/* -+ * Returns the relative length of deadline all compared to the shortest -+ * deadline which is that of nice -20. -+ */ -+static inline int task_prio_ratio(struct task_struct *p) -+{ -+ return prio_ratios[TASK_USER_PRIO(p)]; -+} -+ -+/* -+ * task_timeslice - all tasks of all priorities get the exact same timeslice -+ * length. CPU distribution is handled by giving different deadlines to -+ * tasks of different priorities. Use 128 as the base value for fast shifts. 
-+ */ -+static inline int task_timeslice(struct task_struct *p) -+{ -+ return (rr_interval * task_prio_ratio(p) / 128); -+} -+ -+#ifdef CONFIG_SMP -+/* Entered with rq locked */ -+static inline void resched_if_idle(struct rq *rq) -+{ -+ if (rq_idle(rq)) -+ resched_task(rq->curr); -+} -+ -+static inline bool rq_local(struct rq *rq) -+{ -+ return (rq->cpu == smp_processor_id()); -+} -+#ifdef CONFIG_SMT_NICE -+static const cpumask_t *thread_cpumask(int cpu); -+ -+/* Find the best real time priority running on any SMT siblings of cpu and if -+ * none are running, the static priority of the best deadline task running. -+ * The lookups to the other runqueues is done lockless as the occasional wrong -+ * value would be harmless. */ -+static int best_smt_bias(struct rq *this_rq) -+{ -+ int other_cpu, best_bias = 0; -+ -+ for_each_cpu(other_cpu, &this_rq->thread_mask) { -+ struct rq *rq = cpu_rq(other_cpu); -+ -+ if (rq_idle(rq)) -+ continue; -+ if (unlikely(!rq->online)) -+ continue; -+ if (!rq->rq_mm) -+ continue; -+ if (likely(rq->rq_smt_bias > best_bias)) -+ best_bias = rq->rq_smt_bias; -+ } -+ return best_bias; -+} -+ -+static int task_prio_bias(struct task_struct *p) -+{ -+ if (rt_task(p)) -+ return 1 << 30; -+ else if (task_running_iso(p)) -+ return 1 << 29; -+ else if (task_running_idle(p)) -+ return 0; -+ return MAX_PRIO - p->static_prio; -+} -+ -+static bool smt_always_schedule(struct task_struct __maybe_unused *p, struct rq __maybe_unused *this_rq) -+{ -+ return true; -+} -+ -+static bool (*smt_schedule)(struct task_struct *p, struct rq *this_rq) = &smt_always_schedule; -+ -+/* We've already decided p can run on CPU, now test if it shouldn't for SMT -+ * nice reasons. 
*/ -+static bool smt_should_schedule(struct task_struct *p, struct rq *this_rq) -+{ -+ int best_bias, task_bias; -+ -+ /* Kernel threads always run */ -+ if (unlikely(!p->mm)) -+ return true; -+ if (rt_task(p)) -+ return true; -+ if (!idleprio_suitable(p)) -+ return true; -+ best_bias = best_smt_bias(this_rq); -+ /* The smt siblings are all idle or running IDLEPRIO */ -+ if (best_bias < 1) -+ return true; -+ task_bias = task_prio_bias(p); -+ if (task_bias < 1) -+ return false; -+ if (task_bias >= best_bias) -+ return true; -+ /* Dither 25% cpu of normal tasks regardless of nice difference */ -+ if (best_bias % 4 == 1) -+ return true; -+ /* Sorry, you lose */ -+ return false; -+} -+#else /* CONFIG_SMT_NICE */ -+#define smt_schedule(p, this_rq) (true) -+#endif /* CONFIG_SMT_NICE */ -+ -+static inline void atomic_set_cpu(int cpu, cpumask_t *cpumask) -+{ -+ set_bit(cpu, (volatile unsigned long *)cpumask); -+} -+ -+/* -+ * The cpu_idle_map stores a bitmap of all the CPUs currently idle to -+ * allow easy lookup of whether any suitable idle CPUs are available. -+ * It's cheaper to maintain a binary yes/no if there are any idle CPUs on the -+ * idle_cpus variable than to do a full bitmask check when we are busy. The -+ * bits are set atomically but read locklessly as occasional false positive / -+ * negative is harmless. -+ */ -+static inline void set_cpuidle_map(int cpu) -+{ -+ if (likely(cpu_online(cpu))) -+ atomic_set_cpu(cpu, &cpu_idle_map); -+} -+ -+static inline void atomic_clear_cpu(int cpu, cpumask_t *cpumask) -+{ -+ clear_bit(cpu, (volatile unsigned long *)cpumask); -+} -+ -+static inline void clear_cpuidle_map(int cpu) -+{ -+ atomic_clear_cpu(cpu, &cpu_idle_map); -+} -+ -+static bool suitable_idle_cpus(struct task_struct *p) -+{ -+ return (cpumask_intersects(p->cpus_ptr, &cpu_idle_map)); -+} -+ -+/* -+ * Resched current on rq. 
We don't know if rq is local to this CPU nor if it -+ * is locked so we do not use an intermediate variable for the task to avoid -+ * having it dereferenced. -+ */ -+static void resched_curr(struct rq *rq) -+{ -+ int cpu; -+ -+ if (test_tsk_need_resched(rq->curr)) -+ return; -+ -+ rq->preempt = rq->curr; -+ cpu = rq->cpu; -+ -+ /* We're doing this without holding the rq lock if it's not task_rq */ -+ -+ if (cpu == smp_processor_id()) { -+ set_tsk_need_resched(rq->curr); -+ set_preempt_need_resched(); -+ return; -+ } -+ -+ if (set_nr_and_not_polling(rq->curr)) -+ smp_sched_reschedule(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); -+} -+ -+#define CPUIDLE_DIFF_THREAD (1) -+#define CPUIDLE_DIFF_CORE_LLC (2) -+#define CPUIDLE_DIFF_CORE (4) -+#define CPUIDLE_CACHE_BUSY (8) -+#define CPUIDLE_DIFF_CPU (16) -+#define CPUIDLE_THREAD_BUSY (32) -+#define CPUIDLE_DIFF_NODE (64) -+ -+/* -+ * The best idle CPU is chosen according to the CPUIDLE ranking above where the -+ * lowest value would give the most suitable CPU to schedule p onto next. The -+ * order works out to be the following: -+ * -+ * Same thread, idle or busy cache, idle or busy threads -+ * Other core, same cache, idle or busy cache, idle threads. -+ * Same node, other CPU, idle cache, idle threads. -+ * Same node, other CPU, busy cache, idle threads. -+ * Other core, same cache, busy threads. -+ * Same node, other CPU, busy threads. -+ * Other node, other CPU, idle cache, idle threads. -+ * Other node, other CPU, busy cache, idle threads. -+ * Other node, other CPU, busy threads. 
-+ */ -+static int best_mask_cpu(int best_cpu, struct rq *rq, cpumask_t *tmpmask) -+{ -+ int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THREAD_BUSY | -+ CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY | CPUIDLE_DIFF_CORE | -+ CPUIDLE_DIFF_CORE_LLC | CPUIDLE_DIFF_THREAD; -+ int cpu_tmp; -+ -+ if (cpumask_test_cpu(best_cpu, tmpmask)) -+ goto out; -+ -+ for_each_cpu(cpu_tmp, tmpmask) { -+ int ranking, locality; -+ struct rq *tmp_rq; -+ -+ ranking = 0; -+ tmp_rq = cpu_rq(cpu_tmp); -+ -+ locality = rq->cpu_locality[cpu_tmp]; -+#ifdef CONFIG_NUMA -+ if (locality > LOCALITY_SMP) -+ ranking |= CPUIDLE_DIFF_NODE; -+ else -+#endif -+ if (locality > LOCALITY_MC) -+ ranking |= CPUIDLE_DIFF_CPU; -+#ifdef CONFIG_SCHED_MC -+ else if (locality == LOCALITY_MC_LLC) -+ ranking |= CPUIDLE_DIFF_CORE_LLC; -+ else if (locality == LOCALITY_MC) -+ ranking |= CPUIDLE_DIFF_CORE; -+ if (!(tmp_rq->cache_idle(tmp_rq))) -+ ranking |= CPUIDLE_CACHE_BUSY; -+#endif -+#ifdef CONFIG_SCHED_SMT -+ if (locality == LOCALITY_SMT) -+ ranking |= CPUIDLE_DIFF_THREAD; -+#endif -+ if (ranking < best_ranking -+#ifdef CONFIG_SCHED_SMT -+ || (ranking == best_ranking && (tmp_rq->siblings_idle(tmp_rq))) -+#endif -+ ) { -+ best_cpu = cpu_tmp; -+ best_ranking = ranking; -+ } -+ } -+out: -+ return best_cpu; -+} -+ -+bool cpus_share_cache(int this_cpu, int that_cpu) -+{ -+ struct rq *this_rq = cpu_rq(this_cpu); -+ -+ return (this_rq->cpu_locality[that_cpu] < LOCALITY_SMP); -+} -+ -+/* As per resched_curr but only will resched idle task */ -+static inline void resched_idle(struct rq *rq) -+{ -+ if (test_tsk_need_resched(rq->idle)) -+ return; -+ -+ rq->preempt = rq->idle; -+ -+ set_tsk_need_resched(rq->idle); -+ -+ if (rq_local(rq)) { -+ set_preempt_need_resched(); -+ return; -+ } -+ -+ smp_sched_reschedule(rq->cpu); -+} -+ -+static struct rq *resched_best_idle(struct task_struct *p, int cpu) -+{ -+ cpumask_t tmpmask; -+ struct rq *rq; -+ int best_cpu; -+ -+ cpumask_and(&tmpmask, p->cpus_ptr, &cpu_idle_map); -+ best_cpu = 
best_mask_cpu(cpu, task_rq(p), &tmpmask); -+ rq = cpu_rq(best_cpu); -+ if (!smt_schedule(p, rq)) -+ return NULL; -+ rq->preempt = p; -+ resched_idle(rq); -+ return rq; -+} -+ -+static inline void resched_suitable_idle(struct task_struct *p) -+{ -+ if (suitable_idle_cpus(p)) -+ resched_best_idle(p, task_cpu(p)); -+} -+ -+static inline struct rq *rq_order(struct rq *rq, int cpu) -+{ -+ return rq->rq_order[cpu]; -+} -+#else /* CONFIG_SMP */ -+static inline void set_cpuidle_map(int cpu) -+{ -+} -+ -+static inline void clear_cpuidle_map(int cpu) -+{ -+} -+ -+static inline bool suitable_idle_cpus(struct task_struct *p) -+{ -+ return uprq->curr == uprq->idle; -+} -+ -+static inline void resched_suitable_idle(struct task_struct *p) -+{ -+} -+ -+static inline void resched_curr(struct rq *rq) -+{ -+ resched_task(rq->curr); -+} -+ -+static inline void resched_if_idle(struct rq *rq) -+{ -+} -+ -+static inline bool rq_local(struct rq *rq) -+{ -+ return true; -+} -+ -+static inline struct rq *rq_order(struct rq *rq, int cpu) -+{ -+ return rq; -+} -+ -+static inline bool smt_schedule(struct task_struct *p, struct rq *rq) -+{ -+ return true; -+} -+#endif /* CONFIG_SMP */ -+ -+static inline int normal_prio(struct task_struct *p) -+{ -+ if (has_rt_policy(p)) -+ return MAX_RT_PRIO - 1 - p->rt_priority; -+ if (idleprio_task(p)) -+ return IDLE_PRIO; -+ if (iso_task(p)) -+ return ISO_PRIO; -+ return NORMAL_PRIO; -+} -+ -+/* -+ * Calculate the current priority, i.e. the priority -+ * taken into account by the scheduler. This value might -+ * be boosted by RT tasks as it will be RT if the task got -+ * RT-boosted. If not then it returns p->normal_prio. -+ */ -+static int effective_prio(struct task_struct *p) -+{ -+ p->normal_prio = normal_prio(p); -+ /* -+ * If we are RT tasks or we were boosted to RT priority, -+ * keep the priority unchanged. 
Otherwise, update priority -+ * to the normal priority: -+ */ -+ if (!rt_prio(p->prio)) -+ return p->normal_prio; -+ return p->prio; -+} -+ -+/* -+ * activate_task - move a task to the runqueue. Enter with rq locked. -+ */ -+static void activate_task(struct rq *rq, struct task_struct *p, int flags) -+{ -+ resched_if_idle(rq); -+ -+ /* -+ * Sleep time is in units of nanosecs, so shift by 20 to get a -+ * milliseconds-range estimation of the amount of time that the task -+ * spent sleeping: -+ */ -+ if (unlikely(prof_on == SLEEP_PROFILING)) { -+ if (p->state == TASK_UNINTERRUPTIBLE) -+ profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), -+ (rq->niffies - p->last_ran) >> 20); -+ } -+ -+ p->prio = effective_prio(p); -+ if (task_contributes_to_load(p)) -+ rq->nr_uninterruptible--; -+ -+ enqueue_task(rq, p, flags); -+ p->on_rq = TASK_ON_RQ_QUEUED; -+} -+ -+/* -+ * deactivate_task - If it's running, it's not on the runqueue and we can just -+ * decrement the nr_running. Enter with rq locked. -+ */ -+static inline void deactivate_task(struct task_struct *p, struct rq *rq) -+{ -+ if (task_contributes_to_load(p)) -+ rq->nr_uninterruptible++; -+ -+ p->on_rq = 0; -+ sched_info_dequeued(rq, p); -+ /* deactivate_task is always DEQUEUE_SLEEP in muqss */ -+ psi_dequeue(p, DEQUEUE_SLEEP); -+} -+ -+#ifdef CONFIG_SMP -+void set_task_cpu(struct task_struct *p, unsigned int new_cpu) -+{ -+ struct rq *rq; -+ -+ if (task_cpu(p) == new_cpu) -+ return; -+ -+ /* Do NOT call set_task_cpu on a currently queued task as we will not -+ * be reliably holding the rq lock after changing CPU. */ -+ BUG_ON(task_queued(p)); -+ rq = task_rq(p); -+ -+#ifdef CONFIG_LOCKDEP -+ /* -+ * The caller should hold either p->pi_lock or rq->lock, when changing -+ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. -+ * -+ * Furthermore, all task_rq users should acquire both locks, see -+ * task_rq_lock(). 
-+ */ -+ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || -+ lockdep_is_held(rq->lock))); -+#endif -+ -+ trace_sched_migrate_task(p, new_cpu); -+ rseq_migrate(p); -+ perf_event_task_migrate(p); -+ -+ /* -+ * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be -+ * successfully executed on another CPU. We must ensure that updates of -+ * per-task data have been completed by this moment. -+ */ -+ smp_wmb(); -+ -+ p->wake_cpu = new_cpu; -+ -+ if (task_running(rq, p)) { -+ /* -+ * We should only be calling this on a running task if we're -+ * holding rq lock. -+ */ -+ lockdep_assert_held(rq->lock); -+ -+ /* -+ * We can't change the task_thread_info CPU on a running task -+ * as p will still be protected by the rq lock of the CPU it -+ * is still running on so we only set the wake_cpu for it to be -+ * lazily updated once off the CPU. -+ */ -+ return; -+ } -+ -+#ifdef CONFIG_THREAD_INFO_IN_TASK -+ WRITE_ONCE(p->cpu, new_cpu); -+#else -+ WRITE_ONCE(task_thread_info(p)->cpu, new_cpu); -+#endif -+ /* We're no longer protecting p after this point since we're holding -+ * the wrong runqueue lock. */ -+} -+#endif /* CONFIG_SMP */ -+ -+/* -+ * Move a task off the runqueue and take it to a cpu for it will -+ * become the running task. -+ */ -+static inline void take_task(struct rq *rq, int cpu, struct task_struct *p) -+{ -+ struct rq *p_rq = task_rq(p); -+ -+ dequeue_task(p_rq, p, DEQUEUE_SAVE); -+ if (p_rq != rq) { -+ sched_info_dequeued(p_rq, p); -+ sched_info_queued(rq, p); -+ } -+ set_task_cpu(p, cpu); -+} -+ -+/* -+ * Returns a descheduling task to the runqueue unless it is being -+ * deactivated. -+ */ -+static inline void return_task(struct task_struct *p, struct rq *rq, -+ int cpu, bool deactivate) -+{ -+ if (deactivate) -+ deactivate_task(p, rq); -+ else { -+#ifdef CONFIG_SMP -+ /* -+ * set_task_cpu was called on the running task that doesn't -+ * want to deactivate so it has to be enqueued to a different -+ * CPU and we need its lock. 
Tag it to be moved with as the -+ * lock is dropped in finish_lock_switch. -+ */ -+ if (unlikely(p->wake_cpu != cpu)) -+ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); -+ else -+#endif -+ enqueue_task(rq, p, ENQUEUE_RESTORE); -+ } -+} -+ -+/* Enter with rq lock held. We know p is on the local cpu */ -+static inline void __set_tsk_resched(struct task_struct *p) -+{ -+ set_tsk_need_resched(p); -+ set_preempt_need_resched(); -+} -+ -+/** -+ * task_curr - is this task currently executing on a CPU? -+ * @p: the task in question. -+ * -+ * Return: 1 if the task is currently executing. 0 otherwise. -+ */ -+inline int task_curr(const struct task_struct *p) -+{ -+ return cpu_curr(task_cpu(p)) == p; -+} -+ -+#ifdef CONFIG_SMP -+/* -+ * wait_task_inactive - wait for a thread to unschedule. -+ * -+ * If @match_state is nonzero, it's the @p->state value just checked and -+ * not expected to change. If it changes, i.e. @p might have woken up, -+ * then return zero. When we succeed in waiting for @p to be off its CPU, -+ * we return a positive number (its total switch count). If a second call -+ * a short while later returns the same number, the caller can be sure that -+ * @p has remained unscheduled the whole time. -+ * -+ * The caller must ensure that the task *will* unschedule sometime soon, -+ * else this function might spin for a *long* time. This function can't -+ * be called with interrupts off, or it may introduce deadlock with -+ * smp_call_function() if an IPI is sent by the same process we are -+ * waiting to become inactive. -+ */ -+unsigned long wait_task_inactive(struct task_struct *p, long match_state) -+{ -+ int running, queued; -+ struct rq_flags rf; -+ unsigned long ncsw; -+ struct rq *rq; -+ -+ for (;;) { -+ rq = task_rq(p); -+ -+ /* -+ * If the task is actively running on another CPU -+ * still, just relax and busy-wait without holding -+ * any locks. -+ * -+ * NOTE! Since we don't hold any locks, it's not -+ * even sure that "rq" stays as the right runqueue! 
-+ * But we don't care, since this will return false -+ * if the runqueue has changed and p is actually now -+ * running somewhere else! -+ */ -+ while (task_running(rq, p)) { -+ if (match_state && unlikely(p->state != match_state)) -+ return 0; -+ cpu_relax(); -+ } -+ -+ /* -+ * Ok, time to look more closely! We need the rq -+ * lock now, to be *sure*. If we're wrong, we'll -+ * just go back and repeat. -+ */ -+ rq = task_rq_lock(p, &rf); -+ trace_sched_wait_task(p); -+ running = task_running(rq, p); -+ queued = task_on_rq_queued(p); -+ ncsw = 0; -+ if (!match_state || p->state == match_state) -+ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ -+ task_rq_unlock(rq, p, &rf); -+ -+ /* -+ * If it changed from the expected state, bail out now. -+ */ -+ if (unlikely(!ncsw)) -+ break; -+ -+ /* -+ * Was it really running after all now that we -+ * checked with the proper locks actually held? -+ * -+ * Oops. Go back and try again.. -+ */ -+ if (unlikely(running)) { -+ cpu_relax(); -+ continue; -+ } -+ -+ /* -+ * It's not enough that it's not actively running, -+ * it must be off the runqueue _entirely_, and not -+ * preempted! -+ * -+ * So if it was still runnable (but just not actively -+ * running right now), it's preempted, and we should -+ * yield - it could be a while. -+ */ -+ if (unlikely(queued)) { -+ ktime_t to = NSEC_PER_SEC / HZ; -+ -+ set_current_state(TASK_UNINTERRUPTIBLE); -+ schedule_hrtimeout(&to, HRTIMER_MODE_REL); -+ continue; -+ } -+ -+ /* -+ * Ahh, all good. It wasn't running, and it wasn't -+ * runnable, which means that it will never become -+ * running in the future either. We're all done! -+ */ -+ break; -+ } -+ -+ return ncsw; -+} -+ -+/*** -+ * kick_process - kick a running thread to enter/exit the kernel -+ * @p: the to-be-kicked thread -+ * -+ * Cause a process which is running on another CPU to enter -+ * kernel-mode, without any delay. (to get signals handled.) 
-+ * -+ * NOTE: this function doesn't have to take the runqueue lock, -+ * because all it wants to ensure is that the remote task enters -+ * the kernel. If the IPI races and the task has been migrated -+ * to another CPU then no harm is done and the purpose has been -+ * achieved as well. -+ */ -+void kick_process(struct task_struct *p) -+{ -+ int cpu; -+ -+ preempt_disable(); -+ cpu = task_cpu(p); -+ if ((cpu != smp_processor_id()) && task_curr(p)) -+ smp_sched_reschedule(cpu); -+ preempt_enable(); -+} -+EXPORT_SYMBOL_GPL(kick_process); -+#endif -+ -+/* -+ * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the -+ * basis of earlier deadlines. SCHED_IDLEPRIO don't preempt anything else or -+ * between themselves, they cooperatively multitask. An idle rq scores as -+ * prio PRIO_LIMIT so it is always preempted. -+ */ -+static inline bool -+can_preempt(struct task_struct *p, int prio, u64 deadline) -+{ -+ /* Better static priority RT task or better policy preemption */ -+ if (p->prio < prio) -+ return true; -+ if (p->prio > prio) -+ return false; -+ if (p->policy == SCHED_BATCH) -+ return false; -+ /* SCHED_NORMAL and ISO will preempt based on deadline */ -+ if (!deadline_before(p->deadline, deadline)) -+ return false; -+ return true; -+} -+ -+#ifdef CONFIG_SMP -+ -+static inline bool is_per_cpu_kthread(struct task_struct *p) -+{ -+ if (!(p->flags & PF_KTHREAD)) -+ return false; -+ -+ if (p->nr_cpus_allowed != 1) -+ return false; -+ -+ return true; -+} -+ -+/* -+ * Per-CPU kthreads are allowed to run on !active && online CPUs, see -+ * __set_cpus_allowed_ptr(). -+ */ -+static inline bool is_cpu_allowed(struct task_struct *p, int cpu) -+{ -+ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) -+ return false; -+ -+ if (is_per_cpu_kthread(p)) -+ return cpu_online(cpu); -+ -+ return cpu_active(cpu); -+} -+ -+/* -+ * Check to see if p can run on cpu, and if not, whether there are any online -+ * CPUs it can run on instead. 
This only happens with the hotplug threads that -+ * bring up the CPUs. -+ */ -+static inline bool sched_other_cpu(struct task_struct *p, int cpu) -+{ -+ if (likely(cpumask_test_cpu(cpu, p->cpus_ptr))) -+ return false; -+ if (p->nr_cpus_allowed == 1) { -+ cpumask_t valid_mask; -+ -+ cpumask_and(&valid_mask, p->cpus_ptr, cpu_online_mask); -+ if (unlikely(cpumask_empty(&valid_mask))) -+ return false; -+ } -+ return true; -+} -+ -+static inline bool needs_other_cpu(struct task_struct *p, int cpu) -+{ -+ if (cpumask_test_cpu(cpu, p->cpus_ptr)) -+ return false; -+ return true; -+} -+ -+#define cpu_online_map (*(cpumask_t *)cpu_online_mask) -+ -+static void try_preempt(struct task_struct *p, struct rq *this_rq) -+{ -+ int i, this_entries = rq_load(this_rq); -+ cpumask_t tmp; -+ -+ if (suitable_idle_cpus(p) && resched_best_idle(p, task_cpu(p))) -+ return; -+ -+ /* IDLEPRIO tasks never preempt anything but idle */ -+ if (p->policy == SCHED_IDLEPRIO) -+ return; -+ -+ cpumask_and(&tmp, &cpu_online_map, p->cpus_ptr); -+ -+ for (i = 0; i < num_online_cpus(); i++) { -+ struct rq *rq = this_rq->cpu_order[i]; -+ -+ if (!cpumask_test_cpu(rq->cpu, &tmp)) -+ continue; -+ -+ if (!sched_interactive && rq != this_rq && rq_load(rq) <= this_entries) -+ continue; -+ if (smt_schedule(p, rq) && can_preempt(p, rq->rq_prio, rq->rq_deadline)) { -+ /* We set rq->preempting lockless, it's a hint only */ -+ rq->preempting = p; -+ resched_curr(rq); -+ return; -+ } -+ } -+} -+ -+static int __set_cpus_allowed_ptr(struct task_struct *p, -+ const struct cpumask *new_mask, bool check); -+#else /* CONFIG_SMP */ -+static inline bool needs_other_cpu(struct task_struct *p, int cpu) -+{ -+ return false; -+} -+ -+static void try_preempt(struct task_struct *p, struct rq *this_rq) -+{ -+ if (p->policy == SCHED_IDLEPRIO) -+ return; -+ if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline)) -+ resched_curr(uprq); -+} -+ -+static inline int __set_cpus_allowed_ptr(struct task_struct *p, -+ const struct cpumask 
*new_mask, bool check) -+{ -+ return set_cpus_allowed_ptr(p, new_mask); -+} -+#endif /* CONFIG_SMP */ -+ -+/* -+ * wake flags -+ */ -+#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ -+#define WF_FORK 0x02 /* child wakeup after fork */ -+#define WF_MIGRATED 0x04 /* internal use, task got migrated */ -+ -+static void -+ttwu_stat(struct task_struct *p, int cpu, int wake_flags) -+{ -+ struct rq *rq; -+ -+ if (!schedstat_enabled()) -+ return; -+ -+ rq = this_rq(); -+ -+#ifdef CONFIG_SMP -+ if (cpu == rq->cpu) { -+ __schedstat_inc(rq->ttwu_local); -+ } else { -+ struct sched_domain *sd; -+ -+ rcu_read_lock(); -+ for_each_domain(rq->cpu, sd) { -+ if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { -+ __schedstat_inc(sd->ttwu_wake_remote); -+ break; -+ } -+ } -+ rcu_read_unlock(); -+ } -+ -+#endif /* CONFIG_SMP */ -+ -+ __schedstat_inc(rq->ttwu_count); -+} -+ -+/* -+ * Mark the task runnable and perform wakeup-preemption. -+ */ -+static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) -+{ -+ /* -+ * Sync wakeups (i.e. those types of wakeups where the waker -+ * has indicated that it will leave the CPU in short order) -+ * don't trigger a preemption if there are no idle cpus, -+ * instead waiting for current to deschedule. -+ */ -+ if (wake_flags & WF_SYNC) -+ resched_suitable_idle(p); -+ else -+ try_preempt(p, rq); -+ p->state = TASK_RUNNING; -+ trace_sched_wakeup(p); -+} -+ -+static void -+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) -+{ -+ int en_flags = ENQUEUE_WAKEUP; -+ -+ lockdep_assert_held(rq->lock); -+ -+#ifdef CONFIG_SMP -+ if (p->sched_contributes_to_load) -+ rq->nr_uninterruptible--; -+ -+ if (wake_flags & WF_MIGRATED) -+ en_flags |= ENQUEUE_MIGRATED; -+#endif -+ -+ activate_task(rq, p, en_flags); -+ ttwu_do_wakeup(rq, p, wake_flags); -+} -+ -+/* -+ * Called in case the task @p isn't fully descheduled from its runqueue, -+ * in this case we must do a remote wakeup. 
Its a 'light' wakeup though, -+ * since all we need to do is flip p->state to TASK_RUNNING, since -+ * the task is still ->on_rq. -+ */ -+static int ttwu_remote(struct task_struct *p, int wake_flags) -+{ -+ struct rq *rq; -+ int ret = 0; -+ -+ rq = __task_rq_lock(p, NULL); -+ if (likely(task_on_rq_queued(p))) { -+ ttwu_do_wakeup(rq, p, wake_flags); -+ ret = 1; -+ } -+ __task_rq_unlock(rq, NULL); -+ -+ return ret; -+} -+ -+#ifdef CONFIG_SMP -+void sched_ttwu_pending(void) -+{ -+ struct rq *rq = this_rq(); -+ struct llist_node *llist = llist_del_all(&rq->wake_list); -+ struct task_struct *p, *t; -+ struct rq_flags rf; -+ -+ if (!llist) -+ return; -+ -+ rq_lock_irqsave(rq, &rf); -+ -+ llist_for_each_entry_safe(p, t, llist, wake_entry) -+ ttwu_do_activate(rq, p, 0); -+ -+ rq_unlock_irqrestore(rq, &rf); -+} -+ -+void scheduler_ipi(void) -+{ -+ /* -+ * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting -+ * TIF_NEED_RESCHED remotely (for the first time) will also send -+ * this IPI. -+ */ -+ preempt_fold_need_resched(); -+ -+ if (llist_empty(&this_rq()->wake_list) && (!idle_cpu(smp_processor_id()) || need_resched())) -+ return; -+ -+ /* -+ * Not all reschedule IPI handlers call irq_enter/irq_exit, since -+ * traditionally all their work was done from the interrupt return -+ * path. Now that we actually do some work, we need to make sure -+ * we do call them. -+ * -+ * Some archs already do call them, luckily irq_enter/exit nest -+ * properly. -+ * -+ * Arguably we should visit all archs and update all handlers, -+ * however a fair share of IPIs are still resched only so this would -+ * somewhat pessimize the simple resched case. 
-+ */ -+ irq_enter(); -+ sched_ttwu_pending(); -+ irq_exit(); -+} -+ -+static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { -+ if (!set_nr_if_polling(rq->idle)) -+ smp_sched_reschedule(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); -+ } -+} -+ -+void wake_up_if_idle(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ struct rq_flags rf; -+ -+ rcu_read_lock(); -+ -+ if (!is_idle_task(rcu_dereference(rq->curr))) -+ goto out; -+ -+ if (set_nr_if_polling(rq->idle)) { -+ trace_sched_wake_idle_without_ipi(cpu); -+ } else { -+ rq_lock_irqsave(rq, &rf); -+ if (likely(is_idle_task(rq->curr))) -+ smp_sched_reschedule(cpu); -+ /* Else cpu is not in idle, do nothing here */ -+ rq_unlock_irqrestore(rq, &rf); -+ } -+ -+out: -+ rcu_read_unlock(); -+} -+ -+static int valid_task_cpu(struct task_struct *p) -+{ -+ cpumask_t valid_mask; -+ -+ if (p->flags & PF_KTHREAD) -+ cpumask_and(&valid_mask, p->cpus_ptr, cpu_all_mask); -+ else -+ cpumask_and(&valid_mask, p->cpus_ptr, cpu_active_mask); -+ -+ if (unlikely(!cpumask_weight(&valid_mask))) { -+ /* We shouldn't be hitting this any more */ -+ printk(KERN_WARNING "SCHED: No cpumask for %s/%d weight %d\n", p->comm, -+ p->pid, cpumask_weight(p->cpus_ptr)); -+ return cpumask_any(p->cpus_ptr); -+ } -+ return cpumask_any(&valid_mask); -+} -+ -+/* -+ * For a task that's just being woken up we have a valuable balancing -+ * opportunity so choose the nearest cache most lightly loaded runqueue. -+ * Entered with rq locked and returns with the chosen runqueue locked. 
-+ */ -+static inline int select_best_cpu(struct task_struct *p) -+{ -+ unsigned int idlest = ~0U; -+ struct rq *rq = NULL; -+ int i; -+ -+ if (suitable_idle_cpus(p)) { -+ int cpu = task_cpu(p); -+ -+ if (unlikely(needs_other_cpu(p, cpu))) -+ cpu = valid_task_cpu(p); -+ rq = resched_best_idle(p, cpu); -+ if (likely(rq)) -+ return rq->cpu; -+ } -+ -+ for (i = 0; i < num_online_cpus(); i++) { -+ struct rq *other_rq = task_rq(p)->cpu_order[i]; -+ int entries; -+ -+ if (!other_rq->online) -+ continue; -+ if (needs_other_cpu(p, other_rq->cpu)) -+ continue; -+ entries = rq_load(other_rq); -+ if (entries >= idlest) -+ continue; -+ idlest = entries; -+ rq = other_rq; -+ } -+ if (unlikely(!rq)) -+ return task_cpu(p); -+ return rq->cpu; -+} -+#else /* CONFIG_SMP */ -+static int valid_task_cpu(struct task_struct *p) -+{ -+ return 0; -+} -+ -+static inline int select_best_cpu(struct task_struct *p) -+{ -+ return 0; -+} -+ -+static struct rq *resched_best_idle(struct task_struct *p, int cpu) -+{ -+ return NULL; -+} -+#endif /* CONFIG_SMP */ -+ -+static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+#if defined(CONFIG_SMP) -+ if (!cpus_share_cache(smp_processor_id(), cpu)) { -+ sched_clock_cpu(cpu); /* Sync clocks across CPUs */ -+ ttwu_queue_remote(p, cpu, wake_flags); -+ return; -+ } -+#endif -+ rq_lock(rq); -+ ttwu_do_activate(rq, p, wake_flags); -+ rq_unlock(rq); -+} -+ -+/*** -+ * try_to_wake_up - wake up a thread -+ * @p: the thread to be awakened -+ * @state: the mask of task states that can be woken -+ * @wake_flags: wake modifier flags (WF_*) -+ * -+ * Put it on the run-queue if it's not already there. The "current" -+ * thread is always on the run-queue (except when the actual -+ * re-schedule is in progress), and as such you're allowed to do -+ * the simpler "current->state = TASK_RUNNING" to mark yourself -+ * runnable without the overhead of this. 
-+ * -+ * Return: %true if @p was woken up, %false if it was already running. -+ * or @state didn't match @p's state. -+ */ -+static int -+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) -+{ -+ unsigned long flags; -+ int cpu, success = 0; -+ -+ preempt_disable(); -+ if (p == current) { -+ /* -+ * We're waking current, this means 'p->on_rq' and 'task_cpu(p) -+ * == smp_processor_id()'. Together this means we can special -+ * case the whole 'p->on_rq && ttwu_remote()' case below -+ * without taking any locks. -+ * -+ * In particular: -+ * - we rely on Program-Order guarantees for all the ordering, -+ * - we're serialized against set_special_state() by virtue of -+ * it disabling IRQs (this allows not taking ->pi_lock). -+ */ -+ if (!(p->state & state)) -+ goto out; -+ -+ success = 1; -+ cpu = task_cpu(p); -+ trace_sched_waking(p); -+ p->state = TASK_RUNNING; -+ trace_sched_wakeup(p); -+ goto out; -+ } -+ -+ /* -+ * If we are going to wake up a thread waiting for CONDITION we -+ * need to ensure that CONDITION=1 done by the caller can not be -+ * reordered with p->state check below. This pairs with mb() in -+ * set_current_state() the waiting thread does. -+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ smp_mb__after_spinlock(); -+ if (!(p->state & state)) -+ goto unlock; -+ -+ trace_sched_waking(p); -+ -+ /* We're going to change ->state: */ -+ success = 1; -+ cpu = task_cpu(p); -+ -+ /* -+ * Ensure we load p->on_rq _after_ p->state, otherwise it would -+ * be possible to, falsely, observe p->on_rq == 0 and get stuck -+ * in smp_cond_load_acquire() below. 
-+ * -+ * sched_ttwu_pending() try_to_wake_up() -+ * STORE p->on_rq = 1 LOAD p->state -+ * UNLOCK rq->lock -+ * -+ * __schedule() (switch to task 'p') -+ * LOCK rq->lock smp_rmb(); -+ * smp_mb__after_spinlock(); -+ * UNLOCK rq->lock -+ * -+ * [task p] -+ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq -+ * -+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in -+ * __schedule(). See the comment for smp_mb__after_spinlock(). -+ */ -+ smp_rmb(); -+ if (p->on_rq && ttwu_remote(p, wake_flags)) -+ goto unlock; -+ -+#ifdef CONFIG_SMP -+ /* -+ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be -+ * possible to, falsely, observe p->on_cpu == 0. -+ * -+ * One must be running (->on_cpu == 1) in order to remove oneself -+ * from the runqueue. -+ * -+ * __schedule() (switch to task 'p') try_to_wake_up() -+ * STORE p->on_cpu = 1 LOAD p->on_rq -+ * UNLOCK rq->lock -+ * -+ * __schedule() (put 'p' to sleep) -+ * LOCK rq->lock smp_rmb(); -+ * smp_mb__after_spinlock(); -+ * STORE p->on_rq = 0 LOAD p->on_cpu -+ * -+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in -+ * __schedule(). See the comment for smp_mb__after_spinlock(). -+ */ -+ smp_rmb(); -+ -+ /* -+ * If the owning (remote) CPU is still in the middle of schedule() with -+ * this task as prev, wait until its done referencing the task. -+ * -+ * Pairs with the smp_store_release() in finish_task(). -+ * -+ * This ensures that tasks getting woken will be fully ordered against -+ * their previous state and preserve Program Order. 
-+ */ -+ smp_cond_load_acquire(&p->on_cpu, !VAL); -+ -+ p->sched_contributes_to_load = !!task_contributes_to_load(p); -+ p->state = TASK_WAKING; -+ -+ if (p->in_iowait) { -+ delayacct_blkio_end(p); -+ atomic_dec(&task_rq(p)->nr_iowait); -+ } -+ -+ cpu = select_best_cpu(p); -+ if (task_cpu(p) != cpu) { -+ wake_flags |= WF_MIGRATED; -+ psi_ttwu_dequeue(p); -+ set_task_cpu(p, cpu); -+ } -+ -+#else /* CONFIG_SMP */ -+ -+ if (p->in_iowait) { -+ delayacct_blkio_end(p); -+ atomic_dec(&task_rq(p)->nr_iowait); -+ } -+ -+#endif /* CONFIG_SMP */ -+ -+ ttwu_queue(p, cpu, wake_flags); -+unlock: -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+out: -+ if (success) -+ ttwu_stat(p, cpu, wake_flags); -+ preempt_enable(); -+ -+ return success; -+} -+ -+/** -+ * wake_up_process - Wake up a specific process -+ * @p: The process to be woken up. -+ * -+ * Attempt to wake up the nominated process and move it to the set of runnable -+ * processes. -+ * -+ * Return: 1 if the process was woken up, 0 if it was already running. -+ * -+ * This function executes a full memory barrier before accessing the task state. -+ */ -+int wake_up_process(struct task_struct *p) -+{ -+ return try_to_wake_up(p, TASK_NORMAL, 0); -+} -+EXPORT_SYMBOL(wake_up_process); -+ -+int wake_up_state(struct task_struct *p, unsigned int state) -+{ -+ return try_to_wake_up(p, state, 0); -+} -+ -+static void time_slice_expired(struct task_struct *p, struct rq *rq); -+ -+/* -+ * Perform scheduler related setup for a newly forked process p. -+ * p is forked by current. -+ */ -+int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) -+{ -+ unsigned long flags; -+ -+#ifdef CONFIG_PREEMPT_NOTIFIERS -+ INIT_HLIST_HEAD(&p->preempt_notifiers); -+#endif -+ -+#ifdef CONFIG_COMPACTION -+ p->capture_control = NULL; -+#endif -+ -+ /* -+ * We mark the process as NEW here. 
This guarantees that -+ * nobody will actually run it, and a signal or other external -+ * event cannot wake it up and insert it on the runqueue either. -+ */ -+ p->state = TASK_NEW; -+ -+ /* -+ * The process state is set to the same value of the process executing -+ * do_fork() code. That is running. This guarantees that nobody will -+ * actually run it, and a signal or other external event cannot wake -+ * it up and insert it on the runqueue either. -+ */ -+ -+ /* Should be reset in fork.c but done here for ease of MuQSS patching */ -+ p->on_cpu = -+ p->on_rq = -+ p->utime = -+ p->stime = -+ p->sched_time = -+ p->stime_ns = -+ p->utime_ns = 0; -+ skiplist_node_init(&p->node); -+ -+ /* -+ * Revert to default priority/policy on fork if requested. -+ */ -+ if (unlikely(p->sched_reset_on_fork)) { -+ if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { -+ p->policy = SCHED_NORMAL; -+ p->normal_prio = normal_prio(p); -+ } -+ -+ if (PRIO_TO_NICE(p->static_prio) < 0) { -+ p->static_prio = NICE_TO_PRIO(0); -+ p->normal_prio = p->static_prio; -+ } -+ -+ /* -+ * We don't need the reset flag anymore after the fork. It has -+ * fulfilled its duty: -+ */ -+ p->sched_reset_on_fork = 0; -+ } -+ -+ /* -+ * Silence PROVE_RCU. 
-+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ set_task_cpu(p, smp_processor_id()); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+#ifdef CONFIG_SCHED_INFO -+ if (unlikely(sched_info_on())) -+ memset(&p->sched_info, 0, sizeof(p->sched_info)); -+#endif -+ init_task_preempt_count(p); -+ -+ return 0; -+} -+ -+#ifdef CONFIG_SCHEDSTATS -+ -+DEFINE_STATIC_KEY_FALSE(sched_schedstats); -+static bool __initdata __sched_schedstats = false; -+ -+static void set_schedstats(bool enabled) -+{ -+ if (enabled) -+ static_branch_enable(&sched_schedstats); -+ else -+ static_branch_disable(&sched_schedstats); -+} -+ -+void force_schedstat_enabled(void) -+{ -+ if (!schedstat_enabled()) { -+ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); -+ static_branch_enable(&sched_schedstats); -+ } -+} -+ -+static int __init setup_schedstats(char *str) -+{ -+ int ret = 0; -+ if (!str) -+ goto out; -+ -+ /* -+ * This code is called before jump labels have been set up, so we can't -+ * change the static branch directly just yet. Instead set a temporary -+ * variable so init_schedstats() can do it later. 
-+ */ -+ if (!strcmp(str, "enable")) { -+ __sched_schedstats = true; -+ ret = 1; -+ } else if (!strcmp(str, "disable")) { -+ __sched_schedstats = false; -+ ret = 1; -+ } -+out: -+ if (!ret) -+ pr_warn("Unable to parse schedstats=\n"); -+ -+ return ret; -+} -+__setup("schedstats=", setup_schedstats); -+ -+static void __init init_schedstats(void) -+{ -+ set_schedstats(__sched_schedstats); -+} -+ -+#ifdef CONFIG_PROC_SYSCTL -+int sysctl_schedstats(struct ctl_table *table, int write, -+ void __user *buffer, size_t *lenp, loff_t *ppos) -+{ -+ struct ctl_table t; -+ int err; -+ int state = static_branch_likely(&sched_schedstats); -+ -+ if (write && !capable(CAP_SYS_ADMIN)) -+ return -EPERM; -+ -+ t = *table; -+ t.data = &state; -+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); -+ if (err < 0) -+ return err; -+ if (write) -+ set_schedstats(state); -+ return err; -+} -+#endif /* CONFIG_PROC_SYSCTL */ -+#else /* !CONFIG_SCHEDSTATS */ -+static inline void init_schedstats(void) {} -+#endif /* CONFIG_SCHEDSTATS */ -+ -+static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p); -+ -+static void account_task_cpu(struct rq *rq, struct task_struct *p) -+{ -+ update_clocks(rq); -+ /* This isn't really a context switch but accounting is the same */ -+ update_cpu_clock_switch(rq, p); -+ p->last_ran = rq->niffies; -+} -+ -+bool sched_smp_initialized __read_mostly; -+ -+static inline int hrexpiry_enabled(struct rq *rq) -+{ -+ if (unlikely(!cpu_active(cpu_of(rq)) || !sched_smp_initialized)) -+ return 0; -+ return hrtimer_is_hres_active(&rq->hrexpiry_timer); -+} -+ -+/* -+ * Use HR-timers to deliver accurate preemption points. -+ */ -+static inline void hrexpiry_clear(struct rq *rq) -+{ -+ if (!hrexpiry_enabled(rq)) -+ return; -+ if (hrtimer_active(&rq->hrexpiry_timer)) -+ hrtimer_cancel(&rq->hrexpiry_timer); -+} -+ -+/* -+ * High-resolution time_slice expiry. -+ * Runs from hardirq context with interrupts disabled. 
-+ */ -+static enum hrtimer_restart hrexpiry(struct hrtimer *timer) -+{ -+ struct rq *rq = container_of(timer, struct rq, hrexpiry_timer); -+ struct task_struct *p; -+ -+ /* This can happen during CPU hotplug / resume */ -+ if (unlikely(cpu_of(rq) != smp_processor_id())) -+ goto out; -+ -+ /* -+ * We're doing this without the runqueue lock but this should always -+ * be run on the local CPU. Time slice should run out in __schedule -+ * but we set it to zero here in case niffies is slightly less. -+ */ -+ p = rq->curr; -+ p->time_slice = 0; -+ __set_tsk_resched(p); -+out: -+ return HRTIMER_NORESTART; -+} -+ -+/* -+ * Called to set the hrexpiry timer state. -+ * -+ * called with irqs disabled from the local CPU only -+ */ -+static void hrexpiry_start(struct rq *rq, u64 delay) -+{ -+ if (!hrexpiry_enabled(rq)) -+ return; -+ -+ hrtimer_start(&rq->hrexpiry_timer, ns_to_ktime(delay), -+ HRTIMER_MODE_REL_PINNED); -+} -+ -+static void init_rq_hrexpiry(struct rq *rq) -+{ -+ hrtimer_init(&rq->hrexpiry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); -+ rq->hrexpiry_timer.function = hrexpiry; -+} -+ -+static inline int rq_dither(struct rq *rq) -+{ -+ if (!hrexpiry_enabled(rq)) -+ return HALF_JIFFY_US; -+ return 0; -+} -+ -+/* -+ * wake_up_new_task - wake up a newly created task for the first time. -+ * -+ * This function will do some initial scheduler statistics housekeeping -+ * that must be done for every newly created context, then puts the task -+ * on the runqueue and wakes it. 
-+ */ -+void wake_up_new_task(struct task_struct *p) -+{ -+ struct task_struct *parent, *rq_curr; -+ struct rq *rq, *new_rq; -+ unsigned long flags; -+ -+ parent = p->parent; -+ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ p->state = TASK_RUNNING; -+ /* Task_rq can't change yet on a new task */ -+ new_rq = rq = task_rq(p); -+ if (unlikely(needs_other_cpu(p, task_cpu(p)))) { -+ set_task_cpu(p, valid_task_cpu(p)); -+ new_rq = task_rq(p); -+ } -+ -+ double_rq_lock(rq, new_rq); -+ rq_curr = rq->curr; -+ -+ /* -+ * Make sure we do not leak PI boosting priority to the child. -+ */ -+ p->prio = rq_curr->normal_prio; -+ -+ trace_sched_wakeup_new(p); -+ -+ /* -+ * Share the timeslice between parent and child, thus the -+ * total amount of pending timeslices in the system doesn't change, -+ * resulting in more scheduling fairness. If it's negative, it won't -+ * matter since that's the same as being 0. rq->rq_deadline is only -+ * modified within schedule() so it is always equal to -+ * current->deadline. -+ */ -+ account_task_cpu(rq, rq_curr); -+ p->last_ran = rq_curr->last_ran; -+ if (likely(rq_curr->policy != SCHED_FIFO)) { -+ rq_curr->time_slice /= 2; -+ if (rq_curr->time_slice < RESCHED_US) { -+ /* -+ * Forking task has run out of timeslice. Reschedule it and -+ * start its child with a new time slice and deadline. The -+ * child will end up running first because its deadline will -+ * be slightly earlier. -+ */ -+ __set_tsk_resched(rq_curr); -+ time_slice_expired(p, new_rq); -+ if (suitable_idle_cpus(p)) -+ resched_best_idle(p, task_cpu(p)); -+ else if (unlikely(rq != new_rq)) -+ try_preempt(p, new_rq); -+ } else { -+ p->time_slice = rq_curr->time_slice; -+ if (rq_curr == parent && rq == new_rq && !suitable_idle_cpus(p)) { -+ /* -+ * The VM isn't cloned, so we're in a good position to -+ * do child-runs-first in anticipation of an exec. This -+ * usually avoids a lot of COW overhead. 
-+ */ -+ __set_tsk_resched(rq_curr); -+ } else { -+ /* -+ * Adjust the hrexpiry since rq_curr will keep -+ * running and its timeslice has been shortened. -+ */ -+ hrexpiry_start(rq, US_TO_NS(rq_curr->time_slice)); -+ try_preempt(p, new_rq); -+ } -+ } -+ } else { -+ time_slice_expired(p, new_rq); -+ try_preempt(p, new_rq); -+ } -+ activate_task(new_rq, p, 0); -+ double_rq_unlock(rq, new_rq); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+} -+ -+#ifdef CONFIG_PREEMPT_NOTIFIERS -+ -+static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); -+ -+void preempt_notifier_inc(void) -+{ -+ static_branch_inc(&preempt_notifier_key); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_inc); -+ -+void preempt_notifier_dec(void) -+{ -+ static_branch_dec(&preempt_notifier_key); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_dec); -+ -+/** -+ * preempt_notifier_register - tell me when current is being preempted & rescheduled -+ * @notifier: notifier struct to register -+ */ -+void preempt_notifier_register(struct preempt_notifier *notifier) -+{ -+ if (!static_branch_unlikely(&preempt_notifier_key)) -+ WARN(1, "registering preempt_notifier while notifiers disabled\n"); -+ -+ hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_register); -+ -+/** -+ * preempt_notifier_unregister - no longer interested in preemption notifications -+ * @notifier: notifier struct to unregister -+ * -+ * This is *not* safe to call from within a preemption notifier. 
-+ */ -+void preempt_notifier_unregister(struct preempt_notifier *notifier) -+{ -+ hlist_del(¬ifier->link); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_unregister); -+ -+static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+ struct preempt_notifier *notifier; -+ -+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) -+ notifier->ops->sched_in(notifier, raw_smp_processor_id()); -+} -+ -+static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+ if (static_branch_unlikely(&preempt_notifier_key)) -+ __fire_sched_in_preempt_notifiers(curr); -+} -+ -+static void -+__fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+ struct preempt_notifier *notifier; -+ -+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) -+ notifier->ops->sched_out(notifier, next); -+} -+ -+static __always_inline void -+fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+ if (static_branch_unlikely(&preempt_notifier_key)) -+ __fire_sched_out_preempt_notifiers(curr, next); -+} -+ -+#else /* !CONFIG_PREEMPT_NOTIFIERS */ -+ -+static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+} -+ -+static inline void -+fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+} -+ -+#endif /* CONFIG_PREEMPT_NOTIFIERS */ -+ -+static inline void prepare_task(struct task_struct *next) -+{ -+ /* -+ * Claim the task as running, we do this before switching to it -+ * such that any running task will have this set. -+ */ -+ next->on_cpu = 1; -+} -+ -+static inline void finish_task(struct task_struct *prev) -+{ -+#ifdef CONFIG_SMP -+ /* -+ * After ->on_cpu is cleared, the task can be moved to a different CPU. -+ * We must ensure this doesn't happen until the switch is completely -+ * finished. -+ * -+ * In particular, the load of prev->state in finish_task_switch() must -+ * happen before this. 
-+ * -+ * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). -+ */ -+ smp_store_release(&prev->on_cpu, 0); -+#endif -+} -+ -+static inline void -+prepare_lock_switch(struct rq *rq, struct task_struct *next) -+{ -+ /* -+ * Since the runqueue lock will be released by the next -+ * task (which is an invalid locking op but in the case -+ * of the scheduler it's an obvious special-case), so we -+ * do an early lockdep release here: -+ */ -+ spin_release(&rq->lock->dep_map, 1, _THIS_IP_); -+#ifdef CONFIG_DEBUG_SPINLOCK -+ /* this is a valid case when another task releases the spinlock */ -+ rq->lock->owner = next; -+#endif -+} -+ -+static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) -+{ -+ /* -+ * If we are tracking spinlock dependencies then we have to -+ * fix up the runqueue lock - which gets 'carried over' from -+ * prev into current: -+ */ -+ spin_acquire(&rq->lock->dep_map, 0, 0, _THIS_IP_); -+ -+#ifdef CONFIG_SMP -+ /* -+ * If prev was marked as migrating to another CPU in return_task, drop -+ * the local runqueue lock but leave interrupts disabled and grab the -+ * remote lock we're migrating it to before enabling them. -+ */ -+ if (unlikely(task_on_rq_migrating(prev))) { -+ sched_info_dequeued(rq, prev); -+ /* -+ * We move the ownership of prev to the new cpu now. ttwu can't -+ * activate prev to the wrong cpu since it has to grab this -+ * runqueue in ttwu_remote. 
-+ */ -+#ifdef CONFIG_THREAD_INFO_IN_TASK -+ prev->cpu = prev->wake_cpu; -+#else -+ task_thread_info(prev)->cpu = prev->wake_cpu; -+#endif -+ raw_spin_unlock(rq->lock); -+ -+ raw_spin_lock(&prev->pi_lock); -+ rq = __task_rq_lock(prev, NULL); -+ /* Check that someone else hasn't already queued prev */ -+ if (likely(!task_queued(prev))) { -+ enqueue_task(rq, prev, 0); -+ prev->on_rq = TASK_ON_RQ_QUEUED; -+ /* Wake up the CPU if it's not already running */ -+ resched_if_idle(rq); -+ } -+ raw_spin_unlock(&prev->pi_lock); -+ } -+#endif -+ rq_unlock(rq); -+ -+ do_pending_softirq(rq, current); -+ -+ local_irq_enable(); -+} -+ -+#ifndef prepare_arch_switch -+# define prepare_arch_switch(next) do { } while (0) -+#endif -+#ifndef finish_arch_switch -+# define finish_arch_switch(prev) do { } while (0) -+#endif -+#ifndef finish_arch_post_lock_switch -+# define finish_arch_post_lock_switch() do { } while (0) -+#endif -+ -+/** -+ * prepare_task_switch - prepare to switch tasks -+ * @rq: the runqueue preparing to switch -+ * @next: the task we are going to switch to. -+ * -+ * This is called with the rq lock held and interrupts off. It must -+ * be paired with a subsequent finish_task_switch after the context -+ * switch. -+ * -+ * prepare_task_switch sets up locking and calls architecture specific -+ * hooks. -+ */ -+static inline void -+prepare_task_switch(struct rq *rq, struct task_struct *prev, -+ struct task_struct *next) -+{ -+ kcov_prepare_switch(prev); -+ sched_info_switch(rq, prev, next); -+ perf_event_task_sched_out(prev, next); -+ rseq_preempt(prev); -+ fire_sched_out_preempt_notifiers(prev, next); -+ prepare_task(next); -+ prepare_arch_switch(next); -+} -+ -+/** -+ * finish_task_switch - clean up after a task-switch -+ * @rq: runqueue associated with task-switch -+ * @prev: the thread we just switched away from. -+ * -+ * finish_task_switch must be called after the context switch, paired -+ * with a prepare_task_switch call before the context switch. 
-+ * finish_task_switch will reconcile locking set up by prepare_task_switch, -+ * and do any other architecture-specific cleanup actions. -+ * -+ * Note that we may have delayed dropping an mm in context_switch(). If -+ * so, we finish that here outside of the runqueue lock. (Doing it -+ * with the lock held can cause deadlocks; see schedule() for -+ * details.) -+ * -+ * The context switch have flipped the stack from under us and restored the -+ * local variables which were saved when this task called schedule() in the -+ * past. prev == current is still correct but we need to recalculate this_rq -+ * because prev may have moved to another CPU. -+ */ -+static void finish_task_switch(struct task_struct *prev) -+ __releases(rq->lock) -+{ -+ struct rq *rq = this_rq(); -+ struct mm_struct *mm = rq->prev_mm; -+ long prev_state; -+ -+ /* -+ * The previous task will have left us with a preempt_count of 2 -+ * because it left us after: -+ * -+ * schedule() -+ * preempt_disable(); // 1 -+ * __schedule() -+ * raw_spin_lock_irq(rq->lock) // 2 -+ * -+ * Also, see FORK_PREEMPT_COUNT. -+ */ -+ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, -+ "corrupted preempt_count: %s/%d/0x%x\n", -+ current->comm, current->pid, preempt_count())) -+ preempt_count_set(FORK_PREEMPT_COUNT); -+ -+ rq->prev_mm = NULL; -+ -+ /* -+ * A task struct has one reference for the use as "current". -+ * If a task dies, then it sets TASK_DEAD in tsk->state and calls -+ * schedule one last time. The schedule call will never return, and -+ * the scheduled task must drop that reference. -+ * -+ * We must observe prev->state before clearing prev->on_cpu (in -+ * finish_task), otherwise a concurrent wakeup can get prev -+ * running on another CPU and we could rave with its RUNNING -> DEAD -+ * transition, resulting in a double drop. 
-+ */ -+ prev_state = prev->state; -+ vtime_task_switch(prev); -+ perf_event_task_sched_in(prev, current); -+ finish_task(prev); -+ finish_lock_switch(rq, prev); -+ finish_arch_post_lock_switch(); -+ kcov_finish_switch(current); -+ -+ fire_sched_in_preempt_notifiers(current); -+ /* -+ * When switching through a kernel thread, the loop in -+ * membarrier_{private,global}_expedited() may have observed that -+ * kernel thread and not issued an IPI. It is therefore possible to -+ * schedule between user->kernel->user threads without passing though -+ * switch_mm(). Membarrier requires a barrier after storing to -+ * rq->curr, before returning to userspace, so provide them here: -+ * -+ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly -+ * provided by mmdrop(), -+ * - a sync_core for SYNC_CORE. -+ */ -+ if (mm) { -+ membarrier_mm_sync_core_before_usermode(mm); -+ mmdrop(mm); -+ } -+ if (unlikely(prev_state == TASK_DEAD)) { -+ /* -+ * Remove function-return probe instances associated with this -+ * task and put them back on the free list. -+ */ -+ kprobe_flush_task(prev); -+ -+ /* Task is done with its stack. */ -+ put_task_stack(prev); -+ -+ put_task_struct_rcu_user(prev); -+ } -+} -+ -+/** -+ * schedule_tail - first thing a freshly forked thread must call. -+ * @prev: the thread we just switched away from. -+ */ -+asmlinkage __visible void schedule_tail(struct task_struct *prev) -+{ -+ /* -+ * New tasks start with FORK_PREEMPT_COUNT, see there and -+ * finish_task_switch() for details. -+ * -+ * finish_task_switch() will drop rq->lock() and lower preempt_count -+ * and the preempt_enable() will end up enabling preemption (on -+ * PREEMPT_COUNT kernels). -+ */ -+ -+ finish_task_switch(prev); -+ preempt_enable(); -+ -+ if (current->set_child_tid) -+ put_user(task_pid_vnr(current), current->set_child_tid); -+ -+ calculate_sigpending(); -+} -+ -+/* -+ * context_switch - switch to the new MM and the new thread's register state. 
-+ */ -+static __always_inline void -+context_switch(struct rq *rq, struct task_struct *prev, -+ struct task_struct *next) -+{ -+ prepare_task_switch(rq, prev, next); -+ -+ /* -+ * For paravirt, this is coupled with an exit in switch_to to -+ * combine the page table reload and the switch backend into -+ * one hypercall. -+ */ -+ arch_start_context_switch(prev); -+ -+ /* -+ * kernel -> kernel lazy + transfer active -+ * user -> kernel lazy + mmgrab() active -+ * -+ * kernel -> user switch + mmdrop() active -+ * user -> user switch -+ */ -+ if (!next->mm) { // to kernel -+ enter_lazy_tlb(prev->active_mm, next); -+ -+ next->active_mm = prev->active_mm; -+ if (prev->mm) // from user -+ mmgrab(prev->active_mm); -+ else -+ prev->active_mm = NULL; -+ } else { // to user -+ membarrier_switch_mm(rq, prev->active_mm, next->mm); -+ /* -+ * sys_membarrier() requires an smp_mb() between setting -+ * rq->curr / membarrier_switch_mm() and returning to userspace. -+ * -+ * The below provides this either through switch_mm(), or in -+ * case 'prev->active_mm == next->mm' through -+ * finish_task_switch()'s mmdrop(). -+ */ -+ switch_mm_irqs_off(prev->active_mm, next->mm, next); -+ -+ if (!prev->mm) { // from kernel -+ /* will mmdrop() in finish_task_switch(). */ -+ rq->prev_mm = prev->active_mm; -+ prev->active_mm = NULL; -+ } -+ } -+ prepare_lock_switch(rq, next); -+ -+ /* Here we just switch the register state and the stack. */ -+ switch_to(prev, next, prev); -+ barrier(); -+ -+ finish_task_switch(prev); -+} -+ -+/* -+ * nr_running, nr_uninterruptible and nr_context_switches: -+ * -+ * externally visible scheduler statistics: current number of runnable -+ * threads, total number of context switches performed since bootup. 
-+ */ -+unsigned long nr_running(void) -+{ -+ unsigned long i, sum = 0; -+ -+ for_each_online_cpu(i) -+ sum += cpu_rq(i)->nr_running; -+ -+ return sum; -+} -+ -+static unsigned long nr_uninterruptible(void) -+{ -+ unsigned long i, sum = 0; -+ -+ for_each_online_cpu(i) -+ sum += cpu_rq(i)->nr_uninterruptible; -+ -+ return sum; -+} -+ -+/* -+ * Check if only the current task is running on the CPU. -+ * -+ * Caution: this function does not check that the caller has disabled -+ * preemption, thus the result might have a time-of-check-to-time-of-use -+ * race. The caller is responsible to use it correctly, for example: -+ * -+ * - from a non-preemptible section (of course) -+ * -+ * - from a thread that is bound to a single CPU -+ * -+ * - in a loop with very short iterations (e.g. a polling loop) -+ */ -+bool single_task_running(void) -+{ -+ if (rq_load(raw_rq()) == 1) -+ return true; -+ else -+ return false; -+} -+EXPORT_SYMBOL(single_task_running); -+ -+unsigned long long nr_context_switches(void) -+{ -+ int cpu; -+ unsigned long long sum = 0; -+ -+ for_each_possible_cpu(cpu) -+ sum += cpu_rq(cpu)->nr_switches; -+ -+ return sum; -+} -+ -+/* -+ * Consumers of these two interfaces, like for example the cpufreq menu -+ * governor are using nonsensical data. Boosting frequency for a CPU that has -+ * IO-wait which might not even end up running the task when it does become -+ * runnable. -+ */ -+ -+unsigned long nr_iowait_cpu(int cpu) -+{ -+ return atomic_read(&cpu_rq(cpu)->nr_iowait); -+} -+ -+/* -+ * IO-wait accounting, and how its mostly bollocks (on SMP). -+ * -+ * The idea behind IO-wait account is to account the idle time that we could -+ * have spend running if it were not for IO. That is, if we were to improve the -+ * storage performance, we'd have a proportional reduction in IO-wait time. 
-+ * -+ * This all works nicely on UP, where, when a task blocks on IO, we account -+ * idle time as IO-wait, because if the storage were faster, it could've been -+ * running and we'd not be idle. -+ * -+ * This has been extended to SMP, by doing the same for each CPU. This however -+ * is broken. -+ * -+ * Imagine for instance the case where two tasks block on one CPU, only the one -+ * CPU will have IO-wait accounted, while the other has regular idle. Even -+ * though, if the storage were faster, both could've ran at the same time, -+ * utilising both CPUs. -+ * -+ * This means, that when looking globally, the current IO-wait accounting on -+ * SMP is a lower bound, by reason of under accounting. -+ * -+ * Worse, since the numbers are provided per CPU, they are sometimes -+ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly -+ * associated with any one particular CPU, it can wake to another CPU than it -+ * blocked on. This means the per CPU IO-wait number is meaningless. -+ * -+ * Task CPU affinities can make all that even more 'interesting'. -+ */ -+ -+unsigned long nr_iowait(void) -+{ -+ unsigned long cpu, sum = 0; -+ -+ for_each_possible_cpu(cpu) -+ sum += nr_iowait_cpu(cpu); -+ -+ return sum; -+} -+ -+unsigned long nr_active(void) -+{ -+ return nr_running() + nr_uninterruptible(); -+} -+ -+/* Variables and functions for calc_load */ -+static unsigned long calc_load_update; -+unsigned long avenrun[3]; -+EXPORT_SYMBOL(avenrun); -+ -+/** -+ * get_avenrun - get the load average array -+ * @loads: pointer to dest load array -+ * @offset: offset to add -+ * @shift: shift count to shift the result left -+ * -+ * These values are estimates at best, so no need for locking. 
-+ */ -+void get_avenrun(unsigned long *loads, unsigned long offset, int shift) -+{ -+ loads[0] = (avenrun[0] + offset) << shift; -+ loads[1] = (avenrun[1] + offset) << shift; -+ loads[2] = (avenrun[2] + offset) << shift; -+} -+ -+/* -+ * calc_load - update the avenrun load estimates every LOAD_FREQ seconds. -+ */ -+void calc_global_load(unsigned long ticks) -+{ -+ long active; -+ -+ if (time_before(jiffies, READ_ONCE(calc_load_update))) -+ return; -+ active = nr_active() * FIXED_1; -+ -+ avenrun[0] = calc_load(avenrun[0], EXP_1, active); -+ avenrun[1] = calc_load(avenrun[1], EXP_5, active); -+ avenrun[2] = calc_load(avenrun[2], EXP_15, active); -+ -+ calc_load_update = jiffies + LOAD_FREQ; -+} -+ -+/** -+ * fixed_power_int - compute: x^n, in O(log n) time -+ * -+ * @x: base of the power -+ * @frac_bits: fractional bits of @x -+ * @n: power to raise @x to. -+ * -+ * By exploiting the relation between the definition of the natural power -+ * function: x^n := x*x*...*x (x multiplied by itself for n times), and -+ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, -+ * (where: n_i \elem {0, 1}, the binary vector representing n), -+ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is -+ * of course trivially computable in O(log_2 n), the length of our binary -+ * vector. 
-+ */ -+static unsigned long -+fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) -+{ -+ unsigned long result = 1UL << frac_bits; -+ -+ if (n) { -+ for (;;) { -+ if (n & 1) { -+ result *= x; -+ result += 1UL << (frac_bits - 1); -+ result >>= frac_bits; -+ } -+ n >>= 1; -+ if (!n) -+ break; -+ x *= x; -+ x += 1UL << (frac_bits - 1); -+ x >>= frac_bits; -+ } -+ } -+ -+ return result; -+} -+ -+/* -+ * a1 = a0 * e + a * (1 - e) -+ * -+ * a2 = a1 * e + a * (1 - e) -+ * = (a0 * e + a * (1 - e)) * e + a * (1 - e) -+ * = a0 * e^2 + a * (1 - e) * (1 + e) -+ * -+ * a3 = a2 * e + a * (1 - e) -+ * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) -+ * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) -+ * -+ * ... -+ * -+ * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] -+ * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) -+ * = a0 * e^n + a * (1 - e^n) -+ * -+ * [1] application of the geometric series: -+ * -+ * n 1 - x^(n+1) -+ * S_n := \Sum x^i = ------------- -+ * i=0 1 - x -+ */ -+unsigned long -+calc_load_n(unsigned long load, unsigned long exp, -+ unsigned long active, unsigned int n) -+{ -+ return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); -+} -+ -+DEFINE_PER_CPU(struct kernel_stat, kstat); -+DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); -+ -+EXPORT_PER_CPU_SYMBOL(kstat); -+EXPORT_PER_CPU_SYMBOL(kernel_cpustat); -+ -+#ifdef CONFIG_PARAVIRT -+static inline u64 steal_ticks(u64 steal) -+{ -+ if (unlikely(steal > NSEC_PER_SEC)) -+ return div_u64(steal, TICK_NSEC); -+ -+ return __iter_div_u64_rem(steal, TICK_NSEC, &steal); -+} -+#endif -+ -+#ifndef nsecs_to_cputime -+# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) -+#endif -+ -+/* -+ * On each tick, add the number of nanoseconds to the unbanked variables and -+ * once one tick's worth has accumulated, account it allowing for accurate -+ * sub-tick accounting and totals. Use the TICK_APPROX_NS to match the way we -+ * deduct nanoseconds. 
-+ */ -+static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long ns) -+{ -+ u64 *cpustat = kcpustat_this_cpu->cpustat; -+ unsigned long ticks; -+ -+ if (atomic_read(&rq->nr_iowait) > 0) { -+ rq->iowait_ns += ns; -+ if (rq->iowait_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(rq->iowait_ns); -+ cpustat[CPUTIME_IOWAIT] += (__force u64)TICK_APPROX_NS * ticks; -+ rq->iowait_ns %= JIFFY_NS; -+ } -+ } else { -+ rq->idle_ns += ns; -+ if (rq->idle_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(rq->idle_ns); -+ cpustat[CPUTIME_IDLE] += (__force u64)TICK_APPROX_NS * ticks; -+ rq->idle_ns %= JIFFY_NS; -+ } -+ } -+ acct_update_integrals(idle); -+} -+ -+static void pc_system_time(struct rq *rq, struct task_struct *p, -+ int hardirq_offset, unsigned long ns) -+{ -+ u64 *cpustat = kcpustat_this_cpu->cpustat; -+ unsigned long ticks; -+ -+ p->stime_ns += ns; -+ if (p->stime_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(p->stime_ns); -+ p->stime_ns %= JIFFY_NS; -+ p->stime += (__force u64)TICK_APPROX_NS * ticks; -+ account_group_system_time(p, TICK_APPROX_NS * ticks); -+ } -+ p->sched_time += ns; -+ account_group_exec_runtime(p, ns); -+ -+ if (hardirq_count() - hardirq_offset) { -+ rq->irq_ns += ns; -+ if (rq->irq_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(rq->irq_ns); -+ cpustat[CPUTIME_IRQ] += (__force u64)TICK_APPROX_NS * ticks; -+ rq->irq_ns %= JIFFY_NS; -+ } -+ } else if (in_serving_softirq()) { -+ rq->softirq_ns += ns; -+ if (rq->softirq_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(rq->softirq_ns); -+ cpustat[CPUTIME_SOFTIRQ] += (__force u64)TICK_APPROX_NS * ticks; -+ rq->softirq_ns %= JIFFY_NS; -+ } -+ } else { -+ rq->system_ns += ns; -+ if (rq->system_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(rq->system_ns); -+ cpustat[CPUTIME_SYSTEM] += (__force u64)TICK_APPROX_NS * ticks; -+ rq->system_ns %= JIFFY_NS; -+ } -+ } -+ acct_update_integrals(p); -+} -+ -+static void pc_user_time(struct rq *rq, struct task_struct *p, unsigned long ns) -+{ -+ u64 *cpustat = 
kcpustat_this_cpu->cpustat; -+ unsigned long ticks; -+ -+ p->utime_ns += ns; -+ if (p->utime_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(p->utime_ns); -+ p->utime_ns %= JIFFY_NS; -+ p->utime += (__force u64)TICK_APPROX_NS * ticks; -+ account_group_user_time(p, TICK_APPROX_NS * ticks); -+ } -+ p->sched_time += ns; -+ account_group_exec_runtime(p, ns); -+ -+ if (this_cpu_ksoftirqd() == p) { -+ /* -+ * ksoftirqd time do not get accounted in cpu_softirq_time. -+ * So, we have to handle it separately here. -+ */ -+ rq->softirq_ns += ns; -+ if (rq->softirq_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(rq->softirq_ns); -+ cpustat[CPUTIME_SOFTIRQ] += (__force u64)TICK_APPROX_NS * ticks; -+ rq->softirq_ns %= JIFFY_NS; -+ } -+ } -+ -+ if (task_nice(p) > 0 || idleprio_task(p)) { -+ rq->nice_ns += ns; -+ if (rq->nice_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(rq->nice_ns); -+ cpustat[CPUTIME_NICE] += (__force u64)TICK_APPROX_NS * ticks; -+ rq->nice_ns %= JIFFY_NS; -+ } -+ } else { -+ rq->user_ns += ns; -+ if (rq->user_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(rq->user_ns); -+ cpustat[CPUTIME_USER] += (__force u64)TICK_APPROX_NS * ticks; -+ rq->user_ns %= JIFFY_NS; -+ } -+ } -+ acct_update_integrals(p); -+} -+ -+/* -+ * This is called on clock ticks. -+ * Bank in p->sched_time the ns elapsed since the last tick or switch. -+ * CPU scheduler quota accounting is also performed here in microseconds. 
-+ */ -+static void update_cpu_clock_tick(struct rq *rq, struct task_struct *p) -+{ -+ s64 account_ns = rq->niffies - p->last_ran; -+ struct task_struct *idle = rq->idle; -+ -+ /* Accurate tick timekeeping */ -+ if (user_mode(get_irq_regs())) -+ pc_user_time(rq, p, account_ns); -+ else if (p != idle || (irq_count() != HARDIRQ_OFFSET)) { -+ pc_system_time(rq, p, HARDIRQ_OFFSET, account_ns); -+ } else -+ pc_idle_time(rq, idle, account_ns); -+ -+ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ -+ if (p->policy != SCHED_FIFO && p != idle) -+ p->time_slice -= NS_TO_US(account_ns); -+ -+ p->last_ran = rq->niffies; -+} -+ -+/* -+ * This is called on context switches. -+ * Bank in p->sched_time the ns elapsed since the last tick or switch. -+ * CPU scheduler quota accounting is also performed here in microseconds. -+ */ -+static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p) -+{ -+ s64 account_ns = rq->niffies - p->last_ran; -+ struct task_struct *idle = rq->idle; -+ -+ /* Accurate subtick timekeeping */ -+ if (p != idle) -+ pc_user_time(rq, p, account_ns); -+ else -+ pc_idle_time(rq, idle, account_ns); -+ -+ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ -+ if (p->policy != SCHED_FIFO && p != idle) -+ p->time_slice -= NS_TO_US(account_ns); -+} -+ -+/* -+ * Return any ns on the sched_clock that have not yet been accounted in -+ * @p in case that task is currently running. -+ * -+ * Called with task_rq_lock(p) held. -+ */ -+static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) -+{ -+ u64 ns = 0; -+ -+ /* -+ * Must be ->curr _and_ ->on_rq. If dequeued, we would -+ * project cycles that may never be accounted to this -+ * thread, breaking clock_gettime(). -+ */ -+ if (p == rq->curr && task_on_rq_queued(p)) { -+ update_clocks(rq); -+ ns = rq->niffies - p->last_ran; -+ } -+ -+ return ns; -+} -+ -+/* -+ * Return accounted runtime for the task. 
-+ * Return separately the current's pending runtime that have not been -+ * accounted yet. -+ * -+ */ -+unsigned long long task_sched_runtime(struct task_struct *p) -+{ -+ struct rq_flags rf; -+ struct rq *rq; -+ u64 ns; -+ -+#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) -+ /* -+ * 64-bit doesn't need locks to atomically read a 64-bit value. -+ * So we have a optimisation chance when the task's delta_exec is 0. -+ * Reading ->on_cpu is racy, but this is ok. -+ * -+ * If we race with it leaving CPU, we'll take a lock. So we're correct. -+ * If we race with it entering CPU, unaccounted time is 0. This is -+ * indistinguishable from the read occurring a few cycles earlier. -+ * If we see ->on_cpu without ->on_rq, the task is leaving, and has -+ * been accounted, so we're correct here as well. -+ */ -+ if (!p->on_cpu || !task_on_rq_queued(p)) -+ return tsk_seruntime(p); -+#endif -+ -+ rq = task_rq_lock(p, &rf); -+ ns = p->sched_time + do_task_delta_exec(p, rq); -+ task_rq_unlock(rq, p, &rf); -+ -+ return ns; -+} -+ -+/* -+ * Functions to test for when SCHED_ISO tasks have used their allocated -+ * quota as real time scheduling and convert them back to SCHED_NORMAL. All -+ * data is modified only by the local runqueue during scheduler_tick with -+ * interrupts disabled. -+ */ -+ -+/* -+ * Test if SCHED_ISO tasks have run longer than their alloted period as RT -+ * tasks and set the refractory flag if necessary. There is 10% hysteresis -+ * for unsetting the flag. 115/128 is ~90/100 as a fast shift instead of a -+ * slow division. 
-+ */ -+static inline void iso_tick(struct rq *rq) -+{ -+ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD; -+ rq->iso_ticks += 100; -+ if (rq->iso_ticks > ISO_PERIOD * sched_iso_cpu) { -+ rq->iso_refractory = true; -+ if (unlikely(rq->iso_ticks > ISO_PERIOD * 100)) -+ rq->iso_ticks = ISO_PERIOD * 100; -+ } -+} -+ -+/* No SCHED_ISO task was running so decrease rq->iso_ticks */ -+static inline void no_iso_tick(struct rq *rq, int ticks) -+{ -+ if (rq->iso_ticks > 0 || rq->iso_refractory) { -+ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - ticks) / ISO_PERIOD; -+ if (rq->iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128)) { -+ rq->iso_refractory = false; -+ if (unlikely(rq->iso_ticks < 0)) -+ rq->iso_ticks = 0; -+ } -+ } -+} -+ -+/* This manages tasks that have run out of timeslice during a scheduler_tick */ -+static void task_running_tick(struct rq *rq) -+{ -+ struct task_struct *p = rq->curr; -+ -+ /* -+ * If a SCHED_ISO task is running we increment the iso_ticks. In -+ * order to prevent SCHED_ISO tasks from causing starvation in the -+ * presence of true RT tasks we account those as iso_ticks as well. -+ */ -+ if (rt_task(p) || task_running_iso(p)) -+ iso_tick(rq); -+ else -+ no_iso_tick(rq, 1); -+ -+ /* SCHED_FIFO tasks never run out of timeslice. */ -+ if (p->policy == SCHED_FIFO) -+ return; -+ -+ if (iso_task(p)) { -+ if (task_running_iso(p)) { -+ if (rq->iso_refractory) { -+ /* -+ * SCHED_ISO task is running as RT and limit -+ * has been hit. Force it to reschedule as -+ * SCHED_NORMAL by zeroing its time_slice -+ */ -+ p->time_slice = 0; -+ } -+ } else if (!rq->iso_refractory) { -+ /* Can now run again ISO. Reschedule to pick up prio */ -+ goto out_resched; -+ } -+ } -+ -+ /* -+ * Tasks that were scheduled in the first half of a tick are not -+ * allowed to run into the 2nd half of the next tick if they will -+ * run out of time slice in the interim. 
Otherwise, if they have -+ * less than RESCHED_US μs of time slice left they will be rescheduled. -+ * Dither is used as a backup for when hrexpiry is disabled or high res -+ * timers not configured in. -+ */ -+ if (p->time_slice - rq->dither >= RESCHED_US) -+ return; -+out_resched: -+ rq_lock(rq); -+ __set_tsk_resched(p); -+ rq_unlock(rq); -+} -+ -+static inline void task_tick(struct rq *rq) -+{ -+ if (!rq_idle(rq)) -+ task_running_tick(rq); -+ else if (rq->last_jiffy > rq->last_scheduler_tick) -+ no_iso_tick(rq, rq->last_jiffy - rq->last_scheduler_tick); -+} -+ -+#ifdef CONFIG_NO_HZ_FULL -+/* -+ * We can stop the timer tick any time highres timers are active since -+ * we rely entirely on highres timeouts for task expiry rescheduling. -+ */ -+static void sched_stop_tick(struct rq *rq, int cpu) -+{ -+ if (!hrexpiry_enabled(rq)) -+ return; -+ if (!tick_nohz_full_enabled()) -+ return; -+ if (!tick_nohz_full_cpu(cpu)) -+ return; -+ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); -+} -+ -+static inline void sched_start_tick(struct rq *rq, int cpu) -+{ -+ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); -+} -+ -+struct tick_work { -+ int cpu; -+ atomic_t state; -+ struct delayed_work work; -+}; -+/* Values for ->state, see diagram below. */ -+#define TICK_SCHED_REMOTE_OFFLINE 0 -+#define TICK_SCHED_REMOTE_OFFLINING 1 -+#define TICK_SCHED_REMOTE_RUNNING 2 -+ -+/* -+ * State diagram for ->state: -+ * -+ * -+ * TICK_SCHED_REMOTE_OFFLINE -+ * | ^ -+ * | | -+ * | | sched_tick_remote() -+ * | | -+ * | | -+ * +--TICK_SCHED_REMOTE_OFFLINING -+ * | ^ -+ * | | -+ * sched_tick_start() | | sched_tick_stop() -+ * | | -+ * V | -+ * TICK_SCHED_REMOTE_RUNNING -+ * -+ * -+ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() -+ * and sched_tick_start() are happy to leave the state in RUNNING. 
-+ */ -+ -+static struct tick_work __percpu *tick_work_cpu; -+ -+static void sched_tick_remote(struct work_struct *work) -+{ -+ struct delayed_work *dwork = to_delayed_work(work); -+ struct tick_work *twork = container_of(dwork, struct tick_work, work); -+ int cpu = twork->cpu; -+ struct rq *rq = cpu_rq(cpu); -+ struct task_struct *curr; -+ u64 delta; -+ int os; -+ -+ /* -+ * Handle the tick only if it appears the remote CPU is running in full -+ * dynticks mode. The check is racy by nature, but missing a tick or -+ * having one too much is no big deal because the scheduler tick updates -+ * statistics and checks timeslices in a time-independent way, regardless -+ * of when exactly it is running. -+ */ -+ if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu)) -+ goto out_requeue; -+ -+ rq_lock_irq(rq); -+ curr = rq->curr; -+ if (is_idle_task(curr) || cpu_is_offline(cpu)) -+ goto out_unlock; -+ -+ update_rq_clock(rq); -+ delta = rq_clock_task(rq) - curr->last_ran; -+ -+ /* -+ * Make sure the next tick runs within a reasonable -+ * amount of time. -+ */ -+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); -+ task_tick(rq); -+ -+out_unlock: -+ rq_unlock_irq(rq, NULL); -+ -+out_requeue: -+ /* -+ * Run the remote tick once per second (1Hz). This arbitrary -+ * frequency is large enough to avoid overload but short enough -+ * to keep scheduler internal stats reasonably up to date. But -+ * first update state to reflect hotplug activity if required. 
-+ */ -+ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); -+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); -+ if (os == TICK_SCHED_REMOTE_RUNNING) -+ queue_delayed_work(system_unbound_wq, dwork, HZ); -+} -+ -+static void sched_tick_start(int cpu) -+{ -+ struct tick_work *twork; -+ int os; -+ -+ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) -+ return; -+ -+ WARN_ON_ONCE(!tick_work_cpu); -+ -+ twork = per_cpu_ptr(tick_work_cpu, cpu); -+ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); -+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); -+ if (os == TICK_SCHED_REMOTE_OFFLINE) { -+ twork->cpu = cpu; -+ INIT_DELAYED_WORK(&twork->work, sched_tick_remote); -+ queue_delayed_work(system_unbound_wq, &twork->work, HZ); -+ } -+} -+ -+#ifdef CONFIG_HOTPLUG_CPU -+static void sched_tick_stop(int cpu) -+{ -+ struct tick_work *twork; -+ int os; -+ -+ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) -+ return; -+ -+ WARN_ON_ONCE(!tick_work_cpu); -+ -+ twork = per_cpu_ptr(tick_work_cpu, cpu); -+ /* There cannot be competing actions, but don't rely on stop-machine. */ -+ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING); -+ WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING); -+ /* Don't cancel, as this would mess up the state machine. */ -+} -+#endif /* CONFIG_HOTPLUG_CPU */ -+ -+int __init sched_tick_offload_init(void) -+{ -+ tick_work_cpu = alloc_percpu(struct tick_work); -+ BUG_ON(!tick_work_cpu); -+ return 0; -+} -+ -+#else /* !CONFIG_NO_HZ_FULL */ -+static inline void sched_stop_tick(struct rq *rq, int cpu) {} -+static inline void sched_start_tick(struct rq *rq, int cpu) {} -+static inline void sched_tick_start(int cpu) { } -+static inline void sched_tick_stop(int cpu) { } -+#endif -+ -+/* -+ * This function gets called by the timer code, with HZ frequency. -+ * We call it with interrupts disabled. 
-+ */ -+void scheduler_tick(void) -+{ -+ int cpu __maybe_unused = smp_processor_id(); -+ struct rq *rq = cpu_rq(cpu); -+ -+ sched_clock_tick(); -+ update_clocks(rq); -+ update_load_avg(rq, 0); -+ update_cpu_clock_tick(rq, rq->curr); -+ task_tick(rq); -+ rq->last_scheduler_tick = rq->last_jiffy; -+ rq->last_tick = rq->clock; -+ psi_task_tick(rq); -+ perf_event_task_tick(); -+ sched_stop_tick(rq, cpu); -+} -+ -+#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ -+ defined(CONFIG_TRACE_PREEMPT_TOGGLE)) -+/* -+ * If the value passed in is equal to the current preempt count -+ * then we just disabled preemption. Start timing the latency. -+ */ -+static inline void preempt_latency_start(int val) -+{ -+ if (preempt_count() == val) { -+ unsigned long ip = get_lock_parent_ip(); -+#ifdef CONFIG_DEBUG_PREEMPT -+ current->preempt_disable_ip = ip; -+#endif -+ trace_preempt_off(CALLER_ADDR0, ip); -+ } -+} -+ -+void preempt_count_add(int val) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Underflow? -+ */ -+ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) -+ return; -+#endif -+ __preempt_count_add(val); -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Spinlock count overflowing soon? -+ */ -+ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= -+ PREEMPT_MASK - 10); -+#endif -+ preempt_latency_start(val); -+} -+EXPORT_SYMBOL(preempt_count_add); -+NOKPROBE_SYMBOL(preempt_count_add); -+ -+/* -+ * If the value passed in equals to the current preempt count -+ * then we just enabled preemption. Stop timing the latency. -+ */ -+static inline void preempt_latency_stop(int val) -+{ -+ if (preempt_count() == val) -+ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); -+} -+ -+void preempt_count_sub(int val) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Underflow? -+ */ -+ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) -+ return; -+ /* -+ * Is the spinlock portion underflowing? 
-+ */ -+ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && -+ !(preempt_count() & PREEMPT_MASK))) -+ return; -+#endif -+ -+ preempt_latency_stop(val); -+ __preempt_count_sub(val); -+} -+EXPORT_SYMBOL(preempt_count_sub); -+NOKPROBE_SYMBOL(preempt_count_sub); -+ -+#else -+static inline void preempt_latency_start(int val) { } -+static inline void preempt_latency_stop(int val) { } -+#endif -+ -+static inline unsigned long get_preempt_disable_ip(struct task_struct *p) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ return p->preempt_disable_ip; -+#else -+ return 0; -+#endif -+} -+ -+/* -+ * The time_slice is only refilled when it is empty and that is when we set a -+ * new deadline. Make sure update_clocks has been called recently to update -+ * rq->niffies. -+ */ -+static void time_slice_expired(struct task_struct *p, struct rq *rq) -+{ -+ p->time_slice = timeslice(); -+ p->deadline = rq->niffies + task_deadline_diff(p); -+#ifdef CONFIG_SMT_NICE -+ if (!p->mm) -+ p->smt_bias = 0; -+ else if (rt_task(p)) -+ p->smt_bias = 1 << 30; -+ else if (task_running_iso(p)) -+ p->smt_bias = 1 << 29; -+ else if (idleprio_task(p)) { -+ if (task_running_idle(p)) -+ p->smt_bias = 0; -+ else -+ p->smt_bias = 1; -+ } else if (--p->smt_bias < 1) -+ p->smt_bias = MAX_PRIO - p->static_prio; -+#endif -+} -+ -+/* -+ * Timeslices below RESCHED_US are considered as good as expired as there's no -+ * point rescheduling when there's so little time left. SCHED_BATCH tasks -+ * have been flagged be not latency sensitive and likely to be fully CPU -+ * bound so every time they're rescheduled they have their time_slice -+ * refilled, but get a new later deadline to have little effect on -+ * SCHED_NORMAL tasks. 
-+ -+ */ -+static inline void check_deadline(struct task_struct *p, struct rq *rq) -+{ -+ if (p->time_slice < RESCHED_US || batch_task(p)) -+ time_slice_expired(p, rq); -+} -+ -+/* -+ * Task selection with skiplists is a simple matter of picking off the first -+ * task in the sorted list, an O(1) operation. The lookup is amortised O(1) -+ * being bound to the number of processors. -+ * -+ * Runqueues are selectively locked based on their unlocked data and then -+ * unlocked if not needed. At most 3 locks will be held at any time and are -+ * released as soon as they're no longer needed. All balancing between CPUs -+ * is thus done here in an extremely simple first come best fit manner. -+ * -+ * This iterates over runqueues in cache locality order. In interactive mode -+ * it iterates over all CPUs and finds the task with the best key/deadline. -+ * In non-interactive mode it will only take a task if it's from the current -+ * runqueue or a runqueue with more tasks than the current one with a better -+ * key/deadline. -+ */ -+#ifdef CONFIG_SMP -+static inline struct task_struct -+*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) -+{ -+ struct rq *locked = NULL, *chosen = NULL; -+ struct task_struct *edt = idle; -+ int i, best_entries = 0; -+ u64 best_key = ~0ULL; -+ -+ for (i = 0; i < total_runqueues; i++) { -+ struct rq *other_rq = rq_order(rq, i); -+ skiplist_node *next; -+ int entries; -+ -+ entries = other_rq->sl->entries; -+ /* -+ * Check for queued entres lockless first. The local runqueue -+ * is locked so entries will always be accurate. -+ */ -+ if (!sched_interactive) { -+ /* -+ * Don't reschedule balance across nodes unless the CPU -+ * is idle. 
-+ */ -+ if (edt != idle && rq->cpu_locality[other_rq->cpu] > LOCALITY_SMP) -+ break; -+ if (entries <= best_entries) -+ continue; -+ } else if (!entries) -+ continue; -+ -+ /* if (i) implies other_rq != rq */ -+ if (i) { -+ /* Check for best id queued lockless first */ -+ if (other_rq->best_key >= best_key) -+ continue; -+ -+ if (unlikely(!trylock_rq(rq, other_rq))) -+ continue; -+ -+ /* Need to reevaluate entries after locking */ -+ entries = other_rq->sl->entries; -+ if (unlikely(!entries)) { -+ unlock_rq(other_rq); -+ continue; -+ } -+ } -+ -+ next = other_rq->node; -+ /* -+ * In interactive mode we check beyond the best entry on other -+ * runqueues if we can't get the best for smt or affinity -+ * reasons. -+ */ -+ while ((next = next->next[0]) != other_rq->node) { -+ struct task_struct *p; -+ u64 key = next->key; -+ -+ /* Reevaluate key after locking */ -+ if (key >= best_key) -+ break; -+ -+ p = next->value; -+ if (!smt_schedule(p, rq)) { -+ if (i && !sched_interactive) -+ break; -+ continue; -+ } -+ -+ if (sched_other_cpu(p, cpu)) { -+ if (sched_interactive || !i) -+ continue; -+ break; -+ } -+ /* Make sure affinity is ok */ -+ if (i) { -+ /* From this point on p is the best so far */ -+ if (locked) -+ unlock_rq(locked); -+ chosen = locked = other_rq; -+ } -+ best_entries = entries; -+ best_key = key; -+ edt = p; -+ break; -+ } -+ /* rq->preempting is a hint only as the state may have changed -+ * since it was set with the resched call but if we have met -+ * the condition we can break out here. 
*/ -+ if (edt == rq->preempting) -+ break; -+ if (i && other_rq != chosen) -+ unlock_rq(other_rq); -+ } -+ -+ if (likely(edt != idle)) -+ take_task(rq, cpu, edt); -+ -+ if (locked) -+ unlock_rq(locked); -+ -+ rq->preempting = NULL; -+ -+ return edt; -+} -+#else /* CONFIG_SMP */ -+static inline struct task_struct -+*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) -+{ -+ struct task_struct *edt; -+ -+ if (unlikely(!rq->sl->entries)) -+ return idle; -+ edt = rq->node->next[0]->value; -+ take_task(rq, cpu, edt); -+ return edt; -+} -+#endif /* CONFIG_SMP */ -+ -+/* -+ * Print scheduling while atomic bug: -+ */ -+static noinline void __schedule_bug(struct task_struct *prev) -+{ -+ /* Save this before calling printk(), since that will clobber it */ -+ unsigned long preempt_disable_ip = get_preempt_disable_ip(current); -+ -+ if (oops_in_progress) -+ return; -+ -+ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", -+ prev->comm, prev->pid, preempt_count()); -+ -+ debug_show_held_locks(prev); -+ print_modules(); -+ if (irqs_disabled()) -+ print_irqtrace_events(prev); -+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) -+ && in_atomic_preempt_off()) { -+ pr_err("Preemption disabled at:"); -+ print_ip_sym(preempt_disable_ip); -+ pr_cont("\n"); -+ } -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+ -+/* -+ * Various schedule()-time debugging checks and statistics: -+ */ -+static inline void schedule_debug(struct task_struct *prev, bool preempt) -+{ -+#ifdef CONFIG_SCHED_STACK_END_CHECK -+ if (task_stack_end_corrupted(prev)) -+ panic("corrupted stack end detected inside scheduler\n"); -+#endif -+ -+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP -+ if (!preempt && prev->state && prev->non_block_count) { -+ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", -+ prev->comm, prev->pid, prev->non_block_count); -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+ } -+#endif -+ -+ if (unlikely(in_atomic_preempt_off())) { -+ 
__schedule_bug(prev); -+ preempt_count_set(PREEMPT_DISABLED); -+ } -+ rcu_sleep_check(); -+ -+ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); -+ -+ schedstat_inc(this_rq()->sched_count); -+} -+ -+/* -+ * The currently running task's information is all stored in rq local data -+ * which is only modified by the local CPU. -+ */ -+static inline void set_rq_task(struct rq *rq, struct task_struct *p) -+{ -+ if (p == rq->idle || p->policy == SCHED_FIFO) -+ hrexpiry_clear(rq); -+ else -+ hrexpiry_start(rq, US_TO_NS(p->time_slice)); -+ if (rq->clock - rq->last_tick > HALF_JIFFY_NS) -+ rq->dither = 0; -+ else -+ rq->dither = rq_dither(rq); -+ -+ rq->rq_deadline = p->deadline; -+ rq->rq_prio = p->prio; -+#ifdef CONFIG_SMT_NICE -+ rq->rq_mm = p->mm; -+ rq->rq_smt_bias = p->smt_bias; -+#endif -+} -+ -+#ifdef CONFIG_SMT_NICE -+static void check_no_siblings(struct rq __maybe_unused *this_rq) {} -+static void wake_no_siblings(struct rq __maybe_unused *this_rq) {} -+static void (*check_siblings)(struct rq *this_rq) = &check_no_siblings; -+static void (*wake_siblings)(struct rq *this_rq) = &wake_no_siblings; -+ -+/* Iterate over smt siblings when we've scheduled a process on cpu and decide -+ * whether they should continue running or be descheduled. 
*/ -+static void check_smt_siblings(struct rq *this_rq) -+{ -+ int other_cpu; -+ -+ for_each_cpu(other_cpu, &this_rq->thread_mask) { -+ struct task_struct *p; -+ struct rq *rq; -+ -+ rq = cpu_rq(other_cpu); -+ if (rq_idle(rq)) -+ continue; -+ p = rq->curr; -+ if (!smt_schedule(p, this_rq)) -+ resched_curr(rq); -+ } -+} -+ -+static void wake_smt_siblings(struct rq *this_rq) -+{ -+ int other_cpu; -+ -+ for_each_cpu(other_cpu, &this_rq->thread_mask) { -+ struct rq *rq; -+ -+ rq = cpu_rq(other_cpu); -+ if (rq_idle(rq)) -+ resched_idle(rq); -+ } -+} -+#else -+static void check_siblings(struct rq __maybe_unused *this_rq) {} -+static void wake_siblings(struct rq __maybe_unused *this_rq) {} -+#endif -+ -+/* -+ * schedule() is the main scheduler function. -+ * -+ * The main means of driving the scheduler and thus entering this function are: -+ * -+ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. -+ * -+ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return -+ * paths. For example, see arch/x86/entry_64.S. -+ * -+ * To drive preemption between tasks, the scheduler sets the flag in timer -+ * interrupt handler scheduler_tick(). -+ * -+ * 3. Wakeups don't really cause entry into schedule(). They add a -+ * task to the run-queue and that's it. -+ * -+ * Now, if the new task added to the run-queue preempts the current -+ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets -+ * called on the nearest possible occasion: -+ * -+ * - If the kernel is preemptible (CONFIG_PREEMPTION=y): -+ * -+ * - in syscall or exception context, at the next outmost -+ * preempt_enable(). (this might be as soon as the wake_up()'s -+ * spin_unlock()!) 
-+ * -+ * - in IRQ context, return from interrupt-handler to -+ * preemptible context -+ * -+ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set) -+ * then at the next: -+ * -+ * - cond_resched() call -+ * - explicit schedule() call -+ * - return from syscall or exception to user-space -+ * - return from interrupt-handler to user-space -+ * -+ * WARNING: must be called with preemption disabled! -+ */ -+static void __sched notrace __schedule(bool preempt) -+{ -+ struct task_struct *prev, *next, *idle; -+ unsigned long *switch_count; -+ bool deactivate = false; -+ struct rq *rq; -+ u64 niffies; -+ int cpu; -+ -+ cpu = smp_processor_id(); -+ rq = cpu_rq(cpu); -+ prev = rq->curr; -+ idle = rq->idle; -+ -+ schedule_debug(prev, preempt); -+ -+ local_irq_disable(); -+ rcu_note_context_switch(preempt); -+ -+ /* -+ * Make sure that signal_pending_state()->signal_pending() below -+ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) -+ * done by the caller to avoid the race with signal_wake_up(). -+ * -+ * The membarrier system call requires a full memory barrier -+ * after coming from user-space, before storing to rq->curr. -+ */ -+ rq_lock(rq); -+ smp_mb__after_spinlock(); -+#ifdef CONFIG_SMP -+ if (rq->preempt) { -+ /* -+ * Make sure resched_curr hasn't triggered a preemption -+ * locklessly on a task that has since scheduled away. Spurious -+ * wakeup of idle is okay though. 
-+ */ -+ if (unlikely(preempt && prev != idle && !test_tsk_need_resched(prev))) { -+ rq->preempt = NULL; -+ clear_preempt_need_resched(); -+ rq_unlock_irq(rq, NULL); -+ return; -+ } -+ rq->preempt = NULL; -+ } -+#endif -+ -+ switch_count = &prev->nivcsw; -+ if (!preempt && prev->state) { -+ if (signal_pending_state(prev->state, prev)) { -+ prev->state = TASK_RUNNING; -+ } else { -+ deactivate = true; -+ -+ if (prev->in_iowait) { -+ atomic_inc(&rq->nr_iowait); -+ delayacct_blkio_start(); -+ } -+ } -+ switch_count = &prev->nvcsw; -+ } -+ -+ /* -+ * Store the niffy value here for use by the next task's last_ran -+ * below to avoid losing niffies due to update_clocks being called -+ * again after this point. -+ */ -+ update_clocks(rq); -+ niffies = rq->niffies; -+ update_cpu_clock_switch(rq, prev); -+ -+ clear_tsk_need_resched(prev); -+ clear_preempt_need_resched(); -+ -+ if (idle != prev) { -+ check_deadline(prev, rq); -+ return_task(prev, rq, cpu, deactivate); -+ } -+ -+ next = earliest_deadline_task(rq, cpu, idle); -+ if (likely(next->prio != PRIO_LIMIT)) -+ clear_cpuidle_map(cpu); -+ else { -+ set_cpuidle_map(cpu); -+ update_load_avg(rq, 0); -+ } -+ -+ set_rq_task(rq, next); -+ next->last_ran = niffies; -+ -+ if (likely(prev != next)) { -+ /* -+ * Don't reschedule an idle task or deactivated tasks -+ */ -+ if (prev == idle) { -+ rq->nr_running++; -+ if (rt_task(next)) -+ rq->rt_nr_running++; -+ } else if (!deactivate) -+ resched_suitable_idle(prev); -+ if (unlikely(next == idle)) { -+ rq->nr_running--; -+ if (rt_task(prev)) -+ rq->rt_nr_running--; -+ wake_siblings(rq); -+ } else -+ check_siblings(rq); -+ rq->nr_switches++; -+ /* -+ * RCU users of rcu_dereference(rq->curr) may not see -+ * changes to task_struct made by pick_next_task(). -+ */ -+ RCU_INIT_POINTER(rq->curr, next); -+ /* -+ * The membarrier system call requires each architecture -+ * to have a full memory barrier after updating -+ * rq->curr, before returning to user-space. 
-+ * -+ * Here are the schemes providing that barrier on the -+ * various architectures: -+ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. -+ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. -+ * - finish_lock_switch() for weakly-ordered -+ * architectures where spin_unlock is a full barrier, -+ * - switch_to() for arm64 (weakly-ordered, spin_unlock -+ * is a RELEASE barrier), -+ */ -+ ++*switch_count; -+ -+ trace_sched_switch(preempt, prev, next); -+ context_switch(rq, prev, next); /* unlocks the rq */ -+ } else { -+ check_siblings(rq); -+ rq_unlock(rq); -+ do_pending_softirq(rq, next); -+ local_irq_enable(); -+ } -+} -+ -+void __noreturn do_task_dead(void) -+{ -+ /* Causes final put_task_struct in finish_task_switch(). */ -+ set_special_state(TASK_DEAD); -+ -+ /* Tell freezer to ignore us: */ -+ current->flags |= PF_NOFREEZE; -+ __schedule(false); -+ BUG(); -+ -+ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ -+ for (;;) -+ cpu_relax(); -+} -+ -+static inline void sched_submit_work(struct task_struct *tsk) -+{ -+ if (!tsk->state) -+ return; -+ -+ /* -+ * If a worker went to sleep, notify and ask workqueue whether -+ * it wants to wake up a task to maintain concurrency. -+ * As this function is called inside the schedule() context, -+ * we disable preemption to avoid it calling schedule() again -+ * in the possible wakeup of a kworker. -+ */ -+ if (tsk->flags & PF_WQ_WORKER) { -+ preempt_disable(); -+ wq_worker_sleeping(tsk); -+ preempt_enable_no_resched(); -+ } -+ -+ if (tsk_is_pi_blocked(tsk)) -+ return; -+ -+ /* -+ * If we are going to sleep and we have plugged IO queued, -+ * make sure to submit it to avoid deadlocks. 
-+ */ -+ if (blk_needs_flush_plug(tsk)) -+ blk_schedule_flush_plug(tsk); -+} -+ -+static inline void sched_update_worker(struct task_struct *tsk) -+{ -+ if (tsk->flags & PF_WQ_WORKER) -+ wq_worker_running(tsk); -+} -+ -+asmlinkage __visible void __sched schedule(void) -+{ -+ struct task_struct *tsk = current; -+ -+ sched_submit_work(tsk); -+ do { -+ preempt_disable(); -+ __schedule(false); -+ sched_preempt_enable_no_resched(); -+ } while (need_resched()); -+ sched_update_worker(tsk); -+} -+ -+EXPORT_SYMBOL(schedule); -+ -+/* -+ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted -+ * state (have scheduled out non-voluntarily) by making sure that all -+ * tasks have either left the run queue or have gone into user space. -+ * As idle tasks do not do either, they must not ever be preempted -+ * (schedule out non-voluntarily). -+ * -+ * schedule_idle() is similar to schedule_preempt_disable() except that it -+ * never enables preemption because it does not call sched_submit_work(). -+ */ -+void __sched schedule_idle(void) -+{ -+ /* -+ * As this skips calling sched_submit_work(), which the idle task does -+ * regardless because that function is a nop when the task is in a -+ * TASK_RUNNING state, make sure this isn't used someplace that the -+ * current task can be in any other state. Note, idle is always in the -+ * TASK_RUNNING state. -+ */ -+ WARN_ON_ONCE(current->state); -+ do { -+ __schedule(false); -+ } while (need_resched()); -+} -+ -+#ifdef CONFIG_CONTEXT_TRACKING -+asmlinkage __visible void __sched schedule_user(void) -+{ -+ /* -+ * If we come here after a random call to set_need_resched(), -+ * or we have been woken up remotely but the IPI has not yet arrived, -+ * we haven't yet exited the RCU idle mode. Do it here manually until -+ * we find a better solution. -+ * -+ * NB: There are buggy callers of this function. Ideally we -+ * should warn if prev_state != IN_USER, but that will trigger -+ * too frequently to make sense yet. 
-+ */ -+ enum ctx_state prev_state = exception_enter(); -+ schedule(); -+ exception_exit(prev_state); -+} -+#endif -+ -+/** -+ * schedule_preempt_disabled - called with preemption disabled -+ * -+ * Returns with preemption disabled. Note: preempt_count must be 1 -+ */ -+void __sched schedule_preempt_disabled(void) -+{ -+ sched_preempt_enable_no_resched(); -+ schedule(); -+ preempt_disable(); -+} -+ -+static void __sched notrace preempt_schedule_common(void) -+{ -+ do { -+ /* -+ * Because the function tracer can trace preempt_count_sub() -+ * and it also uses preempt_enable/disable_notrace(), if -+ * NEED_RESCHED is set, the preempt_enable_notrace() called -+ * by the function tracer will call this function again and -+ * cause infinite recursion. -+ * -+ * Preemption must be disabled here before the function -+ * tracer can trace. Break up preempt_disable() into two -+ * calls. One to disable preemption without fear of being -+ * traced. The other to still record the preemption latency, -+ * which can also be traced by the function tracer. -+ */ -+ preempt_disable_notrace(); -+ preempt_latency_start(1); -+ __schedule(true); -+ preempt_latency_stop(1); -+ preempt_enable_no_resched_notrace(); -+ -+ /* -+ * Check again in case we missed a preemption opportunity -+ * between schedule and now. -+ */ -+ } while (need_resched()); -+} -+ -+#ifdef CONFIG_PREEMPTION -+/* -+ * This is the entry point to schedule() from in-kernel preemption -+ * off of preempt_enable. -+ */ -+asmlinkage __visible void __sched notrace preempt_schedule(void) -+{ -+ /* -+ * If there is a non-zero preempt_count or interrupts are disabled, -+ * we do not want to preempt the current task. Just return.. 
-+ */ -+ if (likely(!preemptible())) -+ return; -+ -+ preempt_schedule_common(); -+} -+NOKPROBE_SYMBOL(preempt_schedule); -+EXPORT_SYMBOL(preempt_schedule); -+ -+/** -+ * preempt_schedule_notrace - preempt_schedule called by tracing -+ * -+ * The tracing infrastructure uses preempt_enable_notrace to prevent -+ * recursion and tracing preempt enabling caused by the tracing -+ * infrastructure itself. But as tracing can happen in areas coming -+ * from userspace or just about to enter userspace, a preempt enable -+ * can occur before user_exit() is called. This will cause the scheduler -+ * to be called when the system is still in usermode. -+ * -+ * To prevent this, the preempt_enable_notrace will use this function -+ * instead of preempt_schedule() to exit user context if needed before -+ * calling the scheduler. -+ */ -+asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) -+{ -+ enum ctx_state prev_ctx; -+ -+ if (likely(!preemptible())) -+ return; -+ -+ do { -+ /* -+ * Because the function tracer can trace preempt_count_sub() -+ * and it also uses preempt_enable/disable_notrace(), if -+ * NEED_RESCHED is set, the preempt_enable_notrace() called -+ * by the function tracer will call this function again and -+ * cause infinite recursion. -+ * -+ * Preemption must be disabled here before the function -+ * tracer can trace. Break up preempt_disable() into two -+ * calls. One to disable preemption without fear of being -+ * traced. The other to still record the preemption latency, -+ * which can also be traced by the function tracer. -+ */ -+ preempt_disable_notrace(); -+ preempt_latency_start(1); -+ /* -+ * Needs preempt disabled in case user_exit() is traced -+ * and the tracer calls preempt_enable_notrace() causing -+ * an infinite recursion. 
-+ */ -+ prev_ctx = exception_enter(); -+ __schedule(true); -+ exception_exit(prev_ctx); -+ -+ preempt_latency_stop(1); -+ preempt_enable_no_resched_notrace(); -+ } while (need_resched()); -+} -+EXPORT_SYMBOL_GPL(preempt_schedule_notrace); -+ -+#endif /* CONFIG_PREEMPTION */ -+ -+/* -+ * This is the entry point to schedule() from kernel preemption -+ * off of irq context. -+ * Note, that this is called and return with irqs disabled. This will -+ * protect us against recursive calling from irq. -+ */ -+asmlinkage __visible void __sched preempt_schedule_irq(void) -+{ -+ enum ctx_state prev_state; -+ -+ /* Catch callers which need to be fixed */ -+ BUG_ON(preempt_count() || !irqs_disabled()); -+ -+ prev_state = exception_enter(); -+ -+ do { -+ preempt_disable(); -+ local_irq_enable(); -+ __schedule(true); -+ local_irq_disable(); -+ sched_preempt_enable_no_resched(); -+ } while (need_resched()); -+ -+ exception_exit(prev_state); -+} -+ -+int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, -+ void *key) -+{ -+ return try_to_wake_up(curr->private, mode, wake_flags); -+} -+EXPORT_SYMBOL(default_wake_function); -+ -+#ifdef CONFIG_RT_MUTEXES -+ -+static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) -+{ -+ if (pi_task) -+ prio = min(prio, pi_task->prio); -+ -+ return prio; -+} -+ -+static inline int rt_effective_prio(struct task_struct *p, int prio) -+{ -+ struct task_struct *pi_task = rt_mutex_get_top_task(p); -+ -+ return __rt_effective_prio(pi_task, prio); -+} -+ -+/* -+ * rt_mutex_setprio - set the current priority of a task -+ * @p: task to boost -+ * @pi_task: donor task -+ * -+ * This function changes the 'effective' priority of a task. It does -+ * not touch ->normal_prio like __setscheduler(). -+ * -+ * Used by the rt_mutex code to implement priority inheritance -+ * logic. Call site only calls if the priority of the task changed. 
-+ */ -+void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) -+{ -+ int prio, oldprio; -+ struct rq *rq; -+ -+ /* XXX used to be waiter->prio, not waiter->task->prio */ -+ prio = __rt_effective_prio(pi_task, p->normal_prio); -+ -+ /* -+ * If nothing changed; bail early. -+ */ -+ if (p->pi_top_task == pi_task && prio == p->prio) -+ return; -+ -+ rq = __task_rq_lock(p, NULL); -+ update_rq_clock(rq); -+ /* -+ * Set under pi_lock && rq->lock, such that the value can be used under -+ * either lock. -+ * -+ * Note that there is loads of tricky to make this pointer cache work -+ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to -+ * ensure a task is de-boosted (pi_task is set to NULL) before the -+ * task is allowed to run again (and can exit). This ensures the pointer -+ * points to a blocked task -- which guaratees the task is present. -+ */ -+ p->pi_top_task = pi_task; -+ -+ /* -+ * For FIFO/RR we only need to set prio, if that matches we're done. -+ */ -+ if (prio == p->prio) -+ goto out_unlock; -+ -+ /* -+ * Idle task boosting is a nono in general. There is one -+ * exception, when PREEMPT_RT and NOHZ is active: -+ * -+ * The idle task calls get_next_timer_interrupt() and holds -+ * the timer wheel base->lock on the CPU and another CPU wants -+ * to access the timer (probably to cancel it). We can safely -+ * ignore the boosting request, as the idle CPU runs this code -+ * with interrupts disabled and will complete the lock -+ * protected section without being interrupted. So there is no -+ * real need to boost. 
-+ */ -+ if (unlikely(p == rq->idle)) { -+ WARN_ON(p != rq->curr); -+ WARN_ON(p->pi_blocked_on); -+ goto out_unlock; -+ } -+ -+ trace_sched_pi_setprio(p, pi_task); -+ oldprio = p->prio; -+ p->prio = prio; -+ if (task_running(rq, p)){ -+ if (prio > oldprio) -+ resched_task(p); -+ } else if (task_queued(p)) { -+ dequeue_task(rq, p, DEQUEUE_SAVE); -+ enqueue_task(rq, p, ENQUEUE_RESTORE); -+ if (prio < oldprio) -+ try_preempt(p, rq); -+ } -+out_unlock: -+ __task_rq_unlock(rq, NULL); -+} -+#else -+static inline int rt_effective_prio(struct task_struct *p, int prio) -+{ -+ return prio; -+} -+#endif -+ -+/* -+ * Adjust the deadline for when the priority is to change, before it's -+ * changed. -+ */ -+static inline void adjust_deadline(struct task_struct *p, int new_prio) -+{ -+ p->deadline += static_deadline_diff(new_prio) - task_deadline_diff(p); -+} -+ -+void set_user_nice(struct task_struct *p, long nice) -+{ -+ int new_static, old_static; -+ struct rq_flags rf; -+ struct rq *rq; -+ -+ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) -+ return; -+ new_static = NICE_TO_PRIO(nice); -+ /* -+ * We have to be careful, if called from sys_setpriority(), -+ * the task might be in the middle of scheduling on another CPU. 
-+ */ -+ rq = task_rq_lock(p, &rf); -+ update_rq_clock(rq); -+ -+ /* -+ * The RT priorities are set via sched_setscheduler(), but we still -+ * allow the 'normal' nice value to be set - but as expected -+ * it wont have any effect on scheduling until the task is -+ * not SCHED_NORMAL/SCHED_BATCH: -+ */ -+ if (has_rt_policy(p)) { -+ p->static_prio = new_static; -+ goto out_unlock; -+ } -+ -+ adjust_deadline(p, new_static); -+ old_static = p->static_prio; -+ p->static_prio = new_static; -+ p->prio = effective_prio(p); -+ -+ if (task_queued(p)) { -+ dequeue_task(rq, p, DEQUEUE_SAVE); -+ enqueue_task(rq, p, ENQUEUE_RESTORE); -+ if (new_static < old_static) -+ try_preempt(p, rq); -+ } else if (task_running(rq, p)) { -+ set_rq_task(rq, p); -+ if (old_static < new_static) -+ resched_task(p); -+ } -+out_unlock: -+ task_rq_unlock(rq, p, &rf); -+} -+EXPORT_SYMBOL(set_user_nice); -+ -+/* -+ * can_nice - check if a task can reduce its nice value -+ * @p: task -+ * @nice: nice value -+ */ -+int can_nice(const struct task_struct *p, const int nice) -+{ -+ /* Convert nice value [19,-20] to rlimit style value [1,40] */ -+ int nice_rlim = nice_to_rlimit(nice); -+ -+ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || -+ capable(CAP_SYS_NICE)); -+} -+ -+#ifdef __ARCH_WANT_SYS_NICE -+ -+/* -+ * sys_nice - change the priority of the current process. -+ * @increment: priority increment -+ * -+ * sys_setpriority is a more generic, but much slower function that -+ * does similar things. -+ */ -+SYSCALL_DEFINE1(nice, int, increment) -+{ -+ long nice, retval; -+ -+ /* -+ * Setpriority might change our priority at the same moment. -+ * We don't have to worry. Conceptually one call occurs first -+ * and we have a single winner. 
-+ */ -+ -+ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); -+ nice = task_nice(current) + increment; -+ -+ nice = clamp_val(nice, MIN_NICE, MAX_NICE); -+ if (increment < 0 && !can_nice(current, nice)) -+ return -EPERM; -+ -+ retval = security_task_setnice(current, nice); -+ if (retval) -+ return retval; -+ -+ set_user_nice(current, nice); -+ return 0; -+} -+ -+#endif -+ -+/** -+ * task_prio - return the priority value of a given task. -+ * @p: the task in question. -+ * -+ * Return: The priority value as seen by users in /proc. -+ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes -+ * from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO). -+ */ -+int task_prio(const struct task_struct *p) -+{ -+ int delta, prio = p->prio - MAX_RT_PRIO; -+ -+ /* rt tasks and iso tasks */ -+ if (prio <= 0) -+ goto out; -+ -+ /* Convert to ms to avoid overflows */ -+ delta = NS_TO_MS(p->deadline - task_rq(p)->niffies); -+ if (unlikely(delta < 0)) -+ delta = 0; -+ delta = delta * 40 / ms_longest_deadline_diff(); -+ if (delta <= 80) -+ prio += delta; -+ if (idleprio_task(p)) -+ prio += 40; -+out: -+ return prio; -+} -+ -+/** -+ * idle_cpu - is a given CPU idle currently? -+ * @cpu: the processor in question. -+ * -+ * Return: 1 if the CPU is currently idle. 0 otherwise. -+ */ -+int idle_cpu(int cpu) -+{ -+ return cpu_curr(cpu) == cpu_rq(cpu)->idle; -+} -+ -+/** -+ * available_idle_cpu - is a given CPU idle for enqueuing work. -+ * @cpu: the CPU in question. -+ * -+ * Return: 1 if the CPU is currently idle. 0 otherwise. -+ */ -+int available_idle_cpu(int cpu) -+{ -+ if (!idle_cpu(cpu)) -+ return 0; -+ -+ if (vcpu_is_preempted(cpu)) -+ return 0; -+ -+ return 1; -+} -+ -+/** -+ * idle_task - return the idle task for a given CPU. -+ * @cpu: the processor in question. -+ * -+ * Return: The idle task for the CPU @cpu. 
-+ */ -+struct task_struct *idle_task(int cpu) -+{ -+ return cpu_rq(cpu)->idle; -+} -+ -+/** -+ * find_process_by_pid - find a process with a matching PID value. -+ * @pid: the pid in question. -+ * -+ * The task of @pid, if found. %NULL otherwise. -+ */ -+static inline struct task_struct *find_process_by_pid(pid_t pid) -+{ -+ return pid ? find_task_by_vpid(pid) : current; -+} -+ -+/* Actually do priority change: must hold rq lock. */ -+static void __setscheduler(struct task_struct *p, struct rq *rq, int policy, -+ int prio, const struct sched_attr *attr, -+ bool keep_boost) -+{ -+ int oldrtprio, oldprio; -+ -+ /* -+ * If params can't change scheduling class changes aren't allowed -+ * either. -+ */ -+ if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS) -+ return; -+ -+ p->policy = policy; -+ oldrtprio = p->rt_priority; -+ p->rt_priority = prio; -+ p->normal_prio = normal_prio(p); -+ oldprio = p->prio; -+ /* -+ * Keep a potential priority boosting if called from -+ * sched_setscheduler(). -+ */ -+ p->prio = normal_prio(p); -+ if (keep_boost) -+ p->prio = rt_effective_prio(p, p->prio); -+ -+ if (task_running(rq, p)) { -+ set_rq_task(rq, p); -+ resched_task(p); -+ } else if (task_queued(p)) { -+ dequeue_task(rq, p, DEQUEUE_SAVE); -+ enqueue_task(rq, p, ENQUEUE_RESTORE); -+ if (p->prio < oldprio || p->rt_priority > oldrtprio) -+ try_preempt(p, rq); -+ } -+} -+ -+/* -+ * Check the target process has a UID that matches the current process's -+ */ -+static bool check_same_owner(struct task_struct *p) -+{ -+ const struct cred *cred = current_cred(), *pcred; -+ bool match; -+ -+ rcu_read_lock(); -+ pcred = __task_cred(p); -+ match = (uid_eq(cred->euid, pcred->euid) || -+ uid_eq(cred->euid, pcred->uid)); -+ rcu_read_unlock(); -+ return match; -+} -+ -+static int __sched_setscheduler(struct task_struct *p, -+ const struct sched_attr *attr, -+ bool user, bool pi) -+{ -+ int retval, policy = attr->sched_policy, oldpolicy = -1, priority = attr->sched_priority; -+ unsigned long 
rlim_rtprio = 0; -+ struct rq_flags rf; -+ int reset_on_fork; -+ struct rq *rq; -+ -+ /* The pi code expects interrupts enabled */ -+ BUG_ON(pi && in_interrupt()); -+ -+ if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) { -+ unsigned long lflags; -+ -+ if (!lock_task_sighand(p, &lflags)) -+ return -ESRCH; -+ rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); -+ unlock_task_sighand(p, &lflags); -+ if (rlim_rtprio) -+ goto recheck; -+ /* -+ * If the caller requested an RT policy without having the -+ * necessary rights, we downgrade the policy to SCHED_ISO. -+ * We also set the parameter to zero to pass the checks. -+ */ -+ policy = SCHED_ISO; -+ priority = 0; -+ } -+recheck: -+ /* Double check policy once rq lock held */ -+ if (policy < 0) { -+ reset_on_fork = p->sched_reset_on_fork; -+ policy = oldpolicy = p->policy; -+ } else { -+ reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); -+ policy &= ~SCHED_RESET_ON_FORK; -+ -+ if (!SCHED_RANGE(policy)) -+ return -EINVAL; -+ } -+ -+ if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV)) -+ return -EINVAL; -+ -+ /* -+ * Valid priorities for SCHED_FIFO and SCHED_RR are -+ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and -+ * SCHED_BATCH is 0. 
-+ */ -+ if (priority < 0 || -+ (p->mm && priority > MAX_USER_RT_PRIO - 1) || -+ (!p->mm && priority > MAX_RT_PRIO - 1)) -+ return -EINVAL; -+ if (is_rt_policy(policy) != (priority != 0)) -+ return -EINVAL; -+ -+ /* -+ * Allow unprivileged RT tasks to decrease priority: -+ */ -+ if (user && !capable(CAP_SYS_NICE)) { -+ if (is_rt_policy(policy)) { -+ unsigned long rlim_rtprio = -+ task_rlimit(p, RLIMIT_RTPRIO); -+ -+ /* Can't set/change the rt policy */ -+ if (policy != p->policy && !rlim_rtprio) -+ return -EPERM; -+ -+ /* Can't increase priority */ -+ if (priority > p->rt_priority && -+ priority > rlim_rtprio) -+ return -EPERM; -+ } else { -+ switch (p->policy) { -+ /* -+ * Can only downgrade policies but not back to -+ * SCHED_NORMAL -+ */ -+ case SCHED_ISO: -+ if (policy == SCHED_ISO) -+ goto out; -+ if (policy != SCHED_NORMAL) -+ return -EPERM; -+ break; -+ case SCHED_BATCH: -+ if (policy == SCHED_BATCH) -+ goto out; -+ if (policy != SCHED_IDLEPRIO) -+ return -EPERM; -+ break; -+ case SCHED_IDLEPRIO: -+ if (policy == SCHED_IDLEPRIO) -+ goto out; -+ return -EPERM; -+ default: -+ break; -+ } -+ } -+ -+ /* Can't change other user's priorities */ -+ if (!check_same_owner(p)) -+ return -EPERM; -+ -+ /* Normal users shall not reset the sched_reset_on_fork flag: */ -+ if (p->sched_reset_on_fork && !reset_on_fork) -+ return -EPERM; -+ } -+ -+ if (user) { -+ retval = security_task_setscheduler(p); -+ if (retval) -+ return retval; -+ } -+ -+ if (pi) -+ cpuset_read_lock(); -+ -+ /* -+ * Make sure no PI-waiters arrive (or leave) while we are -+ * changing the priority of the task: -+ * -+ * To be able to change p->policy safely, the runqueue lock must be -+ * held. 
-+ */ -+ rq = task_rq_lock(p, &rf); -+ update_rq_clock(rq); -+ -+ /* -+ * Changing the policy of the stop threads its a very bad idea: -+ */ -+ if (p == rq->stop) { -+ retval = -EINVAL; -+ goto unlock; -+ } -+ -+ /* -+ * If not changing anything there's no need to proceed further: -+ */ -+ if (unlikely(policy == p->policy && (!is_rt_policy(policy) || -+ priority == p->rt_priority))) { -+ retval = 0; -+ goto unlock; -+ } -+ -+ /* Re-check policy now with rq lock held */ -+ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { -+ policy = oldpolicy = -1; -+ task_rq_unlock(rq, p, &rf); -+ if (pi) -+ cpuset_read_unlock(); -+ goto recheck; -+ } -+ p->sched_reset_on_fork = reset_on_fork; -+ -+ __setscheduler(p, rq, policy, priority, attr, pi); -+ -+ /* Avoid rq from going away on us: */ -+ preempt_disable(); -+ task_rq_unlock(rq, p, &rf); -+ -+ if (pi) { -+ cpuset_read_unlock(); -+ rt_mutex_adjust_pi(p); -+ } -+ preempt_enable(); -+out: -+ return 0; -+ -+unlock: -+ task_rq_unlock(rq, p, &rf); -+ if (pi) -+ cpuset_read_unlock(); -+ return retval; -+} -+ -+static int _sched_setscheduler(struct task_struct *p, int policy, -+ const struct sched_param *param, bool check) -+{ -+ struct sched_attr attr = { -+ .sched_policy = policy, -+ .sched_priority = param->sched_priority, -+ .sched_nice = PRIO_TO_NICE(p->static_prio), -+ }; -+ -+ return __sched_setscheduler(p, &attr, check, true); -+} -+/** -+ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. -+ * @p: the task in question. -+ * @policy: new policy. -+ * @param: structure containing the new RT priority. -+ * -+ * Return: 0 on success. An error code otherwise. -+ * -+ * NOTE that the task may be already dead. 
-+ */ -+int sched_setscheduler(struct task_struct *p, int policy, -+ const struct sched_param *param) -+{ -+ return _sched_setscheduler(p, policy, param, true); -+} -+ -+EXPORT_SYMBOL_GPL(sched_setscheduler); -+ -+int sched_setattr(struct task_struct *p, const struct sched_attr *attr) -+{ -+ return __sched_setscheduler(p, attr, true, true); -+} -+EXPORT_SYMBOL_GPL(sched_setattr); -+ -+int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) -+{ -+ return __sched_setscheduler(p, attr, false, true); -+} -+ -+/** -+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. -+ * @p: the task in question. -+ * @policy: new policy. -+ * @param: structure containing the new RT priority. -+ * -+ * Just like sched_setscheduler, only don't bother checking if the -+ * current context has permission. For example, this is needed in -+ * stop_machine(): we create temporary high priority worker threads, -+ * but our caller might not have that capability. -+ * -+ * Return: 0 on success. An error code otherwise. -+ */ -+int sched_setscheduler_nocheck(struct task_struct *p, int policy, -+ const struct sched_param *param) -+{ -+ return _sched_setscheduler(p, policy, param, false); -+} -+EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); -+ -+static int -+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) -+{ -+ struct sched_param lparam; -+ struct task_struct *p; -+ int retval; -+ -+ if (!param || pid < 0) -+ return -EINVAL; -+ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) -+ return -EFAULT; -+ -+ rcu_read_lock(); -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (likely(p)) -+ get_task_struct(p); -+ rcu_read_unlock(); -+ -+ if (likely(p)) { -+ retval = sched_setscheduler(p, policy, &lparam); -+ put_task_struct(p); -+ } -+ -+ return retval; -+} -+ -+/* -+ * Mimics kernel/events/core.c perf_copy_attr(). 
-+ */ -+static int sched_copy_attr(struct sched_attr __user *uattr, -+ struct sched_attr *attr) -+{ -+ u32 size; -+ int ret; -+ -+ /* Zero the full structure, so that a short copy will be nice: */ -+ memset(attr, 0, sizeof(*attr)); -+ -+ ret = get_user(size, &uattr->size); -+ if (ret) -+ return ret; -+ -+ /* ABI compatibility quirk: */ -+ if (!size) -+ size = SCHED_ATTR_SIZE_VER0; -+ -+ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) -+ goto err_size; -+ -+ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); -+ if (ret) { -+ if (ret == -E2BIG) -+ goto err_size; -+ return ret; -+ } -+ -+ /* -+ * XXX: Do we want to be lenient like existing syscalls; or do we want -+ * to be strict and return an error on out-of-bounds values? -+ */ -+ attr->sched_nice = clamp(attr->sched_nice, -20, 19); -+ -+ /* sched/core.c uses zero here but we already know ret is zero */ -+ return 0; -+ -+err_size: -+ put_user(sizeof(*attr), &uattr->size); -+ return -E2BIG; -+} -+ -+/* -+ * sched_setparam() passes in -1 for its policy, to let the functions -+ * it calls know not to change it. -+ */ -+#define SETPARAM_POLICY -1 -+ -+/** -+ * sys_sched_setscheduler - set/change the scheduler policy and RT priority -+ * @pid: the pid in question. -+ * @policy: new policy. -+ * @param: structure containing the new RT priority. -+ * -+ * Return: 0 on success. An error code otherwise. -+ */ -+SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) -+{ -+ if (policy < 0) -+ return -EINVAL; -+ -+ return do_sched_setscheduler(pid, policy, param); -+} -+ -+/** -+ * sys_sched_setparam - set/change the RT priority of a thread -+ * @pid: the pid in question. -+ * @param: structure containing the new RT priority. -+ * -+ * Return: 0 on success. An error code otherwise. 
-+ */ -+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) -+{ -+ return do_sched_setscheduler(pid, SETPARAM_POLICY, param); -+} -+ -+/** -+ * sys_sched_setattr - same as above, but with extended sched_attr -+ * @pid: the pid in question. -+ * @uattr: structure containing the extended parameters. -+ */ -+SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, -+ unsigned int, flags) -+{ -+ struct sched_attr attr; -+ struct task_struct *p; -+ int retval; -+ -+ if (!uattr || pid < 0 || flags) -+ return -EINVAL; -+ -+ retval = sched_copy_attr(uattr, &attr); -+ if (retval) -+ return retval; -+ -+ if ((int)attr.sched_policy < 0) -+ return -EINVAL; -+ if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY) -+ attr.sched_policy = SETPARAM_POLICY; -+ -+ rcu_read_lock(); -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (likely(p)) -+ get_task_struct(p); -+ rcu_read_unlock(); -+ -+ if (likely(p)) { -+ retval = sched_setattr(p, &attr); -+ put_task_struct(p); -+ } -+ -+ return retval; -+} -+ -+/** -+ * sys_sched_getscheduler - get the policy (scheduling class) of a thread -+ * @pid: the pid in question. -+ * -+ * Return: On success, the policy of the thread. Otherwise, a negative error -+ * code. -+ */ -+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) -+{ -+ struct task_struct *p; -+ int retval = -EINVAL; -+ -+ if (pid < 0) -+ goto out_nounlock; -+ -+ retval = -ESRCH; -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ if (p) { -+ retval = security_task_getscheduler(p); -+ if (!retval) -+ retval = p->policy; -+ } -+ rcu_read_unlock(); -+ -+out_nounlock: -+ return retval; -+} -+ -+/** -+ * sys_sched_getscheduler - get the RT priority of a thread -+ * @pid: the pid in question. -+ * @param: structure containing the RT priority. -+ * -+ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error -+ * code. 
-+ */ -+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) -+{ -+ struct sched_param lp = { .sched_priority = 0 }; -+ struct task_struct *p; -+ int retval = -EINVAL; -+ -+ if (!param || pid < 0) -+ goto out_nounlock; -+ -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ retval = -ESRCH; -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ if (has_rt_policy(p)) -+ lp.sched_priority = p->rt_priority; -+ rcu_read_unlock(); -+ -+ /* -+ * This one might sleep, we cannot do it with a spinlock held ... -+ */ -+ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; -+ -+out_nounlock: -+ return retval; -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+/* -+ * Copy the kernel size attribute structure (which might be larger -+ * than what user-space knows about) to user-space. -+ * -+ * Note that all cases are valid: user-space buffer can be larger or -+ * smaller than the kernel-space buffer. The usual case is that both -+ * have the same size. -+ */ -+static int -+sched_attr_copy_to_user(struct sched_attr __user *uattr, -+ struct sched_attr *kattr, -+ unsigned int usize) -+{ -+ unsigned int ksize = sizeof(*kattr); -+ -+ if (!access_ok(uattr, usize)) -+ return -EFAULT; -+ -+ /* -+ * sched_getattr() ABI forwards and backwards compatibility: -+ * -+ * If usize == ksize then we just copy everything to user-space and all is good. -+ * -+ * If usize < ksize then we only copy as much as user-space has space for, -+ * this keeps ABI compatibility as well. We skip the rest. -+ * -+ * If usize > ksize then user-space is using a newer version of the ABI, -+ * which part the kernel doesn't know about. Just ignore it - tooling can -+ * detect the kernel's knowledge of attributes from the attr->size value -+ * which is set to ksize in this case. 
-+ */ -+ kattr->size = min(usize, ksize); -+ -+ if (copy_to_user(uattr, kattr, kattr->size)) -+ return -EFAULT; -+ -+ return 0; -+} -+ -+/** -+ * sys_sched_getattr - similar to sched_getparam, but with sched_attr -+ * @pid: the pid in question. -+ * @uattr: structure containing the extended parameters. -+ * @usize: sizeof(attr) for fwd/bwd comp. -+ * @flags: for future extension. -+ */ -+SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, -+ unsigned int, usize, unsigned int, flags) -+{ -+ struct sched_attr kattr = { }; -+ struct task_struct *p; -+ int retval; -+ -+ if (!uattr || pid < 0 || usize > PAGE_SIZE || -+ usize < SCHED_ATTR_SIZE_VER0 || flags) -+ return -EINVAL; -+ -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ retval = -ESRCH; -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ kattr.sched_policy = p->policy; -+ if (rt_task(p)) -+ kattr.sched_priority = p->rt_priority; -+ else -+ kattr.sched_nice = task_nice(p); -+ -+ rcu_read_unlock(); -+ -+ return sched_attr_copy_to_user(uattr, &kattr, usize); -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) -+{ -+ cpumask_var_t cpus_allowed, new_mask; -+ struct task_struct *p; -+ int retval; -+ -+ rcu_read_lock(); -+ -+ p = find_process_by_pid(pid); -+ if (!p) { -+ rcu_read_unlock(); -+ return -ESRCH; -+ } -+ -+ /* Prevent p going away */ -+ get_task_struct(p); -+ rcu_read_unlock(); -+ -+ if (p->flags & PF_NO_SETAFFINITY) { -+ retval = -EINVAL; -+ goto out_put_task; -+ } -+ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { -+ retval = -ENOMEM; -+ goto out_put_task; -+ } -+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { -+ retval = -ENOMEM; -+ goto out_free_cpus_allowed; -+ } -+ retval = -EPERM; -+ if (!check_same_owner(p)) { -+ rcu_read_lock(); -+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { -+ rcu_read_unlock(); -+ goto 
out_unlock; -+ } -+ rcu_read_unlock(); -+ } -+ -+ retval = security_task_setscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ cpuset_cpus_allowed(p, cpus_allowed); -+ cpumask_and(new_mask, in_mask, cpus_allowed); -+again: -+ retval = __set_cpus_allowed_ptr(p, new_mask, true); -+ -+ if (!retval) { -+ cpuset_cpus_allowed(p, cpus_allowed); -+ if (!cpumask_subset(new_mask, cpus_allowed)) { -+ /* -+ * We must have raced with a concurrent cpuset -+ * update. Just reset the cpus_allowed to the -+ * cpuset's cpus_allowed -+ */ -+ cpumask_copy(new_mask, cpus_allowed); -+ goto again; -+ } -+ } -+out_unlock: -+ free_cpumask_var(new_mask); -+out_free_cpus_allowed: -+ free_cpumask_var(cpus_allowed); -+out_put_task: -+ put_task_struct(p); -+ return retval; -+} -+ -+static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, -+ cpumask_t *new_mask) -+{ -+ if (len < cpumask_size()) -+ cpumask_clear(new_mask); -+ else if (len > cpumask_size()) -+ len = cpumask_size(); -+ -+ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; -+} -+ -+ -+/** -+ * sys_sched_setaffinity - set the CPU affinity of a process -+ * @pid: pid of the process -+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr -+ * @user_mask_ptr: user-space pointer to the new CPU mask -+ * -+ * Return: 0 on success. An error code otherwise. 
-+ */ -+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, -+ unsigned long __user *, user_mask_ptr) -+{ -+ cpumask_var_t new_mask; -+ int retval; -+ -+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) -+ return -ENOMEM; -+ -+ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); -+ if (retval == 0) -+ retval = sched_setaffinity(pid, new_mask); -+ free_cpumask_var(new_mask); -+ return retval; -+} -+ -+long sched_getaffinity(pid_t pid, cpumask_t *mask) -+{ -+ struct task_struct *p; -+ unsigned long flags; -+ int retval; -+ -+ get_online_cpus(); -+ rcu_read_lock(); -+ -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ cpumask_and(mask, &p->cpus_mask, cpu_active_mask); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+out_unlock: -+ rcu_read_unlock(); -+ put_online_cpus(); -+ -+ return retval; -+} -+ -+/** -+ * sys_sched_getaffinity - get the CPU affinity of a process -+ * @pid: pid of the process -+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr -+ * @user_mask_ptr: user-space pointer to hold the current CPU mask -+ * -+ * Return: 0 on success. An error code otherwise. -+ */ -+SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, -+ unsigned long __user *, user_mask_ptr) -+{ -+ int ret; -+ cpumask_var_t mask; -+ -+ if ((len * BITS_PER_BYTE) < nr_cpu_ids) -+ return -EINVAL; -+ if (len & (sizeof(unsigned long)-1)) -+ return -EINVAL; -+ -+ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) -+ return -ENOMEM; -+ -+ ret = sched_getaffinity(pid, mask); -+ if (ret == 0) { -+ unsigned int retlen = min(len, cpumask_size()); -+ -+ if (copy_to_user(user_mask_ptr, mask, retlen)) -+ ret = -EFAULT; -+ else -+ ret = retlen; -+ } -+ free_cpumask_var(mask); -+ -+ return ret; -+} -+ -+/** -+ * sys_sched_yield - yield the current processor to other threads. 
-+ * -+ * This function yields the current CPU to other tasks. It does this by -+ * scheduling away the current task. If it still has the earliest deadline -+ * it will be scheduled again as the next task. -+ * -+ * Return: 0. -+ */ -+static void do_sched_yield(void) -+{ -+ struct rq *rq; -+ -+ if (!sched_yield_type) -+ return; -+ -+ local_irq_disable(); -+ rq = this_rq(); -+ rq_lock(rq); -+ -+ if (sched_yield_type > 1) -+ time_slice_expired(current, rq); -+ schedstat_inc(rq->yld_count); -+ -+ /* -+ * Since we are going to call schedule() anyway, there's -+ * no need to preempt or enable interrupts: -+ */ -+ preempt_disable(); -+ rq_unlock(rq); -+ sched_preempt_enable_no_resched(); -+ -+ schedule(); -+} -+ -+SYSCALL_DEFINE0(sched_yield) -+{ -+ do_sched_yield(); -+ return 0; -+} -+ -+#ifndef CONFIG_PREEMPTION -+int __sched _cond_resched(void) -+{ -+ if (should_resched(0)) { -+ preempt_schedule_common(); -+ return 1; -+ } -+ rcu_all_qs(); -+ return 0; -+} -+EXPORT_SYMBOL(_cond_resched); -+#endif -+ -+/* -+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, -+ * call schedule, and on return reacquire the lock. -+ * -+ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level -+ * operations here to prevent schedule() from being called twice (once via -+ * spin_unlock(), once by hand). -+ */ -+int __cond_resched_lock(spinlock_t *lock) -+{ -+ int resched = should_resched(PREEMPT_LOCK_OFFSET); -+ int ret = 0; -+ -+ lockdep_assert_held(lock); -+ -+ if (spin_needbreak(lock) || resched) { -+ spin_unlock(lock); -+ if (resched) -+ preempt_schedule_common(); -+ else -+ cpu_relax(); -+ ret = 1; -+ spin_lock(lock); -+ } -+ return ret; -+} -+EXPORT_SYMBOL(__cond_resched_lock); -+ -+/** -+ * yield - yield the current processor to other threads. -+ * -+ * Do not ever use this function, there's a 99% chance you're doing it wrong. 
-+ * -+ * The scheduler is at all times free to pick the calling task as the most -+ * eligible task to run, if removing the yield() call from your code breaks -+ * it, its already broken. -+ * -+ * Typical broken usage is: -+ * -+ * while (!event) -+ * yield(); -+ * -+ * where one assumes that yield() will let 'the other' process run that will -+ * make event true. If the current task is a SCHED_FIFO task that will never -+ * happen. Never use yield() as a progress guarantee!! -+ * -+ * If you want to use yield() to wait for something, use wait_event(). -+ * If you want to use yield() to be 'nice' for others, use cond_resched(). -+ * If you still want to use yield(), do not! -+ */ -+void __sched yield(void) -+{ -+ set_current_state(TASK_RUNNING); -+ do_sched_yield(); -+} -+EXPORT_SYMBOL(yield); -+ -+/** -+ * yield_to - yield the current processor to another thread in -+ * your thread group, or accelerate that thread toward the -+ * processor it's on. -+ * @p: target task -+ * @preempt: whether task preemption is allowed or not -+ * -+ * It's the caller's job to ensure that the target task struct -+ * can't go away on us before we can do any checks. -+ * -+ * Return: -+ * true (>0) if we indeed boosted the target task. -+ * false (0) if we failed to boost the target. -+ * -ESRCH if there's no task to yield to. -+ */ -+int __sched yield_to(struct task_struct *p, bool preempt) -+{ -+ struct task_struct *rq_p; -+ struct rq *rq, *p_rq; -+ unsigned long flags; -+ int yielded = 0; -+ -+ local_irq_save(flags); -+ rq = this_rq(); -+ -+again: -+ p_rq = task_rq(p); -+ /* -+ * If we're the only runnable task on the rq and target rq also -+ * has only one task, there's absolutely no point in yielding. 
-+ */ -+ if (task_running(p_rq, p) || p->state) { -+ yielded = -ESRCH; -+ goto out_irq; -+ } -+ -+ double_rq_lock(rq, p_rq); -+ if (unlikely(task_rq(p) != p_rq)) { -+ double_rq_unlock(rq, p_rq); -+ goto again; -+ } -+ -+ yielded = 1; -+ schedstat_inc(rq->yld_count); -+ rq_p = rq->curr; -+ if (p->deadline > rq_p->deadline) -+ p->deadline = rq_p->deadline; -+ p->time_slice += rq_p->time_slice; -+ if (p->time_slice > timeslice()) -+ p->time_slice = timeslice(); -+ time_slice_expired(rq_p, rq); -+ if (preempt && rq != p_rq) -+ resched_task(p_rq->curr); -+ double_rq_unlock(rq, p_rq); -+out_irq: -+ local_irq_restore(flags); -+ -+ if (yielded > 0) -+ schedule(); -+ return yielded; -+} -+EXPORT_SYMBOL_GPL(yield_to); -+ -+int io_schedule_prepare(void) -+{ -+ int old_iowait = current->in_iowait; -+ -+ current->in_iowait = 1; -+ blk_schedule_flush_plug(current); -+ -+ return old_iowait; -+} -+ -+void io_schedule_finish(int token) -+{ -+ current->in_iowait = token; -+} -+ -+/* -+ * This task is about to go to sleep on IO. Increment rq->nr_iowait so -+ * that process accounting knows that this is a task in IO wait state. -+ * -+ * But don't do that if it is a deliberate, throttling IO wait (this task -+ * has set its backing_dev_info: the queue against which it should throttle) -+ */ -+ -+long __sched io_schedule_timeout(long timeout) -+{ -+ int token; -+ long ret; -+ -+ token = io_schedule_prepare(); -+ ret = schedule_timeout(timeout); -+ io_schedule_finish(token); -+ -+ return ret; -+} -+EXPORT_SYMBOL(io_schedule_timeout); -+ -+void __sched io_schedule(void) -+{ -+ int token; -+ -+ token = io_schedule_prepare(); -+ schedule(); -+ io_schedule_finish(token); -+} -+EXPORT_SYMBOL(io_schedule); -+ -+/** -+ * sys_sched_get_priority_max - return maximum RT priority. -+ * @policy: scheduling class. -+ * -+ * Return: On success, this syscall returns the maximum -+ * rt_priority that can be used by a given scheduling class. -+ * On failure, a negative error code is returned. 
-+ */ -+SYSCALL_DEFINE1(sched_get_priority_max, int, policy) -+{ -+ int ret = -EINVAL; -+ -+ switch (policy) { -+ case SCHED_FIFO: -+ case SCHED_RR: -+ ret = MAX_USER_RT_PRIO-1; -+ break; -+ case SCHED_NORMAL: -+ case SCHED_BATCH: -+ case SCHED_ISO: -+ case SCHED_IDLEPRIO: -+ ret = 0; -+ break; -+ } -+ return ret; -+} -+ -+/** -+ * sys_sched_get_priority_min - return minimum RT priority. -+ * @policy: scheduling class. -+ * -+ * Return: On success, this syscall returns the minimum -+ * rt_priority that can be used by a given scheduling class. -+ * On failure, a negative error code is returned. -+ */ -+SYSCALL_DEFINE1(sched_get_priority_min, int, policy) -+{ -+ int ret = -EINVAL; -+ -+ switch (policy) { -+ case SCHED_FIFO: -+ case SCHED_RR: -+ ret = 1; -+ break; -+ case SCHED_NORMAL: -+ case SCHED_BATCH: -+ case SCHED_ISO: -+ case SCHED_IDLEPRIO: -+ ret = 0; -+ break; -+ } -+ return ret; -+} -+ -+static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) -+{ -+ struct task_struct *p; -+ unsigned int time_slice; -+ struct rq_flags rf; -+ struct rq *rq; -+ int retval; -+ -+ if (pid < 0) -+ return -EINVAL; -+ -+ retval = -ESRCH; -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ rq = task_rq_lock(p, &rf); -+ time_slice = p->policy == SCHED_FIFO ? 0 : MS_TO_NS(task_timeslice(p)); -+ task_rq_unlock(rq, p, &rf); -+ -+ rcu_read_unlock(); -+ *t = ns_to_timespec64(time_slice); -+ return 0; -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+/** -+ * sys_sched_rr_get_interval - return the default timeslice of a process. -+ * @pid: pid of the process. -+ * @interval: userspace pointer to the timeslice value. -+ * -+ * this syscall writes the default timeslice value of a given process -+ * into the user-space timespec buffer. A value of '0' means infinity. -+ * -+ * Return: On success, 0 and the timeslice is in @interval. 
Otherwise, -+ * an error code. -+ */ -+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, -+ struct __kernel_timespec __user *, interval) -+{ -+ struct timespec64 t; -+ int retval = sched_rr_get_interval(pid, &t); -+ -+ if (retval == 0) -+ retval = put_timespec64(&t, interval); -+ -+ return retval; -+} -+ -+#ifdef CONFIG_COMPAT_32BIT_TIME -+SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, -+ struct old_timespec32 __user *, interval) -+{ -+ struct timespec64 t; -+ int retval = sched_rr_get_interval(pid, &t); -+ -+ if (retval == 0) -+ retval = put_old_timespec32(&t, interval); -+ return retval; -+} -+#endif -+ -+void sched_show_task(struct task_struct *p) -+{ -+ unsigned long free = 0; -+ int ppid; -+ -+ if (!try_get_task_stack(p)) -+ return; -+ -+ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); -+ -+ if (p->state == TASK_RUNNING) -+ printk(KERN_CONT " running task "); -+#ifdef CONFIG_DEBUG_STACK_USAGE -+ free = stack_not_used(p); -+#endif -+ ppid = 0; -+ rcu_read_lock(); -+ if (pid_alive(p)) -+ ppid = task_pid_nr(rcu_dereference(p->real_parent)); -+ rcu_read_unlock(); -+ printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, -+ task_pid_nr(p), ppid, -+ (unsigned long)task_thread_info(p)->flags); -+ -+ print_worker_info(KERN_INFO, p); -+ show_stack(p, NULL); -+ put_task_stack(p); -+} -+EXPORT_SYMBOL_GPL(sched_show_task); -+ -+static inline bool -+state_filter_match(unsigned long state_filter, struct task_struct *p) -+{ -+ /* no filter, everything matches */ -+ if (!state_filter) -+ return true; -+ -+ /* filter, but doesn't match */ -+ if (!(p->state & state_filter)) -+ return false; -+ -+ /* -+ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows -+ * TASK_KILLABLE). 
-+ */ -+ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) -+ return false; -+ -+ return true; -+} -+ -+void show_state_filter(unsigned long state_filter) -+{ -+ struct task_struct *g, *p; -+ -+#if BITS_PER_LONG == 32 -+ printk(KERN_INFO -+ " task PC stack pid father\n"); -+#else -+ printk(KERN_INFO -+ " task PC stack pid father\n"); -+#endif -+ rcu_read_lock(); -+ for_each_process_thread(g, p) { -+ /* -+ * reset the NMI-timeout, listing all files on a slow -+ * console might take a lot of time: -+ * Also, reset softlockup watchdogs on all CPUs, because -+ * another CPU might be blocked waiting for us to process -+ * an IPI. -+ */ -+ touch_nmi_watchdog(); -+ touch_all_softlockup_watchdogs(); -+ if (state_filter_match(state_filter, p)) -+ sched_show_task(p); -+ } -+ -+ rcu_read_unlock(); -+ /* -+ * Only show locks if all tasks are dumped: -+ */ -+ if (!state_filter) -+ debug_show_all_locks(); -+} -+ -+void dump_cpu_task(int cpu) -+{ -+ pr_info("Task dump for CPU %d:\n", cpu); -+ sched_show_task(cpu_curr(cpu)); -+} -+ -+#ifdef CONFIG_SMP -+void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ cpumask_copy(&p->cpus_mask, new_mask); -+ p->nr_cpus_allowed = cpumask_weight(new_mask); -+} -+ -+void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ struct rq *rq = task_rq(p); -+ -+ lockdep_assert_held(&p->pi_lock); -+ -+ cpumask_copy(&p->cpus_mask, new_mask); -+ -+ if (task_queued(p)) { -+ /* -+ * Because __kthread_bind() calls this on blocked tasks without -+ * holding rq->lock. -+ */ -+ lockdep_assert_held(rq->lock); -+ } -+} -+ -+/* -+ * Calling do_set_cpus_allowed from outside the scheduler code should not be -+ * called on a running or queued task. We should be holding pi_lock. 
-+ */ -+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ __do_set_cpus_allowed(p, new_mask); -+ if (needs_other_cpu(p, task_cpu(p))) { -+ struct rq *rq; -+ -+ rq = __task_rq_lock(p, NULL); -+ set_task_cpu(p, valid_task_cpu(p)); -+ resched_task(p); -+ __task_rq_unlock(rq, NULL); -+ } -+} -+#endif -+ -+/** -+ * init_idle - set up an idle thread for a given CPU -+ * @idle: task in question -+ * @cpu: cpu the idle task belongs to -+ * -+ * NOTE: this function does not set the idle thread's NEED_RESCHED -+ * flag, to make booting more robust. -+ */ -+void init_idle(struct task_struct *idle, int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ raw_spin_lock_irqsave(&idle->pi_lock, flags); -+ raw_spin_lock(rq->lock); -+ idle->last_ran = rq->niffies; -+ time_slice_expired(idle, rq); -+ idle->state = TASK_RUNNING; -+ /* Setting prio to illegal value shouldn't matter when never queued */ -+ idle->prio = PRIO_LIMIT; -+ -+ kasan_unpoison_task_stack(idle); -+ -+#ifdef CONFIG_SMP -+ /* -+ * It's possible that init_idle() gets called multiple times on a task, -+ * in that case do_set_cpus_allowed() will not do the right thing. -+ * -+ * And since this is boot we can forgo the serialisation. -+ */ -+ set_cpus_allowed_common(idle, cpumask_of(cpu)); -+#ifdef CONFIG_SMT_NICE -+ idle->smt_bias = 0; -+#endif -+#endif -+ set_rq_task(rq, idle); -+ -+ /* Silence PROVE_RCU */ -+ rcu_read_lock(); -+ set_task_cpu(idle, cpu); -+ rcu_read_unlock(); -+ -+ rq->idle = idle; -+ rcu_assign_pointer(rq->curr, idle); -+ idle->on_rq = TASK_ON_RQ_QUEUED; -+ raw_spin_unlock(rq->lock); -+ raw_spin_unlock_irqrestore(&idle->pi_lock, flags); -+ -+ /* Set the preempt count _outside_ the spinlocks! 
*/ -+ init_idle_preempt_count(idle, cpu); -+ -+ ftrace_graph_init_idle_task(idle, cpu); -+ vtime_init_idle(idle, cpu); -+#ifdef CONFIG_SMP -+ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); -+#endif -+} -+ -+int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, -+ const struct cpumask __maybe_unused *trial) -+{ -+ return 1; -+} -+ -+int task_can_attach(struct task_struct *p, -+ const struct cpumask *cs_cpus_allowed) -+{ -+ int ret = 0; -+ -+ /* -+ * Kthreads which disallow setaffinity shouldn't be moved -+ * to a new cpuset; we don't want to change their CPU -+ * affinity and isolating such threads by their set of -+ * allowed nodes is unnecessary. Thus, cpusets are not -+ * applicable for such threads. This prevents checking for -+ * success of set_cpus_allowed_ptr() on all attached tasks -+ * before cpus_mask may be changed. -+ */ -+ if (p->flags & PF_NO_SETAFFINITY) -+ ret = -EINVAL; -+ -+ return ret; -+} -+ -+void resched_cpu(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ struct rq_flags rf; -+ -+ rq_lock_irqsave(rq, &rf); -+ if (cpu_online(cpu) || cpu == smp_processor_id()) -+ resched_curr(rq); -+ rq_unlock_irqrestore(rq, &rf); -+} -+ -+#ifdef CONFIG_SMP -+#ifdef CONFIG_NO_HZ_COMMON -+void select_nohz_load_balancer(int stop_tick) -+{ -+} -+ -+void set_cpu_sd_state_idle(void) {} -+void nohz_balance_enter_idle(int cpu) {} -+ -+/* -+ * In the semi idle case, use the nearest busy CPU for migrating timers -+ * from an idle CPU. This is good for power-savings. -+ * -+ * We don't do similar optimization for completely idle system, as -+ * selecting an idle CPU will add more delays to the timers than intended -+ * (as that CPU's timer base may not be uptodate wrt jiffies etc). 
-+ */ -+int get_nohz_timer_target(void) -+{ -+ int i, cpu = smp_processor_id(); -+ struct sched_domain *sd; -+ -+ if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER)) -+ return cpu; -+ -+ rcu_read_lock(); -+ for_each_domain(cpu, sd) { -+ for_each_cpu(i, sched_domain_span(sd)) { -+ if (cpu == i) -+ continue; -+ -+ if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) { -+ cpu = i; -+ cpu = i; -+ goto unlock; -+ } -+ } -+ } -+ -+ if (!housekeeping_cpu(cpu, HK_FLAG_TIMER)) -+ cpu = housekeeping_any_cpu(HK_FLAG_TIMER); -+unlock: -+ rcu_read_unlock(); -+ return cpu; -+} -+ -+/* -+ * When add_timer_on() enqueues a timer into the timer wheel of an -+ * idle CPU then this timer might expire before the next timer event -+ * which is scheduled to wake up that CPU. In case of a completely -+ * idle system the next event might even be infinite time into the -+ * future. wake_up_idle_cpu() ensures that the CPU is woken up and -+ * leaves the inner idle loop so the newly added timer is taken into -+ * account when the CPU goes back to idle and evaluates the timer -+ * wheel for the next timer event. -+ */ -+void wake_up_idle_cpu(int cpu) -+{ -+ if (cpu == smp_processor_id()) -+ return; -+ -+ if (set_nr_and_not_polling(cpu_rq(cpu)->idle)) -+ smp_sched_reschedule(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); -+} -+ -+static bool wake_up_full_nohz_cpu(int cpu) -+{ -+ /* -+ * We just need the target to call irq_exit() and re-evaluate -+ * the next tick. The nohz full kick at least implies that. -+ * If needed we can still optimize that later with an -+ * empty IRQ. -+ */ -+ if (cpu_is_offline(cpu)) -+ return true; /* Don't try to wake offline CPUs. */ -+ if (tick_nohz_full_cpu(cpu)) { -+ if (cpu != smp_processor_id() || -+ tick_nohz_tick_stopped()) -+ tick_nohz_full_kick_cpu(cpu); -+ return true; -+ } -+ -+ return false; -+} -+ -+/* -+ * Wake up the specified CPU. 
If the CPU is going offline, it is the -+ * caller's responsibility to deal with the lost wakeup, for example, -+ * by hooking into the CPU_DEAD notifier like timers and hrtimers do. -+ */ -+void wake_up_nohz_cpu(int cpu) -+{ -+ if (!wake_up_full_nohz_cpu(cpu)) -+ wake_up_idle_cpu(cpu); -+} -+#endif /* CONFIG_NO_HZ_COMMON */ -+ -+/* -+ * Change a given task's CPU affinity. Migrate the thread to a -+ * proper CPU and schedule it away if the CPU it's executing on -+ * is removed from the allowed bitmask. -+ * -+ * NOTE: the caller must have a valid reference to the task, the -+ * task must not exit() & deallocate itself prematurely. The -+ * call is not atomic; no spinlocks may be held. -+ */ -+static int __set_cpus_allowed_ptr(struct task_struct *p, -+ const struct cpumask *new_mask, bool check) -+{ -+ const struct cpumask *cpu_valid_mask = cpu_active_mask; -+ bool queued = false, running_wrong = false, kthread; -+ struct cpumask old_mask; -+ unsigned int dest_cpu; -+ struct rq_flags rf; -+ struct rq *rq; -+ int ret = 0; -+ -+ rq = task_rq_lock(p, &rf); -+ update_rq_clock(rq); -+ -+ kthread = !!(p->flags & PF_KTHREAD); -+ if (kthread) { -+ /* -+ * Kernel threads are allowed on online && !active CPUs -+ */ -+ cpu_valid_mask = cpu_online_mask; -+ } -+ -+ /* -+ * Must re-check here, to close a race against __kthread_bind(), -+ * sched_setaffinity() is not guaranteed to observe the flag. -+ */ -+ if (check && (p->flags & PF_NO_SETAFFINITY)) { -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ cpumask_copy(&old_mask, p->cpus_ptr); -+ if (cpumask_equal(&old_mask, new_mask)) -+ goto out; -+ -+ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); -+ if (dest_cpu >= nr_cpu_ids) { -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ queued = task_queued(p); -+ __do_set_cpus_allowed(p, new_mask); -+ -+ if (kthread) { -+ /* -+ * For kernel threads that do indeed end up on online && -+ * !active we want to ensure they are strict per-CPU threads. 
-+ */ -+ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && -+ !cpumask_intersects(new_mask, cpu_active_mask) && -+ p->nr_cpus_allowed != 1); -+ } -+ -+ /* Can the task run on the task's current CPU? If so, we're done */ -+ if (cpumask_test_cpu(task_cpu(p), new_mask)) -+ goto out; -+ -+ if (task_running(rq, p)) { -+ /* Task is running on the wrong cpu now, reschedule it. */ -+ if (rq == this_rq()) { -+ set_task_cpu(p, dest_cpu); -+ set_tsk_need_resched(p); -+ running_wrong = true; -+ } else -+ resched_task(p); -+ } else { -+ if (queued) { -+ /* -+ * Switch runqueue locks after dequeueing the task -+ * here while still holding the pi_lock to be holding -+ * the correct lock for enqueueing. -+ */ -+ dequeue_task(rq, p, 0); -+ rq_unlock(rq); -+ -+ rq = cpu_rq(dest_cpu); -+ rq_lock(rq); -+ } -+ set_task_cpu(p, dest_cpu); -+ if (queued) -+ enqueue_task(rq, p, 0); -+ } -+ if (queued) -+ try_preempt(p, rq); -+ if (running_wrong) -+ preempt_disable(); -+out: -+ task_rq_unlock(rq, p, &rf); -+ -+ if (running_wrong) { -+ __schedule(true); -+ preempt_enable(); -+ } -+ -+ return ret; -+} -+ -+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ return __set_cpus_allowed_ptr(p, new_mask, false); -+} -+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); -+ -+#ifdef CONFIG_HOTPLUG_CPU -+/* -+ * Run through task list and find tasks affined to the dead cpu, then remove -+ * that cpu from the list, enable cpu0 and set the zerobound flag. Must hold -+ * cpu 0 and src_cpu's runqueue locks. We should be holding both rq lock and -+ * pi_lock to change cpus_mask but it's not going to matter here. 
-+ */ -+static void bind_zero(int src_cpu) -+{ -+ struct task_struct *p, *t; -+ struct rq *rq0; -+ int bound = 0; -+ -+ if (src_cpu == 0) -+ return; -+ -+ rq0 = cpu_rq(0); -+ -+ do_each_thread(t, p) { -+ if (cpumask_test_cpu(src_cpu, p->cpus_ptr)) { -+ bool local = (task_cpu(p) == src_cpu); -+ struct rq *rq = task_rq(p); -+ -+ /* task_running is the cpu stopper thread */ -+ if (local && task_running(rq, p)) -+ continue; -+ atomic_clear_cpu(src_cpu, &p->cpus_mask); -+ atomic_set_cpu(0, &p->cpus_mask); -+ p->zerobound = true; -+ bound++; -+ if (local) { -+ bool queued = task_queued(p); -+ -+ if (queued) -+ dequeue_task(rq, p, 0); -+ set_task_cpu(p, 0); -+ if (queued) -+ enqueue_task(rq0, p, 0); -+ } -+ } -+ } while_each_thread(t, p); -+ -+ if (bound) { -+ printk(KERN_INFO "MuQSS removed affinity for %d processes to cpu %d\n", -+ bound, src_cpu); -+ } -+} -+ -+/* Find processes with the zerobound flag and reenable their affinity for the -+ * CPU coming alive. */ -+static void unbind_zero(int src_cpu) -+{ -+ int unbound = 0, zerobound = 0; -+ struct task_struct *p, *t; -+ -+ if (src_cpu == 0) -+ return; -+ -+ do_each_thread(t, p) { -+ if (!p->mm) -+ p->zerobound = false; -+ if (p->zerobound) { -+ unbound++; -+ cpumask_set_cpu(src_cpu, &p->cpus_mask); -+ /* Once every CPU affinity has been re-enabled, remove -+ * the zerobound flag */ -+ if (cpumask_subset(cpu_possible_mask, p->cpus_ptr)) { -+ p->zerobound = false; -+ zerobound++; -+ } -+ } -+ } while_each_thread(t, p); -+ -+ if (unbound) { -+ printk(KERN_INFO "MuQSS added affinity for %d processes to cpu %d\n", -+ unbound, src_cpu); -+ } -+ if (zerobound) { -+ printk(KERN_INFO "MuQSS released forced binding to cpu0 for %d processes\n", -+ zerobound); -+ } -+} -+ -+/* -+ * Ensure that the idle task is using init_mm right before its cpu goes -+ * offline. 
-+ */ -+void idle_task_exit(void) -+{ -+ struct mm_struct *mm = current->active_mm; -+ -+ BUG_ON(cpu_online(smp_processor_id())); -+ -+ if (mm != &init_mm) { -+ switch_mm(mm, &init_mm, current); -+ current->active_mm = &init_mm; -+ finish_arch_post_lock_switch(); -+ } -+ mmdrop(mm); -+} -+#else /* CONFIG_HOTPLUG_CPU */ -+static void unbind_zero(int src_cpu) {} -+#endif /* CONFIG_HOTPLUG_CPU */ -+ -+void sched_set_stop_task(int cpu, struct task_struct *stop) -+{ -+ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; -+ struct sched_param start_param = { .sched_priority = 0 }; -+ struct task_struct *old_stop = cpu_rq(cpu)->stop; -+ -+ if (stop) { -+ /* -+ * Make it appear like a SCHED_FIFO task, its something -+ * userspace knows about and won't get confused about. -+ * -+ * Also, it will make PI more or less work without too -+ * much confusion -- but then, stop work should not -+ * rely on PI working anyway. -+ */ -+ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); -+ } -+ -+ cpu_rq(cpu)->stop = stop; -+ -+ if (old_stop) { -+ /* -+ * Reset it back to a normal scheduling policy so that -+ * it can die in pieces. -+ */ -+ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); -+ } -+} -+ -+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) -+ -+static struct ctl_table sd_ctl_dir[] = { -+ { -+ .procname = "sched_domain", -+ .mode = 0555, -+ }, -+ {} -+}; -+ -+static struct ctl_table sd_ctl_root[] = { -+ { -+ .procname = "kernel", -+ .mode = 0555, -+ .child = sd_ctl_dir, -+ }, -+ {} -+}; -+ -+static struct ctl_table *sd_alloc_ctl_entry(int n) -+{ -+ struct ctl_table *entry = -+ kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); -+ -+ return entry; -+} -+ -+static void sd_free_ctl_entry(struct ctl_table **tablep) -+{ -+ struct ctl_table *entry; -+ -+ /* -+ * In the intermediate directories, both the child directory and -+ * procname are dynamically allocated and could fail but the mode -+ * will always be set. 
In the lowest directory the names are -+ * static strings and all have proc handlers. -+ */ -+ for (entry = *tablep; entry->mode; entry++) { -+ if (entry->child) -+ sd_free_ctl_entry(&entry->child); -+ if (entry->proc_handler == NULL) -+ kfree(entry->procname); -+ } -+ -+ kfree(*tablep); -+ *tablep = NULL; -+} -+ -+static void -+set_table_entry(struct ctl_table *entry, -+ const char *procname, void *data, int maxlen, -+ umode_t mode, proc_handler *proc_handler) -+{ -+ entry->procname = procname; -+ entry->data = data; -+ entry->maxlen = maxlen; -+ entry->mode = mode; -+ entry->proc_handler = proc_handler; -+} -+ -+static struct ctl_table * -+sd_alloc_ctl_domain_table(struct sched_domain *sd) -+{ -+ struct ctl_table *table = sd_alloc_ctl_entry(9); -+ -+ if (table == NULL) -+ return NULL; -+ -+ set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax); -+ set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax); -+ set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); -+ set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); -+ set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); -+ set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); -+ set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax); -+ set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring); -+ /* &table[8] is terminator */ -+ -+ return table; -+} -+ -+static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) -+{ -+ struct ctl_table *entry, *table; -+ struct sched_domain *sd; -+ int domain_num = 0, i; -+ char buf[32]; -+ -+ for_each_domain(cpu, sd) -+ domain_num++; -+ entry = table = sd_alloc_ctl_entry(domain_num + 1); -+ if 
(table == NULL) -+ return NULL; -+ -+ i = 0; -+ for_each_domain(cpu, sd) { -+ snprintf(buf, 32, "domain%d", i); -+ entry->procname = kstrdup(buf, GFP_KERNEL); -+ entry->mode = 0555; -+ entry->child = sd_alloc_ctl_domain_table(sd); -+ entry++; -+ i++; -+ } -+ return table; -+} -+ -+static cpumask_var_t sd_sysctl_cpus; -+static struct ctl_table_header *sd_sysctl_header; -+ -+void register_sched_domain_sysctl(void) -+{ -+ static struct ctl_table *cpu_entries; -+ static struct ctl_table **cpu_idx; -+ char buf[32]; -+ int i; -+ -+ if (!cpu_entries) { -+ cpu_entries = sd_alloc_ctl_entry(num_possible_cpus() + 1); -+ if (!cpu_entries) -+ return; -+ -+ WARN_ON(sd_ctl_dir[0].child); -+ sd_ctl_dir[0].child = cpu_entries; -+ } -+ -+ if (!cpu_idx) { -+ struct ctl_table *e = cpu_entries; -+ -+ cpu_idx = kcalloc(nr_cpu_ids, sizeof(struct ctl_table*), GFP_KERNEL); -+ if (!cpu_idx) -+ return; -+ -+ /* deal with sparse possible map */ -+ for_each_possible_cpu(i) { -+ cpu_idx[i] = e; -+ e++; -+ } -+ } -+ -+ if (!cpumask_available(sd_sysctl_cpus)) { -+ if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL)) -+ return; -+ -+ /* init to possible to not have holes in @cpu_entries */ -+ cpumask_copy(sd_sysctl_cpus, cpu_possible_mask); -+ } -+ -+ for_each_cpu(i, sd_sysctl_cpus) { -+ struct ctl_table *e = cpu_idx[i]; -+ -+ if (e->child) -+ sd_free_ctl_entry(&e->child); -+ -+ if (!e->procname) { -+ snprintf(buf, 32, "cpu%d", i); -+ e->procname = kstrdup(buf, GFP_KERNEL); -+ } -+ e->mode = 0555; -+ e->child = sd_alloc_ctl_cpu_table(i); -+ -+ __cpumask_clear_cpu(i, sd_sysctl_cpus); -+ } -+ -+ WARN_ON(sd_sysctl_header); -+ sd_sysctl_header = register_sysctl_table(sd_ctl_root); -+} -+ -+void dirty_sched_domain_sysctl(int cpu) -+{ -+ if (cpumask_available(sd_sysctl_cpus)) -+ __cpumask_set_cpu(cpu, sd_sysctl_cpus); -+} -+ -+/* may be called multiple times per register */ -+void unregister_sched_domain_sysctl(void) -+{ -+ unregister_sysctl_table(sd_sysctl_header); -+ sd_sysctl_header = NULL; -+} 
-+#endif /* CONFIG_SYSCTL */ -+ -+void set_rq_online(struct rq *rq) -+{ -+ if (!rq->online) { -+ cpumask_set_cpu(cpu_of(rq), rq->rd->online); -+ rq->online = true; -+ } -+} -+ -+void set_rq_offline(struct rq *rq) -+{ -+ if (rq->online) { -+ int cpu = cpu_of(rq); -+ -+ cpumask_clear_cpu(cpu, rq->rd->online); -+ rq->online = false; -+ clear_cpuidle_map(cpu); -+ } -+} -+ -+/* -+ * used to mark begin/end of suspend/resume: -+ */ -+static int num_cpus_frozen; -+ -+/* -+ * Update cpusets according to cpu_active mask. If cpusets are -+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper -+ * around partition_sched_domains(). -+ * -+ * If we come here as part of a suspend/resume, don't touch cpusets because we -+ * want to restore it back to its original state upon resume anyway. -+ */ -+static void cpuset_cpu_active(void) -+{ -+ if (cpuhp_tasks_frozen) { -+ /* -+ * num_cpus_frozen tracks how many CPUs are involved in suspend -+ * resume sequence. As long as this is not the last online -+ * operation in the resume sequence, just build a single sched -+ * domain, ignoring cpusets. -+ */ -+ partition_sched_domains(1, NULL, NULL); -+ if (--num_cpus_frozen) -+ return; -+ /* -+ * This is the last CPU online operation. So fall through and -+ * restore the original sched domains by considering the -+ * cpuset configurations. -+ */ -+ cpuset_force_rebuild(); -+ } -+ -+ cpuset_update_active_cpus(); -+} -+ -+static int cpuset_cpu_inactive(unsigned int cpu) -+{ -+ if (!cpuhp_tasks_frozen) { -+ cpuset_update_active_cpus(); -+ } else { -+ num_cpus_frozen++; -+ partition_sched_domains(1, NULL, NULL); -+ } -+ return 0; -+} -+ -+int sched_cpu_activate(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ struct rq_flags rf; -+ -+#ifdef CONFIG_SCHED_SMT -+ /* -+ * When going up, increment the number of cores with SMT present. 
-+ */ -+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) -+ static_branch_inc_cpuslocked(&sched_smt_present); -+#endif -+ set_cpu_active(cpu, true); -+ -+ if (sched_smp_initialized) { -+ sched_domains_numa_masks_set(cpu); -+ cpuset_cpu_active(); -+ } -+ -+ /* -+ * Put the rq online, if not already. This happens: -+ * -+ * 1) In the early boot process, because we build the real domains -+ * after all CPUs have been brought up. -+ * -+ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the -+ * domains. -+ */ -+ rq_lock_irqsave(rq, &rf); -+ if (rq->rd) { -+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); -+ set_rq_online(rq); -+ } -+ unbind_zero(cpu); -+ rq_unlock_irqrestore(rq, &rf); -+ -+ return 0; -+} -+ -+int sched_cpu_deactivate(unsigned int cpu) -+{ -+ int ret; -+ -+ set_cpu_active(cpu, false); -+ /* -+ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU -+ * users of this state to go away such that all new such users will -+ * observe it. -+ * -+ * Do sync before park smpboot threads to take care the rcu boost case. -+ */ -+ synchronize_rcu(); -+ -+#ifdef CONFIG_SCHED_SMT -+ /* -+ * When going down, decrement the number of cores with SMT present. 
-+ */ -+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) -+ static_branch_dec_cpuslocked(&sched_smt_present); -+#endif -+ -+ if (!sched_smp_initialized) -+ return 0; -+ -+ ret = cpuset_cpu_inactive(cpu); -+ if (ret) { -+ set_cpu_active(cpu, true); -+ return ret; -+ } -+ sched_domains_numa_masks_clear(cpu); -+ return 0; -+} -+ -+int sched_cpu_starting(unsigned int cpu) -+{ -+ sched_tick_start(cpu); -+ return 0; -+} -+ -+#ifdef CONFIG_HOTPLUG_CPU -+int sched_cpu_dying(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ /* Handle pending wakeups and then migrate everything off */ -+ sched_ttwu_pending(); -+ sched_tick_stop(cpu); -+ -+ local_irq_save(flags); -+ double_rq_lock(rq, cpu_rq(0)); -+ if (rq->rd) { -+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); -+ set_rq_offline(rq); -+ } -+ bind_zero(cpu); -+ double_rq_unlock(rq, cpu_rq(0)); -+ sched_start_tick(rq, cpu); -+ hrexpiry_clear(rq); -+ local_irq_restore(flags); -+ -+ return 0; -+} -+#endif -+ -+#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) -+/* -+ * Cheaper version of the below functions in case support for SMT and MC is -+ * compiled in but CPUs have no siblings. 
-+ */ -+static bool sole_cpu_idle(struct rq *rq) -+{ -+ return rq_idle(rq); -+} -+#endif -+#ifdef CONFIG_SCHED_SMT -+static const cpumask_t *thread_cpumask(int cpu) -+{ -+ return topology_sibling_cpumask(cpu); -+} -+/* All this CPU's SMT siblings are idle */ -+static bool siblings_cpu_idle(struct rq *rq) -+{ -+ return cpumask_subset(&rq->thread_mask, &cpu_idle_map); -+} -+#endif -+#ifdef CONFIG_SCHED_MC -+static const cpumask_t *core_cpumask(int cpu) -+{ -+ return topology_core_cpumask(cpu); -+} -+/* All this CPU's shared cache siblings are idle */ -+static bool cache_cpu_idle(struct rq *rq) -+{ -+ return cpumask_subset(&rq->core_mask, &cpu_idle_map); -+} -+/* MC siblings CPU mask which share the same LLC */ -+static const cpumask_t *llc_core_cpumask(int cpu) -+{ -+ return per_cpu(cpu_llc_shared_map, cpu); -+} -+#endif -+ -+enum sched_domain_level { -+ SD_LV_NONE = 0, -+ SD_LV_SIBLING, -+ SD_LV_MC, -+ SD_LV_BOOK, -+ SD_LV_CPU, -+ SD_LV_NODE, -+ SD_LV_ALLNODES, -+ SD_LV_MAX -+}; -+ -+void __init sched_init_smp(void) -+{ -+ struct rq *rq, *other_rq, *leader = cpu_rq(0); -+ struct sched_domain *sd; -+ int cpu, other_cpu, i; -+#ifdef CONFIG_SCHED_SMT -+ bool smt_threads = false; -+#endif -+ sched_init_numa(); -+ -+ /* -+ * There's no userspace yet to cause hotplug operations; hence all the -+ * cpu masks are stable and all blatant races in the below code cannot -+ * happen. -+ */ -+ mutex_lock(&sched_domains_mutex); -+ sched_init_domains(cpu_active_mask); -+ mutex_unlock(&sched_domains_mutex); -+ -+ /* Move init over to a non-isolated CPU */ -+ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) -+ BUG(); -+ -+ local_irq_disable(); -+ mutex_lock(&sched_domains_mutex); -+ lock_all_rqs(); -+ -+ printk(KERN_INFO "MuQSS possible/present/online CPUs: %d/%d/%d\n", -+ num_possible_cpus(), num_present_cpus(), num_online_cpus()); -+ -+ /* -+ * Set up the relative cache distance of each online cpu from each -+ * other in a simple array for quick lookup. 
Locality is determined -+ * by the closest sched_domain that CPUs are separated by. CPUs with -+ * shared cache in SMT and MC are treated as local. Separate CPUs -+ * (within the same package or physically) within the same node are -+ * treated as not local. CPUs not even in the same domain (different -+ * nodes) are treated as very distant. -+ */ -+ for (cpu = num_online_cpus() - 1; cpu >= 0; cpu--) { -+ rq = cpu_rq(cpu); -+ leader = NULL; -+ /* First check if this cpu is in the same node */ -+ for_each_domain(cpu, sd) { -+ if (sd->level > SD_LV_MC) -+ continue; -+ if (rqshare != RQSHARE_ALL) -+ leader = NULL; -+ /* Set locality to local node if not already found lower */ -+ for_each_cpu(other_cpu, sched_domain_span(sd)) { -+ if (rqshare >= RQSHARE_SMP) { -+ other_rq = cpu_rq(other_cpu); -+ -+ /* Set the smp_leader to the first CPU */ -+ if (!leader) -+ leader = rq; -+ other_rq->smp_leader = leader; -+ } -+ if (rq->cpu_locality[other_cpu] > LOCALITY_SMP) -+ rq->cpu_locality[other_cpu] = LOCALITY_SMP; -+ } -+ } -+ -+ /* -+ * Each runqueue has its own function in case it doesn't have -+ * siblings of its own allowing mixed topologies. 
-+ */ -+#ifdef CONFIG_SCHED_MC -+ leader = NULL; -+ if (cpumask_weight(core_cpumask(cpu)) > 1) { -+ cpumask_copy(&rq->core_mask, llc_core_cpumask(cpu)); -+ cpumask_clear_cpu(cpu, &rq->core_mask); -+ for_each_cpu(other_cpu, core_cpumask(cpu)) { -+ if (rqshare == RQSHARE_MC || -+ (rqshare == RQSHARE_MC_LLC && cpumask_test_cpu(other_cpu, llc_core_cpumask(cpu)))) { -+ other_rq = cpu_rq(other_cpu); -+ -+ /* Set the mc_leader to the first CPU */ -+ if (!leader) -+ leader = rq; -+ other_rq->mc_leader = leader; -+ } -+ if (rq->cpu_locality[other_cpu] > LOCALITY_MC) { -+ /* this is to get LLC into play even in case LLC sharing is not used */ -+ if (cpumask_test_cpu(other_cpu, llc_core_cpumask(cpu))) -+ rq->cpu_locality[other_cpu] = LOCALITY_MC_LLC; -+ else -+ rq->cpu_locality[other_cpu] = LOCALITY_MC; -+ } -+ } -+ rq->cache_idle = cache_cpu_idle; -+ } -+#endif -+#ifdef CONFIG_SCHED_SMT -+ leader = NULL; -+ if (cpumask_weight(thread_cpumask(cpu)) > 1) { -+ cpumask_copy(&rq->thread_mask, thread_cpumask(cpu)); -+ cpumask_clear_cpu(cpu, &rq->thread_mask); -+ for_each_cpu(other_cpu, thread_cpumask(cpu)) { -+ if (rqshare == RQSHARE_SMT) { -+ other_rq = cpu_rq(other_cpu); -+ -+ /* Set the smt_leader to the first CPU */ -+ if (!leader) -+ leader = rq; -+ other_rq->smt_leader = leader; -+ } -+ if (rq->cpu_locality[other_cpu] > LOCALITY_SMT) -+ rq->cpu_locality[other_cpu] = LOCALITY_SMT; -+ } -+ rq->siblings_idle = siblings_cpu_idle; -+ smt_threads = true; -+ } -+#endif -+ } -+ -+#ifdef CONFIG_SMT_NICE -+ if (smt_threads) { -+ check_siblings = &check_smt_siblings; -+ wake_siblings = &wake_smt_siblings; -+ smt_schedule = &smt_should_schedule; -+ } -+#endif -+ unlock_all_rqs(); -+ mutex_unlock(&sched_domains_mutex); -+ -+ for_each_online_cpu(cpu) { -+ rq = cpu_rq(cpu); -+ for_each_online_cpu(other_cpu) { -+ printk(KERN_DEBUG "MuQSS locality CPU %d to %d: %d\n", cpu, other_cpu, rq->cpu_locality[other_cpu]); -+ } -+ } -+ -+ for_each_online_cpu(cpu) { -+ rq = cpu_rq(cpu); -+ leader = 
rq->smp_leader; -+ -+ rq_lock(rq); -+ if (leader && rq != leader) { -+ printk(KERN_INFO "MuQSS sharing SMP runqueue from CPU %d to CPU %d\n", -+ leader->cpu, rq->cpu); -+ kfree(rq->node); -+ kfree(rq->sl); -+ kfree(rq->lock); -+ rq->node = leader->node; -+ rq->sl = leader->sl; -+ rq->lock = leader->lock; -+ barrier(); -+ /* To make up for not unlocking the freed runlock */ -+ preempt_enable(); -+ } else -+ rq_unlock(rq); -+ } -+ -+#ifdef CONFIG_SCHED_MC -+ for_each_online_cpu(cpu) { -+ rq = cpu_rq(cpu); -+ leader = rq->mc_leader; -+ -+ rq_lock(rq); -+ if (leader && rq != leader) { -+ printk(KERN_INFO "MuQSS sharing MC runqueue from CPU %d to CPU %d\n", -+ leader->cpu, rq->cpu); -+ kfree(rq->node); -+ kfree(rq->sl); -+ kfree(rq->lock); -+ rq->node = leader->node; -+ rq->sl = leader->sl; -+ rq->lock = leader->lock; -+ barrier(); -+ /* To make up for not unlocking the freed runlock */ -+ preempt_enable(); -+ } else -+ rq_unlock(rq); -+ } -+#endif /* CONFIG_SCHED_MC */ -+ -+#ifdef CONFIG_SCHED_SMT -+ for_each_online_cpu(cpu) { -+ rq = cpu_rq(cpu); -+ -+ leader = rq->smt_leader; -+ -+ rq_lock(rq); -+ if (leader && rq != leader) { -+ printk(KERN_INFO "MuQSS sharing SMT runqueue from CPU %d to CPU %d\n", -+ leader->cpu, rq->cpu); -+ kfree(rq->node); -+ kfree(rq->sl); -+ kfree(rq->lock); -+ rq->node = leader->node; -+ rq->sl = leader->sl; -+ rq->lock = leader->lock; -+ barrier(); -+ /* To make up for not unlocking the freed runlock */ -+ preempt_enable(); -+ } else -+ rq_unlock(rq); -+ } -+#endif /* CONFIG_SCHED_SMT */ -+ -+ local_irq_enable(); -+ -+ total_runqueues = 0; -+ for_each_online_cpu(cpu) { -+ int locality, total_rqs = 0, total_cpus = 0; -+ -+ rq = cpu_rq(cpu); -+ if ( -+#ifdef CONFIG_SCHED_MC -+ (rq->mc_leader == rq) && -+#endif -+#ifdef CONFIG_SCHED_SMT -+ (rq->smt_leader == rq) && -+#endif -+ (rq->smp_leader == rq)) { -+ total_runqueues++; -+ } -+ -+ for (locality = LOCALITY_SAME; locality <= LOCALITY_DISTANT; locality++) { -+ int selected_cpus[NR_CPUS], 
selected_cpu_cnt, selected_cpu_idx, test_cpu_idx, cpu_idx, best_locality, test_cpu; -+ int ordered_cpus[NR_CPUS], ordered_cpus_idx; -+ -+ ordered_cpus_idx = -1; -+ selected_cpu_cnt = 0; -+ -+ for_each_online_cpu(test_cpu) { -+ if (cpu < num_online_cpus() / 2) -+ other_cpu = cpu + test_cpu; -+ else -+ other_cpu = cpu - test_cpu; -+ if (other_cpu < 0) -+ other_cpu += num_online_cpus(); -+ else -+ other_cpu %= num_online_cpus(); -+ /* gather CPUs of the same locality */ -+ if (rq->cpu_locality[other_cpu] == locality) { -+ selected_cpus[selected_cpu_cnt] = other_cpu; -+ selected_cpu_cnt++; -+ } -+ } -+ -+ /* reserve first CPU as starting point */ -+ if (selected_cpu_cnt > 0) { -+ ordered_cpus_idx++; -+ ordered_cpus[ordered_cpus_idx] = selected_cpus[ordered_cpus_idx]; -+ selected_cpus[ordered_cpus_idx] = -1; -+ } -+ -+ /* take each CPU and sort it within the same locality based on each inter-CPU localities */ -+ for(test_cpu_idx = 1; test_cpu_idx < selected_cpu_cnt; test_cpu_idx++) { -+ /* starting point with worst locality and current CPU */ -+ best_locality = LOCALITY_DISTANT; -+ selected_cpu_idx = test_cpu_idx; -+ -+ /* try to find the best locality within group */ -+ for(cpu_idx = 1; cpu_idx < selected_cpu_cnt; cpu_idx++) { -+ /* if CPU has not been used and locality is better */ -+ if (selected_cpus[cpu_idx] > -1) { -+ other_rq = cpu_rq(ordered_cpus[ordered_cpus_idx]); -+ if (best_locality > other_rq->cpu_locality[selected_cpus[cpu_idx]]) { -+ /* assign best locality and best CPU idx in array */ -+ best_locality = other_rq->cpu_locality[selected_cpus[cpu_idx]]; -+ selected_cpu_idx = cpu_idx; -+ } -+ } -+ } -+ -+ /* add our next best CPU to ordered list */ -+ ordered_cpus_idx++; -+ ordered_cpus[ordered_cpus_idx] = selected_cpus[selected_cpu_idx]; -+ /* mark this CPU as used */ -+ selected_cpus[selected_cpu_idx] = -1; -+ } -+ -+ /* set up RQ and CPU orders */ -+ for (test_cpu = 0; test_cpu <= ordered_cpus_idx; test_cpu++) { -+ other_rq = 
cpu_rq(ordered_cpus[test_cpu]); -+ /* set up cpu orders */ -+ rq->cpu_order[total_cpus++] = other_rq; -+ if ( -+#ifdef CONFIG_SCHED_MC -+ (other_rq->mc_leader == other_rq) && -+#endif -+#ifdef CONFIG_SCHED_SMT -+ (other_rq->smt_leader == other_rq) && -+#endif -+ (other_rq->smp_leader == other_rq)) { -+ /* set up RQ orders */ -+ rq->rq_order[total_rqs++] = other_rq; -+ } -+ } -+ } -+ } -+ -+ for_each_online_cpu(cpu) { -+ rq = cpu_rq(cpu); -+ for (i = 0; i < total_runqueues; i++) { -+ printk(KERN_DEBUG "MuQSS CPU %d llc %d RQ order %d RQ %d llc %d\n", cpu, per_cpu(cpu_llc_id, cpu), i, -+ rq->rq_order[i]->cpu, per_cpu(cpu_llc_id, rq->rq_order[i]->cpu)); -+ } -+ } -+ -+ for_each_online_cpu(cpu) { -+ rq = cpu_rq(cpu); -+ for (i = 0; i < num_online_cpus(); i++) { -+ printk(KERN_DEBUG "MuQSS CPU %d llc %d CPU order %d RQ %d llc %d\n", cpu, per_cpu(cpu_llc_id, cpu), i, -+ rq->cpu_order[i]->cpu, per_cpu(cpu_llc_id, rq->cpu_order[i]->cpu)); -+ } -+ } -+ -+ switch (rqshare) { -+ case RQSHARE_ALL: -+ /* This should only ever read 1 */ -+ printk(KERN_INFO "MuQSS runqueue share type ALL total runqueues: %d\n", -+ total_runqueues); -+ break; -+ case RQSHARE_SMP: -+ printk(KERN_INFO "MuQSS runqueue share type SMP total runqueues: %d\n", -+ total_runqueues); -+ break; -+ case RQSHARE_MC: -+ printk(KERN_INFO "MuQSS runqueue share type MC total runqueues: %d\n", -+ total_runqueues); -+ break; -+ case RQSHARE_MC_LLC: -+ printk(KERN_INFO "MuQSS runqueue share type LLC total runqueues: %d\n", -+ total_runqueues); -+ break; -+ case RQSHARE_SMT: -+ printk(KERN_INFO "MuQSS runqueue share type SMT total runqueues: %d\n", -+ total_runqueues); -+ break; -+ case RQSHARE_NONE: -+ printk(KERN_INFO "MuQSS runqueue share type NONE total runqueues: %d\n", -+ total_runqueues); -+ break; -+ } -+ -+ sched_smp_initialized = true; -+} -+#else -+void __init sched_init_smp(void) -+{ -+ sched_smp_initialized = true; -+} -+#endif /* CONFIG_SMP */ -+ -+int in_sched_functions(unsigned long addr) -+{ -+ return 
in_lock_functions(addr) || -+ (addr >= (unsigned long)__sched_text_start -+ && addr < (unsigned long)__sched_text_end); -+} -+ -+#ifdef CONFIG_CGROUP_SCHED -+/* task group related information */ -+struct task_group { -+ struct cgroup_subsys_state css; -+ -+ struct rcu_head rcu; -+ struct list_head list; -+ -+ struct task_group *parent; -+ struct list_head siblings; -+ struct list_head children; -+}; -+ -+/* -+ * Default task group. -+ * Every task in system belongs to this group at bootup. -+ */ -+struct task_group root_task_group; -+LIST_HEAD(task_groups); -+ -+/* Cacheline aligned slab cache for task_group */ -+static struct kmem_cache *task_group_cache __read_mostly; -+#endif /* CONFIG_CGROUP_SCHED */ -+ -+void __init sched_init(void) -+{ -+#ifdef CONFIG_SMP -+ int cpu_ids; -+#endif -+ int i; -+ struct rq *rq; -+ -+ wait_bit_init(); -+ -+ prio_ratios[0] = 128; -+ for (i = 1 ; i < NICE_WIDTH ; i++) -+ prio_ratios[i] = prio_ratios[i - 1] * 11 / 10; -+ -+ skiplist_node_init(&init_task.node); -+ -+#ifdef CONFIG_SMP -+ init_defrootdomain(); -+ cpumask_clear(&cpu_idle_map); -+#else -+ uprq = &per_cpu(runqueues, 0); -+#endif -+ -+#ifdef CONFIG_CGROUP_SCHED -+ task_group_cache = KMEM_CACHE(task_group, 0); -+ -+ list_add(&root_task_group.list, &task_groups); -+ INIT_LIST_HEAD(&root_task_group.children); -+ INIT_LIST_HEAD(&root_task_group.siblings); -+#endif /* CONFIG_CGROUP_SCHED */ -+ for_each_possible_cpu(i) { -+ rq = cpu_rq(i); -+ rq->node = kmalloc(sizeof(skiplist_node), GFP_ATOMIC); -+ skiplist_init(rq->node); -+ rq->sl = new_skiplist(rq->node); -+ rq->lock = kmalloc(sizeof(raw_spinlock_t), GFP_ATOMIC); -+ raw_spin_lock_init(rq->lock); -+ rq->nr_running = 0; -+ rq->nr_uninterruptible = 0; -+ rq->nr_switches = 0; -+ rq->clock = rq->old_clock = rq->last_niffy = rq->niffies = 0; -+ rq->last_jiffy = jiffies; -+ rq->user_ns = rq->nice_ns = rq->softirq_ns = rq->system_ns = -+ rq->iowait_ns = rq->idle_ns = 0; -+ rq->dither = 0; -+ set_rq_task(rq, &init_task); -+ 
rq->iso_ticks = 0; -+ rq->iso_refractory = false; -+#ifdef CONFIG_SMP -+ rq->smp_leader = rq; -+#ifdef CONFIG_SCHED_MC -+ rq->mc_leader = rq; -+#endif -+#ifdef CONFIG_SCHED_SMT -+ rq->smt_leader = rq; -+#endif -+ rq->sd = NULL; -+ rq->rd = NULL; -+ rq->online = false; -+ rq->cpu = i; -+ rq_attach_root(rq, &def_root_domain); -+#endif -+ init_rq_hrexpiry(rq); -+ atomic_set(&rq->nr_iowait, 0); -+ } -+ -+#ifdef CONFIG_SMP -+ cpu_ids = i; -+ /* -+ * Set the base locality for cpu cache distance calculation to -+ * "distant" (3). Make sure the distance from a CPU to itself is 0. -+ */ -+ for_each_possible_cpu(i) { -+ int j; -+ -+ rq = cpu_rq(i); -+#ifdef CONFIG_SCHED_SMT -+ rq->siblings_idle = sole_cpu_idle; -+#endif -+#ifdef CONFIG_SCHED_MC -+ rq->cache_idle = sole_cpu_idle; -+#endif -+ rq->cpu_locality = kmalloc(cpu_ids * sizeof(int *), GFP_ATOMIC); -+ for_each_possible_cpu(j) { -+ if (i == j) -+ rq->cpu_locality[j] = LOCALITY_SAME; -+ else -+ rq->cpu_locality[j] = LOCALITY_DISTANT; -+ } -+ rq->rq_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC); -+ rq->cpu_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC); -+ rq->rq_order[0] = rq->cpu_order[0] = rq; -+ for (j = 1; j < cpu_ids; j++) -+ rq->rq_order[j] = rq->cpu_order[j] = cpu_rq(j); -+ } -+#endif -+ -+ /* -+ * The boot idle thread does lazy MMU switching as well: -+ */ -+ mmgrab(&init_mm); -+ enter_lazy_tlb(&init_mm, current); -+ -+ /* -+ * Make us the idle thread. Technically, schedule() should not be -+ * called from this thread, however somewhere below it might be, -+ * but because we are the idle thread, we just pick up running again -+ * when this runqueue becomes "idle". 
-+ */ -+ init_idle(current, smp_processor_id()); -+ -+#ifdef CONFIG_SMP -+ idle_thread_set_boot_cpu(); -+#endif /* SMP */ -+ -+ init_schedstats(); -+ -+ psi_init(); -+} -+ -+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP -+static inline int preempt_count_equals(int preempt_offset) -+{ -+ int nested = preempt_count() + rcu_preempt_depth(); -+ -+ return (nested == preempt_offset); -+} -+ -+void __might_sleep(const char *file, int line, int preempt_offset) -+{ -+ /* -+ * Blocking primitives will set (and therefore destroy) current->state, -+ * since we will exit with TASK_RUNNING make sure we enter with it, -+ * otherwise we will destroy state. -+ */ -+ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, -+ "do not call blocking ops when !TASK_RUNNING; " -+ "state=%lx set at [<%p>] %pS\n", -+ current->state, -+ (void *)current->task_state_change, -+ (void *)current->task_state_change); -+ -+ ___might_sleep(file, line, preempt_offset); -+} -+EXPORT_SYMBOL(__might_sleep); -+ -+void __cant_sleep(const char *file, int line, int preempt_offset) -+{ -+ static unsigned long prev_jiffy; -+ -+ if (irqs_disabled()) -+ return; -+ -+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) -+ return; -+ -+ if (preempt_count() > preempt_offset) -+ return; -+ -+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) -+ return; -+ prev_jiffy = jiffies; -+ -+ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); -+ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", -+ in_atomic(), irqs_disabled(), -+ current->pid, current->comm); -+ -+ debug_show_held_locks(current); -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+EXPORT_SYMBOL_GPL(__cant_sleep); -+ -+void ___might_sleep(const char *file, int line, int preempt_offset) -+{ -+ /* Ratelimiting timestamp: */ -+ static unsigned long prev_jiffy; -+ -+ unsigned long preempt_disable_ip; -+ -+ /* WARN_ON_ONCE() by default, no rate limit required: */ -+ rcu_sleep_check(); -+ -+ if 
((preempt_count_equals(preempt_offset) && !irqs_disabled() && -+ !is_idle_task(current) && !current->non_block_count) || -+ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || -+ oops_in_progress) -+ return; -+ -+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) -+ return; -+ prev_jiffy = jiffies; -+ -+ /* Save this before calling printk(), since that will clobber it: */ -+ preempt_disable_ip = get_preempt_disable_ip(current); -+ -+ printk(KERN_ERR -+ "BUG: sleeping function called from invalid context at %s:%d\n", -+ file, line); -+ printk(KERN_ERR -+ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", -+ in_atomic(), irqs_disabled(), current->non_block_count, -+ current->pid, current->comm); -+ -+ if (task_stack_end_corrupted(current)) -+ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); -+ -+ debug_show_held_locks(current); -+ if (irqs_disabled()) -+ print_irqtrace_events(current); -+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) -+ && !preempt_count_equals(preempt_offset)) { -+ pr_err("Preemption disabled at:"); -+ print_ip_sym(preempt_disable_ip); -+ pr_cont("\n"); -+ } -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+EXPORT_SYMBOL(___might_sleep); -+#endif -+ -+#ifdef CONFIG_MAGIC_SYSRQ -+static inline void normalise_rt_tasks(void) -+{ -+ struct sched_attr attr = {}; -+ struct task_struct *g, *p; -+ struct rq_flags rf; -+ struct rq *rq; -+ -+ read_lock(&tasklist_lock); -+ for_each_process_thread(g, p) { -+ /* -+ * Only normalize user tasks: -+ */ -+ if (p->flags & PF_KTHREAD) -+ continue; -+ -+ if (!rt_task(p) && !iso_task(p)) -+ continue; -+ -+ rq = task_rq_lock(p, &rf); -+ __setscheduler(p, rq, SCHED_NORMAL, 0, &attr, false); -+ task_rq_unlock(rq, p, &rf); -+ } -+ read_unlock(&tasklist_lock); -+} -+ -+void normalize_rt_tasks(void) -+{ -+ normalise_rt_tasks(); -+} -+#endif /* CONFIG_MAGIC_SYSRQ */ -+ -+#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) -+/* -+ * These functions are only 
useful for the IA64 MCA handling, or kdb. -+ * -+ * They can only be called when the whole system has been -+ * stopped - every CPU needs to be quiescent, and no scheduling -+ * activity can take place. Using them for anything else would -+ * be a serious bug, and as a result, they aren't even visible -+ * under any other configuration. -+ */ -+ -+/** -+ * curr_task - return the current task for a given CPU. -+ * @cpu: the processor in question. -+ * -+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! -+ * -+ * Return: The current task for @cpu. -+ */ -+struct task_struct *curr_task(int cpu) -+{ -+ return cpu_curr(cpu); -+} -+ -+#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ -+ -+#ifdef CONFIG_IA64 -+/** -+ * ia64_set_curr_task - set the current task for a given CPU. -+ * @cpu: the processor in question. -+ * @p: the task pointer to set. -+ * -+ * Description: This function must only be used when non-maskable interrupts -+ * are serviced on a separate stack. It allows the architecture to switch the -+ * notion of the current task on a CPU in a non-blocking manner. This function -+ * must be called with all CPU's synchronised, and interrupts disabled, the -+ * and caller must save the original value of the current task (see -+ * curr_task() above) and restore that value before reenabling interrupts and -+ * re-starting the system. -+ * -+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 
-+ */ -+void ia64_set_curr_task(int cpu, struct task_struct *p) -+{ -+ cpu_curr(cpu) = p; -+} -+ -+#endif -+ -+void init_idle_bootup_task(struct task_struct *idle) -+{} -+ -+#ifdef CONFIG_SCHED_DEBUG -+__read_mostly bool sched_debug_enabled; -+ -+void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, -+ struct seq_file *m) -+{ -+ seq_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), -+ get_nr_threads(p)); -+} -+ -+void proc_sched_set_task(struct task_struct *p) -+{} -+#endif -+ -+#ifdef CONFIG_CGROUP_SCHED -+static void sched_free_group(struct task_group *tg) -+{ -+ kmem_cache_free(task_group_cache, tg); -+} -+ -+/* allocate runqueue etc for a new task group */ -+struct task_group *sched_create_group(struct task_group *parent) -+{ -+ struct task_group *tg; -+ -+ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); -+ if (!tg) -+ return ERR_PTR(-ENOMEM); -+ -+ return tg; -+} -+ -+void sched_online_group(struct task_group *tg, struct task_group *parent) -+{ -+} -+ -+/* rcu callback to free various structures associated with a task group */ -+static void sched_free_group_rcu(struct rcu_head *rhp) -+{ -+ /* Now it should be safe to free those cfs_rqs */ -+ sched_free_group(container_of(rhp, struct task_group, rcu)); -+} -+ -+void sched_destroy_group(struct task_group *tg) -+{ -+ /* Wait for possible concurrent references to cfs_rqs complete */ -+ call_rcu(&tg->rcu, sched_free_group_rcu); -+} -+ -+void sched_offline_group(struct task_group *tg) -+{ -+} -+ -+static inline struct task_group *css_tg(struct cgroup_subsys_state *css) -+{ -+ return css ? 
container_of(css, struct task_group, css) : NULL; -+} -+ -+static struct cgroup_subsys_state * -+cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) -+{ -+ struct task_group *parent = css_tg(parent_css); -+ struct task_group *tg; -+ -+ if (!parent) { -+ /* This is early initialization for the top cgroup */ -+ return &root_task_group.css; -+ } -+ -+ tg = sched_create_group(parent); -+ if (IS_ERR(tg)) -+ return ERR_PTR(-ENOMEM); -+ return &tg->css; -+} -+ -+/* Expose task group only after completing cgroup initialization */ -+static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ struct task_group *parent = css_tg(css->parent); -+ -+ if (parent) -+ sched_online_group(tg, parent); -+ return 0; -+} -+ -+static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ -+ sched_offline_group(tg); -+} -+ -+static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ -+ /* -+ * Relies on the RCU grace period between css_released() and this. 
-+ */ -+ sched_free_group(tg); -+} -+ -+static void cpu_cgroup_fork(struct task_struct *task) -+{ -+} -+ -+static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) -+{ -+ return 0; -+} -+ -+static void cpu_cgroup_attach(struct cgroup_taskset *tset) -+{ -+} -+ -+static struct cftype cpu_legacy_files[] = { -+ { } /* Terminate */ -+}; -+ -+static struct cftype cpu_files[] = { -+ { } /* terminate */ -+}; -+ -+static int cpu_extra_stat_show(struct seq_file *sf, -+ struct cgroup_subsys_state *css) -+{ -+ return 0; -+} -+ -+struct cgroup_subsys cpu_cgrp_subsys = { -+ .css_alloc = cpu_cgroup_css_alloc, -+ .css_online = cpu_cgroup_css_online, -+ .css_released = cpu_cgroup_css_released, -+ .css_free = cpu_cgroup_css_free, -+ .css_extra_stat_show = cpu_extra_stat_show, -+ .fork = cpu_cgroup_fork, -+ .can_attach = cpu_cgroup_can_attach, -+ .attach = cpu_cgroup_attach, -+ .legacy_cftypes = cpu_files, -+ .legacy_cftypes = cpu_legacy_files, -+ .dfl_cftypes = cpu_files, -+ .early_init = true, -+ .threaded = true, -+}; -+#endif /* CONFIG_CGROUP_SCHED */ -+ -+#undef CREATE_TRACE_POINTS -diff --git a/kernel/sched/MuQSS.h b/kernel/sched/MuQSS.h -new file mode 100644 -index 000000000000..5214b158d82f ---- /dev/null -+++ b/kernel/sched/MuQSS.h -@@ -0,0 +1,1010 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef MUQSS_SCHED_H -+#define MUQSS_SCHED_H -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#ifdef CONFIG_PARAVIRT -+#include -+#endif -+ -+#include "cpupri.h" -+ -+#ifdef CONFIG_SCHED_DEBUG -+# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) -+#else -+# define 
SCHED_WARN_ON(x) ((void)(x)) -+#endif -+ -+/* task_struct::on_rq states: */ -+#define TASK_ON_RQ_QUEUED 1 -+#define TASK_ON_RQ_MIGRATING 2 -+ -+struct rq; -+ -+#ifdef CONFIG_SMP -+ -+static inline bool sched_asym_prefer(int a, int b) -+{ -+ return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b); -+} -+ -+struct perf_domain { -+ struct em_perf_domain *em_pd; -+ struct perf_domain *next; -+ struct rcu_head rcu; -+}; -+ -+/* Scheduling group status flags */ -+#define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */ -+#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */ -+ -+/* -+ * We add the notion of a root-domain which will be used to define per-domain -+ * variables. Each exclusive cpuset essentially defines an island domain by -+ * fully partitioning the member cpus from any other cpuset. Whenever a new -+ * exclusive cpuset is created, we also create and attach a new root-domain -+ * object. -+ * -+ */ -+struct root_domain { -+ atomic_t refcount; -+ atomic_t rto_count; -+ struct rcu_head rcu; -+ cpumask_var_t span; -+ cpumask_var_t online; -+ -+ /* -+ * Indicate pullable load on at least one CPU, e.g: -+ * - More than one runnable task -+ * - Running task is misfit -+ */ -+ int overload; -+ -+ /* Indicate one or more cpus over-utilized (tipping point) */ -+ int overutilized; -+ -+ /* -+ * The bit corresponding to a CPU gets set here if such CPU has more -+ * than one runnable -deadline task (as it is below for RT tasks). -+ */ -+ cpumask_var_t dlo_mask; -+ atomic_t dlo_count; -+ /* Replace unused CFS structures with void */ -+ //struct dl_bw dl_bw; -+ //struct cpudl cpudl; -+ void *dl_bw; -+ void *cpudl; -+ -+ /* -+ * The "RT overload" flag: it gets set if a CPU has more than -+ * one runnable RT task. -+ */ -+ cpumask_var_t rto_mask; -+ //struct cpupri cpupri; -+ void *cpupri; -+ -+ unsigned long max_cpu_capacity; -+ -+ /* -+ * NULL-terminated list of performance domains intersecting with the -+ * CPUs of the rd. 
Protected by RCU. -+ */ -+ struct perf_domain *pd; -+}; -+ -+extern void init_defrootdomain(void); -+extern int sched_init_domains(const struct cpumask *cpu_map); -+extern void rq_attach_root(struct rq *rq, struct root_domain *rd); -+ -+static inline void cpupri_cleanup(void __maybe_unused *cpupri) -+{ -+} -+ -+static inline void cpudl_cleanup(void __maybe_unused *cpudl) -+{ -+} -+ -+static inline void init_dl_bw(void __maybe_unused *dl_bw) -+{ -+} -+ -+static inline int cpudl_init(void __maybe_unused *dl_bw) -+{ -+ return 0; -+} -+ -+static inline int cpupri_init(void __maybe_unused *cpupri) -+{ -+ return 0; -+} -+#endif /* CONFIG_SMP */ -+ -+/* -+ * This is the main, per-CPU runqueue data structure. -+ * This data should only be modified by the local cpu. -+ */ -+struct rq { -+ raw_spinlock_t *lock; -+ raw_spinlock_t *orig_lock; -+ -+ struct task_struct *curr, *idle, *stop; -+ struct mm_struct *prev_mm; -+ -+ unsigned int nr_running; -+ /* -+ * This is part of a global counter where only the total sum -+ * over all CPUs matters. A task can increase this counter on -+ * one CPU and if it got migrated afterwards it may decrease -+ * it on another CPU. 
Always updated under the runqueue lock: -+ */ -+ unsigned long nr_uninterruptible; -+ u64 nr_switches; -+ -+ /* Stored data about rq->curr to work outside rq lock */ -+ u64 rq_deadline; -+ int rq_prio; -+ -+ /* Best queued id for use outside lock */ -+ u64 best_key; -+ -+ unsigned long last_scheduler_tick; /* Last jiffy this RQ ticked */ -+ unsigned long last_jiffy; /* Last jiffy this RQ updated rq clock */ -+ u64 niffies; /* Last time this RQ updated rq clock */ -+ u64 last_niffy; /* Last niffies as updated by local clock */ -+ u64 last_jiffy_niffies; /* Niffies @ last_jiffy */ -+ -+ u64 load_update; /* When we last updated load */ -+ unsigned long load_avg; /* Rolling load average */ -+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ -+ u64 irq_load_update; /* When we last updated IRQ load */ -+ unsigned long irq_load_avg; /* Rolling IRQ load average */ -+#endif -+#ifdef CONFIG_SMT_NICE -+ struct mm_struct *rq_mm; -+ int rq_smt_bias; /* Policy/nice level bias across smt siblings */ -+#endif -+ /* Accurate timekeeping data */ -+ unsigned long user_ns, nice_ns, irq_ns, softirq_ns, system_ns, -+ iowait_ns, idle_ns; -+ atomic_t nr_iowait; -+ -+#ifdef CONFIG_MEMBARRIER -+ int membarrier_state; -+#endif -+ -+ skiplist_node *node; -+ skiplist *sl; -+#ifdef CONFIG_SMP -+ struct task_struct *preempt; /* Preempt triggered on this task */ -+ struct task_struct *preempting; /* Hint only, what task is preempting */ -+ -+ int cpu; /* cpu of this runqueue */ -+ bool online; -+ -+ struct root_domain *rd; -+ struct sched_domain *sd; -+ -+ unsigned long cpu_capacity_orig; -+ -+ int *cpu_locality; /* CPU relative cache distance */ -+ struct rq **rq_order; /* Shared RQs ordered by relative cache distance */ -+ struct rq **cpu_order; /* RQs of discrete CPUs ordered by distance */ -+ -+ struct rq *smp_leader; /* First physical CPU per node */ -+#ifdef CONFIG_SCHED_SMT -+ struct rq *smt_leader; /* First logical CPU in SMT siblings */ -+ cpumask_t thread_mask; -+ bool (*siblings_idle)(struct rq *rq); 
-+ /* See if all smt siblings are idle */ -+#endif /* CONFIG_SCHED_SMT */ -+#ifdef CONFIG_SCHED_MC -+ struct rq *mc_leader; /* First logical CPU in MC siblings */ -+ cpumask_t core_mask; -+ bool (*cache_idle)(struct rq *rq); -+ /* See if all cache siblings are idle */ -+#endif /* CONFIG_SCHED_MC */ -+#endif /* CONFIG_SMP */ -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+ u64 prev_irq_time; -+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ -+#ifdef CONFIG_PARAVIRT -+ u64 prev_steal_time; -+#endif /* CONFIG_PARAVIRT */ -+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING -+ u64 prev_steal_time_rq; -+#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ -+ -+ u64 clock, old_clock, last_tick; -+ /* Ensure that all clocks are in the same cache line */ -+ u64 clock_task ____cacheline_aligned; -+ int dither; -+ -+ int iso_ticks; -+ bool iso_refractory; -+ -+#ifdef CONFIG_HIGH_RES_TIMERS -+ struct hrtimer hrexpiry_timer; -+#endif -+ -+ int rt_nr_running; /* Number real time tasks running */ -+#ifdef CONFIG_SCHEDSTATS -+ -+ /* latency stats */ -+ struct sched_info rq_sched_info; -+ unsigned long long rq_cpu_time; -+ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? 
*/ -+ -+ /* sys_sched_yield() stats */ -+ unsigned int yld_count; -+ -+ /* schedule() stats */ -+ unsigned int sched_switch; -+ unsigned int sched_count; -+ unsigned int sched_goidle; -+ -+ /* try_to_wake_up() stats */ -+ unsigned int ttwu_count; -+ unsigned int ttwu_local; -+#endif /* CONFIG_SCHEDSTATS */ -+ -+#ifdef CONFIG_SMP -+ struct llist_head wake_list; -+#endif -+ -+#ifdef CONFIG_CPU_IDLE -+ /* Must be inspected within a rcu lock section */ -+ struct cpuidle_state *idle_state; -+#endif -+}; -+ -+struct rq_flags { -+ unsigned long flags; -+}; -+ -+#ifdef CONFIG_SMP -+struct rq *cpu_rq(int cpu); -+#endif -+ -+#ifndef CONFIG_SMP -+extern struct rq *uprq; -+#define cpu_rq(cpu) (uprq) -+#define this_rq() (uprq) -+#define raw_rq() (uprq) -+#define task_rq(p) (uprq) -+#define cpu_curr(cpu) ((uprq)->curr) -+#else /* CONFIG_SMP */ -+DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -+#define this_rq() this_cpu_ptr(&runqueues) -+#define raw_rq() raw_cpu_ptr(&runqueues) -+#define task_rq(p) cpu_rq(task_cpu(p)) -+#endif /* CONFIG_SMP */ -+ -+static inline int task_current(struct rq *rq, struct task_struct *p) -+{ -+ return rq->curr == p; -+} -+ -+static inline int task_running(struct rq *rq, struct task_struct *p) -+{ -+#ifdef CONFIG_SMP -+ return p->on_cpu; -+#else -+ return task_current(rq, p); -+#endif -+} -+ -+static inline int task_on_rq_queued(struct task_struct *p) -+{ -+ return p->on_rq == TASK_ON_RQ_QUEUED; -+} -+ -+static inline int task_on_rq_migrating(struct task_struct *p) -+{ -+ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; -+} -+ -+static inline void rq_lock(struct rq *rq) -+ __acquires(rq->lock) -+{ -+ raw_spin_lock(rq->lock); -+} -+ -+static inline void rq_unlock(struct rq *rq) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock(rq->lock); -+} -+ -+static inline void rq_lock_irq(struct rq *rq) -+ __acquires(rq->lock) -+{ -+ raw_spin_lock_irq(rq->lock); -+} -+ -+static inline void rq_unlock_irq(struct rq *rq, struct rq_flags __always_unused *rf) -+ 
__releases(rq->lock) -+{ -+ raw_spin_unlock_irq(rq->lock); -+} -+ -+static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) -+ __acquires(rq->lock) -+{ -+ raw_spin_lock_irqsave(rq->lock, rf->flags); -+} -+ -+static inline void rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock_irqrestore(rq->lock, rf->flags); -+} -+ -+static inline struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(p->pi_lock) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ while (42) { -+ raw_spin_lock_irqsave(&p->pi_lock, rf->flags); -+ rq = task_rq(p); -+ raw_spin_lock(rq->lock); -+ if (likely(rq == task_rq(p))) -+ break; -+ raw_spin_unlock(rq->lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); -+ } -+ return rq; -+} -+ -+static inline void task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) -+ __releases(rq->lock) -+ __releases(p->pi_lock) -+{ -+ rq_unlock(rq); -+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); -+} -+ -+static inline struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags __always_unused *rf) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ lockdep_assert_held(&p->pi_lock); -+ -+ while (42) { -+ rq = task_rq(p); -+ raw_spin_lock(rq->lock); -+ if (likely(rq == task_rq(p))) -+ break; -+ raw_spin_unlock(rq->lock); -+ } -+ return rq; -+} -+ -+static inline void __task_rq_unlock(struct rq *rq, struct rq_flags __always_unused *rf) -+{ -+ rq_unlock(rq); -+} -+ -+static inline struct rq * -+this_rq_lock_irq(struct rq_flags *rf) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ local_irq_disable(); -+ rq = this_rq(); -+ rq_lock(rq); -+ return rq; -+} -+ -+/* -+ * {de,en}queue flags: Most not used on MuQSS. 
-+ * -+ * DEQUEUE_SLEEP - task is no longer runnable -+ * ENQUEUE_WAKEUP - task just became runnable -+ * -+ * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks -+ * are in a known state which allows modification. Such pairs -+ * should preserve as much state as possible. -+ * -+ * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location -+ * in the runqueue. -+ * -+ * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) -+ * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) -+ * ENQUEUE_MIGRATED - the task was migrated during wakeup -+ * -+ */ -+ -+#define DEQUEUE_SLEEP 0x01 -+#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ -+ -+#define ENQUEUE_WAKEUP 0x01 -+#define ENQUEUE_RESTORE 0x02 -+ -+#ifdef CONFIG_SMP -+#define ENQUEUE_MIGRATED 0x40 -+#else -+#define ENQUEUE_MIGRATED 0x00 -+#endif -+ -+static inline u64 __rq_clock_broken(struct rq *rq) -+{ -+ return READ_ONCE(rq->clock); -+} -+ -+static inline u64 rq_clock(struct rq *rq) -+{ -+ lockdep_assert_held(rq->lock); -+ -+ return rq->clock; -+} -+ -+static inline u64 rq_clock_task(struct rq *rq) -+{ -+ lockdep_assert_held(rq->lock); -+ -+ return rq->clock_task; -+} -+ -+#ifdef CONFIG_NUMA -+enum numa_topology_type { -+ NUMA_DIRECT, -+ NUMA_GLUELESS_MESH, -+ NUMA_BACKPLANE, -+}; -+extern enum numa_topology_type sched_numa_topology_type; -+extern int sched_max_numa_distance; -+extern bool find_numa_distance(int distance); -+extern void sched_init_numa(void); -+extern void sched_domains_numa_masks_set(unsigned int cpu); -+extern void sched_domains_numa_masks_clear(unsigned int cpu); -+extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); -+#else -+static inline void sched_init_numa(void) { } -+static inline void sched_domains_numa_masks_set(unsigned int cpu) { } -+static inline void sched_domains_numa_masks_clear(unsigned int cpu) { } -+static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) -+{ -+ 
return nr_cpu_ids; -+} -+#endif -+ -+extern struct mutex sched_domains_mutex; -+extern struct static_key_false sched_schedstats; -+ -+#define rcu_dereference_check_sched_domain(p) \ -+ rcu_dereference_check((p), \ -+ lockdep_is_held(&sched_domains_mutex)) -+ -+#ifdef CONFIG_SMP -+ -+/* -+ * The domain tree (rq->sd) is protected by RCU's quiescent state transition. -+ * See destroy_sched_domains: call_rcu for details. -+ * -+ * The domain tree of any CPU may only be accessed from within -+ * preempt-disabled sections. -+ */ -+#define for_each_domain(cpu, __sd) \ -+ for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ -+ __sd; __sd = __sd->parent) -+ -+#define for_each_lower_domain(sd) for (; sd; sd = sd->child) -+ -+/** -+ * highest_flag_domain - Return highest sched_domain containing flag. -+ * @cpu: The cpu whose highest level of sched domain is to -+ * be returned. -+ * @flag: The flag to check for the highest sched_domain -+ * for the given cpu. -+ * -+ * Returns the highest sched_domain of a cpu which contains the given flag. 
-+ */ -+static inline struct sched_domain *highest_flag_domain(int cpu, int flag) -+{ -+ struct sched_domain *sd, *hsd = NULL; -+ -+ for_each_domain(cpu, sd) { -+ if (!(sd->flags & flag)) -+ break; -+ hsd = sd; -+ } -+ -+ return hsd; -+} -+ -+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) -+{ -+ struct sched_domain *sd; -+ -+ for_each_domain(cpu, sd) { -+ if (sd->flags & flag) -+ break; -+ } -+ -+ return sd; -+} -+ -+DECLARE_PER_CPU(struct sched_domain *, sd_llc); -+DECLARE_PER_CPU(int, sd_llc_size); -+DECLARE_PER_CPU(int, sd_llc_id); -+DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); -+DECLARE_PER_CPU(struct sched_domain *, sd_numa); -+DECLARE_PER_CPU(struct sched_domain *, sd_asym_packing); -+DECLARE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity); -+ -+struct sched_group_capacity { -+ atomic_t ref; -+ /* -+ * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity -+ * for a single CPU. -+ */ -+ unsigned long capacity; -+ unsigned long min_capacity; /* Min per-CPU capacity in group */ -+ unsigned long max_capacity; /* Max per-CPU capacity in group */ -+ unsigned long next_update; -+ int imbalance; /* XXX unrelated to capacity but shared group state */ -+ -+#ifdef CONFIG_SCHED_DEBUG -+ int id; -+#endif -+ -+ unsigned long cpumask[0]; /* balance mask */ -+}; -+ -+struct sched_group { -+ struct sched_group *next; /* Must be a circular list */ -+ atomic_t ref; -+ -+ unsigned int group_weight; -+ struct sched_group_capacity *sgc; -+ int asym_prefer_cpu; /* cpu of highest priority in group */ -+ -+ /* -+ * The CPUs this group covers. -+ * -+ * NOTE: this field is variable length. 
(Allocated dynamically -+ * by attaching extra space to the end of the structure, -+ * depending on how many CPUs the kernel has booted up with) -+ */ -+ unsigned long cpumask[0]; -+}; -+ -+static inline struct cpumask *sched_group_span(struct sched_group *sg) -+{ -+ return to_cpumask(sg->cpumask); -+} -+ -+/* -+ * See build_balance_mask(). -+ */ -+static inline struct cpumask *group_balance_mask(struct sched_group *sg) -+{ -+ return to_cpumask(sg->sgc->cpumask); -+} -+ -+/** -+ * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. -+ * @group: The group whose first cpu is to be returned. -+ */ -+static inline unsigned int group_first_cpu(struct sched_group *group) -+{ -+ return cpumask_first(sched_group_span(group)); -+} -+ -+ -+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) -+void register_sched_domain_sysctl(void); -+void dirty_sched_domain_sysctl(int cpu); -+void unregister_sched_domain_sysctl(void); -+#else -+static inline void register_sched_domain_sysctl(void) -+{ -+} -+static inline void dirty_sched_domain_sysctl(int cpu) -+{ -+} -+static inline void unregister_sched_domain_sysctl(void) -+{ -+} -+#endif -+ -+extern void sched_ttwu_pending(void); -+extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); -+extern void set_rq_online (struct rq *rq); -+extern void set_rq_offline(struct rq *rq); -+extern bool sched_smp_initialized; -+ -+static inline void update_group_capacity(struct sched_domain *sd, int cpu) -+{ -+} -+ -+static inline void trigger_load_balance(struct rq *rq) -+{ -+} -+ -+#define sched_feat(x) 0 -+ -+#else /* CONFIG_SMP */ -+ -+static inline void sched_ttwu_pending(void) { } -+ -+#endif /* CONFIG_SMP */ -+ -+#ifdef CONFIG_CPU_IDLE -+static inline void idle_set_state(struct rq *rq, -+ struct cpuidle_state *idle_state) -+{ -+ rq->idle_state = idle_state; -+} -+ -+static inline struct cpuidle_state *idle_get_state(struct rq *rq) -+{ -+ SCHED_WARN_ON(!rcu_read_lock_held()); -+ 
return rq->idle_state; -+} -+#else -+static inline void idle_set_state(struct rq *rq, -+ struct cpuidle_state *idle_state) -+{ -+} -+ -+static inline struct cpuidle_state *idle_get_state(struct rq *rq) -+{ -+ return NULL; -+} -+#endif -+ -+#ifdef CONFIG_SCHED_DEBUG -+extern bool sched_debug_enabled; -+#endif -+ -+extern void schedule_idle(void); -+ -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+struct irqtime { -+ u64 total; -+ u64 tick_delta; -+ u64 irq_start_time; -+ struct u64_stats_sync sync; -+}; -+ -+DECLARE_PER_CPU(struct irqtime, cpu_irqtime); -+ -+/* -+ * Returns the irqtime minus the softirq time computed by ksoftirqd. -+ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime -+ * and never move forward. -+ */ -+static inline u64 irq_time_read(int cpu) -+{ -+ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); -+ unsigned int seq; -+ u64 total; -+ -+ do { -+ seq = __u64_stats_fetch_begin(&irqtime->sync); -+ total = irqtime->total; -+ } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); -+ -+ return total; -+} -+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ -+ -+static inline bool sched_stop_runnable(struct rq *rq) -+{ -+ return rq->stop && task_on_rq_queued(rq->stop); -+} -+ -+#ifdef CONFIG_SMP -+static inline int cpu_of(struct rq *rq) -+{ -+ return rq->cpu; -+} -+#else /* CONFIG_SMP */ -+static inline int cpu_of(struct rq *rq) -+{ -+ return 0; -+} -+#endif -+ -+#ifdef CONFIG_CPU_FREQ -+DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); -+ -+static inline void cpufreq_trigger(struct rq *rq, unsigned int flags) -+{ -+ struct update_util_data *data; -+ -+ data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data, -+ cpu_of(rq))); -+ -+ if (data) -+ data->func(data, rq->niffies, flags); -+} -+#else -+static inline void cpufreq_trigger(struct rq *rq, unsigned int flag) -+{ -+} -+#endif /* CONFIG_CPU_FREQ */ -+ -+static inline bool uclamp_is_used(void) -+{ -+ return false; -+} -+ -+static __always_inline -+unsigned int 
uclamp_util_with(struct rq __maybe_unused *rq, unsigned int util, -+ struct task_struct __maybe_unused *p) -+{ -+ return util; -+} -+ -+static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) -+{ -+ return util; -+} -+ -+#ifdef arch_scale_freq_capacity -+#ifndef arch_scale_freq_invariant -+#define arch_scale_freq_invariant() (true) -+#endif -+#else /* arch_scale_freq_capacity */ -+#define arch_scale_freq_invariant() (false) -+#endif -+ -+/* -+ * This should only be called when current == rq->idle. Dodgy workaround for -+ * when softirqs are pending and we are in the idle loop. Setting current to -+ * resched will kick us out of the idle loop and the softirqs will be serviced -+ * on our next pass through schedule(). -+ */ -+static inline bool softirq_pending(int cpu) -+{ -+ if (likely(!local_softirq_pending())) -+ return false; -+ set_tsk_need_resched(current); -+ return true; -+} -+ -+#ifdef CONFIG_64BIT -+static inline u64 read_sum_exec_runtime(struct task_struct *t) -+{ -+ return tsk_seruntime(t); -+} -+#else -+static inline u64 read_sum_exec_runtime(struct task_struct *t) -+{ -+ struct rq_flags rf; -+ u64 ns; -+ struct rq *rq; -+ -+ rq = task_rq_lock(t, &rf); -+ ns = tsk_seruntime(t); -+ task_rq_unlock(rq, t, &rf); -+ -+ return ns; -+} -+#endif -+ -+#ifndef arch_scale_freq_capacity -+static __always_inline -+unsigned long arch_scale_freq_capacity(int cpu) -+{ -+ return SCHED_CAPACITY_SCALE; -+} -+#endif -+ -+#ifdef CONFIG_NO_HZ_FULL -+extern bool sched_can_stop_tick(struct rq *rq); -+extern int __init sched_tick_offload_init(void); -+ -+/* -+ * Tick may be needed by tasks in the runqueue depending on their policy and -+ * requirements. If tick is needed, lets send the target an IPI to kick it out of -+ * nohz mode if necessary. 
-+ */ -+static inline void sched_update_tick_dependency(struct rq *rq) -+{ -+ int cpu; -+ -+ if (!tick_nohz_full_enabled()) -+ return; -+ -+ cpu = cpu_of(rq); -+ -+ if (!tick_nohz_full_cpu(cpu)) -+ return; -+ -+ if (sched_can_stop_tick(rq)) -+ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); -+ else -+ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); -+} -+#else -+static inline int sched_tick_offload_init(void) { return 0; } -+static inline void sched_update_tick_dependency(struct rq *rq) { } -+#endif -+ -+#define SCHED_FLAG_SUGOV 0x10000000 -+ -+static inline bool rt_rq_is_runnable(struct rq *rt_rq) -+{ -+ return rt_rq->rt_nr_running; -+} -+ -+/** -+ * enum schedutil_type - CPU utilization type -+ * @FREQUENCY_UTIL: Utilization used to select frequency -+ * @ENERGY_UTIL: Utilization used during energy calculation -+ * -+ * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time -+ * need to be aggregated differently depending on the usage made of them. This -+ * enum is used within schedutil_freq_util() to differentiate the types of -+ * utilization expected by the callers, and adjust the aggregation accordingly. 
-+ */ -+enum schedutil_type { -+ FREQUENCY_UTIL, -+ ENERGY_UTIL, -+}; -+ -+#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL -+ -+unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, -+ unsigned long max, enum schedutil_type type, -+ struct task_struct *p); -+ -+static inline unsigned long cpu_bw_dl(struct rq *rq) -+{ -+ return 0; -+} -+ -+static inline unsigned long cpu_util_dl(struct rq *rq) -+{ -+ return 0; -+} -+ -+static inline unsigned long cpu_util_cfs(struct rq *rq) -+{ -+ unsigned long ret = READ_ONCE(rq->load_avg); -+ -+ if (ret > SCHED_CAPACITY_SCALE) -+ ret = SCHED_CAPACITY_SCALE; -+ return ret; -+} -+ -+static inline unsigned long cpu_util_rt(struct rq *rq) -+{ -+ unsigned long ret = READ_ONCE(rq->rt_nr_running); -+ -+ if (ret > SCHED_CAPACITY_SCALE) -+ ret = SCHED_CAPACITY_SCALE; -+ return ret; -+} -+ -+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ -+static inline unsigned long cpu_util_irq(struct rq *rq) -+{ -+ unsigned long ret = READ_ONCE(rq->irq_load_avg); -+ -+ if (ret > SCHED_CAPACITY_SCALE) -+ ret = SCHED_CAPACITY_SCALE; -+ return ret; -+} -+ -+static inline -+unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) -+{ -+ util *= (max - irq); -+ util /= max; -+ -+ return util; -+ -+} -+#else -+static inline unsigned long cpu_util_irq(struct rq *rq) -+{ -+ return 0; -+} -+ -+static inline -+unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) -+{ -+ return util; -+} -+#endif -+#endif -+ -+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) -+#define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus))) -+ -+DECLARE_STATIC_KEY_FALSE(sched_energy_present); -+ -+static inline bool sched_energy_enabled(void) -+{ -+ return static_branch_unlikely(&sched_energy_present); -+} -+ -+#else /* ! 
(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ -+ -+#define perf_domain_span(pd) NULL -+static inline bool sched_energy_enabled(void) { return false; } -+ -+#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ -+ -+#ifdef CONFIG_MEMBARRIER -+/* -+ * The scheduler provides memory barriers required by membarrier between: -+ * - prior user-space memory accesses and store to rq->membarrier_state, -+ * - store to rq->membarrier_state and following user-space memory accesses. -+ * In the same way it provides those guarantees around store to rq->curr. -+ */ -+static inline void membarrier_switch_mm(struct rq *rq, -+ struct mm_struct *prev_mm, -+ struct mm_struct *next_mm) -+{ -+ int membarrier_state; -+ -+ if (prev_mm == next_mm) -+ return; -+ -+ membarrier_state = atomic_read(&next_mm->membarrier_state); -+ if (READ_ONCE(rq->membarrier_state) == membarrier_state) -+ return; -+ -+ WRITE_ONCE(rq->membarrier_state, membarrier_state); -+} -+#else -+static inline void membarrier_switch_mm(struct rq *rq, -+ struct mm_struct *prev_mm, -+ struct mm_struct *next_mm) -+{ -+} -+#endif -+ -+#endif /* MUQSS_SCHED_H */ -diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c -index 86800b4d5453..f3d8dca0538a 100644 ---- a/kernel/sched/cpufreq_schedutil.c -+++ b/kernel/sched/cpufreq_schedutil.c -@@ -185,6 +185,12 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, - return cpufreq_driver_resolve_freq(policy, freq); - } - -+#ifdef CONFIG_SCHED_MUQSS -+#define rt_rq_runnable(rq_rt) rt_rq_is_runnable(rq) -+#else -+#define rt_rq_runnable(rq_rt) rt_rq_is_runnable(&rq->rt) -+#endif -+ - /* - * This function computes an effective utilization for the given CPU, to be - * used for frequency selection given the linear relation: f = u * f_max. 
-@@ -213,7 +219,7 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, - struct rq *rq = cpu_rq(cpu); - - if (!uclamp_is_used() && -- type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { -+ type == FREQUENCY_UTIL && rt_rq_runnable(rq)) { - return max; - } - -@@ -658,7 +664,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) - struct task_struct *thread; - struct sched_attr attr = { - .size = sizeof(struct sched_attr), -+#ifdef CONFIG_SCHED_MUQSS -+ .sched_policy = SCHED_RR, -+#else - .sched_policy = SCHED_DEADLINE, -+#endif - .sched_flags = SCHED_FLAG_SUGOV, - .sched_nice = 0, - .sched_priority = 0, -diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h -index 7dc20a3232e7..e733a0a53b0a 100644 ---- a/kernel/sched/cpupri.h -+++ b/kernel/sched/cpupri.h -@@ -17,9 +17,11 @@ struct cpupri { - int *cpu_to_pri; - }; - -+#ifndef CONFIG_SCHED_MUQSS - #ifdef CONFIG_SMP - int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask); - void cpupri_set(struct cpupri *cp, int cpu, int pri); - int cpupri_init(struct cpupri *cp); - void cpupri_cleanup(struct cpupri *cp); - #endif -+#endif -diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c -index 46ed4e1383e2..f077fcd22d2b 100644 ---- a/kernel/sched/cputime.c -+++ b/kernel/sched/cputime.c -@@ -266,26 +266,6 @@ static inline u64 account_other_time(u64 max) - return accounted; - } - --#ifdef CONFIG_64BIT --static inline u64 read_sum_exec_runtime(struct task_struct *t) --{ -- return t->se.sum_exec_runtime; --} --#else --static u64 read_sum_exec_runtime(struct task_struct *t) --{ -- u64 ns; -- struct rq_flags rf; -- struct rq *rq; -- -- rq = task_rq_lock(t, &rf); -- ns = t->se.sum_exec_runtime; -- task_rq_unlock(rq, t, &rf); -- -- return ns; --} --#endif -- - /* - * Accumulate raw cputime values of dead tasks (sig->[us]time) and live - * tasks (sum on group iteration) belonging to @tsk's group. 
-@@ -663,7 +643,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, - void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) - { - struct task_cputime cputime = { -- .sum_exec_runtime = p->se.sum_exec_runtime, -+ .sum_exec_runtime = tsk_seruntime(p), - }; - - task_cputime(p, &cputime.utime, &cputime.stime); -diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c -index f65ef1e2f204..e0aa6c73a5fa 100644 ---- a/kernel/sched/idle.c -+++ b/kernel/sched/idle.c -@@ -225,6 +225,8 @@ static void cpuidle_idle_call(void) - static void do_idle(void) - { - int cpu = smp_processor_id(); -+ bool pending = false; -+ - /* - * If the arch has a polling bit, we maintain an invariant: - * -@@ -235,7 +237,10 @@ static void do_idle(void) - */ - - __current_set_polling(); -- tick_nohz_idle_enter(); -+ if (unlikely(softirq_pending(cpu))) -+ pending = true; -+ else -+ tick_nohz_idle_enter(); - - while (!need_resched()) { - rmb(); -@@ -273,7 +278,8 @@ static void do_idle(void) - * an IPI to fold the state for us. - */ - preempt_set_need_resched(); -- tick_nohz_idle_exit(); -+ if (!pending) -+ tick_nohz_idle_exit(); - __current_clr_polling(); - - /* -@@ -355,6 +361,7 @@ void cpu_startup_entry(enum cpuhp_state state) - do_idle(); - } - -+#ifndef CONFIG_SCHED_MUQSS - /* - * idle-task scheduling class. 
- */ -@@ -479,3 +486,4 @@ const struct sched_class idle_sched_class = { - .switched_to = switched_to_idle, - .update_curr = update_curr_idle, - }; -+#endif /* CONFIG_SCHED_MUQSS */ -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index c8870c5bd7df..add1d74c2e91 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -2,6 +2,19 @@ - /* - * Scheduler internal types and methods: - */ -+#ifdef CONFIG_SCHED_MUQSS -+#include "MuQSS.h" -+ -+/* Begin compatibility wrappers for MuQSS/CFS differences */ -+#define rq_rt_nr_running(rq) ((rq)->rt_nr_running) -+#define rq_h_nr_running(rq) ((rq)->nr_running) -+ -+#else /* CONFIG_SCHED_MUQSS */ -+ -+#define rq_rt_nr_running(rq) ((rq)->rt.rt_nr_running) -+#define rq_h_nr_running(rq) ((rq)->cfs.h_nr_running) -+ -+ - #include - - #include -@@ -2496,3 +2509,30 @@ static inline void membarrier_switch_mm(struct rq *rq, - { - } - #endif -+ -+/* MuQSS compatibility functions */ -+static inline bool softirq_pending(int cpu) -+{ -+ return false; -+} -+ -+#ifdef CONFIG_64BIT -+static inline u64 read_sum_exec_runtime(struct task_struct *t) -+{ -+ return t->se.sum_exec_runtime; -+} -+#else -+static inline u64 read_sum_exec_runtime(struct task_struct *t) -+{ -+ u64 ns; -+ struct rq_flags rf; -+ struct rq *rq; -+ -+ rq = task_rq_lock(t, &rf); -+ ns = t->se.sum_exec_runtime; -+ task_rq_unlock(rq, t, &rf); -+ -+ return ns; -+} -+#endif -+#endif /* CONFIG_SCHED_MUQSS */ -diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c -index 49b835f1305f..0253ea846c0d 100644 ---- a/kernel/sched/topology.c -+++ b/kernel/sched/topology.c -@@ -3,6 +3,7 @@ - * Scheduler topology setup/handling methods - */ - #include "sched.h" -+#include "linux/sched/deadline.h" - - DEFINE_MUTEX(sched_domains_mutex); - -@@ -442,7 +443,11 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) - struct root_domain *old_rd = NULL; - unsigned long flags; - -+#ifdef CONFIG_SCHED_MUQSS -+ raw_spin_lock_irqsave(rq->lock, flags); -+#else - 
raw_spin_lock_irqsave(&rq->lock, flags); -+#endif - - if (rq->rd) { - old_rd = rq->rd; -@@ -468,7 +473,11 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) - if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) - set_rq_online(rq); - -+#ifdef CONFIG_SCHED_MUQSS -+ raw_spin_unlock_irqrestore(rq->lock, flags); -+#else - raw_spin_unlock_irqrestore(&rq->lock, flags); -+#endif - - if (old_rd) - call_rcu(&old_rd->rcu, free_rootdomain); -diff --git a/kernel/skip_list.c b/kernel/skip_list.c -new file mode 100644 -index 000000000000..bf5c6e97e139 ---- /dev/null -+++ b/kernel/skip_list.c -@@ -0,0 +1,148 @@ -+/* -+ Copyright (C) 2011,2016 Con Kolivas. -+ -+ Code based on example originally by William Pugh. -+ -+Skip Lists are a probabilistic alternative to balanced trees, as -+described in the June 1990 issue of CACM and were invented by -+William Pugh in 1987. -+ -+A couple of comments about this implementation: -+The routine randomLevel has been hard-coded to generate random -+levels using p=0.25. It can be easily changed. -+ -+The insertion routine has been implemented so as to use the -+dirty hack described in the CACM paper: if a random level is -+generated that is more than the current maximum level, the -+current maximum level plus one is used instead. -+ -+Levels start at zero and go up to MaxLevel (which is equal to -+MaxNumberOfLevels-1). -+ -+The routines defined in this file are: -+ -+init: defines slnode -+ -+new_skiplist: returns a new, empty list -+ -+randomLevel: Returns a random level based on a u64 random seed passed to it. -+In MuQSS, the "niffy" time is used for this purpose. -+ -+insert(l,key, value): inserts the binding (key, value) into l. This operation -+occurs in O(log n) time. -+ -+delnode(slnode, l, node): deletes any binding of key from the l based on the -+actual node value. This operation occurs in O(k) time where k is the -+number of levels of the node in question (max 8). 
The original delete -+function occurred in O(log n) time and involved a search. -+ -+MuQSS Notes: In this implementation of skiplists, there are bidirectional -+next/prev pointers and the insert function returns a pointer to the actual -+node the value is stored. The key here is chosen by the scheduler so as to -+sort tasks according to the priority list requirements and is no longer used -+by the scheduler after insertion. The scheduler lookup, however, occurs in -+O(1) time because it is always the first item in the level 0 linked list. -+Since the task struct stores a copy of the node pointer upon skiplist_insert, -+it can also remove it much faster than the original implementation with the -+aid of prev<->next pointer manipulation and no searching. -+ -+*/ -+ -+#include -+#include -+ -+#define MaxNumberOfLevels 8 -+#define MaxLevel (MaxNumberOfLevels - 1) -+ -+void skiplist_init(skiplist_node *slnode) -+{ -+ int i; -+ -+ slnode->key = 0xFFFFFFFFFFFFFFFF; -+ slnode->level = 0; -+ slnode->value = NULL; -+ for (i = 0; i < MaxNumberOfLevels; i++) -+ slnode->next[i] = slnode->prev[i] = slnode; -+} -+ -+skiplist *new_skiplist(skiplist_node *slnode) -+{ -+ skiplist *l = kzalloc(sizeof(skiplist), GFP_ATOMIC); -+ -+ BUG_ON(!l); -+ l->header = slnode; -+ return l; -+} -+ -+void free_skiplist(skiplist *l) -+{ -+ skiplist_node *p, *q; -+ -+ p = l->header; -+ do { -+ q = p->next[0]; -+ p->next[0]->prev[0] = q->prev[0]; -+ skiplist_node_init(p); -+ p = q; -+ } while (p != l->header); -+ kfree(l); -+} -+ -+void skiplist_node_init(skiplist_node *node) -+{ -+ memset(node, 0, sizeof(skiplist_node)); -+} -+ -+static inline unsigned int randomLevel(const long unsigned int randseed) -+{ -+ return find_first_bit(&randseed, MaxLevel) / 2; -+} -+ -+void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed) -+{ -+ skiplist_node *update[MaxNumberOfLevels]; -+ skiplist_node *p, *q; -+ int k = l->level; -+ -+ p = l->header; -+ do { -+ while 
(q = p->next[k], q->key <= key) -+ p = q; -+ update[k] = p; -+ } while (--k >= 0); -+ -+ ++l->entries; -+ k = randomLevel(randseed); -+ if (k > l->level) { -+ k = ++l->level; -+ update[k] = l->header; -+ } -+ -+ node->level = k; -+ node->key = key; -+ node->value = value; -+ do { -+ p = update[k]; -+ node->next[k] = p->next[k]; -+ p->next[k] = node; -+ node->prev[k] = p; -+ node->next[k]->prev[k] = node; -+ } while (--k >= 0); -+} -+ -+void skiplist_delete(skiplist *l, skiplist_node *node) -+{ -+ int k, m = node->level; -+ -+ for (k = 0; k <= m; k++) { -+ node->prev[k]->next[k] = node->next[k]; -+ node->next[k]->prev[k] = node->prev[k]; -+ } -+ skiplist_node_init(node); -+ if (m == l->level) { -+ while (l->header->next[m] == l->header && l->header->prev[m] == l->header && m > 0) -+ m--; -+ l->level = m; -+ } -+ l->entries--; -+} -diff --git a/kernel/sysctl.c b/kernel/sysctl.c -index b6f2f35d0bcf..349f5a249593 100644 ---- a/kernel/sysctl.c -+++ b/kernel/sysctl.c -@@ -130,9 +130,19 @@ static int __maybe_unused four = 4; - static unsigned long zero_ul; - static unsigned long one_ul = 1; - static unsigned long long_max = LONG_MAX; --static int one_hundred = 100; --static int one_thousand = 1000; --#ifdef CONFIG_PRINTK -+static int __read_mostly one_hundred = 100; -+static int __read_mostly one_thousand = 1000; -+#ifdef CONFIG_SCHED_MUQSS -+static int zero = 0; -+static int one = 1; -+extern int rr_interval; -+extern int sched_interactive; -+extern int sched_iso_cpu; -+extern int sched_yield_type; -+#endif -+extern int hrtimer_granularity_us; -+extern int hrtimeout_min_us; -+#if defined(CONFIG_PRINTK) || defined(CONFIG_SCHED_MUQSS) - static int ten_thousand = 10000; - #endif - #ifdef CONFIG_PERF_EVENTS -@@ -300,7 +310,7 @@ static struct ctl_table sysctl_base_table[] = { - { } - }; - --#ifdef CONFIG_SCHED_DEBUG -+#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_MUQSS) - static int min_sched_granularity_ns = 100000; /* 100 usecs */ - static int 
max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ - static int min_wakeup_granularity_ns; /* 0 usecs */ -@@ -317,6 +327,7 @@ static int max_extfrag_threshold = 1000; - #endif - - static struct ctl_table kern_table[] = { -+#ifndef CONFIG_SCHED_MUQSS - { - .procname = "sched_child_runs_first", - .data = &sysctl_sched_child_runs_first, -@@ -498,6 +509,7 @@ static struct ctl_table kern_table[] = { - .extra2 = SYSCTL_ONE, - }, - #endif -+#endif /* !CONFIG_SCHED_MUQSS */ - #ifdef CONFIG_PROVE_LOCKING - { - .procname = "prove_locking", -@@ -1070,6 +1082,62 @@ static struct ctl_table kern_table[] = { - .proc_handler = proc_dointvec, - }, - #endif -+#ifdef CONFIG_SCHED_MUQSS -+ { -+ .procname = "rr_interval", -+ .data = &rr_interval, -+ .maxlen = sizeof (int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = &one, -+ .extra2 = &one_thousand, -+ }, -+ { -+ .procname = "interactive", -+ .data = &sched_interactive, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = &zero, -+ .extra2 = &one, -+ }, -+ { -+ .procname = "iso_cpu", -+ .data = &sched_iso_cpu, -+ .maxlen = sizeof (int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = &zero, -+ .extra2 = &one_hundred, -+ }, -+ { -+ .procname = "yield_type", -+ .data = &sched_yield_type, -+ .maxlen = sizeof (int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = &zero, -+ .extra2 = &two, -+ }, -+#endif -+ { -+ .procname = "hrtimer_granularity_us", -+ .data = &hrtimer_granularity_us, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = &one, -+ .extra2 = &ten_thousand, -+ }, -+ { -+ .procname = "hrtimeout_min_us", -+ .data = &hrtimeout_min_us, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = &one, -+ .extra2 = &ten_thousand, -+ }, - #if defined(CONFIG_S390) && defined(CONFIG_SMP) - { - .procname = "spin_retry", -diff --git 
a/kernel/time/Kconfig b/kernel/time/Kconfig -index fcc42353f125..46bb16d3c159 100644 ---- a/kernel/time/Kconfig -+++ b/kernel/time/Kconfig -@@ -66,6 +66,9 @@ config NO_HZ_COMMON - depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS - select TICK_ONESHOT - -+config NO_HZ_FULL -+ bool -+ - choice - prompt "Timer tick handling" - default NO_HZ_IDLE if NO_HZ -@@ -87,8 +90,9 @@ config NO_HZ_IDLE - - Most of the time you want to say Y here. - --config NO_HZ_FULL -+config NO_HZ_FULL_NODEF - bool "Full dynticks system (tickless)" -+ select NO_HZ_FULL - # NO_HZ_COMMON dependency - depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS - # We need at least one periodic CPU for timekeeping -@@ -114,6 +118,8 @@ config NO_HZ_FULL - transitions: syscalls, exceptions and interrupts. Even when it's - dynamically off. - -+ Not recommended for desktops,laptops, or mobile devices. -+ - Say N. - - endchoice -diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c -index f5490222e134..544c58c29267 100644 ---- a/kernel/time/clockevents.c -+++ b/kernel/time/clockevents.c -@@ -190,8 +190,9 @@ int clockevents_tick_resume(struct clock_event_device *dev) - - #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST - --/* Limit min_delta to a jiffie */ --#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) -+int __read_mostly hrtimer_granularity_us = 100; -+/* Limit min_delta to 100us */ -+#define MIN_DELTA_LIMIT (hrtimer_granularity_us * NSEC_PER_USEC) - - /** - * clockevents_increase_min_delta - raise minimum delta of a clock event device -diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c -index 65605530ee34..75e67a12a97b 100644 ---- a/kernel/time/hrtimer.c -+++ b/kernel/time/hrtimer.c -@@ -2206,3 +2206,113 @@ int __sched schedule_hrtimeout(ktime_t *expires, - return schedule_hrtimeout_range(expires, 0, mode); - } - EXPORT_SYMBOL_GPL(schedule_hrtimeout); -+ -+/* -+ * As per schedule_hrtimeout but taskes a millisecond value and returns how -+ * many milliseconds are left. 
-+ */ -+long __sched schedule_msec_hrtimeout(long timeout) -+{ -+ struct hrtimer_sleeper t; -+ int delta, jiffs; -+ ktime_t expires; -+ -+ if (!timeout) { -+ __set_current_state(TASK_RUNNING); -+ return 0; -+ } -+ -+ jiffs = msecs_to_jiffies(timeout); -+ /* -+ * If regular timer resolution is adequate or hrtimer resolution is not -+ * (yet) better than Hz, as would occur during startup, use regular -+ * timers. -+ */ -+ if (jiffs > 4 || hrtimer_resolution >= NSEC_PER_SEC / HZ || pm_freezing) -+ return schedule_timeout(jiffs); -+ -+ delta = (timeout % 1000) * NSEC_PER_MSEC; -+ expires = ktime_set(0, delta); -+ -+ hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); -+ hrtimer_set_expires_range_ns(&t.timer, expires, delta); -+ -+ hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_REL); -+ -+ if (likely(t.task)) -+ schedule(); -+ -+ hrtimer_cancel(&t.timer); -+ destroy_hrtimer_on_stack(&t.timer); -+ -+ __set_current_state(TASK_RUNNING); -+ -+ expires = hrtimer_expires_remaining(&t.timer); -+ timeout = ktime_to_ms(expires); -+ return timeout < 0 ? 
0 : timeout; -+} -+ -+EXPORT_SYMBOL(schedule_msec_hrtimeout); -+ -+#define USECS_PER_SEC 1000000 -+extern int hrtimer_granularity_us; -+ -+static inline long schedule_usec_hrtimeout(long timeout) -+{ -+ struct hrtimer_sleeper t; -+ ktime_t expires; -+ int delta; -+ -+ if (!timeout) { -+ __set_current_state(TASK_RUNNING); -+ return 0; -+ } -+ -+ if (hrtimer_resolution >= NSEC_PER_SEC / HZ) -+ return schedule_timeout(usecs_to_jiffies(timeout)); -+ -+ if (timeout < hrtimer_granularity_us) -+ timeout = hrtimer_granularity_us; -+ delta = (timeout % USECS_PER_SEC) * NSEC_PER_USEC; -+ expires = ktime_set(0, delta); -+ -+ hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); -+ hrtimer_set_expires_range_ns(&t.timer, expires, delta); -+ -+ hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_REL); -+ -+ if (likely(t.task)) -+ schedule(); -+ -+ hrtimer_cancel(&t.timer); -+ destroy_hrtimer_on_stack(&t.timer); -+ -+ __set_current_state(TASK_RUNNING); -+ -+ expires = hrtimer_expires_remaining(&t.timer); -+ timeout = ktime_to_us(expires); -+ return timeout < 0 ? 
0 : timeout; -+} -+ -+int __read_mostly hrtimeout_min_us = 500; -+ -+long __sched schedule_min_hrtimeout(void) -+{ -+ return usecs_to_jiffies(schedule_usec_hrtimeout(hrtimeout_min_us)); -+} -+ -+EXPORT_SYMBOL(schedule_min_hrtimeout); -+ -+long __sched schedule_msec_hrtimeout_interruptible(long timeout) -+{ -+ __set_current_state(TASK_INTERRUPTIBLE); -+ return schedule_msec_hrtimeout(timeout); -+} -+EXPORT_SYMBOL(schedule_msec_hrtimeout_interruptible); -+ -+long __sched schedule_msec_hrtimeout_uninterruptible(long timeout) -+{ -+ __set_current_state(TASK_UNINTERRUPTIBLE); -+ return schedule_msec_hrtimeout(timeout); -+} -+EXPORT_SYMBOL(schedule_msec_hrtimeout_uninterruptible); -diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c -index 42d512fcfda2..0db83bdf7f39 100644 ---- a/kernel/time/posix-cpu-timers.c -+++ b/kernel/time/posix-cpu-timers.c -@@ -226,7 +226,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples) - u64 stime, utime; - - task_cputime(p, &utime, &stime); -- store_samples(samples, stime, utime, p->se.sum_exec_runtime); -+ store_samples(samples, stime, utime, tsk_seruntime(p)); - } - - static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, -@@ -845,7 +845,7 @@ static void check_thread_timers(struct task_struct *tsk, - soft = task_rlimit(tsk, RLIMIT_RTTIME); - if (soft != RLIM_INFINITY) { - /* Task RT timeout is accounted in jiffies. RTTIME is usec */ -- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); -+ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ); - unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); - - /* At the hard limit, send SIGKILL. No further action. 
*/ -diff --git a/kernel/time/timer.c b/kernel/time/timer.c -index 4820823515e9..13034cc7c9a4 100644 ---- a/kernel/time/timer.c -+++ b/kernel/time/timer.c -@@ -43,6 +43,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -1567,7 +1568,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) - * Check, if the next hrtimer event is before the next timer wheel - * event: - */ --static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) -+static u64 cmp_next_hrtimer_event(struct timer_base *base, u64 basem, u64 expires) - { - u64 nextevt = hrtimer_get_next_event(); - -@@ -1585,6 +1586,9 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) - if (nextevt <= basem) - return basem; - -+ if (nextevt < expires && nextevt - basem <= TICK_NSEC) -+ base->is_idle = false; -+ - /* - * Round up to the next jiffie. High resolution timers are - * off, so the hrtimers are expired in the tick and we need to -@@ -1654,7 +1658,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) - } - raw_spin_unlock(&base->lock); - -- return cmp_next_hrtimer_event(basem, expires); -+ return cmp_next_hrtimer_event(base, basem, expires); - } - - /** -@@ -1889,6 +1893,18 @@ signed long __sched schedule_timeout(signed long timeout) - - expire = timeout + jiffies; - -+#ifdef CONFIG_HIGH_RES_TIMERS -+ if (timeout == 1 && hrtimer_resolution < NSEC_PER_SEC / HZ) { -+ /* -+ * Special case 1 as being a request for the minimum timeout -+ * and use highres timers to timeout after 1ms to workaround -+ * the granularity of low Hz tick timers. 
-+ */ -+ if (!schedule_min_hrtimeout()) -+ return 0; -+ goto out_timeout; -+ } -+#endif - timer.task = current; - timer_setup_on_stack(&timer.timer, process_timeout, 0); - __mod_timer(&timer.timer, expire, 0); -@@ -1897,10 +1913,10 @@ signed long __sched schedule_timeout(signed long timeout) - - /* Remove the timer from the object tracker */ - destroy_timer_on_stack(&timer.timer); -- -+out_timeout: - timeout = expire - jiffies; - -- out: -+out: - return timeout < 0 ? 0 : timeout; - } - EXPORT_SYMBOL(schedule_timeout); -@@ -2042,7 +2058,19 @@ void __init init_timers(void) - */ - void msleep(unsigned int msecs) - { -- unsigned long timeout = msecs_to_jiffies(msecs) + 1; -+ int jiffs = msecs_to_jiffies(msecs); -+ unsigned long timeout; -+ -+ /* -+ * Use high resolution timers where the resolution of tick based -+ * timers is inadequate. -+ */ -+ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ && !pm_freezing) { -+ while (msecs) -+ msecs = schedule_msec_hrtimeout_uninterruptible(msecs); -+ return; -+ } -+ timeout = jiffs + 1; - - while (timeout) - timeout = schedule_timeout_uninterruptible(timeout); -@@ -2056,7 +2084,15 @@ EXPORT_SYMBOL(msleep); - */ - unsigned long msleep_interruptible(unsigned int msecs) - { -- unsigned long timeout = msecs_to_jiffies(msecs) + 1; -+ int jiffs = msecs_to_jiffies(msecs); -+ unsigned long timeout; -+ -+ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ && !pm_freezing) { -+ while (msecs && !signal_pending(current)) -+ msecs = schedule_msec_hrtimeout_interruptible(msecs); -+ return msecs; -+ } -+ timeout = jiffs + 1; - - while (timeout && !signal_pending(current)) - timeout = schedule_timeout_interruptible(timeout); -diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c -index 69ee8ef12cee..6edb01f2fd81 100644 ---- a/kernel/trace/trace_selftest.c -+++ b/kernel/trace/trace_selftest.c -@@ -1048,10 +1048,15 @@ static int trace_wakeup_test_thread(void *data) - { - /* Make this a -deadline thread */ - static 
const struct sched_attr attr = { -+#ifdef CONFIG_SCHED_MUQSS -+ /* No deadline on MuQSS, use RR */ -+ .sched_policy = SCHED_RR, -+#else - .sched_policy = SCHED_DEADLINE, - .sched_runtime = 100000ULL, - .sched_deadline = 10000000ULL, - .sched_period = 10000000ULL -+#endif - }; - struct wakeup_test_data *x = data; - -diff --git a/mm/vmscan.c b/mm/vmscan.c -index ee4eecc7e1c2..22c1b0469468 100644 ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -164,7 +164,7 @@ struct scan_control { - /* - * From 0 .. 100. Higher means more swappy. - */ --int vm_swappiness = 60; -+int vm_swappiness = 33; - /* - * The total number of pages which are beyond the high watermark within all - * zones. -diff --git a/net/core/pktgen.c b/net/core/pktgen.c -index 48b1e429857c..908c866bc9fc 100644 ---- a/net/core/pktgen.c -+++ b/net/core/pktgen.c -@@ -1894,7 +1894,7 @@ static void pktgen_mark_device(const struct pktgen_net *pn, const char *ifname) - mutex_unlock(&pktgen_thread_lock); - pr_debug("%s: waiting for %s to disappear....\n", - __func__, ifname); -- schedule_timeout_interruptible(msecs_to_jiffies(msec_per_try)); -+ schedule_msec_hrtimeout_interruptible((msec_per_try)); - mutex_lock(&pktgen_thread_lock); - - if (++i >= max_tries) { -diff --git a/sound/pci/maestro3.c b/sound/pci/maestro3.c -index 19fa73df0846..46caed9b924d 100644 ---- a/sound/pci/maestro3.c -+++ b/sound/pci/maestro3.c -@@ -2001,7 +2001,7 @@ static void snd_m3_ac97_reset(struct snd_m3 *chip) - outw(0, io + GPIO_DATA); - outw(dir | GPO_PRIMARY_AC97, io + GPIO_DIRECTION); - -- schedule_timeout_uninterruptible(msecs_to_jiffies(delay1)); -+ schedule_msec_hrtimeout_uninterruptible((delay1)); - - outw(GPO_PRIMARY_AC97, io + GPIO_DATA); - udelay(5); -@@ -2009,7 +2009,7 @@ static void snd_m3_ac97_reset(struct snd_m3 *chip) - outw(IO_SRAM_ENABLE | SERIAL_AC_LINK_ENABLE, io + RING_BUS_CTRL_A); - outw(~0, io + GPIO_MASK); - -- schedule_timeout_uninterruptible(msecs_to_jiffies(delay2)); -+ schedule_msec_hrtimeout_uninterruptible((delay2)); - 
- if (! snd_m3_try_read_vendor(chip)) - break; -diff --git a/sound/soc/codecs/rt5631.c b/sound/soc/codecs/rt5631.c -index f70b9f7e68bb..77b65398ca07 100644 ---- a/sound/soc/codecs/rt5631.c -+++ b/sound/soc/codecs/rt5631.c -@@ -415,7 +415,7 @@ static void onebit_depop_mute_stage(struct snd_soc_component *component, int ena - hp_zc = snd_soc_component_read32(component, RT5631_INT_ST_IRQ_CTRL_2); - snd_soc_component_write(component, RT5631_INT_ST_IRQ_CTRL_2, hp_zc & 0xf7ff); - if (enable) { -- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout_uninterruptible((10)); - /* config one-bit depop parameter */ - rt5631_write_index(component, RT5631_SPK_INTL_CTRL, 0x307f); - snd_soc_component_update_bits(component, RT5631_HP_OUT_VOL, -@@ -525,7 +525,7 @@ static void depop_seq_mute_stage(struct snd_soc_component *component, int enable - hp_zc = snd_soc_component_read32(component, RT5631_INT_ST_IRQ_CTRL_2); - snd_soc_component_write(component, RT5631_INT_ST_IRQ_CTRL_2, hp_zc & 0xf7ff); - if (enable) { -- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout_uninterruptible((10)); - - /* config depop sequence parameter */ - rt5631_write_index(component, RT5631_SPK_INTL_CTRL, 0x302f); -diff --git a/sound/soc/codecs/wm8350.c b/sound/soc/codecs/wm8350.c -index fe99584c917f..f1344d532a13 100644 ---- a/sound/soc/codecs/wm8350.c -+++ b/sound/soc/codecs/wm8350.c -@@ -233,10 +233,10 @@ static void wm8350_pga_work(struct work_struct *work) - out2->ramp == WM8350_RAMP_UP) { - /* delay is longer over 0dB as increases are larger */ - if (i >= WM8350_OUTn_0dB) -- schedule_timeout_interruptible(msecs_to_jiffies -+ schedule_msec_hrtimeout_interruptible( - (2)); - else -- schedule_timeout_interruptible(msecs_to_jiffies -+ schedule_msec_hrtimeout_interruptible( - (1)); - } else - udelay(50); /* doesn't matter if we delay longer */ -@@ -1120,7 +1120,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, - 
(platform->dis_out4 << 6)); - - /* wait for discharge */ -- schedule_timeout_interruptible(msecs_to_jiffies -+ schedule_msec_hrtimeout_interruptible( - (platform-> - cap_discharge_msecs)); - -@@ -1136,7 +1136,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, - WM8350_VBUFEN); - - /* wait for vmid */ -- schedule_timeout_interruptible(msecs_to_jiffies -+ schedule_msec_hrtimeout_interruptible( - (platform-> - vmid_charge_msecs)); - -@@ -1187,7 +1187,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, - wm8350_reg_write(wm8350, WM8350_POWER_MGMT_1, pm1); - - /* wait */ -- schedule_timeout_interruptible(msecs_to_jiffies -+ schedule_msec_hrtimeout_interruptible( - (platform-> - vmid_discharge_msecs)); - -@@ -1205,7 +1205,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, - pm1 | WM8350_OUTPUT_DRAIN_EN); - - /* wait */ -- schedule_timeout_interruptible(msecs_to_jiffies -+ schedule_msec_hrtimeout_interruptible( - (platform->drain_msecs)); - - pm1 &= ~WM8350_BIASEN; -diff --git a/sound/soc/codecs/wm8900.c b/sound/soc/codecs/wm8900.c -index 271235a69c01..3ec90e1b1eb4 100644 ---- a/sound/soc/codecs/wm8900.c -+++ b/sound/soc/codecs/wm8900.c -@@ -1109,7 +1109,7 @@ static int wm8900_set_bias_level(struct snd_soc_component *component, - /* Need to let things settle before stopping the clock - * to ensure that restart works, see "Stopping the - * master clock" in the datasheet. */ -- schedule_timeout_interruptible(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout_interruptible(1); - snd_soc_component_write(component, WM8900_REG_POWER2, - WM8900_REG_POWER2_SYSCLK_ENA); - break; -diff --git a/sound/soc/codecs/wm9713.c b/sound/soc/codecs/wm9713.c -index 6497c1ea6228..08fefeca9d82 100644 ---- a/sound/soc/codecs/wm9713.c -+++ b/sound/soc/codecs/wm9713.c -@@ -199,7 +199,7 @@ static int wm9713_voice_shutdown(struct snd_soc_dapm_widget *w, - - /* Gracefully shut down the voice interface. 
*/ - snd_soc_component_update_bits(component, AC97_HANDSET_RATE, 0x0f00, 0x0200); -- schedule_timeout_interruptible(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout_interruptible(1); - snd_soc_component_update_bits(component, AC97_HANDSET_RATE, 0x0f00, 0x0f00); - snd_soc_component_update_bits(component, AC97_EXTENDED_MID, 0x1000, 0x1000); - -@@ -868,7 +868,7 @@ static int wm9713_set_pll(struct snd_soc_component *component, - wm9713->pll_in = freq_in; - - /* wait 10ms AC97 link frames for the link to stabilise */ -- schedule_timeout_interruptible(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout_interruptible((10)); - return 0; - } - -diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c -index b6378f025836..5f5e58655d32 100644 ---- a/sound/soc/soc-dapm.c -+++ b/sound/soc/soc-dapm.c -@@ -154,7 +154,7 @@ static void dapm_assert_locked(struct snd_soc_dapm_context *dapm) - static void pop_wait(u32 pop_time) - { - if (pop_time) -- schedule_timeout_uninterruptible(msecs_to_jiffies(pop_time)); -+ schedule_msec_hrtimeout_uninterruptible((pop_time)); - } - - __printf(3, 4) -diff --git a/sound/usb/line6/pcm.c b/sound/usb/line6/pcm.c -index f70211e6b174..5ae4421225e6 100644 ---- a/sound/usb/line6/pcm.c -+++ b/sound/usb/line6/pcm.c -@@ -127,7 +127,7 @@ static void line6_wait_clear_audio_urbs(struct snd_line6_pcm *line6pcm, - if (!alive) - break; - set_current_state(TASK_UNINTERRUPTIBLE); -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - } while (--timeout > 0); - if (alive) - dev_err(line6pcm->line6->ifcdev, diff --git a/linux54-tkg/linux54-tkg-patches/0004-glitched-muqss.patch b/linux54-tkg/linux54-tkg-patches/0004-glitched-muqss.patch deleted file mode 100644 index 2c4837e..0000000 --- a/linux54-tkg/linux54-tkg-patches/0004-glitched-muqss.patch +++ /dev/null @@ -1,78 +0,0 @@ -From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 -From: Tk-Glitch -Date: Wed, 4 Jul 2018 04:30:08 +0200 -Subject: glitched - MuQSS - -diff --git a/kernel/sched/MuQSS.c 
b/kernel/sched/MuQSS.c -index 84a1d08d68551..57c3036a68952 100644 ---- a/kernel/sched/MuQSS.c -+++ b/kernel/sched/MuQSS.c -@@ -163,7 +167,11 @@ int sched_interactive __read_mostly = 1; - * are allowed to run five seconds as real time tasks. This is the total over - * all online cpus. - */ -+#ifdef CONFIG_ZENIFY -+int sched_iso_cpu __read_mostly = 25; -+#else - int sched_iso_cpu __read_mostly = 70; -+#endif - - /* - * sched_yield_type - Choose what sort of yield sched_yield will perform. - -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 2a202a846757..1d9c7ed79b11 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -5,7 +5,7 @@ - choice - prompt "Timer frequency" - default HZ_100 if SCHED_MUQSS -- default HZ_250_NODEF if !SCHED_MUQSS -+ default HZ_500_NODEF if !SCHED_MUQSS - help - Allows the configuration of the timer frequency. It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -50,6 +50,20 @@ choice - on SMP and NUMA systems and exactly dividing by both PAL and - NTSC frame rates for video and multimedia work. - -+ config HZ_500_NODEF -+ bool "500 HZ" -+ help -+ 500 Hz is a good timer frequency for desktops. Provides fast -+ interactivity with great smoothness without sacrificing too -+ much throughput. -+ -+ config HZ_750_NODEF -+ bool "750 HZ" -+ help -+ 750 Hz is a good timer frequency for desktops. Provides fast -+ interactivity with great smoothness without sacrificing too -+ much throughput. 
-+ - config HZ_1000_NODEF - bool "1000 HZ" - help -@@ -63,6 +70,8 @@ config HZ - default 100 if HZ_100 - default 250 if HZ_250_NODEF - default 300 if HZ_300_NODEF -+ default 500 if HZ_500_NODEF -+ default 750 if HZ_750_NODEF - default 1000 if HZ_1000_NODEF - - config SCHED_HRTICK - -diff --git a/Makefile b/Makefile -index d4d36c61940b..4a9dfe471f1f 100644 ---- a/Makefile -+++ b/Makefile -@@ -15,7 +15,6 @@ NAME = Kleptomaniac Octopus - - CKVERSION = -ck1 - CKNAME = MuQSS Powered --EXTRAVERSION := $(EXTRAVERSION)$(CKVERSION) - - # We are using a recursive build, so we need to do a little thinking - # to get the ordering right. diff --git a/linux54-tkg/linux54-tkg-patches/0004-glitched-ondemand-muqss.patch b/linux54-tkg/linux54-tkg-patches/0004-glitched-ondemand-muqss.patch deleted file mode 100644 index 02933e4..0000000 --- a/linux54-tkg/linux54-tkg-patches/0004-glitched-ondemand-muqss.patch +++ /dev/null @@ -1,18 +0,0 @@ -diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c -index 6b423eebfd5d..61e3271675d6 100644 ---- a/drivers/cpufreq/cpufreq_ondemand.c -+++ b/drivers/cpufreq/cpufreq_ondemand.c -@@ -21,10 +21,10 @@ - #include "cpufreq_ondemand.h" - - /* On-demand governor macros */ --#define DEF_FREQUENCY_UP_THRESHOLD (80) --#define DEF_SAMPLING_DOWN_FACTOR (1) -+#define DEF_FREQUENCY_UP_THRESHOLD (45) -+#define DEF_SAMPLING_DOWN_FACTOR (5) - #define MAX_SAMPLING_DOWN_FACTOR (100000) --#define MICRO_FREQUENCY_UP_THRESHOLD (95) -+#define MICRO_FREQUENCY_UP_THRESHOLD (45) - #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) - #define MIN_FREQUENCY_UP_THRESHOLD (1) - #define MAX_FREQUENCY_UP_THRESHOLD (100) diff --git a/linux54-tkg/linux54-tkg-patches/0005-glitched-ondemand-pds.patch b/linux54-tkg/linux54-tkg-patches/0005-glitched-ondemand-pds.patch deleted file mode 100644 index c1929e8..0000000 --- a/linux54-tkg/linux54-tkg-patches/0005-glitched-ondemand-pds.patch +++ /dev/null @@ -1,18 +0,0 @@ -diff --git 
a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c -index 6b423eebfd5d..61e3271675d6 100644 ---- a/drivers/cpufreq/cpufreq_ondemand.c -+++ b/drivers/cpufreq/cpufreq_ondemand.c -@@ -21,10 +21,10 @@ - #include "cpufreq_ondemand.h" - - /* On-demand governor macros */ --#define DEF_FREQUENCY_UP_THRESHOLD (63) --#define DEF_SAMPLING_DOWN_FACTOR (1) -+#define DEF_FREQUENCY_UP_THRESHOLD (55) -+#define DEF_SAMPLING_DOWN_FACTOR (5) - #define MAX_SAMPLING_DOWN_FACTOR (100000) --#define MICRO_FREQUENCY_UP_THRESHOLD (95) -+#define MICRO_FREQUENCY_UP_THRESHOLD (63) - #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) - #define MIN_FREQUENCY_UP_THRESHOLD (1) - #define MAX_FREQUENCY_UP_THRESHOLD (100) diff --git a/linux54-tkg/linux54-tkg-patches/0005-glitched-pds.patch b/linux54-tkg/linux54-tkg-patches/0005-glitched-pds.patch deleted file mode 100644 index 21f2d69..0000000 --- a/linux54-tkg/linux54-tkg-patches/0005-glitched-pds.patch +++ /dev/null @@ -1,213 +0,0 @@ -From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 -From: Tk-Glitch -Date: Wed, 4 Jul 2018 04:30:08 +0200 -Subject: glitched - PDS - -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 2a202a846757..1d9c7ed79b11 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -4,7 +4,7 @@ - - choice - prompt "Timer frequency" -- default HZ_250 -+ default HZ_500 - help - Allows the configuration of the timer frequency. It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -39,6 +39,13 @@ choice - on SMP and NUMA systems and exactly dividing by both PAL and - NTSC frame rates for video and multimedia work. - -+ config HZ_500 -+ bool "500 HZ" -+ help -+ 500 Hz is a balanced timer frequency. Provides fast interactivity -+ on desktops with great smoothness without increasing CPU power -+ consumption and sacrificing the battery life on laptops. 
-+ - config HZ_1000 - bool "1000 HZ" - help -@@ -52,6 +59,7 @@ config HZ - default 100 if HZ_100 - default 250 if HZ_250 - default 300 if HZ_300 -+ default 500 if HZ_500 - default 1000 if HZ_1000 - - config SCHED_HRTICK - -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 2a202a846757..1d9c7ed79b11 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -4,7 +4,7 @@ - - choice - prompt "Timer frequency" -- default HZ_500 -+ default HZ_750 - help - Allows the configuration of the timer frequency. It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -46,6 +46,13 @@ choice - on desktops with great smoothness without increasing CPU power - consumption and sacrificing the battery life on laptops. - -+ config HZ_750 -+ bool "750 HZ" -+ help -+ 750 Hz is a good timer frequency for desktops. Provides fast -+ interactivity with great smoothness without sacrificing too -+ much throughput. -+ - config HZ_1000 - bool "1000 HZ" - help -@@ -60,6 +67,7 @@ config HZ - default 250 if HZ_250 - default 300 if HZ_300 - default 500 if HZ_500 -+ default 750 if HZ_750 - default 1000 if HZ_1000 - - config SCHED_HRTICK - -diff --git a/mm/vmscan.c b/mm/vmscan.c -index 9270a4370d54..30d01e647417 100644 ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -159,7 +159,7 @@ struct scan_control { - /* - * From 0 .. 100. Higher means more swappy. - */ --int vm_swappiness = 60; -+int vm_swappiness = 20; - /* - * The total number of pages which are beyond the high watermark within all - * zones. 
- -diff --git a/kernel/sched/pds.c b/kernel/sched/pds.c -index c2d831b242b6d18a47e0d87a9f5433a7748b52ff..5bc8d7a8f920c21feab69b2706a3328dc8d39f9a 100644 ---- a/kernel/sched/pds.c -+++ b/kernel/sched/pds.c -@@ -409,12 +409,11 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) - * [L] ->on_rq - * RELEASE (rq->lock) - * -- * If we observe the old CPU in task_rq_lock(), the acquire of -+ * If we observe the old CPU in task_rq_lock, the acquire of - * the old rq->lock will fully serialize against the stores. - * -- * If we observe the new CPU in task_rq_lock(), the address -- * dependency headed by '[L] rq = task_rq()' and the acquire -- * will pair with the WMB to ensure we then also see migrating. -+ * If we observe the new CPU in task_rq_lock, the acquire will -+ * pair with the WMB to ensure we must then also see migrating. - */ - if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { - return rq; -@@ -952,9 +953,9 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) - smp_wmb(); - - #ifdef CONFIG_THREAD_INFO_IN_TASK -- WRITE_ONCE(p->cpu, cpu); -+ p->cpu = cpu; - #else -- WRITE_ONCE(task_thread_info(p)->cpu, cpu); -+ task_thread_info(p)->cpu = cpu; - #endif - #endif - } -@@ -1035,7 +1036,7 @@ static void detach_task(struct rq *rq, struct task_struct *p, int target_cpu) - { - lockdep_assert_held(&rq->lock); - -- WRITE_ONCE(p->on_rq ,TASK_ON_RQ_MIGRATING); -+ p->on_rq = TASK_ON_RQ_MIGRATING; - if (task_contributes_to_load(p)) - rq->nr_uninterruptible++; - dequeue_task(p, rq, 0); -diff --git a/kernel/sched/pds_sched.h b/kernel/sched/pds_sched.h -index 20dcf19ea057627d91be07b4ec20f0827c30084c..24fa90ca63d144cc4f45d82d88407ea70d2d2edf 100644 ---- a/kernel/sched/pds_sched.h -+++ b/kernel/sched/pds_sched.h -@@ -56,7 +56,7 @@ static inline int task_on_rq_queued(struct task_struct *p) - - static inline int task_on_rq_migrating(struct task_struct *p) - { -- return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; -+ return p->on_rq 
== TASK_ON_RQ_MIGRATING; - } - - enum { - -diff --git a/init/Kconfig b/init/Kconfig -index 11fd9b502d06..e9bc34d3019b 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -948,7 +948,6 @@ config CGROUP_DEVICE - - config CGROUP_CPUACCT - bool "Simple CPU accounting controller" -- depends on !SCHED_PDS - help - Provides a simple controller for monitoring the - total CPU consumed by the tasks in a cgroup. -diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile -index b23231bae996..cab4e5c5b38e 100644 ---- a/kernel/sched/Makefile -+++ b/kernel/sched/Makefile -@@ -24,13 +24,13 @@ obj-y += fair.o rt.o deadline.o - obj-$(CONFIG_SMP) += cpudeadline.o topology.o stop_task.o - obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o - obj-$(CONFIG_SCHED_DEBUG) += debug.o --obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o - endif - obj-y += loadavg.o clock.o cputime.o - obj-y += idle.o - obj-y += wait.o wait_bit.o swait.o completion.o - obj-$(CONFIG_SMP) += cpupri.o pelt.o - obj-$(CONFIG_SCHEDSTATS) += stats.o -+obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o - obj-$(CONFIG_CPU_FREQ) += cpufreq.o - obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o - obj-$(CONFIG_MEMBARRIER) += membarrier.o - -diff --git a/kernel/sched/pds.c b/kernel/sched/pds.c -index 9281ad164..f09a609cf 100644 ---- a/kernel/sched/pds.c -+++ b/kernel/sched/pds.c -@@ -81,6 +81,18 @@ enum { - NR_CPU_AFFINITY_CHK_LEVEL - }; - -+/* -+ * This allows printing both to /proc/sched_debug and -+ * to the console -+ */ -+#define SEQ_printf(m, x...) 
\ -+ do { \ -+ if (m) \ -+ seq_printf(m, x); \ -+ else \ -+ pr_cont(x); \ -+ } while (0) -+ - static inline void print_scheduler_version(void) - { - printk(KERN_INFO "pds: PDS-mq CPU Scheduler 0.99o by Alfred Chen.\n"); -@@ -6353,7 +6365,10 @@ void ia64_set_curr_task(int cpu, struct task_struct *p) - #ifdef CONFIG_SCHED_DEBUG - void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, - struct seq_file *m) --{} -+{ -+ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), -+ get_nr_threads(p)); -+} - - void proc_sched_set_task(struct task_struct *p) - {} diff --git a/linux54-tkg/linux54-tkg-patches/0005-v5.4_undead-pds099o.patch b/linux54-tkg/linux54-tkg-patches/0005-v5.4_undead-pds099o.patch deleted file mode 100644 index e6db1ad..0000000 --- a/linux54-tkg/linux54-tkg-patches/0005-v5.4_undead-pds099o.patch +++ /dev/null @@ -1,8387 +0,0 @@ -From 89067d28ca90681fc6cf108de79b9aedb93dfa9d Mon Sep 17 00:00:00 2001 -From: Tk-Glitch -Date: Mon, 25 Nov 2019 21:46:23 +0100 -Subject: PDS 099o, 5.4 rebase - - -diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst -index 032c7cd3cede..360a229b0abe 100644 ---- a/Documentation/admin-guide/sysctl/kernel.rst -+++ b/Documentation/admin-guide/sysctl/kernel.rst -@@ -82,6 +82,7 @@ show up in /proc/sys/kernel: - - randomize_va_space - - real-root-dev ==> Documentation/admin-guide/initrd.rst - - reboot-cmd [ SPARC only ] -+- rr_interval - - rtsig-max - - rtsig-nr - - sched_energy_aware -@@ -105,6 +106,7 @@ show up in /proc/sys/kernel: - - unknown_nmi_panic - - watchdog - - watchdog_thresh -+- yield_type - - version - - -diff --git a/Documentation/scheduler/sched-PDS-mq.txt b/Documentation/scheduler/sched-PDS-mq.txt -new file mode 100644 -index 000000000000..709e86f6487e ---- /dev/null -+++ b/Documentation/scheduler/sched-PDS-mq.txt -@@ -0,0 +1,56 @@ -+ Priority and Deadline based Skiplist multiple queue Scheduler -+ 
------------------------------------------------------------- -+ -+CONTENT -+======== -+ -+ 0. Development -+ 1. Overview -+ 1.1 Design goal -+ 1.2 Design summary -+ 2. Design Detail -+ 2.1 Skip list implementation -+ 2.2 Task preempt -+ 2.3 Task policy, priority and deadline -+ 2.4 Task selection -+ 2.5 Run queue balance -+ 2.6 Task migration -+ -+ -+0. Development -+============== -+ -+Priority and Deadline based Skiplist multiple queue scheduler, referred to as -+PDS from here on, is developed upon the enhancement patchset VRQ(Variable Run -+Queue) for BFS(Brain Fuck Scheduler by Con Kolivas). PDS inherits the existing -+design from VRQ and inspired by the introduction of skiplist data structure -+to the scheduler by Con Kolivas. However, PDS is different from MuQSS(Multiple -+Queue Skiplist Scheduler, the successor after BFS) in many ways. -+ -+1. Overview -+=========== -+ -+1.1 Design goal -+--------------- -+ -+PDS is designed to make the cpu process scheduler code to be simple, but while -+efficiency and scalable. Be Simple, the scheduler code will be easy to be read -+and the behavious of scheduler will be easy to predict. Be efficiency, the -+scheduler shall be well balance the thoughput performance and task interactivity -+at the same time for different properties the tasks behave. Be scalable, the -+performance of the scheduler should be in good shape with the glowing of -+workload or with the growing of the cpu numbers. -+ -+1.2 Design summary -+------------------ -+ -+PDS is described as a multiple run queues cpu scheduler. Each cpu has its own -+run queue. A heavry customized skiplist is used as the backend data structure -+of the cpu run queue. Tasks in run queue is sorted by priority then virtual -+deadline(simplfy to just deadline from here on). In PDS, balance action among -+run queues are kept as less as possible to reduce the migration cost. 
Cpumask -+data structure is widely used in cpu affinity checking and cpu preemption/ -+selection to make PDS scalable with increasing cpu number. -+ -+ -+To be continued... -diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c -index f18d5067cd0f..fe489fc01c73 100644 ---- a/arch/powerpc/platforms/cell/spufs/sched.c -+++ b/arch/powerpc/platforms/cell/spufs/sched.c -@@ -51,11 +51,6 @@ static struct task_struct *spusched_task; - static struct timer_list spusched_timer; - static struct timer_list spuloadavg_timer; - --/* -- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0). -- */ --#define NORMAL_PRIO 120 -- - /* - * Frequency of the spu scheduler tick. By default we do one SPU scheduler - * tick for every 10 CPU scheduler ticks. -diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig -index 8ef85139553f..9d44d8d78259 100644 ---- a/arch/x86/Kconfig -+++ b/arch/x86/Kconfig -@@ -1034,6 +1034,22 @@ config NR_CPUS - config SCHED_SMT - def_bool y if SMP - -+config SMT_NICE -+ bool "SMT (Hyperthreading) aware nice priority and policy support" -+ depends on SCHED_PDS && SCHED_SMT -+ default y -+ ---help--- -+ Enabling Hyperthreading on Intel CPUs decreases the effectiveness -+ of the use of 'nice' levels and different scheduling policies -+ (e.g. realtime) due to sharing of CPU power between hyperthreads. -+ SMT nice support makes each logical CPU aware of what is running on -+ its hyperthread siblings, maintaining appropriate distribution of -+ CPU according to nice levels and scheduling policies at the expense -+ of slightly increased overhead. -+ -+ If unsure say Y here. 
-+ -+ - config SCHED_MC - def_bool y - prompt "Multi-core scheduler support" -diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c -index b66e81c06a57..a294f8f5fd75 100644 ---- a/drivers/cpufreq/cpufreq_conservative.c -+++ b/drivers/cpufreq/cpufreq_conservative.c -@@ -28,8 +28,8 @@ struct cs_dbs_tuners { - }; - - /* Conservative governor macros */ --#define DEF_FREQUENCY_UP_THRESHOLD (80) --#define DEF_FREQUENCY_DOWN_THRESHOLD (20) -+#define DEF_FREQUENCY_UP_THRESHOLD (63) -+#define DEF_FREQUENCY_DOWN_THRESHOLD (26) - #define DEF_FREQUENCY_STEP (5) - #define DEF_SAMPLING_DOWN_FACTOR (1) - #define MAX_SAMPLING_DOWN_FACTOR (10) -diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c -index dced033875bf..d2cd03766b09 100644 ---- a/drivers/cpufreq/cpufreq_ondemand.c -+++ b/drivers/cpufreq/cpufreq_ondemand.c -@@ -18,7 +18,7 @@ - #include "cpufreq_ondemand.h" - - /* On-demand governor macros */ --#define DEF_FREQUENCY_UP_THRESHOLD (80) -+#define DEF_FREQUENCY_UP_THRESHOLD (63) - #define DEF_SAMPLING_DOWN_FACTOR (1) - #define MAX_SAMPLING_DOWN_FACTOR (100000) - #define MICRO_FREQUENCY_UP_THRESHOLD (95) -@@ -127,7 +127,7 @@ static void dbs_freq_increase(struct cpufreq_policy *policy, unsigned int freq) - } - - /* -- * Every sampling_rate, we check, if current idle time is less than 20% -+ * Every sampling_rate, we check, if current idle time is less than 37% - * (default), then we try to increase frequency. Else, we adjust the frequency - * proportional to load. 
- */ -diff --git a/fs/proc/base.c b/fs/proc/base.c -index ebea9501afb8..51c9346a69fe 100644 ---- a/fs/proc/base.c -+++ b/fs/proc/base.c -@@ -477,7 +477,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, - seq_puts(m, "0 0 0\n"); - else - seq_printf(m, "%llu %llu %lu\n", -- (unsigned long long)task->se.sum_exec_runtime, -+ (unsigned long long)tsk_seruntime(task), - (unsigned long long)task->sched_info.run_delay, - task->sched_info.pcount); - -diff --git a/include/linux/init_task.h b/include/linux/init_task.h -index 2c620d7ac432..1a7987c40c80 100644 ---- a/include/linux/init_task.h -+++ b/include/linux/init_task.h -@@ -36,7 +36,11 @@ extern struct cred init_cred; - #define INIT_PREV_CPUTIME(x) - #endif - -+#ifdef CONFIG_SCHED_PDS -+#define INIT_TASK_COMM "PDS" -+#else - #define INIT_TASK_COMM "swapper" -+#endif /* !CONFIG_SCHED_PDS */ - - /* Attach to the init_task data structure for proper alignment */ - #ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK -diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h -index 1b6d31da7cbc..dea181bdb1dd 100644 ---- a/include/linux/jiffies.h -+++ b/include/linux/jiffies.h -@@ -171,7 +171,7 @@ static inline u64 get_jiffies_64(void) - * Have the 32 bit jiffies value wrap 5 minutes after boot - * so jiffies wrap bugs show up earlier. 
- */ --#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ)) -+#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-10*HZ)) - - /* - * Change timeval to jiffies, trying to avoid the -diff --git a/include/linux/sched.h b/include/linux/sched.h -index 67a1d86981a9..8268cad4b0a2 100644 ---- a/include/linux/sched.h -+++ b/include/linux/sched.h -@@ -31,6 +31,7 @@ - #include - #include - #include -+#include - - /* task_struct member predeclarations (sorted alphabetically): */ - struct audit_context; -@@ -644,9 +645,13 @@ struct task_struct { - unsigned int flags; - unsigned int ptrace; - --#ifdef CONFIG_SMP -+#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_PDS) - struct llist_node wake_entry; -+#endif -+#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_PDS) - int on_cpu; -+#endif -+#ifdef CONFIG_SMP - #ifdef CONFIG_THREAD_INFO_IN_TASK - /* Current CPU: */ - unsigned int cpu; -@@ -655,6 +660,7 @@ struct task_struct { - unsigned long wakee_flip_decay_ts; - struct task_struct *last_wakee; - -+#ifndef CONFIG_SCHED_PDS - /* - * recent_used_cpu is initially set as the last CPU used by a task - * that wakes affine another task. Waker/wakee relationships can -@@ -663,6 +669,7 @@ struct task_struct { - * used CPU that may be idle. 
- */ - int recent_used_cpu; -+#endif /* CONFIG_SCHED_PDS */ - int wake_cpu; - #endif - int on_rq; -@@ -672,13 +679,27 @@ struct task_struct { - int normal_prio; - unsigned int rt_priority; - -+#ifdef CONFIG_SCHED_PDS -+ int time_slice; -+ u64 deadline; -+ /* skip list level */ -+ int sl_level; -+ /* skip list node */ -+ struct skiplist_node sl_node; -+ /* 8bits prio and 56bits deadline for quick processing */ -+ u64 priodl; -+ u64 last_ran; -+ /* sched_clock time spent running */ -+ u64 sched_time; -+#else /* CONFIG_SCHED_PDS */ - const struct sched_class *sched_class; - struct sched_entity se; - struct sched_rt_entity rt; -+ struct sched_dl_entity dl; -+#endif - #ifdef CONFIG_CGROUP_SCHED - struct task_group *sched_task_group; - #endif -- struct sched_dl_entity dl; - - #ifdef CONFIG_UCLAMP_TASK - /* Clamp values requested for a scheduling entity */ -@@ -1283,6 +1304,29 @@ struct task_struct { - */ - }; - -+#ifdef CONFIG_SCHED_PDS -+void cpu_scaling(int cpu); -+void cpu_nonscaling(int cpu); -+#define tsk_seruntime(t) ((t)->sched_time) -+/* replace the uncertian rt_timeout with 0UL */ -+#define tsk_rttimeout(t) (0UL) -+ -+#define task_running_idle(p) ((p)->prio == IDLE_PRIO) -+#else /* CFS */ -+extern int runqueue_is_locked(int cpu); -+static inline void cpu_scaling(int cpu) -+{ -+} -+ -+static inline void cpu_nonscaling(int cpu) -+{ -+} -+#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) -+#define tsk_rttimeout(t) ((t)->rt.timeout) -+ -+#define iso_task(p) (false) -+#endif /* CONFIG_SCHED_PDS */ -+ - static inline struct pid *task_pid(struct task_struct *task) - { - return task->thread_pid; -diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h -index 1aff00b65f3c..a5e5fc2c9170 100644 ---- a/include/linux/sched/deadline.h -+++ b/include/linux/sched/deadline.h -@@ -1,5 +1,22 @@ - /* SPDX-License-Identifier: GPL-2.0 */ - -+#ifdef CONFIG_SCHED_PDS -+ -+#define __tsk_deadline(p) ((p)->deadline) -+ -+static inline int dl_prio(int prio) -+{ -+ 
return 1; -+} -+ -+static inline int dl_task(struct task_struct *p) -+{ -+ return 1; -+} -+#else -+ -+#define __tsk_deadline(p) ((p)->dl.deadline) -+ - /* - * SCHED_DEADLINE tasks has negative priorities, reflecting - * the fact that any of them has higher prio than RT and -@@ -19,6 +36,7 @@ static inline int dl_task(struct task_struct *p) - { - return dl_prio(p->prio); - } -+#endif /* CONFIG_SCHED_PDS */ - - static inline bool dl_time_before(u64 a, u64 b) - { -diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h -index 7d64feafc408..fba04bb91492 100644 ---- a/include/linux/sched/prio.h -+++ b/include/linux/sched/prio.h -@@ -20,7 +20,18 @@ - */ - - #define MAX_USER_RT_PRIO 100 -+ -+#ifdef CONFIG_SCHED_PDS -+#define ISO_PRIO (MAX_USER_RT_PRIO) -+ -+#define MAX_RT_PRIO ((MAX_USER_RT_PRIO) + 1) -+ -+#define NORMAL_PRIO (MAX_RT_PRIO) -+#define IDLE_PRIO ((MAX_RT_PRIO) + 1) -+#define PRIO_LIMIT ((IDLE_PRIO) + 1) -+#else /* !CONFIG_SCHED_PDS */ - #define MAX_RT_PRIO MAX_USER_RT_PRIO -+#endif /* CONFIG_SCHED_PDS */ - - #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) - #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) -diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h -index e5af028c08b4..a96012e6f15e 100644 ---- a/include/linux/sched/rt.h -+++ b/include/linux/sched/rt.h -@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) - - if (policy == SCHED_FIFO || policy == SCHED_RR) - return true; -+#ifndef CONFIG_SCHED_PDS - if (policy == SCHED_DEADLINE) - return true; -+#endif - return false; - } - -diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h -index 4b1c3b664f51..f186b8119ad6 100644 ---- a/include/linux/sched/task.h -+++ b/include/linux/sched/task.h -@@ -99,7 +99,7 @@ extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); - extern void free_task(struct task_struct *tsk); - - /* sched_exec is called by processes performing an exec */ --#ifdef CONFIG_SMP -+#if defined(CONFIG_SMP) && 
!defined(CONFIG_SCHED_PDS) - extern void sched_exec(void); - #else - #define sched_exec() {} -diff --git a/include/linux/skip_list.h b/include/linux/skip_list.h -new file mode 100644 -index 000000000000..713fedd8034f ---- /dev/null -+++ b/include/linux/skip_list.h -@@ -0,0 +1,177 @@ -+/* -+ Copyright (C) 2016 Alfred Chen. -+ -+ Code based on Con Kolivas's skip list implementation for BFS, and -+ which is based on example originally by William Pugh. -+ -+Skip Lists are a probabilistic alternative to balanced trees, as -+described in the June 1990 issue of CACM and were invented by -+William Pugh in 1987. -+ -+A couple of comments about this implementation: -+ -+This file only provides a infrastructure of skip list. -+ -+skiplist_node is embedded into container data structure, to get rid the -+dependency of kmalloc/kfree operation in scheduler code. -+ -+A customized search function should be defined using DEFINE_SKIPLIST_INSERT -+macro and be used for skip list insert operation. -+ -+Random Level is also not defined in this file, instead, it should be customized -+implemented and set to node->level then pass to the customized skiplist_insert -+function. -+ -+Levels start at zero and go up to (NUM_SKIPLIST_LEVEL -1) -+ -+NUM_SKIPLIST_LEVEL in this implementation is 8 instead of origin 16, -+considering that there will be 256 entries to enable the top level when using -+random level p=0.5, and that number is more than enough for a run queue usage -+in a scheduler usage. And it also help to reduce the memory usage of the -+embedded skip list node in task_struct to about 50%. -+ -+The insertion routine has been implemented so as to use the -+dirty hack described in the CACM paper: if a random level is -+generated that is more than the current maximum level, the -+current maximum level plus one is used instead. 
-+ -+BFS Notes: In this implementation of skiplists, there are bidirectional -+next/prev pointers and the insert function returns a pointer to the actual -+node the value is stored. The key here is chosen by the scheduler so as to -+sort tasks according to the priority list requirements and is no longer used -+by the scheduler after insertion. The scheduler lookup, however, occurs in -+O(1) time because it is always the first item in the level 0 linked list. -+Since the task struct stores a copy of the node pointer upon skiplist_insert, -+it can also remove it much faster than the original implementation with the -+aid of prev<->next pointer manipulation and no searching. -+*/ -+#ifndef _LINUX_SKIP_LIST_H -+#define _LINUX_SKIP_LIST_H -+ -+#include -+ -+#define NUM_SKIPLIST_LEVEL (8) -+ -+struct skiplist_node { -+ int level; /* Levels in this node */ -+ struct skiplist_node *next[NUM_SKIPLIST_LEVEL]; -+ struct skiplist_node *prev[NUM_SKIPLIST_LEVEL]; -+}; -+ -+#define SKIPLIST_NODE_INIT(name) { 0,\ -+ {&name, &name, &name, &name,\ -+ &name, &name, &name, &name},\ -+ {&name, &name, &name, &name,\ -+ &name, &name, &name, &name},\ -+ } -+ -+static inline void INIT_SKIPLIST_NODE(struct skiplist_node *node) -+{ -+ /* only level 0 ->next matters in skiplist_empty()*/ -+ WRITE_ONCE(node->next[0], node); -+} -+ -+/** -+ * FULL_INIT_SKIPLIST_NODE -- fully init a skiplist_node, expecially for header -+ * @node: the skip list node to be inited. -+ */ -+static inline void FULL_INIT_SKIPLIST_NODE(struct skiplist_node *node) -+{ -+ int i; -+ -+ node->level = 0; -+ for (i = 0; i < NUM_SKIPLIST_LEVEL; i++) { -+ WRITE_ONCE(node->next[i], node); -+ node->prev[i] = node; -+ } -+} -+ -+/** -+ * skiplist_empty - test whether a skip list is empty -+ * @head: the skip list to test. 
-+ */ -+static inline int skiplist_empty(const struct skiplist_node *head) -+{ -+ return READ_ONCE(head->next[0]) == head; -+} -+ -+/** -+ * skiplist_entry - get the struct for this entry -+ * @ptr: the &struct skiplist_node pointer. -+ * @type: the type of the struct this is embedded in. -+ * @member: the name of the skiplist_node within the struct. -+ */ -+#define skiplist_entry(ptr, type, member) \ -+ container_of(ptr, type, member) -+ -+/** -+ * DEFINE_SKIPLIST_INSERT_FUNC -- macro to define a customized skip list insert -+ * function, which takes two parameters, first one is the header node of the -+ * skip list, second one is the skip list node to be inserted -+ * @func_name: the customized skip list insert function name -+ * @search_func: the search function to be used, which takes two parameters, -+ * 1st one is the itrator of skiplist_node in the list, the 2nd is the skip list -+ * node to be inserted, the function should return true if search should be -+ * continued, otherwise return false. 
-+ * Returns 1 if @node is inserted as the first item of skip list at level zero, -+ * otherwise 0 -+ */ -+#define DEFINE_SKIPLIST_INSERT_FUNC(func_name, search_func)\ -+static inline int func_name(struct skiplist_node *head, struct skiplist_node *node)\ -+{\ -+ struct skiplist_node *update[NUM_SKIPLIST_LEVEL];\ -+ struct skiplist_node *p, *q;\ -+ int k = head->level;\ -+\ -+ p = head;\ -+ do {\ -+ while (q = p->next[k], q != head && search_func(q, node))\ -+ p = q;\ -+ update[k] = p;\ -+ } while (--k >= 0);\ -+\ -+ k = node->level;\ -+ if (unlikely(k > head->level)) {\ -+ node->level = k = ++head->level;\ -+ update[k] = head;\ -+ }\ -+\ -+ do {\ -+ p = update[k];\ -+ q = p->next[k];\ -+ node->next[k] = q;\ -+ p->next[k] = node;\ -+ node->prev[k] = p;\ -+ q->prev[k] = node;\ -+ } while (--k >= 0);\ -+\ -+ return (p == head);\ -+} -+ -+/** -+ * skiplist_del_init -- delete skip list node from a skip list and reset it's -+ * init state -+ * @head: the header node of the skip list to be deleted from. -+ * @node: the skip list node to be deleted, the caller need to ensure @node is -+ * in skip list which @head represent. 
-+ * Returns 1 if @node is the first item of skip level at level zero, otherwise 0 -+ */ -+static inline int -+skiplist_del_init(struct skiplist_node *head, struct skiplist_node *node) -+{ -+ int l, m = node->level; -+ -+ for (l = 0; l <= m; l++) { -+ node->prev[l]->next[l] = node->next[l]; -+ node->next[l]->prev[l] = node->prev[l]; -+ } -+ if (m == head->level && m > 0) { -+ while (head->next[m] == head && m > 0) -+ m--; -+ head->level = m; -+ } -+ INIT_SKIPLIST_NODE(node); -+ -+ return (node->prev[0] == head); -+} -+#endif /* _LINUX_SKIP_LIST_H */ -diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h -index 25b4fa00bad1..fc0aabdce15f 100644 ---- a/include/uapi/linux/sched.h -+++ b/include/uapi/linux/sched.h -@@ -84,7 +84,10 @@ struct clone_args { - #define SCHED_FIFO 1 - #define SCHED_RR 2 - #define SCHED_BATCH 3 --/* SCHED_ISO: reserved but not implemented yet */ -+/* SCHED_ISO: Implemented in BFS/MuQSSPDS only */ -+#ifdef CONFIG_SCHED_PDS -+#define SCHED_ISO 4 -+#endif - #define SCHED_IDLE 5 - #define SCHED_DEADLINE 6 - -diff --git a/init/Kconfig b/init/Kconfig -index b4daad2bac23..ee3b9957cf3b 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -73,6 +73,21 @@ config THREAD_INFO_IN_TASK - - menu "General setup" - -+config SCHED_PDS -+ bool "PDS-mq cpu scheduler" -+ help -+ The Priority and Deadline based Skip list multiple queue CPU -+ Scheduler for excellent interactivity and responsiveness on the -+ desktop and solid scalability on normal hardware and commodity -+ servers. -+ -+ Currently incompatible with the Group CPU scheduler, and RCU TORTURE -+ TEST so these options are disabled. -+ -+ Say Y here. -+ default y -+ -+ - config BROKEN - bool - -@@ -802,6 +817,7 @@ config NUMA_BALANCING - depends on ARCH_SUPPORTS_NUMA_BALANCING - depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY - depends on SMP && NUMA && MIGRATION -+ depends on !SCHED_PDS - help - This option adds support for automatic NUMA aware memory/task placement. 
- The mechanism is quite primitive and is based on migrating memory when -@@ -903,7 +919,7 @@ menuconfig CGROUP_SCHED - bandwidth allocation to such task groups. It uses cgroups to group - tasks. - --if CGROUP_SCHED -+if CGROUP_SCHED && !SCHED_PDS - config FAIR_GROUP_SCHED - bool "Group scheduling for SCHED_OTHER" - depends on CGROUP_SCHED -@@ -1032,6 +1048,7 @@ config CGROUP_DEVICE - - config CGROUP_CPUACCT - bool "Simple CPU accounting controller" -+ depends on !SCHED_PDS - help - Provides a simple controller for monitoring the - total CPU consumed by the tasks in a cgroup. -@@ -1150,6 +1167,7 @@ config CHECKPOINT_RESTORE - - config SCHED_AUTOGROUP - bool "Automatic process group scheduling" -+ depends on !SCHED_PDS - select CGROUPS - select CGROUP_SCHED - select FAIR_GROUP_SCHED -diff --git a/init/init_task.c b/init/init_task.c -index 9e5cbe5eab7b..89787e2feb60 100644 ---- a/init/init_task.c -+++ b/init/init_task.c -@@ -58,6 +58,126 @@ struct task_struct init_task - __init_task_data - #endif - = { -+#ifdef CONFIG_SCHED_PDS -+#ifdef CONFIG_THREAD_INFO_IN_TASK -+ .thread_info = INIT_THREAD_INFO(init_task), -+ .stack_refcount = ATOMIC_INIT(1), -+#endif -+ .state = 0, -+ .stack = init_stack, -+ .usage = ATOMIC_INIT(2), -+ .flags = PF_KTHREAD, -+ .prio = NORMAL_PRIO, -+ .static_prio = MAX_PRIO - 20, -+ .normal_prio = NORMAL_PRIO, -+ .deadline = 0, /* PDS only */ -+ .policy = SCHED_NORMAL, -+ .cpus_ptr = &init_task.cpus_mask, -+ .cpus_mask = CPU_MASK_ALL, -+ .nr_cpus_allowed= NR_CPUS, -+ .mm = NULL, -+ .active_mm = &init_mm, -+ .restart_block = { -+ .fn = do_no_restart_syscall, -+ }, -+ .sl_level = 0, /* PDS only */ -+ .sl_node = SKIPLIST_NODE_INIT(init_task.sl_node), /* PDS only */ -+ .time_slice = HZ, /* PDS only */ -+ .tasks = LIST_HEAD_INIT(init_task.tasks), -+#ifdef CONFIG_SMP -+ .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), -+#endif -+#ifdef CONFIG_CGROUP_SCHED -+ .sched_task_group = &root_task_group, -+#endif -+ .ptraced = 
LIST_HEAD_INIT(init_task.ptraced), -+ .ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry), -+ .real_parent = &init_task, -+ .parent = &init_task, -+ .children = LIST_HEAD_INIT(init_task.children), -+ .sibling = LIST_HEAD_INIT(init_task.sibling), -+ .group_leader = &init_task, -+ RCU_POINTER_INITIALIZER(real_cred, &init_cred), -+ RCU_POINTER_INITIALIZER(cred, &init_cred), -+ .comm = INIT_TASK_COMM, -+ .thread = INIT_THREAD, -+ .fs = &init_fs, -+ .files = &init_files, -+ .signal = &init_signals, -+ .sighand = &init_sighand, -+ .nsproxy = &init_nsproxy, -+ .pending = { -+ .list = LIST_HEAD_INIT(init_task.pending.list), -+ .signal = {{0}} -+ }, -+ .blocked = {{0}}, -+ .alloc_lock = __SPIN_LOCK_UNLOCKED(init_task.alloc_lock), -+ .journal_info = NULL, -+ INIT_CPU_TIMERS(init_task) -+ .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock), -+ .timer_slack_ns = 50000, /* 50 usec default slack */ -+ .thread_pid = &init_struct_pid, -+ .thread_group = LIST_HEAD_INIT(init_task.thread_group), -+ .thread_node = LIST_HEAD_INIT(init_signals.thread_head), -+#ifdef CONFIG_AUDITSYSCALL -+ .loginuid = INVALID_UID, -+ .sessionid = AUDIT_SID_UNSET, -+#endif -+#ifdef CONFIG_PERF_EVENTS -+ .perf_event_mutex = __MUTEX_INITIALIZER(init_task.perf_event_mutex), -+ .perf_event_list = LIST_HEAD_INIT(init_task.perf_event_list), -+#endif -+#ifdef CONFIG_PREEMPT_RCU -+ .rcu_read_lock_nesting = 0, -+ .rcu_read_unlock_special.s = 0, -+ .rcu_node_entry = LIST_HEAD_INIT(init_task.rcu_node_entry), -+ .rcu_blocked_node = NULL, -+#endif -+#ifdef CONFIG_TASKS_RCU -+ .rcu_tasks_holdout = false, -+ .rcu_tasks_holdout_list = LIST_HEAD_INIT(init_task.rcu_tasks_holdout_list), -+ .rcu_tasks_idle_cpu = -1, -+#endif -+#ifdef CONFIG_CPUSETS -+ .mems_allowed_seq = SEQCNT_ZERO(init_task.mems_allowed_seq), -+#endif -+#ifdef CONFIG_RT_MUTEXES -+ .pi_waiters = RB_ROOT_CACHED, -+ .pi_top_task = NULL, -+#endif -+ INIT_PREV_CPUTIME(init_task) -+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -+ .vtime.seqcount = 
SEQCNT_ZERO(init_task.vtime_seqcount), -+ .vtime.starttime = 0, -+ .vtime.state = VTIME_SYS, -+#endif -+#ifdef CONFIG_NUMA_BALANCING -+ .numa_preferred_nid = -1, -+ .numa_group = NULL, -+ .numa_faults = NULL, -+#endif -+#ifdef CONFIG_KASAN -+ .kasan_depth = 1, -+#endif -+#ifdef CONFIG_TRACE_IRQFLAGS -+ .softirqs_enabled = 1, -+#endif -+#ifdef CONFIG_LOCKDEP -+ .lockdep_recursion = 0, -+#endif -+#ifdef CONFIG_FUNCTION_GRAPH_TRACER -+ .ret_stack = NULL, -+#endif -+#if defined(CONFIG_TRACING) && defined(CONFIG_PREEMPT) -+ .trace_recursion = 0, -+#endif -+#ifdef CONFIG_LIVEPATCH -+ .patch_state = KLP_UNDEFINED, -+#endif -+#ifdef CONFIG_SECURITY -+ .security = NULL, -+#endif -+#else /* CONFIG_SCHED_PDS */ - #ifdef CONFIG_THREAD_INFO_IN_TASK - .thread_info = INIT_THREAD_INFO(init_task), - .stack_refcount = REFCOUNT_INIT(1), -@@ -181,6 +301,7 @@ struct task_struct init_task - #ifdef CONFIG_SECURITY - .security = NULL, - #endif -+#endif /* CONFIG_SCHED_PDS */ - }; - EXPORT_SYMBOL(init_task); - -diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c -index c87ee6412b36..4045c8532027 100644 ---- a/kernel/cgroup/cpuset.c -+++ b/kernel/cgroup/cpuset.c -@@ -632,7 +632,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) - return ret; - } - --#ifdef CONFIG_SMP -+#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_PDS) - /* - * Helper routine for generate_sched_domains(). - * Do cpusets a, b have overlapping effective cpus_allowed masks? 
-@@ -1007,7 +1007,7 @@ static void rebuild_sched_domains_locked(void) - /* Have scheduler rebuild the domains */ - partition_and_rebuild_sched_domains(ndoms, doms, attr); - } --#else /* !CONFIG_SMP */ -+#else /* !CONFIG_SMP || CONFIG_SCHED_PDS */ - static void rebuild_sched_domains_locked(void) - { - } -diff --git a/kernel/delayacct.c b/kernel/delayacct.c -index 27725754ac99..769d773c7182 100644 ---- a/kernel/delayacct.c -+++ b/kernel/delayacct.c -@@ -106,7 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) - */ - t1 = tsk->sched_info.pcount; - t2 = tsk->sched_info.run_delay; -- t3 = tsk->se.sum_exec_runtime; -+ t3 = tsk_seruntime(tsk); - - d->cpu_count += t1; - -diff --git a/kernel/exit.c b/kernel/exit.c -index a46a50d67002..58043176b285 100644 ---- a/kernel/exit.c -+++ b/kernel/exit.c -@@ -131,7 +131,7 @@ static void __exit_signal(struct task_struct *tsk) - sig->curr_target = next_thread(tsk); - } - -- add_device_randomness((const void*) &tsk->se.sum_exec_runtime, -+ add_device_randomness((const void*) &tsk_seruntime(tsk), - sizeof(unsigned long long)); - - /* -@@ -152,7 +152,7 @@ static void __exit_signal(struct task_struct *tsk) - sig->inblock += task_io_get_inblock(tsk); - sig->oublock += task_io_get_oublock(tsk); - task_io_accounting_add(&sig->ioac, &tsk->ioac); -- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; -+ sig->sum_sched_runtime += tsk_seruntime(tsk); - sig->nr_threads--; - __unhash_process(tsk, group_dead); - write_sequnlock(&sig->stats_lock); -diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c -index cdf318d86dd6..baa525865d5c 100644 ---- a/kernel/livepatch/transition.c -+++ b/kernel/livepatch/transition.c -@@ -306,7 +306,11 @@ static bool klp_try_switch_task(struct task_struct *task) - */ - rq = task_rq_lock(task, &flags); - -+#ifdef CONFIG_SCHED_PDS -+ if (task_running(task) && task != current) { -+#else - if (task_running(rq, task) && task != current) { -+#endif - snprintf(err_buf, 
STACK_ERR_BUF_SIZE, - "%s: %s:%d is running\n", __func__, task->comm, - task->pid); -diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c -index 2874bf556162..fad8a279fdfa 100644 ---- a/kernel/locking/rtmutex.c -+++ b/kernel/locking/rtmutex.c -@@ -229,7 +229,7 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, - * Only use with rt_mutex_waiter_{less,equal}() - */ - #define task_to_waiter(p) \ -- &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline } -+ &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = __tsk_deadline(p) } - - static inline int - rt_mutex_waiter_less(struct rt_mutex_waiter *left, -@@ -680,7 +680,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, - * the values of the node being removed. - */ - waiter->prio = task->prio; -- waiter->deadline = task->dl.deadline; -+ waiter->deadline = __tsk_deadline(task); - - rt_mutex_enqueue(lock, waiter); - -@@ -953,7 +953,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, - waiter->task = task; - waiter->lock = lock; - waiter->prio = task->prio; -- waiter->deadline = task->dl.deadline; -+ waiter->deadline = __tsk_deadline(task); - - /* Get the top priority waiter on the lock */ - if (rt_mutex_has_waiters(lock)) -diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile -index 21fb5a5662b5..8ebe4e33fb5f 100644 ---- a/kernel/sched/Makefile -+++ b/kernel/sched/Makefile -@@ -16,15 +16,21 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) - CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer - endif - --obj-y += core.o loadavg.o clock.o cputime.o --obj-y += idle.o fair.o rt.o deadline.o --obj-y += wait.o wait_bit.o swait.o completion.o -- --obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o -+ifdef CONFIG_SCHED_PDS -+obj-y += pds.o -+else -+obj-y += core.o -+obj-y += fair.o rt.o deadline.o -+obj-$(CONFIG_SMP) += cpudeadline.o topology.o stop_task.o - obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o 
--obj-$(CONFIG_SCHEDSTATS) += stats.o - obj-$(CONFIG_SCHED_DEBUG) += debug.o - obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o -+endif -+obj-y += loadavg.o clock.o cputime.o -+obj-y += idle.o -+obj-y += wait.o wait_bit.o swait.o completion.o -+obj-$(CONFIG_SMP) += cpupri.o pelt.o -+obj-$(CONFIG_SCHEDSTATS) += stats.o - obj-$(CONFIG_CPU_FREQ) += cpufreq.o - obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o - obj-$(CONFIG_MEMBARRIER) += membarrier.o -diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c -index 86800b4d5453..07f278dc3137 100644 ---- a/kernel/sched/cpufreq_schedutil.c -+++ b/kernel/sched/cpufreq_schedutil.c -@@ -185,6 +185,7 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, - return cpufreq_driver_resolve_freq(policy, freq); - } - -+#ifndef CONFIG_SCHED_PDS - /* - * This function computes an effective utilization for the given CPU, to be - * used for frequency selection given the linear relation: f = u * f_max. -@@ -302,6 +303,13 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) - - return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL); - } -+#else /* CONFIG_SCHED_PDS */ -+static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) -+{ -+ sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu); -+ return sg_cpu->max; -+} -+#endif - - /** - * sugov_iowait_reset() - Reset the IO boost status of a CPU. 
-@@ -445,7 +453,9 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } - */ - static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) - { -+#ifndef CONFIG_SCHED_PDS - if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) -+#endif - sg_policy->limits_changed = true; - } - -@@ -688,6 +698,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) - } - - ret = sched_setattr_nocheck(thread, &attr); -+ - if (ret) { - kthread_stop(thread); - pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__); -@@ -918,6 +929,7 @@ static int __init sugov_register(void) - fs_initcall(sugov_register); - - #ifdef CONFIG_ENERGY_MODEL -+#ifndef CONFIG_SCHED_PDS - extern bool sched_energy_update; - extern struct mutex sched_energy_mutex; - -@@ -948,4 +960,10 @@ void sched_cpufreq_governor_change(struct cpufreq_policy *policy, - } - - } -+#else /* CONFIG_SCHED_PDS */ -+void sched_cpufreq_governor_change(struct cpufreq_policy *policy, -+ struct cpufreq_governor *old_gov) -+{ -+} -+#endif - #endif -diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c -index 46ed4e1383e2..0a9548ee995c 100644 ---- a/kernel/sched/cputime.c -+++ b/kernel/sched/cputime.c -@@ -122,7 +122,12 @@ void account_user_time(struct task_struct *p, u64 cputime) - p->utime += cputime; - account_group_user_time(p, cputime); - -+#ifdef CONFIG_SCHED_PDS -+ index = (task_nice(p) > 0 || task_running_idle(p)) ? CPUTIME_NICE : -+ CPUTIME_USER; -+#else - index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; -+#endif - - /* Add user time to cpustat. */ - task_group_account_field(p, index, cputime); -@@ -146,7 +151,11 @@ void account_guest_time(struct task_struct *p, u64 cputime) - p->gtime += cputime; - - /* Add guest time to cpustat. 
*/ -+#ifdef CONFIG_SCHED_PDS -+ if (task_nice(p) > 0 || task_running_idle(p)) { -+#else - if (task_nice(p) > 0) { -+#endif - cpustat[CPUTIME_NICE] += cputime; - cpustat[CPUTIME_GUEST_NICE] += cputime; - } else { -@@ -269,7 +278,7 @@ static inline u64 account_other_time(u64 max) - #ifdef CONFIG_64BIT - static inline u64 read_sum_exec_runtime(struct task_struct *t) - { -- return t->se.sum_exec_runtime; -+ return tsk_seruntime(t); - } - #else - static u64 read_sum_exec_runtime(struct task_struct *t) -@@ -279,7 +288,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t) - struct rq *rq; - - rq = task_rq_lock(t, &rf); -- ns = t->se.sum_exec_runtime; -+ ns = tsk_seruntime(t); - task_rq_unlock(rq, t, &rf); - - return ns; -@@ -663,7 +672,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, - void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) - { - struct task_cputime cputime = { -- .sum_exec_runtime = p->se.sum_exec_runtime, -+ .sum_exec_runtime = tsk_seruntime(p), - }; - - task_cputime(p, &cputime.utime, &cputime.stime); -diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c -index f65ef1e2f204..454fa7e460e3 100644 ---- a/kernel/sched/idle.c -+++ b/kernel/sched/idle.c -@@ -355,6 +355,7 @@ void cpu_startup_entry(enum cpuhp_state state) - do_idle(); - } - -+#ifndef CONFIG_SCHED_PDS - /* - * idle-task scheduling class. - */ -@@ -479,3 +480,4 @@ const struct sched_class idle_sched_class = { - .switched_to = switched_to_idle, - .update_curr = update_curr_idle, - }; -+#endif -diff --git a/kernel/sched/pds.c b/kernel/sched/pds.c -new file mode 100644 -index 000000000000..aefbd9cebcfb ---- /dev/null -+++ b/kernel/sched/pds.c -@@ -0,0 +1,6566 @@ -+/* -+ * kernel/sched/pds.c, was kernel/sched.c -+ * -+ * PDS-mq Core kernel scheduler code and related syscalls -+ * -+ * Copyright (C) 1991-2002 Linus Torvalds -+ * -+ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes -+ * a whole lot of those previous things. 
-+ * 2017-09-06 Priority and Deadline based Skip list multiple queue kernel -+ * scheduler by Alfred Chen. -+ */ -+#include "pds_sched.h" -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#include -+ -+#include "../workqueue_internal.h" -+#include "../smpboot.h" -+ -+#include "pelt.h" -+ -+#define CREATE_TRACE_POINTS -+#include -+ -+ -+#define rt_prio(prio) ((prio) < MAX_RT_PRIO) -+#define rt_task(p) rt_prio((p)->prio) -+#define rt_policy(policy) ((policy) == SCHED_FIFO || \ -+ (policy) == SCHED_RR || \ -+ (policy) == SCHED_ISO) -+#define task_has_rt_policy(p) (rt_policy((p)->policy)) -+ -+#define idle_policy(policy) ((policy) == SCHED_IDLE) -+#define idleprio_task(p) unlikely(idle_policy((p)->policy)) -+ -+#define STOP_PRIO (MAX_RT_PRIO - 1) -+ -+/* -+ * Some helpers for converting to/from various scales. Use shifts to get -+ * approximate multiples of ten for less overhead. -+ */ -+#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) -+#define JIFFY_NS (1000000000 / HZ) -+#define HALF_JIFFY_NS (1000000000 / HZ / 2) -+#define HALF_JIFFY_US (1000000 / HZ / 2) -+#define MS_TO_NS(TIME) ((TIME) << 20) -+#define MS_TO_US(TIME) ((TIME) << 10) -+#define NS_TO_MS(TIME) ((TIME) >> 20) -+#define NS_TO_US(TIME) ((TIME) >> 10) -+#define US_TO_NS(TIME) ((TIME) << 10) -+ -+#define RESCHED_US (100) /* Reschedule if less than this many μs left */ -+ -+enum { -+ BASE_CPU_AFFINITY_CHK_LEVEL = 1, -+#ifdef CONFIG_SCHED_SMT -+ SMT_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, -+#endif -+#ifdef CONFIG_SCHED_MC -+ MC_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, -+#endif -+ NR_CPU_AFFINITY_CHK_LEVEL -+}; -+ -+static inline void print_scheduler_version(void) -+{ -+ printk(KERN_INFO "pds: PDS-mq CPU Scheduler 0.99o by Alfred Chen and kept alive artificially by Tk-Glitch.\n"); -+} -+ -+/* -+ * This is the time all tasks within the same priority round robin. 
-+ * Value is in ms and set to a minimum of 6ms. Scales with number of cpus. -+ * Tunable via /proc interface. -+ */ -+#define SCHED_DEFAULT_RR (4) -+int rr_interval __read_mostly = SCHED_DEFAULT_RR; -+ -+static int __init rr_interval_set(char *str) -+{ -+ u32 rr; -+ -+ pr_info("rr_interval: "); -+ if (kstrtouint(str, 0, &rr)) { -+ pr_cont("using default of %u, unable to parse %s\n", -+ rr_interval, str); -+ return 1; -+ } -+ -+ rr_interval = rr; -+ pr_cont("%d\n", rr_interval); -+ -+ return 1; -+} -+__setup("rr_interval=", rr_interval_set); -+ -+ -+static const u64 sched_prio2deadline[NICE_WIDTH] = { -+/* -20 */ 6291456, 6920601, 7612661, 8373927, 9211319, -+/* -15 */ 10132450, 11145695, 12260264, 13486290, 14834919, -+/* -10 */ 16318410, 17950251, 19745276, 21719803, 23891783, -+/* -5 */ 26280961, 28909057, 31799962, 34979958, 38477953, -+/* 0 */ 42325748, 46558322, 51214154, 56335569, 61969125, -+/* 5 */ 68166037, 74982640, 82480904, 90728994, 99801893, -+/* 10 */ 109782082, 120760290, 132836319, 146119950, 160731945, -+/* 15 */ 176805139, 194485652, 213934217, 235327638, 258860401 -+}; -+ -+/** -+ * sched_yield_type - Choose what sort of yield sched_yield will perform. -+ * 0: No yield. -+ * 1: Yield only to better priority/deadline tasks. (default) -+ * 2: Expire timeslice and recalculate deadline. -+ */ -+int sched_yield_type __read_mostly = 1; -+ -+/* -+ * The quota handed out to tasks of all priority levels when refilling their -+ * time_slice. 
-+ */ -+static inline int timeslice(void) -+{ -+ return MS_TO_US(rr_interval); -+} -+ -+#ifdef CONFIG_SMP -+enum { -+SCHED_RQ_EMPTY = 0, -+SCHED_RQ_IDLE, -+SCHED_RQ_NORMAL_0, -+SCHED_RQ_NORMAL_1, -+SCHED_RQ_NORMAL_2, -+SCHED_RQ_NORMAL_3, -+SCHED_RQ_NORMAL_4, -+SCHED_RQ_NORMAL_5, -+SCHED_RQ_NORMAL_6, -+SCHED_RQ_NORMAL_7, -+SCHED_RQ_ISO, -+SCHED_RQ_RT, -+NR_SCHED_RQ_QUEUED_LEVEL -+}; -+ -+static cpumask_t sched_rq_queued_masks[NR_SCHED_RQ_QUEUED_LEVEL] -+____cacheline_aligned_in_smp; -+ -+static DECLARE_BITMAP(sched_rq_queued_masks_bitmap, NR_SCHED_RQ_QUEUED_LEVEL) -+____cacheline_aligned_in_smp; -+ -+static cpumask_t sched_rq_pending_masks[NR_SCHED_RQ_QUEUED_LEVEL] -+____cacheline_aligned_in_smp; -+ -+static DECLARE_BITMAP(sched_rq_pending_masks_bitmap, NR_SCHED_RQ_QUEUED_LEVEL) -+____cacheline_aligned_in_smp; -+ -+DEFINE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_CHK_LEVEL], sched_cpu_affinity_chk_masks); -+DEFINE_PER_CPU(cpumask_t *, sched_cpu_llc_start_mask); -+DEFINE_PER_CPU(cpumask_t *, sched_cpu_affinity_chk_end_masks); -+ -+#ifdef CONFIG_SCHED_SMT -+DEFINE_PER_CPU(int, sched_sibling_cpu); -+DEFINE_STATIC_KEY_FALSE(sched_smt_present); -+EXPORT_SYMBOL_GPL(sched_smt_present); -+ -+static cpumask_t sched_cpu_sg_idle_mask ____cacheline_aligned_in_smp; -+ -+#ifdef CONFIG_SMT_NICE -+/* -+ * Preemptible sibling group mask -+ * Which all sibling cpus are running at PRIO_LIMIT or IDLE_PRIO -+ */ -+static cpumask_t sched_cpu_psg_mask ____cacheline_aligned_in_smp; -+/* -+ * SMT supressed mask -+ * When a cpu is running task with NORMAL/ISO/RT policy, its sibling cpu -+ * will be supressed to run IDLE priority task. 
-+ */ -+static cpumask_t sched_smt_supressed_mask ____cacheline_aligned_in_smp; -+#endif /* CONFIG_SMT_NICE */ -+#endif -+ -+static int sched_rq_prio[NR_CPUS] ____cacheline_aligned; -+ -+/* -+ * Keep a unique ID per domain (we use the first CPUs number in the cpumask of -+ * the domain), this allows us to quickly tell if two cpus are in the same cache -+ * domain, see cpus_share_cache(). -+ */ -+DEFINE_PER_CPU(int, sd_llc_id); -+ -+int __weak arch_sd_sibling_asym_packing(void) -+{ -+ return 0*SD_ASYM_PACKING; -+} -+#else -+struct rq *uprq; -+#endif /* CONFIG_SMP */ -+ -+static DEFINE_MUTEX(sched_hotcpu_mutex); -+ -+DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -+ -+#ifndef prepare_arch_switch -+# define prepare_arch_switch(next) do { } while (0) -+#endif -+#ifndef finish_arch_post_lock_switch -+# define finish_arch_post_lock_switch() do { } while (0) -+#endif -+ -+/* -+ * Context: p->pi_lock -+ */ -+static inline struct rq -+*__task_access_lock(struct task_struct *p, raw_spinlock_t **plock) -+{ -+ struct rq *rq; -+ for (;;) { -+ rq = task_rq(p); -+ if (p->on_cpu || task_on_rq_queued(p)) { -+ raw_spin_lock(&rq->lock); -+ if (likely((p->on_cpu || task_on_rq_queued(p)) -+ && rq == task_rq(p))) { -+ *plock = &rq->lock; -+ return rq; -+ } -+ raw_spin_unlock(&rq->lock); -+ } else if (task_on_rq_migrating(p)) { -+ do { -+ cpu_relax(); -+ } while (unlikely(task_on_rq_migrating(p))); -+ } else { -+ *plock = NULL; -+ return rq; -+ } -+ } -+} -+ -+static inline void -+__task_access_unlock(struct task_struct *p, raw_spinlock_t *lock) -+{ -+ if (NULL != lock) -+ raw_spin_unlock(lock); -+} -+ -+static inline struct rq -+*task_access_lock_irqsave(struct task_struct *p, raw_spinlock_t **plock, -+ unsigned long *flags) -+{ -+ struct rq *rq; -+ for (;;) { -+ rq = task_rq(p); -+ if (p->on_cpu || task_on_rq_queued(p)) { -+ raw_spin_lock_irqsave(&rq->lock, *flags); -+ if (likely((p->on_cpu || task_on_rq_queued(p)) -+ && rq == task_rq(p))) { -+ *plock = &rq->lock; -+ return rq; 
-+ } -+ raw_spin_unlock_irqrestore(&rq->lock, *flags); -+ } else if (task_on_rq_migrating(p)) { -+ do { -+ cpu_relax(); -+ } while (unlikely(task_on_rq_migrating(p))); -+ } else { -+ raw_spin_lock_irqsave(&p->pi_lock, *flags); -+ if (likely(!p->on_cpu && !p->on_rq && -+ rq == task_rq(p))) { -+ *plock = &p->pi_lock; -+ return rq; -+ } -+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags); -+ } -+ } -+} -+ -+static inline void -+task_access_unlock_irqrestore(struct task_struct *p, raw_spinlock_t *lock, -+ unsigned long *flags) -+{ -+ raw_spin_unlock_irqrestore(lock, *flags); -+} -+ -+/* -+ * __task_rq_lock - lock the rq @p resides on. -+ */ -+struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ lockdep_assert_held(&p->pi_lock); -+ -+ for (;;) { -+ rq = task_rq(p); -+ raw_spin_lock(&rq->lock); -+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) -+ return rq; -+ raw_spin_unlock(&rq->lock); -+ -+ while (unlikely(task_on_rq_migrating(p))) -+ cpu_relax(); -+ } -+} -+ -+/* -+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. -+ */ -+struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(p->pi_lock) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ for (;;) { -+ raw_spin_lock_irqsave(&p->pi_lock, rf->flags); -+ rq = task_rq(p); -+ raw_spin_lock(&rq->lock); -+ /* -+ * move_queued_task() task_rq_lock() -+ * -+ * ACQUIRE (rq->lock) -+ * [S] ->on_rq = MIGRATING [L] rq = task_rq() -+ * WMB (__set_task_cpu()) ACQUIRE (rq->lock); -+ * [S] ->cpu = new_cpu [L] task_rq() -+ * [L] ->on_rq -+ * RELEASE (rq->lock) -+ * -+ * If we observe the old CPU in task_rq_lock(), the acquire of -+ * the old rq->lock will fully serialize against the stores. -+ * -+ * If we observe the new CPU in task_rq_lock(), the address -+ * dependency headed by '[L] rq = task_rq()' and the acquire -+ * will pair with the WMB to ensure we then also see migrating. 
-+ */ -+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { -+ return rq; -+ } -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); -+ -+ while (unlikely(task_on_rq_migrating(p))) -+ cpu_relax(); -+ } -+} -+ -+/* -+ * RQ-clock updating methods: -+ */ -+ -+static void update_rq_clock_task(struct rq *rq, s64 delta) -+{ -+/* -+ * In theory, the compile should just see 0 here, and optimize out the call -+ * to sched_rt_avg_update. But I don't trust it... -+ */ -+ s64 __maybe_unused steal = 0, irq_delta = 0; -+ -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; -+ -+ /* -+ * Since irq_time is only updated on {soft,}irq_exit, we might run into -+ * this case when a previous update_rq_clock() happened inside a -+ * {soft,}irq region. -+ * -+ * When this happens, we stop ->clock_task and only update the -+ * prev_irq_time stamp to account for the part that fit, so that a next -+ * update will consume the rest. This ensures ->clock_task is -+ * monotonic. -+ * -+ * It does however cause some slight miss-attribution of {soft,}irq -+ * time, a more accurate solution would be to update the irq_time using -+ * the current rq->clock timestamp, except that would require using -+ * atomic ops. 
-+ */ -+ if (irq_delta > delta) -+ irq_delta = delta; -+ -+ rq->prev_irq_time += irq_delta; -+ delta -= irq_delta; -+#endif -+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING -+ if (static_key_false((¶virt_steal_rq_enabled))) { -+ steal = paravirt_steal_clock(cpu_of(rq)); -+ steal -= rq->prev_steal_time_rq; -+ -+ if (unlikely(steal > delta)) -+ steal = delta; -+ -+ rq->prev_steal_time_rq += steal; -+ -+ delta -= steal; -+ } -+#endif -+ -+ rq->clock_task += delta; -+ -+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ -+ if ((irq_delta + steal)) -+ update_irq_load_avg(rq, irq_delta + steal); -+#endif -+} -+ -+static inline void update_rq_clock(struct rq *rq) -+{ -+ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; -+ -+ if (unlikely(delta <= 0)) -+ return; -+ rq->clock += delta; -+ update_rq_clock_task(rq, delta); -+} -+ -+static inline void update_task_priodl(struct task_struct *p) -+{ -+ p->priodl = (((u64) (p->prio))<<56) | ((p->deadline)>>8); -+} -+ -+/* -+ * Deadline is "now" in niffies + (offset by priority). Setting the deadline -+ * is the key to everything. It distributes CPU fairly amongst tasks of the -+ * same nice value, it proportions CPU according to nice level, it means the -+ * task that last woke up the longest ago has the earliest deadline, thus -+ * ensuring that interactive tasks get low latency on wake up. The CPU -+ * proportion works out to the square of the virtual deadline difference, so -+ * this equation will give nice 19 3% CPU compared to nice 0. -+ */ -+static inline u64 task_deadline_diff(const struct task_struct *p) -+{ -+ return sched_prio2deadline[TASK_USER_PRIO(p)]; -+} -+ -+static inline u64 static_deadline_diff(int static_prio) -+{ -+ return sched_prio2deadline[USER_PRIO(static_prio)]; -+} -+ -+/* -+ * The time_slice is only refilled when it is empty and that is when we set a -+ * new deadline for non-rt tasks. 
-+ */ -+static inline void time_slice_expired(struct task_struct *p, struct rq *rq) -+{ -+ p->time_slice = timeslice(); -+ if (p->prio >= NORMAL_PRIO) -+ p->deadline = rq->clock + task_deadline_diff(p); -+ -+ update_task_priodl(p); -+} -+ -+static inline struct task_struct *rq_first_queued_task(struct rq *rq) -+{ -+ struct skiplist_node *node = rq->sl_header.next[0]; -+ -+ if (node == &rq->sl_header) -+ return rq->idle; -+ -+ return skiplist_entry(node, struct task_struct, sl_node); -+} -+ -+static inline struct task_struct *rq_second_queued_task(struct rq *rq) -+{ -+ struct skiplist_node *node = rq->sl_header.next[0]->next[0]; -+ -+ if (node == &rq->sl_header) -+ return rq->idle; -+ -+ return skiplist_entry(node, struct task_struct, sl_node); -+} -+ -+static inline int is_second_in_rq(struct task_struct *p, struct rq *rq) -+{ -+ return (p->sl_node.prev[0]->prev[0] == &rq->sl_header); -+} -+ -+static const int task_dl_hash_tbl[] = { -+/* 0 4 8 12 */ -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, -+/* 16 20 24 28 */ -+ 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 5, 6, 7 -+}; -+ -+static inline int -+task_deadline_level(const struct task_struct *p, const struct rq *rq) -+{ -+ u64 delta = (rq->clock + sched_prio2deadline[39] - p->deadline) >> 23; -+ -+ delta = min((size_t)delta, ARRAY_SIZE(task_dl_hash_tbl) - 1); -+ return task_dl_hash_tbl[delta]; -+} -+ -+/* -+ * cmpxchg based fetch_or, macro so it works for different integer types -+ */ -+#define fetch_or(ptr, mask) \ -+ ({ \ -+ typeof(ptr) _ptr = (ptr); \ -+ typeof(mask) _mask = (mask); \ -+ typeof(*_ptr) _old, _val = *_ptr; \ -+ \ -+ for (;;) { \ -+ _old = cmpxchg(_ptr, _val, _val | _mask); \ -+ if (_old == _val) \ -+ break; \ -+ _val = _old; \ -+ } \ -+ _old; \ -+}) -+ -+#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) -+/* -+ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, -+ * this avoids any races wrt polling state changes and thereby avoids -+ * spurious IPIs. 
-+ */ -+static bool set_nr_and_not_polling(struct task_struct *p) -+{ -+ struct thread_info *ti = task_thread_info(p); -+ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); -+} -+ -+/* -+ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. -+ * -+ * If this returns true, then the idle task promises to call -+ * sched_ttwu_pending() and reschedule soon. -+ */ -+static bool set_nr_if_polling(struct task_struct *p) -+{ -+ struct thread_info *ti = task_thread_info(p); -+ typeof(ti->flags) old, val = READ_ONCE(ti->flags); -+ -+ for (;;) { -+ if (!(val & _TIF_POLLING_NRFLAG)) -+ return false; -+ if (val & _TIF_NEED_RESCHED) -+ return true; -+ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); -+ if (old == val) -+ break; -+ val = old; -+ } -+ return true; -+} -+ -+#else -+static bool set_nr_and_not_polling(struct task_struct *p) -+{ -+ set_tsk_need_resched(p); -+ return true; -+} -+ -+#ifdef CONFIG_SMP -+static bool set_nr_if_polling(struct task_struct *p) -+{ -+ return false; -+} -+#endif -+#endif -+ -+#ifdef CONFIG_SMP -+#ifdef CONFIG_SMT_NICE -+static void resched_cpu_if_curr_is(int cpu, int priority) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ rcu_read_lock(); -+ -+ if (rcu_dereference(rq->curr)->prio != priority) -+ goto out; -+ -+ if (set_nr_if_polling(rq->idle)) { -+ trace_sched_wake_idle_without_ipi(cpu); -+ } else { -+ if (!do_raw_spin_trylock(&rq->lock)) -+ goto out; -+ spin_acquire(&rq->lock.dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); -+ -+ if (priority == rq->curr->prio) -+ smp_send_reschedule(cpu); -+ /* Else CPU is not idle, do nothing here */ -+ -+ spin_release(&rq->lock.dep_map, 1, _RET_IP_); -+ do_raw_spin_unlock(&rq->lock); -+ } -+ -+out: -+ rcu_read_unlock(); -+} -+#endif /* CONFIG_SMT_NICE */ -+ -+static inline bool -+__update_cpumasks_bitmap(int cpu, unsigned long *plevel, unsigned long level, -+ cpumask_t cpumasks[], unsigned long bitmap[]) -+{ -+ if (*plevel == level) -+ return false; -+ -+ 
cpumask_clear_cpu(cpu, cpumasks + *plevel); -+ if (cpumask_empty(cpumasks + *plevel)) -+ clear_bit(*plevel, bitmap); -+ cpumask_set_cpu(cpu, cpumasks + level); -+ set_bit(level, bitmap); -+ -+ *plevel = level; -+ -+ return true; -+} -+ -+static inline int -+task_running_policy_level(const struct task_struct *p, const struct rq *rq) -+{ -+ int prio = p->prio; -+ -+ if (NORMAL_PRIO == prio) -+ return SCHED_RQ_NORMAL_0 + task_deadline_level(p, rq); -+ -+ if (ISO_PRIO == prio) -+ return SCHED_RQ_ISO; -+ if (prio < MAX_RT_PRIO) -+ return SCHED_RQ_RT; -+ return PRIO_LIMIT - prio; -+} -+ -+static inline void update_sched_rq_queued_masks_normal(struct rq *rq) -+{ -+ struct task_struct *p = rq_first_queued_task(rq); -+ -+ if (p->prio != NORMAL_PRIO) -+ return; -+ -+ __update_cpumasks_bitmap(cpu_of(rq), &rq->queued_level, -+ task_running_policy_level(p, rq), -+ &sched_rq_queued_masks[0], -+ &sched_rq_queued_masks_bitmap[0]); -+} -+ -+#ifdef CONFIG_SMT_NICE -+static inline void update_sched_cpu_psg_mask(const int cpu) -+{ -+ cpumask_t tmp; -+ -+ cpumask_or(&tmp, &sched_rq_queued_masks[SCHED_RQ_EMPTY], -+ &sched_rq_queued_masks[SCHED_RQ_IDLE]); -+ cpumask_and(&tmp, &tmp, cpu_smt_mask(cpu)); -+ if (cpumask_equal(&tmp, cpu_smt_mask(cpu))) -+ cpumask_or(&sched_cpu_psg_mask, &sched_cpu_psg_mask, -+ cpu_smt_mask(cpu)); -+ else -+ cpumask_andnot(&sched_cpu_psg_mask, &sched_cpu_psg_mask, -+ cpu_smt_mask(cpu)); -+} -+#endif -+ -+static inline void update_sched_rq_queued_masks(struct rq *rq) -+{ -+ int cpu = cpu_of(rq); -+ struct task_struct *p = rq_first_queued_task(rq); -+ unsigned long level; -+#ifdef CONFIG_SCHED_SMT -+ unsigned long last_level = rq->queued_level; -+#endif -+ -+ level = task_running_policy_level(p, rq); -+ sched_rq_prio[cpu] = p->prio; -+ -+ if (!__update_cpumasks_bitmap(cpu, &rq->queued_level, level, -+ &sched_rq_queued_masks[0], -+ &sched_rq_queued_masks_bitmap[0])) -+ return; -+ -+#ifdef CONFIG_SCHED_SMT -+ if (cpu == per_cpu(sched_sibling_cpu, cpu)) -+ return; 
-+ -+ if (SCHED_RQ_EMPTY == last_level) { -+ cpumask_andnot(&sched_cpu_sg_idle_mask, &sched_cpu_sg_idle_mask, -+ cpu_smt_mask(cpu)); -+ } else if (SCHED_RQ_EMPTY == level) { -+ cpumask_t tmp; -+ -+ cpumask_and(&tmp, cpu_smt_mask(cpu), -+ &sched_rq_queued_masks[SCHED_RQ_EMPTY]); -+ if (cpumask_equal(&tmp, cpu_smt_mask(cpu))) -+ cpumask_or(&sched_cpu_sg_idle_mask, cpu_smt_mask(cpu), -+ &sched_cpu_sg_idle_mask); -+ } -+ -+#ifdef CONFIG_SMT_NICE -+ if (level <= SCHED_RQ_IDLE && last_level > SCHED_RQ_IDLE) { -+ cpumask_clear_cpu(per_cpu(sched_sibling_cpu, cpu), -+ &sched_smt_supressed_mask); -+ update_sched_cpu_psg_mask(cpu); -+ resched_cpu_if_curr_is(per_cpu(sched_sibling_cpu, cpu), PRIO_LIMIT); -+ } else if (last_level <= SCHED_RQ_IDLE && level > SCHED_RQ_IDLE) { -+ cpumask_set_cpu(per_cpu(sched_sibling_cpu, cpu), -+ &sched_smt_supressed_mask); -+ update_sched_cpu_psg_mask(cpu); -+ resched_cpu_if_curr_is(per_cpu(sched_sibling_cpu, cpu), IDLE_PRIO); -+ } -+#endif /* CONFIG_SMT_NICE */ -+#endif -+} -+ -+static inline void update_sched_rq_pending_masks(struct rq *rq) -+{ -+ unsigned long level; -+ struct task_struct *p = rq_second_queued_task(rq); -+ -+ level = task_running_policy_level(p, rq); -+ -+ __update_cpumasks_bitmap(cpu_of(rq), &rq->pending_level, level, -+ &sched_rq_pending_masks[0], -+ &sched_rq_pending_masks_bitmap[0]); -+} -+ -+#else /* CONFIG_SMP */ -+static inline void update_sched_rq_queued_masks(struct rq *rq) {} -+static inline void update_sched_rq_queued_masks_normal(struct rq *rq) {} -+static inline void update_sched_rq_pending_masks(struct rq *rq) {} -+#endif -+ -+#ifdef CONFIG_NO_HZ_FULL -+/* -+ * Tick may be needed by tasks in the runqueue depending on their policy and -+ * requirements. If tick is needed, lets send the target an IPI to kick it out -+ * of nohz mode if necessary. 
-+ */ -+static inline void sched_update_tick_dependency(struct rq *rq) -+{ -+ int cpu; -+ -+ if (!tick_nohz_full_enabled()) -+ return; -+ -+ cpu = cpu_of(rq); -+ -+ if (!tick_nohz_full_cpu(cpu)) -+ return; -+ -+ if (rq->nr_running < 2) -+ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); -+ else -+ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); -+} -+#else /* !CONFIG_NO_HZ_FULL */ -+static inline void sched_update_tick_dependency(struct rq *rq) { } -+#endif -+ -+/* -+ * Removing from the runqueue. Deleting a task from the skip list is done -+ * via the stored node reference in the task struct and does not require a full -+ * look up. Thus it occurs in O(k) time where k is the "level" of the list the -+ * task was stored at - usually < 4, max 16. -+ * -+ * Context: rq->lock -+ */ -+static inline void dequeue_task(struct task_struct *p, struct rq *rq, int flags) -+{ -+ lockdep_assert_held(&rq->lock); -+ -+ WARN_ONCE(task_rq(p) != rq, "pds: dequeue task reside on cpu%d from cpu%d\n", -+ task_cpu(p), cpu_of(rq)); -+ if (skiplist_del_init(&rq->sl_header, &p->sl_node)) { -+ update_sched_rq_queued_masks(rq); -+ update_sched_rq_pending_masks(rq); -+ } else if (is_second_in_rq(p, rq)) -+ update_sched_rq_pending_masks(rq); -+ rq->nr_running--; -+ -+ sched_update_tick_dependency(rq); -+ psi_dequeue(p, flags & DEQUEUE_SLEEP); -+ -+ sched_info_dequeued(rq, p); -+} -+ -+/* -+ * To determine if it's safe for a task of SCHED_IDLE to actually run as -+ * an idle task, we ensure none of the following conditions are met. -+ */ -+static inline bool idleprio_suitable(struct task_struct *p) -+{ -+ return (!freezing(p) && !signal_pending(p) && -+ !(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING))); -+} -+ -+/* -+ * pds_skiplist_random_level -- Returns a pseudo-random level number for skip -+ * list node which is used in PDS run queue. 
-+ * -+ * In current implementation, based on testing, the first 8 bits in microseconds -+ * of niffies are suitable for random level population. -+ * find_first_bit() is used to satisfy p = 0.5 between each levels, and there -+ * should be platform hardware supported instruction(known as ctz/clz) to speed -+ * up this function. -+ * The skiplist level for a task is populated when task is created and doesn't -+ * change in task's life time. When task is being inserted into run queue, this -+ * skiplist level is set to task's sl_node->level, the skiplist insert function -+ * may change it based on current level of the skip lsit. -+ */ -+static inline int pds_skiplist_random_level(const struct task_struct *p) -+{ -+ long unsigned int randseed; -+ -+ /* -+ * 1. Some architectures don't have better than microsecond resolution -+ * so mask out ~microseconds as a factor of the random seed for skiplist -+ * insertion. -+ * 2. Use address of task structure pointer as another factor of the -+ * random seed for task burst forking scenario. -+ */ -+ randseed = (task_rq(p)->clock ^ (long unsigned int)p) >> 10; -+ -+ return find_first_bit(&randseed, NUM_SKIPLIST_LEVEL - 1); -+} -+ -+/** -+ * pds_skiplist_task_search -- search function used in PDS run queue skip list -+ * node insert operation. -+ * @it: iterator pointer to the node in the skip list -+ * @node: pointer to the skiplist_node to be inserted -+ * -+ * Returns true if key of @it is less or equal to key value of @node, otherwise -+ * false. -+ */ -+static inline bool -+pds_skiplist_task_search(struct skiplist_node *it, struct skiplist_node *node) -+{ -+ return (skiplist_entry(it, struct task_struct, sl_node)->priodl <= -+ skiplist_entry(node, struct task_struct, sl_node)->priodl); -+} -+ -+/* -+ * Define the skip list insert function for PDS -+ */ -+DEFINE_SKIPLIST_INSERT_FUNC(pds_skiplist_insert, pds_skiplist_task_search); -+ -+/* -+ * Adding task to the runqueue. 
-+ * -+ * Context: rq->lock -+ */ -+static inline void enqueue_task(struct task_struct *p, struct rq *rq, int flags) -+{ -+ lockdep_assert_held(&rq->lock); -+ -+ WARN_ONCE(task_rq(p) != rq, "pds: enqueue task reside on cpu%d to cpu%d\n", -+ task_cpu(p), cpu_of(rq)); -+ -+ p->sl_node.level = p->sl_level; -+ if (pds_skiplist_insert(&rq->sl_header, &p->sl_node)) { -+ update_sched_rq_queued_masks(rq); -+ update_sched_rq_pending_masks(rq); -+ } else if (is_second_in_rq(p, rq)) -+ update_sched_rq_pending_masks(rq); -+ rq->nr_running++; -+ -+ sched_update_tick_dependency(rq); -+ -+ sched_info_queued(rq, p); -+ psi_enqueue(p, flags); -+ -+ /* -+ * If in_iowait is set, the code below may not trigger any cpufreq -+ * utilization updates, so do it here explicitly with the IOWAIT flag -+ * passed. -+ */ -+ if (p->in_iowait) -+ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT); -+} -+ -+static inline void requeue_task(struct task_struct *p, struct rq *rq) -+{ -+ bool b_first, b_second; -+ -+ lockdep_assert_held(&rq->lock); -+ -+ WARN_ONCE(task_rq(p) != rq, "pds: cpu[%d] requeue task reside on cpu%d\n", -+ cpu_of(rq), task_cpu(p)); -+ -+ b_first = skiplist_del_init(&rq->sl_header, &p->sl_node); -+ b_second = is_second_in_rq(p, rq); -+ -+ p->sl_node.level = p->sl_level; -+ if (pds_skiplist_insert(&rq->sl_header, &p->sl_node) || b_first) { -+ update_sched_rq_queued_masks(rq); -+ update_sched_rq_pending_masks(rq); -+ } else if (is_second_in_rq(p, rq) || b_second) -+ update_sched_rq_pending_masks(rq); -+} -+ -+/* -+ * resched_curr - mark rq's current task 'to be rescheduled now'. -+ * -+ * On UP this means the setting of the need_resched flag, on SMP it -+ * might also involve a cross-CPU call to trigger the scheduler on -+ * the target CPU. 
-+ */ -+void resched_curr(struct rq *rq) -+{ -+ struct task_struct *curr = rq->curr; -+ int cpu; -+ -+ lockdep_assert_held(&rq->lock); -+ -+ if (test_tsk_need_resched(curr)) -+ return; -+ -+ cpu = cpu_of(rq); -+ if (cpu == smp_processor_id()) { -+ set_tsk_need_resched(curr); -+ set_preempt_need_resched(); -+ return; -+ } -+ -+ if (set_nr_and_not_polling(curr)) -+ smp_send_reschedule(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); -+} -+ -+static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) -+{ -+ struct task_struct *curr = rq->curr; -+ -+ if (curr->prio == PRIO_LIMIT) -+ resched_curr(rq); -+ -+ if (task_running_idle(p)) -+ return; -+ -+ if (p->priodl < curr->priodl) -+ resched_curr(rq); -+} -+ -+#ifdef CONFIG_SCHED_HRTICK -+/* -+ * Use HR-timers to deliver accurate preemption points. -+ */ -+ -+static void hrtick_clear(struct rq *rq) -+{ -+ if (hrtimer_active(&rq->hrtick_timer)) -+ hrtimer_cancel(&rq->hrtick_timer); -+} -+ -+/* -+ * High-resolution timer tick. -+ * Runs from hardirq context with interrupts disabled. 
-+ */ -+static enum hrtimer_restart hrtick(struct hrtimer *timer) -+{ -+ struct rq *rq = container_of(timer, struct rq, hrtick_timer); -+ struct task_struct *p; -+ -+ WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); -+ -+ raw_spin_lock(&rq->lock); -+ p = rq->curr; -+ p->time_slice = 0; -+ resched_curr(rq); -+ raw_spin_unlock(&rq->lock); -+ -+ return HRTIMER_NORESTART; -+} -+ -+/* -+ * Use hrtick when: -+ * - enabled by features -+ * - hrtimer is actually high res -+ */ -+static inline int hrtick_enabled(struct rq *rq) -+{ -+ /** -+ * PDS doesn't support sched_feat yet -+ if (!sched_feat(HRTICK)) -+ return 0; -+ */ -+ if (!cpu_active(cpu_of(rq))) -+ return 0; -+ return hrtimer_is_hres_active(&rq->hrtick_timer); -+} -+ -+#ifdef CONFIG_SMP -+ -+static void __hrtick_restart(struct rq *rq) -+{ -+ struct hrtimer *timer = &rq->hrtick_timer; -+ -+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); -+} -+ -+/* -+ * called from hardirq (IPI) context -+ */ -+static void __hrtick_start(void *arg) -+{ -+ struct rq *rq = arg; -+ -+ raw_spin_lock(&rq->lock); -+ __hrtick_restart(rq); -+ rq->hrtick_csd_pending = 0; -+ raw_spin_unlock(&rq->lock); -+} -+ -+/* -+ * Called to set the hrtick timer state. -+ * -+ * called with rq->lock held and irqs disabled -+ */ -+void hrtick_start(struct rq *rq, u64 delay) -+{ -+ struct hrtimer *timer = &rq->hrtick_timer; -+ ktime_t time; -+ s64 delta; -+ -+ /* -+ * Don't schedule slices shorter than 10000ns, that just -+ * doesn't make sense and can cause timer DoS. -+ */ -+ delta = max_t(s64, delay, 10000LL); -+ time = ktime_add_ns(timer->base->get_time(), delta); -+ -+ hrtimer_set_expires(timer, time); -+ -+ if (rq == this_rq()) { -+ __hrtick_restart(rq); -+ } else if (!rq->hrtick_csd_pending) { -+ smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); -+ rq->hrtick_csd_pending = 1; -+ } -+} -+ -+#else -+/* -+ * Called to set the hrtick timer state. 
-+ * -+ * called with rq->lock held and irqs disabled -+ */ -+void hrtick_start(struct rq *rq, u64 delay) -+{ -+ /* -+ * Don't schedule slices shorter than 10000ns, that just -+ * doesn't make sense. Rely on vruntime for fairness. -+ */ -+ delay = max_t(u64, delay, 10000LL); -+ hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), -+ HRTIMER_MODE_REL_PINNED_HARD); -+} -+#endif /* CONFIG_SMP */ -+ -+static void hrtick_rq_init(struct rq *rq) -+{ -+#ifdef CONFIG_SMP -+ rq->hrtick_csd_pending = 0; -+ -+ rq->hrtick_csd.flags = 0; -+ rq->hrtick_csd.func = __hrtick_start; -+ rq->hrtick_csd.info = rq; -+#endif -+ -+ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); -+ rq->hrtick_timer.function = hrtick; -+} -+ -+static inline int rq_dither(struct rq *rq) -+{ -+ if ((rq->clock - rq->last_tick > HALF_JIFFY_NS) || hrtick_enabled(rq)) -+ return 0; -+ -+ return HALF_JIFFY_NS; -+} -+ -+#else /* CONFIG_SCHED_HRTICK */ -+static inline int hrtick_enabled(struct rq *rq) -+{ -+ return 0; -+} -+ -+static inline void hrtick_clear(struct rq *rq) -+{ -+} -+ -+static inline void hrtick_rq_init(struct rq *rq) -+{ -+} -+ -+static inline int rq_dither(struct rq *rq) -+{ -+ return (rq->clock - rq->last_tick > HALF_JIFFY_NS)? 0:HALF_JIFFY_NS; -+} -+#endif /* CONFIG_SCHED_HRTICK */ -+ -+static inline int normal_prio(struct task_struct *p) -+{ -+ static const int policy_to_prio[] = { -+ NORMAL_PRIO, /* SCHED_NORMAL */ -+ 0, /* SCHED_FIFO */ -+ 0, /* SCHED_RR */ -+ IDLE_PRIO, /* SCHED_BATCH */ -+ ISO_PRIO, /* SCHED_ISO */ -+ IDLE_PRIO /* SCHED_IDLE */ -+ }; -+ -+ if (task_has_rt_policy(p)) -+ return MAX_RT_PRIO - 1 - p->rt_priority; -+ return policy_to_prio[p->policy]; -+} -+ -+/* -+ * Calculate the current priority, i.e. the priority -+ * taken into account by the scheduler. This value might -+ * be boosted by RT tasks as it will be RT if the task got -+ * RT-boosted. If not then it returns p->normal_prio. 
-+ */ -+static int effective_prio(struct task_struct *p) -+{ -+ p->normal_prio = normal_prio(p); -+ /* -+ * If we are RT tasks or we were boosted to RT priority, -+ * keep the priority unchanged. Otherwise, update priority -+ * to the normal priority: -+ */ -+ if (!rt_prio(p->prio)) -+ return p->normal_prio; -+ return p->prio; -+} -+ -+/* -+ * activate_task - move a task to the runqueue. -+ * -+ * Context: rq->lock -+ */ -+static void activate_task(struct task_struct *p, struct rq *rq) -+{ -+ if (task_contributes_to_load(p)) -+ rq->nr_uninterruptible--; -+ enqueue_task(p, rq, ENQUEUE_WAKEUP); -+ p->on_rq = 1; -+ cpufreq_update_this_cpu(rq, 0); -+} -+ -+/* -+ * deactivate_task - remove a task from the runqueue. -+ * -+ * Context: rq->lock -+ */ -+static inline void deactivate_task(struct task_struct *p, struct rq *rq) -+{ -+ if (task_contributes_to_load(p)) -+ rq->nr_uninterruptible++; -+ dequeue_task(p, rq, DEQUEUE_SLEEP); -+ p->on_rq = 0; -+ cpufreq_update_this_cpu(rq, 0); -+} -+ -+static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) -+{ -+#ifdef CONFIG_SMP -+ /* -+ * After ->cpu is set up to a new value, task_access_lock(p, ...) can be -+ * successfully executed on another CPU. We must ensure that updates of -+ * per-task data have been completed by this moment. -+ */ -+ smp_wmb(); -+ -+#ifdef CONFIG_THREAD_INFO_IN_TASK -+ WRITE_ONCE(p->cpu, cpu); -+#else -+ WRITE_ONCE(task_thread_info(p)->cpu, cpu); -+#endif -+#endif -+} -+ -+#ifdef CONFIG_SMP -+void set_task_cpu(struct task_struct *p, unsigned int new_cpu) -+{ -+#ifdef CONFIG_SCHED_DEBUG -+ /* -+ * We should never call set_task_cpu() on a blocked task, -+ * ttwu() will sort out the placement. -+ */ -+ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && -+ !p->on_rq); -+#ifdef CONFIG_LOCKDEP -+ /* -+ * The caller should hold either p->pi_lock or rq->lock, when changing -+ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. 
-+ * -+ * sched_move_task() holds both and thus holding either pins the cgroup, -+ * see task_group(). -+ */ -+ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || -+ lockdep_is_held(&task_rq(p)->lock))); -+#endif -+ /* -+ * Clearly, migrating tasks to offline CPUs is a fairly daft thing. -+ */ -+ WARN_ON_ONCE(!cpu_online(new_cpu)); -+#endif -+ if (task_cpu(p) == new_cpu) -+ return; -+ trace_sched_migrate_task(p, new_cpu); -+ rseq_migrate(p); -+ perf_event_task_migrate(p); -+ -+ __set_task_cpu(p, new_cpu); -+} -+ -+static inline bool is_per_cpu_kthread(struct task_struct *p) -+{ -+ return ((p->flags & PF_KTHREAD) && (1 == p->nr_cpus_allowed)); -+} -+ -+/* -+ * Per-CPU kthreads are allowed to run on !active && online CPUs, see -+ * __set_cpus_allowed_ptr() and select_fallback_rq(). -+ */ -+static inline bool is_cpu_allowed(struct task_struct *p, int cpu) -+{ -+ if (!cpumask_test_cpu(cpu, &p->cpus_mask)) -+ return false; -+ -+ if (is_per_cpu_kthread(p)) -+ return cpu_online(cpu); -+ -+ return cpu_active(cpu); -+} -+ -+/* -+ * This is how migration works: -+ * -+ * 1) we invoke migration_cpu_stop() on the target CPU using -+ * stop_one_cpu(). -+ * 2) stopper starts to run (implicitly forcing the migrated thread -+ * off the CPU) -+ * 3) it checks whether the migrated task is still in the wrong runqueue. -+ * 4) if it's in the wrong runqueue then the migration thread removes -+ * it and puts it into the right queue. -+ * 5) stopper completes and stop_one_cpu() returns and the migration -+ * is done. 
-+ */ -+ -+/* -+ * detach_task() -- detach the task for the migration specified in @target_cpu -+ */ -+static void detach_task(struct rq *rq, struct task_struct *p, int target_cpu) -+{ -+ lockdep_assert_held(&rq->lock); -+ -+ WRITE_ONCE(p->on_rq ,TASK_ON_RQ_MIGRATING); -+ if (task_contributes_to_load(p)) -+ rq->nr_uninterruptible++; -+ dequeue_task(p, rq, 0); -+ -+ set_task_cpu(p, target_cpu); -+} -+ -+/* -+ * attach_task() -- attach the task detached by detach_task() to its new rq. -+ */ -+static void attach_task(struct rq *rq, struct task_struct *p) -+{ -+ lockdep_assert_held(&rq->lock); -+ -+ BUG_ON(task_rq(p) != rq); -+ -+ if (task_contributes_to_load(p)) -+ rq->nr_uninterruptible--; -+ enqueue_task(p, rq, 0); -+ p->on_rq = TASK_ON_RQ_QUEUED; -+ cpufreq_update_this_cpu(rq, 0); -+} -+ -+/* -+ * move_queued_task - move a queued task to new rq. -+ * -+ * Returns (locked) new rq. Old rq's lock is released. -+ */ -+static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int -+ new_cpu) -+{ -+ detach_task(rq, p, new_cpu); -+ raw_spin_unlock(&rq->lock); -+ -+ rq = cpu_rq(new_cpu); -+ -+ raw_spin_lock(&rq->lock); -+ update_rq_clock(rq); -+ -+ attach_task(rq, p); -+ -+ check_preempt_curr(rq, p); -+ -+ return rq; -+} -+ -+struct migration_arg { -+ struct task_struct *task; -+ int dest_cpu; -+}; -+ -+/* -+ * Move (not current) task off this CPU, onto the destination CPU. We're doing -+ * this because either it can't run here any more (set_cpus_allowed() -+ * away from this CPU, or CPU going down), or because we're -+ * attempting to rebalance this task on exec (sched_exec). -+ * -+ * So we race with normal scheduler movements, but that's OK, as long -+ * as the task is no longer on this CPU. -+ */ -+static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int -+ dest_cpu) -+{ -+ /* Affinity changed (again). 
*/ -+ if (!is_cpu_allowed(p, dest_cpu)) -+ return rq; -+ -+ update_rq_clock(rq); -+ return move_queued_task(rq, p, dest_cpu); -+} -+ -+/* -+ * migration_cpu_stop - this will be executed by a highprio stopper thread -+ * and performs thread migration by bumping thread off CPU then -+ * 'pushing' onto another runqueue. -+ */ -+static int migration_cpu_stop(void *data) -+{ -+ struct migration_arg *arg = data; -+ struct task_struct *p = arg->task; -+ struct rq *rq = this_rq(); -+ -+ /* -+ * The original target CPU might have gone down and we might -+ * be on another CPU but it doesn't matter. -+ */ -+ local_irq_disable(); -+ -+ raw_spin_lock(&p->pi_lock); -+ raw_spin_lock(&rq->lock); -+ /* -+ * If task_rq(p) != rq, it cannot be migrated here, because we're -+ * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because -+ * we're holding p->pi_lock. -+ */ -+ if (task_rq(p) == rq) -+ if (task_on_rq_queued(p)) -+ rq = __migrate_task(rq, p, arg->dest_cpu); -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock(&p->pi_lock); -+ -+ local_irq_enable(); -+ return 0; -+} -+ -+static inline void -+set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ cpumask_copy(&p->cpus_mask, new_mask); -+ p->nr_cpus_allowed = cpumask_weight(new_mask); -+} -+ -+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ set_cpus_allowed_common(p, new_mask); -+} -+#endif -+ -+/* Enter with rq lock held. We know p is on the local CPU */ -+static inline void __set_tsk_resched(struct task_struct *p) -+{ -+ set_tsk_need_resched(p); -+ set_preempt_need_resched(); -+} -+ -+/** -+ * task_curr - is this task currently executing on a CPU? -+ * @p: the task in question. -+ * -+ * Return: 1 if the task is currently executing. 0 otherwise. -+ */ -+inline int task_curr(const struct task_struct *p) -+{ -+ return cpu_curr(task_cpu(p)) == p; -+} -+ -+#ifdef CONFIG_SMP -+/* -+ * wait_task_inactive - wait for a thread to unschedule. 
-+ * -+ * If @match_state is nonzero, it's the @p->state value just checked and -+ * not expected to change. If it changes, i.e. @p might have woken up, -+ * then return zero. When we succeed in waiting for @p to be off its CPU, -+ * we return a positive number (its total switch count). If a second call -+ * a short while later returns the same number, the caller can be sure that -+ * @p has remained unscheduled the whole time. -+ * -+ * The caller must ensure that the task *will* unschedule sometime soon, -+ * else this function might spin for a *long* time. This function can't -+ * be called with interrupts off, or it may introduce deadlock with -+ * smp_call_function() if an IPI is sent by the same process we are -+ * waiting to become inactive. -+ */ -+unsigned long wait_task_inactive(struct task_struct *p, long match_state) -+{ -+ unsigned long flags; -+ bool running, on_rq; -+ unsigned long ncsw; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ -+ for (;;) { -+ rq = task_rq(p); -+ -+ /* -+ * If the task is actively running on another CPU -+ * still, just relax and busy-wait without holding -+ * any locks. -+ * -+ * NOTE! Since we don't hold any locks, it's not -+ * even sure that "rq" stays as the right runqueue! -+ * But we don't care, since this will return false -+ * if the runqueue has changed and p is actually now -+ * running somewhere else! -+ */ -+ while (task_running(p) && p == rq->curr) { -+ if (match_state && unlikely(p->state != match_state)) -+ return 0; -+ cpu_relax(); -+ } -+ -+ /* -+ * Ok, time to look more closely! We need the rq -+ * lock now, to be *sure*. If we're wrong, we'll -+ * just go back and repeat. 
-+ */ -+ task_access_lock_irqsave(p, &lock, &flags); -+ trace_sched_wait_task(p); -+ running = task_running(p); -+ on_rq = p->on_rq; -+ ncsw = 0; -+ if (!match_state || p->state == match_state) -+ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ -+ task_access_unlock_irqrestore(p, lock, &flags); -+ -+ /* -+ * If it changed from the expected state, bail out now. -+ */ -+ if (unlikely(!ncsw)) -+ break; -+ -+ /* -+ * Was it really running after all now that we -+ * checked with the proper locks actually held? -+ * -+ * Oops. Go back and try again.. -+ */ -+ if (unlikely(running)) { -+ cpu_relax(); -+ continue; -+ } -+ -+ /* -+ * It's not enough that it's not actively running, -+ * it must be off the runqueue _entirely_, and not -+ * preempted! -+ * -+ * So if it was still runnable (but just not actively -+ * running right now), it's preempted, and we should -+ * yield - it could be a while. -+ */ -+ if (unlikely(on_rq)) { -+ ktime_t to = NSEC_PER_SEC / HZ; -+ -+ set_current_state(TASK_UNINTERRUPTIBLE); -+ schedule_hrtimeout(&to, HRTIMER_MODE_REL); -+ continue; -+ } -+ -+ /* -+ * Ahh, all good. It wasn't running, and it wasn't -+ * runnable, which means that it will never become -+ * running in the future either. We're all done! -+ */ -+ break; -+ } -+ -+ return ncsw; -+} -+ -+/*** -+ * kick_process - kick a running thread to enter/exit the kernel -+ * @p: the to-be-kicked thread -+ * -+ * Cause a process which is running on another CPU to enter -+ * kernel-mode, without any delay. (to get signals handled.) -+ * -+ * NOTE: this function doesn't have to take the runqueue lock, -+ * because all it wants to ensure is that the remote task enters -+ * the kernel. If the IPI races and the task has been migrated -+ * to another CPU then no harm is done and the purpose has been -+ * achieved as well. 
-+ */ -+void kick_process(struct task_struct *p) -+{ -+ int cpu; -+ -+ preempt_disable(); -+ cpu = task_cpu(p); -+ if ((cpu != smp_processor_id()) && task_curr(p)) -+ smp_send_reschedule(cpu); -+ preempt_enable(); -+} -+EXPORT_SYMBOL_GPL(kick_process); -+ -+/* -+ * ->cpus_mask is protected by both rq->lock and p->pi_lock -+ * -+ * A few notes on cpu_active vs cpu_online: -+ * -+ * - cpu_active must be a subset of cpu_online -+ * -+ * - on CPU-up we allow per-CPU kthreads on the online && !active CPU, -+ * see __set_cpus_allowed_ptr(). At this point the newly online -+ * CPU isn't yet part of the sched domains, and balancing will not -+ * see it. -+ * -+ * - on cpu-down we clear cpu_active() to mask the sched domains and -+ * avoid the load balancer to place new tasks on the to be removed -+ * CPU. Existing tasks will remain running there and will be taken -+ * off. -+ * -+ * This means that fallback selection must not select !active CPUs. -+ * And can assume that any active CPU must be online. Conversely -+ * select_task_rq() below may allow selection of !active CPUs in order -+ * to satisfy the above rules. -+ */ -+static int select_fallback_rq(int cpu, struct task_struct *p) -+{ -+ int nid = cpu_to_node(cpu); -+ const struct cpumask *nodemask = NULL; -+ enum { cpuset, possible, fail } state = cpuset; -+ int dest_cpu; -+ -+ /* -+ * If the node that the CPU is on has been offlined, cpu_to_node() -+ * will return -1. There is no CPU on the node, and we should -+ * select the CPU on the other node. -+ */ -+ if (nid != -1) { -+ nodemask = cpumask_of_node(nid); -+ -+ /* Look for allowed, online CPU in same node. */ -+ for_each_cpu(dest_cpu, nodemask) { -+ if (!cpu_active(dest_cpu)) -+ continue; -+ if (cpumask_test_cpu(dest_cpu, &p->cpus_mask)) -+ return dest_cpu; -+ } -+ } -+ -+ for (;;) { -+ /* Any allowed, online CPU? */ -+ for_each_cpu(dest_cpu, &p->cpus_mask) { -+ if (!is_cpu_allowed(p, dest_cpu)) -+ continue; -+ goto out; -+ } -+ -+ /* No more Mr. Nice Guy. 
*/ -+ switch (state) { -+ case cpuset: -+ if (IS_ENABLED(CONFIG_CPUSETS)) { -+ cpuset_cpus_allowed_fallback(p); -+ state = possible; -+ break; -+ } -+ /* Fall-through */ -+ case possible: -+ do_set_cpus_allowed(p, cpu_possible_mask); -+ state = fail; -+ break; -+ -+ case fail: -+ BUG(); -+ break; -+ } -+ } -+ -+out: -+ if (state != cpuset) { -+ /* -+ * Don't tell them about moving exiting tasks or -+ * kernel threads (both mm NULL), since they never -+ * leave kernel. -+ */ -+ if (p->mm && printk_ratelimit()) { -+ printk_deferred("process %d (%s) no longer affine to cpu%d\n", -+ task_pid_nr(p), p->comm, cpu); -+ } -+ } -+ -+ return dest_cpu; -+} -+ -+static inline int best_mask_cpu(int cpu, const cpumask_t *cpumask) -+{ -+ cpumask_t *mask; -+ -+ if (cpumask_test_cpu(cpu, cpumask)) -+ return cpu; -+ -+ mask = &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[0]); -+ while ((cpu = cpumask_any_and(cpumask, mask)) >= nr_cpu_ids) -+ mask++; -+ -+ return cpu; -+} -+ -+/* -+ * task_preemptible_rq - return the rq which the given task can preempt on -+ * @p: task wants to preempt CPU -+ * @only_preempt_low_policy: indicate only preempt rq running low policy than @p -+ */ -+static inline int -+task_preemptible_rq_idle(struct task_struct *p, cpumask_t *chk_mask) -+{ -+ cpumask_t tmp; -+ -+#ifdef CONFIG_SCHED_SMT -+ if (cpumask_and(&tmp, chk_mask, &sched_cpu_sg_idle_mask)) -+ return best_mask_cpu(task_cpu(p), &tmp); -+#endif -+ -+#ifdef CONFIG_SMT_NICE -+ /* Only ttwu on cpu which is not smt supressed */ -+ if (cpumask_andnot(&tmp, chk_mask, &sched_smt_supressed_mask)) { -+ cpumask_t t; -+ if (cpumask_and(&t, &tmp, &sched_rq_queued_masks[SCHED_RQ_EMPTY])) -+ return best_mask_cpu(task_cpu(p), &t); -+ return best_mask_cpu(task_cpu(p), &tmp); -+ } -+#endif -+ -+ if (cpumask_and(&tmp, chk_mask, &sched_rq_queued_masks[SCHED_RQ_EMPTY])) -+ return best_mask_cpu(task_cpu(p), &tmp); -+ return best_mask_cpu(task_cpu(p), chk_mask); -+} -+ -+static inline int -+task_preemptible_rq(struct 
task_struct *p, cpumask_t *chk_mask, -+ int preempt_level) -+{ -+ cpumask_t tmp; -+ int level; -+ -+#ifdef CONFIG_SCHED_SMT -+#ifdef CONFIG_SMT_NICE -+ if (cpumask_and(&tmp, chk_mask, &sched_cpu_psg_mask)) -+ return best_mask_cpu(task_cpu(p), &tmp); -+#else -+ if (cpumask_and(&tmp, chk_mask, &sched_cpu_sg_idle_mask)) -+ return best_mask_cpu(task_cpu(p), &tmp); -+#endif -+#endif -+ -+ level = find_first_bit(sched_rq_queued_masks_bitmap, -+ NR_SCHED_RQ_QUEUED_LEVEL); -+ -+ while (level < preempt_level) { -+ if (cpumask_and(&tmp, chk_mask, &sched_rq_queued_masks[level])) -+ return best_mask_cpu(task_cpu(p), &tmp); -+ -+ level = find_next_bit(sched_rq_queued_masks_bitmap, -+ NR_SCHED_RQ_QUEUED_LEVEL, -+ level + 1); -+ } -+ -+ if (unlikely(SCHED_RQ_RT == level && -+ level == preempt_level && -+ cpumask_and(&tmp, chk_mask, -+ &sched_rq_queued_masks[SCHED_RQ_RT]))) { -+ unsigned int cpu; -+ -+ for_each_cpu (cpu, &tmp) -+ if (p->prio < sched_rq_prio[cpu]) -+ return cpu; -+ } -+ -+ return best_mask_cpu(task_cpu(p), chk_mask); -+} -+ -+/* -+ * wake flags -+ */ -+#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ -+#define WF_FORK 0x02 /* child wakeup after fork */ -+#define WF_MIGRATED 0x04 /* internal use, task got migrated */ -+ -+static inline int select_task_rq(struct task_struct *p) -+{ -+ cpumask_t chk_mask; -+ -+ if (unlikely(!cpumask_and(&chk_mask, &p->cpus_mask, cpu_online_mask))) -+ return select_fallback_rq(task_cpu(p), p); -+ -+ /* Check IDLE tasks suitable to run normal priority */ -+ if (idleprio_task(p)) { -+ if (idleprio_suitable(p)) { -+ p->prio = p->normal_prio; -+ update_task_priodl(p); -+ return task_preemptible_rq_idle(p, &chk_mask); -+ } -+ p->prio = NORMAL_PRIO; -+ update_task_priodl(p); -+ } -+ -+ return task_preemptible_rq(p, &chk_mask, -+ task_running_policy_level(p, this_rq())); -+} -+#else /* CONFIG_SMP */ -+static inline int select_task_rq(struct task_struct *p) -+{ -+ return 0; -+} -+#endif /* CONFIG_SMP */ -+ -+static void 
-+ttwu_stat(struct task_struct *p, int cpu, int wake_flags) -+{ -+ struct rq *rq; -+ -+ if (!schedstat_enabled()) -+ return; -+ -+ rq= this_rq(); -+ -+#ifdef CONFIG_SMP -+ if (cpu == rq->cpu) -+ __schedstat_inc(rq->ttwu_local); -+ else { -+ /** PDS ToDo: -+ * How to do ttwu_wake_remote -+ */ -+ } -+#endif /* CONFIG_SMP */ -+ -+ __schedstat_inc(rq->ttwu_count); -+} -+ -+/* -+ * Mark the task runnable and perform wakeup-preemption. -+ */ -+static inline void -+ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) -+{ -+ p->state = TASK_RUNNING; -+ trace_sched_wakeup(p); -+} -+ -+static inline void -+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) -+{ -+#ifdef CONFIG_SMP -+ if (p->sched_contributes_to_load) -+ rq->nr_uninterruptible--; -+#endif -+ -+ activate_task(p, rq); -+ ttwu_do_wakeup(rq, p, 0); -+} -+ -+static int ttwu_remote(struct task_struct *p, int wake_flags) -+{ -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ int ret = 0; -+ -+ rq = __task_access_lock(p, &lock); -+ if (task_on_rq_queued(p)) { -+ ttwu_do_wakeup(rq, p, wake_flags); -+ ret = 1; -+ } -+ __task_access_unlock(p, lock); -+ -+ return ret; -+} -+ -+/* -+ * Notes on Program-Order guarantees on SMP systems. -+ * -+ * MIGRATION -+ * -+ * The basic program-order guarantee on SMP systems is that when a task [t] -+ * migrates, all its activity on its old CPU [c0] happens-before any subsequent -+ * execution on its new CPU [c1]. -+ * -+ * For migration (of runnable tasks) this is provided by the following means: -+ * -+ * A) UNLOCK of the rq(c0)->lock scheduling out task t -+ * B) migration for t is required to synchronize *both* rq(c0)->lock and -+ * rq(c1)->lock (if not at the same time, then in that order). -+ * C) LOCK of the rq(c1)->lock scheduling in task -+ * -+ * Transitivity guarantees that B happens after A and C after B. -+ * Note: we only require RCpc transitivity. 
-+ * Note: the CPU doing B need not be c0 or c1 -+ * -+ * Example: -+ * -+ * CPU0 CPU1 CPU2 -+ * -+ * LOCK rq(0)->lock -+ * sched-out X -+ * sched-in Y -+ * UNLOCK rq(0)->lock -+ * -+ * LOCK rq(0)->lock // orders against CPU0 -+ * dequeue X -+ * UNLOCK rq(0)->lock -+ * -+ * LOCK rq(1)->lock -+ * enqueue X -+ * UNLOCK rq(1)->lock -+ * -+ * LOCK rq(1)->lock // orders against CPU2 -+ * sched-out Z -+ * sched-in X -+ * UNLOCK rq(1)->lock -+ * -+ * -+ * BLOCKING -- aka. SLEEP + WAKEUP -+ * -+ * For blocking we (obviously) need to provide the same guarantee as for -+ * migration. However the means are completely different as there is no lock -+ * chain to provide order. Instead we do: -+ * -+ * 1) smp_store_release(X->on_cpu, 0) -+ * 2) smp_cond_load_acquire(!X->on_cpu) -+ * -+ * Example: -+ * -+ * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule) -+ * -+ * LOCK rq(0)->lock LOCK X->pi_lock -+ * dequeue X -+ * sched-out X -+ * smp_store_release(X->on_cpu, 0); -+ * -+ * smp_cond_load_acquire(&X->on_cpu, !VAL); -+ * X->state = WAKING -+ * set_task_cpu(X,2) -+ * -+ * LOCK rq(2)->lock -+ * enqueue X -+ * X->state = RUNNING -+ * UNLOCK rq(2)->lock -+ * -+ * LOCK rq(2)->lock // orders against CPU1 -+ * sched-out Z -+ * sched-in X -+ * UNLOCK rq(2)->lock -+ * -+ * UNLOCK X->pi_lock -+ * UNLOCK rq(0)->lock -+ * -+ * -+ * However; for wakeups there is a second guarantee we must provide, namely we -+ * must observe the state that lead to our wakeup. That is, not only must our -+ * task observe its own prior state, it must also observe the stores prior to -+ * its wakeup. -+ * -+ * This means that any means of doing remote wakeups must order the CPU doing -+ * the wakeup against the CPU the task is going to end up running on. This, -+ * however, is already required for the regular Program-Order guarantee above, -+ * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire). 
-+ * -+ */ -+ -+/*** -+ * try_to_wake_up - wake up a thread -+ * @p: the thread to be awakened -+ * @state: the mask of task states that can be woken -+ * @wake_flags: wake modifier flags (WF_*) -+ * -+ * Put it on the run-queue if it's not already there. The "current" -+ * thread is always on the run-queue (except when the actual -+ * re-schedule is in progress), and as such you're allowed to do -+ * the simpler "current->state = TASK_RUNNING" to mark yourself -+ * runnable without the overhead of this. -+ * -+ * Return: %true if @p was woken up, %false if it was already running. -+ * or @state didn't match @p's state. -+ */ -+static int try_to_wake_up(struct task_struct *p, unsigned int state, -+ int wake_flags) -+{ -+ unsigned long flags; -+ struct rq *rq; -+ int cpu, success = 0; -+ -+ /* -+ * If we are going to wake up a thread waiting for CONDITION we -+ * need to ensure that CONDITION=1 done by the caller can not be -+ * reordered with p->state check below. This pairs with mb() in -+ * set_current_state() the waiting thread does. -+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ smp_mb__after_spinlock(); -+ if (!(p->state & state)) -+ goto out; -+ -+ trace_sched_waking(p); -+ -+ /* We're going to change ->state: */ -+ success = 1; -+ cpu = task_cpu(p); -+ -+ /* -+ * Ensure we load p->on_rq _after_ p->state, otherwise it would -+ * be possible to, falsely, observe p->on_rq == 0 and get stuck -+ * in smp_cond_load_acquire() below. -+ * -+ * sched_ttwu_pending() try_to_wake_up() -+ * STORE p->on_rq = 1 LOAD p->state -+ * UNLOCK rq->lock -+ * -+ * __schedule() (switch to task 'p') -+ * LOCK rq->lock smp_rmb(); -+ * smp_mb__after_spinlock(); -+ * UNLOCK rq->lock -+ * -+ * [task p] -+ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq -+ * -+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in -+ * __schedule(). See the comment for smp_mb__after_spinlock(). 
-+ */ -+ smp_rmb(); -+ if (p->on_rq && ttwu_remote(p, wake_flags)) -+ goto stat; -+ -+#ifdef CONFIG_SMP -+ /* -+ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be -+ * possible to, falsely, observe p->on_cpu == 0. -+ * -+ * One must be running (->on_cpu == 1) in order to remove oneself -+ * from the runqueue. -+ * -+ * __schedule() (switch to task 'p') try_to_wake_up() -+ * STORE p->on_cpu = 1 LOAD p->on_rq -+ * UNLOCK rq->lock -+ * -+ * __schedule() (put 'p' to sleep) -+ * LOCK rq->lock smp_rmb(); -+ * smp_mb__after_spinlock(); -+ * STORE p->on_rq = 0 LOAD p->on_cpu -+ * -+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in -+ * __schedule(). See the comment for smp_mb__after_spinlock(). -+ */ -+ smp_rmb(); -+ -+ /* -+ * If the owning (remote) CPU is still in the middle of schedule() with -+ * this task as prev, wait until its done referencing the task. -+ * -+ * Pairs with the smp_store_release() in finish_task(). -+ * -+ * This ensures that tasks getting woken will be fully ordered against -+ * their previous state and preserve Program Order. 
-+ */ -+ smp_cond_load_acquire(&p->on_cpu, !VAL); -+ -+ p->sched_contributes_to_load = !!task_contributes_to_load(p); -+ p->state = TASK_WAKING; -+ -+ if (p->in_iowait) { -+ delayacct_blkio_end(p); -+ atomic_dec(&task_rq(p)->nr_iowait); -+ } -+ -+ if (SCHED_ISO == p->policy && ISO_PRIO != p->prio) { -+ p->prio = ISO_PRIO; -+ p->deadline = 0UL; -+ update_task_priodl(p); -+ } -+ -+ cpu = select_task_rq(p); -+ -+ if (cpu != task_cpu(p)) { -+ wake_flags |= WF_MIGRATED; -+ psi_ttwu_dequeue(p); -+ set_task_cpu(p, cpu); -+ } -+#else /* CONFIG_SMP */ -+ if (p->in_iowait) { -+ delayacct_blkio_end(p); -+ atomic_dec(&task_rq(p)->nr_iowait); -+ } -+#endif -+ -+ rq = cpu_rq(cpu); -+ raw_spin_lock(&rq->lock); -+ -+ update_rq_clock(rq); -+ ttwu_do_activate(rq, p, wake_flags); -+ check_preempt_curr(rq, p); -+ -+ raw_spin_unlock(&rq->lock); -+ -+stat: -+ ttwu_stat(p, cpu, wake_flags); -+out: -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+ return success; -+} -+ -+/** -+ * wake_up_process - Wake up a specific process -+ * @p: The process to be woken up. -+ * -+ * Attempt to wake up the nominated process and move it to the set of runnable -+ * processes. -+ * -+ * Return: 1 if the process was woken up, 0 if it was already running. -+ * -+ * This function executes a full memory barrier before accessing the task state. -+ */ -+int wake_up_process(struct task_struct *p) -+{ -+ return try_to_wake_up(p, TASK_NORMAL, 0); -+} -+EXPORT_SYMBOL(wake_up_process); -+ -+int wake_up_state(struct task_struct *p, unsigned int state) -+{ -+ return try_to_wake_up(p, state, 0); -+} -+ -+/* -+ * Perform scheduler related setup for a newly forked process p. -+ * p is forked by current. 
-+ */ -+int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) -+{ -+ unsigned long flags; -+ int cpu = get_cpu(); -+ struct rq *rq = this_rq(); -+ -+#ifdef CONFIG_PREEMPT_NOTIFIERS -+ INIT_HLIST_HEAD(&p->preempt_notifiers); -+#endif -+ /* Should be reset in fork.c but done here for ease of PDS patching */ -+ p->on_cpu = -+ p->on_rq = -+ p->utime = -+ p->stime = -+ p->sched_time = 0; -+ -+ p->sl_level = pds_skiplist_random_level(p); -+ INIT_SKIPLIST_NODE(&p->sl_node); -+ -+#ifdef CONFIG_COMPACTION -+ p->capture_control = NULL; -+#endif -+ -+ /* -+ * We mark the process as NEW here. This guarantees that -+ * nobody will actually run it, and a signal or other external -+ * event cannot wake it up and insert it on the runqueue either. -+ */ -+ p->state = TASK_NEW; -+ -+ /* -+ * Make sure we do not leak PI boosting priority to the child. -+ */ -+ p->prio = current->normal_prio; -+ -+ /* -+ * Revert to default priority/policy on fork if requested. -+ */ -+ if (unlikely(p->sched_reset_on_fork)) { -+ if (task_has_rt_policy(p)) { -+ p->policy = SCHED_NORMAL; -+ p->static_prio = NICE_TO_PRIO(0); -+ p->rt_priority = 0; -+ } else if (PRIO_TO_NICE(p->static_prio) < 0) -+ p->static_prio = NICE_TO_PRIO(0); -+ -+ p->prio = p->normal_prio = normal_prio(p); -+ -+ /* -+ * We don't need the reset flag anymore after the fork. It has -+ * fulfilled its duty: -+ */ -+ p->sched_reset_on_fork = 0; -+ } -+ -+ /* -+ * Share the timeslice between parent and child, thus the -+ * total amount of pending timeslices in the system doesn't change, -+ * resulting in more scheduling fairness. 
-+ */ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ rq->curr->time_slice /= 2; -+ p->time_slice = rq->curr->time_slice; -+#ifdef CONFIG_SCHED_HRTICK -+ hrtick_start(rq, US_TO_NS(rq->curr->time_slice)); -+#endif -+ -+ if (p->time_slice < RESCHED_US) { -+ update_rq_clock(rq); -+ time_slice_expired(p, rq); -+ resched_curr(rq); -+ } else -+ update_task_priodl(p); -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+ /* -+ * The child is not yet in the pid-hash so no cgroup attach races, -+ * and the cgroup is pinned to this child due to cgroup_fork() -+ * is ran before sched_fork(). -+ * -+ * Silence PROVE_RCU. -+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ /* -+ * We're setting the CPU for the first time, we don't migrate, -+ * so use __set_task_cpu(). -+ */ -+ __set_task_cpu(p, cpu); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+#ifdef CONFIG_SCHED_INFO -+ if (unlikely(sched_info_on())) -+ memset(&p->sched_info, 0, sizeof(p->sched_info)); -+#endif -+ init_task_preempt_count(p); -+ -+ put_cpu(); -+ return 0; -+} -+ -+#ifdef CONFIG_SCHEDSTATS -+ -+DEFINE_STATIC_KEY_FALSE(sched_schedstats); -+static bool __initdata __sched_schedstats = false; -+ -+static void set_schedstats(bool enabled) -+{ -+ if (enabled) -+ static_branch_enable(&sched_schedstats); -+ else -+ static_branch_disable(&sched_schedstats); -+} -+ -+void force_schedstat_enabled(void) -+{ -+ if (!schedstat_enabled()) { -+ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); -+ static_branch_enable(&sched_schedstats); -+ } -+} -+ -+static int __init setup_schedstats(char *str) -+{ -+ int ret = 0; -+ if (!str) -+ goto out; -+ -+ /* -+ * This code is called before jump labels have been set up, so we can't -+ * change the static branch directly just yet. Instead set a temporary -+ * variable so init_schedstats() can do it later. 
-+ */ -+ if (!strcmp(str, "enable")) { -+ __sched_schedstats = true; -+ ret = 1; -+ } else if (!strcmp(str, "disable")) { -+ __sched_schedstats = false; -+ ret = 1; -+ } -+out: -+ if (!ret) -+ pr_warn("Unable to parse schedstats=\n"); -+ -+ return ret; -+} -+__setup("schedstats=", setup_schedstats); -+ -+static void __init init_schedstats(void) -+{ -+ set_schedstats(__sched_schedstats); -+} -+ -+#ifdef CONFIG_PROC_SYSCTL -+int sysctl_schedstats(struct ctl_table *table, int write, -+ void __user *buffer, size_t *lenp, loff_t *ppos) -+{ -+ struct ctl_table t; -+ int err; -+ int state = static_branch_likely(&sched_schedstats); -+ -+ if (write && !capable(CAP_SYS_ADMIN)) -+ return -EPERM; -+ -+ t = *table; -+ t.data = &state; -+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); -+ if (err < 0) -+ return err; -+ if (write) -+ set_schedstats(state); -+ return err; -+} -+#endif /* CONFIG_PROC_SYSCTL */ -+#else /* !CONFIG_SCHEDSTATS */ -+static inline void init_schedstats(void) {} -+#endif /* CONFIG_SCHEDSTATS */ -+ -+/* -+ * wake_up_new_task - wake up a newly created task for the first time. -+ * -+ * This function will do some initial scheduler statistics housekeeping -+ * that must be done for every newly created context, then puts the task -+ * on the runqueue and wakes it. -+ */ -+void wake_up_new_task(struct task_struct *p) -+{ -+ unsigned long flags; -+ struct rq *rq; -+ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ -+ p->state = TASK_RUNNING; -+ -+ rq = cpu_rq(select_task_rq(p)); -+#ifdef CONFIG_SMP -+ /* -+ * Fork balancing, do it here and not earlier because: -+ * - cpus_mask can change in the fork path -+ * - any previously selected CPU might disappear through hotplug -+ * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, -+ * as we're not fully set-up yet. 
-+ */ -+ __set_task_cpu(p, cpu_of(rq)); -+#endif -+ -+ raw_spin_lock(&rq->lock); -+ -+ update_rq_clock(rq); -+ activate_task(p, rq); -+ trace_sched_wakeup_new(p); -+ check_preempt_curr(rq, p); -+ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+} -+ -+#ifdef CONFIG_PREEMPT_NOTIFIERS -+ -+static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); -+ -+void preempt_notifier_inc(void) -+{ -+ static_branch_inc(&preempt_notifier_key); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_inc); -+ -+void preempt_notifier_dec(void) -+{ -+ static_branch_dec(&preempt_notifier_key); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_dec); -+ -+/** -+ * preempt_notifier_register - tell me when current is being preempted & rescheduled -+ * @notifier: notifier struct to register -+ */ -+void preempt_notifier_register(struct preempt_notifier *notifier) -+{ -+ if (!static_branch_unlikely(&preempt_notifier_key)) -+ WARN(1, "registering preempt_notifier while notifiers disabled\n"); -+ -+ hlist_add_head(&notifier->link, &current->preempt_notifiers); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_register); -+ -+/** -+ * preempt_notifier_unregister - no longer interested in preemption notifications -+ * @notifier: notifier struct to unregister -+ * -+ * This is *not* safe to call from within a preemption notifier.
-+ */ -+void preempt_notifier_unregister(struct preempt_notifier *notifier) -+{ -+ hlist_del(&notifier->link); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_unregister); -+ -+static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+ struct preempt_notifier *notifier; -+ -+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) -+ notifier->ops->sched_in(notifier, raw_smp_processor_id()); -+} -+ -+static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+ if (static_branch_unlikely(&preempt_notifier_key)) -+ __fire_sched_in_preempt_notifiers(curr); -+} -+ -+static void -+__fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+ struct preempt_notifier *notifier; -+ -+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) -+ notifier->ops->sched_out(notifier, next); -+} -+ -+static __always_inline void -+fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+ if (static_branch_unlikely(&preempt_notifier_key)) -+ __fire_sched_out_preempt_notifiers(curr, next); -+} -+ -+#else /* !CONFIG_PREEMPT_NOTIFIERS */ -+ -+static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+} -+ -+static inline void -+fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+} -+ -+#endif /* CONFIG_PREEMPT_NOTIFIERS */ -+ -+static inline void prepare_task(struct task_struct *next) -+{ -+ /* -+ * Claim the task as running, we do this before switching to it -+ * such that any running task will have this set. -+ */ -+ next->on_cpu = 1; -+} -+ -+static inline void finish_task(struct task_struct *prev) -+{ -+#ifdef CONFIG_SMP -+ /* -+ * After ->on_cpu is cleared, the task can be moved to a different CPU. -+ * We must ensure this doesn't happen until the switch is completely -+ * finished. -+ * -+ * In particular, the load of prev->state in finish_task_switch() must -+ * happen before this.
-+ * -+ * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). -+ */ -+ smp_store_release(&prev->on_cpu, 0); -+#else -+ prev->on_cpu = 0; -+#endif -+} -+ -+static inline void -+prepare_lock_switch(struct rq *rq, struct task_struct *next) -+{ -+ /* -+ * Since the runqueue lock will be released by the next -+ * task (which is an invalid locking op but in the case -+ * of the scheduler it's an obvious special-case), so we -+ * do an early lockdep release here: -+ */ -+ spin_release(&rq->lock.dep_map, 1, _THIS_IP_); -+#ifdef CONFIG_DEBUG_SPINLOCK -+ /* this is a valid case when another task releases the spinlock */ -+ rq->lock.owner = next; -+#endif -+} -+ -+static inline void finish_lock_switch(struct rq *rq) -+{ -+ /* -+ * If we are tracking spinlock dependencies then we have to -+ * fix up the runqueue lock - which gets 'carried over' from -+ * prev into current: -+ */ -+ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); -+ raw_spin_unlock_irq(&rq->lock); -+} -+ -+/** -+ * prepare_task_switch - prepare to switch tasks -+ * @rq: the runqueue preparing to switch -+ * @next: the task we are going to switch to. -+ * -+ * This is called with the rq lock held and interrupts off. It must -+ * be paired with a subsequent finish_task_switch after the context -+ * switch. -+ * -+ * prepare_task_switch sets up locking and calls architecture specific -+ * hooks. -+ */ -+static inline void -+prepare_task_switch(struct rq *rq, struct task_struct *prev, -+ struct task_struct *next) -+{ -+ kcov_prepare_switch(prev); -+ sched_info_switch(rq, prev, next); -+ perf_event_task_sched_out(prev, next); -+ rseq_preempt(prev); -+ fire_sched_out_preempt_notifiers(prev, next); -+ prepare_task(next); -+ prepare_arch_switch(next); -+} -+ -+/** -+ * finish_task_switch - clean up after a task-switch -+ * @rq: runqueue associated with task-switch -+ * @prev: the thread we just switched away from. 
-+ * -+ * finish_task_switch must be called after the context switch, paired -+ * with a prepare_task_switch call before the context switch. -+ * finish_task_switch will reconcile locking set up by prepare_task_switch, -+ * and do any other architecture-specific cleanup actions. -+ * -+ * Note that we may have delayed dropping an mm in context_switch(). If -+ * so, we finish that here outside of the runqueue lock. (Doing it -+ * with the lock held can cause deadlocks; see schedule() for -+ * details.) -+ * -+ * The context switch have flipped the stack from under us and restored the -+ * local variables which were saved when this task called schedule() in the -+ * past. prev == current is still correct but we need to recalculate this_rq -+ * because prev may have moved to another CPU. -+ */ -+static struct rq *finish_task_switch(struct task_struct *prev) -+ __releases(rq->lock) -+{ -+ struct rq *rq = this_rq(); -+ struct mm_struct *mm = rq->prev_mm; -+ long prev_state; -+ -+ /* -+ * The previous task will have left us with a preempt_count of 2 -+ * because it left us after: -+ * -+ * schedule() -+ * preempt_disable(); // 1 -+ * __schedule() -+ * raw_spin_lock_irq(&rq->lock) // 2 -+ * -+ * Also, see FORK_PREEMPT_COUNT. -+ */ -+ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, -+ "corrupted preempt_count: %s/%d/0x%x\n", -+ current->comm, current->pid, preempt_count())) -+ preempt_count_set(FORK_PREEMPT_COUNT); -+ -+ rq->prev_mm = NULL; -+ -+ /* -+ * A task struct has one reference for the use as "current". -+ * If a task dies, then it sets TASK_DEAD in tsk->state and calls -+ * schedule one last time. The schedule call will never return, and -+ * the scheduled task must drop that reference. -+ * -+ * We must observe prev->state before clearing prev->on_cpu (in -+ * finish_task), otherwise a concurrent wakeup can get prev -+ * running on another CPU and we could rave with its RUNNING -> DEAD -+ * transition, resulting in a double drop. 
-+ */ -+ prev_state = prev->state; -+ vtime_task_switch(prev); -+ perf_event_task_sched_in(prev, current); -+ finish_task(prev); -+ finish_lock_switch(rq); -+ finish_arch_post_lock_switch(); -+ kcov_finish_switch(current); -+ -+ fire_sched_in_preempt_notifiers(current); -+ /* -+ * When switching through a kernel thread, the loop in -+ * membarrier_{private,global}_expedited() may have observed that -+ * kernel thread and not issued an IPI. It is therefore possible to -+ * schedule between user->kernel->user threads without passing though -+ * switch_mm(). Membarrier requires a barrier after storing to -+ * rq->curr, before returning to userspace, so provide them here: -+ * -+ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly -+ * provided by mmdrop(), -+ * - a sync_core for SYNC_CORE. -+ */ -+ if (mm) { -+ membarrier_mm_sync_core_before_usermode(mm); -+ mmdrop(mm); -+ } -+ if (unlikely(prev_state == TASK_DEAD)) { -+ /* -+ * Remove function-return probe instances associated with this -+ * task and put them back on the free list. -+ */ -+ kprobe_flush_task(prev); -+ -+ /* Task is done with its stack. */ -+ put_task_stack(prev); -+ -+ put_task_struct_rcu_user(prev); -+ } -+ -+ tick_nohz_task_switch(); -+ return rq; -+} -+ -+/** -+ * schedule_tail - first thing a freshly forked thread must call. -+ * @prev: the thread we just switched away from. -+ */ -+asmlinkage __visible void schedule_tail(struct task_struct *prev) -+ __releases(rq->lock) -+{ -+ struct rq *rq; -+ -+ /* -+ * New tasks start with FORK_PREEMPT_COUNT, see there and -+ * finish_task_switch() for details. -+ * -+ * finish_task_switch() will drop rq->lock() and lower preempt_count -+ * and the preempt_enable() will end up enabling preemption (on -+ * PREEMPT_COUNT kernels). 
-+ */ -+ -+ rq = finish_task_switch(prev); -+ preempt_enable(); -+ -+ if (current->set_child_tid) -+ put_user(task_pid_vnr(current), current->set_child_tid); -+ -+ calculate_sigpending(); -+} -+ -+/* -+ * context_switch - switch to the new MM and the new thread's register state. -+ */ -+static __always_inline struct rq * -+context_switch(struct rq *rq, struct task_struct *prev, -+ struct task_struct *next) -+{ -+ prepare_task_switch(rq, prev, next); -+ -+ /* -+ * For paravirt, this is coupled with an exit in switch_to to -+ * combine the page table reload and the switch backend into -+ * one hypercall. -+ */ -+ arch_start_context_switch(prev); -+ -+ /* -+ * kernel -> kernel lazy + transfer active -+ * user -> kernel lazy + mmgrab() active -+ * -+ * kernel -> user switch + mmdrop() active -+ * user -> user switch -+ */ -+ if (!next->mm) { // to kernel -+ enter_lazy_tlb(prev->active_mm, next); -+ -+ next->active_mm = prev->active_mm; -+ if (prev->mm) // from user -+ mmgrab(prev->active_mm); -+ else -+ prev->active_mm = NULL; -+ } else { // to user -+ membarrier_switch_mm(rq, prev->active_mm, next->mm); -+ /* -+ * sys_membarrier() requires an smp_mb() between setting -+ * rq->curr / membarrier_switch_mm() and returning to userspace. -+ * -+ * The below provides this either through switch_mm(), or in -+ * case 'prev->active_mm == next->mm' through -+ * finish_task_switch()'s mmdrop(). -+ */ -+ switch_mm_irqs_off(prev->active_mm, next->mm, next); -+ -+ if (!prev->mm) { // from kernel -+ /* will mmdrop() in finish_task_switch(). */ -+ rq->prev_mm = prev->active_mm; -+ prev->active_mm = NULL; -+ } -+ } -+ -+ prepare_lock_switch(rq, next); -+ -+ /* Here we just switch the register state and the stack. 
*/ -+ switch_to(prev, next, prev); -+ barrier(); -+ -+ return finish_task_switch(prev); -+} -+ -+/* -+ * nr_running, nr_uninterruptible and nr_context_switches: -+ * -+ * externally visible scheduler statistics: current number of runnable -+ * threads, total number of context switches performed since bootup. -+ */ -+unsigned long nr_running(void) -+{ -+ unsigned long i, sum = 0; -+ -+ for_each_online_cpu(i) -+ sum += cpu_rq(i)->nr_running; -+ -+ return sum; -+} -+ -+/* -+ * Check if only the current task is running on the CPU. -+ * -+ * Caution: this function does not check that the caller has disabled -+ * preemption, thus the result might have a time-of-check-to-time-of-use -+ * race. The caller is responsible to use it correctly, for example: -+ * -+ * - from a non-preemptible section (of course) -+ * -+ * - from a thread that is bound to a single CPU -+ * -+ * - in a loop with very short iterations (e.g. a polling loop) -+ */ -+bool single_task_running(void) -+{ -+ return raw_rq()->nr_running == 1; -+} -+EXPORT_SYMBOL(single_task_running); -+ -+unsigned long long nr_context_switches(void) -+{ -+ int i; -+ unsigned long long sum = 0; -+ -+ for_each_possible_cpu(i) -+ sum += cpu_rq(i)->nr_switches; -+ -+ return sum; -+} -+ -+/* -+ * Consumers of these two interfaces, like for example the cpuidle menu -+ * governor, are using nonsensical data. Preferring shallow idle state selection -+ * for a CPU that has IO-wait which might not even end up running the task when -+ * it does become runnable. -+ */ -+ -+unsigned long nr_iowait_cpu(int cpu) -+{ -+ return atomic_read(&cpu_rq(cpu)->nr_iowait); -+} -+ -+/* -+ * IO-wait accounting, and how its mostly bollocks (on SMP). -+ * -+ * The idea behind IO-wait account is to account the idle time that we could -+ * have spend running if it were not for IO. That is, if we were to improve the -+ * storage performance, we'd have a proportional reduction in IO-wait time. 
-+ * -+ * This all works nicely on UP, where, when a task blocks on IO, we account -+ * idle time as IO-wait, because if the storage were faster, it could've been -+ * running and we'd not be idle. -+ * -+ * This has been extended to SMP, by doing the same for each CPU. This however -+ * is broken. -+ * -+ * Imagine for instance the case where two tasks block on one CPU, only the one -+ * CPU will have IO-wait accounted, while the other has regular idle. Even -+ * though, if the storage were faster, both could've ran at the same time, -+ * utilising both CPUs. -+ * -+ * This means, that when looking globally, the current IO-wait accounting on -+ * SMP is a lower bound, by reason of under accounting. -+ * -+ * Worse, since the numbers are provided per CPU, they are sometimes -+ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly -+ * associated with any one particular CPU, it can wake to another CPU than it -+ * blocked on. This means the per CPU IO-wait number is meaningless. -+ * -+ * Task CPU affinities can make all that even more 'interesting'. -+ */ -+ -+unsigned long nr_iowait(void) -+{ -+ unsigned long i, sum = 0; -+ -+ for_each_possible_cpu(i) -+ sum += nr_iowait_cpu(i); -+ -+ return sum; -+} -+ -+DEFINE_PER_CPU(struct kernel_stat, kstat); -+DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); -+ -+EXPORT_PER_CPU_SYMBOL(kstat); -+EXPORT_PER_CPU_SYMBOL(kernel_cpustat); -+ -+static inline void pds_update_curr(struct rq *rq, struct task_struct *p) -+{ -+ s64 ns = rq->clock_task - p->last_ran; -+ -+ p->sched_time += ns; -+ account_group_exec_runtime(p, ns); -+ -+ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ -+ p->time_slice -= NS_TO_US(ns); -+ p->last_ran = rq->clock_task; -+} -+ -+/* -+ * Return accounted runtime for the task. -+ * Return separately the current's pending runtime that have not been -+ * accounted yet. 
-+ */ -+unsigned long long task_sched_runtime(struct task_struct *p) -+{ -+ unsigned long flags; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ u64 ns; -+ -+#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) -+ /* -+ * 64-bit doesn't need locks to atomically read a 64-bit value. -+ * So we have a optimization chance when the task's delta_exec is 0. -+ * Reading ->on_cpu is racy, but this is ok. -+ * -+ * If we race with it leaving CPU, we'll take a lock. So we're correct. -+ * If we race with it entering CPU, unaccounted time is 0. This is -+ * indistinguishable from the read occurring a few cycles earlier. -+ * If we see ->on_cpu without ->on_rq, the task is leaving, and has -+ * been accounted, so we're correct here as well. -+ */ -+ if (!p->on_cpu || !task_on_rq_queued(p)) -+ return tsk_seruntime(p); -+#endif -+ -+ rq = task_access_lock_irqsave(p, &lock, &flags); -+ /* -+ * Must be ->curr _and_ ->on_rq. If dequeued, we would -+ * project cycles that may never be accounted to this -+ * thread, breaking clock_gettime(). -+ */ -+ if (p == rq->curr && task_on_rq_queued(p)) { -+ update_rq_clock(rq); -+ pds_update_curr(rq, p); -+ } -+ ns = tsk_seruntime(p); -+ task_access_unlock_irqrestore(p, lock, &flags); -+ -+ return ns; -+} -+ -+/* This manages tasks that have run out of timeslice during a scheduler_tick */ -+static inline void pds_scheduler_task_tick(struct rq *rq) -+{ -+ struct task_struct *p = rq->curr; -+ -+ if (is_idle_task(p)) -+ return; -+ -+ pds_update_curr(rq, p); -+ -+ cpufreq_update_util(rq, 0); -+ -+ /* -+ * Tasks that were scheduled in the first half of a tick are not -+ * allowed to run into the 2nd half of the next tick if they will -+ * run out of time slice in the interim. Otherwise, if they have -+ * less than RESCHED_US μs of time slice left they will be rescheduled. -+ */ -+ if (p->time_slice - rq->dither >= RESCHED_US) -+ return; -+ -+ /** -+ * p->time_slice < RESCHED_US. 
We will modify task_struct under -+ * rq lock as p is rq->curr -+ */ -+ __set_tsk_resched(p); -+} -+ -+#ifdef CONFIG_SMP -+ -+#ifdef CONFIG_SCHED_SMT -+static int active_load_balance_cpu_stop(void *data) -+{ -+ struct rq *rq = this_rq(); -+ struct task_struct *p = data; -+ int cpu; -+ unsigned long flags; -+ -+ local_irq_save(flags); -+ -+ raw_spin_lock(&p->pi_lock); -+ raw_spin_lock(&rq->lock); -+ -+ rq->active_balance = 0; -+ /* -+ * _something_ may have changed the task, double check again -+ */ -+ if (task_on_rq_queued(p) && task_rq(p) == rq && -+ (cpu = cpumask_any_and(&p->cpus_mask, &sched_cpu_sg_idle_mask)) < nr_cpu_ids) -+ rq = __migrate_task(rq, p, cpu); -+ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock(&p->pi_lock); -+ -+ local_irq_restore(flags); -+ -+ return 0; -+} -+ -+/* pds_sg_balance_trigger - trigger slibing group balance for @cpu */ -+static void pds_sg_balance_trigger(const int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ struct task_struct *curr; -+ -+ if (!raw_spin_trylock_irqsave(&rq->lock, flags)) -+ return; -+ curr = rq->curr; -+ if (!is_idle_task(curr) && -+ cpumask_intersects(&curr->cpus_mask, &sched_cpu_sg_idle_mask)) { -+ int active_balance = 0; -+ -+ if (likely(!rq->active_balance)) { -+ rq->active_balance = 1; -+ active_balance = 1; -+ } -+ -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+ if (likely(active_balance)) -+ stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, -+ curr, &rq->active_balance_work); -+ } else -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+} -+ -+/* -+ * pds_sg_balance_check - slibing group balance check for run queue @rq -+ */ -+static inline void pds_sg_balance_check(const struct rq *rq) -+{ -+ cpumask_t chk; -+ int i; -+ -+ /* Only online cpu will do sg balance checking */ -+ if (unlikely(!rq->online)) -+ return; -+ -+ /* Only cpu in slibing idle group will do the checking */ -+ if (!cpumask_test_cpu(cpu_of(rq), &sched_cpu_sg_idle_mask)) -+ return; -+ -+ /* Find potential 
cpus which can migrate the currently running task */ -+ if (!cpumask_andnot(&chk, &sched_rq_pending_masks[SCHED_RQ_EMPTY], -+ &sched_rq_queued_masks[SCHED_RQ_EMPTY])) -+ return; -+ -+ for_each_cpu(i, &chk) { -+ /* skip the cpu which has idle slibing cpu */ -+ if (cpumask_test_cpu(per_cpu(sched_sibling_cpu, i), -+ &sched_rq_queued_masks[SCHED_RQ_EMPTY])) -+ continue; -+ pds_sg_balance_trigger(i); -+ } -+} -+#endif /* CONFIG_SCHED_SMT */ -+#endif /* CONFIG_SMP */ -+ -+/* -+ * This function gets called by the timer code, with HZ frequency. -+ * We call it with interrupts disabled. -+ */ -+void scheduler_tick(void) -+{ -+ int cpu __maybe_unused = smp_processor_id(); -+ struct rq *rq = cpu_rq(cpu); -+ -+ sched_clock_tick(); -+ -+ raw_spin_lock(&rq->lock); -+ update_rq_clock(rq); -+ -+ pds_scheduler_task_tick(rq); -+ update_sched_rq_queued_masks_normal(rq); -+ calc_global_load_tick(rq); -+ psi_task_tick(rq); -+ -+ rq->last_tick = rq->clock; -+ raw_spin_unlock(&rq->lock); -+ -+ perf_event_task_tick(); -+} -+ -+#ifdef CONFIG_NO_HZ_FULL -+struct tick_work { -+ int cpu; -+ atomic_t state; -+ struct delayed_work work; -+}; -+/* Values for ->state, see diagram below. */ -+#define TICK_SCHED_REMOTE_OFFLINE 0 -+#define TICK_SCHED_REMOTE_OFFLINING 1 -+#define TICK_SCHED_REMOTE_RUNNING 2 -+ -+/* -+ * State diagram for ->state: -+ * -+ * -+ * TICK_SCHED_REMOTE_OFFLINE -+ * | ^ -+ * | | -+ * | | sched_tick_remote() -+ * | | -+ * | | -+ * +--TICK_SCHED_REMOTE_OFFLINING -+ * | ^ -+ * | | -+ * sched_tick_start() | | sched_tick_stop() -+ * | | -+ * V | -+ * TICK_SCHED_REMOTE_RUNNING -+ * -+ * -+ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() -+ * and sched_tick_start() are happy to leave the state in RUNNING. 
-+ */ -+ -+static struct tick_work __percpu *tick_work_cpu; -+ -+static void sched_tick_remote(struct work_struct *work) -+{ -+ struct delayed_work *dwork = to_delayed_work(work); -+ struct tick_work *twork = container_of(dwork, struct tick_work, work); -+ int cpu = twork->cpu; -+ struct rq *rq = cpu_rq(cpu); -+ struct task_struct *curr; -+ unsigned long flags; -+ u64 delta; -+ int os; -+ -+ /* -+ * Handle the tick only if it appears the remote CPU is running in full -+ * dynticks mode. The check is racy by nature, but missing a tick or -+ * having one too much is no big deal because the scheduler tick updates -+ * statistics and checks timeslices in a time-independent way, regardless -+ * of when exactly it is running. -+ */ -+ if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu)) -+ goto out_requeue; -+ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ curr = rq->curr; -+ -+ if (is_idle_task(curr) || cpu_is_offline(cpu)) -+ goto out_unlock; -+ -+ update_rq_clock(rq); -+ delta = rq_clock_task(rq) - curr->last_ran; -+ -+ /* -+ * Make sure the next tick runs within a reasonable -+ * amount of time. -+ */ -+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); -+ pds_scheduler_task_tick(rq); -+ update_sched_rq_queued_masks_normal(rq); -+ -+out_unlock: -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+out_requeue: -+ /* -+ * Run the remote tick once per second (1Hz). This arbitrary -+ * frequency is large enough to avoid overload but short enough -+ * to keep scheduler internal stats reasonably up to date. But -+ * first update state to reflect hotplug activity if required. 
-+ */ -+ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); -+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); -+ if (os == TICK_SCHED_REMOTE_RUNNING) -+ queue_delayed_work(system_unbound_wq, dwork, HZ); -+} -+ -+static void sched_tick_start(int cpu) -+{ -+ int os; -+ struct tick_work *twork; -+ -+ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) -+ return; -+ -+ WARN_ON_ONCE(!tick_work_cpu); -+ -+ twork = per_cpu_ptr(tick_work_cpu, cpu); -+ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); -+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); -+ if (os == TICK_SCHED_REMOTE_OFFLINE) { -+ twork->cpu = cpu; -+ INIT_DELAYED_WORK(&twork->work, sched_tick_remote); -+ queue_delayed_work(system_unbound_wq, &twork->work, HZ); -+ } -+} -+ -+#ifdef CONFIG_HOTPLUG_CPU -+static void sched_tick_stop(int cpu) -+{ -+ struct tick_work *twork; -+ -+ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) -+ return; -+ -+ WARN_ON_ONCE(!tick_work_cpu); -+ -+ twork = per_cpu_ptr(tick_work_cpu, cpu); -+ cancel_delayed_work_sync(&twork->work); -+} -+#endif /* CONFIG_HOTPLUG_CPU */ -+ -+int __init sched_tick_offload_init(void) -+{ -+ tick_work_cpu = alloc_percpu(struct tick_work); -+ BUG_ON(!tick_work_cpu); -+ return 0; -+} -+ -+#else /* !CONFIG_NO_HZ_FULL */ -+static inline void sched_tick_start(int cpu) { } -+static inline void sched_tick_stop(int cpu) { } -+#endif -+ -+#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ -+ defined(CONFIG_PREEMPT_TRACER)) -+/* -+ * If the value passed in is equal to the current preempt count -+ * then we just disabled preemption. Start timing the latency. -+ */ -+static inline void preempt_latency_start(int val) -+{ -+ if (preempt_count() == val) { -+ unsigned long ip = get_lock_parent_ip(); -+#ifdef CONFIG_DEBUG_PREEMPT -+ current->preempt_disable_ip = ip; -+#endif -+ trace_preempt_off(CALLER_ADDR0, ip); -+ } -+} -+ -+void preempt_count_add(int val) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Underflow? 
-+ */ -+ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) -+ return; -+#endif -+ __preempt_count_add(val); -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Spinlock count overflowing soon? -+ */ -+ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= -+ PREEMPT_MASK - 10); -+#endif -+ preempt_latency_start(val); -+} -+EXPORT_SYMBOL(preempt_count_add); -+NOKPROBE_SYMBOL(preempt_count_add); -+ -+/* -+ * If the value passed in equals to the current preempt count -+ * then we just enabled preemption. Stop timing the latency. -+ */ -+static inline void preempt_latency_stop(int val) -+{ -+ if (preempt_count() == val) -+ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); -+} -+ -+void preempt_count_sub(int val) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Underflow? -+ */ -+ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) -+ return; -+ /* -+ * Is the spinlock portion underflowing? -+ */ -+ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && -+ !(preempt_count() & PREEMPT_MASK))) -+ return; -+#endif -+ -+ preempt_latency_stop(val); -+ __preempt_count_sub(val); -+} -+EXPORT_SYMBOL(preempt_count_sub); -+NOKPROBE_SYMBOL(preempt_count_sub); -+ -+#else -+static inline void preempt_latency_start(int val) { } -+static inline void preempt_latency_stop(int val) { } -+#endif -+ -+/* -+ * Timeslices below RESCHED_US are considered as good as expired as there's no -+ * point rescheduling when there's so little time left. SCHED_BATCH tasks -+ * have been flagged be not latency sensitive and likely to be fully CPU -+ * bound so every time they're rescheduled they have their time_slice -+ * refilled, but get a new later deadline to have little effect on -+ * SCHED_NORMAL tasks. 
-+ -+ */ -+static inline void check_deadline(struct task_struct *p, struct rq *rq) -+{ -+ if (rq->idle == p) -+ return; -+ -+ pds_update_curr(rq, p); -+ -+ if (p->time_slice < RESCHED_US) { -+ time_slice_expired(p, rq); -+ if (SCHED_ISO == p->policy && ISO_PRIO == p->prio) { -+ p->prio = NORMAL_PRIO; -+ p->deadline = rq->clock + task_deadline_diff(p); -+ update_task_priodl(p); -+ } -+ if (SCHED_FIFO != p->policy && task_on_rq_queued(p)) -+ requeue_task(p, rq); -+ } -+} -+ -+#ifdef CONFIG_SMP -+ -+#define SCHED_RQ_NR_MIGRATION (32UL) -+/* -+ * Migrate pending tasks in @rq to @dest_cpu -+ * Will try to migrate mininal of half of @rq nr_running tasks and -+ * SCHED_RQ_NR_MIGRATION to @dest_cpu -+ */ -+static inline int -+migrate_pending_tasks(struct rq *rq, struct rq *dest_rq, int filter_prio) -+{ -+ struct task_struct *p; -+ int dest_cpu = cpu_of(dest_rq); -+ int nr_migrated = 0; -+ int nr_tries = min((rq->nr_running + 1) / 2, SCHED_RQ_NR_MIGRATION); -+ struct skiplist_node *node = rq->sl_header.next[0]; -+ -+ while (nr_tries && node != &rq->sl_header) { -+ p = skiplist_entry(node, struct task_struct, sl_node); -+ node = node->next[0]; -+ -+ if (task_running(p)) -+ continue; -+ if (p->prio >= filter_prio) -+ break; -+ if (cpumask_test_cpu(dest_cpu, &p->cpus_mask)) { -+ detach_task(rq, p, dest_cpu); -+ attach_task(dest_rq, p); -+ nr_migrated++; -+ } -+ nr_tries--; -+ /* make a jump */ -+ if (node == &rq->sl_header) -+ break; -+ node = node->next[0]; -+ } -+ -+ return nr_migrated; -+} -+ -+static inline int -+take_queued_task_cpumask(struct rq *rq, cpumask_t *chk_mask, int filter_prio) -+{ -+ int src_cpu; -+ -+ for_each_cpu(src_cpu, chk_mask) { -+ int nr_migrated; -+ struct rq *src_rq = cpu_rq(src_cpu); -+ -+ if (!do_raw_spin_trylock(&src_rq->lock)) { -+ if (PRIO_LIMIT == filter_prio) -+ continue; -+ return 0; -+ } -+ spin_acquire(&src_rq->lock.dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); -+ -+ update_rq_clock(src_rq); -+ nr_migrated = migrate_pending_tasks(src_rq, rq, 
filter_prio); -+ -+ spin_release(&src_rq->lock.dep_map, 1, _RET_IP_); -+ do_raw_spin_unlock(&src_rq->lock); -+ -+ if (nr_migrated || PRIO_LIMIT != filter_prio) -+ return nr_migrated; -+ } -+ return 0; -+} -+ -+static inline int take_other_rq_task(struct rq *rq, int cpu, int filter_prio) -+{ -+ struct cpumask *affinity_mask, *end; -+ struct cpumask chk; -+ -+ if (PRIO_LIMIT == filter_prio) { -+ cpumask_complement(&chk, &sched_rq_pending_masks[SCHED_RQ_EMPTY]); -+#ifdef CONFIG_SMT_NICE -+ { -+ /* also try to take IDLE priority tasks from smt supressed cpu */ -+ struct cpumask t; -+ if (cpumask_and(&t, &sched_smt_supressed_mask, -+ &sched_rq_queued_masks[SCHED_RQ_IDLE])) -+ cpumask_or(&chk, &chk, &t); -+ } -+#endif -+ } else if (NORMAL_PRIO == filter_prio) { -+ cpumask_or(&chk, &sched_rq_pending_masks[SCHED_RQ_RT], -+ &sched_rq_pending_masks[SCHED_RQ_ISO]); -+ } else if (IDLE_PRIO == filter_prio) { -+ cpumask_complement(&chk, &sched_rq_pending_masks[SCHED_RQ_EMPTY]); -+ cpumask_andnot(&chk, &chk, &sched_rq_pending_masks[SCHED_RQ_IDLE]); -+ } else -+ cpumask_copy(&chk, &sched_rq_pending_masks[SCHED_RQ_RT]); -+ -+ if (cpumask_empty(&chk)) -+ return 0; -+ -+ affinity_mask = per_cpu(sched_cpu_llc_start_mask, cpu); -+ end = per_cpu(sched_cpu_affinity_chk_end_masks, cpu); -+ do { -+ struct cpumask tmp; -+ -+ if (cpumask_and(&tmp, &chk, affinity_mask) && -+ take_queued_task_cpumask(rq, &tmp, filter_prio)) -+ return 1; -+ } while (++affinity_mask < end); -+ -+ return 0; -+} -+#endif -+ -+static inline struct task_struct * -+choose_next_task(struct rq *rq, int cpu, struct task_struct *prev) -+{ -+ struct task_struct *next = rq_first_queued_task(rq); -+ -+#ifdef CONFIG_SMT_NICE -+ if (cpumask_test_cpu(cpu, &sched_smt_supressed_mask)) { -+ if (next->prio >= IDLE_PRIO) { -+ if (rq->online && -+ take_other_rq_task(rq, cpu, IDLE_PRIO)) -+ return rq_first_queued_task(rq); -+ return rq->idle; -+ } -+ } -+#endif -+ -+#ifdef CONFIG_SMP -+ if (likely(rq->online)) -+ if 
(take_other_rq_task(rq, cpu, next->prio)) { -+ resched_curr(rq); -+ return rq_first_queued_task(rq); -+ } -+#endif -+ return next; -+} -+ -+static inline unsigned long get_preempt_disable_ip(struct task_struct *p) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ return p->preempt_disable_ip; -+#else -+ return 0; -+#endif -+} -+ -+/* -+ * Print scheduling while atomic bug: -+ */ -+static noinline void __schedule_bug(struct task_struct *prev) -+{ -+ /* Save this before calling printk(), since that will clobber it */ -+ unsigned long preempt_disable_ip = get_preempt_disable_ip(current); -+ -+ if (oops_in_progress) -+ return; -+ -+ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", -+ prev->comm, prev->pid, preempt_count()); -+ -+ debug_show_held_locks(prev); -+ print_modules(); -+ if (irqs_disabled()) -+ print_irqtrace_events(prev); -+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) -+ && in_atomic_preempt_off()) { -+ pr_err("Preemption disabled at:"); -+ print_ip_sym(preempt_disable_ip); -+ pr_cont("\n"); -+ } -+ if (panic_on_warn) -+ panic("scheduling while atomic\n"); -+ -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+ -+/* -+ * Various schedule()-time debugging checks and statistics: -+ */ -+static inline void schedule_debug(struct task_struct *prev, bool preempt) -+{ -+#ifdef CONFIG_SCHED_STACK_END_CHECK -+ if (task_stack_end_corrupted(prev)) -+ panic("corrupted stack end detected inside scheduler\n"); -+#endif -+ -+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP -+ if (!preempt && prev->state && prev->non_block_count) { -+ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", -+ prev->comm, prev->pid, prev->non_block_count); -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+ } -+#endif -+ -+ if (unlikely(in_atomic_preempt_off())) { -+ __schedule_bug(prev); -+ preempt_count_set(PREEMPT_DISABLED); -+ } -+ rcu_sleep_check(); -+ -+ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); -+ -+ schedstat_inc(this_rq()->sched_count); -+} -+ 
-+static inline void set_rq_task(struct rq *rq, struct task_struct *p) -+{ -+ p->last_ran = rq->clock_task; -+ -+#ifdef CONFIG_HIGH_RES_TIMERS -+ if (p != rq->idle) -+ hrtick_start(rq, US_TO_NS(p->time_slice)); -+#endif -+ /* update rq->dither */ -+ rq->dither = rq_dither(rq); -+} -+ -+/* -+ * schedule() is the main scheduler function. -+ * -+ * The main means of driving the scheduler and thus entering this function are: -+ * -+ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. -+ * -+ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return -+ * paths. For example, see arch/x86/entry_64.S. -+ * -+ * To drive preemption between tasks, the scheduler sets the flag in timer -+ * interrupt handler scheduler_tick(). -+ * -+ * 3. Wakeups don't really cause entry into schedule(). They add a -+ * task to the run-queue and that's it. -+ * -+ * Now, if the new task added to the run-queue preempts the current -+ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets -+ * called on the nearest possible occasion: -+ * -+ * - If the kernel is preemptible (CONFIG_PREEMPTION=y): -+ * -+ * - in syscall or exception context, at the next outmost -+ * preempt_enable(). (this might be as soon as the wake_up()'s -+ * spin_unlock()!) -+ * -+ * - in IRQ context, return from interrupt-handler to -+ * preemptible context -+ * -+ * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) -+ * then at the next: -+ * -+ * - cond_resched() call -+ * - explicit schedule() call -+ * - return from syscall or exception to user-space -+ * - return from interrupt-handler to user-space -+ * -+ * WARNING: must be called with preemption disabled! 
-+ */ -+static void __sched notrace __schedule(bool preempt) -+{ -+ struct task_struct *prev, *next; -+ unsigned long *switch_count; -+ struct rq *rq; -+ int cpu; -+ -+ cpu = smp_processor_id(); -+ rq = cpu_rq(cpu); -+ prev = rq->curr; -+ -+ schedule_debug(prev, preempt); -+ -+ /* by passing sched_feat(HRTICK) checking which PDS doesn't support */ -+ hrtick_clear(rq); -+ -+ local_irq_disable(); -+ rcu_note_context_switch(preempt); -+ -+ /* -+ * Make sure that signal_pending_state()->signal_pending() below -+ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) -+ * done by the caller to avoid the race with signal_wake_up(). -+ * -+ * The membarrier system call requires a full memory barrier -+ * after coming from user-space, before storing to rq->curr. -+ */ -+ raw_spin_lock(&rq->lock); -+ smp_mb__after_spinlock(); -+ -+ update_rq_clock(rq); -+ -+ switch_count = &prev->nivcsw; -+ if (!preempt && prev->state) { -+ if (signal_pending_state(prev->state, prev)) { -+ prev->state = TASK_RUNNING; -+ } else { -+ deactivate_task(prev, rq); -+ -+ if (prev->in_iowait) { -+ atomic_inc(&rq->nr_iowait); -+ delayacct_blkio_start(); -+ } -+ } -+ switch_count = &prev->nvcsw; -+ } -+ -+ clear_tsk_need_resched(prev); -+ clear_preempt_need_resched(); -+ -+ check_deadline(prev, rq); -+ -+ next = choose_next_task(rq, cpu, prev); -+ -+ set_rq_task(rq, next); -+ -+ if (prev != next) { -+ if (next->prio == PRIO_LIMIT) -+ schedstat_inc(rq->sched_goidle); -+ -+ /* -+ * RCU users of rcu_dereference(rq->curr) may not see -+ * changes to task_struct made by pick_next_task(). -+ */ -+ RCU_INIT_POINTER(rq->curr, next); -+ /* -+ * The membarrier system call requires each architecture -+ * to have a full memory barrier after updating -+ * rq->curr, before returning to user-space. -+ * -+ * Here are the schemes providing that barrier on the -+ * various architectures: -+ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. 
-+ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. -+ * - finish_lock_switch() for weakly-ordered -+ * architectures where spin_unlock is a full barrier, -+ * - switch_to() for arm64 (weakly-ordered, spin_unlock -+ * is a RELEASE barrier), -+ */ -+ ++*switch_count; -+ rq->nr_switches++; -+ -+ trace_sched_switch(preempt, prev, next); -+ -+ /* Also unlocks the rq: */ -+ rq = context_switch(rq, prev, next); -+#ifdef CONFIG_SCHED_SMT -+ pds_sg_balance_check(rq); -+#endif -+ } else -+ raw_spin_unlock_irq(&rq->lock); -+} -+ -+void __noreturn do_task_dead(void) -+{ -+ /* Causes final put_task_struct in finish_task_switch(): */ -+ set_special_state(TASK_DEAD); -+ -+ /* Tell freezer to ignore us: */ -+ current->flags |= PF_NOFREEZE; -+ __schedule(false); -+ -+ BUG(); -+ -+ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ -+ for (;;) -+ cpu_relax(); -+} -+ -+static inline void sched_submit_work(struct task_struct *tsk) -+{ -+ if (!tsk->state || tsk_is_pi_blocked(tsk) || -+ signal_pending_state(tsk->state, tsk)) -+ return; -+ -+ /* -+ * If a worker went to sleep, notify and ask workqueue whether -+ * it wants to wake up a task to maintain concurrency. -+ * As this function is called inside the schedule() context, -+ * we disable preemption to avoid it calling schedule() again -+ * in the possible wakeup of a kworker. -+ */ -+ if (tsk->flags & PF_WQ_WORKER) { -+ preempt_disable(); -+ wq_worker_sleeping(tsk); -+ preempt_enable_no_resched(); -+ } -+ -+ /* -+ * If we are going to sleep and we have plugged IO queued, -+ * make sure to submit it to avoid deadlocks. 
-+ */ -+ if (blk_needs_flush_plug(tsk)) -+ blk_schedule_flush_plug(tsk); -+} -+ -+static void sched_update_worker(struct task_struct *tsk) -+{ -+ if (tsk->flags & PF_WQ_WORKER) -+ wq_worker_running(tsk); -+} -+ -+asmlinkage __visible void __sched schedule(void) -+{ -+ struct task_struct *tsk = current; -+ -+ sched_submit_work(tsk); -+ do { -+ preempt_disable(); -+ __schedule(false); -+ sched_preempt_enable_no_resched(); -+ } while (need_resched()); -+ sched_update_worker(tsk); -+} -+EXPORT_SYMBOL(schedule); -+ -+/* -+ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted -+ * state (have scheduled out non-voluntarily) by making sure that all -+ * tasks have either left the run queue or have gone into user space. -+ * As idle tasks do not do either, they must not ever be preempted -+ * (schedule out non-voluntarily). -+ * -+ * schedule_idle() is similar to schedule_preempt_disable() except that it -+ * never enables preemption because it does not call sched_submit_work(). -+ */ -+void __sched schedule_idle(void) -+{ -+ /* -+ * As this skips calling sched_submit_work(), which the idle task does -+ * regardless because that function is a nop when the task is in a -+ * TASK_RUNNING state, make sure this isn't used someplace that the -+ * current task can be in any other state. Note, idle is always in the -+ * TASK_RUNNING state. -+ */ -+ WARN_ON_ONCE(current->state); -+ do { -+ __schedule(false); -+ } while (need_resched()); -+} -+ -+#ifdef CONFIG_CONTEXT_TRACKING -+asmlinkage __visible void __sched schedule_user(void) -+{ -+ /* -+ * If we come here after a random call to set_need_resched(), -+ * or we have been woken up remotely but the IPI has not yet arrived, -+ * we haven't yet exited the RCU idle mode. Do it here manually until -+ * we find a better solution. -+ * -+ * NB: There are buggy callers of this function. Ideally we -+ * should warn if prev_state != CONTEXT_USER, but that will trigger -+ * too frequently to make sense yet. 
-+ */ -+ enum ctx_state prev_state = exception_enter(); -+ schedule(); -+ exception_exit(prev_state); -+} -+#endif -+ -+/** -+ * schedule_preempt_disabled - called with preemption disabled -+ * -+ * Returns with preemption disabled. Note: preempt_count must be 1 -+ */ -+void __sched schedule_preempt_disabled(void) -+{ -+ sched_preempt_enable_no_resched(); -+ schedule(); -+ preempt_disable(); -+} -+ -+static void __sched notrace preempt_schedule_common(void) -+{ -+ do { -+ /* -+ * Because the function tracer can trace preempt_count_sub() -+ * and it also uses preempt_enable/disable_notrace(), if -+ * NEED_RESCHED is set, the preempt_enable_notrace() called -+ * by the function tracer will call this function again and -+ * cause infinite recursion. -+ * -+ * Preemption must be disabled here before the function -+ * tracer can trace. Break up preempt_disable() into two -+ * calls. One to disable preemption without fear of being -+ * traced. The other to still record the preemption latency, -+ * which can also be traced by the function tracer. -+ */ -+ preempt_disable_notrace(); -+ preempt_latency_start(1); -+ __schedule(true); -+ preempt_latency_stop(1); -+ preempt_enable_no_resched_notrace(); -+ -+ /* -+ * Check again in case we missed a preemption opportunity -+ * between schedule and now. -+ */ -+ } while (need_resched()); -+} -+ -+#ifdef CONFIG_PREEMPTION -+/* -+ * This is the entry point to schedule() from in-kernel preemption -+ * off of preempt_enable. -+ */ -+asmlinkage __visible void __sched notrace preempt_schedule(void) -+{ -+ /* -+ * If there is a non-zero preempt_count or interrupts are disabled, -+ * we do not want to preempt the current task. Just return.. 
-+ */ -+ if (likely(!preemptible())) -+ return; -+ -+ preempt_schedule_common(); -+} -+NOKPROBE_SYMBOL(preempt_schedule); -+EXPORT_SYMBOL(preempt_schedule); -+ -+/** -+ * preempt_schedule_notrace - preempt_schedule called by tracing -+ * -+ * The tracing infrastructure uses preempt_enable_notrace to prevent -+ * recursion and tracing preempt enabling caused by the tracing -+ * infrastructure itself. But as tracing can happen in areas coming -+ * from userspace or just about to enter userspace, a preempt enable -+ * can occur before user_exit() is called. This will cause the scheduler -+ * to be called when the system is still in usermode. -+ * -+ * To prevent this, the preempt_enable_notrace will use this function -+ * instead of preempt_schedule() to exit user context if needed before -+ * calling the scheduler. -+ */ -+asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) -+{ -+ enum ctx_state prev_ctx; -+ -+ if (likely(!preemptible())) -+ return; -+ -+ do { -+ /* -+ * Because the function tracer can trace preempt_count_sub() -+ * and it also uses preempt_enable/disable_notrace(), if -+ * NEED_RESCHED is set, the preempt_enable_notrace() called -+ * by the function tracer will call this function again and -+ * cause infinite recursion. -+ * -+ * Preemption must be disabled here before the function -+ * tracer can trace. Break up preempt_disable() into two -+ * calls. One to disable preemption without fear of being -+ * traced. The other to still record the preemption latency, -+ * which can also be traced by the function tracer. -+ */ -+ preempt_disable_notrace(); -+ preempt_latency_start(1); -+ /* -+ * Needs preempt disabled in case user_exit() is traced -+ * and the tracer calls preempt_enable_notrace() causing -+ * an infinite recursion. 
-+ */ -+ prev_ctx = exception_enter(); -+ __schedule(true); -+ exception_exit(prev_ctx); -+ -+ preempt_latency_stop(1); -+ preempt_enable_no_resched_notrace(); -+ } while (need_resched()); -+} -+EXPORT_SYMBOL_GPL(preempt_schedule_notrace); -+ -+#endif /* CONFIG_PREEMPTION */ -+ -+/* -+ * This is the entry point to schedule() from kernel preemption -+ * off of irq context. -+ * Note, that this is called and return with irqs disabled. This will -+ * protect us against recursive calling from irq. -+ */ -+asmlinkage __visible void __sched preempt_schedule_irq(void) -+{ -+ enum ctx_state prev_state; -+ -+ /* Catch callers which need to be fixed */ -+ BUG_ON(preempt_count() || !irqs_disabled()); -+ -+ prev_state = exception_enter(); -+ -+ do { -+ preempt_disable(); -+ local_irq_enable(); -+ __schedule(true); -+ local_irq_disable(); -+ sched_preempt_enable_no_resched(); -+ } while (need_resched()); -+ -+ exception_exit(prev_state); -+} -+ -+int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, -+ void *key) -+{ -+ return try_to_wake_up(curr->private, mode, wake_flags); -+} -+EXPORT_SYMBOL(default_wake_function); -+ -+static inline void -+check_task_changed(struct rq *rq, struct task_struct *p) -+{ -+ /* -+ * Trigger changes when task priority/deadline modified. 
-+ */ -+ if (task_on_rq_queued(p)) { -+ struct task_struct *first; -+ -+ requeue_task(p, rq); -+ -+ /* Resched if first queued task not running and not IDLE */ -+ if ((first = rq_first_queued_task(rq)) != rq->curr && -+ !task_running_idle(first)) -+ resched_curr(rq); -+ } -+} -+ -+#ifdef CONFIG_RT_MUTEXES -+ -+static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) -+{ -+ if (pi_task) -+ prio = min(prio, pi_task->prio); -+ -+ return prio; -+} -+ -+static inline int rt_effective_prio(struct task_struct *p, int prio) -+{ -+ struct task_struct *pi_task = rt_mutex_get_top_task(p); -+ -+ return __rt_effective_prio(pi_task, prio); -+} -+ -+/* -+ * rt_mutex_setprio - set the current priority of a task -+ * @p: task to boost -+ * @pi_task: donor task -+ * -+ * This function changes the 'effective' priority of a task. It does -+ * not touch ->normal_prio like __setscheduler(). -+ * -+ * Used by the rt_mutex code to implement priority inheritance -+ * logic. Call site only calls if the priority of the task changed. -+ */ -+void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) -+{ -+ int prio; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ -+ /* XXX used to be waiter->prio, not waiter->task->prio */ -+ prio = __rt_effective_prio(pi_task, p->normal_prio); -+ -+ /* -+ * If nothing changed; bail early. -+ */ -+ if (p->pi_top_task == pi_task && prio == p->prio) -+ return; -+ -+ rq = __task_access_lock(p, &lock); -+ /* -+ * Set under pi_lock && rq->lock, such that the value can be used under -+ * either lock. -+ * -+ * Note that there is loads of tricky to make this pointer cache work -+ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to -+ * ensure a task is de-boosted (pi_task is set to NULL) before the -+ * task is allowed to run again (and can exit). This ensures the pointer -+ * points to a blocked task -- which guaratees the task is present. 
-+ */ -+ p->pi_top_task = pi_task; -+ -+ /* -+ * For FIFO/RR we only need to set prio, if that matches we're done. -+ */ -+ if (prio == p->prio) -+ goto out_unlock; -+ -+ /* -+ * Idle task boosting is a nono in general. There is one -+ * exception, when PREEMPT_RT and NOHZ is active: -+ * -+ * The idle task calls get_next_timer_interrupt() and holds -+ * the timer wheel base->lock on the CPU and another CPU wants -+ * to access the timer (probably to cancel it). We can safely -+ * ignore the boosting request, as the idle CPU runs this code -+ * with interrupts disabled and will complete the lock -+ * protected section without being interrupted. So there is no -+ * real need to boost. -+ */ -+ if (unlikely(p == rq->idle)) { -+ WARN_ON(p != rq->curr); -+ WARN_ON(p->pi_blocked_on); -+ goto out_unlock; -+ } -+ -+ trace_sched_pi_setprio(p, pi_task); -+ p->prio = prio; -+ update_task_priodl(p); -+ -+ check_task_changed(rq, p); -+ -+out_unlock: -+ __task_access_unlock(p, lock); -+} -+#else -+static inline int rt_effective_prio(struct task_struct *p, int prio) -+{ -+ return prio; -+} -+#endif -+ -+void set_user_nice(struct task_struct *p, long nice) -+{ -+ int new_static; -+ unsigned long flags; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ -+ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) -+ return; -+ new_static = NICE_TO_PRIO(nice); -+ /* -+ * We have to be careful, if called from sys_setpriority(), -+ * the task might be in the middle of scheduling on another CPU. -+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ rq = __task_access_lock(p, &lock); -+ -+ /* rq lock may not held!! 
*/ -+ update_rq_clock(rq); -+ -+ p->static_prio = new_static; -+ /* -+ * The RT priorities are set via sched_setscheduler(), but we still -+ * allow the 'normal' nice value to be set - but as expected -+ * it wont have any effect on scheduling until the task is -+ * not SCHED_NORMAL/SCHED_BATCH: -+ */ -+ if (task_has_rt_policy(p)) -+ goto out_unlock; -+ -+ p->deadline -= task_deadline_diff(p); -+ p->deadline += static_deadline_diff(new_static); -+ p->prio = effective_prio(p); -+ update_task_priodl(p); -+ -+ check_task_changed(rq, p); -+out_unlock: -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+} -+EXPORT_SYMBOL(set_user_nice); -+ -+/* -+ * can_nice - check if a task can reduce its nice value -+ * @p: task -+ * @nice: nice value -+ */ -+int can_nice(const struct task_struct *p, const int nice) -+{ -+ /* Convert nice value [19,-20] to rlimit style value [1,40] */ -+ int nice_rlim = nice_to_rlimit(nice); -+ -+ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || -+ capable(CAP_SYS_NICE)); -+} -+ -+#ifdef __ARCH_WANT_SYS_NICE -+ -+/* -+ * sys_nice - change the priority of the current process. -+ * @increment: priority increment -+ * -+ * sys_setpriority is a more generic, but much slower function that -+ * does similar things. -+ */ -+SYSCALL_DEFINE1(nice, int, increment) -+{ -+ long nice, retval; -+ -+ /* -+ * Setpriority might change our priority at the same moment. -+ * We don't have to worry. Conceptually one call occurs first -+ * and we have a single winner. -+ */ -+ -+ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); -+ nice = task_nice(current) + increment; -+ -+ nice = clamp_val(nice, MIN_NICE, MAX_NICE); -+ if (increment < 0 && !can_nice(current, nice)) -+ return -EPERM; -+ -+ retval = security_task_setnice(current, nice); -+ if (retval) -+ return retval; -+ -+ set_user_nice(current, nice); -+ return 0; -+} -+ -+#endif -+ -+/** -+ * task_prio - return the priority value of a given task. 
-+ * @p: the task in question. -+ * -+ * Return: The priority value as seen by users in /proc. -+ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes -+ * from 0(SCHED_ISO) up to 82 (nice +19 SCHED_IDLE). -+ */ -+int task_prio(const struct task_struct *p) -+{ -+ int level, prio = p->prio - MAX_RT_PRIO; -+ static const int level_to_nice_prio[] = {39, 33, 26, 20, 14, 7, 0, 0}; -+ -+ /* rt tasks */ -+ if (prio <= 0) -+ goto out; -+ -+ preempt_disable(); -+ level = task_deadline_level(p, this_rq()); -+ preempt_enable(); -+ prio += level_to_nice_prio[level]; -+ if (idleprio_task(p)) -+ prio += NICE_WIDTH; -+out: -+ return prio; -+} -+ -+/** -+ * idle_cpu - is a given CPU idle currently? -+ * @cpu: the processor in question. -+ * -+ * Return: 1 if the CPU is currently idle. 0 otherwise. -+ */ -+int idle_cpu(int cpu) -+{ -+ return cpu_curr(cpu) == cpu_rq(cpu)->idle; -+} -+ -+/** -+ * idle_task - return the idle task for a given CPU. -+ * @cpu: the processor in question. -+ * -+ * Return: The idle task for the cpu @cpu. -+ */ -+struct task_struct *idle_task(int cpu) -+{ -+ return cpu_rq(cpu)->idle; -+} -+ -+/** -+ * find_process_by_pid - find a process with a matching PID value. -+ * @pid: the pid in question. -+ * -+ * The task of @pid, if found. %NULL otherwise. -+ */ -+static inline struct task_struct *find_process_by_pid(pid_t pid) -+{ -+ return pid ? find_task_by_vpid(pid) : current; -+} -+ -+#ifdef CONFIG_SMP -+void sched_set_stop_task(int cpu, struct task_struct *stop) -+{ -+ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; -+ struct sched_param start_param = { .sched_priority = 0 }; -+ struct task_struct *old_stop = cpu_rq(cpu)->stop; -+ -+ if (stop) { -+ /* -+ * Make it appear like a SCHED_FIFO task, its something -+ * userspace knows about and won't get confused about. -+ * -+ * Also, it will make PI more or less work without too -+ * much confusion -- but then, stop work should not -+ * rely on PI working anyway. 
-+ */ -+ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); -+ } -+ -+ cpu_rq(cpu)->stop = stop; -+ -+ if (old_stop) { -+ /* -+ * Reset it back to a normal scheduling policy so that -+ * it can die in pieces. -+ */ -+ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); -+ } -+} -+ -+/* -+ * Change a given task's CPU affinity. Migrate the thread to a -+ * proper CPU and schedule it away if the CPU it's executing on -+ * is removed from the allowed bitmask. -+ * -+ * NOTE: the caller must have a valid reference to the task, the -+ * task must not exit() & deallocate itself prematurely. The -+ * call is not atomic; no spinlocks may be held. -+ */ -+static int __set_cpus_allowed_ptr(struct task_struct *p, -+ const struct cpumask *new_mask, bool check) -+{ -+ const struct cpumask *cpu_valid_mask = cpu_active_mask; -+ int dest_cpu; -+ unsigned long flags; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ int ret = 0; -+ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ rq = __task_access_lock(p, &lock); -+ -+ if (p->flags & PF_KTHREAD) { -+ /* -+ * Kernel threads are allowed on online && !active CPUs -+ */ -+ cpu_valid_mask = cpu_online_mask; -+ } -+ -+ /* -+ * Must re-check here, to close a race against __kthread_bind(), -+ * sched_setaffinity() is not guaranteed to observe the flag. -+ */ -+ if (check && (p->flags & PF_NO_SETAFFINITY)) { -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ if (cpumask_equal(&p->cpus_mask, new_mask)) -+ goto out; -+ -+ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); -+ if (dest_cpu >= nr_cpu_ids) { -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ do_set_cpus_allowed(p, new_mask); -+ -+ if (p->flags & PF_KTHREAD) { -+ /* -+ * For kernel threads that do indeed end up on online && -+ * !active we want to ensure they are strict per-CPU threads. 
-+ */ -+ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && -+ !cpumask_intersects(new_mask, cpu_active_mask) && -+ p->nr_cpus_allowed != 1); -+ } -+ -+ /* Can the task run on the task's current CPU? If so, we're done */ -+ if (cpumask_test_cpu(task_cpu(p), new_mask)) -+ goto out; -+ -+ if (task_running(p) || p->state == TASK_WAKING) { -+ struct migration_arg arg = { p, dest_cpu }; -+ -+ /* Need help from migration thread: drop lock and wait. */ -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); -+ return 0; -+ } -+ if (task_on_rq_queued(p)) { -+ /* -+ * OK, since we're going to drop the lock immediately -+ * afterwards anyway. -+ */ -+ update_rq_clock(rq); -+ rq = move_queued_task(rq, p, dest_cpu); -+ lock = &rq->lock; -+ } -+ -+out: -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+ return ret; -+} -+ -+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ return __set_cpus_allowed_ptr(p, new_mask, false); -+} -+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); -+ -+#else -+static inline int -+__set_cpus_allowed_ptr(struct task_struct *p, -+ const struct cpumask *new_mask, bool check) -+{ -+ return set_cpus_allowed_ptr(p, new_mask); -+} -+#endif -+ -+static u64 task_init_deadline(const struct task_struct *p) -+{ -+ return task_rq(p)->clock + task_deadline_diff(p); -+} -+ -+u64 (* task_init_deadline_func_tbl[])(const struct task_struct *p) = { -+ task_init_deadline, /* SCHED_NORMAL */ -+ NULL, /* SCHED_FIFO */ -+ NULL, /* SCHED_RR */ -+ task_init_deadline, /* SCHED_BATCH */ -+ NULL, /* SCHED_ISO */ -+ task_init_deadline /* SCHED_IDLE */ -+}; -+ -+/* -+ * sched_setparam() passes in -1 for its policy, to let the functions -+ * it calls know not to change it. 
-+ */ -+#define SETPARAM_POLICY -1 -+ -+static void __setscheduler_params(struct task_struct *p, -+ const struct sched_attr *attr) -+{ -+ int old_policy = p->policy; -+ int policy = attr->sched_policy; -+ -+ if (policy == SETPARAM_POLICY) -+ policy = p->policy; -+ -+ p->policy = policy; -+ -+ /* -+ * allow normal nice value to be set, but will not have any -+ * effect on scheduling until the task not SCHED_NORMAL/ -+ * SCHED_BATCH -+ */ -+ p->static_prio = NICE_TO_PRIO(attr->sched_nice); -+ -+ /* -+ * __sched_setscheduler() ensures attr->sched_priority == 0 when -+ * !rt_policy. Always setting this ensures that things like -+ * getparam()/getattr() don't report silly values for !rt tasks. -+ */ -+ p->rt_priority = attr->sched_priority; -+ p->normal_prio = normal_prio(p); -+ -+ if (old_policy != policy) -+ p->deadline = (task_init_deadline_func_tbl[p->policy])? -+ task_init_deadline_func_tbl[p->policy](p):0ULL; -+} -+ -+/* Actually do priority change: must hold rq lock. */ -+static void __setscheduler(struct rq *rq, struct task_struct *p, -+ const struct sched_attr *attr, bool keep_boost) -+{ -+ __setscheduler_params(p, attr); -+ -+ /* -+ * Keep a potential priority boosting if called from -+ * sched_setscheduler(). 
-+ */ -+ p->prio = normal_prio(p); -+ if (keep_boost) -+ p->prio = rt_effective_prio(p, p->prio); -+ update_task_priodl(p); -+} -+ -+/* -+ * check the target process has a UID that matches the current process's -+ */ -+static bool check_same_owner(struct task_struct *p) -+{ -+ const struct cred *cred = current_cred(), *pcred; -+ bool match; -+ -+ rcu_read_lock(); -+ pcred = __task_cred(p); -+ match = (uid_eq(cred->euid, pcred->euid) || -+ uid_eq(cred->euid, pcred->uid)); -+ rcu_read_unlock(); -+ return match; -+} -+ -+static int -+__sched_setscheduler(struct task_struct *p, -+ const struct sched_attr *attr, bool user, bool pi) -+{ -+ const struct sched_attr dl_squash_attr = { -+ .size = sizeof(struct sched_attr), -+ .sched_policy = SCHED_FIFO, -+ .sched_nice = 0, -+ .sched_priority = 99, -+ }; -+ int newprio = MAX_RT_PRIO - 1 - attr->sched_priority; -+ int retval, oldpolicy = -1; -+ int policy = attr->sched_policy; -+ unsigned long flags; -+ struct rq *rq; -+ int reset_on_fork; -+ raw_spinlock_t *lock; -+ -+ /* The pi code expects interrupts enabled */ -+ BUG_ON(pi && in_interrupt()); -+ -+ /* -+ * PDS supports SCHED_DEADLINE by squash it as prio 0 SCHED_FIFO -+ */ -+ if (unlikely(SCHED_DEADLINE == policy)) { -+ attr = &dl_squash_attr; -+ policy = attr->sched_policy; -+ newprio = MAX_RT_PRIO - 1 - attr->sched_priority; -+ } -+recheck: -+ /* Double check policy once rq lock held */ -+ if (policy < 0) { -+ reset_on_fork = p->sched_reset_on_fork; -+ policy = oldpolicy = p->policy; -+ } else { -+ reset_on_fork = !!(attr->sched_flags & SCHED_RESET_ON_FORK); -+ -+ if (policy > SCHED_IDLE) -+ return -EINVAL; -+ } -+ -+ if (attr->sched_flags & ~(SCHED_FLAG_ALL)) -+ return -EINVAL; -+ -+ /* -+ * Valid priorities for SCHED_FIFO and SCHED_RR are -+ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and -+ * SCHED_BATCH and SCHED_IDLE is 0. 
-+ */ -+ if (attr->sched_priority < 0 || -+ (p->mm && attr->sched_priority > MAX_USER_RT_PRIO - 1) || -+ (!p->mm && attr->sched_priority > MAX_RT_PRIO - 1)) -+ return -EINVAL; -+ if ((SCHED_RR == policy || SCHED_FIFO == policy) != -+ (attr->sched_priority != 0)) -+ return -EINVAL; -+ -+ /* -+ * Allow unprivileged RT tasks to decrease priority: -+ */ -+ if (user && !capable(CAP_SYS_NICE)) { -+ if (SCHED_FIFO == policy || SCHED_RR == policy) { -+ unsigned long rlim_rtprio = -+ task_rlimit(p, RLIMIT_RTPRIO); -+ -+ /* Can't set/change the rt policy */ -+ if (policy != p->policy && !rlim_rtprio) -+ return -EPERM; -+ -+ /* Can't increase priority */ -+ if (attr->sched_priority > p->rt_priority && -+ attr->sched_priority > rlim_rtprio) -+ return -EPERM; -+ } -+ -+ /* Can't change other user's priorities */ -+ if (!check_same_owner(p)) -+ return -EPERM; -+ -+ /* Normal users shall not reset the sched_reset_on_fork flag */ -+ if (p->sched_reset_on_fork && !reset_on_fork) -+ return -EPERM; -+ } -+ -+ if (user) { -+ retval = security_task_setscheduler(p); -+ if (retval) -+ return retval; -+ } -+ -+ if (pi) -+ cpuset_read_lock(); -+ -+ /* -+ * Make sure no PI-waiters arrive (or leave) while we are -+ * changing the priority of the task: -+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ -+ /* -+ * To be able to change p->policy safely, task_access_lock() -+ * must be called. -+ * IF use task_access_lock() here: -+ * For the task p which is not running, reading rq->stop is -+ * racy but acceptable as ->stop doesn't change much. -+ * An enhancemnet can be made to read rq->stop saftly. 
-+ */ -+ rq = __task_access_lock(p, &lock); -+ -+ /* -+ * Changing the policy of the stop threads its a very bad idea -+ */ -+ if (p == rq->stop) { -+ retval = -EINVAL; -+ goto unlock; -+ } -+ -+ /* -+ * If not changing anything there's no need to proceed further: -+ */ -+ if (unlikely(policy == p->policy)) { -+ if (rt_policy(policy) && attr->sched_priority != p->rt_priority) -+ goto change; -+ if (!rt_policy(policy) && -+ NICE_TO_PRIO(attr->sched_nice) != p->static_prio) -+ goto change; -+ -+ p->sched_reset_on_fork = reset_on_fork; -+ retval = 0; -+ goto unlock; -+ } -+change: -+ -+ /* Re-check policy now with rq lock held */ -+ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { -+ policy = oldpolicy = -1; -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ if (pi) -+ cpuset_read_unlock(); -+ goto recheck; -+ } -+ -+ p->sched_reset_on_fork = reset_on_fork; -+ -+ if (pi) { -+ /* -+ * Take priority boosted tasks into account. If the new -+ * effective priority is unchanged, we just store the new -+ * normal parameters and do not touch the scheduler class and -+ * the runqueue. This will be done when the task deboost -+ * itself. 
-+ */ -+ if (rt_effective_prio(p, newprio) == p->prio) { -+ __setscheduler_params(p, attr); -+ retval = 0; -+ goto unlock; -+ } -+ } -+ -+ __setscheduler(rq, p, attr, pi); -+ -+ check_task_changed(rq, p); -+ -+ /* Avoid rq from going away on us: */ -+ preempt_disable(); -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+ if (pi) { -+ cpuset_read_unlock(); -+ rt_mutex_adjust_pi(p); -+ } -+ -+ preempt_enable(); -+ -+ return 0; -+ -+unlock: -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ if (pi) -+ cpuset_read_unlock(); -+ return retval; -+} -+ -+static int _sched_setscheduler(struct task_struct *p, int policy, -+ const struct sched_param *param, bool check) -+{ -+ struct sched_attr attr = { -+ .sched_policy = policy, -+ .sched_priority = param->sched_priority, -+ .sched_nice = PRIO_TO_NICE(p->static_prio), -+ }; -+ -+ /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ -+ if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { -+ attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; -+ policy &= ~SCHED_RESET_ON_FORK; -+ attr.sched_policy = policy; -+ } -+ -+ return __sched_setscheduler(p, &attr, check, true); -+} -+ -+/** -+ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. -+ * @p: the task in question. -+ * @policy: new policy. -+ * @param: structure containing the new RT priority. -+ * -+ * Return: 0 on success. An error code otherwise. -+ * -+ * NOTE that the task may be already dead. 
-+ */ -+int sched_setscheduler(struct task_struct *p, int policy, -+ const struct sched_param *param) -+{ -+ return _sched_setscheduler(p, policy, param, true); -+} -+ -+EXPORT_SYMBOL_GPL(sched_setscheduler); -+ -+int sched_setattr(struct task_struct *p, const struct sched_attr *attr) -+{ -+ return __sched_setscheduler(p, attr, true, true); -+} -+EXPORT_SYMBOL_GPL(sched_setattr); -+ -+int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) -+{ -+ return __sched_setscheduler(p, attr, false, true); -+} -+ -+/** -+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. -+ * @p: the task in question. -+ * @policy: new policy. -+ * @param: structure containing the new RT priority. -+ * -+ * Just like sched_setscheduler, only don't bother checking if the -+ * current context has permission. For example, this is needed in -+ * stop_machine(): we create temporary high priority worker threads, -+ * but our caller might not have that capability. -+ * -+ * Return: 0 on success. An error code otherwise. -+ */ -+int sched_setscheduler_nocheck(struct task_struct *p, int policy, -+ const struct sched_param *param) -+{ -+ return _sched_setscheduler(p, policy, param, false); -+} -+EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); -+ -+static int -+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) -+{ -+ struct sched_param lparam; -+ struct task_struct *p; -+ int retval; -+ -+ if (!param || pid < 0) -+ return -EINVAL; -+ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) -+ return -EFAULT; -+ -+ rcu_read_lock(); -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (likely(p)) -+ get_task_struct(p); -+ rcu_read_unlock(); -+ -+ if (likely(p)) { -+ retval = sched_setscheduler(p, policy, &lparam); -+ put_task_struct(p); -+ } -+ -+ return retval; -+} -+ -+/* -+ * Mimics kernel/events/core.c perf_copy_attr(). 
-+ */ -+static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr) -+{ -+ u32 size; -+ int ret; -+ -+ /* Zero the full structure, so that a short copy will be nice: */ -+ memset(attr, 0, sizeof(*attr)); -+ -+ ret = get_user(size, &uattr->size); -+ if (ret) -+ return ret; -+ -+ /* ABI compatibility quirk: */ -+ if (!size) -+ size = SCHED_ATTR_SIZE_VER0; -+ -+ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) -+ goto err_size; -+ -+ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); -+ if (ret) { -+ if (ret == -E2BIG) -+ goto err_size; -+ return ret; -+ } -+ -+ /* -+ * XXX: Do we want to be lenient like existing syscalls; or do we want -+ * to be strict and return an error on out-of-bounds values? -+ */ -+ attr->sched_nice = clamp(attr->sched_nice, -20, 19); -+ -+ /* sched/core.c uses zero here but we already know ret is zero */ -+ return 0; -+ -+err_size: -+ put_user(sizeof(*attr), &uattr->size); -+ return -E2BIG; -+} -+ -+/** -+ * sys_sched_setscheduler - set/change the scheduler policy and RT priority -+ * @pid: the pid in question. -+ * @policy: new policy. -+ * -+ * Return: 0 on success. An error code otherwise. -+ * @param: structure containing the new RT priority. -+ */ -+SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) -+{ -+ if (policy < 0) -+ return -EINVAL; -+ -+ return do_sched_setscheduler(pid, policy, param); -+} -+ -+/** -+ * sys_sched_setparam - set/change the RT priority of a thread -+ * @pid: the pid in question. -+ * @param: structure containing the new RT priority. -+ * -+ * Return: 0 on success. An error code otherwise. -+ */ -+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) -+{ -+ return do_sched_setscheduler(pid, SETPARAM_POLICY, param); -+} -+ -+/** -+ * sys_sched_setattr - same as above, but with extended sched_attr -+ * @pid: the pid in question. -+ * @uattr: structure containing the extended parameters. 
-+ */ -+SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, -+ unsigned int, flags) -+{ -+ struct sched_attr attr; -+ struct task_struct *p; -+ int retval; -+ -+ if (!uattr || pid < 0 || flags) -+ return -EINVAL; -+ -+ retval = sched_copy_attr(uattr, &attr); -+ if (retval) -+ return retval; -+ -+ if ((int)attr.sched_policy < 0) -+ return -EINVAL; -+ -+ rcu_read_lock(); -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (p != NULL) -+ retval = sched_setattr(p, &attr); -+ rcu_read_unlock(); -+ -+ return retval; -+} -+ -+/** -+ * sys_sched_getscheduler - get the policy (scheduling class) of a thread -+ * @pid: the pid in question. -+ * -+ * Return: On success, the policy of the thread. Otherwise, a negative error -+ * code. -+ */ -+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) -+{ -+ struct task_struct *p; -+ int retval = -EINVAL; -+ -+ if (pid < 0) -+ goto out_nounlock; -+ -+ retval = -ESRCH; -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ if (p) { -+ retval = security_task_getscheduler(p); -+ if (!retval) -+ retval = p->policy; -+ } -+ rcu_read_unlock(); -+ -+out_nounlock: -+ return retval; -+} -+ -+/** -+ * sys_sched_getscheduler - get the RT priority of a thread -+ * @pid: the pid in question. -+ * @param: structure containing the RT priority. -+ * -+ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error -+ * code. 
-+ */ -+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) -+{ -+ struct sched_param lp = { .sched_priority = 0 }; -+ struct task_struct *p; -+ int retval = -EINVAL; -+ -+ if (!param || pid < 0) -+ goto out_nounlock; -+ -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ retval = -ESRCH; -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ if (task_has_rt_policy(p)) -+ lp.sched_priority = p->rt_priority; -+ rcu_read_unlock(); -+ -+ /* -+ * This one might sleep, we cannot do it with a spinlock held ... -+ */ -+ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; -+ -+out_nounlock: -+ return retval; -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+/* -+ * Copy the kernel size attribute structure (which might be larger -+ * than what user-space knows about) to user-space. -+ * -+ * Note that all cases are valid: user-space buffer can be larger or -+ * smaller than the kernel-space buffer. The usual case is that both -+ * have the same size. -+ */ -+static int -+sched_attr_copy_to_user(struct sched_attr __user *uattr, -+ struct sched_attr *kattr, -+ unsigned int usize) -+{ -+ unsigned int ksize = sizeof(*kattr); -+ -+ if (!access_ok(uattr, usize)) -+ return -EFAULT; -+ -+ /* -+ * sched_getattr() ABI forwards and backwards compatibility: -+ * -+ * If usize == ksize then we just copy everything to user-space and all is good. -+ * -+ * If usize < ksize then we only copy as much as user-space has space for, -+ * this keeps ABI compatibility as well. We skip the rest. -+ * -+ * If usize > ksize then user-space is using a newer version of the ABI, -+ * which part the kernel doesn't know about. Just ignore it - tooling can -+ * detect the kernel's knowledge of attributes from the attr->size value -+ * which is set to ksize in this case. 
-+ */ -+ kattr->size = min(usize, ksize); -+ -+ if (copy_to_user(uattr, kattr, kattr->size)) -+ return -EFAULT; -+ -+ return 0; -+} -+ -+/** -+ * sys_sched_getattr - similar to sched_getparam, but with sched_attr -+ * @pid: the pid in question. -+ * @uattr: structure containing the extended parameters. -+ * @usize: sizeof(attr) for fwd/bwd comp. -+ * @flags: for future extension. -+ */ -+SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, -+ unsigned int, usize, unsigned int, flags) -+{ -+ struct sched_attr kattr = { }; -+ struct task_struct *p; -+ int retval; -+ -+ if (!uattr || pid < 0 || usize > PAGE_SIZE || -+ usize < SCHED_ATTR_SIZE_VER0 || flags) -+ return -EINVAL; -+ -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ retval = -ESRCH; -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ kattr.sched_policy = p->policy; -+ if (rt_task(p)) -+ kattr.sched_priority = p->rt_priority; -+ else -+ kattr.sched_nice = task_nice(p); -+ -+#ifdef CONFIG_UCLAMP_TASK -+ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; -+ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; -+#endif -+ -+ rcu_read_unlock(); -+ -+ return sched_attr_copy_to_user(uattr, &kattr, usize); -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) -+{ -+ cpumask_var_t cpus_mask, new_mask; -+ struct task_struct *p; -+ int retval; -+ -+ get_online_cpus(); -+ rcu_read_lock(); -+ -+ p = find_process_by_pid(pid); -+ if (!p) { -+ rcu_read_unlock(); -+ put_online_cpus(); -+ return -ESRCH; -+ } -+ -+ /* Prevent p going away */ -+ get_task_struct(p); -+ rcu_read_unlock(); -+ -+ if (p->flags & PF_NO_SETAFFINITY) { -+ retval = -EINVAL; -+ goto out_put_task; -+ } -+ if (!alloc_cpumask_var(&cpus_mask, GFP_KERNEL)) { -+ retval = -ENOMEM; -+ goto out_put_task; -+ } -+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { -+ retval = -ENOMEM; -+ 
goto out_free_cpus_allowed; -+ } -+ retval = -EPERM; -+ if (!check_same_owner(p)) { -+ rcu_read_lock(); -+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { -+ rcu_read_unlock(); -+ goto out_unlock; -+ } -+ rcu_read_unlock(); -+ } -+ -+ retval = security_task_setscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ cpuset_cpus_allowed(p, cpus_mask); -+ cpumask_and(new_mask, in_mask, cpus_mask); -+again: -+ retval = __set_cpus_allowed_ptr(p, new_mask, true); -+ -+ if (!retval) { -+ cpuset_cpus_allowed(p, cpus_mask); -+ if (!cpumask_subset(new_mask, cpus_mask)) { -+ /* -+ * We must have raced with a concurrent cpuset -+ * update. Just reset the cpus_mask to the -+ * cpuset's cpus_mask -+ */ -+ cpumask_copy(new_mask, cpus_mask); -+ goto again; -+ } -+ } -+out_unlock: -+ free_cpumask_var(new_mask); -+out_free_cpus_allowed: -+ free_cpumask_var(cpus_mask); -+out_put_task: -+ put_task_struct(p); -+ put_online_cpus(); -+ return retval; -+} -+ -+static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, -+ struct cpumask *new_mask) -+{ -+ if (len < cpumask_size()) -+ cpumask_clear(new_mask); -+ else if (len > cpumask_size()) -+ len = cpumask_size(); -+ -+ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; -+} -+ -+/** -+ * sys_sched_setaffinity - set the CPU affinity of a process -+ * @pid: pid of the process -+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr -+ * @user_mask_ptr: user-space pointer to the new CPU mask -+ * -+ * Return: 0 on success. An error code otherwise. 
-+ */ -+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, -+ unsigned long __user *, user_mask_ptr) -+{ -+ cpumask_var_t new_mask; -+ int retval; -+ -+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) -+ return -ENOMEM; -+ -+ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); -+ if (retval == 0) -+ retval = sched_setaffinity(pid, new_mask); -+ free_cpumask_var(new_mask); -+ return retval; -+} -+ -+long sched_getaffinity(pid_t pid, cpumask_t *mask) -+{ -+ struct task_struct *p; -+ raw_spinlock_t *lock; -+ unsigned long flags; -+ int retval; -+ -+ rcu_read_lock(); -+ -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ task_access_lock_irqsave(p, &lock, &flags); -+ cpumask_and(mask, &p->cpus_mask, cpu_active_mask); -+ task_access_unlock_irqrestore(p, lock, &flags); -+ -+out_unlock: -+ rcu_read_unlock(); -+ -+ return retval; -+} -+ -+/** -+ * sys_sched_getaffinity - get the CPU affinity of a process -+ * @pid: pid of the process -+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr -+ * @user_mask_ptr: user-space pointer to hold the current CPU mask -+ * -+ * Return: size of CPU mask copied to user_mask_ptr on success. An -+ * error code otherwise. 
-+ */ -+SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, -+ unsigned long __user *, user_mask_ptr) -+{ -+ int ret; -+ cpumask_var_t mask; -+ -+ if ((len * BITS_PER_BYTE) < nr_cpu_ids) -+ return -EINVAL; -+ if (len & (sizeof(unsigned long)-1)) -+ return -EINVAL; -+ -+ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) -+ return -ENOMEM; -+ -+ ret = sched_getaffinity(pid, mask); -+ if (ret == 0) { -+ unsigned int retlen = min_t(size_t, len, cpumask_size()); -+ -+ if (copy_to_user(user_mask_ptr, mask, retlen)) -+ ret = -EFAULT; -+ else -+ ret = retlen; -+ } -+ free_cpumask_var(mask); -+ -+ return ret; -+} -+ -+/** -+ * sys_sched_yield - yield the current processor to other threads. -+ * -+ * This function yields the current CPU to other tasks. It does this by -+ * scheduling away the current task. If it still has the earliest deadline -+ * it will be scheduled again as the next task. -+ * -+ * Return: 0. -+ */ -+static void do_sched_yield(void) -+{ -+ struct rq *rq; -+ struct rq_flags rf; -+ -+ if (!sched_yield_type) -+ return; -+ -+ rq = this_rq_lock_irq(&rf); -+ -+ if (sched_yield_type > 1) { -+ time_slice_expired(current, rq); -+ requeue_task(current, rq); -+ } -+ schedstat_inc(rq->yld_count); -+ -+ /* -+ * Since we are going to call schedule() anyway, there's -+ * no need to preempt or enable interrupts: -+ */ -+ preempt_disable(); -+ raw_spin_unlock(&rq->lock); -+ sched_preempt_enable_no_resched(); -+ -+ schedule(); -+} -+ -+SYSCALL_DEFINE0(sched_yield) -+{ -+ do_sched_yield(); -+ return 0; -+} -+ -+#ifndef CONFIG_PREEMPTION -+int __sched _cond_resched(void) -+{ -+ if (should_resched(0)) { -+ preempt_schedule_common(); -+ return 1; -+ } -+ rcu_all_qs(); -+ return 0; -+} -+EXPORT_SYMBOL(_cond_resched); -+#endif -+ -+/* -+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, -+ * call schedule, and on return reacquire the lock. -+ * -+ * This works OK both with and without CONFIG_PREEMPTION. 
We do strange low-level -+ * operations here to prevent schedule() from being called twice (once via -+ * spin_unlock(), once by hand). -+ */ -+int __cond_resched_lock(spinlock_t *lock) -+{ -+ int resched = should_resched(PREEMPT_LOCK_OFFSET); -+ int ret = 0; -+ -+ lockdep_assert_held(lock); -+ -+ if (spin_needbreak(lock) || resched) { -+ spin_unlock(lock); -+ if (resched) -+ preempt_schedule_common(); -+ else -+ cpu_relax(); -+ ret = 1; -+ spin_lock(lock); -+ } -+ return ret; -+} -+EXPORT_SYMBOL(__cond_resched_lock); -+ -+/** -+ * yield - yield the current processor to other threads. -+ * -+ * Do not ever use this function, there's a 99% chance you're doing it wrong. -+ * -+ * The scheduler is at all times free to pick the calling task as the most -+ * eligible task to run, if removing the yield() call from your code breaks -+ * it, its already broken. -+ * -+ * Typical broken usage is: -+ * -+ * while (!event) -+ * yield(); -+ * -+ * where one assumes that yield() will let 'the other' process run that will -+ * make event true. If the current task is a SCHED_FIFO task that will never -+ * happen. Never use yield() as a progress guarantee!! -+ * -+ * If you want to use yield() to wait for something, use wait_event(). -+ * If you want to use yield() to be 'nice' for others, use cond_resched(). -+ * If you still want to use yield(), do not! -+ */ -+void __sched yield(void) -+{ -+ set_current_state(TASK_RUNNING); -+ do_sched_yield(); -+} -+EXPORT_SYMBOL(yield); -+ -+/** -+ * yield_to - yield the current processor to another thread in -+ * your thread group, or accelerate that thread toward the -+ * processor it's on. -+ * @p: target task -+ * @preempt: whether task preemption is allowed or not -+ * -+ * It's the caller's job to ensure that the target task struct -+ * can't go away on us before we can do any checks. -+ * -+ * In PDS, yield_to is not supported. -+ * -+ * Return: -+ * true (>0) if we indeed boosted the target task. 
-+ * false (0) if we failed to boost the target. -+ * -ESRCH if there's no task to yield to. -+ */ -+int __sched yield_to(struct task_struct *p, bool preempt) -+{ -+ return 0; -+} -+EXPORT_SYMBOL_GPL(yield_to); -+ -+int io_schedule_prepare(void) -+{ -+ int old_iowait = current->in_iowait; -+ -+ current->in_iowait = 1; -+ blk_schedule_flush_plug(current); -+ -+ return old_iowait; -+} -+ -+void io_schedule_finish(int token) -+{ -+ current->in_iowait = token; -+} -+ -+/* -+ * This task is about to go to sleep on IO. Increment rq->nr_iowait so -+ * that process accounting knows that this is a task in IO wait state. -+ * -+ * But don't do that if it is a deliberate, throttling IO wait (this task -+ * has set its backing_dev_info: the queue against which it should throttle) -+ */ -+ -+long __sched io_schedule_timeout(long timeout) -+{ -+ int token; -+ long ret; -+ -+ token = io_schedule_prepare(); -+ ret = schedule_timeout(timeout); -+ io_schedule_finish(token); -+ -+ return ret; -+} -+EXPORT_SYMBOL(io_schedule_timeout); -+ -+void io_schedule(void) -+{ -+ int token; -+ -+ token = io_schedule_prepare(); -+ schedule(); -+ io_schedule_finish(token); -+} -+EXPORT_SYMBOL(io_schedule); -+ -+/** -+ * sys_sched_get_priority_max - return maximum RT priority. -+ * @policy: scheduling class. -+ * -+ * Return: On success, this syscall returns the maximum -+ * rt_priority that can be used by a given scheduling class. -+ * On failure, a negative error code is returned. -+ */ -+SYSCALL_DEFINE1(sched_get_priority_max, int, policy) -+{ -+ int ret = -EINVAL; -+ -+ switch (policy) { -+ case SCHED_FIFO: -+ case SCHED_RR: -+ ret = MAX_USER_RT_PRIO-1; -+ break; -+ case SCHED_NORMAL: -+ case SCHED_BATCH: -+ case SCHED_ISO: -+ case SCHED_IDLE: -+ ret = 0; -+ break; -+ } -+ return ret; -+} -+ -+/** -+ * sys_sched_get_priority_min - return minimum RT priority. -+ * @policy: scheduling class. 
-+ * -+ * Return: On success, this syscall returns the minimum -+ * rt_priority that can be used by a given scheduling class. -+ * On failure, a negative error code is returned. -+ */ -+SYSCALL_DEFINE1(sched_get_priority_min, int, policy) -+{ -+ int ret = -EINVAL; -+ -+ switch (policy) { -+ case SCHED_FIFO: -+ case SCHED_RR: -+ ret = 1; -+ break; -+ case SCHED_NORMAL: -+ case SCHED_BATCH: -+ case SCHED_ISO: -+ case SCHED_IDLE: -+ ret = 0; -+ break; -+ } -+ return ret; -+} -+ -+static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) -+{ -+ struct task_struct *p; -+ int retval; -+ -+ if (pid < 0) -+ return -EINVAL; -+ -+ retval = -ESRCH; -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ rcu_read_unlock(); -+ -+ *t = ns_to_timespec64(MS_TO_NS(rr_interval)); -+ return 0; -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+/** -+ * sys_sched_rr_get_interval - return the default timeslice of a process. -+ * @pid: pid of the process. -+ * @interval: userspace pointer to the timeslice value. -+ * -+ * -+ * Return: On success, 0 and the timeslice is in @interval. Otherwise, -+ * an error code. 
-+ */ -+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, -+ struct __kernel_timespec __user *, interval) -+{ -+ struct timespec64 t; -+ int retval = sched_rr_get_interval(pid, &t); -+ -+ if (retval == 0) -+ retval = put_timespec64(&t, interval); -+ -+ return retval; -+} -+ -+#ifdef CONFIG_COMPAT_32BIT_TIME -+SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, -+ struct old_timespec32 __user *, interval) -+{ -+ struct timespec64 t; -+ int retval = sched_rr_get_interval(pid, &t); -+ -+ if (retval == 0) -+ retval = put_old_timespec32(&t, interval); -+ return retval; -+} -+#endif -+ -+void sched_show_task(struct task_struct *p) -+{ -+ unsigned long free = 0; -+ int ppid; -+ -+ if (!try_get_task_stack(p)) -+ return; -+ -+ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); -+ -+ if (p->state == TASK_RUNNING) -+ printk(KERN_CONT " running task "); -+#ifdef CONFIG_DEBUG_STACK_USAGE -+ free = stack_not_used(p); -+#endif -+ ppid = 0; -+ rcu_read_lock(); -+ if (pid_alive(p)) -+ ppid = task_pid_nr(rcu_dereference(p->real_parent)); -+ rcu_read_unlock(); -+ printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, -+ task_pid_nr(p), ppid, -+ (unsigned long)task_thread_info(p)->flags); -+ -+ print_worker_info(KERN_INFO, p); -+ show_stack(p, NULL); -+ put_task_stack(p); -+} -+EXPORT_SYMBOL_GPL(sched_show_task); -+ -+static inline bool -+state_filter_match(unsigned long state_filter, struct task_struct *p) -+{ -+ /* no filter, everything matches */ -+ if (!state_filter) -+ return true; -+ -+ /* filter, but doesn't match */ -+ if (!(p->state & state_filter)) -+ return false; -+ -+ /* -+ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows -+ * TASK_KILLABLE). 
-+ */ -+ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) -+ return false; -+ -+ return true; -+} -+ -+ -+void show_state_filter(unsigned long state_filter) -+{ -+ struct task_struct *g, *p; -+ -+#if BITS_PER_LONG == 32 -+ printk(KERN_INFO -+ " task PC stack pid father\n"); -+#else -+ printk(KERN_INFO -+ " task PC stack pid father\n"); -+#endif -+ rcu_read_lock(); -+ for_each_process_thread(g, p) { -+ /* -+ * reset the NMI-timeout, listing all files on a slow -+ * console might take a lot of time: -+ * Also, reset softlockup watchdogs on all CPUs, because -+ * another CPU might be blocked waiting for us to process -+ * an IPI. -+ */ -+ touch_nmi_watchdog(); -+ touch_all_softlockup_watchdogs(); -+ if (state_filter_match(state_filter, p)) -+ sched_show_task(p); -+ } -+ -+#ifdef CONFIG_SCHED_DEBUG -+ /* PDS TODO: should support this -+ if (!state_filter) -+ sysrq_sched_debug_show(); -+ */ -+#endif -+ rcu_read_unlock(); -+ /* -+ * Only show locks if all tasks are dumped: -+ */ -+ if (!state_filter) -+ debug_show_all_locks(); -+} -+ -+void dump_cpu_task(int cpu) -+{ -+ pr_info("Task dump for CPU %d:\n", cpu); -+ sched_show_task(cpu_curr(cpu)); -+} -+ -+/** -+ * init_idle - set up an idle thread for a given CPU -+ * @idle: task in question -+ * @cpu: cpu the idle task belongs to -+ * -+ * NOTE: this function does not set the idle thread's NEED_RESCHED -+ * flag, to make booting more robust. 
-+ */ -+void init_idle(struct task_struct *idle, int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ raw_spin_lock_irqsave(&idle->pi_lock, flags); -+ raw_spin_lock(&rq->lock); -+ update_rq_clock(rq); -+ -+ idle->last_ran = rq->clock_task; -+ idle->state = TASK_RUNNING; -+ idle->flags |= PF_IDLE; -+ /* Setting prio to illegal value shouldn't matter when never queued */ -+ idle->prio = PRIO_LIMIT; -+ idle->deadline = rq_clock(rq) + task_deadline_diff(idle); -+ update_task_priodl(idle); -+ -+ kasan_unpoison_task_stack(idle); -+ -+#ifdef CONFIG_SMP -+ /* -+ * It's possible that init_idle() gets called multiple times on a task, -+ * in that case do_set_cpus_allowed() will not do the right thing. -+ * -+ * And since this is boot we can forgo the serialisation. -+ */ -+ set_cpus_allowed_common(idle, cpumask_of(cpu)); -+#endif -+ -+ /* Silence PROVE_RCU */ -+ rcu_read_lock(); -+ __set_task_cpu(idle, cpu); -+ rcu_read_unlock(); -+ -+ rq->idle = idle; -+ rcu_assign_pointer(rq->curr, idle); -+ idle->on_cpu = 1; -+ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&idle->pi_lock, flags); -+ -+ /* Set the preempt count _outside_ the spinlocks! */ -+ init_idle_preempt_count(idle, cpu); -+ -+ ftrace_graph_init_idle_task(idle, cpu); -+ vtime_init_idle(idle, cpu); -+#ifdef CONFIG_SMP -+ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); -+#endif -+} -+ -+void resched_cpu(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ if (cpu_online(cpu) || cpu == smp_processor_id()) -+ resched_curr(cpu_rq(cpu)); -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+} -+ -+static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) -+{ -+ struct wake_q_node *node = &task->wake_q; -+ -+ /* -+ * Atomically grab the task, if ->wake_q is !nil already it means -+ * its already queued (either by us or someone else) and will get the -+ * wakeup due to that. 
-+ * -+ * In order to ensure that a pending wakeup will observe our pending -+ * state, even in the failed case, an explicit smp_mb() must be used. -+ */ -+ smp_mb__before_atomic(); -+ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) -+ return false; -+ -+ /* -+ * The head is context local, there can be no concurrency. -+ */ -+ *head->lastp = node; -+ head->lastp = &node->next; -+ return true; -+} -+ -+/** -+ * wake_q_add() - queue a wakeup for 'later' waking. -+ * @head: the wake_q_head to add @task to -+ * @task: the task to queue for 'later' wakeup -+ * -+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the -+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come -+ * instantly. -+ * -+ * This function must be used as-if it were wake_up_process(); IOW the task -+ * must be ready to be woken at this location. -+ */ -+void wake_q_add(struct wake_q_head *head, struct task_struct *task) -+{ -+ if (__wake_q_add(head, task)) -+ get_task_struct(task); -+} -+ -+/** -+ * wake_q_add_safe() - safely queue a wakeup for 'later' waking. -+ * @head: the wake_q_head to add @task to -+ * @task: the task to queue for 'later' wakeup -+ * -+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the -+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come -+ * instantly. -+ * -+ * This function must be used as-if it were wake_up_process(); IOW the task -+ * must be ready to be woken at this location. -+ * -+ * This function is essentially a task-safe equivalent to wake_q_add(). Callers -+ * that already hold reference to @task can call the 'safe' version and trust -+ * wake_q to do the right thing depending whether or not the @task is already -+ * queued for wakeup. 
-+ */ -+void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) -+{ -+ if (!__wake_q_add(head, task)) -+ put_task_struct(task); -+} -+ -+void wake_up_q(struct wake_q_head *head) -+{ -+ struct wake_q_node *node = head->first; -+ -+ while (node != WAKE_Q_TAIL) { -+ struct task_struct *task; -+ -+ task = container_of(node, struct task_struct, wake_q); -+ BUG_ON(!task); -+ /* task can safely be re-inserted now: */ -+ node = node->next; -+ task->wake_q.next = NULL; -+ -+ /* -+ * wake_up_process() executes a full barrier, which pairs with -+ * the queueing in wake_q_add() so as not to miss wakeups. -+ */ -+ wake_up_process(task); -+ put_task_struct(task); -+ } -+} -+ -+#ifdef CONFIG_SMP -+ -+int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, -+ const struct cpumask __maybe_unused *trial) -+{ -+ return 1; -+} -+ -+int task_can_attach(struct task_struct *p, -+ const struct cpumask *cs_cpus_allowed) -+{ -+ int ret = 0; -+ -+ /* -+ * Kthreads which disallow setaffinity shouldn't be moved -+ * to a new cpuset; we don't want to change their CPU -+ * affinity and isolating such threads by their set of -+ * allowed nodes is unnecessary. Thus, cpusets are not -+ * applicable for such threads. This prevents checking for -+ * success of set_cpus_allowed_ptr() on all attached tasks -+ * before cpus_mask may be changed. -+ */ -+ if (p->flags & PF_NO_SETAFFINITY) -+ ret = -EINVAL; -+ -+ return ret; -+} -+ -+static bool sched_smp_initialized __read_mostly; -+ -+#ifdef CONFIG_NO_HZ_COMMON -+void nohz_balance_enter_idle(int cpu) -+{ -+} -+ -+void select_nohz_load_balancer(int stop_tick) -+{ -+} -+ -+void set_cpu_sd_state_idle(void) {} -+ -+/* -+ * In the semi idle case, use the nearest busy CPU for migrating timers -+ * from an idle CPU. This is good for power-savings. 
-+ * -+ * We don't do similar optimization for completely idle system, as -+ * selecting an idle CPU will add more delays to the timers than intended -+ * (as that CPU's timer base may not be uptodate wrt jiffies etc). -+ */ -+int get_nohz_timer_target(void) -+{ -+ int i, cpu = smp_processor_id(); -+ struct cpumask *mask; -+ -+ if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER)) -+ return cpu; -+ -+ for (mask = &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[0]); -+ mask < per_cpu(sched_cpu_affinity_chk_end_masks, cpu); mask++) -+ for_each_cpu(i, mask) -+ if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) -+ return i; -+ -+ if (!housekeeping_cpu(cpu, HK_FLAG_TIMER)) -+ cpu = housekeeping_any_cpu(HK_FLAG_TIMER); -+ -+ return cpu; -+} -+ -+/* -+ * When add_timer_on() enqueues a timer into the timer wheel of an -+ * idle CPU then this timer might expire before the next timer event -+ * which is scheduled to wake up that CPU. In case of a completely -+ * idle system the next event might even be infinite time into the -+ * future. wake_up_idle_cpu() ensures that the CPU is woken up and -+ * leaves the inner idle loop so the newly added timer is taken into -+ * account when the CPU goes back to idle and evaluates the timer -+ * wheel for the next timer event. -+ */ -+void wake_up_idle_cpu(int cpu) -+{ -+ if (cpu == smp_processor_id()) -+ return; -+ -+ set_tsk_need_resched(cpu_rq(cpu)->idle); -+ smp_send_reschedule(cpu); -+} -+ -+void wake_up_nohz_cpu(int cpu) -+{ -+ wake_up_idle_cpu(cpu); -+} -+#endif /* CONFIG_NO_HZ_COMMON */ -+ -+#ifdef CONFIG_HOTPLUG_CPU -+/* -+ * Ensures that the idle task is using init_mm right before its CPU goes -+ * offline. 
-+ */ -+void idle_task_exit(void) -+{ -+ struct mm_struct *mm = current->active_mm; -+ -+ BUG_ON(cpu_online(smp_processor_id())); -+ -+ if (mm != &init_mm) { -+ switch_mm(mm, &init_mm, current); -+ current->active_mm = &init_mm; -+ finish_arch_post_lock_switch(); -+ } -+ mmdrop(mm); -+} -+ -+/* -+ * Migrate all tasks from the rq, sleeping tasks will be migrated by -+ * try_to_wake_up()->select_task_rq(). -+ * -+ * Called with rq->lock held even though we'er in stop_machine() and -+ * there's no concurrency possible, we hold the required locks anyway -+ * because of lock validation efforts. -+ */ -+static void migrate_tasks(struct rq *dead_rq) -+{ -+ struct rq *rq = dead_rq; -+ struct task_struct *p, *stop = rq->stop; -+ struct skiplist_node *node; -+ int count = 0; -+ -+ /* -+ * Fudge the rq selection such that the below task selection loop -+ * doesn't get stuck on the currently eligible stop task. -+ * -+ * We're currently inside stop_machine() and the rq is either stuck -+ * in the stop_machine_cpu_stop() loop, or we're executing this code, -+ * either way we should never end up calling schedule() until we're -+ * done here. -+ */ -+ rq->stop = NULL; -+ -+ node = &rq->sl_header; -+ while ((node = node->next[0]) != &rq->sl_header) { -+ int dest_cpu; -+ -+ p = skiplist_entry(node, struct task_struct, sl_node); -+ -+ /* skip the running task */ -+ if (task_running(p)) -+ continue; -+ -+ /* -+ * Rules for changing task_struct::cpus_mask are holding -+ * both pi_lock and rq->lock, such that holding either -+ * stabilizes the mask. -+ * -+ * Drop rq->lock is not quite as disastrous as it usually is -+ * because !cpu_active at this point, which means load-balance -+ * will not interfere. Also, stop-machine. 
-+ */ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_lock(&p->pi_lock); -+ raw_spin_lock(&rq->lock); -+ -+ /* -+ * Since we're inside stop-machine, _nothing_ should have -+ * changed the task, WARN if weird stuff happened, because in -+ * that case the above rq->lock drop is a fail too. -+ */ -+ if (WARN_ON(task_rq(p) != rq || !task_on_rq_queued(p))) { -+ raw_spin_unlock(&p->pi_lock); -+ continue; -+ } -+ -+ count++; -+ /* Find suitable destination for @next, with force if needed. */ -+ dest_cpu = select_fallback_rq(dead_rq->cpu, p); -+ -+ rq = __migrate_task(rq, p, dest_cpu); -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock(&p->pi_lock); -+ -+ rq = dead_rq; -+ raw_spin_lock(&rq->lock); -+ /* Check queued task all over from the header again */ -+ node = &rq->sl_header; -+ } -+ -+ rq->stop = stop; -+} -+ -+static void set_rq_offline(struct rq *rq) -+{ -+ if (rq->online) -+ rq->online = false; -+} -+#endif /* CONFIG_HOTPLUG_CPU */ -+ -+static void set_rq_online(struct rq *rq) -+{ -+ if (!rq->online) -+ rq->online = true; -+} -+ -+#ifdef CONFIG_SCHED_DEBUG -+ -+static __read_mostly int sched_debug_enabled; -+ -+static int __init sched_debug_setup(char *str) -+{ -+ sched_debug_enabled = 1; -+ -+ return 0; -+} -+early_param("sched_debug", sched_debug_setup); -+ -+static inline bool sched_debug(void) -+{ -+ return sched_debug_enabled; -+} -+#else /* !CONFIG_SCHED_DEBUG */ -+static inline bool sched_debug(void) -+{ -+ return false; -+} -+#endif /* CONFIG_SCHED_DEBUG */ -+ -+#ifdef CONFIG_SMP -+void scheduler_ipi(void) -+{ -+ /* -+ * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting -+ * TIF_NEED_RESCHED remotely (for the first time) will also send -+ * this IPI. 
-+ */ -+ preempt_fold_need_resched(); -+ -+ if (!idle_cpu(smp_processor_id()) || need_resched()) -+ return; -+ -+ irq_enter(); -+ irq_exit(); -+} -+ -+void wake_up_if_idle(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ rcu_read_lock(); -+ -+ if (!is_idle_task(rcu_dereference(rq->curr))) -+ goto out; -+ -+ if (set_nr_if_polling(rq->idle)) { -+ trace_sched_wake_idle_without_ipi(cpu); -+ } else { -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ if (is_idle_task(rq->curr)) -+ smp_send_reschedule(cpu); -+ /* Else CPU is not idle, do nothing here */ -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ } -+ -+out: -+ rcu_read_unlock(); -+} -+ -+bool cpus_share_cache(int this_cpu, int that_cpu) -+{ -+ return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); -+} -+#endif /* CONFIG_SMP */ -+ -+/* -+ * Topology list, bottom-up. -+ */ -+static struct sched_domain_topology_level default_topology[] = { -+#ifdef CONFIG_SCHED_SMT -+ { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, -+#endif -+#ifdef CONFIG_SCHED_MC -+ { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, -+#endif -+ { cpu_cpu_mask, SD_INIT_NAME(DIE) }, -+ { NULL, }, -+}; -+ -+static struct sched_domain_topology_level *sched_domain_topology = -+ default_topology; -+ -+#define for_each_sd_topology(tl) \ -+ for (tl = sched_domain_topology; tl->mask; tl++) -+ -+void set_sched_topology(struct sched_domain_topology_level *tl) -+{ -+ if (WARN_ON_ONCE(sched_smp_initialized)) -+ return; -+ -+ sched_domain_topology = tl; -+} -+ -+/* -+ * Initializers for schedule domains -+ * Non-inlined to reduce accumulated stack pressure in build_sched_domains() -+ */ -+ -+int sched_domain_level_max; -+ -+/* -+ * Partition sched domains as specified by the 'ndoms_new' -+ * cpumasks in the array doms_new[] of cpumasks. This compares -+ * doms_new[] to the current sched domain partitioning, doms_cur[]. -+ * It destroys each deleted domain and builds each new domain. 
-+ * -+ * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. -+ * The masks don't intersect (don't overlap.) We should setup one -+ * sched domain for each mask. CPUs not in any of the cpumasks will -+ * not be load balanced. If the same cpumask appears both in the -+ * current 'doms_cur' domains and in the new 'doms_new', we can leave -+ * it as it is. -+ * -+ * The passed in 'doms_new' should be allocated using -+ * alloc_sched_domains. This routine takes ownership of it and will -+ * free_sched_domains it when done with it. If the caller failed the -+ * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, -+ * and partition_sched_domains() will fallback to the single partition -+ * 'fallback_doms', it also forces the domains to be rebuilt. -+ * -+ * If doms_new == NULL it will be replaced with cpu_online_mask. -+ * ndoms_new == 0 is a special case for destroying existing domains, -+ * and it will not create the default domain. -+ * -+ * Call with hotplug lock held -+ */ -+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], -+ struct sched_domain_attr *dattr_new) -+{ -+ /** -+ * PDS doesn't depend on sched domains, but just keep this api -+ */ -+} -+ -+/* -+ * used to mark begin/end of suspend/resume: -+ */ -+static int num_cpus_frozen; -+ -+#ifdef CONFIG_NUMA -+int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; -+ -+/* -+ * sched_numa_find_closest() - given the NUMA topology, find the cpu -+ * closest to @cpu from @cpumask. -+ * cpumask: cpumask to find a cpu from -+ * cpu: cpu to be close to -+ * -+ * returns: cpu, or nr_cpu_ids when nothing found. -+ */ -+int sched_numa_find_closest(const struct cpumask *cpus, int cpu) -+{ -+ return best_mask_cpu(cpu, cpus); -+} -+#endif /* CONFIG_NUMA */ -+ -+/* -+ * Update cpusets according to cpu_active mask. If cpusets are -+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper -+ * around partition_sched_domains(). 
-+ * -+ * If we come here as part of a suspend/resume, don't touch cpusets because we -+ * want to restore it back to its original state upon resume anyway. -+ */ -+static void cpuset_cpu_active(void) -+{ -+ if (cpuhp_tasks_frozen) { -+ /* -+ * num_cpus_frozen tracks how many CPUs are involved in suspend -+ * resume sequence. As long as this is not the last online -+ * operation in the resume sequence, just build a single sched -+ * domain, ignoring cpusets. -+ */ -+ partition_sched_domains(1, NULL, NULL); -+ if (--num_cpus_frozen) -+ return; -+ /* -+ * This is the last CPU online operation. So fall through and -+ * restore the original sched domains by considering the -+ * cpuset configurations. -+ */ -+ cpuset_force_rebuild(); -+ } -+ -+ cpuset_update_active_cpus(); -+} -+ -+static int cpuset_cpu_inactive(unsigned int cpu) -+{ -+ if (!cpuhp_tasks_frozen) { -+ cpuset_update_active_cpus(); -+ } else { -+ num_cpus_frozen++; -+ partition_sched_domains(1, NULL, NULL); -+ } -+ return 0; -+} -+ -+int sched_cpu_activate(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+#ifdef CONFIG_SCHED_SMT -+ /* -+ * When going up, increment the number of cores with SMT present. -+ */ -+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) -+ static_branch_inc_cpuslocked(&sched_smt_present); -+#endif -+ set_cpu_active(cpu, true); -+ -+ if (sched_smp_initialized) -+ cpuset_cpu_active(); -+ -+ /* -+ * Put the rq online, if not already. This happens: -+ * -+ * 1) In the early boot process, because we build the real domains -+ * after all cpus have been brought up. -+ * -+ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the -+ * domains. 
-+ */ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ set_rq_online(rq); -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+ return 0; -+} -+ -+int sched_cpu_deactivate(unsigned int cpu) -+{ -+ int ret; -+ -+ set_cpu_active(cpu, false); -+ /* -+ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU -+ * users of this state to go away such that all new such users will -+ * observe it. -+ * -+ * Do sync before park smpboot threads to take care the rcu boost case. -+ */ -+ synchronize_rcu(); -+ -+#ifdef CONFIG_SCHED_SMT -+ /* -+ * When going down, decrement the number of cores with SMT present. -+ */ -+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) -+ static_branch_dec_cpuslocked(&sched_smt_present); -+#endif -+ -+ if (!sched_smp_initialized) -+ return 0; -+ -+ ret = cpuset_cpu_inactive(cpu); -+ if (ret) { -+ set_cpu_active(cpu, true); -+ return ret; -+ } -+ return 0; -+} -+ -+static void sched_rq_cpu_starting(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ rq->calc_load_update = calc_load_update; -+} -+ -+int sched_cpu_starting(unsigned int cpu) -+{ -+ sched_rq_cpu_starting(cpu); -+ sched_tick_start(cpu); -+ return 0; -+} -+ -+#ifdef CONFIG_HOTPLUG_CPU -+int sched_cpu_dying(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ sched_tick_stop(cpu); -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ set_rq_offline(rq); -+ migrate_tasks(rq); -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+ hrtick_clear(rq); -+ return 0; -+} -+#endif -+ -+#ifdef CONFIG_SMP -+static void sched_init_topology_cpumask_early(void) -+{ -+ int cpu, level; -+ cpumask_t *tmp; -+ -+ for_each_possible_cpu(cpu) { -+ for (level = 0; level < NR_CPU_AFFINITY_CHK_LEVEL; level++) { -+ tmp = &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[level]); -+ cpumask_copy(tmp, cpu_possible_mask); -+ cpumask_clear_cpu(cpu, tmp); -+ } -+ per_cpu(sched_cpu_llc_start_mask, cpu) = -+ &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[0]); -+ 
per_cpu(sched_cpu_affinity_chk_end_masks, cpu) = -+ &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[1]); -+ } -+} -+ -+static void sched_init_topology_cpumask(void) -+{ -+ int cpu; -+ cpumask_t *chk; -+ -+ for_each_online_cpu(cpu) { -+ chk = &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[0]); -+ -+#ifdef CONFIG_SCHED_SMT -+ cpumask_setall(chk); -+ cpumask_clear_cpu(cpu, chk); -+ if (cpumask_and(chk, chk, topology_sibling_cpumask(cpu))) { -+ per_cpu(sched_sibling_cpu, cpu) = cpumask_first(chk); -+ printk(KERN_INFO "pds: cpu #%d affinity check mask - smt 0x%08lx", -+ cpu, (chk++)->bits[0]); -+ } -+#endif -+#ifdef CONFIG_SCHED_MC -+ cpumask_setall(chk); -+ cpumask_clear_cpu(cpu, chk); -+ if (cpumask_and(chk, chk, cpu_coregroup_mask(cpu))) { -+ per_cpu(sched_cpu_llc_start_mask, cpu) = chk; -+ printk(KERN_INFO "pds: cpu #%d affinity check mask - coregroup 0x%08lx", -+ cpu, (chk++)->bits[0]); -+ } -+ cpumask_complement(chk, cpu_coregroup_mask(cpu)); -+ -+ /** -+ * Set up sd_llc_id per CPU -+ */ -+ per_cpu(sd_llc_id, cpu) = -+ cpumask_first(cpu_coregroup_mask(cpu)); -+#else -+ per_cpu(sd_llc_id, cpu) = -+ cpumask_first(topology_core_cpumask(cpu)); -+ -+ per_cpu(sched_cpu_llc_start_mask, cpu) = chk; -+ -+ cpumask_setall(chk); -+ cpumask_clear_cpu(cpu, chk); -+#endif /* NOT CONFIG_SCHED_MC */ -+ if (cpumask_and(chk, chk, topology_core_cpumask(cpu))) -+ printk(KERN_INFO "pds: cpu #%d affinity check mask - core 0x%08lx", -+ cpu, (chk++)->bits[0]); -+ cpumask_complement(chk, topology_core_cpumask(cpu)); -+ -+ if (cpumask_and(chk, chk, cpu_online_mask)) -+ printk(KERN_INFO "pds: cpu #%d affinity check mask - others 0x%08lx", -+ cpu, (chk++)->bits[0]); -+ -+ per_cpu(sched_cpu_affinity_chk_end_masks, cpu) = chk; -+ } -+} -+#endif -+ -+void __init sched_init_smp(void) -+{ -+ /* Move init over to a non-isolated CPU */ -+ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) -+ BUG(); -+ -+ cpumask_copy(&sched_rq_queued_masks[SCHED_RQ_EMPTY], cpu_online_mask); -+ 
-+ sched_init_topology_cpumask(); -+ -+ sched_smp_initialized = true; -+} -+#else -+void __init sched_init_smp(void) -+{ -+} -+#endif /* CONFIG_SMP */ -+ -+int in_sched_functions(unsigned long addr) -+{ -+ return in_lock_functions(addr) || -+ (addr >= (unsigned long)__sched_text_start -+ && addr < (unsigned long)__sched_text_end); -+} -+ -+#ifdef CONFIG_CGROUP_SCHED -+/* task group related information */ -+struct task_group { -+ struct cgroup_subsys_state css; -+ -+ struct rcu_head rcu; -+ struct list_head list; -+ -+ struct task_group *parent; -+ struct list_head siblings; -+ struct list_head children; -+}; -+ -+/* -+ * Default task group. -+ * Every task in system belongs to this group at bootup. -+ */ -+struct task_group root_task_group; -+LIST_HEAD(task_groups); -+ -+/* Cacheline aligned slab cache for task_group */ -+static struct kmem_cache *task_group_cache __read_mostly; -+#endif /* CONFIG_CGROUP_SCHED */ -+ -+void __init sched_init(void) -+{ -+ int i; -+ struct rq *rq; -+ -+ print_scheduler_version(); -+ -+ wait_bit_init(); -+ -+#ifdef CONFIG_SMP -+ for (i = 0; i < NR_SCHED_RQ_QUEUED_LEVEL; i++) -+ cpumask_clear(&sched_rq_queued_masks[i]); -+ cpumask_setall(&sched_rq_queued_masks[SCHED_RQ_EMPTY]); -+ set_bit(SCHED_RQ_EMPTY, sched_rq_queued_masks_bitmap); -+ -+ cpumask_setall(&sched_rq_pending_masks[SCHED_RQ_EMPTY]); -+ set_bit(SCHED_RQ_EMPTY, sched_rq_pending_masks_bitmap); -+#else -+ uprq = &per_cpu(runqueues, 0); -+#endif -+ -+#ifdef CONFIG_CGROUP_SCHED -+ task_group_cache = KMEM_CACHE(task_group, 0); -+ -+ list_add(&root_task_group.list, &task_groups); -+ INIT_LIST_HEAD(&root_task_group.children); -+ INIT_LIST_HEAD(&root_task_group.siblings); -+#endif /* CONFIG_CGROUP_SCHED */ -+ for_each_possible_cpu(i) { -+ rq = cpu_rq(i); -+ FULL_INIT_SKIPLIST_NODE(&rq->sl_header); -+ raw_spin_lock_init(&rq->lock); -+ rq->dither = 0; -+ rq->nr_running = rq->nr_uninterruptible = 0; -+ rq->calc_load_active = 0; -+ rq->calc_load_update = jiffies + LOAD_FREQ; -+#ifdef 
CONFIG_SMP -+ rq->online = false; -+ rq->cpu = i; -+ -+ rq->queued_level = SCHED_RQ_EMPTY; -+ rq->pending_level = SCHED_RQ_EMPTY; -+#ifdef CONFIG_SCHED_SMT -+ per_cpu(sched_sibling_cpu, i) = i; -+ rq->active_balance = 0; -+#endif -+#endif -+ rq->nr_switches = 0; -+ atomic_set(&rq->nr_iowait, 0); -+ hrtick_rq_init(rq); -+ } -+#ifdef CONFIG_SMP -+ /* Set rq->online for cpu 0 */ -+ cpu_rq(0)->online = true; -+#endif -+ -+ /* -+ * The boot idle thread does lazy MMU switching as well: -+ */ -+ mmgrab(&init_mm); -+ enter_lazy_tlb(&init_mm, current); -+ -+ /* -+ * Make us the idle thread. Technically, schedule() should not be -+ * called from this thread, however somewhere below it might be, -+ * but because we are the idle thread, we just pick up running again -+ * when this runqueue becomes "idle". -+ */ -+ init_idle(current, smp_processor_id()); -+ -+ calc_load_update = jiffies + LOAD_FREQ; -+ -+#ifdef CONFIG_SMP -+ idle_thread_set_boot_cpu(); -+ -+ sched_init_topology_cpumask_early(); -+#endif /* SMP */ -+ -+ init_schedstats(); -+ -+ psi_init(); -+} -+ -+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP -+static inline int preempt_count_equals(int preempt_offset) -+{ -+ int nested = preempt_count() + rcu_preempt_depth(); -+ -+ return (nested == preempt_offset); -+} -+ -+void __might_sleep(const char *file, int line, int preempt_offset) -+{ -+ /* -+ * Blocking primitives will set (and therefore destroy) current->state, -+ * since we will exit with TASK_RUNNING make sure we enter with it, -+ * otherwise we will destroy state. 
-+ */ -+ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, -+ "do not call blocking ops when !TASK_RUNNING; " -+ "state=%lx set at [<%p>] %pS\n", -+ current->state, -+ (void *)current->task_state_change, -+ (void *)current->task_state_change); -+ -+ ___might_sleep(file, line, preempt_offset); -+} -+EXPORT_SYMBOL(__might_sleep); -+ -+void ___might_sleep(const char *file, int line, int preempt_offset) -+{ -+ /* Ratelimiting timestamp: */ -+ static unsigned long prev_jiffy; -+ -+ unsigned long preempt_disable_ip; -+ -+ /* WARN_ON_ONCE() by default, no rate limit required: */ -+ rcu_sleep_check(); -+ -+ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && -+ !is_idle_task(current) && !current->non_block_count) || -+ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || -+ oops_in_progress) -+ return; -+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) -+ return; -+ prev_jiffy = jiffies; -+ -+ /* Save this before calling printk(), since that will clobber it: */ -+ preempt_disable_ip = get_preempt_disable_ip(current); -+ -+ printk(KERN_ERR -+ "BUG: sleeping function called from invalid context at %s:%d\n", -+ file, line); -+ printk(KERN_ERR -+ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", -+ in_atomic(), irqs_disabled(), current->non_block_count, -+ current->pid, current->comm); -+ -+ if (task_stack_end_corrupted(current)) -+ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); -+ -+ debug_show_held_locks(current); -+ if (irqs_disabled()) -+ print_irqtrace_events(current); -+#ifdef CONFIG_DEBUG_PREEMPT -+ if (!preempt_count_equals(preempt_offset)) { -+ pr_err("Preemption disabled at:"); -+ print_ip_sym(preempt_disable_ip); -+ pr_cont("\n"); -+ } -+#endif -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+EXPORT_SYMBOL(___might_sleep); -+ -+void __cant_sleep(const char *file, int line, int preempt_offset) -+{ -+ static unsigned long prev_jiffy; -+ -+ if 
(irqs_disabled()) -+ return; -+ -+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) -+ return; -+ -+ if (preempt_count() > preempt_offset) -+ return; -+ -+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) -+ return; -+ prev_jiffy = jiffies; -+ -+ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); -+ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", -+ in_atomic(), irqs_disabled(), -+ current->pid, current->comm); -+ -+ debug_show_held_locks(current); -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+EXPORT_SYMBOL_GPL(__cant_sleep); -+#endif -+ -+#ifdef CONFIG_MAGIC_SYSRQ -+void normalize_rt_tasks(void) -+{ -+ struct task_struct *g, *p; -+ struct sched_attr attr = { -+ .sched_policy = SCHED_NORMAL, -+ }; -+ -+ read_lock(&tasklist_lock); -+ for_each_process_thread(g, p) { -+ /* -+ * Only normalize user tasks: -+ */ -+ if (p->flags & PF_KTHREAD) -+ continue; -+ -+ if (!rt_task(p)) { -+ /* -+ * Renice negative nice level userspace -+ * tasks back to 0: -+ */ -+ if (task_nice(p) < 0) -+ set_user_nice(p, 0); -+ continue; -+ } -+ -+ __sched_setscheduler(p, &attr, false, false); -+ } -+ read_unlock(&tasklist_lock); -+} -+#endif /* CONFIG_MAGIC_SYSRQ */ -+ -+#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) -+/* -+ * These functions are only useful for the IA64 MCA handling, or kdb. -+ * -+ * They can only be called when the whole system has been -+ * stopped - every CPU needs to be quiescent, and no scheduling -+ * activity can take place. Using them for anything else would -+ * be a serious bug, and as a result, they aren't even visible -+ * under any other configuration. -+ */ -+ -+/** -+ * curr_task - return the current task for a given CPU. -+ * @cpu: the processor in question. -+ * -+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! -+ * -+ * Return: The current task for @cpu. 
-+ */ -+struct task_struct *curr_task(int cpu) -+{ -+ return cpu_curr(cpu); -+} -+ -+#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ -+ -+#ifdef CONFIG_IA64 -+/** -+ * ia64_set_curr_task - set the current task for a given CPU. -+ * @cpu: the processor in question. -+ * @p: the task pointer to set. -+ * -+ * Description: This function must only be used when non-maskable interrupts -+ * are serviced on a separate stack. It allows the architecture to switch the -+ * notion of the current task on a CPU in a non-blocking manner. This function -+ * must be called with all CPU's synchronised, and interrupts disabled, the -+ * and caller must save the original value of the current task (see -+ * curr_task() above) and restore that value before reenabling interrupts and -+ * re-starting the system. -+ * -+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! -+ */ -+void ia64_set_curr_task(int cpu, struct task_struct *p) -+{ -+ cpu_curr(cpu) = p; -+} -+ -+#endif -+ -+#ifdef CONFIG_SCHED_DEBUG -+void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, -+ struct seq_file *m) -+{} -+ -+void proc_sched_set_task(struct task_struct *p) -+{} -+#endif -+ -+#ifdef CONFIG_CGROUP_SCHED -+static void sched_free_group(struct task_group *tg) -+{ -+ kmem_cache_free(task_group_cache, tg); -+} -+ -+/* allocate runqueue etc for a new task group */ -+struct task_group *sched_create_group(struct task_group *parent) -+{ -+ struct task_group *tg; -+ -+ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); -+ if (!tg) -+ return ERR_PTR(-ENOMEM); -+ -+ return tg; -+} -+ -+void sched_online_group(struct task_group *tg, struct task_group *parent) -+{ -+} -+ -+/* rcu callback to free various structures associated with a task group */ -+static void sched_free_group_rcu(struct rcu_head *rhp) -+{ -+ /* Now it should be safe to free those cfs_rqs */ -+ sched_free_group(container_of(rhp, struct task_group, rcu)); -+} -+ -+void sched_destroy_group(struct task_group 
*tg) -+{ -+ /* Wait for possible concurrent references to cfs_rqs complete */ -+ call_rcu(&tg->rcu, sched_free_group_rcu); -+} -+ -+void sched_offline_group(struct task_group *tg) -+{ -+} -+ -+static inline struct task_group *css_tg(struct cgroup_subsys_state *css) -+{ -+ return css ? container_of(css, struct task_group, css) : NULL; -+} -+ -+static struct cgroup_subsys_state * -+cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) -+{ -+ struct task_group *parent = css_tg(parent_css); -+ struct task_group *tg; -+ -+ if (!parent) { -+ /* This is early initialization for the top cgroup */ -+ return &root_task_group.css; -+ } -+ -+ tg = sched_create_group(parent); -+ if (IS_ERR(tg)) -+ return ERR_PTR(-ENOMEM); -+ return &tg->css; -+} -+ -+/* Expose task group only after completing cgroup initialization */ -+static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ struct task_group *parent = css_tg(css->parent); -+ -+ if (parent) -+ sched_online_group(tg, parent); -+ return 0; -+} -+ -+static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ -+ sched_offline_group(tg); -+} -+ -+static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ -+ /* -+ * Relies on the RCU grace period between css_released() and this. 
-+ */ -+ sched_free_group(tg); -+} -+ -+static void cpu_cgroup_fork(struct task_struct *task) -+{ -+} -+ -+static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) -+{ -+ return 0; -+} -+ -+static void cpu_cgroup_attach(struct cgroup_taskset *tset) -+{ -+} -+ -+static struct cftype cpu_legacy_files[] = { -+ { } /* Terminate */ -+}; -+ -+static struct cftype cpu_files[] = { -+ { } /* terminate */ -+}; -+ -+static int cpu_extra_stat_show(struct seq_file *sf, -+ struct cgroup_subsys_state *css) -+{ -+ return 0; -+} -+ -+struct cgroup_subsys cpu_cgrp_subsys = { -+ .css_alloc = cpu_cgroup_css_alloc, -+ .css_online = cpu_cgroup_css_online, -+ .css_released = cpu_cgroup_css_released, -+ .css_free = cpu_cgroup_css_free, -+ .css_extra_stat_show = cpu_extra_stat_show, -+ .fork = cpu_cgroup_fork, -+ .can_attach = cpu_cgroup_can_attach, -+ .attach = cpu_cgroup_attach, -+ .legacy_cftypes = cpu_files, -+ .legacy_cftypes = cpu_legacy_files, -+ .dfl_cftypes = cpu_files, -+ .early_init = true, -+ .threaded = true, -+}; -+#endif /* CONFIG_CGROUP_SCHED */ -+ -+#undef CREATE_TRACE_POINTS -diff --git a/kernel/sched/pds_sched.h b/kernel/sched/pds_sched.h -new file mode 100644 -index 000000000000..b3926a8425b2 ---- /dev/null -+++ b/kernel/sched/pds_sched.h -@@ -0,0 +1,474 @@ -+#ifndef PDS_SCHED_H -+#define PDS_SCHED_H -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#ifdef CONFIG_PARAVIRT -+# include -+#endif -+ -+#include "cpupri.h" -+ -+/* task_struct::on_rq states: */ -+#define TASK_ON_RQ_QUEUED 1 -+#define TASK_ON_RQ_MIGRATING 2 -+ -+static inline int task_on_rq_queued(struct task_struct *p) -+{ -+ return p->on_rq == TASK_ON_RQ_QUEUED; -+} 
-+ -+static inline int task_on_rq_migrating(struct task_struct *p) -+{ -+ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; -+} -+ -+/* -+ * This is the main, per-CPU runqueue data structure. -+ * This data should only be modified by the local cpu. -+ */ -+struct rq { -+ /* runqueue lock: */ -+ raw_spinlock_t lock; -+ -+ struct task_struct *curr, *idle, *stop; -+ struct mm_struct *prev_mm; -+ -+ struct skiplist_node sl_header; -+ -+ /* switch count */ -+ u64 nr_switches; -+ -+ atomic_t nr_iowait; -+ -+#ifdef CONFIG_MEMBARRIER -+ int membarrier_state; -+#endif -+ -+#ifdef CONFIG_SMP -+ int cpu; /* cpu of this runqueue */ -+ bool online; -+ -+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ -+ struct sched_avg avg_irq; -+#endif -+ -+ unsigned long queued_level; -+ unsigned long pending_level; -+ -+#ifdef CONFIG_SCHED_SMT -+ int active_balance; -+ struct cpu_stop_work active_balance_work; -+#endif -+#endif /* CONFIG_SMP */ -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+ u64 prev_irq_time; -+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ -+#ifdef CONFIG_PARAVIRT -+ u64 prev_steal_time; -+#endif /* CONFIG_PARAVIRT */ -+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING -+ u64 prev_steal_time_rq; -+#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ -+ -+ /* calc_load related fields */ -+ unsigned long calc_load_update; -+ long calc_load_active; -+ -+ u64 clock, last_tick; -+ u64 clock_task; -+ int dither; -+ -+ unsigned long nr_running; -+ unsigned long nr_uninterruptible; -+ -+#ifdef CONFIG_SCHED_HRTICK -+#ifdef CONFIG_SMP -+ int hrtick_csd_pending; -+ call_single_data_t hrtick_csd; -+#endif -+ struct hrtimer hrtick_timer; -+#endif -+ -+#ifdef CONFIG_SCHEDSTATS -+ -+ /* latency stats */ -+ struct sched_info rq_sched_info; -+ unsigned long long rq_cpu_time; -+ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? 
*/ -+ -+ /* sys_sched_yield() stats */ -+ unsigned int yld_count; -+ -+ /* schedule() stats */ -+ unsigned int sched_switch; -+ unsigned int sched_count; -+ unsigned int sched_goidle; -+ -+ /* try_to_wake_up() stats */ -+ unsigned int ttwu_count; -+ unsigned int ttwu_local; -+#endif /* CONFIG_SCHEDSTATS */ -+#ifdef CONFIG_CPU_IDLE -+ /* Must be inspected within a rcu lock section */ -+ struct cpuidle_state *idle_state; -+#endif -+}; -+ -+extern unsigned long calc_load_update; -+extern atomic_long_t calc_load_tasks; -+ -+extern void calc_global_load_tick(struct rq *this_rq); -+extern long calc_load_fold_active(struct rq *this_rq, long adjust); -+ -+#ifndef CONFIG_SMP -+extern struct rq *uprq; -+#define cpu_rq(cpu) (uprq) -+#define this_rq() (uprq) -+#define raw_rq() (uprq) -+#define task_rq(p) (uprq) -+#define cpu_curr(cpu) ((uprq)->curr) -+#else /* CONFIG_SMP */ -+DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -+#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) -+#define this_rq() this_cpu_ptr(&runqueues) -+#define raw_rq() raw_cpu_ptr(&runqueues) -+#define task_rq(p) cpu_rq(task_cpu(p)) -+#define cpu_curr(cpu) (cpu_rq(cpu)->curr) -+ -+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) -+void register_sched_domain_sysctl(void); -+void unregister_sched_domain_sysctl(void); -+#else -+static inline void register_sched_domain_sysctl(void) -+{ -+} -+static inline void unregister_sched_domain_sysctl(void) -+{ -+} -+#endif -+ -+#endif /* CONFIG_SMP */ -+ -+#ifndef arch_scale_freq_capacity -+static __always_inline -+unsigned long arch_scale_freq_capacity(int cpu) -+{ -+ return SCHED_CAPACITY_SCALE; -+} -+#endif -+ -+static inline u64 __rq_clock_broken(struct rq *rq) -+{ -+ return READ_ONCE(rq->clock); -+} -+ -+static inline u64 rq_clock(struct rq *rq) -+{ -+ /* -+ * Relax lockdep_assert_held() checking as in VRQ, call to -+ * sched_info_xxxx() may not held rq->lock -+ * lockdep_assert_held(&rq->lock); -+ */ -+ return rq->clock; -+} -+ -+static inline u64 
rq_clock_task(struct rq *rq) -+{ -+ /* -+ * Relax lockdep_assert_held() checking as in VRQ, call to -+ * sched_info_xxxx() may not held rq->lock -+ * lockdep_assert_held(&rq->lock); -+ */ -+ return rq->clock_task; -+} -+ -+/* -+ * {de,en}queue flags: -+ * -+ * DEQUEUE_SLEEP - task is no longer runnable -+ * ENQUEUE_WAKEUP - task just became runnable -+ * -+ */ -+ -+#define DEQUEUE_SLEEP 0x01 -+ -+#define ENQUEUE_WAKEUP 0x01 -+ -+ -+/* -+ * Below are scheduler API which using in other kernel code -+ * It use the dummy rq_flags -+ * ToDo : PDS need to support these APIs for compatibility with mainline -+ * scheduler code. -+ */ -+struct rq_flags { -+ unsigned long flags; -+}; -+ -+struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(rq->lock); -+ -+struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(p->pi_lock) -+ __acquires(rq->lock); -+ -+static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock(&rq->lock); -+} -+ -+static inline void -+task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) -+ __releases(rq->lock) -+ __releases(p->pi_lock) -+{ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); -+} -+ -+static inline void -+rq_unlock_irq(struct rq *rq, struct rq_flags *rf) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock_irq(&rq->lock); -+} -+ -+static inline struct rq * -+this_rq_lock_irq(struct rq_flags *rf) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ local_irq_disable(); -+ rq = this_rq(); -+ raw_spin_lock(&rq->lock); -+ -+ return rq; -+} -+ -+static inline bool task_running(struct task_struct *p) -+{ -+ return p->on_cpu; -+} -+ -+extern struct static_key_false sched_schedstats; -+ -+static inline void sched_ttwu_pending(void) { } -+ -+#ifdef CONFIG_CPU_IDLE -+static inline void idle_set_state(struct rq *rq, -+ struct cpuidle_state *idle_state) -+{ -+ rq->idle_state = idle_state; -+} 
-+ -+static inline struct cpuidle_state *idle_get_state(struct rq *rq) -+{ -+ WARN_ON(!rcu_read_lock_held()); -+ return rq->idle_state; -+} -+#else -+static inline void idle_set_state(struct rq *rq, -+ struct cpuidle_state *idle_state) -+{ -+} -+ -+static inline struct cpuidle_state *idle_get_state(struct rq *rq) -+{ -+ return NULL; -+} -+#endif -+ -+static inline int cpu_of(const struct rq *rq) -+{ -+#ifdef CONFIG_SMP -+ return rq->cpu; -+#else -+ return 0; -+#endif -+} -+ -+#include "stats.h" -+ -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+struct irqtime { -+ u64 total; -+ u64 tick_delta; -+ u64 irq_start_time; -+ struct u64_stats_sync sync; -+}; -+ -+DECLARE_PER_CPU(struct irqtime, cpu_irqtime); -+ -+/* -+ * Returns the irqtime minus the softirq time computed by ksoftirqd. -+ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime -+ * and never move forward. -+ */ -+static inline u64 irq_time_read(int cpu) -+{ -+ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); -+ unsigned int seq; -+ u64 total; -+ -+ do { -+ seq = __u64_stats_fetch_begin(&irqtime->sync); -+ total = irqtime->total; -+ } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); -+ -+ return total; -+} -+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ -+ -+#ifdef CONFIG_CPU_FREQ -+DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); -+ -+/** -+ * cpufreq_update_util - Take a note about CPU utilization changes. -+ * @rq: Runqueue to carry out the update for. -+ * @flags: Update reason flags. -+ * -+ * This function is called by the scheduler on the CPU whose utilization is -+ * being updated. -+ * -+ * It can only be called from RCU-sched read-side critical sections. -+ * -+ * The way cpufreq is currently arranged requires it to evaluate the CPU -+ * performance state (frequency/voltage) on a regular basis to prevent it from -+ * being stuck in a completely inadequate performance level for too long. 
-+ * That is not guaranteed to happen if the updates are only triggered from CFS -+ * and DL, though, because they may not be coming in if only RT tasks are -+ * active all the time (or there are RT tasks only). -+ * -+ * As a workaround for that issue, this function is called periodically by the -+ * RT sched class to trigger extra cpufreq updates to prevent it from stalling, -+ * but that really is a band-aid. Going forward it should be replaced with -+ * solutions targeted more specifically at RT tasks. -+ */ -+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) -+{ -+ struct update_util_data *data; -+ -+ data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); -+ if (data) -+ data->func(data, rq_clock(rq), flags); -+} -+ -+static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) -+{ -+ if (cpu_of(rq) == smp_processor_id()) -+ cpufreq_update_util(rq, flags); -+} -+#else -+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} -+static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {} -+#endif /* CONFIG_CPU_FREQ */ -+ -+#ifdef CONFIG_NO_HZ_FULL -+extern int __init sched_tick_offload_init(void); -+#else -+static inline int sched_tick_offload_init(void) { return 0; } -+#endif -+ -+#ifdef arch_scale_freq_capacity -+#ifndef arch_scale_freq_invariant -+#define arch_scale_freq_invariant() (true) -+#endif -+#else /* arch_scale_freq_capacity */ -+#define arch_scale_freq_invariant() (false) -+#endif -+ -+extern void schedule_idle(void); -+ -+/* -+ * !! For sched_setattr_nocheck() (kernel) only !! -+ * -+ * This is actually gross. :( -+ * -+ * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE -+ * tasks, but still be able to sleep. We need this on platforms that cannot -+ * atomically change clock frequency. Remove once fast switching will be -+ * available on such platforms. -+ * -+ * SUGOV stands for SchedUtil GOVernor. 
-+ */ -+#define SCHED_FLAG_SUGOV 0x10000000 -+ -+#ifdef CONFIG_MEMBARRIER -+/* -+ * The scheduler provides memory barriers required by membarrier between: -+ * - prior user-space memory accesses and store to rq->membarrier_state, -+ * - store to rq->membarrier_state and following user-space memory accesses. -+ * In the same way it provides those guarantees around store to rq->curr. -+ */ -+static inline void membarrier_switch_mm(struct rq *rq, -+ struct mm_struct *prev_mm, -+ struct mm_struct *next_mm) -+{ -+ int membarrier_state; -+ -+ if (prev_mm == next_mm) -+ return; -+ -+ membarrier_state = atomic_read(&next_mm->membarrier_state); -+ if (READ_ONCE(rq->membarrier_state) == membarrier_state) -+ return; -+ -+ WRITE_ONCE(rq->membarrier_state, membarrier_state); -+} -+#else -+static inline void membarrier_switch_mm(struct rq *rq, -+ struct mm_struct *prev_mm, -+ struct mm_struct *next_mm) -+{ -+} -+#endif -+ -+#ifdef CONFIG_NUMA -+extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); -+#else -+static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) -+{ -+ return nr_cpu_ids; -+} -+#endif -+#endif /* PDS_SCHED_H */ -diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c -index a96db50d40e0..d3d12baa9036 100644 ---- a/kernel/sched/pelt.c -+++ b/kernel/sched/pelt.c -@@ -236,6 +236,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna - WRITE_ONCE(sa->util_avg, sa->util_sum / divider); - } - -+#ifndef CONFIG_SCHED_PDS - /* - * sched_entity: - * -@@ -352,6 +353,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) - - return 0; - } -+#endif - - #ifdef CONFIG_HAVE_SCHED_AVG_IRQ - /* -diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h -index afff644da065..26d6b47fc156 100644 ---- a/kernel/sched/pelt.h -+++ b/kernel/sched/pelt.h -@@ -1,11 +1,13 @@ - #ifdef CONFIG_SMP - #include "sched-pelt.h" - -+#ifndef CONFIG_SCHED_PDS - int __update_load_avg_blocked_se(u64 now, struct sched_entity 
*se); - int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se); - int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); - int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); - int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); -+#endif - - #ifdef CONFIG_HAVE_SCHED_AVG_IRQ - int update_irq_load_avg(struct rq *rq, u64 running); -@@ -17,6 +19,7 @@ update_irq_load_avg(struct rq *rq, u64 running) - } - #endif - -+#ifndef CONFIG_SCHED_PDS - /* - * When a task is dequeued, its estimated utilization should not be update if - * its util_avg has not been updated at least once. -@@ -137,9 +140,11 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) - return rq_clock_pelt(rq_of(cfs_rq)); - } - #endif -+#endif /* CONFIG_SCHED_PDS */ - - #else - -+#ifndef CONFIG_SCHED_PDS - static inline int - update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) - { -@@ -157,6 +162,7 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running) - { - return 0; - } -+#endif - - static inline int - update_irq_load_avg(struct rq *rq, u64 running) -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index c8870c5bd7df..4fc9f2ead4d2 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -2,6 +2,10 @@ - /* - * Scheduler internal types and methods: - */ -+#ifdef CONFIG_SCHED_PDS -+#include "pds_sched.h" -+#else -+ - #include - - #include -@@ -2496,3 +2500,4 @@ static inline void membarrier_switch_mm(struct rq *rq, - { - } - #endif -+#endif /* !CONFIG_SCHED_PDS */ -diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c -index 750fb3c67eed..45bd43942575 100644 ---- a/kernel/sched/stats.c -+++ b/kernel/sched/stats.c -@@ -22,8 +22,10 @@ static int show_schedstat(struct seq_file *seq, void *v) - } else { - struct rq *rq; - #ifdef CONFIG_SMP -+#ifndef CONFIG_SCHED_PDS - struct sched_domain *sd; - int dcount = 0; -+#endif - #endif - cpu = (unsigned long)(v - 2); - rq = cpu_rq(cpu); -@@ -40,6 +42,7 @@ static int 
show_schedstat(struct seq_file *seq, void *v) - seq_printf(seq, "\n"); - - #ifdef CONFIG_SMP -+#ifndef CONFIG_SCHED_PDS - /* domain-specific stats */ - rcu_read_lock(); - for_each_domain(cpu, sd) { -@@ -68,6 +71,7 @@ static int show_schedstat(struct seq_file *seq, void *v) - sd->ttwu_move_balance); - } - rcu_read_unlock(); -+#endif - #endif - } - return 0; -diff --git a/kernel/sysctl.c b/kernel/sysctl.c -index b6f2f35d0bcf..204933ebc95a 100644 ---- a/kernel/sysctl.c -+++ b/kernel/sysctl.c -@@ -130,8 +130,12 @@ static int __maybe_unused four = 4; - static unsigned long zero_ul; - static unsigned long one_ul = 1; - static unsigned long long_max = LONG_MAX; --static int one_hundred = 100; --static int one_thousand = 1000; -+static int __read_mostly one_hundred = 100; -+static int __read_mostly one_thousand = 1000; -+#ifdef CONFIG_SCHED_PDS -+extern int rr_interval; -+extern int sched_yield_type; -+#endif - #ifdef CONFIG_PRINTK - static int ten_thousand = 10000; - #endif -@@ -300,7 +304,7 @@ static struct ctl_table sysctl_base_table[] = { - { } - }; - --#ifdef CONFIG_SCHED_DEBUG -+#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_PDS) - static int min_sched_granularity_ns = 100000; /* 100 usecs */ - static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ - static int min_wakeup_granularity_ns; /* 0 usecs */ -@@ -317,6 +321,7 @@ static int max_extfrag_threshold = 1000; - #endif - - static struct ctl_table kern_table[] = { -+#ifndef CONFIG_SCHED_PDS - { - .procname = "sched_child_runs_first", - .data = &sysctl_sched_child_runs_first, -@@ -498,6 +503,7 @@ static struct ctl_table kern_table[] = { - .extra2 = SYSCTL_ONE, - }, - #endif -+#endif /* !CONFIG_SCHED_PDS */ - #ifdef CONFIG_PROVE_LOCKING - { - .procname = "prove_locking", -@@ -1070,6 +1076,26 @@ static struct ctl_table kern_table[] = { - .proc_handler = proc_dointvec, - }, - #endif -+#ifdef CONFIG_SCHED_PDS -+ { -+ .procname = "rr_interval", -+ .data = &rr_interval, -+ .maxlen = sizeof (int), -+ 
.mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = SYSCTL_ONE, -+ .extra2 = &one_thousand, -+ }, -+ { -+ .procname = "yield_type", -+ .data = &sched_yield_type, -+ .maxlen = sizeof (int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = SYSCTL_ZERO, -+ .extra2 = &two, -+ }, -+#endif - #if defined(CONFIG_S390) && defined(CONFIG_SMP) - { - .procname = "spin_retry", -diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c -index 42d512fcfda2..71af3cd30ccc 100644 ---- a/kernel/time/posix-cpu-timers.c -+++ b/kernel/time/posix-cpu-timers.c -@@ -226,7 +226,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples) - u64 stime, utime; - - task_cputime(p, &utime, &stime); -- store_samples(samples, stime, utime, p->se.sum_exec_runtime); -+ store_samples(samples, stime, utime, tsk_seruntime(p)); - } - - static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, -@@ -796,6 +796,7 @@ static void collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples, - } - } - -+#ifndef CONFIG_SCHED_PDS - static inline void check_dl_overrun(struct task_struct *tsk) - { - if (tsk->dl.dl_overrun) { -@@ -803,6 +804,7 @@ static inline void check_dl_overrun(struct task_struct *tsk) - __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); - } - } -+#endif - - static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard) - { -@@ -830,8 +832,10 @@ static void check_thread_timers(struct task_struct *tsk, - u64 samples[CPUCLOCK_MAX]; - unsigned long soft; - -+#ifndef CONFIG_SCHED_PDS - if (dl_task(tsk)) - check_dl_overrun(tsk); -+#endif - - if (expiry_cache_is_inactive(pct)) - return; -@@ -845,7 +849,7 @@ static void check_thread_timers(struct task_struct *tsk, - soft = task_rlimit(tsk, RLIMIT_RTTIME); - if (soft != RLIM_INFINITY) { - /* Task RT timeout is accounted in jiffies. 
RTTIME is usec */ -- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); -+ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ); - unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); - - /* At the hard limit, send SIGKILL. No further action. */ -@@ -1099,8 +1103,10 @@ static inline bool fastpath_timer_check(struct task_struct *tsk) - return true; - } - -+#ifndef CONFIG_SCHED_PDS - if (dl_task(tsk) && tsk->dl.dl_overrun) - return true; -+#endif - - return false; - } -diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c -index 69ee8ef12cee..3eaa2a21caa4 100644 ---- a/kernel/trace/trace_selftest.c -+++ b/kernel/trace/trace_selftest.c -@@ -1048,10 +1048,15 @@ static int trace_wakeup_test_thread(void *data) - { - /* Make this a -deadline thread */ - static const struct sched_attr attr = { -+#ifdef CONFIG_SCHED_PDS -+ /* No deadline on BFS, use RR */ -+ .sched_policy = SCHED_RR, -+#else - .sched_policy = SCHED_DEADLINE, - .sched_runtime = 100000ULL, - .sched_deadline = 10000000ULL, - .sched_period = 10000000ULL -+#endif - }; - struct wakeup_test_data *x = data; - diff --git a/linux54-tkg/linux54-tkg-patches/0006-add-acs-overrides_iommu.patch b/linux54-tkg/linux54-tkg-patches/0006-add-acs-overrides_iommu.patch deleted file mode 100644 index d1303a5..0000000 --- a/linux54-tkg/linux54-tkg-patches/0006-add-acs-overrides_iommu.patch +++ /dev/null @@ -1,193 +0,0 @@ -From cdeab384f48dd9c88e2dff2e9ad8d57dca1a1b1c Mon Sep 17 00:00:00 2001 -From: Mark Weiman -Date: Sun, 12 Aug 2018 11:36:21 -0400 -Subject: [PATCH] pci: Enable overrides for missing ACS capabilities - -This an updated version of Alex Williamson's patch from: -https://lkml.org/lkml/2013/5/30/513 - -Original commit message follows: - -PCIe ACS (Access Control Services) is the PCIe 2.0+ feature that -allows us to control whether transactions are allowed to be redirected -in various subnodes of a PCIe topology. 
For instance, if two -endpoints are below a root port or downsteam switch port, the -downstream port may optionally redirect transactions between the -devices, bypassing upstream devices. The same can happen internally -on multifunction devices. The transaction may never be visible to the -upstream devices. - -One upstream device that we particularly care about is the IOMMU. If -a redirection occurs in the topology below the IOMMU, then the IOMMU -cannot provide isolation between devices. This is why the PCIe spec -encourages topologies to include ACS support. Without it, we have to -assume peer-to-peer DMA within a hierarchy can bypass IOMMU isolation. - -Unfortunately, far too many topologies do not support ACS to make this -a steadfast requirement. Even the latest chipsets from Intel are only -sporadically supporting ACS. We have trouble getting interconnect -vendors to include the PCIe spec required PCIe capability, let alone -suggested features. - -Therefore, we need to add some flexibility. The pcie_acs_override= -boot option lets users opt-in specific devices or sets of devices to -assume ACS support. The "downstream" option assumes full ACS support -on root ports and downstream switch ports. The "multifunction" -option assumes the subset of ACS features available on multifunction -endpoints and upstream switch ports are supported. The "id:nnnn:nnnn" -option enables ACS support on devices matching the provided vendor -and device IDs, allowing more strategic ACS overrides. These options -may be combined in any order. A maximum of 16 id specific overrides -are available. It's suggested to use the most limited set of options -necessary to avoid completely disabling ACS across the topology. -Note to hardware vendors, we have facilities to permanently quirk -specific devices which enforce isolation but not provide an ACS -capability. Please contact me to have your devices added and save -your customers the hassle of this boot option. 
- -Signed-off-by: Mark Weiman ---- - .../admin-guide/kernel-parameters.txt | 9 ++ - drivers/pci/quirks.c | 101 ++++++++++++++++++ - 2 files changed, 110 insertions(+) - -diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index aefd358a5ca3..173b3596fd9e 100644 ---- a/Documentation/admin-guide/kernel-parameters.txt -+++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -3190,6 +3190,15 @@ - nomsi [MSI] If the PCI_MSI kernel config parameter is - enabled, this kernel boot option can be used to - disable the use of MSI interrupts system-wide. -+ pcie_acs_override = -+ [PCIE] Override missing PCIe ACS support for: -+ downstream -+ All downstream ports - full ACS capabilities -+ multifunction -+ All multifunction devices - multifunction ACS subset -+ id:nnnn:nnnn -+ Specific device - full ACS capabilities -+ Specified as vid:did (vendor/device ID) in hex - noioapicquirk [APIC] Disable all boot interrupt quirks. - Safety option to keep boot IRQs enabled. This - should never be necessary. 
-diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c -index 4700d24e5d55..8f7a3d7fd9c1 100644 ---- a/drivers/pci/quirks.c -+++ b/drivers/pci/quirks.c -@@ -3372,6 +3372,106 @@ static void quirk_no_bus_reset(struct pci_dev *dev) - dev->dev_flags |= PCI_DEV_FLAGS_NO_BUS_RESET; - } - -+static bool acs_on_downstream; -+static bool acs_on_multifunction; -+ -+#define NUM_ACS_IDS 16 -+struct acs_on_id { -+ unsigned short vendor; -+ unsigned short device; -+}; -+static struct acs_on_id acs_on_ids[NUM_ACS_IDS]; -+static u8 max_acs_id; -+ -+static __init int pcie_acs_override_setup(char *p) -+{ -+ if (!p) -+ return -EINVAL; -+ -+ while (*p) { -+ if (!strncmp(p, "downstream", 10)) -+ acs_on_downstream = true; -+ if (!strncmp(p, "multifunction", 13)) -+ acs_on_multifunction = true; -+ if (!strncmp(p, "id:", 3)) { -+ char opt[5]; -+ int ret; -+ long val; -+ -+ if (max_acs_id >= NUM_ACS_IDS - 1) { -+ pr_warn("Out of PCIe ACS override slots (%d)\n", -+ NUM_ACS_IDS); -+ goto next; -+ } -+ -+ p += 3; -+ snprintf(opt, 5, "%s", p); -+ ret = kstrtol(opt, 16, &val); -+ if (ret) { -+ pr_warn("PCIe ACS ID parse error %d\n", ret); -+ goto next; -+ } -+ acs_on_ids[max_acs_id].vendor = val; -+ -+ p += strcspn(p, ":"); -+ if (*p != ':') { -+ pr_warn("PCIe ACS invalid ID\n"); -+ goto next; -+ } -+ -+ p++; -+ snprintf(opt, 5, "%s", p); -+ ret = kstrtol(opt, 16, &val); -+ if (ret) { -+ pr_warn("PCIe ACS ID parse error %d\n", ret); -+ goto next; -+ } -+ acs_on_ids[max_acs_id].device = val; -+ max_acs_id++; -+ } -+next: -+ p += strcspn(p, ","); -+ if (*p == ',') -+ p++; -+ } -+ -+ if (acs_on_downstream || acs_on_multifunction || max_acs_id) -+ pr_warn("Warning: PCIe ACS overrides enabled; This may allow non-IOMMU protected peer-to-peer DMA\n"); -+ -+ return 0; -+} -+early_param("pcie_acs_override", pcie_acs_override_setup); -+ -+static int pcie_acs_overrides(struct pci_dev *dev, u16 acs_flags) -+{ -+ int i; -+ -+ /* Never override ACS for legacy devices or devices with ACS caps */ -+ if 
(!pci_is_pcie(dev) || -+ pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ACS)) -+ return -ENOTTY; -+ -+ for (i = 0; i < max_acs_id; i++) -+ if (acs_on_ids[i].vendor == dev->vendor && -+ acs_on_ids[i].device == dev->device) -+ return 1; -+ -+ switch (pci_pcie_type(dev)) { -+ case PCI_EXP_TYPE_DOWNSTREAM: -+ case PCI_EXP_TYPE_ROOT_PORT: -+ if (acs_on_downstream) -+ return 1; -+ break; -+ case PCI_EXP_TYPE_ENDPOINT: -+ case PCI_EXP_TYPE_UPSTREAM: -+ case PCI_EXP_TYPE_LEG_END: -+ case PCI_EXP_TYPE_RC_END: -+ if (acs_on_multifunction && dev->multifunction) -+ return 1; -+ } -+ -+ return -ENOTTY; -+} - /* - * Some Atheros AR9xxx and QCA988x chips do not behave after a bus reset. - * The device will throw a Link Down error on AER-capable systems and -@@ -4513,6 +4613,7 @@ static const struct pci_dev_acs_enabled { - { PCI_VENDOR_ID_ZHAOXIN, 0x9083, pci_quirk_mf_endpoint_acs }, - /* Zhaoxin Root/Downstream Ports */ - { PCI_VENDOR_ID_ZHAOXIN, PCI_ANY_ID, pci_quirk_zhaoxin_pcie_ports_acs }, -+ { PCI_ANY_ID, PCI_ANY_ID, pcie_acs_overrides }, - { 0 } - }; - - diff --git a/linux54-tkg/linux54-tkg-patches/0007-v5.4-fsync.patch b/linux54-tkg/linux54-tkg-patches/0007-v5.4-fsync.patch deleted file mode 100644 index 027116f..0000000 --- a/linux54-tkg/linux54-tkg-patches/0007-v5.4-fsync.patch +++ /dev/null @@ -1,419 +0,0 @@ -split the futex key setup from the queue locking and key reading. This -is useful to support the setup of multiple keys at the same time, like -what is done in futex_requeue() and what will be done for the -FUTEX_WAIT_MULTIPLE command. 
- -Signed-off-by: Gabriel Krisman Bertazi ---- - kernel/futex.c | 71 +++++++++++++++++++++++++++++--------------------- - 1 file changed, 42 insertions(+), 29 deletions(-) - -diff --git a/kernel/futex.c b/kernel/futex.c -index 6d50728ef2e7..91f3db335c57 100644 ---- a/kernel/futex.c -+++ b/kernel/futex.c -@@ -2631,6 +2631,39 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, - __set_current_state(TASK_RUNNING); - } - -+static int __futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, -+ struct futex_q *q, struct futex_hash_bucket **hb) -+{ -+ -+ u32 uval; -+ int ret; -+ -+retry_private: -+ *hb = queue_lock(q); -+ -+ ret = get_futex_value_locked(&uval, uaddr); -+ -+ if (ret) { -+ queue_unlock(*hb); -+ -+ ret = get_user(uval, uaddr); -+ if (ret) -+ return ret; -+ -+ if (!(flags & FLAGS_SHARED)) -+ goto retry_private; -+ -+ return 1; -+ } -+ -+ if (uval != val) { -+ queue_unlock(*hb); -+ ret = -EWOULDBLOCK; -+ } -+ -+ return ret; -+} -+ - /** - * futex_wait_setup() - Prepare to wait on a futex - * @uaddr: the futex userspace address -@@ -2651,7 +2684,6 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, - static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, - struct futex_q *q, struct futex_hash_bucket **hb) - { -- u32 uval; - int ret; - - /* -@@ -2672,38 +2704,19 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, - * absorb a wakeup if *uaddr does not match the desired values - * while the syscall executes. 
- */ --retry: -- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ); -- if (unlikely(ret != 0)) -- return ret; -- --retry_private: -- *hb = queue_lock(q); -+ do { -+ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, -+ &q->key, FUTEX_READ); -+ if (unlikely(ret != 0)) -+ return ret; - -- ret = get_futex_value_locked(&uval, uaddr); -+ ret = __futex_wait_setup(uaddr, val, flags, q, hb); - -- if (ret) { -- queue_unlock(*hb); -- -- ret = get_user(uval, uaddr); -+ /* Drop key reference if retry or error. */ - if (ret) -- goto out; -+ put_futex_key(&q->key); -+ } while (ret > 0); - -- if (!(flags & FLAGS_SHARED)) -- goto retry_private; -- -- put_futex_key(&q->key); -- goto retry; -- } -- -- if (uval != val) { -- queue_unlock(*hb); -- ret = -EWOULDBLOCK; -- } -- --out: -- if (ret) -- put_futex_key(&q->key); - return ret; - } - --- -2.20.1 - -This is a new futex operation, called FUTEX_WAIT_MULTIPLE, which allows -a thread to wait on several futexes at the same time, and be awoken by -any of them. In a sense, it implements one of the features that was -supported by pooling on the old FUTEX_FD interface. - -My use case for this operation lies in Wine, where we want to implement -a similar interface available in Windows, used mainly for event -handling. The wine folks have an implementation that uses eventfd, but -it suffers from FD exhaustion (I was told they have application that go -to the order of multi-milion FDs), and higher CPU utilization. - -In time, we are also proposing modifications to glibc and libpthread to -make this feature available for Linux native multithreaded applications -using libpthread, which can benefit from the behavior of waiting on any -of a group of futexes. - -In particular, using futexes in our Wine use case reduced the CPU -utilization by 4% for the game Beat Saber and by 1.5% for the game -Shadow of Tomb Raider, both running over Proton (a wine based solution -for Windows emulation), when compared to the eventfd interface. 
This -implementation also doesn't rely of file descriptors, so it doesn't risk -overflowing the resource. - -Technically, the existing FUTEX_WAIT implementation can be easily -reworked by using do_futex_wait_multiple with a count of one, and I -have a patch showing how it works. I'm not proposing it, since -futex is such a tricky code, that I'd be more confortable to have -FUTEX_WAIT_MULTIPLE running upstream for a couple development cycles, -before considering modifying FUTEX_WAIT. - -From an implementation perspective, the futex list is passed as an array -of (pointer,value,bitset) to the kernel, which will enqueue all of them -and sleep if none was already triggered. It returns a hint of which -futex caused the wake up event to userspace, but the hint doesn't -guarantee that is the only futex triggered. Before calling the syscall -again, userspace should traverse the list, trying to re-acquire any of -the other futexes, to prevent an immediate -EWOULDBLOCK return code from -the kernel. - -This was tested using three mechanisms: - -1) By reimplementing FUTEX_WAIT in terms of FUTEX_WAIT_MULTIPLE and -running the unmodified tools/testing/selftests/futex and a full linux -distro on top of this kernel. - -2) By an example code that exercises the FUTEX_WAIT_MULTIPLE path on a -multi-threaded, event-handling setup. - -3) By running the Wine fsync implementation and executing multi-threaded -applications, in particular the modern games mentioned above, on top of -this implementation. - -Signed-off-by: Zebediah Figura -Signed-off-by: Steven Noonan -Signed-off-by: Pierre-Loup A. 
Griffais -Signed-off-by: Gabriel Krisman Bertazi ---- - include/uapi/linux/futex.h | 7 ++ - kernel/futex.c | 161 ++++++++++++++++++++++++++++++++++++- - 2 files changed, 164 insertions(+), 4 deletions(-) - -diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h -index a89eb0accd5e..2401c4cf5095 100644 ---- a/include/uapi/linux/futex.h -+++ b/include/uapi/linux/futex.h -@@ -21,6 +21,7 @@ - #define FUTEX_WAKE_BITSET 10 - #define FUTEX_WAIT_REQUEUE_PI 11 - #define FUTEX_CMP_REQUEUE_PI 12 -+#define FUTEX_WAIT_MULTIPLE 31 - - #define FUTEX_PRIVATE_FLAG 128 - #define FUTEX_CLOCK_REALTIME 256 -@@ -150,4 +151,10 @@ struct robust_list_head { - (((op & 0xf) << 28) | ((cmp & 0xf) << 24) \ - | ((oparg & 0xfff) << 12) | (cmparg & 0xfff)) - -+struct futex_wait_block { -+ __u32 __user *uaddr; -+ __u32 val; -+ __u32 bitset; -+}; -+ - #endif /* _UAPI_LINUX_FUTEX_H */ -diff --git a/kernel/futex.c b/kernel/futex.c -index 91f3db335c57..2623e8f152cd 100644 ---- a/kernel/futex.c -+++ b/kernel/futex.c -@@ -183,6 +183,7 @@ static int __read_mostly futex_cmpxchg_enabled; - #endif - #define FLAGS_CLOCKRT 0x02 - #define FLAGS_HAS_TIMEOUT 0x04 -+#define FLAGS_WAKE_MULTIPLE 0x08 - - /* - * Priority Inheritance state: -@@ -2720,6 +2721,150 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, - return ret; - } - -+static int do_futex_wait_multiple(struct futex_wait_block *wb, -+ u32 count, unsigned int flags, -+ ktime_t *abs_time) -+{ -+ -+ struct hrtimer_sleeper timeout, *to; -+ struct futex_hash_bucket *hb; -+ struct futex_q *qs = NULL; -+ int ret; -+ int i; -+ -+ qs = kcalloc(count, sizeof(struct futex_q), GFP_KERNEL); -+ if (!qs) -+ return -ENOMEM; -+ -+ to = futex_setup_timer(abs_time, &timeout, flags, -+ current->timer_slack_ns); -+ retry: -+ for (i = 0; i < count; i++) { -+ qs[i].key = FUTEX_KEY_INIT; -+ qs[i].bitset = wb[i].bitset; -+ -+ ret = get_futex_key(wb[i].uaddr, flags & FLAGS_SHARED, -+ &qs[i].key, FUTEX_READ); -+ if (unlikely(ret != 0)) { 
-+ for (--i; i >= 0; i--) -+ put_futex_key(&qs[i].key); -+ goto out; -+ } -+ } -+ -+ set_current_state(TASK_INTERRUPTIBLE); -+ -+ for (i = 0; i < count; i++) { -+ ret = __futex_wait_setup(wb[i].uaddr, wb[i].val, -+ flags, &qs[i], &hb); -+ if (ret) { -+ /* Drop the failed key directly. keys 0..(i-1) -+ * will be put by unqueue_me. -+ */ -+ put_futex_key(&qs[i].key); -+ -+ /* Undo the partial work we did. */ -+ for (--i; i >= 0; i--) -+ unqueue_me(&qs[i]); -+ -+ __set_current_state(TASK_RUNNING); -+ if (ret > 0) -+ goto retry; -+ goto out; -+ } -+ -+ /* We can't hold to the bucket lock when dealing with -+ * the next futex. Queue ourselves now so we can unlock -+ * it before moving on. -+ */ -+ queue_me(&qs[i], hb); -+ } -+ -+ if (to) -+ hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); -+ -+ /* There is no easy to way to check if we are wake already on -+ * multiple futexes without waking through each one of them. So -+ * just sleep and let the scheduler handle it. -+ */ -+ if (!to || to->task) -+ freezable_schedule(); -+ -+ __set_current_state(TASK_RUNNING); -+ -+ ret = -ETIMEDOUT; -+ /* If we were woken (and unqueued), we succeeded. */ -+ for (i = 0; i < count; i++) -+ if (!unqueue_me(&qs[i])) -+ ret = i; -+ -+ /* Succeed wakeup */ -+ if (ret >= 0) -+ goto out; -+ -+ /* Woken by triggered timeout */ -+ if (to && !to->task) -+ goto out; -+ -+ /* -+ * We expect signal_pending(current), but we might be the -+ * victim of a spurious wakeup as well. 
-+ */ -+ if (!signal_pending(current)) -+ goto retry; -+ -+ ret = -ERESTARTSYS; -+ if (!abs_time) -+ goto out; -+ -+ ret = -ERESTART_RESTARTBLOCK; -+ out: -+ if (to) { -+ hrtimer_cancel(&to->timer); -+ destroy_hrtimer_on_stack(&to->timer); -+ } -+ -+ kfree(qs); -+ return ret; -+} -+ -+static int futex_wait_multiple(u32 __user *uaddr, unsigned int flags, -+ u32 count, ktime_t *abs_time) -+{ -+ struct futex_wait_block *wb; -+ struct restart_block *restart; -+ int ret; -+ -+ if (!count) -+ return -EINVAL; -+ -+ wb = kcalloc(count, sizeof(struct futex_wait_block), GFP_KERNEL); -+ if (!wb) -+ return -ENOMEM; -+ -+ if (copy_from_user(wb, uaddr, -+ count * sizeof(struct futex_wait_block))) { -+ ret = -EFAULT; -+ goto out; -+ } -+ -+ ret = do_futex_wait_multiple(wb, count, flags, abs_time); -+ -+ if (ret == -ERESTART_RESTARTBLOCK) { -+ restart = ¤t->restart_block; -+ restart->fn = futex_wait_restart; -+ restart->futex.uaddr = uaddr; -+ restart->futex.val = count; -+ restart->futex.time = *abs_time; -+ restart->futex.flags = (flags | FLAGS_HAS_TIMEOUT | -+ FLAGS_WAKE_MULTIPLE); -+ } -+ -+out: -+ kfree(wb); -+ return ret; -+} -+ - static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, - ktime_t *abs_time, u32 bitset) - { -@@ -2797,6 +2942,10 @@ static long futex_wait_restart(struct restart_block *restart) - } - restart->fn = do_no_restart_syscall; - -+ if (restart->futex.flags & FLAGS_WAKE_MULTIPLE) -+ return (long)futex_wait_multiple(uaddr, restart->futex.flags, -+ restart->futex.val, tp); -+ - return (long)futex_wait(uaddr, restart->futex.flags, - restart->futex.val, tp, restart->futex.bitset); - } -@@ -3680,6 +3829,8 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, - uaddr2); - case FUTEX_CMP_REQUEUE_PI: - return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); -+ case FUTEX_WAIT_MULTIPLE: -+ return futex_wait_multiple(uaddr, flags, val, timeout); - } - return -ENOSYS; - } -@@ -3696,7 +3847,8 @@ SYSCALL_DEFINE6(futex, u32 
__user *, uaddr, int, op, u32, val, - - if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || - cmd == FUTEX_WAIT_BITSET || -- cmd == FUTEX_WAIT_REQUEUE_PI)) { -+ cmd == FUTEX_WAIT_REQUEUE_PI || -+ cmd == FUTEX_WAIT_MULTIPLE)) { - if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) - return -EFAULT; - if (get_timespec64(&ts, utime)) -@@ -3705,7 +3857,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, - return -EINVAL; - - t = timespec64_to_ktime(ts); -- if (cmd == FUTEX_WAIT) -+ if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) - t = ktime_add_safe(ktime_get(), t); - tp = &t; - } -@@ -3889,14 +4041,15 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, - - if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || - cmd == FUTEX_WAIT_BITSET || -- cmd == FUTEX_WAIT_REQUEUE_PI)) { -+ cmd == FUTEX_WAIT_REQUEUE_PI || -+ cmd == FUTEX_WAIT_MULTIPLE)) { - if (get_old_timespec32(&ts, utime)) - return -EFAULT; - if (!timespec64_valid(&ts)) - return -EINVAL; - - t = timespec64_to_ktime(ts); -- if (cmd == FUTEX_WAIT) -+ if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) - t = ktime_add_safe(ktime_get(), t); - tp = &t; - } --- -2.20.1 diff --git a/linux54-tkg/linux54-tkg-patches/0009-bmq_v5.4-r2.patch b/linux54-tkg/linux54-tkg-patches/0009-bmq_v5.4-r2.patch deleted file mode 100644 index 4d86ca6..0000000 --- a/linux54-tkg/linux54-tkg-patches/0009-bmq_v5.4-r2.patch +++ /dev/null @@ -1,7601 +0,0 @@ -diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst -index 032c7cd3cede..97ea247cc43a 100644 ---- a/Documentation/admin-guide/sysctl/kernel.rst -+++ b/Documentation/admin-guide/sysctl/kernel.rst -@@ -105,6 +105,7 @@ show up in /proc/sys/kernel: - - unknown_nmi_panic - - watchdog - - watchdog_thresh -+- yield_type - - version - - -@@ -1175,3 +1176,13 @@ is 10 seconds. - - The softlockup threshold is (2 * watchdog_thresh). 
Setting this - tunable to zero will disable lockup detection altogether. -+ -+yield_type: -+=========== -+ -+BMQ CPU scheduler only. This determines what type of yield calls to -+sched_yield will perform. -+ -+ 0 - No yield. -+ 1 - Deboost and requeue task. (default) -+ 2 - Set run queue skip task. -diff --git a/Documentation/scheduler/sched-BMQ.txt b/Documentation/scheduler/sched-BMQ.txt -new file mode 100644 -index 000000000000..05c84eec0f31 ---- /dev/null -+++ b/Documentation/scheduler/sched-BMQ.txt -@@ -0,0 +1,110 @@ -+ BitMap queue CPU Scheduler -+ -------------------------- -+ -+CONTENT -+======== -+ -+ Background -+ Design -+ Overview -+ Task policy -+ Priority management -+ BitMap Queue -+ CPU Assignment and Migration -+ -+ -+Background -+========== -+ -+BitMap Queue CPU scheduler, referred to as BMQ from here on, is an evolution -+of previous Priority and Deadline based Skiplist multiple queue scheduler(PDS), -+and inspired by Zircon scheduler. The goal of it is to keep the scheduler code -+simple, while efficiency and scalable for interactive tasks, such as desktop, -+movie playback and gaming etc. -+ -+Design -+====== -+ -+Overview -+-------- -+ -+BMQ use per CPU run queue design, each CPU(logical) has it's own run queue, -+each CPU is responsible for scheduling the tasks that are putting into it's -+run queue. -+ -+The run queue is a set of priority queues. Note that these queues are fifo -+queue for non-rt tasks or priority queue for rt tasks in data structure. See -+BitMap Queue below for details. BMQ is optimized for non-rt tasks in the fact -+that most applications are non-rt tasks. No matter the queue is fifo or -+priority, In each queue is an ordered list of runnable tasks awaiting execution -+and the data structures are the same. When it is time for a new task to run, -+the scheduler simply looks the lowest numbered queueue that contains a task, -+and runs the first task from the head of that queue. 
And per CPU idle task is -+also in the run queue, so the scheduler can always find a task to run on from -+its run queue. -+ -+Each task will assigned the same timeslice(default 4ms) when it is picked to -+start running. Task will be reinserted at the end of the appropriate priority -+queue when it uses its whole timeslice. When the scheduler selects a new task -+from the priority queue it sets the CPU's preemption timer for the remainder of -+the previous timeslice. When that timer fires the scheduler will stop execution -+on that task, select another task and start over again. -+ -+If a task blocks waiting for a shared resource then it's taken out of its -+priority queue and is placed in a wait queue for the shared resource. When it -+is unblocked it will be reinserted in the appropriate priority queue of an -+eligible CPU. -+ -+Task policy -+----------- -+ -+BMQ supports DEADLINE, FIFO, RR, NORMAL, BATCH and IDLE task policy like the -+mainline CFS scheduler. But BMQ is heavy optimized for non-rt task, that's -+NORMAL/BATCH/IDLE policy tasks. Below is the implementation detail of each -+policy. -+ -+DEADLINE -+ It is squashed as priority 0 FIFO task. -+ -+FIFO/RR -+ All RT tasks share one single priority queue in BMQ run queue designed. The -+complexity of insert operation is O(n). BMQ is not designed for system runs -+with major rt policy tasks. -+ -+NORMAL/BATCH/IDLE -+ BATCH and IDLE tasks are treated as the same policy. They compete CPU with -+NORMAL policy tasks, but they just don't boost. To control the priority of -+NORMAL/BATCH/IDLE tasks, simply use nice level. -+ -+ISO -+ ISO policy is not supported in BMQ. Please use nice level -20 NORMAL policy -+task instead. -+ -+Priority management -+------------------- -+ -+RT tasks have priority from 0-99. For non-rt tasks, there are three different -+factors used to determine the effective priority of a task. The effective -+priority being what is used to determine which queue it will be in. 
-+ -+The first factor is simply the task’s static priority. Which is assigned from -+task's nice level, within [-20, 19] in userland's point of view and [0, 39] -+internally. -+ -+The second factor is the priority boost. This is a value bounded between -+[-MAX_PRIORITY_ADJ, MAX_PRIORITY_ADJ] used to offset the base priority, it is -+modified by the following cases: -+ -+*When a thread has used up its entire timeslice, always deboost its boost by -+increasing by one. -+*When a thread gives up cpu control(voluntary or non-voluntary) to reschedule, -+and its switch-in time(time after last switch and run) below the thredhold -+based on its priority boost, will boost its boost by decreasing by one buti is -+capped at 0 (won’t go negative). -+ -+The intent in this system is to ensure that interactive threads are serviced -+quickly. These are usually the threads that interact directly with the user -+and cause user-perceivable latency. These threads usually do little work and -+spend most of their time blocked awaiting another user event. So they get the -+priority boost from unblocking while background threads that do most of the -+processing receive the priority penalty for using their entire timeslice. -diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c -index f18d5067cd0f..fe489fc01c73 100644 ---- a/arch/powerpc/platforms/cell/spufs/sched.c -+++ b/arch/powerpc/platforms/cell/spufs/sched.c -@@ -51,11 +51,6 @@ static struct task_struct *spusched_task; - static struct timer_list spusched_timer; - static struct timer_list spuloadavg_timer; - --/* -- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0). -- */ --#define NORMAL_PRIO 120 -- - /* - * Frequency of the spu scheduler tick. By default we do one SPU scheduler - * tick for every 10 CPU scheduler ticks. 
-diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c -index b66e81c06a57..a294f8f5fd75 100644 ---- a/drivers/cpufreq/cpufreq_conservative.c -+++ b/drivers/cpufreq/cpufreq_conservative.c -@@ -28,8 +28,8 @@ struct cs_dbs_tuners { - }; - - /* Conservative governor macros */ --#define DEF_FREQUENCY_UP_THRESHOLD (80) --#define DEF_FREQUENCY_DOWN_THRESHOLD (20) -+#define DEF_FREQUENCY_UP_THRESHOLD (63) -+#define DEF_FREQUENCY_DOWN_THRESHOLD (26) - #define DEF_FREQUENCY_STEP (5) - #define DEF_SAMPLING_DOWN_FACTOR (1) - #define MAX_SAMPLING_DOWN_FACTOR (10) -diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c -index dced033875bf..d2cd03766b09 100644 ---- a/drivers/cpufreq/cpufreq_ondemand.c -+++ b/drivers/cpufreq/cpufreq_ondemand.c -@@ -18,7 +18,7 @@ - #include "cpufreq_ondemand.h" - - /* On-demand governor macros */ --#define DEF_FREQUENCY_UP_THRESHOLD (80) -+#define DEF_FREQUENCY_UP_THRESHOLD (63) - #define DEF_SAMPLING_DOWN_FACTOR (1) - #define MAX_SAMPLING_DOWN_FACTOR (100000) - #define MICRO_FREQUENCY_UP_THRESHOLD (95) -@@ -127,7 +127,7 @@ static void dbs_freq_increase(struct cpufreq_policy *policy, unsigned int freq) - } - - /* -- * Every sampling_rate, we check, if current idle time is less than 20% -+ * Every sampling_rate, we check, if current idle time is less than 37% - * (default), then we try to increase frequency. Else, we adjust the frequency - * proportional to load. 
- */ -diff --git a/fs/proc/base.c b/fs/proc/base.c -index ebea9501afb8..51c9346a69fe 100644 ---- a/fs/proc/base.c -+++ b/fs/proc/base.c -@@ -477,7 +477,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, - seq_puts(m, "0 0 0\n"); - else - seq_printf(m, "%llu %llu %lu\n", -- (unsigned long long)task->se.sum_exec_runtime, -+ (unsigned long long)tsk_seruntime(task), - (unsigned long long)task->sched_info.run_delay, - task->sched_info.pcount); - -diff --git a/include/asm-generic/resource.h b/include/asm-generic/resource.h -index 8874f681b056..59eb72bf7d5f 100644 ---- a/include/asm-generic/resource.h -+++ b/include/asm-generic/resource.h -@@ -23,7 +23,7 @@ - [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY }, \ - [RLIMIT_SIGPENDING] = { 0, 0 }, \ - [RLIMIT_MSGQUEUE] = { MQ_BYTES_MAX, MQ_BYTES_MAX }, \ -- [RLIMIT_NICE] = { 0, 0 }, \ -+ [RLIMIT_NICE] = { 30, 30 }, \ - [RLIMIT_RTPRIO] = { 0, 0 }, \ - [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY }, \ - } -diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h -index 1b6d31da7cbc..dea181bdb1dd 100644 ---- a/include/linux/jiffies.h -+++ b/include/linux/jiffies.h -@@ -171,7 +171,7 @@ static inline u64 get_jiffies_64(void) - * Have the 32 bit jiffies value wrap 5 minutes after boot - * so jiffies wrap bugs show up earlier. 
- */ --#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ)) -+#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-10*HZ)) - - /* - * Change timeval to jiffies, trying to avoid the -diff --git a/include/linux/sched.h b/include/linux/sched.h -index 67a1d86981a9..a38ec88efbad 100644 ---- a/include/linux/sched.h -+++ b/include/linux/sched.h -@@ -644,13 +644,18 @@ struct task_struct { - unsigned int flags; - unsigned int ptrace; - --#ifdef CONFIG_SMP -+#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_BMQ) - struct llist_node wake_entry; -+#endif -+#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_BMQ) - int on_cpu; -+#endif -+#ifdef CONFIG_SMP - #ifdef CONFIG_THREAD_INFO_IN_TASK - /* Current CPU: */ - unsigned int cpu; - #endif -+#ifndef CONFIG_SCHED_BMQ - unsigned int wakee_flips; - unsigned long wakee_flip_decay_ts; - struct task_struct *last_wakee; -@@ -664,6 +669,7 @@ struct task_struct { - */ - int recent_used_cpu; - int wake_cpu; -+#endif /* !CONFIG_SCHED_BMQ */ - #endif - int on_rq; - -@@ -672,13 +678,23 @@ struct task_struct { - int normal_prio; - unsigned int rt_priority; - -+#ifdef CONFIG_SCHED_BMQ -+ u64 last_ran; -+ s64 time_slice; -+ int boost_prio; -+ int bmq_idx; -+ struct list_head bmq_node; -+ /* sched_clock time spent running */ -+ u64 sched_time; -+#else /* !CONFIG_SCHED_BMQ */ - const struct sched_class *sched_class; - struct sched_entity se; - struct sched_rt_entity rt; -+ struct sched_dl_entity dl; -+#endif - #ifdef CONFIG_CGROUP_SCHED - struct task_group *sched_task_group; - #endif -- struct sched_dl_entity dl; - - #ifdef CONFIG_UCLAMP_TASK - /* Clamp values requested for a scheduling entity */ -@@ -1283,6 +1299,15 @@ struct task_struct { - */ - }; - -+#ifdef CONFIG_SCHED_BMQ -+#define tsk_seruntime(t) ((t)->sched_time) -+/* replace the uncertian rt_timeout with 0UL */ -+#define tsk_rttimeout(t) (0UL) -+#else /* CFS */ -+#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) -+#define tsk_rttimeout(t) ((t)->rt.timeout) -+#endif /* 
!CONFIG_SCHED_BMQ */ -+ - static inline struct pid *task_pid(struct task_struct *task) - { - return task->thread_pid; -diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h -index 1aff00b65f3c..02a3c5d34ee4 100644 ---- a/include/linux/sched/deadline.h -+++ b/include/linux/sched/deadline.h -@@ -1,5 +1,22 @@ - /* SPDX-License-Identifier: GPL-2.0 */ - -+#ifdef CONFIG_SCHED_BMQ -+ -+#define __tsk_deadline(p) (0UL) -+ -+static inline int dl_prio(int prio) -+{ -+ return 0; -+} -+ -+static inline int dl_task(struct task_struct *p) -+{ -+ return (SCHED_NORMAL == p->policy); -+} -+#else -+ -+#define __tsk_deadline(p) ((p)->dl.deadline) -+ - /* - * SCHED_DEADLINE tasks has negative priorities, reflecting - * the fact that any of them has higher prio than RT and -@@ -19,6 +36,7 @@ static inline int dl_task(struct task_struct *p) - { - return dl_prio(p->prio); - } -+#endif /* CONFIG_SCHED_BMQ */ - - static inline bool dl_time_before(u64 a, u64 b) - { -diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h -index 7d64feafc408..d9dc5d3ccd2e 100644 ---- a/include/linux/sched/prio.h -+++ b/include/linux/sched/prio.h -@@ -20,11 +20,17 @@ - */ - - #define MAX_USER_RT_PRIO 100 -+ - #define MAX_RT_PRIO MAX_USER_RT_PRIO - - #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) - #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) - -+#ifdef CONFIG_SCHED_BMQ -+/* +/- priority levels from the base priority */ -+#define MAX_PRIORITY_ADJ 4 -+#endif -+ - /* - * Convert user-nice values [ -20 ... 0 ... 
19 ] - * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], -diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h -index e5af028c08b4..6387c8ea9832 100644 ---- a/include/linux/sched/rt.h -+++ b/include/linux/sched/rt.h -@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) - - if (policy == SCHED_FIFO || policy == SCHED_RR) - return true; -+#ifndef CONFIG_SCHED_BMQ - if (policy == SCHED_DEADLINE) - return true; -+#endif - return false; - } - -diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h -index 4b1c3b664f51..f0f966219695 100644 ---- a/include/linux/sched/task.h -+++ b/include/linux/sched/task.h -@@ -99,7 +99,7 @@ extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); - extern void free_task(struct task_struct *tsk); - - /* sched_exec is called by processes performing an exec */ --#ifdef CONFIG_SMP -+#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_BMQ) - extern void sched_exec(void); - #else - #define sched_exec() {} -diff --git a/init/Kconfig b/init/Kconfig -index b4daad2bac23..f9faeb82f677 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -717,9 +717,28 @@ config GENERIC_SCHED_CLOCK - - menu "Scheduler features" - -+config SCHED_BMQ -+ bool "BMQ CPU scheduler" -+ help -+ The BitMap Queue CPU scheduler for excellent interactivity and -+ responsiveness on the desktop and solid scalability on normal -+ hardware and commodity servers. -+ -+ Say Y here. -+ default y -+ -+config SCHED_TIMESLICE -+ int "Scheduler Task time slice" -+ depends on SCHED_BMQ -+ help -+ Time slice in ms for BMQ CPU scheduler, default 4 ms. -+ default 2 if PREEMPT -+ default 4 if !PREEMPT -+ - config UCLAMP_TASK - bool "Enable utilization clamping for RT/FAIR tasks" - depends on CPU_FREQ_GOV_SCHEDUTIL -+ depends on !SCHED_BMQ - help - This feature enables the scheduler to track the clamped utilization - of each CPU based on RUNNABLE tasks scheduled on that CPU. 
-@@ -802,6 +821,7 @@ config NUMA_BALANCING - depends on ARCH_SUPPORTS_NUMA_BALANCING - depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY - depends on SMP && NUMA && MIGRATION -+ depends on !SCHED_BMQ - help - This option adds support for automatic NUMA aware memory/task placement. - The mechanism is quite primitive and is based on migrating memory when -@@ -903,7 +923,7 @@ menuconfig CGROUP_SCHED - bandwidth allocation to such task groups. It uses cgroups to group - tasks. - --if CGROUP_SCHED -+if CGROUP_SCHED && !SCHED_BMQ - config FAIR_GROUP_SCHED - bool "Group scheduling for SCHED_OTHER" - depends on CGROUP_SCHED -@@ -1150,6 +1170,7 @@ config CHECKPOINT_RESTORE - - config SCHED_AUTOGROUP - bool "Automatic process group scheduling" -+ depends on !SCHED_BMQ - select CGROUPS - select CGROUP_SCHED - select FAIR_GROUP_SCHED -diff --git a/init/init_task.c b/init/init_task.c -index 9e5cbe5eab7b..c293de91d90f 100644 ---- a/init/init_task.c -+++ b/init/init_task.c -@@ -66,9 +66,15 @@ struct task_struct init_task - .stack = init_stack, - .usage = REFCOUNT_INIT(2), - .flags = PF_KTHREAD, -+#ifdef CONFIG_SCHED_BMQ -+ .prio = DEFAULT_PRIO + MAX_PRIORITY_ADJ, -+ .static_prio = DEFAULT_PRIO, -+ .normal_prio = DEFAULT_PRIO + MAX_PRIORITY_ADJ, -+#else - .prio = MAX_PRIO - 20, - .static_prio = MAX_PRIO - 20, - .normal_prio = MAX_PRIO - 20, -+#endif - .policy = SCHED_NORMAL, - .cpus_ptr = &init_task.cpus_mask, - .cpus_mask = CPU_MASK_ALL, -@@ -78,6 +84,12 @@ struct task_struct init_task - .restart_block = { - .fn = do_no_restart_syscall, - }, -+#ifdef CONFIG_SCHED_BMQ -+ .boost_prio = 0, -+ .bmq_idx = 15, -+ .bmq_node = LIST_HEAD_INIT(init_task.bmq_node), -+ .time_slice = HZ, -+#else - .se = { - .group_node = LIST_HEAD_INIT(init_task.se.group_node), - }, -@@ -85,6 +97,7 @@ struct task_struct init_task - .run_list = LIST_HEAD_INIT(init_task.rt.run_list), - .time_slice = RR_TIMESLICE, - }, -+#endif - .tasks = LIST_HEAD_INIT(init_task.tasks), - #ifdef CONFIG_SMP - .pushable_tasks = 
PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), -diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c -index c87ee6412b36..45fac7b9c940 100644 ---- a/kernel/cgroup/cpuset.c -+++ b/kernel/cgroup/cpuset.c -@@ -632,7 +632,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) - return ret; - } - --#ifdef CONFIG_SMP -+#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_BMQ) - /* - * Helper routine for generate_sched_domains(). - * Do cpusets a, b have overlapping effective cpus_allowed masks? -@@ -1007,7 +1007,7 @@ static void rebuild_sched_domains_locked(void) - /* Have scheduler rebuild the domains */ - partition_and_rebuild_sched_domains(ndoms, doms, attr); - } --#else /* !CONFIG_SMP */ -+#else /* !CONFIG_SMP || CONFIG_SCHED_BMQ */ - static void rebuild_sched_domains_locked(void) - { - } -diff --git a/kernel/delayacct.c b/kernel/delayacct.c -index 27725754ac99..769d773c7182 100644 ---- a/kernel/delayacct.c -+++ b/kernel/delayacct.c -@@ -106,7 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) - */ - t1 = tsk->sched_info.pcount; - t2 = tsk->sched_info.run_delay; -- t3 = tsk->se.sum_exec_runtime; -+ t3 = tsk_seruntime(tsk); - - d->cpu_count += t1; - -diff --git a/kernel/exit.c b/kernel/exit.c -index a46a50d67002..58043176b285 100644 ---- a/kernel/exit.c -+++ b/kernel/exit.c -@@ -131,7 +131,7 @@ static void __exit_signal(struct task_struct *tsk) - sig->curr_target = next_thread(tsk); - } - -- add_device_randomness((const void*) &tsk->se.sum_exec_runtime, -+ add_device_randomness((const void*) &tsk_seruntime(tsk), - sizeof(unsigned long long)); - - /* -@@ -152,7 +152,7 @@ static void __exit_signal(struct task_struct *tsk) - sig->inblock += task_io_get_inblock(tsk); - sig->oublock += task_io_get_oublock(tsk); - task_io_accounting_add(&sig->ioac, &tsk->ioac); -- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; -+ sig->sum_sched_runtime += tsk_seruntime(tsk); - sig->nr_threads--; - __unhash_process(tsk, 
group_dead); - write_sequnlock(&sig->stats_lock); -diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c -index cdf318d86dd6..b3bd1e65c002 100644 ---- a/kernel/livepatch/transition.c -+++ b/kernel/livepatch/transition.c -@@ -306,7 +306,11 @@ static bool klp_try_switch_task(struct task_struct *task) - */ - rq = task_rq_lock(task, &flags); - -+#ifdef CONFIG_SCHED_BMQ -+ if (task_running(task) && task != current) { -+#else - if (task_running(rq, task) && task != current) { -+#endif - snprintf(err_buf, STACK_ERR_BUF_SIZE, - "%s: %s:%d is running\n", __func__, task->comm, - task->pid); -diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c -index 2874bf556162..fad8a279fdfa 100644 ---- a/kernel/locking/rtmutex.c -+++ b/kernel/locking/rtmutex.c -@@ -229,7 +229,7 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, - * Only use with rt_mutex_waiter_{less,equal}() - */ - #define task_to_waiter(p) \ -- &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline } -+ &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = __tsk_deadline(p) } - - static inline int - rt_mutex_waiter_less(struct rt_mutex_waiter *left, -@@ -680,7 +680,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, - * the values of the node being removed. 
- */ - waiter->prio = task->prio; -- waiter->deadline = task->dl.deadline; -+ waiter->deadline = __tsk_deadline(task); - - rt_mutex_enqueue(lock, waiter); - -@@ -953,7 +953,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, - waiter->task = task; - waiter->lock = lock; - waiter->prio = task->prio; -- waiter->deadline = task->dl.deadline; -+ waiter->deadline = __tsk_deadline(task); - - /* Get the top priority waiter on the lock */ - if (rt_mutex_has_waiters(lock)) -diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile -index 21fb5a5662b5..cab4e5c5b38e 100644 ---- a/kernel/sched/Makefile -+++ b/kernel/sched/Makefile -@@ -16,14 +16,20 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) - CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer - endif - --obj-y += core.o loadavg.o clock.o cputime.o --obj-y += idle.o fair.o rt.o deadline.o --obj-y += wait.o wait_bit.o swait.o completion.o -- --obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o -+ifdef CONFIG_SCHED_BMQ -+obj-y += bmq.o -+else -+obj-y += core.o -+obj-y += fair.o rt.o deadline.o -+obj-$(CONFIG_SMP) += cpudeadline.o topology.o stop_task.o - obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o --obj-$(CONFIG_SCHEDSTATS) += stats.o - obj-$(CONFIG_SCHED_DEBUG) += debug.o -+endif -+obj-y += loadavg.o clock.o cputime.o -+obj-y += idle.o -+obj-y += wait.o wait_bit.o swait.o completion.o -+obj-$(CONFIG_SMP) += cpupri.o pelt.o -+obj-$(CONFIG_SCHEDSTATS) += stats.o - obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o - obj-$(CONFIG_CPU_FREQ) += cpufreq.o - obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o -diff --git a/kernel/sched/bmq.c b/kernel/sched/bmq.c -new file mode 100644 -index 000000000000..42a2a5b3d172 ---- /dev/null -+++ b/kernel/sched/bmq.c -@@ -0,0 +1,6102 @@ -+/* -+ * kernel/sched/bmq.c -+ * -+ * BMQ Core kernel scheduler code and related syscalls -+ * -+ * Copyright (C) 1991-2002 Linus Torvalds -+ * -+ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes 
-+ * a whole lot of those previous things. -+ * 2017-09-06 Priority and Deadline based Skip list multiple queue kernel -+ * scheduler by Alfred Chen. -+ * 2019-02-20 BMQ(BitMap Queue) kernel scheduler by Alfred Chen. -+ */ -+#include "bmq_sched.h" -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#include -+ -+#include "../workqueue_internal.h" -+#include "../smpboot.h" -+ -+#include "pelt.h" -+ -+#define CREATE_TRACE_POINTS -+#include -+ -+/* rt_prio(prio) defined in include/linux/sched/rt.h */ -+#define rt_task(p) rt_prio((p)->prio) -+#define rt_policy(policy) ((policy) == SCHED_FIFO || (policy) == SCHED_RR) -+#define task_has_rt_policy(p) (rt_policy((p)->policy)) -+ -+#define STOP_PRIO (MAX_RT_PRIO - 1) -+ -+#define SCHED_TIMESLICE_NS (CONFIG_SCHED_TIMESLICE * 1000 * 1000) -+ -+/* Reschedule if less than this many μs left */ -+#define RESCHED_NS (100 * 1000) -+ -+/* -+ * This allows printing both to /proc/sched_debug and -+ * to the console -+ */ -+#define SEQ_printf(m, x...) \ -+ do { \ -+ if (m) \ -+ seq_printf(m, x); \ -+ else \ -+ pr_cont(x); \ -+ } while (0) -+ -+static inline void print_scheduler_version(void) -+{ -+ printk(KERN_INFO "bmq: BMQ CPU Scheduler 5.4-r2 by Alfred Chen.\n"); -+} -+ -+/** -+ * sched_yield_type - Choose what sort of yield sched_yield will perform. -+ * 0: No yield. -+ * 1: Deboost and requeue task. (default) -+ * 2: Set rq skip task. 
-+ */ -+int sched_yield_type __read_mostly = 1; -+ -+#define rq_switch_time(rq) ((rq)->clock - (rq)->last_ts_switch) -+#define boost_threshold(p) (SCHED_TIMESLICE_NS >>\ -+ (10 - MAX_PRIORITY_ADJ - (p)->boost_prio)) -+ -+static inline void boost_task(struct task_struct *p) -+{ -+ int limit; -+ -+ switch (p->policy) { -+ case SCHED_NORMAL: -+ limit = -MAX_PRIORITY_ADJ; -+ break; -+ case SCHED_BATCH: -+ case SCHED_IDLE: -+ limit = 0; -+ break; -+ default: -+ return; -+ } -+ -+ if (p->boost_prio > limit) -+ p->boost_prio--; -+} -+ -+static inline void deboost_task(struct task_struct *p) -+{ -+ if (p->boost_prio < MAX_PRIORITY_ADJ) -+ p->boost_prio++; -+} -+ -+#ifdef CONFIG_SMP -+static cpumask_t sched_rq_pending_mask ____cacheline_aligned_in_smp; -+ -+enum { -+ BASE_CPU_AFFINITY_CHK_LEVEL = 1, -+#ifdef CONFIG_SCHED_SMT -+ SMT_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, -+#endif -+#ifdef CONFIG_SCHED_MC -+ MC_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, -+#endif -+ NR_CPU_AFFINITY_CHK_LEVEL -+}; -+ -+DEFINE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_CHK_LEVEL], sched_cpu_affinity_masks); -+DEFINE_PER_CPU(cpumask_t *, sched_cpu_affinity_end_mask); -+ -+#ifdef CONFIG_SCHED_SMT -+DEFINE_STATIC_KEY_FALSE(sched_smt_present); -+EXPORT_SYMBOL_GPL(sched_smt_present); -+#endif -+ -+/* -+ * Keep a unique ID per domain (we use the first CPUs number in the cpumask of -+ * the domain), this allows us to quickly tell if two cpus are in the same cache -+ * domain, see cpus_share_cache(). 
-+ */ -+DEFINE_PER_CPU(int, sd_llc_id); -+ -+int __weak arch_sd_sibling_asym_packing(void) -+{ -+ return 0*SD_ASYM_PACKING; -+} -+#endif /* CONFIG_SMP */ -+ -+static DEFINE_MUTEX(sched_hotcpu_mutex); -+ -+DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -+ -+#ifndef prepare_arch_switch -+# define prepare_arch_switch(next) do { } while (0) -+#endif -+#ifndef finish_arch_post_lock_switch -+# define finish_arch_post_lock_switch() do { } while (0) -+#endif -+ -+#define IDLE_WM (IDLE_TASK_SCHED_PRIO) -+ -+static cpumask_t sched_sg_idle_mask ____cacheline_aligned_in_smp; -+static cpumask_t sched_rq_watermark[bmq_BITS] ____cacheline_aligned_in_smp; -+ -+#if (bmq_BITS <= BITS_PER_LONG) -+#define bmq_find_first_bit(bm) __ffs((bm[0])) -+#define bmq_find_next_bit(bm, start) __ffs(BITMAP_FIRST_WORD_MASK(start) & bm[0]) -+#else -+#define bmq_find_first_bit(bm) find_first_bit((bm), bmq_BITS) -+#define bmq_find_next_bit(bm, start) find_next_bit(bm, bmq_BITS, start) -+#endif -+ -+static inline void update_sched_rq_watermark(struct rq *rq) -+{ -+ unsigned long watermark = bmq_find_first_bit(rq->queue.bitmap); -+ unsigned long last_wm = rq->watermark; -+ unsigned long i; -+ int cpu; -+ -+ if (watermark == last_wm) -+ return; -+ -+ rq->watermark = watermark; -+ cpu = cpu_of(rq); -+ if (watermark < last_wm) { -+ for (i = watermark + 1; i <= last_wm; i++) -+ cpumask_andnot(&sched_rq_watermark[i], -+ &sched_rq_watermark[i], cpumask_of(cpu)); -+#ifdef CONFIG_SCHED_SMT -+ if (!static_branch_likely(&sched_smt_present)) -+ return; -+ if (IDLE_WM == last_wm) -+ cpumask_andnot(&sched_sg_idle_mask, -+ &sched_sg_idle_mask, cpu_smt_mask(cpu)); -+#endif -+ return; -+ } -+ /* last_wm < watermark */ -+ for (i = last_wm + 1; i <= watermark; i++) -+ cpumask_set_cpu(cpu, &sched_rq_watermark[i]); -+#ifdef CONFIG_SCHED_SMT -+ if (!static_branch_likely(&sched_smt_present)) -+ return; -+ if (IDLE_WM == watermark) { -+ cpumask_t tmp; -+ cpumask_and(&tmp, cpu_smt_mask(cpu), 
&sched_rq_watermark[IDLE_WM]); -+ if (cpumask_equal(&tmp, cpu_smt_mask(cpu))) -+ cpumask_or(&sched_sg_idle_mask, cpu_smt_mask(cpu), -+ &sched_sg_idle_mask); -+ } -+#endif -+} -+ -+static inline int task_sched_prio(struct task_struct *p) -+{ -+ return (p->prio < MAX_RT_PRIO)? 0:p->prio - MAX_RT_PRIO + p->boost_prio + 1; -+} -+ -+static inline void bmq_init(struct bmq *q) -+{ -+ int i; -+ -+ bitmap_zero(q->bitmap, bmq_BITS); -+ for(i = 0; i < bmq_BITS; i++) -+ INIT_LIST_HEAD(&q->heads[i]); -+} -+ -+static inline void bmq_init_idle(struct bmq *q, struct task_struct *idle) -+{ -+ INIT_LIST_HEAD(&q->heads[IDLE_TASK_SCHED_PRIO]); -+ list_add(&idle->bmq_node, &q->heads[IDLE_TASK_SCHED_PRIO]); -+ set_bit(IDLE_TASK_SCHED_PRIO, q->bitmap); -+} -+ -+static inline void bmq_add_task(struct task_struct *p, struct bmq *q, int idx) -+{ -+ struct list_head *n; -+ -+ if (likely(idx)) { -+ list_add_tail(&p->bmq_node, &q->heads[idx]); -+ return; -+ } -+ -+ list_for_each(n, &q->heads[idx]) -+ if (list_entry(n, struct task_struct, bmq_node)->prio > p->prio) -+ break; -+ __list_add(&p->bmq_node, n->prev, n); -+} -+ -+/* -+ * This routine used in bmq scheduler only which assume the idle task in the bmq -+ */ -+static inline struct task_struct *rq_first_bmq_task(struct rq *rq) -+{ -+ unsigned long idx = bmq_find_first_bit(rq->queue.bitmap); -+ const struct list_head *head = &rq->queue.heads[idx]; -+ -+ return list_first_entry(head, struct task_struct, bmq_node); -+} -+ -+static inline struct task_struct * -+rq_next_bmq_task(struct task_struct *p, struct rq *rq) -+{ -+ unsigned long idx = p->bmq_idx; -+ struct list_head *head = &rq->queue.heads[idx]; -+ -+ if (list_is_last(&p->bmq_node, head)) { -+ idx = bmq_find_next_bit(rq->queue.bitmap, idx + 1); -+ head = &rq->queue.heads[idx]; -+ -+ return list_first_entry(head, struct task_struct, bmq_node); -+ } -+ -+ return list_next_entry(p, bmq_node); -+} -+ -+static inline struct task_struct *rq_runnable_task(struct rq *rq) -+{ -+ struct 
task_struct *next = rq_first_bmq_task(rq); -+ -+ if (unlikely(next == rq->skip)) -+ next = rq_next_bmq_task(next, rq); -+ -+ return next; -+} -+ -+/* -+ * Context: p->pi_lock -+ */ -+static inline struct rq -+*__task_access_lock(struct task_struct *p, raw_spinlock_t **plock) -+{ -+ struct rq *rq; -+ for (;;) { -+ rq = task_rq(p); -+ if (p->on_cpu || task_on_rq_queued(p)) { -+ raw_spin_lock(&rq->lock); -+ if (likely((p->on_cpu || task_on_rq_queued(p)) -+ && rq == task_rq(p))) { -+ *plock = &rq->lock; -+ return rq; -+ } -+ raw_spin_unlock(&rq->lock); -+ } else if (task_on_rq_migrating(p)) { -+ do { -+ cpu_relax(); -+ } while (unlikely(task_on_rq_migrating(p))); -+ } else { -+ *plock = NULL; -+ return rq; -+ } -+ } -+} -+ -+static inline void -+__task_access_unlock(struct task_struct *p, raw_spinlock_t *lock) -+{ -+ if (NULL != lock) -+ raw_spin_unlock(lock); -+} -+ -+static inline struct rq -+*task_access_lock_irqsave(struct task_struct *p, raw_spinlock_t **plock, -+ unsigned long *flags) -+{ -+ struct rq *rq; -+ for (;;) { -+ rq = task_rq(p); -+ if (p->on_cpu || task_on_rq_queued(p)) { -+ raw_spin_lock_irqsave(&rq->lock, *flags); -+ if (likely((p->on_cpu || task_on_rq_queued(p)) -+ && rq == task_rq(p))) { -+ *plock = &rq->lock; -+ return rq; -+ } -+ raw_spin_unlock_irqrestore(&rq->lock, *flags); -+ } else if (task_on_rq_migrating(p)) { -+ do { -+ cpu_relax(); -+ } while (unlikely(task_on_rq_migrating(p))); -+ } else { -+ raw_spin_lock_irqsave(&p->pi_lock, *flags); -+ if (likely(!p->on_cpu && !p->on_rq && -+ rq == task_rq(p))) { -+ *plock = &p->pi_lock; -+ return rq; -+ } -+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags); -+ } -+ } -+} -+ -+static inline void -+task_access_unlock_irqrestore(struct task_struct *p, raw_spinlock_t *lock, -+ unsigned long *flags) -+{ -+ raw_spin_unlock_irqrestore(lock, *flags); -+} -+ -+/* -+ * __task_rq_lock - lock the rq @p resides on. 
-+ */ -+struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ lockdep_assert_held(&p->pi_lock); -+ -+ for (;;) { -+ rq = task_rq(p); -+ raw_spin_lock(&rq->lock); -+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) -+ return rq; -+ raw_spin_unlock(&rq->lock); -+ -+ while (unlikely(task_on_rq_migrating(p))) -+ cpu_relax(); -+ } -+} -+ -+/* -+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. -+ */ -+struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(p->pi_lock) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ for (;;) { -+ raw_spin_lock_irqsave(&p->pi_lock, rf->flags); -+ rq = task_rq(p); -+ raw_spin_lock(&rq->lock); -+ /* -+ * move_queued_task() task_rq_lock() -+ * -+ * ACQUIRE (rq->lock) -+ * [S] ->on_rq = MIGRATING [L] rq = task_rq() -+ * WMB (__set_task_cpu()) ACQUIRE (rq->lock); -+ * [S] ->cpu = new_cpu [L] task_rq() -+ * [L] ->on_rq -+ * RELEASE (rq->lock) -+ * -+ * If we observe the old CPU in task_rq_lock(), the acquire of -+ * the old rq->lock will fully serialize against the stores. -+ * -+ * If we observe the new CPU in task_rq_lock(), the address -+ * dependency headed by '[L] rq = task_rq()' and the acquire -+ * will pair with the WMB to ensure we then also see migrating. -+ */ -+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { -+ return rq; -+ } -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); -+ -+ while (unlikely(task_on_rq_migrating(p))) -+ cpu_relax(); -+ } -+} -+ -+/* -+ * RQ-clock updating methods: -+ */ -+ -+static void update_rq_clock_task(struct rq *rq, s64 delta) -+{ -+/* -+ * In theory, the compile should just see 0 here, and optimize out the call -+ * to sched_rt_avg_update. But I don't trust it... 
-+ */ -+ s64 __maybe_unused steal = 0, irq_delta = 0; -+ -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; -+ -+ /* -+ * Since irq_time is only updated on {soft,}irq_exit, we might run into -+ * this case when a previous update_rq_clock() happened inside a -+ * {soft,}irq region. -+ * -+ * When this happens, we stop ->clock_task and only update the -+ * prev_irq_time stamp to account for the part that fit, so that a next -+ * update will consume the rest. This ensures ->clock_task is -+ * monotonic. -+ * -+ * It does however cause some slight miss-attribution of {soft,}irq -+ * time, a more accurate solution would be to update the irq_time using -+ * the current rq->clock timestamp, except that would require using -+ * atomic ops. -+ */ -+ if (irq_delta > delta) -+ irq_delta = delta; -+ -+ rq->prev_irq_time += irq_delta; -+ delta -= irq_delta; -+#endif -+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING -+ if (static_key_false((¶virt_steal_rq_enabled))) { -+ steal = paravirt_steal_clock(cpu_of(rq)); -+ steal -= rq->prev_steal_time_rq; -+ -+ if (unlikely(steal > delta)) -+ steal = delta; -+ -+ rq->prev_steal_time_rq += steal; -+ delta -= steal; -+ } -+#endif -+ -+ rq->clock_task += delta; -+ -+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ -+ if ((irq_delta + steal)) -+ update_irq_load_avg(rq, irq_delta + steal); -+#endif -+} -+ -+static inline void update_rq_clock(struct rq *rq) -+{ -+ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; -+ -+ if (unlikely(delta <= 0)) -+ return; -+ rq->clock += delta; -+ update_rq_clock_task(rq, delta); -+} -+ -+/* -+ * cmpxchg based fetch_or, macro so it works for different integer types -+ */ -+#define fetch_or(ptr, mask) \ -+ ({ \ -+ typeof(ptr) _ptr = (ptr); \ -+ typeof(mask) _mask = (mask); \ -+ typeof(*_ptr) _old, _val = *_ptr; \ -+ \ -+ for (;;) { \ -+ _old = cmpxchg(_ptr, _val, _val | _mask); \ -+ if (_old == _val) \ -+ break; \ -+ _val = _old; \ -+ } \ -+ _old; \ -+}) -+ -+#if defined(CONFIG_SMP) && 
defined(TIF_POLLING_NRFLAG) -+/* -+ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, -+ * this avoids any races wrt polling state changes and thereby avoids -+ * spurious IPIs. -+ */ -+static bool set_nr_and_not_polling(struct task_struct *p) -+{ -+ struct thread_info *ti = task_thread_info(p); -+ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); -+} -+ -+/* -+ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. -+ * -+ * If this returns true, then the idle task promises to call -+ * sched_ttwu_pending() and reschedule soon. -+ */ -+static bool set_nr_if_polling(struct task_struct *p) -+{ -+ struct thread_info *ti = task_thread_info(p); -+ typeof(ti->flags) old, val = READ_ONCE(ti->flags); -+ -+ for (;;) { -+ if (!(val & _TIF_POLLING_NRFLAG)) -+ return false; -+ if (val & _TIF_NEED_RESCHED) -+ return true; -+ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); -+ if (old == val) -+ break; -+ val = old; -+ } -+ return true; -+} -+ -+#else -+static bool set_nr_and_not_polling(struct task_struct *p) -+{ -+ set_tsk_need_resched(p); -+ return true; -+} -+ -+#ifdef CONFIG_SMP -+static bool set_nr_if_polling(struct task_struct *p) -+{ -+ return false; -+} -+#endif -+#endif -+ -+#ifdef CONFIG_NO_HZ_FULL -+/* -+ * Tick may be needed by tasks in the runqueue depending on their policy and -+ * requirements. If tick is needed, lets send the target an IPI to kick it out -+ * of nohz mode if necessary. 
-+ */ -+static inline void sched_update_tick_dependency(struct rq *rq) -+{ -+ int cpu; -+ -+ if (!tick_nohz_full_enabled()) -+ return; -+ -+ cpu = cpu_of(rq); -+ -+ if (!tick_nohz_full_cpu(cpu)) -+ return; -+ -+ if (rq->nr_running < 2) -+ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); -+ else -+ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); -+} -+#else /* !CONFIG_NO_HZ_FULL */ -+static inline void sched_update_tick_dependency(struct rq *rq) { } -+#endif -+ -+/* -+ * Add/Remove/Requeue task to/from the runqueue routines -+ * Context: rq->lock -+ */ -+static inline void dequeue_task(struct task_struct *p, struct rq *rq, int flags) -+{ -+ lockdep_assert_held(&rq->lock); -+ -+ WARN_ONCE(task_rq(p) != rq, "bmq: dequeue task reside on cpu%d from cpu%d\n", -+ task_cpu(p), cpu_of(rq)); -+ -+ list_del(&p->bmq_node); -+ if (list_empty(&rq->queue.heads[p->bmq_idx])) { -+ clear_bit(p->bmq_idx, rq->queue.bitmap); -+ update_sched_rq_watermark(rq); -+ } -+ --rq->nr_running; -+#ifdef CONFIG_SMP -+ if (1 == rq->nr_running) -+ cpumask_clear_cpu(cpu_of(rq), &sched_rq_pending_mask); -+#endif -+ -+ sched_update_tick_dependency(rq); -+ psi_dequeue(p, flags & DEQUEUE_SLEEP); -+ -+ sched_info_dequeued(rq, p); -+} -+ -+static inline void enqueue_task(struct task_struct *p, struct rq *rq, int flags) -+{ -+ lockdep_assert_held(&rq->lock); -+ -+ WARN_ONCE(task_rq(p) != rq, "bmq: enqueue task reside on cpu%d to cpu%d\n", -+ task_cpu(p), cpu_of(rq)); -+ -+ p->bmq_idx = task_sched_prio(p); -+ bmq_add_task(p, &rq->queue, p->bmq_idx); -+ set_bit(p->bmq_idx, rq->queue.bitmap); -+ update_sched_rq_watermark(rq); -+ ++rq->nr_running; -+#ifdef CONFIG_SMP -+ if (2 == rq->nr_running) -+ cpumask_set_cpu(cpu_of(rq), &sched_rq_pending_mask); -+#endif -+ -+ sched_update_tick_dependency(rq); -+ -+ sched_info_queued(rq, p); -+ psi_enqueue(p, flags); -+ -+ /* -+ * If in_iowait is set, the code below may not trigger any cpufreq -+ * utilization updates, so do it here explicitly with the IOWAIT flag -+ * 
passed. -+ */ -+ if (p->in_iowait) -+ cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); -+} -+ -+static inline void requeue_task(struct task_struct *p, struct rq *rq) -+{ -+ int idx = task_sched_prio(p); -+ -+ lockdep_assert_held(&rq->lock); -+ WARN_ONCE(task_rq(p) != rq, "bmq: cpu[%d] requeue task reside on cpu%d\n", -+ cpu_of(rq), task_cpu(p)); -+ -+ list_del(&p->bmq_node); -+ bmq_add_task(p, &rq->queue, idx); -+ if (idx != p->bmq_idx) { -+ if (list_empty(&rq->queue.heads[p->bmq_idx])) -+ clear_bit(p->bmq_idx, rq->queue.bitmap); -+ p->bmq_idx = idx; -+ set_bit(p->bmq_idx, rq->queue.bitmap); -+ update_sched_rq_watermark(rq); -+ } -+} -+ -+/* -+ * resched_curr - mark rq's current task 'to be rescheduled now'. -+ * -+ * On UP this means the setting of the need_resched flag, on SMP it -+ * might also involve a cross-CPU call to trigger the scheduler on -+ * the target CPU. -+ */ -+void resched_curr(struct rq *rq) -+{ -+ struct task_struct *curr = rq->curr; -+ int cpu; -+ -+ lockdep_assert_held(&rq->lock); -+ -+ if (test_tsk_need_resched(curr)) -+ return; -+ -+ cpu = cpu_of(rq); -+ if (cpu == smp_processor_id()) { -+ set_tsk_need_resched(curr); -+ set_preempt_need_resched(); -+ return; -+ } -+ -+ if (set_nr_and_not_polling(curr)) -+ smp_send_reschedule(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); -+} -+ -+void resched_cpu(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ if (cpu_online(cpu) || cpu == smp_processor_id()) -+ resched_curr(cpu_rq(cpu)); -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+} -+ -+static inline void check_preempt_curr(struct rq *rq) -+{ -+ if (rq_first_bmq_task(rq) != rq->curr) -+ resched_curr(rq); -+} -+ -+#ifdef CONFIG_SCHED_HRTICK -+/* -+ * Use HR-timers to deliver accurate preemption points. 
-+ */ -+ -+static void hrtick_clear(struct rq *rq) -+{ -+ if (hrtimer_active(&rq->hrtick_timer)) -+ hrtimer_cancel(&rq->hrtick_timer); -+} -+ -+/* -+ * High-resolution timer tick. -+ * Runs from hardirq context with interrupts disabled. -+ */ -+static enum hrtimer_restart hrtick(struct hrtimer *timer) -+{ -+ struct rq *rq = container_of(timer, struct rq, hrtick_timer); -+ struct task_struct *p; -+ -+ WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); -+ -+ raw_spin_lock(&rq->lock); -+ p = rq->curr; -+ p->time_slice = 0; -+ resched_curr(rq); -+ raw_spin_unlock(&rq->lock); -+ -+ return HRTIMER_NORESTART; -+} -+ -+/* -+ * Use hrtick when: -+ * - enabled by features -+ * - hrtimer is actually high res -+ */ -+static inline int hrtick_enabled(struct rq *rq) -+{ -+ /** -+ * BMQ doesn't support sched_feat yet -+ if (!sched_feat(HRTICK)) -+ return 0; -+ */ -+ if (!cpu_active(cpu_of(rq))) -+ return 0; -+ return hrtimer_is_hres_active(&rq->hrtick_timer); -+} -+ -+#ifdef CONFIG_SMP -+ -+static void __hrtick_restart(struct rq *rq) -+{ -+ struct hrtimer *timer = &rq->hrtick_timer; -+ -+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); -+} -+ -+/* -+ * called from hardirq (IPI) context -+ */ -+static void __hrtick_start(void *arg) -+{ -+ struct rq *rq = arg; -+ -+ raw_spin_lock(&rq->lock); -+ __hrtick_restart(rq); -+ rq->hrtick_csd_pending = 0; -+ raw_spin_unlock(&rq->lock); -+} -+ -+/* -+ * Called to set the hrtick timer state. -+ * -+ * called with rq->lock held and irqs disabled -+ */ -+void hrtick_start(struct rq *rq, u64 delay) -+{ -+ struct hrtimer *timer = &rq->hrtick_timer; -+ ktime_t time; -+ s64 delta; -+ -+ /* -+ * Don't schedule slices shorter than 10000ns, that just -+ * doesn't make sense and can cause timer DoS. 
-+ */ -+ delta = max_t(s64, delay, 10000LL); -+ time = ktime_add_ns(timer->base->get_time(), delta); -+ -+ hrtimer_set_expires(timer, time); -+ -+ if (rq == this_rq()) { -+ __hrtick_restart(rq); -+ } else if (!rq->hrtick_csd_pending) { -+ smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); -+ rq->hrtick_csd_pending = 1; -+ } -+} -+ -+#else -+/* -+ * Called to set the hrtick timer state. -+ * -+ * called with rq->lock held and irqs disabled -+ */ -+void hrtick_start(struct rq *rq, u64 delay) -+{ -+ /* -+ * Don't schedule slices shorter than 10000ns, that just -+ * doesn't make sense. Rely on vruntime for fairness. -+ */ -+ delay = max_t(u64, delay, 10000LL); -+ hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), -+ HRTIMER_MODE_REL_PINNED_HARD); -+} -+#endif /* CONFIG_SMP */ -+ -+static void hrtick_rq_init(struct rq *rq) -+{ -+#ifdef CONFIG_SMP -+ rq->hrtick_csd_pending = 0; -+ -+ rq->hrtick_csd.flags = 0; -+ rq->hrtick_csd.func = __hrtick_start; -+ rq->hrtick_csd.info = rq; -+#endif -+ -+ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); -+ rq->hrtick_timer.function = hrtick; -+} -+#else /* CONFIG_SCHED_HRTICK */ -+static inline int hrtick_enabled(struct rq *rq) -+{ -+ return 0; -+} -+ -+static inline void hrtick_clear(struct rq *rq) -+{ -+} -+ -+static inline void hrtick_rq_init(struct rq *rq) -+{ -+} -+#endif /* CONFIG_SCHED_HRTICK */ -+ -+static inline int normal_prio(struct task_struct *p) -+{ -+ if (task_has_rt_policy(p)) -+ return MAX_RT_PRIO - 1 - p->rt_priority; -+ -+ return p->static_prio + MAX_PRIORITY_ADJ; -+} -+ -+/* -+ * Calculate the current priority, i.e. the priority -+ * taken into account by the scheduler. This value might -+ * be boosted by RT tasks as it will be RT if the task got -+ * RT-boosted. If not then it returns p->normal_prio. 
-+ */ -+static int effective_prio(struct task_struct *p) -+{ -+ p->normal_prio = normal_prio(p); -+ /* -+ * If we are RT tasks or we were boosted to RT priority, -+ * keep the priority unchanged. Otherwise, update priority -+ * to the normal priority: -+ */ -+ if (!rt_prio(p->prio)) -+ return p->normal_prio; -+ return p->prio; -+} -+ -+/* -+ * activate_task - move a task to the runqueue. -+ * -+ * Context: rq->lock -+ */ -+static void activate_task(struct task_struct *p, struct rq *rq) -+{ -+ if (task_contributes_to_load(p)) -+ rq->nr_uninterruptible--; -+ enqueue_task(p, rq, ENQUEUE_WAKEUP); -+ p->on_rq = TASK_ON_RQ_QUEUED; -+ cpufreq_update_util(rq, 0); -+} -+ -+/* -+ * deactivate_task - remove a task from the runqueue. -+ * -+ * Context: rq->lock -+ */ -+static inline void deactivate_task(struct task_struct *p, struct rq *rq) -+{ -+ if (task_contributes_to_load(p)) -+ rq->nr_uninterruptible++; -+ dequeue_task(p, rq, DEQUEUE_SLEEP); -+ p->on_rq = 0; -+ cpufreq_update_util(rq, 0); -+} -+ -+static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) -+{ -+#ifdef CONFIG_SMP -+ /* -+ * After ->cpu is set up to a new value, task_access_lock(p, ...) can be -+ * successfully executed on another CPU. We must ensure that updates of -+ * per-task data have been completed by this moment. -+ */ -+ smp_wmb(); -+ -+#ifdef CONFIG_THREAD_INFO_IN_TASK -+ WRITE_ONCE(p->cpu, cpu); -+#else -+ WRITE_ONCE(task_thread_info(p)->cpu, cpu); -+#endif -+#endif -+} -+ -+#ifdef CONFIG_SMP -+void set_task_cpu(struct task_struct *p, unsigned int new_cpu) -+{ -+#ifdef CONFIG_SCHED_DEBUG -+ /* -+ * We should never call set_task_cpu() on a blocked task, -+ * ttwu() will sort out the placement. -+ */ -+ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && -+ !p->on_rq); -+#ifdef CONFIG_LOCKDEP -+ /* -+ * The caller should hold either p->pi_lock or rq->lock, when changing -+ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. 
-+ * -+ * sched_move_task() holds both and thus holding either pins the cgroup, -+ * see task_group(). -+ */ -+ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || -+ lockdep_is_held(&task_rq(p)->lock))); -+#endif -+ /* -+ * Clearly, migrating tasks to offline CPUs is a fairly daft thing. -+ */ -+ WARN_ON_ONCE(!cpu_online(new_cpu)); -+#endif -+ if (task_cpu(p) == new_cpu) -+ return; -+ trace_sched_migrate_task(p, new_cpu); -+ rseq_migrate(p); -+ perf_event_task_migrate(p); -+ -+ __set_task_cpu(p, new_cpu); -+} -+ -+static inline bool is_per_cpu_kthread(struct task_struct *p) -+{ -+ return ((p->flags & PF_KTHREAD) && (1 == p->nr_cpus_allowed)); -+} -+ -+/* -+ * Per-CPU kthreads are allowed to run on !active && online CPUs, see -+ * __set_cpus_allowed_ptr() and select_fallback_rq(). -+ */ -+static inline bool is_cpu_allowed(struct task_struct *p, int cpu) -+{ -+ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) -+ return false; -+ -+ if (is_per_cpu_kthread(p)) -+ return cpu_online(cpu); -+ -+ return cpu_active(cpu); -+} -+ -+/* -+ * This is how migration works: -+ * -+ * 1) we invoke migration_cpu_stop() on the target CPU using -+ * stop_one_cpu(). -+ * 2) stopper starts to run (implicitly forcing the migrated thread -+ * off the CPU) -+ * 3) it checks whether the migrated task is still in the wrong runqueue. -+ * 4) if it's in the wrong runqueue then the migration thread removes -+ * it and puts it into the right queue. -+ * 5) stopper completes and stop_one_cpu() returns and the migration -+ * is done. -+ */ -+ -+/* -+ * move_queued_task - move a queued task to new rq. -+ * -+ * Returns (locked) new rq. Old rq's lock is released. 
-+ */ -+static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int -+ new_cpu) -+{ -+ lockdep_assert_held(&rq->lock); -+ -+ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); -+ dequeue_task(p, rq, 0); -+ set_task_cpu(p, new_cpu); -+ raw_spin_unlock(&rq->lock); -+ -+ rq = cpu_rq(new_cpu); -+ -+ raw_spin_lock(&rq->lock); -+ BUG_ON(task_cpu(p) != new_cpu); -+ enqueue_task(p, rq, 0); -+ p->on_rq = TASK_ON_RQ_QUEUED; -+ check_preempt_curr(rq); -+ -+ return rq; -+} -+ -+struct migration_arg { -+ struct task_struct *task; -+ int dest_cpu; -+}; -+ -+/* -+ * Move (not current) task off this CPU, onto the destination CPU. We're doing -+ * this because either it can't run here any more (set_cpus_allowed() -+ * away from this CPU, or CPU going down), or because we're -+ * attempting to rebalance this task on exec (sched_exec). -+ * -+ * So we race with normal scheduler movements, but that's OK, as long -+ * as the task is no longer on this CPU. -+ */ -+static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int -+ dest_cpu) -+{ -+ /* Affinity changed (again). */ -+ if (!is_cpu_allowed(p, dest_cpu)) -+ return rq; -+ -+ update_rq_clock(rq); -+ return move_queued_task(rq, p, dest_cpu); -+} -+ -+/* -+ * migration_cpu_stop - this will be executed by a highprio stopper thread -+ * and performs thread migration by bumping thread off CPU then -+ * 'pushing' onto another runqueue. -+ */ -+static int migration_cpu_stop(void *data) -+{ -+ struct migration_arg *arg = data; -+ struct task_struct *p = arg->task; -+ struct rq *rq = this_rq(); -+ -+ /* -+ * The original target CPU might have gone down and we might -+ * be on another CPU but it doesn't matter. -+ */ -+ local_irq_disable(); -+ -+ raw_spin_lock(&p->pi_lock); -+ raw_spin_lock(&rq->lock); -+ /* -+ * If task_rq(p) != rq, it cannot be migrated here, because we're -+ * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because -+ * we're holding p->pi_lock. 
-+ */ -+ if (task_rq(p) == rq && task_on_rq_queued(p)) -+ rq = __migrate_task(rq, p, arg->dest_cpu); -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock(&p->pi_lock); -+ -+ local_irq_enable(); -+ return 0; -+} -+ -+static inline void -+set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ cpumask_copy(&p->cpus_mask, new_mask); -+ p->nr_cpus_allowed = cpumask_weight(new_mask); -+} -+ -+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ set_cpus_allowed_common(p, new_mask); -+} -+#endif -+ -+/* Enter with rq lock held. We know p is on the local CPU */ -+static inline void __set_tsk_resched(struct task_struct *p) -+{ -+ set_tsk_need_resched(p); -+ set_preempt_need_resched(); -+} -+ -+/** -+ * task_curr - is this task currently executing on a CPU? -+ * @p: the task in question. -+ * -+ * Return: 1 if the task is currently executing. 0 otherwise. -+ */ -+inline int task_curr(const struct task_struct *p) -+{ -+ return cpu_curr(task_cpu(p)) == p; -+} -+ -+#ifdef CONFIG_SMP -+/* -+ * wait_task_inactive - wait for a thread to unschedule. -+ * -+ * If @match_state is nonzero, it's the @p->state value just checked and -+ * not expected to change. If it changes, i.e. @p might have woken up, -+ * then return zero. When we succeed in waiting for @p to be off its CPU, -+ * we return a positive number (its total switch count). If a second call -+ * a short while later returns the same number, the caller can be sure that -+ * @p has remained unscheduled the whole time. -+ * -+ * The caller must ensure that the task *will* unschedule sometime soon, -+ * else this function might spin for a *long* time. This function can't -+ * be called with interrupts off, or it may introduce deadlock with -+ * smp_call_function() if an IPI is sent by the same process we are -+ * waiting to become inactive. 
-+ */ -+unsigned long wait_task_inactive(struct task_struct *p, long match_state) -+{ -+ unsigned long flags; -+ bool running, on_rq; -+ unsigned long ncsw; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ -+ for (;;) { -+ rq = task_rq(p); -+ -+ /* -+ * If the task is actively running on another CPU -+ * still, just relax and busy-wait without holding -+ * any locks. -+ * -+ * NOTE! Since we don't hold any locks, it's not -+ * even sure that "rq" stays as the right runqueue! -+ * But we don't care, since this will return false -+ * if the runqueue has changed and p is actually now -+ * running somewhere else! -+ */ -+ while (task_running(p) && p == rq->curr) { -+ if (match_state && unlikely(p->state != match_state)) -+ return 0; -+ cpu_relax(); -+ } -+ -+ /* -+ * Ok, time to look more closely! We need the rq -+ * lock now, to be *sure*. If we're wrong, we'll -+ * just go back and repeat. -+ */ -+ task_access_lock_irqsave(p, &lock, &flags); -+ trace_sched_wait_task(p); -+ running = task_running(p); -+ on_rq = p->on_rq; -+ ncsw = 0; -+ if (!match_state || p->state == match_state) -+ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ -+ task_access_unlock_irqrestore(p, lock, &flags); -+ -+ /* -+ * If it changed from the expected state, bail out now. -+ */ -+ if (unlikely(!ncsw)) -+ break; -+ -+ /* -+ * Was it really running after all now that we -+ * checked with the proper locks actually held? -+ * -+ * Oops. Go back and try again.. -+ */ -+ if (unlikely(running)) { -+ cpu_relax(); -+ continue; -+ } -+ -+ /* -+ * It's not enough that it's not actively running, -+ * it must be off the runqueue _entirely_, and not -+ * preempted! -+ * -+ * So if it was still runnable (but just not actively -+ * running right now), it's preempted, and we should -+ * yield - it could be a while. -+ */ -+ if (unlikely(on_rq)) { -+ ktime_t to = NSEC_PER_SEC / HZ; -+ -+ set_current_state(TASK_UNINTERRUPTIBLE); -+ schedule_hrtimeout(&to, HRTIMER_MODE_REL); -+ continue; -+ } -+ -+ /* -+ * Ahh, all good. 
It wasn't running, and it wasn't -+ * runnable, which means that it will never become -+ * running in the future either. We're all done! -+ */ -+ break; -+ } -+ -+ return ncsw; -+} -+ -+/*** -+ * kick_process - kick a running thread to enter/exit the kernel -+ * @p: the to-be-kicked thread -+ * -+ * Cause a process which is running on another CPU to enter -+ * kernel-mode, without any delay. (to get signals handled.) -+ * -+ * NOTE: this function doesn't have to take the runqueue lock, -+ * because all it wants to ensure is that the remote task enters -+ * the kernel. If the IPI races and the task has been migrated -+ * to another CPU then no harm is done and the purpose has been -+ * achieved as well. -+ */ -+void kick_process(struct task_struct *p) -+{ -+ int cpu; -+ -+ preempt_disable(); -+ cpu = task_cpu(p); -+ if ((cpu != smp_processor_id()) && task_curr(p)) -+ smp_send_reschedule(cpu); -+ preempt_enable(); -+} -+EXPORT_SYMBOL_GPL(kick_process); -+ -+/* -+ * ->cpus_ptr is protected by both rq->lock and p->pi_lock -+ * -+ * A few notes on cpu_active vs cpu_online: -+ * -+ * - cpu_active must be a subset of cpu_online -+ * -+ * - on CPU-up we allow per-CPU kthreads on the online && !active CPU, -+ * see __set_cpus_allowed_ptr(). At this point the newly online -+ * CPU isn't yet part of the sched domains, and balancing will not -+ * see it. -+ * -+ * - on cpu-down we clear cpu_active() to mask the sched domains and -+ * avoid the load balancer to place new tasks on the to be removed -+ * CPU. Existing tasks will remain running there and will be taken -+ * off. -+ * -+ * This means that fallback selection must not select !active CPUs. -+ * And can assume that any active CPU must be online. Conversely -+ * select_task_rq() below may allow selection of !active CPUs in order -+ * to satisfy the above rules. 
-+ */ -+static int select_fallback_rq(int cpu, struct task_struct *p) -+{ -+ int nid = cpu_to_node(cpu); -+ const struct cpumask *nodemask = NULL; -+ enum { cpuset, possible, fail } state = cpuset; -+ int dest_cpu; -+ -+ /* -+ * If the node that the CPU is on has been offlined, cpu_to_node() -+ * will return -1. There is no CPU on the node, and we should -+ * select the CPU on the other node. -+ */ -+ if (nid != -1) { -+ nodemask = cpumask_of_node(nid); -+ -+ /* Look for allowed, online CPU in same node. */ -+ for_each_cpu(dest_cpu, nodemask) { -+ if (!cpu_active(dest_cpu)) -+ continue; -+ if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) -+ return dest_cpu; -+ } -+ } -+ -+ for (;;) { -+ /* Any allowed, online CPU? */ -+ for_each_cpu(dest_cpu, p->cpus_ptr) { -+ if (!is_cpu_allowed(p, dest_cpu)) -+ continue; -+ goto out; -+ } -+ -+ /* No more Mr. Nice Guy. */ -+ switch (state) { -+ case cpuset: -+ if (IS_ENABLED(CONFIG_CPUSETS)) { -+ cpuset_cpus_allowed_fallback(p); -+ state = possible; -+ break; -+ } -+ /* Fall-through */ -+ case possible: -+ do_set_cpus_allowed(p, cpu_possible_mask); -+ state = fail; -+ break; -+ -+ case fail: -+ BUG(); -+ break; -+ } -+ } -+ -+out: -+ if (state != cpuset) { -+ /* -+ * Don't tell them about moving exiting tasks or -+ * kernel threads (both mm NULL), since they never -+ * leave kernel. -+ */ -+ if (p->mm && printk_ratelimit()) { -+ printk_deferred("process %d (%s) no longer affine to cpu%d\n", -+ task_pid_nr(p), p->comm, cpu); -+ } -+ } -+ -+ return dest_cpu; -+} -+ -+static inline int __best_mask_cpu(int cpu, const cpumask_t *cpumask) -+{ -+ cpumask_t *mask = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); -+ while ((cpu = cpumask_any_and(cpumask, mask)) >= nr_cpu_ids) -+ mask++; -+ return cpu; -+} -+ -+static inline int best_mask_cpu(int cpu, const cpumask_t *cpumask) -+{ -+ return cpumask_test_cpu(cpu, cpumask)? 
cpu:__best_mask_cpu(cpu, cpumask); -+} -+ -+/* -+ * wake flags -+ */ -+#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ -+#define WF_FORK 0x02 /* child wakeup after fork */ -+#define WF_MIGRATED 0x04 /* internal use, task got migrated */ -+ -+static inline int select_task_rq(struct task_struct *p) -+{ -+ cpumask_t chk_mask, tmp; -+ -+ if (unlikely(!cpumask_and(&chk_mask, p->cpus_ptr, cpu_online_mask))) -+ return select_fallback_rq(task_cpu(p), p); -+ -+ if ( -+#ifdef CONFIG_SCHED_SMT -+ cpumask_and(&tmp, &chk_mask, &sched_sg_idle_mask) || -+#endif -+ cpumask_and(&tmp, &chk_mask, &sched_rq_watermark[IDLE_WM]) || -+ cpumask_and(&tmp, &chk_mask, -+ &sched_rq_watermark[task_sched_prio(p) + 1])) -+ return best_mask_cpu(task_cpu(p), &tmp); -+ -+ return best_mask_cpu(task_cpu(p), &chk_mask); -+} -+#else /* CONFIG_SMP */ -+static inline int select_task_rq(struct task_struct *p) -+{ -+ return 0; -+} -+#endif /* CONFIG_SMP */ -+ -+static void -+ttwu_stat(struct task_struct *p, int cpu, int wake_flags) -+{ -+ struct rq *rq; -+ -+ if (!schedstat_enabled()) -+ return; -+ -+ rq= this_rq(); -+ -+#ifdef CONFIG_SMP -+ if (cpu == rq->cpu) -+ __schedstat_inc(rq->ttwu_local); -+ else { -+ /** BMQ ToDo: -+ * How to do ttwu_wake_remote -+ */ -+ } -+#endif /* CONFIG_SMP */ -+ -+ __schedstat_inc(rq->ttwu_count); -+} -+ -+/* -+ * Mark the task runnable and perform wakeup-preemption. 
-+ */ -+static inline void -+ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) -+{ -+ p->state = TASK_RUNNING; -+ trace_sched_wakeup(p); -+} -+ -+static inline void -+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) -+{ -+#ifdef CONFIG_SMP -+ if (p->sched_contributes_to_load) -+ rq->nr_uninterruptible--; -+#endif -+ -+ activate_task(p, rq); -+ ttwu_do_wakeup(rq, p, 0); -+} -+ -+static inline void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ raw_spin_lock(&rq->lock); -+ update_rq_clock(rq); -+ ttwu_do_activate(rq, p, wake_flags); -+ check_preempt_curr(rq); -+ raw_spin_unlock(&rq->lock); -+} -+ -+static int ttwu_remote(struct task_struct *p, int wake_flags) -+{ -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ int ret = 0; -+ -+ rq = __task_access_lock(p, &lock); -+ if (task_on_rq_queued(p)) { -+ ttwu_do_wakeup(rq, p, wake_flags); -+ ret = 1; -+ } -+ __task_access_unlock(p, lock); -+ -+ return ret; -+} -+ -+/* -+ * Notes on Program-Order guarantees on SMP systems. -+ * -+ * MIGRATION -+ * -+ * The basic program-order guarantee on SMP systems is that when a task [t] -+ * migrates, all its activity on its old CPU [c0] happens-before any subsequent -+ * execution on its new CPU [c1]. -+ * -+ * For migration (of runnable tasks) this is provided by the following means: -+ * -+ * A) UNLOCK of the rq(c0)->lock scheduling out task t -+ * B) migration for t is required to synchronize *both* rq(c0)->lock and -+ * rq(c1)->lock (if not at the same time, then in that order). -+ * C) LOCK of the rq(c1)->lock scheduling in task -+ * -+ * Transitivity guarantees that B happens after A and C after B. -+ * Note: we only require RCpc transitivity. 
-+ * Note: the CPU doing B need not be c0 or c1 -+ * -+ * Example: -+ * -+ * CPU0 CPU1 CPU2 -+ * -+ * LOCK rq(0)->lock -+ * sched-out X -+ * sched-in Y -+ * UNLOCK rq(0)->lock -+ * -+ * LOCK rq(0)->lock // orders against CPU0 -+ * dequeue X -+ * UNLOCK rq(0)->lock -+ * -+ * LOCK rq(1)->lock -+ * enqueue X -+ * UNLOCK rq(1)->lock -+ * -+ * LOCK rq(1)->lock // orders against CPU2 -+ * sched-out Z -+ * sched-in X -+ * UNLOCK rq(1)->lock -+ * -+ * -+ * BLOCKING -- aka. SLEEP + WAKEUP -+ * -+ * For blocking we (obviously) need to provide the same guarantee as for -+ * migration. However the means are completely different as there is no lock -+ * chain to provide order. Instead we do: -+ * -+ * 1) smp_store_release(X->on_cpu, 0) -+ * 2) smp_cond_load_acquire(!X->on_cpu) -+ * -+ * Example: -+ * -+ * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule) -+ * -+ * LOCK rq(0)->lock LOCK X->pi_lock -+ * dequeue X -+ * sched-out X -+ * smp_store_release(X->on_cpu, 0); -+ * -+ * smp_cond_load_acquire(&X->on_cpu, !VAL); -+ * X->state = WAKING -+ * set_task_cpu(X,2) -+ * -+ * LOCK rq(2)->lock -+ * enqueue X -+ * X->state = RUNNING -+ * UNLOCK rq(2)->lock -+ * -+ * LOCK rq(2)->lock // orders against CPU1 -+ * sched-out Z -+ * sched-in X -+ * UNLOCK rq(2)->lock -+ * -+ * UNLOCK X->pi_lock -+ * UNLOCK rq(0)->lock -+ * -+ * -+ * However; for wakeups there is a second guarantee we must provide, namely we -+ * must observe the state that lead to our wakeup. That is, not only must our -+ * task observe its own prior state, it must also observe the stores prior to -+ * its wakeup. -+ * -+ * This means that any means of doing remote wakeups must order the CPU doing -+ * the wakeup against the CPU the task is going to end up running on. This, -+ * however, is already required for the regular Program-Order guarantee above, -+ * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire). 
-+ * -+ */ -+ -+/*** -+ * try_to_wake_up - wake up a thread -+ * @p: the thread to be awakened -+ * @state: the mask of task states that can be woken -+ * @wake_flags: wake modifier flags (WF_*) -+ * -+ * Put it on the run-queue if it's not already there. The "current" -+ * thread is always on the run-queue (except when the actual -+ * re-schedule is in progress), and as such you're allowed to do -+ * the simpler "current->state = TASK_RUNNING" to mark yourself -+ * runnable without the overhead of this. -+ * -+ * Return: %true if @p was woken up, %false if it was already running. -+ * or @state didn't match @p's state. -+ */ -+static int try_to_wake_up(struct task_struct *p, unsigned int state, -+ int wake_flags) -+{ -+ unsigned long flags; -+ int cpu, success = 0; -+ -+ preempt_disable(); -+ if (p == current) { -+ /* -+ * We're waking current, this means 'p->on_rq' and 'task_cpu(p) -+ * == smp_processor_id()'. Together this means we can special -+ * case the whole 'p->on_rq && ttwu_remote()' case below -+ * without taking any locks. -+ * -+ * In particular: -+ * - we rely on Program-Order guarantees for all the ordering, -+ * - we're serialized against set_special_state() by virtue of -+ * it disabling IRQs (this allows not taking ->pi_lock). -+ */ -+ if (!(p->state & state)) -+ goto out; -+ -+ success = 1; -+ cpu = task_cpu(p); -+ trace_sched_waking(p); -+ p->state = TASK_RUNNING; -+ trace_sched_wakeup(p); -+ goto out; -+ } -+ -+ /* -+ * If we are going to wake up a thread waiting for CONDITION we -+ * need to ensure that CONDITION=1 done by the caller can not be -+ * reordered with p->state check below. This pairs with mb() in -+ * set_current_state() the waiting thread does. 
-+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ smp_mb__after_spinlock(); -+ if (!(p->state & state)) -+ goto unlock; -+ -+ trace_sched_waking(p); -+ -+ /* We're going to change ->state: */ -+ success = 1; -+ cpu = task_cpu(p); -+ -+ /* -+ * Ensure we load p->on_rq _after_ p->state, otherwise it would -+ * be possible to, falsely, observe p->on_rq == 0 and get stuck -+ * in smp_cond_load_acquire() below. -+ * -+ * sched_ttwu_pending() try_to_wake_up() -+ * STORE p->on_rq = 1 LOAD p->state -+ * UNLOCK rq->lock -+ * -+ * __schedule() (switch to task 'p') -+ * LOCK rq->lock smp_rmb(); -+ * smp_mb__after_spinlock(); -+ * UNLOCK rq->lock -+ * -+ * [task p] -+ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq -+ * -+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in -+ * __schedule(). See the comment for smp_mb__after_spinlock(). -+ */ -+ smp_rmb(); -+ if (p->on_rq && ttwu_remote(p, wake_flags)) -+ goto unlock; -+ -+#ifdef CONFIG_SMP -+ /* -+ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be -+ * possible to, falsely, observe p->on_cpu == 0. -+ * -+ * One must be running (->on_cpu == 1) in order to remove oneself -+ * from the runqueue. -+ * -+ * __schedule() (switch to task 'p') try_to_wake_up() -+ * STORE p->on_cpu = 1 LOAD p->on_rq -+ * UNLOCK rq->lock -+ * -+ * __schedule() (put 'p' to sleep) -+ * LOCK rq->lock smp_rmb(); -+ * smp_mb__after_spinlock(); -+ * STORE p->on_rq = 0 LOAD p->on_cpu -+ * -+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in -+ * __schedule(). See the comment for smp_mb__after_spinlock(). -+ */ -+ smp_rmb(); -+ -+ /* -+ * If the owning (remote) CPU is still in the middle of schedule() with -+ * this task as prev, wait until its done referencing the task. -+ * -+ * Pairs with the smp_store_release() in finish_task(). -+ * -+ * This ensures that tasks getting woken will be fully ordered against -+ * their previous state and preserve Program Order. 
-+ */ -+ smp_cond_load_acquire(&p->on_cpu, !VAL); -+ -+ p->sched_contributes_to_load = !!task_contributes_to_load(p); -+ p->state = TASK_WAKING; -+ -+ if (p->in_iowait) { -+ delayacct_blkio_end(p); -+ atomic_dec(&task_rq(p)->nr_iowait); -+ } -+ -+ if(cpu_rq(smp_processor_id())->clock - p->last_ran > SCHED_TIMESLICE_NS) -+ boost_task(p); -+ -+ cpu = select_task_rq(p); -+ -+ if (cpu != task_cpu(p)) { -+ wake_flags |= WF_MIGRATED; -+ psi_ttwu_dequeue(p); -+ set_task_cpu(p, cpu); -+ } -+#else /* CONFIG_SMP */ -+ if (p->in_iowait) { -+ delayacct_blkio_end(p); -+ atomic_dec(&task_rq(p)->nr_iowait); -+ } -+#endif /* CONFIG_SMP */ -+ -+ ttwu_queue(p, cpu, wake_flags); -+unlock: -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+out: -+ if (success) -+ ttwu_stat(p, cpu, wake_flags); -+ preempt_enable(); -+ -+ return success; -+} -+ -+/** -+ * wake_up_process - Wake up a specific process -+ * @p: The process to be woken up. -+ * -+ * Attempt to wake up the nominated process and move it to the set of runnable -+ * processes. -+ * -+ * Return: 1 if the process was woken up, 0 if it was already running. -+ * -+ * This function executes a full memory barrier before accessing the task state. -+ */ -+int wake_up_process(struct task_struct *p) -+{ -+ return try_to_wake_up(p, TASK_NORMAL, 0); -+} -+EXPORT_SYMBOL(wake_up_process); -+ -+int wake_up_state(struct task_struct *p, unsigned int state) -+{ -+ return try_to_wake_up(p, state, 0); -+} -+ -+/* -+ * Perform scheduler related setup for a newly forked process p. -+ * p is forked by current. 
-+ */ -+int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) -+{ -+ unsigned long flags; -+ int cpu = get_cpu(); -+ struct rq *rq = this_rq(); -+ -+#ifdef CONFIG_PREEMPT_NOTIFIERS -+ INIT_HLIST_HEAD(&p->preempt_notifiers); -+#endif -+ /* Should be reset in fork.c but done here for ease of BMQ patching */ -+ p->on_cpu = -+ p->on_rq = -+ p->utime = -+ p->stime = -+ p->sched_time = 0; -+ -+#ifdef CONFIG_COMPACTION -+ p->capture_control = NULL; -+#endif -+ -+ /* -+ * We mark the process as NEW here. This guarantees that -+ * nobody will actually run it, and a signal or other external -+ * event cannot wake it up and insert it on the runqueue either. -+ */ -+ p->state = TASK_NEW; -+ -+ /* -+ * Make sure we do not leak PI boosting priority to the child. -+ */ -+ p->prio = current->normal_prio; -+ -+ /* -+ * Revert to default priority/policy on fork if requested. -+ */ -+ if (unlikely(p->sched_reset_on_fork)) { -+ if (task_has_rt_policy(p)) { -+ p->policy = SCHED_NORMAL; -+ p->static_prio = NICE_TO_PRIO(0); -+ p->rt_priority = 0; -+ } else if (PRIO_TO_NICE(p->static_prio) < 0) -+ p->static_prio = NICE_TO_PRIO(0); -+ -+ p->prio = p->normal_prio = normal_prio(p); -+ -+ /* -+ * We don't need the reset flag anymore after the fork. It has -+ * fulfilled its duty: -+ */ -+ p->sched_reset_on_fork = 0; -+ } -+ -+ p->boost_prio = (p->boost_prio < 0) ? -+ p->boost_prio + MAX_PRIORITY_ADJ : MAX_PRIORITY_ADJ; -+ /* -+ * Share the timeslice between parent and child, thus the -+ * total amount of pending timeslices in the system doesn't change, -+ * resulting in more scheduling fairness. 
-+ */ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ rq->curr->time_slice /= 2; -+ p->time_slice = rq->curr->time_slice; -+#ifdef CONFIG_SCHED_HRTICK -+ hrtick_start(rq, rq->curr->time_slice); -+#endif -+ -+ if (p->time_slice < RESCHED_NS) { -+ p->time_slice = SCHED_TIMESLICE_NS; -+ resched_curr(rq); -+ } -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+ /* -+ * The child is not yet in the pid-hash so no cgroup attach races, -+ * and the cgroup is pinned to this child due to cgroup_fork() -+ * is ran before sched_fork(). -+ * -+ * Silence PROVE_RCU. -+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ /* -+ * We're setting the CPU for the first time, we don't migrate, -+ * so use __set_task_cpu(). -+ */ -+ __set_task_cpu(p, cpu); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+#ifdef CONFIG_SCHED_INFO -+ if (unlikely(sched_info_on())) -+ memset(&p->sched_info, 0, sizeof(p->sched_info)); -+#endif -+ init_task_preempt_count(p); -+ -+ put_cpu(); -+ return 0; -+} -+ -+#ifdef CONFIG_SCHEDSTATS -+ -+DEFINE_STATIC_KEY_FALSE(sched_schedstats); -+static bool __initdata __sched_schedstats = false; -+ -+static void set_schedstats(bool enabled) -+{ -+ if (enabled) -+ static_branch_enable(&sched_schedstats); -+ else -+ static_branch_disable(&sched_schedstats); -+} -+ -+void force_schedstat_enabled(void) -+{ -+ if (!schedstat_enabled()) { -+ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); -+ static_branch_enable(&sched_schedstats); -+ } -+} -+ -+static int __init setup_schedstats(char *str) -+{ -+ int ret = 0; -+ if (!str) -+ goto out; -+ -+ /* -+ * This code is called before jump labels have been set up, so we can't -+ * change the static branch directly just yet. Instead set a temporary -+ * variable so init_schedstats() can do it later. 
-+ */ -+ if (!strcmp(str, "enable")) { -+ __sched_schedstats = true; -+ ret = 1; -+ } else if (!strcmp(str, "disable")) { -+ __sched_schedstats = false; -+ ret = 1; -+ } -+out: -+ if (!ret) -+ pr_warn("Unable to parse schedstats=\n"); -+ -+ return ret; -+} -+__setup("schedstats=", setup_schedstats); -+ -+static void __init init_schedstats(void) -+{ -+ set_schedstats(__sched_schedstats); -+} -+ -+#ifdef CONFIG_PROC_SYSCTL -+int sysctl_schedstats(struct ctl_table *table, int write, -+ void __user *buffer, size_t *lenp, loff_t *ppos) -+{ -+ struct ctl_table t; -+ int err; -+ int state = static_branch_likely(&sched_schedstats); -+ -+ if (write && !capable(CAP_SYS_ADMIN)) -+ return -EPERM; -+ -+ t = *table; -+ t.data = &state; -+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); -+ if (err < 0) -+ return err; -+ if (write) -+ set_schedstats(state); -+ return err; -+} -+#endif /* CONFIG_PROC_SYSCTL */ -+#else /* !CONFIG_SCHEDSTATS */ -+static inline void init_schedstats(void) {} -+#endif /* CONFIG_SCHEDSTATS */ -+ -+/* -+ * wake_up_new_task - wake up a newly created task for the first time. -+ * -+ * This function will do some initial scheduler statistics housekeeping -+ * that must be done for every newly created context, then puts the task -+ * on the runqueue and wakes it. -+ */ -+void wake_up_new_task(struct task_struct *p) -+{ -+ unsigned long flags; -+ struct rq *rq; -+ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ -+ p->state = TASK_RUNNING; -+ -+ rq = cpu_rq(select_task_rq(p)); -+#ifdef CONFIG_SMP -+ /* -+ * Fork balancing, do it here and not earlier because: -+ * - cpus_ptr can change in the fork path -+ * - any previously selected CPU might disappear through hotplug -+ * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, -+ * as we're not fully set-up yet. 
-+ */ -+ __set_task_cpu(p, cpu_of(rq)); -+#endif -+ -+ raw_spin_lock(&rq->lock); -+ -+ update_rq_clock(rq); -+ activate_task(p, rq); -+ trace_sched_wakeup_new(p); -+ check_preempt_curr(rq); -+ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+} -+ -+#ifdef CONFIG_PREEMPT_NOTIFIERS -+ -+static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); -+ -+void preempt_notifier_inc(void) -+{ -+ static_branch_inc(&preempt_notifier_key); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_inc); -+ -+void preempt_notifier_dec(void) -+{ -+ static_branch_dec(&preempt_notifier_key); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_dec); -+ -+/** -+ * preempt_notifier_register - tell me when current is being preempted & rescheduled -+ * @notifier: notifier struct to register -+ */ -+void preempt_notifier_register(struct preempt_notifier *notifier) -+{ -+ if (!static_branch_unlikely(&preempt_notifier_key)) -+ WARN(1, "registering preempt_notifier while notifiers disabled\n"); -+ -+ hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_register); -+ -+/** -+ * preempt_notifier_unregister - no longer interested in preemption notifications -+ * @notifier: notifier struct to unregister -+ * -+ * This is *not* safe to call from within a preemption notifier. 
-+ */ -+void preempt_notifier_unregister(struct preempt_notifier *notifier) -+{ -+ hlist_del(¬ifier->link); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_unregister); -+ -+static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+ struct preempt_notifier *notifier; -+ -+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) -+ notifier->ops->sched_in(notifier, raw_smp_processor_id()); -+} -+ -+static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+ if (static_branch_unlikely(&preempt_notifier_key)) -+ __fire_sched_in_preempt_notifiers(curr); -+} -+ -+static void -+__fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+ struct preempt_notifier *notifier; -+ -+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) -+ notifier->ops->sched_out(notifier, next); -+} -+ -+static __always_inline void -+fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+ if (static_branch_unlikely(&preempt_notifier_key)) -+ __fire_sched_out_preempt_notifiers(curr, next); -+} -+ -+#else /* !CONFIG_PREEMPT_NOTIFIERS */ -+ -+static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+} -+ -+static inline void -+fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+} -+ -+#endif /* CONFIG_PREEMPT_NOTIFIERS */ -+ -+static inline void prepare_task(struct task_struct *next) -+{ -+ /* -+ * Claim the task as running, we do this before switching to it -+ * such that any running task will have this set. -+ */ -+ next->on_cpu = 1; -+} -+ -+static inline void finish_task(struct task_struct *prev) -+{ -+#ifdef CONFIG_SMP -+ /* -+ * After ->on_cpu is cleared, the task can be moved to a different CPU. -+ * We must ensure this doesn't happen until the switch is completely -+ * finished. -+ * -+ * In particular, the load of prev->state in finish_task_switch() must -+ * happen before this. 
-+ * -+ * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). -+ */ -+ smp_store_release(&prev->on_cpu, 0); -+#else -+ prev->on_cpu = 0; -+#endif -+} -+ -+static inline void -+prepare_lock_switch(struct rq *rq, struct task_struct *next) -+{ -+ /* -+ * Since the runqueue lock will be released by the next -+ * task (which is an invalid locking op but in the case -+ * of the scheduler it's an obvious special-case), so we -+ * do an early lockdep release here: -+ */ -+ spin_release(&rq->lock.dep_map, 1, _THIS_IP_); -+#ifdef CONFIG_DEBUG_SPINLOCK -+ /* this is a valid case when another task releases the spinlock */ -+ rq->lock.owner = next; -+#endif -+} -+ -+static inline void finish_lock_switch(struct rq *rq) -+{ -+ /* -+ * If we are tracking spinlock dependencies then we have to -+ * fix up the runqueue lock - which gets 'carried over' from -+ * prev into current: -+ */ -+ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); -+ raw_spin_unlock_irq(&rq->lock); -+} -+ -+/** -+ * prepare_task_switch - prepare to switch tasks -+ * @rq: the runqueue preparing to switch -+ * @next: the task we are going to switch to. -+ * -+ * This is called with the rq lock held and interrupts off. It must -+ * be paired with a subsequent finish_task_switch after the context -+ * switch. -+ * -+ * prepare_task_switch sets up locking and calls architecture specific -+ * hooks. -+ */ -+static inline void -+prepare_task_switch(struct rq *rq, struct task_struct *prev, -+ struct task_struct *next) -+{ -+ kcov_prepare_switch(prev); -+ sched_info_switch(rq, prev, next); -+ perf_event_task_sched_out(prev, next); -+ rseq_preempt(prev); -+ fire_sched_out_preempt_notifiers(prev, next); -+ prepare_task(next); -+ prepare_arch_switch(next); -+} -+ -+/** -+ * finish_task_switch - clean up after a task-switch -+ * @rq: runqueue associated with task-switch -+ * @prev: the thread we just switched away from. 
-+ * -+ * finish_task_switch must be called after the context switch, paired -+ * with a prepare_task_switch call before the context switch. -+ * finish_task_switch will reconcile locking set up by prepare_task_switch, -+ * and do any other architecture-specific cleanup actions. -+ * -+ * Note that we may have delayed dropping an mm in context_switch(). If -+ * so, we finish that here outside of the runqueue lock. (Doing it -+ * with the lock held can cause deadlocks; see schedule() for -+ * details.) -+ * -+ * The context switch have flipped the stack from under us and restored the -+ * local variables which were saved when this task called schedule() in the -+ * past. prev == current is still correct but we need to recalculate this_rq -+ * because prev may have moved to another CPU. -+ */ -+static struct rq *finish_task_switch(struct task_struct *prev) -+ __releases(rq->lock) -+{ -+ struct rq *rq = this_rq(); -+ struct mm_struct *mm = rq->prev_mm; -+ long prev_state; -+ -+ /* -+ * The previous task will have left us with a preempt_count of 2 -+ * because it left us after: -+ * -+ * schedule() -+ * preempt_disable(); // 1 -+ * __schedule() -+ * raw_spin_lock_irq(&rq->lock) // 2 -+ * -+ * Also, see FORK_PREEMPT_COUNT. -+ */ -+ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, -+ "corrupted preempt_count: %s/%d/0x%x\n", -+ current->comm, current->pid, preempt_count())) -+ preempt_count_set(FORK_PREEMPT_COUNT); -+ -+ rq->prev_mm = NULL; -+ -+ /* -+ * A task struct has one reference for the use as "current". -+ * If a task dies, then it sets TASK_DEAD in tsk->state and calls -+ * schedule one last time. The schedule call will never return, and -+ * the scheduled task must drop that reference. -+ * -+ * We must observe prev->state before clearing prev->on_cpu (in -+ * finish_task), otherwise a concurrent wakeup can get prev -+ * running on another CPU and we could rave with its RUNNING -> DEAD -+ * transition, resulting in a double drop. 
-+ */ -+ prev_state = prev->state; -+ vtime_task_switch(prev); -+ perf_event_task_sched_in(prev, current); -+ finish_task(prev); -+ finish_lock_switch(rq); -+ finish_arch_post_lock_switch(); -+ kcov_finish_switch(current); -+ -+ fire_sched_in_preempt_notifiers(current); -+ /* -+ * When switching through a kernel thread, the loop in -+ * membarrier_{private,global}_expedited() may have observed that -+ * kernel thread and not issued an IPI. It is therefore possible to -+ * schedule between user->kernel->user threads without passing though -+ * switch_mm(). Membarrier requires a barrier after storing to -+ * rq->curr, before returning to userspace, so provide them here: -+ * -+ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly -+ * provided by mmdrop(), -+ * - a sync_core for SYNC_CORE. -+ */ -+ if (mm) { -+ membarrier_mm_sync_core_before_usermode(mm); -+ mmdrop(mm); -+ } -+ if (unlikely(prev_state == TASK_DEAD)) { -+ /* -+ * Remove function-return probe instances associated with this -+ * task and put them back on the free list. -+ */ -+ kprobe_flush_task(prev); -+ -+ /* Task is done with its stack. */ -+ put_task_stack(prev); -+ -+ put_task_struct_rcu_user(prev); -+ } -+ -+ tick_nohz_task_switch(); -+ return rq; -+} -+ -+/** -+ * schedule_tail - first thing a freshly forked thread must call. -+ * @prev: the thread we just switched away from. -+ */ -+asmlinkage __visible void schedule_tail(struct task_struct *prev) -+ __releases(rq->lock) -+{ -+ struct rq *rq; -+ -+ /* -+ * New tasks start with FORK_PREEMPT_COUNT, see there and -+ * finish_task_switch() for details. -+ * -+ * finish_task_switch() will drop rq->lock() and lower preempt_count -+ * and the preempt_enable() will end up enabling preemption (on -+ * PREEMPT_COUNT kernels). 
-+ */ -+ -+ rq = finish_task_switch(prev); -+ preempt_enable(); -+ -+ if (current->set_child_tid) -+ put_user(task_pid_vnr(current), current->set_child_tid); -+ -+ calculate_sigpending(); -+} -+ -+/* -+ * context_switch - switch to the new MM and the new thread's register state. -+ */ -+static __always_inline struct rq * -+context_switch(struct rq *rq, struct task_struct *prev, -+ struct task_struct *next) -+{ -+ prepare_task_switch(rq, prev, next); -+ -+ /* -+ * For paravirt, this is coupled with an exit in switch_to to -+ * combine the page table reload and the switch backend into -+ * one hypercall. -+ */ -+ arch_start_context_switch(prev); -+ -+ /* -+ * kernel -> kernel lazy + transfer active -+ * user -> kernel lazy + mmgrab() active -+ * -+ * kernel -> user switch + mmdrop() active -+ * user -> user switch -+ */ -+ if (!next->mm) { // to kernel -+ enter_lazy_tlb(prev->active_mm, next); -+ -+ next->active_mm = prev->active_mm; -+ if (prev->mm) // from user -+ mmgrab(prev->active_mm); -+ else -+ prev->active_mm = NULL; -+ } else { // to user -+ membarrier_switch_mm(rq, prev->active_mm, next->mm); -+ /* -+ * sys_membarrier() requires an smp_mb() between setting -+ * rq->curr / membarrier_switch_mm() and returning to userspace. -+ * -+ * The below provides this either through switch_mm(), or in -+ * case 'prev->active_mm == next->mm' through -+ * finish_task_switch()'s mmdrop(). -+ */ -+ switch_mm_irqs_off(prev->active_mm, next->mm, next); -+ -+ if (!prev->mm) { // from kernel -+ /* will mmdrop() in finish_task_switch(). */ -+ rq->prev_mm = prev->active_mm; -+ prev->active_mm = NULL; -+ } -+ } -+ -+ prepare_lock_switch(rq, next); -+ -+ /* Here we just switch the register state and the stack. 
*/ -+ switch_to(prev, next, prev); -+ barrier(); -+ -+ return finish_task_switch(prev); -+} -+ -+/* -+ * nr_running, nr_uninterruptible and nr_context_switches: -+ * -+ * externally visible scheduler statistics: current number of runnable -+ * threads, total number of context switches performed since bootup. -+ */ -+unsigned long nr_running(void) -+{ -+ unsigned long i, sum = 0; -+ -+ for_each_online_cpu(i) -+ sum += cpu_rq(i)->nr_running; -+ -+ return sum; -+} -+ -+/* -+ * Check if only the current task is running on the CPU. -+ * -+ * Caution: this function does not check that the caller has disabled -+ * preemption, thus the result might have a time-of-check-to-time-of-use -+ * race. The caller is responsible to use it correctly, for example: -+ * -+ * - from a non-preemptible section (of course) -+ * -+ * - from a thread that is bound to a single CPU -+ * -+ * - in a loop with very short iterations (e.g. a polling loop) -+ */ -+bool single_task_running(void) -+{ -+ return raw_rq()->nr_running == 1; -+} -+EXPORT_SYMBOL(single_task_running); -+ -+unsigned long long nr_context_switches(void) -+{ -+ int i; -+ unsigned long long sum = 0; -+ -+ for_each_possible_cpu(i) -+ sum += cpu_rq(i)->nr_switches; -+ -+ return sum; -+} -+ -+/* -+ * Consumers of these two interfaces, like for example the cpuidle menu -+ * governor, are using nonsensical data. Preferring shallow idle state selection -+ * for a CPU that has IO-wait which might not even end up running the task when -+ * it does become runnable. -+ */ -+ -+unsigned long nr_iowait_cpu(int cpu) -+{ -+ return atomic_read(&cpu_rq(cpu)->nr_iowait); -+} -+ -+/* -+ * IO-wait accounting, and how its mostly bollocks (on SMP). -+ * -+ * The idea behind IO-wait account is to account the idle time that we could -+ * have spend running if it were not for IO. That is, if we were to improve the -+ * storage performance, we'd have a proportional reduction in IO-wait time. 
-+ * -+ * This all works nicely on UP, where, when a task blocks on IO, we account -+ * idle time as IO-wait, because if the storage were faster, it could've been -+ * running and we'd not be idle. -+ * -+ * This has been extended to SMP, by doing the same for each CPU. This however -+ * is broken. -+ * -+ * Imagine for instance the case where two tasks block on one CPU, only the one -+ * CPU will have IO-wait accounted, while the other has regular idle. Even -+ * though, if the storage were faster, both could've ran at the same time, -+ * utilising both CPUs. -+ * -+ * This means, that when looking globally, the current IO-wait accounting on -+ * SMP is a lower bound, by reason of under accounting. -+ * -+ * Worse, since the numbers are provided per CPU, they are sometimes -+ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly -+ * associated with any one particular CPU, it can wake to another CPU than it -+ * blocked on. This means the per CPU IO-wait number is meaningless. -+ * -+ * Task CPU affinities can make all that even more 'interesting'. -+ */ -+ -+unsigned long nr_iowait(void) -+{ -+ unsigned long i, sum = 0; -+ -+ for_each_possible_cpu(i) -+ sum += nr_iowait_cpu(i); -+ -+ return sum; -+} -+ -+DEFINE_PER_CPU(struct kernel_stat, kstat); -+DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); -+ -+EXPORT_PER_CPU_SYMBOL(kstat); -+EXPORT_PER_CPU_SYMBOL(kernel_cpustat); -+ -+static inline void update_curr(struct rq *rq, struct task_struct *p) -+{ -+ s64 ns = rq->clock_task - p->last_ran; -+ -+ p->sched_time += ns; -+ account_group_exec_runtime(p, ns); -+ -+ p->time_slice -= ns; -+ p->last_ran = rq->clock_task; -+} -+ -+/* -+ * Return accounted runtime for the task. -+ * Return separately the current's pending runtime that have not been -+ * accounted yet. 
-+ */ -+unsigned long long task_sched_runtime(struct task_struct *p) -+{ -+ unsigned long flags; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ u64 ns; -+ -+#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) -+ /* -+ * 64-bit doesn't need locks to atomically read a 64-bit value. -+ * So we have a optimization chance when the task's delta_exec is 0. -+ * Reading ->on_cpu is racy, but this is ok. -+ * -+ * If we race with it leaving CPU, we'll take a lock. So we're correct. -+ * If we race with it entering CPU, unaccounted time is 0. This is -+ * indistinguishable from the read occurring a few cycles earlier. -+ * If we see ->on_cpu without ->on_rq, the task is leaving, and has -+ * been accounted, so we're correct here as well. -+ */ -+ if (!p->on_cpu || !task_on_rq_queued(p)) -+ return tsk_seruntime(p); -+#endif -+ -+ rq = task_access_lock_irqsave(p, &lock, &flags); -+ /* -+ * Must be ->curr _and_ ->on_rq. If dequeued, we would -+ * project cycles that may never be accounted to this -+ * thread, breaking clock_gettime(). -+ */ -+ if (p == rq->curr && task_on_rq_queued(p)) { -+ update_rq_clock(rq); -+ update_curr(rq, p); -+ } -+ ns = tsk_seruntime(p); -+ task_access_unlock_irqrestore(p, lock, &flags); -+ -+ return ns; -+} -+ -+/* This manages tasks that have run out of timeslice during a scheduler_tick */ -+static inline void scheduler_task_tick(struct rq *rq) -+{ -+ struct task_struct *p = rq->curr; -+ -+ if (is_idle_task(p)) -+ return; -+ -+ update_curr(rq, p); -+ cpufreq_update_util(rq, 0); -+ -+ /* -+ * Tasks have less than RESCHED_NS of time slice left they will be -+ * rescheduled. 
-+ */ -+ if (p->time_slice >= RESCHED_NS) -+ return; -+ __set_tsk_resched(p); -+} -+ -+#ifdef CONFIG_SCHED_SMT -+static inline int active_load_balance_cpu_stop(void *data) -+{ -+ struct rq *rq = this_rq(); -+ struct task_struct *p = data; -+ cpumask_t tmp; -+ unsigned long flags; -+ -+ local_irq_save(flags); -+ -+ raw_spin_lock(&p->pi_lock); -+ raw_spin_lock(&rq->lock); -+ -+ rq->active_balance = 0; -+ /* _something_ may have changed the task, double check again */ -+ if (task_on_rq_queued(p) && task_rq(p) == rq && -+ cpumask_and(&tmp, p->cpus_ptr, &sched_sg_idle_mask)) -+ rq = move_queued_task(rq, p, __best_mask_cpu(cpu_of(rq), &tmp)); -+ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock(&p->pi_lock); -+ -+ local_irq_restore(flags); -+ -+ return 0; -+} -+ -+/* sg_balance_trigger - trigger slibing group balance for @cpu */ -+static inline int sg_balance_trigger(const int cpu, struct rq *rq) -+{ -+ unsigned long flags; -+ struct task_struct *curr; -+ int res; -+ -+ if (!raw_spin_trylock_irqsave(&rq->lock, flags)) -+ return 0; -+ curr = rq->curr; -+ res = (!is_idle_task(curr)) && (1 == rq->nr_running) &&\ -+ cpumask_intersects(curr->cpus_ptr, &sched_sg_idle_mask) &&\ -+ (!rq->active_balance); -+ -+ if (res) -+ rq->active_balance = 1; -+ -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+ if (res) -+ stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, -+ curr, &rq->active_balance_work); -+ return res; -+} -+ -+/* -+ * sg_balance_check - slibing group balance check for run queue @rq -+ */ -+static inline void sg_balance_check(struct rq *rq) -+{ -+ cpumask_t chk; -+ int cpu; -+ -+ /* exit when no sg in idle */ -+ if (cpumask_empty(&sched_sg_idle_mask)) -+ return; -+ -+ cpu = cpu_of(rq); -+ /* Only cpu in slibing idle group will do the checking */ -+ if (cpumask_test_cpu(cpu, &sched_sg_idle_mask)) { -+ /* Find potential cpus which can migrate the currently running task */ -+ if (cpumask_andnot(&chk, cpu_online_mask, &sched_rq_pending_mask) && -+ 
cpumask_andnot(&chk, &chk, &sched_rq_watermark[IDLE_WM])) { -+ int i, tried = 0; -+ -+ for_each_cpu_wrap(i, &chk, cpu) { -+ /* skip the cpu which has idle slibing cpu */ -+ if (cpumask_intersects(cpu_smt_mask(i), -+ &sched_rq_watermark[IDLE_WM])) -+ continue; -+ if (cpumask_intersects(cpu_smt_mask(i), -+ &sched_rq_pending_mask)) -+ continue; -+ if (sg_balance_trigger(i, cpu_rq(i))) -+ return; -+ if (tried) -+ return; -+ tried++; -+ } -+ } -+ return; -+ } -+ -+ if (1 != rq->nr_running) -+ return; -+ -+ if (cpumask_andnot(&chk, cpu_smt_mask(cpu), &sched_rq_pending_mask) && -+ cpumask_andnot(&chk, &chk, &sched_rq_watermark[IDLE_WM]) && -+ cpumask_equal(&chk, cpu_smt_mask(cpu))) -+ sg_balance_trigger(cpu, rq); -+} -+#endif /* CONFIG_SCHED_SMT */ -+ -+/* -+ * This function gets called by the timer code, with HZ frequency. -+ * We call it with interrupts disabled. -+ */ -+void scheduler_tick(void) -+{ -+ int cpu __maybe_unused = smp_processor_id(); -+ struct rq *rq = cpu_rq(cpu); -+ -+ sched_clock_tick(); -+ -+ raw_spin_lock(&rq->lock); -+ update_rq_clock(rq); -+ -+ scheduler_task_tick(rq); -+ calc_global_load_tick(rq); -+ psi_task_tick(rq); -+ -+ rq->last_tick = rq->clock; -+ raw_spin_unlock(&rq->lock); -+ -+ perf_event_task_tick(); -+} -+ -+#ifdef CONFIG_NO_HZ_FULL -+struct tick_work { -+ int cpu; -+ atomic_t state; -+ struct delayed_work work; -+}; -+/* Values for ->state, see diagram below. */ -+#define TICK_SCHED_REMOTE_OFFLINE 0 -+#define TICK_SCHED_REMOTE_OFFLINING 1 -+#define TICK_SCHED_REMOTE_RUNNING 2 -+ -+/* -+ * State diagram for ->state: -+ * -+ * -+ * TICK_SCHED_REMOTE_OFFLINE -+ * | ^ -+ * | | -+ * | | sched_tick_remote() -+ * | | -+ * | | -+ * +--TICK_SCHED_REMOTE_OFFLINING -+ * | ^ -+ * | | -+ * sched_tick_start() | | sched_tick_stop() -+ * | | -+ * V | -+ * TICK_SCHED_REMOTE_RUNNING -+ * -+ * -+ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() -+ * and sched_tick_start() are happy to leave the state in RUNNING. 
-+ */ -+ -+static struct tick_work __percpu *tick_work_cpu; -+ -+static void sched_tick_remote(struct work_struct *work) -+{ -+ struct delayed_work *dwork = to_delayed_work(work); -+ struct tick_work *twork = container_of(dwork, struct tick_work, work); -+ int cpu = twork->cpu; -+ struct rq *rq = cpu_rq(cpu); -+ struct task_struct *curr; -+ unsigned long flags; -+ u64 delta; -+ int os; -+ -+ /* -+ * Handle the tick only if it appears the remote CPU is running in full -+ * dynticks mode. The check is racy by nature, but missing a tick or -+ * having one too much is no big deal because the scheduler tick updates -+ * statistics and checks timeslices in a time-independent way, regardless -+ * of when exactly it is running. -+ */ -+ if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu)) -+ goto out_requeue; -+ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ curr = rq->curr; -+ -+ if (is_idle_task(curr) || cpu_is_offline(cpu)) -+ goto out_unlock; -+ -+ update_rq_clock(rq); -+ delta = rq_clock_task(rq) - curr->last_ran; -+ -+ /* -+ * Make sure the next tick runs within a reasonable -+ * amount of time. -+ */ -+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); -+ scheduler_task_tick(rq); -+ -+out_unlock: -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+out_requeue: -+ /* -+ * Run the remote tick once per second (1Hz). This arbitrary -+ * frequency is large enough to avoid overload but short enough -+ * to keep scheduler internal stats reasonably up to date. But -+ * first update state to reflect hotplug activity if required. 
-+ */ -+ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); -+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); -+ if (os == TICK_SCHED_REMOTE_RUNNING) -+ queue_delayed_work(system_unbound_wq, dwork, HZ); -+} -+ -+static void sched_tick_start(int cpu) -+{ -+ int os; -+ struct tick_work *twork; -+ -+ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) -+ return; -+ -+ WARN_ON_ONCE(!tick_work_cpu); -+ -+ twork = per_cpu_ptr(tick_work_cpu, cpu); -+ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); -+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); -+ if (os == TICK_SCHED_REMOTE_OFFLINE) { -+ twork->cpu = cpu; -+ INIT_DELAYED_WORK(&twork->work, sched_tick_remote); -+ queue_delayed_work(system_unbound_wq, &twork->work, HZ); -+ } -+} -+ -+#ifdef CONFIG_HOTPLUG_CPU -+static void sched_tick_stop(int cpu) -+{ -+ struct tick_work *twork; -+ -+ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) -+ return; -+ -+ WARN_ON_ONCE(!tick_work_cpu); -+ -+ twork = per_cpu_ptr(tick_work_cpu, cpu); -+ cancel_delayed_work_sync(&twork->work); -+} -+#endif /* CONFIG_HOTPLUG_CPU */ -+ -+int __init sched_tick_offload_init(void) -+{ -+ tick_work_cpu = alloc_percpu(struct tick_work); -+ BUG_ON(!tick_work_cpu); -+ return 0; -+} -+ -+#else /* !CONFIG_NO_HZ_FULL */ -+static inline void sched_tick_start(int cpu) { } -+static inline void sched_tick_stop(int cpu) { } -+#endif -+ -+#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ -+ defined(CONFIG_PREEMPT_TRACER)) -+/* -+ * If the value passed in is equal to the current preempt count -+ * then we just disabled preemption. Start timing the latency. -+ */ -+static inline void preempt_latency_start(int val) -+{ -+ if (preempt_count() == val) { -+ unsigned long ip = get_lock_parent_ip(); -+#ifdef CONFIG_DEBUG_PREEMPT -+ current->preempt_disable_ip = ip; -+#endif -+ trace_preempt_off(CALLER_ADDR0, ip); -+ } -+} -+ -+void preempt_count_add(int val) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Underflow? 
-+ */ -+ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) -+ return; -+#endif -+ __preempt_count_add(val); -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Spinlock count overflowing soon? -+ */ -+ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= -+ PREEMPT_MASK - 10); -+#endif -+ preempt_latency_start(val); -+} -+EXPORT_SYMBOL(preempt_count_add); -+NOKPROBE_SYMBOL(preempt_count_add); -+ -+/* -+ * If the value passed in equals to the current preempt count -+ * then we just enabled preemption. Stop timing the latency. -+ */ -+static inline void preempt_latency_stop(int val) -+{ -+ if (preempt_count() == val) -+ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); -+} -+ -+void preempt_count_sub(int val) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Underflow? -+ */ -+ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) -+ return; -+ /* -+ * Is the spinlock portion underflowing? -+ */ -+ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && -+ !(preempt_count() & PREEMPT_MASK))) -+ return; -+#endif -+ -+ preempt_latency_stop(val); -+ __preempt_count_sub(val); -+} -+EXPORT_SYMBOL(preempt_count_sub); -+NOKPROBE_SYMBOL(preempt_count_sub); -+ -+#else -+static inline void preempt_latency_start(int val) { } -+static inline void preempt_latency_stop(int val) { } -+#endif -+ -+/* -+ * Timeslices below RESCHED_NS are considered as good as expired as there's no -+ * point rescheduling when there's so little time left. 
-+ */ -+static inline void check_curr(struct task_struct *p, struct rq *rq) -+{ -+ if (rq->idle == p) -+ return; -+ -+ update_curr(rq, p); -+ -+ if (p->time_slice < RESCHED_NS) { -+ p->time_slice = SCHED_TIMESLICE_NS; -+ if (SCHED_FIFO != p->policy && task_on_rq_queued(p)) { -+ if (SCHED_RR != p->policy) -+ deboost_task(p); -+ requeue_task(p, rq); -+ } -+ } -+} -+ -+#ifdef CONFIG_SMP -+ -+#define SCHED_RQ_NR_MIGRATION (32UL) -+/* -+ * Migrate pending tasks in @rq to @dest_cpu -+ * Will try to migrate mininal of half of @rq nr_running tasks and -+ * SCHED_RQ_NR_MIGRATION to @dest_cpu -+ */ -+static inline int -+migrate_pending_tasks(struct rq *rq, struct rq *dest_rq, const int dest_cpu) -+{ -+ struct task_struct *p, *skip = rq->curr; -+ int nr_migrated = 0; -+ int nr_tries = min(rq->nr_running / 2, SCHED_RQ_NR_MIGRATION); -+ -+ while (skip != rq->idle && nr_tries && -+ (p = rq_next_bmq_task(skip, rq)) != rq->idle) { -+ skip = rq_next_bmq_task(p, rq); -+ if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) { -+ dequeue_task(p, rq, 0); -+ set_task_cpu(p, dest_cpu); -+ enqueue_task(p, dest_rq, 0); -+ nr_migrated++; -+ } -+ nr_tries--; -+ } -+ -+ return nr_migrated; -+} -+ -+static inline int take_other_rq_tasks(struct rq *rq, int cpu) -+{ -+ struct cpumask *affinity_mask, *end_mask; -+ -+ if (cpumask_empty(&sched_rq_pending_mask)) -+ return 0; -+ -+ affinity_mask = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); -+ end_mask = per_cpu(sched_cpu_affinity_end_mask, cpu); -+ do { -+ int i; -+ for_each_cpu_and(i, &sched_rq_pending_mask, affinity_mask) { -+ int nr_migrated; -+ struct rq *src_rq; -+ -+ src_rq = cpu_rq(i); -+ if (!do_raw_spin_trylock(&src_rq->lock)) -+ continue; -+ spin_acquire(&src_rq->lock.dep_map, -+ SINGLE_DEPTH_NESTING, 1, _RET_IP_); -+ -+ nr_migrated = migrate_pending_tasks(src_rq, rq, cpu); -+ -+ spin_release(&src_rq->lock.dep_map, 1, _RET_IP_); -+ do_raw_spin_unlock(&src_rq->lock); -+ -+ if (nr_migrated) { -+ cpufreq_update_util(rq, 0); -+ return 1; -+ } -+ } -+ 
} while (++affinity_mask < end_mask); -+ -+ return 0; -+} -+#endif -+ -+static inline struct task_struct * -+choose_next_task(struct rq *rq, int cpu, struct task_struct *prev) -+{ -+ struct task_struct *next; -+ -+ if (unlikely(rq->skip)) { -+ next = rq_runnable_task(rq); -+#ifdef CONFIG_SMP -+ if (likely(rq->online)) -+ if (next == rq->idle && take_other_rq_tasks(rq, cpu)) -+ next = rq_runnable_task(rq); -+#endif -+ rq->skip = NULL; -+ return next; -+ } -+ -+ next = rq_first_bmq_task(rq); -+#ifdef CONFIG_SMP -+ if (likely(rq->online)) -+ if (next == rq->idle && take_other_rq_tasks(rq, cpu)) -+ return rq_first_bmq_task(rq); -+#endif -+ return next; -+} -+ -+static inline unsigned long get_preempt_disable_ip(struct task_struct *p) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ return p->preempt_disable_ip; -+#else -+ return 0; -+#endif -+} -+ -+/* -+ * Print scheduling while atomic bug: -+ */ -+static noinline void __schedule_bug(struct task_struct *prev) -+{ -+ /* Save this before calling printk(), since that will clobber it */ -+ unsigned long preempt_disable_ip = get_preempt_disable_ip(current); -+ -+ if (oops_in_progress) -+ return; -+ -+ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", -+ prev->comm, prev->pid, preempt_count()); -+ -+ debug_show_held_locks(prev); -+ print_modules(); -+ if (irqs_disabled()) -+ print_irqtrace_events(prev); -+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) -+ && in_atomic_preempt_off()) { -+ pr_err("Preemption disabled at:"); -+ print_ip_sym(preempt_disable_ip); -+ pr_cont("\n"); -+ } -+ if (panic_on_warn) -+ panic("scheduling while atomic\n"); -+ -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+ -+/* -+ * Various schedule()-time debugging checks and statistics: -+ */ -+static inline void schedule_debug(struct task_struct *prev, bool preempt) -+{ -+#ifdef CONFIG_SCHED_STACK_END_CHECK -+ if (task_stack_end_corrupted(prev)) -+ panic("corrupted stack end detected inside scheduler\n"); -+#endif -+ -+#ifdef 
CONFIG_DEBUG_ATOMIC_SLEEP -+ if (!preempt && prev->state && prev->non_block_count) { -+ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", -+ prev->comm, prev->pid, prev->non_block_count); -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+ } -+#endif -+ -+ if (unlikely(in_atomic_preempt_off())) { -+ __schedule_bug(prev); -+ preempt_count_set(PREEMPT_DISABLED); -+ } -+ rcu_sleep_check(); -+ -+ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); -+ -+ schedstat_inc(this_rq()->sched_count); -+} -+ -+static inline void set_rq_task(struct rq *rq, struct task_struct *p) -+{ -+ p->last_ran = rq->clock_task; -+ -+ if (unlikely(SCHED_TIMESLICE_NS == p->time_slice)) -+ rq->last_ts_switch = rq->clock; -+#ifdef CONFIG_HIGH_RES_TIMERS -+ if (p != rq->idle) -+ hrtick_start(rq, p->time_slice); -+#endif -+} -+ -+/* -+ * schedule() is the main scheduler function. -+ * -+ * The main means of driving the scheduler and thus entering this function are: -+ * -+ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. -+ * -+ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return -+ * paths. For example, see arch/x86/entry_64.S. -+ * -+ * To drive preemption between tasks, the scheduler sets the flag in timer -+ * interrupt handler scheduler_tick(). -+ * -+ * 3. Wakeups don't really cause entry into schedule(). They add a -+ * task to the run-queue and that's it. -+ * -+ * Now, if the new task added to the run-queue preempts the current -+ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets -+ * called on the nearest possible occasion: -+ * -+ * - If the kernel is preemptible (CONFIG_PREEMPTION=y): -+ * -+ * - in syscall or exception context, at the next outmost -+ * preempt_enable(). (this might be as soon as the wake_up()'s -+ * spin_unlock()!) 
-+ * -+ * - in IRQ context, return from interrupt-handler to -+ * preemptible context -+ * -+ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set) -+ * then at the next: -+ * -+ * - cond_resched() call -+ * - explicit schedule() call -+ * - return from syscall or exception to user-space -+ * - return from interrupt-handler to user-space -+ * -+ * WARNING: must be called with preemption disabled! -+ */ -+static void __sched notrace __schedule(bool preempt) -+{ -+ struct task_struct *prev, *next; -+ unsigned long *switch_count; -+ struct rq *rq; -+ int cpu; -+ -+ cpu = smp_processor_id(); -+ rq = cpu_rq(cpu); -+ prev = rq->curr; -+ -+ schedule_debug(prev, preempt); -+ -+ /* by passing sched_feat(HRTICK) checking which BMQ doesn't support */ -+ hrtick_clear(rq); -+ -+ local_irq_disable(); -+ rcu_note_context_switch(preempt); -+ -+ /* -+ * Make sure that signal_pending_state()->signal_pending() below -+ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) -+ * done by the caller to avoid the race with signal_wake_up(). -+ * -+ * The membarrier system call requires a full memory barrier -+ * after coming from user-space, before storing to rq->curr. 
-+ */ -+ raw_spin_lock(&rq->lock); -+ smp_mb__after_spinlock(); -+ -+ update_rq_clock(rq); -+ -+ switch_count = &prev->nivcsw; -+ if (!preempt && prev->state) { -+ if (signal_pending_state(prev->state, prev)) { -+ prev->state = TASK_RUNNING; -+ } else { -+ if (rq_switch_time(rq) < boost_threshold(prev)) -+ boost_task(prev); -+ deactivate_task(prev, rq); -+ -+ if (prev->in_iowait) { -+ atomic_inc(&rq->nr_iowait); -+ delayacct_blkio_start(); -+ } -+ } -+ switch_count = &prev->nvcsw; -+ } -+ -+ clear_tsk_need_resched(prev); -+ clear_preempt_need_resched(); -+ -+ check_curr(prev, rq); -+ -+ next = choose_next_task(rq, cpu, prev); -+ -+ set_rq_task(rq, next); -+ -+ if (prev != next) { -+ if (MAX_PRIO == next->prio) -+ schedstat_inc(rq->sched_goidle); -+ -+ /* -+ * RCU users of rcu_dereference(rq->curr) may not see -+ * changes to task_struct made by pick_next_task(). -+ */ -+ RCU_INIT_POINTER(rq->curr, next); -+ /* -+ * The membarrier system call requires each architecture -+ * to have a full memory barrier after updating -+ * rq->curr, before returning to user-space. -+ * -+ * Here are the schemes providing that barrier on the -+ * various architectures: -+ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. -+ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. 
-+ * - finish_lock_switch() for weakly-ordered -+ * architectures where spin_unlock is a full barrier, -+ * - switch_to() for arm64 (weakly-ordered, spin_unlock -+ * is a RELEASE barrier), -+ */ -+ ++*switch_count; -+ rq->nr_switches++; -+ rq->last_ts_switch = rq->clock; -+ -+ trace_sched_switch(preempt, prev, next); -+ -+ /* Also unlocks the rq: */ -+ rq = context_switch(rq, prev, next); -+#ifdef CONFIG_SCHED_SMT -+ sg_balance_check(rq); -+#endif -+ } else -+ raw_spin_unlock_irq(&rq->lock); -+} -+ -+void __noreturn do_task_dead(void) -+{ -+ /* Causes final put_task_struct in finish_task_switch(): */ -+ set_special_state(TASK_DEAD); -+ -+ /* Tell freezer to ignore us: */ -+ current->flags |= PF_NOFREEZE; -+ __schedule(false); -+ -+ BUG(); -+ -+ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ -+ for (;;) -+ cpu_relax(); -+} -+ -+static inline void sched_submit_work(struct task_struct *tsk) -+{ -+ if (!tsk->state) -+ return; -+ -+ /* -+ * If a worker went to sleep, notify and ask workqueue whether -+ * it wants to wake up a task to maintain concurrency. -+ * As this function is called inside the schedule() context, -+ * we disable preemption to avoid it calling schedule() again -+ * in the possible wakeup of a kworker. -+ */ -+ if (tsk->flags & PF_WQ_WORKER) { -+ preempt_disable(); -+ wq_worker_sleeping(tsk); -+ preempt_enable_no_resched(); -+ } -+ -+ if (tsk_is_pi_blocked(tsk)) -+ return; -+ -+ /* -+ * If we are going to sleep and we have plugged IO queued, -+ * make sure to submit it to avoid deadlocks. 
-+ */ -+ if (blk_needs_flush_plug(tsk)) -+ blk_schedule_flush_plug(tsk); -+} -+ -+static void sched_update_worker(struct task_struct *tsk) -+{ -+ if (tsk->flags & PF_WQ_WORKER) -+ wq_worker_running(tsk); -+} -+ -+asmlinkage __visible void __sched schedule(void) -+{ -+ struct task_struct *tsk = current; -+ -+ sched_submit_work(tsk); -+ do { -+ preempt_disable(); -+ __schedule(false); -+ sched_preempt_enable_no_resched(); -+ } while (need_resched()); -+ sched_update_worker(tsk); -+} -+EXPORT_SYMBOL(schedule); -+ -+/* -+ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted -+ * state (have scheduled out non-voluntarily) by making sure that all -+ * tasks have either left the run queue or have gone into user space. -+ * As idle tasks do not do either, they must not ever be preempted -+ * (schedule out non-voluntarily). -+ * -+ * schedule_idle() is similar to schedule_preempt_disable() except that it -+ * never enables preemption because it does not call sched_submit_work(). -+ */ -+void __sched schedule_idle(void) -+{ -+ /* -+ * As this skips calling sched_submit_work(), which the idle task does -+ * regardless because that function is a nop when the task is in a -+ * TASK_RUNNING state, make sure this isn't used someplace that the -+ * current task can be in any other state. Note, idle is always in the -+ * TASK_RUNNING state. -+ */ -+ WARN_ON_ONCE(current->state); -+ do { -+ __schedule(false); -+ } while (need_resched()); -+} -+ -+#ifdef CONFIG_CONTEXT_TRACKING -+asmlinkage __visible void __sched schedule_user(void) -+{ -+ /* -+ * If we come here after a random call to set_need_resched(), -+ * or we have been woken up remotely but the IPI has not yet arrived, -+ * we haven't yet exited the RCU idle mode. Do it here manually until -+ * we find a better solution. -+ * -+ * NB: There are buggy callers of this function. Ideally we -+ * should warn if prev_state != CONTEXT_USER, but that will trigger -+ * too frequently to make sense yet. 
-+ */ -+ enum ctx_state prev_state = exception_enter(); -+ schedule(); -+ exception_exit(prev_state); -+} -+#endif -+ -+/** -+ * schedule_preempt_disabled - called with preemption disabled -+ * -+ * Returns with preemption disabled. Note: preempt_count must be 1 -+ */ -+void __sched schedule_preempt_disabled(void) -+{ -+ sched_preempt_enable_no_resched(); -+ schedule(); -+ preempt_disable(); -+} -+ -+static void __sched notrace preempt_schedule_common(void) -+{ -+ do { -+ /* -+ * Because the function tracer can trace preempt_count_sub() -+ * and it also uses preempt_enable/disable_notrace(), if -+ * NEED_RESCHED is set, the preempt_enable_notrace() called -+ * by the function tracer will call this function again and -+ * cause infinite recursion. -+ * -+ * Preemption must be disabled here before the function -+ * tracer can trace. Break up preempt_disable() into two -+ * calls. One to disable preemption without fear of being -+ * traced. The other to still record the preemption latency, -+ * which can also be traced by the function tracer. -+ */ -+ preempt_disable_notrace(); -+ preempt_latency_start(1); -+ __schedule(true); -+ preempt_latency_stop(1); -+ preempt_enable_no_resched_notrace(); -+ -+ /* -+ * Check again in case we missed a preemption opportunity -+ * between schedule and now. -+ */ -+ } while (need_resched()); -+} -+ -+#ifdef CONFIG_PREEMPTION -+/* -+ * This is the entry point to schedule() from in-kernel preemption -+ * off of preempt_enable. -+ */ -+asmlinkage __visible void __sched notrace preempt_schedule(void) -+{ -+ /* -+ * If there is a non-zero preempt_count or interrupts are disabled, -+ * we do not want to preempt the current task. Just return.. 
-+ */ -+ if (likely(!preemptible())) -+ return; -+ -+ preempt_schedule_common(); -+} -+NOKPROBE_SYMBOL(preempt_schedule); -+EXPORT_SYMBOL(preempt_schedule); -+ -+/** -+ * preempt_schedule_notrace - preempt_schedule called by tracing -+ * -+ * The tracing infrastructure uses preempt_enable_notrace to prevent -+ * recursion and tracing preempt enabling caused by the tracing -+ * infrastructure itself. But as tracing can happen in areas coming -+ * from userspace or just about to enter userspace, a preempt enable -+ * can occur before user_exit() is called. This will cause the scheduler -+ * to be called when the system is still in usermode. -+ * -+ * To prevent this, the preempt_enable_notrace will use this function -+ * instead of preempt_schedule() to exit user context if needed before -+ * calling the scheduler. -+ */ -+asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) -+{ -+ enum ctx_state prev_ctx; -+ -+ if (likely(!preemptible())) -+ return; -+ -+ do { -+ /* -+ * Because the function tracer can trace preempt_count_sub() -+ * and it also uses preempt_enable/disable_notrace(), if -+ * NEED_RESCHED is set, the preempt_enable_notrace() called -+ * by the function tracer will call this function again and -+ * cause infinite recursion. -+ * -+ * Preemption must be disabled here before the function -+ * tracer can trace. Break up preempt_disable() into two -+ * calls. One to disable preemption without fear of being -+ * traced. The other to still record the preemption latency, -+ * which can also be traced by the function tracer. -+ */ -+ preempt_disable_notrace(); -+ preempt_latency_start(1); -+ /* -+ * Needs preempt disabled in case user_exit() is traced -+ * and the tracer calls preempt_enable_notrace() causing -+ * an infinite recursion. 
-+ */ -+ prev_ctx = exception_enter(); -+ __schedule(true); -+ exception_exit(prev_ctx); -+ -+ preempt_latency_stop(1); -+ preempt_enable_no_resched_notrace(); -+ } while (need_resched()); -+} -+EXPORT_SYMBOL_GPL(preempt_schedule_notrace); -+ -+#endif /* CONFIG_PREEMPTION */ -+ -+/* -+ * This is the entry point to schedule() from kernel preemption -+ * off of irq context. -+ * Note, that this is called and return with irqs disabled. This will -+ * protect us against recursive calling from irq. -+ */ -+asmlinkage __visible void __sched preempt_schedule_irq(void) -+{ -+ enum ctx_state prev_state; -+ -+ /* Catch callers which need to be fixed */ -+ BUG_ON(preempt_count() || !irqs_disabled()); -+ -+ prev_state = exception_enter(); -+ -+ do { -+ preempt_disable(); -+ local_irq_enable(); -+ __schedule(true); -+ local_irq_disable(); -+ sched_preempt_enable_no_resched(); -+ } while (need_resched()); -+ -+ exception_exit(prev_state); -+} -+ -+int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, -+ void *key) -+{ -+ return try_to_wake_up(curr->private, mode, wake_flags); -+} -+EXPORT_SYMBOL(default_wake_function); -+ -+static inline void check_task_changed(struct rq *rq, struct task_struct *p) -+{ -+ /* Trigger resched if task sched_prio has been modified. */ -+ if (task_on_rq_queued(p) && task_sched_prio(p) != p->bmq_idx) { -+ requeue_task(p, rq); -+ check_preempt_curr(rq); -+ } -+} -+ -+#ifdef CONFIG_RT_MUTEXES -+ -+static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) -+{ -+ if (pi_task) -+ prio = min(prio, pi_task->prio); -+ -+ return prio; -+} -+ -+static inline int rt_effective_prio(struct task_struct *p, int prio) -+{ -+ struct task_struct *pi_task = rt_mutex_get_top_task(p); -+ -+ return __rt_effective_prio(pi_task, prio); -+} -+ -+/* -+ * rt_mutex_setprio - set the current priority of a task -+ * @p: task to boost -+ * @pi_task: donor task -+ * -+ * This function changes the 'effective' priority of a task. 
It does -+ * not touch ->normal_prio like __setscheduler(). -+ * -+ * Used by the rt_mutex code to implement priority inheritance -+ * logic. Call site only calls if the priority of the task changed. -+ */ -+void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) -+{ -+ int prio; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ -+ /* XXX used to be waiter->prio, not waiter->task->prio */ -+ prio = __rt_effective_prio(pi_task, p->normal_prio); -+ -+ /* -+ * If nothing changed; bail early. -+ */ -+ if (p->pi_top_task == pi_task && prio == p->prio) -+ return; -+ -+ rq = __task_access_lock(p, &lock); -+ /* -+ * Set under pi_lock && rq->lock, such that the value can be used under -+ * either lock. -+ * -+ * Note that there is loads of tricky to make this pointer cache work -+ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to -+ * ensure a task is de-boosted (pi_task is set to NULL) before the -+ * task is allowed to run again (and can exit). This ensures the pointer -+ * points to a blocked task -- which guaratees the task is present. -+ */ -+ p->pi_top_task = pi_task; -+ -+ /* -+ * For FIFO/RR we only need to set prio, if that matches we're done. -+ */ -+ if (prio == p->prio) -+ goto out_unlock; -+ -+ /* -+ * Idle task boosting is a nono in general. There is one -+ * exception, when PREEMPT_RT and NOHZ is active: -+ * -+ * The idle task calls get_next_timer_interrupt() and holds -+ * the timer wheel base->lock on the CPU and another CPU wants -+ * to access the timer (probably to cancel it). We can safely -+ * ignore the boosting request, as the idle CPU runs this code -+ * with interrupts disabled and will complete the lock -+ * protected section without being interrupted. So there is no -+ * real need to boost. 
-+ */ -+ if (unlikely(p == rq->idle)) { -+ WARN_ON(p != rq->curr); -+ WARN_ON(p->pi_blocked_on); -+ goto out_unlock; -+ } -+ -+ trace_sched_pi_setprio(p, pi_task); -+ p->prio = prio; -+ -+ check_task_changed(rq, p); -+out_unlock: -+ __task_access_unlock(p, lock); -+} -+#else -+static inline int rt_effective_prio(struct task_struct *p, int prio) -+{ -+ return prio; -+} -+#endif -+ -+void set_user_nice(struct task_struct *p, long nice) -+{ -+ unsigned long flags; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ -+ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) -+ return; -+ /* -+ * We have to be careful, if called from sys_setpriority(), -+ * the task might be in the middle of scheduling on another CPU. -+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ rq = __task_access_lock(p, &lock); -+ -+ p->static_prio = NICE_TO_PRIO(nice); -+ /* -+ * The RT priorities are set via sched_setscheduler(), but we still -+ * allow the 'normal' nice value to be set - but as expected -+ * it wont have any effect on scheduling until the task is -+ * not SCHED_NORMAL/SCHED_BATCH: -+ */ -+ if (task_has_rt_policy(p)) -+ goto out_unlock; -+ -+ p->prio = effective_prio(p); -+ check_task_changed(rq, p); -+out_unlock: -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+} -+EXPORT_SYMBOL(set_user_nice); -+ -+/* -+ * can_nice - check if a task can reduce its nice value -+ * @p: task -+ * @nice: nice value -+ */ -+int can_nice(const struct task_struct *p, const int nice) -+{ -+ /* Convert nice value [19,-20] to rlimit style value [1,40] */ -+ int nice_rlim = nice_to_rlimit(nice); -+ -+ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || -+ capable(CAP_SYS_NICE)); -+} -+ -+#ifdef __ARCH_WANT_SYS_NICE -+ -+/* -+ * sys_nice - change the priority of the current process. -+ * @increment: priority increment -+ * -+ * sys_setpriority is a more generic, but much slower function that -+ * does similar things. 
-+ */ -+SYSCALL_DEFINE1(nice, int, increment) -+{ -+ long nice, retval; -+ -+ /* -+ * Setpriority might change our priority at the same moment. -+ * We don't have to worry. Conceptually one call occurs first -+ * and we have a single winner. -+ */ -+ -+ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); -+ nice = task_nice(current) + increment; -+ -+ nice = clamp_val(nice, MIN_NICE, MAX_NICE); -+ if (increment < 0 && !can_nice(current, nice)) -+ return -EPERM; -+ -+ retval = security_task_setnice(current, nice); -+ if (retval) -+ return retval; -+ -+ set_user_nice(current, nice); -+ return 0; -+} -+ -+#endif -+ -+/** -+ * task_prio - return the priority value of a given task. -+ * @p: the task in question. -+ * -+ * Return: The priority value as seen by users in /proc. -+ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes -+ * from 0(SCHED_ISO) up to 82 (nice +19 SCHED_IDLE). -+ */ -+int task_prio(const struct task_struct *p) -+{ -+ if (p->prio < MAX_RT_PRIO) -+ return (p->prio - MAX_RT_PRIO); -+ return (p->prio - MAX_RT_PRIO + p->boost_prio); -+} -+ -+/** -+ * idle_cpu - is a given CPU idle currently? -+ * @cpu: the processor in question. -+ * -+ * Return: 1 if the CPU is currently idle. 0 otherwise. -+ */ -+int idle_cpu(int cpu) -+{ -+ return cpu_curr(cpu) == cpu_rq(cpu)->idle; -+} -+ -+/** -+ * idle_task - return the idle task for a given CPU. -+ * @cpu: the processor in question. -+ * -+ * Return: The idle task for the cpu @cpu. -+ */ -+struct task_struct *idle_task(int cpu) -+{ -+ return cpu_rq(cpu)->idle; -+} -+ -+/** -+ * find_process_by_pid - find a process with a matching PID value. -+ * @pid: the pid in question. -+ * -+ * The task of @pid, if found. %NULL otherwise. -+ */ -+static inline struct task_struct *find_process_by_pid(pid_t pid) -+{ -+ return pid ? 
find_task_by_vpid(pid) : current; -+} -+ -+#ifdef CONFIG_SMP -+void sched_set_stop_task(int cpu, struct task_struct *stop) -+{ -+ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; -+ struct sched_param start_param = { .sched_priority = 0 }; -+ struct task_struct *old_stop = cpu_rq(cpu)->stop; -+ -+ if (stop) { -+ /* -+ * Make it appear like a SCHED_FIFO task, its something -+ * userspace knows about and won't get confused about. -+ * -+ * Also, it will make PI more or less work without too -+ * much confusion -- but then, stop work should not -+ * rely on PI working anyway. -+ */ -+ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); -+ } -+ -+ cpu_rq(cpu)->stop = stop; -+ -+ if (old_stop) { -+ /* -+ * Reset it back to a normal scheduling policy so that -+ * it can die in pieces. -+ */ -+ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); -+ } -+} -+ -+/* -+ * Change a given task's CPU affinity. Migrate the thread to a -+ * proper CPU and schedule it away if the CPU it's executing on -+ * is removed from the allowed bitmask. -+ * -+ * NOTE: the caller must have a valid reference to the task, the -+ * task must not exit() & deallocate itself prematurely. The -+ * call is not atomic; no spinlocks may be held. -+ */ -+static int __set_cpus_allowed_ptr(struct task_struct *p, -+ const struct cpumask *new_mask, bool check) -+{ -+ const struct cpumask *cpu_valid_mask = cpu_active_mask; -+ int dest_cpu; -+ unsigned long flags; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ int ret = 0; -+ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ rq = __task_access_lock(p, &lock); -+ -+ if (p->flags & PF_KTHREAD) { -+ /* -+ * Kernel threads are allowed on online && !active CPUs -+ */ -+ cpu_valid_mask = cpu_online_mask; -+ } -+ -+ /* -+ * Must re-check here, to close a race against __kthread_bind(), -+ * sched_setaffinity() is not guaranteed to observe the flag. 
-+ */ -+ if (check && (p->flags & PF_NO_SETAFFINITY)) { -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ if (cpumask_equal(p->cpus_ptr, new_mask)) -+ goto out; -+ -+ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); -+ if (dest_cpu >= nr_cpu_ids) { -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ do_set_cpus_allowed(p, new_mask); -+ -+ if (p->flags & PF_KTHREAD) { -+ /* -+ * For kernel threads that do indeed end up on online && -+ * !active we want to ensure they are strict per-CPU threads. -+ */ -+ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && -+ !cpumask_intersects(new_mask, cpu_active_mask) && -+ p->nr_cpus_allowed != 1); -+ } -+ -+ /* Can the task run on the task's current CPU? If so, we're done */ -+ if (cpumask_test_cpu(task_cpu(p), new_mask)) -+ goto out; -+ -+ if (task_running(p) || p->state == TASK_WAKING) { -+ struct migration_arg arg = { p, dest_cpu }; -+ -+ /* Need help from migration thread: drop lock and wait. */ -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); -+ return 0; -+ } -+ if (task_on_rq_queued(p)) { -+ /* -+ * OK, since we're going to drop the lock immediately -+ * afterwards anyway. -+ */ -+ update_rq_clock(rq); -+ rq = move_queued_task(rq, p, dest_cpu); -+ lock = &rq->lock; -+ } -+ -+out: -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+ return ret; -+} -+ -+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ return __set_cpus_allowed_ptr(p, new_mask, false); -+} -+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); -+ -+#else -+static inline int -+__set_cpus_allowed_ptr(struct task_struct *p, -+ const struct cpumask *new_mask, bool check) -+{ -+ return set_cpus_allowed_ptr(p, new_mask); -+} -+#endif -+ -+/* -+ * sched_setparam() passes in -1 for its policy, to let the functions -+ * it calls know not to change it. 
-+ */ -+#define SETPARAM_POLICY -1 -+ -+static void __setscheduler_params(struct task_struct *p, -+ const struct sched_attr *attr) -+{ -+ int policy = attr->sched_policy; -+ -+ if (policy == SETPARAM_POLICY) -+ policy = p->policy; -+ -+ p->policy = policy; -+ -+ /* -+ * allow normal nice value to be set, but will not have any -+ * effect on scheduling until the task not SCHED_NORMAL/ -+ * SCHED_BATCH -+ */ -+ p->static_prio = NICE_TO_PRIO(attr->sched_nice); -+ -+ /* -+ * __sched_setscheduler() ensures attr->sched_priority == 0 when -+ * !rt_policy. Always setting this ensures that things like -+ * getparam()/getattr() don't report silly values for !rt tasks. -+ */ -+ p->rt_priority = attr->sched_priority; -+ p->normal_prio = normal_prio(p); -+} -+ -+/* Actually do priority change: must hold rq lock. */ -+static void __setscheduler(struct rq *rq, struct task_struct *p, -+ const struct sched_attr *attr, bool keep_boost) -+{ -+ __setscheduler_params(p, attr); -+ -+ /* -+ * Keep a potential priority boosting if called from -+ * sched_setscheduler(). 
-+ */ -+ p->prio = normal_prio(p); -+ if (keep_boost) -+ p->prio = rt_effective_prio(p, p->prio); -+} -+ -+/* -+ * check the target process has a UID that matches the current process's -+ */ -+static bool check_same_owner(struct task_struct *p) -+{ -+ const struct cred *cred = current_cred(), *pcred; -+ bool match; -+ -+ rcu_read_lock(); -+ pcred = __task_cred(p); -+ match = (uid_eq(cred->euid, pcred->euid) || -+ uid_eq(cred->euid, pcred->uid)); -+ rcu_read_unlock(); -+ return match; -+} -+ -+static int __sched_setscheduler(struct task_struct *p, -+ const struct sched_attr *attr, -+ bool user, bool pi) -+{ -+ const struct sched_attr dl_squash_attr = { -+ .size = sizeof(struct sched_attr), -+ .sched_policy = SCHED_FIFO, -+ .sched_nice = 0, -+ .sched_priority = 99, -+ }; -+ int newprio = MAX_RT_PRIO - 1 - attr->sched_priority; -+ int retval, oldpolicy = -1; -+ int policy = attr->sched_policy; -+ unsigned long flags; -+ struct rq *rq; -+ int reset_on_fork; -+ raw_spinlock_t *lock; -+ -+ /* The pi code expects interrupts enabled */ -+ BUG_ON(pi && in_interrupt()); -+ -+ /* -+ * BMQ supports SCHED_DEADLINE by squash it as prio 0 SCHED_FIFO -+ */ -+ if (unlikely(SCHED_DEADLINE == policy)) { -+ attr = &dl_squash_attr; -+ policy = attr->sched_policy; -+ newprio = MAX_RT_PRIO - 1 - attr->sched_priority; -+ } -+recheck: -+ /* Double check policy once rq lock held */ -+ if (policy < 0) { -+ reset_on_fork = p->sched_reset_on_fork; -+ policy = oldpolicy = p->policy; -+ } else { -+ reset_on_fork = !!(attr->sched_flags & SCHED_RESET_ON_FORK); -+ -+ if (policy > SCHED_IDLE) -+ return -EINVAL; -+ } -+ -+ if (attr->sched_flags & ~(SCHED_FLAG_ALL)) -+ return -EINVAL; -+ -+ /* -+ * Valid priorities for SCHED_FIFO and SCHED_RR are -+ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and -+ * SCHED_BATCH and SCHED_IDLE is 0. 
-+ */ -+ if (attr->sched_priority < 0 || -+ (p->mm && attr->sched_priority > MAX_USER_RT_PRIO - 1) || -+ (!p->mm && attr->sched_priority > MAX_RT_PRIO - 1)) -+ return -EINVAL; -+ if ((SCHED_RR == policy || SCHED_FIFO == policy) != -+ (attr->sched_priority != 0)) -+ return -EINVAL; -+ -+ /* -+ * Allow unprivileged RT tasks to decrease priority: -+ */ -+ if (user && !capable(CAP_SYS_NICE)) { -+ if (SCHED_FIFO == policy || SCHED_RR == policy) { -+ unsigned long rlim_rtprio = -+ task_rlimit(p, RLIMIT_RTPRIO); -+ -+ /* Can't set/change the rt policy */ -+ if (policy != p->policy && !rlim_rtprio) -+ return -EPERM; -+ -+ /* Can't increase priority */ -+ if (attr->sched_priority > p->rt_priority && -+ attr->sched_priority > rlim_rtprio) -+ return -EPERM; -+ } -+ -+ /* Can't change other user's priorities */ -+ if (!check_same_owner(p)) -+ return -EPERM; -+ -+ /* Normal users shall not reset the sched_reset_on_fork flag */ -+ if (p->sched_reset_on_fork && !reset_on_fork) -+ return -EPERM; -+ } -+ -+ if (user) { -+ retval = security_task_setscheduler(p); -+ if (retval) -+ return retval; -+ } -+ -+ if (pi) -+ cpuset_read_lock(); -+ -+ /* -+ * Make sure no PI-waiters arrive (or leave) while we are -+ * changing the priority of the task: -+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ -+ /* -+ * To be able to change p->policy safely, task_access_lock() -+ * must be called. -+ * IF use task_access_lock() here: -+ * For the task p which is not running, reading rq->stop is -+ * racy but acceptable as ->stop doesn't change much. -+ * An enhancemnet can be made to read rq->stop saftly. 
-+ */ -+ rq = __task_access_lock(p, &lock); -+ -+ /* -+ * Changing the policy of the stop threads its a very bad idea -+ */ -+ if (p == rq->stop) { -+ retval = -EINVAL; -+ goto unlock; -+ } -+ -+ /* -+ * If not changing anything there's no need to proceed further: -+ */ -+ if (unlikely(policy == p->policy)) { -+ if (rt_policy(policy) && attr->sched_priority != p->rt_priority) -+ goto change; -+ if (!rt_policy(policy) && -+ NICE_TO_PRIO(attr->sched_nice) != p->static_prio) -+ goto change; -+ -+ p->sched_reset_on_fork = reset_on_fork; -+ retval = 0; -+ goto unlock; -+ } -+change: -+ -+ /* Re-check policy now with rq lock held */ -+ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { -+ policy = oldpolicy = -1; -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ if (pi) -+ cpuset_read_unlock(); -+ goto recheck; -+ } -+ -+ p->sched_reset_on_fork = reset_on_fork; -+ -+ if (pi) { -+ /* -+ * Take priority boosted tasks into account. If the new -+ * effective priority is unchanged, we just store the new -+ * normal parameters and do not touch the scheduler class and -+ * the runqueue. This will be done when the task deboost -+ * itself. 
-+ */ -+ if (rt_effective_prio(p, newprio) == p->prio) { -+ __setscheduler_params(p, attr); -+ retval = 0; -+ goto unlock; -+ } -+ } -+ -+ __setscheduler(rq, p, attr, pi); -+ -+ check_task_changed(rq, p); -+ -+ /* Avoid rq from going away on us: */ -+ preempt_disable(); -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+ if (pi) { -+ cpuset_read_unlock(); -+ rt_mutex_adjust_pi(p); -+ } -+ -+ preempt_enable(); -+ -+ return 0; -+ -+unlock: -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ if (pi) -+ cpuset_read_unlock(); -+ return retval; -+} -+ -+static int _sched_setscheduler(struct task_struct *p, int policy, -+ const struct sched_param *param, bool check) -+{ -+ struct sched_attr attr = { -+ .sched_policy = policy, -+ .sched_priority = param->sched_priority, -+ .sched_nice = PRIO_TO_NICE(p->static_prio), -+ }; -+ -+ /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ -+ if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { -+ attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; -+ policy &= ~SCHED_RESET_ON_FORK; -+ attr.sched_policy = policy; -+ } -+ -+ return __sched_setscheduler(p, &attr, check, true); -+} -+ -+/** -+ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. -+ * @p: the task in question. -+ * @policy: new policy. -+ * @param: structure containing the new RT priority. -+ * -+ * Return: 0 on success. An error code otherwise. -+ * -+ * NOTE that the task may be already dead. 
-+ */ -+int sched_setscheduler(struct task_struct *p, int policy, -+ const struct sched_param *param) -+{ -+ return _sched_setscheduler(p, policy, param, true); -+} -+ -+EXPORT_SYMBOL_GPL(sched_setscheduler); -+ -+int sched_setattr(struct task_struct *p, const struct sched_attr *attr) -+{ -+ return __sched_setscheduler(p, attr, true, true); -+} -+EXPORT_SYMBOL_GPL(sched_setattr); -+ -+int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) -+{ -+ return __sched_setscheduler(p, attr, false, true); -+} -+ -+/** -+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. -+ * @p: the task in question. -+ * @policy: new policy. -+ * @param: structure containing the new RT priority. -+ * -+ * Just like sched_setscheduler, only don't bother checking if the -+ * current context has permission. For example, this is needed in -+ * stop_machine(): we create temporary high priority worker threads, -+ * but our caller might not have that capability. -+ * -+ * Return: 0 on success. An error code otherwise. -+ */ -+int sched_setscheduler_nocheck(struct task_struct *p, int policy, -+ const struct sched_param *param) -+{ -+ return _sched_setscheduler(p, policy, param, false); -+} -+EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); -+ -+static int -+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) -+{ -+ struct sched_param lparam; -+ struct task_struct *p; -+ int retval; -+ -+ if (!param || pid < 0) -+ return -EINVAL; -+ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) -+ return -EFAULT; -+ -+ rcu_read_lock(); -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (likely(p)) -+ get_task_struct(p); -+ rcu_read_unlock(); -+ -+ if (likely(p)) { -+ retval = sched_setscheduler(p, policy, &lparam); -+ put_task_struct(p); -+ } -+ -+ return retval; -+} -+ -+/* -+ * Mimics kernel/events/core.c perf_copy_attr(). 
-+ */ -+static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr) -+{ -+ u32 size; -+ int ret; -+ -+ /* Zero the full structure, so that a short copy will be nice: */ -+ memset(attr, 0, sizeof(*attr)); -+ -+ ret = get_user(size, &uattr->size); -+ if (ret) -+ return ret; -+ -+ /* ABI compatibility quirk: */ -+ if (!size) -+ size = SCHED_ATTR_SIZE_VER0; -+ -+ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) -+ goto err_size; -+ -+ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); -+ if (ret) { -+ if (ret == -E2BIG) -+ goto err_size; -+ return ret; -+ } -+ -+ /* -+ * XXX: Do we want to be lenient like existing syscalls; or do we want -+ * to be strict and return an error on out-of-bounds values? -+ */ -+ attr->sched_nice = clamp(attr->sched_nice, -20, 19); -+ -+ /* sched/core.c uses zero here but we already know ret is zero */ -+ return 0; -+ -+err_size: -+ put_user(sizeof(*attr), &uattr->size); -+ return -E2BIG; -+} -+ -+/** -+ * sys_sched_setscheduler - set/change the scheduler policy and RT priority -+ * @pid: the pid in question. -+ * @policy: new policy. -+ * -+ * Return: 0 on success. An error code otherwise. -+ * @param: structure containing the new RT priority. -+ */ -+SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) -+{ -+ if (policy < 0) -+ return -EINVAL; -+ -+ return do_sched_setscheduler(pid, policy, param); -+} -+ -+/** -+ * sys_sched_setparam - set/change the RT priority of a thread -+ * @pid: the pid in question. -+ * @param: structure containing the new RT priority. -+ * -+ * Return: 0 on success. An error code otherwise. -+ */ -+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) -+{ -+ return do_sched_setscheduler(pid, SETPARAM_POLICY, param); -+} -+ -+/** -+ * sys_sched_setattr - same as above, but with extended sched_attr -+ * @pid: the pid in question. -+ * @uattr: structure containing the extended parameters. 
-+ */ -+SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, -+ unsigned int, flags) -+{ -+ struct sched_attr attr; -+ struct task_struct *p; -+ int retval; -+ -+ if (!uattr || pid < 0 || flags) -+ return -EINVAL; -+ -+ retval = sched_copy_attr(uattr, &attr); -+ if (retval) -+ return retval; -+ -+ if ((int)attr.sched_policy < 0) -+ return -EINVAL; -+ -+ rcu_read_lock(); -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (p != NULL) -+ retval = sched_setattr(p, &attr); -+ rcu_read_unlock(); -+ -+ return retval; -+} -+ -+/** -+ * sys_sched_getscheduler - get the policy (scheduling class) of a thread -+ * @pid: the pid in question. -+ * -+ * Return: On success, the policy of the thread. Otherwise, a negative error -+ * code. -+ */ -+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) -+{ -+ struct task_struct *p; -+ int retval = -EINVAL; -+ -+ if (pid < 0) -+ goto out_nounlock; -+ -+ retval = -ESRCH; -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ if (p) { -+ retval = security_task_getscheduler(p); -+ if (!retval) -+ retval = p->policy; -+ } -+ rcu_read_unlock(); -+ -+out_nounlock: -+ return retval; -+} -+ -+/** -+ * sys_sched_getscheduler - get the RT priority of a thread -+ * @pid: the pid in question. -+ * @param: structure containing the RT priority. -+ * -+ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error -+ * code. 
-+ */ -+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) -+{ -+ struct sched_param lp = { .sched_priority = 0 }; -+ struct task_struct *p; -+ int retval = -EINVAL; -+ -+ if (!param || pid < 0) -+ goto out_nounlock; -+ -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ retval = -ESRCH; -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ if (task_has_rt_policy(p)) -+ lp.sched_priority = p->rt_priority; -+ rcu_read_unlock(); -+ -+ /* -+ * This one might sleep, we cannot do it with a spinlock held ... -+ */ -+ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; -+ -+out_nounlock: -+ return retval; -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+/* -+ * Copy the kernel size attribute structure (which might be larger -+ * than what user-space knows about) to user-space. -+ * -+ * Note that all cases are valid: user-space buffer can be larger or -+ * smaller than the kernel-space buffer. The usual case is that both -+ * have the same size. -+ */ -+static int -+sched_attr_copy_to_user(struct sched_attr __user *uattr, -+ struct sched_attr *kattr, -+ unsigned int usize) -+{ -+ unsigned int ksize = sizeof(*kattr); -+ -+ if (!access_ok(uattr, usize)) -+ return -EFAULT; -+ -+ /* -+ * sched_getattr() ABI forwards and backwards compatibility: -+ * -+ * If usize == ksize then we just copy everything to user-space and all is good. -+ * -+ * If usize < ksize then we only copy as much as user-space has space for, -+ * this keeps ABI compatibility as well. We skip the rest. -+ * -+ * If usize > ksize then user-space is using a newer version of the ABI, -+ * which part the kernel doesn't know about. Just ignore it - tooling can -+ * detect the kernel's knowledge of attributes from the attr->size value -+ * which is set to ksize in this case. 
-+ */ -+ kattr->size = min(usize, ksize); -+ -+ if (copy_to_user(uattr, kattr, kattr->size)) -+ return -EFAULT; -+ -+ return 0; -+} -+ -+/** -+ * sys_sched_getattr - similar to sched_getparam, but with sched_attr -+ * @pid: the pid in question. -+ * @uattr: structure containing the extended parameters. -+ * @usize: sizeof(attr) for fwd/bwd comp. -+ * @flags: for future extension. -+ */ -+SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, -+ unsigned int, usize, unsigned int, flags) -+{ -+ struct sched_attr kattr = { }; -+ struct task_struct *p; -+ int retval; -+ -+ if (!uattr || pid < 0 || usize > PAGE_SIZE || -+ usize < SCHED_ATTR_SIZE_VER0 || flags) -+ return -EINVAL; -+ -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ retval = -ESRCH; -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ kattr.sched_policy = p->policy; -+ if (p->sched_reset_on_fork) -+ kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; -+ if (task_has_rt_policy(p)) -+ kattr.sched_priority = p->rt_priority; -+ else -+ kattr.sched_nice = task_nice(p); -+ -+#ifdef CONFIG_UCLAMP_TASK -+ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; -+ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; -+#endif -+ -+ rcu_read_unlock(); -+ -+ return sched_attr_copy_to_user(uattr, &kattr, usize); -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) -+{ -+ cpumask_var_t cpus_allowed, new_mask; -+ struct task_struct *p; -+ int retval; -+ -+ get_online_cpus(); -+ rcu_read_lock(); -+ -+ p = find_process_by_pid(pid); -+ if (!p) { -+ rcu_read_unlock(); -+ put_online_cpus(); -+ return -ESRCH; -+ } -+ -+ /* Prevent p going away */ -+ get_task_struct(p); -+ rcu_read_unlock(); -+ -+ if (p->flags & PF_NO_SETAFFINITY) { -+ retval = -EINVAL; -+ goto out_put_task; -+ } -+ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { -+ retval = -ENOMEM; -+ 
goto out_put_task; -+ } -+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { -+ retval = -ENOMEM; -+ goto out_free_cpus_allowed; -+ } -+ retval = -EPERM; -+ if (!check_same_owner(p)) { -+ rcu_read_lock(); -+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { -+ rcu_read_unlock(); -+ goto out_unlock; -+ } -+ rcu_read_unlock(); -+ } -+ -+ retval = security_task_setscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ cpuset_cpus_allowed(p, cpus_allowed); -+ cpumask_and(new_mask, in_mask, cpus_allowed); -+again: -+ retval = __set_cpus_allowed_ptr(p, new_mask, true); -+ -+ if (!retval) { -+ cpuset_cpus_allowed(p, cpus_allowed); -+ if (!cpumask_subset(new_mask, cpus_allowed)) { -+ /* -+ * We must have raced with a concurrent cpuset -+ * update. Just reset the cpus_allowed to the -+ * cpuset's cpus_allowed -+ */ -+ cpumask_copy(new_mask, cpus_allowed); -+ goto again; -+ } -+ } -+out_unlock: -+ free_cpumask_var(new_mask); -+out_free_cpus_allowed: -+ free_cpumask_var(cpus_allowed); -+out_put_task: -+ put_task_struct(p); -+ put_online_cpus(); -+ return retval; -+} -+ -+static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, -+ struct cpumask *new_mask) -+{ -+ if (len < cpumask_size()) -+ cpumask_clear(new_mask); -+ else if (len > cpumask_size()) -+ len = cpumask_size(); -+ -+ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; -+} -+ -+/** -+ * sys_sched_setaffinity - set the CPU affinity of a process -+ * @pid: pid of the process -+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr -+ * @user_mask_ptr: user-space pointer to the new CPU mask -+ * -+ * Return: 0 on success. An error code otherwise. 
-+ */ -+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, -+ unsigned long __user *, user_mask_ptr) -+{ -+ cpumask_var_t new_mask; -+ int retval; -+ -+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) -+ return -ENOMEM; -+ -+ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); -+ if (retval == 0) -+ retval = sched_setaffinity(pid, new_mask); -+ free_cpumask_var(new_mask); -+ return retval; -+} -+ -+long sched_getaffinity(pid_t pid, cpumask_t *mask) -+{ -+ struct task_struct *p; -+ raw_spinlock_t *lock; -+ unsigned long flags; -+ int retval; -+ -+ rcu_read_lock(); -+ -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ task_access_lock_irqsave(p, &lock, &flags); -+ cpumask_and(mask, &p->cpus_mask, cpu_active_mask); -+ task_access_unlock_irqrestore(p, lock, &flags); -+ -+out_unlock: -+ rcu_read_unlock(); -+ -+ return retval; -+} -+ -+/** -+ * sys_sched_getaffinity - get the CPU affinity of a process -+ * @pid: pid of the process -+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr -+ * @user_mask_ptr: user-space pointer to hold the current CPU mask -+ * -+ * Return: size of CPU mask copied to user_mask_ptr on success. An -+ * error code otherwise. 
-+ */ -+SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, -+ unsigned long __user *, user_mask_ptr) -+{ -+ int ret; -+ cpumask_var_t mask; -+ -+ if ((len * BITS_PER_BYTE) < nr_cpu_ids) -+ return -EINVAL; -+ if (len & (sizeof(unsigned long)-1)) -+ return -EINVAL; -+ -+ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) -+ return -ENOMEM; -+ -+ ret = sched_getaffinity(pid, mask); -+ if (ret == 0) { -+ unsigned int retlen = min_t(size_t, len, cpumask_size()); -+ -+ if (copy_to_user(user_mask_ptr, mask, retlen)) -+ ret = -EFAULT; -+ else -+ ret = retlen; -+ } -+ free_cpumask_var(mask); -+ -+ return ret; -+} -+ -+/** -+ * sys_sched_yield - yield the current processor to other threads. -+ * -+ * This function yields the current CPU to other tasks. It does this by -+ * scheduling away the current task. If it still has the earliest deadline -+ * it will be scheduled again as the next task. -+ * -+ * Return: 0. -+ */ -+static void do_sched_yield(void) -+{ -+ struct rq *rq; -+ struct rq_flags rf; -+ -+ if (!sched_yield_type) -+ return; -+ -+ rq = this_rq_lock_irq(&rf); -+ -+ schedstat_inc(rq->yld_count); -+ -+ if (1 == sched_yield_type) { -+ if (!rt_task(current)) { -+ current->boost_prio = MAX_PRIORITY_ADJ; -+ requeue_task(current, rq); -+ } -+ } else if (2 == sched_yield_type) { -+ if (rq->nr_running > 1) -+ rq->skip = current; -+ } -+ -+ /* -+ * Since we are going to call schedule() anyway, there's -+ * no need to preempt or enable interrupts: -+ */ -+ preempt_disable(); -+ raw_spin_unlock(&rq->lock); -+ sched_preempt_enable_no_resched(); -+ -+ schedule(); -+} -+ -+SYSCALL_DEFINE0(sched_yield) -+{ -+ do_sched_yield(); -+ return 0; -+} -+ -+#ifndef CONFIG_PREEMPTION -+int __sched _cond_resched(void) -+{ -+ if (should_resched(0)) { -+ preempt_schedule_common(); -+ return 1; -+ } -+ rcu_all_qs(); -+ return 0; -+} -+EXPORT_SYMBOL(_cond_resched); -+#endif -+ -+/* -+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, -+ * call schedule, and on 
return reacquire the lock. -+ * -+ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level -+ * operations here to prevent schedule() from being called twice (once via -+ * spin_unlock(), once by hand). -+ */ -+int __cond_resched_lock(spinlock_t *lock) -+{ -+ int resched = should_resched(PREEMPT_LOCK_OFFSET); -+ int ret = 0; -+ -+ lockdep_assert_held(lock); -+ -+ if (spin_needbreak(lock) || resched) { -+ spin_unlock(lock); -+ if (resched) -+ preempt_schedule_common(); -+ else -+ cpu_relax(); -+ ret = 1; -+ spin_lock(lock); -+ } -+ return ret; -+} -+EXPORT_SYMBOL(__cond_resched_lock); -+ -+/** -+ * yield - yield the current processor to other threads. -+ * -+ * Do not ever use this function, there's a 99% chance you're doing it wrong. -+ * -+ * The scheduler is at all times free to pick the calling task as the most -+ * eligible task to run, if removing the yield() call from your code breaks -+ * it, its already broken. -+ * -+ * Typical broken usage is: -+ * -+ * while (!event) -+ * yield(); -+ * -+ * where one assumes that yield() will let 'the other' process run that will -+ * make event true. If the current task is a SCHED_FIFO task that will never -+ * happen. Never use yield() as a progress guarantee!! -+ * -+ * If you want to use yield() to wait for something, use wait_event(). -+ * If you want to use yield() to be 'nice' for others, use cond_resched(). -+ * If you still want to use yield(), do not! -+ */ -+void __sched yield(void) -+{ -+ set_current_state(TASK_RUNNING); -+ do_sched_yield(); -+} -+EXPORT_SYMBOL(yield); -+ -+/** -+ * yield_to - yield the current processor to another thread in -+ * your thread group, or accelerate that thread toward the -+ * processor it's on. -+ * @p: target task -+ * @preempt: whether task preemption is allowed or not -+ * -+ * It's the caller's job to ensure that the target task struct -+ * can't go away on us before we can do any checks. -+ * -+ * In BMQ, yield_to is not supported. 
-+ * -+ * Return: -+ * true (>0) if we indeed boosted the target task. -+ * false (0) if we failed to boost the target. -+ * -ESRCH if there's no task to yield to. -+ */ -+int __sched yield_to(struct task_struct *p, bool preempt) -+{ -+ return 0; -+} -+EXPORT_SYMBOL_GPL(yield_to); -+ -+int io_schedule_prepare(void) -+{ -+ int old_iowait = current->in_iowait; -+ -+ current->in_iowait = 1; -+ blk_schedule_flush_plug(current); -+ -+ return old_iowait; -+} -+ -+void io_schedule_finish(int token) -+{ -+ current->in_iowait = token; -+} -+ -+/* -+ * This task is about to go to sleep on IO. Increment rq->nr_iowait so -+ * that process accounting knows that this is a task in IO wait state. -+ * -+ * But don't do that if it is a deliberate, throttling IO wait (this task -+ * has set its backing_dev_info: the queue against which it should throttle) -+ */ -+ -+long __sched io_schedule_timeout(long timeout) -+{ -+ int token; -+ long ret; -+ -+ token = io_schedule_prepare(); -+ ret = schedule_timeout(timeout); -+ io_schedule_finish(token); -+ -+ return ret; -+} -+EXPORT_SYMBOL(io_schedule_timeout); -+ -+void __sched io_schedule(void) -+{ -+ int token; -+ -+ token = io_schedule_prepare(); -+ schedule(); -+ io_schedule_finish(token); -+} -+EXPORT_SYMBOL(io_schedule); -+ -+/** -+ * sys_sched_get_priority_max - return maximum RT priority. -+ * @policy: scheduling class. -+ * -+ * Return: On success, this syscall returns the maximum -+ * rt_priority that can be used by a given scheduling class. -+ * On failure, a negative error code is returned. -+ */ -+SYSCALL_DEFINE1(sched_get_priority_max, int, policy) -+{ -+ int ret = -EINVAL; -+ -+ switch (policy) { -+ case SCHED_FIFO: -+ case SCHED_RR: -+ ret = MAX_USER_RT_PRIO-1; -+ break; -+ case SCHED_NORMAL: -+ case SCHED_BATCH: -+ case SCHED_IDLE: -+ ret = 0; -+ break; -+ } -+ return ret; -+} -+ -+/** -+ * sys_sched_get_priority_min - return minimum RT priority. -+ * @policy: scheduling class. 
-+ * -+ * Return: On success, this syscall returns the minimum -+ * rt_priority that can be used by a given scheduling class. -+ * On failure, a negative error code is returned. -+ */ -+SYSCALL_DEFINE1(sched_get_priority_min, int, policy) -+{ -+ int ret = -EINVAL; -+ -+ switch (policy) { -+ case SCHED_FIFO: -+ case SCHED_RR: -+ ret = 1; -+ break; -+ case SCHED_NORMAL: -+ case SCHED_BATCH: -+ case SCHED_IDLE: -+ ret = 0; -+ break; -+ } -+ return ret; -+} -+ -+static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) -+{ -+ struct task_struct *p; -+ int retval; -+ -+ if (pid < 0) -+ return -EINVAL; -+ -+ retval = -ESRCH; -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ rcu_read_unlock(); -+ -+ *t = ns_to_timespec64(SCHED_TIMESLICE_NS); -+ return 0; -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+/** -+ * sys_sched_rr_get_interval - return the default timeslice of a process. -+ * @pid: pid of the process. -+ * @interval: userspace pointer to the timeslice value. -+ * -+ * -+ * Return: On success, 0 and the timeslice is in @interval. Otherwise, -+ * an error code. 
-+ */ -+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, -+ struct __kernel_timespec __user *, interval) -+{ -+ struct timespec64 t; -+ int retval = sched_rr_get_interval(pid, &t); -+ -+ if (retval == 0) -+ retval = put_timespec64(&t, interval); -+ -+ return retval; -+} -+ -+#ifdef CONFIG_COMPAT_32BIT_TIME -+SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, -+ struct old_timespec32 __user *, interval) -+{ -+ struct timespec64 t; -+ int retval = sched_rr_get_interval(pid, &t); -+ -+ if (retval == 0) -+ retval = put_old_timespec32(&t, interval); -+ return retval; -+} -+#endif -+ -+void sched_show_task(struct task_struct *p) -+{ -+ unsigned long free = 0; -+ int ppid; -+ -+ if (!try_get_task_stack(p)) -+ return; -+ -+ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); -+ -+ if (p->state == TASK_RUNNING) -+ printk(KERN_CONT " running task "); -+#ifdef CONFIG_DEBUG_STACK_USAGE -+ free = stack_not_used(p); -+#endif -+ ppid = 0; -+ rcu_read_lock(); -+ if (pid_alive(p)) -+ ppid = task_pid_nr(rcu_dereference(p->real_parent)); -+ rcu_read_unlock(); -+ printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, -+ task_pid_nr(p), ppid, -+ (unsigned long)task_thread_info(p)->flags); -+ -+ print_worker_info(KERN_INFO, p); -+ show_stack(p, NULL); -+ put_task_stack(p); -+} -+EXPORT_SYMBOL_GPL(sched_show_task); -+ -+static inline bool -+state_filter_match(unsigned long state_filter, struct task_struct *p) -+{ -+ /* no filter, everything matches */ -+ if (!state_filter) -+ return true; -+ -+ /* filter, but doesn't match */ -+ if (!(p->state & state_filter)) -+ return false; -+ -+ /* -+ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows -+ * TASK_KILLABLE). 
-+ */ -+ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) -+ return false; -+ -+ return true; -+} -+ -+ -+void show_state_filter(unsigned long state_filter) -+{ -+ struct task_struct *g, *p; -+ -+#if BITS_PER_LONG == 32 -+ printk(KERN_INFO -+ " task PC stack pid father\n"); -+#else -+ printk(KERN_INFO -+ " task PC stack pid father\n"); -+#endif -+ rcu_read_lock(); -+ for_each_process_thread(g, p) { -+ /* -+ * reset the NMI-timeout, listing all files on a slow -+ * console might take a lot of time: -+ * Also, reset softlockup watchdogs on all CPUs, because -+ * another CPU might be blocked waiting for us to process -+ * an IPI. -+ */ -+ touch_nmi_watchdog(); -+ touch_all_softlockup_watchdogs(); -+ if (state_filter_match(state_filter, p)) -+ sched_show_task(p); -+ } -+ -+#ifdef CONFIG_SCHED_DEBUG -+ /* TODO: BMQ should support this -+ if (!state_filter) -+ sysrq_sched_debug_show(); -+ */ -+#endif -+ rcu_read_unlock(); -+ /* -+ * Only show locks if all tasks are dumped: -+ */ -+ if (!state_filter) -+ debug_show_all_locks(); -+} -+ -+void dump_cpu_task(int cpu) -+{ -+ pr_info("Task dump for CPU %d:\n", cpu); -+ sched_show_task(cpu_curr(cpu)); -+} -+ -+/** -+ * init_idle - set up an idle thread for a given CPU -+ * @idle: task in question -+ * @cpu: cpu the idle task belongs to -+ * -+ * NOTE: this function does not set the idle thread's NEED_RESCHED -+ * flag, to make booting more robust. 
-+ */ -+void init_idle(struct task_struct *idle, int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ raw_spin_lock_irqsave(&idle->pi_lock, flags); -+ raw_spin_lock(&rq->lock); -+ update_rq_clock(rq); -+ -+ idle->last_ran = rq->clock_task; -+ idle->state = TASK_RUNNING; -+ idle->flags |= PF_IDLE; -+ /* Setting prio to illegal value shouldn't matter when never queued */ -+ idle->prio = MAX_PRIO; -+ -+ idle->bmq_idx = IDLE_TASK_SCHED_PRIO; -+ bmq_init_idle(&rq->queue, idle); -+ -+ kasan_unpoison_task_stack(idle); -+ -+#ifdef CONFIG_SMP -+ /* -+ * It's possible that init_idle() gets called multiple times on a task, -+ * in that case do_set_cpus_allowed() will not do the right thing. -+ * -+ * And since this is boot we can forgo the serialisation. -+ */ -+ set_cpus_allowed_common(idle, cpumask_of(cpu)); -+#endif -+ -+ /* Silence PROVE_RCU */ -+ rcu_read_lock(); -+ __set_task_cpu(idle, cpu); -+ rcu_read_unlock(); -+ -+ rq->idle = idle; -+ rcu_assign_pointer(rq->curr, idle); -+ idle->on_cpu = 1; -+ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&idle->pi_lock, flags); -+ -+ /* Set the preempt count _outside_ the spinlocks! */ -+ init_idle_preempt_count(idle, cpu); -+ -+ ftrace_graph_init_idle_task(idle, cpu); -+ vtime_init_idle(idle, cpu); -+#ifdef CONFIG_SMP -+ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); -+#endif -+} -+ -+static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) -+{ -+ struct wake_q_node *node = &task->wake_q; -+ -+ /* -+ * Atomically grab the task, if ->wake_q is !nil already it means -+ * its already queued (either by us or someone else) and will get the -+ * wakeup due to that. -+ * -+ * In order to ensure that a pending wakeup will observe our pending -+ * state, even in the failed case, an explicit smp_mb() must be used. 
-+ */ -+ smp_mb__before_atomic(); -+ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) -+ return false; -+ -+ /* -+ * The head is context local, there can be no concurrency. -+ */ -+ *head->lastp = node; -+ head->lastp = &node->next; -+ return true; -+} -+ -+/** -+ * wake_q_add() - queue a wakeup for 'later' waking. -+ * @head: the wake_q_head to add @task to -+ * @task: the task to queue for 'later' wakeup -+ * -+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the -+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come -+ * instantly. -+ * -+ * This function must be used as-if it were wake_up_process(); IOW the task -+ * must be ready to be woken at this location. -+ */ -+void wake_q_add(struct wake_q_head *head, struct task_struct *task) -+{ -+ if (__wake_q_add(head, task)) -+ get_task_struct(task); -+} -+ -+/** -+ * wake_q_add_safe() - safely queue a wakeup for 'later' waking. -+ * @head: the wake_q_head to add @task to -+ * @task: the task to queue for 'later' wakeup -+ * -+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the -+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come -+ * instantly. -+ * -+ * This function must be used as-if it were wake_up_process(); IOW the task -+ * must be ready to be woken at this location. -+ * -+ * This function is essentially a task-safe equivalent to wake_q_add(). Callers -+ * that already hold reference to @task can call the 'safe' version and trust -+ * wake_q to do the right thing depending whether or not the @task is already -+ * queued for wakeup. 
-+ */ -+void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) -+{ -+ if (!__wake_q_add(head, task)) -+ put_task_struct(task); -+} -+ -+void wake_up_q(struct wake_q_head *head) -+{ -+ struct wake_q_node *node = head->first; -+ -+ while (node != WAKE_Q_TAIL) { -+ struct task_struct *task; -+ -+ task = container_of(node, struct task_struct, wake_q); -+ BUG_ON(!task); -+ /* task can safely be re-inserted now: */ -+ node = node->next; -+ task->wake_q.next = NULL; -+ -+ /* -+ * wake_up_process() executes a full barrier, which pairs with -+ * the queueing in wake_q_add() so as not to miss wakeups. -+ */ -+ wake_up_process(task); -+ put_task_struct(task); -+ } -+} -+ -+#ifdef CONFIG_SMP -+ -+int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, -+ const struct cpumask __maybe_unused *trial) -+{ -+ return 1; -+} -+ -+int task_can_attach(struct task_struct *p, -+ const struct cpumask *cs_cpus_allowed) -+{ -+ int ret = 0; -+ -+ /* -+ * Kthreads which disallow setaffinity shouldn't be moved -+ * to a new cpuset; we don't want to change their CPU -+ * affinity and isolating such threads by their set of -+ * allowed nodes is unnecessary. Thus, cpusets are not -+ * applicable for such threads. This prevents checking for -+ * success of set_cpus_allowed_ptr() on all attached tasks -+ * before cpus_mask may be changed. -+ */ -+ if (p->flags & PF_NO_SETAFFINITY) -+ ret = -EINVAL; -+ -+ return ret; -+} -+ -+static bool sched_smp_initialized __read_mostly; -+ -+#ifdef CONFIG_NO_HZ_COMMON -+void nohz_balance_enter_idle(int cpu) -+{ -+} -+ -+void select_nohz_load_balancer(int stop_tick) -+{ -+} -+ -+void set_cpu_sd_state_idle(void) {} -+ -+/* -+ * In the semi idle case, use the nearest busy CPU for migrating timers -+ * from an idle CPU. This is good for power-savings. 
-+ * -+ * We don't do similar optimization for completely idle system, as -+ * selecting an idle CPU will add more delays to the timers than intended -+ * (as that CPU's timer base may not be uptodate wrt jiffies etc). -+ */ -+int get_nohz_timer_target(void) -+{ -+ int i, cpu = smp_processor_id(); -+ struct cpumask *mask; -+ -+ if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER)) -+ return cpu; -+ -+ for (mask = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); -+ mask < per_cpu(sched_cpu_affinity_end_mask, cpu); mask++) -+ for_each_cpu(i, mask) -+ if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) -+ return i; -+ -+ if (!housekeeping_cpu(cpu, HK_FLAG_TIMER)) -+ cpu = housekeeping_any_cpu(HK_FLAG_TIMER); -+ -+ return cpu; -+} -+ -+/* -+ * When add_timer_on() enqueues a timer into the timer wheel of an -+ * idle CPU then this timer might expire before the next timer event -+ * which is scheduled to wake up that CPU. In case of a completely -+ * idle system the next event might even be infinite time into the -+ * future. wake_up_idle_cpu() ensures that the CPU is woken up and -+ * leaves the inner idle loop so the newly added timer is taken into -+ * account when the CPU goes back to idle and evaluates the timer -+ * wheel for the next timer event. -+ */ -+void wake_up_idle_cpu(int cpu) -+{ -+ if (cpu == smp_processor_id()) -+ return; -+ -+ set_tsk_need_resched(cpu_rq(cpu)->idle); -+ smp_send_reschedule(cpu); -+} -+ -+void wake_up_nohz_cpu(int cpu) -+{ -+ wake_up_idle_cpu(cpu); -+} -+#endif /* CONFIG_NO_HZ_COMMON */ -+ -+#ifdef CONFIG_HOTPLUG_CPU -+/* -+ * Ensures that the idle task is using init_mm right before its CPU goes -+ * offline. 
-+ */ -+void idle_task_exit(void) -+{ -+ struct mm_struct *mm = current->active_mm; -+ -+ BUG_ON(cpu_online(smp_processor_id())); -+ -+ if (mm != &init_mm) { -+ switch_mm(mm, &init_mm, current); -+ current->active_mm = &init_mm; -+ finish_arch_post_lock_switch(); -+ } -+ mmdrop(mm); -+} -+ -+/* -+ * Migrate all tasks from the rq, sleeping tasks will be migrated by -+ * try_to_wake_up()->select_task_rq(). -+ * -+ * Called with rq->lock held even though we'er in stop_machine() and -+ * there's no concurrency possible, we hold the required locks anyway -+ * because of lock validation efforts. -+ */ -+static void migrate_tasks(struct rq *dead_rq) -+{ -+ struct rq *rq = dead_rq; -+ struct task_struct *p, *stop = rq->stop; -+ int count = 0; -+ -+ /* -+ * Fudge the rq selection such that the below task selection loop -+ * doesn't get stuck on the currently eligible stop task. -+ * -+ * We're currently inside stop_machine() and the rq is either stuck -+ * in the stop_machine_cpu_stop() loop, or we're executing this code, -+ * either way we should never end up calling schedule() until we're -+ * done here. -+ */ -+ rq->stop = NULL; -+ -+ p = rq_first_bmq_task(rq); -+ while (p != rq->idle) { -+ int dest_cpu; -+ -+ /* skip the running task */ -+ if (task_running(p) || 1 == p->nr_cpus_allowed) { -+ p = rq_next_bmq_task(p, rq); -+ continue; -+ } -+ -+ /* -+ * Rules for changing task_struct::cpus_allowed are holding -+ * both pi_lock and rq->lock, such that holding either -+ * stabilizes the mask. -+ * -+ * Drop rq->lock is not quite as disastrous as it usually is -+ * because !cpu_active at this point, which means load-balance -+ * will not interfere. Also, stop-machine. -+ */ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_lock(&p->pi_lock); -+ raw_spin_lock(&rq->lock); -+ -+ /* -+ * Since we're inside stop-machine, _nothing_ should have -+ * changed the task, WARN if weird stuff happened, because in -+ * that case the above rq->lock drop is a fail too. 
-+ */ -+ if (WARN_ON(task_rq(p) != rq || !task_on_rq_queued(p))) { -+ raw_spin_unlock(&p->pi_lock); -+ p = rq_next_bmq_task(p, rq); -+ continue; -+ } -+ -+ count++; -+ /* Find suitable destination for @next, with force if needed. */ -+ dest_cpu = select_fallback_rq(dead_rq->cpu, p); -+ rq = __migrate_task(rq, p, dest_cpu); -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock(&p->pi_lock); -+ -+ rq = dead_rq; -+ raw_spin_lock(&rq->lock); -+ /* Check queued task all over from the header again */ -+ p = rq_first_bmq_task(rq); -+ } -+ -+ rq->stop = stop; -+} -+ -+static void set_rq_offline(struct rq *rq) -+{ -+ if (rq->online) -+ rq->online = false; -+} -+#endif /* CONFIG_HOTPLUG_CPU */ -+ -+static void set_rq_online(struct rq *rq) -+{ -+ if (!rq->online) -+ rq->online = true; -+} -+ -+#ifdef CONFIG_SCHED_DEBUG -+ -+static __read_mostly int sched_debug_enabled; -+ -+static int __init sched_debug_setup(char *str) -+{ -+ sched_debug_enabled = 1; -+ -+ return 0; -+} -+early_param("sched_debug", sched_debug_setup); -+ -+static inline bool sched_debug(void) -+{ -+ return sched_debug_enabled; -+} -+#else /* !CONFIG_SCHED_DEBUG */ -+static inline bool sched_debug(void) -+{ -+ return false; -+} -+#endif /* CONFIG_SCHED_DEBUG */ -+ -+#ifdef CONFIG_SMP -+void scheduler_ipi(void) -+{ -+ /* -+ * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting -+ * TIF_NEED_RESCHED remotely (for the first time) will also send -+ * this IPI. 
-+ */ -+ preempt_fold_need_resched(); -+ -+ if (!idle_cpu(smp_processor_id()) || need_resched()) -+ return; -+ -+ irq_enter(); -+ irq_exit(); -+} -+ -+void wake_up_if_idle(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ rcu_read_lock(); -+ -+ if (!is_idle_task(rcu_dereference(rq->curr))) -+ goto out; -+ -+ if (set_nr_if_polling(rq->idle)) { -+ trace_sched_wake_idle_without_ipi(cpu); -+ } else { -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ if (is_idle_task(rq->curr)) -+ smp_send_reschedule(cpu); -+ /* Else CPU is not idle, do nothing here */ -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ } -+ -+out: -+ rcu_read_unlock(); -+} -+ -+bool cpus_share_cache(int this_cpu, int that_cpu) -+{ -+ return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); -+} -+#endif /* CONFIG_SMP */ -+ -+/* -+ * Topology list, bottom-up. -+ */ -+static struct sched_domain_topology_level default_topology[] = { -+#ifdef CONFIG_SCHED_SMT -+ { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, -+#endif -+#ifdef CONFIG_SCHED_MC -+ { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, -+#endif -+ { cpu_cpu_mask, SD_INIT_NAME(DIE) }, -+ { NULL, }, -+}; -+ -+static struct sched_domain_topology_level *sched_domain_topology = -+ default_topology; -+ -+#define for_each_sd_topology(tl) \ -+ for (tl = sched_domain_topology; tl->mask; tl++) -+ -+void set_sched_topology(struct sched_domain_topology_level *tl) -+{ -+ if (WARN_ON_ONCE(sched_smp_initialized)) -+ return; -+ -+ sched_domain_topology = tl; -+} -+ -+/* -+ * Initializers for schedule domains -+ * Non-inlined to reduce accumulated stack pressure in build_sched_domains() -+ */ -+ -+int sched_domain_level_max; -+ -+/* -+ * Partition sched domains as specified by the 'ndoms_new' -+ * cpumasks in the array doms_new[] of cpumasks. This compares -+ * doms_new[] to the current sched domain partitioning, doms_cur[]. -+ * It destroys each deleted domain and builds each new domain. 
-+ * -+ * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. -+ * The masks don't intersect (don't overlap.) We should setup one -+ * sched domain for each mask. CPUs not in any of the cpumasks will -+ * not be load balanced. If the same cpumask appears both in the -+ * current 'doms_cur' domains and in the new 'doms_new', we can leave -+ * it as it is. -+ * -+ * The passed in 'doms_new' should be allocated using -+ * alloc_sched_domains. This routine takes ownership of it and will -+ * free_sched_domains it when done with it. If the caller failed the -+ * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, -+ * and partition_sched_domains() will fallback to the single partition -+ * 'fallback_doms', it also forces the domains to be rebuilt. -+ * -+ * If doms_new == NULL it will be replaced with cpu_online_mask. -+ * ndoms_new == 0 is a special case for destroying existing domains, -+ * and it will not create the default domain. -+ * -+ * Call with hotplug lock held -+ */ -+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], -+ struct sched_domain_attr *dattr_new) -+{ -+ /** -+ * BMQ doesn't depend on sched domains, but just keep this api -+ */ -+} -+ -+/* -+ * used to mark begin/end of suspend/resume: -+ */ -+static int num_cpus_frozen; -+ -+#ifdef CONFIG_NUMA -+int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; -+ -+/* -+ * sched_numa_find_closest() - given the NUMA topology, find the cpu -+ * closest to @cpu from @cpumask. -+ * cpumask: cpumask to find a cpu from -+ * cpu: cpu to be close to -+ * -+ * returns: cpu, or nr_cpu_ids when nothing found. -+ */ -+int sched_numa_find_closest(const struct cpumask *cpus, int cpu) -+{ -+ return best_mask_cpu(cpu, cpus); -+} -+#endif /* CONFIG_NUMA */ -+ -+/* -+ * Update cpusets according to cpu_active mask. If cpusets are -+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper -+ * around partition_sched_domains(). 
-+ * -+ * If we come here as part of a suspend/resume, don't touch cpusets because we -+ * want to restore it back to its original state upon resume anyway. -+ */ -+static void cpuset_cpu_active(void) -+{ -+ if (cpuhp_tasks_frozen) { -+ /* -+ * num_cpus_frozen tracks how many CPUs are involved in suspend -+ * resume sequence. As long as this is not the last online -+ * operation in the resume sequence, just build a single sched -+ * domain, ignoring cpusets. -+ */ -+ partition_sched_domains(1, NULL, NULL); -+ if (--num_cpus_frozen) -+ return; -+ /* -+ * This is the last CPU online operation. So fall through and -+ * restore the original sched domains by considering the -+ * cpuset configurations. -+ */ -+ cpuset_force_rebuild(); -+ } -+ -+ cpuset_update_active_cpus(); -+} -+ -+static int cpuset_cpu_inactive(unsigned int cpu) -+{ -+ if (!cpuhp_tasks_frozen) { -+ cpuset_update_active_cpus(); -+ } else { -+ num_cpus_frozen++; -+ partition_sched_domains(1, NULL, NULL); -+ } -+ return 0; -+} -+ -+int sched_cpu_activate(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+#ifdef CONFIG_SCHED_SMT -+ /* -+ * When going up, increment the number of cores with SMT present. -+ */ -+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) -+ static_branch_inc_cpuslocked(&sched_smt_present); -+#endif -+ set_cpu_active(cpu, true); -+ -+ if (sched_smp_initialized) -+ cpuset_cpu_active(); -+ -+ /* -+ * Put the rq online, if not already. This happens: -+ * -+ * 1) In the early boot process, because we build the real domains -+ * after all cpus have been brought up. -+ * -+ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the -+ * domains. 
-+ */ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ set_rq_online(rq); -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+ return 0; -+} -+ -+int sched_cpu_deactivate(unsigned int cpu) -+{ -+ int ret; -+ -+ set_cpu_active(cpu, false); -+ /* -+ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU -+ * users of this state to go away such that all new such users will -+ * observe it. -+ * -+ * Do sync before park smpboot threads to take care the rcu boost case. -+ */ -+ synchronize_rcu(); -+ -+#ifdef CONFIG_SCHED_SMT -+ /* -+ * When going down, decrement the number of cores with SMT present. -+ */ -+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) { -+ static_branch_dec_cpuslocked(&sched_smt_present); -+ if (!static_branch_likely(&sched_smt_present)) -+ cpumask_clear(&sched_sg_idle_mask); -+ } -+#endif -+ -+ if (!sched_smp_initialized) -+ return 0; -+ -+ ret = cpuset_cpu_inactive(cpu); -+ if (ret) { -+ set_cpu_active(cpu, true); -+ return ret; -+ } -+ return 0; -+} -+ -+static void sched_rq_cpu_starting(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ rq->calc_load_update = calc_load_update; -+} -+ -+int sched_cpu_starting(unsigned int cpu) -+{ -+ sched_rq_cpu_starting(cpu); -+ sched_tick_start(cpu); -+ return 0; -+} -+ -+#ifdef CONFIG_HOTPLUG_CPU -+int sched_cpu_dying(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ sched_tick_stop(cpu); -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ set_rq_offline(rq); -+ migrate_tasks(rq); -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+ hrtick_clear(rq); -+ return 0; -+} -+#endif -+ -+#ifdef CONFIG_SMP -+static void sched_init_topology_cpumask_early(void) -+{ -+ int cpu, level; -+ cpumask_t *tmp; -+ -+ for_each_possible_cpu(cpu) { -+ for (level = 0; level < NR_CPU_AFFINITY_CHK_LEVEL; level++) { -+ tmp = &(per_cpu(sched_cpu_affinity_masks, cpu)[level]); -+ cpumask_copy(tmp, cpu_possible_mask); -+ cpumask_clear_cpu(cpu, tmp); -+ } -+ 
per_cpu(sched_cpu_affinity_end_mask, cpu) = -+ &(per_cpu(sched_cpu_affinity_masks, cpu)[1]); -+ } -+} -+ -+static void sched_init_topology_cpumask(void) -+{ -+ int cpu; -+ cpumask_t *chk; -+ -+ for_each_online_cpu(cpu) { -+ chk = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); -+ -+#ifdef CONFIG_SCHED_SMT -+ cpumask_setall(chk); -+ cpumask_clear_cpu(cpu, chk); -+ if (cpumask_and(chk, chk, topology_sibling_cpumask(cpu))) { -+ printk(KERN_INFO "bmq: cpu #%d affinity check mask - smt 0x%08lx", -+ cpu, (chk++)->bits[0]); -+ } -+ cpumask_complement(chk, topology_sibling_cpumask(cpu)); -+#else -+ cpumask_clear_cpu(cpu, chk); -+#endif -+#ifdef CONFIG_SCHED_MC -+ if (cpumask_and(chk, chk, cpu_coregroup_mask(cpu))) -+ printk(KERN_INFO "bmq: cpu #%d affinity check mask - coregroup 0x%08lx", -+ cpu, (chk++)->bits[0]); -+ cpumask_complement(chk, cpu_coregroup_mask(cpu)); -+ -+ /** -+ * Set up sd_llc_id per CPU -+ */ -+ per_cpu(sd_llc_id, cpu) = -+ cpumask_first(cpu_coregroup_mask(cpu)); -+#else -+ per_cpu(sd_llc_id, cpu) = -+ cpumask_first(topology_core_cpumask(cpu)); -+ -+ cpumask_setall(chk); -+ cpumask_clear_cpu(cpu, chk); -+#endif /* NOT CONFIG_SCHED_MC */ -+ if (cpumask_and(chk, chk, topology_core_cpumask(cpu))) -+ printk(KERN_INFO "bmq: cpu #%d affinity check mask - core 0x%08lx", -+ cpu, (chk++)->bits[0]); -+ cpumask_complement(chk, topology_core_cpumask(cpu)); -+ -+ if (cpumask_and(chk, chk, cpu_online_mask)) -+ printk(KERN_INFO "bmq: cpu #%d affinity check mask - others 0x%08lx", -+ cpu, (chk++)->bits[0]); -+ -+ per_cpu(sched_cpu_affinity_end_mask, cpu) = chk; -+ } -+} -+#endif -+ -+void __init sched_init_smp(void) -+{ -+ /* Move init over to a non-isolated CPU */ -+ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) -+ BUG(); -+ -+ sched_init_topology_cpumask(); -+ -+ sched_smp_initialized = true; -+} -+#else -+void __init sched_init_smp(void) -+{ -+} -+#endif /* CONFIG_SMP */ -+ -+int in_sched_functions(unsigned long addr) -+{ -+ return 
in_lock_functions(addr) || -+ (addr >= (unsigned long)__sched_text_start -+ && addr < (unsigned long)__sched_text_end); -+} -+ -+#ifdef CONFIG_CGROUP_SCHED -+/* task group related information */ -+struct task_group { -+ struct cgroup_subsys_state css; -+ -+ struct rcu_head rcu; -+ struct list_head list; -+ -+ struct task_group *parent; -+ struct list_head siblings; -+ struct list_head children; -+}; -+ -+/* -+ * Default task group. -+ * Every task in system belongs to this group at bootup. -+ */ -+struct task_group root_task_group; -+LIST_HEAD(task_groups); -+ -+/* Cacheline aligned slab cache for task_group */ -+static struct kmem_cache *task_group_cache __read_mostly; -+#endif /* CONFIG_CGROUP_SCHED */ -+ -+void __init sched_init(void) -+{ -+ int i; -+ struct rq *rq; -+ -+ print_scheduler_version(); -+ -+ wait_bit_init(); -+ -+#ifdef CONFIG_SMP -+ for (i = 0; i < bmq_BITS; i++) -+ cpumask_copy(&sched_rq_watermark[i], cpu_present_mask); -+#endif -+ -+#ifdef CONFIG_CGROUP_SCHED -+ task_group_cache = KMEM_CACHE(task_group, 0); -+ -+ list_add(&root_task_group.list, &task_groups); -+ INIT_LIST_HEAD(&root_task_group.children); -+ INIT_LIST_HEAD(&root_task_group.siblings); -+#endif /* CONFIG_CGROUP_SCHED */ -+ for_each_possible_cpu(i) { -+ rq = cpu_rq(i); -+ -+ bmq_init(&rq->queue); -+ rq->watermark = IDLE_WM; -+ rq->skip = NULL; -+ -+ raw_spin_lock_init(&rq->lock); -+ rq->nr_running = rq->nr_uninterruptible = 0; -+ rq->calc_load_active = 0; -+ rq->calc_load_update = jiffies + LOAD_FREQ; -+#ifdef CONFIG_SMP -+ rq->online = false; -+ rq->cpu = i; -+ -+#ifdef CONFIG_SCHED_SMT -+ rq->active_balance = 0; -+#endif -+#endif -+ rq->nr_switches = 0; -+ atomic_set(&rq->nr_iowait, 0); -+ hrtick_rq_init(rq); -+ } -+#ifdef CONFIG_SMP -+ /* Set rq->online for cpu 0 */ -+ cpu_rq(0)->online = true; -+#endif -+ -+ /* -+ * The boot idle thread does lazy MMU switching as well: -+ */ -+ mmgrab(&init_mm); -+ enter_lazy_tlb(&init_mm, current); -+ -+ /* -+ * Make us the idle thread. 
Technically, schedule() should not be -+ * called from this thread, however somewhere below it might be, -+ * but because we are the idle thread, we just pick up running again -+ * when this runqueue becomes "idle". -+ */ -+ init_idle(current, smp_processor_id()); -+ -+ calc_load_update = jiffies + LOAD_FREQ; -+ -+#ifdef CONFIG_SMP -+ idle_thread_set_boot_cpu(); -+ -+ sched_init_topology_cpumask_early(); -+#endif /* SMP */ -+ -+ init_schedstats(); -+ -+ psi_init(); -+} -+ -+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP -+static inline int preempt_count_equals(int preempt_offset) -+{ -+ int nested = preempt_count() + rcu_preempt_depth(); -+ -+ return (nested == preempt_offset); -+} -+ -+void __might_sleep(const char *file, int line, int preempt_offset) -+{ -+ /* -+ * Blocking primitives will set (and therefore destroy) current->state, -+ * since we will exit with TASK_RUNNING make sure we enter with it, -+ * otherwise we will destroy state. -+ */ -+ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, -+ "do not call blocking ops when !TASK_RUNNING; " -+ "state=%lx set at [<%p>] %pS\n", -+ current->state, -+ (void *)current->task_state_change, -+ (void *)current->task_state_change); -+ -+ ___might_sleep(file, line, preempt_offset); -+} -+EXPORT_SYMBOL(__might_sleep); -+ -+void ___might_sleep(const char *file, int line, int preempt_offset) -+{ -+ /* Ratelimiting timestamp: */ -+ static unsigned long prev_jiffy; -+ -+ unsigned long preempt_disable_ip; -+ -+ /* WARN_ON_ONCE() by default, no rate limit required: */ -+ rcu_sleep_check(); -+ -+ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && -+ !is_idle_task(current) && !current->non_block_count) || -+ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || -+ oops_in_progress) -+ return; -+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) -+ return; -+ prev_jiffy = jiffies; -+ -+ /* Save this before calling printk(), since that will clobber it: */ -+ preempt_disable_ip = 
get_preempt_disable_ip(current); -+ -+ printk(KERN_ERR -+ "BUG: sleeping function called from invalid context at %s:%d\n", -+ file, line); -+ printk(KERN_ERR -+ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", -+ in_atomic(), irqs_disabled(), current->non_block_count, -+ current->pid, current->comm); -+ -+ if (task_stack_end_corrupted(current)) -+ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); -+ -+ debug_show_held_locks(current); -+ if (irqs_disabled()) -+ print_irqtrace_events(current); -+#ifdef CONFIG_DEBUG_PREEMPT -+ if (!preempt_count_equals(preempt_offset)) { -+ pr_err("Preemption disabled at:"); -+ print_ip_sym(preempt_disable_ip); -+ pr_cont("\n"); -+ } -+#endif -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+EXPORT_SYMBOL(___might_sleep); -+ -+void __cant_sleep(const char *file, int line, int preempt_offset) -+{ -+ static unsigned long prev_jiffy; -+ -+ if (irqs_disabled()) -+ return; -+ -+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) -+ return; -+ -+ if (preempt_count() > preempt_offset) -+ return; -+ -+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) -+ return; -+ prev_jiffy = jiffies; -+ -+ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); -+ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", -+ in_atomic(), irqs_disabled(), -+ current->pid, current->comm); -+ -+ debug_show_held_locks(current); -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+EXPORT_SYMBOL_GPL(__cant_sleep); -+#endif -+ -+#ifdef CONFIG_MAGIC_SYSRQ -+void normalize_rt_tasks(void) -+{ -+ struct task_struct *g, *p; -+ struct sched_attr attr = { -+ .sched_policy = SCHED_NORMAL, -+ }; -+ -+ read_lock(&tasklist_lock); -+ for_each_process_thread(g, p) { -+ /* -+ * Only normalize user tasks: -+ */ -+ if (p->flags & PF_KTHREAD) -+ continue; -+ -+ if (!rt_task(p)) { -+ /* -+ * Renice negative nice level userspace -+ * tasks back to 0: -+ */ -+ if (task_nice(p) < 0) 
-+ set_user_nice(p, 0); -+ continue; -+ } -+ -+ __sched_setscheduler(p, &attr, false, false); -+ } -+ read_unlock(&tasklist_lock); -+} -+#endif /* CONFIG_MAGIC_SYSRQ */ -+ -+#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) -+/* -+ * These functions are only useful for the IA64 MCA handling, or kdb. -+ * -+ * They can only be called when the whole system has been -+ * stopped - every CPU needs to be quiescent, and no scheduling -+ * activity can take place. Using them for anything else would -+ * be a serious bug, and as a result, they aren't even visible -+ * under any other configuration. -+ */ -+ -+/** -+ * curr_task - return the current task for a given CPU. -+ * @cpu: the processor in question. -+ * -+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! -+ * -+ * Return: The current task for @cpu. -+ */ -+struct task_struct *curr_task(int cpu) -+{ -+ return cpu_curr(cpu); -+} -+ -+#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ -+ -+#ifdef CONFIG_IA64 -+/** -+ * ia64_set_curr_task - set the current task for a given CPU. -+ * @cpu: the processor in question. -+ * @p: the task pointer to set. -+ * -+ * Description: This function must only be used when non-maskable interrupts -+ * are serviced on a separate stack. It allows the architecture to switch the -+ * notion of the current task on a CPU in a non-blocking manner. This function -+ * must be called with all CPU's synchronised, and interrupts disabled, the -+ * and caller must save the original value of the current task (see -+ * curr_task() above) and restore that value before reenabling interrupts and -+ * re-starting the system. -+ * -+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 
-+ */ -+void ia64_set_curr_task(int cpu, struct task_struct *p) -+{ -+ cpu_curr(cpu) = p; -+} -+ -+#endif -+ -+#ifdef CONFIG_SCHED_DEBUG -+void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, -+ struct seq_file *m) -+{ -+ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), -+ get_nr_threads(p)); -+} -+ -+void proc_sched_set_task(struct task_struct *p) -+{} -+#endif -+ -+#ifdef CONFIG_CGROUP_SCHED -+static void sched_free_group(struct task_group *tg) -+{ -+ kmem_cache_free(task_group_cache, tg); -+} -+ -+/* allocate runqueue etc for a new task group */ -+struct task_group *sched_create_group(struct task_group *parent) -+{ -+ struct task_group *tg; -+ -+ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); -+ if (!tg) -+ return ERR_PTR(-ENOMEM); -+ -+ return tg; -+} -+ -+void sched_online_group(struct task_group *tg, struct task_group *parent) -+{ -+} -+ -+/* rcu callback to free various structures associated with a task group */ -+static void sched_free_group_rcu(struct rcu_head *rhp) -+{ -+ /* Now it should be safe to free those cfs_rqs */ -+ sched_free_group(container_of(rhp, struct task_group, rcu)); -+} -+ -+void sched_destroy_group(struct task_group *tg) -+{ -+ /* Wait for possible concurrent references to cfs_rqs complete */ -+ call_rcu(&tg->rcu, sched_free_group_rcu); -+} -+ -+void sched_offline_group(struct task_group *tg) -+{ -+} -+ -+static inline struct task_group *css_tg(struct cgroup_subsys_state *css) -+{ -+ return css ? 
container_of(css, struct task_group, css) : NULL; -+} -+ -+static struct cgroup_subsys_state * -+cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) -+{ -+ struct task_group *parent = css_tg(parent_css); -+ struct task_group *tg; -+ -+ if (!parent) { -+ /* This is early initialization for the top cgroup */ -+ return &root_task_group.css; -+ } -+ -+ tg = sched_create_group(parent); -+ if (IS_ERR(tg)) -+ return ERR_PTR(-ENOMEM); -+ return &tg->css; -+} -+ -+/* Expose task group only after completing cgroup initialization */ -+static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ struct task_group *parent = css_tg(css->parent); -+ -+ if (parent) -+ sched_online_group(tg, parent); -+ return 0; -+} -+ -+static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ -+ sched_offline_group(tg); -+} -+ -+static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ -+ /* -+ * Relies on the RCU grace period between css_released() and this. 
-+ */ -+ sched_free_group(tg); -+} -+ -+static void cpu_cgroup_fork(struct task_struct *task) -+{ -+} -+ -+static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) -+{ -+ return 0; -+} -+ -+static void cpu_cgroup_attach(struct cgroup_taskset *tset) -+{ -+} -+ -+static struct cftype cpu_legacy_files[] = { -+ { } /* Terminate */ -+}; -+ -+static struct cftype cpu_files[] = { -+ { } /* terminate */ -+}; -+ -+static int cpu_extra_stat_show(struct seq_file *sf, -+ struct cgroup_subsys_state *css) -+{ -+ return 0; -+} -+ -+struct cgroup_subsys cpu_cgrp_subsys = { -+ .css_alloc = cpu_cgroup_css_alloc, -+ .css_online = cpu_cgroup_css_online, -+ .css_released = cpu_cgroup_css_released, -+ .css_free = cpu_cgroup_css_free, -+ .css_extra_stat_show = cpu_extra_stat_show, -+ .fork = cpu_cgroup_fork, -+ .can_attach = cpu_cgroup_can_attach, -+ .attach = cpu_cgroup_attach, -+ .legacy_cftypes = cpu_files, -+ .legacy_cftypes = cpu_legacy_files, -+ .dfl_cftypes = cpu_files, -+ .early_init = true, -+ .threaded = true, -+}; -+#endif /* CONFIG_CGROUP_SCHED */ -+ -+#undef CREATE_TRACE_POINTS -diff --git a/kernel/sched/bmq_sched.h b/kernel/sched/bmq_sched.h -new file mode 100644 -index 000000000000..ed08dd0b8227 ---- /dev/null -+++ b/kernel/sched/bmq_sched.h -@@ -0,0 +1,472 @@ -+#ifndef BMQ_SCHED_H -+#define BMQ_SCHED_H -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#ifdef CONFIG_PARAVIRT -+# include -+#endif -+ -+#include "cpupri.h" -+ -+/* task_struct::on_rq states: */ -+#define TASK_ON_RQ_QUEUED 1 -+#define TASK_ON_RQ_MIGRATING 2 -+ -+static inline int task_on_rq_queued(struct task_struct *p) -+{ -+ return p->on_rq == TASK_ON_RQ_QUEUED; -+} 
-+ -+static inline int task_on_rq_migrating(struct task_struct *p) -+{ -+ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; -+} -+ -+/* bits: -+ * RT, Low prio adj range, nice width, high prio adj range, cpu idle task */ -+#define bmq_BITS (NICE_WIDTH + 2 * MAX_PRIORITY_ADJ + 2) -+#define IDLE_TASK_SCHED_PRIO (bmq_BITS - 1) -+ -+struct bmq { -+ DECLARE_BITMAP(bitmap, bmq_BITS); -+ struct list_head heads[bmq_BITS]; -+}; -+ -+/* -+ * This is the main, per-CPU runqueue data structure. -+ * This data should only be modified by the local cpu. -+ */ -+struct rq { -+ /* runqueue lock: */ -+ raw_spinlock_t lock; -+ -+ struct task_struct *curr, *idle, *stop, *skip; -+ struct mm_struct *prev_mm; -+ -+ struct bmq queue; -+ unsigned long watermark; -+ -+ /* switch count */ -+ u64 nr_switches; -+ -+ atomic_t nr_iowait; -+ -+#ifdef CONFIG_MEMBARRIER -+ int membarrier_state; -+#endif -+ -+#ifdef CONFIG_SMP -+ int cpu; /* cpu of this runqueue */ -+ bool online; -+ -+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ -+ struct sched_avg avg_irq; -+#endif -+ -+#ifdef CONFIG_SCHED_SMT -+ int active_balance; -+ struct cpu_stop_work active_balance_work; -+#endif -+#endif /* CONFIG_SMP */ -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+ u64 prev_irq_time; -+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ -+#ifdef CONFIG_PARAVIRT -+ u64 prev_steal_time; -+#endif /* CONFIG_PARAVIRT */ -+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING -+ u64 prev_steal_time_rq; -+#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ -+ -+ /* calc_load related fields */ -+ unsigned long calc_load_update; -+ long calc_load_active; -+ -+ u64 clock, last_tick; -+ u64 last_ts_switch; -+ u64 clock_task; -+ -+ unsigned long nr_running; -+ unsigned long nr_uninterruptible; -+ -+#ifdef CONFIG_SCHED_HRTICK -+#ifdef CONFIG_SMP -+ int hrtick_csd_pending; -+ call_single_data_t hrtick_csd; -+#endif -+ struct hrtimer hrtick_timer; -+#endif -+ -+#ifdef CONFIG_SCHEDSTATS -+ -+ /* latency stats */ -+ struct sched_info rq_sched_info; -+ unsigned long long rq_cpu_time; -+ /* 
could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ -+ -+ /* sys_sched_yield() stats */ -+ unsigned int yld_count; -+ -+ /* schedule() stats */ -+ unsigned int sched_switch; -+ unsigned int sched_count; -+ unsigned int sched_goidle; -+ -+ /* try_to_wake_up() stats */ -+ unsigned int ttwu_count; -+ unsigned int ttwu_local; -+#endif /* CONFIG_SCHEDSTATS */ -+#ifdef CONFIG_CPU_IDLE -+ /* Must be inspected within a rcu lock section */ -+ struct cpuidle_state *idle_state; -+#endif -+}; -+ -+extern unsigned long calc_load_update; -+extern atomic_long_t calc_load_tasks; -+ -+extern void calc_global_load_tick(struct rq *this_rq); -+extern long calc_load_fold_active(struct rq *this_rq, long adjust); -+ -+DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -+#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) -+#define this_rq() this_cpu_ptr(&runqueues) -+#define task_rq(p) cpu_rq(task_cpu(p)) -+#define cpu_curr(cpu) (cpu_rq(cpu)->curr) -+#define raw_rq() raw_cpu_ptr(&runqueues) -+ -+#ifdef CONFIG_SMP -+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) -+void register_sched_domain_sysctl(void); -+void unregister_sched_domain_sysctl(void); -+#else -+static inline void register_sched_domain_sysctl(void) -+{ -+} -+static inline void unregister_sched_domain_sysctl(void) -+{ -+} -+#endif -+#endif /* CONFIG_SMP */ -+ -+#ifndef arch_scale_freq_capacity -+static __always_inline -+unsigned long arch_scale_freq_capacity(int cpu) -+{ -+ return SCHED_CAPACITY_SCALE; -+} -+#endif -+ -+static inline u64 __rq_clock_broken(struct rq *rq) -+{ -+ return READ_ONCE(rq->clock); -+} -+ -+static inline u64 rq_clock(struct rq *rq) -+{ -+ /* -+ * Relax lockdep_assert_held() checking as in VRQ, call to -+ * sched_info_xxxx() may not held rq->lock -+ * lockdep_assert_held(&rq->lock); -+ */ -+ return rq->clock; -+} -+ -+static inline u64 rq_clock_task(struct rq *rq) -+{ -+ /* -+ * Relax lockdep_assert_held() checking as in VRQ, call to -+ * sched_info_xxxx() may not held rq->lock -+ 
* lockdep_assert_held(&rq->lock); -+ */ -+ return rq->clock_task; -+} -+ -+/* -+ * {de,en}queue flags: -+ * -+ * DEQUEUE_SLEEP - task is no longer runnable -+ * ENQUEUE_WAKEUP - task just became runnable -+ * -+ */ -+ -+#define DEQUEUE_SLEEP 0x01 -+ -+#define ENQUEUE_WAKEUP 0x01 -+ -+ -+/* -+ * Below are scheduler API which using in other kernel code -+ * It use the dummy rq_flags -+ * ToDo : BMQ need to support these APIs for compatibility with mainline -+ * scheduler code. -+ */ -+struct rq_flags { -+ unsigned long flags; -+}; -+ -+struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(rq->lock); -+ -+struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(p->pi_lock) -+ __acquires(rq->lock); -+ -+static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock(&rq->lock); -+} -+ -+static inline void -+task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) -+ __releases(rq->lock) -+ __releases(p->pi_lock) -+{ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); -+} -+ -+static inline void -+rq_unlock_irq(struct rq *rq, struct rq_flags *rf) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock_irq(&rq->lock); -+} -+ -+static inline struct rq * -+this_rq_lock_irq(struct rq_flags *rf) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ local_irq_disable(); -+ rq = this_rq(); -+ raw_spin_lock(&rq->lock); -+ -+ return rq; -+} -+ -+static inline bool task_running(struct task_struct *p) -+{ -+ return p->on_cpu; -+} -+ -+extern struct static_key_false sched_schedstats; -+ -+static inline void sched_ttwu_pending(void) { } -+ -+#ifdef CONFIG_CPU_IDLE -+static inline void idle_set_state(struct rq *rq, -+ struct cpuidle_state *idle_state) -+{ -+ rq->idle_state = idle_state; -+} -+ -+static inline struct cpuidle_state *idle_get_state(struct rq *rq) -+{ -+ WARN_ON(!rcu_read_lock_held()); -+ return rq->idle_state; -+} -+#else 
-+static inline void idle_set_state(struct rq *rq, -+ struct cpuidle_state *idle_state) -+{ -+} -+ -+static inline struct cpuidle_state *idle_get_state(struct rq *rq) -+{ -+ return NULL; -+} -+#endif -+ -+static inline int cpu_of(const struct rq *rq) -+{ -+#ifdef CONFIG_SMP -+ return rq->cpu; -+#else -+ return 0; -+#endif -+} -+ -+#include "stats.h" -+ -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+struct irqtime { -+ u64 total; -+ u64 tick_delta; -+ u64 irq_start_time; -+ struct u64_stats_sync sync; -+}; -+ -+DECLARE_PER_CPU(struct irqtime, cpu_irqtime); -+ -+/* -+ * Returns the irqtime minus the softirq time computed by ksoftirqd. -+ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime -+ * and never move forward. -+ */ -+static inline u64 irq_time_read(int cpu) -+{ -+ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); -+ unsigned int seq; -+ u64 total; -+ -+ do { -+ seq = __u64_stats_fetch_begin(&irqtime->sync); -+ total = irqtime->total; -+ } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); -+ -+ return total; -+} -+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ -+ -+#ifdef CONFIG_CPU_FREQ -+DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); -+ -+/** -+ * cpufreq_update_util - Take a note about CPU utilization changes. -+ * @rq: Runqueue to carry out the update for. -+ * @flags: Update reason flags. -+ * -+ * This function is called by the scheduler on the CPU whose utilization is -+ * being updated. -+ * -+ * It can only be called from RCU-sched read-side critical sections. -+ * -+ * The way cpufreq is currently arranged requires it to evaluate the CPU -+ * performance state (frequency/voltage) on a regular basis to prevent it from -+ * being stuck in a completely inadequate performance level for too long. -+ * That is not guaranteed to happen if the updates are only triggered from CFS -+ * and DL, though, because they may not be coming in if only RT tasks are -+ * active all the time (or there are RT tasks only). 
-+ * -+ * As a workaround for that issue, this function is called periodically by the -+ * RT sched class to trigger extra cpufreq updates to prevent it from stalling, -+ * but that really is a band-aid. Going forward it should be replaced with -+ * solutions targeted more specifically at RT tasks. -+ */ -+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) -+{ -+ struct update_util_data *data; -+ -+ data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); -+ if (data) -+ data->func(data, rq_clock(rq), flags); -+} -+#else -+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} -+#endif /* CONFIG_CPU_FREQ */ -+ -+#ifdef CONFIG_NO_HZ_FULL -+extern int __init sched_tick_offload_init(void); -+#else -+static inline int sched_tick_offload_init(void) { return 0; } -+#endif -+ -+#ifdef arch_scale_freq_capacity -+#ifndef arch_scale_freq_invariant -+#define arch_scale_freq_invariant() (true) -+#endif -+#else /* arch_scale_freq_capacity */ -+#define arch_scale_freq_invariant() (false) -+#endif -+ -+extern void schedule_idle(void); -+ -+/* -+ * !! For sched_setattr_nocheck() (kernel) only !! -+ * -+ * This is actually gross. :( -+ * -+ * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE -+ * tasks, but still be able to sleep. We need this on platforms that cannot -+ * atomically change clock frequency. Remove once fast switching will be -+ * available on such platforms. -+ * -+ * SUGOV stands for SchedUtil GOVernor. -+ */ -+#define SCHED_FLAG_SUGOV 0x10000000 -+ -+#ifdef CONFIG_MEMBARRIER -+/* -+ * The scheduler provides memory barriers required by membarrier between: -+ * - prior user-space memory accesses and store to rq->membarrier_state, -+ * - store to rq->membarrier_state and following user-space memory accesses. -+ * In the same way it provides those guarantees around store to rq->curr. 
-+ */ -+static inline void membarrier_switch_mm(struct rq *rq, -+ struct mm_struct *prev_mm, -+ struct mm_struct *next_mm) -+{ -+ int membarrier_state; -+ -+ if (prev_mm == next_mm) -+ return; -+ -+ membarrier_state = atomic_read(&next_mm->membarrier_state); -+ if (READ_ONCE(rq->membarrier_state) == membarrier_state) -+ return; -+ -+ WRITE_ONCE(rq->membarrier_state, membarrier_state); -+} -+#else -+static inline void membarrier_switch_mm(struct rq *rq, -+ struct mm_struct *prev_mm, -+ struct mm_struct *next_mm) -+{ -+} -+#endif -+ -+static inline int task_running_nice(struct task_struct *p) -+{ -+ return (p->prio + p->boost_prio > DEFAULT_PRIO + MAX_PRIORITY_ADJ); -+} -+ -+#ifdef CONFIG_NUMA -+extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); -+#else -+static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) -+{ -+ return nr_cpu_ids; -+} -+#endif -+#endif /* BMQ_SCHED_H */ -diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c -index 86800b4d5453..a816aafa6ba3 100644 ---- a/kernel/sched/cpufreq_schedutil.c -+++ b/kernel/sched/cpufreq_schedutil.c -@@ -185,6 +185,7 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, - return cpufreq_driver_resolve_freq(policy, freq); - } - -+#ifndef CONFIG_SCHED_BMQ - /* - * This function computes an effective utilization for the given CPU, to be - * used for frequency selection given the linear relation: f = u * f_max. -@@ -302,6 +303,13 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) - - return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL); - } -+#else /* CONFIG_SCHED_BMQ */ -+static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) -+{ -+ sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu); -+ return sg_cpu->max; -+} -+#endif - - /** - * sugov_iowait_reset() - Reset the IO boost status of a CPU. 
-@@ -445,7 +453,9 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } - */ - static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) - { -+#ifndef CONFIG_SCHED_BMQ - if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) -+#endif - sg_policy->limits_changed = true; - } - -@@ -688,6 +698,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) - } - - ret = sched_setattr_nocheck(thread, &attr); -+ - if (ret) { - kthread_stop(thread); - pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__); -@@ -918,6 +929,7 @@ static int __init sugov_register(void) - fs_initcall(sugov_register); - - #ifdef CONFIG_ENERGY_MODEL -+#ifndef CONFIG_SCHED_BMQ - extern bool sched_energy_update; - extern struct mutex sched_energy_mutex; - -@@ -948,4 +960,10 @@ void sched_cpufreq_governor_change(struct cpufreq_policy *policy, - } - - } -+#else /* CONFIG_SCHED_BMQ */ -+void sched_cpufreq_governor_change(struct cpufreq_policy *policy, -+ struct cpufreq_governor *old_gov) -+{ -+} -+#endif - #endif -diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c -index 46ed4e1383e2..51460a446da0 100644 ---- a/kernel/sched/cputime.c -+++ b/kernel/sched/cputime.c -@@ -122,7 +122,7 @@ void account_user_time(struct task_struct *p, u64 cputime) - p->utime += cputime; - account_group_user_time(p, cputime); - -- index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; -+ index = task_running_nice(p) ? CPUTIME_NICE : CPUTIME_USER; - - /* Add user time to cpustat. */ - task_group_account_field(p, index, cputime); -@@ -146,7 +146,7 @@ void account_guest_time(struct task_struct *p, u64 cputime) - p->gtime += cputime; - - /* Add guest time to cpustat. 
*/ -- if (task_nice(p) > 0) { -+ if (task_running_nice(p)) { - cpustat[CPUTIME_NICE] += cputime; - cpustat[CPUTIME_GUEST_NICE] += cputime; - } else { -@@ -269,7 +269,7 @@ static inline u64 account_other_time(u64 max) - #ifdef CONFIG_64BIT - static inline u64 read_sum_exec_runtime(struct task_struct *t) - { -- return t->se.sum_exec_runtime; -+ return tsk_seruntime(t); - } - #else - static u64 read_sum_exec_runtime(struct task_struct *t) -@@ -279,7 +279,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t) - struct rq *rq; - - rq = task_rq_lock(t, &rf); -- ns = t->se.sum_exec_runtime; -+ ns = tsk_seruntime(t); - task_rq_unlock(rq, t, &rf); - - return ns; -@@ -663,7 +663,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, - void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) - { - struct task_cputime cputime = { -- .sum_exec_runtime = p->se.sum_exec_runtime, -+ .sum_exec_runtime = tsk_seruntime(p), - }; - - task_cputime(p, &cputime.utime, &cputime.stime); -diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c -index f65ef1e2f204..77bf219444fa 100644 ---- a/kernel/sched/idle.c -+++ b/kernel/sched/idle.c -@@ -355,6 +355,7 @@ void cpu_startup_entry(enum cpuhp_state state) - do_idle(); - } - -+#ifndef CONFIG_SCHED_BMQ - /* - * idle-task scheduling class. 
- */ -@@ -479,3 +480,4 @@ const struct sched_class idle_sched_class = { - .switched_to = switched_to_idle, - .update_curr = update_curr_idle, - }; -+#endif -diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c -index a96db50d40e0..22c20e28b613 100644 ---- a/kernel/sched/pelt.c -+++ b/kernel/sched/pelt.c -@@ -236,6 +236,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna - WRITE_ONCE(sa->util_avg, sa->util_sum / divider); - } - -+#ifndef CONFIG_SCHED_BMQ - /* - * sched_entity: - * -@@ -352,6 +353,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) - - return 0; - } -+#endif - - #ifdef CONFIG_HAVE_SCHED_AVG_IRQ - /* -diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h -index afff644da065..4da52afaeff8 100644 ---- a/kernel/sched/pelt.h -+++ b/kernel/sched/pelt.h -@@ -1,11 +1,13 @@ - #ifdef CONFIG_SMP - #include "sched-pelt.h" - -+#ifndef CONFIG_SCHED_BMQ - int __update_load_avg_blocked_se(u64 now, struct sched_entity *se); - int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se); - int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); - int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); - int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); -+#endif - - #ifdef CONFIG_HAVE_SCHED_AVG_IRQ - int update_irq_load_avg(struct rq *rq, u64 running); -@@ -17,6 +19,7 @@ update_irq_load_avg(struct rq *rq, u64 running) - } - #endif - -+#ifndef CONFIG_SCHED_BMQ - /* - * When a task is dequeued, its estimated utilization should not be update if - * its util_avg has not been updated at least once. 
-@@ -137,9 +140,11 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) - return rq_clock_pelt(rq_of(cfs_rq)); - } - #endif -+#endif /* CONFIG_SCHED_BMQ */ - - #else - -+#ifndef CONFIG_SCHED_BMQ - static inline int - update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) - { -@@ -157,6 +162,7 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running) - { - return 0; - } -+#endif - - static inline int - update_irq_load_avg(struct rq *rq, u64 running) -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index c8870c5bd7df..4bca9838b6f0 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -2,6 +2,10 @@ - /* - * Scheduler internal types and methods: - */ -+#ifdef CONFIG_SCHED_BMQ -+#include "bmq_sched.h" -+#else -+ - #include - - #include -@@ -2496,3 +2500,9 @@ static inline void membarrier_switch_mm(struct rq *rq, - { - } - #endif -+ -+static inline int task_running_nice(struct task_struct *p) -+{ -+ return (task_nice(p) > 0); -+} -+#endif /* !CONFIG_SCHED_BMQ */ -diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c -index 750fb3c67eed..0cc040a28d3f 100644 ---- a/kernel/sched/stats.c -+++ b/kernel/sched/stats.c -@@ -22,8 +22,10 @@ static int show_schedstat(struct seq_file *seq, void *v) - } else { - struct rq *rq; - #ifdef CONFIG_SMP -+#ifndef CONFIG_SCHED_BMQ - struct sched_domain *sd; - int dcount = 0; -+#endif - #endif - cpu = (unsigned long)(v - 2); - rq = cpu_rq(cpu); -@@ -40,6 +42,7 @@ static int show_schedstat(struct seq_file *seq, void *v) - seq_printf(seq, "\n"); - - #ifdef CONFIG_SMP -+#ifndef CONFIG_SCHED_BMQ - /* domain-specific stats */ - rcu_read_lock(); - for_each_domain(cpu, sd) { -@@ -68,6 +71,7 @@ static int show_schedstat(struct seq_file *seq, void *v) - sd->ttwu_move_balance); - } - rcu_read_unlock(); -+#endif - #endif - } - return 0; -diff --git a/kernel/sysctl.c b/kernel/sysctl.c -index b6f2f35d0bcf..435440943455 100644 ---- a/kernel/sysctl.c -+++ b/kernel/sysctl.c -@@ -132,6 +132,10 @@ static unsigned long 
one_ul = 1; - static unsigned long long_max = LONG_MAX; - static int one_hundred = 100; - static int one_thousand = 1000; -+#ifdef CONFIG_SCHED_BMQ -+static int __maybe_unused zero = 0; -+extern int sched_yield_type; -+#endif - #ifdef CONFIG_PRINTK - static int ten_thousand = 10000; - #endif -@@ -300,7 +304,7 @@ static struct ctl_table sysctl_base_table[] = { - { } - }; - --#ifdef CONFIG_SCHED_DEBUG -+#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_BMQ) - static int min_sched_granularity_ns = 100000; /* 100 usecs */ - static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ - static int min_wakeup_granularity_ns; /* 0 usecs */ -@@ -317,6 +321,7 @@ static int max_extfrag_threshold = 1000; - #endif - - static struct ctl_table kern_table[] = { -+#ifndef CONFIG_SCHED_BMQ - { - .procname = "sched_child_runs_first", - .data = &sysctl_sched_child_runs_first, -@@ -498,6 +503,7 @@ static struct ctl_table kern_table[] = { - .extra2 = SYSCTL_ONE, - }, - #endif -+#endif /* !CONFIG_SCHED_BMQ */ - #ifdef CONFIG_PROVE_LOCKING - { - .procname = "prove_locking", -@@ -1070,6 +1076,17 @@ static struct ctl_table kern_table[] = { - .proc_handler = proc_dointvec, - }, - #endif -+#ifdef CONFIG_SCHED_BMQ -+ { -+ .procname = "yield_type", -+ .data = &sched_yield_type, -+ .maxlen = sizeof (int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = &zero, -+ .extra2 = &two, -+ }, -+#endif - #if defined(CONFIG_S390) && defined(CONFIG_SMP) - { - .procname = "spin_retry", -diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c -index 42d512fcfda2..70b97fe0ff44 100644 ---- a/kernel/time/posix-cpu-timers.c -+++ b/kernel/time/posix-cpu-timers.c -@@ -226,7 +226,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples) - u64 stime, utime; - - task_cputime(p, &utime, &stime); -- store_samples(samples, stime, utime, p->se.sum_exec_runtime); -+ store_samples(samples, stime, utime, tsk_seruntime(p)); - } - - static void 
proc_sample_cputime_atomic(struct task_cputime_atomic *at, -@@ -796,6 +796,7 @@ static void collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples, - } - } - -+#ifndef CONFIG_SCHED_BMQ - static inline void check_dl_overrun(struct task_struct *tsk) - { - if (tsk->dl.dl_overrun) { -@@ -803,6 +804,7 @@ static inline void check_dl_overrun(struct task_struct *tsk) - __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); - } - } -+#endif - - static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard) - { -@@ -830,8 +832,10 @@ static void check_thread_timers(struct task_struct *tsk, - u64 samples[CPUCLOCK_MAX]; - unsigned long soft; - -+#ifndef CONFIG_SCHED_BMQ - if (dl_task(tsk)) - check_dl_overrun(tsk); -+#endif - - if (expiry_cache_is_inactive(pct)) - return; -@@ -845,7 +849,7 @@ static void check_thread_timers(struct task_struct *tsk, - soft = task_rlimit(tsk, RLIMIT_RTTIME); - if (soft != RLIM_INFINITY) { - /* Task RT timeout is accounted in jiffies. RTTIME is usec */ -- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); -+ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ); - unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); - - /* At the hard limit, send SIGKILL. No further action. 
*/ -@@ -1099,8 +1103,10 @@ static inline bool fastpath_timer_check(struct task_struct *tsk) - return true; - } - -+#ifndef CONFIG_SCHED_BMQ - if (dl_task(tsk) && tsk->dl.dl_overrun) - return true; -+#endif - - return false; - } -diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c -index 69ee8ef12cee..208788fcbb0e 100644 ---- a/kernel/trace/trace_selftest.c -+++ b/kernel/trace/trace_selftest.c -@@ -1048,10 +1048,15 @@ static int trace_wakeup_test_thread(void *data) - { - /* Make this a -deadline thread */ - static const struct sched_attr attr = { -+#ifdef CONFIG_SCHED_BMQ -+ /* No deadline on BMQ, use RR */ -+ .sched_policy = SCHED_RR, -+#else - .sched_policy = SCHED_DEADLINE, - .sched_runtime = 100000ULL, - .sched_deadline = 10000000ULL, - .sched_period = 10000000ULL -+#endif - }; - struct wakeup_test_data *x = data; - diff --git a/linux54-tkg/linux54-tkg-patches/0009-glitched-bmq.patch b/linux54-tkg/linux54-tkg-patches/0009-glitched-bmq.patch deleted file mode 100644 index 5e78811..0000000 --- a/linux54-tkg/linux54-tkg-patches/0009-glitched-bmq.patch +++ /dev/null @@ -1,108 +0,0 @@ -From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 -From: Tk-Glitch -Date: Wed, 4 Jul 2018 04:30:08 +0200 -Subject: glitched - BMQ - -diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c -index 6b423eebfd5d..61e3271675d6 100644 ---- a/drivers/cpufreq/cpufreq_ondemand.c -+++ b/drivers/cpufreq/cpufreq_ondemand.c -@@ -21,10 +21,10 @@ - #include "cpufreq_ondemand.h" - - /* On-demand governor macros */ --#define DEF_FREQUENCY_UP_THRESHOLD (63) --#define DEF_SAMPLING_DOWN_FACTOR (1) -+#define DEF_FREQUENCY_UP_THRESHOLD (55) -+#define DEF_SAMPLING_DOWN_FACTOR (5) - #define MAX_SAMPLING_DOWN_FACTOR (100000) --#define MICRO_FREQUENCY_UP_THRESHOLD (95) -+#define MICRO_FREQUENCY_UP_THRESHOLD (63) - #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) - #define MIN_FREQUENCY_UP_THRESHOLD (1) - #define MAX_FREQUENCY_UP_THRESHOLD 
(100) -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 2a202a846757..1d9c7ed79b11 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -4,7 +4,7 @@ - - choice - prompt "Timer frequency" -- default HZ_250 -+ default HZ_500 - help - Allows the configuration of the timer frequency. It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -39,6 +39,13 @@ choice - on SMP and NUMA systems and exactly dividing by both PAL and - NTSC frame rates for video and multimedia work. - -+ config HZ_500 -+ bool "500 HZ" -+ help -+ 500 Hz is a balanced timer frequency. Provides fast interactivity -+ on desktops with great smoothness without increasing CPU power -+ consumption and sacrificing the battery life on laptops. -+ - config HZ_1000 - bool "1000 HZ" - help -@@ -52,6 +59,7 @@ config HZ - default 100 if HZ_100 - default 250 if HZ_250 - default 300 if HZ_300 -+ default 500 if HZ_500 - default 1000 if HZ_1000 - - config SCHED_HRTICK - -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 2a202a846757..1d9c7ed79b11 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -4,7 +4,7 @@ - - choice - prompt "Timer frequency" -- default HZ_500 -+ default HZ_750 - help - Allows the configuration of the timer frequency. It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -46,6 +46,13 @@ choice - on desktops with great smoothness without increasing CPU power - consumption and sacrificing the battery life on laptops. - -+ config HZ_750 -+ bool "750 HZ" -+ help -+ 750 Hz is a good timer frequency for desktops. Provides fast -+ interactivity with great smoothness without sacrificing too -+ much throughput. 
-+ - config HZ_1000 - bool "1000 HZ" - help -@@ -60,6 +67,7 @@ config HZ - default 250 if HZ_250 - default 300 if HZ_300 - default 500 if HZ_500 -+ default 750 if HZ_750 - default 1000 if HZ_1000 - - config SCHED_HRTICK - -diff --git a/mm/vmscan.c b/mm/vmscan.c -index 9270a4370d54..30d01e647417 100644 ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -159,7 +159,7 @@ struct scan_control { - /* - * From 0 .. 100. Higher means more swappy. - */ --int vm_swappiness = 60; -+int vm_swappiness = 20; - /* - * The total number of pages which are beyond the high watermark within all - * zones. diff --git a/linux54-tkg/linux54-tkg-patches/0011-ZFS-fix.patch b/linux54-tkg/linux54-tkg-patches/0011-ZFS-fix.patch deleted file mode 100644 index af71d04..0000000 --- a/linux54-tkg/linux54-tkg-patches/0011-ZFS-fix.patch +++ /dev/null @@ -1,43 +0,0 @@ -From 1e010beda2896bdf3082fb37a3e49f8ce20e04d8 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= -Date: Thu, 2 May 2019 05:28:08 +0100 -Subject: [PATCH] x86/fpu: Export kernel_fpu_{begin,end}() with - EXPORT_SYMBOL_GPL -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -We need these symbols in zfs as the fpu implementation breaks userspace: - -https://github.com/zfsonlinux/zfs/issues/9346 -Signed-off-by: Jörg Thalheim ---- - arch/x86/kernel/fpu/core.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c -index 12c70840980e..352538b3bb5d 100644 ---- a/arch/x86/kernel/fpu/core.c -+++ b/arch/x86/kernel/fpu/core.c -@@ -102,7 +102,7 @@ void kernel_fpu_begin(void) - } - __cpu_invalidate_fpregs_state(); - } --EXPORT_SYMBOL_GPL(kernel_fpu_begin); -+EXPORT_SYMBOL(kernel_fpu_begin); - - void kernel_fpu_end(void) - { -@@ -111,7 +111,7 @@ void kernel_fpu_end(void) - this_cpu_write(in_kernel_fpu, false); - preempt_enable(); - } --EXPORT_SYMBOL_GPL(kernel_fpu_end); -+EXPORT_SYMBOL(kernel_fpu_end); - - /* - * Save the FPU 
state (mark it for reload if necessary): --- -2.23.0 - - diff --git a/linux54-tkg/linux54-tkg-patches/0012-linux-hardened.patch b/linux54-tkg/linux54-tkg-patches/0012-linux-hardened.patch deleted file mode 100644 index b50ec74..0000000 --- a/linux54-tkg/linux54-tkg-patches/0012-linux-hardened.patch +++ /dev/null @@ -1,2806 +0,0 @@ -diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index 5594c8bf1dcd..ac80978f4629 100644 ---- a/Documentation/admin-guide/kernel-parameters.txt -+++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -505,16 +505,6 @@ - nosocket -- Disable socket memory accounting. - nokmem -- Disable kernel memory accounting. - -- checkreqprot [SELINUX] Set initial checkreqprot flag value. -- Format: { "0" | "1" } -- See security/selinux/Kconfig help text. -- 0 -- check protection applied by kernel (includes -- any implied execute protection). -- 1 -- check protection requested by application. -- Default value is set via a kernel config option. -- Value can be changed at runtime via -- /selinux/checkreqprot. -- - cio_ignore= [S390] - See Documentation/s390/common_io.rst for details. - clk_ignore_unused -@@ -3345,6 +3335,11 @@ - the specified number of seconds. This is to be used if - your oopses keep scrolling off the screen. - -+ extra_latent_entropy -+ Enable a very simple form of latent entropy extraction -+ from the first 4GB of memory as the bootmem allocator -+ passes the memory pages to the buddy allocator. -+ - pcbit= [HW,ISDN] - - pcd. 
[PARIDE] -diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst -index 032c7cd3cede..cc3491b05976 100644 ---- a/Documentation/admin-guide/sysctl/kernel.rst -+++ b/Documentation/admin-guide/sysctl/kernel.rst -@@ -102,6 +102,7 @@ show up in /proc/sys/kernel: - - sysctl_writes_strict - - tainted ==> Documentation/admin-guide/tainted-kernels.rst - - threads-max -+- tiocsti_restrict - - unknown_nmi_panic - - watchdog - - watchdog_thresh -@@ -1114,6 +1115,25 @@ thread structures would occupy too much (more than 1/8th) of the - available RAM pages threads-max is reduced accordingly. - - -+tiocsti_restrict: -+================= -+ -+This toggle indicates whether unprivileged users are prevented from using the -+TIOCSTI ioctl to inject commands into other processes which share a tty -+session. -+ -+When tiocsti_restrict is set to (0) there are no restrictions(accept the -+default restriction of only being able to injection commands into one's own -+tty). When tiocsti_restrict is set to (1), users must have CAP_SYS_ADMIN to -+use the TIOCSTI ioctl. -+ -+When user namespaces are in use, the check for the capability CAP_SYS_ADMIN is -+done against the user namespace that originally opened the tty. -+ -+The kernel config option CONFIG_SECURITY_TIOCSTI_RESTRICT sets the default -+value of tiocsti_restrict. 
-+ -+ - unknown_nmi_panic: - ================== - -diff --git a/arch/Kconfig b/arch/Kconfig -index 5f8a5d84dbbe..60103a76d33e 100644 ---- a/arch/Kconfig -+++ b/arch/Kconfig -@@ -653,7 +653,7 @@ config ARCH_MMAP_RND_BITS - int "Number of bits to use for ASLR of mmap base address" if EXPERT - range ARCH_MMAP_RND_BITS_MIN ARCH_MMAP_RND_BITS_MAX - default ARCH_MMAP_RND_BITS_DEFAULT if ARCH_MMAP_RND_BITS_DEFAULT -- default ARCH_MMAP_RND_BITS_MIN -+ default ARCH_MMAP_RND_BITS_MAX - depends on HAVE_ARCH_MMAP_RND_BITS - help - This value can be used to select the number of bits to use to -@@ -687,7 +687,7 @@ config ARCH_MMAP_RND_COMPAT_BITS - int "Number of bits to use for ASLR of mmap base address for compatible applications" if EXPERT - range ARCH_MMAP_RND_COMPAT_BITS_MIN ARCH_MMAP_RND_COMPAT_BITS_MAX - default ARCH_MMAP_RND_COMPAT_BITS_DEFAULT if ARCH_MMAP_RND_COMPAT_BITS_DEFAULT -- default ARCH_MMAP_RND_COMPAT_BITS_MIN -+ default ARCH_MMAP_RND_COMPAT_BITS_MAX - depends on HAVE_ARCH_MMAP_RND_COMPAT_BITS - help - This value can be used to select the number of bits to use to -@@ -906,6 +906,7 @@ config ARCH_HAS_REFCOUNT - - config REFCOUNT_FULL - bool "Perform full reference count validation at the expense of speed" -+ default y - help - Enabling this switches the refcounting infrastructure from a fast - unchecked atomic_t implementation to a fully state checked -diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig -index 6ccd2ed30963..56d39ec3c2c3 100644 ---- a/arch/arm64/Kconfig -+++ b/arch/arm64/Kconfig -@@ -1139,6 +1139,7 @@ config RODATA_FULL_DEFAULT_ENABLED - - config ARM64_SW_TTBR0_PAN - bool "Emulate Privileged Access Never using TTBR0_EL1 switching" -+ default y - help - Enabling this option prevents the kernel from accessing - user-space memory directly by pointing TTBR0_EL1 to a reserved -@@ -1538,6 +1539,7 @@ config RANDOMIZE_BASE - bool "Randomize the address of the kernel image" - select ARM64_MODULE_PLTS if MODULES - select RELOCATABLE -+ default y - help - 
Randomizes the virtual address at which the kernel image is - loaded, as a security feature that deters exploit attempts -diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug -index cf09010d825f..dc4083ceff57 100644 ---- a/arch/arm64/Kconfig.debug -+++ b/arch/arm64/Kconfig.debug -@@ -43,6 +43,7 @@ config ARM64_RANDOMIZE_TEXT_OFFSET - config DEBUG_WX - bool "Warn on W+X mappings at boot" - select ARM64_PTDUMP_CORE -+ default y - ---help--- - Generate a warning if any W+X mappings are found at boot. - -diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig -index c9a867ac32d4..5c4d264f6a6e 100644 ---- a/arch/arm64/configs/defconfig -+++ b/arch/arm64/configs/defconfig -@@ -1,4 +1,3 @@ --CONFIG_SYSVIPC=y - CONFIG_POSIX_MQUEUE=y - CONFIG_AUDIT=y - CONFIG_NO_HZ_IDLE=y -diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h -index b618017205a3..0a228dbcad65 100644 ---- a/arch/arm64/include/asm/elf.h -+++ b/arch/arm64/include/asm/elf.h -@@ -103,14 +103,10 @@ - - /* - * This is the base location for PIE (ET_DYN with INTERP) loads. On -- * 64-bit, this is above 4GB to leave the entire 32-bit address -+ * 64-bit, this is raised to 4GB to leave the entire 32-bit address - * space open for things that want to use the area for 32-bit pointers. - */ --#ifdef CONFIG_ARM64_FORCE_52BIT --#define ELF_ET_DYN_BASE (2 * TASK_SIZE_64 / 3) --#else --#define ELF_ET_DYN_BASE (2 * DEFAULT_MAP_WINDOW_64 / 3) --#endif /* CONFIG_ARM64_FORCE_52BIT */ -+#define ELF_ET_DYN_BASE 0x100000000UL - - #ifndef __ASSEMBLY__ - -@@ -164,10 +160,10 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm, - /* 1GB of VA */ - #ifdef CONFIG_COMPAT - #define STACK_RND_MASK (test_thread_flag(TIF_32BIT) ? 
\ -- 0x7ff >> (PAGE_SHIFT - 12) : \ -- 0x3ffff >> (PAGE_SHIFT - 12)) -+ ((1UL << mmap_rnd_compat_bits) - 1) >> (PAGE_SHIFT - 12) : \ -+ ((1UL << mmap_rnd_bits) - 1) >> (PAGE_SHIFT - 12)) - #else --#define STACK_RND_MASK (0x3ffff >> (PAGE_SHIFT - 12)) -+#define STACK_RND_MASK (((1UL << mmap_rnd_bits) - 1) >> (PAGE_SHIFT - 12)) - #endif - - #ifdef __AARCH64EB__ -diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig -index 8ef85139553f..e16076b30625 100644 ---- a/arch/x86/Kconfig -+++ b/arch/x86/Kconfig -@@ -1219,8 +1219,7 @@ config VM86 - default X86_LEGACY_VM86 - - config X86_16BIT -- bool "Enable support for 16-bit segments" if EXPERT -- default y -+ bool "Enable support for 16-bit segments" - depends on MODIFY_LDT_SYSCALL - ---help--- - This option is required by programs like Wine to run 16-bit -@@ -2365,7 +2364,7 @@ config COMPAT_VDSO - choice - prompt "vsyscall table for legacy applications" - depends on X86_64 -- default LEGACY_VSYSCALL_XONLY -+ default LEGACY_VSYSCALL_NONE - help - Legacy user code that does not know how to find the vDSO expects - to be able to issue three syscalls by calling fixed addresses in -@@ -2461,8 +2460,7 @@ config CMDLINE_OVERRIDE - be set to 'N' under normal conditions. - - config MODIFY_LDT_SYSCALL -- bool "Enable the LDT (local descriptor table)" if EXPERT -- default y -+ bool "Enable the LDT (local descriptor table)" - ---help--- - Linux can allow user programs to install a per-process x86 - Local Descriptor Table (LDT) using the modify_ldt(2) system -diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug -index bf9cd83de777..13ef90f3de52 100644 ---- a/arch/x86/Kconfig.debug -+++ b/arch/x86/Kconfig.debug -@@ -91,6 +91,7 @@ config EFI_PGT_DUMP - config DEBUG_WX - bool "Warn on W+X mappings at boot" - select X86_PTDUMP_CORE -+ default y - ---help--- - Generate a warning if any W+X mappings are found at boot. 
- -diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig -index d0a5ffeae8df..2a91d4a9f640 100644 ---- a/arch/x86/configs/x86_64_defconfig -+++ b/arch/x86/configs/x86_64_defconfig -@@ -1,5 +1,4 @@ - # CONFIG_LOCALVERSION_AUTO is not set --CONFIG_SYSVIPC=y - CONFIG_POSIX_MQUEUE=y - CONFIG_BSD_PROCESS_ACCT=y - CONFIG_TASKSTATS=y -diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c -index f5937742b290..6655ce228e25 100644 ---- a/arch/x86/entry/vdso/vma.c -+++ b/arch/x86/entry/vdso/vma.c -@@ -198,55 +198,9 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) - } - - #ifdef CONFIG_X86_64 --/* -- * Put the vdso above the (randomized) stack with another randomized -- * offset. This way there is no hole in the middle of address space. -- * To save memory make sure it is still in the same PTE as the stack -- * top. This doesn't give that many random bits. -- * -- * Note that this algorithm is imperfect: the distribution of the vdso -- * start address within a PMD is biased toward the end. -- * -- * Only used for the 64-bit and x32 vdsos. -- */ --static unsigned long vdso_addr(unsigned long start, unsigned len) --{ -- unsigned long addr, end; -- unsigned offset; -- -- /* -- * Round up the start address. It can start out unaligned as a result -- * of stack start randomization. -- */ -- start = PAGE_ALIGN(start); -- -- /* Round the lowest possible end address up to a PMD boundary. */ -- end = (start + len + PMD_SIZE - 1) & PMD_MASK; -- if (end >= TASK_SIZE_MAX) -- end = TASK_SIZE_MAX; -- end -= len; -- -- if (end > start) { -- offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1); -- addr = start + (offset << PAGE_SHIFT); -- } else { -- addr = start; -- } -- -- /* -- * Forcibly align the final address in case we have a hardware -- * issue that requires alignment for performance reasons. 
-- */ -- addr = align_vdso_addr(addr); -- -- return addr; --} -- - static int map_vdso_randomized(const struct vdso_image *image) - { -- unsigned long addr = vdso_addr(current->mm->start_stack, image->size-image->sym_vvar_start); -- -- return map_vdso(image, addr); -+ return map_vdso(image, 0); - } - #endif - -diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h -index 69c0f892e310..f9f7a85bb71e 100644 ---- a/arch/x86/include/asm/elf.h -+++ b/arch/x86/include/asm/elf.h -@@ -248,11 +248,11 @@ extern int force_personality32; - - /* - * This is the base location for PIE (ET_DYN with INTERP) loads. On -- * 64-bit, this is above 4GB to leave the entire 32-bit address -+ * 64-bit, this is raised to 4GB to leave the entire 32-bit address - * space open for things that want to use the area for 32-bit pointers. - */ - #define ELF_ET_DYN_BASE (mmap_is_ia32() ? 0x000400000UL : \ -- (DEFAULT_MAP_WINDOW / 3 * 2)) -+ 0x100000000UL) - - /* This yields a mask that user programs can use to figure out what - instruction set this CPU supports. This could be done in user space, -@@ -312,8 +312,8 @@ extern bool mmap_address_hint_valid(unsigned long addr, unsigned long len); - - #ifdef CONFIG_X86_32 - --#define __STACK_RND_MASK(is32bit) (0x7ff) --#define STACK_RND_MASK (0x7ff) -+#define __STACK_RND_MASK(is32bit) ((1UL << mmap_rnd_bits) - 1) -+#define STACK_RND_MASK ((1UL << mmap_rnd_bits) - 1) - - #define ARCH_DLINFO ARCH_DLINFO_IA32 - -@@ -322,7 +322,11 @@ extern bool mmap_address_hint_valid(unsigned long addr, unsigned long len); - #else /* CONFIG_X86_32 */ - - /* 1GB for 64bit, 8MB for 32bit */ --#define __STACK_RND_MASK(is32bit) ((is32bit) ? 0x7ff : 0x3fffff) -+#ifdef CONFIG_COMPAT -+#define __STACK_RND_MASK(is32bit) ((is32bit) ? 
(1UL << mmap_rnd_compat_bits) - 1 : (1UL << mmap_rnd_bits) - 1) -+#else -+#define __STACK_RND_MASK(is32bit) ((1UL << mmap_rnd_bits) - 1) -+#endif - #define STACK_RND_MASK __STACK_RND_MASK(mmap_is_ia32()) - - #define ARCH_DLINFO \ -@@ -380,5 +384,4 @@ struct va_alignment { - } ____cacheline_aligned; - - extern struct va_alignment va_align; --extern unsigned long align_vdso_addr(unsigned long); - #endif /* _ASM_X86_ELF_H */ -diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h -index 6f66d841262d..b786e7cb395d 100644 ---- a/arch/x86/include/asm/tlbflush.h -+++ b/arch/x86/include/asm/tlbflush.h -@@ -295,6 +295,7 @@ static inline void cr4_set_bits_irqsoff(unsigned long mask) - unsigned long cr4; - - cr4 = this_cpu_read(cpu_tlbstate.cr4); -+ BUG_ON(cr4 != __read_cr4()); - if ((cr4 | mask) != cr4) - __cr4_set(cr4 | mask); - } -@@ -305,6 +306,7 @@ static inline void cr4_clear_bits_irqsoff(unsigned long mask) - unsigned long cr4; - - cr4 = this_cpu_read(cpu_tlbstate.cr4); -+ BUG_ON(cr4 != __read_cr4()); - if ((cr4 & ~mask) != cr4) - __cr4_set(cr4 & ~mask); - } -@@ -334,6 +336,7 @@ static inline void cr4_toggle_bits_irqsoff(unsigned long mask) - unsigned long cr4; - - cr4 = this_cpu_read(cpu_tlbstate.cr4); -+ BUG_ON(cr4 != __read_cr4()); - __cr4_set(cr4 ^ mask); - } - -@@ -440,6 +443,7 @@ static inline void __native_flush_tlb_global(void) - raw_local_irq_save(flags); - - cr4 = this_cpu_read(cpu_tlbstate.cr4); -+ BUG_ON(cr4 != __read_cr4()); - /* toggle PGE */ - native_write_cr4(cr4 ^ X86_CR4_PGE); - /* write old PGE again and flush TLBs */ -diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c -index fffe21945374..e9e124eb6ccb 100644 ---- a/arch/x86/kernel/cpu/common.c -+++ b/arch/x86/kernel/cpu/common.c -@@ -1854,7 +1854,6 @@ void cpu_init(void) - wrmsrl(MSR_KERNEL_GS_BASE, 0); - barrier(); - -- x86_configure_nx(); - x2apic_setup(); - - /* -diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c -index 
5e94c4354d4e..093bd8ad1130 100644 ---- a/arch/x86/kernel/process.c -+++ b/arch/x86/kernel/process.c -@@ -42,6 +42,8 @@ - #include - #include - #include -+#include -+#include - - #include "process.h" - -@@ -798,7 +800,10 @@ unsigned long arch_align_stack(unsigned long sp) - - unsigned long arch_randomize_brk(struct mm_struct *mm) - { -- return randomize_page(mm->brk, 0x02000000); -+ if (mmap_is_ia32()) -+ return mm->brk + get_random_long() % SZ_32M + PAGE_SIZE; -+ else -+ return mm->brk + get_random_long() % SZ_1G + PAGE_SIZE; - } - - /* -diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c -index f7476ce23b6e..652169a2b23a 100644 ---- a/arch/x86/kernel/sys_x86_64.c -+++ b/arch/x86/kernel/sys_x86_64.c -@@ -54,13 +54,6 @@ static unsigned long get_align_bits(void) - return va_align.bits & get_align_mask(); - } - --unsigned long align_vdso_addr(unsigned long addr) --{ -- unsigned long align_mask = get_align_mask(); -- addr = (addr + align_mask) & ~align_mask; -- return addr | get_align_bits(); --} -- - static int __init control_va_addr_alignment(char *str) - { - /* guard against enabling this on other CPU families */ -@@ -122,10 +115,7 @@ static void find_start_end(unsigned long addr, unsigned long flags, - } - - *begin = get_mmap_base(1); -- if (in_32bit_syscall()) -- *end = task_size_32bit(); -- else -- *end = task_size_64bit(addr > DEFAULT_MAP_WINDOW); -+ *end = get_mmap_base(0); - } - - unsigned long -@@ -210,7 +200,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, - - info.flags = VM_UNMAPPED_AREA_TOPDOWN; - info.length = len; -- info.low_limit = PAGE_SIZE; -+ info.low_limit = get_mmap_base(1); - info.high_limit = get_mmap_base(0); - - /* -diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c -index 0a74407ef92e..5ceff405c81c 100644 ---- a/arch/x86/mm/init_32.c -+++ b/arch/x86/mm/init_32.c -@@ -560,9 +560,9 @@ static void __init pagetable_init(void) - - #define DEFAULT_PTE_MASK ~(_PAGE_NX | _PAGE_GLOBAL) - 
/* Bits supported by the hardware: */ --pteval_t __supported_pte_mask __read_mostly = DEFAULT_PTE_MASK; -+pteval_t __supported_pte_mask __ro_after_init = DEFAULT_PTE_MASK; - /* Bits allowed in normal kernel mappings: */ --pteval_t __default_kernel_pte_mask __read_mostly = DEFAULT_PTE_MASK; -+pteval_t __default_kernel_pte_mask __ro_after_init = DEFAULT_PTE_MASK; - EXPORT_SYMBOL_GPL(__supported_pte_mask); - /* Used in PAGE_KERNEL_* macros which are reasonably used out-of-tree: */ - EXPORT_SYMBOL(__default_kernel_pte_mask); -diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c -index b8541d77452c..a231504e0348 100644 ---- a/arch/x86/mm/init_64.c -+++ b/arch/x86/mm/init_64.c -@@ -97,9 +97,9 @@ DEFINE_ENTRY(pte, pte, init) - */ - - /* Bits supported by the hardware: */ --pteval_t __supported_pte_mask __read_mostly = ~0; -+pteval_t __supported_pte_mask __ro_after_init = ~0; - /* Bits allowed in normal kernel mappings: */ --pteval_t __default_kernel_pte_mask __read_mostly = ~0; -+pteval_t __default_kernel_pte_mask __ro_after_init = ~0; - EXPORT_SYMBOL_GPL(__supported_pte_mask); - /* Used in PAGE_KERNEL_* macros which are reasonably used out-of-tree: */ - EXPORT_SYMBOL(__default_kernel_pte_mask); -diff --git a/block/blk-softirq.c b/block/blk-softirq.c -index 457d9ba3eb20..5f987fc1c0a0 100644 ---- a/block/blk-softirq.c -+++ b/block/blk-softirq.c -@@ -20,7 +20,7 @@ static DEFINE_PER_CPU(struct list_head, blk_cpu_done); - * Softirq action handler - move entries to local list and loop over them - * while passing them to the queue registered handler. 
- */ --static __latent_entropy void blk_done_softirq(struct softirq_action *h) -+static __latent_entropy void blk_done_softirq(void) - { - struct list_head *cpu_list, local_list; - -diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c -index 84b183a6424e..b83bff5e9ab5 100644 ---- a/drivers/ata/libata-core.c -+++ b/drivers/ata/libata-core.c -@@ -5143,7 +5143,7 @@ void ata_qc_free(struct ata_queued_cmd *qc) - struct ata_port *ap; - unsigned int tag; - -- WARN_ON_ONCE(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ -+ BUG_ON(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ - ap = qc->ap; - - qc->flags = 0; -@@ -5160,7 +5160,7 @@ void __ata_qc_complete(struct ata_queued_cmd *qc) - struct ata_port *ap; - struct ata_link *link; - -- WARN_ON_ONCE(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ -+ BUG_ON(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ - WARN_ON_ONCE(!(qc->flags & ATA_QCFLAG_ACTIVE)); - ap = qc->ap; - link = qc->dev->link; -diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig -index df0fc997dc3e..bd8eed8de6c1 100644 ---- a/drivers/char/Kconfig -+++ b/drivers/char/Kconfig -@@ -9,7 +9,6 @@ source "drivers/tty/Kconfig" - - config DEVMEM - bool "/dev/mem virtual device support" -- default y - help - Say Y here if you want to support the /dev/mem device. - The /dev/mem device is used to access areas of physical -@@ -514,7 +513,6 @@ config TELCLOCK - config DEVPORT - bool "/dev/port character device" - depends on ISA || PCI -- default y - help - Say Y here if you want to support the /dev/port device. The /dev/port - device is similar to /dev/mem, but for I/O ports. 
-diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig -index c7623f99ac0f..859c2782c8e2 100644 ---- a/drivers/tty/Kconfig -+++ b/drivers/tty/Kconfig -@@ -122,7 +122,6 @@ config UNIX98_PTYS - - config LEGACY_PTYS - bool "Legacy (BSD) PTY support" -- default y - ---help--- - A pseudo terminal (PTY) is a software device consisting of two - halves: a master and a slave. The slave device behaves identical to -diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c -index 802c1210558f..0cc320f33cdc 100644 ---- a/drivers/tty/tty_io.c -+++ b/drivers/tty/tty_io.c -@@ -173,6 +173,7 @@ static void free_tty_struct(struct tty_struct *tty) - put_device(tty->dev); - kfree(tty->write_buf); - tty->magic = 0xDEADDEAD; -+ put_user_ns(tty->owner_user_ns); - kfree(tty); - } - -@@ -2180,11 +2181,19 @@ static int tty_fasync(int fd, struct file *filp, int on) - * FIXME: may race normal receive processing - */ - -+int tiocsti_restrict = IS_ENABLED(CONFIG_SECURITY_TIOCSTI_RESTRICT); -+ - static int tiocsti(struct tty_struct *tty, char __user *p) - { - char ch, mbz = 0; - struct tty_ldisc *ld; - -+ if (tiocsti_restrict && -+ !ns_capable(tty->owner_user_ns, CAP_SYS_ADMIN)) { -+ dev_warn_ratelimited(tty->dev, -+ "Denied TIOCSTI ioctl for non-privileged process\n"); -+ return -EPERM; -+ } - if ((current->signal->tty != tty) && !capable(CAP_SYS_ADMIN)) - return -EPERM; - if (get_user(ch, p)) -@@ -3004,6 +3013,7 @@ struct tty_struct *alloc_tty_struct(struct tty_driver *driver, int idx) - tty->index = idx; - tty_line_name(driver, idx, tty->name); - tty->dev = tty_get_device(tty); -+ tty->owner_user_ns = get_user_ns(current_user_ns()); - - return tty; - } -diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c -index 4ac74b354801..7c2cb5b3a449 100644 ---- a/drivers/usb/core/hub.c -+++ b/drivers/usb/core/hub.c -@@ -42,6 +42,8 @@ - #define USB_TP_TRANSMISSION_DELAY 40 /* ns */ - #define USB_TP_TRANSMISSION_DELAY_MAX 65535 /* ns */ - -+extern int deny_new_usb; -+ - /* Protect struct 
usb_device->state and ->children members - * Note: Both are also protected by ->dev.sem, except that ->state can - * change to USB_STATE_NOTATTACHED even when the semaphore isn't held. */ -@@ -4991,6 +4993,12 @@ static void hub_port_connect(struct usb_hub *hub, int port1, u16 portstatus, - goto done; - return; - } -+ -+ if (deny_new_usb) { -+ dev_err(&port_dev->dev, "denied insert of USB device on port %d\n", port1); -+ goto done; -+ } -+ - if (hub_is_superspeed(hub->hdev)) - unit_load = 150; - else -diff --git a/fs/exec.c b/fs/exec.c -index c27231234764..4038334db213 100644 ---- a/fs/exec.c -+++ b/fs/exec.c -@@ -63,6 +63,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -276,6 +277,8 @@ static int __bprm_mm_init(struct linux_binprm *bprm) - arch_bprm_mm_init(mm, vma); - up_write(&mm->mmap_sem); - bprm->p = vma->vm_end - sizeof(void *); -+ if (randomize_va_space) -+ bprm->p ^= get_random_int() & ~PAGE_MASK; - return 0; - err: - up_write(&mm->mmap_sem); -diff --git a/fs/namei.c b/fs/namei.c -index 671c3c1a3425..618ef0b5d000 100644 ---- a/fs/namei.c -+++ b/fs/namei.c -@@ -877,10 +877,10 @@ static inline void put_link(struct nameidata *nd) - path_put(&last->link); - } - --int sysctl_protected_symlinks __read_mostly = 0; --int sysctl_protected_hardlinks __read_mostly = 0; --int sysctl_protected_fifos __read_mostly; --int sysctl_protected_regular __read_mostly; -+int sysctl_protected_symlinks __read_mostly = 1; -+int sysctl_protected_hardlinks __read_mostly = 1; -+int sysctl_protected_fifos __read_mostly = 2; -+int sysctl_protected_regular __read_mostly = 2; - - /** - * may_follow_link - Check symlink following for unsafe situations -diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig -index 295a7a21b774..3aed361bc0f9 100644 ---- a/fs/nfs/Kconfig -+++ b/fs/nfs/Kconfig -@@ -195,4 +195,3 @@ config NFS_DEBUG - bool - depends on NFS_FS && SUNRPC_DEBUG - select CRC32 -- default y -diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig -index 
cb5629bd5fff..bc44606fcc48 100644 ---- a/fs/proc/Kconfig -+++ b/fs/proc/Kconfig -@@ -41,7 +41,6 @@ config PROC_KCORE - config PROC_VMCORE - bool "/proc/vmcore support" - depends on PROC_FS && CRASH_DUMP -- default y - help - Exports the dump image of crashed kernel in ELF format. - -diff --git a/fs/stat.c b/fs/stat.c -index c38e4c2e1221..6135fbaf7298 100644 ---- a/fs/stat.c -+++ b/fs/stat.c -@@ -40,8 +40,13 @@ void generic_fillattr(struct inode *inode, struct kstat *stat) - stat->gid = inode->i_gid; - stat->rdev = inode->i_rdev; - stat->size = i_size_read(inode); -- stat->atime = inode->i_atime; -- stat->mtime = inode->i_mtime; -+ if (is_sidechannel_device(inode) && !capable_noaudit(CAP_MKNOD)) { -+ stat->atime = inode->i_ctime; -+ stat->mtime = inode->i_ctime; -+ } else { -+ stat->atime = inode->i_atime; -+ stat->mtime = inode->i_mtime; -+ } - stat->ctime = inode->i_ctime; - stat->blksize = i_blocksize(inode); - stat->blocks = inode->i_blocks; -@@ -77,9 +82,14 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, - if (IS_AUTOMOUNT(inode)) - stat->attributes |= STATX_ATTR_AUTOMOUNT; - -- if (inode->i_op->getattr) -- return inode->i_op->getattr(path, stat, request_mask, -- query_flags); -+ if (inode->i_op->getattr) { -+ int retval = inode->i_op->getattr(path, stat, request_mask, query_flags); -+ if (!retval && is_sidechannel_device(inode) && !capable_noaudit(CAP_MKNOD)) { -+ stat->atime = stat->ctime; -+ stat->mtime = stat->ctime; -+ } -+ return retval; -+ } - - generic_fillattr(inode, stat); - return 0; -diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c -index d99d166fd892..7a4f2854feb8 100644 ---- a/fs/userfaultfd.c -+++ b/fs/userfaultfd.c -@@ -28,7 +28,11 @@ - #include - #include - -+#ifdef CONFIG_USERFAULTFD_UNPRIVILEGED - int sysctl_unprivileged_userfaultfd __read_mostly = 1; -+#else -+int sysctl_unprivileged_userfaultfd __read_mostly; -+#endif - - static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly; - -diff --git 
a/include/linux/cache.h b/include/linux/cache.h -index 750621e41d1c..e7157c18c62c 100644 ---- a/include/linux/cache.h -+++ b/include/linux/cache.h -@@ -31,6 +31,8 @@ - #define __ro_after_init __attribute__((__section__(".data..ro_after_init"))) - #endif - -+#define __read_only __ro_after_init -+ - #ifndef ____cacheline_aligned - #define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES))) - #endif -diff --git a/include/linux/capability.h b/include/linux/capability.h -index ecce0f43c73a..e46306dd4401 100644 ---- a/include/linux/capability.h -+++ b/include/linux/capability.h -@@ -208,6 +208,7 @@ extern bool has_capability_noaudit(struct task_struct *t, int cap); - extern bool has_ns_capability_noaudit(struct task_struct *t, - struct user_namespace *ns, int cap); - extern bool capable(int cap); -+extern bool capable_noaudit(int cap); - extern bool ns_capable(struct user_namespace *ns, int cap); - extern bool ns_capable_noaudit(struct user_namespace *ns, int cap); - extern bool ns_capable_setid(struct user_namespace *ns, int cap); -@@ -234,6 +235,10 @@ static inline bool capable(int cap) - { - return true; - } -+static inline bool capable_noaudit(int cap) -+{ -+ return true; -+} - static inline bool ns_capable(struct user_namespace *ns, int cap) - { - return true; -diff --git a/include/linux/fs.h b/include/linux/fs.h -index 0b4d8fc79e0f..6f318e089249 100644 ---- a/include/linux/fs.h -+++ b/include/linux/fs.h -@@ -3627,4 +3627,15 @@ static inline int inode_drain_writes(struct inode *inode) - return filemap_write_and_wait(inode->i_mapping); - } - -+extern int device_sidechannel_restrict; -+ -+static inline bool is_sidechannel_device(const struct inode *inode) -+{ -+ umode_t mode; -+ if (!device_sidechannel_restrict) -+ return false; -+ mode = inode->i_mode; -+ return ((S_ISCHR(mode) || S_ISBLK(mode)) && (mode & (S_IROTH | S_IWOTH))); -+} -+ - #endif /* _LINUX_FS_H */ -diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h -index 
a2d5d175d3c1..e91ab06119b0 100644 ---- a/include/linux/fsnotify.h -+++ b/include/linux/fsnotify.h -@@ -233,6 +233,9 @@ static inline void fsnotify_access(struct file *file) - struct inode *inode = file_inode(file); - __u32 mask = FS_ACCESS; - -+ if (is_sidechannel_device(inode)) -+ return; -+ - if (S_ISDIR(inode->i_mode)) - mask |= FS_ISDIR; - -@@ -249,6 +252,9 @@ static inline void fsnotify_modify(struct file *file) - struct inode *inode = file_inode(file); - __u32 mask = FS_MODIFY; - -+ if (is_sidechannel_device(inode)) -+ return; -+ - if (S_ISDIR(inode->i_mode)) - mask |= FS_ISDIR; - -diff --git a/include/linux/gfp.h b/include/linux/gfp.h -index 61f2f6ff9467..f9b3e3d675ae 100644 ---- a/include/linux/gfp.h -+++ b/include/linux/gfp.h -@@ -553,9 +553,9 @@ extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, - extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); - extern unsigned long get_zeroed_page(gfp_t gfp_mask); - --void *alloc_pages_exact(size_t size, gfp_t gfp_mask); -+void *alloc_pages_exact(size_t size, gfp_t gfp_mask) __attribute__((alloc_size(1))); - void free_pages_exact(void *virt, size_t size); --void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask); -+void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) __attribute__((alloc_size(2))); - - #define __get_free_page(gfp_mask) \ - __get_free_pages((gfp_mask), 0) -diff --git a/include/linux/highmem.h b/include/linux/highmem.h -index ea5cdbd8c2c3..805b84d6bbca 100644 ---- a/include/linux/highmem.h -+++ b/include/linux/highmem.h -@@ -215,6 +215,13 @@ static inline void clear_highpage(struct page *page) - kunmap_atomic(kaddr); - } - -+static inline void verify_zero_highpage(struct page *page) -+{ -+ void *kaddr = kmap_atomic(page); -+ BUG_ON(memchr_inv(kaddr, 0, PAGE_SIZE)); -+ kunmap_atomic(kaddr); -+} -+ - static inline void zero_user_segments(struct page *page, - unsigned start1, unsigned end1, - unsigned start2, unsigned end2) 
-diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h -index 89fc59dab57d..5f98e14e9470 100644 ---- a/include/linux/interrupt.h -+++ b/include/linux/interrupt.h -@@ -540,7 +540,7 @@ extern const char * const softirq_to_name[NR_SOFTIRQS]; - - struct softirq_action - { -- void (*action)(struct softirq_action *); -+ void (*action)(void); - }; - - asmlinkage void do_softirq(void); -@@ -555,7 +555,7 @@ static inline void do_softirq_own_stack(void) - } - #endif - --extern void open_softirq(int nr, void (*action)(struct softirq_action *)); -+extern void __init open_softirq(int nr, void (*action)(void)); - extern void softirq_init(void); - extern void __raise_softirq_irqoff(unsigned int nr); - -diff --git a/include/linux/kobject_ns.h b/include/linux/kobject_ns.h -index 069aa2ebef90..cb9e3637a620 100644 ---- a/include/linux/kobject_ns.h -+++ b/include/linux/kobject_ns.h -@@ -45,7 +45,7 @@ struct kobj_ns_type_operations { - void (*drop_ns)(void *); - }; - --int kobj_ns_type_register(const struct kobj_ns_type_operations *ops); -+int __init kobj_ns_type_register(const struct kobj_ns_type_operations *ops); - int kobj_ns_type_registered(enum kobj_ns_type type); - const struct kobj_ns_type_operations *kobj_child_ns_ops(struct kobject *parent); - const struct kobj_ns_type_operations *kobj_ns_ops(struct kobject *kobj); -diff --git a/include/linux/mm.h b/include/linux/mm.h -index b249d2e033aa..a4855777d1fa 100644 ---- a/include/linux/mm.h -+++ b/include/linux/mm.h -@@ -664,7 +664,7 @@ static inline int is_vmalloc_or_module_addr(const void *x) - } - #endif - --extern void *kvmalloc_node(size_t size, gfp_t flags, int node); -+extern void *kvmalloc_node(size_t size, gfp_t flags, int node) __attribute__((alloc_size(1))); - static inline void *kvmalloc(size_t size, gfp_t flags) - { - return kvmalloc_node(size, flags, NUMA_NO_NODE); -diff --git a/include/linux/percpu.h b/include/linux/percpu.h -index 5e76af742c80..9a6c682ec127 100644 ---- a/include/linux/percpu.h -+++ 
b/include/linux/percpu.h -@@ -123,7 +123,7 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size, - pcpu_fc_populate_pte_fn_t populate_pte_fn); - #endif - --extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align); -+extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align) __attribute__((alloc_size(1))); - extern bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr); - extern bool is_kernel_percpu_address(unsigned long addr); - -@@ -131,8 +131,8 @@ extern bool is_kernel_percpu_address(unsigned long addr); - extern void __init setup_per_cpu_areas(void); - #endif - --extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp); --extern void __percpu *__alloc_percpu(size_t size, size_t align); -+extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) __attribute__((alloc_size(1))); -+extern void __percpu *__alloc_percpu(size_t size, size_t align) __attribute__((alloc_size(1))); - extern void free_percpu(void __percpu *__pdata); - extern phys_addr_t per_cpu_ptr_to_phys(void *addr); - -diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h -index 68ccc5b1913b..a7565ea44938 100644 ---- a/include/linux/perf_event.h -+++ b/include/linux/perf_event.h -@@ -1241,6 +1241,11 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, - int perf_event_max_stack_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); - -+static inline bool perf_paranoid_any(void) -+{ -+ return sysctl_perf_event_paranoid > 2; -+} -+ - static inline bool perf_paranoid_tracepoint_raw(void) - { - return sysctl_perf_event_paranoid > -1; -diff --git a/include/linux/slab.h b/include/linux/slab.h -index 4d2a2fa55ed5..be3a8234edde 100644 ---- a/include/linux/slab.h -+++ b/include/linux/slab.h -@@ -184,8 +184,8 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *, struct mem_cgroup *); - /* - * Common kmalloc functions 
provided by all allocators - */ --void * __must_check __krealloc(const void *, size_t, gfp_t); --void * __must_check krealloc(const void *, size_t, gfp_t); -+void * __must_check __krealloc(const void *, size_t, gfp_t) __attribute__((alloc_size(2))); -+void * __must_check krealloc(const void *, size_t, gfp_t) __attribute((alloc_size(2))); - void kfree(const void *); - void kzfree(const void *); - size_t __ksize(const void *); -@@ -390,7 +390,7 @@ static __always_inline unsigned int kmalloc_index(size_t size) - } - #endif /* !CONFIG_SLOB */ - --void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc; -+void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc __attribute__((alloc_size(1))); - void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment __malloc; - void kmem_cache_free(struct kmem_cache *, void *); - -@@ -414,7 +414,7 @@ static __always_inline void kfree_bulk(size_t size, void **p) - } - - #ifdef CONFIG_NUMA --void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc; -+void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc __attribute__((alloc_size(1))); - void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment __malloc; - #else - static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node) -@@ -539,7 +539,7 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags) - * Try really hard to succeed the allocation but fail - * eventually. 
- */ --static __always_inline void *kmalloc(size_t size, gfp_t flags) -+static __always_inline __attribute__((alloc_size(1))) void *kmalloc(size_t size, gfp_t flags) - { - if (__builtin_constant_p(size)) { - #ifndef CONFIG_SLOB -@@ -581,7 +581,7 @@ static __always_inline unsigned int kmalloc_size(unsigned int n) - return 0; - } - --static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) -+static __always_inline __attribute__((alloc_size(1))) void *kmalloc_node(size_t size, gfp_t flags, int node) - { - #ifndef CONFIG_SLOB - if (__builtin_constant_p(size) && -diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h -index d2153789bd9f..97da977d6060 100644 ---- a/include/linux/slub_def.h -+++ b/include/linux/slub_def.h -@@ -121,6 +121,11 @@ struct kmem_cache { - unsigned long random; - #endif - -+#ifdef CONFIG_SLAB_CANARY -+ unsigned long random_active; -+ unsigned long random_inactive; -+#endif -+ - #ifdef CONFIG_NUMA - /* - * Defragmentation by allocating from a remote node. 
-diff --git a/include/linux/string.h b/include/linux/string.h -index b6ccdc2c7f02..6d66b8740f90 100644 ---- a/include/linux/string.h -+++ b/include/linux/string.h -@@ -268,10 +268,16 @@ void __read_overflow2(void) __compiletime_error("detected read beyond size of ob - void __read_overflow3(void) __compiletime_error("detected read beyond size of object passed as 3rd parameter"); - void __write_overflow(void) __compiletime_error("detected write beyond size of object passed as 1st parameter"); - -+#ifdef CONFIG_FORTIFY_SOURCE_STRICT_STRING -+#define __string_size(p) __builtin_object_size(p, 1) -+#else -+#define __string_size(p) __builtin_object_size(p, 0) -+#endif -+ - #if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE) - __FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size) - { -- size_t p_size = __builtin_object_size(p, 0); -+ size_t p_size = __string_size(p); - if (__builtin_constant_p(size) && p_size < size) - __write_overflow(); - if (p_size < size) -@@ -281,7 +287,7 @@ __FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size) - - __FORTIFY_INLINE char *strcat(char *p, const char *q) - { -- size_t p_size = __builtin_object_size(p, 0); -+ size_t p_size = __string_size(p); - if (p_size == (size_t)-1) - return __builtin_strcat(p, q); - if (strlcat(p, q, p_size) >= p_size) -@@ -292,7 +298,7 @@ __FORTIFY_INLINE char *strcat(char *p, const char *q) - __FORTIFY_INLINE __kernel_size_t strlen(const char *p) - { - __kernel_size_t ret; -- size_t p_size = __builtin_object_size(p, 0); -+ size_t p_size = __string_size(p); - - /* Work around gcc excess stack consumption issue */ - if (p_size == (size_t)-1 || -@@ -307,7 +313,7 @@ __FORTIFY_INLINE __kernel_size_t strlen(const char *p) - extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen); - __FORTIFY_INLINE __kernel_size_t strnlen(const char *p, __kernel_size_t maxlen) - { -- size_t p_size = __builtin_object_size(p, 0); 
-+ size_t p_size = __string_size(p); - __kernel_size_t ret = __real_strnlen(p, maxlen < p_size ? maxlen : p_size); - if (p_size <= ret && maxlen != ret) - fortify_panic(__func__); -@@ -319,8 +325,8 @@ extern size_t __real_strlcpy(char *, const char *, size_t) __RENAME(strlcpy); - __FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size) - { - size_t ret; -- size_t p_size = __builtin_object_size(p, 0); -- size_t q_size = __builtin_object_size(q, 0); -+ size_t p_size = __string_size(p); -+ size_t q_size = __string_size(q); - if (p_size == (size_t)-1 && q_size == (size_t)-1) - return __real_strlcpy(p, q, size); - ret = strlen(q); -@@ -340,8 +346,8 @@ __FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size) - __FORTIFY_INLINE char *strncat(char *p, const char *q, __kernel_size_t count) - { - size_t p_len, copy_len; -- size_t p_size = __builtin_object_size(p, 0); -- size_t q_size = __builtin_object_size(q, 0); -+ size_t p_size = __string_size(p); -+ size_t q_size = __string_size(q); - if (p_size == (size_t)-1 && q_size == (size_t)-1) - return __builtin_strncat(p, q, count); - p_len = strlen(p); -@@ -454,8 +460,8 @@ __FORTIFY_INLINE void *kmemdup(const void *p, size_t size, gfp_t gfp) - /* defined after fortified strlen and memcpy to reuse them */ - __FORTIFY_INLINE char *strcpy(char *p, const char *q) - { -- size_t p_size = __builtin_object_size(p, 0); -- size_t q_size = __builtin_object_size(q, 0); -+ size_t p_size = __string_size(p); -+ size_t q_size = __string_size(q); - if (p_size == (size_t)-1 && q_size == (size_t)-1) - return __builtin_strcpy(p, q); - memcpy(p, q, strlen(q) + 1); -diff --git a/include/linux/tty.h b/include/linux/tty.h -index bfa4e2ee94a9..3e18d583fc8d 100644 ---- a/include/linux/tty.h -+++ b/include/linux/tty.h -@@ -14,6 +14,7 @@ - #include - #include - #include -+#include - - - /* -@@ -336,6 +337,7 @@ struct tty_struct { - /* If the tty has a pending do_SAK, queue it here - akpm */ - struct work_struct SAK_work; - struct 
tty_port *port; -+ struct user_namespace *owner_user_ns; - } __randomize_layout; - - /* Each of a tty's open files has private_data pointing to tty_file_private */ -@@ -345,6 +347,8 @@ struct tty_file_private { - struct list_head list; - }; - -+extern int tiocsti_restrict; -+ - /* tty magic number */ - #define TTY_MAGIC 0x5401 - -diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h -index 4e7809408073..0b58a5176a25 100644 ---- a/include/linux/vmalloc.h -+++ b/include/linux/vmalloc.h -@@ -88,19 +88,19 @@ static inline void vmalloc_init(void) - static inline unsigned long vmalloc_nr_pages(void) { return 0; } - #endif - --extern void *vmalloc(unsigned long size); --extern void *vzalloc(unsigned long size); --extern void *vmalloc_user(unsigned long size); --extern void *vmalloc_node(unsigned long size, int node); --extern void *vzalloc_node(unsigned long size, int node); --extern void *vmalloc_exec(unsigned long size); --extern void *vmalloc_32(unsigned long size); --extern void *vmalloc_32_user(unsigned long size); --extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); -+extern void *vmalloc(unsigned long size) __attribute__((alloc_size(1))); -+extern void *vzalloc(unsigned long size) __attribute__((alloc_size(1))); -+extern void *vmalloc_user(unsigned long size) __attribute__((alloc_size(1))); -+extern void *vmalloc_node(unsigned long size, int node) __attribute__((alloc_size(1))); -+extern void *vzalloc_node(unsigned long size, int node) __attribute__((alloc_size(1))); -+extern void *vmalloc_exec(unsigned long size) __attribute__((alloc_size(1))); -+extern void *vmalloc_32(unsigned long size) __attribute__((alloc_size(1))); -+extern void *vmalloc_32_user(unsigned long size) __attribute__((alloc_size(1))); -+extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) __attribute__((alloc_size(1))); - extern void *__vmalloc_node_range(unsigned long size, unsigned long align, - unsigned long start, unsigned long end, gfp_t 
gfp_mask, - pgprot_t prot, unsigned long vm_flags, int node, -- const void *caller); -+ const void *caller) __attribute__((alloc_size(1))); - #ifndef CONFIG_MMU - extern void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags); - static inline void *__vmalloc_node_flags_caller(unsigned long size, int node, -diff --git a/init/Kconfig b/init/Kconfig -index b4daad2bac23..c1016fd960f0 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -381,6 +381,7 @@ config USELIB - config AUDIT - bool "Auditing support" - depends on NET -+ default y - help - Enable auditing infrastructure that can be used with another - kernel subsystem, such as SELinux (which requires this for -@@ -1118,6 +1119,22 @@ config USER_NS - - If unsure, say N. - -+config USER_NS_UNPRIVILEGED -+ bool "Allow unprivileged users to create namespaces" -+ depends on USER_NS -+ default n -+ help -+ When disabled, unprivileged users will not be able to create -+ new namespaces. Allowing users to create their own namespaces -+ has been part of several recent local privilege escalation -+ exploits, so if you need user namespaces but are -+ paranoid^Wsecurity-conscious you want to disable this. -+ -+ This setting can be overridden at runtime via the -+ kernel.unprivileged_userns_clone sysctl. -+ -+ If unsure, say N. -+ - config PID_NS - bool "PID Namespaces" - default y -@@ -1538,8 +1555,7 @@ config SHMEM - which may be appropriate on small systems without swap. - - config AIO -- bool "Enable AIO support" if EXPERT -- default y -+ bool "Enable AIO support" - help - This option enables POSIX asynchronous I/O which may by used - by some high performance threaded applications. Disabling -@@ -1650,6 +1666,23 @@ config USERFAULTFD - Enable the userfaultfd() system call that allows to intercept and - handle page faults in userland. 
- -+config USERFAULTFD_UNPRIVILEGED -+ bool "Allow unprivileged users to use the userfaultfd syscall" -+ depends on USERFAULTFD -+ default n -+ help -+ When disabled, unprivileged users will not be able to use the userfaultfd -+ syscall. Userfaultfd provide attackers with a way to stall a kernel -+ thread in the middle of memory accesses from userspace by initiating an -+ access on an unmapped page. To avoid various heap grooming and heap -+ spraying techniques for exploiting use-after-free flaws this should be -+ disabled by default. -+ -+ This setting can be overridden at runtime via the -+ vm.unprivileged_userfaultfd sysctl. -+ -+ If unsure, say N. -+ - config ARCH_HAS_MEMBARRIER_CALLBACKS - bool - -@@ -1762,7 +1795,7 @@ config VM_EVENT_COUNTERS - - config SLUB_DEBUG - default y -- bool "Enable SLUB debugging support" if EXPERT -+ bool "Enable SLUB debugging support" - depends on SLUB && SYSFS - help - SLUB has extensive debug support features. Disabling these can -@@ -1786,7 +1819,6 @@ config SLUB_MEMCG_SYSFS_ON - - config COMPAT_BRK - bool "Disable heap randomization" -- default y - help - Randomizing heap placement makes heap exploits harder, but it - also breaks ancient binaries (including anything libc5 based). -@@ -1833,7 +1865,6 @@ endchoice - - config SLAB_MERGE_DEFAULT - bool "Allow slab caches to be merged" -- default y - help - For reduced kernel memory fragmentation, slab caches can be - merged when they share the same size and other characteristics. -@@ -1846,9 +1877,9 @@ config SLAB_MERGE_DEFAULT - command line. - - config SLAB_FREELIST_RANDOM -- default n - depends on SLAB || SLUB - bool "SLAB freelist randomization" -+ default y - help - Randomizes the freelist order used on creating new pages. 
This - security feature reduces the predictability of the kernel slab -@@ -1857,12 +1888,30 @@ config SLAB_FREELIST_RANDOM - config SLAB_FREELIST_HARDENED - bool "Harden slab freelist metadata" - depends on SLUB -+ default y - help - Many kernel heap attacks try to target slab cache metadata and - other infrastructure. This options makes minor performance - sacrifices to harden the kernel slab allocator against common - freelist exploit methods. - -+config SLAB_CANARY -+ depends on SLUB -+ depends on !SLAB_MERGE_DEFAULT -+ bool "SLAB canaries" -+ default y -+ help -+ Place canaries at the end of kernel slab allocations, sacrificing -+ some performance and memory usage for security. -+ -+ Canaries can detect some forms of heap corruption when allocations -+ are freed and as part of the HARDENED_USERCOPY feature. It provides -+ basic use-after-free detection for HARDENED_USERCOPY. -+ -+ Canaries absorb small overflows (rendering them harmless), mitigate -+ non-NUL terminated C string overflows on 64-bit via a guaranteed zero -+ byte and provide basic double-free detection. -+ - config SHUFFLE_PAGE_ALLOCATOR - bool "Page allocator randomization" - default SLAB_FREELIST_RANDOM && ACPI_NUMA -diff --git a/kernel/audit.c b/kernel/audit.c -index da8dc0db5bd3..62dda6867dd9 100644 ---- a/kernel/audit.c -+++ b/kernel/audit.c -@@ -1628,6 +1628,9 @@ static int __init audit_enable(char *str) - - if (audit_default == AUDIT_OFF) - audit_initialized = AUDIT_DISABLED; -+ else if (!audit_ever_enabled) -+ audit_initialized = AUDIT_UNINITIALIZED; -+ - if (audit_set_enabled(audit_default)) - pr_err("audit: error setting audit state (%d)\n", - audit_default); -diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c -index ef0e1e3e66f4..d1ddc8695ab8 100644 ---- a/kernel/bpf/core.c -+++ b/kernel/bpf/core.c -@@ -519,7 +519,7 @@ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp) - #ifdef CONFIG_BPF_JIT - /* All BPF JIT sysctl knobs here. 
*/ - int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON); --int bpf_jit_harden __read_mostly; -+int bpf_jit_harden __read_mostly = 2; - int bpf_jit_kallsyms __read_mostly; - long bpf_jit_limit __read_mostly; - -diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c -index ace1cfaa24b6..37e08fc44a6b 100644 ---- a/kernel/bpf/syscall.c -+++ b/kernel/bpf/syscall.c -@@ -39,7 +39,7 @@ static DEFINE_SPINLOCK(prog_idr_lock); - static DEFINE_IDR(map_idr); - static DEFINE_SPINLOCK(map_idr_lock); - --int sysctl_unprivileged_bpf_disabled __read_mostly; -+int sysctl_unprivileged_bpf_disabled __read_mostly = 1; - - static const struct bpf_map_ops * const bpf_map_types[] = { - #define BPF_PROG_TYPE(_id, _ops) -diff --git a/kernel/capability.c b/kernel/capability.c -index 1444f3954d75..8cc9dd7992f2 100644 ---- a/kernel/capability.c -+++ b/kernel/capability.c -@@ -449,6 +449,12 @@ bool capable(int cap) - return ns_capable(&init_user_ns, cap); - } - EXPORT_SYMBOL(capable); -+ -+bool capable_noaudit(int cap) -+{ -+ return ns_capable_noaudit(&init_user_ns, cap); -+} -+EXPORT_SYMBOL(capable_noaudit); - #endif /* CONFIG_MULTIUSER */ - - /** -diff --git a/kernel/events/core.c b/kernel/events/core.c -index 6c829e22bad3..3063a7239a94 100644 ---- a/kernel/events/core.c -+++ b/kernel/events/core.c -@@ -398,8 +398,13 @@ static cpumask_var_t perf_online_mask; - * 0 - disallow raw tracepoint access for unpriv - * 1 - disallow cpu events for unpriv - * 2 - disallow kernel profiling for unpriv -+ * 3 - disallow all unpriv perf event use - */ -+#ifdef CONFIG_SECURITY_PERF_EVENTS_RESTRICT -+int sysctl_perf_event_paranoid __read_mostly = 3; -+#else - int sysctl_perf_event_paranoid __read_mostly = 2; -+#endif - - /* Minimum for 512 kiB + 1 user control page */ - int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ -@@ -10895,6 +10900,9 @@ SYSCALL_DEFINE5(perf_event_open, - if (flags & ~PERF_FLAG_ALL) - return -EINVAL; - -+ if 
(perf_paranoid_any() && !capable(CAP_SYS_ADMIN)) -+ return -EACCES; -+ - err = perf_copy_attr(attr_uptr, &attr); - if (err) - return err; -diff --git a/kernel/fork.c b/kernel/fork.c -index 755d8160e001..ed909f8050b2 100644 ---- a/kernel/fork.c -+++ b/kernel/fork.c -@@ -106,6 +106,11 @@ - - #define CREATE_TRACE_POINTS - #include -+#ifdef CONFIG_USER_NS -+extern int unprivileged_userns_clone; -+#else -+#define unprivileged_userns_clone 0 -+#endif - - /* - * Minimum number of threads to boot the kernel -@@ -1779,6 +1784,10 @@ static __latent_entropy struct task_struct *copy_process( - if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) - return ERR_PTR(-EINVAL); - -+ if ((clone_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) -+ if (!capable(CAP_SYS_ADMIN)) -+ return ERR_PTR(-EPERM); -+ - /* - * Thread groups must share signals as well, and detached threads - * can only be started up within the thread group. -@@ -2836,6 +2845,12 @@ int ksys_unshare(unsigned long unshare_flags) - if (unshare_flags & CLONE_NEWNS) - unshare_flags |= CLONE_FS; - -+ if ((unshare_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) { -+ err = -EPERM; -+ if (!capable(CAP_SYS_ADMIN)) -+ goto bad_unshare_out; -+ } -+ - err = check_unshare_flags(unshare_flags); - if (err) - goto bad_unshare_out; -diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c -index 477b4eb44af5..db28cc3fd301 100644 ---- a/kernel/rcu/tiny.c -+++ b/kernel/rcu/tiny.c -@@ -74,7 +74,7 @@ void rcu_sched_clock_irq(int user) - } - - /* Invoke the RCU callbacks whose grace period has elapsed. 
*/ --static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) -+static __latent_entropy void rcu_process_callbacks(void) - { - struct rcu_head *next, *list; - unsigned long flags; -diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c -index 81105141b6a8..38f04f653d29 100644 ---- a/kernel/rcu/tree.c -+++ b/kernel/rcu/tree.c -@@ -2381,7 +2381,7 @@ static __latent_entropy void rcu_core(void) - trace_rcu_utilization(TPS("End RCU core")); - } - --static void rcu_core_si(struct softirq_action *h) -+static void rcu_core_si(void) - { - rcu_core(); - } -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index c87a798d1456..341c384cc597 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -9889,7 +9889,7 @@ int newidle_balance(struct rq *this_rq, struct rq_flags *rf) - * run_rebalance_domains is triggered when needed from the scheduler tick. - * Also triggered for nohz idle balancing (with nohz_balancing_kick set). - */ --static __latent_entropy void run_rebalance_domains(struct softirq_action *h) -+static __latent_entropy void run_rebalance_domains(void) - { - struct rq *this_rq = this_rq(); - enum cpu_idle_type idle = this_rq->idle_balance ? 
-diff --git a/kernel/softirq.c b/kernel/softirq.c -index 0427a86743a4..5e6a9b4ccb41 100644 ---- a/kernel/softirq.c -+++ b/kernel/softirq.c -@@ -52,7 +52,7 @@ DEFINE_PER_CPU_ALIGNED(irq_cpustat_t, irq_stat); - EXPORT_PER_CPU_SYMBOL(irq_stat); - #endif - --static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; -+static struct softirq_action softirq_vec[NR_SOFTIRQS] __ro_after_init __aligned(PAGE_SIZE); - - DEFINE_PER_CPU(struct task_struct *, ksoftirqd); - -@@ -289,7 +289,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) - kstat_incr_softirqs_this_cpu(vec_nr); - - trace_softirq_entry(vec_nr); -- h->action(h); -+ h->action(); - trace_softirq_exit(vec_nr); - if (unlikely(prev_count != preempt_count())) { - pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n", -@@ -452,7 +452,7 @@ void __raise_softirq_irqoff(unsigned int nr) - or_softirq_pending(1UL << nr); - } - --void open_softirq(int nr, void (*action)(struct softirq_action *)) -+void __init open_softirq(int nr, void (*action)(void)) - { - softirq_vec[nr].action = action; - } -@@ -498,8 +498,7 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) - } - EXPORT_SYMBOL(__tasklet_hi_schedule); - --static void tasklet_action_common(struct softirq_action *a, -- struct tasklet_head *tl_head, -+static void tasklet_action_common(struct tasklet_head *tl_head, - unsigned int softirq_nr) - { - struct tasklet_struct *list; -@@ -536,14 +535,14 @@ static void tasklet_action_common(struct softirq_action *a, - } - } - --static __latent_entropy void tasklet_action(struct softirq_action *a) -+static __latent_entropy void tasklet_action(void) - { -- tasklet_action_common(a, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ); -+ tasklet_action_common(this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ); - } - --static __latent_entropy void tasklet_hi_action(struct softirq_action *a) -+static __latent_entropy void tasklet_hi_action(void) - { -- tasklet_action_common(a, 
this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ); -+ tasklet_action_common(this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ); - } - - void tasklet_init(struct tasklet_struct *t, -diff --git a/kernel/sysctl.c b/kernel/sysctl.c -index 70665934d53e..8ea67d08b926 100644 ---- a/kernel/sysctl.c -+++ b/kernel/sysctl.c -@@ -68,6 +68,7 @@ - #include - #include - #include -+#include - - #include "../lib/kstrtox.h" - -@@ -104,12 +105,19 @@ - #if defined(CONFIG_SYSCTL) - - /* External variables not in a header file. */ -+#if IS_ENABLED(CONFIG_USB) -+int deny_new_usb __read_mostly = 0; -+EXPORT_SYMBOL(deny_new_usb); -+#endif - extern int suid_dumpable; - #ifdef CONFIG_COREDUMP - extern int core_uses_pid; - extern char core_pattern[]; - extern unsigned int core_pipe_limit; - #endif -+#ifdef CONFIG_USER_NS -+extern int unprivileged_userns_clone; -+#endif - extern int pid_max; - extern int pid_max_min, pid_max_max; - extern int percpu_pagelist_fraction; -@@ -121,32 +129,32 @@ extern int sysctl_nr_trim_pages; - - /* Constants used for minimum and maximum */ - #ifdef CONFIG_LOCKUP_DETECTOR --static int sixty = 60; -+static int sixty __read_only = 60; - #endif - --static int __maybe_unused neg_one = -1; --static int __maybe_unused two = 2; --static int __maybe_unused four = 4; --static unsigned long zero_ul; --static unsigned long one_ul = 1; --static unsigned long long_max = LONG_MAX; --static int one_hundred = 100; --static int one_thousand = 1000; -+static int __maybe_unused neg_one __read_only = -1; -+static int __maybe_unused two __read_only = 2; -+static int __maybe_unused four __read_only = 4; -+static unsigned long zero_ul __read_only; -+static unsigned long one_ul __read_only = 1; -+static unsigned long long_max __read_only = LONG_MAX; -+static int one_hundred __read_only = 100; -+static int one_thousand __read_only = 1000; - #ifdef CONFIG_PRINTK --static int ten_thousand = 10000; -+static int ten_thousand __read_only = 10000; - #endif - #ifdef CONFIG_PERF_EVENTS --static int 
six_hundred_forty_kb = 640 * 1024; -+static int six_hundred_forty_kb __read_only = 640 * 1024; - #endif - - /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ --static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; -+static unsigned long dirty_bytes_min __read_only = 2 * PAGE_SIZE; - - /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ --static int maxolduid = 65535; --static int minolduid; -+static int maxolduid __read_only = 65535; -+static int minolduid __read_only; - --static int ngroups_max = NGROUPS_MAX; -+static int ngroups_max __read_only = NGROUPS_MAX; - static const int cap_last_cap = CAP_LAST_CAP; - - /* -@@ -154,9 +162,12 @@ static const int cap_last_cap = CAP_LAST_CAP; - * and hung_task_check_interval_secs - */ - #ifdef CONFIG_DETECT_HUNG_TASK --static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); -+static unsigned long hung_task_timeout_max __read_only = (LONG_MAX/HZ); - #endif - -+int device_sidechannel_restrict __read_mostly = 1; -+EXPORT_SYMBOL(device_sidechannel_restrict); -+ - #ifdef CONFIG_INOTIFY_USER - #include - #endif -@@ -301,19 +312,19 @@ static struct ctl_table sysctl_base_table[] = { - }; - - #ifdef CONFIG_SCHED_DEBUG --static int min_sched_granularity_ns = 100000; /* 100 usecs */ --static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ --static int min_wakeup_granularity_ns; /* 0 usecs */ --static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ -+static int min_sched_granularity_ns __read_only = 100000; /* 100 usecs */ -+static int max_sched_granularity_ns __read_only = NSEC_PER_SEC; /* 1 second */ -+static int min_wakeup_granularity_ns __read_only; /* 0 usecs */ -+static int max_wakeup_granularity_ns __read_only = NSEC_PER_SEC; /* 1 second */ - #ifdef CONFIG_SMP --static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; --static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; -+static int min_sched_tunable_scaling __read_only = 
SCHED_TUNABLESCALING_NONE; -+static int max_sched_tunable_scaling __read_only = SCHED_TUNABLESCALING_END-1; - #endif /* CONFIG_SMP */ - #endif /* CONFIG_SCHED_DEBUG */ - - #ifdef CONFIG_COMPACTION --static int min_extfrag_threshold; --static int max_extfrag_threshold = 1000; -+static int min_extfrag_threshold __read_only; -+static int max_extfrag_threshold __read_only = 1000; - #endif - - static struct ctl_table kern_table[] = { -@@ -546,6 +557,15 @@ static struct ctl_table kern_table[] = { - .proc_handler = proc_dointvec, - }, - #endif -+#ifdef CONFIG_USER_NS -+ { -+ .procname = "unprivileged_userns_clone", -+ .data = &unprivileged_userns_clone, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = proc_dointvec, -+ }, -+#endif - #ifdef CONFIG_PROC_SYSCTL - { - .procname = "tainted", -@@ -901,6 +921,37 @@ static struct ctl_table kern_table[] = { - .extra1 = SYSCTL_ZERO, - .extra2 = &two, - }, -+#endif -+#if defined CONFIG_TTY -+ { -+ .procname = "tiocsti_restrict", -+ .data = &tiocsti_restrict, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = proc_dointvec_minmax_sysadmin, -+ .extra1 = SYSCTL_ZERO, -+ .extra2 = SYSCTL_ONE, -+ }, -+#endif -+ { -+ .procname = "device_sidechannel_restrict", -+ .data = &device_sidechannel_restrict, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = proc_dointvec_minmax_sysadmin, -+ .extra1 = SYSCTL_ZERO, -+ .extra2 = SYSCTL_ONE, -+ }, -+#if IS_ENABLED(CONFIG_USB) -+ { -+ .procname = "deny_new_usb", -+ .data = &deny_new_usb, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = proc_dointvec_minmax_sysadmin, -+ .extra1 = SYSCTL_ZERO, -+ .extra2 = SYSCTL_ONE, -+ }, - #endif - { - .procname = "ngroups_max", -diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c -index 7f31932216a1..9ede224fc81f 100644 ---- a/kernel/time/hrtimer.c -+++ b/kernel/time/hrtimer.c -@@ -1583,7 +1583,7 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, - } - } - --static 
__latent_entropy void hrtimer_run_softirq(struct softirq_action *h) -+static __latent_entropy void hrtimer_run_softirq(void) - { - struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); - unsigned long flags; -diff --git a/kernel/time/timer.c b/kernel/time/timer.c -index 4820823515e9..1a61e5aa87ae 100644 ---- a/kernel/time/timer.c -+++ b/kernel/time/timer.c -@@ -1779,7 +1779,7 @@ static inline void __run_timers(struct timer_base *base) - /* - * This function runs timers and the timer-tq in bottom half context. - */ --static __latent_entropy void run_timer_softirq(struct softirq_action *h) -+static __latent_entropy void run_timer_softirq(void) - { - struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); - -diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c -index 8eadadc478f9..c36ecd19562c 100644 ---- a/kernel/user_namespace.c -+++ b/kernel/user_namespace.c -@@ -21,6 +21,13 @@ - #include - #include - -+/* sysctl */ -+#ifdef CONFIG_USER_NS_UNPRIVILEGED -+int unprivileged_userns_clone = 1; -+#else -+int unprivileged_userns_clone; -+#endif -+ - static struct kmem_cache *user_ns_cachep __read_mostly; - static DEFINE_MUTEX(userns_state_mutex); - -diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug -index 93d97f9b0157..fb923cae2120 100644 ---- a/lib/Kconfig.debug -+++ b/lib/Kconfig.debug -@@ -352,6 +352,9 @@ config SECTION_MISMATCH_WARN_ONLY - - If unsure, say Y. - -+config DEBUG_WRITABLE_FUNCTION_POINTERS_VERBOSE -+ bool "Enable verbose reporting of writable function pointers" -+ - # - # Select this config option from the architecture Kconfig, if it - # is preferred to always offer frame pointers as a config -@@ -974,6 +977,7 @@ endmenu # "Debug lockups and hangs" - - config PANIC_ON_OOPS - bool "Panic on Oops" -+ default y - help - Say Y here to enable the kernel to panic when it oopses. 
This - has the same effect as setting oops=panic on the kernel command -@@ -983,7 +987,7 @@ config PANIC_ON_OOPS - anything erroneous after an oops which could result in data - corruption or other issues. - -- Say N if unsure. -+ Say Y if unsure. - - config PANIC_ON_OOPS_VALUE - int -@@ -1352,6 +1356,7 @@ config DEBUG_BUGVERBOSE - config DEBUG_LIST - bool "Debug linked list manipulation" - depends on DEBUG_KERNEL || BUG_ON_DATA_CORRUPTION -+ default y - help - Enable this to turn on extended checks in the linked-list - walking routines. -@@ -2073,6 +2078,7 @@ config MEMTEST - config BUG_ON_DATA_CORRUPTION - bool "Trigger a BUG when data corruption is detected" - select DEBUG_LIST -+ default y - help - Select this option if the kernel should BUG when it encounters - data corruption in kernel memory structures when they get checked -@@ -2112,6 +2118,7 @@ config STRICT_DEVMEM - config IO_STRICT_DEVMEM - bool "Filter I/O access to /dev/mem" - depends on STRICT_DEVMEM -+ default y - ---help--- - If this option is disabled, you allow userspace (root) access to all - io-memory regardless of whether a driver is actively using that -diff --git a/lib/irq_poll.c b/lib/irq_poll.c -index 2f17b488d58e..b6e7996a0058 100644 ---- a/lib/irq_poll.c -+++ b/lib/irq_poll.c -@@ -75,7 +75,7 @@ void irq_poll_complete(struct irq_poll *iop) - } - EXPORT_SYMBOL(irq_poll_complete); - --static void __latent_entropy irq_poll_softirq(struct softirq_action *h) -+static void __latent_entropy irq_poll_softirq(void) - { - struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll); - int rearm = 0, budget = irq_poll_budget; -diff --git a/lib/kobject.c b/lib/kobject.c -index 83198cb37d8d..4a053b7aef42 100644 ---- a/lib/kobject.c -+++ b/lib/kobject.c -@@ -1009,9 +1009,9 @@ EXPORT_SYMBOL_GPL(kset_create_and_add); - - - static DEFINE_SPINLOCK(kobj_ns_type_lock); --static const struct kobj_ns_type_operations *kobj_ns_ops_tbl[KOBJ_NS_TYPES]; -+static const struct kobj_ns_type_operations 
*kobj_ns_ops_tbl[KOBJ_NS_TYPES] __ro_after_init; - --int kobj_ns_type_register(const struct kobj_ns_type_operations *ops) -+int __init kobj_ns_type_register(const struct kobj_ns_type_operations *ops) - { - enum kobj_ns_type type = ops->type; - int error; -diff --git a/lib/nlattr.c b/lib/nlattr.c -index cace9b307781..39ba1387045d 100644 ---- a/lib/nlattr.c -+++ b/lib/nlattr.c -@@ -571,6 +571,8 @@ int nla_memcpy(void *dest, const struct nlattr *src, int count) - { - int minlen = min_t(int, count, nla_len(src)); - -+ BUG_ON(minlen < 0); -+ - memcpy(dest, nla_data(src), minlen); - if (count > minlen) - memset(dest + minlen, 0, count - minlen); -diff --git a/lib/vsprintf.c b/lib/vsprintf.c -index e78017a3e1bd..ac5a5b5a439b 100644 ---- a/lib/vsprintf.c -+++ b/lib/vsprintf.c -@@ -771,7 +771,7 @@ static char *ptr_to_id(char *buf, char *end, const void *ptr, - return pointer_string(buf, end, (const void *)hashval, spec); - } - --int kptr_restrict __read_mostly; -+int kptr_restrict __read_mostly = 2; - - static noinline_for_stack - char *restricted_pointer(char *buf, char *end, const void *ptr, -diff --git a/mm/Kconfig b/mm/Kconfig -index a5dae9a7eb51..0a3070c5a125 100644 ---- a/mm/Kconfig -+++ b/mm/Kconfig -@@ -303,7 +303,8 @@ config KSM - config DEFAULT_MMAP_MIN_ADDR - int "Low address space to protect from user allocation" - depends on MMU -- default 4096 -+ default 32768 if ARM || (ARM64 && COMPAT) -+ default 65536 - help - This is the portion of low virtual memory which should be protected - from userspace allocation. 
Keeping a user from writing to low pages -diff --git a/mm/mmap.c b/mm/mmap.c -index 4390dbea4aa5..076fd46af68c 100644 ---- a/mm/mmap.c -+++ b/mm/mmap.c -@@ -230,6 +230,13 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) - - newbrk = PAGE_ALIGN(brk); - oldbrk = PAGE_ALIGN(mm->brk); -+ /* properly handle unaligned min_brk as an empty heap */ -+ if (min_brk & ~PAGE_MASK) { -+ if (brk == min_brk) -+ newbrk -= PAGE_SIZE; -+ if (mm->brk == min_brk) -+ oldbrk -= PAGE_SIZE; -+ } - if (oldbrk == newbrk) { - mm->brk = brk; - goto success; -diff --git a/mm/page_alloc.c b/mm/page_alloc.c -index 45e39131a716..78b4865f8a1c 100644 ---- a/mm/page_alloc.c -+++ b/mm/page_alloc.c -@@ -68,6 +68,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -106,6 +107,15 @@ struct pcpu_drain { - DEFINE_MUTEX(pcpu_drain_mutex); - DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain); - -+bool __meminitdata extra_latent_entropy; -+ -+static int __init setup_extra_latent_entropy(char *str) -+{ -+ extra_latent_entropy = true; -+ return 0; -+} -+early_param("extra_latent_entropy", setup_extra_latent_entropy); -+ - #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY - volatile unsigned long latent_entropy __latent_entropy; - EXPORT_SYMBOL(latent_entropy); -@@ -1427,6 +1437,25 @@ static void __free_pages_ok(struct page *page, unsigned int order) - local_irq_restore(flags); - } - -+static void __init __gather_extra_latent_entropy(struct page *page, -+ unsigned int nr_pages) -+{ -+ if (extra_latent_entropy && !PageHighMem(page) && page_to_pfn(page) < 0x100000) { -+ unsigned long hash = 0; -+ size_t index, end = PAGE_SIZE * nr_pages / sizeof hash; -+ const unsigned long *data = lowmem_page_address(page); -+ -+ for (index = 0; index < end; index++) -+ hash ^= hash + data[index]; -+#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY -+ latent_entropy ^= hash; -+ add_device_randomness((const void *)&latent_entropy, sizeof(latent_entropy)); -+#else -+ add_device_randomness((const void *)&hash, sizeof(hash)); -+#endif -+ 
} -+} -+ - void __free_pages_core(struct page *page, unsigned int order) - { - unsigned int nr_pages = 1 << order; -@@ -1441,7 +1470,6 @@ void __free_pages_core(struct page *page, unsigned int order) - } - __ClearPageReserved(p); - set_page_count(p, 0); -- - atomic_long_add(nr_pages, &page_zone(page)->managed_pages); - set_page_refcounted(page); - __free_pages(page, order); -@@ -1492,6 +1520,7 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn, - { - if (early_page_uninitialised(pfn)) - return; -+ __gather_extra_latent_entropy(page, 1 << order); - __free_pages_core(page, order); - } - -@@ -1582,6 +1611,7 @@ static void __init deferred_free_range(unsigned long pfn, - if (nr_pages == pageblock_nr_pages && - (pfn & (pageblock_nr_pages - 1)) == 0) { - set_pageblock_migratetype(page, MIGRATE_MOVABLE); -+ __gather_extra_latent_entropy(page, 1 << pageblock_order); - __free_pages_core(page, pageblock_order); - return; - } -@@ -1589,6 +1619,7 @@ static void __init deferred_free_range(unsigned long pfn, - for (i = 0; i < nr_pages; i++, page++, pfn++) { - if ((pfn & (pageblock_nr_pages - 1)) == 0) - set_pageblock_migratetype(page, MIGRATE_MOVABLE); -+ __gather_extra_latent_entropy(page, 1); - __free_pages_core(page, 0); - } - } -@@ -2156,6 +2187,12 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags - { - post_alloc_hook(page, order, gfp_flags); - -+ if (IS_ENABLED(CONFIG_PAGE_SANITIZE_VERIFY) && want_init_on_free()) { -+ int i; -+ for (i = 0; i < (1 << order); i++) -+ verify_zero_highpage(page + i); -+ } -+ - if (!free_pages_prezeroed() && want_init_on_alloc(gfp_flags)) - kernel_init_free_pages(page, 1 << order); - -diff --git a/mm/slab.h b/mm/slab.h -index b2b01694dc43..b531661095a2 100644 ---- a/mm/slab.h -+++ b/mm/slab.h -@@ -470,9 +470,13 @@ static inline struct kmem_cache *virt_to_cache(const void *obj) - struct page *page; - - page = virt_to_head_page(obj); -+#ifdef CONFIG_BUG_ON_DATA_CORRUPTION -+ 
BUG_ON(!PageSlab(page)); -+#else - if (WARN_ONCE(!PageSlab(page), "%s: Object is not a Slab page!\n", - __func__)) - return NULL; -+#endif - return page->slab_cache; - } - -@@ -518,9 +522,14 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) - return s; - - cachep = virt_to_cache(x); -- WARN_ONCE(cachep && !slab_equal_or_root(cachep, s), -- "%s: Wrong slab cache. %s but object is from %s\n", -- __func__, s->name, cachep->name); -+ if (cachep && !slab_equal_or_root(cachep, s)) { -+#ifdef CONFIG_BUG_ON_DATA_CORRUPTION -+ BUG(); -+#else -+ WARN_ONCE(1, "%s: Wrong slab cache. %s but object is from %s\n", -+ __func__, s->name, cachep->name); -+#endif -+ } - return cachep; - } - -@@ -545,7 +554,7 @@ static inline size_t slab_ksize(const struct kmem_cache *s) - * back there or track user information then we can - * only use the space before that information. - */ -- if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) -+ if ((s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) || IS_ENABLED(CONFIG_SLAB_CANARY)) - return s->inuse; - /* - * Else we can use all the padding etc for the allocation -@@ -674,8 +683,10 @@ static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { } - static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c) - { - if (static_branch_unlikely(&init_on_alloc)) { -+#ifndef CONFIG_SLUB - if (c->ctor) - return false; -+#endif - if (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) - return flags & __GFP_ZERO; - return true; -@@ -685,9 +696,15 @@ static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c) - - static inline bool slab_want_init_on_free(struct kmem_cache *c) - { -- if (static_branch_unlikely(&init_on_free)) -- return !(c->ctor || -- (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON))); -+ if (static_branch_unlikely(&init_on_free)) { -+#ifndef CONFIG_SLUB -+ if (c->ctor) -+ return false; -+#endif -+ if (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) -+ 
return false; -+ return true; -+ } - return false; - } - -diff --git a/mm/slab_common.c b/mm/slab_common.c -index ade6c257d4b4..f8f9ebd51296 100644 ---- a/mm/slab_common.c -+++ b/mm/slab_common.c -@@ -28,10 +28,10 @@ - - #include "slab.h" - --enum slab_state slab_state; -+enum slab_state slab_state __ro_after_init; - LIST_HEAD(slab_caches); - DEFINE_MUTEX(slab_mutex); --struct kmem_cache *kmem_cache; -+struct kmem_cache *kmem_cache __ro_after_init; - - #ifdef CONFIG_HARDENED_USERCOPY - bool usercopy_fallback __ro_after_init = -@@ -59,7 +59,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work, - /* - * Merge control. If this is set then no merging of slab caches will occur. - */ --static bool slab_nomerge = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT); -+static bool slab_nomerge __ro_after_init = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT); - - static int __init setup_slab_nomerge(char *str) - { -diff --git a/mm/slub.c b/mm/slub.c -index 20d72cb20515..6690bce322a4 100644 ---- a/mm/slub.c -+++ b/mm/slub.c -@@ -125,6 +125,12 @@ static inline int kmem_cache_debug(struct kmem_cache *s) - #endif - } - -+static inline bool has_sanitize_verify(struct kmem_cache *s) -+{ -+ return IS_ENABLED(CONFIG_SLAB_SANITIZE_VERIFY) && -+ slab_want_init_on_free(s); -+} -+ - void *fixup_red_left(struct kmem_cache *s, void *p) - { - if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) -@@ -309,6 +315,35 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) - *(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr); - } - -+#ifdef CONFIG_SLAB_CANARY -+static inline unsigned long *get_canary(struct kmem_cache *s, void *object) -+{ -+ if (s->offset) -+ return object + s->offset + sizeof(void *); -+ return object + s->inuse; -+} -+ -+static inline unsigned long get_canary_value(const void *canary, unsigned long value) -+{ -+ return (value ^ (unsigned long)canary) & CANARY_MASK; -+} -+ -+static inline void set_canary(struct kmem_cache *s, void *object, unsigned long 
value) -+{ -+ unsigned long *canary = get_canary(s, object); -+ *canary = get_canary_value(canary, value); -+} -+ -+static inline void check_canary(struct kmem_cache *s, void *object, unsigned long value) -+{ -+ unsigned long *canary = get_canary(s, object); -+ BUG_ON(*canary != get_canary_value(canary, value)); -+} -+#else -+#define set_canary(s, object, value) -+#define check_canary(s, object, value) -+#endif -+ - /* Loop over all objects in a slab */ - #define for_each_object(__p, __s, __addr, __objects) \ - for (__p = fixup_red_left(__s, __addr); \ -@@ -476,13 +511,13 @@ static inline void *restore_red_left(struct kmem_cache *s, void *p) - * Debug settings: - */ - #if defined(CONFIG_SLUB_DEBUG_ON) --static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS; -+static slab_flags_t slub_debug __ro_after_init = DEBUG_DEFAULT_FLAGS; - #else --static slab_flags_t slub_debug; -+static slab_flags_t slub_debug __ro_after_init; - #endif - --static char *slub_debug_slabs; --static int disable_higher_order_debug; -+static char *slub_debug_slabs __ro_after_init; -+static int disable_higher_order_debug __ro_after_init; - - /* - * slub is about to manipulate internal object metadata. This memory lies -@@ -543,6 +578,9 @@ static struct track *get_track(struct kmem_cache *s, void *object, - else - p = object + s->inuse; - -+ if (IS_ENABLED(CONFIG_SLAB_CANARY)) -+ p = (void *)p + sizeof(void *); -+ - return p + alloc; - } - -@@ -673,6 +711,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) - else - off = s->inuse; - -+ if (IS_ENABLED(CONFIG_SLAB_CANARY)) -+ off += sizeof(void *); -+ - if (s->flags & SLAB_STORE_USER) - off += 2 * sizeof(struct track); - -@@ -802,6 +843,9 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) - /* Freepointer is placed after the object. 
*/ - off += sizeof(void *); - -+ if (IS_ENABLED(CONFIG_SLAB_CANARY)) -+ off += sizeof(void *); -+ - if (s->flags & SLAB_STORE_USER) - /* We also have user information there */ - off += 2 * sizeof(struct track); -@@ -1441,6 +1485,8 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, - object = next; - next = get_freepointer(s, object); - -+ check_canary(s, object, s->random_active); -+ - if (slab_want_init_on_free(s)) { - /* - * Clear the object and the metadata, but don't touch -@@ -1451,8 +1497,12 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, - : 0; - memset((char *)object + s->inuse, 0, - s->size - s->inuse - rsize); -- -+ if (!IS_ENABLED(CONFIG_SLAB_SANITIZE_VERIFY) && s->ctor) -+ s->ctor(object); - } -+ -+ set_canary(s, object, s->random_inactive); -+ - /* If object's reuse doesn't have to be delayed */ - if (!slab_free_hook(s, object)) { - /* Move object to the new freelist */ -@@ -1460,6 +1510,17 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, - *head = object; - if (!*tail) - *tail = object; -+ } else if (slab_want_init_on_free(s) && s->ctor) { -+ /* Objects that are put into quarantine by KASAN will -+ * still undergo free_consistency_checks() and thus -+ * need to show a valid freepointer to check_object(). -+ * -+ * Note that doing this for all caches (not just ctor -+ * ones, which have s->offset != NULL)) causes a GPF, -+ * due to KASAN poisoning and the way set_freepointer() -+ * eventually dereferences the freepointer. 
-+ */ -+ set_freepointer(s, object, NULL); - } - } while (object != old_tail); - -@@ -1473,8 +1534,9 @@ static void *setup_object(struct kmem_cache *s, struct page *page, - void *object) - { - setup_object_debug(s, page, object); -+ set_canary(s, object, s->random_inactive); - object = kasan_init_slab_obj(s, object); -- if (unlikely(s->ctor)) { -+ if (unlikely(s->ctor) && !has_sanitize_verify(s)) { - kasan_unpoison_object_data(s, object); - s->ctor(object); - kasan_poison_object_data(s, object); -@@ -2752,8 +2814,28 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, - - maybe_wipe_obj_freeptr(s, object); - -- if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object) -+ if (has_sanitize_verify(s) && object) { -+ /* KASAN hasn't unpoisoned the object yet (this is done in the -+ * post-alloc hook), so let's do it temporarily. -+ */ -+ kasan_unpoison_object_data(s, object); -+ BUG_ON(memchr_inv(object, 0, s->object_size)); -+ if (s->ctor) -+ s->ctor(object); -+ kasan_poison_object_data(s, object); -+ } else if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object) { - memset(object, 0, s->object_size); -+ if (s->ctor) { -+ kasan_unpoison_object_data(s, object); -+ s->ctor(object); -+ kasan_poison_object_data(s, object); -+ } -+ } -+ -+ if (object) { -+ check_canary(s, object, s->random_inactive); -+ set_canary(s, object, s->random_active); -+ } - - slab_post_alloc_hook(s, gfpflags, 1, &object); - -@@ -3136,7 +3218,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, - void **p) - { - struct kmem_cache_cpu *c; -- int i; -+ int i, k; - - /* memcg and kmem_cache debug support */ - s = slab_pre_alloc_hook(s, flags); -@@ -3176,11 +3258,35 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, - local_irq_enable(); - - /* Clear memory outside IRQ disabled fastpath loop */ -- if (unlikely(slab_want_init_on_alloc(flags, s))) { -+ if (has_sanitize_verify(s)) { - int j; - -- for (j = 0; j < i; j++) -+ for 
(j = 0; j < i; j++) { -+ /* KASAN hasn't unpoisoned the object yet (this is done -+ * in the post-alloc hook), so let's do it temporarily. -+ */ -+ kasan_unpoison_object_data(s, p[j]); -+ BUG_ON(memchr_inv(p[j], 0, s->object_size)); -+ if (s->ctor) -+ s->ctor(p[j]); -+ kasan_poison_object_data(s, p[j]); -+ } -+ } else if (unlikely(slab_want_init_on_alloc(flags, s))) { -+ int j; -+ -+ for (j = 0; j < i; j++) { - memset(p[j], 0, s->object_size); -+ if (s->ctor) { -+ kasan_unpoison_object_data(s, p[j]); -+ s->ctor(p[j]); -+ kasan_poison_object_data(s, p[j]); -+ } -+ } -+ } -+ -+ for (k = 0; k < i; k++) { -+ check_canary(s, p[k], s->random_inactive); -+ set_canary(s, p[k], s->random_active); - } - - /* memcg and kmem_cache debug support */ -@@ -3214,9 +3320,9 @@ EXPORT_SYMBOL(kmem_cache_alloc_bulk); - * and increases the number of allocations possible without having to - * take the list_lock. - */ --static unsigned int slub_min_order; --static unsigned int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; --static unsigned int slub_min_objects; -+static unsigned int slub_min_order __ro_after_init; -+static unsigned int slub_max_order __ro_after_init = PAGE_ALLOC_COSTLY_ORDER; -+static unsigned int slub_min_objects __ro_after_init; - - /* - * Calculate the order of allocation given an slab object size. 
-@@ -3384,6 +3490,7 @@ static void early_kmem_cache_node_alloc(int node) - init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); - init_tracking(kmem_cache_node, n); - #endif -+ set_canary(kmem_cache_node, n, kmem_cache_node->random_active); - n = kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node), - GFP_KERNEL); - page->freelist = get_freepointer(kmem_cache_node, n); -@@ -3544,6 +3651,9 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) - size += sizeof(void *); - } - -+ if (IS_ENABLED(CONFIG_SLAB_CANARY)) -+ size += sizeof(void *); -+ - #ifdef CONFIG_SLUB_DEBUG - if (flags & SLAB_STORE_USER) - /* -@@ -3616,6 +3726,10 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) - #ifdef CONFIG_SLAB_FREELIST_HARDENED - s->random = get_random_long(); - #endif -+#ifdef CONFIG_SLAB_CANARY -+ s->random_active = get_random_long(); -+ s->random_inactive = get_random_long(); -+#endif - - if (!calculate_sizes(s, -1)) - goto error; -@@ -3891,6 +4005,8 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, - offset -= s->red_left_pad; - } - -+ check_canary(s, (void *)ptr - offset, s->random_active); -+ - /* Allow address range falling entirely within usercopy region. 
*/ - if (offset >= s->useroffset && - offset - s->useroffset <= s->usersize && -@@ -3924,7 +4040,11 @@ size_t __ksize(const void *object) - page = virt_to_head_page(object); - - if (unlikely(!PageSlab(page))) { -+#ifdef CONFIG_BUG_ON_DATA_CORRUPTION -+ BUG_ON(!PageCompound(page)); -+#else - WARN_ON(!PageCompound(page)); -+#endif - return page_size(page); - } - -@@ -4769,7 +4889,7 @@ enum slab_stat_type { - #define SO_TOTAL (1 << SL_TOTAL) - - #ifdef CONFIG_MEMCG --static bool memcg_sysfs_enabled = IS_ENABLED(CONFIG_SLUB_MEMCG_SYSFS_ON); -+static bool memcg_sysfs_enabled __ro_after_init = IS_ENABLED(CONFIG_SLUB_MEMCG_SYSFS_ON); - - static int __init setup_slub_memcg_sysfs(char *str) - { -diff --git a/mm/swap.c b/mm/swap.c -index 38c3fa4308e2..0534c2e348c2 100644 ---- a/mm/swap.c -+++ b/mm/swap.c -@@ -94,6 +94,13 @@ static void __put_compound_page(struct page *page) - if (!PageHuge(page)) - __page_cache_release(page); - dtor = get_compound_page_dtor(page); -+ if (!PageHuge(page)) -+ BUG_ON(dtor != free_compound_page -+#ifdef CONFIG_TRANSPARENT_HUGEPAGE -+ && dtor != free_transhuge_page -+#endif -+ ); -+ - (*dtor)(page); - } - -diff --git a/mm/util.c b/mm/util.c -index 3ad6db9a722e..80209685f67c 100644 ---- a/mm/util.c -+++ b/mm/util.c -@@ -325,9 +325,9 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) - { - /* Is the current task 32bit ? 
*/ - if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task()) -- return randomize_page(mm->brk, SZ_32M); -+ return mm->brk + get_random_long() % SZ_32M + PAGE_SIZE; - -- return randomize_page(mm->brk, SZ_1G); -+ return mm->brk + get_random_long() % SZ_1G + PAGE_SIZE; - } - - unsigned long arch_mmap_rnd(void) -diff --git a/net/core/dev.c b/net/core/dev.c -index 3098c90d60e2..08de516adfd5 100644 ---- a/net/core/dev.c -+++ b/net/core/dev.c -@@ -4492,7 +4492,7 @@ int netif_rx_ni(struct sk_buff *skb) - } - EXPORT_SYMBOL(netif_rx_ni); - --static __latent_entropy void net_tx_action(struct softirq_action *h) -+static __latent_entropy void net_tx_action(void) - { - struct softnet_data *sd = this_cpu_ptr(&softnet_data); - -@@ -6353,7 +6353,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) - return work; - } - --static __latent_entropy void net_rx_action(struct softirq_action *h) -+static __latent_entropy void net_rx_action(void) - { - struct softnet_data *sd = this_cpu_ptr(&softnet_data); - unsigned long time_limit = jiffies + -diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig -index 03381f3e12ba..8ea409f37436 100644 ---- a/net/ipv4/Kconfig -+++ b/net/ipv4/Kconfig -@@ -267,6 +267,7 @@ config IP_PIMSM_V2 - - config SYN_COOKIES - bool "IP: TCP syncookie support" -+ default y - ---help--- - Normal TCP/IP networking is open to an attack known as "SYN - flooding". 
This denial-of-service attack prevents legitimate remote -diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost -index 952fff485546..59ffccdb1be4 100644 ---- a/scripts/Makefile.modpost -+++ b/scripts/Makefile.modpost -@@ -54,6 +54,7 @@ MODPOST = scripts/mod/modpost \ - $(if $(KBUILD_EXTMOD),$(addprefix -e ,$(KBUILD_EXTRA_SYMBOLS))) \ - $(if $(KBUILD_EXTMOD),-o $(modulesymfile)) \ - $(if $(CONFIG_SECTION_MISMATCH_WARN_ONLY),,-E) \ -+ $(if $(CONFIG_DEBUG_WRITABLE_FUNCTION_POINTERS_VERBOSE),-f) \ - $(if $(KBUILD_MODPOST_WARN),-w) \ - $(if $(filter nsdeps,$(MAKECMDGOALS)),-d) - -diff --git a/scripts/gcc-plugins/Kconfig b/scripts/gcc-plugins/Kconfig -index e3569543bdac..55cc439b3bc6 100644 ---- a/scripts/gcc-plugins/Kconfig -+++ b/scripts/gcc-plugins/Kconfig -@@ -61,6 +61,11 @@ config GCC_PLUGIN_LATENT_ENTROPY - is some slowdown of the boot process (about 0.5%) and fork and - irq processing. - -+ When extra_latent_entropy is passed on the kernel command line, -+ entropy will be extracted from up to the first 4GB of RAM while the -+ runtime memory allocator is being initialized. This costs even more -+ slowdown of the boot process. -+ - Note that entropy extracted this way is not cryptographically - secure! 
- -diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c -index d2a30a7b3f07..ff57a5fe8029 100644 ---- a/scripts/mod/modpost.c -+++ b/scripts/mod/modpost.c -@@ -36,6 +36,8 @@ static int warn_unresolved = 0; - /* How a symbol is exported */ - static int sec_mismatch_count = 0; - static int sec_mismatch_fatal = 0; -+static int writable_fptr_count = 0; -+static int writable_fptr_verbose = 0; - /* ignore missing files */ - static int ignore_missing_files; - /* write namespace dependencies */ -@@ -1019,6 +1021,7 @@ enum mismatch { - ANY_EXIT_TO_ANY_INIT, - EXPORT_TO_INIT_EXIT, - EXTABLE_TO_NON_TEXT, -+ DATA_TO_TEXT - }; - - /** -@@ -1145,6 +1148,12 @@ static const struct sectioncheck sectioncheck[] = { - .good_tosec = {ALL_TEXT_SECTIONS , NULL}, - .mismatch = EXTABLE_TO_NON_TEXT, - .handler = extable_mismatch_handler, -+}, -+/* Do not reference code from writable data */ -+{ -+ .fromsec = { DATA_SECTIONS, NULL }, -+ .bad_tosec = { ALL_TEXT_SECTIONS, NULL }, -+ .mismatch = DATA_TO_TEXT - } - }; - -@@ -1332,10 +1341,10 @@ static Elf_Sym *find_elf_symbol(struct elf_info *elf, Elf64_Sword addr, - continue; - if (!is_valid_name(elf, sym)) - continue; -- if (sym->st_value == addr) -- return sym; - /* Find a symbol nearby - addr are maybe negative */ - d = sym->st_value - addr; -+ if (d == 0) -+ return sym; - if (d < 0) - d = addr - sym->st_value; - if (d < distance) { -@@ -1470,7 +1479,13 @@ static void report_sec_mismatch(const char *modname, - char *prl_from; - char *prl_to; - -- sec_mismatch_count++; -+ if (mismatch->mismatch == DATA_TO_TEXT) { -+ writable_fptr_count++; -+ if (!writable_fptr_verbose) -+ return; -+ } else { -+ sec_mismatch_count++; -+ } - - get_pretty_name(from_is_func, &from, &from_p); - get_pretty_name(to_is_func, &to, &to_p); -@@ -1592,6 +1607,12 @@ static void report_sec_mismatch(const char *modname, - fatal("There's a special handler for this mismatch type, " - "we should never get here."); - break; -+ case DATA_TO_TEXT: -+ fprintf(stderr, -+ "The 
%s %s:%s references\n" -+ "the %s %s:%s%s\n", -+ from, fromsec, fromsym, to, tosec, tosym, to_p); -+ break; - } - fprintf(stderr, "\n"); - } -@@ -2569,7 +2590,7 @@ int main(int argc, char **argv) - struct ext_sym_list *extsym_iter; - struct ext_sym_list *extsym_start = NULL; - -- while ((opt = getopt(argc, argv, "i:I:e:mnsT:o:awEd")) != -1) { -+ while ((opt = getopt(argc, argv, "i:I:e:fmnsT:o:awEd")) != -1) { - switch (opt) { - case 'i': - kernel_read = optarg; -@@ -2586,6 +2607,9 @@ int main(int argc, char **argv) - extsym_iter->file = optarg; - extsym_start = extsym_iter; - break; -+ case 'f': -+ writable_fptr_verbose = 1; -+ break; - case 'm': - modversions = 1; - break; -@@ -2692,6 +2716,11 @@ int main(int argc, char **argv) - } - - free(buf.p); -+ if (writable_fptr_count && !writable_fptr_verbose) -+ warn("modpost: Found %d writable function pointer%s.\n" -+ "To see full details build your kernel with:\n" -+ "'make CONFIG_DEBUG_WRITABLE_FUNCTION_POINTERS_VERBOSE=y'\n", -+ writable_fptr_count, (writable_fptr_count == 1 ? "" : "s")); - - return err; - } -diff --git a/security/Kconfig b/security/Kconfig -index 2a1a2d396228..3b7a71410f88 100644 ---- a/security/Kconfig -+++ b/security/Kconfig -@@ -9,7 +9,7 @@ source "security/keys/Kconfig" - - config SECURITY_DMESG_RESTRICT - bool "Restrict unprivileged access to the kernel syslog" -- default n -+ default y - help - This enforces restrictions on unprivileged users reading the kernel - syslog via dmesg(8). -@@ -19,10 +19,34 @@ config SECURITY_DMESG_RESTRICT - - If you are unsure how to answer this question, answer N. - -+config SECURITY_PERF_EVENTS_RESTRICT -+ bool "Restrict unprivileged use of performance events" -+ depends on PERF_EVENTS -+ default y -+ help -+ If you say Y here, the kernel.perf_event_paranoid sysctl -+ will be set to 3 by default, and no unprivileged use of the -+ perf_event_open syscall will be permitted unless it is -+ changed. 
-+ -+config SECURITY_TIOCSTI_RESTRICT -+ bool "Restrict unprivileged use of tiocsti command injection" -+ default y -+ help -+ This enforces restrictions on unprivileged users injecting commands -+ into other processes which share a tty session using the TIOCSTI -+ ioctl. This option makes TIOCSTI use require CAP_SYS_ADMIN. -+ -+ If this option is not selected, no restrictions will be enforced -+ unless the tiocsti_restrict sysctl is explicitly set to (1). -+ -+ If you are unsure how to answer this question, answer N. -+ - config SECURITY - bool "Enable different security models" - depends on SYSFS - depends on MULTIUSER -+ default y - help - This allows you to choose different security modules to be - configured into your kernel. -@@ -48,6 +72,7 @@ config SECURITYFS - config SECURITY_NETWORK - bool "Socket and Networking Security Hooks" - depends on SECURITY -+ default y - help - This enables the socket and networking security hooks. - If enabled, a security module can use these hooks to -@@ -154,6 +179,7 @@ config HARDENED_USERCOPY - bool "Harden memory copies between kernel and userspace" - depends on HAVE_HARDENED_USERCOPY_ALLOCATOR - imply STRICT_DEVMEM -+ default y - help - This option checks for obviously wrong memory regions when - copying memory to/from the kernel (via copy_to_user() and -@@ -166,7 +192,6 @@ config HARDENED_USERCOPY - config HARDENED_USERCOPY_FALLBACK - bool "Allow usercopy whitelist violations to fallback to object size" - depends on HARDENED_USERCOPY -- default y - help - This is a temporary option that allows missing usercopy whitelists - to be discovered via a WARN() to the kernel log, instead of -@@ -191,10 +216,21 @@ config HARDENED_USERCOPY_PAGESPAN - config FORTIFY_SOURCE - bool "Harden common str/mem functions against buffer overflows" - depends on ARCH_HAS_FORTIFY_SOURCE -+ default y - help - Detect overflows of buffers in common string and memory functions - where the compiler can determine and validate the buffer sizes. 
- -+config FORTIFY_SOURCE_STRICT_STRING -+ bool "Harden common functions against buffer overflows" -+ depends on FORTIFY_SOURCE -+ depends on EXPERT -+ help -+ Perform stricter overflow checks catching overflows within objects -+ for common C string functions rather than only between objects. -+ -+ This is not yet intended for production use, only bug finding. -+ - config STATIC_USERMODEHELPER - bool "Force all usermode helper calls through a single binary" - help -diff --git a/security/Kconfig.hardening b/security/Kconfig.hardening -index af4c979b38ee..473e40bb8537 100644 ---- a/security/Kconfig.hardening -+++ b/security/Kconfig.hardening -@@ -169,6 +169,7 @@ config STACKLEAK_RUNTIME_DISABLE - - config INIT_ON_ALLOC_DEFAULT_ON - bool "Enable heap memory zeroing on allocation by default" -+ default yes - help - This has the effect of setting "init_on_alloc=1" on the kernel - command line. This can be disabled with "init_on_alloc=0". -@@ -181,6 +182,7 @@ config INIT_ON_ALLOC_DEFAULT_ON - - config INIT_ON_FREE_DEFAULT_ON - bool "Enable heap memory zeroing on free by default" -+ default yes - help - This has the effect of setting "init_on_free=1" on the kernel - command line. This can be disabled with "init_on_free=0". -@@ -196,6 +198,20 @@ config INIT_ON_FREE_DEFAULT_ON - touching "cold" memory areas. Most cases see 3-5% impact. Some - synthetic workloads have measured as high as 8%. - -+config PAGE_SANITIZE_VERIFY -+ bool "Verify sanitized pages" -+ default y -+ help -+ When init_on_free is enabled, verify that newly allocated pages -+ are zeroed to detect write-after-free bugs. -+ -+config SLAB_SANITIZE_VERIFY -+ default y -+ bool "Verify sanitized SLAB allocations" -+ help -+ When init_on_free is enabled, verify that newly allocated slab -+ objects are zeroed to detect write-after-free bugs. 
-+ - endmenu - - endmenu -diff --git a/security/selinux/Kconfig b/security/selinux/Kconfig -index 5711689deb6a..fab0cb896907 100644 ---- a/security/selinux/Kconfig -+++ b/security/selinux/Kconfig -@@ -3,7 +3,7 @@ config SECURITY_SELINUX - bool "NSA SELinux Support" - depends on SECURITY_NETWORK && AUDIT && NET && INET - select NETWORK_SECMARK -- default n -+ default y - help - This selects NSA Security-Enhanced Linux (SELinux). - You will also need a policy configuration and a labeled filesystem. -@@ -65,23 +65,3 @@ config SECURITY_SELINUX_AVC_STATS - This option collects access vector cache statistics to - /selinux/avc/cache_stats, which may be monitored via - tools such as avcstat. -- --config SECURITY_SELINUX_CHECKREQPROT_VALUE -- int "NSA SELinux checkreqprot default value" -- depends on SECURITY_SELINUX -- range 0 1 -- default 0 -- help -- This option sets the default value for the 'checkreqprot' flag -- that determines whether SELinux checks the protection requested -- by the application or the protection that will be applied by the -- kernel (including any implied execute for read-implies-exec) for -- mmap and mprotect calls. If this option is set to 0 (zero), -- SELinux will default to checking the protection that will be applied -- by the kernel. If this option is set to 1 (one), SELinux will -- default to checking the protection requested by the application. -- The checkreqprot flag may be changed from the default via the -- 'checkreqprot=' boot parameter. It may also be changed at runtime -- via /selinux/checkreqprot if authorized by policy. -- -- If you are unsure how to answer this question, answer 0. 
-diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c -index 9625b99e677f..daa40da7a8f9 100644 ---- a/security/selinux/hooks.c -+++ b/security/selinux/hooks.c -@@ -135,18 +135,7 @@ static int __init selinux_enabled_setup(char *str) - __setup("selinux=", selinux_enabled_setup); - #endif - --static unsigned int selinux_checkreqprot_boot = -- CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE; -- --static int __init checkreqprot_setup(char *str) --{ -- unsigned long checkreqprot; -- -- if (!kstrtoul(str, 0, &checkreqprot)) -- selinux_checkreqprot_boot = checkreqprot ? 1 : 0; -- return 1; --} --__setup("checkreqprot=", checkreqprot_setup); -+static const unsigned int selinux_checkreqprot_boot; - - /** - * selinux_secmark_enabled - Check to see if SECMARK is currently enabled -diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c -index e6c7643c3fc0..0e8217f72c5a 100644 ---- a/security/selinux/selinuxfs.c -+++ b/security/selinux/selinuxfs.c -@@ -639,7 +639,6 @@ static ssize_t sel_read_checkreqprot(struct file *filp, char __user *buf, - static ssize_t sel_write_checkreqprot(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) - { -- struct selinux_fs_info *fsi = file_inode(file)->i_sb->s_fs_info; - char *page; - ssize_t length; - unsigned int new_value; -@@ -663,10 +662,9 @@ static ssize_t sel_write_checkreqprot(struct file *file, const char __user *buf, - return PTR_ERR(page); - - length = -EINVAL; -- if (sscanf(page, "%u", &new_value) != 1) -+ if (sscanf(page, "%u", &new_value) != 1 || new_value) - goto out; - -- fsi->state->checkreqprot = new_value ? 
1 : 0; - length = count; - out: - kfree(page); -diff --git a/security/yama/Kconfig b/security/yama/Kconfig -index a810304123ca..b809050b25d2 100644 ---- a/security/yama/Kconfig -+++ b/security/yama/Kconfig -@@ -2,7 +2,7 @@ - config SECURITY_YAMA - bool "Yama support" - depends on SECURITY -- default n -+ default y - help - This selects Yama, which extends DAC support with additional - system-wide security settings beyond regular Linux discretionary diff --git a/linux57-tkg/PKGBUILD b/linux57-tkg/PKGBUILD deleted file mode 100644 index a4b9696..0000000 --- a/linux57-tkg/PKGBUILD +++ /dev/null @@ -1,282 +0,0 @@ -# Based on the file created for Arch Linux by: -# Tobias Powalowski -# Thomas Baechler - -# Contributor: Tk-Glitch - -plain ' .---.` `.---.' -plain ' `/syhhhyso- -osyhhhys/`' -plain ' .syNMdhNNhss/``.---.``/sshNNhdMNys.' -plain ' +sdMh.`+MNsssssssssssssssNM+`.hMds+' -plain ' :syNNdhNNhssssssssssssssshNNhdNNys:' -plain ' /ssyhhhysssssssssssssssssyhhhyss/' -plain ' .ossssssssssssssssssssssssssssso.' 
-plain ' :sssssssssssssssssssssssssssssssss:' -plain ' /sssssssssssssssssssssssssssssssssss/' -plain ' :sssssssssssssoosssssssoosssssssssssss:' -plain ' osssssssssssssoosssssssoossssssssssssso' -plain ' osssssssssssyyyyhhhhhhhyyyyssssssssssso' -plain ' /yyyyyyhhdmmmmNNNNNNNNNNNmmmmdhhyyyyyy/' -plain ' smmmNNNNNNNNNNNNNNNNNNNNNNNNNNNNNmmms' -plain ' /dNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNd/' -plain ' `:sdNNNNNNNNNNNNNNNNNNNNNNNNNds:`' -plain ' `-+shdNNNNNNNNNNNNNNNdhs+-`' -plain ' `.-:///////:-.`' - -_where="$PWD" # track basedir as different Arch based distros are moving srcdir around - -source "$_where"/customization.cfg # load default configuration from file -source "$_where"/linux*-tkg-config/prepare - -_tkg_initscript - -_distro="Arch" - -if [ -n "$_custom_pkgbase" ]; then - pkgbase="${_custom_pkgbase}" -else - pkgbase=linux"${_basever}"-tkg-"${_cpusched}" -fi -pkgname=("${pkgbase}" "${pkgbase}-headers") -pkgver="${_basekernel}"."${_sub}" -pkgrel=33 -pkgdesc='Linux-tkg' -arch=('x86_64') # no i686 in here -url="http://www.kernel.org/" -license=('GPL2') -makedepends=('xmlto' 'docbook-xsl' 'kmod' 'inetutils' 'bc' 'libelf' 'pahole' 'patchutils' 'flex' 'python-sphinx' 'python-sphinx_rtd_theme' 'graphviz' 'imagemagick' 'git') -optdepends=('schedtool') -options=('!strip' 'docs') -source=("https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-${_basekernel}.tar.xz" - "https://cdn.kernel.org/pub/linux/kernel/v5.x/patch-${pkgver}.xz" - "https://raw.githubusercontent.com/graysky2/kernel_gcc_patch/master/enable_additional_cpu_optimizations_for_gcc_v10.1%2B_kernel_v5.7%2B.patch" - 'config.x86_64' # stock Arch config - 'config_hardened.x86_64' # hardened Arch config - 90-cleanup.hook - cleanup - # ARCH Patches - 0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch - # TkG - 0002-clear-patches.patch - 0003-glitched-base.patch - 0003-glitched-cfs.patch - 0004-glitched-ondemand-muqss.patch - 0004-glitched-muqss.patch - 0004-5.7-ck1.patch - 0005-glitched-ondemand-pds.patch - 
0005-glitched-pds.patch - 0005-v5.7_undead-pds099o.patch - 0006-add-acs-overrides_iommu.patch - 0007-v5.7-fsync.patch - 0008-5.7-bcachefs.patch - 0009-glitched-ondemand-bmq.patch - 0009-glitched-bmq.patch - 0009-prjc_v5.7-r3.patch - 0011-ZFS-fix.patch - 0012-linux-hardened.patch - 0012-misc-additions.patch -) -sha256sums=('de8163bb62f822d84f7a3983574ec460060bf013a78ff79cd7c979ff1ec1d7e0' - '66a0173a13cd58015f5bf1b14f67bfa15dc1db5d8e7225fcd95ac2e9a5341653' - '1f56a2466bd9b4477925682d8f944fabb38727140e246733214fe50aa326fc47' - '6313ccad7f8e4d8ce09dd5bdb51b8dfa124d0034d7097ba47008380a14a84f09' - '15ce09447b7e9b28425c1df5961c955378f2829e4115037337eef347b1db3d9d' - '1e15fc2ef3fa770217ecc63a220e5df2ddbcf3295eb4a021171e7edd4c6cc898' - '66a03c246037451a77b4d448565b1d7e9368270c7d02872fbd0b5d024ed0a997' - '31dc68e84aecfb7d069efb1305049122c65694676be8b955634abcf0675922a2' - 'd02bf5ca08fd610394b9d3a0c3b176d74af206f897dee826e5cbaec97bb4a4aa' - 'bbf332201423888257c9687bee06916a5dbbac2194f9df5b4126100c40e48d16' - '7058e57fd68367b029adc77f2a82928f1433daaf02c8c279cb2d13556c8804d7' - 'c605f638d74c61861ebdc36ebd4cb8b6475eae2f6273e1ccb2bbb3e10a2ec3fe' - 'bc69d6e5ee8172b0242c8fa72d13cfe2b8d2b6601468836908a7dfe8b78a3bbb' - '8d8aec86e34dbec6cc3a47f2cd55dc9212e95d36b6cd34d6e637c66731e7d838' - '62496f9ca788996181ef145f96ad26291282fcc3fb95cdc04080dcf84365be33' - '7fd8e776209dac98627453fda754bdf9aff4a09f27cb0b3766d7983612eb3c74' - '55be5e4c6254da0a9d34bbfac807a70d8b58b3f7b2ec852026195c4db5e263e2' - '19661ec0d39f9663452b34433214c755179894528bf73a42f6ba52ccf572832a' - 'cd225e86d72eaf6c31ef3d7b20df397f4cc44ddd04389850691292cdf292b204' - 'd2214504c43f9d297a8ef68dffc198143bfebf85614b71637a71978d7a86bd78' - '9fad4a40449e09522899955762c8928ae17f4cdaa16e01239fd12592e9d58177' - '965a517a283f265a012545fbb5cc9e516efc9f6166d2aa1baf7293a32a1086b7' - 'b2a2ae866fc3f1093f67e69ba59738827e336b8f800fb0487599127f7f3ef881' - '49262ce4a8089fa70275aad742fc914baa28d9c384f710c9a62f64796d13e104' - 
'6821f92bd2bde3a3938d17b070d70f18a2f33cae81647567b5a4d94c9cd75f3d' - 'bdc60c83cd5fbf9912f9201d6e4fe3c84fe5f634e6823bd8e78264ad606b3a9e') - -export KBUILD_BUILD_HOST=archlinux -export KBUILD_BUILD_USER=$pkgbase -export KBUILD_BUILD_TIMESTAMP="$(date -Ru${SOURCE_DATE_EPOCH:+d @$SOURCE_DATE_EPOCH})" - -prepare() { - rm -rf $pkgdir # Nuke the entire pkg folder so it'll get regenerated clean on next build - - ln -s "${_where}/customization.cfg" "${srcdir}" # workaround - - cd "${srcdir}/linux-${_basekernel}" - - _tkg_srcprep -} - -build() { - cd "${srcdir}/linux-${_basekernel}" - - # Use custom compiler paths if defined - if [ -n "${CUSTOM_GCC_PATH}" ]; then - PATH=${CUSTOM_GCC_PATH}/bin:${CUSTOM_GCC_PATH}/lib:${CUSTOM_GCC_PATH}/include:${PATH} - fi - - if [ "$_force_all_threads" = "true" ]; then - _force_all_threads="-j$((`nproc`*2))" - else - _force_all_threads="${MAKEFLAGS}" - fi - - # ccache - if [ "$_noccache" != "true" ] && pacman -Qq ccache &> /dev/null; then - export PATH="/usr/lib/ccache/bin/:$PATH" - export CCACHE_SLOPPINESS="file_macro,locale,time_macros" - export CCACHE_NOHASHDIR="true" - msg2 'ccache was found and will be used' - fi - - # document the TkG variables, excluding "_", "_EXT_CONFIG_PATH", and "_where". - declare -p | cut -d ' ' -f 3 | grep -P '^_(?!=|EXT_CONFIG_PATH|where)' > "${srcdir}/customization-full.cfg" - - # build! - _runtime=$( time ( schedtool -B -n 1 -e ionice -n 1 make ${_force_all_threads} LOCALVERSION= bzImage modules 2>&1 ) 3>&1 1>&2 2>&3 ) || _runtime=$( time ( make ${_force_all_threads} LOCALVERSION= bzImage modules 2>&1 ) 3>&1 1>&2 2>&3 ) -} - -hackbase() { - pkgdesc="The $pkgdesc kernel and modules" - depends=('coreutils' 'kmod' 'initramfs') - optdepends=('linux-docs: Kernel hackers manual - HTML documentation that comes with the Linux kernel.' - 'crda: to set the correct wireless channels of your country.' 
- 'linux-firmware: Firmware files for Linux' - 'modprobed-db: Keeps track of EVERY kernel module that has ever been probed. Useful for make localmodconfig.' - 'nvidia-tkg: NVIDIA drivers for all installed kernels - non-dkms version.' - 'nvidia-dkms-tkg: NVIDIA drivers for all installed kernels - dkms version.' - 'update-grub: Simple wrapper around grub-mkconfig.') - provides=("linux=${pkgver}" "${pkgbase}" VIRTUALBOX-GUEST-MODULES WIREGUARD-MODULE) - replaces=(virtualbox-guest-modules-arch wireguard-arch) - - cd "${srcdir}/linux-${_basekernel}" - - # get kernel version - local _kernver="$(\033[1;0m \033[1;1m$1\033[1;0m" >&2 -} - -error() { - echo -e " \033[1;31m==> ERROR: $1\033[1;0m" >&2 -} - -warning() { - echo -e " \033[1;33m==> WARNING: $1\033[1;0m" >&2 -} - -plain() { - echo "$1" >&2 -} - -# Stop the script at any ecountered error -set -e - -_where=`pwd` -srcdir="$_where" - -source linux*-tkg-config/prepare - -_cpu_opt_patch_link="https://raw.githubusercontent.com/graysky2/kernel_gcc_patch/master/enable_additional_cpu_optimizations_for_gcc_v10.1%2B_kernel_v${_basekernel}%2B.patch" - -source customization.cfg - -if [ "$1" != "install" ] && [ "$1" != "config" ] && [ "$1" != "uninstall-help" ]; then - msg2 "Argument not recognised, options are: - - config : shallow clones the linux ${_basekernel}.x git tree into the folder linux-${_basekernel}, then applies on it the extra patches and prepares the .config file - by copying the one from the current linux system in /boot/config-`uname -r` and updates it. - - install : [RPM and DEB based distros only], does the config step, proceeds to compile, then prompts to install - - uninstall-help : [RPM and DEB based distros only], lists the installed kernels in this system, then gives a hint on how to uninstall them manually." - exit 0 -fi - -# Load external configuration file if present. Available variable values will overwrite customization.cfg ones. 
-if [ -e "$_EXT_CONFIG_PATH" ]; then - msg2 "External configuration file $_EXT_CONFIG_PATH will be used and will override customization.cfg values." - source "$_EXT_CONFIG_PATH" -fi - -_misc_adds="false" # We currently don't want this enabled on non-Arch - -if [ "$1" = "install" ] || [ "$1" = "config" ]; then - - if [ -z $_distro ] && [ "$1" = "install" ]; then - while true; do - echo "Which linux distribution are you running ?" - echo "if it's not on the list, chose the closest one to it: Fedora/Suse for RPM, Ubuntu/Debian for DEB" - echo " 1) Debian" - echo " 2) Fedora" - echo " 3) Suse" - echo " 4) Ubuntu" - read -p "[1-4]: " _distro_index - - if [ "$_distro_index" = "1" ]; then - _distro="Debian" - break - elif [ "$_distro_index" = "2" ]; then - _distro="Fedora" - break - elif [ "$_distro_index" = "3" ]; then - _distro="Suse" - break - elif [ "$_distro_index" = "4" ]; then - _distro="Ubuntu" - break - else - echo "Wrong index." - fi - done - fi - - if [[ $1 = "install" && "$_distro" != "Ubuntu" && "$_distro" != "Debian" && "$_distro" != "Fedora" && "$_distro" != "Suse" ]]; then - msg2 "Variable \"_distro\" in \"customization.cfg\" hasn't been set to \"Ubuntu\", \"Debian\", \"Fedora\" or \"Suse\"" - msg2 "This script can only install custom kernels for RPM and DEB based distros, though only those keywords are permitted. Exiting..." 
- exit 0 - fi - - if [ "$_distro" = "Ubuntu" ] || [ "$_distro" = "Debian" ]; then - msg2 "Installing dependencies" - sudo apt install git build-essential kernel-package fakeroot libncurses5-dev libssl-dev ccache bison flex qtbase5-dev -y - elif [ "$_distro" = "Fedora" ]; then - msg2 "Installing dependencies" - sudo dnf install fedpkg fedora-packager rpmdevtools ncurses-devel pesign grubby qt5-devel libXi-devel gcc-c++ git ccache flex bison elfutils-libelf-devel openssl-devel dwarves rpm-build -y - elif [ "$_distro" = "Suse" ]; then - msg2 "Installing dependencies" - sudo zypper install -y rpmdevtools ncurses-devel pesign libXi-devel gcc-c++ git ccache flex bison elfutils libelf-devel openssl-devel dwarves make patch bc rpm-build libqt5-qtbase-common-devel libqt5-qtbase-devel lz4 - fi - - # Force prepare script to avoid Arch specific commands if the user is using `config` - if [ "$1" = "config" ]; then - _distro="" - fi - - if [ -d linux-${_basekernel}.orig ]; then - rm -rf linux-${_basekernel}.orig - fi - - if [ -d linux-${_basekernel} ]; then - msg2 "Reseting files in linux-$_basekernel to their original state and getting latest updates" - cd "$_where"/linux-${_basekernel} - git checkout --force linux-$_basekernel.y - git clean -f -d -x - git pull - msg2 "Done" - cd "$_where" - else - msg2 "Shallow git cloning linux $_basekernel" - git clone --branch linux-$_basekernel.y --single-branch --depth=1 https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git linux-${_basekernel} - msg2 "Done" - fi - - # Define current kernel subversion - if [ -z $_kernel_subver ]; then - cd "$_where"/linux-${_basekernel} - _kernelverstr=`git describe` - _kernel_subver=${_kernelverstr:5} - cd "$_where" - fi - - - # Run init script that is also run in PKGBUILD, it will define some env vars that we will use - _tkg_initscript - - cd "$_where" - msg2 "Downloading Graysky2's CPU optimisations patch" - wget "$_cpu_opt_patch_link" - - # Follow Ubuntu install isntructions in 
https://wiki.ubuntu.com/KernelTeam/GitKernelBuild - - # cd in linux folder, copy Ubuntu's current config file, update with new params - cd "$_where"/linux-${_basekernel} - - msg2 "Copying current kernel's config and running make oldconfig..." - cp /boot/config-`uname -r` .config - if [ "$_distro" = "Debian" ]; then #Help Debian cert problem. - sed -i -e 's#CONFIG_SYSTEM_TRUSTED_KEYS="debian/certs/test-signing-certs.pem"#CONFIG_SYSTEM_TRUSTED_KEYS=""#g' .config - sed -i -e 's#CONFIG_SYSTEM_TRUSTED_KEYS="debian/certs/debian-uefi-certs.pem"#CONFIG_SYSTEM_TRUSTED_KEYS=""#g' .config - fi - yes '' | make oldconfig - msg2 "Done" - - # apply linux-tkg patching script - _tkg_srcprep - - msg2 "Configuration done." -fi - -if [ "$1" = "install" ]; then - - # Use custom compiler paths if defined - if [ -n "${CUSTOM_GCC_PATH}" ]; then - PATH=${CUSTOM_GCC_PATH}/bin:${CUSTOM_GCC_PATH}/lib:${CUSTOM_GCC_PATH}/include:${PATH} - fi - - if [ "$_force_all_threads" = "true" ]; then - _thread_num=`nproc` - else - _thread_num=`expr \`nproc\` / 4` - if [ "$_thread_num" = "0" ]; then - _thread_num=1 - fi - fi - - # ccache - if [ "$_noccache" != "true" ]; then - - if [ "$_distro" = "Ubuntu" ] || [ "$_distro" = "Debian" ]; then - export PATH="/usr/lib/ccache/bin/:$PATH" - elif [ "$_distro" = "Fedora" ] || [ "$_distro" = "Suse" ]; then - export PATH="/usr/lib64/ccache/:$PATH" - fi - - export CCACHE_SLOPPINESS="file_macro,locale,time_macros" - export CCACHE_NOHASHDIR="true" - msg2 'ccache was found and will be used' - - fi - - if [ -z $_kernel_localversion ]; then - _kernel_flavor="tkg-${_cpusched}" - else - _kernel_flavor="tkg-${_kernel_localversion}" - fi - - if [ "$_distro" = "Ubuntu" ] || [ "$_distro" = "Debian" ]; then - - if make -j ${_thread_num} deb-pkg LOCALVERSION=-${_kernel_flavor}; then - msg2 "Building successfully finished!" 
- - cd "$_where" - - # Create DEBS folder if it doesn't exist - mkdir -p DEBS - - # Move rpm files to RPMS folder inside the linux-tkg folder - mv "$_where"/*.deb "$_where"/DEBS/ - - read -p "Do you want to install the new Kernel ? y/[n]: " _install - if [[ $_install =~ [yY] ]] || [ $_install = "yes" ] || [ $_install = "Yes" ]; then - cd "$_where" - _kernelname=$_basekernel.$_kernel_subver-$_kernel_flavor - _headers_deb="linux-headers-${_kernelname}*.deb" - _image_deb="linux-image-${_kernelname}_*.deb" - _kernel_devel_deb="linux-libc-dev_${_kernelname}*.deb" - - cd DEBS - sudo dpkg -i $_headers_deb $_image_deb $_kernel_devel_deb - fi - fi - - elif [[ "$_distro" = "Fedora" || "$_distro" = "Suse" ]]; then - - # Replace dashes with underscores, it seems that it's being done by binrpm-pkg - # Se we can actually refer properly to the rpm files. - _kernel_flavor=${_kernel_flavor//-/_} - - if make -j ${_thread_num} rpm-pkg EXTRAVERSION="_${_kernel_flavor}"; then - msg2 "Building successfully finished!" - - cd "$_where" - - # Create RPMS folder if it doesn't exist - mkdir -p RPMS - - # Move rpm files to RPMS folder inside the linux-tkg folder - mv ~/rpmbuild/RPMS/x86_64/* "$_where"/RPMS/ - - #Clean up the original folder, unneeded and takes a lot of space - rm -rf ~/rpmbuild/ - - read -p "Do you want to install the new Kernel ? 
y/[n]: " _install - if [ "$_install" = "y" ] || [ "$_install" = "Y" ] || [ "$_install" = "yes" ] || [ "$_install" = "Yes" ]; then - - _kernelname=$_basekernel.${_kernel_subver}_$_kernel_flavor - _headers_rpm="kernel-headers-${_kernelname}*.rpm" - _kernel_rpm="kernel-${_kernelname}*.rpm" - _kernel_devel_rpm="kernel-devel-${_kernelname}*.rpm" - - cd RPMS - if [ "$_distro" = "Fedora" ]; then - sudo dnf install $_headers_rpm $_kernel_rpm $_kernel_devel_rpm - elif [ "$_distro" = "Suse" ]; then - msg2 "Some files from 'linux-glibc-devel' will be replaced by files from the custom kernel-hearders package" - msg2 "To revert back to the original kernel headers do 'sudo zypper install -f linux-glibc-devel'" - sudo zypper install --replacefiles --allow-unsigned-rpm $_headers_rpm $_kernel_rpm $_kernel_devel_rpm - fi - - msg2 "Install successful" - fi - fi - fi -fi - -if [ "$1" = "uninstall-help" ]; then - - cd "$_where" - msg2 "List of installed custom tkg kernels: " - - if [ "$_distro" = "Ubuntu" ]; then - dpkg -l "*tkg*" | grep "linux.*tkg" - dpkg -l "*linux-libc-dev*" | grep "linux.*tkg" - msg2 "To uninstall a version, you should remove the linux-image, linux-headers and linux-libc-dev associated to it (if installed), with: " - msg2 " sudo apt remove linux-image-VERSION linux-headers-VERSION linux-libc-dev-VERSION" - msg2 " where VERSION is displayed in the lists above, uninstall only versions that have \"tkg\" in its name" - elif [ "$_distro" = "Fedora" ]; then - dnf list --installed kernel* - msg2 "To uninstall a version, you should remove the kernel, kernel-headers and kernel-devel associated to it (if installed), with: " - msg2 " sudo dnf remove --noautoremove kernel-VERSION kernel-devel-VERSION kernel-headers-VERSION" - msg2 " where VERSION is displayed in the second column" - elif [ "$_distro" = "Suse" ]; then - zypper packages --installed-only | grep "kernel.*tkg" - msg2 "To uninstall a version, you should remove the kernel, kernel-headers and kernel-devel associated 
to it (if installed), with: " - msg2 " sudo zypper remove --no-clean-deps kernel-VERSION kernel-devel-VERSION kernel-headers-VERSION" - msg2 " where VERSION is displayed in the second to last column" - fi - -fi diff --git a/linux57-tkg/linux57-tkg-config/90-cleanup.hook b/linux57-tkg/linux57-tkg-config/90-cleanup.hook deleted file mode 100644 index 99f5221..0000000 --- a/linux57-tkg/linux57-tkg-config/90-cleanup.hook +++ /dev/null @@ -1,14 +0,0 @@ -[Trigger] -Type = File -Operation = Install -Operation = Upgrade -Operation = Remove -Target = usr/lib/modules/*/ -Target = !usr/lib/modules/*/?* - -[Action] -Description = Cleaning up... -When = PostTransaction -Exec = /usr/share/libalpm/scripts/cleanup -NeedsTargets - diff --git a/linux57-tkg/linux57-tkg-config/cleanup b/linux57-tkg/linux57-tkg-config/cleanup deleted file mode 100755 index c00c08d..0000000 --- a/linux57-tkg/linux57-tkg-config/cleanup +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -for _f in /usr/lib/modules/*tkg*; do - if [[ ! -e ${_f}/vmlinuz ]]; then - rm -rf "$_f" - fi -done - -# vim:set ft=sh sw=2 et: - diff --git a/linux57-tkg/linux57-tkg-config/config.x86_64 b/linux57-tkg/linux57-tkg-config/config.x86_64 deleted file mode 100644 index 1014972..0000000 --- a/linux57-tkg/linux57-tkg-config/config.x86_64 +++ /dev/null @@ -1,10864 +0,0 @@ -# -# Automatically generated file; DO NOT EDIT. 
-# Linux/x86 5.7.11-arch1 Kernel Configuration -# - -# -# Compiler: gcc (GCC) 10.1.0 -# -CONFIG_CC_IS_GCC=y -CONFIG_GCC_VERSION=100100 -CONFIG_LD_VERSION=234000000 -CONFIG_CLANG_VERSION=0 -CONFIG_CC_CAN_LINK=y -CONFIG_CC_HAS_ASM_GOTO=y -CONFIG_CC_HAS_ASM_INLINE=y -CONFIG_IRQ_WORK=y -CONFIG_BUILDTIME_TABLE_SORT=y -CONFIG_THREAD_INFO_IN_TASK=y - -# -# General setup -# -CONFIG_INIT_ENV_ARG_LIMIT=32 -# CONFIG_COMPILE_TEST is not set -CONFIG_LOCALVERSION="" -CONFIG_LOCALVERSION_AUTO=y -CONFIG_BUILD_SALT="" -CONFIG_HAVE_KERNEL_GZIP=y -CONFIG_HAVE_KERNEL_BZIP2=y -CONFIG_HAVE_KERNEL_LZMA=y -CONFIG_HAVE_KERNEL_XZ=y -CONFIG_HAVE_KERNEL_LZO=y -CONFIG_HAVE_KERNEL_LZ4=y -# CONFIG_KERNEL_GZIP is not set -# CONFIG_KERNEL_BZIP2 is not set -# CONFIG_KERNEL_LZMA is not set -CONFIG_KERNEL_XZ=y -# CONFIG_KERNEL_LZO is not set -# CONFIG_KERNEL_LZ4 is not set -CONFIG_DEFAULT_HOSTNAME="archlinux" -CONFIG_SWAP=y -CONFIG_SYSVIPC=y -CONFIG_SYSVIPC_SYSCTL=y -CONFIG_POSIX_MQUEUE=y -CONFIG_POSIX_MQUEUE_SYSCTL=y -CONFIG_CROSS_MEMORY_ATTACH=y -# CONFIG_USELIB is not set -CONFIG_AUDIT=y -CONFIG_HAVE_ARCH_AUDITSYSCALL=y -CONFIG_AUDITSYSCALL=y - -# -# IRQ subsystem -# -CONFIG_GENERIC_IRQ_PROBE=y -CONFIG_GENERIC_IRQ_SHOW=y -CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK=y -CONFIG_GENERIC_PENDING_IRQ=y -CONFIG_GENERIC_IRQ_MIGRATION=y -CONFIG_HARDIRQS_SW_RESEND=y -CONFIG_GENERIC_IRQ_CHIP=y -CONFIG_IRQ_DOMAIN=y -CONFIG_IRQ_SIM=y -CONFIG_IRQ_DOMAIN_HIERARCHY=y -CONFIG_GENERIC_MSI_IRQ=y -CONFIG_GENERIC_MSI_IRQ_DOMAIN=y -CONFIG_IRQ_MSI_IOMMU=y -CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR=y -CONFIG_GENERIC_IRQ_RESERVATION_MODE=y -CONFIG_IRQ_FORCED_THREADING=y -CONFIG_SPARSE_IRQ=y -# CONFIG_GENERIC_IRQ_DEBUGFS is not set -# end of IRQ subsystem - -CONFIG_CLOCKSOURCE_WATCHDOG=y -CONFIG_ARCH_CLOCKSOURCE_INIT=y -CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE=y -CONFIG_GENERIC_TIME_VSYSCALL=y -CONFIG_GENERIC_CLOCKEVENTS=y -CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y -CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST=y -CONFIG_GENERIC_CMOS_UPDATE=y - -# 
-# Timers subsystem -# -CONFIG_TICK_ONESHOT=y -CONFIG_NO_HZ_COMMON=y -# CONFIG_HZ_PERIODIC is not set -CONFIG_NO_HZ_IDLE=y -# CONFIG_NO_HZ_FULL is not set -CONFIG_NO_HZ=y -CONFIG_HIGH_RES_TIMERS=y -# end of Timers subsystem - -# CONFIG_PREEMPT_NONE is not set -# CONFIG_PREEMPT_VOLUNTARY is not set -CONFIG_PREEMPT=y -CONFIG_PREEMPT_COUNT=y -CONFIG_PREEMPTION=y - -# -# CPU/Task time and stats accounting -# -CONFIG_TICK_CPU_ACCOUNTING=y -# CONFIG_VIRT_CPU_ACCOUNTING_GEN is not set -CONFIG_IRQ_TIME_ACCOUNTING=y -CONFIG_HAVE_SCHED_AVG_IRQ=y -# CONFIG_SCHED_THERMAL_PRESSURE is not set -CONFIG_BSD_PROCESS_ACCT=y -CONFIG_BSD_PROCESS_ACCT_V3=y -CONFIG_TASKSTATS=y -CONFIG_TASK_DELAY_ACCT=y -CONFIG_TASK_XACCT=y -CONFIG_TASK_IO_ACCOUNTING=y -CONFIG_PSI=y -# CONFIG_PSI_DEFAULT_DISABLED is not set -# end of CPU/Task time and stats accounting - -CONFIG_CPU_ISOLATION=y - -# -# RCU Subsystem -# -CONFIG_TREE_RCU=y -CONFIG_PREEMPT_RCU=y -CONFIG_RCU_EXPERT=y -CONFIG_SRCU=y -CONFIG_TREE_SRCU=y -CONFIG_TASKS_RCU=y -CONFIG_RCU_STALL_COMMON=y -CONFIG_RCU_NEED_SEGCBLIST=y -CONFIG_RCU_FANOUT=64 -CONFIG_RCU_FANOUT_LEAF=16 -CONFIG_RCU_FAST_NO_HZ=y -CONFIG_RCU_BOOST=y -CONFIG_RCU_BOOST_DELAY=500 -# CONFIG_RCU_NOCB_CPU is not set -# end of RCU Subsystem - -CONFIG_BUILD_BIN2C=y -CONFIG_IKCONFIG=y -CONFIG_IKCONFIG_PROC=y -# CONFIG_IKHEADERS is not set -CONFIG_LOG_BUF_SHIFT=17 -CONFIG_LOG_CPU_MAX_BUF_SHIFT=12 -CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT=13 -CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y - -# -# Scheduler features -# -CONFIG_UCLAMP_TASK=y -CONFIG_UCLAMP_BUCKETS_COUNT=5 -# end of Scheduler features - -CONFIG_ARCH_SUPPORTS_NUMA_BALANCING=y -CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH=y -CONFIG_CC_HAS_INT128=y -CONFIG_ARCH_SUPPORTS_INT128=y -CONFIG_NUMA_BALANCING=y -CONFIG_NUMA_BALANCING_DEFAULT_ENABLED=y -CONFIG_CGROUPS=y -CONFIG_PAGE_COUNTER=y -CONFIG_MEMCG=y -CONFIG_MEMCG_SWAP=y -CONFIG_MEMCG_SWAP_ENABLED=y -CONFIG_MEMCG_KMEM=y -CONFIG_BLK_CGROUP=y -CONFIG_CGROUP_WRITEBACK=y -CONFIG_CGROUP_SCHED=y 
-CONFIG_FAIR_GROUP_SCHED=y -CONFIG_CFS_BANDWIDTH=y -# CONFIG_RT_GROUP_SCHED is not set -CONFIG_UCLAMP_TASK_GROUP=y -CONFIG_CGROUP_PIDS=y -CONFIG_CGROUP_RDMA=y -CONFIG_CGROUP_FREEZER=y -CONFIG_CGROUP_HUGETLB=y -CONFIG_CPUSETS=y -CONFIG_PROC_PID_CPUSET=y -CONFIG_CGROUP_DEVICE=y -CONFIG_CGROUP_CPUACCT=y -CONFIG_CGROUP_PERF=y -CONFIG_CGROUP_BPF=y -# CONFIG_CGROUP_DEBUG is not set -CONFIG_SOCK_CGROUP_DATA=y -CONFIG_NAMESPACES=y -CONFIG_UTS_NS=y -CONFIG_TIME_NS=y -CONFIG_IPC_NS=y -CONFIG_USER_NS=y -CONFIG_USER_NS_UNPRIVILEGED=y -CONFIG_PID_NS=y -CONFIG_NET_NS=y -CONFIG_CHECKPOINT_RESTORE=y -CONFIG_SCHED_AUTOGROUP=y -# CONFIG_SYSFS_DEPRECATED is not set -CONFIG_RELAY=y -CONFIG_BLK_DEV_INITRD=y -CONFIG_INITRAMFS_SOURCE="" -CONFIG_RD_GZIP=y -CONFIG_RD_BZIP2=y -CONFIG_RD_LZMA=y -CONFIG_RD_XZ=y -CONFIG_RD_LZO=y -CONFIG_RD_LZ4=y -CONFIG_BOOT_CONFIG=y -CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y -# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set -CONFIG_SYSCTL=y -CONFIG_HAVE_UID16=y -CONFIG_SYSCTL_EXCEPTION_TRACE=y -CONFIG_HAVE_PCSPKR_PLATFORM=y -CONFIG_BPF=y -CONFIG_EXPERT=y -# CONFIG_UID16 is not set -CONFIG_MULTIUSER=y -CONFIG_SGETMASK_SYSCALL=y -# CONFIG_SYSFS_SYSCALL is not set -CONFIG_FHANDLE=y -CONFIG_POSIX_TIMERS=y -CONFIG_PRINTK=y -CONFIG_PRINTK_NMI=y -CONFIG_BUG=y -CONFIG_ELF_CORE=y -CONFIG_PCSPKR_PLATFORM=y -CONFIG_BASE_FULL=y -CONFIG_FUTEX=y -CONFIG_FUTEX_PI=y -CONFIG_EPOLL=y -CONFIG_SIGNALFD=y -CONFIG_TIMERFD=y -CONFIG_EVENTFD=y -CONFIG_SHMEM=y -CONFIG_AIO=y -CONFIG_IO_URING=y -CONFIG_ADVISE_SYSCALLS=y -CONFIG_MEMBARRIER=y -CONFIG_KALLSYMS=y -CONFIG_KALLSYMS_ALL=y -CONFIG_KALLSYMS_ABSOLUTE_PERCPU=y -CONFIG_KALLSYMS_BASE_RELATIVE=y -CONFIG_BPF_LSM=y -CONFIG_BPF_SYSCALL=y -CONFIG_ARCH_WANT_DEFAULT_BPF_JIT=y -CONFIG_BPF_JIT_ALWAYS_ON=y -CONFIG_BPF_JIT_DEFAULT_ON=y -# CONFIG_USERFAULTFD is not set -CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE=y -CONFIG_RSEQ=y -# CONFIG_DEBUG_RSEQ is not set -# CONFIG_EMBEDDED is not set -CONFIG_HAVE_PERF_EVENTS=y -# CONFIG_PC104 is not set - -# -# Kernel 
Performance Events And Counters -# -CONFIG_PERF_EVENTS=y -# CONFIG_DEBUG_PERF_USE_VMALLOC is not set -# end of Kernel Performance Events And Counters - -CONFIG_VM_EVENT_COUNTERS=y -CONFIG_SLUB_DEBUG=y -# CONFIG_SLUB_MEMCG_SYSFS_ON is not set -# CONFIG_COMPAT_BRK is not set -# CONFIG_SLAB is not set -CONFIG_SLUB=y -# CONFIG_SLOB is not set -CONFIG_SLAB_MERGE_DEFAULT=y -CONFIG_SLAB_FREELIST_RANDOM=y -CONFIG_SLAB_FREELIST_HARDENED=y -CONFIG_SHUFFLE_PAGE_ALLOCATOR=y -CONFIG_SLUB_CPU_PARTIAL=y -CONFIG_SYSTEM_DATA_VERIFICATION=y -CONFIG_PROFILING=y -CONFIG_TRACEPOINTS=y -# end of General setup - -CONFIG_64BIT=y -CONFIG_X86_64=y -CONFIG_X86=y -CONFIG_INSTRUCTION_DECODER=y -CONFIG_OUTPUT_FORMAT="elf64-x86-64" -CONFIG_LOCKDEP_SUPPORT=y -CONFIG_STACKTRACE_SUPPORT=y -CONFIG_MMU=y -CONFIG_ARCH_MMAP_RND_BITS_MIN=28 -CONFIG_ARCH_MMAP_RND_BITS_MAX=32 -CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN=8 -CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX=16 -CONFIG_GENERIC_ISA_DMA=y -CONFIG_GENERIC_BUG=y -CONFIG_GENERIC_BUG_RELATIVE_POINTERS=y -CONFIG_ARCH_MAY_HAVE_PC_FDC=y -CONFIG_GENERIC_CALIBRATE_DELAY=y -CONFIG_ARCH_HAS_CPU_RELAX=y -CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y -CONFIG_ARCH_HAS_FILTER_PGPROT=y -CONFIG_HAVE_SETUP_PER_CPU_AREA=y -CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y -CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y -CONFIG_ARCH_HIBERNATION_POSSIBLE=y -CONFIG_ARCH_SUSPEND_POSSIBLE=y -CONFIG_ARCH_WANT_GENERAL_HUGETLB=y -CONFIG_ZONE_DMA32=y -CONFIG_AUDIT_ARCH=y -CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y -CONFIG_HAVE_INTEL_TXT=y -CONFIG_X86_64_SMP=y -CONFIG_ARCH_SUPPORTS_UPROBES=y -CONFIG_FIX_EARLYCON_MEM=y -CONFIG_DYNAMIC_PHYSICAL_MASK=y -CONFIG_PGTABLE_LEVELS=5 -CONFIG_CC_HAS_SANE_STACKPROTECTOR=y - -# -# Processor type and features -# -CONFIG_ZONE_DMA=y -CONFIG_SMP=y -CONFIG_X86_FEATURE_NAMES=y -CONFIG_X86_X2APIC=y -CONFIG_X86_MPPARSE=y -# CONFIG_GOLDFISH is not set -CONFIG_RETPOLINE=y -CONFIG_X86_CPU_RESCTRL=y -# CONFIG_X86_EXTENDED_PLATFORM is not set -CONFIG_X86_INTEL_LPSS=y -CONFIG_X86_AMD_PLATFORM_DEVICE=y 
-CONFIG_IOSF_MBI=y -# CONFIG_IOSF_MBI_DEBUG is not set -CONFIG_X86_SUPPORTS_MEMORY_FAILURE=y -CONFIG_SCHED_OMIT_FRAME_POINTER=y -CONFIG_HYPERVISOR_GUEST=y -CONFIG_PARAVIRT=y -CONFIG_PARAVIRT_XXL=y -# CONFIG_PARAVIRT_DEBUG is not set -CONFIG_PARAVIRT_SPINLOCKS=y -CONFIG_X86_HV_CALLBACK_VECTOR=y -CONFIG_XEN=y -CONFIG_XEN_PV=y -CONFIG_XEN_PV_SMP=y -CONFIG_XEN_DOM0=y -CONFIG_XEN_PVHVM=y -CONFIG_XEN_PVHVM_SMP=y -CONFIG_XEN_512GB=y -CONFIG_XEN_SAVE_RESTORE=y -# CONFIG_XEN_DEBUG_FS is not set -CONFIG_XEN_PVH=y -CONFIG_KVM_GUEST=y -CONFIG_ARCH_CPUIDLE_HALTPOLL=y -CONFIG_PVH=y -# CONFIG_KVM_DEBUG_FS is not set -CONFIG_PARAVIRT_TIME_ACCOUNTING=y -CONFIG_PARAVIRT_CLOCK=y -CONFIG_JAILHOUSE_GUEST=y -CONFIG_ACRN_GUEST=y -# CONFIG_MK8 is not set -# CONFIG_MPSC is not set -# CONFIG_MCORE2 is not set -# CONFIG_MATOM is not set -CONFIG_GENERIC_CPU=y -CONFIG_X86_INTERNODE_CACHE_SHIFT=6 -CONFIG_X86_L1_CACHE_SHIFT=6 -CONFIG_X86_TSC=y -CONFIG_X86_CMPXCHG64=y -CONFIG_X86_CMOV=y -CONFIG_X86_MINIMUM_CPU_FAMILY=64 -CONFIG_X86_DEBUGCTLMSR=y -CONFIG_IA32_FEAT_CTL=y -CONFIG_X86_VMX_FEATURE_NAMES=y -CONFIG_PROCESSOR_SELECT=y -CONFIG_CPU_SUP_INTEL=y -CONFIG_CPU_SUP_AMD=y -CONFIG_CPU_SUP_HYGON=y -CONFIG_CPU_SUP_CENTAUR=y -CONFIG_CPU_SUP_ZHAOXIN=y -CONFIG_HPET_TIMER=y -CONFIG_HPET_EMULATE_RTC=y -CONFIG_DMI=y -CONFIG_GART_IOMMU=y -# CONFIG_MAXSMP is not set -CONFIG_NR_CPUS_RANGE_BEGIN=2 -CONFIG_NR_CPUS_RANGE_END=512 -CONFIG_NR_CPUS_DEFAULT=64 -CONFIG_NR_CPUS=320 -CONFIG_SCHED_SMT=y -CONFIG_SCHED_MC=y -CONFIG_SCHED_MC_PRIO=y -CONFIG_X86_LOCAL_APIC=y -CONFIG_X86_IO_APIC=y -CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y -CONFIG_X86_MCE=y -# CONFIG_X86_MCELOG_LEGACY is not set -CONFIG_X86_MCE_INTEL=y -CONFIG_X86_MCE_AMD=y -CONFIG_X86_MCE_THRESHOLD=y -CONFIG_X86_MCE_INJECT=m -CONFIG_X86_THERMAL_VECTOR=y - -# -# Performance monitoring -# -CONFIG_PERF_EVENTS_INTEL_UNCORE=m -CONFIG_PERF_EVENTS_INTEL_RAPL=m -CONFIG_PERF_EVENTS_INTEL_CSTATE=m -CONFIG_PERF_EVENTS_AMD_POWER=m -# end of Performance monitoring - 
-CONFIG_X86_16BIT=y -CONFIG_X86_ESPFIX64=y -CONFIG_X86_VSYSCALL_EMULATION=y -CONFIG_X86_IOPL_IOPERM=y -CONFIG_I8K=m -CONFIG_MICROCODE=y -CONFIG_MICROCODE_INTEL=y -CONFIG_MICROCODE_AMD=y -CONFIG_MICROCODE_OLD_INTERFACE=y -CONFIG_X86_MSR=m -CONFIG_X86_CPUID=m -CONFIG_X86_5LEVEL=y -CONFIG_X86_DIRECT_GBPAGES=y -# CONFIG_X86_CPA_STATISTICS is not set -CONFIG_AMD_MEM_ENCRYPT=y -# CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT is not set -CONFIG_NUMA=y -CONFIG_AMD_NUMA=y -CONFIG_X86_64_ACPI_NUMA=y -CONFIG_NODES_SPAN_OTHER_NODES=y -# CONFIG_NUMA_EMU is not set -CONFIG_NODES_SHIFT=5 -CONFIG_ARCH_SPARSEMEM_ENABLE=y -CONFIG_ARCH_SPARSEMEM_DEFAULT=y -CONFIG_ARCH_SELECT_MEMORY_MODEL=y -CONFIG_ARCH_MEMORY_PROBE=y -CONFIG_ARCH_PROC_KCORE_TEXT=y -CONFIG_ILLEGAL_POINTER_VALUE=0xdead000000000000 -CONFIG_X86_PMEM_LEGACY_DEVICE=y -CONFIG_X86_PMEM_LEGACY=m -CONFIG_X86_CHECK_BIOS_CORRUPTION=y -CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y -CONFIG_X86_RESERVE_LOW=64 -CONFIG_MTRR=y -CONFIG_MTRR_SANITIZER=y -CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT=1 -CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT=0 -CONFIG_X86_PAT=y -CONFIG_ARCH_USES_PG_UNCACHED=y -CONFIG_ARCH_RANDOM=y -CONFIG_X86_SMAP=y -CONFIG_X86_UMIP=y -CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS=y -# CONFIG_X86_INTEL_TSX_MODE_OFF is not set -# CONFIG_X86_INTEL_TSX_MODE_ON is not set -CONFIG_X86_INTEL_TSX_MODE_AUTO=y -CONFIG_EFI=y -CONFIG_EFI_STUB=y -CONFIG_EFI_MIXED=y -CONFIG_SECCOMP=y -# CONFIG_HZ_100 is not set -# CONFIG_HZ_250 is not set -CONFIG_HZ_300=y -# CONFIG_HZ_1000 is not set -CONFIG_HZ=300 -CONFIG_SCHED_HRTICK=y -CONFIG_KEXEC=y -CONFIG_KEXEC_FILE=y -CONFIG_ARCH_HAS_KEXEC_PURGATORY=y -# CONFIG_KEXEC_SIG is not set -CONFIG_CRASH_DUMP=y -CONFIG_KEXEC_JUMP=y -CONFIG_PHYSICAL_START=0x1000000 -CONFIG_RELOCATABLE=y -CONFIG_RANDOMIZE_BASE=y -CONFIG_X86_NEED_RELOCS=y -CONFIG_PHYSICAL_ALIGN=0x200000 -CONFIG_DYNAMIC_MEMORY_LAYOUT=y -CONFIG_RANDOMIZE_MEMORY=y -CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING=0x1 -CONFIG_HOTPLUG_CPU=y -# 
CONFIG_BOOTPARAM_HOTPLUG_CPU0 is not set -# CONFIG_DEBUG_HOTPLUG_CPU0 is not set -# CONFIG_COMPAT_VDSO is not set -# CONFIG_LEGACY_VSYSCALL_EMULATE is not set -CONFIG_LEGACY_VSYSCALL_XONLY=y -# CONFIG_LEGACY_VSYSCALL_NONE is not set -# CONFIG_CMDLINE_BOOL is not set -CONFIG_MODIFY_LDT_SYSCALL=y -CONFIG_HAVE_LIVEPATCH=y -# CONFIG_LIVEPATCH is not set -# end of Processor type and features - -CONFIG_ARCH_HAS_ADD_PAGES=y -CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y -CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE=y -CONFIG_USE_PERCPU_NUMA_NODE_ID=y -CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK=y -CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION=y -CONFIG_ARCH_ENABLE_THP_MIGRATION=y - -# -# Power management and ACPI options -# -CONFIG_ARCH_HIBERNATION_HEADER=y -CONFIG_SUSPEND=y -CONFIG_SUSPEND_FREEZER=y -# CONFIG_SUSPEND_SKIP_SYNC is not set -CONFIG_HIBERNATE_CALLBACKS=y -CONFIG_HIBERNATION=y -CONFIG_PM_STD_PARTITION="" -CONFIG_PM_SLEEP=y -CONFIG_PM_SLEEP_SMP=y -CONFIG_PM_AUTOSLEEP=y -CONFIG_PM_WAKELOCKS=y -CONFIG_PM_WAKELOCKS_LIMIT=100 -CONFIG_PM_WAKELOCKS_GC=y -CONFIG_PM=y -CONFIG_PM_DEBUG=y -CONFIG_PM_ADVANCED_DEBUG=y -# CONFIG_PM_TEST_SUSPEND is not set -CONFIG_PM_SLEEP_DEBUG=y -# CONFIG_DPM_WATCHDOG is not set -CONFIG_PM_TRACE=y -CONFIG_PM_TRACE_RTC=y -CONFIG_PM_CLK=y -CONFIG_PM_GENERIC_DOMAINS=y -CONFIG_WQ_POWER_EFFICIENT_DEFAULT=y -CONFIG_PM_GENERIC_DOMAINS_SLEEP=y -CONFIG_PM_GENERIC_DOMAINS_OF=y -CONFIG_ENERGY_MODEL=y -CONFIG_ARCH_SUPPORTS_ACPI=y -CONFIG_ACPI=y -CONFIG_ACPI_LEGACY_TABLES_LOOKUP=y -CONFIG_ARCH_MIGHT_HAVE_ACPI_PDC=y -CONFIG_ACPI_SYSTEM_POWER_STATES_SUPPORT=y -# CONFIG_ACPI_DEBUGGER is not set -CONFIG_ACPI_SPCR_TABLE=y -CONFIG_ACPI_LPIT=y -CONFIG_ACPI_SLEEP=y -# CONFIG_ACPI_PROCFS_POWER is not set -CONFIG_ACPI_REV_OVERRIDE_POSSIBLE=y -CONFIG_ACPI_EC_DEBUGFS=y -CONFIG_ACPI_AC=m -CONFIG_ACPI_BATTERY=m -CONFIG_ACPI_BUTTON=y -CONFIG_ACPI_VIDEO=y -CONFIG_ACPI_FAN=y -CONFIG_ACPI_TAD=m -CONFIG_ACPI_DOCK=y -CONFIG_ACPI_CPU_FREQ_PSS=y -CONFIG_ACPI_PROCESSOR_CSTATE=y -CONFIG_ACPI_PROCESSOR_IDLE=y 
-CONFIG_ACPI_CPPC_LIB=y -CONFIG_ACPI_PROCESSOR=y -CONFIG_ACPI_IPMI=m -CONFIG_ACPI_HOTPLUG_CPU=y -CONFIG_ACPI_PROCESSOR_AGGREGATOR=y -CONFIG_ACPI_THERMAL=y -CONFIG_ARCH_HAS_ACPI_TABLE_UPGRADE=y -CONFIG_ACPI_TABLE_UPGRADE=y -CONFIG_ACPI_DEBUG=y -CONFIG_ACPI_PCI_SLOT=y -CONFIG_ACPI_CONTAINER=y -CONFIG_ACPI_HOTPLUG_MEMORY=y -CONFIG_ACPI_HOTPLUG_IOAPIC=y -CONFIG_ACPI_SBS=m -CONFIG_ACPI_HED=y -CONFIG_ACPI_CUSTOM_METHOD=m -CONFIG_ACPI_BGRT=y -# CONFIG_ACPI_REDUCED_HARDWARE_ONLY is not set -CONFIG_ACPI_NFIT=m -# CONFIG_NFIT_SECURITY_DEBUG is not set -CONFIG_ACPI_NUMA=y -CONFIG_ACPI_HMAT=y -CONFIG_HAVE_ACPI_APEI=y -CONFIG_HAVE_ACPI_APEI_NMI=y -CONFIG_ACPI_APEI=y -CONFIG_ACPI_APEI_GHES=y -CONFIG_ACPI_APEI_PCIEAER=y -CONFIG_ACPI_APEI_MEMORY_FAILURE=y -CONFIG_ACPI_APEI_EINJ=m -CONFIG_ACPI_APEI_ERST_DEBUG=m -CONFIG_DPTF_POWER=m -CONFIG_ACPI_WATCHDOG=y -CONFIG_ACPI_EXTLOG=m -CONFIG_ACPI_ADXL=y -CONFIG_PMIC_OPREGION=y -CONFIG_BYTCRC_PMIC_OPREGION=y -CONFIG_CHTCRC_PMIC_OPREGION=y -CONFIG_XPOWER_PMIC_OPREGION=y -CONFIG_BXT_WC_PMIC_OPREGION=y -CONFIG_CHT_WC_PMIC_OPREGION=y -CONFIG_CHT_DC_TI_PMIC_OPREGION=y -CONFIG_ACPI_CONFIGFS=m -CONFIG_TPS68470_PMIC_OPREGION=y -CONFIG_X86_PM_TIMER=y -CONFIG_SFI=y - -# -# CPU Frequency scaling -# -CONFIG_CPU_FREQ=y -CONFIG_CPU_FREQ_GOV_ATTR_SET=y -CONFIG_CPU_FREQ_GOV_COMMON=y -CONFIG_CPU_FREQ_STAT=y -# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set -# CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set -# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set -# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set -# CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set -CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL=y -CONFIG_CPU_FREQ_GOV_PERFORMANCE=y -CONFIG_CPU_FREQ_GOV_POWERSAVE=m -CONFIG_CPU_FREQ_GOV_USERSPACE=m -CONFIG_CPU_FREQ_GOV_ONDEMAND=m -CONFIG_CPU_FREQ_GOV_CONSERVATIVE=m -CONFIG_CPU_FREQ_GOV_SCHEDUTIL=y - -# -# CPU frequency scaling drivers -# -CONFIG_CPUFREQ_DT=m -CONFIG_CPUFREQ_DT_PLATDEV=y -CONFIG_X86_INTEL_PSTATE=y -CONFIG_X86_PCC_CPUFREQ=m 
-CONFIG_X86_ACPI_CPUFREQ=m -CONFIG_X86_ACPI_CPUFREQ_CPB=y -CONFIG_X86_POWERNOW_K8=m -CONFIG_X86_AMD_FREQ_SENSITIVITY=m -# CONFIG_X86_SPEEDSTEP_CENTRINO is not set -CONFIG_X86_P4_CLOCKMOD=m - -# -# shared options -# -CONFIG_X86_SPEEDSTEP_LIB=m -# end of CPU Frequency scaling - -# -# CPU Idle -# -CONFIG_CPU_IDLE=y -CONFIG_CPU_IDLE_GOV_LADDER=y -CONFIG_CPU_IDLE_GOV_MENU=y -CONFIG_CPU_IDLE_GOV_TEO=y -CONFIG_CPU_IDLE_GOV_HALTPOLL=y -CONFIG_HALTPOLL_CPUIDLE=m -# end of CPU Idle - -CONFIG_INTEL_IDLE=y -# end of Power management and ACPI options - -# -# Bus options (PCI etc.) -# -CONFIG_PCI_DIRECT=y -CONFIG_PCI_MMCONFIG=y -CONFIG_PCI_XEN=y -CONFIG_MMCONF_FAM10H=y -# CONFIG_PCI_CNB20LE_QUIRK is not set -# CONFIG_ISA_BUS is not set -CONFIG_ISA_DMA_API=y -CONFIG_AMD_NB=y -# CONFIG_X86_SYSFB is not set -# end of Bus options (PCI etc.) - -# -# Binary Emulations -# -CONFIG_IA32_EMULATION=y -# CONFIG_X86_X32 is not set -CONFIG_COMPAT_32=y -CONFIG_COMPAT=y -CONFIG_COMPAT_FOR_U64_ALIGNMENT=y -CONFIG_SYSVIPC_COMPAT=y -# end of Binary Emulations - -# -# Firmware Drivers -# -CONFIG_EDD=m -# CONFIG_EDD_OFF is not set -CONFIG_FIRMWARE_MEMMAP=y -CONFIG_DMIID=y -CONFIG_DMI_SYSFS=m -CONFIG_DMI_SCAN_MACHINE_NON_EFI_FALLBACK=y -CONFIG_ISCSI_IBFT_FIND=y -CONFIG_ISCSI_IBFT=m -CONFIG_FW_CFG_SYSFS=m -# CONFIG_FW_CFG_SYSFS_CMDLINE is not set -CONFIG_GOOGLE_FIRMWARE=y -# CONFIG_GOOGLE_SMI is not set -CONFIG_GOOGLE_COREBOOT_TABLE=m -CONFIG_GOOGLE_MEMCONSOLE=m -# CONFIG_GOOGLE_MEMCONSOLE_X86_LEGACY is not set -CONFIG_GOOGLE_FRAMEBUFFER_COREBOOT=m -CONFIG_GOOGLE_MEMCONSOLE_COREBOOT=m -CONFIG_GOOGLE_VPD=m - -# -# EFI (Extensible Firmware Interface) Support -# -# CONFIG_EFI_VARS is not set -CONFIG_EFI_ESRT=y -CONFIG_EFI_RUNTIME_MAP=y -# CONFIG_EFI_FAKE_MEMMAP is not set -CONFIG_EFI_SOFT_RESERVE=y -CONFIG_EFI_RUNTIME_WRAPPERS=y -CONFIG_EFI_CAPSULE_LOADER=m -# CONFIG_EFI_TEST is not set -CONFIG_APPLE_PROPERTIES=y -# CONFIG_RESET_ATTACK_MITIGATION is not set -CONFIG_EFI_RCI2_TABLE=y -# 
CONFIG_EFI_DISABLE_PCI_DMA is not set -# end of EFI (Extensible Firmware Interface) Support - -CONFIG_EFI_EMBEDDED_FIRMWARE=y -CONFIG_UEFI_CPER=y -CONFIG_UEFI_CPER_X86=y -CONFIG_EFI_DEV_PATH_PARSER=y -CONFIG_EFI_EARLYCON=y - -# -# Tegra firmware driver -# -# end of Tegra firmware driver -# end of Firmware Drivers - -CONFIG_HAVE_KVM=y -CONFIG_HAVE_KVM_IRQCHIP=y -CONFIG_HAVE_KVM_IRQFD=y -CONFIG_HAVE_KVM_IRQ_ROUTING=y -CONFIG_HAVE_KVM_EVENTFD=y -CONFIG_KVM_MMIO=y -CONFIG_KVM_ASYNC_PF=y -CONFIG_HAVE_KVM_MSI=y -CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT=y -CONFIG_KVM_VFIO=y -CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT=y -CONFIG_KVM_COMPAT=y -CONFIG_HAVE_KVM_IRQ_BYPASS=y -CONFIG_HAVE_KVM_NO_POLL=y -CONFIG_VIRTUALIZATION=y -CONFIG_KVM=m -CONFIG_KVM_WERROR=y -CONFIG_KVM_INTEL=m -CONFIG_KVM_AMD=m -CONFIG_KVM_AMD_SEV=y -CONFIG_KVM_MMU_AUDIT=y -CONFIG_AS_AVX512=y -CONFIG_AS_SHA1_NI=y -CONFIG_AS_SHA256_NI=y - -# -# General architecture-dependent options -# -CONFIG_CRASH_CORE=y -CONFIG_KEXEC_CORE=y -CONFIG_HOTPLUG_SMT=y -CONFIG_OPROFILE=m -# CONFIG_OPROFILE_EVENT_MULTIPLEX is not set -CONFIG_HAVE_OPROFILE=y -CONFIG_OPROFILE_NMI_TIMER=y -CONFIG_KPROBES=y -CONFIG_JUMP_LABEL=y -# CONFIG_STATIC_KEYS_SELFTEST is not set -CONFIG_OPTPROBES=y -CONFIG_KPROBES_ON_FTRACE=y -CONFIG_UPROBES=y -CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y -CONFIG_ARCH_USE_BUILTIN_BSWAP=y -CONFIG_KRETPROBES=y -CONFIG_USER_RETURN_NOTIFIER=y -CONFIG_HAVE_IOREMAP_PROT=y -CONFIG_HAVE_KPROBES=y -CONFIG_HAVE_KRETPROBES=y -CONFIG_HAVE_OPTPROBES=y -CONFIG_HAVE_KPROBES_ON_FTRACE=y -CONFIG_HAVE_FUNCTION_ERROR_INJECTION=y -CONFIG_HAVE_NMI=y -CONFIG_HAVE_ARCH_TRACEHOOK=y -CONFIG_HAVE_DMA_CONTIGUOUS=y -CONFIG_GENERIC_SMP_IDLE_THREAD=y -CONFIG_ARCH_HAS_FORTIFY_SOURCE=y -CONFIG_ARCH_HAS_SET_MEMORY=y -CONFIG_ARCH_HAS_SET_DIRECT_MAP=y -CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST=y -CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT=y -CONFIG_HAVE_ASM_MODVERSIONS=y -CONFIG_HAVE_REGS_AND_STACK_ACCESS_API=y -CONFIG_HAVE_RSEQ=y 
-CONFIG_HAVE_FUNCTION_ARG_ACCESS_API=y -CONFIG_HAVE_CLK=y -CONFIG_HAVE_HW_BREAKPOINT=y -CONFIG_HAVE_MIXED_BREAKPOINTS_REGS=y -CONFIG_HAVE_USER_RETURN_NOTIFIER=y -CONFIG_HAVE_PERF_EVENTS_NMI=y -CONFIG_HAVE_HARDLOCKUP_DETECTOR_PERF=y -CONFIG_HAVE_PERF_REGS=y -CONFIG_HAVE_PERF_USER_STACK_DUMP=y -CONFIG_HAVE_ARCH_JUMP_LABEL=y -CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE=y -CONFIG_MMU_GATHER_TABLE_FREE=y -CONFIG_MMU_GATHER_RCU_TABLE_FREE=y -CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG=y -CONFIG_HAVE_ALIGNED_STRUCT_PAGE=y -CONFIG_HAVE_CMPXCHG_LOCAL=y -CONFIG_HAVE_CMPXCHG_DOUBLE=y -CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION=y -CONFIG_ARCH_WANT_OLD_COMPAT_IPC=y -CONFIG_HAVE_ARCH_SECCOMP_FILTER=y -CONFIG_SECCOMP_FILTER=y -CONFIG_HAVE_ARCH_STACKLEAK=y -CONFIG_HAVE_STACKPROTECTOR=y -CONFIG_CC_HAS_STACKPROTECTOR_NONE=y -CONFIG_STACKPROTECTOR=y -CONFIG_STACKPROTECTOR_STRONG=y -CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES=y -CONFIG_HAVE_CONTEXT_TRACKING=y -CONFIG_HAVE_VIRT_CPU_ACCOUNTING_GEN=y -CONFIG_HAVE_IRQ_TIME_ACCOUNTING=y -CONFIG_HAVE_MOVE_PMD=y -CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE=y -CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD=y -CONFIG_HAVE_ARCH_HUGE_VMAP=y -CONFIG_ARCH_WANT_HUGE_PMD_SHARE=y -CONFIG_HAVE_ARCH_SOFT_DIRTY=y -CONFIG_HAVE_MOD_ARCH_SPECIFIC=y -CONFIG_MODULES_USE_ELF_RELA=y -CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK=y -CONFIG_ARCH_HAS_ELF_RANDOMIZE=y -CONFIG_HAVE_ARCH_MMAP_RND_BITS=y -CONFIG_HAVE_EXIT_THREAD=y -CONFIG_ARCH_MMAP_RND_BITS=28 -CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS=y -CONFIG_ARCH_MMAP_RND_COMPAT_BITS=8 -CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES=y -CONFIG_HAVE_COPY_THREAD_TLS=y -CONFIG_HAVE_STACK_VALIDATION=y -CONFIG_HAVE_RELIABLE_STACKTRACE=y -CONFIG_ISA_BUS_API=y -CONFIG_OLD_SIGSUSPEND3=y -CONFIG_COMPAT_OLD_SIGACTION=y -CONFIG_COMPAT_32BIT_TIME=y -CONFIG_HAVE_ARCH_VMAP_STACK=y -CONFIG_VMAP_STACK=y -CONFIG_ARCH_HAS_STRICT_KERNEL_RWX=y -CONFIG_STRICT_KERNEL_RWX=y -CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y -CONFIG_STRICT_MODULE_RWX=y -CONFIG_HAVE_ARCH_PREL32_RELOCATIONS=y 
-CONFIG_ARCH_USE_MEMREMAP_PROT=y -CONFIG_LOCK_EVENT_COUNTS=y -CONFIG_ARCH_HAS_MEM_ENCRYPT=y - -# -# GCOV-based kernel profiling -# -# CONFIG_GCOV_KERNEL is not set -CONFIG_ARCH_HAS_GCOV_PROFILE_ALL=y -# end of GCOV-based kernel profiling - -CONFIG_HAVE_GCC_PLUGINS=y -CONFIG_GCC_PLUGINS=y -# CONFIG_GCC_PLUGIN_CYC_COMPLEXITY is not set -# CONFIG_GCC_PLUGIN_LATENT_ENTROPY is not set -# CONFIG_GCC_PLUGIN_RANDSTRUCT is not set -# end of General architecture-dependent options - -CONFIG_RT_MUTEXES=y -CONFIG_BASE_SMALL=0 -CONFIG_MODULE_SIG_FORMAT=y -CONFIG_MODULES=y -CONFIG_MODULE_FORCE_LOAD=y -CONFIG_MODULE_UNLOAD=y -CONFIG_MODULE_FORCE_UNLOAD=y -# CONFIG_MODVERSIONS is not set -CONFIG_MODULE_SRCVERSION_ALL=y -CONFIG_MODULE_SIG=y -# CONFIG_MODULE_SIG_FORCE is not set -CONFIG_MODULE_SIG_ALL=y -# CONFIG_MODULE_SIG_SHA1 is not set -# CONFIG_MODULE_SIG_SHA224 is not set -# CONFIG_MODULE_SIG_SHA256 is not set -# CONFIG_MODULE_SIG_SHA384 is not set -CONFIG_MODULE_SIG_SHA512=y -CONFIG_MODULE_SIG_HASH="sha512" -CONFIG_MODULE_COMPRESS=y -# CONFIG_MODULE_COMPRESS_GZIP is not set -CONFIG_MODULE_COMPRESS_XZ=y -CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS=y -CONFIG_UNUSED_SYMBOLS=y -CONFIG_MODULES_TREE_LOOKUP=y -CONFIG_BLOCK=y -CONFIG_BLK_RQ_ALLOC_TIME=y -CONFIG_BLK_SCSI_REQUEST=y -CONFIG_BLK_CGROUP_RWSTAT=y -CONFIG_BLK_DEV_BSG=y -CONFIG_BLK_DEV_BSGLIB=y -CONFIG_BLK_DEV_INTEGRITY=y -CONFIG_BLK_DEV_INTEGRITY_T10=y -CONFIG_BLK_DEV_ZONED=y -CONFIG_BLK_DEV_THROTTLING=y -CONFIG_BLK_DEV_THROTTLING_LOW=y -# CONFIG_BLK_CMDLINE_PARSER is not set -CONFIG_BLK_WBT=y -CONFIG_BLK_CGROUP_IOLATENCY=y -CONFIG_BLK_CGROUP_IOCOST=y -CONFIG_BLK_WBT_MQ=y -CONFIG_BLK_DEBUG_FS=y -CONFIG_BLK_DEBUG_FS_ZONED=y -CONFIG_BLK_SED_OPAL=y - -# -# Partition Types -# -CONFIG_PARTITION_ADVANCED=y -# CONFIG_ACORN_PARTITION is not set -CONFIG_AIX_PARTITION=y -# CONFIG_OSF_PARTITION is not set -# CONFIG_AMIGA_PARTITION is not set -# CONFIG_ATARI_PARTITION is not set -CONFIG_MAC_PARTITION=y -CONFIG_MSDOS_PARTITION=y 
-CONFIG_BSD_DISKLABEL=y -CONFIG_MINIX_SUBPARTITION=y -CONFIG_SOLARIS_X86_PARTITION=y -# CONFIG_UNIXWARE_DISKLABEL is not set -CONFIG_LDM_PARTITION=y -# CONFIG_LDM_DEBUG is not set -# CONFIG_SGI_PARTITION is not set -# CONFIG_ULTRIX_PARTITION is not set -# CONFIG_SUN_PARTITION is not set -CONFIG_KARMA_PARTITION=y -CONFIG_EFI_PARTITION=y -# CONFIG_SYSV68_PARTITION is not set -# CONFIG_CMDLINE_PARTITION is not set -# end of Partition Types - -CONFIG_BLOCK_COMPAT=y -CONFIG_BLK_MQ_PCI=y -CONFIG_BLK_MQ_VIRTIO=y -CONFIG_BLK_MQ_RDMA=y -CONFIG_BLK_PM=y - -# -# IO Schedulers -# -CONFIG_MQ_IOSCHED_DEADLINE=y -CONFIG_MQ_IOSCHED_KYBER=y -CONFIG_IOSCHED_BFQ=y -CONFIG_BFQ_GROUP_IOSCHED=y -# CONFIG_BFQ_CGROUP_DEBUG is not set -# end of IO Schedulers - -CONFIG_PREEMPT_NOTIFIERS=y -CONFIG_PADATA=y -CONFIG_ASN1=y -CONFIG_UNINLINE_SPIN_UNLOCK=y -CONFIG_ARCH_SUPPORTS_ATOMIC_RMW=y -CONFIG_MUTEX_SPIN_ON_OWNER=y -CONFIG_RWSEM_SPIN_ON_OWNER=y -CONFIG_LOCK_SPIN_ON_OWNER=y -CONFIG_ARCH_USE_QUEUED_SPINLOCKS=y -CONFIG_QUEUED_SPINLOCKS=y -CONFIG_ARCH_USE_QUEUED_RWLOCKS=y -CONFIG_QUEUED_RWLOCKS=y -CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE=y -CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE=y -CONFIG_ARCH_HAS_SYSCALL_WRAPPER=y -CONFIG_FREEZER=y - -# -# Executable file formats -# -CONFIG_BINFMT_ELF=y -CONFIG_COMPAT_BINFMT_ELF=y -CONFIG_ELFCORE=y -CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y -CONFIG_BINFMT_SCRIPT=y -CONFIG_BINFMT_MISC=y -CONFIG_COREDUMP=y -# end of Executable file formats - -# -# Memory Management options -# -CONFIG_SELECT_MEMORY_MODEL=y -CONFIG_SPARSEMEM_MANUAL=y -CONFIG_SPARSEMEM=y -CONFIG_NEED_MULTIPLE_NODES=y -CONFIG_HAVE_MEMORY_PRESENT=y -CONFIG_SPARSEMEM_EXTREME=y -CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y -CONFIG_SPARSEMEM_VMEMMAP=y -CONFIG_HAVE_MEMBLOCK_NODE_MAP=y -CONFIG_HAVE_FAST_GUP=y -CONFIG_NUMA_KEEP_MEMINFO=y -CONFIG_MEMORY_ISOLATION=y -CONFIG_HAVE_BOOTMEM_INFO_NODE=y -CONFIG_MEMORY_HOTPLUG=y -CONFIG_MEMORY_HOTPLUG_SPARSE=y -CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y 
-CONFIG_MEMORY_HOTREMOVE=y -CONFIG_SPLIT_PTLOCK_CPUS=4 -CONFIG_MEMORY_BALLOON=y -CONFIG_BALLOON_COMPACTION=y -CONFIG_COMPACTION=y -CONFIG_PAGE_REPORTING=y -CONFIG_MIGRATION=y -CONFIG_CONTIG_ALLOC=y -CONFIG_PHYS_ADDR_T_64BIT=y -CONFIG_BOUNCE=y -CONFIG_VIRT_TO_BUS=y -CONFIG_MMU_NOTIFIER=y -CONFIG_KSM=y -CONFIG_DEFAULT_MMAP_MIN_ADDR=65536 -CONFIG_ARCH_SUPPORTS_MEMORY_FAILURE=y -CONFIG_MEMORY_FAILURE=y -CONFIG_HWPOISON_INJECT=m -CONFIG_TRANSPARENT_HUGEPAGE=y -# CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS is not set -CONFIG_TRANSPARENT_HUGEPAGE_MADVISE=y -CONFIG_ARCH_WANTS_THP_SWAP=y -CONFIG_THP_SWAP=y -CONFIG_CLEANCACHE=y -CONFIG_FRONTSWAP=y -# CONFIG_CMA is not set -# CONFIG_MEM_SOFT_DIRTY is not set -CONFIG_ZSWAP=y -# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_DEFLATE is not set -# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZO is not set -# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_842 is not set -CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZ4=y -# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZ4HC is not set -# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_ZSTD is not set -CONFIG_ZSWAP_COMPRESSOR_DEFAULT="lz4" -# CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD is not set -CONFIG_ZSWAP_ZPOOL_DEFAULT_Z3FOLD=y -# CONFIG_ZSWAP_ZPOOL_DEFAULT_ZSMALLOC is not set -CONFIG_ZSWAP_ZPOOL_DEFAULT="z3fold" -CONFIG_ZSWAP_DEFAULT_ON=y -CONFIG_ZPOOL=y -CONFIG_ZBUD=y -CONFIG_Z3FOLD=y -CONFIG_ZSMALLOC=y -# CONFIG_PGTABLE_MAPPING is not set -# CONFIG_ZSMALLOC_STAT is not set -CONFIG_GENERIC_EARLY_IOREMAP=y -# CONFIG_DEFERRED_STRUCT_PAGE_INIT is not set -# CONFIG_IDLE_PAGE_TRACKING is not set -CONFIG_ARCH_HAS_PTE_DEVMAP=y -CONFIG_ZONE_DEVICE=y -CONFIG_DEV_PAGEMAP_OPS=y -CONFIG_HMM_MIRROR=y -CONFIG_DEVICE_PRIVATE=y -CONFIG_FRAME_VECTOR=y -CONFIG_ARCH_USES_HIGH_VMA_FLAGS=y -CONFIG_ARCH_HAS_PKEYS=y -# CONFIG_PERCPU_STATS is not set -# CONFIG_GUP_BENCHMARK is not set -CONFIG_READ_ONLY_THP_FOR_FS=y -CONFIG_ARCH_HAS_PTE_SPECIAL=y -CONFIG_MAPPING_DIRTY_HELPERS=y -# end of Memory Management options - -CONFIG_NET=y -CONFIG_COMPAT_NETLINK_MESSAGES=y -CONFIG_NET_INGRESS=y 
-CONFIG_NET_EGRESS=y -CONFIG_NET_REDIRECT=y -CONFIG_SKB_EXTENSIONS=y - -# -# Networking options -# -CONFIG_PACKET=y -CONFIG_PACKET_DIAG=y -CONFIG_UNIX=y -CONFIG_UNIX_SCM=y -CONFIG_UNIX_DIAG=y -CONFIG_TLS=m -CONFIG_TLS_DEVICE=y -# CONFIG_TLS_TOE is not set -CONFIG_XFRM=y -CONFIG_XFRM_OFFLOAD=y -CONFIG_XFRM_ALGO=m -CONFIG_XFRM_USER=m -CONFIG_XFRM_INTERFACE=m -CONFIG_XFRM_SUB_POLICY=y -CONFIG_XFRM_MIGRATE=y -CONFIG_XFRM_STATISTICS=y -CONFIG_XFRM_IPCOMP=m -CONFIG_NET_KEY=m -CONFIG_NET_KEY_MIGRATE=y -CONFIG_SMC=m -CONFIG_SMC_DIAG=m -CONFIG_XDP_SOCKETS=y -CONFIG_XDP_SOCKETS_DIAG=y -CONFIG_INET=y -CONFIG_IP_MULTICAST=y -CONFIG_IP_ADVANCED_ROUTER=y -# CONFIG_IP_FIB_TRIE_STATS is not set -CONFIG_IP_MULTIPLE_TABLES=y -CONFIG_IP_ROUTE_MULTIPATH=y -CONFIG_IP_ROUTE_VERBOSE=y -CONFIG_IP_ROUTE_CLASSID=y -# CONFIG_IP_PNP is not set -CONFIG_NET_IPIP=m -CONFIG_NET_IPGRE_DEMUX=m -CONFIG_NET_IP_TUNNEL=m -CONFIG_NET_IPGRE=m -# CONFIG_NET_IPGRE_BROADCAST is not set -CONFIG_IP_MROUTE_COMMON=y -CONFIG_IP_MROUTE=y -CONFIG_IP_MROUTE_MULTIPLE_TABLES=y -CONFIG_IP_PIMSM_V1=y -CONFIG_IP_PIMSM_V2=y -CONFIG_SYN_COOKIES=y -CONFIG_NET_IPVTI=m -CONFIG_NET_UDP_TUNNEL=m -CONFIG_NET_FOU=m -CONFIG_NET_FOU_IP_TUNNELS=y -CONFIG_INET_AH=m -CONFIG_INET_ESP=m -CONFIG_INET_ESP_OFFLOAD=m -CONFIG_INET_ESPINTCP=y -CONFIG_INET_IPCOMP=m -CONFIG_INET_XFRM_TUNNEL=m -CONFIG_INET_TUNNEL=m -CONFIG_INET_DIAG=m -CONFIG_INET_TCP_DIAG=m -CONFIG_INET_UDP_DIAG=m -CONFIG_INET_RAW_DIAG=m -CONFIG_INET_DIAG_DESTROY=y -CONFIG_TCP_CONG_ADVANCED=y -CONFIG_TCP_CONG_BIC=m -CONFIG_TCP_CONG_CUBIC=y -CONFIG_TCP_CONG_WESTWOOD=m -CONFIG_TCP_CONG_HTCP=m -CONFIG_TCP_CONG_HSTCP=m -CONFIG_TCP_CONG_HYBLA=m -CONFIG_TCP_CONG_VEGAS=m -CONFIG_TCP_CONG_NV=m -CONFIG_TCP_CONG_SCALABLE=m -CONFIG_TCP_CONG_LP=m -CONFIG_TCP_CONG_VENO=m -CONFIG_TCP_CONG_YEAH=m -CONFIG_TCP_CONG_ILLINOIS=m -CONFIG_TCP_CONG_DCTCP=m -CONFIG_TCP_CONG_CDG=m -CONFIG_TCP_CONG_BBR=m -CONFIG_DEFAULT_CUBIC=y -# CONFIG_DEFAULT_RENO is not set -CONFIG_DEFAULT_TCP_CONG="cubic" 
-CONFIG_TCP_MD5SIG=y -CONFIG_IPV6=y -CONFIG_IPV6_ROUTER_PREF=y -CONFIG_IPV6_ROUTE_INFO=y -CONFIG_IPV6_OPTIMISTIC_DAD=y -CONFIG_INET6_AH=m -CONFIG_INET6_ESP=m -CONFIG_INET6_ESP_OFFLOAD=m -CONFIG_INET6_IPCOMP=m -CONFIG_IPV6_MIP6=m -CONFIG_IPV6_ILA=m -CONFIG_INET6_XFRM_TUNNEL=m -CONFIG_INET6_TUNNEL=m -CONFIG_IPV6_VTI=m -CONFIG_IPV6_SIT=m -CONFIG_IPV6_SIT_6RD=y -CONFIG_IPV6_NDISC_NODETYPE=y -CONFIG_IPV6_TUNNEL=m -CONFIG_IPV6_GRE=m -CONFIG_IPV6_FOU=m -CONFIG_IPV6_FOU_TUNNEL=m -CONFIG_IPV6_MULTIPLE_TABLES=y -CONFIG_IPV6_SUBTREES=y -CONFIG_IPV6_MROUTE=y -CONFIG_IPV6_MROUTE_MULTIPLE_TABLES=y -CONFIG_IPV6_PIMSM_V2=y -CONFIG_IPV6_SEG6_LWTUNNEL=y -CONFIG_IPV6_SEG6_HMAC=y -CONFIG_IPV6_SEG6_BPF=y -CONFIG_IPV6_RPL_LWTUNNEL=y -CONFIG_NETLABEL=y -CONFIG_MPTCP=y -CONFIG_MPTCP_IPV6=y -# CONFIG_MPTCP_HMAC_TEST is not set -CONFIG_NETWORK_SECMARK=y -CONFIG_NET_PTP_CLASSIFY=y -CONFIG_NETWORK_PHY_TIMESTAMPING=y -CONFIG_NETFILTER=y -CONFIG_NETFILTER_ADVANCED=y -CONFIG_BRIDGE_NETFILTER=m - -# -# Core Netfilter Configuration -# -CONFIG_NETFILTER_INGRESS=y -CONFIG_NETFILTER_NETLINK=m -CONFIG_NETFILTER_FAMILY_BRIDGE=y -CONFIG_NETFILTER_FAMILY_ARP=y -CONFIG_NETFILTER_NETLINK_ACCT=m -CONFIG_NETFILTER_NETLINK_QUEUE=m -CONFIG_NETFILTER_NETLINK_LOG=m -CONFIG_NETFILTER_NETLINK_OSF=m -CONFIG_NF_CONNTRACK=m -CONFIG_NF_LOG_COMMON=m -CONFIG_NF_LOG_NETDEV=m -CONFIG_NETFILTER_CONNCOUNT=m -CONFIG_NF_CONNTRACK_MARK=y -CONFIG_NF_CONNTRACK_SECMARK=y -CONFIG_NF_CONNTRACK_ZONES=y -CONFIG_NF_CONNTRACK_PROCFS=y -CONFIG_NF_CONNTRACK_EVENTS=y -CONFIG_NF_CONNTRACK_TIMEOUT=y -CONFIG_NF_CONNTRACK_TIMESTAMP=y -CONFIG_NF_CONNTRACK_LABELS=y -CONFIG_NF_CT_PROTO_DCCP=y -CONFIG_NF_CT_PROTO_GRE=y -CONFIG_NF_CT_PROTO_SCTP=y -CONFIG_NF_CT_PROTO_UDPLITE=y -CONFIG_NF_CONNTRACK_AMANDA=m -CONFIG_NF_CONNTRACK_FTP=m -CONFIG_NF_CONNTRACK_H323=m -CONFIG_NF_CONNTRACK_IRC=m -CONFIG_NF_CONNTRACK_BROADCAST=m -CONFIG_NF_CONNTRACK_NETBIOS_NS=m -CONFIG_NF_CONNTRACK_SNMP=m -CONFIG_NF_CONNTRACK_PPTP=m -CONFIG_NF_CONNTRACK_SANE=m 
-CONFIG_NF_CONNTRACK_SIP=m -CONFIG_NF_CONNTRACK_TFTP=m -CONFIG_NF_CT_NETLINK=m -CONFIG_NF_CT_NETLINK_TIMEOUT=m -CONFIG_NF_CT_NETLINK_HELPER=m -CONFIG_NETFILTER_NETLINK_GLUE_CT=y -CONFIG_NF_NAT=m -CONFIG_NF_NAT_AMANDA=m -CONFIG_NF_NAT_FTP=m -CONFIG_NF_NAT_IRC=m -CONFIG_NF_NAT_SIP=m -CONFIG_NF_NAT_TFTP=m -CONFIG_NF_NAT_REDIRECT=y -CONFIG_NF_NAT_MASQUERADE=y -CONFIG_NETFILTER_SYNPROXY=m -CONFIG_NF_TABLES=m -CONFIG_NF_TABLES_INET=y -CONFIG_NF_TABLES_NETDEV=y -CONFIG_NFT_NUMGEN=m -CONFIG_NFT_CT=m -CONFIG_NFT_FLOW_OFFLOAD=m -CONFIG_NFT_COUNTER=m -CONFIG_NFT_CONNLIMIT=m -CONFIG_NFT_LOG=m -CONFIG_NFT_LIMIT=m -CONFIG_NFT_MASQ=m -CONFIG_NFT_REDIR=m -CONFIG_NFT_NAT=m -CONFIG_NFT_TUNNEL=m -CONFIG_NFT_OBJREF=m -CONFIG_NFT_QUEUE=m -CONFIG_NFT_QUOTA=m -CONFIG_NFT_REJECT=m -CONFIG_NFT_REJECT_INET=m -CONFIG_NFT_COMPAT=m -CONFIG_NFT_HASH=m -CONFIG_NFT_FIB=m -CONFIG_NFT_FIB_INET=m -CONFIG_NFT_XFRM=m -CONFIG_NFT_SOCKET=m -CONFIG_NFT_OSF=m -CONFIG_NFT_TPROXY=m -CONFIG_NFT_SYNPROXY=m -CONFIG_NF_DUP_NETDEV=m -CONFIG_NFT_DUP_NETDEV=m -CONFIG_NFT_FWD_NETDEV=m -CONFIG_NFT_FIB_NETDEV=m -CONFIG_NF_FLOW_TABLE_INET=m -CONFIG_NF_FLOW_TABLE=m -CONFIG_NETFILTER_XTABLES=m - -# -# Xtables combined modules -# -CONFIG_NETFILTER_XT_MARK=m -CONFIG_NETFILTER_XT_CONNMARK=m -CONFIG_NETFILTER_XT_SET=m - -# -# Xtables targets -# -CONFIG_NETFILTER_XT_TARGET_AUDIT=m -CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m -CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m -CONFIG_NETFILTER_XT_TARGET_CONNMARK=m -CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m -CONFIG_NETFILTER_XT_TARGET_CT=m -CONFIG_NETFILTER_XT_TARGET_DSCP=m -CONFIG_NETFILTER_XT_TARGET_HL=m -CONFIG_NETFILTER_XT_TARGET_HMARK=m -CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m -CONFIG_NETFILTER_XT_TARGET_LED=m -CONFIG_NETFILTER_XT_TARGET_LOG=m -CONFIG_NETFILTER_XT_TARGET_MARK=m -CONFIG_NETFILTER_XT_NAT=m -CONFIG_NETFILTER_XT_TARGET_NETMAP=m -CONFIG_NETFILTER_XT_TARGET_NFLOG=m -CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m -CONFIG_NETFILTER_XT_TARGET_NOTRACK=m -CONFIG_NETFILTER_XT_TARGET_RATEEST=m 
-CONFIG_NETFILTER_XT_TARGET_REDIRECT=m -CONFIG_NETFILTER_XT_TARGET_MASQUERADE=m -CONFIG_NETFILTER_XT_TARGET_TEE=m -CONFIG_NETFILTER_XT_TARGET_TPROXY=m -CONFIG_NETFILTER_XT_TARGET_TRACE=m -CONFIG_NETFILTER_XT_TARGET_SECMARK=m -CONFIG_NETFILTER_XT_TARGET_TCPMSS=m -CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m - -# -# Xtables matches -# -CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m -CONFIG_NETFILTER_XT_MATCH_BPF=m -CONFIG_NETFILTER_XT_MATCH_CGROUP=m -CONFIG_NETFILTER_XT_MATCH_CLUSTER=m -CONFIG_NETFILTER_XT_MATCH_COMMENT=m -CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m -CONFIG_NETFILTER_XT_MATCH_CONNLABEL=m -CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m -CONFIG_NETFILTER_XT_MATCH_CONNMARK=m -CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m -CONFIG_NETFILTER_XT_MATCH_CPU=m -CONFIG_NETFILTER_XT_MATCH_DCCP=m -CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m -CONFIG_NETFILTER_XT_MATCH_DSCP=m -CONFIG_NETFILTER_XT_MATCH_ECN=m -CONFIG_NETFILTER_XT_MATCH_ESP=m -CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m -CONFIG_NETFILTER_XT_MATCH_HELPER=m -CONFIG_NETFILTER_XT_MATCH_HL=m -CONFIG_NETFILTER_XT_MATCH_IPCOMP=m -CONFIG_NETFILTER_XT_MATCH_IPRANGE=m -CONFIG_NETFILTER_XT_MATCH_IPVS=m -CONFIG_NETFILTER_XT_MATCH_L2TP=m -CONFIG_NETFILTER_XT_MATCH_LENGTH=m -CONFIG_NETFILTER_XT_MATCH_LIMIT=m -CONFIG_NETFILTER_XT_MATCH_MAC=m -CONFIG_NETFILTER_XT_MATCH_MARK=m -CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m -CONFIG_NETFILTER_XT_MATCH_NFACCT=m -CONFIG_NETFILTER_XT_MATCH_OSF=m -CONFIG_NETFILTER_XT_MATCH_OWNER=m -CONFIG_NETFILTER_XT_MATCH_POLICY=m -CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m -CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m -CONFIG_NETFILTER_XT_MATCH_QUOTA=m -CONFIG_NETFILTER_XT_MATCH_RATEEST=m -CONFIG_NETFILTER_XT_MATCH_REALM=m -CONFIG_NETFILTER_XT_MATCH_RECENT=m -CONFIG_NETFILTER_XT_MATCH_SCTP=m -CONFIG_NETFILTER_XT_MATCH_SOCKET=m -CONFIG_NETFILTER_XT_MATCH_STATE=m -CONFIG_NETFILTER_XT_MATCH_STATISTIC=m -CONFIG_NETFILTER_XT_MATCH_STRING=m -CONFIG_NETFILTER_XT_MATCH_TCPMSS=m -CONFIG_NETFILTER_XT_MATCH_TIME=m -CONFIG_NETFILTER_XT_MATCH_U32=m -# end of Core 
Netfilter Configuration - -CONFIG_IP_SET=m -CONFIG_IP_SET_MAX=256 -CONFIG_IP_SET_BITMAP_IP=m -CONFIG_IP_SET_BITMAP_IPMAC=m -CONFIG_IP_SET_BITMAP_PORT=m -CONFIG_IP_SET_HASH_IP=m -CONFIG_IP_SET_HASH_IPMARK=m -CONFIG_IP_SET_HASH_IPPORT=m -CONFIG_IP_SET_HASH_IPPORTIP=m -CONFIG_IP_SET_HASH_IPPORTNET=m -CONFIG_IP_SET_HASH_IPMAC=m -CONFIG_IP_SET_HASH_MAC=m -CONFIG_IP_SET_HASH_NETPORTNET=m -CONFIG_IP_SET_HASH_NET=m -CONFIG_IP_SET_HASH_NETNET=m -CONFIG_IP_SET_HASH_NETPORT=m -CONFIG_IP_SET_HASH_NETIFACE=m -CONFIG_IP_SET_LIST_SET=m -CONFIG_IP_VS=m -CONFIG_IP_VS_IPV6=y -# CONFIG_IP_VS_DEBUG is not set -CONFIG_IP_VS_TAB_BITS=15 - -# -# IPVS transport protocol load balancing support -# -CONFIG_IP_VS_PROTO_TCP=y -CONFIG_IP_VS_PROTO_UDP=y -CONFIG_IP_VS_PROTO_AH_ESP=y -CONFIG_IP_VS_PROTO_ESP=y -CONFIG_IP_VS_PROTO_AH=y -CONFIG_IP_VS_PROTO_SCTP=y - -# -# IPVS scheduler -# -CONFIG_IP_VS_RR=m -CONFIG_IP_VS_WRR=m -CONFIG_IP_VS_LC=m -CONFIG_IP_VS_WLC=m -CONFIG_IP_VS_FO=m -CONFIG_IP_VS_OVF=m -CONFIG_IP_VS_LBLC=m -CONFIG_IP_VS_LBLCR=m -CONFIG_IP_VS_DH=m -CONFIG_IP_VS_SH=m -CONFIG_IP_VS_MH=m -CONFIG_IP_VS_SED=m -CONFIG_IP_VS_NQ=m - -# -# IPVS SH scheduler -# -CONFIG_IP_VS_SH_TAB_BITS=8 - -# -# IPVS MH scheduler -# -CONFIG_IP_VS_MH_TAB_INDEX=12 - -# -# IPVS application helper -# -CONFIG_IP_VS_FTP=m -CONFIG_IP_VS_NFCT=y -CONFIG_IP_VS_PE_SIP=m - -# -# IP: Netfilter Configuration -# -CONFIG_NF_DEFRAG_IPV4=m -CONFIG_NF_SOCKET_IPV4=m -CONFIG_NF_TPROXY_IPV4=m -CONFIG_NF_TABLES_IPV4=y -CONFIG_NFT_REJECT_IPV4=m -CONFIG_NFT_DUP_IPV4=m -CONFIG_NFT_FIB_IPV4=m -CONFIG_NF_TABLES_ARP=y -CONFIG_NF_FLOW_TABLE_IPV4=m -CONFIG_NF_DUP_IPV4=m -CONFIG_NF_LOG_ARP=m -CONFIG_NF_LOG_IPV4=m -CONFIG_NF_REJECT_IPV4=m -CONFIG_NF_NAT_SNMP_BASIC=m -CONFIG_NF_NAT_PPTP=m -CONFIG_NF_NAT_H323=m -CONFIG_IP_NF_IPTABLES=m -CONFIG_IP_NF_MATCH_AH=m -CONFIG_IP_NF_MATCH_ECN=m -CONFIG_IP_NF_MATCH_RPFILTER=m -CONFIG_IP_NF_MATCH_TTL=m -CONFIG_IP_NF_FILTER=m -CONFIG_IP_NF_TARGET_REJECT=m -CONFIG_IP_NF_TARGET_SYNPROXY=m 
-CONFIG_IP_NF_NAT=m -CONFIG_IP_NF_TARGET_MASQUERADE=m -CONFIG_IP_NF_TARGET_NETMAP=m -CONFIG_IP_NF_TARGET_REDIRECT=m -CONFIG_IP_NF_MANGLE=m -CONFIG_IP_NF_TARGET_CLUSTERIP=m -CONFIG_IP_NF_TARGET_ECN=m -CONFIG_IP_NF_TARGET_TTL=m -CONFIG_IP_NF_RAW=m -CONFIG_IP_NF_SECURITY=m -CONFIG_IP_NF_ARPTABLES=m -CONFIG_IP_NF_ARPFILTER=m -CONFIG_IP_NF_ARP_MANGLE=m -# end of IP: Netfilter Configuration - -# -# IPv6: Netfilter Configuration -# -CONFIG_NF_SOCKET_IPV6=m -CONFIG_NF_TPROXY_IPV6=m -CONFIG_NF_TABLES_IPV6=y -CONFIG_NFT_REJECT_IPV6=m -CONFIG_NFT_DUP_IPV6=m -CONFIG_NFT_FIB_IPV6=m -CONFIG_NF_FLOW_TABLE_IPV6=m -CONFIG_NF_DUP_IPV6=m -CONFIG_NF_REJECT_IPV6=m -CONFIG_NF_LOG_IPV6=m -CONFIG_IP6_NF_IPTABLES=m -CONFIG_IP6_NF_MATCH_AH=m -CONFIG_IP6_NF_MATCH_EUI64=m -CONFIG_IP6_NF_MATCH_FRAG=m -CONFIG_IP6_NF_MATCH_OPTS=m -CONFIG_IP6_NF_MATCH_HL=m -CONFIG_IP6_NF_MATCH_IPV6HEADER=m -CONFIG_IP6_NF_MATCH_MH=m -CONFIG_IP6_NF_MATCH_RPFILTER=m -CONFIG_IP6_NF_MATCH_RT=m -CONFIG_IP6_NF_MATCH_SRH=m -CONFIG_IP6_NF_TARGET_HL=m -CONFIG_IP6_NF_FILTER=m -CONFIG_IP6_NF_TARGET_REJECT=m -CONFIG_IP6_NF_TARGET_SYNPROXY=m -CONFIG_IP6_NF_MANGLE=m -CONFIG_IP6_NF_RAW=m -CONFIG_IP6_NF_SECURITY=m -CONFIG_IP6_NF_NAT=m -CONFIG_IP6_NF_TARGET_MASQUERADE=m -CONFIG_IP6_NF_TARGET_NPT=m -# end of IPv6: Netfilter Configuration - -CONFIG_NF_DEFRAG_IPV6=m -CONFIG_NF_TABLES_BRIDGE=m -CONFIG_NFT_BRIDGE_META=m -CONFIG_NFT_BRIDGE_REJECT=m -CONFIG_NF_LOG_BRIDGE=m -CONFIG_NF_CONNTRACK_BRIDGE=m -CONFIG_BRIDGE_NF_EBTABLES=m -CONFIG_BRIDGE_EBT_BROUTE=m -CONFIG_BRIDGE_EBT_T_FILTER=m -CONFIG_BRIDGE_EBT_T_NAT=m -CONFIG_BRIDGE_EBT_802_3=m -CONFIG_BRIDGE_EBT_AMONG=m -CONFIG_BRIDGE_EBT_ARP=m -CONFIG_BRIDGE_EBT_IP=m -CONFIG_BRIDGE_EBT_IP6=m -CONFIG_BRIDGE_EBT_LIMIT=m -CONFIG_BRIDGE_EBT_MARK=m -CONFIG_BRIDGE_EBT_PKTTYPE=m -CONFIG_BRIDGE_EBT_STP=m -CONFIG_BRIDGE_EBT_VLAN=m -CONFIG_BRIDGE_EBT_ARPREPLY=m -CONFIG_BRIDGE_EBT_DNAT=m -CONFIG_BRIDGE_EBT_MARK_T=m -CONFIG_BRIDGE_EBT_REDIRECT=m -CONFIG_BRIDGE_EBT_SNAT=m -CONFIG_BRIDGE_EBT_LOG=m 
-CONFIG_BRIDGE_EBT_NFLOG=m -# CONFIG_BPFILTER is not set -CONFIG_IP_DCCP=m -CONFIG_INET_DCCP_DIAG=m - -# -# DCCP CCIDs Configuration -# -# CONFIG_IP_DCCP_CCID2_DEBUG is not set -CONFIG_IP_DCCP_CCID3=y -# CONFIG_IP_DCCP_CCID3_DEBUG is not set -CONFIG_IP_DCCP_TFRC_LIB=y -# end of DCCP CCIDs Configuration - -# -# DCCP Kernel Hacking -# -# CONFIG_IP_DCCP_DEBUG is not set -# end of DCCP Kernel Hacking - -CONFIG_IP_SCTP=m -# CONFIG_SCTP_DBG_OBJCNT is not set -# CONFIG_SCTP_DEFAULT_COOKIE_HMAC_MD5 is not set -CONFIG_SCTP_DEFAULT_COOKIE_HMAC_SHA1=y -# CONFIG_SCTP_DEFAULT_COOKIE_HMAC_NONE is not set -CONFIG_SCTP_COOKIE_HMAC_MD5=y -CONFIG_SCTP_COOKIE_HMAC_SHA1=y -CONFIG_INET_SCTP_DIAG=m -CONFIG_RDS=m -CONFIG_RDS_RDMA=m -CONFIG_RDS_TCP=m -# CONFIG_RDS_DEBUG is not set -CONFIG_TIPC=m -CONFIG_TIPC_MEDIA_IB=y -CONFIG_TIPC_MEDIA_UDP=y -CONFIG_TIPC_CRYPTO=y -CONFIG_TIPC_DIAG=m -CONFIG_ATM=m -CONFIG_ATM_CLIP=m -# CONFIG_ATM_CLIP_NO_ICMP is not set -CONFIG_ATM_LANE=m -CONFIG_ATM_MPOA=m -CONFIG_ATM_BR2684=m -# CONFIG_ATM_BR2684_IPFILTER is not set -CONFIG_L2TP=m -# CONFIG_L2TP_DEBUGFS is not set -CONFIG_L2TP_V3=y -CONFIG_L2TP_IP=m -CONFIG_L2TP_ETH=m -CONFIG_STP=m -CONFIG_GARP=m -CONFIG_MRP=m -CONFIG_BRIDGE=m -CONFIG_BRIDGE_IGMP_SNOOPING=y -CONFIG_BRIDGE_VLAN_FILTERING=y -CONFIG_HAVE_NET_DSA=y -CONFIG_NET_DSA=m -CONFIG_NET_DSA_TAG_8021Q=m -CONFIG_NET_DSA_TAG_AR9331=m -CONFIG_NET_DSA_TAG_BRCM_COMMON=m -CONFIG_NET_DSA_TAG_BRCM=m -CONFIG_NET_DSA_TAG_BRCM_PREPEND=m -CONFIG_NET_DSA_TAG_GSWIP=m -CONFIG_NET_DSA_TAG_DSA=m -CONFIG_NET_DSA_TAG_EDSA=m -CONFIG_NET_DSA_TAG_MTK=m -CONFIG_NET_DSA_TAG_KSZ=m -CONFIG_NET_DSA_TAG_OCELOT=m -CONFIG_NET_DSA_TAG_QCA=m -CONFIG_NET_DSA_TAG_LAN9303=m -CONFIG_NET_DSA_TAG_SJA1105=m -CONFIG_NET_DSA_TAG_TRAILER=m -CONFIG_VLAN_8021Q=m -CONFIG_VLAN_8021Q_GVRP=y -CONFIG_VLAN_8021Q_MVRP=y -# CONFIG_DECNET is not set -CONFIG_LLC=m -CONFIG_LLC2=m -CONFIG_ATALK=m -CONFIG_DEV_APPLETALK=m -CONFIG_IPDDP=m -CONFIG_IPDDP_ENCAP=y -# CONFIG_X25 is not set -# CONFIG_LAPB is not 
set -CONFIG_PHONET=m -CONFIG_6LOWPAN=m -# CONFIG_6LOWPAN_DEBUGFS is not set -CONFIG_6LOWPAN_NHC=m -CONFIG_6LOWPAN_NHC_DEST=m -CONFIG_6LOWPAN_NHC_FRAGMENT=m -CONFIG_6LOWPAN_NHC_HOP=m -CONFIG_6LOWPAN_NHC_IPV6=m -CONFIG_6LOWPAN_NHC_MOBILITY=m -CONFIG_6LOWPAN_NHC_ROUTING=m -CONFIG_6LOWPAN_NHC_UDP=m -CONFIG_6LOWPAN_GHC_EXT_HDR_HOP=m -CONFIG_6LOWPAN_GHC_UDP=m -CONFIG_6LOWPAN_GHC_ICMPV6=m -CONFIG_6LOWPAN_GHC_EXT_HDR_DEST=m -CONFIG_6LOWPAN_GHC_EXT_HDR_FRAG=m -CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE=m -CONFIG_IEEE802154=m -CONFIG_IEEE802154_NL802154_EXPERIMENTAL=y -CONFIG_IEEE802154_SOCKET=m -CONFIG_IEEE802154_6LOWPAN=m -CONFIG_MAC802154=m -CONFIG_NET_SCHED=y - -# -# Queueing/Scheduling -# -CONFIG_NET_SCH_CBQ=m -CONFIG_NET_SCH_HTB=m -CONFIG_NET_SCH_HFSC=m -CONFIG_NET_SCH_ATM=m -CONFIG_NET_SCH_PRIO=m -CONFIG_NET_SCH_MULTIQ=m -CONFIG_NET_SCH_RED=m -CONFIG_NET_SCH_SFB=m -CONFIG_NET_SCH_SFQ=m -CONFIG_NET_SCH_TEQL=m -CONFIG_NET_SCH_TBF=m -CONFIG_NET_SCH_CBS=m -CONFIG_NET_SCH_ETF=m -CONFIG_NET_SCH_TAPRIO=m -CONFIG_NET_SCH_GRED=m -CONFIG_NET_SCH_DSMARK=m -CONFIG_NET_SCH_NETEM=m -CONFIG_NET_SCH_DRR=m -CONFIG_NET_SCH_MQPRIO=m -CONFIG_NET_SCH_SKBPRIO=m -CONFIG_NET_SCH_CHOKE=m -CONFIG_NET_SCH_QFQ=m -CONFIG_NET_SCH_CODEL=m -CONFIG_NET_SCH_FQ_CODEL=y -CONFIG_NET_SCH_CAKE=m -CONFIG_NET_SCH_FQ=m -CONFIG_NET_SCH_HHF=m -CONFIG_NET_SCH_PIE=m -CONFIG_NET_SCH_FQ_PIE=m -CONFIG_NET_SCH_INGRESS=m -CONFIG_NET_SCH_PLUG=m -CONFIG_NET_SCH_ETS=m -CONFIG_NET_SCH_DEFAULT=y -# CONFIG_DEFAULT_FQ is not set -# CONFIG_DEFAULT_CODEL is not set -CONFIG_DEFAULT_FQ_CODEL=y -# CONFIG_DEFAULT_SFQ is not set -# CONFIG_DEFAULT_PFIFO_FAST is not set -CONFIG_DEFAULT_NET_SCH="fq_codel" - -# -# Classification -# -CONFIG_NET_CLS=y -CONFIG_NET_CLS_BASIC=m -CONFIG_NET_CLS_TCINDEX=m -CONFIG_NET_CLS_ROUTE4=m -CONFIG_NET_CLS_FW=m -CONFIG_NET_CLS_U32=m -CONFIG_CLS_U32_PERF=y -CONFIG_CLS_U32_MARK=y -CONFIG_NET_CLS_RSVP=m -CONFIG_NET_CLS_RSVP6=m -CONFIG_NET_CLS_FLOW=m -CONFIG_NET_CLS_CGROUP=m -CONFIG_NET_CLS_BPF=m 
-CONFIG_NET_CLS_FLOWER=m -CONFIG_NET_CLS_MATCHALL=m -CONFIG_NET_EMATCH=y -CONFIG_NET_EMATCH_STACK=32 -CONFIG_NET_EMATCH_CMP=m -CONFIG_NET_EMATCH_NBYTE=m -CONFIG_NET_EMATCH_U32=m -CONFIG_NET_EMATCH_META=m -CONFIG_NET_EMATCH_TEXT=m -CONFIG_NET_EMATCH_CANID=m -CONFIG_NET_EMATCH_IPSET=m -CONFIG_NET_EMATCH_IPT=m -CONFIG_NET_CLS_ACT=y -CONFIG_NET_ACT_POLICE=m -CONFIG_NET_ACT_GACT=m -CONFIG_GACT_PROB=y -CONFIG_NET_ACT_MIRRED=m -CONFIG_NET_ACT_SAMPLE=m -CONFIG_NET_ACT_IPT=m -CONFIG_NET_ACT_NAT=m -CONFIG_NET_ACT_PEDIT=m -CONFIG_NET_ACT_SIMP=m -CONFIG_NET_ACT_SKBEDIT=m -CONFIG_NET_ACT_CSUM=m -CONFIG_NET_ACT_MPLS=m -CONFIG_NET_ACT_VLAN=m -CONFIG_NET_ACT_BPF=m -CONFIG_NET_ACT_CONNMARK=m -CONFIG_NET_ACT_CTINFO=m -CONFIG_NET_ACT_SKBMOD=m -CONFIG_NET_ACT_IFE=m -CONFIG_NET_ACT_TUNNEL_KEY=m -CONFIG_NET_ACT_CT=m -CONFIG_NET_IFE_SKBMARK=m -CONFIG_NET_IFE_SKBPRIO=m -CONFIG_NET_IFE_SKBTCINDEX=m -CONFIG_NET_TC_SKB_EXT=y -CONFIG_NET_SCH_FIFO=y -CONFIG_DCB=y -CONFIG_DNS_RESOLVER=m -CONFIG_BATMAN_ADV=m -CONFIG_BATMAN_ADV_BATMAN_V=y -CONFIG_BATMAN_ADV_BLA=y -CONFIG_BATMAN_ADV_DAT=y -CONFIG_BATMAN_ADV_NC=y -CONFIG_BATMAN_ADV_MCAST=y -CONFIG_BATMAN_ADV_DEBUGFS=y -# CONFIG_BATMAN_ADV_DEBUG is not set -CONFIG_BATMAN_ADV_SYSFS=y -# CONFIG_BATMAN_ADV_TRACING is not set -CONFIG_OPENVSWITCH=m -CONFIG_OPENVSWITCH_GRE=m -CONFIG_OPENVSWITCH_VXLAN=m -CONFIG_OPENVSWITCH_GENEVE=m -CONFIG_VSOCKETS=m -CONFIG_VSOCKETS_DIAG=m -CONFIG_VSOCKETS_LOOPBACK=m -CONFIG_VMWARE_VMCI_VSOCKETS=m -CONFIG_VIRTIO_VSOCKETS=m -CONFIG_VIRTIO_VSOCKETS_COMMON=m -CONFIG_HYPERV_VSOCKETS=m -CONFIG_NETLINK_DIAG=m -CONFIG_MPLS=y -CONFIG_NET_MPLS_GSO=m -CONFIG_MPLS_ROUTING=m -CONFIG_MPLS_IPTUNNEL=m -CONFIG_NET_NSH=m -CONFIG_HSR=m -CONFIG_NET_SWITCHDEV=y -CONFIG_NET_L3_MASTER_DEV=y -CONFIG_NET_NCSI=y -CONFIG_NCSI_OEM_CMD_GET_MAC=y -CONFIG_RPS=y -CONFIG_RFS_ACCEL=y -CONFIG_XPS=y -CONFIG_CGROUP_NET_PRIO=y -CONFIG_CGROUP_NET_CLASSID=y -CONFIG_NET_RX_BUSY_POLL=y -CONFIG_BQL=y -CONFIG_BPF_JIT=y -CONFIG_BPF_STREAM_PARSER=y 
-CONFIG_NET_FLOW_LIMIT=y - -# -# Network testing -# -CONFIG_NET_PKTGEN=m -CONFIG_NET_DROP_MONITOR=y -# end of Network testing -# end of Networking options - -CONFIG_HAMRADIO=y - -# -# Packet Radio protocols -# -CONFIG_AX25=m -CONFIG_AX25_DAMA_SLAVE=y -CONFIG_NETROM=m -CONFIG_ROSE=m - -# -# AX.25 network device drivers -# -CONFIG_MKISS=m -CONFIG_6PACK=m -CONFIG_BPQETHER=m -CONFIG_BAYCOM_SER_FDX=m -CONFIG_BAYCOM_SER_HDX=m -CONFIG_BAYCOM_PAR=m -CONFIG_YAM=m -# end of AX.25 network device drivers - -CONFIG_CAN=m -CONFIG_CAN_RAW=m -CONFIG_CAN_BCM=m -CONFIG_CAN_GW=m -CONFIG_CAN_J1939=m - -# -# CAN Device Drivers -# -CONFIG_CAN_VCAN=m -CONFIG_CAN_VXCAN=m -CONFIG_CAN_SLCAN=m -CONFIG_CAN_DEV=m -CONFIG_CAN_CALC_BITTIMING=y -CONFIG_CAN_FLEXCAN=m -CONFIG_CAN_GRCAN=m -CONFIG_CAN_JANZ_ICAN3=m -CONFIG_CAN_KVASER_PCIEFD=m -CONFIG_CAN_C_CAN=m -CONFIG_CAN_C_CAN_PLATFORM=m -CONFIG_CAN_C_CAN_PCI=m -CONFIG_CAN_CC770=m -# CONFIG_CAN_CC770_ISA is not set -CONFIG_CAN_CC770_PLATFORM=m -CONFIG_CAN_IFI_CANFD=m -CONFIG_CAN_M_CAN=m -CONFIG_CAN_M_CAN_PLATFORM=m -CONFIG_CAN_M_CAN_TCAN4X5X=m -CONFIG_CAN_PEAK_PCIEFD=m -CONFIG_CAN_SJA1000=m -CONFIG_CAN_EMS_PCI=m -# CONFIG_CAN_EMS_PCMCIA is not set -CONFIG_CAN_F81601=m -CONFIG_CAN_KVASER_PCI=m -CONFIG_CAN_PEAK_PCI=m -CONFIG_CAN_PEAK_PCIEC=y -CONFIG_CAN_PEAK_PCMCIA=m -CONFIG_CAN_PLX_PCI=m -# CONFIG_CAN_SJA1000_ISA is not set -CONFIG_CAN_SJA1000_PLATFORM=m -CONFIG_CAN_SOFTING=m -CONFIG_CAN_SOFTING_CS=m - -# -# CAN SPI interfaces -# -CONFIG_CAN_HI311X=m -CONFIG_CAN_MCP251X=m -# end of CAN SPI interfaces - -# -# CAN USB interfaces -# -CONFIG_CAN_8DEV_USB=m -CONFIG_CAN_EMS_USB=m -CONFIG_CAN_ESD_USB2=m -CONFIG_CAN_GS_USB=m -CONFIG_CAN_KVASER_USB=m -CONFIG_CAN_MCBA_USB=m -CONFIG_CAN_PEAK_USB=m -CONFIG_CAN_UCAN=m -# end of CAN USB interfaces - -# CONFIG_CAN_DEBUG_DEVICES is not set -# end of CAN Device Drivers - -CONFIG_BT=m -CONFIG_BT_BREDR=y -CONFIG_BT_RFCOMM=m -CONFIG_BT_RFCOMM_TTY=y -CONFIG_BT_BNEP=m -CONFIG_BT_BNEP_MC_FILTER=y 
-CONFIG_BT_BNEP_PROTO_FILTER=y -CONFIG_BT_CMTP=m -CONFIG_BT_HIDP=m -CONFIG_BT_HS=y -CONFIG_BT_LE=y -CONFIG_BT_6LOWPAN=m -CONFIG_BT_LEDS=y -# CONFIG_BT_SELFTEST is not set -CONFIG_BT_DEBUGFS=y - -# -# Bluetooth device drivers -# -CONFIG_BT_INTEL=m -CONFIG_BT_BCM=m -CONFIG_BT_RTL=m -CONFIG_BT_QCA=m -CONFIG_BT_HCIBTUSB=m -CONFIG_BT_HCIBTUSB_AUTOSUSPEND=y -CONFIG_BT_HCIBTUSB_BCM=y -CONFIG_BT_HCIBTUSB_MTK=y -CONFIG_BT_HCIBTUSB_RTL=y -CONFIG_BT_HCIBTSDIO=m -CONFIG_BT_HCIUART=m -CONFIG_BT_HCIUART_SERDEV=y -CONFIG_BT_HCIUART_H4=y -CONFIG_BT_HCIUART_NOKIA=m -CONFIG_BT_HCIUART_BCSP=y -CONFIG_BT_HCIUART_ATH3K=y -CONFIG_BT_HCIUART_LL=y -CONFIG_BT_HCIUART_3WIRE=y -CONFIG_BT_HCIUART_INTEL=y -CONFIG_BT_HCIUART_BCM=y -CONFIG_BT_HCIUART_RTL=y -CONFIG_BT_HCIUART_QCA=y -CONFIG_BT_HCIUART_AG6XX=y -CONFIG_BT_HCIUART_MRVL=y -CONFIG_BT_HCIBCM203X=m -CONFIG_BT_HCIBPA10X=m -CONFIG_BT_HCIBFUSB=m -CONFIG_BT_HCIDTL1=m -CONFIG_BT_HCIBT3C=m -CONFIG_BT_HCIBLUECARD=m -CONFIG_BT_HCIVHCI=m -CONFIG_BT_MRVL=m -CONFIG_BT_MRVL_SDIO=m -CONFIG_BT_ATH3K=m -CONFIG_BT_MTKSDIO=m -CONFIG_BT_MTKUART=m -CONFIG_BT_HCIRSI=m -# end of Bluetooth device drivers - -CONFIG_AF_RXRPC=m -CONFIG_AF_RXRPC_IPV6=y -# CONFIG_AF_RXRPC_INJECT_LOSS is not set -CONFIG_AF_RXRPC_DEBUG=y -CONFIG_RXKAD=y -CONFIG_AF_KCM=m -CONFIG_STREAM_PARSER=y -CONFIG_FIB_RULES=y -CONFIG_WIRELESS=y -CONFIG_WIRELESS_EXT=y -CONFIG_WEXT_CORE=y -CONFIG_WEXT_PROC=y -CONFIG_WEXT_SPY=y -CONFIG_WEXT_PRIV=y -CONFIG_CFG80211=m -# CONFIG_NL80211_TESTMODE is not set -# CONFIG_CFG80211_DEVELOPER_WARNINGS is not set -# CONFIG_CFG80211_CERTIFICATION_ONUS is not set -CONFIG_CFG80211_REQUIRE_SIGNED_REGDB=y -CONFIG_CFG80211_USE_KERNEL_REGDB_KEYS=y -CONFIG_CFG80211_DEFAULT_PS=y -CONFIG_CFG80211_DEBUGFS=y -CONFIG_CFG80211_CRDA_SUPPORT=y -CONFIG_CFG80211_WEXT=y -CONFIG_CFG80211_WEXT_EXPORT=y -CONFIG_LIB80211=m -CONFIG_LIB80211_CRYPT_WEP=m -CONFIG_LIB80211_CRYPT_CCMP=m -CONFIG_LIB80211_CRYPT_TKIP=m -# CONFIG_LIB80211_DEBUG is not set -CONFIG_MAC80211=m 
-CONFIG_MAC80211_HAS_RC=y -CONFIG_MAC80211_RC_MINSTREL=y -CONFIG_MAC80211_RC_DEFAULT_MINSTREL=y -CONFIG_MAC80211_RC_DEFAULT="minstrel_ht" -CONFIG_MAC80211_MESH=y -CONFIG_MAC80211_LEDS=y -CONFIG_MAC80211_DEBUGFS=y -# CONFIG_MAC80211_MESSAGE_TRACING is not set -# CONFIG_MAC80211_DEBUG_MENU is not set -CONFIG_MAC80211_STA_HASH_MAX_SIZE=0 -CONFIG_WIMAX=m -CONFIG_WIMAX_DEBUG_LEVEL=8 -CONFIG_RFKILL=m -CONFIG_RFKILL_LEDS=y -CONFIG_RFKILL_INPUT=y -CONFIG_RFKILL_GPIO=m -CONFIG_NET_9P=m -CONFIG_NET_9P_VIRTIO=m -CONFIG_NET_9P_XEN=m -CONFIG_NET_9P_RDMA=m -# CONFIG_NET_9P_DEBUG is not set -CONFIG_CAIF=m -# CONFIG_CAIF_DEBUG is not set -CONFIG_CAIF_NETDEV=m -CONFIG_CAIF_USB=m -CONFIG_CEPH_LIB=m -CONFIG_CEPH_LIB_PRETTYDEBUG=y -CONFIG_CEPH_LIB_USE_DNS_RESOLVER=y -CONFIG_NFC=m -CONFIG_NFC_DIGITAL=m -CONFIG_NFC_NCI=m -CONFIG_NFC_NCI_SPI=m -CONFIG_NFC_NCI_UART=m -CONFIG_NFC_HCI=m -CONFIG_NFC_SHDLC=y - -# -# Near Field Communication (NFC) devices -# -CONFIG_NFC_TRF7970A=m -CONFIG_NFC_MEI_PHY=m -CONFIG_NFC_SIM=m -CONFIG_NFC_PORT100=m -CONFIG_NFC_FDP=m -CONFIG_NFC_FDP_I2C=m -CONFIG_NFC_PN544=m -CONFIG_NFC_PN544_I2C=m -CONFIG_NFC_PN544_MEI=m -CONFIG_NFC_PN533=m -CONFIG_NFC_PN533_USB=m -CONFIG_NFC_PN533_I2C=m -CONFIG_NFC_PN532_UART=m -CONFIG_NFC_MICROREAD=m -CONFIG_NFC_MICROREAD_I2C=m -CONFIG_NFC_MICROREAD_MEI=m -CONFIG_NFC_MRVL=m -CONFIG_NFC_MRVL_USB=m -CONFIG_NFC_MRVL_UART=m -CONFIG_NFC_MRVL_I2C=m -CONFIG_NFC_MRVL_SPI=m -CONFIG_NFC_ST21NFCA=m -CONFIG_NFC_ST21NFCA_I2C=m -CONFIG_NFC_ST_NCI=m -CONFIG_NFC_ST_NCI_I2C=m -CONFIG_NFC_ST_NCI_SPI=m -CONFIG_NFC_NXP_NCI=m -CONFIG_NFC_NXP_NCI_I2C=m -CONFIG_NFC_S3FWRN5=m -CONFIG_NFC_S3FWRN5_I2C=m -CONFIG_NFC_ST95HF=m -# end of Near Field Communication (NFC) devices - -CONFIG_PSAMPLE=m -CONFIG_NET_IFE=m -CONFIG_LWTUNNEL=y -CONFIG_LWTUNNEL_BPF=y -CONFIG_DST_CACHE=y -CONFIG_GRO_CELLS=y -CONFIG_SOCK_VALIDATE_XMIT=y -CONFIG_NET_SOCK_MSG=y -CONFIG_NET_DEVLINK=y -CONFIG_PAGE_POOL=y -CONFIG_FAILOVER=m -CONFIG_ETHTOOL_NETLINK=y -CONFIG_HAVE_EBPF_JIT=y - -# -# 
Device Drivers -# -CONFIG_HAVE_EISA=y -# CONFIG_EISA is not set -CONFIG_HAVE_PCI=y -CONFIG_PCI=y -CONFIG_PCI_DOMAINS=y -CONFIG_PCIEPORTBUS=y -CONFIG_HOTPLUG_PCI_PCIE=y -CONFIG_PCIEAER=y -# CONFIG_PCIEAER_INJECT is not set -CONFIG_PCIE_ECRC=y -CONFIG_PCIEASPM=y -CONFIG_PCIEASPM_DEFAULT=y -# CONFIG_PCIEASPM_POWERSAVE is not set -# CONFIG_PCIEASPM_POWER_SUPERSAVE is not set -# CONFIG_PCIEASPM_PERFORMANCE is not set -CONFIG_PCIE_PME=y -CONFIG_PCIE_DPC=y -CONFIG_PCIE_PTM=y -# CONFIG_PCIE_BW is not set -CONFIG_PCIE_EDR=y -CONFIG_PCI_MSI=y -CONFIG_PCI_MSI_IRQ_DOMAIN=y -CONFIG_PCI_QUIRKS=y -# CONFIG_PCI_DEBUG is not set -CONFIG_PCI_REALLOC_ENABLE_AUTO=y -CONFIG_PCI_STUB=y -CONFIG_PCI_PF_STUB=m -CONFIG_XEN_PCIDEV_FRONTEND=m -CONFIG_PCI_ATS=y -CONFIG_PCI_ECAM=y -CONFIG_PCI_LOCKLESS_CONFIG=y -CONFIG_PCI_IOV=y -CONFIG_PCI_PRI=y -CONFIG_PCI_PASID=y -CONFIG_PCI_P2PDMA=y -CONFIG_PCI_LABEL=y -CONFIG_PCI_HYPERV=m -CONFIG_HOTPLUG_PCI=y -CONFIG_HOTPLUG_PCI_ACPI=y -CONFIG_HOTPLUG_PCI_ACPI_IBM=m -CONFIG_HOTPLUG_PCI_CPCI=y -CONFIG_HOTPLUG_PCI_CPCI_ZT5550=m -CONFIG_HOTPLUG_PCI_CPCI_GENERIC=m -CONFIG_HOTPLUG_PCI_SHPC=y - -# -# PCI controller drivers -# -CONFIG_PCI_FTPCI100=y -CONFIG_PCI_HOST_COMMON=y -CONFIG_PCI_HOST_GENERIC=y -CONFIG_PCIE_XILINX=y -CONFIG_VMD=m -CONFIG_PCI_HYPERV_INTERFACE=m - -# -# DesignWare PCI Core Support -# -CONFIG_PCIE_DW=y -CONFIG_PCIE_DW_HOST=y -CONFIG_PCIE_DW_EP=y -CONFIG_PCIE_DW_PLAT=y -CONFIG_PCIE_DW_PLAT_HOST=y -CONFIG_PCIE_DW_PLAT_EP=y -CONFIG_PCIE_INTEL_GW=y -CONFIG_PCI_MESON=y -# end of DesignWare PCI Core Support - -# -# Mobiveil PCIe Core Support -# -# end of Mobiveil PCIe Core Support - -# -# Cadence PCIe controllers support -# -CONFIG_PCIE_CADENCE=y -CONFIG_PCIE_CADENCE_HOST=y -CONFIG_PCIE_CADENCE_EP=y -CONFIG_PCIE_CADENCE_PLAT=y -CONFIG_PCIE_CADENCE_PLAT_HOST=y -CONFIG_PCIE_CADENCE_PLAT_EP=y -# end of Cadence PCIe controllers support -# end of PCI controller drivers - -# -# PCI Endpoint -# -CONFIG_PCI_ENDPOINT=y -CONFIG_PCI_ENDPOINT_CONFIGFS=y -# 
CONFIG_PCI_EPF_TEST is not set -# end of PCI Endpoint - -# -# PCI switch controller drivers -# -CONFIG_PCI_SW_SWITCHTEC=m -# end of PCI switch controller drivers - -CONFIG_PCCARD=m -CONFIG_PCMCIA=m -CONFIG_PCMCIA_LOAD_CIS=y -CONFIG_CARDBUS=y - -# -# PC-card bridges -# -CONFIG_YENTA=m -CONFIG_YENTA_O2=y -CONFIG_YENTA_RICOH=y -CONFIG_YENTA_TI=y -CONFIG_YENTA_ENE_TUNE=y -CONFIG_YENTA_TOSHIBA=y -CONFIG_PD6729=m -CONFIG_I82092=m -CONFIG_PCCARD_NONSTATIC=y -CONFIG_RAPIDIO=m -CONFIG_RAPIDIO_TSI721=m -CONFIG_RAPIDIO_DISC_TIMEOUT=30 -CONFIG_RAPIDIO_ENABLE_RX_TX_PORTS=y -CONFIG_RAPIDIO_DMA_ENGINE=y -# CONFIG_RAPIDIO_DEBUG is not set -CONFIG_RAPIDIO_ENUM_BASIC=m -CONFIG_RAPIDIO_CHMAN=m -CONFIG_RAPIDIO_MPORT_CDEV=m - -# -# RapidIO Switch drivers -# -CONFIG_RAPIDIO_TSI57X=m -CONFIG_RAPIDIO_CPS_XX=m -CONFIG_RAPIDIO_TSI568=m -CONFIG_RAPIDIO_CPS_GEN2=m -CONFIG_RAPIDIO_RXS_GEN3=m -# end of RapidIO Switch drivers - -# -# Generic Driver Options -# -# CONFIG_UEVENT_HELPER is not set -CONFIG_DEVTMPFS=y -CONFIG_DEVTMPFS_MOUNT=y -CONFIG_STANDALONE=y -CONFIG_PREVENT_FIRMWARE_BUILD=y - -# -# Firmware loader -# -CONFIG_FW_LOADER=y -CONFIG_FW_LOADER_PAGED_BUF=y -CONFIG_EXTRA_FIRMWARE="" -# CONFIG_FW_LOADER_USER_HELPER is not set -CONFIG_FW_LOADER_COMPRESS=y -CONFIG_FW_CACHE=y -# end of Firmware loader - -CONFIG_WANT_DEV_COREDUMP=y -CONFIG_ALLOW_DEV_COREDUMP=y -CONFIG_DEV_COREDUMP=y -# CONFIG_DEBUG_DRIVER is not set -# CONFIG_DEBUG_DEVRES is not set -# CONFIG_DEBUG_TEST_DRIVER_REMOVE is not set -CONFIG_HMEM_REPORTING=y -# CONFIG_TEST_ASYNC_DRIVER_PROBE is not set -CONFIG_SYS_HYPERVISOR=y -CONFIG_GENERIC_CPU_AUTOPROBE=y -CONFIG_GENERIC_CPU_VULNERABILITIES=y -CONFIG_REGMAP=y -CONFIG_REGMAP_I2C=y -CONFIG_REGMAP_SLIMBUS=m -CONFIG_REGMAP_SPI=y -CONFIG_REGMAP_SPMI=m -CONFIG_REGMAP_W1=m -CONFIG_REGMAP_MMIO=y -CONFIG_REGMAP_IRQ=y -CONFIG_REGMAP_SOUNDWIRE=m -CONFIG_REGMAP_SCCB=m -CONFIG_REGMAP_I3C=m -CONFIG_DMA_SHARED_BUFFER=y -# CONFIG_DMA_FENCE_TRACE is not set -# end of Generic Driver Options - -# 
-# Bus devices -# -CONFIG_MOXTET=m -CONFIG_SIMPLE_PM_BUS=y -CONFIG_MHI_BUS=m -# end of Bus devices - -CONFIG_CONNECTOR=y -CONFIG_PROC_EVENTS=y -CONFIG_GNSS=m -CONFIG_GNSS_SERIAL=m -CONFIG_GNSS_MTK_SERIAL=m -CONFIG_GNSS_SIRF_SERIAL=m -CONFIG_GNSS_UBX_SERIAL=m -CONFIG_MTD=m -CONFIG_MTD_TESTS=m - -# -# Partition parsers -# -CONFIG_MTD_AR7_PARTS=m -CONFIG_MTD_CMDLINE_PARTS=m -CONFIG_MTD_OF_PARTS=m -CONFIG_MTD_REDBOOT_PARTS=m -CONFIG_MTD_REDBOOT_DIRECTORY_BLOCK=-1 -# CONFIG_MTD_REDBOOT_PARTS_UNALLOCATED is not set -# CONFIG_MTD_REDBOOT_PARTS_READONLY is not set -# end of Partition parsers - -# -# User Modules And Translation Layers -# -CONFIG_MTD_BLKDEVS=m -CONFIG_MTD_BLOCK=m -CONFIG_MTD_BLOCK_RO=m -CONFIG_FTL=m -CONFIG_NFTL=m -CONFIG_NFTL_RW=y -CONFIG_INFTL=m -CONFIG_RFD_FTL=m -CONFIG_SSFDC=m -CONFIG_SM_FTL=m -CONFIG_MTD_OOPS=m -CONFIG_MTD_SWAP=m -CONFIG_MTD_PARTITIONED_MASTER=y - -# -# RAM/ROM/Flash chip drivers -# -CONFIG_MTD_CFI=m -CONFIG_MTD_JEDECPROBE=m -CONFIG_MTD_GEN_PROBE=m -# CONFIG_MTD_CFI_ADV_OPTIONS is not set -CONFIG_MTD_MAP_BANK_WIDTH_1=y -CONFIG_MTD_MAP_BANK_WIDTH_2=y -CONFIG_MTD_MAP_BANK_WIDTH_4=y -CONFIG_MTD_CFI_I1=y -CONFIG_MTD_CFI_I2=y -CONFIG_MTD_CFI_INTELEXT=m -CONFIG_MTD_CFI_AMDSTD=m -CONFIG_MTD_CFI_STAA=m -CONFIG_MTD_CFI_UTIL=m -CONFIG_MTD_RAM=m -CONFIG_MTD_ROM=m -CONFIG_MTD_ABSENT=m -# end of RAM/ROM/Flash chip drivers - -# -# Mapping drivers for chip access -# -CONFIG_MTD_COMPLEX_MAPPINGS=y -CONFIG_MTD_PHYSMAP=m -# CONFIG_MTD_PHYSMAP_COMPAT is not set -CONFIG_MTD_PHYSMAP_OF=y -CONFIG_MTD_PHYSMAP_VERSATILE=y -CONFIG_MTD_PHYSMAP_GEMINI=y -CONFIG_MTD_PHYSMAP_GPIO_ADDR=y -CONFIG_MTD_SBC_GXX=m -CONFIG_MTD_AMD76XROM=m -CONFIG_MTD_ICHXROM=m -CONFIG_MTD_ESB2ROM=m -CONFIG_MTD_CK804XROM=m -CONFIG_MTD_SCB2_FLASH=m -CONFIG_MTD_NETtel=m -CONFIG_MTD_L440GX=m -CONFIG_MTD_PCI=m -CONFIG_MTD_PCMCIA=m -# CONFIG_MTD_PCMCIA_ANONYMOUS is not set -CONFIG_MTD_INTEL_VR_NOR=m -CONFIG_MTD_PLATRAM=m -# end of Mapping drivers for chip access - -# -# Self-contained MTD 
device drivers -# -CONFIG_MTD_PMC551=m -# CONFIG_MTD_PMC551_BUGFIX is not set -# CONFIG_MTD_PMC551_DEBUG is not set -CONFIG_MTD_DATAFLASH=m -# CONFIG_MTD_DATAFLASH_WRITE_VERIFY is not set -CONFIG_MTD_DATAFLASH_OTP=y -CONFIG_MTD_MCHP23K256=m -CONFIG_MTD_SST25L=m -CONFIG_MTD_SLRAM=m -CONFIG_MTD_PHRAM=m -CONFIG_MTD_MTDRAM=m -CONFIG_MTDRAM_TOTAL_SIZE=4096 -CONFIG_MTDRAM_ERASE_SIZE=128 -CONFIG_MTD_BLOCK2MTD=m - -# -# Disk-On-Chip Device Drivers -# -CONFIG_MTD_DOCG3=m -CONFIG_BCH_CONST_M=14 -CONFIG_BCH_CONST_T=4 -# end of Self-contained MTD device drivers - -CONFIG_MTD_NAND_CORE=m -CONFIG_MTD_ONENAND=m -# CONFIG_MTD_ONENAND_VERIFY_WRITE is not set -CONFIG_MTD_ONENAND_GENERIC=m -CONFIG_MTD_ONENAND_OTP=y -CONFIG_MTD_ONENAND_2X_PROGRAM=y -CONFIG_MTD_NAND_ECC_SW_HAMMING=m -CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC=y -CONFIG_MTD_RAW_NAND=m -CONFIG_MTD_NAND_ECC_SW_BCH=y - -# -# Raw/parallel NAND flash controllers -# -CONFIG_MTD_NAND_DENALI=m -CONFIG_MTD_NAND_DENALI_PCI=m -CONFIG_MTD_NAND_DENALI_DT=m -CONFIG_MTD_NAND_CAFE=m -CONFIG_MTD_NAND_MXIC=m -CONFIG_MTD_NAND_GPIO=m -CONFIG_MTD_NAND_PLATFORM=m -CONFIG_MTD_NAND_CADENCE=m - -# -# Misc -# -CONFIG_MTD_SM_COMMON=m -CONFIG_MTD_NAND_NANDSIM=m -CONFIG_MTD_NAND_RICOH=m -CONFIG_MTD_NAND_DISKONCHIP=m -# CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADVANCED is not set -CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADDRESS=0 -CONFIG_MTD_NAND_DISKONCHIP_BBTWRITE=y -CONFIG_MTD_SPI_NAND=m - -# -# LPDDR & LPDDR2 PCM memory drivers -# -CONFIG_MTD_LPDDR=m -CONFIG_MTD_QINFO_PROBE=m -# end of LPDDR & LPDDR2 PCM memory drivers - -CONFIG_MTD_SPI_NOR=m -CONFIG_MTD_SPI_NOR_USE_4K_SECTORS=y -CONFIG_SPI_INTEL_SPI=m -CONFIG_SPI_INTEL_SPI_PCI=m -CONFIG_SPI_INTEL_SPI_PLATFORM=m -CONFIG_MTD_UBI=m -CONFIG_MTD_UBI_WL_THRESHOLD=4096 -CONFIG_MTD_UBI_BEB_LIMIT=20 -CONFIG_MTD_UBI_FASTMAP=y -CONFIG_MTD_UBI_GLUEBI=m -CONFIG_MTD_UBI_BLOCK=y -CONFIG_MTD_HYPERBUS=m -CONFIG_DTC=y -CONFIG_OF=y -# CONFIG_OF_UNITTEST is not set -CONFIG_OF_FLATTREE=y -CONFIG_OF_KOBJ=y -CONFIG_OF_DYNAMIC=y 
-CONFIG_OF_ADDRESS=y -CONFIG_OF_IRQ=y -CONFIG_OF_NET=y -CONFIG_OF_MDIO=m -CONFIG_OF_RESOLVE=y -CONFIG_OF_OVERLAY=y -CONFIG_ARCH_MIGHT_HAVE_PC_PARPORT=y -CONFIG_PARPORT=m -CONFIG_PARPORT_PC=m -CONFIG_PARPORT_SERIAL=m -CONFIG_PARPORT_PC_FIFO=y -CONFIG_PARPORT_PC_SUPERIO=y -CONFIG_PARPORT_PC_PCMCIA=m -CONFIG_PARPORT_AX88796=m -CONFIG_PARPORT_1284=y -CONFIG_PARPORT_NOT_PC=y -CONFIG_PNP=y -CONFIG_PNP_DEBUG_MESSAGES=y - -# -# Protocols -# -CONFIG_PNPACPI=y -CONFIG_BLK_DEV=y -# CONFIG_BLK_DEV_NULL_BLK is not set -CONFIG_BLK_DEV_FD=m -CONFIG_CDROM=m -# CONFIG_PARIDE is not set -CONFIG_BLK_DEV_PCIESSD_MTIP32XX=m -CONFIG_ZRAM=m -CONFIG_ZRAM_WRITEBACK=y -# CONFIG_ZRAM_MEMORY_TRACKING is not set -CONFIG_BLK_DEV_UMEM=m -CONFIG_BLK_DEV_LOOP=m -CONFIG_BLK_DEV_LOOP_MIN_COUNT=8 -CONFIG_BLK_DEV_CRYPTOLOOP=m -CONFIG_BLK_DEV_DRBD=m -# CONFIG_DRBD_FAULT_INJECTION is not set -CONFIG_BLK_DEV_NBD=m -CONFIG_BLK_DEV_SKD=m -CONFIG_BLK_DEV_SX8=m -CONFIG_BLK_DEV_RAM=m -CONFIG_BLK_DEV_RAM_COUNT=16 -CONFIG_BLK_DEV_RAM_SIZE=16384 -CONFIG_CDROM_PKTCDVD=m -CONFIG_CDROM_PKTCDVD_BUFFERS=8 -# CONFIG_CDROM_PKTCDVD_WCACHE is not set -CONFIG_ATA_OVER_ETH=m -CONFIG_XEN_BLKDEV_FRONTEND=m -CONFIG_XEN_BLKDEV_BACKEND=m -CONFIG_VIRTIO_BLK=m -CONFIG_BLK_DEV_RBD=m -CONFIG_BLK_DEV_RSXX=m - -# -# NVME Support -# -CONFIG_NVME_CORE=y -CONFIG_BLK_DEV_NVME=y -CONFIG_NVME_MULTIPATH=y -CONFIG_NVME_HWMON=y -CONFIG_NVME_FABRICS=m -CONFIG_NVME_RDMA=m -CONFIG_NVME_FC=m -CONFIG_NVME_TCP=m -CONFIG_NVME_TARGET=m -CONFIG_NVME_TARGET_LOOP=m -CONFIG_NVME_TARGET_RDMA=m -CONFIG_NVME_TARGET_FC=m -CONFIG_NVME_TARGET_FCLOOP=m -CONFIG_NVME_TARGET_TCP=m -# end of NVME Support - -# -# Misc devices -# -CONFIG_SENSORS_LIS3LV02D=m -CONFIG_AD525X_DPOT=m -CONFIG_AD525X_DPOT_I2C=m -CONFIG_AD525X_DPOT_SPI=m -# CONFIG_DUMMY_IRQ is not set -CONFIG_IBM_ASM=m -CONFIG_PHANTOM=m -CONFIG_TIFM_CORE=m -CONFIG_TIFM_7XX1=m -CONFIG_ICS932S401=m -CONFIG_ENCLOSURE_SERVICES=m -CONFIG_HP_ILO=m -CONFIG_APDS9802ALS=m -CONFIG_ISL29003=m -CONFIG_ISL29020=m 
-CONFIG_SENSORS_TSL2550=m -CONFIG_SENSORS_BH1770=m -CONFIG_SENSORS_APDS990X=m -CONFIG_HMC6352=m -CONFIG_DS1682=m -CONFIG_VMWARE_BALLOON=m -CONFIG_LATTICE_ECP3_CONFIG=m -# CONFIG_SRAM is not set -CONFIG_PCI_ENDPOINT_TEST=m -CONFIG_XILINX_SDFEC=m -CONFIG_MISC_RTSX=m -CONFIG_PVPANIC=m -CONFIG_C2PORT=m -CONFIG_C2PORT_DURAMAR_2150=m - -# -# EEPROM support -# -CONFIG_EEPROM_AT24=m -# CONFIG_EEPROM_AT25 is not set -CONFIG_EEPROM_LEGACY=m -CONFIG_EEPROM_MAX6875=m -CONFIG_EEPROM_93CX6=m -# CONFIG_EEPROM_93XX46 is not set -CONFIG_EEPROM_IDT_89HPESX=m -CONFIG_EEPROM_EE1004=m -# end of EEPROM support - -CONFIG_CB710_CORE=m -# CONFIG_CB710_DEBUG is not set -CONFIG_CB710_DEBUG_ASSUMPTIONS=y - -# -# Texas Instruments shared transport line discipline -# -CONFIG_TI_ST=m -# end of Texas Instruments shared transport line discipline - -CONFIG_SENSORS_LIS3_I2C=m -CONFIG_ALTERA_STAPL=m -CONFIG_INTEL_MEI=m -CONFIG_INTEL_MEI_ME=m -CONFIG_INTEL_MEI_TXE=m -CONFIG_INTEL_MEI_HDCP=m -CONFIG_VMWARE_VMCI=m - -# -# Intel MIC & related support -# -CONFIG_INTEL_MIC_BUS=m -CONFIG_SCIF_BUS=m -CONFIG_VOP_BUS=m -CONFIG_INTEL_MIC_HOST=m -CONFIG_INTEL_MIC_CARD=m -CONFIG_SCIF=m -CONFIG_MIC_COSM=m -CONFIG_VOP=m -# end of Intel MIC & related support - -CONFIG_GENWQE=m -CONFIG_GENWQE_PLATFORM_ERROR_RECOVERY=0 -CONFIG_ECHO=m -CONFIG_MISC_ALCOR_PCI=m -CONFIG_MISC_RTSX_PCI=m -CONFIG_MISC_RTSX_USB=m -CONFIG_HABANA_AI=m -CONFIG_UACCE=m -# end of Misc devices - -CONFIG_HAVE_IDE=y -# CONFIG_IDE is not set - -# -# SCSI device support -# -CONFIG_SCSI_MOD=y -CONFIG_RAID_ATTRS=m -CONFIG_SCSI=y -CONFIG_SCSI_DMA=y -CONFIG_SCSI_NETLINK=y -CONFIG_SCSI_PROC_FS=y - -# -# SCSI support type (disk, tape, CD-ROM) -# -CONFIG_BLK_DEV_SD=y -CONFIG_CHR_DEV_ST=m -CONFIG_BLK_DEV_SR=m -CONFIG_CHR_DEV_SG=m -CONFIG_CHR_DEV_SCH=m -CONFIG_SCSI_ENCLOSURE=m -CONFIG_SCSI_CONSTANTS=y -CONFIG_SCSI_LOGGING=y -CONFIG_SCSI_SCAN_ASYNC=y - -# -# SCSI Transports -# -CONFIG_SCSI_SPI_ATTRS=m -CONFIG_SCSI_FC_ATTRS=m -CONFIG_SCSI_ISCSI_ATTRS=m 
-CONFIG_SCSI_SAS_ATTRS=m -CONFIG_SCSI_SAS_LIBSAS=m -CONFIG_SCSI_SAS_ATA=y -CONFIG_SCSI_SAS_HOST_SMP=y -CONFIG_SCSI_SRP_ATTRS=m -# end of SCSI Transports - -CONFIG_SCSI_LOWLEVEL=y -CONFIG_ISCSI_TCP=m -CONFIG_ISCSI_BOOT_SYSFS=m -CONFIG_SCSI_CXGB3_ISCSI=m -CONFIG_SCSI_CXGB4_ISCSI=m -CONFIG_SCSI_BNX2_ISCSI=m -CONFIG_SCSI_BNX2X_FCOE=m -CONFIG_BE2ISCSI=m -CONFIG_BLK_DEV_3W_XXXX_RAID=m -CONFIG_SCSI_HPSA=m -CONFIG_SCSI_3W_9XXX=m -CONFIG_SCSI_3W_SAS=m -CONFIG_SCSI_ACARD=m -CONFIG_SCSI_AACRAID=m -CONFIG_SCSI_AIC7XXX=m -CONFIG_AIC7XXX_CMDS_PER_DEVICE=32 -CONFIG_AIC7XXX_RESET_DELAY_MS=15000 -CONFIG_AIC7XXX_DEBUG_ENABLE=y -CONFIG_AIC7XXX_DEBUG_MASK=0 -CONFIG_AIC7XXX_REG_PRETTY_PRINT=y -CONFIG_SCSI_AIC79XX=m -CONFIG_AIC79XX_CMDS_PER_DEVICE=32 -CONFIG_AIC79XX_RESET_DELAY_MS=15000 -CONFIG_AIC79XX_DEBUG_ENABLE=y -CONFIG_AIC79XX_DEBUG_MASK=0 -CONFIG_AIC79XX_REG_PRETTY_PRINT=y -CONFIG_SCSI_AIC94XX=m -CONFIG_AIC94XX_DEBUG=y -CONFIG_SCSI_MVSAS=m -CONFIG_SCSI_MVSAS_DEBUG=y -CONFIG_SCSI_MVSAS_TASKLET=y -CONFIG_SCSI_MVUMI=m -CONFIG_SCSI_DPT_I2O=m -CONFIG_SCSI_ADVANSYS=m -CONFIG_SCSI_ARCMSR=m -CONFIG_SCSI_ESAS2R=m -CONFIG_MEGARAID_NEWGEN=y -CONFIG_MEGARAID_MM=m -CONFIG_MEGARAID_MAILBOX=m -CONFIG_MEGARAID_LEGACY=m -CONFIG_MEGARAID_SAS=m -CONFIG_SCSI_MPT3SAS=m -CONFIG_SCSI_MPT2SAS_MAX_SGE=128 -CONFIG_SCSI_MPT3SAS_MAX_SGE=128 -CONFIG_SCSI_MPT2SAS=m -CONFIG_SCSI_SMARTPQI=m -CONFIG_SCSI_UFSHCD=m -CONFIG_SCSI_UFSHCD_PCI=m -# CONFIG_SCSI_UFS_DWC_TC_PCI is not set -CONFIG_SCSI_UFSHCD_PLATFORM=m -CONFIG_SCSI_UFS_CDNS_PLATFORM=m -# CONFIG_SCSI_UFS_DWC_TC_PLATFORM is not set -CONFIG_SCSI_UFS_BSG=y -CONFIG_SCSI_HPTIOP=m -CONFIG_SCSI_BUSLOGIC=m -CONFIG_SCSI_FLASHPOINT=y -CONFIG_SCSI_MYRB=m -CONFIG_SCSI_MYRS=m -CONFIG_VMWARE_PVSCSI=m -CONFIG_XEN_SCSI_FRONTEND=m -CONFIG_HYPERV_STORAGE=m -CONFIG_LIBFC=m -CONFIG_LIBFCOE=m -CONFIG_FCOE=m -CONFIG_FCOE_FNIC=m -CONFIG_SCSI_SNIC=m -# CONFIG_SCSI_SNIC_DEBUG_FS is not set -CONFIG_SCSI_DMX3191D=m -CONFIG_SCSI_FDOMAIN=m -CONFIG_SCSI_FDOMAIN_PCI=m 
-CONFIG_SCSI_GDTH=m -CONFIG_SCSI_ISCI=m -CONFIG_SCSI_IPS=m -CONFIG_SCSI_INITIO=m -CONFIG_SCSI_INIA100=m -CONFIG_SCSI_PPA=m -CONFIG_SCSI_IMM=m -# CONFIG_SCSI_IZIP_EPP16 is not set -# CONFIG_SCSI_IZIP_SLOW_CTR is not set -CONFIG_SCSI_STEX=m -CONFIG_SCSI_SYM53C8XX_2=m -CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=1 -CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16 -CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64 -CONFIG_SCSI_SYM53C8XX_MMIO=y -CONFIG_SCSI_IPR=m -CONFIG_SCSI_IPR_TRACE=y -CONFIG_SCSI_IPR_DUMP=y -CONFIG_SCSI_QLOGIC_1280=m -CONFIG_SCSI_QLA_FC=m -CONFIG_TCM_QLA2XXX=m -# CONFIG_TCM_QLA2XXX_DEBUG is not set -CONFIG_SCSI_QLA_ISCSI=m -CONFIG_QEDI=m -CONFIG_QEDF=m -CONFIG_SCSI_LPFC=m -# CONFIG_SCSI_LPFC_DEBUG_FS is not set -CONFIG_SCSI_DC395x=m -CONFIG_SCSI_AM53C974=m -CONFIG_SCSI_WD719X=m -CONFIG_SCSI_DEBUG=m -CONFIG_SCSI_PMCRAID=m -CONFIG_SCSI_PM8001=m -CONFIG_SCSI_BFA_FC=m -CONFIG_SCSI_VIRTIO=m -CONFIG_SCSI_CHELSIO_FCOE=m -CONFIG_SCSI_LOWLEVEL_PCMCIA=y -CONFIG_PCMCIA_AHA152X=m -CONFIG_PCMCIA_FDOMAIN=m -CONFIG_PCMCIA_QLOGIC=m -CONFIG_PCMCIA_SYM53C500=m -CONFIG_SCSI_DH=y -CONFIG_SCSI_DH_RDAC=m -CONFIG_SCSI_DH_HP_SW=m -CONFIG_SCSI_DH_EMC=m -CONFIG_SCSI_DH_ALUA=m -# end of SCSI device support - -CONFIG_ATA=y -CONFIG_SATA_HOST=y -CONFIG_PATA_TIMINGS=y -CONFIG_ATA_VERBOSE_ERROR=y -CONFIG_ATA_FORCE=y -CONFIG_ATA_ACPI=y -CONFIG_SATA_ZPODD=y -CONFIG_SATA_PMP=y - -# -# Controllers with non-SFF native interface -# -CONFIG_SATA_AHCI=y -CONFIG_SATA_MOBILE_LPM_POLICY=3 -CONFIG_SATA_AHCI_PLATFORM=m -CONFIG_AHCI_CEVA=m -CONFIG_AHCI_QORIQ=m -CONFIG_SATA_INIC162X=m -CONFIG_SATA_ACARD_AHCI=m -CONFIG_SATA_SIL24=m -CONFIG_ATA_SFF=y - -# -# SFF controllers with custom DMA interface -# -CONFIG_PDC_ADMA=m -CONFIG_SATA_QSTOR=m -CONFIG_SATA_SX4=m -CONFIG_ATA_BMDMA=y - -# -# SATA SFF controllers with BMDMA -# -CONFIG_ATA_PIIX=m -CONFIG_SATA_DWC=m -# CONFIG_SATA_DWC_OLD_DMA is not set -# CONFIG_SATA_DWC_DEBUG is not set -CONFIG_SATA_MV=m -CONFIG_SATA_NV=m -CONFIG_SATA_PROMISE=m -CONFIG_SATA_SIL=m 
-CONFIG_SATA_SIS=m -CONFIG_SATA_SVW=m -CONFIG_SATA_ULI=m -CONFIG_SATA_VIA=m -CONFIG_SATA_VITESSE=m - -# -# PATA SFF controllers with BMDMA -# -CONFIG_PATA_ALI=m -CONFIG_PATA_AMD=m -CONFIG_PATA_ARTOP=m -CONFIG_PATA_ATIIXP=m -CONFIG_PATA_ATP867X=m -CONFIG_PATA_CMD64X=m -CONFIG_PATA_CYPRESS=m -CONFIG_PATA_EFAR=m -CONFIG_PATA_HPT366=m -CONFIG_PATA_HPT37X=m -CONFIG_PATA_HPT3X2N=m -CONFIG_PATA_HPT3X3=m -CONFIG_PATA_HPT3X3_DMA=y -CONFIG_PATA_IT8213=m -CONFIG_PATA_IT821X=m -CONFIG_PATA_JMICRON=m -CONFIG_PATA_MARVELL=m -CONFIG_PATA_NETCELL=m -CONFIG_PATA_NINJA32=m -CONFIG_PATA_NS87415=m -CONFIG_PATA_OLDPIIX=m -CONFIG_PATA_OPTIDMA=m -CONFIG_PATA_PDC2027X=m -CONFIG_PATA_PDC_OLD=m -CONFIG_PATA_RADISYS=m -CONFIG_PATA_RDC=m -CONFIG_PATA_SCH=m -CONFIG_PATA_SERVERWORKS=m -CONFIG_PATA_SIL680=m -CONFIG_PATA_SIS=m -CONFIG_PATA_TOSHIBA=m -CONFIG_PATA_TRIFLEX=m -CONFIG_PATA_VIA=m -CONFIG_PATA_WINBOND=m - -# -# PIO-only SFF controllers -# -CONFIG_PATA_CMD640_PCI=m -CONFIG_PATA_MPIIX=m -CONFIG_PATA_NS87410=m -CONFIG_PATA_OPTI=m -CONFIG_PATA_PCMCIA=m -# CONFIG_PATA_PLATFORM is not set -CONFIG_PATA_RZ1000=m - -# -# Generic fallback / legacy drivers -# -CONFIG_PATA_ACPI=m -CONFIG_ATA_GENERIC=m -CONFIG_PATA_LEGACY=m -CONFIG_MD=y -CONFIG_BLK_DEV_MD=m -CONFIG_MD_LINEAR=m -CONFIG_MD_RAID0=m -CONFIG_MD_RAID1=m -CONFIG_MD_RAID10=m -CONFIG_MD_RAID456=m -CONFIG_MD_MULTIPATH=m -CONFIG_MD_FAULTY=m -CONFIG_MD_CLUSTER=m -CONFIG_BCACHE=m -# CONFIG_BCACHE_DEBUG is not set -# CONFIG_BCACHE_CLOSURES_DEBUG is not set -CONFIG_BLK_DEV_DM_BUILTIN=y -CONFIG_BLK_DEV_DM=m -CONFIG_DM_DEBUG=y -CONFIG_DM_BUFIO=m -# CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING is not set -CONFIG_DM_BIO_PRISON=m -CONFIG_DM_PERSISTENT_DATA=m -CONFIG_DM_UNSTRIPED=m -CONFIG_DM_CRYPT=m -CONFIG_DM_SNAPSHOT=m -CONFIG_DM_THIN_PROVISIONING=m -CONFIG_DM_CACHE=m -CONFIG_DM_CACHE_SMQ=m -CONFIG_DM_WRITECACHE=m -CONFIG_DM_ERA=m -CONFIG_DM_CLONE=m -CONFIG_DM_MIRROR=m -CONFIG_DM_LOG_USERSPACE=m -CONFIG_DM_RAID=m -CONFIG_DM_ZERO=m -CONFIG_DM_MULTIPATH=m 
-CONFIG_DM_MULTIPATH_QL=m -CONFIG_DM_MULTIPATH_ST=m -CONFIG_DM_DELAY=m -CONFIG_DM_DUST=m -CONFIG_DM_UEVENT=y -CONFIG_DM_FLAKEY=m -CONFIG_DM_VERITY=m -CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG=y -CONFIG_DM_VERITY_FEC=y -CONFIG_DM_SWITCH=m -CONFIG_DM_LOG_WRITES=m -CONFIG_DM_INTEGRITY=m -CONFIG_DM_ZONED=m -CONFIG_TARGET_CORE=m -CONFIG_TCM_IBLOCK=m -CONFIG_TCM_FILEIO=m -CONFIG_TCM_PSCSI=m -CONFIG_TCM_USER2=m -CONFIG_LOOPBACK_TARGET=m -CONFIG_TCM_FC=m -CONFIG_ISCSI_TARGET=m -CONFIG_ISCSI_TARGET_CXGB4=m -CONFIG_SBP_TARGET=m -CONFIG_FUSION=y -CONFIG_FUSION_SPI=m -CONFIG_FUSION_FC=m -CONFIG_FUSION_SAS=m -CONFIG_FUSION_MAX_SGE=128 -CONFIG_FUSION_CTL=m -CONFIG_FUSION_LAN=m -# CONFIG_FUSION_LOGGING is not set - -# -# IEEE 1394 (FireWire) support -# -CONFIG_FIREWIRE=m -CONFIG_FIREWIRE_OHCI=m -CONFIG_FIREWIRE_SBP2=m -CONFIG_FIREWIRE_NET=m -CONFIG_FIREWIRE_NOSY=m -# end of IEEE 1394 (FireWire) support - -CONFIG_MACINTOSH_DRIVERS=y -CONFIG_MAC_EMUMOUSEBTN=m -CONFIG_NETDEVICES=y -CONFIG_MII=m -CONFIG_NET_CORE=y -CONFIG_BONDING=m -CONFIG_DUMMY=m -CONFIG_WIREGUARD=m -# CONFIG_WIREGUARD_DEBUG is not set -CONFIG_EQUALIZER=m -CONFIG_NET_FC=y -CONFIG_IFB=m -CONFIG_NET_TEAM=m -CONFIG_NET_TEAM_MODE_BROADCAST=m -CONFIG_NET_TEAM_MODE_ROUNDROBIN=m -CONFIG_NET_TEAM_MODE_RANDOM=m -CONFIG_NET_TEAM_MODE_ACTIVEBACKUP=m -CONFIG_NET_TEAM_MODE_LOADBALANCE=m -CONFIG_MACVLAN=m -CONFIG_MACVTAP=m -CONFIG_IPVLAN_L3S=y -CONFIG_IPVLAN=m -CONFIG_IPVTAP=m -CONFIG_VXLAN=m -CONFIG_GENEVE=m -CONFIG_BAREUDP=m -CONFIG_GTP=m -CONFIG_MACSEC=m -CONFIG_NETCONSOLE=m -CONFIG_NETCONSOLE_DYNAMIC=y -CONFIG_NETPOLL=y -CONFIG_NET_POLL_CONTROLLER=y -CONFIG_NTB_NETDEV=m -CONFIG_RIONET=m -CONFIG_RIONET_TX_SIZE=128 -CONFIG_RIONET_RX_SIZE=128 -CONFIG_TUN=m -CONFIG_TAP=m -# CONFIG_TUN_VNET_CROSS_LE is not set -CONFIG_VETH=m -CONFIG_VIRTIO_NET=m -CONFIG_NLMON=m -CONFIG_NET_VRF=m -CONFIG_VSOCKMON=m -CONFIG_SUNGEM_PHY=m -# CONFIG_ARCNET is not set -CONFIG_ATM_DRIVERS=y -# CONFIG_ATM_DUMMY is not set -CONFIG_ATM_TCP=m -CONFIG_ATM_LANAI=m 
-CONFIG_ATM_ENI=m -# CONFIG_ATM_ENI_DEBUG is not set -# CONFIG_ATM_ENI_TUNE_BURST is not set -CONFIG_ATM_FIRESTREAM=m -CONFIG_ATM_ZATM=m -# CONFIG_ATM_ZATM_DEBUG is not set -CONFIG_ATM_NICSTAR=m -# CONFIG_ATM_NICSTAR_USE_SUNI is not set -# CONFIG_ATM_NICSTAR_USE_IDT77105 is not set -CONFIG_ATM_IDT77252=m -# CONFIG_ATM_IDT77252_DEBUG is not set -# CONFIG_ATM_IDT77252_RCV_ALL is not set -CONFIG_ATM_IDT77252_USE_SUNI=y -CONFIG_ATM_AMBASSADOR=m -# CONFIG_ATM_AMBASSADOR_DEBUG is not set -CONFIG_ATM_HORIZON=m -# CONFIG_ATM_HORIZON_DEBUG is not set -CONFIG_ATM_IA=m -# CONFIG_ATM_IA_DEBUG is not set -CONFIG_ATM_FORE200E=m -CONFIG_ATM_FORE200E_USE_TASKLET=y -CONFIG_ATM_FORE200E_TX_RETRY=16 -CONFIG_ATM_FORE200E_DEBUG=0 -CONFIG_ATM_HE=m -CONFIG_ATM_HE_USE_SUNI=y -CONFIG_ATM_SOLOS=m -CONFIG_CAIF_DRIVERS=y -CONFIG_CAIF_TTY=m -CONFIG_CAIF_SPI_SLAVE=m -CONFIG_CAIF_SPI_SYNC=y -CONFIG_CAIF_HSI=m -CONFIG_CAIF_VIRTIO=m - -# -# Distributed Switch Architecture drivers -# -CONFIG_B53=m -# CONFIG_B53_SPI_DRIVER is not set -CONFIG_B53_MDIO_DRIVER=m -CONFIG_B53_MMAP_DRIVER=m -CONFIG_B53_SRAB_DRIVER=m -CONFIG_B53_SERDES=m -CONFIG_NET_DSA_BCM_SF2=m -CONFIG_NET_DSA_LOOP=m -CONFIG_NET_DSA_LANTIQ_GSWIP=m -CONFIG_NET_DSA_MT7530=m -CONFIG_NET_DSA_MV88E6060=m -CONFIG_NET_DSA_MICROCHIP_KSZ_COMMON=m -CONFIG_NET_DSA_MICROCHIP_KSZ9477=m -CONFIG_NET_DSA_MICROCHIP_KSZ9477_I2C=m -CONFIG_NET_DSA_MICROCHIP_KSZ9477_SPI=m -CONFIG_NET_DSA_MICROCHIP_KSZ8795=m -CONFIG_NET_DSA_MICROCHIP_KSZ8795_SPI=m -CONFIG_NET_DSA_MV88E6XXX=m -CONFIG_NET_DSA_MV88E6XXX_GLOBAL2=y -CONFIG_NET_DSA_MV88E6XXX_PTP=y -CONFIG_NET_DSA_AR9331=m -CONFIG_NET_DSA_SJA1105=m -CONFIG_NET_DSA_SJA1105_PTP=y -CONFIG_NET_DSA_SJA1105_TAS=y -CONFIG_NET_DSA_QCA8K=m -CONFIG_NET_DSA_REALTEK_SMI=m -CONFIG_NET_DSA_SMSC_LAN9303=m -CONFIG_NET_DSA_SMSC_LAN9303_I2C=m -CONFIG_NET_DSA_SMSC_LAN9303_MDIO=m -CONFIG_NET_DSA_VITESSE_VSC73XX=m -CONFIG_NET_DSA_VITESSE_VSC73XX_SPI=m -CONFIG_NET_DSA_VITESSE_VSC73XX_PLATFORM=m -# end of Distributed Switch Architecture 
drivers - -CONFIG_ETHERNET=y -CONFIG_MDIO=m -CONFIG_NET_VENDOR_3COM=y -CONFIG_PCMCIA_3C574=m -CONFIG_PCMCIA_3C589=m -CONFIG_VORTEX=m -CONFIG_TYPHOON=m -CONFIG_NET_VENDOR_ADAPTEC=y -CONFIG_ADAPTEC_STARFIRE=m -CONFIG_NET_VENDOR_AGERE=y -CONFIG_ET131X=m -CONFIG_NET_VENDOR_ALACRITECH=y -CONFIG_SLICOSS=m -CONFIG_NET_VENDOR_ALTEON=y -CONFIG_ACENIC=m -# CONFIG_ACENIC_OMIT_TIGON_I is not set -CONFIG_ALTERA_TSE=m -CONFIG_NET_VENDOR_AMAZON=y -CONFIG_ENA_ETHERNET=m -CONFIG_NET_VENDOR_AMD=y -CONFIG_AMD8111_ETH=m -CONFIG_PCNET32=m -CONFIG_PCMCIA_NMCLAN=m -CONFIG_AMD_XGBE=m -CONFIG_AMD_XGBE_DCB=y -CONFIG_AMD_XGBE_HAVE_ECC=y -CONFIG_NET_VENDOR_AQUANTIA=y -CONFIG_AQTION=m -CONFIG_NET_VENDOR_ARC=y -CONFIG_NET_VENDOR_ATHEROS=y -CONFIG_ATL2=m -CONFIG_ATL1=m -CONFIG_ATL1E=m -CONFIG_ATL1C=m -CONFIG_ALX=m -CONFIG_NET_VENDOR_AURORA=y -CONFIG_AURORA_NB8800=m -CONFIG_NET_VENDOR_BROADCOM=y -CONFIG_B44=m -CONFIG_B44_PCI_AUTOSELECT=y -CONFIG_B44_PCICORE_AUTOSELECT=y -CONFIG_B44_PCI=y -CONFIG_BCMGENET=m -CONFIG_BNX2=m -CONFIG_CNIC=m -CONFIG_TIGON3=m -CONFIG_TIGON3_HWMON=y -CONFIG_BNX2X=m -CONFIG_BNX2X_SRIOV=y -CONFIG_SYSTEMPORT=m -CONFIG_BNXT=m -CONFIG_BNXT_SRIOV=y -CONFIG_BNXT_FLOWER_OFFLOAD=y -CONFIG_BNXT_DCB=y -CONFIG_BNXT_HWMON=y -CONFIG_NET_VENDOR_BROCADE=y -CONFIG_BNA=m -CONFIG_NET_VENDOR_CADENCE=y -CONFIG_MACB=m -CONFIG_MACB_USE_HWSTAMP=y -CONFIG_MACB_PCI=m -CONFIG_NET_VENDOR_CAVIUM=y -CONFIG_THUNDER_NIC_PF=m -CONFIG_THUNDER_NIC_VF=m -CONFIG_THUNDER_NIC_BGX=m -CONFIG_THUNDER_NIC_RGX=m -CONFIG_CAVIUM_PTP=m -CONFIG_LIQUIDIO=m -CONFIG_LIQUIDIO_VF=m -CONFIG_NET_VENDOR_CHELSIO=y -CONFIG_CHELSIO_T1=m -CONFIG_CHELSIO_T1_1G=y -CONFIG_CHELSIO_T3=m -CONFIG_CHELSIO_T4=m -CONFIG_CHELSIO_T4_DCB=y -CONFIG_CHELSIO_T4_FCOE=y -CONFIG_CHELSIO_T4VF=m -CONFIG_CHELSIO_LIB=m -CONFIG_NET_VENDOR_CISCO=y -CONFIG_ENIC=m -CONFIG_NET_VENDOR_CORTINA=y -CONFIG_GEMINI_ETHERNET=m -CONFIG_CX_ECAT=m -CONFIG_DNET=m -CONFIG_NET_VENDOR_DEC=y -CONFIG_NET_TULIP=y -CONFIG_DE2104X=m -CONFIG_DE2104X_DSL=0 -CONFIG_TULIP=m 
-CONFIG_TULIP_MWI=y -CONFIG_TULIP_MMIO=y -CONFIG_TULIP_NAPI=y -CONFIG_TULIP_NAPI_HW_MITIGATION=y -CONFIG_DE4X5=m -CONFIG_WINBOND_840=m -CONFIG_DM9102=m -CONFIG_ULI526X=m -CONFIG_PCMCIA_XIRCOM=m -CONFIG_NET_VENDOR_DLINK=y -CONFIG_DL2K=m -CONFIG_SUNDANCE=m -# CONFIG_SUNDANCE_MMIO is not set -CONFIG_NET_VENDOR_EMULEX=y -CONFIG_BE2NET=m -CONFIG_BE2NET_HWMON=y -CONFIG_BE2NET_BE2=y -CONFIG_BE2NET_BE3=y -CONFIG_BE2NET_LANCER=y -CONFIG_BE2NET_SKYHAWK=y -CONFIG_NET_VENDOR_EZCHIP=y -CONFIG_EZCHIP_NPS_MANAGEMENT_ENET=m -CONFIG_NET_VENDOR_FUJITSU=y -CONFIG_PCMCIA_FMVJ18X=m -CONFIG_NET_VENDOR_GOOGLE=y -CONFIG_GVE=m -CONFIG_NET_VENDOR_HUAWEI=y -CONFIG_HINIC=m -CONFIG_NET_VENDOR_I825XX=y -CONFIG_NET_VENDOR_INTEL=y -CONFIG_E100=m -CONFIG_E1000=m -CONFIG_E1000E=m -CONFIG_E1000E_HWTS=y -CONFIG_IGB=m -CONFIG_IGB_HWMON=y -CONFIG_IGB_DCA=y -CONFIG_IGBVF=m -CONFIG_IXGB=m -CONFIG_IXGBE=m -CONFIG_IXGBE_HWMON=y -CONFIG_IXGBE_DCA=y -CONFIG_IXGBE_DCB=y -# CONFIG_IXGBE_IPSEC is not set -CONFIG_IXGBEVF=m -CONFIG_IXGBEVF_IPSEC=y -CONFIG_I40E=m -CONFIG_I40E_DCB=y -CONFIG_IAVF=m -CONFIG_I40EVF=m -CONFIG_ICE=m -CONFIG_FM10K=m -CONFIG_IGC=m -CONFIG_JME=m -CONFIG_NET_VENDOR_MARVELL=y -CONFIG_MVMDIO=m -CONFIG_SKGE=m -# CONFIG_SKGE_DEBUG is not set -CONFIG_SKGE_GENESIS=y -CONFIG_SKY2=m -# CONFIG_SKY2_DEBUG is not set -CONFIG_NET_VENDOR_MELLANOX=y -CONFIG_MLX4_EN=m -CONFIG_MLX4_EN_DCB=y -CONFIG_MLX4_CORE=m -CONFIG_MLX4_DEBUG=y -CONFIG_MLX4_CORE_GEN2=y -CONFIG_MLX5_CORE=m -CONFIG_MLX5_ACCEL=y -CONFIG_MLX5_FPGA=y -CONFIG_MLX5_CORE_EN=y -CONFIG_MLX5_EN_ARFS=y -CONFIG_MLX5_EN_RXNFC=y -CONFIG_MLX5_MPFS=y -CONFIG_MLX5_ESWITCH=y -CONFIG_MLX5_TC_CT=y -CONFIG_MLX5_CORE_EN_DCB=y -CONFIG_MLX5_CORE_IPOIB=y -CONFIG_MLX5_FPGA_IPSEC=y -CONFIG_MLX5_EN_IPSEC=y -CONFIG_MLX5_FPGA_TLS=y -CONFIG_MLX5_TLS=y -CONFIG_MLX5_EN_TLS=y -CONFIG_MLX5_SW_STEERING=y -CONFIG_MLXSW_CORE=m -CONFIG_MLXSW_CORE_HWMON=y -CONFIG_MLXSW_CORE_THERMAL=y -CONFIG_MLXSW_PCI=m -CONFIG_MLXSW_I2C=m -CONFIG_MLXSW_SWITCHIB=m -CONFIG_MLXSW_SWITCHX2=m 
-CONFIG_MLXSW_SPECTRUM=m -CONFIG_MLXSW_SPECTRUM_DCB=y -CONFIG_MLXSW_MINIMAL=m -CONFIG_MLXFW=m -CONFIG_NET_VENDOR_MICREL=y -CONFIG_KS8842=m -CONFIG_KS8851=m -CONFIG_KS8851_MLL=m -CONFIG_KSZ884X_PCI=m -CONFIG_NET_VENDOR_MICROCHIP=y -CONFIG_ENC28J60=m -# CONFIG_ENC28J60_WRITEVERIFY is not set -CONFIG_ENCX24J600=m -CONFIG_LAN743X=m -CONFIG_NET_VENDOR_MICROSEMI=y -CONFIG_MSCC_OCELOT_SWITCH=m -CONFIG_MSCC_OCELOT_SWITCH_OCELOT=m -CONFIG_NET_VENDOR_MYRI=y -CONFIG_MYRI10GE=m -CONFIG_MYRI10GE_DCA=y -CONFIG_FEALNX=m -CONFIG_NET_VENDOR_NATSEMI=y -CONFIG_NATSEMI=m -CONFIG_NS83820=m -CONFIG_NET_VENDOR_NETERION=y -CONFIG_S2IO=m -CONFIG_VXGE=m -# CONFIG_VXGE_DEBUG_TRACE_ALL is not set -CONFIG_NET_VENDOR_NETRONOME=y -CONFIG_NFP=m -CONFIG_NFP_APP_FLOWER=y -CONFIG_NFP_APP_ABM_NIC=y -# CONFIG_NFP_DEBUG is not set -CONFIG_NET_VENDOR_NI=y -CONFIG_NI_XGE_MANAGEMENT_ENET=m -CONFIG_NET_VENDOR_8390=y -CONFIG_PCMCIA_AXNET=m -CONFIG_NE2K_PCI=m -CONFIG_PCMCIA_PCNET=m -CONFIG_NET_VENDOR_NVIDIA=y -CONFIG_FORCEDETH=m -CONFIG_NET_VENDOR_OKI=y -CONFIG_ETHOC=m -CONFIG_NET_VENDOR_PACKET_ENGINES=y -CONFIG_HAMACHI=m -CONFIG_YELLOWFIN=m -CONFIG_NET_VENDOR_PENSANDO=y -CONFIG_IONIC=m -CONFIG_NET_VENDOR_QLOGIC=y -CONFIG_QLA3XXX=m -CONFIG_QLCNIC=m -CONFIG_QLCNIC_SRIOV=y -CONFIG_QLCNIC_DCB=y -CONFIG_QLCNIC_HWMON=y -CONFIG_NETXEN_NIC=m -CONFIG_QED=m -CONFIG_QED_LL2=y -CONFIG_QED_SRIOV=y -CONFIG_QEDE=m -CONFIG_QED_RDMA=y -CONFIG_QED_ISCSI=y -CONFIG_QED_FCOE=y -CONFIG_QED_OOO=y -CONFIG_NET_VENDOR_QUALCOMM=y -CONFIG_QCA7000=m -CONFIG_QCA7000_SPI=m -CONFIG_QCA7000_UART=m -CONFIG_QCOM_EMAC=m -CONFIG_RMNET=m -CONFIG_NET_VENDOR_RDC=y -CONFIG_R6040=m -CONFIG_NET_VENDOR_REALTEK=y -CONFIG_ATP=m -CONFIG_8139CP=m -CONFIG_8139TOO=m -# CONFIG_8139TOO_PIO is not set -CONFIG_8139TOO_TUNE_TWISTER=y -CONFIG_8139TOO_8129=y -# CONFIG_8139_OLD_RX_RESET is not set -CONFIG_R8169=m -CONFIG_NET_VENDOR_RENESAS=y -CONFIG_NET_VENDOR_ROCKER=y -CONFIG_ROCKER=m -CONFIG_NET_VENDOR_SAMSUNG=y -CONFIG_SXGBE_ETH=m -CONFIG_NET_VENDOR_SEEQ=y 
-CONFIG_NET_VENDOR_SOLARFLARE=y -CONFIG_SFC=m -CONFIG_SFC_MTD=y -CONFIG_SFC_MCDI_MON=y -CONFIG_SFC_SRIOV=y -CONFIG_SFC_MCDI_LOGGING=y -CONFIG_SFC_FALCON=m -CONFIG_SFC_FALCON_MTD=y -CONFIG_NET_VENDOR_SILAN=y -CONFIG_SC92031=m -CONFIG_NET_VENDOR_SIS=y -CONFIG_SIS900=m -CONFIG_SIS190=m -CONFIG_NET_VENDOR_SMSC=y -CONFIG_PCMCIA_SMC91C92=m -CONFIG_EPIC100=m -CONFIG_SMSC911X=m -CONFIG_SMSC9420=m -CONFIG_NET_VENDOR_SOCIONEXT=y -CONFIG_NET_VENDOR_STMICRO=y -CONFIG_STMMAC_ETH=m -# CONFIG_STMMAC_SELFTESTS is not set -CONFIG_STMMAC_PLATFORM=m -CONFIG_DWMAC_DWC_QOS_ETH=m -CONFIG_DWMAC_GENERIC=m -CONFIG_DWMAC_INTEL=m -CONFIG_STMMAC_PCI=m -CONFIG_NET_VENDOR_SUN=y -CONFIG_HAPPYMEAL=m -CONFIG_SUNGEM=m -CONFIG_CASSINI=m -CONFIG_NIU=m -CONFIG_NET_VENDOR_SYNOPSYS=y -CONFIG_DWC_XLGMAC=m -CONFIG_DWC_XLGMAC_PCI=m -CONFIG_NET_VENDOR_TEHUTI=y -CONFIG_TEHUTI=m -CONFIG_NET_VENDOR_TI=y -# CONFIG_TI_CPSW_PHY_SEL is not set -CONFIG_TLAN=m -CONFIG_NET_VENDOR_VIA=y -CONFIG_VIA_RHINE=m -CONFIG_VIA_RHINE_MMIO=y -CONFIG_VIA_VELOCITY=m -CONFIG_NET_VENDOR_WIZNET=y -CONFIG_WIZNET_W5100=m -CONFIG_WIZNET_W5300=m -# CONFIG_WIZNET_BUS_DIRECT is not set -# CONFIG_WIZNET_BUS_INDIRECT is not set -CONFIG_WIZNET_BUS_ANY=y -CONFIG_WIZNET_W5100_SPI=m -CONFIG_NET_VENDOR_XILINX=y -CONFIG_XILINX_AXI_EMAC=m -CONFIG_XILINX_LL_TEMAC=m -CONFIG_NET_VENDOR_XIRCOM=y -CONFIG_PCMCIA_XIRC2PS=m -CONFIG_FDDI=m -CONFIG_DEFXX=m -CONFIG_DEFXX_MMIO=y -CONFIG_SKFP=m -# CONFIG_HIPPI is not set -CONFIG_NET_SB1000=m -CONFIG_MDIO_DEVICE=m -CONFIG_MDIO_BUS=m -CONFIG_MDIO_BCM_UNIMAC=m -CONFIG_MDIO_BITBANG=m -CONFIG_MDIO_BUS_MUX=m -CONFIG_MDIO_BUS_MUX_GPIO=m -CONFIG_MDIO_BUS_MUX_MMIOREG=m -CONFIG_MDIO_BUS_MUX_MULTIPLEXER=m -CONFIG_MDIO_CAVIUM=m -CONFIG_MDIO_GPIO=m -CONFIG_MDIO_HISI_FEMAC=m -CONFIG_MDIO_I2C=m -CONFIG_MDIO_IPQ8064=m -CONFIG_MDIO_MSCC_MIIM=m -CONFIG_MDIO_MVUSB=m -CONFIG_MDIO_OCTEON=m -CONFIG_MDIO_THUNDER=m -CONFIG_MDIO_XPCS=m -CONFIG_PHYLINK=m -CONFIG_PHYLIB=m -CONFIG_SWPHY=y -CONFIG_LED_TRIGGER_PHY=y - -# -# MII PHY device 
drivers -# -CONFIG_SFP=m -CONFIG_ADIN_PHY=m -CONFIG_AMD_PHY=m -CONFIG_AQUANTIA_PHY=m -CONFIG_AX88796B_PHY=m -CONFIG_BCM7XXX_PHY=m -CONFIG_BCM87XX_PHY=m -CONFIG_BCM_NET_PHYLIB=m -CONFIG_BROADCOM_PHY=m -CONFIG_BCM84881_PHY=m -CONFIG_CICADA_PHY=m -CONFIG_CORTINA_PHY=m -CONFIG_DAVICOM_PHY=m -CONFIG_DP83822_PHY=m -CONFIG_DP83TC811_PHY=m -CONFIG_DP83848_PHY=m -CONFIG_DP83867_PHY=m -CONFIG_DP83869_PHY=m -CONFIG_FIXED_PHY=m -CONFIG_ICPLUS_PHY=m -CONFIG_INTEL_XWAY_PHY=m -CONFIG_LSI_ET1011C_PHY=m -CONFIG_LXT_PHY=m -CONFIG_MARVELL_PHY=m -CONFIG_MARVELL_10G_PHY=m -CONFIG_MICREL_PHY=m -CONFIG_MICROCHIP_PHY=m -CONFIG_MICROCHIP_T1_PHY=m -CONFIG_MICROSEMI_PHY=m -CONFIG_NATIONAL_PHY=m -CONFIG_NXP_TJA11XX_PHY=m -CONFIG_AT803X_PHY=m -CONFIG_QSEMI_PHY=m -CONFIG_REALTEK_PHY=m -CONFIG_RENESAS_PHY=m -CONFIG_ROCKCHIP_PHY=m -CONFIG_SMSC_PHY=m -CONFIG_STE10XP=m -CONFIG_TERANETICS_PHY=m -CONFIG_VITESSE_PHY=m -CONFIG_XILINX_GMII2RGMII=m -CONFIG_MICREL_KS8995MA=m -CONFIG_PLIP=m -CONFIG_PPP=m -CONFIG_PPP_BSDCOMP=m -CONFIG_PPP_DEFLATE=m -CONFIG_PPP_FILTER=y -CONFIG_PPP_MPPE=m -CONFIG_PPP_MULTILINK=y -CONFIG_PPPOATM=m -CONFIG_PPPOE=m -CONFIG_PPTP=m -CONFIG_PPPOL2TP=m -CONFIG_PPP_ASYNC=m -CONFIG_PPP_SYNC_TTY=m -CONFIG_SLIP=m -CONFIG_SLHC=m -CONFIG_SLIP_COMPRESSED=y -CONFIG_SLIP_SMART=y -CONFIG_SLIP_MODE_SLIP6=y -CONFIG_USB_NET_DRIVERS=m -CONFIG_USB_CATC=m -CONFIG_USB_KAWETH=m -CONFIG_USB_PEGASUS=m -CONFIG_USB_RTL8150=m -CONFIG_USB_RTL8152=m -CONFIG_USB_LAN78XX=m -CONFIG_USB_USBNET=m -CONFIG_USB_NET_AX8817X=m -CONFIG_USB_NET_AX88179_178A=m -CONFIG_USB_NET_CDCETHER=m -CONFIG_USB_NET_CDC_EEM=m -CONFIG_USB_NET_CDC_NCM=m -CONFIG_USB_NET_HUAWEI_CDC_NCM=m -CONFIG_USB_NET_CDC_MBIM=m -CONFIG_USB_NET_DM9601=m -CONFIG_USB_NET_SR9700=m -CONFIG_USB_NET_SR9800=m -CONFIG_USB_NET_SMSC75XX=m -CONFIG_USB_NET_SMSC95XX=m -CONFIG_USB_NET_GL620A=m -CONFIG_USB_NET_NET1080=m -CONFIG_USB_NET_PLUSB=m -CONFIG_USB_NET_MCS7830=m -CONFIG_USB_NET_RNDIS_HOST=m -CONFIG_USB_NET_CDC_SUBSET_ENABLE=m -CONFIG_USB_NET_CDC_SUBSET=m 
-CONFIG_USB_ALI_M5632=y -CONFIG_USB_AN2720=y -CONFIG_USB_BELKIN=y -CONFIG_USB_ARMLINUX=y -CONFIG_USB_EPSON2888=y -CONFIG_USB_KC2190=y -CONFIG_USB_NET_ZAURUS=m -CONFIG_USB_NET_CX82310_ETH=m -CONFIG_USB_NET_KALMIA=m -CONFIG_USB_NET_QMI_WWAN=m -CONFIG_USB_HSO=m -CONFIG_USB_NET_INT51X1=m -CONFIG_USB_CDC_PHONET=m -CONFIG_USB_IPHETH=m -CONFIG_USB_SIERRA_NET=m -CONFIG_USB_VL600=m -CONFIG_USB_NET_CH9200=m -CONFIG_USB_NET_AQC111=m -CONFIG_WLAN=y -# CONFIG_WIRELESS_WDS is not set -CONFIG_WLAN_VENDOR_ADMTEK=y -CONFIG_ADM8211=m -CONFIG_ATH_COMMON=m -CONFIG_WLAN_VENDOR_ATH=y -# CONFIG_ATH_DEBUG is not set -CONFIG_ATH5K=m -CONFIG_ATH5K_DEBUG=y -CONFIG_ATH5K_TRACER=y -CONFIG_ATH5K_PCI=y -CONFIG_ATH9K_HW=m -CONFIG_ATH9K_COMMON=m -CONFIG_ATH9K_COMMON_DEBUG=y -CONFIG_ATH9K_BTCOEX_SUPPORT=y -CONFIG_ATH9K=m -CONFIG_ATH9K_PCI=y -CONFIG_ATH9K_AHB=y -CONFIG_ATH9K_DEBUGFS=y -CONFIG_ATH9K_STATION_STATISTICS=y -CONFIG_ATH9K_DYNACK=y -CONFIG_ATH9K_WOW=y -CONFIG_ATH9K_RFKILL=y -CONFIG_ATH9K_CHANNEL_CONTEXT=y -CONFIG_ATH9K_PCOEM=y -CONFIG_ATH9K_PCI_NO_EEPROM=m -CONFIG_ATH9K_HTC=m -CONFIG_ATH9K_HTC_DEBUGFS=y -CONFIG_ATH9K_HWRNG=y -CONFIG_ATH9K_COMMON_SPECTRAL=y -CONFIG_CARL9170=m -CONFIG_CARL9170_LEDS=y -CONFIG_CARL9170_DEBUGFS=y -CONFIG_CARL9170_WPC=y -# CONFIG_CARL9170_HWRNG is not set -CONFIG_ATH6KL=m -CONFIG_ATH6KL_SDIO=m -CONFIG_ATH6KL_USB=m -CONFIG_ATH6KL_DEBUG=y -CONFIG_ATH6KL_TRACING=y -CONFIG_AR5523=m -CONFIG_WIL6210=m -CONFIG_WIL6210_ISR_COR=y -CONFIG_WIL6210_TRACING=y -CONFIG_WIL6210_DEBUGFS=y -CONFIG_ATH10K=m -CONFIG_ATH10K_CE=y -CONFIG_ATH10K_PCI=m -CONFIG_ATH10K_AHB=y -CONFIG_ATH10K_SDIO=m -CONFIG_ATH10K_USB=m -CONFIG_ATH10K_DEBUG=y -CONFIG_ATH10K_DEBUGFS=y -CONFIG_ATH10K_SPECTRAL=y -CONFIG_ATH10K_TRACING=y -CONFIG_WCN36XX=m -CONFIG_WCN36XX_DEBUGFS=y -CONFIG_WLAN_VENDOR_ATMEL=y -CONFIG_ATMEL=m -CONFIG_PCI_ATMEL=m -CONFIG_PCMCIA_ATMEL=m -CONFIG_AT76C50X_USB=m -CONFIG_WLAN_VENDOR_BROADCOM=y -CONFIG_B43=m -CONFIG_B43_BCMA=y -CONFIG_B43_SSB=y -CONFIG_B43_BUSES_BCMA_AND_SSB=y -# 
CONFIG_B43_BUSES_BCMA is not set -# CONFIG_B43_BUSES_SSB is not set -CONFIG_B43_PCI_AUTOSELECT=y -CONFIG_B43_PCICORE_AUTOSELECT=y -CONFIG_B43_SDIO=y -CONFIG_B43_BCMA_PIO=y -CONFIG_B43_PIO=y -CONFIG_B43_PHY_G=y -CONFIG_B43_PHY_N=y -CONFIG_B43_PHY_LP=y -CONFIG_B43_PHY_HT=y -CONFIG_B43_LEDS=y -CONFIG_B43_HWRNG=y -# CONFIG_B43_DEBUG is not set -CONFIG_B43LEGACY=m -CONFIG_B43LEGACY_PCI_AUTOSELECT=y -CONFIG_B43LEGACY_PCICORE_AUTOSELECT=y -CONFIG_B43LEGACY_LEDS=y -CONFIG_B43LEGACY_HWRNG=y -CONFIG_B43LEGACY_DEBUG=y -CONFIG_B43LEGACY_DMA=y -CONFIG_B43LEGACY_PIO=y -CONFIG_B43LEGACY_DMA_AND_PIO_MODE=y -# CONFIG_B43LEGACY_DMA_MODE is not set -# CONFIG_B43LEGACY_PIO_MODE is not set -CONFIG_BRCMUTIL=m -CONFIG_BRCMSMAC=m -CONFIG_BRCMFMAC=m -CONFIG_BRCMFMAC_PROTO_BCDC=y -CONFIG_BRCMFMAC_PROTO_MSGBUF=y -CONFIG_BRCMFMAC_SDIO=y -CONFIG_BRCMFMAC_USB=y -CONFIG_BRCMFMAC_PCIE=y -CONFIG_BRCM_TRACING=y -CONFIG_BRCMDBG=y -CONFIG_WLAN_VENDOR_CISCO=y -CONFIG_AIRO=m -CONFIG_AIRO_CS=m -CONFIG_WLAN_VENDOR_INTEL=y -CONFIG_IPW2100=m -CONFIG_IPW2100_MONITOR=y -# CONFIG_IPW2100_DEBUG is not set -CONFIG_IPW2200=m -CONFIG_IPW2200_MONITOR=y -CONFIG_IPW2200_RADIOTAP=y -CONFIG_IPW2200_PROMISCUOUS=y -CONFIG_IPW2200_QOS=y -# CONFIG_IPW2200_DEBUG is not set -CONFIG_LIBIPW=m -# CONFIG_LIBIPW_DEBUG is not set -CONFIG_IWLEGACY=m -CONFIG_IWL4965=m -CONFIG_IWL3945=m - -# -# iwl3945 / iwl4965 Debugging Options -# -CONFIG_IWLEGACY_DEBUG=y -CONFIG_IWLEGACY_DEBUGFS=y -# end of iwl3945 / iwl4965 Debugging Options - -CONFIG_IWLWIFI=m -CONFIG_IWLWIFI_LEDS=y -CONFIG_IWLDVM=m -CONFIG_IWLMVM=m -CONFIG_IWLWIFI_OPMODE_MODULAR=y -# CONFIG_IWLWIFI_BCAST_FILTERING is not set - -# -# Debugging Options -# -CONFIG_IWLWIFI_DEBUG=y -CONFIG_IWLWIFI_DEBUGFS=y -CONFIG_IWLWIFI_DEVICE_TRACING=y -# end of Debugging Options - -CONFIG_WLAN_VENDOR_INTERSIL=y -CONFIG_HOSTAP=m -CONFIG_HOSTAP_FIRMWARE=y -CONFIG_HOSTAP_FIRMWARE_NVRAM=y -CONFIG_HOSTAP_PLX=m -CONFIG_HOSTAP_PCI=m -CONFIG_HOSTAP_CS=m -CONFIG_HERMES=m -CONFIG_HERMES_PRISM=y 
-CONFIG_HERMES_CACHE_FW_ON_INIT=y -CONFIG_PLX_HERMES=m -CONFIG_TMD_HERMES=m -CONFIG_NORTEL_HERMES=m -CONFIG_PCI_HERMES=m -CONFIG_PCMCIA_HERMES=m -CONFIG_PCMCIA_SPECTRUM=m -CONFIG_ORINOCO_USB=m -CONFIG_P54_COMMON=m -CONFIG_P54_USB=m -CONFIG_P54_PCI=m -CONFIG_P54_SPI=m -# CONFIG_P54_SPI_DEFAULT_EEPROM is not set -CONFIG_P54_LEDS=y -CONFIG_PRISM54=m -CONFIG_WLAN_VENDOR_MARVELL=y -CONFIG_LIBERTAS=m -CONFIG_LIBERTAS_USB=m -CONFIG_LIBERTAS_CS=m -CONFIG_LIBERTAS_SDIO=m -CONFIG_LIBERTAS_SPI=m -# CONFIG_LIBERTAS_DEBUG is not set -CONFIG_LIBERTAS_MESH=y -CONFIG_LIBERTAS_THINFIRM=m -# CONFIG_LIBERTAS_THINFIRM_DEBUG is not set -CONFIG_LIBERTAS_THINFIRM_USB=m -CONFIG_MWIFIEX=m -CONFIG_MWIFIEX_SDIO=m -CONFIG_MWIFIEX_PCIE=m -CONFIG_MWIFIEX_USB=m -CONFIG_MWL8K=m -CONFIG_WLAN_VENDOR_MEDIATEK=y -CONFIG_MT7601U=m -CONFIG_MT76_CORE=m -CONFIG_MT76_LEDS=y -CONFIG_MT76_USB=m -CONFIG_MT76x02_LIB=m -CONFIG_MT76x02_USB=m -CONFIG_MT76x0_COMMON=m -CONFIG_MT76x0U=m -CONFIG_MT76x0E=m -CONFIG_MT76x2_COMMON=m -CONFIG_MT76x2E=m -CONFIG_MT76x2U=m -CONFIG_MT7603E=m -CONFIG_MT7615E=m -CONFIG_WLAN_VENDOR_RALINK=y -CONFIG_RT2X00=m -CONFIG_RT2400PCI=m -CONFIG_RT2500PCI=m -CONFIG_RT61PCI=m -CONFIG_RT2800PCI=m -CONFIG_RT2800PCI_RT33XX=y -CONFIG_RT2800PCI_RT35XX=y -CONFIG_RT2800PCI_RT53XX=y -CONFIG_RT2800PCI_RT3290=y -CONFIG_RT2500USB=m -CONFIG_RT73USB=m -CONFIG_RT2800USB=m -CONFIG_RT2800USB_RT33XX=y -CONFIG_RT2800USB_RT35XX=y -CONFIG_RT2800USB_RT3573=y -CONFIG_RT2800USB_RT53XX=y -CONFIG_RT2800USB_RT55XX=y -CONFIG_RT2800USB_UNKNOWN=y -CONFIG_RT2800_LIB=m -CONFIG_RT2800_LIB_MMIO=m -CONFIG_RT2X00_LIB_MMIO=m -CONFIG_RT2X00_LIB_PCI=m -CONFIG_RT2X00_LIB_USB=m -CONFIG_RT2X00_LIB=m -CONFIG_RT2X00_LIB_FIRMWARE=y -CONFIG_RT2X00_LIB_CRYPTO=y -CONFIG_RT2X00_LIB_LEDS=y -CONFIG_RT2X00_LIB_DEBUGFS=y -# CONFIG_RT2X00_DEBUG is not set -CONFIG_WLAN_VENDOR_REALTEK=y -CONFIG_RTL8180=m -CONFIG_RTL8187=m -CONFIG_RTL8187_LEDS=y -CONFIG_RTL_CARDS=m -CONFIG_RTL8192CE=m -CONFIG_RTL8192SE=m -CONFIG_RTL8192DE=m -CONFIG_RTL8723AE=m 
-CONFIG_RTL8723BE=m -CONFIG_RTL8188EE=m -CONFIG_RTL8192EE=m -CONFIG_RTL8821AE=m -CONFIG_RTL8192CU=m -CONFIG_RTLWIFI=m -CONFIG_RTLWIFI_PCI=m -CONFIG_RTLWIFI_USB=m -CONFIG_RTLWIFI_DEBUG=y -CONFIG_RTL8192C_COMMON=m -CONFIG_RTL8723_COMMON=m -CONFIG_RTLBTCOEXIST=m -CONFIG_RTL8XXXU=m -CONFIG_RTL8XXXU_UNTESTED=y -CONFIG_RTW88=m -CONFIG_RTW88_CORE=m -CONFIG_RTW88_PCI=m -CONFIG_RTW88_8822BE=y -CONFIG_RTW88_8822CE=y -CONFIG_RTW88_DEBUG=y -CONFIG_RTW88_DEBUGFS=y -CONFIG_WLAN_VENDOR_RSI=y -CONFIG_RSI_91X=m -CONFIG_RSI_DEBUGFS=y -CONFIG_RSI_SDIO=m -CONFIG_RSI_USB=m -CONFIG_RSI_COEX=y -CONFIG_WLAN_VENDOR_ST=y -CONFIG_CW1200=m -CONFIG_CW1200_WLAN_SDIO=m -CONFIG_CW1200_WLAN_SPI=m -CONFIG_WLAN_VENDOR_TI=y -CONFIG_WL1251=m -CONFIG_WL1251_SPI=m -CONFIG_WL1251_SDIO=m -CONFIG_WL12XX=m -CONFIG_WL18XX=m -CONFIG_WLCORE=m -CONFIG_WLCORE_SPI=m -CONFIG_WLCORE_SDIO=m -CONFIG_WILINK_PLATFORM_DATA=y -CONFIG_WLAN_VENDOR_ZYDAS=y -CONFIG_USB_ZD1201=m -CONFIG_ZD1211RW=m -# CONFIG_ZD1211RW_DEBUG is not set -CONFIG_WLAN_VENDOR_QUANTENNA=y -CONFIG_QTNFMAC=m -CONFIG_QTNFMAC_PCIE=m -CONFIG_PCMCIA_RAYCS=m -CONFIG_PCMCIA_WL3501=m -CONFIG_MAC80211_HWSIM=m -CONFIG_USB_NET_RNDIS_WLAN=m -CONFIG_VIRT_WIFI=m - -# -# WiMAX Wireless Broadband devices -# -CONFIG_WIMAX_I2400M=m -CONFIG_WIMAX_I2400M_USB=m -CONFIG_WIMAX_I2400M_DEBUG_LEVEL=8 -# end of WiMAX Wireless Broadband devices - -# CONFIG_WAN is not set -CONFIG_IEEE802154_DRIVERS=m -CONFIG_IEEE802154_FAKELB=m -CONFIG_IEEE802154_AT86RF230=m -# CONFIG_IEEE802154_AT86RF230_DEBUGFS is not set -CONFIG_IEEE802154_MRF24J40=m -CONFIG_IEEE802154_CC2520=m -CONFIG_IEEE802154_ATUSB=m -CONFIG_IEEE802154_ADF7242=m -CONFIG_IEEE802154_CA8210=m -# CONFIG_IEEE802154_CA8210_DEBUGFS is not set -CONFIG_IEEE802154_MCR20A=m -CONFIG_IEEE802154_HWSIM=m -CONFIG_XEN_NETDEV_FRONTEND=m -CONFIG_XEN_NETDEV_BACKEND=m -CONFIG_VMXNET3=m -CONFIG_FUJITSU_ES=m -CONFIG_USB4_NET=m -CONFIG_HYPERV_NET=m -CONFIG_NETDEVSIM=m -CONFIG_NET_FAILOVER=m -CONFIG_ISDN=y -CONFIG_ISDN_CAPI=y -CONFIG_CAPI_TRACE=y 
-CONFIG_ISDN_CAPI_MIDDLEWARE=y -CONFIG_MISDN=m -CONFIG_MISDN_DSP=m -CONFIG_MISDN_L1OIP=m - -# -# mISDN hardware drivers -# -CONFIG_MISDN_HFCPCI=m -CONFIG_MISDN_HFCMULTI=m -CONFIG_MISDN_HFCUSB=m -CONFIG_MISDN_AVMFRITZ=m -CONFIG_MISDN_SPEEDFAX=m -CONFIG_MISDN_INFINEON=m -CONFIG_MISDN_W6692=m -CONFIG_MISDN_NETJET=m -CONFIG_MISDN_HDLC=m -CONFIG_MISDN_IPAC=m -CONFIG_MISDN_ISAR=m -CONFIG_NVM=y -CONFIG_NVM_PBLK=m -# CONFIG_NVM_PBLK_DEBUG is not set - -# -# Input device support -# -CONFIG_INPUT=y -CONFIG_INPUT_LEDS=m -CONFIG_INPUT_FF_MEMLESS=m -CONFIG_INPUT_POLLDEV=m -CONFIG_INPUT_SPARSEKMAP=m -CONFIG_INPUT_MATRIXKMAP=m - -# -# Userland interfaces -# -CONFIG_INPUT_MOUSEDEV=m -CONFIG_INPUT_MOUSEDEV_PSAUX=y -CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024 -CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768 -CONFIG_INPUT_JOYDEV=m -CONFIG_INPUT_EVDEV=m -# CONFIG_INPUT_EVBUG is not set - -# -# Input Device Drivers -# -CONFIG_INPUT_KEYBOARD=y -CONFIG_KEYBOARD_ADC=m -CONFIG_KEYBOARD_ADP5520=m -CONFIG_KEYBOARD_ADP5588=m -CONFIG_KEYBOARD_ADP5589=m -CONFIG_KEYBOARD_APPLESPI=m -CONFIG_KEYBOARD_ATKBD=m -CONFIG_KEYBOARD_QT1050=m -CONFIG_KEYBOARD_QT1070=m -CONFIG_KEYBOARD_QT2160=m -CONFIG_KEYBOARD_DLINK_DIR685=m -CONFIG_KEYBOARD_LKKBD=m -CONFIG_KEYBOARD_GPIO=m -CONFIG_KEYBOARD_GPIO_POLLED=m -CONFIG_KEYBOARD_TCA6416=m -CONFIG_KEYBOARD_TCA8418=m -CONFIG_KEYBOARD_MATRIX=m -CONFIG_KEYBOARD_LM8323=m -CONFIG_KEYBOARD_LM8333=m -CONFIG_KEYBOARD_MAX7359=m -CONFIG_KEYBOARD_MCS=m -CONFIG_KEYBOARD_MPR121=m -CONFIG_KEYBOARD_NEWTON=m -CONFIG_KEYBOARD_OPENCORES=m -CONFIG_KEYBOARD_SAMSUNG=m -CONFIG_KEYBOARD_STOWAWAY=m -CONFIG_KEYBOARD_SUNKBD=m -CONFIG_KEYBOARD_STMPE=m -CONFIG_KEYBOARD_IQS62X=m -CONFIG_KEYBOARD_OMAP4=m -CONFIG_KEYBOARD_TC3589X=m -CONFIG_KEYBOARD_TM2_TOUCHKEY=m -CONFIG_KEYBOARD_TWL4030=m -CONFIG_KEYBOARD_XTKBD=m -CONFIG_KEYBOARD_CROS_EC=m -CONFIG_KEYBOARD_CAP11XX=m -CONFIG_KEYBOARD_BCM=m -CONFIG_KEYBOARD_MTK_PMIC=m -CONFIG_INPUT_MOUSE=y -CONFIG_MOUSE_PS2=m -CONFIG_MOUSE_PS2_ALPS=y -CONFIG_MOUSE_PS2_BYD=y 
-CONFIG_MOUSE_PS2_LOGIPS2PP=y -CONFIG_MOUSE_PS2_SYNAPTICS=y -CONFIG_MOUSE_PS2_SYNAPTICS_SMBUS=y -CONFIG_MOUSE_PS2_CYPRESS=y -CONFIG_MOUSE_PS2_LIFEBOOK=y -CONFIG_MOUSE_PS2_TRACKPOINT=y -CONFIG_MOUSE_PS2_ELANTECH=y -CONFIG_MOUSE_PS2_ELANTECH_SMBUS=y -CONFIG_MOUSE_PS2_SENTELIC=y -CONFIG_MOUSE_PS2_TOUCHKIT=y -CONFIG_MOUSE_PS2_FOCALTECH=y -CONFIG_MOUSE_PS2_VMMOUSE=y -CONFIG_MOUSE_PS2_SMBUS=y -CONFIG_MOUSE_SERIAL=m -CONFIG_MOUSE_APPLETOUCH=m -CONFIG_MOUSE_BCM5974=m -CONFIG_MOUSE_CYAPA=m -CONFIG_MOUSE_ELAN_I2C=m -CONFIG_MOUSE_ELAN_I2C_I2C=y -CONFIG_MOUSE_ELAN_I2C_SMBUS=y -CONFIG_MOUSE_VSXXXAA=m -CONFIG_MOUSE_GPIO=m -CONFIG_MOUSE_SYNAPTICS_I2C=m -CONFIG_MOUSE_SYNAPTICS_USB=m -CONFIG_INPUT_JOYSTICK=y -CONFIG_JOYSTICK_ANALOG=m -CONFIG_JOYSTICK_A3D=m -CONFIG_JOYSTICK_ADI=m -CONFIG_JOYSTICK_COBRA=m -CONFIG_JOYSTICK_GF2K=m -CONFIG_JOYSTICK_GRIP=m -CONFIG_JOYSTICK_GRIP_MP=m -CONFIG_JOYSTICK_GUILLEMOT=m -CONFIG_JOYSTICK_INTERACT=m -CONFIG_JOYSTICK_SIDEWINDER=m -CONFIG_JOYSTICK_TMDC=m -CONFIG_JOYSTICK_IFORCE=m -CONFIG_JOYSTICK_IFORCE_USB=m -CONFIG_JOYSTICK_IFORCE_232=m -CONFIG_JOYSTICK_WARRIOR=m -CONFIG_JOYSTICK_MAGELLAN=m -CONFIG_JOYSTICK_SPACEORB=m -CONFIG_JOYSTICK_SPACEBALL=m -CONFIG_JOYSTICK_STINGER=m -CONFIG_JOYSTICK_TWIDJOY=m -CONFIG_JOYSTICK_ZHENHUA=m -CONFIG_JOYSTICK_DB9=m -CONFIG_JOYSTICK_GAMECON=m -CONFIG_JOYSTICK_TURBOGRAFX=m -CONFIG_JOYSTICK_AS5011=m -CONFIG_JOYSTICK_JOYDUMP=m -CONFIG_JOYSTICK_XPAD=m -CONFIG_JOYSTICK_XPAD_FF=y -CONFIG_JOYSTICK_XPAD_LEDS=y -CONFIG_JOYSTICK_WALKERA0701=m -CONFIG_JOYSTICK_PSXPAD_SPI=m -CONFIG_JOYSTICK_PSXPAD_SPI_FF=y -CONFIG_JOYSTICK_PXRC=m -CONFIG_JOYSTICK_FSIA6B=m -CONFIG_INPUT_TABLET=y -CONFIG_TABLET_USB_ACECAD=m -CONFIG_TABLET_USB_AIPTEK=m -CONFIG_TABLET_USB_GTCO=m -CONFIG_TABLET_USB_HANWANG=m -CONFIG_TABLET_USB_KBTAB=m -CONFIG_TABLET_USB_PEGASUS=m -CONFIG_TABLET_SERIAL_WACOM4=m -CONFIG_INPUT_TOUCHSCREEN=y -CONFIG_TOUCHSCREEN_PROPERTIES=y -CONFIG_TOUCHSCREEN_88PM860X=m -CONFIG_TOUCHSCREEN_ADS7846=m -CONFIG_TOUCHSCREEN_AD7877=m 
-CONFIG_TOUCHSCREEN_AD7879=m -CONFIG_TOUCHSCREEN_AD7879_I2C=m -CONFIG_TOUCHSCREEN_AD7879_SPI=m -CONFIG_TOUCHSCREEN_ADC=m -CONFIG_TOUCHSCREEN_AR1021_I2C=m -CONFIG_TOUCHSCREEN_ATMEL_MXT=m -CONFIG_TOUCHSCREEN_ATMEL_MXT_T37=y -CONFIG_TOUCHSCREEN_AUO_PIXCIR=m -CONFIG_TOUCHSCREEN_BU21013=m -CONFIG_TOUCHSCREEN_BU21029=m -CONFIG_TOUCHSCREEN_CHIPONE_ICN8318=m -CONFIG_TOUCHSCREEN_CHIPONE_ICN8505=m -CONFIG_TOUCHSCREEN_CY8CTMG110=m -CONFIG_TOUCHSCREEN_CYTTSP_CORE=m -CONFIG_TOUCHSCREEN_CYTTSP_I2C=m -CONFIG_TOUCHSCREEN_CYTTSP_SPI=m -CONFIG_TOUCHSCREEN_CYTTSP4_CORE=m -CONFIG_TOUCHSCREEN_CYTTSP4_I2C=m -CONFIG_TOUCHSCREEN_CYTTSP4_SPI=m -CONFIG_TOUCHSCREEN_DA9034=m -CONFIG_TOUCHSCREEN_DA9052=m -CONFIG_TOUCHSCREEN_DYNAPRO=m -CONFIG_TOUCHSCREEN_HAMPSHIRE=m -CONFIG_TOUCHSCREEN_EETI=m -CONFIG_TOUCHSCREEN_EGALAX=m -CONFIG_TOUCHSCREEN_EGALAX_SERIAL=m -CONFIG_TOUCHSCREEN_EXC3000=m -CONFIG_TOUCHSCREEN_FUJITSU=m -CONFIG_TOUCHSCREEN_GOODIX=m -CONFIG_TOUCHSCREEN_HIDEEP=m -CONFIG_TOUCHSCREEN_ILI210X=m -CONFIG_TOUCHSCREEN_S6SY761=m -CONFIG_TOUCHSCREEN_GUNZE=m -CONFIG_TOUCHSCREEN_EKTF2127=m -CONFIG_TOUCHSCREEN_ELAN=m -CONFIG_TOUCHSCREEN_ELO=m -CONFIG_TOUCHSCREEN_WACOM_W8001=m -CONFIG_TOUCHSCREEN_WACOM_I2C=m -CONFIG_TOUCHSCREEN_MAX11801=m -CONFIG_TOUCHSCREEN_MCS5000=m -CONFIG_TOUCHSCREEN_MMS114=m -CONFIG_TOUCHSCREEN_MELFAS_MIP4=m -CONFIG_TOUCHSCREEN_MTOUCH=m -CONFIG_TOUCHSCREEN_IMX6UL_TSC=m -CONFIG_TOUCHSCREEN_INEXIO=m -CONFIG_TOUCHSCREEN_MK712=m -CONFIG_TOUCHSCREEN_PENMOUNT=m -CONFIG_TOUCHSCREEN_EDT_FT5X06=m -CONFIG_TOUCHSCREEN_TOUCHRIGHT=m -CONFIG_TOUCHSCREEN_TOUCHWIN=m -CONFIG_TOUCHSCREEN_TI_AM335X_TSC=m -CONFIG_TOUCHSCREEN_UCB1400=m -CONFIG_TOUCHSCREEN_PIXCIR=m -CONFIG_TOUCHSCREEN_WDT87XX_I2C=m -CONFIG_TOUCHSCREEN_WM831X=m -CONFIG_TOUCHSCREEN_WM97XX=m -CONFIG_TOUCHSCREEN_WM9705=y -CONFIG_TOUCHSCREEN_WM9712=y -CONFIG_TOUCHSCREEN_WM9713=y -CONFIG_TOUCHSCREEN_USB_COMPOSITE=m -CONFIG_TOUCHSCREEN_MC13783=m -CONFIG_TOUCHSCREEN_USB_EGALAX=y -CONFIG_TOUCHSCREEN_USB_PANJIT=y 
-CONFIG_TOUCHSCREEN_USB_3M=y -CONFIG_TOUCHSCREEN_USB_ITM=y -CONFIG_TOUCHSCREEN_USB_ETURBO=y -CONFIG_TOUCHSCREEN_USB_GUNZE=y -CONFIG_TOUCHSCREEN_USB_DMC_TSC10=y -CONFIG_TOUCHSCREEN_USB_IRTOUCH=y -CONFIG_TOUCHSCREEN_USB_IDEALTEK=y -CONFIG_TOUCHSCREEN_USB_GENERAL_TOUCH=y -CONFIG_TOUCHSCREEN_USB_GOTOP=y -CONFIG_TOUCHSCREEN_USB_JASTEC=y -CONFIG_TOUCHSCREEN_USB_ELO=y -CONFIG_TOUCHSCREEN_USB_E2I=y -CONFIG_TOUCHSCREEN_USB_ZYTRONIC=y -CONFIG_TOUCHSCREEN_USB_ETT_TC45USB=y -CONFIG_TOUCHSCREEN_USB_NEXIO=y -CONFIG_TOUCHSCREEN_USB_EASYTOUCH=y -CONFIG_TOUCHSCREEN_TOUCHIT213=m -CONFIG_TOUCHSCREEN_TSC_SERIO=m -CONFIG_TOUCHSCREEN_TSC200X_CORE=m -CONFIG_TOUCHSCREEN_TSC2004=m -CONFIG_TOUCHSCREEN_TSC2005=m -CONFIG_TOUCHSCREEN_TSC2007=m -CONFIG_TOUCHSCREEN_TSC2007_IIO=y -CONFIG_TOUCHSCREEN_PCAP=m -CONFIG_TOUCHSCREEN_RM_TS=m -CONFIG_TOUCHSCREEN_SILEAD=m -CONFIG_TOUCHSCREEN_SIS_I2C=m -CONFIG_TOUCHSCREEN_ST1232=m -CONFIG_TOUCHSCREEN_STMFTS=m -CONFIG_TOUCHSCREEN_STMPE=m -CONFIG_TOUCHSCREEN_SUR40=m -CONFIG_TOUCHSCREEN_SURFACE3_SPI=m -CONFIG_TOUCHSCREEN_SX8654=m -CONFIG_TOUCHSCREEN_TPS6507X=m -CONFIG_TOUCHSCREEN_ZET6223=m -CONFIG_TOUCHSCREEN_ZFORCE=m -CONFIG_TOUCHSCREEN_COLIBRI_VF50=m -CONFIG_TOUCHSCREEN_ROHM_BU21023=m -CONFIG_TOUCHSCREEN_IQS5XX=m -CONFIG_INPUT_MISC=y -CONFIG_INPUT_88PM860X_ONKEY=m -CONFIG_INPUT_88PM80X_ONKEY=m -CONFIG_INPUT_AD714X=m -CONFIG_INPUT_AD714X_I2C=m -CONFIG_INPUT_AD714X_SPI=m -CONFIG_INPUT_ARIZONA_HAPTICS=m -CONFIG_INPUT_ATMEL_CAPTOUCH=m -CONFIG_INPUT_BMA150=m -CONFIG_INPUT_E3X0_BUTTON=m -CONFIG_INPUT_MSM_VIBRATOR=m -CONFIG_INPUT_PCSPKR=m -CONFIG_INPUT_MAX77650_ONKEY=m -CONFIG_INPUT_MAX77693_HAPTIC=m -CONFIG_INPUT_MAX8925_ONKEY=m -CONFIG_INPUT_MAX8997_HAPTIC=m -CONFIG_INPUT_MC13783_PWRBUTTON=m -CONFIG_INPUT_MMA8450=m -CONFIG_INPUT_APANEL=m -CONFIG_INPUT_GP2A=m -CONFIG_INPUT_GPIO_BEEPER=m -CONFIG_INPUT_GPIO_DECODER=m -CONFIG_INPUT_GPIO_VIBRA=m -CONFIG_INPUT_CPCAP_PWRBUTTON=m -CONFIG_INPUT_ATLAS_BTNS=m -CONFIG_INPUT_ATI_REMOTE2=m -CONFIG_INPUT_KEYSPAN_REMOTE=m 
-CONFIG_INPUT_KXTJ9=m -CONFIG_INPUT_POWERMATE=m -CONFIG_INPUT_YEALINK=m -CONFIG_INPUT_CM109=m -CONFIG_INPUT_REGULATOR_HAPTIC=m -CONFIG_INPUT_RETU_PWRBUTTON=m -CONFIG_INPUT_TPS65218_PWRBUTTON=m -CONFIG_INPUT_AXP20X_PEK=m -CONFIG_INPUT_TWL4030_PWRBUTTON=m -CONFIG_INPUT_TWL4030_VIBRA=m -CONFIG_INPUT_TWL6040_VIBRA=m -CONFIG_INPUT_UINPUT=m -CONFIG_INPUT_PALMAS_PWRBUTTON=m -CONFIG_INPUT_PCF50633_PMU=m -CONFIG_INPUT_PCF8574=m -CONFIG_INPUT_PWM_BEEPER=m -CONFIG_INPUT_PWM_VIBRA=m -CONFIG_INPUT_RK805_PWRKEY=m -CONFIG_INPUT_GPIO_ROTARY_ENCODER=m -CONFIG_INPUT_DA9052_ONKEY=m -CONFIG_INPUT_DA9055_ONKEY=m -CONFIG_INPUT_DA9063_ONKEY=m -CONFIG_INPUT_WM831X_ON=m -CONFIG_INPUT_PCAP=m -CONFIG_INPUT_ADXL34X=m -CONFIG_INPUT_ADXL34X_I2C=m -CONFIG_INPUT_ADXL34X_SPI=m -CONFIG_INPUT_IMS_PCU=m -CONFIG_INPUT_CMA3000=m -CONFIG_INPUT_CMA3000_I2C=m -CONFIG_INPUT_XEN_KBDDEV_FRONTEND=m -CONFIG_INPUT_IDEAPAD_SLIDEBAR=m -CONFIG_INPUT_SOC_BUTTON_ARRAY=m -CONFIG_INPUT_DRV260X_HAPTICS=m -CONFIG_INPUT_DRV2665_HAPTICS=m -CONFIG_INPUT_DRV2667_HAPTICS=m -CONFIG_INPUT_RAVE_SP_PWRBUTTON=m -CONFIG_INPUT_STPMIC1_ONKEY=m -CONFIG_RMI4_CORE=m -CONFIG_RMI4_I2C=m -CONFIG_RMI4_SPI=m -CONFIG_RMI4_SMB=m -CONFIG_RMI4_F03=y -CONFIG_RMI4_F03_SERIO=m -CONFIG_RMI4_2D_SENSOR=y -CONFIG_RMI4_F11=y -CONFIG_RMI4_F12=y -CONFIG_RMI4_F30=y -CONFIG_RMI4_F34=y -# CONFIG_RMI4_F54 is not set -CONFIG_RMI4_F55=y - -# -# Hardware I/O ports -# -CONFIG_SERIO=m -CONFIG_ARCH_MIGHT_HAVE_PC_SERIO=y -CONFIG_SERIO_I8042=m -CONFIG_SERIO_SERPORT=m -CONFIG_SERIO_CT82C710=m -CONFIG_SERIO_PARKBD=m -CONFIG_SERIO_PCIPS2=m -CONFIG_SERIO_LIBPS2=m -CONFIG_SERIO_RAW=m -CONFIG_SERIO_ALTERA_PS2=m -CONFIG_SERIO_PS2MULT=m -CONFIG_SERIO_ARC_PS2=m -# CONFIG_SERIO_APBPS2 is not set -CONFIG_HYPERV_KEYBOARD=m -CONFIG_SERIO_GPIO_PS2=m -CONFIG_USERIO=m -CONFIG_GAMEPORT=m -CONFIG_GAMEPORT_NS558=m -CONFIG_GAMEPORT_L4=m -CONFIG_GAMEPORT_EMU10K1=m -CONFIG_GAMEPORT_FM801=m -# end of Hardware I/O ports -# end of Input device support - -# -# Character devices -# 
-CONFIG_TTY=y -CONFIG_VT=y -CONFIG_CONSOLE_TRANSLATIONS=y -CONFIG_VT_CONSOLE=y -CONFIG_VT_CONSOLE_SLEEP=y -CONFIG_HW_CONSOLE=y -CONFIG_VT_HW_CONSOLE_BINDING=y -CONFIG_UNIX98_PTYS=y -# CONFIG_LEGACY_PTYS is not set -CONFIG_LDISC_AUTOLOAD=y - -# -# Serial drivers -# -CONFIG_SERIAL_EARLYCON=y -CONFIG_SERIAL_8250=y -# CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set -CONFIG_SERIAL_8250_PNP=y -# CONFIG_SERIAL_8250_16550A_VARIANTS is not set -CONFIG_SERIAL_8250_FINTEK=y -CONFIG_SERIAL_8250_CONSOLE=y -CONFIG_SERIAL_8250_DMA=y -CONFIG_SERIAL_8250_PCI=y -CONFIG_SERIAL_8250_EXAR=m -CONFIG_SERIAL_8250_CS=m -CONFIG_SERIAL_8250_MEN_MCB=m -CONFIG_SERIAL_8250_NR_UARTS=32 -CONFIG_SERIAL_8250_RUNTIME_UARTS=4 -CONFIG_SERIAL_8250_EXTENDED=y -CONFIG_SERIAL_8250_MANY_PORTS=y -CONFIG_SERIAL_8250_ASPEED_VUART=m -CONFIG_SERIAL_8250_SHARE_IRQ=y -# CONFIG_SERIAL_8250_DETECT_IRQ is not set -CONFIG_SERIAL_8250_RSA=y -CONFIG_SERIAL_8250_DWLIB=y -CONFIG_SERIAL_8250_DW=m -CONFIG_SERIAL_8250_RT288X=y -CONFIG_SERIAL_8250_LPSS=y -CONFIG_SERIAL_8250_MID=y -CONFIG_SERIAL_OF_PLATFORM=m - -# -# Non-8250 serial port support -# -CONFIG_SERIAL_MAX3100=m -CONFIG_SERIAL_MAX310X=m -CONFIG_SERIAL_UARTLITE=m -CONFIG_SERIAL_UARTLITE_NR_UARTS=1 -CONFIG_SERIAL_CORE=y -CONFIG_SERIAL_CORE_CONSOLE=y -CONFIG_SERIAL_JSM=m -CONFIG_SERIAL_SIFIVE=m -CONFIG_SERIAL_SCCNXP=m -CONFIG_SERIAL_SC16IS7XX_CORE=m -CONFIG_SERIAL_SC16IS7XX=m -CONFIG_SERIAL_SC16IS7XX_I2C=y -CONFIG_SERIAL_SC16IS7XX_SPI=y -CONFIG_SERIAL_ALTERA_JTAGUART=m -CONFIG_SERIAL_ALTERA_UART=m -CONFIG_SERIAL_ALTERA_UART_MAXPORTS=4 -CONFIG_SERIAL_ALTERA_UART_BAUDRATE=115200 -CONFIG_SERIAL_IFX6X60=m -CONFIG_SERIAL_XILINX_PS_UART=m -CONFIG_SERIAL_ARC=m -CONFIG_SERIAL_ARC_NR_PORTS=1 -CONFIG_SERIAL_RP2=m -CONFIG_SERIAL_RP2_NR_UARTS=32 -CONFIG_SERIAL_FSL_LPUART=m -CONFIG_SERIAL_FSL_LINFLEXUART=m -CONFIG_SERIAL_CONEXANT_DIGICOLOR=m -CONFIG_SERIAL_MEN_Z135=m -CONFIG_SERIAL_SPRD=m -# end of Serial drivers - -CONFIG_SERIAL_MCTRL_GPIO=y -CONFIG_SERIAL_NONSTANDARD=y 
-CONFIG_ROCKETPORT=m -CONFIG_CYCLADES=m -CONFIG_CYZ_INTR=y -CONFIG_MOXA_INTELLIO=m -CONFIG_MOXA_SMARTIO=m -CONFIG_SYNCLINK=m -CONFIG_SYNCLINKMP=m -CONFIG_SYNCLINK_GT=m -CONFIG_ISI=m -CONFIG_N_HDLC=m -CONFIG_N_GSM=m -CONFIG_NOZOMI=m -CONFIG_NULL_TTY=m -CONFIG_TRACE_ROUTER=m -CONFIG_TRACE_SINK=m -CONFIG_HVC_DRIVER=y -CONFIG_HVC_IRQ=y -CONFIG_HVC_XEN=y -CONFIG_HVC_XEN_FRONTEND=y -CONFIG_SERIAL_DEV_BUS=y -CONFIG_SERIAL_DEV_CTRL_TTYPORT=y -# CONFIG_TTY_PRINTK is not set -CONFIG_PRINTER=m -# CONFIG_LP_CONSOLE is not set -CONFIG_PPDEV=m -CONFIG_VIRTIO_CONSOLE=m -CONFIG_IPMI_HANDLER=m -CONFIG_IPMI_DMI_DECODE=y -CONFIG_IPMI_PLAT_DATA=y -# CONFIG_IPMI_PANIC_EVENT is not set -CONFIG_IPMI_DEVICE_INTERFACE=m -CONFIG_IPMI_SI=m -CONFIG_IPMI_SSIF=m -CONFIG_IPMI_WATCHDOG=m -CONFIG_IPMI_POWEROFF=m -CONFIG_IPMB_DEVICE_INTERFACE=m -CONFIG_HW_RANDOM=m -CONFIG_HW_RANDOM_TIMERIOMEM=m -CONFIG_HW_RANDOM_INTEL=m -CONFIG_HW_RANDOM_AMD=m -CONFIG_HW_RANDOM_VIA=m -CONFIG_HW_RANDOM_VIRTIO=m -CONFIG_APPLICOM=m - -# -# PCMCIA character devices -# -CONFIG_SYNCLINK_CS=m -CONFIG_CARDMAN_4000=m -CONFIG_CARDMAN_4040=m -CONFIG_SCR24X=m -CONFIG_IPWIRELESS=m -# end of PCMCIA character devices - -CONFIG_MWAVE=m -CONFIG_DEVMEM=y -# CONFIG_DEVKMEM is not set -CONFIG_NVRAM=m -CONFIG_RAW_DRIVER=m -CONFIG_MAX_RAW_DEVS=256 -CONFIG_DEVPORT=y -CONFIG_HPET=y -CONFIG_HPET_MMAP=y -CONFIG_HPET_MMAP_DEFAULT=y -CONFIG_HANGCHECK_TIMER=m -CONFIG_TCG_TPM=m -CONFIG_HW_RANDOM_TPM=y -CONFIG_TCG_TIS_CORE=m -CONFIG_TCG_TIS=m -CONFIG_TCG_TIS_SPI=m -CONFIG_TCG_TIS_SPI_CR50=y -CONFIG_TCG_TIS_I2C_ATMEL=m -CONFIG_TCG_TIS_I2C_INFINEON=m -CONFIG_TCG_TIS_I2C_NUVOTON=m -CONFIG_TCG_NSC=m -CONFIG_TCG_ATMEL=m -CONFIG_TCG_INFINEON=m -CONFIG_TCG_XEN=m -CONFIG_TCG_CRB=m -CONFIG_TCG_VTPM_PROXY=m -CONFIG_TCG_TIS_ST33ZP24=m -CONFIG_TCG_TIS_ST33ZP24_I2C=m -CONFIG_TCG_TIS_ST33ZP24_SPI=m -CONFIG_TELCLOCK=m -CONFIG_XILLYBUS=m -CONFIG_XILLYBUS_PCIE=m -CONFIG_XILLYBUS_OF=m -# end of Character devices - -# CONFIG_RANDOM_TRUST_CPU is not set -# 
CONFIG_RANDOM_TRUST_BOOTLOADER is not set - -# -# I2C support -# -CONFIG_I2C=y -CONFIG_ACPI_I2C_OPREGION=y -CONFIG_I2C_BOARDINFO=y -CONFIG_I2C_COMPAT=y -CONFIG_I2C_CHARDEV=m -CONFIG_I2C_MUX=m - -# -# Multiplexer I2C Chip support -# -CONFIG_I2C_ARB_GPIO_CHALLENGE=m -CONFIG_I2C_MUX_GPIO=m -CONFIG_I2C_MUX_GPMUX=m -CONFIG_I2C_MUX_LTC4306=m -CONFIG_I2C_MUX_PCA9541=m -CONFIG_I2C_MUX_PCA954x=m -CONFIG_I2C_MUX_PINCTRL=m -CONFIG_I2C_MUX_REG=m -CONFIG_I2C_DEMUX_PINCTRL=m -CONFIG_I2C_MUX_MLXCPLD=m -# end of Multiplexer I2C Chip support - -CONFIG_I2C_HELPER_AUTO=y -CONFIG_I2C_SMBUS=m -CONFIG_I2C_ALGOBIT=m -CONFIG_I2C_ALGOPCA=m - -# -# I2C Hardware Bus support -# - -# -# PC SMBus host controller drivers -# -CONFIG_I2C_ALI1535=m -CONFIG_I2C_ALI1563=m -CONFIG_I2C_ALI15X3=m -CONFIG_I2C_AMD756=m -CONFIG_I2C_AMD756_S4882=m -CONFIG_I2C_AMD8111=m -CONFIG_I2C_AMD_MP2=m -CONFIG_I2C_I801=m -CONFIG_I2C_ISCH=m -CONFIG_I2C_ISMT=m -CONFIG_I2C_PIIX4=m -CONFIG_I2C_CHT_WC=m -CONFIG_I2C_NFORCE2=m -CONFIG_I2C_NFORCE2_S4985=m -CONFIG_I2C_NVIDIA_GPU=m -CONFIG_I2C_SIS5595=m -CONFIG_I2C_SIS630=m -CONFIG_I2C_SIS96X=m -CONFIG_I2C_VIA=m -CONFIG_I2C_VIAPRO=m - -# -# ACPI drivers -# -CONFIG_I2C_SCMI=m - -# -# I2C system bus drivers (mostly embedded / system-on-chip) -# -CONFIG_I2C_CBUS_GPIO=m -CONFIG_I2C_DESIGNWARE_CORE=y -CONFIG_I2C_DESIGNWARE_PLATFORM=y -CONFIG_I2C_DESIGNWARE_SLAVE=y -CONFIG_I2C_DESIGNWARE_PCI=m -CONFIG_I2C_DESIGNWARE_BAYTRAIL=y -CONFIG_I2C_EMEV2=m -CONFIG_I2C_GPIO=m -# CONFIG_I2C_GPIO_FAULT_INJECTOR is not set -CONFIG_I2C_KEMPLD=m -CONFIG_I2C_OCORES=m -CONFIG_I2C_PCA_PLATFORM=m -CONFIG_I2C_RK3X=m -CONFIG_I2C_SIMTEC=m -CONFIG_I2C_XILINX=m - -# -# External I2C/SMBus adapter drivers -# -CONFIG_I2C_DIOLAN_U2C=m -CONFIG_I2C_DLN2=m -CONFIG_I2C_PARPORT=m -CONFIG_I2C_ROBOTFUZZ_OSIF=m -CONFIG_I2C_TAOS_EVM=m -CONFIG_I2C_TINY_USB=m -CONFIG_I2C_VIPERBOARD=m - -# -# Other I2C/SMBus bus drivers -# -CONFIG_I2C_MLXCPLD=m -CONFIG_I2C_CROS_EC_TUNNEL=m -CONFIG_I2C_FSI=m -# end of I2C Hardware Bus support 
- -CONFIG_I2C_STUB=m -CONFIG_I2C_SLAVE=y -CONFIG_I2C_SLAVE_EEPROM=m -# CONFIG_I2C_DEBUG_CORE is not set -# CONFIG_I2C_DEBUG_ALGO is not set -# CONFIG_I2C_DEBUG_BUS is not set -# end of I2C support - -CONFIG_I3C=m -CONFIG_CDNS_I3C_MASTER=m -CONFIG_DW_I3C_MASTER=m -CONFIG_SPI=y -# CONFIG_SPI_DEBUG is not set -CONFIG_SPI_MASTER=y -CONFIG_SPI_MEM=y - -# -# SPI Master Controller Drivers -# -CONFIG_SPI_ALTERA=m -CONFIG_SPI_AXI_SPI_ENGINE=m -CONFIG_SPI_BITBANG=m -CONFIG_SPI_BUTTERFLY=m -CONFIG_SPI_CADENCE=m -CONFIG_SPI_DESIGNWARE=m -CONFIG_SPI_DW_PCI=m -CONFIG_SPI_DW_MID_DMA=y -CONFIG_SPI_DW_MMIO=m -CONFIG_SPI_DLN2=m -CONFIG_SPI_FSI=m -CONFIG_SPI_NXP_FLEXSPI=m -CONFIG_SPI_GPIO=m -CONFIG_SPI_LM70_LLP=m -CONFIG_SPI_FSL_LIB=m -CONFIG_SPI_FSL_SPI=m -CONFIG_SPI_OC_TINY=m -CONFIG_SPI_PXA2XX=m -CONFIG_SPI_PXA2XX_PCI=m -CONFIG_SPI_ROCKCHIP=m -CONFIG_SPI_SC18IS602=m -CONFIG_SPI_SIFIVE=m -CONFIG_SPI_MXIC=m -CONFIG_SPI_XCOMM=m -CONFIG_SPI_XILINX=m -CONFIG_SPI_ZYNQMP_GQSPI=m - -# -# SPI Multiplexer support -# -CONFIG_SPI_MUX=m - -# -# SPI Protocol Masters -# -CONFIG_SPI_SPIDEV=m -CONFIG_SPI_LOOPBACK_TEST=m -CONFIG_SPI_TLE62X0=m -CONFIG_SPI_SLAVE=y -CONFIG_SPI_SLAVE_TIME=m -CONFIG_SPI_SLAVE_SYSTEM_CONTROL=m -CONFIG_SPMI=m -CONFIG_HSI=m -CONFIG_HSI_BOARDINFO=y - -# -# HSI controllers -# - -# -# HSI clients -# -CONFIG_HSI_CHAR=m -CONFIG_PPS=y -# CONFIG_PPS_DEBUG is not set - -# -# PPS clients support -# -CONFIG_PPS_CLIENT_KTIMER=m -CONFIG_PPS_CLIENT_LDISC=m -CONFIG_PPS_CLIENT_PARPORT=m -CONFIG_PPS_CLIENT_GPIO=m - -# -# PPS generators support -# - -# -# PTP clock support -# -CONFIG_PTP_1588_CLOCK=y -CONFIG_DP83640_PHY=m -CONFIG_PTP_1588_CLOCK_INES=m -CONFIG_PTP_1588_CLOCK_KVM=m -CONFIG_PTP_1588_CLOCK_IDT82P33=m -CONFIG_PTP_1588_CLOCK_IDTCM=m -CONFIG_PTP_1588_CLOCK_VMW=m -# end of PTP clock support - -CONFIG_PINCTRL=y -CONFIG_GENERIC_PINCTRL_GROUPS=y -CONFIG_PINMUX=y -CONFIG_GENERIC_PINMUX_FUNCTIONS=y -CONFIG_PINCONF=y -CONFIG_GENERIC_PINCONF=y -# CONFIG_DEBUG_PINCTRL is not set 
-CONFIG_PINCTRL_AS3722=m -CONFIG_PINCTRL_AXP209=m -CONFIG_PINCTRL_AMD=m -CONFIG_PINCTRL_DA9062=m -CONFIG_PINCTRL_MCP23S08=m -CONFIG_PINCTRL_SINGLE=m -CONFIG_PINCTRL_SX150X=y -CONFIG_PINCTRL_STMFX=m -CONFIG_PINCTRL_MAX77620=m -CONFIG_PINCTRL_PALMAS=m -CONFIG_PINCTRL_RK805=m -CONFIG_PINCTRL_OCELOT=y -CONFIG_PINCTRL_BAYTRAIL=y -CONFIG_PINCTRL_CHERRYVIEW=y -CONFIG_PINCTRL_LYNXPOINT=y -CONFIG_PINCTRL_INTEL=y -CONFIG_PINCTRL_BROXTON=y -CONFIG_PINCTRL_CANNONLAKE=y -CONFIG_PINCTRL_CEDARFORK=y -CONFIG_PINCTRL_DENVERTON=y -CONFIG_PINCTRL_GEMINILAKE=y -CONFIG_PINCTRL_ICELAKE=y -CONFIG_PINCTRL_LEWISBURG=y -CONFIG_PINCTRL_SUNRISEPOINT=y -CONFIG_PINCTRL_TIGERLAKE=y -CONFIG_PINCTRL_LOCHNAGAR=m -CONFIG_PINCTRL_MADERA=m -CONFIG_PINCTRL_CS47L15=y -CONFIG_PINCTRL_CS47L35=y -CONFIG_PINCTRL_CS47L85=y -CONFIG_PINCTRL_CS47L90=y -CONFIG_PINCTRL_CS47L92=y -CONFIG_PINCTRL_EQUILIBRIUM=m -CONFIG_GPIOLIB=y -CONFIG_GPIOLIB_FASTPATH_LIMIT=512 -CONFIG_OF_GPIO=y -CONFIG_GPIO_ACPI=y -CONFIG_GPIOLIB_IRQCHIP=y -# CONFIG_DEBUG_GPIO is not set -CONFIG_GPIO_SYSFS=y -CONFIG_GPIO_GENERIC=y -CONFIG_GPIO_MAX730X=m - -# -# Memory mapped GPIO drivers -# -CONFIG_GPIO_74XX_MMIO=m -CONFIG_GPIO_ALTERA=m -CONFIG_GPIO_AMDPT=m -CONFIG_GPIO_CADENCE=m -CONFIG_GPIO_DWAPB=m -CONFIG_GPIO_EXAR=m -CONFIG_GPIO_FTGPIO010=y -CONFIG_GPIO_GENERIC_PLATFORM=m -CONFIG_GPIO_GRGPIO=m -CONFIG_GPIO_HLWD=m -CONFIG_GPIO_ICH=m -CONFIG_GPIO_LOGICVC=m -CONFIG_GPIO_MB86S7X=m -CONFIG_GPIO_MENZ127=m -CONFIG_GPIO_SAMA5D2_PIOBU=m -CONFIG_GPIO_SIFIVE=y -CONFIG_GPIO_SIOX=m -CONFIG_GPIO_SYSCON=m -CONFIG_GPIO_VX855=m -CONFIG_GPIO_WCD934X=m -CONFIG_GPIO_XILINX=m -CONFIG_GPIO_AMD_FCH=m -# end of Memory mapped GPIO drivers - -# -# Port-mapped I/O GPIO drivers -# -CONFIG_GPIO_F7188X=m -CONFIG_GPIO_IT87=m -CONFIG_GPIO_SCH=m -CONFIG_GPIO_SCH311X=m -CONFIG_GPIO_WINBOND=m -CONFIG_GPIO_WS16C48=m -# end of Port-mapped I/O GPIO drivers - -# -# I2C GPIO expanders -# -CONFIG_GPIO_ADP5588=m -CONFIG_GPIO_ADNP=m -CONFIG_GPIO_GW_PLD=m -CONFIG_GPIO_MAX7300=m 
-CONFIG_GPIO_MAX732X=m -CONFIG_GPIO_PCA953X=m -CONFIG_GPIO_PCF857X=m -CONFIG_GPIO_TPIC2810=m -# end of I2C GPIO expanders - -# -# MFD GPIO expanders -# -CONFIG_GPIO_ADP5520=m -CONFIG_GPIO_ARIZONA=m -CONFIG_GPIO_BD70528=m -CONFIG_GPIO_BD71828=m -CONFIG_GPIO_BD9571MWV=m -CONFIG_GPIO_CRYSTAL_COVE=m -CONFIG_GPIO_DA9052=m -CONFIG_GPIO_DA9055=m -CONFIG_GPIO_DLN2=m -CONFIG_GPIO_JANZ_TTL=m -CONFIG_GPIO_KEMPLD=m -CONFIG_GPIO_LP3943=m -CONFIG_GPIO_LP873X=m -CONFIG_GPIO_LP87565=m -CONFIG_GPIO_MADERA=m -CONFIG_GPIO_MAX77620=m -CONFIG_GPIO_MAX77650=m -CONFIG_GPIO_PALMAS=y -CONFIG_GPIO_RC5T583=y -CONFIG_GPIO_STMPE=y -CONFIG_GPIO_TC3589X=y -CONFIG_GPIO_TPS65086=m -CONFIG_GPIO_TPS65218=m -CONFIG_GPIO_TPS6586X=y -CONFIG_GPIO_TPS65910=y -CONFIG_GPIO_TPS65912=m -CONFIG_GPIO_TPS68470=y -CONFIG_GPIO_TQMX86=m -CONFIG_GPIO_TWL4030=m -CONFIG_GPIO_TWL6040=m -CONFIG_GPIO_UCB1400=m -CONFIG_GPIO_WHISKEY_COVE=m -CONFIG_GPIO_WM831X=m -CONFIG_GPIO_WM8350=m -CONFIG_GPIO_WM8994=m -# end of MFD GPIO expanders - -# -# PCI GPIO expanders -# -CONFIG_GPIO_AMD8111=m -CONFIG_GPIO_ML_IOH=m -CONFIG_GPIO_PCI_IDIO_16=m -CONFIG_GPIO_PCIE_IDIO_24=m -CONFIG_GPIO_RDC321X=m -CONFIG_GPIO_SODAVILLE=y -# end of PCI GPIO expanders - -# -# SPI GPIO expanders -# -CONFIG_GPIO_74X164=m -CONFIG_GPIO_MAX3191X=m -CONFIG_GPIO_MAX7301=m -CONFIG_GPIO_MC33880=m -CONFIG_GPIO_PISOSR=m -CONFIG_GPIO_XRA1403=m -CONFIG_GPIO_MOXTET=m -# end of SPI GPIO expanders - -# -# USB GPIO expanders -# -CONFIG_GPIO_VIPERBOARD=m -# end of USB GPIO expanders - -CONFIG_GPIO_MOCKUP=m -CONFIG_W1=m -CONFIG_W1_CON=y - -# -# 1-wire Bus Masters -# -CONFIG_W1_MASTER_MATROX=m -CONFIG_W1_MASTER_DS2490=m -CONFIG_W1_MASTER_DS2482=m -CONFIG_W1_MASTER_DS1WM=m -CONFIG_W1_MASTER_GPIO=m -CONFIG_W1_MASTER_SGI=m -# end of 1-wire Bus Masters - -# -# 1-wire Slaves -# -CONFIG_W1_SLAVE_THERM=m -CONFIG_W1_SLAVE_SMEM=m -CONFIG_W1_SLAVE_DS2405=m -CONFIG_W1_SLAVE_DS2408=m -# CONFIG_W1_SLAVE_DS2408_READBACK is not set -CONFIG_W1_SLAVE_DS2413=m -CONFIG_W1_SLAVE_DS2406=m 
-CONFIG_W1_SLAVE_DS2423=m -CONFIG_W1_SLAVE_DS2805=m -CONFIG_W1_SLAVE_DS2430=m -CONFIG_W1_SLAVE_DS2431=m -CONFIG_W1_SLAVE_DS2433=m -# CONFIG_W1_SLAVE_DS2433_CRC is not set -CONFIG_W1_SLAVE_DS2438=m -CONFIG_W1_SLAVE_DS250X=m -CONFIG_W1_SLAVE_DS2780=m -CONFIG_W1_SLAVE_DS2781=m -CONFIG_W1_SLAVE_DS28E04=m -CONFIG_W1_SLAVE_DS28E17=m -# end of 1-wire Slaves - -CONFIG_POWER_AVS=y -CONFIG_QCOM_CPR=m -CONFIG_POWER_RESET=y -CONFIG_POWER_RESET_AS3722=y -CONFIG_POWER_RESET_GPIO=y -CONFIG_POWER_RESET_GPIO_RESTART=y -CONFIG_POWER_RESET_LTC2952=y -CONFIG_POWER_RESET_MT6323=y -CONFIG_POWER_RESET_RESTART=y -CONFIG_POWER_RESET_SYSCON=y -CONFIG_POWER_RESET_SYSCON_POWEROFF=y -CONFIG_REBOOT_MODE=m -CONFIG_SYSCON_REBOOT_MODE=m -CONFIG_NVMEM_REBOOT_MODE=m -CONFIG_POWER_SUPPLY=y -# CONFIG_POWER_SUPPLY_DEBUG is not set -CONFIG_POWER_SUPPLY_HWMON=y -CONFIG_PDA_POWER=m -CONFIG_GENERIC_ADC_BATTERY=m -CONFIG_MAX8925_POWER=m -CONFIG_WM831X_BACKUP=m -CONFIG_WM831X_POWER=m -CONFIG_WM8350_POWER=m -CONFIG_TEST_POWER=m -CONFIG_BATTERY_88PM860X=m -CONFIG_CHARGER_ADP5061=m -CONFIG_BATTERY_ACT8945A=m -CONFIG_BATTERY_CPCAP=m -CONFIG_BATTERY_DS2760=m -CONFIG_BATTERY_DS2780=m -CONFIG_BATTERY_DS2781=m -CONFIG_BATTERY_DS2782=m -CONFIG_BATTERY_LEGO_EV3=m -CONFIG_BATTERY_SBS=m -CONFIG_CHARGER_SBS=m -CONFIG_MANAGER_SBS=m -CONFIG_BATTERY_BQ27XXX=m -CONFIG_BATTERY_BQ27XXX_I2C=m -CONFIG_BATTERY_BQ27XXX_HDQ=m -# CONFIG_BATTERY_BQ27XXX_DT_UPDATES_NVM is not set -CONFIG_BATTERY_DA9030=m -CONFIG_BATTERY_DA9052=m -CONFIG_CHARGER_DA9150=m -CONFIG_BATTERY_DA9150=m -CONFIG_CHARGER_AXP20X=m -CONFIG_BATTERY_AXP20X=m -CONFIG_AXP20X_POWER=m -CONFIG_AXP288_CHARGER=m -CONFIG_AXP288_FUEL_GAUGE=m -CONFIG_BATTERY_MAX17040=m -CONFIG_BATTERY_MAX17042=m -CONFIG_BATTERY_MAX1721X=m -CONFIG_BATTERY_TWL4030_MADC=m -CONFIG_CHARGER_88PM860X=m -CONFIG_CHARGER_PCF50633=m -CONFIG_BATTERY_RX51=m -CONFIG_CHARGER_ISP1704=m -CONFIG_CHARGER_MAX8903=m -CONFIG_CHARGER_TWL4030=m -CONFIG_CHARGER_LP8727=m -CONFIG_CHARGER_LP8788=m -CONFIG_CHARGER_GPIO=m 
-CONFIG_CHARGER_MANAGER=y -CONFIG_CHARGER_LT3651=m -CONFIG_CHARGER_MAX14577=m -CONFIG_CHARGER_DETECTOR_MAX14656=m -CONFIG_CHARGER_MAX77650=m -CONFIG_CHARGER_MAX77693=m -CONFIG_CHARGER_MAX8997=m -CONFIG_CHARGER_MAX8998=m -CONFIG_CHARGER_BQ2415X=m -CONFIG_CHARGER_BQ24190=m -CONFIG_CHARGER_BQ24257=m -CONFIG_CHARGER_BQ24735=m -CONFIG_CHARGER_BQ25890=m -CONFIG_CHARGER_SMB347=m -CONFIG_CHARGER_TPS65090=m -CONFIG_CHARGER_TPS65217=m -CONFIG_BATTERY_GAUGE_LTC2941=m -CONFIG_BATTERY_RT5033=m -CONFIG_CHARGER_RT9455=m -CONFIG_CHARGER_CROS_USBPD=m -CONFIG_CHARGER_UCS1002=m -CONFIG_CHARGER_BD70528=m -CONFIG_CHARGER_WILCO=m -CONFIG_HWMON=y -CONFIG_HWMON_VID=m -# CONFIG_HWMON_DEBUG_CHIP is not set - -# -# Native drivers -# -CONFIG_SENSORS_ABITUGURU=m -CONFIG_SENSORS_ABITUGURU3=m -CONFIG_SENSORS_AD7314=m -CONFIG_SENSORS_AD7414=m -CONFIG_SENSORS_AD7418=m -CONFIG_SENSORS_ADM1021=m -CONFIG_SENSORS_ADM1025=m -CONFIG_SENSORS_ADM1026=m -CONFIG_SENSORS_ADM1029=m -CONFIG_SENSORS_ADM1031=m -CONFIG_SENSORS_ADM1177=m -CONFIG_SENSORS_ADM9240=m -CONFIG_SENSORS_ADT7X10=m -CONFIG_SENSORS_ADT7310=m -CONFIG_SENSORS_ADT7410=m -CONFIG_SENSORS_ADT7411=m -CONFIG_SENSORS_ADT7462=m -CONFIG_SENSORS_ADT7470=m -CONFIG_SENSORS_ADT7475=m -CONFIG_SENSORS_AS370=m -CONFIG_SENSORS_ASC7621=m -CONFIG_SENSORS_AXI_FAN_CONTROL=m -CONFIG_SENSORS_K8TEMP=m -CONFIG_SENSORS_K10TEMP=m -CONFIG_SENSORS_FAM15H_POWER=m -CONFIG_SENSORS_APPLESMC=m -CONFIG_SENSORS_ASB100=m -CONFIG_SENSORS_ASPEED=m -CONFIG_SENSORS_ATXP1=m -CONFIG_SENSORS_DRIVETEMP=m -CONFIG_SENSORS_DS620=m -CONFIG_SENSORS_DS1621=m -CONFIG_SENSORS_DELL_SMM=m -CONFIG_SENSORS_DA9052_ADC=m -CONFIG_SENSORS_DA9055=m -CONFIG_SENSORS_I5K_AMB=m -CONFIG_SENSORS_F71805F=m -CONFIG_SENSORS_F71882FG=m -CONFIG_SENSORS_F75375S=m -CONFIG_SENSORS_MC13783_ADC=m -CONFIG_SENSORS_FSCHMD=m -CONFIG_SENSORS_FTSTEUTATES=m -CONFIG_SENSORS_GL518SM=m -CONFIG_SENSORS_GL520SM=m -CONFIG_SENSORS_G760A=m -CONFIG_SENSORS_G762=m -CONFIG_SENSORS_GPIO_FAN=m -CONFIG_SENSORS_HIH6130=m 
-CONFIG_SENSORS_IBMAEM=m -CONFIG_SENSORS_IBMPEX=m -CONFIG_SENSORS_IIO_HWMON=m -CONFIG_SENSORS_I5500=m -CONFIG_SENSORS_CORETEMP=m -CONFIG_SENSORS_IT87=m -CONFIG_SENSORS_JC42=m -CONFIG_SENSORS_POWR1220=m -CONFIG_SENSORS_LINEAGE=m -CONFIG_SENSORS_LOCHNAGAR=m -CONFIG_SENSORS_LTC2945=m -CONFIG_SENSORS_LTC2947=m -CONFIG_SENSORS_LTC2947_I2C=m -CONFIG_SENSORS_LTC2947_SPI=m -CONFIG_SENSORS_LTC2990=m -CONFIG_SENSORS_LTC4151=m -CONFIG_SENSORS_LTC4215=m -CONFIG_SENSORS_LTC4222=m -CONFIG_SENSORS_LTC4245=m -CONFIG_SENSORS_LTC4260=m -CONFIG_SENSORS_LTC4261=m -CONFIG_SENSORS_MAX1111=m -CONFIG_SENSORS_MAX16065=m -CONFIG_SENSORS_MAX1619=m -CONFIG_SENSORS_MAX1668=m -CONFIG_SENSORS_MAX197=m -CONFIG_SENSORS_MAX31722=m -CONFIG_SENSORS_MAX31730=m -CONFIG_SENSORS_MAX6621=m -CONFIG_SENSORS_MAX6639=m -CONFIG_SENSORS_MAX6642=m -CONFIG_SENSORS_MAX6650=m -CONFIG_SENSORS_MAX6697=m -CONFIG_SENSORS_MAX31790=m -CONFIG_SENSORS_MCP3021=m -CONFIG_SENSORS_MLXREG_FAN=m -CONFIG_SENSORS_TC654=m -CONFIG_SENSORS_MENF21BMC_HWMON=m -CONFIG_SENSORS_ADCXX=m -CONFIG_SENSORS_LM63=m -CONFIG_SENSORS_LM70=m -CONFIG_SENSORS_LM73=m -CONFIG_SENSORS_LM75=m -CONFIG_SENSORS_LM77=m -CONFIG_SENSORS_LM78=m -CONFIG_SENSORS_LM80=m -CONFIG_SENSORS_LM83=m -CONFIG_SENSORS_LM85=m -CONFIG_SENSORS_LM87=m -CONFIG_SENSORS_LM90=m -CONFIG_SENSORS_LM92=m -CONFIG_SENSORS_LM93=m -CONFIG_SENSORS_LM95234=m -CONFIG_SENSORS_LM95241=m -CONFIG_SENSORS_LM95245=m -CONFIG_SENSORS_PC87360=m -CONFIG_SENSORS_PC87427=m -CONFIG_SENSORS_NTC_THERMISTOR=m -CONFIG_SENSORS_NCT6683=m -CONFIG_SENSORS_NCT6775=m -CONFIG_SENSORS_NCT7802=m -CONFIG_SENSORS_NCT7904=m -CONFIG_SENSORS_NPCM7XX=m -CONFIG_SENSORS_PCF8591=m -CONFIG_PMBUS=m -CONFIG_SENSORS_PMBUS=m -CONFIG_SENSORS_ADM1275=m -CONFIG_SENSORS_BEL_PFE=m -CONFIG_SENSORS_IBM_CFFPS=m -CONFIG_SENSORS_INSPUR_IPSPS=m -CONFIG_SENSORS_IR35221=m -CONFIG_SENSORS_IR38064=m -CONFIG_SENSORS_IRPS5401=m -CONFIG_SENSORS_ISL68137=m -CONFIG_SENSORS_LM25066=m -CONFIG_SENSORS_LTC2978=m -# CONFIG_SENSORS_LTC2978_REGULATOR is not 
set -CONFIG_SENSORS_LTC3815=m -CONFIG_SENSORS_MAX16064=m -CONFIG_SENSORS_MAX20730=m -CONFIG_SENSORS_MAX20751=m -CONFIG_SENSORS_MAX31785=m -CONFIG_SENSORS_MAX34440=m -CONFIG_SENSORS_MAX8688=m -CONFIG_SENSORS_PXE1610=m -CONFIG_SENSORS_TPS40422=m -CONFIG_SENSORS_TPS53679=m -CONFIG_SENSORS_UCD9000=m -CONFIG_SENSORS_UCD9200=m -CONFIG_SENSORS_XDPE122=m -CONFIG_SENSORS_ZL6100=m -CONFIG_SENSORS_PWM_FAN=m -CONFIG_SENSORS_SHT15=m -CONFIG_SENSORS_SHT21=m -CONFIG_SENSORS_SHT3x=m -CONFIG_SENSORS_SHTC1=m -CONFIG_SENSORS_SIS5595=m -CONFIG_SENSORS_DME1737=m -CONFIG_SENSORS_EMC1403=m -CONFIG_SENSORS_EMC2103=m -CONFIG_SENSORS_EMC6W201=m -CONFIG_SENSORS_SMSC47M1=m -CONFIG_SENSORS_SMSC47M192=m -CONFIG_SENSORS_SMSC47B397=m -CONFIG_SENSORS_SCH56XX_COMMON=m -CONFIG_SENSORS_SCH5627=m -CONFIG_SENSORS_SCH5636=m -CONFIG_SENSORS_STTS751=m -CONFIG_SENSORS_SMM665=m -CONFIG_SENSORS_ADC128D818=m -CONFIG_SENSORS_ADS7828=m -CONFIG_SENSORS_ADS7871=m -CONFIG_SENSORS_AMC6821=m -CONFIG_SENSORS_INA209=m -CONFIG_SENSORS_INA2XX=m -CONFIG_SENSORS_INA3221=m -CONFIG_SENSORS_TC74=m -CONFIG_SENSORS_THMC50=m -CONFIG_SENSORS_TMP102=m -CONFIG_SENSORS_TMP103=m -CONFIG_SENSORS_TMP108=m -CONFIG_SENSORS_TMP401=m -CONFIG_SENSORS_TMP421=m -CONFIG_SENSORS_TMP513=m -CONFIG_SENSORS_VIA_CPUTEMP=m -CONFIG_SENSORS_VIA686A=m -CONFIG_SENSORS_VT1211=m -CONFIG_SENSORS_VT8231=m -CONFIG_SENSORS_W83773G=m -CONFIG_SENSORS_W83781D=m -CONFIG_SENSORS_W83791D=m -CONFIG_SENSORS_W83792D=m -CONFIG_SENSORS_W83793=m -CONFIG_SENSORS_W83795=m -# CONFIG_SENSORS_W83795_FANCTRL is not set -CONFIG_SENSORS_W83L785TS=m -CONFIG_SENSORS_W83L786NG=m -CONFIG_SENSORS_W83627HF=m -CONFIG_SENSORS_W83627EHF=m -CONFIG_SENSORS_WM831X=m -CONFIG_SENSORS_WM8350=m -CONFIG_SENSORS_XGENE=m - -# -# ACPI drivers -# -CONFIG_SENSORS_ACPI_POWER=m -CONFIG_SENSORS_ATK0110=m -CONFIG_THERMAL=y -# CONFIG_THERMAL_STATISTICS is not set -CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS=100 -CONFIG_THERMAL_HWMON=y -CONFIG_THERMAL_OF=y -CONFIG_THERMAL_WRITABLE_TRIPS=y 
-CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE=y -# CONFIG_THERMAL_DEFAULT_GOV_FAIR_SHARE is not set -# CONFIG_THERMAL_DEFAULT_GOV_USER_SPACE is not set -# CONFIG_THERMAL_DEFAULT_GOV_POWER_ALLOCATOR is not set -CONFIG_THERMAL_GOV_FAIR_SHARE=y -CONFIG_THERMAL_GOV_STEP_WISE=y -CONFIG_THERMAL_GOV_BANG_BANG=y -CONFIG_THERMAL_GOV_USER_SPACE=y -CONFIG_THERMAL_GOV_POWER_ALLOCATOR=y -CONFIG_CPU_THERMAL=y -CONFIG_CPU_FREQ_THERMAL=y -CONFIG_CPU_IDLE_THERMAL=y -CONFIG_CLOCK_THERMAL=y -CONFIG_DEVFREQ_THERMAL=y -# CONFIG_THERMAL_EMULATION is not set -CONFIG_THERMAL_MMIO=m -CONFIG_MAX77620_THERMAL=m -CONFIG_QORIQ_THERMAL=m -CONFIG_DA9062_THERMAL=m - -# -# Intel thermal drivers -# -CONFIG_INTEL_POWERCLAMP=m -CONFIG_X86_PKG_TEMP_THERMAL=m -CONFIG_INTEL_SOC_DTS_IOSF_CORE=m -CONFIG_INTEL_SOC_DTS_THERMAL=m - -# -# ACPI INT340X thermal drivers -# -CONFIG_INT340X_THERMAL=m -CONFIG_ACPI_THERMAL_REL=m -CONFIG_INT3406_THERMAL=m -CONFIG_PROC_THERMAL_MMIO_RAPL=y -# end of ACPI INT340X thermal drivers - -CONFIG_INTEL_BXT_PMIC_THERMAL=m -CONFIG_INTEL_PCH_THERMAL=m -# end of Intel thermal drivers - -# CONFIG_TI_SOC_THERMAL is not set -CONFIG_GENERIC_ADC_THERMAL=m -CONFIG_WATCHDOG=y -CONFIG_WATCHDOG_CORE=y -# CONFIG_WATCHDOG_NOWAYOUT is not set -CONFIG_WATCHDOG_HANDLE_BOOT_ENABLED=y -CONFIG_WATCHDOG_OPEN_TIMEOUT=0 -CONFIG_WATCHDOG_SYSFS=y - -# -# Watchdog Pretimeout Governors -# -CONFIG_WATCHDOG_PRETIMEOUT_GOV=y -CONFIG_WATCHDOG_PRETIMEOUT_GOV_SEL=m -CONFIG_WATCHDOG_PRETIMEOUT_GOV_NOOP=m -CONFIG_WATCHDOG_PRETIMEOUT_GOV_PANIC=y -# CONFIG_WATCHDOG_PRETIMEOUT_DEFAULT_GOV_NOOP is not set -CONFIG_WATCHDOG_PRETIMEOUT_DEFAULT_GOV_PANIC=y - -# -# Watchdog Device Drivers -# -CONFIG_SOFT_WATCHDOG=m -# CONFIG_SOFT_WATCHDOG_PRETIMEOUT is not set -CONFIG_BD70528_WATCHDOG=m -CONFIG_DA9052_WATCHDOG=m -CONFIG_DA9055_WATCHDOG=m -CONFIG_DA9063_WATCHDOG=m -CONFIG_DA9062_WATCHDOG=m -CONFIG_GPIO_WATCHDOG=m -CONFIG_MENF21BMC_WATCHDOG=m -CONFIG_MENZ069_WATCHDOG=m -CONFIG_WDAT_WDT=m -CONFIG_WM831X_WATCHDOG=m 
-CONFIG_WM8350_WATCHDOG=m -CONFIG_XILINX_WATCHDOG=m -CONFIG_ZIIRAVE_WATCHDOG=m -CONFIG_RAVE_SP_WATCHDOG=m -CONFIG_MLX_WDT=m -CONFIG_CADENCE_WATCHDOG=m -CONFIG_DW_WATCHDOG=m -CONFIG_RN5T618_WATCHDOG=m -CONFIG_TWL4030_WATCHDOG=m -CONFIG_MAX63XX_WATCHDOG=m -CONFIG_MAX77620_WATCHDOG=m -CONFIG_RETU_WATCHDOG=m -CONFIG_STPMIC1_WATCHDOG=m -CONFIG_ACQUIRE_WDT=m -CONFIG_ADVANTECH_WDT=m -CONFIG_ALIM1535_WDT=m -CONFIG_ALIM7101_WDT=m -CONFIG_EBC_C384_WDT=m -CONFIG_F71808E_WDT=m -CONFIG_SP5100_TCO=m -CONFIG_SBC_FITPC2_WATCHDOG=m -CONFIG_EUROTECH_WDT=m -CONFIG_IB700_WDT=m -CONFIG_IBMASR=m -CONFIG_WAFER_WDT=m -CONFIG_I6300ESB_WDT=m -CONFIG_IE6XX_WDT=m -CONFIG_ITCO_WDT=m -CONFIG_ITCO_VENDOR_SUPPORT=y -CONFIG_IT8712F_WDT=m -CONFIG_IT87_WDT=m -CONFIG_HP_WATCHDOG=m -CONFIG_HPWDT_NMI_DECODING=y -CONFIG_KEMPLD_WDT=m -CONFIG_SC1200_WDT=m -CONFIG_PC87413_WDT=m -CONFIG_NV_TCO=m -CONFIG_60XX_WDT=m -CONFIG_CPU5_WDT=m -CONFIG_SMSC_SCH311X_WDT=m -CONFIG_SMSC37B787_WDT=m -CONFIG_TQMX86_WDT=m -CONFIG_VIA_WDT=m -CONFIG_W83627HF_WDT=m -CONFIG_W83877F_WDT=m -CONFIG_W83977F_WDT=m -CONFIG_MACHZ_WDT=m -CONFIG_SBC_EPX_C3_WATCHDOG=m -CONFIG_INTEL_MEI_WDT=m -CONFIG_NI903X_WDT=m -CONFIG_NIC7018_WDT=m -CONFIG_MEN_A21_WDT=m -CONFIG_XEN_WDT=m - -# -# PCI-based Watchdog Cards -# -CONFIG_PCIPCWATCHDOG=m -CONFIG_WDTPCI=m - -# -# USB-based Watchdog Cards -# -CONFIG_USBPCWATCHDOG=m -CONFIG_SSB_POSSIBLE=y -CONFIG_SSB=m -CONFIG_SSB_SPROM=y -CONFIG_SSB_BLOCKIO=y -CONFIG_SSB_PCIHOST_POSSIBLE=y -CONFIG_SSB_PCIHOST=y -CONFIG_SSB_B43_PCI_BRIDGE=y -CONFIG_SSB_PCMCIAHOST_POSSIBLE=y -CONFIG_SSB_PCMCIAHOST=y -CONFIG_SSB_SDIOHOST_POSSIBLE=y -CONFIG_SSB_SDIOHOST=y -CONFIG_SSB_DRIVER_PCICORE_POSSIBLE=y -CONFIG_SSB_DRIVER_PCICORE=y -CONFIG_SSB_DRIVER_GPIO=y -CONFIG_BCMA_POSSIBLE=y -CONFIG_BCMA=m -CONFIG_BCMA_BLOCKIO=y -CONFIG_BCMA_HOST_PCI_POSSIBLE=y -CONFIG_BCMA_HOST_PCI=y -# CONFIG_BCMA_HOST_SOC is not set -CONFIG_BCMA_DRIVER_PCI=y -CONFIG_BCMA_DRIVER_GMAC_CMN=y -CONFIG_BCMA_DRIVER_GPIO=y -# CONFIG_BCMA_DEBUG is not set - -# 
-# Multifunction device drivers -# -CONFIG_MFD_CORE=y -CONFIG_MFD_ACT8945A=m -CONFIG_MFD_AS3711=y -CONFIG_MFD_AS3722=m -CONFIG_PMIC_ADP5520=y -CONFIG_MFD_AAT2870_CORE=y -CONFIG_MFD_ATMEL_FLEXCOM=m -CONFIG_MFD_ATMEL_HLCDC=m -CONFIG_MFD_BCM590XX=m -CONFIG_MFD_BD9571MWV=m -CONFIG_MFD_AXP20X=m -CONFIG_MFD_AXP20X_I2C=m -CONFIG_MFD_CROS_EC_DEV=m -CONFIG_MFD_MADERA=m -CONFIG_MFD_MADERA_I2C=m -CONFIG_MFD_MADERA_SPI=m -CONFIG_MFD_CS47L15=y -CONFIG_MFD_CS47L35=y -CONFIG_MFD_CS47L85=y -CONFIG_MFD_CS47L90=y -CONFIG_MFD_CS47L92=y -CONFIG_PMIC_DA903X=y -CONFIG_PMIC_DA9052=y -CONFIG_MFD_DA9052_SPI=y -CONFIG_MFD_DA9052_I2C=y -CONFIG_MFD_DA9055=y -CONFIG_MFD_DA9062=m -CONFIG_MFD_DA9063=m -CONFIG_MFD_DA9150=m -CONFIG_MFD_DLN2=m -CONFIG_MFD_MC13XXX=m -CONFIG_MFD_MC13XXX_SPI=m -CONFIG_MFD_MC13XXX_I2C=m -CONFIG_MFD_HI6421_PMIC=m -CONFIG_HTC_PASIC3=m -CONFIG_HTC_I2CPLD=y -CONFIG_MFD_INTEL_QUARK_I2C_GPIO=m -CONFIG_LPC_ICH=m -CONFIG_LPC_SCH=m -CONFIG_INTEL_SOC_PMIC=y -CONFIG_INTEL_SOC_PMIC_BXTWC=m -CONFIG_INTEL_SOC_PMIC_CHTWC=y -CONFIG_INTEL_SOC_PMIC_CHTDC_TI=m -CONFIG_MFD_INTEL_LPSS=m -CONFIG_MFD_INTEL_LPSS_ACPI=m -CONFIG_MFD_INTEL_LPSS_PCI=m -CONFIG_MFD_IQS62X=m -CONFIG_MFD_JANZ_CMODIO=m -CONFIG_MFD_KEMPLD=m -CONFIG_MFD_88PM800=m -CONFIG_MFD_88PM805=m -CONFIG_MFD_88PM860X=y -CONFIG_MFD_MAX14577=m -CONFIG_MFD_MAX77620=y -CONFIG_MFD_MAX77650=m -CONFIG_MFD_MAX77686=m -CONFIG_MFD_MAX77693=m -CONFIG_MFD_MAX77843=y -CONFIG_MFD_MAX8907=m -CONFIG_MFD_MAX8925=y -CONFIG_MFD_MAX8997=y -CONFIG_MFD_MAX8998=y -CONFIG_MFD_MT6397=m -CONFIG_MFD_MENF21BMC=m -CONFIG_EZX_PCAP=y -CONFIG_MFD_CPCAP=m -CONFIG_MFD_VIPERBOARD=m -CONFIG_MFD_RETU=m -CONFIG_MFD_PCF50633=m -CONFIG_PCF50633_ADC=m -CONFIG_PCF50633_GPIO=m -CONFIG_UCB1400_CORE=m -CONFIG_MFD_RDC321X=m -CONFIG_MFD_RT5033=m -CONFIG_MFD_RC5T583=y -CONFIG_MFD_RK808=m -CONFIG_MFD_RN5T618=m -CONFIG_MFD_SEC_CORE=y -CONFIG_MFD_SI476X_CORE=m -CONFIG_MFD_SM501=m -CONFIG_MFD_SM501_GPIO=y -CONFIG_MFD_SKY81452=m -CONFIG_MFD_SMSC=y -CONFIG_ABX500_CORE=y 
-CONFIG_AB3100_CORE=y -CONFIG_AB3100_OTP=y -CONFIG_MFD_STMPE=y - -# -# STMicroelectronics STMPE Interface Drivers -# -CONFIG_STMPE_I2C=y -CONFIG_STMPE_SPI=y -# end of STMicroelectronics STMPE Interface Drivers - -CONFIG_MFD_SYSCON=y -CONFIG_MFD_TI_AM335X_TSCADC=m -CONFIG_MFD_LP3943=m -CONFIG_MFD_LP8788=y -CONFIG_MFD_TI_LMU=m -CONFIG_MFD_PALMAS=y -CONFIG_TPS6105X=m -CONFIG_TPS65010=m -CONFIG_TPS6507X=m -CONFIG_MFD_TPS65086=m -CONFIG_MFD_TPS65090=y -CONFIG_MFD_TPS65217=m -CONFIG_MFD_TPS68470=y -CONFIG_MFD_TI_LP873X=m -CONFIG_MFD_TI_LP87565=m -CONFIG_MFD_TPS65218=m -CONFIG_MFD_TPS6586X=y -CONFIG_MFD_TPS65910=y -CONFIG_MFD_TPS65912=m -CONFIG_MFD_TPS65912_I2C=m -CONFIG_MFD_TPS65912_SPI=m -CONFIG_MFD_TPS80031=y -CONFIG_TWL4030_CORE=y -CONFIG_MFD_TWL4030_AUDIO=y -CONFIG_TWL6040_CORE=y -CONFIG_MFD_WL1273_CORE=m -CONFIG_MFD_LM3533=m -CONFIG_MFD_TC3589X=y -CONFIG_MFD_TQMX86=m -CONFIG_MFD_VX855=m -CONFIG_MFD_LOCHNAGAR=y -CONFIG_MFD_ARIZONA=y -CONFIG_MFD_ARIZONA_I2C=m -CONFIG_MFD_ARIZONA_SPI=m -CONFIG_MFD_CS47L24=y -CONFIG_MFD_WM5102=y -CONFIG_MFD_WM5110=y -CONFIG_MFD_WM8997=y -CONFIG_MFD_WM8998=y -CONFIG_MFD_WM8400=y -CONFIG_MFD_WM831X=y -CONFIG_MFD_WM831X_I2C=y -CONFIG_MFD_WM831X_SPI=y -CONFIG_MFD_WM8350=y -CONFIG_MFD_WM8350_I2C=y -CONFIG_MFD_WM8994=m -CONFIG_MFD_ROHM_BD718XX=m -CONFIG_MFD_ROHM_BD70528=m -CONFIG_MFD_ROHM_BD71828=m -CONFIG_MFD_STPMIC1=m -CONFIG_MFD_STMFX=m -CONFIG_MFD_WCD934X=m -CONFIG_RAVE_SP_CORE=m -# end of Multifunction device drivers - -CONFIG_REGULATOR=y -# CONFIG_REGULATOR_DEBUG is not set -CONFIG_REGULATOR_FIXED_VOLTAGE=m -CONFIG_REGULATOR_VIRTUAL_CONSUMER=m -CONFIG_REGULATOR_USERSPACE_CONSUMER=m -CONFIG_REGULATOR_88PG86X=m -CONFIG_REGULATOR_88PM800=m -CONFIG_REGULATOR_88PM8607=m -CONFIG_REGULATOR_ACT8865=m -CONFIG_REGULATOR_ACT8945A=m -CONFIG_REGULATOR_AD5398=m -CONFIG_REGULATOR_AAT2870=m -CONFIG_REGULATOR_AB3100=m -CONFIG_REGULATOR_ARIZONA_LDO1=m -CONFIG_REGULATOR_ARIZONA_MICSUPP=m -CONFIG_REGULATOR_AS3711=m -CONFIG_REGULATOR_AS3722=m 
-CONFIG_REGULATOR_AXP20X=m -CONFIG_REGULATOR_BCM590XX=m -CONFIG_REGULATOR_BD70528=m -CONFIG_REGULATOR_BD71828=m -CONFIG_REGULATOR_BD718XX=m -CONFIG_REGULATOR_BD9571MWV=m -CONFIG_REGULATOR_CPCAP=m -CONFIG_REGULATOR_DA903X=m -CONFIG_REGULATOR_DA9052=m -CONFIG_REGULATOR_DA9055=m -CONFIG_REGULATOR_DA9062=m -CONFIG_REGULATOR_DA9063=m -CONFIG_REGULATOR_DA9210=m -CONFIG_REGULATOR_DA9211=m -CONFIG_REGULATOR_FAN53555=m -CONFIG_REGULATOR_GPIO=m -CONFIG_REGULATOR_HI6421=m -CONFIG_REGULATOR_HI6421V530=m -CONFIG_REGULATOR_ISL9305=m -CONFIG_REGULATOR_ISL6271A=m -CONFIG_REGULATOR_LM363X=m -CONFIG_REGULATOR_LOCHNAGAR=m -CONFIG_REGULATOR_LP3971=m -CONFIG_REGULATOR_LP3972=m -CONFIG_REGULATOR_LP872X=m -CONFIG_REGULATOR_LP873X=m -CONFIG_REGULATOR_LP8755=m -CONFIG_REGULATOR_LP87565=m -CONFIG_REGULATOR_LP8788=m -CONFIG_REGULATOR_LTC3589=m -CONFIG_REGULATOR_LTC3676=m -CONFIG_REGULATOR_MAX14577=m -CONFIG_REGULATOR_MAX1586=m -CONFIG_REGULATOR_MAX77620=m -CONFIG_REGULATOR_MAX77650=m -CONFIG_REGULATOR_MAX8649=m -CONFIG_REGULATOR_MAX8660=m -CONFIG_REGULATOR_MAX8907=m -CONFIG_REGULATOR_MAX8925=m -CONFIG_REGULATOR_MAX8952=m -CONFIG_REGULATOR_MAX8973=m -CONFIG_REGULATOR_MAX8997=m -CONFIG_REGULATOR_MAX8998=m -CONFIG_REGULATOR_MAX77686=m -CONFIG_REGULATOR_MAX77693=m -CONFIG_REGULATOR_MAX77802=m -CONFIG_REGULATOR_MC13XXX_CORE=m -CONFIG_REGULATOR_MC13783=m -CONFIG_REGULATOR_MC13892=m -CONFIG_REGULATOR_MCP16502=m -CONFIG_REGULATOR_MP5416=m -CONFIG_REGULATOR_MP8859=m -CONFIG_REGULATOR_MP886X=m -CONFIG_REGULATOR_MPQ7920=m -CONFIG_REGULATOR_MT6311=m -CONFIG_REGULATOR_MT6323=m -CONFIG_REGULATOR_MT6397=m -CONFIG_REGULATOR_PALMAS=m -CONFIG_REGULATOR_PCAP=m -CONFIG_REGULATOR_PCF50633=m -CONFIG_REGULATOR_PFUZE100=m -CONFIG_REGULATOR_PV88060=m -CONFIG_REGULATOR_PV88080=m -CONFIG_REGULATOR_PV88090=m -CONFIG_REGULATOR_PWM=m -CONFIG_REGULATOR_QCOM_SPMI=m -CONFIG_REGULATOR_RC5T583=m -CONFIG_REGULATOR_RK808=m -CONFIG_REGULATOR_RN5T618=m -CONFIG_REGULATOR_ROHM=m -CONFIG_REGULATOR_RT5033=m 
-CONFIG_REGULATOR_S2MPA01=m -CONFIG_REGULATOR_S2MPS11=m -CONFIG_REGULATOR_S5M8767=m -CONFIG_REGULATOR_SKY81452=m -CONFIG_REGULATOR_SLG51000=m -CONFIG_REGULATOR_STPMIC1=m -CONFIG_REGULATOR_SY8106A=m -CONFIG_REGULATOR_SY8824X=m -CONFIG_REGULATOR_TPS51632=m -CONFIG_REGULATOR_TPS6105X=m -CONFIG_REGULATOR_TPS62360=m -CONFIG_REGULATOR_TPS65023=m -CONFIG_REGULATOR_TPS6507X=m -CONFIG_REGULATOR_TPS65086=m -CONFIG_REGULATOR_TPS65090=m -CONFIG_REGULATOR_TPS65132=m -CONFIG_REGULATOR_TPS65217=m -CONFIG_REGULATOR_TPS65218=m -CONFIG_REGULATOR_TPS6524X=m -CONFIG_REGULATOR_TPS6586X=m -CONFIG_REGULATOR_TPS65910=m -CONFIG_REGULATOR_TPS65912=m -CONFIG_REGULATOR_TPS80031=m -CONFIG_REGULATOR_TWL4030=m -CONFIG_REGULATOR_VCTRL=m -CONFIG_REGULATOR_WM831X=m -CONFIG_REGULATOR_WM8350=m -CONFIG_REGULATOR_WM8400=m -CONFIG_REGULATOR_WM8994=m -CONFIG_CEC_CORE=m -CONFIG_CEC_NOTIFIER=y -CONFIG_CEC_PIN=y -CONFIG_RC_CORE=m -CONFIG_RC_MAP=m -CONFIG_LIRC=y -CONFIG_RC_DECODERS=y -CONFIG_IR_NEC_DECODER=m -CONFIG_IR_RC5_DECODER=m -CONFIG_IR_RC6_DECODER=m -CONFIG_IR_JVC_DECODER=m -CONFIG_IR_SONY_DECODER=m -CONFIG_IR_SANYO_DECODER=m -CONFIG_IR_SHARP_DECODER=m -CONFIG_IR_MCE_KBD_DECODER=m -CONFIG_IR_XMP_DECODER=m -CONFIG_IR_IMON_DECODER=m -CONFIG_IR_RCMM_DECODER=m -CONFIG_RC_DEVICES=y -CONFIG_RC_ATI_REMOTE=m -CONFIG_IR_ENE=m -CONFIG_IR_HIX5HD2=m -CONFIG_IR_IMON=m -CONFIG_IR_IMON_RAW=m -CONFIG_IR_MCEUSB=m -CONFIG_IR_ITE_CIR=m -CONFIG_IR_FINTEK=m -CONFIG_IR_NUVOTON=m -CONFIG_IR_REDRAT3=m -CONFIG_IR_SPI=m -CONFIG_IR_STREAMZAP=m -CONFIG_IR_WINBOND_CIR=m -CONFIG_IR_IGORPLUGUSB=m -CONFIG_IR_IGUANA=m -CONFIG_IR_TTUSBIR=m -CONFIG_RC_LOOPBACK=m -CONFIG_IR_GPIO_CIR=m -CONFIG_IR_GPIO_TX=m -CONFIG_IR_PWM_TX=m -CONFIG_IR_SERIAL=m -CONFIG_IR_SERIAL_TRANSMITTER=y -CONFIG_IR_SIR=m -CONFIG_RC_XBOX_DVD=m -CONFIG_MEDIA_SUPPORT=m - -# -# Multimedia core support -# -CONFIG_MEDIA_CAMERA_SUPPORT=y -CONFIG_MEDIA_ANALOG_TV_SUPPORT=y -CONFIG_MEDIA_DIGITAL_TV_SUPPORT=y -CONFIG_MEDIA_RADIO_SUPPORT=y -CONFIG_MEDIA_SDR_SUPPORT=y 
-CONFIG_MEDIA_CEC_SUPPORT=y -CONFIG_MEDIA_CEC_RC=y -# CONFIG_CEC_PIN_ERROR_INJ is not set -CONFIG_MEDIA_CONTROLLER=y -CONFIG_MEDIA_CONTROLLER_DVB=y -# CONFIG_MEDIA_CONTROLLER_REQUEST_API is not set -CONFIG_VIDEO_DEV=m -CONFIG_VIDEO_V4L2_SUBDEV_API=y -CONFIG_VIDEO_V4L2=m -CONFIG_VIDEO_V4L2_I2C=y -# CONFIG_VIDEO_ADV_DEBUG is not set -# CONFIG_VIDEO_FIXED_MINOR_RANGES is not set -CONFIG_VIDEO_TUNER=m -CONFIG_V4L2_MEM2MEM_DEV=m -CONFIG_V4L2_FLASH_LED_CLASS=m -CONFIG_V4L2_FWNODE=m -CONFIG_VIDEOBUF_GEN=m -CONFIG_VIDEOBUF_DMA_SG=m -CONFIG_VIDEOBUF_VMALLOC=m -CONFIG_DVB_CORE=m -CONFIG_DVB_MMAP=y -CONFIG_DVB_NET=y -CONFIG_TTPCI_EEPROM=m -CONFIG_DVB_MAX_ADAPTERS=16 -# CONFIG_DVB_DYNAMIC_MINORS is not set -# CONFIG_DVB_DEMUX_SECTION_LOSS_LOG is not set -# CONFIG_DVB_ULE_DEBUG is not set - -# -# Media drivers -# -CONFIG_MEDIA_USB_SUPPORT=y - -# -# Webcam devices -# -CONFIG_USB_VIDEO_CLASS=m -CONFIG_USB_VIDEO_CLASS_INPUT_EVDEV=y -CONFIG_USB_GSPCA=m -CONFIG_USB_M5602=m -CONFIG_USB_STV06XX=m -CONFIG_USB_GL860=m -CONFIG_USB_GSPCA_BENQ=m -CONFIG_USB_GSPCA_CONEX=m -CONFIG_USB_GSPCA_CPIA1=m -CONFIG_USB_GSPCA_DTCS033=m -CONFIG_USB_GSPCA_ETOMS=m -CONFIG_USB_GSPCA_FINEPIX=m -CONFIG_USB_GSPCA_JEILINJ=m -CONFIG_USB_GSPCA_JL2005BCD=m -CONFIG_USB_GSPCA_KINECT=m -CONFIG_USB_GSPCA_KONICA=m -CONFIG_USB_GSPCA_MARS=m -CONFIG_USB_GSPCA_MR97310A=m -CONFIG_USB_GSPCA_NW80X=m -CONFIG_USB_GSPCA_OV519=m -CONFIG_USB_GSPCA_OV534=m -CONFIG_USB_GSPCA_OV534_9=m -CONFIG_USB_GSPCA_PAC207=m -CONFIG_USB_GSPCA_PAC7302=m -CONFIG_USB_GSPCA_PAC7311=m -CONFIG_USB_GSPCA_SE401=m -CONFIG_USB_GSPCA_SN9C2028=m -CONFIG_USB_GSPCA_SN9C20X=m -CONFIG_USB_GSPCA_SONIXB=m -CONFIG_USB_GSPCA_SONIXJ=m -CONFIG_USB_GSPCA_SPCA500=m -CONFIG_USB_GSPCA_SPCA501=m -CONFIG_USB_GSPCA_SPCA505=m -CONFIG_USB_GSPCA_SPCA506=m -CONFIG_USB_GSPCA_SPCA508=m -CONFIG_USB_GSPCA_SPCA561=m -CONFIG_USB_GSPCA_SPCA1528=m -CONFIG_USB_GSPCA_SQ905=m -CONFIG_USB_GSPCA_SQ905C=m -CONFIG_USB_GSPCA_SQ930X=m -CONFIG_USB_GSPCA_STK014=m -CONFIG_USB_GSPCA_STK1135=m 
-CONFIG_USB_GSPCA_STV0680=m -CONFIG_USB_GSPCA_SUNPLUS=m -CONFIG_USB_GSPCA_T613=m -CONFIG_USB_GSPCA_TOPRO=m -CONFIG_USB_GSPCA_TOUPTEK=m -CONFIG_USB_GSPCA_TV8532=m -CONFIG_USB_GSPCA_VC032X=m -CONFIG_USB_GSPCA_VICAM=m -CONFIG_USB_GSPCA_XIRLINK_CIT=m -CONFIG_USB_GSPCA_ZC3XX=m -CONFIG_USB_PWC=m -# CONFIG_USB_PWC_DEBUG is not set -CONFIG_USB_PWC_INPUT_EVDEV=y -CONFIG_VIDEO_CPIA2=m -CONFIG_USB_ZR364XX=m -CONFIG_USB_STKWEBCAM=m -CONFIG_USB_S2255=m -CONFIG_VIDEO_USBTV=m - -# -# Analog TV USB devices -# -CONFIG_VIDEO_PVRUSB2=m -CONFIG_VIDEO_PVRUSB2_SYSFS=y -CONFIG_VIDEO_PVRUSB2_DVB=y -# CONFIG_VIDEO_PVRUSB2_DEBUGIFC is not set -CONFIG_VIDEO_HDPVR=m -CONFIG_VIDEO_STK1160_COMMON=m -CONFIG_VIDEO_STK1160=m -CONFIG_VIDEO_GO7007=m -CONFIG_VIDEO_GO7007_USB=m -CONFIG_VIDEO_GO7007_LOADER=m -CONFIG_VIDEO_GO7007_USB_S2250_BOARD=m - -# -# Analog/digital TV USB devices -# -CONFIG_VIDEO_AU0828=m -CONFIG_VIDEO_AU0828_V4L2=y -CONFIG_VIDEO_AU0828_RC=y -CONFIG_VIDEO_CX231XX=m -CONFIG_VIDEO_CX231XX_RC=y -CONFIG_VIDEO_CX231XX_ALSA=m -CONFIG_VIDEO_CX231XX_DVB=m -CONFIG_VIDEO_TM6000=m -CONFIG_VIDEO_TM6000_ALSA=m -CONFIG_VIDEO_TM6000_DVB=m - -# -# Digital TV USB devices -# -CONFIG_DVB_USB=m -# CONFIG_DVB_USB_DEBUG is not set -CONFIG_DVB_USB_DIB3000MC=m -CONFIG_DVB_USB_A800=m -CONFIG_DVB_USB_DIBUSB_MB=m -CONFIG_DVB_USB_DIBUSB_MB_FAULTY=y -CONFIG_DVB_USB_DIBUSB_MC=m -CONFIG_DVB_USB_DIB0700=m -CONFIG_DVB_USB_UMT_010=m -CONFIG_DVB_USB_CXUSB=m -CONFIG_DVB_USB_CXUSB_ANALOG=y -CONFIG_DVB_USB_M920X=m -CONFIG_DVB_USB_DIGITV=m -CONFIG_DVB_USB_VP7045=m -CONFIG_DVB_USB_VP702X=m -CONFIG_DVB_USB_GP8PSK=m -CONFIG_DVB_USB_NOVA_T_USB2=m -CONFIG_DVB_USB_TTUSB2=m -CONFIG_DVB_USB_DTT200U=m -CONFIG_DVB_USB_OPERA1=m -CONFIG_DVB_USB_AF9005=m -CONFIG_DVB_USB_AF9005_REMOTE=m -CONFIG_DVB_USB_PCTV452E=m -CONFIG_DVB_USB_DW2102=m -CONFIG_DVB_USB_CINERGY_T2=m -CONFIG_DVB_USB_DTV5100=m -CONFIG_DVB_USB_AZ6027=m -CONFIG_DVB_USB_TECHNISAT_USB2=m -CONFIG_DVB_USB_V2=m -CONFIG_DVB_USB_AF9015=m -CONFIG_DVB_USB_AF9035=m 
-CONFIG_DVB_USB_ANYSEE=m -CONFIG_DVB_USB_AU6610=m -CONFIG_DVB_USB_AZ6007=m -CONFIG_DVB_USB_CE6230=m -CONFIG_DVB_USB_EC168=m -CONFIG_DVB_USB_GL861=m -CONFIG_DVB_USB_LME2510=m -CONFIG_DVB_USB_MXL111SF=m -CONFIG_DVB_USB_RTL28XXU=m -CONFIG_DVB_USB_DVBSKY=m -CONFIG_DVB_USB_ZD1301=m -CONFIG_DVB_TTUSB_BUDGET=m -CONFIG_DVB_TTUSB_DEC=m -CONFIG_SMS_USB_DRV=m -CONFIG_DVB_B2C2_FLEXCOP_USB=m -# CONFIG_DVB_B2C2_FLEXCOP_USB_DEBUG is not set -CONFIG_DVB_AS102=m - -# -# Webcam, TV (analog/digital) USB devices -# -CONFIG_VIDEO_EM28XX=m -CONFIG_VIDEO_EM28XX_V4L2=m -CONFIG_VIDEO_EM28XX_ALSA=m -CONFIG_VIDEO_EM28XX_DVB=m -CONFIG_VIDEO_EM28XX_RC=m - -# -# Software defined radio USB devices -# -CONFIG_USB_AIRSPY=m -CONFIG_USB_HACKRF=m -CONFIG_USB_MSI2500=m - -# -# USB HDMI CEC adapters -# -CONFIG_USB_PULSE8_CEC=m -CONFIG_USB_RAINSHADOW_CEC=m -CONFIG_MEDIA_PCI_SUPPORT=y - -# -# Media capture support -# -CONFIG_VIDEO_MEYE=m -CONFIG_VIDEO_SOLO6X10=m -CONFIG_VIDEO_TW5864=m -CONFIG_VIDEO_TW68=m -CONFIG_VIDEO_TW686X=m - -# -# Media capture/analog TV support -# -CONFIG_VIDEO_IVTV=m -# CONFIG_VIDEO_IVTV_DEPRECATED_IOCTLS is not set -CONFIG_VIDEO_IVTV_ALSA=m -CONFIG_VIDEO_FB_IVTV=m -# CONFIG_VIDEO_FB_IVTV_FORCE_PAT is not set -CONFIG_VIDEO_HEXIUM_GEMINI=m -CONFIG_VIDEO_HEXIUM_ORION=m -CONFIG_VIDEO_MXB=m -CONFIG_VIDEO_DT3155=m - -# -# Media capture/analog/hybrid TV support -# -CONFIG_VIDEO_CX18=m -CONFIG_VIDEO_CX18_ALSA=m -CONFIG_VIDEO_CX23885=m -CONFIG_MEDIA_ALTERA_CI=m -CONFIG_VIDEO_CX25821=m -CONFIG_VIDEO_CX25821_ALSA=m -CONFIG_VIDEO_CX88=m -CONFIG_VIDEO_CX88_ALSA=m -CONFIG_VIDEO_CX88_BLACKBIRD=m -CONFIG_VIDEO_CX88_DVB=m -CONFIG_VIDEO_CX88_ENABLE_VP3054=y -CONFIG_VIDEO_CX88_VP3054=m -CONFIG_VIDEO_CX88_MPEG=m -CONFIG_VIDEO_BT848=m -CONFIG_DVB_BT8XX=m -CONFIG_VIDEO_SAA7134=m -CONFIG_VIDEO_SAA7134_ALSA=m -CONFIG_VIDEO_SAA7134_RC=y -CONFIG_VIDEO_SAA7134_DVB=m -CONFIG_VIDEO_SAA7134_GO7007=m -CONFIG_VIDEO_SAA7164=m - -# -# Media digital TV PCI Adapters -# -CONFIG_DVB_AV7110_IR=y -CONFIG_DVB_AV7110=m 
-CONFIG_DVB_AV7110_OSD=y -CONFIG_DVB_BUDGET_CORE=m -CONFIG_DVB_BUDGET=m -CONFIG_DVB_BUDGET_CI=m -CONFIG_DVB_BUDGET_AV=m -CONFIG_DVB_BUDGET_PATCH=m -CONFIG_DVB_B2C2_FLEXCOP_PCI=m -# CONFIG_DVB_B2C2_FLEXCOP_PCI_DEBUG is not set -CONFIG_DVB_PLUTO2=m -CONFIG_DVB_DM1105=m -CONFIG_DVB_PT1=m -CONFIG_DVB_PT3=m -CONFIG_MANTIS_CORE=m -CONFIG_DVB_MANTIS=m -CONFIG_DVB_HOPPER=m -CONFIG_DVB_NGENE=m -CONFIG_DVB_DDBRIDGE=m -# CONFIG_DVB_DDBRIDGE_MSIENABLE is not set -CONFIG_DVB_SMIPCIE=m -CONFIG_DVB_NETUP_UNIDVB=m -CONFIG_VIDEO_IPU3_CIO2=m -CONFIG_V4L_PLATFORM_DRIVERS=y -CONFIG_VIDEO_CAFE_CCIC=m -CONFIG_VIDEO_CADENCE=y -CONFIG_VIDEO_CADENCE_CSI2RX=m -CONFIG_VIDEO_CADENCE_CSI2TX=m -CONFIG_VIDEO_ASPEED=m -CONFIG_VIDEO_MUX=m -CONFIG_VIDEO_XILINX=m -CONFIG_VIDEO_XILINX_TPG=m -CONFIG_VIDEO_XILINX_VTC=m -CONFIG_V4L_MEM2MEM_DRIVERS=y -CONFIG_VIDEO_MEM2MEM_DEINTERLACE=m -CONFIG_VIDEO_SH_VEU=m -CONFIG_V4L_TEST_DRIVERS=y -CONFIG_VIDEO_VIMC=m -CONFIG_VIDEO_VIVID=m -CONFIG_VIDEO_VIVID_CEC=y -CONFIG_VIDEO_VIVID_MAX_DEVS=64 -CONFIG_VIDEO_VIM2M=m -CONFIG_VIDEO_VICODEC=m -CONFIG_DVB_PLATFORM_DRIVERS=y -CONFIG_CEC_PLATFORM_DRIVERS=y -CONFIG_VIDEO_CROS_EC_CEC=m -CONFIG_CEC_GPIO=m -CONFIG_VIDEO_SECO_CEC=m -CONFIG_VIDEO_SECO_RC=y -CONFIG_SDR_PLATFORM_DRIVERS=y - -# -# Supported MMC/SDIO adapters -# -CONFIG_SMS_SDIO_DRV=m -CONFIG_RADIO_ADAPTERS=y -CONFIG_RADIO_TEA575X=m -CONFIG_RADIO_SI470X=m -CONFIG_USB_SI470X=m -CONFIG_I2C_SI470X=m -CONFIG_RADIO_SI4713=m -CONFIG_USB_SI4713=m -CONFIG_PLATFORM_SI4713=m -CONFIG_I2C_SI4713=m -CONFIG_RADIO_SI476X=m -CONFIG_USB_MR800=m -CONFIG_USB_DSBR=m -CONFIG_RADIO_MAXIRADIO=m -CONFIG_RADIO_SHARK=m -CONFIG_RADIO_SHARK2=m -CONFIG_USB_KEENE=m -CONFIG_USB_RAREMONO=m -CONFIG_USB_MA901=m -CONFIG_RADIO_TEA5764=m -CONFIG_RADIO_SAA7706H=m -CONFIG_RADIO_TEF6862=m -CONFIG_RADIO_WL1273=m - -# -# Texas Instruments WL128x FM driver (ST based) -# -CONFIG_RADIO_WL128X=m -# end of Texas Instruments WL128x FM driver (ST based) - -# -# Supported FireWire (IEEE 1394) Adapters -# 
-CONFIG_DVB_FIREDTV=m -CONFIG_DVB_FIREDTV_INPUT=y -CONFIG_MEDIA_COMMON_OPTIONS=y - -# -# common driver options -# -CONFIG_VIDEO_CX2341X=m -CONFIG_VIDEO_TVEEPROM=m -CONFIG_CYPRESS_FIRMWARE=m -CONFIG_VIDEOBUF2_CORE=m -CONFIG_VIDEOBUF2_V4L2=m -CONFIG_VIDEOBUF2_MEMOPS=m -CONFIG_VIDEOBUF2_DMA_CONTIG=m -CONFIG_VIDEOBUF2_VMALLOC=m -CONFIG_VIDEOBUF2_DMA_SG=m -CONFIG_VIDEOBUF2_DVB=m -CONFIG_DVB_B2C2_FLEXCOP=m -CONFIG_VIDEO_SAA7146=m -CONFIG_VIDEO_SAA7146_VV=m -CONFIG_SMS_SIANO_MDTV=m -CONFIG_SMS_SIANO_RC=y -# CONFIG_SMS_SIANO_DEBUGFS is not set -CONFIG_VIDEO_V4L2_TPG=m - -# -# Media ancillary drivers (tuners, sensors, i2c, spi, frontends) -# -CONFIG_MEDIA_SUBDRV_AUTOSELECT=y -CONFIG_MEDIA_ATTACH=y -CONFIG_VIDEO_IR_I2C=m - -# -# I2C Encoders, decoders, sensors and other helper chips -# - -# -# Audio decoders, processors and mixers -# -CONFIG_VIDEO_TVAUDIO=m -CONFIG_VIDEO_TDA7432=m -CONFIG_VIDEO_TDA9840=m -CONFIG_VIDEO_TDA1997X=m -CONFIG_VIDEO_TEA6415C=m -CONFIG_VIDEO_TEA6420=m -CONFIG_VIDEO_MSP3400=m -CONFIG_VIDEO_CS3308=m -CONFIG_VIDEO_CS5345=m -CONFIG_VIDEO_CS53L32A=m -CONFIG_VIDEO_TLV320AIC23B=m -CONFIG_VIDEO_UDA1342=m -CONFIG_VIDEO_WM8775=m -CONFIG_VIDEO_WM8739=m -CONFIG_VIDEO_VP27SMPX=m -CONFIG_VIDEO_SONY_BTF_MPX=m - -# -# RDS decoders -# -CONFIG_VIDEO_SAA6588=m - -# -# Video decoders -# -CONFIG_VIDEO_ADV7180=m -CONFIG_VIDEO_ADV7183=m -CONFIG_VIDEO_ADV748X=m -CONFIG_VIDEO_ADV7604=m -CONFIG_VIDEO_ADV7604_CEC=y -CONFIG_VIDEO_ADV7842=m -CONFIG_VIDEO_ADV7842_CEC=y -CONFIG_VIDEO_BT819=m -CONFIG_VIDEO_BT856=m -CONFIG_VIDEO_BT866=m -CONFIG_VIDEO_KS0127=m -CONFIG_VIDEO_ML86V7667=m -CONFIG_VIDEO_SAA7110=m -CONFIG_VIDEO_SAA711X=m -CONFIG_VIDEO_TC358743=m -CONFIG_VIDEO_TC358743_CEC=y -CONFIG_VIDEO_TVP514X=m -CONFIG_VIDEO_TVP5150=m -CONFIG_VIDEO_TVP7002=m -CONFIG_VIDEO_TW2804=m -CONFIG_VIDEO_TW9903=m -CONFIG_VIDEO_TW9906=m -CONFIG_VIDEO_TW9910=m -CONFIG_VIDEO_VPX3220=m - -# -# Video and audio decoders -# -CONFIG_VIDEO_SAA717X=m -CONFIG_VIDEO_CX25840=m - -# -# Video encoders -# 
-CONFIG_VIDEO_SAA7127=m -CONFIG_VIDEO_SAA7185=m -CONFIG_VIDEO_ADV7170=m -CONFIG_VIDEO_ADV7175=m -CONFIG_VIDEO_ADV7343=m -CONFIG_VIDEO_ADV7393=m -CONFIG_VIDEO_AD9389B=m -CONFIG_VIDEO_AK881X=m -CONFIG_VIDEO_THS8200=m - -# -# Camera sensor devices -# -CONFIG_VIDEO_APTINA_PLL=m -CONFIG_VIDEO_SMIAPP_PLL=m -CONFIG_VIDEO_HI556=m -CONFIG_VIDEO_IMX214=m -CONFIG_VIDEO_IMX219=m -CONFIG_VIDEO_IMX258=m -CONFIG_VIDEO_IMX274=m -CONFIG_VIDEO_IMX290=m -CONFIG_VIDEO_IMX319=m -CONFIG_VIDEO_IMX355=m -CONFIG_VIDEO_OV2640=m -CONFIG_VIDEO_OV2659=m -CONFIG_VIDEO_OV2680=m -CONFIG_VIDEO_OV2685=m -CONFIG_VIDEO_OV5640=m -CONFIG_VIDEO_OV5645=m -CONFIG_VIDEO_OV5647=m -CONFIG_VIDEO_OV6650=m -CONFIG_VIDEO_OV5670=m -CONFIG_VIDEO_OV5675=m -CONFIG_VIDEO_OV5695=m -CONFIG_VIDEO_OV7251=m -CONFIG_VIDEO_OV772X=m -CONFIG_VIDEO_OV7640=m -CONFIG_VIDEO_OV7670=m -CONFIG_VIDEO_OV7740=m -CONFIG_VIDEO_OV8856=m -CONFIG_VIDEO_OV9640=m -CONFIG_VIDEO_OV9650=m -CONFIG_VIDEO_OV13858=m -CONFIG_VIDEO_VS6624=m -CONFIG_VIDEO_MT9M001=m -CONFIG_VIDEO_MT9M032=m -CONFIG_VIDEO_MT9M111=m -CONFIG_VIDEO_MT9P031=m -CONFIG_VIDEO_MT9T001=m -CONFIG_VIDEO_MT9T112=m -CONFIG_VIDEO_MT9V011=m -CONFIG_VIDEO_MT9V032=m -CONFIG_VIDEO_MT9V111=m -CONFIG_VIDEO_SR030PC30=m -CONFIG_VIDEO_NOON010PC30=m -CONFIG_VIDEO_M5MOLS=m -CONFIG_VIDEO_RJ54N1=m -CONFIG_VIDEO_S5K6AA=m -CONFIG_VIDEO_S5K6A3=m -CONFIG_VIDEO_S5K4ECGX=m -CONFIG_VIDEO_S5K5BAF=m -CONFIG_VIDEO_SMIAPP=m -CONFIG_VIDEO_ET8EK8=m -CONFIG_VIDEO_S5C73M3=m - -# -# Lens drivers -# -CONFIG_VIDEO_AD5820=m -CONFIG_VIDEO_AK7375=m -CONFIG_VIDEO_DW9714=m -CONFIG_VIDEO_DW9807_VCM=m - -# -# Flash devices -# -CONFIG_VIDEO_ADP1653=m -CONFIG_VIDEO_LM3560=m -CONFIG_VIDEO_LM3646=m - -# -# Video improvement chips -# -CONFIG_VIDEO_UPD64031A=m -CONFIG_VIDEO_UPD64083=m - -# -# Audio/Video compression chips -# -CONFIG_VIDEO_SAA6752HS=m - -# -# SDR tuner chips -# -CONFIG_SDR_MAX2175=m - -# -# Miscellaneous helper chips -# -CONFIG_VIDEO_THS7303=m -CONFIG_VIDEO_M52790=m -CONFIG_VIDEO_I2C=m -CONFIG_VIDEO_ST_MIPID02=m 
-# end of I2C Encoders, decoders, sensors and other helper chips - -# -# SPI helper chips -# -CONFIG_VIDEO_GS1662=m -# end of SPI helper chips - -# -# Media SPI Adapters -# -CONFIG_CXD2880_SPI_DRV=m -# end of Media SPI Adapters - -CONFIG_MEDIA_TUNER=m - -# -# Customize TV tuners -# -CONFIG_MEDIA_TUNER_SIMPLE=m -CONFIG_MEDIA_TUNER_TDA18250=m -CONFIG_MEDIA_TUNER_TDA8290=m -CONFIG_MEDIA_TUNER_TDA827X=m -CONFIG_MEDIA_TUNER_TDA18271=m -CONFIG_MEDIA_TUNER_TDA9887=m -CONFIG_MEDIA_TUNER_TEA5761=m -CONFIG_MEDIA_TUNER_TEA5767=m -CONFIG_MEDIA_TUNER_MSI001=m -CONFIG_MEDIA_TUNER_MT20XX=m -CONFIG_MEDIA_TUNER_MT2060=m -CONFIG_MEDIA_TUNER_MT2063=m -CONFIG_MEDIA_TUNER_MT2266=m -CONFIG_MEDIA_TUNER_MT2131=m -CONFIG_MEDIA_TUNER_QT1010=m -CONFIG_MEDIA_TUNER_XC2028=m -CONFIG_MEDIA_TUNER_XC5000=m -CONFIG_MEDIA_TUNER_XC4000=m -CONFIG_MEDIA_TUNER_MXL5005S=m -CONFIG_MEDIA_TUNER_MXL5007T=m -CONFIG_MEDIA_TUNER_MC44S803=m -CONFIG_MEDIA_TUNER_MAX2165=m -CONFIG_MEDIA_TUNER_TDA18218=m -CONFIG_MEDIA_TUNER_FC0011=m -CONFIG_MEDIA_TUNER_FC0012=m -CONFIG_MEDIA_TUNER_FC0013=m -CONFIG_MEDIA_TUNER_TDA18212=m -CONFIG_MEDIA_TUNER_E4000=m -CONFIG_MEDIA_TUNER_FC2580=m -CONFIG_MEDIA_TUNER_M88RS6000T=m -CONFIG_MEDIA_TUNER_TUA9001=m -CONFIG_MEDIA_TUNER_SI2157=m -CONFIG_MEDIA_TUNER_IT913X=m -CONFIG_MEDIA_TUNER_R820T=m -CONFIG_MEDIA_TUNER_MXL301RF=m -CONFIG_MEDIA_TUNER_QM1D1C0042=m -CONFIG_MEDIA_TUNER_QM1D1B0004=m -# end of Customize TV tuners - -# -# Customise DVB Frontends -# - -# -# Multistandard (satellite) frontends -# -CONFIG_DVB_STB0899=m -CONFIG_DVB_STB6100=m -CONFIG_DVB_STV090x=m -CONFIG_DVB_STV0910=m -CONFIG_DVB_STV6110x=m -CONFIG_DVB_STV6111=m -CONFIG_DVB_MXL5XX=m -CONFIG_DVB_M88DS3103=m - -# -# Multistandard (cable + terrestrial) frontends -# -CONFIG_DVB_DRXK=m -CONFIG_DVB_TDA18271C2DD=m -CONFIG_DVB_SI2165=m -CONFIG_DVB_MN88472=m -CONFIG_DVB_MN88473=m - -# -# DVB-S (satellite) frontends -# -CONFIG_DVB_CX24110=m -CONFIG_DVB_CX24123=m -CONFIG_DVB_MT312=m -CONFIG_DVB_ZL10036=m -CONFIG_DVB_ZL10039=m 
-CONFIG_DVB_S5H1420=m -CONFIG_DVB_STV0288=m -CONFIG_DVB_STB6000=m -CONFIG_DVB_STV0299=m -CONFIG_DVB_STV6110=m -CONFIG_DVB_STV0900=m -CONFIG_DVB_TDA8083=m -CONFIG_DVB_TDA10086=m -CONFIG_DVB_TDA8261=m -CONFIG_DVB_VES1X93=m -CONFIG_DVB_TUNER_ITD1000=m -CONFIG_DVB_TUNER_CX24113=m -CONFIG_DVB_TDA826X=m -CONFIG_DVB_TUA6100=m -CONFIG_DVB_CX24116=m -CONFIG_DVB_CX24117=m -CONFIG_DVB_CX24120=m -CONFIG_DVB_SI21XX=m -CONFIG_DVB_TS2020=m -CONFIG_DVB_DS3000=m -CONFIG_DVB_MB86A16=m -CONFIG_DVB_TDA10071=m - -# -# DVB-T (terrestrial) frontends -# -CONFIG_DVB_SP8870=m -CONFIG_DVB_SP887X=m -CONFIG_DVB_CX22700=m -CONFIG_DVB_CX22702=m -CONFIG_DVB_S5H1432=m -CONFIG_DVB_DRXD=m -CONFIG_DVB_L64781=m -CONFIG_DVB_TDA1004X=m -CONFIG_DVB_NXT6000=m -CONFIG_DVB_MT352=m -CONFIG_DVB_ZL10353=m -CONFIG_DVB_DIB3000MB=m -CONFIG_DVB_DIB3000MC=m -CONFIG_DVB_DIB7000M=m -CONFIG_DVB_DIB7000P=m -CONFIG_DVB_DIB9000=m -CONFIG_DVB_TDA10048=m -CONFIG_DVB_AF9013=m -CONFIG_DVB_EC100=m -CONFIG_DVB_STV0367=m -CONFIG_DVB_CXD2820R=m -CONFIG_DVB_CXD2841ER=m -CONFIG_DVB_RTL2830=m -CONFIG_DVB_RTL2832=m -CONFIG_DVB_RTL2832_SDR=m -CONFIG_DVB_SI2168=m -CONFIG_DVB_AS102_FE=m -CONFIG_DVB_ZD1301_DEMOD=m -CONFIG_DVB_GP8PSK_FE=m -CONFIG_DVB_CXD2880=m - -# -# DVB-C (cable) frontends -# -CONFIG_DVB_VES1820=m -CONFIG_DVB_TDA10021=m -CONFIG_DVB_TDA10023=m -CONFIG_DVB_STV0297=m - -# -# ATSC (North American/Korean Terrestrial/Cable DTV) frontends -# -CONFIG_DVB_NXT200X=m -CONFIG_DVB_OR51211=m -CONFIG_DVB_OR51132=m -CONFIG_DVB_BCM3510=m -CONFIG_DVB_LGDT330X=m -CONFIG_DVB_LGDT3305=m -CONFIG_DVB_LGDT3306A=m -CONFIG_DVB_LG2160=m -CONFIG_DVB_S5H1409=m -CONFIG_DVB_AU8522=m -CONFIG_DVB_AU8522_DTV=m -CONFIG_DVB_AU8522_V4L=m -CONFIG_DVB_S5H1411=m - -# -# ISDB-T (terrestrial) frontends -# -CONFIG_DVB_S921=m -CONFIG_DVB_DIB8000=m -CONFIG_DVB_MB86A20S=m - -# -# ISDB-S (satellite) & ISDB-T (terrestrial) frontends -# -CONFIG_DVB_TC90522=m -CONFIG_DVB_MN88443X=m - -# -# Digital terrestrial only tuners/PLL -# -CONFIG_DVB_PLL=m 
-CONFIG_DVB_TUNER_DIB0070=m -CONFIG_DVB_TUNER_DIB0090=m - -# -# SEC control devices for DVB-S -# -CONFIG_DVB_DRX39XYJ=m -CONFIG_DVB_LNBH25=m -CONFIG_DVB_LNBH29=m -CONFIG_DVB_LNBP21=m -CONFIG_DVB_LNBP22=m -CONFIG_DVB_ISL6405=m -CONFIG_DVB_ISL6421=m -CONFIG_DVB_ISL6423=m -CONFIG_DVB_A8293=m -CONFIG_DVB_LGS8GL5=m -CONFIG_DVB_LGS8GXX=m -CONFIG_DVB_ATBM8830=m -CONFIG_DVB_TDA665x=m -CONFIG_DVB_IX2505V=m -CONFIG_DVB_M88RS2000=m -CONFIG_DVB_AF9033=m -CONFIG_DVB_HORUS3A=m -CONFIG_DVB_ASCOT2E=m -CONFIG_DVB_HELENE=m - -# -# Common Interface (EN50221) controller drivers -# -CONFIG_DVB_CXD2099=m -CONFIG_DVB_SP2=m - -# -# Tools to develop new frontends -# -CONFIG_DVB_DUMMY_FE=m -# end of Customise DVB Frontends - -# -# Graphics support -# -CONFIG_AGP=m -CONFIG_AGP_AMD64=m -CONFIG_AGP_INTEL=m -CONFIG_AGP_SIS=m -CONFIG_AGP_VIA=m -CONFIG_INTEL_GTT=m -CONFIG_VGA_ARB=y -CONFIG_VGA_ARB_MAX_GPUS=10 -CONFIG_VGA_SWITCHEROO=y -CONFIG_DRM=m -CONFIG_DRM_MIPI_DBI=m -CONFIG_DRM_MIPI_DSI=y -CONFIG_DRM_DP_AUX_CHARDEV=y -# CONFIG_DRM_DEBUG_SELFTEST is not set -CONFIG_DRM_KMS_HELPER=m -CONFIG_DRM_KMS_FB_HELPER=y -# CONFIG_DRM_DEBUG_DP_MST_TOPOLOGY_REFS is not set -CONFIG_DRM_FBDEV_EMULATION=y -CONFIG_DRM_FBDEV_OVERALLOC=100 -# CONFIG_DRM_FBDEV_LEAK_PHYS_SMEM is not set -CONFIG_DRM_LOAD_EDID_FIRMWARE=y -CONFIG_DRM_DP_CEC=y -CONFIG_DRM_TTM=m -CONFIG_DRM_TTM_DMA_PAGE_POOL=y -CONFIG_DRM_VRAM_HELPER=m -CONFIG_DRM_TTM_HELPER=m -CONFIG_DRM_GEM_CMA_HELPER=y -CONFIG_DRM_KMS_CMA_HELPER=y -CONFIG_DRM_GEM_SHMEM_HELPER=y -CONFIG_DRM_SCHED=m - -# -# I2C encoder or helper chips -# -CONFIG_DRM_I2C_CH7006=m -CONFIG_DRM_I2C_SIL164=m -CONFIG_DRM_I2C_NXP_TDA998X=m -CONFIG_DRM_I2C_NXP_TDA9950=m -# end of I2C encoder or helper chips - -# -# ARM devices -# -CONFIG_DRM_KOMEDA=m -# end of ARM devices - -CONFIG_DRM_RADEON=m -CONFIG_DRM_RADEON_USERPTR=y -CONFIG_DRM_AMDGPU=m -CONFIG_DRM_AMDGPU_SI=y -CONFIG_DRM_AMDGPU_CIK=y -CONFIG_DRM_AMDGPU_USERPTR=y -# CONFIG_DRM_AMDGPU_GART_DEBUGFS is not set - -# -# ACP (Audio 
CoProcessor) Configuration -# -CONFIG_DRM_AMD_ACP=y -# end of ACP (Audio CoProcessor) Configuration - -# -# Display Engine Configuration -# -CONFIG_DRM_AMD_DC=y -CONFIG_DRM_AMD_DC_DCN=y -CONFIG_DRM_AMD_DC_HDCP=y -# CONFIG_DEBUG_KERNEL_DC is not set -# end of Display Engine Configuration - -CONFIG_HSA_AMD=y -CONFIG_DRM_NOUVEAU=m -# CONFIG_NOUVEAU_LEGACY_CTX_SUPPORT is not set -CONFIG_NOUVEAU_DEBUG=5 -CONFIG_NOUVEAU_DEBUG_DEFAULT=3 -# CONFIG_NOUVEAU_DEBUG_MMU is not set -CONFIG_DRM_NOUVEAU_BACKLIGHT=y -CONFIG_DRM_NOUVEAU_SVM=y -CONFIG_DRM_I915=m -CONFIG_DRM_I915_FORCE_PROBE="*" -CONFIG_DRM_I915_CAPTURE_ERROR=y -CONFIG_DRM_I915_COMPRESS_ERROR=y -CONFIG_DRM_I915_USERPTR=y -CONFIG_DRM_I915_GVT=y -CONFIG_DRM_I915_GVT_KVMGT=m - -# -# drm/i915 Debugging -# -# CONFIG_DRM_I915_WERROR is not set -# CONFIG_DRM_I915_DEBUG is not set -# CONFIG_DRM_I915_DEBUG_MMIO is not set -# CONFIG_DRM_I915_SW_FENCE_DEBUG_OBJECTS is not set -# CONFIG_DRM_I915_SW_FENCE_CHECK_DAG is not set -# CONFIG_DRM_I915_DEBUG_GUC is not set -# CONFIG_DRM_I915_SELFTEST is not set -# CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS is not set -# CONFIG_DRM_I915_DEBUG_VBLANK_EVADE is not set -# CONFIG_DRM_I915_DEBUG_RUNTIME_PM is not set -# end of drm/i915 Debugging - -# -# drm/i915 Profile Guided Optimisation -# -CONFIG_DRM_I915_USERFAULT_AUTOSUSPEND=250 -CONFIG_DRM_I915_HEARTBEAT_INTERVAL=2500 -CONFIG_DRM_I915_PREEMPT_TIMEOUT=640 -CONFIG_DRM_I915_MAX_REQUEST_BUSYWAIT=8000 -CONFIG_DRM_I915_STOP_TIMEOUT=100 -CONFIG_DRM_I915_TIMESLICE_DURATION=1 -# end of drm/i915 Profile Guided Optimisation - -CONFIG_DRM_VGEM=m -CONFIG_DRM_VKMS=m -CONFIG_DRM_VMWGFX=m -CONFIG_DRM_VMWGFX_FBCON=y -CONFIG_DRM_GMA500=m -CONFIG_DRM_GMA600=y -CONFIG_DRM_GMA3600=y -CONFIG_DRM_UDL=m -CONFIG_DRM_AST=m -CONFIG_DRM_MGAG200=m -CONFIG_DRM_CIRRUS_QEMU=m -CONFIG_DRM_RCAR_DW_HDMI=m -CONFIG_DRM_RCAR_LVDS=m -CONFIG_DRM_QXL=m -CONFIG_DRM_BOCHS=m -CONFIG_DRM_VIRTIO_GPU=m -CONFIG_DRM_PANEL=y - -# -# Display Panels -# -CONFIG_DRM_PANEL_ARM_VERSATILE=m 
-CONFIG_DRM_PANEL_BOE_HIMAX8279D=m -CONFIG_DRM_PANEL_BOE_TV101WUM_NL6=m -CONFIG_DRM_PANEL_LVDS=m -CONFIG_DRM_PANEL_SIMPLE=m -CONFIG_DRM_PANEL_ELIDA_KD35T133=m -CONFIG_DRM_PANEL_FEIXIN_K101_IM2BA02=m -CONFIG_DRM_PANEL_FEIYANG_FY07024DI26A30D=m -CONFIG_DRM_PANEL_ILITEK_IL9322=m -CONFIG_DRM_PANEL_ILITEK_ILI9881C=m -CONFIG_DRM_PANEL_INNOLUX_P079ZCA=m -CONFIG_DRM_PANEL_JDI_LT070ME05000=m -CONFIG_DRM_PANEL_KINGDISPLAY_KD097D04=m -CONFIG_DRM_PANEL_LEADTEK_LTK500HD1829=m -CONFIG_DRM_PANEL_SAMSUNG_LD9040=m -CONFIG_DRM_PANEL_LG_LB035Q02=m -CONFIG_DRM_PANEL_LG_LG4573=m -CONFIG_DRM_PANEL_NEC_NL8048HL11=m -CONFIG_DRM_PANEL_NOVATEK_NT35510=m -CONFIG_DRM_PANEL_NOVATEK_NT39016=m -CONFIG_DRM_PANEL_OLIMEX_LCD_OLINUXINO=m -CONFIG_DRM_PANEL_ORISETECH_OTM8009A=m -CONFIG_DRM_PANEL_OSD_OSD101T2587_53TS=m -CONFIG_DRM_PANEL_PANASONIC_VVX10F034N00=m -CONFIG_DRM_PANEL_RASPBERRYPI_TOUCHSCREEN=m -CONFIG_DRM_PANEL_RAYDIUM_RM67191=m -CONFIG_DRM_PANEL_RAYDIUM_RM68200=m -CONFIG_DRM_PANEL_ROCKTECH_JH057N00900=m -CONFIG_DRM_PANEL_RONBO_RB070D30=m -CONFIG_DRM_PANEL_SAMSUNG_S6D16D0=m -CONFIG_DRM_PANEL_SAMSUNG_S6E3HA2=m -CONFIG_DRM_PANEL_SAMSUNG_S6E63J0X03=m -CONFIG_DRM_PANEL_SAMSUNG_S6E63M0=m -CONFIG_DRM_PANEL_SAMSUNG_S6E88A0_AMS452EF01=m -CONFIG_DRM_PANEL_SAMSUNG_S6E8AA0=m -CONFIG_DRM_PANEL_SEIKO_43WVF1G=m -CONFIG_DRM_PANEL_SHARP_LQ101R1SX01=m -CONFIG_DRM_PANEL_SHARP_LS037V7DW01=m -CONFIG_DRM_PANEL_SHARP_LS043T1LE01=m -CONFIG_DRM_PANEL_SITRONIX_ST7701=m -CONFIG_DRM_PANEL_SITRONIX_ST7789V=m -CONFIG_DRM_PANEL_SONY_ACX424AKP=m -CONFIG_DRM_PANEL_SONY_ACX565AKM=m -CONFIG_DRM_PANEL_TPO_TD028TTEC1=m -CONFIG_DRM_PANEL_TPO_TD043MTEA1=m -CONFIG_DRM_PANEL_TPO_TPG110=m -CONFIG_DRM_PANEL_TRULY_NT35597_WQXGA=m -CONFIG_DRM_PANEL_XINPENG_XPP055C272=m -# end of Display Panels - -CONFIG_DRM_BRIDGE=y -CONFIG_DRM_PANEL_BRIDGE=y - -# -# Display Interface Bridges -# -CONFIG_DRM_CDNS_DSI=m -CONFIG_DRM_DISPLAY_CONNECTOR=m -CONFIG_DRM_LVDS_CODEC=m -CONFIG_DRM_MEGACHIPS_STDPXXXX_GE_B850V3_FW=m -CONFIG_DRM_NXP_PTN3460=m 
-CONFIG_DRM_PARADE_PS8622=m -CONFIG_DRM_PARADE_PS8640=m -CONFIG_DRM_SIL_SII8620=m -CONFIG_DRM_SII902X=m -CONFIG_DRM_SII9234=m -CONFIG_DRM_SIMPLE_BRIDGE=m -CONFIG_DRM_THINE_THC63LVD1024=m -CONFIG_DRM_TOSHIBA_TC358764=m -CONFIG_DRM_TOSHIBA_TC358767=m -CONFIG_DRM_TOSHIBA_TC358768=m -CONFIG_DRM_TI_TFP410=m -CONFIG_DRM_TI_SN65DSI86=m -CONFIG_DRM_TI_TPD12S015=m -CONFIG_DRM_ANALOGIX_ANX6345=m -CONFIG_DRM_ANALOGIX_ANX78XX=m -CONFIG_DRM_ANALOGIX_DP=m -CONFIG_DRM_I2C_ADV7511=m -CONFIG_DRM_I2C_ADV7511_AUDIO=y -CONFIG_DRM_I2C_ADV7511_CEC=y -CONFIG_DRM_DW_HDMI=m -CONFIG_DRM_DW_HDMI_AHB_AUDIO=m -CONFIG_DRM_DW_HDMI_I2S_AUDIO=m -CONFIG_DRM_DW_HDMI_CEC=m -# end of Display Interface Bridges - -# CONFIG_DRM_ETNAVIV is not set -CONFIG_DRM_ARCPGU=m -CONFIG_DRM_MXS=y -CONFIG_DRM_MXSFB=m -CONFIG_DRM_GM12U320=m -CONFIG_TINYDRM_HX8357D=m -CONFIG_TINYDRM_ILI9225=m -CONFIG_TINYDRM_ILI9341=m -CONFIG_TINYDRM_ILI9486=m -CONFIG_TINYDRM_MI0283QT=m -CONFIG_TINYDRM_REPAPER=m -CONFIG_TINYDRM_ST7586=m -CONFIG_TINYDRM_ST7735R=m -CONFIG_DRM_XEN=y -CONFIG_DRM_XEN_FRONTEND=m -CONFIG_DRM_VBOXVIDEO=m -# CONFIG_DRM_LEGACY is not set -CONFIG_DRM_PANEL_ORIENTATION_QUIRKS=y - -# -# Frame buffer Devices -# -CONFIG_FB_CMDLINE=y -CONFIG_FB_NOTIFY=y -CONFIG_FB=y -CONFIG_FIRMWARE_EDID=y -CONFIG_FB_BOOT_VESA_SUPPORT=y -CONFIG_FB_CFB_FILLRECT=y -CONFIG_FB_CFB_COPYAREA=y -CONFIG_FB_CFB_IMAGEBLIT=y -CONFIG_FB_SYS_FILLRECT=m -CONFIG_FB_SYS_COPYAREA=m -CONFIG_FB_SYS_IMAGEBLIT=m -# CONFIG_FB_FOREIGN_ENDIAN is not set -CONFIG_FB_SYS_FOPS=m -CONFIG_FB_DEFERRED_IO=y -CONFIG_FB_BACKLIGHT=m -CONFIG_FB_MODE_HELPERS=y -CONFIG_FB_TILEBLITTING=y - -# -# Frame buffer hardware drivers -# -# CONFIG_FB_CIRRUS is not set -# CONFIG_FB_PM2 is not set -# CONFIG_FB_CYBER2000 is not set -# CONFIG_FB_ARC is not set -# CONFIG_FB_ASILIANT is not set -# CONFIG_FB_IMSTT is not set -# CONFIG_FB_VGA16 is not set -# CONFIG_FB_UVESA is not set -CONFIG_FB_VESA=y -CONFIG_FB_EFI=y -# CONFIG_FB_N411 is not set -# CONFIG_FB_HGA is not set -# 
CONFIG_FB_OPENCORES is not set -# CONFIG_FB_S1D13XXX is not set -# CONFIG_FB_NVIDIA is not set -# CONFIG_FB_RIVA is not set -# CONFIG_FB_I740 is not set -# CONFIG_FB_LE80578 is not set -# CONFIG_FB_INTEL is not set -# CONFIG_FB_MATROX is not set -# CONFIG_FB_RADEON is not set -# CONFIG_FB_ATY128 is not set -# CONFIG_FB_ATY is not set -# CONFIG_FB_S3 is not set -# CONFIG_FB_SAVAGE is not set -# CONFIG_FB_SIS is not set -# CONFIG_FB_VIA is not set -# CONFIG_FB_NEOMAGIC is not set -# CONFIG_FB_KYRO is not set -# CONFIG_FB_3DFX is not set -# CONFIG_FB_VOODOO1 is not set -# CONFIG_FB_VT8623 is not set -# CONFIG_FB_TRIDENT is not set -# CONFIG_FB_ARK is not set -# CONFIG_FB_PM3 is not set -# CONFIG_FB_CARMINE is not set -# CONFIG_FB_SM501 is not set -# CONFIG_FB_SMSCUFX is not set -# CONFIG_FB_UDL is not set -# CONFIG_FB_IBM_GXT4500 is not set -# CONFIG_FB_VIRTUAL is not set -CONFIG_XEN_FBDEV_FRONTEND=m -# CONFIG_FB_METRONOME is not set -# CONFIG_FB_MB862XX is not set -CONFIG_FB_HYPERV=m -CONFIG_FB_SIMPLE=y -# CONFIG_FB_SSD1307 is not set -# CONFIG_FB_SM712 is not set -# end of Frame buffer Devices - -# -# Backlight & LCD device support -# -CONFIG_LCD_CLASS_DEVICE=m -CONFIG_LCD_L4F00242T03=m -CONFIG_LCD_LMS283GF05=m -CONFIG_LCD_LTV350QV=m -CONFIG_LCD_ILI922X=m -CONFIG_LCD_ILI9320=m -CONFIG_LCD_TDO24M=m -CONFIG_LCD_VGG2432A4=m -CONFIG_LCD_PLATFORM=m -CONFIG_LCD_AMS369FG06=m -CONFIG_LCD_LMS501KF03=m -CONFIG_LCD_HX8357=m -CONFIG_LCD_OTM3225A=m -CONFIG_BACKLIGHT_CLASS_DEVICE=y -CONFIG_BACKLIGHT_GENERIC=m -CONFIG_BACKLIGHT_LM3533=m -CONFIG_BACKLIGHT_PWM=m -CONFIG_BACKLIGHT_DA903X=m -CONFIG_BACKLIGHT_DA9052=m -CONFIG_BACKLIGHT_MAX8925=m -CONFIG_BACKLIGHT_APPLE=m -CONFIG_BACKLIGHT_QCOM_WLED=m -CONFIG_BACKLIGHT_SAHARA=m -CONFIG_BACKLIGHT_WM831X=m -CONFIG_BACKLIGHT_ADP5520=m -CONFIG_BACKLIGHT_ADP8860=m -CONFIG_BACKLIGHT_ADP8870=m -CONFIG_BACKLIGHT_88PM860X=m -CONFIG_BACKLIGHT_PCF50633=m -CONFIG_BACKLIGHT_AAT2870=m -CONFIG_BACKLIGHT_LM3630A=m -CONFIG_BACKLIGHT_LM3639=m 
-CONFIG_BACKLIGHT_LP855X=m -CONFIG_BACKLIGHT_LP8788=m -CONFIG_BACKLIGHT_PANDORA=m -CONFIG_BACKLIGHT_SKY81452=m -CONFIG_BACKLIGHT_TPS65217=m -CONFIG_BACKLIGHT_AS3711=m -CONFIG_BACKLIGHT_GPIO=m -CONFIG_BACKLIGHT_LV5207LP=m -CONFIG_BACKLIGHT_BD6107=m -CONFIG_BACKLIGHT_ARCXCNN=m -CONFIG_BACKLIGHT_RAVE_SP=m -CONFIG_BACKLIGHT_LED=m -# end of Backlight & LCD device support - -CONFIG_VIDEOMODE_HELPERS=y -CONFIG_HDMI=y - -# -# Console display driver support -# -CONFIG_VGA_CONSOLE=y -CONFIG_VGACON_SOFT_SCROLLBACK=y -CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64 -# CONFIG_VGACON_SOFT_SCROLLBACK_PERSISTENT_ENABLE_BY_DEFAULT is not set -CONFIG_DUMMY_CONSOLE=y -CONFIG_DUMMY_CONSOLE_COLUMNS=80 -CONFIG_DUMMY_CONSOLE_ROWS=25 -CONFIG_FRAMEBUFFER_CONSOLE=y -CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y -CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y -CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER=y -# end of Console display driver support - -# CONFIG_LOGO is not set -# end of Graphics support - -CONFIG_SOUND=m -CONFIG_SOUND_OSS_CORE=y -# CONFIG_SOUND_OSS_CORE_PRECLAIM is not set -CONFIG_SND=m -CONFIG_SND_TIMER=m -CONFIG_SND_PCM=m -CONFIG_SND_PCM_ELD=y -CONFIG_SND_PCM_IEC958=y -CONFIG_SND_DMAENGINE_PCM=m -CONFIG_SND_HWDEP=m -CONFIG_SND_SEQ_DEVICE=m -CONFIG_SND_RAWMIDI=m -CONFIG_SND_COMPRESS_OFFLOAD=m -CONFIG_SND_JACK=y -CONFIG_SND_JACK_INPUT_DEV=y -CONFIG_SND_OSSEMUL=y -CONFIG_SND_MIXER_OSS=m -CONFIG_SND_PCM_OSS=m -CONFIG_SND_PCM_OSS_PLUGINS=y -CONFIG_SND_PCM_TIMER=y -CONFIG_SND_HRTIMER=m -CONFIG_SND_DYNAMIC_MINORS=y -CONFIG_SND_MAX_CARDS=32 -# CONFIG_SND_SUPPORT_OLD_API is not set -CONFIG_SND_PROC_FS=y -CONFIG_SND_VERBOSE_PROCFS=y -CONFIG_SND_VERBOSE_PRINTK=y -CONFIG_SND_DEBUG=y -# CONFIG_SND_DEBUG_VERBOSE is not set -# CONFIG_SND_PCM_XRUN_DEBUG is not set -# CONFIG_SND_CTL_VALIDATION is not set -CONFIG_SND_VMASTER=y -CONFIG_SND_DMA_SGBUF=y -CONFIG_SND_SEQUENCER=m -CONFIG_SND_SEQ_DUMMY=m -CONFIG_SND_SEQUENCER_OSS=m -CONFIG_SND_SEQ_HRTIMER_DEFAULT=y -CONFIG_SND_SEQ_MIDI_EVENT=m -CONFIG_SND_SEQ_MIDI=m 
-CONFIG_SND_SEQ_MIDI_EMUL=m -CONFIG_SND_SEQ_VIRMIDI=m -CONFIG_SND_MPU401_UART=m -CONFIG_SND_OPL3_LIB=m -CONFIG_SND_OPL3_LIB_SEQ=m -CONFIG_SND_VX_LIB=m -CONFIG_SND_AC97_CODEC=m -CONFIG_SND_DRIVERS=y -# CONFIG_SND_PCSP is not set -CONFIG_SND_DUMMY=m -CONFIG_SND_ALOOP=m -CONFIG_SND_VIRMIDI=m -CONFIG_SND_MTPAV=m -CONFIG_SND_MTS64=m -CONFIG_SND_SERIAL_U16550=m -CONFIG_SND_MPU401=m -CONFIG_SND_PORTMAN2X4=m -CONFIG_SND_AC97_POWER_SAVE=y -CONFIG_SND_AC97_POWER_SAVE_DEFAULT=0 -CONFIG_SND_SB_COMMON=m -CONFIG_SND_PCI=y -CONFIG_SND_AD1889=m -CONFIG_SND_ALS300=m -CONFIG_SND_ALS4000=m -CONFIG_SND_ALI5451=m -CONFIG_SND_ASIHPI=m -CONFIG_SND_ATIIXP=m -CONFIG_SND_ATIIXP_MODEM=m -CONFIG_SND_AU8810=m -CONFIG_SND_AU8820=m -CONFIG_SND_AU8830=m -CONFIG_SND_AW2=m -CONFIG_SND_AZT3328=m -CONFIG_SND_BT87X=m -# CONFIG_SND_BT87X_OVERCLOCK is not set -CONFIG_SND_CA0106=m -CONFIG_SND_CMIPCI=m -CONFIG_SND_OXYGEN_LIB=m -CONFIG_SND_OXYGEN=m -CONFIG_SND_CS4281=m -CONFIG_SND_CS46XX=m -CONFIG_SND_CS46XX_NEW_DSP=y -CONFIG_SND_CTXFI=m -CONFIG_SND_DARLA20=m -CONFIG_SND_GINA20=m -CONFIG_SND_LAYLA20=m -CONFIG_SND_DARLA24=m -CONFIG_SND_GINA24=m -CONFIG_SND_LAYLA24=m -CONFIG_SND_MONA=m -CONFIG_SND_MIA=m -CONFIG_SND_ECHO3G=m -CONFIG_SND_INDIGO=m -CONFIG_SND_INDIGOIO=m -CONFIG_SND_INDIGODJ=m -CONFIG_SND_INDIGOIOX=m -CONFIG_SND_INDIGODJX=m -CONFIG_SND_EMU10K1=m -CONFIG_SND_EMU10K1_SEQ=m -CONFIG_SND_EMU10K1X=m -CONFIG_SND_ENS1370=m -CONFIG_SND_ENS1371=m -CONFIG_SND_ES1938=m -CONFIG_SND_ES1968=m -CONFIG_SND_ES1968_INPUT=y -CONFIG_SND_ES1968_RADIO=y -CONFIG_SND_FM801=m -CONFIG_SND_FM801_TEA575X_BOOL=y -CONFIG_SND_HDSP=m -CONFIG_SND_HDSPM=m -CONFIG_SND_ICE1712=m -CONFIG_SND_ICE1724=m -CONFIG_SND_INTEL8X0=m -CONFIG_SND_INTEL8X0M=m -CONFIG_SND_KORG1212=m -CONFIG_SND_LOLA=m -CONFIG_SND_LX6464ES=m -CONFIG_SND_MAESTRO3=m -CONFIG_SND_MAESTRO3_INPUT=y -CONFIG_SND_MIXART=m -CONFIG_SND_NM256=m -CONFIG_SND_PCXHR=m -CONFIG_SND_RIPTIDE=m -CONFIG_SND_RME32=m -CONFIG_SND_RME96=m -CONFIG_SND_RME9652=m -CONFIG_SND_SONICVIBES=m 
-CONFIG_SND_TRIDENT=m -CONFIG_SND_VIA82XX=m -CONFIG_SND_VIA82XX_MODEM=m -CONFIG_SND_VIRTUOSO=m -CONFIG_SND_VX222=m -CONFIG_SND_YMFPCI=m - -# -# HD-Audio -# -CONFIG_SND_HDA=m -CONFIG_SND_HDA_INTEL=m -CONFIG_SND_HDA_HWDEP=y -CONFIG_SND_HDA_RECONFIG=y -CONFIG_SND_HDA_INPUT_BEEP=y -CONFIG_SND_HDA_INPUT_BEEP_MODE=1 -CONFIG_SND_HDA_PATCH_LOADER=y -CONFIG_SND_HDA_CODEC_REALTEK=m -CONFIG_SND_HDA_CODEC_ANALOG=m -CONFIG_SND_HDA_CODEC_SIGMATEL=m -CONFIG_SND_HDA_CODEC_VIA=m -CONFIG_SND_HDA_CODEC_HDMI=m -CONFIG_SND_HDA_CODEC_CIRRUS=m -CONFIG_SND_HDA_CODEC_CONEXANT=m -CONFIG_SND_HDA_CODEC_CA0110=m -CONFIG_SND_HDA_CODEC_CA0132=m -CONFIG_SND_HDA_CODEC_CA0132_DSP=y -CONFIG_SND_HDA_CODEC_CMEDIA=m -CONFIG_SND_HDA_CODEC_SI3054=m -CONFIG_SND_HDA_GENERIC=m -CONFIG_SND_HDA_POWER_SAVE_DEFAULT=0 -# end of HD-Audio - -CONFIG_SND_HDA_CORE=m -CONFIG_SND_HDA_DSP_LOADER=y -CONFIG_SND_HDA_COMPONENT=y -CONFIG_SND_HDA_I915=y -CONFIG_SND_HDA_EXT_CORE=m -CONFIG_SND_HDA_PREALLOC_SIZE=0 -CONFIG_SND_INTEL_NHLT=y -CONFIG_SND_INTEL_DSP_CONFIG=m -CONFIG_SND_SPI=y -CONFIG_SND_USB=y -CONFIG_SND_USB_AUDIO=m -CONFIG_SND_USB_AUDIO_USE_MEDIA_CONTROLLER=y -CONFIG_SND_USB_UA101=m -CONFIG_SND_USB_USX2Y=m -CONFIG_SND_USB_CAIAQ=m -CONFIG_SND_USB_CAIAQ_INPUT=y -CONFIG_SND_USB_US122L=m -CONFIG_SND_USB_6FIRE=m -CONFIG_SND_USB_HIFACE=m -CONFIG_SND_BCD2000=m -CONFIG_SND_USB_LINE6=m -CONFIG_SND_USB_POD=m -CONFIG_SND_USB_PODHD=m -CONFIG_SND_USB_TONEPORT=m -CONFIG_SND_USB_VARIAX=m -CONFIG_SND_FIREWIRE=y -CONFIG_SND_FIREWIRE_LIB=m -CONFIG_SND_DICE=m -CONFIG_SND_OXFW=m -CONFIG_SND_ISIGHT=m -CONFIG_SND_FIREWORKS=m -CONFIG_SND_BEBOB=m -CONFIG_SND_FIREWIRE_DIGI00X=m -CONFIG_SND_FIREWIRE_TASCAM=m -CONFIG_SND_FIREWIRE_MOTU=m -CONFIG_SND_FIREFACE=m -CONFIG_SND_PCMCIA=y -CONFIG_SND_VXPOCKET=m -CONFIG_SND_PDAUDIOCF=m -CONFIG_SND_SOC=m -CONFIG_SND_SOC_AC97_BUS=y -CONFIG_SND_SOC_GENERIC_DMAENGINE_PCM=y -CONFIG_SND_SOC_COMPRESS=y -CONFIG_SND_SOC_TOPOLOGY=y -CONFIG_SND_SOC_ACPI=m -CONFIG_SND_SOC_AMD_ACP=m 
-CONFIG_SND_SOC_AMD_CZ_DA7219MX98357_MACH=m -CONFIG_SND_SOC_AMD_CZ_RT5645_MACH=m -CONFIG_SND_SOC_AMD_ACP3x=m -CONFIG_SND_SOC_AMD_RV_RT5682_MACH=m -CONFIG_SND_ATMEL_SOC=m -CONFIG_SND_SOC_MIKROE_PROTO=m -CONFIG_SND_BCM63XX_I2S_WHISTLER=m -CONFIG_SND_DESIGNWARE_I2S=m -CONFIG_SND_DESIGNWARE_PCM=y - -# -# SoC Audio for Freescale CPUs -# - -# -# Common SoC Audio options for Freescale CPUs: -# -# CONFIG_SND_SOC_FSL_ASRC is not set -# CONFIG_SND_SOC_FSL_SAI is not set -# CONFIG_SND_SOC_FSL_AUDMIX is not set -# CONFIG_SND_SOC_FSL_SSI is not set -# CONFIG_SND_SOC_FSL_SPDIF is not set -# CONFIG_SND_SOC_FSL_ESAI is not set -# CONFIG_SND_SOC_FSL_MICFIL is not set -# CONFIG_SND_SOC_IMX_AUDMUX is not set -# end of SoC Audio for Freescale CPUs - -CONFIG_SND_I2S_HI6210_I2S=m -CONFIG_SND_SOC_IMG=y -CONFIG_SND_SOC_IMG_I2S_IN=m -CONFIG_SND_SOC_IMG_I2S_OUT=m -CONFIG_SND_SOC_IMG_PARALLEL_OUT=m -CONFIG_SND_SOC_IMG_SPDIF_IN=m -CONFIG_SND_SOC_IMG_SPDIF_OUT=m -CONFIG_SND_SOC_IMG_PISTACHIO_INTERNAL_DAC=m -CONFIG_SND_SOC_INTEL_SST_TOPLEVEL=y -CONFIG_SND_SST_IPC=m -CONFIG_SND_SST_IPC_PCI=m -CONFIG_SND_SST_IPC_ACPI=m -CONFIG_SND_SOC_INTEL_SST_ACPI=m -CONFIG_SND_SOC_INTEL_SST=m -CONFIG_SND_SOC_INTEL_SST_FIRMWARE=m -CONFIG_SND_SOC_INTEL_HASWELL=m -CONFIG_SND_SST_ATOM_HIFI2_PLATFORM=m -CONFIG_SND_SST_ATOM_HIFI2_PLATFORM_PCI=m -CONFIG_SND_SST_ATOM_HIFI2_PLATFORM_ACPI=m -CONFIG_SND_SOC_INTEL_SKYLAKE=m -CONFIG_SND_SOC_INTEL_SKL=m -CONFIG_SND_SOC_INTEL_APL=m -CONFIG_SND_SOC_INTEL_KBL=m -CONFIG_SND_SOC_INTEL_GLK=m -CONFIG_SND_SOC_INTEL_CNL=m -CONFIG_SND_SOC_INTEL_CFL=m -CONFIG_SND_SOC_INTEL_CML_H=m -CONFIG_SND_SOC_INTEL_CML_LP=m -CONFIG_SND_SOC_INTEL_SKYLAKE_FAMILY=m -CONFIG_SND_SOC_INTEL_SKYLAKE_SSP_CLK=m -# CONFIG_SND_SOC_INTEL_SKYLAKE_HDAUDIO_CODEC is not set -CONFIG_SND_SOC_INTEL_SKYLAKE_COMMON=m -CONFIG_SND_SOC_ACPI_INTEL_MATCH=m -CONFIG_SND_SOC_INTEL_MACH=y -# CONFIG_SND_SOC_INTEL_USER_FRIENDLY_LONG_NAMES is not set -CONFIG_SND_SOC_INTEL_HASWELL_MACH=m -CONFIG_SND_SOC_INTEL_BDW_RT5650_MACH=m 
-CONFIG_SND_SOC_INTEL_BDW_RT5677_MACH=m -CONFIG_SND_SOC_INTEL_BROADWELL_MACH=m -CONFIG_SND_SOC_INTEL_BYTCR_RT5640_MACH=m -CONFIG_SND_SOC_INTEL_BYTCR_RT5651_MACH=m -CONFIG_SND_SOC_INTEL_CHT_BSW_RT5672_MACH=m -CONFIG_SND_SOC_INTEL_CHT_BSW_RT5645_MACH=m -CONFIG_SND_SOC_INTEL_CHT_BSW_MAX98090_TI_MACH=m -CONFIG_SND_SOC_INTEL_CHT_BSW_NAU8824_MACH=m -CONFIG_SND_SOC_INTEL_BYT_CHT_CX2072X_MACH=m -CONFIG_SND_SOC_INTEL_BYT_CHT_DA7213_MACH=m -CONFIG_SND_SOC_INTEL_BYT_CHT_ES8316_MACH=m -# CONFIG_SND_SOC_INTEL_BYT_CHT_NOCODEC_MACH is not set -CONFIG_SND_SOC_INTEL_SKL_RT286_MACH=m -CONFIG_SND_SOC_INTEL_SKL_NAU88L25_SSM4567_MACH=m -CONFIG_SND_SOC_INTEL_SKL_NAU88L25_MAX98357A_MACH=m -CONFIG_SND_SOC_INTEL_DA7219_MAX98357A_GENERIC=m -CONFIG_SND_SOC_INTEL_BXT_DA7219_MAX98357A_COMMON=m -CONFIG_SND_SOC_INTEL_BXT_DA7219_MAX98357A_MACH=m -CONFIG_SND_SOC_INTEL_BXT_RT298_MACH=m -CONFIG_SND_SOC_INTEL_KBL_RT5663_MAX98927_MACH=m -CONFIG_SND_SOC_INTEL_KBL_RT5663_RT5514_MAX98927_MACH=m -CONFIG_SND_SOC_INTEL_KBL_DA7219_MAX98357A_MACH=m -CONFIG_SND_SOC_INTEL_KBL_DA7219_MAX98927_MACH=m -CONFIG_SND_SOC_INTEL_KBL_RT5660_MACH=m -CONFIG_SND_SOC_INTEL_GLK_DA7219_MAX98357A_MACH=m -CONFIG_SND_SOC_INTEL_GLK_RT5682_MAX98357A_MACH=m -CONFIG_SND_SOC_INTEL_SKL_HDA_DSP_GENERIC_MACH=m -CONFIG_SND_SOC_INTEL_SOF_RT5682_MACH=m -CONFIG_SND_SOC_INTEL_SOF_PCM512x_MACH=m -CONFIG_SND_SOC_INTEL_CML_LP_DA7219_MAX98357A_MACH=m -CONFIG_SND_SOC_INTEL_SOF_CML_RT1011_RT5682_MACH=m -CONFIG_SND_SOC_INTEL_SOF_DA7219_MAX98373_MACH=m -CONFIG_SND_SOC_MTK_BTCVSD=m -CONFIG_SND_SOC_SOF_TOPLEVEL=y -CONFIG_SND_SOC_SOF_PCI=m -CONFIG_SND_SOC_SOF_ACPI=m -CONFIG_SND_SOC_SOF_OF=m -# CONFIG_SND_SOC_SOF_DEBUG_PROBES is not set -# CONFIG_SND_SOC_SOF_DEVELOPER_SUPPORT is not set -CONFIG_SND_SOC_SOF=m -CONFIG_SND_SOC_SOF_PROBE_WORK_QUEUE=y -CONFIG_SND_SOC_SOF_INTEL_TOPLEVEL=y -CONFIG_SND_SOC_SOF_INTEL_ACPI=m -CONFIG_SND_SOC_SOF_INTEL_PCI=m -CONFIG_SND_SOC_SOF_INTEL_HIFI_EP_IPC=m -CONFIG_SND_SOC_SOF_INTEL_ATOM_HIFI_EP=m 
-CONFIG_SND_SOC_SOF_INTEL_COMMON=m -CONFIG_SND_SOC_SOF_MERRIFIELD_SUPPORT=y -CONFIG_SND_SOC_SOF_MERRIFIELD=m -CONFIG_SND_SOC_SOF_APOLLOLAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_APOLLOLAKE=m -CONFIG_SND_SOC_SOF_GEMINILAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_GEMINILAKE=m -CONFIG_SND_SOC_SOF_CANNONLAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_CANNONLAKE=m -CONFIG_SND_SOC_SOF_COFFEELAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_COFFEELAKE=m -CONFIG_SND_SOC_SOF_ICELAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_ICELAKE=m -CONFIG_SND_SOC_SOF_COMETLAKE_LP=m -CONFIG_SND_SOC_SOF_COMETLAKE_LP_SUPPORT=y -CONFIG_SND_SOC_SOF_COMETLAKE_H=m -CONFIG_SND_SOC_SOF_COMETLAKE_H_SUPPORT=y -CONFIG_SND_SOC_SOF_TIGERLAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_TIGERLAKE=m -CONFIG_SND_SOC_SOF_ELKHARTLAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_ELKHARTLAKE=m -CONFIG_SND_SOC_SOF_JASPERLAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_JASPERLAKE=m -CONFIG_SND_SOC_SOF_HDA_COMMON=m -CONFIG_SND_SOC_SOF_HDA_LINK=y -CONFIG_SND_SOC_SOF_HDA_AUDIO_CODEC=y -# CONFIG_SND_SOC_SOF_HDA_ALWAYS_ENABLE_DMI_L1 is not set -CONFIG_SND_SOC_SOF_HDA_LINK_BASELINE=m -CONFIG_SND_SOC_SOF_HDA=m -CONFIG_SND_SOC_SOF_XTENSA=m - -# -# STMicroelectronics STM32 SOC audio support -# -# end of STMicroelectronics STM32 SOC audio support - -CONFIG_SND_SOC_XILINX_I2S=m -CONFIG_SND_SOC_XILINX_AUDIO_FORMATTER=m -CONFIG_SND_SOC_XILINX_SPDIF=m -CONFIG_SND_SOC_XTFPGA_I2S=m -CONFIG_ZX_TDM=m -CONFIG_SND_SOC_I2C_AND_SPI=m - -# -# CODEC drivers -# -CONFIG_SND_SOC_AC97_CODEC=m -CONFIG_SND_SOC_ADAU_UTILS=m -CONFIG_SND_SOC_ADAU1701=m -CONFIG_SND_SOC_ADAU17X1=m -CONFIG_SND_SOC_ADAU1761=m -CONFIG_SND_SOC_ADAU1761_I2C=m -CONFIG_SND_SOC_ADAU1761_SPI=m -CONFIG_SND_SOC_ADAU7002=m -CONFIG_SND_SOC_ADAU7118=m -CONFIG_SND_SOC_ADAU7118_HW=m -CONFIG_SND_SOC_ADAU7118_I2C=m -CONFIG_SND_SOC_AK4104=m -CONFIG_SND_SOC_AK4118=m -CONFIG_SND_SOC_AK4458=m -CONFIG_SND_SOC_AK4554=m -CONFIG_SND_SOC_AK4613=m -CONFIG_SND_SOC_AK4642=m -CONFIG_SND_SOC_AK5386=m -CONFIG_SND_SOC_AK5558=m -CONFIG_SND_SOC_ALC5623=m -CONFIG_SND_SOC_BD28623=m -# 
CONFIG_SND_SOC_BT_SCO is not set -CONFIG_SND_SOC_CPCAP=m -CONFIG_SND_SOC_CROS_EC_CODEC=m -CONFIG_SND_SOC_CS35L32=m -CONFIG_SND_SOC_CS35L33=m -CONFIG_SND_SOC_CS35L34=m -CONFIG_SND_SOC_CS35L35=m -CONFIG_SND_SOC_CS35L36=m -CONFIG_SND_SOC_CS42L42=m -CONFIG_SND_SOC_CS42L51=m -CONFIG_SND_SOC_CS42L51_I2C=m -CONFIG_SND_SOC_CS42L52=m -CONFIG_SND_SOC_CS42L56=m -CONFIG_SND_SOC_CS42L73=m -CONFIG_SND_SOC_CS4265=m -CONFIG_SND_SOC_CS4270=m -CONFIG_SND_SOC_CS4271=m -CONFIG_SND_SOC_CS4271_I2C=m -CONFIG_SND_SOC_CS4271_SPI=m -CONFIG_SND_SOC_CS42XX8=m -CONFIG_SND_SOC_CS42XX8_I2C=m -CONFIG_SND_SOC_CS43130=m -CONFIG_SND_SOC_CS4341=m -CONFIG_SND_SOC_CS4349=m -CONFIG_SND_SOC_CS53L30=m -CONFIG_SND_SOC_CX2072X=m -CONFIG_SND_SOC_DA7213=m -CONFIG_SND_SOC_DA7219=m -CONFIG_SND_SOC_DMIC=m -CONFIG_SND_SOC_HDMI_CODEC=m -CONFIG_SND_SOC_ES7134=m -CONFIG_SND_SOC_ES7241=m -CONFIG_SND_SOC_ES8316=m -CONFIG_SND_SOC_ES8328=m -CONFIG_SND_SOC_ES8328_I2C=m -CONFIG_SND_SOC_ES8328_SPI=m -CONFIG_SND_SOC_GTM601=m -CONFIG_SND_SOC_HDAC_HDMI=m -CONFIG_SND_SOC_HDAC_HDA=m -CONFIG_SND_SOC_INNO_RK3036=m -CONFIG_SND_SOC_LOCHNAGAR_SC=m -CONFIG_SND_SOC_MAX98088=m -CONFIG_SND_SOC_MAX98090=m -CONFIG_SND_SOC_MAX98357A=m -CONFIG_SND_SOC_MAX98504=m -CONFIG_SND_SOC_MAX9867=m -CONFIG_SND_SOC_MAX98927=m -CONFIG_SND_SOC_MAX98373=m -CONFIG_SND_SOC_MAX9860=m -CONFIG_SND_SOC_MSM8916_WCD_ANALOG=m -CONFIG_SND_SOC_MSM8916_WCD_DIGITAL=m -CONFIG_SND_SOC_PCM1681=m -CONFIG_SND_SOC_PCM1789=m -CONFIG_SND_SOC_PCM1789_I2C=m -CONFIG_SND_SOC_PCM179X=m -CONFIG_SND_SOC_PCM179X_I2C=m -CONFIG_SND_SOC_PCM179X_SPI=m -CONFIG_SND_SOC_PCM186X=m -CONFIG_SND_SOC_PCM186X_I2C=m -CONFIG_SND_SOC_PCM186X_SPI=m -CONFIG_SND_SOC_PCM3060=m -CONFIG_SND_SOC_PCM3060_I2C=m -CONFIG_SND_SOC_PCM3060_SPI=m -CONFIG_SND_SOC_PCM3168A=m -CONFIG_SND_SOC_PCM3168A_I2C=m -CONFIG_SND_SOC_PCM3168A_SPI=m -CONFIG_SND_SOC_PCM512x=m -CONFIG_SND_SOC_PCM512x_I2C=m -CONFIG_SND_SOC_PCM512x_SPI=m -CONFIG_SND_SOC_RK3328=m -CONFIG_SND_SOC_RL6231=m -CONFIG_SND_SOC_RL6347A=m 
-CONFIG_SND_SOC_RT286=m -CONFIG_SND_SOC_RT298=m -CONFIG_SND_SOC_RT1011=m -CONFIG_SND_SOC_RT1015=m -CONFIG_SND_SOC_RT1308_SDW=m -CONFIG_SND_SOC_RT5514=m -CONFIG_SND_SOC_RT5514_SPI=m -CONFIG_SND_SOC_RT5616=m -CONFIG_SND_SOC_RT5631=m -CONFIG_SND_SOC_RT5640=m -CONFIG_SND_SOC_RT5645=m -CONFIG_SND_SOC_RT5651=m -CONFIG_SND_SOC_RT5660=m -CONFIG_SND_SOC_RT5663=m -CONFIG_SND_SOC_RT5670=m -CONFIG_SND_SOC_RT5677=m -CONFIG_SND_SOC_RT5677_SPI=m -CONFIG_SND_SOC_RT5682=m -CONFIG_SND_SOC_RT5682_SDW=m -CONFIG_SND_SOC_RT700=m -CONFIG_SND_SOC_RT700_SDW=m -CONFIG_SND_SOC_RT711=m -CONFIG_SND_SOC_RT711_SDW=m -CONFIG_SND_SOC_RT715=m -CONFIG_SND_SOC_RT715_SDW=m -CONFIG_SND_SOC_SGTL5000=m -CONFIG_SND_SOC_SI476X=m -CONFIG_SND_SOC_SIGMADSP=m -CONFIG_SND_SOC_SIGMADSP_I2C=m -CONFIG_SND_SOC_SIGMADSP_REGMAP=m -CONFIG_SND_SOC_SIMPLE_AMPLIFIER=m -CONFIG_SND_SOC_SIRF_AUDIO_CODEC=m -CONFIG_SND_SOC_SPDIF=m -CONFIG_SND_SOC_SSM2305=m -CONFIG_SND_SOC_SSM2602=m -CONFIG_SND_SOC_SSM2602_SPI=m -CONFIG_SND_SOC_SSM2602_I2C=m -CONFIG_SND_SOC_SSM4567=m -CONFIG_SND_SOC_STA32X=m -CONFIG_SND_SOC_STA350=m -CONFIG_SND_SOC_STI_SAS=m -CONFIG_SND_SOC_TAS2552=m -CONFIG_SND_SOC_TAS2562=m -CONFIG_SND_SOC_TAS2770=m -CONFIG_SND_SOC_TAS5086=m -CONFIG_SND_SOC_TAS571X=m -CONFIG_SND_SOC_TAS5720=m -CONFIG_SND_SOC_TAS6424=m -CONFIG_SND_SOC_TDA7419=m -CONFIG_SND_SOC_TFA9879=m -CONFIG_SND_SOC_TLV320AIC23=m -CONFIG_SND_SOC_TLV320AIC23_I2C=m -CONFIG_SND_SOC_TLV320AIC23_SPI=m -CONFIG_SND_SOC_TLV320AIC31XX=m -CONFIG_SND_SOC_TLV320AIC32X4=m -CONFIG_SND_SOC_TLV320AIC32X4_I2C=m -CONFIG_SND_SOC_TLV320AIC32X4_SPI=m -CONFIG_SND_SOC_TLV320AIC3X=m -CONFIG_SND_SOC_TLV320ADCX140=m -CONFIG_SND_SOC_TS3A227E=m -CONFIG_SND_SOC_TSCS42XX=m -CONFIG_SND_SOC_TSCS454=m -CONFIG_SND_SOC_UDA1334=m -CONFIG_SND_SOC_WCD9335=m -CONFIG_SND_SOC_WCD934X=m -CONFIG_SND_SOC_WM8510=m -CONFIG_SND_SOC_WM8523=m -CONFIG_SND_SOC_WM8524=m -CONFIG_SND_SOC_WM8580=m -CONFIG_SND_SOC_WM8711=m -CONFIG_SND_SOC_WM8728=m -CONFIG_SND_SOC_WM8731=m -CONFIG_SND_SOC_WM8737=m 
-CONFIG_SND_SOC_WM8741=m -CONFIG_SND_SOC_WM8750=m -CONFIG_SND_SOC_WM8753=m -CONFIG_SND_SOC_WM8770=m -CONFIG_SND_SOC_WM8776=m -CONFIG_SND_SOC_WM8782=m -CONFIG_SND_SOC_WM8804=m -CONFIG_SND_SOC_WM8804_I2C=m -CONFIG_SND_SOC_WM8804_SPI=m -CONFIG_SND_SOC_WM8903=m -CONFIG_SND_SOC_WM8904=m -CONFIG_SND_SOC_WM8960=m -CONFIG_SND_SOC_WM8962=m -CONFIG_SND_SOC_WM8974=m -CONFIG_SND_SOC_WM8978=m -CONFIG_SND_SOC_WM8985=m -CONFIG_SND_SOC_WSA881X=m -CONFIG_SND_SOC_ZX_AUD96P22=m -CONFIG_SND_SOC_MAX9759=m -CONFIG_SND_SOC_MT6351=m -CONFIG_SND_SOC_MT6358=m -CONFIG_SND_SOC_MT6660=m -CONFIG_SND_SOC_NAU8540=m -CONFIG_SND_SOC_NAU8810=m -CONFIG_SND_SOC_NAU8822=m -CONFIG_SND_SOC_NAU8824=m -CONFIG_SND_SOC_NAU8825=m -CONFIG_SND_SOC_TPA6130A2=m -# end of CODEC drivers - -CONFIG_SND_SIMPLE_CARD_UTILS=m -CONFIG_SND_SIMPLE_CARD=m -CONFIG_SND_AUDIO_GRAPH_CARD=m -CONFIG_SND_X86=y -CONFIG_HDMI_LPE_AUDIO=m -CONFIG_SND_SYNTH_EMUX=m -CONFIG_SND_XEN_FRONTEND=m -CONFIG_AC97_BUS=m - -# -# HID support -# -CONFIG_HID=m -CONFIG_HID_BATTERY_STRENGTH=y -CONFIG_HIDRAW=y -CONFIG_UHID=m -CONFIG_HID_GENERIC=m - -# -# Special HID drivers -# -CONFIG_HID_A4TECH=m -CONFIG_HID_ACCUTOUCH=m -CONFIG_HID_ACRUX=m -CONFIG_HID_ACRUX_FF=y -CONFIG_HID_APPLE=m -CONFIG_HID_APPLEIR=m -CONFIG_HID_ASUS=m -CONFIG_HID_AUREAL=m -CONFIG_HID_BELKIN=m -CONFIG_HID_BETOP_FF=m -CONFIG_HID_BIGBEN_FF=m -CONFIG_HID_CHERRY=m -CONFIG_HID_CHICONY=m -CONFIG_HID_CORSAIR=m -CONFIG_HID_COUGAR=m -CONFIG_HID_MACALLY=m -CONFIG_HID_PRODIKEYS=m -CONFIG_HID_CMEDIA=m -CONFIG_HID_CP2112=m -CONFIG_HID_CREATIVE_SB0540=m -CONFIG_HID_CYPRESS=m -CONFIG_HID_DRAGONRISE=m -CONFIG_DRAGONRISE_FF=y -CONFIG_HID_EMS_FF=m -CONFIG_HID_ELAN=m -CONFIG_HID_ELECOM=m -CONFIG_HID_ELO=m -CONFIG_HID_EZKEY=m -CONFIG_HID_GEMBIRD=m -CONFIG_HID_GFRM=m -CONFIG_HID_GLORIOUS=m -CONFIG_HID_HOLTEK=m -CONFIG_HOLTEK_FF=y -CONFIG_HID_GOOGLE_HAMMER=m -CONFIG_HID_GT683R=m -CONFIG_HID_KEYTOUCH=m -CONFIG_HID_KYE=m -CONFIG_HID_UCLOGIC=m -CONFIG_HID_WALTOP=m -CONFIG_HID_VIEWSONIC=m 
-CONFIG_HID_GYRATION=m -CONFIG_HID_ICADE=m -CONFIG_HID_ITE=m -CONFIG_HID_JABRA=m -CONFIG_HID_TWINHAN=m -CONFIG_HID_KENSINGTON=m -CONFIG_HID_LCPOWER=m -CONFIG_HID_LED=m -CONFIG_HID_LENOVO=m -CONFIG_HID_LOGITECH=m -CONFIG_HID_LOGITECH_DJ=m -CONFIG_HID_LOGITECH_HIDPP=m -CONFIG_LOGITECH_FF=y -CONFIG_LOGIRUMBLEPAD2_FF=y -CONFIG_LOGIG940_FF=y -CONFIG_LOGIWHEELS_FF=y -CONFIG_HID_MAGICMOUSE=m -CONFIG_HID_MALTRON=m -CONFIG_HID_MAYFLASH=m -CONFIG_HID_REDRAGON=m -CONFIG_HID_MICROSOFT=m -CONFIG_HID_MONTEREY=m -CONFIG_HID_MULTITOUCH=m -CONFIG_HID_NTI=m -CONFIG_HID_NTRIG=m -CONFIG_HID_ORTEK=m -CONFIG_HID_PANTHERLORD=m -CONFIG_PANTHERLORD_FF=y -CONFIG_HID_PENMOUNT=m -CONFIG_HID_PETALYNX=m -CONFIG_HID_PICOLCD=m -CONFIG_HID_PICOLCD_FB=y -CONFIG_HID_PICOLCD_BACKLIGHT=y -CONFIG_HID_PICOLCD_LCD=y -CONFIG_HID_PICOLCD_LEDS=y -CONFIG_HID_PICOLCD_CIR=y -CONFIG_HID_PLANTRONICS=m -CONFIG_HID_PRIMAX=m -CONFIG_HID_RETRODE=m -CONFIG_HID_ROCCAT=m -CONFIG_HID_SAITEK=m -CONFIG_HID_SAMSUNG=m -CONFIG_HID_SONY=m -CONFIG_SONY_FF=y -CONFIG_HID_SPEEDLINK=m -CONFIG_HID_STEAM=m -CONFIG_HID_STEELSERIES=m -CONFIG_HID_SUNPLUS=m -CONFIG_HID_RMI=m -CONFIG_HID_GREENASIA=m -CONFIG_GREENASIA_FF=y -CONFIG_HID_HYPERV_MOUSE=m -CONFIG_HID_SMARTJOYPLUS=m -CONFIG_SMARTJOYPLUS_FF=y -CONFIG_HID_TIVO=m -CONFIG_HID_TOPSEED=m -CONFIG_HID_THINGM=m -CONFIG_HID_THRUSTMASTER=m -CONFIG_THRUSTMASTER_FF=y -CONFIG_HID_UDRAW_PS3=m -CONFIG_HID_U2FZERO=m -CONFIG_HID_WACOM=m -CONFIG_HID_WIIMOTE=m -CONFIG_HID_XINMO=m -CONFIG_HID_ZEROPLUS=m -CONFIG_ZEROPLUS_FF=y -CONFIG_HID_ZYDACRON=m -CONFIG_HID_SENSOR_HUB=m -# CONFIG_HID_SENSOR_CUSTOM_SENSOR is not set -CONFIG_HID_ALPS=m -CONFIG_HID_MCP2221=m -# end of Special HID drivers - -# -# USB HID support -# -CONFIG_USB_HID=m -CONFIG_HID_PID=y -CONFIG_USB_HIDDEV=y - -# -# USB HID Boot Protocol drivers -# -# CONFIG_USB_KBD is not set -# CONFIG_USB_MOUSE is not set -# end of USB HID Boot Protocol drivers -# end of USB HID support - -# -# I2C HID support -# -CONFIG_I2C_HID=m -# end of I2C HID 
support - -# -# Intel ISH HID support -# -CONFIG_INTEL_ISH_HID=m -CONFIG_INTEL_ISH_FIRMWARE_DOWNLOADER=m -# end of Intel ISH HID support -# end of HID support - -CONFIG_USB_OHCI_LITTLE_ENDIAN=y -CONFIG_USB_SUPPORT=y -CONFIG_USB_COMMON=y -CONFIG_USB_LED_TRIG=y -CONFIG_USB_ULPI_BUS=m -CONFIG_USB_CONN_GPIO=m -CONFIG_USB_ARCH_HAS_HCD=y -CONFIG_USB=y -CONFIG_USB_PCI=y -CONFIG_USB_ANNOUNCE_NEW_DEVICES=y - -# -# Miscellaneous USB options -# -CONFIG_USB_DEFAULT_PERSIST=y -CONFIG_USB_DYNAMIC_MINORS=y -# CONFIG_USB_OTG is not set -# CONFIG_USB_OTG_WHITELIST is not set -# CONFIG_USB_OTG_BLACKLIST_HUB is not set -CONFIG_USB_LEDS_TRIGGER_USBPORT=m -CONFIG_USB_AUTOSUSPEND_DELAY=2 -CONFIG_USB_MON=m - -# -# USB Host Controller Drivers -# -CONFIG_USB_C67X00_HCD=m -CONFIG_USB_XHCI_HCD=m -# CONFIG_USB_XHCI_DBGCAP is not set -CONFIG_USB_XHCI_PCI=m -CONFIG_USB_XHCI_PLATFORM=m -CONFIG_USB_EHCI_HCD=m -CONFIG_USB_EHCI_ROOT_HUB_TT=y -CONFIG_USB_EHCI_TT_NEWSCHED=y -CONFIG_USB_EHCI_PCI=m -CONFIG_USB_EHCI_FSL=m -CONFIG_USB_EHCI_HCD_PLATFORM=m -CONFIG_USB_OXU210HP_HCD=m -CONFIG_USB_ISP116X_HCD=m -CONFIG_USB_FOTG210_HCD=m -CONFIG_USB_MAX3421_HCD=m -CONFIG_USB_OHCI_HCD=m -CONFIG_USB_OHCI_HCD_PCI=m -# CONFIG_USB_OHCI_HCD_SSB is not set -CONFIG_USB_OHCI_HCD_PLATFORM=m -CONFIG_USB_UHCI_HCD=m -CONFIG_USB_U132_HCD=m -CONFIG_USB_SL811_HCD=m -# CONFIG_USB_SL811_HCD_ISO is not set -CONFIG_USB_SL811_CS=m -CONFIG_USB_R8A66597_HCD=m -CONFIG_USB_HCD_BCMA=m -CONFIG_USB_HCD_SSB=m -# CONFIG_USB_HCD_TEST_MODE is not set - -# -# USB Device Class drivers -# -CONFIG_USB_ACM=m -CONFIG_USB_PRINTER=m -CONFIG_USB_WDM=m -CONFIG_USB_TMC=m - -# -# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may -# - -# -# also be needed; see USB_STORAGE Help for more info -# -CONFIG_USB_STORAGE=m -# CONFIG_USB_STORAGE_DEBUG is not set -CONFIG_USB_STORAGE_REALTEK=m -CONFIG_REALTEK_AUTOPM=y -CONFIG_USB_STORAGE_DATAFAB=m -CONFIG_USB_STORAGE_FREECOM=m -CONFIG_USB_STORAGE_ISD200=m -CONFIG_USB_STORAGE_USBAT=m -CONFIG_USB_STORAGE_SDDR09=m 
-CONFIG_USB_STORAGE_SDDR55=m -CONFIG_USB_STORAGE_JUMPSHOT=m -CONFIG_USB_STORAGE_ALAUDA=m -CONFIG_USB_STORAGE_ONETOUCH=m -CONFIG_USB_STORAGE_KARMA=m -CONFIG_USB_STORAGE_CYPRESS_ATACB=m -CONFIG_USB_STORAGE_ENE_UB6250=m -CONFIG_USB_UAS=m - -# -# USB Imaging devices -# -CONFIG_USB_MDC800=m -CONFIG_USB_MICROTEK=m -CONFIG_USBIP_CORE=m -CONFIG_USBIP_VHCI_HCD=m -CONFIG_USBIP_VHCI_HC_PORTS=8 -CONFIG_USBIP_VHCI_NR_HCS=1 -CONFIG_USBIP_HOST=m -CONFIG_USBIP_VUDC=m -# CONFIG_USBIP_DEBUG is not set -CONFIG_USB_CDNS3=m -CONFIG_USB_CDNS3_GADGET=y -CONFIG_USB_CDNS3_HOST=y -CONFIG_USB_CDNS3_PCI_WRAP=m -CONFIG_USB_MUSB_HDRC=m -# CONFIG_USB_MUSB_HOST is not set -# CONFIG_USB_MUSB_GADGET is not set -CONFIG_USB_MUSB_DUAL_ROLE=y - -# -# Platform Glue Layer -# - -# -# MUSB DMA mode -# -# CONFIG_MUSB_PIO_ONLY is not set -CONFIG_USB_DWC3=m -CONFIG_USB_DWC3_ULPI=y -# CONFIG_USB_DWC3_HOST is not set -# CONFIG_USB_DWC3_GADGET is not set -CONFIG_USB_DWC3_DUAL_ROLE=y - -# -# Platform Glue Driver Support -# -CONFIG_USB_DWC3_PCI=m -CONFIG_USB_DWC3_HAPS=m -CONFIG_USB_DWC3_OF_SIMPLE=m -CONFIG_USB_DWC2=m -# CONFIG_USB_DWC2_HOST is not set - -# -# Gadget/Dual-role mode requires USB Gadget support to be enabled -# -# CONFIG_USB_DWC2_PERIPHERAL is not set -CONFIG_USB_DWC2_DUAL_ROLE=y -CONFIG_USB_DWC2_PCI=m -# CONFIG_USB_DWC2_DEBUG is not set -# CONFIG_USB_DWC2_TRACK_MISSED_SOFS is not set -CONFIG_USB_CHIPIDEA=m -CONFIG_USB_CHIPIDEA_OF=m -CONFIG_USB_CHIPIDEA_PCI=m -CONFIG_USB_CHIPIDEA_UDC=y -CONFIG_USB_CHIPIDEA_HOST=y -CONFIG_USB_ISP1760=m -CONFIG_USB_ISP1760_HCD=y -CONFIG_USB_ISP1761_UDC=y -# CONFIG_USB_ISP1760_HOST_ROLE is not set -# CONFIG_USB_ISP1760_GADGET_ROLE is not set -CONFIG_USB_ISP1760_DUAL_ROLE=y - -# -# USB port drivers -# -CONFIG_USB_USS720=m -CONFIG_USB_SERIAL=y -CONFIG_USB_SERIAL_CONSOLE=y -CONFIG_USB_SERIAL_GENERIC=y -CONFIG_USB_SERIAL_SIMPLE=m -CONFIG_USB_SERIAL_AIRCABLE=m -CONFIG_USB_SERIAL_ARK3116=m -CONFIG_USB_SERIAL_BELKIN=m -CONFIG_USB_SERIAL_CH341=m -CONFIG_USB_SERIAL_WHITEHEAT=m 
-CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m -CONFIG_USB_SERIAL_CP210X=m -CONFIG_USB_SERIAL_CYPRESS_M8=m -CONFIG_USB_SERIAL_EMPEG=m -CONFIG_USB_SERIAL_FTDI_SIO=m -CONFIG_USB_SERIAL_VISOR=m -CONFIG_USB_SERIAL_IPAQ=m -CONFIG_USB_SERIAL_IR=m -CONFIG_USB_SERIAL_EDGEPORT=m -CONFIG_USB_SERIAL_EDGEPORT_TI=m -CONFIG_USB_SERIAL_F81232=m -CONFIG_USB_SERIAL_F8153X=m -CONFIG_USB_SERIAL_GARMIN=m -CONFIG_USB_SERIAL_IPW=m -CONFIG_USB_SERIAL_IUU=m -CONFIG_USB_SERIAL_KEYSPAN_PDA=m -CONFIG_USB_SERIAL_KEYSPAN=m -CONFIG_USB_SERIAL_KLSI=m -CONFIG_USB_SERIAL_KOBIL_SCT=m -CONFIG_USB_SERIAL_MCT_U232=m -CONFIG_USB_SERIAL_METRO=m -CONFIG_USB_SERIAL_MOS7720=m -CONFIG_USB_SERIAL_MOS7715_PARPORT=y -CONFIG_USB_SERIAL_MOS7840=m -CONFIG_USB_SERIAL_MXUPORT=m -CONFIG_USB_SERIAL_NAVMAN=m -CONFIG_USB_SERIAL_PL2303=m -CONFIG_USB_SERIAL_OTI6858=m -CONFIG_USB_SERIAL_QCAUX=m -CONFIG_USB_SERIAL_QUALCOMM=m -CONFIG_USB_SERIAL_SPCP8X5=m -CONFIG_USB_SERIAL_SAFE=m -# CONFIG_USB_SERIAL_SAFE_PADDED is not set -CONFIG_USB_SERIAL_SIERRAWIRELESS=m -CONFIG_USB_SERIAL_SYMBOL=m -CONFIG_USB_SERIAL_TI=m -CONFIG_USB_SERIAL_CYBERJACK=m -CONFIG_USB_SERIAL_XIRCOM=m -CONFIG_USB_SERIAL_WWAN=m -CONFIG_USB_SERIAL_OPTION=m -CONFIG_USB_SERIAL_OMNINET=m -CONFIG_USB_SERIAL_OPTICON=m -CONFIG_USB_SERIAL_XSENS_MT=m -CONFIG_USB_SERIAL_WISHBONE=m -CONFIG_USB_SERIAL_SSU100=m -CONFIG_USB_SERIAL_QT2=m -CONFIG_USB_SERIAL_UPD78F0730=m -CONFIG_USB_SERIAL_DEBUG=m - -# -# USB Miscellaneous drivers -# -CONFIG_USB_EMI62=m -CONFIG_USB_EMI26=m -CONFIG_USB_ADUTUX=m -CONFIG_USB_SEVSEG=m -CONFIG_USB_LEGOTOWER=m -CONFIG_USB_LCD=m -CONFIG_USB_CYPRESS_CY7C63=m -CONFIG_USB_CYTHERM=m -CONFIG_USB_IDMOUSE=m -CONFIG_USB_FTDI_ELAN=m -CONFIG_USB_APPLEDISPLAY=m -CONFIG_APPLE_MFI_FASTCHARGE=m -CONFIG_USB_SISUSBVGA=m -CONFIG_USB_SISUSBVGA_CON=y -CONFIG_USB_LD=m -CONFIG_USB_TRANCEVIBRATOR=m -CONFIG_USB_IOWARRIOR=m -CONFIG_USB_TEST=m -CONFIG_USB_EHSET_TEST_FIXTURE=m -CONFIG_USB_ISIGHTFW=m -CONFIG_USB_YUREX=m -CONFIG_USB_EZUSB_FX2=m -CONFIG_USB_HUB_USB251XB=m 
-CONFIG_USB_HSIC_USB3503=m -CONFIG_USB_HSIC_USB4604=m -CONFIG_USB_LINK_LAYER_TEST=m -CONFIG_USB_CHAOSKEY=m -CONFIG_USB_ATM=m -CONFIG_USB_SPEEDTOUCH=m -CONFIG_USB_CXACRU=m -CONFIG_USB_UEAGLEATM=m -CONFIG_USB_XUSBATM=m - -# -# USB Physical Layer drivers -# -CONFIG_USB_PHY=y -CONFIG_NOP_USB_XCEIV=m -CONFIG_USB_GPIO_VBUS=m -CONFIG_TAHVO_USB=m -# CONFIG_TAHVO_USB_HOST_BY_DEFAULT is not set -CONFIG_USB_ISP1301=m -# end of USB Physical Layer drivers - -CONFIG_USB_GADGET=m -# CONFIG_USB_GADGET_DEBUG is not set -# CONFIG_USB_GADGET_DEBUG_FILES is not set -# CONFIG_USB_GADGET_DEBUG_FS is not set -CONFIG_USB_GADGET_VBUS_DRAW=2 -CONFIG_USB_GADGET_STORAGE_NUM_BUFFERS=2 -CONFIG_U_SERIAL_CONSOLE=y - -# -# USB Peripheral Controller -# -CONFIG_USB_FOTG210_UDC=m -CONFIG_USB_GR_UDC=m -CONFIG_USB_R8A66597=m -CONFIG_USB_PXA27X=m -CONFIG_USB_MV_UDC=m -CONFIG_USB_MV_U3D=m -CONFIG_USB_SNP_CORE=m -CONFIG_USB_SNP_UDC_PLAT=m -CONFIG_USB_M66592=m -CONFIG_USB_BDC_UDC=m - -# -# Platform Support -# -CONFIG_USB_BDC_PCI=m -CONFIG_USB_AMD5536UDC=m -CONFIG_USB_NET2272=m -CONFIG_USB_NET2272_DMA=y -CONFIG_USB_NET2280=m -CONFIG_USB_GOKU=m -CONFIG_USB_EG20T=m -CONFIG_USB_GADGET_XILINX=m -CONFIG_USB_MAX3420_UDC=m -CONFIG_USB_DUMMY_HCD=m -# end of USB Peripheral Controller - -CONFIG_USB_LIBCOMPOSITE=m -CONFIG_USB_F_ACM=m -CONFIG_USB_F_SS_LB=m -CONFIG_USB_U_SERIAL=m -CONFIG_USB_U_ETHER=m -CONFIG_USB_U_AUDIO=m -CONFIG_USB_F_SERIAL=m -CONFIG_USB_F_OBEX=m -CONFIG_USB_F_NCM=m -CONFIG_USB_F_ECM=m -CONFIG_USB_F_PHONET=m -CONFIG_USB_F_EEM=m -CONFIG_USB_F_SUBSET=m -CONFIG_USB_F_RNDIS=m -CONFIG_USB_F_MASS_STORAGE=m -CONFIG_USB_F_FS=m -CONFIG_USB_F_UAC1=m -CONFIG_USB_F_UAC1_LEGACY=m -CONFIG_USB_F_UAC2=m -CONFIG_USB_F_UVC=m -CONFIG_USB_F_MIDI=m -CONFIG_USB_F_HID=m -CONFIG_USB_F_PRINTER=m -CONFIG_USB_F_TCM=m -CONFIG_USB_CONFIGFS=m -CONFIG_USB_CONFIGFS_SERIAL=y -CONFIG_USB_CONFIGFS_ACM=y -CONFIG_USB_CONFIGFS_OBEX=y -CONFIG_USB_CONFIGFS_NCM=y -CONFIG_USB_CONFIGFS_ECM=y -CONFIG_USB_CONFIGFS_ECM_SUBSET=y 
-CONFIG_USB_CONFIGFS_RNDIS=y -CONFIG_USB_CONFIGFS_EEM=y -CONFIG_USB_CONFIGFS_PHONET=y -CONFIG_USB_CONFIGFS_MASS_STORAGE=y -CONFIG_USB_CONFIGFS_F_LB_SS=y -CONFIG_USB_CONFIGFS_F_FS=y -CONFIG_USB_CONFIGFS_F_UAC1=y -CONFIG_USB_CONFIGFS_F_UAC1_LEGACY=y -CONFIG_USB_CONFIGFS_F_UAC2=y -CONFIG_USB_CONFIGFS_F_MIDI=y -CONFIG_USB_CONFIGFS_F_HID=y -CONFIG_USB_CONFIGFS_F_UVC=y -CONFIG_USB_CONFIGFS_F_PRINTER=y -CONFIG_USB_CONFIGFS_F_TCM=y - -# -# USB Gadget precomposed configurations -# -CONFIG_USB_ZERO=m -CONFIG_USB_AUDIO=m -# CONFIG_GADGET_UAC1 is not set -CONFIG_USB_ETH=m -CONFIG_USB_ETH_RNDIS=y -CONFIG_USB_ETH_EEM=y -CONFIG_USB_G_NCM=m -CONFIG_USB_GADGETFS=m -CONFIG_USB_FUNCTIONFS=m -CONFIG_USB_FUNCTIONFS_ETH=y -CONFIG_USB_FUNCTIONFS_RNDIS=y -CONFIG_USB_FUNCTIONFS_GENERIC=y -CONFIG_USB_MASS_STORAGE=m -CONFIG_USB_GADGET_TARGET=m -CONFIG_USB_G_SERIAL=m -CONFIG_USB_MIDI_GADGET=m -CONFIG_USB_G_PRINTER=m -CONFIG_USB_CDC_COMPOSITE=m -CONFIG_USB_G_NOKIA=m -CONFIG_USB_G_ACM_MS=m -CONFIG_USB_G_MULTI=m -CONFIG_USB_G_MULTI_RNDIS=y -CONFIG_USB_G_MULTI_CDC=y -CONFIG_USB_G_HID=m -CONFIG_USB_G_DBGP=m -# CONFIG_USB_G_DBGP_PRINTK is not set -CONFIG_USB_G_DBGP_SERIAL=y -CONFIG_USB_G_WEBCAM=m -CONFIG_USB_RAW_GADGET=m -# end of USB Gadget precomposed configurations - -CONFIG_TYPEC=m -CONFIG_TYPEC_TCPM=m -CONFIG_TYPEC_TCPCI=m -CONFIG_TYPEC_RT1711H=m -CONFIG_TYPEC_FUSB302=m -CONFIG_TYPEC_WCOVE=m -CONFIG_TYPEC_UCSI=m -CONFIG_UCSI_CCG=m -CONFIG_UCSI_ACPI=m -CONFIG_TYPEC_HD3SS3220=m -CONFIG_TYPEC_TPS6598X=m - -# -# USB Type-C Multiplexer/DeMultiplexer Switch support -# -CONFIG_TYPEC_MUX_PI3USB30532=m -CONFIG_TYPEC_MUX_INTEL_PMC=m -# end of USB Type-C Multiplexer/DeMultiplexer Switch support - -# -# USB Type-C Alternate Mode drivers -# -CONFIG_TYPEC_DP_ALTMODE=m -CONFIG_TYPEC_NVIDIA_ALTMODE=m -# end of USB Type-C Alternate Mode drivers - -CONFIG_USB_ROLE_SWITCH=m -CONFIG_USB_ROLES_INTEL_XHCI=m -CONFIG_MMC=m -CONFIG_PWRSEQ_EMMC=m -CONFIG_PWRSEQ_SD8787=m -CONFIG_PWRSEQ_SIMPLE=m -CONFIG_MMC_BLOCK=m 
-CONFIG_MMC_BLOCK_MINORS=8 -CONFIG_SDIO_UART=m -CONFIG_MMC_TEST=m - -# -# MMC/SD/SDIO Host Controller Drivers -# -# CONFIG_MMC_DEBUG is not set -CONFIG_MMC_SDHCI=m -CONFIG_MMC_SDHCI_IO_ACCESSORS=y -CONFIG_MMC_SDHCI_PCI=m -CONFIG_MMC_RICOH_MMC=y -CONFIG_MMC_SDHCI_ACPI=m -CONFIG_MMC_SDHCI_PLTFM=m -CONFIG_MMC_SDHCI_OF_ARASAN=m -CONFIG_MMC_SDHCI_OF_ASPEED=m -CONFIG_MMC_SDHCI_OF_AT91=m -CONFIG_MMC_SDHCI_OF_DWCMSHC=m -CONFIG_MMC_SDHCI_CADENCE=m -CONFIG_MMC_SDHCI_F_SDH30=m -CONFIG_MMC_SDHCI_MILBEAUT=m -CONFIG_MMC_WBSD=m -CONFIG_MMC_ALCOR=m -CONFIG_MMC_TIFM_SD=m -CONFIG_MMC_SPI=m -CONFIG_MMC_SDRICOH_CS=m -CONFIG_MMC_CB710=m -CONFIG_MMC_VIA_SDMMC=m -CONFIG_MMC_VUB300=m -CONFIG_MMC_USHC=m -CONFIG_MMC_USDHI6ROL0=m -CONFIG_MMC_REALTEK_PCI=m -CONFIG_MMC_REALTEK_USB=m -CONFIG_MMC_CQHCI=m -CONFIG_MMC_HSQ=m -CONFIG_MMC_TOSHIBA_PCI=m -CONFIG_MMC_MTK=m -CONFIG_MMC_SDHCI_XENON=m -CONFIG_MMC_SDHCI_OMAP=m -CONFIG_MMC_SDHCI_AM654=m -CONFIG_MMC_SDHCI_EXTERNAL_DMA=y -CONFIG_MEMSTICK=m -# CONFIG_MEMSTICK_DEBUG is not set - -# -# MemoryStick drivers -# -# CONFIG_MEMSTICK_UNSAFE_RESUME is not set -CONFIG_MSPRO_BLOCK=m -CONFIG_MS_BLOCK=m - -# -# MemoryStick Host Controller Drivers -# -CONFIG_MEMSTICK_TIFM_MS=m -CONFIG_MEMSTICK_JMICRON_38X=m -CONFIG_MEMSTICK_R592=m -CONFIG_MEMSTICK_REALTEK_PCI=m -CONFIG_MEMSTICK_REALTEK_USB=m -CONFIG_NEW_LEDS=y -CONFIG_LEDS_CLASS=y -CONFIG_LEDS_CLASS_FLASH=m -CONFIG_LEDS_BRIGHTNESS_HW_CHANGED=y - -# -# LED drivers -# -CONFIG_LEDS_88PM860X=m -CONFIG_LEDS_AAT1290=m -CONFIG_LEDS_AN30259A=m -CONFIG_LEDS_APU=m -CONFIG_LEDS_AS3645A=m -CONFIG_LEDS_BCM6328=m -CONFIG_LEDS_BCM6358=m -CONFIG_LEDS_CPCAP=m -CONFIG_LEDS_CR0014114=m -CONFIG_LEDS_EL15203000=m -CONFIG_LEDS_LM3530=m -CONFIG_LEDS_LM3532=m -CONFIG_LEDS_LM3533=m -CONFIG_LEDS_LM3642=m -CONFIG_LEDS_LM3692X=m -CONFIG_LEDS_LM3601X=m -CONFIG_LEDS_MT6323=m -CONFIG_LEDS_PCA9532=m -CONFIG_LEDS_PCA9532_GPIO=y -CONFIG_LEDS_GPIO=m -CONFIG_LEDS_LP3944=m -CONFIG_LEDS_LP3952=m -# CONFIG_LEDS_LP5521 is not set -# 
CONFIG_LEDS_LP5523 is not set -# CONFIG_LEDS_LP5562 is not set -# CONFIG_LEDS_LP8501 is not set -CONFIG_LEDS_LP8788=m -CONFIG_LEDS_LP8860=m -CONFIG_LEDS_CLEVO_MAIL=m -CONFIG_LEDS_PCA955X=m -CONFIG_LEDS_PCA955X_GPIO=y -CONFIG_LEDS_PCA963X=m -CONFIG_LEDS_WM831X_STATUS=m -CONFIG_LEDS_WM8350=m -CONFIG_LEDS_DA903X=m -CONFIG_LEDS_DA9052=m -CONFIG_LEDS_DAC124S085=m -CONFIG_LEDS_PWM=m -CONFIG_LEDS_REGULATOR=m -CONFIG_LEDS_BD2802=m -CONFIG_LEDS_INTEL_SS4200=m -CONFIG_LEDS_LT3593=m -CONFIG_LEDS_ADP5520=m -CONFIG_LEDS_MC13783=m -CONFIG_LEDS_TCA6507=m -CONFIG_LEDS_TLC591XX=m -CONFIG_LEDS_MAX77650=m -CONFIG_LEDS_MAX77693=m -CONFIG_LEDS_MAX8997=m -CONFIG_LEDS_LM355x=m -CONFIG_LEDS_MENF21BMC=m -CONFIG_LEDS_KTD2692=m -CONFIG_LEDS_IS31FL319X=m -CONFIG_LEDS_IS31FL32XX=m - -# -# LED driver for blink(1) USB RGB LED is under Special HID drivers (HID_THINGM) -# -CONFIG_LEDS_BLINKM=m -CONFIG_LEDS_SYSCON=y -CONFIG_LEDS_MLXCPLD=m -CONFIG_LEDS_MLXREG=m -CONFIG_LEDS_USER=m -CONFIG_LEDS_NIC78BX=m -CONFIG_LEDS_SPI_BYTE=m -CONFIG_LEDS_TI_LMU_COMMON=m -CONFIG_LEDS_LM3697=m -CONFIG_LEDS_LM36274=m -CONFIG_LEDS_TPS6105X=m - -# -# LED Triggers -# -CONFIG_LEDS_TRIGGERS=y -CONFIG_LEDS_TRIGGER_TIMER=m -CONFIG_LEDS_TRIGGER_ONESHOT=m -CONFIG_LEDS_TRIGGER_DISK=y -CONFIG_LEDS_TRIGGER_MTD=y -CONFIG_LEDS_TRIGGER_HEARTBEAT=m -CONFIG_LEDS_TRIGGER_BACKLIGHT=m -CONFIG_LEDS_TRIGGER_CPU=y -CONFIG_LEDS_TRIGGER_ACTIVITY=m -CONFIG_LEDS_TRIGGER_GPIO=m -CONFIG_LEDS_TRIGGER_DEFAULT_ON=m - -# -# iptables trigger is under Netfilter config (LED target) -# -CONFIG_LEDS_TRIGGER_TRANSIENT=m -CONFIG_LEDS_TRIGGER_CAMERA=m -CONFIG_LEDS_TRIGGER_PANIC=y -CONFIG_LEDS_TRIGGER_NETDEV=m -CONFIG_LEDS_TRIGGER_PATTERN=m -CONFIG_LEDS_TRIGGER_AUDIO=m -CONFIG_ACCESSIBILITY=y -CONFIG_A11Y_BRAILLE_CONSOLE=y -CONFIG_INFINIBAND=m -CONFIG_INFINIBAND_USER_MAD=m -CONFIG_INFINIBAND_USER_ACCESS=m -# CONFIG_INFINIBAND_EXP_LEGACY_VERBS_NEW_UAPI is not set -CONFIG_INFINIBAND_USER_MEM=y -CONFIG_INFINIBAND_ON_DEMAND_PAGING=y 
-CONFIG_INFINIBAND_ADDR_TRANS=y -CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS=y -CONFIG_INFINIBAND_MTHCA=m -CONFIG_INFINIBAND_MTHCA_DEBUG=y -CONFIG_INFINIBAND_QIB=m -CONFIG_INFINIBAND_QIB_DCA=y -CONFIG_INFINIBAND_CXGB4=m -CONFIG_INFINIBAND_EFA=m -CONFIG_INFINIBAND_I40IW=m -CONFIG_MLX4_INFINIBAND=m -CONFIG_MLX5_INFINIBAND=m -CONFIG_INFINIBAND_OCRDMA=m -CONFIG_INFINIBAND_VMWARE_PVRDMA=m -CONFIG_INFINIBAND_USNIC=m -CONFIG_INFINIBAND_BNXT_RE=m -CONFIG_INFINIBAND_HFI1=m -# CONFIG_HFI1_DEBUG_SDMA_ORDER is not set -# CONFIG_SDMA_VERBOSITY is not set -CONFIG_INFINIBAND_QEDR=m -CONFIG_INFINIBAND_RDMAVT=m -CONFIG_RDMA_RXE=m -CONFIG_RDMA_SIW=m -CONFIG_INFINIBAND_IPOIB=m -CONFIG_INFINIBAND_IPOIB_CM=y -CONFIG_INFINIBAND_IPOIB_DEBUG=y -# CONFIG_INFINIBAND_IPOIB_DEBUG_DATA is not set -CONFIG_INFINIBAND_SRP=m -CONFIG_INFINIBAND_SRPT=m -CONFIG_INFINIBAND_ISER=m -CONFIG_INFINIBAND_ISERT=m -CONFIG_INFINIBAND_OPA_VNIC=m -CONFIG_EDAC_ATOMIC_SCRUB=y -CONFIG_EDAC_SUPPORT=y -CONFIG_EDAC=y -CONFIG_EDAC_LEGACY_SYSFS=y -# CONFIG_EDAC_DEBUG is not set -CONFIG_EDAC_DECODE_MCE=m -CONFIG_EDAC_GHES=y -CONFIG_EDAC_AMD64=m -# CONFIG_EDAC_AMD64_ERROR_INJECTION is not set -CONFIG_EDAC_E752X=m -CONFIG_EDAC_I82975X=m -CONFIG_EDAC_I3000=m -CONFIG_EDAC_I3200=m -CONFIG_EDAC_IE31200=m -CONFIG_EDAC_X38=m -CONFIG_EDAC_I5400=m -CONFIG_EDAC_I7CORE=m -CONFIG_EDAC_I5000=m -CONFIG_EDAC_I5100=m -CONFIG_EDAC_I7300=m -CONFIG_EDAC_SBRIDGE=m -CONFIG_EDAC_SKX=m -CONFIG_EDAC_I10NM=m -CONFIG_EDAC_PND2=m -CONFIG_RTC_LIB=y -CONFIG_RTC_MC146818_LIB=y -CONFIG_RTC_CLASS=y -CONFIG_RTC_HCTOSYS=y -CONFIG_RTC_HCTOSYS_DEVICE="rtc0" -CONFIG_RTC_SYSTOHC=y -CONFIG_RTC_SYSTOHC_DEVICE="rtc0" -# CONFIG_RTC_DEBUG is not set -CONFIG_RTC_NVMEM=y - -# -# RTC interfaces -# -CONFIG_RTC_INTF_SYSFS=y -CONFIG_RTC_INTF_PROC=y -CONFIG_RTC_INTF_DEV=y -CONFIG_RTC_INTF_DEV_UIE_EMUL=y -# CONFIG_RTC_DRV_TEST is not set - -# -# I2C RTC drivers -# -CONFIG_RTC_DRV_88PM860X=m -CONFIG_RTC_DRV_88PM80X=m -CONFIG_RTC_DRV_ABB5ZES3=m -CONFIG_RTC_DRV_ABEOZ9=m 
-CONFIG_RTC_DRV_ABX80X=m -CONFIG_RTC_DRV_AS3722=m -CONFIG_RTC_DRV_DS1307=m -CONFIG_RTC_DRV_DS1307_CENTURY=y -CONFIG_RTC_DRV_DS1374=m -CONFIG_RTC_DRV_DS1374_WDT=y -CONFIG_RTC_DRV_DS1672=m -CONFIG_RTC_DRV_HYM8563=m -CONFIG_RTC_DRV_LP8788=m -CONFIG_RTC_DRV_MAX6900=m -CONFIG_RTC_DRV_MAX8907=m -CONFIG_RTC_DRV_MAX8925=m -CONFIG_RTC_DRV_MAX8998=m -CONFIG_RTC_DRV_MAX8997=m -CONFIG_RTC_DRV_MAX77686=m -CONFIG_RTC_DRV_RK808=m -CONFIG_RTC_DRV_RS5C372=m -CONFIG_RTC_DRV_ISL1208=m -CONFIG_RTC_DRV_ISL12022=m -CONFIG_RTC_DRV_ISL12026=m -CONFIG_RTC_DRV_X1205=m -CONFIG_RTC_DRV_PCF8523=m -CONFIG_RTC_DRV_PCF85063=m -CONFIG_RTC_DRV_PCF85363=m -CONFIG_RTC_DRV_PCF8563=m -CONFIG_RTC_DRV_PCF8583=m -CONFIG_RTC_DRV_M41T80=m -CONFIG_RTC_DRV_M41T80_WDT=y -CONFIG_RTC_DRV_BD70528=m -CONFIG_RTC_DRV_BQ32K=m -CONFIG_RTC_DRV_TWL4030=m -CONFIG_RTC_DRV_PALMAS=m -CONFIG_RTC_DRV_TPS6586X=m -CONFIG_RTC_DRV_TPS65910=m -CONFIG_RTC_DRV_TPS80031=m -CONFIG_RTC_DRV_RC5T583=m -CONFIG_RTC_DRV_RC5T619=m -CONFIG_RTC_DRV_S35390A=m -CONFIG_RTC_DRV_FM3130=m -CONFIG_RTC_DRV_RX8010=m -CONFIG_RTC_DRV_RX8581=m -CONFIG_RTC_DRV_RX8025=m -CONFIG_RTC_DRV_EM3027=m -CONFIG_RTC_DRV_RV3028=m -CONFIG_RTC_DRV_RV8803=m -CONFIG_RTC_DRV_S5M=m -CONFIG_RTC_DRV_SD3078=m - -# -# SPI RTC drivers -# -CONFIG_RTC_DRV_M41T93=m -CONFIG_RTC_DRV_M41T94=m -CONFIG_RTC_DRV_DS1302=m -CONFIG_RTC_DRV_DS1305=m -CONFIG_RTC_DRV_DS1343=m -CONFIG_RTC_DRV_DS1347=m -CONFIG_RTC_DRV_DS1390=m -CONFIG_RTC_DRV_MAX6916=m -CONFIG_RTC_DRV_R9701=m -CONFIG_RTC_DRV_RX4581=m -CONFIG_RTC_DRV_RX6110=m -CONFIG_RTC_DRV_RS5C348=m -CONFIG_RTC_DRV_MAX6902=m -CONFIG_RTC_DRV_PCF2123=m -CONFIG_RTC_DRV_MCP795=m -CONFIG_RTC_I2C_AND_SPI=y - -# -# SPI and I2C RTC drivers -# -CONFIG_RTC_DRV_DS3232=m -CONFIG_RTC_DRV_DS3232_HWMON=y -CONFIG_RTC_DRV_PCF2127=m -CONFIG_RTC_DRV_RV3029C2=m -CONFIG_RTC_DRV_RV3029_HWMON=y - -# -# Platform RTC drivers -# -CONFIG_RTC_DRV_CMOS=y -CONFIG_RTC_DRV_DS1286=m -CONFIG_RTC_DRV_DS1511=m -CONFIG_RTC_DRV_DS1553=m -CONFIG_RTC_DRV_DS1685_FAMILY=m 
-CONFIG_RTC_DRV_DS1685=y -# CONFIG_RTC_DRV_DS1689 is not set -# CONFIG_RTC_DRV_DS17285 is not set -# CONFIG_RTC_DRV_DS17485 is not set -# CONFIG_RTC_DRV_DS17885 is not set -CONFIG_RTC_DRV_DS1742=m -CONFIG_RTC_DRV_DS2404=m -CONFIG_RTC_DRV_DA9052=m -CONFIG_RTC_DRV_DA9055=m -CONFIG_RTC_DRV_DA9063=m -CONFIG_RTC_DRV_STK17TA8=m -CONFIG_RTC_DRV_M48T86=m -CONFIG_RTC_DRV_M48T35=m -CONFIG_RTC_DRV_M48T59=m -CONFIG_RTC_DRV_MSM6242=m -CONFIG_RTC_DRV_BQ4802=m -CONFIG_RTC_DRV_RP5C01=m -CONFIG_RTC_DRV_V3020=m -CONFIG_RTC_DRV_WM831X=m -CONFIG_RTC_DRV_WM8350=m -CONFIG_RTC_DRV_PCF50633=m -CONFIG_RTC_DRV_AB3100=m -CONFIG_RTC_DRV_ZYNQMP=m -CONFIG_RTC_DRV_CROS_EC=m - -# -# on-CPU RTC drivers -# -CONFIG_RTC_DRV_CADENCE=m -CONFIG_RTC_DRV_FTRTC010=m -CONFIG_RTC_DRV_PCAP=m -CONFIG_RTC_DRV_MC13XXX=m -CONFIG_RTC_DRV_MT6397=m -CONFIG_RTC_DRV_R7301=m -CONFIG_RTC_DRV_CPCAP=m - -# -# HID Sensor RTC drivers -# -CONFIG_RTC_DRV_HID_SENSOR_TIME=m -CONFIG_RTC_DRV_WILCO_EC=m -CONFIG_DMADEVICES=y -# CONFIG_DMADEVICES_DEBUG is not set - -# -# DMA Devices -# -CONFIG_DMA_ENGINE=y -CONFIG_DMA_VIRTUAL_CHANNELS=y -CONFIG_DMA_ACPI=y -CONFIG_DMA_OF=y -CONFIG_ALTERA_MSGDMA=m -CONFIG_DW_AXI_DMAC=m -CONFIG_FSL_EDMA=m -CONFIG_INTEL_IDMA64=m -CONFIG_INTEL_IDXD=m -CONFIG_INTEL_IOATDMA=m -CONFIG_INTEL_MIC_X100_DMA=m -CONFIG_PLX_DMA=m -CONFIG_QCOM_HIDMA_MGMT=m -CONFIG_QCOM_HIDMA=m -CONFIG_DW_DMAC_CORE=y -CONFIG_DW_DMAC=y -CONFIG_DW_DMAC_PCI=y -CONFIG_DW_EDMA=m -CONFIG_DW_EDMA_PCIE=m -CONFIG_HSU_DMA=y -CONFIG_SF_PDMA=m - -# -# DMA Clients -# -CONFIG_ASYNC_TX_DMA=y -# CONFIG_DMATEST is not set -CONFIG_DMA_ENGINE_RAID=y - -# -# DMABUF options -# -CONFIG_SYNC_FILE=y -# CONFIG_SW_SYNC is not set -CONFIG_UDMABUF=y -# CONFIG_DMABUF_MOVE_NOTIFY is not set -# CONFIG_DMABUF_SELFTESTS is not set -CONFIG_DMABUF_HEAPS=y -CONFIG_DMABUF_HEAPS_SYSTEM=y -# end of DMABUF options - -CONFIG_DCA=m -CONFIG_AUXDISPLAY=y -CONFIG_HD44780=m -CONFIG_KS0108=m -CONFIG_KS0108_PORT=0x378 -CONFIG_KS0108_DELAY=2 -CONFIG_CFAG12864B=m 
-CONFIG_CFAG12864B_RATE=20 -CONFIG_IMG_ASCII_LCD=m -CONFIG_HT16K33=m -CONFIG_PARPORT_PANEL=m -CONFIG_PANEL_PARPORT=0 -CONFIG_PANEL_PROFILE=5 -# CONFIG_PANEL_CHANGE_MESSAGE is not set -# CONFIG_CHARLCD_BL_OFF is not set -# CONFIG_CHARLCD_BL_ON is not set -CONFIG_CHARLCD_BL_FLASH=y -CONFIG_PANEL=m -CONFIG_CHARLCD=m -CONFIG_UIO=m -CONFIG_UIO_CIF=m -CONFIG_UIO_PDRV_GENIRQ=m -CONFIG_UIO_DMEM_GENIRQ=m -CONFIG_UIO_AEC=m -CONFIG_UIO_SERCOS3=m -CONFIG_UIO_PCI_GENERIC=m -CONFIG_UIO_NETX=m -CONFIG_UIO_PRUSS=m -CONFIG_UIO_MF624=m -CONFIG_UIO_HV_GENERIC=m -CONFIG_VFIO_IOMMU_TYPE1=m -CONFIG_VFIO_VIRQFD=m -CONFIG_VFIO=m -# CONFIG_VFIO_NOIOMMU is not set -CONFIG_VFIO_PCI=m -CONFIG_VFIO_PCI_VGA=y -CONFIG_VFIO_PCI_MMAP=y -CONFIG_VFIO_PCI_INTX=y -CONFIG_VFIO_PCI_IGD=y -CONFIG_VFIO_MDEV=m -CONFIG_VFIO_MDEV_DEVICE=m -CONFIG_IRQ_BYPASS_MANAGER=m -CONFIG_VIRT_DRIVERS=y -CONFIG_VBOXGUEST=m -CONFIG_VIRTIO=y -CONFIG_VIRTIO_MENU=y -CONFIG_VIRTIO_PCI=m -CONFIG_VIRTIO_PCI_LEGACY=y -CONFIG_VIRTIO_VDPA=m -CONFIG_VIRTIO_PMEM=m -CONFIG_VIRTIO_BALLOON=m -CONFIG_VIRTIO_INPUT=m -CONFIG_VIRTIO_MMIO=m -CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES=y -CONFIG_VDPA=m -CONFIG_VDPA_SIM=m -CONFIG_IFCVF=m -CONFIG_VHOST_IOTLB=m -CONFIG_VHOST_RING=m -CONFIG_VHOST_DPN=y -CONFIG_VHOST=m -CONFIG_VHOST_MENU=y -CONFIG_VHOST_NET=m -CONFIG_VHOST_SCSI=m -CONFIG_VHOST_VSOCK=m -CONFIG_VHOST_VDPA=m -# CONFIG_VHOST_CROSS_ENDIAN_LEGACY is not set - -# -# Microsoft Hyper-V guest support -# -CONFIG_HYPERV=m -CONFIG_HYPERV_TIMER=y -CONFIG_HYPERV_UTILS=m -CONFIG_HYPERV_BALLOON=m -# end of Microsoft Hyper-V guest support - -# -# Xen driver support -# -CONFIG_XEN_BALLOON=y -CONFIG_XEN_BALLOON_MEMORY_HOTPLUG=y -CONFIG_XEN_BALLOON_MEMORY_HOTPLUG_LIMIT=512 -CONFIG_XEN_SCRUB_PAGES_DEFAULT=y -CONFIG_XEN_DEV_EVTCHN=m -CONFIG_XEN_BACKEND=y -CONFIG_XENFS=m -CONFIG_XEN_COMPAT_XENFS=y -CONFIG_XEN_SYS_HYPERVISOR=y -CONFIG_XEN_XENBUS_FRONTEND=y -CONFIG_XEN_GNTDEV=m -CONFIG_XEN_GNTDEV_DMABUF=y -CONFIG_XEN_GRANT_DEV_ALLOC=m -CONFIG_XEN_GRANT_DMA_ALLOC=y 
-CONFIG_SWIOTLB_XEN=y -CONFIG_XEN_PCIDEV_BACKEND=m -CONFIG_XEN_PVCALLS_FRONTEND=m -CONFIG_XEN_PVCALLS_BACKEND=y -CONFIG_XEN_SCSI_BACKEND=m -CONFIG_XEN_PRIVCMD=m -CONFIG_XEN_ACPI_PROCESSOR=m -CONFIG_XEN_MCE_LOG=y -CONFIG_XEN_HAVE_PVMMU=y -CONFIG_XEN_EFI=y -CONFIG_XEN_AUTO_XLATE=y -CONFIG_XEN_ACPI=y -CONFIG_XEN_SYMS=y -CONFIG_XEN_HAVE_VPMU=y -CONFIG_XEN_FRONT_PGDIR_SHBUF=m -# end of Xen driver support - -# CONFIG_GREYBUS is not set -CONFIG_STAGING=y -CONFIG_PRISM2_USB=m -CONFIG_COMEDI=m -# CONFIG_COMEDI_DEBUG is not set -CONFIG_COMEDI_DEFAULT_BUF_SIZE_KB=2048 -CONFIG_COMEDI_DEFAULT_BUF_MAXSIZE_KB=20480 -CONFIG_COMEDI_MISC_DRIVERS=y -CONFIG_COMEDI_BOND=m -CONFIG_COMEDI_TEST=m -CONFIG_COMEDI_PARPORT=m -# CONFIG_COMEDI_ISA_DRIVERS is not set -CONFIG_COMEDI_PCI_DRIVERS=m -CONFIG_COMEDI_8255_PCI=m -CONFIG_COMEDI_ADDI_WATCHDOG=m -CONFIG_COMEDI_ADDI_APCI_1032=m -CONFIG_COMEDI_ADDI_APCI_1500=m -CONFIG_COMEDI_ADDI_APCI_1516=m -CONFIG_COMEDI_ADDI_APCI_1564=m -CONFIG_COMEDI_ADDI_APCI_16XX=m -CONFIG_COMEDI_ADDI_APCI_2032=m -CONFIG_COMEDI_ADDI_APCI_2200=m -CONFIG_COMEDI_ADDI_APCI_3120=m -CONFIG_COMEDI_ADDI_APCI_3501=m -CONFIG_COMEDI_ADDI_APCI_3XXX=m -CONFIG_COMEDI_ADL_PCI6208=m -CONFIG_COMEDI_ADL_PCI7X3X=m -CONFIG_COMEDI_ADL_PCI8164=m -CONFIG_COMEDI_ADL_PCI9111=m -CONFIG_COMEDI_ADL_PCI9118=m -CONFIG_COMEDI_ADV_PCI1710=m -CONFIG_COMEDI_ADV_PCI1720=m -CONFIG_COMEDI_ADV_PCI1723=m -CONFIG_COMEDI_ADV_PCI1724=m -CONFIG_COMEDI_ADV_PCI1760=m -CONFIG_COMEDI_ADV_PCI_DIO=m -CONFIG_COMEDI_AMPLC_DIO200_PCI=m -CONFIG_COMEDI_AMPLC_PC236_PCI=m -CONFIG_COMEDI_AMPLC_PC263_PCI=m -CONFIG_COMEDI_AMPLC_PCI224=m -CONFIG_COMEDI_AMPLC_PCI230=m -CONFIG_COMEDI_CONTEC_PCI_DIO=m -CONFIG_COMEDI_DAS08_PCI=m -CONFIG_COMEDI_DT3000=m -CONFIG_COMEDI_DYNA_PCI10XX=m -CONFIG_COMEDI_GSC_HPDI=m -CONFIG_COMEDI_MF6X4=m -CONFIG_COMEDI_ICP_MULTI=m -CONFIG_COMEDI_DAQBOARD2000=m -CONFIG_COMEDI_JR3_PCI=m -CONFIG_COMEDI_KE_COUNTER=m -CONFIG_COMEDI_CB_PCIDAS64=m -CONFIG_COMEDI_CB_PCIDAS=m -CONFIG_COMEDI_CB_PCIDDA=m 
-CONFIG_COMEDI_CB_PCIMDAS=m -CONFIG_COMEDI_CB_PCIMDDA=m -CONFIG_COMEDI_ME4000=m -CONFIG_COMEDI_ME_DAQ=m -CONFIG_COMEDI_NI_6527=m -CONFIG_COMEDI_NI_65XX=m -CONFIG_COMEDI_NI_660X=m -CONFIG_COMEDI_NI_670X=m -CONFIG_COMEDI_NI_LABPC_PCI=m -CONFIG_COMEDI_NI_PCIDIO=m -CONFIG_COMEDI_NI_PCIMIO=m -CONFIG_COMEDI_RTD520=m -CONFIG_COMEDI_S626=m -CONFIG_COMEDI_MITE=m -CONFIG_COMEDI_NI_TIOCMD=m -CONFIG_COMEDI_PCMCIA_DRIVERS=m -CONFIG_COMEDI_CB_DAS16_CS=m -CONFIG_COMEDI_DAS08_CS=m -CONFIG_COMEDI_NI_DAQ_700_CS=m -CONFIG_COMEDI_NI_DAQ_DIO24_CS=m -CONFIG_COMEDI_NI_LABPC_CS=m -CONFIG_COMEDI_NI_MIO_CS=m -CONFIG_COMEDI_QUATECH_DAQP_CS=m -CONFIG_COMEDI_USB_DRIVERS=m -CONFIG_COMEDI_DT9812=m -CONFIG_COMEDI_NI_USB6501=m -CONFIG_COMEDI_USBDUX=m -CONFIG_COMEDI_USBDUXFAST=m -CONFIG_COMEDI_USBDUXSIGMA=m -CONFIG_COMEDI_VMK80XX=m -CONFIG_COMEDI_8254=m -CONFIG_COMEDI_8255=m -CONFIG_COMEDI_8255_SA=m -CONFIG_COMEDI_KCOMEDILIB=m -CONFIG_COMEDI_AMPLC_DIO200=m -CONFIG_COMEDI_AMPLC_PC236=m -CONFIG_COMEDI_DAS08=m -CONFIG_COMEDI_NI_LABPC=m -CONFIG_COMEDI_NI_TIO=m -CONFIG_COMEDI_NI_ROUTING=m -CONFIG_RTL8192U=m -CONFIG_RTLLIB=m -CONFIG_RTLLIB_CRYPTO_CCMP=m -CONFIG_RTLLIB_CRYPTO_TKIP=m -CONFIG_RTLLIB_CRYPTO_WEP=m -CONFIG_RTL8192E=m -CONFIG_RTL8723BS=m -CONFIG_R8712U=m -CONFIG_R8188EU=m -CONFIG_88EU_AP_MODE=y -CONFIG_RTS5208=m -CONFIG_VT6655=m -CONFIG_VT6656=m - -# -# IIO staging drivers -# - -# -# Accelerometers -# -CONFIG_ADIS16203=m -CONFIG_ADIS16240=m -# end of Accelerometers - -# -# Analog to digital converters -# -CONFIG_AD7816=m -CONFIG_AD7280=m -# end of Analog to digital converters - -# -# Analog digital bi-direction converters -# -CONFIG_ADT7316=m -CONFIG_ADT7316_SPI=m -CONFIG_ADT7316_I2C=m -# end of Analog digital bi-direction converters - -# -# Capacitance to digital converters -# -CONFIG_AD7150=m -CONFIG_AD7746=m -# end of Capacitance to digital converters - -# -# Direct Digital Synthesis -# -CONFIG_AD9832=m -CONFIG_AD9834=m -# end of Direct Digital Synthesis - -# -# Network Analyzer, Impedance 
Converters -# -CONFIG_AD5933=m -# end of Network Analyzer, Impedance Converters - -# -# Active energy metering IC -# -CONFIG_ADE7854=m -CONFIG_ADE7854_I2C=m -CONFIG_ADE7854_SPI=m -# end of Active energy metering IC - -# -# Resolver to digital converters -# -CONFIG_AD2S1210=m -# end of Resolver to digital converters -# end of IIO staging drivers - -# CONFIG_FB_SM750 is not set - -# -# Speakup console speech -# -CONFIG_SPEAKUP=m -CONFIG_SPEAKUP_SYNTH_ACNTSA=m -CONFIG_SPEAKUP_SYNTH_APOLLO=m -CONFIG_SPEAKUP_SYNTH_AUDPTR=m -CONFIG_SPEAKUP_SYNTH_BNS=m -CONFIG_SPEAKUP_SYNTH_DECTLK=m -CONFIG_SPEAKUP_SYNTH_DECEXT=m -CONFIG_SPEAKUP_SYNTH_LTLK=m -CONFIG_SPEAKUP_SYNTH_SOFT=m -CONFIG_SPEAKUP_SYNTH_SPKOUT=m -CONFIG_SPEAKUP_SYNTH_TXPRT=m -CONFIG_SPEAKUP_SYNTH_DUMMY=m -# end of Speakup console speech - -CONFIG_STAGING_MEDIA=y -CONFIG_VIDEO_IPU3_IMGU=m - -# -# soc_camera sensor drivers -# -CONFIG_VIDEO_USBVISION=m - -# -# Android -# -# end of Android - -CONFIG_STAGING_BOARD=y -CONFIG_LTE_GDM724X=m -CONFIG_FIREWIRE_SERIAL=m -CONFIG_FWTTY_MAX_TOTAL_PORTS=64 -CONFIG_FWTTY_MAX_CARD_PORTS=32 -CONFIG_GS_FPGABOOT=m -CONFIG_UNISYSSPAR=y -CONFIG_UNISYS_VISORNIC=m -CONFIG_UNISYS_VISORINPUT=m -CONFIG_UNISYS_VISORHBA=m -CONFIG_COMMON_CLK_XLNX_CLKWZRD=m -# CONFIG_FB_TFT is not set -CONFIG_WILC1000=m -CONFIG_WILC1000_SDIO=m -CONFIG_WILC1000_SPI=m -# CONFIG_WILC1000_HW_OOB_INTR is not set -CONFIG_MOST_COMPONENTS=m -CONFIG_MOST_CDEV=m -CONFIG_MOST_NET=m -CONFIG_MOST_SOUND=m -CONFIG_MOST_VIDEO=m -CONFIG_MOST_DIM2=m -CONFIG_MOST_I2C=m -CONFIG_MOST_USB=m -CONFIG_KS7010=m -CONFIG_PI433=m - -# -# Gasket devices -# -CONFIG_STAGING_GASKET_FRAMEWORK=m -CONFIG_STAGING_APEX_DRIVER=m -# end of Gasket devices - -CONFIG_XIL_AXIS_FIFO=m -CONFIG_FIELDBUS_DEV=m -CONFIG_HMS_ANYBUSS_BUS=m -CONFIG_ARCX_ANYBUS_CONTROLLER=m -CONFIG_HMS_PROFINET=m -CONFIG_KPC2000=y -CONFIG_KPC2000_CORE=m -CONFIG_KPC2000_SPI=m -CONFIG_KPC2000_I2C=m -CONFIG_KPC2000_DMA=m -CONFIG_QLGE=m -CONFIG_WFX=m -CONFIG_X86_PLATFORM_DEVICES=y 
-CONFIG_ACPI_WMI=m -CONFIG_WMI_BMOF=m -CONFIG_ALIENWARE_WMI=m -CONFIG_HUAWEI_WMI=m -CONFIG_INTEL_WMI_THUNDERBOLT=m -CONFIG_MXM_WMI=m -CONFIG_PEAQ_WMI=m -CONFIG_XIAOMI_WMI=m -CONFIG_ACERHDF=m -CONFIG_ACER_WIRELESS=m -CONFIG_ACER_WMI=m -CONFIG_APPLE_GMUX=m -CONFIG_ASUS_LAPTOP=m -CONFIG_ASUS_WIRELESS=m -CONFIG_ASUS_WMI=m -CONFIG_ASUS_NB_WMI=m -CONFIG_EEEPC_LAPTOP=m -CONFIG_EEEPC_WMI=m -CONFIG_DCDBAS=m -CONFIG_DELL_SMBIOS=m -CONFIG_DELL_SMBIOS_WMI=y -CONFIG_DELL_SMBIOS_SMM=y -CONFIG_DELL_LAPTOP=m -CONFIG_DELL_RBTN=m -# CONFIG_DELL_RBU is not set -CONFIG_DELL_SMO8800=m -CONFIG_DELL_WMI=m -CONFIG_DELL_WMI_DESCRIPTOR=m -CONFIG_DELL_WMI_AIO=m -CONFIG_DELL_WMI_LED=m -CONFIG_AMILO_RFKILL=m -CONFIG_FUJITSU_LAPTOP=m -CONFIG_FUJITSU_TABLET=m -CONFIG_GPD_POCKET_FAN=m -CONFIG_HP_ACCEL=m -CONFIG_HP_WIRELESS=m -CONFIG_HP_WMI=m -CONFIG_IBM_RTL=m -CONFIG_IDEAPAD_LAPTOP=m -CONFIG_SENSORS_HDAPS=m -CONFIG_THINKPAD_ACPI=m -CONFIG_THINKPAD_ACPI_ALSA_SUPPORT=y -# CONFIG_THINKPAD_ACPI_DEBUGFACILITIES is not set -# CONFIG_THINKPAD_ACPI_DEBUG is not set -# CONFIG_THINKPAD_ACPI_UNSAFE_LEDS is not set -CONFIG_THINKPAD_ACPI_VIDEO=y -CONFIG_THINKPAD_ACPI_HOTKEY_POLL=y -CONFIG_INTEL_ATOMISP2_PM=m -CONFIG_INTEL_CHT_INT33FE=m -CONFIG_INTEL_HID_EVENT=m -CONFIG_INTEL_INT0002_VGPIO=m -CONFIG_INTEL_MENLOW=m -CONFIG_INTEL_OAKTRAIL=m -CONFIG_INTEL_VBTN=m -CONFIG_SURFACE3_WMI=m -CONFIG_SURFACE_3_BUTTON=m -CONFIG_SURFACE_3_POWER_OPREGION=m -CONFIG_SURFACE_PRO3_BUTTON=m -CONFIG_MSI_LAPTOP=m -CONFIG_MSI_WMI=m -CONFIG_PCENGINES_APU2=m -CONFIG_SAMSUNG_LAPTOP=m -CONFIG_SAMSUNG_Q10=m -CONFIG_ACPI_TOSHIBA=m -CONFIG_TOSHIBA_BT_RFKILL=m -CONFIG_TOSHIBA_HAPS=m -CONFIG_TOSHIBA_WMI=m -CONFIG_ACPI_CMPC=m -CONFIG_COMPAL_LAPTOP=m -CONFIG_LG_LAPTOP=m -CONFIG_PANASONIC_LAPTOP=m -CONFIG_SONY_LAPTOP=m -CONFIG_SONYPI_COMPAT=y -CONFIG_SYSTEM76_ACPI=m -CONFIG_TOPSTAR_LAPTOP=m -CONFIG_I2C_MULTI_INSTANTIATE=m -CONFIG_MLX_PLATFORM=m -CONFIG_TOUCHSCREEN_DMI=y -CONFIG_INTEL_IPS=m -CONFIG_INTEL_RST=m -CONFIG_INTEL_SMARTCONNECT=m - -# 
-# Intel Speed Select Technology interface support -# -CONFIG_INTEL_SPEED_SELECT_INTERFACE=m -# end of Intel Speed Select Technology interface support - -CONFIG_INTEL_TURBO_MAX_3=y -CONFIG_INTEL_UNCORE_FREQ_CONTROL=m -CONFIG_INTEL_BXTWC_PMIC_TMU=m -CONFIG_INTEL_CHTDC_TI_PWRBTN=m -CONFIG_INTEL_PMC_CORE=y -CONFIG_INTEL_PMC_IPC=m -CONFIG_INTEL_PUNIT_IPC=m -CONFIG_INTEL_TELEMETRY=m -CONFIG_PMC_ATOM=y -CONFIG_MFD_CROS_EC=m -CONFIG_CHROME_PLATFORMS=y -CONFIG_CHROMEOS_LAPTOP=m -CONFIG_CHROMEOS_PSTORE=m -CONFIG_CHROMEOS_TBMC=m -CONFIG_CROS_EC=m -CONFIG_CROS_EC_I2C=m -CONFIG_CROS_EC_RPMSG=m -CONFIG_CROS_EC_ISHTP=m -CONFIG_CROS_EC_SPI=m -CONFIG_CROS_EC_LPC=m -CONFIG_CROS_EC_PROTO=y -CONFIG_CROS_KBD_LED_BACKLIGHT=m -CONFIG_CROS_EC_CHARDEV=m -CONFIG_CROS_EC_LIGHTBAR=m -CONFIG_CROS_EC_VBC=m -CONFIG_CROS_EC_DEBUGFS=m -CONFIG_CROS_EC_SENSORHUB=m -CONFIG_CROS_EC_SYSFS=m -CONFIG_CROS_EC_TYPEC=m -CONFIG_CROS_USBPD_LOGGER=m -CONFIG_CROS_USBPD_NOTIFY=m -CONFIG_WILCO_EC=m -CONFIG_WILCO_EC_DEBUGFS=m -CONFIG_WILCO_EC_EVENTS=m -CONFIG_WILCO_EC_TELEMETRY=m -CONFIG_MELLANOX_PLATFORM=y -CONFIG_MLXREG_HOTPLUG=m -CONFIG_MLXREG_IO=m -CONFIG_CLKDEV_LOOKUP=y -CONFIG_HAVE_CLK_PREPARE=y -CONFIG_COMMON_CLK=y - -# -# Common Clock Framework -# -CONFIG_COMMON_CLK_WM831X=m -CONFIG_CLK_HSDK=y -CONFIG_COMMON_CLK_MAX77686=m -CONFIG_COMMON_CLK_MAX9485=m -CONFIG_COMMON_CLK_RK808=m -CONFIG_COMMON_CLK_SI5341=m -CONFIG_COMMON_CLK_SI5351=m -CONFIG_COMMON_CLK_SI514=m -CONFIG_COMMON_CLK_SI544=m -CONFIG_COMMON_CLK_SI570=m -CONFIG_COMMON_CLK_CDCE706=m -CONFIG_COMMON_CLK_CDCE925=m -CONFIG_COMMON_CLK_CS2000_CP=m -CONFIG_COMMON_CLK_S2MPS11=m -CONFIG_CLK_TWL6040=m -CONFIG_COMMON_CLK_LOCHNAGAR=m -CONFIG_COMMON_CLK_PALMAS=m -CONFIG_COMMON_CLK_PWM=m -CONFIG_COMMON_CLK_VC5=m -CONFIG_COMMON_CLK_BD718XX=m -CONFIG_COMMON_CLK_FIXED_MMIO=y -# end of Common Clock Framework - -CONFIG_HWSPINLOCK=y - -# -# Clock Source drivers -# -CONFIG_TIMER_OF=y -CONFIG_TIMER_PROBE=y -CONFIG_CLKEVT_I8253=y -CONFIG_I8253_LOCK=y 
-CONFIG_CLKBLD_I8253=y -CONFIG_CLKSRC_MMIO=y -CONFIG_MICROCHIP_PIT64B=y -# end of Clock Source drivers - -CONFIG_MAILBOX=y -CONFIG_PLATFORM_MHU=m -CONFIG_PCC=y -CONFIG_ALTERA_MBOX=m -CONFIG_MAILBOX_TEST=m -CONFIG_IOMMU_IOVA=y -CONFIG_IOASID=y -CONFIG_IOMMU_API=y -CONFIG_IOMMU_SUPPORT=y - -# -# Generic IOMMU Pagetable Support -# -# end of Generic IOMMU Pagetable Support - -# CONFIG_IOMMU_DEBUGFS is not set -# CONFIG_IOMMU_DEFAULT_PASSTHROUGH is not set -CONFIG_OF_IOMMU=y -CONFIG_IOMMU_DMA=y -CONFIG_AMD_IOMMU=y -CONFIG_AMD_IOMMU_V2=y -CONFIG_DMAR_TABLE=y -CONFIG_INTEL_IOMMU=y -CONFIG_INTEL_IOMMU_SVM=y -# CONFIG_INTEL_IOMMU_DEFAULT_ON is not set -CONFIG_INTEL_IOMMU_FLOPPY_WA=y -# CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON is not set -CONFIG_IRQ_REMAP=y -CONFIG_HYPERV_IOMMU=y - -# -# Remoteproc drivers -# -CONFIG_REMOTEPROC=y -# end of Remoteproc drivers - -# -# Rpmsg drivers -# -CONFIG_RPMSG=m -CONFIG_RPMSG_CHAR=m -CONFIG_RPMSG_QCOM_GLINK_NATIVE=m -CONFIG_RPMSG_QCOM_GLINK_RPM=m -CONFIG_RPMSG_VIRTIO=m -# end of Rpmsg drivers - -CONFIG_SOUNDWIRE=m - -# -# SoundWire Devices -# -CONFIG_SOUNDWIRE_CADENCE=m -CONFIG_SOUNDWIRE_INTEL=m -CONFIG_SOUNDWIRE_QCOM=m - -# -# SOC (System On Chip) specific Drivers -# - -# -# Amlogic SoC drivers -# -# end of Amlogic SoC drivers - -# -# Aspeed SoC drivers -# -# end of Aspeed SoC drivers - -# -# Broadcom SoC drivers -# -# end of Broadcom SoC drivers - -# -# NXP/Freescale QorIQ SoC drivers -# -# end of NXP/Freescale QorIQ SoC drivers - -# -# i.MX SoC drivers -# -# end of i.MX SoC drivers - -# -# Qualcomm SoC drivers -# -# end of Qualcomm SoC drivers - -CONFIG_SOC_TI=y - -# -# Xilinx SoC drivers -# -CONFIG_XILINX_VCU=m -# end of Xilinx SoC drivers -# end of SOC (System On Chip) specific Drivers - -CONFIG_PM_DEVFREQ=y - -# -# DEVFREQ Governors -# -CONFIG_DEVFREQ_GOV_SIMPLE_ONDEMAND=m -CONFIG_DEVFREQ_GOV_PERFORMANCE=m -CONFIG_DEVFREQ_GOV_POWERSAVE=m -CONFIG_DEVFREQ_GOV_USERSPACE=m -CONFIG_DEVFREQ_GOV_PASSIVE=m - -# -# DEVFREQ Drivers -# 
-CONFIG_PM_DEVFREQ_EVENT=y -CONFIG_EXTCON=y - -# -# Extcon Device Drivers -# -CONFIG_EXTCON_ADC_JACK=m -CONFIG_EXTCON_ARIZONA=m -CONFIG_EXTCON_AXP288=m -CONFIG_EXTCON_FSA9480=m -CONFIG_EXTCON_GPIO=m -CONFIG_EXTCON_INTEL_INT3496=m -CONFIG_EXTCON_INTEL_CHT_WC=m -CONFIG_EXTCON_MAX14577=m -CONFIG_EXTCON_MAX3355=m -CONFIG_EXTCON_MAX77693=m -CONFIG_EXTCON_MAX77843=m -CONFIG_EXTCON_MAX8997=m -CONFIG_EXTCON_PALMAS=m -CONFIG_EXTCON_PTN5150=m -CONFIG_EXTCON_RT8973A=m -CONFIG_EXTCON_SM5502=m -CONFIG_EXTCON_USB_GPIO=m -CONFIG_EXTCON_USBC_CROS_EC=m -CONFIG_MEMORY=y -CONFIG_IIO=m -CONFIG_IIO_BUFFER=y -CONFIG_IIO_BUFFER_CB=m -CONFIG_IIO_BUFFER_HW_CONSUMER=m -CONFIG_IIO_KFIFO_BUF=m -CONFIG_IIO_TRIGGERED_BUFFER=m -CONFIG_IIO_CONFIGFS=m -CONFIG_IIO_TRIGGER=y -CONFIG_IIO_CONSUMERS_PER_TRIGGER=2 -CONFIG_IIO_SW_DEVICE=m -CONFIG_IIO_SW_TRIGGER=m -CONFIG_IIO_TRIGGERED_EVENT=m - -# -# Accelerometers -# -CONFIG_ADIS16201=m -CONFIG_ADIS16209=m -CONFIG_ADXL372=m -CONFIG_ADXL372_SPI=m -CONFIG_ADXL372_I2C=m -CONFIG_BMA180=m -CONFIG_BMA220=m -CONFIG_BMA400=m -CONFIG_BMA400_I2C=m -CONFIG_BMC150_ACCEL=m -CONFIG_BMC150_ACCEL_I2C=m -CONFIG_BMC150_ACCEL_SPI=m -CONFIG_DA280=m -CONFIG_DA311=m -CONFIG_DMARD06=m -CONFIG_DMARD09=m -CONFIG_DMARD10=m -CONFIG_HID_SENSOR_ACCEL_3D=m -CONFIG_IIO_CROS_EC_ACCEL_LEGACY=m -CONFIG_IIO_ST_ACCEL_3AXIS=m -CONFIG_IIO_ST_ACCEL_I2C_3AXIS=m -CONFIG_IIO_ST_ACCEL_SPI_3AXIS=m -CONFIG_KXSD9=m -CONFIG_KXSD9_SPI=m -CONFIG_KXSD9_I2C=m -CONFIG_KXCJK1013=m -CONFIG_MC3230=m -CONFIG_MMA7455=m -CONFIG_MMA7455_I2C=m -CONFIG_MMA7455_SPI=m -CONFIG_MMA7660=m -CONFIG_MMA8452=m -CONFIG_MMA9551_CORE=m -CONFIG_MMA9551=m -CONFIG_MMA9553=m -CONFIG_MXC4005=m -CONFIG_MXC6255=m -CONFIG_SCA3000=m -CONFIG_STK8312=m -CONFIG_STK8BA50=m -# end of Accelerometers - -# -# Analog to digital converters -# -CONFIG_AD_SIGMA_DELTA=m -CONFIG_AD7091R5=m -CONFIG_AD7124=m -CONFIG_AD7192=m -CONFIG_AD7266=m -CONFIG_AD7291=m -CONFIG_AD7292=m -CONFIG_AD7298=m -CONFIG_AD7476=m -CONFIG_AD7606=m 
-CONFIG_AD7606_IFACE_PARALLEL=m -CONFIG_AD7606_IFACE_SPI=m -CONFIG_AD7766=m -CONFIG_AD7768_1=m -CONFIG_AD7780=m -CONFIG_AD7791=m -CONFIG_AD7793=m -CONFIG_AD7887=m -CONFIG_AD7923=m -CONFIG_AD7949=m -CONFIG_AD799X=m -CONFIG_AXP20X_ADC=m -CONFIG_AXP288_ADC=m -CONFIG_CC10001_ADC=m -CONFIG_CPCAP_ADC=m -CONFIG_DA9150_GPADC=m -CONFIG_DLN2_ADC=m -CONFIG_ENVELOPE_DETECTOR=m -CONFIG_HI8435=m -CONFIG_HX711=m -CONFIG_INA2XX_ADC=m -CONFIG_LP8788_ADC=m -CONFIG_LTC2471=m -CONFIG_LTC2485=m -CONFIG_LTC2496=m -CONFIG_LTC2497=m -CONFIG_MAX1027=m -CONFIG_MAX11100=m -CONFIG_MAX1118=m -CONFIG_MAX1363=m -CONFIG_MAX9611=m -CONFIG_MCP320X=m -CONFIG_MCP3422=m -CONFIG_MCP3911=m -CONFIG_MEN_Z188_ADC=m -CONFIG_NAU7802=m -CONFIG_PALMAS_GPADC=m -CONFIG_QCOM_VADC_COMMON=m -CONFIG_QCOM_SPMI_IADC=m -CONFIG_QCOM_SPMI_VADC=m -CONFIG_QCOM_SPMI_ADC5=m -CONFIG_RN5T618_ADC=m -CONFIG_SD_ADC_MODULATOR=m -CONFIG_STMPE_ADC=m -CONFIG_TI_ADC081C=m -CONFIG_TI_ADC0832=m -CONFIG_TI_ADC084S021=m -CONFIG_TI_ADC12138=m -CONFIG_TI_ADC108S102=m -CONFIG_TI_ADC128S052=m -CONFIG_TI_ADC161S626=m -CONFIG_TI_ADS1015=m -CONFIG_TI_ADS7950=m -CONFIG_TI_ADS8344=m -CONFIG_TI_ADS8688=m -CONFIG_TI_ADS124S08=m -CONFIG_TI_AM335X_ADC=m -CONFIG_TI_TLC4541=m -CONFIG_TWL4030_MADC=m -CONFIG_TWL6030_GPADC=m -CONFIG_VF610_ADC=m -CONFIG_VIPERBOARD_ADC=m -CONFIG_XILINX_XADC=m -# end of Analog to digital converters - -# -# Analog Front Ends -# -CONFIG_IIO_RESCALE=m -# end of Analog Front Ends - -# -# Amplifiers -# -CONFIG_AD8366=m -CONFIG_HMC425=m -# end of Amplifiers - -# -# Chemical Sensors -# -CONFIG_ATLAS_PH_SENSOR=m -CONFIG_BME680=m -CONFIG_BME680_I2C=m -CONFIG_BME680_SPI=m -CONFIG_CCS811=m -CONFIG_IAQCORE=m -CONFIG_PMS7003=m -CONFIG_SENSIRION_SGP30=m -CONFIG_SPS30=m -CONFIG_VZ89X=m -# end of Chemical Sensors - -CONFIG_IIO_CROS_EC_SENSORS_CORE=m -CONFIG_IIO_CROS_EC_SENSORS=m -CONFIG_IIO_CROS_EC_SENSORS_LID_ANGLE=m - -# -# Hid Sensor IIO Common -# -CONFIG_HID_SENSOR_IIO_COMMON=m -CONFIG_HID_SENSOR_IIO_TRIGGER=m -# end of Hid Sensor IIO 
Common - -CONFIG_IIO_MS_SENSORS_I2C=m - -# -# SSP Sensor Common -# -CONFIG_IIO_SSP_SENSORS_COMMONS=m -CONFIG_IIO_SSP_SENSORHUB=m -# end of SSP Sensor Common - -CONFIG_IIO_ST_SENSORS_I2C=m -CONFIG_IIO_ST_SENSORS_SPI=m -CONFIG_IIO_ST_SENSORS_CORE=m - -# -# Digital to analog converters -# -CONFIG_AD5064=m -CONFIG_AD5360=m -CONFIG_AD5380=m -CONFIG_AD5421=m -CONFIG_AD5446=m -CONFIG_AD5449=m -CONFIG_AD5592R_BASE=m -CONFIG_AD5592R=m -CONFIG_AD5593R=m -CONFIG_AD5504=m -CONFIG_AD5624R_SPI=m -CONFIG_AD5686=m -CONFIG_AD5686_SPI=m -CONFIG_AD5696_I2C=m -CONFIG_AD5755=m -CONFIG_AD5758=m -CONFIG_AD5761=m -CONFIG_AD5764=m -CONFIG_AD5770R=m -CONFIG_AD5791=m -CONFIG_AD7303=m -CONFIG_AD8801=m -CONFIG_DPOT_DAC=m -CONFIG_DS4424=m -CONFIG_LTC1660=m -CONFIG_LTC2632=m -CONFIG_M62332=m -CONFIG_MAX517=m -CONFIG_MAX5821=m -CONFIG_MCP4725=m -CONFIG_MCP4922=m -CONFIG_TI_DAC082S085=m -CONFIG_TI_DAC5571=m -CONFIG_TI_DAC7311=m -CONFIG_TI_DAC7612=m -CONFIG_VF610_DAC=m -# end of Digital to analog converters - -# -# IIO dummy driver -# -# CONFIG_IIO_SIMPLE_DUMMY is not set -# end of IIO dummy driver - -# -# Frequency Synthesizers DDS/PLL -# - -# -# Clock Generator/Distribution -# -CONFIG_AD9523=m -# end of Clock Generator/Distribution - -# -# Phase-Locked Loop (PLL) frequency synthesizers -# -CONFIG_ADF4350=m -CONFIG_ADF4371=m -# end of Phase-Locked Loop (PLL) frequency synthesizers -# end of Frequency Synthesizers DDS/PLL - -# -# Digital gyroscope sensors -# -CONFIG_ADIS16080=m -CONFIG_ADIS16130=m -CONFIG_ADIS16136=m -CONFIG_ADIS16260=m -CONFIG_ADXRS450=m -CONFIG_BMG160=m -CONFIG_BMG160_I2C=m -CONFIG_BMG160_SPI=m -CONFIG_FXAS21002C=m -CONFIG_FXAS21002C_I2C=m -CONFIG_FXAS21002C_SPI=m -CONFIG_HID_SENSOR_GYRO_3D=m -CONFIG_MPU3050=m -CONFIG_MPU3050_I2C=m -CONFIG_IIO_ST_GYRO_3AXIS=m -CONFIG_IIO_ST_GYRO_I2C_3AXIS=m -CONFIG_IIO_ST_GYRO_SPI_3AXIS=m -CONFIG_ITG3200=m -# end of Digital gyroscope sensors - -# -# Health Sensors -# - -# -# Heart Rate Monitors -# -CONFIG_AFE4403=m -CONFIG_AFE4404=m 
-CONFIG_MAX30100=m -CONFIG_MAX30102=m -# end of Heart Rate Monitors -# end of Health Sensors - -# -# Humidity sensors -# -CONFIG_AM2315=m -CONFIG_DHT11=m -CONFIG_HDC100X=m -CONFIG_HID_SENSOR_HUMIDITY=m -CONFIG_HTS221=m -CONFIG_HTS221_I2C=m -CONFIG_HTS221_SPI=m -CONFIG_HTU21=m -CONFIG_SI7005=m -CONFIG_SI7020=m -# end of Humidity sensors - -# -# Inertial measurement units -# -CONFIG_ADIS16400=m -CONFIG_ADIS16460=m -CONFIG_ADIS16480=m -CONFIG_BMI160=m -CONFIG_BMI160_I2C=m -CONFIG_BMI160_SPI=m -CONFIG_FXOS8700=m -CONFIG_FXOS8700_I2C=m -CONFIG_FXOS8700_SPI=m -CONFIG_KMX61=m -CONFIG_INV_MPU6050_IIO=m -CONFIG_INV_MPU6050_I2C=m -CONFIG_INV_MPU6050_SPI=m -CONFIG_IIO_ST_LSM6DSX=m -CONFIG_IIO_ST_LSM6DSX_I2C=m -CONFIG_IIO_ST_LSM6DSX_SPI=m -CONFIG_IIO_ST_LSM6DSX_I3C=m -# end of Inertial measurement units - -CONFIG_IIO_ADIS_LIB=m -CONFIG_IIO_ADIS_LIB_BUFFER=y - -# -# Light sensors -# -CONFIG_ACPI_ALS=m -CONFIG_ADJD_S311=m -CONFIG_ADUX1020=m -CONFIG_AL3010=m -CONFIG_AL3320A=m -CONFIG_APDS9300=m -CONFIG_APDS9960=m -CONFIG_BH1750=m -CONFIG_BH1780=m -CONFIG_CM32181=m -CONFIG_CM3232=m -CONFIG_CM3323=m -CONFIG_CM3605=m -CONFIG_CM36651=m -CONFIG_IIO_CROS_EC_LIGHT_PROX=m -CONFIG_GP2AP002=m -CONFIG_GP2AP020A00F=m -CONFIG_IQS621_ALS=m -CONFIG_SENSORS_ISL29018=m -CONFIG_SENSORS_ISL29028=m -CONFIG_ISL29125=m -CONFIG_HID_SENSOR_ALS=m -CONFIG_HID_SENSOR_PROX=m -CONFIG_JSA1212=m -CONFIG_RPR0521=m -CONFIG_SENSORS_LM3533=m -CONFIG_LTR501=m -CONFIG_LV0104CS=m -CONFIG_MAX44000=m -CONFIG_MAX44009=m -CONFIG_NOA1305=m -CONFIG_OPT3001=m -CONFIG_PA12203001=m -CONFIG_SI1133=m -CONFIG_SI1145=m -CONFIG_STK3310=m -CONFIG_ST_UVIS25=m -CONFIG_ST_UVIS25_I2C=m -CONFIG_ST_UVIS25_SPI=m -CONFIG_TCS3414=m -CONFIG_TCS3472=m -CONFIG_SENSORS_TSL2563=m -CONFIG_TSL2583=m -CONFIG_TSL2772=m -CONFIG_TSL4531=m -CONFIG_US5182D=m -CONFIG_VCNL4000=m -CONFIG_VCNL4035=m -CONFIG_VEML6030=m -CONFIG_VEML6070=m -CONFIG_VL6180=m -CONFIG_ZOPT2201=m -# end of Light sensors - -# -# Magnetometer sensors -# -CONFIG_AK8974=m 
-CONFIG_AK8975=m -CONFIG_AK09911=m -CONFIG_BMC150_MAGN=m -CONFIG_BMC150_MAGN_I2C=m -CONFIG_BMC150_MAGN_SPI=m -CONFIG_MAG3110=m -CONFIG_HID_SENSOR_MAGNETOMETER_3D=m -CONFIG_MMC35240=m -CONFIG_IIO_ST_MAGN_3AXIS=m -CONFIG_IIO_ST_MAGN_I2C_3AXIS=m -CONFIG_IIO_ST_MAGN_SPI_3AXIS=m -CONFIG_SENSORS_HMC5843=m -CONFIG_SENSORS_HMC5843_I2C=m -CONFIG_SENSORS_HMC5843_SPI=m -CONFIG_SENSORS_RM3100=m -CONFIG_SENSORS_RM3100_I2C=m -CONFIG_SENSORS_RM3100_SPI=m -# end of Magnetometer sensors - -# -# Multiplexers -# -CONFIG_IIO_MUX=m -# end of Multiplexers - -# -# Inclinometer sensors -# -CONFIG_HID_SENSOR_INCLINOMETER_3D=m -CONFIG_HID_SENSOR_DEVICE_ROTATION=m -# end of Inclinometer sensors - -# -# Triggers - standalone -# -CONFIG_IIO_HRTIMER_TRIGGER=m -CONFIG_IIO_INTERRUPT_TRIGGER=m -CONFIG_IIO_TIGHTLOOP_TRIGGER=m -CONFIG_IIO_SYSFS_TRIGGER=m -# end of Triggers - standalone - -# -# Linear and angular position sensors -# -CONFIG_IQS624_POS=m -# end of Linear and angular position sensors - -# -# Digital potentiometers -# -CONFIG_AD5272=m -CONFIG_DS1803=m -CONFIG_MAX5432=m -CONFIG_MAX5481=m -CONFIG_MAX5487=m -CONFIG_MCP4018=m -CONFIG_MCP4131=m -CONFIG_MCP4531=m -CONFIG_MCP41010=m -CONFIG_TPL0102=m -# end of Digital potentiometers - -# -# Digital potentiostats -# -CONFIG_LMP91000=m -# end of Digital potentiostats - -# -# Pressure sensors -# -CONFIG_ABP060MG=m -CONFIG_BMP280=m -CONFIG_BMP280_I2C=m -CONFIG_BMP280_SPI=m -CONFIG_IIO_CROS_EC_BARO=m -CONFIG_DLHL60D=m -CONFIG_DPS310=m -CONFIG_HID_SENSOR_PRESS=m -CONFIG_HP03=m -CONFIG_ICP10100=m -CONFIG_MPL115=m -CONFIG_MPL115_I2C=m -CONFIG_MPL115_SPI=m -CONFIG_MPL3115=m -CONFIG_MS5611=m -CONFIG_MS5611_I2C=m -CONFIG_MS5611_SPI=m -CONFIG_MS5637=m -CONFIG_IIO_ST_PRESS=m -CONFIG_IIO_ST_PRESS_I2C=m -CONFIG_IIO_ST_PRESS_SPI=m -CONFIG_T5403=m -CONFIG_HP206C=m -CONFIG_ZPA2326=m -CONFIG_ZPA2326_I2C=m -CONFIG_ZPA2326_SPI=m -# end of Pressure sensors - -# -# Lightning sensors -# -CONFIG_AS3935=m -# end of Lightning sensors - -# -# Proximity and distance 
sensors -# -CONFIG_ISL29501=m -CONFIG_LIDAR_LITE_V2=m -CONFIG_MB1232=m -CONFIG_PING=m -CONFIG_RFD77402=m -CONFIG_SRF04=m -CONFIG_SX9500=m -CONFIG_SRF08=m -CONFIG_VL53L0X_I2C=m -# end of Proximity and distance sensors - -# -# Resolver to digital converters -# -CONFIG_AD2S90=m -CONFIG_AD2S1200=m -# end of Resolver to digital converters - -# -# Temperature sensors -# -CONFIG_IQS620AT_TEMP=m -CONFIG_LTC2983=m -CONFIG_MAXIM_THERMOCOUPLE=m -CONFIG_HID_SENSOR_TEMP=m -CONFIG_MLX90614=m -CONFIG_MLX90632=m -CONFIG_TMP006=m -CONFIG_TMP007=m -CONFIG_TSYS01=m -CONFIG_TSYS02D=m -CONFIG_MAX31856=m -# end of Temperature sensors - -CONFIG_NTB=m -CONFIG_NTB_MSI=y -CONFIG_NTB_AMD=m -CONFIG_NTB_IDT=m -CONFIG_NTB_INTEL=m -CONFIG_NTB_SWITCHTEC=m -# CONFIG_NTB_PINGPONG is not set -# CONFIG_NTB_TOOL is not set -# CONFIG_NTB_PERF is not set -# CONFIG_NTB_MSI_TEST is not set -CONFIG_NTB_TRANSPORT=m -CONFIG_VME_BUS=y - -# -# VME Bridge Drivers -# -CONFIG_VME_CA91CX42=m -CONFIG_VME_TSI148=m -# CONFIG_VME_FAKE is not set - -# -# VME Board Drivers -# -CONFIG_VMIVME_7805=m - -# -# VME Device Drivers -# -CONFIG_VME_USER=m -CONFIG_PWM=y -CONFIG_PWM_SYSFS=y -# CONFIG_PWM_DEBUG is not set -CONFIG_PWM_ATMEL_HLCDC_PWM=m -CONFIG_PWM_CRC=y -CONFIG_PWM_CROS_EC=m -CONFIG_PWM_FSL_FTM=m -CONFIG_PWM_LP3943=m -CONFIG_PWM_LPSS=m -CONFIG_PWM_LPSS_PCI=m -CONFIG_PWM_LPSS_PLATFORM=m -CONFIG_PWM_PCA9685=m -CONFIG_PWM_STMPE=y -CONFIG_PWM_TWL=m -CONFIG_PWM_TWL_LED=m - -# -# IRQ chip support -# -CONFIG_IRQCHIP=y -CONFIG_AL_FIC=y -CONFIG_MADERA_IRQ=m -# end of IRQ chip support - -CONFIG_IPACK_BUS=m -CONFIG_BOARD_TPCI200=m -CONFIG_SERIAL_IPOCTAL=m -CONFIG_RESET_CONTROLLER=y -CONFIG_RESET_BRCMSTB_RESCAL=y -CONFIG_RESET_INTEL_GW=y -CONFIG_RESET_TI_SYSCON=m - -# -# PHY Subsystem -# -CONFIG_GENERIC_PHY=y -CONFIG_GENERIC_PHY_MIPI_DPHY=y -CONFIG_BCM_KONA_USB2_PHY=m -CONFIG_PHY_CADENCE_TORRENT=m -CONFIG_PHY_CADENCE_DPHY=m -CONFIG_PHY_CADENCE_SIERRA=m -CONFIG_PHY_FSL_IMX8MQ_USB=m -CONFIG_PHY_MIXEL_MIPI_DPHY=m 
-CONFIG_PHY_PXA_28NM_HSIC=m -CONFIG_PHY_PXA_28NM_USB2=m -CONFIG_PHY_CPCAP_USB=m -CONFIG_PHY_MAPPHONE_MDM6600=m -CONFIG_PHY_OCELOT_SERDES=m -CONFIG_PHY_QCOM_USB_HS=m -CONFIG_PHY_QCOM_USB_HSIC=m -CONFIG_PHY_SAMSUNG_USB2=m -CONFIG_PHY_TUSB1210=m -CONFIG_PHY_INTEL_EMMC=m -# end of PHY Subsystem - -CONFIG_POWERCAP=y -CONFIG_INTEL_RAPL_CORE=m -CONFIG_INTEL_RAPL=m -CONFIG_IDLE_INJECT=y -CONFIG_MCB=m -CONFIG_MCB_PCI=m -CONFIG_MCB_LPC=m - -# -# Performance monitor support -# -# end of Performance monitor support - -CONFIG_RAS=y -CONFIG_RAS_CEC=y -# CONFIG_RAS_CEC_DEBUG is not set -CONFIG_USB4=m - -# -# Android -# -# CONFIG_ANDROID is not set -# end of Android - -CONFIG_LIBNVDIMM=y -CONFIG_BLK_DEV_PMEM=m -CONFIG_ND_BLK=m -CONFIG_ND_CLAIM=y -CONFIG_ND_BTT=m -CONFIG_BTT=y -CONFIG_ND_PFN=m -CONFIG_NVDIMM_PFN=y -CONFIG_NVDIMM_DAX=y -CONFIG_OF_PMEM=m -CONFIG_DAX_DRIVER=y -CONFIG_DAX=y -CONFIG_DEV_DAX=m -CONFIG_DEV_DAX_PMEM=m -CONFIG_DEV_DAX_HMEM=m -CONFIG_DEV_DAX_KMEM=m -CONFIG_DEV_DAX_PMEM_COMPAT=m -CONFIG_NVMEM=y -CONFIG_NVMEM_SYSFS=y -CONFIG_NVMEM_SPMI_SDAM=m -CONFIG_RAVE_SP_EEPROM=m - -# -# HW tracing support -# -CONFIG_STM=m -CONFIG_STM_PROTO_BASIC=m -CONFIG_STM_PROTO_SYS_T=m -# CONFIG_STM_DUMMY is not set -CONFIG_STM_SOURCE_CONSOLE=m -CONFIG_STM_SOURCE_HEARTBEAT=m -CONFIG_STM_SOURCE_FTRACE=m -CONFIG_INTEL_TH=m -CONFIG_INTEL_TH_PCI=m -CONFIG_INTEL_TH_ACPI=m -CONFIG_INTEL_TH_GTH=m -CONFIG_INTEL_TH_STH=m -CONFIG_INTEL_TH_MSU=m -CONFIG_INTEL_TH_PTI=m -# CONFIG_INTEL_TH_DEBUG is not set -# end of HW tracing support - -CONFIG_FPGA=m -CONFIG_ALTERA_PR_IP_CORE=m -CONFIG_ALTERA_PR_IP_CORE_PLAT=m -CONFIG_FPGA_MGR_ALTERA_PS_SPI=m -CONFIG_FPGA_MGR_ALTERA_CVP=m -CONFIG_FPGA_MGR_XILINX_SPI=m -CONFIG_FPGA_MGR_ICE40_SPI=m -CONFIG_FPGA_MGR_MACHXO2_SPI=m -CONFIG_FPGA_BRIDGE=m -CONFIG_ALTERA_FREEZE_BRIDGE=m -CONFIG_XILINX_PR_DECOUPLER=m -CONFIG_FPGA_REGION=m -CONFIG_OF_FPGA_REGION=m -CONFIG_FPGA_DFL=m -CONFIG_FPGA_DFL_FME=m -CONFIG_FPGA_DFL_FME_MGR=m -CONFIG_FPGA_DFL_FME_BRIDGE=m 
-CONFIG_FPGA_DFL_FME_REGION=m -CONFIG_FPGA_DFL_AFU=m -CONFIG_FPGA_DFL_PCI=m -CONFIG_FSI=m -CONFIG_FSI_NEW_DEV_NODE=y -CONFIG_FSI_MASTER_GPIO=m -CONFIG_FSI_MASTER_HUB=m -CONFIG_FSI_MASTER_ASPEED=m -CONFIG_FSI_SCOM=m -CONFIG_FSI_SBEFIFO=m -CONFIG_FSI_OCC=m -CONFIG_TEE=m - -# -# TEE drivers -# -CONFIG_AMDTEE=m -# end of TEE drivers - -CONFIG_MULTIPLEXER=m - -# -# Multiplexer drivers -# -CONFIG_MUX_ADG792A=m -CONFIG_MUX_ADGS1408=m -CONFIG_MUX_GPIO=m -CONFIG_MUX_MMIO=m -# end of Multiplexer drivers - -CONFIG_PM_OPP=y -CONFIG_UNISYS_VISORBUS=m -CONFIG_SIOX=m -CONFIG_SIOX_BUS_GPIO=m -CONFIG_SLIMBUS=m -CONFIG_SLIM_QCOM_CTRL=m -CONFIG_INTERCONNECT=m -CONFIG_COUNTER=m -CONFIG_FTM_QUADDEC=m -CONFIG_MOST=m -# end of Device Drivers - -# -# File systems -# -CONFIG_DCACHE_WORD_ACCESS=y -CONFIG_VALIDATE_FS_PARSER=y -CONFIG_FS_IOMAP=y -# CONFIG_EXT2_FS is not set -# CONFIG_EXT3_FS is not set -CONFIG_EXT4_FS=m -CONFIG_EXT4_USE_FOR_EXT2=y -CONFIG_EXT4_FS_POSIX_ACL=y -CONFIG_EXT4_FS_SECURITY=y -# CONFIG_EXT4_DEBUG is not set -CONFIG_JBD2=m -# CONFIG_JBD2_DEBUG is not set -CONFIG_FS_MBCACHE=m -CONFIG_REISERFS_FS=m -# CONFIG_REISERFS_CHECK is not set -CONFIG_REISERFS_PROC_INFO=y -CONFIG_REISERFS_FS_XATTR=y -CONFIG_REISERFS_FS_POSIX_ACL=y -CONFIG_REISERFS_FS_SECURITY=y -CONFIG_JFS_FS=m -CONFIG_JFS_POSIX_ACL=y -CONFIG_JFS_SECURITY=y -# CONFIG_JFS_DEBUG is not set -CONFIG_JFS_STATISTICS=y -CONFIG_XFS_FS=m -CONFIG_XFS_QUOTA=y -CONFIG_XFS_POSIX_ACL=y -CONFIG_XFS_RT=y -CONFIG_XFS_ONLINE_SCRUB=y -CONFIG_XFS_ONLINE_REPAIR=y -# CONFIG_XFS_WARN is not set -# CONFIG_XFS_DEBUG is not set -CONFIG_GFS2_FS=m -CONFIG_GFS2_FS_LOCKING_DLM=y -CONFIG_OCFS2_FS=m -CONFIG_OCFS2_FS_O2CB=m -CONFIG_OCFS2_FS_USERSPACE_CLUSTER=m -CONFIG_OCFS2_FS_STATS=y -CONFIG_OCFS2_DEBUG_MASKLOG=y -# CONFIG_OCFS2_DEBUG_FS is not set -CONFIG_BTRFS_FS=m -CONFIG_BTRFS_FS_POSIX_ACL=y -# CONFIG_BTRFS_FS_CHECK_INTEGRITY is not set -# CONFIG_BTRFS_FS_RUN_SANITY_TESTS is not set -# CONFIG_BTRFS_DEBUG is not set -# CONFIG_BTRFS_ASSERT is 
not set -# CONFIG_BTRFS_FS_REF_VERIFY is not set -CONFIG_NILFS2_FS=m -CONFIG_F2FS_FS=m -CONFIG_F2FS_STAT_FS=y -CONFIG_F2FS_FS_XATTR=y -CONFIG_F2FS_FS_POSIX_ACL=y -CONFIG_F2FS_FS_SECURITY=y -CONFIG_F2FS_CHECK_FS=y -# CONFIG_F2FS_IO_TRACE is not set -# CONFIG_F2FS_FAULT_INJECTION is not set -CONFIG_F2FS_FS_COMPRESSION=y -CONFIG_F2FS_FS_LZO=y -CONFIG_F2FS_FS_LZ4=y -CONFIG_F2FS_FS_ZSTD=y -CONFIG_ZONEFS_FS=m -CONFIG_FS_DAX=y -CONFIG_FS_DAX_PMD=y -CONFIG_FS_POSIX_ACL=y -CONFIG_EXPORTFS=y -CONFIG_EXPORTFS_BLOCK_OPS=y -CONFIG_FILE_LOCKING=y -# CONFIG_MANDATORY_FILE_LOCKING is not set -CONFIG_FS_ENCRYPTION=y -CONFIG_FS_ENCRYPTION_ALGS=m -CONFIG_FS_VERITY=y -# CONFIG_FS_VERITY_DEBUG is not set -CONFIG_FS_VERITY_BUILTIN_SIGNATURES=y -CONFIG_FSNOTIFY=y -CONFIG_DNOTIFY=y -CONFIG_INOTIFY_USER=y -CONFIG_FANOTIFY=y -CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y -CONFIG_QUOTA=y -CONFIG_QUOTA_NETLINK_INTERFACE=y -# CONFIG_PRINT_QUOTA_WARNING is not set -# CONFIG_QUOTA_DEBUG is not set -CONFIG_QUOTA_TREE=m -CONFIG_QFMT_V1=m -CONFIG_QFMT_V2=m -CONFIG_QUOTACTL=y -CONFIG_QUOTACTL_COMPAT=y -CONFIG_AUTOFS4_FS=y -CONFIG_AUTOFS_FS=y -CONFIG_FUSE_FS=m -CONFIG_CUSE=m -CONFIG_VIRTIO_FS=m -CONFIG_OVERLAY_FS=m -CONFIG_OVERLAY_FS_REDIRECT_DIR=y -# CONFIG_OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW is not set -CONFIG_OVERLAY_FS_INDEX=y -CONFIG_OVERLAY_FS_XINO_AUTO=y -CONFIG_OVERLAY_FS_METACOPY=y - -# -# Caches -# -CONFIG_FSCACHE=m -CONFIG_FSCACHE_STATS=y -CONFIG_FSCACHE_HISTOGRAM=y -# CONFIG_FSCACHE_DEBUG is not set -# CONFIG_FSCACHE_OBJECT_LIST is not set -CONFIG_CACHEFILES=m -# CONFIG_CACHEFILES_DEBUG is not set -# CONFIG_CACHEFILES_HISTOGRAM is not set -# end of Caches - -# -# CD-ROM/DVD Filesystems -# -CONFIG_ISO9660_FS=m -CONFIG_JOLIET=y -CONFIG_ZISOFS=y -CONFIG_UDF_FS=m -# end of CD-ROM/DVD Filesystems - -# -# DOS/FAT/EXFAT/NT Filesystems -# -CONFIG_FAT_FS=m -CONFIG_MSDOS_FS=m -CONFIG_VFAT_FS=m -CONFIG_FAT_DEFAULT_CODEPAGE=437 -CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1" -CONFIG_FAT_DEFAULT_UTF8=y 
-CONFIG_EXFAT_FS=m -CONFIG_EXFAT_DEFAULT_IOCHARSET="utf8" -CONFIG_NTFS_FS=m -# CONFIG_NTFS_DEBUG is not set -CONFIG_NTFS_RW=y -# end of DOS/FAT/EXFAT/NT Filesystems - -# -# Pseudo filesystems -# -CONFIG_PROC_FS=y -CONFIG_PROC_KCORE=y -CONFIG_PROC_VMCORE=y -CONFIG_PROC_VMCORE_DEVICE_DUMP=y -CONFIG_PROC_SYSCTL=y -CONFIG_PROC_PAGE_MONITOR=y -CONFIG_PROC_CHILDREN=y -CONFIG_PROC_PID_ARCH_STATUS=y -CONFIG_PROC_CPU_RESCTRL=y -CONFIG_KERNFS=y -CONFIG_SYSFS=y -CONFIG_TMPFS=y -CONFIG_TMPFS_POSIX_ACL=y -CONFIG_TMPFS_XATTR=y -CONFIG_HUGETLBFS=y -CONFIG_HUGETLB_PAGE=y -CONFIG_MEMFD_CREATE=y -CONFIG_ARCH_HAS_GIGANTIC_PAGE=y -CONFIG_CONFIGFS_FS=y -CONFIG_EFIVAR_FS=y -# end of Pseudo filesystems - -CONFIG_MISC_FILESYSTEMS=y -CONFIG_ORANGEFS_FS=m -# CONFIG_ADFS_FS is not set -CONFIG_AFFS_FS=m -CONFIG_ECRYPT_FS=m -# CONFIG_ECRYPT_FS_MESSAGING is not set -CONFIG_HFS_FS=m -CONFIG_HFSPLUS_FS=m -CONFIG_BEFS_FS=m -# CONFIG_BEFS_DEBUG is not set -# CONFIG_BFS_FS is not set -# CONFIG_EFS_FS is not set -CONFIG_JFFS2_FS=m -CONFIG_JFFS2_FS_DEBUG=0 -CONFIG_JFFS2_FS_WRITEBUFFER=y -# CONFIG_JFFS2_FS_WBUF_VERIFY is not set -CONFIG_JFFS2_SUMMARY=y -CONFIG_JFFS2_FS_XATTR=y -CONFIG_JFFS2_FS_POSIX_ACL=y -CONFIG_JFFS2_FS_SECURITY=y -# CONFIG_JFFS2_COMPRESSION_OPTIONS is not set -CONFIG_JFFS2_ZLIB=y -CONFIG_JFFS2_RTIME=y -CONFIG_UBIFS_FS=m -# CONFIG_UBIFS_FS_ADVANCED_COMPR is not set -CONFIG_UBIFS_FS_LZO=y -CONFIG_UBIFS_FS_ZLIB=y -CONFIG_UBIFS_FS_ZSTD=y -CONFIG_UBIFS_ATIME_SUPPORT=y -CONFIG_UBIFS_FS_XATTR=y -CONFIG_UBIFS_FS_SECURITY=y -CONFIG_UBIFS_FS_AUTHENTICATION=y -CONFIG_CRAMFS=m -CONFIG_CRAMFS_BLOCKDEV=y -CONFIG_CRAMFS_MTD=y -CONFIG_SQUASHFS=m -# CONFIG_SQUASHFS_FILE_CACHE is not set -CONFIG_SQUASHFS_FILE_DIRECT=y -# CONFIG_SQUASHFS_DECOMP_SINGLE is not set -CONFIG_SQUASHFS_DECOMP_MULTI=y -# CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU is not set -CONFIG_SQUASHFS_XATTR=y -CONFIG_SQUASHFS_ZLIB=y -CONFIG_SQUASHFS_LZ4=y -CONFIG_SQUASHFS_LZO=y -CONFIG_SQUASHFS_XZ=y -CONFIG_SQUASHFS_ZSTD=y -# 
CONFIG_SQUASHFS_4K_DEVBLK_SIZE is not set -# CONFIG_SQUASHFS_EMBEDDED is not set -CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE=3 -# CONFIG_VXFS_FS is not set -CONFIG_MINIX_FS=m -CONFIG_OMFS_FS=m -# CONFIG_HPFS_FS is not set -# CONFIG_QNX4FS_FS is not set -# CONFIG_QNX6FS_FS is not set -CONFIG_ROMFS_FS=m -CONFIG_ROMFS_BACKED_BY_BLOCK=y -# CONFIG_ROMFS_BACKED_BY_MTD is not set -# CONFIG_ROMFS_BACKED_BY_BOTH is not set -CONFIG_ROMFS_ON_BLOCK=y -CONFIG_PSTORE=y -CONFIG_PSTORE_DEFLATE_COMPRESS=m -CONFIG_PSTORE_LZO_COMPRESS=m -CONFIG_PSTORE_LZ4_COMPRESS=m -CONFIG_PSTORE_LZ4HC_COMPRESS=m -# CONFIG_PSTORE_842_COMPRESS is not set -CONFIG_PSTORE_ZSTD_COMPRESS=y -CONFIG_PSTORE_COMPRESS=y -# CONFIG_PSTORE_DEFLATE_COMPRESS_DEFAULT is not set -# CONFIG_PSTORE_LZO_COMPRESS_DEFAULT is not set -# CONFIG_PSTORE_LZ4_COMPRESS_DEFAULT is not set -# CONFIG_PSTORE_LZ4HC_COMPRESS_DEFAULT is not set -CONFIG_PSTORE_ZSTD_COMPRESS_DEFAULT=y -CONFIG_PSTORE_COMPRESS_DEFAULT="zstd" -# CONFIG_PSTORE_CONSOLE is not set -# CONFIG_PSTORE_PMSG is not set -# CONFIG_PSTORE_FTRACE is not set -CONFIG_PSTORE_RAM=y -# CONFIG_SYSV_FS is not set -CONFIG_UFS_FS=m -# CONFIG_UFS_FS_WRITE is not set -# CONFIG_UFS_DEBUG is not set -CONFIG_EROFS_FS=m -# CONFIG_EROFS_FS_DEBUG is not set -CONFIG_EROFS_FS_XATTR=y -CONFIG_EROFS_FS_POSIX_ACL=y -CONFIG_EROFS_FS_SECURITY=y -CONFIG_EROFS_FS_ZIP=y -CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT=2 -CONFIG_VBOXSF_FS=m -CONFIG_NETWORK_FILESYSTEMS=y -CONFIG_NFS_FS=m -CONFIG_NFS_V2=m -CONFIG_NFS_V3=m -CONFIG_NFS_V3_ACL=y -CONFIG_NFS_V4=m -CONFIG_NFS_SWAP=y -CONFIG_NFS_V4_1=y -CONFIG_NFS_V4_2=y -CONFIG_PNFS_FILE_LAYOUT=m -CONFIG_PNFS_BLOCK=m -CONFIG_PNFS_FLEXFILE_LAYOUT=m -CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN="kernel.org" -CONFIG_NFS_V4_1_MIGRATION=y -CONFIG_NFS_V4_SECURITY_LABEL=y -CONFIG_NFS_FSCACHE=y -# CONFIG_NFS_USE_LEGACY_DNS is not set -CONFIG_NFS_USE_KERNEL_DNS=y -CONFIG_NFS_DEBUG=y -# CONFIG_NFS_DISABLE_UDP_SUPPORT is not set -CONFIG_NFSD=m -CONFIG_NFSD_V2_ACL=y -CONFIG_NFSD_V3=y 
-CONFIG_NFSD_V3_ACL=y -CONFIG_NFSD_V4=y -CONFIG_NFSD_PNFS=y -CONFIG_NFSD_BLOCKLAYOUT=y -CONFIG_NFSD_SCSILAYOUT=y -# CONFIG_NFSD_FLEXFILELAYOUT is not set -CONFIG_NFSD_V4_SECURITY_LABEL=y -CONFIG_GRACE_PERIOD=m -CONFIG_LOCKD=m -CONFIG_LOCKD_V4=y -CONFIG_NFS_ACL_SUPPORT=m -CONFIG_NFS_COMMON=y -CONFIG_SUNRPC=m -CONFIG_SUNRPC_GSS=m -CONFIG_SUNRPC_BACKCHANNEL=y -CONFIG_SUNRPC_SWAP=y -CONFIG_RPCSEC_GSS_KRB5=m -CONFIG_SUNRPC_DISABLE_INSECURE_ENCTYPES=y -CONFIG_SUNRPC_DEBUG=y -CONFIG_SUNRPC_XPRT_RDMA=m -CONFIG_CEPH_FS=m -CONFIG_CEPH_FSCACHE=y -CONFIG_CEPH_FS_POSIX_ACL=y -CONFIG_CEPH_FS_SECURITY_LABEL=y -CONFIG_CIFS=m -# CONFIG_CIFS_STATS2 is not set -CONFIG_CIFS_ALLOW_INSECURE_LEGACY=y -# CONFIG_CIFS_WEAK_PW_HASH is not set -CONFIG_CIFS_UPCALL=y -CONFIG_CIFS_XATTR=y -CONFIG_CIFS_POSIX=y -CONFIG_CIFS_DEBUG=y -# CONFIG_CIFS_DEBUG2 is not set -# CONFIG_CIFS_DEBUG_DUMP_KEYS is not set -CONFIG_CIFS_DFS_UPCALL=y -# CONFIG_CIFS_SMB_DIRECT is not set -CONFIG_CIFS_FSCACHE=y -CONFIG_CODA_FS=m -CONFIG_AFS_FS=m -# CONFIG_AFS_DEBUG is not set -CONFIG_AFS_FSCACHE=y -# CONFIG_AFS_DEBUG_CURSOR is not set -CONFIG_9P_FS=m -CONFIG_9P_FSCACHE=y -CONFIG_9P_FS_POSIX_ACL=y -CONFIG_9P_FS_SECURITY=y -CONFIG_NLS=y -CONFIG_NLS_DEFAULT="utf8" -CONFIG_NLS_CODEPAGE_437=m -CONFIG_NLS_CODEPAGE_737=m -CONFIG_NLS_CODEPAGE_775=m -CONFIG_NLS_CODEPAGE_850=m -CONFIG_NLS_CODEPAGE_852=m -CONFIG_NLS_CODEPAGE_855=m -CONFIG_NLS_CODEPAGE_857=m -CONFIG_NLS_CODEPAGE_860=m -CONFIG_NLS_CODEPAGE_861=m -CONFIG_NLS_CODEPAGE_862=m -CONFIG_NLS_CODEPAGE_863=m -CONFIG_NLS_CODEPAGE_864=m -CONFIG_NLS_CODEPAGE_865=m -CONFIG_NLS_CODEPAGE_866=m -CONFIG_NLS_CODEPAGE_869=m -CONFIG_NLS_CODEPAGE_936=m -CONFIG_NLS_CODEPAGE_950=m -CONFIG_NLS_CODEPAGE_932=m -CONFIG_NLS_CODEPAGE_949=m -CONFIG_NLS_CODEPAGE_874=m -CONFIG_NLS_ISO8859_8=m -CONFIG_NLS_CODEPAGE_1250=m -CONFIG_NLS_CODEPAGE_1251=m -CONFIG_NLS_ASCII=m -CONFIG_NLS_ISO8859_1=m -CONFIG_NLS_ISO8859_2=m -CONFIG_NLS_ISO8859_3=m -CONFIG_NLS_ISO8859_4=m -CONFIG_NLS_ISO8859_5=m 
-CONFIG_NLS_ISO8859_6=m -CONFIG_NLS_ISO8859_7=m -CONFIG_NLS_ISO8859_9=m -CONFIG_NLS_ISO8859_13=m -CONFIG_NLS_ISO8859_14=m -CONFIG_NLS_ISO8859_15=m -CONFIG_NLS_KOI8_R=m -CONFIG_NLS_KOI8_U=m -CONFIG_NLS_MAC_ROMAN=m -CONFIG_NLS_MAC_CELTIC=m -CONFIG_NLS_MAC_CENTEURO=m -CONFIG_NLS_MAC_CROATIAN=m -CONFIG_NLS_MAC_CYRILLIC=m -CONFIG_NLS_MAC_GAELIC=m -CONFIG_NLS_MAC_GREEK=m -CONFIG_NLS_MAC_ICELAND=m -CONFIG_NLS_MAC_INUIT=m -CONFIG_NLS_MAC_ROMANIAN=m -CONFIG_NLS_MAC_TURKISH=m -CONFIG_NLS_UTF8=m -CONFIG_DLM=m -# CONFIG_DLM_DEBUG is not set -CONFIG_UNICODE=y -# CONFIG_UNICODE_NORMALIZATION_SELFTEST is not set -CONFIG_IO_WQ=y -# end of File systems - -# -# Security options -# -CONFIG_KEYS=y -CONFIG_KEYS_REQUEST_CACHE=y -CONFIG_PERSISTENT_KEYRINGS=y -CONFIG_BIG_KEYS=y -CONFIG_TRUSTED_KEYS=m -CONFIG_ENCRYPTED_KEYS=m -CONFIG_KEY_DH_OPERATIONS=y -# CONFIG_SECURITY_DMESG_RESTRICT is not set -CONFIG_SECURITY=y -CONFIG_SECURITYFS=y -CONFIG_SECURITY_NETWORK=y -CONFIG_PAGE_TABLE_ISOLATION=y -CONFIG_SECURITY_INFINIBAND=y -CONFIG_SECURITY_NETWORK_XFRM=y -CONFIG_SECURITY_PATH=y -# CONFIG_INTEL_TXT is not set -CONFIG_LSM_MMAP_MIN_ADDR=65536 -CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR=y -CONFIG_HARDENED_USERCOPY=y -CONFIG_HARDENED_USERCOPY_FALLBACK=y -# CONFIG_HARDENED_USERCOPY_PAGESPAN is not set -CONFIG_FORTIFY_SOURCE=y -# CONFIG_STATIC_USERMODEHELPER is not set -CONFIG_SECURITY_SELINUX=y -CONFIG_SECURITY_SELINUX_BOOTPARAM=y -# CONFIG_SECURITY_SELINUX_DISABLE is not set -CONFIG_SECURITY_SELINUX_DEVELOP=y -CONFIG_SECURITY_SELINUX_AVC_STATS=y -CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=0 -CONFIG_SECURITY_SELINUX_SIDTAB_HASH_BITS=9 -CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE=256 -CONFIG_SECURITY_SMACK=y -CONFIG_SECURITY_SMACK_BRINGUP=y -CONFIG_SECURITY_SMACK_NETFILTER=y -CONFIG_SECURITY_SMACK_APPEND_SIGNALS=y -CONFIG_SECURITY_TOMOYO=y -CONFIG_SECURITY_TOMOYO_MAX_ACCEPT_ENTRY=2048 -CONFIG_SECURITY_TOMOYO_MAX_AUDIT_LOG=1024 -# CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER is not set 
-CONFIG_SECURITY_TOMOYO_POLICY_LOADER="/sbin/tomoyo-init" -CONFIG_SECURITY_TOMOYO_ACTIVATION_TRIGGER="/sbin/init" -# CONFIG_SECURITY_TOMOYO_INSECURE_BUILTIN_SETTING is not set -CONFIG_SECURITY_APPARMOR=y -CONFIG_SECURITY_APPARMOR_HASH=y -CONFIG_SECURITY_APPARMOR_HASH_DEFAULT=y -# CONFIG_SECURITY_APPARMOR_DEBUG is not set -# CONFIG_SECURITY_LOADPIN is not set -CONFIG_SECURITY_YAMA=y -CONFIG_SECURITY_SAFESETID=y -CONFIG_SECURITY_LOCKDOWN_LSM=y -# CONFIG_SECURITY_LOCKDOWN_LSM_EARLY is not set -CONFIG_LOCK_DOWN_KERNEL_FORCE_NONE=y -# CONFIG_LOCK_DOWN_KERNEL_FORCE_INTEGRITY is not set -# CONFIG_LOCK_DOWN_KERNEL_FORCE_CONFIDENTIALITY is not set -# CONFIG_INTEGRITY is not set -# CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT is not set -# CONFIG_DEFAULT_SECURITY_SELINUX is not set -# CONFIG_DEFAULT_SECURITY_SMACK is not set -# CONFIG_DEFAULT_SECURITY_TOMOYO is not set -# CONFIG_DEFAULT_SECURITY_APPARMOR is not set -CONFIG_DEFAULT_SECURITY_DAC=y -CONFIG_LSM="lockdown,yama" - -# -# Kernel hardening options -# -CONFIG_GCC_PLUGIN_STRUCTLEAK=y - -# -# Memory initialization -# -# CONFIG_INIT_STACK_NONE is not set -# CONFIG_GCC_PLUGIN_STRUCTLEAK_USER is not set -# CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF is not set -CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL=y -# CONFIG_GCC_PLUGIN_STRUCTLEAK_VERBOSE is not set -# CONFIG_GCC_PLUGIN_STACKLEAK is not set -CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y -# CONFIG_INIT_ON_FREE_DEFAULT_ON is not set -# end of Memory initialization -# end of Kernel hardening options -# end of Security options - -CONFIG_XOR_BLOCKS=m -CONFIG_ASYNC_CORE=m -CONFIG_ASYNC_MEMCPY=m -CONFIG_ASYNC_XOR=m -CONFIG_ASYNC_PQ=m -CONFIG_ASYNC_RAID6_RECOV=m -CONFIG_CRYPTO=y - -# -# Crypto core or helper -# -CONFIG_CRYPTO_ALGAPI=y -CONFIG_CRYPTO_ALGAPI2=y -CONFIG_CRYPTO_AEAD=y -CONFIG_CRYPTO_AEAD2=y -CONFIG_CRYPTO_SKCIPHER=y -CONFIG_CRYPTO_SKCIPHER2=y -CONFIG_CRYPTO_HASH=y -CONFIG_CRYPTO_HASH2=y -CONFIG_CRYPTO_RNG=y -CONFIG_CRYPTO_RNG2=y -CONFIG_CRYPTO_RNG_DEFAULT=y -CONFIG_CRYPTO_AKCIPHER2=y 
-CONFIG_CRYPTO_AKCIPHER=y -CONFIG_CRYPTO_KPP2=y -CONFIG_CRYPTO_KPP=y -CONFIG_CRYPTO_ACOMP2=y -CONFIG_CRYPTO_MANAGER=y -CONFIG_CRYPTO_MANAGER2=y -CONFIG_CRYPTO_USER=m -CONFIG_CRYPTO_MANAGER_DISABLE_TESTS=y -CONFIG_CRYPTO_GF128MUL=y -CONFIG_CRYPTO_NULL=y -CONFIG_CRYPTO_NULL2=y -CONFIG_CRYPTO_PCRYPT=m -CONFIG_CRYPTO_CRYPTD=m -CONFIG_CRYPTO_AUTHENC=m -CONFIG_CRYPTO_TEST=m -CONFIG_CRYPTO_SIMD=m -CONFIG_CRYPTO_GLUE_HELPER_X86=m -CONFIG_CRYPTO_ENGINE=m - -# -# Public-key cryptography -# -CONFIG_CRYPTO_RSA=y -CONFIG_CRYPTO_DH=y -CONFIG_CRYPTO_ECC=m -CONFIG_CRYPTO_ECDH=m -CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_CURVE25519=m -CONFIG_CRYPTO_CURVE25519_X86=m - -# -# Authenticated Encryption with Associated Data -# -CONFIG_CRYPTO_CCM=m -CONFIG_CRYPTO_GCM=y -CONFIG_CRYPTO_CHACHA20POLY1305=m -CONFIG_CRYPTO_AEGIS128=m -CONFIG_CRYPTO_AEGIS128_AESNI_SSE2=m -CONFIG_CRYPTO_SEQIV=y -CONFIG_CRYPTO_ECHAINIV=m - -# -# Block modes -# -CONFIG_CRYPTO_CBC=m -CONFIG_CRYPTO_CFB=m -CONFIG_CRYPTO_CTR=y -CONFIG_CRYPTO_CTS=m -CONFIG_CRYPTO_ECB=m -CONFIG_CRYPTO_LRW=m -CONFIG_CRYPTO_OFB=m -CONFIG_CRYPTO_PCBC=m -CONFIG_CRYPTO_XTS=m -CONFIG_CRYPTO_KEYWRAP=m -CONFIG_CRYPTO_NHPOLY1305=m -CONFIG_CRYPTO_NHPOLY1305_SSE2=m -CONFIG_CRYPTO_NHPOLY1305_AVX2=m -CONFIG_CRYPTO_ADIANTUM=m -CONFIG_CRYPTO_ESSIV=m - -# -# Hash modes -# -CONFIG_CRYPTO_CMAC=m -CONFIG_CRYPTO_HMAC=y -CONFIG_CRYPTO_XCBC=m -CONFIG_CRYPTO_VMAC=m - -# -# Digest -# -CONFIG_CRYPTO_CRC32C=m -CONFIG_CRYPTO_CRC32C_INTEL=m -CONFIG_CRYPTO_CRC32=m -CONFIG_CRYPTO_CRC32_PCLMUL=m -CONFIG_CRYPTO_XXHASH=m -CONFIG_CRYPTO_BLAKE2B=m -CONFIG_CRYPTO_BLAKE2S=m -CONFIG_CRYPTO_BLAKE2S_X86=m -CONFIG_CRYPTO_CRCT10DIF=y -CONFIG_CRYPTO_CRCT10DIF_PCLMUL=m -CONFIG_CRYPTO_GHASH=y -CONFIG_CRYPTO_POLY1305=m -CONFIG_CRYPTO_POLY1305_X86_64=m -CONFIG_CRYPTO_MD4=m -CONFIG_CRYPTO_MD5=y -CONFIG_CRYPTO_MICHAEL_MIC=m -CONFIG_CRYPTO_RMD128=m -CONFIG_CRYPTO_RMD160=m -CONFIG_CRYPTO_RMD256=m -CONFIG_CRYPTO_RMD320=m -CONFIG_CRYPTO_SHA1=y -CONFIG_CRYPTO_SHA1_SSSE3=m 
-CONFIG_CRYPTO_SHA256_SSSE3=m -CONFIG_CRYPTO_SHA512_SSSE3=m -CONFIG_CRYPTO_SHA256=y -CONFIG_CRYPTO_SHA512=y -CONFIG_CRYPTO_SHA3=m -CONFIG_CRYPTO_SM3=m -CONFIG_CRYPTO_STREEBOG=m -CONFIG_CRYPTO_TGR192=m -CONFIG_CRYPTO_WP512=m -CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL=m - -# -# Ciphers -# -CONFIG_CRYPTO_AES=y -CONFIG_CRYPTO_AES_TI=m -CONFIG_CRYPTO_AES_NI_INTEL=m -CONFIG_CRYPTO_ANUBIS=m -CONFIG_CRYPTO_ARC4=m -CONFIG_CRYPTO_BLOWFISH=m -CONFIG_CRYPTO_BLOWFISH_COMMON=m -CONFIG_CRYPTO_BLOWFISH_X86_64=m -CONFIG_CRYPTO_CAMELLIA=m -CONFIG_CRYPTO_CAMELLIA_X86_64=m -CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64=m -CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64=m -CONFIG_CRYPTO_CAST_COMMON=m -CONFIG_CRYPTO_CAST5=m -CONFIG_CRYPTO_CAST5_AVX_X86_64=m -CONFIG_CRYPTO_CAST6=m -CONFIG_CRYPTO_CAST6_AVX_X86_64=m -CONFIG_CRYPTO_DES=m -CONFIG_CRYPTO_DES3_EDE_X86_64=m -CONFIG_CRYPTO_FCRYPT=m -CONFIG_CRYPTO_KHAZAD=m -CONFIG_CRYPTO_SALSA20=m -CONFIG_CRYPTO_CHACHA20=m -CONFIG_CRYPTO_CHACHA20_X86_64=m -CONFIG_CRYPTO_SEED=m -CONFIG_CRYPTO_SERPENT=m -CONFIG_CRYPTO_SERPENT_SSE2_X86_64=m -CONFIG_CRYPTO_SERPENT_AVX_X86_64=m -CONFIG_CRYPTO_SERPENT_AVX2_X86_64=m -CONFIG_CRYPTO_SM4=m -CONFIG_CRYPTO_TEA=m -CONFIG_CRYPTO_TWOFISH=m -CONFIG_CRYPTO_TWOFISH_COMMON=m -CONFIG_CRYPTO_TWOFISH_X86_64=m -CONFIG_CRYPTO_TWOFISH_X86_64_3WAY=m -CONFIG_CRYPTO_TWOFISH_AVX_X86_64=m - -# -# Compression -# -CONFIG_CRYPTO_DEFLATE=m -CONFIG_CRYPTO_LZO=m -CONFIG_CRYPTO_842=m -CONFIG_CRYPTO_LZ4=y -CONFIG_CRYPTO_LZ4HC=m -CONFIG_CRYPTO_ZSTD=y - -# -# Random Number Generation -# -CONFIG_CRYPTO_ANSI_CPRNG=m -CONFIG_CRYPTO_DRBG_MENU=y -CONFIG_CRYPTO_DRBG_HMAC=y -CONFIG_CRYPTO_DRBG_HASH=y -CONFIG_CRYPTO_DRBG_CTR=y -CONFIG_CRYPTO_DRBG=y -CONFIG_CRYPTO_JITTERENTROPY=y -CONFIG_CRYPTO_USER_API=m -CONFIG_CRYPTO_USER_API_HASH=m -CONFIG_CRYPTO_USER_API_SKCIPHER=m -CONFIG_CRYPTO_USER_API_RNG=m -CONFIG_CRYPTO_USER_API_AEAD=m -# CONFIG_CRYPTO_STATS is not set -CONFIG_CRYPTO_HASH_INFO=y - -# -# Crypto library routines -# -CONFIG_CRYPTO_LIB_AES=y 
-CONFIG_CRYPTO_LIB_ARC4=m -CONFIG_CRYPTO_ARCH_HAVE_LIB_BLAKE2S=m -CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC=m -CONFIG_CRYPTO_LIB_BLAKE2S=m -CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA=m -CONFIG_CRYPTO_LIB_CHACHA_GENERIC=m -CONFIG_CRYPTO_LIB_CHACHA=m -CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519=m -CONFIG_CRYPTO_LIB_CURVE25519_GENERIC=m -CONFIG_CRYPTO_LIB_CURVE25519=m -CONFIG_CRYPTO_LIB_DES=m -CONFIG_CRYPTO_LIB_POLY1305_RSIZE=11 -CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305=m -CONFIG_CRYPTO_LIB_POLY1305_GENERIC=m -CONFIG_CRYPTO_LIB_POLY1305=m -CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m -CONFIG_CRYPTO_LIB_SHA256=y -CONFIG_CRYPTO_HW=y -CONFIG_CRYPTO_DEV_PADLOCK=m -CONFIG_CRYPTO_DEV_PADLOCK_AES=m -CONFIG_CRYPTO_DEV_PADLOCK_SHA=m -CONFIG_CRYPTO_DEV_ATMEL_I2C=m -CONFIG_CRYPTO_DEV_ATMEL_ECC=m -CONFIG_CRYPTO_DEV_ATMEL_SHA204A=m -CONFIG_CRYPTO_DEV_CCP=y -CONFIG_CRYPTO_DEV_CCP_DD=m -CONFIG_CRYPTO_DEV_SP_CCP=y -CONFIG_CRYPTO_DEV_CCP_CRYPTO=m -CONFIG_CRYPTO_DEV_SP_PSP=y -CONFIG_CRYPTO_DEV_CCP_DEBUGFS=y -CONFIG_CRYPTO_DEV_QAT=m -CONFIG_CRYPTO_DEV_QAT_DH895xCC=m -CONFIG_CRYPTO_DEV_QAT_C3XXX=m -CONFIG_CRYPTO_DEV_QAT_C62X=m -CONFIG_CRYPTO_DEV_QAT_DH895xCCVF=m -CONFIG_CRYPTO_DEV_QAT_C3XXXVF=m -CONFIG_CRYPTO_DEV_QAT_C62XVF=m -CONFIG_CRYPTO_DEV_NITROX=m -CONFIG_CRYPTO_DEV_NITROX_CNN55XX=m -CONFIG_CRYPTO_DEV_CHELSIO=m -CONFIG_CHELSIO_IPSEC_INLINE=y -CONFIG_CHELSIO_TLS_DEVICE=y -CONFIG_CRYPTO_DEV_VIRTIO=m -CONFIG_CRYPTO_DEV_SAFEXCEL=m -CONFIG_CRYPTO_DEV_CCREE=m -CONFIG_CRYPTO_DEV_AMLOGIC_GXL=m -CONFIG_CRYPTO_DEV_AMLOGIC_GXL_DEBUG=y -CONFIG_ASYMMETRIC_KEY_TYPE=y -CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y -CONFIG_ASYMMETRIC_TPM_KEY_SUBTYPE=m -CONFIG_X509_CERTIFICATE_PARSER=y -CONFIG_PKCS8_PRIVATE_KEY_PARSER=m -CONFIG_TPM_KEY_PARSER=m -CONFIG_PKCS7_MESSAGE_PARSER=y -# CONFIG_PKCS7_TEST_KEY is not set -CONFIG_SIGNED_PE_FILE_VERIFICATION=y - -# -# Certificates for signature checking -# -CONFIG_MODULE_SIG_KEY="certs/signing_key.pem" -CONFIG_SYSTEM_TRUSTED_KEYRING=y -CONFIG_SYSTEM_TRUSTED_KEYS="" -# CONFIG_SYSTEM_EXTRA_CERTIFICATE 
is not set -CONFIG_SECONDARY_TRUSTED_KEYRING=y -CONFIG_SYSTEM_BLACKLIST_KEYRING=y -CONFIG_SYSTEM_BLACKLIST_HASH_LIST="" -# end of Certificates for signature checking - -CONFIG_BINARY_PRINTF=y - -# -# Library routines -# -CONFIG_RAID6_PQ=m -CONFIG_RAID6_PQ_BENCHMARK=y -CONFIG_PACKING=y -CONFIG_BITREVERSE=y -CONFIG_GENERIC_STRNCPY_FROM_USER=y -CONFIG_GENERIC_STRNLEN_USER=y -CONFIG_GENERIC_NET_UTILS=y -CONFIG_GENERIC_FIND_FIRST_BIT=y -CONFIG_CORDIC=m -CONFIG_RATIONAL=y -CONFIG_GENERIC_PCI_IOMAP=y -CONFIG_GENERIC_IOMAP=y -CONFIG_ARCH_USE_CMPXCHG_LOCKREF=y -CONFIG_ARCH_HAS_FAST_MULTIPLIER=y -CONFIG_CRC_CCITT=y -CONFIG_CRC16=m -CONFIG_CRC_T10DIF=y -CONFIG_CRC_ITU_T=m -CONFIG_CRC32=y -# CONFIG_CRC32_SELFTEST is not set -CONFIG_CRC32_SLICEBY8=y -# CONFIG_CRC32_SLICEBY4 is not set -# CONFIG_CRC32_SARWATE is not set -# CONFIG_CRC32_BIT is not set -CONFIG_CRC64=m -CONFIG_CRC4=m -CONFIG_CRC7=m -CONFIG_LIBCRC32C=m -CONFIG_CRC8=m -CONFIG_XXHASH=y -# CONFIG_RANDOM32_SELFTEST is not set -CONFIG_842_COMPRESS=m -CONFIG_842_DECOMPRESS=m -CONFIG_ZLIB_INFLATE=y -CONFIG_ZLIB_DEFLATE=y -CONFIG_LZO_COMPRESS=y -CONFIG_LZO_DECOMPRESS=y -CONFIG_LZ4_COMPRESS=y -CONFIG_LZ4HC_COMPRESS=m -CONFIG_LZ4_DECOMPRESS=y -CONFIG_ZSTD_COMPRESS=y -CONFIG_ZSTD_DECOMPRESS=y -CONFIG_XZ_DEC=y -CONFIG_XZ_DEC_X86=y -CONFIG_XZ_DEC_POWERPC=y -CONFIG_XZ_DEC_IA64=y -CONFIG_XZ_DEC_ARM=y -CONFIG_XZ_DEC_ARMTHUMB=y -CONFIG_XZ_DEC_SPARC=y -CONFIG_XZ_DEC_BCJ=y -# CONFIG_XZ_DEC_TEST is not set -CONFIG_DECOMPRESS_GZIP=y -CONFIG_DECOMPRESS_BZIP2=y -CONFIG_DECOMPRESS_LZMA=y -CONFIG_DECOMPRESS_XZ=y -CONFIG_DECOMPRESS_LZO=y -CONFIG_DECOMPRESS_LZ4=y -CONFIG_GENERIC_ALLOCATOR=y -CONFIG_REED_SOLOMON=y -CONFIG_REED_SOLOMON_ENC8=y -CONFIG_REED_SOLOMON_DEC8=y -CONFIG_REED_SOLOMON_DEC16=y -CONFIG_BCH=m -CONFIG_TEXTSEARCH=y -CONFIG_TEXTSEARCH_KMP=m -CONFIG_TEXTSEARCH_BM=m -CONFIG_TEXTSEARCH_FSM=m -CONFIG_BTREE=y -CONFIG_INTERVAL_TREE=y -CONFIG_XARRAY_MULTI=y -CONFIG_ASSOCIATIVE_ARRAY=y -CONFIG_HAS_IOMEM=y -CONFIG_HAS_IOPORT_MAP=y 
-CONFIG_HAS_DMA=y -CONFIG_NEED_SG_DMA_LENGTH=y -CONFIG_NEED_DMA_MAP_STATE=y -CONFIG_ARCH_DMA_ADDR_T_64BIT=y -CONFIG_ARCH_HAS_FORCE_DMA_UNENCRYPTED=y -CONFIG_DMA_VIRT_OPS=y -CONFIG_SWIOTLB=y -# CONFIG_DMA_API_DEBUG is not set -CONFIG_SGL_ALLOC=y -CONFIG_IOMMU_HELPER=y -CONFIG_CHECK_SIGNATURE=y -CONFIG_CPU_RMAP=y -CONFIG_DQL=y -CONFIG_GLOB=y -# CONFIG_GLOB_SELFTEST is not set -CONFIG_NLATTR=y -CONFIG_LRU_CACHE=m -CONFIG_CLZ_TAB=y -CONFIG_IRQ_POLL=y -CONFIG_MPILIB=y -CONFIG_DIMLIB=y -CONFIG_LIBFDT=y -CONFIG_OID_REGISTRY=y -CONFIG_UCS2_STRING=y -CONFIG_HAVE_GENERIC_VDSO=y -CONFIG_GENERIC_GETTIMEOFDAY=y -CONFIG_GENERIC_VDSO_TIME_NS=y -CONFIG_FONT_SUPPORT=y -CONFIG_FONTS=y -# CONFIG_FONT_8x8 is not set -CONFIG_FONT_8x16=y -# CONFIG_FONT_6x11 is not set -# CONFIG_FONT_7x14 is not set -# CONFIG_FONT_PEARL_8x8 is not set -# CONFIG_FONT_ACORN_8x8 is not set -# CONFIG_FONT_MINI_4x6 is not set -# CONFIG_FONT_6x10 is not set -# CONFIG_FONT_10x18 is not set -# CONFIG_FONT_SUN8x16 is not set -# CONFIG_FONT_SUN12x22 is not set -CONFIG_FONT_TER16x32=y -CONFIG_SG_POOL=y -CONFIG_ARCH_HAS_PMEM_API=y -CONFIG_MEMREGION=y -CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE=y -CONFIG_ARCH_HAS_UACCESS_MCSAFE=y -CONFIG_ARCH_STACKWALK=y -CONFIG_SBITMAP=y -CONFIG_PARMAN=m -CONFIG_OBJAGG=m -# CONFIG_STRING_SELFTEST is not set -# end of Library routines - -# -# Kernel hacking -# - -# -# printk and dmesg options -# -CONFIG_PRINTK_TIME=y -# CONFIG_PRINTK_CALLER is not set -CONFIG_CONSOLE_LOGLEVEL_DEFAULT=4 -CONFIG_CONSOLE_LOGLEVEL_QUIET=1 -CONFIG_MESSAGE_LOGLEVEL_DEFAULT=4 -# CONFIG_BOOT_PRINTK_DELAY is not set -CONFIG_DYNAMIC_DEBUG=y -CONFIG_SYMBOLIC_ERRNAME=y -CONFIG_DEBUG_BUGVERBOSE=y -# end of printk and dmesg options - -# -# Compile-time checks and compiler options -# -CONFIG_DEBUG_INFO=y -# CONFIG_DEBUG_INFO_REDUCED is not set -# CONFIG_DEBUG_INFO_SPLIT is not set -CONFIG_DEBUG_INFO_DWARF4=y -CONFIG_DEBUG_INFO_BTF=y -# CONFIG_GDB_SCRIPTS is not set -# CONFIG_ENABLE_MUST_CHECK is not set 
-CONFIG_FRAME_WARN=2048 -CONFIG_STRIP_ASM_SYMS=y -# CONFIG_READABLE_ASM is not set -# CONFIG_HEADERS_INSTALL is not set -# CONFIG_DEBUG_SECTION_MISMATCH is not set -CONFIG_SECTION_MISMATCH_WARN_ONLY=y -CONFIG_STACK_VALIDATION=y -# CONFIG_DEBUG_FORCE_WEAK_PER_CPU is not set -# end of Compile-time checks and compiler options - -# -# Generic Kernel Debugging Instruments -# -CONFIG_MAGIC_SYSRQ=y -CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE=0x0 -CONFIG_MAGIC_SYSRQ_SERIAL=y -CONFIG_MAGIC_SYSRQ_SERIAL_SEQUENCE="" -CONFIG_DEBUG_FS=y -CONFIG_HAVE_ARCH_KGDB=y -# CONFIG_KGDB is not set -CONFIG_ARCH_HAS_UBSAN_SANITIZE_ALL=y -# CONFIG_UBSAN is not set -# end of Generic Kernel Debugging Instruments - -CONFIG_DEBUG_KERNEL=y -CONFIG_DEBUG_MISC=y - -# -# Memory Debugging -# -# CONFIG_PAGE_EXTENSION is not set -# CONFIG_DEBUG_PAGEALLOC is not set -# CONFIG_PAGE_OWNER is not set -CONFIG_PAGE_POISONING=y -CONFIG_PAGE_POISONING_NO_SANITY=y -CONFIG_PAGE_POISONING_ZERO=y -# CONFIG_DEBUG_PAGE_REF is not set -# CONFIG_DEBUG_RODATA_TEST is not set -CONFIG_GENERIC_PTDUMP=y -CONFIG_PTDUMP_CORE=y -# CONFIG_PTDUMP_DEBUGFS is not set -# CONFIG_DEBUG_OBJECTS is not set -# CONFIG_SLUB_DEBUG_ON is not set -# CONFIG_SLUB_STATS is not set -CONFIG_HAVE_DEBUG_KMEMLEAK=y -# CONFIG_DEBUG_KMEMLEAK is not set -# CONFIG_DEBUG_STACK_USAGE is not set -CONFIG_SCHED_STACK_END_CHECK=y -# CONFIG_DEBUG_VM is not set -CONFIG_ARCH_HAS_DEBUG_VIRTUAL=y -# CONFIG_DEBUG_VIRTUAL is not set -CONFIG_DEBUG_MEMORY_INIT=y -# CONFIG_DEBUG_PER_CPU_MAPS is not set -CONFIG_HAVE_ARCH_KASAN=y -CONFIG_HAVE_ARCH_KASAN_VMALLOC=y -CONFIG_CC_HAS_KASAN_GENERIC=y -# CONFIG_KASAN is not set -CONFIG_KASAN_STACK=1 -# end of Memory Debugging - -# CONFIG_DEBUG_SHIRQ is not set - -# -# Debug Oops, Lockups and Hangs -# -# CONFIG_PANIC_ON_OOPS is not set -CONFIG_PANIC_ON_OOPS_VALUE=0 -CONFIG_PANIC_TIMEOUT=0 -CONFIG_LOCKUP_DETECTOR=y -CONFIG_SOFTLOCKUP_DETECTOR=y -# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set -CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0 
-CONFIG_HARDLOCKUP_DETECTOR_PERF=y -CONFIG_HARDLOCKUP_CHECK_TIMESTAMP=y -CONFIG_HARDLOCKUP_DETECTOR=y -# CONFIG_BOOTPARAM_HARDLOCKUP_PANIC is not set -CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE=0 -CONFIG_DETECT_HUNG_TASK=y -CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=120 -# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set -CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0 -# CONFIG_WQ_WATCHDOG is not set -# CONFIG_TEST_LOCKUP is not set -# end of Debug Oops, Lockups and Hangs - -# -# Scheduler Debugging -# -CONFIG_SCHED_DEBUG=y -CONFIG_SCHED_INFO=y -CONFIG_SCHEDSTATS=y -# end of Scheduler Debugging - -# CONFIG_DEBUG_TIMEKEEPING is not set -CONFIG_DEBUG_PREEMPT=y - -# -# Lock Debugging (spinlocks, mutexes, etc...) -# -CONFIG_LOCK_DEBUGGING_SUPPORT=y -# CONFIG_PROVE_LOCKING is not set -# CONFIG_LOCK_STAT is not set -# CONFIG_DEBUG_RT_MUTEXES is not set -# CONFIG_DEBUG_SPINLOCK is not set -# CONFIG_DEBUG_MUTEXES is not set -# CONFIG_DEBUG_WW_MUTEX_SLOWPATH is not set -# CONFIG_DEBUG_RWSEMS is not set -# CONFIG_DEBUG_LOCK_ALLOC is not set -# CONFIG_DEBUG_ATOMIC_SLEEP is not set -# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set -# CONFIG_LOCK_TORTURE_TEST is not set -# CONFIG_WW_MUTEX_SELFTEST is not set -# end of Lock Debugging (spinlocks, mutexes, etc...) 
- -CONFIG_STACKTRACE=y -# CONFIG_WARN_ALL_UNSEEDED_RANDOM is not set -# CONFIG_DEBUG_KOBJECT is not set - -# -# Debug kernel data structures -# -# CONFIG_DEBUG_LIST is not set -# CONFIG_DEBUG_PLIST is not set -# CONFIG_DEBUG_SG is not set -# CONFIG_DEBUG_NOTIFIERS is not set -# CONFIG_BUG_ON_DATA_CORRUPTION is not set -# end of Debug kernel data structures - -# CONFIG_DEBUG_CREDENTIALS is not set - -# -# RCU Debugging -# -# CONFIG_RCU_PERF_TEST is not set -# CONFIG_RCU_TORTURE_TEST is not set -CONFIG_RCU_CPU_STALL_TIMEOUT=60 -# CONFIG_RCU_TRACE is not set -# CONFIG_RCU_EQS_DEBUG is not set -# end of RCU Debugging - -# CONFIG_DEBUG_WQ_FORCE_RR_CPU is not set -# CONFIG_DEBUG_BLOCK_EXT_DEVT is not set -# CONFIG_CPU_HOTPLUG_STATE_CONTROL is not set -CONFIG_LATENCYTOP=y -CONFIG_USER_STACKTRACE_SUPPORT=y -CONFIG_NOP_TRACER=y -CONFIG_HAVE_FUNCTION_TRACER=y -CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y -CONFIG_HAVE_DYNAMIC_FTRACE=y -CONFIG_HAVE_DYNAMIC_FTRACE_WITH_REGS=y -CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS=y -CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y -CONFIG_HAVE_SYSCALL_TRACEPOINTS=y -CONFIG_HAVE_FENTRY=y -CONFIG_HAVE_C_RECORDMCOUNT=y -CONFIG_TRACER_MAX_TRACE=y -CONFIG_TRACE_CLOCK=y -CONFIG_RING_BUFFER=y -CONFIG_EVENT_TRACING=y -CONFIG_CONTEXT_SWITCH_TRACER=y -CONFIG_RING_BUFFER_ALLOW_SWAP=y -CONFIG_TRACING=y -CONFIG_GENERIC_TRACER=y -CONFIG_TRACING_SUPPORT=y -CONFIG_FTRACE=y -# CONFIG_BOOTTIME_TRACING is not set -CONFIG_FUNCTION_TRACER=y -CONFIG_FUNCTION_GRAPH_TRACER=y -CONFIG_DYNAMIC_FTRACE=y -CONFIG_DYNAMIC_FTRACE_WITH_REGS=y -CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS=y -CONFIG_FUNCTION_PROFILER=y -CONFIG_STACK_TRACER=y -# CONFIG_PREEMPTIRQ_EVENTS is not set -# CONFIG_IRQSOFF_TRACER is not set -# CONFIG_PREEMPT_TRACER is not set -CONFIG_SCHED_TRACER=y -CONFIG_HWLAT_TRACER=y -CONFIG_MMIOTRACE=y -CONFIG_FTRACE_SYSCALLS=y -CONFIG_TRACER_SNAPSHOT=y -# CONFIG_TRACER_SNAPSHOT_PER_CPU_SWAP is not set -CONFIG_BRANCH_PROFILE_NONE=y -# CONFIG_PROFILE_ANNOTATED_BRANCHES is not set 
-CONFIG_BLK_DEV_IO_TRACE=y -CONFIG_KPROBE_EVENTS=y -# CONFIG_KPROBE_EVENTS_ON_NOTRACE is not set -CONFIG_UPROBE_EVENTS=y -CONFIG_BPF_EVENTS=y -CONFIG_DYNAMIC_EVENTS=y -CONFIG_PROBE_EVENTS=y -CONFIG_BPF_KPROBE_OVERRIDE=y -CONFIG_FTRACE_MCOUNT_RECORD=y -CONFIG_TRACING_MAP=y -CONFIG_HIST_TRIGGERS=y -# CONFIG_TRACE_EVENT_INJECT is not set -# CONFIG_TRACEPOINT_BENCHMARK is not set -# CONFIG_RING_BUFFER_BENCHMARK is not set -# CONFIG_TRACE_EVAL_MAP_FILE is not set -# CONFIG_FTRACE_STARTUP_TEST is not set -# CONFIG_RING_BUFFER_STARTUP_TEST is not set -# CONFIG_MMIOTRACE_TEST is not set -# CONFIG_PREEMPTIRQ_DELAY_TEST is not set -# CONFIG_SYNTH_EVENT_GEN_TEST is not set -# CONFIG_KPROBE_EVENT_GEN_TEST is not set -# CONFIG_PROVIDE_OHCI1394_DMA_INIT is not set -# CONFIG_SAMPLES is not set -CONFIG_ARCH_HAS_DEVMEM_IS_ALLOWED=y -CONFIG_STRICT_DEVMEM=y -CONFIG_IO_STRICT_DEVMEM=y - -# -# x86 Debugging -# -CONFIG_TRACE_IRQFLAGS_SUPPORT=y -# CONFIG_X86_VERBOSE_BOOTUP is not set -CONFIG_EARLY_PRINTK=y -# CONFIG_EARLY_PRINTK_DBGP is not set -# CONFIG_EARLY_PRINTK_USB_XDBC is not set -# CONFIG_EFI_PGT_DUMP is not set -CONFIG_DEBUG_WX=y -CONFIG_DOUBLEFAULT=y -# CONFIG_DEBUG_TLBFLUSH is not set -# CONFIG_IOMMU_DEBUG is not set -CONFIG_HAVE_MMIOTRACE_SUPPORT=y -# CONFIG_X86_DECODER_SELFTEST is not set -CONFIG_IO_DELAY_0X80=y -# CONFIG_IO_DELAY_0XED is not set -# CONFIG_IO_DELAY_UDELAY is not set -# CONFIG_IO_DELAY_NONE is not set -CONFIG_DEBUG_BOOT_PARAMS=y -# CONFIG_CPA_DEBUG is not set -# CONFIG_DEBUG_ENTRY is not set -# CONFIG_DEBUG_NMI_SELFTEST is not set -# CONFIG_X86_DEBUG_FPU is not set -# CONFIG_PUNIT_ATOM_DEBUG is not set -CONFIG_UNWINDER_ORC=y -# CONFIG_UNWINDER_FRAME_POINTER is not set -# CONFIG_UNWINDER_GUESS is not set -# end of x86 Debugging - -# -# Kernel Testing and Coverage -# -# CONFIG_KUNIT is not set -# CONFIG_NOTIFIER_ERROR_INJECTION is not set -CONFIG_FUNCTION_ERROR_INJECTION=y -# CONFIG_FAULT_INJECTION is not set -CONFIG_ARCH_HAS_KCOV=y 
-CONFIG_CC_HAS_SANCOV_TRACE_PC=y -# CONFIG_KCOV is not set -CONFIG_RUNTIME_TESTING_MENU=y -CONFIG_LKDTM=m -# CONFIG_TEST_LIST_SORT is not set -# CONFIG_TEST_MIN_HEAP is not set -# CONFIG_TEST_SORT is not set -# CONFIG_KPROBES_SANITY_TEST is not set -# CONFIG_BACKTRACE_SELF_TEST is not set -# CONFIG_RBTREE_TEST is not set -# CONFIG_REED_SOLOMON_TEST is not set -# CONFIG_INTERVAL_TREE_TEST is not set -# CONFIG_PERCPU_TEST is not set -# CONFIG_ATOMIC64_SELFTEST is not set -# CONFIG_ASYNC_RAID6_TEST is not set -# CONFIG_TEST_HEXDUMP is not set -# CONFIG_TEST_STRING_HELPERS is not set -# CONFIG_TEST_STRSCPY is not set -# CONFIG_TEST_KSTRTOX is not set -# CONFIG_TEST_PRINTF is not set -# CONFIG_TEST_BITMAP is not set -# CONFIG_TEST_BITFIELD is not set -# CONFIG_TEST_UUID is not set -# CONFIG_TEST_XARRAY is not set -# CONFIG_TEST_OVERFLOW is not set -# CONFIG_TEST_RHASHTABLE is not set -# CONFIG_TEST_HASH is not set -# CONFIG_TEST_IDA is not set -# CONFIG_TEST_PARMAN is not set -# CONFIG_TEST_LKM is not set -# CONFIG_TEST_VMALLOC is not set -# CONFIG_TEST_USER_COPY is not set -# CONFIG_TEST_BPF is not set -# CONFIG_TEST_BLACKHOLE_DEV is not set -# CONFIG_FIND_BIT_BENCHMARK is not set -# CONFIG_TEST_FIRMWARE is not set -# CONFIG_TEST_SYSCTL is not set -# CONFIG_TEST_UDELAY is not set -# CONFIG_TEST_STATIC_KEYS is not set -# CONFIG_TEST_KMOD is not set -# CONFIG_TEST_MEMCAT_P is not set -# CONFIG_TEST_OBJAGG is not set -# CONFIG_TEST_STACKINIT is not set -# CONFIG_TEST_MEMINIT is not set -# CONFIG_MEMTEST is not set -# CONFIG_HYPERV_TESTING is not set -# end of Kernel Testing and Coverage -# end of Kernel hacking diff --git a/linux57-tkg/linux57-tkg-config/config_hardened.x86_64 b/linux57-tkg/linux57-tkg-config/config_hardened.x86_64 deleted file mode 100644 index 105f167..0000000 --- a/linux57-tkg/linux57-tkg-config/config_hardened.x86_64 +++ /dev/null @@ -1,10839 +0,0 @@ -# -# Automatically generated file; DO NOT EDIT. 
-# Linux/x86 5.7.8 Kernel Configuration -# - -# -# Compiler: gcc (GCC) 10.1.0 -# -CONFIG_CC_IS_GCC=y -CONFIG_GCC_VERSION=100100 -CONFIG_LD_VERSION=234000000 -CONFIG_CLANG_VERSION=0 -CONFIG_CC_CAN_LINK=y -CONFIG_CC_HAS_ASM_GOTO=y -CONFIG_CC_HAS_ASM_INLINE=y -CONFIG_IRQ_WORK=y -CONFIG_BUILDTIME_TABLE_SORT=y -CONFIG_THREAD_INFO_IN_TASK=y - -# -# General setup -# -CONFIG_INIT_ENV_ARG_LIMIT=32 -# CONFIG_COMPILE_TEST is not set -CONFIG_LOCALVERSION="" -CONFIG_LOCALVERSION_AUTO=y -CONFIG_BUILD_SALT="" -CONFIG_HAVE_KERNEL_GZIP=y -CONFIG_HAVE_KERNEL_BZIP2=y -CONFIG_HAVE_KERNEL_LZMA=y -CONFIG_HAVE_KERNEL_XZ=y -CONFIG_HAVE_KERNEL_LZO=y -CONFIG_HAVE_KERNEL_LZ4=y -# CONFIG_KERNEL_GZIP is not set -# CONFIG_KERNEL_BZIP2 is not set -# CONFIG_KERNEL_LZMA is not set -CONFIG_KERNEL_XZ=y -# CONFIG_KERNEL_LZO is not set -# CONFIG_KERNEL_LZ4 is not set -CONFIG_DEFAULT_HOSTNAME="archlinux" -CONFIG_SWAP=y -CONFIG_SYSVIPC=y -CONFIG_SYSVIPC_SYSCTL=y -CONFIG_POSIX_MQUEUE=y -CONFIG_POSIX_MQUEUE_SYSCTL=y -CONFIG_CROSS_MEMORY_ATTACH=y -# CONFIG_USELIB is not set -CONFIG_AUDIT=y -CONFIG_HAVE_ARCH_AUDITSYSCALL=y -CONFIG_AUDITSYSCALL=y - -# -# IRQ subsystem -# -CONFIG_GENERIC_IRQ_PROBE=y -CONFIG_GENERIC_IRQ_SHOW=y -CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK=y -CONFIG_GENERIC_PENDING_IRQ=y -CONFIG_GENERIC_IRQ_MIGRATION=y -CONFIG_HARDIRQS_SW_RESEND=y -CONFIG_GENERIC_IRQ_CHIP=y -CONFIG_IRQ_DOMAIN=y -CONFIG_IRQ_SIM=y -CONFIG_IRQ_DOMAIN_HIERARCHY=y -CONFIG_GENERIC_MSI_IRQ=y -CONFIG_GENERIC_MSI_IRQ_DOMAIN=y -CONFIG_IRQ_MSI_IOMMU=y -CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR=y -CONFIG_GENERIC_IRQ_RESERVATION_MODE=y -CONFIG_IRQ_FORCED_THREADING=y -CONFIG_SPARSE_IRQ=y -# CONFIG_GENERIC_IRQ_DEBUGFS is not set -# end of IRQ subsystem - -CONFIG_CLOCKSOURCE_WATCHDOG=y -CONFIG_ARCH_CLOCKSOURCE_INIT=y -CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE=y -CONFIG_GENERIC_TIME_VSYSCALL=y -CONFIG_GENERIC_CLOCKEVENTS=y -CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y -CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST=y -CONFIG_GENERIC_CMOS_UPDATE=y - -# -# 
Timers subsystem -# -CONFIG_TICK_ONESHOT=y -CONFIG_NO_HZ_COMMON=y -# CONFIG_HZ_PERIODIC is not set -CONFIG_NO_HZ_IDLE=y -# CONFIG_NO_HZ_FULL is not set -CONFIG_NO_HZ=y -CONFIG_HIGH_RES_TIMERS=y -# end of Timers subsystem - -# CONFIG_PREEMPT_NONE is not set -# CONFIG_PREEMPT_VOLUNTARY is not set -CONFIG_PREEMPT=y -CONFIG_PREEMPT_COUNT=y -CONFIG_PREEMPTION=y - -# -# CPU/Task time and stats accounting -# -CONFIG_TICK_CPU_ACCOUNTING=y -# CONFIG_VIRT_CPU_ACCOUNTING_GEN is not set -CONFIG_IRQ_TIME_ACCOUNTING=y -CONFIG_HAVE_SCHED_AVG_IRQ=y -# CONFIG_SCHED_THERMAL_PRESSURE is not set -CONFIG_BSD_PROCESS_ACCT=y -CONFIG_BSD_PROCESS_ACCT_V3=y -CONFIG_TASKSTATS=y -CONFIG_TASK_DELAY_ACCT=y -CONFIG_TASK_XACCT=y -CONFIG_TASK_IO_ACCOUNTING=y -CONFIG_PSI=y -# CONFIG_PSI_DEFAULT_DISABLED is not set -# end of CPU/Task time and stats accounting - -CONFIG_CPU_ISOLATION=y - -# -# RCU Subsystem -# -CONFIG_TREE_RCU=y -CONFIG_PREEMPT_RCU=y -CONFIG_RCU_EXPERT=y -CONFIG_SRCU=y -CONFIG_TREE_SRCU=y -CONFIG_TASKS_RCU=y -CONFIG_RCU_STALL_COMMON=y -CONFIG_RCU_NEED_SEGCBLIST=y -CONFIG_RCU_FANOUT=64 -CONFIG_RCU_FANOUT_LEAF=16 -CONFIG_RCU_FAST_NO_HZ=y -CONFIG_RCU_BOOST=y -CONFIG_RCU_BOOST_DELAY=500 -# CONFIG_RCU_NOCB_CPU is not set -# end of RCU Subsystem - -CONFIG_BUILD_BIN2C=y -CONFIG_IKCONFIG=y -CONFIG_IKCONFIG_PROC=y -# CONFIG_IKHEADERS is not set -CONFIG_LOG_BUF_SHIFT=17 -CONFIG_LOG_CPU_MAX_BUF_SHIFT=12 -CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT=13 -CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y - -# -# Scheduler features -# -CONFIG_UCLAMP_TASK=y -CONFIG_UCLAMP_BUCKETS_COUNT=5 -# end of Scheduler features - -CONFIG_ARCH_SUPPORTS_NUMA_BALANCING=y -CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH=y -CONFIG_CC_HAS_INT128=y -CONFIG_ARCH_SUPPORTS_INT128=y -CONFIG_NUMA_BALANCING=y -CONFIG_NUMA_BALANCING_DEFAULT_ENABLED=y -CONFIG_CGROUPS=y -CONFIG_PAGE_COUNTER=y -CONFIG_MEMCG=y -CONFIG_MEMCG_SWAP=y -CONFIG_MEMCG_SWAP_ENABLED=y -CONFIG_MEMCG_KMEM=y -CONFIG_BLK_CGROUP=y -CONFIG_CGROUP_WRITEBACK=y -CONFIG_CGROUP_SCHED=y 
-CONFIG_FAIR_GROUP_SCHED=y -CONFIG_CFS_BANDWIDTH=y -# CONFIG_RT_GROUP_SCHED is not set -CONFIG_UCLAMP_TASK_GROUP=y -CONFIG_CGROUP_PIDS=y -CONFIG_CGROUP_RDMA=y -CONFIG_CGROUP_FREEZER=y -CONFIG_CGROUP_HUGETLB=y -CONFIG_CPUSETS=y -CONFIG_PROC_PID_CPUSET=y -CONFIG_CGROUP_DEVICE=y -CONFIG_CGROUP_CPUACCT=y -CONFIG_CGROUP_PERF=y -CONFIG_CGROUP_BPF=y -# CONFIG_CGROUP_DEBUG is not set -CONFIG_SOCK_CGROUP_DATA=y -CONFIG_NAMESPACES=y -CONFIG_UTS_NS=y -CONFIG_TIME_NS=y -CONFIG_IPC_NS=y -CONFIG_USER_NS=y -# CONFIG_USER_NS_UNPRIVILEGED is not set -CONFIG_PID_NS=y -CONFIG_NET_NS=y -# CONFIG_CHECKPOINT_RESTORE is not set -CONFIG_SCHED_AUTOGROUP=y -# CONFIG_SYSFS_DEPRECATED is not set -CONFIG_RELAY=y -CONFIG_BLK_DEV_INITRD=y -CONFIG_INITRAMFS_SOURCE="" -CONFIG_RD_GZIP=y -CONFIG_RD_BZIP2=y -CONFIG_RD_LZMA=y -CONFIG_RD_XZ=y -CONFIG_RD_LZO=y -CONFIG_RD_LZ4=y -CONFIG_BOOT_CONFIG=y -CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y -# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set -CONFIG_SYSCTL=y -CONFIG_HAVE_UID16=y -CONFIG_SYSCTL_EXCEPTION_TRACE=y -CONFIG_HAVE_PCSPKR_PLATFORM=y -CONFIG_BPF=y -CONFIG_EXPERT=y -# CONFIG_UID16 is not set -CONFIG_MULTIUSER=y -CONFIG_SGETMASK_SYSCALL=y -# CONFIG_SYSFS_SYSCALL is not set -CONFIG_FHANDLE=y -CONFIG_POSIX_TIMERS=y -CONFIG_PRINTK=y -CONFIG_PRINTK_NMI=y -CONFIG_BUG=y -CONFIG_ELF_CORE=y -CONFIG_PCSPKR_PLATFORM=y -CONFIG_BASE_FULL=y -CONFIG_FUTEX=y -CONFIG_FUTEX_PI=y -CONFIG_EPOLL=y -CONFIG_SIGNALFD=y -CONFIG_TIMERFD=y -CONFIG_EVENTFD=y -CONFIG_SHMEM=y -CONFIG_AIO=y -CONFIG_IO_URING=y -CONFIG_ADVISE_SYSCALLS=y -CONFIG_MEMBARRIER=y -CONFIG_KALLSYMS=y -CONFIG_KALLSYMS_ALL=y -CONFIG_KALLSYMS_ABSOLUTE_PERCPU=y -CONFIG_KALLSYMS_BASE_RELATIVE=y -CONFIG_BPF_LSM=y -CONFIG_BPF_SYSCALL=y -CONFIG_ARCH_WANT_DEFAULT_BPF_JIT=y -CONFIG_BPF_JIT_ALWAYS_ON=y -CONFIG_BPF_JIT_DEFAULT_ON=y -# CONFIG_USERFAULTFD is not set -CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE=y -CONFIG_RSEQ=y -# CONFIG_DEBUG_RSEQ is not set -# CONFIG_EMBEDDED is not set -CONFIG_HAVE_PERF_EVENTS=y -# CONFIG_PC104 is not 
set - -# -# Kernel Performance Events And Counters -# -CONFIG_PERF_EVENTS=y -# CONFIG_DEBUG_PERF_USE_VMALLOC is not set -# end of Kernel Performance Events And Counters - -CONFIG_VM_EVENT_COUNTERS=y -CONFIG_SLUB_DEBUG=y -# CONFIG_SLUB_MEMCG_SYSFS_ON is not set -# CONFIG_COMPAT_BRK is not set -# CONFIG_SLAB is not set -CONFIG_SLUB=y -# CONFIG_SLOB is not set -# CONFIG_SLAB_MERGE_DEFAULT is not set -CONFIG_SLAB_FREELIST_RANDOM=y -CONFIG_SLAB_FREELIST_HARDENED=y -CONFIG_SLAB_CANARY=y -CONFIG_SHUFFLE_PAGE_ALLOCATOR=y -CONFIG_SLUB_CPU_PARTIAL=y -CONFIG_SYSTEM_DATA_VERIFICATION=y -CONFIG_PROFILING=y -CONFIG_TRACEPOINTS=y -# end of General setup - -CONFIG_64BIT=y -CONFIG_X86_64=y -CONFIG_X86=y -CONFIG_INSTRUCTION_DECODER=y -CONFIG_OUTPUT_FORMAT="elf64-x86-64" -CONFIG_LOCKDEP_SUPPORT=y -CONFIG_STACKTRACE_SUPPORT=y -CONFIG_MMU=y -CONFIG_ARCH_MMAP_RND_BITS_MIN=28 -CONFIG_ARCH_MMAP_RND_BITS_MAX=32 -CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN=8 -CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX=16 -CONFIG_GENERIC_ISA_DMA=y -CONFIG_GENERIC_BUG=y -CONFIG_GENERIC_BUG_RELATIVE_POINTERS=y -CONFIG_ARCH_MAY_HAVE_PC_FDC=y -CONFIG_GENERIC_CALIBRATE_DELAY=y -CONFIG_ARCH_HAS_CPU_RELAX=y -CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y -CONFIG_ARCH_HAS_FILTER_PGPROT=y -CONFIG_HAVE_SETUP_PER_CPU_AREA=y -CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y -CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y -CONFIG_ARCH_HIBERNATION_POSSIBLE=y -CONFIG_ARCH_SUSPEND_POSSIBLE=y -CONFIG_ARCH_WANT_GENERAL_HUGETLB=y -CONFIG_ZONE_DMA32=y -CONFIG_AUDIT_ARCH=y -CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y -CONFIG_HAVE_INTEL_TXT=y -CONFIG_X86_64_SMP=y -CONFIG_ARCH_SUPPORTS_UPROBES=y -CONFIG_FIX_EARLYCON_MEM=y -CONFIG_DYNAMIC_PHYSICAL_MASK=y -CONFIG_PGTABLE_LEVELS=5 -CONFIG_CC_HAS_SANE_STACKPROTECTOR=y - -# -# Processor type and features -# -CONFIG_ZONE_DMA=y -CONFIG_SMP=y -CONFIG_X86_FEATURE_NAMES=y -CONFIG_X86_X2APIC=y -CONFIG_X86_MPPARSE=y -# CONFIG_GOLDFISH is not set -CONFIG_RETPOLINE=y -CONFIG_X86_CPU_RESCTRL=y -# CONFIG_X86_EXTENDED_PLATFORM is not set 
-CONFIG_X86_INTEL_LPSS=y -CONFIG_X86_AMD_PLATFORM_DEVICE=y -CONFIG_IOSF_MBI=y -# CONFIG_IOSF_MBI_DEBUG is not set -CONFIG_X86_SUPPORTS_MEMORY_FAILURE=y -CONFIG_SCHED_OMIT_FRAME_POINTER=y -CONFIG_HYPERVISOR_GUEST=y -CONFIG_PARAVIRT=y -CONFIG_PARAVIRT_XXL=y -# CONFIG_PARAVIRT_DEBUG is not set -CONFIG_PARAVIRT_SPINLOCKS=y -CONFIG_X86_HV_CALLBACK_VECTOR=y -CONFIG_XEN=y -CONFIG_XEN_PV=y -CONFIG_XEN_PV_SMP=y -CONFIG_XEN_DOM0=y -CONFIG_XEN_PVHVM=y -CONFIG_XEN_PVHVM_SMP=y -CONFIG_XEN_512GB=y -CONFIG_XEN_SAVE_RESTORE=y -# CONFIG_XEN_DEBUG_FS is not set -CONFIG_XEN_PVH=y -CONFIG_KVM_GUEST=y -CONFIG_ARCH_CPUIDLE_HALTPOLL=y -CONFIG_PVH=y -# CONFIG_KVM_DEBUG_FS is not set -CONFIG_PARAVIRT_TIME_ACCOUNTING=y -CONFIG_PARAVIRT_CLOCK=y -CONFIG_JAILHOUSE_GUEST=y -CONFIG_ACRN_GUEST=y -# CONFIG_MK8 is not set -# CONFIG_MPSC is not set -# CONFIG_MCORE2 is not set -# CONFIG_MATOM is not set -CONFIG_GENERIC_CPU=y -CONFIG_X86_INTERNODE_CACHE_SHIFT=6 -CONFIG_X86_L1_CACHE_SHIFT=6 -CONFIG_X86_TSC=y -CONFIG_X86_CMPXCHG64=y -CONFIG_X86_CMOV=y -CONFIG_X86_MINIMUM_CPU_FAMILY=64 -CONFIG_X86_DEBUGCTLMSR=y -CONFIG_IA32_FEAT_CTL=y -CONFIG_X86_VMX_FEATURE_NAMES=y -CONFIG_PROCESSOR_SELECT=y -CONFIG_CPU_SUP_INTEL=y -CONFIG_CPU_SUP_AMD=y -CONFIG_CPU_SUP_HYGON=y -CONFIG_CPU_SUP_CENTAUR=y -CONFIG_CPU_SUP_ZHAOXIN=y -CONFIG_HPET_TIMER=y -CONFIG_HPET_EMULATE_RTC=y -CONFIG_DMI=y -CONFIG_GART_IOMMU=y -# CONFIG_MAXSMP is not set -CONFIG_NR_CPUS_RANGE_BEGIN=2 -CONFIG_NR_CPUS_RANGE_END=512 -CONFIG_NR_CPUS_DEFAULT=64 -CONFIG_NR_CPUS=320 -CONFIG_SCHED_SMT=y -CONFIG_SCHED_MC=y -CONFIG_SCHED_MC_PRIO=y -CONFIG_X86_LOCAL_APIC=y -CONFIG_X86_IO_APIC=y -CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y -CONFIG_X86_MCE=y -# CONFIG_X86_MCELOG_LEGACY is not set -CONFIG_X86_MCE_INTEL=y -CONFIG_X86_MCE_AMD=y -CONFIG_X86_MCE_THRESHOLD=y -CONFIG_X86_MCE_INJECT=m -CONFIG_X86_THERMAL_VECTOR=y - -# -# Performance monitoring -# -CONFIG_PERF_EVENTS_INTEL_UNCORE=m -CONFIG_PERF_EVENTS_INTEL_RAPL=m -CONFIG_PERF_EVENTS_INTEL_CSTATE=m 
-CONFIG_PERF_EVENTS_AMD_POWER=m -# end of Performance monitoring - -CONFIG_X86_VSYSCALL_EMULATION=y -CONFIG_X86_IOPL_IOPERM=y -CONFIG_I8K=m -CONFIG_MICROCODE=y -CONFIG_MICROCODE_INTEL=y -CONFIG_MICROCODE_AMD=y -CONFIG_MICROCODE_OLD_INTERFACE=y -CONFIG_X86_MSR=m -CONFIG_X86_CPUID=m -CONFIG_X86_5LEVEL=y -CONFIG_X86_DIRECT_GBPAGES=y -# CONFIG_X86_CPA_STATISTICS is not set -CONFIG_AMD_MEM_ENCRYPT=y -CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=y -CONFIG_NUMA=y -CONFIG_AMD_NUMA=y -CONFIG_X86_64_ACPI_NUMA=y -CONFIG_NODES_SPAN_OTHER_NODES=y -# CONFIG_NUMA_EMU is not set -CONFIG_NODES_SHIFT=5 -CONFIG_ARCH_SPARSEMEM_ENABLE=y -CONFIG_ARCH_SPARSEMEM_DEFAULT=y -CONFIG_ARCH_SELECT_MEMORY_MODEL=y -CONFIG_ARCH_MEMORY_PROBE=y -CONFIG_ILLEGAL_POINTER_VALUE=0xdead000000000000 -CONFIG_X86_PMEM_LEGACY_DEVICE=y -CONFIG_X86_PMEM_LEGACY=m -CONFIG_X86_CHECK_BIOS_CORRUPTION=y -CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y -CONFIG_X86_RESERVE_LOW=64 -CONFIG_MTRR=y -CONFIG_MTRR_SANITIZER=y -CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT=1 -CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT=0 -CONFIG_X86_PAT=y -CONFIG_ARCH_USES_PG_UNCACHED=y -CONFIG_ARCH_RANDOM=y -CONFIG_X86_SMAP=y -CONFIG_X86_UMIP=y -CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS=y -# CONFIG_X86_INTEL_TSX_MODE_OFF is not set -# CONFIG_X86_INTEL_TSX_MODE_ON is not set -CONFIG_X86_INTEL_TSX_MODE_AUTO=y -CONFIG_EFI=y -CONFIG_EFI_STUB=y -CONFIG_EFI_MIXED=y -CONFIG_SECCOMP=y -# CONFIG_HZ_100 is not set -# CONFIG_HZ_250 is not set -CONFIG_HZ_300=y -# CONFIG_HZ_1000 is not set -CONFIG_HZ=300 -CONFIG_SCHED_HRTICK=y -# CONFIG_KEXEC is not set -# CONFIG_KEXEC_FILE is not set -CONFIG_CRASH_DUMP=y -CONFIG_PHYSICAL_START=0x1000000 -CONFIG_RELOCATABLE=y -CONFIG_RANDOMIZE_BASE=y -CONFIG_X86_NEED_RELOCS=y -CONFIG_PHYSICAL_ALIGN=0x1000000 -CONFIG_DYNAMIC_MEMORY_LAYOUT=y -CONFIG_RANDOMIZE_MEMORY=y -CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING=0x1 -CONFIG_HOTPLUG_CPU=y -# CONFIG_BOOTPARAM_HOTPLUG_CPU0 is not set -# CONFIG_DEBUG_HOTPLUG_CPU0 is not set -# CONFIG_COMPAT_VDSO 
is not set -# CONFIG_LEGACY_VSYSCALL_EMULATE is not set -# CONFIG_LEGACY_VSYSCALL_XONLY is not set -CONFIG_LEGACY_VSYSCALL_NONE=y -CONFIG_CMDLINE_BOOL=y -CONFIG_CMDLINE="pti=on page_alloc.shuffle=1" -# CONFIG_CMDLINE_OVERRIDE is not set -# CONFIG_MODIFY_LDT_SYSCALL is not set -CONFIG_HAVE_LIVEPATCH=y -# CONFIG_LIVEPATCH is not set -# end of Processor type and features - -CONFIG_ARCH_HAS_ADD_PAGES=y -CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y -CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE=y -CONFIG_USE_PERCPU_NUMA_NODE_ID=y -CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK=y -CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION=y -CONFIG_ARCH_ENABLE_THP_MIGRATION=y - -# -# Power management and ACPI options -# -CONFIG_SUSPEND=y -CONFIG_SUSPEND_FREEZER=y -# CONFIG_SUSPEND_SKIP_SYNC is not set -CONFIG_HIBERNATE_CALLBACKS=y -# CONFIG_HIBERNATION is not set -CONFIG_PM_SLEEP=y -CONFIG_PM_SLEEP_SMP=y -CONFIG_PM_AUTOSLEEP=y -CONFIG_PM_WAKELOCKS=y -CONFIG_PM_WAKELOCKS_LIMIT=100 -CONFIG_PM_WAKELOCKS_GC=y -CONFIG_PM=y -CONFIG_PM_DEBUG=y -CONFIG_PM_ADVANCED_DEBUG=y -# CONFIG_PM_TEST_SUSPEND is not set -CONFIG_PM_SLEEP_DEBUG=y -# CONFIG_DPM_WATCHDOG is not set -CONFIG_PM_TRACE=y -CONFIG_PM_TRACE_RTC=y -CONFIG_PM_CLK=y -CONFIG_PM_GENERIC_DOMAINS=y -CONFIG_WQ_POWER_EFFICIENT_DEFAULT=y -CONFIG_PM_GENERIC_DOMAINS_SLEEP=y -CONFIG_PM_GENERIC_DOMAINS_OF=y -CONFIG_ENERGY_MODEL=y -CONFIG_ARCH_SUPPORTS_ACPI=y -CONFIG_ACPI=y -CONFIG_ACPI_LEGACY_TABLES_LOOKUP=y -CONFIG_ARCH_MIGHT_HAVE_ACPI_PDC=y -CONFIG_ACPI_SYSTEM_POWER_STATES_SUPPORT=y -# CONFIG_ACPI_DEBUGGER is not set -CONFIG_ACPI_SPCR_TABLE=y -CONFIG_ACPI_LPIT=y -CONFIG_ACPI_SLEEP=y -# CONFIG_ACPI_PROCFS_POWER is not set -CONFIG_ACPI_REV_OVERRIDE_POSSIBLE=y -# CONFIG_ACPI_EC_DEBUGFS is not set -CONFIG_ACPI_AC=m -CONFIG_ACPI_BATTERY=m -CONFIG_ACPI_BUTTON=y -CONFIG_ACPI_VIDEO=y -CONFIG_ACPI_FAN=y -CONFIG_ACPI_TAD=m -CONFIG_ACPI_DOCK=y -CONFIG_ACPI_CPU_FREQ_PSS=y -CONFIG_ACPI_PROCESSOR_CSTATE=y -CONFIG_ACPI_PROCESSOR_IDLE=y -CONFIG_ACPI_CPPC_LIB=y -CONFIG_ACPI_PROCESSOR=y 
-CONFIG_ACPI_IPMI=m -CONFIG_ACPI_HOTPLUG_CPU=y -CONFIG_ACPI_PROCESSOR_AGGREGATOR=y -CONFIG_ACPI_THERMAL=y -CONFIG_ARCH_HAS_ACPI_TABLE_UPGRADE=y -CONFIG_ACPI_TABLE_UPGRADE=y -# CONFIG_ACPI_DEBUG is not set -CONFIG_ACPI_PCI_SLOT=y -CONFIG_ACPI_CONTAINER=y -CONFIG_ACPI_HOTPLUG_MEMORY=y -CONFIG_ACPI_HOTPLUG_IOAPIC=y -CONFIG_ACPI_SBS=m -CONFIG_ACPI_HED=y -# CONFIG_ACPI_CUSTOM_METHOD is not set -CONFIG_ACPI_BGRT=y -# CONFIG_ACPI_REDUCED_HARDWARE_ONLY is not set -CONFIG_ACPI_NFIT=m -# CONFIG_NFIT_SECURITY_DEBUG is not set -CONFIG_ACPI_NUMA=y -CONFIG_ACPI_HMAT=y -CONFIG_HAVE_ACPI_APEI=y -CONFIG_HAVE_ACPI_APEI_NMI=y -CONFIG_ACPI_APEI=y -CONFIG_ACPI_APEI_GHES=y -CONFIG_ACPI_APEI_PCIEAER=y -CONFIG_ACPI_APEI_MEMORY_FAILURE=y -CONFIG_ACPI_APEI_EINJ=m -CONFIG_ACPI_APEI_ERST_DEBUG=m -CONFIG_DPTF_POWER=m -CONFIG_ACPI_WATCHDOG=y -CONFIG_ACPI_EXTLOG=m -CONFIG_ACPI_ADXL=y -CONFIG_PMIC_OPREGION=y -CONFIG_BYTCRC_PMIC_OPREGION=y -CONFIG_CHTCRC_PMIC_OPREGION=y -CONFIG_XPOWER_PMIC_OPREGION=y -CONFIG_BXT_WC_PMIC_OPREGION=y -CONFIG_CHT_WC_PMIC_OPREGION=y -CONFIG_CHT_DC_TI_PMIC_OPREGION=y -CONFIG_ACPI_CONFIGFS=m -CONFIG_TPS68470_PMIC_OPREGION=y -CONFIG_X86_PM_TIMER=y -CONFIG_SFI=y - -# -# CPU Frequency scaling -# -CONFIG_CPU_FREQ=y -CONFIG_CPU_FREQ_GOV_ATTR_SET=y -CONFIG_CPU_FREQ_GOV_COMMON=y -CONFIG_CPU_FREQ_STAT=y -# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set -# CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set -# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set -# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set -# CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set -CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL=y -CONFIG_CPU_FREQ_GOV_PERFORMANCE=y -CONFIG_CPU_FREQ_GOV_POWERSAVE=m -CONFIG_CPU_FREQ_GOV_USERSPACE=m -CONFIG_CPU_FREQ_GOV_ONDEMAND=m -CONFIG_CPU_FREQ_GOV_CONSERVATIVE=m -CONFIG_CPU_FREQ_GOV_SCHEDUTIL=y - -# -# CPU frequency scaling drivers -# -CONFIG_CPUFREQ_DT=m -CONFIG_CPUFREQ_DT_PLATDEV=y -CONFIG_X86_INTEL_PSTATE=y -CONFIG_X86_PCC_CPUFREQ=m -CONFIG_X86_ACPI_CPUFREQ=m 
-CONFIG_X86_ACPI_CPUFREQ_CPB=y -CONFIG_X86_POWERNOW_K8=m -CONFIG_X86_AMD_FREQ_SENSITIVITY=m -# CONFIG_X86_SPEEDSTEP_CENTRINO is not set -CONFIG_X86_P4_CLOCKMOD=m - -# -# shared options -# -CONFIG_X86_SPEEDSTEP_LIB=m -# end of CPU Frequency scaling - -# -# CPU Idle -# -CONFIG_CPU_IDLE=y -CONFIG_CPU_IDLE_GOV_LADDER=y -CONFIG_CPU_IDLE_GOV_MENU=y -CONFIG_CPU_IDLE_GOV_TEO=y -CONFIG_CPU_IDLE_GOV_HALTPOLL=y -CONFIG_HALTPOLL_CPUIDLE=m -# end of CPU Idle - -CONFIG_INTEL_IDLE=y -# end of Power management and ACPI options - -# -# Bus options (PCI etc.) -# -CONFIG_PCI_DIRECT=y -CONFIG_PCI_MMCONFIG=y -CONFIG_PCI_XEN=y -CONFIG_MMCONF_FAM10H=y -# CONFIG_PCI_CNB20LE_QUIRK is not set -# CONFIG_ISA_BUS is not set -CONFIG_ISA_DMA_API=y -CONFIG_AMD_NB=y -# CONFIG_X86_SYSFB is not set -# end of Bus options (PCI etc.) - -# -# Binary Emulations -# -CONFIG_IA32_EMULATION=y -# CONFIG_X86_X32 is not set -CONFIG_COMPAT_32=y -CONFIG_COMPAT=y -CONFIG_COMPAT_FOR_U64_ALIGNMENT=y -CONFIG_SYSVIPC_COMPAT=y -# end of Binary Emulations - -# -# Firmware Drivers -# -CONFIG_EDD=m -# CONFIG_EDD_OFF is not set -CONFIG_FIRMWARE_MEMMAP=y -CONFIG_DMIID=y -CONFIG_DMI_SYSFS=m -CONFIG_DMI_SCAN_MACHINE_NON_EFI_FALLBACK=y -CONFIG_ISCSI_IBFT_FIND=y -CONFIG_ISCSI_IBFT=m -CONFIG_FW_CFG_SYSFS=m -# CONFIG_FW_CFG_SYSFS_CMDLINE is not set -CONFIG_GOOGLE_FIRMWARE=y -# CONFIG_GOOGLE_SMI is not set -CONFIG_GOOGLE_COREBOOT_TABLE=m -CONFIG_GOOGLE_MEMCONSOLE=m -# CONFIG_GOOGLE_MEMCONSOLE_X86_LEGACY is not set -CONFIG_GOOGLE_FRAMEBUFFER_COREBOOT=m -CONFIG_GOOGLE_MEMCONSOLE_COREBOOT=m -CONFIG_GOOGLE_VPD=m - -# -# EFI (Extensible Firmware Interface) Support -# -# CONFIG_EFI_VARS is not set -CONFIG_EFI_ESRT=y -# CONFIG_EFI_FAKE_MEMMAP is not set -CONFIG_EFI_SOFT_RESERVE=y -CONFIG_EFI_RUNTIME_WRAPPERS=y -CONFIG_EFI_CAPSULE_LOADER=m -# CONFIG_EFI_TEST is not set -CONFIG_APPLE_PROPERTIES=y -CONFIG_RESET_ATTACK_MITIGATION=y -CONFIG_EFI_RCI2_TABLE=y -# CONFIG_EFI_DISABLE_PCI_DMA is not set -# end of EFI (Extensible Firmware Interface) 
Support - -CONFIG_EFI_EMBEDDED_FIRMWARE=y -CONFIG_UEFI_CPER=y -CONFIG_UEFI_CPER_X86=y -CONFIG_EFI_DEV_PATH_PARSER=y -CONFIG_EFI_EARLYCON=y - -# -# Tegra firmware driver -# -# end of Tegra firmware driver -# end of Firmware Drivers - -CONFIG_HAVE_KVM=y -CONFIG_HAVE_KVM_IRQCHIP=y -CONFIG_HAVE_KVM_IRQFD=y -CONFIG_HAVE_KVM_IRQ_ROUTING=y -CONFIG_HAVE_KVM_EVENTFD=y -CONFIG_KVM_MMIO=y -CONFIG_KVM_ASYNC_PF=y -CONFIG_HAVE_KVM_MSI=y -CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT=y -CONFIG_KVM_VFIO=y -CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT=y -CONFIG_KVM_COMPAT=y -CONFIG_HAVE_KVM_IRQ_BYPASS=y -CONFIG_HAVE_KVM_NO_POLL=y -CONFIG_VIRTUALIZATION=y -CONFIG_KVM=m -CONFIG_KVM_WERROR=y -CONFIG_KVM_INTEL=m -CONFIG_KVM_AMD=m -CONFIG_KVM_AMD_SEV=y -CONFIG_KVM_MMU_AUDIT=y -CONFIG_AS_AVX512=y -CONFIG_AS_SHA1_NI=y -CONFIG_AS_SHA256_NI=y - -# -# General architecture-dependent options -# -CONFIG_HOTPLUG_SMT=y -CONFIG_OPROFILE=m -# CONFIG_OPROFILE_EVENT_MULTIPLEX is not set -CONFIG_HAVE_OPROFILE=y -CONFIG_OPROFILE_NMI_TIMER=y -CONFIG_KPROBES=y -CONFIG_JUMP_LABEL=y -# CONFIG_STATIC_KEYS_SELFTEST is not set -CONFIG_OPTPROBES=y -CONFIG_KPROBES_ON_FTRACE=y -CONFIG_UPROBES=y -CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y -CONFIG_ARCH_USE_BUILTIN_BSWAP=y -CONFIG_KRETPROBES=y -CONFIG_USER_RETURN_NOTIFIER=y -CONFIG_HAVE_IOREMAP_PROT=y -CONFIG_HAVE_KPROBES=y -CONFIG_HAVE_KRETPROBES=y -CONFIG_HAVE_OPTPROBES=y -CONFIG_HAVE_KPROBES_ON_FTRACE=y -CONFIG_HAVE_FUNCTION_ERROR_INJECTION=y -CONFIG_HAVE_NMI=y -CONFIG_HAVE_ARCH_TRACEHOOK=y -CONFIG_HAVE_DMA_CONTIGUOUS=y -CONFIG_GENERIC_SMP_IDLE_THREAD=y -CONFIG_ARCH_HAS_FORTIFY_SOURCE=y -CONFIG_ARCH_HAS_SET_MEMORY=y -CONFIG_ARCH_HAS_SET_DIRECT_MAP=y -CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST=y -CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT=y -CONFIG_HAVE_ASM_MODVERSIONS=y -CONFIG_HAVE_REGS_AND_STACK_ACCESS_API=y -CONFIG_HAVE_RSEQ=y -CONFIG_HAVE_FUNCTION_ARG_ACCESS_API=y -CONFIG_HAVE_CLK=y -CONFIG_HAVE_HW_BREAKPOINT=y -CONFIG_HAVE_MIXED_BREAKPOINTS_REGS=y 
-CONFIG_HAVE_USER_RETURN_NOTIFIER=y -CONFIG_HAVE_PERF_EVENTS_NMI=y -CONFIG_HAVE_HARDLOCKUP_DETECTOR_PERF=y -CONFIG_HAVE_PERF_REGS=y -CONFIG_HAVE_PERF_USER_STACK_DUMP=y -CONFIG_HAVE_ARCH_JUMP_LABEL=y -CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE=y -CONFIG_MMU_GATHER_TABLE_FREE=y -CONFIG_MMU_GATHER_RCU_TABLE_FREE=y -CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG=y -CONFIG_HAVE_ALIGNED_STRUCT_PAGE=y -CONFIG_HAVE_CMPXCHG_LOCAL=y -CONFIG_HAVE_CMPXCHG_DOUBLE=y -CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION=y -CONFIG_ARCH_WANT_OLD_COMPAT_IPC=y -CONFIG_HAVE_ARCH_SECCOMP_FILTER=y -CONFIG_SECCOMP_FILTER=y -CONFIG_HAVE_ARCH_STACKLEAK=y -CONFIG_HAVE_STACKPROTECTOR=y -CONFIG_CC_HAS_STACKPROTECTOR_NONE=y -CONFIG_STACKPROTECTOR=y -CONFIG_STACKPROTECTOR_STRONG=y -CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES=y -CONFIG_HAVE_CONTEXT_TRACKING=y -CONFIG_HAVE_VIRT_CPU_ACCOUNTING_GEN=y -CONFIG_HAVE_IRQ_TIME_ACCOUNTING=y -CONFIG_HAVE_MOVE_PMD=y -CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE=y -CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD=y -CONFIG_HAVE_ARCH_HUGE_VMAP=y -CONFIG_ARCH_WANT_HUGE_PMD_SHARE=y -CONFIG_HAVE_ARCH_SOFT_DIRTY=y -CONFIG_HAVE_MOD_ARCH_SPECIFIC=y -CONFIG_MODULES_USE_ELF_RELA=y -CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK=y -CONFIG_ARCH_HAS_ELF_RANDOMIZE=y -CONFIG_HAVE_ARCH_MMAP_RND_BITS=y -CONFIG_HAVE_EXIT_THREAD=y -CONFIG_ARCH_MMAP_RND_BITS=32 -CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS=y -CONFIG_ARCH_MMAP_RND_COMPAT_BITS=16 -CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES=y -CONFIG_HAVE_COPY_THREAD_TLS=y -CONFIG_HAVE_STACK_VALIDATION=y -CONFIG_HAVE_RELIABLE_STACKTRACE=y -CONFIG_ISA_BUS_API=y -CONFIG_OLD_SIGSUSPEND3=y -CONFIG_COMPAT_OLD_SIGACTION=y -CONFIG_COMPAT_32BIT_TIME=y -CONFIG_HAVE_ARCH_VMAP_STACK=y -CONFIG_VMAP_STACK=y -CONFIG_ARCH_HAS_STRICT_KERNEL_RWX=y -CONFIG_STRICT_KERNEL_RWX=y -CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y -CONFIG_STRICT_MODULE_RWX=y -CONFIG_HAVE_ARCH_PREL32_RELOCATIONS=y -CONFIG_ARCH_USE_MEMREMAP_PROT=y -CONFIG_LOCK_EVENT_COUNTS=y -CONFIG_ARCH_HAS_MEM_ENCRYPT=y - -# -# GCOV-based kernel profiling -# -# 
CONFIG_GCOV_KERNEL is not set -CONFIG_ARCH_HAS_GCOV_PROFILE_ALL=y -# end of GCOV-based kernel profiling - -CONFIG_HAVE_GCC_PLUGINS=y -CONFIG_GCC_PLUGINS=y -# CONFIG_GCC_PLUGIN_CYC_COMPLEXITY is not set -CONFIG_GCC_PLUGIN_LATENT_ENTROPY=y -# CONFIG_GCC_PLUGIN_RANDSTRUCT is not set -# end of General architecture-dependent options - -CONFIG_RT_MUTEXES=y -CONFIG_BASE_SMALL=0 -CONFIG_MODULE_SIG_FORMAT=y -CONFIG_MODULES=y -CONFIG_MODULE_FORCE_LOAD=y -CONFIG_MODULE_UNLOAD=y -CONFIG_MODULE_FORCE_UNLOAD=y -# CONFIG_MODVERSIONS is not set -CONFIG_MODULE_SRCVERSION_ALL=y -CONFIG_MODULE_SIG=y -# CONFIG_MODULE_SIG_FORCE is not set -CONFIG_MODULE_SIG_ALL=y -# CONFIG_MODULE_SIG_SHA1 is not set -# CONFIG_MODULE_SIG_SHA224 is not set -# CONFIG_MODULE_SIG_SHA256 is not set -# CONFIG_MODULE_SIG_SHA384 is not set -CONFIG_MODULE_SIG_SHA512=y -CONFIG_MODULE_SIG_HASH="sha512" -CONFIG_MODULE_COMPRESS=y -# CONFIG_MODULE_COMPRESS_GZIP is not set -CONFIG_MODULE_COMPRESS_XZ=y -# CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS is not set -CONFIG_UNUSED_SYMBOLS=y -CONFIG_MODULES_TREE_LOOKUP=y -CONFIG_BLOCK=y -CONFIG_BLK_RQ_ALLOC_TIME=y -CONFIG_BLK_SCSI_REQUEST=y -CONFIG_BLK_CGROUP_RWSTAT=y -CONFIG_BLK_DEV_BSG=y -CONFIG_BLK_DEV_BSGLIB=y -CONFIG_BLK_DEV_INTEGRITY=y -CONFIG_BLK_DEV_INTEGRITY_T10=y -CONFIG_BLK_DEV_ZONED=y -CONFIG_BLK_DEV_THROTTLING=y -CONFIG_BLK_DEV_THROTTLING_LOW=y -# CONFIG_BLK_CMDLINE_PARSER is not set -CONFIG_BLK_WBT=y -CONFIG_BLK_CGROUP_IOLATENCY=y -CONFIG_BLK_CGROUP_IOCOST=y -CONFIG_BLK_WBT_MQ=y -CONFIG_BLK_DEBUG_FS=y -CONFIG_BLK_DEBUG_FS_ZONED=y -CONFIG_BLK_SED_OPAL=y - -# -# Partition Types -# -CONFIG_PARTITION_ADVANCED=y -# CONFIG_ACORN_PARTITION is not set -CONFIG_AIX_PARTITION=y -# CONFIG_OSF_PARTITION is not set -# CONFIG_AMIGA_PARTITION is not set -# CONFIG_ATARI_PARTITION is not set -CONFIG_MAC_PARTITION=y -CONFIG_MSDOS_PARTITION=y -CONFIG_BSD_DISKLABEL=y -CONFIG_MINIX_SUBPARTITION=y -CONFIG_SOLARIS_X86_PARTITION=y -# CONFIG_UNIXWARE_DISKLABEL is not set 
-CONFIG_LDM_PARTITION=y -# CONFIG_LDM_DEBUG is not set -# CONFIG_SGI_PARTITION is not set -# CONFIG_ULTRIX_PARTITION is not set -# CONFIG_SUN_PARTITION is not set -CONFIG_KARMA_PARTITION=y -CONFIG_EFI_PARTITION=y -# CONFIG_SYSV68_PARTITION is not set -# CONFIG_CMDLINE_PARTITION is not set -# end of Partition Types - -CONFIG_BLOCK_COMPAT=y -CONFIG_BLK_MQ_PCI=y -CONFIG_BLK_MQ_VIRTIO=y -CONFIG_BLK_MQ_RDMA=y -CONFIG_BLK_PM=y - -# -# IO Schedulers -# -CONFIG_MQ_IOSCHED_DEADLINE=y -CONFIG_MQ_IOSCHED_KYBER=y -CONFIG_IOSCHED_BFQ=y -CONFIG_BFQ_GROUP_IOSCHED=y -# CONFIG_BFQ_CGROUP_DEBUG is not set -# end of IO Schedulers - -CONFIG_PREEMPT_NOTIFIERS=y -CONFIG_PADATA=y -CONFIG_ASN1=y -CONFIG_UNINLINE_SPIN_UNLOCK=y -CONFIG_ARCH_SUPPORTS_ATOMIC_RMW=y -CONFIG_MUTEX_SPIN_ON_OWNER=y -CONFIG_RWSEM_SPIN_ON_OWNER=y -CONFIG_LOCK_SPIN_ON_OWNER=y -CONFIG_ARCH_USE_QUEUED_SPINLOCKS=y -CONFIG_QUEUED_SPINLOCKS=y -CONFIG_ARCH_USE_QUEUED_RWLOCKS=y -CONFIG_QUEUED_RWLOCKS=y -CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE=y -CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE=y -CONFIG_ARCH_HAS_SYSCALL_WRAPPER=y -CONFIG_FREEZER=y - -# -# Executable file formats -# -CONFIG_BINFMT_ELF=y -CONFIG_COMPAT_BINFMT_ELF=y -CONFIG_ELFCORE=y -CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y -CONFIG_BINFMT_SCRIPT=y -CONFIG_BINFMT_MISC=y -CONFIG_COREDUMP=y -# end of Executable file formats - -# -# Memory Management options -# -CONFIG_SELECT_MEMORY_MODEL=y -CONFIG_SPARSEMEM_MANUAL=y -CONFIG_SPARSEMEM=y -CONFIG_NEED_MULTIPLE_NODES=y -CONFIG_HAVE_MEMORY_PRESENT=y -CONFIG_SPARSEMEM_EXTREME=y -CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y -CONFIG_SPARSEMEM_VMEMMAP=y -CONFIG_HAVE_MEMBLOCK_NODE_MAP=y -CONFIG_HAVE_FAST_GUP=y -CONFIG_NUMA_KEEP_MEMINFO=y -CONFIG_MEMORY_ISOLATION=y -CONFIG_HAVE_BOOTMEM_INFO_NODE=y -CONFIG_MEMORY_HOTPLUG=y -CONFIG_MEMORY_HOTPLUG_SPARSE=y -CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y -CONFIG_MEMORY_HOTREMOVE=y -CONFIG_SPLIT_PTLOCK_CPUS=4 -CONFIG_MEMORY_BALLOON=y -CONFIG_BALLOON_COMPACTION=y -CONFIG_COMPACTION=y 
-CONFIG_PAGE_REPORTING=y -CONFIG_MIGRATION=y -CONFIG_CONTIG_ALLOC=y -CONFIG_PHYS_ADDR_T_64BIT=y -CONFIG_BOUNCE=y -CONFIG_VIRT_TO_BUS=y -CONFIG_MMU_NOTIFIER=y -CONFIG_KSM=y -CONFIG_DEFAULT_MMAP_MIN_ADDR=65536 -CONFIG_ARCH_SUPPORTS_MEMORY_FAILURE=y -CONFIG_MEMORY_FAILURE=y -CONFIG_HWPOISON_INJECT=m -CONFIG_TRANSPARENT_HUGEPAGE=y -# CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS is not set -CONFIG_TRANSPARENT_HUGEPAGE_MADVISE=y -CONFIG_ARCH_WANTS_THP_SWAP=y -CONFIG_THP_SWAP=y -CONFIG_CLEANCACHE=y -CONFIG_FRONTSWAP=y -# CONFIG_CMA is not set -CONFIG_ZSWAP=y -# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_DEFLATE is not set -# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZO is not set -# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_842 is not set -CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZ4=y -# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZ4HC is not set -# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_ZSTD is not set -CONFIG_ZSWAP_COMPRESSOR_DEFAULT="lz4" -# CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD is not set -CONFIG_ZSWAP_ZPOOL_DEFAULT_Z3FOLD=y -# CONFIG_ZSWAP_ZPOOL_DEFAULT_ZSMALLOC is not set -CONFIG_ZSWAP_ZPOOL_DEFAULT="z3fold" -CONFIG_ZSWAP_DEFAULT_ON=y -CONFIG_ZPOOL=y -CONFIG_ZBUD=y -CONFIG_Z3FOLD=y -CONFIG_ZSMALLOC=y -# CONFIG_PGTABLE_MAPPING is not set -# CONFIG_ZSMALLOC_STAT is not set -CONFIG_GENERIC_EARLY_IOREMAP=y -# CONFIG_DEFERRED_STRUCT_PAGE_INIT is not set -# CONFIG_IDLE_PAGE_TRACKING is not set -CONFIG_ARCH_HAS_PTE_DEVMAP=y -CONFIG_ZONE_DEVICE=y -CONFIG_DEV_PAGEMAP_OPS=y -CONFIG_HMM_MIRROR=y -CONFIG_DEVICE_PRIVATE=y -CONFIG_FRAME_VECTOR=y -CONFIG_ARCH_USES_HIGH_VMA_FLAGS=y -CONFIG_ARCH_HAS_PKEYS=y -# CONFIG_PERCPU_STATS is not set -# CONFIG_GUP_BENCHMARK is not set -CONFIG_READ_ONLY_THP_FOR_FS=y -CONFIG_ARCH_HAS_PTE_SPECIAL=y -CONFIG_MAPPING_DIRTY_HELPERS=y -# end of Memory Management options - -CONFIG_NET=y -CONFIG_COMPAT_NETLINK_MESSAGES=y -CONFIG_NET_INGRESS=y -CONFIG_NET_EGRESS=y -CONFIG_NET_REDIRECT=y -CONFIG_SKB_EXTENSIONS=y - -# -# Networking options -# -CONFIG_PACKET=y -CONFIG_PACKET_DIAG=y -CONFIG_UNIX=y -CONFIG_UNIX_SCM=y 
-CONFIG_UNIX_DIAG=y -CONFIG_TLS=m -CONFIG_TLS_DEVICE=y -# CONFIG_TLS_TOE is not set -CONFIG_XFRM=y -CONFIG_XFRM_OFFLOAD=y -CONFIG_XFRM_ALGO=m -CONFIG_XFRM_USER=m -CONFIG_XFRM_INTERFACE=m -CONFIG_XFRM_SUB_POLICY=y -CONFIG_XFRM_MIGRATE=y -CONFIG_XFRM_STATISTICS=y -CONFIG_XFRM_IPCOMP=m -CONFIG_NET_KEY=m -CONFIG_NET_KEY_MIGRATE=y -CONFIG_SMC=m -CONFIG_SMC_DIAG=m -CONFIG_XDP_SOCKETS=y -CONFIG_XDP_SOCKETS_DIAG=y -CONFIG_INET=y -CONFIG_IP_MULTICAST=y -CONFIG_IP_ADVANCED_ROUTER=y -# CONFIG_IP_FIB_TRIE_STATS is not set -CONFIG_IP_MULTIPLE_TABLES=y -CONFIG_IP_ROUTE_MULTIPATH=y -CONFIG_IP_ROUTE_VERBOSE=y -CONFIG_IP_ROUTE_CLASSID=y -# CONFIG_IP_PNP is not set -CONFIG_NET_IPIP=m -CONFIG_NET_IPGRE_DEMUX=m -CONFIG_NET_IP_TUNNEL=m -CONFIG_NET_IPGRE=m -# CONFIG_NET_IPGRE_BROADCAST is not set -CONFIG_IP_MROUTE_COMMON=y -CONFIG_IP_MROUTE=y -CONFIG_IP_MROUTE_MULTIPLE_TABLES=y -CONFIG_IP_PIMSM_V1=y -CONFIG_IP_PIMSM_V2=y -CONFIG_SYN_COOKIES=y -CONFIG_NET_IPVTI=m -CONFIG_NET_UDP_TUNNEL=m -CONFIG_NET_FOU=m -CONFIG_NET_FOU_IP_TUNNELS=y -CONFIG_INET_AH=m -CONFIG_INET_ESP=m -CONFIG_INET_ESP_OFFLOAD=m -CONFIG_INET_ESPINTCP=y -CONFIG_INET_IPCOMP=m -CONFIG_INET_XFRM_TUNNEL=m -CONFIG_INET_TUNNEL=m -CONFIG_INET_DIAG=m -CONFIG_INET_TCP_DIAG=m -CONFIG_INET_UDP_DIAG=m -CONFIG_INET_RAW_DIAG=m -CONFIG_INET_DIAG_DESTROY=y -CONFIG_TCP_CONG_ADVANCED=y -CONFIG_TCP_CONG_BIC=m -CONFIG_TCP_CONG_CUBIC=y -CONFIG_TCP_CONG_WESTWOOD=m -CONFIG_TCP_CONG_HTCP=m -CONFIG_TCP_CONG_HSTCP=m -CONFIG_TCP_CONG_HYBLA=m -CONFIG_TCP_CONG_VEGAS=m -CONFIG_TCP_CONG_NV=m -CONFIG_TCP_CONG_SCALABLE=m -CONFIG_TCP_CONG_LP=m -CONFIG_TCP_CONG_VENO=m -CONFIG_TCP_CONG_YEAH=m -CONFIG_TCP_CONG_ILLINOIS=m -CONFIG_TCP_CONG_DCTCP=m -CONFIG_TCP_CONG_CDG=m -CONFIG_TCP_CONG_BBR=m -CONFIG_DEFAULT_CUBIC=y -# CONFIG_DEFAULT_RENO is not set -CONFIG_DEFAULT_TCP_CONG="cubic" -CONFIG_TCP_MD5SIG=y -# CONFIG_TCP_SIMULT_CONNECT_DEFAULT_ON is not set -CONFIG_IPV6=y -CONFIG_IPV6_ROUTER_PREF=y -CONFIG_IPV6_ROUTE_INFO=y -CONFIG_IPV6_OPTIMISTIC_DAD=y 
-CONFIG_INET6_AH=m -CONFIG_INET6_ESP=m -CONFIG_INET6_ESP_OFFLOAD=m -CONFIG_INET6_IPCOMP=m -CONFIG_IPV6_MIP6=m -CONFIG_IPV6_ILA=m -CONFIG_INET6_XFRM_TUNNEL=m -CONFIG_INET6_TUNNEL=m -CONFIG_IPV6_VTI=m -CONFIG_IPV6_SIT=m -CONFIG_IPV6_SIT_6RD=y -CONFIG_IPV6_NDISC_NODETYPE=y -CONFIG_IPV6_TUNNEL=m -CONFIG_IPV6_GRE=m -CONFIG_IPV6_FOU=m -CONFIG_IPV6_FOU_TUNNEL=m -CONFIG_IPV6_MULTIPLE_TABLES=y -CONFIG_IPV6_SUBTREES=y -CONFIG_IPV6_MROUTE=y -CONFIG_IPV6_MROUTE_MULTIPLE_TABLES=y -CONFIG_IPV6_PIMSM_V2=y -CONFIG_IPV6_SEG6_LWTUNNEL=y -CONFIG_IPV6_SEG6_HMAC=y -CONFIG_IPV6_SEG6_BPF=y -CONFIG_IPV6_RPL_LWTUNNEL=y -CONFIG_NETLABEL=y -CONFIG_MPTCP=y -CONFIG_MPTCP_IPV6=y -# CONFIG_MPTCP_HMAC_TEST is not set -CONFIG_NETWORK_SECMARK=y -CONFIG_NET_PTP_CLASSIFY=y -CONFIG_NETWORK_PHY_TIMESTAMPING=y -CONFIG_NETFILTER=y -CONFIG_NETFILTER_ADVANCED=y -CONFIG_BRIDGE_NETFILTER=m - -# -# Core Netfilter Configuration -# -CONFIG_NETFILTER_INGRESS=y -CONFIG_NETFILTER_NETLINK=m -CONFIG_NETFILTER_FAMILY_BRIDGE=y -CONFIG_NETFILTER_FAMILY_ARP=y -CONFIG_NETFILTER_NETLINK_ACCT=m -CONFIG_NETFILTER_NETLINK_QUEUE=m -CONFIG_NETFILTER_NETLINK_LOG=m -CONFIG_NETFILTER_NETLINK_OSF=m -CONFIG_NF_CONNTRACK=m -CONFIG_NF_LOG_COMMON=m -CONFIG_NF_LOG_NETDEV=m -CONFIG_NETFILTER_CONNCOUNT=m -CONFIG_NF_CONNTRACK_MARK=y -CONFIG_NF_CONNTRACK_SECMARK=y -CONFIG_NF_CONNTRACK_ZONES=y -CONFIG_NF_CONNTRACK_PROCFS=y -CONFIG_NF_CONNTRACK_EVENTS=y -CONFIG_NF_CONNTRACK_TIMEOUT=y -CONFIG_NF_CONNTRACK_TIMESTAMP=y -CONFIG_NF_CONNTRACK_LABELS=y -CONFIG_NF_CT_PROTO_DCCP=y -CONFIG_NF_CT_PROTO_GRE=y -CONFIG_NF_CT_PROTO_SCTP=y -CONFIG_NF_CT_PROTO_UDPLITE=y -CONFIG_NF_CONNTRACK_AMANDA=m -CONFIG_NF_CONNTRACK_FTP=m -CONFIG_NF_CONNTRACK_H323=m -CONFIG_NF_CONNTRACK_IRC=m -CONFIG_NF_CONNTRACK_BROADCAST=m -CONFIG_NF_CONNTRACK_NETBIOS_NS=m -CONFIG_NF_CONNTRACK_SNMP=m -CONFIG_NF_CONNTRACK_PPTP=m -CONFIG_NF_CONNTRACK_SANE=m -CONFIG_NF_CONNTRACK_SIP=m -CONFIG_NF_CONNTRACK_TFTP=m -CONFIG_NF_CT_NETLINK=m -CONFIG_NF_CT_NETLINK_TIMEOUT=m 
-CONFIG_NF_CT_NETLINK_HELPER=m -CONFIG_NETFILTER_NETLINK_GLUE_CT=y -CONFIG_NF_NAT=m -CONFIG_NF_NAT_AMANDA=m -CONFIG_NF_NAT_FTP=m -CONFIG_NF_NAT_IRC=m -CONFIG_NF_NAT_SIP=m -CONFIG_NF_NAT_TFTP=m -CONFIG_NF_NAT_REDIRECT=y -CONFIG_NF_NAT_MASQUERADE=y -CONFIG_NETFILTER_SYNPROXY=m -CONFIG_NF_TABLES=m -CONFIG_NF_TABLES_INET=y -CONFIG_NF_TABLES_NETDEV=y -CONFIG_NFT_NUMGEN=m -CONFIG_NFT_CT=m -CONFIG_NFT_FLOW_OFFLOAD=m -CONFIG_NFT_COUNTER=m -CONFIG_NFT_CONNLIMIT=m -CONFIG_NFT_LOG=m -CONFIG_NFT_LIMIT=m -CONFIG_NFT_MASQ=m -CONFIG_NFT_REDIR=m -CONFIG_NFT_NAT=m -CONFIG_NFT_TUNNEL=m -CONFIG_NFT_OBJREF=m -CONFIG_NFT_QUEUE=m -CONFIG_NFT_QUOTA=m -CONFIG_NFT_REJECT=m -CONFIG_NFT_REJECT_INET=m -CONFIG_NFT_COMPAT=m -CONFIG_NFT_HASH=m -CONFIG_NFT_FIB=m -CONFIG_NFT_FIB_INET=m -CONFIG_NFT_XFRM=m -CONFIG_NFT_SOCKET=m -CONFIG_NFT_OSF=m -CONFIG_NFT_TPROXY=m -CONFIG_NFT_SYNPROXY=m -CONFIG_NF_DUP_NETDEV=m -CONFIG_NFT_DUP_NETDEV=m -CONFIG_NFT_FWD_NETDEV=m -CONFIG_NFT_FIB_NETDEV=m -CONFIG_NF_FLOW_TABLE_INET=m -CONFIG_NF_FLOW_TABLE=m -CONFIG_NETFILTER_XTABLES=m - -# -# Xtables combined modules -# -CONFIG_NETFILTER_XT_MARK=m -CONFIG_NETFILTER_XT_CONNMARK=m -CONFIG_NETFILTER_XT_SET=m - -# -# Xtables targets -# -CONFIG_NETFILTER_XT_TARGET_AUDIT=m -CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m -CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m -CONFIG_NETFILTER_XT_TARGET_CONNMARK=m -CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m -CONFIG_NETFILTER_XT_TARGET_CT=m -CONFIG_NETFILTER_XT_TARGET_DSCP=m -CONFIG_NETFILTER_XT_TARGET_HL=m -CONFIG_NETFILTER_XT_TARGET_HMARK=m -CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m -CONFIG_NETFILTER_XT_TARGET_LED=m -CONFIG_NETFILTER_XT_TARGET_LOG=m -CONFIG_NETFILTER_XT_TARGET_MARK=m -CONFIG_NETFILTER_XT_NAT=m -CONFIG_NETFILTER_XT_TARGET_NETMAP=m -CONFIG_NETFILTER_XT_TARGET_NFLOG=m -CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m -CONFIG_NETFILTER_XT_TARGET_NOTRACK=m -CONFIG_NETFILTER_XT_TARGET_RATEEST=m -CONFIG_NETFILTER_XT_TARGET_REDIRECT=m -CONFIG_NETFILTER_XT_TARGET_MASQUERADE=m -CONFIG_NETFILTER_XT_TARGET_TEE=m 
-CONFIG_NETFILTER_XT_TARGET_TPROXY=m -CONFIG_NETFILTER_XT_TARGET_TRACE=m -CONFIG_NETFILTER_XT_TARGET_SECMARK=m -CONFIG_NETFILTER_XT_TARGET_TCPMSS=m -CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m - -# -# Xtables matches -# -CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m -CONFIG_NETFILTER_XT_MATCH_BPF=m -CONFIG_NETFILTER_XT_MATCH_CGROUP=m -CONFIG_NETFILTER_XT_MATCH_CLUSTER=m -CONFIG_NETFILTER_XT_MATCH_COMMENT=m -CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m -CONFIG_NETFILTER_XT_MATCH_CONNLABEL=m -CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m -CONFIG_NETFILTER_XT_MATCH_CONNMARK=m -CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m -CONFIG_NETFILTER_XT_MATCH_CPU=m -CONFIG_NETFILTER_XT_MATCH_DCCP=m -CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m -CONFIG_NETFILTER_XT_MATCH_DSCP=m -CONFIG_NETFILTER_XT_MATCH_ECN=m -CONFIG_NETFILTER_XT_MATCH_ESP=m -CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m -CONFIG_NETFILTER_XT_MATCH_HELPER=m -CONFIG_NETFILTER_XT_MATCH_HL=m -CONFIG_NETFILTER_XT_MATCH_IPCOMP=m -CONFIG_NETFILTER_XT_MATCH_IPRANGE=m -CONFIG_NETFILTER_XT_MATCH_IPVS=m -CONFIG_NETFILTER_XT_MATCH_L2TP=m -CONFIG_NETFILTER_XT_MATCH_LENGTH=m -CONFIG_NETFILTER_XT_MATCH_LIMIT=m -CONFIG_NETFILTER_XT_MATCH_MAC=m -CONFIG_NETFILTER_XT_MATCH_MARK=m -CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m -CONFIG_NETFILTER_XT_MATCH_NFACCT=m -CONFIG_NETFILTER_XT_MATCH_OSF=m -CONFIG_NETFILTER_XT_MATCH_OWNER=m -CONFIG_NETFILTER_XT_MATCH_POLICY=m -CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m -CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m -CONFIG_NETFILTER_XT_MATCH_QUOTA=m -CONFIG_NETFILTER_XT_MATCH_RATEEST=m -CONFIG_NETFILTER_XT_MATCH_REALM=m -CONFIG_NETFILTER_XT_MATCH_RECENT=m -CONFIG_NETFILTER_XT_MATCH_SCTP=m -CONFIG_NETFILTER_XT_MATCH_SOCKET=m -CONFIG_NETFILTER_XT_MATCH_STATE=m -CONFIG_NETFILTER_XT_MATCH_STATISTIC=m -CONFIG_NETFILTER_XT_MATCH_STRING=m -CONFIG_NETFILTER_XT_MATCH_TCPMSS=m -CONFIG_NETFILTER_XT_MATCH_TIME=m -CONFIG_NETFILTER_XT_MATCH_U32=m -# end of Core Netfilter Configuration - -CONFIG_IP_SET=m -CONFIG_IP_SET_MAX=256 -CONFIG_IP_SET_BITMAP_IP=m -CONFIG_IP_SET_BITMAP_IPMAC=m 
-CONFIG_IP_SET_BITMAP_PORT=m -CONFIG_IP_SET_HASH_IP=m -CONFIG_IP_SET_HASH_IPMARK=m -CONFIG_IP_SET_HASH_IPPORT=m -CONFIG_IP_SET_HASH_IPPORTIP=m -CONFIG_IP_SET_HASH_IPPORTNET=m -CONFIG_IP_SET_HASH_IPMAC=m -CONFIG_IP_SET_HASH_MAC=m -CONFIG_IP_SET_HASH_NETPORTNET=m -CONFIG_IP_SET_HASH_NET=m -CONFIG_IP_SET_HASH_NETNET=m -CONFIG_IP_SET_HASH_NETPORT=m -CONFIG_IP_SET_HASH_NETIFACE=m -CONFIG_IP_SET_LIST_SET=m -CONFIG_IP_VS=m -CONFIG_IP_VS_IPV6=y -# CONFIG_IP_VS_DEBUG is not set -CONFIG_IP_VS_TAB_BITS=15 - -# -# IPVS transport protocol load balancing support -# -CONFIG_IP_VS_PROTO_TCP=y -CONFIG_IP_VS_PROTO_UDP=y -CONFIG_IP_VS_PROTO_AH_ESP=y -CONFIG_IP_VS_PROTO_ESP=y -CONFIG_IP_VS_PROTO_AH=y -CONFIG_IP_VS_PROTO_SCTP=y - -# -# IPVS scheduler -# -CONFIG_IP_VS_RR=m -CONFIG_IP_VS_WRR=m -CONFIG_IP_VS_LC=m -CONFIG_IP_VS_WLC=m -CONFIG_IP_VS_FO=m -CONFIG_IP_VS_OVF=m -CONFIG_IP_VS_LBLC=m -CONFIG_IP_VS_LBLCR=m -CONFIG_IP_VS_DH=m -CONFIG_IP_VS_SH=m -CONFIG_IP_VS_MH=m -CONFIG_IP_VS_SED=m -CONFIG_IP_VS_NQ=m - -# -# IPVS SH scheduler -# -CONFIG_IP_VS_SH_TAB_BITS=8 - -# -# IPVS MH scheduler -# -CONFIG_IP_VS_MH_TAB_INDEX=12 - -# -# IPVS application helper -# -CONFIG_IP_VS_FTP=m -CONFIG_IP_VS_NFCT=y -CONFIG_IP_VS_PE_SIP=m - -# -# IP: Netfilter Configuration -# -CONFIG_NF_DEFRAG_IPV4=m -CONFIG_NF_SOCKET_IPV4=m -CONFIG_NF_TPROXY_IPV4=m -CONFIG_NF_TABLES_IPV4=y -CONFIG_NFT_REJECT_IPV4=m -CONFIG_NFT_DUP_IPV4=m -CONFIG_NFT_FIB_IPV4=m -CONFIG_NF_TABLES_ARP=y -CONFIG_NF_FLOW_TABLE_IPV4=m -CONFIG_NF_DUP_IPV4=m -CONFIG_NF_LOG_ARP=m -CONFIG_NF_LOG_IPV4=m -CONFIG_NF_REJECT_IPV4=m -CONFIG_NF_NAT_SNMP_BASIC=m -CONFIG_NF_NAT_PPTP=m -CONFIG_NF_NAT_H323=m -CONFIG_IP_NF_IPTABLES=m -CONFIG_IP_NF_MATCH_AH=m -CONFIG_IP_NF_MATCH_ECN=m -CONFIG_IP_NF_MATCH_RPFILTER=m -CONFIG_IP_NF_MATCH_TTL=m -CONFIG_IP_NF_FILTER=m -CONFIG_IP_NF_TARGET_REJECT=m -CONFIG_IP_NF_TARGET_SYNPROXY=m -CONFIG_IP_NF_NAT=m -CONFIG_IP_NF_TARGET_MASQUERADE=m -CONFIG_IP_NF_TARGET_NETMAP=m -CONFIG_IP_NF_TARGET_REDIRECT=m -CONFIG_IP_NF_MANGLE=m 
-CONFIG_IP_NF_TARGET_CLUSTERIP=m -CONFIG_IP_NF_TARGET_ECN=m -CONFIG_IP_NF_TARGET_TTL=m -CONFIG_IP_NF_RAW=m -CONFIG_IP_NF_SECURITY=m -CONFIG_IP_NF_ARPTABLES=m -CONFIG_IP_NF_ARPFILTER=m -CONFIG_IP_NF_ARP_MANGLE=m -# end of IP: Netfilter Configuration - -# -# IPv6: Netfilter Configuration -# -CONFIG_NF_SOCKET_IPV6=m -CONFIG_NF_TPROXY_IPV6=m -CONFIG_NF_TABLES_IPV6=y -CONFIG_NFT_REJECT_IPV6=m -CONFIG_NFT_DUP_IPV6=m -CONFIG_NFT_FIB_IPV6=m -CONFIG_NF_FLOW_TABLE_IPV6=m -CONFIG_NF_DUP_IPV6=m -CONFIG_NF_REJECT_IPV6=m -CONFIG_NF_LOG_IPV6=m -CONFIG_IP6_NF_IPTABLES=m -CONFIG_IP6_NF_MATCH_AH=m -CONFIG_IP6_NF_MATCH_EUI64=m -CONFIG_IP6_NF_MATCH_FRAG=m -CONFIG_IP6_NF_MATCH_OPTS=m -CONFIG_IP6_NF_MATCH_HL=m -CONFIG_IP6_NF_MATCH_IPV6HEADER=m -CONFIG_IP6_NF_MATCH_MH=m -CONFIG_IP6_NF_MATCH_RPFILTER=m -CONFIG_IP6_NF_MATCH_RT=m -CONFIG_IP6_NF_MATCH_SRH=m -CONFIG_IP6_NF_TARGET_HL=m -CONFIG_IP6_NF_FILTER=m -CONFIG_IP6_NF_TARGET_REJECT=m -CONFIG_IP6_NF_TARGET_SYNPROXY=m -CONFIG_IP6_NF_MANGLE=m -CONFIG_IP6_NF_RAW=m -CONFIG_IP6_NF_SECURITY=m -CONFIG_IP6_NF_NAT=m -CONFIG_IP6_NF_TARGET_MASQUERADE=m -CONFIG_IP6_NF_TARGET_NPT=m -# end of IPv6: Netfilter Configuration - -CONFIG_NF_DEFRAG_IPV6=m -CONFIG_NF_TABLES_BRIDGE=m -CONFIG_NFT_BRIDGE_META=m -CONFIG_NFT_BRIDGE_REJECT=m -CONFIG_NF_LOG_BRIDGE=m -CONFIG_NF_CONNTRACK_BRIDGE=m -CONFIG_BRIDGE_NF_EBTABLES=m -CONFIG_BRIDGE_EBT_BROUTE=m -CONFIG_BRIDGE_EBT_T_FILTER=m -CONFIG_BRIDGE_EBT_T_NAT=m -CONFIG_BRIDGE_EBT_802_3=m -CONFIG_BRIDGE_EBT_AMONG=m -CONFIG_BRIDGE_EBT_ARP=m -CONFIG_BRIDGE_EBT_IP=m -CONFIG_BRIDGE_EBT_IP6=m -CONFIG_BRIDGE_EBT_LIMIT=m -CONFIG_BRIDGE_EBT_MARK=m -CONFIG_BRIDGE_EBT_PKTTYPE=m -CONFIG_BRIDGE_EBT_STP=m -CONFIG_BRIDGE_EBT_VLAN=m -CONFIG_BRIDGE_EBT_ARPREPLY=m -CONFIG_BRIDGE_EBT_DNAT=m -CONFIG_BRIDGE_EBT_MARK_T=m -CONFIG_BRIDGE_EBT_REDIRECT=m -CONFIG_BRIDGE_EBT_SNAT=m -CONFIG_BRIDGE_EBT_LOG=m -CONFIG_BRIDGE_EBT_NFLOG=m -# CONFIG_BPFILTER is not set -CONFIG_IP_DCCP=m -CONFIG_INET_DCCP_DIAG=m - -# -# DCCP CCIDs Configuration -# -# 
CONFIG_IP_DCCP_CCID2_DEBUG is not set -CONFIG_IP_DCCP_CCID3=y -# CONFIG_IP_DCCP_CCID3_DEBUG is not set -CONFIG_IP_DCCP_TFRC_LIB=y -# end of DCCP CCIDs Configuration - -# -# DCCP Kernel Hacking -# -# CONFIG_IP_DCCP_DEBUG is not set -# end of DCCP Kernel Hacking - -CONFIG_IP_SCTP=m -# CONFIG_SCTP_DBG_OBJCNT is not set -# CONFIG_SCTP_DEFAULT_COOKIE_HMAC_MD5 is not set -CONFIG_SCTP_DEFAULT_COOKIE_HMAC_SHA1=y -# CONFIG_SCTP_DEFAULT_COOKIE_HMAC_NONE is not set -CONFIG_SCTP_COOKIE_HMAC_MD5=y -CONFIG_SCTP_COOKIE_HMAC_SHA1=y -CONFIG_INET_SCTP_DIAG=m -CONFIG_RDS=m -CONFIG_RDS_RDMA=m -CONFIG_RDS_TCP=m -# CONFIG_RDS_DEBUG is not set -CONFIG_TIPC=m -CONFIG_TIPC_MEDIA_IB=y -CONFIG_TIPC_MEDIA_UDP=y -CONFIG_TIPC_CRYPTO=y -CONFIG_TIPC_DIAG=m -CONFIG_ATM=m -CONFIG_ATM_CLIP=m -# CONFIG_ATM_CLIP_NO_ICMP is not set -CONFIG_ATM_LANE=m -CONFIG_ATM_MPOA=m -CONFIG_ATM_BR2684=m -# CONFIG_ATM_BR2684_IPFILTER is not set -CONFIG_L2TP=m -# CONFIG_L2TP_DEBUGFS is not set -CONFIG_L2TP_V3=y -CONFIG_L2TP_IP=m -CONFIG_L2TP_ETH=m -CONFIG_STP=m -CONFIG_GARP=m -CONFIG_MRP=m -CONFIG_BRIDGE=m -CONFIG_BRIDGE_IGMP_SNOOPING=y -CONFIG_BRIDGE_VLAN_FILTERING=y -CONFIG_HAVE_NET_DSA=y -CONFIG_NET_DSA=m -CONFIG_NET_DSA_TAG_8021Q=m -CONFIG_NET_DSA_TAG_AR9331=m -CONFIG_NET_DSA_TAG_BRCM_COMMON=m -CONFIG_NET_DSA_TAG_BRCM=m -CONFIG_NET_DSA_TAG_BRCM_PREPEND=m -CONFIG_NET_DSA_TAG_GSWIP=m -CONFIG_NET_DSA_TAG_DSA=m -CONFIG_NET_DSA_TAG_EDSA=m -CONFIG_NET_DSA_TAG_MTK=m -CONFIG_NET_DSA_TAG_KSZ=m -CONFIG_NET_DSA_TAG_OCELOT=m -CONFIG_NET_DSA_TAG_QCA=m -CONFIG_NET_DSA_TAG_LAN9303=m -CONFIG_NET_DSA_TAG_SJA1105=m -CONFIG_NET_DSA_TAG_TRAILER=m -CONFIG_VLAN_8021Q=m -CONFIG_VLAN_8021Q_GVRP=y -CONFIG_VLAN_8021Q_MVRP=y -# CONFIG_DECNET is not set -CONFIG_LLC=m -CONFIG_LLC2=m -# CONFIG_ATALK is not set -# CONFIG_X25 is not set -# CONFIG_LAPB is not set -CONFIG_PHONET=m -CONFIG_6LOWPAN=m -# CONFIG_6LOWPAN_DEBUGFS is not set -CONFIG_6LOWPAN_NHC=m -CONFIG_6LOWPAN_NHC_DEST=m -CONFIG_6LOWPAN_NHC_FRAGMENT=m -CONFIG_6LOWPAN_NHC_HOP=m 
-CONFIG_6LOWPAN_NHC_IPV6=m -CONFIG_6LOWPAN_NHC_MOBILITY=m -CONFIG_6LOWPAN_NHC_ROUTING=m -CONFIG_6LOWPAN_NHC_UDP=m -CONFIG_6LOWPAN_GHC_EXT_HDR_HOP=m -CONFIG_6LOWPAN_GHC_UDP=m -CONFIG_6LOWPAN_GHC_ICMPV6=m -CONFIG_6LOWPAN_GHC_EXT_HDR_DEST=m -CONFIG_6LOWPAN_GHC_EXT_HDR_FRAG=m -CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE=m -CONFIG_IEEE802154=m -CONFIG_IEEE802154_NL802154_EXPERIMENTAL=y -CONFIG_IEEE802154_SOCKET=m -CONFIG_IEEE802154_6LOWPAN=m -CONFIG_MAC802154=m -CONFIG_NET_SCHED=y - -# -# Queueing/Scheduling -# -CONFIG_NET_SCH_CBQ=m -CONFIG_NET_SCH_HTB=m -CONFIG_NET_SCH_HFSC=m -CONFIG_NET_SCH_ATM=m -CONFIG_NET_SCH_PRIO=m -CONFIG_NET_SCH_MULTIQ=m -CONFIG_NET_SCH_RED=m -CONFIG_NET_SCH_SFB=m -CONFIG_NET_SCH_SFQ=m -CONFIG_NET_SCH_TEQL=m -CONFIG_NET_SCH_TBF=m -CONFIG_NET_SCH_CBS=m -CONFIG_NET_SCH_ETF=m -CONFIG_NET_SCH_TAPRIO=m -CONFIG_NET_SCH_GRED=m -CONFIG_NET_SCH_DSMARK=m -CONFIG_NET_SCH_NETEM=m -CONFIG_NET_SCH_DRR=m -CONFIG_NET_SCH_MQPRIO=m -CONFIG_NET_SCH_SKBPRIO=m -CONFIG_NET_SCH_CHOKE=m -CONFIG_NET_SCH_QFQ=m -CONFIG_NET_SCH_CODEL=m -CONFIG_NET_SCH_FQ_CODEL=y -CONFIG_NET_SCH_CAKE=m -CONFIG_NET_SCH_FQ=m -CONFIG_NET_SCH_HHF=m -CONFIG_NET_SCH_PIE=m -CONFIG_NET_SCH_FQ_PIE=m -CONFIG_NET_SCH_INGRESS=m -CONFIG_NET_SCH_PLUG=m -CONFIG_NET_SCH_ETS=m -CONFIG_NET_SCH_DEFAULT=y -# CONFIG_DEFAULT_FQ is not set -# CONFIG_DEFAULT_CODEL is not set -CONFIG_DEFAULT_FQ_CODEL=y -# CONFIG_DEFAULT_SFQ is not set -# CONFIG_DEFAULT_PFIFO_FAST is not set -CONFIG_DEFAULT_NET_SCH="fq_codel" - -# -# Classification -# -CONFIG_NET_CLS=y -CONFIG_NET_CLS_BASIC=m -CONFIG_NET_CLS_TCINDEX=m -CONFIG_NET_CLS_ROUTE4=m -CONFIG_NET_CLS_FW=m -CONFIG_NET_CLS_U32=m -CONFIG_CLS_U32_PERF=y -CONFIG_CLS_U32_MARK=y -CONFIG_NET_CLS_RSVP=m -CONFIG_NET_CLS_RSVP6=m -CONFIG_NET_CLS_FLOW=m -CONFIG_NET_CLS_CGROUP=m -CONFIG_NET_CLS_BPF=m -CONFIG_NET_CLS_FLOWER=m -CONFIG_NET_CLS_MATCHALL=m -CONFIG_NET_EMATCH=y -CONFIG_NET_EMATCH_STACK=32 -CONFIG_NET_EMATCH_CMP=m -CONFIG_NET_EMATCH_NBYTE=m -CONFIG_NET_EMATCH_U32=m 
-CONFIG_NET_EMATCH_META=m -CONFIG_NET_EMATCH_TEXT=m -CONFIG_NET_EMATCH_CANID=m -CONFIG_NET_EMATCH_IPSET=m -CONFIG_NET_EMATCH_IPT=m -CONFIG_NET_CLS_ACT=y -CONFIG_NET_ACT_POLICE=m -CONFIG_NET_ACT_GACT=m -CONFIG_GACT_PROB=y -CONFIG_NET_ACT_MIRRED=m -CONFIG_NET_ACT_SAMPLE=m -CONFIG_NET_ACT_IPT=m -CONFIG_NET_ACT_NAT=m -CONFIG_NET_ACT_PEDIT=m -CONFIG_NET_ACT_SIMP=m -CONFIG_NET_ACT_SKBEDIT=m -CONFIG_NET_ACT_CSUM=m -CONFIG_NET_ACT_MPLS=m -CONFIG_NET_ACT_VLAN=m -CONFIG_NET_ACT_BPF=m -CONFIG_NET_ACT_CONNMARK=m -CONFIG_NET_ACT_CTINFO=m -CONFIG_NET_ACT_SKBMOD=m -CONFIG_NET_ACT_IFE=m -CONFIG_NET_ACT_TUNNEL_KEY=m -CONFIG_NET_ACT_CT=m -CONFIG_NET_IFE_SKBMARK=m -CONFIG_NET_IFE_SKBPRIO=m -CONFIG_NET_IFE_SKBTCINDEX=m -CONFIG_NET_TC_SKB_EXT=y -CONFIG_NET_SCH_FIFO=y -CONFIG_DCB=y -CONFIG_DNS_RESOLVER=m -CONFIG_BATMAN_ADV=m -CONFIG_BATMAN_ADV_BATMAN_V=y -CONFIG_BATMAN_ADV_BLA=y -CONFIG_BATMAN_ADV_DAT=y -CONFIG_BATMAN_ADV_NC=y -CONFIG_BATMAN_ADV_MCAST=y -# CONFIG_BATMAN_ADV_DEBUGFS is not set -# CONFIG_BATMAN_ADV_DEBUG is not set -CONFIG_BATMAN_ADV_SYSFS=y -# CONFIG_BATMAN_ADV_TRACING is not set -CONFIG_OPENVSWITCH=m -CONFIG_OPENVSWITCH_GRE=m -CONFIG_OPENVSWITCH_VXLAN=m -CONFIG_OPENVSWITCH_GENEVE=m -CONFIG_VSOCKETS=m -CONFIG_VSOCKETS_DIAG=m -CONFIG_VSOCKETS_LOOPBACK=m -CONFIG_VMWARE_VMCI_VSOCKETS=m -CONFIG_VIRTIO_VSOCKETS=m -CONFIG_VIRTIO_VSOCKETS_COMMON=m -CONFIG_HYPERV_VSOCKETS=m -CONFIG_NETLINK_DIAG=m -CONFIG_MPLS=y -CONFIG_NET_MPLS_GSO=m -CONFIG_MPLS_ROUTING=m -CONFIG_MPLS_IPTUNNEL=m -CONFIG_NET_NSH=m -CONFIG_HSR=m -CONFIG_NET_SWITCHDEV=y -CONFIG_NET_L3_MASTER_DEV=y -CONFIG_NET_NCSI=y -CONFIG_NCSI_OEM_CMD_GET_MAC=y -CONFIG_RPS=y -CONFIG_RFS_ACCEL=y -CONFIG_XPS=y -CONFIG_CGROUP_NET_PRIO=y -CONFIG_CGROUP_NET_CLASSID=y -CONFIG_NET_RX_BUSY_POLL=y -CONFIG_BQL=y -CONFIG_BPF_JIT=y -CONFIG_BPF_STREAM_PARSER=y -CONFIG_NET_FLOW_LIMIT=y - -# -# Network testing -# -CONFIG_NET_PKTGEN=m -CONFIG_NET_DROP_MONITOR=y -# end of Network testing -# end of Networking options - -CONFIG_HAMRADIO=y - -# -# 
Packet Radio protocols -# -CONFIG_AX25=m -CONFIG_AX25_DAMA_SLAVE=y -CONFIG_NETROM=m -CONFIG_ROSE=m - -# -# AX.25 network device drivers -# -CONFIG_MKISS=m -CONFIG_6PACK=m -CONFIG_BPQETHER=m -CONFIG_BAYCOM_SER_FDX=m -CONFIG_BAYCOM_SER_HDX=m -CONFIG_BAYCOM_PAR=m -CONFIG_YAM=m -# end of AX.25 network device drivers - -CONFIG_CAN=m -CONFIG_CAN_RAW=m -CONFIG_CAN_BCM=m -CONFIG_CAN_GW=m -CONFIG_CAN_J1939=m - -# -# CAN Device Drivers -# -CONFIG_CAN_VCAN=m -CONFIG_CAN_VXCAN=m -CONFIG_CAN_SLCAN=m -CONFIG_CAN_DEV=m -CONFIG_CAN_CALC_BITTIMING=y -CONFIG_CAN_FLEXCAN=m -CONFIG_CAN_GRCAN=m -CONFIG_CAN_JANZ_ICAN3=m -CONFIG_CAN_KVASER_PCIEFD=m -CONFIG_CAN_C_CAN=m -CONFIG_CAN_C_CAN_PLATFORM=m -CONFIG_CAN_C_CAN_PCI=m -CONFIG_CAN_CC770=m -# CONFIG_CAN_CC770_ISA is not set -CONFIG_CAN_CC770_PLATFORM=m -CONFIG_CAN_IFI_CANFD=m -CONFIG_CAN_M_CAN=m -CONFIG_CAN_M_CAN_PLATFORM=m -CONFIG_CAN_M_CAN_TCAN4X5X=m -CONFIG_CAN_PEAK_PCIEFD=m -CONFIG_CAN_SJA1000=m -CONFIG_CAN_EMS_PCI=m -# CONFIG_CAN_EMS_PCMCIA is not set -CONFIG_CAN_F81601=m -CONFIG_CAN_KVASER_PCI=m -CONFIG_CAN_PEAK_PCI=m -CONFIG_CAN_PEAK_PCIEC=y -CONFIG_CAN_PEAK_PCMCIA=m -CONFIG_CAN_PLX_PCI=m -# CONFIG_CAN_SJA1000_ISA is not set -CONFIG_CAN_SJA1000_PLATFORM=m -CONFIG_CAN_SOFTING=m -CONFIG_CAN_SOFTING_CS=m - -# -# CAN SPI interfaces -# -CONFIG_CAN_HI311X=m -CONFIG_CAN_MCP251X=m -# end of CAN SPI interfaces - -# -# CAN USB interfaces -# -CONFIG_CAN_8DEV_USB=m -CONFIG_CAN_EMS_USB=m -CONFIG_CAN_ESD_USB2=m -CONFIG_CAN_GS_USB=m -CONFIG_CAN_KVASER_USB=m -CONFIG_CAN_MCBA_USB=m -CONFIG_CAN_PEAK_USB=m -CONFIG_CAN_UCAN=m -# end of CAN USB interfaces - -# CONFIG_CAN_DEBUG_DEVICES is not set -# end of CAN Device Drivers - -CONFIG_BT=m -CONFIG_BT_BREDR=y -CONFIG_BT_RFCOMM=m -CONFIG_BT_RFCOMM_TTY=y -CONFIG_BT_BNEP=m -CONFIG_BT_BNEP_MC_FILTER=y -CONFIG_BT_BNEP_PROTO_FILTER=y -CONFIG_BT_CMTP=m -CONFIG_BT_HIDP=m -CONFIG_BT_HS=y -CONFIG_BT_LE=y -CONFIG_BT_6LOWPAN=m -CONFIG_BT_LEDS=y -# CONFIG_BT_SELFTEST is not set -# CONFIG_BT_DEBUGFS is not set - -# 
-# Bluetooth device drivers -# -CONFIG_BT_INTEL=m -CONFIG_BT_BCM=m -CONFIG_BT_RTL=m -CONFIG_BT_QCA=m -CONFIG_BT_HCIBTUSB=m -CONFIG_BT_HCIBTUSB_AUTOSUSPEND=y -CONFIG_BT_HCIBTUSB_BCM=y -CONFIG_BT_HCIBTUSB_MTK=y -CONFIG_BT_HCIBTUSB_RTL=y -CONFIG_BT_HCIBTSDIO=m -CONFIG_BT_HCIUART=m -CONFIG_BT_HCIUART_SERDEV=y -CONFIG_BT_HCIUART_H4=y -CONFIG_BT_HCIUART_NOKIA=m -CONFIG_BT_HCIUART_BCSP=y -CONFIG_BT_HCIUART_ATH3K=y -CONFIG_BT_HCIUART_LL=y -CONFIG_BT_HCIUART_3WIRE=y -CONFIG_BT_HCIUART_INTEL=y -CONFIG_BT_HCIUART_BCM=y -CONFIG_BT_HCIUART_RTL=y -CONFIG_BT_HCIUART_QCA=y -CONFIG_BT_HCIUART_AG6XX=y -CONFIG_BT_HCIUART_MRVL=y -CONFIG_BT_HCIBCM203X=m -CONFIG_BT_HCIBPA10X=m -CONFIG_BT_HCIBFUSB=m -CONFIG_BT_HCIDTL1=m -CONFIG_BT_HCIBT3C=m -CONFIG_BT_HCIBLUECARD=m -CONFIG_BT_HCIVHCI=m -CONFIG_BT_MRVL=m -CONFIG_BT_MRVL_SDIO=m -CONFIG_BT_ATH3K=m -CONFIG_BT_MTKSDIO=m -CONFIG_BT_MTKUART=m -CONFIG_BT_HCIRSI=m -# end of Bluetooth device drivers - -CONFIG_AF_RXRPC=m -CONFIG_AF_RXRPC_IPV6=y -# CONFIG_AF_RXRPC_INJECT_LOSS is not set -# CONFIG_AF_RXRPC_DEBUG is not set -CONFIG_RXKAD=y -CONFIG_AF_KCM=m -CONFIG_STREAM_PARSER=y -CONFIG_FIB_RULES=y -CONFIG_WIRELESS=y -CONFIG_WIRELESS_EXT=y -CONFIG_WEXT_CORE=y -CONFIG_WEXT_PROC=y -CONFIG_WEXT_SPY=y -CONFIG_WEXT_PRIV=y -CONFIG_CFG80211=m -# CONFIG_NL80211_TESTMODE is not set -# CONFIG_CFG80211_DEVELOPER_WARNINGS is not set -# CONFIG_CFG80211_CERTIFICATION_ONUS is not set -CONFIG_CFG80211_REQUIRE_SIGNED_REGDB=y -CONFIG_CFG80211_USE_KERNEL_REGDB_KEYS=y -CONFIG_CFG80211_DEFAULT_PS=y -# CONFIG_CFG80211_DEBUGFS is not set -CONFIG_CFG80211_CRDA_SUPPORT=y -CONFIG_CFG80211_WEXT=y -CONFIG_CFG80211_WEXT_EXPORT=y -CONFIG_LIB80211=m -CONFIG_LIB80211_CRYPT_WEP=m -CONFIG_LIB80211_CRYPT_CCMP=m -CONFIG_LIB80211_CRYPT_TKIP=m -# CONFIG_LIB80211_DEBUG is not set -CONFIG_MAC80211=m -CONFIG_MAC80211_HAS_RC=y -CONFIG_MAC80211_RC_MINSTREL=y -CONFIG_MAC80211_RC_DEFAULT_MINSTREL=y -CONFIG_MAC80211_RC_DEFAULT="minstrel_ht" -CONFIG_MAC80211_MESH=y -CONFIG_MAC80211_LEDS=y -# 
CONFIG_MAC80211_DEBUGFS is not set -# CONFIG_MAC80211_MESSAGE_TRACING is not set -# CONFIG_MAC80211_DEBUG_MENU is not set -CONFIG_MAC80211_STA_HASH_MAX_SIZE=0 -CONFIG_WIMAX=m -CONFIG_WIMAX_DEBUG_LEVEL=8 -CONFIG_RFKILL=m -CONFIG_RFKILL_LEDS=y -CONFIG_RFKILL_INPUT=y -CONFIG_RFKILL_GPIO=m -CONFIG_NET_9P=m -CONFIG_NET_9P_VIRTIO=m -CONFIG_NET_9P_XEN=m -CONFIG_NET_9P_RDMA=m -# CONFIG_NET_9P_DEBUG is not set -CONFIG_CAIF=m -# CONFIG_CAIF_DEBUG is not set -CONFIG_CAIF_NETDEV=m -CONFIG_CAIF_USB=m -CONFIG_CEPH_LIB=m -CONFIG_CEPH_LIB_PRETTYDEBUG=y -CONFIG_CEPH_LIB_USE_DNS_RESOLVER=y -CONFIG_NFC=m -CONFIG_NFC_DIGITAL=m -CONFIG_NFC_NCI=m -CONFIG_NFC_NCI_SPI=m -CONFIG_NFC_NCI_UART=m -CONFIG_NFC_HCI=m -CONFIG_NFC_SHDLC=y - -# -# Near Field Communication (NFC) devices -# -CONFIG_NFC_TRF7970A=m -CONFIG_NFC_MEI_PHY=m -CONFIG_NFC_SIM=m -CONFIG_NFC_PORT100=m -CONFIG_NFC_FDP=m -CONFIG_NFC_FDP_I2C=m -CONFIG_NFC_PN544=m -CONFIG_NFC_PN544_I2C=m -CONFIG_NFC_PN544_MEI=m -CONFIG_NFC_PN533=m -CONFIG_NFC_PN533_USB=m -CONFIG_NFC_PN533_I2C=m -CONFIG_NFC_PN532_UART=m -CONFIG_NFC_MICROREAD=m -CONFIG_NFC_MICROREAD_I2C=m -CONFIG_NFC_MICROREAD_MEI=m -CONFIG_NFC_MRVL=m -CONFIG_NFC_MRVL_USB=m -CONFIG_NFC_MRVL_UART=m -CONFIG_NFC_MRVL_I2C=m -CONFIG_NFC_MRVL_SPI=m -CONFIG_NFC_ST21NFCA=m -CONFIG_NFC_ST21NFCA_I2C=m -CONFIG_NFC_ST_NCI=m -CONFIG_NFC_ST_NCI_I2C=m -CONFIG_NFC_ST_NCI_SPI=m -CONFIG_NFC_NXP_NCI=m -CONFIG_NFC_NXP_NCI_I2C=m -CONFIG_NFC_S3FWRN5=m -CONFIG_NFC_S3FWRN5_I2C=m -CONFIG_NFC_ST95HF=m -# end of Near Field Communication (NFC) devices - -CONFIG_PSAMPLE=m -CONFIG_NET_IFE=m -CONFIG_LWTUNNEL=y -CONFIG_LWTUNNEL_BPF=y -CONFIG_DST_CACHE=y -CONFIG_GRO_CELLS=y -CONFIG_SOCK_VALIDATE_XMIT=y -CONFIG_NET_SOCK_MSG=y -CONFIG_NET_DEVLINK=y -CONFIG_PAGE_POOL=y -CONFIG_FAILOVER=m -CONFIG_ETHTOOL_NETLINK=y -CONFIG_HAVE_EBPF_JIT=y - -# -# Device Drivers -# -CONFIG_HAVE_EISA=y -# CONFIG_EISA is not set -CONFIG_HAVE_PCI=y -CONFIG_PCI=y -CONFIG_PCI_DOMAINS=y -CONFIG_PCIEPORTBUS=y -CONFIG_HOTPLUG_PCI_PCIE=y 
-CONFIG_PCIEAER=y -# CONFIG_PCIEAER_INJECT is not set -CONFIG_PCIE_ECRC=y -CONFIG_PCIEASPM=y -CONFIG_PCIEASPM_DEFAULT=y -# CONFIG_PCIEASPM_POWERSAVE is not set -# CONFIG_PCIEASPM_POWER_SUPERSAVE is not set -# CONFIG_PCIEASPM_PERFORMANCE is not set -CONFIG_PCIE_PME=y -CONFIG_PCIE_DPC=y -CONFIG_PCIE_PTM=y -# CONFIG_PCIE_BW is not set -CONFIG_PCIE_EDR=y -CONFIG_PCI_MSI=y -CONFIG_PCI_MSI_IRQ_DOMAIN=y -CONFIG_PCI_QUIRKS=y -# CONFIG_PCI_DEBUG is not set -CONFIG_PCI_REALLOC_ENABLE_AUTO=y -CONFIG_PCI_STUB=y -CONFIG_PCI_PF_STUB=m -CONFIG_XEN_PCIDEV_FRONTEND=m -CONFIG_PCI_ATS=y -CONFIG_PCI_ECAM=y -CONFIG_PCI_LOCKLESS_CONFIG=y -CONFIG_PCI_IOV=y -CONFIG_PCI_PRI=y -CONFIG_PCI_PASID=y -CONFIG_PCI_P2PDMA=y -CONFIG_PCI_LABEL=y -CONFIG_PCI_HYPERV=m -CONFIG_HOTPLUG_PCI=y -CONFIG_HOTPLUG_PCI_ACPI=y -CONFIG_HOTPLUG_PCI_ACPI_IBM=m -CONFIG_HOTPLUG_PCI_CPCI=y -CONFIG_HOTPLUG_PCI_CPCI_ZT5550=m -CONFIG_HOTPLUG_PCI_CPCI_GENERIC=m -CONFIG_HOTPLUG_PCI_SHPC=y - -# -# PCI controller drivers -# -CONFIG_PCI_FTPCI100=y -CONFIG_PCI_HOST_COMMON=y -CONFIG_PCI_HOST_GENERIC=y -CONFIG_PCIE_XILINX=y -CONFIG_VMD=m -CONFIG_PCI_HYPERV_INTERFACE=m - -# -# DesignWare PCI Core Support -# -CONFIG_PCIE_DW=y -CONFIG_PCIE_DW_HOST=y -CONFIG_PCIE_DW_EP=y -CONFIG_PCIE_DW_PLAT=y -CONFIG_PCIE_DW_PLAT_HOST=y -CONFIG_PCIE_DW_PLAT_EP=y -CONFIG_PCIE_INTEL_GW=y -CONFIG_PCI_MESON=y -# end of DesignWare PCI Core Support - -# -# Mobiveil PCIe Core Support -# -# end of Mobiveil PCIe Core Support - -# -# Cadence PCIe controllers support -# -CONFIG_PCIE_CADENCE=y -CONFIG_PCIE_CADENCE_HOST=y -CONFIG_PCIE_CADENCE_EP=y -CONFIG_PCIE_CADENCE_PLAT=y -CONFIG_PCIE_CADENCE_PLAT_HOST=y -CONFIG_PCIE_CADENCE_PLAT_EP=y -# end of Cadence PCIe controllers support -# end of PCI controller drivers - -# -# PCI Endpoint -# -CONFIG_PCI_ENDPOINT=y -CONFIG_PCI_ENDPOINT_CONFIGFS=y -# CONFIG_PCI_EPF_TEST is not set -# end of PCI Endpoint - -# -# PCI switch controller drivers -# -CONFIG_PCI_SW_SWITCHTEC=m -# end of PCI switch controller drivers - 
-CONFIG_PCCARD=m -CONFIG_PCMCIA=m -CONFIG_PCMCIA_LOAD_CIS=y -CONFIG_CARDBUS=y - -# -# PC-card bridges -# -CONFIG_YENTA=m -CONFIG_YENTA_O2=y -CONFIG_YENTA_RICOH=y -CONFIG_YENTA_TI=y -CONFIG_YENTA_ENE_TUNE=y -CONFIG_YENTA_TOSHIBA=y -CONFIG_PD6729=m -CONFIG_I82092=m -CONFIG_PCCARD_NONSTATIC=y -CONFIG_RAPIDIO=m -CONFIG_RAPIDIO_TSI721=m -CONFIG_RAPIDIO_DISC_TIMEOUT=30 -CONFIG_RAPIDIO_ENABLE_RX_TX_PORTS=y -CONFIG_RAPIDIO_DMA_ENGINE=y -# CONFIG_RAPIDIO_DEBUG is not set -CONFIG_RAPIDIO_ENUM_BASIC=m -CONFIG_RAPIDIO_CHMAN=m -CONFIG_RAPIDIO_MPORT_CDEV=m - -# -# RapidIO Switch drivers -# -CONFIG_RAPIDIO_TSI57X=m -CONFIG_RAPIDIO_CPS_XX=m -CONFIG_RAPIDIO_TSI568=m -CONFIG_RAPIDIO_CPS_GEN2=m -CONFIG_RAPIDIO_RXS_GEN3=m -# end of RapidIO Switch drivers - -# -# Generic Driver Options -# -# CONFIG_UEVENT_HELPER is not set -CONFIG_DEVTMPFS=y -CONFIG_DEVTMPFS_MOUNT=y -CONFIG_STANDALONE=y -CONFIG_PREVENT_FIRMWARE_BUILD=y - -# -# Firmware loader -# -CONFIG_FW_LOADER=y -CONFIG_FW_LOADER_PAGED_BUF=y -CONFIG_EXTRA_FIRMWARE="" -# CONFIG_FW_LOADER_USER_HELPER is not set -CONFIG_FW_LOADER_COMPRESS=y -CONFIG_FW_CACHE=y -# end of Firmware loader - -CONFIG_WANT_DEV_COREDUMP=y -CONFIG_ALLOW_DEV_COREDUMP=y -CONFIG_DEV_COREDUMP=y -# CONFIG_DEBUG_DRIVER is not set -# CONFIG_DEBUG_DEVRES is not set -# CONFIG_DEBUG_TEST_DRIVER_REMOVE is not set -CONFIG_HMEM_REPORTING=y -# CONFIG_TEST_ASYNC_DRIVER_PROBE is not set -CONFIG_SYS_HYPERVISOR=y -CONFIG_GENERIC_CPU_AUTOPROBE=y -CONFIG_GENERIC_CPU_VULNERABILITIES=y -CONFIG_REGMAP=y -CONFIG_REGMAP_I2C=y -CONFIG_REGMAP_SLIMBUS=m -CONFIG_REGMAP_SPI=y -CONFIG_REGMAP_SPMI=m -CONFIG_REGMAP_W1=m -CONFIG_REGMAP_MMIO=y -CONFIG_REGMAP_IRQ=y -CONFIG_REGMAP_SOUNDWIRE=m -CONFIG_REGMAP_SCCB=m -CONFIG_REGMAP_I3C=m -CONFIG_DMA_SHARED_BUFFER=y -# CONFIG_DMA_FENCE_TRACE is not set -# end of Generic Driver Options - -# -# Bus devices -# -CONFIG_MOXTET=m -CONFIG_SIMPLE_PM_BUS=y -CONFIG_MHI_BUS=m -# end of Bus devices - -CONFIG_CONNECTOR=y -CONFIG_PROC_EVENTS=y -CONFIG_GNSS=m 
-CONFIG_GNSS_SERIAL=m -CONFIG_GNSS_MTK_SERIAL=m -CONFIG_GNSS_SIRF_SERIAL=m -CONFIG_GNSS_UBX_SERIAL=m -CONFIG_MTD=m -CONFIG_MTD_TESTS=m - -# -# Partition parsers -# -CONFIG_MTD_AR7_PARTS=m -CONFIG_MTD_CMDLINE_PARTS=m -CONFIG_MTD_OF_PARTS=m -CONFIG_MTD_REDBOOT_PARTS=m -CONFIG_MTD_REDBOOT_DIRECTORY_BLOCK=-1 -# CONFIG_MTD_REDBOOT_PARTS_UNALLOCATED is not set -# CONFIG_MTD_REDBOOT_PARTS_READONLY is not set -# end of Partition parsers - -# -# User Modules And Translation Layers -# -CONFIG_MTD_BLKDEVS=m -CONFIG_MTD_BLOCK=m -CONFIG_MTD_BLOCK_RO=m -CONFIG_FTL=m -CONFIG_NFTL=m -CONFIG_NFTL_RW=y -CONFIG_INFTL=m -CONFIG_RFD_FTL=m -CONFIG_SSFDC=m -CONFIG_SM_FTL=m -CONFIG_MTD_OOPS=m -CONFIG_MTD_SWAP=m -CONFIG_MTD_PARTITIONED_MASTER=y - -# -# RAM/ROM/Flash chip drivers -# -CONFIG_MTD_CFI=m -CONFIG_MTD_JEDECPROBE=m -CONFIG_MTD_GEN_PROBE=m -# CONFIG_MTD_CFI_ADV_OPTIONS is not set -CONFIG_MTD_MAP_BANK_WIDTH_1=y -CONFIG_MTD_MAP_BANK_WIDTH_2=y -CONFIG_MTD_MAP_BANK_WIDTH_4=y -CONFIG_MTD_CFI_I1=y -CONFIG_MTD_CFI_I2=y -CONFIG_MTD_CFI_INTELEXT=m -CONFIG_MTD_CFI_AMDSTD=m -CONFIG_MTD_CFI_STAA=m -CONFIG_MTD_CFI_UTIL=m -CONFIG_MTD_RAM=m -CONFIG_MTD_ROM=m -CONFIG_MTD_ABSENT=m -# end of RAM/ROM/Flash chip drivers - -# -# Mapping drivers for chip access -# -CONFIG_MTD_COMPLEX_MAPPINGS=y -CONFIG_MTD_PHYSMAP=m -# CONFIG_MTD_PHYSMAP_COMPAT is not set -CONFIG_MTD_PHYSMAP_OF=y -CONFIG_MTD_PHYSMAP_VERSATILE=y -CONFIG_MTD_PHYSMAP_GEMINI=y -CONFIG_MTD_PHYSMAP_GPIO_ADDR=y -CONFIG_MTD_SBC_GXX=m -CONFIG_MTD_AMD76XROM=m -CONFIG_MTD_ICHXROM=m -CONFIG_MTD_ESB2ROM=m -CONFIG_MTD_CK804XROM=m -CONFIG_MTD_SCB2_FLASH=m -CONFIG_MTD_NETtel=m -CONFIG_MTD_L440GX=m -CONFIG_MTD_PCI=m -CONFIG_MTD_PCMCIA=m -# CONFIG_MTD_PCMCIA_ANONYMOUS is not set -CONFIG_MTD_INTEL_VR_NOR=m -CONFIG_MTD_PLATRAM=m -# end of Mapping drivers for chip access - -# -# Self-contained MTD device drivers -# -CONFIG_MTD_PMC551=m -# CONFIG_MTD_PMC551_BUGFIX is not set -# CONFIG_MTD_PMC551_DEBUG is not set -CONFIG_MTD_DATAFLASH=m -# 
CONFIG_MTD_DATAFLASH_WRITE_VERIFY is not set -CONFIG_MTD_DATAFLASH_OTP=y -CONFIG_MTD_MCHP23K256=m -CONFIG_MTD_SST25L=m -CONFIG_MTD_SLRAM=m -CONFIG_MTD_PHRAM=m -CONFIG_MTD_MTDRAM=m -CONFIG_MTDRAM_TOTAL_SIZE=4096 -CONFIG_MTDRAM_ERASE_SIZE=128 -CONFIG_MTD_BLOCK2MTD=m - -# -# Disk-On-Chip Device Drivers -# -CONFIG_MTD_DOCG3=m -CONFIG_BCH_CONST_M=14 -CONFIG_BCH_CONST_T=4 -# end of Self-contained MTD device drivers - -CONFIG_MTD_NAND_CORE=m -CONFIG_MTD_ONENAND=m -# CONFIG_MTD_ONENAND_VERIFY_WRITE is not set -CONFIG_MTD_ONENAND_GENERIC=m -CONFIG_MTD_ONENAND_OTP=y -CONFIG_MTD_ONENAND_2X_PROGRAM=y -CONFIG_MTD_NAND_ECC_SW_HAMMING=m -CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC=y -CONFIG_MTD_RAW_NAND=m -CONFIG_MTD_NAND_ECC_SW_BCH=y - -# -# Raw/parallel NAND flash controllers -# -CONFIG_MTD_NAND_DENALI=m -CONFIG_MTD_NAND_DENALI_PCI=m -CONFIG_MTD_NAND_DENALI_DT=m -CONFIG_MTD_NAND_CAFE=m -CONFIG_MTD_NAND_MXIC=m -CONFIG_MTD_NAND_GPIO=m -CONFIG_MTD_NAND_PLATFORM=m -CONFIG_MTD_NAND_CADENCE=m - -# -# Misc -# -CONFIG_MTD_SM_COMMON=m -CONFIG_MTD_NAND_NANDSIM=m -CONFIG_MTD_NAND_RICOH=m -CONFIG_MTD_NAND_DISKONCHIP=m -# CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADVANCED is not set -CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADDRESS=0 -CONFIG_MTD_NAND_DISKONCHIP_BBTWRITE=y -CONFIG_MTD_SPI_NAND=m - -# -# LPDDR & LPDDR2 PCM memory drivers -# -CONFIG_MTD_LPDDR=m -CONFIG_MTD_QINFO_PROBE=m -# end of LPDDR & LPDDR2 PCM memory drivers - -CONFIG_MTD_SPI_NOR=m -CONFIG_MTD_SPI_NOR_USE_4K_SECTORS=y -CONFIG_SPI_INTEL_SPI=m -CONFIG_SPI_INTEL_SPI_PCI=m -CONFIG_SPI_INTEL_SPI_PLATFORM=m -CONFIG_MTD_UBI=m -CONFIG_MTD_UBI_WL_THRESHOLD=4096 -CONFIG_MTD_UBI_BEB_LIMIT=20 -CONFIG_MTD_UBI_FASTMAP=y -CONFIG_MTD_UBI_GLUEBI=m -CONFIG_MTD_UBI_BLOCK=y -CONFIG_MTD_HYPERBUS=m -CONFIG_DTC=y -CONFIG_OF=y -# CONFIG_OF_UNITTEST is not set -CONFIG_OF_FLATTREE=y -CONFIG_OF_KOBJ=y -CONFIG_OF_DYNAMIC=y -CONFIG_OF_ADDRESS=y -CONFIG_OF_IRQ=y -CONFIG_OF_NET=y -CONFIG_OF_MDIO=m -CONFIG_OF_RESOLVE=y -CONFIG_OF_OVERLAY=y -CONFIG_ARCH_MIGHT_HAVE_PC_PARPORT=y 
-CONFIG_PARPORT=m -CONFIG_PARPORT_PC=m -CONFIG_PARPORT_SERIAL=m -CONFIG_PARPORT_PC_FIFO=y -CONFIG_PARPORT_PC_SUPERIO=y -CONFIG_PARPORT_PC_PCMCIA=m -CONFIG_PARPORT_AX88796=m -CONFIG_PARPORT_1284=y -CONFIG_PARPORT_NOT_PC=y -CONFIG_PNP=y -CONFIG_PNP_DEBUG_MESSAGES=y - -# -# Protocols -# -CONFIG_PNPACPI=y -CONFIG_BLK_DEV=y -# CONFIG_BLK_DEV_NULL_BLK is not set -CONFIG_BLK_DEV_FD=m -CONFIG_CDROM=m -# CONFIG_PARIDE is not set -CONFIG_BLK_DEV_PCIESSD_MTIP32XX=m -CONFIG_ZRAM=m -CONFIG_ZRAM_WRITEBACK=y -# CONFIG_ZRAM_MEMORY_TRACKING is not set -CONFIG_BLK_DEV_UMEM=m -CONFIG_BLK_DEV_LOOP=m -CONFIG_BLK_DEV_LOOP_MIN_COUNT=8 -CONFIG_BLK_DEV_CRYPTOLOOP=m -CONFIG_BLK_DEV_DRBD=m -# CONFIG_DRBD_FAULT_INJECTION is not set -CONFIG_BLK_DEV_NBD=m -CONFIG_BLK_DEV_SKD=m -CONFIG_BLK_DEV_SX8=m -CONFIG_BLK_DEV_RAM=m -CONFIG_BLK_DEV_RAM_COUNT=16 -CONFIG_BLK_DEV_RAM_SIZE=16384 -CONFIG_CDROM_PKTCDVD=m -CONFIG_CDROM_PKTCDVD_BUFFERS=8 -# CONFIG_CDROM_PKTCDVD_WCACHE is not set -CONFIG_ATA_OVER_ETH=m -CONFIG_XEN_BLKDEV_FRONTEND=m -CONFIG_XEN_BLKDEV_BACKEND=m -CONFIG_VIRTIO_BLK=m -CONFIG_BLK_DEV_RBD=m -CONFIG_BLK_DEV_RSXX=m - -# -# NVME Support -# -CONFIG_NVME_CORE=y -CONFIG_BLK_DEV_NVME=y -CONFIG_NVME_MULTIPATH=y -CONFIG_NVME_HWMON=y -CONFIG_NVME_FABRICS=m -CONFIG_NVME_RDMA=m -CONFIG_NVME_FC=m -CONFIG_NVME_TCP=m -CONFIG_NVME_TARGET=m -CONFIG_NVME_TARGET_LOOP=m -CONFIG_NVME_TARGET_RDMA=m -CONFIG_NVME_TARGET_FC=m -CONFIG_NVME_TARGET_FCLOOP=m -CONFIG_NVME_TARGET_TCP=m -# end of NVME Support - -# -# Misc devices -# -CONFIG_SENSORS_LIS3LV02D=m -CONFIG_AD525X_DPOT=m -CONFIG_AD525X_DPOT_I2C=m -CONFIG_AD525X_DPOT_SPI=m -# CONFIG_DUMMY_IRQ is not set -CONFIG_IBM_ASM=m -CONFIG_PHANTOM=m -CONFIG_TIFM_CORE=m -CONFIG_TIFM_7XX1=m -CONFIG_ICS932S401=m -CONFIG_ENCLOSURE_SERVICES=m -CONFIG_HP_ILO=m -CONFIG_APDS9802ALS=m -CONFIG_ISL29003=m -CONFIG_ISL29020=m -CONFIG_SENSORS_TSL2550=m -CONFIG_SENSORS_BH1770=m -CONFIG_SENSORS_APDS990X=m -CONFIG_HMC6352=m -CONFIG_DS1682=m -CONFIG_VMWARE_BALLOON=m 
-CONFIG_LATTICE_ECP3_CONFIG=m -# CONFIG_SRAM is not set -CONFIG_PCI_ENDPOINT_TEST=m -CONFIG_XILINX_SDFEC=m -CONFIG_MISC_RTSX=m -CONFIG_PVPANIC=m -CONFIG_C2PORT=m -CONFIG_C2PORT_DURAMAR_2150=m - -# -# EEPROM support -# -CONFIG_EEPROM_AT24=m -# CONFIG_EEPROM_AT25 is not set -CONFIG_EEPROM_LEGACY=m -CONFIG_EEPROM_MAX6875=m -CONFIG_EEPROM_93CX6=m -# CONFIG_EEPROM_93XX46 is not set -CONFIG_EEPROM_IDT_89HPESX=m -CONFIG_EEPROM_EE1004=m -# end of EEPROM support - -CONFIG_CB710_CORE=m -# CONFIG_CB710_DEBUG is not set -CONFIG_CB710_DEBUG_ASSUMPTIONS=y - -# -# Texas Instruments shared transport line discipline -# -CONFIG_TI_ST=m -# end of Texas Instruments shared transport line discipline - -CONFIG_SENSORS_LIS3_I2C=m -CONFIG_ALTERA_STAPL=m -CONFIG_INTEL_MEI=m -CONFIG_INTEL_MEI_ME=m -CONFIG_INTEL_MEI_TXE=m -CONFIG_INTEL_MEI_HDCP=m -CONFIG_VMWARE_VMCI=m - -# -# Intel MIC & related support -# -CONFIG_INTEL_MIC_BUS=m -CONFIG_SCIF_BUS=m -CONFIG_VOP_BUS=m -CONFIG_INTEL_MIC_HOST=m -CONFIG_INTEL_MIC_CARD=m -CONFIG_SCIF=m -CONFIG_MIC_COSM=m -CONFIG_VOP=m -# end of Intel MIC & related support - -CONFIG_GENWQE=m -CONFIG_GENWQE_PLATFORM_ERROR_RECOVERY=0 -CONFIG_ECHO=m -CONFIG_MISC_ALCOR_PCI=m -CONFIG_MISC_RTSX_PCI=m -CONFIG_MISC_RTSX_USB=m -CONFIG_HABANA_AI=m -CONFIG_UACCE=m -# end of Misc devices - -CONFIG_HAVE_IDE=y -# CONFIG_IDE is not set - -# -# SCSI device support -# -CONFIG_SCSI_MOD=y -CONFIG_RAID_ATTRS=m -CONFIG_SCSI=y -CONFIG_SCSI_DMA=y -CONFIG_SCSI_NETLINK=y -CONFIG_SCSI_PROC_FS=y - -# -# SCSI support type (disk, tape, CD-ROM) -# -CONFIG_BLK_DEV_SD=y -CONFIG_CHR_DEV_ST=m -CONFIG_BLK_DEV_SR=m -CONFIG_CHR_DEV_SG=m -CONFIG_CHR_DEV_SCH=m -CONFIG_SCSI_ENCLOSURE=m -CONFIG_SCSI_CONSTANTS=y -CONFIG_SCSI_LOGGING=y -CONFIG_SCSI_SCAN_ASYNC=y - -# -# SCSI Transports -# -CONFIG_SCSI_SPI_ATTRS=m -CONFIG_SCSI_FC_ATTRS=m -CONFIG_SCSI_ISCSI_ATTRS=m -CONFIG_SCSI_SAS_ATTRS=m -CONFIG_SCSI_SAS_LIBSAS=m -CONFIG_SCSI_SAS_ATA=y -CONFIG_SCSI_SAS_HOST_SMP=y -CONFIG_SCSI_SRP_ATTRS=m -# end of SCSI 
Transports - -CONFIG_SCSI_LOWLEVEL=y -CONFIG_ISCSI_TCP=m -CONFIG_ISCSI_BOOT_SYSFS=m -CONFIG_SCSI_CXGB3_ISCSI=m -CONFIG_SCSI_CXGB4_ISCSI=m -CONFIG_SCSI_BNX2_ISCSI=m -CONFIG_SCSI_BNX2X_FCOE=m -CONFIG_BE2ISCSI=m -CONFIG_BLK_DEV_3W_XXXX_RAID=m -CONFIG_SCSI_HPSA=m -CONFIG_SCSI_3W_9XXX=m -CONFIG_SCSI_3W_SAS=m -CONFIG_SCSI_ACARD=m -CONFIG_SCSI_AACRAID=m -CONFIG_SCSI_AIC7XXX=m -CONFIG_AIC7XXX_CMDS_PER_DEVICE=32 -CONFIG_AIC7XXX_RESET_DELAY_MS=15000 -CONFIG_AIC7XXX_DEBUG_ENABLE=y -CONFIG_AIC7XXX_DEBUG_MASK=0 -CONFIG_AIC7XXX_REG_PRETTY_PRINT=y -CONFIG_SCSI_AIC79XX=m -CONFIG_AIC79XX_CMDS_PER_DEVICE=32 -CONFIG_AIC79XX_RESET_DELAY_MS=15000 -CONFIG_AIC79XX_DEBUG_ENABLE=y -CONFIG_AIC79XX_DEBUG_MASK=0 -CONFIG_AIC79XX_REG_PRETTY_PRINT=y -CONFIG_SCSI_AIC94XX=m -CONFIG_AIC94XX_DEBUG=y -CONFIG_SCSI_MVSAS=m -CONFIG_SCSI_MVSAS_DEBUG=y -CONFIG_SCSI_MVSAS_TASKLET=y -CONFIG_SCSI_MVUMI=m -CONFIG_SCSI_DPT_I2O=m -CONFIG_SCSI_ADVANSYS=m -CONFIG_SCSI_ARCMSR=m -CONFIG_SCSI_ESAS2R=m -CONFIG_MEGARAID_NEWGEN=y -CONFIG_MEGARAID_MM=m -CONFIG_MEGARAID_MAILBOX=m -CONFIG_MEGARAID_LEGACY=m -CONFIG_MEGARAID_SAS=m -CONFIG_SCSI_MPT3SAS=m -CONFIG_SCSI_MPT2SAS_MAX_SGE=128 -CONFIG_SCSI_MPT3SAS_MAX_SGE=128 -CONFIG_SCSI_MPT2SAS=m -CONFIG_SCSI_SMARTPQI=m -CONFIG_SCSI_UFSHCD=m -CONFIG_SCSI_UFSHCD_PCI=m -# CONFIG_SCSI_UFS_DWC_TC_PCI is not set -CONFIG_SCSI_UFSHCD_PLATFORM=m -CONFIG_SCSI_UFS_CDNS_PLATFORM=m -# CONFIG_SCSI_UFS_DWC_TC_PLATFORM is not set -CONFIG_SCSI_UFS_BSG=y -CONFIG_SCSI_HPTIOP=m -CONFIG_SCSI_BUSLOGIC=m -CONFIG_SCSI_FLASHPOINT=y -CONFIG_SCSI_MYRB=m -CONFIG_SCSI_MYRS=m -CONFIG_VMWARE_PVSCSI=m -CONFIG_XEN_SCSI_FRONTEND=m -CONFIG_HYPERV_STORAGE=m -CONFIG_LIBFC=m -CONFIG_LIBFCOE=m -CONFIG_FCOE=m -CONFIG_FCOE_FNIC=m -CONFIG_SCSI_SNIC=m -# CONFIG_SCSI_SNIC_DEBUG_FS is not set -CONFIG_SCSI_DMX3191D=m -CONFIG_SCSI_FDOMAIN=m -CONFIG_SCSI_FDOMAIN_PCI=m -CONFIG_SCSI_GDTH=m -CONFIG_SCSI_ISCI=m -CONFIG_SCSI_IPS=m -CONFIG_SCSI_INITIO=m -CONFIG_SCSI_INIA100=m -CONFIG_SCSI_PPA=m -CONFIG_SCSI_IMM=m -# 
CONFIG_SCSI_IZIP_EPP16 is not set -# CONFIG_SCSI_IZIP_SLOW_CTR is not set -CONFIG_SCSI_STEX=m -CONFIG_SCSI_SYM53C8XX_2=m -CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=1 -CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16 -CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64 -CONFIG_SCSI_SYM53C8XX_MMIO=y -CONFIG_SCSI_IPR=m -CONFIG_SCSI_IPR_TRACE=y -CONFIG_SCSI_IPR_DUMP=y -CONFIG_SCSI_QLOGIC_1280=m -CONFIG_SCSI_QLA_FC=m -CONFIG_TCM_QLA2XXX=m -# CONFIG_TCM_QLA2XXX_DEBUG is not set -CONFIG_SCSI_QLA_ISCSI=m -CONFIG_QEDI=m -CONFIG_QEDF=m -CONFIG_SCSI_LPFC=m -# CONFIG_SCSI_LPFC_DEBUG_FS is not set -CONFIG_SCSI_DC395x=m -CONFIG_SCSI_AM53C974=m -CONFIG_SCSI_WD719X=m -CONFIG_SCSI_DEBUG=m -CONFIG_SCSI_PMCRAID=m -CONFIG_SCSI_PM8001=m -CONFIG_SCSI_BFA_FC=m -CONFIG_SCSI_VIRTIO=m -CONFIG_SCSI_CHELSIO_FCOE=m -CONFIG_SCSI_LOWLEVEL_PCMCIA=y -CONFIG_PCMCIA_AHA152X=m -CONFIG_PCMCIA_FDOMAIN=m -CONFIG_PCMCIA_QLOGIC=m -CONFIG_PCMCIA_SYM53C500=m -CONFIG_SCSI_DH=y -CONFIG_SCSI_DH_RDAC=m -CONFIG_SCSI_DH_HP_SW=m -CONFIG_SCSI_DH_EMC=m -CONFIG_SCSI_DH_ALUA=m -# end of SCSI device support - -CONFIG_ATA=y -CONFIG_SATA_HOST=y -CONFIG_PATA_TIMINGS=y -CONFIG_ATA_VERBOSE_ERROR=y -CONFIG_ATA_FORCE=y -CONFIG_ATA_ACPI=y -CONFIG_SATA_ZPODD=y -CONFIG_SATA_PMP=y - -# -# Controllers with non-SFF native interface -# -CONFIG_SATA_AHCI=y -CONFIG_SATA_MOBILE_LPM_POLICY=3 -CONFIG_SATA_AHCI_PLATFORM=m -CONFIG_AHCI_CEVA=m -CONFIG_AHCI_QORIQ=m -CONFIG_SATA_INIC162X=m -CONFIG_SATA_ACARD_AHCI=m -CONFIG_SATA_SIL24=m -CONFIG_ATA_SFF=y - -# -# SFF controllers with custom DMA interface -# -CONFIG_PDC_ADMA=m -CONFIG_SATA_QSTOR=m -CONFIG_SATA_SX4=m -CONFIG_ATA_BMDMA=y - -# -# SATA SFF controllers with BMDMA -# -CONFIG_ATA_PIIX=m -CONFIG_SATA_DWC=m -# CONFIG_SATA_DWC_OLD_DMA is not set -# CONFIG_SATA_DWC_DEBUG is not set -CONFIG_SATA_MV=m -CONFIG_SATA_NV=m -CONFIG_SATA_PROMISE=m -CONFIG_SATA_SIL=m -CONFIG_SATA_SIS=m -CONFIG_SATA_SVW=m -CONFIG_SATA_ULI=m -CONFIG_SATA_VIA=m -CONFIG_SATA_VITESSE=m - -# -# PATA SFF controllers with BMDMA -# -CONFIG_PATA_ALI=m 
-CONFIG_PATA_AMD=m -CONFIG_PATA_ARTOP=m -CONFIG_PATA_ATIIXP=m -CONFIG_PATA_ATP867X=m -CONFIG_PATA_CMD64X=m -CONFIG_PATA_CYPRESS=m -CONFIG_PATA_EFAR=m -CONFIG_PATA_HPT366=m -CONFIG_PATA_HPT37X=m -CONFIG_PATA_HPT3X2N=m -CONFIG_PATA_HPT3X3=m -CONFIG_PATA_HPT3X3_DMA=y -CONFIG_PATA_IT8213=m -CONFIG_PATA_IT821X=m -CONFIG_PATA_JMICRON=m -CONFIG_PATA_MARVELL=m -CONFIG_PATA_NETCELL=m -CONFIG_PATA_NINJA32=m -CONFIG_PATA_NS87415=m -CONFIG_PATA_OLDPIIX=m -CONFIG_PATA_OPTIDMA=m -CONFIG_PATA_PDC2027X=m -CONFIG_PATA_PDC_OLD=m -CONFIG_PATA_RADISYS=m -CONFIG_PATA_RDC=m -CONFIG_PATA_SCH=m -CONFIG_PATA_SERVERWORKS=m -CONFIG_PATA_SIL680=m -CONFIG_PATA_SIS=m -CONFIG_PATA_TOSHIBA=m -CONFIG_PATA_TRIFLEX=m -CONFIG_PATA_VIA=m -CONFIG_PATA_WINBOND=m - -# -# PIO-only SFF controllers -# -CONFIG_PATA_CMD640_PCI=m -CONFIG_PATA_MPIIX=m -CONFIG_PATA_NS87410=m -CONFIG_PATA_OPTI=m -CONFIG_PATA_PCMCIA=m -# CONFIG_PATA_PLATFORM is not set -CONFIG_PATA_RZ1000=m - -# -# Generic fallback / legacy drivers -# -CONFIG_PATA_ACPI=m -CONFIG_ATA_GENERIC=m -CONFIG_PATA_LEGACY=m -CONFIG_MD=y -CONFIG_BLK_DEV_MD=m -CONFIG_MD_LINEAR=m -CONFIG_MD_RAID0=m -CONFIG_MD_RAID1=m -CONFIG_MD_RAID10=m -CONFIG_MD_RAID456=m -CONFIG_MD_MULTIPATH=m -CONFIG_MD_FAULTY=m -CONFIG_MD_CLUSTER=m -CONFIG_BCACHE=m -# CONFIG_BCACHE_DEBUG is not set -# CONFIG_BCACHE_CLOSURES_DEBUG is not set -CONFIG_BLK_DEV_DM_BUILTIN=y -CONFIG_BLK_DEV_DM=m -# CONFIG_DM_DEBUG is not set -CONFIG_DM_BUFIO=m -# CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING is not set -CONFIG_DM_BIO_PRISON=m -CONFIG_DM_PERSISTENT_DATA=m -CONFIG_DM_UNSTRIPED=m -CONFIG_DM_CRYPT=m -CONFIG_DM_SNAPSHOT=m -CONFIG_DM_THIN_PROVISIONING=m -CONFIG_DM_CACHE=m -CONFIG_DM_CACHE_SMQ=m -CONFIG_DM_WRITECACHE=m -CONFIG_DM_ERA=m -CONFIG_DM_CLONE=m -CONFIG_DM_MIRROR=m -CONFIG_DM_LOG_USERSPACE=m -CONFIG_DM_RAID=m -CONFIG_DM_ZERO=m -CONFIG_DM_MULTIPATH=m -CONFIG_DM_MULTIPATH_QL=m -CONFIG_DM_MULTIPATH_ST=m -CONFIG_DM_DELAY=m -CONFIG_DM_DUST=m -CONFIG_DM_UEVENT=y -CONFIG_DM_FLAKEY=m -CONFIG_DM_VERITY=m 
-CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG=y -CONFIG_DM_VERITY_FEC=y -CONFIG_DM_SWITCH=m -CONFIG_DM_LOG_WRITES=m -CONFIG_DM_INTEGRITY=m -CONFIG_DM_ZONED=m -CONFIG_TARGET_CORE=m -CONFIG_TCM_IBLOCK=m -CONFIG_TCM_FILEIO=m -CONFIG_TCM_PSCSI=m -CONFIG_TCM_USER2=m -CONFIG_LOOPBACK_TARGET=m -CONFIG_TCM_FC=m -CONFIG_ISCSI_TARGET=m -CONFIG_ISCSI_TARGET_CXGB4=m -CONFIG_SBP_TARGET=m -CONFIG_FUSION=y -CONFIG_FUSION_SPI=m -CONFIG_FUSION_FC=m -CONFIG_FUSION_SAS=m -CONFIG_FUSION_MAX_SGE=128 -CONFIG_FUSION_CTL=m -CONFIG_FUSION_LAN=m -# CONFIG_FUSION_LOGGING is not set - -# -# IEEE 1394 (FireWire) support -# -CONFIG_FIREWIRE=m -CONFIG_FIREWIRE_OHCI=m -CONFIG_FIREWIRE_SBP2=m -CONFIG_FIREWIRE_NET=m -CONFIG_FIREWIRE_NOSY=m -# end of IEEE 1394 (FireWire) support - -CONFIG_MACINTOSH_DRIVERS=y -CONFIG_MAC_EMUMOUSEBTN=m -CONFIG_NETDEVICES=y -CONFIG_MII=m -CONFIG_NET_CORE=y -CONFIG_BONDING=m -CONFIG_DUMMY=m -CONFIG_WIREGUARD=m -# CONFIG_WIREGUARD_DEBUG is not set -CONFIG_EQUALIZER=m -CONFIG_NET_FC=y -CONFIG_IFB=m -CONFIG_NET_TEAM=m -CONFIG_NET_TEAM_MODE_BROADCAST=m -CONFIG_NET_TEAM_MODE_ROUNDROBIN=m -CONFIG_NET_TEAM_MODE_RANDOM=m -CONFIG_NET_TEAM_MODE_ACTIVEBACKUP=m -CONFIG_NET_TEAM_MODE_LOADBALANCE=m -CONFIG_MACVLAN=m -CONFIG_MACVTAP=m -CONFIG_IPVLAN_L3S=y -CONFIG_IPVLAN=m -CONFIG_IPVTAP=m -CONFIG_VXLAN=m -CONFIG_GENEVE=m -CONFIG_BAREUDP=m -CONFIG_GTP=m -CONFIG_MACSEC=m -CONFIG_NETCONSOLE=m -CONFIG_NETCONSOLE_DYNAMIC=y -CONFIG_NETPOLL=y -CONFIG_NET_POLL_CONTROLLER=y -CONFIG_NTB_NETDEV=m -CONFIG_RIONET=m -CONFIG_RIONET_TX_SIZE=128 -CONFIG_RIONET_RX_SIZE=128 -CONFIG_TUN=m -CONFIG_TAP=m -# CONFIG_TUN_VNET_CROSS_LE is not set -CONFIG_VETH=m -CONFIG_VIRTIO_NET=m -CONFIG_NLMON=m -CONFIG_NET_VRF=m -CONFIG_VSOCKMON=m -CONFIG_SUNGEM_PHY=m -# CONFIG_ARCNET is not set -CONFIG_ATM_DRIVERS=y -# CONFIG_ATM_DUMMY is not set -CONFIG_ATM_TCP=m -CONFIG_ATM_LANAI=m -CONFIG_ATM_ENI=m -# CONFIG_ATM_ENI_DEBUG is not set -# CONFIG_ATM_ENI_TUNE_BURST is not set -CONFIG_ATM_FIRESTREAM=m -CONFIG_ATM_ZATM=m -# 
CONFIG_ATM_ZATM_DEBUG is not set -CONFIG_ATM_NICSTAR=m -# CONFIG_ATM_NICSTAR_USE_SUNI is not set -# CONFIG_ATM_NICSTAR_USE_IDT77105 is not set -CONFIG_ATM_IDT77252=m -# CONFIG_ATM_IDT77252_DEBUG is not set -# CONFIG_ATM_IDT77252_RCV_ALL is not set -CONFIG_ATM_IDT77252_USE_SUNI=y -CONFIG_ATM_AMBASSADOR=m -# CONFIG_ATM_AMBASSADOR_DEBUG is not set -CONFIG_ATM_HORIZON=m -# CONFIG_ATM_HORIZON_DEBUG is not set -CONFIG_ATM_IA=m -# CONFIG_ATM_IA_DEBUG is not set -CONFIG_ATM_FORE200E=m -CONFIG_ATM_FORE200E_USE_TASKLET=y -CONFIG_ATM_FORE200E_TX_RETRY=16 -CONFIG_ATM_FORE200E_DEBUG=0 -CONFIG_ATM_HE=m -CONFIG_ATM_HE_USE_SUNI=y -CONFIG_ATM_SOLOS=m -CONFIG_CAIF_DRIVERS=y -CONFIG_CAIF_TTY=m -CONFIG_CAIF_SPI_SLAVE=m -CONFIG_CAIF_SPI_SYNC=y -CONFIG_CAIF_HSI=m -CONFIG_CAIF_VIRTIO=m - -# -# Distributed Switch Architecture drivers -# -CONFIG_B53=m -# CONFIG_B53_SPI_DRIVER is not set -CONFIG_B53_MDIO_DRIVER=m -CONFIG_B53_MMAP_DRIVER=m -CONFIG_B53_SRAB_DRIVER=m -CONFIG_B53_SERDES=m -CONFIG_NET_DSA_BCM_SF2=m -CONFIG_NET_DSA_LOOP=m -CONFIG_NET_DSA_LANTIQ_GSWIP=m -CONFIG_NET_DSA_MT7530=m -CONFIG_NET_DSA_MV88E6060=m -CONFIG_NET_DSA_MICROCHIP_KSZ_COMMON=m -CONFIG_NET_DSA_MICROCHIP_KSZ9477=m -CONFIG_NET_DSA_MICROCHIP_KSZ9477_I2C=m -CONFIG_NET_DSA_MICROCHIP_KSZ9477_SPI=m -CONFIG_NET_DSA_MICROCHIP_KSZ8795=m -CONFIG_NET_DSA_MICROCHIP_KSZ8795_SPI=m -CONFIG_NET_DSA_MV88E6XXX=m -CONFIG_NET_DSA_MV88E6XXX_GLOBAL2=y -CONFIG_NET_DSA_MV88E6XXX_PTP=y -CONFIG_NET_DSA_AR9331=m -CONFIG_NET_DSA_SJA1105=m -CONFIG_NET_DSA_SJA1105_PTP=y -CONFIG_NET_DSA_SJA1105_TAS=y -CONFIG_NET_DSA_QCA8K=m -CONFIG_NET_DSA_REALTEK_SMI=m -CONFIG_NET_DSA_SMSC_LAN9303=m -CONFIG_NET_DSA_SMSC_LAN9303_I2C=m -CONFIG_NET_DSA_SMSC_LAN9303_MDIO=m -CONFIG_NET_DSA_VITESSE_VSC73XX=m -CONFIG_NET_DSA_VITESSE_VSC73XX_SPI=m -CONFIG_NET_DSA_VITESSE_VSC73XX_PLATFORM=m -# end of Distributed Switch Architecture drivers - -CONFIG_ETHERNET=y -CONFIG_MDIO=m -CONFIG_NET_VENDOR_3COM=y -CONFIG_PCMCIA_3C574=m -CONFIG_PCMCIA_3C589=m -CONFIG_VORTEX=m 
-CONFIG_TYPHOON=m -CONFIG_NET_VENDOR_ADAPTEC=y -CONFIG_ADAPTEC_STARFIRE=m -CONFIG_NET_VENDOR_AGERE=y -CONFIG_ET131X=m -CONFIG_NET_VENDOR_ALACRITECH=y -CONFIG_SLICOSS=m -CONFIG_NET_VENDOR_ALTEON=y -CONFIG_ACENIC=m -# CONFIG_ACENIC_OMIT_TIGON_I is not set -CONFIG_ALTERA_TSE=m -CONFIG_NET_VENDOR_AMAZON=y -CONFIG_ENA_ETHERNET=m -CONFIG_NET_VENDOR_AMD=y -CONFIG_AMD8111_ETH=m -CONFIG_PCNET32=m -CONFIG_PCMCIA_NMCLAN=m -CONFIG_AMD_XGBE=m -CONFIG_AMD_XGBE_DCB=y -CONFIG_AMD_XGBE_HAVE_ECC=y -CONFIG_NET_VENDOR_AQUANTIA=y -CONFIG_AQTION=m -CONFIG_NET_VENDOR_ARC=y -CONFIG_NET_VENDOR_ATHEROS=y -CONFIG_ATL2=m -CONFIG_ATL1=m -CONFIG_ATL1E=m -CONFIG_ATL1C=m -CONFIG_ALX=m -CONFIG_NET_VENDOR_AURORA=y -CONFIG_AURORA_NB8800=m -CONFIG_NET_VENDOR_BROADCOM=y -CONFIG_B44=m -CONFIG_B44_PCI_AUTOSELECT=y -CONFIG_B44_PCICORE_AUTOSELECT=y -CONFIG_B44_PCI=y -CONFIG_BCMGENET=m -CONFIG_BNX2=m -CONFIG_CNIC=m -CONFIG_TIGON3=m -CONFIG_TIGON3_HWMON=y -CONFIG_BNX2X=m -CONFIG_BNX2X_SRIOV=y -CONFIG_SYSTEMPORT=m -CONFIG_BNXT=m -CONFIG_BNXT_SRIOV=y -CONFIG_BNXT_FLOWER_OFFLOAD=y -CONFIG_BNXT_DCB=y -CONFIG_BNXT_HWMON=y -CONFIG_NET_VENDOR_BROCADE=y -CONFIG_BNA=m -CONFIG_NET_VENDOR_CADENCE=y -CONFIG_MACB=m -CONFIG_MACB_USE_HWSTAMP=y -CONFIG_MACB_PCI=m -CONFIG_NET_VENDOR_CAVIUM=y -CONFIG_THUNDER_NIC_PF=m -CONFIG_THUNDER_NIC_VF=m -CONFIG_THUNDER_NIC_BGX=m -CONFIG_THUNDER_NIC_RGX=m -CONFIG_CAVIUM_PTP=m -CONFIG_LIQUIDIO=m -CONFIG_LIQUIDIO_VF=m -CONFIG_NET_VENDOR_CHELSIO=y -CONFIG_CHELSIO_T1=m -CONFIG_CHELSIO_T1_1G=y -CONFIG_CHELSIO_T3=m -CONFIG_CHELSIO_T4=m -CONFIG_CHELSIO_T4_DCB=y -CONFIG_CHELSIO_T4_FCOE=y -CONFIG_CHELSIO_T4VF=m -CONFIG_CHELSIO_LIB=m -CONFIG_NET_VENDOR_CISCO=y -CONFIG_ENIC=m -CONFIG_NET_VENDOR_CORTINA=y -CONFIG_GEMINI_ETHERNET=m -CONFIG_CX_ECAT=m -CONFIG_DNET=m -CONFIG_NET_VENDOR_DEC=y -CONFIG_NET_TULIP=y -CONFIG_DE2104X=m -CONFIG_DE2104X_DSL=0 -CONFIG_TULIP=m -CONFIG_TULIP_MWI=y -CONFIG_TULIP_MMIO=y -CONFIG_TULIP_NAPI=y -CONFIG_TULIP_NAPI_HW_MITIGATION=y -CONFIG_DE4X5=m -CONFIG_WINBOND_840=m 
-CONFIG_DM9102=m -CONFIG_ULI526X=m -CONFIG_PCMCIA_XIRCOM=m -CONFIG_NET_VENDOR_DLINK=y -CONFIG_DL2K=m -CONFIG_SUNDANCE=m -# CONFIG_SUNDANCE_MMIO is not set -CONFIG_NET_VENDOR_EMULEX=y -CONFIG_BE2NET=m -CONFIG_BE2NET_HWMON=y -CONFIG_BE2NET_BE2=y -CONFIG_BE2NET_BE3=y -CONFIG_BE2NET_LANCER=y -CONFIG_BE2NET_SKYHAWK=y -CONFIG_NET_VENDOR_EZCHIP=y -CONFIG_EZCHIP_NPS_MANAGEMENT_ENET=m -CONFIG_NET_VENDOR_FUJITSU=y -CONFIG_PCMCIA_FMVJ18X=m -CONFIG_NET_VENDOR_GOOGLE=y -CONFIG_GVE=m -CONFIG_NET_VENDOR_HUAWEI=y -CONFIG_HINIC=m -CONFIG_NET_VENDOR_I825XX=y -CONFIG_NET_VENDOR_INTEL=y -CONFIG_E100=m -CONFIG_E1000=m -CONFIG_E1000E=m -CONFIG_E1000E_HWTS=y -CONFIG_IGB=m -CONFIG_IGB_HWMON=y -CONFIG_IGB_DCA=y -CONFIG_IGBVF=m -CONFIG_IXGB=m -CONFIG_IXGBE=m -CONFIG_IXGBE_HWMON=y -CONFIG_IXGBE_DCA=y -CONFIG_IXGBE_DCB=y -# CONFIG_IXGBE_IPSEC is not set -CONFIG_IXGBEVF=m -CONFIG_IXGBEVF_IPSEC=y -CONFIG_I40E=m -CONFIG_I40E_DCB=y -CONFIG_IAVF=m -CONFIG_I40EVF=m -CONFIG_ICE=m -CONFIG_FM10K=m -CONFIG_IGC=m -CONFIG_JME=m -CONFIG_NET_VENDOR_MARVELL=y -CONFIG_MVMDIO=m -CONFIG_SKGE=m -# CONFIG_SKGE_DEBUG is not set -CONFIG_SKGE_GENESIS=y -CONFIG_SKY2=m -# CONFIG_SKY2_DEBUG is not set -CONFIG_NET_VENDOR_MELLANOX=y -CONFIG_MLX4_EN=m -CONFIG_MLX4_EN_DCB=y -CONFIG_MLX4_CORE=m -CONFIG_MLX4_DEBUG=y -CONFIG_MLX4_CORE_GEN2=y -CONFIG_MLX5_CORE=m -CONFIG_MLX5_ACCEL=y -CONFIG_MLX5_FPGA=y -CONFIG_MLX5_CORE_EN=y -CONFIG_MLX5_EN_ARFS=y -CONFIG_MLX5_EN_RXNFC=y -CONFIG_MLX5_MPFS=y -CONFIG_MLX5_ESWITCH=y -CONFIG_MLX5_TC_CT=y -CONFIG_MLX5_CORE_EN_DCB=y -CONFIG_MLX5_CORE_IPOIB=y -CONFIG_MLX5_FPGA_IPSEC=y -CONFIG_MLX5_EN_IPSEC=y -CONFIG_MLX5_FPGA_TLS=y -CONFIG_MLX5_TLS=y -CONFIG_MLX5_EN_TLS=y -CONFIG_MLX5_SW_STEERING=y -CONFIG_MLXSW_CORE=m -CONFIG_MLXSW_CORE_HWMON=y -CONFIG_MLXSW_CORE_THERMAL=y -CONFIG_MLXSW_PCI=m -CONFIG_MLXSW_I2C=m -CONFIG_MLXSW_SWITCHIB=m -CONFIG_MLXSW_SWITCHX2=m -CONFIG_MLXSW_SPECTRUM=m -CONFIG_MLXSW_SPECTRUM_DCB=y -CONFIG_MLXSW_MINIMAL=m -CONFIG_MLXFW=m -CONFIG_NET_VENDOR_MICREL=y -CONFIG_KS8842=m 
-CONFIG_KS8851=m -CONFIG_KS8851_MLL=m -CONFIG_KSZ884X_PCI=m -CONFIG_NET_VENDOR_MICROCHIP=y -CONFIG_ENC28J60=m -# CONFIG_ENC28J60_WRITEVERIFY is not set -CONFIG_ENCX24J600=m -CONFIG_LAN743X=m -CONFIG_NET_VENDOR_MICROSEMI=y -CONFIG_MSCC_OCELOT_SWITCH=m -CONFIG_MSCC_OCELOT_SWITCH_OCELOT=m -CONFIG_NET_VENDOR_MYRI=y -CONFIG_MYRI10GE=m -CONFIG_MYRI10GE_DCA=y -CONFIG_FEALNX=m -CONFIG_NET_VENDOR_NATSEMI=y -CONFIG_NATSEMI=m -CONFIG_NS83820=m -CONFIG_NET_VENDOR_NETERION=y -CONFIG_S2IO=m -CONFIG_VXGE=m -# CONFIG_VXGE_DEBUG_TRACE_ALL is not set -CONFIG_NET_VENDOR_NETRONOME=y -CONFIG_NFP=m -CONFIG_NFP_APP_FLOWER=y -CONFIG_NFP_APP_ABM_NIC=y -# CONFIG_NFP_DEBUG is not set -CONFIG_NET_VENDOR_NI=y -CONFIG_NI_XGE_MANAGEMENT_ENET=m -CONFIG_NET_VENDOR_8390=y -CONFIG_PCMCIA_AXNET=m -CONFIG_NE2K_PCI=m -CONFIG_PCMCIA_PCNET=m -CONFIG_NET_VENDOR_NVIDIA=y -CONFIG_FORCEDETH=m -CONFIG_NET_VENDOR_OKI=y -CONFIG_ETHOC=m -CONFIG_NET_VENDOR_PACKET_ENGINES=y -CONFIG_HAMACHI=m -CONFIG_YELLOWFIN=m -CONFIG_NET_VENDOR_PENSANDO=y -CONFIG_IONIC=m -CONFIG_NET_VENDOR_QLOGIC=y -CONFIG_QLA3XXX=m -CONFIG_QLCNIC=m -CONFIG_QLCNIC_SRIOV=y -CONFIG_QLCNIC_DCB=y -CONFIG_QLCNIC_HWMON=y -CONFIG_NETXEN_NIC=m -CONFIG_QED=m -CONFIG_QED_LL2=y -CONFIG_QED_SRIOV=y -CONFIG_QEDE=m -CONFIG_QED_RDMA=y -CONFIG_QED_ISCSI=y -CONFIG_QED_FCOE=y -CONFIG_QED_OOO=y -CONFIG_NET_VENDOR_QUALCOMM=y -CONFIG_QCA7000=m -CONFIG_QCA7000_SPI=m -CONFIG_QCA7000_UART=m -CONFIG_QCOM_EMAC=m -CONFIG_RMNET=m -CONFIG_NET_VENDOR_RDC=y -CONFIG_R6040=m -CONFIG_NET_VENDOR_REALTEK=y -CONFIG_ATP=m -CONFIG_8139CP=m -CONFIG_8139TOO=m -# CONFIG_8139TOO_PIO is not set -CONFIG_8139TOO_TUNE_TWISTER=y -CONFIG_8139TOO_8129=y -# CONFIG_8139_OLD_RX_RESET is not set -CONFIG_R8169=m -CONFIG_NET_VENDOR_RENESAS=y -CONFIG_NET_VENDOR_ROCKER=y -CONFIG_ROCKER=m -CONFIG_NET_VENDOR_SAMSUNG=y -CONFIG_SXGBE_ETH=m -CONFIG_NET_VENDOR_SEEQ=y -CONFIG_NET_VENDOR_SOLARFLARE=y -CONFIG_SFC=m -CONFIG_SFC_MTD=y -CONFIG_SFC_MCDI_MON=y -CONFIG_SFC_SRIOV=y -CONFIG_SFC_MCDI_LOGGING=y 
-CONFIG_SFC_FALCON=m -CONFIG_SFC_FALCON_MTD=y -CONFIG_NET_VENDOR_SILAN=y -CONFIG_SC92031=m -CONFIG_NET_VENDOR_SIS=y -CONFIG_SIS900=m -CONFIG_SIS190=m -CONFIG_NET_VENDOR_SMSC=y -CONFIG_PCMCIA_SMC91C92=m -CONFIG_EPIC100=m -CONFIG_SMSC911X=m -CONFIG_SMSC9420=m -CONFIG_NET_VENDOR_SOCIONEXT=y -CONFIG_NET_VENDOR_STMICRO=y -CONFIG_STMMAC_ETH=m -# CONFIG_STMMAC_SELFTESTS is not set -CONFIG_STMMAC_PLATFORM=m -CONFIG_DWMAC_DWC_QOS_ETH=m -CONFIG_DWMAC_GENERIC=m -CONFIG_DWMAC_INTEL=m -CONFIG_STMMAC_PCI=m -CONFIG_NET_VENDOR_SUN=y -CONFIG_HAPPYMEAL=m -CONFIG_SUNGEM=m -CONFIG_CASSINI=m -CONFIG_NIU=m -CONFIG_NET_VENDOR_SYNOPSYS=y -CONFIG_DWC_XLGMAC=m -CONFIG_DWC_XLGMAC_PCI=m -CONFIG_NET_VENDOR_TEHUTI=y -CONFIG_TEHUTI=m -CONFIG_NET_VENDOR_TI=y -# CONFIG_TI_CPSW_PHY_SEL is not set -CONFIG_TLAN=m -CONFIG_NET_VENDOR_VIA=y -CONFIG_VIA_RHINE=m -CONFIG_VIA_RHINE_MMIO=y -CONFIG_VIA_VELOCITY=m -CONFIG_NET_VENDOR_WIZNET=y -CONFIG_WIZNET_W5100=m -CONFIG_WIZNET_W5300=m -# CONFIG_WIZNET_BUS_DIRECT is not set -# CONFIG_WIZNET_BUS_INDIRECT is not set -CONFIG_WIZNET_BUS_ANY=y -CONFIG_WIZNET_W5100_SPI=m -CONFIG_NET_VENDOR_XILINX=y -CONFIG_XILINX_AXI_EMAC=m -CONFIG_XILINX_LL_TEMAC=m -CONFIG_NET_VENDOR_XIRCOM=y -CONFIG_PCMCIA_XIRC2PS=m -CONFIG_FDDI=m -CONFIG_DEFXX=m -CONFIG_DEFXX_MMIO=y -CONFIG_SKFP=m -# CONFIG_HIPPI is not set -CONFIG_NET_SB1000=m -CONFIG_MDIO_DEVICE=m -CONFIG_MDIO_BUS=m -CONFIG_MDIO_BCM_UNIMAC=m -CONFIG_MDIO_BITBANG=m -CONFIG_MDIO_BUS_MUX=m -CONFIG_MDIO_BUS_MUX_GPIO=m -CONFIG_MDIO_BUS_MUX_MMIOREG=m -CONFIG_MDIO_BUS_MUX_MULTIPLEXER=m -CONFIG_MDIO_CAVIUM=m -CONFIG_MDIO_GPIO=m -CONFIG_MDIO_HISI_FEMAC=m -CONFIG_MDIO_I2C=m -CONFIG_MDIO_IPQ8064=m -CONFIG_MDIO_MSCC_MIIM=m -CONFIG_MDIO_MVUSB=m -CONFIG_MDIO_OCTEON=m -CONFIG_MDIO_THUNDER=m -CONFIG_MDIO_XPCS=m -CONFIG_PHYLINK=m -CONFIG_PHYLIB=m -CONFIG_SWPHY=y -CONFIG_LED_TRIGGER_PHY=y - -# -# MII PHY device drivers -# -CONFIG_SFP=m -CONFIG_ADIN_PHY=m -CONFIG_AMD_PHY=m -CONFIG_AQUANTIA_PHY=m -CONFIG_AX88796B_PHY=m -CONFIG_BCM7XXX_PHY=m 
-CONFIG_BCM87XX_PHY=m -CONFIG_BCM_NET_PHYLIB=m -CONFIG_BROADCOM_PHY=m -CONFIG_BCM84881_PHY=m -CONFIG_CICADA_PHY=m -CONFIG_CORTINA_PHY=m -CONFIG_DAVICOM_PHY=m -CONFIG_DP83822_PHY=m -CONFIG_DP83TC811_PHY=m -CONFIG_DP83848_PHY=m -CONFIG_DP83867_PHY=m -CONFIG_DP83869_PHY=m -CONFIG_FIXED_PHY=m -CONFIG_ICPLUS_PHY=m -CONFIG_INTEL_XWAY_PHY=m -CONFIG_LSI_ET1011C_PHY=m -CONFIG_LXT_PHY=m -CONFIG_MARVELL_PHY=m -CONFIG_MARVELL_10G_PHY=m -CONFIG_MICREL_PHY=m -CONFIG_MICROCHIP_PHY=m -CONFIG_MICROCHIP_T1_PHY=m -CONFIG_MICROSEMI_PHY=m -CONFIG_NATIONAL_PHY=m -CONFIG_NXP_TJA11XX_PHY=m -CONFIG_AT803X_PHY=m -CONFIG_QSEMI_PHY=m -CONFIG_REALTEK_PHY=m -CONFIG_RENESAS_PHY=m -CONFIG_ROCKCHIP_PHY=m -CONFIG_SMSC_PHY=m -CONFIG_STE10XP=m -CONFIG_TERANETICS_PHY=m -CONFIG_VITESSE_PHY=m -CONFIG_XILINX_GMII2RGMII=m -CONFIG_MICREL_KS8995MA=m -CONFIG_PLIP=m -CONFIG_PPP=m -CONFIG_PPP_BSDCOMP=m -CONFIG_PPP_DEFLATE=m -CONFIG_PPP_FILTER=y -CONFIG_PPP_MPPE=m -CONFIG_PPP_MULTILINK=y -CONFIG_PPPOATM=m -CONFIG_PPPOE=m -CONFIG_PPTP=m -CONFIG_PPPOL2TP=m -CONFIG_PPP_ASYNC=m -CONFIG_PPP_SYNC_TTY=m -CONFIG_SLIP=m -CONFIG_SLHC=m -CONFIG_SLIP_COMPRESSED=y -CONFIG_SLIP_SMART=y -CONFIG_SLIP_MODE_SLIP6=y -CONFIG_USB_NET_DRIVERS=m -CONFIG_USB_CATC=m -CONFIG_USB_KAWETH=m -CONFIG_USB_PEGASUS=m -CONFIG_USB_RTL8150=m -CONFIG_USB_RTL8152=m -CONFIG_USB_LAN78XX=m -CONFIG_USB_USBNET=m -CONFIG_USB_NET_AX8817X=m -CONFIG_USB_NET_AX88179_178A=m -CONFIG_USB_NET_CDCETHER=m -CONFIG_USB_NET_CDC_EEM=m -CONFIG_USB_NET_CDC_NCM=m -CONFIG_USB_NET_HUAWEI_CDC_NCM=m -CONFIG_USB_NET_CDC_MBIM=m -CONFIG_USB_NET_DM9601=m -CONFIG_USB_NET_SR9700=m -CONFIG_USB_NET_SR9800=m -CONFIG_USB_NET_SMSC75XX=m -CONFIG_USB_NET_SMSC95XX=m -CONFIG_USB_NET_GL620A=m -CONFIG_USB_NET_NET1080=m -CONFIG_USB_NET_PLUSB=m -CONFIG_USB_NET_MCS7830=m -CONFIG_USB_NET_RNDIS_HOST=m -CONFIG_USB_NET_CDC_SUBSET_ENABLE=m -CONFIG_USB_NET_CDC_SUBSET=m -CONFIG_USB_ALI_M5632=y -CONFIG_USB_AN2720=y -CONFIG_USB_BELKIN=y -CONFIG_USB_ARMLINUX=y -CONFIG_USB_EPSON2888=y -CONFIG_USB_KC2190=y 
-CONFIG_USB_NET_ZAURUS=m -CONFIG_USB_NET_CX82310_ETH=m -CONFIG_USB_NET_KALMIA=m -CONFIG_USB_NET_QMI_WWAN=m -CONFIG_USB_HSO=m -CONFIG_USB_NET_INT51X1=m -CONFIG_USB_CDC_PHONET=m -CONFIG_USB_IPHETH=m -CONFIG_USB_SIERRA_NET=m -CONFIG_USB_VL600=m -CONFIG_USB_NET_CH9200=m -CONFIG_USB_NET_AQC111=m -CONFIG_WLAN=y -# CONFIG_WIRELESS_WDS is not set -CONFIG_WLAN_VENDOR_ADMTEK=y -CONFIG_ADM8211=m -CONFIG_ATH_COMMON=m -CONFIG_WLAN_VENDOR_ATH=y -# CONFIG_ATH_DEBUG is not set -CONFIG_ATH5K=m -# CONFIG_ATH5K_DEBUG is not set -# CONFIG_ATH5K_TRACER is not set -CONFIG_ATH5K_PCI=y -CONFIG_ATH9K_HW=m -CONFIG_ATH9K_COMMON=m -CONFIG_ATH9K_BTCOEX_SUPPORT=y -CONFIG_ATH9K=m -CONFIG_ATH9K_PCI=y -CONFIG_ATH9K_AHB=y -# CONFIG_ATH9K_DEBUGFS is not set -CONFIG_ATH9K_DYNACK=y -CONFIG_ATH9K_WOW=y -CONFIG_ATH9K_RFKILL=y -CONFIG_ATH9K_CHANNEL_CONTEXT=y -CONFIG_ATH9K_PCOEM=y -CONFIG_ATH9K_PCI_NO_EEPROM=m -CONFIG_ATH9K_HTC=m -# CONFIG_ATH9K_HTC_DEBUGFS is not set -CONFIG_ATH9K_HWRNG=y -CONFIG_CARL9170=m -CONFIG_CARL9170_LEDS=y -CONFIG_CARL9170_WPC=y -# CONFIG_CARL9170_HWRNG is not set -CONFIG_ATH6KL=m -CONFIG_ATH6KL_SDIO=m -CONFIG_ATH6KL_USB=m -# CONFIG_ATH6KL_DEBUG is not set -# CONFIG_ATH6KL_TRACING is not set -CONFIG_AR5523=m -CONFIG_WIL6210=m -CONFIG_WIL6210_ISR_COR=y -CONFIG_WIL6210_TRACING=y -# CONFIG_WIL6210_DEBUGFS is not set -CONFIG_ATH10K=m -CONFIG_ATH10K_CE=y -CONFIG_ATH10K_PCI=m -CONFIG_ATH10K_AHB=y -CONFIG_ATH10K_SDIO=m -CONFIG_ATH10K_USB=m -# CONFIG_ATH10K_DEBUG is not set -# CONFIG_ATH10K_DEBUGFS is not set -# CONFIG_ATH10K_TRACING is not set -CONFIG_WCN36XX=m -# CONFIG_WCN36XX_DEBUGFS is not set -CONFIG_WLAN_VENDOR_ATMEL=y -CONFIG_ATMEL=m -CONFIG_PCI_ATMEL=m -CONFIG_PCMCIA_ATMEL=m -CONFIG_AT76C50X_USB=m -CONFIG_WLAN_VENDOR_BROADCOM=y -CONFIG_B43=m -CONFIG_B43_BCMA=y -CONFIG_B43_SSB=y -CONFIG_B43_BUSES_BCMA_AND_SSB=y -# CONFIG_B43_BUSES_BCMA is not set -# CONFIG_B43_BUSES_SSB is not set -CONFIG_B43_PCI_AUTOSELECT=y -CONFIG_B43_PCICORE_AUTOSELECT=y -CONFIG_B43_SDIO=y 
-CONFIG_B43_BCMA_PIO=y -CONFIG_B43_PIO=y -CONFIG_B43_PHY_G=y -CONFIG_B43_PHY_N=y -CONFIG_B43_PHY_LP=y -CONFIG_B43_PHY_HT=y -CONFIG_B43_LEDS=y -CONFIG_B43_HWRNG=y -# CONFIG_B43_DEBUG is not set -CONFIG_B43LEGACY=m -CONFIG_B43LEGACY_PCI_AUTOSELECT=y -CONFIG_B43LEGACY_PCICORE_AUTOSELECT=y -CONFIG_B43LEGACY_LEDS=y -CONFIG_B43LEGACY_HWRNG=y -CONFIG_B43LEGACY_DEBUG=y -CONFIG_B43LEGACY_DMA=y -CONFIG_B43LEGACY_PIO=y -CONFIG_B43LEGACY_DMA_AND_PIO_MODE=y -# CONFIG_B43LEGACY_DMA_MODE is not set -# CONFIG_B43LEGACY_PIO_MODE is not set -CONFIG_BRCMUTIL=m -CONFIG_BRCMSMAC=m -CONFIG_BRCMFMAC=m -CONFIG_BRCMFMAC_PROTO_BCDC=y -CONFIG_BRCMFMAC_PROTO_MSGBUF=y -CONFIG_BRCMFMAC_SDIO=y -CONFIG_BRCMFMAC_USB=y -CONFIG_BRCMFMAC_PCIE=y -# CONFIG_BRCM_TRACING is not set -CONFIG_BRCMDBG=y -CONFIG_WLAN_VENDOR_CISCO=y -CONFIG_AIRO=m -CONFIG_AIRO_CS=m -CONFIG_WLAN_VENDOR_INTEL=y -CONFIG_IPW2100=m -CONFIG_IPW2100_MONITOR=y -# CONFIG_IPW2100_DEBUG is not set -CONFIG_IPW2200=m -CONFIG_IPW2200_MONITOR=y -CONFIG_IPW2200_RADIOTAP=y -CONFIG_IPW2200_PROMISCUOUS=y -CONFIG_IPW2200_QOS=y -# CONFIG_IPW2200_DEBUG is not set -CONFIG_LIBIPW=m -# CONFIG_LIBIPW_DEBUG is not set -CONFIG_IWLEGACY=m -CONFIG_IWL4965=m -CONFIG_IWL3945=m - -# -# iwl3945 / iwl4965 Debugging Options -# -# CONFIG_IWLEGACY_DEBUG is not set -# end of iwl3945 / iwl4965 Debugging Options - -CONFIG_IWLWIFI=m -CONFIG_IWLWIFI_LEDS=y -CONFIG_IWLDVM=m -CONFIG_IWLMVM=m -CONFIG_IWLWIFI_OPMODE_MODULAR=y -# CONFIG_IWLWIFI_BCAST_FILTERING is not set - -# -# Debugging Options -# -# CONFIG_IWLWIFI_DEBUG is not set -# CONFIG_IWLWIFI_DEVICE_TRACING is not set -# end of Debugging Options - -CONFIG_WLAN_VENDOR_INTERSIL=y -CONFIG_HOSTAP=m -CONFIG_HOSTAP_FIRMWARE=y -CONFIG_HOSTAP_FIRMWARE_NVRAM=y -CONFIG_HOSTAP_PLX=m -CONFIG_HOSTAP_PCI=m -CONFIG_HOSTAP_CS=m -CONFIG_HERMES=m -CONFIG_HERMES_PRISM=y -CONFIG_HERMES_CACHE_FW_ON_INIT=y -CONFIG_PLX_HERMES=m -CONFIG_TMD_HERMES=m -CONFIG_NORTEL_HERMES=m -CONFIG_PCI_HERMES=m -CONFIG_PCMCIA_HERMES=m 
-CONFIG_PCMCIA_SPECTRUM=m -CONFIG_ORINOCO_USB=m -CONFIG_P54_COMMON=m -CONFIG_P54_USB=m -CONFIG_P54_PCI=m -CONFIG_P54_SPI=m -# CONFIG_P54_SPI_DEFAULT_EEPROM is not set -CONFIG_P54_LEDS=y -CONFIG_PRISM54=m -CONFIG_WLAN_VENDOR_MARVELL=y -CONFIG_LIBERTAS=m -CONFIG_LIBERTAS_USB=m -CONFIG_LIBERTAS_CS=m -CONFIG_LIBERTAS_SDIO=m -CONFIG_LIBERTAS_SPI=m -# CONFIG_LIBERTAS_DEBUG is not set -CONFIG_LIBERTAS_MESH=y -CONFIG_LIBERTAS_THINFIRM=m -# CONFIG_LIBERTAS_THINFIRM_DEBUG is not set -CONFIG_LIBERTAS_THINFIRM_USB=m -CONFIG_MWIFIEX=m -CONFIG_MWIFIEX_SDIO=m -CONFIG_MWIFIEX_PCIE=m -CONFIG_MWIFIEX_USB=m -CONFIG_MWL8K=m -CONFIG_WLAN_VENDOR_MEDIATEK=y -CONFIG_MT7601U=m -CONFIG_MT76_CORE=m -CONFIG_MT76_LEDS=y -CONFIG_MT76_USB=m -CONFIG_MT76x02_LIB=m -CONFIG_MT76x02_USB=m -CONFIG_MT76x0_COMMON=m -CONFIG_MT76x0U=m -CONFIG_MT76x0E=m -CONFIG_MT76x2_COMMON=m -CONFIG_MT76x2E=m -CONFIG_MT76x2U=m -CONFIG_MT7603E=m -CONFIG_MT7615E=m -CONFIG_WLAN_VENDOR_RALINK=y -CONFIG_RT2X00=m -CONFIG_RT2400PCI=m -CONFIG_RT2500PCI=m -CONFIG_RT61PCI=m -CONFIG_RT2800PCI=m -CONFIG_RT2800PCI_RT33XX=y -CONFIG_RT2800PCI_RT35XX=y -CONFIG_RT2800PCI_RT53XX=y -CONFIG_RT2800PCI_RT3290=y -CONFIG_RT2500USB=m -CONFIG_RT73USB=m -CONFIG_RT2800USB=m -CONFIG_RT2800USB_RT33XX=y -CONFIG_RT2800USB_RT35XX=y -CONFIG_RT2800USB_RT3573=y -CONFIG_RT2800USB_RT53XX=y -CONFIG_RT2800USB_RT55XX=y -CONFIG_RT2800USB_UNKNOWN=y -CONFIG_RT2800_LIB=m -CONFIG_RT2800_LIB_MMIO=m -CONFIG_RT2X00_LIB_MMIO=m -CONFIG_RT2X00_LIB_PCI=m -CONFIG_RT2X00_LIB_USB=m -CONFIG_RT2X00_LIB=m -CONFIG_RT2X00_LIB_FIRMWARE=y -CONFIG_RT2X00_LIB_CRYPTO=y -CONFIG_RT2X00_LIB_LEDS=y -# CONFIG_RT2X00_DEBUG is not set -CONFIG_WLAN_VENDOR_REALTEK=y -CONFIG_RTL8180=m -CONFIG_RTL8187=m -CONFIG_RTL8187_LEDS=y -CONFIG_RTL_CARDS=m -CONFIG_RTL8192CE=m -CONFIG_RTL8192SE=m -CONFIG_RTL8192DE=m -CONFIG_RTL8723AE=m -CONFIG_RTL8723BE=m -CONFIG_RTL8188EE=m -CONFIG_RTL8192EE=m -CONFIG_RTL8821AE=m -CONFIG_RTL8192CU=m -CONFIG_RTLWIFI=m -CONFIG_RTLWIFI_PCI=m -CONFIG_RTLWIFI_USB=m 
-CONFIG_RTLWIFI_DEBUG=y -CONFIG_RTL8192C_COMMON=m -CONFIG_RTL8723_COMMON=m -CONFIG_RTLBTCOEXIST=m -CONFIG_RTL8XXXU=m -CONFIG_RTL8XXXU_UNTESTED=y -CONFIG_RTW88=m -CONFIG_RTW88_CORE=m -CONFIG_RTW88_PCI=m -CONFIG_RTW88_8822BE=y -CONFIG_RTW88_8822CE=y -# CONFIG_RTW88_DEBUG is not set -# CONFIG_RTW88_DEBUGFS is not set -CONFIG_WLAN_VENDOR_RSI=y -CONFIG_RSI_91X=m -# CONFIG_RSI_DEBUGFS is not set -CONFIG_RSI_SDIO=m -CONFIG_RSI_USB=m -CONFIG_RSI_COEX=y -CONFIG_WLAN_VENDOR_ST=y -CONFIG_CW1200=m -CONFIG_CW1200_WLAN_SDIO=m -CONFIG_CW1200_WLAN_SPI=m -CONFIG_WLAN_VENDOR_TI=y -CONFIG_WL1251=m -CONFIG_WL1251_SPI=m -CONFIG_WL1251_SDIO=m -CONFIG_WL12XX=m -CONFIG_WL18XX=m -CONFIG_WLCORE=m -CONFIG_WLCORE_SPI=m -CONFIG_WLCORE_SDIO=m -CONFIG_WILINK_PLATFORM_DATA=y -CONFIG_WLAN_VENDOR_ZYDAS=y -CONFIG_USB_ZD1201=m -CONFIG_ZD1211RW=m -# CONFIG_ZD1211RW_DEBUG is not set -CONFIG_WLAN_VENDOR_QUANTENNA=y -CONFIG_QTNFMAC=m -CONFIG_QTNFMAC_PCIE=m -CONFIG_PCMCIA_RAYCS=m -CONFIG_PCMCIA_WL3501=m -CONFIG_MAC80211_HWSIM=m -CONFIG_USB_NET_RNDIS_WLAN=m -CONFIG_VIRT_WIFI=m - -# -# WiMAX Wireless Broadband devices -# -CONFIG_WIMAX_I2400M=m -CONFIG_WIMAX_I2400M_USB=m -CONFIG_WIMAX_I2400M_DEBUG_LEVEL=8 -# end of WiMAX Wireless Broadband devices - -# CONFIG_WAN is not set -CONFIG_IEEE802154_DRIVERS=m -CONFIG_IEEE802154_FAKELB=m -CONFIG_IEEE802154_AT86RF230=m -# CONFIG_IEEE802154_AT86RF230_DEBUGFS is not set -CONFIG_IEEE802154_MRF24J40=m -CONFIG_IEEE802154_CC2520=m -CONFIG_IEEE802154_ATUSB=m -CONFIG_IEEE802154_ADF7242=m -CONFIG_IEEE802154_CA8210=m -# CONFIG_IEEE802154_CA8210_DEBUGFS is not set -CONFIG_IEEE802154_MCR20A=m -CONFIG_IEEE802154_HWSIM=m -CONFIG_XEN_NETDEV_FRONTEND=m -CONFIG_XEN_NETDEV_BACKEND=m -CONFIG_VMXNET3=m -CONFIG_FUJITSU_ES=m -CONFIG_USB4_NET=m -CONFIG_HYPERV_NET=m -CONFIG_NETDEVSIM=m -CONFIG_NET_FAILOVER=m -CONFIG_ISDN=y -CONFIG_ISDN_CAPI=y -CONFIG_CAPI_TRACE=y -CONFIG_ISDN_CAPI_MIDDLEWARE=y -CONFIG_MISDN=m -CONFIG_MISDN_DSP=m -CONFIG_MISDN_L1OIP=m - -# -# mISDN hardware drivers -# 
-CONFIG_MISDN_HFCPCI=m -CONFIG_MISDN_HFCMULTI=m -CONFIG_MISDN_HFCUSB=m -CONFIG_MISDN_AVMFRITZ=m -CONFIG_MISDN_SPEEDFAX=m -CONFIG_MISDN_INFINEON=m -CONFIG_MISDN_W6692=m -CONFIG_MISDN_NETJET=m -CONFIG_MISDN_HDLC=m -CONFIG_MISDN_IPAC=m -CONFIG_MISDN_ISAR=m -CONFIG_NVM=y -CONFIG_NVM_PBLK=m -# CONFIG_NVM_PBLK_DEBUG is not set - -# -# Input device support -# -CONFIG_INPUT=y -CONFIG_INPUT_LEDS=m -CONFIG_INPUT_FF_MEMLESS=m -CONFIG_INPUT_POLLDEV=m -CONFIG_INPUT_SPARSEKMAP=m -CONFIG_INPUT_MATRIXKMAP=m - -# -# Userland interfaces -# -CONFIG_INPUT_MOUSEDEV=m -CONFIG_INPUT_MOUSEDEV_PSAUX=y -CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024 -CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768 -CONFIG_INPUT_JOYDEV=m -CONFIG_INPUT_EVDEV=m -# CONFIG_INPUT_EVBUG is not set - -# -# Input Device Drivers -# -CONFIG_INPUT_KEYBOARD=y -CONFIG_KEYBOARD_ADC=m -CONFIG_KEYBOARD_ADP5520=m -CONFIG_KEYBOARD_ADP5588=m -CONFIG_KEYBOARD_ADP5589=m -CONFIG_KEYBOARD_APPLESPI=m -CONFIG_KEYBOARD_ATKBD=m -CONFIG_KEYBOARD_QT1050=m -CONFIG_KEYBOARD_QT1070=m -CONFIG_KEYBOARD_QT2160=m -CONFIG_KEYBOARD_DLINK_DIR685=m -CONFIG_KEYBOARD_LKKBD=m -CONFIG_KEYBOARD_GPIO=m -CONFIG_KEYBOARD_GPIO_POLLED=m -CONFIG_KEYBOARD_TCA6416=m -CONFIG_KEYBOARD_TCA8418=m -CONFIG_KEYBOARD_MATRIX=m -CONFIG_KEYBOARD_LM8323=m -CONFIG_KEYBOARD_LM8333=m -CONFIG_KEYBOARD_MAX7359=m -CONFIG_KEYBOARD_MCS=m -CONFIG_KEYBOARD_MPR121=m -CONFIG_KEYBOARD_NEWTON=m -CONFIG_KEYBOARD_OPENCORES=m -CONFIG_KEYBOARD_SAMSUNG=m -CONFIG_KEYBOARD_STOWAWAY=m -CONFIG_KEYBOARD_SUNKBD=m -CONFIG_KEYBOARD_STMPE=m -CONFIG_KEYBOARD_IQS62X=m -CONFIG_KEYBOARD_OMAP4=m -CONFIG_KEYBOARD_TC3589X=m -CONFIG_KEYBOARD_TM2_TOUCHKEY=m -CONFIG_KEYBOARD_TWL4030=m -CONFIG_KEYBOARD_XTKBD=m -CONFIG_KEYBOARD_CROS_EC=m -CONFIG_KEYBOARD_CAP11XX=m -CONFIG_KEYBOARD_BCM=m -CONFIG_KEYBOARD_MTK_PMIC=m -CONFIG_INPUT_MOUSE=y -CONFIG_MOUSE_PS2=m -CONFIG_MOUSE_PS2_ALPS=y -CONFIG_MOUSE_PS2_BYD=y -CONFIG_MOUSE_PS2_LOGIPS2PP=y -CONFIG_MOUSE_PS2_SYNAPTICS=y -CONFIG_MOUSE_PS2_SYNAPTICS_SMBUS=y -CONFIG_MOUSE_PS2_CYPRESS=y 
-CONFIG_MOUSE_PS2_LIFEBOOK=y -CONFIG_MOUSE_PS2_TRACKPOINT=y -CONFIG_MOUSE_PS2_ELANTECH=y -CONFIG_MOUSE_PS2_ELANTECH_SMBUS=y -CONFIG_MOUSE_PS2_SENTELIC=y -CONFIG_MOUSE_PS2_TOUCHKIT=y -CONFIG_MOUSE_PS2_FOCALTECH=y -CONFIG_MOUSE_PS2_VMMOUSE=y -CONFIG_MOUSE_PS2_SMBUS=y -CONFIG_MOUSE_SERIAL=m -CONFIG_MOUSE_APPLETOUCH=m -CONFIG_MOUSE_BCM5974=m -CONFIG_MOUSE_CYAPA=m -CONFIG_MOUSE_ELAN_I2C=m -CONFIG_MOUSE_ELAN_I2C_I2C=y -CONFIG_MOUSE_ELAN_I2C_SMBUS=y -CONFIG_MOUSE_VSXXXAA=m -CONFIG_MOUSE_GPIO=m -CONFIG_MOUSE_SYNAPTICS_I2C=m -CONFIG_MOUSE_SYNAPTICS_USB=m -CONFIG_INPUT_JOYSTICK=y -CONFIG_JOYSTICK_ANALOG=m -CONFIG_JOYSTICK_A3D=m -CONFIG_JOYSTICK_ADI=m -CONFIG_JOYSTICK_COBRA=m -CONFIG_JOYSTICK_GF2K=m -CONFIG_JOYSTICK_GRIP=m -CONFIG_JOYSTICK_GRIP_MP=m -CONFIG_JOYSTICK_GUILLEMOT=m -CONFIG_JOYSTICK_INTERACT=m -CONFIG_JOYSTICK_SIDEWINDER=m -CONFIG_JOYSTICK_TMDC=m -CONFIG_JOYSTICK_IFORCE=m -CONFIG_JOYSTICK_IFORCE_USB=m -CONFIG_JOYSTICK_IFORCE_232=m -CONFIG_JOYSTICK_WARRIOR=m -CONFIG_JOYSTICK_MAGELLAN=m -CONFIG_JOYSTICK_SPACEORB=m -CONFIG_JOYSTICK_SPACEBALL=m -CONFIG_JOYSTICK_STINGER=m -CONFIG_JOYSTICK_TWIDJOY=m -CONFIG_JOYSTICK_ZHENHUA=m -CONFIG_JOYSTICK_DB9=m -CONFIG_JOYSTICK_GAMECON=m -CONFIG_JOYSTICK_TURBOGRAFX=m -CONFIG_JOYSTICK_AS5011=m -CONFIG_JOYSTICK_JOYDUMP=m -CONFIG_JOYSTICK_XPAD=m -CONFIG_JOYSTICK_XPAD_FF=y -CONFIG_JOYSTICK_XPAD_LEDS=y -CONFIG_JOYSTICK_WALKERA0701=m -CONFIG_JOYSTICK_PSXPAD_SPI=m -CONFIG_JOYSTICK_PSXPAD_SPI_FF=y -CONFIG_JOYSTICK_PXRC=m -CONFIG_JOYSTICK_FSIA6B=m -CONFIG_INPUT_TABLET=y -CONFIG_TABLET_USB_ACECAD=m -CONFIG_TABLET_USB_AIPTEK=m -CONFIG_TABLET_USB_GTCO=m -CONFIG_TABLET_USB_HANWANG=m -CONFIG_TABLET_USB_KBTAB=m -CONFIG_TABLET_USB_PEGASUS=m -CONFIG_TABLET_SERIAL_WACOM4=m -CONFIG_INPUT_TOUCHSCREEN=y -CONFIG_TOUCHSCREEN_PROPERTIES=y -CONFIG_TOUCHSCREEN_88PM860X=m -CONFIG_TOUCHSCREEN_ADS7846=m -CONFIG_TOUCHSCREEN_AD7877=m -CONFIG_TOUCHSCREEN_AD7879=m -CONFIG_TOUCHSCREEN_AD7879_I2C=m -CONFIG_TOUCHSCREEN_AD7879_SPI=m -CONFIG_TOUCHSCREEN_ADC=m 
-CONFIG_TOUCHSCREEN_AR1021_I2C=m -CONFIG_TOUCHSCREEN_ATMEL_MXT=m -CONFIG_TOUCHSCREEN_ATMEL_MXT_T37=y -CONFIG_TOUCHSCREEN_AUO_PIXCIR=m -CONFIG_TOUCHSCREEN_BU21013=m -CONFIG_TOUCHSCREEN_BU21029=m -CONFIG_TOUCHSCREEN_CHIPONE_ICN8318=m -CONFIG_TOUCHSCREEN_CHIPONE_ICN8505=m -CONFIG_TOUCHSCREEN_CY8CTMG110=m -CONFIG_TOUCHSCREEN_CYTTSP_CORE=m -CONFIG_TOUCHSCREEN_CYTTSP_I2C=m -CONFIG_TOUCHSCREEN_CYTTSP_SPI=m -CONFIG_TOUCHSCREEN_CYTTSP4_CORE=m -CONFIG_TOUCHSCREEN_CYTTSP4_I2C=m -CONFIG_TOUCHSCREEN_CYTTSP4_SPI=m -CONFIG_TOUCHSCREEN_DA9034=m -CONFIG_TOUCHSCREEN_DA9052=m -CONFIG_TOUCHSCREEN_DYNAPRO=m -CONFIG_TOUCHSCREEN_HAMPSHIRE=m -CONFIG_TOUCHSCREEN_EETI=m -CONFIG_TOUCHSCREEN_EGALAX=m -CONFIG_TOUCHSCREEN_EGALAX_SERIAL=m -CONFIG_TOUCHSCREEN_EXC3000=m -CONFIG_TOUCHSCREEN_FUJITSU=m -CONFIG_TOUCHSCREEN_GOODIX=m -CONFIG_TOUCHSCREEN_HIDEEP=m -CONFIG_TOUCHSCREEN_ILI210X=m -CONFIG_TOUCHSCREEN_S6SY761=m -CONFIG_TOUCHSCREEN_GUNZE=m -CONFIG_TOUCHSCREEN_EKTF2127=m -CONFIG_TOUCHSCREEN_ELAN=m -CONFIG_TOUCHSCREEN_ELO=m -CONFIG_TOUCHSCREEN_WACOM_W8001=m -CONFIG_TOUCHSCREEN_WACOM_I2C=m -CONFIG_TOUCHSCREEN_MAX11801=m -CONFIG_TOUCHSCREEN_MCS5000=m -CONFIG_TOUCHSCREEN_MMS114=m -CONFIG_TOUCHSCREEN_MELFAS_MIP4=m -CONFIG_TOUCHSCREEN_MTOUCH=m -CONFIG_TOUCHSCREEN_IMX6UL_TSC=m -CONFIG_TOUCHSCREEN_INEXIO=m -CONFIG_TOUCHSCREEN_MK712=m -CONFIG_TOUCHSCREEN_PENMOUNT=m -CONFIG_TOUCHSCREEN_EDT_FT5X06=m -CONFIG_TOUCHSCREEN_TOUCHRIGHT=m -CONFIG_TOUCHSCREEN_TOUCHWIN=m -CONFIG_TOUCHSCREEN_TI_AM335X_TSC=m -CONFIG_TOUCHSCREEN_UCB1400=m -CONFIG_TOUCHSCREEN_PIXCIR=m -CONFIG_TOUCHSCREEN_WDT87XX_I2C=m -CONFIG_TOUCHSCREEN_WM831X=m -CONFIG_TOUCHSCREEN_WM97XX=m -CONFIG_TOUCHSCREEN_WM9705=y -CONFIG_TOUCHSCREEN_WM9712=y -CONFIG_TOUCHSCREEN_WM9713=y -CONFIG_TOUCHSCREEN_USB_COMPOSITE=m -CONFIG_TOUCHSCREEN_MC13783=m -CONFIG_TOUCHSCREEN_USB_EGALAX=y -CONFIG_TOUCHSCREEN_USB_PANJIT=y -CONFIG_TOUCHSCREEN_USB_3M=y -CONFIG_TOUCHSCREEN_USB_ITM=y -CONFIG_TOUCHSCREEN_USB_ETURBO=y -CONFIG_TOUCHSCREEN_USB_GUNZE=y 
-CONFIG_TOUCHSCREEN_USB_DMC_TSC10=y -CONFIG_TOUCHSCREEN_USB_IRTOUCH=y -CONFIG_TOUCHSCREEN_USB_IDEALTEK=y -CONFIG_TOUCHSCREEN_USB_GENERAL_TOUCH=y -CONFIG_TOUCHSCREEN_USB_GOTOP=y -CONFIG_TOUCHSCREEN_USB_JASTEC=y -CONFIG_TOUCHSCREEN_USB_ELO=y -CONFIG_TOUCHSCREEN_USB_E2I=y -CONFIG_TOUCHSCREEN_USB_ZYTRONIC=y -CONFIG_TOUCHSCREEN_USB_ETT_TC45USB=y -CONFIG_TOUCHSCREEN_USB_NEXIO=y -CONFIG_TOUCHSCREEN_USB_EASYTOUCH=y -CONFIG_TOUCHSCREEN_TOUCHIT213=m -CONFIG_TOUCHSCREEN_TSC_SERIO=m -CONFIG_TOUCHSCREEN_TSC200X_CORE=m -CONFIG_TOUCHSCREEN_TSC2004=m -CONFIG_TOUCHSCREEN_TSC2005=m -CONFIG_TOUCHSCREEN_TSC2007=m -CONFIG_TOUCHSCREEN_TSC2007_IIO=y -CONFIG_TOUCHSCREEN_PCAP=m -CONFIG_TOUCHSCREEN_RM_TS=m -CONFIG_TOUCHSCREEN_SILEAD=m -CONFIG_TOUCHSCREEN_SIS_I2C=m -CONFIG_TOUCHSCREEN_ST1232=m -CONFIG_TOUCHSCREEN_STMFTS=m -CONFIG_TOUCHSCREEN_STMPE=m -CONFIG_TOUCHSCREEN_SUR40=m -CONFIG_TOUCHSCREEN_SURFACE3_SPI=m -CONFIG_TOUCHSCREEN_SX8654=m -CONFIG_TOUCHSCREEN_TPS6507X=m -CONFIG_TOUCHSCREEN_ZET6223=m -CONFIG_TOUCHSCREEN_ZFORCE=m -CONFIG_TOUCHSCREEN_COLIBRI_VF50=m -CONFIG_TOUCHSCREEN_ROHM_BU21023=m -CONFIG_TOUCHSCREEN_IQS5XX=m -CONFIG_INPUT_MISC=y -CONFIG_INPUT_88PM860X_ONKEY=m -CONFIG_INPUT_88PM80X_ONKEY=m -CONFIG_INPUT_AD714X=m -CONFIG_INPUT_AD714X_I2C=m -CONFIG_INPUT_AD714X_SPI=m -CONFIG_INPUT_ARIZONA_HAPTICS=m -CONFIG_INPUT_ATMEL_CAPTOUCH=m -CONFIG_INPUT_BMA150=m -CONFIG_INPUT_E3X0_BUTTON=m -CONFIG_INPUT_MSM_VIBRATOR=m -CONFIG_INPUT_PCSPKR=m -CONFIG_INPUT_MAX77650_ONKEY=m -CONFIG_INPUT_MAX77693_HAPTIC=m -CONFIG_INPUT_MAX8925_ONKEY=m -CONFIG_INPUT_MAX8997_HAPTIC=m -CONFIG_INPUT_MC13783_PWRBUTTON=m -CONFIG_INPUT_MMA8450=m -CONFIG_INPUT_APANEL=m -CONFIG_INPUT_GP2A=m -CONFIG_INPUT_GPIO_BEEPER=m -CONFIG_INPUT_GPIO_DECODER=m -CONFIG_INPUT_GPIO_VIBRA=m -CONFIG_INPUT_CPCAP_PWRBUTTON=m -CONFIG_INPUT_ATLAS_BTNS=m -CONFIG_INPUT_ATI_REMOTE2=m -CONFIG_INPUT_KEYSPAN_REMOTE=m -CONFIG_INPUT_KXTJ9=m -CONFIG_INPUT_POWERMATE=m -CONFIG_INPUT_YEALINK=m -CONFIG_INPUT_CM109=m -CONFIG_INPUT_REGULATOR_HAPTIC=m 
-CONFIG_INPUT_RETU_PWRBUTTON=m -CONFIG_INPUT_TPS65218_PWRBUTTON=m -CONFIG_INPUT_AXP20X_PEK=m -CONFIG_INPUT_TWL4030_PWRBUTTON=m -CONFIG_INPUT_TWL4030_VIBRA=m -CONFIG_INPUT_TWL6040_VIBRA=m -CONFIG_INPUT_UINPUT=m -CONFIG_INPUT_PALMAS_PWRBUTTON=m -CONFIG_INPUT_PCF50633_PMU=m -CONFIG_INPUT_PCF8574=m -CONFIG_INPUT_PWM_BEEPER=m -CONFIG_INPUT_PWM_VIBRA=m -CONFIG_INPUT_RK805_PWRKEY=m -CONFIG_INPUT_GPIO_ROTARY_ENCODER=m -CONFIG_INPUT_DA9052_ONKEY=m -CONFIG_INPUT_DA9055_ONKEY=m -CONFIG_INPUT_DA9063_ONKEY=m -CONFIG_INPUT_WM831X_ON=m -CONFIG_INPUT_PCAP=m -CONFIG_INPUT_ADXL34X=m -CONFIG_INPUT_ADXL34X_I2C=m -CONFIG_INPUT_ADXL34X_SPI=m -CONFIG_INPUT_IMS_PCU=m -CONFIG_INPUT_CMA3000=m -CONFIG_INPUT_CMA3000_I2C=m -CONFIG_INPUT_XEN_KBDDEV_FRONTEND=m -CONFIG_INPUT_IDEAPAD_SLIDEBAR=m -CONFIG_INPUT_SOC_BUTTON_ARRAY=m -CONFIG_INPUT_DRV260X_HAPTICS=m -CONFIG_INPUT_DRV2665_HAPTICS=m -CONFIG_INPUT_DRV2667_HAPTICS=m -CONFIG_INPUT_RAVE_SP_PWRBUTTON=m -CONFIG_INPUT_STPMIC1_ONKEY=m -CONFIG_RMI4_CORE=m -CONFIG_RMI4_I2C=m -CONFIG_RMI4_SPI=m -CONFIG_RMI4_SMB=m -CONFIG_RMI4_F03=y -CONFIG_RMI4_F03_SERIO=m -CONFIG_RMI4_2D_SENSOR=y -CONFIG_RMI4_F11=y -CONFIG_RMI4_F12=y -CONFIG_RMI4_F30=y -CONFIG_RMI4_F34=y -# CONFIG_RMI4_F54 is not set -CONFIG_RMI4_F55=y - -# -# Hardware I/O ports -# -CONFIG_SERIO=m -CONFIG_ARCH_MIGHT_HAVE_PC_SERIO=y -CONFIG_SERIO_I8042=m -CONFIG_SERIO_SERPORT=m -CONFIG_SERIO_CT82C710=m -CONFIG_SERIO_PARKBD=m -CONFIG_SERIO_PCIPS2=m -CONFIG_SERIO_LIBPS2=m -CONFIG_SERIO_RAW=m -CONFIG_SERIO_ALTERA_PS2=m -CONFIG_SERIO_PS2MULT=m -CONFIG_SERIO_ARC_PS2=m -# CONFIG_SERIO_APBPS2 is not set -CONFIG_HYPERV_KEYBOARD=m -CONFIG_SERIO_GPIO_PS2=m -CONFIG_USERIO=m -CONFIG_GAMEPORT=m -CONFIG_GAMEPORT_NS558=m -CONFIG_GAMEPORT_L4=m -CONFIG_GAMEPORT_EMU10K1=m -CONFIG_GAMEPORT_FM801=m -# end of Hardware I/O ports -# end of Input device support - -# -# Character devices -# -CONFIG_TTY=y -CONFIG_VT=y -CONFIG_CONSOLE_TRANSLATIONS=y -CONFIG_VT_CONSOLE=y -CONFIG_VT_CONSOLE_SLEEP=y -CONFIG_HW_CONSOLE=y 
-CONFIG_VT_HW_CONSOLE_BINDING=y -CONFIG_UNIX98_PTYS=y -# CONFIG_LEGACY_PTYS is not set -CONFIG_LDISC_AUTOLOAD=y - -# -# Serial drivers -# -CONFIG_SERIAL_EARLYCON=y -CONFIG_SERIAL_8250=y -# CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set -CONFIG_SERIAL_8250_PNP=y -# CONFIG_SERIAL_8250_16550A_VARIANTS is not set -CONFIG_SERIAL_8250_FINTEK=y -CONFIG_SERIAL_8250_CONSOLE=y -CONFIG_SERIAL_8250_DMA=y -CONFIG_SERIAL_8250_PCI=y -CONFIG_SERIAL_8250_EXAR=m -CONFIG_SERIAL_8250_CS=m -CONFIG_SERIAL_8250_MEN_MCB=m -CONFIG_SERIAL_8250_NR_UARTS=32 -CONFIG_SERIAL_8250_RUNTIME_UARTS=4 -CONFIG_SERIAL_8250_EXTENDED=y -CONFIG_SERIAL_8250_MANY_PORTS=y -CONFIG_SERIAL_8250_ASPEED_VUART=m -CONFIG_SERIAL_8250_SHARE_IRQ=y -# CONFIG_SERIAL_8250_DETECT_IRQ is not set -CONFIG_SERIAL_8250_RSA=y -CONFIG_SERIAL_8250_DWLIB=y -CONFIG_SERIAL_8250_DW=m -CONFIG_SERIAL_8250_RT288X=y -CONFIG_SERIAL_8250_LPSS=y -CONFIG_SERIAL_8250_MID=y -CONFIG_SERIAL_OF_PLATFORM=m - -# -# Non-8250 serial port support -# -CONFIG_SERIAL_MAX3100=m -CONFIG_SERIAL_MAX310X=m -CONFIG_SERIAL_UARTLITE=m -CONFIG_SERIAL_UARTLITE_NR_UARTS=1 -CONFIG_SERIAL_CORE=y -CONFIG_SERIAL_CORE_CONSOLE=y -CONFIG_SERIAL_JSM=m -CONFIG_SERIAL_SIFIVE=m -CONFIG_SERIAL_SCCNXP=m -CONFIG_SERIAL_SC16IS7XX_CORE=m -CONFIG_SERIAL_SC16IS7XX=m -CONFIG_SERIAL_SC16IS7XX_I2C=y -CONFIG_SERIAL_SC16IS7XX_SPI=y -CONFIG_SERIAL_ALTERA_JTAGUART=m -CONFIG_SERIAL_ALTERA_UART=m -CONFIG_SERIAL_ALTERA_UART_MAXPORTS=4 -CONFIG_SERIAL_ALTERA_UART_BAUDRATE=115200 -CONFIG_SERIAL_IFX6X60=m -CONFIG_SERIAL_XILINX_PS_UART=m -CONFIG_SERIAL_ARC=m -CONFIG_SERIAL_ARC_NR_PORTS=1 -CONFIG_SERIAL_RP2=m -CONFIG_SERIAL_RP2_NR_UARTS=32 -CONFIG_SERIAL_FSL_LPUART=m -CONFIG_SERIAL_FSL_LINFLEXUART=m -CONFIG_SERIAL_CONEXANT_DIGICOLOR=m -CONFIG_SERIAL_MEN_Z135=m -CONFIG_SERIAL_SPRD=m -# end of Serial drivers - -CONFIG_SERIAL_MCTRL_GPIO=y -CONFIG_SERIAL_NONSTANDARD=y -CONFIG_ROCKETPORT=m -CONFIG_CYCLADES=m -CONFIG_CYZ_INTR=y -CONFIG_MOXA_INTELLIO=m -CONFIG_MOXA_SMARTIO=m -CONFIG_SYNCLINK=m 
-CONFIG_SYNCLINKMP=m -CONFIG_SYNCLINK_GT=m -CONFIG_ISI=m -CONFIG_N_HDLC=m -CONFIG_N_GSM=m -CONFIG_NOZOMI=m -CONFIG_NULL_TTY=m -CONFIG_TRACE_ROUTER=m -CONFIG_TRACE_SINK=m -CONFIG_HVC_DRIVER=y -CONFIG_HVC_IRQ=y -CONFIG_HVC_XEN=y -CONFIG_HVC_XEN_FRONTEND=y -CONFIG_SERIAL_DEV_BUS=y -CONFIG_SERIAL_DEV_CTRL_TTYPORT=y -# CONFIG_TTY_PRINTK is not set -CONFIG_PRINTER=m -# CONFIG_LP_CONSOLE is not set -CONFIG_PPDEV=m -CONFIG_VIRTIO_CONSOLE=m -CONFIG_IPMI_HANDLER=m -CONFIG_IPMI_DMI_DECODE=y -CONFIG_IPMI_PLAT_DATA=y -# CONFIG_IPMI_PANIC_EVENT is not set -CONFIG_IPMI_DEVICE_INTERFACE=m -CONFIG_IPMI_SI=m -CONFIG_IPMI_SSIF=m -CONFIG_IPMI_WATCHDOG=m -CONFIG_IPMI_POWEROFF=m -CONFIG_IPMB_DEVICE_INTERFACE=m -CONFIG_HW_RANDOM=m -CONFIG_HW_RANDOM_TIMERIOMEM=m -CONFIG_HW_RANDOM_INTEL=m -CONFIG_HW_RANDOM_AMD=m -CONFIG_HW_RANDOM_VIA=m -CONFIG_HW_RANDOM_VIRTIO=m -CONFIG_APPLICOM=m - -# -# PCMCIA character devices -# -CONFIG_SYNCLINK_CS=m -CONFIG_CARDMAN_4000=m -CONFIG_CARDMAN_4040=m -CONFIG_SCR24X=m -CONFIG_IPWIRELESS=m -# end of PCMCIA character devices - -CONFIG_MWAVE=m -# CONFIG_DEVMEM is not set -# CONFIG_DEVKMEM is not set -CONFIG_NVRAM=m -CONFIG_RAW_DRIVER=m -CONFIG_MAX_RAW_DEVS=256 -# CONFIG_DEVPORT is not set -CONFIG_HPET=y -CONFIG_HPET_MMAP=y -CONFIG_HPET_MMAP_DEFAULT=y -CONFIG_HANGCHECK_TIMER=m -CONFIG_TCG_TPM=m -CONFIG_HW_RANDOM_TPM=y -CONFIG_TCG_TIS_CORE=m -CONFIG_TCG_TIS=m -CONFIG_TCG_TIS_SPI=m -CONFIG_TCG_TIS_SPI_CR50=y -CONFIG_TCG_TIS_I2C_ATMEL=m -CONFIG_TCG_TIS_I2C_INFINEON=m -CONFIG_TCG_TIS_I2C_NUVOTON=m -CONFIG_TCG_NSC=m -CONFIG_TCG_ATMEL=m -CONFIG_TCG_INFINEON=m -CONFIG_TCG_XEN=m -CONFIG_TCG_CRB=m -CONFIG_TCG_VTPM_PROXY=m -CONFIG_TCG_TIS_ST33ZP24=m -CONFIG_TCG_TIS_ST33ZP24_I2C=m -CONFIG_TCG_TIS_ST33ZP24_SPI=m -CONFIG_TELCLOCK=m -CONFIG_XILLYBUS=m -CONFIG_XILLYBUS_PCIE=m -CONFIG_XILLYBUS_OF=m -# end of Character devices - -# CONFIG_RANDOM_TRUST_CPU is not set -# CONFIG_RANDOM_TRUST_BOOTLOADER is not set - -# -# I2C support -# -CONFIG_I2C=y -CONFIG_ACPI_I2C_OPREGION=y 
-CONFIG_I2C_BOARDINFO=y -CONFIG_I2C_COMPAT=y -CONFIG_I2C_CHARDEV=m -CONFIG_I2C_MUX=m - -# -# Multiplexer I2C Chip support -# -CONFIG_I2C_ARB_GPIO_CHALLENGE=m -CONFIG_I2C_MUX_GPIO=m -CONFIG_I2C_MUX_GPMUX=m -CONFIG_I2C_MUX_LTC4306=m -CONFIG_I2C_MUX_PCA9541=m -CONFIG_I2C_MUX_PCA954x=m -CONFIG_I2C_MUX_PINCTRL=m -CONFIG_I2C_MUX_REG=m -CONFIG_I2C_DEMUX_PINCTRL=m -CONFIG_I2C_MUX_MLXCPLD=m -# end of Multiplexer I2C Chip support - -CONFIG_I2C_HELPER_AUTO=y -CONFIG_I2C_SMBUS=m -CONFIG_I2C_ALGOBIT=m -CONFIG_I2C_ALGOPCA=m - -# -# I2C Hardware Bus support -# - -# -# PC SMBus host controller drivers -# -CONFIG_I2C_ALI1535=m -CONFIG_I2C_ALI1563=m -CONFIG_I2C_ALI15X3=m -CONFIG_I2C_AMD756=m -CONFIG_I2C_AMD756_S4882=m -CONFIG_I2C_AMD8111=m -CONFIG_I2C_AMD_MP2=m -CONFIG_I2C_I801=m -CONFIG_I2C_ISCH=m -CONFIG_I2C_ISMT=m -CONFIG_I2C_PIIX4=m -CONFIG_I2C_CHT_WC=m -CONFIG_I2C_NFORCE2=m -CONFIG_I2C_NFORCE2_S4985=m -CONFIG_I2C_NVIDIA_GPU=m -CONFIG_I2C_SIS5595=m -CONFIG_I2C_SIS630=m -CONFIG_I2C_SIS96X=m -CONFIG_I2C_VIA=m -CONFIG_I2C_VIAPRO=m - -# -# ACPI drivers -# -CONFIG_I2C_SCMI=m - -# -# I2C system bus drivers (mostly embedded / system-on-chip) -# -CONFIG_I2C_CBUS_GPIO=m -CONFIG_I2C_DESIGNWARE_CORE=y -CONFIG_I2C_DESIGNWARE_PLATFORM=y -CONFIG_I2C_DESIGNWARE_SLAVE=y -CONFIG_I2C_DESIGNWARE_PCI=m -CONFIG_I2C_DESIGNWARE_BAYTRAIL=y -CONFIG_I2C_EMEV2=m -CONFIG_I2C_GPIO=m -# CONFIG_I2C_GPIO_FAULT_INJECTOR is not set -CONFIG_I2C_KEMPLD=m -CONFIG_I2C_OCORES=m -CONFIG_I2C_PCA_PLATFORM=m -CONFIG_I2C_RK3X=m -CONFIG_I2C_SIMTEC=m -CONFIG_I2C_XILINX=m - -# -# External I2C/SMBus adapter drivers -# -CONFIG_I2C_DIOLAN_U2C=m -CONFIG_I2C_DLN2=m -CONFIG_I2C_PARPORT=m -CONFIG_I2C_ROBOTFUZZ_OSIF=m -CONFIG_I2C_TAOS_EVM=m -CONFIG_I2C_TINY_USB=m -CONFIG_I2C_VIPERBOARD=m - -# -# Other I2C/SMBus bus drivers -# -CONFIG_I2C_MLXCPLD=m -CONFIG_I2C_CROS_EC_TUNNEL=m -CONFIG_I2C_FSI=m -# end of I2C Hardware Bus support - -CONFIG_I2C_STUB=m -CONFIG_I2C_SLAVE=y -CONFIG_I2C_SLAVE_EEPROM=m -# CONFIG_I2C_DEBUG_CORE is not set -# 
CONFIG_I2C_DEBUG_ALGO is not set -# CONFIG_I2C_DEBUG_BUS is not set -# end of I2C support - -CONFIG_I3C=m -CONFIG_CDNS_I3C_MASTER=m -CONFIG_DW_I3C_MASTER=m -CONFIG_SPI=y -# CONFIG_SPI_DEBUG is not set -CONFIG_SPI_MASTER=y -CONFIG_SPI_MEM=y - -# -# SPI Master Controller Drivers -# -CONFIG_SPI_ALTERA=m -CONFIG_SPI_AXI_SPI_ENGINE=m -CONFIG_SPI_BITBANG=m -CONFIG_SPI_BUTTERFLY=m -CONFIG_SPI_CADENCE=m -CONFIG_SPI_DESIGNWARE=m -CONFIG_SPI_DW_PCI=m -CONFIG_SPI_DW_MID_DMA=y -CONFIG_SPI_DW_MMIO=m -CONFIG_SPI_DLN2=m -CONFIG_SPI_FSI=m -CONFIG_SPI_NXP_FLEXSPI=m -CONFIG_SPI_GPIO=m -CONFIG_SPI_LM70_LLP=m -CONFIG_SPI_FSL_LIB=m -CONFIG_SPI_FSL_SPI=m -CONFIG_SPI_OC_TINY=m -CONFIG_SPI_PXA2XX=m -CONFIG_SPI_PXA2XX_PCI=m -CONFIG_SPI_ROCKCHIP=m -CONFIG_SPI_SC18IS602=m -CONFIG_SPI_SIFIVE=m -CONFIG_SPI_MXIC=m -CONFIG_SPI_XCOMM=m -CONFIG_SPI_XILINX=m -CONFIG_SPI_ZYNQMP_GQSPI=m - -# -# SPI Multiplexer support -# -CONFIG_SPI_MUX=m - -# -# SPI Protocol Masters -# -CONFIG_SPI_SPIDEV=m -CONFIG_SPI_LOOPBACK_TEST=m -CONFIG_SPI_TLE62X0=m -CONFIG_SPI_SLAVE=y -CONFIG_SPI_SLAVE_TIME=m -CONFIG_SPI_SLAVE_SYSTEM_CONTROL=m -CONFIG_SPMI=m -CONFIG_HSI=m -CONFIG_HSI_BOARDINFO=y - -# -# HSI controllers -# - -# -# HSI clients -# -CONFIG_HSI_CHAR=m -CONFIG_PPS=y -# CONFIG_PPS_DEBUG is not set - -# -# PPS clients support -# -CONFIG_PPS_CLIENT_KTIMER=m -CONFIG_PPS_CLIENT_LDISC=m -CONFIG_PPS_CLIENT_PARPORT=m -CONFIG_PPS_CLIENT_GPIO=m - -# -# PPS generators support -# - -# -# PTP clock support -# -CONFIG_PTP_1588_CLOCK=y -CONFIG_DP83640_PHY=m -CONFIG_PTP_1588_CLOCK_INES=m -CONFIG_PTP_1588_CLOCK_KVM=m -CONFIG_PTP_1588_CLOCK_IDT82P33=m -CONFIG_PTP_1588_CLOCK_IDTCM=m -CONFIG_PTP_1588_CLOCK_VMW=m -# end of PTP clock support - -CONFIG_PINCTRL=y -CONFIG_GENERIC_PINCTRL_GROUPS=y -CONFIG_PINMUX=y -CONFIG_GENERIC_PINMUX_FUNCTIONS=y -CONFIG_PINCONF=y -CONFIG_GENERIC_PINCONF=y -# CONFIG_DEBUG_PINCTRL is not set -CONFIG_PINCTRL_AS3722=m -CONFIG_PINCTRL_AXP209=m -CONFIG_PINCTRL_AMD=m -CONFIG_PINCTRL_DA9062=m 
-CONFIG_PINCTRL_MCP23S08=m -CONFIG_PINCTRL_SINGLE=m -CONFIG_PINCTRL_SX150X=y -CONFIG_PINCTRL_STMFX=m -CONFIG_PINCTRL_MAX77620=m -CONFIG_PINCTRL_PALMAS=m -CONFIG_PINCTRL_RK805=m -CONFIG_PINCTRL_OCELOT=y -CONFIG_PINCTRL_BAYTRAIL=y -CONFIG_PINCTRL_CHERRYVIEW=y -CONFIG_PINCTRL_LYNXPOINT=y -CONFIG_PINCTRL_INTEL=y -CONFIG_PINCTRL_BROXTON=y -CONFIG_PINCTRL_CANNONLAKE=y -CONFIG_PINCTRL_CEDARFORK=y -CONFIG_PINCTRL_DENVERTON=y -CONFIG_PINCTRL_GEMINILAKE=y -CONFIG_PINCTRL_ICELAKE=y -CONFIG_PINCTRL_LEWISBURG=y -CONFIG_PINCTRL_SUNRISEPOINT=y -CONFIG_PINCTRL_TIGERLAKE=y -CONFIG_PINCTRL_LOCHNAGAR=m -CONFIG_PINCTRL_MADERA=m -CONFIG_PINCTRL_CS47L15=y -CONFIG_PINCTRL_CS47L35=y -CONFIG_PINCTRL_CS47L85=y -CONFIG_PINCTRL_CS47L90=y -CONFIG_PINCTRL_CS47L92=y -CONFIG_PINCTRL_EQUILIBRIUM=m -CONFIG_GPIOLIB=y -CONFIG_GPIOLIB_FASTPATH_LIMIT=512 -CONFIG_OF_GPIO=y -CONFIG_GPIO_ACPI=y -CONFIG_GPIOLIB_IRQCHIP=y -# CONFIG_DEBUG_GPIO is not set -CONFIG_GPIO_SYSFS=y -CONFIG_GPIO_GENERIC=y -CONFIG_GPIO_MAX730X=m - -# -# Memory mapped GPIO drivers -# -CONFIG_GPIO_74XX_MMIO=m -CONFIG_GPIO_ALTERA=m -CONFIG_GPIO_AMDPT=m -CONFIG_GPIO_CADENCE=m -CONFIG_GPIO_DWAPB=m -CONFIG_GPIO_EXAR=m -CONFIG_GPIO_FTGPIO010=y -CONFIG_GPIO_GENERIC_PLATFORM=m -CONFIG_GPIO_GRGPIO=m -CONFIG_GPIO_HLWD=m -CONFIG_GPIO_ICH=m -CONFIG_GPIO_LOGICVC=m -CONFIG_GPIO_MB86S7X=m -CONFIG_GPIO_MENZ127=m -CONFIG_GPIO_SAMA5D2_PIOBU=m -CONFIG_GPIO_SIFIVE=y -CONFIG_GPIO_SIOX=m -CONFIG_GPIO_SYSCON=m -CONFIG_GPIO_VX855=m -CONFIG_GPIO_WCD934X=m -CONFIG_GPIO_XILINX=m -CONFIG_GPIO_AMD_FCH=m -# end of Memory mapped GPIO drivers - -# -# Port-mapped I/O GPIO drivers -# -CONFIG_GPIO_F7188X=m -CONFIG_GPIO_IT87=m -CONFIG_GPIO_SCH=m -CONFIG_GPIO_SCH311X=m -CONFIG_GPIO_WINBOND=m -CONFIG_GPIO_WS16C48=m -# end of Port-mapped I/O GPIO drivers - -# -# I2C GPIO expanders -# -CONFIG_GPIO_ADP5588=m -CONFIG_GPIO_ADNP=m -CONFIG_GPIO_GW_PLD=m -CONFIG_GPIO_MAX7300=m -CONFIG_GPIO_MAX732X=m -CONFIG_GPIO_PCA953X=m -CONFIG_GPIO_PCF857X=m -CONFIG_GPIO_TPIC2810=m -# end of 
I2C GPIO expanders - -# -# MFD GPIO expanders -# -CONFIG_GPIO_ADP5520=m -CONFIG_GPIO_ARIZONA=m -CONFIG_GPIO_BD70528=m -CONFIG_GPIO_BD71828=m -CONFIG_GPIO_BD9571MWV=m -CONFIG_GPIO_CRYSTAL_COVE=m -CONFIG_GPIO_DA9052=m -CONFIG_GPIO_DA9055=m -CONFIG_GPIO_DLN2=m -CONFIG_GPIO_JANZ_TTL=m -CONFIG_GPIO_KEMPLD=m -CONFIG_GPIO_LP3943=m -CONFIG_GPIO_LP873X=m -CONFIG_GPIO_LP87565=m -CONFIG_GPIO_MADERA=m -CONFIG_GPIO_MAX77620=m -CONFIG_GPIO_MAX77650=m -CONFIG_GPIO_PALMAS=y -CONFIG_GPIO_RC5T583=y -CONFIG_GPIO_STMPE=y -CONFIG_GPIO_TC3589X=y -CONFIG_GPIO_TPS65086=m -CONFIG_GPIO_TPS65218=m -CONFIG_GPIO_TPS6586X=y -CONFIG_GPIO_TPS65910=y -CONFIG_GPIO_TPS65912=m -CONFIG_GPIO_TPS68470=y -CONFIG_GPIO_TQMX86=m -CONFIG_GPIO_TWL4030=m -CONFIG_GPIO_TWL6040=m -CONFIG_GPIO_UCB1400=m -CONFIG_GPIO_WHISKEY_COVE=m -CONFIG_GPIO_WM831X=m -CONFIG_GPIO_WM8350=m -CONFIG_GPIO_WM8994=m -# end of MFD GPIO expanders - -# -# PCI GPIO expanders -# -CONFIG_GPIO_AMD8111=m -CONFIG_GPIO_ML_IOH=m -CONFIG_GPIO_PCI_IDIO_16=m -CONFIG_GPIO_PCIE_IDIO_24=m -CONFIG_GPIO_RDC321X=m -CONFIG_GPIO_SODAVILLE=y -# end of PCI GPIO expanders - -# -# SPI GPIO expanders -# -CONFIG_GPIO_74X164=m -CONFIG_GPIO_MAX3191X=m -CONFIG_GPIO_MAX7301=m -CONFIG_GPIO_MC33880=m -CONFIG_GPIO_PISOSR=m -CONFIG_GPIO_XRA1403=m -CONFIG_GPIO_MOXTET=m -# end of SPI GPIO expanders - -# -# USB GPIO expanders -# -CONFIG_GPIO_VIPERBOARD=m -# end of USB GPIO expanders - -CONFIG_GPIO_MOCKUP=m -CONFIG_W1=m -CONFIG_W1_CON=y - -# -# 1-wire Bus Masters -# -CONFIG_W1_MASTER_MATROX=m -CONFIG_W1_MASTER_DS2490=m -CONFIG_W1_MASTER_DS2482=m -CONFIG_W1_MASTER_DS1WM=m -CONFIG_W1_MASTER_GPIO=m -CONFIG_W1_MASTER_SGI=m -# end of 1-wire Bus Masters - -# -# 1-wire Slaves -# -CONFIG_W1_SLAVE_THERM=m -CONFIG_W1_SLAVE_SMEM=m -CONFIG_W1_SLAVE_DS2405=m -CONFIG_W1_SLAVE_DS2408=m -# CONFIG_W1_SLAVE_DS2408_READBACK is not set -CONFIG_W1_SLAVE_DS2413=m -CONFIG_W1_SLAVE_DS2406=m -CONFIG_W1_SLAVE_DS2423=m -CONFIG_W1_SLAVE_DS2805=m -CONFIG_W1_SLAVE_DS2430=m -CONFIG_W1_SLAVE_DS2431=m 
-CONFIG_W1_SLAVE_DS2433=m -# CONFIG_W1_SLAVE_DS2433_CRC is not set -CONFIG_W1_SLAVE_DS2438=m -CONFIG_W1_SLAVE_DS250X=m -CONFIG_W1_SLAVE_DS2780=m -CONFIG_W1_SLAVE_DS2781=m -CONFIG_W1_SLAVE_DS28E04=m -CONFIG_W1_SLAVE_DS28E17=m -# end of 1-wire Slaves - -CONFIG_POWER_AVS=y -CONFIG_QCOM_CPR=m -CONFIG_POWER_RESET=y -CONFIG_POWER_RESET_AS3722=y -CONFIG_POWER_RESET_GPIO=y -CONFIG_POWER_RESET_GPIO_RESTART=y -CONFIG_POWER_RESET_LTC2952=y -CONFIG_POWER_RESET_MT6323=y -CONFIG_POWER_RESET_RESTART=y -CONFIG_POWER_RESET_SYSCON=y -CONFIG_POWER_RESET_SYSCON_POWEROFF=y -CONFIG_REBOOT_MODE=m -CONFIG_SYSCON_REBOOT_MODE=m -CONFIG_NVMEM_REBOOT_MODE=m -CONFIG_POWER_SUPPLY=y -# CONFIG_POWER_SUPPLY_DEBUG is not set -CONFIG_POWER_SUPPLY_HWMON=y -CONFIG_PDA_POWER=m -CONFIG_GENERIC_ADC_BATTERY=m -CONFIG_MAX8925_POWER=m -CONFIG_WM831X_BACKUP=m -CONFIG_WM831X_POWER=m -CONFIG_WM8350_POWER=m -CONFIG_TEST_POWER=m -CONFIG_BATTERY_88PM860X=m -CONFIG_CHARGER_ADP5061=m -CONFIG_BATTERY_ACT8945A=m -CONFIG_BATTERY_CPCAP=m -CONFIG_BATTERY_DS2760=m -CONFIG_BATTERY_DS2780=m -CONFIG_BATTERY_DS2781=m -CONFIG_BATTERY_DS2782=m -CONFIG_BATTERY_LEGO_EV3=m -CONFIG_BATTERY_SBS=m -CONFIG_CHARGER_SBS=m -CONFIG_MANAGER_SBS=m -CONFIG_BATTERY_BQ27XXX=m -CONFIG_BATTERY_BQ27XXX_I2C=m -CONFIG_BATTERY_BQ27XXX_HDQ=m -# CONFIG_BATTERY_BQ27XXX_DT_UPDATES_NVM is not set -CONFIG_BATTERY_DA9030=m -CONFIG_BATTERY_DA9052=m -CONFIG_CHARGER_DA9150=m -CONFIG_BATTERY_DA9150=m -CONFIG_CHARGER_AXP20X=m -CONFIG_BATTERY_AXP20X=m -CONFIG_AXP20X_POWER=m -CONFIG_AXP288_CHARGER=m -CONFIG_AXP288_FUEL_GAUGE=m -CONFIG_BATTERY_MAX17040=m -CONFIG_BATTERY_MAX17042=m -CONFIG_BATTERY_MAX1721X=m -CONFIG_BATTERY_TWL4030_MADC=m -CONFIG_CHARGER_88PM860X=m -CONFIG_CHARGER_PCF50633=m -CONFIG_BATTERY_RX51=m -CONFIG_CHARGER_ISP1704=m -CONFIG_CHARGER_MAX8903=m -CONFIG_CHARGER_TWL4030=m -CONFIG_CHARGER_LP8727=m -CONFIG_CHARGER_LP8788=m -CONFIG_CHARGER_GPIO=m -CONFIG_CHARGER_MANAGER=y -CONFIG_CHARGER_LT3651=m -CONFIG_CHARGER_MAX14577=m 
-CONFIG_CHARGER_DETECTOR_MAX14656=m -CONFIG_CHARGER_MAX77650=m -CONFIG_CHARGER_MAX77693=m -CONFIG_CHARGER_MAX8997=m -CONFIG_CHARGER_MAX8998=m -CONFIG_CHARGER_BQ2415X=m -CONFIG_CHARGER_BQ24190=m -CONFIG_CHARGER_BQ24257=m -CONFIG_CHARGER_BQ24735=m -CONFIG_CHARGER_BQ25890=m -CONFIG_CHARGER_SMB347=m -CONFIG_CHARGER_TPS65090=m -CONFIG_CHARGER_TPS65217=m -CONFIG_BATTERY_GAUGE_LTC2941=m -CONFIG_BATTERY_RT5033=m -CONFIG_CHARGER_RT9455=m -CONFIG_CHARGER_CROS_USBPD=m -CONFIG_CHARGER_UCS1002=m -CONFIG_CHARGER_BD70528=m -CONFIG_CHARGER_WILCO=m -CONFIG_HWMON=y -CONFIG_HWMON_VID=m -# CONFIG_HWMON_DEBUG_CHIP is not set - -# -# Native drivers -# -CONFIG_SENSORS_ABITUGURU=m -CONFIG_SENSORS_ABITUGURU3=m -CONFIG_SENSORS_AD7314=m -CONFIG_SENSORS_AD7414=m -CONFIG_SENSORS_AD7418=m -CONFIG_SENSORS_ADM1021=m -CONFIG_SENSORS_ADM1025=m -CONFIG_SENSORS_ADM1026=m -CONFIG_SENSORS_ADM1029=m -CONFIG_SENSORS_ADM1031=m -CONFIG_SENSORS_ADM1177=m -CONFIG_SENSORS_ADM9240=m -CONFIG_SENSORS_ADT7X10=m -CONFIG_SENSORS_ADT7310=m -CONFIG_SENSORS_ADT7410=m -CONFIG_SENSORS_ADT7411=m -CONFIG_SENSORS_ADT7462=m -CONFIG_SENSORS_ADT7470=m -CONFIG_SENSORS_ADT7475=m -CONFIG_SENSORS_AS370=m -CONFIG_SENSORS_ASC7621=m -CONFIG_SENSORS_AXI_FAN_CONTROL=m -CONFIG_SENSORS_K8TEMP=m -CONFIG_SENSORS_K10TEMP=m -CONFIG_SENSORS_FAM15H_POWER=m -CONFIG_SENSORS_APPLESMC=m -CONFIG_SENSORS_ASB100=m -CONFIG_SENSORS_ASPEED=m -CONFIG_SENSORS_ATXP1=m -CONFIG_SENSORS_DRIVETEMP=m -CONFIG_SENSORS_DS620=m -CONFIG_SENSORS_DS1621=m -CONFIG_SENSORS_DELL_SMM=m -CONFIG_SENSORS_DA9052_ADC=m -CONFIG_SENSORS_DA9055=m -CONFIG_SENSORS_I5K_AMB=m -CONFIG_SENSORS_F71805F=m -CONFIG_SENSORS_F71882FG=m -CONFIG_SENSORS_F75375S=m -CONFIG_SENSORS_MC13783_ADC=m -CONFIG_SENSORS_FSCHMD=m -CONFIG_SENSORS_FTSTEUTATES=m -CONFIG_SENSORS_GL518SM=m -CONFIG_SENSORS_GL520SM=m -CONFIG_SENSORS_G760A=m -CONFIG_SENSORS_G762=m -CONFIG_SENSORS_GPIO_FAN=m -CONFIG_SENSORS_HIH6130=m -CONFIG_SENSORS_IBMAEM=m -CONFIG_SENSORS_IBMPEX=m -CONFIG_SENSORS_IIO_HWMON=m 
-CONFIG_SENSORS_I5500=m -CONFIG_SENSORS_CORETEMP=m -CONFIG_SENSORS_IT87=m -CONFIG_SENSORS_JC42=m -CONFIG_SENSORS_POWR1220=m -CONFIG_SENSORS_LINEAGE=m -CONFIG_SENSORS_LOCHNAGAR=m -CONFIG_SENSORS_LTC2945=m -CONFIG_SENSORS_LTC2947=m -CONFIG_SENSORS_LTC2947_I2C=m -CONFIG_SENSORS_LTC2947_SPI=m -CONFIG_SENSORS_LTC2990=m -CONFIG_SENSORS_LTC4151=m -CONFIG_SENSORS_LTC4215=m -CONFIG_SENSORS_LTC4222=m -CONFIG_SENSORS_LTC4245=m -CONFIG_SENSORS_LTC4260=m -CONFIG_SENSORS_LTC4261=m -CONFIG_SENSORS_MAX1111=m -CONFIG_SENSORS_MAX16065=m -CONFIG_SENSORS_MAX1619=m -CONFIG_SENSORS_MAX1668=m -CONFIG_SENSORS_MAX197=m -CONFIG_SENSORS_MAX31722=m -CONFIG_SENSORS_MAX31730=m -CONFIG_SENSORS_MAX6621=m -CONFIG_SENSORS_MAX6639=m -CONFIG_SENSORS_MAX6642=m -CONFIG_SENSORS_MAX6650=m -CONFIG_SENSORS_MAX6697=m -CONFIG_SENSORS_MAX31790=m -CONFIG_SENSORS_MCP3021=m -CONFIG_SENSORS_MLXREG_FAN=m -CONFIG_SENSORS_TC654=m -CONFIG_SENSORS_MENF21BMC_HWMON=m -CONFIG_SENSORS_ADCXX=m -CONFIG_SENSORS_LM63=m -CONFIG_SENSORS_LM70=m -CONFIG_SENSORS_LM73=m -CONFIG_SENSORS_LM75=m -CONFIG_SENSORS_LM77=m -CONFIG_SENSORS_LM78=m -CONFIG_SENSORS_LM80=m -CONFIG_SENSORS_LM83=m -CONFIG_SENSORS_LM85=m -CONFIG_SENSORS_LM87=m -CONFIG_SENSORS_LM90=m -CONFIG_SENSORS_LM92=m -CONFIG_SENSORS_LM93=m -CONFIG_SENSORS_LM95234=m -CONFIG_SENSORS_LM95241=m -CONFIG_SENSORS_LM95245=m -CONFIG_SENSORS_PC87360=m -CONFIG_SENSORS_PC87427=m -CONFIG_SENSORS_NTC_THERMISTOR=m -CONFIG_SENSORS_NCT6683=m -CONFIG_SENSORS_NCT6775=m -CONFIG_SENSORS_NCT7802=m -CONFIG_SENSORS_NCT7904=m -CONFIG_SENSORS_NPCM7XX=m -CONFIG_SENSORS_PCF8591=m -CONFIG_PMBUS=m -CONFIG_SENSORS_PMBUS=m -CONFIG_SENSORS_ADM1275=m -CONFIG_SENSORS_BEL_PFE=m -CONFIG_SENSORS_IBM_CFFPS=m -CONFIG_SENSORS_INSPUR_IPSPS=m -CONFIG_SENSORS_IR35221=m -CONFIG_SENSORS_IR38064=m -CONFIG_SENSORS_IRPS5401=m -CONFIG_SENSORS_ISL68137=m -CONFIG_SENSORS_LM25066=m -CONFIG_SENSORS_LTC2978=m -# CONFIG_SENSORS_LTC2978_REGULATOR is not set -CONFIG_SENSORS_LTC3815=m -CONFIG_SENSORS_MAX16064=m 
-CONFIG_SENSORS_MAX20730=m -CONFIG_SENSORS_MAX20751=m -CONFIG_SENSORS_MAX31785=m -CONFIG_SENSORS_MAX34440=m -CONFIG_SENSORS_MAX8688=m -CONFIG_SENSORS_PXE1610=m -CONFIG_SENSORS_TPS40422=m -CONFIG_SENSORS_TPS53679=m -CONFIG_SENSORS_UCD9000=m -CONFIG_SENSORS_UCD9200=m -CONFIG_SENSORS_XDPE122=m -CONFIG_SENSORS_ZL6100=m -CONFIG_SENSORS_PWM_FAN=m -CONFIG_SENSORS_SHT15=m -CONFIG_SENSORS_SHT21=m -CONFIG_SENSORS_SHT3x=m -CONFIG_SENSORS_SHTC1=m -CONFIG_SENSORS_SIS5595=m -CONFIG_SENSORS_DME1737=m -CONFIG_SENSORS_EMC1403=m -CONFIG_SENSORS_EMC2103=m -CONFIG_SENSORS_EMC6W201=m -CONFIG_SENSORS_SMSC47M1=m -CONFIG_SENSORS_SMSC47M192=m -CONFIG_SENSORS_SMSC47B397=m -CONFIG_SENSORS_SCH56XX_COMMON=m -CONFIG_SENSORS_SCH5627=m -CONFIG_SENSORS_SCH5636=m -CONFIG_SENSORS_STTS751=m -CONFIG_SENSORS_SMM665=m -CONFIG_SENSORS_ADC128D818=m -CONFIG_SENSORS_ADS7828=m -CONFIG_SENSORS_ADS7871=m -CONFIG_SENSORS_AMC6821=m -CONFIG_SENSORS_INA209=m -CONFIG_SENSORS_INA2XX=m -CONFIG_SENSORS_INA3221=m -CONFIG_SENSORS_TC74=m -CONFIG_SENSORS_THMC50=m -CONFIG_SENSORS_TMP102=m -CONFIG_SENSORS_TMP103=m -CONFIG_SENSORS_TMP108=m -CONFIG_SENSORS_TMP401=m -CONFIG_SENSORS_TMP421=m -CONFIG_SENSORS_TMP513=m -CONFIG_SENSORS_VIA_CPUTEMP=m -CONFIG_SENSORS_VIA686A=m -CONFIG_SENSORS_VT1211=m -CONFIG_SENSORS_VT8231=m -CONFIG_SENSORS_W83773G=m -CONFIG_SENSORS_W83781D=m -CONFIG_SENSORS_W83791D=m -CONFIG_SENSORS_W83792D=m -CONFIG_SENSORS_W83793=m -CONFIG_SENSORS_W83795=m -# CONFIG_SENSORS_W83795_FANCTRL is not set -CONFIG_SENSORS_W83L785TS=m -CONFIG_SENSORS_W83L786NG=m -CONFIG_SENSORS_W83627HF=m -CONFIG_SENSORS_W83627EHF=m -CONFIG_SENSORS_WM831X=m -CONFIG_SENSORS_WM8350=m -CONFIG_SENSORS_XGENE=m - -# -# ACPI drivers -# -CONFIG_SENSORS_ACPI_POWER=m -CONFIG_SENSORS_ATK0110=m -CONFIG_THERMAL=y -# CONFIG_THERMAL_STATISTICS is not set -CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS=100 -CONFIG_THERMAL_HWMON=y -CONFIG_THERMAL_OF=y -CONFIG_THERMAL_WRITABLE_TRIPS=y -CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE=y -# 
CONFIG_THERMAL_DEFAULT_GOV_FAIR_SHARE is not set -# CONFIG_THERMAL_DEFAULT_GOV_USER_SPACE is not set -# CONFIG_THERMAL_DEFAULT_GOV_POWER_ALLOCATOR is not set -CONFIG_THERMAL_GOV_FAIR_SHARE=y -CONFIG_THERMAL_GOV_STEP_WISE=y -CONFIG_THERMAL_GOV_BANG_BANG=y -CONFIG_THERMAL_GOV_USER_SPACE=y -CONFIG_THERMAL_GOV_POWER_ALLOCATOR=y -CONFIG_CPU_THERMAL=y -CONFIG_CPU_FREQ_THERMAL=y -CONFIG_CPU_IDLE_THERMAL=y -CONFIG_CLOCK_THERMAL=y -CONFIG_DEVFREQ_THERMAL=y -# CONFIG_THERMAL_EMULATION is not set -CONFIG_THERMAL_MMIO=m -CONFIG_MAX77620_THERMAL=m -CONFIG_QORIQ_THERMAL=m -CONFIG_DA9062_THERMAL=m - -# -# Intel thermal drivers -# -CONFIG_INTEL_POWERCLAMP=m -CONFIG_X86_PKG_TEMP_THERMAL=m -CONFIG_INTEL_SOC_DTS_IOSF_CORE=m -CONFIG_INTEL_SOC_DTS_THERMAL=m - -# -# ACPI INT340X thermal drivers -# -CONFIG_INT340X_THERMAL=m -CONFIG_ACPI_THERMAL_REL=m -CONFIG_INT3406_THERMAL=m -CONFIG_PROC_THERMAL_MMIO_RAPL=y -# end of ACPI INT340X thermal drivers - -CONFIG_INTEL_BXT_PMIC_THERMAL=m -CONFIG_INTEL_PCH_THERMAL=m -# end of Intel thermal drivers - -# CONFIG_TI_SOC_THERMAL is not set -CONFIG_GENERIC_ADC_THERMAL=m -CONFIG_WATCHDOG=y -CONFIG_WATCHDOG_CORE=y -# CONFIG_WATCHDOG_NOWAYOUT is not set -CONFIG_WATCHDOG_HANDLE_BOOT_ENABLED=y -CONFIG_WATCHDOG_OPEN_TIMEOUT=0 -CONFIG_WATCHDOG_SYSFS=y - -# -# Watchdog Pretimeout Governors -# -CONFIG_WATCHDOG_PRETIMEOUT_GOV=y -CONFIG_WATCHDOG_PRETIMEOUT_GOV_SEL=m -CONFIG_WATCHDOG_PRETIMEOUT_GOV_NOOP=m -CONFIG_WATCHDOG_PRETIMEOUT_GOV_PANIC=y -# CONFIG_WATCHDOG_PRETIMEOUT_DEFAULT_GOV_NOOP is not set -CONFIG_WATCHDOG_PRETIMEOUT_DEFAULT_GOV_PANIC=y - -# -# Watchdog Device Drivers -# -CONFIG_SOFT_WATCHDOG=m -# CONFIG_SOFT_WATCHDOG_PRETIMEOUT is not set -CONFIG_BD70528_WATCHDOG=m -CONFIG_DA9052_WATCHDOG=m -CONFIG_DA9055_WATCHDOG=m -CONFIG_DA9063_WATCHDOG=m -CONFIG_DA9062_WATCHDOG=m -CONFIG_GPIO_WATCHDOG=m -CONFIG_MENF21BMC_WATCHDOG=m -CONFIG_MENZ069_WATCHDOG=m -CONFIG_WDAT_WDT=m -CONFIG_WM831X_WATCHDOG=m -CONFIG_WM8350_WATCHDOG=m -CONFIG_XILINX_WATCHDOG=m 
-CONFIG_ZIIRAVE_WATCHDOG=m -CONFIG_RAVE_SP_WATCHDOG=m -CONFIG_MLX_WDT=m -CONFIG_CADENCE_WATCHDOG=m -CONFIG_DW_WATCHDOG=m -CONFIG_RN5T618_WATCHDOG=m -CONFIG_TWL4030_WATCHDOG=m -CONFIG_MAX63XX_WATCHDOG=m -CONFIG_MAX77620_WATCHDOG=m -CONFIG_RETU_WATCHDOG=m -CONFIG_STPMIC1_WATCHDOG=m -CONFIG_ACQUIRE_WDT=m -CONFIG_ADVANTECH_WDT=m -CONFIG_ALIM1535_WDT=m -CONFIG_ALIM7101_WDT=m -CONFIG_EBC_C384_WDT=m -CONFIG_F71808E_WDT=m -CONFIG_SP5100_TCO=m -CONFIG_SBC_FITPC2_WATCHDOG=m -CONFIG_EUROTECH_WDT=m -CONFIG_IB700_WDT=m -CONFIG_IBMASR=m -CONFIG_WAFER_WDT=m -CONFIG_I6300ESB_WDT=m -CONFIG_IE6XX_WDT=m -CONFIG_ITCO_WDT=m -CONFIG_ITCO_VENDOR_SUPPORT=y -CONFIG_IT8712F_WDT=m -CONFIG_IT87_WDT=m -CONFIG_HP_WATCHDOG=m -CONFIG_HPWDT_NMI_DECODING=y -CONFIG_KEMPLD_WDT=m -CONFIG_SC1200_WDT=m -CONFIG_PC87413_WDT=m -CONFIG_NV_TCO=m -CONFIG_60XX_WDT=m -CONFIG_CPU5_WDT=m -CONFIG_SMSC_SCH311X_WDT=m -CONFIG_SMSC37B787_WDT=m -CONFIG_TQMX86_WDT=m -CONFIG_VIA_WDT=m -CONFIG_W83627HF_WDT=m -CONFIG_W83877F_WDT=m -CONFIG_W83977F_WDT=m -CONFIG_MACHZ_WDT=m -CONFIG_SBC_EPX_C3_WATCHDOG=m -CONFIG_INTEL_MEI_WDT=m -CONFIG_NI903X_WDT=m -CONFIG_NIC7018_WDT=m -CONFIG_MEN_A21_WDT=m -CONFIG_XEN_WDT=m - -# -# PCI-based Watchdog Cards -# -CONFIG_PCIPCWATCHDOG=m -CONFIG_WDTPCI=m - -# -# USB-based Watchdog Cards -# -CONFIG_USBPCWATCHDOG=m -CONFIG_SSB_POSSIBLE=y -CONFIG_SSB=m -CONFIG_SSB_SPROM=y -CONFIG_SSB_BLOCKIO=y -CONFIG_SSB_PCIHOST_POSSIBLE=y -CONFIG_SSB_PCIHOST=y -CONFIG_SSB_B43_PCI_BRIDGE=y -CONFIG_SSB_PCMCIAHOST_POSSIBLE=y -CONFIG_SSB_PCMCIAHOST=y -CONFIG_SSB_SDIOHOST_POSSIBLE=y -CONFIG_SSB_SDIOHOST=y -CONFIG_SSB_DRIVER_PCICORE_POSSIBLE=y -CONFIG_SSB_DRIVER_PCICORE=y -CONFIG_SSB_DRIVER_GPIO=y -CONFIG_BCMA_POSSIBLE=y -CONFIG_BCMA=m -CONFIG_BCMA_BLOCKIO=y -CONFIG_BCMA_HOST_PCI_POSSIBLE=y -CONFIG_BCMA_HOST_PCI=y -# CONFIG_BCMA_HOST_SOC is not set -CONFIG_BCMA_DRIVER_PCI=y -CONFIG_BCMA_DRIVER_GMAC_CMN=y -CONFIG_BCMA_DRIVER_GPIO=y -# CONFIG_BCMA_DEBUG is not set - -# -# Multifunction device drivers -# 
-CONFIG_MFD_CORE=y -CONFIG_MFD_ACT8945A=m -CONFIG_MFD_AS3711=y -CONFIG_MFD_AS3722=m -CONFIG_PMIC_ADP5520=y -CONFIG_MFD_AAT2870_CORE=y -CONFIG_MFD_ATMEL_FLEXCOM=m -CONFIG_MFD_ATMEL_HLCDC=m -CONFIG_MFD_BCM590XX=m -CONFIG_MFD_BD9571MWV=m -CONFIG_MFD_AXP20X=m -CONFIG_MFD_AXP20X_I2C=m -CONFIG_MFD_CROS_EC_DEV=m -CONFIG_MFD_MADERA=m -CONFIG_MFD_MADERA_I2C=m -CONFIG_MFD_MADERA_SPI=m -CONFIG_MFD_CS47L15=y -CONFIG_MFD_CS47L35=y -CONFIG_MFD_CS47L85=y -CONFIG_MFD_CS47L90=y -CONFIG_MFD_CS47L92=y -CONFIG_PMIC_DA903X=y -CONFIG_PMIC_DA9052=y -CONFIG_MFD_DA9052_SPI=y -CONFIG_MFD_DA9052_I2C=y -CONFIG_MFD_DA9055=y -CONFIG_MFD_DA9062=m -CONFIG_MFD_DA9063=m -CONFIG_MFD_DA9150=m -CONFIG_MFD_DLN2=m -CONFIG_MFD_MC13XXX=m -CONFIG_MFD_MC13XXX_SPI=m -CONFIG_MFD_MC13XXX_I2C=m -CONFIG_MFD_HI6421_PMIC=m -CONFIG_HTC_PASIC3=m -CONFIG_HTC_I2CPLD=y -CONFIG_MFD_INTEL_QUARK_I2C_GPIO=m -CONFIG_LPC_ICH=m -CONFIG_LPC_SCH=m -CONFIG_INTEL_SOC_PMIC=y -CONFIG_INTEL_SOC_PMIC_BXTWC=m -CONFIG_INTEL_SOC_PMIC_CHTWC=y -CONFIG_INTEL_SOC_PMIC_CHTDC_TI=m -CONFIG_MFD_INTEL_LPSS=m -CONFIG_MFD_INTEL_LPSS_ACPI=m -CONFIG_MFD_INTEL_LPSS_PCI=m -CONFIG_MFD_IQS62X=m -CONFIG_MFD_JANZ_CMODIO=m -CONFIG_MFD_KEMPLD=m -CONFIG_MFD_88PM800=m -CONFIG_MFD_88PM805=m -CONFIG_MFD_88PM860X=y -CONFIG_MFD_MAX14577=m -CONFIG_MFD_MAX77620=y -CONFIG_MFD_MAX77650=m -CONFIG_MFD_MAX77686=m -CONFIG_MFD_MAX77693=m -CONFIG_MFD_MAX77843=y -CONFIG_MFD_MAX8907=m -CONFIG_MFD_MAX8925=y -CONFIG_MFD_MAX8997=y -CONFIG_MFD_MAX8998=y -CONFIG_MFD_MT6397=m -CONFIG_MFD_MENF21BMC=m -CONFIG_EZX_PCAP=y -CONFIG_MFD_CPCAP=m -CONFIG_MFD_VIPERBOARD=m -CONFIG_MFD_RETU=m -CONFIG_MFD_PCF50633=m -CONFIG_PCF50633_ADC=m -CONFIG_PCF50633_GPIO=m -CONFIG_UCB1400_CORE=m -CONFIG_MFD_RDC321X=m -CONFIG_MFD_RT5033=m -CONFIG_MFD_RC5T583=y -CONFIG_MFD_RK808=m -CONFIG_MFD_RN5T618=m -CONFIG_MFD_SEC_CORE=y -CONFIG_MFD_SI476X_CORE=m -CONFIG_MFD_SM501=m -CONFIG_MFD_SM501_GPIO=y -CONFIG_MFD_SKY81452=m -CONFIG_MFD_SMSC=y -CONFIG_ABX500_CORE=y -CONFIG_AB3100_CORE=y -CONFIG_AB3100_OTP=y 
-CONFIG_MFD_STMPE=y - -# -# STMicroelectronics STMPE Interface Drivers -# -CONFIG_STMPE_I2C=y -CONFIG_STMPE_SPI=y -# end of STMicroelectronics STMPE Interface Drivers - -CONFIG_MFD_SYSCON=y -CONFIG_MFD_TI_AM335X_TSCADC=m -CONFIG_MFD_LP3943=m -CONFIG_MFD_LP8788=y -CONFIG_MFD_TI_LMU=m -CONFIG_MFD_PALMAS=y -CONFIG_TPS6105X=m -CONFIG_TPS65010=m -CONFIG_TPS6507X=m -CONFIG_MFD_TPS65086=m -CONFIG_MFD_TPS65090=y -CONFIG_MFD_TPS65217=m -CONFIG_MFD_TPS68470=y -CONFIG_MFD_TI_LP873X=m -CONFIG_MFD_TI_LP87565=m -CONFIG_MFD_TPS65218=m -CONFIG_MFD_TPS6586X=y -CONFIG_MFD_TPS65910=y -CONFIG_MFD_TPS65912=m -CONFIG_MFD_TPS65912_I2C=m -CONFIG_MFD_TPS65912_SPI=m -CONFIG_MFD_TPS80031=y -CONFIG_TWL4030_CORE=y -CONFIG_MFD_TWL4030_AUDIO=y -CONFIG_TWL6040_CORE=y -CONFIG_MFD_WL1273_CORE=m -CONFIG_MFD_LM3533=m -CONFIG_MFD_TC3589X=y -CONFIG_MFD_TQMX86=m -CONFIG_MFD_VX855=m -CONFIG_MFD_LOCHNAGAR=y -CONFIG_MFD_ARIZONA=y -CONFIG_MFD_ARIZONA_I2C=m -CONFIG_MFD_ARIZONA_SPI=m -CONFIG_MFD_CS47L24=y -CONFIG_MFD_WM5102=y -CONFIG_MFD_WM5110=y -CONFIG_MFD_WM8997=y -CONFIG_MFD_WM8998=y -CONFIG_MFD_WM8400=y -CONFIG_MFD_WM831X=y -CONFIG_MFD_WM831X_I2C=y -CONFIG_MFD_WM831X_SPI=y -CONFIG_MFD_WM8350=y -CONFIG_MFD_WM8350_I2C=y -CONFIG_MFD_WM8994=m -CONFIG_MFD_ROHM_BD718XX=m -CONFIG_MFD_ROHM_BD70528=m -CONFIG_MFD_ROHM_BD71828=m -CONFIG_MFD_STPMIC1=m -CONFIG_MFD_STMFX=m -CONFIG_MFD_WCD934X=m -CONFIG_RAVE_SP_CORE=m -# end of Multifunction device drivers - -CONFIG_REGULATOR=y -# CONFIG_REGULATOR_DEBUG is not set -CONFIG_REGULATOR_FIXED_VOLTAGE=m -CONFIG_REGULATOR_VIRTUAL_CONSUMER=m -CONFIG_REGULATOR_USERSPACE_CONSUMER=m -CONFIG_REGULATOR_88PG86X=m -CONFIG_REGULATOR_88PM800=m -CONFIG_REGULATOR_88PM8607=m -CONFIG_REGULATOR_ACT8865=m -CONFIG_REGULATOR_ACT8945A=m -CONFIG_REGULATOR_AD5398=m -CONFIG_REGULATOR_AAT2870=m -CONFIG_REGULATOR_AB3100=m -CONFIG_REGULATOR_ARIZONA_LDO1=m -CONFIG_REGULATOR_ARIZONA_MICSUPP=m -CONFIG_REGULATOR_AS3711=m -CONFIG_REGULATOR_AS3722=m -CONFIG_REGULATOR_AXP20X=m -CONFIG_REGULATOR_BCM590XX=m 
-CONFIG_REGULATOR_BD70528=m -CONFIG_REGULATOR_BD71828=m -CONFIG_REGULATOR_BD718XX=m -CONFIG_REGULATOR_BD9571MWV=m -CONFIG_REGULATOR_CPCAP=m -CONFIG_REGULATOR_DA903X=m -CONFIG_REGULATOR_DA9052=m -CONFIG_REGULATOR_DA9055=m -CONFIG_REGULATOR_DA9062=m -CONFIG_REGULATOR_DA9063=m -CONFIG_REGULATOR_DA9210=m -CONFIG_REGULATOR_DA9211=m -CONFIG_REGULATOR_FAN53555=m -CONFIG_REGULATOR_GPIO=m -CONFIG_REGULATOR_HI6421=m -CONFIG_REGULATOR_HI6421V530=m -CONFIG_REGULATOR_ISL9305=m -CONFIG_REGULATOR_ISL6271A=m -CONFIG_REGULATOR_LM363X=m -CONFIG_REGULATOR_LOCHNAGAR=m -CONFIG_REGULATOR_LP3971=m -CONFIG_REGULATOR_LP3972=m -CONFIG_REGULATOR_LP872X=m -CONFIG_REGULATOR_LP873X=m -CONFIG_REGULATOR_LP8755=m -CONFIG_REGULATOR_LP87565=m -CONFIG_REGULATOR_LP8788=m -CONFIG_REGULATOR_LTC3589=m -CONFIG_REGULATOR_LTC3676=m -CONFIG_REGULATOR_MAX14577=m -CONFIG_REGULATOR_MAX1586=m -CONFIG_REGULATOR_MAX77620=m -CONFIG_REGULATOR_MAX77650=m -CONFIG_REGULATOR_MAX8649=m -CONFIG_REGULATOR_MAX8660=m -CONFIG_REGULATOR_MAX8907=m -CONFIG_REGULATOR_MAX8925=m -CONFIG_REGULATOR_MAX8952=m -CONFIG_REGULATOR_MAX8973=m -CONFIG_REGULATOR_MAX8997=m -CONFIG_REGULATOR_MAX8998=m -CONFIG_REGULATOR_MAX77686=m -CONFIG_REGULATOR_MAX77693=m -CONFIG_REGULATOR_MAX77802=m -CONFIG_REGULATOR_MC13XXX_CORE=m -CONFIG_REGULATOR_MC13783=m -CONFIG_REGULATOR_MC13892=m -CONFIG_REGULATOR_MCP16502=m -CONFIG_REGULATOR_MP5416=m -CONFIG_REGULATOR_MP8859=m -CONFIG_REGULATOR_MP886X=m -CONFIG_REGULATOR_MPQ7920=m -CONFIG_REGULATOR_MT6311=m -CONFIG_REGULATOR_MT6323=m -CONFIG_REGULATOR_MT6397=m -CONFIG_REGULATOR_PALMAS=m -CONFIG_REGULATOR_PCAP=m -CONFIG_REGULATOR_PCF50633=m -CONFIG_REGULATOR_PFUZE100=m -CONFIG_REGULATOR_PV88060=m -CONFIG_REGULATOR_PV88080=m -CONFIG_REGULATOR_PV88090=m -CONFIG_REGULATOR_PWM=m -CONFIG_REGULATOR_QCOM_SPMI=m -CONFIG_REGULATOR_RC5T583=m -CONFIG_REGULATOR_RK808=m -CONFIG_REGULATOR_RN5T618=m -CONFIG_REGULATOR_ROHM=m -CONFIG_REGULATOR_RT5033=m -CONFIG_REGULATOR_S2MPA01=m -CONFIG_REGULATOR_S2MPS11=m 
-CONFIG_REGULATOR_S5M8767=m -CONFIG_REGULATOR_SKY81452=m -CONFIG_REGULATOR_SLG51000=m -CONFIG_REGULATOR_STPMIC1=m -CONFIG_REGULATOR_SY8106A=m -CONFIG_REGULATOR_SY8824X=m -CONFIG_REGULATOR_TPS51632=m -CONFIG_REGULATOR_TPS6105X=m -CONFIG_REGULATOR_TPS62360=m -CONFIG_REGULATOR_TPS65023=m -CONFIG_REGULATOR_TPS6507X=m -CONFIG_REGULATOR_TPS65086=m -CONFIG_REGULATOR_TPS65090=m -CONFIG_REGULATOR_TPS65132=m -CONFIG_REGULATOR_TPS65217=m -CONFIG_REGULATOR_TPS65218=m -CONFIG_REGULATOR_TPS6524X=m -CONFIG_REGULATOR_TPS6586X=m -CONFIG_REGULATOR_TPS65910=m -CONFIG_REGULATOR_TPS65912=m -CONFIG_REGULATOR_TPS80031=m -CONFIG_REGULATOR_TWL4030=m -CONFIG_REGULATOR_VCTRL=m -CONFIG_REGULATOR_WM831X=m -CONFIG_REGULATOR_WM8350=m -CONFIG_REGULATOR_WM8400=m -CONFIG_REGULATOR_WM8994=m -CONFIG_CEC_CORE=m -CONFIG_CEC_NOTIFIER=y -CONFIG_CEC_PIN=y -CONFIG_RC_CORE=m -CONFIG_RC_MAP=m -CONFIG_LIRC=y -CONFIG_RC_DECODERS=y -CONFIG_IR_NEC_DECODER=m -CONFIG_IR_RC5_DECODER=m -CONFIG_IR_RC6_DECODER=m -CONFIG_IR_JVC_DECODER=m -CONFIG_IR_SONY_DECODER=m -CONFIG_IR_SANYO_DECODER=m -CONFIG_IR_SHARP_DECODER=m -CONFIG_IR_MCE_KBD_DECODER=m -CONFIG_IR_XMP_DECODER=m -CONFIG_IR_IMON_DECODER=m -CONFIG_IR_RCMM_DECODER=m -CONFIG_RC_DEVICES=y -CONFIG_RC_ATI_REMOTE=m -CONFIG_IR_ENE=m -CONFIG_IR_HIX5HD2=m -CONFIG_IR_IMON=m -CONFIG_IR_IMON_RAW=m -CONFIG_IR_MCEUSB=m -CONFIG_IR_ITE_CIR=m -CONFIG_IR_FINTEK=m -CONFIG_IR_NUVOTON=m -CONFIG_IR_REDRAT3=m -CONFIG_IR_SPI=m -CONFIG_IR_STREAMZAP=m -CONFIG_IR_WINBOND_CIR=m -CONFIG_IR_IGORPLUGUSB=m -CONFIG_IR_IGUANA=m -CONFIG_IR_TTUSBIR=m -CONFIG_RC_LOOPBACK=m -CONFIG_IR_GPIO_CIR=m -CONFIG_IR_GPIO_TX=m -CONFIG_IR_PWM_TX=m -CONFIG_IR_SERIAL=m -CONFIG_IR_SERIAL_TRANSMITTER=y -CONFIG_IR_SIR=m -CONFIG_RC_XBOX_DVD=m -CONFIG_MEDIA_SUPPORT=m - -# -# Multimedia core support -# -CONFIG_MEDIA_CAMERA_SUPPORT=y -CONFIG_MEDIA_ANALOG_TV_SUPPORT=y -CONFIG_MEDIA_DIGITAL_TV_SUPPORT=y -CONFIG_MEDIA_RADIO_SUPPORT=y -CONFIG_MEDIA_SDR_SUPPORT=y -CONFIG_MEDIA_CEC_SUPPORT=y -CONFIG_MEDIA_CEC_RC=y -# 
CONFIG_CEC_PIN_ERROR_INJ is not set -CONFIG_MEDIA_CONTROLLER=y -CONFIG_MEDIA_CONTROLLER_DVB=y -# CONFIG_MEDIA_CONTROLLER_REQUEST_API is not set -CONFIG_VIDEO_DEV=m -CONFIG_VIDEO_V4L2_SUBDEV_API=y -CONFIG_VIDEO_V4L2=m -CONFIG_VIDEO_V4L2_I2C=y -# CONFIG_VIDEO_ADV_DEBUG is not set -# CONFIG_VIDEO_FIXED_MINOR_RANGES is not set -CONFIG_VIDEO_TUNER=m -CONFIG_V4L2_MEM2MEM_DEV=m -CONFIG_V4L2_FLASH_LED_CLASS=m -CONFIG_V4L2_FWNODE=m -CONFIG_VIDEOBUF_GEN=m -CONFIG_VIDEOBUF_DMA_SG=m -CONFIG_VIDEOBUF_VMALLOC=m -CONFIG_DVB_CORE=m -CONFIG_DVB_MMAP=y -CONFIG_DVB_NET=y -CONFIG_TTPCI_EEPROM=m -CONFIG_DVB_MAX_ADAPTERS=16 -# CONFIG_DVB_DYNAMIC_MINORS is not set -# CONFIG_DVB_DEMUX_SECTION_LOSS_LOG is not set -# CONFIG_DVB_ULE_DEBUG is not set - -# -# Media drivers -# -CONFIG_MEDIA_USB_SUPPORT=y - -# -# Webcam devices -# -CONFIG_USB_VIDEO_CLASS=m -CONFIG_USB_VIDEO_CLASS_INPUT_EVDEV=y -CONFIG_USB_GSPCA=m -CONFIG_USB_M5602=m -CONFIG_USB_STV06XX=m -CONFIG_USB_GL860=m -CONFIG_USB_GSPCA_BENQ=m -CONFIG_USB_GSPCA_CONEX=m -CONFIG_USB_GSPCA_CPIA1=m -CONFIG_USB_GSPCA_DTCS033=m -CONFIG_USB_GSPCA_ETOMS=m -CONFIG_USB_GSPCA_FINEPIX=m -CONFIG_USB_GSPCA_JEILINJ=m -CONFIG_USB_GSPCA_JL2005BCD=m -CONFIG_USB_GSPCA_KINECT=m -CONFIG_USB_GSPCA_KONICA=m -CONFIG_USB_GSPCA_MARS=m -CONFIG_USB_GSPCA_MR97310A=m -CONFIG_USB_GSPCA_NW80X=m -CONFIG_USB_GSPCA_OV519=m -CONFIG_USB_GSPCA_OV534=m -CONFIG_USB_GSPCA_OV534_9=m -CONFIG_USB_GSPCA_PAC207=m -CONFIG_USB_GSPCA_PAC7302=m -CONFIG_USB_GSPCA_PAC7311=m -CONFIG_USB_GSPCA_SE401=m -CONFIG_USB_GSPCA_SN9C2028=m -CONFIG_USB_GSPCA_SN9C20X=m -CONFIG_USB_GSPCA_SONIXB=m -CONFIG_USB_GSPCA_SONIXJ=m -CONFIG_USB_GSPCA_SPCA500=m -CONFIG_USB_GSPCA_SPCA501=m -CONFIG_USB_GSPCA_SPCA505=m -CONFIG_USB_GSPCA_SPCA506=m -CONFIG_USB_GSPCA_SPCA508=m -CONFIG_USB_GSPCA_SPCA561=m -CONFIG_USB_GSPCA_SPCA1528=m -CONFIG_USB_GSPCA_SQ905=m -CONFIG_USB_GSPCA_SQ905C=m -CONFIG_USB_GSPCA_SQ930X=m -CONFIG_USB_GSPCA_STK014=m -CONFIG_USB_GSPCA_STK1135=m -CONFIG_USB_GSPCA_STV0680=m -CONFIG_USB_GSPCA_SUNPLUS=m 
-CONFIG_USB_GSPCA_T613=m -CONFIG_USB_GSPCA_TOPRO=m -CONFIG_USB_GSPCA_TOUPTEK=m -CONFIG_USB_GSPCA_TV8532=m -CONFIG_USB_GSPCA_VC032X=m -CONFIG_USB_GSPCA_VICAM=m -CONFIG_USB_GSPCA_XIRLINK_CIT=m -CONFIG_USB_GSPCA_ZC3XX=m -CONFIG_USB_PWC=m -# CONFIG_USB_PWC_DEBUG is not set -CONFIG_USB_PWC_INPUT_EVDEV=y -CONFIG_VIDEO_CPIA2=m -CONFIG_USB_ZR364XX=m -CONFIG_USB_STKWEBCAM=m -CONFIG_USB_S2255=m -CONFIG_VIDEO_USBTV=m - -# -# Analog TV USB devices -# -CONFIG_VIDEO_PVRUSB2=m -CONFIG_VIDEO_PVRUSB2_SYSFS=y -CONFIG_VIDEO_PVRUSB2_DVB=y -# CONFIG_VIDEO_PVRUSB2_DEBUGIFC is not set -CONFIG_VIDEO_HDPVR=m -CONFIG_VIDEO_STK1160_COMMON=m -CONFIG_VIDEO_STK1160=m -CONFIG_VIDEO_GO7007=m -CONFIG_VIDEO_GO7007_USB=m -CONFIG_VIDEO_GO7007_LOADER=m -CONFIG_VIDEO_GO7007_USB_S2250_BOARD=m - -# -# Analog/digital TV USB devices -# -CONFIG_VIDEO_AU0828=m -CONFIG_VIDEO_AU0828_V4L2=y -CONFIG_VIDEO_AU0828_RC=y -CONFIG_VIDEO_CX231XX=m -CONFIG_VIDEO_CX231XX_RC=y -CONFIG_VIDEO_CX231XX_ALSA=m -CONFIG_VIDEO_CX231XX_DVB=m -CONFIG_VIDEO_TM6000=m -CONFIG_VIDEO_TM6000_ALSA=m -CONFIG_VIDEO_TM6000_DVB=m - -# -# Digital TV USB devices -# -CONFIG_DVB_USB=m -# CONFIG_DVB_USB_DEBUG is not set -CONFIG_DVB_USB_DIB3000MC=m -CONFIG_DVB_USB_A800=m -CONFIG_DVB_USB_DIBUSB_MB=m -CONFIG_DVB_USB_DIBUSB_MB_FAULTY=y -CONFIG_DVB_USB_DIBUSB_MC=m -CONFIG_DVB_USB_DIB0700=m -CONFIG_DVB_USB_UMT_010=m -CONFIG_DVB_USB_CXUSB=m -CONFIG_DVB_USB_CXUSB_ANALOG=y -CONFIG_DVB_USB_M920X=m -CONFIG_DVB_USB_DIGITV=m -CONFIG_DVB_USB_VP7045=m -CONFIG_DVB_USB_VP702X=m -CONFIG_DVB_USB_GP8PSK=m -CONFIG_DVB_USB_NOVA_T_USB2=m -CONFIG_DVB_USB_TTUSB2=m -CONFIG_DVB_USB_DTT200U=m -CONFIG_DVB_USB_OPERA1=m -CONFIG_DVB_USB_AF9005=m -CONFIG_DVB_USB_AF9005_REMOTE=m -CONFIG_DVB_USB_PCTV452E=m -CONFIG_DVB_USB_DW2102=m -CONFIG_DVB_USB_CINERGY_T2=m -CONFIG_DVB_USB_DTV5100=m -CONFIG_DVB_USB_AZ6027=m -CONFIG_DVB_USB_TECHNISAT_USB2=m -CONFIG_DVB_USB_V2=m -CONFIG_DVB_USB_AF9015=m -CONFIG_DVB_USB_AF9035=m -CONFIG_DVB_USB_ANYSEE=m -CONFIG_DVB_USB_AU6610=m 
-CONFIG_DVB_USB_AZ6007=m -CONFIG_DVB_USB_CE6230=m -CONFIG_DVB_USB_EC168=m -CONFIG_DVB_USB_GL861=m -CONFIG_DVB_USB_LME2510=m -CONFIG_DVB_USB_MXL111SF=m -CONFIG_DVB_USB_RTL28XXU=m -CONFIG_DVB_USB_DVBSKY=m -CONFIG_DVB_USB_ZD1301=m -CONFIG_DVB_TTUSB_BUDGET=m -CONFIG_DVB_TTUSB_DEC=m -CONFIG_SMS_USB_DRV=m -CONFIG_DVB_B2C2_FLEXCOP_USB=m -# CONFIG_DVB_B2C2_FLEXCOP_USB_DEBUG is not set -CONFIG_DVB_AS102=m - -# -# Webcam, TV (analog/digital) USB devices -# -CONFIG_VIDEO_EM28XX=m -CONFIG_VIDEO_EM28XX_V4L2=m -CONFIG_VIDEO_EM28XX_ALSA=m -CONFIG_VIDEO_EM28XX_DVB=m -CONFIG_VIDEO_EM28XX_RC=m - -# -# Software defined radio USB devices -# -CONFIG_USB_AIRSPY=m -CONFIG_USB_HACKRF=m -CONFIG_USB_MSI2500=m - -# -# USB HDMI CEC adapters -# -CONFIG_USB_PULSE8_CEC=m -CONFIG_USB_RAINSHADOW_CEC=m -CONFIG_MEDIA_PCI_SUPPORT=y - -# -# Media capture support -# -CONFIG_VIDEO_MEYE=m -CONFIG_VIDEO_SOLO6X10=m -CONFIG_VIDEO_TW5864=m -CONFIG_VIDEO_TW68=m -CONFIG_VIDEO_TW686X=m - -# -# Media capture/analog TV support -# -CONFIG_VIDEO_IVTV=m -# CONFIG_VIDEO_IVTV_DEPRECATED_IOCTLS is not set -CONFIG_VIDEO_IVTV_ALSA=m -CONFIG_VIDEO_FB_IVTV=m -# CONFIG_VIDEO_FB_IVTV_FORCE_PAT is not set -CONFIG_VIDEO_HEXIUM_GEMINI=m -CONFIG_VIDEO_HEXIUM_ORION=m -CONFIG_VIDEO_MXB=m -CONFIG_VIDEO_DT3155=m - -# -# Media capture/analog/hybrid TV support -# -CONFIG_VIDEO_CX18=m -CONFIG_VIDEO_CX18_ALSA=m -CONFIG_VIDEO_CX23885=m -CONFIG_MEDIA_ALTERA_CI=m -CONFIG_VIDEO_CX25821=m -CONFIG_VIDEO_CX25821_ALSA=m -CONFIG_VIDEO_CX88=m -CONFIG_VIDEO_CX88_ALSA=m -CONFIG_VIDEO_CX88_BLACKBIRD=m -CONFIG_VIDEO_CX88_DVB=m -CONFIG_VIDEO_CX88_ENABLE_VP3054=y -CONFIG_VIDEO_CX88_VP3054=m -CONFIG_VIDEO_CX88_MPEG=m -CONFIG_VIDEO_BT848=m -CONFIG_DVB_BT8XX=m -CONFIG_VIDEO_SAA7134=m -CONFIG_VIDEO_SAA7134_ALSA=m -CONFIG_VIDEO_SAA7134_RC=y -CONFIG_VIDEO_SAA7134_DVB=m -CONFIG_VIDEO_SAA7134_GO7007=m -CONFIG_VIDEO_SAA7164=m - -# -# Media digital TV PCI Adapters -# -CONFIG_DVB_AV7110_IR=y -CONFIG_DVB_AV7110=m -CONFIG_DVB_AV7110_OSD=y -CONFIG_DVB_BUDGET_CORE=m 
-CONFIG_DVB_BUDGET=m -CONFIG_DVB_BUDGET_CI=m -CONFIG_DVB_BUDGET_AV=m -CONFIG_DVB_BUDGET_PATCH=m -CONFIG_DVB_B2C2_FLEXCOP_PCI=m -# CONFIG_DVB_B2C2_FLEXCOP_PCI_DEBUG is not set -CONFIG_DVB_PLUTO2=m -CONFIG_DVB_DM1105=m -CONFIG_DVB_PT1=m -CONFIG_DVB_PT3=m -CONFIG_MANTIS_CORE=m -CONFIG_DVB_MANTIS=m -CONFIG_DVB_HOPPER=m -CONFIG_DVB_NGENE=m -CONFIG_DVB_DDBRIDGE=m -# CONFIG_DVB_DDBRIDGE_MSIENABLE is not set -CONFIG_DVB_SMIPCIE=m -CONFIG_DVB_NETUP_UNIDVB=m -CONFIG_VIDEO_IPU3_CIO2=m -CONFIG_V4L_PLATFORM_DRIVERS=y -CONFIG_VIDEO_CAFE_CCIC=m -CONFIG_VIDEO_CADENCE=y -CONFIG_VIDEO_CADENCE_CSI2RX=m -CONFIG_VIDEO_CADENCE_CSI2TX=m -CONFIG_VIDEO_ASPEED=m -CONFIG_VIDEO_MUX=m -CONFIG_VIDEO_XILINX=m -CONFIG_VIDEO_XILINX_TPG=m -CONFIG_VIDEO_XILINX_VTC=m -CONFIG_V4L_MEM2MEM_DRIVERS=y -CONFIG_VIDEO_MEM2MEM_DEINTERLACE=m -CONFIG_VIDEO_SH_VEU=m -CONFIG_V4L_TEST_DRIVERS=y -CONFIG_VIDEO_VIMC=m -CONFIG_VIDEO_VIVID=m -CONFIG_VIDEO_VIVID_CEC=y -CONFIG_VIDEO_VIVID_MAX_DEVS=64 -CONFIG_VIDEO_VIM2M=m -CONFIG_VIDEO_VICODEC=m -CONFIG_DVB_PLATFORM_DRIVERS=y -CONFIG_CEC_PLATFORM_DRIVERS=y -CONFIG_VIDEO_CROS_EC_CEC=m -CONFIG_CEC_GPIO=m -CONFIG_VIDEO_SECO_CEC=m -CONFIG_VIDEO_SECO_RC=y -CONFIG_SDR_PLATFORM_DRIVERS=y - -# -# Supported MMC/SDIO adapters -# -CONFIG_SMS_SDIO_DRV=m -CONFIG_RADIO_ADAPTERS=y -CONFIG_RADIO_TEA575X=m -CONFIG_RADIO_SI470X=m -CONFIG_USB_SI470X=m -CONFIG_I2C_SI470X=m -CONFIG_RADIO_SI4713=m -CONFIG_USB_SI4713=m -CONFIG_PLATFORM_SI4713=m -CONFIG_I2C_SI4713=m -CONFIG_RADIO_SI476X=m -CONFIG_USB_MR800=m -CONFIG_USB_DSBR=m -CONFIG_RADIO_MAXIRADIO=m -CONFIG_RADIO_SHARK=m -CONFIG_RADIO_SHARK2=m -CONFIG_USB_KEENE=m -CONFIG_USB_RAREMONO=m -CONFIG_USB_MA901=m -CONFIG_RADIO_TEA5764=m -CONFIG_RADIO_SAA7706H=m -CONFIG_RADIO_TEF6862=m -CONFIG_RADIO_WL1273=m - -# -# Texas Instruments WL128x FM driver (ST based) -# -CONFIG_RADIO_WL128X=m -# end of Texas Instruments WL128x FM driver (ST based) - -# -# Supported FireWire (IEEE 1394) Adapters -# -CONFIG_DVB_FIREDTV=m -CONFIG_DVB_FIREDTV_INPUT=y 
-CONFIG_MEDIA_COMMON_OPTIONS=y - -# -# common driver options -# -CONFIG_VIDEO_CX2341X=m -CONFIG_VIDEO_TVEEPROM=m -CONFIG_CYPRESS_FIRMWARE=m -CONFIG_VIDEOBUF2_CORE=m -CONFIG_VIDEOBUF2_V4L2=m -CONFIG_VIDEOBUF2_MEMOPS=m -CONFIG_VIDEOBUF2_DMA_CONTIG=m -CONFIG_VIDEOBUF2_VMALLOC=m -CONFIG_VIDEOBUF2_DMA_SG=m -CONFIG_VIDEOBUF2_DVB=m -CONFIG_DVB_B2C2_FLEXCOP=m -CONFIG_VIDEO_SAA7146=m -CONFIG_VIDEO_SAA7146_VV=m -CONFIG_SMS_SIANO_MDTV=m -CONFIG_SMS_SIANO_RC=y -# CONFIG_SMS_SIANO_DEBUGFS is not set -CONFIG_VIDEO_V4L2_TPG=m - -# -# Media ancillary drivers (tuners, sensors, i2c, spi, frontends) -# -CONFIG_MEDIA_SUBDRV_AUTOSELECT=y -CONFIG_MEDIA_ATTACH=y -CONFIG_VIDEO_IR_I2C=m - -# -# I2C Encoders, decoders, sensors and other helper chips -# - -# -# Audio decoders, processors and mixers -# -CONFIG_VIDEO_TVAUDIO=m -CONFIG_VIDEO_TDA7432=m -CONFIG_VIDEO_TDA9840=m -CONFIG_VIDEO_TDA1997X=m -CONFIG_VIDEO_TEA6415C=m -CONFIG_VIDEO_TEA6420=m -CONFIG_VIDEO_MSP3400=m -CONFIG_VIDEO_CS3308=m -CONFIG_VIDEO_CS5345=m -CONFIG_VIDEO_CS53L32A=m -CONFIG_VIDEO_TLV320AIC23B=m -CONFIG_VIDEO_UDA1342=m -CONFIG_VIDEO_WM8775=m -CONFIG_VIDEO_WM8739=m -CONFIG_VIDEO_VP27SMPX=m -CONFIG_VIDEO_SONY_BTF_MPX=m - -# -# RDS decoders -# -CONFIG_VIDEO_SAA6588=m - -# -# Video decoders -# -CONFIG_VIDEO_ADV7180=m -CONFIG_VIDEO_ADV7183=m -CONFIG_VIDEO_ADV748X=m -CONFIG_VIDEO_ADV7604=m -CONFIG_VIDEO_ADV7604_CEC=y -CONFIG_VIDEO_ADV7842=m -CONFIG_VIDEO_ADV7842_CEC=y -CONFIG_VIDEO_BT819=m -CONFIG_VIDEO_BT856=m -CONFIG_VIDEO_BT866=m -CONFIG_VIDEO_KS0127=m -CONFIG_VIDEO_ML86V7667=m -CONFIG_VIDEO_SAA7110=m -CONFIG_VIDEO_SAA711X=m -CONFIG_VIDEO_TC358743=m -CONFIG_VIDEO_TC358743_CEC=y -CONFIG_VIDEO_TVP514X=m -CONFIG_VIDEO_TVP5150=m -CONFIG_VIDEO_TVP7002=m -CONFIG_VIDEO_TW2804=m -CONFIG_VIDEO_TW9903=m -CONFIG_VIDEO_TW9906=m -CONFIG_VIDEO_TW9910=m -CONFIG_VIDEO_VPX3220=m - -# -# Video and audio decoders -# -CONFIG_VIDEO_SAA717X=m -CONFIG_VIDEO_CX25840=m - -# -# Video encoders -# -CONFIG_VIDEO_SAA7127=m -CONFIG_VIDEO_SAA7185=m 
-CONFIG_VIDEO_ADV7170=m -CONFIG_VIDEO_ADV7175=m -CONFIG_VIDEO_ADV7343=m -CONFIG_VIDEO_ADV7393=m -CONFIG_VIDEO_AD9389B=m -CONFIG_VIDEO_AK881X=m -CONFIG_VIDEO_THS8200=m - -# -# Camera sensor devices -# -CONFIG_VIDEO_APTINA_PLL=m -CONFIG_VIDEO_SMIAPP_PLL=m -CONFIG_VIDEO_HI556=m -CONFIG_VIDEO_IMX214=m -CONFIG_VIDEO_IMX219=m -CONFIG_VIDEO_IMX258=m -CONFIG_VIDEO_IMX274=m -CONFIG_VIDEO_IMX290=m -CONFIG_VIDEO_IMX319=m -CONFIG_VIDEO_IMX355=m -CONFIG_VIDEO_OV2640=m -CONFIG_VIDEO_OV2659=m -CONFIG_VIDEO_OV2680=m -CONFIG_VIDEO_OV2685=m -CONFIG_VIDEO_OV5640=m -CONFIG_VIDEO_OV5645=m -CONFIG_VIDEO_OV5647=m -CONFIG_VIDEO_OV6650=m -CONFIG_VIDEO_OV5670=m -CONFIG_VIDEO_OV5675=m -CONFIG_VIDEO_OV5695=m -CONFIG_VIDEO_OV7251=m -CONFIG_VIDEO_OV772X=m -CONFIG_VIDEO_OV7640=m -CONFIG_VIDEO_OV7670=m -CONFIG_VIDEO_OV7740=m -CONFIG_VIDEO_OV8856=m -CONFIG_VIDEO_OV9640=m -CONFIG_VIDEO_OV9650=m -CONFIG_VIDEO_OV13858=m -CONFIG_VIDEO_VS6624=m -CONFIG_VIDEO_MT9M001=m -CONFIG_VIDEO_MT9M032=m -CONFIG_VIDEO_MT9M111=m -CONFIG_VIDEO_MT9P031=m -CONFIG_VIDEO_MT9T001=m -CONFIG_VIDEO_MT9T112=m -CONFIG_VIDEO_MT9V011=m -CONFIG_VIDEO_MT9V032=m -CONFIG_VIDEO_MT9V111=m -CONFIG_VIDEO_SR030PC30=m -CONFIG_VIDEO_NOON010PC30=m -CONFIG_VIDEO_M5MOLS=m -CONFIG_VIDEO_RJ54N1=m -CONFIG_VIDEO_S5K6AA=m -CONFIG_VIDEO_S5K6A3=m -CONFIG_VIDEO_S5K4ECGX=m -CONFIG_VIDEO_S5K5BAF=m -CONFIG_VIDEO_SMIAPP=m -CONFIG_VIDEO_ET8EK8=m -CONFIG_VIDEO_S5C73M3=m - -# -# Lens drivers -# -CONFIG_VIDEO_AD5820=m -CONFIG_VIDEO_AK7375=m -CONFIG_VIDEO_DW9714=m -CONFIG_VIDEO_DW9807_VCM=m - -# -# Flash devices -# -CONFIG_VIDEO_ADP1653=m -CONFIG_VIDEO_LM3560=m -CONFIG_VIDEO_LM3646=m - -# -# Video improvement chips -# -CONFIG_VIDEO_UPD64031A=m -CONFIG_VIDEO_UPD64083=m - -# -# Audio/Video compression chips -# -CONFIG_VIDEO_SAA6752HS=m - -# -# SDR tuner chips -# -CONFIG_SDR_MAX2175=m - -# -# Miscellaneous helper chips -# -CONFIG_VIDEO_THS7303=m -CONFIG_VIDEO_M52790=m -CONFIG_VIDEO_I2C=m -CONFIG_VIDEO_ST_MIPID02=m -# end of I2C Encoders, decoders, sensors and 
other helper chips - -# -# SPI helper chips -# -CONFIG_VIDEO_GS1662=m -# end of SPI helper chips - -# -# Media SPI Adapters -# -CONFIG_CXD2880_SPI_DRV=m -# end of Media SPI Adapters - -CONFIG_MEDIA_TUNER=m - -# -# Customize TV tuners -# -CONFIG_MEDIA_TUNER_SIMPLE=m -CONFIG_MEDIA_TUNER_TDA18250=m -CONFIG_MEDIA_TUNER_TDA8290=m -CONFIG_MEDIA_TUNER_TDA827X=m -CONFIG_MEDIA_TUNER_TDA18271=m -CONFIG_MEDIA_TUNER_TDA9887=m -CONFIG_MEDIA_TUNER_TEA5761=m -CONFIG_MEDIA_TUNER_TEA5767=m -CONFIG_MEDIA_TUNER_MSI001=m -CONFIG_MEDIA_TUNER_MT20XX=m -CONFIG_MEDIA_TUNER_MT2060=m -CONFIG_MEDIA_TUNER_MT2063=m -CONFIG_MEDIA_TUNER_MT2266=m -CONFIG_MEDIA_TUNER_MT2131=m -CONFIG_MEDIA_TUNER_QT1010=m -CONFIG_MEDIA_TUNER_XC2028=m -CONFIG_MEDIA_TUNER_XC5000=m -CONFIG_MEDIA_TUNER_XC4000=m -CONFIG_MEDIA_TUNER_MXL5005S=m -CONFIG_MEDIA_TUNER_MXL5007T=m -CONFIG_MEDIA_TUNER_MC44S803=m -CONFIG_MEDIA_TUNER_MAX2165=m -CONFIG_MEDIA_TUNER_TDA18218=m -CONFIG_MEDIA_TUNER_FC0011=m -CONFIG_MEDIA_TUNER_FC0012=m -CONFIG_MEDIA_TUNER_FC0013=m -CONFIG_MEDIA_TUNER_TDA18212=m -CONFIG_MEDIA_TUNER_E4000=m -CONFIG_MEDIA_TUNER_FC2580=m -CONFIG_MEDIA_TUNER_M88RS6000T=m -CONFIG_MEDIA_TUNER_TUA9001=m -CONFIG_MEDIA_TUNER_SI2157=m -CONFIG_MEDIA_TUNER_IT913X=m -CONFIG_MEDIA_TUNER_R820T=m -CONFIG_MEDIA_TUNER_MXL301RF=m -CONFIG_MEDIA_TUNER_QM1D1C0042=m -CONFIG_MEDIA_TUNER_QM1D1B0004=m -# end of Customize TV tuners - -# -# Customise DVB Frontends -# - -# -# Multistandard (satellite) frontends -# -CONFIG_DVB_STB0899=m -CONFIG_DVB_STB6100=m -CONFIG_DVB_STV090x=m -CONFIG_DVB_STV0910=m -CONFIG_DVB_STV6110x=m -CONFIG_DVB_STV6111=m -CONFIG_DVB_MXL5XX=m -CONFIG_DVB_M88DS3103=m - -# -# Multistandard (cable + terrestrial) frontends -# -CONFIG_DVB_DRXK=m -CONFIG_DVB_TDA18271C2DD=m -CONFIG_DVB_SI2165=m -CONFIG_DVB_MN88472=m -CONFIG_DVB_MN88473=m - -# -# DVB-S (satellite) frontends -# -CONFIG_DVB_CX24110=m -CONFIG_DVB_CX24123=m -CONFIG_DVB_MT312=m -CONFIG_DVB_ZL10036=m -CONFIG_DVB_ZL10039=m -CONFIG_DVB_S5H1420=m -CONFIG_DVB_STV0288=m 
-CONFIG_DVB_STB6000=m -CONFIG_DVB_STV0299=m -CONFIG_DVB_STV6110=m -CONFIG_DVB_STV0900=m -CONFIG_DVB_TDA8083=m -CONFIG_DVB_TDA10086=m -CONFIG_DVB_TDA8261=m -CONFIG_DVB_VES1X93=m -CONFIG_DVB_TUNER_ITD1000=m -CONFIG_DVB_TUNER_CX24113=m -CONFIG_DVB_TDA826X=m -CONFIG_DVB_TUA6100=m -CONFIG_DVB_CX24116=m -CONFIG_DVB_CX24117=m -CONFIG_DVB_CX24120=m -CONFIG_DVB_SI21XX=m -CONFIG_DVB_TS2020=m -CONFIG_DVB_DS3000=m -CONFIG_DVB_MB86A16=m -CONFIG_DVB_TDA10071=m - -# -# DVB-T (terrestrial) frontends -# -CONFIG_DVB_SP8870=m -CONFIG_DVB_SP887X=m -CONFIG_DVB_CX22700=m -CONFIG_DVB_CX22702=m -CONFIG_DVB_S5H1432=m -CONFIG_DVB_DRXD=m -CONFIG_DVB_L64781=m -CONFIG_DVB_TDA1004X=m -CONFIG_DVB_NXT6000=m -CONFIG_DVB_MT352=m -CONFIG_DVB_ZL10353=m -CONFIG_DVB_DIB3000MB=m -CONFIG_DVB_DIB3000MC=m -CONFIG_DVB_DIB7000M=m -CONFIG_DVB_DIB7000P=m -CONFIG_DVB_DIB9000=m -CONFIG_DVB_TDA10048=m -CONFIG_DVB_AF9013=m -CONFIG_DVB_EC100=m -CONFIG_DVB_STV0367=m -CONFIG_DVB_CXD2820R=m -CONFIG_DVB_CXD2841ER=m -CONFIG_DVB_RTL2830=m -CONFIG_DVB_RTL2832=m -CONFIG_DVB_RTL2832_SDR=m -CONFIG_DVB_SI2168=m -CONFIG_DVB_AS102_FE=m -CONFIG_DVB_ZD1301_DEMOD=m -CONFIG_DVB_GP8PSK_FE=m -CONFIG_DVB_CXD2880=m - -# -# DVB-C (cable) frontends -# -CONFIG_DVB_VES1820=m -CONFIG_DVB_TDA10021=m -CONFIG_DVB_TDA10023=m -CONFIG_DVB_STV0297=m - -# -# ATSC (North American/Korean Terrestrial/Cable DTV) frontends -# -CONFIG_DVB_NXT200X=m -CONFIG_DVB_OR51211=m -CONFIG_DVB_OR51132=m -CONFIG_DVB_BCM3510=m -CONFIG_DVB_LGDT330X=m -CONFIG_DVB_LGDT3305=m -CONFIG_DVB_LGDT3306A=m -CONFIG_DVB_LG2160=m -CONFIG_DVB_S5H1409=m -CONFIG_DVB_AU8522=m -CONFIG_DVB_AU8522_DTV=m -CONFIG_DVB_AU8522_V4L=m -CONFIG_DVB_S5H1411=m - -# -# ISDB-T (terrestrial) frontends -# -CONFIG_DVB_S921=m -CONFIG_DVB_DIB8000=m -CONFIG_DVB_MB86A20S=m - -# -# ISDB-S (satellite) & ISDB-T (terrestrial) frontends -# -CONFIG_DVB_TC90522=m -CONFIG_DVB_MN88443X=m - -# -# Digital terrestrial only tuners/PLL -# -CONFIG_DVB_PLL=m -CONFIG_DVB_TUNER_DIB0070=m -CONFIG_DVB_TUNER_DIB0090=m - -# -# 
SEC control devices for DVB-S -# -CONFIG_DVB_DRX39XYJ=m -CONFIG_DVB_LNBH25=m -CONFIG_DVB_LNBH29=m -CONFIG_DVB_LNBP21=m -CONFIG_DVB_LNBP22=m -CONFIG_DVB_ISL6405=m -CONFIG_DVB_ISL6421=m -CONFIG_DVB_ISL6423=m -CONFIG_DVB_A8293=m -CONFIG_DVB_LGS8GL5=m -CONFIG_DVB_LGS8GXX=m -CONFIG_DVB_ATBM8830=m -CONFIG_DVB_TDA665x=m -CONFIG_DVB_IX2505V=m -CONFIG_DVB_M88RS2000=m -CONFIG_DVB_AF9033=m -CONFIG_DVB_HORUS3A=m -CONFIG_DVB_ASCOT2E=m -CONFIG_DVB_HELENE=m - -# -# Common Interface (EN50221) controller drivers -# -CONFIG_DVB_CXD2099=m -CONFIG_DVB_SP2=m - -# -# Tools to develop new frontends -# -CONFIG_DVB_DUMMY_FE=m -# end of Customise DVB Frontends - -# -# Graphics support -# -CONFIG_AGP=m -CONFIG_AGP_AMD64=m -CONFIG_AGP_INTEL=m -CONFIG_AGP_SIS=m -CONFIG_AGP_VIA=m -CONFIG_INTEL_GTT=m -CONFIG_VGA_ARB=y -CONFIG_VGA_ARB_MAX_GPUS=10 -CONFIG_VGA_SWITCHEROO=y -CONFIG_DRM=m -CONFIG_DRM_MIPI_DBI=m -CONFIG_DRM_MIPI_DSI=y -CONFIG_DRM_DP_AUX_CHARDEV=y -# CONFIG_DRM_DEBUG_SELFTEST is not set -CONFIG_DRM_KMS_HELPER=m -CONFIG_DRM_KMS_FB_HELPER=y -# CONFIG_DRM_DEBUG_DP_MST_TOPOLOGY_REFS is not set -CONFIG_DRM_FBDEV_EMULATION=y -CONFIG_DRM_FBDEV_OVERALLOC=100 -# CONFIG_DRM_FBDEV_LEAK_PHYS_SMEM is not set -CONFIG_DRM_LOAD_EDID_FIRMWARE=y -CONFIG_DRM_DP_CEC=y -CONFIG_DRM_TTM=m -CONFIG_DRM_TTM_DMA_PAGE_POOL=y -CONFIG_DRM_VRAM_HELPER=m -CONFIG_DRM_TTM_HELPER=m -CONFIG_DRM_GEM_CMA_HELPER=y -CONFIG_DRM_KMS_CMA_HELPER=y -CONFIG_DRM_GEM_SHMEM_HELPER=y -CONFIG_DRM_SCHED=m - -# -# I2C encoder or helper chips -# -CONFIG_DRM_I2C_CH7006=m -CONFIG_DRM_I2C_SIL164=m -CONFIG_DRM_I2C_NXP_TDA998X=m -CONFIG_DRM_I2C_NXP_TDA9950=m -# end of I2C encoder or helper chips - -# -# ARM devices -# -CONFIG_DRM_KOMEDA=m -# end of ARM devices - -CONFIG_DRM_RADEON=m -CONFIG_DRM_RADEON_USERPTR=y -CONFIG_DRM_AMDGPU=m -CONFIG_DRM_AMDGPU_SI=y -CONFIG_DRM_AMDGPU_CIK=y -CONFIG_DRM_AMDGPU_USERPTR=y -# CONFIG_DRM_AMDGPU_GART_DEBUGFS is not set - -# -# ACP (Audio CoProcessor) Configuration -# -CONFIG_DRM_AMD_ACP=y -# end of ACP (Audio 
CoProcessor) Configuration - -# -# Display Engine Configuration -# -CONFIG_DRM_AMD_DC=y -CONFIG_DRM_AMD_DC_DCN=y -CONFIG_DRM_AMD_DC_HDCP=y -# CONFIG_DEBUG_KERNEL_DC is not set -# end of Display Engine Configuration - -CONFIG_HSA_AMD=y -CONFIG_DRM_NOUVEAU=m -# CONFIG_NOUVEAU_LEGACY_CTX_SUPPORT is not set -CONFIG_NOUVEAU_DEBUG=5 -CONFIG_NOUVEAU_DEBUG_DEFAULT=3 -# CONFIG_NOUVEAU_DEBUG_MMU is not set -CONFIG_DRM_NOUVEAU_BACKLIGHT=y -CONFIG_DRM_NOUVEAU_SVM=y -CONFIG_DRM_I915=m -CONFIG_DRM_I915_FORCE_PROBE="*" -CONFIG_DRM_I915_CAPTURE_ERROR=y -CONFIG_DRM_I915_COMPRESS_ERROR=y -CONFIG_DRM_I915_USERPTR=y -CONFIG_DRM_I915_GVT=y -CONFIG_DRM_I915_GVT_KVMGT=m - -# -# drm/i915 Debugging -# -# CONFIG_DRM_I915_WERROR is not set -# CONFIG_DRM_I915_DEBUG is not set -# CONFIG_DRM_I915_DEBUG_MMIO is not set -# CONFIG_DRM_I915_SW_FENCE_DEBUG_OBJECTS is not set -# CONFIG_DRM_I915_SW_FENCE_CHECK_DAG is not set -# CONFIG_DRM_I915_DEBUG_GUC is not set -# CONFIG_DRM_I915_SELFTEST is not set -# CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS is not set -# CONFIG_DRM_I915_DEBUG_VBLANK_EVADE is not set -# CONFIG_DRM_I915_DEBUG_RUNTIME_PM is not set -# end of drm/i915 Debugging - -# -# drm/i915 Profile Guided Optimisation -# -CONFIG_DRM_I915_USERFAULT_AUTOSUSPEND=250 -CONFIG_DRM_I915_HEARTBEAT_INTERVAL=2500 -CONFIG_DRM_I915_PREEMPT_TIMEOUT=640 -CONFIG_DRM_I915_MAX_REQUEST_BUSYWAIT=8000 -CONFIG_DRM_I915_STOP_TIMEOUT=100 -CONFIG_DRM_I915_TIMESLICE_DURATION=1 -# end of drm/i915 Profile Guided Optimisation - -CONFIG_DRM_VGEM=m -CONFIG_DRM_VKMS=m -CONFIG_DRM_VMWGFX=m -CONFIG_DRM_VMWGFX_FBCON=y -CONFIG_DRM_GMA500=m -CONFIG_DRM_GMA600=y -CONFIG_DRM_GMA3600=y -CONFIG_DRM_UDL=m -CONFIG_DRM_AST=m -CONFIG_DRM_MGAG200=m -CONFIG_DRM_CIRRUS_QEMU=m -CONFIG_DRM_RCAR_DW_HDMI=m -CONFIG_DRM_RCAR_LVDS=m -CONFIG_DRM_QXL=m -CONFIG_DRM_BOCHS=m -CONFIG_DRM_VIRTIO_GPU=m -CONFIG_DRM_PANEL=y - -# -# Display Panels -# -CONFIG_DRM_PANEL_ARM_VERSATILE=m -CONFIG_DRM_PANEL_BOE_HIMAX8279D=m -CONFIG_DRM_PANEL_BOE_TV101WUM_NL6=m 
-CONFIG_DRM_PANEL_LVDS=m -CONFIG_DRM_PANEL_SIMPLE=m -CONFIG_DRM_PANEL_ELIDA_KD35T133=m -CONFIG_DRM_PANEL_FEIXIN_K101_IM2BA02=m -CONFIG_DRM_PANEL_FEIYANG_FY07024DI26A30D=m -CONFIG_DRM_PANEL_ILITEK_IL9322=m -CONFIG_DRM_PANEL_ILITEK_ILI9881C=m -CONFIG_DRM_PANEL_INNOLUX_P079ZCA=m -CONFIG_DRM_PANEL_JDI_LT070ME05000=m -CONFIG_DRM_PANEL_KINGDISPLAY_KD097D04=m -CONFIG_DRM_PANEL_LEADTEK_LTK500HD1829=m -CONFIG_DRM_PANEL_SAMSUNG_LD9040=m -CONFIG_DRM_PANEL_LG_LB035Q02=m -CONFIG_DRM_PANEL_LG_LG4573=m -CONFIG_DRM_PANEL_NEC_NL8048HL11=m -CONFIG_DRM_PANEL_NOVATEK_NT35510=m -CONFIG_DRM_PANEL_NOVATEK_NT39016=m -CONFIG_DRM_PANEL_OLIMEX_LCD_OLINUXINO=m -CONFIG_DRM_PANEL_ORISETECH_OTM8009A=m -CONFIG_DRM_PANEL_OSD_OSD101T2587_53TS=m -CONFIG_DRM_PANEL_PANASONIC_VVX10F034N00=m -CONFIG_DRM_PANEL_RASPBERRYPI_TOUCHSCREEN=m -CONFIG_DRM_PANEL_RAYDIUM_RM67191=m -CONFIG_DRM_PANEL_RAYDIUM_RM68200=m -CONFIG_DRM_PANEL_ROCKTECH_JH057N00900=m -CONFIG_DRM_PANEL_RONBO_RB070D30=m -CONFIG_DRM_PANEL_SAMSUNG_S6D16D0=m -CONFIG_DRM_PANEL_SAMSUNG_S6E3HA2=m -CONFIG_DRM_PANEL_SAMSUNG_S6E63J0X03=m -CONFIG_DRM_PANEL_SAMSUNG_S6E63M0=m -CONFIG_DRM_PANEL_SAMSUNG_S6E88A0_AMS452EF01=m -CONFIG_DRM_PANEL_SAMSUNG_S6E8AA0=m -CONFIG_DRM_PANEL_SEIKO_43WVF1G=m -CONFIG_DRM_PANEL_SHARP_LQ101R1SX01=m -CONFIG_DRM_PANEL_SHARP_LS037V7DW01=m -CONFIG_DRM_PANEL_SHARP_LS043T1LE01=m -CONFIG_DRM_PANEL_SITRONIX_ST7701=m -CONFIG_DRM_PANEL_SITRONIX_ST7789V=m -CONFIG_DRM_PANEL_SONY_ACX424AKP=m -CONFIG_DRM_PANEL_SONY_ACX565AKM=m -CONFIG_DRM_PANEL_TPO_TD028TTEC1=m -CONFIG_DRM_PANEL_TPO_TD043MTEA1=m -CONFIG_DRM_PANEL_TPO_TPG110=m -CONFIG_DRM_PANEL_TRULY_NT35597_WQXGA=m -CONFIG_DRM_PANEL_XINPENG_XPP055C272=m -# end of Display Panels - -CONFIG_DRM_BRIDGE=y -CONFIG_DRM_PANEL_BRIDGE=y - -# -# Display Interface Bridges -# -CONFIG_DRM_CDNS_DSI=m -CONFIG_DRM_DISPLAY_CONNECTOR=m -CONFIG_DRM_LVDS_CODEC=m -CONFIG_DRM_MEGACHIPS_STDPXXXX_GE_B850V3_FW=m -CONFIG_DRM_NXP_PTN3460=m -CONFIG_DRM_PARADE_PS8622=m -CONFIG_DRM_PARADE_PS8640=m 
-CONFIG_DRM_SIL_SII8620=m -CONFIG_DRM_SII902X=m -CONFIG_DRM_SII9234=m -CONFIG_DRM_SIMPLE_BRIDGE=m -CONFIG_DRM_THINE_THC63LVD1024=m -CONFIG_DRM_TOSHIBA_TC358764=m -CONFIG_DRM_TOSHIBA_TC358767=m -CONFIG_DRM_TOSHIBA_TC358768=m -CONFIG_DRM_TI_TFP410=m -CONFIG_DRM_TI_SN65DSI86=m -CONFIG_DRM_TI_TPD12S015=m -CONFIG_DRM_ANALOGIX_ANX6345=m -CONFIG_DRM_ANALOGIX_ANX78XX=m -CONFIG_DRM_ANALOGIX_DP=m -CONFIG_DRM_I2C_ADV7511=m -CONFIG_DRM_I2C_ADV7511_AUDIO=y -CONFIG_DRM_I2C_ADV7511_CEC=y -CONFIG_DRM_DW_HDMI=m -CONFIG_DRM_DW_HDMI_AHB_AUDIO=m -CONFIG_DRM_DW_HDMI_I2S_AUDIO=m -CONFIG_DRM_DW_HDMI_CEC=m -# end of Display Interface Bridges - -# CONFIG_DRM_ETNAVIV is not set -CONFIG_DRM_ARCPGU=m -CONFIG_DRM_MXS=y -CONFIG_DRM_MXSFB=m -CONFIG_DRM_GM12U320=m -CONFIG_TINYDRM_HX8357D=m -CONFIG_TINYDRM_ILI9225=m -CONFIG_TINYDRM_ILI9341=m -CONFIG_TINYDRM_ILI9486=m -CONFIG_TINYDRM_MI0283QT=m -CONFIG_TINYDRM_REPAPER=m -CONFIG_TINYDRM_ST7586=m -CONFIG_TINYDRM_ST7735R=m -CONFIG_DRM_XEN=y -CONFIG_DRM_XEN_FRONTEND=m -CONFIG_DRM_VBOXVIDEO=m -# CONFIG_DRM_LEGACY is not set -CONFIG_DRM_PANEL_ORIENTATION_QUIRKS=y - -# -# Frame buffer Devices -# -CONFIG_FB_CMDLINE=y -CONFIG_FB_NOTIFY=y -CONFIG_FB=y -CONFIG_FIRMWARE_EDID=y -CONFIG_FB_BOOT_VESA_SUPPORT=y -CONFIG_FB_CFB_FILLRECT=y -CONFIG_FB_CFB_COPYAREA=y -CONFIG_FB_CFB_IMAGEBLIT=y -CONFIG_FB_SYS_FILLRECT=m -CONFIG_FB_SYS_COPYAREA=m -CONFIG_FB_SYS_IMAGEBLIT=m -# CONFIG_FB_FOREIGN_ENDIAN is not set -CONFIG_FB_SYS_FOPS=m -CONFIG_FB_DEFERRED_IO=y -CONFIG_FB_BACKLIGHT=m -CONFIG_FB_MODE_HELPERS=y -CONFIG_FB_TILEBLITTING=y - -# -# Frame buffer hardware drivers -# -# CONFIG_FB_CIRRUS is not set -# CONFIG_FB_PM2 is not set -# CONFIG_FB_CYBER2000 is not set -# CONFIG_FB_ARC is not set -# CONFIG_FB_ASILIANT is not set -# CONFIG_FB_IMSTT is not set -# CONFIG_FB_VGA16 is not set -# CONFIG_FB_UVESA is not set -CONFIG_FB_VESA=y -CONFIG_FB_EFI=y -# CONFIG_FB_N411 is not set -# CONFIG_FB_HGA is not set -# CONFIG_FB_OPENCORES is not set -# CONFIG_FB_S1D13XXX is not set -# 
CONFIG_FB_NVIDIA is not set -# CONFIG_FB_RIVA is not set -# CONFIG_FB_I740 is not set -# CONFIG_FB_LE80578 is not set -# CONFIG_FB_INTEL is not set -# CONFIG_FB_MATROX is not set -# CONFIG_FB_RADEON is not set -# CONFIG_FB_ATY128 is not set -# CONFIG_FB_ATY is not set -# CONFIG_FB_S3 is not set -# CONFIG_FB_SAVAGE is not set -# CONFIG_FB_SIS is not set -# CONFIG_FB_VIA is not set -# CONFIG_FB_NEOMAGIC is not set -# CONFIG_FB_KYRO is not set -# CONFIG_FB_3DFX is not set -# CONFIG_FB_VOODOO1 is not set -# CONFIG_FB_VT8623 is not set -# CONFIG_FB_TRIDENT is not set -# CONFIG_FB_ARK is not set -# CONFIG_FB_PM3 is not set -# CONFIG_FB_CARMINE is not set -# CONFIG_FB_SM501 is not set -# CONFIG_FB_SMSCUFX is not set -# CONFIG_FB_UDL is not set -# CONFIG_FB_IBM_GXT4500 is not set -# CONFIG_FB_VIRTUAL is not set -CONFIG_XEN_FBDEV_FRONTEND=m -# CONFIG_FB_METRONOME is not set -# CONFIG_FB_MB862XX is not set -CONFIG_FB_HYPERV=m -CONFIG_FB_SIMPLE=y -# CONFIG_FB_SSD1307 is not set -# CONFIG_FB_SM712 is not set -# end of Frame buffer Devices - -# -# Backlight & LCD device support -# -CONFIG_LCD_CLASS_DEVICE=m -CONFIG_LCD_L4F00242T03=m -CONFIG_LCD_LMS283GF05=m -CONFIG_LCD_LTV350QV=m -CONFIG_LCD_ILI922X=m -CONFIG_LCD_ILI9320=m -CONFIG_LCD_TDO24M=m -CONFIG_LCD_VGG2432A4=m -CONFIG_LCD_PLATFORM=m -CONFIG_LCD_AMS369FG06=m -CONFIG_LCD_LMS501KF03=m -CONFIG_LCD_HX8357=m -CONFIG_LCD_OTM3225A=m -CONFIG_BACKLIGHT_CLASS_DEVICE=y -CONFIG_BACKLIGHT_GENERIC=m -CONFIG_BACKLIGHT_LM3533=m -CONFIG_BACKLIGHT_PWM=m -CONFIG_BACKLIGHT_DA903X=m -CONFIG_BACKLIGHT_DA9052=m -CONFIG_BACKLIGHT_MAX8925=m -CONFIG_BACKLIGHT_APPLE=m -CONFIG_BACKLIGHT_QCOM_WLED=m -CONFIG_BACKLIGHT_SAHARA=m -CONFIG_BACKLIGHT_WM831X=m -CONFIG_BACKLIGHT_ADP5520=m -CONFIG_BACKLIGHT_ADP8860=m -CONFIG_BACKLIGHT_ADP8870=m -CONFIG_BACKLIGHT_88PM860X=m -CONFIG_BACKLIGHT_PCF50633=m -CONFIG_BACKLIGHT_AAT2870=m -CONFIG_BACKLIGHT_LM3630A=m -CONFIG_BACKLIGHT_LM3639=m -CONFIG_BACKLIGHT_LP855X=m -CONFIG_BACKLIGHT_LP8788=m 
-CONFIG_BACKLIGHT_PANDORA=m -CONFIG_BACKLIGHT_SKY81452=m -CONFIG_BACKLIGHT_TPS65217=m -CONFIG_BACKLIGHT_AS3711=m -CONFIG_BACKLIGHT_GPIO=m -CONFIG_BACKLIGHT_LV5207LP=m -CONFIG_BACKLIGHT_BD6107=m -CONFIG_BACKLIGHT_ARCXCNN=m -CONFIG_BACKLIGHT_RAVE_SP=m -CONFIG_BACKLIGHT_LED=m -# end of Backlight & LCD device support - -CONFIG_VIDEOMODE_HELPERS=y -CONFIG_HDMI=y - -# -# Console display driver support -# -CONFIG_VGA_CONSOLE=y -CONFIG_VGACON_SOFT_SCROLLBACK=y -CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64 -# CONFIG_VGACON_SOFT_SCROLLBACK_PERSISTENT_ENABLE_BY_DEFAULT is not set -CONFIG_DUMMY_CONSOLE=y -CONFIG_DUMMY_CONSOLE_COLUMNS=80 -CONFIG_DUMMY_CONSOLE_ROWS=25 -CONFIG_FRAMEBUFFER_CONSOLE=y -CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y -CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y -CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER=y -# end of Console display driver support - -# CONFIG_LOGO is not set -# end of Graphics support - -CONFIG_SOUND=m -CONFIG_SOUND_OSS_CORE=y -# CONFIG_SOUND_OSS_CORE_PRECLAIM is not set -CONFIG_SND=m -CONFIG_SND_TIMER=m -CONFIG_SND_PCM=m -CONFIG_SND_PCM_ELD=y -CONFIG_SND_PCM_IEC958=y -CONFIG_SND_DMAENGINE_PCM=m -CONFIG_SND_HWDEP=m -CONFIG_SND_SEQ_DEVICE=m -CONFIG_SND_RAWMIDI=m -CONFIG_SND_COMPRESS_OFFLOAD=m -CONFIG_SND_JACK=y -CONFIG_SND_JACK_INPUT_DEV=y -CONFIG_SND_OSSEMUL=y -CONFIG_SND_MIXER_OSS=m -CONFIG_SND_PCM_OSS=m -CONFIG_SND_PCM_OSS_PLUGINS=y -CONFIG_SND_PCM_TIMER=y -CONFIG_SND_HRTIMER=m -CONFIG_SND_DYNAMIC_MINORS=y -CONFIG_SND_MAX_CARDS=32 -# CONFIG_SND_SUPPORT_OLD_API is not set -CONFIG_SND_PROC_FS=y -CONFIG_SND_VERBOSE_PROCFS=y -CONFIG_SND_VERBOSE_PRINTK=y -CONFIG_SND_DEBUG=y -# CONFIG_SND_DEBUG_VERBOSE is not set -# CONFIG_SND_PCM_XRUN_DEBUG is not set -# CONFIG_SND_CTL_VALIDATION is not set -CONFIG_SND_VMASTER=y -CONFIG_SND_DMA_SGBUF=y -CONFIG_SND_SEQUENCER=m -CONFIG_SND_SEQ_DUMMY=m -CONFIG_SND_SEQUENCER_OSS=m -CONFIG_SND_SEQ_HRTIMER_DEFAULT=y -CONFIG_SND_SEQ_MIDI_EVENT=m -CONFIG_SND_SEQ_MIDI=m -CONFIG_SND_SEQ_MIDI_EMUL=m -CONFIG_SND_SEQ_VIRMIDI=m 
-CONFIG_SND_MPU401_UART=m -CONFIG_SND_OPL3_LIB=m -CONFIG_SND_OPL3_LIB_SEQ=m -CONFIG_SND_VX_LIB=m -CONFIG_SND_AC97_CODEC=m -CONFIG_SND_DRIVERS=y -# CONFIG_SND_PCSP is not set -CONFIG_SND_DUMMY=m -CONFIG_SND_ALOOP=m -CONFIG_SND_VIRMIDI=m -CONFIG_SND_MTPAV=m -CONFIG_SND_MTS64=m -CONFIG_SND_SERIAL_U16550=m -CONFIG_SND_MPU401=m -CONFIG_SND_PORTMAN2X4=m -CONFIG_SND_AC97_POWER_SAVE=y -CONFIG_SND_AC97_POWER_SAVE_DEFAULT=0 -CONFIG_SND_SB_COMMON=m -CONFIG_SND_PCI=y -CONFIG_SND_AD1889=m -CONFIG_SND_ALS300=m -CONFIG_SND_ALS4000=m -CONFIG_SND_ALI5451=m -CONFIG_SND_ASIHPI=m -CONFIG_SND_ATIIXP=m -CONFIG_SND_ATIIXP_MODEM=m -CONFIG_SND_AU8810=m -CONFIG_SND_AU8820=m -CONFIG_SND_AU8830=m -CONFIG_SND_AW2=m -CONFIG_SND_AZT3328=m -CONFIG_SND_BT87X=m -# CONFIG_SND_BT87X_OVERCLOCK is not set -CONFIG_SND_CA0106=m -CONFIG_SND_CMIPCI=m -CONFIG_SND_OXYGEN_LIB=m -CONFIG_SND_OXYGEN=m -CONFIG_SND_CS4281=m -CONFIG_SND_CS46XX=m -CONFIG_SND_CS46XX_NEW_DSP=y -CONFIG_SND_CTXFI=m -CONFIG_SND_DARLA20=m -CONFIG_SND_GINA20=m -CONFIG_SND_LAYLA20=m -CONFIG_SND_DARLA24=m -CONFIG_SND_GINA24=m -CONFIG_SND_LAYLA24=m -CONFIG_SND_MONA=m -CONFIG_SND_MIA=m -CONFIG_SND_ECHO3G=m -CONFIG_SND_INDIGO=m -CONFIG_SND_INDIGOIO=m -CONFIG_SND_INDIGODJ=m -CONFIG_SND_INDIGOIOX=m -CONFIG_SND_INDIGODJX=m -CONFIG_SND_EMU10K1=m -CONFIG_SND_EMU10K1_SEQ=m -CONFIG_SND_EMU10K1X=m -CONFIG_SND_ENS1370=m -CONFIG_SND_ENS1371=m -CONFIG_SND_ES1938=m -CONFIG_SND_ES1968=m -CONFIG_SND_ES1968_INPUT=y -CONFIG_SND_ES1968_RADIO=y -CONFIG_SND_FM801=m -CONFIG_SND_FM801_TEA575X_BOOL=y -CONFIG_SND_HDSP=m -CONFIG_SND_HDSPM=m -CONFIG_SND_ICE1712=m -CONFIG_SND_ICE1724=m -CONFIG_SND_INTEL8X0=m -CONFIG_SND_INTEL8X0M=m -CONFIG_SND_KORG1212=m -CONFIG_SND_LOLA=m -CONFIG_SND_LX6464ES=m -CONFIG_SND_MAESTRO3=m -CONFIG_SND_MAESTRO3_INPUT=y -CONFIG_SND_MIXART=m -CONFIG_SND_NM256=m -CONFIG_SND_PCXHR=m -CONFIG_SND_RIPTIDE=m -CONFIG_SND_RME32=m -CONFIG_SND_RME96=m -CONFIG_SND_RME9652=m -CONFIG_SND_SONICVIBES=m -CONFIG_SND_TRIDENT=m -CONFIG_SND_VIA82XX=m 
-CONFIG_SND_VIA82XX_MODEM=m -CONFIG_SND_VIRTUOSO=m -CONFIG_SND_VX222=m -CONFIG_SND_YMFPCI=m - -# -# HD-Audio -# -CONFIG_SND_HDA=m -CONFIG_SND_HDA_INTEL=m -CONFIG_SND_HDA_HWDEP=y -CONFIG_SND_HDA_RECONFIG=y -CONFIG_SND_HDA_INPUT_BEEP=y -CONFIG_SND_HDA_INPUT_BEEP_MODE=1 -CONFIG_SND_HDA_PATCH_LOADER=y -CONFIG_SND_HDA_CODEC_REALTEK=m -CONFIG_SND_HDA_CODEC_ANALOG=m -CONFIG_SND_HDA_CODEC_SIGMATEL=m -CONFIG_SND_HDA_CODEC_VIA=m -CONFIG_SND_HDA_CODEC_HDMI=m -CONFIG_SND_HDA_CODEC_CIRRUS=m -CONFIG_SND_HDA_CODEC_CONEXANT=m -CONFIG_SND_HDA_CODEC_CA0110=m -CONFIG_SND_HDA_CODEC_CA0132=m -CONFIG_SND_HDA_CODEC_CA0132_DSP=y -CONFIG_SND_HDA_CODEC_CMEDIA=m -CONFIG_SND_HDA_CODEC_SI3054=m -CONFIG_SND_HDA_GENERIC=m -CONFIG_SND_HDA_POWER_SAVE_DEFAULT=0 -# end of HD-Audio - -CONFIG_SND_HDA_CORE=m -CONFIG_SND_HDA_DSP_LOADER=y -CONFIG_SND_HDA_COMPONENT=y -CONFIG_SND_HDA_I915=y -CONFIG_SND_HDA_EXT_CORE=m -CONFIG_SND_HDA_PREALLOC_SIZE=0 -CONFIG_SND_INTEL_NHLT=y -CONFIG_SND_INTEL_DSP_CONFIG=m -CONFIG_SND_SPI=y -CONFIG_SND_USB=y -CONFIG_SND_USB_AUDIO=m -CONFIG_SND_USB_AUDIO_USE_MEDIA_CONTROLLER=y -CONFIG_SND_USB_UA101=m -CONFIG_SND_USB_USX2Y=m -CONFIG_SND_USB_CAIAQ=m -CONFIG_SND_USB_CAIAQ_INPUT=y -CONFIG_SND_USB_US122L=m -CONFIG_SND_USB_6FIRE=m -CONFIG_SND_USB_HIFACE=m -CONFIG_SND_BCD2000=m -CONFIG_SND_USB_LINE6=m -CONFIG_SND_USB_POD=m -CONFIG_SND_USB_PODHD=m -CONFIG_SND_USB_TONEPORT=m -CONFIG_SND_USB_VARIAX=m -CONFIG_SND_FIREWIRE=y -CONFIG_SND_FIREWIRE_LIB=m -CONFIG_SND_DICE=m -CONFIG_SND_OXFW=m -CONFIG_SND_ISIGHT=m -CONFIG_SND_FIREWORKS=m -CONFIG_SND_BEBOB=m -CONFIG_SND_FIREWIRE_DIGI00X=m -CONFIG_SND_FIREWIRE_TASCAM=m -CONFIG_SND_FIREWIRE_MOTU=m -CONFIG_SND_FIREFACE=m -CONFIG_SND_PCMCIA=y -CONFIG_SND_VXPOCKET=m -CONFIG_SND_PDAUDIOCF=m -CONFIG_SND_SOC=m -CONFIG_SND_SOC_AC97_BUS=y -CONFIG_SND_SOC_GENERIC_DMAENGINE_PCM=y -CONFIG_SND_SOC_COMPRESS=y -CONFIG_SND_SOC_TOPOLOGY=y -CONFIG_SND_SOC_ACPI=m -CONFIG_SND_SOC_AMD_ACP=m -CONFIG_SND_SOC_AMD_CZ_DA7219MX98357_MACH=m 
-CONFIG_SND_SOC_AMD_CZ_RT5645_MACH=m -CONFIG_SND_SOC_AMD_ACP3x=m -CONFIG_SND_SOC_AMD_RV_RT5682_MACH=m -CONFIG_SND_ATMEL_SOC=m -CONFIG_SND_SOC_MIKROE_PROTO=m -CONFIG_SND_BCM63XX_I2S_WHISTLER=m -CONFIG_SND_DESIGNWARE_I2S=m -CONFIG_SND_DESIGNWARE_PCM=y - -# -# SoC Audio for Freescale CPUs -# - -# -# Common SoC Audio options for Freescale CPUs: -# -# CONFIG_SND_SOC_FSL_ASRC is not set -# CONFIG_SND_SOC_FSL_SAI is not set -# CONFIG_SND_SOC_FSL_AUDMIX is not set -# CONFIG_SND_SOC_FSL_SSI is not set -# CONFIG_SND_SOC_FSL_SPDIF is not set -# CONFIG_SND_SOC_FSL_ESAI is not set -# CONFIG_SND_SOC_FSL_MICFIL is not set -# CONFIG_SND_SOC_IMX_AUDMUX is not set -# end of SoC Audio for Freescale CPUs - -CONFIG_SND_I2S_HI6210_I2S=m -CONFIG_SND_SOC_IMG=y -CONFIG_SND_SOC_IMG_I2S_IN=m -CONFIG_SND_SOC_IMG_I2S_OUT=m -CONFIG_SND_SOC_IMG_PARALLEL_OUT=m -CONFIG_SND_SOC_IMG_SPDIF_IN=m -CONFIG_SND_SOC_IMG_SPDIF_OUT=m -CONFIG_SND_SOC_IMG_PISTACHIO_INTERNAL_DAC=m -CONFIG_SND_SOC_INTEL_SST_TOPLEVEL=y -CONFIG_SND_SST_IPC=m -CONFIG_SND_SST_IPC_PCI=m -CONFIG_SND_SST_IPC_ACPI=m -CONFIG_SND_SOC_INTEL_SST_ACPI=m -CONFIG_SND_SOC_INTEL_SST=m -CONFIG_SND_SOC_INTEL_SST_FIRMWARE=m -CONFIG_SND_SOC_INTEL_HASWELL=m -CONFIG_SND_SST_ATOM_HIFI2_PLATFORM=m -CONFIG_SND_SST_ATOM_HIFI2_PLATFORM_PCI=m -CONFIG_SND_SST_ATOM_HIFI2_PLATFORM_ACPI=m -CONFIG_SND_SOC_INTEL_SKYLAKE=m -CONFIG_SND_SOC_INTEL_SKL=m -CONFIG_SND_SOC_INTEL_APL=m -CONFIG_SND_SOC_INTEL_KBL=m -CONFIG_SND_SOC_INTEL_GLK=m -CONFIG_SND_SOC_INTEL_CNL=m -CONFIG_SND_SOC_INTEL_CFL=m -CONFIG_SND_SOC_INTEL_CML_H=m -CONFIG_SND_SOC_INTEL_CML_LP=m -CONFIG_SND_SOC_INTEL_SKYLAKE_FAMILY=m -CONFIG_SND_SOC_INTEL_SKYLAKE_SSP_CLK=m -# CONFIG_SND_SOC_INTEL_SKYLAKE_HDAUDIO_CODEC is not set -CONFIG_SND_SOC_INTEL_SKYLAKE_COMMON=m -CONFIG_SND_SOC_ACPI_INTEL_MATCH=m -CONFIG_SND_SOC_INTEL_MACH=y -# CONFIG_SND_SOC_INTEL_USER_FRIENDLY_LONG_NAMES is not set -CONFIG_SND_SOC_INTEL_HASWELL_MACH=m -CONFIG_SND_SOC_INTEL_BDW_RT5650_MACH=m -CONFIG_SND_SOC_INTEL_BDW_RT5677_MACH=m 
-CONFIG_SND_SOC_INTEL_BROADWELL_MACH=m -CONFIG_SND_SOC_INTEL_BYTCR_RT5640_MACH=m -CONFIG_SND_SOC_INTEL_BYTCR_RT5651_MACH=m -CONFIG_SND_SOC_INTEL_CHT_BSW_RT5672_MACH=m -CONFIG_SND_SOC_INTEL_CHT_BSW_RT5645_MACH=m -CONFIG_SND_SOC_INTEL_CHT_BSW_MAX98090_TI_MACH=m -CONFIG_SND_SOC_INTEL_CHT_BSW_NAU8824_MACH=m -CONFIG_SND_SOC_INTEL_BYT_CHT_CX2072X_MACH=m -CONFIG_SND_SOC_INTEL_BYT_CHT_DA7213_MACH=m -CONFIG_SND_SOC_INTEL_BYT_CHT_ES8316_MACH=m -# CONFIG_SND_SOC_INTEL_BYT_CHT_NOCODEC_MACH is not set -CONFIG_SND_SOC_INTEL_SKL_RT286_MACH=m -CONFIG_SND_SOC_INTEL_SKL_NAU88L25_SSM4567_MACH=m -CONFIG_SND_SOC_INTEL_SKL_NAU88L25_MAX98357A_MACH=m -CONFIG_SND_SOC_INTEL_DA7219_MAX98357A_GENERIC=m -CONFIG_SND_SOC_INTEL_BXT_DA7219_MAX98357A_COMMON=m -CONFIG_SND_SOC_INTEL_BXT_DA7219_MAX98357A_MACH=m -CONFIG_SND_SOC_INTEL_BXT_RT298_MACH=m -CONFIG_SND_SOC_INTEL_KBL_RT5663_MAX98927_MACH=m -CONFIG_SND_SOC_INTEL_KBL_RT5663_RT5514_MAX98927_MACH=m -CONFIG_SND_SOC_INTEL_KBL_DA7219_MAX98357A_MACH=m -CONFIG_SND_SOC_INTEL_KBL_DA7219_MAX98927_MACH=m -CONFIG_SND_SOC_INTEL_KBL_RT5660_MACH=m -CONFIG_SND_SOC_INTEL_GLK_DA7219_MAX98357A_MACH=m -CONFIG_SND_SOC_INTEL_GLK_RT5682_MAX98357A_MACH=m -CONFIG_SND_SOC_INTEL_SKL_HDA_DSP_GENERIC_MACH=m -CONFIG_SND_SOC_INTEL_SOF_RT5682_MACH=m -CONFIG_SND_SOC_INTEL_SOF_PCM512x_MACH=m -CONFIG_SND_SOC_INTEL_CML_LP_DA7219_MAX98357A_MACH=m -CONFIG_SND_SOC_INTEL_SOF_CML_RT1011_RT5682_MACH=m -CONFIG_SND_SOC_INTEL_SOF_DA7219_MAX98373_MACH=m -CONFIG_SND_SOC_MTK_BTCVSD=m -CONFIG_SND_SOC_SOF_TOPLEVEL=y -CONFIG_SND_SOC_SOF_PCI=m -CONFIG_SND_SOC_SOF_ACPI=m -CONFIG_SND_SOC_SOF_OF=m -# CONFIG_SND_SOC_SOF_DEBUG_PROBES is not set -# CONFIG_SND_SOC_SOF_DEVELOPER_SUPPORT is not set -CONFIG_SND_SOC_SOF=m -CONFIG_SND_SOC_SOF_PROBE_WORK_QUEUE=y -CONFIG_SND_SOC_SOF_INTEL_TOPLEVEL=y -CONFIG_SND_SOC_SOF_INTEL_ACPI=m -CONFIG_SND_SOC_SOF_INTEL_PCI=m -CONFIG_SND_SOC_SOF_INTEL_HIFI_EP_IPC=m -CONFIG_SND_SOC_SOF_INTEL_ATOM_HIFI_EP=m -CONFIG_SND_SOC_SOF_INTEL_COMMON=m 
-CONFIG_SND_SOC_SOF_MERRIFIELD_SUPPORT=y -CONFIG_SND_SOC_SOF_MERRIFIELD=m -CONFIG_SND_SOC_SOF_APOLLOLAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_APOLLOLAKE=m -CONFIG_SND_SOC_SOF_GEMINILAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_GEMINILAKE=m -CONFIG_SND_SOC_SOF_CANNONLAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_CANNONLAKE=m -CONFIG_SND_SOC_SOF_COFFEELAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_COFFEELAKE=m -CONFIG_SND_SOC_SOF_ICELAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_ICELAKE=m -CONFIG_SND_SOC_SOF_COMETLAKE_LP=m -CONFIG_SND_SOC_SOF_COMETLAKE_LP_SUPPORT=y -CONFIG_SND_SOC_SOF_COMETLAKE_H=m -CONFIG_SND_SOC_SOF_COMETLAKE_H_SUPPORT=y -CONFIG_SND_SOC_SOF_TIGERLAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_TIGERLAKE=m -CONFIG_SND_SOC_SOF_ELKHARTLAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_ELKHARTLAKE=m -CONFIG_SND_SOC_SOF_JASPERLAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_JASPERLAKE=m -CONFIG_SND_SOC_SOF_HDA_COMMON=m -CONFIG_SND_SOC_SOF_HDA_LINK=y -CONFIG_SND_SOC_SOF_HDA_AUDIO_CODEC=y -# CONFIG_SND_SOC_SOF_HDA_ALWAYS_ENABLE_DMI_L1 is not set -CONFIG_SND_SOC_SOF_HDA_LINK_BASELINE=m -CONFIG_SND_SOC_SOF_HDA=m -CONFIG_SND_SOC_SOF_XTENSA=m - -# -# STMicroelectronics STM32 SOC audio support -# -# end of STMicroelectronics STM32 SOC audio support - -CONFIG_SND_SOC_XILINX_I2S=m -CONFIG_SND_SOC_XILINX_AUDIO_FORMATTER=m -CONFIG_SND_SOC_XILINX_SPDIF=m -CONFIG_SND_SOC_XTFPGA_I2S=m -CONFIG_ZX_TDM=m -CONFIG_SND_SOC_I2C_AND_SPI=m - -# -# CODEC drivers -# -CONFIG_SND_SOC_AC97_CODEC=m -CONFIG_SND_SOC_ADAU_UTILS=m -CONFIG_SND_SOC_ADAU1701=m -CONFIG_SND_SOC_ADAU17X1=m -CONFIG_SND_SOC_ADAU1761=m -CONFIG_SND_SOC_ADAU1761_I2C=m -CONFIG_SND_SOC_ADAU1761_SPI=m -CONFIG_SND_SOC_ADAU7002=m -CONFIG_SND_SOC_ADAU7118=m -CONFIG_SND_SOC_ADAU7118_HW=m -CONFIG_SND_SOC_ADAU7118_I2C=m -CONFIG_SND_SOC_AK4104=m -CONFIG_SND_SOC_AK4118=m -CONFIG_SND_SOC_AK4458=m -CONFIG_SND_SOC_AK4554=m -CONFIG_SND_SOC_AK4613=m -CONFIG_SND_SOC_AK4642=m -CONFIG_SND_SOC_AK5386=m -CONFIG_SND_SOC_AK5558=m -CONFIG_SND_SOC_ALC5623=m -CONFIG_SND_SOC_BD28623=m -# CONFIG_SND_SOC_BT_SCO is not set 
-CONFIG_SND_SOC_CPCAP=m -CONFIG_SND_SOC_CROS_EC_CODEC=m -CONFIG_SND_SOC_CS35L32=m -CONFIG_SND_SOC_CS35L33=m -CONFIG_SND_SOC_CS35L34=m -CONFIG_SND_SOC_CS35L35=m -CONFIG_SND_SOC_CS35L36=m -CONFIG_SND_SOC_CS42L42=m -CONFIG_SND_SOC_CS42L51=m -CONFIG_SND_SOC_CS42L51_I2C=m -CONFIG_SND_SOC_CS42L52=m -CONFIG_SND_SOC_CS42L56=m -CONFIG_SND_SOC_CS42L73=m -CONFIG_SND_SOC_CS4265=m -CONFIG_SND_SOC_CS4270=m -CONFIG_SND_SOC_CS4271=m -CONFIG_SND_SOC_CS4271_I2C=m -CONFIG_SND_SOC_CS4271_SPI=m -CONFIG_SND_SOC_CS42XX8=m -CONFIG_SND_SOC_CS42XX8_I2C=m -CONFIG_SND_SOC_CS43130=m -CONFIG_SND_SOC_CS4341=m -CONFIG_SND_SOC_CS4349=m -CONFIG_SND_SOC_CS53L30=m -CONFIG_SND_SOC_CX2072X=m -CONFIG_SND_SOC_DA7213=m -CONFIG_SND_SOC_DA7219=m -CONFIG_SND_SOC_DMIC=m -CONFIG_SND_SOC_HDMI_CODEC=m -CONFIG_SND_SOC_ES7134=m -CONFIG_SND_SOC_ES7241=m -CONFIG_SND_SOC_ES8316=m -CONFIG_SND_SOC_ES8328=m -CONFIG_SND_SOC_ES8328_I2C=m -CONFIG_SND_SOC_ES8328_SPI=m -CONFIG_SND_SOC_GTM601=m -CONFIG_SND_SOC_HDAC_HDMI=m -CONFIG_SND_SOC_HDAC_HDA=m -CONFIG_SND_SOC_INNO_RK3036=m -CONFIG_SND_SOC_LOCHNAGAR_SC=m -CONFIG_SND_SOC_MAX98088=m -CONFIG_SND_SOC_MAX98090=m -CONFIG_SND_SOC_MAX98357A=m -CONFIG_SND_SOC_MAX98504=m -CONFIG_SND_SOC_MAX9867=m -CONFIG_SND_SOC_MAX98927=m -CONFIG_SND_SOC_MAX98373=m -CONFIG_SND_SOC_MAX9860=m -CONFIG_SND_SOC_MSM8916_WCD_ANALOG=m -CONFIG_SND_SOC_MSM8916_WCD_DIGITAL=m -CONFIG_SND_SOC_PCM1681=m -CONFIG_SND_SOC_PCM1789=m -CONFIG_SND_SOC_PCM1789_I2C=m -CONFIG_SND_SOC_PCM179X=m -CONFIG_SND_SOC_PCM179X_I2C=m -CONFIG_SND_SOC_PCM179X_SPI=m -CONFIG_SND_SOC_PCM186X=m -CONFIG_SND_SOC_PCM186X_I2C=m -CONFIG_SND_SOC_PCM186X_SPI=m -CONFIG_SND_SOC_PCM3060=m -CONFIG_SND_SOC_PCM3060_I2C=m -CONFIG_SND_SOC_PCM3060_SPI=m -CONFIG_SND_SOC_PCM3168A=m -CONFIG_SND_SOC_PCM3168A_I2C=m -CONFIG_SND_SOC_PCM3168A_SPI=m -CONFIG_SND_SOC_PCM512x=m -CONFIG_SND_SOC_PCM512x_I2C=m -CONFIG_SND_SOC_PCM512x_SPI=m -CONFIG_SND_SOC_RK3328=m -CONFIG_SND_SOC_RL6231=m -CONFIG_SND_SOC_RL6347A=m -CONFIG_SND_SOC_RT286=m -CONFIG_SND_SOC_RT298=m 
-CONFIG_SND_SOC_RT1011=m -CONFIG_SND_SOC_RT1015=m -CONFIG_SND_SOC_RT1308_SDW=m -CONFIG_SND_SOC_RT5514=m -CONFIG_SND_SOC_RT5514_SPI=m -CONFIG_SND_SOC_RT5616=m -CONFIG_SND_SOC_RT5631=m -CONFIG_SND_SOC_RT5640=m -CONFIG_SND_SOC_RT5645=m -CONFIG_SND_SOC_RT5651=m -CONFIG_SND_SOC_RT5660=m -CONFIG_SND_SOC_RT5663=m -CONFIG_SND_SOC_RT5670=m -CONFIG_SND_SOC_RT5677=m -CONFIG_SND_SOC_RT5677_SPI=m -CONFIG_SND_SOC_RT5682=m -CONFIG_SND_SOC_RT5682_SDW=m -CONFIG_SND_SOC_RT700=m -CONFIG_SND_SOC_RT700_SDW=m -CONFIG_SND_SOC_RT711=m -CONFIG_SND_SOC_RT711_SDW=m -CONFIG_SND_SOC_RT715=m -CONFIG_SND_SOC_RT715_SDW=m -CONFIG_SND_SOC_SGTL5000=m -CONFIG_SND_SOC_SI476X=m -CONFIG_SND_SOC_SIGMADSP=m -CONFIG_SND_SOC_SIGMADSP_I2C=m -CONFIG_SND_SOC_SIGMADSP_REGMAP=m -CONFIG_SND_SOC_SIMPLE_AMPLIFIER=m -CONFIG_SND_SOC_SIRF_AUDIO_CODEC=m -CONFIG_SND_SOC_SPDIF=m -CONFIG_SND_SOC_SSM2305=m -CONFIG_SND_SOC_SSM2602=m -CONFIG_SND_SOC_SSM2602_SPI=m -CONFIG_SND_SOC_SSM2602_I2C=m -CONFIG_SND_SOC_SSM4567=m -CONFIG_SND_SOC_STA32X=m -CONFIG_SND_SOC_STA350=m -CONFIG_SND_SOC_STI_SAS=m -CONFIG_SND_SOC_TAS2552=m -CONFIG_SND_SOC_TAS2562=m -CONFIG_SND_SOC_TAS2770=m -CONFIG_SND_SOC_TAS5086=m -CONFIG_SND_SOC_TAS571X=m -CONFIG_SND_SOC_TAS5720=m -CONFIG_SND_SOC_TAS6424=m -CONFIG_SND_SOC_TDA7419=m -CONFIG_SND_SOC_TFA9879=m -CONFIG_SND_SOC_TLV320AIC23=m -CONFIG_SND_SOC_TLV320AIC23_I2C=m -CONFIG_SND_SOC_TLV320AIC23_SPI=m -CONFIG_SND_SOC_TLV320AIC31XX=m -CONFIG_SND_SOC_TLV320AIC32X4=m -CONFIG_SND_SOC_TLV320AIC32X4_I2C=m -CONFIG_SND_SOC_TLV320AIC32X4_SPI=m -CONFIG_SND_SOC_TLV320AIC3X=m -CONFIG_SND_SOC_TLV320ADCX140=m -CONFIG_SND_SOC_TS3A227E=m -CONFIG_SND_SOC_TSCS42XX=m -CONFIG_SND_SOC_TSCS454=m -CONFIG_SND_SOC_UDA1334=m -CONFIG_SND_SOC_WCD9335=m -CONFIG_SND_SOC_WCD934X=m -CONFIG_SND_SOC_WM8510=m -CONFIG_SND_SOC_WM8523=m -CONFIG_SND_SOC_WM8524=m -CONFIG_SND_SOC_WM8580=m -CONFIG_SND_SOC_WM8711=m -CONFIG_SND_SOC_WM8728=m -CONFIG_SND_SOC_WM8731=m -CONFIG_SND_SOC_WM8737=m -CONFIG_SND_SOC_WM8741=m -CONFIG_SND_SOC_WM8750=m 
-CONFIG_SND_SOC_WM8753=m -CONFIG_SND_SOC_WM8770=m -CONFIG_SND_SOC_WM8776=m -CONFIG_SND_SOC_WM8782=m -CONFIG_SND_SOC_WM8804=m -CONFIG_SND_SOC_WM8804_I2C=m -CONFIG_SND_SOC_WM8804_SPI=m -CONFIG_SND_SOC_WM8903=m -CONFIG_SND_SOC_WM8904=m -CONFIG_SND_SOC_WM8960=m -CONFIG_SND_SOC_WM8962=m -CONFIG_SND_SOC_WM8974=m -CONFIG_SND_SOC_WM8978=m -CONFIG_SND_SOC_WM8985=m -CONFIG_SND_SOC_WSA881X=m -CONFIG_SND_SOC_ZX_AUD96P22=m -CONFIG_SND_SOC_MAX9759=m -CONFIG_SND_SOC_MT6351=m -CONFIG_SND_SOC_MT6358=m -CONFIG_SND_SOC_MT6660=m -CONFIG_SND_SOC_NAU8540=m -CONFIG_SND_SOC_NAU8810=m -CONFIG_SND_SOC_NAU8822=m -CONFIG_SND_SOC_NAU8824=m -CONFIG_SND_SOC_NAU8825=m -CONFIG_SND_SOC_TPA6130A2=m -# end of CODEC drivers - -CONFIG_SND_SIMPLE_CARD_UTILS=m -CONFIG_SND_SIMPLE_CARD=m -CONFIG_SND_AUDIO_GRAPH_CARD=m -CONFIG_SND_X86=y -CONFIG_HDMI_LPE_AUDIO=m -CONFIG_SND_SYNTH_EMUX=m -CONFIG_SND_XEN_FRONTEND=m -CONFIG_AC97_BUS=m - -# -# HID support -# -CONFIG_HID=m -CONFIG_HID_BATTERY_STRENGTH=y -CONFIG_HIDRAW=y -CONFIG_UHID=m -CONFIG_HID_GENERIC=m - -# -# Special HID drivers -# -CONFIG_HID_A4TECH=m -CONFIG_HID_ACCUTOUCH=m -CONFIG_HID_ACRUX=m -CONFIG_HID_ACRUX_FF=y -CONFIG_HID_APPLE=m -CONFIG_HID_APPLEIR=m -CONFIG_HID_ASUS=m -CONFIG_HID_AUREAL=m -CONFIG_HID_BELKIN=m -CONFIG_HID_BETOP_FF=m -CONFIG_HID_BIGBEN_FF=m -CONFIG_HID_CHERRY=m -CONFIG_HID_CHICONY=m -CONFIG_HID_CORSAIR=m -CONFIG_HID_COUGAR=m -CONFIG_HID_MACALLY=m -CONFIG_HID_PRODIKEYS=m -CONFIG_HID_CMEDIA=m -CONFIG_HID_CP2112=m -CONFIG_HID_CREATIVE_SB0540=m -CONFIG_HID_CYPRESS=m -CONFIG_HID_DRAGONRISE=m -CONFIG_DRAGONRISE_FF=y -CONFIG_HID_EMS_FF=m -CONFIG_HID_ELAN=m -CONFIG_HID_ELECOM=m -CONFIG_HID_ELO=m -CONFIG_HID_EZKEY=m -CONFIG_HID_GEMBIRD=m -CONFIG_HID_GFRM=m -CONFIG_HID_GLORIOUS=m -CONFIG_HID_HOLTEK=m -CONFIG_HOLTEK_FF=y -CONFIG_HID_GOOGLE_HAMMER=m -CONFIG_HID_GT683R=m -CONFIG_HID_KEYTOUCH=m -CONFIG_HID_KYE=m -CONFIG_HID_UCLOGIC=m -CONFIG_HID_WALTOP=m -CONFIG_HID_VIEWSONIC=m -CONFIG_HID_GYRATION=m -CONFIG_HID_ICADE=m -CONFIG_HID_ITE=m 
-CONFIG_HID_JABRA=m -CONFIG_HID_TWINHAN=m -CONFIG_HID_KENSINGTON=m -CONFIG_HID_LCPOWER=m -CONFIG_HID_LED=m -CONFIG_HID_LENOVO=m -CONFIG_HID_LOGITECH=m -CONFIG_HID_LOGITECH_DJ=m -CONFIG_HID_LOGITECH_HIDPP=m -CONFIG_LOGITECH_FF=y -CONFIG_LOGIRUMBLEPAD2_FF=y -CONFIG_LOGIG940_FF=y -CONFIG_LOGIWHEELS_FF=y -CONFIG_HID_MAGICMOUSE=m -CONFIG_HID_MALTRON=m -CONFIG_HID_MAYFLASH=m -CONFIG_HID_REDRAGON=m -CONFIG_HID_MICROSOFT=m -CONFIG_HID_MONTEREY=m -CONFIG_HID_MULTITOUCH=m -CONFIG_HID_NTI=m -CONFIG_HID_NTRIG=m -CONFIG_HID_ORTEK=m -CONFIG_HID_PANTHERLORD=m -CONFIG_PANTHERLORD_FF=y -CONFIG_HID_PENMOUNT=m -CONFIG_HID_PETALYNX=m -CONFIG_HID_PICOLCD=m -CONFIG_HID_PICOLCD_FB=y -CONFIG_HID_PICOLCD_BACKLIGHT=y -CONFIG_HID_PICOLCD_LCD=y -CONFIG_HID_PICOLCD_LEDS=y -CONFIG_HID_PICOLCD_CIR=y -CONFIG_HID_PLANTRONICS=m -CONFIG_HID_PRIMAX=m -CONFIG_HID_RETRODE=m -CONFIG_HID_ROCCAT=m -CONFIG_HID_SAITEK=m -CONFIG_HID_SAMSUNG=m -CONFIG_HID_SONY=m -CONFIG_SONY_FF=y -CONFIG_HID_SPEEDLINK=m -CONFIG_HID_STEAM=m -CONFIG_HID_STEELSERIES=m -CONFIG_HID_SUNPLUS=m -CONFIG_HID_RMI=m -CONFIG_HID_GREENASIA=m -CONFIG_GREENASIA_FF=y -CONFIG_HID_HYPERV_MOUSE=m -CONFIG_HID_SMARTJOYPLUS=m -CONFIG_SMARTJOYPLUS_FF=y -CONFIG_HID_TIVO=m -CONFIG_HID_TOPSEED=m -CONFIG_HID_THINGM=m -CONFIG_HID_THRUSTMASTER=m -CONFIG_THRUSTMASTER_FF=y -CONFIG_HID_UDRAW_PS3=m -CONFIG_HID_U2FZERO=m -CONFIG_HID_WACOM=m -CONFIG_HID_WIIMOTE=m -CONFIG_HID_XINMO=m -CONFIG_HID_ZEROPLUS=m -CONFIG_ZEROPLUS_FF=y -CONFIG_HID_ZYDACRON=m -CONFIG_HID_SENSOR_HUB=m -# CONFIG_HID_SENSOR_CUSTOM_SENSOR is not set -CONFIG_HID_ALPS=m -CONFIG_HID_MCP2221=m -# end of Special HID drivers - -# -# USB HID support -# -CONFIG_USB_HID=m -CONFIG_HID_PID=y -CONFIG_USB_HIDDEV=y - -# -# USB HID Boot Protocol drivers -# -# CONFIG_USB_KBD is not set -# CONFIG_USB_MOUSE is not set -# end of USB HID Boot Protocol drivers -# end of USB HID support - -# -# I2C HID support -# -CONFIG_I2C_HID=m -# end of I2C HID support - -# -# Intel ISH HID support -# -CONFIG_INTEL_ISH_HID=m 
-CONFIG_INTEL_ISH_FIRMWARE_DOWNLOADER=m -# end of Intel ISH HID support -# end of HID support - -CONFIG_USB_OHCI_LITTLE_ENDIAN=y -CONFIG_USB_SUPPORT=y -CONFIG_USB_COMMON=y -CONFIG_USB_LED_TRIG=y -CONFIG_USB_ULPI_BUS=m -CONFIG_USB_CONN_GPIO=m -CONFIG_USB_ARCH_HAS_HCD=y -CONFIG_USB=y -CONFIG_USB_PCI=y -CONFIG_USB_ANNOUNCE_NEW_DEVICES=y - -# -# Miscellaneous USB options -# -CONFIG_USB_DEFAULT_PERSIST=y -CONFIG_USB_DYNAMIC_MINORS=y -# CONFIG_USB_OTG is not set -# CONFIG_USB_OTG_WHITELIST is not set -# CONFIG_USB_OTG_BLACKLIST_HUB is not set -CONFIG_USB_LEDS_TRIGGER_USBPORT=m -CONFIG_USB_AUTOSUSPEND_DELAY=2 -CONFIG_USB_MON=m - -# -# USB Host Controller Drivers -# -CONFIG_USB_C67X00_HCD=m -CONFIG_USB_XHCI_HCD=m -# CONFIG_USB_XHCI_DBGCAP is not set -CONFIG_USB_XHCI_PCI=m -CONFIG_USB_XHCI_PLATFORM=m -CONFIG_USB_EHCI_HCD=m -CONFIG_USB_EHCI_ROOT_HUB_TT=y -CONFIG_USB_EHCI_TT_NEWSCHED=y -CONFIG_USB_EHCI_PCI=m -CONFIG_USB_EHCI_FSL=m -CONFIG_USB_EHCI_HCD_PLATFORM=m -CONFIG_USB_OXU210HP_HCD=m -CONFIG_USB_ISP116X_HCD=m -CONFIG_USB_FOTG210_HCD=m -CONFIG_USB_MAX3421_HCD=m -CONFIG_USB_OHCI_HCD=m -CONFIG_USB_OHCI_HCD_PCI=m -# CONFIG_USB_OHCI_HCD_SSB is not set -CONFIG_USB_OHCI_HCD_PLATFORM=m -CONFIG_USB_UHCI_HCD=m -CONFIG_USB_U132_HCD=m -CONFIG_USB_SL811_HCD=m -# CONFIG_USB_SL811_HCD_ISO is not set -CONFIG_USB_SL811_CS=m -CONFIG_USB_R8A66597_HCD=m -CONFIG_USB_HCD_BCMA=m -CONFIG_USB_HCD_SSB=m -# CONFIG_USB_HCD_TEST_MODE is not set - -# -# USB Device Class drivers -# -CONFIG_USB_ACM=m -CONFIG_USB_PRINTER=m -CONFIG_USB_WDM=m -CONFIG_USB_TMC=m - -# -# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may -# - -# -# also be needed; see USB_STORAGE Help for more info -# -CONFIG_USB_STORAGE=m -# CONFIG_USB_STORAGE_DEBUG is not set -CONFIG_USB_STORAGE_REALTEK=m -CONFIG_REALTEK_AUTOPM=y -CONFIG_USB_STORAGE_DATAFAB=m -CONFIG_USB_STORAGE_FREECOM=m -CONFIG_USB_STORAGE_ISD200=m -CONFIG_USB_STORAGE_USBAT=m -CONFIG_USB_STORAGE_SDDR09=m -CONFIG_USB_STORAGE_SDDR55=m -CONFIG_USB_STORAGE_JUMPSHOT=m 
-CONFIG_USB_STORAGE_ALAUDA=m -CONFIG_USB_STORAGE_ONETOUCH=m -CONFIG_USB_STORAGE_KARMA=m -CONFIG_USB_STORAGE_CYPRESS_ATACB=m -CONFIG_USB_STORAGE_ENE_UB6250=m -CONFIG_USB_UAS=m - -# -# USB Imaging devices -# -CONFIG_USB_MDC800=m -CONFIG_USB_MICROTEK=m -CONFIG_USBIP_CORE=m -CONFIG_USBIP_VHCI_HCD=m -CONFIG_USBIP_VHCI_HC_PORTS=8 -CONFIG_USBIP_VHCI_NR_HCS=1 -CONFIG_USBIP_HOST=m -CONFIG_USBIP_VUDC=m -# CONFIG_USBIP_DEBUG is not set -CONFIG_USB_CDNS3=m -CONFIG_USB_CDNS3_GADGET=y -CONFIG_USB_CDNS3_HOST=y -CONFIG_USB_CDNS3_PCI_WRAP=m -CONFIG_USB_MUSB_HDRC=m -# CONFIG_USB_MUSB_HOST is not set -# CONFIG_USB_MUSB_GADGET is not set -CONFIG_USB_MUSB_DUAL_ROLE=y - -# -# Platform Glue Layer -# - -# -# MUSB DMA mode -# -# CONFIG_MUSB_PIO_ONLY is not set -CONFIG_USB_DWC3=m -CONFIG_USB_DWC3_ULPI=y -# CONFIG_USB_DWC3_HOST is not set -# CONFIG_USB_DWC3_GADGET is not set -CONFIG_USB_DWC3_DUAL_ROLE=y - -# -# Platform Glue Driver Support -# -CONFIG_USB_DWC3_PCI=m -CONFIG_USB_DWC3_HAPS=m -CONFIG_USB_DWC3_OF_SIMPLE=m -CONFIG_USB_DWC2=m -# CONFIG_USB_DWC2_HOST is not set - -# -# Gadget/Dual-role mode requires USB Gadget support to be enabled -# -# CONFIG_USB_DWC2_PERIPHERAL is not set -CONFIG_USB_DWC2_DUAL_ROLE=y -CONFIG_USB_DWC2_PCI=m -# CONFIG_USB_DWC2_DEBUG is not set -# CONFIG_USB_DWC2_TRACK_MISSED_SOFS is not set -CONFIG_USB_CHIPIDEA=m -CONFIG_USB_CHIPIDEA_OF=m -CONFIG_USB_CHIPIDEA_PCI=m -CONFIG_USB_CHIPIDEA_UDC=y -CONFIG_USB_CHIPIDEA_HOST=y -CONFIG_USB_ISP1760=m -CONFIG_USB_ISP1760_HCD=y -CONFIG_USB_ISP1761_UDC=y -# CONFIG_USB_ISP1760_HOST_ROLE is not set -# CONFIG_USB_ISP1760_GADGET_ROLE is not set -CONFIG_USB_ISP1760_DUAL_ROLE=y - -# -# USB port drivers -# -CONFIG_USB_USS720=m -CONFIG_USB_SERIAL=y -CONFIG_USB_SERIAL_CONSOLE=y -CONFIG_USB_SERIAL_GENERIC=y -CONFIG_USB_SERIAL_SIMPLE=m -CONFIG_USB_SERIAL_AIRCABLE=m -CONFIG_USB_SERIAL_ARK3116=m -CONFIG_USB_SERIAL_BELKIN=m -CONFIG_USB_SERIAL_CH341=m -CONFIG_USB_SERIAL_WHITEHEAT=m -CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m 
-CONFIG_USB_SERIAL_CP210X=m -CONFIG_USB_SERIAL_CYPRESS_M8=m -CONFIG_USB_SERIAL_EMPEG=m -CONFIG_USB_SERIAL_FTDI_SIO=m -CONFIG_USB_SERIAL_VISOR=m -CONFIG_USB_SERIAL_IPAQ=m -CONFIG_USB_SERIAL_IR=m -CONFIG_USB_SERIAL_EDGEPORT=m -CONFIG_USB_SERIAL_EDGEPORT_TI=m -CONFIG_USB_SERIAL_F81232=m -CONFIG_USB_SERIAL_F8153X=m -CONFIG_USB_SERIAL_GARMIN=m -CONFIG_USB_SERIAL_IPW=m -CONFIG_USB_SERIAL_IUU=m -CONFIG_USB_SERIAL_KEYSPAN_PDA=m -CONFIG_USB_SERIAL_KEYSPAN=m -CONFIG_USB_SERIAL_KLSI=m -CONFIG_USB_SERIAL_KOBIL_SCT=m -CONFIG_USB_SERIAL_MCT_U232=m -CONFIG_USB_SERIAL_METRO=m -CONFIG_USB_SERIAL_MOS7720=m -CONFIG_USB_SERIAL_MOS7715_PARPORT=y -CONFIG_USB_SERIAL_MOS7840=m -CONFIG_USB_SERIAL_MXUPORT=m -CONFIG_USB_SERIAL_NAVMAN=m -CONFIG_USB_SERIAL_PL2303=m -CONFIG_USB_SERIAL_OTI6858=m -CONFIG_USB_SERIAL_QCAUX=m -CONFIG_USB_SERIAL_QUALCOMM=m -CONFIG_USB_SERIAL_SPCP8X5=m -CONFIG_USB_SERIAL_SAFE=m -# CONFIG_USB_SERIAL_SAFE_PADDED is not set -CONFIG_USB_SERIAL_SIERRAWIRELESS=m -CONFIG_USB_SERIAL_SYMBOL=m -CONFIG_USB_SERIAL_TI=m -CONFIG_USB_SERIAL_CYBERJACK=m -CONFIG_USB_SERIAL_XIRCOM=m -CONFIG_USB_SERIAL_WWAN=m -CONFIG_USB_SERIAL_OPTION=m -CONFIG_USB_SERIAL_OMNINET=m -CONFIG_USB_SERIAL_OPTICON=m -CONFIG_USB_SERIAL_XSENS_MT=m -CONFIG_USB_SERIAL_WISHBONE=m -CONFIG_USB_SERIAL_SSU100=m -CONFIG_USB_SERIAL_QT2=m -CONFIG_USB_SERIAL_UPD78F0730=m -CONFIG_USB_SERIAL_DEBUG=m - -# -# USB Miscellaneous drivers -# -CONFIG_USB_EMI62=m -CONFIG_USB_EMI26=m -CONFIG_USB_ADUTUX=m -CONFIG_USB_SEVSEG=m -CONFIG_USB_LEGOTOWER=m -CONFIG_USB_LCD=m -CONFIG_USB_CYPRESS_CY7C63=m -CONFIG_USB_CYTHERM=m -CONFIG_USB_IDMOUSE=m -CONFIG_USB_FTDI_ELAN=m -CONFIG_USB_APPLEDISPLAY=m -CONFIG_APPLE_MFI_FASTCHARGE=m -CONFIG_USB_SISUSBVGA=m -CONFIG_USB_SISUSBVGA_CON=y -CONFIG_USB_LD=m -CONFIG_USB_TRANCEVIBRATOR=m -CONFIG_USB_IOWARRIOR=m -CONFIG_USB_TEST=m -CONFIG_USB_EHSET_TEST_FIXTURE=m -CONFIG_USB_ISIGHTFW=m -CONFIG_USB_YUREX=m -CONFIG_USB_EZUSB_FX2=m -CONFIG_USB_HUB_USB251XB=m -CONFIG_USB_HSIC_USB3503=m 
-CONFIG_USB_HSIC_USB4604=m -CONFIG_USB_LINK_LAYER_TEST=m -CONFIG_USB_CHAOSKEY=m -CONFIG_USB_ATM=m -CONFIG_USB_SPEEDTOUCH=m -CONFIG_USB_CXACRU=m -CONFIG_USB_UEAGLEATM=m -CONFIG_USB_XUSBATM=m - -# -# USB Physical Layer drivers -# -CONFIG_USB_PHY=y -CONFIG_NOP_USB_XCEIV=m -CONFIG_USB_GPIO_VBUS=m -CONFIG_TAHVO_USB=m -# CONFIG_TAHVO_USB_HOST_BY_DEFAULT is not set -CONFIG_USB_ISP1301=m -# end of USB Physical Layer drivers - -CONFIG_USB_GADGET=m -# CONFIG_USB_GADGET_DEBUG is not set -# CONFIG_USB_GADGET_DEBUG_FILES is not set -# CONFIG_USB_GADGET_DEBUG_FS is not set -CONFIG_USB_GADGET_VBUS_DRAW=2 -CONFIG_USB_GADGET_STORAGE_NUM_BUFFERS=2 -CONFIG_U_SERIAL_CONSOLE=y - -# -# USB Peripheral Controller -# -CONFIG_USB_FOTG210_UDC=m -CONFIG_USB_GR_UDC=m -CONFIG_USB_R8A66597=m -CONFIG_USB_PXA27X=m -CONFIG_USB_MV_UDC=m -CONFIG_USB_MV_U3D=m -CONFIG_USB_SNP_CORE=m -CONFIG_USB_SNP_UDC_PLAT=m -CONFIG_USB_M66592=m -CONFIG_USB_BDC_UDC=m - -# -# Platform Support -# -CONFIG_USB_BDC_PCI=m -CONFIG_USB_AMD5536UDC=m -CONFIG_USB_NET2272=m -CONFIG_USB_NET2272_DMA=y -CONFIG_USB_NET2280=m -CONFIG_USB_GOKU=m -CONFIG_USB_EG20T=m -CONFIG_USB_GADGET_XILINX=m -CONFIG_USB_MAX3420_UDC=m -CONFIG_USB_DUMMY_HCD=m -# end of USB Peripheral Controller - -CONFIG_USB_LIBCOMPOSITE=m -CONFIG_USB_F_ACM=m -CONFIG_USB_F_SS_LB=m -CONFIG_USB_U_SERIAL=m -CONFIG_USB_U_ETHER=m -CONFIG_USB_U_AUDIO=m -CONFIG_USB_F_SERIAL=m -CONFIG_USB_F_OBEX=m -CONFIG_USB_F_NCM=m -CONFIG_USB_F_ECM=m -CONFIG_USB_F_PHONET=m -CONFIG_USB_F_EEM=m -CONFIG_USB_F_SUBSET=m -CONFIG_USB_F_RNDIS=m -CONFIG_USB_F_MASS_STORAGE=m -CONFIG_USB_F_FS=m -CONFIG_USB_F_UAC1=m -CONFIG_USB_F_UAC1_LEGACY=m -CONFIG_USB_F_UAC2=m -CONFIG_USB_F_UVC=m -CONFIG_USB_F_MIDI=m -CONFIG_USB_F_HID=m -CONFIG_USB_F_PRINTER=m -CONFIG_USB_F_TCM=m -CONFIG_USB_CONFIGFS=m -CONFIG_USB_CONFIGFS_SERIAL=y -CONFIG_USB_CONFIGFS_ACM=y -CONFIG_USB_CONFIGFS_OBEX=y -CONFIG_USB_CONFIGFS_NCM=y -CONFIG_USB_CONFIGFS_ECM=y -CONFIG_USB_CONFIGFS_ECM_SUBSET=y -CONFIG_USB_CONFIGFS_RNDIS=y 
-CONFIG_USB_CONFIGFS_EEM=y -CONFIG_USB_CONFIGFS_PHONET=y -CONFIG_USB_CONFIGFS_MASS_STORAGE=y -CONFIG_USB_CONFIGFS_F_LB_SS=y -CONFIG_USB_CONFIGFS_F_FS=y -CONFIG_USB_CONFIGFS_F_UAC1=y -CONFIG_USB_CONFIGFS_F_UAC1_LEGACY=y -CONFIG_USB_CONFIGFS_F_UAC2=y -CONFIG_USB_CONFIGFS_F_MIDI=y -CONFIG_USB_CONFIGFS_F_HID=y -CONFIG_USB_CONFIGFS_F_UVC=y -CONFIG_USB_CONFIGFS_F_PRINTER=y -CONFIG_USB_CONFIGFS_F_TCM=y - -# -# USB Gadget precomposed configurations -# -CONFIG_USB_ZERO=m -CONFIG_USB_AUDIO=m -# CONFIG_GADGET_UAC1 is not set -CONFIG_USB_ETH=m -CONFIG_USB_ETH_RNDIS=y -CONFIG_USB_ETH_EEM=y -CONFIG_USB_G_NCM=m -CONFIG_USB_GADGETFS=m -CONFIG_USB_FUNCTIONFS=m -CONFIG_USB_FUNCTIONFS_ETH=y -CONFIG_USB_FUNCTIONFS_RNDIS=y -CONFIG_USB_FUNCTIONFS_GENERIC=y -CONFIG_USB_MASS_STORAGE=m -CONFIG_USB_GADGET_TARGET=m -CONFIG_USB_G_SERIAL=m -CONFIG_USB_MIDI_GADGET=m -CONFIG_USB_G_PRINTER=m -CONFIG_USB_CDC_COMPOSITE=m -CONFIG_USB_G_NOKIA=m -CONFIG_USB_G_ACM_MS=m -CONFIG_USB_G_MULTI=m -CONFIG_USB_G_MULTI_RNDIS=y -CONFIG_USB_G_MULTI_CDC=y -CONFIG_USB_G_HID=m -CONFIG_USB_G_DBGP=m -# CONFIG_USB_G_DBGP_PRINTK is not set -CONFIG_USB_G_DBGP_SERIAL=y -CONFIG_USB_G_WEBCAM=m -CONFIG_USB_RAW_GADGET=m -# end of USB Gadget precomposed configurations - -CONFIG_TYPEC=m -CONFIG_TYPEC_TCPM=m -CONFIG_TYPEC_TCPCI=m -CONFIG_TYPEC_RT1711H=m -CONFIG_TYPEC_FUSB302=m -CONFIG_TYPEC_WCOVE=m -CONFIG_TYPEC_UCSI=m -CONFIG_UCSI_CCG=m -CONFIG_UCSI_ACPI=m -CONFIG_TYPEC_HD3SS3220=m -CONFIG_TYPEC_TPS6598X=m - -# -# USB Type-C Multiplexer/DeMultiplexer Switch support -# -CONFIG_TYPEC_MUX_PI3USB30532=m -CONFIG_TYPEC_MUX_INTEL_PMC=m -# end of USB Type-C Multiplexer/DeMultiplexer Switch support - -# -# USB Type-C Alternate Mode drivers -# -CONFIG_TYPEC_DP_ALTMODE=m -CONFIG_TYPEC_NVIDIA_ALTMODE=m -# end of USB Type-C Alternate Mode drivers - -CONFIG_USB_ROLE_SWITCH=m -CONFIG_USB_ROLES_INTEL_XHCI=m -CONFIG_MMC=m -CONFIG_PWRSEQ_EMMC=m -CONFIG_PWRSEQ_SD8787=m -CONFIG_PWRSEQ_SIMPLE=m -CONFIG_MMC_BLOCK=m -CONFIG_MMC_BLOCK_MINORS=8 
-CONFIG_SDIO_UART=m -CONFIG_MMC_TEST=m - -# -# MMC/SD/SDIO Host Controller Drivers -# -# CONFIG_MMC_DEBUG is not set -CONFIG_MMC_SDHCI=m -CONFIG_MMC_SDHCI_IO_ACCESSORS=y -CONFIG_MMC_SDHCI_PCI=m -CONFIG_MMC_RICOH_MMC=y -CONFIG_MMC_SDHCI_ACPI=m -CONFIG_MMC_SDHCI_PLTFM=m -CONFIG_MMC_SDHCI_OF_ARASAN=m -CONFIG_MMC_SDHCI_OF_ASPEED=m -CONFIG_MMC_SDHCI_OF_AT91=m -CONFIG_MMC_SDHCI_OF_DWCMSHC=m -CONFIG_MMC_SDHCI_CADENCE=m -CONFIG_MMC_SDHCI_F_SDH30=m -CONFIG_MMC_SDHCI_MILBEAUT=m -CONFIG_MMC_WBSD=m -CONFIG_MMC_ALCOR=m -CONFIG_MMC_TIFM_SD=m -CONFIG_MMC_SPI=m -CONFIG_MMC_SDRICOH_CS=m -CONFIG_MMC_CB710=m -CONFIG_MMC_VIA_SDMMC=m -CONFIG_MMC_VUB300=m -CONFIG_MMC_USHC=m -CONFIG_MMC_USDHI6ROL0=m -CONFIG_MMC_REALTEK_PCI=m -CONFIG_MMC_REALTEK_USB=m -CONFIG_MMC_CQHCI=m -CONFIG_MMC_HSQ=m -CONFIG_MMC_TOSHIBA_PCI=m -CONFIG_MMC_MTK=m -CONFIG_MMC_SDHCI_XENON=m -CONFIG_MMC_SDHCI_OMAP=m -CONFIG_MMC_SDHCI_AM654=m -CONFIG_MMC_SDHCI_EXTERNAL_DMA=y -CONFIG_MEMSTICK=m -# CONFIG_MEMSTICK_DEBUG is not set - -# -# MemoryStick drivers -# -# CONFIG_MEMSTICK_UNSAFE_RESUME is not set -CONFIG_MSPRO_BLOCK=m -CONFIG_MS_BLOCK=m - -# -# MemoryStick Host Controller Drivers -# -CONFIG_MEMSTICK_TIFM_MS=m -CONFIG_MEMSTICK_JMICRON_38X=m -CONFIG_MEMSTICK_R592=m -CONFIG_MEMSTICK_REALTEK_PCI=m -CONFIG_MEMSTICK_REALTEK_USB=m -CONFIG_NEW_LEDS=y -CONFIG_LEDS_CLASS=y -CONFIG_LEDS_CLASS_FLASH=m -CONFIG_LEDS_BRIGHTNESS_HW_CHANGED=y - -# -# LED drivers -# -CONFIG_LEDS_88PM860X=m -CONFIG_LEDS_AAT1290=m -CONFIG_LEDS_AN30259A=m -CONFIG_LEDS_APU=m -CONFIG_LEDS_AS3645A=m -CONFIG_LEDS_BCM6328=m -CONFIG_LEDS_BCM6358=m -CONFIG_LEDS_CPCAP=m -CONFIG_LEDS_CR0014114=m -CONFIG_LEDS_EL15203000=m -CONFIG_LEDS_LM3530=m -CONFIG_LEDS_LM3532=m -CONFIG_LEDS_LM3533=m -CONFIG_LEDS_LM3642=m -CONFIG_LEDS_LM3692X=m -CONFIG_LEDS_LM3601X=m -CONFIG_LEDS_MT6323=m -CONFIG_LEDS_PCA9532=m -CONFIG_LEDS_PCA9532_GPIO=y -CONFIG_LEDS_GPIO=m -CONFIG_LEDS_LP3944=m -CONFIG_LEDS_LP3952=m -# CONFIG_LEDS_LP5521 is not set -# CONFIG_LEDS_LP5523 is not set -# 
CONFIG_LEDS_LP5562 is not set -# CONFIG_LEDS_LP8501 is not set -CONFIG_LEDS_LP8788=m -CONFIG_LEDS_LP8860=m -CONFIG_LEDS_CLEVO_MAIL=m -CONFIG_LEDS_PCA955X=m -CONFIG_LEDS_PCA955X_GPIO=y -CONFIG_LEDS_PCA963X=m -CONFIG_LEDS_WM831X_STATUS=m -CONFIG_LEDS_WM8350=m -CONFIG_LEDS_DA903X=m -CONFIG_LEDS_DA9052=m -CONFIG_LEDS_DAC124S085=m -CONFIG_LEDS_PWM=m -CONFIG_LEDS_REGULATOR=m -CONFIG_LEDS_BD2802=m -CONFIG_LEDS_INTEL_SS4200=m -CONFIG_LEDS_LT3593=m -CONFIG_LEDS_ADP5520=m -CONFIG_LEDS_MC13783=m -CONFIG_LEDS_TCA6507=m -CONFIG_LEDS_TLC591XX=m -CONFIG_LEDS_MAX77650=m -CONFIG_LEDS_MAX77693=m -CONFIG_LEDS_MAX8997=m -CONFIG_LEDS_LM355x=m -CONFIG_LEDS_MENF21BMC=m -CONFIG_LEDS_KTD2692=m -CONFIG_LEDS_IS31FL319X=m -CONFIG_LEDS_IS31FL32XX=m - -# -# LED driver for blink(1) USB RGB LED is under Special HID drivers (HID_THINGM) -# -CONFIG_LEDS_BLINKM=m -CONFIG_LEDS_SYSCON=y -CONFIG_LEDS_MLXCPLD=m -CONFIG_LEDS_MLXREG=m -CONFIG_LEDS_USER=m -CONFIG_LEDS_NIC78BX=m -CONFIG_LEDS_SPI_BYTE=m -CONFIG_LEDS_TI_LMU_COMMON=m -CONFIG_LEDS_LM3697=m -CONFIG_LEDS_LM36274=m -CONFIG_LEDS_TPS6105X=m - -# -# LED Triggers -# -CONFIG_LEDS_TRIGGERS=y -CONFIG_LEDS_TRIGGER_TIMER=m -CONFIG_LEDS_TRIGGER_ONESHOT=m -CONFIG_LEDS_TRIGGER_DISK=y -CONFIG_LEDS_TRIGGER_MTD=y -CONFIG_LEDS_TRIGGER_HEARTBEAT=m -CONFIG_LEDS_TRIGGER_BACKLIGHT=m -CONFIG_LEDS_TRIGGER_CPU=y -CONFIG_LEDS_TRIGGER_ACTIVITY=m -CONFIG_LEDS_TRIGGER_GPIO=m -CONFIG_LEDS_TRIGGER_DEFAULT_ON=m - -# -# iptables trigger is under Netfilter config (LED target) -# -CONFIG_LEDS_TRIGGER_TRANSIENT=m -CONFIG_LEDS_TRIGGER_CAMERA=m -CONFIG_LEDS_TRIGGER_PANIC=y -CONFIG_LEDS_TRIGGER_NETDEV=m -CONFIG_LEDS_TRIGGER_PATTERN=m -CONFIG_LEDS_TRIGGER_AUDIO=m -CONFIG_ACCESSIBILITY=y -CONFIG_A11Y_BRAILLE_CONSOLE=y -CONFIG_INFINIBAND=m -CONFIG_INFINIBAND_USER_MAD=m -CONFIG_INFINIBAND_USER_ACCESS=m -# CONFIG_INFINIBAND_EXP_LEGACY_VERBS_NEW_UAPI is not set -CONFIG_INFINIBAND_USER_MEM=y -CONFIG_INFINIBAND_ON_DEMAND_PAGING=y -CONFIG_INFINIBAND_ADDR_TRANS=y 
-CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS=y -CONFIG_INFINIBAND_MTHCA=m -CONFIG_INFINIBAND_MTHCA_DEBUG=y -CONFIG_INFINIBAND_QIB=m -CONFIG_INFINIBAND_QIB_DCA=y -CONFIG_INFINIBAND_CXGB4=m -CONFIG_INFINIBAND_EFA=m -CONFIG_INFINIBAND_I40IW=m -CONFIG_MLX4_INFINIBAND=m -CONFIG_MLX5_INFINIBAND=m -CONFIG_INFINIBAND_OCRDMA=m -CONFIG_INFINIBAND_VMWARE_PVRDMA=m -CONFIG_INFINIBAND_USNIC=m -CONFIG_INFINIBAND_BNXT_RE=m -CONFIG_INFINIBAND_HFI1=m -# CONFIG_HFI1_DEBUG_SDMA_ORDER is not set -# CONFIG_SDMA_VERBOSITY is not set -CONFIG_INFINIBAND_QEDR=m -CONFIG_INFINIBAND_RDMAVT=m -CONFIG_RDMA_RXE=m -CONFIG_RDMA_SIW=m -CONFIG_INFINIBAND_IPOIB=m -CONFIG_INFINIBAND_IPOIB_CM=y -CONFIG_INFINIBAND_IPOIB_DEBUG=y -# CONFIG_INFINIBAND_IPOIB_DEBUG_DATA is not set -CONFIG_INFINIBAND_SRP=m -CONFIG_INFINIBAND_SRPT=m -CONFIG_INFINIBAND_ISER=m -CONFIG_INFINIBAND_ISERT=m -CONFIG_INFINIBAND_OPA_VNIC=m -CONFIG_EDAC_ATOMIC_SCRUB=y -CONFIG_EDAC_SUPPORT=y -CONFIG_EDAC=y -CONFIG_EDAC_LEGACY_SYSFS=y -# CONFIG_EDAC_DEBUG is not set -CONFIG_EDAC_DECODE_MCE=m -CONFIG_EDAC_GHES=y -CONFIG_EDAC_AMD64=m -# CONFIG_EDAC_AMD64_ERROR_INJECTION is not set -CONFIG_EDAC_E752X=m -CONFIG_EDAC_I82975X=m -CONFIG_EDAC_I3000=m -CONFIG_EDAC_I3200=m -CONFIG_EDAC_IE31200=m -CONFIG_EDAC_X38=m -CONFIG_EDAC_I5400=m -CONFIG_EDAC_I7CORE=m -CONFIG_EDAC_I5000=m -CONFIG_EDAC_I5100=m -CONFIG_EDAC_I7300=m -CONFIG_EDAC_SBRIDGE=m -CONFIG_EDAC_SKX=m -CONFIG_EDAC_I10NM=m -CONFIG_EDAC_PND2=m -CONFIG_RTC_LIB=y -CONFIG_RTC_MC146818_LIB=y -CONFIG_RTC_CLASS=y -CONFIG_RTC_HCTOSYS=y -CONFIG_RTC_HCTOSYS_DEVICE="rtc0" -CONFIG_RTC_SYSTOHC=y -CONFIG_RTC_SYSTOHC_DEVICE="rtc0" -# CONFIG_RTC_DEBUG is not set -CONFIG_RTC_NVMEM=y - -# -# RTC interfaces -# -CONFIG_RTC_INTF_SYSFS=y -CONFIG_RTC_INTF_PROC=y -CONFIG_RTC_INTF_DEV=y -CONFIG_RTC_INTF_DEV_UIE_EMUL=y -# CONFIG_RTC_DRV_TEST is not set - -# -# I2C RTC drivers -# -CONFIG_RTC_DRV_88PM860X=m -CONFIG_RTC_DRV_88PM80X=m -CONFIG_RTC_DRV_ABB5ZES3=m -CONFIG_RTC_DRV_ABEOZ9=m -CONFIG_RTC_DRV_ABX80X=m 
-CONFIG_RTC_DRV_AS3722=m -CONFIG_RTC_DRV_DS1307=m -CONFIG_RTC_DRV_DS1307_CENTURY=y -CONFIG_RTC_DRV_DS1374=m -CONFIG_RTC_DRV_DS1374_WDT=y -CONFIG_RTC_DRV_DS1672=m -CONFIG_RTC_DRV_HYM8563=m -CONFIG_RTC_DRV_LP8788=m -CONFIG_RTC_DRV_MAX6900=m -CONFIG_RTC_DRV_MAX8907=m -CONFIG_RTC_DRV_MAX8925=m -CONFIG_RTC_DRV_MAX8998=m -CONFIG_RTC_DRV_MAX8997=m -CONFIG_RTC_DRV_MAX77686=m -CONFIG_RTC_DRV_RK808=m -CONFIG_RTC_DRV_RS5C372=m -CONFIG_RTC_DRV_ISL1208=m -CONFIG_RTC_DRV_ISL12022=m -CONFIG_RTC_DRV_ISL12026=m -CONFIG_RTC_DRV_X1205=m -CONFIG_RTC_DRV_PCF8523=m -CONFIG_RTC_DRV_PCF85063=m -CONFIG_RTC_DRV_PCF85363=m -CONFIG_RTC_DRV_PCF8563=m -CONFIG_RTC_DRV_PCF8583=m -CONFIG_RTC_DRV_M41T80=m -CONFIG_RTC_DRV_M41T80_WDT=y -CONFIG_RTC_DRV_BD70528=m -CONFIG_RTC_DRV_BQ32K=m -CONFIG_RTC_DRV_TWL4030=m -CONFIG_RTC_DRV_PALMAS=m -CONFIG_RTC_DRV_TPS6586X=m -CONFIG_RTC_DRV_TPS65910=m -CONFIG_RTC_DRV_TPS80031=m -CONFIG_RTC_DRV_RC5T583=m -CONFIG_RTC_DRV_RC5T619=m -CONFIG_RTC_DRV_S35390A=m -CONFIG_RTC_DRV_FM3130=m -CONFIG_RTC_DRV_RX8010=m -CONFIG_RTC_DRV_RX8581=m -CONFIG_RTC_DRV_RX8025=m -CONFIG_RTC_DRV_EM3027=m -CONFIG_RTC_DRV_RV3028=m -CONFIG_RTC_DRV_RV8803=m -CONFIG_RTC_DRV_S5M=m -CONFIG_RTC_DRV_SD3078=m - -# -# SPI RTC drivers -# -CONFIG_RTC_DRV_M41T93=m -CONFIG_RTC_DRV_M41T94=m -CONFIG_RTC_DRV_DS1302=m -CONFIG_RTC_DRV_DS1305=m -CONFIG_RTC_DRV_DS1343=m -CONFIG_RTC_DRV_DS1347=m -CONFIG_RTC_DRV_DS1390=m -CONFIG_RTC_DRV_MAX6916=m -CONFIG_RTC_DRV_R9701=m -CONFIG_RTC_DRV_RX4581=m -CONFIG_RTC_DRV_RX6110=m -CONFIG_RTC_DRV_RS5C348=m -CONFIG_RTC_DRV_MAX6902=m -CONFIG_RTC_DRV_PCF2123=m -CONFIG_RTC_DRV_MCP795=m -CONFIG_RTC_I2C_AND_SPI=y - -# -# SPI and I2C RTC drivers -# -CONFIG_RTC_DRV_DS3232=m -CONFIG_RTC_DRV_DS3232_HWMON=y -CONFIG_RTC_DRV_PCF2127=m -CONFIG_RTC_DRV_RV3029C2=m -CONFIG_RTC_DRV_RV3029_HWMON=y - -# -# Platform RTC drivers -# -CONFIG_RTC_DRV_CMOS=y -CONFIG_RTC_DRV_DS1286=m -CONFIG_RTC_DRV_DS1511=m -CONFIG_RTC_DRV_DS1553=m -CONFIG_RTC_DRV_DS1685_FAMILY=m -CONFIG_RTC_DRV_DS1685=y -# 
CONFIG_RTC_DRV_DS1689 is not set -# CONFIG_RTC_DRV_DS17285 is not set -# CONFIG_RTC_DRV_DS17485 is not set -# CONFIG_RTC_DRV_DS17885 is not set -CONFIG_RTC_DRV_DS1742=m -CONFIG_RTC_DRV_DS2404=m -CONFIG_RTC_DRV_DA9052=m -CONFIG_RTC_DRV_DA9055=m -CONFIG_RTC_DRV_DA9063=m -CONFIG_RTC_DRV_STK17TA8=m -CONFIG_RTC_DRV_M48T86=m -CONFIG_RTC_DRV_M48T35=m -CONFIG_RTC_DRV_M48T59=m -CONFIG_RTC_DRV_MSM6242=m -CONFIG_RTC_DRV_BQ4802=m -CONFIG_RTC_DRV_RP5C01=m -CONFIG_RTC_DRV_V3020=m -CONFIG_RTC_DRV_WM831X=m -CONFIG_RTC_DRV_WM8350=m -CONFIG_RTC_DRV_PCF50633=m -CONFIG_RTC_DRV_AB3100=m -CONFIG_RTC_DRV_ZYNQMP=m -CONFIG_RTC_DRV_CROS_EC=m - -# -# on-CPU RTC drivers -# -CONFIG_RTC_DRV_CADENCE=m -CONFIG_RTC_DRV_FTRTC010=m -CONFIG_RTC_DRV_PCAP=m -CONFIG_RTC_DRV_MC13XXX=m -CONFIG_RTC_DRV_MT6397=m -CONFIG_RTC_DRV_R7301=m -CONFIG_RTC_DRV_CPCAP=m - -# -# HID Sensor RTC drivers -# -CONFIG_RTC_DRV_HID_SENSOR_TIME=m -CONFIG_RTC_DRV_WILCO_EC=m -CONFIG_DMADEVICES=y -# CONFIG_DMADEVICES_DEBUG is not set - -# -# DMA Devices -# -CONFIG_DMA_ENGINE=y -CONFIG_DMA_VIRTUAL_CHANNELS=y -CONFIG_DMA_ACPI=y -CONFIG_DMA_OF=y -CONFIG_ALTERA_MSGDMA=m -CONFIG_DW_AXI_DMAC=m -CONFIG_FSL_EDMA=m -CONFIG_INTEL_IDMA64=m -CONFIG_INTEL_IDXD=m -CONFIG_INTEL_IOATDMA=m -CONFIG_INTEL_MIC_X100_DMA=m -CONFIG_PLX_DMA=m -CONFIG_QCOM_HIDMA_MGMT=m -CONFIG_QCOM_HIDMA=m -CONFIG_DW_DMAC_CORE=y -CONFIG_DW_DMAC=y -CONFIG_DW_DMAC_PCI=y -CONFIG_DW_EDMA=m -CONFIG_DW_EDMA_PCIE=m -CONFIG_HSU_DMA=y -CONFIG_SF_PDMA=m - -# -# DMA Clients -# -CONFIG_ASYNC_TX_DMA=y -# CONFIG_DMATEST is not set -CONFIG_DMA_ENGINE_RAID=y - -# -# DMABUF options -# -CONFIG_SYNC_FILE=y -# CONFIG_SW_SYNC is not set -CONFIG_UDMABUF=y -# CONFIG_DMABUF_MOVE_NOTIFY is not set -# CONFIG_DMABUF_SELFTESTS is not set -CONFIG_DMABUF_HEAPS=y -CONFIG_DMABUF_HEAPS_SYSTEM=y -# end of DMABUF options - -CONFIG_DCA=m -CONFIG_AUXDISPLAY=y -CONFIG_HD44780=m -CONFIG_KS0108=m -CONFIG_KS0108_PORT=0x378 -CONFIG_KS0108_DELAY=2 -CONFIG_CFAG12864B=m -CONFIG_CFAG12864B_RATE=20 
-CONFIG_IMG_ASCII_LCD=m -CONFIG_HT16K33=m -CONFIG_PARPORT_PANEL=m -CONFIG_PANEL_PARPORT=0 -CONFIG_PANEL_PROFILE=5 -# CONFIG_PANEL_CHANGE_MESSAGE is not set -# CONFIG_CHARLCD_BL_OFF is not set -# CONFIG_CHARLCD_BL_ON is not set -CONFIG_CHARLCD_BL_FLASH=y -CONFIG_PANEL=m -CONFIG_CHARLCD=m -CONFIG_UIO=m -CONFIG_UIO_CIF=m -CONFIG_UIO_PDRV_GENIRQ=m -CONFIG_UIO_DMEM_GENIRQ=m -CONFIG_UIO_AEC=m -CONFIG_UIO_SERCOS3=m -CONFIG_UIO_PCI_GENERIC=m -CONFIG_UIO_NETX=m -CONFIG_UIO_PRUSS=m -CONFIG_UIO_MF624=m -CONFIG_UIO_HV_GENERIC=m -CONFIG_VFIO_IOMMU_TYPE1=m -CONFIG_VFIO_VIRQFD=m -CONFIG_VFIO=m -# CONFIG_VFIO_NOIOMMU is not set -CONFIG_VFIO_PCI=m -CONFIG_VFIO_PCI_VGA=y -CONFIG_VFIO_PCI_MMAP=y -CONFIG_VFIO_PCI_INTX=y -CONFIG_VFIO_PCI_IGD=y -CONFIG_VFIO_MDEV=m -CONFIG_VFIO_MDEV_DEVICE=m -CONFIG_IRQ_BYPASS_MANAGER=m -CONFIG_VIRT_DRIVERS=y -CONFIG_VBOXGUEST=m -CONFIG_VIRTIO=y -CONFIG_VIRTIO_MENU=y -CONFIG_VIRTIO_PCI=m -CONFIG_VIRTIO_PCI_LEGACY=y -CONFIG_VIRTIO_VDPA=m -CONFIG_VIRTIO_PMEM=m -CONFIG_VIRTIO_BALLOON=m -CONFIG_VIRTIO_INPUT=m -CONFIG_VIRTIO_MMIO=m -CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES=y -CONFIG_VDPA=m -CONFIG_VDPA_SIM=m -CONFIG_IFCVF=m -CONFIG_VHOST_IOTLB=m -CONFIG_VHOST_RING=m -CONFIG_VHOST_DPN=y -CONFIG_VHOST=m -CONFIG_VHOST_MENU=y -CONFIG_VHOST_NET=m -CONFIG_VHOST_SCSI=m -CONFIG_VHOST_VSOCK=m -CONFIG_VHOST_VDPA=m -# CONFIG_VHOST_CROSS_ENDIAN_LEGACY is not set - -# -# Microsoft Hyper-V guest support -# -CONFIG_HYPERV=m -CONFIG_HYPERV_TIMER=y -CONFIG_HYPERV_UTILS=m -CONFIG_HYPERV_BALLOON=m -# end of Microsoft Hyper-V guest support - -# -# Xen driver support -# -CONFIG_XEN_BALLOON=y -CONFIG_XEN_BALLOON_MEMORY_HOTPLUG=y -CONFIG_XEN_BALLOON_MEMORY_HOTPLUG_LIMIT=512 -CONFIG_XEN_SCRUB_PAGES_DEFAULT=y -CONFIG_XEN_DEV_EVTCHN=m -CONFIG_XEN_BACKEND=y -CONFIG_XENFS=m -CONFIG_XEN_COMPAT_XENFS=y -CONFIG_XEN_SYS_HYPERVISOR=y -CONFIG_XEN_XENBUS_FRONTEND=y -CONFIG_XEN_GNTDEV=m -CONFIG_XEN_GNTDEV_DMABUF=y -CONFIG_XEN_GRANT_DEV_ALLOC=m -CONFIG_XEN_GRANT_DMA_ALLOC=y -CONFIG_SWIOTLB_XEN=y 
-CONFIG_XEN_PCIDEV_BACKEND=m -CONFIG_XEN_PVCALLS_FRONTEND=m -CONFIG_XEN_PVCALLS_BACKEND=y -CONFIG_XEN_SCSI_BACKEND=m -CONFIG_XEN_PRIVCMD=m -CONFIG_XEN_ACPI_PROCESSOR=m -CONFIG_XEN_MCE_LOG=y -CONFIG_XEN_HAVE_PVMMU=y -CONFIG_XEN_EFI=y -CONFIG_XEN_AUTO_XLATE=y -CONFIG_XEN_ACPI=y -CONFIG_XEN_SYMS=y -CONFIG_XEN_HAVE_VPMU=y -CONFIG_XEN_FRONT_PGDIR_SHBUF=m -# end of Xen driver support - -# CONFIG_GREYBUS is not set -CONFIG_STAGING=y -CONFIG_PRISM2_USB=m -CONFIG_COMEDI=m -# CONFIG_COMEDI_DEBUG is not set -CONFIG_COMEDI_DEFAULT_BUF_SIZE_KB=2048 -CONFIG_COMEDI_DEFAULT_BUF_MAXSIZE_KB=20480 -CONFIG_COMEDI_MISC_DRIVERS=y -CONFIG_COMEDI_BOND=m -CONFIG_COMEDI_TEST=m -CONFIG_COMEDI_PARPORT=m -# CONFIG_COMEDI_ISA_DRIVERS is not set -CONFIG_COMEDI_PCI_DRIVERS=m -CONFIG_COMEDI_8255_PCI=m -CONFIG_COMEDI_ADDI_WATCHDOG=m -CONFIG_COMEDI_ADDI_APCI_1032=m -CONFIG_COMEDI_ADDI_APCI_1500=m -CONFIG_COMEDI_ADDI_APCI_1516=m -CONFIG_COMEDI_ADDI_APCI_1564=m -CONFIG_COMEDI_ADDI_APCI_16XX=m -CONFIG_COMEDI_ADDI_APCI_2032=m -CONFIG_COMEDI_ADDI_APCI_2200=m -CONFIG_COMEDI_ADDI_APCI_3120=m -CONFIG_COMEDI_ADDI_APCI_3501=m -CONFIG_COMEDI_ADDI_APCI_3XXX=m -CONFIG_COMEDI_ADL_PCI6208=m -CONFIG_COMEDI_ADL_PCI7X3X=m -CONFIG_COMEDI_ADL_PCI8164=m -CONFIG_COMEDI_ADL_PCI9111=m -CONFIG_COMEDI_ADL_PCI9118=m -CONFIG_COMEDI_ADV_PCI1710=m -CONFIG_COMEDI_ADV_PCI1720=m -CONFIG_COMEDI_ADV_PCI1723=m -CONFIG_COMEDI_ADV_PCI1724=m -CONFIG_COMEDI_ADV_PCI1760=m -CONFIG_COMEDI_ADV_PCI_DIO=m -CONFIG_COMEDI_AMPLC_DIO200_PCI=m -CONFIG_COMEDI_AMPLC_PC236_PCI=m -CONFIG_COMEDI_AMPLC_PC263_PCI=m -CONFIG_COMEDI_AMPLC_PCI224=m -CONFIG_COMEDI_AMPLC_PCI230=m -CONFIG_COMEDI_CONTEC_PCI_DIO=m -CONFIG_COMEDI_DAS08_PCI=m -CONFIG_COMEDI_DT3000=m -CONFIG_COMEDI_DYNA_PCI10XX=m -CONFIG_COMEDI_GSC_HPDI=m -CONFIG_COMEDI_MF6X4=m -CONFIG_COMEDI_ICP_MULTI=m -CONFIG_COMEDI_DAQBOARD2000=m -CONFIG_COMEDI_JR3_PCI=m -CONFIG_COMEDI_KE_COUNTER=m -CONFIG_COMEDI_CB_PCIDAS64=m -CONFIG_COMEDI_CB_PCIDAS=m -CONFIG_COMEDI_CB_PCIDDA=m -CONFIG_COMEDI_CB_PCIMDAS=m 
-CONFIG_COMEDI_CB_PCIMDDA=m -CONFIG_COMEDI_ME4000=m -CONFIG_COMEDI_ME_DAQ=m -CONFIG_COMEDI_NI_6527=m -CONFIG_COMEDI_NI_65XX=m -CONFIG_COMEDI_NI_660X=m -CONFIG_COMEDI_NI_670X=m -CONFIG_COMEDI_NI_LABPC_PCI=m -CONFIG_COMEDI_NI_PCIDIO=m -CONFIG_COMEDI_NI_PCIMIO=m -CONFIG_COMEDI_RTD520=m -CONFIG_COMEDI_S626=m -CONFIG_COMEDI_MITE=m -CONFIG_COMEDI_NI_TIOCMD=m -CONFIG_COMEDI_PCMCIA_DRIVERS=m -CONFIG_COMEDI_CB_DAS16_CS=m -CONFIG_COMEDI_DAS08_CS=m -CONFIG_COMEDI_NI_DAQ_700_CS=m -CONFIG_COMEDI_NI_DAQ_DIO24_CS=m -CONFIG_COMEDI_NI_LABPC_CS=m -CONFIG_COMEDI_NI_MIO_CS=m -CONFIG_COMEDI_QUATECH_DAQP_CS=m -CONFIG_COMEDI_USB_DRIVERS=m -CONFIG_COMEDI_DT9812=m -CONFIG_COMEDI_NI_USB6501=m -CONFIG_COMEDI_USBDUX=m -CONFIG_COMEDI_USBDUXFAST=m -CONFIG_COMEDI_USBDUXSIGMA=m -CONFIG_COMEDI_VMK80XX=m -CONFIG_COMEDI_8254=m -CONFIG_COMEDI_8255=m -CONFIG_COMEDI_8255_SA=m -CONFIG_COMEDI_KCOMEDILIB=m -CONFIG_COMEDI_AMPLC_DIO200=m -CONFIG_COMEDI_AMPLC_PC236=m -CONFIG_COMEDI_DAS08=m -CONFIG_COMEDI_NI_LABPC=m -CONFIG_COMEDI_NI_TIO=m -CONFIG_COMEDI_NI_ROUTING=m -CONFIG_RTL8192U=m -CONFIG_RTLLIB=m -CONFIG_RTLLIB_CRYPTO_CCMP=m -CONFIG_RTLLIB_CRYPTO_TKIP=m -CONFIG_RTLLIB_CRYPTO_WEP=m -CONFIG_RTL8192E=m -CONFIG_RTL8723BS=m -CONFIG_R8712U=m -CONFIG_R8188EU=m -CONFIG_88EU_AP_MODE=y -CONFIG_RTS5208=m -CONFIG_VT6655=m -CONFIG_VT6656=m - -# -# IIO staging drivers -# - -# -# Accelerometers -# -CONFIG_ADIS16203=m -CONFIG_ADIS16240=m -# end of Accelerometers - -# -# Analog to digital converters -# -CONFIG_AD7816=m -CONFIG_AD7280=m -# end of Analog to digital converters - -# -# Analog digital bi-direction converters -# -CONFIG_ADT7316=m -CONFIG_ADT7316_SPI=m -CONFIG_ADT7316_I2C=m -# end of Analog digital bi-direction converters - -# -# Capacitance to digital converters -# -CONFIG_AD7150=m -CONFIG_AD7746=m -# end of Capacitance to digital converters - -# -# Direct Digital Synthesis -# -CONFIG_AD9832=m -CONFIG_AD9834=m -# end of Direct Digital Synthesis - -# -# Network Analyzer, Impedance Converters -# 
-CONFIG_AD5933=m -# end of Network Analyzer, Impedance Converters - -# -# Active energy metering IC -# -CONFIG_ADE7854=m -CONFIG_ADE7854_I2C=m -CONFIG_ADE7854_SPI=m -# end of Active energy metering IC - -# -# Resolver to digital converters -# -CONFIG_AD2S1210=m -# end of Resolver to digital converters -# end of IIO staging drivers - -# CONFIG_FB_SM750 is not set - -# -# Speakup console speech -# -CONFIG_SPEAKUP=m -CONFIG_SPEAKUP_SYNTH_ACNTSA=m -CONFIG_SPEAKUP_SYNTH_APOLLO=m -CONFIG_SPEAKUP_SYNTH_AUDPTR=m -CONFIG_SPEAKUP_SYNTH_BNS=m -CONFIG_SPEAKUP_SYNTH_DECTLK=m -CONFIG_SPEAKUP_SYNTH_DECEXT=m -CONFIG_SPEAKUP_SYNTH_LTLK=m -CONFIG_SPEAKUP_SYNTH_SOFT=m -CONFIG_SPEAKUP_SYNTH_SPKOUT=m -CONFIG_SPEAKUP_SYNTH_TXPRT=m -CONFIG_SPEAKUP_SYNTH_DUMMY=m -# end of Speakup console speech - -CONFIG_STAGING_MEDIA=y -CONFIG_VIDEO_IPU3_IMGU=m - -# -# soc_camera sensor drivers -# -CONFIG_VIDEO_USBVISION=m - -# -# Android -# -# end of Android - -CONFIG_STAGING_BOARD=y -CONFIG_LTE_GDM724X=m -CONFIG_FIREWIRE_SERIAL=m -CONFIG_FWTTY_MAX_TOTAL_PORTS=64 -CONFIG_FWTTY_MAX_CARD_PORTS=32 -CONFIG_GS_FPGABOOT=m -CONFIG_UNISYSSPAR=y -CONFIG_UNISYS_VISORNIC=m -CONFIG_UNISYS_VISORINPUT=m -CONFIG_UNISYS_VISORHBA=m -CONFIG_COMMON_CLK_XLNX_CLKWZRD=m -# CONFIG_FB_TFT is not set -CONFIG_WILC1000=m -CONFIG_WILC1000_SDIO=m -CONFIG_WILC1000_SPI=m -# CONFIG_WILC1000_HW_OOB_INTR is not set -CONFIG_MOST_COMPONENTS=m -CONFIG_MOST_CDEV=m -CONFIG_MOST_NET=m -CONFIG_MOST_SOUND=m -CONFIG_MOST_VIDEO=m -CONFIG_MOST_DIM2=m -CONFIG_MOST_I2C=m -CONFIG_MOST_USB=m -CONFIG_KS7010=m -CONFIG_PI433=m - -# -# Gasket devices -# -CONFIG_STAGING_GASKET_FRAMEWORK=m -CONFIG_STAGING_APEX_DRIVER=m -# end of Gasket devices - -CONFIG_XIL_AXIS_FIFO=m -CONFIG_FIELDBUS_DEV=m -CONFIG_HMS_ANYBUSS_BUS=m -CONFIG_ARCX_ANYBUS_CONTROLLER=m -CONFIG_HMS_PROFINET=m -CONFIG_KPC2000=y -CONFIG_KPC2000_CORE=m -CONFIG_KPC2000_SPI=m -CONFIG_KPC2000_I2C=m -CONFIG_KPC2000_DMA=m -CONFIG_QLGE=m -CONFIG_WFX=m -CONFIG_X86_PLATFORM_DEVICES=y -CONFIG_ACPI_WMI=m 
-CONFIG_WMI_BMOF=m -CONFIG_ALIENWARE_WMI=m -CONFIG_HUAWEI_WMI=m -CONFIG_INTEL_WMI_THUNDERBOLT=m -CONFIG_MXM_WMI=m -CONFIG_PEAQ_WMI=m -CONFIG_XIAOMI_WMI=m -CONFIG_ACERHDF=m -CONFIG_ACER_WIRELESS=m -CONFIG_ACER_WMI=m -CONFIG_APPLE_GMUX=m -CONFIG_ASUS_LAPTOP=m -CONFIG_ASUS_WIRELESS=m -CONFIG_ASUS_WMI=m -CONFIG_ASUS_NB_WMI=m -CONFIG_EEEPC_LAPTOP=m -CONFIG_EEEPC_WMI=m -CONFIG_DCDBAS=m -CONFIG_DELL_SMBIOS=m -CONFIG_DELL_SMBIOS_WMI=y -CONFIG_DELL_SMBIOS_SMM=y -CONFIG_DELL_LAPTOP=m -CONFIG_DELL_RBTN=m -# CONFIG_DELL_RBU is not set -CONFIG_DELL_SMO8800=m -CONFIG_DELL_WMI=m -CONFIG_DELL_WMI_DESCRIPTOR=m -CONFIG_DELL_WMI_AIO=m -CONFIG_DELL_WMI_LED=m -CONFIG_AMILO_RFKILL=m -CONFIG_FUJITSU_LAPTOP=m -CONFIG_FUJITSU_TABLET=m -CONFIG_GPD_POCKET_FAN=m -CONFIG_HP_ACCEL=m -CONFIG_HP_WIRELESS=m -CONFIG_HP_WMI=m -CONFIG_IBM_RTL=m -CONFIG_IDEAPAD_LAPTOP=m -CONFIG_SENSORS_HDAPS=m -CONFIG_THINKPAD_ACPI=m -CONFIG_THINKPAD_ACPI_ALSA_SUPPORT=y -# CONFIG_THINKPAD_ACPI_DEBUGFACILITIES is not set -# CONFIG_THINKPAD_ACPI_DEBUG is not set -# CONFIG_THINKPAD_ACPI_UNSAFE_LEDS is not set -CONFIG_THINKPAD_ACPI_VIDEO=y -CONFIG_THINKPAD_ACPI_HOTKEY_POLL=y -CONFIG_INTEL_ATOMISP2_PM=m -CONFIG_INTEL_CHT_INT33FE=m -CONFIG_INTEL_HID_EVENT=m -CONFIG_INTEL_INT0002_VGPIO=m -CONFIG_INTEL_MENLOW=m -CONFIG_INTEL_OAKTRAIL=m -CONFIG_INTEL_VBTN=m -CONFIG_SURFACE3_WMI=m -CONFIG_SURFACE_3_BUTTON=m -CONFIG_SURFACE_3_POWER_OPREGION=m -CONFIG_SURFACE_PRO3_BUTTON=m -CONFIG_MSI_LAPTOP=m -CONFIG_MSI_WMI=m -CONFIG_PCENGINES_APU2=m -CONFIG_SAMSUNG_LAPTOP=m -CONFIG_SAMSUNG_Q10=m -CONFIG_ACPI_TOSHIBA=m -CONFIG_TOSHIBA_BT_RFKILL=m -CONFIG_TOSHIBA_HAPS=m -CONFIG_TOSHIBA_WMI=m -CONFIG_ACPI_CMPC=m -CONFIG_COMPAL_LAPTOP=m -CONFIG_LG_LAPTOP=m -CONFIG_PANASONIC_LAPTOP=m -CONFIG_SONY_LAPTOP=m -CONFIG_SONYPI_COMPAT=y -CONFIG_SYSTEM76_ACPI=m -CONFIG_TOPSTAR_LAPTOP=m -CONFIG_I2C_MULTI_INSTANTIATE=m -CONFIG_MLX_PLATFORM=m -CONFIG_TOUCHSCREEN_DMI=y -CONFIG_INTEL_IPS=m -CONFIG_INTEL_RST=m -CONFIG_INTEL_SMARTCONNECT=m - -# -# Intel Speed 
Select Technology interface support -# -CONFIG_INTEL_SPEED_SELECT_INTERFACE=m -# end of Intel Speed Select Technology interface support - -CONFIG_INTEL_TURBO_MAX_3=y -CONFIG_INTEL_UNCORE_FREQ_CONTROL=m -CONFIG_INTEL_BXTWC_PMIC_TMU=m -CONFIG_INTEL_CHTDC_TI_PWRBTN=m -CONFIG_INTEL_PMC_CORE=y -CONFIG_INTEL_PMC_IPC=m -CONFIG_INTEL_PUNIT_IPC=m -CONFIG_INTEL_TELEMETRY=m -CONFIG_PMC_ATOM=y -CONFIG_MFD_CROS_EC=m -CONFIG_CHROME_PLATFORMS=y -CONFIG_CHROMEOS_LAPTOP=m -CONFIG_CHROMEOS_PSTORE=m -CONFIG_CHROMEOS_TBMC=m -CONFIG_CROS_EC=m -CONFIG_CROS_EC_I2C=m -CONFIG_CROS_EC_RPMSG=m -CONFIG_CROS_EC_ISHTP=m -CONFIG_CROS_EC_SPI=m -CONFIG_CROS_EC_LPC=m -CONFIG_CROS_EC_PROTO=y -CONFIG_CROS_KBD_LED_BACKLIGHT=m -CONFIG_CROS_EC_CHARDEV=m -CONFIG_CROS_EC_LIGHTBAR=m -CONFIG_CROS_EC_VBC=m -# CONFIG_CROS_EC_DEBUGFS is not set -CONFIG_CROS_EC_SENSORHUB=m -CONFIG_CROS_EC_SYSFS=m -CONFIG_CROS_EC_TYPEC=m -CONFIG_CROS_USBPD_LOGGER=m -CONFIG_CROS_USBPD_NOTIFY=m -CONFIG_WILCO_EC=m -# CONFIG_WILCO_EC_DEBUGFS is not set -CONFIG_WILCO_EC_EVENTS=m -CONFIG_WILCO_EC_TELEMETRY=m -CONFIG_MELLANOX_PLATFORM=y -CONFIG_MLXREG_HOTPLUG=m -CONFIG_MLXREG_IO=m -CONFIG_CLKDEV_LOOKUP=y -CONFIG_HAVE_CLK_PREPARE=y -CONFIG_COMMON_CLK=y - -# -# Common Clock Framework -# -CONFIG_COMMON_CLK_WM831X=m -CONFIG_CLK_HSDK=y -CONFIG_COMMON_CLK_MAX77686=m -CONFIG_COMMON_CLK_MAX9485=m -CONFIG_COMMON_CLK_RK808=m -CONFIG_COMMON_CLK_SI5341=m -CONFIG_COMMON_CLK_SI5351=m -CONFIG_COMMON_CLK_SI514=m -CONFIG_COMMON_CLK_SI544=m -CONFIG_COMMON_CLK_SI570=m -CONFIG_COMMON_CLK_CDCE706=m -CONFIG_COMMON_CLK_CDCE925=m -CONFIG_COMMON_CLK_CS2000_CP=m -CONFIG_COMMON_CLK_S2MPS11=m -CONFIG_CLK_TWL6040=m -CONFIG_COMMON_CLK_LOCHNAGAR=m -CONFIG_COMMON_CLK_PALMAS=m -CONFIG_COMMON_CLK_PWM=m -CONFIG_COMMON_CLK_VC5=m -CONFIG_COMMON_CLK_BD718XX=m -CONFIG_COMMON_CLK_FIXED_MMIO=y -# end of Common Clock Framework - -CONFIG_HWSPINLOCK=y - -# -# Clock Source drivers -# -CONFIG_TIMER_OF=y -CONFIG_TIMER_PROBE=y -CONFIG_CLKEVT_I8253=y -CONFIG_I8253_LOCK=y 
-CONFIG_CLKBLD_I8253=y -CONFIG_CLKSRC_MMIO=y -CONFIG_MICROCHIP_PIT64B=y -# end of Clock Source drivers - -CONFIG_MAILBOX=y -CONFIG_PLATFORM_MHU=m -CONFIG_PCC=y -CONFIG_ALTERA_MBOX=m -CONFIG_MAILBOX_TEST=m -CONFIG_IOMMU_IOVA=y -CONFIG_IOASID=y -CONFIG_IOMMU_API=y -CONFIG_IOMMU_SUPPORT=y - -# -# Generic IOMMU Pagetable Support -# -# end of Generic IOMMU Pagetable Support - -# CONFIG_IOMMU_DEBUGFS is not set -# CONFIG_IOMMU_DEFAULT_PASSTHROUGH is not set -CONFIG_OF_IOMMU=y -CONFIG_IOMMU_DMA=y -CONFIG_AMD_IOMMU=y -CONFIG_AMD_IOMMU_V2=y -CONFIG_DMAR_TABLE=y -CONFIG_INTEL_IOMMU=y -CONFIG_INTEL_IOMMU_SVM=y -# CONFIG_INTEL_IOMMU_DEFAULT_ON is not set -CONFIG_INTEL_IOMMU_FLOPPY_WA=y -# CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON is not set -CONFIG_IRQ_REMAP=y -CONFIG_HYPERV_IOMMU=y - -# -# Remoteproc drivers -# -CONFIG_REMOTEPROC=y -# end of Remoteproc drivers - -# -# Rpmsg drivers -# -CONFIG_RPMSG=m -CONFIG_RPMSG_CHAR=m -CONFIG_RPMSG_QCOM_GLINK_NATIVE=m -CONFIG_RPMSG_QCOM_GLINK_RPM=m -CONFIG_RPMSG_VIRTIO=m -# end of Rpmsg drivers - -CONFIG_SOUNDWIRE=m - -# -# SoundWire Devices -# -CONFIG_SOUNDWIRE_CADENCE=m -CONFIG_SOUNDWIRE_INTEL=m -CONFIG_SOUNDWIRE_QCOM=m - -# -# SOC (System On Chip) specific Drivers -# - -# -# Amlogic SoC drivers -# -# end of Amlogic SoC drivers - -# -# Aspeed SoC drivers -# -# end of Aspeed SoC drivers - -# -# Broadcom SoC drivers -# -# end of Broadcom SoC drivers - -# -# NXP/Freescale QorIQ SoC drivers -# -# end of NXP/Freescale QorIQ SoC drivers - -# -# i.MX SoC drivers -# -# end of i.MX SoC drivers - -# -# Qualcomm SoC drivers -# -# end of Qualcomm SoC drivers - -CONFIG_SOC_TI=y - -# -# Xilinx SoC drivers -# -CONFIG_XILINX_VCU=m -# end of Xilinx SoC drivers -# end of SOC (System On Chip) specific Drivers - -CONFIG_PM_DEVFREQ=y - -# -# DEVFREQ Governors -# -CONFIG_DEVFREQ_GOV_SIMPLE_ONDEMAND=m -CONFIG_DEVFREQ_GOV_PERFORMANCE=m -CONFIG_DEVFREQ_GOV_POWERSAVE=m -CONFIG_DEVFREQ_GOV_USERSPACE=m -CONFIG_DEVFREQ_GOV_PASSIVE=m - -# -# DEVFREQ Drivers -# 
-CONFIG_PM_DEVFREQ_EVENT=y -CONFIG_EXTCON=y - -# -# Extcon Device Drivers -# -CONFIG_EXTCON_ADC_JACK=m -CONFIG_EXTCON_ARIZONA=m -CONFIG_EXTCON_AXP288=m -CONFIG_EXTCON_FSA9480=m -CONFIG_EXTCON_GPIO=m -CONFIG_EXTCON_INTEL_INT3496=m -CONFIG_EXTCON_INTEL_CHT_WC=m -CONFIG_EXTCON_MAX14577=m -CONFIG_EXTCON_MAX3355=m -CONFIG_EXTCON_MAX77693=m -CONFIG_EXTCON_MAX77843=m -CONFIG_EXTCON_MAX8997=m -CONFIG_EXTCON_PALMAS=m -CONFIG_EXTCON_PTN5150=m -CONFIG_EXTCON_RT8973A=m -CONFIG_EXTCON_SM5502=m -CONFIG_EXTCON_USB_GPIO=m -CONFIG_EXTCON_USBC_CROS_EC=m -CONFIG_MEMORY=y -CONFIG_IIO=m -CONFIG_IIO_BUFFER=y -CONFIG_IIO_BUFFER_CB=m -CONFIG_IIO_BUFFER_HW_CONSUMER=m -CONFIG_IIO_KFIFO_BUF=m -CONFIG_IIO_TRIGGERED_BUFFER=m -CONFIG_IIO_CONFIGFS=m -CONFIG_IIO_TRIGGER=y -CONFIG_IIO_CONSUMERS_PER_TRIGGER=2 -CONFIG_IIO_SW_DEVICE=m -CONFIG_IIO_SW_TRIGGER=m -CONFIG_IIO_TRIGGERED_EVENT=m - -# -# Accelerometers -# -CONFIG_ADIS16201=m -CONFIG_ADIS16209=m -CONFIG_ADXL372=m -CONFIG_ADXL372_SPI=m -CONFIG_ADXL372_I2C=m -CONFIG_BMA180=m -CONFIG_BMA220=m -CONFIG_BMA400=m -CONFIG_BMA400_I2C=m -CONFIG_BMC150_ACCEL=m -CONFIG_BMC150_ACCEL_I2C=m -CONFIG_BMC150_ACCEL_SPI=m -CONFIG_DA280=m -CONFIG_DA311=m -CONFIG_DMARD06=m -CONFIG_DMARD09=m -CONFIG_DMARD10=m -CONFIG_HID_SENSOR_ACCEL_3D=m -CONFIG_IIO_CROS_EC_ACCEL_LEGACY=m -CONFIG_IIO_ST_ACCEL_3AXIS=m -CONFIG_IIO_ST_ACCEL_I2C_3AXIS=m -CONFIG_IIO_ST_ACCEL_SPI_3AXIS=m -CONFIG_KXSD9=m -CONFIG_KXSD9_SPI=m -CONFIG_KXSD9_I2C=m -CONFIG_KXCJK1013=m -CONFIG_MC3230=m -CONFIG_MMA7455=m -CONFIG_MMA7455_I2C=m -CONFIG_MMA7455_SPI=m -CONFIG_MMA7660=m -CONFIG_MMA8452=m -CONFIG_MMA9551_CORE=m -CONFIG_MMA9551=m -CONFIG_MMA9553=m -CONFIG_MXC4005=m -CONFIG_MXC6255=m -CONFIG_SCA3000=m -CONFIG_STK8312=m -CONFIG_STK8BA50=m -# end of Accelerometers - -# -# Analog to digital converters -# -CONFIG_AD_SIGMA_DELTA=m -CONFIG_AD7091R5=m -CONFIG_AD7124=m -CONFIG_AD7192=m -CONFIG_AD7266=m -CONFIG_AD7291=m -CONFIG_AD7292=m -CONFIG_AD7298=m -CONFIG_AD7476=m -CONFIG_AD7606=m 
-CONFIG_AD7606_IFACE_PARALLEL=m -CONFIG_AD7606_IFACE_SPI=m -CONFIG_AD7766=m -CONFIG_AD7768_1=m -CONFIG_AD7780=m -CONFIG_AD7791=m -CONFIG_AD7793=m -CONFIG_AD7887=m -CONFIG_AD7923=m -CONFIG_AD7949=m -CONFIG_AD799X=m -CONFIG_AXP20X_ADC=m -CONFIG_AXP288_ADC=m -CONFIG_CC10001_ADC=m -CONFIG_CPCAP_ADC=m -CONFIG_DA9150_GPADC=m -CONFIG_DLN2_ADC=m -CONFIG_ENVELOPE_DETECTOR=m -CONFIG_HI8435=m -CONFIG_HX711=m -CONFIG_INA2XX_ADC=m -CONFIG_LP8788_ADC=m -CONFIG_LTC2471=m -CONFIG_LTC2485=m -CONFIG_LTC2496=m -CONFIG_LTC2497=m -CONFIG_MAX1027=m -CONFIG_MAX11100=m -CONFIG_MAX1118=m -CONFIG_MAX1363=m -CONFIG_MAX9611=m -CONFIG_MCP320X=m -CONFIG_MCP3422=m -CONFIG_MCP3911=m -CONFIG_MEN_Z188_ADC=m -CONFIG_NAU7802=m -CONFIG_PALMAS_GPADC=m -CONFIG_QCOM_VADC_COMMON=m -CONFIG_QCOM_SPMI_IADC=m -CONFIG_QCOM_SPMI_VADC=m -CONFIG_QCOM_SPMI_ADC5=m -CONFIG_RN5T618_ADC=m -CONFIG_SD_ADC_MODULATOR=m -CONFIG_STMPE_ADC=m -CONFIG_TI_ADC081C=m -CONFIG_TI_ADC0832=m -CONFIG_TI_ADC084S021=m -CONFIG_TI_ADC12138=m -CONFIG_TI_ADC108S102=m -CONFIG_TI_ADC128S052=m -CONFIG_TI_ADC161S626=m -CONFIG_TI_ADS1015=m -CONFIG_TI_ADS7950=m -CONFIG_TI_ADS8344=m -CONFIG_TI_ADS8688=m -CONFIG_TI_ADS124S08=m -CONFIG_TI_AM335X_ADC=m -CONFIG_TI_TLC4541=m -CONFIG_TWL4030_MADC=m -CONFIG_TWL6030_GPADC=m -CONFIG_VF610_ADC=m -CONFIG_VIPERBOARD_ADC=m -CONFIG_XILINX_XADC=m -# end of Analog to digital converters - -# -# Analog Front Ends -# -CONFIG_IIO_RESCALE=m -# end of Analog Front Ends - -# -# Amplifiers -# -CONFIG_AD8366=m -CONFIG_HMC425=m -# end of Amplifiers - -# -# Chemical Sensors -# -CONFIG_ATLAS_PH_SENSOR=m -CONFIG_BME680=m -CONFIG_BME680_I2C=m -CONFIG_BME680_SPI=m -CONFIG_CCS811=m -CONFIG_IAQCORE=m -CONFIG_PMS7003=m -CONFIG_SENSIRION_SGP30=m -CONFIG_SPS30=m -CONFIG_VZ89X=m -# end of Chemical Sensors - -CONFIG_IIO_CROS_EC_SENSORS_CORE=m -CONFIG_IIO_CROS_EC_SENSORS=m -CONFIG_IIO_CROS_EC_SENSORS_LID_ANGLE=m - -# -# Hid Sensor IIO Common -# -CONFIG_HID_SENSOR_IIO_COMMON=m -CONFIG_HID_SENSOR_IIO_TRIGGER=m -# end of Hid Sensor IIO 
Common - -CONFIG_IIO_MS_SENSORS_I2C=m - -# -# SSP Sensor Common -# -CONFIG_IIO_SSP_SENSORS_COMMONS=m -CONFIG_IIO_SSP_SENSORHUB=m -# end of SSP Sensor Common - -CONFIG_IIO_ST_SENSORS_I2C=m -CONFIG_IIO_ST_SENSORS_SPI=m -CONFIG_IIO_ST_SENSORS_CORE=m - -# -# Digital to analog converters -# -CONFIG_AD5064=m -CONFIG_AD5360=m -CONFIG_AD5380=m -CONFIG_AD5421=m -CONFIG_AD5446=m -CONFIG_AD5449=m -CONFIG_AD5592R_BASE=m -CONFIG_AD5592R=m -CONFIG_AD5593R=m -CONFIG_AD5504=m -CONFIG_AD5624R_SPI=m -CONFIG_AD5686=m -CONFIG_AD5686_SPI=m -CONFIG_AD5696_I2C=m -CONFIG_AD5755=m -CONFIG_AD5758=m -CONFIG_AD5761=m -CONFIG_AD5764=m -CONFIG_AD5770R=m -CONFIG_AD5791=m -CONFIG_AD7303=m -CONFIG_AD8801=m -CONFIG_DPOT_DAC=m -CONFIG_DS4424=m -CONFIG_LTC1660=m -CONFIG_LTC2632=m -CONFIG_M62332=m -CONFIG_MAX517=m -CONFIG_MAX5821=m -CONFIG_MCP4725=m -CONFIG_MCP4922=m -CONFIG_TI_DAC082S085=m -CONFIG_TI_DAC5571=m -CONFIG_TI_DAC7311=m -CONFIG_TI_DAC7612=m -CONFIG_VF610_DAC=m -# end of Digital to analog converters - -# -# IIO dummy driver -# -# CONFIG_IIO_SIMPLE_DUMMY is not set -# end of IIO dummy driver - -# -# Frequency Synthesizers DDS/PLL -# - -# -# Clock Generator/Distribution -# -CONFIG_AD9523=m -# end of Clock Generator/Distribution - -# -# Phase-Locked Loop (PLL) frequency synthesizers -# -CONFIG_ADF4350=m -CONFIG_ADF4371=m -# end of Phase-Locked Loop (PLL) frequency synthesizers -# end of Frequency Synthesizers DDS/PLL - -# -# Digital gyroscope sensors -# -CONFIG_ADIS16080=m -CONFIG_ADIS16130=m -CONFIG_ADIS16136=m -CONFIG_ADIS16260=m -CONFIG_ADXRS450=m -CONFIG_BMG160=m -CONFIG_BMG160_I2C=m -CONFIG_BMG160_SPI=m -CONFIG_FXAS21002C=m -CONFIG_FXAS21002C_I2C=m -CONFIG_FXAS21002C_SPI=m -CONFIG_HID_SENSOR_GYRO_3D=m -CONFIG_MPU3050=m -CONFIG_MPU3050_I2C=m -CONFIG_IIO_ST_GYRO_3AXIS=m -CONFIG_IIO_ST_GYRO_I2C_3AXIS=m -CONFIG_IIO_ST_GYRO_SPI_3AXIS=m -CONFIG_ITG3200=m -# end of Digital gyroscope sensors - -# -# Health Sensors -# - -# -# Heart Rate Monitors -# -CONFIG_AFE4403=m -CONFIG_AFE4404=m 
-CONFIG_MAX30100=m -CONFIG_MAX30102=m -# end of Heart Rate Monitors -# end of Health Sensors - -# -# Humidity sensors -# -CONFIG_AM2315=m -CONFIG_DHT11=m -CONFIG_HDC100X=m -CONFIG_HID_SENSOR_HUMIDITY=m -CONFIG_HTS221=m -CONFIG_HTS221_I2C=m -CONFIG_HTS221_SPI=m -CONFIG_HTU21=m -CONFIG_SI7005=m -CONFIG_SI7020=m -# end of Humidity sensors - -# -# Inertial measurement units -# -CONFIG_ADIS16400=m -CONFIG_ADIS16460=m -CONFIG_ADIS16480=m -CONFIG_BMI160=m -CONFIG_BMI160_I2C=m -CONFIG_BMI160_SPI=m -CONFIG_FXOS8700=m -CONFIG_FXOS8700_I2C=m -CONFIG_FXOS8700_SPI=m -CONFIG_KMX61=m -CONFIG_INV_MPU6050_IIO=m -CONFIG_INV_MPU6050_I2C=m -CONFIG_INV_MPU6050_SPI=m -CONFIG_IIO_ST_LSM6DSX=m -CONFIG_IIO_ST_LSM6DSX_I2C=m -CONFIG_IIO_ST_LSM6DSX_SPI=m -CONFIG_IIO_ST_LSM6DSX_I3C=m -# end of Inertial measurement units - -CONFIG_IIO_ADIS_LIB=m -CONFIG_IIO_ADIS_LIB_BUFFER=y - -# -# Light sensors -# -CONFIG_ACPI_ALS=m -CONFIG_ADJD_S311=m -CONFIG_ADUX1020=m -CONFIG_AL3010=m -CONFIG_AL3320A=m -CONFIG_APDS9300=m -CONFIG_APDS9960=m -CONFIG_BH1750=m -CONFIG_BH1780=m -CONFIG_CM32181=m -CONFIG_CM3232=m -CONFIG_CM3323=m -CONFIG_CM3605=m -CONFIG_CM36651=m -CONFIG_IIO_CROS_EC_LIGHT_PROX=m -CONFIG_GP2AP002=m -CONFIG_GP2AP020A00F=m -CONFIG_IQS621_ALS=m -CONFIG_SENSORS_ISL29018=m -CONFIG_SENSORS_ISL29028=m -CONFIG_ISL29125=m -CONFIG_HID_SENSOR_ALS=m -CONFIG_HID_SENSOR_PROX=m -CONFIG_JSA1212=m -CONFIG_RPR0521=m -CONFIG_SENSORS_LM3533=m -CONFIG_LTR501=m -CONFIG_LV0104CS=m -CONFIG_MAX44000=m -CONFIG_MAX44009=m -CONFIG_NOA1305=m -CONFIG_OPT3001=m -CONFIG_PA12203001=m -CONFIG_SI1133=m -CONFIG_SI1145=m -CONFIG_STK3310=m -CONFIG_ST_UVIS25=m -CONFIG_ST_UVIS25_I2C=m -CONFIG_ST_UVIS25_SPI=m -CONFIG_TCS3414=m -CONFIG_TCS3472=m -CONFIG_SENSORS_TSL2563=m -CONFIG_TSL2583=m -CONFIG_TSL2772=m -CONFIG_TSL4531=m -CONFIG_US5182D=m -CONFIG_VCNL4000=m -CONFIG_VCNL4035=m -CONFIG_VEML6030=m -CONFIG_VEML6070=m -CONFIG_VL6180=m -CONFIG_ZOPT2201=m -# end of Light sensors - -# -# Magnetometer sensors -# -CONFIG_AK8974=m 
-CONFIG_AK8975=m -CONFIG_AK09911=m -CONFIG_BMC150_MAGN=m -CONFIG_BMC150_MAGN_I2C=m -CONFIG_BMC150_MAGN_SPI=m -CONFIG_MAG3110=m -CONFIG_HID_SENSOR_MAGNETOMETER_3D=m -CONFIG_MMC35240=m -CONFIG_IIO_ST_MAGN_3AXIS=m -CONFIG_IIO_ST_MAGN_I2C_3AXIS=m -CONFIG_IIO_ST_MAGN_SPI_3AXIS=m -CONFIG_SENSORS_HMC5843=m -CONFIG_SENSORS_HMC5843_I2C=m -CONFIG_SENSORS_HMC5843_SPI=m -CONFIG_SENSORS_RM3100=m -CONFIG_SENSORS_RM3100_I2C=m -CONFIG_SENSORS_RM3100_SPI=m -# end of Magnetometer sensors - -# -# Multiplexers -# -CONFIG_IIO_MUX=m -# end of Multiplexers - -# -# Inclinometer sensors -# -CONFIG_HID_SENSOR_INCLINOMETER_3D=m -CONFIG_HID_SENSOR_DEVICE_ROTATION=m -# end of Inclinometer sensors - -# -# Triggers - standalone -# -CONFIG_IIO_HRTIMER_TRIGGER=m -CONFIG_IIO_INTERRUPT_TRIGGER=m -CONFIG_IIO_TIGHTLOOP_TRIGGER=m -CONFIG_IIO_SYSFS_TRIGGER=m -# end of Triggers - standalone - -# -# Linear and angular position sensors -# -CONFIG_IQS624_POS=m -# end of Linear and angular position sensors - -# -# Digital potentiometers -# -CONFIG_AD5272=m -CONFIG_DS1803=m -CONFIG_MAX5432=m -CONFIG_MAX5481=m -CONFIG_MAX5487=m -CONFIG_MCP4018=m -CONFIG_MCP4131=m -CONFIG_MCP4531=m -CONFIG_MCP41010=m -CONFIG_TPL0102=m -# end of Digital potentiometers - -# -# Digital potentiostats -# -CONFIG_LMP91000=m -# end of Digital potentiostats - -# -# Pressure sensors -# -CONFIG_ABP060MG=m -CONFIG_BMP280=m -CONFIG_BMP280_I2C=m -CONFIG_BMP280_SPI=m -CONFIG_IIO_CROS_EC_BARO=m -CONFIG_DLHL60D=m -CONFIG_DPS310=m -CONFIG_HID_SENSOR_PRESS=m -CONFIG_HP03=m -CONFIG_ICP10100=m -CONFIG_MPL115=m -CONFIG_MPL115_I2C=m -CONFIG_MPL115_SPI=m -CONFIG_MPL3115=m -CONFIG_MS5611=m -CONFIG_MS5611_I2C=m -CONFIG_MS5611_SPI=m -CONFIG_MS5637=m -CONFIG_IIO_ST_PRESS=m -CONFIG_IIO_ST_PRESS_I2C=m -CONFIG_IIO_ST_PRESS_SPI=m -CONFIG_T5403=m -CONFIG_HP206C=m -CONFIG_ZPA2326=m -CONFIG_ZPA2326_I2C=m -CONFIG_ZPA2326_SPI=m -# end of Pressure sensors - -# -# Lightning sensors -# -CONFIG_AS3935=m -# end of Lightning sensors - -# -# Proximity and distance 
sensors -# -CONFIG_ISL29501=m -CONFIG_LIDAR_LITE_V2=m -CONFIG_MB1232=m -CONFIG_PING=m -CONFIG_RFD77402=m -CONFIG_SRF04=m -CONFIG_SX9500=m -CONFIG_SRF08=m -CONFIG_VL53L0X_I2C=m -# end of Proximity and distance sensors - -# -# Resolver to digital converters -# -CONFIG_AD2S90=m -CONFIG_AD2S1200=m -# end of Resolver to digital converters - -# -# Temperature sensors -# -CONFIG_IQS620AT_TEMP=m -CONFIG_LTC2983=m -CONFIG_MAXIM_THERMOCOUPLE=m -CONFIG_HID_SENSOR_TEMP=m -CONFIG_MLX90614=m -CONFIG_MLX90632=m -CONFIG_TMP006=m -CONFIG_TMP007=m -CONFIG_TSYS01=m -CONFIG_TSYS02D=m -CONFIG_MAX31856=m -# end of Temperature sensors - -CONFIG_NTB=m -CONFIG_NTB_MSI=y -CONFIG_NTB_AMD=m -CONFIG_NTB_IDT=m -CONFIG_NTB_INTEL=m -CONFIG_NTB_SWITCHTEC=m -# CONFIG_NTB_PINGPONG is not set -# CONFIG_NTB_TOOL is not set -# CONFIG_NTB_PERF is not set -# CONFIG_NTB_MSI_TEST is not set -CONFIG_NTB_TRANSPORT=m -CONFIG_VME_BUS=y - -# -# VME Bridge Drivers -# -CONFIG_VME_CA91CX42=m -CONFIG_VME_TSI148=m -# CONFIG_VME_FAKE is not set - -# -# VME Board Drivers -# -CONFIG_VMIVME_7805=m - -# -# VME Device Drivers -# -CONFIG_VME_USER=m -CONFIG_PWM=y -CONFIG_PWM_SYSFS=y -# CONFIG_PWM_DEBUG is not set -CONFIG_PWM_ATMEL_HLCDC_PWM=m -CONFIG_PWM_CRC=y -CONFIG_PWM_CROS_EC=m -CONFIG_PWM_FSL_FTM=m -CONFIG_PWM_LP3943=m -CONFIG_PWM_LPSS=m -CONFIG_PWM_LPSS_PCI=m -CONFIG_PWM_LPSS_PLATFORM=m -CONFIG_PWM_PCA9685=m -CONFIG_PWM_STMPE=y -CONFIG_PWM_TWL=m -CONFIG_PWM_TWL_LED=m - -# -# IRQ chip support -# -CONFIG_IRQCHIP=y -CONFIG_AL_FIC=y -CONFIG_MADERA_IRQ=m -# end of IRQ chip support - -CONFIG_IPACK_BUS=m -CONFIG_BOARD_TPCI200=m -CONFIG_SERIAL_IPOCTAL=m -CONFIG_RESET_CONTROLLER=y -CONFIG_RESET_BRCMSTB_RESCAL=y -CONFIG_RESET_INTEL_GW=y -CONFIG_RESET_TI_SYSCON=m - -# -# PHY Subsystem -# -CONFIG_GENERIC_PHY=y -CONFIG_GENERIC_PHY_MIPI_DPHY=y -CONFIG_BCM_KONA_USB2_PHY=m -CONFIG_PHY_CADENCE_TORRENT=m -CONFIG_PHY_CADENCE_DPHY=m -CONFIG_PHY_CADENCE_SIERRA=m -CONFIG_PHY_FSL_IMX8MQ_USB=m -CONFIG_PHY_MIXEL_MIPI_DPHY=m 
-CONFIG_PHY_PXA_28NM_HSIC=m -CONFIG_PHY_PXA_28NM_USB2=m -CONFIG_PHY_CPCAP_USB=m -CONFIG_PHY_MAPPHONE_MDM6600=m -CONFIG_PHY_OCELOT_SERDES=m -CONFIG_PHY_QCOM_USB_HS=m -CONFIG_PHY_QCOM_USB_HSIC=m -CONFIG_PHY_SAMSUNG_USB2=m -CONFIG_PHY_TUSB1210=m -CONFIG_PHY_INTEL_EMMC=m -# end of PHY Subsystem - -CONFIG_POWERCAP=y -CONFIG_INTEL_RAPL_CORE=m -CONFIG_INTEL_RAPL=m -CONFIG_IDLE_INJECT=y -CONFIG_MCB=m -CONFIG_MCB_PCI=m -CONFIG_MCB_LPC=m - -# -# Performance monitor support -# -# end of Performance monitor support - -CONFIG_RAS=y -CONFIG_RAS_CEC=y -# CONFIG_RAS_CEC_DEBUG is not set -CONFIG_USB4=m - -# -# Android -# -# CONFIG_ANDROID is not set -# end of Android - -CONFIG_LIBNVDIMM=y -CONFIG_BLK_DEV_PMEM=m -CONFIG_ND_BLK=m -CONFIG_ND_CLAIM=y -CONFIG_ND_BTT=m -CONFIG_BTT=y -CONFIG_ND_PFN=m -CONFIG_NVDIMM_PFN=y -CONFIG_NVDIMM_DAX=y -CONFIG_OF_PMEM=m -CONFIG_DAX_DRIVER=y -CONFIG_DAX=y -CONFIG_DEV_DAX=m -CONFIG_DEV_DAX_PMEM=m -CONFIG_DEV_DAX_HMEM=m -CONFIG_DEV_DAX_KMEM=m -CONFIG_DEV_DAX_PMEM_COMPAT=m -CONFIG_NVMEM=y -CONFIG_NVMEM_SYSFS=y -CONFIG_NVMEM_SPMI_SDAM=m -CONFIG_RAVE_SP_EEPROM=m - -# -# HW tracing support -# -CONFIG_STM=m -CONFIG_STM_PROTO_BASIC=m -CONFIG_STM_PROTO_SYS_T=m -# CONFIG_STM_DUMMY is not set -CONFIG_STM_SOURCE_CONSOLE=m -CONFIG_STM_SOURCE_HEARTBEAT=m -CONFIG_STM_SOURCE_FTRACE=m -CONFIG_INTEL_TH=m -CONFIG_INTEL_TH_PCI=m -CONFIG_INTEL_TH_ACPI=m -CONFIG_INTEL_TH_GTH=m -CONFIG_INTEL_TH_STH=m -CONFIG_INTEL_TH_MSU=m -CONFIG_INTEL_TH_PTI=m -# CONFIG_INTEL_TH_DEBUG is not set -# end of HW tracing support - -CONFIG_FPGA=m -CONFIG_ALTERA_PR_IP_CORE=m -CONFIG_ALTERA_PR_IP_CORE_PLAT=m -CONFIG_FPGA_MGR_ALTERA_PS_SPI=m -CONFIG_FPGA_MGR_ALTERA_CVP=m -CONFIG_FPGA_MGR_XILINX_SPI=m -CONFIG_FPGA_MGR_ICE40_SPI=m -CONFIG_FPGA_MGR_MACHXO2_SPI=m -CONFIG_FPGA_BRIDGE=m -CONFIG_ALTERA_FREEZE_BRIDGE=m -CONFIG_XILINX_PR_DECOUPLER=m -CONFIG_FPGA_REGION=m -CONFIG_OF_FPGA_REGION=m -CONFIG_FPGA_DFL=m -CONFIG_FPGA_DFL_FME=m -CONFIG_FPGA_DFL_FME_MGR=m -CONFIG_FPGA_DFL_FME_BRIDGE=m 
-CONFIG_FPGA_DFL_FME_REGION=m -CONFIG_FPGA_DFL_AFU=m -CONFIG_FPGA_DFL_PCI=m -CONFIG_FSI=m -CONFIG_FSI_NEW_DEV_NODE=y -CONFIG_FSI_MASTER_GPIO=m -CONFIG_FSI_MASTER_HUB=m -CONFIG_FSI_MASTER_ASPEED=m -CONFIG_FSI_SCOM=m -CONFIG_FSI_SBEFIFO=m -CONFIG_FSI_OCC=m -CONFIG_TEE=m - -# -# TEE drivers -# -CONFIG_AMDTEE=m -# end of TEE drivers - -CONFIG_MULTIPLEXER=m - -# -# Multiplexer drivers -# -CONFIG_MUX_ADG792A=m -CONFIG_MUX_ADGS1408=m -CONFIG_MUX_GPIO=m -CONFIG_MUX_MMIO=m -# end of Multiplexer drivers - -CONFIG_PM_OPP=y -CONFIG_UNISYS_VISORBUS=m -CONFIG_SIOX=m -CONFIG_SIOX_BUS_GPIO=m -CONFIG_SLIMBUS=m -CONFIG_SLIM_QCOM_CTRL=m -CONFIG_INTERCONNECT=m -CONFIG_COUNTER=m -CONFIG_FTM_QUADDEC=m -CONFIG_MOST=m -# end of Device Drivers - -# -# File systems -# -CONFIG_DCACHE_WORD_ACCESS=y -CONFIG_VALIDATE_FS_PARSER=y -CONFIG_FS_IOMAP=y -# CONFIG_EXT2_FS is not set -# CONFIG_EXT3_FS is not set -CONFIG_EXT4_FS=m -CONFIG_EXT4_USE_FOR_EXT2=y -CONFIG_EXT4_FS_POSIX_ACL=y -CONFIG_EXT4_FS_SECURITY=y -# CONFIG_EXT4_DEBUG is not set -CONFIG_JBD2=m -# CONFIG_JBD2_DEBUG is not set -CONFIG_FS_MBCACHE=m -CONFIG_REISERFS_FS=m -# CONFIG_REISERFS_CHECK is not set -CONFIG_REISERFS_PROC_INFO=y -CONFIG_REISERFS_FS_XATTR=y -CONFIG_REISERFS_FS_POSIX_ACL=y -CONFIG_REISERFS_FS_SECURITY=y -CONFIG_JFS_FS=m -CONFIG_JFS_POSIX_ACL=y -CONFIG_JFS_SECURITY=y -# CONFIG_JFS_DEBUG is not set -CONFIG_JFS_STATISTICS=y -CONFIG_XFS_FS=m -CONFIG_XFS_QUOTA=y -CONFIG_XFS_POSIX_ACL=y -CONFIG_XFS_RT=y -CONFIG_XFS_ONLINE_SCRUB=y -CONFIG_XFS_ONLINE_REPAIR=y -# CONFIG_XFS_WARN is not set -# CONFIG_XFS_DEBUG is not set -CONFIG_GFS2_FS=m -CONFIG_GFS2_FS_LOCKING_DLM=y -CONFIG_OCFS2_FS=m -CONFIG_OCFS2_FS_O2CB=m -CONFIG_OCFS2_FS_USERSPACE_CLUSTER=m -CONFIG_OCFS2_FS_STATS=y -CONFIG_OCFS2_DEBUG_MASKLOG=y -# CONFIG_OCFS2_DEBUG_FS is not set -CONFIG_BTRFS_FS=m -CONFIG_BTRFS_FS_POSIX_ACL=y -# CONFIG_BTRFS_FS_CHECK_INTEGRITY is not set -# CONFIG_BTRFS_FS_RUN_SANITY_TESTS is not set -# CONFIG_BTRFS_DEBUG is not set -# CONFIG_BTRFS_ASSERT is 
not set -# CONFIG_BTRFS_FS_REF_VERIFY is not set -CONFIG_NILFS2_FS=m -CONFIG_F2FS_FS=m -CONFIG_F2FS_STAT_FS=y -CONFIG_F2FS_FS_XATTR=y -CONFIG_F2FS_FS_POSIX_ACL=y -CONFIG_F2FS_FS_SECURITY=y -CONFIG_F2FS_CHECK_FS=y -# CONFIG_F2FS_IO_TRACE is not set -# CONFIG_F2FS_FAULT_INJECTION is not set -CONFIG_F2FS_FS_COMPRESSION=y -CONFIG_F2FS_FS_LZO=y -CONFIG_F2FS_FS_LZ4=y -CONFIG_F2FS_FS_ZSTD=y -CONFIG_ZONEFS_FS=m -CONFIG_FS_DAX=y -CONFIG_FS_DAX_PMD=y -CONFIG_FS_POSIX_ACL=y -CONFIG_EXPORTFS=y -CONFIG_EXPORTFS_BLOCK_OPS=y -CONFIG_FILE_LOCKING=y -# CONFIG_MANDATORY_FILE_LOCKING is not set -CONFIG_FS_ENCRYPTION=y -CONFIG_FS_ENCRYPTION_ALGS=m -CONFIG_FS_VERITY=y -# CONFIG_FS_VERITY_DEBUG is not set -CONFIG_FS_VERITY_BUILTIN_SIGNATURES=y -CONFIG_FSNOTIFY=y -CONFIG_DNOTIFY=y -CONFIG_INOTIFY_USER=y -CONFIG_FANOTIFY=y -CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y -CONFIG_QUOTA=y -CONFIG_QUOTA_NETLINK_INTERFACE=y -# CONFIG_PRINT_QUOTA_WARNING is not set -# CONFIG_QUOTA_DEBUG is not set -CONFIG_QUOTA_TREE=m -CONFIG_QFMT_V1=m -CONFIG_QFMT_V2=m -CONFIG_QUOTACTL=y -CONFIG_QUOTACTL_COMPAT=y -CONFIG_AUTOFS4_FS=y -CONFIG_AUTOFS_FS=y -CONFIG_FUSE_FS=m -CONFIG_CUSE=m -CONFIG_VIRTIO_FS=m -CONFIG_OVERLAY_FS=m -CONFIG_OVERLAY_FS_REDIRECT_DIR=y -# CONFIG_OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW is not set -CONFIG_OVERLAY_FS_INDEX=y -CONFIG_OVERLAY_FS_XINO_AUTO=y -CONFIG_OVERLAY_FS_METACOPY=y - -# -# Caches -# -CONFIG_FSCACHE=m -CONFIG_FSCACHE_STATS=y -CONFIG_FSCACHE_HISTOGRAM=y -# CONFIG_FSCACHE_DEBUG is not set -# CONFIG_FSCACHE_OBJECT_LIST is not set -CONFIG_CACHEFILES=m -# CONFIG_CACHEFILES_DEBUG is not set -# CONFIG_CACHEFILES_HISTOGRAM is not set -# end of Caches - -# -# CD-ROM/DVD Filesystems -# -CONFIG_ISO9660_FS=m -CONFIG_JOLIET=y -CONFIG_ZISOFS=y -CONFIG_UDF_FS=m -# end of CD-ROM/DVD Filesystems - -# -# DOS/FAT/EXFAT/NT Filesystems -# -CONFIG_FAT_FS=m -CONFIG_MSDOS_FS=m -CONFIG_VFAT_FS=m -CONFIG_FAT_DEFAULT_CODEPAGE=437 -CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1" -CONFIG_FAT_DEFAULT_UTF8=y 
-CONFIG_EXFAT_FS=m -CONFIG_EXFAT_DEFAULT_IOCHARSET="utf8" -CONFIG_NTFS_FS=m -# CONFIG_NTFS_DEBUG is not set -CONFIG_NTFS_RW=y -# end of DOS/FAT/EXFAT/NT Filesystems - -# -# Pseudo filesystems -# -CONFIG_PROC_FS=y -# CONFIG_PROC_KCORE is not set -# CONFIG_PROC_VMCORE is not set -CONFIG_PROC_SYSCTL=y -CONFIG_PROC_PAGE_MONITOR=y -CONFIG_PROC_CHILDREN=y -CONFIG_PROC_PID_ARCH_STATUS=y -CONFIG_PROC_CPU_RESCTRL=y -CONFIG_KERNFS=y -CONFIG_SYSFS=y -CONFIG_TMPFS=y -CONFIG_TMPFS_POSIX_ACL=y -CONFIG_TMPFS_XATTR=y -CONFIG_HUGETLBFS=y -CONFIG_HUGETLB_PAGE=y -CONFIG_MEMFD_CREATE=y -CONFIG_ARCH_HAS_GIGANTIC_PAGE=y -CONFIG_CONFIGFS_FS=y -CONFIG_EFIVAR_FS=y -# end of Pseudo filesystems - -CONFIG_MISC_FILESYSTEMS=y -CONFIG_ORANGEFS_FS=m -# CONFIG_ADFS_FS is not set -CONFIG_AFFS_FS=m -CONFIG_ECRYPT_FS=m -# CONFIG_ECRYPT_FS_MESSAGING is not set -CONFIG_HFS_FS=m -CONFIG_HFSPLUS_FS=m -CONFIG_BEFS_FS=m -# CONFIG_BEFS_DEBUG is not set -# CONFIG_BFS_FS is not set -# CONFIG_EFS_FS is not set -CONFIG_JFFS2_FS=m -CONFIG_JFFS2_FS_DEBUG=0 -CONFIG_JFFS2_FS_WRITEBUFFER=y -# CONFIG_JFFS2_FS_WBUF_VERIFY is not set -CONFIG_JFFS2_SUMMARY=y -CONFIG_JFFS2_FS_XATTR=y -CONFIG_JFFS2_FS_POSIX_ACL=y -CONFIG_JFFS2_FS_SECURITY=y -# CONFIG_JFFS2_COMPRESSION_OPTIONS is not set -CONFIG_JFFS2_ZLIB=y -CONFIG_JFFS2_RTIME=y -CONFIG_UBIFS_FS=m -# CONFIG_UBIFS_FS_ADVANCED_COMPR is not set -CONFIG_UBIFS_FS_LZO=y -CONFIG_UBIFS_FS_ZLIB=y -CONFIG_UBIFS_FS_ZSTD=y -CONFIG_UBIFS_ATIME_SUPPORT=y -CONFIG_UBIFS_FS_XATTR=y -CONFIG_UBIFS_FS_SECURITY=y -CONFIG_UBIFS_FS_AUTHENTICATION=y -CONFIG_CRAMFS=m -CONFIG_CRAMFS_BLOCKDEV=y -CONFIG_CRAMFS_MTD=y -CONFIG_SQUASHFS=m -# CONFIG_SQUASHFS_FILE_CACHE is not set -CONFIG_SQUASHFS_FILE_DIRECT=y -# CONFIG_SQUASHFS_DECOMP_SINGLE is not set -CONFIG_SQUASHFS_DECOMP_MULTI=y -# CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU is not set -CONFIG_SQUASHFS_XATTR=y -CONFIG_SQUASHFS_ZLIB=y -CONFIG_SQUASHFS_LZ4=y -CONFIG_SQUASHFS_LZO=y -CONFIG_SQUASHFS_XZ=y -CONFIG_SQUASHFS_ZSTD=y -# 
CONFIG_SQUASHFS_4K_DEVBLK_SIZE is not set -# CONFIG_SQUASHFS_EMBEDDED is not set -CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE=3 -# CONFIG_VXFS_FS is not set -CONFIG_MINIX_FS=m -CONFIG_OMFS_FS=m -# CONFIG_HPFS_FS is not set -# CONFIG_QNX4FS_FS is not set -# CONFIG_QNX6FS_FS is not set -CONFIG_ROMFS_FS=m -CONFIG_ROMFS_BACKED_BY_BLOCK=y -# CONFIG_ROMFS_BACKED_BY_MTD is not set -# CONFIG_ROMFS_BACKED_BY_BOTH is not set -CONFIG_ROMFS_ON_BLOCK=y -CONFIG_PSTORE=y -CONFIG_PSTORE_DEFLATE_COMPRESS=m -CONFIG_PSTORE_LZO_COMPRESS=m -CONFIG_PSTORE_LZ4_COMPRESS=m -CONFIG_PSTORE_LZ4HC_COMPRESS=m -# CONFIG_PSTORE_842_COMPRESS is not set -CONFIG_PSTORE_ZSTD_COMPRESS=y -CONFIG_PSTORE_COMPRESS=y -# CONFIG_PSTORE_DEFLATE_COMPRESS_DEFAULT is not set -# CONFIG_PSTORE_LZO_COMPRESS_DEFAULT is not set -# CONFIG_PSTORE_LZ4_COMPRESS_DEFAULT is not set -# CONFIG_PSTORE_LZ4HC_COMPRESS_DEFAULT is not set -CONFIG_PSTORE_ZSTD_COMPRESS_DEFAULT=y -CONFIG_PSTORE_COMPRESS_DEFAULT="zstd" -# CONFIG_PSTORE_CONSOLE is not set -# CONFIG_PSTORE_PMSG is not set -# CONFIG_PSTORE_FTRACE is not set -CONFIG_PSTORE_RAM=y -# CONFIG_SYSV_FS is not set -CONFIG_UFS_FS=m -# CONFIG_UFS_FS_WRITE is not set -# CONFIG_UFS_DEBUG is not set -CONFIG_EROFS_FS=m -# CONFIG_EROFS_FS_DEBUG is not set -CONFIG_EROFS_FS_XATTR=y -CONFIG_EROFS_FS_POSIX_ACL=y -CONFIG_EROFS_FS_SECURITY=y -CONFIG_EROFS_FS_ZIP=y -CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT=2 -CONFIG_VBOXSF_FS=m -CONFIG_NETWORK_FILESYSTEMS=y -CONFIG_NFS_FS=m -CONFIG_NFS_V2=m -CONFIG_NFS_V3=m -CONFIG_NFS_V3_ACL=y -CONFIG_NFS_V4=m -CONFIG_NFS_SWAP=y -CONFIG_NFS_V4_1=y -CONFIG_NFS_V4_2=y -CONFIG_PNFS_FILE_LAYOUT=m -CONFIG_PNFS_BLOCK=m -CONFIG_PNFS_FLEXFILE_LAYOUT=m -CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN="kernel.org" -CONFIG_NFS_V4_1_MIGRATION=y -CONFIG_NFS_V4_SECURITY_LABEL=y -CONFIG_NFS_FSCACHE=y -# CONFIG_NFS_USE_LEGACY_DNS is not set -CONFIG_NFS_USE_KERNEL_DNS=y -# CONFIG_NFS_DISABLE_UDP_SUPPORT is not set -CONFIG_NFSD=m -CONFIG_NFSD_V2_ACL=y -CONFIG_NFSD_V3=y -CONFIG_NFSD_V3_ACL=y 
-CONFIG_NFSD_V4=y -CONFIG_NFSD_PNFS=y -CONFIG_NFSD_BLOCKLAYOUT=y -CONFIG_NFSD_SCSILAYOUT=y -# CONFIG_NFSD_FLEXFILELAYOUT is not set -CONFIG_NFSD_V4_SECURITY_LABEL=y -CONFIG_GRACE_PERIOD=m -CONFIG_LOCKD=m -CONFIG_LOCKD_V4=y -CONFIG_NFS_ACL_SUPPORT=m -CONFIG_NFS_COMMON=y -CONFIG_SUNRPC=m -CONFIG_SUNRPC_GSS=m -CONFIG_SUNRPC_BACKCHANNEL=y -CONFIG_SUNRPC_SWAP=y -CONFIG_RPCSEC_GSS_KRB5=m -CONFIG_SUNRPC_DISABLE_INSECURE_ENCTYPES=y -CONFIG_SUNRPC_DEBUG=y -CONFIG_SUNRPC_XPRT_RDMA=m -CONFIG_CEPH_FS=m -CONFIG_CEPH_FSCACHE=y -CONFIG_CEPH_FS_POSIX_ACL=y -CONFIG_CEPH_FS_SECURITY_LABEL=y -CONFIG_CIFS=m -# CONFIG_CIFS_STATS2 is not set -# CONFIG_CIFS_ALLOW_INSECURE_LEGACY is not set -CONFIG_CIFS_UPCALL=y -CONFIG_CIFS_XATTR=y -CONFIG_CIFS_DEBUG=y -# CONFIG_CIFS_DEBUG2 is not set -# CONFIG_CIFS_DEBUG_DUMP_KEYS is not set -CONFIG_CIFS_DFS_UPCALL=y -# CONFIG_CIFS_SMB_DIRECT is not set -CONFIG_CIFS_FSCACHE=y -CONFIG_CODA_FS=m -CONFIG_AFS_FS=m -# CONFIG_AFS_DEBUG is not set -CONFIG_AFS_FSCACHE=y -# CONFIG_AFS_DEBUG_CURSOR is not set -CONFIG_9P_FS=m -CONFIG_9P_FSCACHE=y -CONFIG_9P_FS_POSIX_ACL=y -CONFIG_9P_FS_SECURITY=y -CONFIG_NLS=y -CONFIG_NLS_DEFAULT="utf8" -CONFIG_NLS_CODEPAGE_437=m -CONFIG_NLS_CODEPAGE_737=m -CONFIG_NLS_CODEPAGE_775=m -CONFIG_NLS_CODEPAGE_850=m -CONFIG_NLS_CODEPAGE_852=m -CONFIG_NLS_CODEPAGE_855=m -CONFIG_NLS_CODEPAGE_857=m -CONFIG_NLS_CODEPAGE_860=m -CONFIG_NLS_CODEPAGE_861=m -CONFIG_NLS_CODEPAGE_862=m -CONFIG_NLS_CODEPAGE_863=m -CONFIG_NLS_CODEPAGE_864=m -CONFIG_NLS_CODEPAGE_865=m -CONFIG_NLS_CODEPAGE_866=m -CONFIG_NLS_CODEPAGE_869=m -CONFIG_NLS_CODEPAGE_936=m -CONFIG_NLS_CODEPAGE_950=m -CONFIG_NLS_CODEPAGE_932=m -CONFIG_NLS_CODEPAGE_949=m -CONFIG_NLS_CODEPAGE_874=m -CONFIG_NLS_ISO8859_8=m -CONFIG_NLS_CODEPAGE_1250=m -CONFIG_NLS_CODEPAGE_1251=m -CONFIG_NLS_ASCII=m -CONFIG_NLS_ISO8859_1=m -CONFIG_NLS_ISO8859_2=m -CONFIG_NLS_ISO8859_3=m -CONFIG_NLS_ISO8859_4=m -CONFIG_NLS_ISO8859_5=m -CONFIG_NLS_ISO8859_6=m -CONFIG_NLS_ISO8859_7=m -CONFIG_NLS_ISO8859_9=m 
-CONFIG_NLS_ISO8859_13=m -CONFIG_NLS_ISO8859_14=m -CONFIG_NLS_ISO8859_15=m -CONFIG_NLS_KOI8_R=m -CONFIG_NLS_KOI8_U=m -CONFIG_NLS_MAC_ROMAN=m -CONFIG_NLS_MAC_CELTIC=m -CONFIG_NLS_MAC_CENTEURO=m -CONFIG_NLS_MAC_CROATIAN=m -CONFIG_NLS_MAC_CYRILLIC=m -CONFIG_NLS_MAC_GAELIC=m -CONFIG_NLS_MAC_GREEK=m -CONFIG_NLS_MAC_ICELAND=m -CONFIG_NLS_MAC_INUIT=m -CONFIG_NLS_MAC_ROMANIAN=m -CONFIG_NLS_MAC_TURKISH=m -CONFIG_NLS_UTF8=m -CONFIG_DLM=m -# CONFIG_DLM_DEBUG is not set -CONFIG_UNICODE=y -# CONFIG_UNICODE_NORMALIZATION_SELFTEST is not set -CONFIG_IO_WQ=y -# end of File systems - -# -# Security options -# -CONFIG_KEYS=y -CONFIG_KEYS_REQUEST_CACHE=y -CONFIG_PERSISTENT_KEYRINGS=y -CONFIG_BIG_KEYS=y -CONFIG_TRUSTED_KEYS=m -CONFIG_ENCRYPTED_KEYS=m -CONFIG_KEY_DH_OPERATIONS=y -CONFIG_SECURITY_DMESG_RESTRICT=y -CONFIG_SECURITY_PERF_EVENTS_RESTRICT=y -CONFIG_SECURITY_TIOCSTI_RESTRICT=y -CONFIG_SECURITY=y -CONFIG_SECURITYFS=y -CONFIG_SECURITY_NETWORK=y -CONFIG_PAGE_TABLE_ISOLATION=y -CONFIG_SECURITY_INFINIBAND=y -CONFIG_SECURITY_NETWORK_XFRM=y -CONFIG_SECURITY_PATH=y -# CONFIG_INTEL_TXT is not set -CONFIG_LSM_MMAP_MIN_ADDR=65536 -CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR=y -CONFIG_HARDENED_USERCOPY=y -# CONFIG_HARDENED_USERCOPY_FALLBACK is not set -# CONFIG_HARDENED_USERCOPY_PAGESPAN is not set -CONFIG_FORTIFY_SOURCE=y -# CONFIG_FORTIFY_SOURCE_STRICT_STRING is not set -# CONFIG_STATIC_USERMODEHELPER is not set -CONFIG_SECURITY_SELINUX=y -CONFIG_SECURITY_SELINUX_BOOTPARAM=y -# CONFIG_SECURITY_SELINUX_DISABLE is not set -CONFIG_SECURITY_SELINUX_DEVELOP=y -CONFIG_SECURITY_SELINUX_AVC_STATS=y -CONFIG_SECURITY_SELINUX_SIDTAB_HASH_BITS=9 -CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE=256 -CONFIG_SECURITY_SMACK=y -CONFIG_SECURITY_SMACK_BRINGUP=y -CONFIG_SECURITY_SMACK_NETFILTER=y -CONFIG_SECURITY_SMACK_APPEND_SIGNALS=y -CONFIG_SECURITY_TOMOYO=y -CONFIG_SECURITY_TOMOYO_MAX_ACCEPT_ENTRY=2048 -CONFIG_SECURITY_TOMOYO_MAX_AUDIT_LOG=1024 -# CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER is not set 
-CONFIG_SECURITY_TOMOYO_POLICY_LOADER="/sbin/tomoyo-init" -CONFIG_SECURITY_TOMOYO_ACTIVATION_TRIGGER="/sbin/init" -# CONFIG_SECURITY_TOMOYO_INSECURE_BUILTIN_SETTING is not set -CONFIG_SECURITY_APPARMOR=y -CONFIG_SECURITY_APPARMOR_HASH=y -CONFIG_SECURITY_APPARMOR_HASH_DEFAULT=y -# CONFIG_SECURITY_APPARMOR_DEBUG is not set -# CONFIG_SECURITY_LOADPIN is not set -CONFIG_SECURITY_YAMA=y -CONFIG_SECURITY_SAFESETID=y -CONFIG_SECURITY_LOCKDOWN_LSM=y -# CONFIG_SECURITY_LOCKDOWN_LSM_EARLY is not set -CONFIG_LOCK_DOWN_KERNEL_FORCE_NONE=y -# CONFIG_LOCK_DOWN_KERNEL_FORCE_INTEGRITY is not set -# CONFIG_LOCK_DOWN_KERNEL_FORCE_CONFIDENTIALITY is not set -# CONFIG_INTEGRITY is not set -# CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT is not set -# CONFIG_DEFAULT_SECURITY_SELINUX is not set -# CONFIG_DEFAULT_SECURITY_SMACK is not set -# CONFIG_DEFAULT_SECURITY_TOMOYO is not set -# CONFIG_DEFAULT_SECURITY_APPARMOR is not set -CONFIG_DEFAULT_SECURITY_DAC=y -CONFIG_LSM="lockdown,yama" - -# -# Kernel hardening options -# -CONFIG_GCC_PLUGIN_STRUCTLEAK=y - -# -# Memory initialization -# -# CONFIG_INIT_STACK_NONE is not set -# CONFIG_GCC_PLUGIN_STRUCTLEAK_USER is not set -# CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF is not set -CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL=y -# CONFIG_GCC_PLUGIN_STRUCTLEAK_VERBOSE is not set -CONFIG_GCC_PLUGIN_STACKLEAK=y -CONFIG_STACKLEAK_TRACK_MIN_SIZE=100 -# CONFIG_STACKLEAK_METRICS is not set -# CONFIG_STACKLEAK_RUNTIME_DISABLE is not set -CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y -CONFIG_INIT_ON_FREE_DEFAULT_ON=y -CONFIG_PAGE_SANITIZE_VERIFY=y -CONFIG_SLAB_SANITIZE_VERIFY=y -# end of Memory initialization -# end of Kernel hardening options -# end of Security options - -CONFIG_XOR_BLOCKS=m -CONFIG_ASYNC_CORE=m -CONFIG_ASYNC_MEMCPY=m -CONFIG_ASYNC_XOR=m -CONFIG_ASYNC_PQ=m -CONFIG_ASYNC_RAID6_RECOV=m -CONFIG_CRYPTO=y - -# -# Crypto core or helper -# -CONFIG_CRYPTO_ALGAPI=y -CONFIG_CRYPTO_ALGAPI2=y -CONFIG_CRYPTO_AEAD=y -CONFIG_CRYPTO_AEAD2=y -CONFIG_CRYPTO_SKCIPHER=y 
-CONFIG_CRYPTO_SKCIPHER2=y -CONFIG_CRYPTO_HASH=y -CONFIG_CRYPTO_HASH2=y -CONFIG_CRYPTO_RNG=y -CONFIG_CRYPTO_RNG2=y -CONFIG_CRYPTO_RNG_DEFAULT=y -CONFIG_CRYPTO_AKCIPHER2=y -CONFIG_CRYPTO_AKCIPHER=y -CONFIG_CRYPTO_KPP2=y -CONFIG_CRYPTO_KPP=y -CONFIG_CRYPTO_ACOMP2=y -CONFIG_CRYPTO_MANAGER=y -CONFIG_CRYPTO_MANAGER2=y -CONFIG_CRYPTO_USER=m -CONFIG_CRYPTO_MANAGER_DISABLE_TESTS=y -CONFIG_CRYPTO_GF128MUL=y -CONFIG_CRYPTO_NULL=y -CONFIG_CRYPTO_NULL2=y -CONFIG_CRYPTO_PCRYPT=m -CONFIG_CRYPTO_CRYPTD=m -CONFIG_CRYPTO_AUTHENC=m -CONFIG_CRYPTO_TEST=m -CONFIG_CRYPTO_SIMD=m -CONFIG_CRYPTO_GLUE_HELPER_X86=m -CONFIG_CRYPTO_ENGINE=m - -# -# Public-key cryptography -# -CONFIG_CRYPTO_RSA=y -CONFIG_CRYPTO_DH=y -CONFIG_CRYPTO_ECC=m -CONFIG_CRYPTO_ECDH=m -CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_CURVE25519=m -CONFIG_CRYPTO_CURVE25519_X86=m - -# -# Authenticated Encryption with Associated Data -# -CONFIG_CRYPTO_CCM=m -CONFIG_CRYPTO_GCM=y -CONFIG_CRYPTO_CHACHA20POLY1305=m -CONFIG_CRYPTO_AEGIS128=m -CONFIG_CRYPTO_AEGIS128_AESNI_SSE2=m -CONFIG_CRYPTO_SEQIV=y -CONFIG_CRYPTO_ECHAINIV=m - -# -# Block modes -# -CONFIG_CRYPTO_CBC=m -CONFIG_CRYPTO_CFB=m -CONFIG_CRYPTO_CTR=y -CONFIG_CRYPTO_CTS=m -CONFIG_CRYPTO_ECB=m -CONFIG_CRYPTO_LRW=m -CONFIG_CRYPTO_OFB=m -CONFIG_CRYPTO_PCBC=m -CONFIG_CRYPTO_XTS=m -CONFIG_CRYPTO_KEYWRAP=m -CONFIG_CRYPTO_NHPOLY1305=m -CONFIG_CRYPTO_NHPOLY1305_SSE2=m -CONFIG_CRYPTO_NHPOLY1305_AVX2=m -CONFIG_CRYPTO_ADIANTUM=m -CONFIG_CRYPTO_ESSIV=m - -# -# Hash modes -# -CONFIG_CRYPTO_CMAC=m -CONFIG_CRYPTO_HMAC=y -CONFIG_CRYPTO_XCBC=m -CONFIG_CRYPTO_VMAC=m - -# -# Digest -# -CONFIG_CRYPTO_CRC32C=m -CONFIG_CRYPTO_CRC32C_INTEL=m -CONFIG_CRYPTO_CRC32=m -CONFIG_CRYPTO_CRC32_PCLMUL=m -CONFIG_CRYPTO_XXHASH=m -CONFIG_CRYPTO_BLAKE2B=m -CONFIG_CRYPTO_BLAKE2S=m -CONFIG_CRYPTO_BLAKE2S_X86=m -CONFIG_CRYPTO_CRCT10DIF=y -CONFIG_CRYPTO_CRCT10DIF_PCLMUL=m -CONFIG_CRYPTO_GHASH=y -CONFIG_CRYPTO_POLY1305=m -CONFIG_CRYPTO_POLY1305_X86_64=m -CONFIG_CRYPTO_MD4=m -CONFIG_CRYPTO_MD5=y 
-CONFIG_CRYPTO_MICHAEL_MIC=m -CONFIG_CRYPTO_RMD128=m -CONFIG_CRYPTO_RMD160=m -CONFIG_CRYPTO_RMD256=m -CONFIG_CRYPTO_RMD320=m -CONFIG_CRYPTO_SHA1=y -CONFIG_CRYPTO_SHA1_SSSE3=m -CONFIG_CRYPTO_SHA256_SSSE3=m -CONFIG_CRYPTO_SHA512_SSSE3=m -CONFIG_CRYPTO_SHA256=y -CONFIG_CRYPTO_SHA512=y -CONFIG_CRYPTO_SHA3=m -CONFIG_CRYPTO_SM3=m -CONFIG_CRYPTO_STREEBOG=m -CONFIG_CRYPTO_TGR192=m -CONFIG_CRYPTO_WP512=m -CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL=m - -# -# Ciphers -# -CONFIG_CRYPTO_AES=y -CONFIG_CRYPTO_AES_TI=m -CONFIG_CRYPTO_AES_NI_INTEL=m -CONFIG_CRYPTO_ANUBIS=m -CONFIG_CRYPTO_ARC4=m -CONFIG_CRYPTO_BLOWFISH=m -CONFIG_CRYPTO_BLOWFISH_COMMON=m -CONFIG_CRYPTO_BLOWFISH_X86_64=m -CONFIG_CRYPTO_CAMELLIA=m -CONFIG_CRYPTO_CAMELLIA_X86_64=m -CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64=m -CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64=m -CONFIG_CRYPTO_CAST_COMMON=m -CONFIG_CRYPTO_CAST5=m -CONFIG_CRYPTO_CAST5_AVX_X86_64=m -CONFIG_CRYPTO_CAST6=m -CONFIG_CRYPTO_CAST6_AVX_X86_64=m -CONFIG_CRYPTO_DES=m -CONFIG_CRYPTO_DES3_EDE_X86_64=m -CONFIG_CRYPTO_FCRYPT=m -CONFIG_CRYPTO_KHAZAD=m -CONFIG_CRYPTO_SALSA20=m -CONFIG_CRYPTO_CHACHA20=m -CONFIG_CRYPTO_CHACHA20_X86_64=m -CONFIG_CRYPTO_SEED=m -CONFIG_CRYPTO_SERPENT=m -CONFIG_CRYPTO_SERPENT_SSE2_X86_64=m -CONFIG_CRYPTO_SERPENT_AVX_X86_64=m -CONFIG_CRYPTO_SERPENT_AVX2_X86_64=m -CONFIG_CRYPTO_SM4=m -CONFIG_CRYPTO_TEA=m -CONFIG_CRYPTO_TWOFISH=m -CONFIG_CRYPTO_TWOFISH_COMMON=m -CONFIG_CRYPTO_TWOFISH_X86_64=m -CONFIG_CRYPTO_TWOFISH_X86_64_3WAY=m -CONFIG_CRYPTO_TWOFISH_AVX_X86_64=m - -# -# Compression -# -CONFIG_CRYPTO_DEFLATE=m -CONFIG_CRYPTO_LZO=m -CONFIG_CRYPTO_842=m -CONFIG_CRYPTO_LZ4=y -CONFIG_CRYPTO_LZ4HC=m -CONFIG_CRYPTO_ZSTD=y - -# -# Random Number Generation -# -CONFIG_CRYPTO_ANSI_CPRNG=m -CONFIG_CRYPTO_DRBG_MENU=y -CONFIG_CRYPTO_DRBG_HMAC=y -CONFIG_CRYPTO_DRBG_HASH=y -CONFIG_CRYPTO_DRBG_CTR=y -CONFIG_CRYPTO_DRBG=y -CONFIG_CRYPTO_JITTERENTROPY=y -CONFIG_CRYPTO_USER_API=m -CONFIG_CRYPTO_USER_API_HASH=m -CONFIG_CRYPTO_USER_API_SKCIPHER=m 
-CONFIG_CRYPTO_USER_API_RNG=m -CONFIG_CRYPTO_USER_API_AEAD=m -# CONFIG_CRYPTO_STATS is not set -CONFIG_CRYPTO_HASH_INFO=y - -# -# Crypto library routines -# -CONFIG_CRYPTO_LIB_AES=y -CONFIG_CRYPTO_LIB_ARC4=m -CONFIG_CRYPTO_ARCH_HAVE_LIB_BLAKE2S=m -CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC=m -CONFIG_CRYPTO_LIB_BLAKE2S=m -CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA=m -CONFIG_CRYPTO_LIB_CHACHA_GENERIC=m -CONFIG_CRYPTO_LIB_CHACHA=m -CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519=m -CONFIG_CRYPTO_LIB_CURVE25519_GENERIC=m -CONFIG_CRYPTO_LIB_CURVE25519=m -CONFIG_CRYPTO_LIB_DES=m -CONFIG_CRYPTO_LIB_POLY1305_RSIZE=11 -CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305=m -CONFIG_CRYPTO_LIB_POLY1305_GENERIC=m -CONFIG_CRYPTO_LIB_POLY1305=m -CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m -CONFIG_CRYPTO_LIB_SHA256=y -CONFIG_CRYPTO_HW=y -CONFIG_CRYPTO_DEV_PADLOCK=m -CONFIG_CRYPTO_DEV_PADLOCK_AES=m -CONFIG_CRYPTO_DEV_PADLOCK_SHA=m -CONFIG_CRYPTO_DEV_ATMEL_I2C=m -CONFIG_CRYPTO_DEV_ATMEL_ECC=m -CONFIG_CRYPTO_DEV_ATMEL_SHA204A=m -CONFIG_CRYPTO_DEV_CCP=y -CONFIG_CRYPTO_DEV_CCP_DD=m -CONFIG_CRYPTO_DEV_SP_CCP=y -CONFIG_CRYPTO_DEV_CCP_CRYPTO=m -CONFIG_CRYPTO_DEV_SP_PSP=y -# CONFIG_CRYPTO_DEV_CCP_DEBUGFS is not set -CONFIG_CRYPTO_DEV_QAT=m -CONFIG_CRYPTO_DEV_QAT_DH895xCC=m -CONFIG_CRYPTO_DEV_QAT_C3XXX=m -CONFIG_CRYPTO_DEV_QAT_C62X=m -CONFIG_CRYPTO_DEV_QAT_DH895xCCVF=m -CONFIG_CRYPTO_DEV_QAT_C3XXXVF=m -CONFIG_CRYPTO_DEV_QAT_C62XVF=m -CONFIG_CRYPTO_DEV_NITROX=m -CONFIG_CRYPTO_DEV_NITROX_CNN55XX=m -CONFIG_CRYPTO_DEV_CHELSIO=m -CONFIG_CHELSIO_IPSEC_INLINE=y -CONFIG_CHELSIO_TLS_DEVICE=y -CONFIG_CRYPTO_DEV_VIRTIO=m -CONFIG_CRYPTO_DEV_SAFEXCEL=m -CONFIG_CRYPTO_DEV_CCREE=m -CONFIG_CRYPTO_DEV_AMLOGIC_GXL=m -# CONFIG_CRYPTO_DEV_AMLOGIC_GXL_DEBUG is not set -CONFIG_ASYMMETRIC_KEY_TYPE=y -CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y -CONFIG_ASYMMETRIC_TPM_KEY_SUBTYPE=m -CONFIG_X509_CERTIFICATE_PARSER=y -CONFIG_PKCS8_PRIVATE_KEY_PARSER=m -CONFIG_TPM_KEY_PARSER=m -CONFIG_PKCS7_MESSAGE_PARSER=y -# CONFIG_PKCS7_TEST_KEY is not set 
-CONFIG_SIGNED_PE_FILE_VERIFICATION=y - -# -# Certificates for signature checking -# -CONFIG_MODULE_SIG_KEY="certs/signing_key.pem" -CONFIG_SYSTEM_TRUSTED_KEYRING=y -CONFIG_SYSTEM_TRUSTED_KEYS="" -# CONFIG_SYSTEM_EXTRA_CERTIFICATE is not set -CONFIG_SECONDARY_TRUSTED_KEYRING=y -CONFIG_SYSTEM_BLACKLIST_KEYRING=y -CONFIG_SYSTEM_BLACKLIST_HASH_LIST="" -# end of Certificates for signature checking - -CONFIG_BINARY_PRINTF=y - -# -# Library routines -# -CONFIG_RAID6_PQ=m -CONFIG_RAID6_PQ_BENCHMARK=y -CONFIG_PACKING=y -CONFIG_BITREVERSE=y -CONFIG_GENERIC_STRNCPY_FROM_USER=y -CONFIG_GENERIC_STRNLEN_USER=y -CONFIG_GENERIC_NET_UTILS=y -CONFIG_GENERIC_FIND_FIRST_BIT=y -CONFIG_CORDIC=m -CONFIG_RATIONAL=y -CONFIG_GENERIC_PCI_IOMAP=y -CONFIG_GENERIC_IOMAP=y -CONFIG_ARCH_USE_CMPXCHG_LOCKREF=y -CONFIG_ARCH_HAS_FAST_MULTIPLIER=y -CONFIG_CRC_CCITT=y -CONFIG_CRC16=m -CONFIG_CRC_T10DIF=y -CONFIG_CRC_ITU_T=m -CONFIG_CRC32=y -# CONFIG_CRC32_SELFTEST is not set -CONFIG_CRC32_SLICEBY8=y -# CONFIG_CRC32_SLICEBY4 is not set -# CONFIG_CRC32_SARWATE is not set -# CONFIG_CRC32_BIT is not set -CONFIG_CRC64=m -CONFIG_CRC4=m -CONFIG_CRC7=m -CONFIG_LIBCRC32C=m -CONFIG_CRC8=m -CONFIG_XXHASH=y -# CONFIG_RANDOM32_SELFTEST is not set -CONFIG_842_COMPRESS=m -CONFIG_842_DECOMPRESS=m -CONFIG_ZLIB_INFLATE=y -CONFIG_ZLIB_DEFLATE=y -CONFIG_LZO_COMPRESS=y -CONFIG_LZO_DECOMPRESS=y -CONFIG_LZ4_COMPRESS=y -CONFIG_LZ4HC_COMPRESS=m -CONFIG_LZ4_DECOMPRESS=y -CONFIG_ZSTD_COMPRESS=y -CONFIG_ZSTD_DECOMPRESS=y -CONFIG_XZ_DEC=y -CONFIG_XZ_DEC_X86=y -CONFIG_XZ_DEC_POWERPC=y -CONFIG_XZ_DEC_IA64=y -CONFIG_XZ_DEC_ARM=y -CONFIG_XZ_DEC_ARMTHUMB=y -CONFIG_XZ_DEC_SPARC=y -CONFIG_XZ_DEC_BCJ=y -# CONFIG_XZ_DEC_TEST is not set -CONFIG_DECOMPRESS_GZIP=y -CONFIG_DECOMPRESS_BZIP2=y -CONFIG_DECOMPRESS_LZMA=y -CONFIG_DECOMPRESS_XZ=y -CONFIG_DECOMPRESS_LZO=y -CONFIG_DECOMPRESS_LZ4=y -CONFIG_GENERIC_ALLOCATOR=y -CONFIG_REED_SOLOMON=y -CONFIG_REED_SOLOMON_ENC8=y -CONFIG_REED_SOLOMON_DEC8=y -CONFIG_REED_SOLOMON_DEC16=y -CONFIG_BCH=m 
-CONFIG_TEXTSEARCH=y -CONFIG_TEXTSEARCH_KMP=m -CONFIG_TEXTSEARCH_BM=m -CONFIG_TEXTSEARCH_FSM=m -CONFIG_BTREE=y -CONFIG_INTERVAL_TREE=y -CONFIG_XARRAY_MULTI=y -CONFIG_ASSOCIATIVE_ARRAY=y -CONFIG_HAS_IOMEM=y -CONFIG_HAS_IOPORT_MAP=y -CONFIG_HAS_DMA=y -CONFIG_NEED_SG_DMA_LENGTH=y -CONFIG_NEED_DMA_MAP_STATE=y -CONFIG_ARCH_DMA_ADDR_T_64BIT=y -CONFIG_ARCH_HAS_FORCE_DMA_UNENCRYPTED=y -CONFIG_DMA_VIRT_OPS=y -CONFIG_SWIOTLB=y -# CONFIG_DMA_API_DEBUG is not set -CONFIG_SGL_ALLOC=y -CONFIG_IOMMU_HELPER=y -CONFIG_CHECK_SIGNATURE=y -CONFIG_CPU_RMAP=y -CONFIG_DQL=y -CONFIG_GLOB=y -# CONFIG_GLOB_SELFTEST is not set -CONFIG_NLATTR=y -CONFIG_LRU_CACHE=m -CONFIG_CLZ_TAB=y -CONFIG_IRQ_POLL=y -CONFIG_MPILIB=y -CONFIG_DIMLIB=y -CONFIG_LIBFDT=y -CONFIG_OID_REGISTRY=y -CONFIG_UCS2_STRING=y -CONFIG_HAVE_GENERIC_VDSO=y -CONFIG_GENERIC_GETTIMEOFDAY=y -CONFIG_GENERIC_VDSO_TIME_NS=y -CONFIG_FONT_SUPPORT=y -CONFIG_FONTS=y -# CONFIG_FONT_8x8 is not set -CONFIG_FONT_8x16=y -# CONFIG_FONT_6x11 is not set -# CONFIG_FONT_7x14 is not set -# CONFIG_FONT_PEARL_8x8 is not set -# CONFIG_FONT_ACORN_8x8 is not set -# CONFIG_FONT_MINI_4x6 is not set -# CONFIG_FONT_6x10 is not set -# CONFIG_FONT_10x18 is not set -# CONFIG_FONT_SUN8x16 is not set -# CONFIG_FONT_SUN12x22 is not set -CONFIG_FONT_TER16x32=y -CONFIG_SG_POOL=y -CONFIG_ARCH_HAS_PMEM_API=y -CONFIG_MEMREGION=y -CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE=y -CONFIG_ARCH_HAS_UACCESS_MCSAFE=y -CONFIG_ARCH_STACKWALK=y -CONFIG_SBITMAP=y -CONFIG_PARMAN=m -CONFIG_OBJAGG=m -# CONFIG_STRING_SELFTEST is not set -# end of Library routines - -# -# Kernel hacking -# - -# -# printk and dmesg options -# -CONFIG_PRINTK_TIME=y -# CONFIG_PRINTK_CALLER is not set -CONFIG_CONSOLE_LOGLEVEL_DEFAULT=4 -CONFIG_CONSOLE_LOGLEVEL_QUIET=1 -CONFIG_MESSAGE_LOGLEVEL_DEFAULT=4 -# CONFIG_BOOT_PRINTK_DELAY is not set -CONFIG_DYNAMIC_DEBUG=y -CONFIG_SYMBOLIC_ERRNAME=y -CONFIG_DEBUG_BUGVERBOSE=y -# end of printk and dmesg options - -# -# Compile-time checks and compiler options -# -# 
CONFIG_DEBUG_INFO is not set -# CONFIG_ENABLE_MUST_CHECK is not set -CONFIG_FRAME_WARN=2048 -CONFIG_STRIP_ASM_SYMS=y -# CONFIG_READABLE_ASM is not set -# CONFIG_HEADERS_INSTALL is not set -# CONFIG_DEBUG_SECTION_MISMATCH is not set -CONFIG_SECTION_MISMATCH_WARN_ONLY=y -# CONFIG_DEBUG_WRITABLE_FUNCTION_POINTERS_VERBOSE is not set -CONFIG_STACK_VALIDATION=y -# CONFIG_DEBUG_FORCE_WEAK_PER_CPU is not set -# end of Compile-time checks and compiler options - -# -# Generic Kernel Debugging Instruments -# -CONFIG_MAGIC_SYSRQ=y -CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE=0x0 -CONFIG_MAGIC_SYSRQ_SERIAL=y -CONFIG_MAGIC_SYSRQ_SERIAL_SEQUENCE="" -CONFIG_DEBUG_FS=y -CONFIG_HAVE_ARCH_KGDB=y -# CONFIG_KGDB is not set -CONFIG_ARCH_HAS_UBSAN_SANITIZE_ALL=y -# CONFIG_UBSAN is not set -# end of Generic Kernel Debugging Instruments - -CONFIG_DEBUG_KERNEL=y -CONFIG_DEBUG_MISC=y - -# -# Memory Debugging -# -# CONFIG_PAGE_EXTENSION is not set -# CONFIG_DEBUG_PAGEALLOC is not set -# CONFIG_PAGE_OWNER is not set -# CONFIG_PAGE_POISONING is not set -# CONFIG_DEBUG_PAGE_REF is not set -# CONFIG_DEBUG_RODATA_TEST is not set -CONFIG_GENERIC_PTDUMP=y -CONFIG_PTDUMP_CORE=y -# CONFIG_PTDUMP_DEBUGFS is not set -# CONFIG_DEBUG_OBJECTS is not set -# CONFIG_SLUB_DEBUG_ON is not set -# CONFIG_SLUB_STATS is not set -CONFIG_HAVE_DEBUG_KMEMLEAK=y -# CONFIG_DEBUG_KMEMLEAK is not set -# CONFIG_DEBUG_STACK_USAGE is not set -CONFIG_SCHED_STACK_END_CHECK=y -# CONFIG_DEBUG_VM is not set -CONFIG_ARCH_HAS_DEBUG_VIRTUAL=y -# CONFIG_DEBUG_VIRTUAL is not set -CONFIG_DEBUG_MEMORY_INIT=y -# CONFIG_DEBUG_PER_CPU_MAPS is not set -CONFIG_HAVE_ARCH_KASAN=y -CONFIG_HAVE_ARCH_KASAN_VMALLOC=y -CONFIG_CC_HAS_KASAN_GENERIC=y -# CONFIG_KASAN is not set -CONFIG_KASAN_STACK=1 -# end of Memory Debugging - -# CONFIG_DEBUG_SHIRQ is not set - -# -# Debug Oops, Lockups and Hangs -# -CONFIG_PANIC_ON_OOPS=y -CONFIG_PANIC_ON_OOPS_VALUE=1 -CONFIG_PANIC_TIMEOUT=0 -CONFIG_LOCKUP_DETECTOR=y -CONFIG_SOFTLOCKUP_DETECTOR=y -# 
CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set -CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0 -CONFIG_HARDLOCKUP_DETECTOR_PERF=y -CONFIG_HARDLOCKUP_CHECK_TIMESTAMP=y -CONFIG_HARDLOCKUP_DETECTOR=y -# CONFIG_BOOTPARAM_HARDLOCKUP_PANIC is not set -CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE=0 -CONFIG_DETECT_HUNG_TASK=y -CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=120 -# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set -CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0 -# CONFIG_WQ_WATCHDOG is not set -# CONFIG_TEST_LOCKUP is not set -# end of Debug Oops, Lockups and Hangs - -# -# Scheduler Debugging -# -CONFIG_SCHED_DEBUG=y -CONFIG_SCHED_INFO=y -CONFIG_SCHEDSTATS=y -# end of Scheduler Debugging - -# CONFIG_DEBUG_TIMEKEEPING is not set -CONFIG_DEBUG_PREEMPT=y - -# -# Lock Debugging (spinlocks, mutexes, etc...) -# -CONFIG_LOCK_DEBUGGING_SUPPORT=y -# CONFIG_PROVE_LOCKING is not set -# CONFIG_LOCK_STAT is not set -# CONFIG_DEBUG_RT_MUTEXES is not set -# CONFIG_DEBUG_SPINLOCK is not set -# CONFIG_DEBUG_MUTEXES is not set -# CONFIG_DEBUG_WW_MUTEX_SLOWPATH is not set -# CONFIG_DEBUG_RWSEMS is not set -# CONFIG_DEBUG_LOCK_ALLOC is not set -# CONFIG_DEBUG_ATOMIC_SLEEP is not set -# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set -# CONFIG_LOCK_TORTURE_TEST is not set -# CONFIG_WW_MUTEX_SELFTEST is not set -# end of Lock Debugging (spinlocks, mutexes, etc...) 
- -CONFIG_STACKTRACE=y -# CONFIG_WARN_ALL_UNSEEDED_RANDOM is not set -# CONFIG_DEBUG_KOBJECT is not set - -# -# Debug kernel data structures -# -CONFIG_DEBUG_LIST=y -# CONFIG_DEBUG_PLIST is not set -CONFIG_DEBUG_SG=y -CONFIG_DEBUG_NOTIFIERS=y -CONFIG_BUG_ON_DATA_CORRUPTION=y -# end of Debug kernel data structures - -CONFIG_DEBUG_CREDENTIALS=y - -# -# RCU Debugging -# -# CONFIG_RCU_PERF_TEST is not set -# CONFIG_RCU_TORTURE_TEST is not set -CONFIG_RCU_CPU_STALL_TIMEOUT=60 -# CONFIG_RCU_TRACE is not set -# CONFIG_RCU_EQS_DEBUG is not set -# end of RCU Debugging - -# CONFIG_DEBUG_WQ_FORCE_RR_CPU is not set -# CONFIG_DEBUG_BLOCK_EXT_DEVT is not set -# CONFIG_CPU_HOTPLUG_STATE_CONTROL is not set -CONFIG_LATENCYTOP=y -CONFIG_USER_STACKTRACE_SUPPORT=y -CONFIG_NOP_TRACER=y -CONFIG_HAVE_FUNCTION_TRACER=y -CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y -CONFIG_HAVE_DYNAMIC_FTRACE=y -CONFIG_HAVE_DYNAMIC_FTRACE_WITH_REGS=y -CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS=y -CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y -CONFIG_HAVE_SYSCALL_TRACEPOINTS=y -CONFIG_HAVE_FENTRY=y -CONFIG_HAVE_C_RECORDMCOUNT=y -CONFIG_TRACER_MAX_TRACE=y -CONFIG_TRACE_CLOCK=y -CONFIG_RING_BUFFER=y -CONFIG_EVENT_TRACING=y -CONFIG_CONTEXT_SWITCH_TRACER=y -CONFIG_RING_BUFFER_ALLOW_SWAP=y -CONFIG_TRACING=y -CONFIG_GENERIC_TRACER=y -CONFIG_TRACING_SUPPORT=y -CONFIG_FTRACE=y -# CONFIG_BOOTTIME_TRACING is not set -CONFIG_FUNCTION_TRACER=y -CONFIG_FUNCTION_GRAPH_TRACER=y -CONFIG_DYNAMIC_FTRACE=y -CONFIG_DYNAMIC_FTRACE_WITH_REGS=y -CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS=y -CONFIG_FUNCTION_PROFILER=y -CONFIG_STACK_TRACER=y -# CONFIG_PREEMPTIRQ_EVENTS is not set -# CONFIG_IRQSOFF_TRACER is not set -# CONFIG_PREEMPT_TRACER is not set -CONFIG_SCHED_TRACER=y -CONFIG_HWLAT_TRACER=y -CONFIG_MMIOTRACE=y -CONFIG_FTRACE_SYSCALLS=y -CONFIG_TRACER_SNAPSHOT=y -# CONFIG_TRACER_SNAPSHOT_PER_CPU_SWAP is not set -CONFIG_BRANCH_PROFILE_NONE=y -# CONFIG_PROFILE_ANNOTATED_BRANCHES is not set -CONFIG_BLK_DEV_IO_TRACE=y -CONFIG_KPROBE_EVENTS=y -# 
CONFIG_KPROBE_EVENTS_ON_NOTRACE is not set -CONFIG_UPROBE_EVENTS=y -CONFIG_BPF_EVENTS=y -CONFIG_DYNAMIC_EVENTS=y -CONFIG_PROBE_EVENTS=y -# CONFIG_BPF_KPROBE_OVERRIDE is not set -CONFIG_FTRACE_MCOUNT_RECORD=y -# CONFIG_HIST_TRIGGERS is not set -# CONFIG_TRACE_EVENT_INJECT is not set -# CONFIG_TRACEPOINT_BENCHMARK is not set -# CONFIG_RING_BUFFER_BENCHMARK is not set -# CONFIG_TRACE_EVAL_MAP_FILE is not set -# CONFIG_FTRACE_STARTUP_TEST is not set -# CONFIG_RING_BUFFER_STARTUP_TEST is not set -# CONFIG_MMIOTRACE_TEST is not set -# CONFIG_PREEMPTIRQ_DELAY_TEST is not set -# CONFIG_KPROBE_EVENT_GEN_TEST is not set -# CONFIG_PROVIDE_OHCI1394_DMA_INIT is not set -# CONFIG_SAMPLES is not set -CONFIG_ARCH_HAS_DEVMEM_IS_ALLOWED=y -# CONFIG_STRICT_DEVMEM is not set - -# -# x86 Debugging -# -CONFIG_TRACE_IRQFLAGS_SUPPORT=y -# CONFIG_X86_VERBOSE_BOOTUP is not set -CONFIG_EARLY_PRINTK=y -# CONFIG_EARLY_PRINTK_DBGP is not set -# CONFIG_EARLY_PRINTK_USB_XDBC is not set -# CONFIG_EFI_PGT_DUMP is not set -CONFIG_DEBUG_WX=y -CONFIG_DOUBLEFAULT=y -# CONFIG_DEBUG_TLBFLUSH is not set -# CONFIG_IOMMU_DEBUG is not set -CONFIG_HAVE_MMIOTRACE_SUPPORT=y -# CONFIG_X86_DECODER_SELFTEST is not set -CONFIG_IO_DELAY_0X80=y -# CONFIG_IO_DELAY_0XED is not set -# CONFIG_IO_DELAY_UDELAY is not set -# CONFIG_IO_DELAY_NONE is not set -CONFIG_DEBUG_BOOT_PARAMS=y -# CONFIG_CPA_DEBUG is not set -# CONFIG_DEBUG_ENTRY is not set -# CONFIG_DEBUG_NMI_SELFTEST is not set -# CONFIG_X86_DEBUG_FPU is not set -# CONFIG_PUNIT_ATOM_DEBUG is not set -CONFIG_UNWINDER_ORC=y -# CONFIG_UNWINDER_FRAME_POINTER is not set -# CONFIG_UNWINDER_GUESS is not set -# end of x86 Debugging - -# -# Kernel Testing and Coverage -# -# CONFIG_KUNIT is not set -# CONFIG_NOTIFIER_ERROR_INJECTION is not set -CONFIG_FUNCTION_ERROR_INJECTION=y -# CONFIG_FAULT_INJECTION is not set -CONFIG_ARCH_HAS_KCOV=y -CONFIG_CC_HAS_SANCOV_TRACE_PC=y -# CONFIG_KCOV is not set -CONFIG_RUNTIME_TESTING_MENU=y -CONFIG_LKDTM=m -# CONFIG_TEST_LIST_SORT is not 
set -# CONFIG_TEST_MIN_HEAP is not set -# CONFIG_TEST_SORT is not set -# CONFIG_KPROBES_SANITY_TEST is not set -# CONFIG_BACKTRACE_SELF_TEST is not set -# CONFIG_RBTREE_TEST is not set -# CONFIG_REED_SOLOMON_TEST is not set -# CONFIG_INTERVAL_TREE_TEST is not set -# CONFIG_PERCPU_TEST is not set -# CONFIG_ATOMIC64_SELFTEST is not set -# CONFIG_ASYNC_RAID6_TEST is not set -# CONFIG_TEST_HEXDUMP is not set -# CONFIG_TEST_STRING_HELPERS is not set -# CONFIG_TEST_STRSCPY is not set -# CONFIG_TEST_KSTRTOX is not set -# CONFIG_TEST_PRINTF is not set -# CONFIG_TEST_BITMAP is not set -# CONFIG_TEST_BITFIELD is not set -# CONFIG_TEST_UUID is not set -# CONFIG_TEST_XARRAY is not set -# CONFIG_TEST_OVERFLOW is not set -# CONFIG_TEST_RHASHTABLE is not set -# CONFIG_TEST_HASH is not set -# CONFIG_TEST_IDA is not set -# CONFIG_TEST_PARMAN is not set -# CONFIG_TEST_LKM is not set -# CONFIG_TEST_VMALLOC is not set -# CONFIG_TEST_USER_COPY is not set -# CONFIG_TEST_BPF is not set -# CONFIG_TEST_BLACKHOLE_DEV is not set -# CONFIG_FIND_BIT_BENCHMARK is not set -# CONFIG_TEST_FIRMWARE is not set -# CONFIG_TEST_SYSCTL is not set -# CONFIG_TEST_UDELAY is not set -# CONFIG_TEST_STATIC_KEYS is not set -# CONFIG_TEST_KMOD is not set -# CONFIG_TEST_MEMCAT_P is not set -# CONFIG_TEST_OBJAGG is not set -# CONFIG_TEST_STACKINIT is not set -# CONFIG_TEST_MEMINIT is not set -# CONFIG_MEMTEST is not set -# CONFIG_HYPERV_TESTING is not set -# end of Kernel Testing and Coverage -# end of Kernel hacking diff --git a/linux57-tkg/linux57-tkg-config/generic-desktop-profile.cfg b/linux57-tkg/linux57-tkg-config/generic-desktop-profile.cfg deleted file mode 100644 index 3750e64..0000000 --- a/linux57-tkg/linux57-tkg-config/generic-desktop-profile.cfg +++ /dev/null @@ -1,55 +0,0 @@ -# linux57-TkG config file -# Generic Desktop - - -#### MISC OPTIONS #### - -# External config file to use - If the given file exists in path, it will override default config (customization.cfg) - Default is 
~/.config/frogminer/linux50-tkg.cfg -_EXT_CONFIG_PATH=~/.config/frogminer/linux57-tkg.cfg - -#### KERNEL OPTIONS #### - -# Name of the default config file to use from the linux???-tkg-config folder. Arch default is "config.x86_64". -_configfile="config.x86_64" - -# Disable some non-module debugging - See PKGBUILD for the list -_debugdisable="false" - -# LEAVE AN EMPTY VALUE TO BE PROMPTED ABOUT FOLLOWING OPTIONS AT BUILD TIME - -# Set to "true" to disable FUNCTION_TRACER/GRAPH_TRACER, lowering overhead but limiting debugging and analyzing of kernel functions - Kernel default is "false" -_ftracedisable="false" - -# Set to "true" to disable NUMA, lowering overhead, but breaking CUDA/NvEnc on Nvidia equipped systems - Kernel default is "false" -_numadisable="false" - -# Set to "true" to use explicit preemption points to lower latency at the cost of a small throughput loss - Can give a nice perf boost in VMs - Kernel default is "false" -_voluntary_preempt="false" - -# A selection of patches from Zen/Liquorix kernel and additional tweaks for a better gaming experience (ZENIFY) - Default is "true" -_zenify="true" - -# compiler optimization level - 1. Optimize for performance (-O2); 2. Optimize harder (-O3); 3. 
Optimize for size (-Os) - Kernel default is "2" -_compileroptlevel="1" - -# Trust the CPU manufacturer to initialize Linux's CRNG (RANDOM_TRUST_CPU) - Kernel default is "false" -_random_trust_cpu="false" - -# CPU scheduler runqueue sharing - No sharing (RQ_NONE), SMT (hyperthread) siblings (RQ_SMT), Multicore siblings (RQ_MC), Symmetric Multi-Processing (RQ_SMP), NUMA (RQ_ALL) -# Valid values are "none", "smt", "mc", "mc-llc"(for zen), "smp", "all" - Kernel default is "mc" -_runqueue_sharing="mc" - -# Timer frequency - "500", "750" or "1000" - More options available in kernel config prompt when left empty depending on selected cpusched - Kernel default is "750" -_timer_freq="500" - - -#### USER PATCHES #### - -# You can use your own patches by putting them in the same folder as the PKGBUILD and giving them the .mypatch extension. -# You can also revert patches by putting them in the same folder as the PKGBUILD and giving them the .myrevert extension. - -# Also, userpatches variable below must be set to true for the above to work. -_user_patches="true" - -# Apply all user patches without confirmation - !!! NOT RECOMMENDED !!! -_user_patches_no_confirm="false" diff --git a/linux57-tkg/linux57-tkg-config/prepare b/linux57-tkg/linux57-tkg-config/prepare deleted file mode 100644 index 1350f34..0000000 --- a/linux57-tkg/linux57-tkg-config/prepare +++ /dev/null @@ -1,983 +0,0 @@ -#!/bin/bash - -_basever=57 -_basekernel=5.7 -_sub=19 - -_tkg_initscript() { - - cp "$_where"/linux"$_basever"-tkg-patches/* "$_where" # copy patches inside the PKGBUILD's dir to preserve makepkg sourcing and md5sum checking - cp "$_where"/linux"$_basever"-tkg-config/* "$_where" # copy config files and hooks inside the PKGBUILD's dir to preserve makepkg sourcing and md5sum checking - - # Load external configuration file if present. Available variable values will overwrite customization.cfg ones. 
- if [ -e "$_EXT_CONFIG_PATH" ]; then - source "$_EXT_CONFIG_PATH" && msg2 "External configuration file $_EXT_CONFIG_PATH will be used to override customization.cfg values." && msg2 "" - fi - - if [ -z "$_OPTIPROFILE" ] && [ ! -e "$_where"/cpuschedset ]; then - # Prompt about optimized configurations. Available variable values will overwrite customization.cfg/external config ones. - plain "Do you want to use a predefined optimized profile?" - read -rp "`echo $' > 1.Custom\n 2.Ryzen Desktop (Performance)\n 3.Other Desktop (Performance)\nchoice[1-3?]: '`" _OPTIPROFILE; - fi - if [ "$_OPTIPROFILE" = "2" ]; then - source "$_where"/ryzen-desktop-profile.cfg && msg2 "Ryzen Desktop (Performance) profile will be used." && msg2 "" - elif [ "$_OPTIPROFILE" = "3" ]; then - source "$_where"/generic-desktop-profile.cfg && msg2 "Generic Desktop (Performance) profile will be used." && msg2 "" - fi - - # source cpuschedset early if present - if [ -e "$_where"/cpuschedset ]; then - source "$_where"/cpuschedset - fi - - # CPU SCHED selector - if [ -z "$_cpusched" ] && [ ! -e "$_where"/cpuschedset ]; then - plain "What CPU sched variant do you want to build/install?" 
- read -rp "`echo $' > 1.PDS\n 2.MuQSS\n 3.BMQ\n 4.CFS\nchoice[1-4?]: '`" CONDITION; - if [ "$CONDITION" = "2" ]; then - echo "_cpusched=\"MuQSS\"" > "$_where"/cpuschedset - elif [ "$CONDITION" = "3" ]; then - echo "_cpusched=\"bmq\"" > "$_where"/cpuschedset - elif [ "$CONDITION" = "4" ]; then - echo "_cpusched=\"cfs\"" > "$_where"/cpuschedset - else - echo "_cpusched=\"pds\"" > "$_where"/cpuschedset - fi - if [ -n "$_custom_pkgbase" ]; then - echo "_custom_pkgbase=\"${_custom_pkgbase}\"" >> "$_where"/cpuschedset - fi - elif [ "$_cpusched" = "muqss" ] || [ "$_cpusched" = "MuQSS" ]; then - echo "_cpusched=\"MuQSS\"" > "$_where"/cpuschedset - elif [ "$_cpusched" = "pds" ]; then - echo "_cpusched=\"pds\"" > "$_where"/cpuschedset - elif [ "$_cpusched" = "bmq" ]; then - echo "_cpusched=\"bmq\"" > "$_where"/cpuschedset - else - if [ "$_nofallback" != "true" ]; then - warning "Something is wrong with your cpusched selection. Do you want to fallback to CFS (default)?" - read -rp "`echo $' > N/y : '`" _fallback; - fi - if [[ "$_fallback" =~ [yY] ]] || [ "$_nofallback" = "true" ]; then - echo "_cpusched=\"cfs\"" > "$_where"/cpuschedset - else - error "Exiting..." - exit 1 - fi - fi - - source "$_where"/cpuschedset -} - -user_patcher() { - # To patch the user because all your base are belong to us - local _patches=("$_where"/*."${_userpatch_ext}revert") - if [ ${#_patches[@]} -ge 2 ] || [ -e "${_patches}" ]; then - if [ "$_user_patches_no_confirm" != "true" ]; then - msg2 "Found ${#_patches[@]} 'to revert' userpatches for ${_userpatch_target}:" - printf '%s\n' "${_patches[@]}" - read -rp "Do you want to install it/them? 
- Be careful with that ;)"$'\n> N/y : ' _CONDITION; - fi - if [[ "$_CONDITION" =~ [yY] ]] || [ "$_user_patches_no_confirm" = "true" ]; then - for _f in "${_patches[@]}"; do - if [ -e "${_f}" ]; then - msg2 "######################################################" - msg2 "" - msg2 "Reverting your own ${_userpatch_target} patch ${_f}" - msg2 "" - msg2 "######################################################" - patch -Np1 -R < "${_f}" - echo "Reverted your own patch ${_f}" >> "$_where"/last_build_config.log - fi - done - fi - fi - - _patches=("$_where"/*."${_userpatch_ext}patch") - if [ ${#_patches[@]} -ge 2 ] || [ -e "${_patches}" ]; then - if [ "$_user_patches_no_confirm" != "true" ]; then - msg2 "Found ${#_patches[@]} userpatches for ${_userpatch_target}:" - printf '%s\n' "${_patches[@]}" - read -rp "Do you want to install it/them? - Be careful with that ;)"$'\n> N/y : ' _CONDITION; - fi - if [[ "$_CONDITION" =~ [yY] ]] || [ "$_user_patches_no_confirm" = "true" ]; then - for _f in "${_patches[@]}"; do - if [ -e "${_f}" ]; then - msg2 "######################################################" - msg2 "" - msg2 "Applying your own ${_userpatch_target} patch ${_f}" - msg2 "" - msg2 "######################################################" - patch -Np1 < "${_f}" - echo "Applied your own patch ${_f}" >> "$_where"/last_build_config.log - fi - done - fi - fi -} - -_tkg_srcprep() { - - if [ "${_distro}" = "Arch" ]; then - msg2 "Setting version..." 
- scripts/setlocalversion --save-scmversion - echo "-$pkgrel-tkg-${_cpusched}" > localversion.10-pkgrel - echo "" > localversion.20-pkgname - - # add upstream patch - msg2 "Patching from $_basekernel to $pkgver" - patch -p1 -i "$srcdir"/patch-"${pkgver}" - - # ARCH Patches - if [ "${_configfile}" = "config_hardened.x86_64" ] && [ "${_cpusched}" = "cfs" ]; then - msg2 "Using linux hardened patchset" - patch -Np1 -i "$srcdir"/0012-linux-hardened.patch - else - patch -Np1 -i "$srcdir"/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch - fi - fi - - # graysky's cpu opts - https://github.com/graysky2/kernel_gcc_patch - msg2 "Applying graysky's cpu opts patch" - if [ "${_distro}" = "Arch" ]; then - patch -Np1 -i "$srcdir"/enable_additional_cpu_optimizations_for_gcc_v10.1%2B_kernel_v5.7%2B.patch - else - patch -Np1 -i "$srcdir"/enable_additional_cpu_optimizations_for_gcc_v10.1+_kernel_v5.7+.patch - fi - - # TkG - msg2 "Applying clear linux patches" - patch -Np1 -i "$srcdir"/0002-clear-patches.patch - - msg2 "Applying glitched base patch" - patch -Np1 -i "$srcdir"/0003-glitched-base.patch - - if [ -z $_misc_adds ]; then - plain "Enable misc additions ? May contain temporary fixes pending upstream or changes that can break on non-Arch. 
" - read -rp "`echo $' > [Y]/n : '`" _interactive_misc_adds; - if [ "$_interactive_misc_adds" != "n" ] && [ "$_interactive_misc_adds" != "N" ]; then - _misc_adds="true" - fi - fi - - if [ "$_misc_adds" = "true" ]; then - msg2 "Applying misc additions patch" - patch -Np1 -i "$srcdir"/0012-misc-additions.patch - fi - - if [ "${_cpusched}" = "MuQSS" ]; then - # MuQSS - msg2 "Applying MuQSS base patch" - patch -Np1 -i "$srcdir"/0004-5.7-ck1.patch - - if [ "${_aggressive_ondemand}" = "true" ]; then - msg2 "Applying MuQSS agressive ondemand governor patch" - patch -Np1 -i "$srcdir"/0004-glitched-ondemand-muqss.patch - fi - - msg2 "Applying Glitched MuQSS patch" - patch -Np1 -i "$srcdir"/0004-glitched-muqss.patch - - elif [ "${_cpusched}" = "pds" ]; then - # PDS-mq - msg2 "Applying PDS base patch" - patch -Np1 -i "$srcdir"/0005-v5.7_undead-pds099o.patch - - if [ "${_aggressive_ondemand}" = "true" ]; then - msg2 "Applying PDS agressive ondemand governor patch" - patch -Np1 -i "$srcdir"/0005-glitched-ondemand-pds.patch - fi - - msg2 "Applying Glitched PDS patch" - patch -Np1 -i "$srcdir"/0005-glitched-pds.patch - - elif [ "${_cpusched}" = "bmq" ]; then - # Project C / BMQ - msg2 "Applying Project C / BMQ base patch" - - patch -Np1 -i "$srcdir"/0009-prjc_v5.7-r3.patch - - if [ "${_aggressive_ondemand}" = "true" ]; then - msg2 "Applying BMQ agressive ondemand governor patch" - patch -Np1 -i "$srcdir"/0009-glitched-ondemand-bmq.patch - fi - - msg2 "Applying Glitched BMQ patch" - patch -Np1 -i "$srcdir"/0009-glitched-bmq.patch - - elif [ "${_cpusched}" = "cfs" ]; then - msg2 "Applying Glitched CFS patch" - patch -Np1 -i "$srcdir"/0003-glitched-cfs.patch - fi - - if [ "${_distro}" = "Arch" ]; then - if [ -z "${_configfile}" ]; then - _configfile="config.x86_64" - fi - - cat "${srcdir}/${_configfile}" > ./.config - fi - - - # Set some -tkg defaults - echo "# CONFIG_DYNAMIC_FAULT is not set" >> ./.config - sed -i -e 's/CONFIG_DEFAULT_FQ_CODEL=y/# CONFIG_DEFAULT_FQ_CODEL is not 
set/' ./.config - echo "CONFIG_DEFAULT_CAKE=y" >> ./.config - echo "CONFIG_NR_TTY_DEVICES=63" >> ./.config - echo "# CONFIG_NTP_PPS is not set" >> ./.config - sed -i -e 's/CONFIG_CRYPTO_LZ4=m/CONFIG_CRYPTO_LZ4=y/' ./.config - sed -i -e 's/CONFIG_CRYPTO_LZ4HC=m/CONFIG_CRYPTO_LZ4HC=y/' ./.config - sed -i -e 's/CONFIG_LZ4_COMPRESS=m/CONFIG_LZ4_COMPRESS=y/' ./.config - sed -i -e 's/CONFIG_LZ4HC_COMPRESS=m/CONFIG_LZ4HC_COMPRESS=y/' ./.config - sed -i -e 's/CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZO=y/# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZO is not set/' ./.config - sed -i -e 's/# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZ4 is not set/CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZ4=y/' ./.config - sed -i -e 's/CONFIG_ZSWAP_COMPRESSOR_DEFAULT="lzo"/CONFIG_ZSWAP_COMPRESSOR_DEFAULT="lz4"/' ./.config - #sed -i -e 's/CONFIG_RCU_BOOST_DELAY=500/CONFIG_RCU_BOOST_DELAY=0/' ./.config - sed -i -e 's/# CONFIG_CMDLINE_BOOL is not set/CONFIG_CMDLINE_BOOL=y/' ./.config - echo "CONFIG_CMDLINE=\"${_custom_commandline}\"" >> ./.config - echo "# CONFIG_CMDLINE_OVERRIDE is not set" >> ./.config - echo "# CONFIG_X86_P6_NOP is not set" >> ./.config - if [ "$_noccache" != "true" ]; then - if { [ "$_distro" = "Arch" ] && pacman -Qq ccache &> /dev/null; } || { [ "$_distro" = "Ubuntu" ] && dpkg -l ccache > /dev/null; }; then - sed -i -e 's/CONFIG_GCC_PLUGINS=y/# CONFIG_GCC_PLUGINS is not set/' ./.config - fi - fi - # Skip dbg package creation on non-Arch - if [ "$_distro" != "Arch" ]; then - sed -i -e 's/CONFIG_DEBUG_INFO.*/CONFIG_DEBUG_INFO=n/' ./.config - fi - - if [ "$_font_autoselect" != "false" ]; then - sed -i -e 's/CONFIG_FONT_TER16x32=y/# CONFIG_FONT_TER16x32 is not set\nCONFIG_FONT_AUTOSELECT=y/' ./.config - fi - - # Inject cpuopts options - echo "# CONFIG_MK8SSE3 is not set" >> ./.config - echo "# CONFIG_MK10 is not set" >> ./.config - echo "# CONFIG_MBARCELONA is not set" >> ./.config - echo "# CONFIG_MBOBCAT is not set" >> ./.config - echo "# CONFIG_MJAGUAR is not set" >> ./.config - echo "# CONFIG_MBULLDOZER is 
not set" >> ./.config - echo "# CONFIG_MPILEDRIVER is not set" >> ./.config - echo "# CONFIG_MSTEAMROLLER is not set" >> ./.config - echo "# CONFIG_MEXCAVATOR is not set" >> ./.config - echo "# CONFIG_MZEN is not set" >> ./.config - echo "# CONFIG_MZEN2 is not set" >> ./.config - echo "# CONFIG_MATOM is not set" >> ./.config - echo "# CONFIG_MNEHALEM is not set" >> ./.config - echo "# CONFIG_MWESTMERE is not set" >> ./.config - echo "# CONFIG_MSILVERMONT is not set" >> ./.config - echo "# CONFIG_MSANDYBRIDGE is not set" >> ./.config - echo "# CONFIG_MIVYBRIDGE is not set" >> ./.config - echo "# CONFIG_MHASWELL is not set" >> ./.config - echo "# CONFIG_MBROADWELL is not set" >> ./.config - echo "# CONFIG_MSKYLAKE is not set" >> ./.config - echo "# CONFIG_MSKYLAKEX is not set" >> ./.config - echo "# CONFIG_MCANNONLAKE is not set" >> ./.config - echo "# CONFIG_MICELAKE is not set" >> ./.config - echo "# CONFIG_MGOLDMONT is not set" >> ./.config - echo "# CONFIG_MGOLDMONTPLUS is not set" >> ./.config - echo "# CONFIG_MCASCADELAKE is not set" >> ./.config - echo "# CONFIG_MCOOPERLAKE is not set" >> ./.config - echo "# CONFIG_MTIGERLAKE is not set" >> ./.config - - # Disable some debugging - if [ "${_debugdisable}" = "true" ]; then - sed -i -e 's/CONFIG_SLUB_DEBUG=y/# CONFIG_SLUB_DEBUG is not set/' ./.config - sed -i -e 's/CONFIG_PM_DEBUG=y/# CONFIG_PM_DEBUG is not set/' ./.config - sed -i -e 's/CONFIG_PM_ADVANCED_DEBUG=y/# CONFIG_PM_ADVANCED_DEBUG is not set/' ./.config - sed -i -e 's/CONFIG_PM_SLEEP_DEBUG=y/# CONFIG_PM_SLEEP_DEBUG is not set/' ./.config - sed -i -e 's/CONFIG_ACPI_DEBUG=y/# CONFIG_ACPI_DEBUG is not set/' ./.config - sed -i -e 's/CONFIG_SCHED_DEBUG=y/# CONFIG_SCHED_DEBUG is not set/' ./.config - sed -i -e 's/CONFIG_LATENCYTOP=y/# CONFIG_LATENCYTOP is not set/' ./.config - sed -i -e 's/CONFIG_DEBUG_PREEMPT=y/# CONFIG_DEBUG_PREEMPT is not set/' ./.config - fi - - if [ "${_cpusched}" = "MuQSS" ]; then - # MuQSS default config - echo "CONFIG_SCHED_MUQSS=y" 
>> ./.config - elif [ "${_cpusched}" = "pds" ]; then - # PDS default config - echo "CONFIG_SCHED_PDS=y" >> ./.config - elif [ "${_cpusched}" = "bmq" ]; then - # BMQ default config - echo "CONFIG_SCHED_ALT=y" >> ./.config - fi - - if [ "${_cpusched}" = "MuQSS" ] || [ "${_cpusched}" = "pds" ] || [ "${_cpusched}" = "bmq" ]; then - # Disable CFS - sed -i -e 's/CONFIG_FAIR_GROUP_SCHED=y/# CONFIG_FAIR_GROUP_SCHED is not set/' ./.config - sed -i -e 's/CONFIG_CFS_BANDWIDTH=y/# CONFIG_CFS_BANDWIDTH is not set/' ./.config - # sched yield type - if [ -n "$_sched_yield_type" ]; then - CONDITION0="$_sched_yield_type" - else - plain "" - plain "CPU sched_yield_type - Choose what sort of yield sched_yield will perform." - plain "" - plain "For PDS and MuQSS:" - plain "0: No yield." - plain "1: Yield only to better priority/deadline tasks." - plain "2: Expire timeslice and recalculate deadline." - plain "" - plain "For BMQ (experimental) - No recommended value yet, so try for yourself x) :" - plain "0: No yield." - plain "1: Deboost and requeue task. (default)" - plain "2: Set rq skip task." - if [ "${_cpusched}" = "MuQSS" ]; then - read -rp "`echo $'\n 0. Supposedly best option for gaming performance - could lead to stability issues on some (AMD) platforms when combined with MuQSS\n > 1. Default and recommended option for MuQSS - could lead to stability issues on some (Intel) platforms\n 2. Can be a good option with low rr_interval on MuQSS\n [0-2?]: '`" CONDITION0; - else - read -rp "`echo $'\n > 0. Recommended option for gaming on PDS - "tkg" default\n 1. Default, but can lead to stability issues on some platforms\n 2. 
Can be a good option with low rr_interval on MuQSS\n [0-2?]: '`" CONDITION0; - fi - fi - if [ "$CONDITION0" = "0" ]; then - if [ "${_cpusched}" = "bmq" ]; then - sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 0;/' ./kernel/sched/alt_core.c - else - sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 0;/' ./kernel/sched/"${_cpusched}".c - fi - elif [ "$CONDITION0" = "1" ]; then - msg2 "Using default CPU sched yield type (1)" - elif [ "$CONDITION0" = "2" ]; then - if [ "${_cpusched}" = "bmq" ]; then - sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 2;/' ./kernel/sched/alt_core.c - else - sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 2;/' ./kernel/sched/"${_cpusched}".c - fi - else - if [ "${_cpusched}" = "MuQSS" ]; then - msg2 "Using default CPU sched yield type (1)" - elif [ "${_cpusched}" = "bmq" ]; then - sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 0;/' ./kernel/sched/alt_core.c - else - sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 0;/' ./kernel/sched/"${_cpusched}".c - fi - fi - fi - - # Round Robin interval - if [ "${_cpusched}" = "MuQSS" ] || [ "${_cpusched}" = "pds" ] || [ "${_cpusched}" = "bmq" ]; then - if [ -n "$_rr_interval" ]; then - CONDITION1="$_rr_interval" - else - plain "" - plain "Round Robin interval is the longest duration two tasks with the same nice level will" - plain "be delayed for. When CPU time is requested by a task, it receives a time slice equal" - plain "to the rr_interval in addition to a virtual deadline. When using yield_type 2, a low" - plain "value can help offset the disadvantages of rescheduling a process that has yielded." 
- plain "" - plain "MuQSS default: 6ms" - plain "PDS default: 4ms" - plain "BMQ default: 2ms" - read -rp "`echo $'\n > 0.Keep defaults\n 1.2ms\n 2.4ms\n 3.6ms\n 4.8ms\n [0-4?]: '`" CONDITION1; - fi - if [ "$CONDITION1" = "1" ]; then - msg2 "Using 2ms rr_interval" - _rrvalue="2" - elif [ "$CONDITION1" = "2" ]; then - msg2 "Using 4ms rr_interval" - _rrvalue="4" - elif [ "$CONDITION1" = "3" ]; then - msg2 "Using 6ms rr_interval" - _rrvalue="6" - elif [ "$CONDITION1" = "4" ]; then - msg2 "Using 8ms rr_interval" - _rrvalue="8" - else - msg2 "Using default rr_interval" - _rrvalue="default" - fi - if [ "$_rrvalue" != "default" ]; then - if [ "${_cpusched}" = "MuQSS" ]; then - sed -i -e "s/int rr_interval __read_mostly = 6;/int rr_interval __read_mostly = ${_rrvalue};/" ./kernel/sched/"${_cpusched}".c - elif [ "${_cpusched}" = "pds" ]; then - sed -i -e "s/#define SCHED_DEFAULT_RR (4)/#define SCHED_DEFAULT_RR (${_rrvalue})/" ./kernel/sched/"${_cpusched}".c - elif [ "${_cpusched}" = "bmq" ]; then - sed -i -e "s/u64 sched_timeslice_ns __read_mostly = (4 * 1000 * 1000);/u64 sched_timeslice_ns __read_mostly = (${_rrvalue} * 1000 * 1000);/" ./kernel/sched/alt_core.c - fi - else - if [ "${_cpusched}" = "bmq" ]; then - sed -i -e "s/u64 sched_timeslice_ns __read_mostly = (4 * 1000 * 1000);/u64 sched_timeslice_ns __read_mostly = (2 * 1000 * 1000);/" ./kernel/sched/alt_core.c - fi - fi - fi - - # zenify - if [ "$_zenify" = "true" ]; then - echo "CONFIG_ZENIFY=y" >> ./.config - elif [ "$_zenify" = "false" ]; then - echo "# CONFIG_ZENIFY is not set" >> ./.config - fi - - # compiler optimization level - if [ "$_compileroptlevel" = "1" ]; then - echo "# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 is not set" >> ./.config - elif [ "$_compileroptlevel" = "2" ]; then - sed -i -e 's/CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y/# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE is not set/' ./.config - echo "CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y" >> ./.config - elif [ "$_compileroptlevel" = "3" ]; then - sed -i -e 
's/CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y/# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE is not set/' ./.config - sed -i -e 's/# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set/CONFIG_CC_OPTIMIZE_FOR_SIZE=y/' ./.config - echo "# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 is not set" >> ./.config - fi - - # cpu opt - if [ -n "$_processor_opt" ] && [ "$_processor_opt" != "native" ]; then - echo "# CONFIG_MNATIVE is not set" >> ./.config - fi - - if [ -n "$_processor_opt" ] && [ "$_processor_opt" != "generic" ]; then - sed -i -e 's/CONFIG_GENERIC_CPU=y/# CONFIG_GENERIC_CPU is not set/' ./.config - fi - - if [ "$_processor_opt" = "native" ]; then - echo "CONFIG_MNATIVE=y" >> ./.config - elif [ "$_processor_opt" = "k8" ]; then - sed -i -e 's/# CONFIG_MK8 is not set/CONFIG_MK8=y/' ./.config - elif [ "$_processor_opt" = "k8sse3" ]; then - sed -i -e 's/# CONFIG_MK8SSE3 is not set/CONFIG_MK8SSE3=y/' ./.config - elif [ "$_processor_opt" = "k10" ]; then - sed -i -e 's/# CONFIG_MK10 is not set/CONFIG_MK10=y/' ./.config - elif [ "$_processor_opt" = "barcelona" ]; then - sed -i -e 's/# CONFIG_MBARCELONA is not set/CONFIG_MBARCELONA=y/' ./.config - elif [ "$_processor_opt" = "bobcat" ]; then - sed -i -e 's/# CONFIG_MBOBCAT is not set/CONFIG_MBOBCAT=y/' ./.config - elif [ "$_processor_opt" = "jaguar" ]; then - sed -i -e 's/# CONFIG_MJAGUAR is not set/CONFIG_MJAGUAR=y/' ./.config - elif [ "$_processor_opt" = "bulldozer" ]; then - sed -i -e 's/# CONFIG_MBULLDOZER is not set/CONFIG_MBULLDOZER=y/' ./.config - elif [ "$_processor_opt" = "piledriver" ]; then - sed -i -e 's/# CONFIG_MPILEDRIVER is not set/CONFIG_MPILEDRIVER=y/' ./.config - elif [ "$_processor_opt" = "steamroller" ]; then - sed -i -e 's/# CONFIG_MSTEAMROLLER is not set/CONFIG_MSTEAMROLLER=y/' ./.config - elif [ "$_processor_opt" = "excavator" ]; then - sed -i -e 's/# CONFIG_MEXCAVATOR is not set/CONFIG_MEXCAVATOR=y/' ./.config - elif [ "$_processor_opt" = "zen" ]; then - sed -i -e 's/# CONFIG_MZEN is not set/CONFIG_MZEN=y/' ./.config - elif [ 
"$_processor_opt" = "zen2" ]; then - sed -i -e 's/# CONFIG_MZEN2 is not set/CONFIG_MZEN2=y/' ./.config - elif [ "$_processor_opt" = "mpsc" ]; then - sed -i -e 's/# CONFIG_MPSC is not set/CONFIG_MPSC=y/' ./.config - elif [ "$_processor_opt" = "atom" ]; then - sed -i -e 's/# CONFIG_MATOM is not set/CONFIG_MATOM=y/' ./.config - elif [ "$_processor_opt" = "core2" ]; then - sed -i -e 's/# CONFIG_MCORE2 is not set/CONFIG_MCORE2=y/' ./.config - elif [ "$_processor_opt" = "nehalem" ]; then - sed -i -e 's/# CONFIG_MNEHALEM is not set/CONFIG_MNEHALEM=y/' ./.config - elif [ "$_processor_opt" = "westmere" ]; then - sed -i -e 's/# CONFIG_MWESTMERE is not set/CONFIG_MWESTMERE=y/' ./.config - elif [ "$_processor_opt" = "silvermont" ]; then - sed -i -e 's/# CONFIG_MSILVERMONT is not set/CONFIG_MSILVERMONT=y/' ./.config - elif [ "$_processor_opt" = "sandybridge" ]; then - sed -i -e 's/# CONFIG_MSANDYBRIDGE is not set/CONFIG_MSANDYBRIDGE=y/' ./.config - elif [ "$_processor_opt" = "ivybridge" ]; then - sed -i -e 's/# CONFIG_MIVYBRIDGE is not set/CONFIG_MIVYBRIDGE=y/' ./.config - elif [ "$_processor_opt" = "haswell" ]; then - sed -i -e 's/# CONFIG_MHASWELL is not set/CONFIG_MHASWELL=y/' ./.config - elif [ "$_processor_opt" = "broadwell" ]; then - sed -i -e 's/# CONFIG_MBROADWELL is not set/CONFIG_MBROADWELL=y/' ./.config - elif [ "$_processor_opt" = "skylake" ]; then - sed -i -e 's/# CONFIG_MSKYLAKE is not set/CONFIG_MSKYLAKE=y/' ./.config - elif [ "$_processor_opt" = "skylakex" ]; then - sed -i -e 's/# CONFIG_MSKYLAKEX is not set/CONFIG_MSKYLAKEX=y/' ./.config - elif [ "$_processor_opt" = "cannonlake" ]; then - sed -i -e 's/# CONFIG_MCANNONLAKE is not set/CONFIG_MCANNONLAKE=y/' ./.config - elif [ "$_processor_opt" = "icelake" ]; then - sed -i -e 's/# CONFIG_MICELAKE is not set/CONFIG_MICELAKE=y/' ./.config - elif [ "$_processor_opt" = "goldmont" ]; then - sed -i -e 's/# CONFIG_MGOLDMONT is not set/CONFIG_MGOLDMONT=y/' ./.config - elif [ "$_processor_opt" = "goldmontplus" ]; then - 
sed -i -e 's/# CONFIG_MGOLDMONTPLUS is not set/CONFIG_MGOLDMONTPLUS=y/' ./.config - elif [ "$_processor_opt" = "cascadelake" ]; then - sed -i -e 's/# CONFIG_MCASCADELAKE is not set/CONFIG_MCASCADELAKE=y/' ./.config - elif [ "$_processor_opt" = "cooperlake" ]; then - sed -i -e 's/# CONFIG_MCOOPERLAKE is not set/CONFIG_MCOOPERLAKE=y/' ./.config - elif [ "$_processor_opt" = "tigerlake" ]; then - sed -i -e 's/# CONFIG_MTIGERLAKE is not set/CONFIG_MTIGERLAKE=y/' ./.config - fi - - # irq threading - if [ "$_irq_threading" = "true" ]; then - echo "CONFIG_FORCE_IRQ_THREADING=y" >> ./.config - elif [ "$_irq_threading" = "false" ]; then - echo "# CONFIG_FORCE_IRQ_THREADING is not set" >> ./.config - fi - - # smt nice - if [ "$_smt_nice" = "true" ]; then - echo "CONFIG_SMT_NICE=y" >> ./.config - elif [ "$_smt_nice" = "false" ]; then - echo "# CONFIG_SMT_NICE is not set" >> ./.config - fi - - # random trust cpu - if [ "$_random_trust_cpu" = "true" ]; then - sed -i -e 's/# CONFIG_RANDOM_TRUST_CPU is not set/CONFIG_RANDOM_TRUST_CPU=y/' ./.config - fi - - # rq sharing - if [ "$_runqueue_sharing" = "none" ]; then - echo -e "CONFIG_RQ_NONE=y\n# CONFIG_RQ_SMT is not set\n# CONFIG_RQ_MC is not set\n# CONFIG_RQ_MC_LLC is not set\n# CONFIG_RQ_SMP is not set\n# CONFIG_RQ_ALL is not set" >> ./.config - elif [ -z "$_runqueue_sharing" ] || [ "$_runqueue_sharing" = "smt" ]; then - echo -e "# CONFIG_RQ_NONE is not set\nCONFIG_RQ_SMT=y\n# CONFIG_RQ_MC is not set\n# CONFIG_RQ_MC_LLC is not set\n# CONFIG_RQ_SMP is not set\n# CONFIG_RQ_ALL is not set" >> ./.config - elif [ "$_runqueue_sharing" = "mc" ]; then - echo -e "# CONFIG_RQ_NONE is not set\n# CONFIG_RQ_SMT is not set\nCONFIG_RQ_MC=y\n# CONFIG_RQ_MC_LLC is not set\n# CONFIG_RQ_SMP is not set\n# CONFIG_RQ_ALL is not set" >> ./.config - elif [ "$_runqueue_sharing" = "smp" ]; then - echo -e "# CONFIG_RQ_NONE is not set\n# CONFIG_RQ_SMT is not set\n# CONFIG_RQ_MC is not set\n# CONFIG_RQ_MC_LLC is not set\nCONFIG_RQ_SMP=y\n# CONFIG_RQ_ALL is 
not set" >> ./.config - elif [ "$_runqueue_sharing" = "all" ]; then - echo -e "# CONFIG_RQ_NONE is not set\n# CONFIG_RQ_SMT is not set\n# CONFIG_RQ_MC is not set\n# CONFIG_RQ_MC_LLC is not set\n# CONFIG_RQ_SMP is not set\nCONFIG_RQ_ALL=y" >> ./.config - elif [ "$_runqueue_sharing" = "mc-llc" ]; then - echo -e "# CONFIG_RQ_NONE is not set\n# CONFIG_RQ_SMT is not set\n# CONFIG_RQ_MC is not set\nCONFIG_RQ_MC_LLC=y\n# CONFIG_RQ_SMP is not set\n# CONFIG_RQ_ALL is not set" >> ./.config - fi - - # timer freq - if [ -n "$_timer_freq" ] && [ "$_timer_freq" != "300" ]; then - sed -i -e 's/CONFIG_HZ_300=y/# CONFIG_HZ_300 is not set/' ./.config - sed -i -e 's/CONFIG_HZ_300_NODEF=y/# CONFIG_HZ_300_NODEF is not set/' ./.config - if [ "$_timer_freq" = "1000" ]; then - sed -i -e 's/# CONFIG_HZ_1000 is not set/CONFIG_HZ_1000=y/' ./.config - sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=1000/' ./.config - echo "# CONFIG_HZ_500 is not set" >> ./.config - echo "# CONFIG_HZ_500_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_750 is not set" >> ./.config - echo "# CONFIG_HZ_750_NODEF is not set" >> ./.config - echo "CONFIG_HZ_1000_NODEF=y" >> ./.config - echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config - elif [ "$_timer_freq" = "750" ]; then - sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=750/' ./.config - echo "# CONFIG_HZ_500 is not set" >> ./.config - echo "# CONFIG_HZ_500_NODEF is not set" >> ./.config - echo "CONFIG_HZ_750=y" >> ./.config - echo "CONFIG_HZ_750_NODEF=y" >> ./.config - echo "# CONFIG_HZ_1000_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config - elif [ "$_timer_freq" = "500" ]; then - sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=500/' ./.config - echo "CONFIG_HZ_500=y" >> ./.config - echo "CONFIG_HZ_500_NODEF=y" >> ./.config - echo "# CONFIG_HZ_750 is not set" >> ./.config - echo "# CONFIG_HZ_750_NODEF is not set" >> ./.config - echo "# 
CONFIG_HZ_1000_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config - elif [ "$_timer_freq" = "100" ]; then - sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=100/' ./.config - echo "# CONFIG_HZ_500 is not set" >> ./.config - echo "# CONFIG_HZ_750 is not set" >> ./.config - echo "# CONFIG_HZ_1000_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_750_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_500_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config - echo "CONFIG_HZ_100=y" >> ./.config - echo "CONFIG_HZ_100_NODEF=y" >> ./.config - fi - elif [ "${_cpusched}" = "MuQSS" ] && [ -z "$_timer_freq" ]; then - sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=100/' ./.config - echo "# CONFIG_HZ_500 is not set" >> ./.config - echo "# CONFIG_HZ_750 is not set" >> ./.config - echo "# CONFIG_HZ_1000_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_750_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_500_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config - echo "CONFIG_HZ_100=y" >> ./.config - echo "CONFIG_HZ_100_NODEF=y" >> ./.config - else - sed -i -e 's/CONFIG_HZ_300=y/# CONFIG_HZ_300 is not set/' ./.config - sed -i -e 's/CONFIG_HZ_300_NODEF=y/# CONFIG_HZ_300_NODEF is not set/' ./.config - sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=500/' ./.config - echo "CONFIG_HZ_500=y" >> ./.config - echo "CONFIG_HZ_500_NODEF=y" >> ./.config - echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config - fi - - # default cpu gov - if [ "$_default_cpu_gov" = "performance" ]; then - sed -i -e 's/CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL=y/# CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL is not set/' ./.config - sed -i -e 's/# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not 
set/CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y/' ./.config - elif [ "$_default_cpu_gov" = "ondemand" ]; then - sed -i -e 's/CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL=y/# CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL is not set/' ./.config - sed -i -e 's/# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set/CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y/' ./.config - fi - - # ACPI_CPUFREQ disablement - if [ "$_disable_acpi_cpufreq" = "true" ]; then - sed -i -e 's/CONFIG_X86_ACPI_CPUFREQ=m/# CONFIG_X86_ACPI_CPUFREQ is not set/' ./.config - fi - - # ftrace - if [ -z "$_ftracedisable" ]; then - plain "" - plain "Disable FUNCTION_TRACER/GRAPH_TRACER? Lowers overhead but limits debugging" - plain "and analyzing of kernel functions." - read -rp "`echo $' > N/y : '`" CONDITION2; - fi - if [[ "$CONDITION2" =~ [yY] ]] || [ "$_ftracedisable" = "true" ]; then - sed -i -e 's/CONFIG_FUNCTION_TRACER=y/# CONFIG_FUNCTION_TRACER is not set/' ./.config - sed -i -e 's/CONFIG_FUNCTION_GRAPH_TRACER=y/# CONFIG_FUNCTION_GRAPH_TRACER is not set/' ./.config - fi - - # disable numa - if [ -z "$_numadisable" ]; then - plain "" - plain "Disable NUMA? Lowers overhead, but breaks CUDA/NvEnc on Nvidia if disabled." - plain "https://bbs.archlinux.org/viewtopic.php?id=239174" - read -rp "`echo $' > N/y : '`" CONDITION3; - fi - if [[ "$CONDITION3" =~ [yY] ]] || [ "$_numadisable" = "true" ]; then - # disable NUMA since 99.9% of users do not have multiple CPUs but do have multiple cores in one CPU - sed -i -e 's/CONFIG_NUMA=y/# CONFIG_NUMA is not set/' \ - -i -e '/CONFIG_AMD_NUMA=y/d' \ - -i -e '/CONFIG_X86_64_ACPI_NUMA=y/d' \ - -i -e '/CONFIG_NODES_SPAN_OTHER_NODES=y/d' \ - -i -e '/# CONFIG_NUMA_EMU is not set/d' \ - -i -e '/CONFIG_NODES_SHIFT=6/d' \ - -i -e '/CONFIG_NEED_MULTIPLE_NODES=y/d' \ - -i -e '/CONFIG_USE_PERCPU_NUMA_NODE_ID=y/d' \ - -i -e '/CONFIG_ACPI_NUMA=y/d' ./.config - fi - - # tickless - if [ -z "$_tickless" ]; then - plain "" - plain "Use CattaRappa mode (Tickless/Dynticks) ?" 
- plain "Can give higher performances in many cases but lower consistency on some hardware." - plain "Just tickless idle can perform better with some platforms (mostly AMD) or CPU schedulers (mostly MuQSS)." - if [ "${_cpusched}" = "MuQSS" ]; then - read -rp "`echo $'\n 0.No, use periodic ticks\n 1.Yes, full tickless baby!\n > 2.Just tickless idle plz\n [0-2?]: '`" CONDITION4; - else - read -rp "`echo $'\n 0.No, use periodic ticks\n > 1.Yes, full tickless baby!\n 2.Just tickless idle plz\n [0-2?]: '`" CONDITION4; - fi - fi - if [ "$CONDITION4" = "0" ] || [ "$_tickless" = "0" ]; then - echo "# CONFIG_NO_HZ_FULL_NODEF is not set" >> ./.config - sed -i -e 's/# CONFIG_HZ_PERIODIC is not set/CONFIG_HZ_PERIODIC=y/' ./.config - sed -i -e 's/CONFIG_NO_HZ_IDLE=y/# CONFIG_NO_HZ_IDLE is not set/' ./.config - sed -i -e 's/CONFIG_NO_HZ_FULL=y/# CONFIG_NO_HZ_FULL is not set/' ./.config - sed -i -e 's/CONFIG_NO_HZ=y/# CONFIG_NO_HZ is not set/' ./.config - sed -i -e 's/CONFIG_NO_HZ_COMMON=y/# CONFIG_NO_HZ_COMMON is not set/' ./.config - elif [ "$CONDITION4" = "2" ] || [ "$_tickless" = "2" ]; then - echo "# CONFIG_NO_HZ_FULL_NODEF is not set" >> ./.config - sed -i -e 's/CONFIG_HZ_PERIODIC=y/# CONFIG_HZ_PERIODIC is not set/' ./.config - sed -i -e 's/# CONFIG_NO_HZ_IDLE is not set/CONFIG_NO_HZ_IDLE=y/' ./.config - sed -i -e 's/CONFIG_NO_HZ_FULL=y/# CONFIG_NO_HZ_FULL is not set/' ./.config - sed -i -e 's/# CONFIG_NO_HZ is not set/CONFIG_NO_HZ=y/' ./.config - sed -i -e 's/# CONFIG_NO_HZ_COMMON is not set/CONFIG_NO_HZ_COMMON=y/' ./.config - else - if [ "${_cpusched}" = "MuQSS" ]; then - echo "# CONFIG_NO_HZ_FULL_NODEF is not set" >> ./.config - sed -i -e 's/CONFIG_HZ_PERIODIC=y/# CONFIG_HZ_PERIODIC is not set/' ./.config - sed -i -e 's/# CONFIG_NO_HZ_IDLE is not set/CONFIG_NO_HZ_IDLE=y/' ./.config - sed -i -e 's/CONFIG_NO_HZ_FULL=y/# CONFIG_NO_HZ_FULL is not set/' ./.config - sed -i -e 's/# CONFIG_NO_HZ is not set/CONFIG_NO_HZ=y/' ./.config - sed -i -e 's/# CONFIG_NO_HZ_COMMON is not 
set/CONFIG_NO_HZ_COMMON=y/' ./.config - else - echo "CONFIG_NO_HZ_FULL_NODEF=y" >> ./.config - sed -i -e 's/CONFIG_HZ_PERIODIC=y/# CONFIG_HZ_PERIODIC is not set/' ./.config - sed -i -e 's/CONFIG_NO_HZ_IDLE=y/# CONFIG_NO_HZ_IDLE is not set/' ./.config - sed -i -e 's/# CONFIG_NO_HZ_FULL is not set/CONFIG_NO_HZ_FULL=y/' ./.config - sed -i -e 's/# CONFIG_NO_HZ is not set/CONFIG_NO_HZ=y/' ./.config - sed -i -e 's/# CONFIG_NO_HZ_COMMON is not set/CONFIG_NO_HZ_COMMON=y/' ./.config - echo "CONFIG_CONTEXT_TRACKING=y" >> ./.config - echo "# CONFIG_CONTEXT_TRACKING_FORCE is not set" >> ./.config - fi - fi - - # voluntary preempt - if [ -z "$_voluntary_preempt" ]; then - plain "" - plain "Use explicit preemption points?" - plain "It can improve latency on PDS (at the cost of throughput)" - plain "and improve throughput on other schedulers (at the cost of latency)" - read -rp "`echo $' > N/y : '`" CONDITION5; - fi - if [[ "$CONDITION5" =~ [yY] ]] || [ "$_voluntary_preempt" = "true" ]; then - sed -i -e 's/CONFIG_PREEMPT=y/# CONFIG_PREEMPT is not set/' ./.config - sed -i -e 's/CONFIG_PREEMPT_LL=y/# CONFIG_PREEMPT_LL is not set/' ./.config - sed -i -e 's/# CONFIG_PREEMPT_VOLUNTARY is not set/CONFIG_PREEMPT_VOLUNTARY=y/' ./.config - fi - - # Open Firmware support - if [ -z "$_OFenable" ]; then - plain "" - plain "Enable Device Tree and Open Firmware support?" - read -rp "`echo $' > N/y : '`" CONDITION6; - fi - if [[ "$CONDITION6" =~ [yY] ]] || [ "$_OFenable" = "true" ]; then - sed -i -e 's/# CONFIG_OF is not set/CONFIG_OF=y/' ./.config - fi - - # acs override - if [ -z "$_acs_override" ]; then - plain "" - plain "Use ACS override patch?" 
- plain "https://wiki.archlinux.org/index.php/PCI_passthrough_via_OVMF#Bypassing_the_IOMMU_groups_.28ACS_override_patch.29" - read -rp "`echo $' > N/y : '`" CONDITION7; - fi - if [[ "$CONDITION7" =~ [yY] ]] || [ "$_acs_override" = "true" ]; then - msg2 "Patching ACS override" - patch -Np1 -i "$srcdir"/0006-add-acs-overrides_iommu.patch - fi - - # bcachefs - if [ -z "$_bcachefs" ]; then - plain "" - plain "Add Bcache filesystem support? You'll have to install bcachefs-tools-git from AUR for utilities." - plain "https://bcachefs.org/" - read -rp "`echo $' > N/y : '`" CONDITION8; - fi - if [[ "$CONDITION8" =~ [yY] ]] || [ "$_bcachefs" = "true" ]; then - msg2 "Patching Bcache filesystem support override" - patch -Np1 -i "$srcdir"/0008-5.7-bcachefs.patch - - echo "CONFIG_BCACHEFS_FS=m" >> ./.config - echo "CONFIG_BCACHEFS_QUOTA=y" >> ./.config - echo "CONFIG_BCACHEFS_POSIX_ACL=y" >> ./.config - echo "# CONFIG_BCACHEFS_DEBUG is not set" >> ./.config - echo "# CONFIG_BCACHEFS_TESTS is not set" >> ./.config - echo "# CONFIG_DEBUG_CLOSURES is not set" >> ./.config - fi - - # fsync support - if [ -z "$_fsync" ]; then - plain "" - plain "Enable support for fsync, an experimental replacement for esync in Valve Proton 4.11+" - plain "https://steamcommunity.com/games/221410/announcements/detail/2957094910196249305" - read -rp "`echo $' > N/y : '`" CONDITION9; - fi - if [[ "$CONDITION9" =~ [yY] ]] || [ "$_fsync" = "true" ]; then - msg2 "Patching Fsync support" - patch -Np1 -i "$srcdir"/0007-v5.7-fsync.patch - fi - - # ZFS fix - if [ -z "$_zfsfix" ]; then - plain "" - plain "Add back missing symbol for AES-NI/AVX support on ZFS" - plain "https://github.com/NixOS/nixpkgs/blob/master/pkgs/os-specific/linux/kernel/export_kernel_fpu_functions_5_3.patch" - read -rp "`echo $' > N/y : '`" CONDITION11; - fi - if [[ "$CONDITION11" =~ [yY] ]] || [ "$_zfsfix" = "true" ]; then - msg2 "Patching missing symbol for AES-NI/AVX support on ZFS" - patch -Np1 -i "$srcdir"/0011-ZFS-fix.patch - fi - - 
# Community patches - if [ -n "$_community_patches" ]; then - if [ ! -d "$_where/../../community-patches" ]; then - cd "$_where/../.." && git clone https://github.com/Frogging-Family/community-patches.git && cd "${srcdir}/linux-${_basekernel}" - fi - _community_patches=($_community_patches) - for _p in ${_community_patches[@]}; do - ln -s "$_where"/../../community-patches/linux"$_basever"-tkg/$_p "$_where"/ - done - fi - - # userpatches - if [ "$_user_patches" = "true" ]; then - _userpatch_target="linux-${_basekernel}" - _userpatch_ext="my" - user_patcher - fi - - # Community patches removal - for _p in ${_community_patches[@]}; do - rm -f "$_where"/$_p - done - - if [ "$_distro" = "Arch" ]; then - # don't run depmod on 'make install'. We'll do this ourselves in packaging - sed -i '2iexit 0' scripts/depmod.sh - - # get kernel version - make prepare - fi - - # modprobed-db - if [ -z "$_modprobeddb" ]; then - plain "" - plain "Use modprobed db to clean config from unneeded modules?" - plain "Speeds up compilation considerably. Requires root." - plain "https://wiki.archlinux.org/index.php/Modprobed-db" - plain "!!!! Make sure to have a well populated db !!!!" - read -rp "`echo $' > N/y : '`" CONDITIONMPDB; - fi - if [[ "$CONDITIONMPDB" =~ [yY] ]] || [ "$_modprobeddb" = "true" ]; then - sudo modprobed-db recall - yes "" | make localmodconfig - fi - - if [ true = "$_config_fragments" ]; then - local fragments=() - mapfile -d '' -t fragments < <(find "$_where"/ -type f -name "*.myfrag" -print0) - - if [ true = "$_config_fragments_no_confirm" ]; then - printf 'Using config fragment %s\n' "${fragments[@]#$_where/}" - else - for i in "${!fragments[@]}"; do - while true; do - read -r -p 'Found config fragment '"${fragments[$i]#$_where/}"', apply it? 
[y/N] ' CONDITIONMPDB - CONDITIONMPDB="$(printf '%s' "$CONDITIONMPDB" | tr '[:upper:]' '[:lower:]')" - case "$CONDITIONMPDB" in - y|yes) - break;; - n|no|'') - unset fragments[$i] - break;; - *) - echo 'Please answer with yes or no' - esac - done - done - fi - - if [ 0 -lt "${#fragments[@]}" ]; then - scripts/kconfig/merge_config.sh -m .config "${fragments[@]}" - fi - fi - - # menuconfig / nconfig - if [ -z "$_menunconfig" ]; then - plain "" - plain "*Optional* For advanced users - Do you want to use make menuconfig or nconfig" - plain "to configure the kernel before building it?" - plain "If you do, make sure your terminal is currently" - plain "at least 19 lines by 80 columns large or you'll get an error :D" - read -rp "`echo $' > 0. nope\n 1. menuconfig\n 2. nconfig\n 3. xconfig\n choice[0-3?]: '`" CONDITIONMNC; - _menunconfig="$CONDITIONMNC" - fi - if [ 1 = "$_menunconfig" ]; then - cp .config .config.orig - make menuconfig - elif [ 2 = "$_menunconfig" ]; then - cp .config .config.orig - make nconfig - elif [ 3 = "$_menunconfig" ]; then - cp .config .config.orig - make xconfig - else - # rewrite configuration - yes "" | make config >/dev/null - fi - if [ 1 = "$_menunconfig" ] || [ 2 = "$_menunconfig" ] || [ 3 = "$_menunconfig" ]; then - if [ -z "${_diffconfig}" ]; then - while true; do - read -r -p 'Generate a config fragment from your changes? [y/N] ' CONDITIONF - CONDITIONF="$(printf '%s' "$CONDITIONF" | tr '[:upper:]' '[:lower:]')" - case "$CONDITIONF" in - y|yes) - _diffconfig=true - break;; - n|no|'') - _diffconfig=false - break;; - *) - echo 'Please answer with yes or no' - esac - done - fi - if [ true = "$_diffconfig" ]; then - if [ -z "$_diffconfig_name" ]; then - IFS= read -r -p 'Filename for the config fragment [leave empty to not generate fragment]: ' _diffconfig_name - fi - if [ -z "$_diffconfig_name" ]; then - echo 'No file name given, not generating config fragment.' 
- else ( - prev_pwd="${PWD:-$(pwd)}" - cd "$_where" - "${prev_pwd}/scripts/diffconfig" -m "${prev_pwd}/.config.orig" "${prev_pwd}/.config" > "$_diffconfig_name" - ) fi - fi - rm .config.orig - fi - - if [ "$_distro" = "Arch" ]; then - make -s kernelrelease > version - msg2 "Prepared %s version %s" "$pkgbase" "$( -From: Serge Hallyn -Date: Fri, 31 May 2013 19:12:12 +0100 -Subject: [PATCH] add sysctl to disallow unprivileged CLONE_NEWUSER by default - -Signed-off-by: Serge Hallyn -[bwh: Remove unneeded binary sysctl bits] -Signed-off-by: Daniel Micay ---- - kernel/fork.c | 15 +++++++++++++++ - kernel/sysctl.c | 12 ++++++++++++ - kernel/user_namespace.c | 3 +++ - 3 files changed, 30 insertions(+) - -diff --git a/kernel/fork.c b/kernel/fork.c -index 07cc743698d3668e..4011d68a8ff9305c 100644 ---- a/kernel/fork.c -+++ b/kernel/fork.c -@@ -102,6 +102,11 @@ - - #define CREATE_TRACE_POINTS - #include -+#ifdef CONFIG_USER_NS -+extern int unprivileged_userns_clone; -+#else -+#define unprivileged_userns_clone 0 -+#endif - - /* - * Minimum number of threads to boot the kernel -@@ -1555,6 +1560,10 @@ static __latent_entropy struct task_struct *copy_process( - if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) - return ERR_PTR(-EINVAL); - -+ if ((clone_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) -+ if (!capable(CAP_SYS_ADMIN)) -+ return ERR_PTR(-EPERM); -+ - /* - * Thread groups must share signals as well, and detached threads - * can only be started up within the thread group. 
-@@ -2348,6 +2357,12 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) - if (unshare_flags & CLONE_NEWNS) - unshare_flags |= CLONE_FS; - -+ if ((unshare_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) { -+ err = -EPERM; -+ if (!capable(CAP_SYS_ADMIN)) -+ goto bad_unshare_out; -+ } -+ - err = check_unshare_flags(unshare_flags); - if (err) - goto bad_unshare_out; -diff --git a/kernel/sysctl.c b/kernel/sysctl.c -index b86520ed3fb60fbf..f7dab3760839f1a1 100644 ---- a/kernel/sysctl.c -+++ b/kernel/sysctl.c -@@ -105,6 +105,9 @@ extern int core_uses_pid; - extern char core_pattern[]; - extern unsigned int core_pipe_limit; - #endif -+#ifdef CONFIG_USER_NS -+extern int unprivileged_userns_clone; -+#endif - extern int pid_max; - extern int pid_max_min, pid_max_max; - extern int percpu_pagelist_fraction; -@@ -513,6 +516,15 @@ static struct ctl_table kern_table[] = { - .proc_handler = proc_dointvec, - }, - #endif -+#ifdef CONFIG_USER_NS -+ { -+ .procname = "unprivileged_userns_clone", -+ .data = &unprivileged_userns_clone, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = proc_dointvec, -+ }, -+#endif - #ifdef CONFIG_PROC_SYSCTL - { - .procname = "tainted", -diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c -index c490f1e4313b998a..dd03bd39d7bf194d 100644 ---- a/kernel/user_namespace.c -+++ b/kernel/user_namespace.c -@@ -24,6 +24,9 @@ - #include - #include - -+/* sysctl */ -+int unprivileged_userns_clone; -+ - static struct kmem_cache *user_ns_cachep __read_mostly; - static DEFINE_MUTEX(userns_state_mutex); - --- -2.15.1 - -From b5202296055dd333db4425120d3f93ef4e6a0573 Mon Sep 17 00:00:00 2001 -From: "Jan Alexander Steffens (heftig)" -Date: Thu, 7 Dec 2017 13:50:48 +0100 -Subject: ZEN: Add CONFIG for unprivileged_userns_clone - -This way our default behavior continues to match the vanilla kernel. 
---- - init/Kconfig | 16 ++++++++++++++++ - kernel/user_namespace.c | 4 ++++ - 2 files changed, 20 insertions(+) - -diff --git a/init/Kconfig b/init/Kconfig -index 4592bf7997c0..f3df02990aff 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -1004,6 +1004,22 @@ config USER_NS - - If unsure, say N. - -+config USER_NS_UNPRIVILEGED -+ bool "Allow unprivileged users to create namespaces" -+ default y -+ depends on USER_NS -+ help -+ When disabled, unprivileged users will not be able to create -+ new namespaces. Allowing users to create their own namespaces -+ has been part of several recent local privilege escalation -+ exploits, so if you need user namespaces but are -+ paranoid^Wsecurity-conscious you want to disable this. -+ -+ This setting can be overridden at runtime via the -+ kernel.unprivileged_userns_clone sysctl. -+ -+ If unsure, say Y. -+ - config PID_NS - bool "PID Namespaces" - default y -diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c -index 6b9dbc257e34..107b17f0d528 100644 ---- a/kernel/user_namespace.c -+++ b/kernel/user_namespace.c -@@ -27,7 +27,11 @@ - #include - - /* sysctl */ -+#ifdef CONFIG_USER_NS_UNPRIVILEGED -+int unprivileged_userns_clone = 1; -+#else - int unprivileged_userns_clone; -+#endif - - static struct kmem_cache *user_ns_cachep __read_mostly; - static DEFINE_MUTEX(userns_state_mutex); diff --git a/linux57-tkg/linux57-tkg-patches/0002-clear-patches.patch b/linux57-tkg/linux57-tkg-patches/0002-clear-patches.patch deleted file mode 100644 index a7c9d4a..0000000 --- a/linux57-tkg/linux57-tkg-patches/0002-clear-patches.patch +++ /dev/null @@ -1,354 +0,0 @@ -From 2ac70785613ef4c6b16414986bb18bd7b60d2a13 Mon Sep 17 00:00:00 2001 -From: Arjan van de Ven -Date: Mon, 14 Mar 2016 11:10:58 -0600 -Subject: [PATCH] pci pme wakeups - -Reduce wakeups for PME checks, which are a workaround for miswired -boards (sadly, too many of them) in laptops. 
---- - drivers/pci/pci.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c -index c25acace7d91..0ddebdad9f5b 100644 ---- a/drivers/pci/pci.c -+++ b/drivers/pci/pci.c -@@ -61,7 +61,7 @@ struct pci_pme_device { - struct pci_dev *dev; - }; - --#define PME_TIMEOUT 1000 /* How long between PME checks */ -+#define PME_TIMEOUT 4000 /* How long between PME checks */ - - static void pci_dev_d3_sleep(struct pci_dev *dev) - { --- -2.20.1 - -From 7e7e36c67aa71d6a1ec5676d99d37c1fea389ceb Mon Sep 17 00:00:00 2001 -From: Arjan van de Ven -Date: Sat, 19 Mar 2016 21:32:19 -0400 -Subject: [PATCH] intel_idle: tweak cpuidle cstates - -Increase target_residency in cpuidle cstate - -Tune intel_idle to be a bit less agressive; -Clear linux is cleaner in hygiene (wakupes) than the average linux, -so we can afford changing these in a way that increases -performance while keeping power efficiency ---- - drivers/idle/intel_idle.c | 44 +++++++++++++++++++-------------------- - 1 file changed, 22 insertions(+), 22 deletions(-) - -diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c -index 8b5d85c91e9d..5e2d813a048d 100644 ---- a/drivers/idle/intel_idle.c -+++ b/drivers/idle/intel_idle.c -@@ -466,7 +466,7 @@ static struct cpuidle_state hsw_cstates[] = { - .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01), - .exit_latency = 10, -- .target_residency = 20, -+ .target_residency = 120, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -474,7 +474,7 @@ static struct cpuidle_state hsw_cstates[] = { - .desc = "MWAIT 0x10", - .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 33, -- .target_residency = 100, -+ .target_residency = 900, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -482,7 +482,7 @@ static struct cpuidle_state hsw_cstates[] = { - .desc = "MWAIT 0x20", - .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 133, -- .target_residency = 
400, -+ .target_residency = 1000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -490,7 +490,7 @@ static struct cpuidle_state hsw_cstates[] = { - .desc = "MWAIT 0x32", - .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 166, -- .target_residency = 500, -+ .target_residency = 1500, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -498,7 +498,7 @@ static struct cpuidle_state hsw_cstates[] = { - .desc = "MWAIT 0x40", - .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 300, -- .target_residency = 900, -+ .target_residency = 2000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -506,7 +506,7 @@ static struct cpuidle_state hsw_cstates[] = { - .desc = "MWAIT 0x50", - .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 600, -- .target_residency = 1800, -+ .target_residency = 5000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -514,7 +514,7 @@ static struct cpuidle_state hsw_cstates[] = { - .desc = "MWAIT 0x60", - .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 2600, -- .target_residency = 7700, -+ .target_residency = 9000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -534,7 +534,7 @@ static struct cpuidle_state bdw_cstates[] = { - .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01), - .exit_latency = 10, -- .target_residency = 20, -+ .target_residency = 120, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -542,7 +542,7 @@ static struct cpuidle_state bdw_cstates[] = { - .desc = "MWAIT 0x10", - .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 40, -- .target_residency = 100, -+ .target_residency = 1000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -550,7 +550,7 @@ static struct cpuidle_state bdw_cstates[] = { - .desc = "MWAIT 0x20", - .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency 
= 133, -- .target_residency = 400, -+ .target_residency = 1000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -558,7 +558,7 @@ static struct cpuidle_state bdw_cstates[] = { - .desc = "MWAIT 0x32", - .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 166, -- .target_residency = 500, -+ .target_residency = 2000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -566,7 +566,7 @@ static struct cpuidle_state bdw_cstates[] = { - .desc = "MWAIT 0x40", - .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 300, -- .target_residency = 900, -+ .target_residency = 4000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -574,7 +574,7 @@ static struct cpuidle_state bdw_cstates[] = { - .desc = "MWAIT 0x50", - .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 600, -- .target_residency = 1800, -+ .target_residency = 7000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -582,7 +582,7 @@ static struct cpuidle_state bdw_cstates[] = { - .desc = "MWAIT 0x60", - .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 2600, -- .target_residency = 7700, -+ .target_residency = 9000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -603,7 +603,7 @@ static struct cpuidle_state skl_cstates[] = { - .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01), - .exit_latency = 10, -- .target_residency = 20, -+ .target_residency = 120, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -611,7 +611,7 @@ static struct cpuidle_state skl_cstates[] = { - .desc = "MWAIT 0x10", - .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 70, -- .target_residency = 100, -+ .target_residency = 1000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -619,7 +619,7 @@ static struct cpuidle_state skl_cstates[] = { - .desc = "MWAIT 0x20", - .flags = MWAIT2flg(0x20) | 
CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 85, -- .target_residency = 200, -+ .target_residency = 600, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -627,7 +627,7 @@ static struct cpuidle_state skl_cstates[] = { - .desc = "MWAIT 0x33", - .flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 124, -- .target_residency = 800, -+ .target_residency = 3000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -635,7 +635,7 @@ static struct cpuidle_state skl_cstates[] = { - .desc = "MWAIT 0x40", - .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 200, -- .target_residency = 800, -+ .target_residency = 3200, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -643,7 +643,7 @@ static struct cpuidle_state skl_cstates[] = { - .desc = "MWAIT 0x50", - .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 480, -- .target_residency = 5000, -+ .target_residency = 9000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -651,7 +651,7 @@ static struct cpuidle_state skl_cstates[] = { - .desc = "MWAIT 0x60", - .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 890, -- .target_residency = 5000, -+ .target_residency = 9000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -672,7 +672,7 @@ static struct cpuidle_state skx_cstates[] = { - .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01), - .exit_latency = 10, -- .target_residency = 20, -+ .target_residency = 300, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { --- -2.20.1 - -From b8211d4f79dd88dfc2d4bd52be46103ea0b70e3e Mon Sep 17 00:00:00 2001 -From: Arjan van de Ven -Date: Fri, 6 Jan 2017 15:34:09 +0000 -Subject: [PATCH] ipv4/tcp: allow the memory tuning for tcp to go a little - bigger than default - ---- - net/ipv4/tcp.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c -index 
cf3c5095c10e..b30d51837b2d 100644 ---- a/net/ipv4/tcp.c -+++ b/net/ipv4/tcp.c -@@ -3897,8 +3897,8 @@ void __init tcp_init(void) - tcp_init_mem(); - /* Set per-socket limits to no more than 1/128 the pressure threshold */ - limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); -- max_wshare = min(4UL*1024*1024, limit); -- max_rshare = min(6UL*1024*1024, limit); -+ max_wshare = min(16UL*1024*1024, limit); -+ max_rshare = min(16UL*1024*1024, limit); - - init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; - init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; --- -2.20.1 - -From 050223869257b87e22636158a80da38d877248ed Mon Sep 17 00:00:00 2001 -From: Arjan van de Ven -Date: Sun, 18 Feb 2018 23:35:41 +0000 -Subject: [PATCH] locking: rwsem: spin faster - -tweak rwsem owner spinning a bit ---- - kernel/locking/rwsem.c | 4 +++- - 1 file changed, 3 insertions(+), 1 deletion(-) - -diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c -index eef04551eae7..1ec5ab4c8ff7 100644 ---- a/kernel/locking/rwsem.c -+++ b/kernel/locking/rwsem.c -@@ -720,6 +720,7 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) - struct task_struct *new, *owner; - unsigned long flags, new_flags; - enum owner_state state; -+ int i = 0; - - owner = rwsem_owner_flags(sem, &flags); - state = rwsem_owner_state(owner, flags, nonspinnable); -@@ -753,7 +754,8 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) - break; - } - -- cpu_relax(); -+ if (i++ > 1000) -+ cpu_relax(); - } - rcu_read_unlock(); - -From b836ea320114643d4354b43acb6ec8bb06ada487 Mon Sep 17 00:00:00 2001 -From: Arjan van de Ven -Date: Thu, 2 Jun 2016 23:36:32 -0500 -Subject: [PATCH] drivers: Initialize ata before graphics - -ATA init is the long pole in the boot process, and its asynchronous. 
-move the graphics init after it so that ata and graphics initialize -in parallel ---- - drivers/Makefile | 15 ++++++++------- - 1 file changed, 8 insertions(+), 7 deletions(-) - -diff --git a/drivers/Makefile b/drivers/Makefile -index aaef17cc6512..d08f3a394929 100644 ---- a/drivers/Makefile -+++ b/drivers/Makefile -@@ -58,15 +58,8 @@ obj-y += char/ - # iommu/ comes before gpu as gpu are using iommu controllers - obj-y += iommu/ - --# gpu/ comes after char for AGP vs DRM startup and after iommu --obj-y += gpu/ -- - obj-$(CONFIG_CONNECTOR) += connector/ - --# i810fb and intelfb depend on char/agp/ --obj-$(CONFIG_FB_I810) += video/fbdev/i810/ --obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ -- - obj-$(CONFIG_PARPORT) += parport/ - obj-$(CONFIG_NVM) += lightnvm/ - obj-y += base/ block/ misc/ mfd/ nfc/ -@@ -79,6 +72,14 @@ obj-$(CONFIG_IDE) += ide/ - obj-y += scsi/ - obj-y += nvme/ - obj-$(CONFIG_ATA) += ata/ -+ -+# gpu/ comes after char for AGP vs DRM startup and after iommu -+obj-y += gpu/ -+ -+# i810fb and intelfb depend on char/agp/ -+obj-$(CONFIG_FB_I810) += video/fbdev/i810/ -+obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ -+ - obj-$(CONFIG_TARGET_CORE) += target/ - obj-$(CONFIG_MTD) += mtd/ - obj-$(CONFIG_SPI) += spi/ diff --git a/linux57-tkg/linux57-tkg-patches/0003-glitched-base.patch b/linux57-tkg/linux57-tkg-patches/0003-glitched-base.patch deleted file mode 100644 index 0cd2ef0..0000000 --- a/linux57-tkg/linux57-tkg-patches/0003-glitched-base.patch +++ /dev/null @@ -1,545 +0,0 @@ -From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 -From: Tk-Glitch -Date: Wed, 4 Jul 2018 04:30:08 +0200 -Subject: glitched - -diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h -index 87f1fc9..b3be470 100755 ---- a/scripts/mkcompile_h -+++ b/scripts/mkcompile_h -@@ -50,8 +50,8 @@ else - fi - - UTS_VERSION="#$VERSION" --CONFIG_FLAGS="" --if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi -+CONFIG_FLAGS="TKG" -+if [ -n "$SMP" ] ; then 
CONFIG_FLAGS="$CONFIG_FLAGS SMP"; fi - if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi - UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP" - -diff --git a/fs/dcache.c b/fs/dcache.c -index 2acfc69878f5..3f1131431e06 100644 ---- a/fs/dcache.c -+++ b/fs/dcache.c -@@ -69,7 +69,7 @@ - * If no ancestor relationship: - * arbitrary, since it's serialized on rename_lock - */ --int sysctl_vfs_cache_pressure __read_mostly = 100; -+int sysctl_vfs_cache_pressure __read_mostly = 50; - EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); - - __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); -diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index 211890edf37e..37121563407d 100644 ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c -@@ -41,7 +41,7 @@ const_debug unsigned int sysctl_sched_features = - * Number of tasks to iterate in a single balance run. - * Limited because this is done with IRQs disabled. - */ --const_debug unsigned int sysctl_sched_nr_migrate = 32; -+const_debug unsigned int sysctl_sched_nr_migrate = 128; - - /* - * period over which we average the RT time consumption, measured -@@ -61,9 +61,9 @@ __read_mostly int scheduler_running; - - /* - * part of the period that we allow rt tasks to run in us. -- * default: 0.95s -+ * XanMod default: 0.98s - */ --int sysctl_sched_rt_runtime = 950000; -+int sysctl_sched_rt_runtime = 980000; - - /* - * __task_rq_lock - lock the rq @p resides on. 
-diff --git a/scripts/setlocalversion b/scripts/setlocalversion -index 71f39410691b..288f9679e883 100755 ---- a/scripts/setlocalversion -+++ b/scripts/setlocalversion -@@ -54,7 +54,7 @@ scm_version() - # If only the short version is requested, don't bother - # running further git commands - if $short; then -- echo "+" -+ # echo "+" - return - fi - # If we are past a tagged commit (like - -From f85ed068b4d0e6c31edce8574a95757a60e58b87 Mon Sep 17 00:00:00 2001 -From: Etienne Juvigny -Date: Mon, 3 Sep 2018 17:36:25 +0200 -Subject: Zenify & stuff - - -diff --git a/init/Kconfig b/init/Kconfig -index b4daad2bac23..c1e59dc04209 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -1244,7 +1244,6 @@ config CC_OPTIMIZE_FOR_PERFORMANCE - - config CC_OPTIMIZE_FOR_PERFORMANCE_O3 - bool "Optimize more for performance (-O3)" -- depends on ARC - help - Choosing this option will pass "-O3" to your compiler to optimize - the kernel yet more for performance. -diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c -index 4f32c4062fb6..c0bf039e1b40 100644 ---- a/drivers/infiniband/core/addr.c -+++ b/drivers/infiniband/core/addr.c -@@ -721,6 +721,7 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, - struct sockaddr _sockaddr; - struct sockaddr_in _sockaddr_in; - struct sockaddr_in6 _sockaddr_in6; -+ struct sockaddr_ib _sockaddr_ib; - } sgid_addr, dgid_addr; - int ret; - -diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h -index 79226ca8f80f..2a30060e7e1d 100644 ---- a/include/linux/blkdev.h -+++ b/include/linux/blkdev.h -@@ -47,7 +47,11 @@ struct blk_queue_stats; - struct blk_stat_callback; - - #define BLKDEV_MIN_RQ 4 -+#ifdef CONFIG_ZENIFY -+#define BLKDEV_MAX_RQ 512 -+#else - #define BLKDEV_MAX_RQ 128 /* Default maximum */ -+#endif - - /* Must be consistent with blk_mq_poll_stats_bkt() */ - #define BLK_MQ_POLL_STATS_BKTS 16 -diff --git a/init/Kconfig b/init/Kconfig -index 041f3a022122..5ed70eb1ad3a 100644 ---- a/init/Kconfig -+++ 
b/init/Kconfig -@@ -45,6 +45,38 @@ config THREAD_INFO_IN_TASK - - menu "General setup" - -+config ZENIFY -+ bool "A selection of patches from Zen/Liquorix kernel and additional tweaks for a better gaming experience" -+ default y -+ help -+ Tunes the kernel for responsiveness at the cost of throughput and power usage. -+ -+ --- Virtual Memory Subsystem --------------------------- -+ -+ Mem dirty before bg writeback..: 10 % -> 20 % -+ Mem dirty before sync writeback: 20 % -> 50 % -+ -+ --- Block Layer ---------------------------------------- -+ -+ Queue depth...............: 128 -> 512 -+ Default MQ scheduler......: mq-deadline -> bfq -+ -+ --- CFS CPU Scheduler ---------------------------------- -+ -+ Scheduling latency.............: 6 -> 3 ms -+ Minimal granularity............: 0.75 -> 0.3 ms -+ Wakeup granularity.............: 1 -> 0.5 ms -+ CPU migration cost.............: 0.5 -> 0.25 ms -+ Bandwidth slice size...........: 5 -> 3 ms -+ Ondemand fine upscaling limit..: 95 % -> 85 % -+ -+ --- MuQSS CPU Scheduler -------------------------------- -+ -+ Scheduling interval............: 6 -> 3 ms -+ ISO task max realtime use......: 70 % -> 25 % -+ Ondemand coarse upscaling limit: 80 % -> 45 % -+ Ondemand fine upscaling limit..: 95 % -> 45 % -+ - config BROKEN - bool - -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 2f0a0be4d344..bada807c7e59 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -37,8 +37,13 @@ - * - * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) - */ -+#ifdef CONFIG_ZENIFY -+unsigned int sysctl_sched_latency = 3000000ULL; -+static unsigned int normalized_sysctl_sched_latency = 3000000ULL; -+#else - unsigned int sysctl_sched_latency = 6000000ULL; - static unsigned int normalized_sysctl_sched_latency = 6000000ULL; -+#endif - - /* - * The initial- and re-scaling of tunables is configurable -@@ -58,13 +63,22 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_L - * - * (default: 0.75 msec * 
(1 + ilog(ncpus)), units: nanoseconds) - */ -+#ifdef CONFIG_ZENIFY -+unsigned int sysctl_sched_min_granularity = 300000ULL; -+static unsigned int normalized_sysctl_sched_min_granularity = 300000ULL; -+#else - unsigned int sysctl_sched_min_granularity = 750000ULL; - static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; -+#endif - - /* - * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity - */ -+#ifdef CONFIG_ZENIFY -+static unsigned int sched_nr_latency = 10; -+#else - static unsigned int sched_nr_latency = 8; -+#endif - - /* - * After fork, child runs first. If set to 0 (default) then -@@ -81,10 +95,17 @@ unsigned int sysctl_sched_child_runs_first __read_mostly; - * - * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) - */ -+#ifdef CONFIG_ZENIFY -+unsigned int sysctl_sched_wakeup_granularity = 500000UL; -+static unsigned int normalized_sysctl_sched_wakeup_granularity = 500000UL; -+ -+const_debug unsigned int sysctl_sched_migration_cost = 50000UL; -+#else - unsigned int sysctl_sched_wakeup_granularity = 1000000UL; - static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; - - const_debug unsigned int sysctl_sched_migration_cost = 500000UL; -+#endif - - #ifdef CONFIG_SMP - /* -@@ -107,8 +128,12 @@ int __weak arch_asym_cpu_priority(int cpu) - * - * (default: 5 msec, units: microseconds) - */ -+#ifdef CONFIG_ZENIFY -+unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL; -+#else - unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; - #endif -+#endif - - /* - * The margin used when comparing utilization with CPU capacity: -diff --git a/mm/page-writeback.c b/mm/page-writeback.c -index 337c6afb3345..9315e358f292 100644 ---- a/mm/page-writeback.c -+++ b/mm/page-writeback.c -@@ -71,7 +71,11 @@ static long ratelimit_pages = 32; - /* - * Start background writeback (via writeback threads) at this percentage - */ -+#ifdef CONFIG_ZENIFY -+int dirty_background_ratio = 20; -+#else - int 
dirty_background_ratio = 10; -+#endif - - /* - * dirty_background_bytes starts at 0 (disabled) so that it is a function of -@@ -88,7 +92,11 @@ int vm_highmem_is_dirtyable; - /* - * The generator of dirty data starts writeback at this percentage - */ -+#ifdef CONFIG_ZENIFY -+int vm_dirty_ratio = 50; -+#else - int vm_dirty_ratio = 20; -+#endif - - /* - * vm_dirty_bytes starts at 0 (disabled) so that it is a function of -diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig -index 80dad301361d..42b7fa7d01f8 100644 ---- a/net/ipv4/Kconfig -+++ b/net/ipv4/Kconfig -@@ -702,6 +702,9 @@ choice - config DEFAULT_VEGAS - bool "Vegas" if TCP_CONG_VEGAS=y - -+ config DEFAULT_YEAH -+ bool "YeAH" if TCP_CONG_YEAH=y -+ - config DEFAULT_VENO - bool "Veno" if TCP_CONG_VENO=y - -@@ -735,6 +738,7 @@ config DEFAULT_TCP_CONG - default "htcp" if DEFAULT_HTCP - default "hybla" if DEFAULT_HYBLA - default "vegas" if DEFAULT_VEGAS -+ default "yeah" if DEFAULT_YEAH - default "westwood" if DEFAULT_WESTWOOD - default "veno" if DEFAULT_VENO - default "reno" if DEFAULT_RENO - -From: Nick Desaulniers -Date: Mon, 24 Dec 2018 13:37:41 +0200 -Subject: include/linux/compiler*.h: define asm_volatile_goto - -asm_volatile_goto should also be defined for other compilers that -support asm goto. - -Fixes commit 815f0dd ("include/linux/compiler*.h: make compiler-*.h -mutually exclusive"). - -Signed-off-by: Nick Desaulniers -Signed-off-by: Miguel Ojeda - -diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h -index ba814f1..e77eeb0 100644 ---- a/include/linux/compiler_types.h -+++ b/include/linux/compiler_types.h -@@ -188,6 +188,10 @@ struct ftrace_likely_data { - #define asm_volatile_goto(x...) asm goto(x) - #endif - -+#ifndef asm_volatile_goto -+#define asm_volatile_goto(x...) asm goto(x) -+#endif -+ - /* Are two types/vars the same type (ignoring qualifiers)? 
*/ - #define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) - -From: Andy Lavr -Date: Mon, 24 Dec 2018 14:57:47 +0200 -Subject: avl: Use [defer+madvise] as default khugepaged defrag strategy - -For some reason, the default strategy to respond to THP fault fallbacks -is still just madvise, meaning stall if the program wants transparent -hugepages, but don't trigger a background reclaim / compaction if THP -begins to fail allocations. This creates a snowball affect where we -still use the THP code paths, but we almost always fail once a system -has been active and busy for a while. - -The option "defer" was created for interactive systems where THP can -still improve performance. If we have to fallback to a regular page due -to an allocation failure or anything else, we will trigger a background -reclaim and compaction so future THP attempts succeed and previous -attempts eventually have their smaller pages combined without stalling -running applications. - -We still want madvise to stall applications that explicitely want THP, -so defer+madvise _does_ make a ton of sense. Make it the default for -interactive systems, especially if the kernel maintainer left -transparent hugepages on "always". 
- -Reasoning and details in the original patch: -https://lwn.net/Articles/711248/ - -Signed-off-by: Andy Lavr - -diff --git a/mm/huge_memory.c b/mm/huge_memory.c -index e84a10b..21d62b7 100644 ---- a/mm/huge_memory.c -+++ b/mm/huge_memory.c -@@ -53,7 +53,11 @@ unsigned long transparent_hugepage_flags __read_mostly = - #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE - (1< -Date: Mon, 25 Nov 2019 15:13:06 -0300 -Subject: [PATCH] elevator: set default scheduler to bfq for blk-mq - -Signed-off-by: Alexandre Frade ---- - block/elevator.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/block/elevator.c b/block/elevator.c -index 076ba7308e65..81f89095aa77 100644 ---- a/block/elevator.c -+++ b/block/elevator.c -@@ -623,15 +623,15 @@ static inline bool elv_support_iosched(struct request_queue *q) - } - - /* -- * For single queue devices, default to using mq-deadline. If we have multiple -- * queues or mq-deadline is not available, default to "none". -+ * For single queue devices, default to using bfq. If we have multiple -+ * queues or bfq is not available, default to "none". 
- */ - static struct elevator_type *elevator_get_default(struct request_queue *q) - { - if (q->nr_hw_queues != 1) - return NULL; - -- return elevator_get(q, "mq-deadline", false); -+ return elevator_get(q, "bfq", false); - } - - /* -From c3ec05777c46e19a8a26d0fc4ca0c0db8a19de97 Mon Sep 17 00:00:00 2001 -From: Alexandre Frade -Date: Fri, 10 May 2019 16:45:59 -0300 -Subject: [PATCH] block: set rq_affinity = 2 for full multithreading I/O - requests - -Signed-off-by: Alexandre Frade ---- - include/linux/blkdev.h | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - -diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h -index f3ea78b0c91c..4dbacc6b073b 100644 ---- a/include/linux/blkdev.h -+++ b/include/linux/blkdev.h -@@ -621,7 +621,8 @@ struct request_queue { - #define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */ - - #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ -- (1 << QUEUE_FLAG_SAME_COMP)) -+ (1 << QUEUE_FLAG_SAME_COMP) | \ -+ (1 << QUEUE_FLAG_SAME_FORCE)) - - void blk_queue_flag_set(unsigned int flag, struct request_queue *q); - void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); -From 8171d33d0b84a953649863538fdbe4c26c035e4f Mon Sep 17 00:00:00 2001 -From: Alexandre Frade -Date: Fri, 10 May 2019 14:32:50 -0300 -Subject: [PATCH] mm: set 2 megabytes for address_space-level file read-ahead - pages size - -Signed-off-by: Alexandre Frade ---- - include/linux/mm.h | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/include/linux/mm.h b/include/linux/mm.h -index a2adf95b3f9c..e804d9f7583a 100644 ---- a/include/linux/mm.h -+++ b/include/linux/mm.h -@@ -2416,7 +2416,7 @@ int __must_check write_one_page(struct page *page); - void task_dirty_inc(struct task_struct *tsk); - - /* readahead.c */ --#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) -+#define VM_READAHEAD_PAGES (SZ_2M / PAGE_SIZE) - - int force_page_cache_readahead(struct address_space *mapping, struct file *filp, - pgoff_t offset, 
unsigned long nr_to_read); -From de7119e3db9fdb4c704355854a02a7e9fad931d4 Mon Sep 17 00:00:00 2001 -From: Steven Barrett -Date: Wed, 15 Jan 2020 20:43:56 -0600 -Subject: [PATCH] ZEN: intel-pstate: Implement "enable" parameter - -If intel-pstate is compiled into the kernel, it will preempt the loading -of acpi-cpufreq so you can take advantage of hardware p-states without -any friction. - -However, intel-pstate is not completely superior to cpufreq's ondemand -for one reason. There's no concept of an up_threshold property. - -In ondemand, up_threshold essentially reduces the maximum utilization to -compare against, allowing you to hit max frequencies and turbo boost -from a much lower core utilization. - -With intel-pstate, you have the concept of minimum and maximum -performance, but no tunable that lets you define, maximum frequency -means 50% core utilization. For just this oversight, there's reasons -you may want ondemand. - -Lets support setting "enable" in kernel boot parameters. This lets -kernel maintainers include "intel_pstate=disable" statically in the -static boot parameters, but let users of the kernel override this -selection. 
---- - Documentation/admin-guide/kernel-parameters.txt | 3 +++ - drivers/cpufreq/intel_pstate.c | 2 ++ - 2 files changed, 5 insertions(+) - -diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index ade4e6ec23e03..0b613370d28d8 100644 ---- a/Documentation/admin-guide/kernel-parameters.txt -+++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -1765,6 +1765,9 @@ - disable - Do not enable intel_pstate as the default - scaling driver for the supported processors -+ enable -+ Enable intel_pstate in-case "disable" was passed -+ previously in the kernel boot parameters - passive - Use intel_pstate as a scaling driver, but configure it - to work with generic cpufreq governors (instead of -diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c -index d2fa3e9ccd97c..bd10cb02fc0ff 100644 ---- a/drivers/cpufreq/intel_pstate.c -+++ b/drivers/cpufreq/intel_pstate.c -@@ -2826,6 +2826,8 @@ static int __init intel_pstate_setup(char *str) - pr_info("HWP disabled\n"); - no_hwp = 1; - } -+ if (!strcmp(str, "enable")) -+ no_load = 0; - if (!strcmp(str, "force")) - force_load = 1; - if (!strcmp(str, "hwp_only")) diff --git a/linux57-tkg/linux57-tkg-patches/0003-glitched-cfs.patch b/linux57-tkg/linux57-tkg-patches/0003-glitched-cfs.patch deleted file mode 100644 index 06b7f02..0000000 --- a/linux57-tkg/linux57-tkg-patches/0003-glitched-cfs.patch +++ /dev/null @@ -1,72 +0,0 @@ -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 2a202a846757..1d9c7ed79b11 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -4,7 +4,7 @@ - - choice - prompt "Timer frequency" -- default HZ_250 -+ default HZ_500 - help - Allows the configuration of the timer frequency. It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -39,6 +39,13 @@ choice - on SMP and NUMA systems and exactly dividing by both PAL and - NTSC frame rates for video and multimedia work. 
- -+ config HZ_500 -+ bool "500 HZ" -+ help -+ 500 Hz is a balanced timer frequency. Provides fast interactivity -+ on desktops with great smoothness without increasing CPU power -+ consumption and sacrificing the battery life on laptops. -+ - config HZ_1000 - bool "1000 HZ" - help -@@ -52,6 +59,7 @@ config HZ - default 100 if HZ_100 - default 250 if HZ_250 - default 300 if HZ_300 -+ default 500 if HZ_500 - default 1000 if HZ_1000 - - config SCHED_HRTICK - -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 2a202a846757..1d9c7ed79b11 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -4,7 +4,7 @@ - - choice - prompt "Timer frequency" -- default HZ_500 -+ default HZ_750 - help - Allows the configuration of the timer frequency. It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -46,6 +46,13 @@ choice - on desktops with great smoothness without increasing CPU power - consumption and sacrificing the battery life on laptops. - -+ config HZ_750 -+ bool "750 HZ" -+ help -+ 750 Hz is a good timer frequency for desktops. Provides fast -+ interactivity with great smoothness without sacrificing too -+ much throughput. 
-+ - config HZ_1000 - bool "1000 HZ" - help -@@ -60,6 +67,7 @@ config HZ - default 250 if HZ_250 - default 300 if HZ_300 - default 500 if HZ_500 -+ default 750 if HZ_750 - default 1000 if HZ_1000 - - config SCHED_HRTICK - diff --git a/linux57-tkg/linux57-tkg-patches/0004-5.7-ck1.patch b/linux57-tkg/linux57-tkg-patches/0004-5.7-ck1.patch deleted file mode 100644 index ee1d1c8..0000000 --- a/linux57-tkg/linux57-tkg-patches/0004-5.7-ck1.patch +++ /dev/null @@ -1,13147 +0,0 @@ -diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index 7bc83f3d9bdf..2f9e8cdf5fec 100644 ---- a/Documentation/admin-guide/kernel-parameters.txt -+++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -4429,6 +4429,14 @@ - Memory area to be used by remote processor image, - managed by CMA. - -+ rqshare= [X86] Select the MuQSS scheduler runqueue sharing type. -+ Format: -+ smt -- Share SMT (hyperthread) sibling runqueues -+ mc -- Share MC (multicore) sibling runqueues -+ smp -- Share SMP runqueues -+ none -- So not share any runqueues -+ Default value is mc -+ - rw [KNL] Mount root device read-write on boot - - S [KNL] Run init in single mode -diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst -index 0d427fd10941..5b3406a3d76f 100644 ---- a/Documentation/admin-guide/sysctl/kernel.rst -+++ b/Documentation/admin-guide/sysctl/kernel.rst -@@ -344,6 +344,16 @@ Controls whether the panic kmsg data should be reported to Hyper-V. - = ========================================================= - - -+iso_cpu: (MuQSS CPU scheduler only) -+=================================== -+ -+This sets the percentage cpu that the unprivileged SCHED_ISO tasks can -+run effectively at realtime priority, averaged over a rolling five -+seconds over the -whole- system, meaning all cpus. -+ -+Set to 70 (percent) by default. 
-+ -+ - kexec_load_disabled - =================== - -@@ -922,6 +932,20 @@ ROM/Flash boot loader. Maybe to tell it what to do after - rebooting. ??? - - -+rr_interval: (MuQSS CPU scheduler only) -+======================================= -+ -+This is the smallest duration that any cpu process scheduling unit -+will run for. Increasing this value can increase throughput of cpu -+bound tasks substantially but at the expense of increased latencies -+overall. Conversely decreasing it will decrease average and maximum -+latencies but at the expense of throughput. This value is in -+milliseconds and the default value chosen depends on the number of -+cpus available at scheduler initialisation with a minimum of 6. -+ -+Valid values are from 1-1000. -+ -+ - sched_energy_aware - ================== - -@@ -1230,3 +1254,13 @@ is 10 seconds. - - The softlockup threshold is (``2 * watchdog_thresh``). Setting this - tunable to zero will disable lockup detection altogether. -+ -+ -+yield_type: (MuQSS CPU scheduler only) -+====================================== -+ -+This determines what type of yield calls to sched_yield will perform. -+ -+ 0: No yield. -+ 1: Yield only to better priority/deadline tasks. (default) -+ 2: Expire timeslice and recalculate deadline. -diff --git a/Documentation/scheduler/sched-BFS.txt b/Documentation/scheduler/sched-BFS.txt -new file mode 100644 -index 000000000000..c0282002a079 ---- /dev/null -+++ b/Documentation/scheduler/sched-BFS.txt -@@ -0,0 +1,351 @@ -+BFS - The Brain Fuck Scheduler by Con Kolivas. -+ -+Goals. -+ -+The goal of the Brain Fuck Scheduler, referred to as BFS from here on, is to -+completely do away with the complex designs of the past for the cpu process -+scheduler and instead implement one that is very simple in basic design. 
-+The main focus of BFS is to achieve excellent desktop interactivity and -+responsiveness without heuristics and tuning knobs that are difficult to -+understand, impossible to model and predict the effect of, and when tuned to -+one workload cause massive detriment to another. -+ -+ -+Design summary. -+ -+BFS is best described as a single runqueue, O(n) lookup, earliest effective -+virtual deadline first design, loosely based on EEVDF (earliest eligible virtual -+deadline first) and my previous Staircase Deadline scheduler. Each component -+shall be described in order to understand the significance of, and reasoning for -+it. The codebase when the first stable version was released was approximately -+9000 lines less code than the existing mainline linux kernel scheduler (in -+2.6.31). This does not even take into account the removal of documentation and -+the cgroups code that is not used. -+ -+Design reasoning. -+ -+The single runqueue refers to the queued but not running processes for the -+entire system, regardless of the number of CPUs. The reason for going back to -+a single runqueue design is that once multiple runqueues are introduced, -+per-CPU or otherwise, there will be complex interactions as each runqueue will -+be responsible for the scheduling latency and fairness of the tasks only on its -+own runqueue, and to achieve fairness and low latency across multiple CPUs, any -+advantage in throughput of having CPU local tasks causes other disadvantages. -+This is due to requiring a very complex balancing system to at best achieve some -+semblance of fairness across CPUs and can only maintain relatively low latency -+for tasks bound to the same CPUs, not across them. To increase said fairness -+and latency across CPUs, the advantage of local runqueue locking, which makes -+for better scalability, is lost due to having to grab multiple locks. 
-+ -+A significant feature of BFS is that all accounting is done purely based on CPU -+used and nowhere is sleep time used in any way to determine entitlement or -+interactivity. Interactivity "estimators" that use some kind of sleep/run -+algorithm are doomed to fail to detect all interactive tasks, and to falsely tag -+tasks that aren't interactive as being so. The reason for this is that it is -+close to impossible to determine that when a task is sleeping, whether it is -+doing it voluntarily, as in a userspace application waiting for input in the -+form of a mouse click or otherwise, or involuntarily, because it is waiting for -+another thread, process, I/O, kernel activity or whatever. Thus, such an -+estimator will introduce corner cases, and more heuristics will be required to -+cope with those corner cases, introducing more corner cases and failed -+interactivity detection and so on. Interactivity in BFS is built into the design -+by virtue of the fact that tasks that are waking up have not used up their quota -+of CPU time, and have earlier effective deadlines, thereby making it very likely -+they will preempt any CPU bound task of equivalent nice level. See below for -+more information on the virtual deadline mechanism. Even if they do not preempt -+a running task, because the rr interval is guaranteed to have a bound upper -+limit on how long a task will wait for, it will be scheduled within a timeframe -+that will not cause visible interface jitter. -+ -+ -+Design details. -+ -+Task insertion. -+ -+BFS inserts tasks into each relevant queue as an O(1) insertion into a double -+linked list. On insertion, *every* running queue is checked to see if the newly -+queued task can run on any idle queue, or preempt the lowest running task on the -+system. This is how the cross-CPU scheduling of BFS achieves significantly lower -+latency per extra CPU the system has. 
In this case the lookup is, in the worst -+case scenario, O(n) where n is the number of CPUs on the system. -+ -+Data protection. -+ -+BFS has one single lock protecting the process local data of every task in the -+global queue. Thus every insertion, removal and modification of task data in the -+global runqueue needs to grab the global lock. However, once a task is taken by -+a CPU, the CPU has its own local data copy of the running process' accounting -+information which only that CPU accesses and modifies (such as during a -+timer tick) thus allowing the accounting data to be updated lockless. Once a -+CPU has taken a task to run, it removes it from the global queue. Thus the -+global queue only ever has, at most, -+ -+ (number of tasks requesting cpu time) - (number of logical CPUs) + 1 -+ -+tasks in the global queue. This value is relevant for the time taken to look up -+tasks during scheduling. This will increase if many tasks with CPU affinity set -+in their policy to limit which CPUs they're allowed to run on if they outnumber -+the number of CPUs. The +1 is because when rescheduling a task, the CPU's -+currently running task is put back on the queue. Lookup will be described after -+the virtual deadline mechanism is explained. -+ -+Virtual deadline. -+ -+The key to achieving low latency, scheduling fairness, and "nice level" -+distribution in BFS is entirely in the virtual deadline mechanism. The one -+tunable in BFS is the rr_interval, or "round robin interval". This is the -+maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy) -+tasks of the same nice level will be running for, or looking at it the other -+way around, the longest duration two tasks of the same nice level will be -+delayed for. When a task requests cpu time, it is given a quota (time_slice) -+equal to the rr_interval and a virtual deadline. 
The virtual deadline is -+offset from the current time in jiffies by this equation: -+ -+ jiffies + (prio_ratio * rr_interval) -+ -+The prio_ratio is determined as a ratio compared to the baseline of nice -20 -+and increases by 10% per nice level. The deadline is a virtual one only in that -+no guarantee is placed that a task will actually be scheduled by this time, but -+it is used to compare which task should go next. There are three components to -+how a task is next chosen. First is time_slice expiration. If a task runs out -+of its time_slice, it is descheduled, the time_slice is refilled, and the -+deadline reset to that formula above. Second is sleep, where a task no longer -+is requesting CPU for whatever reason. The time_slice and deadline are _not_ -+adjusted in this case and are just carried over for when the task is next -+scheduled. Third is preemption, and that is when a newly waking task is deemed -+higher priority than a currently running task on any cpu by virtue of the fact -+that it has an earlier virtual deadline than the currently running task. The -+earlier deadline is the key to which task is next chosen for the first and -+second cases. Once a task is descheduled, it is put back on the queue, and an -+O(n) lookup of all queued-but-not-running tasks is done to determine which has -+the earliest deadline and that task is chosen to receive CPU next. -+ -+The CPU proportion of different nice tasks works out to be approximately the -+ -+ (prio_ratio difference)^2 -+ -+The reason it is squared is that a task's deadline does not change while it is -+running unless it runs out of time_slice. Thus, even if the time actually -+passes the deadline of another task that is queued, it will not get CPU time -+unless the current running task deschedules, and the time "base" (jiffies) is -+constantly moving. -+ -+Task lookup. -+ -+BFS has 103 priority queues. 
100 of these are dedicated to the static priority -+of realtime tasks, and the remaining 3 are, in order of best to worst priority, -+SCHED_ISO (isochronous), SCHED_NORMAL, and SCHED_IDLEPRIO (idle priority -+scheduling). When a task of these priorities is queued, a bitmap of running -+priorities is set showing which of these priorities has tasks waiting for CPU -+time. When a CPU is made to reschedule, the lookup for the next task to get -+CPU time is performed in the following way: -+ -+First the bitmap is checked to see what static priority tasks are queued. If -+any realtime priorities are found, the corresponding queue is checked and the -+first task listed there is taken (provided CPU affinity is suitable) and lookup -+is complete. If the priority corresponds to a SCHED_ISO task, they are also -+taken in FIFO order (as they behave like SCHED_RR). If the priority corresponds -+to either SCHED_NORMAL or SCHED_IDLEPRIO, then the lookup becomes O(n). At this -+stage, every task in the runlist that corresponds to that priority is checked -+to see which has the earliest set deadline, and (provided it has suitable CPU -+affinity) it is taken off the runqueue and given the CPU. If a task has an -+expired deadline, it is taken and the rest of the lookup aborted (as they are -+chosen in FIFO order). -+ -+Thus, the lookup is O(n) in the worst case only, where n is as described -+earlier, as tasks may be chosen before the whole task list is looked over. -+ -+ -+Scalability. -+ -+The major limitations of BFS will be that of scalability, as the separate -+runqueue designs will have less lock contention as the number of CPUs rises. 
-+However they do not scale linearly even with separate runqueues as multiple -+runqueues will need to be locked concurrently on such designs to be able to -+achieve fair CPU balancing, to try and achieve some sort of nice-level fairness -+across CPUs, and to achieve low enough latency for tasks on a busy CPU when -+other CPUs would be more suited. BFS has the advantage that it requires no -+balancing algorithm whatsoever, as balancing occurs by proxy simply because -+all CPUs draw off the global runqueue, in priority and deadline order. Despite -+the fact that scalability is _not_ the prime concern of BFS, it both shows very -+good scalability to smaller numbers of CPUs and is likely a more scalable design -+at these numbers of CPUs. -+ -+It also has some very low overhead scalability features built into the design -+when it has been deemed their overhead is so marginal that they're worth adding. -+The first is the local copy of the running process' data to the CPU it's running -+on to allow that data to be updated lockless where possible. Then there is -+deference paid to the last CPU a task was running on, by trying that CPU first -+when looking for an idle CPU to use the next time it's scheduled. Finally there -+is the notion of cache locality beyond the last running CPU. The sched_domains -+information is used to determine the relative virtual "cache distance" that -+other CPUs have from the last CPU a task was running on. CPUs with shared -+caches, such as SMT siblings, or multicore CPUs with shared caches, are treated -+as cache local. CPUs without shared caches are treated as not cache local, and -+CPUs on different NUMA nodes are treated as very distant. This "relative cache -+distance" is used by modifying the virtual deadline value when doing lookups. -+Effectively, the deadline is unaltered between "cache local" CPUs, doubled for -+"cache distant" CPUs, and quadrupled for "very distant" CPUs. 
The reasoning -+behind the doubling of deadlines is as follows. The real cost of migrating a -+task from one CPU to another is entirely dependant on the cache footprint of -+the task, how cache intensive the task is, how long it's been running on that -+CPU to take up the bulk of its cache, how big the CPU cache is, how fast and -+how layered the CPU cache is, how fast a context switch is... and so on. In -+other words, it's close to random in the real world where we do more than just -+one sole workload. The only thing we can be sure of is that it's not free. So -+BFS uses the principle that an idle CPU is a wasted CPU and utilising idle CPUs -+is more important than cache locality, and cache locality only plays a part -+after that. Doubling the effective deadline is based on the premise that the -+"cache local" CPUs will tend to work on the same tasks up to double the number -+of cache local CPUs, and once the workload is beyond that amount, it is likely -+that none of the tasks are cache warm anywhere anyway. The quadrupling for NUMA -+is a value I pulled out of my arse. -+ -+When choosing an idle CPU for a waking task, the cache locality is determined -+according to where the task last ran and then idle CPUs are ranked from best -+to worst to choose the most suitable idle CPU based on cache locality, NUMA -+node locality and hyperthread sibling business. They are chosen in the -+following preference (if idle): -+ -+* Same core, idle or busy cache, idle threads -+* Other core, same cache, idle or busy cache, idle threads. -+* Same node, other CPU, idle cache, idle threads. -+* Same node, other CPU, busy cache, idle threads. -+* Same core, busy threads. -+* Other core, same cache, busy threads. -+* Same node, other CPU, busy threads. -+* Other node, other CPU, idle cache, idle threads. -+* Other node, other CPU, busy cache, idle threads. -+* Other node, other CPU, busy threads. 
-+ -+This shows the SMT or "hyperthread" awareness in the design as well which will -+choose a real idle core first before a logical SMT sibling which already has -+tasks on the physical CPU. -+ -+Early benchmarking of BFS suggested scalability dropped off at the 16 CPU mark. -+However this benchmarking was performed on an earlier design that was far less -+scalable than the current one so it's hard to know how scalable it is in terms -+of both CPUs (due to the global runqueue) and heavily loaded machines (due to -+O(n) lookup) at this stage. Note that in terms of scalability, the number of -+_logical_ CPUs matters, not the number of _physical_ CPUs. Thus, a dual (2x) -+quad core (4X) hyperthreaded (2X) machine is effectively a 16X. Newer benchmark -+results are very promising indeed, without needing to tweak any knobs, features -+or options. Benchmark contributions are most welcome. -+ -+ -+Features -+ -+As the initial prime target audience for BFS was the average desktop user, it -+was designed to not need tweaking, tuning or have features set to obtain benefit -+from it. Thus the number of knobs and features has been kept to an absolute -+minimum and should not require extra user input for the vast majority of cases. -+There are precisely 2 tunables, and 2 extra scheduling policies. The rr_interval -+and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO policies. In addition -+to this, BFS also uses sub-tick accounting. What BFS does _not_ now feature is -+support for CGROUPS. The average user should neither need to know what these -+are, nor should they need to be using them to have good desktop behaviour. -+ -+rr_interval -+ -+There is only one "scheduler" tunable, the round robin interval. This can be -+accessed in -+ -+ /proc/sys/kernel/rr_interval -+ -+The value is in milliseconds, and the default value is set to 6 on a -+uniprocessor machine, and automatically set to a progressively higher value on -+multiprocessor machines. 
The reasoning behind increasing the value on more CPUs -+is that the effective latency is decreased by virtue of there being more CPUs on -+BFS (for reasons explained above), and increasing the value allows for less -+cache contention and more throughput. Valid values are from 1 to 1000. -+Decreasing the value will decrease latencies at the cost of decreasing -+throughput, while increasing it will improve throughput, but at the cost of -+worsening latencies. The accuracy of the rr interval is limited by HZ resolution -+of the kernel configuration. Thus, the worst case latencies are usually slightly -+higher than this actual value. The default value of 6 is not an arbitrary one. -+It is based on the fact that humans can detect jitter at approximately 7ms, so -+aiming for much lower latencies is pointless under most circumstances. It is -+worth noting this fact when comparing the latency performance of BFS to other -+schedulers. Worst case latencies being higher than 7ms are far worse than -+average latencies not being in the microsecond range. -+ -+Isochronous scheduling. -+ -+Isochronous scheduling is a unique scheduling policy designed to provide -+near-real-time performance to unprivileged (ie non-root) users without the -+ability to starve the machine indefinitely. Isochronous tasks (which means -+"same time") are set using, for example, the schedtool application like so: -+ -+ schedtool -I -e amarok -+ -+This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works -+is that it has a priority level between true realtime tasks and SCHED_NORMAL -+which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie, -+if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval -+rate). However if ISO tasks run for more than a tunable finite amount of time, -+they are then demoted back to SCHED_NORMAL scheduling.
This finite amount of -+time is the percentage of _total CPU_ available across the machine, configurable -+as a percentage in the following "resource handling" tunable (as opposed to a -+scheduler tunable): -+ -+ /proc/sys/kernel/iso_cpu -+ -+and is set to 70% by default. It is calculated over a rolling 5 second average. -+Because it is the total CPU available, it means that on a multi CPU machine, it -+is possible to have an ISO task running as realtime scheduling indefinitely on -+just one CPU, as the other CPUs will be available. Setting this to 100 is the -+equivalent of giving all users SCHED_RR access and setting it to 0 removes the -+ability to run any pseudo-realtime tasks. -+ -+A feature of BFS is that it detects when an application tries to obtain a -+realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the -+appropriate privileges to use those policies. When it detects this, it will -+give the task SCHED_ISO policy instead. Thus it is transparent to the user. -+Because some applications constantly set their policy as well as their nice -+level, there is potential for them to undo the override specified by the user -+on the command line of setting the policy to SCHED_ISO. To counter this, once -+a task has been set to SCHED_ISO policy, it needs superuser privileges to set -+it back to SCHED_NORMAL. This will ensure the task remains ISO and all child -+processes and threads will also inherit the ISO policy. -+ -+Idleprio scheduling. -+ -+Idleprio scheduling is a scheduling policy designed to give out CPU to a task -+_only_ when the CPU would be otherwise idle. The idea behind this is to allow -+ultra low priority tasks to be run in the background that have virtually no -+effect on the foreground tasks. This is ideally suited to distributed computing -+clients (like setiathome, folding, mprime etc) but can also be used to start -+a video encode or so on without any slowdown of other tasks.
To avoid this -+policy from grabbing shared resources and holding them indefinitely, if it -+detects a state where the task is waiting on I/O, the machine is about to -+suspend to ram and so on, it will transiently schedule them as SCHED_NORMAL. As -+per the Isochronous task management, once a task has been scheduled as IDLEPRIO, -+it cannot be put back to SCHED_NORMAL without superuser privileges. Tasks can -+be set to start as SCHED_IDLEPRIO with the schedtool command like so: -+ -+ schedtool -D -e ./mprime -+ -+Subtick accounting. -+ -+It is surprisingly difficult to get accurate CPU accounting, and in many cases, -+the accounting is done by simply determining what is happening at the precise -+moment a timer tick fires off. This becomes increasingly inaccurate as the -+timer tick frequency (HZ) is lowered. It is possible to create an application -+which uses almost 100% CPU, yet by being descheduled at the right time, records -+zero CPU usage. While the main problem with this is that there are possible -+security implications, it is also difficult to determine how much CPU a task -+really does use. BFS tries to use the sub-tick accounting from the TSC clock, -+where possible, to determine real CPU usage. This is not entirely reliable, but -+is far more likely to produce accurate CPU usage data than the existing designs -+and will not show tasks as consuming no CPU usage when they actually are. Thus, -+the amount of CPU reported as being used by BFS will more accurately represent -+how much CPU the task itself is using (as is shown for example by the 'time' -+application), so the reported values may be quite different to other schedulers. -+Values reported as the 'load' are more prone to problems with this design, but -+per process values are closer to real usage. 
When comparing throughput of BFS -+to other designs, it is important to compare the actual completed work in terms -+of total wall clock time taken and total work done, rather than the reported -+"cpu usage". -+ -+ -+Con Kolivas Fri Aug 27 2010 -diff --git a/Documentation/scheduler/sched-MuQSS.txt b/Documentation/scheduler/sched-MuQSS.txt -new file mode 100644 -index 000000000000..ae28b85c9995 ---- /dev/null -+++ b/Documentation/scheduler/sched-MuQSS.txt -@@ -0,0 +1,373 @@ -+MuQSS - The Multiple Queue Skiplist Scheduler by Con Kolivas. -+ -+MuQSS is a per-cpu runqueue variant of the original BFS scheduler with -+one 8 level skiplist per runqueue, and fine grained locking for much more -+scalability. -+ -+ -+Goals. -+ -+The goal of the Multiple Queue Skiplist Scheduler, referred to as MuQSS from -+here on (pronounced mux) is to completely do away with the complex designs of -+the past for the cpu process scheduler and instead implement one that is very -+simple in basic design. The main focus of MuQSS is to achieve excellent desktop -+interactivity and responsiveness without heuristics and tuning knobs that are -+difficult to understand, impossible to model and predict the effect of, and when -+tuned to one workload cause massive detriment to another, while still being -+scalable to many CPUs and processes. -+ -+ -+Design summary. -+ -+MuQSS is best described as per-cpu multiple runqueue, O(log n) insertion, O(1) -+lookup, earliest effective virtual deadline first tickless design, loosely based -+on EEVDF (earliest eligible virtual deadline first) and my previous Staircase -+Deadline scheduler, and evolved from the single runqueue O(n) BFS scheduler. -+Each component shall be described in order to understand the significance of, -+and reasoning for it. -+ -+ -+Design reasoning. 
-+ -+In BFS, the use of a single runqueue across all CPUs meant that each CPU would -+need to scan the entire runqueue looking for the process with the earliest -+deadline and schedule that next, regardless of which CPU it originally came -+from. This made BFS deterministic with respect to latency and provided -+guaranteed latencies dependent on number of processes and CPUs. The single -+runqueue, however, meant that all CPUs would compete for the single lock -+protecting it, which would lead to increasing lock contention as the number of -+CPUs rose and appeared to limit scalability of common workloads beyond 16 -+logical CPUs. Additionally, the O(n) lookup of the runqueue list obviously -+increased overhead proportionate to the number of queued processes and led to -+cache thrashing while iterating over the linked list. -+ -+MuQSS is an evolution of BFS, designed to maintain the same scheduling -+decision mechanism and be virtually deterministic without relying on the -+constrained design of the single runqueue by splitting out the single runqueue -+to be per-CPU and use skiplists instead of linked lists. -+ -+The original reason for going back to a single runqueue design for BFS was that -+once multiple runqueues are introduced, per-CPU or otherwise, there will be -+complex interactions as each runqueue will be responsible for the scheduling -+latency and fairness of the tasks only on its own runqueue, and to achieve -+fairness and low latency across multiple CPUs, any advantage in throughput of -+having CPU local tasks causes other disadvantages. This is due to requiring a -+very complex balancing system to at best achieve some semblance of fairness -+across CPUs and can only maintain relatively low latency for tasks bound to the -+same CPUs, not across them. To increase said fairness and latency across CPUs, -+the advantage of local runqueue locking, which makes for better scalability, is -+lost due to having to grab multiple locks.
-+ -+MuQSS works around the problems inherent in multiple runqueue designs by -+making its skip lists priority ordered and through novel use of lockless -+examination of each other runqueue it can decide if it should take the earliest -+deadline task from another runqueue for latency reasons, or for CPU balancing -+reasons. It still does not have a balancing system, choosing to allow the -+next task scheduling decision and task wakeup CPU choice to allow balancing to -+happen by virtue of its choices. -+ -+As a further evolution of the design, MuQSS normally configures sharing of -+runqueues in a logical fashion for when CPU resources are shared for improved -+latency and throughput. By default it shares runqueues and locks between -+multicore siblings. Optionally it can be configured to run with sharing of -+SMT siblings only, all SMP packages or no sharing at all. Additionally it can -+be selected at boot time. -+ -+ -+Design details. -+ -+Custom skip list implementation: -+ -+To avoid the overhead of building up and tearing down skip list structures, -+the variant used by MuQSS has a number of optimisations making it specific for -+its use case in the scheduler. It uses static arrays of 8 'levels' instead of -+building up and tearing down structures dynamically. This makes each runqueue -+only scale O(log N) up to 64k tasks. However as there is one runqueue per CPU -+it means that it scales O(log N) up to 64k x number of logical CPUs which is -+far beyond the realistic task limits each CPU could handle. By being 8 levels -+it also makes the array exactly one cacheline in size. Additionally, each -+skip list node is bidirectional making insertion and removal amortised O(1), -+being O(k) where k is 1-8. Uniquely, we are only ever interested in the very -+first entry in each list at all times with MuQSS, so there is never a need to -+do a search and thus look up is always O(1). 
In interactive mode, the queues -+will be searched beyond their first entry if the first task is not suitable -+for affinity or SMT nice reasons. -+ -+Task insertion: -+ -+MuQSS inserts tasks into a per CPU runqueue as an O(log N) insertion into -+a custom skip list as described above (based on the original design by William -+Pugh). Insertion is ordered in such a way that there is never a need to do a -+search by ordering tasks according to static priority primarily, and then -+virtual deadline at the time of insertion. -+ -+Niffies: -+ -+Niffies are a monotonic forward moving timer not unlike the "jiffies" but are -+of nanosecond resolution. Niffies are calculated per-runqueue from the high -+resolution TSC timers, and in order to maintain fairness are synchronised -+between CPUs whenever both runqueues are locked concurrently. -+ -+Virtual deadline: -+ -+The key to achieving low latency, scheduling fairness, and "nice level" -+distribution in MuQSS is entirely in the virtual deadline mechanism. The one -+tunable in MuQSS is the rr_interval, or "round robin interval". This is the -+maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy) -+tasks of the same nice level will be running for, or looking at it the other -+way around, the longest duration two tasks of the same nice level will be -+delayed for. When a task requests cpu time, it is given a quota (time_slice) -+equal to the rr_interval and a virtual deadline. The virtual deadline is -+offset from the current time in niffies by this equation: -+ -+ niffies + (prio_ratio * rr_interval) -+ -+The prio_ratio is determined as a ratio compared to the baseline of nice -20 -+and increases by 10% per nice level. The deadline is a virtual one only in that -+no guarantee is placed that a task will actually be scheduled by this time, but -+it is used to compare which task should go next. There are three components to -+how a task is next chosen. First is time_slice expiration. 
If a task runs out -+of its time_slice, it is descheduled, the time_slice is refilled, and the -+deadline reset to that formula above. Second is sleep, where a task no longer -+is requesting CPU for whatever reason. The time_slice and deadline are _not_ -+adjusted in this case and are just carried over for when the task is next -+scheduled. Third is preemption, and that is when a newly waking task is deemed -+higher priority than a currently running task on any cpu by virtue of the fact -+that it has an earlier virtual deadline than the currently running task. The -+earlier deadline is the key to which task is next chosen for the first and -+second cases. -+ -+The CPU proportion of different nice tasks works out to be approximately the -+ -+ (prio_ratio difference)^2 -+ -+The reason it is squared is that a task's deadline does not change while it is -+running unless it runs out of time_slice. Thus, even if the time actually -+passes the deadline of another task that is queued, it will not get CPU time -+unless the current running task deschedules, and the time "base" (niffies) is -+constantly moving. -+ -+Task lookup: -+ -+As tasks are already pre-ordered according to anticipated scheduling order in -+the skip lists, lookup for the next suitable task per-runqueue is always a -+matter of simply selecting the first task in the 0th level skip list entry. -+In order to maintain optimal latency and fairness across CPUs, MuQSS does a -+novel examination of every other runqueue in cache locality order, choosing the -+best task across all runqueues. This provides near-determinism of how long any -+task across the entire system may wait before receiving CPU time. The other -+runqueues are first examined lockless and then trylocked to minimise the -+potential lock contention if they are likely to have a suitable better task. -+Each other runqueue lock is only held for as long as it takes to examine the -+entry for suitability.
In "interactive" mode, the default setting, MuQSS will -+look for the best deadline task across all CPUs, while in !interactive mode, -+it will only select a better deadline task from another CPU if it is more -+heavily laden than the current one. -+ -+Lookup is therefore O(k) where k is number of CPUs. -+ -+ -+Latency. -+ -+Through the use of virtual deadlines to govern the scheduling order of normal -+tasks, queue-to-activation latency per runqueue is guaranteed to be bound by -+the rr_interval tunable which is set to 6ms by default. This means that the -+longest a CPU bound task will wait for more CPU is proportional to the number -+of running tasks and in the common case of 0-2 running tasks per CPU, will be -+under the 7ms threshold for human perception of jitter. Additionally, as newly -+woken tasks will have an early deadline from their previous runtime, the very -+tasks that are usually latency sensitive will have the shortest interval for -+activation, usually preempting any existing CPU bound tasks. -+ -+Tickless expiry: -+ -+A feature of MuQSS is that it is not tied to the resolution of the chosen tick -+rate in Hz, instead depending entirely on the high resolution timers where -+possible for sub-millisecond accuracy on timeouts regardless of the underlying -+tick rate. This allows MuQSS to be run with the low overhead of low Hz rates -+such as 100 by default, benefiting from the improved throughput and lower -+power usage it provides. Another advantage of this approach is that in -+combination with the Full No HZ option, which disables ticks on running task -+CPUs instead of just idle CPUs, the tick can be disabled at all times -+regardless of how many tasks are running instead of being limited to just one -+running task. Note that this option is NOT recommended for regular desktop -+users. -+ -+ -+Scalability and balancing.
-+ -+Unlike traditional approaches where balancing is a combination of CPU selection -+at task wakeup and intermittent balancing based on a vast array of rules set -+according to architecture, busyness calculations and special case management, -+MuQSS indirectly balances on the fly at task wakeup and next task selection. -+During initialisation, MuQSS creates a cache coherency ordered list of CPUs for -+each logical CPU and uses this to aid task/CPU selection when CPUs are busy. -+Additionally it selects any idle CPUs, if they are available, at any time over -+busy CPUs according to the following preference: -+ -+ * Same thread, idle or busy cache, idle or busy threads -+ * Other core, same cache, idle or busy cache, idle threads. -+ * Same node, other CPU, idle cache, idle threads. -+ * Same node, other CPU, busy cache, idle threads. -+ * Other core, same cache, busy threads. -+ * Same node, other CPU, busy threads. -+ * Other node, other CPU, idle cache, idle threads. -+ * Other node, other CPU, busy cache, idle threads. -+ * Other node, other CPU, busy threads. -+ -+Mux is therefore SMT, MC and Numa aware without the need for extra -+intermittent balancing to maintain CPUs busy and make the most of cache -+coherency. -+ -+ -+Features -+ -+As the initial prime target audience for MuQSS was the average desktop user, it -+was designed to not need tweaking, tuning or have features set to obtain benefit -+from it. Thus the number of knobs and features has been kept to an absolute -+minimum and should not require extra user input for the vast majority of cases. -+There are 3 optional tunables, and 2 extra scheduling policies. The rr_interval, -+interactive, and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO -+policies. In addition to this, MuQSS also uses sub-tick accounting. What MuQSS -+does _not_ now feature is support for CGROUPS. 
The average user should neither -+need to know what these are, nor should they need to be using them to have good -+desktop behaviour. However since some applications refuse to work without -+cgroups, one can enable them with MuQSS as a stub and the filesystem will be -+created which will allow the applications to work. -+ -+rr_interval: -+ -+ /proc/sys/kernel/rr_interval -+ -+The value is in milliseconds, and the default value is set to 6. Valid values -+are from 1 to 1000. Decreasing the value will decrease latencies at the cost of -+decreasing throughput, while increasing it will improve throughput, but at the -+cost of worsening latencies. It is based on the fact that humans can detect -+jitter at approximately 7ms, so aiming for much lower latencies is pointless -+under most circumstances. It is worth noting this fact when comparing the -+latency performance of MuQSS to other schedulers. Worst case latencies being -+higher than 7ms are far worse than average latencies not being in the -+microsecond range. -+ -+interactive: -+ -+ /proc/sys/kernel/interactive -+ -+The value is a simple boolean of 1 for on and 0 for off and is set to on by -+default. Disabling this will disable the near-determinism of MuQSS when -+selecting the next task by not examining all CPUs for the earliest deadline -+task, or which CPU to wake to, instead prioritising CPU balancing for improved -+throughput. Latency will still be bound by rr_interval, but on a per-CPU basis -+instead of across the whole system. -+ -+Runqueue sharing. -+ -+By default MuQSS chooses to share runqueue resources (specifically the skip -+list and locking) between multicore siblings. It is configurable at build time -+to select between None, SMT, MC and SMP, corresponding to no sharing, sharing -+only between simultaneous multithreading siblings, multicore siblings, or -+symmetric multiprocessing physical packages. Additionally it can be set at -+boot time with the use of the rqshare parameter.
The reason for configurability -+is that some architectures have CPUs with many multicore siblings (>= 16) -+where it may be detrimental to throughput to share runqueues and another -+sharing option may be desirable. Additionally, more sharing than usual can -+improve latency on a system-wide level at the expense of throughput if desired. -+ -+The options are: -+none, smt, mc, smp -+ -+eg: -+ rqshare=mc -+ -+Isochronous scheduling: -+ -+Isochronous scheduling is a unique scheduling policy designed to provide -+near-real-time performance to unprivileged (ie non-root) users without the -+ability to starve the machine indefinitely. Isochronous tasks (which means -+"same time") are set using, for example, the schedtool application like so: -+ -+ schedtool -I -e amarok -+ -+This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works -+is that it has a priority level between true realtime tasks and SCHED_NORMAL -+which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie, -+if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval -+rate). However if ISO tasks run for more than a tunable finite amount of time, -+they are then demoted back to SCHED_NORMAL scheduling. This finite amount of -+time is the percentage of CPU available per CPU, configurable as a percentage in -+the following "resource handling" tunable (as opposed to a scheduler tunable): -+ -+iso_cpu: -+ -+ /proc/sys/kernel/iso_cpu -+ -+and is set to 70% by default. It is calculated over a rolling 5 second average. -+Because it is the total CPU available, it means that on a multi CPU machine, it -+is possible to have an ISO task running as realtime scheduling indefinitely on -+just one CPU, as the other CPUs will be available. Setting this to 100 is the -+equivalent of giving all users SCHED_RR access and setting it to 0 removes the -+ability to run any pseudo-realtime tasks.
-+ -+A feature of MuQSS is that it detects when an application tries to obtain a -+realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the -+appropriate privileges to use those policies. When it detects this, it will -+give the task SCHED_ISO policy instead. Thus it is transparent to the user. -+ -+ -+Idleprio scheduling: -+ -+Idleprio scheduling is a scheduling policy designed to give out CPU to a task -+_only_ when the CPU would be otherwise idle. The idea behind this is to allow -+ultra low priority tasks to be run in the background that have virtually no -+effect on the foreground tasks. This is ideally suited to distributed computing -+clients (like setiathome, folding, mprime etc) but can also be used to start a -+video encode or so on without any slowdown of other tasks. To avoid this policy -+from grabbing shared resources and holding them indefinitely, if it detects a -+state where the task is waiting on I/O, the machine is about to suspend to ram -+and so on, it will transiently schedule them as SCHED_NORMAL. Once a task has -+been scheduled as IDLEPRIO, it cannot be put back to SCHED_NORMAL without -+superuser privileges since it is effectively a lower scheduling policy. Tasks -+can be set to start as SCHED_IDLEPRIO with the schedtool command like so: -+ -+schedtool -D -e ./mprime -+ -+Subtick accounting: -+ -+It is surprisingly difficult to get accurate CPU accounting, and in many cases, -+the accounting is done by simply determining what is happening at the precise -+moment a timer tick fires off. This becomes increasingly inaccurate as the timer -+tick frequency (HZ) is lowered. It is possible to create an application which -+uses almost 100% CPU, yet by being descheduled at the right time, records zero -+CPU usage. While the main problem with this is that there are possible security -+implications, it is also difficult to determine how much CPU a task really does -+use. 
Mux uses sub-tick accounting from the TSC clock to determine real CPU -+usage. Thus, the amount of CPU reported as being used by MuQSS will more -+accurately represent how much CPU the task itself is using (as is shown for -+example by the 'time' application), so the reported values may be quite -+different to other schedulers. When comparing throughput of MuQSS to other -+designs, it is important to compare the actual completed work in terms of total -+wall clock time taken and total work done, rather than the reported "cpu usage". -+ -+Symmetric MultiThreading (SMT) aware nice: -+ -+SMT, a.k.a. hyperthreading, is a very common feature on modern CPUs. While the -+logical CPU count rises by adding thread units to each CPU core, allowing more -+than one task to be run simultaneously on the same core, the disadvantage of it -+is that the CPU power is shared between the tasks, not summating to the power -+of two CPUs. The practical upshot of this is that two tasks running on -+separate threads of the same core run significantly slower than if they had one -+core each to run on. While smart CPU selection allows each task to have a core -+to itself whenever available (as is done on MuQSS), it cannot offset the -+slowdown that occurs when the cores are all loaded and only a thread is left. -+Most of the time this is harmless as the CPU is effectively overloaded at this -+point and the extra thread is of benefit. However when running a niced task in -+the presence of an un-niced task (say nice 19 v nice 0), the nice task gets -+precisely the same amount of CPU power as the unniced one. MuQSS has an -+optional configuration feature known as SMT-NICE which selectively idles the -+secondary niced thread for a period proportional to the nice difference, -+allowing CPU distribution according to nice level to be maintained, at the -+expense of a small amount of extra overhead. If this is configured in on a -+machine without SMT threads, the overhead is minimal. 
-+ -+ -+Con Kolivas Sat, 29th October 2016 -diff --git a/Makefile b/Makefile -index b668725a2a62..73a4381d3ea9 100644 ---- a/Makefile -+++ b/Makefile -@@ -15,6 +15,10 @@ NAME = Kleptomaniac Octopus - PHONY := _all - _all: - -+CKVERSION = -ck1 -+CKNAME = MuQSS Powered -+EXTRAVERSION := $(EXTRAVERSION)$(CKVERSION) -+ - # We are using a recursive build, so we need to do a little thinking - # to get the ordering right. - # -diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig -index ef179033a7c2..14b576a531ad 100644 ---- a/arch/alpha/Kconfig -+++ b/arch/alpha/Kconfig -@@ -665,6 +665,8 @@ config HZ - default 1200 if HZ_1200 - default 1024 - -+source "kernel/Kconfig.MuQSS" -+ - config SRM_ENV - tristate "SRM environment through procfs" - depends on PROC_FS -diff --git a/arch/arc/configs/tb10x_defconfig b/arch/arc/configs/tb10x_defconfig -index a12656ec0072..b46b6ddc7636 100644 ---- a/arch/arc/configs/tb10x_defconfig -+++ b/arch/arc/configs/tb10x_defconfig -@@ -29,7 +29,7 @@ CONFIG_ARC_PLAT_TB10X=y - CONFIG_ARC_CACHE_LINE_SHIFT=5 - CONFIG_HZ=250 - CONFIG_ARC_BUILTIN_DTB_NAME="abilis_tb100_dvk" --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - # CONFIG_COMPACTION is not set - CONFIG_NET=y - CONFIG_PACKET=y -diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig -index c77c93c485a0..c16a89549ff2 100644 ---- a/arch/arm/Kconfig -+++ b/arch/arm/Kconfig -@@ -1237,6 +1237,8 @@ config SCHED_SMT - MultiThreading at a cost of slightly increased overhead in some - places. If unsure say N here. 
- -+source "kernel/Kconfig.MuQSS" -+ - config HAVE_ARM_SCU - bool - help -diff --git a/arch/arm/configs/bcm2835_defconfig b/arch/arm/configs/bcm2835_defconfig -index 8e7a3ed2a4df..8a1ec6d2c3fb 100644 ---- a/arch/arm/configs/bcm2835_defconfig -+++ b/arch/arm/configs/bcm2835_defconfig -@@ -29,7 +29,7 @@ CONFIG_MODULE_UNLOAD=y - CONFIG_ARCH_MULTI_V6=y - CONFIG_ARCH_BCM=y - CONFIG_ARCH_BCM2835=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_AEABI=y - CONFIG_KSM=y - CONFIG_CLEANCACHE=y -diff --git a/arch/arm/configs/imx_v6_v7_defconfig b/arch/arm/configs/imx_v6_v7_defconfig -index 5a20d12d62bd..fb76e6ff18a0 100644 ---- a/arch/arm/configs/imx_v6_v7_defconfig -+++ b/arch/arm/configs/imx_v6_v7_defconfig -@@ -45,6 +45,7 @@ CONFIG_PCI_MSI=y - CONFIG_PCI_IMX6=y - CONFIG_SMP=y - CONFIG_ARM_PSCI=y -+CONFIG_PREEMPT=y - CONFIG_HIGHMEM=y - CONFIG_FORCE_MAX_ZONEORDER=14 - CONFIG_CMDLINE="noinitrd console=ttymxc0,115200" -diff --git a/arch/arm/configs/mps2_defconfig b/arch/arm/configs/mps2_defconfig -index 1d923dbb9928..9c1931f1fafd 100644 ---- a/arch/arm/configs/mps2_defconfig -+++ b/arch/arm/configs/mps2_defconfig -@@ -18,7 +18,7 @@ CONFIG_ARCH_MPS2=y - CONFIG_SET_MEM_PARAM=y - CONFIG_DRAM_BASE=0x21000000 - CONFIG_DRAM_SIZE=0x1000000 --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - # CONFIG_ATAGS is not set - CONFIG_ZBOOT_ROM_TEXT=0x0 - CONFIG_ZBOOT_ROM_BSS=0x0 -diff --git a/arch/arm/configs/mxs_defconfig b/arch/arm/configs/mxs_defconfig -index a9c6f32a9b1c..870866aaa39d 100644 ---- a/arch/arm/configs/mxs_defconfig -+++ b/arch/arm/configs/mxs_defconfig -@@ -1,7 +1,7 @@ - CONFIG_SYSVIPC=y - CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT_VOLUNTARY=n - CONFIG_TASKSTATS=y - CONFIG_TASK_DELAY_ACCT=y - CONFIG_TASK_XACCT=y -@@ -25,6 +25,13 @@ CONFIG_MODULE_UNLOAD=y - CONFIG_MODULE_FORCE_UNLOAD=y - CONFIG_MODVERSIONS=y - CONFIG_BLK_DEV_INTEGRITY=y -+# CONFIG_IOSCHED_DEADLINE is not set -+# CONFIG_IOSCHED_CFQ is not set -+# 
CONFIG_ARCH_MULTI_V7 is not set -+CONFIG_ARCH_MXS=y -+# CONFIG_ARM_THUMB is not set -+CONFIG_PREEMPT=y -+CONFIG_AEABI=y - CONFIG_NET=y - CONFIG_PACKET=y - CONFIG_UNIX=y -diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig -index 5d513f461957..7cb8456280be 100644 ---- a/arch/arm64/Kconfig -+++ b/arch/arm64/Kconfig -@@ -942,6 +942,8 @@ config SCHED_SMT - MultiThreading at a cost of slightly increased overhead in some - places. If unsure say N here. - -+source "kernel/Kconfig.MuQSS" -+ - config NR_CPUS - int "Maximum number of CPUs (2-4096)" - range 2 4096 -diff --git a/arch/mips/configs/fuloong2e_defconfig b/arch/mips/configs/fuloong2e_defconfig -index 6466e83067b4..776d8783fc2a 100644 ---- a/arch/mips/configs/fuloong2e_defconfig -+++ b/arch/mips/configs/fuloong2e_defconfig -@@ -4,7 +4,7 @@ CONFIG_SYSVIPC=y - CONFIG_POSIX_MQUEUE=y - CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_BSD_PROCESS_ACCT=y - CONFIG_IKCONFIG=y - CONFIG_IKCONFIG_PROC=y -diff --git a/arch/mips/configs/gpr_defconfig b/arch/mips/configs/gpr_defconfig -index 9085f4d6c698..fb23111d45f6 100644 ---- a/arch/mips/configs/gpr_defconfig -+++ b/arch/mips/configs/gpr_defconfig -@@ -1,8 +1,8 @@ -+CONFIG_PREEMPT=y - # CONFIG_LOCALVERSION_AUTO is not set - CONFIG_SYSVIPC=y - CONFIG_POSIX_MQUEUE=y - CONFIG_HIGH_RES_TIMERS=y --CONFIG_PREEMPT_VOLUNTARY=y - CONFIG_BSD_PROCESS_ACCT=y - CONFIG_BSD_PROCESS_ACCT_V3=y - CONFIG_RELAY=y -diff --git a/arch/mips/configs/ip22_defconfig b/arch/mips/configs/ip22_defconfig -index 21a1168ae301..529a1b1007cf 100644 ---- a/arch/mips/configs/ip22_defconfig -+++ b/arch/mips/configs/ip22_defconfig -@@ -1,7 +1,7 @@ - CONFIG_SYSVIPC=y - CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_IKCONFIG=y - CONFIG_IKCONFIG_PROC=y - CONFIG_LOG_BUF_SHIFT=14 -diff --git a/arch/mips/configs/ip28_defconfig b/arch/mips/configs/ip28_defconfig -index 0921ef38e9fb..6da05cef46f8 100644 ---- 
a/arch/mips/configs/ip28_defconfig -+++ b/arch/mips/configs/ip28_defconfig -@@ -1,5 +1,5 @@ - CONFIG_SYSVIPC=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_IKCONFIG=y - CONFIG_IKCONFIG_PROC=y - CONFIG_LOG_BUF_SHIFT=14 -diff --git a/arch/mips/configs/jazz_defconfig b/arch/mips/configs/jazz_defconfig -index 8c223035921f..a3bf87450343 100644 ---- a/arch/mips/configs/jazz_defconfig -+++ b/arch/mips/configs/jazz_defconfig -@@ -1,8 +1,8 @@ -+CONFIG_PREEMPT=y - CONFIG_SYSVIPC=y - CONFIG_POSIX_MQUEUE=y - CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y --CONFIG_PREEMPT_VOLUNTARY=y - CONFIG_BSD_PROCESS_ACCT=y - CONFIG_LOG_BUF_SHIFT=14 - CONFIG_RELAY=y -diff --git a/arch/mips/configs/mtx1_defconfig b/arch/mips/configs/mtx1_defconfig -index 914af125a7fa..76a64290373f 100644 ---- a/arch/mips/configs/mtx1_defconfig -+++ b/arch/mips/configs/mtx1_defconfig -@@ -1,8 +1,8 @@ -+CONFIG_PREEMPT=y - # CONFIG_LOCALVERSION_AUTO is not set - CONFIG_SYSVIPC=y - CONFIG_POSIX_MQUEUE=y - CONFIG_AUDIT=y --CONFIG_PREEMPT_VOLUNTARY=y - CONFIG_BSD_PROCESS_ACCT=y - CONFIG_BSD_PROCESS_ACCT_V3=y - CONFIG_RELAY=y -diff --git a/arch/mips/configs/nlm_xlr_defconfig b/arch/mips/configs/nlm_xlr_defconfig -index 4ecb157e56d4..ea7309283b01 100644 ---- a/arch/mips/configs/nlm_xlr_defconfig -+++ b/arch/mips/configs/nlm_xlr_defconfig -@@ -1,10 +1,10 @@ -+CONFIG_PREEMPT=y - # CONFIG_LOCALVERSION_AUTO is not set - CONFIG_SYSVIPC=y - CONFIG_POSIX_MQUEUE=y - CONFIG_AUDIT=y - CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y --CONFIG_PREEMPT_VOLUNTARY=y - CONFIG_BSD_PROCESS_ACCT=y - CONFIG_BSD_PROCESS_ACCT_V3=y - CONFIG_TASKSTATS=y -diff --git a/arch/mips/configs/pic32mzda_defconfig b/arch/mips/configs/pic32mzda_defconfig -index 63fe2da1b37f..7f08ee237345 100644 ---- a/arch/mips/configs/pic32mzda_defconfig -+++ b/arch/mips/configs/pic32mzda_defconfig -@@ -1,7 +1,7 @@ -+CONFIG_PREEMPT=y - CONFIG_SYSVIPC=y - CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y --CONFIG_PREEMPT_VOLUNTARY=y - CONFIG_IKCONFIG=y - 
CONFIG_IKCONFIG_PROC=y - CONFIG_LOG_BUF_SHIFT=14 -diff --git a/arch/mips/configs/pistachio_defconfig b/arch/mips/configs/pistachio_defconfig -index 24e07180c57d..38582e8f71c4 100644 ---- a/arch/mips/configs/pistachio_defconfig -+++ b/arch/mips/configs/pistachio_defconfig -@@ -1,9 +1,9 @@ -+CONFIG_PREEMPT=y - # CONFIG_LOCALVERSION_AUTO is not set - CONFIG_DEFAULT_HOSTNAME="localhost" - CONFIG_SYSVIPC=y - CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y --CONFIG_PREEMPT_VOLUNTARY=y - CONFIG_IKCONFIG=m - CONFIG_IKCONFIG_PROC=y - CONFIG_LOG_BUF_SHIFT=18 -diff --git a/arch/mips/configs/pnx8335_stb225_defconfig b/arch/mips/configs/pnx8335_stb225_defconfig -index d06db6b87959..fb2cd3234d95 100644 ---- a/arch/mips/configs/pnx8335_stb225_defconfig -+++ b/arch/mips/configs/pnx8335_stb225_defconfig -@@ -1,9 +1,9 @@ -+CONFIG_PREEMPT=y - # CONFIG_LOCALVERSION_AUTO is not set - # CONFIG_SWAP is not set - CONFIG_SYSVIPC=y - CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y --CONFIG_PREEMPT_VOLUNTARY=y - CONFIG_LOG_BUF_SHIFT=14 - CONFIG_EXPERT=y - CONFIG_SLAB=y -diff --git a/arch/mips/configs/rm200_defconfig b/arch/mips/configs/rm200_defconfig -index 30d7c3db884e..9e68acfa0d0e 100644 ---- a/arch/mips/configs/rm200_defconfig -+++ b/arch/mips/configs/rm200_defconfig -@@ -1,6 +1,6 @@ -+CONFIG_PREEMPT=y - CONFIG_SYSVIPC=y - CONFIG_POSIX_MQUEUE=y --CONFIG_PREEMPT_VOLUNTARY=y - CONFIG_BSD_PROCESS_ACCT=y - CONFIG_IKCONFIG=y - CONFIG_IKCONFIG_PROC=y -diff --git a/arch/parisc/configs/712_defconfig b/arch/parisc/configs/712_defconfig -new file mode 100644 -index 000000000000..578524f80cc4 ---- /dev/null -+++ b/arch/parisc/configs/712_defconfig -@@ -0,0 +1,181 @@ -+# CONFIG_LOCALVERSION_AUTO is not set -+CONFIG_SYSVIPC=y -+CONFIG_POSIX_MQUEUE=y -+CONFIG_IKCONFIG=y -+CONFIG_IKCONFIG_PROC=y -+CONFIG_LOG_BUF_SHIFT=16 -+CONFIG_BLK_DEV_INITRD=y -+CONFIG_KALLSYMS_ALL=y -+CONFIG_SLAB=y -+CONFIG_PROFILING=y -+CONFIG_OPROFILE=m -+CONFIG_MODULES=y -+CONFIG_MODULE_UNLOAD=y -+CONFIG_MODULE_FORCE_UNLOAD=y 
-+CONFIG_PA7100LC=y -+CONFIG_PREEMPT=y -+CONFIG_GSC_LASI=y -+# CONFIG_PDC_CHASSIS is not set -+CONFIG_BINFMT_MISC=m -+CONFIG_NET=y -+CONFIG_PACKET=y -+CONFIG_UNIX=y -+CONFIG_XFRM_USER=m -+CONFIG_NET_KEY=m -+CONFIG_INET=y -+CONFIG_IP_MULTICAST=y -+CONFIG_IP_PNP=y -+CONFIG_IP_PNP_DHCP=y -+CONFIG_IP_PNP_BOOTP=y -+CONFIG_INET_AH=m -+CONFIG_INET_ESP=m -+CONFIG_INET_DIAG=m -+# CONFIG_IPV6 is not set -+CONFIG_NETFILTER=y -+CONFIG_LLC2=m -+CONFIG_NET_PKTGEN=m -+CONFIG_DEVTMPFS=y -+CONFIG_DEVTMPFS_MOUNT=y -+# CONFIG_STANDALONE is not set -+# CONFIG_PREVENT_FIRMWARE_BUILD is not set -+CONFIG_PARPORT=y -+CONFIG_PARPORT_PC=m -+CONFIG_BLK_DEV_LOOP=y -+CONFIG_BLK_DEV_CRYPTOLOOP=y -+CONFIG_BLK_DEV_RAM=y -+CONFIG_BLK_DEV_RAM_SIZE=6144 -+CONFIG_ATA_OVER_ETH=m -+CONFIG_SCSI=y -+CONFIG_BLK_DEV_SD=y -+CONFIG_CHR_DEV_ST=y -+CONFIG_BLK_DEV_SR=y -+CONFIG_CHR_DEV_SG=y -+CONFIG_SCSI_ISCSI_ATTRS=m -+CONFIG_SCSI_LASI700=y -+CONFIG_SCSI_DEBUG=m -+CONFIG_MD=y -+CONFIG_BLK_DEV_MD=m -+CONFIG_MD_LINEAR=m -+CONFIG_MD_RAID0=m -+CONFIG_MD_RAID1=m -+CONFIG_NETDEVICES=y -+CONFIG_BONDING=m -+CONFIG_DUMMY=m -+CONFIG_TUN=m -+CONFIG_LASI_82596=y -+CONFIG_PPP=m -+CONFIG_PPP_BSDCOMP=m -+CONFIG_PPP_DEFLATE=m -+CONFIG_PPP_MPPE=m -+CONFIG_PPPOE=m -+CONFIG_PPP_ASYNC=m -+CONFIG_PPP_SYNC_TTY=m -+# CONFIG_KEYBOARD_HIL_OLD is not set -+CONFIG_MOUSE_SERIAL=m -+CONFIG_LEGACY_PTY_COUNT=64 -+CONFIG_SERIAL_8250=y -+CONFIG_SERIAL_8250_CONSOLE=y -+CONFIG_SERIAL_8250_NR_UARTS=17 -+CONFIG_SERIAL_8250_EXTENDED=y -+CONFIG_SERIAL_8250_MANY_PORTS=y -+CONFIG_SERIAL_8250_SHARE_IRQ=y -+# CONFIG_SERIAL_MUX is not set -+CONFIG_PDC_CONSOLE=y -+CONFIG_PRINTER=m -+CONFIG_PPDEV=m -+# CONFIG_HW_RANDOM is not set -+CONFIG_RAW_DRIVER=y -+# CONFIG_HWMON is not set -+CONFIG_FB=y -+CONFIG_FB_MODE_HELPERS=y -+CONFIG_FB_TILEBLITTING=y -+CONFIG_DUMMY_CONSOLE_COLUMNS=128 -+CONFIG_DUMMY_CONSOLE_ROWS=48 -+CONFIG_FRAMEBUFFER_CONSOLE=y -+CONFIG_LOGO=y -+# CONFIG_LOGO_LINUX_MONO is not set -+# CONFIG_LOGO_LINUX_VGA16 is not set -+# 
CONFIG_LOGO_LINUX_CLUT224 is not set -+CONFIG_SOUND=y -+CONFIG_SND=y -+CONFIG_SND_SEQUENCER=y -+CONFIG_SND_HARMONY=y -+CONFIG_EXT2_FS=y -+CONFIG_EXT3_FS=y -+CONFIG_JFS_FS=m -+CONFIG_XFS_FS=m -+CONFIG_AUTOFS4_FS=y -+CONFIG_ISO9660_FS=y -+CONFIG_JOLIET=y -+CONFIG_UDF_FS=m -+CONFIG_MSDOS_FS=m -+CONFIG_VFAT_FS=m -+CONFIG_PROC_KCORE=y -+CONFIG_TMPFS=y -+CONFIG_UFS_FS=m -+CONFIG_NFS_FS=y -+CONFIG_NFS_V4=y -+CONFIG_ROOT_NFS=y -+CONFIG_NFSD=m -+CONFIG_NFSD_V4=y -+CONFIG_CIFS=m -+CONFIG_NLS_CODEPAGE_437=m -+CONFIG_NLS_CODEPAGE_737=m -+CONFIG_NLS_CODEPAGE_775=m -+CONFIG_NLS_CODEPAGE_850=m -+CONFIG_NLS_CODEPAGE_852=m -+CONFIG_NLS_CODEPAGE_855=m -+CONFIG_NLS_CODEPAGE_857=m -+CONFIG_NLS_CODEPAGE_860=m -+CONFIG_NLS_CODEPAGE_861=m -+CONFIG_NLS_CODEPAGE_862=m -+CONFIG_NLS_CODEPAGE_863=m -+CONFIG_NLS_CODEPAGE_864=m -+CONFIG_NLS_CODEPAGE_865=m -+CONFIG_NLS_CODEPAGE_866=m -+CONFIG_NLS_CODEPAGE_869=m -+CONFIG_NLS_CODEPAGE_936=m -+CONFIG_NLS_CODEPAGE_950=m -+CONFIG_NLS_CODEPAGE_932=m -+CONFIG_NLS_CODEPAGE_949=m -+CONFIG_NLS_CODEPAGE_874=m -+CONFIG_NLS_ISO8859_8=m -+CONFIG_NLS_CODEPAGE_1250=m -+CONFIG_NLS_CODEPAGE_1251=m -+CONFIG_NLS_ASCII=m -+CONFIG_NLS_ISO8859_1=m -+CONFIG_NLS_ISO8859_2=m -+CONFIG_NLS_ISO8859_3=m -+CONFIG_NLS_ISO8859_4=m -+CONFIG_NLS_ISO8859_5=m -+CONFIG_NLS_ISO8859_6=m -+CONFIG_NLS_ISO8859_7=m -+CONFIG_NLS_ISO8859_9=m -+CONFIG_NLS_ISO8859_13=m -+CONFIG_NLS_ISO8859_14=m -+CONFIG_NLS_ISO8859_15=m -+CONFIG_NLS_KOI8_R=m -+CONFIG_NLS_KOI8_U=m -+CONFIG_NLS_UTF8=m -+CONFIG_DEBUG_FS=y -+CONFIG_MAGIC_SYSRQ=y -+CONFIG_DEBUG_KERNEL=y -+CONFIG_DEBUG_MUTEXES=y -+CONFIG_CRYPTO_TEST=m -+CONFIG_CRYPTO_HMAC=y -+CONFIG_CRYPTO_MICHAEL_MIC=m -+CONFIG_CRYPTO_SHA512=m -+CONFIG_CRYPTO_TGR192=m -+CONFIG_CRYPTO_WP512=m -+CONFIG_CRYPTO_ANUBIS=m -+CONFIG_CRYPTO_BLOWFISH=m -+CONFIG_CRYPTO_CAST6=m -+CONFIG_CRYPTO_KHAZAD=m -+CONFIG_CRYPTO_SERPENT=m -+CONFIG_CRYPTO_TEA=m -+CONFIG_CRYPTO_TWOFISH=m -+CONFIG_CRYPTO_DEFLATE=m -+# CONFIG_CRYPTO_HW is not set -+CONFIG_FONTS=y -+CONFIG_FONT_8x8=y 
-+CONFIG_FONT_8x16=y -diff --git a/arch/parisc/configs/c3000_defconfig b/arch/parisc/configs/c3000_defconfig -new file mode 100644 -index 000000000000..d1bdfad94048 ---- /dev/null -+++ b/arch/parisc/configs/c3000_defconfig -@@ -0,0 +1,151 @@ -+# CONFIG_LOCALVERSION_AUTO is not set -+CONFIG_SYSVIPC=y -+CONFIG_IKCONFIG=y -+CONFIG_IKCONFIG_PROC=y -+CONFIG_LOG_BUF_SHIFT=16 -+CONFIG_BLK_DEV_INITRD=y -+CONFIG_EXPERT=y -+CONFIG_KALLSYMS_ALL=y -+CONFIG_SLAB=y -+CONFIG_PROFILING=y -+CONFIG_OPROFILE=m -+CONFIG_MODULES=y -+CONFIG_MODULE_UNLOAD=y -+CONFIG_MODULE_FORCE_UNLOAD=y -+CONFIG_PA8X00=y -+CONFIG_PREEMPT=y -+# CONFIG_GSC is not set -+CONFIG_PCI=y -+CONFIG_PCI_LBA=y -+# CONFIG_PDC_CHASSIS is not set -+CONFIG_NET=y -+CONFIG_PACKET=y -+CONFIG_UNIX=y -+CONFIG_XFRM_USER=m -+CONFIG_NET_KEY=m -+CONFIG_INET=y -+CONFIG_IP_MULTICAST=y -+CONFIG_IP_PNP=y -+CONFIG_IP_PNP_BOOTP=y -+# CONFIG_INET_DIAG is not set -+CONFIG_INET6_IPCOMP=m -+CONFIG_IPV6_TUNNEL=m -+CONFIG_NETFILTER=y -+CONFIG_NET_PKTGEN=m -+CONFIG_DEVTMPFS=y -+CONFIG_DEVTMPFS_MOUNT=y -+# CONFIG_STANDALONE is not set -+# CONFIG_PREVENT_FIRMWARE_BUILD is not set -+CONFIG_BLK_DEV_UMEM=m -+CONFIG_BLK_DEV_LOOP=y -+CONFIG_BLK_DEV_CRYPTOLOOP=m -+CONFIG_IDE=y -+CONFIG_BLK_DEV_IDECD=y -+CONFIG_BLK_DEV_NS87415=y -+CONFIG_SCSI=y -+CONFIG_BLK_DEV_SD=y -+CONFIG_CHR_DEV_ST=y -+CONFIG_BLK_DEV_SR=y -+CONFIG_CHR_DEV_SG=y -+CONFIG_SCSI_ISCSI_ATTRS=m -+CONFIG_SCSI_SYM53C8XX_2=y -+CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=0 -+CONFIG_SCSI_DEBUG=m -+CONFIG_MD=y -+CONFIG_BLK_DEV_MD=y -+CONFIG_MD_LINEAR=y -+CONFIG_MD_RAID0=y -+CONFIG_MD_RAID1=y -+CONFIG_BLK_DEV_DM=m -+CONFIG_DM_CRYPT=m -+CONFIG_DM_SNAPSHOT=m -+CONFIG_DM_MIRROR=m -+CONFIG_DM_ZERO=m -+CONFIG_DM_MULTIPATH=m -+CONFIG_FUSION=y -+CONFIG_FUSION_SPI=m -+CONFIG_FUSION_CTL=m -+CONFIG_NETDEVICES=y -+CONFIG_BONDING=m -+CONFIG_DUMMY=m -+CONFIG_TUN=m -+CONFIG_ACENIC=m -+CONFIG_TIGON3=m -+CONFIG_NET_TULIP=y -+CONFIG_DE2104X=m -+CONFIG_TULIP=y -+CONFIG_TULIP_MMIO=y -+CONFIG_E100=m 
-+CONFIG_E1000=m -+CONFIG_PPP=m -+CONFIG_PPP_BSDCOMP=m -+CONFIG_PPP_DEFLATE=m -+CONFIG_PPPOE=m -+CONFIG_PPP_ASYNC=m -+CONFIG_PPP_SYNC_TTY=m -+# CONFIG_KEYBOARD_ATKBD is not set -+# CONFIG_MOUSE_PS2 is not set -+CONFIG_SERIO=m -+CONFIG_SERIO_LIBPS2=m -+CONFIG_SERIAL_8250=y -+CONFIG_SERIAL_8250_CONSOLE=y -+CONFIG_SERIAL_8250_NR_UARTS=13 -+CONFIG_SERIAL_8250_EXTENDED=y -+CONFIG_SERIAL_8250_MANY_PORTS=y -+CONFIG_SERIAL_8250_SHARE_IRQ=y -+# CONFIG_HW_RANDOM is not set -+CONFIG_RAW_DRIVER=y -+# CONFIG_HWMON is not set -+CONFIG_FB=y -+CONFIG_FRAMEBUFFER_CONSOLE=y -+CONFIG_LOGO=y -+# CONFIG_LOGO_LINUX_MONO is not set -+# CONFIG_LOGO_LINUX_VGA16 is not set -+# CONFIG_LOGO_LINUX_CLUT224 is not set -+CONFIG_SOUND=y -+CONFIG_SND=y -+CONFIG_SND_SEQUENCER=y -+CONFIG_SND_AD1889=y -+CONFIG_USB_HIDDEV=y -+CONFIG_USB=y -+CONFIG_USB_OHCI_HCD=y -+CONFIG_USB_PRINTER=m -+CONFIG_USB_STORAGE=m -+CONFIG_USB_STORAGE_USBAT=m -+CONFIG_USB_STORAGE_SDDR09=m -+CONFIG_USB_STORAGE_SDDR55=m -+CONFIG_USB_STORAGE_JUMPSHOT=m -+CONFIG_USB_MDC800=m -+CONFIG_USB_MICROTEK=m -+CONFIG_USB_LEGOTOWER=m -+CONFIG_EXT2_FS=y -+CONFIG_EXT3_FS=y -+CONFIG_XFS_FS=m -+CONFIG_AUTOFS4_FS=y -+CONFIG_ISO9660_FS=y -+CONFIG_JOLIET=y -+CONFIG_MSDOS_FS=m -+CONFIG_VFAT_FS=m -+CONFIG_PROC_KCORE=y -+CONFIG_TMPFS=y -+CONFIG_NFS_FS=y -+CONFIG_ROOT_NFS=y -+CONFIG_NFSD=y -+CONFIG_NFSD_V3=y -+CONFIG_NLS_CODEPAGE_437=m -+CONFIG_NLS_CODEPAGE_850=m -+CONFIG_NLS_ASCII=m -+CONFIG_NLS_ISO8859_1=m -+CONFIG_NLS_ISO8859_15=m -+CONFIG_NLS_UTF8=m -+CONFIG_DEBUG_FS=y -+CONFIG_HEADERS_INSTALL=y -+CONFIG_HEADERS_CHECK=y -+CONFIG_MAGIC_SYSRQ=y -+CONFIG_DEBUG_MUTEXES=y -+# CONFIG_DEBUG_BUGVERBOSE is not set -+CONFIG_CRYPTO_TEST=m -+CONFIG_CRYPTO_MD5=m -+CONFIG_CRYPTO_BLOWFISH=m -+CONFIG_CRYPTO_DES=m -+# CONFIG_CRYPTO_HW is not set -diff --git a/arch/parisc/configs/defconfig b/arch/parisc/configs/defconfig -new file mode 100644 -index 000000000000..0d976614934c ---- /dev/null -+++ b/arch/parisc/configs/defconfig -@@ -0,0 +1,206 @@ -+# 
CONFIG_LOCALVERSION_AUTO is not set -+CONFIG_SYSVIPC=y -+CONFIG_POSIX_MQUEUE=y -+CONFIG_IKCONFIG=y -+CONFIG_IKCONFIG_PROC=y -+CONFIG_LOG_BUF_SHIFT=16 -+CONFIG_BLK_DEV_INITRD=y -+CONFIG_KALLSYMS_ALL=y -+CONFIG_SLAB=y -+CONFIG_PROFILING=y -+CONFIG_OPROFILE=m -+CONFIG_MODULES=y -+CONFIG_MODULE_UNLOAD=y -+CONFIG_MODULE_FORCE_UNLOAD=y -+# CONFIG_BLK_DEV_BSG is not set -+CONFIG_PA7100LC=y -+CONFIG_PREEMPT=y -+CONFIG_IOMMU_CCIO=y -+CONFIG_GSC_LASI=y -+CONFIG_GSC_WAX=y -+CONFIG_EISA=y -+CONFIG_PCI=y -+CONFIG_GSC_DINO=y -+CONFIG_PCI_LBA=y -+CONFIG_PCCARD=y -+CONFIG_YENTA=y -+CONFIG_PD6729=y -+CONFIG_I82092=y -+CONFIG_BINFMT_MISC=m -+CONFIG_NET=y -+CONFIG_PACKET=y -+CONFIG_UNIX=y -+CONFIG_XFRM_USER=m -+CONFIG_NET_KEY=m -+CONFIG_INET=y -+CONFIG_IP_MULTICAST=y -+CONFIG_IP_PNP=y -+CONFIG_IP_PNP_DHCP=y -+CONFIG_IP_PNP_BOOTP=y -+CONFIG_INET_AH=m -+CONFIG_INET_ESP=m -+CONFIG_INET_DIAG=m -+CONFIG_INET6_AH=y -+CONFIG_INET6_ESP=y -+CONFIG_INET6_IPCOMP=y -+CONFIG_LLC2=m -+CONFIG_DEVTMPFS=y -+CONFIG_DEVTMPFS_MOUNT=y -+# CONFIG_STANDALONE is not set -+# CONFIG_PREVENT_FIRMWARE_BUILD is not set -+CONFIG_PARPORT=y -+CONFIG_PARPORT_PC=m -+CONFIG_PARPORT_PC_PCMCIA=m -+CONFIG_PARPORT_1284=y -+CONFIG_BLK_DEV_LOOP=y -+CONFIG_BLK_DEV_CRYPTOLOOP=y -+CONFIG_BLK_DEV_RAM=y -+CONFIG_BLK_DEV_RAM_SIZE=6144 -+CONFIG_IDE=y -+CONFIG_BLK_DEV_IDECS=y -+CONFIG_BLK_DEV_IDECD=y -+CONFIG_BLK_DEV_GENERIC=y -+CONFIG_BLK_DEV_NS87415=y -+CONFIG_SCSI=y -+CONFIG_BLK_DEV_SD=y -+CONFIG_CHR_DEV_ST=y -+CONFIG_BLK_DEV_SR=y -+CONFIG_CHR_DEV_SG=y -+CONFIG_SCSI_LASI700=y -+CONFIG_SCSI_SYM53C8XX_2=y -+CONFIG_SCSI_ZALON=y -+CONFIG_MD=y -+CONFIG_BLK_DEV_MD=y -+CONFIG_MD_LINEAR=y -+CONFIG_MD_RAID0=y -+CONFIG_MD_RAID1=y -+CONFIG_MD_RAID10=y -+CONFIG_BLK_DEV_DM=y -+CONFIG_NETDEVICES=y -+CONFIG_BONDING=m -+CONFIG_DUMMY=m -+CONFIG_TUN=m -+CONFIG_ACENIC=y -+CONFIG_TIGON3=y -+CONFIG_NET_TULIP=y -+CONFIG_TULIP=y -+CONFIG_LASI_82596=y -+CONFIG_PPP=m -+CONFIG_PPP_BSDCOMP=m -+CONFIG_PPP_DEFLATE=m -+CONFIG_PPPOE=m -+CONFIG_PPP_ASYNC=m 
-+CONFIG_PPP_SYNC_TTY=m -+# CONFIG_KEYBOARD_HIL_OLD is not set -+CONFIG_MOUSE_SERIAL=y -+CONFIG_LEGACY_PTY_COUNT=64 -+CONFIG_SERIAL_8250=y -+CONFIG_SERIAL_8250_CONSOLE=y -+CONFIG_SERIAL_8250_CS=y -+CONFIG_SERIAL_8250_NR_UARTS=17 -+CONFIG_SERIAL_8250_EXTENDED=y -+CONFIG_SERIAL_8250_MANY_PORTS=y -+CONFIG_SERIAL_8250_SHARE_IRQ=y -+CONFIG_PRINTER=m -+CONFIG_PPDEV=m -+# CONFIG_HW_RANDOM is not set -+# CONFIG_HWMON is not set -+CONFIG_FB=y -+CONFIG_FB_MODE_HELPERS=y -+CONFIG_FB_TILEBLITTING=y -+CONFIG_DUMMY_CONSOLE_COLUMNS=128 -+CONFIG_DUMMY_CONSOLE_ROWS=48 -+CONFIG_FRAMEBUFFER_CONSOLE=y -+CONFIG_LOGO=y -+# CONFIG_LOGO_LINUX_MONO is not set -+# CONFIG_LOGO_LINUX_VGA16 is not set -+# CONFIG_LOGO_LINUX_CLUT224 is not set -+CONFIG_SOUND=y -+CONFIG_SND=y -+CONFIG_SND_DYNAMIC_MINORS=y -+CONFIG_SND_SEQUENCER=y -+CONFIG_SND_AD1889=y -+CONFIG_SND_HARMONY=y -+CONFIG_HID_GYRATION=y -+CONFIG_HID_NTRIG=y -+CONFIG_HID_PANTHERLORD=y -+CONFIG_HID_PETALYNX=y -+CONFIG_HID_SAMSUNG=y -+CONFIG_HID_SUNPLUS=y -+CONFIG_HID_TOPSEED=y -+CONFIG_USB=y -+CONFIG_USB_MON=y -+CONFIG_USB_OHCI_HCD=y -+CONFIG_USB_UHCI_HCD=y -+CONFIG_EXT2_FS=y -+CONFIG_EXT3_FS=y -+CONFIG_ISO9660_FS=y -+CONFIG_JOLIET=y -+CONFIG_VFAT_FS=y -+CONFIG_PROC_KCORE=y -+CONFIG_TMPFS=y -+CONFIG_NFS_FS=y -+CONFIG_ROOT_NFS=y -+CONFIG_NFSD=y -+CONFIG_NFSD_V4=y -+CONFIG_CIFS=m -+CONFIG_NLS_CODEPAGE_437=y -+CONFIG_NLS_CODEPAGE_737=m -+CONFIG_NLS_CODEPAGE_775=m -+CONFIG_NLS_CODEPAGE_850=m -+CONFIG_NLS_CODEPAGE_852=m -+CONFIG_NLS_CODEPAGE_855=m -+CONFIG_NLS_CODEPAGE_857=m -+CONFIG_NLS_CODEPAGE_860=m -+CONFIG_NLS_CODEPAGE_861=m -+CONFIG_NLS_CODEPAGE_862=m -+CONFIG_NLS_CODEPAGE_863=m -+CONFIG_NLS_CODEPAGE_864=m -+CONFIG_NLS_CODEPAGE_865=m -+CONFIG_NLS_CODEPAGE_866=m -+CONFIG_NLS_CODEPAGE_869=m -+CONFIG_NLS_CODEPAGE_936=m -+CONFIG_NLS_CODEPAGE_950=m -+CONFIG_NLS_CODEPAGE_932=m -+CONFIG_NLS_CODEPAGE_949=m -+CONFIG_NLS_CODEPAGE_874=m -+CONFIG_NLS_ISO8859_8=m -+CONFIG_NLS_CODEPAGE_1250=y -+CONFIG_NLS_CODEPAGE_1251=m -+CONFIG_NLS_ASCII=m 
-+CONFIG_NLS_ISO8859_1=y -+CONFIG_NLS_ISO8859_2=m -+CONFIG_NLS_ISO8859_3=m -+CONFIG_NLS_ISO8859_4=m -+CONFIG_NLS_ISO8859_5=m -+CONFIG_NLS_ISO8859_6=m -+CONFIG_NLS_ISO8859_7=m -+CONFIG_NLS_ISO8859_9=m -+CONFIG_NLS_ISO8859_13=m -+CONFIG_NLS_ISO8859_14=m -+CONFIG_NLS_ISO8859_15=m -+CONFIG_NLS_KOI8_R=m -+CONFIG_NLS_KOI8_U=m -+CONFIG_NLS_UTF8=y -+CONFIG_DEBUG_FS=y -+CONFIG_HEADERS_INSTALL=y -+CONFIG_HEADERS_CHECK=y -+CONFIG_MAGIC_SYSRQ=y -+CONFIG_DEBUG_KERNEL=y -+CONFIG_DEBUG_MUTEXES=y -+CONFIG_KEYS=y -+CONFIG_CRYPTO_TEST=m -+CONFIG_CRYPTO_MICHAEL_MIC=m -+CONFIG_CRYPTO_SHA512=m -+CONFIG_CRYPTO_TGR192=m -+CONFIG_CRYPTO_WP512=m -+CONFIG_CRYPTO_ANUBIS=m -+CONFIG_CRYPTO_BLOWFISH=m -+CONFIG_CRYPTO_CAST6=m -+CONFIG_CRYPTO_KHAZAD=m -+CONFIG_CRYPTO_SERPENT=m -+CONFIG_CRYPTO_TEA=m -+CONFIG_CRYPTO_TWOFISH=m -+# CONFIG_CRYPTO_HW is not set -+CONFIG_LIBCRC32C=m -+CONFIG_FONTS=y -diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig -index b29d7cb38368..3af947541fdc 100644 ---- a/arch/powerpc/Kconfig -+++ b/arch/powerpc/Kconfig -@@ -879,6 +879,8 @@ config SCHED_SMT - when dealing with POWER5 cpus at a cost of slightly increased - overhead in some places. If unsure say N here. 
- -+source "kernel/Kconfig.MuQSS" -+ - config PPC_DENORMALISATION - bool "PowerPC denormalisation exception handling" - depends on PPC_BOOK3S_64 -diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig -index feb5d47d8d1e..6ce1ce306381 100644 ---- a/arch/powerpc/configs/ppc6xx_defconfig -+++ b/arch/powerpc/configs/ppc6xx_defconfig -@@ -74,7 +74,7 @@ CONFIG_QE_GPIO=y - CONFIG_MCU_MPC8349EMITX=y - CONFIG_HIGHMEM=y - CONFIG_HZ_1000=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_BINFMT_MISC=y - CONFIG_HIBERNATION=y - CONFIG_PM_DEBUG=y -diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c -index f18d5067cd0f..fe489fc01c73 100644 ---- a/arch/powerpc/platforms/cell/spufs/sched.c -+++ b/arch/powerpc/platforms/cell/spufs/sched.c -@@ -51,11 +51,6 @@ static struct task_struct *spusched_task; - static struct timer_list spusched_timer; - static struct timer_list spuloadavg_timer; - --/* -- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0). -- */ --#define NORMAL_PRIO 120 -- - /* - * Frequency of the spu scheduler tick. By default we do one SPU scheduler - * tick for every 10 CPU scheduler ticks. 
-diff --git a/arch/sh/configs/se7712_defconfig b/arch/sh/configs/se7712_defconfig -index 9a527f978106..5895f2cc726e 100644 ---- a/arch/sh/configs/se7712_defconfig -+++ b/arch/sh/configs/se7712_defconfig -@@ -23,7 +23,7 @@ CONFIG_FLATMEM_MANUAL=y - CONFIG_SH_SOLUTION_ENGINE=y - CONFIG_SH_PCLK_FREQ=66666666 - CONFIG_HEARTBEAT=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_CMDLINE_OVERWRITE=y - CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda1" - CONFIG_NET=y -diff --git a/arch/sh/configs/se7721_defconfig b/arch/sh/configs/se7721_defconfig -index 3b0e1eb6e874..e296a2cd9903 100644 ---- a/arch/sh/configs/se7721_defconfig -+++ b/arch/sh/configs/se7721_defconfig -@@ -23,7 +23,7 @@ CONFIG_FLATMEM_MANUAL=y - CONFIG_SH_7721_SOLUTION_ENGINE=y - CONFIG_SH_PCLK_FREQ=33333333 - CONFIG_HEARTBEAT=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_CMDLINE_OVERWRITE=y - CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda2" - CONFIG_NET=y -diff --git a/arch/sh/configs/titan_defconfig b/arch/sh/configs/titan_defconfig -index 4ec961ace688..a03a1ad670a0 100644 ---- a/arch/sh/configs/titan_defconfig -+++ b/arch/sh/configs/titan_defconfig -@@ -20,7 +20,7 @@ CONFIG_SH_TITAN=y - CONFIG_SH_PCLK_FREQ=30000000 - CONFIG_SH_DMA=y - CONFIG_SH_DMA_API=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_CMDLINE_OVERWRITE=y - CONFIG_CMDLINE="console=ttySC1,38400N81 root=/dev/nfs ip=:::::eth1:autoconf rw" - CONFIG_PCI=y -diff --git a/arch/sparc/configs/sparc64_defconfig b/arch/sparc/configs/sparc64_defconfig -index bde4d21a8ac8..c054ec82d91b 100644 ---- a/arch/sparc/configs/sparc64_defconfig -+++ b/arch/sparc/configs/sparc64_defconfig -@@ -22,7 +22,7 @@ CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y - CONFIG_NUMA=y - CONFIG_DEFAULT_MMAP_MIN_ADDR=8192 --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_SUN_LDOMS=y - CONFIG_PCI=y - CONFIG_PCI_MSI=y -diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig -index 2d3f963fd6f1..4df276a5781b 100644 ---- a/arch/x86/Kconfig -+++ 
b/arch/x86/Kconfig -@@ -1006,6 +1006,22 @@ config NR_CPUS - config SCHED_SMT - def_bool y if SMP - -+config SMT_NICE -+ bool "SMT (Hyperthreading) aware nice priority and policy support" -+ depends on SCHED_MUQSS && SCHED_SMT -+ default y -+ ---help--- -+ Enabling Hyperthreading on Intel CPUs decreases the effectiveness -+ of the use of 'nice' levels and different scheduling policies -+ (e.g. realtime) due to sharing of CPU power between hyperthreads. -+ SMT nice support makes each logical CPU aware of what is running on -+ its hyperthread siblings, maintaining appropriate distribution of -+ CPU according to nice levels and scheduling policies at the expense -+ of slightly increased overhead. -+ -+ If unsure say Y here. -+ -+ - config SCHED_MC - def_bool y - prompt "Multi-core scheduler support" -@@ -1036,6 +1052,8 @@ config SCHED_MC_PRIO - - If unsure say Y here. - -+source "kernel/Kconfig.MuQSS" -+ - config UP_LATE_INIT - def_bool y - depends on !SMP && X86_LOCAL_APIC -@@ -1423,7 +1441,7 @@ config HIGHMEM64G - endchoice - - choice -- prompt "Memory split" if EXPERT -+ prompt "Memory split" - default VMSPLIT_3G - depends on X86_32 - ---help--- -@@ -1443,17 +1461,17 @@ choice - option alone! 
- - config VMSPLIT_3G -- bool "3G/1G user/kernel split" -+ bool "Default 896MB lowmem (3G/1G user/kernel split)" - config VMSPLIT_3G_OPT - depends on !X86_PAE -- bool "3G/1G user/kernel split (for full 1G low memory)" -+ bool "1GB lowmem (3G/1G user/kernel split)" - config VMSPLIT_2G -- bool "2G/2G user/kernel split" -+ bool "2GB lowmem (2G/2G user/kernel split)" - config VMSPLIT_2G_OPT - depends on !X86_PAE -- bool "2G/2G user/kernel split (for full 2G low memory)" -+ bool "2GB lowmem (2G/2G user/kernel split)" - config VMSPLIT_1G -- bool "1G/3G user/kernel split" -+ bool "3GB lowmem (1G/3G user/kernel split)" - endchoice - - config PAGE_OFFSET -diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig -index 550904591e94..b5e80947326e 100644 ---- a/arch/x86/configs/i386_defconfig -+++ b/arch/x86/configs/i386_defconfig -@@ -29,7 +29,7 @@ CONFIG_SMP=y - CONFIG_X86_GENERIC=y - CONFIG_HPET_TIMER=y - CONFIG_SCHED_SMT=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y - CONFIG_X86_MCE=y - CONFIG_X86_REBOOTFIXUPS=y -diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig -index 614961009075..05802ec44d19 100644 ---- a/arch/x86/configs/x86_64_defconfig -+++ b/arch/x86/configs/x86_64_defconfig -@@ -27,7 +27,7 @@ CONFIG_MODULE_FORCE_UNLOAD=y - CONFIG_SMP=y - CONFIG_NR_CPUS=64 - CONFIG_SCHED_SMT=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y - CONFIG_X86_MCE=y - CONFIG_MICROCODE=y -diff --git a/drivers/block/swim.c b/drivers/block/swim.c -index 4c297f69171d..5bc4f1be2617 100644 ---- a/drivers/block/swim.c -+++ b/drivers/block/swim.c -@@ -328,7 +328,7 @@ static inline void swim_motor(struct swim __iomem *base, - if (swim_readbit(base, MOTOR_ON)) - break; - current->state = TASK_INTERRUPTIBLE; -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - } - } else if (action == OFF) { - swim_action(base, MOTOR_OFF); -@@ -347,7 +347,7 @@ static 
inline void swim_eject(struct swim __iomem *base) - if (!swim_readbit(base, DISK_IN)) - break; - current->state = TASK_INTERRUPTIBLE; -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - } - swim_select(base, RELAX); - } -@@ -371,7 +371,7 @@ static inline int swim_step(struct swim __iomem *base) - for (wait = 0; wait < HZ; wait++) { - - current->state = TASK_INTERRUPTIBLE; -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - - swim_select(base, RELAX); - if (!swim_readbit(base, STEP)) -diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c -index c48d8f086382..8a6e399936c7 100644 ---- a/drivers/char/ipmi/ipmi_msghandler.c -+++ b/drivers/char/ipmi/ipmi_msghandler.c -@@ -3543,7 +3543,7 @@ static void cleanup_smi_msgs(struct ipmi_smi *intf) - /* Current message first, to preserve order */ - while (intf->curr_msg && !list_empty(&intf->waiting_rcv_msgs)) { - /* Wait for the message to clear out. */ -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - } - - /* No need for locks, the interface is down. */ -diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c -index 2704470e021d..49504b7f3aa9 100644 ---- a/drivers/char/ipmi/ipmi_ssif.c -+++ b/drivers/char/ipmi/ipmi_ssif.c -@@ -1295,7 +1295,7 @@ static void shutdown_ssif(void *send_info) - - /* make sure the driver is not looking for flags any more. 
*/ - while (ssif_info->ssif_state != SSIF_NORMAL) -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - - ssif_info->stopping = true; - del_timer_sync(&ssif_info->watch_timer); -diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c -index 6941689085ed..ec5a24e95401 100644 ---- a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c -+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c -@@ -235,7 +235,7 @@ static int vmw_fifo_wait_noirq(struct vmw_private *dev_priv, - DRM_ERROR("SVGA device lockup.\n"); - break; - } -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - if (interruptible && signal_pending(current)) { - ret = -ERESTARTSYS; - break; -diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c b/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c -index 75f3efee21a4..09b1932ce85b 100644 ---- a/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c -+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c -@@ -203,7 +203,7 @@ int vmw_fallback_wait(struct vmw_private *dev_priv, - break; - } - if (lazy) -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - else if ((++count & 0x0F) == 0) { - /** - * FIXME: Use schedule_hr_timeout here for -diff --git a/drivers/hwmon/fam15h_power.c b/drivers/hwmon/fam15h_power.c -index 267eac00a3fb..352af68c6cd7 100644 ---- a/drivers/hwmon/fam15h_power.c -+++ b/drivers/hwmon/fam15h_power.c -@@ -225,7 +225,7 @@ static ssize_t power1_average_show(struct device *dev, - prev_ptsc[cu] = data->cpu_sw_pwr_ptsc[cu]; - } - -- leftover = schedule_timeout_interruptible(msecs_to_jiffies(data->power_period)); -+ leftover = schedule_msec_hrtimeout_interruptible((data->power_period)); - if (leftover) - return 0; - -diff --git a/drivers/iio/light/tsl2563.c b/drivers/iio/light/tsl2563.c -index d8c40a83097d..8332baf4961c 100644 ---- a/drivers/iio/light/tsl2563.c -+++ b/drivers/iio/light/tsl2563.c -@@ -269,11 +269,7 @@ static void tsl2563_wait_adc(struct tsl2563_chip *chip) - default: - delay = 402; - } -- /* -- * TODO: Make sure that we wait at least required delay but why we -- * have to 
extend it one tick more? -- */ -- schedule_timeout_interruptible(msecs_to_jiffies(delay) + 2); -+ schedule_msec_hrtimeout_interruptible(delay + 1); - } - - static int tsl2563_adjust_gainlevel(struct tsl2563_chip *chip, u16 adc) -diff --git a/drivers/media/i2c/msp3400-driver.c b/drivers/media/i2c/msp3400-driver.c -index 39530d43590e..a7caf2eb5771 100644 ---- a/drivers/media/i2c/msp3400-driver.c -+++ b/drivers/media/i2c/msp3400-driver.c -@@ -170,7 +170,7 @@ static int msp_read(struct i2c_client *client, int dev, int addr) - break; - dev_warn(&client->dev, "I/O error #%d (read 0x%02x/0x%02x)\n", err, - dev, addr); -- schedule_timeout_interruptible(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout_interruptible((10)); - } - if (err == 3) { - dev_warn(&client->dev, "resetting chip, sound will go off.\n"); -@@ -211,7 +211,7 @@ static int msp_write(struct i2c_client *client, int dev, int addr, int val) - break; - dev_warn(&client->dev, "I/O error #%d (write 0x%02x/0x%02x)\n", err, - dev, addr); -- schedule_timeout_interruptible(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout_interruptible((10)); - } - if (err == 3) { - dev_warn(&client->dev, "resetting chip, sound will go off.\n"); -diff --git a/drivers/media/pci/cx18/cx18-gpio.c b/drivers/media/pci/cx18/cx18-gpio.c -index cf7cfda94107..f63e17489547 100644 ---- a/drivers/media/pci/cx18/cx18-gpio.c -+++ b/drivers/media/pci/cx18/cx18-gpio.c -@@ -81,11 +81,11 @@ static void gpio_reset_seq(struct cx18 *cx, u32 active_lo, u32 active_hi, - - /* Assert */ - gpio_update(cx, mask, ~active_lo); -- schedule_timeout_uninterruptible(msecs_to_jiffies(assert_msecs)); -+ schedule_msec_hrtimeout_uninterruptible((assert_msecs)); - - /* Deassert */ - gpio_update(cx, mask, ~active_hi); -- schedule_timeout_uninterruptible(msecs_to_jiffies(recovery_msecs)); -+ schedule_msec_hrtimeout_uninterruptible((recovery_msecs)); - } - - /* -diff --git a/drivers/media/pci/ivtv/ivtv-gpio.c b/drivers/media/pci/ivtv/ivtv-gpio.c -index 
856e7ab7f33e..766a26251337 100644 ---- a/drivers/media/pci/ivtv/ivtv-gpio.c -+++ b/drivers/media/pci/ivtv/ivtv-gpio.c -@@ -105,7 +105,7 @@ void ivtv_reset_ir_gpio(struct ivtv *itv) - curout = (curout & ~0xF) | 1; - write_reg(curout, IVTV_REG_GPIO_OUT); - /* We could use something else for smaller time */ -- schedule_timeout_interruptible(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout_interruptible((1)); - curout |= 2; - write_reg(curout, IVTV_REG_GPIO_OUT); - curdir &= ~0x80; -@@ -125,11 +125,11 @@ int ivtv_reset_tuner_gpio(void *dev, int component, int cmd, int value) - curout = read_reg(IVTV_REG_GPIO_OUT); - curout &= ~(1 << itv->card->xceive_pin); - write_reg(curout, IVTV_REG_GPIO_OUT); -- schedule_timeout_interruptible(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout_interruptible((1)); - - curout |= 1 << itv->card->xceive_pin; - write_reg(curout, IVTV_REG_GPIO_OUT); -- schedule_timeout_interruptible(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout_interruptible((1)); - return 0; - } - -diff --git a/drivers/media/pci/ivtv/ivtv-ioctl.c b/drivers/media/pci/ivtv/ivtv-ioctl.c -index 137853944e46..76830892f373 100644 ---- a/drivers/media/pci/ivtv/ivtv-ioctl.c -+++ b/drivers/media/pci/ivtv/ivtv-ioctl.c -@@ -1137,7 +1137,7 @@ void ivtv_s_std_dec(struct ivtv *itv, v4l2_std_id std) - TASK_UNINTERRUPTIBLE); - if ((read_reg(IVTV_REG_DEC_LINE_FIELD) >> 16) < 100) - break; -- schedule_timeout(msecs_to_jiffies(25)); -+ schedule_msec_hrtimeout((25)); - } - finish_wait(&itv->vsync_waitq, &wait); - mutex_lock(&itv->serialize_lock); -diff --git a/drivers/media/pci/ivtv/ivtv-streams.c b/drivers/media/pci/ivtv/ivtv-streams.c -index f04ee84bab5f..c4469b4b8f99 100644 ---- a/drivers/media/pci/ivtv/ivtv-streams.c -+++ b/drivers/media/pci/ivtv/ivtv-streams.c -@@ -849,7 +849,7 @@ int ivtv_stop_v4l2_encode_stream(struct ivtv_stream *s, int gop_end) - while (!test_bit(IVTV_F_I_EOS, &itv->i_flags) && - time_before(jiffies, - then + msecs_to_jiffies(2000))) { -- 
schedule_timeout(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout((10)); - } - - /* To convert jiffies to ms, we must multiply by 1000 -diff --git a/drivers/media/radio/radio-mr800.c b/drivers/media/radio/radio-mr800.c -index cb0437b4c331..163fffc0e1d4 100644 ---- a/drivers/media/radio/radio-mr800.c -+++ b/drivers/media/radio/radio-mr800.c -@@ -366,7 +366,7 @@ static int vidioc_s_hw_freq_seek(struct file *file, void *priv, - retval = -ENODATA; - break; - } -- if (schedule_timeout_interruptible(msecs_to_jiffies(10))) { -+ if (schedule_msec_hrtimeout_interruptible((10))) { - retval = -ERESTARTSYS; - break; - } -diff --git a/drivers/media/radio/radio-tea5777.c b/drivers/media/radio/radio-tea5777.c -index fb9de7bbcd19..e53cf45e7f3f 100644 ---- a/drivers/media/radio/radio-tea5777.c -+++ b/drivers/media/radio/radio-tea5777.c -@@ -235,7 +235,7 @@ static int radio_tea5777_update_read_reg(struct radio_tea5777 *tea, int wait) - } - - if (wait) { -- if (schedule_timeout_interruptible(msecs_to_jiffies(wait))) -+ if (schedule_msec_hrtimeout_interruptible((wait))) - return -ERESTARTSYS; - } - -diff --git a/drivers/media/radio/tea575x.c b/drivers/media/radio/tea575x.c -index b0303cf00387..0925b5065147 100644 ---- a/drivers/media/radio/tea575x.c -+++ b/drivers/media/radio/tea575x.c -@@ -401,7 +401,7 @@ int snd_tea575x_s_hw_freq_seek(struct file *file, struct snd_tea575x *tea, - for (;;) { - if (time_after(jiffies, timeout)) - break; -- if (schedule_timeout_interruptible(msecs_to_jiffies(10))) { -+ if (schedule_msec_hrtimeout_interruptible((10))) { - /* some signal arrived, stop search */ - tea->val &= ~TEA575X_BIT_SEARCH; - snd_tea575x_set_freq(tea); -diff --git a/drivers/mfd/ucb1x00-core.c b/drivers/mfd/ucb1x00-core.c -index b690796d24d4..448b13da62b4 100644 ---- a/drivers/mfd/ucb1x00-core.c -+++ b/drivers/mfd/ucb1x00-core.c -@@ -250,7 +250,7 @@ unsigned int ucb1x00_adc_read(struct ucb1x00 *ucb, int adc_channel, int sync) - break; - /* yield to other processes */ - 
set_current_state(TASK_INTERRUPTIBLE); -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - } - - return UCB_ADC_DAT(val); -diff --git a/drivers/misc/sgi-xp/xpc_channel.c b/drivers/misc/sgi-xp/xpc_channel.c -index 8e6607fc8a67..b9ab770bbdb5 100644 ---- a/drivers/misc/sgi-xp/xpc_channel.c -+++ b/drivers/misc/sgi-xp/xpc_channel.c -@@ -834,7 +834,7 @@ xpc_allocate_msg_wait(struct xpc_channel *ch) - - atomic_inc(&ch->n_on_msg_allocate_wq); - prepare_to_wait(&ch->msg_allocate_wq, &wait, TASK_INTERRUPTIBLE); -- ret = schedule_timeout(1); -+ ret = schedule_min_hrtimeout(); - finish_wait(&ch->msg_allocate_wq, &wait); - atomic_dec(&ch->n_on_msg_allocate_wq); - -diff --git a/drivers/net/caif/caif_hsi.c b/drivers/net/caif/caif_hsi.c -index bbb2575d4728..637757144221 100644 ---- a/drivers/net/caif/caif_hsi.c -+++ b/drivers/net/caif/caif_hsi.c -@@ -939,7 +939,7 @@ static void cfhsi_wake_down(struct work_struct *work) - break; - - set_current_state(TASK_INTERRUPTIBLE); -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - retry--; - } - -diff --git a/drivers/net/can/usb/peak_usb/pcan_usb.c b/drivers/net/can/usb/peak_usb/pcan_usb.c -index d2539c95adb6..0c2f31a03ce9 100644 ---- a/drivers/net/can/usb/peak_usb/pcan_usb.c -+++ b/drivers/net/can/usb/peak_usb/pcan_usb.c -@@ -242,7 +242,7 @@ static int pcan_usb_write_mode(struct peak_usb_device *dev, u8 onoff) - } else { - /* the PCAN-USB needs time to init */ - set_current_state(TASK_INTERRUPTIBLE); -- schedule_timeout(msecs_to_jiffies(PCAN_USB_STARTUP_TIMEOUT)); -+ schedule_msec_hrtimeout((PCAN_USB_STARTUP_TIMEOUT)); - } - - return err; -diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c -index eccbf4cd7149..03d285f022b0 100644 ---- a/drivers/net/usb/lan78xx.c -+++ b/drivers/net/usb/lan78xx.c -@@ -2670,7 +2670,7 @@ static void lan78xx_terminate_urbs(struct lan78xx_net *dev) - while (!skb_queue_empty(&dev->rxq) && - !skb_queue_empty(&dev->txq) && - !skb_queue_empty(&dev->done)) { -- 
schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS)); -+ schedule_msec_hrtimeout((UNLINK_TIMEOUT_MS)); - set_current_state(TASK_UNINTERRUPTIBLE); - netif_dbg(dev, ifdown, dev->net, - "waited for %d urb completions\n", temp); -diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c -index 5ec97def3513..9e2bf55bbccd 100644 ---- a/drivers/net/usb/usbnet.c -+++ b/drivers/net/usb/usbnet.c -@@ -767,7 +767,7 @@ static void wait_skb_queue_empty(struct sk_buff_head *q) - spin_lock_irqsave(&q->lock, flags); - while (!skb_queue_empty(q)) { - spin_unlock_irqrestore(&q->lock, flags); -- schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS)); -+ schedule_msec_hrtimeout((UNLINK_TIMEOUT_MS)); - set_current_state(TASK_UNINTERRUPTIBLE); - spin_lock_irqsave(&q->lock, flags); - } -diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2100.c b/drivers/net/wireless/intel/ipw2x00/ipw2100.c -index 97ea6e2035e6..1c693729bbd3 100644 ---- a/drivers/net/wireless/intel/ipw2x00/ipw2100.c -+++ b/drivers/net/wireless/intel/ipw2x00/ipw2100.c -@@ -816,7 +816,7 @@ static int ipw2100_hw_send_command(struct ipw2100_priv *priv, - * doesn't seem to have as many firmware restart cycles... - * - * As a test, we're sticking in a 1/100s delay here */ -- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout_uninterruptible((10)); - - return 0; - -@@ -1267,7 +1267,7 @@ static int ipw2100_start_adapter(struct ipw2100_priv *priv) - IPW_DEBUG_FW("Waiting for f/w initialization to complete...\n"); - i = 5000; - do { -- schedule_timeout_uninterruptible(msecs_to_jiffies(40)); -+ schedule_msec_hrtimeout_uninterruptible((40)); - /* Todo... wait for sync command ... 
*/ - - read_register(priv->net_dev, IPW_REG_INTA, &inta); -diff --git a/drivers/parport/ieee1284.c b/drivers/parport/ieee1284.c -index 90fb73575495..c94048b048a5 100644 ---- a/drivers/parport/ieee1284.c -+++ b/drivers/parport/ieee1284.c -@@ -208,7 +208,7 @@ int parport_wait_peripheral(struct parport *port, - /* parport_wait_event didn't time out, but the - * peripheral wasn't actually ready either. - * Wait for another 10ms. */ -- schedule_timeout_interruptible(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout_interruptible((10)); - } - } - -diff --git a/drivers/parport/ieee1284_ops.c b/drivers/parport/ieee1284_ops.c -index 5d41dda6da4e..34705f6b423f 100644 ---- a/drivers/parport/ieee1284_ops.c -+++ b/drivers/parport/ieee1284_ops.c -@@ -537,7 +537,7 @@ size_t parport_ieee1284_ecp_read_data (struct parport *port, - /* Yield the port for a while. */ - if (count && dev->port->irq != PARPORT_IRQ_NONE) { - parport_release (dev); -- schedule_timeout_interruptible(msecs_to_jiffies(40)); -+ schedule_msec_hrtimeout_interruptible((40)); - parport_claim_or_block (dev); - } - else -diff --git a/drivers/platform/x86/intel_ips.c b/drivers/platform/x86/intel_ips.c -index bffe548187ee..c2918ee3e100 100644 ---- a/drivers/platform/x86/intel_ips.c -+++ b/drivers/platform/x86/intel_ips.c -@@ -798,7 +798,7 @@ static int ips_adjust(void *data) - ips_gpu_lower(ips); - - sleep: -- schedule_timeout_interruptible(msecs_to_jiffies(IPS_ADJUST_PERIOD)); -+ schedule_msec_hrtimeout_interruptible((IPS_ADJUST_PERIOD)); - } while (!kthread_should_stop()); - - dev_dbg(ips->dev, "ips-adjust thread stopped\n"); -@@ -974,7 +974,7 @@ static int ips_monitor(void *data) - seqno_timestamp = get_jiffies_64(); - - old_cpu_power = thm_readl(THM_CEC); -- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); -+ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); - - /* Collect an initial average */ - for (i = 0; i < IPS_SAMPLE_COUNT; i++) { -@@ -1001,7 +1001,7 @@ static int 
ips_monitor(void *data) - mchp_samples[i] = mchp; - } - -- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); -+ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); - if (kthread_should_stop()) - break; - } -@@ -1028,7 +1028,7 @@ static int ips_monitor(void *data) - * us to reduce the sample frequency if the CPU and GPU are idle. - */ - old_cpu_power = thm_readl(THM_CEC); -- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); -+ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); - last_sample_period = IPS_SAMPLE_PERIOD; - - timer_setup(&ips->timer, monitor_timeout, TIMER_DEFERRABLE); -diff --git a/drivers/rtc/rtc-wm8350.c b/drivers/rtc/rtc-wm8350.c -index 2018614f258f..fc19b312c345 100644 ---- a/drivers/rtc/rtc-wm8350.c -+++ b/drivers/rtc/rtc-wm8350.c -@@ -114,7 +114,7 @@ static int wm8350_rtc_settime(struct device *dev, struct rtc_time *tm) - /* Wait until confirmation of stopping */ - do { - rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); -- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout_uninterruptible((1)); - } while (--retries && !(rtc_ctrl & WM8350_RTC_STS)); - - if (!retries) { -@@ -197,7 +197,7 @@ static int wm8350_rtc_stop_alarm(struct wm8350 *wm8350) - /* Wait until confirmation of stopping */ - do { - rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); -- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout_uninterruptible((1)); - } while (retries-- && !(rtc_ctrl & WM8350_RTC_ALMSTS)); - - if (!(rtc_ctrl & WM8350_RTC_ALMSTS)) -@@ -220,7 +220,7 @@ static int wm8350_rtc_start_alarm(struct wm8350 *wm8350) - /* Wait until confirmation */ - do { - rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); -- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout_uninterruptible((1)); - } while (retries-- && rtc_ctrl & WM8350_RTC_ALMSTS); - - if (rtc_ctrl & WM8350_RTC_ALMSTS) -diff --git 
a/drivers/scsi/fnic/fnic_scsi.c b/drivers/scsi/fnic/fnic_scsi.c -index b60795893994..d2d05691dbd2 100644 ---- a/drivers/scsi/fnic/fnic_scsi.c -+++ b/drivers/scsi/fnic/fnic_scsi.c -@@ -216,7 +216,7 @@ int fnic_fw_reset_handler(struct fnic *fnic) - - /* wait for io cmpl */ - while (atomic_read(&fnic->in_flight)) -- schedule_timeout(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout((1)); - - spin_lock_irqsave(&fnic->wq_copy_lock[0], flags); - -@@ -2277,7 +2277,7 @@ static int fnic_clean_pending_aborts(struct fnic *fnic, - } - } - -- schedule_timeout(msecs_to_jiffies(2 * fnic->config.ed_tov)); -+ schedule_msec_hrtimeout((2 * fnic->config.ed_tov)); - - /* walk again to check, if IOs are still pending in fw */ - if (fnic_is_abts_pending(fnic, lr_sc)) -diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c -index ad62fb3f3a54..a84d4c99d7d7 100644 ---- a/drivers/scsi/lpfc/lpfc_scsi.c -+++ b/drivers/scsi/lpfc/lpfc_scsi.c -@@ -5191,7 +5191,7 @@ lpfc_reset_flush_io_context(struct lpfc_vport *vport, uint16_t tgt_id, - tgt_id, lun_id, context); - later = msecs_to_jiffies(2 * vport->cfg_devloss_tmo * 1000) + jiffies; - while (time_after(later, jiffies) && cnt) { -- schedule_timeout_uninterruptible(msecs_to_jiffies(20)); -+ schedule_msec_hrtimeout_uninterruptible((20)); - cnt = lpfc_sli_sum_iocb(vport, tgt_id, lun_id, context); - } - if (cnt) { -diff --git a/drivers/scsi/snic/snic_scsi.c b/drivers/scsi/snic/snic_scsi.c -index b3650c989ed4..7ed1fb285754 100644 ---- a/drivers/scsi/snic/snic_scsi.c -+++ b/drivers/scsi/snic/snic_scsi.c -@@ -2353,7 +2353,7 @@ snic_reset(struct Scsi_Host *shost, struct scsi_cmnd *sc) - - /* Wait for all the IOs that are entered in Qcmd */ - while (atomic_read(&snic->ios_inflight)) -- schedule_timeout(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout((1)); - - ret = snic_issue_hba_reset(snic, sc); - if (ret) { -diff --git a/drivers/staging/comedi/drivers/ni_mio_common.c b/drivers/staging/comedi/drivers/ni_mio_common.c -index 
d99f4065b96d..15f870d4e95f 100644 ---- a/drivers/staging/comedi/drivers/ni_mio_common.c -+++ b/drivers/staging/comedi/drivers/ni_mio_common.c -@@ -4748,7 +4748,7 @@ static int cs5529_wait_for_idle(struct comedi_device *dev) - if ((status & NI67XX_CAL_STATUS_BUSY) == 0) - break; - set_current_state(TASK_INTERRUPTIBLE); -- if (schedule_timeout(1)) -+ if (schedule_min_hrtimeout()) - return -EIO; - } - if (i == timeout) { -diff --git a/drivers/staging/rts5208/rtsx.c b/drivers/staging/rts5208/rtsx.c -index be0053c795b7..cc2e18c733e1 100644 ---- a/drivers/staging/rts5208/rtsx.c -+++ b/drivers/staging/rts5208/rtsx.c -@@ -490,7 +490,7 @@ static int rtsx_polling_thread(void *__dev) - - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); -- schedule_timeout(msecs_to_jiffies(POLLING_INTERVAL)); -+ schedule_msec_hrtimeout((POLLING_INTERVAL)); - - /* lock the device pointers */ - mutex_lock(&dev->dev_mutex); -diff --git a/drivers/staging/speakup/speakup_acntpc.c b/drivers/staging/speakup/speakup_acntpc.c -index c94328a5bd4a..6e7d4671aa69 100644 ---- a/drivers/staging/speakup/speakup_acntpc.c -+++ b/drivers/staging/speakup/speakup_acntpc.c -@@ -198,7 +198,7 @@ static void do_catch_up(struct spk_synth *synth) - full_time_val = full_time->u.n.value; - spin_unlock_irqrestore(&speakup_info.spinlock, flags); - if (synth_full()) { -- schedule_timeout(msecs_to_jiffies(full_time_val)); -+ schedule_msec_hrtimeout((full_time_val)); - continue; - } - set_current_state(TASK_RUNNING); -@@ -226,7 +226,7 @@ static void do_catch_up(struct spk_synth *synth) - jiffy_delta_val = jiffy_delta->u.n.value; - delay_time_val = delay_time->u.n.value; - spin_unlock_irqrestore(&speakup_info.spinlock, flags); -- schedule_timeout(msecs_to_jiffies(delay_time_val)); -+ schedule_msec_hrtimeout(delay_time_val); - jiff_max = jiffies + jiffy_delta_val; - } - } -diff --git a/drivers/staging/speakup/speakup_apollo.c b/drivers/staging/speakup/speakup_apollo.c -index 0877b4044c28..627102d048c1 100644 ---- 
a/drivers/staging/speakup/speakup_apollo.c -+++ b/drivers/staging/speakup/speakup_apollo.c -@@ -165,7 +165,7 @@ static void do_catch_up(struct spk_synth *synth) - if (!synth->io_ops->synth_out(synth, ch)) { - synth->io_ops->tiocmset(0, UART_MCR_RTS); - synth->io_ops->tiocmset(UART_MCR_RTS, 0); -- schedule_timeout(msecs_to_jiffies(full_time_val)); -+ schedule_msec_hrtimeout(full_time_val); - continue; - } - if (time_after_eq(jiffies, jiff_max) && (ch == SPACE)) { -diff --git a/drivers/staging/speakup/speakup_decext.c b/drivers/staging/speakup/speakup_decext.c -index ddbb7e97d118..f9502addc765 100644 ---- a/drivers/staging/speakup/speakup_decext.c -+++ b/drivers/staging/speakup/speakup_decext.c -@@ -176,7 +176,7 @@ static void do_catch_up(struct spk_synth *synth) - if (ch == '\n') - ch = 0x0D; - if (synth_full() || !synth->io_ops->synth_out(synth, ch)) { -- schedule_timeout(msecs_to_jiffies(delay_time_val)); -+ schedule_msec_hrtimeout(delay_time_val); - continue; - } - set_current_state(TASK_RUNNING); -diff --git a/drivers/staging/speakup/speakup_decpc.c b/drivers/staging/speakup/speakup_decpc.c -index 798c42dfa16c..d85b41db67a3 100644 ---- a/drivers/staging/speakup/speakup_decpc.c -+++ b/drivers/staging/speakup/speakup_decpc.c -@@ -394,7 +394,7 @@ static void do_catch_up(struct spk_synth *synth) - if (ch == '\n') - ch = 0x0D; - if (dt_sendchar(ch)) { -- schedule_timeout(msecs_to_jiffies(delay_time_val)); -+ schedule_msec_hrtimeout((delay_time_val)); - continue; - } - set_current_state(TASK_RUNNING); -diff --git a/drivers/staging/speakup/speakup_dectlk.c b/drivers/staging/speakup/speakup_dectlk.c -index dccb4ea29d37..8ecead307d04 100644 ---- a/drivers/staging/speakup/speakup_dectlk.c -+++ b/drivers/staging/speakup/speakup_dectlk.c -@@ -244,7 +244,7 @@ static void do_catch_up(struct spk_synth *synth) - if (ch == '\n') - ch = 0x0D; - if (synth_full_val || !synth->io_ops->synth_out(synth, ch)) { -- schedule_timeout(msecs_to_jiffies(delay_time_val)); -+ 
schedule_msec_hrtimeout(delay_time_val); - continue; - } - set_current_state(TASK_RUNNING); -diff --git a/drivers/staging/speakup/speakup_dtlk.c b/drivers/staging/speakup/speakup_dtlk.c -index dbebed0eeeec..6d83c13ca4a6 100644 ---- a/drivers/staging/speakup/speakup_dtlk.c -+++ b/drivers/staging/speakup/speakup_dtlk.c -@@ -211,7 +211,7 @@ static void do_catch_up(struct spk_synth *synth) - delay_time_val = delay_time->u.n.value; - spin_unlock_irqrestore(&speakup_info.spinlock, flags); - if (synth_full()) { -- schedule_timeout(msecs_to_jiffies(delay_time_val)); -+ schedule_msec_hrtimeout((delay_time_val)); - continue; - } - set_current_state(TASK_RUNNING); -@@ -227,7 +227,7 @@ static void do_catch_up(struct spk_synth *synth) - delay_time_val = delay_time->u.n.value; - jiffy_delta_val = jiffy_delta->u.n.value; - spin_unlock_irqrestore(&speakup_info.spinlock, flags); -- schedule_timeout(msecs_to_jiffies(delay_time_val)); -+ schedule_msec_hrtimeout((delay_time_val)); - jiff_max = jiffies + jiffy_delta_val; - } - } -diff --git a/drivers/staging/speakup/speakup_keypc.c b/drivers/staging/speakup/speakup_keypc.c -index 414827e888fc..cb31c9176daa 100644 ---- a/drivers/staging/speakup/speakup_keypc.c -+++ b/drivers/staging/speakup/speakup_keypc.c -@@ -199,7 +199,7 @@ static void do_catch_up(struct spk_synth *synth) - full_time_val = full_time->u.n.value; - spin_unlock_irqrestore(&speakup_info.spinlock, flags); - if (synth_full()) { -- schedule_timeout(msecs_to_jiffies(full_time_val)); -+ schedule_msec_hrtimeout((full_time_val)); - continue; - } - set_current_state(TASK_RUNNING); -@@ -232,7 +232,7 @@ static void do_catch_up(struct spk_synth *synth) - jiffy_delta_val = jiffy_delta->u.n.value; - delay_time_val = delay_time->u.n.value; - spin_unlock_irqrestore(&speakup_info.spinlock, flags); -- schedule_timeout(msecs_to_jiffies(delay_time_val)); -+ schedule_msec_hrtimeout(delay_time_val); - jiff_max = jiffies + jiffy_delta_val; - } - } -diff --git a/drivers/staging/speakup/synth.c 
b/drivers/staging/speakup/synth.c -index 3568bfb89912..0a80b3b098b2 100644 ---- a/drivers/staging/speakup/synth.c -+++ b/drivers/staging/speakup/synth.c -@@ -93,12 +93,8 @@ static void _spk_do_catch_up(struct spk_synth *synth, int unicode) - spin_unlock_irqrestore(&speakup_info.spinlock, flags); - if (ch == '\n') - ch = synth->procspeech; -- if (unicode) -- ret = synth->io_ops->synth_out_unicode(synth, ch); -- else -- ret = synth->io_ops->synth_out(synth, ch); -- if (!ret) { -- schedule_timeout(msecs_to_jiffies(full_time_val)); -+ if (!synth->io_ops->synth_out(synth, ch)) { -+ schedule_msec_hrtimeout(full_time_val); - continue; - } - if (time_after_eq(jiffies, jiff_max) && (ch == SPACE)) { -@@ -108,11 +104,9 @@ static void _spk_do_catch_up(struct spk_synth *synth, int unicode) - full_time_val = full_time->u.n.value; - spin_unlock_irqrestore(&speakup_info.spinlock, flags); - if (synth->io_ops->synth_out(synth, synth->procspeech)) -- schedule_timeout( -- msecs_to_jiffies(delay_time_val)); -+ schedule_msec_hrtimeout(delay_time_val); - else -- schedule_timeout( -- msecs_to_jiffies(full_time_val)); -+ schedule_msec_hrtimeout(full_time_val); - jiff_max = jiffies + jiffy_delta_val; - } - set_current_state(TASK_RUNNING); -diff --git a/drivers/staging/unisys/visornic/visornic_main.c b/drivers/staging/unisys/visornic/visornic_main.c -index 0433536930a9..d8726f28843f 100644 ---- a/drivers/staging/unisys/visornic/visornic_main.c -+++ b/drivers/staging/unisys/visornic/visornic_main.c -@@ -549,7 +549,7 @@ static int visornic_disable_with_timeout(struct net_device *netdev, - } - set_current_state(TASK_INTERRUPTIBLE); - spin_unlock_irqrestore(&devdata->priv_lock, flags); -- wait += schedule_timeout(msecs_to_jiffies(10)); -+ wait += schedule_msec_hrtimeout((10)); - spin_lock_irqsave(&devdata->priv_lock, flags); - } - -@@ -560,7 +560,7 @@ static int visornic_disable_with_timeout(struct net_device *netdev, - while (1) { - set_current_state(TASK_INTERRUPTIBLE); - 
spin_unlock_irqrestore(&devdata->priv_lock, flags); -- schedule_timeout(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout((10)); - spin_lock_irqsave(&devdata->priv_lock, flags); - if (atomic_read(&devdata->usage)) - break; -@@ -714,7 +714,7 @@ static int visornic_enable_with_timeout(struct net_device *netdev, - } - set_current_state(TASK_INTERRUPTIBLE); - spin_unlock_irqrestore(&devdata->priv_lock, flags); -- wait += schedule_timeout(msecs_to_jiffies(10)); -+ wait += schedule_msec_hrtimeout((10)); - spin_lock_irqsave(&devdata->priv_lock, flags); - } - -diff --git a/drivers/video/fbdev/omap/hwa742.c b/drivers/video/fbdev/omap/hwa742.c -index cfe63932f825..71c00ef772a3 100644 ---- a/drivers/video/fbdev/omap/hwa742.c -+++ b/drivers/video/fbdev/omap/hwa742.c -@@ -913,7 +913,7 @@ static void hwa742_resume(void) - if (hwa742_read_reg(HWA742_PLL_DIV_REG) & (1 << 7)) - break; - set_current_state(TASK_UNINTERRUPTIBLE); -- schedule_timeout(msecs_to_jiffies(5)); -+ schedule_msec_hrtimeout((5)); - } - hwa742_set_update_mode(hwa742.update_mode_before_suspend); - } -diff --git a/drivers/video/fbdev/pxafb.c b/drivers/video/fbdev/pxafb.c -index 00b96a78676e..37fc1c2d4cb9 100644 ---- a/drivers/video/fbdev/pxafb.c -+++ b/drivers/video/fbdev/pxafb.c -@@ -1287,7 +1287,7 @@ static int pxafb_smart_thread(void *arg) - mutex_unlock(&fbi->ctrlr_lock); - - set_current_state(TASK_INTERRUPTIBLE); -- schedule_timeout(msecs_to_jiffies(30)); -+ schedule_msec_hrtimeout((30)); - } - - pr_debug("%s(): task ending\n", __func__); -diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c -index 6009e0e939b5..43868e6a85dc 100644 ---- a/fs/btrfs/inode-map.c -+++ b/fs/btrfs/inode-map.c -@@ -91,7 +91,7 @@ static int caching_kthread(void *data) - btrfs_release_path(path); - root->ino_cache_progress = last; - up_read(&fs_info->commit_root_sem); -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - goto again; - } else - continue; -diff --git a/fs/proc/base.c b/fs/proc/base.c -index 
eb2255e95f62..62b8cedbccb6 100644 ---- a/fs/proc/base.c -+++ b/fs/proc/base.c -@@ -479,7 +479,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, - seq_puts(m, "0 0 0\n"); - else - seq_printf(m, "%llu %llu %lu\n", -- (unsigned long long)task->se.sum_exec_runtime, -+ (unsigned long long)tsk_seruntime(task), - (unsigned long long)task->sched_info.run_delay, - task->sched_info.pcount); - -diff --git a/include/linux/freezer.h b/include/linux/freezer.h -index 21f5aa0b217f..ee9b46394fdf 100644 ---- a/include/linux/freezer.h -+++ b/include/linux/freezer.h -@@ -297,6 +297,7 @@ static inline void set_freezable(void) {} - #define wait_event_freezekillable_unsafe(wq, condition) \ - wait_event_killable(wq, condition) - -+#define pm_freezing (false) - #endif /* !CONFIG_FREEZER */ - - #endif /* FREEZER_H_INCLUDED */ -diff --git a/include/linux/init_task.h b/include/linux/init_task.h -index 2c620d7ac432..73417df5daa2 100644 ---- a/include/linux/init_task.h -+++ b/include/linux/init_task.h -@@ -36,7 +36,11 @@ extern struct cred init_cred; - #define INIT_PREV_CPUTIME(x) - #endif - -+#ifdef CONFIG_SCHED_MUQSS -+#define INIT_TASK_COMM "MuQSS" -+#else - #define INIT_TASK_COMM "swapper" -+#endif - - /* Attach to the init_task data structure for proper alignment */ - #ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK -diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h -index e9bfe6972aed..16ba1c7e5bde 100644 ---- a/include/linux/ioprio.h -+++ b/include/linux/ioprio.h -@@ -53,6 +53,8 @@ enum { - */ - static inline int task_nice_ioprio(struct task_struct *task) - { -+ if (iso_task(task)) -+ return 0; - return (task_nice(task) + 20) / 5; - } - -diff --git a/include/linux/sched.h b/include/linux/sched.h -index 4418f5cb8324..71e3063c06b3 100644 ---- a/include/linux/sched.h -+++ b/include/linux/sched.h -@@ -31,6 +31,9 @@ - #include - #include - #include -+#ifdef CONFIG_SCHED_MUQSS -+#include -+#endif - - /* task_struct member predeclarations (sorted 
alphabetically): */ - struct audit_context; -@@ -214,13 +217,40 @@ struct task_group; - - extern void scheduler_tick(void); - --#define MAX_SCHEDULE_TIMEOUT LONG_MAX -- -+#define MAX_SCHEDULE_TIMEOUT LONG_MAX - extern long schedule_timeout(long timeout); - extern long schedule_timeout_interruptible(long timeout); - extern long schedule_timeout_killable(long timeout); - extern long schedule_timeout_uninterruptible(long timeout); - extern long schedule_timeout_idle(long timeout); -+ -+#ifdef CONFIG_HIGH_RES_TIMERS -+extern long schedule_msec_hrtimeout(long timeout); -+extern long schedule_min_hrtimeout(void); -+extern long schedule_msec_hrtimeout_interruptible(long timeout); -+extern long schedule_msec_hrtimeout_uninterruptible(long timeout); -+#else -+static inline long schedule_msec_hrtimeout(long timeout) -+{ -+ return schedule_timeout(msecs_to_jiffies(timeout)); -+} -+ -+static inline long schedule_min_hrtimeout(void) -+{ -+ return schedule_timeout(1); -+} -+ -+static inline long schedule_msec_hrtimeout_interruptible(long timeout) -+{ -+ return schedule_timeout_interruptible(msecs_to_jiffies(timeout)); -+} -+ -+static inline long schedule_msec_hrtimeout_uninterruptible(long timeout) -+{ -+ return schedule_timeout_uninterruptible(msecs_to_jiffies(timeout)); -+} -+#endif -+ - asmlinkage void schedule(void); - extern void schedule_preempt_disabled(void); - asmlinkage void preempt_schedule_irq(void); -@@ -652,9 +682,11 @@ struct task_struct { - unsigned int flags; - unsigned int ptrace; - -+#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_MUQSS) -+ int on_cpu; -+#endif - #ifdef CONFIG_SMP - struct llist_node wake_entry; -- int on_cpu; - #ifdef CONFIG_THREAD_INFO_IN_TASK - /* Current CPU: */ - unsigned int cpu; -@@ -679,10 +711,25 @@ struct task_struct { - int static_prio; - int normal_prio; - unsigned int rt_priority; -+#ifdef CONFIG_SCHED_MUQSS -+ int time_slice; -+ u64 deadline; -+ skiplist_node node; /* Skip list node */ -+ u64 last_ran; -+ u64 sched_time; /* 
sched_clock time spent running */ -+#ifdef CONFIG_SMT_NICE -+ int smt_bias; /* Policy/nice level bias across smt siblings */ -+#endif -+#ifdef CONFIG_HOTPLUG_CPU -+ bool zerobound; /* Bound to CPU0 for hotplug */ -+#endif -+ unsigned long rt_timeout; -+#else /* CONFIG_SCHED_MUQSS */ - - const struct sched_class *sched_class; - struct sched_entity se; - struct sched_rt_entity rt; -+#endif - #ifdef CONFIG_CGROUP_SCHED - struct task_group *sched_task_group; - #endif -@@ -850,6 +897,10 @@ struct task_struct { - #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME - u64 utimescaled; - u64 stimescaled; -+#endif -+#ifdef CONFIG_SCHED_MUQSS -+ /* Unbanked cpu time */ -+ unsigned long utime_ns, stime_ns; - #endif - u64 gtime; - struct prev_cputime prev_cputime; -@@ -1306,6 +1357,40 @@ struct task_struct { - */ - }; - -+#ifdef CONFIG_SCHED_MUQSS -+#define tsk_seruntime(t) ((t)->sched_time) -+#define tsk_rttimeout(t) ((t)->rt_timeout) -+ -+static inline void tsk_cpus_current(struct task_struct *p) -+{ -+} -+ -+void print_scheduler_version(void); -+ -+static inline bool iso_task(struct task_struct *p) -+{ -+ return (p->policy == SCHED_ISO); -+} -+#else /* CFS */ -+#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) -+#define tsk_rttimeout(t) ((t)->rt.timeout) -+ -+static inline void tsk_cpus_current(struct task_struct *p) -+{ -+ p->nr_cpus_allowed = current->nr_cpus_allowed; -+} -+ -+static inline void print_scheduler_version(void) -+{ -+ printk(KERN_INFO "CFS CPU scheduler.\n"); -+} -+ -+static inline bool iso_task(struct task_struct *p) -+{ -+ return false; -+} -+#endif /* CONFIG_SCHED_MUQSS */ -+ - static inline struct pid *task_pid(struct task_struct *task) - { - return task->thread_pid; -diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h -index 1aff00b65f3c..73d6319a856a 100644 ---- a/include/linux/sched/deadline.h -+++ b/include/linux/sched/deadline.h -@@ -28,7 +28,16 @@ static inline bool dl_time_before(u64 a, u64 b) - #ifdef CONFIG_SMP - - struct 
root_domain; -+#ifdef CONFIG_SCHED_MUQSS -+static inline void dl_clear_root_domain(struct root_domain *rd) -+{ -+} -+static inline void dl_add_task_root_domain(struct task_struct *p) -+{ -+} -+#else /* CONFIG_SCHED_MUQSS */ - extern void dl_add_task_root_domain(struct task_struct *p); - extern void dl_clear_root_domain(struct root_domain *rd); -+#endif /* CONFIG_SCHED_MUQSS */ - - #endif /* CONFIG_SMP */ -diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h -index 6d67e9a5af6b..101fe470aa8f 100644 ---- a/include/linux/sched/nohz.h -+++ b/include/linux/sched/nohz.h -@@ -13,7 +13,7 @@ extern int get_nohz_timer_target(void); - static inline void nohz_balance_enter_idle(int cpu) { } - #endif - --#ifdef CONFIG_NO_HZ_COMMON -+#if defined(CONFIG_NO_HZ_COMMON) && !defined(CONFIG_SCHED_MUQSS) - void calc_load_nohz_start(void); - void calc_load_nohz_remote(struct rq *rq); - void calc_load_nohz_stop(void); -diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h -index 7d64feafc408..43c9d9e50c09 100644 ---- a/include/linux/sched/prio.h -+++ b/include/linux/sched/prio.h -@@ -20,8 +20,20 @@ - */ - - #define MAX_USER_RT_PRIO 100 -+ -+#ifdef CONFIG_SCHED_MUQSS -+/* Note different MAX_RT_PRIO */ -+#define MAX_RT_PRIO (MAX_USER_RT_PRIO + 1) -+ -+#define ISO_PRIO (MAX_RT_PRIO) -+#define NORMAL_PRIO (MAX_RT_PRIO + 1) -+#define IDLE_PRIO (MAX_RT_PRIO + 2) -+#define PRIO_LIMIT ((IDLE_PRIO) + 1) -+#else /* CONFIG_SCHED_MUQSS */ - #define MAX_RT_PRIO MAX_USER_RT_PRIO - -+#endif /* CONFIG_SCHED_MUQSS */ -+ - #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) - #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) - -diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h -index e5af028c08b4..010b2244e0b6 100644 ---- a/include/linux/sched/rt.h -+++ b/include/linux/sched/rt.h -@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) - - if (policy == SCHED_FIFO || policy == SCHED_RR) - return true; -+#ifndef CONFIG_SCHED_MUQSS - if (policy 
== SCHED_DEADLINE) - return true; -+#endif - return false; - } - -diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h -index 38359071236a..e2ebedb6512c 100644 ---- a/include/linux/sched/task.h -+++ b/include/linux/sched/task.h -@@ -106,7 +106,7 @@ extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); - extern void free_task(struct task_struct *tsk); - - /* sched_exec is called by processes performing an exec */ --#ifdef CONFIG_SMP -+#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_MUQSS) - extern void sched_exec(void); - #else - #define sched_exec() {} -diff --git a/include/linux/skip_list.h b/include/linux/skip_list.h -new file mode 100644 -index 000000000000..d4be84ba273b ---- /dev/null -+++ b/include/linux/skip_list.h -@@ -0,0 +1,33 @@ -+#ifndef _LINUX_SKIP_LISTS_H -+#define _LINUX_SKIP_LISTS_H -+typedef u64 keyType; -+typedef void *valueType; -+ -+typedef struct nodeStructure skiplist_node; -+ -+struct nodeStructure { -+ int level; /* Levels in this structure */ -+ keyType key; -+ valueType value; -+ skiplist_node *next[8]; -+ skiplist_node *prev[8]; -+}; -+ -+typedef struct listStructure { -+ int entries; -+ int level; /* Maximum level of the list -+ (1 more than the number of levels in the list) */ -+ skiplist_node *header; /* pointer to header */ -+} skiplist; -+ -+void skiplist_init(skiplist_node *slnode); -+skiplist *new_skiplist(skiplist_node *slnode); -+void free_skiplist(skiplist *l); -+void skiplist_node_init(skiplist_node *node); -+void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed); -+void skiplist_delete(skiplist *l, skiplist_node *node); -+ -+static inline bool skiplist_node_empty(skiplist_node *node) { -+ return (!node->next[0]); -+} -+#endif /* _LINUX_SKIP_LISTS_H */ -diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h -index 3bac0a8ceab2..f48c5c5da651 100644 ---- a/include/uapi/linux/sched.h -+++ b/include/uapi/linux/sched.h -@@ -115,9 
+115,16 @@ struct clone_args { - #define SCHED_FIFO 1 - #define SCHED_RR 2 - #define SCHED_BATCH 3 --/* SCHED_ISO: reserved but not implemented yet */ -+/* SCHED_ISO: Implemented on MuQSS only */ - #define SCHED_IDLE 5 -+#ifdef CONFIG_SCHED_MUQSS -+#define SCHED_ISO 4 -+#define SCHED_IDLEPRIO SCHED_IDLE -+#define SCHED_MAX (SCHED_IDLEPRIO) -+#define SCHED_RANGE(policy) ((policy) <= SCHED_MAX) -+#else /* CONFIG_SCHED_MUQSS */ - #define SCHED_DEADLINE 6 -+#endif /* CONFIG_SCHED_MUQSS */ - - /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ - #define SCHED_RESET_ON_FORK 0x40000000 -diff --git a/init/Kconfig b/init/Kconfig -index 74a5ac65644f..44bba84664f3 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -61,6 +61,18 @@ config THREAD_INFO_IN_TASK - - menu "General setup" - -+config SCHED_MUQSS -+ bool "MuQSS cpu scheduler" -+ select HIGH_RES_TIMERS -+ ---help--- -+ The Multiple Queue Skiplist Scheduler for excellent interactivity and -+ responsiveness on the desktop and highly scalable deterministic -+ low latency on any hardware. -+ -+ Say Y here. -+ default y -+ -+ - config BROKEN - bool - -@@ -440,7 +452,7 @@ config HAVE_SCHED_AVG_IRQ - - config SCHED_THERMAL_PRESSURE - bool "Enable periodic averaging of thermal pressure" -- depends on SMP -+ depends on SMP && !SCHED_MUQSS - - config BSD_PROCESS_ACCT - bool "BSD Process Accounting" -@@ -777,6 +789,7 @@ config NUMA_BALANCING - depends on ARCH_SUPPORTS_NUMA_BALANCING - depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY - depends on SMP && NUMA && MIGRATION -+ depends on !SCHED_MUQSS - help - This option adds support for automatic NUMA aware memory/task placement. - The mechanism is quite primitive and is based on migrating memory when -@@ -876,9 +889,13 @@ menuconfig CGROUP_SCHED - help - This feature lets CPU scheduler recognize task groups and control CPU - bandwidth allocation to such task groups. It uses cgroups to group -- tasks. -+ tasks. 
In combination with MuQSS this is purely a STUB to create the -+ files associated with the CPU controller cgroup but most of the -+ controls do nothing. This is useful for working in environments and -+ with applications that will only work if this control group is -+ present. - --if CGROUP_SCHED -+if CGROUP_SCHED && !SCHED_MUQSS - config FAIR_GROUP_SCHED - bool "Group scheduling for SCHED_OTHER" - depends on CGROUP_SCHED -@@ -1007,6 +1024,7 @@ config CGROUP_DEVICE - - config CGROUP_CPUACCT - bool "Simple CPU accounting controller" -+ depends on !SCHED_MUQSS - help - Provides a simple controller for monitoring the - total CPU consumed by the tasks in a cgroup. -@@ -1134,6 +1152,7 @@ config CHECKPOINT_RESTORE - - config SCHED_AUTOGROUP - bool "Automatic process group scheduling" -+ depends on !SCHED_MUQSS - select CGROUPS - select CGROUP_SCHED - select FAIR_GROUP_SCHED -diff --git a/init/init_task.c b/init/init_task.c -index bd403ed3e418..5df65b2578eb 100644 ---- a/init/init_task.c -+++ b/init/init_task.c -@@ -67,9 +67,17 @@ struct task_struct init_task - .stack = init_stack, - .usage = REFCOUNT_INIT(2), - .flags = PF_KTHREAD, -+#ifdef CONFIG_SCHED_MUQSS -+ .prio = NORMAL_PRIO, -+ .static_prio = MAX_PRIO - 20, -+ .normal_prio = NORMAL_PRIO, -+ .deadline = 0, -+ .time_slice = 1000000, -+#else - .prio = MAX_PRIO - 20, - .static_prio = MAX_PRIO - 20, - .normal_prio = MAX_PRIO - 20, -+#endif - .policy = SCHED_NORMAL, - .cpus_ptr = &init_task.cpus_mask, - .cpus_mask = CPU_MASK_ALL, -@@ -79,6 +87,7 @@ struct task_struct init_task - .restart_block = { - .fn = do_no_restart_syscall, - }, -+#ifndef CONFIG_SCHED_MUQSS - .se = { - .group_node = LIST_HEAD_INIT(init_task.se.group_node), - }, -@@ -86,6 +95,7 @@ struct task_struct init_task - .run_list = LIST_HEAD_INIT(init_task.rt.run_list), - .time_slice = RR_TIMESLICE, - }, -+#endif - .tasks = LIST_HEAD_INIT(init_task.tasks), - #ifdef CONFIG_SMP - .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), -diff 
--git a/init/main.c b/init/main.c -index 03371976d387..63243a24de9b 100644 ---- a/init/main.c -+++ b/init/main.c -@@ -1411,6 +1411,8 @@ static int __ref kernel_init(void *unused) - - rcu_end_inkernel_boot(); - -+ print_scheduler_version(); -+ - if (ramdisk_execute_command) { - ret = run_init_process(ramdisk_execute_command); - if (!ret) -diff --git a/kernel/Kconfig.MuQSS b/kernel/Kconfig.MuQSS -new file mode 100644 -index 000000000000..a6a58781ef91 ---- /dev/null -+++ b/kernel/Kconfig.MuQSS -@@ -0,0 +1,105 @@ -+choice -+ prompt "CPU scheduler runqueue sharing" -+ default RQ_MC if SCHED_MUQSS -+ default RQ_NONE -+ -+config RQ_NONE -+ bool "No sharing" -+ help -+ This is the default behaviour where the CPU scheduler has one runqueue -+ per CPU, whether it is a physical or logical CPU (hyperthread). -+ -+ This can still be enabled runtime with the boot parameter -+ rqshare=none -+ -+ If unsure, say N. -+ -+config RQ_SMT -+ bool "SMT (hyperthread) siblings" -+ depends on SCHED_SMT && SCHED_MUQSS -+ -+ help -+ With this option enabled, the CPU scheduler will have one runqueue -+ shared by SMT (hyperthread) siblings. As these logical cores share -+ one physical core, sharing the runqueue resource can lead to decreased -+ overhead, lower latency and higher throughput. -+ -+ This can still be enabled runtime with the boot parameter -+ rqshare=smt -+ -+ If unsure, say N. -+ -+config RQ_MC -+ bool "Multicore siblings" -+ depends on SCHED_MC && SCHED_MUQSS -+ help -+ With this option enabled, the CPU scheduler will have one runqueue -+ shared by multicore siblings in addition to any SMT siblings. -+ As these physical cores share caches, sharing the runqueue resource -+ will lead to lower latency, but its effects on overhead and throughput -+ are less predictable. As a general rule, 6 or fewer cores will likely -+ benefit from this, while larger CPUs will only derive a latency -+ benefit. If your workloads are primarily single threaded, this will -+ possibly worsen throughput. 
If you are only concerned about latency -+ then enable this regardless of how many cores you have. -+ -+ This can still be enabled runtime with the boot parameter -+ rqshare=mc -+ -+ If unsure, say Y. -+ -+config RQ_MC_LLC -+ bool "Multicore siblings (LLC)" -+ depends on SCHED_MC && SCHED_MUQSS -+ help -+ With this option enabled, the CPU scheduler will behave similarly as -+ with "Multicore siblings". -+ This option takes LLC cache into account when scheduling tasks. -+ Option may benefit CPUs with multiple LLC caches, such as Ryzen -+ and Xeon CPUs. -+ -+ This can still be enabled runtime with the boot parameter -+ rqshare=llc -+ -+ If unsure, say N. -+ -+config RQ_SMP -+ bool "Symmetric Multi-Processing" -+ depends on SMP && SCHED_MUQSS -+ help -+ With this option enabled, the CPU scheduler will have one runqueue -+ shared by all physical CPUs unless they are on separate NUMA nodes. -+ As physical CPUs usually do not share resources, sharing the runqueue -+ will normally worsen throughput but improve latency. If you only -+ care about latency enable this. -+ -+ This can still be enabled runtime with the boot parameter -+ rqshare=smp -+ -+ If unsure, say N. -+ -+config RQ_ALL -+ bool "NUMA" -+ depends on SMP && SCHED_MUQSS -+ help -+ With this option enabled, the CPU scheduler will have one runqueue -+ regardless of the architecture configuration, including across NUMA -+ nodes. This can substantially decrease throughput in NUMA -+ configurations, but light NUMA designs will not be dramatically -+ affected. This option should only be chosen if latency is the prime -+ concern. -+ -+ This can still be enabled runtime with the boot parameter -+ rqshare=all -+ -+ If unsure, say N. 
-+endchoice -+ -+config SHARERQ -+ int -+ default 0 if RQ_NONE -+ default 1 if RQ_SMT -+ default 2 if RQ_MC -+ default 3 if RQ_MC_LLC -+ default 4 if RQ_SMP -+ default 5 if RQ_ALL -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 38ef6d06888e..89ed751ac4e4 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -5,7 +5,8 @@ - - choice - prompt "Timer frequency" -- default HZ_250 -+ default HZ_100 if SCHED_MUQSS -+ default HZ_250_NODEF if !SCHED_MUQSS - help - Allows the configuration of the timer frequency. It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -20,11 +21,18 @@ choice - config HZ_100 - bool "100 HZ" - help -+ 100 Hz is a suitable choice in combination with MuQSS which does -+ not rely on ticks for rescheduling interrupts, and is not Hz limited -+ for timeouts and sleeps from both the kernel and userspace. -+ This allows us to benefit from the lower overhead and higher -+ throughput of fewer timer ticks. -+ -+ Non-MuQSS kernels: - 100 Hz is a typical choice for servers, SMP and NUMA systems - with lots of processors that may show reduced performance if - too many timer interrupts are occurring. - -- config HZ_250 -+ config HZ_250_NODEF - bool "250 HZ" - help - 250 Hz is a good compromise choice allowing server performance -@@ -32,7 +40,10 @@ choice - on SMP and NUMA systems. If you are going to be using NTSC video - or multimedia, selected 300Hz instead. - -- config HZ_300 -+ 250 Hz is the default choice for the mainline scheduler but not -+ advantageous in combination with MuQSS. -+ -+ config HZ_300_NODEF - bool "300 HZ" - help - 300 Hz is a good compromise choice allowing server performance -@@ -40,7 +51,7 @@ choice - on SMP and NUMA systems and exactly dividing by both PAL and - NTSC frame rates for video and multimedia work. 
- -- config HZ_1000 -+ config HZ_1000_NODEF - bool "1000 HZ" - help - 1000 Hz is the preferred choice for desktop systems and other -@@ -51,9 +62,9 @@ endchoice - config HZ - int - default 100 if HZ_100 -- default 250 if HZ_250 -- default 300 if HZ_300 -- default 1000 if HZ_1000 -+ default 250 if HZ_250_NODEF -+ default 300 if HZ_300_NODEF -+ default 1000 if HZ_1000_NODEF - - config SCHED_HRTICK - def_bool HIGH_RES_TIMERS -diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt -index bf82259cff96..d9438eb6f91c 100644 ---- a/kernel/Kconfig.preempt -+++ b/kernel/Kconfig.preempt -@@ -2,7 +2,7 @@ - - choice - prompt "Preemption Model" -- default PREEMPT_NONE -+ default PREEMPT - - config PREEMPT_NONE - bool "No Forced Preemption (Server)" -@@ -18,7 +18,7 @@ config PREEMPT_NONE - latencies. - - config PREEMPT_VOLUNTARY -- bool "Voluntary Kernel Preemption (Desktop)" -+ bool "Voluntary Kernel Preemption (Nothing)" - depends on !ARCH_NO_PREEMPT - help - This option reduces the latency of the kernel by adding more -@@ -33,7 +33,8 @@ config PREEMPT_VOLUNTARY - applications to run more 'smoothly' even when the system is - under load. - -- Select this if you are building a kernel for a desktop system. -+ Select this for no system in particular (choose Preemptible -+ instead on a desktop if you know what's good for you). 
- - config PREEMPT - bool "Preemptible Kernel (Low-Latency Desktop)" -diff --git a/kernel/Makefile b/kernel/Makefile -index 4cb4130ced32..b11afae9eea8 100644 ---- a/kernel/Makefile -+++ b/kernel/Makefile -@@ -10,7 +10,7 @@ obj-y = fork.o exec_domain.o panic.o \ - extable.o params.o \ - kthread.o sys_ni.o nsproxy.o \ - notifier.o ksysfs.o cred.o reboot.o \ -- async.o range.o smpboot.o ucount.o -+ async.o range.o smpboot.o ucount.o skip_list.o - - obj-$(CONFIG_MODULES) += kmod.o - obj-$(CONFIG_MULTIUSER) += groups.o -diff --git a/kernel/delayacct.c b/kernel/delayacct.c -index 27725754ac99..769d773c7182 100644 ---- a/kernel/delayacct.c -+++ b/kernel/delayacct.c -@@ -106,7 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) - */ - t1 = tsk->sched_info.pcount; - t2 = tsk->sched_info.run_delay; -- t3 = tsk->se.sum_exec_runtime; -+ t3 = tsk_seruntime(tsk); - - d->cpu_count += t1; - -diff --git a/kernel/exit.c b/kernel/exit.c -index ce2a75bc0ade..f0f864bc1ab9 100644 ---- a/kernel/exit.c -+++ b/kernel/exit.c -@@ -122,7 +122,7 @@ static void __exit_signal(struct task_struct *tsk) - sig->curr_target = next_thread(tsk); - } - -- add_device_randomness((const void*) &tsk->se.sum_exec_runtime, -+ add_device_randomness((const void*) &tsk_seruntime(tsk), - sizeof(unsigned long long)); - - /* -@@ -143,7 +143,7 @@ static void __exit_signal(struct task_struct *tsk) - sig->inblock += task_io_get_inblock(tsk); - sig->oublock += task_io_get_oublock(tsk); - task_io_accounting_add(&sig->ioac, &tsk->ioac); -- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; -+ sig->sum_sched_runtime += tsk_seruntime(tsk); - sig->nr_threads--; - __unhash_process(tsk, group_dead); - write_sequnlock(&sig->stats_lock); -diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig -index 20d501af4f2e..f92cabe495bd 100644 ---- a/kernel/irq/Kconfig -+++ b/kernel/irq/Kconfig -@@ -115,6 +115,23 @@ config GENERIC_IRQ_RESERVATION_MODE - config IRQ_FORCED_THREADING - bool - -+config 
FORCE_IRQ_THREADING -+ bool "Make IRQ threading compulsory" -+ depends on IRQ_FORCED_THREADING -+ default n -+ ---help--- -+ -+ Make IRQ threading mandatory for any IRQ handlers that support it -+ instead of being optional and requiring the threadirqs kernel -+ parameter. Instead they can be optionally disabled with the -+ nothreadirqs kernel parameter. -+ -+ Enabling this may make some architectures not boot with runqueue -+ sharing and MuQSS. -+ -+ Enable if you are building for a desktop or low latency system, -+ otherwise say N. -+ - config SPARSE_IRQ - bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ - ---help--- -diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c -index 453a8a0f4804..2f14a31d8efd 100644 ---- a/kernel/irq/manage.c -+++ b/kernel/irq/manage.c -@@ -25,9 +25,20 @@ - #include "internals.h" - - #if defined(CONFIG_IRQ_FORCED_THREADING) && !defined(CONFIG_PREEMPT_RT) -+#ifdef CONFIG_FORCE_IRQ_THREADING -+__read_mostly bool force_irqthreads = true; -+#else - __read_mostly bool force_irqthreads; -+#endif - EXPORT_SYMBOL_GPL(force_irqthreads); - -+static int __init setup_noforced_irqthreads(char *arg) -+{ -+ force_irqthreads = false; -+ return 0; -+} -+early_param("nothreadirqs", setup_noforced_irqthreads); -+ - static int __init setup_forced_irqthreads(char *arg) - { - force_irqthreads = true; -diff --git a/kernel/kthread.c b/kernel/kthread.c -index bfbfa481be3a..f5942fb29ba8 100644 ---- a/kernel/kthread.c -+++ b/kernel/kthread.c -@@ -446,6 +446,34 @@ void kthread_bind(struct task_struct *p, unsigned int cpu) - } - EXPORT_SYMBOL(kthread_bind); - -+#if defined(CONFIG_SCHED_MUQSS) && defined(CONFIG_SMP) -+extern void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); -+ -+/* -+ * new_kthread_bind is a special variant of __kthread_bind_mask. 
-+ * For new threads to work on muqss we want to call do_set_cpus_allowed -+ * without the task_cpu being set and the task rescheduled until they're -+ * rescheduled on their own so we call __do_set_cpus_allowed directly which -+ * only changes the cpumask. This is particularly important for smpboot threads -+ * to work. -+ */ -+static void new_kthread_bind(struct task_struct *p, unsigned int cpu) -+{ -+ unsigned long flags; -+ -+ if (WARN_ON(!wait_task_inactive(p, TASK_UNINTERRUPTIBLE))) -+ return; -+ -+ /* It's safe because the task is inactive. */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ __do_set_cpus_allowed(p, cpumask_of(cpu)); -+ p->flags |= PF_NO_SETAFFINITY; -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+} -+#else -+#define new_kthread_bind(p, cpu) kthread_bind(p, cpu) -+#endif -+ - /** - * kthread_create_on_cpu - Create a cpu bound kthread - * @threadfn: the function to run until signal_pending(current). -@@ -467,7 +495,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), - cpu); - if (IS_ERR(p)) - return p; -- kthread_bind(p, cpu); -+ new_kthread_bind(p, cpu); - /* CPU hotplug need to bind once again when unparking the thread. */ - set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags); - to_kthread(p)->cpu = cpu; -diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c -index f6310f848f34..825f9b8e228f 100644 ---- a/kernel/livepatch/transition.c -+++ b/kernel/livepatch/transition.c -@@ -282,7 +282,7 @@ static bool klp_try_switch_task(struct task_struct *task) - { - static char err_buf[STACK_ERR_BUF_SIZE]; - struct rq *rq; -- struct rq_flags flags; -+ struct rq_flags rf; - int ret; - bool success = false; - -@@ -304,7 +304,7 @@ static bool klp_try_switch_task(struct task_struct *task) - * functions. If all goes well, switch the task to the target patch - * state. 
- */ -- rq = task_rq_lock(task, &flags); -+ rq = task_rq_lock(task, &rf); - - if (task_running(rq, task) && task != current) { - snprintf(err_buf, STACK_ERR_BUF_SIZE, -@@ -323,7 +323,7 @@ static bool klp_try_switch_task(struct task_struct *task) - task->patch_state = klp_target_state; - - done: -- task_rq_unlock(rq, task, &flags); -+ task_rq_unlock(rq, task, &rf); - - /* - * Due to console deadlock issues, pr_debug() can't be used while -diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile -index 21fb5a5662b5..a04ffebc6b7a 100644 ---- a/kernel/sched/Makefile -+++ b/kernel/sched/Makefile -@@ -16,15 +16,23 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) - CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer - endif - -+ifdef CONFIG_SCHED_MUQSS -+obj-y += MuQSS.o clock.o cputime.o -+obj-y += idle.o -+obj-y += wait.o wait_bit.o swait.o completion.o -+ -+obj-$(CONFIG_SMP) += topology.o -+else - obj-y += core.o loadavg.o clock.o cputime.o - obj-y += idle.o fair.o rt.o deadline.o - obj-y += wait.o wait_bit.o swait.o completion.o - - obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o - obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o --obj-$(CONFIG_SCHEDSTATS) += stats.o - obj-$(CONFIG_SCHED_DEBUG) += debug.o - obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o -+endif -+obj-$(CONFIG_SCHEDSTATS) += stats.o - obj-$(CONFIG_CPU_FREQ) += cpufreq.o - obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o - obj-$(CONFIG_MEMBARRIER) += membarrier.o -diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c -new file mode 100644 -index 000000000000..18a9b4a23e44 ---- /dev/null -+++ b/kernel/sched/MuQSS.c -@@ -0,0 +1,7624 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * kernel/sched/MuQSS.c, was kernel/sched.c -+ * -+ * Kernel scheduler and related syscalls -+ * -+ * Copyright (C) 1991-2002 Linus Torvalds -+ * -+ * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and -+ * make semaphores SMP safe -+ * 1998-11-19 Implemented schedule_timeout() and 
related stuff -+ * by Andrea Arcangeli -+ * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: -+ * hybrid priority-list and round-robin design with -+ * an array-switch method of distributing timeslices -+ * and per-CPU runqueues. Cleanups and useful suggestions -+ * by Davide Libenzi, preemptible kernel bits by Robert Love. -+ * 2003-09-03 Interactivity tuning by Con Kolivas. -+ * 2004-04-02 Scheduler domains code by Nick Piggin -+ * 2007-04-15 Work begun on replacing all interactivity tuning with a -+ * fair scheduling design by Con Kolivas. -+ * 2007-05-05 Load balancing (smp-nice) and other improvements -+ * by Peter Williams -+ * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith -+ * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri -+ * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, -+ * Thomas Gleixner, Mike Kravetz -+ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes -+ * a whole lot of those previous things. -+ * 2016-10-01 Multiple Queue Skiplist Scheduler scalable evolution of BFS -+ * scheduler by Con Kolivas. 
-+ * 2019-08-31 LLC bits by Eduards Bezverhijs -+ */ -+ -+#include -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+#include -+ -+#include "../workqueue_internal.h" -+#include "../../fs/io-wq.h" -+#include "../smpboot.h" -+ -+#define CREATE_TRACE_POINTS -+#include -+ -+#include "MuQSS.h" -+ -+#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) -+#define rt_task(p) rt_prio((p)->prio) -+#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) -+#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \ -+ (policy) == SCHED_RR) -+#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) -+ -+#define is_idle_policy(policy) ((policy) == SCHED_IDLEPRIO) -+#define idleprio_task(p) unlikely(is_idle_policy((p)->policy)) -+#define task_running_idle(p) unlikely((p)->prio == IDLE_PRIO) -+ -+#define is_iso_policy(policy) ((policy) == SCHED_ISO) -+#define iso_task(p) unlikely(is_iso_policy((p)->policy)) -+#define task_running_iso(p) unlikely((p)->prio == ISO_PRIO) -+ -+#define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT) -+ -+#define ISO_PERIOD (5 * HZ) -+ -+#define STOP_PRIO (MAX_RT_PRIO - 1) -+ -+/* -+ * Some helpers for converting to/from various scales. Use shifts to get -+ * approximate multiples of ten for less overhead. 
-+ */ -+#define APPROX_NS_PS (1073741824) /* Approximate ns per second */ -+#define JIFFIES_TO_NS(TIME) ((TIME) * (APPROX_NS_PS / HZ)) -+#define JIFFY_NS (APPROX_NS_PS / HZ) -+#define JIFFY_US (1048576 / HZ) -+#define NS_TO_JIFFIES(TIME) ((TIME) / JIFFY_NS) -+#define HALF_JIFFY_NS (APPROX_NS_PS / HZ / 2) -+#define HALF_JIFFY_US (1048576 / HZ / 2) -+#define MS_TO_NS(TIME) ((TIME) << 20) -+#define MS_TO_US(TIME) ((TIME) << 10) -+#define NS_TO_MS(TIME) ((TIME) >> 20) -+#define NS_TO_US(TIME) ((TIME) >> 10) -+#define US_TO_NS(TIME) ((TIME) << 10) -+#define TICK_APPROX_NS ((APPROX_NS_PS+HZ/2)/HZ) -+ -+#define RESCHED_US (100) /* Reschedule if less than this many μs left */ -+ -+void print_scheduler_version(void) -+{ -+ printk(KERN_INFO "MuQSS CPU scheduler v0.202 by Con Kolivas.\n"); -+} -+ -+/* Define RQ share levels */ -+#define RQSHARE_NONE 0 -+#define RQSHARE_SMT 1 -+#define RQSHARE_MC 2 -+#define RQSHARE_MC_LLC 3 -+#define RQSHARE_SMP 4 -+#define RQSHARE_ALL 5 -+ -+/* Define locality levels */ -+#define LOCALITY_SAME 0 -+#define LOCALITY_SMT 1 -+#define LOCALITY_MC_LLC 2 -+#define LOCALITY_MC 3 -+#define LOCALITY_SMP 4 -+#define LOCALITY_DISTANT 5 -+ -+/* -+ * This determines what level of runqueue sharing will be done and is -+ * configurable at boot time with the bootparam rqshare = -+ */ -+static int rqshare __read_mostly = CONFIG_SHARERQ; /* Default RQSHARE_MC */ -+ -+static int __init set_rqshare(char *str) -+{ -+ if (!strncmp(str, "none", 4)) { -+ rqshare = RQSHARE_NONE; -+ return 0; -+ } -+ if (!strncmp(str, "smt", 3)) { -+ rqshare = RQSHARE_SMT; -+ return 0; -+ } -+ if (!strncmp(str, "mc", 2)) { -+ rqshare = RQSHARE_MC; -+ return 0; -+ } -+ if (!strncmp(str, "llc", 3)) { -+ rqshare = RQSHARE_MC_LLC; -+ return 0; -+ } -+ if (!strncmp(str, "smp", 3)) { -+ rqshare = RQSHARE_SMP; -+ return 0; -+ } -+ if (!strncmp(str, "all", 3)) { -+ rqshare = RQSHARE_ALL; -+ return 0; -+ } -+ return 1; -+} -+__setup("rqshare=", set_rqshare); -+ -+/* -+ * This is the time all 
tasks within the same priority round robin. -+ * Value is in ms and set to a minimum of 6ms. -+ * Tunable via /proc interface. -+ */ -+int rr_interval __read_mostly = 6; -+ -+/* -+ * Tunable to choose whether to prioritise latency or throughput, simple -+ * binary yes or no -+ */ -+int sched_interactive __read_mostly = 1; -+ -+/* -+ * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks -+ * are allowed to run five seconds as real time tasks. This is the total over -+ * all online cpus. -+ */ -+int sched_iso_cpu __read_mostly = 70; -+ -+/* -+ * sched_yield_type - Choose what sort of yield sched_yield will perform. -+ * 0: No yield. -+ * 1: Yield only to better priority/deadline tasks. (default) -+ * 2: Expire timeslice and recalculate deadline. -+ */ -+int sched_yield_type __read_mostly = 1; -+ -+/* -+ * The relative length of deadline for each priority(nice) level. -+ */ -+static int prio_ratios[NICE_WIDTH] __read_mostly; -+ -+ -+/* -+ * The quota handed out to tasks of all priority levels when refilling their -+ * time_slice. -+ */ -+static inline int timeslice(void) -+{ -+ return MS_TO_US(rr_interval); -+} -+ -+DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -+ -+#ifdef CONFIG_SMP -+/* -+ * Total number of runqueues. Equals number of CPUs when there is no runqueue -+ * sharing but is usually less with SMT/MC sharing of runqueues. -+ */ -+static int total_runqueues __read_mostly = 1; -+ -+static cpumask_t cpu_idle_map ____cacheline_aligned_in_smp; -+ -+struct rq *cpu_rq(int cpu) -+{ -+ return &per_cpu(runqueues, (cpu)); -+} -+#define cpu_curr(cpu) (cpu_rq(cpu)->curr) -+ -+/* -+ * For asym packing, by default the lower numbered cpu has higher priority. 
-+ */ -+int __weak arch_asym_cpu_priority(int cpu) -+{ -+ return -cpu; -+} -+ -+int __weak arch_sd_sibling_asym_packing(void) -+{ -+ return 0*SD_ASYM_PACKING; -+} -+ -+#ifdef CONFIG_SCHED_SMT -+DEFINE_STATIC_KEY_FALSE(sched_smt_present); -+EXPORT_SYMBOL_GPL(sched_smt_present); -+#endif -+ -+#else -+struct rq *uprq; -+#endif /* CONFIG_SMP */ -+ -+#include "stats.h" -+ -+/* -+ * All common locking functions performed on rq->lock. rq->clock is local to -+ * the CPU accessing it so it can be modified just with interrupts disabled -+ * when we're not updating niffies. -+ * Looking up task_rq must be done under rq->lock to be safe. -+ */ -+ -+/* -+ * RQ-clock updating methods: -+ */ -+ -+#ifdef HAVE_SCHED_AVG_IRQ -+static void update_irq_load_avg(struct rq *rq, long delta); -+#else -+static inline void update_irq_load_avg(struct rq *rq, long delta) {} -+#endif -+ -+static void update_rq_clock_task(struct rq *rq, s64 delta) -+{ -+/* -+ * In theory, the compile should just see 0 here, and optimize out the call -+ * to sched_rt_avg_update. But I don't trust it... -+ */ -+ s64 __maybe_unused steal = 0, irq_delta = 0; -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; -+ -+ /* -+ * Since irq_time is only updated on {soft,}irq_exit, we might run into -+ * this case when a previous update_rq_clock() happened inside a -+ * {soft,}irq region. -+ * -+ * When this happens, we stop ->clock_task and only update the -+ * prev_irq_time stamp to account for the part that fit, so that a next -+ * update will consume the rest. This ensures ->clock_task is -+ * monotonic. -+ * -+ * It does however cause some slight miss-attribution of {soft,}irq -+ * time, a more accurate solution would be to update the irq_time using -+ * the current rq->clock timestamp, except that would require using -+ * atomic ops. 
-+ */ -+ if (irq_delta > delta) -+ irq_delta = delta; -+ -+ rq->prev_irq_time += irq_delta; -+ delta -= irq_delta; -+#endif -+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING -+ if (static_key_false((¶virt_steal_rq_enabled))) { -+ steal = paravirt_steal_clock(cpu_of(rq)); -+ steal -= rq->prev_steal_time_rq; -+ -+ if (unlikely(steal > delta)) -+ steal = delta; -+ -+ rq->prev_steal_time_rq += steal; -+ delta -= steal; -+ } -+#endif -+ rq->clock_task += delta; -+ -+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ -+ if (irq_delta + steal) -+ update_irq_load_avg(rq, irq_delta + steal); -+#endif -+} -+ -+static inline void update_rq_clock(struct rq *rq) -+{ -+ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; -+ -+ if (unlikely(delta < 0)) -+ return; -+ rq->clock += delta; -+ update_rq_clock_task(rq, delta); -+} -+ -+/* -+ * Niffies are a globally increasing nanosecond counter. They're only used by -+ * update_load_avg and time_slice_expired, however deadlines are based on them -+ * across CPUs. Update them whenever we will call one of those functions, and -+ * synchronise them across CPUs whenever we hold both runqueue locks. -+ */ -+static inline void update_clocks(struct rq *rq) -+{ -+ s64 ndiff, minndiff; -+ long jdiff; -+ -+ update_rq_clock(rq); -+ ndiff = rq->clock - rq->old_clock; -+ rq->old_clock = rq->clock; -+ jdiff = jiffies - rq->last_jiffy; -+ -+ /* Subtract any niffies added by balancing with other rqs */ -+ ndiff -= rq->niffies - rq->last_niffy; -+ minndiff = JIFFIES_TO_NS(jdiff) - rq->niffies + rq->last_jiffy_niffies; -+ if (minndiff < 0) -+ minndiff = 0; -+ ndiff = max(ndiff, minndiff); -+ rq->niffies += ndiff; -+ rq->last_niffy = rq->niffies; -+ if (jdiff) { -+ rq->last_jiffy += jdiff; -+ rq->last_jiffy_niffies = rq->niffies; -+ } -+} -+ -+/* -+ * Any time we have two runqueues locked we use that as an opportunity to -+ * synchronise niffies to the highest value as idle ticks may have artificially -+ * kept niffies low on one CPU and the truth can only be later. 
-+ */ -+static inline void synchronise_niffies(struct rq *rq1, struct rq *rq2) -+{ -+ if (rq1->niffies > rq2->niffies) -+ rq2->niffies = rq1->niffies; -+ else -+ rq1->niffies = rq2->niffies; -+} -+ -+/* -+ * double_rq_lock - safely lock two runqueues -+ * -+ * Note this does not disable interrupts like task_rq_lock, -+ * you need to do so manually before calling. -+ */ -+ -+/* For when we know rq1 != rq2 */ -+static inline void __double_rq_lock(struct rq *rq1, struct rq *rq2) -+ __acquires(rq1->lock) -+ __acquires(rq2->lock) -+{ -+ if (rq1 < rq2) { -+ raw_spin_lock(rq1->lock); -+ raw_spin_lock_nested(rq2->lock, SINGLE_DEPTH_NESTING); -+ } else { -+ raw_spin_lock(rq2->lock); -+ raw_spin_lock_nested(rq1->lock, SINGLE_DEPTH_NESTING); -+ } -+} -+ -+static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) -+ __acquires(rq1->lock) -+ __acquires(rq2->lock) -+{ -+ BUG_ON(!irqs_disabled()); -+ if (rq1->lock == rq2->lock) { -+ raw_spin_lock(rq1->lock); -+ __acquire(rq2->lock); /* Fake it out ;) */ -+ } else -+ __double_rq_lock(rq1, rq2); -+ synchronise_niffies(rq1, rq2); -+} -+ -+/* -+ * double_rq_unlock - safely unlock two runqueues -+ * -+ * Note this does not restore interrupts like task_rq_unlock, -+ * you need to do so manually after calling. 
-+ */ -+static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) -+ __releases(rq1->lock) -+ __releases(rq2->lock) -+{ -+ raw_spin_unlock(rq1->lock); -+ if (rq1->lock != rq2->lock) -+ raw_spin_unlock(rq2->lock); -+ else -+ __release(rq2->lock); -+} -+ -+static inline void lock_all_rqs(void) -+{ -+ int cpu; -+ -+ preempt_disable(); -+ for_each_possible_cpu(cpu) { -+ struct rq *rq = cpu_rq(cpu); -+ -+ do_raw_spin_lock(rq->lock); -+ } -+} -+ -+static inline void unlock_all_rqs(void) -+{ -+ int cpu; -+ -+ for_each_possible_cpu(cpu) { -+ struct rq *rq = cpu_rq(cpu); -+ -+ do_raw_spin_unlock(rq->lock); -+ } -+ preempt_enable(); -+} -+ -+/* Specially nest trylock an rq */ -+static inline bool trylock_rq(struct rq *this_rq, struct rq *rq) -+{ -+ if (unlikely(!do_raw_spin_trylock(rq->lock))) -+ return false; -+ spin_acquire(&rq->lock->dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); -+ synchronise_niffies(this_rq, rq); -+ return true; -+} -+ -+/* Unlock a specially nested trylocked rq */ -+static inline void unlock_rq(struct rq *rq) -+{ -+ spin_release(&rq->lock->dep_map, _RET_IP_); -+ do_raw_spin_unlock(rq->lock); -+} -+ -+/* -+ * cmpxchg based fetch_or, macro so it works for different integer types -+ */ -+#define fetch_or(ptr, mask) \ -+ ({ \ -+ typeof(ptr) _ptr = (ptr); \ -+ typeof(mask) _mask = (mask); \ -+ typeof(*_ptr) _old, _val = *_ptr; \ -+ \ -+ for (;;) { \ -+ _old = cmpxchg(_ptr, _val, _val | _mask); \ -+ if (_old == _val) \ -+ break; \ -+ _val = _old; \ -+ } \ -+ _old; \ -+}) -+ -+#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) -+/* -+ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, -+ * this avoids any races wrt polling state changes and thereby avoids -+ * spurious IPIs. 
-+ */ -+static bool set_nr_and_not_polling(struct task_struct *p) -+{ -+ struct thread_info *ti = task_thread_info(p); -+ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); -+} -+ -+/* -+ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. -+ * -+ * If this returns true, then the idle task promises to call -+ * sched_ttwu_pending() and reschedule soon. -+ */ -+static bool set_nr_if_polling(struct task_struct *p) -+{ -+ struct thread_info *ti = task_thread_info(p); -+ typeof(ti->flags) old, val = READ_ONCE(ti->flags); -+ -+ for (;;) { -+ if (!(val & _TIF_POLLING_NRFLAG)) -+ return false; -+ if (val & _TIF_NEED_RESCHED) -+ return true; -+ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); -+ if (old == val) -+ break; -+ val = old; -+ } -+ return true; -+} -+ -+#else -+static bool set_nr_and_not_polling(struct task_struct *p) -+{ -+ set_tsk_need_resched(p); -+ return true; -+} -+ -+#ifdef CONFIG_SMP -+static bool set_nr_if_polling(struct task_struct *p) -+{ -+ return false; -+} -+#endif -+#endif -+ -+static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) -+{ -+ struct wake_q_node *node = &task->wake_q; -+ -+ /* -+ * Atomically grab the task, if ->wake_q is !nil already it means -+ * its already queued (either by us or someone else) and will get the -+ * wakeup due to that. -+ * -+ * In order to ensure that a pending wakeup will observe our pending -+ * state, even in the failed case, an explicit smp_mb() must be used. -+ */ -+ smp_mb__before_atomic(); -+ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) -+ return false; -+ -+ /* -+ * The head is context local, there can be no concurrency. -+ */ -+ *head->lastp = node; -+ head->lastp = &node->next; -+ return true; -+} -+ -+/** -+ * wake_q_add() - queue a wakeup for 'later' waking. 
-+ * @head: the wake_q_head to add @task to -+ * @task: the task to queue for 'later' wakeup -+ * -+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the -+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come -+ * instantly. -+ * -+ * This function must be used as-if it were wake_up_process(); IOW the task -+ * must be ready to be woken at this location. -+ */ -+void wake_q_add(struct wake_q_head *head, struct task_struct *task) -+{ -+ if (__wake_q_add(head, task)) -+ get_task_struct(task); -+} -+ -+/** -+ * wake_q_add_safe() - safely queue a wakeup for 'later' waking. -+ * @head: the wake_q_head to add @task to -+ * @task: the task to queue for 'later' wakeup -+ * -+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the -+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come -+ * instantly. -+ * -+ * This function must be used as-if it were wake_up_process(); IOW the task -+ * must be ready to be woken at this location. -+ * -+ * This function is essentially a task-safe equivalent to wake_q_add(). Callers -+ * that already hold reference to @task can call the 'safe' version and trust -+ * wake_q to do the right thing depending whether or not the @task is already -+ * queued for wakeup. -+ */ -+void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) -+{ -+ if (!__wake_q_add(head, task)) -+ put_task_struct(task); -+} -+ -+void wake_up_q(struct wake_q_head *head) -+{ -+ struct wake_q_node *node = head->first; -+ -+ while (node != WAKE_Q_TAIL) { -+ struct task_struct *task; -+ -+ task = container_of(node, struct task_struct, wake_q); -+ BUG_ON(!task); -+ /* Task can safely be re-inserted now */ -+ node = node->next; -+ task->wake_q.next = NULL; -+ -+ /* -+ * wake_up_process() executes a full barrier, which pairs with -+ * the queueing in wake_q_add() so as not to miss wakeups. 
-+ */ -+ wake_up_process(task); -+ put_task_struct(task); -+ } -+} -+ -+static inline void smp_sched_reschedule(int cpu) -+{ -+ if (likely(cpu_online(cpu))) -+ smp_send_reschedule(cpu); -+} -+ -+/* -+ * resched_task - mark a task 'to be rescheduled now'. -+ * -+ * On UP this means the setting of the need_resched flag, on SMP it -+ * might also involve a cross-CPU call to trigger the scheduler on -+ * the target CPU. -+ */ -+void resched_task(struct task_struct *p) -+{ -+ int cpu; -+#ifdef CONFIG_LOCKDEP -+ /* Kernel threads call this when creating workqueues while still -+ * inactive from __kthread_bind_mask, holding only the pi_lock */ -+ if (!(p->flags & PF_KTHREAD)) { -+ struct rq *rq = task_rq(p); -+ -+ lockdep_assert_held(rq->lock); -+ } -+#endif -+ if (test_tsk_need_resched(p)) -+ return; -+ -+ cpu = task_cpu(p); -+ if (cpu == smp_processor_id()) { -+ set_tsk_need_resched(p); -+ set_preempt_need_resched(); -+ return; -+ } -+ -+ if (set_nr_and_not_polling(p)) -+ smp_sched_reschedule(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); -+} -+ -+/* -+ * A task that is not running or queued will not have a node set. -+ * A task that is queued but not running will have a node set. -+ * A task that is currently running will have ->on_cpu set but no node set. -+ */ -+static inline bool task_queued(struct task_struct *p) -+{ -+ return !skiplist_node_empty(&p->node); -+} -+ -+static void enqueue_task(struct rq *rq, struct task_struct *p, int flags); -+static inline void resched_if_idle(struct rq *rq); -+ -+static inline bool deadline_before(u64 deadline, u64 time) -+{ -+ return (deadline < time); -+} -+ -+/* -+ * Deadline is "now" in niffies + (offset by priority). Setting the deadline -+ * is the key to everything. 
It distributes cpu fairly amongst tasks of the -+ * same nice value, it proportions cpu according to nice level, it means the -+ * task that last woke up the longest ago has the earliest deadline, thus -+ * ensuring that interactive tasks get low latency on wake up. The CPU -+ * proportion works out to the square of the virtual deadline difference, so -+ * this equation will give nice 19 3% CPU compared to nice 0. -+ */ -+static inline u64 prio_deadline_diff(int user_prio) -+{ -+ return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128)); -+} -+ -+static inline u64 task_deadline_diff(struct task_struct *p) -+{ -+ return prio_deadline_diff(TASK_USER_PRIO(p)); -+} -+ -+static inline u64 static_deadline_diff(int static_prio) -+{ -+ return prio_deadline_diff(USER_PRIO(static_prio)); -+} -+ -+static inline int longest_deadline_diff(void) -+{ -+ return prio_deadline_diff(39); -+} -+ -+static inline int ms_longest_deadline_diff(void) -+{ -+ return NS_TO_MS(longest_deadline_diff()); -+} -+ -+static inline bool rq_local(struct rq *rq); -+ -+#ifndef SCHED_CAPACITY_SCALE -+#define SCHED_CAPACITY_SCALE 1024 -+#endif -+ -+static inline int rq_load(struct rq *rq) -+{ -+ return rq->nr_running; -+} -+ -+/* -+ * Update the load average for feeding into cpu frequency governors. Use a -+ * rough estimate of a rolling average with ~ time constant of 32ms. -+ * 80/128 ~ 0.63. * 80 / 32768 / 128 == * 5 / 262144 -+ * Make sure a call to update_clocks has been made before calling this to get -+ * an updated rq->niffies. 
-+ */ -+static void update_load_avg(struct rq *rq, unsigned int flags) -+{ -+ long us_interval, load; -+ -+ us_interval = NS_TO_US(rq->niffies - rq->load_update); -+ if (unlikely(us_interval <= 0)) -+ return; -+ -+ load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144); -+ if (unlikely(load < 0)) -+ load = 0; -+ load += rq_load(rq) * SCHED_CAPACITY_SCALE * us_interval * 5 / 262144; -+ rq->load_avg = load; -+ -+ rq->load_update = rq->niffies; -+ update_irq_load_avg(rq, 0); -+ if (likely(rq_local(rq))) -+ cpufreq_trigger(rq, flags); -+} -+ -+#ifdef HAVE_SCHED_AVG_IRQ -+/* -+ * IRQ variant of update_load_avg below. delta is actually time in nanoseconds -+ * here so we scale curload to how long it's been since the last update. -+ */ -+static void update_irq_load_avg(struct rq *rq, long delta) -+{ -+ long us_interval, load; -+ -+ us_interval = NS_TO_US(rq->niffies - rq->irq_load_update); -+ if (unlikely(us_interval <= 0)) -+ return; -+ -+ load = rq->irq_load_avg - (rq->irq_load_avg * us_interval * 5 / 262144); -+ if (unlikely(load < 0)) -+ load = 0; -+ load += NS_TO_US(delta) * SCHED_CAPACITY_SCALE * 5 / 262144; -+ rq->irq_load_avg = load; -+ -+ rq->irq_load_update = rq->niffies; -+} -+#endif -+ -+/* -+ * Removing from the runqueue. Enter with rq locked. Deleting a task -+ * from the skip list is done via the stored node reference in the task struct -+ * and does not require a full look up. Thus it occurs in O(k) time where k -+ * is the "level" of the list the task was stored at - usually < 4, max 8. 
-+ */ -+static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) -+{ -+ skiplist_delete(rq->sl, &p->node); -+ rq->best_key = rq->node->next[0]->key; -+ update_clocks(rq); -+ -+ if (!(flags & DEQUEUE_SAVE)) { -+ sched_info_dequeued(rq, p); -+ psi_dequeue(p, flags & DEQUEUE_SLEEP); -+ } -+ rq->nr_running--; -+ if (rt_task(p)) -+ rq->rt_nr_running--; -+ update_load_avg(rq, flags); -+} -+ -+#ifdef CONFIG_PREEMPT_RCU -+static bool rcu_read_critical(struct task_struct *p) -+{ -+ return p->rcu_read_unlock_special.b.blocked; -+} -+#else /* CONFIG_PREEMPT_RCU */ -+#define rcu_read_critical(p) (false) -+#endif /* CONFIG_PREEMPT_RCU */ -+ -+/* -+ * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as -+ * an idle task, we ensure none of the following conditions are met. -+ */ -+static bool idleprio_suitable(struct task_struct *p) -+{ -+ return (!(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING)) && -+ !signal_pending(p) && !rcu_read_critical(p) && !freezing(p)); -+} -+ -+/* -+ * To determine if a task of SCHED_ISO can run in pseudo-realtime, we check -+ * that the iso_refractory flag is not set. -+ */ -+static inline bool isoprio_suitable(struct rq *rq) -+{ -+ return !rq->iso_refractory; -+} -+ -+/* -+ * Adding to the runqueue. Enter with rq locked. -+ */ -+static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) -+{ -+ unsigned int randseed, cflags = 0; -+ u64 sl_id; -+ -+ if (!rt_task(p)) { -+ /* Check it hasn't gotten rt from PI */ -+ if ((idleprio_task(p) && idleprio_suitable(p)) || -+ (iso_task(p) && isoprio_suitable(rq))) -+ p->prio = p->normal_prio; -+ else -+ p->prio = NORMAL_PRIO; -+ } else -+ rq->rt_nr_running++; -+ /* -+ * The sl_id key passed to the skiplist generates a sorted list. -+ * Realtime and sched iso tasks run FIFO so they only need be sorted -+ * according to priority. The skiplist will put tasks of the same -+ * key inserted later in FIFO order. 
Tasks of sched normal, batch -+ * and idleprio are sorted according to their deadlines. Idleprio -+ * tasks are offset by an impossibly large deadline value ensuring -+ * they get sorted into last positions, but still according to their -+ * own deadlines. This creates a "landscape" of skiplists running -+ * from priority 0 realtime in first place to the lowest priority -+ * idleprio tasks last. Skiplist insertion is an O(log n) process. -+ */ -+ if (p->prio <= ISO_PRIO) { -+ sl_id = p->prio; -+ } else { -+ sl_id = p->deadline; -+ if (idleprio_task(p)) { -+ if (p->prio == IDLE_PRIO) -+ sl_id |= 0xF000000000000000; -+ else -+ sl_id += longest_deadline_diff(); -+ } -+ } -+ /* -+ * Some architectures don't have better than microsecond resolution -+ * so mask out ~microseconds as the random seed for skiplist insertion. -+ */ -+ update_clocks(rq); -+ if (!(flags & ENQUEUE_RESTORE)) { -+ sched_info_queued(rq, p); -+ psi_enqueue(p, flags & ENQUEUE_WAKEUP); -+ } -+ -+ randseed = (rq->niffies >> 10) & 0xFFFFFFFF; -+ skiplist_insert(rq->sl, &p->node, sl_id, p, randseed); -+ rq->best_key = rq->node->next[0]->key; -+ if (p->in_iowait) -+ cflags |= SCHED_CPUFREQ_IOWAIT; -+ rq->nr_running++; -+ update_load_avg(rq, cflags); -+} -+ -+/* -+ * Returns the relative length of deadline all compared to the shortest -+ * deadline which is that of nice -20. -+ */ -+static inline int task_prio_ratio(struct task_struct *p) -+{ -+ return prio_ratios[TASK_USER_PRIO(p)]; -+} -+ -+/* -+ * task_timeslice - all tasks of all priorities get the exact same timeslice -+ * length. CPU distribution is handled by giving different deadlines to -+ * tasks of different priorities. Use 128 as the base value for fast shifts. 
-+ */ -+static inline int task_timeslice(struct task_struct *p) -+{ -+ return (rr_interval * task_prio_ratio(p) / 128); -+} -+ -+#ifdef CONFIG_SMP -+/* Entered with rq locked */ -+static inline void resched_if_idle(struct rq *rq) -+{ -+ if (rq_idle(rq)) -+ resched_task(rq->curr); -+} -+ -+static inline bool rq_local(struct rq *rq) -+{ -+ return (rq->cpu == smp_processor_id()); -+} -+#ifdef CONFIG_SMT_NICE -+static const cpumask_t *thread_cpumask(int cpu); -+ -+/* Find the best real time priority running on any SMT siblings of cpu and if -+ * none are running, the static priority of the best deadline task running. -+ * The lookups to the other runqueues is done lockless as the occasional wrong -+ * value would be harmless. */ -+static int best_smt_bias(struct rq *this_rq) -+{ -+ int other_cpu, best_bias = 0; -+ -+ for_each_cpu(other_cpu, &this_rq->thread_mask) { -+ struct rq *rq = cpu_rq(other_cpu); -+ -+ if (rq_idle(rq)) -+ continue; -+ if (unlikely(!rq->online)) -+ continue; -+ if (!rq->rq_mm) -+ continue; -+ if (likely(rq->rq_smt_bias > best_bias)) -+ best_bias = rq->rq_smt_bias; -+ } -+ return best_bias; -+} -+ -+static int task_prio_bias(struct task_struct *p) -+{ -+ if (rt_task(p)) -+ return 1 << 30; -+ else if (task_running_iso(p)) -+ return 1 << 29; -+ else if (task_running_idle(p)) -+ return 0; -+ return MAX_PRIO - p->static_prio; -+} -+ -+static bool smt_always_schedule(struct task_struct __maybe_unused *p, struct rq __maybe_unused *this_rq) -+{ -+ return true; -+} -+ -+static bool (*smt_schedule)(struct task_struct *p, struct rq *this_rq) = &smt_always_schedule; -+ -+/* We've already decided p can run on CPU, now test if it shouldn't for SMT -+ * nice reasons. 
*/ -+static bool smt_should_schedule(struct task_struct *p, struct rq *this_rq) -+{ -+ int best_bias, task_bias; -+ -+ /* Kernel threads always run */ -+ if (unlikely(!p->mm)) -+ return true; -+ if (rt_task(p)) -+ return true; -+ if (!idleprio_suitable(p)) -+ return true; -+ best_bias = best_smt_bias(this_rq); -+ /* The smt siblings are all idle or running IDLEPRIO */ -+ if (best_bias < 1) -+ return true; -+ task_bias = task_prio_bias(p); -+ if (task_bias < 1) -+ return false; -+ if (task_bias >= best_bias) -+ return true; -+ /* Dither 25% cpu of normal tasks regardless of nice difference */ -+ if (best_bias % 4 == 1) -+ return true; -+ /* Sorry, you lose */ -+ return false; -+} -+#else /* CONFIG_SMT_NICE */ -+#define smt_schedule(p, this_rq) (true) -+#endif /* CONFIG_SMT_NICE */ -+ -+static inline void atomic_set_cpu(int cpu, cpumask_t *cpumask) -+{ -+ set_bit(cpu, (volatile unsigned long *)cpumask); -+} -+ -+/* -+ * The cpu_idle_map stores a bitmap of all the CPUs currently idle to -+ * allow easy lookup of whether any suitable idle CPUs are available. -+ * It's cheaper to maintain a binary yes/no if there are any idle CPUs on the -+ * idle_cpus variable than to do a full bitmask check when we are busy. The -+ * bits are set atomically but read locklessly as occasional false positive / -+ * negative is harmless. -+ */ -+static inline void set_cpuidle_map(int cpu) -+{ -+ if (likely(cpu_online(cpu))) -+ atomic_set_cpu(cpu, &cpu_idle_map); -+} -+ -+static inline void atomic_clear_cpu(int cpu, cpumask_t *cpumask) -+{ -+ clear_bit(cpu, (volatile unsigned long *)cpumask); -+} -+ -+static inline void clear_cpuidle_map(int cpu) -+{ -+ atomic_clear_cpu(cpu, &cpu_idle_map); -+} -+ -+static bool suitable_idle_cpus(struct task_struct *p) -+{ -+ return (cpumask_intersects(p->cpus_ptr, &cpu_idle_map)); -+} -+ -+/* -+ * Resched current on rq. 
We don't know if rq is local to this CPU nor if it -+ * is locked so we do not use an intermediate variable for the task to avoid -+ * having it dereferenced. -+ */ -+static void resched_curr(struct rq *rq) -+{ -+ int cpu; -+ -+ if (test_tsk_need_resched(rq->curr)) -+ return; -+ -+ rq->preempt = rq->curr; -+ cpu = rq->cpu; -+ -+ /* We're doing this without holding the rq lock if it's not task_rq */ -+ -+ if (cpu == smp_processor_id()) { -+ set_tsk_need_resched(rq->curr); -+ set_preempt_need_resched(); -+ return; -+ } -+ -+ if (set_nr_and_not_polling(rq->curr)) -+ smp_sched_reschedule(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); -+} -+ -+#define CPUIDLE_DIFF_THREAD (1) -+#define CPUIDLE_DIFF_CORE_LLC (2) -+#define CPUIDLE_DIFF_CORE (4) -+#define CPUIDLE_CACHE_BUSY (8) -+#define CPUIDLE_DIFF_CPU (16) -+#define CPUIDLE_THREAD_BUSY (32) -+#define CPUIDLE_DIFF_NODE (64) -+ -+/* -+ * The best idle CPU is chosen according to the CPUIDLE ranking above where the -+ * lowest value would give the most suitable CPU to schedule p onto next. The -+ * order works out to be the following: -+ * -+ * Same thread, idle or busy cache, idle or busy threads -+ * Other core, same cache, idle or busy cache, idle threads. -+ * Same node, other CPU, idle cache, idle threads. -+ * Same node, other CPU, busy cache, idle threads. -+ * Other core, same cache, busy threads. -+ * Same node, other CPU, busy threads. -+ * Other node, other CPU, idle cache, idle threads. -+ * Other node, other CPU, busy cache, idle threads. -+ * Other node, other CPU, busy threads. 
-+ */ -+static int best_mask_cpu(int best_cpu, struct rq *rq, cpumask_t *tmpmask) -+{ -+ int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THREAD_BUSY | -+ CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY | CPUIDLE_DIFF_CORE | -+ CPUIDLE_DIFF_CORE_LLC | CPUIDLE_DIFF_THREAD; -+ int cpu_tmp; -+ -+ if (cpumask_test_cpu(best_cpu, tmpmask)) -+ goto out; -+ -+ for_each_cpu(cpu_tmp, tmpmask) { -+ int ranking, locality; -+ struct rq *tmp_rq; -+ -+ ranking = 0; -+ tmp_rq = cpu_rq(cpu_tmp); -+ -+ locality = rq->cpu_locality[cpu_tmp]; -+#ifdef CONFIG_NUMA -+ if (locality > LOCALITY_SMP) -+ ranking |= CPUIDLE_DIFF_NODE; -+ else -+#endif -+ if (locality > LOCALITY_MC) -+ ranking |= CPUIDLE_DIFF_CPU; -+#ifdef CONFIG_SCHED_MC -+ else if (locality == LOCALITY_MC_LLC) -+ ranking |= CPUIDLE_DIFF_CORE_LLC; -+ else if (locality == LOCALITY_MC) -+ ranking |= CPUIDLE_DIFF_CORE; -+ if (!(tmp_rq->cache_idle(tmp_rq))) -+ ranking |= CPUIDLE_CACHE_BUSY; -+#endif -+#ifdef CONFIG_SCHED_SMT -+ if (locality == LOCALITY_SMT) -+ ranking |= CPUIDLE_DIFF_THREAD; -+#endif -+ if (ranking < best_ranking -+#ifdef CONFIG_SCHED_SMT -+ || (ranking == best_ranking && (tmp_rq->siblings_idle(tmp_rq))) -+#endif -+ ) { -+ best_cpu = cpu_tmp; -+ best_ranking = ranking; -+ } -+ } -+out: -+ return best_cpu; -+} -+ -+bool cpus_share_cache(int this_cpu, int that_cpu) -+{ -+ struct rq *this_rq = cpu_rq(this_cpu); -+ -+ return (this_rq->cpu_locality[that_cpu] < LOCALITY_SMP); -+} -+ -+/* As per resched_curr but only will resched idle task */ -+static inline void resched_idle(struct rq *rq) -+{ -+ if (test_tsk_need_resched(rq->idle)) -+ return; -+ -+ rq->preempt = rq->idle; -+ -+ set_tsk_need_resched(rq->idle); -+ -+ if (rq_local(rq)) { -+ set_preempt_need_resched(); -+ return; -+ } -+ -+ smp_sched_reschedule(rq->cpu); -+} -+ -+DEFINE_PER_CPU(cpumask_t, idlemask); -+ -+static struct rq *resched_best_idle(struct task_struct *p, int cpu) -+{ -+ cpumask_t *tmpmask = &(per_cpu(idlemask, cpu)); -+ struct rq *rq; -+ int best_cpu; -+ -+ 
cpumask_and(tmpmask, p->cpus_ptr, &cpu_idle_map); -+ best_cpu = best_mask_cpu(cpu, task_rq(p), tmpmask); -+ rq = cpu_rq(best_cpu); -+ if (!smt_schedule(p, rq)) -+ return NULL; -+ rq->preempt = p; -+ resched_idle(rq); -+ return rq; -+} -+ -+static inline void resched_suitable_idle(struct task_struct *p) -+{ -+ if (suitable_idle_cpus(p)) -+ resched_best_idle(p, task_cpu(p)); -+} -+ -+static inline struct rq *rq_order(struct rq *rq, int cpu) -+{ -+ return rq->rq_order[cpu]; -+} -+#else /* CONFIG_SMP */ -+static inline void set_cpuidle_map(int cpu) -+{ -+} -+ -+static inline void clear_cpuidle_map(int cpu) -+{ -+} -+ -+static inline bool suitable_idle_cpus(struct task_struct *p) -+{ -+ return uprq->curr == uprq->idle; -+} -+ -+static inline void resched_suitable_idle(struct task_struct *p) -+{ -+} -+ -+static inline void resched_curr(struct rq *rq) -+{ -+ resched_task(rq->curr); -+} -+ -+static inline void resched_if_idle(struct rq *rq) -+{ -+} -+ -+static inline bool rq_local(struct rq *rq) -+{ -+ return true; -+} -+ -+static inline struct rq *rq_order(struct rq *rq, int cpu) -+{ -+ return rq; -+} -+ -+static inline bool smt_schedule(struct task_struct *p, struct rq *rq) -+{ -+ return true; -+} -+#endif /* CONFIG_SMP */ -+ -+static inline int normal_prio(struct task_struct *p) -+{ -+ if (has_rt_policy(p)) -+ return MAX_RT_PRIO - 1 - p->rt_priority; -+ if (idleprio_task(p)) -+ return IDLE_PRIO; -+ if (iso_task(p)) -+ return ISO_PRIO; -+ return NORMAL_PRIO; -+} -+ -+/* -+ * Calculate the current priority, i.e. the priority -+ * taken into account by the scheduler. This value might -+ * be boosted by RT tasks as it will be RT if the task got -+ * RT-boosted. If not then it returns p->normal_prio. -+ */ -+static int effective_prio(struct task_struct *p) -+{ -+ p->normal_prio = normal_prio(p); -+ /* -+ * If we are RT tasks or we were boosted to RT priority, -+ * keep the priority unchanged. 
Otherwise, update priority -+ * to the normal priority: -+ */ -+ if (!rt_prio(p->prio)) -+ return p->normal_prio; -+ return p->prio; -+} -+ -+/* -+ * activate_task - move a task to the runqueue. Enter with rq locked. -+ */ -+static void activate_task(struct rq *rq, struct task_struct *p, int flags) -+{ -+ resched_if_idle(rq); -+ -+ /* -+ * Sleep time is in units of nanosecs, so shift by 20 to get a -+ * milliseconds-range estimation of the amount of time that the task -+ * spent sleeping: -+ */ -+ if (unlikely(prof_on == SLEEP_PROFILING)) { -+ if (p->state == TASK_UNINTERRUPTIBLE) -+ profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), -+ (rq->niffies - p->last_ran) >> 20); -+ } -+ -+ p->prio = effective_prio(p); -+ if (task_contributes_to_load(p)) -+ rq->nr_uninterruptible--; -+ -+ enqueue_task(rq, p, flags); -+ p->on_rq = TASK_ON_RQ_QUEUED; -+} -+ -+/* -+ * deactivate_task - If it's running, it's not on the runqueue and we can just -+ * decrement the nr_running. Enter with rq locked. -+ */ -+static inline void deactivate_task(struct task_struct *p, struct rq *rq) -+{ -+ if (task_contributes_to_load(p)) -+ rq->nr_uninterruptible++; -+ -+ p->on_rq = 0; -+ sched_info_dequeued(rq, p); -+ /* deactivate_task is always DEQUEUE_SLEEP in muqss */ -+ psi_dequeue(p, DEQUEUE_SLEEP); -+} -+ -+#ifdef CONFIG_SMP -+void set_task_cpu(struct task_struct *p, unsigned int new_cpu) -+{ -+ struct rq *rq; -+ -+ if (task_cpu(p) == new_cpu) -+ return; -+ -+ /* Do NOT call set_task_cpu on a currently queued task as we will not -+ * be reliably holding the rq lock after changing CPU. */ -+ BUG_ON(task_queued(p)); -+ rq = task_rq(p); -+ -+#ifdef CONFIG_LOCKDEP -+ /* -+ * The caller should hold either p->pi_lock or rq->lock, when changing -+ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. -+ * -+ * Furthermore, all task_rq users should acquire both locks, see -+ * task_rq_lock(). 
-+ */ -+ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || -+ lockdep_is_held(rq->lock))); -+#endif -+ -+ trace_sched_migrate_task(p, new_cpu); -+ rseq_migrate(p); -+ perf_event_task_migrate(p); -+ -+ /* -+ * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be -+ * successfully executed on another CPU. We must ensure that updates of -+ * per-task data have been completed by this moment. -+ */ -+ smp_wmb(); -+ -+ p->wake_cpu = new_cpu; -+ -+ if (task_running(rq, p)) { -+ /* -+ * We should only be calling this on a running task if we're -+ * holding rq lock. -+ */ -+ lockdep_assert_held(rq->lock); -+ -+ /* -+ * We can't change the task_thread_info CPU on a running task -+ * as p will still be protected by the rq lock of the CPU it -+ * is still running on so we only set the wake_cpu for it to be -+ * lazily updated once off the CPU. -+ */ -+ return; -+ } -+ -+#ifdef CONFIG_THREAD_INFO_IN_TASK -+ WRITE_ONCE(p->cpu, new_cpu); -+#else -+ WRITE_ONCE(task_thread_info(p)->cpu, new_cpu); -+#endif -+ /* We're no longer protecting p after this point since we're holding -+ * the wrong runqueue lock. */ -+} -+#endif /* CONFIG_SMP */ -+ -+/* -+ * Move a task off the runqueue and take it to a cpu for it will -+ * become the running task. -+ */ -+static inline void take_task(struct rq *rq, int cpu, struct task_struct *p) -+{ -+ struct rq *p_rq = task_rq(p); -+ -+ dequeue_task(p_rq, p, DEQUEUE_SAVE); -+ if (p_rq != rq) { -+ sched_info_dequeued(p_rq, p); -+ sched_info_queued(rq, p); -+ } -+ set_task_cpu(p, cpu); -+} -+ -+/* -+ * Returns a descheduling task to the runqueue unless it is being -+ * deactivated. -+ */ -+static inline void return_task(struct task_struct *p, struct rq *rq, -+ int cpu, bool deactivate) -+{ -+ if (deactivate) -+ deactivate_task(p, rq); -+ else { -+#ifdef CONFIG_SMP -+ /* -+ * set_task_cpu was called on the running task that doesn't -+ * want to deactivate so it has to be enqueued to a different -+ * CPU and we need its lock. 
Tag it to be moved with as the -+ * lock is dropped in finish_lock_switch. -+ */ -+ if (unlikely(p->wake_cpu != cpu)) -+ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); -+ else -+#endif -+ enqueue_task(rq, p, ENQUEUE_RESTORE); -+ } -+} -+ -+/* Enter with rq lock held. We know p is on the local cpu */ -+static inline void __set_tsk_resched(struct task_struct *p) -+{ -+ set_tsk_need_resched(p); -+ set_preempt_need_resched(); -+} -+ -+/** -+ * task_curr - is this task currently executing on a CPU? -+ * @p: the task in question. -+ * -+ * Return: 1 if the task is currently executing. 0 otherwise. -+ */ -+inline int task_curr(const struct task_struct *p) -+{ -+ return cpu_curr(task_cpu(p)) == p; -+} -+ -+#ifdef CONFIG_SMP -+/* -+ * wait_task_inactive - wait for a thread to unschedule. -+ * -+ * If @match_state is nonzero, it's the @p->state value just checked and -+ * not expected to change. If it changes, i.e. @p might have woken up, -+ * then return zero. When we succeed in waiting for @p to be off its CPU, -+ * we return a positive number (its total switch count). If a second call -+ * a short while later returns the same number, the caller can be sure that -+ * @p has remained unscheduled the whole time. -+ * -+ * The caller must ensure that the task *will* unschedule sometime soon, -+ * else this function might spin for a *long* time. This function can't -+ * be called with interrupts off, or it may introduce deadlock with -+ * smp_call_function() if an IPI is sent by the same process we are -+ * waiting to become inactive. -+ */ -+unsigned long wait_task_inactive(struct task_struct *p, long match_state) -+{ -+ int running, queued; -+ struct rq_flags rf; -+ unsigned long ncsw; -+ struct rq *rq; -+ -+ for (;;) { -+ rq = task_rq(p); -+ -+ /* -+ * If the task is actively running on another CPU -+ * still, just relax and busy-wait without holding -+ * any locks. -+ * -+ * NOTE! Since we don't hold any locks, it's not -+ * even sure that "rq" stays as the right runqueue! 
-+ * But we don't care, since this will return false -+ * if the runqueue has changed and p is actually now -+ * running somewhere else! -+ */ -+ while (task_running(rq, p)) { -+ if (match_state && unlikely(p->state != match_state)) -+ return 0; -+ cpu_relax(); -+ } -+ -+ /* -+ * Ok, time to look more closely! We need the rq -+ * lock now, to be *sure*. If we're wrong, we'll -+ * just go back and repeat. -+ */ -+ rq = task_rq_lock(p, &rf); -+ trace_sched_wait_task(p); -+ running = task_running(rq, p); -+ queued = task_on_rq_queued(p); -+ ncsw = 0; -+ if (!match_state || p->state == match_state) -+ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ -+ task_rq_unlock(rq, p, &rf); -+ -+ /* -+ * If it changed from the expected state, bail out now. -+ */ -+ if (unlikely(!ncsw)) -+ break; -+ -+ /* -+ * Was it really running after all now that we -+ * checked with the proper locks actually held? -+ * -+ * Oops. Go back and try again.. -+ */ -+ if (unlikely(running)) { -+ cpu_relax(); -+ continue; -+ } -+ -+ /* -+ * It's not enough that it's not actively running, -+ * it must be off the runqueue _entirely_, and not -+ * preempted! -+ * -+ * So if it was still runnable (but just not actively -+ * running right now), it's preempted, and we should -+ * yield - it could be a while. -+ */ -+ if (unlikely(queued)) { -+ ktime_t to = NSEC_PER_SEC / HZ; -+ -+ set_current_state(TASK_UNINTERRUPTIBLE); -+ schedule_hrtimeout(&to, HRTIMER_MODE_REL); -+ continue; -+ } -+ -+ /* -+ * Ahh, all good. It wasn't running, and it wasn't -+ * runnable, which means that it will never become -+ * running in the future either. We're all done! -+ */ -+ break; -+ } -+ -+ return ncsw; -+} -+ -+/*** -+ * kick_process - kick a running thread to enter/exit the kernel -+ * @p: the to-be-kicked thread -+ * -+ * Cause a process which is running on another CPU to enter -+ * kernel-mode, without any delay. (to get signals handled.) 
-+ * -+ * NOTE: this function doesn't have to take the runqueue lock, -+ * because all it wants to ensure is that the remote task enters -+ * the kernel. If the IPI races and the task has been migrated -+ * to another CPU then no harm is done and the purpose has been -+ * achieved as well. -+ */ -+void kick_process(struct task_struct *p) -+{ -+ int cpu; -+ -+ preempt_disable(); -+ cpu = task_cpu(p); -+ if ((cpu != smp_processor_id()) && task_curr(p)) -+ smp_sched_reschedule(cpu); -+ preempt_enable(); -+} -+EXPORT_SYMBOL_GPL(kick_process); -+#endif -+ -+/* -+ * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the -+ * basis of earlier deadlines. SCHED_IDLEPRIO don't preempt anything else or -+ * between themselves, they cooperatively multitask. An idle rq scores as -+ * prio PRIO_LIMIT so it is always preempted. -+ */ -+static inline bool -+can_preempt(struct task_struct *p, int prio, u64 deadline) -+{ -+ /* Better static priority RT task or better policy preemption */ -+ if (p->prio < prio) -+ return true; -+ if (p->prio > prio) -+ return false; -+ if (p->policy == SCHED_BATCH) -+ return false; -+ /* SCHED_NORMAL and ISO will preempt based on deadline */ -+ if (!deadline_before(p->deadline, deadline)) -+ return false; -+ return true; -+} -+ -+#ifdef CONFIG_SMP -+ -+/* -+ * Per-CPU kthreads are allowed to run on !active && online CPUs, see -+ * __set_cpus_allowed_ptr(). -+ */ -+static inline bool is_cpu_allowed(struct task_struct *p, int cpu) -+{ -+ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) -+ return false; -+ -+ if (is_per_cpu_kthread(p)) -+ return cpu_online(cpu); -+ -+ return cpu_active(cpu); -+} -+ -+/* -+ * Check to see if p can run on cpu, and if not, whether there are any online -+ * CPUs it can run on instead. This only happens with the hotplug threads that -+ * bring up the CPUs. 
-+ */ -+static inline bool sched_other_cpu(struct task_struct *p, int cpu) -+{ -+ if (likely(cpumask_test_cpu(cpu, p->cpus_ptr))) -+ return false; -+ if (p->nr_cpus_allowed == 1) { -+ cpumask_t valid_mask; -+ -+ cpumask_and(&valid_mask, p->cpus_ptr, cpu_online_mask); -+ if (unlikely(cpumask_empty(&valid_mask))) -+ return false; -+ } -+ return true; -+} -+ -+static inline bool needs_other_cpu(struct task_struct *p, int cpu) -+{ -+ if (cpumask_test_cpu(cpu, p->cpus_ptr)) -+ return false; -+ return true; -+} -+ -+#define cpu_online_map (*(cpumask_t *)cpu_online_mask) -+ -+static void try_preempt(struct task_struct *p, struct rq *this_rq) -+{ -+ int i, this_entries = rq_load(this_rq); -+ cpumask_t tmp; -+ -+ if (suitable_idle_cpus(p) && resched_best_idle(p, task_cpu(p))) -+ return; -+ -+ /* IDLEPRIO tasks never preempt anything but idle */ -+ if (p->policy == SCHED_IDLEPRIO) -+ return; -+ -+ cpumask_and(&tmp, &cpu_online_map, p->cpus_ptr); -+ -+ for (i = 0; i < num_online_cpus(); i++) { -+ struct rq *rq = this_rq->cpu_order[i]; -+ -+ if (!cpumask_test_cpu(rq->cpu, &tmp)) -+ continue; -+ -+ if (!sched_interactive && rq != this_rq && rq_load(rq) <= this_entries) -+ continue; -+ if (smt_schedule(p, rq) && can_preempt(p, rq->rq_prio, rq->rq_deadline)) { -+ /* We set rq->preempting lockless, it's a hint only */ -+ rq->preempting = p; -+ resched_curr(rq); -+ return; -+ } -+ } -+} -+ -+static int __set_cpus_allowed_ptr(struct task_struct *p, -+ const struct cpumask *new_mask, bool check); -+#else /* CONFIG_SMP */ -+static inline bool needs_other_cpu(struct task_struct *p, int cpu) -+{ -+ return false; -+} -+ -+static void try_preempt(struct task_struct *p, struct rq *this_rq) -+{ -+ if (p->policy == SCHED_IDLEPRIO) -+ return; -+ if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline)) -+ resched_curr(uprq); -+} -+ -+static inline int __set_cpus_allowed_ptr(struct task_struct *p, -+ const struct cpumask *new_mask, bool check) -+{ -+ return set_cpus_allowed_ptr(p, new_mask); -+} 
-+#endif /* CONFIG_SMP */ -+ -+static void -+ttwu_stat(struct task_struct *p, int cpu, int wake_flags) -+{ -+ struct rq *rq; -+ -+ if (!schedstat_enabled()) -+ return; -+ -+ rq = this_rq(); -+ -+#ifdef CONFIG_SMP -+ if (cpu == rq->cpu) { -+ __schedstat_inc(rq->ttwu_local); -+ } else { -+ struct sched_domain *sd; -+ -+ rcu_read_lock(); -+ for_each_domain(rq->cpu, sd) { -+ if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { -+ __schedstat_inc(sd->ttwu_wake_remote); -+ break; -+ } -+ } -+ rcu_read_unlock(); -+ } -+ -+#endif /* CONFIG_SMP */ -+ -+ __schedstat_inc(rq->ttwu_count); -+} -+ -+/* -+ * Mark the task runnable and perform wakeup-preemption. -+ */ -+static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) -+{ -+ /* -+ * Sync wakeups (i.e. those types of wakeups where the waker -+ * has indicated that it will leave the CPU in short order) -+ * don't trigger a preemption if there are no idle cpus, -+ * instead waiting for current to deschedule. -+ */ -+ if (wake_flags & WF_SYNC) -+ resched_suitable_idle(p); -+ else -+ try_preempt(p, rq); -+ p->state = TASK_RUNNING; -+ trace_sched_wakeup(p); -+} -+ -+static void -+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) -+{ -+ int en_flags = ENQUEUE_WAKEUP; -+ -+ lockdep_assert_held(rq->lock); -+ -+#ifdef CONFIG_SMP -+ if (p->sched_contributes_to_load) -+ rq->nr_uninterruptible--; -+ -+ if (wake_flags & WF_MIGRATED) -+ en_flags |= ENQUEUE_MIGRATED; -+#endif -+ -+ activate_task(rq, p, en_flags); -+ ttwu_do_wakeup(rq, p, wake_flags); -+} -+ -+/* -+ * Called in case the task @p isn't fully descheduled from its runqueue, -+ * in this case we must do a remote wakeup. Its a 'light' wakeup though, -+ * since all we need to do is flip p->state to TASK_RUNNING, since -+ * the task is still ->on_rq. 
-+ */
-+static int ttwu_remote(struct task_struct *p, int wake_flags)
-+{
-+	struct rq *rq;
-+	int ret = 0;
-+
-+	rq = __task_rq_lock(p, NULL);
-+	if (likely(task_on_rq_queued(p))) {
-+		ttwu_do_wakeup(rq, p, wake_flags);
-+		ret = 1;
-+	}
-+	__task_rq_unlock(rq, NULL);
-+
-+	return ret;
-+}
-+/* Activate, on this CPU's runqueue, every task found on its wake_list. */
-+#ifdef CONFIG_SMP
-+void sched_ttwu_pending(void)
-+{
-+	struct rq *rq = this_rq();
-+	struct llist_node *llist = llist_del_all(&rq->wake_list);
-+	struct task_struct *p, *t;
-+	struct rq_flags rf;
-+
-+	if (!llist)
-+		return;
-+
-+	rq_lock_irqsave(rq, &rf);
-+
-+	llist_for_each_entry_safe(p, t, llist, wake_entry)
-+		ttwu_do_activate(rq, p, 0);
-+
-+	rq_unlock_irqrestore(rq, &rf);
-+}
-+/* Scheduler IPI: fold need_resched, then process any pending queued wakeups. */
-+void scheduler_ipi(void)
-+{
-+	/*
-+	 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
-+	 * TIF_NEED_RESCHED remotely (for the first time) will also send
-+	 * this IPI.
-+	 */
-+	preempt_fold_need_resched();
-+
-+	if (llist_empty(&this_rq()->wake_list) && (!idle_cpu(smp_processor_id()) || need_resched()))
-+		return;
-+
-+	/*
-+	 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
-+	 * traditionally all their work was done from the interrupt return
-+	 * path. Now that we actually do some work, we need to make sure
-+	 * we do call them.
-+	 *
-+	 * Some archs already do call them, luckily irq_enter/exit nest
-+	 * properly.
-+	 *
-+	 * Arguably we should visit all archs and update all handlers,
-+	 * however a fair share of IPIs are still resched only so this would
-+	 * somewhat pessimize the simple resched case.
-+	 */
-+	irq_enter();
-+	sched_ttwu_pending();
-+	irq_exit();
-+}
-+/*
-+ * Queue @p on @cpu's wake_list. Only when llist_add() reports success is
-+ * @cpu kicked: through its polling idle task if set_nr_if_polling()
-+ * succeeds, otherwise with smp_sched_reschedule(). @wake_flags is unused.
-+ */
-+static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
-+{
-+	struct rq *rq = cpu_rq(cpu);
-+
-+	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
-+		if (!set_nr_if_polling(rq->idle))
-+			smp_sched_reschedule(cpu);
-+		else
-+			trace_sched_wake_idle_without_ipi(cpu);
-+	}
-+}
-+/* Kick @cpu if its current task is the idle task; a polling idle task needs no IPI. */
-+void wake_up_if_idle(int cpu)
-+{
-+	struct rq *rq = cpu_rq(cpu);
-+	struct rq_flags rf;
-+
-+	rcu_read_lock();
-+
-+	if (!is_idle_task(rcu_dereference(rq->curr)))
-+		goto out;
-+
-+	if (set_nr_if_polling(rq->idle)) {
-+		trace_sched_wake_idle_without_ipi(cpu);
-+	} else {
-+		rq_lock_irqsave(rq, &rf);
-+		if (likely(is_idle_task(rq->curr)))
-+			smp_sched_reschedule(cpu);
-+		/* Else cpu is not in idle, do nothing here */
-+		rq_unlock_irqrestore(rq, &rf);
-+	}
-+
-+out:
-+	rcu_read_unlock();
-+}
-+/*
-+ * Pick any CPU from @p's allowed mask, restricted to cpu_active_mask for
-+ * tasks without PF_KTHREAD and to cpu_all_mask for kernel threads. If that
-+ * intersection is empty, warn and return any CPU from p->cpus_ptr.
-+ */
-+static int valid_task_cpu(struct task_struct *p)
-+{
-+	cpumask_t valid_mask;
-+
-+	if (p->flags & PF_KTHREAD)
-+		cpumask_and(&valid_mask, p->cpus_ptr, cpu_all_mask);
-+	else
-+		cpumask_and(&valid_mask, p->cpus_ptr, cpu_active_mask);
-+
-+	if (unlikely(!cpumask_weight(&valid_mask))) {
-+		/* We shouldn't be hitting this any more */
-+		printk(KERN_WARNING "SCHED: No cpumask for %s/%d weight %d\n", p->comm,
-+		       p->pid, cpumask_weight(p->cpus_ptr));
-+		return cpumask_any(p->cpus_ptr);
-+	}
-+	return cpumask_any(&valid_mask);
-+}
-+
-+/*
-+ * For a task that's just being woken up we have a valuable balancing
-+ * opportunity so choose the nearest cache most lightly loaded runqueue.
-+ * Entered with rq locked and returns with the chosen runqueue locked.
-+ *
-+ * NOTE(review): the function body itself takes and drops no runqueue lock
-+ * and returns a CPU number; the locking sentence above looks stale - confirm.
-+ */
-+static inline int select_best_cpu(struct task_struct *p)
-+{
-+	unsigned int idlest = ~0U;
-+	struct rq *rq = NULL;
-+	int i;
-+
-+	if (suitable_idle_cpus(p)) {
-+		int cpu = task_cpu(p);
-+
-+		if (unlikely(needs_other_cpu(p, cpu)))
-+			cpu = valid_task_cpu(p);
-+		rq = resched_best_idle(p, cpu);
-+		if (likely(rq))
-+			return rq->cpu;
-+	}
-+	/* Otherwise take the online, allowed runqueue with the lowest rq_load(). */
-+	for (i = 0; i < num_online_cpus(); i++) {
-+		struct rq *other_rq = task_rq(p)->cpu_order[i];
-+		int entries;
-+
-+		if (!other_rq->online)
-+			continue;
-+		if (needs_other_cpu(p, other_rq->cpu))
-+			continue;
-+		entries = rq_load(other_rq);
-+		if (entries >= idlest)
-+			continue;
-+		idlest = entries;
-+		rq = other_rq;
-+	}
-+	if (unlikely(!rq))
-+		return task_cpu(p);	/* nothing suitable found: stay on the task's CPU */
-+	return rq->cpu;
-+}
-+#else /* CONFIG_SMP */
-+static int valid_task_cpu(struct task_struct *p)
-+{
-+	return 0;
-+}
-+/* !SMP stub: always CPU 0. */
-+static inline int select_best_cpu(struct task_struct *p)
-+{
-+	return 0;
-+}
-+/* !SMP stub: never returns an idle runqueue. */
-+static struct rq *resched_best_idle(struct task_struct *p, int cpu)
-+{
-+	return NULL;
-+}
-+#endif /* CONFIG_SMP */
-+/*
-+ * Hand the wakeup of @p to @cpu's runqueue. On SMP, when @cpu does not share
-+ * cache with the current CPU, the wakeup is queued via ttwu_queue_remote();
-+ * otherwise @p is activated directly under the runqueue lock.
-+ */
-+static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
-+{
-+	struct rq *rq = cpu_rq(cpu);
-+
-+#if defined(CONFIG_SMP)
-+	if (!cpus_share_cache(smp_processor_id(), cpu)) {
-+		sched_clock_cpu(cpu); /* Sync clocks across CPUs */
-+		ttwu_queue_remote(p, cpu, wake_flags);
-+		return;
-+	}
-+#endif
-+	rq_lock(rq);
-+	ttwu_do_activate(rq, p, wake_flags);
-+	rq_unlock(rq);
-+}
-+
-+/**
-+ * try_to_wake_up - wake up a thread
-+ * @p: the thread to be awakened
-+ * @state: the mask of task states that can be woken
-+ * @wake_flags: wake modifier flags (WF_*)
-+ *
-+ * Put it on the run-queue if it's not already there. The "current"
-+ * thread is always on the run-queue (except when the actual
-+ * re-schedule is in progress), and as such you're allowed to do
-+ * the simpler "current->state = TASK_RUNNING" to mark yourself
-+ * runnable without the overhead of this.
-+ *
-+ * Return: %true if @p was woken up, %false if it was already running.
-+ * or @state didn't match @p's state.
-+ */
-+static int
-+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
-+{
-+	unsigned long flags;
-+	int cpu, success = 0;
-+
-+	preempt_disable();
-+	if (p == current) {
-+		/*
-+		 * We're waking current, this means 'p->on_rq' and 'task_cpu(p)
-+		 * == smp_processor_id()'. Together this means we can special
-+		 * case the whole 'p->on_rq && ttwu_remote()' case below
-+		 * without taking any locks.
-+		 *
-+		 * In particular:
-+		 *  - we rely on Program-Order guarantees for all the ordering,
-+		 *  - we're serialized against set_special_state() by virtue of
-+		 *    it disabling IRQs (this allows not taking ->pi_lock).
-+		 */
-+		if (!(p->state & state))
-+			goto out;
-+
-+		success = 1;
-+		cpu = task_cpu(p);
-+		trace_sched_waking(p);
-+		p->state = TASK_RUNNING;
-+		trace_sched_wakeup(p);
-+		goto out;
-+	}
-+
-+	/*
-+	 * If we are going to wake up a thread waiting for CONDITION we
-+	 * need to ensure that CONDITION=1 done by the caller can not be
-+	 * reordered with p->state check below. This pairs with mb() in
-+	 * set_current_state() the waiting thread does.
-+	 */
-+	raw_spin_lock_irqsave(&p->pi_lock, flags);
-+	smp_mb__after_spinlock();
-+	if (!(p->state & state))
-+		goto unlock;
-+
-+	trace_sched_waking(p);
-+
-+	/* We're going to change ->state: */
-+	success = 1;
-+	cpu = task_cpu(p);
-+
-+	/*
-+	 * Ensure we load p->on_rq _after_ p->state, otherwise it would
-+	 * be possible to, falsely, observe p->on_rq == 0 and get stuck
-+	 * in smp_cond_load_acquire() below.
-+	 *
-+	 * sched_ttwu_pending()			try_to_wake_up()
-+	 *   STORE p->on_rq = 1			  LOAD p->state
-+	 *   UNLOCK rq->lock
-+	 *
-+	 * __schedule() (switch to task 'p')
-+	 *   LOCK rq->lock			  smp_rmb();
-+	 *   smp_mb__after_spinlock();
-+	 *   UNLOCK rq->lock
-+	 *
-+	 * [task p]
-+	 *   STORE p->state = UNINTERRUPTIBLE	  LOAD p->on_rq
-+	 *
-+	 * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
-+	 * __schedule().  See the comment for smp_mb__after_spinlock().
-+	 */
-+	smp_rmb();
-+	if (p->on_rq && ttwu_remote(p, wake_flags))
-+		goto unlock;
-+
-+#ifdef CONFIG_SMP
-+	/*
-+	 * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
-+	 * possible to, falsely, observe p->on_cpu == 0.
-+	 *
-+	 * One must be running (->on_cpu == 1) in order to remove oneself
-+	 * from the runqueue.
-+	 *
-+	 * __schedule() (switch to task 'p')	try_to_wake_up()
-+	 *   STORE p->on_cpu = 1		  LOAD p->on_rq
-+	 *   UNLOCK rq->lock
-+	 *
-+	 * __schedule() (put 'p' to sleep)
-+	 *   LOCK rq->lock			  smp_rmb();
-+	 *   smp_mb__after_spinlock();
-+	 *   STORE p->on_rq = 0			  LOAD p->on_cpu
-+	 *
-+	 * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
-+	 * __schedule().  See the comment for smp_mb__after_spinlock().
-+	 */
-+	smp_rmb();
-+
-+	/*
-+	 * If the owning (remote) CPU is still in the middle of schedule() with
-+	 * this task as prev, wait until it's done referencing the task.
-+	 *
-+	 * Pairs with the smp_store_release() in finish_task().
-+	 *
-+	 * This ensures that tasks getting woken will be fully ordered against
-+	 * their previous state and preserve Program Order.
-+	 */
-+	smp_cond_load_acquire(&p->on_cpu, !VAL);
-+
-+	p->sched_contributes_to_load = !!task_contributes_to_load(p);
-+	p->state = TASK_WAKING;
-+
-+	if (p->in_iowait) {
-+		delayacct_blkio_end(p);
-+		atomic_dec(&task_rq(p)->nr_iowait);
-+	}
-+
-+	cpu = select_best_cpu(p);
-+	if (task_cpu(p) != cpu) {
-+		wake_flags |= WF_MIGRATED;
-+		psi_ttwu_dequeue(p);
-+		set_task_cpu(p, cpu);
-+	}
-+
-+#else /* CONFIG_SMP */
-+
-+	if (p->in_iowait) {
-+		delayacct_blkio_end(p);
-+		atomic_dec(&task_rq(p)->nr_iowait);
-+	}
-+
-+#endif /* CONFIG_SMP */
-+
-+	ttwu_queue(p, cpu, wake_flags);
-+unlock:
-+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-+out:
-+	if (success)
-+		ttwu_stat(p, cpu, wake_flags);
-+	preempt_enable();
-+
-+	return success;
-+}
-+
-+/**
-+ * wake_up_process - Wake up a specific process
-+ * @p: The process to be woken up.
-+ *
-+ * Attempt to wake up the nominated process and move it to the set of runnable
-+ * processes.
-+ *
-+ * Return: 1 if the process was woken up, 0 if it was already running.
-+ *
-+ * This function executes a full memory barrier before accessing the task state.
-+ */
-+int wake_up_process(struct task_struct *p)
-+{
-+	return try_to_wake_up(p, TASK_NORMAL, 0);
-+}
-+EXPORT_SYMBOL(wake_up_process);
-+/* Wake @p only if its state matches the @state mask; see try_to_wake_up(). */
-+int wake_up_state(struct task_struct *p, unsigned int state)
-+{
-+	return try_to_wake_up(p, state, 0);
-+}
-+
-+static void time_slice_expired(struct task_struct *p, struct rq *rq);
-+
-+/*
-+ * Perform scheduler related setup for a newly forked process p.
-+ * p is forked by current.
-+ */
-+int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p)
-+{
-+	unsigned long flags;
-+
-+#ifdef CONFIG_PREEMPT_NOTIFIERS
-+	INIT_HLIST_HEAD(&p->preempt_notifiers);
-+#endif
-+
-+#ifdef CONFIG_COMPACTION
-+	p->capture_control = NULL;
-+#endif
-+
-+	/*
-+	 * We mark the process as NEW here. This guarantees that
-+	 * nobody will actually run it, and a signal or other external
-+	 * event cannot wake it up and insert it on the runqueue either.
-+	 */
-+	p->state = TASK_NEW;
-+
-+	/*
-+	 * The process state is set to the same value of the process executing
-+	 * do_fork() code. That is running. This guarantees that nobody will
-+	 * actually run it, and a signal or other external event cannot wake
-+	 * it up and insert it on the runqueue either.
-+	 *
-+	 * NOTE(review): this paragraph looks stale; the state was just set to
-+	 * TASK_NEW above, not to the state of the forking task.
-+	 */
-+
-+	/* Should be reset in fork.c but done here for ease of MuQSS patching */
-+	p->on_cpu =
-+	p->on_rq =
-+	p->utime =
-+	p->stime =
-+	p->sched_time =
-+	p->stime_ns =
-+	p->utime_ns = 0;
-+	skiplist_node_init(&p->node);
-+
-+	/*
-+	 * Revert to default priority/policy on fork if requested.
-+	 */
-+	if (unlikely(p->sched_reset_on_fork)) {
-+		if (p->policy == SCHED_FIFO || p->policy == SCHED_RR || p-> policy == SCHED_ISO) {
-+			p->policy = SCHED_NORMAL;
-+			p->normal_prio = normal_prio(p);
-+		}
-+
-+		if (PRIO_TO_NICE(p->static_prio) < 0) {
-+			p->static_prio = NICE_TO_PRIO(0);
-+			p->normal_prio = p->static_prio;
-+		}
-+
-+		/*
-+		 * We don't need the reset flag anymore after the fork. It has
-+		 * fulfilled its duty:
-+		 */
-+		p->sched_reset_on_fork = 0;
-+	}
-+
-+	/*
-+	 * Silence PROVE_RCU.
-+	 */
-+	raw_spin_lock_irqsave(&p->pi_lock, flags);
-+	set_task_cpu(p, smp_processor_id());
-+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-+
-+#ifdef CONFIG_SCHED_INFO
-+	if (unlikely(sched_info_on()))
-+		memset(&p->sched_info, 0, sizeof(p->sched_info));
-+#endif
-+	init_task_preempt_count(p);
-+
-+	return 0;
-+}
-+
-+#ifdef CONFIG_SCHEDSTATS
-+
-+DEFINE_STATIC_KEY_FALSE(sched_schedstats);
-+static bool __initdata __sched_schedstats = false;
-+/* Switch the sched_schedstats static key on or off. */
-+static void set_schedstats(bool enabled)
-+{
-+	if (enabled)
-+		static_branch_enable(&sched_schedstats);
-+	else
-+		static_branch_disable(&sched_schedstats);
-+}
-+/* Turn schedstats on, and log that, if they are not enabled already. */
-+void force_schedstat_enabled(void)
-+{
-+	if (!schedstat_enabled()) {
-+		pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
-+		static_branch_enable(&sched_schedstats);
-+	}
-+}
-+/*
-+ * Parse the "schedstats=" boot parameter; accepts "enable" or "disable".
-+ * Returns 1 when the value was recognised, otherwise warns and returns 0.
-+ */
-+static int __init setup_schedstats(char *str)
-+{
-+	int ret = 0;
-+	if (!str)
-+		goto out;
-+
-+	/*
-+	 * This code is called before jump labels have been set up, so we can't
-+	 * change the static branch directly just yet.  Instead set a temporary
-+	 * variable so init_schedstats() can do it later.
-+	 */
-+	if (!strcmp(str, "enable")) {
-+		__sched_schedstats = true;
-+		ret = 1;
-+	} else if (!strcmp(str, "disable")) {
-+		__sched_schedstats = false;
-+		ret = 1;
-+	}
-+out:
-+	if (!ret)
-+		pr_warn("Unable to parse schedstats=\n");
-+
-+	return ret;
-+}
-+__setup("schedstats=", setup_schedstats);
-+/* Apply the boot-time choice recorded by setup_schedstats(). */
-+static void __init init_schedstats(void)
-+{
-+	set_schedstats(__sched_schedstats);
-+}
-+/*
-+ * sysctl handler for schedstats: reads and writes go through a local int
-+ * copy of the static key state; writes require CAP_SYS_ADMIN.
-+ */
-+#ifdef CONFIG_PROC_SYSCTL
-+int sysctl_schedstats(struct ctl_table *table, int write,
-+			 void __user *buffer, size_t *lenp, loff_t *ppos)
-+{
-+	struct ctl_table t;
-+	int err;
-+	int state = static_branch_likely(&sched_schedstats);
-+
-+	if (write && !capable(CAP_SYS_ADMIN))
-+		return -EPERM;
-+
-+	t = *table;
-+	t.data = &state;
-+	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
-+	if (err < 0)
-+		return err;
-+	if (write)
-+		set_schedstats(state);
-+	return err;
-+}
-+#endif /* CONFIG_PROC_SYSCTL */
-+#else  /* !CONFIG_SCHEDSTATS */
-+static inline void init_schedstats(void) {}
-+#endif /* CONFIG_SCHEDSTATS */
-+
-+static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p);
-+/* Update @rq's clocks, run the switch-style accounting for @p and stamp p->last_ran. */
-+static void account_task_cpu(struct rq *rq, struct task_struct *p)
-+{
-+	update_clocks(rq);
-+	/* This isn't really a context switch but accounting is the same */
-+	update_cpu_clock_switch(rq, p);
-+	p->last_ran = rq->niffies;
-+}
-+
-+bool sched_smp_initialized __read_mostly;
-+/*
-+ * Non-zero only when @rq's CPU is active, sched_smp_initialized is set and
-+ * hrtimer_is_hres_active() reports true for rq->hrexpiry_timer.
-+ */
-+static inline int hrexpiry_enabled(struct rq *rq)
-+{
-+	if (unlikely(!cpu_active(cpu_of(rq)) || !sched_smp_initialized))
-+		return 0;
-+	return hrtimer_is_hres_active(&rq->hrexpiry_timer);
-+}
-+
-+/*
-+ * Use HR-timers to deliver accurate preemption points.
-+ */
-+static inline void hrexpiry_clear(struct rq *rq)
-+{
-+	if (!hrexpiry_enabled(rq))
-+		return;
-+	if (hrtimer_active(&rq->hrexpiry_timer))
-+		hrtimer_cancel(&rq->hrexpiry_timer);
-+}
-+
-+/*
-+ * High-resolution time_slice expiry.
-+ * Runs from hardirq context with interrupts disabled.
-+ */
-+static enum hrtimer_restart hrexpiry(struct hrtimer *timer)
-+{
-+	struct rq *rq = container_of(timer, struct rq, hrexpiry_timer);
-+	struct task_struct *p;
-+
-+	/* This can happen during CPU hotplug / resume */
-+	if (unlikely(cpu_of(rq) != smp_processor_id()))
-+		goto out;
-+
-+	/*
-+	 * We're doing this without the runqueue lock but this should always
-+	 * be run on the local CPU. Time slice should run out in __schedule
-+	 * but we set it to zero here in case niffies is slightly less.
-+	 */
-+	p = rq->curr;
-+	p->time_slice = 0;
-+	__set_tsk_resched(p);
-+out:
-+	return HRTIMER_NORESTART;	/* one-shot: re-armed only through hrexpiry_start() */
-+}
-+
-+/*
-+ * Called to set the hrexpiry timer state.
-+ *
-+ * called with irqs disabled from the local CPU only
-+ */
-+static void hrexpiry_start(struct rq *rq, u64 delay)
-+{
-+	if (!hrexpiry_enabled(rq))
-+		return;
-+
-+	hrtimer_start(&rq->hrexpiry_timer, ns_to_ktime(delay),
-+		      HRTIMER_MODE_REL_PINNED);
-+}
-+/* Initialise @rq's hrexpiry timer (CLOCK_MONOTONIC, relative) with hrexpiry() as callback. */
-+static void init_rq_hrexpiry(struct rq *rq)
-+{
-+	hrtimer_init(&rq->hrexpiry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-+	rq->hrexpiry_timer.function = hrexpiry;
-+}
-+/* Returns HALF_JIFFY_US when hrexpiry is not enabled for @rq, otherwise 0. */
-+static inline int rq_dither(struct rq *rq)
-+{
-+	if (!hrexpiry_enabled(rq))
-+		return HALF_JIFFY_US;
-+	return 0;
-+}
-+
-+/*
-+ * wake_up_new_task - wake up a newly created task for the first time.
-+ *
-+ * This function will do some initial scheduler statistics housekeeping
-+ * that must be done for every newly created context, then puts the task
-+ * on the runqueue and wakes it.
-+ */ -+void wake_up_new_task(struct task_struct *p) -+{ -+ struct task_struct *parent, *rq_curr; -+ struct rq *rq, *new_rq; -+ unsigned long flags; -+ -+ parent = p->parent; -+ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ p->state = TASK_RUNNING; -+ /* Task_rq can't change yet on a new task */ -+ new_rq = rq = task_rq(p); -+ if (unlikely(needs_other_cpu(p, task_cpu(p)))) { -+ set_task_cpu(p, valid_task_cpu(p)); -+ new_rq = task_rq(p); -+ } -+ -+ double_rq_lock(rq, new_rq); -+ rq_curr = rq->curr; -+ -+ /* -+ * Make sure we do not leak PI boosting priority to the child. -+ */ -+ p->prio = rq_curr->normal_prio; -+ -+ trace_sched_wakeup_new(p); -+ -+ /* -+ * Share the timeslice between parent and child, thus the -+ * total amount of pending timeslices in the system doesn't change, -+ * resulting in more scheduling fairness. If it's negative, it won't -+ * matter since that's the same as being 0. rq->rq_deadline is only -+ * modified within schedule() so it is always equal to -+ * current->deadline. -+ */ -+ account_task_cpu(rq, rq_curr); -+ p->last_ran = rq_curr->last_ran; -+ if (likely(rq_curr->policy != SCHED_FIFO)) { -+ rq_curr->time_slice /= 2; -+ if (rq_curr->time_slice < RESCHED_US) { -+ /* -+ * Forking task has run out of timeslice. Reschedule it and -+ * start its child with a new time slice and deadline. The -+ * child will end up running first because its deadline will -+ * be slightly earlier. -+ */ -+ __set_tsk_resched(rq_curr); -+ time_slice_expired(p, new_rq); -+ if (suitable_idle_cpus(p)) -+ resched_best_idle(p, task_cpu(p)); -+ else if (unlikely(rq != new_rq)) -+ try_preempt(p, new_rq); -+ } else { -+ p->time_slice = rq_curr->time_slice; -+ if (rq_curr == parent && rq == new_rq && !suitable_idle_cpus(p)) { -+ /* -+ * The VM isn't cloned, so we're in a good position to -+ * do child-runs-first in anticipation of an exec. This -+ * usually avoids a lot of COW overhead. 
-+ */ -+ __set_tsk_resched(rq_curr); -+ } else { -+ /* -+ * Adjust the hrexpiry since rq_curr will keep -+ * running and its timeslice has been shortened. -+ */ -+ hrexpiry_start(rq, US_TO_NS(rq_curr->time_slice)); -+ try_preempt(p, new_rq); -+ } -+ } -+ } else { -+ time_slice_expired(p, new_rq); -+ try_preempt(p, new_rq); -+ } -+ activate_task(new_rq, p, 0); -+ double_rq_unlock(rq, new_rq); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+} -+ -+#ifdef CONFIG_PREEMPT_NOTIFIERS -+ -+static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); -+ -+void preempt_notifier_inc(void) -+{ -+ static_branch_inc(&preempt_notifier_key); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_inc); -+ -+void preempt_notifier_dec(void) -+{ -+ static_branch_dec(&preempt_notifier_key); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_dec); -+ -+/** -+ * preempt_notifier_register - tell me when current is being preempted & rescheduled -+ * @notifier: notifier struct to register -+ */ -+void preempt_notifier_register(struct preempt_notifier *notifier) -+{ -+ if (!static_branch_unlikely(&preempt_notifier_key)) -+ WARN(1, "registering preempt_notifier while notifiers disabled\n"); -+ -+ hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_register); -+ -+/** -+ * preempt_notifier_unregister - no longer interested in preemption notifications -+ * @notifier: notifier struct to unregister -+ * -+ * This is *not* safe to call from within a preemption notifier. 
-+ */ -+void preempt_notifier_unregister(struct preempt_notifier *notifier) -+{ -+ hlist_del(¬ifier->link); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_unregister); -+ -+static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+ struct preempt_notifier *notifier; -+ -+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) -+ notifier->ops->sched_in(notifier, raw_smp_processor_id()); -+} -+ -+static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+ if (static_branch_unlikely(&preempt_notifier_key)) -+ __fire_sched_in_preempt_notifiers(curr); -+} -+ -+static void -+__fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+ struct preempt_notifier *notifier; -+ -+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) -+ notifier->ops->sched_out(notifier, next); -+} -+ -+static __always_inline void -+fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+ if (static_branch_unlikely(&preempt_notifier_key)) -+ __fire_sched_out_preempt_notifiers(curr, next); -+} -+ -+#else /* !CONFIG_PREEMPT_NOTIFIERS */ -+ -+static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+} -+ -+static inline void -+fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+} -+ -+#endif /* CONFIG_PREEMPT_NOTIFIERS */ -+ -+static inline void prepare_task(struct task_struct *next) -+{ -+ /* -+ * Claim the task as running, we do this before switching to it -+ * such that any running task will have this set. -+ */ -+ next->on_cpu = 1; -+} -+ -+static inline void finish_task(struct task_struct *prev) -+{ -+#ifdef CONFIG_SMP -+ /* -+ * After ->on_cpu is cleared, the task can be moved to a different CPU. -+ * We must ensure this doesn't happen until the switch is completely -+ * finished. -+ * -+ * In particular, the load of prev->state in finish_task_switch() must -+ * happen before this. 
-+ * -+ * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). -+ */ -+ smp_store_release(&prev->on_cpu, 0); -+#endif -+} -+ -+static inline void -+prepare_lock_switch(struct rq *rq, struct task_struct *next) -+{ -+ /* -+ * Since the runqueue lock will be released by the next -+ * task (which is an invalid locking op but in the case -+ * of the scheduler it's an obvious special-case), so we -+ * do an early lockdep release here: -+ */ -+ spin_release(&rq->lock->dep_map, _THIS_IP_); -+#ifdef CONFIG_DEBUG_SPINLOCK -+ /* this is a valid case when another task releases the spinlock */ -+ rq->lock->owner = next; -+#endif -+} -+ -+static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) -+{ -+ /* -+ * If we are tracking spinlock dependencies then we have to -+ * fix up the runqueue lock - which gets 'carried over' from -+ * prev into current: -+ */ -+ spin_acquire(&rq->lock->dep_map, 0, 0, _THIS_IP_); -+ -+#ifdef CONFIG_SMP -+ /* -+ * If prev was marked as migrating to another CPU in return_task, drop -+ * the local runqueue lock but leave interrupts disabled and grab the -+ * remote lock we're migrating it to before enabling them. -+ */ -+ if (unlikely(task_on_rq_migrating(prev))) { -+ sched_info_dequeued(rq, prev); -+ /* -+ * We move the ownership of prev to the new cpu now. ttwu can't -+ * activate prev to the wrong cpu since it has to grab this -+ * runqueue in ttwu_remote. 
-+ */ -+#ifdef CONFIG_THREAD_INFO_IN_TASK -+ prev->cpu = prev->wake_cpu; -+#else -+ task_thread_info(prev)->cpu = prev->wake_cpu; -+#endif -+ raw_spin_unlock(rq->lock); -+ -+ raw_spin_lock(&prev->pi_lock); -+ rq = __task_rq_lock(prev, NULL); -+ /* Check that someone else hasn't already queued prev */ -+ if (likely(!task_queued(prev))) { -+ enqueue_task(rq, prev, 0); -+ prev->on_rq = TASK_ON_RQ_QUEUED; -+ /* Wake up the CPU if it's not already running */ -+ resched_if_idle(rq); -+ } -+ raw_spin_unlock(&prev->pi_lock); -+ } -+#endif -+ rq_unlock(rq); -+ local_irq_enable(); -+} -+ -+#ifndef prepare_arch_switch -+# define prepare_arch_switch(next) do { } while (0) -+#endif -+#ifndef finish_arch_switch -+# define finish_arch_switch(prev) do { } while (0) -+#endif -+#ifndef finish_arch_post_lock_switch -+# define finish_arch_post_lock_switch() do { } while (0) -+#endif -+ -+/** -+ * prepare_task_switch - prepare to switch tasks -+ * @rq: the runqueue preparing to switch -+ * @next: the task we are going to switch to. -+ * -+ * This is called with the rq lock held and interrupts off. It must -+ * be paired with a subsequent finish_task_switch after the context -+ * switch. -+ * -+ * prepare_task_switch sets up locking and calls architecture specific -+ * hooks. -+ */ -+static inline void -+prepare_task_switch(struct rq *rq, struct task_struct *prev, -+ struct task_struct *next) -+{ -+ kcov_prepare_switch(prev); -+ sched_info_switch(rq, prev, next); -+ perf_event_task_sched_out(prev, next); -+ rseq_preempt(prev); -+ fire_sched_out_preempt_notifiers(prev, next); -+ prepare_task(next); -+ prepare_arch_switch(next); -+} -+ -+/** -+ * finish_task_switch - clean up after a task-switch -+ * @rq: runqueue associated with task-switch -+ * @prev: the thread we just switched away from. -+ * -+ * finish_task_switch must be called after the context switch, paired -+ * with a prepare_task_switch call before the context switch. 
-+ * finish_task_switch will reconcile locking set up by prepare_task_switch,
-+ * and do any other architecture-specific cleanup actions.
-+ *
-+ * Note that we may have delayed dropping an mm in context_switch(). If
-+ * so, we finish that here outside of the runqueue lock.  (Doing it
-+ * with the lock held can cause deadlocks; see schedule() for
-+ * details.)
-+ *
-+ * The context switch have flipped the stack from under us and restored the
-+ * local variables which were saved when this task called schedule() in the
-+ * past. prev == current is still correct but we need to recalculate this_rq
-+ * because prev may have moved to another CPU.
-+ */
-+static void finish_task_switch(struct task_struct *prev)
-+	__releases(rq->lock)
-+{
-+	struct rq *rq = this_rq();
-+	struct mm_struct *mm = rq->prev_mm;
-+	long prev_state;
-+
-+	/*
-+	 * The previous task will have left us with a preempt_count of 2
-+	 * because it left us after:
-+	 *
-+	 *	schedule()
-+	 *	  preempt_disable();			// 1
-+	 *	  __schedule()
-+	 *	    raw_spin_lock_irq(rq->lock)	// 2
-+	 *
-+	 * Also, see FORK_PREEMPT_COUNT.
-+	 */
-+	if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
-+		      "corrupted preempt_count: %s/%d/0x%x\n",
-+		      current->comm, current->pid, preempt_count()))
-+		preempt_count_set(FORK_PREEMPT_COUNT);
-+
-+	rq->prev_mm = NULL;
-+
-+	/*
-+	 * A task struct has one reference for the use as "current".
-+	 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
-+	 * schedule one last time. The schedule call will never return, and
-+	 * the scheduled task must drop that reference.
-+	 *
-+	 * We must observe prev->state before clearing prev->on_cpu (in
-+	 * finish_task), otherwise a concurrent wakeup can get prev
-+	 * running on another CPU and we could race with its RUNNING -> DEAD
-+	 * transition, resulting in a double drop.
-+	 */
-+	prev_state = prev->state;
-+	vtime_task_switch(prev);
-+	perf_event_task_sched_in(prev, current);
-+	finish_task(prev);
-+	finish_lock_switch(rq, prev);
-+	finish_arch_post_lock_switch();
-+	kcov_finish_switch(current);
-+
-+	fire_sched_in_preempt_notifiers(current);
-+	/*
-+	 * When switching through a kernel thread, the loop in
-+	 * membarrier_{private,global}_expedited() may have observed that
-+	 * kernel thread and not issued an IPI. It is therefore possible to
-+	 * schedule between user->kernel->user threads without passing through
-+	 * switch_mm(). Membarrier requires a barrier after storing to
-+	 * rq->curr, before returning to userspace, so provide them here:
-+	 *
-+	 * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
-+	 *   provided by mmdrop(),
-+	 * - a sync_core for SYNC_CORE.
-+	 */
-+	if (mm) {
-+		membarrier_mm_sync_core_before_usermode(mm);
-+		mmdrop(mm);
-+	}
-+	if (unlikely(prev_state == TASK_DEAD)) {
-+		/*
-+		 * Remove function-return probe instances associated with this
-+		 * task and put them back on the free list.
-+		 */
-+		kprobe_flush_task(prev);
-+
-+		/* Task is done with its stack. */
-+		put_task_stack(prev);
-+
-+		put_task_struct_rcu_user(prev);
-+	}
-+}
-+
-+/**
-+ * schedule_tail - first thing a freshly forked thread must call.
-+ * @prev: the thread we just switched away from.
-+ */
-+asmlinkage __visible void schedule_tail(struct task_struct *prev)
-+{
-+	/*
-+	 * New tasks start with FORK_PREEMPT_COUNT, see there and
-+	 * finish_task_switch() for details.
-+	 *
-+	 * finish_task_switch() will drop rq->lock() and lower preempt_count
-+	 * and the preempt_enable() will end up enabling preemption (on
-+	 * PREEMPT_COUNT kernels).
-+	 */
-+
-+	finish_task_switch(prev);
-+	preempt_enable();
-+
-+	if (current->set_child_tid)	/* report the child's tid in its pid namespace to userspace */
-+		put_user(task_pid_vnr(current), current->set_child_tid);
-+
-+	calculate_sigpending();
-+}
-+
-+/*
-+ * context_switch - switch to the new MM and the new thread's register state.
-+ */
-+static __always_inline void
-+context_switch(struct rq *rq, struct task_struct *prev,
-+	       struct task_struct *next)
-+{
-+	prepare_task_switch(rq, prev, next);
-+
-+	/*
-+	 * For paravirt, this is coupled with an exit in switch_to to
-+	 * combine the page table reload and the switch backend into
-+	 * one hypercall.
-+	 */
-+	arch_start_context_switch(prev);
-+
-+	/*
-+	 * kernel -> kernel   lazy + transfer active
-+	 *   user -> kernel   lazy + mmgrab() active
-+	 *
-+	 * kernel ->   user   switch + mmdrop() active
-+	 *   user ->   user   switch
-+	 */
-+	if (!next->mm) {                                // to kernel
-+		enter_lazy_tlb(prev->active_mm, next);
-+
-+		next->active_mm = prev->active_mm;
-+		if (prev->mm)                           // from user
-+			mmgrab(prev->active_mm);
-+		else
-+			prev->active_mm = NULL;
-+	} else {                                        // to user
-+		membarrier_switch_mm(rq, prev->active_mm, next->mm);
-+		/*
-+		 * sys_membarrier() requires an smp_mb() between setting
-+		 * rq->curr / membarrier_switch_mm() and returning to userspace.
-+		 *
-+		 * The below provides this either through switch_mm(), or in
-+		 * case 'prev->active_mm == next->mm' through
-+		 * finish_task_switch()'s mmdrop().
-+		 */
-+		switch_mm_irqs_off(prev->active_mm, next->mm, next);
-+
-+		if (!prev->mm) {                        // from kernel
-+			/* will mmdrop() in finish_task_switch(). */
-+			rq->prev_mm = prev->active_mm;
-+			prev->active_mm = NULL;
-+		}
-+	}
-+	prepare_lock_switch(rq, next);
-+
-+	/* Here we just switch the register state and the stack. */
-+	switch_to(prev, next, prev);
-+	barrier();
-+	/* finish_task_switch() recomputes the runqueue with this_rq() rather than reusing @rq. */
-+	finish_task_switch(prev);
-+}
-+
-+/*
-+ * nr_running, nr_uninterruptible and nr_context_switches:
-+ *
-+ * externally visible scheduler statistics: current number of runnable
-+ * threads, total number of context switches performed since bootup.
-+ */ -+unsigned long nr_running(void) -+{ -+ unsigned long i, sum = 0; -+ -+ for_each_online_cpu(i) -+ sum += cpu_rq(i)->nr_running; -+ -+ return sum; -+} -+ -+static unsigned long nr_uninterruptible(void) -+{ -+ unsigned long i, sum = 0; -+ -+ for_each_online_cpu(i) -+ sum += cpu_rq(i)->nr_uninterruptible; -+ -+ return sum; -+} -+ -+/* -+ * Check if only the current task is running on the CPU. -+ * -+ * Caution: this function does not check that the caller has disabled -+ * preemption, thus the result might have a time-of-check-to-time-of-use -+ * race. The caller is responsible to use it correctly, for example: -+ * -+ * - from a non-preemptible section (of course) -+ * -+ * - from a thread that is bound to a single CPU -+ * -+ * - in a loop with very short iterations (e.g. a polling loop) -+ */ -+bool single_task_running(void) -+{ -+ if (rq_load(raw_rq()) == 1) -+ return true; -+ else -+ return false; -+} -+EXPORT_SYMBOL(single_task_running); -+ -+unsigned long long nr_context_switches(void) -+{ -+ int cpu; -+ unsigned long long sum = 0; -+ -+ for_each_possible_cpu(cpu) -+ sum += cpu_rq(cpu)->nr_switches; -+ -+ return sum; -+} -+ -+/* -+ * Consumers of these two interfaces, like for example the cpufreq menu -+ * governor are using nonsensical data. Boosting frequency for a CPU that has -+ * IO-wait which might not even end up running the task when it does become -+ * runnable. -+ */ -+ -+unsigned long nr_iowait_cpu(int cpu) -+{ -+ return atomic_read(&cpu_rq(cpu)->nr_iowait); -+} -+ -+/* -+ * IO-wait accounting, and how its mostly bollocks (on SMP). -+ * -+ * The idea behind IO-wait account is to account the idle time that we could -+ * have spend running if it were not for IO. That is, if we were to improve the -+ * storage performance, we'd have a proportional reduction in IO-wait time. 
-+ * -+ * This all works nicely on UP, where, when a task blocks on IO, we account -+ * idle time as IO-wait, because if the storage were faster, it could've been -+ * running and we'd not be idle. -+ * -+ * This has been extended to SMP, by doing the same for each CPU. This however -+ * is broken. -+ * -+ * Imagine for instance the case where two tasks block on one CPU, only the one -+ * CPU will have IO-wait accounted, while the other has regular idle. Even -+ * though, if the storage were faster, both could've ran at the same time, -+ * utilising both CPUs. -+ * -+ * This means, that when looking globally, the current IO-wait accounting on -+ * SMP is a lower bound, by reason of under accounting. -+ * -+ * Worse, since the numbers are provided per CPU, they are sometimes -+ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly -+ * associated with any one particular CPU, it can wake to another CPU than it -+ * blocked on. This means the per CPU IO-wait number is meaningless. -+ * -+ * Task CPU affinities can make all that even more 'interesting'. -+ */ -+ -+unsigned long nr_iowait(void) -+{ -+ unsigned long cpu, sum = 0; -+ -+ for_each_possible_cpu(cpu) -+ sum += nr_iowait_cpu(cpu); -+ -+ return sum; -+} -+ -+unsigned long nr_active(void) -+{ -+ return nr_running() + nr_uninterruptible(); -+} -+ -+/* Variables and functions for calc_load */ -+static unsigned long calc_load_update; -+unsigned long avenrun[3]; -+EXPORT_SYMBOL(avenrun); -+ -+/** -+ * get_avenrun - get the load average array -+ * @loads: pointer to dest load array -+ * @offset: offset to add -+ * @shift: shift count to shift the result left -+ * -+ * These values are estimates at best, so no need for locking. 
-+ */ -+void get_avenrun(unsigned long *loads, unsigned long offset, int shift) -+{ -+ loads[0] = (avenrun[0] + offset) << shift; -+ loads[1] = (avenrun[1] + offset) << shift; -+ loads[2] = (avenrun[2] + offset) << shift; -+} -+ -+/* -+ * calc_load - update the avenrun load estimates every LOAD_FREQ seconds. -+ */ -+void calc_global_load(unsigned long ticks) -+{ -+ long active; -+ -+ if (time_before(jiffies, READ_ONCE(calc_load_update))) -+ return; -+ active = nr_active() * FIXED_1; -+ -+ avenrun[0] = calc_load(avenrun[0], EXP_1, active); -+ avenrun[1] = calc_load(avenrun[1], EXP_5, active); -+ avenrun[2] = calc_load(avenrun[2], EXP_15, active); -+ -+ calc_load_update = jiffies + LOAD_FREQ; -+} -+ -+/** -+ * fixed_power_int - compute: x^n, in O(log n) time -+ * -+ * @x: base of the power -+ * @frac_bits: fractional bits of @x -+ * @n: power to raise @x to. -+ * -+ * By exploiting the relation between the definition of the natural power -+ * function: x^n := x*x*...*x (x multiplied by itself for n times), and -+ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, -+ * (where: n_i \elem {0, 1}, the binary vector representing n), -+ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is -+ * of course trivially computable in O(log_2 n), the length of our binary -+ * vector. 
-+ */ -+static unsigned long -+fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) -+{ -+ unsigned long result = 1UL << frac_bits; -+ -+ if (n) { -+ for (;;) { -+ if (n & 1) { -+ result *= x; -+ result += 1UL << (frac_bits - 1); -+ result >>= frac_bits; -+ } -+ n >>= 1; -+ if (!n) -+ break; -+ x *= x; -+ x += 1UL << (frac_bits - 1); -+ x >>= frac_bits; -+ } -+ } -+ -+ return result; -+} -+ -+/* -+ * a1 = a0 * e + a * (1 - e) -+ * -+ * a2 = a1 * e + a * (1 - e) -+ * = (a0 * e + a * (1 - e)) * e + a * (1 - e) -+ * = a0 * e^2 + a * (1 - e) * (1 + e) -+ * -+ * a3 = a2 * e + a * (1 - e) -+ * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) -+ * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) -+ * -+ * ... -+ * -+ * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] -+ * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) -+ * = a0 * e^n + a * (1 - e^n) -+ * -+ * [1] application of the geometric series: -+ * -+ * n 1 - x^(n+1) -+ * S_n := \Sum x^i = ------------- -+ * i=0 1 - x -+ */ -+unsigned long -+calc_load_n(unsigned long load, unsigned long exp, -+ unsigned long active, unsigned int n) -+{ -+ return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); -+} -+ -+DEFINE_PER_CPU(struct kernel_stat, kstat); -+DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); -+ -+EXPORT_PER_CPU_SYMBOL(kstat); -+EXPORT_PER_CPU_SYMBOL(kernel_cpustat); -+ -+#ifdef CONFIG_PARAVIRT -+static inline u64 steal_ticks(u64 steal) -+{ -+ if (unlikely(steal > NSEC_PER_SEC)) -+ return div_u64(steal, TICK_NSEC); -+ -+ return __iter_div_u64_rem(steal, TICK_NSEC, &steal); -+} -+#endif -+ -+#ifndef nsecs_to_cputime -+# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) -+#endif -+ -+/* -+ * On each tick, add the number of nanoseconds to the unbanked variables and -+ * once one tick's worth has accumulated, account it allowing for accurate -+ * sub-tick accounting and totals. Use the TICK_APPROX_NS to match the way we -+ * deduct nanoseconds. 
-+ */ -+static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long ns) -+{ -+ u64 *cpustat = kcpustat_this_cpu->cpustat; -+ unsigned long ticks; -+ -+ if (atomic_read(&rq->nr_iowait) > 0) { -+ rq->iowait_ns += ns; -+ if (rq->iowait_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(rq->iowait_ns); -+ cpustat[CPUTIME_IOWAIT] += (__force u64)TICK_APPROX_NS * ticks; -+ rq->iowait_ns %= JIFFY_NS; -+ } -+ } else { -+ rq->idle_ns += ns; -+ if (rq->idle_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(rq->idle_ns); -+ cpustat[CPUTIME_IDLE] += (__force u64)TICK_APPROX_NS * ticks; -+ rq->idle_ns %= JIFFY_NS; -+ } -+ } -+ acct_update_integrals(idle); -+} -+ -+static void pc_system_time(struct rq *rq, struct task_struct *p, -+ int hardirq_offset, unsigned long ns) -+{ -+ u64 *cpustat = kcpustat_this_cpu->cpustat; -+ unsigned long ticks; -+ -+ p->stime_ns += ns; -+ if (p->stime_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(p->stime_ns); -+ p->stime_ns %= JIFFY_NS; -+ p->stime += (__force u64)TICK_APPROX_NS * ticks; -+ account_group_system_time(p, TICK_APPROX_NS * ticks); -+ } -+ p->sched_time += ns; -+ account_group_exec_runtime(p, ns); -+ -+ if (hardirq_count() - hardirq_offset) { -+ rq->irq_ns += ns; -+ if (rq->irq_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(rq->irq_ns); -+ cpustat[CPUTIME_IRQ] += (__force u64)TICK_APPROX_NS * ticks; -+ rq->irq_ns %= JIFFY_NS; -+ } -+ } else if (in_serving_softirq()) { -+ rq->softirq_ns += ns; -+ if (rq->softirq_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(rq->softirq_ns); -+ cpustat[CPUTIME_SOFTIRQ] += (__force u64)TICK_APPROX_NS * ticks; -+ rq->softirq_ns %= JIFFY_NS; -+ } -+ } else { -+ rq->system_ns += ns; -+ if (rq->system_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(rq->system_ns); -+ cpustat[CPUTIME_SYSTEM] += (__force u64)TICK_APPROX_NS * ticks; -+ rq->system_ns %= JIFFY_NS; -+ } -+ } -+ acct_update_integrals(p); -+} -+ -+static void pc_user_time(struct rq *rq, struct task_struct *p, unsigned long ns) -+{ -+ u64 *cpustat = 
kcpustat_this_cpu->cpustat; -+ unsigned long ticks; -+ -+ p->utime_ns += ns; -+ if (p->utime_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(p->utime_ns); -+ p->utime_ns %= JIFFY_NS; -+ p->utime += (__force u64)TICK_APPROX_NS * ticks; -+ account_group_user_time(p, TICK_APPROX_NS * ticks); -+ } -+ p->sched_time += ns; -+ account_group_exec_runtime(p, ns); -+ -+ if (this_cpu_ksoftirqd() == p) { -+ /* -+ * ksoftirqd time do not get accounted in cpu_softirq_time. -+ * So, we have to handle it separately here. -+ */ -+ rq->softirq_ns += ns; -+ if (rq->softirq_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(rq->softirq_ns); -+ cpustat[CPUTIME_SOFTIRQ] += (__force u64)TICK_APPROX_NS * ticks; -+ rq->softirq_ns %= JIFFY_NS; -+ } -+ } -+ -+ if (task_nice(p) > 0 || idleprio_task(p)) { -+ rq->nice_ns += ns; -+ if (rq->nice_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(rq->nice_ns); -+ cpustat[CPUTIME_NICE] += (__force u64)TICK_APPROX_NS * ticks; -+ rq->nice_ns %= JIFFY_NS; -+ } -+ } else { -+ rq->user_ns += ns; -+ if (rq->user_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(rq->user_ns); -+ cpustat[CPUTIME_USER] += (__force u64)TICK_APPROX_NS * ticks; -+ rq->user_ns %= JIFFY_NS; -+ } -+ } -+ acct_update_integrals(p); -+} -+ -+/* -+ * This is called on clock ticks. -+ * Bank in p->sched_time the ns elapsed since the last tick or switch. -+ * CPU scheduler quota accounting is also performed here in microseconds. 
-+ */ -+static void update_cpu_clock_tick(struct rq *rq, struct task_struct *p) -+{ -+ s64 account_ns = rq->niffies - p->last_ran; -+ struct task_struct *idle = rq->idle; -+ -+ /* Accurate tick timekeeping */ -+ if (user_mode(get_irq_regs())) -+ pc_user_time(rq, p, account_ns); -+ else if (p != idle || (irq_count() != HARDIRQ_OFFSET)) { -+ pc_system_time(rq, p, HARDIRQ_OFFSET, account_ns); -+ } else -+ pc_idle_time(rq, idle, account_ns); -+ -+ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ -+ if (p->policy != SCHED_FIFO && p != idle) -+ p->time_slice -= NS_TO_US(account_ns); -+ -+ p->last_ran = rq->niffies; -+} -+ -+/* -+ * This is called on context switches. -+ * Bank in p->sched_time the ns elapsed since the last tick or switch. -+ * CPU scheduler quota accounting is also performed here in microseconds. -+ */ -+static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p) -+{ -+ s64 account_ns = rq->niffies - p->last_ran; -+ struct task_struct *idle = rq->idle; -+ -+ /* Accurate subtick timekeeping */ -+ if (p != idle) -+ pc_user_time(rq, p, account_ns); -+ else -+ pc_idle_time(rq, idle, account_ns); -+ -+ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ -+ if (p->policy != SCHED_FIFO && p != idle) -+ p->time_slice -= NS_TO_US(account_ns); -+} -+ -+/* -+ * Return any ns on the sched_clock that have not yet been accounted in -+ * @p in case that task is currently running. -+ * -+ * Called with task_rq_lock(p) held. -+ */ -+static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) -+{ -+ u64 ns = 0; -+ -+ /* -+ * Must be ->curr _and_ ->on_rq. If dequeued, we would -+ * project cycles that may never be accounted to this -+ * thread, breaking clock_gettime(). -+ */ -+ if (p == rq->curr && task_on_rq_queued(p)) { -+ update_clocks(rq); -+ ns = rq->niffies - p->last_ran; -+ } -+ -+ return ns; -+} -+ -+/* -+ * Return accounted runtime for the task. 
-+ * Return separately the current's pending runtime that have not been -+ * accounted yet. -+ */ -+unsigned long long task_sched_runtime(struct task_struct *p) -+{ -+ struct rq_flags rf; -+ struct rq *rq; -+ u64 ns; -+ -+#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) -+ /* -+ * 64-bit doesn't need locks to atomically read a 64-bit value. -+ * So we have a optimisation chance when the task's delta_exec is 0. -+ * Reading ->on_cpu is racy, but this is ok. -+ * -+ * If we race with it leaving CPU, we'll take a lock. So we're correct. -+ * If we race with it entering CPU, unaccounted time is 0. This is -+ * indistinguishable from the read occurring a few cycles earlier. -+ * If we see ->on_cpu without ->on_rq, the task is leaving, and has -+ * been accounted, so we're correct here as well. -+ */ -+ if (!p->on_cpu || !task_on_rq_queued(p)) -+ return tsk_seruntime(p); -+#endif -+ -+ rq = task_rq_lock(p, &rf); -+ ns = p->sched_time + do_task_delta_exec(p, rq); -+ task_rq_unlock(rq, p, &rf); -+ -+ return ns; -+} -+ -+/* -+ * Functions to test for when SCHED_ISO tasks have used their allocated -+ * quota as real time scheduling and convert them back to SCHED_NORMAL. All -+ * data is modified only by the local runqueue during scheduler_tick with -+ * interrupts disabled. -+ */ -+ -+/* -+ * Test if SCHED_ISO tasks have run longer than their alloted period as RT -+ * tasks and set the refractory flag if necessary. There is 10% hysteresis -+ * for unsetting the flag. 115/128 is ~90/100 as a fast shift instead of a -+ * slow division. 
-+ */ -+static inline void iso_tick(struct rq *rq) -+{ -+ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD; -+ rq->iso_ticks += 100; -+ if (rq->iso_ticks > ISO_PERIOD * sched_iso_cpu) { -+ rq->iso_refractory = true; -+ if (unlikely(rq->iso_ticks > ISO_PERIOD * 100)) -+ rq->iso_ticks = ISO_PERIOD * 100; -+ } -+} -+ -+/* No SCHED_ISO task was running so decrease rq->iso_ticks */ -+static inline void no_iso_tick(struct rq *rq, int ticks) -+{ -+ if (rq->iso_ticks > 0 || rq->iso_refractory) { -+ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - ticks) / ISO_PERIOD; -+ if (rq->iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128)) { -+ rq->iso_refractory = false; -+ if (unlikely(rq->iso_ticks < 0)) -+ rq->iso_ticks = 0; -+ } -+ } -+} -+ -+/* This manages tasks that have run out of timeslice during a scheduler_tick */ -+static void task_running_tick(struct rq *rq) -+{ -+ struct task_struct *p = rq->curr; -+ -+ /* -+ * If a SCHED_ISO task is running we increment the iso_ticks. In -+ * order to prevent SCHED_ISO tasks from causing starvation in the -+ * presence of true RT tasks we account those as iso_ticks as well. -+ */ -+ if (rt_task(p) || task_running_iso(p)) -+ iso_tick(rq); -+ else -+ no_iso_tick(rq, 1); -+ -+ /* SCHED_FIFO tasks never run out of timeslice. */ -+ if (p->policy == SCHED_FIFO) -+ return; -+ -+ if (iso_task(p)) { -+ if (task_running_iso(p)) { -+ if (rq->iso_refractory) { -+ /* -+ * SCHED_ISO task is running as RT and limit -+ * has been hit. Force it to reschedule as -+ * SCHED_NORMAL by zeroing its time_slice -+ */ -+ p->time_slice = 0; -+ } -+ } else if (!rq->iso_refractory) { -+ /* Can now run again ISO. Reschedule to pick up prio */ -+ goto out_resched; -+ } -+ } -+ -+ /* -+ * Tasks that were scheduled in the first half of a tick are not -+ * allowed to run into the 2nd half of the next tick if they will -+ * run out of time slice in the interim. 
Otherwise, if they have -+ * less than RESCHED_US μs of time slice left they will be rescheduled. -+ * Dither is used as a backup for when hrexpiry is disabled or high res -+ * timers not configured in. -+ */ -+ if (p->time_slice - rq->dither >= RESCHED_US) -+ return; -+out_resched: -+ rq_lock(rq); -+ __set_tsk_resched(p); -+ rq_unlock(rq); -+} -+ -+static inline void task_tick(struct rq *rq) -+{ -+ if (!rq_idle(rq)) -+ task_running_tick(rq); -+ else if (rq->last_jiffy > rq->last_scheduler_tick) -+ no_iso_tick(rq, rq->last_jiffy - rq->last_scheduler_tick); -+} -+ -+#ifdef CONFIG_NO_HZ_FULL -+/* -+ * We can stop the timer tick any time highres timers are active since -+ * we rely entirely on highres timeouts for task expiry rescheduling. -+ */ -+static void sched_stop_tick(struct rq *rq, int cpu) -+{ -+ if (!hrexpiry_enabled(rq)) -+ return; -+ if (!tick_nohz_full_enabled()) -+ return; -+ if (!tick_nohz_full_cpu(cpu)) -+ return; -+ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); -+} -+ -+static inline void sched_start_tick(struct rq *rq, int cpu) -+{ -+ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); -+} -+ -+struct tick_work { -+ int cpu; -+ atomic_t state; -+ struct delayed_work work; -+}; -+/* Values for ->state, see diagram below. */ -+#define TICK_SCHED_REMOTE_OFFLINE 0 -+#define TICK_SCHED_REMOTE_OFFLINING 1 -+#define TICK_SCHED_REMOTE_RUNNING 2 -+ -+/* -+ * State diagram for ->state: -+ * -+ * -+ * TICK_SCHED_REMOTE_OFFLINE -+ * | ^ -+ * | | -+ * | | sched_tick_remote() -+ * | | -+ * | | -+ * +--TICK_SCHED_REMOTE_OFFLINING -+ * | ^ -+ * | | -+ * sched_tick_start() | | sched_tick_stop() -+ * | | -+ * V | -+ * TICK_SCHED_REMOTE_RUNNING -+ * -+ * -+ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() -+ * and sched_tick_start() are happy to leave the state in RUNNING. 
-+ */ -+ -+static struct tick_work __percpu *tick_work_cpu; -+ -+static void sched_tick_remote(struct work_struct *work) -+{ -+ struct delayed_work *dwork = to_delayed_work(work); -+ struct tick_work *twork = container_of(dwork, struct tick_work, work); -+ int cpu = twork->cpu; -+ struct rq *rq = cpu_rq(cpu); -+ struct task_struct *curr; -+ u64 delta; -+ int os; -+ -+ /* -+ * Handle the tick only if it appears the remote CPU is running in full -+ * dynticks mode. The check is racy by nature, but missing a tick or -+ * having one too much is no big deal because the scheduler tick updates -+ * statistics and checks timeslices in a time-independent way, regardless -+ * of when exactly it is running. -+ */ -+ if (!tick_nohz_tick_stopped_cpu(cpu)) -+ goto out_requeue; -+ -+ rq_lock_irq(rq); -+ if (cpu_is_offline(cpu)) -+ goto out_unlock; -+ -+ curr = rq->curr; -+ update_rq_clock(rq); -+ -+ if (!is_idle_task(curr)) { -+ /* -+ * Make sure the next tick runs within a reasonable -+ * amount of time. -+ */ -+ delta = rq_clock_task(rq) - curr->last_ran; -+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); -+ } -+ task_tick(rq); -+ -+out_unlock: -+ rq_unlock_irq(rq, NULL); -+ -+out_requeue: -+ -+ /* -+ * Run the remote tick once per second (1Hz). This arbitrary -+ * frequency is large enough to avoid overload but short enough -+ * to keep scheduler internal stats reasonably up to date. But -+ * first update state to reflect hotplug activity if required. 
-+ */ -+ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); -+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); -+ if (os == TICK_SCHED_REMOTE_RUNNING) -+ queue_delayed_work(system_unbound_wq, dwork, HZ); -+} -+ -+static void sched_tick_start(int cpu) -+{ -+ struct tick_work *twork; -+ int os; -+ -+ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) -+ return; -+ -+ WARN_ON_ONCE(!tick_work_cpu); -+ -+ twork = per_cpu_ptr(tick_work_cpu, cpu); -+ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); -+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); -+ if (os == TICK_SCHED_REMOTE_OFFLINE) { -+ twork->cpu = cpu; -+ INIT_DELAYED_WORK(&twork->work, sched_tick_remote); -+ queue_delayed_work(system_unbound_wq, &twork->work, HZ); -+ } -+} -+ -+#ifdef CONFIG_HOTPLUG_CPU -+static void sched_tick_stop(int cpu) -+{ -+ struct tick_work *twork; -+ int os; -+ -+ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) -+ return; -+ -+ WARN_ON_ONCE(!tick_work_cpu); -+ -+ twork = per_cpu_ptr(tick_work_cpu, cpu); -+ /* There cannot be competing actions, but don't rely on stop-machine. */ -+ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING); -+ WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING); -+ /* Don't cancel, as this would mess up the state machine. 
*/ -+} -+#endif /* CONFIG_HOTPLUG_CPU */ -+ -+int __init sched_tick_offload_init(void) -+{ -+ tick_work_cpu = alloc_percpu(struct tick_work); -+ BUG_ON(!tick_work_cpu); -+ return 0; -+} -+ -+#else /* !CONFIG_NO_HZ_FULL */ -+static inline void sched_stop_tick(struct rq *rq, int cpu) {} -+static inline void sched_start_tick(struct rq *rq, int cpu) {} -+static inline void sched_tick_start(int cpu) { } -+static inline void sched_tick_stop(int cpu) { } -+#endif -+ -+DEFINE_PER_CPU(unsigned long, thermal_pressure); -+ -+void arch_set_thermal_pressure(struct cpumask *cpus, -+ unsigned long th_pressure) -+{ -+ int cpu; -+ -+ for_each_cpu(cpu, cpus) -+ WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure); -+} -+/* -+ * This function gets called by the timer code, with HZ frequency. -+ * We call it with interrupts disabled. -+ */ -+void scheduler_tick(void) -+{ -+ int cpu __maybe_unused = smp_processor_id(); -+ struct rq *rq = cpu_rq(cpu); -+ -+ arch_scale_freq_tick(); -+ sched_clock_tick(); -+ update_clocks(rq); -+ update_load_avg(rq, 0); -+ update_cpu_clock_tick(rq, rq->curr); -+ task_tick(rq); -+ rq->last_scheduler_tick = rq->last_jiffy; -+ rq->last_tick = rq->clock; -+ psi_task_tick(rq); -+ perf_event_task_tick(); -+ sched_stop_tick(rq, cpu); -+} -+ -+#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ -+ defined(CONFIG_TRACE_PREEMPT_TOGGLE)) -+/* -+ * If the value passed in is equal to the current preempt count -+ * then we just disabled preemption. Start timing the latency. -+ */ -+static inline void preempt_latency_start(int val) -+{ -+ if (preempt_count() == val) { -+ unsigned long ip = get_lock_parent_ip(); -+#ifdef CONFIG_DEBUG_PREEMPT -+ current->preempt_disable_ip = ip; -+#endif -+ trace_preempt_off(CALLER_ADDR0, ip); -+ } -+} -+ -+void preempt_count_add(int val) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Underflow? 
-+ */ -+ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) -+ return; -+#endif -+ __preempt_count_add(val); -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Spinlock count overflowing soon? -+ */ -+ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= -+ PREEMPT_MASK - 10); -+#endif -+ preempt_latency_start(val); -+} -+EXPORT_SYMBOL(preempt_count_add); -+NOKPROBE_SYMBOL(preempt_count_add); -+ -+/* -+ * If the value passed in equals to the current preempt count -+ * then we just enabled preemption. Stop timing the latency. -+ */ -+static inline void preempt_latency_stop(int val) -+{ -+ if (preempt_count() == val) -+ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); -+} -+ -+void preempt_count_sub(int val) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Underflow? -+ */ -+ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) -+ return; -+ /* -+ * Is the spinlock portion underflowing? -+ */ -+ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && -+ !(preempt_count() & PREEMPT_MASK))) -+ return; -+#endif -+ -+ preempt_latency_stop(val); -+ __preempt_count_sub(val); -+} -+EXPORT_SYMBOL(preempt_count_sub); -+NOKPROBE_SYMBOL(preempt_count_sub); -+ -+#else -+static inline void preempt_latency_start(int val) { } -+static inline void preempt_latency_stop(int val) { } -+#endif -+ -+static inline unsigned long get_preempt_disable_ip(struct task_struct *p) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ return p->preempt_disable_ip; -+#else -+ return 0; -+#endif -+} -+ -+/* -+ * The time_slice is only refilled when it is empty and that is when we set a -+ * new deadline. Make sure update_clocks has been called recently to update -+ * rq->niffies. 
-+ */ -+static void time_slice_expired(struct task_struct *p, struct rq *rq) -+{ -+ p->time_slice = timeslice(); -+ p->deadline = rq->niffies + task_deadline_diff(p); -+#ifdef CONFIG_SMT_NICE -+ if (!p->mm) -+ p->smt_bias = 0; -+ else if (rt_task(p)) -+ p->smt_bias = 1 << 30; -+ else if (task_running_iso(p)) -+ p->smt_bias = 1 << 29; -+ else if (idleprio_task(p)) { -+ if (task_running_idle(p)) -+ p->smt_bias = 0; -+ else -+ p->smt_bias = 1; -+ } else if (--p->smt_bias < 1) -+ p->smt_bias = MAX_PRIO - p->static_prio; -+#endif -+} -+ -+/* -+ * Timeslices below RESCHED_US are considered as good as expired as there's no -+ * point rescheduling when there's so little time left. SCHED_BATCH tasks -+ * have been flagged be not latency sensitive and likely to be fully CPU -+ * bound so every time they're rescheduled they have their time_slice -+ * refilled, but get a new later deadline to have little effect on -+ * SCHED_NORMAL tasks. -+ -+ */ -+static inline void check_deadline(struct task_struct *p, struct rq *rq) -+{ -+ if (p->time_slice < RESCHED_US || batch_task(p)) -+ time_slice_expired(p, rq); -+} -+ -+/* -+ * Task selection with skiplists is a simple matter of picking off the first -+ * task in the sorted list, an O(1) operation. The lookup is amortised O(1) -+ * being bound to the number of processors. -+ * -+ * Runqueues are selectively locked based on their unlocked data and then -+ * unlocked if not needed. At most 3 locks will be held at any time and are -+ * released as soon as they're no longer needed. All balancing between CPUs -+ * is thus done here in an extremely simple first come best fit manner. -+ * -+ * This iterates over runqueues in cache locality order. In interactive mode -+ * it iterates over all CPUs and finds the task with the best key/deadline. -+ * In non-interactive mode it will only take a task if it's from the current -+ * runqueue or a runqueue with more tasks than the current one with a better -+ * key/deadline. 
-+ */ -+#ifdef CONFIG_SMP -+static inline struct task_struct -+*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) -+{ -+ struct rq *locked = NULL, *chosen = NULL; -+ struct task_struct *edt = idle; -+ int i, best_entries = 0; -+ u64 best_key = ~0ULL; -+ -+ for (i = 0; i < total_runqueues; i++) { -+ struct rq *other_rq = rq_order(rq, i); -+ skiplist_node *next; -+ int entries; -+ -+ entries = other_rq->sl->entries; -+ /* -+ * Check for queued entres lockless first. The local runqueue -+ * is locked so entries will always be accurate. -+ */ -+ if (!sched_interactive) { -+ /* -+ * Don't reschedule balance across nodes unless the CPU -+ * is idle. -+ */ -+ if (edt != idle && rq->cpu_locality[other_rq->cpu] > LOCALITY_SMP) -+ break; -+ if (entries <= best_entries) -+ continue; -+ } else if (!entries) -+ continue; -+ -+ /* if (i) implies other_rq != rq */ -+ if (i) { -+ /* Check for best id queued lockless first */ -+ if (other_rq->best_key >= best_key) -+ continue; -+ -+ if (unlikely(!trylock_rq(rq, other_rq))) -+ continue; -+ -+ /* Need to reevaluate entries after locking */ -+ entries = other_rq->sl->entries; -+ if (unlikely(!entries)) { -+ unlock_rq(other_rq); -+ continue; -+ } -+ } -+ -+ next = other_rq->node; -+ /* -+ * In interactive mode we check beyond the best entry on other -+ * runqueues if we can't get the best for smt or affinity -+ * reasons. 
-+ */ -+ while ((next = next->next[0]) != other_rq->node) { -+ struct task_struct *p; -+ u64 key = next->key; -+ -+ /* Reevaluate key after locking */ -+ if (key >= best_key) -+ break; -+ -+ p = next->value; -+ if (!smt_schedule(p, rq)) { -+ if (i && !sched_interactive) -+ break; -+ continue; -+ } -+ -+ if (sched_other_cpu(p, cpu)) { -+ if (sched_interactive || !i) -+ continue; -+ break; -+ } -+ /* Make sure affinity is ok */ -+ if (i) { -+ /* From this point on p is the best so far */ -+ if (locked) -+ unlock_rq(locked); -+ chosen = locked = other_rq; -+ } -+ best_entries = entries; -+ best_key = key; -+ edt = p; -+ break; -+ } -+ /* rq->preempting is a hint only as the state may have changed -+ * since it was set with the resched call but if we have met -+ * the condition we can break out here. */ -+ if (edt == rq->preempting) -+ break; -+ if (i && other_rq != chosen) -+ unlock_rq(other_rq); -+ } -+ -+ if (likely(edt != idle)) -+ take_task(rq, cpu, edt); -+ -+ if (locked) -+ unlock_rq(locked); -+ -+ rq->preempting = NULL; -+ -+ return edt; -+} -+#else /* CONFIG_SMP */ -+static inline struct task_struct -+*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) -+{ -+ struct task_struct *edt; -+ -+ if (unlikely(!rq->sl->entries)) -+ return idle; -+ edt = rq->node->next[0]->value; -+ take_task(rq, cpu, edt); -+ return edt; -+} -+#endif /* CONFIG_SMP */ -+ -+/* -+ * Print scheduling while atomic bug: -+ */ -+static noinline void __schedule_bug(struct task_struct *prev) -+{ -+ /* Save this before calling printk(), since that will clobber it */ -+ unsigned long preempt_disable_ip = get_preempt_disable_ip(current); -+ -+ if (oops_in_progress) -+ return; -+ -+ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", -+ prev->comm, prev->pid, preempt_count()); -+ -+ debug_show_held_locks(prev); -+ print_modules(); -+ if (irqs_disabled()) -+ print_irqtrace_events(prev); -+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) -+ && in_atomic_preempt_off()) { -+ 
pr_err("Preemption disabled at:"); -+ print_ip_sym(preempt_disable_ip); -+ pr_cont("\n"); -+ } -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+ -+/* -+ * Various schedule()-time debugging checks and statistics: -+ */ -+static inline void schedule_debug(struct task_struct *prev, bool preempt) -+{ -+#ifdef CONFIG_SCHED_STACK_END_CHECK -+ if (task_stack_end_corrupted(prev)) -+ panic("corrupted stack end detected inside scheduler\n"); -+#endif -+ -+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP -+ if (!preempt && prev->state && prev->non_block_count) { -+ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", -+ prev->comm, prev->pid, prev->non_block_count); -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+ } -+#endif -+ -+ if (unlikely(in_atomic_preempt_off())) { -+ __schedule_bug(prev); -+ preempt_count_set(PREEMPT_DISABLED); -+ } -+ rcu_sleep_check(); -+ -+ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); -+ -+ schedstat_inc(this_rq()->sched_count); -+} -+ -+/* -+ * The currently running task's information is all stored in rq local data -+ * which is only modified by the local CPU. 
-+ */ -+static inline void set_rq_task(struct rq *rq, struct task_struct *p) -+{ -+ if (p == rq->idle || p->policy == SCHED_FIFO) -+ hrexpiry_clear(rq); -+ else -+ hrexpiry_start(rq, US_TO_NS(p->time_slice)); -+ if (rq->clock - rq->last_tick > HALF_JIFFY_NS) -+ rq->dither = 0; -+ else -+ rq->dither = rq_dither(rq); -+ -+ rq->rq_deadline = p->deadline; -+ rq->rq_prio = p->prio; -+#ifdef CONFIG_SMT_NICE -+ rq->rq_mm = p->mm; -+ rq->rq_smt_bias = p->smt_bias; -+#endif -+} -+ -+#ifdef CONFIG_SMT_NICE -+static void check_no_siblings(struct rq __maybe_unused *this_rq) {} -+static void wake_no_siblings(struct rq __maybe_unused *this_rq) {} -+static void (*check_siblings)(struct rq *this_rq) = &check_no_siblings; -+static void (*wake_siblings)(struct rq *this_rq) = &wake_no_siblings; -+ -+/* Iterate over smt siblings when we've scheduled a process on cpu and decide -+ * whether they should continue running or be descheduled. */ -+static void check_smt_siblings(struct rq *this_rq) -+{ -+ int other_cpu; -+ -+ for_each_cpu(other_cpu, &this_rq->thread_mask) { -+ struct task_struct *p; -+ struct rq *rq; -+ -+ rq = cpu_rq(other_cpu); -+ if (rq_idle(rq)) -+ continue; -+ p = rq->curr; -+ if (!smt_schedule(p, this_rq)) -+ resched_curr(rq); -+ } -+} -+ -+static void wake_smt_siblings(struct rq *this_rq) -+{ -+ int other_cpu; -+ -+ for_each_cpu(other_cpu, &this_rq->thread_mask) { -+ struct rq *rq; -+ -+ rq = cpu_rq(other_cpu); -+ if (rq_idle(rq)) -+ resched_idle(rq); -+ } -+} -+#else -+static void check_siblings(struct rq __maybe_unused *this_rq) {} -+static void wake_siblings(struct rq __maybe_unused *this_rq) {} -+#endif -+ -+/* -+ * schedule() is the main scheduler function. -+ * -+ * The main means of driving the scheduler and thus entering this function are: -+ * -+ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. -+ * -+ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return -+ * paths. For example, see arch/x86/entry_64.S. 
-+ * -+ * To drive preemption between tasks, the scheduler sets the flag in timer -+ * interrupt handler scheduler_tick(). -+ * -+ * 3. Wakeups don't really cause entry into schedule(). They add a -+ * task to the run-queue and that's it. -+ * -+ * Now, if the new task added to the run-queue preempts the current -+ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets -+ * called on the nearest possible occasion: -+ * -+ * - If the kernel is preemptible (CONFIG_PREEMPTION=y): -+ * -+ * - in syscall or exception context, at the next outmost -+ * preempt_enable(). (this might be as soon as the wake_up()'s -+ * spin_unlock()!) -+ * -+ * - in IRQ context, return from interrupt-handler to -+ * preemptible context -+ * -+ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set) -+ * then at the next: -+ * -+ * - cond_resched() call -+ * - explicit schedule() call -+ * - return from syscall or exception to user-space -+ * - return from interrupt-handler to user-space -+ * -+ * WARNING: must be called with preemption disabled! -+ */ -+static void __sched notrace __schedule(bool preempt) -+{ -+ struct task_struct *prev, *next, *idle; -+ unsigned long *switch_count; -+ bool deactivate = false; -+ struct rq *rq; -+ u64 niffies; -+ int cpu; -+ -+ cpu = smp_processor_id(); -+ rq = cpu_rq(cpu); -+ prev = rq->curr; -+ idle = rq->idle; -+ -+ schedule_debug(prev, preempt); -+ -+ local_irq_disable(); -+ rcu_note_context_switch(preempt); -+ -+ /* -+ * Make sure that signal_pending_state()->signal_pending() below -+ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) -+ * done by the caller to avoid the race with signal_wake_up(). -+ * -+ * The membarrier system call requires a full memory barrier -+ * after coming from user-space, before storing to rq->curr. 
-+ */ -+ rq_lock(rq); -+ smp_mb__after_spinlock(); -+#ifdef CONFIG_SMP -+ if (rq->preempt) { -+ /* -+ * Make sure resched_curr hasn't triggered a preemption -+ * locklessly on a task that has since scheduled away. Spurious -+ * wakeup of idle is okay though. -+ */ -+ if (unlikely(preempt && prev != idle && !test_tsk_need_resched(prev))) { -+ rq->preempt = NULL; -+ clear_preempt_need_resched(); -+ rq_unlock_irq(rq, NULL); -+ return; -+ } -+ rq->preempt = NULL; -+ } -+#endif -+ -+ switch_count = &prev->nivcsw; -+ if (!preempt && prev->state) { -+ if (signal_pending_state(prev->state, prev)) { -+ prev->state = TASK_RUNNING; -+ } else { -+ deactivate = true; -+ -+ if (prev->in_iowait) { -+ atomic_inc(&rq->nr_iowait); -+ delayacct_blkio_start(); -+ } -+ } -+ switch_count = &prev->nvcsw; -+ } -+ -+ /* -+ * Store the niffy value here for use by the next task's last_ran -+ * below to avoid losing niffies due to update_clocks being called -+ * again after this point. -+ */ -+ update_clocks(rq); -+ niffies = rq->niffies; -+ update_cpu_clock_switch(rq, prev); -+ -+ clear_tsk_need_resched(prev); -+ clear_preempt_need_resched(); -+ -+ if (idle != prev) { -+ check_deadline(prev, rq); -+ return_task(prev, rq, cpu, deactivate); -+ } -+ -+ next = earliest_deadline_task(rq, cpu, idle); -+ if (likely(next->prio != PRIO_LIMIT)) -+ clear_cpuidle_map(cpu); -+ else { -+ set_cpuidle_map(cpu); -+ update_load_avg(rq, 0); -+ } -+ -+ set_rq_task(rq, next); -+ next->last_ran = niffies; -+ -+ if (likely(prev != next)) { -+ /* -+ * Don't reschedule an idle task or deactivated tasks -+ */ -+ if (prev == idle) { -+ rq->nr_running++; -+ if (rt_task(next)) -+ rq->rt_nr_running++; -+ } else if (!deactivate) -+ resched_suitable_idle(prev); -+ if (unlikely(next == idle)) { -+ rq->nr_running--; -+ if (rt_task(prev)) -+ rq->rt_nr_running--; -+ wake_siblings(rq); -+ } else -+ check_siblings(rq); -+ rq->nr_switches++; -+ /* -+ * RCU users of rcu_dereference(rq->curr) may not see -+ * changes to task_struct 
made by pick_next_task(). -+ */ -+ RCU_INIT_POINTER(rq->curr, next); -+ /* -+ * The membarrier system call requires each architecture -+ * to have a full memory barrier after updating -+ * rq->curr, before returning to user-space. -+ * -+ * Here are the schemes providing that barrier on the -+ * various architectures: -+ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. -+ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. -+ * - finish_lock_switch() for weakly-ordered -+ * architectures where spin_unlock is a full barrier, -+ * - switch_to() for arm64 (weakly-ordered, spin_unlock -+ * is a RELEASE barrier), -+ */ -+ ++*switch_count; -+ -+ psi_sched_switch(prev, next, !task_on_rq_queued(prev)); -+ -+ trace_sched_switch(preempt, prev, next); -+ context_switch(rq, prev, next); /* unlocks the rq */ -+ } else { -+ check_siblings(rq); -+ rq_unlock(rq); -+ local_irq_enable(); -+ } -+} -+ -+void __noreturn do_task_dead(void) -+{ -+ /* Causes final put_task_struct in finish_task_switch(). */ -+ set_special_state(TASK_DEAD); -+ -+ /* Tell freezer to ignore us: */ -+ current->flags |= PF_NOFREEZE; -+ __schedule(false); -+ BUG(); -+ -+ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ -+ for (;;) -+ cpu_relax(); -+} -+ -+static inline void sched_submit_work(struct task_struct *tsk) -+{ -+ if (!tsk->state) -+ return; -+ -+ /* -+ * If a worker went to sleep, notify and ask workqueue whether -+ * it wants to wake up a task to maintain concurrency. -+ * As this function is called inside the schedule() context, -+ * we disable preemption to avoid it calling schedule() again -+ * in the possible wakeup of a kworker and because wq_worker_sleeping() -+ * requires it. 
-+ */ -+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { -+ preempt_disable(); -+ if (tsk->flags & PF_WQ_WORKER) -+ wq_worker_sleeping(tsk); -+ else -+ io_wq_worker_sleeping(tsk); -+ preempt_enable_no_resched(); -+ } -+ -+ if (tsk_is_pi_blocked(tsk)) -+ return; -+ -+ /* -+ * If we are going to sleep and we have plugged IO queued, -+ * make sure to submit it to avoid deadlocks. -+ */ -+ if (blk_needs_flush_plug(tsk)) -+ blk_schedule_flush_plug(tsk); -+} -+ -+static inline void sched_update_worker(struct task_struct *tsk) -+{ -+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { -+ if (tsk->flags & PF_WQ_WORKER) -+ wq_worker_running(tsk); -+ else -+ io_wq_worker_running(tsk); -+ } -+} -+ -+asmlinkage __visible void __sched schedule(void) -+{ -+ struct task_struct *tsk = current; -+ -+ sched_submit_work(tsk); -+ do { -+ preempt_disable(); -+ __schedule(false); -+ sched_preempt_enable_no_resched(); -+ } while (need_resched()); -+ sched_update_worker(tsk); -+} -+ -+EXPORT_SYMBOL(schedule); -+ -+/* -+ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted -+ * state (have scheduled out non-voluntarily) by making sure that all -+ * tasks have either left the run queue or have gone into user space. -+ * As idle tasks do not do either, they must not ever be preempted -+ * (schedule out non-voluntarily). -+ * -+ * schedule_idle() is similar to schedule_preempt_disable() except that it -+ * never enables preemption because it does not call sched_submit_work(). -+ */ -+void __sched schedule_idle(void) -+{ -+ /* -+ * As this skips calling sched_submit_work(), which the idle task does -+ * regardless because that function is a nop when the task is in a -+ * TASK_RUNNING state, make sure this isn't used someplace that the -+ * current task can be in any other state. Note, idle is always in the -+ * TASK_RUNNING state. 
-+ */ -+ WARN_ON_ONCE(current->state); -+ do { -+ __schedule(false); -+ } while (need_resched()); -+} -+ -+#ifdef CONFIG_CONTEXT_TRACKING -+asmlinkage __visible void __sched schedule_user(void) -+{ -+ /* -+ * If we come here after a random call to set_need_resched(), -+ * or we have been woken up remotely but the IPI has not yet arrived, -+ * we haven't yet exited the RCU idle mode. Do it here manually until -+ * we find a better solution. -+ * -+ * NB: There are buggy callers of this function. Ideally we -+ * should warn if prev_state != IN_USER, but that will trigger -+ * too frequently to make sense yet. -+ */ -+ enum ctx_state prev_state = exception_enter(); -+ schedule(); -+ exception_exit(prev_state); -+} -+#endif -+ -+/** -+ * schedule_preempt_disabled - called with preemption disabled -+ * -+ * Returns with preemption disabled. Note: preempt_count must be 1 -+ */ -+void __sched schedule_preempt_disabled(void) -+{ -+ sched_preempt_enable_no_resched(); -+ schedule(); -+ preempt_disable(); -+} -+ -+static void __sched notrace preempt_schedule_common(void) -+{ -+ do { -+ /* -+ * Because the function tracer can trace preempt_count_sub() -+ * and it also uses preempt_enable/disable_notrace(), if -+ * NEED_RESCHED is set, the preempt_enable_notrace() called -+ * by the function tracer will call this function again and -+ * cause infinite recursion. -+ * -+ * Preemption must be disabled here before the function -+ * tracer can trace. Break up preempt_disable() into two -+ * calls. One to disable preemption without fear of being -+ * traced. The other to still record the preemption latency, -+ * which can also be traced by the function tracer. -+ */ -+ preempt_disable_notrace(); -+ preempt_latency_start(1); -+ __schedule(true); -+ preempt_latency_stop(1); -+ preempt_enable_no_resched_notrace(); -+ -+ /* -+ * Check again in case we missed a preemption opportunity -+ * between schedule and now. 
-+ */ -+ } while (need_resched()); -+} -+ -+#ifdef CONFIG_PREEMPTION -+/* -+ * This is the entry point to schedule() from in-kernel preemption -+ * off of preempt_enable. -+ */ -+asmlinkage __visible void __sched notrace preempt_schedule(void) -+{ -+ /* -+ * If there is a non-zero preempt_count or interrupts are disabled, -+ * we do not want to preempt the current task. Just return.. -+ */ -+ if (likely(!preemptible())) -+ return; -+ -+ preempt_schedule_common(); -+} -+NOKPROBE_SYMBOL(preempt_schedule); -+EXPORT_SYMBOL(preempt_schedule); -+ -+/** -+ * preempt_schedule_notrace - preempt_schedule called by tracing -+ * -+ * The tracing infrastructure uses preempt_enable_notrace to prevent -+ * recursion and tracing preempt enabling caused by the tracing -+ * infrastructure itself. But as tracing can happen in areas coming -+ * from userspace or just about to enter userspace, a preempt enable -+ * can occur before user_exit() is called. This will cause the scheduler -+ * to be called when the system is still in usermode. -+ * -+ * To prevent this, the preempt_enable_notrace will use this function -+ * instead of preempt_schedule() to exit user context if needed before -+ * calling the scheduler. -+ */ -+asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) -+{ -+ enum ctx_state prev_ctx; -+ -+ if (likely(!preemptible())) -+ return; -+ -+ do { -+ /* -+ * Because the function tracer can trace preempt_count_sub() -+ * and it also uses preempt_enable/disable_notrace(), if -+ * NEED_RESCHED is set, the preempt_enable_notrace() called -+ * by the function tracer will call this function again and -+ * cause infinite recursion. -+ * -+ * Preemption must be disabled here before the function -+ * tracer can trace. Break up preempt_disable() into two -+ * calls. One to disable preemption without fear of being -+ * traced. The other to still record the preemption latency, -+ * which can also be traced by the function tracer. 
-+ */ -+ preempt_disable_notrace(); -+ preempt_latency_start(1); -+ /* -+ * Needs preempt disabled in case user_exit() is traced -+ * and the tracer calls preempt_enable_notrace() causing -+ * an infinite recursion. -+ */ -+ prev_ctx = exception_enter(); -+ __schedule(true); -+ exception_exit(prev_ctx); -+ -+ preempt_latency_stop(1); -+ preempt_enable_no_resched_notrace(); -+ } while (need_resched()); -+} -+EXPORT_SYMBOL_GPL(preempt_schedule_notrace); -+ -+#endif /* CONFIG_PREEMPTION */ -+ -+/* -+ * This is the entry point to schedule() from kernel preemption -+ * off of irq context. -+ * Note, that this is called and return with irqs disabled. This will -+ * protect us against recursive calling from irq. -+ */ -+asmlinkage __visible void __sched preempt_schedule_irq(void) -+{ -+ enum ctx_state prev_state; -+ -+ /* Catch callers which need to be fixed */ -+ BUG_ON(preempt_count() || !irqs_disabled()); -+ -+ prev_state = exception_enter(); -+ -+ do { -+ preempt_disable(); -+ local_irq_enable(); -+ __schedule(true); -+ local_irq_disable(); -+ sched_preempt_enable_no_resched(); -+ } while (need_resched()); -+ -+ exception_exit(prev_state); -+} -+ -+int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, -+ void *key) -+{ -+ return try_to_wake_up(curr->private, mode, wake_flags); -+} -+EXPORT_SYMBOL(default_wake_function); -+ -+#ifdef CONFIG_RT_MUTEXES -+ -+static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) -+{ -+ if (pi_task) -+ prio = min(prio, pi_task->prio); -+ -+ return prio; -+} -+ -+static inline int rt_effective_prio(struct task_struct *p, int prio) -+{ -+ struct task_struct *pi_task = rt_mutex_get_top_task(p); -+ -+ return __rt_effective_prio(pi_task, prio); -+} -+ -+/* -+ * rt_mutex_setprio - set the current priority of a task -+ * @p: task to boost -+ * @pi_task: donor task -+ * -+ * This function changes the 'effective' priority of a task. It does -+ * not touch ->normal_prio like __setscheduler(). 
-+ * -+ * Used by the rt_mutex code to implement priority inheritance -+ * logic. Call site only calls if the priority of the task changed. -+ */ -+void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) -+{ -+ int prio, oldprio; -+ struct rq *rq; -+ -+ /* XXX used to be waiter->prio, not waiter->task->prio */ -+ prio = __rt_effective_prio(pi_task, p->normal_prio); -+ -+ /* -+ * If nothing changed; bail early. -+ */ -+ if (p->pi_top_task == pi_task && prio == p->prio) -+ return; -+ -+ rq = __task_rq_lock(p, NULL); -+ update_rq_clock(rq); -+ /* -+ * Set under pi_lock && rq->lock, such that the value can be used under -+ * either lock. -+ * -+ * Note that there is loads of tricky to make this pointer cache work -+ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to -+ * ensure a task is de-boosted (pi_task is set to NULL) before the -+ * task is allowed to run again (and can exit). This ensures the pointer -+ * points to a blocked task -- which guaratees the task is present. -+ */ -+ p->pi_top_task = pi_task; -+ -+ /* -+ * For FIFO/RR we only need to set prio, if that matches we're done. -+ */ -+ if (prio == p->prio) -+ goto out_unlock; -+ -+ /* -+ * Idle task boosting is a nono in general. There is one -+ * exception, when PREEMPT_RT and NOHZ is active: -+ * -+ * The idle task calls get_next_timer_interrupt() and holds -+ * the timer wheel base->lock on the CPU and another CPU wants -+ * to access the timer (probably to cancel it). We can safely -+ * ignore the boosting request, as the idle CPU runs this code -+ * with interrupts disabled and will complete the lock -+ * protected section without being interrupted. So there is no -+ * real need to boost. 
-+ */ -+ if (unlikely(p == rq->idle)) { -+ WARN_ON(p != rq->curr); -+ WARN_ON(p->pi_blocked_on); -+ goto out_unlock; -+ } -+ -+ trace_sched_pi_setprio(p, pi_task); -+ oldprio = p->prio; -+ p->prio = prio; -+ if (task_running(rq, p)){ -+ if (prio > oldprio) -+ resched_task(p); -+ } else if (task_queued(p)) { -+ dequeue_task(rq, p, DEQUEUE_SAVE); -+ enqueue_task(rq, p, ENQUEUE_RESTORE); -+ if (prio < oldprio) -+ try_preempt(p, rq); -+ } -+out_unlock: -+ __task_rq_unlock(rq, NULL); -+} -+#else -+static inline int rt_effective_prio(struct task_struct *p, int prio) -+{ -+ return prio; -+} -+#endif -+ -+/* -+ * Adjust the deadline for when the priority is to change, before it's -+ * changed. -+ */ -+static inline void adjust_deadline(struct task_struct *p, int new_prio) -+{ -+ p->deadline += static_deadline_diff(new_prio) - task_deadline_diff(p); -+} -+ -+void set_user_nice(struct task_struct *p, long nice) -+{ -+ int new_static, old_static; -+ struct rq_flags rf; -+ struct rq *rq; -+ -+ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) -+ return; -+ new_static = NICE_TO_PRIO(nice); -+ /* -+ * We have to be careful, if called from sys_setpriority(), -+ * the task might be in the middle of scheduling on another CPU. 
-+ */ -+ rq = task_rq_lock(p, &rf); -+ update_rq_clock(rq); -+ -+ /* -+ * The RT priorities are set via sched_setscheduler(), but we still -+ * allow the 'normal' nice value to be set - but as expected -+ * it wont have any effect on scheduling until the task is -+ * not SCHED_NORMAL/SCHED_BATCH: -+ */ -+ if (has_rt_policy(p)) { -+ p->static_prio = new_static; -+ goto out_unlock; -+ } -+ -+ adjust_deadline(p, new_static); -+ old_static = p->static_prio; -+ p->static_prio = new_static; -+ p->prio = effective_prio(p); -+ -+ if (task_queued(p)) { -+ dequeue_task(rq, p, DEQUEUE_SAVE); -+ enqueue_task(rq, p, ENQUEUE_RESTORE); -+ if (new_static < old_static) -+ try_preempt(p, rq); -+ } else if (task_running(rq, p)) { -+ set_rq_task(rq, p); -+ if (old_static < new_static) -+ resched_task(p); -+ } -+out_unlock: -+ task_rq_unlock(rq, p, &rf); -+} -+EXPORT_SYMBOL(set_user_nice); -+ -+/* -+ * can_nice - check if a task can reduce its nice value -+ * @p: task -+ * @nice: nice value -+ */ -+int can_nice(const struct task_struct *p, const int nice) -+{ -+ /* Convert nice value [19,-20] to rlimit style value [1,40] */ -+ int nice_rlim = nice_to_rlimit(nice); -+ -+ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || -+ capable(CAP_SYS_NICE)); -+} -+ -+#ifdef __ARCH_WANT_SYS_NICE -+ -+/* -+ * sys_nice - change the priority of the current process. -+ * @increment: priority increment -+ * -+ * sys_setpriority is a more generic, but much slower function that -+ * does similar things. -+ */ -+SYSCALL_DEFINE1(nice, int, increment) -+{ -+ long nice, retval; -+ -+ /* -+ * Setpriority might change our priority at the same moment. -+ * We don't have to worry. Conceptually one call occurs first -+ * and we have a single winner. 
-+ */ -+ -+ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); -+ nice = task_nice(current) + increment; -+ -+ nice = clamp_val(nice, MIN_NICE, MAX_NICE); -+ if (increment < 0 && !can_nice(current, nice)) -+ return -EPERM; -+ -+ retval = security_task_setnice(current, nice); -+ if (retval) -+ return retval; -+ -+ set_user_nice(current, nice); -+ return 0; -+} -+ -+#endif -+ -+/** -+ * task_prio - return the priority value of a given task. -+ * @p: the task in question. -+ * -+ * Return: The priority value as seen by users in /proc. -+ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes -+ * from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO). -+ */ -+int task_prio(const struct task_struct *p) -+{ -+ int delta, prio = p->prio - MAX_RT_PRIO; -+ -+ /* rt tasks and iso tasks */ -+ if (prio <= 0) -+ goto out; -+ -+ /* Convert to ms to avoid overflows */ -+ delta = NS_TO_MS(p->deadline - task_rq(p)->niffies); -+ if (unlikely(delta < 0)) -+ delta = 0; -+ delta = delta * 40 / ms_longest_deadline_diff(); -+ if (delta <= 80) -+ prio += delta; -+ if (idleprio_task(p)) -+ prio += 40; -+out: -+ return prio; -+} -+ -+/** -+ * idle_cpu - is a given CPU idle currently? -+ * @cpu: the processor in question. -+ * -+ * Return: 1 if the CPU is currently idle. 0 otherwise. -+ */ -+int idle_cpu(int cpu) -+{ -+ return cpu_curr(cpu) == cpu_rq(cpu)->idle; -+} -+ -+/** -+ * available_idle_cpu - is a given CPU idle for enqueuing work. -+ * @cpu: the CPU in question. -+ * -+ * Return: 1 if the CPU is currently idle. 0 otherwise. -+ */ -+int available_idle_cpu(int cpu) -+{ -+ if (!idle_cpu(cpu)) -+ return 0; -+ -+ if (vcpu_is_preempted(cpu)) -+ return 0; -+ -+ return 1; -+} -+ -+/** -+ * idle_task - return the idle task for a given CPU. -+ * @cpu: the processor in question. -+ * -+ * Return: The idle task for the CPU @cpu. 
-+ */ -+struct task_struct *idle_task(int cpu) -+{ -+ return cpu_rq(cpu)->idle; -+} -+ -+/** -+ * find_process_by_pid - find a process with a matching PID value. -+ * @pid: the pid in question. -+ * -+ * The task of @pid, if found. %NULL otherwise. -+ */ -+static inline struct task_struct *find_process_by_pid(pid_t pid) -+{ -+ return pid ? find_task_by_vpid(pid) : current; -+} -+ -+/* Actually do priority change: must hold rq lock. */ -+static void __setscheduler(struct task_struct *p, struct rq *rq, int policy, -+ int prio, const struct sched_attr *attr, -+ bool keep_boost) -+{ -+ int oldrtprio, oldprio; -+ -+ /* -+ * If params can't change scheduling class changes aren't allowed -+ * either. -+ */ -+ if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS) -+ return; -+ -+ p->policy = policy; -+ oldrtprio = p->rt_priority; -+ p->rt_priority = prio; -+ p->normal_prio = normal_prio(p); -+ oldprio = p->prio; -+ /* -+ * Keep a potential priority boosting if called from -+ * sched_setscheduler(). -+ */ -+ p->prio = normal_prio(p); -+ if (keep_boost) -+ p->prio = rt_effective_prio(p, p->prio); -+ -+ if (task_running(rq, p)) { -+ set_rq_task(rq, p); -+ resched_task(p); -+ } else if (task_queued(p)) { -+ dequeue_task(rq, p, DEQUEUE_SAVE); -+ enqueue_task(rq, p, ENQUEUE_RESTORE); -+ if (p->prio < oldprio || p->rt_priority > oldrtprio) -+ try_preempt(p, rq); -+ } -+} -+ -+/* -+ * Check the target process has a UID that matches the current process's -+ */ -+static bool check_same_owner(struct task_struct *p) -+{ -+ const struct cred *cred = current_cred(), *pcred; -+ bool match; -+ -+ rcu_read_lock(); -+ pcred = __task_cred(p); -+ match = (uid_eq(cred->euid, pcred->euid) || -+ uid_eq(cred->euid, pcred->uid)); -+ rcu_read_unlock(); -+ return match; -+} -+ -+static int __sched_setscheduler(struct task_struct *p, -+ const struct sched_attr *attr, -+ bool user, bool pi) -+{ -+ int retval, policy = attr->sched_policy, oldpolicy = -1, priority = attr->sched_priority; -+ unsigned long 
rlim_rtprio = 0; -+ struct rq_flags rf; -+ int reset_on_fork; -+ struct rq *rq; -+ -+ /* The pi code expects interrupts enabled */ -+ BUG_ON(pi && in_interrupt()); -+ -+ if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) { -+ unsigned long lflags; -+ -+ if (!lock_task_sighand(p, &lflags)) -+ return -ESRCH; -+ rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); -+ unlock_task_sighand(p, &lflags); -+ if (rlim_rtprio) -+ goto recheck; -+ /* -+ * If the caller requested an RT policy without having the -+ * necessary rights, we downgrade the policy to SCHED_ISO. -+ * We also set the parameter to zero to pass the checks. -+ */ -+ policy = SCHED_ISO; -+ priority = 0; -+ } -+recheck: -+ /* Double check policy once rq lock held */ -+ if (policy < 0) { -+ reset_on_fork = p->sched_reset_on_fork; -+ policy = oldpolicy = p->policy; -+ } else { -+ reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); -+ policy &= ~SCHED_RESET_ON_FORK; -+ -+ if (!SCHED_RANGE(policy)) -+ return -EINVAL; -+ } -+ -+ if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV)) -+ return -EINVAL; -+ -+ /* -+ * Valid priorities for SCHED_FIFO and SCHED_RR are -+ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and -+ * SCHED_BATCH is 0. 
-+ */ -+ if (priority < 0 || -+ (p->mm && priority > MAX_USER_RT_PRIO - 1) || -+ (!p->mm && priority > MAX_RT_PRIO - 1)) -+ return -EINVAL; -+ if (is_rt_policy(policy) != (priority != 0)) -+ return -EINVAL; -+ -+ /* -+ * Allow unprivileged RT tasks to decrease priority: -+ */ -+ if (user && !capable(CAP_SYS_NICE)) { -+ if (is_rt_policy(policy)) { -+ unsigned long rlim_rtprio = -+ task_rlimit(p, RLIMIT_RTPRIO); -+ -+ /* Can't set/change the rt policy */ -+ if (policy != p->policy && !rlim_rtprio) -+ return -EPERM; -+ -+ /* Can't increase priority */ -+ if (priority > p->rt_priority && -+ priority > rlim_rtprio) -+ return -EPERM; -+ } else { -+ switch (p->policy) { -+ /* -+ * Can only downgrade policies but not back to -+ * SCHED_NORMAL -+ */ -+ case SCHED_ISO: -+ if (policy == SCHED_ISO) -+ goto out; -+ if (policy != SCHED_NORMAL) -+ return -EPERM; -+ break; -+ case SCHED_BATCH: -+ if (policy == SCHED_BATCH) -+ goto out; -+ if (policy != SCHED_IDLEPRIO) -+ return -EPERM; -+ break; -+ case SCHED_IDLEPRIO: -+ if (policy == SCHED_IDLEPRIO) -+ goto out; -+ return -EPERM; -+ default: -+ break; -+ } -+ } -+ -+ /* Can't change other user's priorities */ -+ if (!check_same_owner(p)) -+ return -EPERM; -+ -+ /* Normal users shall not reset the sched_reset_on_fork flag: */ -+ if (p->sched_reset_on_fork && !reset_on_fork) -+ return -EPERM; -+ } -+ -+ if (user) { -+ retval = security_task_setscheduler(p); -+ if (retval) -+ return retval; -+ } -+ -+ if (pi) -+ cpuset_read_lock(); -+ -+ /* -+ * Make sure no PI-waiters arrive (or leave) while we are -+ * changing the priority of the task: -+ * -+ * To be able to change p->policy safely, the runqueue lock must be -+ * held. 
-+ */ -+ rq = task_rq_lock(p, &rf); -+ update_rq_clock(rq); -+ -+ /* -+ * Changing the policy of the stop threads its a very bad idea: -+ */ -+ if (p == rq->stop) { -+ retval = -EINVAL; -+ goto unlock; -+ } -+ -+ /* -+ * If not changing anything there's no need to proceed further: -+ */ -+ if (unlikely(policy == p->policy && (!is_rt_policy(policy) || -+ priority == p->rt_priority))) { -+ retval = 0; -+ goto unlock; -+ } -+ -+ /* Re-check policy now with rq lock held */ -+ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { -+ policy = oldpolicy = -1; -+ task_rq_unlock(rq, p, &rf); -+ if (pi) -+ cpuset_read_unlock(); -+ goto recheck; -+ } -+ p->sched_reset_on_fork = reset_on_fork; -+ -+ __setscheduler(p, rq, policy, priority, attr, pi); -+ -+ /* Avoid rq from going away on us: */ -+ preempt_disable(); -+ task_rq_unlock(rq, p, &rf); -+ -+ if (pi) { -+ cpuset_read_unlock(); -+ rt_mutex_adjust_pi(p); -+ } -+ preempt_enable(); -+out: -+ return 0; -+ -+unlock: -+ task_rq_unlock(rq, p, &rf); -+ if (pi) -+ cpuset_read_unlock(); -+ return retval; -+} -+ -+static int _sched_setscheduler(struct task_struct *p, int policy, -+ const struct sched_param *param, bool check) -+{ -+ struct sched_attr attr = { -+ .sched_policy = policy, -+ .sched_priority = param->sched_priority, -+ .sched_nice = PRIO_TO_NICE(p->static_prio), -+ }; -+ -+ return __sched_setscheduler(p, &attr, check, true); -+} -+/** -+ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. -+ * @p: the task in question. -+ * @policy: new policy. -+ * @param: structure containing the new RT priority. -+ * -+ * Return: 0 on success. An error code otherwise. -+ * -+ * NOTE that the task may be already dead. 
-+ */ -+int sched_setscheduler(struct task_struct *p, int policy, -+ const struct sched_param *param) -+{ -+ return _sched_setscheduler(p, policy, param, true); -+} -+ -+EXPORT_SYMBOL_GPL(sched_setscheduler); -+ -+int sched_setattr(struct task_struct *p, const struct sched_attr *attr) -+{ -+ return __sched_setscheduler(p, attr, true, true); -+} -+EXPORT_SYMBOL_GPL(sched_setattr); -+ -+int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) -+{ -+ return __sched_setscheduler(p, attr, false, true); -+} -+ -+/** -+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. -+ * @p: the task in question. -+ * @policy: new policy. -+ * @param: structure containing the new RT priority. -+ * -+ * Just like sched_setscheduler, only don't bother checking if the -+ * current context has permission. For example, this is needed in -+ * stop_machine(): we create temporary high priority worker threads, -+ * but our caller might not have that capability. -+ * -+ * Return: 0 on success. An error code otherwise. -+ */ -+int sched_setscheduler_nocheck(struct task_struct *p, int policy, -+ const struct sched_param *param) -+{ -+ return _sched_setscheduler(p, policy, param, false); -+} -+EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); -+ -+static int -+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) -+{ -+ struct sched_param lparam; -+ struct task_struct *p; -+ int retval; -+ -+ if (!param || pid < 0) -+ return -EINVAL; -+ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) -+ return -EFAULT; -+ -+ rcu_read_lock(); -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (likely(p)) -+ get_task_struct(p); -+ rcu_read_unlock(); -+ -+ if (likely(p)) { -+ retval = sched_setscheduler(p, policy, &lparam); -+ put_task_struct(p); -+ } -+ -+ return retval; -+} -+ -+/* -+ * Mimics kernel/events/core.c perf_copy_attr(). 
-+ */ -+static int sched_copy_attr(struct sched_attr __user *uattr, -+ struct sched_attr *attr) -+{ -+ u32 size; -+ int ret; -+ -+ /* Zero the full structure, so that a short copy will be nice: */ -+ memset(attr, 0, sizeof(*attr)); -+ -+ ret = get_user(size, &uattr->size); -+ if (ret) -+ return ret; -+ -+ /* ABI compatibility quirk: */ -+ if (!size) -+ size = SCHED_ATTR_SIZE_VER0; -+ -+ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) -+ goto err_size; -+ -+ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); -+ if (ret) { -+ if (ret == -E2BIG) -+ goto err_size; -+ return ret; -+ } -+ -+ /* -+ * XXX: Do we want to be lenient like existing syscalls; or do we want -+ * to be strict and return an error on out-of-bounds values? -+ */ -+ attr->sched_nice = clamp(attr->sched_nice, -20, 19); -+ -+ /* sched/core.c uses zero here but we already know ret is zero */ -+ return 0; -+ -+err_size: -+ put_user(sizeof(*attr), &uattr->size); -+ return -E2BIG; -+} -+ -+/* -+ * sched_setparam() passes in -1 for its policy, to let the functions -+ * it calls know not to change it. -+ */ -+#define SETPARAM_POLICY -1 -+ -+/** -+ * sys_sched_setscheduler - set/change the scheduler policy and RT priority -+ * @pid: the pid in question. -+ * @policy: new policy. -+ * @param: structure containing the new RT priority. -+ * -+ * Return: 0 on success. An error code otherwise. -+ */ -+SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) -+{ -+ if (policy < 0) -+ return -EINVAL; -+ -+ return do_sched_setscheduler(pid, policy, param); -+} -+ -+/** -+ * sys_sched_setparam - set/change the RT priority of a thread -+ * @pid: the pid in question. -+ * @param: structure containing the new RT priority. -+ * -+ * Return: 0 on success. An error code otherwise. 
-+ */ -+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) -+{ -+ return do_sched_setscheduler(pid, SETPARAM_POLICY, param); -+} -+ -+/** -+ * sys_sched_setattr - same as above, but with extended sched_attr -+ * @pid: the pid in question. -+ * @uattr: structure containing the extended parameters. -+ */ -+SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, -+ unsigned int, flags) -+{ -+ struct sched_attr attr; -+ struct task_struct *p; -+ int retval; -+ -+ if (!uattr || pid < 0 || flags) -+ return -EINVAL; -+ -+ retval = sched_copy_attr(uattr, &attr); -+ if (retval) -+ return retval; -+ -+ if ((int)attr.sched_policy < 0) -+ return -EINVAL; -+ if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY) -+ attr.sched_policy = SETPARAM_POLICY; -+ -+ rcu_read_lock(); -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (likely(p)) -+ get_task_struct(p); -+ rcu_read_unlock(); -+ -+ if (likely(p)) { -+ retval = sched_setattr(p, &attr); -+ put_task_struct(p); -+ } -+ -+ return retval; -+} -+ -+/** -+ * sys_sched_getscheduler - get the policy (scheduling class) of a thread -+ * @pid: the pid in question. -+ * -+ * Return: On success, the policy of the thread. Otherwise, a negative error -+ * code. -+ */ -+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) -+{ -+ struct task_struct *p; -+ int retval = -EINVAL; -+ -+ if (pid < 0) -+ goto out_nounlock; -+ -+ retval = -ESRCH; -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ if (p) { -+ retval = security_task_getscheduler(p); -+ if (!retval) -+ retval = p->policy; -+ } -+ rcu_read_unlock(); -+ -+out_nounlock: -+ return retval; -+} -+ -+/** -+ * sys_sched_getscheduler - get the RT priority of a thread -+ * @pid: the pid in question. -+ * @param: structure containing the RT priority. -+ * -+ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error -+ * code. 
-+ */ -+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) -+{ -+ struct sched_param lp = { .sched_priority = 0 }; -+ struct task_struct *p; -+ int retval = -EINVAL; -+ -+ if (!param || pid < 0) -+ goto out_nounlock; -+ -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ retval = -ESRCH; -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ if (has_rt_policy(p)) -+ lp.sched_priority = p->rt_priority; -+ rcu_read_unlock(); -+ -+ /* -+ * This one might sleep, we cannot do it with a spinlock held ... -+ */ -+ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; -+ -+out_nounlock: -+ return retval; -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+/* -+ * Copy the kernel size attribute structure (which might be larger -+ * than what user-space knows about) to user-space. -+ * -+ * Note that all cases are valid: user-space buffer can be larger or -+ * smaller than the kernel-space buffer. The usual case is that both -+ * have the same size. -+ */ -+static int -+sched_attr_copy_to_user(struct sched_attr __user *uattr, -+ struct sched_attr *kattr, -+ unsigned int usize) -+{ -+ unsigned int ksize = sizeof(*kattr); -+ -+ if (!access_ok(uattr, usize)) -+ return -EFAULT; -+ -+ /* -+ * sched_getattr() ABI forwards and backwards compatibility: -+ * -+ * If usize == ksize then we just copy everything to user-space and all is good. -+ * -+ * If usize < ksize then we only copy as much as user-space has space for, -+ * this keeps ABI compatibility as well. We skip the rest. -+ * -+ * If usize > ksize then user-space is using a newer version of the ABI, -+ * which part the kernel doesn't know about. Just ignore it - tooling can -+ * detect the kernel's knowledge of attributes from the attr->size value -+ * which is set to ksize in this case. 
-+ */ -+ kattr->size = min(usize, ksize); -+ -+ if (copy_to_user(uattr, kattr, kattr->size)) -+ return -EFAULT; -+ -+ return 0; -+} -+ -+/** -+ * sys_sched_getattr - similar to sched_getparam, but with sched_attr -+ * @pid: the pid in question. -+ * @uattr: structure containing the extended parameters. -+ * @usize: sizeof(attr) for fwd/bwd comp. -+ * @flags: for future extension. -+ */ -+SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, -+ unsigned int, usize, unsigned int, flags) -+{ -+ struct sched_attr kattr = { }; -+ struct task_struct *p; -+ int retval; -+ -+ if (!uattr || pid < 0 || usize > PAGE_SIZE || -+ usize < SCHED_ATTR_SIZE_VER0 || flags) -+ return -EINVAL; -+ -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ retval = -ESRCH; -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ kattr.sched_policy = p->policy; -+ if (rt_task(p)) -+ kattr.sched_priority = p->rt_priority; -+ else -+ kattr.sched_nice = task_nice(p); -+ -+ rcu_read_unlock(); -+ -+ return sched_attr_copy_to_user(uattr, &kattr, usize); -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) -+{ -+ cpumask_var_t cpus_allowed, new_mask; -+ struct task_struct *p; -+ int retval; -+ -+ rcu_read_lock(); -+ -+ p = find_process_by_pid(pid); -+ if (!p) { -+ rcu_read_unlock(); -+ return -ESRCH; -+ } -+ -+ /* Prevent p going away */ -+ get_task_struct(p); -+ rcu_read_unlock(); -+ -+ if (p->flags & PF_NO_SETAFFINITY) { -+ retval = -EINVAL; -+ goto out_put_task; -+ } -+ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { -+ retval = -ENOMEM; -+ goto out_put_task; -+ } -+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { -+ retval = -ENOMEM; -+ goto out_free_cpus_allowed; -+ } -+ retval = -EPERM; -+ if (!check_same_owner(p)) { -+ rcu_read_lock(); -+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { -+ rcu_read_unlock(); -+ goto 
out_unlock; -+ } -+ rcu_read_unlock(); -+ } -+ -+ retval = security_task_setscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ cpuset_cpus_allowed(p, cpus_allowed); -+ cpumask_and(new_mask, in_mask, cpus_allowed); -+again: -+ retval = __set_cpus_allowed_ptr(p, new_mask, true); -+ -+ if (!retval) { -+ cpuset_cpus_allowed(p, cpus_allowed); -+ if (!cpumask_subset(new_mask, cpus_allowed)) { -+ /* -+ * We must have raced with a concurrent cpuset -+ * update. Just reset the cpus_allowed to the -+ * cpuset's cpus_allowed -+ */ -+ cpumask_copy(new_mask, cpus_allowed); -+ goto again; -+ } -+ } -+out_unlock: -+ free_cpumask_var(new_mask); -+out_free_cpus_allowed: -+ free_cpumask_var(cpus_allowed); -+out_put_task: -+ put_task_struct(p); -+ return retval; -+} -+ -+static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, -+ cpumask_t *new_mask) -+{ -+ if (len < cpumask_size()) -+ cpumask_clear(new_mask); -+ else if (len > cpumask_size()) -+ len = cpumask_size(); -+ -+ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; -+} -+ -+ -+/** -+ * sys_sched_setaffinity - set the CPU affinity of a process -+ * @pid: pid of the process -+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr -+ * @user_mask_ptr: user-space pointer to the new CPU mask -+ * -+ * Return: 0 on success. An error code otherwise. 
-+ */ -+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, -+ unsigned long __user *, user_mask_ptr) -+{ -+ cpumask_var_t new_mask; -+ int retval; -+ -+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) -+ return -ENOMEM; -+ -+ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); -+ if (retval == 0) -+ retval = sched_setaffinity(pid, new_mask); -+ free_cpumask_var(new_mask); -+ return retval; -+} -+ -+long sched_getaffinity(pid_t pid, cpumask_t *mask) -+{ -+ struct task_struct *p; -+ unsigned long flags; -+ int retval; -+ -+ get_online_cpus(); -+ rcu_read_lock(); -+ -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ cpumask_and(mask, &p->cpus_mask, cpu_active_mask); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+out_unlock: -+ rcu_read_unlock(); -+ put_online_cpus(); -+ -+ return retval; -+} -+ -+/** -+ * sys_sched_getaffinity - get the CPU affinity of a process -+ * @pid: pid of the process -+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr -+ * @user_mask_ptr: user-space pointer to hold the current CPU mask -+ * -+ * Return: 0 on success. An error code otherwise. -+ */ -+SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, -+ unsigned long __user *, user_mask_ptr) -+{ -+ int ret; -+ cpumask_var_t mask; -+ -+ if ((len * BITS_PER_BYTE) < nr_cpu_ids) -+ return -EINVAL; -+ if (len & (sizeof(unsigned long)-1)) -+ return -EINVAL; -+ -+ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) -+ return -ENOMEM; -+ -+ ret = sched_getaffinity(pid, mask); -+ if (ret == 0) { -+ unsigned int retlen = min(len, cpumask_size()); -+ -+ if (copy_to_user(user_mask_ptr, mask, retlen)) -+ ret = -EFAULT; -+ else -+ ret = retlen; -+ } -+ free_cpumask_var(mask); -+ -+ return ret; -+} -+ -+/** -+ * sys_sched_yield - yield the current processor to other threads. 
-+ * -+ * This function yields the current CPU to other tasks. It does this by -+ * scheduling away the current task. If it still has the earliest deadline -+ * it will be scheduled again as the next task. -+ * -+ * Return: 0. -+ */ -+static void do_sched_yield(void) -+{ -+ struct rq *rq; -+ -+ if (!sched_yield_type) -+ return; -+ -+ local_irq_disable(); -+ rq = this_rq(); -+ rq_lock(rq); -+ -+ if (sched_yield_type > 1) -+ time_slice_expired(current, rq); -+ schedstat_inc(rq->yld_count); -+ -+ /* -+ * Since we are going to call schedule() anyway, there's -+ * no need to preempt or enable interrupts: -+ */ -+ preempt_disable(); -+ rq_unlock(rq); -+ sched_preempt_enable_no_resched(); -+ -+ schedule(); -+} -+ -+SYSCALL_DEFINE0(sched_yield) -+{ -+ do_sched_yield(); -+ return 0; -+} -+ -+#ifndef CONFIG_PREEMPTION -+int __sched _cond_resched(void) -+{ -+ if (should_resched(0)) { -+ preempt_schedule_common(); -+ return 1; -+ } -+ rcu_all_qs(); -+ return 0; -+} -+EXPORT_SYMBOL(_cond_resched); -+#endif -+ -+/* -+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, -+ * call schedule, and on return reacquire the lock. -+ * -+ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level -+ * operations here to prevent schedule() from being called twice (once via -+ * spin_unlock(), once by hand). -+ */ -+int __cond_resched_lock(spinlock_t *lock) -+{ -+ int resched = should_resched(PREEMPT_LOCK_OFFSET); -+ int ret = 0; -+ -+ lockdep_assert_held(lock); -+ -+ if (spin_needbreak(lock) || resched) { -+ spin_unlock(lock); -+ if (resched) -+ preempt_schedule_common(); -+ else -+ cpu_relax(); -+ ret = 1; -+ spin_lock(lock); -+ } -+ return ret; -+} -+EXPORT_SYMBOL(__cond_resched_lock); -+ -+/** -+ * yield - yield the current processor to other threads. -+ * -+ * Do not ever use this function, there's a 99% chance you're doing it wrong. 
-+ * -+ * The scheduler is at all times free to pick the calling task as the most -+ * eligible task to run, if removing the yield() call from your code breaks -+ * it, its already broken. -+ * -+ * Typical broken usage is: -+ * -+ * while (!event) -+ * yield(); -+ * -+ * where one assumes that yield() will let 'the other' process run that will -+ * make event true. If the current task is a SCHED_FIFO task that will never -+ * happen. Never use yield() as a progress guarantee!! -+ * -+ * If you want to use yield() to wait for something, use wait_event(). -+ * If you want to use yield() to be 'nice' for others, use cond_resched(). -+ * If you still want to use yield(), do not! -+ */ -+void __sched yield(void) -+{ -+ set_current_state(TASK_RUNNING); -+ do_sched_yield(); -+} -+EXPORT_SYMBOL(yield); -+ -+/** -+ * yield_to - yield the current processor to another thread in -+ * your thread group, or accelerate that thread toward the -+ * processor it's on. -+ * @p: target task -+ * @preempt: whether task preemption is allowed or not -+ * -+ * It's the caller's job to ensure that the target task struct -+ * can't go away on us before we can do any checks. -+ * -+ * Return: -+ * true (>0) if we indeed boosted the target task. -+ * false (0) if we failed to boost the target. -+ * -ESRCH if there's no task to yield to. -+ */ -+int __sched yield_to(struct task_struct *p, bool preempt) -+{ -+ struct task_struct *rq_p; -+ struct rq *rq, *p_rq; -+ unsigned long flags; -+ int yielded = 0; -+ -+ local_irq_save(flags); -+ rq = this_rq(); -+ -+again: -+ p_rq = task_rq(p); -+ /* -+ * If we're the only runnable task on the rq and target rq also -+ * has only one task, there's absolutely no point in yielding. 
-+ */ -+ if (task_running(p_rq, p) || p->state) { -+ yielded = -ESRCH; -+ goto out_irq; -+ } -+ -+ double_rq_lock(rq, p_rq); -+ if (unlikely(task_rq(p) != p_rq)) { -+ double_rq_unlock(rq, p_rq); -+ goto again; -+ } -+ -+ yielded = 1; -+ schedstat_inc(rq->yld_count); -+ rq_p = rq->curr; -+ if (p->deadline > rq_p->deadline) -+ p->deadline = rq_p->deadline; -+ p->time_slice += rq_p->time_slice; -+ if (p->time_slice > timeslice()) -+ p->time_slice = timeslice(); -+ time_slice_expired(rq_p, rq); -+ if (preempt && rq != p_rq) -+ resched_task(p_rq->curr); -+ double_rq_unlock(rq, p_rq); -+out_irq: -+ local_irq_restore(flags); -+ -+ if (yielded > 0) -+ schedule(); -+ return yielded; -+} -+EXPORT_SYMBOL_GPL(yield_to); -+ -+int io_schedule_prepare(void) -+{ -+ int old_iowait = current->in_iowait; -+ -+ current->in_iowait = 1; -+ blk_schedule_flush_plug(current); -+ -+ return old_iowait; -+} -+ -+void io_schedule_finish(int token) -+{ -+ current->in_iowait = token; -+} -+ -+/* -+ * This task is about to go to sleep on IO. Increment rq->nr_iowait so -+ * that process accounting knows that this is a task in IO wait state. -+ * -+ * But don't do that if it is a deliberate, throttling IO wait (this task -+ * has set its backing_dev_info: the queue against which it should throttle) -+ */ -+ -+long __sched io_schedule_timeout(long timeout) -+{ -+ int token; -+ long ret; -+ -+ token = io_schedule_prepare(); -+ ret = schedule_timeout(timeout); -+ io_schedule_finish(token); -+ -+ return ret; -+} -+EXPORT_SYMBOL(io_schedule_timeout); -+ -+void __sched io_schedule(void) -+{ -+ int token; -+ -+ token = io_schedule_prepare(); -+ schedule(); -+ io_schedule_finish(token); -+} -+EXPORT_SYMBOL(io_schedule); -+ -+/** -+ * sys_sched_get_priority_max - return maximum RT priority. -+ * @policy: scheduling class. -+ * -+ * Return: On success, this syscall returns the maximum -+ * rt_priority that can be used by a given scheduling class. -+ * On failure, a negative error code is returned. 
-+ */ -+SYSCALL_DEFINE1(sched_get_priority_max, int, policy) -+{ -+ int ret = -EINVAL; -+ -+ switch (policy) { -+ case SCHED_FIFO: -+ case SCHED_RR: -+ ret = MAX_USER_RT_PRIO-1; -+ break; -+ case SCHED_NORMAL: -+ case SCHED_BATCH: -+ case SCHED_ISO: -+ case SCHED_IDLEPRIO: -+ ret = 0; -+ break; -+ } -+ return ret; -+} -+ -+/** -+ * sys_sched_get_priority_min - return minimum RT priority. -+ * @policy: scheduling class. -+ * -+ * Return: On success, this syscall returns the minimum -+ * rt_priority that can be used by a given scheduling class. -+ * On failure, a negative error code is returned. -+ */ -+SYSCALL_DEFINE1(sched_get_priority_min, int, policy) -+{ -+ int ret = -EINVAL; -+ -+ switch (policy) { -+ case SCHED_FIFO: -+ case SCHED_RR: -+ ret = 1; -+ break; -+ case SCHED_NORMAL: -+ case SCHED_BATCH: -+ case SCHED_ISO: -+ case SCHED_IDLEPRIO: -+ ret = 0; -+ break; -+ } -+ return ret; -+} -+ -+static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) -+{ -+ struct task_struct *p; -+ unsigned int time_slice; -+ struct rq_flags rf; -+ struct rq *rq; -+ int retval; -+ -+ if (pid < 0) -+ return -EINVAL; -+ -+ retval = -ESRCH; -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ rq = task_rq_lock(p, &rf); -+ time_slice = p->policy == SCHED_FIFO ? 0 : MS_TO_NS(task_timeslice(p)); -+ task_rq_unlock(rq, p, &rf); -+ -+ rcu_read_unlock(); -+ *t = ns_to_timespec64(time_slice); -+ return 0; -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+/** -+ * sys_sched_rr_get_interval - return the default timeslice of a process. -+ * @pid: pid of the process. -+ * @interval: userspace pointer to the timeslice value. -+ * -+ * this syscall writes the default timeslice value of a given process -+ * into the user-space timespec buffer. A value of '0' means infinity. -+ * -+ * Return: On success, 0 and the timeslice is in @interval. 
Otherwise, -+ * an error code. -+ */ -+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, -+ struct __kernel_timespec __user *, interval) -+{ -+ struct timespec64 t; -+ int retval = sched_rr_get_interval(pid, &t); -+ -+ if (retval == 0) -+ retval = put_timespec64(&t, interval); -+ -+ return retval; -+} -+ -+#ifdef CONFIG_COMPAT_32BIT_TIME -+SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, -+ struct old_timespec32 __user *, interval) -+{ -+ struct timespec64 t; -+ int retval = sched_rr_get_interval(pid, &t); -+ -+ if (retval == 0) -+ retval = put_old_timespec32(&t, interval); -+ return retval; -+} -+#endif -+ -+void sched_show_task(struct task_struct *p) -+{ -+ unsigned long free = 0; -+ int ppid; -+ -+ if (!try_get_task_stack(p)) -+ return; -+ -+ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); -+ -+ if (p->state == TASK_RUNNING) -+ printk(KERN_CONT " running task "); -+#ifdef CONFIG_DEBUG_STACK_USAGE -+ free = stack_not_used(p); -+#endif -+ ppid = 0; -+ rcu_read_lock(); -+ if (pid_alive(p)) -+ ppid = task_pid_nr(rcu_dereference(p->real_parent)); -+ rcu_read_unlock(); -+ printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, -+ task_pid_nr(p), ppid, -+ (unsigned long)task_thread_info(p)->flags); -+ -+ print_worker_info(KERN_INFO, p); -+ show_stack(p, NULL); -+ put_task_stack(p); -+} -+EXPORT_SYMBOL_GPL(sched_show_task); -+ -+static inline bool -+state_filter_match(unsigned long state_filter, struct task_struct *p) -+{ -+ /* no filter, everything matches */ -+ if (!state_filter) -+ return true; -+ -+ /* filter, but doesn't match */ -+ if (!(p->state & state_filter)) -+ return false; -+ -+ /* -+ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows -+ * TASK_KILLABLE). 
-+ */ -+ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) -+ return false; -+ -+ return true; -+} -+ -+void show_state_filter(unsigned long state_filter) -+{ -+ struct task_struct *g, *p; -+ -+#if BITS_PER_LONG == 32 -+ printk(KERN_INFO -+ " task PC stack pid father\n"); -+#else -+ printk(KERN_INFO -+ " task PC stack pid father\n"); -+#endif -+ rcu_read_lock(); -+ for_each_process_thread(g, p) { -+ /* -+ * reset the NMI-timeout, listing all files on a slow -+ * console might take a lot of time: -+ * Also, reset softlockup watchdogs on all CPUs, because -+ * another CPU might be blocked waiting for us to process -+ * an IPI. -+ */ -+ touch_nmi_watchdog(); -+ touch_all_softlockup_watchdogs(); -+ if (state_filter_match(state_filter, p)) -+ sched_show_task(p); -+ } -+ -+ rcu_read_unlock(); -+ /* -+ * Only show locks if all tasks are dumped: -+ */ -+ if (!state_filter) -+ debug_show_all_locks(); -+} -+ -+void dump_cpu_task(int cpu) -+{ -+ pr_info("Task dump for CPU %d:\n", cpu); -+ sched_show_task(cpu_curr(cpu)); -+} -+ -+#ifdef CONFIG_SMP -+void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ cpumask_copy(&p->cpus_mask, new_mask); -+ p->nr_cpus_allowed = cpumask_weight(new_mask); -+} -+ -+void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ struct rq *rq = task_rq(p); -+ -+ lockdep_assert_held(&p->pi_lock); -+ -+ cpumask_copy(&p->cpus_mask, new_mask); -+ -+ if (task_queued(p)) { -+ /* -+ * Because __kthread_bind() calls this on blocked tasks without -+ * holding rq->lock. -+ */ -+ lockdep_assert_held(rq->lock); -+ } -+} -+ -+/* -+ * Calling do_set_cpus_allowed from outside the scheduler code should not be -+ * called on a running or queued task. We should be holding pi_lock. 
-+ */ -+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ __do_set_cpus_allowed(p, new_mask); -+ if (needs_other_cpu(p, task_cpu(p))) { -+ struct rq *rq; -+ -+ rq = __task_rq_lock(p, NULL); -+ set_task_cpu(p, valid_task_cpu(p)); -+ resched_task(p); -+ __task_rq_unlock(rq, NULL); -+ } -+} -+#endif -+ -+/** -+ * init_idle - set up an idle thread for a given CPU -+ * @idle: task in question -+ * @cpu: cpu the idle task belongs to -+ * -+ * NOTE: this function does not set the idle thread's NEED_RESCHED -+ * flag, to make booting more robust. -+ */ -+void init_idle(struct task_struct *idle, int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ raw_spin_lock_irqsave(&idle->pi_lock, flags); -+ raw_spin_lock(rq->lock); -+ idle->last_ran = rq->niffies; -+ time_slice_expired(idle, rq); -+ idle->state = TASK_RUNNING; -+ /* Setting prio to illegal value shouldn't matter when never queued */ -+ idle->prio = PRIO_LIMIT; -+ idle->flags |= PF_IDLE; -+ -+ kasan_unpoison_task_stack(idle); -+ -+#ifdef CONFIG_SMP -+ /* -+ * It's possible that init_idle() gets called multiple times on a task, -+ * in that case do_set_cpus_allowed() will not do the right thing. -+ * -+ * And since this is boot we can forgo the serialisation. -+ */ -+ set_cpus_allowed_common(idle, cpumask_of(cpu)); -+#ifdef CONFIG_SMT_NICE -+ idle->smt_bias = 0; -+#endif -+#endif -+ set_rq_task(rq, idle); -+ -+ /* Silence PROVE_RCU */ -+ rcu_read_lock(); -+ set_task_cpu(idle, cpu); -+ rcu_read_unlock(); -+ -+ rq->idle = idle; -+ rcu_assign_pointer(rq->curr, idle); -+ idle->on_rq = TASK_ON_RQ_QUEUED; -+ raw_spin_unlock(rq->lock); -+ raw_spin_unlock_irqrestore(&idle->pi_lock, flags); -+ -+ /* Set the preempt count _outside_ the spinlocks! 
*/ -+ init_idle_preempt_count(idle, cpu); -+ -+ ftrace_graph_init_idle_task(idle, cpu); -+ vtime_init_idle(idle, cpu); -+#ifdef CONFIG_SMP -+ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); -+#endif -+} -+ -+int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, -+ const struct cpumask __maybe_unused *trial) -+{ -+ return 1; -+} -+ -+int task_can_attach(struct task_struct *p, -+ const struct cpumask *cs_cpus_allowed) -+{ -+ int ret = 0; -+ -+ /* -+ * Kthreads which disallow setaffinity shouldn't be moved -+ * to a new cpuset; we don't want to change their CPU -+ * affinity and isolating such threads by their set of -+ * allowed nodes is unnecessary. Thus, cpusets are not -+ * applicable for such threads. This prevents checking for -+ * success of set_cpus_allowed_ptr() on all attached tasks -+ * before cpus_mask may be changed. -+ */ -+ if (p->flags & PF_NO_SETAFFINITY) -+ ret = -EINVAL; -+ -+ return ret; -+} -+ -+void resched_cpu(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ struct rq_flags rf; -+ -+ rq_lock_irqsave(rq, &rf); -+ if (cpu_online(cpu) || cpu == smp_processor_id()) -+ resched_curr(rq); -+ rq_unlock_irqrestore(rq, &rf); -+} -+ -+#ifdef CONFIG_SMP -+#ifdef CONFIG_NO_HZ_COMMON -+void select_nohz_load_balancer(int stop_tick) -+{ -+} -+ -+void set_cpu_sd_state_idle(void) {} -+void nohz_balance_enter_idle(int cpu) {} -+ -+/* -+ * In the semi idle case, use the nearest busy CPU for migrating timers -+ * from an idle CPU. This is good for power-savings. -+ * -+ * We don't do similar optimization for completely idle system, as -+ * selecting an idle CPU will add more delays to the timers than intended -+ * (as that CPU's timer base may not be uptodate wrt jiffies etc). 
-+ */ -+int get_nohz_timer_target(void) -+{ -+ int i, cpu = smp_processor_id(), default_cpu = -1; -+ struct sched_domain *sd; -+ -+ if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) { -+ if (!idle_cpu(cpu)) -+ return cpu; -+ default_cpu = cpu; -+ } -+ -+ rcu_read_lock(); -+ for_each_domain(cpu, sd) { -+ for_each_cpu_and(i, sched_domain_span(sd), -+ housekeeping_cpumask(HK_FLAG_TIMER)) { -+ if (cpu == i) -+ continue; -+ -+ if (!idle_cpu(i)) { -+ cpu = i; -+ goto unlock; -+ } -+ } -+ } -+ -+ if (default_cpu == -1) -+ default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER); -+ cpu = default_cpu; -+unlock: -+ rcu_read_unlock(); -+ return cpu; -+} -+ -+/* -+ * When add_timer_on() enqueues a timer into the timer wheel of an -+ * idle CPU then this timer might expire before the next timer event -+ * which is scheduled to wake up that CPU. In case of a completely -+ * idle system the next event might even be infinite time into the -+ * future. wake_up_idle_cpu() ensures that the CPU is woken up and -+ * leaves the inner idle loop so the newly added timer is taken into -+ * account when the CPU goes back to idle and evaluates the timer -+ * wheel for the next timer event. -+ */ -+void wake_up_idle_cpu(int cpu) -+{ -+ if (cpu == smp_processor_id()) -+ return; -+ -+ if (set_nr_and_not_polling(cpu_rq(cpu)->idle)) -+ smp_sched_reschedule(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); -+} -+ -+static bool wake_up_full_nohz_cpu(int cpu) -+{ -+ /* -+ * We just need the target to call irq_exit() and re-evaluate -+ * the next tick. The nohz full kick at least implies that. -+ * If needed we can still optimize that later with an -+ * empty IRQ. -+ */ -+ if (cpu_is_offline(cpu)) -+ return true; /* Don't try to wake offline CPUs. */ -+ if (tick_nohz_full_cpu(cpu)) { -+ if (cpu != smp_processor_id() || -+ tick_nohz_tick_stopped()) -+ tick_nohz_full_kick_cpu(cpu); -+ return true; -+ } -+ -+ return false; -+} -+ -+/* -+ * Wake up the specified CPU. 
If the CPU is going offline, it is the -+ * caller's responsibility to deal with the lost wakeup, for example, -+ * by hooking into the CPU_DEAD notifier like timers and hrtimers do. -+ */ -+void wake_up_nohz_cpu(int cpu) -+{ -+ if (!wake_up_full_nohz_cpu(cpu)) -+ wake_up_idle_cpu(cpu); -+} -+#endif /* CONFIG_NO_HZ_COMMON */ -+ -+/* -+ * Change a given task's CPU affinity. Migrate the thread to a -+ * proper CPU and schedule it away if the CPU it's executing on -+ * is removed from the allowed bitmask. -+ * -+ * NOTE: the caller must have a valid reference to the task, the -+ * task must not exit() & deallocate itself prematurely. The -+ * call is not atomic; no spinlocks may be held. -+ */ -+static int __set_cpus_allowed_ptr(struct task_struct *p, -+ const struct cpumask *new_mask, bool check) -+{ -+ const struct cpumask *cpu_valid_mask = cpu_active_mask; -+ bool queued = false, running_wrong = false, kthread; -+ unsigned int dest_cpu; -+ struct rq_flags rf; -+ struct rq *rq; -+ int ret = 0; -+ -+ rq = task_rq_lock(p, &rf); -+ update_rq_clock(rq); -+ -+ kthread = !!(p->flags & PF_KTHREAD); -+ if (kthread) { -+ /* -+ * Kernel threads are allowed on online && !active CPUs -+ */ -+ cpu_valid_mask = cpu_online_mask; -+ } -+ -+ /* -+ * Must re-check here, to close a race against __kthread_bind(), -+ * sched_setaffinity() is not guaranteed to observe the flag. -+ */ -+ if (check && (p->flags & PF_NO_SETAFFINITY)) { -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ if (cpumask_equal(p->cpus_ptr, new_mask)) -+ goto out; -+ -+ /* -+ * Picking a ~random cpu helps in cases where we are changing affinity -+ * for groups of tasks (ie. cpuset), so that load balancing is not -+ * immediately required to distribute the tasks within their new mask. 
-+ */ -+ dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask); -+ if (dest_cpu >= nr_cpu_ids) { -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ queued = task_queued(p); -+ __do_set_cpus_allowed(p, new_mask); -+ -+ if (kthread) { -+ /* -+ * For kernel threads that do indeed end up on online && -+ * !active we want to ensure they are strict per-CPU threads. -+ */ -+ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && -+ !cpumask_intersects(new_mask, cpu_active_mask) && -+ p->nr_cpus_allowed != 1); -+ } -+ -+ /* Can the task run on the task's current CPU? If so, we're done */ -+ if (cpumask_test_cpu(task_cpu(p), new_mask)) -+ goto out; -+ -+ if (task_running(rq, p)) { -+ /* Task is running on the wrong cpu now, reschedule it. */ -+ if (rq == this_rq()) { -+ set_task_cpu(p, dest_cpu); -+ set_tsk_need_resched(p); -+ running_wrong = true; -+ } else -+ resched_task(p); -+ } else { -+ if (queued) { -+ /* -+ * Switch runqueue locks after dequeueing the task -+ * here while still holding the pi_lock to be holding -+ * the correct lock for enqueueing. -+ */ -+ dequeue_task(rq, p, 0); -+ rq_unlock(rq); -+ -+ rq = cpu_rq(dest_cpu); -+ rq_lock(rq); -+ } -+ set_task_cpu(p, dest_cpu); -+ if (queued) -+ enqueue_task(rq, p, 0); -+ } -+ if (queued) -+ try_preempt(p, rq); -+ if (running_wrong) -+ preempt_disable(); -+out: -+ task_rq_unlock(rq, p, &rf); -+ -+ if (running_wrong) { -+ __schedule(true); -+ preempt_enable(); -+ } -+ -+ return ret; -+} -+ -+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ return __set_cpus_allowed_ptr(p, new_mask, false); -+} -+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); -+ -+#ifdef CONFIG_HOTPLUG_CPU -+/* -+ * Run through task list and find tasks affined to the dead cpu, then remove -+ * that cpu from the list, enable cpu0 and set the zerobound flag. Must hold -+ * cpu 0 and src_cpu's runqueue locks. We should be holding both rq lock and -+ * pi_lock to change cpus_mask but it's not going to matter here. 
-+ */ -+static void bind_zero(int src_cpu) -+{ -+ struct task_struct *p, *t; -+ struct rq *rq0; -+ int bound = 0; -+ -+ if (src_cpu == 0) -+ return; -+ -+ rq0 = cpu_rq(0); -+ -+ do_each_thread(t, p) { -+ if (cpumask_test_cpu(src_cpu, p->cpus_ptr)) { -+ bool local = (task_cpu(p) == src_cpu); -+ struct rq *rq = task_rq(p); -+ -+ /* task_running is the cpu stopper thread */ -+ if (local && task_running(rq, p)) -+ continue; -+ atomic_clear_cpu(src_cpu, &p->cpus_mask); -+ atomic_set_cpu(0, &p->cpus_mask); -+ p->zerobound = true; -+ bound++; -+ if (local) { -+ bool queued = task_queued(p); -+ -+ if (queued) -+ dequeue_task(rq, p, 0); -+ set_task_cpu(p, 0); -+ if (queued) -+ enqueue_task(rq0, p, 0); -+ } -+ } -+ } while_each_thread(t, p); -+ -+ if (bound) { -+ printk(KERN_INFO "MuQSS removed affinity for %d processes to cpu %d\n", -+ bound, src_cpu); -+ } -+} -+ -+/* Find processes with the zerobound flag and reenable their affinity for the -+ * CPU coming alive. */ -+static void unbind_zero(int src_cpu) -+{ -+ int unbound = 0, zerobound = 0; -+ struct task_struct *p, *t; -+ -+ if (src_cpu == 0) -+ return; -+ -+ do_each_thread(t, p) { -+ if (!p->mm) -+ p->zerobound = false; -+ if (p->zerobound) { -+ unbound++; -+ cpumask_set_cpu(src_cpu, &p->cpus_mask); -+ /* Once every CPU affinity has been re-enabled, remove -+ * the zerobound flag */ -+ if (cpumask_subset(cpu_possible_mask, p->cpus_ptr)) { -+ p->zerobound = false; -+ zerobound++; -+ } -+ } -+ } while_each_thread(t, p); -+ -+ if (unbound) { -+ printk(KERN_INFO "MuQSS added affinity for %d processes to cpu %d\n", -+ unbound, src_cpu); -+ } -+ if (zerobound) { -+ printk(KERN_INFO "MuQSS released forced binding to cpu0 for %d processes\n", -+ zerobound); -+ } -+} -+ -+/* -+ * Ensure that the idle task is using init_mm right before its cpu goes -+ * offline. 
-+ */ -+void idle_task_exit(void) -+{ -+ struct mm_struct *mm = current->active_mm; -+ -+ BUG_ON(cpu_online(smp_processor_id())); -+ -+ if (mm != &init_mm) { -+ switch_mm(mm, &init_mm, current); -+ current->active_mm = &init_mm; -+ finish_arch_post_lock_switch(); -+ } -+ mmdrop(mm); -+} -+#else /* CONFIG_HOTPLUG_CPU */ -+static void unbind_zero(int src_cpu) {} -+#endif /* CONFIG_HOTPLUG_CPU */ -+ -+void sched_set_stop_task(int cpu, struct task_struct *stop) -+{ -+ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; -+ struct sched_param start_param = { .sched_priority = 0 }; -+ struct task_struct *old_stop = cpu_rq(cpu)->stop; -+ -+ if (stop) { -+ /* -+ * Make it appear like a SCHED_FIFO task, its something -+ * userspace knows about and won't get confused about. -+ * -+ * Also, it will make PI more or less work without too -+ * much confusion -- but then, stop work should not -+ * rely on PI working anyway. -+ */ -+ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); -+ } -+ -+ cpu_rq(cpu)->stop = stop; -+ -+ if (old_stop) { -+ /* -+ * Reset it back to a normal scheduling policy so that -+ * it can die in pieces. -+ */ -+ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); -+ } -+} -+ -+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) -+ -+static struct ctl_table sd_ctl_dir[] = { -+ { -+ .procname = "sched_domain", -+ .mode = 0555, -+ }, -+ {} -+}; -+ -+static struct ctl_table sd_ctl_root[] = { -+ { -+ .procname = "kernel", -+ .mode = 0555, -+ .child = sd_ctl_dir, -+ }, -+ {} -+}; -+ -+static struct ctl_table *sd_alloc_ctl_entry(int n) -+{ -+ struct ctl_table *entry = -+ kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); -+ -+ return entry; -+} -+ -+static void sd_free_ctl_entry(struct ctl_table **tablep) -+{ -+ struct ctl_table *entry; -+ -+ /* -+ * In the intermediate directories, both the child directory and -+ * procname are dynamically allocated and could fail but the mode -+ * will always be set. 
In the lowest directory the names are -+ * static strings and all have proc handlers. -+ */ -+ for (entry = *tablep; entry->mode; entry++) { -+ if (entry->child) -+ sd_free_ctl_entry(&entry->child); -+ if (entry->proc_handler == NULL) -+ kfree(entry->procname); -+ } -+ -+ kfree(*tablep); -+ *tablep = NULL; -+} -+ -+static void -+set_table_entry(struct ctl_table *entry, -+ const char *procname, void *data, int maxlen, -+ umode_t mode, proc_handler *proc_handler) -+{ -+ entry->procname = procname; -+ entry->data = data; -+ entry->maxlen = maxlen; -+ entry->mode = mode; -+ entry->proc_handler = proc_handler; -+} -+ -+static struct ctl_table * -+sd_alloc_ctl_domain_table(struct sched_domain *sd) -+{ -+ struct ctl_table *table = sd_alloc_ctl_entry(9); -+ -+ if (table == NULL) -+ return NULL; -+ -+ set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax); -+ set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax); -+ set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); -+ set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); -+ set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); -+ set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); -+ set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax); -+ set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring); -+ /* &table[8] is terminator */ -+ -+ return table; -+} -+ -+static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) -+{ -+ struct ctl_table *entry, *table; -+ struct sched_domain *sd; -+ int domain_num = 0, i; -+ char buf[32]; -+ -+ for_each_domain(cpu, sd) -+ domain_num++; -+ entry = table = sd_alloc_ctl_entry(domain_num + 1); -+ if 
(table == NULL) -+ return NULL; -+ -+ i = 0; -+ for_each_domain(cpu, sd) { -+ snprintf(buf, 32, "domain%d", i); -+ entry->procname = kstrdup(buf, GFP_KERNEL); -+ entry->mode = 0555; -+ entry->child = sd_alloc_ctl_domain_table(sd); -+ entry++; -+ i++; -+ } -+ return table; -+} -+ -+static cpumask_var_t sd_sysctl_cpus; -+static struct ctl_table_header *sd_sysctl_header; -+ -+void register_sched_domain_sysctl(void) -+{ -+ static struct ctl_table *cpu_entries; -+ static struct ctl_table **cpu_idx; -+ char buf[32]; -+ int i; -+ -+ if (!cpu_entries) { -+ cpu_entries = sd_alloc_ctl_entry(num_possible_cpus() + 1); -+ if (!cpu_entries) -+ return; -+ -+ WARN_ON(sd_ctl_dir[0].child); -+ sd_ctl_dir[0].child = cpu_entries; -+ } -+ -+ if (!cpu_idx) { -+ struct ctl_table *e = cpu_entries; -+ -+ cpu_idx = kcalloc(nr_cpu_ids, sizeof(struct ctl_table*), GFP_KERNEL); -+ if (!cpu_idx) -+ return; -+ -+ /* deal with sparse possible map */ -+ for_each_possible_cpu(i) { -+ cpu_idx[i] = e; -+ e++; -+ } -+ } -+ -+ if (!cpumask_available(sd_sysctl_cpus)) { -+ if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL)) -+ return; -+ -+ /* init to possible to not have holes in @cpu_entries */ -+ cpumask_copy(sd_sysctl_cpus, cpu_possible_mask); -+ } -+ -+ for_each_cpu(i, sd_sysctl_cpus) { -+ struct ctl_table *e = cpu_idx[i]; -+ -+ if (e->child) -+ sd_free_ctl_entry(&e->child); -+ -+ if (!e->procname) { -+ snprintf(buf, 32, "cpu%d", i); -+ e->procname = kstrdup(buf, GFP_KERNEL); -+ } -+ e->mode = 0555; -+ e->child = sd_alloc_ctl_cpu_table(i); -+ -+ __cpumask_clear_cpu(i, sd_sysctl_cpus); -+ } -+ -+ WARN_ON(sd_sysctl_header); -+ sd_sysctl_header = register_sysctl_table(sd_ctl_root); -+} -+ -+void dirty_sched_domain_sysctl(int cpu) -+{ -+ if (cpumask_available(sd_sysctl_cpus)) -+ __cpumask_set_cpu(cpu, sd_sysctl_cpus); -+} -+ -+/* may be called multiple times per register */ -+void unregister_sched_domain_sysctl(void) -+{ -+ unregister_sysctl_table(sd_sysctl_header); -+ sd_sysctl_header = NULL; -+} 
-+#endif /* CONFIG_SYSCTL */ -+ -+void set_rq_online(struct rq *rq) -+{ -+ if (!rq->online) { -+ cpumask_set_cpu(cpu_of(rq), rq->rd->online); -+ rq->online = true; -+ } -+} -+ -+void set_rq_offline(struct rq *rq) -+{ -+ if (rq->online) { -+ int cpu = cpu_of(rq); -+ -+ cpumask_clear_cpu(cpu, rq->rd->online); -+ rq->online = false; -+ clear_cpuidle_map(cpu); -+ } -+} -+ -+/* -+ * used to mark begin/end of suspend/resume: -+ */ -+static int num_cpus_frozen; -+ -+/* -+ * Update cpusets according to cpu_active mask. If cpusets are -+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper -+ * around partition_sched_domains(). -+ * -+ * If we come here as part of a suspend/resume, don't touch cpusets because we -+ * want to restore it back to its original state upon resume anyway. -+ */ -+static void cpuset_cpu_active(void) -+{ -+ if (cpuhp_tasks_frozen) { -+ /* -+ * num_cpus_frozen tracks how many CPUs are involved in suspend -+ * resume sequence. As long as this is not the last online -+ * operation in the resume sequence, just build a single sched -+ * domain, ignoring cpusets. -+ */ -+ partition_sched_domains(1, NULL, NULL); -+ if (--num_cpus_frozen) -+ return; -+ /* -+ * This is the last CPU online operation. So fall through and -+ * restore the original sched domains by considering the -+ * cpuset configurations. -+ */ -+ cpuset_force_rebuild(); -+ } -+ -+ cpuset_update_active_cpus(); -+} -+ -+static int cpuset_cpu_inactive(unsigned int cpu) -+{ -+ if (!cpuhp_tasks_frozen) { -+ cpuset_update_active_cpus(); -+ } else { -+ num_cpus_frozen++; -+ partition_sched_domains(1, NULL, NULL); -+ } -+ return 0; -+} -+ -+int sched_cpu_activate(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ struct rq_flags rf; -+ -+#ifdef CONFIG_SCHED_SMT -+ /* -+ * When going up, increment the number of cores with SMT present. 
-+ */ -+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) -+ static_branch_inc_cpuslocked(&sched_smt_present); -+#endif -+ set_cpu_active(cpu, true); -+ -+ if (sched_smp_initialized) { -+ sched_domains_numa_masks_set(cpu); -+ cpuset_cpu_active(); -+ } -+ -+ /* -+ * Put the rq online, if not already. This happens: -+ * -+ * 1) In the early boot process, because we build the real domains -+ * after all CPUs have been brought up. -+ * -+ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the -+ * domains. -+ */ -+ rq_lock_irqsave(rq, &rf); -+ if (rq->rd) { -+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); -+ set_rq_online(rq); -+ } -+ unbind_zero(cpu); -+ rq_unlock_irqrestore(rq, &rf); -+ -+ return 0; -+} -+ -+int sched_cpu_deactivate(unsigned int cpu) -+{ -+ int ret; -+ -+ set_cpu_active(cpu, false); -+ /* -+ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU -+ * users of this state to go away such that all new such users will -+ * observe it. -+ * -+ * Do sync before park smpboot threads to take care the rcu boost case. -+ */ -+ synchronize_rcu(); -+ -+#ifdef CONFIG_SCHED_SMT -+ /* -+ * When going down, decrement the number of cores with SMT present. 
-+ */ -+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) -+ static_branch_dec_cpuslocked(&sched_smt_present); -+#endif -+ -+ if (!sched_smp_initialized) -+ return 0; -+ -+ ret = cpuset_cpu_inactive(cpu); -+ if (ret) { -+ set_cpu_active(cpu, true); -+ return ret; -+ } -+ sched_domains_numa_masks_clear(cpu); -+ return 0; -+} -+ -+int sched_cpu_starting(unsigned int cpu) -+{ -+ sched_tick_start(cpu); -+ return 0; -+} -+ -+#ifdef CONFIG_HOTPLUG_CPU -+int sched_cpu_dying(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ /* Handle pending wakeups and then migrate everything off */ -+ sched_ttwu_pending(); -+ sched_tick_stop(cpu); -+ -+ local_irq_save(flags); -+ double_rq_lock(rq, cpu_rq(0)); -+ if (rq->rd) { -+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); -+ set_rq_offline(rq); -+ } -+ bind_zero(cpu); -+ double_rq_unlock(rq, cpu_rq(0)); -+ sched_start_tick(rq, cpu); -+ hrexpiry_clear(rq); -+ local_irq_restore(flags); -+ -+ return 0; -+} -+#endif -+ -+#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) -+/* -+ * Cheaper version of the below functions in case support for SMT and MC is -+ * compiled in but CPUs have no siblings. 
-+ */ -+static bool sole_cpu_idle(struct rq *rq) -+{ -+ return rq_idle(rq); -+} -+#endif -+#ifdef CONFIG_SCHED_SMT -+static const cpumask_t *thread_cpumask(int cpu) -+{ -+ return topology_sibling_cpumask(cpu); -+} -+/* All this CPU's SMT siblings are idle */ -+static bool siblings_cpu_idle(struct rq *rq) -+{ -+ return cpumask_subset(&rq->thread_mask, &cpu_idle_map); -+} -+#endif -+#ifdef CONFIG_SCHED_MC -+static const cpumask_t *core_cpumask(int cpu) -+{ -+ return topology_core_cpumask(cpu); -+} -+/* All this CPU's shared cache siblings are idle */ -+static bool cache_cpu_idle(struct rq *rq) -+{ -+ return cpumask_subset(&rq->core_mask, &cpu_idle_map); -+} -+/* MC siblings CPU mask which share the same LLC */ -+static const cpumask_t *llc_core_cpumask(int cpu) -+{ -+#ifdef CONFIG_X86 -+ return per_cpu(cpu_llc_shared_map, cpu); -+#else -+ return topology_core_cpumask(cpu); -+#endif -+} -+#endif -+ -+enum sched_domain_level { -+ SD_LV_NONE = 0, -+ SD_LV_SIBLING, -+ SD_LV_MC, -+ SD_LV_BOOK, -+ SD_LV_CPU, -+ SD_LV_NODE, -+ SD_LV_ALLNODES, -+ SD_LV_MAX -+}; -+ -+/* -+ * Set up the relative cache distance of each online cpu from each -+ * other in a simple array for quick lookup. Locality is determined -+ * by the closest sched_domain that CPUs are separated by. CPUs with -+ * shared cache in SMT and MC are treated as local. Separate CPUs -+ * (within the same package or physically) within the same node are -+ * treated as not local. CPUs not even in the same domain (different -+ * nodes) are treated as very distant. 
-+ */ -+static void __init select_leaders(void) -+{ -+ struct rq *rq, *other_rq, *leader; -+ struct sched_domain *sd; -+ int cpu, other_cpu; -+#ifdef CONFIG_SCHED_SMT -+ bool smt_threads = false; -+#endif -+ -+ for (cpu = 0; cpu < num_online_cpus(); cpu++) { -+ rq = cpu_rq(cpu); -+ leader = NULL; -+ /* First check if this cpu is in the same node */ -+ for_each_domain(cpu, sd) { -+ if (sd->level > SD_LV_MC) -+ continue; -+ if (rqshare != RQSHARE_ALL) -+ leader = NULL; -+ /* Set locality to local node if not already found lower */ -+ for_each_cpu(other_cpu, sched_domain_span(sd)) { -+ if (rqshare >= RQSHARE_SMP) { -+ other_rq = cpu_rq(other_cpu); -+ -+ /* Set the smp_leader to the first CPU */ -+ if (!leader) -+ leader = rq; -+ if (!other_rq->smp_leader) -+ other_rq->smp_leader = leader; -+ } -+ if (rq->cpu_locality[other_cpu] > LOCALITY_SMP) -+ rq->cpu_locality[other_cpu] = LOCALITY_SMP; -+ } -+ } -+ -+ /* -+ * Each runqueue has its own function in case it doesn't have -+ * siblings of its own allowing mixed topologies. 
-+ */ -+#ifdef CONFIG_SCHED_MC -+ leader = NULL; -+ if (cpumask_weight(core_cpumask(cpu)) > 1) { -+ cpumask_copy(&rq->core_mask, llc_core_cpumask(cpu)); -+ cpumask_clear_cpu(cpu, &rq->core_mask); -+ for_each_cpu(other_cpu, core_cpumask(cpu)) { -+ if (rqshare == RQSHARE_MC || -+ (rqshare == RQSHARE_MC_LLC && cpumask_test_cpu(other_cpu, llc_core_cpumask(cpu)))) { -+ other_rq = cpu_rq(other_cpu); -+ -+ /* Set the mc_leader to the first CPU */ -+ if (!leader) -+ leader = rq; -+ if (!other_rq->mc_leader) -+ other_rq->mc_leader = leader; -+ } -+ if (rq->cpu_locality[other_cpu] > LOCALITY_MC) { -+ /* this is to get LLC into play even in case LLC sharing is not used */ -+ if (cpumask_test_cpu(other_cpu, llc_core_cpumask(cpu))) -+ rq->cpu_locality[other_cpu] = LOCALITY_MC_LLC; -+ else -+ rq->cpu_locality[other_cpu] = LOCALITY_MC; -+ } -+ } -+ rq->cache_idle = cache_cpu_idle; -+ } -+#endif -+#ifdef CONFIG_SCHED_SMT -+ leader = NULL; -+ if (cpumask_weight(thread_cpumask(cpu)) > 1) { -+ cpumask_copy(&rq->thread_mask, thread_cpumask(cpu)); -+ cpumask_clear_cpu(cpu, &rq->thread_mask); -+ for_each_cpu(other_cpu, thread_cpumask(cpu)) { -+ if (rqshare == RQSHARE_SMT) { -+ other_rq = cpu_rq(other_cpu); -+ -+ /* Set the smt_leader to the first CPU */ -+ if (!leader) -+ leader = rq; -+ if (!other_rq->smt_leader) -+ other_rq->smt_leader = leader; -+ } -+ if (rq->cpu_locality[other_cpu] > LOCALITY_SMT) -+ rq->cpu_locality[other_cpu] = LOCALITY_SMT; -+ } -+ rq->siblings_idle = siblings_cpu_idle; -+ smt_threads = true; -+ } -+#endif -+ } -+ -+#ifdef CONFIG_SMT_NICE -+ if (smt_threads) { -+ check_siblings = &check_smt_siblings; -+ wake_siblings = &wake_smt_siblings; -+ smt_schedule = &smt_should_schedule; -+ } -+#endif -+ -+ for_each_online_cpu(cpu) { -+ rq = cpu_rq(cpu); -+ for_each_online_cpu(other_cpu) { -+ printk(KERN_DEBUG "MuQSS locality CPU %d to %d: %d\n", cpu, other_cpu, rq->cpu_locality[other_cpu]); -+ } -+ } -+} -+ -+/* FIXME freeing locked spinlock */ -+static void __init 
share_and_free_rq(struct rq *leader, struct rq *rq) -+{ -+ WARN_ON(rq->nr_running > 0); -+ -+ kfree(rq->node); -+ kfree(rq->sl); -+ kfree(rq->lock); -+ rq->node = leader->node; -+ rq->sl = leader->sl; -+ rq->lock = leader->lock; -+ rq->is_leader = false; -+ barrier(); -+ /* To make up for not unlocking the freed runlock */ -+ preempt_enable(); -+} -+ -+static void __init share_rqs(void) -+{ -+ struct rq *rq, *leader; -+ int cpu; -+ -+ for_each_online_cpu(cpu) { -+ rq = cpu_rq(cpu); -+ leader = rq->smp_leader; -+ -+ rq_lock(rq); -+ if (leader && rq != leader) { -+ printk(KERN_INFO "MuQSS sharing SMP runqueue from CPU %d to CPU %d\n", -+ leader->cpu, rq->cpu); -+ share_and_free_rq(leader, rq); -+ } else -+ rq_unlock(rq); -+ } -+ -+#ifdef CONFIG_SCHED_MC -+ for_each_online_cpu(cpu) { -+ rq = cpu_rq(cpu); -+ leader = rq->mc_leader; -+ -+ rq_lock(rq); -+ if (leader && rq != leader) { -+ printk(KERN_INFO "MuQSS sharing MC runqueue from CPU %d to CPU %d\n", -+ leader->cpu, rq->cpu); -+ share_and_free_rq(leader, rq); -+ } else -+ rq_unlock(rq); -+ } -+#endif /* CONFIG_SCHED_MC */ -+ -+#ifdef CONFIG_SCHED_SMT -+ for_each_online_cpu(cpu) { -+ rq = cpu_rq(cpu); -+ leader = rq->smt_leader; -+ -+ rq_lock(rq); -+ if (leader && rq != leader) { -+ printk(KERN_INFO "MuQSS sharing SMT runqueue from CPU %d to CPU %d\n", -+ leader->cpu, rq->cpu); -+ share_and_free_rq(leader, rq); -+ } else -+ rq_unlock(rq); -+ } -+#endif /* CONFIG_SCHED_SMT */ -+} -+ -+static void __init setup_rq_orders(void) -+{ -+ int *selected_cpus, *ordered_cpus; -+ struct rq *rq, *other_rq; -+ int cpu, other_cpu, i; -+ -+ selected_cpus = kmalloc(sizeof(int) * NR_CPUS, GFP_ATOMIC); -+ ordered_cpus = kmalloc(sizeof(int) * NR_CPUS, GFP_ATOMIC); -+ -+ total_runqueues = 0; -+ for_each_online_cpu(cpu) { -+ int locality, total_rqs = 0, total_cpus = 0; -+ -+ rq = cpu_rq(cpu); -+ if (rq->is_leader) -+ total_runqueues++; -+ -+ for (locality = LOCALITY_SAME; locality <= LOCALITY_DISTANT; locality++) { -+ int 
selected_cpu_cnt, selected_cpu_idx, test_cpu_idx, cpu_idx, best_locality, test_cpu; -+ int ordered_cpus_idx; -+ -+ ordered_cpus_idx = -1; -+ selected_cpu_cnt = 0; -+ -+ for_each_online_cpu(test_cpu) { -+ if (cpu < num_online_cpus() / 2) -+ other_cpu = cpu + test_cpu; -+ else -+ other_cpu = cpu - test_cpu; -+ if (other_cpu < 0) -+ other_cpu += num_online_cpus(); -+ else -+ other_cpu %= num_online_cpus(); -+ /* gather CPUs of the same locality */ -+ if (rq->cpu_locality[other_cpu] == locality) { -+ selected_cpus[selected_cpu_cnt] = other_cpu; -+ selected_cpu_cnt++; -+ } -+ } -+ -+ /* reserve first CPU as starting point */ -+ if (selected_cpu_cnt > 0) { -+ ordered_cpus_idx++; -+ ordered_cpus[ordered_cpus_idx] = selected_cpus[ordered_cpus_idx]; -+ selected_cpus[ordered_cpus_idx] = -1; -+ } -+ -+ /* take each CPU and sort it within the same locality based on each inter-CPU localities */ -+ for(test_cpu_idx = 1; test_cpu_idx < selected_cpu_cnt; test_cpu_idx++) { -+ /* starting point with worst locality and current CPU */ -+ best_locality = LOCALITY_DISTANT; -+ selected_cpu_idx = test_cpu_idx; -+ -+ /* try to find the best locality within group */ -+ for(cpu_idx = 1; cpu_idx < selected_cpu_cnt; cpu_idx++) { -+ /* if CPU has not been used and locality is better */ -+ if (selected_cpus[cpu_idx] > -1) { -+ other_rq = cpu_rq(ordered_cpus[ordered_cpus_idx]); -+ if (best_locality > other_rq->cpu_locality[selected_cpus[cpu_idx]]) { -+ /* assign best locality and best CPU idx in array */ -+ best_locality = other_rq->cpu_locality[selected_cpus[cpu_idx]]; -+ selected_cpu_idx = cpu_idx; -+ } -+ } -+ } -+ -+ /* add our next best CPU to ordered list */ -+ ordered_cpus_idx++; -+ ordered_cpus[ordered_cpus_idx] = selected_cpus[selected_cpu_idx]; -+ /* mark this CPU as used */ -+ selected_cpus[selected_cpu_idx] = -1; -+ } -+ -+ /* set up RQ and CPU orders */ -+ for (test_cpu = 0; test_cpu <= ordered_cpus_idx; test_cpu++) { -+ other_rq = cpu_rq(ordered_cpus[test_cpu]); -+ /* set up cpu 
orders */ -+ rq->cpu_order[total_cpus++] = other_rq; -+ if (other_rq->is_leader) { -+ /* set up RQ orders */ -+ rq->rq_order[total_rqs++] = other_rq; -+ } -+ } -+ } -+ } -+ -+ kfree(selected_cpus); -+ kfree(ordered_cpus); -+ -+#ifdef CONFIG_X86 -+ for_each_online_cpu(cpu) { -+ rq = cpu_rq(cpu); -+ for (i = 0; i < total_runqueues; i++) { -+ printk(KERN_DEBUG "MuQSS CPU %d llc %d RQ order %d RQ %d llc %d\n", cpu, per_cpu(cpu_llc_id, cpu), i, -+ rq->rq_order[i]->cpu, per_cpu(cpu_llc_id, rq->rq_order[i]->cpu)); -+ } -+ } -+ -+ for_each_online_cpu(cpu) { -+ rq = cpu_rq(cpu); -+ for (i = 0; i < num_online_cpus(); i++) { -+ printk(KERN_DEBUG "MuQSS CPU %d llc %d CPU order %d RQ %d llc %d\n", cpu, per_cpu(cpu_llc_id, cpu), i, -+ rq->cpu_order[i]->cpu, per_cpu(cpu_llc_id, rq->cpu_order[i]->cpu)); -+ } -+ } -+#endif -+} -+ -+void __init sched_init_smp(void) -+{ -+ sched_init_numa(); -+ -+ /* -+ * There's no userspace yet to cause hotplug operations; hence all the -+ * cpu masks are stable and all blatant races in the below code cannot -+ * happen. 
-+ */ -+ mutex_lock(&sched_domains_mutex); -+ sched_init_domains(cpu_active_mask); -+ mutex_unlock(&sched_domains_mutex); -+ -+ /* Move init over to a non-isolated CPU */ -+ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) -+ BUG(); -+ -+ local_irq_disable(); -+ mutex_lock(&sched_domains_mutex); -+ lock_all_rqs(); -+ -+ printk(KERN_INFO "MuQSS possible/present/online CPUs: %d/%d/%d\n", -+ num_possible_cpus(), num_present_cpus(), num_online_cpus()); -+ -+ select_leaders(); -+ -+ unlock_all_rqs(); -+ mutex_unlock(&sched_domains_mutex); -+ -+ share_rqs(); -+ -+ local_irq_enable(); -+ -+ setup_rq_orders(); -+ -+ switch (rqshare) { -+ case RQSHARE_ALL: -+ /* This should only ever read 1 */ -+ printk(KERN_INFO "MuQSS runqueue share type ALL total runqueues: %d\n", -+ total_runqueues); -+ break; -+ case RQSHARE_SMP: -+ printk(KERN_INFO "MuQSS runqueue share type SMP total runqueues: %d\n", -+ total_runqueues); -+ break; -+ case RQSHARE_MC: -+ printk(KERN_INFO "MuQSS runqueue share type MC total runqueues: %d\n", -+ total_runqueues); -+ break; -+ case RQSHARE_MC_LLC: -+ printk(KERN_INFO "MuQSS runqueue share type LLC total runqueues: %d\n", -+ total_runqueues); -+ break; -+ case RQSHARE_SMT: -+ printk(KERN_INFO "MuQSS runqueue share type SMT total runqueues: %d\n", -+ total_runqueues); -+ break; -+ case RQSHARE_NONE: -+ printk(KERN_INFO "MuQSS runqueue share type NONE total runqueues: %d\n", -+ total_runqueues); -+ break; -+ } -+ -+ sched_smp_initialized = true; -+} -+#else -+void __init sched_init_smp(void) -+{ -+ sched_smp_initialized = true; -+} -+#endif /* CONFIG_SMP */ -+ -+int in_sched_functions(unsigned long addr) -+{ -+ return in_lock_functions(addr) || -+ (addr >= (unsigned long)__sched_text_start -+ && addr < (unsigned long)__sched_text_end); -+} -+ -+#ifdef CONFIG_CGROUP_SCHED -+/* task group related information */ -+struct task_group { -+ struct cgroup_subsys_state css; -+ -+ struct rcu_head rcu; -+ struct list_head list; -+ -+ 
struct task_group *parent; -+ struct list_head siblings; -+ struct list_head children; -+}; -+ -+/* -+ * Default task group. -+ * Every task in system belongs to this group at bootup. -+ */ -+struct task_group root_task_group; -+LIST_HEAD(task_groups); -+ -+/* Cacheline aligned slab cache for task_group */ -+static struct kmem_cache *task_group_cache __read_mostly; -+#endif /* CONFIG_CGROUP_SCHED */ -+ -+void __init sched_init(void) -+{ -+#ifdef CONFIG_SMP -+ int cpu_ids; -+#endif -+ int i; -+ struct rq *rq; -+ -+ wait_bit_init(); -+ -+ prio_ratios[0] = 128; -+ for (i = 1 ; i < NICE_WIDTH ; i++) -+ prio_ratios[i] = prio_ratios[i - 1] * 11 / 10; -+ -+ skiplist_node_init(&init_task.node); -+ -+#ifdef CONFIG_SMP -+ init_defrootdomain(); -+ cpumask_clear(&cpu_idle_map); -+#else -+ uprq = &per_cpu(runqueues, 0); -+#endif -+ -+#ifdef CONFIG_CGROUP_SCHED -+ task_group_cache = KMEM_CACHE(task_group, 0); -+ -+ list_add(&root_task_group.list, &task_groups); -+ INIT_LIST_HEAD(&root_task_group.children); -+ INIT_LIST_HEAD(&root_task_group.siblings); -+#endif /* CONFIG_CGROUP_SCHED */ -+ for_each_possible_cpu(i) { -+ rq = cpu_rq(i); -+ rq->node = kmalloc(sizeof(skiplist_node), GFP_ATOMIC); -+ skiplist_init(rq->node); -+ rq->sl = new_skiplist(rq->node); -+ rq->lock = kmalloc(sizeof(raw_spinlock_t), GFP_ATOMIC); -+ raw_spin_lock_init(rq->lock); -+ rq->nr_running = 0; -+ rq->nr_uninterruptible = 0; -+ rq->nr_switches = 0; -+ rq->clock = rq->old_clock = rq->last_niffy = rq->niffies = 0; -+ rq->last_jiffy = jiffies; -+ rq->user_ns = rq->nice_ns = rq->softirq_ns = rq->system_ns = -+ rq->iowait_ns = rq->idle_ns = 0; -+ rq->dither = 0; -+ set_rq_task(rq, &init_task); -+ rq->iso_ticks = 0; -+ rq->iso_refractory = false; -+#ifdef CONFIG_SMP -+ rq->is_leader = true; -+ rq->smp_leader = NULL; -+#ifdef CONFIG_SCHED_MC -+ rq->mc_leader = NULL; -+#endif -+#ifdef CONFIG_SCHED_SMT -+ rq->smt_leader = NULL; -+#endif -+ rq->sd = NULL; -+ rq->rd = NULL; -+ rq->online = false; -+ rq->cpu = i; -+ 
rq_attach_root(rq, &def_root_domain); -+#endif -+ init_rq_hrexpiry(rq); -+ atomic_set(&rq->nr_iowait, 0); -+ } -+ -+#ifdef CONFIG_SMP -+ cpu_ids = i; -+ /* -+ * Set the base locality for cpu cache distance calculation to -+ * "distant" (3). Make sure the distance from a CPU to itself is 0. -+ */ -+ for_each_possible_cpu(i) { -+ int j; -+ -+ rq = cpu_rq(i); -+#ifdef CONFIG_SCHED_SMT -+ rq->siblings_idle = sole_cpu_idle; -+#endif -+#ifdef CONFIG_SCHED_MC -+ rq->cache_idle = sole_cpu_idle; -+#endif -+ rq->cpu_locality = kmalloc(cpu_ids * sizeof(int *), GFP_ATOMIC); -+ for_each_possible_cpu(j) { -+ if (i == j) -+ rq->cpu_locality[j] = LOCALITY_SAME; -+ else -+ rq->cpu_locality[j] = LOCALITY_DISTANT; -+ } -+ rq->rq_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC); -+ rq->cpu_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC); -+ rq->rq_order[0] = rq->cpu_order[0] = rq; -+ for (j = 1; j < cpu_ids; j++) -+ rq->rq_order[j] = rq->cpu_order[j] = cpu_rq(j); -+ } -+#endif -+ -+ /* -+ * The boot idle thread does lazy MMU switching as well: -+ */ -+ mmgrab(&init_mm); -+ enter_lazy_tlb(&init_mm, current); -+ -+ /* -+ * Make us the idle thread. Technically, schedule() should not be -+ * called from this thread, however somewhere below it might be, -+ * but because we are the idle thread, we just pick up running again -+ * when this runqueue becomes "idle". 
-+ */ -+ init_idle(current, smp_processor_id()); -+ -+#ifdef CONFIG_SMP -+ idle_thread_set_boot_cpu(); -+#endif /* SMP */ -+ -+ init_schedstats(); -+ -+ psi_init(); -+} -+ -+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP -+static inline int preempt_count_equals(int preempt_offset) -+{ -+ int nested = preempt_count() + rcu_preempt_depth(); -+ -+ return (nested == preempt_offset); -+} -+ -+void __might_sleep(const char *file, int line, int preempt_offset) -+{ -+ /* -+ * Blocking primitives will set (and therefore destroy) current->state, -+ * since we will exit with TASK_RUNNING make sure we enter with it, -+ * otherwise we will destroy state. -+ */ -+ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, -+ "do not call blocking ops when !TASK_RUNNING; " -+ "state=%lx set at [<%p>] %pS\n", -+ current->state, -+ (void *)current->task_state_change, -+ (void *)current->task_state_change); -+ -+ ___might_sleep(file, line, preempt_offset); -+} -+EXPORT_SYMBOL(__might_sleep); -+ -+void __cant_sleep(const char *file, int line, int preempt_offset) -+{ -+ static unsigned long prev_jiffy; -+ -+ if (irqs_disabled()) -+ return; -+ -+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) -+ return; -+ -+ if (preempt_count() > preempt_offset) -+ return; -+ -+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) -+ return; -+ prev_jiffy = jiffies; -+ -+ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); -+ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", -+ in_atomic(), irqs_disabled(), -+ current->pid, current->comm); -+ -+ debug_show_held_locks(current); -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+EXPORT_SYMBOL_GPL(__cant_sleep); -+ -+void ___might_sleep(const char *file, int line, int preempt_offset) -+{ -+ /* Ratelimiting timestamp: */ -+ static unsigned long prev_jiffy; -+ -+ unsigned long preempt_disable_ip; -+ -+ /* WARN_ON_ONCE() by default, no rate limit required: */ -+ rcu_sleep_check(); -+ -+ if 
((preempt_count_equals(preempt_offset) && !irqs_disabled() && -+ !is_idle_task(current) && !current->non_block_count) || -+ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || -+ oops_in_progress) -+ return; -+ -+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) -+ return; -+ prev_jiffy = jiffies; -+ -+ /* Save this before calling printk(), since that will clobber it: */ -+ preempt_disable_ip = get_preempt_disable_ip(current); -+ -+ printk(KERN_ERR -+ "BUG: sleeping function called from invalid context at %s:%d\n", -+ file, line); -+ printk(KERN_ERR -+ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", -+ in_atomic(), irqs_disabled(), current->non_block_count, -+ current->pid, current->comm); -+ -+ if (task_stack_end_corrupted(current)) -+ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); -+ -+ debug_show_held_locks(current); -+ if (irqs_disabled()) -+ print_irqtrace_events(current); -+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) -+ && !preempt_count_equals(preempt_offset)) { -+ pr_err("Preemption disabled at:"); -+ print_ip_sym(preempt_disable_ip); -+ pr_cont("\n"); -+ } -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+EXPORT_SYMBOL(___might_sleep); -+#endif -+ -+#ifdef CONFIG_MAGIC_SYSRQ -+static inline void normalise_rt_tasks(void) -+{ -+ struct sched_attr attr = {}; -+ struct task_struct *g, *p; -+ struct rq_flags rf; -+ struct rq *rq; -+ -+ read_lock(&tasklist_lock); -+ for_each_process_thread(g, p) { -+ /* -+ * Only normalize user tasks: -+ */ -+ if (p->flags & PF_KTHREAD) -+ continue; -+ -+ if (!rt_task(p) && !iso_task(p)) -+ continue; -+ -+ rq = task_rq_lock(p, &rf); -+ __setscheduler(p, rq, SCHED_NORMAL, 0, &attr, false); -+ task_rq_unlock(rq, p, &rf); -+ } -+ read_unlock(&tasklist_lock); -+} -+ -+void normalize_rt_tasks(void) -+{ -+ normalise_rt_tasks(); -+} -+#endif /* CONFIG_MAGIC_SYSRQ */ -+ -+#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) -+/* -+ * These functions are only 
useful for the IA64 MCA handling, or kdb. -+ * -+ * They can only be called when the whole system has been -+ * stopped - every CPU needs to be quiescent, and no scheduling -+ * activity can take place. Using them for anything else would -+ * be a serious bug, and as a result, they aren't even visible -+ * under any other configuration. -+ */ -+ -+/** -+ * curr_task - return the current task for a given CPU. -+ * @cpu: the processor in question. -+ * -+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! -+ * -+ * Return: The current task for @cpu. -+ */ -+struct task_struct *curr_task(int cpu) -+{ -+ return cpu_curr(cpu); -+} -+ -+#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ -+ -+#ifdef CONFIG_IA64 -+/** -+ * ia64_set_curr_task - set the current task for a given CPU. -+ * @cpu: the processor in question. -+ * @p: the task pointer to set. -+ * -+ * Description: This function must only be used when non-maskable interrupts -+ * are serviced on a separate stack. It allows the architecture to switch the -+ * notion of the current task on a CPU in a non-blocking manner. This function -+ * must be called with all CPU's synchronised, and interrupts disabled, the -+ * and caller must save the original value of the current task (see -+ * curr_task() above) and restore that value before reenabling interrupts and -+ * re-starting the system. -+ * -+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 
-+ */ -+void ia64_set_curr_task(int cpu, struct task_struct *p) -+{ -+ cpu_curr(cpu) = p; -+} -+ -+#endif -+ -+void init_idle_bootup_task(struct task_struct *idle) -+{} -+ -+#ifdef CONFIG_SCHED_DEBUG -+__read_mostly bool sched_debug_enabled; -+ -+void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, -+ struct seq_file *m) -+{ -+ seq_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), -+ get_nr_threads(p)); -+} -+ -+void proc_sched_set_task(struct task_struct *p) -+{} -+#endif -+ -+#ifdef CONFIG_CGROUP_SCHED -+static void sched_free_group(struct task_group *tg) -+{ -+ kmem_cache_free(task_group_cache, tg); -+} -+ -+/* allocate runqueue etc for a new task group */ -+struct task_group *sched_create_group(struct task_group *parent) -+{ -+ struct task_group *tg; -+ -+ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); -+ if (!tg) -+ return ERR_PTR(-ENOMEM); -+ -+ return tg; -+} -+ -+void sched_online_group(struct task_group *tg, struct task_group *parent) -+{ -+} -+ -+/* rcu callback to free various structures associated with a task group */ -+static void sched_free_group_rcu(struct rcu_head *rhp) -+{ -+ /* Now it should be safe to free those cfs_rqs */ -+ sched_free_group(container_of(rhp, struct task_group, rcu)); -+} -+ -+void sched_destroy_group(struct task_group *tg) -+{ -+ /* Wait for possible concurrent references to cfs_rqs complete */ -+ call_rcu(&tg->rcu, sched_free_group_rcu); -+} -+ -+void sched_offline_group(struct task_group *tg) -+{ -+} -+ -+static inline struct task_group *css_tg(struct cgroup_subsys_state *css) -+{ -+ return css ? 
container_of(css, struct task_group, css) : NULL; -+} -+ -+static struct cgroup_subsys_state * -+cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) -+{ -+ struct task_group *parent = css_tg(parent_css); -+ struct task_group *tg; -+ -+ if (!parent) { -+ /* This is early initialization for the top cgroup */ -+ return &root_task_group.css; -+ } -+ -+ tg = sched_create_group(parent); -+ if (IS_ERR(tg)) -+ return ERR_PTR(-ENOMEM); -+ return &tg->css; -+} -+ -+/* Expose task group only after completing cgroup initialization */ -+static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ struct task_group *parent = css_tg(css->parent); -+ -+ if (parent) -+ sched_online_group(tg, parent); -+ return 0; -+} -+ -+static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ -+ sched_offline_group(tg); -+} -+ -+static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ -+ /* -+ * Relies on the RCU grace period between css_released() and this. 
-+ */ -+ sched_free_group(tg); -+} -+ -+static void cpu_cgroup_fork(struct task_struct *task) -+{ -+} -+ -+static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) -+{ -+ return 0; -+} -+ -+static void cpu_cgroup_attach(struct cgroup_taskset *tset) -+{ -+} -+ -+static struct cftype cpu_legacy_files[] = { -+ { } /* Terminate */ -+}; -+ -+static struct cftype cpu_files[] = { -+ { } /* terminate */ -+}; -+ -+static int cpu_extra_stat_show(struct seq_file *sf, -+ struct cgroup_subsys_state *css) -+{ -+ return 0; -+} -+ -+struct cgroup_subsys cpu_cgrp_subsys = { -+ .css_alloc = cpu_cgroup_css_alloc, -+ .css_online = cpu_cgroup_css_online, -+ .css_released = cpu_cgroup_css_released, -+ .css_free = cpu_cgroup_css_free, -+ .css_extra_stat_show = cpu_extra_stat_show, -+ .fork = cpu_cgroup_fork, -+ .can_attach = cpu_cgroup_can_attach, -+ .attach = cpu_cgroup_attach, -+ .legacy_cftypes = cpu_files, -+ .legacy_cftypes = cpu_legacy_files, -+ .dfl_cftypes = cpu_files, -+ .early_init = true, -+ .threaded = true, -+}; -+#endif /* CONFIG_CGROUP_SCHED */ -+ -+#undef CREATE_TRACE_POINTS -diff --git a/kernel/sched/MuQSS.h b/kernel/sched/MuQSS.h -new file mode 100644 -index 000000000000..b34f2797e44f ---- /dev/null -+++ b/kernel/sched/MuQSS.h -@@ -0,0 +1,1056 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef MUQSS_SCHED_H -+#define MUQSS_SCHED_H -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#ifdef CONFIG_PARAVIRT -+#include -+#endif -+ -+#include "cpupri.h" -+ -+#ifdef CONFIG_SCHED_DEBUG -+# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) -+#else -+# define 
SCHED_WARN_ON(x) ((void)(x)) -+#endif -+ -+/* -+ * wake flags -+ */ -+#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ -+#define WF_FORK 0x02 /* child wakeup after fork */ -+#define WF_MIGRATED 0x04 /* internal use, task got migrated */ -+ -+/* task_struct::on_rq states: */ -+#define TASK_ON_RQ_QUEUED 1 -+#define TASK_ON_RQ_MIGRATING 2 -+ -+struct rq; -+ -+#ifdef CONFIG_SMP -+ -+static inline bool sched_asym_prefer(int a, int b) -+{ -+ return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b); -+} -+ -+struct perf_domain { -+ struct em_perf_domain *em_pd; -+ struct perf_domain *next; -+ struct rcu_head rcu; -+}; -+ -+/* Scheduling group status flags */ -+#define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */ -+#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */ -+ -+/* -+ * We add the notion of a root-domain which will be used to define per-domain -+ * variables. Each exclusive cpuset essentially defines an island domain by -+ * fully partitioning the member cpus from any other cpuset. Whenever a new -+ * exclusive cpuset is created, we also create and attach a new root-domain -+ * object. -+ * -+ */ -+struct root_domain { -+ atomic_t refcount; -+ atomic_t rto_count; -+ struct rcu_head rcu; -+ cpumask_var_t span; -+ cpumask_var_t online; -+ -+ /* -+ * Indicate pullable load on at least one CPU, e.g: -+ * - More than one runnable task -+ * - Running task is misfit -+ */ -+ int overload; -+ -+ /* Indicate one or more cpus over-utilized (tipping point) */ -+ int overutilized; -+ -+ /* -+ * The bit corresponding to a CPU gets set here if such CPU has more -+ * than one runnable -deadline task (as it is below for RT tasks). -+ */ -+ cpumask_var_t dlo_mask; -+ atomic_t dlo_count; -+ /* Replace unused CFS structures with void */ -+ //struct dl_bw dl_bw; -+ //struct cpudl cpudl; -+ void *dl_bw; -+ void *cpudl; -+ -+ /* -+ * The "RT overload" flag: it gets set if a CPU has more than -+ * one runnable RT task. 
-+ */ -+ cpumask_var_t rto_mask; -+ //struct cpupri cpupri; -+ void *cpupri; -+ -+ unsigned long max_cpu_capacity; -+ -+ /* -+ * NULL-terminated list of performance domains intersecting with the -+ * CPUs of the rd. Protected by RCU. -+ */ -+ struct perf_domain *pd; -+}; -+ -+extern void init_defrootdomain(void); -+extern int sched_init_domains(const struct cpumask *cpu_map); -+extern void rq_attach_root(struct rq *rq, struct root_domain *rd); -+ -+static inline void cpupri_cleanup(void __maybe_unused *cpupri) -+{ -+} -+ -+static inline void cpudl_cleanup(void __maybe_unused *cpudl) -+{ -+} -+ -+static inline void init_dl_bw(void __maybe_unused *dl_bw) -+{ -+} -+ -+static inline int cpudl_init(void __maybe_unused *dl_bw) -+{ -+ return 0; -+} -+ -+static inline int cpupri_init(void __maybe_unused *cpupri) -+{ -+ return 0; -+} -+#endif /* CONFIG_SMP */ -+ -+/* -+ * This is the main, per-CPU runqueue data structure. -+ * This data should only be modified by the local cpu. -+ */ -+struct rq { -+ raw_spinlock_t *lock; -+ raw_spinlock_t *orig_lock; -+ -+ struct task_struct __rcu *curr; -+ struct task_struct *idle; -+ struct task_struct *stop; -+ struct mm_struct *prev_mm; -+ -+ unsigned int nr_running; -+ /* -+ * This is part of a global counter where only the total sum -+ * over all CPUs matters. A task can increase this counter on -+ * one CPU and if it got migrated afterwards it may decrease -+ * it on another CPU. 
Always updated under the runqueue lock: -+ */ -+ unsigned long nr_uninterruptible; -+ u64 nr_switches; -+ -+ /* Stored data about rq->curr to work outside rq lock */ -+ u64 rq_deadline; -+ int rq_prio; -+ -+ /* Best queued id for use outside lock */ -+ u64 best_key; -+ -+ unsigned long last_scheduler_tick; /* Last jiffy this RQ ticked */ -+ unsigned long last_jiffy; /* Last jiffy this RQ updated rq clock */ -+ u64 niffies; /* Last time this RQ updated rq clock */ -+ u64 last_niffy; /* Last niffies as updated by local clock */ -+ u64 last_jiffy_niffies; /* Niffies @ last_jiffy */ -+ -+ u64 load_update; /* When we last updated load */ -+ unsigned long load_avg; /* Rolling load average */ -+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ -+ u64 irq_load_update; /* When we last updated IRQ load */ -+ unsigned long irq_load_avg; /* Rolling IRQ load average */ -+#endif -+#ifdef CONFIG_SMT_NICE -+ struct mm_struct *rq_mm; -+ int rq_smt_bias; /* Policy/nice level bias across smt siblings */ -+#endif -+ /* Accurate timekeeping data */ -+ unsigned long user_ns, nice_ns, irq_ns, softirq_ns, system_ns, -+ iowait_ns, idle_ns; -+ atomic_t nr_iowait; -+ -+#ifdef CONFIG_MEMBARRIER -+ int membarrier_state; -+#endif -+ -+ skiplist_node *node; -+ skiplist *sl; -+#ifdef CONFIG_SMP -+ struct task_struct *preempt; /* Preempt triggered on this task */ -+ struct task_struct *preempting; /* Hint only, what task is preempting */ -+ -+ int cpu; /* cpu of this runqueue */ -+ bool online; -+ -+ struct root_domain *rd; -+ struct sched_domain *sd; -+ -+ unsigned long cpu_capacity_orig; -+ -+ int *cpu_locality; /* CPU relative cache distance */ -+ struct rq **rq_order; /* Shared RQs ordered by relative cache distance */ -+ struct rq **cpu_order; /* RQs of discrete CPUs ordered by distance */ -+ -+ bool is_leader; -+ struct rq *smp_leader; /* First physical CPU per node */ -+#ifdef CONFIG_SCHED_THERMAL_PRESSURE -+ struct sched_avg avg_thermal; -+#endif /* CONFIG_SCHED_THERMAL_PRESSURE */ -+#ifdef 
CONFIG_SCHED_SMT -+ struct rq *smt_leader; /* First logical CPU in SMT siblings */ -+ cpumask_t thread_mask; -+ bool (*siblings_idle)(struct rq *rq); -+ /* See if all smt siblings are idle */ -+#endif /* CONFIG_SCHED_SMT */ -+#ifdef CONFIG_SCHED_MC -+ struct rq *mc_leader; /* First logical CPU in MC siblings */ -+ cpumask_t core_mask; -+ bool (*cache_idle)(struct rq *rq); -+ /* See if all cache siblings are idle */ -+#endif /* CONFIG_SCHED_MC */ -+#endif /* CONFIG_SMP */ -+ -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+ u64 prev_irq_time; -+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ -+#ifdef CONFIG_PARAVIRT -+ u64 prev_steal_time; -+#endif /* CONFIG_PARAVIRT */ -+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING -+ u64 prev_steal_time_rq; -+#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ -+ -+ u64 clock, old_clock, last_tick; -+ /* Ensure that all clocks are in the same cache line */ -+ u64 clock_task ____cacheline_aligned; -+ int dither; -+ -+ int iso_ticks; -+ bool iso_refractory; -+ -+#ifdef CONFIG_HIGH_RES_TIMERS -+ struct hrtimer hrexpiry_timer; -+#endif -+ -+ int rt_nr_running; /* Number real time tasks running */ -+#ifdef CONFIG_SCHEDSTATS -+ -+ /* latency stats */ -+ struct sched_info rq_sched_info; -+ unsigned long long rq_cpu_time; -+ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? 
*/ -+ -+ /* sys_sched_yield() stats */ -+ unsigned int yld_count; -+ -+ /* schedule() stats */ -+ unsigned int sched_switch; -+ unsigned int sched_count; -+ unsigned int sched_goidle; -+ -+ /* try_to_wake_up() stats */ -+ unsigned int ttwu_count; -+ unsigned int ttwu_local; -+#endif /* CONFIG_SCHEDSTATS */ -+ -+#ifdef CONFIG_SMP -+ struct llist_head wake_list; -+#endif -+ -+#ifdef CONFIG_CPU_IDLE -+ /* Must be inspected within a rcu lock section */ -+ struct cpuidle_state *idle_state; -+#endif -+}; -+ -+static inline u64 __rq_clock_broken(struct rq *rq) -+{ -+ return READ_ONCE(rq->clock); -+} -+ -+static inline u64 rq_clock(struct rq *rq) -+{ -+ lockdep_assert_held(rq->lock); -+ -+ return rq->clock; -+} -+ -+static inline u64 rq_clock_task(struct rq *rq) -+{ -+ lockdep_assert_held(rq->lock); -+ -+ return rq->clock_task; -+} -+ -+/** -+ * By default the decay is the default pelt decay period. -+ * The decay shift can change the decay period in -+ * multiples of 32. -+ * Decay shift Decay period(ms) -+ * 0 32 -+ * 1 64 -+ * 2 128 -+ * 3 256 -+ * 4 512 -+ */ -+extern int sched_thermal_decay_shift; -+ -+static inline u64 rq_clock_thermal(struct rq *rq) -+{ -+ return rq_clock_task(rq) >> sched_thermal_decay_shift; -+} -+ -+struct rq_flags { -+ unsigned long flags; -+}; -+ -+#ifdef CONFIG_SMP -+struct rq *cpu_rq(int cpu); -+#endif -+ -+#ifndef CONFIG_SMP -+extern struct rq *uprq; -+#define cpu_rq(cpu) (uprq) -+#define this_rq() (uprq) -+#define raw_rq() (uprq) -+#define task_rq(p) (uprq) -+#define cpu_curr(cpu) ((uprq)->curr) -+#else /* CONFIG_SMP */ -+DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -+#define this_rq() this_cpu_ptr(&runqueues) -+#define raw_rq() raw_cpu_ptr(&runqueues) -+#define task_rq(p) cpu_rq(task_cpu(p)) -+#endif /* CONFIG_SMP */ -+ -+static inline int task_current(struct rq *rq, struct task_struct *p) -+{ -+ return rq->curr == p; -+} -+ -+static inline int task_running(struct rq *rq, struct task_struct *p) -+{ -+#ifdef CONFIG_SMP -+ return 
p->on_cpu; -+#else -+ return task_current(rq, p); -+#endif -+} -+ -+static inline int task_on_rq_queued(struct task_struct *p) -+{ -+ return p->on_rq == TASK_ON_RQ_QUEUED; -+} -+ -+static inline int task_on_rq_migrating(struct task_struct *p) -+{ -+ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; -+} -+ -+static inline void rq_lock(struct rq *rq) -+ __acquires(rq->lock) -+{ -+ raw_spin_lock(rq->lock); -+} -+ -+static inline void rq_unlock(struct rq *rq) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock(rq->lock); -+} -+ -+static inline void rq_lock_irq(struct rq *rq) -+ __acquires(rq->lock) -+{ -+ raw_spin_lock_irq(rq->lock); -+} -+ -+static inline void rq_unlock_irq(struct rq *rq, struct rq_flags __always_unused *rf) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock_irq(rq->lock); -+} -+ -+static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) -+ __acquires(rq->lock) -+{ -+ raw_spin_lock_irqsave(rq->lock, rf->flags); -+} -+ -+static inline void rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock_irqrestore(rq->lock, rf->flags); -+} -+ -+static inline struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(p->pi_lock) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ while (42) { -+ raw_spin_lock_irqsave(&p->pi_lock, rf->flags); -+ rq = task_rq(p); -+ raw_spin_lock(rq->lock); -+ if (likely(rq == task_rq(p))) -+ break; -+ raw_spin_unlock(rq->lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); -+ } -+ return rq; -+} -+ -+static inline void task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) -+ __releases(rq->lock) -+ __releases(p->pi_lock) -+{ -+ rq_unlock(rq); -+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); -+} -+ -+static inline struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags __always_unused *rf) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ lockdep_assert_held(&p->pi_lock); -+ -+ while (42) { -+ rq = 
task_rq(p); -+ raw_spin_lock(rq->lock); -+ if (likely(rq == task_rq(p))) -+ break; -+ raw_spin_unlock(rq->lock); -+ } -+ return rq; -+} -+ -+static inline void __task_rq_unlock(struct rq *rq, struct rq_flags __always_unused *rf) -+{ -+ rq_unlock(rq); -+} -+ -+static inline struct rq * -+this_rq_lock_irq(struct rq_flags *rf) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ local_irq_disable(); -+ rq = this_rq(); -+ rq_lock(rq); -+ return rq; -+} -+ -+/* -+ * {de,en}queue flags: Most not used on MuQSS. -+ * -+ * DEQUEUE_SLEEP - task is no longer runnable -+ * ENQUEUE_WAKEUP - task just became runnable -+ * -+ * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks -+ * are in a known state which allows modification. Such pairs -+ * should preserve as much state as possible. -+ * -+ * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location -+ * in the runqueue. -+ * -+ * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) -+ * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) -+ * ENQUEUE_MIGRATED - the task was migrated during wakeup -+ * -+ */ -+ -+#define DEQUEUE_SLEEP 0x01 -+#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ -+ -+#define ENQUEUE_WAKEUP 0x01 -+#define ENQUEUE_RESTORE 0x02 -+ -+#ifdef CONFIG_SMP -+#define ENQUEUE_MIGRATED 0x40 -+#else -+#define ENQUEUE_MIGRATED 0x00 -+#endif -+ -+#ifdef CONFIG_NUMA -+enum numa_topology_type { -+ NUMA_DIRECT, -+ NUMA_GLUELESS_MESH, -+ NUMA_BACKPLANE, -+}; -+extern enum numa_topology_type sched_numa_topology_type; -+extern int sched_max_numa_distance; -+extern bool find_numa_distance(int distance); -+extern void sched_init_numa(void); -+extern void sched_domains_numa_masks_set(unsigned int cpu); -+extern void sched_domains_numa_masks_clear(unsigned int cpu); -+extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); -+#else -+static inline void sched_init_numa(void) { } -+static inline void 
sched_domains_numa_masks_set(unsigned int cpu) { } -+static inline void sched_domains_numa_masks_clear(unsigned int cpu) { } -+static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) -+{ -+ return nr_cpu_ids; -+} -+#endif -+ -+extern struct mutex sched_domains_mutex; -+extern struct static_key_false sched_schedstats; -+ -+#define rcu_dereference_check_sched_domain(p) \ -+ rcu_dereference_check((p), \ -+ lockdep_is_held(&sched_domains_mutex)) -+ -+#ifdef CONFIG_SMP -+ -+/* -+ * The domain tree (rq->sd) is protected by RCU's quiescent state transition. -+ * See destroy_sched_domains: call_rcu for details. -+ * -+ * The domain tree of any CPU may only be accessed from within -+ * preempt-disabled sections. -+ */ -+#define for_each_domain(cpu, __sd) \ -+ for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ -+ __sd; __sd = __sd->parent) -+ -+/** -+ * highest_flag_domain - Return highest sched_domain containing flag. -+ * @cpu: The cpu whose highest level of sched domain is to -+ * be returned. -+ * @flag: The flag to check for the highest sched_domain -+ * for the given cpu. -+ * -+ * Returns the highest sched_domain of a cpu which contains the given flag. 
-+ */ -+static inline struct sched_domain *highest_flag_domain(int cpu, int flag) -+{ -+ struct sched_domain *sd, *hsd = NULL; -+ -+ for_each_domain(cpu, sd) { -+ if (!(sd->flags & flag)) -+ break; -+ hsd = sd; -+ } -+ -+ return hsd; -+} -+ -+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) -+{ -+ struct sched_domain *sd; -+ -+ for_each_domain(cpu, sd) { -+ if (sd->flags & flag) -+ break; -+ } -+ -+ return sd; -+} -+ -+DECLARE_PER_CPU(struct sched_domain *, sd_llc); -+DECLARE_PER_CPU(int, sd_llc_size); -+DECLARE_PER_CPU(int, sd_llc_id); -+DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); -+DECLARE_PER_CPU(struct sched_domain *, sd_numa); -+DECLARE_PER_CPU(struct sched_domain *, sd_asym_packing); -+DECLARE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity); -+ -+struct sched_group_capacity { -+ atomic_t ref; -+ /* -+ * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity -+ * for a single CPU. -+ */ -+ unsigned long capacity; -+ unsigned long min_capacity; /* Min per-CPU capacity in group */ -+ unsigned long max_capacity; /* Max per-CPU capacity in group */ -+ unsigned long next_update; -+ int imbalance; /* XXX unrelated to capacity but shared group state */ -+ -+#ifdef CONFIG_SCHED_DEBUG -+ int id; -+#endif -+ -+ unsigned long cpumask[0]; /* balance mask */ -+}; -+ -+struct sched_group { -+ struct sched_group *next; /* Must be a circular list */ -+ atomic_t ref; -+ -+ unsigned int group_weight; -+ struct sched_group_capacity *sgc; -+ int asym_prefer_cpu; /* cpu of highest priority in group */ -+ -+ /* -+ * The CPUs this group covers. -+ * -+ * NOTE: this field is variable length. 
(Allocated dynamically -+ * by attaching extra space to the end of the structure, -+ * depending on how many CPUs the kernel has booted up with) -+ */ -+ unsigned long cpumask[0]; -+}; -+ -+static inline struct cpumask *sched_group_span(struct sched_group *sg) -+{ -+ return to_cpumask(sg->cpumask); -+} -+ -+/* -+ * See build_balance_mask(). -+ */ -+static inline struct cpumask *group_balance_mask(struct sched_group *sg) -+{ -+ return to_cpumask(sg->sgc->cpumask); -+} -+ -+/** -+ * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. -+ * @group: The group whose first cpu is to be returned. -+ */ -+static inline unsigned int group_first_cpu(struct sched_group *group) -+{ -+ return cpumask_first(sched_group_span(group)); -+} -+ -+ -+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) -+void register_sched_domain_sysctl(void); -+void dirty_sched_domain_sysctl(int cpu); -+void unregister_sched_domain_sysctl(void); -+#else -+static inline void register_sched_domain_sysctl(void) -+{ -+} -+static inline void dirty_sched_domain_sysctl(int cpu) -+{ -+} -+static inline void unregister_sched_domain_sysctl(void) -+{ -+} -+#endif -+ -+extern void sched_ttwu_pending(void); -+extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); -+extern void set_rq_online (struct rq *rq); -+extern void set_rq_offline(struct rq *rq); -+extern bool sched_smp_initialized; -+ -+static inline void update_group_capacity(struct sched_domain *sd, int cpu) -+{ -+} -+ -+static inline void trigger_load_balance(struct rq *rq) -+{ -+} -+ -+#define sched_feat(x) 0 -+ -+#else /* CONFIG_SMP */ -+ -+static inline void sched_ttwu_pending(void) { } -+ -+#endif /* CONFIG_SMP */ -+ -+#ifdef CONFIG_CPU_IDLE -+static inline void idle_set_state(struct rq *rq, -+ struct cpuidle_state *idle_state) -+{ -+ rq->idle_state = idle_state; -+} -+ -+static inline struct cpuidle_state *idle_get_state(struct rq *rq) -+{ -+ SCHED_WARN_ON(!rcu_read_lock_held()); -+ 
return rq->idle_state; -+} -+#else -+static inline void idle_set_state(struct rq *rq, -+ struct cpuidle_state *idle_state) -+{ -+} -+ -+static inline struct cpuidle_state *idle_get_state(struct rq *rq) -+{ -+ return NULL; -+} -+#endif -+ -+#ifdef CONFIG_SCHED_DEBUG -+extern bool sched_debug_enabled; -+#endif -+ -+extern void schedule_idle(void); -+ -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+struct irqtime { -+ u64 total; -+ u64 tick_delta; -+ u64 irq_start_time; -+ struct u64_stats_sync sync; -+}; -+ -+DECLARE_PER_CPU(struct irqtime, cpu_irqtime); -+ -+/* -+ * Returns the irqtime minus the softirq time computed by ksoftirqd. -+ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime -+ * and never move forward. -+ */ -+static inline u64 irq_time_read(int cpu) -+{ -+ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); -+ unsigned int seq; -+ u64 total; -+ -+ do { -+ seq = __u64_stats_fetch_begin(&irqtime->sync); -+ total = irqtime->total; -+ } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); -+ -+ return total; -+} -+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ -+ -+static inline bool sched_stop_runnable(struct rq *rq) -+{ -+ return rq->stop && task_on_rq_queued(rq->stop); -+} -+ -+#ifdef CONFIG_SMP -+static inline int cpu_of(struct rq *rq) -+{ -+ return rq->cpu; -+} -+#else /* CONFIG_SMP */ -+static inline int cpu_of(struct rq *rq) -+{ -+ return 0; -+} -+#endif -+ -+#ifdef CONFIG_CPU_FREQ -+DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); -+ -+static inline void cpufreq_trigger(struct rq *rq, unsigned int flags) -+{ -+ struct update_util_data *data; -+ -+ data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data, -+ cpu_of(rq))); -+ -+ if (data) -+ data->func(data, rq->niffies, flags); -+} -+#else -+static inline void cpufreq_trigger(struct rq *rq, unsigned int flag) -+{ -+} -+#endif /* CONFIG_CPU_FREQ */ -+ -+static __always_inline -+unsigned int uclamp_rq_util_with(struct rq __maybe_unused *rq, unsigned int util, -+ 
struct task_struct __maybe_unused *p) -+{ -+ return util; -+} -+ -+static inline bool uclamp_is_used(void) -+{ -+ return false; -+} -+ -+#ifndef arch_scale_freq_tick -+static __always_inline -+void arch_scale_freq_tick(void) -+{ -+} -+#endif -+ -+#ifdef arch_scale_freq_capacity -+#ifndef arch_scale_freq_invariant -+#define arch_scale_freq_invariant() (true) -+#endif -+#else /* arch_scale_freq_capacity */ -+#define arch_scale_freq_invariant() (false) -+#endif -+ -+#ifdef CONFIG_64BIT -+static inline u64 read_sum_exec_runtime(struct task_struct *t) -+{ -+ return tsk_seruntime(t); -+} -+#else -+static inline u64 read_sum_exec_runtime(struct task_struct *t) -+{ -+ struct rq_flags rf; -+ u64 ns; -+ struct rq *rq; -+ -+ rq = task_rq_lock(t, &rf); -+ ns = tsk_seruntime(t); -+ task_rq_unlock(rq, t, &rf); -+ -+ return ns; -+} -+#endif -+ -+#ifndef arch_scale_freq_capacity -+static __always_inline -+unsigned long arch_scale_freq_capacity(int cpu) -+{ -+ return SCHED_CAPACITY_SCALE; -+} -+#endif -+ -+#ifdef CONFIG_NO_HZ_FULL -+extern bool sched_can_stop_tick(struct rq *rq); -+extern int __init sched_tick_offload_init(void); -+ -+/* -+ * Tick may be needed by tasks in the runqueue depending on their policy and -+ * requirements. If tick is needed, lets send the target an IPI to kick it out of -+ * nohz mode if necessary. 
-+ */ -+static inline void sched_update_tick_dependency(struct rq *rq) -+{ -+ int cpu; -+ -+ if (!tick_nohz_full_enabled()) -+ return; -+ -+ cpu = cpu_of(rq); -+ -+ if (!tick_nohz_full_cpu(cpu)) -+ return; -+ -+ if (sched_can_stop_tick(rq)) -+ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); -+ else -+ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); -+} -+#else -+static inline int sched_tick_offload_init(void) { return 0; } -+static inline void sched_update_tick_dependency(struct rq *rq) { } -+#endif -+ -+#define SCHED_FLAG_SUGOV 0x10000000 -+ -+static inline bool rt_rq_is_runnable(struct rq *rt_rq) -+{ -+ return rt_rq->rt_nr_running; -+} -+ -+/** -+ * enum schedutil_type - CPU utilization type -+ * @FREQUENCY_UTIL: Utilization used to select frequency -+ * @ENERGY_UTIL: Utilization used during energy calculation -+ * -+ * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time -+ * need to be aggregated differently depending on the usage made of them. This -+ * enum is used within schedutil_freq_util() to differentiate the types of -+ * utilization expected by the callers, and adjust the aggregation accordingly. 
-+ */ -+enum schedutil_type { -+ FREQUENCY_UTIL, -+ ENERGY_UTIL, -+}; -+ -+#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL -+ -+unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, -+ unsigned long max, enum schedutil_type type, -+ struct task_struct *p); -+ -+static inline unsigned long cpu_bw_dl(struct rq *rq) -+{ -+ return 0; -+} -+ -+static inline unsigned long cpu_util_dl(struct rq *rq) -+{ -+ return 0; -+} -+ -+static inline unsigned long cpu_util_cfs(struct rq *rq) -+{ -+ unsigned long ret = READ_ONCE(rq->load_avg); -+ -+ if (ret > SCHED_CAPACITY_SCALE) -+ ret = SCHED_CAPACITY_SCALE; -+ return ret; -+} -+ -+static inline unsigned long cpu_util_rt(struct rq *rq) -+{ -+ unsigned long ret = READ_ONCE(rq->rt_nr_running); -+ -+ if (ret > SCHED_CAPACITY_SCALE) -+ ret = SCHED_CAPACITY_SCALE; -+ return ret; -+} -+ -+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ -+static inline unsigned long cpu_util_irq(struct rq *rq) -+{ -+ unsigned long ret = READ_ONCE(rq->irq_load_avg); -+ -+ if (ret > SCHED_CAPACITY_SCALE) -+ ret = SCHED_CAPACITY_SCALE; -+ return ret; -+} -+ -+static inline -+unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) -+{ -+ util *= (max - irq); -+ util /= max; -+ -+ return util; -+ -+} -+#else -+static inline unsigned long cpu_util_irq(struct rq *rq) -+{ -+ return 0; -+} -+ -+static inline -+unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) -+{ -+ return util; -+} -+#endif -+#endif -+ -+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) -+#define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus))) -+ -+DECLARE_STATIC_KEY_FALSE(sched_energy_present); -+ -+static inline bool sched_energy_enabled(void) -+{ -+ return static_branch_unlikely(&sched_energy_present); -+} -+ -+#else /* ! 
(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ -+ -+#define perf_domain_span(pd) NULL -+static inline bool sched_energy_enabled(void) { return false; } -+ -+#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ -+ -+#ifdef CONFIG_MEMBARRIER -+/* -+ * The scheduler provides memory barriers required by membarrier between: -+ * - prior user-space memory accesses and store to rq->membarrier_state, -+ * - store to rq->membarrier_state and following user-space memory accesses. -+ * In the same way it provides those guarantees around store to rq->curr. -+ */ -+static inline void membarrier_switch_mm(struct rq *rq, -+ struct mm_struct *prev_mm, -+ struct mm_struct *next_mm) -+{ -+ int membarrier_state; -+ -+ if (prev_mm == next_mm) -+ return; -+ -+ membarrier_state = atomic_read(&next_mm->membarrier_state); -+ if (READ_ONCE(rq->membarrier_state) == membarrier_state) -+ return; -+ -+ WRITE_ONCE(rq->membarrier_state, membarrier_state); -+} -+#else -+static inline void membarrier_switch_mm(struct rq *rq, -+ struct mm_struct *prev_mm, -+ struct mm_struct *next_mm) -+{ -+} -+#endif -+ -+#ifdef CONFIG_SMP -+static inline bool is_per_cpu_kthread(struct task_struct *p) -+{ -+ if (!(p->flags & PF_KTHREAD)) -+ return false; -+ -+ if (p->nr_cpus_allowed != 1) -+ return false; -+ -+ return true; -+} -+#endif -+ -+void swake_up_all_locked(struct swait_queue_head *q); -+void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); -+ -+/* pelt.h compat CONFIG_SCHED_THERMAL_PRESSURE impossible with MUQSS */ -+static inline int -+update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) -+{ -+ return 0; -+} -+ -+static inline u64 thermal_load_avg(struct rq *rq) -+{ -+ return 0; -+} -+ -+#endif /* MUQSS_SCHED_H */ -diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c -index 7fbaee24c824..15d274af9b1c 100644 ---- a/kernel/sched/cpufreq_schedutil.c -+++ b/kernel/sched/cpufreq_schedutil.c -@@ -183,6 +183,12 @@ static 
unsigned int get_next_freq(struct sugov_policy *sg_policy, - return cpufreq_driver_resolve_freq(policy, freq); - } - -+#ifdef CONFIG_SCHED_MUQSS -+#define rt_rq_runnable(rq_rt) rt_rq_is_runnable(rq) -+#else -+#define rt_rq_runnable(rq_rt) rt_rq_is_runnable(&rq->rt) -+#endif -+ - /* - * This function computes an effective utilization for the given CPU, to be - * used for frequency selection given the linear relation: f = u * f_max. -@@ -211,7 +217,7 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, - struct rq *rq = cpu_rq(cpu); - - if (!uclamp_is_used() && -- type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { -+ type == FREQUENCY_UTIL && rt_rq_runnable(rq)) { - return max; - } - -@@ -656,7 +662,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) - struct task_struct *thread; - struct sched_attr attr = { - .size = sizeof(struct sched_attr), -+#ifdef CONFIG_SCHED_MUQSS -+ .sched_policy = SCHED_RR, -+#else - .sched_policy = SCHED_DEADLINE, -+#endif - .sched_flags = SCHED_FLAG_SUGOV, - .sched_nice = 0, - .sched_priority = 0, -diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h -index efbb492bb94c..f0288c32ab17 100644 ---- a/kernel/sched/cpupri.h -+++ b/kernel/sched/cpupri.h -@@ -17,6 +17,7 @@ struct cpupri { - int *cpu_to_pri; - }; - -+#ifndef CONFIG_SCHED_MUQSS - #ifdef CONFIG_SMP - int cpupri_find(struct cpupri *cp, struct task_struct *p, - struct cpumask *lowest_mask); -@@ -27,3 +28,4 @@ void cpupri_set(struct cpupri *cp, int cpu, int pri); - int cpupri_init(struct cpupri *cp); - void cpupri_cleanup(struct cpupri *cp); - #endif -+#endif -diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c -index ff9435dee1df..d7bd67204d65 100644 ---- a/kernel/sched/cputime.c -+++ b/kernel/sched/cputime.c -@@ -266,26 +266,6 @@ static inline u64 account_other_time(u64 max) - return accounted; - } - --#ifdef CONFIG_64BIT --static inline u64 read_sum_exec_runtime(struct task_struct *t) --{ -- return t->se.sum_exec_runtime; --} 
--#else --static u64 read_sum_exec_runtime(struct task_struct *t) --{ -- u64 ns; -- struct rq_flags rf; -- struct rq *rq; -- -- rq = task_rq_lock(t, &rf); -- ns = t->se.sum_exec_runtime; -- task_rq_unlock(rq, t, &rf); -- -- return ns; --} --#endif -- - /* - * Accumulate raw cputime values of dead tasks (sig->[us]time) and live - * tasks (sum on group iteration) belonging to @tsk's group. -@@ -658,7 +638,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, - void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) - { - struct task_cputime cputime = { -- .sum_exec_runtime = p->se.sum_exec_runtime, -+ .sum_exec_runtime = tsk_seruntime(p), - }; - - task_cputime(p, &cputime.utime, &cputime.stime); -diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c -index b743bf38f08f..c769795d726b 100644 ---- a/kernel/sched/idle.c -+++ b/kernel/sched/idle.c -@@ -361,6 +361,7 @@ void cpu_startup_entry(enum cpuhp_state state) - do_idle(); - } - -+#ifndef CONFIG_SCHED_MUQSS - /* - * idle-task scheduling class. 
- */ -@@ -481,3 +482,4 @@ const struct sched_class idle_sched_class = { - .switched_to = switched_to_idle, - .update_curr = update_curr_idle, - }; -+#endif /* CONFIG_SCHED_MUQSS */ -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index db3a57675ccf..1f11cefe8d20 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -2,6 +2,19 @@ - /* - * Scheduler internal types and methods: - */ -+#ifdef CONFIG_SCHED_MUQSS -+#include "MuQSS.h" -+ -+/* Begin compatibility wrappers for MuQSS/CFS differences */ -+#define rq_rt_nr_running(rq) ((rq)->rt_nr_running) -+#define rq_h_nr_running(rq) ((rq)->nr_running) -+ -+#else /* CONFIG_SCHED_MUQSS */ -+ -+#define rq_rt_nr_running(rq) ((rq)->rt.rt_nr_running) -+#define rq_h_nr_running(rq) ((rq)->cfs.h_nr_running) -+ -+ - #include - - #include -@@ -2546,3 +2559,25 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) - - void swake_up_all_locked(struct swait_queue_head *q); - void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); -+ -+/* MuQSS compatibility functions */ -+#ifdef CONFIG_64BIT -+static inline u64 read_sum_exec_runtime(struct task_struct *t) -+{ -+ return t->se.sum_exec_runtime; -+} -+#else -+static inline u64 read_sum_exec_runtime(struct task_struct *t) -+{ -+ u64 ns; -+ struct rq_flags rf; -+ struct rq *rq; -+ -+ rq = task_rq_lock(t, &rf); -+ ns = t->se.sum_exec_runtime; -+ task_rq_unlock(rq, t, &rf); -+ -+ return ns; -+} -+#endif -+#endif /* CONFIG_SCHED_MUQSS */ -diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c -index 8344757bba6e..d819af35a770 100644 ---- a/kernel/sched/topology.c -+++ b/kernel/sched/topology.c -@@ -450,7 +450,11 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) - struct root_domain *old_rd = NULL; - unsigned long flags; - -+#ifdef CONFIG_SCHED_MUQSS -+ raw_spin_lock_irqsave(rq->lock, flags); -+#else - raw_spin_lock_irqsave(&rq->lock, flags); -+#endif - - if (rq->rd) { - old_rd = rq->rd; -@@ -476,7 +480,11 @@ void 
rq_attach_root(struct rq *rq, struct root_domain *rd) - if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) - set_rq_online(rq); - -+#ifdef CONFIG_SCHED_MUQSS -+ raw_spin_unlock_irqrestore(rq->lock, flags); -+#else - raw_spin_unlock_irqrestore(&rq->lock, flags); -+#endif - - if (old_rd) - call_rcu(&old_rd->rcu, free_rootdomain); -diff --git a/kernel/skip_list.c b/kernel/skip_list.c -new file mode 100644 -index 000000000000..bf5c6e97e139 ---- /dev/null -+++ b/kernel/skip_list.c -@@ -0,0 +1,148 @@ -+/* -+ Copyright (C) 2011,2016 Con Kolivas. -+ -+ Code based on example originally by William Pugh. -+ -+Skip Lists are a probabilistic alternative to balanced trees, as -+described in the June 1990 issue of CACM and were invented by -+William Pugh in 1987. -+ -+A couple of comments about this implementation: -+The routine randomLevel has been hard-coded to generate random -+levels using p=0.25. It can be easily changed. -+ -+The insertion routine has been implemented so as to use the -+dirty hack described in the CACM paper: if a random level is -+generated that is more than the current maximum level, the -+current maximum level plus one is used instead. -+ -+Levels start at zero and go up to MaxLevel (which is equal to -+MaxNumberOfLevels-1). -+ -+The routines defined in this file are: -+ -+init: defines slnode -+ -+new_skiplist: returns a new, empty list -+ -+randomLevel: Returns a random level based on a u64 random seed passed to it. -+In MuQSS, the "niffy" time is used for this purpose. -+ -+insert(l,key, value): inserts the binding (key, value) into l. This operation -+occurs in O(log n) time. -+ -+delnode(slnode, l, node): deletes any binding of key from the l based on the -+actual node value. This operation occurs in O(k) time where k is the -+number of levels of the node in question (max 8). The original delete -+function occurred in O(log n) time and involved a search. 
-+ -+MuQSS Notes: In this implementation of skiplists, there are bidirectional -+next/prev pointers and the insert function returns a pointer to the actual -+node the value is stored. The key here is chosen by the scheduler so as to -+sort tasks according to the priority list requirements and is no longer used -+by the scheduler after insertion. The scheduler lookup, however, occurs in -+O(1) time because it is always the first item in the level 0 linked list. -+Since the task struct stores a copy of the node pointer upon skiplist_insert, -+it can also remove it much faster than the original implementation with the -+aid of prev<->next pointer manipulation and no searching. -+ -+*/ -+ -+#include -+#include -+ -+#define MaxNumberOfLevels 8 -+#define MaxLevel (MaxNumberOfLevels - 1) -+ -+void skiplist_init(skiplist_node *slnode) -+{ -+ int i; -+ -+ slnode->key = 0xFFFFFFFFFFFFFFFF; -+ slnode->level = 0; -+ slnode->value = NULL; -+ for (i = 0; i < MaxNumberOfLevels; i++) -+ slnode->next[i] = slnode->prev[i] = slnode; -+} -+ -+skiplist *new_skiplist(skiplist_node *slnode) -+{ -+ skiplist *l = kzalloc(sizeof(skiplist), GFP_ATOMIC); -+ -+ BUG_ON(!l); -+ l->header = slnode; -+ return l; -+} -+ -+void free_skiplist(skiplist *l) -+{ -+ skiplist_node *p, *q; -+ -+ p = l->header; -+ do { -+ q = p->next[0]; -+ p->next[0]->prev[0] = q->prev[0]; -+ skiplist_node_init(p); -+ p = q; -+ } while (p != l->header); -+ kfree(l); -+} -+ -+void skiplist_node_init(skiplist_node *node) -+{ -+ memset(node, 0, sizeof(skiplist_node)); -+} -+ -+static inline unsigned int randomLevel(const long unsigned int randseed) -+{ -+ return find_first_bit(&randseed, MaxLevel) / 2; -+} -+ -+void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed) -+{ -+ skiplist_node *update[MaxNumberOfLevels]; -+ skiplist_node *p, *q; -+ int k = l->level; -+ -+ p = l->header; -+ do { -+ while (q = p->next[k], q->key <= key) -+ p = q; -+ update[k] = p; -+ } while (--k >= 
0); -+ -+ ++l->entries; -+ k = randomLevel(randseed); -+ if (k > l->level) { -+ k = ++l->level; -+ update[k] = l->header; -+ } -+ -+ node->level = k; -+ node->key = key; -+ node->value = value; -+ do { -+ p = update[k]; -+ node->next[k] = p->next[k]; -+ p->next[k] = node; -+ node->prev[k] = p; -+ node->next[k]->prev[k] = node; -+ } while (--k >= 0); -+} -+ -+void skiplist_delete(skiplist *l, skiplist_node *node) -+{ -+ int k, m = node->level; -+ -+ for (k = 0; k <= m; k++) { -+ node->prev[k]->next[k] = node->next[k]; -+ node->next[k]->prev[k] = node->prev[k]; -+ } -+ skiplist_node_init(node); -+ if (m == l->level) { -+ while (l->header->next[m] == l->header && l->header->prev[m] == l->header && m > 0) -+ m--; -+ l->level = m; -+ } -+ l->entries--; -+} -diff --git a/kernel/sysctl.c b/kernel/sysctl.c -index 8a176d8727a3..808473f947ee 100644 ---- a/kernel/sysctl.c -+++ b/kernel/sysctl.c -@@ -130,9 +130,19 @@ static int __maybe_unused four = 4; - static unsigned long zero_ul; - static unsigned long one_ul = 1; - static unsigned long long_max = LONG_MAX; --static int one_hundred = 100; --static int one_thousand = 1000; --#ifdef CONFIG_PRINTK -+static int __read_mostly one_hundred = 100; -+static int __read_mostly one_thousand = 1000; -+static int zero = 0; -+static int one = 1; -+#ifdef CONFIG_SCHED_MUQSS -+extern int rr_interval; -+extern int sched_interactive; -+extern int sched_iso_cpu; -+extern int sched_yield_type; -+#endif -+extern int hrtimer_granularity_us; -+extern int hrtimeout_min_us; -+#if defined(CONFIG_PRINTK) || defined(CONFIG_SCHED_MUQSS) - static int ten_thousand = 10000; - #endif - #ifdef CONFIG_PERF_EVENTS -@@ -288,7 +298,7 @@ static struct ctl_table sysctl_base_table[] = { - { } - }; - --#ifdef CONFIG_SCHED_DEBUG -+#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_MUQSS) - static int min_sched_granularity_ns = 100000; /* 100 usecs */ - static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ - static int min_wakeup_granularity_ns; /* 
0 usecs */ -@@ -305,6 +315,7 @@ static int max_extfrag_threshold = 1000; - #endif - - static struct ctl_table kern_table[] = { -+#ifndef CONFIG_SCHED_MUQSS - { - .procname = "sched_child_runs_first", - .data = &sysctl_sched_child_runs_first, -@@ -486,6 +497,7 @@ static struct ctl_table kern_table[] = { - .extra2 = SYSCTL_ONE, - }, - #endif -+#endif /* !CONFIG_SCHED_MUQSS */ - #ifdef CONFIG_PROVE_LOCKING - { - .procname = "prove_locking", -@@ -1049,6 +1061,62 @@ static struct ctl_table kern_table[] = { - .proc_handler = proc_dointvec, - }, - #endif -+#ifdef CONFIG_SCHED_MUQSS -+ { -+ .procname = "rr_interval", -+ .data = &rr_interval, -+ .maxlen = sizeof (int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = &one, -+ .extra2 = &one_thousand, -+ }, -+ { -+ .procname = "interactive", -+ .data = &sched_interactive, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = &zero, -+ .extra2 = &one, -+ }, -+ { -+ .procname = "iso_cpu", -+ .data = &sched_iso_cpu, -+ .maxlen = sizeof (int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = &zero, -+ .extra2 = &one_hundred, -+ }, -+ { -+ .procname = "yield_type", -+ .data = &sched_yield_type, -+ .maxlen = sizeof (int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = &zero, -+ .extra2 = &two, -+ }, -+#endif -+ { -+ .procname = "hrtimer_granularity_us", -+ .data = &hrtimer_granularity_us, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = &one, -+ .extra2 = &ten_thousand, -+ }, -+ { -+ .procname = "hrtimeout_min_us", -+ .data = &hrtimeout_min_us, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = &one, -+ .extra2 = &ten_thousand, -+ }, - #if defined(CONFIG_S390) && defined(CONFIG_SMP) - { - .procname = "spin_retry", -diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig -index fcc42353f125..2960cace6719 100644 ---- 
a/kernel/time/Kconfig -+++ b/kernel/time/Kconfig -@@ -66,6 +66,9 @@ config NO_HZ_COMMON - depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS - select TICK_ONESHOT - -+config NO_HZ_FULL -+ bool -+ - choice - prompt "Timer tick handling" - default NO_HZ_IDLE if NO_HZ -@@ -87,8 +90,9 @@ config NO_HZ_IDLE - - Most of the time you want to say Y here. - --config NO_HZ_FULL -+config NO_HZ_FULL_NODEF - bool "Full dynticks system (tickless)" -+ select NO_HZ_FULL - # NO_HZ_COMMON dependency - depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS - # We need at least one periodic CPU for timekeeping -@@ -114,6 +118,8 @@ config NO_HZ_FULL - transitions: syscalls, exceptions and interrupts. Even when it's - dynamically off. - -+ Not recommended for desktops,laptops, or mobile devices. -+ - Say N. - - endchoice -@@ -123,7 +129,7 @@ config CONTEXT_TRACKING - - config CONTEXT_TRACKING_FORCE - bool "Force context tracking" -- depends on CONTEXT_TRACKING -+ depends on CONTEXT_TRACKING && !SCHED_MUQSS - default y if !NO_HZ_FULL - help - The major pre-requirement for full dynticks to work is to -diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c -index f5490222e134..544c58c29267 100644 ---- a/kernel/time/clockevents.c -+++ b/kernel/time/clockevents.c -@@ -190,8 +190,9 @@ int clockevents_tick_resume(struct clock_event_device *dev) - - #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST - --/* Limit min_delta to a jiffie */ --#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) -+int __read_mostly hrtimer_granularity_us = 100; -+/* Limit min_delta to 100us */ -+#define MIN_DELTA_LIMIT (hrtimer_granularity_us * NSEC_PER_USEC) - - /** - * clockevents_increase_min_delta - raise minimum delta of a clock event device -diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c -index d89da1c7e005..e4f5b4c483a0 100644 ---- a/kernel/time/hrtimer.c -+++ b/kernel/time/hrtimer.c -@@ -2216,3 +2216,113 @@ int __sched schedule_hrtimeout(ktime_t *expires, - return 
schedule_hrtimeout_range(expires, 0, mode); - } - EXPORT_SYMBOL_GPL(schedule_hrtimeout); -+ -+/* -+ * As per schedule_hrtimeout but taskes a millisecond value and returns how -+ * many milliseconds are left. -+ */ -+long __sched schedule_msec_hrtimeout(long timeout) -+{ -+ struct hrtimer_sleeper t; -+ int delta, jiffs; -+ ktime_t expires; -+ -+ if (!timeout) { -+ __set_current_state(TASK_RUNNING); -+ return 0; -+ } -+ -+ jiffs = msecs_to_jiffies(timeout); -+ /* -+ * If regular timer resolution is adequate or hrtimer resolution is not -+ * (yet) better than Hz, as would occur during startup, use regular -+ * timers. -+ */ -+ if (jiffs > 4 || hrtimer_resolution >= NSEC_PER_SEC / HZ || pm_freezing) -+ return schedule_timeout(jiffs); -+ -+ delta = (timeout % 1000) * NSEC_PER_MSEC; -+ expires = ktime_set(0, delta); -+ -+ hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); -+ hrtimer_set_expires_range_ns(&t.timer, expires, delta); -+ -+ hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_REL); -+ -+ if (likely(t.task)) -+ schedule(); -+ -+ hrtimer_cancel(&t.timer); -+ destroy_hrtimer_on_stack(&t.timer); -+ -+ __set_current_state(TASK_RUNNING); -+ -+ expires = hrtimer_expires_remaining(&t.timer); -+ timeout = ktime_to_ms(expires); -+ return timeout < 0 ? 
0 : timeout; -+} -+ -+EXPORT_SYMBOL(schedule_msec_hrtimeout); -+ -+#define USECS_PER_SEC 1000000 -+extern int hrtimer_granularity_us; -+ -+static inline long schedule_usec_hrtimeout(long timeout) -+{ -+ struct hrtimer_sleeper t; -+ ktime_t expires; -+ int delta; -+ -+ if (!timeout) { -+ __set_current_state(TASK_RUNNING); -+ return 0; -+ } -+ -+ if (hrtimer_resolution >= NSEC_PER_SEC / HZ) -+ return schedule_timeout(usecs_to_jiffies(timeout)); -+ -+ if (timeout < hrtimer_granularity_us) -+ timeout = hrtimer_granularity_us; -+ delta = (timeout % USECS_PER_SEC) * NSEC_PER_USEC; -+ expires = ktime_set(0, delta); -+ -+ hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); -+ hrtimer_set_expires_range_ns(&t.timer, expires, delta); -+ -+ hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_REL); -+ -+ if (likely(t.task)) -+ schedule(); -+ -+ hrtimer_cancel(&t.timer); -+ destroy_hrtimer_on_stack(&t.timer); -+ -+ __set_current_state(TASK_RUNNING); -+ -+ expires = hrtimer_expires_remaining(&t.timer); -+ timeout = ktime_to_us(expires); -+ return timeout < 0 ? 
0 : timeout; -+} -+ -+int __read_mostly hrtimeout_min_us = 500; -+ -+long __sched schedule_min_hrtimeout(void) -+{ -+ return usecs_to_jiffies(schedule_usec_hrtimeout(hrtimeout_min_us)); -+} -+ -+EXPORT_SYMBOL(schedule_min_hrtimeout); -+ -+long __sched schedule_msec_hrtimeout_interruptible(long timeout) -+{ -+ __set_current_state(TASK_INTERRUPTIBLE); -+ return schedule_msec_hrtimeout(timeout); -+} -+EXPORT_SYMBOL(schedule_msec_hrtimeout_interruptible); -+ -+long __sched schedule_msec_hrtimeout_uninterruptible(long timeout) -+{ -+ __set_current_state(TASK_UNINTERRUPTIBLE); -+ return schedule_msec_hrtimeout(timeout); -+} -+EXPORT_SYMBOL(schedule_msec_hrtimeout_uninterruptible); -diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c -index 2fd3b3fa68bf..1202d7fe5d8e 100644 ---- a/kernel/time/posix-cpu-timers.c -+++ b/kernel/time/posix-cpu-timers.c -@@ -236,7 +236,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples) - u64 stime, utime; - - task_cputime(p, &utime, &stime); -- store_samples(samples, stime, utime, p->se.sum_exec_runtime); -+ store_samples(samples, stime, utime, tsk_seruntime(p)); - } - - static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, -@@ -855,7 +855,7 @@ static void check_thread_timers(struct task_struct *tsk, - soft = task_rlimit(tsk, RLIMIT_RTTIME); - if (soft != RLIM_INFINITY) { - /* Task RT timeout is accounted in jiffies. RTTIME is usec */ -- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); -+ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ); - unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); - - /* At the hard limit, send SIGKILL. No further action. 
*/ -diff --git a/kernel/time/timer.c b/kernel/time/timer.c -index a5221abb4594..9a9287cb2a37 100644 ---- a/kernel/time/timer.c -+++ b/kernel/time/timer.c -@@ -43,6 +43,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -1568,7 +1569,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) - * Check, if the next hrtimer event is before the next timer wheel - * event: - */ --static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) -+static u64 cmp_next_hrtimer_event(struct timer_base *base, u64 basem, u64 expires) - { - u64 nextevt = hrtimer_get_next_event(); - -@@ -1586,6 +1587,9 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) - if (nextevt <= basem) - return basem; - -+ if (nextevt < expires && nextevt - basem <= TICK_NSEC) -+ base->is_idle = false; -+ - /* - * Round up to the next jiffie. High resolution timers are - * off, so the hrtimers are expired in the tick and we need to -@@ -1655,7 +1659,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) - } - raw_spin_unlock(&base->lock); - -- return cmp_next_hrtimer_event(basem, expires); -+ return cmp_next_hrtimer_event(base, basem, expires); - } - - /** -@@ -1892,6 +1896,18 @@ signed long __sched schedule_timeout(signed long timeout) - - expire = timeout + jiffies; - -+#ifdef CONFIG_HIGH_RES_TIMERS -+ if (timeout == 1 && hrtimer_resolution < NSEC_PER_SEC / HZ) { -+ /* -+ * Special case 1 as being a request for the minimum timeout -+ * and use highres timers to timeout after 1ms to workaround -+ * the granularity of low Hz tick timers. 
-+ */ -+ if (!schedule_min_hrtimeout()) -+ return 0; -+ goto out_timeout; -+ } -+#endif - timer.task = current; - timer_setup_on_stack(&timer.timer, process_timeout, 0); - __mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING); -@@ -1900,10 +1916,10 @@ signed long __sched schedule_timeout(signed long timeout) - - /* Remove the timer from the object tracker */ - destroy_timer_on_stack(&timer.timer); -- -+out_timeout: - timeout = expire - jiffies; - -- out: -+out: - return timeout < 0 ? 0 : timeout; - } - EXPORT_SYMBOL(schedule_timeout); -@@ -2045,7 +2061,19 @@ void __init init_timers(void) - */ - void msleep(unsigned int msecs) - { -- unsigned long timeout = msecs_to_jiffies(msecs) + 1; -+ int jiffs = msecs_to_jiffies(msecs); -+ unsigned long timeout; -+ -+ /* -+ * Use high resolution timers where the resolution of tick based -+ * timers is inadequate. -+ */ -+ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ && !pm_freezing) { -+ while (msecs) -+ msecs = schedule_msec_hrtimeout_uninterruptible(msecs); -+ return; -+ } -+ timeout = jiffs + 1; - - while (timeout) - timeout = schedule_timeout_uninterruptible(timeout); -@@ -2059,7 +2087,15 @@ EXPORT_SYMBOL(msleep); - */ - unsigned long msleep_interruptible(unsigned int msecs) - { -- unsigned long timeout = msecs_to_jiffies(msecs) + 1; -+ int jiffs = msecs_to_jiffies(msecs); -+ unsigned long timeout; -+ -+ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ && !pm_freezing) { -+ while (msecs && !signal_pending(current)) -+ msecs = schedule_msec_hrtimeout_interruptible(msecs); -+ return msecs; -+ } -+ timeout = jiffs + 1; - - while (timeout && !signal_pending(current)) - timeout = schedule_timeout_interruptible(timeout); -diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c -index b5e3496cf803..68930e7f4d28 100644 ---- a/kernel/trace/trace_selftest.c -+++ b/kernel/trace/trace_selftest.c -@@ -1048,10 +1048,15 @@ static int trace_wakeup_test_thread(void *data) - { - /* Make this a -deadline 
thread */ - static const struct sched_attr attr = { -+#ifdef CONFIG_SCHED_MUQSS -+ /* No deadline on MuQSS, use RR */ -+ .sched_policy = SCHED_RR, -+#else - .sched_policy = SCHED_DEADLINE, - .sched_runtime = 100000ULL, - .sched_deadline = 10000000ULL, - .sched_period = 10000000ULL -+#endif - }; - struct wakeup_test_data *x = data; - -diff --git a/mm/vmscan.c b/mm/vmscan.c -index a37c87b5aee2..7b1d19e17af9 100644 ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -163,7 +163,7 @@ struct scan_control { - /* - * From 0 .. 100. Higher means more swappy. - */ --int vm_swappiness = 60; -+int vm_swappiness = 33; - /* - * The total number of pages which are beyond the high watermark within all - * zones. -diff --git a/net/core/pktgen.c b/net/core/pktgen.c -index 08e2811b5274..955fcfdd3c3c 100644 ---- a/net/core/pktgen.c -+++ b/net/core/pktgen.c -@@ -1894,7 +1894,7 @@ static void pktgen_mark_device(const struct pktgen_net *pn, const char *ifname) - mutex_unlock(&pktgen_thread_lock); - pr_debug("%s: waiting for %s to disappear....\n", - __func__, ifname); -- schedule_timeout_interruptible(msecs_to_jiffies(msec_per_try)); -+ schedule_msec_hrtimeout_interruptible((msec_per_try)); - mutex_lock(&pktgen_thread_lock); - - if (++i >= max_tries) { -diff --git a/sound/pci/maestro3.c b/sound/pci/maestro3.c -index 40232a278b1a..d87fae1113aa 100644 ---- a/sound/pci/maestro3.c -+++ b/sound/pci/maestro3.c -@@ -1995,7 +1995,7 @@ static void snd_m3_ac97_reset(struct snd_m3 *chip) - outw(0, io + GPIO_DATA); - outw(dir | GPO_PRIMARY_AC97, io + GPIO_DIRECTION); - -- schedule_timeout_uninterruptible(msecs_to_jiffies(delay1)); -+ schedule_msec_hrtimeout_uninterruptible((delay1)); - - outw(GPO_PRIMARY_AC97, io + GPIO_DATA); - udelay(5); -@@ -2003,7 +2003,7 @@ static void snd_m3_ac97_reset(struct snd_m3 *chip) - outw(IO_SRAM_ENABLE | SERIAL_AC_LINK_ENABLE, io + RING_BUS_CTRL_A); - outw(~0, io + GPIO_MASK); - -- schedule_timeout_uninterruptible(msecs_to_jiffies(delay2)); -+ 
schedule_msec_hrtimeout_uninterruptible((delay2)); - - if (! snd_m3_try_read_vendor(chip)) - break; -diff --git a/sound/soc/codecs/rt5631.c b/sound/soc/codecs/rt5631.c -index f70b9f7e68bb..77b65398ca07 100644 ---- a/sound/soc/codecs/rt5631.c -+++ b/sound/soc/codecs/rt5631.c -@@ -415,7 +415,7 @@ static void onebit_depop_mute_stage(struct snd_soc_component *component, int ena - hp_zc = snd_soc_component_read32(component, RT5631_INT_ST_IRQ_CTRL_2); - snd_soc_component_write(component, RT5631_INT_ST_IRQ_CTRL_2, hp_zc & 0xf7ff); - if (enable) { -- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout_uninterruptible((10)); - /* config one-bit depop parameter */ - rt5631_write_index(component, RT5631_SPK_INTL_CTRL, 0x307f); - snd_soc_component_update_bits(component, RT5631_HP_OUT_VOL, -@@ -525,7 +525,7 @@ static void depop_seq_mute_stage(struct snd_soc_component *component, int enable - hp_zc = snd_soc_component_read32(component, RT5631_INT_ST_IRQ_CTRL_2); - snd_soc_component_write(component, RT5631_INT_ST_IRQ_CTRL_2, hp_zc & 0xf7ff); - if (enable) { -- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout_uninterruptible((10)); - - /* config depop sequence parameter */ - rt5631_write_index(component, RT5631_SPK_INTL_CTRL, 0x302f); -diff --git a/sound/soc/codecs/wm8350.c b/sound/soc/codecs/wm8350.c -index fe99584c917f..f1344d532a13 100644 ---- a/sound/soc/codecs/wm8350.c -+++ b/sound/soc/codecs/wm8350.c -@@ -233,10 +233,10 @@ static void wm8350_pga_work(struct work_struct *work) - out2->ramp == WM8350_RAMP_UP) { - /* delay is longer over 0dB as increases are larger */ - if (i >= WM8350_OUTn_0dB) -- schedule_timeout_interruptible(msecs_to_jiffies -+ schedule_msec_hrtimeout_interruptible( - (2)); - else -- schedule_timeout_interruptible(msecs_to_jiffies -+ schedule_msec_hrtimeout_interruptible( - (1)); - } else - udelay(50); /* doesn't matter if we delay longer */ -@@ -1120,7 +1120,7 @@ static int 
wm8350_set_bias_level(struct snd_soc_component *component, - (platform->dis_out4 << 6)); - - /* wait for discharge */ -- schedule_timeout_interruptible(msecs_to_jiffies -+ schedule_msec_hrtimeout_interruptible( - (platform-> - cap_discharge_msecs)); - -@@ -1136,7 +1136,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, - WM8350_VBUFEN); - - /* wait for vmid */ -- schedule_timeout_interruptible(msecs_to_jiffies -+ schedule_msec_hrtimeout_interruptible( - (platform-> - vmid_charge_msecs)); - -@@ -1187,7 +1187,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, - wm8350_reg_write(wm8350, WM8350_POWER_MGMT_1, pm1); - - /* wait */ -- schedule_timeout_interruptible(msecs_to_jiffies -+ schedule_msec_hrtimeout_interruptible( - (platform-> - vmid_discharge_msecs)); - -@@ -1205,7 +1205,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, - pm1 | WM8350_OUTPUT_DRAIN_EN); - - /* wait */ -- schedule_timeout_interruptible(msecs_to_jiffies -+ schedule_msec_hrtimeout_interruptible( - (platform->drain_msecs)); - - pm1 &= ~WM8350_BIASEN; -diff --git a/sound/soc/codecs/wm8900.c b/sound/soc/codecs/wm8900.c -index 271235a69c01..3ec90e1b1eb4 100644 ---- a/sound/soc/codecs/wm8900.c -+++ b/sound/soc/codecs/wm8900.c -@@ -1109,7 +1109,7 @@ static int wm8900_set_bias_level(struct snd_soc_component *component, - /* Need to let things settle before stopping the clock - * to ensure that restart works, see "Stopping the - * master clock" in the datasheet. 
*/ -- schedule_timeout_interruptible(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout_interruptible(1); - snd_soc_component_write(component, WM8900_REG_POWER2, - WM8900_REG_POWER2_SYSCLK_ENA); - break; -diff --git a/sound/soc/codecs/wm9713.c b/sound/soc/codecs/wm9713.c -index 6497c1ea6228..08fefeca9d82 100644 ---- a/sound/soc/codecs/wm9713.c -+++ b/sound/soc/codecs/wm9713.c -@@ -199,7 +199,7 @@ static int wm9713_voice_shutdown(struct snd_soc_dapm_widget *w, - - /* Gracefully shut down the voice interface. */ - snd_soc_component_update_bits(component, AC97_HANDSET_RATE, 0x0f00, 0x0200); -- schedule_timeout_interruptible(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout_interruptible(1); - snd_soc_component_update_bits(component, AC97_HANDSET_RATE, 0x0f00, 0x0f00); - snd_soc_component_update_bits(component, AC97_EXTENDED_MID, 0x1000, 0x1000); - -@@ -868,7 +868,7 @@ static int wm9713_set_pll(struct snd_soc_component *component, - wm9713->pll_in = freq_in; - - /* wait 10ms AC97 link frames for the link to stabilise */ -- schedule_timeout_interruptible(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout_interruptible((10)); - return 0; - } - -diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c -index e2632841b321..7a445c1a2167 100644 ---- a/sound/soc/soc-dapm.c -+++ b/sound/soc/soc-dapm.c -@@ -154,7 +154,7 @@ static void dapm_assert_locked(struct snd_soc_dapm_context *dapm) - static void pop_wait(u32 pop_time) - { - if (pop_time) -- schedule_timeout_uninterruptible(msecs_to_jiffies(pop_time)); -+ schedule_msec_hrtimeout_uninterruptible((pop_time)); - } - - __printf(3, 4) -diff --git a/sound/usb/line6/pcm.c b/sound/usb/line6/pcm.c -index fdbdfb7bce92..fa8e8faf3eb3 100644 ---- a/sound/usb/line6/pcm.c -+++ b/sound/usb/line6/pcm.c -@@ -127,7 +127,7 @@ static void line6_wait_clear_audio_urbs(struct snd_line6_pcm *line6pcm, - if (!alive) - break; - set_current_state(TASK_UNINTERRUPTIBLE); -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - } while (--timeout > 0); - if 
(alive) - dev_err(line6pcm->line6->ifcdev, -diff --git a/kernel/cpu.c b/kernel/cpu.c -index 244d305443773..90b77028233b0 100644 ---- a/kernel/cpu.c -+++ b/kernel/cpu.c -@@ -1565,7 +1565,11 @@ static struct cpuhp_step cpuhp_hp_states[] = { - [CPUHP_BRINGUP_CPU] = { - .name = "cpu:bringup", - .startup.single = bringup_cpu, -+#ifdef CONFIG_SCHED_MUQSS -+ .teardown.single = NULL, -+#else - .teardown.single = finish_cpu, -+#endif - .cant_stop = true, - }, - /* Final state before CPU kills itself */ diff --git a/linux57-tkg/linux57-tkg-patches/0004-glitched-muqss.patch b/linux57-tkg/linux57-tkg-patches/0004-glitched-muqss.patch deleted file mode 100644 index 2c4837e..0000000 --- a/linux57-tkg/linux57-tkg-patches/0004-glitched-muqss.patch +++ /dev/null @@ -1,78 +0,0 @@ -From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 -From: Tk-Glitch -Date: Wed, 4 Jul 2018 04:30:08 +0200 -Subject: glitched - MuQSS - -diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c -index 84a1d08d68551..57c3036a68952 100644 ---- a/kernel/sched/MuQSS.c -+++ b/kernel/sched/MuQSS.c -@@ -163,7 +167,11 @@ int sched_interactive __read_mostly = 1; - * are allowed to run five seconds as real time tasks. This is the total over - * all online cpus. - */ -+#ifdef CONFIG_ZENIFY -+int sched_iso_cpu __read_mostly = 25; -+#else - int sched_iso_cpu __read_mostly = 70; -+#endif - - /* - * sched_yield_type - Choose what sort of yield sched_yield will perform. - -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 2a202a846757..1d9c7ed79b11 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -5,7 +5,7 @@ - choice - prompt "Timer frequency" - default HZ_100 if SCHED_MUQSS -- default HZ_250_NODEF if !SCHED_MUQSS -+ default HZ_500_NODEF if !SCHED_MUQSS - help - Allows the configuration of the timer frequency. 
It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -50,6 +50,20 @@ choice - on SMP and NUMA systems and exactly dividing by both PAL and - NTSC frame rates for video and multimedia work. - -+ config HZ_500_NODEF -+ bool "500 HZ" -+ help -+ 500 Hz is a good timer frequency for desktops. Provides fast -+ interactivity with great smoothness without sacrificing too -+ much throughput. -+ -+ config HZ_750_NODEF -+ bool "750 HZ" -+ help -+ 750 Hz is a good timer frequency for desktops. Provides fast -+ interactivity with great smoothness without sacrificing too -+ much throughput. -+ - config HZ_1000_NODEF - bool "1000 HZ" - help -@@ -63,6 +70,8 @@ config HZ - default 100 if HZ_100 - default 250 if HZ_250_NODEF - default 300 if HZ_300_NODEF -+ default 500 if HZ_500_NODEF -+ default 750 if HZ_750_NODEF - default 1000 if HZ_1000_NODEF - - config SCHED_HRTICK - -diff --git a/Makefile b/Makefile -index d4d36c61940b..4a9dfe471f1f 100644 ---- a/Makefile -+++ b/Makefile -@@ -15,7 +15,6 @@ NAME = Kleptomaniac Octopus - - CKVERSION = -ck1 - CKNAME = MuQSS Powered --EXTRAVERSION := $(EXTRAVERSION)$(CKVERSION) - - # We are using a recursive build, so we need to do a little thinking - # to get the ordering right. 
diff --git a/linux57-tkg/linux57-tkg-patches/0004-glitched-ondemand-muqss.patch b/linux57-tkg/linux57-tkg-patches/0004-glitched-ondemand-muqss.patch deleted file mode 100644 index 02933e4..0000000 --- a/linux57-tkg/linux57-tkg-patches/0004-glitched-ondemand-muqss.patch +++ /dev/null @@ -1,18 +0,0 @@ -diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c -index 6b423eebfd5d..61e3271675d6 100644 ---- a/drivers/cpufreq/cpufreq_ondemand.c -+++ b/drivers/cpufreq/cpufreq_ondemand.c -@@ -21,10 +21,10 @@ - #include "cpufreq_ondemand.h" - - /* On-demand governor macros */ --#define DEF_FREQUENCY_UP_THRESHOLD (80) --#define DEF_SAMPLING_DOWN_FACTOR (1) -+#define DEF_FREQUENCY_UP_THRESHOLD (45) -+#define DEF_SAMPLING_DOWN_FACTOR (5) - #define MAX_SAMPLING_DOWN_FACTOR (100000) --#define MICRO_FREQUENCY_UP_THRESHOLD (95) -+#define MICRO_FREQUENCY_UP_THRESHOLD (45) - #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) - #define MIN_FREQUENCY_UP_THRESHOLD (1) - #define MAX_FREQUENCY_UP_THRESHOLD (100) diff --git a/linux57-tkg/linux57-tkg-patches/0005-glitched-ondemand-pds.patch b/linux57-tkg/linux57-tkg-patches/0005-glitched-ondemand-pds.patch deleted file mode 100644 index c1929e8..0000000 --- a/linux57-tkg/linux57-tkg-patches/0005-glitched-ondemand-pds.patch +++ /dev/null @@ -1,18 +0,0 @@ -diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c -index 6b423eebfd5d..61e3271675d6 100644 ---- a/drivers/cpufreq/cpufreq_ondemand.c -+++ b/drivers/cpufreq/cpufreq_ondemand.c -@@ -21,10 +21,10 @@ - #include "cpufreq_ondemand.h" - - /* On-demand governor macros */ --#define DEF_FREQUENCY_UP_THRESHOLD (63) --#define DEF_SAMPLING_DOWN_FACTOR (1) -+#define DEF_FREQUENCY_UP_THRESHOLD (55) -+#define DEF_SAMPLING_DOWN_FACTOR (5) - #define MAX_SAMPLING_DOWN_FACTOR (100000) --#define MICRO_FREQUENCY_UP_THRESHOLD (95) -+#define MICRO_FREQUENCY_UP_THRESHOLD (63) - #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) - #define 
MIN_FREQUENCY_UP_THRESHOLD (1) - #define MAX_FREQUENCY_UP_THRESHOLD (100) diff --git a/linux57-tkg/linux57-tkg-patches/0005-glitched-pds.patch b/linux57-tkg/linux57-tkg-patches/0005-glitched-pds.patch deleted file mode 100644 index 23271f5..0000000 --- a/linux57-tkg/linux57-tkg-patches/0005-glitched-pds.patch +++ /dev/null @@ -1,166 +0,0 @@ -From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 -From: Tk-Glitch -Date: Wed, 4 Jul 2018 04:30:08 +0200 -Subject: glitched - PDS - -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 2a202a846757..1d9c7ed79b11 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -4,7 +4,7 @@ - - choice - prompt "Timer frequency" -- default HZ_250 -+ default HZ_500 - help - Allows the configuration of the timer frequency. It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -39,6 +39,13 @@ choice - on SMP and NUMA systems and exactly dividing by both PAL and - NTSC frame rates for video and multimedia work. - -+ config HZ_500 -+ bool "500 HZ" -+ help -+ 500 Hz is a balanced timer frequency. Provides fast interactivity -+ on desktops with great smoothness without increasing CPU power -+ consumption and sacrificing the battery life on laptops. -+ - config HZ_1000 - bool "1000 HZ" - help -@@ -52,6 +59,7 @@ config HZ - default 100 if HZ_100 - default 250 if HZ_250 - default 300 if HZ_300 -+ default 500 if HZ_500 - default 1000 if HZ_1000 - - config SCHED_HRTICK - -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 2a202a846757..1d9c7ed79b11 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -4,7 +4,7 @@ - - choice - prompt "Timer frequency" -- default HZ_500 -+ default HZ_750 - help - Allows the configuration of the timer frequency. 
It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -46,6 +46,13 @@ choice - on desktops with great smoothness without increasing CPU power - consumption and sacrificing the battery life on laptops. - -+ config HZ_750 -+ bool "750 HZ" -+ help -+ 750 Hz is a good timer frequency for desktops. Provides fast -+ interactivity with great smoothness without sacrificing too -+ much throughput. -+ - config HZ_1000 - bool "1000 HZ" - help -@@ -60,6 +67,7 @@ config HZ - default 250 if HZ_250 - default 300 if HZ_300 - default 500 if HZ_500 -+ default 750 if HZ_750 - default 1000 if HZ_1000 - - config SCHED_HRTICK - -diff --git a/mm/vmscan.c b/mm/vmscan.c -index 9270a4370d54..30d01e647417 100644 ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -159,7 +159,7 @@ struct scan_control { - /* - * From 0 .. 100. Higher means more swappy. - */ --int vm_swappiness = 60; -+int vm_swappiness = 20; - /* - * The total number of pages which are beyond the high watermark within all - * zones. - -diff --git a/init/Kconfig b/init/Kconfig -index 11fd9b502d06..e9bc34d3019b 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -715,6 +715,7 @@ menu "Scheduler features" - config UCLAMP_TASK - bool "Enable utilization clamping for RT/FAIR tasks" - depends on CPU_FREQ_GOV_SCHEDUTIL -+ depends on !SCHED_PDS - help - This feature enables the scheduler to track the clamped utilization - of each CPU based on RUNNABLE tasks scheduled on that CPU. -@@ -948,7 +948,6 @@ config CGROUP_DEVICE - - config CGROUP_CPUACCT - bool "Simple CPU accounting controller" -- depends on !SCHED_PDS - help - Provides a simple controller for monitoring the - total CPU consumed by the tasks in a cgroup. 
-diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile -index b23231bae996..cab4e5c5b38e 100644 ---- a/kernel/sched/Makefile -+++ b/kernel/sched/Makefile -@@ -24,13 +24,13 @@ obj-y += fair.o rt.o deadline.o - obj-$(CONFIG_SMP) += cpudeadline.o topology.o stop_task.o - obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o - obj-$(CONFIG_SCHED_DEBUG) += debug.o --obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o - endif - obj-y += loadavg.o clock.o cputime.o - obj-y += idle.o - obj-y += wait.o wait_bit.o swait.o completion.o - obj-$(CONFIG_SMP) += cpupri.o pelt.o - obj-$(CONFIG_SCHEDSTATS) += stats.o -+obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o - obj-$(CONFIG_CPU_FREQ) += cpufreq.o - obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o - obj-$(CONFIG_MEMBARRIER) += membarrier.o - -diff --git a/kernel/sched/pds.c b/kernel/sched/pds.c -index 9281ad164..f09a609cf 100644 ---- a/kernel/sched/pds.c -+++ b/kernel/sched/pds.c -@@ -81,6 +81,18 @@ enum { - NR_CPU_AFFINITY_CHK_LEVEL - }; - -+/* -+ * This allows printing both to /proc/sched_debug and -+ * to the console -+ */ -+#define SEQ_printf(m, x...) 
\ -+ do { \ -+ if (m) \ -+ seq_printf(m, x); \ -+ else \ -+ pr_cont(x); \ -+ } while (0) -+ - static inline void print_scheduler_version(void) - { - printk(KERN_INFO "pds: PDS-mq CPU Scheduler 0.99o by Alfred Chen.\n"); -@@ -6353,7 +6365,10 @@ void ia64_set_curr_task(int cpu, struct task_struct *p) - #ifdef CONFIG_SCHED_DEBUG - void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, - struct seq_file *m) --{} -+{ -+ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), -+ get_nr_threads(p)); -+} - - void proc_sched_set_task(struct task_struct *p) - {} diff --git a/linux57-tkg/linux57-tkg-patches/0005-v5.7_undead-pds099o.patch b/linux57-tkg/linux57-tkg-patches/0005-v5.7_undead-pds099o.patch deleted file mode 100644 index 59c8d8d..0000000 --- a/linux57-tkg/linux57-tkg-patches/0005-v5.7_undead-pds099o.patch +++ /dev/null @@ -1,8400 +0,0 @@ -From 68f1a9541ef3185b1021e8e54d2712c7039418d7 Mon Sep 17 00:00:00 2001 -From: Tk-Glitch -Date: Tue, 2 Jun 2020 18:55:09 +0200 -Subject: PDS 099o, 5.7 rebase (release/v2) - - -diff --git a/Documentation/scheduler/sched-PDS-mq.txt b/Documentation/scheduler/sched-PDS-mq.txt -new file mode 100644 -index 000000000000..709e86f6487e ---- /dev/null -+++ b/Documentation/scheduler/sched-PDS-mq.txt -@@ -0,0 +1,56 @@ -+ Priority and Deadline based Skiplist multiple queue Scheduler -+ ------------------------------------------------------------- -+ -+CONTENT -+======== -+ -+ 0. Development -+ 1. Overview -+ 1.1 Design goal -+ 1.2 Design summary -+ 2. Design Detail -+ 2.1 Skip list implementation -+ 2.2 Task preempt -+ 2.3 Task policy, priority and deadline -+ 2.4 Task selection -+ 2.5 Run queue balance -+ 2.6 Task migration -+ -+ -+0. Development -+============== -+ -+Priority and Deadline based Skiplist multiple queue scheduler, referred to as -+PDS from here on, is developed upon the enhancement patchset VRQ(Variable Run -+Queue) for BFS(Brain Fuck Scheduler by Con Kolivas). 
PDS inherits the existing -+design from VRQ and inspired by the introduction of skiplist data structure -+to the scheduler by Con Kolivas. However, PDS is different from MuQSS(Multiple -+Queue Skiplist Scheduler, the successor after BFS) in many ways. -+ -+1. Overview -+=========== -+ -+1.1 Design goal -+--------------- -+ -+PDS is designed to make the cpu process scheduler code to be simple, but while -+efficiency and scalable. Be Simple, the scheduler code will be easy to be read -+and the behavious of scheduler will be easy to predict. Be efficiency, the -+scheduler shall be well balance the thoughput performance and task interactivity -+at the same time for different properties the tasks behave. Be scalable, the -+performance of the scheduler should be in good shape with the glowing of -+workload or with the growing of the cpu numbers. -+ -+1.2 Design summary -+------------------ -+ -+PDS is described as a multiple run queues cpu scheduler. Each cpu has its own -+run queue. A heavry customized skiplist is used as the backend data structure -+of the cpu run queue. Tasks in run queue is sorted by priority then virtual -+deadline(simplfy to just deadline from here on). In PDS, balance action among -+run queues are kept as less as possible to reduce the migration cost. Cpumask -+data structure is widely used in cpu affinity checking and cpu preemption/ -+selection to make PDS scalable with increasing cpu number. -+ -+ -+To be continued... -diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c -index f18d5067cd0f..fe489fc01c73 100644 ---- a/arch/powerpc/platforms/cell/spufs/sched.c -+++ b/arch/powerpc/platforms/cell/spufs/sched.c -@@ -51,11 +51,6 @@ static struct task_struct *spusched_task; - static struct timer_list spusched_timer; - static struct timer_list spuloadavg_timer; - --/* -- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0). 
-- */ --#define NORMAL_PRIO 120 -- - /* - * Frequency of the spu scheduler tick. By default we do one SPU scheduler - * tick for every 10 CPU scheduler ticks. -diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig -index 2d3f963fd6f1..5f41ead019b1 100644 ---- a/arch/x86/Kconfig -+++ b/arch/x86/Kconfig -@@ -1006,6 +1006,22 @@ config NR_CPUS - config SCHED_SMT - def_bool y if SMP - -+config SMT_NICE -+ bool "SMT (Hyperthreading) aware nice priority and policy support" -+ depends on SCHED_PDS && SCHED_SMT -+ default y -+ ---help--- -+ Enabling Hyperthreading on Intel CPUs decreases the effectiveness -+ of the use of 'nice' levels and different scheduling policies -+ (e.g. realtime) due to sharing of CPU power between hyperthreads. -+ SMT nice support makes each logical CPU aware of what is running on -+ its hyperthread siblings, maintaining appropriate distribution of -+ CPU according to nice levels and scheduling policies at the expense -+ of slightly increased overhead. -+ -+ If unsure say Y here. 
-+ -+ - config SCHED_MC - def_bool y - prompt "Multi-core scheduler support" -diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c -index 737ff3b9c2c0..b5bc5a1b6de7 100644 ---- a/drivers/cpufreq/cpufreq_conservative.c -+++ b/drivers/cpufreq/cpufreq_conservative.c -@@ -28,8 +28,8 @@ struct cs_dbs_tuners { - }; - - /* Conservative governor macros */ --#define DEF_FREQUENCY_UP_THRESHOLD (80) --#define DEF_FREQUENCY_DOWN_THRESHOLD (20) -+#define DEF_FREQUENCY_UP_THRESHOLD (63) -+#define DEF_FREQUENCY_DOWN_THRESHOLD (26) - #define DEF_FREQUENCY_STEP (5) - #define DEF_SAMPLING_DOWN_FACTOR (1) - #define MAX_SAMPLING_DOWN_FACTOR (10) -diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c -index 82a4d37ddecb..1130e0f5db72 100644 ---- a/drivers/cpufreq/cpufreq_ondemand.c -+++ b/drivers/cpufreq/cpufreq_ondemand.c -@@ -18,7 +18,7 @@ - #include "cpufreq_ondemand.h" - - /* On-demand governor macros */ --#define DEF_FREQUENCY_UP_THRESHOLD (80) -+#define DEF_FREQUENCY_UP_THRESHOLD (63) - #define DEF_SAMPLING_DOWN_FACTOR (1) - #define MAX_SAMPLING_DOWN_FACTOR (100000) - #define MICRO_FREQUENCY_UP_THRESHOLD (95) -@@ -127,7 +127,7 @@ static void dbs_freq_increase(struct cpufreq_policy *policy, unsigned int freq) - } - - /* -- * Every sampling_rate, we check, if current idle time is less than 20% -+ * Every sampling_rate, we check, if current idle time is less than 37% - * (default), then we try to increase frequency. Else, we adjust the frequency - * proportional to load. 
- */ -diff --git a/fs/proc/base.c b/fs/proc/base.c -index eb2255e95f62..62b8cedbccb6 100644 ---- a/fs/proc/base.c -+++ b/fs/proc/base.c -@@ -479,7 +479,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, - seq_puts(m, "0 0 0\n"); - else - seq_printf(m, "%llu %llu %lu\n", -- (unsigned long long)task->se.sum_exec_runtime, -+ (unsigned long long)tsk_seruntime(task), - (unsigned long long)task->sched_info.run_delay, - task->sched_info.pcount); - -diff --git a/include/linux/init_task.h b/include/linux/init_task.h -index 2c620d7ac432..1a7987c40c80 100644 ---- a/include/linux/init_task.h -+++ b/include/linux/init_task.h -@@ -36,7 +36,11 @@ extern struct cred init_cred; - #define INIT_PREV_CPUTIME(x) - #endif - -+#ifdef CONFIG_SCHED_PDS -+#define INIT_TASK_COMM "PDS" -+#else - #define INIT_TASK_COMM "swapper" -+#endif /* !CONFIG_SCHED_PDS */ - - /* Attach to the init_task data structure for proper alignment */ - #ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK -diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h -index fed6ba96c527..f03a5ee419a1 100644 ---- a/include/linux/jiffies.h -+++ b/include/linux/jiffies.h -@@ -169,7 +169,7 @@ static inline u64 get_jiffies_64(void) - * Have the 32 bit jiffies value wrap 5 minutes after boot - * so jiffies wrap bugs show up earlier. 
- */ --#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ)) -+#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-10*HZ)) - - /* - * Change timeval to jiffies, trying to avoid the -diff --git a/include/linux/sched.h b/include/linux/sched.h -index 4418f5cb8324..2b51afac5b06 100644 ---- a/include/linux/sched.h -+++ b/include/linux/sched.h -@@ -31,6 +31,7 @@ - #include - #include - #include -+#include - - /* task_struct member predeclarations (sorted alphabetically): */ - struct audit_context; -@@ -652,9 +653,13 @@ struct task_struct { - unsigned int flags; - unsigned int ptrace; - --#ifdef CONFIG_SMP -+#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_PDS) - struct llist_node wake_entry; -+#endif -+#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_PDS) - int on_cpu; -+#endif -+#ifdef CONFIG_SMP - #ifdef CONFIG_THREAD_INFO_IN_TASK - /* Current CPU: */ - unsigned int cpu; -@@ -663,6 +668,7 @@ struct task_struct { - unsigned long wakee_flip_decay_ts; - struct task_struct *last_wakee; - -+#ifndef CONFIG_SCHED_PDS - /* - * recent_used_cpu is initially set as the last CPU used by a task - * that wakes affine another task. Waker/wakee relationships can -@@ -671,6 +677,7 @@ struct task_struct { - * used CPU that may be idle. 
- */ - int recent_used_cpu; -+#endif /* CONFIG_SCHED_PDS */ - int wake_cpu; - #endif - int on_rq; -@@ -680,13 +687,27 @@ struct task_struct { - int normal_prio; - unsigned int rt_priority; - -+#ifdef CONFIG_SCHED_PDS -+ int time_slice; -+ u64 deadline; -+ /* skip list level */ -+ int sl_level; -+ /* skip list node */ -+ struct skiplist_node sl_node; -+ /* 8bits prio and 56bits deadline for quick processing */ -+ u64 priodl; -+ u64 last_ran; -+ /* sched_clock time spent running */ -+ u64 sched_time; -+#else /* CONFIG_SCHED_PDS */ - const struct sched_class *sched_class; - struct sched_entity se; - struct sched_rt_entity rt; -+ struct sched_dl_entity dl; -+#endif - #ifdef CONFIG_CGROUP_SCHED - struct task_group *sched_task_group; - #endif -- struct sched_dl_entity dl; - - #ifdef CONFIG_UCLAMP_TASK - /* Clamp values requested for a scheduling entity */ -@@ -1306,6 +1327,29 @@ struct task_struct { - */ - }; - -+#ifdef CONFIG_SCHED_PDS -+void cpu_scaling(int cpu); -+void cpu_nonscaling(int cpu); -+#define tsk_seruntime(t) ((t)->sched_time) -+/* replace the uncertian rt_timeout with 0UL */ -+#define tsk_rttimeout(t) (0UL) -+ -+#define task_running_idle(p) ((p)->prio == IDLE_PRIO) -+#else /* CFS */ -+extern int runqueue_is_locked(int cpu); -+static inline void cpu_scaling(int cpu) -+{ -+} -+ -+static inline void cpu_nonscaling(int cpu) -+{ -+} -+#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) -+#define tsk_rttimeout(t) ((t)->rt.timeout) -+ -+#define iso_task(p) (false) -+#endif /* CONFIG_SCHED_PDS */ -+ - static inline struct pid *task_pid(struct task_struct *task) - { - return task->thread_pid; -diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h -index 1aff00b65f3c..a5e5fc2c9170 100644 ---- a/include/linux/sched/deadline.h -+++ b/include/linux/sched/deadline.h -@@ -1,5 +1,22 @@ - /* SPDX-License-Identifier: GPL-2.0 */ - -+#ifdef CONFIG_SCHED_PDS -+ -+#define __tsk_deadline(p) ((p)->deadline) -+ -+static inline int dl_prio(int prio) -+{ -+ 
return 1; -+} -+ -+static inline int dl_task(struct task_struct *p) -+{ -+ return 1; -+} -+#else -+ -+#define __tsk_deadline(p) ((p)->dl.deadline) -+ - /* - * SCHED_DEADLINE tasks has negative priorities, reflecting - * the fact that any of them has higher prio than RT and -@@ -19,6 +36,7 @@ static inline int dl_task(struct task_struct *p) - { - return dl_prio(p->prio); - } -+#endif /* CONFIG_SCHED_PDS */ - - static inline bool dl_time_before(u64 a, u64 b) - { -diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h -index 7d64feafc408..fba04bb91492 100644 ---- a/include/linux/sched/prio.h -+++ b/include/linux/sched/prio.h -@@ -20,7 +20,18 @@ - */ - - #define MAX_USER_RT_PRIO 100 -+ -+#ifdef CONFIG_SCHED_PDS -+#define ISO_PRIO (MAX_USER_RT_PRIO) -+ -+#define MAX_RT_PRIO ((MAX_USER_RT_PRIO) + 1) -+ -+#define NORMAL_PRIO (MAX_RT_PRIO) -+#define IDLE_PRIO ((MAX_RT_PRIO) + 1) -+#define PRIO_LIMIT ((IDLE_PRIO) + 1) -+#else /* !CONFIG_SCHED_PDS */ - #define MAX_RT_PRIO MAX_USER_RT_PRIO -+#endif /* CONFIG_SCHED_PDS */ - - #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) - #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) -diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h -index e5af028c08b4..a96012e6f15e 100644 ---- a/include/linux/sched/rt.h -+++ b/include/linux/sched/rt.h -@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) - - if (policy == SCHED_FIFO || policy == SCHED_RR) - return true; -+#ifndef CONFIG_SCHED_PDS - if (policy == SCHED_DEADLINE) - return true; -+#endif - return false; - } - -diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h -index 38359071236a..90328ccd527f 100644 ---- a/include/linux/sched/task.h -+++ b/include/linux/sched/task.h -@@ -106,7 +106,7 @@ extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); - extern void free_task(struct task_struct *tsk); - - /* sched_exec is called by processes performing an exec */ --#ifdef CONFIG_SMP -+#if defined(CONFIG_SMP) && 
!defined(CONFIG_SCHED_PDS) - extern void sched_exec(void); - #else - #define sched_exec() {} -diff --git a/include/linux/skip_list.h b/include/linux/skip_list.h -new file mode 100644 -index 000000000000..713fedd8034f ---- /dev/null -+++ b/include/linux/skip_list.h -@@ -0,0 +1,177 @@ -+/* -+ Copyright (C) 2016 Alfred Chen. -+ -+ Code based on Con Kolivas's skip list implementation for BFS, and -+ which is based on example originally by William Pugh. -+ -+Skip Lists are a probabilistic alternative to balanced trees, as -+described in the June 1990 issue of CACM and were invented by -+William Pugh in 1987. -+ -+A couple of comments about this implementation: -+ -+This file only provides a infrastructure of skip list. -+ -+skiplist_node is embedded into container data structure, to get rid the -+dependency of kmalloc/kfree operation in scheduler code. -+ -+A customized search function should be defined using DEFINE_SKIPLIST_INSERT -+macro and be used for skip list insert operation. -+ -+Random Level is also not defined in this file, instead, it should be customized -+implemented and set to node->level then pass to the customized skiplist_insert -+function. -+ -+Levels start at zero and go up to (NUM_SKIPLIST_LEVEL -1) -+ -+NUM_SKIPLIST_LEVEL in this implementation is 8 instead of origin 16, -+considering that there will be 256 entries to enable the top level when using -+random level p=0.5, and that number is more than enough for a run queue usage -+in a scheduler usage. And it also help to reduce the memory usage of the -+embedded skip list node in task_struct to about 50%. -+ -+The insertion routine has been implemented so as to use the -+dirty hack described in the CACM paper: if a random level is -+generated that is more than the current maximum level, the -+current maximum level plus one is used instead. 
-+ -+BFS Notes: In this implementation of skiplists, there are bidirectional -+next/prev pointers and the insert function returns a pointer to the actual -+node the value is stored. The key here is chosen by the scheduler so as to -+sort tasks according to the priority list requirements and is no longer used -+by the scheduler after insertion. The scheduler lookup, however, occurs in -+O(1) time because it is always the first item in the level 0 linked list. -+Since the task struct stores a copy of the node pointer upon skiplist_insert, -+it can also remove it much faster than the original implementation with the -+aid of prev<->next pointer manipulation and no searching. -+*/ -+#ifndef _LINUX_SKIP_LIST_H -+#define _LINUX_SKIP_LIST_H -+ -+#include -+ -+#define NUM_SKIPLIST_LEVEL (8) -+ -+struct skiplist_node { -+ int level; /* Levels in this node */ -+ struct skiplist_node *next[NUM_SKIPLIST_LEVEL]; -+ struct skiplist_node *prev[NUM_SKIPLIST_LEVEL]; -+}; -+ -+#define SKIPLIST_NODE_INIT(name) { 0,\ -+ {&name, &name, &name, &name,\ -+ &name, &name, &name, &name},\ -+ {&name, &name, &name, &name,\ -+ &name, &name, &name, &name},\ -+ } -+ -+static inline void INIT_SKIPLIST_NODE(struct skiplist_node *node) -+{ -+ /* only level 0 ->next matters in skiplist_empty()*/ -+ WRITE_ONCE(node->next[0], node); -+} -+ -+/** -+ * FULL_INIT_SKIPLIST_NODE -- fully init a skiplist_node, expecially for header -+ * @node: the skip list node to be inited. -+ */ -+static inline void FULL_INIT_SKIPLIST_NODE(struct skiplist_node *node) -+{ -+ int i; -+ -+ node->level = 0; -+ for (i = 0; i < NUM_SKIPLIST_LEVEL; i++) { -+ WRITE_ONCE(node->next[i], node); -+ node->prev[i] = node; -+ } -+} -+ -+/** -+ * skiplist_empty - test whether a skip list is empty -+ * @head: the skip list to test. 
-+ */ -+static inline int skiplist_empty(const struct skiplist_node *head) -+{ -+ return READ_ONCE(head->next[0]) == head; -+} -+ -+/** -+ * skiplist_entry - get the struct for this entry -+ * @ptr: the &struct skiplist_node pointer. -+ * @type: the type of the struct this is embedded in. -+ * @member: the name of the skiplist_node within the struct. -+ */ -+#define skiplist_entry(ptr, type, member) \ -+ container_of(ptr, type, member) -+ -+/** -+ * DEFINE_SKIPLIST_INSERT_FUNC -- macro to define a customized skip list insert -+ * function, which takes two parameters, first one is the header node of the -+ * skip list, second one is the skip list node to be inserted -+ * @func_name: the customized skip list insert function name -+ * @search_func: the search function to be used, which takes two parameters, -+ * 1st one is the itrator of skiplist_node in the list, the 2nd is the skip list -+ * node to be inserted, the function should return true if search should be -+ * continued, otherwise return false. 
-+ * Returns 1 if @node is inserted as the first item of skip list at level zero, -+ * otherwise 0 -+ */ -+#define DEFINE_SKIPLIST_INSERT_FUNC(func_name, search_func)\ -+static inline int func_name(struct skiplist_node *head, struct skiplist_node *node)\ -+{\ -+ struct skiplist_node *update[NUM_SKIPLIST_LEVEL];\ -+ struct skiplist_node *p, *q;\ -+ int k = head->level;\ -+\ -+ p = head;\ -+ do {\ -+ while (q = p->next[k], q != head && search_func(q, node))\ -+ p = q;\ -+ update[k] = p;\ -+ } while (--k >= 0);\ -+\ -+ k = node->level;\ -+ if (unlikely(k > head->level)) {\ -+ node->level = k = ++head->level;\ -+ update[k] = head;\ -+ }\ -+\ -+ do {\ -+ p = update[k];\ -+ q = p->next[k];\ -+ node->next[k] = q;\ -+ p->next[k] = node;\ -+ node->prev[k] = p;\ -+ q->prev[k] = node;\ -+ } while (--k >= 0);\ -+\ -+ return (p == head);\ -+} -+ -+/** -+ * skiplist_del_init -- delete skip list node from a skip list and reset it's -+ * init state -+ * @head: the header node of the skip list to be deleted from. -+ * @node: the skip list node to be deleted, the caller need to ensure @node is -+ * in skip list which @head represent. 
-+ * Returns 1 if @node is the first item of skip level at level zero, otherwise 0 -+ */ -+static inline int -+skiplist_del_init(struct skiplist_node *head, struct skiplist_node *node) -+{ -+ int l, m = node->level; -+ -+ for (l = 0; l <= m; l++) { -+ node->prev[l]->next[l] = node->next[l]; -+ node->next[l]->prev[l] = node->prev[l]; -+ } -+ if (m == head->level && m > 0) { -+ while (head->next[m] == head && m > 0) -+ m--; -+ head->level = m; -+ } -+ INIT_SKIPLIST_NODE(node); -+ -+ return (node->prev[0] == head); -+} -+#endif /* _LINUX_SKIP_LIST_H */ -diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h -index 3bac0a8ceab2..d6d384ddb57d 100644 ---- a/include/uapi/linux/sched.h -+++ b/include/uapi/linux/sched.h -@@ -115,7 +115,10 @@ struct clone_args { - #define SCHED_FIFO 1 - #define SCHED_RR 2 - #define SCHED_BATCH 3 --/* SCHED_ISO: reserved but not implemented yet */ -+/* SCHED_ISO: Implemented in BFS/MuQSSPDS only */ -+ -+#define SCHED_ISO 4 -+ - #define SCHED_IDLE 5 - #define SCHED_DEADLINE 6 - -diff --git a/init/Kconfig b/init/Kconfig -index 74a5ac65644f..e4fd406b58dd 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -61,6 +61,21 @@ config THREAD_INFO_IN_TASK - - menu "General setup" - -+config SCHED_PDS -+ bool "PDS-mq cpu scheduler" -+ help -+ The Priority and Deadline based Skip list multiple queue CPU -+ Scheduler for excellent interactivity and responsiveness on the -+ desktop and solid scalability on normal hardware and commodity -+ servers. -+ -+ Currently incompatible with the Group CPU scheduler, and RCU TORTURE -+ TEST so these options are disabled. -+ -+ Say Y here. -+ default y -+ -+ - config BROKEN - bool - -@@ -777,6 +792,7 @@ config NUMA_BALANCING - depends on ARCH_SUPPORTS_NUMA_BALANCING - depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY - depends on SMP && NUMA && MIGRATION -+ depends on !SCHED_PDS - help - This option adds support for automatic NUMA aware memory/task placement. 
- The mechanism is quite primitive and is based on migrating memory when -@@ -878,7 +894,7 @@ menuconfig CGROUP_SCHED - bandwidth allocation to such task groups. It uses cgroups to group - tasks. - --if CGROUP_SCHED -+if CGROUP_SCHED && !SCHED_PDS - config FAIR_GROUP_SCHED - bool "Group scheduling for SCHED_OTHER" - depends on CGROUP_SCHED -@@ -1007,6 +1023,7 @@ config CGROUP_DEVICE - - config CGROUP_CPUACCT - bool "Simple CPU accounting controller" -+ depends on !SCHED_PDS - help - Provides a simple controller for monitoring the - total CPU consumed by the tasks in a cgroup. -@@ -1134,6 +1151,7 @@ config CHECKPOINT_RESTORE - - config SCHED_AUTOGROUP - bool "Automatic process group scheduling" -+ depends on !SCHED_PDS - select CGROUPS - select CGROUP_SCHED - select FAIR_GROUP_SCHED -diff --git a/init/init_task.c b/init/init_task.c -index bd403ed3e418..162d3deddd45 100644 ---- a/init/init_task.c -+++ b/init/init_task.c -@@ -59,6 +59,126 @@ struct task_struct init_task - __init_task_data - #endif - = { -+#ifdef CONFIG_SCHED_PDS -+#ifdef CONFIG_THREAD_INFO_IN_TASK -+ .thread_info = INIT_THREAD_INFO(init_task), -+ .stack_refcount = ATOMIC_INIT(1), -+#endif -+ .state = 0, -+ .stack = init_stack, -+ .usage = ATOMIC_INIT(2), -+ .flags = PF_KTHREAD, -+ .prio = NORMAL_PRIO, -+ .static_prio = MAX_PRIO - 20, -+ .normal_prio = NORMAL_PRIO, -+ .deadline = 0, /* PDS only */ -+ .policy = SCHED_NORMAL, -+ .cpus_ptr = &init_task.cpus_mask, -+ .cpus_mask = CPU_MASK_ALL, -+ .nr_cpus_allowed= NR_CPUS, -+ .mm = NULL, -+ .active_mm = &init_mm, -+ .restart_block = { -+ .fn = do_no_restart_syscall, -+ }, -+ .sl_level = 0, /* PDS only */ -+ .sl_node = SKIPLIST_NODE_INIT(init_task.sl_node), /* PDS only */ -+ .time_slice = HZ, /* PDS only */ -+ .tasks = LIST_HEAD_INIT(init_task.tasks), -+#ifdef CONFIG_SMP -+ .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), -+#endif -+#ifdef CONFIG_CGROUP_SCHED -+ .sched_task_group = &root_task_group, -+#endif -+ .ptraced = 
LIST_HEAD_INIT(init_task.ptraced), -+ .ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry), -+ .real_parent = &init_task, -+ .parent = &init_task, -+ .children = LIST_HEAD_INIT(init_task.children), -+ .sibling = LIST_HEAD_INIT(init_task.sibling), -+ .group_leader = &init_task, -+ RCU_POINTER_INITIALIZER(real_cred, &init_cred), -+ RCU_POINTER_INITIALIZER(cred, &init_cred), -+ .comm = INIT_TASK_COMM, -+ .thread = INIT_THREAD, -+ .fs = &init_fs, -+ .files = &init_files, -+ .signal = &init_signals, -+ .sighand = &init_sighand, -+ .nsproxy = &init_nsproxy, -+ .pending = { -+ .list = LIST_HEAD_INIT(init_task.pending.list), -+ .signal = {{0}} -+ }, -+ .blocked = {{0}}, -+ .alloc_lock = __SPIN_LOCK_UNLOCKED(init_task.alloc_lock), -+ .journal_info = NULL, -+ INIT_CPU_TIMERS(init_task) -+ .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock), -+ .timer_slack_ns = 50000, /* 50 usec default slack */ -+ .thread_pid = &init_struct_pid, -+ .thread_group = LIST_HEAD_INIT(init_task.thread_group), -+ .thread_node = LIST_HEAD_INIT(init_signals.thread_head), -+#ifdef CONFIG_AUDITSYSCALL -+ .loginuid = INVALID_UID, -+ .sessionid = AUDIT_SID_UNSET, -+#endif -+#ifdef CONFIG_PERF_EVENTS -+ .perf_event_mutex = __MUTEX_INITIALIZER(init_task.perf_event_mutex), -+ .perf_event_list = LIST_HEAD_INIT(init_task.perf_event_list), -+#endif -+#ifdef CONFIG_PREEMPT_RCU -+ .rcu_read_lock_nesting = 0, -+ .rcu_read_unlock_special.s = 0, -+ .rcu_node_entry = LIST_HEAD_INIT(init_task.rcu_node_entry), -+ .rcu_blocked_node = NULL, -+#endif -+#ifdef CONFIG_TASKS_RCU -+ .rcu_tasks_holdout = false, -+ .rcu_tasks_holdout_list = LIST_HEAD_INIT(init_task.rcu_tasks_holdout_list), -+ .rcu_tasks_idle_cpu = -1, -+#endif -+#ifdef CONFIG_CPUSETS -+ .mems_allowed_seq = SEQCNT_ZERO(init_task.mems_allowed_seq), -+#endif -+#ifdef CONFIG_RT_MUTEXES -+ .pi_waiters = RB_ROOT_CACHED, -+ .pi_top_task = NULL, -+#endif -+ INIT_PREV_CPUTIME(init_task) -+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -+ .vtime.seqcount = 
SEQCNT_ZERO(init_task.vtime_seqcount), -+ .vtime.starttime = 0, -+ .vtime.state = VTIME_SYS, -+#endif -+#ifdef CONFIG_NUMA_BALANCING -+ .numa_preferred_nid = -1, -+ .numa_group = NULL, -+ .numa_faults = NULL, -+#endif -+#ifdef CONFIG_KASAN -+ .kasan_depth = 1, -+#endif -+#ifdef CONFIG_TRACE_IRQFLAGS -+ .softirqs_enabled = 1, -+#endif -+#ifdef CONFIG_LOCKDEP -+ .lockdep_recursion = 0, -+#endif -+#ifdef CONFIG_FUNCTION_GRAPH_TRACER -+ .ret_stack = NULL, -+#endif -+#if defined(CONFIG_TRACING) && defined(CONFIG_PREEMPT) -+ .trace_recursion = 0, -+#endif -+#ifdef CONFIG_LIVEPATCH -+ .patch_state = KLP_UNDEFINED, -+#endif -+#ifdef CONFIG_SECURITY -+ .security = NULL, -+#endif -+#else /* CONFIG_SCHED_PDS */ - #ifdef CONFIG_THREAD_INFO_IN_TASK - .thread_info = INIT_THREAD_INFO(init_task), - .stack_refcount = REFCOUNT_INIT(1), -@@ -182,6 +302,7 @@ struct task_struct init_task - #ifdef CONFIG_SECURITY - .security = NULL, - #endif -+#endif /* CONFIG_SCHED_PDS */ - }; - EXPORT_SYMBOL(init_task); - -diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c -index 729d3a5c772e..10a7c52b90d5 100644 ---- a/kernel/cgroup/cpuset.c -+++ b/kernel/cgroup/cpuset.c -@@ -636,7 +636,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) - return ret; - } - --#ifdef CONFIG_SMP -+#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_PDS) - /* - * Helper routine for generate_sched_domains(). - * Do cpusets a, b have overlapping effective cpus_allowed masks? 
-@@ -1009,7 +1009,7 @@ static void rebuild_sched_domains_locked(void) - /* Have scheduler rebuild the domains */ - partition_and_rebuild_sched_domains(ndoms, doms, attr); - } --#else /* !CONFIG_SMP */ -+#else /* !CONFIG_SMP || CONFIG_SCHED_PDS */ - static void rebuild_sched_domains_locked(void) - { - } -diff --git a/kernel/delayacct.c b/kernel/delayacct.c -index 27725754ac99..769d773c7182 100644 ---- a/kernel/delayacct.c -+++ b/kernel/delayacct.c -@@ -106,7 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) - */ - t1 = tsk->sched_info.pcount; - t2 = tsk->sched_info.run_delay; -- t3 = tsk->se.sum_exec_runtime; -+ t3 = tsk_seruntime(tsk); - - d->cpu_count += t1; - -diff --git a/kernel/exit.c b/kernel/exit.c -index ce2a75bc0ade..f0f864bc1ab9 100644 ---- a/kernel/exit.c -+++ b/kernel/exit.c -@@ -122,7 +122,7 @@ static void __exit_signal(struct task_struct *tsk) - sig->curr_target = next_thread(tsk); - } - -- add_device_randomness((const void*) &tsk->se.sum_exec_runtime, -+ add_device_randomness((const void*) &tsk_seruntime(tsk), - sizeof(unsigned long long)); - - /* -@@ -143,7 +143,7 @@ static void __exit_signal(struct task_struct *tsk) - sig->inblock += task_io_get_inblock(tsk); - sig->oublock += task_io_get_oublock(tsk); - task_io_accounting_add(&sig->ioac, &tsk->ioac); -- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; -+ sig->sum_sched_runtime += tsk_seruntime(tsk); - sig->nr_threads--; - __unhash_process(tsk, group_dead); - write_sequnlock(&sig->stats_lock); -diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c -index f6310f848f34..b5de980c7d4e 100644 ---- a/kernel/livepatch/transition.c -+++ b/kernel/livepatch/transition.c -@@ -306,7 +306,11 @@ static bool klp_try_switch_task(struct task_struct *task) - */ - rq = task_rq_lock(task, &flags); - -+#ifdef CONFIG_SCHED_PDS -+ if (task_running(task) && task != current) { -+#else - if (task_running(rq, task) && task != current) { -+#endif - snprintf(err_buf, 
STACK_ERR_BUF_SIZE, - "%s: %s:%d is running\n", __func__, task->comm, - task->pid); -diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c -index c9f090d64f00..063d15a1ab8b 100644 ---- a/kernel/locking/rtmutex.c -+++ b/kernel/locking/rtmutex.c -@@ -229,7 +229,7 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, - * Only use with rt_mutex_waiter_{less,equal}() - */ - #define task_to_waiter(p) \ -- &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline } -+ &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = __tsk_deadline(p) } - - static inline int - rt_mutex_waiter_less(struct rt_mutex_waiter *left, -@@ -680,7 +680,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, - * the values of the node being removed. - */ - waiter->prio = task->prio; -- waiter->deadline = task->dl.deadline; -+ waiter->deadline = __tsk_deadline(task); - - rt_mutex_enqueue(lock, waiter); - -@@ -953,7 +953,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, - waiter->task = task; - waiter->lock = lock; - waiter->prio = task->prio; -- waiter->deadline = task->dl.deadline; -+ waiter->deadline = __tsk_deadline(task); - - /* Get the top priority waiter on the lock */ - if (rt_mutex_has_waiters(lock)) -diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile -index 21fb5a5662b5..8ebe4e33fb5f 100644 ---- a/kernel/sched/Makefile -+++ b/kernel/sched/Makefile -@@ -16,15 +16,21 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) - CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer - endif - --obj-y += core.o loadavg.o clock.o cputime.o --obj-y += idle.o fair.o rt.o deadline.o --obj-y += wait.o wait_bit.o swait.o completion.o -- --obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o -+ifdef CONFIG_SCHED_PDS -+obj-y += pds.o -+else -+obj-y += core.o -+obj-y += fair.o rt.o deadline.o -+obj-$(CONFIG_SMP) += cpudeadline.o topology.o stop_task.o - obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o 
--obj-$(CONFIG_SCHEDSTATS) += stats.o - obj-$(CONFIG_SCHED_DEBUG) += debug.o - obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o -+endif -+obj-y += loadavg.o clock.o cputime.o -+obj-y += idle.o -+obj-y += wait.o wait_bit.o swait.o completion.o -+obj-$(CONFIG_SMP) += cpupri.o pelt.o -+obj-$(CONFIG_SCHEDSTATS) += stats.o - obj-$(CONFIG_CPU_FREQ) += cpufreq.o - obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o - obj-$(CONFIG_MEMBARRIER) += membarrier.o -diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c -index 7fbaee24c824..28377ad56248 100644 ---- a/kernel/sched/cpufreq_schedutil.c -+++ b/kernel/sched/cpufreq_schedutil.c -@@ -183,6 +183,7 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, - return cpufreq_driver_resolve_freq(policy, freq); - } - -+#ifndef CONFIG_SCHED_PDS - /* - * This function computes an effective utilization for the given CPU, to be - * used for frequency selection given the linear relation: f = u * f_max. -@@ -300,6 +301,13 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) - - return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL); - } -+#else /* CONFIG_SCHED_PDS */ -+static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) -+{ -+ sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu); -+ return sg_cpu->max; -+} -+#endif - - /** - * sugov_iowait_reset() - Reset the IO boost status of a CPU. 
-@@ -443,7 +451,9 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } - */ - static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) - { -+#ifndef CONFIG_SCHED_PDS - if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) -+#endif - sg_policy->limits_changed = true; - } - -@@ -686,6 +696,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) - } - - ret = sched_setattr_nocheck(thread, &attr); -+ - if (ret) { - kthread_stop(thread); - pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__); -@@ -916,6 +927,7 @@ static int __init sugov_register(void) - core_initcall(sugov_register); - - #ifdef CONFIG_ENERGY_MODEL -+#ifndef CONFIG_SCHED_PDS - extern bool sched_energy_update; - extern struct mutex sched_energy_mutex; - -@@ -946,4 +958,10 @@ void sched_cpufreq_governor_change(struct cpufreq_policy *policy, - } - - } -+#else /* CONFIG_SCHED_PDS */ -+void sched_cpufreq_governor_change(struct cpufreq_policy *policy, -+ struct cpufreq_governor *old_gov) -+{ -+} -+#endif - #endif -diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c -index ff9435dee1df..1377ea3d1b76 100644 ---- a/kernel/sched/cputime.c -+++ b/kernel/sched/cputime.c -@@ -122,7 +122,12 @@ void account_user_time(struct task_struct *p, u64 cputime) - p->utime += cputime; - account_group_user_time(p, cputime); - -+#ifdef CONFIG_SCHED_PDS -+ index = (task_nice(p) > 0 || task_running_idle(p)) ? CPUTIME_NICE : -+ CPUTIME_USER; -+#else - index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; -+#endif - - /* Add user time to cpustat. */ - task_group_account_field(p, index, cputime); -@@ -146,7 +151,11 @@ void account_guest_time(struct task_struct *p, u64 cputime) - p->gtime += cputime; - - /* Add guest time to cpustat. 
*/ -+#ifdef CONFIG_SCHED_PDS -+ if (task_nice(p) > 0 || task_running_idle(p)) { -+#else - if (task_nice(p) > 0) { -+#endif - cpustat[CPUTIME_NICE] += cputime; - cpustat[CPUTIME_GUEST_NICE] += cputime; - } else { -@@ -269,7 +278,7 @@ static inline u64 account_other_time(u64 max) - #ifdef CONFIG_64BIT - static inline u64 read_sum_exec_runtime(struct task_struct *t) - { -- return t->se.sum_exec_runtime; -+ return tsk_seruntime(t); - } - #else - static u64 read_sum_exec_runtime(struct task_struct *t) -@@ -279,7 +288,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t) - struct rq *rq; - - rq = task_rq_lock(t, &rf); -- ns = t->se.sum_exec_runtime; -+ ns = tsk_seruntime(t); - task_rq_unlock(rq, t, &rf); - - return ns; -@@ -658,7 +667,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, - void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) - { - struct task_cputime cputime = { -- .sum_exec_runtime = p->se.sum_exec_runtime, -+ .sum_exec_runtime = tsk_seruntime(p), - }; - - task_cputime(p, &cputime.utime, &cputime.stime); -diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c -index b743bf38f08f..16e5754af1cf 100644 ---- a/kernel/sched/idle.c -+++ b/kernel/sched/idle.c -@@ -361,6 +361,7 @@ void cpu_startup_entry(enum cpuhp_state state) - do_idle(); - } - -+#ifndef CONFIG_SCHED_PDS - /* - * idle-task scheduling class. - */ -@@ -481,3 +482,4 @@ const struct sched_class idle_sched_class = { - .switched_to = switched_to_idle, - .update_curr = update_curr_idle, - }; -+#endif -diff --git a/kernel/sched/pds.c b/kernel/sched/pds.c -new file mode 100644 -index 000000000000..02d7d5a67c77 ---- /dev/null -+++ b/kernel/sched/pds.c -@@ -0,0 +1,6554 @@ -+/* -+ * kernel/sched/pds.c, was kernel/sched.c -+ * -+ * PDS-mq Core kernel scheduler code and related syscalls -+ * -+ * Copyright (C) 1991-2002 Linus Torvalds -+ * -+ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes -+ * a whole lot of those previous things. 
-+ * 2017-09-06 Priority and Deadline based Skip list multiple queue kernel -+ * scheduler by Alfred Chen. -+ */ -+#include "pds_sched.h" -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#include -+ -+#include "../workqueue_internal.h" -+#include "../../fs/io-wq.h" -+#include "../smpboot.h" -+ -+#include "pelt.h" -+ -+#define CREATE_TRACE_POINTS -+#include -+ -+ -+#define rt_prio(prio) ((prio) < MAX_RT_PRIO) -+#define rt_task(p) rt_prio((p)->prio) -+#define rt_policy(policy) ((policy) == SCHED_FIFO || \ -+ (policy) == SCHED_RR || \ -+ (policy) == SCHED_ISO) -+#define task_has_rt_policy(p) (rt_policy((p)->policy)) -+ -+#define idle_policy(policy) ((policy) == SCHED_IDLE) -+#define idleprio_task(p) unlikely(idle_policy((p)->policy)) -+ -+#define STOP_PRIO (MAX_RT_PRIO - 1) -+ -+/* -+ * Some helpers for converting to/from various scales. Use shifts to get -+ * approximate multiples of ten for less overhead. 
-+ */ -+#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) -+#define JIFFY_NS (1000000000 / HZ) -+#define HALF_JIFFY_NS (1000000000 / HZ / 2) -+#define HALF_JIFFY_US (1000000 / HZ / 2) -+#define MS_TO_NS(TIME) ((TIME) << 20) -+#define MS_TO_US(TIME) ((TIME) << 10) -+#define NS_TO_MS(TIME) ((TIME) >> 20) -+#define NS_TO_US(TIME) ((TIME) >> 10) -+#define US_TO_NS(TIME) ((TIME) << 10) -+ -+#define RESCHED_US (100) /* Reschedule if less than this many μs left */ -+ -+enum { -+ BASE_CPU_AFFINITY_CHK_LEVEL = 1, -+#ifdef CONFIG_SCHED_SMT -+ SMT_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, -+#endif -+#ifdef CONFIG_SCHED_MC -+ MC_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, -+#endif -+ NR_CPU_AFFINITY_CHK_LEVEL -+}; -+ -+static inline void print_scheduler_version(void) -+{ -+ printk(KERN_INFO "pds: PDS-mq CPU Scheduler 0.99o by Alfred Chen and kept alive artificially by Tk-Glitch.\n"); -+} -+ -+/* -+ * This is the time all tasks within the same priority round robin. -+ * Value is in ms and set to a minimum of 6ms. Scales with number of cpus. -+ * Tunable via /proc interface. 
-+ */ -+#define SCHED_DEFAULT_RR (4) -+int rr_interval __read_mostly = SCHED_DEFAULT_RR; -+ -+static int __init rr_interval_set(char *str) -+{ -+ u32 rr; -+ -+ pr_info("rr_interval: "); -+ if (kstrtouint(str, 0, &rr)) { -+ pr_cont("using default of %u, unable to parse %s\n", -+ rr_interval, str); -+ return 1; -+ } -+ -+ rr_interval = rr; -+ pr_cont("%d\n", rr_interval); -+ -+ return 1; -+} -+__setup("rr_interval=", rr_interval_set); -+ -+ -+static const u64 sched_prio2deadline[NICE_WIDTH] = { -+/* -20 */ 6291456, 6920601, 7612661, 8373927, 9211319, -+/* -15 */ 10132450, 11145695, 12260264, 13486290, 14834919, -+/* -10 */ 16318410, 17950251, 19745276, 21719803, 23891783, -+/* -5 */ 26280961, 28909057, 31799962, 34979958, 38477953, -+/* 0 */ 42325748, 46558322, 51214154, 56335569, 61969125, -+/* 5 */ 68166037, 74982640, 82480904, 90728994, 99801893, -+/* 10 */ 109782082, 120760290, 132836319, 146119950, 160731945, -+/* 15 */ 176805139, 194485652, 213934217, 235327638, 258860401 -+}; -+ -+/** -+ * sched_yield_type - Choose what sort of yield sched_yield will perform. -+ * 0: No yield. -+ * 1: Yield only to better priority/deadline tasks. (default) -+ * 2: Expire timeslice and recalculate deadline. -+ */ -+int sched_yield_type __read_mostly = 1; -+ -+/* -+ * The quota handed out to tasks of all priority levels when refilling their -+ * time_slice. 
-+ */ -+static inline int timeslice(void) -+{ -+ return MS_TO_US(rr_interval); -+} -+ -+#ifdef CONFIG_SMP -+enum { -+SCHED_RQ_EMPTY = 0, -+SCHED_RQ_IDLE, -+SCHED_RQ_NORMAL_0, -+SCHED_RQ_NORMAL_1, -+SCHED_RQ_NORMAL_2, -+SCHED_RQ_NORMAL_3, -+SCHED_RQ_NORMAL_4, -+SCHED_RQ_NORMAL_5, -+SCHED_RQ_NORMAL_6, -+SCHED_RQ_NORMAL_7, -+SCHED_RQ_ISO, -+SCHED_RQ_RT, -+NR_SCHED_RQ_QUEUED_LEVEL -+}; -+ -+static cpumask_t sched_rq_queued_masks[NR_SCHED_RQ_QUEUED_LEVEL] -+____cacheline_aligned_in_smp; -+ -+static DECLARE_BITMAP(sched_rq_queued_masks_bitmap, NR_SCHED_RQ_QUEUED_LEVEL) -+____cacheline_aligned_in_smp; -+ -+static cpumask_t sched_rq_pending_masks[NR_SCHED_RQ_QUEUED_LEVEL] -+____cacheline_aligned_in_smp; -+ -+static DECLARE_BITMAP(sched_rq_pending_masks_bitmap, NR_SCHED_RQ_QUEUED_LEVEL) -+____cacheline_aligned_in_smp; -+ -+DEFINE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_CHK_LEVEL], sched_cpu_affinity_chk_masks); -+DEFINE_PER_CPU(cpumask_t *, sched_cpu_llc_start_mask); -+DEFINE_PER_CPU(cpumask_t *, sched_cpu_affinity_chk_end_masks); -+ -+#ifdef CONFIG_SCHED_SMT -+DEFINE_PER_CPU(int, sched_sibling_cpu); -+DEFINE_STATIC_KEY_FALSE(sched_smt_present); -+EXPORT_SYMBOL_GPL(sched_smt_present); -+ -+static cpumask_t sched_cpu_sg_idle_mask ____cacheline_aligned_in_smp; -+ -+#ifdef CONFIG_SMT_NICE -+/* -+ * Preemptible sibling group mask -+ * Which all sibling cpus are running at PRIO_LIMIT or IDLE_PRIO -+ */ -+static cpumask_t sched_cpu_psg_mask ____cacheline_aligned_in_smp; -+/* -+ * SMT supressed mask -+ * When a cpu is running task with NORMAL/ISO/RT policy, its sibling cpu -+ * will be supressed to run IDLE priority task. 
-+ */ -+static cpumask_t sched_smt_supressed_mask ____cacheline_aligned_in_smp; -+#endif /* CONFIG_SMT_NICE */ -+#endif -+ -+static int sched_rq_prio[NR_CPUS] ____cacheline_aligned; -+ -+/* -+ * Keep a unique ID per domain (we use the first CPUs number in the cpumask of -+ * the domain), this allows us to quickly tell if two cpus are in the same cache -+ * domain, see cpus_share_cache(). -+ */ -+DEFINE_PER_CPU(int, sd_llc_id); -+ -+int __weak arch_sd_sibling_asym_packing(void) -+{ -+ return 0*SD_ASYM_PACKING; -+} -+#else -+struct rq *uprq; -+#endif /* CONFIG_SMP */ -+ -+static DEFINE_MUTEX(sched_hotcpu_mutex); -+ -+DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -+ -+#ifndef prepare_arch_switch -+# define prepare_arch_switch(next) do { } while (0) -+#endif -+#ifndef finish_arch_post_lock_switch -+# define finish_arch_post_lock_switch() do { } while (0) -+#endif -+ -+/* -+ * Context: p->pi_lock -+ */ -+static inline struct rq -+*__task_access_lock(struct task_struct *p, raw_spinlock_t **plock) -+{ -+ struct rq *rq; -+ for (;;) { -+ rq = task_rq(p); -+ if (p->on_cpu || task_on_rq_queued(p)) { -+ raw_spin_lock(&rq->lock); -+ if (likely((p->on_cpu || task_on_rq_queued(p)) -+ && rq == task_rq(p))) { -+ *plock = &rq->lock; -+ return rq; -+ } -+ raw_spin_unlock(&rq->lock); -+ } else if (task_on_rq_migrating(p)) { -+ do { -+ cpu_relax(); -+ } while (unlikely(task_on_rq_migrating(p))); -+ } else { -+ *plock = NULL; -+ return rq; -+ } -+ } -+} -+ -+static inline void -+__task_access_unlock(struct task_struct *p, raw_spinlock_t *lock) -+{ -+ if (NULL != lock) -+ raw_spin_unlock(lock); -+} -+ -+static inline struct rq -+*task_access_lock_irqsave(struct task_struct *p, raw_spinlock_t **plock, -+ unsigned long *flags) -+{ -+ struct rq *rq; -+ for (;;) { -+ rq = task_rq(p); -+ if (p->on_cpu || task_on_rq_queued(p)) { -+ raw_spin_lock_irqsave(&rq->lock, *flags); -+ if (likely((p->on_cpu || task_on_rq_queued(p)) -+ && rq == task_rq(p))) { -+ *plock = &rq->lock; -+ return rq; 
-+ } -+ raw_spin_unlock_irqrestore(&rq->lock, *flags); -+ } else if (task_on_rq_migrating(p)) { -+ do { -+ cpu_relax(); -+ } while (unlikely(task_on_rq_migrating(p))); -+ } else { -+ raw_spin_lock_irqsave(&p->pi_lock, *flags); -+ if (likely(!p->on_cpu && !p->on_rq && -+ rq == task_rq(p))) { -+ *plock = &p->pi_lock; -+ return rq; -+ } -+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags); -+ } -+ } -+} -+ -+static inline void -+task_access_unlock_irqrestore(struct task_struct *p, raw_spinlock_t *lock, -+ unsigned long *flags) -+{ -+ raw_spin_unlock_irqrestore(lock, *flags); -+} -+ -+/* -+ * __task_rq_lock - lock the rq @p resides on. -+ */ -+struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ lockdep_assert_held(&p->pi_lock); -+ -+ for (;;) { -+ rq = task_rq(p); -+ raw_spin_lock(&rq->lock); -+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) -+ return rq; -+ raw_spin_unlock(&rq->lock); -+ -+ while (unlikely(task_on_rq_migrating(p))) -+ cpu_relax(); -+ } -+} -+ -+/* -+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. -+ */ -+struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(p->pi_lock) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ for (;;) { -+ raw_spin_lock_irqsave(&p->pi_lock, rf->flags); -+ rq = task_rq(p); -+ raw_spin_lock(&rq->lock); -+ /* -+ * move_queued_task() task_rq_lock() -+ * -+ * ACQUIRE (rq->lock) -+ * [S] ->on_rq = MIGRATING [L] rq = task_rq() -+ * WMB (__set_task_cpu()) ACQUIRE (rq->lock); -+ * [S] ->cpu = new_cpu [L] task_rq() -+ * [L] ->on_rq -+ * RELEASE (rq->lock) -+ * -+ * If we observe the old CPU in task_rq_lock(), the acquire of -+ * the old rq->lock will fully serialize against the stores. -+ * -+ * If we observe the new CPU in task_rq_lock(), the address -+ * dependency headed by '[L] rq = task_rq()' and the acquire -+ * will pair with the WMB to ensure we then also see migrating. 
-+ */ -+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { -+ return rq; -+ } -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); -+ -+ while (unlikely(task_on_rq_migrating(p))) -+ cpu_relax(); -+ } -+} -+ -+/* -+ * RQ-clock updating methods: -+ */ -+ -+static void update_rq_clock_task(struct rq *rq, s64 delta) -+{ -+/* -+ * In theory, the compile should just see 0 here, and optimize out the call -+ * to sched_rt_avg_update. But I don't trust it... -+ */ -+ s64 __maybe_unused steal = 0, irq_delta = 0; -+ -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; -+ -+ /* -+ * Since irq_time is only updated on {soft,}irq_exit, we might run into -+ * this case when a previous update_rq_clock() happened inside a -+ * {soft,}irq region. -+ * -+ * When this happens, we stop ->clock_task and only update the -+ * prev_irq_time stamp to account for the part that fit, so that a next -+ * update will consume the rest. This ensures ->clock_task is -+ * monotonic. -+ * -+ * It does however cause some slight miss-attribution of {soft,}irq -+ * time, a more accurate solution would be to update the irq_time using -+ * the current rq->clock timestamp, except that would require using -+ * atomic ops. 
-+ */ -+ if (irq_delta > delta) -+ irq_delta = delta; -+ -+ rq->prev_irq_time += irq_delta; -+ delta -= irq_delta; -+#endif -+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING -+ if (static_key_false((¶virt_steal_rq_enabled))) { -+ steal = paravirt_steal_clock(cpu_of(rq)); -+ steal -= rq->prev_steal_time_rq; -+ -+ if (unlikely(steal > delta)) -+ steal = delta; -+ -+ rq->prev_steal_time_rq += steal; -+ -+ delta -= steal; -+ } -+#endif -+ -+ rq->clock_task += delta; -+ -+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ -+ if ((irq_delta + steal)) -+ update_irq_load_avg(rq, irq_delta + steal); -+#endif -+} -+ -+static inline void update_rq_clock(struct rq *rq) -+{ -+ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; -+ -+ if (unlikely(delta <= 0)) -+ return; -+ rq->clock += delta; -+ update_rq_clock_task(rq, delta); -+} -+ -+static inline void update_task_priodl(struct task_struct *p) -+{ -+ p->priodl = (((u64) (p->prio))<<56) | ((p->deadline)>>8); -+} -+ -+/* -+ * Deadline is "now" in niffies + (offset by priority). Setting the deadline -+ * is the key to everything. It distributes CPU fairly amongst tasks of the -+ * same nice value, it proportions CPU according to nice level, it means the -+ * task that last woke up the longest ago has the earliest deadline, thus -+ * ensuring that interactive tasks get low latency on wake up. The CPU -+ * proportion works out to the square of the virtual deadline difference, so -+ * this equation will give nice 19 3% CPU compared to nice 0. -+ */ -+static inline u64 task_deadline_diff(const struct task_struct *p) -+{ -+ return sched_prio2deadline[TASK_USER_PRIO(p)]; -+} -+ -+static inline u64 static_deadline_diff(int static_prio) -+{ -+ return sched_prio2deadline[USER_PRIO(static_prio)]; -+} -+ -+/* -+ * The time_slice is only refilled when it is empty and that is when we set a -+ * new deadline for non-rt tasks. 
-+ */ -+static inline void time_slice_expired(struct task_struct *p, struct rq *rq) -+{ -+ p->time_slice = timeslice(); -+ if (p->prio >= NORMAL_PRIO) -+ p->deadline = rq->clock + task_deadline_diff(p); -+ -+ update_task_priodl(p); -+} -+ -+static inline struct task_struct *rq_first_queued_task(struct rq *rq) -+{ -+ struct skiplist_node *node = rq->sl_header.next[0]; -+ -+ if (node == &rq->sl_header) -+ return rq->idle; -+ -+ return skiplist_entry(node, struct task_struct, sl_node); -+} -+ -+static inline struct task_struct *rq_second_queued_task(struct rq *rq) -+{ -+ struct skiplist_node *node = rq->sl_header.next[0]->next[0]; -+ -+ if (node == &rq->sl_header) -+ return rq->idle; -+ -+ return skiplist_entry(node, struct task_struct, sl_node); -+} -+ -+static inline int is_second_in_rq(struct task_struct *p, struct rq *rq) -+{ -+ return (p->sl_node.prev[0]->prev[0] == &rq->sl_header); -+} -+ -+static const int task_dl_hash_tbl[] = { -+/* 0 4 8 12 */ -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, -+/* 16 20 24 28 */ -+ 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 5, 6, 7 -+}; -+ -+static inline int -+task_deadline_level(const struct task_struct *p, const struct rq *rq) -+{ -+ u64 delta = (rq->clock + sched_prio2deadline[39] - p->deadline) >> 23; -+ -+ delta = min((size_t)delta, ARRAY_SIZE(task_dl_hash_tbl) - 1); -+ return task_dl_hash_tbl[delta]; -+} -+ -+/* -+ * cmpxchg based fetch_or, macro so it works for different integer types -+ */ -+#define fetch_or(ptr, mask) \ -+ ({ \ -+ typeof(ptr) _ptr = (ptr); \ -+ typeof(mask) _mask = (mask); \ -+ typeof(*_ptr) _old, _val = *_ptr; \ -+ \ -+ for (;;) { \ -+ _old = cmpxchg(_ptr, _val, _val | _mask); \ -+ if (_old == _val) \ -+ break; \ -+ _val = _old; \ -+ } \ -+ _old; \ -+}) -+ -+#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) -+/* -+ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, -+ * this avoids any races wrt polling state changes and thereby avoids -+ * spurious IPIs. 
-+ */ -+static bool set_nr_and_not_polling(struct task_struct *p) -+{ -+ struct thread_info *ti = task_thread_info(p); -+ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); -+} -+ -+/* -+ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. -+ * -+ * If this returns true, then the idle task promises to call -+ * sched_ttwu_pending() and reschedule soon. -+ */ -+static bool set_nr_if_polling(struct task_struct *p) -+{ -+ struct thread_info *ti = task_thread_info(p); -+ typeof(ti->flags) old, val = READ_ONCE(ti->flags); -+ -+ for (;;) { -+ if (!(val & _TIF_POLLING_NRFLAG)) -+ return false; -+ if (val & _TIF_NEED_RESCHED) -+ return true; -+ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); -+ if (old == val) -+ break; -+ val = old; -+ } -+ return true; -+} -+ -+#else -+static bool set_nr_and_not_polling(struct task_struct *p) -+{ -+ set_tsk_need_resched(p); -+ return true; -+} -+ -+#ifdef CONFIG_SMP -+static bool set_nr_if_polling(struct task_struct *p) -+{ -+ return false; -+} -+#endif -+#endif -+ -+#ifdef CONFIG_SMP -+#ifdef CONFIG_SMT_NICE -+static void resched_cpu_if_curr_is(int cpu, int priority) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ rcu_read_lock(); -+ -+ if (rcu_dereference(rq->curr)->prio != priority) -+ goto out; -+ -+ if (set_nr_if_polling(rq->idle)) { -+ trace_sched_wake_idle_without_ipi(cpu); -+ } else { -+ if (!do_raw_spin_trylock(&rq->lock)) -+ goto out; -+ spin_acquire(&rq->lock.dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); -+ -+ if (priority == rq->curr->prio) -+ smp_send_reschedule(cpu); -+ /* Else CPU is not idle, do nothing here */ -+ -+ spin_release(&rq->lock.dep_map, _RET_IP_); -+ do_raw_spin_unlock(&rq->lock); -+ } -+ -+out: -+ rcu_read_unlock(); -+} -+#endif /* CONFIG_SMT_NICE */ -+ -+static inline bool -+__update_cpumasks_bitmap(int cpu, unsigned long *plevel, unsigned long level, -+ cpumask_t cpumasks[], unsigned long bitmap[]) -+{ -+ if (*plevel == level) -+ return false; -+ -+ cpumask_clear_cpu(cpu, 
cpumasks + *plevel); -+ if (cpumask_empty(cpumasks + *plevel)) -+ clear_bit(*plevel, bitmap); -+ cpumask_set_cpu(cpu, cpumasks + level); -+ set_bit(level, bitmap); -+ -+ *plevel = level; -+ -+ return true; -+} -+ -+static inline int -+task_running_policy_level(const struct task_struct *p, const struct rq *rq) -+{ -+ int prio = p->prio; -+ -+ if (NORMAL_PRIO == prio) -+ return SCHED_RQ_NORMAL_0 + task_deadline_level(p, rq); -+ -+ if (ISO_PRIO == prio) -+ return SCHED_RQ_ISO; -+ if (prio < MAX_RT_PRIO) -+ return SCHED_RQ_RT; -+ return PRIO_LIMIT - prio; -+} -+ -+static inline void update_sched_rq_queued_masks_normal(struct rq *rq) -+{ -+ struct task_struct *p = rq_first_queued_task(rq); -+ -+ if (p->prio != NORMAL_PRIO) -+ return; -+ -+ __update_cpumasks_bitmap(cpu_of(rq), &rq->queued_level, -+ task_running_policy_level(p, rq), -+ &sched_rq_queued_masks[0], -+ &sched_rq_queued_masks_bitmap[0]); -+} -+ -+#ifdef CONFIG_SMT_NICE -+static inline void update_sched_cpu_psg_mask(const int cpu) -+{ -+ cpumask_t tmp; -+ -+ cpumask_or(&tmp, &sched_rq_queued_masks[SCHED_RQ_EMPTY], -+ &sched_rq_queued_masks[SCHED_RQ_IDLE]); -+ cpumask_and(&tmp, &tmp, cpu_smt_mask(cpu)); -+ if (cpumask_equal(&tmp, cpu_smt_mask(cpu))) -+ cpumask_or(&sched_cpu_psg_mask, &sched_cpu_psg_mask, -+ cpu_smt_mask(cpu)); -+ else -+ cpumask_andnot(&sched_cpu_psg_mask, &sched_cpu_psg_mask, -+ cpu_smt_mask(cpu)); -+} -+#endif -+ -+static inline void update_sched_rq_queued_masks(struct rq *rq) -+{ -+ int cpu = cpu_of(rq); -+ struct task_struct *p = rq_first_queued_task(rq); -+ unsigned long level; -+#ifdef CONFIG_SCHED_SMT -+ unsigned long last_level = rq->queued_level; -+#endif -+ -+ level = task_running_policy_level(p, rq); -+ sched_rq_prio[cpu] = p->prio; -+ -+ if (!__update_cpumasks_bitmap(cpu, &rq->queued_level, level, -+ &sched_rq_queued_masks[0], -+ &sched_rq_queued_masks_bitmap[0])) -+ return; -+ -+#ifdef CONFIG_SCHED_SMT -+ if (cpu == per_cpu(sched_sibling_cpu, cpu)) -+ return; -+ -+ if 
(SCHED_RQ_EMPTY == last_level) { -+ cpumask_andnot(&sched_cpu_sg_idle_mask, &sched_cpu_sg_idle_mask, -+ cpu_smt_mask(cpu)); -+ } else if (SCHED_RQ_EMPTY == level) { -+ cpumask_t tmp; -+ -+ cpumask_and(&tmp, cpu_smt_mask(cpu), -+ &sched_rq_queued_masks[SCHED_RQ_EMPTY]); -+ if (cpumask_equal(&tmp, cpu_smt_mask(cpu))) -+ cpumask_or(&sched_cpu_sg_idle_mask, cpu_smt_mask(cpu), -+ &sched_cpu_sg_idle_mask); -+ } -+ -+#ifdef CONFIG_SMT_NICE -+ if (level <= SCHED_RQ_IDLE && last_level > SCHED_RQ_IDLE) { -+ cpumask_clear_cpu(per_cpu(sched_sibling_cpu, cpu), -+ &sched_smt_supressed_mask); -+ update_sched_cpu_psg_mask(cpu); -+ resched_cpu_if_curr_is(per_cpu(sched_sibling_cpu, cpu), PRIO_LIMIT); -+ } else if (last_level <= SCHED_RQ_IDLE && level > SCHED_RQ_IDLE) { -+ cpumask_set_cpu(per_cpu(sched_sibling_cpu, cpu), -+ &sched_smt_supressed_mask); -+ update_sched_cpu_psg_mask(cpu); -+ resched_cpu_if_curr_is(per_cpu(sched_sibling_cpu, cpu), IDLE_PRIO); -+ } -+#endif /* CONFIG_SMT_NICE */ -+#endif -+} -+ -+static inline void update_sched_rq_pending_masks(struct rq *rq) -+{ -+ unsigned long level; -+ struct task_struct *p = rq_second_queued_task(rq); -+ -+ level = task_running_policy_level(p, rq); -+ -+ __update_cpumasks_bitmap(cpu_of(rq), &rq->pending_level, level, -+ &sched_rq_pending_masks[0], -+ &sched_rq_pending_masks_bitmap[0]); -+} -+ -+#else /* CONFIG_SMP */ -+static inline void update_sched_rq_queued_masks(struct rq *rq) {} -+static inline void update_sched_rq_queued_masks_normal(struct rq *rq) {} -+static inline void update_sched_rq_pending_masks(struct rq *rq) {} -+#endif -+ -+#ifdef CONFIG_NO_HZ_FULL -+/* -+ * Tick may be needed by tasks in the runqueue depending on their policy and -+ * requirements. If tick is needed, lets send the target an IPI to kick it out -+ * of nohz mode if necessary. 
-+ */ -+static inline void sched_update_tick_dependency(struct rq *rq) -+{ -+ int cpu; -+ -+ if (!tick_nohz_full_enabled()) -+ return; -+ -+ cpu = cpu_of(rq); -+ -+ if (!tick_nohz_full_cpu(cpu)) -+ return; -+ -+ if (rq->nr_running < 2) -+ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); -+ else -+ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); -+} -+#else /* !CONFIG_NO_HZ_FULL */ -+static inline void sched_update_tick_dependency(struct rq *rq) { } -+#endif -+ -+/* -+ * Removing from the runqueue. Deleting a task from the skip list is done -+ * via the stored node reference in the task struct and does not require a full -+ * look up. Thus it occurs in O(k) time where k is the "level" of the list the -+ * task was stored at - usually < 4, max 16. -+ * -+ * Context: rq->lock -+ */ -+static inline void dequeue_task(struct task_struct *p, struct rq *rq, int flags) -+{ -+ lockdep_assert_held(&rq->lock); -+ -+ WARN_ONCE(task_rq(p) != rq, "pds: dequeue task reside on cpu%d from cpu%d\n", -+ task_cpu(p), cpu_of(rq)); -+ if (skiplist_del_init(&rq->sl_header, &p->sl_node)) { -+ update_sched_rq_queued_masks(rq); -+ update_sched_rq_pending_masks(rq); -+ } else if (is_second_in_rq(p, rq)) -+ update_sched_rq_pending_masks(rq); -+ rq->nr_running--; -+ -+ sched_update_tick_dependency(rq); -+ psi_dequeue(p, flags & DEQUEUE_SLEEP); -+ -+ sched_info_dequeued(rq, p); -+} -+ -+/* -+ * To determine if it's safe for a task of SCHED_IDLE to actually run as -+ * an idle task, we ensure none of the following conditions are met. -+ */ -+static inline bool idleprio_suitable(struct task_struct *p) -+{ -+ return (!freezing(p) && !signal_pending(p) && -+ !(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING))); -+} -+ -+/* -+ * pds_skiplist_random_level -- Returns a pseudo-random level number for skip -+ * list node which is used in PDS run queue. 
-+ * -+ * In current implementation, based on testing, the first 8 bits in microseconds -+ * of niffies are suitable for random level population. -+ * find_first_bit() is used to satisfy p = 0.5 between each levels, and there -+ * should be platform hardware supported instruction(known as ctz/clz) to speed -+ * up this function. -+ * The skiplist level for a task is populated when task is created and doesn't -+ * change in task's life time. When task is being inserted into run queue, this -+ * skiplist level is set to task's sl_node->level, the skiplist insert function -+ * may change it based on current level of the skip lsit. -+ */ -+static inline int pds_skiplist_random_level(const struct task_struct *p) -+{ -+ long unsigned int randseed; -+ -+ /* -+ * 1. Some architectures don't have better than microsecond resolution -+ * so mask out ~microseconds as a factor of the random seed for skiplist -+ * insertion. -+ * 2. Use address of task structure pointer as another factor of the -+ * random seed for task burst forking scenario. -+ */ -+ randseed = (task_rq(p)->clock ^ (long unsigned int)p) >> 10; -+ -+ return find_first_bit(&randseed, NUM_SKIPLIST_LEVEL - 1); -+} -+ -+/** -+ * pds_skiplist_task_search -- search function used in PDS run queue skip list -+ * node insert operation. -+ * @it: iterator pointer to the node in the skip list -+ * @node: pointer to the skiplist_node to be inserted -+ * -+ * Returns true if key of @it is less or equal to key value of @node, otherwise -+ * false. -+ */ -+static inline bool -+pds_skiplist_task_search(struct skiplist_node *it, struct skiplist_node *node) -+{ -+ return (skiplist_entry(it, struct task_struct, sl_node)->priodl <= -+ skiplist_entry(node, struct task_struct, sl_node)->priodl); -+} -+ -+/* -+ * Define the skip list insert function for PDS -+ */ -+DEFINE_SKIPLIST_INSERT_FUNC(pds_skiplist_insert, pds_skiplist_task_search); -+ -+/* -+ * Adding task to the runqueue. 
-+ * -+ * Context: rq->lock -+ */ -+static inline void enqueue_task(struct task_struct *p, struct rq *rq, int flags) -+{ -+ lockdep_assert_held(&rq->lock); -+ -+ WARN_ONCE(task_rq(p) != rq, "pds: enqueue task reside on cpu%d to cpu%d\n", -+ task_cpu(p), cpu_of(rq)); -+ -+ p->sl_node.level = p->sl_level; -+ if (pds_skiplist_insert(&rq->sl_header, &p->sl_node)) { -+ update_sched_rq_queued_masks(rq); -+ update_sched_rq_pending_masks(rq); -+ } else if (is_second_in_rq(p, rq)) -+ update_sched_rq_pending_masks(rq); -+ rq->nr_running++; -+ -+ sched_update_tick_dependency(rq); -+ -+ sched_info_queued(rq, p); -+ psi_enqueue(p, flags); -+ -+ /* -+ * If in_iowait is set, the code below may not trigger any cpufreq -+ * utilization updates, so do it here explicitly with the IOWAIT flag -+ * passed. -+ */ -+ if (p->in_iowait) -+ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT); -+} -+ -+static inline void requeue_task(struct task_struct *p, struct rq *rq) -+{ -+ bool b_first, b_second; -+ -+ lockdep_assert_held(&rq->lock); -+ -+ WARN_ONCE(task_rq(p) != rq, "pds: cpu[%d] requeue task reside on cpu%d\n", -+ cpu_of(rq), task_cpu(p)); -+ -+ b_first = skiplist_del_init(&rq->sl_header, &p->sl_node); -+ b_second = is_second_in_rq(p, rq); -+ -+ p->sl_node.level = p->sl_level; -+ if (pds_skiplist_insert(&rq->sl_header, &p->sl_node) || b_first) { -+ update_sched_rq_queued_masks(rq); -+ update_sched_rq_pending_masks(rq); -+ } else if (is_second_in_rq(p, rq) || b_second) -+ update_sched_rq_pending_masks(rq); -+} -+ -+/* -+ * resched_curr - mark rq's current task 'to be rescheduled now'. -+ * -+ * On UP this means the setting of the need_resched flag, on SMP it -+ * might also involve a cross-CPU call to trigger the scheduler on -+ * the target CPU. 
-+ */ -+void resched_curr(struct rq *rq) -+{ -+ struct task_struct *curr = rq->curr; -+ int cpu; -+ -+ lockdep_assert_held(&rq->lock); -+ -+ if (test_tsk_need_resched(curr)) -+ return; -+ -+ cpu = cpu_of(rq); -+ if (cpu == smp_processor_id()) { -+ set_tsk_need_resched(curr); -+ set_preempt_need_resched(); -+ return; -+ } -+ -+ if (set_nr_and_not_polling(curr)) -+ smp_send_reschedule(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); -+} -+ -+static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) -+{ -+ struct task_struct *curr = rq->curr; -+ -+ if (curr->prio == PRIO_LIMIT) -+ resched_curr(rq); -+ -+ if (task_running_idle(p)) -+ return; -+ -+ if (p->priodl < curr->priodl) -+ resched_curr(rq); -+} -+ -+#ifdef CONFIG_SCHED_HRTICK -+/* -+ * Use HR-timers to deliver accurate preemption points. -+ */ -+ -+static void hrtick_clear(struct rq *rq) -+{ -+ if (hrtimer_active(&rq->hrtick_timer)) -+ hrtimer_cancel(&rq->hrtick_timer); -+} -+ -+/* -+ * High-resolution timer tick. -+ * Runs from hardirq context with interrupts disabled. 
-+ */ -+static enum hrtimer_restart hrtick(struct hrtimer *timer) -+{ -+ struct rq *rq = container_of(timer, struct rq, hrtick_timer); -+ struct task_struct *p; -+ -+ WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); -+ -+ raw_spin_lock(&rq->lock); -+ p = rq->curr; -+ p->time_slice = 0; -+ resched_curr(rq); -+ raw_spin_unlock(&rq->lock); -+ -+ return HRTIMER_NORESTART; -+} -+ -+/* -+ * Use hrtick when: -+ * - enabled by features -+ * - hrtimer is actually high res -+ */ -+static inline int hrtick_enabled(struct rq *rq) -+{ -+ /** -+ * PDS doesn't support sched_feat yet -+ if (!sched_feat(HRTICK)) -+ return 0; -+ */ -+ if (!cpu_active(cpu_of(rq))) -+ return 0; -+ return hrtimer_is_hres_active(&rq->hrtick_timer); -+} -+ -+#ifdef CONFIG_SMP -+ -+static void __hrtick_restart(struct rq *rq) -+{ -+ struct hrtimer *timer = &rq->hrtick_timer; -+ -+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); -+} -+ -+/* -+ * called from hardirq (IPI) context -+ */ -+static void __hrtick_start(void *arg) -+{ -+ struct rq *rq = arg; -+ -+ raw_spin_lock(&rq->lock); -+ __hrtick_restart(rq); -+ raw_spin_unlock(&rq->lock); -+} -+ -+/* -+ * Called to set the hrtick timer state. -+ * -+ * called with rq->lock held and irqs disabled -+ */ -+void hrtick_start(struct rq *rq, u64 delay) -+{ -+ struct hrtimer *timer = &rq->hrtick_timer; -+ ktime_t time; -+ s64 delta; -+ -+ /* -+ * Don't schedule slices shorter than 10000ns, that just -+ * doesn't make sense and can cause timer DoS. -+ */ -+ delta = max_t(s64, delay, 10000LL); -+ time = ktime_add_ns(timer->base->get_time(), delta); -+ -+ hrtimer_set_expires(timer, time); -+ -+ if (rq == this_rq()) -+ __hrtick_restart(rq); -+ else -+ smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); -+} -+ -+#else -+/* -+ * Called to set the hrtick timer state. 
-+ * -+ * called with rq->lock held and irqs disabled -+ */ -+void hrtick_start(struct rq *rq, u64 delay) -+{ -+ /* -+ * Don't schedule slices shorter than 10000ns, that just -+ * doesn't make sense. Rely on vruntime for fairness. -+ */ -+ delay = max_t(u64, delay, 10000LL); -+ hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), -+ HRTIMER_MODE_REL_PINNED_HARD); -+} -+#endif /* CONFIG_SMP */ -+ -+static void hrtick_rq_init(struct rq *rq) -+{ -+#ifdef CONFIG_SMP -+ rq->hrtick_csd.flags = 0; -+ rq->hrtick_csd.func = __hrtick_start; -+ rq->hrtick_csd.info = rq; -+#endif -+ -+ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); -+ rq->hrtick_timer.function = hrtick; -+} -+ -+static inline int rq_dither(struct rq *rq) -+{ -+ if ((rq->clock - rq->last_tick > HALF_JIFFY_NS) || hrtick_enabled(rq)) -+ return 0; -+ -+ return HALF_JIFFY_NS; -+} -+ -+#else /* CONFIG_SCHED_HRTICK */ -+static inline int hrtick_enabled(struct rq *rq) -+{ -+ return 0; -+} -+ -+static inline void hrtick_clear(struct rq *rq) -+{ -+} -+ -+static inline void hrtick_rq_init(struct rq *rq) -+{ -+} -+ -+static inline int rq_dither(struct rq *rq) -+{ -+ return (rq->clock - rq->last_tick > HALF_JIFFY_NS)? 0:HALF_JIFFY_NS; -+} -+#endif /* CONFIG_SCHED_HRTICK */ -+ -+static inline int normal_prio(struct task_struct *p) -+{ -+ static const int policy_to_prio[] = { -+ NORMAL_PRIO, /* SCHED_NORMAL */ -+ 0, /* SCHED_FIFO */ -+ 0, /* SCHED_RR */ -+ IDLE_PRIO, /* SCHED_BATCH */ -+ ISO_PRIO, /* SCHED_ISO */ -+ IDLE_PRIO /* SCHED_IDLE */ -+ }; -+ -+ if (task_has_rt_policy(p)) -+ return MAX_RT_PRIO - 1 - p->rt_priority; -+ return policy_to_prio[p->policy]; -+} -+ -+/* -+ * Calculate the current priority, i.e. the priority -+ * taken into account by the scheduler. This value might -+ * be boosted by RT tasks as it will be RT if the task got -+ * RT-boosted. If not then it returns p->normal_prio. 
-+ */ -+static int effective_prio(struct task_struct *p) -+{ -+ p->normal_prio = normal_prio(p); -+ /* -+ * If we are RT tasks or we were boosted to RT priority, -+ * keep the priority unchanged. Otherwise, update priority -+ * to the normal priority: -+ */ -+ if (!rt_prio(p->prio)) -+ return p->normal_prio; -+ return p->prio; -+} -+ -+/* -+ * activate_task - move a task to the runqueue. -+ * -+ * Context: rq->lock -+ */ -+static void activate_task(struct task_struct *p, struct rq *rq) -+{ -+ if (task_contributes_to_load(p)) -+ rq->nr_uninterruptible--; -+ enqueue_task(p, rq, ENQUEUE_WAKEUP); -+ p->on_rq = 1; -+ cpufreq_update_this_cpu(rq, 0); -+} -+ -+/* -+ * deactivate_task - remove a task from the runqueue. -+ * -+ * Context: rq->lock -+ */ -+static inline void deactivate_task(struct task_struct *p, struct rq *rq) -+{ -+ if (task_contributes_to_load(p)) -+ rq->nr_uninterruptible++; -+ dequeue_task(p, rq, DEQUEUE_SLEEP); -+ p->on_rq = 0; -+ cpufreq_update_this_cpu(rq, 0); -+} -+ -+static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) -+{ -+#ifdef CONFIG_SMP -+ /* -+ * After ->cpu is set up to a new value, task_access_lock(p, ...) can be -+ * successfully executed on another CPU. We must ensure that updates of -+ * per-task data have been completed by this moment. -+ */ -+ smp_wmb(); -+ -+#ifdef CONFIG_THREAD_INFO_IN_TASK -+ WRITE_ONCE(p->cpu, cpu); -+#else -+ WRITE_ONCE(task_thread_info(p)->cpu, cpu); -+#endif -+#endif -+} -+ -+#ifdef CONFIG_SMP -+void set_task_cpu(struct task_struct *p, unsigned int new_cpu) -+{ -+#ifdef CONFIG_SCHED_DEBUG -+ /* -+ * We should never call set_task_cpu() on a blocked task, -+ * ttwu() will sort out the placement. -+ */ -+ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && -+ !p->on_rq); -+#ifdef CONFIG_LOCKDEP -+ /* -+ * The caller should hold either p->pi_lock or rq->lock, when changing -+ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. 
-+ * -+ * sched_move_task() holds both and thus holding either pins the cgroup, -+ * see task_group(). -+ */ -+ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || -+ lockdep_is_held(&task_rq(p)->lock))); -+#endif -+ /* -+ * Clearly, migrating tasks to offline CPUs is a fairly daft thing. -+ */ -+ WARN_ON_ONCE(!cpu_online(new_cpu)); -+#endif -+ if (task_cpu(p) == new_cpu) -+ return; -+ trace_sched_migrate_task(p, new_cpu); -+ rseq_migrate(p); -+ perf_event_task_migrate(p); -+ -+ __set_task_cpu(p, new_cpu); -+} -+ -+static inline bool is_per_cpu_kthread(struct task_struct *p) -+{ -+ return ((p->flags & PF_KTHREAD) && (1 == p->nr_cpus_allowed)); -+} -+ -+/* -+ * Per-CPU kthreads are allowed to run on !active && online CPUs, see -+ * __set_cpus_allowed_ptr() and select_fallback_rq(). -+ */ -+static inline bool is_cpu_allowed(struct task_struct *p, int cpu) -+{ -+ if (!cpumask_test_cpu(cpu, &p->cpus_mask)) -+ return false; -+ -+ if (is_per_cpu_kthread(p)) -+ return cpu_online(cpu); -+ -+ return cpu_active(cpu); -+} -+ -+/* -+ * This is how migration works: -+ * -+ * 1) we invoke migration_cpu_stop() on the target CPU using -+ * stop_one_cpu(). -+ * 2) stopper starts to run (implicitly forcing the migrated thread -+ * off the CPU) -+ * 3) it checks whether the migrated task is still in the wrong runqueue. -+ * 4) if it's in the wrong runqueue then the migration thread removes -+ * it and puts it into the right queue. -+ * 5) stopper completes and stop_one_cpu() returns and the migration -+ * is done. -+ */ -+ -+/* -+ * move_queued_task - move a queued task to new rq. -+ * -+ * Returns (locked) new rq. Old rq's lock is released. 
-+ */ -+static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int -+ new_cpu) -+{ -+ lockdep_assert_held(&rq->lock); -+ -+ p->on_rq = TASK_ON_RQ_MIGRATING; -+ dequeue_task(p, rq, 0); -+ set_task_cpu(p, new_cpu); -+ raw_spin_unlock(&rq->lock); -+ -+ rq = cpu_rq(new_cpu); -+ -+ raw_spin_lock(&rq->lock); -+ BUG_ON(task_cpu(p) != new_cpu); -+ enqueue_task(p, rq, 0); -+ p->on_rq = TASK_ON_RQ_QUEUED; -+ check_preempt_curr(rq, p); -+ -+ return rq; -+} -+ -+struct migration_arg { -+ struct task_struct *task; -+ int dest_cpu; -+}; -+ -+/* -+ * Move (not current) task off this CPU, onto the destination CPU. We're doing -+ * this because either it can't run here any more (set_cpus_allowed() -+ * away from this CPU, or CPU going down), or because we're -+ * attempting to rebalance this task on exec (sched_exec). -+ * -+ * So we race with normal scheduler movements, but that's OK, as long -+ * as the task is no longer on this CPU. -+ */ -+static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int -+ dest_cpu) -+{ -+ /* Affinity changed (again). */ -+ if (!is_cpu_allowed(p, dest_cpu)) -+ return rq; -+ -+ update_rq_clock(rq); -+ return move_queued_task(rq, p, dest_cpu); -+} -+ -+/* -+ * migration_cpu_stop - this will be executed by a highprio stopper thread -+ * and performs thread migration by bumping thread off CPU then -+ * 'pushing' onto another runqueue. -+ */ -+static int migration_cpu_stop(void *data) -+{ -+ struct migration_arg *arg = data; -+ struct task_struct *p = arg->task; -+ struct rq *rq = this_rq(); -+ -+ /* -+ * The original target CPU might have gone down and we might -+ * be on another CPU but it doesn't matter. -+ */ -+ local_irq_disable(); -+ -+ raw_spin_lock(&p->pi_lock); -+ raw_spin_lock(&rq->lock); -+ /* -+ * If task_rq(p) != rq, it cannot be migrated here, because we're -+ * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because -+ * we're holding p->pi_lock. 
-+ */ -+ if (task_rq(p) == rq) -+ if (task_on_rq_queued(p)) -+ rq = __migrate_task(rq, p, arg->dest_cpu); -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock(&p->pi_lock); -+ -+ local_irq_enable(); -+ return 0; -+} -+ -+static inline void -+set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ cpumask_copy(&p->cpus_mask, new_mask); -+ p->nr_cpus_allowed = cpumask_weight(new_mask); -+} -+ -+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ set_cpus_allowed_common(p, new_mask); -+} -+#endif -+ -+/* Enter with rq lock held. We know p is on the local CPU */ -+static inline void __set_tsk_resched(struct task_struct *p) -+{ -+ set_tsk_need_resched(p); -+ set_preempt_need_resched(); -+} -+ -+/** -+ * task_curr - is this task currently executing on a CPU? -+ * @p: the task in question. -+ * -+ * Return: 1 if the task is currently executing. 0 otherwise. -+ */ -+inline int task_curr(const struct task_struct *p) -+{ -+ return cpu_curr(task_cpu(p)) == p; -+} -+ -+#ifdef CONFIG_SMP -+/* -+ * wait_task_inactive - wait for a thread to unschedule. -+ * -+ * If @match_state is nonzero, it's the @p->state value just checked and -+ * not expected to change. If it changes, i.e. @p might have woken up, -+ * then return zero. When we succeed in waiting for @p to be off its CPU, -+ * we return a positive number (its total switch count). If a second call -+ * a short while later returns the same number, the caller can be sure that -+ * @p has remained unscheduled the whole time. -+ * -+ * The caller must ensure that the task *will* unschedule sometime soon, -+ * else this function might spin for a *long* time. This function can't -+ * be called with interrupts off, or it may introduce deadlock with -+ * smp_call_function() if an IPI is sent by the same process we are -+ * waiting to become inactive. 
-+ */ -+unsigned long wait_task_inactive(struct task_struct *p, long match_state) -+{ -+ unsigned long flags; -+ bool running, on_rq; -+ unsigned long ncsw; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ -+ for (;;) { -+ rq = task_rq(p); -+ -+ /* -+ * If the task is actively running on another CPU -+ * still, just relax and busy-wait without holding -+ * any locks. -+ * -+ * NOTE! Since we don't hold any locks, it's not -+ * even sure that "rq" stays as the right runqueue! -+ * But we don't care, since this will return false -+ * if the runqueue has changed and p is actually now -+ * running somewhere else! -+ */ -+ while (task_running(p) && p == rq->curr) { -+ if (match_state && unlikely(p->state != match_state)) -+ return 0; -+ cpu_relax(); -+ } -+ -+ /* -+ * Ok, time to look more closely! We need the rq -+ * lock now, to be *sure*. If we're wrong, we'll -+ * just go back and repeat. -+ */ -+ task_access_lock_irqsave(p, &lock, &flags); -+ trace_sched_wait_task(p); -+ running = task_running(p); -+ on_rq = p->on_rq; -+ ncsw = 0; -+ if (!match_state || p->state == match_state) -+ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ -+ task_access_unlock_irqrestore(p, lock, &flags); -+ -+ /* -+ * If it changed from the expected state, bail out now. -+ */ -+ if (unlikely(!ncsw)) -+ break; -+ -+ /* -+ * Was it really running after all now that we -+ * checked with the proper locks actually held? -+ * -+ * Oops. Go back and try again.. -+ */ -+ if (unlikely(running)) { -+ cpu_relax(); -+ continue; -+ } -+ -+ /* -+ * It's not enough that it's not actively running, -+ * it must be off the runqueue _entirely_, and not -+ * preempted! -+ * -+ * So if it was still runnable (but just not actively -+ * running right now), it's preempted, and we should -+ * yield - it could be a while. -+ */ -+ if (unlikely(on_rq)) { -+ ktime_t to = NSEC_PER_SEC / HZ; -+ -+ set_current_state(TASK_UNINTERRUPTIBLE); -+ schedule_hrtimeout(&to, HRTIMER_MODE_REL); -+ continue; -+ } -+ -+ /* -+ * Ahh, all good. 
It wasn't running, and it wasn't -+ * runnable, which means that it will never become -+ * running in the future either. We're all done! -+ */ -+ break; -+ } -+ -+ return ncsw; -+} -+ -+/*** -+ * kick_process - kick a running thread to enter/exit the kernel -+ * @p: the to-be-kicked thread -+ * -+ * Cause a process which is running on another CPU to enter -+ * kernel-mode, without any delay. (to get signals handled.) -+ * -+ * NOTE: this function doesn't have to take the runqueue lock, -+ * because all it wants to ensure is that the remote task enters -+ * the kernel. If the IPI races and the task has been migrated -+ * to another CPU then no harm is done and the purpose has been -+ * achieved as well. -+ */ -+void kick_process(struct task_struct *p) -+{ -+ int cpu; -+ -+ preempt_disable(); -+ cpu = task_cpu(p); -+ if ((cpu != smp_processor_id()) && task_curr(p)) -+ smp_send_reschedule(cpu); -+ preempt_enable(); -+} -+EXPORT_SYMBOL_GPL(kick_process); -+ -+/* -+ * ->cpus_mask is protected by both rq->lock and p->pi_lock -+ * -+ * A few notes on cpu_active vs cpu_online: -+ * -+ * - cpu_active must be a subset of cpu_online -+ * -+ * - on CPU-up we allow per-CPU kthreads on the online && !active CPU, -+ * see __set_cpus_allowed_ptr(). At this point the newly online -+ * CPU isn't yet part of the sched domains, and balancing will not -+ * see it. -+ * -+ * - on cpu-down we clear cpu_active() to mask the sched domains and -+ * avoid the load balancer to place new tasks on the to be removed -+ * CPU. Existing tasks will remain running there and will be taken -+ * off. -+ * -+ * This means that fallback selection must not select !active CPUs. -+ * And can assume that any active CPU must be online. Conversely -+ * select_task_rq() below may allow selection of !active CPUs in order -+ * to satisfy the above rules. 
-+ */ -+static int select_fallback_rq(int cpu, struct task_struct *p) -+{ -+ int nid = cpu_to_node(cpu); -+ const struct cpumask *nodemask = NULL; -+ enum { cpuset, possible, fail } state = cpuset; -+ int dest_cpu; -+ -+ /* -+ * If the node that the CPU is on has been offlined, cpu_to_node() -+ * will return -1. There is no CPU on the node, and we should -+ * select the CPU on the other node. -+ */ -+ if (nid != -1) { -+ nodemask = cpumask_of_node(nid); -+ -+ /* Look for allowed, online CPU in same node. */ -+ for_each_cpu(dest_cpu, nodemask) { -+ if (!cpu_active(dest_cpu)) -+ continue; -+ if (cpumask_test_cpu(dest_cpu, &p->cpus_mask)) -+ return dest_cpu; -+ } -+ } -+ -+ for (;;) { -+ /* Any allowed, online CPU? */ -+ for_each_cpu(dest_cpu, &p->cpus_mask) { -+ if (!is_cpu_allowed(p, dest_cpu)) -+ continue; -+ goto out; -+ } -+ -+ /* No more Mr. Nice Guy. */ -+ switch (state) { -+ case cpuset: -+ if (IS_ENABLED(CONFIG_CPUSETS)) { -+ cpuset_cpus_allowed_fallback(p); -+ state = possible; -+ break; -+ } -+ /* Fall-through */ -+ case possible: -+ do_set_cpus_allowed(p, cpu_possible_mask); -+ state = fail; -+ break; -+ -+ case fail: -+ BUG(); -+ break; -+ } -+ } -+ -+out: -+ if (state != cpuset) { -+ /* -+ * Don't tell them about moving exiting tasks or -+ * kernel threads (both mm NULL), since they never -+ * leave kernel. 
-+ */ -+ if (p->mm && printk_ratelimit()) { -+ printk_deferred("process %d (%s) no longer affine to cpu%d\n", -+ task_pid_nr(p), p->comm, cpu); -+ } -+ } -+ -+ return dest_cpu; -+} -+ -+static inline int best_mask_cpu(int cpu, const cpumask_t *cpumask) -+{ -+ cpumask_t *mask; -+ -+ if (cpumask_test_cpu(cpu, cpumask)) -+ return cpu; -+ -+ mask = &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[0]); -+ while ((cpu = cpumask_any_and(cpumask, mask)) >= nr_cpu_ids) -+ mask++; -+ -+ return cpu; -+} -+ -+/* -+ * task_preemptible_rq - return the rq which the given task can preempt on -+ * @p: task wants to preempt CPU -+ * @only_preempt_low_policy: indicate only preempt rq running low policy than @p -+ */ -+static inline int -+task_preemptible_rq_idle(struct task_struct *p, cpumask_t *chk_mask) -+{ -+ cpumask_t tmp; -+ -+#ifdef CONFIG_SCHED_SMT -+ if (cpumask_and(&tmp, chk_mask, &sched_cpu_sg_idle_mask)) -+ return best_mask_cpu(task_cpu(p), &tmp); -+#endif -+ -+#ifdef CONFIG_SMT_NICE -+ /* Only ttwu on cpu which is not smt supressed */ -+ if (cpumask_andnot(&tmp, chk_mask, &sched_smt_supressed_mask)) { -+ cpumask_t t; -+ if (cpumask_and(&t, &tmp, &sched_rq_queued_masks[SCHED_RQ_EMPTY])) -+ return best_mask_cpu(task_cpu(p), &t); -+ return best_mask_cpu(task_cpu(p), &tmp); -+ } -+#endif -+ -+ if (cpumask_and(&tmp, chk_mask, &sched_rq_queued_masks[SCHED_RQ_EMPTY])) -+ return best_mask_cpu(task_cpu(p), &tmp); -+ return best_mask_cpu(task_cpu(p), chk_mask); -+} -+ -+static inline int -+task_preemptible_rq(struct task_struct *p, cpumask_t *chk_mask, -+ int preempt_level) -+{ -+ cpumask_t tmp; -+ int level; -+ -+#ifdef CONFIG_SCHED_SMT -+#ifdef CONFIG_SMT_NICE -+ if (cpumask_and(&tmp, chk_mask, &sched_cpu_psg_mask)) -+ return best_mask_cpu(task_cpu(p), &tmp); -+#else -+ if (cpumask_and(&tmp, chk_mask, &sched_cpu_sg_idle_mask)) -+ return best_mask_cpu(task_cpu(p), &tmp); -+#endif -+#endif -+ -+ level = find_first_bit(sched_rq_queued_masks_bitmap, -+ NR_SCHED_RQ_QUEUED_LEVEL); -+ -+ 
while (level < preempt_level) { -+ if (cpumask_and(&tmp, chk_mask, &sched_rq_queued_masks[level])) -+ return best_mask_cpu(task_cpu(p), &tmp); -+ -+ level = find_next_bit(sched_rq_queued_masks_bitmap, -+ NR_SCHED_RQ_QUEUED_LEVEL, -+ level + 1); -+ } -+ -+ if (unlikely(SCHED_RQ_RT == level && -+ level == preempt_level && -+ cpumask_and(&tmp, chk_mask, -+ &sched_rq_queued_masks[SCHED_RQ_RT]))) { -+ unsigned int cpu; -+ -+ for_each_cpu (cpu, &tmp) -+ if (p->prio < sched_rq_prio[cpu]) -+ return cpu; -+ } -+ -+ return best_mask_cpu(task_cpu(p), chk_mask); -+} -+ -+static inline int select_task_rq(struct task_struct *p) -+{ -+ cpumask_t chk_mask; -+ -+ if (unlikely(!cpumask_and(&chk_mask, &p->cpus_mask, cpu_online_mask))) -+ return select_fallback_rq(task_cpu(p), p); -+ -+ /* Check IDLE tasks suitable to run normal priority */ -+ if (idleprio_task(p)) { -+ if (idleprio_suitable(p)) { -+ p->prio = p->normal_prio; -+ update_task_priodl(p); -+ return task_preemptible_rq_idle(p, &chk_mask); -+ } -+ p->prio = NORMAL_PRIO; -+ update_task_priodl(p); -+ } -+ -+ return task_preemptible_rq(p, &chk_mask, -+ task_running_policy_level(p, this_rq())); -+} -+#else /* CONFIG_SMP */ -+static inline int select_task_rq(struct task_struct *p) -+{ -+ return 0; -+} -+#endif /* CONFIG_SMP */ -+ -+static void -+ttwu_stat(struct task_struct *p, int cpu, int wake_flags) -+{ -+ struct rq *rq; -+ -+ if (!schedstat_enabled()) -+ return; -+ -+ rq= this_rq(); -+ -+#ifdef CONFIG_SMP -+ if (cpu == rq->cpu) -+ __schedstat_inc(rq->ttwu_local); -+ else { -+ /** PDS ToDo: -+ * How to do ttwu_wake_remote -+ */ -+ } -+#endif /* CONFIG_SMP */ -+ -+ __schedstat_inc(rq->ttwu_count); -+} -+ -+/* -+ * Mark the task runnable and perform wakeup-preemption. 
-+ */ -+static inline void -+ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) -+{ -+ p->state = TASK_RUNNING; -+ trace_sched_wakeup(p); -+} -+ -+static inline void -+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) -+{ -+#ifdef CONFIG_SMP -+ if (p->sched_contributes_to_load) -+ rq->nr_uninterruptible--; -+#endif -+ -+ activate_task(p, rq); -+ ttwu_do_wakeup(rq, p, 0); -+} -+ -+static int ttwu_remote(struct task_struct *p, int wake_flags) -+{ -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ int ret = 0; -+ -+ rq = __task_access_lock(p, &lock); -+ if (task_on_rq_queued(p)) { -+ ttwu_do_wakeup(rq, p, wake_flags); -+ ret = 1; -+ } -+ __task_access_unlock(p, lock); -+ -+ return ret; -+} -+ -+/* -+ * Notes on Program-Order guarantees on SMP systems. -+ * -+ * MIGRATION -+ * -+ * The basic program-order guarantee on SMP systems is that when a task [t] -+ * migrates, all its activity on its old CPU [c0] happens-before any subsequent -+ * execution on its new CPU [c1]. -+ * -+ * For migration (of runnable tasks) this is provided by the following means: -+ * -+ * A) UNLOCK of the rq(c0)->lock scheduling out task t -+ * B) migration for t is required to synchronize *both* rq(c0)->lock and -+ * rq(c1)->lock (if not at the same time, then in that order). -+ * C) LOCK of the rq(c1)->lock scheduling in task -+ * -+ * Transitivity guarantees that B happens after A and C after B. -+ * Note: we only require RCpc transitivity. -+ * Note: the CPU doing B need not be c0 or c1 -+ * -+ * Example: -+ * -+ * CPU0 CPU1 CPU2 -+ * -+ * LOCK rq(0)->lock -+ * sched-out X -+ * sched-in Y -+ * UNLOCK rq(0)->lock -+ * -+ * LOCK rq(0)->lock // orders against CPU0 -+ * dequeue X -+ * UNLOCK rq(0)->lock -+ * -+ * LOCK rq(1)->lock -+ * enqueue X -+ * UNLOCK rq(1)->lock -+ * -+ * LOCK rq(1)->lock // orders against CPU2 -+ * sched-out Z -+ * sched-in X -+ * UNLOCK rq(1)->lock -+ * -+ * -+ * BLOCKING -- aka. 
SLEEP + WAKEUP -+ * -+ * For blocking we (obviously) need to provide the same guarantee as for -+ * migration. However the means are completely different as there is no lock -+ * chain to provide order. Instead we do: -+ * -+ * 1) smp_store_release(X->on_cpu, 0) -+ * 2) smp_cond_load_acquire(!X->on_cpu) -+ * -+ * Example: -+ * -+ * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule) -+ * -+ * LOCK rq(0)->lock LOCK X->pi_lock -+ * dequeue X -+ * sched-out X -+ * smp_store_release(X->on_cpu, 0); -+ * -+ * smp_cond_load_acquire(&X->on_cpu, !VAL); -+ * X->state = WAKING -+ * set_task_cpu(X,2) -+ * -+ * LOCK rq(2)->lock -+ * enqueue X -+ * X->state = RUNNING -+ * UNLOCK rq(2)->lock -+ * -+ * LOCK rq(2)->lock // orders against CPU1 -+ * sched-out Z -+ * sched-in X -+ * UNLOCK rq(2)->lock -+ * -+ * UNLOCK X->pi_lock -+ * UNLOCK rq(0)->lock -+ * -+ * -+ * However; for wakeups there is a second guarantee we must provide, namely we -+ * must observe the state that lead to our wakeup. That is, not only must our -+ * task observe its own prior state, it must also observe the stores prior to -+ * its wakeup. -+ * -+ * This means that any means of doing remote wakeups must order the CPU doing -+ * the wakeup against the CPU the task is going to end up running on. This, -+ * however, is already required for the regular Program-Order guarantee above, -+ * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire). -+ * -+ */ -+ -+/*** -+ * try_to_wake_up - wake up a thread -+ * @p: the thread to be awakened -+ * @state: the mask of task states that can be woken -+ * @wake_flags: wake modifier flags (WF_*) -+ * -+ * Put it on the run-queue if it's not already there. The "current" -+ * thread is always on the run-queue (except when the actual -+ * re-schedule is in progress), and as such you're allowed to do -+ * the simpler "current->state = TASK_RUNNING" to mark yourself -+ * runnable without the overhead of this. 
-+ * -+ * Return: %true if @p was woken up, %false if it was already running. -+ * or @state didn't match @p's state. -+ */ -+static int try_to_wake_up(struct task_struct *p, unsigned int state, -+ int wake_flags) -+{ -+ unsigned long flags; -+ struct rq *rq; -+ int cpu, success = 0; -+ -+ /* -+ * If we are going to wake up a thread waiting for CONDITION we -+ * need to ensure that CONDITION=1 done by the caller can not be -+ * reordered with p->state check below. This pairs with mb() in -+ * set_current_state() the waiting thread does. -+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ smp_mb__after_spinlock(); -+ if (!(p->state & state)) -+ goto out; -+ -+ trace_sched_waking(p); -+ -+ /* We're going to change ->state: */ -+ success = 1; -+ cpu = task_cpu(p); -+ -+ /* -+ * Ensure we load p->on_rq _after_ p->state, otherwise it would -+ * be possible to, falsely, observe p->on_rq == 0 and get stuck -+ * in smp_cond_load_acquire() below. -+ * -+ * sched_ttwu_pending() try_to_wake_up() -+ * STORE p->on_rq = 1 LOAD p->state -+ * UNLOCK rq->lock -+ * -+ * __schedule() (switch to task 'p') -+ * LOCK rq->lock smp_rmb(); -+ * smp_mb__after_spinlock(); -+ * UNLOCK rq->lock -+ * -+ * [task p] -+ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq -+ * -+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in -+ * __schedule(). See the comment for smp_mb__after_spinlock(). -+ */ -+ smp_rmb(); -+ if (p->on_rq && ttwu_remote(p, wake_flags)) -+ goto stat; -+ -+#ifdef CONFIG_SMP -+ /* -+ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be -+ * possible to, falsely, observe p->on_cpu == 0. -+ * -+ * One must be running (->on_cpu == 1) in order to remove oneself -+ * from the runqueue. 
-+ * -+ * __schedule() (switch to task 'p') try_to_wake_up() -+ * STORE p->on_cpu = 1 LOAD p->on_rq -+ * UNLOCK rq->lock -+ * -+ * __schedule() (put 'p' to sleep) -+ * LOCK rq->lock smp_rmb(); -+ * smp_mb__after_spinlock(); -+ * STORE p->on_rq = 0 LOAD p->on_cpu -+ * -+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in -+ * __schedule(). See the comment for smp_mb__after_spinlock(). -+ */ -+ smp_rmb(); -+ -+ /* -+ * If the owning (remote) CPU is still in the middle of schedule() with -+ * this task as prev, wait until its done referencing the task. -+ * -+ * Pairs with the smp_store_release() in finish_task(). -+ * -+ * This ensures that tasks getting woken will be fully ordered against -+ * their previous state and preserve Program Order. -+ */ -+ smp_cond_load_acquire(&p->on_cpu, !VAL); -+ -+ p->sched_contributes_to_load = !!task_contributes_to_load(p); -+ p->state = TASK_WAKING; -+ -+ if (p->in_iowait) { -+ delayacct_blkio_end(p); -+ atomic_dec(&task_rq(p)->nr_iowait); -+ } -+ -+ if (SCHED_ISO == p->policy && ISO_PRIO != p->prio) { -+ p->prio = ISO_PRIO; -+ p->deadline = 0UL; -+ update_task_priodl(p); -+ } -+ -+ cpu = select_task_rq(p); -+ -+ if (cpu != task_cpu(p)) { -+ wake_flags |= WF_MIGRATED; -+ psi_ttwu_dequeue(p); -+ set_task_cpu(p, cpu); -+ } -+#else /* CONFIG_SMP */ -+ if (p->in_iowait) { -+ delayacct_blkio_end(p); -+ atomic_dec(&task_rq(p)->nr_iowait); -+ } -+#endif -+ -+ rq = cpu_rq(cpu); -+ raw_spin_lock(&rq->lock); -+ -+ update_rq_clock(rq); -+ ttwu_do_activate(rq, p, wake_flags); -+ check_preempt_curr(rq, p); -+ -+ raw_spin_unlock(&rq->lock); -+ -+stat: -+ ttwu_stat(p, cpu, wake_flags); -+out: -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+ return success; -+} -+ -+/** -+ * wake_up_process - Wake up a specific process -+ * @p: The process to be woken up. -+ * -+ * Attempt to wake up the nominated process and move it to the set of runnable -+ * processes. 
-+ * -+ * Return: 1 if the process was woken up, 0 if it was already running. -+ * -+ * This function executes a full memory barrier before accessing the task state. -+ */ -+int wake_up_process(struct task_struct *p) -+{ -+ return try_to_wake_up(p, TASK_NORMAL, 0); -+} -+EXPORT_SYMBOL(wake_up_process); -+ -+int wake_up_state(struct task_struct *p, unsigned int state) -+{ -+ return try_to_wake_up(p, state, 0); -+} -+ -+/* -+ * Perform scheduler related setup for a newly forked process p. -+ * p is forked by current. -+ */ -+int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) -+{ -+ unsigned long flags; -+ int cpu = get_cpu(); -+ struct rq *rq = this_rq(); -+ -+#ifdef CONFIG_PREEMPT_NOTIFIERS -+ INIT_HLIST_HEAD(&p->preempt_notifiers); -+#endif -+ /* Should be reset in fork.c but done here for ease of PDS patching */ -+ p->on_cpu = -+ p->on_rq = -+ p->utime = -+ p->stime = -+ p->sched_time = 0; -+ -+ p->sl_level = pds_skiplist_random_level(p); -+ INIT_SKIPLIST_NODE(&p->sl_node); -+ -+#ifdef CONFIG_COMPACTION -+ p->capture_control = NULL; -+#endif -+ -+ /* -+ * We mark the process as NEW here. This guarantees that -+ * nobody will actually run it, and a signal or other external -+ * event cannot wake it up and insert it on the runqueue either. -+ */ -+ p->state = TASK_NEW; -+ -+ /* -+ * Make sure we do not leak PI boosting priority to the child. -+ */ -+ p->prio = current->normal_prio; -+ -+ /* -+ * Revert to default priority/policy on fork if requested. -+ */ -+ if (unlikely(p->sched_reset_on_fork)) { -+ if (task_has_rt_policy(p)) { -+ p->policy = SCHED_NORMAL; -+ p->static_prio = NICE_TO_PRIO(0); -+ p->rt_priority = 0; -+ } else if (PRIO_TO_NICE(p->static_prio) < 0) -+ p->static_prio = NICE_TO_PRIO(0); -+ -+ p->prio = p->normal_prio = normal_prio(p); -+ -+ /* -+ * We don't need the reset flag anymore after the fork. 
It has -+ * fulfilled its duty: -+ */ -+ p->sched_reset_on_fork = 0; -+ } -+ -+ /* -+ * Share the timeslice between parent and child, thus the -+ * total amount of pending timeslices in the system doesn't change, -+ * resulting in more scheduling fairness. -+ */ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ rq->curr->time_slice /= 2; -+ p->time_slice = rq->curr->time_slice; -+#ifdef CONFIG_SCHED_HRTICK -+ hrtick_start(rq, US_TO_NS(rq->curr->time_slice)); -+#endif -+ -+ if (p->time_slice < RESCHED_US) { -+ update_rq_clock(rq); -+ time_slice_expired(p, rq); -+ resched_curr(rq); -+ } else -+ update_task_priodl(p); -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+ /* -+ * The child is not yet in the pid-hash so no cgroup attach races, -+ * and the cgroup is pinned to this child due to cgroup_fork() -+ * is ran before sched_fork(). -+ * -+ * Silence PROVE_RCU. -+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ /* -+ * We're setting the CPU for the first time, we don't migrate, -+ * so use __set_task_cpu(). 
-+ */ -+ __set_task_cpu(p, cpu); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+#ifdef CONFIG_SCHED_INFO -+ if (unlikely(sched_info_on())) -+ memset(&p->sched_info, 0, sizeof(p->sched_info)); -+#endif -+ init_task_preempt_count(p); -+ -+ put_cpu(); -+ return 0; -+} -+ -+#ifdef CONFIG_SCHEDSTATS -+ -+DEFINE_STATIC_KEY_FALSE(sched_schedstats); -+static bool __initdata __sched_schedstats = false; -+ -+static void set_schedstats(bool enabled) -+{ -+ if (enabled) -+ static_branch_enable(&sched_schedstats); -+ else -+ static_branch_disable(&sched_schedstats); -+} -+ -+void force_schedstat_enabled(void) -+{ -+ if (!schedstat_enabled()) { -+ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); -+ static_branch_enable(&sched_schedstats); -+ } -+} -+ -+static int __init setup_schedstats(char *str) -+{ -+ int ret = 0; -+ if (!str) -+ goto out; -+ -+ /* -+ * This code is called before jump labels have been set up, so we can't -+ * change the static branch directly just yet. Instead set a temporary -+ * variable so init_schedstats() can do it later. 
-+ */ -+ if (!strcmp(str, "enable")) { -+ __sched_schedstats = true; -+ ret = 1; -+ } else if (!strcmp(str, "disable")) { -+ __sched_schedstats = false; -+ ret = 1; -+ } -+out: -+ if (!ret) -+ pr_warn("Unable to parse schedstats=\n"); -+ -+ return ret; -+} -+__setup("schedstats=", setup_schedstats); -+ -+static void __init init_schedstats(void) -+{ -+ set_schedstats(__sched_schedstats); -+} -+ -+#ifdef CONFIG_PROC_SYSCTL -+int sysctl_schedstats(struct ctl_table *table, int write, -+ void __user *buffer, size_t *lenp, loff_t *ppos) -+{ -+ struct ctl_table t; -+ int err; -+ int state = static_branch_likely(&sched_schedstats); -+ -+ if (write && !capable(CAP_SYS_ADMIN)) -+ return -EPERM; -+ -+ t = *table; -+ t.data = &state; -+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); -+ if (err < 0) -+ return err; -+ if (write) -+ set_schedstats(state); -+ return err; -+} -+#endif /* CONFIG_PROC_SYSCTL */ -+#else /* !CONFIG_SCHEDSTATS */ -+static inline void init_schedstats(void) {} -+#endif /* CONFIG_SCHEDSTATS */ -+ -+/* -+ * wake_up_new_task - wake up a newly created task for the first time. -+ * -+ * This function will do some initial scheduler statistics housekeeping -+ * that must be done for every newly created context, then puts the task -+ * on the runqueue and wakes it. -+ */ -+void wake_up_new_task(struct task_struct *p) -+{ -+ unsigned long flags; -+ struct rq *rq; -+ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ -+ p->state = TASK_RUNNING; -+ -+ rq = cpu_rq(select_task_rq(p)); -+#ifdef CONFIG_SMP -+ /* -+ * Fork balancing, do it here and not earlier because: -+ * - cpus_mask can change in the fork path -+ * - any previously selected CPU might disappear through hotplug -+ * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, -+ * as we're not fully set-up yet. 
-+ */ -+ __set_task_cpu(p, cpu_of(rq)); -+#endif -+ -+ raw_spin_lock(&rq->lock); -+ -+ update_rq_clock(rq); -+ activate_task(p, rq); -+ trace_sched_wakeup_new(p); -+ check_preempt_curr(rq, p); -+ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+} -+ -+#ifdef CONFIG_PREEMPT_NOTIFIERS -+ -+static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); -+ -+void preempt_notifier_inc(void) -+{ -+ static_branch_inc(&preempt_notifier_key); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_inc); -+ -+void preempt_notifier_dec(void) -+{ -+ static_branch_dec(&preempt_notifier_key); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_dec); -+ -+/** -+ * preempt_notifier_register - tell me when current is being preempted & rescheduled -+ * @notifier: notifier struct to register -+ */ -+void preempt_notifier_register(struct preempt_notifier *notifier) -+{ -+ if (!static_branch_unlikely(&preempt_notifier_key)) -+ WARN(1, "registering preempt_notifier while notifiers disabled\n"); -+ -+ hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_register); -+ -+/** -+ * preempt_notifier_unregister - no longer interested in preemption notifications -+ * @notifier: notifier struct to unregister -+ * -+ * This is *not* safe to call from within a preemption notifier. 
-+ */ -+void preempt_notifier_unregister(struct preempt_notifier *notifier) -+{ -+ hlist_del(¬ifier->link); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_unregister); -+ -+static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+ struct preempt_notifier *notifier; -+ -+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) -+ notifier->ops->sched_in(notifier, raw_smp_processor_id()); -+} -+ -+static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+ if (static_branch_unlikely(&preempt_notifier_key)) -+ __fire_sched_in_preempt_notifiers(curr); -+} -+ -+static void -+__fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+ struct preempt_notifier *notifier; -+ -+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) -+ notifier->ops->sched_out(notifier, next); -+} -+ -+static __always_inline void -+fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+ if (static_branch_unlikely(&preempt_notifier_key)) -+ __fire_sched_out_preempt_notifiers(curr, next); -+} -+ -+#else /* !CONFIG_PREEMPT_NOTIFIERS */ -+ -+static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+} -+ -+static inline void -+fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+} -+ -+#endif /* CONFIG_PREEMPT_NOTIFIERS */ -+ -+static inline void prepare_task(struct task_struct *next) -+{ -+ /* -+ * Claim the task as running, we do this before switching to it -+ * such that any running task will have this set. -+ */ -+ next->on_cpu = 1; -+} -+ -+static inline void finish_task(struct task_struct *prev) -+{ -+#ifdef CONFIG_SMP -+ /* -+ * After ->on_cpu is cleared, the task can be moved to a different CPU. -+ * We must ensure this doesn't happen until the switch is completely -+ * finished. -+ * -+ * In particular, the load of prev->state in finish_task_switch() must -+ * happen before this. 
-+ * -+ * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). -+ */ -+ smp_store_release(&prev->on_cpu, 0); -+#else -+ prev->on_cpu = 0; -+#endif -+} -+ -+static inline void -+prepare_lock_switch(struct rq *rq, struct task_struct *next) -+{ -+ /* -+ * Since the runqueue lock will be released by the next -+ * task (which is an invalid locking op but in the case -+ * of the scheduler it's an obvious special-case), so we -+ * do an early lockdep release here: -+ */ -+ spin_release(&rq->lock.dep_map, _THIS_IP_); -+#ifdef CONFIG_DEBUG_SPINLOCK -+ /* this is a valid case when another task releases the spinlock */ -+ rq->lock.owner = next; -+#endif -+} -+ -+static inline void finish_lock_switch(struct rq *rq) -+{ -+ /* -+ * If we are tracking spinlock dependencies then we have to -+ * fix up the runqueue lock - which gets 'carried over' from -+ * prev into current: -+ */ -+ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); -+ raw_spin_unlock_irq(&rq->lock); -+} -+ -+/** -+ * prepare_task_switch - prepare to switch tasks -+ * @rq: the runqueue preparing to switch -+ * @next: the task we are going to switch to. -+ * -+ * This is called with the rq lock held and interrupts off. It must -+ * be paired with a subsequent finish_task_switch after the context -+ * switch. -+ * -+ * prepare_task_switch sets up locking and calls architecture specific -+ * hooks. -+ */ -+static inline void -+prepare_task_switch(struct rq *rq, struct task_struct *prev, -+ struct task_struct *next) -+{ -+ kcov_prepare_switch(prev); -+ sched_info_switch(rq, prev, next); -+ perf_event_task_sched_out(prev, next); -+ rseq_preempt(prev); -+ fire_sched_out_preempt_notifiers(prev, next); -+ prepare_task(next); -+ prepare_arch_switch(next); -+} -+ -+/** -+ * finish_task_switch - clean up after a task-switch -+ * @rq: runqueue associated with task-switch -+ * @prev: the thread we just switched away from. 
-+ * -+ * finish_task_switch must be called after the context switch, paired -+ * with a prepare_task_switch call before the context switch. -+ * finish_task_switch will reconcile locking set up by prepare_task_switch, -+ * and do any other architecture-specific cleanup actions. -+ * -+ * Note that we may have delayed dropping an mm in context_switch(). If -+ * so, we finish that here outside of the runqueue lock. (Doing it -+ * with the lock held can cause deadlocks; see schedule() for -+ * details.) -+ * -+ * The context switch have flipped the stack from under us and restored the -+ * local variables which were saved when this task called schedule() in the -+ * past. prev == current is still correct but we need to recalculate this_rq -+ * because prev may have moved to another CPU. -+ */ -+static struct rq *finish_task_switch(struct task_struct *prev) -+ __releases(rq->lock) -+{ -+ struct rq *rq = this_rq(); -+ struct mm_struct *mm = rq->prev_mm; -+ long prev_state; -+ -+ /* -+ * The previous task will have left us with a preempt_count of 2 -+ * because it left us after: -+ * -+ * schedule() -+ * preempt_disable(); // 1 -+ * __schedule() -+ * raw_spin_lock_irq(&rq->lock) // 2 -+ * -+ * Also, see FORK_PREEMPT_COUNT. -+ */ -+ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, -+ "corrupted preempt_count: %s/%d/0x%x\n", -+ current->comm, current->pid, preempt_count())) -+ preempt_count_set(FORK_PREEMPT_COUNT); -+ -+ rq->prev_mm = NULL; -+ -+ /* -+ * A task struct has one reference for the use as "current". -+ * If a task dies, then it sets TASK_DEAD in tsk->state and calls -+ * schedule one last time. The schedule call will never return, and -+ * the scheduled task must drop that reference. -+ * -+ * We must observe prev->state before clearing prev->on_cpu (in -+ * finish_task), otherwise a concurrent wakeup can get prev -+ * running on another CPU and we could rave with its RUNNING -> DEAD -+ * transition, resulting in a double drop. 
-+ */ -+ prev_state = prev->state; -+ vtime_task_switch(prev); -+ perf_event_task_sched_in(prev, current); -+ finish_task(prev); -+ finish_lock_switch(rq); -+ finish_arch_post_lock_switch(); -+ kcov_finish_switch(current); -+ -+ fire_sched_in_preempt_notifiers(current); -+ /* -+ * When switching through a kernel thread, the loop in -+ * membarrier_{private,global}_expedited() may have observed that -+ * kernel thread and not issued an IPI. It is therefore possible to -+ * schedule between user->kernel->user threads without passing though -+ * switch_mm(). Membarrier requires a barrier after storing to -+ * rq->curr, before returning to userspace, so provide them here: -+ * -+ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly -+ * provided by mmdrop(), -+ * - a sync_core for SYNC_CORE. -+ */ -+ if (mm) { -+ membarrier_mm_sync_core_before_usermode(mm); -+ mmdrop(mm); -+ } -+ if (unlikely(prev_state == TASK_DEAD)) { -+ /* -+ * Remove function-return probe instances associated with this -+ * task and put them back on the free list. -+ */ -+ kprobe_flush_task(prev); -+ -+ /* Task is done with its stack. */ -+ put_task_stack(prev); -+ -+ put_task_struct_rcu_user(prev); -+ } -+ -+ tick_nohz_task_switch(); -+ return rq; -+} -+ -+/** -+ * schedule_tail - first thing a freshly forked thread must call. -+ * @prev: the thread we just switched away from. -+ */ -+asmlinkage __visible void schedule_tail(struct task_struct *prev) -+ __releases(rq->lock) -+{ -+ struct rq *rq; -+ -+ /* -+ * New tasks start with FORK_PREEMPT_COUNT, see there and -+ * finish_task_switch() for details. -+ * -+ * finish_task_switch() will drop rq->lock() and lower preempt_count -+ * and the preempt_enable() will end up enabling preemption (on -+ * PREEMPT_COUNT kernels). 
-+ */ -+ -+ rq = finish_task_switch(prev); -+ preempt_enable(); -+ -+ if (current->set_child_tid) -+ put_user(task_pid_vnr(current), current->set_child_tid); -+ -+ calculate_sigpending(); -+} -+ -+/* -+ * context_switch - switch to the new MM and the new thread's register state. -+ */ -+static __always_inline struct rq * -+context_switch(struct rq *rq, struct task_struct *prev, -+ struct task_struct *next) -+{ -+ prepare_task_switch(rq, prev, next); -+ -+ /* -+ * For paravirt, this is coupled with an exit in switch_to to -+ * combine the page table reload and the switch backend into -+ * one hypercall. -+ */ -+ arch_start_context_switch(prev); -+ -+ /* -+ * kernel -> kernel lazy + transfer active -+ * user -> kernel lazy + mmgrab() active -+ * -+ * kernel -> user switch + mmdrop() active -+ * user -> user switch -+ */ -+ if (!next->mm) { // to kernel -+ enter_lazy_tlb(prev->active_mm, next); -+ -+ next->active_mm = prev->active_mm; -+ if (prev->mm) // from user -+ mmgrab(prev->active_mm); -+ else -+ prev->active_mm = NULL; -+ } else { // to user -+ membarrier_switch_mm(rq, prev->active_mm, next->mm); -+ /* -+ * sys_membarrier() requires an smp_mb() between setting -+ * rq->curr / membarrier_switch_mm() and returning to userspace. -+ * -+ * The below provides this either through switch_mm(), or in -+ * case 'prev->active_mm == next->mm' through -+ * finish_task_switch()'s mmdrop(). -+ */ -+ switch_mm_irqs_off(prev->active_mm, next->mm, next); -+ -+ if (!prev->mm) { // from kernel -+ /* will mmdrop() in finish_task_switch(). */ -+ rq->prev_mm = prev->active_mm; -+ prev->active_mm = NULL; -+ } -+ } -+ -+ prepare_lock_switch(rq, next); -+ -+ /* Here we just switch the register state and the stack. 
*/ -+ switch_to(prev, next, prev); -+ barrier(); -+ -+ return finish_task_switch(prev); -+} -+ -+/* -+ * nr_running, nr_uninterruptible and nr_context_switches: -+ * -+ * externally visible scheduler statistics: current number of runnable -+ * threads, total number of context switches performed since bootup. -+ */ -+unsigned long nr_running(void) -+{ -+ unsigned long i, sum = 0; -+ -+ for_each_online_cpu(i) -+ sum += cpu_rq(i)->nr_running; -+ -+ return sum; -+} -+ -+/* -+ * Check if only the current task is running on the CPU. -+ * -+ * Caution: this function does not check that the caller has disabled -+ * preemption, thus the result might have a time-of-check-to-time-of-use -+ * race. The caller is responsible to use it correctly, for example: -+ * -+ * - from a non-preemptible section (of course) -+ * -+ * - from a thread that is bound to a single CPU -+ * -+ * - in a loop with very short iterations (e.g. a polling loop) -+ */ -+bool single_task_running(void) -+{ -+ return raw_rq()->nr_running == 1; -+} -+EXPORT_SYMBOL(single_task_running); -+ -+unsigned long long nr_context_switches(void) -+{ -+ int i; -+ unsigned long long sum = 0; -+ -+ for_each_possible_cpu(i) -+ sum += cpu_rq(i)->nr_switches; -+ -+ return sum; -+} -+ -+/* -+ * Consumers of these two interfaces, like for example the cpuidle menu -+ * governor, are using nonsensical data. Preferring shallow idle state selection -+ * for a CPU that has IO-wait which might not even end up running the task when -+ * it does become runnable. -+ */ -+ -+unsigned long nr_iowait_cpu(int cpu) -+{ -+ return atomic_read(&cpu_rq(cpu)->nr_iowait); -+} -+ -+/* -+ * IO-wait accounting, and how its mostly bollocks (on SMP). -+ * -+ * The idea behind IO-wait account is to account the idle time that we could -+ * have spend running if it were not for IO. That is, if we were to improve the -+ * storage performance, we'd have a proportional reduction in IO-wait time. 
-+ * -+ * This all works nicely on UP, where, when a task blocks on IO, we account -+ * idle time as IO-wait, because if the storage were faster, it could've been -+ * running and we'd not be idle. -+ * -+ * This has been extended to SMP, by doing the same for each CPU. This however -+ * is broken. -+ * -+ * Imagine for instance the case where two tasks block on one CPU, only the one -+ * CPU will have IO-wait accounted, while the other has regular idle. Even -+ * though, if the storage were faster, both could've ran at the same time, -+ * utilising both CPUs. -+ * -+ * This means, that when looking globally, the current IO-wait accounting on -+ * SMP is a lower bound, by reason of under accounting. -+ * -+ * Worse, since the numbers are provided per CPU, they are sometimes -+ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly -+ * associated with any one particular CPU, it can wake to another CPU than it -+ * blocked on. This means the per CPU IO-wait number is meaningless. -+ * -+ * Task CPU affinities can make all that even more 'interesting'. -+ */ -+ -+unsigned long nr_iowait(void) -+{ -+ unsigned long i, sum = 0; -+ -+ for_each_possible_cpu(i) -+ sum += nr_iowait_cpu(i); -+ -+ return sum; -+} -+ -+DEFINE_PER_CPU(struct kernel_stat, kstat); -+DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); -+ -+EXPORT_PER_CPU_SYMBOL(kstat); -+EXPORT_PER_CPU_SYMBOL(kernel_cpustat); -+ -+static inline void pds_update_curr(struct rq *rq, struct task_struct *p) -+{ -+ s64 ns = rq->clock_task - p->last_ran; -+ -+ p->sched_time += ns; -+ account_group_exec_runtime(p, ns); -+ -+ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ -+ p->time_slice -= NS_TO_US(ns); -+ p->last_ran = rq->clock_task; -+} -+ -+/* -+ * Return accounted runtime for the task. -+ * Return separately the current's pending runtime that have not been -+ * accounted yet. 
-+ */ -+unsigned long long task_sched_runtime(struct task_struct *p) -+{ -+ unsigned long flags; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ u64 ns; -+ -+#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) -+ /* -+ * 64-bit doesn't need locks to atomically read a 64-bit value. -+ * So we have a optimization chance when the task's delta_exec is 0. -+ * Reading ->on_cpu is racy, but this is ok. -+ * -+ * If we race with it leaving CPU, we'll take a lock. So we're correct. -+ * If we race with it entering CPU, unaccounted time is 0. This is -+ * indistinguishable from the read occurring a few cycles earlier. -+ * If we see ->on_cpu without ->on_rq, the task is leaving, and has -+ * been accounted, so we're correct here as well. -+ */ -+ if (!p->on_cpu || !task_on_rq_queued(p)) -+ return tsk_seruntime(p); -+#endif -+ -+ rq = task_access_lock_irqsave(p, &lock, &flags); -+ /* -+ * Must be ->curr _and_ ->on_rq. If dequeued, we would -+ * project cycles that may never be accounted to this -+ * thread, breaking clock_gettime(). -+ */ -+ if (p == rq->curr && task_on_rq_queued(p)) { -+ update_rq_clock(rq); -+ pds_update_curr(rq, p); -+ } -+ ns = tsk_seruntime(p); -+ task_access_unlock_irqrestore(p, lock, &flags); -+ -+ return ns; -+} -+ -+/* This manages tasks that have run out of timeslice during a scheduler_tick */ -+static inline void pds_scheduler_task_tick(struct rq *rq) -+{ -+ struct task_struct *p = rq->curr; -+ -+ if (is_idle_task(p)) -+ return; -+ -+ pds_update_curr(rq, p); -+ -+ cpufreq_update_util(rq, 0); -+ -+ /* -+ * Tasks that were scheduled in the first half of a tick are not -+ * allowed to run into the 2nd half of the next tick if they will -+ * run out of time slice in the interim. Otherwise, if they have -+ * less than RESCHED_US μs of time slice left they will be rescheduled. -+ */ -+ if (p->time_slice - rq->dither >= RESCHED_US) -+ return; -+ -+ /** -+ * p->time_slice < RESCHED_US. 
We will modify task_struct under -+ * rq lock as p is rq->curr -+ */ -+ __set_tsk_resched(p); -+} -+ -+#ifdef CONFIG_SMP -+ -+#ifdef CONFIG_SCHED_SMT -+static int active_load_balance_cpu_stop(void *data) -+{ -+ struct rq *rq = this_rq(); -+ struct task_struct *p = data; -+ int cpu; -+ unsigned long flags; -+ -+ local_irq_save(flags); -+ -+ raw_spin_lock(&p->pi_lock); -+ raw_spin_lock(&rq->lock); -+ -+ rq->active_balance = 0; -+ /* -+ * _something_ may have changed the task, double check again -+ */ -+ if (task_on_rq_queued(p) && task_rq(p) == rq && -+ (cpu = cpumask_any_and(&p->cpus_mask, &sched_cpu_sg_idle_mask)) < nr_cpu_ids) -+ rq = __migrate_task(rq, p, cpu); -+ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock(&p->pi_lock); -+ -+ local_irq_restore(flags); -+ -+ return 0; -+} -+ -+/* pds_sg_balance_trigger - trigger slibing group balance for @cpu */ -+static void pds_sg_balance_trigger(const int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ struct task_struct *curr; -+ -+ if (!raw_spin_trylock_irqsave(&rq->lock, flags)) -+ return; -+ curr = rq->curr; -+ if (!is_idle_task(curr) && -+ cpumask_intersects(&curr->cpus_mask, &sched_cpu_sg_idle_mask)) { -+ int active_balance = 0; -+ -+ if (likely(!rq->active_balance)) { -+ rq->active_balance = 1; -+ active_balance = 1; -+ } -+ -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+ if (likely(active_balance)) -+ stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, -+ curr, &rq->active_balance_work); -+ } else -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+} -+ -+/* -+ * pds_sg_balance_check - slibing group balance check for run queue @rq -+ */ -+static inline void pds_sg_balance_check(const struct rq *rq) -+{ -+ cpumask_t chk; -+ int i; -+ -+ /* Only online cpu will do sg balance checking */ -+ if (unlikely(!rq->online)) -+ return; -+ -+ /* Only cpu in slibing idle group will do the checking */ -+ if (!cpumask_test_cpu(cpu_of(rq), &sched_cpu_sg_idle_mask)) -+ return; -+ -+ /* Find potential 
cpus which can migrate the currently running task */ -+ if (!cpumask_andnot(&chk, &sched_rq_pending_masks[SCHED_RQ_EMPTY], -+ &sched_rq_queued_masks[SCHED_RQ_EMPTY])) -+ return; -+ -+ for_each_cpu(i, &chk) { -+ /* skip the cpu which has idle slibing cpu */ -+ if (cpumask_test_cpu(per_cpu(sched_sibling_cpu, i), -+ &sched_rq_queued_masks[SCHED_RQ_EMPTY])) -+ continue; -+ pds_sg_balance_trigger(i); -+ } -+} -+DEFINE_PER_CPU(unsigned long, thermal_pressure); -+ -+void arch_set_thermal_pressure(struct cpumask *cpus, -+ unsigned long th_pressure) -+{ -+ int cpu; -+ -+ for_each_cpu(cpu, cpus) -+ WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure); -+} -+#endif /* CONFIG_SCHED_SMT */ -+#endif /* CONFIG_SMP */ -+ -+/* -+ * This function gets called by the timer code, with HZ frequency. -+ * We call it with interrupts disabled. -+ */ -+void scheduler_tick(void) -+{ -+ int cpu __maybe_unused = smp_processor_id(); -+ struct rq *rq = cpu_rq(cpu); -+ -+ arch_scale_freq_tick(); -+ sched_clock_tick(); -+ -+ raw_spin_lock(&rq->lock); -+ update_rq_clock(rq); -+ -+ pds_scheduler_task_tick(rq); -+ update_sched_rq_queued_masks_normal(rq); -+ calc_global_load_tick(rq); -+ psi_task_tick(rq); -+ -+ rq->last_tick = rq->clock; -+ raw_spin_unlock(&rq->lock); -+ -+ perf_event_task_tick(); -+} -+ -+#ifdef CONFIG_NO_HZ_FULL -+struct tick_work { -+ int cpu; -+ atomic_t state; -+ struct delayed_work work; -+}; -+/* Values for ->state, see diagram below. 
*/ -+#define TICK_SCHED_REMOTE_OFFLINE 0 -+#define TICK_SCHED_REMOTE_OFFLINING 1 -+#define TICK_SCHED_REMOTE_RUNNING 2 -+ -+/* -+ * State diagram for ->state: -+ * -+ * -+ * TICK_SCHED_REMOTE_OFFLINE -+ * | ^ -+ * | | -+ * | | sched_tick_remote() -+ * | | -+ * | | -+ * +--TICK_SCHED_REMOTE_OFFLINING -+ * | ^ -+ * | | -+ * sched_tick_start() | | sched_tick_stop() -+ * | | -+ * V | -+ * TICK_SCHED_REMOTE_RUNNING -+ * -+ * -+ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() -+ * and sched_tick_start() are happy to leave the state in RUNNING. -+ */ -+ -+static struct tick_work __percpu *tick_work_cpu; -+ -+static void sched_tick_remote(struct work_struct *work) -+{ -+ struct delayed_work *dwork = to_delayed_work(work); -+ struct tick_work *twork = container_of(dwork, struct tick_work, work); -+ int cpu = twork->cpu; -+ struct rq *rq = cpu_rq(cpu); -+ struct task_struct *curr; -+ unsigned long flags; -+ u64 delta; -+ int os; -+ -+ /* -+ * Handle the tick only if it appears the remote CPU is running in full -+ * dynticks mode. The check is racy by nature, but missing a tick or -+ * having one too much is no big deal because the scheduler tick updates -+ * statistics and checks timeslices in a time-independent way, regardless -+ * of when exactly it is running. -+ */ -+ if (!tick_nohz_tick_stopped_cpu(cpu)) -+ goto out_requeue; -+ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ curr = rq->curr; -+ if (cpu_is_offline(cpu)) -+ goto out_unlock; -+ -+ update_rq_clock(rq); -+ if (!is_idle_task(curr)) { -+ /* -+ * Make sure the next tick runs within a reasonable -+ * amount of time. -+ */ -+ delta = rq_clock_task(rq) - curr->last_ran; -+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); -+ } -+ pds_scheduler_task_tick(rq); -+ update_sched_rq_queued_masks_normal(rq); -+ calc_load_nohz_remote(rq); -+ -+out_unlock: -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+out_requeue: -+ /* -+ * Run the remote tick once per second (1Hz). 
This arbitrary -+ * frequency is large enough to avoid overload but short enough -+ * to keep scheduler internal stats reasonably up to date. But -+ * first update state to reflect hotplug activity if required. -+ */ -+ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); -+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); -+ if (os == TICK_SCHED_REMOTE_RUNNING) -+ queue_delayed_work(system_unbound_wq, dwork, HZ); -+} -+ -+static void sched_tick_start(int cpu) -+{ -+ int os; -+ struct tick_work *twork; -+ -+ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) -+ return; -+ -+ WARN_ON_ONCE(!tick_work_cpu); -+ -+ twork = per_cpu_ptr(tick_work_cpu, cpu); -+ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); -+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); -+ if (os == TICK_SCHED_REMOTE_OFFLINE) { -+ twork->cpu = cpu; -+ INIT_DELAYED_WORK(&twork->work, sched_tick_remote); -+ queue_delayed_work(system_unbound_wq, &twork->work, HZ); -+ } -+} -+ -+#ifdef CONFIG_HOTPLUG_CPU -+static void sched_tick_stop(int cpu) -+{ -+ struct tick_work *twork; -+ -+ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) -+ return; -+ -+ WARN_ON_ONCE(!tick_work_cpu); -+ -+ twork = per_cpu_ptr(tick_work_cpu, cpu); -+ cancel_delayed_work_sync(&twork->work); -+} -+#endif /* CONFIG_HOTPLUG_CPU */ -+ -+int __init sched_tick_offload_init(void) -+{ -+ tick_work_cpu = alloc_percpu(struct tick_work); -+ BUG_ON(!tick_work_cpu); -+ return 0; -+} -+ -+#else /* !CONFIG_NO_HZ_FULL */ -+static inline void sched_tick_start(int cpu) { } -+static inline void sched_tick_stop(int cpu) { } -+#endif -+ -+#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ -+ defined(CONFIG_PREEMPT_TRACER)) -+/* -+ * If the value passed in is equal to the current preempt count -+ * then we just disabled preemption. Start timing the latency. 
-+ */ -+static inline void preempt_latency_start(int val) -+{ -+ if (preempt_count() == val) { -+ unsigned long ip = get_lock_parent_ip(); -+#ifdef CONFIG_DEBUG_PREEMPT -+ current->preempt_disable_ip = ip; -+#endif -+ trace_preempt_off(CALLER_ADDR0, ip); -+ } -+} -+ -+void preempt_count_add(int val) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Underflow? -+ */ -+ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) -+ return; -+#endif -+ __preempt_count_add(val); -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Spinlock count overflowing soon? -+ */ -+ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= -+ PREEMPT_MASK - 10); -+#endif -+ preempt_latency_start(val); -+} -+EXPORT_SYMBOL(preempt_count_add); -+NOKPROBE_SYMBOL(preempt_count_add); -+ -+/* -+ * If the value passed in equals to the current preempt count -+ * then we just enabled preemption. Stop timing the latency. -+ */ -+static inline void preempt_latency_stop(int val) -+{ -+ if (preempt_count() == val) -+ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); -+} -+ -+void preempt_count_sub(int val) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Underflow? -+ */ -+ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) -+ return; -+ /* -+ * Is the spinlock portion underflowing? -+ */ -+ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && -+ !(preempt_count() & PREEMPT_MASK))) -+ return; -+#endif -+ -+ preempt_latency_stop(val); -+ __preempt_count_sub(val); -+} -+EXPORT_SYMBOL(preempt_count_sub); -+NOKPROBE_SYMBOL(preempt_count_sub); -+ -+#else -+static inline void preempt_latency_start(int val) { } -+static inline void preempt_latency_stop(int val) { } -+#endif -+ -+/* -+ * Timeslices below RESCHED_US are considered as good as expired as there's no -+ * point rescheduling when there's so little time left. 
SCHED_BATCH tasks -+ * have been flagged be not latency sensitive and likely to be fully CPU -+ * bound so every time they're rescheduled they have their time_slice -+ * refilled, but get a new later deadline to have little effect on -+ * SCHED_NORMAL tasks. -+ -+ */ -+static inline void check_deadline(struct task_struct *p, struct rq *rq) -+{ -+ if (rq->idle == p) -+ return; -+ -+ pds_update_curr(rq, p); -+ -+ if (p->time_slice < RESCHED_US) { -+ time_slice_expired(p, rq); -+ if (SCHED_ISO == p->policy && ISO_PRIO == p->prio) { -+ p->prio = NORMAL_PRIO; -+ p->deadline = rq->clock + task_deadline_diff(p); -+ update_task_priodl(p); -+ } -+ if (SCHED_FIFO != p->policy && task_on_rq_queued(p)) -+ requeue_task(p, rq); -+ } -+} -+ -+#ifdef CONFIG_SMP -+ -+#define SCHED_RQ_NR_MIGRATION (32UL) -+/* -+ * Migrate pending tasks in @rq to @dest_cpu -+ * Will try to migrate mininal of half of @rq nr_running tasks and -+ * SCHED_RQ_NR_MIGRATION to @dest_cpu -+ */ -+static inline int -+migrate_pending_tasks(struct rq *rq, struct rq *dest_rq, int filter_prio) -+{ -+ struct task_struct *p; -+ int dest_cpu = cpu_of(dest_rq); -+ int nr_migrated = 0; -+ int nr_tries = min((rq->nr_running + 1) / 2, SCHED_RQ_NR_MIGRATION); -+ struct skiplist_node *node = rq->sl_header.next[0]; -+ -+ while (nr_tries && node != &rq->sl_header) { -+ p = skiplist_entry(node, struct task_struct, sl_node); -+ node = node->next[0]; -+ -+ if (task_running(p)) -+ continue; -+ if (p->prio >= filter_prio) -+ break; -+ if (cpumask_test_cpu(dest_cpu, &p->cpus_mask)) { -+ dequeue_task(p, rq, 0); -+ set_task_cpu(p, dest_cpu); -+ enqueue_task(p, dest_rq, 0); -+ nr_migrated++; -+ } -+ nr_tries--; -+ /* make a jump */ -+ if (node == &rq->sl_header) -+ break; -+ node = node->next[0]; -+ } -+ -+ return nr_migrated; -+} -+ -+static inline int -+take_queued_task_cpumask(struct rq *rq, cpumask_t *chk_mask, int filter_prio) -+{ -+ int src_cpu; -+ -+ for_each_cpu(src_cpu, chk_mask) { -+ int nr_migrated; -+ struct rq *src_rq = 
cpu_rq(src_cpu); -+ -+ if (!do_raw_spin_trylock(&src_rq->lock)) { -+ if (PRIO_LIMIT == filter_prio) -+ continue; -+ return 0; -+ } -+ spin_acquire(&src_rq->lock.dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); -+ -+ update_rq_clock(src_rq); -+ if ((nr_migrated = migrate_pending_tasks(src_rq, rq, filter_prio))) -+ cpufreq_update_this_cpu(rq, 0); -+ -+ spin_release(&src_rq->lock.dep_map, _RET_IP_); -+ do_raw_spin_unlock(&src_rq->lock); -+ -+ if (nr_migrated || PRIO_LIMIT != filter_prio) -+ return nr_migrated; -+ } -+ return 0; -+} -+ -+static inline int take_other_rq_task(struct rq *rq, int cpu, int filter_prio) -+{ -+ struct cpumask *affinity_mask, *end; -+ struct cpumask chk; -+ -+ if (PRIO_LIMIT == filter_prio) { -+ cpumask_complement(&chk, &sched_rq_pending_masks[SCHED_RQ_EMPTY]); -+#ifdef CONFIG_SMT_NICE -+ { -+ /* also try to take IDLE priority tasks from smt supressed cpu */ -+ struct cpumask t; -+ if (cpumask_and(&t, &sched_smt_supressed_mask, -+ &sched_rq_queued_masks[SCHED_RQ_IDLE])) -+ cpumask_or(&chk, &chk, &t); -+ } -+#endif -+ } else if (NORMAL_PRIO == filter_prio) { -+ cpumask_or(&chk, &sched_rq_pending_masks[SCHED_RQ_RT], -+ &sched_rq_pending_masks[SCHED_RQ_ISO]); -+ } else if (IDLE_PRIO == filter_prio) { -+ cpumask_complement(&chk, &sched_rq_pending_masks[SCHED_RQ_EMPTY]); -+ cpumask_andnot(&chk, &chk, &sched_rq_pending_masks[SCHED_RQ_IDLE]); -+ } else -+ cpumask_copy(&chk, &sched_rq_pending_masks[SCHED_RQ_RT]); -+ -+ if (cpumask_empty(&chk)) -+ return 0; -+ -+ affinity_mask = per_cpu(sched_cpu_llc_start_mask, cpu); -+ end = per_cpu(sched_cpu_affinity_chk_end_masks, cpu); -+ do { -+ struct cpumask tmp; -+ -+ if (cpumask_and(&tmp, &chk, affinity_mask) && -+ take_queued_task_cpumask(rq, &tmp, filter_prio)) -+ return 1; -+ } while (++affinity_mask < end); -+ -+ return 0; -+} -+#endif -+ -+static inline struct task_struct * -+choose_next_task(struct rq *rq, int cpu, struct task_struct *prev) -+{ -+ struct task_struct *next = rq_first_queued_task(rq); -+ 
-+#ifdef CONFIG_SMT_NICE -+ if (cpumask_test_cpu(cpu, &sched_smt_supressed_mask)) { -+ if (next->prio >= IDLE_PRIO) { -+ if (rq->online && -+ take_other_rq_task(rq, cpu, IDLE_PRIO)) -+ return rq_first_queued_task(rq); -+ return rq->idle; -+ } -+ } -+#endif -+ -+#ifdef CONFIG_SMP -+ if (likely(rq->online)) -+ if (take_other_rq_task(rq, cpu, next->prio)) { -+ resched_curr(rq); -+ return rq_first_queued_task(rq); -+ } -+#endif -+ return next; -+} -+ -+static inline unsigned long get_preempt_disable_ip(struct task_struct *p) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ return p->preempt_disable_ip; -+#else -+ return 0; -+#endif -+} -+ -+/* -+ * Print scheduling while atomic bug: -+ */ -+static noinline void __schedule_bug(struct task_struct *prev) -+{ -+ /* Save this before calling printk(), since that will clobber it */ -+ unsigned long preempt_disable_ip = get_preempt_disable_ip(current); -+ -+ if (oops_in_progress) -+ return; -+ -+ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", -+ prev->comm, prev->pid, preempt_count()); -+ -+ debug_show_held_locks(prev); -+ print_modules(); -+ if (irqs_disabled()) -+ print_irqtrace_events(prev); -+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) -+ && in_atomic_preempt_off()) { -+ pr_err("Preemption disabled at:"); -+ print_ip_sym(preempt_disable_ip); -+ pr_cont("\n"); -+ } -+ if (panic_on_warn) -+ panic("scheduling while atomic\n"); -+ -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+ -+/* -+ * Various schedule()-time debugging checks and statistics: -+ */ -+static inline void schedule_debug(struct task_struct *prev, bool preempt) -+{ -+#ifdef CONFIG_SCHED_STACK_END_CHECK -+ if (task_stack_end_corrupted(prev)) -+ panic("corrupted stack end detected inside scheduler\n"); -+#endif -+ -+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP -+ if (!preempt && prev->state && prev->non_block_count) { -+ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", -+ prev->comm, prev->pid, prev->non_block_count); -+ dump_stack(); 
-+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+ } -+#endif -+ -+ if (unlikely(in_atomic_preempt_off())) { -+ __schedule_bug(prev); -+ preempt_count_set(PREEMPT_DISABLED); -+ } -+ rcu_sleep_check(); -+ -+ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); -+ -+ schedstat_inc(this_rq()->sched_count); -+} -+ -+static inline void set_rq_task(struct rq *rq, struct task_struct *p) -+{ -+ p->last_ran = rq->clock_task; -+ -+#ifdef CONFIG_HIGH_RES_TIMERS -+ if (p != rq->idle) -+ hrtick_start(rq, US_TO_NS(p->time_slice)); -+#endif -+ /* update rq->dither */ -+ rq->dither = rq_dither(rq); -+} -+ -+/* -+ * schedule() is the main scheduler function. -+ * -+ * The main means of driving the scheduler and thus entering this function are: -+ * -+ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. -+ * -+ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return -+ * paths. For example, see arch/x86/entry_64.S. -+ * -+ * To drive preemption between tasks, the scheduler sets the flag in timer -+ * interrupt handler scheduler_tick(). -+ * -+ * 3. Wakeups don't really cause entry into schedule(). They add a -+ * task to the run-queue and that's it. -+ * -+ * Now, if the new task added to the run-queue preempts the current -+ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets -+ * called on the nearest possible occasion: -+ * -+ * - If the kernel is preemptible (CONFIG_PREEMPTION=y): -+ * -+ * - in syscall or exception context, at the next outmost -+ * preempt_enable(). (this might be as soon as the wake_up()'s -+ * spin_unlock()!) -+ * -+ * - in IRQ context, return from interrupt-handler to -+ * preemptible context -+ * -+ * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) -+ * then at the next: -+ * -+ * - cond_resched() call -+ * - explicit schedule() call -+ * - return from syscall or exception to user-space -+ * - return from interrupt-handler to user-space -+ * -+ * WARNING: must be called with preemption disabled! 
-+ */ -+static void __sched notrace __schedule(bool preempt) -+{ -+ struct task_struct *prev, *next; -+ unsigned long *switch_count; -+ struct rq *rq; -+ int cpu; -+ -+ cpu = smp_processor_id(); -+ rq = cpu_rq(cpu); -+ prev = rq->curr; -+ -+ schedule_debug(prev, preempt); -+ -+ /* by passing sched_feat(HRTICK) checking which PDS doesn't support */ -+ hrtick_clear(rq); -+ -+ local_irq_disable(); -+ rcu_note_context_switch(preempt); -+ -+ /* -+ * Make sure that signal_pending_state()->signal_pending() below -+ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) -+ * done by the caller to avoid the race with signal_wake_up(). -+ * -+ * The membarrier system call requires a full memory barrier -+ * after coming from user-space, before storing to rq->curr. -+ */ -+ raw_spin_lock(&rq->lock); -+ smp_mb__after_spinlock(); -+ -+ update_rq_clock(rq); -+ -+ switch_count = &prev->nivcsw; -+ if (!preempt && prev->state) { -+ if (signal_pending_state(prev->state, prev)) { -+ prev->state = TASK_RUNNING; -+ } else { -+ deactivate_task(prev, rq); -+ -+ if (prev->in_iowait) { -+ atomic_inc(&rq->nr_iowait); -+ delayacct_blkio_start(); -+ } -+ } -+ switch_count = &prev->nvcsw; -+ } -+ -+ clear_tsk_need_resched(prev); -+ clear_preempt_need_resched(); -+ -+ check_deadline(prev, rq); -+ -+ next = choose_next_task(rq, cpu, prev); -+ -+ set_rq_task(rq, next); -+ -+ if (prev != next) { -+ if (next->prio == PRIO_LIMIT) -+ schedstat_inc(rq->sched_goidle); -+ -+ /* -+ * RCU users of rcu_dereference(rq->curr) may not see -+ * changes to task_struct made by pick_next_task(). -+ */ -+ RCU_INIT_POINTER(rq->curr, next); -+ /* -+ * The membarrier system call requires each architecture -+ * to have a full memory barrier after updating -+ * rq->curr, before returning to user-space. -+ * -+ * Here are the schemes providing that barrier on the -+ * various architectures: -+ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. 
-+ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. -+ * - finish_lock_switch() for weakly-ordered -+ * architectures where spin_unlock is a full barrier, -+ * - switch_to() for arm64 (weakly-ordered, spin_unlock -+ * is a RELEASE barrier), -+ */ -+ ++*switch_count; -+ rq->nr_switches++; -+ -+ psi_sched_switch(prev, next, !task_on_rq_queued(prev)); -+ -+ trace_sched_switch(preempt, prev, next); -+ -+ /* Also unlocks the rq: */ -+ rq = context_switch(rq, prev, next); -+#ifdef CONFIG_SCHED_SMT -+ pds_sg_balance_check(rq); -+#endif -+ } else -+ raw_spin_unlock_irq(&rq->lock); -+} -+ -+void __noreturn do_task_dead(void) -+{ -+ /* Causes final put_task_struct in finish_task_switch(): */ -+ set_special_state(TASK_DEAD); -+ -+ /* Tell freezer to ignore us: */ -+ current->flags |= PF_NOFREEZE; -+ __schedule(false); -+ -+ BUG(); -+ -+ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ -+ for (;;) -+ cpu_relax(); -+} -+ -+static inline void sched_submit_work(struct task_struct *tsk) -+{ -+ if (!tsk->state || tsk_is_pi_blocked(tsk) || -+ signal_pending_state(tsk->state, tsk)) -+ return; -+ -+ /* -+ * If a worker went to sleep, notify and ask workqueue whether -+ * it wants to wake up a task to maintain concurrency. -+ * As this function is called inside the schedule() context, -+ * we disable preemption to avoid it calling schedule() again -+ * in the possible wakeup of a kworker. -+ */ -+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { -+ preempt_disable(); -+ if (tsk->flags & PF_WQ_WORKER) -+ wq_worker_sleeping(tsk); -+ else -+ io_wq_worker_sleeping(tsk); -+ preempt_enable_no_resched(); -+ } -+ -+ /* -+ * If we are going to sleep and we have plugged IO queued, -+ * make sure to submit it to avoid deadlocks. 
-+ */ -+ if (blk_needs_flush_plug(tsk)) -+ blk_schedule_flush_plug(tsk); -+} -+ -+static void sched_update_worker(struct task_struct *tsk) -+{ -+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { -+ if (tsk->flags & PF_WQ_WORKER) -+ wq_worker_running(tsk); -+ else -+ io_wq_worker_running(tsk); -+ } -+} -+ -+asmlinkage __visible void __sched schedule(void) -+{ -+ struct task_struct *tsk = current; -+ -+ sched_submit_work(tsk); -+ do { -+ preempt_disable(); -+ __schedule(false); -+ sched_preempt_enable_no_resched(); -+ } while (need_resched()); -+ sched_update_worker(tsk); -+} -+EXPORT_SYMBOL(schedule); -+ -+/* -+ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted -+ * state (have scheduled out non-voluntarily) by making sure that all -+ * tasks have either left the run queue or have gone into user space. -+ * As idle tasks do not do either, they must not ever be preempted -+ * (schedule out non-voluntarily). -+ * -+ * schedule_idle() is similar to schedule_preempt_disable() except that it -+ * never enables preemption because it does not call sched_submit_work(). -+ */ -+void __sched schedule_idle(void) -+{ -+ /* -+ * As this skips calling sched_submit_work(), which the idle task does -+ * regardless because that function is a nop when the task is in a -+ * TASK_RUNNING state, make sure this isn't used someplace that the -+ * current task can be in any other state. Note, idle is always in the -+ * TASK_RUNNING state. -+ */ -+ WARN_ON_ONCE(current->state); -+ do { -+ __schedule(false); -+ } while (need_resched()); -+} -+ -+#ifdef CONFIG_CONTEXT_TRACKING -+asmlinkage __visible void __sched schedule_user(void) -+{ -+ /* -+ * If we come here after a random call to set_need_resched(), -+ * or we have been woken up remotely but the IPI has not yet arrived, -+ * we haven't yet exited the RCU idle mode. Do it here manually until -+ * we find a better solution. -+ * -+ * NB: There are buggy callers of this function. 
Ideally we -+ * should warn if prev_state != CONTEXT_USER, but that will trigger -+ * too frequently to make sense yet. -+ */ -+ enum ctx_state prev_state = exception_enter(); -+ schedule(); -+ exception_exit(prev_state); -+} -+#endif -+ -+/** -+ * schedule_preempt_disabled - called with preemption disabled -+ * -+ * Returns with preemption disabled. Note: preempt_count must be 1 -+ */ -+void __sched schedule_preempt_disabled(void) -+{ -+ sched_preempt_enable_no_resched(); -+ schedule(); -+ preempt_disable(); -+} -+ -+static void __sched notrace preempt_schedule_common(void) -+{ -+ do { -+ /* -+ * Because the function tracer can trace preempt_count_sub() -+ * and it also uses preempt_enable/disable_notrace(), if -+ * NEED_RESCHED is set, the preempt_enable_notrace() called -+ * by the function tracer will call this function again and -+ * cause infinite recursion. -+ * -+ * Preemption must be disabled here before the function -+ * tracer can trace. Break up preempt_disable() into two -+ * calls. One to disable preemption without fear of being -+ * traced. The other to still record the preemption latency, -+ * which can also be traced by the function tracer. -+ */ -+ preempt_disable_notrace(); -+ preempt_latency_start(1); -+ __schedule(true); -+ preempt_latency_stop(1); -+ preempt_enable_no_resched_notrace(); -+ -+ /* -+ * Check again in case we missed a preemption opportunity -+ * between schedule and now. -+ */ -+ } while (need_resched()); -+} -+ -+#ifdef CONFIG_PREEMPTION -+/* -+ * This is the entry point to schedule() from in-kernel preemption -+ * off of preempt_enable. -+ */ -+asmlinkage __visible void __sched notrace preempt_schedule(void) -+{ -+ /* -+ * If there is a non-zero preempt_count or interrupts are disabled, -+ * we do not want to preempt the current task. Just return.. 
-+ */ -+ if (likely(!preemptible())) -+ return; -+ -+ preempt_schedule_common(); -+} -+NOKPROBE_SYMBOL(preempt_schedule); -+EXPORT_SYMBOL(preempt_schedule); -+ -+/** -+ * preempt_schedule_notrace - preempt_schedule called by tracing -+ * -+ * The tracing infrastructure uses preempt_enable_notrace to prevent -+ * recursion and tracing preempt enabling caused by the tracing -+ * infrastructure itself. But as tracing can happen in areas coming -+ * from userspace or just about to enter userspace, a preempt enable -+ * can occur before user_exit() is called. This will cause the scheduler -+ * to be called when the system is still in usermode. -+ * -+ * To prevent this, the preempt_enable_notrace will use this function -+ * instead of preempt_schedule() to exit user context if needed before -+ * calling the scheduler. -+ */ -+asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) -+{ -+ enum ctx_state prev_ctx; -+ -+ if (likely(!preemptible())) -+ return; -+ -+ do { -+ /* -+ * Because the function tracer can trace preempt_count_sub() -+ * and it also uses preempt_enable/disable_notrace(), if -+ * NEED_RESCHED is set, the preempt_enable_notrace() called -+ * by the function tracer will call this function again and -+ * cause infinite recursion. -+ * -+ * Preemption must be disabled here before the function -+ * tracer can trace. Break up preempt_disable() into two -+ * calls. One to disable preemption without fear of being -+ * traced. The other to still record the preemption latency, -+ * which can also be traced by the function tracer. -+ */ -+ preempt_disable_notrace(); -+ preempt_latency_start(1); -+ /* -+ * Needs preempt disabled in case user_exit() is traced -+ * and the tracer calls preempt_enable_notrace() causing -+ * an infinite recursion. 
-+ */ -+ prev_ctx = exception_enter(); -+ __schedule(true); -+ exception_exit(prev_ctx); -+ -+ preempt_latency_stop(1); -+ preempt_enable_no_resched_notrace(); -+ } while (need_resched()); -+} -+EXPORT_SYMBOL_GPL(preempt_schedule_notrace); -+ -+#endif /* CONFIG_PREEMPTION */ -+ -+/* -+ * This is the entry point to schedule() from kernel preemption -+ * off of irq context. -+ * Note, that this is called and return with irqs disabled. This will -+ * protect us against recursive calling from irq. -+ */ -+asmlinkage __visible void __sched preempt_schedule_irq(void) -+{ -+ enum ctx_state prev_state; -+ -+ /* Catch callers which need to be fixed */ -+ BUG_ON(preempt_count() || !irqs_disabled()); -+ -+ prev_state = exception_enter(); -+ -+ do { -+ preempt_disable(); -+ local_irq_enable(); -+ __schedule(true); -+ local_irq_disable(); -+ sched_preempt_enable_no_resched(); -+ } while (need_resched()); -+ -+ exception_exit(prev_state); -+} -+ -+int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, -+ void *key) -+{ -+ return try_to_wake_up(curr->private, mode, wake_flags); -+} -+EXPORT_SYMBOL(default_wake_function); -+ -+static inline void -+check_task_changed(struct rq *rq, struct task_struct *p) -+{ -+ /* -+ * Trigger changes when task priority/deadline modified. 
-+ */ -+ if (task_on_rq_queued(p)) { -+ struct task_struct *first; -+ -+ requeue_task(p, rq); -+ -+ /* Resched if first queued task not running and not IDLE */ -+ if ((first = rq_first_queued_task(rq)) != rq->curr && -+ !task_running_idle(first)) -+ resched_curr(rq); -+ } -+} -+ -+#ifdef CONFIG_RT_MUTEXES -+ -+static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) -+{ -+ if (pi_task) -+ prio = min(prio, pi_task->prio); -+ -+ return prio; -+} -+ -+static inline int rt_effective_prio(struct task_struct *p, int prio) -+{ -+ struct task_struct *pi_task = rt_mutex_get_top_task(p); -+ -+ return __rt_effective_prio(pi_task, prio); -+} -+ -+/* -+ * rt_mutex_setprio - set the current priority of a task -+ * @p: task to boost -+ * @pi_task: donor task -+ * -+ * This function changes the 'effective' priority of a task. It does -+ * not touch ->normal_prio like __setscheduler(). -+ * -+ * Used by the rt_mutex code to implement priority inheritance -+ * logic. Call site only calls if the priority of the task changed. -+ */ -+void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) -+{ -+ int prio; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ -+ /* XXX used to be waiter->prio, not waiter->task->prio */ -+ prio = __rt_effective_prio(pi_task, p->normal_prio); -+ -+ /* -+ * If nothing changed; bail early. -+ */ -+ if (p->pi_top_task == pi_task && prio == p->prio) -+ return; -+ -+ rq = __task_access_lock(p, &lock); -+ /* -+ * Set under pi_lock && rq->lock, such that the value can be used under -+ * either lock. -+ * -+ * Note that there is loads of tricky to make this pointer cache work -+ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to -+ * ensure a task is de-boosted (pi_task is set to NULL) before the -+ * task is allowed to run again (and can exit). This ensures the pointer -+ * points to a blocked task -- which guaratees the task is present. 
-+ */ -+ p->pi_top_task = pi_task; -+ -+ /* -+ * For FIFO/RR we only need to set prio, if that matches we're done. -+ */ -+ if (prio == p->prio) -+ goto out_unlock; -+ -+ /* -+ * Idle task boosting is a nono in general. There is one -+ * exception, when PREEMPT_RT and NOHZ is active: -+ * -+ * The idle task calls get_next_timer_interrupt() and holds -+ * the timer wheel base->lock on the CPU and another CPU wants -+ * to access the timer (probably to cancel it). We can safely -+ * ignore the boosting request, as the idle CPU runs this code -+ * with interrupts disabled and will complete the lock -+ * protected section without being interrupted. So there is no -+ * real need to boost. -+ */ -+ if (unlikely(p == rq->idle)) { -+ WARN_ON(p != rq->curr); -+ WARN_ON(p->pi_blocked_on); -+ goto out_unlock; -+ } -+ -+ trace_sched_pi_setprio(p, pi_task); -+ p->prio = prio; -+ update_task_priodl(p); -+ -+ check_task_changed(rq, p); -+ -+out_unlock: -+ __task_access_unlock(p, lock); -+} -+#else -+static inline int rt_effective_prio(struct task_struct *p, int prio) -+{ -+ return prio; -+} -+#endif -+ -+void set_user_nice(struct task_struct *p, long nice) -+{ -+ int new_static; -+ unsigned long flags; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ -+ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) -+ return; -+ new_static = NICE_TO_PRIO(nice); -+ /* -+ * We have to be careful, if called from sys_setpriority(), -+ * the task might be in the middle of scheduling on another CPU. -+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ rq = __task_access_lock(p, &lock); -+ -+ /* rq lock may not held!! 
*/ -+ update_rq_clock(rq); -+ -+ p->static_prio = new_static; -+ /* -+ * The RT priorities are set via sched_setscheduler(), but we still -+ * allow the 'normal' nice value to be set - but as expected -+ * it wont have any effect on scheduling until the task is -+ * not SCHED_NORMAL/SCHED_BATCH: -+ */ -+ if (task_has_rt_policy(p)) -+ goto out_unlock; -+ -+ p->deadline -= task_deadline_diff(p); -+ p->deadline += static_deadline_diff(new_static); -+ p->prio = effective_prio(p); -+ update_task_priodl(p); -+ -+ check_task_changed(rq, p); -+out_unlock: -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+} -+EXPORT_SYMBOL(set_user_nice); -+ -+/* -+ * can_nice - check if a task can reduce its nice value -+ * @p: task -+ * @nice: nice value -+ */ -+int can_nice(const struct task_struct *p, const int nice) -+{ -+ /* Convert nice value [19,-20] to rlimit style value [1,40] */ -+ int nice_rlim = nice_to_rlimit(nice); -+ -+ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || -+ capable(CAP_SYS_NICE)); -+} -+ -+#ifdef __ARCH_WANT_SYS_NICE -+ -+/* -+ * sys_nice - change the priority of the current process. -+ * @increment: priority increment -+ * -+ * sys_setpriority is a more generic, but much slower function that -+ * does similar things. -+ */ -+SYSCALL_DEFINE1(nice, int, increment) -+{ -+ long nice, retval; -+ -+ /* -+ * Setpriority might change our priority at the same moment. -+ * We don't have to worry. Conceptually one call occurs first -+ * and we have a single winner. -+ */ -+ -+ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); -+ nice = task_nice(current) + increment; -+ -+ nice = clamp_val(nice, MIN_NICE, MAX_NICE); -+ if (increment < 0 && !can_nice(current, nice)) -+ return -EPERM; -+ -+ retval = security_task_setnice(current, nice); -+ if (retval) -+ return retval; -+ -+ set_user_nice(current, nice); -+ return 0; -+} -+ -+#endif -+ -+/** -+ * task_prio - return the priority value of a given task. 
-+ * @p: the task in question. -+ * -+ * Return: The priority value as seen by users in /proc. -+ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes -+ * from 0(SCHED_ISO) up to 82 (nice +19 SCHED_IDLE). -+ */ -+int task_prio(const struct task_struct *p) -+{ -+ int level, prio = p->prio - MAX_RT_PRIO; -+ static const int level_to_nice_prio[] = {39, 33, 26, 20, 14, 7, 0, 0}; -+ -+ /* rt tasks */ -+ if (prio <= 0) -+ goto out; -+ -+ preempt_disable(); -+ level = task_deadline_level(p, this_rq()); -+ preempt_enable(); -+ prio += level_to_nice_prio[level]; -+ if (idleprio_task(p)) -+ prio += NICE_WIDTH; -+out: -+ return prio; -+} -+ -+/** -+ * idle_cpu - is a given CPU idle currently? -+ * @cpu: the processor in question. -+ * -+ * Return: 1 if the CPU is currently idle. 0 otherwise. -+ */ -+int idle_cpu(int cpu) -+{ -+ return cpu_curr(cpu) == cpu_rq(cpu)->idle; -+} -+ -+/** -+ * idle_task - return the idle task for a given CPU. -+ * @cpu: the processor in question. -+ * -+ * Return: The idle task for the cpu @cpu. -+ */ -+struct task_struct *idle_task(int cpu) -+{ -+ return cpu_rq(cpu)->idle; -+} -+ -+/** -+ * find_process_by_pid - find a process with a matching PID value. -+ * @pid: the pid in question. -+ * -+ * The task of @pid, if found. %NULL otherwise. -+ */ -+static inline struct task_struct *find_process_by_pid(pid_t pid) -+{ -+ return pid ? find_task_by_vpid(pid) : current; -+} -+ -+#ifdef CONFIG_SMP -+void sched_set_stop_task(int cpu, struct task_struct *stop) -+{ -+ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; -+ struct sched_param start_param = { .sched_priority = 0 }; -+ struct task_struct *old_stop = cpu_rq(cpu)->stop; -+ -+ if (stop) { -+ /* -+ * Make it appear like a SCHED_FIFO task, its something -+ * userspace knows about and won't get confused about. -+ * -+ * Also, it will make PI more or less work without too -+ * much confusion -- but then, stop work should not -+ * rely on PI working anyway. 
-+ */ -+ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); -+ } -+ -+ cpu_rq(cpu)->stop = stop; -+ -+ if (old_stop) { -+ /* -+ * Reset it back to a normal scheduling policy so that -+ * it can die in pieces. -+ */ -+ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); -+ } -+} -+ -+/* -+ * Change a given task's CPU affinity. Migrate the thread to a -+ * proper CPU and schedule it away if the CPU it's executing on -+ * is removed from the allowed bitmask. -+ * -+ * NOTE: the caller must have a valid reference to the task, the -+ * task must not exit() & deallocate itself prematurely. The -+ * call is not atomic; no spinlocks may be held. -+ */ -+static int __set_cpus_allowed_ptr(struct task_struct *p, -+ const struct cpumask *new_mask, bool check) -+{ -+ const struct cpumask *cpu_valid_mask = cpu_active_mask; -+ int dest_cpu; -+ unsigned long flags; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ int ret = 0; -+ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ rq = __task_access_lock(p, &lock); -+ -+ if (p->flags & PF_KTHREAD) { -+ /* -+ * Kernel threads are allowed on online && !active CPUs -+ */ -+ cpu_valid_mask = cpu_online_mask; -+ } -+ -+ /* -+ * Must re-check here, to close a race against __kthread_bind(), -+ * sched_setaffinity() is not guaranteed to observe the flag. -+ */ -+ if (check && (p->flags & PF_NO_SETAFFINITY)) { -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ if (cpumask_equal(&p->cpus_mask, new_mask)) -+ goto out; -+ -+ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); -+ if (dest_cpu >= nr_cpu_ids) { -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ do_set_cpus_allowed(p, new_mask); -+ -+ if (p->flags & PF_KTHREAD) { -+ /* -+ * For kernel threads that do indeed end up on online && -+ * !active we want to ensure they are strict per-CPU threads. 
-+ */ -+ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && -+ !cpumask_intersects(new_mask, cpu_active_mask) && -+ p->nr_cpus_allowed != 1); -+ } -+ -+ /* Can the task run on the task's current CPU? If so, we're done */ -+ if (cpumask_test_cpu(task_cpu(p), new_mask)) -+ goto out; -+ -+ if (task_running(p) || p->state == TASK_WAKING) { -+ struct migration_arg arg = { p, dest_cpu }; -+ -+ /* Need help from migration thread: drop lock and wait. */ -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); -+ return 0; -+ } -+ if (task_on_rq_queued(p)) { -+ /* -+ * OK, since we're going to drop the lock immediately -+ * afterwards anyway. -+ */ -+ update_rq_clock(rq); -+ rq = move_queued_task(rq, p, dest_cpu); -+ lock = &rq->lock; -+ } -+ -+out: -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+ return ret; -+} -+ -+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ return __set_cpus_allowed_ptr(p, new_mask, false); -+} -+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); -+ -+#else -+static inline int -+__set_cpus_allowed_ptr(struct task_struct *p, -+ const struct cpumask *new_mask, bool check) -+{ -+ return set_cpus_allowed_ptr(p, new_mask); -+} -+#endif -+ -+static u64 task_init_deadline(const struct task_struct *p) -+{ -+ return task_rq(p)->clock + task_deadline_diff(p); -+} -+ -+u64 (* task_init_deadline_func_tbl[])(const struct task_struct *p) = { -+ task_init_deadline, /* SCHED_NORMAL */ -+ NULL, /* SCHED_FIFO */ -+ NULL, /* SCHED_RR */ -+ task_init_deadline, /* SCHED_BATCH */ -+ NULL, /* SCHED_ISO */ -+ task_init_deadline /* SCHED_IDLE */ -+}; -+ -+/* -+ * sched_setparam() passes in -1 for its policy, to let the functions -+ * it calls know not to change it. 
-+ */ -+#define SETPARAM_POLICY -1 -+ -+static void __setscheduler_params(struct task_struct *p, -+ const struct sched_attr *attr) -+{ -+ int old_policy = p->policy; -+ int policy = attr->sched_policy; -+ -+ if (policy == SETPARAM_POLICY) -+ policy = p->policy; -+ -+ p->policy = policy; -+ -+ /* -+ * allow normal nice value to be set, but will not have any -+ * effect on scheduling until the task not SCHED_NORMAL/ -+ * SCHED_BATCH -+ */ -+ p->static_prio = NICE_TO_PRIO(attr->sched_nice); -+ -+ /* -+ * __sched_setscheduler() ensures attr->sched_priority == 0 when -+ * !rt_policy. Always setting this ensures that things like -+ * getparam()/getattr() don't report silly values for !rt tasks. -+ */ -+ p->rt_priority = attr->sched_priority; -+ p->normal_prio = normal_prio(p); -+ -+ if (old_policy != policy) -+ p->deadline = (task_init_deadline_func_tbl[p->policy])? -+ task_init_deadline_func_tbl[p->policy](p):0ULL; -+} -+ -+/* Actually do priority change: must hold rq lock. */ -+static void __setscheduler(struct rq *rq, struct task_struct *p, -+ const struct sched_attr *attr, bool keep_boost) -+{ -+ __setscheduler_params(p, attr); -+ -+ /* -+ * Keep a potential priority boosting if called from -+ * sched_setscheduler(). 
-+ */ -+ p->prio = normal_prio(p); -+ if (keep_boost) -+ p->prio = rt_effective_prio(p, p->prio); -+ update_task_priodl(p); -+} -+ -+/* -+ * check the target process has a UID that matches the current process's -+ */ -+static bool check_same_owner(struct task_struct *p) -+{ -+ const struct cred *cred = current_cred(), *pcred; -+ bool match; -+ -+ rcu_read_lock(); -+ pcred = __task_cred(p); -+ match = (uid_eq(cred->euid, pcred->euid) || -+ uid_eq(cred->euid, pcred->uid)); -+ rcu_read_unlock(); -+ return match; -+} -+ -+static int -+__sched_setscheduler(struct task_struct *p, -+ const struct sched_attr *attr, bool user, bool pi) -+{ -+ const struct sched_attr dl_squash_attr = { -+ .size = sizeof(struct sched_attr), -+ .sched_policy = SCHED_FIFO, -+ .sched_nice = 0, -+ .sched_priority = 99, -+ }; -+ int newprio = MAX_RT_PRIO - 1 - attr->sched_priority; -+ int retval, oldpolicy = -1; -+ int policy = attr->sched_policy; -+ unsigned long flags; -+ struct rq *rq; -+ int reset_on_fork; -+ raw_spinlock_t *lock; -+ -+ /* The pi code expects interrupts enabled */ -+ BUG_ON(pi && in_interrupt()); -+ -+ /* -+ * PDS supports SCHED_DEADLINE by squash it as prio 0 SCHED_FIFO -+ */ -+ if (unlikely(SCHED_DEADLINE == policy)) { -+ attr = &dl_squash_attr; -+ policy = attr->sched_policy; -+ newprio = MAX_RT_PRIO - 1 - attr->sched_priority; -+ } -+recheck: -+ /* Double check policy once rq lock held */ -+ if (policy < 0) { -+ reset_on_fork = p->sched_reset_on_fork; -+ policy = oldpolicy = p->policy; -+ } else { -+ reset_on_fork = !!(attr->sched_flags & SCHED_RESET_ON_FORK); -+ -+ if (policy > SCHED_IDLE) -+ return -EINVAL; -+ } -+ -+ if (attr->sched_flags & ~(SCHED_FLAG_ALL)) -+ return -EINVAL; -+ -+ /* -+ * Valid priorities for SCHED_FIFO and SCHED_RR are -+ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and -+ * SCHED_BATCH and SCHED_IDLE is 0. 
-+ */ -+ if (attr->sched_priority < 0 || -+ (p->mm && attr->sched_priority > MAX_USER_RT_PRIO - 1) || -+ (!p->mm && attr->sched_priority > MAX_RT_PRIO - 1)) -+ return -EINVAL; -+ if ((SCHED_RR == policy || SCHED_FIFO == policy) != -+ (attr->sched_priority != 0)) -+ return -EINVAL; -+ -+ /* -+ * Allow unprivileged RT tasks to decrease priority: -+ */ -+ if (user && !capable(CAP_SYS_NICE)) { -+ if (SCHED_FIFO == policy || SCHED_RR == policy) { -+ unsigned long rlim_rtprio = -+ task_rlimit(p, RLIMIT_RTPRIO); -+ -+ /* Can't set/change the rt policy */ -+ if (policy != p->policy && !rlim_rtprio) -+ return -EPERM; -+ -+ /* Can't increase priority */ -+ if (attr->sched_priority > p->rt_priority && -+ attr->sched_priority > rlim_rtprio) -+ return -EPERM; -+ } -+ -+ /* Can't change other user's priorities */ -+ if (!check_same_owner(p)) -+ return -EPERM; -+ -+ /* Normal users shall not reset the sched_reset_on_fork flag */ -+ if (p->sched_reset_on_fork && !reset_on_fork) -+ return -EPERM; -+ } -+ -+ if (user) { -+ retval = security_task_setscheduler(p); -+ if (retval) -+ return retval; -+ } -+ -+ if (pi) -+ cpuset_read_lock(); -+ -+ /* -+ * Make sure no PI-waiters arrive (or leave) while we are -+ * changing the priority of the task: -+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ -+ /* -+ * To be able to change p->policy safely, task_access_lock() -+ * must be called. -+ * IF use task_access_lock() here: -+ * For the task p which is not running, reading rq->stop is -+ * racy but acceptable as ->stop doesn't change much. -+ * An enhancemnet can be made to read rq->stop saftly. 
-+ */ -+ rq = __task_access_lock(p, &lock); -+ -+ /* -+ * Changing the policy of the stop threads its a very bad idea -+ */ -+ if (p == rq->stop) { -+ retval = -EINVAL; -+ goto unlock; -+ } -+ -+ /* -+ * If not changing anything there's no need to proceed further: -+ */ -+ if (unlikely(policy == p->policy)) { -+ if (rt_policy(policy) && attr->sched_priority != p->rt_priority) -+ goto change; -+ if (!rt_policy(policy) && -+ NICE_TO_PRIO(attr->sched_nice) != p->static_prio) -+ goto change; -+ -+ p->sched_reset_on_fork = reset_on_fork; -+ retval = 0; -+ goto unlock; -+ } -+change: -+ -+ /* Re-check policy now with rq lock held */ -+ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { -+ policy = oldpolicy = -1; -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ if (pi) -+ cpuset_read_unlock(); -+ goto recheck; -+ } -+ -+ p->sched_reset_on_fork = reset_on_fork; -+ -+ if (pi) { -+ /* -+ * Take priority boosted tasks into account. If the new -+ * effective priority is unchanged, we just store the new -+ * normal parameters and do not touch the scheduler class and -+ * the runqueue. This will be done when the task deboost -+ * itself. 
-+ */ -+ if (rt_effective_prio(p, newprio) == p->prio) { -+ __setscheduler_params(p, attr); -+ retval = 0; -+ goto unlock; -+ } -+ } -+ -+ __setscheduler(rq, p, attr, pi); -+ -+ check_task_changed(rq, p); -+ -+ /* Avoid rq from going away on us: */ -+ preempt_disable(); -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+ if (pi) { -+ cpuset_read_unlock(); -+ rt_mutex_adjust_pi(p); -+ } -+ -+ preempt_enable(); -+ -+ return 0; -+ -+unlock: -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ if (pi) -+ cpuset_read_unlock(); -+ return retval; -+} -+ -+static int _sched_setscheduler(struct task_struct *p, int policy, -+ const struct sched_param *param, bool check) -+{ -+ struct sched_attr attr = { -+ .sched_policy = policy, -+ .sched_priority = param->sched_priority, -+ .sched_nice = PRIO_TO_NICE(p->static_prio), -+ }; -+ -+ /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ -+ if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { -+ attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; -+ policy &= ~SCHED_RESET_ON_FORK; -+ attr.sched_policy = policy; -+ } -+ -+ return __sched_setscheduler(p, &attr, check, true); -+} -+ -+/** -+ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. -+ * @p: the task in question. -+ * @policy: new policy. -+ * @param: structure containing the new RT priority. -+ * -+ * Return: 0 on success. An error code otherwise. -+ * -+ * NOTE that the task may be already dead. 
-+ */ -+int sched_setscheduler(struct task_struct *p, int policy, -+ const struct sched_param *param) -+{ -+ return _sched_setscheduler(p, policy, param, true); -+} -+ -+EXPORT_SYMBOL_GPL(sched_setscheduler); -+ -+int sched_setattr(struct task_struct *p, const struct sched_attr *attr) -+{ -+ return __sched_setscheduler(p, attr, true, true); -+} -+EXPORT_SYMBOL_GPL(sched_setattr); -+ -+int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) -+{ -+ return __sched_setscheduler(p, attr, false, true); -+} -+ -+/** -+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. -+ * @p: the task in question. -+ * @policy: new policy. -+ * @param: structure containing the new RT priority. -+ * -+ * Just like sched_setscheduler, only don't bother checking if the -+ * current context has permission. For example, this is needed in -+ * stop_machine(): we create temporary high priority worker threads, -+ * but our caller might not have that capability. -+ * -+ * Return: 0 on success. An error code otherwise. -+ */ -+int sched_setscheduler_nocheck(struct task_struct *p, int policy, -+ const struct sched_param *param) -+{ -+ return _sched_setscheduler(p, policy, param, false); -+} -+EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); -+ -+static int -+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) -+{ -+ struct sched_param lparam; -+ struct task_struct *p; -+ int retval; -+ -+ if (!param || pid < 0) -+ return -EINVAL; -+ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) -+ return -EFAULT; -+ -+ rcu_read_lock(); -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (likely(p)) -+ get_task_struct(p); -+ rcu_read_unlock(); -+ -+ if (likely(p)) { -+ retval = sched_setscheduler(p, policy, &lparam); -+ put_task_struct(p); -+ } -+ -+ return retval; -+} -+ -+/* -+ * Mimics kernel/events/core.c perf_copy_attr(). 
-+ */ -+static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr) -+{ -+ u32 size; -+ int ret; -+ -+ /* Zero the full structure, so that a short copy will be nice: */ -+ memset(attr, 0, sizeof(*attr)); -+ -+ ret = get_user(size, &uattr->size); -+ if (ret) -+ return ret; -+ -+ /* ABI compatibility quirk: */ -+ if (!size) -+ size = SCHED_ATTR_SIZE_VER0; -+ -+ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) -+ goto err_size; -+ -+ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); -+ if (ret) { -+ if (ret == -E2BIG) -+ goto err_size; -+ return ret; -+ } -+ -+ /* -+ * XXX: Do we want to be lenient like existing syscalls; or do we want -+ * to be strict and return an error on out-of-bounds values? -+ */ -+ attr->sched_nice = clamp(attr->sched_nice, -20, 19); -+ -+ /* sched/core.c uses zero here but we already know ret is zero */ -+ return 0; -+ -+err_size: -+ put_user(sizeof(*attr), &uattr->size); -+ return -E2BIG; -+} -+ -+/** -+ * sys_sched_setscheduler - set/change the scheduler policy and RT priority -+ * @pid: the pid in question. -+ * @policy: new policy. -+ * -+ * Return: 0 on success. An error code otherwise. -+ * @param: structure containing the new RT priority. -+ */ -+SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) -+{ -+ if (policy < 0) -+ return -EINVAL; -+ -+ return do_sched_setscheduler(pid, policy, param); -+} -+ -+/** -+ * sys_sched_setparam - set/change the RT priority of a thread -+ * @pid: the pid in question. -+ * @param: structure containing the new RT priority. -+ * -+ * Return: 0 on success. An error code otherwise. -+ */ -+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) -+{ -+ return do_sched_setscheduler(pid, SETPARAM_POLICY, param); -+} -+ -+/** -+ * sys_sched_setattr - same as above, but with extended sched_attr -+ * @pid: the pid in question. -+ * @uattr: structure containing the extended parameters. 
-+ */ -+SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, -+ unsigned int, flags) -+{ -+ struct sched_attr attr; -+ struct task_struct *p; -+ int retval; -+ -+ if (!uattr || pid < 0 || flags) -+ return -EINVAL; -+ -+ retval = sched_copy_attr(uattr, &attr); -+ if (retval) -+ return retval; -+ -+ if ((int)attr.sched_policy < 0) -+ return -EINVAL; -+ -+ rcu_read_lock(); -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (p != NULL) -+ retval = sched_setattr(p, &attr); -+ rcu_read_unlock(); -+ -+ return retval; -+} -+ -+/** -+ * sys_sched_getscheduler - get the policy (scheduling class) of a thread -+ * @pid: the pid in question. -+ * -+ * Return: On success, the policy of the thread. Otherwise, a negative error -+ * code. -+ */ -+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) -+{ -+ struct task_struct *p; -+ int retval = -EINVAL; -+ -+ if (pid < 0) -+ goto out_nounlock; -+ -+ retval = -ESRCH; -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ if (p) { -+ retval = security_task_getscheduler(p); -+ if (!retval) -+ retval = p->policy; -+ } -+ rcu_read_unlock(); -+ -+out_nounlock: -+ return retval; -+} -+ -+/** -+ * sys_sched_getscheduler - get the RT priority of a thread -+ * @pid: the pid in question. -+ * @param: structure containing the RT priority. -+ * -+ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error -+ * code. 
-+ */ -+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) -+{ -+ struct sched_param lp = { .sched_priority = 0 }; -+ struct task_struct *p; -+ int retval = -EINVAL; -+ -+ if (!param || pid < 0) -+ goto out_nounlock; -+ -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ retval = -ESRCH; -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ if (task_has_rt_policy(p)) -+ lp.sched_priority = p->rt_priority; -+ rcu_read_unlock(); -+ -+ /* -+ * This one might sleep, we cannot do it with a spinlock held ... -+ */ -+ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; -+ -+out_nounlock: -+ return retval; -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+/* -+ * Copy the kernel size attribute structure (which might be larger -+ * than what user-space knows about) to user-space. -+ * -+ * Note that all cases are valid: user-space buffer can be larger or -+ * smaller than the kernel-space buffer. The usual case is that both -+ * have the same size. -+ */ -+static int -+sched_attr_copy_to_user(struct sched_attr __user *uattr, -+ struct sched_attr *kattr, -+ unsigned int usize) -+{ -+ unsigned int ksize = sizeof(*kattr); -+ -+ if (!access_ok(uattr, usize)) -+ return -EFAULT; -+ -+ /* -+ * sched_getattr() ABI forwards and backwards compatibility: -+ * -+ * If usize == ksize then we just copy everything to user-space and all is good. -+ * -+ * If usize < ksize then we only copy as much as user-space has space for, -+ * this keeps ABI compatibility as well. We skip the rest. -+ * -+ * If usize > ksize then user-space is using a newer version of the ABI, -+ * which part the kernel doesn't know about. Just ignore it - tooling can -+ * detect the kernel's knowledge of attributes from the attr->size value -+ * which is set to ksize in this case. 
-+ */ -+ kattr->size = min(usize, ksize); -+ -+ if (copy_to_user(uattr, kattr, kattr->size)) -+ return -EFAULT; -+ -+ return 0; -+} -+ -+/** -+ * sys_sched_getattr - similar to sched_getparam, but with sched_attr -+ * @pid: the pid in question. -+ * @uattr: structure containing the extended parameters. -+ * @usize: sizeof(attr) for fwd/bwd comp. -+ * @flags: for future extension. -+ */ -+SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, -+ unsigned int, usize, unsigned int, flags) -+{ -+ struct sched_attr kattr = { }; -+ struct task_struct *p; -+ int retval; -+ -+ if (!uattr || pid < 0 || usize > PAGE_SIZE || -+ usize < SCHED_ATTR_SIZE_VER0 || flags) -+ return -EINVAL; -+ -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ retval = -ESRCH; -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ kattr.sched_policy = p->policy; -+ if (rt_task(p)) -+ kattr.sched_priority = p->rt_priority; -+ else -+ kattr.sched_nice = task_nice(p); -+ -+#ifdef CONFIG_UCLAMP_TASK -+ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; -+ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; -+#endif -+ -+ rcu_read_unlock(); -+ -+ return sched_attr_copy_to_user(uattr, &kattr, usize); -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) -+{ -+ cpumask_var_t cpus_mask, new_mask; -+ struct task_struct *p; -+ int retval; -+ -+ get_online_cpus(); -+ rcu_read_lock(); -+ -+ p = find_process_by_pid(pid); -+ if (!p) { -+ rcu_read_unlock(); -+ put_online_cpus(); -+ return -ESRCH; -+ } -+ -+ /* Prevent p going away */ -+ get_task_struct(p); -+ rcu_read_unlock(); -+ -+ if (p->flags & PF_NO_SETAFFINITY) { -+ retval = -EINVAL; -+ goto out_put_task; -+ } -+ if (!alloc_cpumask_var(&cpus_mask, GFP_KERNEL)) { -+ retval = -ENOMEM; -+ goto out_put_task; -+ } -+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { -+ retval = -ENOMEM; -+ 
goto out_free_cpus_allowed; -+ } -+ retval = -EPERM; -+ if (!check_same_owner(p)) { -+ rcu_read_lock(); -+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { -+ rcu_read_unlock(); -+ goto out_unlock; -+ } -+ rcu_read_unlock(); -+ } -+ -+ retval = security_task_setscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ cpuset_cpus_allowed(p, cpus_mask); -+ cpumask_and(new_mask, in_mask, cpus_mask); -+again: -+ retval = __set_cpus_allowed_ptr(p, new_mask, true); -+ -+ if (!retval) { -+ cpuset_cpus_allowed(p, cpus_mask); -+ if (!cpumask_subset(new_mask, cpus_mask)) { -+ /* -+ * We must have raced with a concurrent cpuset -+ * update. Just reset the cpus_mask to the -+ * cpuset's cpus_mask -+ */ -+ cpumask_copy(new_mask, cpus_mask); -+ goto again; -+ } -+ } -+out_unlock: -+ free_cpumask_var(new_mask); -+out_free_cpus_allowed: -+ free_cpumask_var(cpus_mask); -+out_put_task: -+ put_task_struct(p); -+ put_online_cpus(); -+ return retval; -+} -+ -+static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, -+ struct cpumask *new_mask) -+{ -+ if (len < cpumask_size()) -+ cpumask_clear(new_mask); -+ else if (len > cpumask_size()) -+ len = cpumask_size(); -+ -+ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; -+} -+ -+/** -+ * sys_sched_setaffinity - set the CPU affinity of a process -+ * @pid: pid of the process -+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr -+ * @user_mask_ptr: user-space pointer to the new CPU mask -+ * -+ * Return: 0 on success. An error code otherwise. 
-+ */ -+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, -+ unsigned long __user *, user_mask_ptr) -+{ -+ cpumask_var_t new_mask; -+ int retval; -+ -+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) -+ return -ENOMEM; -+ -+ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); -+ if (retval == 0) -+ retval = sched_setaffinity(pid, new_mask); -+ free_cpumask_var(new_mask); -+ return retval; -+} -+ -+long sched_getaffinity(pid_t pid, cpumask_t *mask) -+{ -+ struct task_struct *p; -+ raw_spinlock_t *lock; -+ unsigned long flags; -+ int retval; -+ -+ rcu_read_lock(); -+ -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ task_access_lock_irqsave(p, &lock, &flags); -+ cpumask_and(mask, &p->cpus_mask, cpu_active_mask); -+ task_access_unlock_irqrestore(p, lock, &flags); -+ -+out_unlock: -+ rcu_read_unlock(); -+ -+ return retval; -+} -+ -+/** -+ * sys_sched_getaffinity - get the CPU affinity of a process -+ * @pid: pid of the process -+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr -+ * @user_mask_ptr: user-space pointer to hold the current CPU mask -+ * -+ * Return: size of CPU mask copied to user_mask_ptr on success. An -+ * error code otherwise. 
-+ */ -+SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, -+ unsigned long __user *, user_mask_ptr) -+{ -+ int ret; -+ cpumask_var_t mask; -+ -+ if ((len * BITS_PER_BYTE) < nr_cpu_ids) -+ return -EINVAL; -+ if (len & (sizeof(unsigned long)-1)) -+ return -EINVAL; -+ -+ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) -+ return -ENOMEM; -+ -+ ret = sched_getaffinity(pid, mask); -+ if (ret == 0) { -+ unsigned int retlen = min_t(size_t, len, cpumask_size()); -+ -+ if (copy_to_user(user_mask_ptr, mask, retlen)) -+ ret = -EFAULT; -+ else -+ ret = retlen; -+ } -+ free_cpumask_var(mask); -+ -+ return ret; -+} -+ -+/** -+ * sys_sched_yield - yield the current processor to other threads. -+ * -+ * This function yields the current CPU to other tasks. It does this by -+ * scheduling away the current task. If it still has the earliest deadline -+ * it will be scheduled again as the next task. -+ * -+ * Return: 0. -+ */ -+static void do_sched_yield(void) -+{ -+ struct rq *rq; -+ struct rq_flags rf; -+ -+ if (!sched_yield_type) -+ return; -+ -+ rq = this_rq_lock_irq(&rf); -+ -+ if (sched_yield_type > 1) { -+ time_slice_expired(current, rq); -+ requeue_task(current, rq); -+ } -+ schedstat_inc(rq->yld_count); -+ -+ /* -+ * Since we are going to call schedule() anyway, there's -+ * no need to preempt or enable interrupts: -+ */ -+ preempt_disable(); -+ raw_spin_unlock(&rq->lock); -+ sched_preempt_enable_no_resched(); -+ -+ schedule(); -+} -+ -+SYSCALL_DEFINE0(sched_yield) -+{ -+ do_sched_yield(); -+ return 0; -+} -+ -+#ifndef CONFIG_PREEMPTION -+int __sched _cond_resched(void) -+{ -+ if (should_resched(0)) { -+ preempt_schedule_common(); -+ return 1; -+ } -+ rcu_all_qs(); -+ return 0; -+} -+EXPORT_SYMBOL(_cond_resched); -+#endif -+ -+/* -+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, -+ * call schedule, and on return reacquire the lock. -+ * -+ * This works OK both with and without CONFIG_PREEMPTION. 
We do strange low-level -+ * operations here to prevent schedule() from being called twice (once via -+ * spin_unlock(), once by hand). -+ */ -+int __cond_resched_lock(spinlock_t *lock) -+{ -+ int resched = should_resched(PREEMPT_LOCK_OFFSET); -+ int ret = 0; -+ -+ lockdep_assert_held(lock); -+ -+ if (spin_needbreak(lock) || resched) { -+ spin_unlock(lock); -+ if (resched) -+ preempt_schedule_common(); -+ else -+ cpu_relax(); -+ ret = 1; -+ spin_lock(lock); -+ } -+ return ret; -+} -+EXPORT_SYMBOL(__cond_resched_lock); -+ -+/** -+ * yield - yield the current processor to other threads. -+ * -+ * Do not ever use this function, there's a 99% chance you're doing it wrong. -+ * -+ * The scheduler is at all times free to pick the calling task as the most -+ * eligible task to run, if removing the yield() call from your code breaks -+ * it, its already broken. -+ * -+ * Typical broken usage is: -+ * -+ * while (!event) -+ * yield(); -+ * -+ * where one assumes that yield() will let 'the other' process run that will -+ * make event true. If the current task is a SCHED_FIFO task that will never -+ * happen. Never use yield() as a progress guarantee!! -+ * -+ * If you want to use yield() to wait for something, use wait_event(). -+ * If you want to use yield() to be 'nice' for others, use cond_resched(). -+ * If you still want to use yield(), do not! -+ */ -+void __sched yield(void) -+{ -+ set_current_state(TASK_RUNNING); -+ do_sched_yield(); -+} -+EXPORT_SYMBOL(yield); -+ -+/** -+ * yield_to - yield the current processor to another thread in -+ * your thread group, or accelerate that thread toward the -+ * processor it's on. -+ * @p: target task -+ * @preempt: whether task preemption is allowed or not -+ * -+ * It's the caller's job to ensure that the target task struct -+ * can't go away on us before we can do any checks. -+ * -+ * In PDS, yield_to is not supported. -+ * -+ * Return: -+ * true (>0) if we indeed boosted the target task. 
-+ * false (0) if we failed to boost the target. -+ * -ESRCH if there's no task to yield to. -+ */ -+int __sched yield_to(struct task_struct *p, bool preempt) -+{ -+ return 0; -+} -+EXPORT_SYMBOL_GPL(yield_to); -+ -+int io_schedule_prepare(void) -+{ -+ int old_iowait = current->in_iowait; -+ -+ current->in_iowait = 1; -+ blk_schedule_flush_plug(current); -+ -+ return old_iowait; -+} -+ -+void io_schedule_finish(int token) -+{ -+ current->in_iowait = token; -+} -+ -+/* -+ * This task is about to go to sleep on IO. Increment rq->nr_iowait so -+ * that process accounting knows that this is a task in IO wait state. -+ * -+ * But don't do that if it is a deliberate, throttling IO wait (this task -+ * has set its backing_dev_info: the queue against which it should throttle) -+ */ -+ -+long __sched io_schedule_timeout(long timeout) -+{ -+ int token; -+ long ret; -+ -+ token = io_schedule_prepare(); -+ ret = schedule_timeout(timeout); -+ io_schedule_finish(token); -+ -+ return ret; -+} -+EXPORT_SYMBOL(io_schedule_timeout); -+ -+void io_schedule(void) -+{ -+ int token; -+ -+ token = io_schedule_prepare(); -+ schedule(); -+ io_schedule_finish(token); -+} -+EXPORT_SYMBOL(io_schedule); -+ -+/** -+ * sys_sched_get_priority_max - return maximum RT priority. -+ * @policy: scheduling class. -+ * -+ * Return: On success, this syscall returns the maximum -+ * rt_priority that can be used by a given scheduling class. -+ * On failure, a negative error code is returned. -+ */ -+SYSCALL_DEFINE1(sched_get_priority_max, int, policy) -+{ -+ int ret = -EINVAL; -+ -+ switch (policy) { -+ case SCHED_FIFO: -+ case SCHED_RR: -+ ret = MAX_USER_RT_PRIO-1; -+ break; -+ case SCHED_NORMAL: -+ case SCHED_BATCH: -+ case SCHED_ISO: -+ case SCHED_IDLE: -+ ret = 0; -+ break; -+ } -+ return ret; -+} -+ -+/** -+ * sys_sched_get_priority_min - return minimum RT priority. -+ * @policy: scheduling class. 
-+ * -+ * Return: On success, this syscall returns the minimum -+ * rt_priority that can be used by a given scheduling class. -+ * On failure, a negative error code is returned. -+ */ -+SYSCALL_DEFINE1(sched_get_priority_min, int, policy) -+{ -+ int ret = -EINVAL; -+ -+ switch (policy) { -+ case SCHED_FIFO: -+ case SCHED_RR: -+ ret = 1; -+ break; -+ case SCHED_NORMAL: -+ case SCHED_BATCH: -+ case SCHED_ISO: -+ case SCHED_IDLE: -+ ret = 0; -+ break; -+ } -+ return ret; -+} -+ -+static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) -+{ -+ struct task_struct *p; -+ int retval; -+ -+ if (pid < 0) -+ return -EINVAL; -+ -+ retval = -ESRCH; -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ rcu_read_unlock(); -+ -+ *t = ns_to_timespec64(MS_TO_NS(rr_interval)); -+ return 0; -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+/** -+ * sys_sched_rr_get_interval - return the default timeslice of a process. -+ * @pid: pid of the process. -+ * @interval: userspace pointer to the timeslice value. -+ * -+ * -+ * Return: On success, 0 and the timeslice is in @interval. Otherwise, -+ * an error code. 
-+ */ -+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, -+ struct __kernel_timespec __user *, interval) -+{ -+ struct timespec64 t; -+ int retval = sched_rr_get_interval(pid, &t); -+ -+ if (retval == 0) -+ retval = put_timespec64(&t, interval); -+ -+ return retval; -+} -+ -+#ifdef CONFIG_COMPAT_32BIT_TIME -+SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, -+ struct old_timespec32 __user *, interval) -+{ -+ struct timespec64 t; -+ int retval = sched_rr_get_interval(pid, &t); -+ -+ if (retval == 0) -+ retval = put_old_timespec32(&t, interval); -+ return retval; -+} -+#endif -+ -+void sched_show_task(struct task_struct *p) -+{ -+ unsigned long free = 0; -+ int ppid; -+ -+ if (!try_get_task_stack(p)) -+ return; -+ -+ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); -+ -+ if (p->state == TASK_RUNNING) -+ printk(KERN_CONT " running task "); -+#ifdef CONFIG_DEBUG_STACK_USAGE -+ free = stack_not_used(p); -+#endif -+ ppid = 0; -+ rcu_read_lock(); -+ if (pid_alive(p)) -+ ppid = task_pid_nr(rcu_dereference(p->real_parent)); -+ rcu_read_unlock(); -+ printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, -+ task_pid_nr(p), ppid, -+ (unsigned long)task_thread_info(p)->flags); -+ -+ print_worker_info(KERN_INFO, p); -+ show_stack(p, NULL); -+ put_task_stack(p); -+} -+EXPORT_SYMBOL_GPL(sched_show_task); -+ -+static inline bool -+state_filter_match(unsigned long state_filter, struct task_struct *p) -+{ -+ /* no filter, everything matches */ -+ if (!state_filter) -+ return true; -+ -+ /* filter, but doesn't match */ -+ if (!(p->state & state_filter)) -+ return false; -+ -+ /* -+ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows -+ * TASK_KILLABLE). 
-+ */ -+ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) -+ return false; -+ -+ return true; -+} -+ -+ -+void show_state_filter(unsigned long state_filter) -+{ -+ struct task_struct *g, *p; -+ -+#if BITS_PER_LONG == 32 -+ printk(KERN_INFO -+ " task PC stack pid father\n"); -+#else -+ printk(KERN_INFO -+ " task PC stack pid father\n"); -+#endif -+ rcu_read_lock(); -+ for_each_process_thread(g, p) { -+ /* -+ * reset the NMI-timeout, listing all files on a slow -+ * console might take a lot of time: -+ * Also, reset softlockup watchdogs on all CPUs, because -+ * another CPU might be blocked waiting for us to process -+ * an IPI. -+ */ -+ touch_nmi_watchdog(); -+ touch_all_softlockup_watchdogs(); -+ if (state_filter_match(state_filter, p)) -+ sched_show_task(p); -+ } -+ -+#ifdef CONFIG_SCHED_DEBUG -+ /* PDS TODO: should support this -+ if (!state_filter) -+ sysrq_sched_debug_show(); -+ */ -+#endif -+ rcu_read_unlock(); -+ /* -+ * Only show locks if all tasks are dumped: -+ */ -+ if (!state_filter) -+ debug_show_all_locks(); -+} -+ -+void dump_cpu_task(int cpu) -+{ -+ pr_info("Task dump for CPU %d:\n", cpu); -+ sched_show_task(cpu_curr(cpu)); -+} -+ -+/** -+ * init_idle - set up an idle thread for a given CPU -+ * @idle: task in question -+ * @cpu: cpu the idle task belongs to -+ * -+ * NOTE: this function does not set the idle thread's NEED_RESCHED -+ * flag, to make booting more robust. 
-+ */ -+void init_idle(struct task_struct *idle, int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ raw_spin_lock_irqsave(&idle->pi_lock, flags); -+ raw_spin_lock(&rq->lock); -+ update_rq_clock(rq); -+ -+ idle->last_ran = rq->clock_task; -+ idle->state = TASK_RUNNING; -+ idle->flags |= PF_IDLE; -+ /* Setting prio to illegal value shouldn't matter when never queued */ -+ idle->prio = PRIO_LIMIT; -+ idle->deadline = rq_clock(rq) + task_deadline_diff(idle); -+ update_task_priodl(idle); -+ -+ kasan_unpoison_task_stack(idle); -+ -+#ifdef CONFIG_SMP -+ /* -+ * It's possible that init_idle() gets called multiple times on a task, -+ * in that case do_set_cpus_allowed() will not do the right thing. -+ * -+ * And since this is boot we can forgo the serialisation. -+ */ -+ set_cpus_allowed_common(idle, cpumask_of(cpu)); -+#endif -+ -+ /* Silence PROVE_RCU */ -+ rcu_read_lock(); -+ __set_task_cpu(idle, cpu); -+ rcu_read_unlock(); -+ -+ rq->idle = idle; -+ rcu_assign_pointer(rq->curr, idle); -+ idle->on_cpu = 1; -+ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&idle->pi_lock, flags); -+ -+ /* Set the preempt count _outside_ the spinlocks! */ -+ init_idle_preempt_count(idle, cpu); -+ -+ ftrace_graph_init_idle_task(idle, cpu); -+ vtime_init_idle(idle, cpu); -+#ifdef CONFIG_SMP -+ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); -+#endif -+} -+ -+void resched_cpu(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ if (cpu_online(cpu) || cpu == smp_processor_id()) -+ resched_curr(cpu_rq(cpu)); -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+} -+ -+static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) -+{ -+ struct wake_q_node *node = &task->wake_q; -+ -+ /* -+ * Atomically grab the task, if ->wake_q is !nil already it means -+ * its already queued (either by us or someone else) and will get the -+ * wakeup due to that. 
-+ * -+ * In order to ensure that a pending wakeup will observe our pending -+ * state, even in the failed case, an explicit smp_mb() must be used. -+ */ -+ smp_mb__before_atomic(); -+ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) -+ return false; -+ -+ /* -+ * The head is context local, there can be no concurrency. -+ */ -+ *head->lastp = node; -+ head->lastp = &node->next; -+ return true; -+} -+ -+/** -+ * wake_q_add() - queue a wakeup for 'later' waking. -+ * @head: the wake_q_head to add @task to -+ * @task: the task to queue for 'later' wakeup -+ * -+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the -+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come -+ * instantly. -+ * -+ * This function must be used as-if it were wake_up_process(); IOW the task -+ * must be ready to be woken at this location. -+ */ -+void wake_q_add(struct wake_q_head *head, struct task_struct *task) -+{ -+ if (__wake_q_add(head, task)) -+ get_task_struct(task); -+} -+ -+/** -+ * wake_q_add_safe() - safely queue a wakeup for 'later' waking. -+ * @head: the wake_q_head to add @task to -+ * @task: the task to queue for 'later' wakeup -+ * -+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the -+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come -+ * instantly. -+ * -+ * This function must be used as-if it were wake_up_process(); IOW the task -+ * must be ready to be woken at this location. -+ * -+ * This function is essentially a task-safe equivalent to wake_q_add(). Callers -+ * that already hold reference to @task can call the 'safe' version and trust -+ * wake_q to do the right thing depending whether or not the @task is already -+ * queued for wakeup. 
-+ */ -+void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) -+{ -+ if (!__wake_q_add(head, task)) -+ put_task_struct(task); -+} -+ -+void wake_up_q(struct wake_q_head *head) -+{ -+ struct wake_q_node *node = head->first; -+ -+ while (node != WAKE_Q_TAIL) { -+ struct task_struct *task; -+ -+ task = container_of(node, struct task_struct, wake_q); -+ BUG_ON(!task); -+ /* task can safely be re-inserted now: */ -+ node = node->next; -+ task->wake_q.next = NULL; -+ -+ /* -+ * wake_up_process() executes a full barrier, which pairs with -+ * the queueing in wake_q_add() so as not to miss wakeups. -+ */ -+ wake_up_process(task); -+ put_task_struct(task); -+ } -+} -+ -+#ifdef CONFIG_SMP -+ -+int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, -+ const struct cpumask __maybe_unused *trial) -+{ -+ return 1; -+} -+ -+int task_can_attach(struct task_struct *p, -+ const struct cpumask *cs_cpus_allowed) -+{ -+ int ret = 0; -+ -+ /* -+ * Kthreads which disallow setaffinity shouldn't be moved -+ * to a new cpuset; we don't want to change their CPU -+ * affinity and isolating such threads by their set of -+ * allowed nodes is unnecessary. Thus, cpusets are not -+ * applicable for such threads. This prevents checking for -+ * success of set_cpus_allowed_ptr() on all attached tasks -+ * before cpus_mask may be changed. -+ */ -+ if (p->flags & PF_NO_SETAFFINITY) -+ ret = -EINVAL; -+ -+ return ret; -+} -+ -+static bool sched_smp_initialized __read_mostly; -+ -+#ifdef CONFIG_NO_HZ_COMMON -+void nohz_balance_enter_idle(int cpu) -+{ -+} -+ -+void select_nohz_load_balancer(int stop_tick) -+{ -+} -+ -+void set_cpu_sd_state_idle(void) {} -+ -+/* -+ * In the semi idle case, use the nearest busy CPU for migrating timers -+ * from an idle CPU. This is good for power-savings. 
-+ * -+ * We don't do similar optimization for completely idle system, as -+ * selecting an idle CPU will add more delays to the timers than intended -+ * (as that CPU's timer base may not be uptodate wrt jiffies etc). -+ */ -+int get_nohz_timer_target(void) -+{ -+ int i, cpu = smp_processor_id(), default_cpu = -1; -+ struct cpumask *mask; -+ -+ if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) { -+ if (!idle_cpu(cpu)) -+ return cpu; -+ default_cpu = cpu; -+ } -+ -+ for (mask = &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[0]); -+ mask < per_cpu(sched_cpu_affinity_chk_end_masks, cpu); mask++) -+ for_each_cpu_and(i, mask, housekeeping_cpumask(HK_FLAG_TIMER)) -+ if (!idle_cpu(i)) -+ return i; -+ -+ if (default_cpu == -1) -+ default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER); -+ cpu = default_cpu; -+ -+ return cpu; -+} -+ -+/* -+ * When add_timer_on() enqueues a timer into the timer wheel of an -+ * idle CPU then this timer might expire before the next timer event -+ * which is scheduled to wake up that CPU. In case of a completely -+ * idle system the next event might even be infinite time into the -+ * future. wake_up_idle_cpu() ensures that the CPU is woken up and -+ * leaves the inner idle loop so the newly added timer is taken into -+ * account when the CPU goes back to idle and evaluates the timer -+ * wheel for the next timer event. -+ */ -+void wake_up_idle_cpu(int cpu) -+{ -+ if (cpu == smp_processor_id()) -+ return; -+ -+ set_tsk_need_resched(cpu_rq(cpu)->idle); -+ smp_send_reschedule(cpu); -+} -+ -+void wake_up_nohz_cpu(int cpu) -+{ -+ wake_up_idle_cpu(cpu); -+} -+#endif /* CONFIG_NO_HZ_COMMON */ -+ -+#ifdef CONFIG_HOTPLUG_CPU -+/* -+ * Ensures that the idle task is using init_mm right before its CPU goes -+ * offline. 
-+ */ -+void idle_task_exit(void) -+{ -+ struct mm_struct *mm = current->active_mm; -+ -+ BUG_ON(current != this_rq()->idle); -+ -+ if (mm != &init_mm) { -+ switch_mm(mm, &init_mm, current); -+ finish_arch_post_lock_switch(); -+ } -+ -+ /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ -+} -+ -+/* -+ * Migrate all tasks from the rq, sleeping tasks will be migrated by -+ * try_to_wake_up()->select_task_rq(). -+ * -+ * Called with rq->lock held even though we'er in stop_machine() and -+ * there's no concurrency possible, we hold the required locks anyway -+ * because of lock validation efforts. -+ */ -+static void migrate_tasks(struct rq *dead_rq) -+{ -+ struct rq *rq = dead_rq; -+ struct task_struct *p, *stop = rq->stop; -+ struct skiplist_node *node; -+ int count = 0; -+ -+ /* -+ * Fudge the rq selection such that the below task selection loop -+ * doesn't get stuck on the currently eligible stop task. -+ * -+ * We're currently inside stop_machine() and the rq is either stuck -+ * in the stop_machine_cpu_stop() loop, or we're executing this code, -+ * either way we should never end up calling schedule() until we're -+ * done here. -+ */ -+ rq->stop = NULL; -+ -+ node = &rq->sl_header; -+ while ((node = node->next[0]) != &rq->sl_header) { -+ int dest_cpu; -+ -+ p = skiplist_entry(node, struct task_struct, sl_node); -+ -+ /* skip the running task */ -+ if (task_running(p)) -+ continue; -+ -+ /* -+ * Rules for changing task_struct::cpus_mask are holding -+ * both pi_lock and rq->lock, such that holding either -+ * stabilizes the mask. -+ * -+ * Drop rq->lock is not quite as disastrous as it usually is -+ * because !cpu_active at this point, which means load-balance -+ * will not interfere. Also, stop-machine. 
-+ */ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_lock(&p->pi_lock); -+ raw_spin_lock(&rq->lock); -+ -+ /* -+ * Since we're inside stop-machine, _nothing_ should have -+ * changed the task, WARN if weird stuff happened, because in -+ * that case the above rq->lock drop is a fail too. -+ */ -+ if (WARN_ON(task_rq(p) != rq || !task_on_rq_queued(p))) { -+ raw_spin_unlock(&p->pi_lock); -+ continue; -+ } -+ -+ count++; -+ /* Find suitable destination for @next, with force if needed. */ -+ dest_cpu = select_fallback_rq(dead_rq->cpu, p); -+ -+ rq = __migrate_task(rq, p, dest_cpu); -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock(&p->pi_lock); -+ -+ rq = dead_rq; -+ raw_spin_lock(&rq->lock); -+ /* Check queued task all over from the header again */ -+ node = &rq->sl_header; -+ } -+ -+ rq->stop = stop; -+} -+ -+static void set_rq_offline(struct rq *rq) -+{ -+ if (rq->online) -+ rq->online = false; -+} -+#endif /* CONFIG_HOTPLUG_CPU */ -+ -+static void set_rq_online(struct rq *rq) -+{ -+ if (!rq->online) -+ rq->online = true; -+} -+ -+#ifdef CONFIG_SCHED_DEBUG -+ -+static __read_mostly int sched_debug_enabled; -+ -+static int __init sched_debug_setup(char *str) -+{ -+ sched_debug_enabled = 1; -+ -+ return 0; -+} -+early_param("sched_debug", sched_debug_setup); -+ -+static inline bool sched_debug(void) -+{ -+ return sched_debug_enabled; -+} -+#else /* !CONFIG_SCHED_DEBUG */ -+static inline bool sched_debug(void) -+{ -+ return false; -+} -+#endif /* CONFIG_SCHED_DEBUG */ -+ -+#ifdef CONFIG_SMP -+void scheduler_ipi(void) -+{ -+ /* -+ * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting -+ * TIF_NEED_RESCHED remotely (for the first time) will also send -+ * this IPI. 
-+ */ -+ preempt_fold_need_resched(); -+ -+ if (!idle_cpu(smp_processor_id()) || need_resched()) -+ return; -+ -+ irq_enter(); -+ irq_exit(); -+} -+ -+void wake_up_if_idle(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ rcu_read_lock(); -+ -+ if (!is_idle_task(rcu_dereference(rq->curr))) -+ goto out; -+ -+ if (set_nr_if_polling(rq->idle)) { -+ trace_sched_wake_idle_without_ipi(cpu); -+ } else { -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ if (is_idle_task(rq->curr)) -+ smp_send_reschedule(cpu); -+ /* Else CPU is not idle, do nothing here */ -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ } -+ -+out: -+ rcu_read_unlock(); -+} -+ -+bool cpus_share_cache(int this_cpu, int that_cpu) -+{ -+ return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); -+} -+#endif /* CONFIG_SMP */ -+ -+/* -+ * Topology list, bottom-up. -+ */ -+static struct sched_domain_topology_level default_topology[] = { -+#ifdef CONFIG_SCHED_SMT -+ { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, -+#endif -+#ifdef CONFIG_SCHED_MC -+ { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, -+#endif -+ { cpu_cpu_mask, SD_INIT_NAME(DIE) }, -+ { NULL, }, -+}; -+ -+static struct sched_domain_topology_level *sched_domain_topology = -+ default_topology; -+ -+#define for_each_sd_topology(tl) \ -+ for (tl = sched_domain_topology; tl->mask; tl++) -+ -+void set_sched_topology(struct sched_domain_topology_level *tl) -+{ -+ if (WARN_ON_ONCE(sched_smp_initialized)) -+ return; -+ -+ sched_domain_topology = tl; -+} -+ -+/* -+ * Initializers for schedule domains -+ * Non-inlined to reduce accumulated stack pressure in build_sched_domains() -+ */ -+ -+int sched_domain_level_max; -+ -+/* -+ * Partition sched domains as specified by the 'ndoms_new' -+ * cpumasks in the array doms_new[] of cpumasks. This compares -+ * doms_new[] to the current sched domain partitioning, doms_cur[]. -+ * It destroys each deleted domain and builds each new domain. 
-+ * -+ * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. -+ * The masks don't intersect (don't overlap.) We should setup one -+ * sched domain for each mask. CPUs not in any of the cpumasks will -+ * not be load balanced. If the same cpumask appears both in the -+ * current 'doms_cur' domains and in the new 'doms_new', we can leave -+ * it as it is. -+ * -+ * The passed in 'doms_new' should be allocated using -+ * alloc_sched_domains. This routine takes ownership of it and will -+ * free_sched_domains it when done with it. If the caller failed the -+ * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, -+ * and partition_sched_domains() will fallback to the single partition -+ * 'fallback_doms', it also forces the domains to be rebuilt. -+ * -+ * If doms_new == NULL it will be replaced with cpu_online_mask. -+ * ndoms_new == 0 is a special case for destroying existing domains, -+ * and it will not create the default domain. -+ * -+ * Call with hotplug lock held -+ */ -+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], -+ struct sched_domain_attr *dattr_new) -+{ -+ /** -+ * PDS doesn't depend on sched domains, but just keep this api -+ */ -+} -+ -+/* -+ * used to mark begin/end of suspend/resume: -+ */ -+static int num_cpus_frozen; -+ -+#ifdef CONFIG_NUMA -+int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; -+ -+/* -+ * sched_numa_find_closest() - given the NUMA topology, find the cpu -+ * closest to @cpu from @cpumask. -+ * cpumask: cpumask to find a cpu from -+ * cpu: cpu to be close to -+ * -+ * returns: cpu, or nr_cpu_ids when nothing found. -+ */ -+int sched_numa_find_closest(const struct cpumask *cpus, int cpu) -+{ -+ return best_mask_cpu(cpu, cpus); -+} -+#endif /* CONFIG_NUMA */ -+ -+/* -+ * Update cpusets according to cpu_active mask. If cpusets are -+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper -+ * around partition_sched_domains(). 
-+ * -+ * If we come here as part of a suspend/resume, don't touch cpusets because we -+ * want to restore it back to its original state upon resume anyway. -+ */ -+static void cpuset_cpu_active(void) -+{ -+ if (cpuhp_tasks_frozen) { -+ /* -+ * num_cpus_frozen tracks how many CPUs are involved in suspend -+ * resume sequence. As long as this is not the last online -+ * operation in the resume sequence, just build a single sched -+ * domain, ignoring cpusets. -+ */ -+ partition_sched_domains(1, NULL, NULL); -+ if (--num_cpus_frozen) -+ return; -+ /* -+ * This is the last CPU online operation. So fall through and -+ * restore the original sched domains by considering the -+ * cpuset configurations. -+ */ -+ cpuset_force_rebuild(); -+ } -+ -+ cpuset_update_active_cpus(); -+} -+ -+static int cpuset_cpu_inactive(unsigned int cpu) -+{ -+ if (!cpuhp_tasks_frozen) { -+ cpuset_update_active_cpus(); -+ } else { -+ num_cpus_frozen++; -+ partition_sched_domains(1, NULL, NULL); -+ } -+ return 0; -+} -+ -+int sched_cpu_activate(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+#ifdef CONFIG_SCHED_SMT -+ /* -+ * When going up, increment the number of cores with SMT present. -+ */ -+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) -+ static_branch_inc_cpuslocked(&sched_smt_present); -+#endif -+ set_cpu_active(cpu, true); -+ -+ if (sched_smp_initialized) -+ cpuset_cpu_active(); -+ -+ /* -+ * Put the rq online, if not already. This happens: -+ * -+ * 1) In the early boot process, because we build the real domains -+ * after all cpus have been brought up. -+ * -+ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the -+ * domains. 
-+ */ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ set_rq_online(rq); -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+ return 0; -+} -+ -+int sched_cpu_deactivate(unsigned int cpu) -+{ -+ int ret; -+ -+ set_cpu_active(cpu, false); -+ /* -+ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU -+ * users of this state to go away such that all new such users will -+ * observe it. -+ * -+ * Do sync before park smpboot threads to take care the rcu boost case. -+ */ -+ synchronize_rcu(); -+ -+#ifdef CONFIG_SCHED_SMT -+ /* -+ * When going down, decrement the number of cores with SMT present. -+ */ -+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) -+ static_branch_dec_cpuslocked(&sched_smt_present); -+#endif -+ -+ if (!sched_smp_initialized) -+ return 0; -+ -+ ret = cpuset_cpu_inactive(cpu); -+ if (ret) { -+ set_cpu_active(cpu, true); -+ return ret; -+ } -+ return 0; -+} -+ -+static void sched_rq_cpu_starting(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ rq->calc_load_update = calc_load_update; -+} -+ -+int sched_cpu_starting(unsigned int cpu) -+{ -+ sched_rq_cpu_starting(cpu); -+ sched_tick_start(cpu); -+ return 0; -+} -+ -+#ifdef CONFIG_HOTPLUG_CPU -+int sched_cpu_dying(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ sched_tick_stop(cpu); -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ set_rq_offline(rq); -+ migrate_tasks(rq); -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+ hrtick_clear(rq); -+ return 0; -+} -+#endif -+ -+#ifdef CONFIG_SMP -+static void sched_init_topology_cpumask_early(void) -+{ -+ int cpu, level; -+ cpumask_t *tmp; -+ -+ for_each_possible_cpu(cpu) { -+ for (level = 0; level < NR_CPU_AFFINITY_CHK_LEVEL; level++) { -+ tmp = &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[level]); -+ cpumask_copy(tmp, cpu_possible_mask); -+ cpumask_clear_cpu(cpu, tmp); -+ } -+ per_cpu(sched_cpu_llc_start_mask, cpu) = -+ &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[0]); -+ 
per_cpu(sched_cpu_affinity_chk_end_masks, cpu) = -+ &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[1]); -+ } -+} -+ -+static void sched_init_topology_cpumask(void) -+{ -+ int cpu; -+ cpumask_t *chk; -+ -+ for_each_online_cpu(cpu) { -+ chk = &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[0]); -+ -+#ifdef CONFIG_SCHED_SMT -+ cpumask_setall(chk); -+ cpumask_clear_cpu(cpu, chk); -+ if (cpumask_and(chk, chk, topology_sibling_cpumask(cpu))) { -+ per_cpu(sched_sibling_cpu, cpu) = cpumask_first(chk); -+ printk(KERN_INFO "pds: cpu #%d affinity check mask - smt 0x%08lx", -+ cpu, (chk++)->bits[0]); -+ } -+#endif -+#ifdef CONFIG_SCHED_MC -+ cpumask_setall(chk); -+ cpumask_clear_cpu(cpu, chk); -+ if (cpumask_and(chk, chk, cpu_coregroup_mask(cpu))) { -+ per_cpu(sched_cpu_llc_start_mask, cpu) = chk; -+ printk(KERN_INFO "pds: cpu #%d affinity check mask - coregroup 0x%08lx", -+ cpu, (chk++)->bits[0]); -+ } -+ cpumask_complement(chk, cpu_coregroup_mask(cpu)); -+ -+ /** -+ * Set up sd_llc_id per CPU -+ */ -+ per_cpu(sd_llc_id, cpu) = -+ cpumask_first(cpu_coregroup_mask(cpu)); -+#else -+ per_cpu(sd_llc_id, cpu) = -+ cpumask_first(topology_core_cpumask(cpu)); -+ -+ per_cpu(sched_cpu_llc_start_mask, cpu) = chk; -+ -+ cpumask_setall(chk); -+ cpumask_clear_cpu(cpu, chk); -+#endif /* NOT CONFIG_SCHED_MC */ -+ if (cpumask_and(chk, chk, topology_core_cpumask(cpu))) -+ printk(KERN_INFO "pds: cpu #%d affinity check mask - core 0x%08lx", -+ cpu, (chk++)->bits[0]); -+ cpumask_complement(chk, topology_core_cpumask(cpu)); -+ -+ if (cpumask_and(chk, chk, cpu_online_mask)) -+ printk(KERN_INFO "pds: cpu #%d affinity check mask - others 0x%08lx", -+ cpu, (chk++)->bits[0]); -+ -+ per_cpu(sched_cpu_affinity_chk_end_masks, cpu) = chk; -+ } -+} -+#endif -+ -+void __init sched_init_smp(void) -+{ -+ /* Move init over to a non-isolated CPU */ -+ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) -+ BUG(); -+ -+ cpumask_copy(&sched_rq_queued_masks[SCHED_RQ_EMPTY], cpu_online_mask); -+ 
-+ sched_init_topology_cpumask(); -+ -+ sched_smp_initialized = true; -+} -+#else -+void __init sched_init_smp(void) -+{ -+} -+#endif /* CONFIG_SMP */ -+ -+int in_sched_functions(unsigned long addr) -+{ -+ return in_lock_functions(addr) || -+ (addr >= (unsigned long)__sched_text_start -+ && addr < (unsigned long)__sched_text_end); -+} -+ -+#ifdef CONFIG_CGROUP_SCHED -+/* task group related information */ -+struct task_group { -+ struct cgroup_subsys_state css; -+ -+ struct rcu_head rcu; -+ struct list_head list; -+ -+ struct task_group *parent; -+ struct list_head siblings; -+ struct list_head children; -+}; -+ -+/* -+ * Default task group. -+ * Every task in system belongs to this group at bootup. -+ */ -+struct task_group root_task_group; -+LIST_HEAD(task_groups); -+ -+/* Cacheline aligned slab cache for task_group */ -+static struct kmem_cache *task_group_cache __read_mostly; -+#endif /* CONFIG_CGROUP_SCHED */ -+ -+void __init sched_init(void) -+{ -+ int i; -+ struct rq *rq; -+ -+ print_scheduler_version(); -+ -+ wait_bit_init(); -+ -+#ifdef CONFIG_SMP -+ for (i = 0; i < NR_SCHED_RQ_QUEUED_LEVEL; i++) -+ cpumask_clear(&sched_rq_queued_masks[i]); -+ cpumask_setall(&sched_rq_queued_masks[SCHED_RQ_EMPTY]); -+ set_bit(SCHED_RQ_EMPTY, sched_rq_queued_masks_bitmap); -+ -+ cpumask_setall(&sched_rq_pending_masks[SCHED_RQ_EMPTY]); -+ set_bit(SCHED_RQ_EMPTY, sched_rq_pending_masks_bitmap); -+#else -+ uprq = &per_cpu(runqueues, 0); -+#endif -+ -+#ifdef CONFIG_CGROUP_SCHED -+ task_group_cache = KMEM_CACHE(task_group, 0); -+ -+ list_add(&root_task_group.list, &task_groups); -+ INIT_LIST_HEAD(&root_task_group.children); -+ INIT_LIST_HEAD(&root_task_group.siblings); -+#endif /* CONFIG_CGROUP_SCHED */ -+ for_each_possible_cpu(i) { -+ rq = cpu_rq(i); -+ FULL_INIT_SKIPLIST_NODE(&rq->sl_header); -+ raw_spin_lock_init(&rq->lock); -+ rq->dither = 0; -+ rq->nr_running = rq->nr_uninterruptible = 0; -+ rq->calc_load_active = 0; -+ rq->calc_load_update = jiffies + LOAD_FREQ; -+#ifdef 
CONFIG_SMP -+ rq->online = false; -+ rq->cpu = i; -+ -+ rq->queued_level = SCHED_RQ_EMPTY; -+ rq->pending_level = SCHED_RQ_EMPTY; -+#ifdef CONFIG_SCHED_SMT -+ per_cpu(sched_sibling_cpu, i) = i; -+ rq->active_balance = 0; -+#endif -+#endif -+ rq->nr_switches = 0; -+ atomic_set(&rq->nr_iowait, 0); -+ hrtick_rq_init(rq); -+ } -+#ifdef CONFIG_SMP -+ /* Set rq->online for cpu 0 */ -+ cpu_rq(0)->online = true; -+#endif -+ -+ /* -+ * The boot idle thread does lazy MMU switching as well: -+ */ -+ mmgrab(&init_mm); -+ enter_lazy_tlb(&init_mm, current); -+ -+ /* -+ * Make us the idle thread. Technically, schedule() should not be -+ * called from this thread, however somewhere below it might be, -+ * but because we are the idle thread, we just pick up running again -+ * when this runqueue becomes "idle". -+ */ -+ init_idle(current, smp_processor_id()); -+ -+ calc_load_update = jiffies + LOAD_FREQ; -+ -+#ifdef CONFIG_SMP -+ idle_thread_set_boot_cpu(); -+ -+ sched_init_topology_cpumask_early(); -+#endif /* SMP */ -+ -+ init_schedstats(); -+ -+ psi_init(); -+} -+ -+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP -+static inline int preempt_count_equals(int preempt_offset) -+{ -+ int nested = preempt_count() + rcu_preempt_depth(); -+ -+ return (nested == preempt_offset); -+} -+ -+void __might_sleep(const char *file, int line, int preempt_offset) -+{ -+ /* -+ * Blocking primitives will set (and therefore destroy) current->state, -+ * since we will exit with TASK_RUNNING make sure we enter with it, -+ * otherwise we will destroy state. 
-+ */ -+ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, -+ "do not call blocking ops when !TASK_RUNNING; " -+ "state=%lx set at [<%p>] %pS\n", -+ current->state, -+ (void *)current->task_state_change, -+ (void *)current->task_state_change); -+ -+ ___might_sleep(file, line, preempt_offset); -+} -+EXPORT_SYMBOL(__might_sleep); -+ -+void ___might_sleep(const char *file, int line, int preempt_offset) -+{ -+ /* Ratelimiting timestamp: */ -+ static unsigned long prev_jiffy; -+ -+ unsigned long preempt_disable_ip; -+ -+ /* WARN_ON_ONCE() by default, no rate limit required: */ -+ rcu_sleep_check(); -+ -+ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && -+ !is_idle_task(current) && !current->non_block_count) || -+ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || -+ oops_in_progress) -+ return; -+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) -+ return; -+ prev_jiffy = jiffies; -+ -+ /* Save this before calling printk(), since that will clobber it: */ -+ preempt_disable_ip = get_preempt_disable_ip(current); -+ -+ printk(KERN_ERR -+ "BUG: sleeping function called from invalid context at %s:%d\n", -+ file, line); -+ printk(KERN_ERR -+ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", -+ in_atomic(), irqs_disabled(), current->non_block_count, -+ current->pid, current->comm); -+ -+ if (task_stack_end_corrupted(current)) -+ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); -+ -+ debug_show_held_locks(current); -+ if (irqs_disabled()) -+ print_irqtrace_events(current); -+#ifdef CONFIG_DEBUG_PREEMPT -+ if (!preempt_count_equals(preempt_offset)) { -+ pr_err("Preemption disabled at:"); -+ print_ip_sym(preempt_disable_ip); -+ pr_cont("\n"); -+ } -+#endif -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+EXPORT_SYMBOL(___might_sleep); -+ -+void __cant_sleep(const char *file, int line, int preempt_offset) -+{ -+ static unsigned long prev_jiffy; -+ -+ if 
(irqs_disabled()) -+ return; -+ -+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) -+ return; -+ -+ if (preempt_count() > preempt_offset) -+ return; -+ -+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) -+ return; -+ prev_jiffy = jiffies; -+ -+ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); -+ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", -+ in_atomic(), irqs_disabled(), -+ current->pid, current->comm); -+ -+ debug_show_held_locks(current); -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+EXPORT_SYMBOL_GPL(__cant_sleep); -+#endif -+ -+#ifdef CONFIG_MAGIC_SYSRQ -+void normalize_rt_tasks(void) -+{ -+ struct task_struct *g, *p; -+ struct sched_attr attr = { -+ .sched_policy = SCHED_NORMAL, -+ }; -+ -+ read_lock(&tasklist_lock); -+ for_each_process_thread(g, p) { -+ /* -+ * Only normalize user tasks: -+ */ -+ if (p->flags & PF_KTHREAD) -+ continue; -+ -+ if (!rt_task(p)) { -+ /* -+ * Renice negative nice level userspace -+ * tasks back to 0: -+ */ -+ if (task_nice(p) < 0) -+ set_user_nice(p, 0); -+ continue; -+ } -+ -+ __sched_setscheduler(p, &attr, false, false); -+ } -+ read_unlock(&tasklist_lock); -+} -+#endif /* CONFIG_MAGIC_SYSRQ */ -+ -+#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) -+/* -+ * These functions are only useful for the IA64 MCA handling, or kdb. -+ * -+ * They can only be called when the whole system has been -+ * stopped - every CPU needs to be quiescent, and no scheduling -+ * activity can take place. Using them for anything else would -+ * be a serious bug, and as a result, they aren't even visible -+ * under any other configuration. -+ */ -+ -+/** -+ * curr_task - return the current task for a given CPU. -+ * @cpu: the processor in question. -+ * -+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! -+ * -+ * Return: The current task for @cpu. 
-+ */ -+struct task_struct *curr_task(int cpu) -+{ -+ return cpu_curr(cpu); -+} -+ -+#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ -+ -+#ifdef CONFIG_IA64 -+/** -+ * ia64_set_curr_task - set the current task for a given CPU. -+ * @cpu: the processor in question. -+ * @p: the task pointer to set. -+ * -+ * Description: This function must only be used when non-maskable interrupts -+ * are serviced on a separate stack. It allows the architecture to switch the -+ * notion of the current task on a CPU in a non-blocking manner. This function -+ * must be called with all CPU's synchronised, and interrupts disabled, the -+ * and caller must save the original value of the current task (see -+ * curr_task() above) and restore that value before reenabling interrupts and -+ * re-starting the system. -+ * -+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! -+ */ -+void ia64_set_curr_task(int cpu, struct task_struct *p) -+{ -+ cpu_curr(cpu) = p; -+} -+ -+#endif -+ -+#ifdef CONFIG_SCHED_DEBUG -+void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, -+ struct seq_file *m) -+{} -+ -+void proc_sched_set_task(struct task_struct *p) -+{} -+#endif -+ -+#ifdef CONFIG_CGROUP_SCHED -+static void sched_free_group(struct task_group *tg) -+{ -+ kmem_cache_free(task_group_cache, tg); -+} -+ -+/* allocate runqueue etc for a new task group */ -+struct task_group *sched_create_group(struct task_group *parent) -+{ -+ struct task_group *tg; -+ -+ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); -+ if (!tg) -+ return ERR_PTR(-ENOMEM); -+ -+ return tg; -+} -+ -+void sched_online_group(struct task_group *tg, struct task_group *parent) -+{ -+} -+ -+/* rcu callback to free various structures associated with a task group */ -+static void sched_free_group_rcu(struct rcu_head *rhp) -+{ -+ /* Now it should be safe to free those cfs_rqs */ -+ sched_free_group(container_of(rhp, struct task_group, rcu)); -+} -+ -+void sched_destroy_group(struct task_group 
*tg) -+{ -+ /* Wait for possible concurrent references to cfs_rqs complete */ -+ call_rcu(&tg->rcu, sched_free_group_rcu); -+} -+ -+void sched_offline_group(struct task_group *tg) -+{ -+} -+ -+static inline struct task_group *css_tg(struct cgroup_subsys_state *css) -+{ -+ return css ? container_of(css, struct task_group, css) : NULL; -+} -+ -+static struct cgroup_subsys_state * -+cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) -+{ -+ struct task_group *parent = css_tg(parent_css); -+ struct task_group *tg; -+ -+ if (!parent) { -+ /* This is early initialization for the top cgroup */ -+ return &root_task_group.css; -+ } -+ -+ tg = sched_create_group(parent); -+ if (IS_ERR(tg)) -+ return ERR_PTR(-ENOMEM); -+ return &tg->css; -+} -+ -+/* Expose task group only after completing cgroup initialization */ -+static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ struct task_group *parent = css_tg(css->parent); -+ -+ if (parent) -+ sched_online_group(tg, parent); -+ return 0; -+} -+ -+static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ -+ sched_offline_group(tg); -+} -+ -+static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ -+ /* -+ * Relies on the RCU grace period between css_released() and this. 
-+ */ -+ sched_free_group(tg); -+} -+ -+static void cpu_cgroup_fork(struct task_struct *task) -+{ -+} -+ -+static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) -+{ -+ return 0; -+} -+ -+static void cpu_cgroup_attach(struct cgroup_taskset *tset) -+{ -+} -+ -+static struct cftype cpu_legacy_files[] = { -+ { } /* Terminate */ -+}; -+ -+static struct cftype cpu_files[] = { -+ { } /* terminate */ -+}; -+ -+static int cpu_extra_stat_show(struct seq_file *sf, -+ struct cgroup_subsys_state *css) -+{ -+ return 0; -+} -+ -+struct cgroup_subsys cpu_cgrp_subsys = { -+ .css_alloc = cpu_cgroup_css_alloc, -+ .css_online = cpu_cgroup_css_online, -+ .css_released = cpu_cgroup_css_released, -+ .css_free = cpu_cgroup_css_free, -+ .css_extra_stat_show = cpu_extra_stat_show, -+ .fork = cpu_cgroup_fork, -+ .can_attach = cpu_cgroup_can_attach, -+ .attach = cpu_cgroup_attach, -+ .legacy_cftypes = cpu_files, -+ .legacy_cftypes = cpu_legacy_files, -+ .dfl_cftypes = cpu_files, -+ .early_init = true, -+ .threaded = true, -+}; -+#endif /* CONFIG_CGROUP_SCHED */ -+ -+#undef CREATE_TRACE_POINTS -diff --git a/kernel/sched/pds_sched.h b/kernel/sched/pds_sched.h -new file mode 100644 -index 000000000000..6c3361f06087 ---- /dev/null -+++ b/kernel/sched/pds_sched.h -@@ -0,0 +1,518 @@ -+#ifndef PDS_SCHED_H -+#define PDS_SCHED_H -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#ifdef CONFIG_PARAVIRT -+# include -+#endif -+ -+#include "cpupri.h" -+ -+/* task_struct::on_rq states: */ -+#define TASK_ON_RQ_QUEUED 1 -+#define TASK_ON_RQ_MIGRATING 2 -+ -+static inline int task_on_rq_queued(struct task_struct *p) -+{ -+ return p->on_rq == TASK_ON_RQ_QUEUED; -+} 
-+ -+static inline int task_on_rq_migrating(struct task_struct *p) -+{ -+ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; -+} -+ -+/* -+ * wake flags -+ */ -+#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ -+#define WF_FORK 0x02 /* child wakeup after fork */ -+#define WF_MIGRATED 0x04 /* internal use, task got migrated */ -+ -+/* -+ * This is the main, per-CPU runqueue data structure. -+ * This data should only be modified by the local cpu. -+ */ -+struct rq { -+ /* runqueue lock: */ -+ raw_spinlock_t lock; -+ -+ struct task_struct __rcu *curr; -+ struct task_struct *idle, *stop; -+ struct mm_struct *prev_mm; -+ -+ struct skiplist_node sl_header; -+ -+ /* switch count */ -+ u64 nr_switches; -+ -+ atomic_t nr_iowait; -+ -+#ifdef CONFIG_MEMBARRIER -+ int membarrier_state; -+#endif -+ -+#ifdef CONFIG_SMP -+ int cpu; /* cpu of this runqueue */ -+ bool online; -+ -+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ -+ struct sched_avg avg_irq; -+#endif -+#ifdef CONFIG_SCHED_THERMAL_PRESSURE -+ struct sched_avg avg_thermal; -+#endif -+ -+ unsigned long queued_level; -+ unsigned long pending_level; -+ -+#ifdef CONFIG_SCHED_SMT -+ int active_balance; -+ struct cpu_stop_work active_balance_work; -+#endif -+#endif /* CONFIG_SMP */ -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+ u64 prev_irq_time; -+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ -+#ifdef CONFIG_PARAVIRT -+ u64 prev_steal_time; -+#endif /* CONFIG_PARAVIRT */ -+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING -+ u64 prev_steal_time_rq; -+#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ -+ -+ /* calc_load related fields */ -+ unsigned long calc_load_update; -+ long calc_load_active; -+ -+ u64 clock, last_tick; -+ u64 clock_task; -+ int dither; -+ -+ unsigned long nr_running; -+ unsigned long nr_uninterruptible; -+ -+#ifdef CONFIG_SCHED_HRTICK -+#ifdef CONFIG_SMP -+ call_single_data_t hrtick_csd; -+#endif -+ struct hrtimer hrtick_timer; -+#endif -+ -+#ifdef CONFIG_SCHEDSTATS -+ -+ /* latency stats */ -+ struct sched_info rq_sched_info; -+ 
unsigned long long rq_cpu_time; -+ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ -+ -+ /* sys_sched_yield() stats */ -+ unsigned int yld_count; -+ -+ /* schedule() stats */ -+ unsigned int sched_switch; -+ unsigned int sched_count; -+ unsigned int sched_goidle; -+ -+ /* try_to_wake_up() stats */ -+ unsigned int ttwu_count; -+ unsigned int ttwu_local; -+#endif /* CONFIG_SCHEDSTATS */ -+#ifdef CONFIG_CPU_IDLE -+ /* Must be inspected within a rcu lock section */ -+ struct cpuidle_state *idle_state; -+#endif -+}; -+ -+extern unsigned long calc_load_update; -+extern atomic_long_t calc_load_tasks; -+ -+extern void calc_global_load_tick(struct rq *this_rq); -+extern long calc_load_fold_active(struct rq *this_rq, long adjust); -+ -+#ifndef CONFIG_SMP -+extern struct rq *uprq; -+#define cpu_rq(cpu) (uprq) -+#define this_rq() (uprq) -+#define raw_rq() (uprq) -+#define task_rq(p) (uprq) -+#define cpu_curr(cpu) ((uprq)->curr) -+#else /* CONFIG_SMP */ -+DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -+#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) -+#define this_rq() this_cpu_ptr(&runqueues) -+#define raw_rq() raw_cpu_ptr(&runqueues) -+#define task_rq(p) cpu_rq(task_cpu(p)) -+#define cpu_curr(cpu) (cpu_rq(cpu)->curr) -+ -+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) -+void register_sched_domain_sysctl(void); -+void unregister_sched_domain_sysctl(void); -+#else -+static inline void register_sched_domain_sysctl(void) -+{ -+} -+static inline void unregister_sched_domain_sysctl(void) -+{ -+} -+#endif -+ -+#endif /* CONFIG_SMP */ -+ -+#ifndef arch_scale_freq_tick -+static __always_inline -+void arch_scale_freq_tick(void) -+{ -+} -+#endif -+ -+#ifndef arch_scale_freq_capacity -+static __always_inline -+unsigned long arch_scale_freq_capacity(int cpu) -+{ -+ return SCHED_CAPACITY_SCALE; -+} -+#endif -+ -+static inline u64 __rq_clock_broken(struct rq *rq) -+{ -+ return READ_ONCE(rq->clock); -+} -+ -+static inline u64 rq_clock(struct rq *rq) 
-+{ -+ /* -+ * Relax lockdep_assert_held() checking as in VRQ, call to -+ * sched_info_xxxx() may not held rq->lock -+ * lockdep_assert_held(&rq->lock); -+ */ -+ return rq->clock; -+} -+ -+static inline u64 rq_clock_task(struct rq *rq) -+{ -+ /* -+ * Relax lockdep_assert_held() checking as in VRQ, call to -+ * sched_info_xxxx() may not held rq->lock -+ * lockdep_assert_held(&rq->lock); -+ */ -+ return rq->clock_task; -+} -+ -+/** -+ * By default the decay is the default pelt decay period. -+ * The decay shift can change the decay period in -+ * multiples of 32. -+ * Decay shift Decay period(ms) -+ * 0 32 -+ * 1 64 -+ * 2 128 -+ * 3 256 -+ * 4 512 -+ */ -+extern int sched_thermal_decay_shift; -+ -+static inline u64 rq_clock_thermal(struct rq *rq) -+{ -+ return rq_clock_task(rq) >> sched_thermal_decay_shift; -+} -+ -+/* -+ * {de,en}queue flags: -+ * -+ * DEQUEUE_SLEEP - task is no longer runnable -+ * ENQUEUE_WAKEUP - task just became runnable -+ * -+ */ -+ -+#define DEQUEUE_SLEEP 0x01 -+ -+#define ENQUEUE_WAKEUP 0x01 -+ -+ -+/* -+ * Below are scheduler API which using in other kernel code -+ * It use the dummy rq_flags -+ * ToDo : PDS need to support these APIs for compatibility with mainline -+ * scheduler code. 
-+ */ -+struct rq_flags { -+ unsigned long flags; -+}; -+ -+struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(rq->lock); -+ -+struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(p->pi_lock) -+ __acquires(rq->lock); -+ -+static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock(&rq->lock); -+} -+ -+static inline void -+task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) -+ __releases(rq->lock) -+ __releases(p->pi_lock) -+{ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); -+} -+ -+static inline void -+rq_unlock_irq(struct rq *rq, struct rq_flags *rf) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock_irq(&rq->lock); -+} -+ -+static inline struct rq * -+this_rq_lock_irq(struct rq_flags *rf) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ local_irq_disable(); -+ rq = this_rq(); -+ raw_spin_lock(&rq->lock); -+ -+ return rq; -+} -+ -+static inline int task_current(struct rq *rq, struct task_struct *p) -+{ -+ return rq->curr == p; -+} -+ -+static inline bool task_running(struct task_struct *p) -+{ -+ return p->on_cpu; -+} -+ -+extern struct static_key_false sched_schedstats; -+ -+static inline void sched_ttwu_pending(void) { } -+ -+#ifdef CONFIG_CPU_IDLE -+static inline void idle_set_state(struct rq *rq, -+ struct cpuidle_state *idle_state) -+{ -+ rq->idle_state = idle_state; -+} -+ -+static inline struct cpuidle_state *idle_get_state(struct rq *rq) -+{ -+ WARN_ON(!rcu_read_lock_held()); -+ return rq->idle_state; -+} -+#else -+static inline void idle_set_state(struct rq *rq, -+ struct cpuidle_state *idle_state) -+{ -+} -+ -+static inline struct cpuidle_state *idle_get_state(struct rq *rq) -+{ -+ return NULL; -+} -+#endif -+ -+static inline int cpu_of(const struct rq *rq) -+{ -+#ifdef CONFIG_SMP -+ return rq->cpu; -+#else -+ return 0; -+#endif -+} -+ -+#include "stats.h" -+ -+#ifdef 
CONFIG_IRQ_TIME_ACCOUNTING -+struct irqtime { -+ u64 total; -+ u64 tick_delta; -+ u64 irq_start_time; -+ struct u64_stats_sync sync; -+}; -+ -+DECLARE_PER_CPU(struct irqtime, cpu_irqtime); -+ -+/* -+ * Returns the irqtime minus the softirq time computed by ksoftirqd. -+ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime -+ * and never move forward. -+ */ -+static inline u64 irq_time_read(int cpu) -+{ -+ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); -+ unsigned int seq; -+ u64 total; -+ -+ do { -+ seq = __u64_stats_fetch_begin(&irqtime->sync); -+ total = irqtime->total; -+ } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); -+ -+ return total; -+} -+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ -+ -+#ifdef CONFIG_CPU_FREQ -+DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); -+ -+/** -+ * cpufreq_update_util - Take a note about CPU utilization changes. -+ * @rq: Runqueue to carry out the update for. -+ * @flags: Update reason flags. -+ * -+ * This function is called by the scheduler on the CPU whose utilization is -+ * being updated. -+ * -+ * It can only be called from RCU-sched read-side critical sections. -+ * -+ * The way cpufreq is currently arranged requires it to evaluate the CPU -+ * performance state (frequency/voltage) on a regular basis to prevent it from -+ * being stuck in a completely inadequate performance level for too long. -+ * That is not guaranteed to happen if the updates are only triggered from CFS -+ * and DL, though, because they may not be coming in if only RT tasks are -+ * active all the time (or there are RT tasks only). -+ * -+ * As a workaround for that issue, this function is called periodically by the -+ * RT sched class to trigger extra cpufreq updates to prevent it from stalling, -+ * but that really is a band-aid. Going forward it should be replaced with -+ * solutions targeted more specifically at RT tasks. 
-+ */ -+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) -+{ -+ struct update_util_data *data; -+ -+ data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); -+ if (data) -+ data->func(data, rq_clock(rq), flags); -+} -+ -+static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) -+{ -+ if (cpu_of(rq) == smp_processor_id()) -+ cpufreq_update_util(rq, flags); -+} -+#else -+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} -+static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {} -+#endif /* CONFIG_CPU_FREQ */ -+ -+#ifdef CONFIG_NO_HZ_FULL -+extern int __init sched_tick_offload_init(void); -+#else -+static inline int sched_tick_offload_init(void) { return 0; } -+#endif -+ -+#ifdef arch_scale_freq_capacity -+#ifndef arch_scale_freq_invariant -+#define arch_scale_freq_invariant() (true) -+#endif -+#else /* arch_scale_freq_capacity */ -+#define arch_scale_freq_invariant() (false) -+#endif -+ -+extern void schedule_idle(void); -+ -+/* -+ * !! For sched_setattr_nocheck() (kernel) only !! -+ * -+ * This is actually gross. :( -+ * -+ * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE -+ * tasks, but still be able to sleep. We need this on platforms that cannot -+ * atomically change clock frequency. Remove once fast switching will be -+ * available on such platforms. -+ * -+ * SUGOV stands for SchedUtil GOVernor. -+ */ -+#define SCHED_FLAG_SUGOV 0x10000000 -+ -+#ifdef CONFIG_MEMBARRIER -+/* -+ * The scheduler provides memory barriers required by membarrier between: -+ * - prior user-space memory accesses and store to rq->membarrier_state, -+ * - store to rq->membarrier_state and following user-space memory accesses. -+ * In the same way it provides those guarantees around store to rq->curr. 
-+ */ -+static inline void membarrier_switch_mm(struct rq *rq, -+ struct mm_struct *prev_mm, -+ struct mm_struct *next_mm) -+{ -+ int membarrier_state; -+ -+ if (prev_mm == next_mm) -+ return; -+ -+ membarrier_state = atomic_read(&next_mm->membarrier_state); -+ if (READ_ONCE(rq->membarrier_state) == membarrier_state) -+ return; -+ -+ WRITE_ONCE(rq->membarrier_state, membarrier_state); -+} -+#else -+static inline void membarrier_switch_mm(struct rq *rq, -+ struct mm_struct *prev_mm, -+ struct mm_struct *next_mm) -+{ -+} -+#endif -+ -+#ifdef CONFIG_NUMA -+extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); -+#else -+static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) -+{ -+ return nr_cpu_ids; -+} -+#endif -+ -+void swake_up_all_locked(struct swait_queue_head *q); -+void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); -+ -+#endif /* PDS_SCHED_H */ -diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c -index b647d04d9c8b..05b6cfd91842 100644 ---- a/kernel/sched/pelt.c -+++ b/kernel/sched/pelt.c -@@ -250,6 +250,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load) - WRITE_ONCE(sa->util_avg, sa->util_sum / divider); - } - -+#ifndef CONFIG_SCHED_PDS - /* - * sched_entity: - * -@@ -367,6 +368,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) - - return 0; - } -+#endif - - #ifdef CONFIG_SCHED_THERMAL_PRESSURE - /* -diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h -index eb034d9f024d..a074572f2976 100644 ---- a/kernel/sched/pelt.h -+++ b/kernel/sched/pelt.h -@@ -1,11 +1,13 @@ - #ifdef CONFIG_SMP - #include "sched-pelt.h" - -+#ifndef CONFIG_SCHED_PDS - int __update_load_avg_blocked_se(u64 now, struct sched_entity *se); - int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se); - int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); - int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); - int update_dl_rq_load_avg(u64 now, 
struct rq *rq, int running); -+#endif - - #ifdef CONFIG_SCHED_THERMAL_PRESSURE - int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity); -@@ -37,6 +39,7 @@ update_irq_load_avg(struct rq *rq, u64 running) - } - #endif - -+#ifndef CONFIG_SCHED_PDS - /* - * When a task is dequeued, its estimated utilization should not be update if - * its util_avg has not been updated at least once. -@@ -157,9 +160,11 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) - return rq_clock_pelt(rq_of(cfs_rq)); - } - #endif -+#endif /* CONFIG_SCHED_PDS */ - - #else - -+#ifndef CONFIG_SCHED_PDS - static inline int - update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) - { -@@ -188,6 +193,7 @@ static inline u64 thermal_load_avg(struct rq *rq) - { - return 0; - } -+#endif - - static inline int - update_irq_load_avg(struct rq *rq, u64 running) -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index db3a57675ccf..5a8060bd2343 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -2,6 +2,10 @@ - /* - * Scheduler internal types and methods: - */ -+#ifdef CONFIG_SCHED_PDS -+#include "pds_sched.h" -+#else -+ - #include - - #include -@@ -2546,3 +2550,5 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) - - void swake_up_all_locked(struct swait_queue_head *q); - void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); -+ -+#endif /* !CONFIG_SCHED_PDS */ -diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c -index 750fb3c67eed..45bd43942575 100644 ---- a/kernel/sched/stats.c -+++ b/kernel/sched/stats.c -@@ -22,8 +22,10 @@ static int show_schedstat(struct seq_file *seq, void *v) - } else { - struct rq *rq; - #ifdef CONFIG_SMP -+#ifndef CONFIG_SCHED_PDS - struct sched_domain *sd; - int dcount = 0; -+#endif - #endif - cpu = (unsigned long)(v - 2); - rq = cpu_rq(cpu); -@@ -40,6 +42,7 @@ static int show_schedstat(struct seq_file *seq, void *v) - seq_printf(seq, "\n"); - - #ifdef CONFIG_SMP -+#ifndef CONFIG_SCHED_PDS - /* 
domain-specific stats */ - rcu_read_lock(); - for_each_domain(cpu, sd) { -@@ -68,6 +71,7 @@ static int show_schedstat(struct seq_file *seq, void *v) - sd->ttwu_move_balance); - } - rcu_read_unlock(); -+#endif - #endif - } - return 0; -diff --git a/kernel/sysctl.c b/kernel/sysctl.c -index 8a176d8727a3..b9dde576b576 100644 ---- a/kernel/sysctl.c -+++ b/kernel/sysctl.c -@@ -130,8 +130,12 @@ static int __maybe_unused four = 4; - static unsigned long zero_ul; - static unsigned long one_ul = 1; - static unsigned long long_max = LONG_MAX; --static int one_hundred = 100; --static int one_thousand = 1000; -+static int __read_mostly one_hundred = 100; -+static int __read_mostly one_thousand = 1000; -+#ifdef CONFIG_SCHED_PDS -+extern int rr_interval; -+extern int sched_yield_type; -+#endif - #ifdef CONFIG_PRINTK - static int ten_thousand = 10000; - #endif -@@ -288,7 +292,7 @@ static struct ctl_table sysctl_base_table[] = { - { } - }; - --#ifdef CONFIG_SCHED_DEBUG -+#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_PDS) - static int min_sched_granularity_ns = 100000; /* 100 usecs */ - static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ - static int min_wakeup_granularity_ns; /* 0 usecs */ -@@ -305,6 +309,7 @@ static int max_extfrag_threshold = 1000; - #endif - - static struct ctl_table kern_table[] = { -+#ifndef CONFIG_SCHED_PDS - { - .procname = "sched_child_runs_first", - .data = &sysctl_sched_child_runs_first, -@@ -486,6 +491,7 @@ static struct ctl_table kern_table[] = { - .extra2 = SYSCTL_ONE, - }, - #endif -+#endif /* !CONFIG_SCHED_PDS */ - #ifdef CONFIG_PROVE_LOCKING - { - .procname = "prove_locking", -@@ -1049,6 +1055,26 @@ static struct ctl_table kern_table[] = { - .proc_handler = proc_dointvec, - }, - #endif -+#ifdef CONFIG_SCHED_PDS -+ { -+ .procname = "rr_interval", -+ .data = &rr_interval, -+ .maxlen = sizeof (int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = SYSCTL_ONE, -+ .extra2 = &one_thousand, -+ }, -+ { -+ 
.procname = "yield_type", -+ .data = &sched_yield_type, -+ .maxlen = sizeof (int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = SYSCTL_ZERO, -+ .extra2 = &two, -+ }, -+#endif - #if defined(CONFIG_S390) && defined(CONFIG_SMP) - { - .procname = "spin_retry", -diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c -index 2fd3b3fa68bf..6f3b08afdd4c 100644 ---- a/kernel/time/posix-cpu-timers.c -+++ b/kernel/time/posix-cpu-timers.c -@@ -236,7 +236,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples) - u64 stime, utime; - - task_cputime(p, &utime, &stime); -- store_samples(samples, stime, utime, p->se.sum_exec_runtime); -+ store_samples(samples, stime, utime, tsk_seruntime(p)); - } - - static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, -@@ -806,6 +806,7 @@ static void collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples, - } - } - -+#ifndef CONFIG_SCHED_PDS - static inline void check_dl_overrun(struct task_struct *tsk) - { - if (tsk->dl.dl_overrun) { -@@ -813,6 +814,7 @@ static inline void check_dl_overrun(struct task_struct *tsk) - __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); - } - } -+#endif - - static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard) - { -@@ -840,8 +842,10 @@ static void check_thread_timers(struct task_struct *tsk, - u64 samples[CPUCLOCK_MAX]; - unsigned long soft; - -+#ifndef CONFIG_SCHED_PDS - if (dl_task(tsk)) - check_dl_overrun(tsk); -+#endif - - if (expiry_cache_is_inactive(pct)) - return; -@@ -855,7 +859,7 @@ static void check_thread_timers(struct task_struct *tsk, - soft = task_rlimit(tsk, RLIMIT_RTTIME); - if (soft != RLIM_INFINITY) { - /* Task RT timeout is accounted in jiffies. 
RTTIME is usec */ -- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); -+ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ); - unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); - - /* At the hard limit, send SIGKILL. No further action. */ -@@ -1091,8 +1095,10 @@ static inline bool fastpath_timer_check(struct task_struct *tsk) - return true; - } - -+#ifndef CONFIG_SCHED_PDS - if (dl_task(tsk) && tsk->dl.dl_overrun) - return true; -+#endif - - return false; - } -diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c -index b5e3496cf803..0816db0b9c16 100644 ---- a/kernel/trace/trace_selftest.c -+++ b/kernel/trace/trace_selftest.c -@@ -1048,10 +1048,15 @@ static int trace_wakeup_test_thread(void *data) - { - /* Make this a -deadline thread */ - static const struct sched_attr attr = { -+#ifdef CONFIG_SCHED_PDS -+ /* No deadline on BFS, use RR */ -+ .sched_policy = SCHED_RR, -+#else - .sched_policy = SCHED_DEADLINE, - .sched_runtime = 100000ULL, - .sched_deadline = 10000000ULL, - .sched_period = 10000000ULL -+#endif - }; - struct wakeup_test_data *x = data; - diff --git a/linux57-tkg/linux57-tkg-patches/0006-add-acs-overrides_iommu.patch b/linux57-tkg/linux57-tkg-patches/0006-add-acs-overrides_iommu.patch deleted file mode 100644 index d1303a5..0000000 --- a/linux57-tkg/linux57-tkg-patches/0006-add-acs-overrides_iommu.patch +++ /dev/null @@ -1,193 +0,0 @@ -From cdeab384f48dd9c88e2dff2e9ad8d57dca1a1b1c Mon Sep 17 00:00:00 2001 -From: Mark Weiman -Date: Sun, 12 Aug 2018 11:36:21 -0400 -Subject: [PATCH] pci: Enable overrides for missing ACS capabilities - -This an updated version of Alex Williamson's patch from: -https://lkml.org/lkml/2013/5/30/513 - -Original commit message follows: - -PCIe ACS (Access Control Services) is the PCIe 2.0+ feature that -allows us to control whether transactions are allowed to be redirected -in various subnodes of a PCIe topology. 
For instance, if two -endpoints are below a root port or downsteam switch port, the -downstream port may optionally redirect transactions between the -devices, bypassing upstream devices. The same can happen internally -on multifunction devices. The transaction may never be visible to the -upstream devices. - -One upstream device that we particularly care about is the IOMMU. If -a redirection occurs in the topology below the IOMMU, then the IOMMU -cannot provide isolation between devices. This is why the PCIe spec -encourages topologies to include ACS support. Without it, we have to -assume peer-to-peer DMA within a hierarchy can bypass IOMMU isolation. - -Unfortunately, far too many topologies do not support ACS to make this -a steadfast requirement. Even the latest chipsets from Intel are only -sporadically supporting ACS. We have trouble getting interconnect -vendors to include the PCIe spec required PCIe capability, let alone -suggested features. - -Therefore, we need to add some flexibility. The pcie_acs_override= -boot option lets users opt-in specific devices or sets of devices to -assume ACS support. The "downstream" option assumes full ACS support -on root ports and downstream switch ports. The "multifunction" -option assumes the subset of ACS features available on multifunction -endpoints and upstream switch ports are supported. The "id:nnnn:nnnn" -option enables ACS support on devices matching the provided vendor -and device IDs, allowing more strategic ACS overrides. These options -may be combined in any order. A maximum of 16 id specific overrides -are available. It's suggested to use the most limited set of options -necessary to avoid completely disabling ACS across the topology. -Note to hardware vendors, we have facilities to permanently quirk -specific devices which enforce isolation but not provide an ACS -capability. Please contact me to have your devices added and save -your customers the hassle of this boot option. 
- -Signed-off-by: Mark Weiman ---- - .../admin-guide/kernel-parameters.txt | 9 ++ - drivers/pci/quirks.c | 101 ++++++++++++++++++ - 2 files changed, 110 insertions(+) - -diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index aefd358a5ca3..173b3596fd9e 100644 ---- a/Documentation/admin-guide/kernel-parameters.txt -+++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -3190,6 +3190,15 @@ - nomsi [MSI] If the PCI_MSI kernel config parameter is - enabled, this kernel boot option can be used to - disable the use of MSI interrupts system-wide. -+ pcie_acs_override = -+ [PCIE] Override missing PCIe ACS support for: -+ downstream -+ All downstream ports - full ACS capabilities -+ multifunction -+ All multifunction devices - multifunction ACS subset -+ id:nnnn:nnnn -+ Specific device - full ACS capabilities -+ Specified as vid:did (vendor/device ID) in hex - noioapicquirk [APIC] Disable all boot interrupt quirks. - Safety option to keep boot IRQs enabled. This - should never be necessary. 
-diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c -index 4700d24e5d55..8f7a3d7fd9c1 100644 ---- a/drivers/pci/quirks.c -+++ b/drivers/pci/quirks.c -@@ -3372,6 +3372,106 @@ static void quirk_no_bus_reset(struct pci_dev *dev) - dev->dev_flags |= PCI_DEV_FLAGS_NO_BUS_RESET; - } - -+static bool acs_on_downstream; -+static bool acs_on_multifunction; -+ -+#define NUM_ACS_IDS 16 -+struct acs_on_id { -+ unsigned short vendor; -+ unsigned short device; -+}; -+static struct acs_on_id acs_on_ids[NUM_ACS_IDS]; -+static u8 max_acs_id; -+ -+static __init int pcie_acs_override_setup(char *p) -+{ -+ if (!p) -+ return -EINVAL; -+ -+ while (*p) { -+ if (!strncmp(p, "downstream", 10)) -+ acs_on_downstream = true; -+ if (!strncmp(p, "multifunction", 13)) -+ acs_on_multifunction = true; -+ if (!strncmp(p, "id:", 3)) { -+ char opt[5]; -+ int ret; -+ long val; -+ -+ if (max_acs_id >= NUM_ACS_IDS - 1) { -+ pr_warn("Out of PCIe ACS override slots (%d)\n", -+ NUM_ACS_IDS); -+ goto next; -+ } -+ -+ p += 3; -+ snprintf(opt, 5, "%s", p); -+ ret = kstrtol(opt, 16, &val); -+ if (ret) { -+ pr_warn("PCIe ACS ID parse error %d\n", ret); -+ goto next; -+ } -+ acs_on_ids[max_acs_id].vendor = val; -+ -+ p += strcspn(p, ":"); -+ if (*p != ':') { -+ pr_warn("PCIe ACS invalid ID\n"); -+ goto next; -+ } -+ -+ p++; -+ snprintf(opt, 5, "%s", p); -+ ret = kstrtol(opt, 16, &val); -+ if (ret) { -+ pr_warn("PCIe ACS ID parse error %d\n", ret); -+ goto next; -+ } -+ acs_on_ids[max_acs_id].device = val; -+ max_acs_id++; -+ } -+next: -+ p += strcspn(p, ","); -+ if (*p == ',') -+ p++; -+ } -+ -+ if (acs_on_downstream || acs_on_multifunction || max_acs_id) -+ pr_warn("Warning: PCIe ACS overrides enabled; This may allow non-IOMMU protected peer-to-peer DMA\n"); -+ -+ return 0; -+} -+early_param("pcie_acs_override", pcie_acs_override_setup); -+ -+static int pcie_acs_overrides(struct pci_dev *dev, u16 acs_flags) -+{ -+ int i; -+ -+ /* Never override ACS for legacy devices or devices with ACS caps */ -+ if 
(!pci_is_pcie(dev) || -+ pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ACS)) -+ return -ENOTTY; -+ -+ for (i = 0; i < max_acs_id; i++) -+ if (acs_on_ids[i].vendor == dev->vendor && -+ acs_on_ids[i].device == dev->device) -+ return 1; -+ -+ switch (pci_pcie_type(dev)) { -+ case PCI_EXP_TYPE_DOWNSTREAM: -+ case PCI_EXP_TYPE_ROOT_PORT: -+ if (acs_on_downstream) -+ return 1; -+ break; -+ case PCI_EXP_TYPE_ENDPOINT: -+ case PCI_EXP_TYPE_UPSTREAM: -+ case PCI_EXP_TYPE_LEG_END: -+ case PCI_EXP_TYPE_RC_END: -+ if (acs_on_multifunction && dev->multifunction) -+ return 1; -+ } -+ -+ return -ENOTTY; -+} - /* - * Some Atheros AR9xxx and QCA988x chips do not behave after a bus reset. - * The device will throw a Link Down error on AER-capable systems and -@@ -4513,6 +4613,7 @@ static const struct pci_dev_acs_enabled { - { PCI_VENDOR_ID_ZHAOXIN, 0x9083, pci_quirk_mf_endpoint_acs }, - /* Zhaoxin Root/Downstream Ports */ - { PCI_VENDOR_ID_ZHAOXIN, PCI_ANY_ID, pci_quirk_zhaoxin_pcie_ports_acs }, -+ { PCI_ANY_ID, PCI_ANY_ID, pcie_acs_overrides }, - { 0 } - }; - - diff --git a/linux57-tkg/linux57-tkg-patches/0007-v5.7-fsync.patch b/linux57-tkg/linux57-tkg-patches/0007-v5.7-fsync.patch deleted file mode 100644 index 01c86d8..0000000 --- a/linux57-tkg/linux57-tkg-patches/0007-v5.7-fsync.patch +++ /dev/null @@ -1,908 +0,0 @@ -From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 -From: Tk-Glitch -Date: Mon, 20 Apr 2020 14:09:11 +0200 -Subject: Import Fsync v3 patchset - Squashed from https://gitlab.collabora.com/tonyk/linux/-/commits/futex-proton-v3 - -diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h -index a89eb0accd5e2ee527be1e3e11b1117ff5bf94b4..580001e89c6caed57dd8b3cb491d65dce846caff 100644 ---- a/include/uapi/linux/futex.h -+++ b/include/uapi/linux/futex.h -@@ -21,6 +21,7 @@ - #define FUTEX_WAKE_BITSET 10 - #define FUTEX_WAIT_REQUEUE_PI 11 - #define FUTEX_CMP_REQUEUE_PI 12 -+#define FUTEX_WAIT_MULTIPLE 13 - - #define FUTEX_PRIVATE_FLAG 128 - 
#define FUTEX_CLOCK_REALTIME 256 -@@ -40,6 +41,8 @@ - FUTEX_PRIVATE_FLAG) - #define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \ - FUTEX_PRIVATE_FLAG) -+#define FUTEX_WAIT_MULTIPLE_PRIVATE (FUTEX_WAIT_MULTIPLE | \ -+ FUTEX_PRIVATE_FLAG) - - /* - * Support for robust futexes: the kernel cleans up held futexes at -@@ -150,4 +153,21 @@ struct robust_list_head { - (((op & 0xf) << 28) | ((cmp & 0xf) << 24) \ - | ((oparg & 0xfff) << 12) | (cmparg & 0xfff)) - -+/* -+ * Maximum number of multiple futexes to wait for -+ */ -+#define FUTEX_MULTIPLE_MAX_COUNT 128 -+ -+/** -+ * struct futex_wait_block - Block of futexes to be waited for -+ * @uaddr: User address of the futex -+ * @val: Futex value expected by userspace -+ * @bitset: Bitset for the optional bitmasked wakeup -+ */ -+struct futex_wait_block { -+ __u32 __user *uaddr; -+ __u32 val; -+ __u32 bitset; -+}; -+ - #endif /* _UAPI_LINUX_FUTEX_H */ -diff --git a/kernel/futex.c b/kernel/futex.c -index 0cf84c8664f207c574325b899ef2e57f01295a94..58cf9eb2b851b4858e29b5ef4114a29a92e676ba 100644 ---- a/kernel/futex.c -+++ b/kernel/futex.c -@@ -215,6 +215,8 @@ struct futex_pi_state { - * @rt_waiter: rt_waiter storage for use with requeue_pi - * @requeue_pi_key: the requeue_pi target futex key - * @bitset: bitset for the optional bitmasked wakeup -+ * @uaddr: userspace address of futex -+ * @uval: expected futex's value - * - * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so - * we can wake only the relevant ones (hashed queues may be shared). 
-@@ -237,6 +239,8 @@ struct futex_q { - struct rt_mutex_waiter *rt_waiter; - union futex_key *requeue_pi_key; - u32 bitset; -+ u32 __user *uaddr; -+ u32 uval; - } __randomize_layout; - - static const struct futex_q futex_q_init = { -@@ -2420,6 +2424,29 @@ static int unqueue_me(struct futex_q *q) - return ret; - } - -+/** -+ * unqueue_multiple() - Remove several futexes from their futex_hash_bucket -+ * @q: The list of futexes to unqueue -+ * @count: Number of futexes in the list -+ * -+ * Helper to unqueue a list of futexes. This can't fail. -+ * -+ * Return: -+ * - >=0 - Index of the last futex that was awoken; -+ * - -1 - If no futex was awoken -+ */ -+static int unqueue_multiple(struct futex_q *q, int count) -+{ -+ int ret = -1; -+ int i; -+ -+ for (i = 0; i < count; i++) { -+ if (!unqueue_me(&q[i])) -+ ret = i; -+ } -+ return ret; -+} -+ - /* - * PI futexes can not be requeued and must remove themself from the - * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry -@@ -2783,6 +2810,211 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, - return ret; - } - -+/** -+ * futex_wait_multiple_setup() - Prepare to wait and enqueue multiple futexes -+ * @qs: The corresponding futex list -+ * @count: The size of the lists -+ * @flags: Futex flags (FLAGS_SHARED, etc.) -+ * @awaken: Index of the last awoken futex -+ * -+ * Prepare multiple futexes in a single step and enqueue them. This may fail if -+ * the futex list is invalid or if any futex was already awoken. On success the -+ * task is ready to interruptible sleep. 
-+ * -+ * Return: -+ * - 1 - One of the futexes was awaken by another thread -+ * - 0 - Success -+ * - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL -+ */ -+static int futex_wait_multiple_setup(struct futex_q *qs, int count, -+ unsigned int flags, int *awaken) -+{ -+ struct futex_hash_bucket *hb; -+ int ret, i; -+ u32 uval; -+ -+ /* -+ * Enqueuing multiple futexes is tricky, because we need to -+ * enqueue each futex in the list before dealing with the next -+ * one to avoid deadlocking on the hash bucket. But, before -+ * enqueuing, we need to make sure that current->state is -+ * TASK_INTERRUPTIBLE, so we don't absorb any awake events, which -+ * cannot be done before the get_futex_key of the next key, -+ * because it calls get_user_pages, which can sleep. Thus, we -+ * fetch the list of futexes keys in two steps, by first pinning -+ * all the memory keys in the futex key, and only then we read -+ * each key and queue the corresponding futex. -+ */ -+retry: -+ for (i = 0; i < count; i++) { -+ qs[i].key = FUTEX_KEY_INIT; -+ ret = get_futex_key(qs[i].uaddr, flags & FLAGS_SHARED, -+ &qs[i].key, FUTEX_READ); -+ if (unlikely(ret)) { -+ for (--i; i >= 0; i--) -+ put_futex_key(&qs[i].key); -+ return ret; -+ } -+ } -+ -+ set_current_state(TASK_INTERRUPTIBLE); -+ -+ for (i = 0; i < count; i++) { -+ struct futex_q *q = &qs[i]; -+ -+ hb = queue_lock(q); -+ -+ ret = get_futex_value_locked(&uval, q->uaddr); -+ if (ret) { -+ /* -+ * We need to try to handle the fault, which -+ * cannot be done without sleep, so we need to -+ * undo all the work already done, to make sure -+ * we don't miss any wake ups. Therefore, clean -+ * up, handle the fault and retry from the -+ * beginning. -+ */ -+ queue_unlock(hb); -+ -+ /* -+ * Keys 0..(i-1) are implicitly put -+ * on unqueue_multiple. -+ */ -+ put_futex_key(&q->key); -+ -+ *awaken = unqueue_multiple(qs, i); -+ -+ __set_current_state(TASK_RUNNING); -+ -+ /* -+ * On a real fault, prioritize the error even if -+ * some other futex was awoken. 
Userspace gave -+ * us a bad address, -EFAULT them. -+ */ -+ ret = get_user(uval, q->uaddr); -+ if (ret) -+ return ret; -+ -+ /* -+ * Even if the page fault was handled, If -+ * something was already awaken, we can safely -+ * give up and succeed to give a hint for userspace to -+ * acquire the right futex faster. -+ */ -+ if (*awaken >= 0) -+ return 1; -+ -+ goto retry; -+ } -+ -+ if (uval != q->uval) { -+ queue_unlock(hb); -+ -+ put_futex_key(&qs[i].key); -+ -+ /* -+ * If something was already awaken, we can -+ * safely ignore the error and succeed. -+ */ -+ *awaken = unqueue_multiple(qs, i); -+ __set_current_state(TASK_RUNNING); -+ if (*awaken >= 0) -+ return 1; -+ -+ return -EWOULDBLOCK; -+ } -+ -+ /* -+ * The bucket lock can't be held while dealing with the -+ * next futex. Queue each futex at this moment so hb can -+ * be unlocked. -+ */ -+ queue_me(&qs[i], hb); -+ } -+ return 0; -+} -+ -+/** -+ * futex_wait_multiple() - Prepare to wait on and enqueue several futexes -+ * @qs: The list of futexes to wait on -+ * @op: Operation code from futex's syscall -+ * @count: The number of objects -+ * @abs_time: Timeout before giving up and returning to userspace -+ * -+ * Entry point for the FUTEX_WAIT_MULTIPLE futex operation, this function -+ * sleeps on a group of futexes and returns on the first futex that -+ * triggered, or after the timeout has elapsed. 
-+ * -+ * Return: -+ * - >=0 - Hint to the futex that was awoken -+ * - <0 - On error -+ */ -+static int futex_wait_multiple(struct futex_q *qs, int op, -+ u32 count, ktime_t *abs_time) -+{ -+ struct hrtimer_sleeper timeout, *to; -+ int ret, flags = 0, hint = 0; -+ unsigned int i; -+ -+ if (!(op & FUTEX_PRIVATE_FLAG)) -+ flags |= FLAGS_SHARED; -+ -+ if (op & FUTEX_CLOCK_REALTIME) -+ flags |= FLAGS_CLOCKRT; -+ -+ to = futex_setup_timer(abs_time, &timeout, flags, 0); -+ while (1) { -+ ret = futex_wait_multiple_setup(qs, count, flags, &hint); -+ if (ret) { -+ if (ret > 0) { -+ /* A futex was awaken during setup */ -+ ret = hint; -+ } -+ break; -+ } -+ -+ if (to) -+ hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); -+ -+ /* -+ * Avoid sleeping if another thread already tried to -+ * wake us. -+ */ -+ for (i = 0; i < count; i++) { -+ if (plist_node_empty(&qs[i].list)) -+ break; -+ } -+ -+ if (i == count && (!to || to->task)) -+ freezable_schedule(); -+ -+ ret = unqueue_multiple(qs, count); -+ -+ __set_current_state(TASK_RUNNING); -+ -+ if (ret >= 0) -+ break; -+ if (to && !to->task) { -+ ret = -ETIMEDOUT; -+ break; -+ } else if (signal_pending(current)) { -+ ret = -ERESTARTSYS; -+ break; -+ } -+ /* -+ * The final case is a spurious wakeup, for -+ * which just retry. 
-+ */ -+ } -+ -+ if (to) { -+ hrtimer_cancel(&to->timer); -+ destroy_hrtimer_on_stack(&to->timer); -+ } -+ -+ return ret; -+} -+ - static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, - ktime_t *abs_time, u32 bitset) - { -@@ -3907,6 +4139,43 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, - return -ENOSYS; - } - -+/** -+ * futex_read_wait_block - Read an array of futex_wait_block from userspace -+ * @uaddr: Userspace address of the block -+ * @count: Number of blocks to be read -+ * -+ * This function creates and allocate an array of futex_q (we zero it to -+ * initialize the fields) and then, for each futex_wait_block element from -+ * userspace, fill a futex_q element with proper values. -+ */ -+inline struct futex_q *futex_read_wait_block(u32 __user *uaddr, u32 count) -+{ -+ unsigned int i; -+ struct futex_q *qs; -+ struct futex_wait_block fwb; -+ struct futex_wait_block __user *entry = -+ (struct futex_wait_block __user *)uaddr; -+ -+ if (!count || count > FUTEX_MULTIPLE_MAX_COUNT) -+ return ERR_PTR(-EINVAL); -+ -+ qs = kcalloc(count, sizeof(*qs), GFP_KERNEL); -+ if (!qs) -+ return ERR_PTR(-ENOMEM); -+ -+ for (i = 0; i < count; i++) { -+ if (copy_from_user(&fwb, &entry[i], sizeof(fwb))) { -+ kfree(qs); -+ return ERR_PTR(-EFAULT); -+ } -+ -+ qs[i].uaddr = fwb.uaddr; -+ qs[i].uval = fwb.val; -+ qs[i].bitset = fwb.bitset; -+ } -+ -+ return qs; -+} - - SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, - struct __kernel_timespec __user *, utime, u32 __user *, uaddr2, -@@ -3919,7 +4188,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, - - if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || - cmd == FUTEX_WAIT_BITSET || -- cmd == FUTEX_WAIT_REQUEUE_PI)) { -+ cmd == FUTEX_WAIT_REQUEUE_PI || -+ cmd == FUTEX_WAIT_MULTIPLE)) { - if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) - return -EFAULT; - if (get_timespec64(&ts, utime)) -@@ -3940,6 +4210,25 @@ SYSCALL_DEFINE6(futex, u32 
__user *, uaddr, int, op, u32, val, - cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) - val2 = (u32) (unsigned long) utime; - -+ if (cmd == FUTEX_WAIT_MULTIPLE) { -+ int ret; -+ struct futex_q *qs; -+ -+#ifdef CONFIG_X86_X32 -+ if (unlikely(in_x32_syscall())) -+ return -ENOSYS; -+#endif -+ qs = futex_read_wait_block(uaddr, val); -+ -+ if (IS_ERR(qs)) -+ return PTR_ERR(qs); -+ -+ ret = futex_wait_multiple(qs, op, val, tp); -+ kfree(qs); -+ -+ return ret; -+ } -+ - return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); - } - -@@ -4102,6 +4391,57 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, - #endif /* CONFIG_COMPAT */ - - #ifdef CONFIG_COMPAT_32BIT_TIME -+/** -+ * struct compat_futex_wait_block - Block of futexes to be waited for -+ * @uaddr: User address of the futex (compatible pointer) -+ * @val: Futex value expected by userspace -+ * @bitset: Bitset for the optional bitmasked wakeup -+ */ -+struct compat_futex_wait_block { -+ compat_uptr_t uaddr; -+ __u32 val; -+ __u32 bitset; -+}; -+ -+/** -+ * compat_futex_read_wait_block - Read an array of futex_wait_block from -+ * userspace -+ * @uaddr: Userspace address of the block -+ * @count: Number of blocks to be read -+ * -+ * This function does the same as futex_read_wait_block(), except that it -+ * converts the pointer to the futex from the compat version to the regular one. 
-+ */ -+inline struct futex_q *compat_futex_read_wait_block(u32 __user *uaddr, -+ u32 count) -+{ -+ unsigned int i; -+ struct futex_q *qs; -+ struct compat_futex_wait_block fwb; -+ struct compat_futex_wait_block __user *entry = -+ (struct compat_futex_wait_block __user *)uaddr; -+ -+ if (!count || count > FUTEX_MULTIPLE_MAX_COUNT) -+ return ERR_PTR(-EINVAL); -+ -+ qs = kcalloc(count, sizeof(*qs), GFP_KERNEL); -+ if (!qs) -+ return ERR_PTR(-ENOMEM); -+ -+ for (i = 0; i < count; i++) { -+ if (copy_from_user(&fwb, &entry[i], sizeof(fwb))) { -+ kfree(qs); -+ return ERR_PTR(-EFAULT); -+ } -+ -+ qs[i].uaddr = compat_ptr(fwb.uaddr); -+ qs[i].uval = fwb.val; -+ qs[i].bitset = fwb.bitset; -+ } -+ -+ return qs; -+} -+ - SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, - struct old_timespec32 __user *, utime, u32 __user *, uaddr2, - u32, val3) -@@ -4113,7 +4453,8 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, - - if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || - cmd == FUTEX_WAIT_BITSET || -- cmd == FUTEX_WAIT_REQUEUE_PI)) { -+ cmd == FUTEX_WAIT_REQUEUE_PI || -+ cmd == FUTEX_WAIT_MULTIPLE)) { - if (get_old_timespec32(&ts, utime)) - return -EFAULT; - if (!timespec64_valid(&ts)) -@@ -4128,6 +4469,19 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, - cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) - val2 = (int) (unsigned long) utime; - -+ if (cmd == FUTEX_WAIT_MULTIPLE) { -+ int ret; -+ struct futex_q *qs = compat_futex_read_wait_block(uaddr, val); -+ -+ if (IS_ERR(qs)) -+ return PTR_ERR(qs); -+ -+ ret = futex_wait_multiple(qs, op, val, tp); -+ kfree(qs); -+ -+ return ret; -+ } -+ - return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); - } - #endif /* CONFIG_COMPAT_32BIT_TIME */ -diff --git a/tools/testing/selftests/futex/functional/futex_wait_timeout.c b/tools/testing/selftests/futex/functional/futex_wait_timeout.c -index 
ee55e6d389a3f053194435342c4e471dc7cf8786..2a63e1c2cfb6407a5988233217cff2e52787bc66 100644 ---- a/tools/testing/selftests/futex/functional/futex_wait_timeout.c -+++ b/tools/testing/selftests/futex/functional/futex_wait_timeout.c -@@ -11,6 +11,7 @@ - * - * HISTORY - * 2009-Nov-6: Initial version by Darren Hart -+ * 2019-Dec-13: Add WAIT_MULTIPLE test by Krisman - * - *****************************************************************************/ - -@@ -41,6 +42,8 @@ int main(int argc, char *argv[]) - { - futex_t f1 = FUTEX_INITIALIZER; - struct timespec to; -+ time_t secs; -+ struct futex_wait_block fwb = {&f1, f1, 0}; - int res, ret = RET_PASS; - int c; - -@@ -65,7 +68,7 @@ int main(int argc, char *argv[]) - } - - ksft_print_header(); -- ksft_set_plan(1); -+ ksft_set_plan(2); - ksft_print_msg("%s: Block on a futex and wait for timeout\n", - basename(argv[0])); - ksft_print_msg("\tArguments: timeout=%ldns\n", timeout_ns); -@@ -79,8 +82,39 @@ int main(int argc, char *argv[]) - if (!res || errno != ETIMEDOUT) { - fail("futex_wait returned %d\n", ret < 0 ? errno : ret); - ret = RET_FAIL; -+ } else -+ ksft_test_result_pass("futex_wait timeout succeeds\n"); -+ -+ info("Calling futex_wait_multiple on f1: %u @ %p\n", f1, &f1); -+ -+ /* Setup absolute time */ -+ ret = clock_gettime(CLOCK_REALTIME, &to); -+ secs = (to.tv_nsec + timeout_ns) / 1000000000; -+ to.tv_nsec = ((int64_t)to.tv_nsec + timeout_ns) % 1000000000; -+ to.tv_sec += secs; -+ info("to.tv_sec = %ld\n", to.tv_sec); -+ info("to.tv_nsec = %ld\n", to.tv_nsec); -+ -+ res = futex_wait_multiple(&fwb, 1, &to, -+ FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME); -+ -+#ifdef __ILP32__ -+ if (res == -1 && errno == ENOSYS) { -+ ksft_test_result_skip("futex_wait_multiple not supported at x32\n"); -+ } else { -+ ksft_test_result_fail("futex_wait_multiple returned %d\n", -+ res < 0 ? 
errno : res); -+ ret = RET_FAIL; - } -+#else -+ if (!res || errno != ETIMEDOUT) { -+ ksft_test_result_fail("futex_wait_multiple returned %d\n", -+ res < 0 ? errno : res); -+ ret = RET_FAIL; -+ } else -+ ksft_test_result_pass("futex_wait_multiple timeout succeeds\n"); -+#endif /* __ILP32__ */ - -- print_result(TEST_NAME, ret); -+ ksft_print_cnts(); - return ret; - } -diff --git a/tools/testing/selftests/futex/include/futextest.h b/tools/testing/selftests/futex/include/futextest.h -index ddbcfc9b7bac4aebb5bac2f249e26ecfd948aa84..bb103bef4557012ef9a389ca74c868e4476a8a31 100644 ---- a/tools/testing/selftests/futex/include/futextest.h -+++ b/tools/testing/selftests/futex/include/futextest.h -@@ -38,6 +38,14 @@ typedef volatile u_int32_t futex_t; - #ifndef FUTEX_CMP_REQUEUE_PI - #define FUTEX_CMP_REQUEUE_PI 12 - #endif -+#ifndef FUTEX_WAIT_MULTIPLE -+#define FUTEX_WAIT_MULTIPLE 13 -+struct futex_wait_block { -+ futex_t *uaddr; -+ futex_t val; -+ __u32 bitset; -+}; -+#endif - #ifndef FUTEX_WAIT_REQUEUE_PI_PRIVATE - #define FUTEX_WAIT_REQUEUE_PI_PRIVATE (FUTEX_WAIT_REQUEUE_PI | \ - FUTEX_PRIVATE_FLAG) -@@ -80,6 +88,20 @@ futex_wait(futex_t *uaddr, futex_t val, struct timespec *timeout, int opflags) - return futex(uaddr, FUTEX_WAIT, val, timeout, NULL, 0, opflags); - } - -+/** -+ * futex_wait_multiple() - block on several futexes with optional timeout -+ * @fwb: wait block user space address -+ * @count: number of entities at fwb -+ * @timeout: absolute timeout -+ */ -+static inline int -+futex_wait_multiple(struct futex_wait_block *fwb, int count, -+ struct timespec *timeout, int opflags) -+{ -+ return futex(fwb, FUTEX_WAIT_MULTIPLE, count, timeout, NULL, 0, -+ opflags); -+} -+ - /** - * futex_wake() - wake one or more tasks blocked on uaddr - * @nr_wake: wake up to this many tasks -diff --git a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c -index 
0ae390ff816449c88d0bb655a26eb014382c2b4f..bcbac042992d447e0bc9ef5fefe94e875de310f2 100644 ---- a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c -+++ b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c -@@ -12,6 +12,7 @@ - * - * HISTORY - * 2009-Nov-14: Initial version by Gowrishankar -+ * 2019-Dec-13: Add WAIT_MULTIPLE test by Krisman - * - *****************************************************************************/ - -@@ -40,6 +41,7 @@ int main(int argc, char *argv[]) - { - struct timespec to = {.tv_sec = 0, .tv_nsec = timeout_ns}; - futex_t f1 = FUTEX_INITIALIZER; -+ struct futex_wait_block fwb = {&f1, f1+1, 0}; - int res, ret = RET_PASS; - int c; - -@@ -61,7 +63,7 @@ int main(int argc, char *argv[]) - } - - ksft_print_header(); -- ksft_set_plan(1); -+ ksft_set_plan(2); - ksft_print_msg("%s: Test the unexpected futex value in FUTEX_WAIT\n", - basename(argv[0])); - -@@ -71,8 +73,30 @@ int main(int argc, char *argv[]) - fail("futex_wait returned: %d %s\n", - res ? errno : res, res ? strerror(errno) : ""); - ret = RET_FAIL; -+ } else -+ ksft_test_result_pass("futex_wait wouldblock succeeds\n"); -+ -+ info("Calling futex_wait_multiple on f1: %u @ %p with val=%u\n", -+ f1, &f1, f1+1); -+ res = futex_wait_multiple(&fwb, 1, NULL, FUTEX_PRIVATE_FLAG); -+ -+#ifdef __ILP32__ -+ if (res != -1 || errno != ENOSYS) { -+ ksft_test_result_fail("futex_wait_multiple returned %d\n", -+ res < 0 ? errno : res); -+ ret = RET_FAIL; -+ } else { -+ ksft_test_result_skip("futex_wait_multiple not supported at x32\n"); -+ } -+#else -+ if (!res || errno != EWOULDBLOCK) { -+ ksft_test_result_fail("futex_wait_multiple returned %d\n", -+ res < 0 ? 
errno : res); -+ ret = RET_FAIL; - } -+ ksft_test_result_pass("futex_wait_multiple wouldblock succeeds\n"); -+#endif /* __ILP32__ */ - -- print_result(TEST_NAME, ret); -+ ksft_print_cnts(); - return ret; - } -diff --git a/tools/testing/selftests/futex/functional/.gitignore b/tools/testing/selftests/futex/functional/.gitignore -index a09f570619023750f558c84004aff166b4337d72..4660128a545edb04a17cc6bd9760931c1386122f 100644 ---- a/tools/testing/selftests/futex/functional/.gitignore -+++ b/tools/testing/selftests/futex/functional/.gitignore -@@ -5,3 +5,4 @@ futex_wait_private_mapped_file - futex_wait_timeout - futex_wait_uninitialized_heap - futex_wait_wouldblock -+futex_wait_multiple -diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile -index 30996306cabcfe89a47977643e529b122893bb7e..75f9fface11fa3c90c1bdb9a49b3ea51291afd58 100644 ---- a/tools/testing/selftests/futex/functional/Makefile -+++ b/tools/testing/selftests/futex/functional/Makefile -@@ -14,7 +14,8 @@ TEST_GEN_FILES := \ - futex_requeue_pi_signal_restart \ - futex_requeue_pi_mismatched_ops \ - futex_wait_uninitialized_heap \ -- futex_wait_private_mapped_file -+ futex_wait_private_mapped_file \ -+ futex_wait_multiple - - TEST_PROGS := run.sh - -diff --git a/tools/testing/selftests/futex/functional/futex_wait_multiple.c b/tools/testing/selftests/futex/functional/futex_wait_multiple.c -new file mode 100644 -index 0000000000000000000000000000000000000000..b48422e79f42edba1653bb0bd2a4c4fd98d2d48d ---- /dev/null -+++ b/tools/testing/selftests/futex/functional/futex_wait_multiple.c -@@ -0,0 +1,173 @@ -+// SPDX-License-Identifier: GPL-2.0-or-later -+/****************************************************************************** -+ * -+ * Copyright © Collabora, Ltd., 2019 -+ * -+ * DESCRIPTION -+ * Test basic semantics of FUTEX_WAIT_MULTIPLE -+ * -+ * AUTHOR -+ * Gabriel Krisman Bertazi -+ * -+ * HISTORY -+ * 2019-Dec-13: Initial version by Krisman -+ * 
-+ *****************************************************************************/ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "futextest.h" -+#include "logging.h" -+ -+#define TEST_NAME "futex-wait-multiple" -+#define timeout_ns 100000 -+#define MAX_COUNT 128 -+#define WAKE_WAIT_US 3000000 -+ -+int ret = RET_PASS; -+char *progname; -+futex_t f[MAX_COUNT] = {0}; -+struct futex_wait_block fwb[MAX_COUNT]; -+ -+void usage(char *prog) -+{ -+ printf("Usage: %s\n", prog); -+ printf(" -c Use color\n"); -+ printf(" -h Display this help message\n"); -+ printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", -+ VQUIET, VCRITICAL, VINFO); -+} -+ -+void test_count_overflow(void) -+{ -+ futex_t f = FUTEX_INITIALIZER; -+ struct futex_wait_block fwb[MAX_COUNT+1]; -+ int res, i; -+ -+ ksft_print_msg("%s: Test a too big number of futexes\n", progname); -+ -+ for (i = 0; i < MAX_COUNT+1; i++) { -+ fwb[i].uaddr = &f; -+ fwb[i].val = f; -+ fwb[i].bitset = 0; -+ } -+ -+ res = futex_wait_multiple(fwb, MAX_COUNT+1, NULL, FUTEX_PRIVATE_FLAG); -+ -+#ifdef __ILP32__ -+ if (res != -1 || errno != ENOSYS) { -+ ksft_test_result_fail("futex_wait_multiple returned %d\n", -+ res < 0 ? errno : res); -+ ret = RET_FAIL; -+ } else { -+ ksft_test_result_skip("futex_wait_multiple not supported at x32\n"); -+ } -+#else -+ if (res != -1 || errno != EINVAL) { -+ ksft_test_result_fail("futex_wait_multiple returned %d\n", -+ res < 0 ? errno : res); -+ ret = RET_FAIL; -+ } else { -+ ksft_test_result_pass("futex_wait_multiple count overflow succeed\n"); -+ } -+ -+#endif /* __ILP32__ */ -+} -+ -+void *waiterfn(void *arg) -+{ -+ int res; -+ -+ res = futex_wait_multiple(fwb, MAX_COUNT, NULL, FUTEX_PRIVATE_FLAG); -+ -+#ifdef __ILP32__ -+ if (res != -1 || errno != ENOSYS) { -+ ksft_test_result_fail("futex_wait_multiple returned %d\n", -+ res < 0 ? 
errno : res); -+ ret = RET_FAIL; -+ } else { -+ ksft_test_result_skip("futex_wait_multiple not supported at x32\n"); -+ } -+#else -+ if (res < 0) -+ ksft_print_msg("waiter failed %d\n", res); -+ -+ info("futex_wait_multiple: Got hint futex %d was freed\n", res); -+#endif /* __ILP32__ */ -+ -+ return NULL; -+} -+ -+void test_fwb_wakeup(void) -+{ -+ int res, i; -+ pthread_t waiter; -+ -+ ksft_print_msg("%s: Test wake up in a list of futex\n", progname); -+ -+ for (i = 0; i < MAX_COUNT; i++) { -+ fwb[i].uaddr = &f[i]; -+ fwb[i].val = f[i]; -+ fwb[i].bitset = 0xffffffff; -+ } -+ -+ res = pthread_create(&waiter, NULL, waiterfn, NULL); -+ if (res) { -+ ksft_test_result_fail("Creating waiting thread failed"); -+ ksft_exit_fail(); -+ } -+ -+ usleep(WAKE_WAIT_US); -+ res = futex_wake(&(f[MAX_COUNT-1]), 1, FUTEX_PRIVATE_FLAG); -+ if (res != 1) { -+ ksft_test_result_fail("Failed to wake thread res=%d\n", res); -+ ksft_exit_fail(); -+ } -+ -+ pthread_join(waiter, NULL); -+ ksft_test_result_pass("%s succeed\n", __func__); -+} -+ -+int main(int argc, char *argv[]) -+{ -+ int c; -+ -+ while ((c = getopt(argc, argv, "cht:v:")) != -1) { -+ switch (c) { -+ case 'c': -+ log_color(1); -+ break; -+ case 'h': -+ usage(basename(argv[0])); -+ exit(0); -+ case 'v': -+ log_verbosity(atoi(optarg)); -+ break; -+ default: -+ usage(basename(argv[0])); -+ exit(1); -+ } -+ } -+ -+ progname = basename(argv[0]); -+ -+ ksft_print_header(); -+ ksft_set_plan(2); -+ -+ test_count_overflow(); -+ -+#ifdef __ILP32__ -+ // if it's a 32x binary, there's no futex to wakeup -+ ksft_test_result_skip("futex_wait_multiple not supported at x32\n"); -+#else -+ test_fwb_wakeup(); -+#endif /* __ILP32__ */ -+ -+ ksft_print_cnts(); -+ return ret; -+} -diff --git a/tools/testing/selftests/futex/functional/run.sh b/tools/testing/selftests/futex/functional/run.sh -index 1acb6ace1680e8f3d6b3ee2dc528c19ddfdb018e..a8be94f28ff78b4879d2d19bca5d9b0fcb26c1f8 100755 ---- a/tools/testing/selftests/futex/functional/run.sh -+++ 
b/tools/testing/selftests/futex/functional/run.sh -@@ -73,3 +73,6 @@ echo - echo - ./futex_wait_uninitialized_heap $COLOR - ./futex_wait_private_mapped_file $COLOR -+ -+echo -+./futex_wait_multiple $COLOR -diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h -index 580001e89c6caed57dd8b3cb491d65dce846caff..a3e760886b8e7e74285fdcf2caaaa6f66ad16675 100644 ---- a/include/uapi/linux/futex.h -+++ b/include/uapi/linux/futex.h -@@ -21,7 +21,7 @@ - #define FUTEX_WAKE_BITSET 10 - #define FUTEX_WAIT_REQUEUE_PI 11 - #define FUTEX_CMP_REQUEUE_PI 12 --#define FUTEX_WAIT_MULTIPLE 13 -+#define FUTEX_WAIT_MULTIPLE 31 - - #define FUTEX_PRIVATE_FLAG 128 - #define FUTEX_CLOCK_REALTIME 256 -diff --git a/kernel/futex.c b/kernel/futex.c -index 58cf9eb2b851b4858e29b5ef4114a29a92e676ba..e0bb628a5e1988dcc9ae5442a4259edc229d578d 100644 ---- a/kernel/futex.c -+++ b/kernel/futex.c -@@ -4198,7 +4198,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, - return -EINVAL; - - t = timespec64_to_ktime(ts); -- if (cmd == FUTEX_WAIT) -+ if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) - t = ktime_add_safe(ktime_get(), t); - tp = &t; - } -@@ -4399,6 +4399,7 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, - */ - struct compat_futex_wait_block { - compat_uptr_t uaddr; -+ __u32 pad; - __u32 val; - __u32 bitset; - }; -@@ -4461,7 +4462,7 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, - return -EINVAL; - - t = timespec64_to_ktime(ts); -- if (cmd == FUTEX_WAIT) -+ if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) - t = ktime_add_safe(ktime_get(), t); - tp = &t; - } diff --git a/linux57-tkg/linux57-tkg-patches/0008-5.7-bcachefs.patch b/linux57-tkg/linux57-tkg-patches/0008-5.7-bcachefs.patch deleted file mode 100644 index 4ca0a38..0000000 --- a/linux57-tkg/linux57-tkg-patches/0008-5.7-bcachefs.patch +++ /dev/null @@ -1,71085 +0,0 @@ -diff --git a/block/bio.c b/block/bio.c -index 21cbaa6a1c20..8d236b819612 100644 ---- a/block/bio.c -+++ 
b/block/bio.c -@@ -1049,6 +1049,7 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) - bio_set_flag(bio, BIO_NO_PAGE_REF); - return bio->bi_vcnt ? 0 : ret; - } -+EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages); - - static void submit_bio_wait_endio(struct bio *bio) - { -@@ -1243,6 +1244,7 @@ void bio_set_pages_dirty(struct bio *bio) - set_page_dirty_lock(bvec->bv_page); - } - } -+EXPORT_SYMBOL_GPL(bio_set_pages_dirty); - - /* - * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. -@@ -1302,6 +1304,7 @@ void bio_check_pages_dirty(struct bio *bio) - spin_unlock_irqrestore(&bio_dirty_lock, flags); - schedule_work(&bio_dirty_work); - } -+EXPORT_SYMBOL_GPL(bio_check_pages_dirty); - - void update_io_ticks(struct hd_struct *part, unsigned long now, bool end) - { -diff --git a/block/blk-core.c b/block/blk-core.c -index 9bfaee050c82..60a1a2907abf 100644 ---- a/block/blk-core.c -+++ b/block/blk-core.c -@@ -210,18 +210,23 @@ int blk_status_to_errno(blk_status_t status) - } - EXPORT_SYMBOL_GPL(blk_status_to_errno); - --static void print_req_error(struct request *req, blk_status_t status, -- const char *caller) -+const char *blk_status_to_str(blk_status_t status) - { - int idx = (__force int)status; - - if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) -- return; -+ return "(invalid error)"; -+ return blk_errors[idx].name; -+} -+EXPORT_SYMBOL_GPL(blk_status_to_str); - -+static void print_req_error(struct request *req, blk_status_t status, -+ const char *caller) -+{ - printk_ratelimited(KERN_ERR - "%s: %s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x " - "phys_seg %u prio class %u\n", -- caller, blk_errors[idx].name, -+ caller, blk_status_to_str(status), - req->rq_disk ? 
req->rq_disk->disk_name : "?", - blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)), - req->cmd_flags & ~REQ_OP_MASK, -diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig -index 6dfa653d30db..6b256291b924 100644 ---- a/drivers/md/bcache/Kconfig -+++ b/drivers/md/bcache/Kconfig -@@ -3,6 +3,7 @@ - config BCACHE - tristate "Block device as cache" - select CRC64 -+ select CLOSURES - help - Allows a block device to be used as cache for other devices; uses - a btree for indexing and the layout is optimized for SSDs. -@@ -17,12 +18,3 @@ config BCACHE_DEBUG - - Enables extra debugging tools, allows expensive runtime checks to be - turned on. -- --config BCACHE_CLOSURES_DEBUG -- bool "Debug closures" -- depends on BCACHE -- select DEBUG_FS -- help -- Keeps all active closures in a linked list and provides a debugfs -- interface to list them, which makes it possible to see asynchronous -- operations that get stuck. -diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile -index fd714628da6a..0fb1b6009da3 100644 ---- a/drivers/md/bcache/Makefile -+++ b/drivers/md/bcache/Makefile -@@ -2,6 +2,6 @@ - - obj-$(CONFIG_BCACHE) += bcache.o - --bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\ -- io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ -+bcache-y := alloc.o bset.o btree.o debug.o extents.o io.o\ -+ journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ - util.o writeback.o -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index 74a9849ea164..e03597696920 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -180,6 +180,7 @@ - - #include - #include -+#include - #include - #include - #include -@@ -192,7 +193,6 @@ - - #include "bset.h" - #include "util.h" --#include "closure.h" - - struct bucket { - atomic_t pin; -diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c -deleted file mode 100644 -index 0164a1fe94a9..000000000000 ---- 
a/drivers/md/bcache/closure.c -+++ /dev/null -@@ -1,217 +0,0 @@ --// SPDX-License-Identifier: GPL-2.0 --/* -- * Asynchronous refcounty things -- * -- * Copyright 2010, 2011 Kent Overstreet -- * Copyright 2012 Google, Inc. -- */ -- --#include --#include --#include --#include -- --#include "closure.h" -- --static inline void closure_put_after_sub(struct closure *cl, int flags) --{ -- int r = flags & CLOSURE_REMAINING_MASK; -- -- BUG_ON(flags & CLOSURE_GUARD_MASK); -- BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); -- -- if (!r) { -- if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { -- atomic_set(&cl->remaining, -- CLOSURE_REMAINING_INITIALIZER); -- closure_queue(cl); -- } else { -- struct closure *parent = cl->parent; -- closure_fn *destructor = cl->fn; -- -- closure_debug_destroy(cl); -- -- if (destructor) -- destructor(cl); -- -- if (parent) -- closure_put(parent); -- } -- } --} -- --/* For clearing flags with the same atomic op as a put */ --void closure_sub(struct closure *cl, int v) --{ -- closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); --} -- --/* -- * closure_put - decrement a closure's refcount -- */ --void closure_put(struct closure *cl) --{ -- closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); --} -- --/* -- * closure_wake_up - wake up all closures on a wait list, without memory barrier -- */ --void __closure_wake_up(struct closure_waitlist *wait_list) --{ -- struct llist_node *list; -- struct closure *cl, *t; -- struct llist_node *reverse = NULL; -- -- list = llist_del_all(&wait_list->list); -- -- /* We first reverse the list to preserve FIFO ordering and fairness */ -- reverse = llist_reverse_order(list); -- -- /* Then do the wakeups */ -- llist_for_each_entry_safe(cl, t, reverse, list) { -- closure_set_waiting(cl, 0); -- closure_sub(cl, CLOSURE_WAITING + 1); -- } --} -- --/** -- * closure_wait - add a closure to a waitlist -- * @waitlist: will own a ref on @cl, which will be released when -- * closure_wake_up() is called on 
@waitlist. -- * @cl: closure pointer. -- * -- */ --bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) --{ -- if (atomic_read(&cl->remaining) & CLOSURE_WAITING) -- return false; -- -- closure_set_waiting(cl, _RET_IP_); -- atomic_add(CLOSURE_WAITING + 1, &cl->remaining); -- llist_add(&cl->list, &waitlist->list); -- -- return true; --} -- --struct closure_syncer { -- struct task_struct *task; -- int done; --}; -- --static void closure_sync_fn(struct closure *cl) --{ -- struct closure_syncer *s = cl->s; -- struct task_struct *p; -- -- rcu_read_lock(); -- p = READ_ONCE(s->task); -- s->done = 1; -- wake_up_process(p); -- rcu_read_unlock(); --} -- --void __sched __closure_sync(struct closure *cl) --{ -- struct closure_syncer s = { .task = current }; -- -- cl->s = &s; -- continue_at(cl, closure_sync_fn, NULL); -- -- while (1) { -- set_current_state(TASK_UNINTERRUPTIBLE); -- if (s.done) -- break; -- schedule(); -- } -- -- __set_current_state(TASK_RUNNING); --} -- --#ifdef CONFIG_BCACHE_CLOSURES_DEBUG -- --static LIST_HEAD(closure_list); --static DEFINE_SPINLOCK(closure_list_lock); -- --void closure_debug_create(struct closure *cl) --{ -- unsigned long flags; -- -- BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE); -- cl->magic = CLOSURE_MAGIC_ALIVE; -- -- spin_lock_irqsave(&closure_list_lock, flags); -- list_add(&cl->all, &closure_list); -- spin_unlock_irqrestore(&closure_list_lock, flags); --} -- --void closure_debug_destroy(struct closure *cl) --{ -- unsigned long flags; -- -- BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE); -- cl->magic = CLOSURE_MAGIC_DEAD; -- -- spin_lock_irqsave(&closure_list_lock, flags); -- list_del(&cl->all); -- spin_unlock_irqrestore(&closure_list_lock, flags); --} -- --static struct dentry *closure_debug; -- --static int debug_seq_show(struct seq_file *f, void *data) --{ -- struct closure *cl; -- -- spin_lock_irq(&closure_list_lock); -- -- list_for_each_entry(cl, &closure_list, all) { -- int r = atomic_read(&cl->remaining); -- -- 
seq_printf(f, "%p: %pS -> %pS p %p r %i ", -- cl, (void *) cl->ip, cl->fn, cl->parent, -- r & CLOSURE_REMAINING_MASK); -- -- seq_printf(f, "%s%s\n", -- test_bit(WORK_STRUCT_PENDING_BIT, -- work_data_bits(&cl->work)) ? "Q" : "", -- r & CLOSURE_RUNNING ? "R" : ""); -- -- if (r & CLOSURE_WAITING) -- seq_printf(f, " W %pS\n", -- (void *) cl->waiting_on); -- -- seq_printf(f, "\n"); -- } -- -- spin_unlock_irq(&closure_list_lock); -- return 0; --} -- --static int debug_seq_open(struct inode *inode, struct file *file) --{ -- return single_open(file, debug_seq_show, NULL); --} -- --static const struct file_operations debug_ops = { -- .owner = THIS_MODULE, -- .open = debug_seq_open, -- .read = seq_read, -- .release = single_release --}; -- --void __init closure_debug_init(void) --{ -- if (!IS_ERR_OR_NULL(bcache_debug)) -- /* -- * it is unnecessary to check return value of -- * debugfs_create_file(), we should not care -- * about this. -- */ -- closure_debug = debugfs_create_file( -- "closures", 0400, bcache_debug, NULL, &debug_ops); --} --#endif -- --MODULE_AUTHOR("Kent Overstreet "); --MODULE_LICENSE("GPL"); -diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h -deleted file mode 100644 -index c88cdc4ae4ec..000000000000 ---- a/drivers/md/bcache/closure.h -+++ /dev/null -@@ -1,378 +0,0 @@ --/* SPDX-License-Identifier: GPL-2.0 */ --#ifndef _LINUX_CLOSURE_H --#define _LINUX_CLOSURE_H -- --#include --#include --#include --#include -- --/* -- * Closure is perhaps the most overused and abused term in computer science, but -- * since I've been unable to come up with anything better you're stuck with it -- * again. -- * -- * What are closures? -- * -- * They embed a refcount. The basic idea is they count "things that are in -- * progress" - in flight bios, some other thread that's doing something else - -- * anything you might want to wait on. -- * -- * The refcount may be manipulated with closure_get() and closure_put(). 
-- * closure_put() is where many of the interesting things happen, when it causes -- * the refcount to go to 0. -- * -- * Closures can be used to wait on things both synchronously and asynchronously, -- * and synchronous and asynchronous use can be mixed without restriction. To -- * wait synchronously, use closure_sync() - you will sleep until your closure's -- * refcount hits 1. -- * -- * To wait asynchronously, use -- * continue_at(cl, next_function, workqueue); -- * -- * passing it, as you might expect, the function to run when nothing is pending -- * and the workqueue to run that function out of. -- * -- * continue_at() also, critically, requires a 'return' immediately following the -- * location where this macro is referenced, to return to the calling function. -- * There's good reason for this. -- * -- * To use safely closures asynchronously, they must always have a refcount while -- * they are running owned by the thread that is running them. Otherwise, suppose -- * you submit some bios and wish to have a function run when they all complete: -- * -- * foo_endio(struct bio *bio) -- * { -- * closure_put(cl); -- * } -- * -- * closure_init(cl); -- * -- * do_stuff(); -- * closure_get(cl); -- * bio1->bi_endio = foo_endio; -- * bio_submit(bio1); -- * -- * do_more_stuff(); -- * closure_get(cl); -- * bio2->bi_endio = foo_endio; -- * bio_submit(bio2); -- * -- * continue_at(cl, complete_some_read, system_wq); -- * -- * If closure's refcount started at 0, complete_some_read() could run before the -- * second bio was submitted - which is almost always not what you want! More -- * importantly, it wouldn't be possible to say whether the original thread or -- * complete_some_read()'s thread owned the closure - and whatever state it was -- * associated with! -- * -- * So, closure_init() initializes a closure's refcount to 1 - and when a -- * closure_fn is run, the refcount will be reset to 1 first. 
-- * -- * Then, the rule is - if you got the refcount with closure_get(), release it -- * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount -- * on a closure because you called closure_init() or you were run out of a -- * closure - _always_ use continue_at(). Doing so consistently will help -- * eliminate an entire class of particularly pernicious races. -- * -- * Lastly, you might have a wait list dedicated to a specific event, and have no -- * need for specifying the condition - you just want to wait until someone runs -- * closure_wake_up() on the appropriate wait list. In that case, just use -- * closure_wait(). It will return either true or false, depending on whether the -- * closure was already on a wait list or not - a closure can only be on one wait -- * list at a time. -- * -- * Parents: -- * -- * closure_init() takes two arguments - it takes the closure to initialize, and -- * a (possibly null) parent. -- * -- * If parent is non null, the new closure will have a refcount for its lifetime; -- * a closure is considered to be "finished" when its refcount hits 0 and the -- * function to run is null. Hence -- * -- * continue_at(cl, NULL, NULL); -- * -- * returns up the (spaghetti) stack of closures, precisely like normal return -- * returns up the C stack. continue_at() with non null fn is better thought of -- * as doing a tail call. -- * -- * All this implies that a closure should typically be embedded in a particular -- * struct (which its refcount will normally control the lifetime of), and that -- * struct can very much be thought of as a stack frame. -- */ -- --struct closure; --struct closure_syncer; --typedef void (closure_fn) (struct closure *); --extern struct dentry *bcache_debug; -- --struct closure_waitlist { -- struct llist_head list; --}; -- --enum closure_state { -- /* -- * CLOSURE_WAITING: Set iff the closure is on a waitlist. 
Must be set by -- * the thread that owns the closure, and cleared by the thread that's -- * waking up the closure. -- * -- * The rest are for debugging and don't affect behaviour: -- * -- * CLOSURE_RUNNING: Set when a closure is running (i.e. by -- * closure_init() and when closure_put() runs then next function), and -- * must be cleared before remaining hits 0. Primarily to help guard -- * against incorrect usage and accidentally transferring references. -- * continue_at() and closure_return() clear it for you, if you're doing -- * something unusual you can use closure_set_dead() which also helps -- * annotate where references are being transferred. -- */ -- -- CLOSURE_BITS_START = (1U << 26), -- CLOSURE_DESTRUCTOR = (1U << 26), -- CLOSURE_WAITING = (1U << 28), -- CLOSURE_RUNNING = (1U << 30), --}; -- --#define CLOSURE_GUARD_MASK \ -- ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1) -- --#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) --#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) -- --struct closure { -- union { -- struct { -- struct workqueue_struct *wq; -- struct closure_syncer *s; -- struct llist_node list; -- closure_fn *fn; -- }; -- struct work_struct work; -- }; -- -- struct closure *parent; -- -- atomic_t remaining; -- --#ifdef CONFIG_BCACHE_CLOSURES_DEBUG --#define CLOSURE_MAGIC_DEAD 0xc054dead --#define CLOSURE_MAGIC_ALIVE 0xc054a11e -- -- unsigned int magic; -- struct list_head all; -- unsigned long ip; -- unsigned long waiting_on; --#endif --}; -- --void closure_sub(struct closure *cl, int v); --void closure_put(struct closure *cl); --void __closure_wake_up(struct closure_waitlist *list); --bool closure_wait(struct closure_waitlist *list, struct closure *cl); --void __closure_sync(struct closure *cl); -- --/** -- * closure_sync - sleep until a closure a closure has nothing left to wait on -- * -- * Sleeps until the refcount hits 1 - the thread that's running the closure owns -- * the last refcount. 
-- */ --static inline void closure_sync(struct closure *cl) --{ -- if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1) -- __closure_sync(cl); --} -- --#ifdef CONFIG_BCACHE_CLOSURES_DEBUG -- --void closure_debug_init(void); --void closure_debug_create(struct closure *cl); --void closure_debug_destroy(struct closure *cl); -- --#else -- --static inline void closure_debug_init(void) {} --static inline void closure_debug_create(struct closure *cl) {} --static inline void closure_debug_destroy(struct closure *cl) {} -- --#endif -- --static inline void closure_set_ip(struct closure *cl) --{ --#ifdef CONFIG_BCACHE_CLOSURES_DEBUG -- cl->ip = _THIS_IP_; --#endif --} -- --static inline void closure_set_ret_ip(struct closure *cl) --{ --#ifdef CONFIG_BCACHE_CLOSURES_DEBUG -- cl->ip = _RET_IP_; --#endif --} -- --static inline void closure_set_waiting(struct closure *cl, unsigned long f) --{ --#ifdef CONFIG_BCACHE_CLOSURES_DEBUG -- cl->waiting_on = f; --#endif --} -- --static inline void closure_set_stopped(struct closure *cl) --{ -- atomic_sub(CLOSURE_RUNNING, &cl->remaining); --} -- --static inline void set_closure_fn(struct closure *cl, closure_fn *fn, -- struct workqueue_struct *wq) --{ -- closure_set_ip(cl); -- cl->fn = fn; -- cl->wq = wq; -- /* between atomic_dec() in closure_put() */ -- smp_mb__before_atomic(); --} -- --static inline void closure_queue(struct closure *cl) --{ -- struct workqueue_struct *wq = cl->wq; -- /** -- * Changes made to closure, work_struct, or a couple of other structs -- * may cause work.func not pointing to the right location. 
-- */ -- BUILD_BUG_ON(offsetof(struct closure, fn) -- != offsetof(struct work_struct, func)); -- if (wq) { -- INIT_WORK(&cl->work, cl->work.func); -- BUG_ON(!queue_work(wq, &cl->work)); -- } else -- cl->fn(cl); --} -- --/** -- * closure_get - increment a closure's refcount -- */ --static inline void closure_get(struct closure *cl) --{ --#ifdef CONFIG_BCACHE_CLOSURES_DEBUG -- BUG_ON((atomic_inc_return(&cl->remaining) & -- CLOSURE_REMAINING_MASK) <= 1); --#else -- atomic_inc(&cl->remaining); --#endif --} -- --/** -- * closure_init - Initialize a closure, setting the refcount to 1 -- * @cl: closure to initialize -- * @parent: parent of the new closure. cl will take a refcount on it for its -- * lifetime; may be NULL. -- */ --static inline void closure_init(struct closure *cl, struct closure *parent) --{ -- memset(cl, 0, sizeof(struct closure)); -- cl->parent = parent; -- if (parent) -- closure_get(parent); -- -- atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); -- -- closure_debug_create(cl); -- closure_set_ip(cl); --} -- --static inline void closure_init_stack(struct closure *cl) --{ -- memset(cl, 0, sizeof(struct closure)); -- atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); --} -- --/** -- * closure_wake_up - wake up all closures on a wait list, -- * with memory barrier -- */ --static inline void closure_wake_up(struct closure_waitlist *list) --{ -- /* Memory barrier for the wait list */ -- smp_mb(); -- __closure_wake_up(list); --} -- --/** -- * continue_at - jump to another function with barrier -- * -- * After @cl is no longer waiting on anything (i.e. all outstanding refs have -- * been dropped with closure_put()), it will resume execution at @fn running out -- * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly). 
-- * -- * This is because after calling continue_at() you no longer have a ref on @cl, -- * and whatever @cl owns may be freed out from under you - a running closure fn -- * has a ref on its own closure which continue_at() drops. -- * -- * Note you are expected to immediately return after using this macro. -- */ --#define continue_at(_cl, _fn, _wq) \ --do { \ -- set_closure_fn(_cl, _fn, _wq); \ -- closure_sub(_cl, CLOSURE_RUNNING + 1); \ --} while (0) -- --/** -- * closure_return - finish execution of a closure -- * -- * This is used to indicate that @cl is finished: when all outstanding refs on -- * @cl have been dropped @cl's ref on its parent closure (as passed to -- * closure_init()) will be dropped, if one was specified - thus this can be -- * thought of as returning to the parent closure. -- */ --#define closure_return(_cl) continue_at((_cl), NULL, NULL) -- --/** -- * continue_at_nobarrier - jump to another function without barrier -- * -- * Causes @fn to be executed out of @cl, in @wq context (or called directly if -- * @wq is NULL). -- * -- * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn, -- * thus it's not safe to touch anything protected by @cl after a -- * continue_at_nobarrier(). -- */ --#define continue_at_nobarrier(_cl, _fn, _wq) \ --do { \ -- set_closure_fn(_cl, _fn, _wq); \ -- closure_queue(_cl); \ --} while (0) -- --/** -- * closure_return_with_destructor - finish execution of a closure, -- * with destructor -- * -- * Works like closure_return(), except @destructor will be called when all -- * outstanding refs on @cl have been dropped; @destructor may be used to safely -- * free the memory occupied by @cl, and it is called with the ref on the parent -- * closure still held - so @destructor could safely return an item to a -- * freelist protected by @cl's parent. 
-- */ --#define closure_return_with_destructor(_cl, _destructor) \ --do { \ -- set_closure_fn(_cl, _destructor, NULL); \ -- closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \ --} while (0) -- --/** -- * closure_call - execute @fn out of a new, uninitialized closure -- * -- * Typically used when running out of one closure, and we want to run @fn -- * asynchronously out of a new closure - @parent will then wait for @cl to -- * finish. -- */ --static inline void closure_call(struct closure *cl, closure_fn fn, -- struct workqueue_struct *wq, -- struct closure *parent) --{ -- closure_init(cl, parent); -- continue_at_nobarrier(cl, fn, wq); --} -- --#endif /* _LINUX_CLOSURE_H */ -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index d98354fa28e3..9f3e769b5a67 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -2696,7 +2696,6 @@ static int __init bcache_init(void) - goto err; - - bch_debug_init(); -- closure_debug_init(); - - bcache_is_reboot = false; - -diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h -index c029f7443190..59093f9f1793 100644 ---- a/drivers/md/bcache/util.h -+++ b/drivers/md/bcache/util.h -@@ -4,6 +4,7 @@ - #define _BCACHE_UTIL_H - - #include -+#include - #include - #include - #include -@@ -13,8 +14,6 @@ - #include - #include - --#include "closure.h" -- - #define PAGE_SECTORS (PAGE_SIZE / 512) - - struct closure; -diff --git a/fs/Kconfig b/fs/Kconfig -index f08fbbfafd9a..8502f8b7d8a7 100644 ---- a/fs/Kconfig -+++ b/fs/Kconfig -@@ -40,6 +40,7 @@ source "fs/ocfs2/Kconfig" - source "fs/btrfs/Kconfig" - source "fs/nilfs2/Kconfig" - source "fs/f2fs/Kconfig" -+source "fs/bcachefs/Kconfig" - source "fs/zonefs/Kconfig" - - config FS_DAX -diff --git a/fs/Makefile b/fs/Makefile -index 2ce5112b02c8..8e926e6bf48f 100644 ---- a/fs/Makefile -+++ b/fs/Makefile -@@ -130,6 +130,7 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/ - obj-$(CONFIG_BTRFS_FS) += btrfs/ - obj-$(CONFIG_GFS2_FS) += gfs2/ - 
obj-$(CONFIG_F2FS_FS) += f2fs/ -+obj-$(CONFIG_BCACHEFS_FS) += bcachefs/ - obj-$(CONFIG_CEPH_FS) += ceph/ - obj-$(CONFIG_PSTORE) += pstore/ - obj-$(CONFIG_EFIVAR_FS) += efivarfs/ -diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig -new file mode 100644 -index 000000000000..10abddae6a80 ---- /dev/null -+++ b/fs/bcachefs/Kconfig -@@ -0,0 +1,50 @@ -+ -+config BCACHEFS_FS -+ tristate "bcachefs filesystem support" -+ depends on BLOCK -+ select EXPORTFS -+ select CLOSURES -+ select LIBCRC32C -+ select CRC64 -+ select FS_POSIX_ACL -+ select LZ4_COMPRESS -+ select LZ4_DECOMPRESS -+ select ZLIB_DEFLATE -+ select ZLIB_INFLATE -+ select ZSTD_COMPRESS -+ select ZSTD_DECOMPRESS -+ select CRYPTO_SHA256 -+ select CRYPTO_CHACHA20 -+ select CRYPTO_POLY1305 -+ select KEYS -+ select SIXLOCKS -+ select RAID6_PQ -+ select XOR_BLOCKS -+ ---help--- -+ The bcachefs filesystem - a modern, copy on write filesystem, with -+ support for multiple devices, compression, checksumming, etc. -+ -+config BCACHEFS_QUOTA -+ bool "bcachefs quota support" -+ depends on BCACHEFS_FS -+ select QUOTACTL -+ -+config BCACHEFS_POSIX_ACL -+ bool "bcachefs POSIX ACL support" -+ depends on BCACHEFS_FS -+ select FS_POSIX_ACL -+ -+config BCACHEFS_DEBUG -+ bool "bcachefs debugging" -+ depends on BCACHEFS_FS -+ ---help--- -+ Enables many extra debugging checks and assertions. -+ -+ The resulting code will be significantly slower than normal; you -+ probably shouldn't select this option unless you're a developer. 
-+ -+config BCACHEFS_TESTS -+ bool "bcachefs unit and performance tests" -+ depends on BCACHEFS_FS -+ ---help--- -+ Include some unit and performance tests for the core btree code -diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile -new file mode 100644 -index 000000000000..d85ced62c0dd ---- /dev/null -+++ b/fs/bcachefs/Makefile -@@ -0,0 +1,59 @@ -+ -+obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o -+ -+bcachefs-y := \ -+ acl.o \ -+ alloc_background.o \ -+ alloc_foreground.o \ -+ bkey.o \ -+ bkey_methods.o \ -+ bkey_sort.o \ -+ bset.o \ -+ btree_cache.o \ -+ btree_gc.o \ -+ btree_io.o \ -+ btree_iter.o \ -+ btree_key_cache.o \ -+ btree_update_interior.o \ -+ btree_update_leaf.o \ -+ buckets.o \ -+ chardev.o \ -+ checksum.o \ -+ clock.o \ -+ compress.o \ -+ debug.o \ -+ dirent.o \ -+ disk_groups.o \ -+ ec.o \ -+ error.o \ -+ extents.o \ -+ extent_update.o \ -+ fs.o \ -+ fs-common.o \ -+ fs-ioctl.o \ -+ fs-io.o \ -+ fsck.o \ -+ inode.o \ -+ io.o \ -+ journal.o \ -+ journal_io.o \ -+ journal_reclaim.o \ -+ journal_seq_blacklist.o \ -+ keylist.o \ -+ migrate.o \ -+ move.o \ -+ movinggc.o \ -+ opts.o \ -+ quota.o \ -+ rebalance.o \ -+ recovery.o \ -+ reflink.o \ -+ replicas.o \ -+ siphash.o \ -+ super.o \ -+ super-io.o \ -+ sysfs.o \ -+ tests.o \ -+ trace.o \ -+ util.o \ -+ xattr.o -diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c -new file mode 100644 -index 000000000000..76c98ddbf628 ---- /dev/null -+++ b/fs/bcachefs/acl.c -@@ -0,0 +1,388 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#ifdef CONFIG_BCACHEFS_POSIX_ACL -+ -+#include "bcachefs.h" -+ -+#include -+#include -+#include -+#include -+#include -+ -+#include "acl.h" -+#include "fs.h" -+#include "xattr.h" -+ -+static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long) -+{ -+ return sizeof(bch_acl_header) + -+ sizeof(bch_acl_entry_short) * nr_short + -+ sizeof(bch_acl_entry) * nr_long; -+} -+ -+static inline int acl_to_xattr_type(int type) -+{ -+ switch (type) { -+ case ACL_TYPE_ACCESS: -+ return 
KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS; -+ case ACL_TYPE_DEFAULT: -+ return KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT; -+ default: -+ BUG(); -+ } -+} -+ -+/* -+ * Convert from filesystem to in-memory representation. -+ */ -+static struct posix_acl *bch2_acl_from_disk(const void *value, size_t size) -+{ -+ const void *p, *end = value + size; -+ struct posix_acl *acl; -+ struct posix_acl_entry *out; -+ unsigned count = 0; -+ -+ if (!value) -+ return NULL; -+ if (size < sizeof(bch_acl_header)) -+ goto invalid; -+ if (((bch_acl_header *)value)->a_version != -+ cpu_to_le32(BCH_ACL_VERSION)) -+ goto invalid; -+ -+ p = value + sizeof(bch_acl_header); -+ while (p < end) { -+ const bch_acl_entry *entry = p; -+ -+ if (p + sizeof(bch_acl_entry_short) > end) -+ goto invalid; -+ -+ switch (le16_to_cpu(entry->e_tag)) { -+ case ACL_USER_OBJ: -+ case ACL_GROUP_OBJ: -+ case ACL_MASK: -+ case ACL_OTHER: -+ p += sizeof(bch_acl_entry_short); -+ break; -+ case ACL_USER: -+ case ACL_GROUP: -+ p += sizeof(bch_acl_entry); -+ break; -+ default: -+ goto invalid; -+ } -+ -+ count++; -+ } -+ -+ if (p > end) -+ goto invalid; -+ -+ if (!count) -+ return NULL; -+ -+ acl = posix_acl_alloc(count, GFP_KERNEL); -+ if (!acl) -+ return ERR_PTR(-ENOMEM); -+ -+ out = acl->a_entries; -+ -+ p = value + sizeof(bch_acl_header); -+ while (p < end) { -+ const bch_acl_entry *in = p; -+ -+ out->e_tag = le16_to_cpu(in->e_tag); -+ out->e_perm = le16_to_cpu(in->e_perm); -+ -+ switch (out->e_tag) { -+ case ACL_USER_OBJ: -+ case ACL_GROUP_OBJ: -+ case ACL_MASK: -+ case ACL_OTHER: -+ p += sizeof(bch_acl_entry_short); -+ break; -+ case ACL_USER: -+ out->e_uid = make_kuid(&init_user_ns, -+ le32_to_cpu(in->e_id)); -+ p += sizeof(bch_acl_entry); -+ break; -+ case ACL_GROUP: -+ out->e_gid = make_kgid(&init_user_ns, -+ le32_to_cpu(in->e_id)); -+ p += sizeof(bch_acl_entry); -+ break; -+ } -+ -+ out++; -+ } -+ -+ BUG_ON(out != acl->a_entries + acl->a_count); -+ -+ return acl; -+invalid: -+ pr_err("invalid acl entry"); -+ return 
ERR_PTR(-EINVAL); -+} -+ -+#define acl_for_each_entry(acl, acl_e) \ -+ for (acl_e = acl->a_entries; \ -+ acl_e < acl->a_entries + acl->a_count; \ -+ acl_e++) -+ -+/* -+ * Convert from in-memory to filesystem representation. -+ */ -+static struct bkey_i_xattr * -+bch2_acl_to_xattr(struct btree_trans *trans, -+ const struct posix_acl *acl, -+ int type) -+{ -+ struct bkey_i_xattr *xattr; -+ bch_acl_header *acl_header; -+ const struct posix_acl_entry *acl_e; -+ void *outptr; -+ unsigned nr_short = 0, nr_long = 0, acl_len, u64s; -+ -+ acl_for_each_entry(acl, acl_e) { -+ switch (acl_e->e_tag) { -+ case ACL_USER: -+ case ACL_GROUP: -+ nr_long++; -+ break; -+ case ACL_USER_OBJ: -+ case ACL_GROUP_OBJ: -+ case ACL_MASK: -+ case ACL_OTHER: -+ nr_short++; -+ break; -+ default: -+ return ERR_PTR(-EINVAL); -+ } -+ } -+ -+ acl_len = bch2_acl_size(nr_short, nr_long); -+ u64s = BKEY_U64s + xattr_val_u64s(0, acl_len); -+ -+ if (u64s > U8_MAX) -+ return ERR_PTR(-E2BIG); -+ -+ xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); -+ if (IS_ERR(xattr)) -+ return xattr; -+ -+ bkey_xattr_init(&xattr->k_i); -+ xattr->k.u64s = u64s; -+ xattr->v.x_type = acl_to_xattr_type(type); -+ xattr->v.x_name_len = 0, -+ xattr->v.x_val_len = cpu_to_le16(acl_len); -+ -+ acl_header = xattr_val(&xattr->v); -+ acl_header->a_version = cpu_to_le32(BCH_ACL_VERSION); -+ -+ outptr = (void *) acl_header + sizeof(*acl_header); -+ -+ acl_for_each_entry(acl, acl_e) { -+ bch_acl_entry *entry = outptr; -+ -+ entry->e_tag = cpu_to_le16(acl_e->e_tag); -+ entry->e_perm = cpu_to_le16(acl_e->e_perm); -+ switch (acl_e->e_tag) { -+ case ACL_USER: -+ entry->e_id = cpu_to_le32( -+ from_kuid(&init_user_ns, acl_e->e_uid)); -+ outptr += sizeof(bch_acl_entry); -+ break; -+ case ACL_GROUP: -+ entry->e_id = cpu_to_le32( -+ from_kgid(&init_user_ns, acl_e->e_gid)); -+ outptr += sizeof(bch_acl_entry); -+ break; -+ -+ case ACL_USER_OBJ: -+ case ACL_GROUP_OBJ: -+ case ACL_MASK: -+ case ACL_OTHER: -+ outptr += 
sizeof(bch_acl_entry_short); -+ break; -+ } -+ } -+ -+ BUG_ON(outptr != xattr_val(&xattr->v) + acl_len); -+ -+ return xattr; -+} -+ -+struct posix_acl *bch2_get_acl(struct inode *vinode, int type) -+{ -+ struct bch_inode_info *inode = to_bch_ei(vinode); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c_xattr xattr; -+ struct posix_acl *acl = NULL; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ -+ iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, -+ &inode->ei_str_hash, inode->v.i_ino, -+ &X_SEARCH(acl_to_xattr_type(type), "", 0), -+ 0); -+ if (IS_ERR(iter)) { -+ if (PTR_ERR(iter) == -EINTR) -+ goto retry; -+ -+ if (PTR_ERR(iter) != -ENOENT) -+ acl = ERR_CAST(iter); -+ goto out; -+ } -+ -+ xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); -+ -+ acl = bch2_acl_from_disk(xattr_val(xattr.v), -+ le16_to_cpu(xattr.v->x_val_len)); -+ -+ if (!IS_ERR(acl)) -+ set_cached_acl(&inode->v, type, acl); -+out: -+ bch2_trans_exit(&trans); -+ return acl; -+} -+ -+int bch2_set_acl_trans(struct btree_trans *trans, -+ struct bch_inode_unpacked *inode_u, -+ const struct bch_hash_info *hash_info, -+ struct posix_acl *acl, int type) -+{ -+ int ret; -+ -+ if (type == ACL_TYPE_DEFAULT && -+ !S_ISDIR(inode_u->bi_mode)) -+ return acl ? -EACCES : 0; -+ -+ if (acl) { -+ struct bkey_i_xattr *xattr = -+ bch2_acl_to_xattr(trans, acl, type); -+ if (IS_ERR(xattr)) -+ return PTR_ERR(xattr); -+ -+ ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, -+ inode_u->bi_inum, &xattr->k_i, 0); -+ } else { -+ struct xattr_search_key search = -+ X_SEARCH(acl_to_xattr_type(type), "", 0); -+ -+ ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, hash_info, -+ inode_u->bi_inum, &search); -+ } -+ -+ return ret == -ENOENT ? 
0 : ret; -+} -+ -+int bch2_set_acl(struct inode *vinode, struct posix_acl *_acl, int type) -+{ -+ struct bch_inode_info *inode = to_bch_ei(vinode); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct btree_trans trans; -+ struct btree_iter *inode_iter; -+ struct bch_inode_unpacked inode_u; -+ struct posix_acl *acl; -+ umode_t mode; -+ int ret; -+ -+ mutex_lock(&inode->ei_update_lock); -+ bch2_trans_init(&trans, c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ acl = _acl; -+ -+ inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, -+ BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(inode_iter); -+ if (ret) -+ goto btree_err; -+ -+ mode = inode_u.bi_mode; -+ -+ if (type == ACL_TYPE_ACCESS) { -+ ret = posix_acl_update_mode(&inode->v, &mode, &acl); -+ if (ret) -+ goto err; -+ } -+ -+ ret = bch2_set_acl_trans(&trans, &inode_u, -+ &inode->ei_str_hash, -+ acl, type); -+ if (ret) -+ goto btree_err; -+ -+ inode_u.bi_ctime = bch2_current_time(c); -+ inode_u.bi_mode = mode; -+ -+ ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?: -+ bch2_trans_commit(&trans, NULL, -+ &inode->ei_journal_seq, -+ BTREE_INSERT_NOUNLOCK); -+btree_err: -+ if (ret == -EINTR) -+ goto retry; -+ if (unlikely(ret)) -+ goto err; -+ -+ bch2_inode_update_after_write(c, inode, &inode_u, -+ ATTR_CTIME|ATTR_MODE); -+ -+ set_cached_acl(&inode->v, type, acl); -+err: -+ bch2_trans_exit(&trans); -+ mutex_unlock(&inode->ei_update_lock); -+ -+ return ret; -+} -+ -+int bch2_acl_chmod(struct btree_trans *trans, -+ struct bch_inode_info *inode, -+ umode_t mode, -+ struct posix_acl **new_acl) -+{ -+ struct btree_iter *iter; -+ struct bkey_s_c_xattr xattr; -+ struct bkey_i_xattr *new; -+ struct posix_acl *acl; -+ int ret = 0; -+ -+ iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc, -+ &inode->ei_str_hash, inode->v.i_ino, -+ &X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0), -+ BTREE_ITER_INTENT); -+ if (IS_ERR(iter)) -+ return PTR_ERR(iter) != -ENOENT ? 
PTR_ERR(iter) : 0; -+ -+ xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); -+ -+ acl = bch2_acl_from_disk(xattr_val(xattr.v), -+ le16_to_cpu(xattr.v->x_val_len)); -+ if (IS_ERR_OR_NULL(acl)) -+ return PTR_ERR(acl); -+ -+ ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode); -+ if (ret) -+ goto err; -+ -+ new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS); -+ if (IS_ERR(new)) { -+ ret = PTR_ERR(new); -+ goto err; -+ } -+ -+ new->k.p = iter->pos; -+ bch2_trans_update(trans, iter, &new->k_i, 0); -+ *new_acl = acl; -+ acl = NULL; -+err: -+ kfree(acl); -+ return ret; -+} -+ -+#endif /* CONFIG_BCACHEFS_POSIX_ACL */ -diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h -new file mode 100644 -index 000000000000..cb62d502a7ff ---- /dev/null -+++ b/fs/bcachefs/acl.h -@@ -0,0 +1,59 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_ACL_H -+#define _BCACHEFS_ACL_H -+ -+struct bch_inode_unpacked; -+struct bch_hash_info; -+struct bch_inode_info; -+struct posix_acl; -+ -+#ifdef CONFIG_BCACHEFS_POSIX_ACL -+ -+#define BCH_ACL_VERSION 0x0001 -+ -+typedef struct { -+ __le16 e_tag; -+ __le16 e_perm; -+ __le32 e_id; -+} bch_acl_entry; -+ -+typedef struct { -+ __le16 e_tag; -+ __le16 e_perm; -+} bch_acl_entry_short; -+ -+typedef struct { -+ __le32 a_version; -+} bch_acl_header; -+ -+struct posix_acl *bch2_get_acl(struct inode *, int); -+ -+int bch2_set_acl_trans(struct btree_trans *, -+ struct bch_inode_unpacked *, -+ const struct bch_hash_info *, -+ struct posix_acl *, int); -+int bch2_set_acl(struct inode *, struct posix_acl *, int); -+int bch2_acl_chmod(struct btree_trans *, struct bch_inode_info *, -+ umode_t, struct posix_acl **); -+ -+#else -+ -+static inline int bch2_set_acl_trans(struct btree_trans *trans, -+ struct bch_inode_unpacked *inode_u, -+ const struct bch_hash_info *hash_info, -+ struct posix_acl *acl, int type) -+{ -+ return 0; -+} -+ -+static inline int bch2_acl_chmod(struct btree_trans *trans, -+ struct bch_inode_info *inode, -+ umode_t mode, -+ 
struct posix_acl **new_acl) -+{ -+ return 0; -+} -+ -+#endif /* CONFIG_BCACHEFS_POSIX_ACL */ -+ -+#endif /* _BCACHEFS_ACL_H */ -diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c -new file mode 100644 -index 000000000000..cb720ee04b86 ---- /dev/null -+++ b/fs/bcachefs/alloc_background.c -@@ -0,0 +1,1434 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "alloc_background.h" -+#include "alloc_foreground.h" -+#include "btree_cache.h" -+#include "btree_io.h" -+#include "btree_key_cache.h" -+#include "btree_update.h" -+#include "btree_update_interior.h" -+#include "btree_gc.h" -+#include "buckets.h" -+#include "clock.h" -+#include "debug.h" -+#include "ec.h" -+#include "error.h" -+#include "recovery.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+static const char * const bch2_alloc_field_names[] = { -+#define x(name, bytes) #name, -+ BCH_ALLOC_FIELDS() -+#undef x -+ NULL -+}; -+ -+static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int); -+ -+/* Ratelimiting/PD controllers */ -+ -+static void pd_controllers_update(struct work_struct *work) -+{ -+ struct bch_fs *c = container_of(to_delayed_work(work), -+ struct bch_fs, -+ pd_controllers_update); -+ struct bch_dev *ca; -+ unsigned i; -+ -+ for_each_member_device(ca, c, i) { -+ struct bch_dev_usage stats = bch2_dev_usage_read(c, ca); -+ -+ u64 free = bucket_to_sector(ca, -+ __dev_buckets_free(ca, stats)) << 9; -+ /* -+ * Bytes of internal fragmentation, which can be -+ * reclaimed by copy GC -+ */ -+ s64 fragmented = (bucket_to_sector(ca, -+ stats.buckets[BCH_DATA_USER] + -+ stats.buckets[BCH_DATA_CACHED]) - -+ (stats.sectors[BCH_DATA_USER] + -+ stats.sectors[BCH_DATA_CACHED])) << 9; -+ -+ fragmented = max(0LL, fragmented); -+ -+ bch2_pd_controller_update(&ca->copygc_pd, -+ free, fragmented, -1); -+ } -+ -+ schedule_delayed_work(&c->pd_controllers_update, -+ c->pd_controllers_update_seconds * HZ); -+} -+ 
-+/* Persistent alloc info: */ -+ -+static inline u64 get_alloc_field(const struct bch_alloc *a, -+ const void **p, unsigned field) -+{ -+ unsigned bytes = BCH_ALLOC_FIELD_BYTES[field]; -+ u64 v; -+ -+ if (!(a->fields & (1 << field))) -+ return 0; -+ -+ switch (bytes) { -+ case 1: -+ v = *((const u8 *) *p); -+ break; -+ case 2: -+ v = le16_to_cpup(*p); -+ break; -+ case 4: -+ v = le32_to_cpup(*p); -+ break; -+ case 8: -+ v = le64_to_cpup(*p); -+ break; -+ default: -+ BUG(); -+ } -+ -+ *p += bytes; -+ return v; -+} -+ -+static inline void put_alloc_field(struct bkey_i_alloc *a, void **p, -+ unsigned field, u64 v) -+{ -+ unsigned bytes = BCH_ALLOC_FIELD_BYTES[field]; -+ -+ if (!v) -+ return; -+ -+ a->v.fields |= 1 << field; -+ -+ switch (bytes) { -+ case 1: -+ *((u8 *) *p) = v; -+ break; -+ case 2: -+ *((__le16 *) *p) = cpu_to_le16(v); -+ break; -+ case 4: -+ *((__le32 *) *p) = cpu_to_le32(v); -+ break; -+ case 8: -+ *((__le64 *) *p) = cpu_to_le64(v); -+ break; -+ default: -+ BUG(); -+ } -+ -+ *p += bytes; -+} -+ -+struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) -+{ -+ struct bkey_alloc_unpacked ret = { .gen = 0 }; -+ -+ if (k.k->type == KEY_TYPE_alloc) { -+ const struct bch_alloc *a = bkey_s_c_to_alloc(k).v; -+ const void *d = a->data; -+ unsigned idx = 0; -+ -+ ret.gen = a->gen; -+ -+#define x(_name, _bits) ret._name = get_alloc_field(a, &d, idx++); -+ BCH_ALLOC_FIELDS() -+#undef x -+ } -+ return ret; -+} -+ -+void bch2_alloc_pack(struct bkey_i_alloc *dst, -+ const struct bkey_alloc_unpacked src) -+{ -+ unsigned idx = 0; -+ void *d = dst->v.data; -+ unsigned bytes; -+ -+ dst->v.fields = 0; -+ dst->v.gen = src.gen; -+ -+#define x(_name, _bits) put_alloc_field(dst, &d, idx++, src._name); -+ BCH_ALLOC_FIELDS() -+#undef x -+ -+ bytes = (void *) d - (void *) &dst->v; -+ set_bkey_val_bytes(&dst->k, bytes); -+ memset_u64s_tail(&dst->v, 0, bytes); -+} -+ -+static unsigned bch_alloc_val_u64s(const struct bch_alloc *a) -+{ -+ unsigned i, bytes = 
offsetof(struct bch_alloc, data); -+ -+ for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_FIELD_BYTES); i++) -+ if (a->fields & (1 << i)) -+ bytes += BCH_ALLOC_FIELD_BYTES[i]; -+ -+ return DIV_ROUND_UP(bytes, sizeof(u64)); -+} -+ -+const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); -+ -+ if (k.k->p.inode >= c->sb.nr_devices || -+ !c->devs[k.k->p.inode]) -+ return "invalid device"; -+ -+ /* allow for unknown fields */ -+ if (bkey_val_u64s(a.k) < bch_alloc_val_u64s(a.v)) -+ return "incorrect value size"; -+ -+ return NULL; -+} -+ -+void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); -+ const void *d = a.v->data; -+ unsigned i; -+ -+ pr_buf(out, "gen %u", a.v->gen); -+ -+ for (i = 0; i < BCH_ALLOC_FIELD_NR; i++) -+ if (a.v->fields & (1 << i)) -+ pr_buf(out, " %s %llu", -+ bch2_alloc_field_names[i], -+ get_alloc_field(a.v, &d, i)); -+} -+ -+static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id, -+ unsigned level, struct bkey_s_c k) -+{ -+ if (!level) -+ bch2_mark_key(c, k, 0, 0, NULL, 0, -+ BTREE_TRIGGER_ALLOC_READ| -+ BTREE_TRIGGER_NOATOMIC); -+ -+ return 0; -+} -+ -+int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) -+{ -+ struct bch_dev *ca; -+ unsigned i; -+ int ret = 0; -+ -+ ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_ALLOC, -+ NULL, bch2_alloc_read_fn); -+ if (ret) { -+ bch_err(c, "error reading alloc info: %i", ret); -+ return ret; -+ } -+ -+ percpu_down_write(&c->mark_lock); -+ bch2_dev_usage_from_buckets(c); -+ percpu_up_write(&c->mark_lock); -+ -+ mutex_lock(&c->bucket_clock[READ].lock); -+ for_each_member_device(ca, c, i) { -+ down_read(&ca->bucket_lock); -+ bch2_recalc_oldest_io(c, ca, READ); -+ up_read(&ca->bucket_lock); -+ } -+ mutex_unlock(&c->bucket_clock[READ].lock); -+ -+ mutex_lock(&c->bucket_clock[WRITE].lock); -+ for_each_member_device(ca, c, i) { 
-+ down_read(&ca->bucket_lock); -+ bch2_recalc_oldest_io(c, ca, WRITE); -+ up_read(&ca->bucket_lock); -+ } -+ mutex_unlock(&c->bucket_clock[WRITE].lock); -+ -+ return 0; -+} -+ -+enum alloc_write_ret { -+ ALLOC_WROTE, -+ ALLOC_NOWROTE, -+ ALLOC_END, -+}; -+ -+static int bch2_alloc_write_key(struct btree_trans *trans, -+ struct btree_iter *iter, -+ unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_s_c k; -+ struct bch_dev *ca; -+ struct bucket_array *ba; -+ struct bucket *g; -+ struct bucket_mark m; -+ struct bkey_alloc_unpacked old_u, new_u; -+ __BKEY_PADDED(k, 8) alloc_key; /* hack: */ -+ struct bkey_i_alloc *a; -+ int ret; -+retry: -+ bch2_trans_begin(trans); -+ -+ ret = bch2_btree_key_cache_flush(trans, -+ BTREE_ID_ALLOC, iter->pos); -+ if (ret) -+ goto err; -+ -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ old_u = bch2_alloc_unpack(k); -+ -+ if (iter->pos.inode >= c->sb.nr_devices || -+ !c->devs[iter->pos.inode]) -+ return ALLOC_END; -+ -+ percpu_down_read(&c->mark_lock); -+ ca = bch_dev_bkey_exists(c, iter->pos.inode); -+ ba = bucket_array(ca); -+ -+ if (iter->pos.offset >= ba->nbuckets) { -+ percpu_up_read(&c->mark_lock); -+ return ALLOC_END; -+ } -+ -+ g = &ba->b[iter->pos.offset]; -+ m = READ_ONCE(g->mark); -+ new_u = alloc_mem_to_key(g, m); -+ percpu_up_read(&c->mark_lock); -+ -+ if (!bkey_alloc_unpacked_cmp(old_u, new_u)) -+ return ALLOC_NOWROTE; -+ -+ a = bkey_alloc_init(&alloc_key.k); -+ a->k.p = iter->pos; -+ bch2_alloc_pack(a, new_u); -+ -+ bch2_trans_update(trans, iter, &a->k_i, -+ BTREE_TRIGGER_NORUN); -+ ret = bch2_trans_commit(trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_USE_RESERVE| -+ flags); -+err: -+ if (ret == -EINTR) -+ goto retry; -+ return ret; -+} -+ -+int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bch_dev *ca; -+ unsigned i; -+ int ret = 0; -+ -+ 
BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS_MIN, -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); -+ -+ for_each_rw_member(ca, c, i) { -+ unsigned first_bucket; -+ -+ percpu_down_read(&c->mark_lock); -+ first_bucket = bucket_array(ca)->first_bucket; -+ percpu_up_read(&c->mark_lock); -+ -+ bch2_btree_iter_set_pos(iter, POS(i, first_bucket)); -+ -+ while (1) { -+ ret = bch2_alloc_write_key(&trans, iter, flags); -+ if (ret < 0 || ret == ALLOC_END) -+ break; -+ if (ret == ALLOC_WROTE) -+ *wrote = true; -+ bch2_btree_iter_next_slot(iter); -+ } -+ -+ if (ret < 0) { -+ percpu_ref_put(&ca->io_ref); -+ break; -+ } -+ } -+ -+ bch2_trans_exit(&trans); -+ -+ return ret < 0 ? ret : 0; -+} -+ -+/* Bucket IO clocks: */ -+ -+static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw) -+{ -+ struct bucket_clock *clock = &c->bucket_clock[rw]; -+ struct bucket_array *buckets = bucket_array(ca); -+ struct bucket *g; -+ u16 max_last_io = 0; -+ unsigned i; -+ -+ lockdep_assert_held(&c->bucket_clock[rw].lock); -+ -+ /* Recalculate max_last_io for this device: */ -+ for_each_bucket(g, buckets) -+ max_last_io = max(max_last_io, bucket_last_io(c, g, rw)); -+ -+ ca->max_last_bucket_io[rw] = max_last_io; -+ -+ /* Recalculate global max_last_io: */ -+ max_last_io = 0; -+ -+ for_each_member_device(ca, c, i) -+ max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]); -+ -+ clock->max_last_io = max_last_io; -+} -+ -+static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw) -+{ -+ struct bucket_clock *clock = &c->bucket_clock[rw]; -+ struct bucket_array *buckets; -+ struct bch_dev *ca; -+ struct bucket *g; -+ unsigned i; -+ -+ trace_rescale_prios(c); -+ -+ for_each_member_device(ca, c, i) { -+ down_read(&ca->bucket_lock); -+ buckets = bucket_array(ca); -+ -+ for_each_bucket(g, buckets) -+ g->io_time[rw] = clock->hand - -+ bucket_last_io(c, g, rw) / 2; -+ -+ 
bch2_recalc_oldest_io(c, ca, rw); -+ -+ up_read(&ca->bucket_lock); -+ } -+} -+ -+static inline u64 bucket_clock_freq(u64 capacity) -+{ -+ return max(capacity >> 10, 2028ULL); -+} -+ -+static void bch2_inc_clock_hand(struct io_timer *timer) -+{ -+ struct bucket_clock *clock = container_of(timer, -+ struct bucket_clock, rescale); -+ struct bch_fs *c = container_of(clock, -+ struct bch_fs, bucket_clock[clock->rw]); -+ struct bch_dev *ca; -+ u64 capacity; -+ unsigned i; -+ -+ mutex_lock(&clock->lock); -+ -+ /* if clock cannot be advanced more, rescale prio */ -+ if (clock->max_last_io >= U16_MAX - 2) -+ bch2_rescale_bucket_io_times(c, clock->rw); -+ -+ BUG_ON(clock->max_last_io >= U16_MAX - 2); -+ -+ for_each_member_device(ca, c, i) -+ ca->max_last_bucket_io[clock->rw]++; -+ clock->max_last_io++; -+ clock->hand++; -+ -+ mutex_unlock(&clock->lock); -+ -+ capacity = READ_ONCE(c->capacity); -+ -+ if (!capacity) -+ return; -+ -+ /* -+ * we only increment when 0.1% of the filesystem capacity has been read -+ * or written too, this determines if it's time -+ * -+ * XXX: we shouldn't really be going off of the capacity of devices in -+ * RW mode (that will be 0 when we're RO, yet we can still service -+ * reads) -+ */ -+ timer->expire += bucket_clock_freq(capacity); -+ -+ bch2_io_timer_add(&c->io_clock[clock->rw], timer); -+} -+ -+static void bch2_bucket_clock_init(struct bch_fs *c, int rw) -+{ -+ struct bucket_clock *clock = &c->bucket_clock[rw]; -+ -+ clock->hand = 1; -+ clock->rw = rw; -+ clock->rescale.fn = bch2_inc_clock_hand; -+ clock->rescale.expire = bucket_clock_freq(c->capacity); -+ mutex_init(&clock->lock); -+} -+ -+/* Background allocator thread: */ -+ -+/* -+ * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens -+ * (marking them as invalidated on disk), then optionally issues discard -+ * commands to the newly free buckets, then puts them on the various freelists. 
-+ */ -+ -+#define BUCKET_GC_GEN_MAX 96U -+ -+/** -+ * wait_buckets_available - wait on reclaimable buckets -+ * -+ * If there aren't enough available buckets to fill up free_inc, wait until -+ * there are. -+ */ -+static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) -+{ -+ unsigned long gc_count = c->gc_count; -+ u64 available; -+ int ret = 0; -+ -+ ca->allocator_state = ALLOCATOR_BLOCKED; -+ closure_wake_up(&c->freelist_wait); -+ -+ while (1) { -+ set_current_state(TASK_INTERRUPTIBLE); -+ if (kthread_should_stop()) { -+ ret = 1; -+ break; -+ } -+ -+ if (gc_count != c->gc_count) -+ ca->inc_gen_really_needs_gc = 0; -+ -+ available = max_t(s64, 0, dev_buckets_available(c, ca) - -+ ca->inc_gen_really_needs_gc); -+ -+ if (available > fifo_free(&ca->free_inc) || -+ (available && !fifo_full(&ca->free[RESERVE_BTREE]))) -+ break; -+ -+ up_read(&c->gc_lock); -+ schedule(); -+ try_to_freeze(); -+ down_read(&c->gc_lock); -+ } -+ -+ __set_current_state(TASK_RUNNING); -+ ca->allocator_state = ALLOCATOR_RUNNING; -+ closure_wake_up(&c->freelist_wait); -+ -+ return ret; -+} -+ -+static bool bch2_can_invalidate_bucket(struct bch_dev *ca, -+ size_t bucket, -+ struct bucket_mark mark) -+{ -+ u8 gc_gen; -+ -+ if (!is_available_bucket(mark)) -+ return false; -+ -+ if (ca->buckets_nouse && -+ test_bit(bucket, ca->buckets_nouse)) -+ return false; -+ -+ gc_gen = bucket_gc_gen(ca, bucket); -+ -+ if (gc_gen >= BUCKET_GC_GEN_MAX / 2) -+ ca->inc_gen_needs_gc++; -+ -+ if (gc_gen >= BUCKET_GC_GEN_MAX) -+ ca->inc_gen_really_needs_gc++; -+ -+ return gc_gen < BUCKET_GC_GEN_MAX; -+} -+ -+/* -+ * Determines what order we're going to reuse buckets, smallest bucket_key() -+ * first. -+ * -+ * -+ * - We take into account the read prio of the bucket, which gives us an -+ * indication of how hot the data is -- we scale the prio so that the prio -+ * farthest from the clock is worth 1/8th of the closest. 
-+ * -+ * - The number of sectors of cached data in the bucket, which gives us an -+ * indication of the cost in cache misses this eviction will cause. -+ * -+ * - If hotness * sectors used compares equal, we pick the bucket with the -+ * smallest bucket_gc_gen() - since incrementing the same bucket's generation -+ * number repeatedly forces us to run mark and sweep gc to avoid generation -+ * number wraparound. -+ */ -+ -+static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca, -+ size_t b, struct bucket_mark m) -+{ -+ unsigned last_io = bucket_last_io(c, bucket(ca, b), READ); -+ unsigned max_last_io = ca->max_last_bucket_io[READ]; -+ -+ /* -+ * Time since last read, scaled to [0, 8) where larger value indicates -+ * more recently read data: -+ */ -+ unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io; -+ -+ /* How much we want to keep the data in this bucket: */ -+ unsigned long data_wantness = -+ (hotness + 1) * bucket_sectors_used(m); -+ -+ unsigned long needs_journal_commit = -+ bucket_needs_journal_commit(m, c->journal.last_seq_ondisk); -+ -+ return (data_wantness << 9) | -+ (needs_journal_commit << 8) | -+ (bucket_gc_gen(ca, b) / 16); -+} -+ -+static inline int bucket_alloc_cmp(alloc_heap *h, -+ struct alloc_heap_entry l, -+ struct alloc_heap_entry r) -+{ -+ return cmp_int(l.key, r.key) ?: -+ cmp_int(r.nr, l.nr) ?: -+ cmp_int(l.bucket, r.bucket); -+} -+ -+static inline int bucket_idx_cmp(const void *_l, const void *_r) -+{ -+ const struct alloc_heap_entry *l = _l, *r = _r; -+ -+ return cmp_int(l->bucket, r->bucket); -+} -+ -+static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct bucket_array *buckets; -+ struct alloc_heap_entry e = { 0 }; -+ size_t b, i, nr = 0; -+ -+ ca->alloc_heap.used = 0; -+ -+ mutex_lock(&c->bucket_clock[READ].lock); -+ down_read(&ca->bucket_lock); -+ -+ buckets = bucket_array(ca); -+ -+ bch2_recalc_oldest_io(c, ca, READ); -+ -+ /* -+ * Find buckets with lowest read 
priority, by building a maxheap sorted -+ * by read priority and repeatedly replacing the maximum element until -+ * all buckets have been visited. -+ */ -+ for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) { -+ struct bucket_mark m = READ_ONCE(buckets->b[b].mark); -+ unsigned long key = bucket_sort_key(c, ca, b, m); -+ -+ if (!bch2_can_invalidate_bucket(ca, b, m)) -+ continue; -+ -+ if (e.nr && e.bucket + e.nr == b && e.key == key) { -+ e.nr++; -+ } else { -+ if (e.nr) -+ heap_add_or_replace(&ca->alloc_heap, e, -+ -bucket_alloc_cmp, NULL); -+ -+ e = (struct alloc_heap_entry) { -+ .bucket = b, -+ .nr = 1, -+ .key = key, -+ }; -+ } -+ -+ cond_resched(); -+ } -+ -+ if (e.nr) -+ heap_add_or_replace(&ca->alloc_heap, e, -+ -bucket_alloc_cmp, NULL); -+ -+ for (i = 0; i < ca->alloc_heap.used; i++) -+ nr += ca->alloc_heap.data[i].nr; -+ -+ while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) { -+ nr -= ca->alloc_heap.data[0].nr; -+ heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp, NULL); -+ } -+ -+ up_read(&ca->bucket_lock); -+ mutex_unlock(&c->bucket_clock[READ].lock); -+} -+ -+static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct bucket_array *buckets = bucket_array(ca); -+ struct bucket_mark m; -+ size_t b, start; -+ -+ if (ca->fifo_last_bucket < ca->mi.first_bucket || -+ ca->fifo_last_bucket >= ca->mi.nbuckets) -+ ca->fifo_last_bucket = ca->mi.first_bucket; -+ -+ start = ca->fifo_last_bucket; -+ -+ do { -+ ca->fifo_last_bucket++; -+ if (ca->fifo_last_bucket == ca->mi.nbuckets) -+ ca->fifo_last_bucket = ca->mi.first_bucket; -+ -+ b = ca->fifo_last_bucket; -+ m = READ_ONCE(buckets->b[b].mark); -+ -+ if (bch2_can_invalidate_bucket(ca, b, m)) { -+ struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; -+ -+ heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); -+ if (heap_full(&ca->alloc_heap)) -+ break; -+ } -+ -+ cond_resched(); -+ } while (ca->fifo_last_bucket != start); -+} -+ -+static void 
find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct bucket_array *buckets = bucket_array(ca); -+ struct bucket_mark m; -+ size_t checked, i; -+ -+ for (checked = 0; -+ checked < ca->mi.nbuckets / 2; -+ checked++) { -+ size_t b = bch2_rand_range(ca->mi.nbuckets - -+ ca->mi.first_bucket) + -+ ca->mi.first_bucket; -+ -+ m = READ_ONCE(buckets->b[b].mark); -+ -+ if (bch2_can_invalidate_bucket(ca, b, m)) { -+ struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; -+ -+ heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); -+ if (heap_full(&ca->alloc_heap)) -+ break; -+ } -+ -+ cond_resched(); -+ } -+ -+ sort(ca->alloc_heap.data, -+ ca->alloc_heap.used, -+ sizeof(ca->alloc_heap.data[0]), -+ bucket_idx_cmp, NULL); -+ -+ /* remove duplicates: */ -+ for (i = 0; i + 1 < ca->alloc_heap.used; i++) -+ if (ca->alloc_heap.data[i].bucket == -+ ca->alloc_heap.data[i + 1].bucket) -+ ca->alloc_heap.data[i].nr = 0; -+} -+ -+static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) -+{ -+ size_t i, nr = 0; -+ -+ ca->inc_gen_needs_gc = 0; -+ -+ switch (ca->mi.replacement) { -+ case CACHE_REPLACEMENT_LRU: -+ find_reclaimable_buckets_lru(c, ca); -+ break; -+ case CACHE_REPLACEMENT_FIFO: -+ find_reclaimable_buckets_fifo(c, ca); -+ break; -+ case CACHE_REPLACEMENT_RANDOM: -+ find_reclaimable_buckets_random(c, ca); -+ break; -+ } -+ -+ heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL); -+ -+ for (i = 0; i < ca->alloc_heap.used; i++) -+ nr += ca->alloc_heap.data[i].nr; -+ -+ return nr; -+} -+ -+static inline long next_alloc_bucket(struct bch_dev *ca) -+{ -+ struct alloc_heap_entry e, *top = ca->alloc_heap.data; -+ -+ while (ca->alloc_heap.used) { -+ if (top->nr) { -+ size_t b = top->bucket; -+ -+ top->bucket++; -+ top->nr--; -+ return b; -+ } -+ -+ heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); -+ } -+ -+ return -1; -+} -+ -+/* -+ * returns sequence number of most recent journal entry that updated this -+ * bucket: -+ */ -+static u64 
bucket_journal_seq(struct bch_fs *c, struct bucket_mark m) -+{ -+ if (m.journal_seq_valid) { -+ u64 journal_seq = atomic64_read(&c->journal.seq); -+ u64 bucket_seq = journal_seq; -+ -+ bucket_seq &= ~((u64) U16_MAX); -+ bucket_seq |= m.journal_seq; -+ -+ if (bucket_seq > journal_seq) -+ bucket_seq -= 1 << 16; -+ -+ return bucket_seq; -+ } else { -+ return 0; -+ } -+} -+ -+static int bch2_invalidate_one_bucket2(struct btree_trans *trans, -+ struct bch_dev *ca, -+ struct btree_iter *iter, -+ u64 *journal_seq, unsigned flags) -+{ -+#if 0 -+ __BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key; -+#else -+ /* hack: */ -+ __BKEY_PADDED(k, 8) alloc_key; -+#endif -+ struct bch_fs *c = trans->c; -+ struct bkey_i_alloc *a; -+ struct bkey_alloc_unpacked u; -+ struct bucket *g; -+ struct bucket_mark m; -+ bool invalidating_cached_data; -+ size_t b; -+ int ret = 0; -+ -+ BUG_ON(!ca->alloc_heap.used || -+ !ca->alloc_heap.data[0].nr); -+ b = ca->alloc_heap.data[0].bucket; -+ -+ /* first, put on free_inc and mark as owned by allocator: */ -+ percpu_down_read(&c->mark_lock); -+ spin_lock(&c->freelist_lock); -+ -+ verify_not_on_freelist(c, ca, b); -+ -+ BUG_ON(!fifo_push(&ca->free_inc, b)); -+ -+ g = bucket(ca, b); -+ m = READ_ONCE(g->mark); -+ -+ invalidating_cached_data = m.cached_sectors != 0; -+ -+ /* -+ * If we're not invalidating cached data, we only increment the bucket -+ * gen in memory here, the incremented gen will be updated in the btree -+ * by bch2_trans_mark_pointer(): -+ */ -+ -+ if (!invalidating_cached_data) -+ bch2_invalidate_bucket(c, ca, b, &m); -+ else -+ bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0); -+ -+ spin_unlock(&c->freelist_lock); -+ percpu_up_read(&c->mark_lock); -+ -+ if (!invalidating_cached_data) -+ goto out; -+ -+ /* -+ * If the read-only path is trying to shut down, we can't be generating -+ * new btree updates: -+ */ -+ if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags)) { -+ ret = 1; -+ goto out; -+ } -+ -+ 
BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); -+ -+ bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b)); -+retry: -+ ret = bch2_btree_iter_traverse(iter); -+ if (ret) -+ return ret; -+ -+ percpu_down_read(&c->mark_lock); -+ g = bucket(ca, iter->pos.offset); -+ m = READ_ONCE(g->mark); -+ u = alloc_mem_to_key(g, m); -+ -+ percpu_up_read(&c->mark_lock); -+ -+ invalidating_cached_data = u.cached_sectors != 0; -+ -+ u.gen++; -+ u.data_type = 0; -+ u.dirty_sectors = 0; -+ u.cached_sectors = 0; -+ u.read_time = c->bucket_clock[READ].hand; -+ u.write_time = c->bucket_clock[WRITE].hand; -+ -+ a = bkey_alloc_init(&alloc_key.k); -+ a->k.p = iter->pos; -+ bch2_alloc_pack(a, u); -+ -+ bch2_trans_update(trans, iter, &a->k_i, -+ BTREE_TRIGGER_BUCKET_INVALIDATE); -+ -+ /* -+ * XXX: -+ * when using deferred btree updates, we have journal reclaim doing -+ * btree updates and thus requiring the allocator to make forward -+ * progress, and here the allocator is requiring space in the journal - -+ * so we need a journal pre-reservation: -+ */ -+ ret = bch2_trans_commit(trans, NULL, -+ invalidating_cached_data ? journal_seq : NULL, -+ BTREE_INSERT_NOUNLOCK| -+ BTREE_INSERT_NOCHECK_RW| -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_USE_RESERVE| -+ BTREE_INSERT_USE_ALLOC_RESERVE| -+ flags); -+ if (ret == -EINTR) -+ goto retry; -+out: -+ if (!ret) { -+ /* remove from alloc_heap: */ -+ struct alloc_heap_entry e, *top = ca->alloc_heap.data; -+ -+ top->bucket++; -+ top->nr--; -+ -+ if (!top->nr) -+ heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); -+ -+ /* -+ * Make sure we flush the last journal entry that updated this -+ * bucket (i.e. 
deleting the last reference) before writing to -+ * this bucket again: -+ */ -+ *journal_seq = max(*journal_seq, bucket_journal_seq(c, m)); -+ } else { -+ size_t b2; -+ -+ /* remove from free_inc: */ -+ percpu_down_read(&c->mark_lock); -+ spin_lock(&c->freelist_lock); -+ -+ bch2_mark_alloc_bucket(c, ca, b, false, -+ gc_pos_alloc(c, NULL), 0); -+ -+ BUG_ON(!fifo_pop_back(&ca->free_inc, b2)); -+ BUG_ON(b != b2); -+ -+ spin_unlock(&c->freelist_lock); -+ percpu_up_read(&c->mark_lock); -+ } -+ -+ return ret < 0 ? ret : 0; -+} -+ -+/* -+ * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc: -+ */ -+static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ u64 journal_seq = 0; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, -+ POS(ca->dev_idx, 0), -+ BTREE_ITER_CACHED| -+ BTREE_ITER_CACHED_NOFILL| -+ BTREE_ITER_INTENT); -+ -+ /* Only use nowait if we've already invalidated at least one bucket: */ -+ while (!ret && -+ !fifo_full(&ca->free_inc) && -+ ca->alloc_heap.used) -+ ret = bch2_invalidate_one_bucket2(&trans, ca, iter, &journal_seq, -+ BTREE_INSERT_GC_LOCK_HELD| -+ (!fifo_empty(&ca->free_inc) -+ ? 
BTREE_INSERT_NOWAIT : 0)); -+ -+ bch2_trans_exit(&trans); -+ -+ /* If we used NOWAIT, don't return the error: */ -+ if (!fifo_empty(&ca->free_inc)) -+ ret = 0; -+ if (ret) { -+ bch_err(ca, "error invalidating buckets: %i", ret); -+ return ret; -+ } -+ -+ if (journal_seq) -+ ret = bch2_journal_flush_seq(&c->journal, journal_seq); -+ if (ret) { -+ bch_err(ca, "journal error: %i", ret); -+ return ret; -+ } -+ -+ return 0; -+} -+ -+static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket) -+{ -+ unsigned i; -+ int ret = 0; -+ -+ while (1) { -+ set_current_state(TASK_INTERRUPTIBLE); -+ -+ spin_lock(&c->freelist_lock); -+ for (i = 0; i < RESERVE_NR; i++) { -+ -+ /* -+ * Don't strand buckets on the copygc freelist until -+ * after recovery is finished: -+ */ -+ if (!test_bit(BCH_FS_STARTED, &c->flags) && -+ i == RESERVE_MOVINGGC) -+ continue; -+ -+ if (fifo_push(&ca->free[i], bucket)) { -+ fifo_pop(&ca->free_inc, bucket); -+ -+ closure_wake_up(&c->freelist_wait); -+ ca->allocator_state = ALLOCATOR_RUNNING; -+ -+ spin_unlock(&c->freelist_lock); -+ goto out; -+ } -+ } -+ -+ if (ca->allocator_state != ALLOCATOR_BLOCKED_FULL) { -+ ca->allocator_state = ALLOCATOR_BLOCKED_FULL; -+ closure_wake_up(&c->freelist_wait); -+ } -+ -+ spin_unlock(&c->freelist_lock); -+ -+ if ((current->flags & PF_KTHREAD) && -+ kthread_should_stop()) { -+ ret = 1; -+ break; -+ } -+ -+ schedule(); -+ try_to_freeze(); -+ } -+out: -+ __set_current_state(TASK_RUNNING); -+ return ret; -+} -+ -+/* -+ * Pulls buckets off free_inc, discards them (if enabled), then adds them to -+ * freelists, waiting until there's room if necessary: -+ */ -+static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca) -+{ -+ while (!fifo_empty(&ca->free_inc)) { -+ size_t bucket = fifo_peek(&ca->free_inc); -+ -+ if (ca->mi.discard && -+ blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) -+ blkdev_issue_discard(ca->disk_sb.bdev, -+ bucket_to_sector(ca, bucket), -+ 
ca->mi.bucket_size, GFP_NOIO, 0); -+ -+ if (push_invalidated_bucket(c, ca, bucket)) -+ return 1; -+ } -+ -+ return 0; -+} -+ -+/** -+ * bch_allocator_thread - move buckets from free_inc to reserves -+ * -+ * The free_inc FIFO is populated by find_reclaimable_buckets(), and -+ * the reserves are depleted by bucket allocation. When we run out -+ * of free_inc, try to invalidate some buckets and write out -+ * prios and gens. -+ */ -+static int bch2_allocator_thread(void *arg) -+{ -+ struct bch_dev *ca = arg; -+ struct bch_fs *c = ca->fs; -+ size_t nr; -+ int ret; -+ -+ set_freezable(); -+ ca->allocator_state = ALLOCATOR_RUNNING; -+ -+ while (1) { -+ cond_resched(); -+ if (kthread_should_stop()) -+ break; -+ -+ pr_debug("discarding %zu invalidated buckets", -+ fifo_used(&ca->free_inc)); -+ -+ ret = discard_invalidated_buckets(c, ca); -+ if (ret) -+ goto stop; -+ -+ down_read(&c->gc_lock); -+ -+ ret = bch2_invalidate_buckets(c, ca); -+ if (ret) { -+ up_read(&c->gc_lock); -+ goto stop; -+ } -+ -+ if (!fifo_empty(&ca->free_inc)) { -+ up_read(&c->gc_lock); -+ continue; -+ } -+ -+ pr_debug("free_inc now empty"); -+ -+ do { -+ /* -+ * Find some buckets that we can invalidate, either -+ * they're completely unused, or only contain clean data -+ * that's been written back to the backing device or -+ * another cache tier -+ */ -+ -+ pr_debug("scanning for reclaimable buckets"); -+ -+ nr = find_reclaimable_buckets(c, ca); -+ -+ pr_debug("found %zu buckets", nr); -+ -+ trace_alloc_batch(ca, nr, ca->alloc_heap.size); -+ -+ if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) || -+ ca->inc_gen_really_needs_gc) && -+ c->gc_thread) { -+ atomic_inc(&c->kick_gc); -+ wake_up_process(c->gc_thread); -+ } -+ -+ /* -+ * If we found any buckets, we have to invalidate them -+ * before we scan for more - but if we didn't find very -+ * many we may want to wait on more buckets being -+ * available so we don't spin: -+ */ -+ if (!nr || -+ (nr < ALLOC_SCAN_BATCH(ca) && -+ 
!fifo_empty(&ca->free[RESERVE_NONE]))) { -+ ret = wait_buckets_available(c, ca); -+ if (ret) { -+ up_read(&c->gc_lock); -+ goto stop; -+ } -+ } -+ } while (!nr); -+ -+ up_read(&c->gc_lock); -+ -+ pr_debug("%zu buckets to invalidate", nr); -+ -+ /* -+ * alloc_heap is now full of newly-invalidated buckets: next, -+ * write out the new bucket gens: -+ */ -+ } -+ -+stop: -+ pr_debug("alloc thread stopping (ret %i)", ret); -+ ca->allocator_state = ALLOCATOR_STOPPED; -+ closure_wake_up(&c->freelist_wait); -+ return 0; -+} -+ -+/* Startup/shutdown (ro/rw): */ -+ -+void bch2_recalc_capacity(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ u64 capacity = 0, reserved_sectors = 0, gc_reserve; -+ unsigned bucket_size_max = 0; -+ unsigned long ra_pages = 0; -+ unsigned i, j; -+ -+ lockdep_assert_held(&c->state_lock); -+ -+ for_each_online_member(ca, c, i) { -+ struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_bdi; -+ -+ ra_pages += bdi->ra_pages; -+ } -+ -+ bch2_set_ra_pages(c, ra_pages); -+ -+ for_each_rw_member(ca, c, i) { -+ u64 dev_reserve = 0; -+ -+ /* -+ * We need to reserve buckets (from the number -+ * of currently available buckets) against -+ * foreground writes so that mainly copygc can -+ * make forward progress. -+ * -+ * We need enough to refill the various reserves -+ * from scratch - copygc will use its entire -+ * reserve all at once, then run against when -+ * its reserve is refilled (from the formerly -+ * available buckets). -+ * -+ * This reserve is just used when considering if -+ * allocations for foreground writes must wait - -+ * not -ENOSPC calculations. 
-+ */ -+ for (j = 0; j < RESERVE_NONE; j++) -+ dev_reserve += ca->free[j].size; -+ -+ dev_reserve += 1; /* btree write point */ -+ dev_reserve += 1; /* copygc write point */ -+ dev_reserve += 1; /* rebalance write point */ -+ -+ dev_reserve *= ca->mi.bucket_size; -+ -+ ca->copygc_threshold = dev_reserve; -+ -+ capacity += bucket_to_sector(ca, ca->mi.nbuckets - -+ ca->mi.first_bucket); -+ -+ reserved_sectors += dev_reserve * 2; -+ -+ bucket_size_max = max_t(unsigned, bucket_size_max, -+ ca->mi.bucket_size); -+ } -+ -+ gc_reserve = c->opts.gc_reserve_bytes -+ ? c->opts.gc_reserve_bytes >> 9 -+ : div64_u64(capacity * c->opts.gc_reserve_percent, 100); -+ -+ reserved_sectors = max(gc_reserve, reserved_sectors); -+ -+ reserved_sectors = min(reserved_sectors, capacity); -+ -+ c->capacity = capacity - reserved_sectors; -+ -+ c->bucket_size_max = bucket_size_max; -+ -+ if (c->capacity) { -+ bch2_io_timer_add(&c->io_clock[READ], -+ &c->bucket_clock[READ].rescale); -+ bch2_io_timer_add(&c->io_clock[WRITE], -+ &c->bucket_clock[WRITE].rescale); -+ } else { -+ bch2_io_timer_del(&c->io_clock[READ], -+ &c->bucket_clock[READ].rescale); -+ bch2_io_timer_del(&c->io_clock[WRITE], -+ &c->bucket_clock[WRITE].rescale); -+ } -+ -+ /* Wake up case someone was waiting for buckets */ -+ closure_wake_up(&c->freelist_wait); -+} -+ -+static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct open_bucket *ob; -+ bool ret = false; -+ -+ for (ob = c->open_buckets; -+ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); -+ ob++) { -+ spin_lock(&ob->lock); -+ if (ob->valid && !ob->on_partial_list && -+ ob->ptr.dev == ca->dev_idx) -+ ret = true; -+ spin_unlock(&ob->lock); -+ } -+ -+ return ret; -+} -+ -+/* device goes ro: */ -+void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) -+{ -+ unsigned i; -+ -+ BUG_ON(ca->alloc_thread); -+ -+ /* First, remove device from allocation groups: */ -+ -+ for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) -+ 
clear_bit(ca->dev_idx, c->rw_devs[i].d); -+ -+ /* -+ * Capacity is calculated based off of devices in allocation groups: -+ */ -+ bch2_recalc_capacity(c); -+ -+ /* Next, close write points that point to this device... */ -+ for (i = 0; i < ARRAY_SIZE(c->write_points); i++) -+ bch2_writepoint_stop(c, ca, &c->write_points[i]); -+ -+ bch2_writepoint_stop(c, ca, &ca->copygc_write_point); -+ bch2_writepoint_stop(c, ca, &c->rebalance_write_point); -+ bch2_writepoint_stop(c, ca, &c->btree_write_point); -+ -+ mutex_lock(&c->btree_reserve_cache_lock); -+ while (c->btree_reserve_cache_nr) { -+ struct btree_alloc *a = -+ &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; -+ -+ bch2_open_buckets_put(c, &a->ob); -+ } -+ mutex_unlock(&c->btree_reserve_cache_lock); -+ -+ while (1) { -+ struct open_bucket *ob; -+ -+ spin_lock(&c->freelist_lock); -+ if (!ca->open_buckets_partial_nr) { -+ spin_unlock(&c->freelist_lock); -+ break; -+ } -+ ob = c->open_buckets + -+ ca->open_buckets_partial[--ca->open_buckets_partial_nr]; -+ ob->on_partial_list = false; -+ spin_unlock(&c->freelist_lock); -+ -+ bch2_open_bucket_put(c, ob); -+ } -+ -+ bch2_ec_stop_dev(c, ca); -+ -+ /* -+ * Wake up threads that were blocked on allocation, so they can notice -+ * the device can no longer be removed and the capacity has changed: -+ */ -+ closure_wake_up(&c->freelist_wait); -+ -+ /* -+ * journal_res_get() can block waiting for free space in the journal - -+ * it needs to notice there may not be devices to allocate from anymore: -+ */ -+ wake_up(&c->journal.wait); -+ -+ /* Now wait for any in flight writes: */ -+ -+ closure_wait_event(&c->open_buckets_wait, -+ !bch2_dev_has_open_write_point(c, ca)); -+} -+ -+/* device goes rw: */ -+void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) -+{ -+ unsigned i; -+ -+ for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) -+ if (ca->mi.data_allowed & (1 << i)) -+ set_bit(ca->dev_idx, c->rw_devs[i].d); -+} -+ -+void bch2_dev_allocator_quiesce(struct bch_fs *c, 
struct bch_dev *ca) -+{ -+ if (ca->alloc_thread) -+ closure_wait_event(&c->freelist_wait, -+ ca->allocator_state != ALLOCATOR_RUNNING); -+} -+ -+/* stop allocator thread: */ -+void bch2_dev_allocator_stop(struct bch_dev *ca) -+{ -+ struct task_struct *p; -+ -+ p = rcu_dereference_protected(ca->alloc_thread, 1); -+ ca->alloc_thread = NULL; -+ -+ /* -+ * We need an rcu barrier between setting ca->alloc_thread = NULL and -+ * the thread shutting down to avoid bch2_wake_allocator() racing: -+ * -+ * XXX: it would be better to have the rcu barrier be asynchronous -+ * instead of blocking us here -+ */ -+ synchronize_rcu(); -+ -+ if (p) { -+ kthread_stop(p); -+ put_task_struct(p); -+ } -+} -+ -+/* start allocator thread: */ -+int bch2_dev_allocator_start(struct bch_dev *ca) -+{ -+ struct task_struct *p; -+ -+ /* -+ * allocator thread already started? -+ */ -+ if (ca->alloc_thread) -+ return 0; -+ -+ p = kthread_create(bch2_allocator_thread, ca, -+ "bch_alloc[%s]", ca->name); -+ if (IS_ERR(p)) -+ return PTR_ERR(p); -+ -+ get_task_struct(p); -+ rcu_assign_pointer(ca->alloc_thread, p); -+ wake_up_process(p); -+ return 0; -+} -+ -+void bch2_fs_allocator_background_init(struct bch_fs *c) -+{ -+ spin_lock_init(&c->freelist_lock); -+ bch2_bucket_clock_init(c, READ); -+ bch2_bucket_clock_init(c, WRITE); -+ -+ c->pd_controllers_update_seconds = 5; -+ INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update); -+} -diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h -new file mode 100644 -index 000000000000..f6b9f27f0713 ---- /dev/null -+++ b/fs/bcachefs/alloc_background.h -@@ -0,0 +1,97 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_ALLOC_BACKGROUND_H -+#define _BCACHEFS_ALLOC_BACKGROUND_H -+ -+#include "bcachefs.h" -+#include "alloc_types.h" -+#include "debug.h" -+ -+struct bkey_alloc_unpacked { -+ u8 gen; -+#define x(_name, _bits) u##_bits _name; -+ BCH_ALLOC_FIELDS() -+#undef x -+}; -+ -+/* returns true if not equal */ 
-+static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l, -+ struct bkey_alloc_unpacked r) -+{ -+ return l.gen != r.gen -+#define x(_name, _bits) || l._name != r._name -+ BCH_ALLOC_FIELDS() -+#undef x -+ ; -+} -+ -+struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c); -+void bch2_alloc_pack(struct bkey_i_alloc *, -+ const struct bkey_alloc_unpacked); -+ -+static inline struct bkey_alloc_unpacked -+alloc_mem_to_key(struct bucket *g, struct bucket_mark m) -+{ -+ return (struct bkey_alloc_unpacked) { -+ .gen = m.gen, -+ .oldest_gen = g->oldest_gen, -+ .data_type = m.data_type, -+ .dirty_sectors = m.dirty_sectors, -+ .cached_sectors = m.cached_sectors, -+ .read_time = g->io_time[READ], -+ .write_time = g->io_time[WRITE], -+ }; -+} -+ -+#define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) -+ -+const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c); -+void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+ -+#define bch2_bkey_ops_alloc (struct bkey_ops) { \ -+ .key_invalid = bch2_alloc_invalid, \ -+ .val_to_text = bch2_alloc_to_text, \ -+} -+ -+struct journal_keys; -+int bch2_alloc_read(struct bch_fs *, struct journal_keys *); -+ -+static inline void bch2_wake_allocator(struct bch_dev *ca) -+{ -+ struct task_struct *p; -+ -+ rcu_read_lock(); -+ p = rcu_dereference(ca->alloc_thread); -+ if (p) -+ wake_up_process(p); -+ rcu_read_unlock(); -+} -+ -+static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca, -+ size_t bucket) -+{ -+ if (expensive_debug_checks(c)) { -+ size_t iter; -+ long i; -+ unsigned j; -+ -+ for (j = 0; j < RESERVE_NR; j++) -+ fifo_for_each_entry(i, &ca->free[j], iter) -+ BUG_ON(i == bucket); -+ fifo_for_each_entry(i, &ca->free_inc, iter) -+ BUG_ON(i == bucket); -+ } -+} -+ -+void bch2_recalc_capacity(struct bch_fs *); -+ -+void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); -+void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev 
*); -+ -+void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *); -+void bch2_dev_allocator_stop(struct bch_dev *); -+int bch2_dev_allocator_start(struct bch_dev *); -+ -+int bch2_alloc_write(struct bch_fs *, unsigned, bool *); -+void bch2_fs_allocator_background_init(struct bch_fs *); -+ -+#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ -diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c -new file mode 100644 -index 000000000000..979aba30bc9d ---- /dev/null -+++ b/fs/bcachefs/alloc_foreground.c -@@ -0,0 +1,1044 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Primary bucket allocation code -+ * -+ * Copyright 2012 Google, Inc. -+ * -+ * Allocation in bcache is done in terms of buckets: -+ * -+ * Each bucket has associated an 8 bit gen; this gen corresponds to the gen in -+ * btree pointers - they must match for the pointer to be considered valid. -+ * -+ * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a -+ * bucket simply by incrementing its gen. -+ * -+ * The gens (along with the priorities; it's really the gens are important but -+ * the code is named as if it's the priorities) are written in an arbitrary list -+ * of buckets on disk, with a pointer to them in the journal header. -+ * -+ * When we invalidate a bucket, we have to write its new gen to disk and wait -+ * for that write to complete before we use it - otherwise after a crash we -+ * could have pointers that appeared to be good but pointed to data that had -+ * been overwritten. -+ * -+ * Since the gens and priorities are all stored contiguously on disk, we can -+ * batch this up: We fill up the free_inc list with freshly invalidated buckets, -+ * call prio_write(), and when prio_write() finishes we pull buckets off the -+ * free_inc list and optionally discard them. -+ * -+ * free_inc isn't the only freelist - if it was, we'd often have to sleep while -+ * priorities and gens were being written before we could allocate. 
c->free is a -+ * smaller freelist, and buckets on that list are always ready to be used. -+ * -+ * If we've got discards enabled, that happens when a bucket moves from the -+ * free_inc list to the free list. -+ * -+ * It's important to ensure that gens don't wrap around - with respect to -+ * either the oldest gen in the btree or the gen on disk. This is quite -+ * difficult to do in practice, but we explicitly guard against it anyways - if -+ * a bucket is in danger of wrapping around we simply skip invalidating it that -+ * time around, and we garbage collect or rewrite the priorities sooner than we -+ * would have otherwise. -+ * -+ * bch2_bucket_alloc() allocates a single bucket from a specific device. -+ * -+ * bch2_bucket_alloc_set() allocates one or more buckets from different devices -+ * in a given filesystem. -+ * -+ * invalidate_buckets() drives all the processes described above. It's called -+ * from bch2_bucket_alloc() and a few other places that need to make sure free -+ * buckets are ready. -+ * -+ * invalidate_buckets_(lru|fifo)() find buckets that are available to be -+ * invalidated, and then invalidate them and stick them on the free_inc list - -+ * in either lru or fifo order. -+ */ -+ -+#include "bcachefs.h" -+#include "alloc_background.h" -+#include "alloc_foreground.h" -+#include "btree_gc.h" -+#include "buckets.h" -+#include "clock.h" -+#include "debug.h" -+#include "disk_groups.h" -+#include "ec.h" -+#include "io.h" -+ -+#include -+#include -+#include -+#include -+ -+enum bucket_alloc_ret { -+ ALLOC_SUCCESS, -+ OPEN_BUCKETS_EMPTY, -+ FREELIST_EMPTY, /* Allocator thread not keeping up */ -+}; -+ -+/* -+ * Open buckets represent a bucket that's currently being allocated from. 
They -+ * serve two purposes: -+ * -+ * - They track buckets that have been partially allocated, allowing for -+ * sub-bucket sized allocations - they're used by the sector allocator below -+ * -+ * - They provide a reference to the buckets they own that mark and sweep GC -+ * can find, until the new allocation has a pointer to it inserted into the -+ * btree -+ * -+ * When allocating some space with the sector allocator, the allocation comes -+ * with a reference to an open bucket - the caller is required to put that -+ * reference _after_ doing the index update that makes its allocation reachable. -+ */ -+ -+void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) -+{ -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); -+ -+ if (ob->ec) { -+ bch2_ec_bucket_written(c, ob); -+ return; -+ } -+ -+ percpu_down_read(&c->mark_lock); -+ spin_lock(&ob->lock); -+ -+ bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), -+ false, gc_pos_alloc(c, ob), 0); -+ ob->valid = false; -+ ob->type = 0; -+ -+ spin_unlock(&ob->lock); -+ percpu_up_read(&c->mark_lock); -+ -+ spin_lock(&c->freelist_lock); -+ ob->freelist = c->open_buckets_freelist; -+ c->open_buckets_freelist = ob - c->open_buckets; -+ c->open_buckets_nr_free++; -+ spin_unlock(&c->freelist_lock); -+ -+ closure_wake_up(&c->open_buckets_wait); -+} -+ -+void bch2_open_bucket_write_error(struct bch_fs *c, -+ struct open_buckets *obs, -+ unsigned dev) -+{ -+ struct open_bucket *ob; -+ unsigned i; -+ -+ open_bucket_for_each(c, obs, ob, i) -+ if (ob->ptr.dev == dev && -+ ob->ec) -+ bch2_ec_bucket_cancel(c, ob); -+} -+ -+static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) -+{ -+ struct open_bucket *ob; -+ -+ BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free); -+ -+ ob = c->open_buckets + c->open_buckets_freelist; -+ c->open_buckets_freelist = ob->freelist; -+ atomic_set(&ob->pin, 1); -+ ob->type = 0; -+ -+ c->open_buckets_nr_free--; -+ return ob; -+} -+ -+static void 
open_bucket_free_unused(struct bch_fs *c, -+ struct open_bucket *ob, -+ bool may_realloc) -+{ -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); -+ -+ BUG_ON(ca->open_buckets_partial_nr >= -+ ARRAY_SIZE(ca->open_buckets_partial)); -+ -+ if (ca->open_buckets_partial_nr < -+ ARRAY_SIZE(ca->open_buckets_partial) && -+ may_realloc) { -+ spin_lock(&c->freelist_lock); -+ ob->on_partial_list = true; -+ ca->open_buckets_partial[ca->open_buckets_partial_nr++] = -+ ob - c->open_buckets; -+ spin_unlock(&c->freelist_lock); -+ -+ closure_wake_up(&c->open_buckets_wait); -+ closure_wake_up(&c->freelist_wait); -+ } else { -+ bch2_open_bucket_put(c, ob); -+ } -+} -+ -+static void verify_not_stale(struct bch_fs *c, const struct open_buckets *obs) -+{ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ struct open_bucket *ob; -+ unsigned i; -+ -+ open_bucket_for_each(c, obs, ob, i) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); -+ -+ BUG_ON(ptr_stale(ca, &ob->ptr)); -+ } -+#endif -+} -+ -+/* _only_ for allocating the journal on a new device: */ -+long bch2_bucket_alloc_new_fs(struct bch_dev *ca) -+{ -+ struct bucket_array *buckets; -+ ssize_t b; -+ -+ rcu_read_lock(); -+ buckets = bucket_array(ca); -+ -+ for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) -+ if (is_available_bucket(buckets->b[b].mark)) -+ goto success; -+ b = -1; -+success: -+ rcu_read_unlock(); -+ return b; -+} -+ -+static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) -+{ -+ switch (reserve) { -+ case RESERVE_ALLOC: -+ return 0; -+ case RESERVE_BTREE: -+ return OPEN_BUCKETS_COUNT / 4; -+ default: -+ return OPEN_BUCKETS_COUNT / 2; -+ } -+} -+ -+/** -+ * bch_bucket_alloc - allocate a single bucket from a specific device -+ * -+ * Returns index of bucket on success, 0 on failure -+ * */ -+struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, -+ enum alloc_reserve reserve, -+ bool may_alloc_partial, -+ struct closure *cl) -+{ -+ struct bucket_array *buckets; -+ 
struct open_bucket *ob; -+ long bucket = 0; -+ -+ spin_lock(&c->freelist_lock); -+ -+ if (may_alloc_partial && -+ ca->open_buckets_partial_nr) { -+ ob = c->open_buckets + -+ ca->open_buckets_partial[--ca->open_buckets_partial_nr]; -+ ob->on_partial_list = false; -+ spin_unlock(&c->freelist_lock); -+ return ob; -+ } -+ -+ if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) { -+ if (cl) -+ closure_wait(&c->open_buckets_wait, cl); -+ -+ if (!c->blocked_allocate_open_bucket) -+ c->blocked_allocate_open_bucket = local_clock(); -+ -+ spin_unlock(&c->freelist_lock); -+ trace_open_bucket_alloc_fail(ca, reserve); -+ return ERR_PTR(-OPEN_BUCKETS_EMPTY); -+ } -+ -+ if (likely(fifo_pop(&ca->free[RESERVE_NONE], bucket))) -+ goto out; -+ -+ switch (reserve) { -+ case RESERVE_ALLOC: -+ if (fifo_pop(&ca->free[RESERVE_BTREE], bucket)) -+ goto out; -+ break; -+ case RESERVE_BTREE: -+ if (fifo_used(&ca->free[RESERVE_BTREE]) * 2 >= -+ ca->free[RESERVE_BTREE].size && -+ fifo_pop(&ca->free[RESERVE_BTREE], bucket)) -+ goto out; -+ break; -+ case RESERVE_MOVINGGC: -+ if (fifo_pop(&ca->free[RESERVE_MOVINGGC], bucket)) -+ goto out; -+ break; -+ default: -+ break; -+ } -+ -+ if (cl) -+ closure_wait(&c->freelist_wait, cl); -+ -+ if (!c->blocked_allocate) -+ c->blocked_allocate = local_clock(); -+ -+ spin_unlock(&c->freelist_lock); -+ -+ trace_bucket_alloc_fail(ca, reserve); -+ return ERR_PTR(-FREELIST_EMPTY); -+out: -+ verify_not_on_freelist(c, ca, bucket); -+ -+ ob = bch2_open_bucket_alloc(c); -+ -+ spin_lock(&ob->lock); -+ buckets = bucket_array(ca); -+ -+ ob->valid = true; -+ ob->sectors_free = ca->mi.bucket_size; -+ ob->ptr = (struct bch_extent_ptr) { -+ .type = 1 << BCH_EXTENT_ENTRY_ptr, -+ .gen = buckets->b[bucket].mark.gen, -+ .offset = bucket_to_sector(ca, bucket), -+ .dev = ca->dev_idx, -+ }; -+ -+ bucket_io_clock_reset(c, ca, bucket, READ); -+ bucket_io_clock_reset(c, ca, bucket, WRITE); -+ spin_unlock(&ob->lock); -+ -+ if (c->blocked_allocate_open_bucket) { -+ 
bch2_time_stats_update( -+ &c->times[BCH_TIME_blocked_allocate_open_bucket], -+ c->blocked_allocate_open_bucket); -+ c->blocked_allocate_open_bucket = 0; -+ } -+ -+ if (c->blocked_allocate) { -+ bch2_time_stats_update( -+ &c->times[BCH_TIME_blocked_allocate], -+ c->blocked_allocate); -+ c->blocked_allocate = 0; -+ } -+ -+ spin_unlock(&c->freelist_lock); -+ -+ bch2_wake_allocator(ca); -+ -+ trace_bucket_alloc(ca, reserve); -+ return ob; -+} -+ -+static int __dev_stripe_cmp(struct dev_stripe_state *stripe, -+ unsigned l, unsigned r) -+{ -+ return ((stripe->next_alloc[l] > stripe->next_alloc[r]) - -+ (stripe->next_alloc[l] < stripe->next_alloc[r])); -+} -+ -+#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r) -+ -+struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, -+ struct dev_stripe_state *stripe, -+ struct bch_devs_mask *devs) -+{ -+ struct dev_alloc_list ret = { .nr = 0 }; -+ struct bch_dev *ca; -+ unsigned i; -+ -+ for_each_member_device_rcu(ca, c, i, devs) -+ ret.devs[ret.nr++] = i; -+ -+ bubble_sort(ret.devs, ret.nr, dev_stripe_cmp); -+ return ret; -+} -+ -+void bch2_dev_stripe_increment(struct bch_fs *c, struct bch_dev *ca, -+ struct dev_stripe_state *stripe) -+{ -+ u64 *v = stripe->next_alloc + ca->dev_idx; -+ u64 free_space = dev_buckets_free(c, ca); -+ u64 free_space_inv = free_space -+ ? div64_u64(1ULL << 48, free_space) -+ : 1ULL << 48; -+ u64 scale = *v / 4; -+ -+ if (*v + free_space_inv >= *v) -+ *v += free_space_inv; -+ else -+ *v = U64_MAX; -+ -+ for (v = stripe->next_alloc; -+ v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) -+ *v = *v < scale ? 
0 : *v - scale; -+} -+ -+#define BUCKET_MAY_ALLOC_PARTIAL (1 << 0) -+#define BUCKET_ALLOC_USE_DURABILITY (1 << 1) -+ -+static void add_new_bucket(struct bch_fs *c, -+ struct open_buckets *ptrs, -+ struct bch_devs_mask *devs_may_alloc, -+ unsigned *nr_effective, -+ bool *have_cache, -+ unsigned flags, -+ struct open_bucket *ob) -+{ -+ unsigned durability = -+ bch_dev_bkey_exists(c, ob->ptr.dev)->mi.durability; -+ -+ __clear_bit(ob->ptr.dev, devs_may_alloc->d); -+ *nr_effective += (flags & BUCKET_ALLOC_USE_DURABILITY) -+ ? durability : 1; -+ *have_cache |= !durability; -+ -+ ob_push(c, ptrs, ob); -+} -+ -+static int bch2_bucket_alloc_set(struct bch_fs *c, -+ struct open_buckets *ptrs, -+ struct dev_stripe_state *stripe, -+ struct bch_devs_mask *devs_may_alloc, -+ unsigned nr_replicas, -+ unsigned *nr_effective, -+ bool *have_cache, -+ enum alloc_reserve reserve, -+ unsigned flags, -+ struct closure *cl) -+{ -+ struct dev_alloc_list devs_sorted = -+ bch2_dev_alloc_list(c, stripe, devs_may_alloc); -+ struct bch_dev *ca; -+ bool alloc_failure = false; -+ unsigned i; -+ -+ BUG_ON(*nr_effective >= nr_replicas); -+ -+ for (i = 0; i < devs_sorted.nr; i++) { -+ struct open_bucket *ob; -+ -+ ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); -+ if (!ca) -+ continue; -+ -+ if (!ca->mi.durability && *have_cache) -+ continue; -+ -+ ob = bch2_bucket_alloc(c, ca, reserve, -+ flags & BUCKET_MAY_ALLOC_PARTIAL, cl); -+ if (IS_ERR(ob)) { -+ enum bucket_alloc_ret ret = -PTR_ERR(ob); -+ -+ WARN_ON(reserve == RESERVE_MOVINGGC && -+ ret != OPEN_BUCKETS_EMPTY); -+ -+ if (cl) -+ return -EAGAIN; -+ if (ret == OPEN_BUCKETS_EMPTY) -+ return -ENOSPC; -+ alloc_failure = true; -+ continue; -+ } -+ -+ add_new_bucket(c, ptrs, devs_may_alloc, -+ nr_effective, have_cache, flags, ob); -+ -+ bch2_dev_stripe_increment(c, ca, stripe); -+ -+ if (*nr_effective >= nr_replicas) -+ return 0; -+ } -+ -+ return alloc_failure ? 
-ENOSPC : -EROFS; -+} -+ -+/* Allocate from stripes: */ -+ -+/* -+ * XXX: use a higher watermark for allocating open buckets here: -+ */ -+static int ec_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) -+{ -+ struct bch_devs_mask devs; -+ struct open_bucket *ob; -+ unsigned i, nr_have = 0, nr_data = -+ min_t(unsigned, h->nr_active_devs, -+ EC_STRIPE_MAX) - h->redundancy; -+ bool have_cache = true; -+ int ret = 0; -+ -+ BUG_ON(h->blocks.nr > nr_data); -+ BUG_ON(h->parity.nr > h->redundancy); -+ -+ devs = h->devs; -+ -+ open_bucket_for_each(c, &h->parity, ob, i) -+ __clear_bit(ob->ptr.dev, devs.d); -+ open_bucket_for_each(c, &h->blocks, ob, i) -+ __clear_bit(ob->ptr.dev, devs.d); -+ -+ percpu_down_read(&c->mark_lock); -+ rcu_read_lock(); -+ -+ if (h->parity.nr < h->redundancy) { -+ nr_have = h->parity.nr; -+ -+ ret = bch2_bucket_alloc_set(c, &h->parity, -+ &h->parity_stripe, -+ &devs, -+ h->redundancy, -+ &nr_have, -+ &have_cache, -+ RESERVE_NONE, -+ 0, -+ NULL); -+ if (ret) -+ goto err; -+ } -+ -+ if (h->blocks.nr < nr_data) { -+ nr_have = h->blocks.nr; -+ -+ ret = bch2_bucket_alloc_set(c, &h->blocks, -+ &h->block_stripe, -+ &devs, -+ nr_data, -+ &nr_have, -+ &have_cache, -+ RESERVE_NONE, -+ 0, -+ NULL); -+ if (ret) -+ goto err; -+ } -+ -+ rcu_read_unlock(); -+ percpu_up_read(&c->mark_lock); -+ -+ return bch2_ec_stripe_new_alloc(c, h); -+err: -+ rcu_read_unlock(); -+ percpu_up_read(&c->mark_lock); -+ return -1; -+} -+ -+/* -+ * if we can't allocate a new stripe because there are already too many -+ * partially filled stripes, force allocating from an existing stripe even when -+ * it's to a device we don't want: -+ */ -+ -+static void bucket_alloc_from_stripe(struct bch_fs *c, -+ struct open_buckets *ptrs, -+ struct write_point *wp, -+ struct bch_devs_mask *devs_may_alloc, -+ u16 target, -+ unsigned erasure_code, -+ unsigned nr_replicas, -+ unsigned *nr_effective, -+ bool *have_cache, -+ unsigned flags) -+{ -+ struct dev_alloc_list devs_sorted; -+ struct 
ec_stripe_head *h; -+ struct open_bucket *ob; -+ struct bch_dev *ca; -+ unsigned i, ec_idx; -+ -+ if (!erasure_code) -+ return; -+ -+ if (nr_replicas < 2) -+ return; -+ -+ if (ec_open_bucket(c, ptrs)) -+ return; -+ -+ h = bch2_ec_stripe_head_get(c, target, erasure_code, nr_replicas - 1); -+ if (!h) -+ return; -+ -+ if (!h->s && ec_stripe_alloc(c, h)) -+ goto out_put_head; -+ -+ rcu_read_lock(); -+ devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); -+ rcu_read_unlock(); -+ -+ for (i = 0; i < devs_sorted.nr; i++) -+ open_bucket_for_each(c, &h->s->blocks, ob, ec_idx) -+ if (ob->ptr.dev == devs_sorted.devs[i] && -+ !test_and_set_bit(ec_idx, h->s->blocks_allocated)) -+ goto got_bucket; -+ goto out_put_head; -+got_bucket: -+ ca = bch_dev_bkey_exists(c, ob->ptr.dev); -+ -+ ob->ec_idx = ec_idx; -+ ob->ec = h->s; -+ -+ add_new_bucket(c, ptrs, devs_may_alloc, -+ nr_effective, have_cache, flags, ob); -+ atomic_inc(&h->s->pin); -+out_put_head: -+ bch2_ec_stripe_head_put(h); -+} -+ -+/* Sector allocator */ -+ -+static void get_buckets_from_writepoint(struct bch_fs *c, -+ struct open_buckets *ptrs, -+ struct write_point *wp, -+ struct bch_devs_mask *devs_may_alloc, -+ unsigned nr_replicas, -+ unsigned *nr_effective, -+ bool *have_cache, -+ unsigned flags, -+ bool need_ec) -+{ -+ struct open_buckets ptrs_skip = { .nr = 0 }; -+ struct open_bucket *ob; -+ unsigned i; -+ -+ open_bucket_for_each(c, &wp->ptrs, ob, i) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); -+ -+ if (*nr_effective < nr_replicas && -+ test_bit(ob->ptr.dev, devs_may_alloc->d) && -+ (ca->mi.durability || -+ (wp->type == BCH_DATA_USER && !*have_cache)) && -+ (ob->ec || !need_ec)) { -+ add_new_bucket(c, ptrs, devs_may_alloc, -+ nr_effective, have_cache, -+ flags, ob); -+ } else { -+ ob_push(c, &ptrs_skip, ob); -+ } -+ } -+ wp->ptrs = ptrs_skip; -+} -+ -+static int open_bucket_add_buckets(struct bch_fs *c, -+ struct open_buckets *ptrs, -+ struct write_point *wp, -+ struct bch_devs_list 
*devs_have, -+ u16 target, -+ unsigned erasure_code, -+ unsigned nr_replicas, -+ unsigned *nr_effective, -+ bool *have_cache, -+ enum alloc_reserve reserve, -+ unsigned flags, -+ struct closure *_cl) -+{ -+ struct bch_devs_mask devs; -+ struct open_bucket *ob; -+ struct closure *cl = NULL; -+ unsigned i; -+ int ret; -+ -+ rcu_read_lock(); -+ devs = target_rw_devs(c, wp->type, target); -+ rcu_read_unlock(); -+ -+ /* Don't allocate from devices we already have pointers to: */ -+ for (i = 0; i < devs_have->nr; i++) -+ __clear_bit(devs_have->devs[i], devs.d); -+ -+ open_bucket_for_each(c, ptrs, ob, i) -+ __clear_bit(ob->ptr.dev, devs.d); -+ -+ if (erasure_code) { -+ get_buckets_from_writepoint(c, ptrs, wp, &devs, -+ nr_replicas, nr_effective, -+ have_cache, flags, true); -+ if (*nr_effective >= nr_replicas) -+ return 0; -+ -+ bucket_alloc_from_stripe(c, ptrs, wp, &devs, -+ target, erasure_code, -+ nr_replicas, nr_effective, -+ have_cache, flags); -+ if (*nr_effective >= nr_replicas) -+ return 0; -+ } -+ -+ get_buckets_from_writepoint(c, ptrs, wp, &devs, -+ nr_replicas, nr_effective, -+ have_cache, flags, false); -+ if (*nr_effective >= nr_replicas) -+ return 0; -+ -+ percpu_down_read(&c->mark_lock); -+ rcu_read_lock(); -+ -+retry_blocking: -+ /* -+ * Try nonblocking first, so that if one device is full we'll try from -+ * other devices: -+ */ -+ ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs, -+ nr_replicas, nr_effective, have_cache, -+ reserve, flags, cl); -+ if (ret && ret != -EROFS && !cl && _cl) { -+ cl = _cl; -+ goto retry_blocking; -+ } -+ -+ rcu_read_unlock(); -+ percpu_up_read(&c->mark_lock); -+ -+ return ret; -+} -+ -+void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca, -+ struct open_buckets *obs) -+{ -+ struct open_buckets ptrs = { .nr = 0 }; -+ struct open_bucket *ob, *ob2; -+ unsigned i, j; -+ -+ open_bucket_for_each(c, obs, ob, i) { -+ bool drop = !ca || ob->ptr.dev == ca->dev_idx; -+ -+ if (!drop && ob->ec) { -+ 
mutex_lock(&ob->ec->lock); -+ open_bucket_for_each(c, &ob->ec->blocks, ob2, j) -+ drop |= ob2->ptr.dev == ca->dev_idx; -+ open_bucket_for_each(c, &ob->ec->parity, ob2, j) -+ drop |= ob2->ptr.dev == ca->dev_idx; -+ mutex_unlock(&ob->ec->lock); -+ } -+ -+ if (drop) -+ bch2_open_bucket_put(c, ob); -+ else -+ ob_push(c, &ptrs, ob); -+ } -+ -+ *obs = ptrs; -+} -+ -+void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca, -+ struct write_point *wp) -+{ -+ mutex_lock(&wp->lock); -+ bch2_open_buckets_stop_dev(c, ca, &wp->ptrs); -+ mutex_unlock(&wp->lock); -+} -+ -+static inline struct hlist_head *writepoint_hash(struct bch_fs *c, -+ unsigned long write_point) -+{ -+ unsigned hash = -+ hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash))); -+ -+ return &c->write_points_hash[hash]; -+} -+ -+static struct write_point *__writepoint_find(struct hlist_head *head, -+ unsigned long write_point) -+{ -+ struct write_point *wp; -+ -+ hlist_for_each_entry_rcu(wp, head, node) -+ if (wp->write_point == write_point) -+ return wp; -+ -+ return NULL; -+} -+ -+static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor) -+{ -+ u64 stranded = c->write_points_nr * c->bucket_size_max; -+ u64 free = bch2_fs_usage_read_short(c).free; -+ -+ return stranded * factor > free; -+} -+ -+static bool try_increase_writepoints(struct bch_fs *c) -+{ -+ struct write_point *wp; -+ -+ if (c->write_points_nr == ARRAY_SIZE(c->write_points) || -+ too_many_writepoints(c, 32)) -+ return false; -+ -+ wp = c->write_points + c->write_points_nr++; -+ hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point)); -+ return true; -+} -+ -+static bool try_decrease_writepoints(struct bch_fs *c, -+ unsigned old_nr) -+{ -+ struct write_point *wp; -+ -+ mutex_lock(&c->write_points_hash_lock); -+ if (c->write_points_nr < old_nr) { -+ mutex_unlock(&c->write_points_hash_lock); -+ return true; -+ } -+ -+ if (c->write_points_nr == 1 || -+ !too_many_writepoints(c, 8)) { -+ 
mutex_unlock(&c->write_points_hash_lock); -+ return false; -+ } -+ -+ wp = c->write_points + --c->write_points_nr; -+ -+ hlist_del_rcu(&wp->node); -+ mutex_unlock(&c->write_points_hash_lock); -+ -+ bch2_writepoint_stop(c, NULL, wp); -+ return true; -+} -+ -+static struct write_point *writepoint_find(struct bch_fs *c, -+ unsigned long write_point) -+{ -+ struct write_point *wp, *oldest; -+ struct hlist_head *head; -+ -+ if (!(write_point & 1UL)) { -+ wp = (struct write_point *) write_point; -+ mutex_lock(&wp->lock); -+ return wp; -+ } -+ -+ head = writepoint_hash(c, write_point); -+restart_find: -+ wp = __writepoint_find(head, write_point); -+ if (wp) { -+lock_wp: -+ mutex_lock(&wp->lock); -+ if (wp->write_point == write_point) -+ goto out; -+ mutex_unlock(&wp->lock); -+ goto restart_find; -+ } -+restart_find_oldest: -+ oldest = NULL; -+ for (wp = c->write_points; -+ wp < c->write_points + c->write_points_nr; wp++) -+ if (!oldest || time_before64(wp->last_used, oldest->last_used)) -+ oldest = wp; -+ -+ mutex_lock(&oldest->lock); -+ mutex_lock(&c->write_points_hash_lock); -+ if (oldest >= c->write_points + c->write_points_nr || -+ try_increase_writepoints(c)) { -+ mutex_unlock(&c->write_points_hash_lock); -+ mutex_unlock(&oldest->lock); -+ goto restart_find_oldest; -+ } -+ -+ wp = __writepoint_find(head, write_point); -+ if (wp && wp != oldest) { -+ mutex_unlock(&c->write_points_hash_lock); -+ mutex_unlock(&oldest->lock); -+ goto lock_wp; -+ } -+ -+ wp = oldest; -+ hlist_del_rcu(&wp->node); -+ wp->write_point = write_point; -+ hlist_add_head_rcu(&wp->node, head); -+ mutex_unlock(&c->write_points_hash_lock); -+out: -+ wp->last_used = sched_clock(); -+ return wp; -+} -+ -+/* -+ * Get us an open_bucket we can allocate from, return with it locked: -+ */ -+struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, -+ unsigned target, -+ unsigned erasure_code, -+ struct write_point_specifier write_point, -+ struct bch_devs_list *devs_have, -+ unsigned nr_replicas, -+ 
unsigned nr_replicas_required, -+ enum alloc_reserve reserve, -+ unsigned flags, -+ struct closure *cl) -+{ -+ struct write_point *wp; -+ struct open_bucket *ob; -+ struct open_buckets ptrs; -+ unsigned nr_effective, write_points_nr; -+ unsigned ob_flags = 0; -+ bool have_cache; -+ int ret, i; -+ -+ if (!(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) -+ ob_flags |= BUCKET_ALLOC_USE_DURABILITY; -+ -+ BUG_ON(!nr_replicas || !nr_replicas_required); -+retry: -+ ptrs.nr = 0; -+ nr_effective = 0; -+ write_points_nr = c->write_points_nr; -+ have_cache = false; -+ -+ wp = writepoint_find(c, write_point.v); -+ -+ if (wp->type == BCH_DATA_USER) -+ ob_flags |= BUCKET_MAY_ALLOC_PARTIAL; -+ -+ /* metadata may not allocate on cache devices: */ -+ if (wp->type != BCH_DATA_USER) -+ have_cache = true; -+ -+ if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { -+ ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, -+ target, erasure_code, -+ nr_replicas, &nr_effective, -+ &have_cache, reserve, -+ ob_flags, cl); -+ } else { -+ ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, -+ target, erasure_code, -+ nr_replicas, &nr_effective, -+ &have_cache, reserve, -+ ob_flags, NULL); -+ if (!ret) -+ goto alloc_done; -+ -+ ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, -+ 0, erasure_code, -+ nr_replicas, &nr_effective, -+ &have_cache, reserve, -+ ob_flags, cl); -+ } -+alloc_done: -+ BUG_ON(!ret && nr_effective < nr_replicas); -+ -+ if (erasure_code && !ec_open_bucket(c, &ptrs)) -+ pr_debug("failed to get ec bucket: ret %u", ret); -+ -+ if (ret == -EROFS && -+ nr_effective >= nr_replicas_required) -+ ret = 0; -+ -+ if (ret) -+ goto err; -+ -+ /* Free buckets we didn't use: */ -+ open_bucket_for_each(c, &wp->ptrs, ob, i) -+ open_bucket_free_unused(c, ob, wp->type == BCH_DATA_USER); -+ -+ wp->ptrs = ptrs; -+ -+ wp->sectors_free = UINT_MAX; -+ -+ open_bucket_for_each(c, &wp->ptrs, ob, i) -+ wp->sectors_free = min(wp->sectors_free, ob->sectors_free); -+ -+ BUG_ON(!wp->sectors_free || 
wp->sectors_free == UINT_MAX); -+ -+ verify_not_stale(c, &wp->ptrs); -+ -+ return wp; -+err: -+ open_bucket_for_each(c, &wp->ptrs, ob, i) -+ if (ptrs.nr < ARRAY_SIZE(ptrs.v)) -+ ob_push(c, &ptrs, ob); -+ else -+ open_bucket_free_unused(c, ob, -+ wp->type == BCH_DATA_USER); -+ wp->ptrs = ptrs; -+ -+ mutex_unlock(&wp->lock); -+ -+ if (ret == -ENOSPC && -+ try_decrease_writepoints(c, write_points_nr)) -+ goto retry; -+ -+ return ERR_PTR(ret); -+} -+ -+/* -+ * Append pointers to the space we just allocated to @k, and mark @sectors space -+ * as allocated out of @ob -+ */ -+void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, -+ struct bkey_i *k, unsigned sectors) -+ -+{ -+ struct open_bucket *ob; -+ unsigned i; -+ -+ BUG_ON(sectors > wp->sectors_free); -+ wp->sectors_free -= sectors; -+ -+ open_bucket_for_each(c, &wp->ptrs, ob, i) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); -+ struct bch_extent_ptr tmp = ob->ptr; -+ -+ tmp.cached = !ca->mi.durability && -+ wp->type == BCH_DATA_USER; -+ -+ tmp.offset += ca->mi.bucket_size - ob->sectors_free; -+ bch2_bkey_append_ptr(k, tmp); -+ -+ BUG_ON(sectors > ob->sectors_free); -+ ob->sectors_free -= sectors; -+ } -+} -+ -+/* -+ * Append pointers to the space we just allocated to @k, and mark @sectors space -+ * as allocated out of @ob -+ */ -+void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp) -+{ -+ struct open_buckets ptrs = { .nr = 0 }, keep = { .nr = 0 }; -+ struct open_bucket *ob; -+ unsigned i; -+ -+ open_bucket_for_each(c, &wp->ptrs, ob, i) -+ ob_push(c, !ob->sectors_free ? 
&ptrs : &keep, ob); -+ wp->ptrs = keep; -+ -+ mutex_unlock(&wp->lock); -+ -+ bch2_open_buckets_put(c, &ptrs); -+} -+ -+void bch2_fs_allocator_foreground_init(struct bch_fs *c) -+{ -+ struct open_bucket *ob; -+ struct write_point *wp; -+ -+ mutex_init(&c->write_points_hash_lock); -+ c->write_points_nr = ARRAY_SIZE(c->write_points); -+ -+ /* open bucket 0 is a sentinal NULL: */ -+ spin_lock_init(&c->open_buckets[0].lock); -+ -+ for (ob = c->open_buckets + 1; -+ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { -+ spin_lock_init(&ob->lock); -+ c->open_buckets_nr_free++; -+ -+ ob->freelist = c->open_buckets_freelist; -+ c->open_buckets_freelist = ob - c->open_buckets; -+ } -+ -+ writepoint_init(&c->btree_write_point, BCH_DATA_BTREE); -+ writepoint_init(&c->rebalance_write_point, BCH_DATA_USER); -+ -+ for (wp = c->write_points; -+ wp < c->write_points + c->write_points_nr; wp++) { -+ writepoint_init(wp, BCH_DATA_USER); -+ -+ wp->last_used = sched_clock(); -+ wp->write_point = (unsigned long) wp; -+ hlist_add_head_rcu(&wp->node, -+ writepoint_hash(c, wp->write_point)); -+ } -+} -diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h -new file mode 100644 -index 000000000000..687f973e4b3a ---- /dev/null -+++ b/fs/bcachefs/alloc_foreground.h -@@ -0,0 +1,133 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_ALLOC_FOREGROUND_H -+#define _BCACHEFS_ALLOC_FOREGROUND_H -+ -+#include "bcachefs.h" -+#include "alloc_types.h" -+ -+#include -+ -+struct bkey; -+struct bch_dev; -+struct bch_fs; -+struct bch_devs_List; -+ -+struct dev_alloc_list { -+ unsigned nr; -+ u8 devs[BCH_SB_MEMBERS_MAX]; -+}; -+ -+struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, -+ struct dev_stripe_state *, -+ struct bch_devs_mask *); -+void bch2_dev_stripe_increment(struct bch_fs *, struct bch_dev *, -+ struct dev_stripe_state *); -+ -+long bch2_bucket_alloc_new_fs(struct bch_dev *); -+ -+struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct 
bch_dev *, -+ enum alloc_reserve, bool, -+ struct closure *); -+ -+static inline void ob_push(struct bch_fs *c, struct open_buckets *obs, -+ struct open_bucket *ob) -+{ -+ BUG_ON(obs->nr >= ARRAY_SIZE(obs->v)); -+ -+ obs->v[obs->nr++] = ob - c->open_buckets; -+} -+ -+#define open_bucket_for_each(_c, _obs, _ob, _i) \ -+ for ((_i) = 0; \ -+ (_i) < (_obs)->nr && \ -+ ((_ob) = (_c)->open_buckets + (_obs)->v[_i], true); \ -+ (_i)++) -+ -+static inline struct open_bucket *ec_open_bucket(struct bch_fs *c, -+ struct open_buckets *obs) -+{ -+ struct open_bucket *ob; -+ unsigned i; -+ -+ open_bucket_for_each(c, obs, ob, i) -+ if (ob->ec) -+ return ob; -+ -+ return NULL; -+} -+ -+void bch2_open_bucket_write_error(struct bch_fs *, -+ struct open_buckets *, unsigned); -+ -+void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *); -+ -+static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) -+{ -+ if (atomic_dec_and_test(&ob->pin)) -+ __bch2_open_bucket_put(c, ob); -+} -+ -+static inline void bch2_open_buckets_put(struct bch_fs *c, -+ struct open_buckets *ptrs) -+{ -+ struct open_bucket *ob; -+ unsigned i; -+ -+ open_bucket_for_each(c, ptrs, ob, i) -+ bch2_open_bucket_put(c, ob); -+ ptrs->nr = 0; -+} -+ -+static inline void bch2_open_bucket_get(struct bch_fs *c, -+ struct write_point *wp, -+ struct open_buckets *ptrs) -+{ -+ struct open_bucket *ob; -+ unsigned i; -+ -+ open_bucket_for_each(c, &wp->ptrs, ob, i) { -+ ob->type = wp->type; -+ atomic_inc(&ob->pin); -+ ob_push(c, ptrs, ob); -+ } -+} -+ -+struct write_point *bch2_alloc_sectors_start(struct bch_fs *, -+ unsigned, unsigned, -+ struct write_point_specifier, -+ struct bch_devs_list *, -+ unsigned, unsigned, -+ enum alloc_reserve, -+ unsigned, -+ struct closure *); -+ -+void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, -+ struct bkey_i *, unsigned); -+void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); -+ -+void bch2_open_buckets_stop_dev(struct 
bch_fs *, struct bch_dev *, -+ struct open_buckets *); -+ -+void bch2_writepoint_stop(struct bch_fs *, struct bch_dev *, -+ struct write_point *); -+ -+static inline struct write_point_specifier writepoint_hashed(unsigned long v) -+{ -+ return (struct write_point_specifier) { .v = v | 1 }; -+} -+ -+static inline struct write_point_specifier writepoint_ptr(struct write_point *wp) -+{ -+ return (struct write_point_specifier) { .v = (unsigned long) wp }; -+} -+ -+static inline void writepoint_init(struct write_point *wp, -+ enum bch_data_type type) -+{ -+ mutex_init(&wp->lock); -+ wp->type = type; -+} -+ -+void bch2_fs_allocator_foreground_init(struct bch_fs *); -+ -+#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ -diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h -new file mode 100644 -index 000000000000..4f1465077994 ---- /dev/null -+++ b/fs/bcachefs/alloc_types.h -@@ -0,0 +1,112 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_ALLOC_TYPES_H -+#define _BCACHEFS_ALLOC_TYPES_H -+ -+#include -+#include -+ -+#include "clock_types.h" -+#include "fifo.h" -+ -+struct ec_bucket_buf; -+ -+/* There's two of these clocks, one for reads and one for writes: */ -+struct bucket_clock { -+ /* -+ * "now" in (read/write) IO time - incremented whenever we do X amount -+ * of reads or writes. -+ * -+ * Goes with the bucket read/write prios: when we read or write to a -+ * bucket we reset the bucket's prio to the current hand; thus hand - -+ * prio = time since bucket was last read/written. -+ * -+ * The units are some amount (bytes/sectors) of data read/written, and -+ * the units can change on the fly if we need to rescale to fit -+ * everything in a u16 - your only guarantee is that the units are -+ * consistent. 
-+ */ -+ u16 hand; -+ u16 max_last_io; -+ -+ int rw; -+ -+ struct io_timer rescale; -+ struct mutex lock; -+}; -+ -+/* There is one reserve for each type of btree, one for prios and gens -+ * and one for moving GC */ -+enum alloc_reserve { -+ RESERVE_ALLOC = -1, -+ RESERVE_BTREE = 0, -+ RESERVE_MOVINGGC = 1, -+ RESERVE_NONE = 2, -+ RESERVE_NR = 3, -+}; -+ -+typedef FIFO(long) alloc_fifo; -+ -+#define OPEN_BUCKETS_COUNT 1024 -+ -+#define WRITE_POINT_HASH_NR 32 -+#define WRITE_POINT_MAX 32 -+ -+typedef u16 open_bucket_idx_t; -+ -+struct open_bucket { -+ spinlock_t lock; -+ atomic_t pin; -+ open_bucket_idx_t freelist; -+ -+ /* -+ * When an open bucket has an ec_stripe attached, this is the index of -+ * the block in the stripe this open_bucket corresponds to: -+ */ -+ u8 ec_idx; -+ u8 type; -+ unsigned valid:1; -+ unsigned on_partial_list:1; -+ unsigned sectors_free; -+ struct bch_extent_ptr ptr; -+ struct ec_stripe_new *ec; -+}; -+ -+#define OPEN_BUCKET_LIST_MAX 15 -+ -+struct open_buckets { -+ open_bucket_idx_t nr; -+ open_bucket_idx_t v[OPEN_BUCKET_LIST_MAX]; -+}; -+ -+struct dev_stripe_state { -+ u64 next_alloc[BCH_SB_MEMBERS_MAX]; -+}; -+ -+struct write_point { -+ struct hlist_node node; -+ struct mutex lock; -+ u64 last_used; -+ unsigned long write_point; -+ enum bch_data_type type; -+ bool is_ec; -+ -+ /* calculated based on how many pointers we're actually going to use: */ -+ unsigned sectors_free; -+ -+ struct open_buckets ptrs; -+ struct dev_stripe_state stripe; -+}; -+ -+struct write_point_specifier { -+ unsigned long v; -+}; -+ -+struct alloc_heap_entry { -+ size_t bucket; -+ size_t nr; -+ unsigned long key; -+}; -+ -+typedef HEAP(struct alloc_heap_entry) alloc_heap; -+ -+#endif /* _BCACHEFS_ALLOC_TYPES_H */ -diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h -new file mode 100644 -index 000000000000..893c89dbee60 ---- /dev/null -+++ b/fs/bcachefs/bcachefs.h -@@ -0,0 +1,878 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_H 
-+#define _BCACHEFS_H -+ -+/* -+ * SOME HIGH LEVEL CODE DOCUMENTATION: -+ * -+ * Bcache mostly works with cache sets, cache devices, and backing devices. -+ * -+ * Support for multiple cache devices hasn't quite been finished off yet, but -+ * it's about 95% plumbed through. A cache set and its cache devices is sort of -+ * like a md raid array and its component devices. Most of the code doesn't care -+ * about individual cache devices, the main abstraction is the cache set. -+ * -+ * Multiple cache devices is intended to give us the ability to mirror dirty -+ * cached data and metadata, without mirroring clean cached data. -+ * -+ * Backing devices are different, in that they have a lifetime independent of a -+ * cache set. When you register a newly formatted backing device it'll come up -+ * in passthrough mode, and then you can attach and detach a backing device from -+ * a cache set at runtime - while it's mounted and in use. Detaching implicitly -+ * invalidates any cached data for that backing device. -+ * -+ * A cache set can have multiple (many) backing devices attached to it. -+ * -+ * There's also flash only volumes - this is the reason for the distinction -+ * between struct cached_dev and struct bcache_device. A flash only volume -+ * works much like a bcache device that has a backing device, except the -+ * "cached" data is always dirty. The end result is that we get thin -+ * provisioning with very little additional code. -+ * -+ * Flash only volumes work but they're not production ready because the moving -+ * garbage collector needs more work. More on that later. -+ * -+ * BUCKETS/ALLOCATION: -+ * -+ * Bcache is primarily designed for caching, which means that in normal -+ * operation all of our available space will be allocated. Thus, we need an -+ * efficient way of deleting things from the cache so we can write new things to -+ * it. -+ * -+ * To do this, we first divide the cache device up into buckets. 
A bucket is the -+ * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+ -+ * works efficiently. -+ * -+ * Each bucket has a 16 bit priority, and an 8 bit generation associated with -+ * it. The gens and priorities for all the buckets are stored contiguously and -+ * packed on disk (in a linked list of buckets - aside from the superblock, all -+ * of bcache's metadata is stored in buckets). -+ * -+ * The priority is used to implement an LRU. We reset a bucket's priority when -+ * we allocate it or on cache it, and every so often we decrement the priority -+ * of each bucket. It could be used to implement something more sophisticated, -+ * if anyone ever gets around to it. -+ * -+ * The generation is used for invalidating buckets. Each pointer also has an 8 -+ * bit generation embedded in it; for a pointer to be considered valid, its gen -+ * must match the gen of the bucket it points into. Thus, to reuse a bucket all -+ * we have to do is increment its gen (and write its new gen to disk; we batch -+ * this up). -+ * -+ * Bcache is entirely COW - we never write twice to a bucket, even buckets that -+ * contain metadata (including btree nodes). -+ * -+ * THE BTREE: -+ * -+ * Bcache is in large part design around the btree. -+ * -+ * At a high level, the btree is just an index of key -> ptr tuples. -+ * -+ * Keys represent extents, and thus have a size field. Keys also have a variable -+ * number of pointers attached to them (potentially zero, which is handy for -+ * invalidating the cache). -+ * -+ * The key itself is an inode:offset pair. The inode number corresponds to a -+ * backing device or a flash only volume. The offset is the ending offset of the -+ * extent within the inode - not the starting offset; this makes lookups -+ * slightly more convenient. -+ * -+ * Pointers contain the cache device id, the offset on that device, and an 8 bit -+ * generation number. More on the gen later. 
-+ * -+ * Index lookups are not fully abstracted - cache lookups in particular are -+ * still somewhat mixed in with the btree code, but things are headed in that -+ * direction. -+ * -+ * Updates are fairly well abstracted, though. There are two different ways of -+ * updating the btree; insert and replace. -+ * -+ * BTREE_INSERT will just take a list of keys and insert them into the btree - -+ * overwriting (possibly only partially) any extents they overlap with. This is -+ * used to update the index after a write. -+ * -+ * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is -+ * overwriting a key that matches another given key. This is used for inserting -+ * data into the cache after a cache miss, and for background writeback, and for -+ * the moving garbage collector. -+ * -+ * There is no "delete" operation; deleting things from the index is -+ * accomplished by either by invalidating pointers (by incrementing a bucket's -+ * gen) or by inserting a key with 0 pointers - which will overwrite anything -+ * previously present at that location in the index. -+ * -+ * This means that there are always stale/invalid keys in the btree. They're -+ * filtered out by the code that iterates through a btree node, and removed when -+ * a btree node is rewritten. -+ * -+ * BTREE NODES: -+ * -+ * Our unit of allocation is a bucket, and we we can't arbitrarily allocate and -+ * free smaller than a bucket - so, that's how big our btree nodes are. -+ * -+ * (If buckets are really big we'll only use part of the bucket for a btree node -+ * - no less than 1/4th - but a bucket still contains no more than a single -+ * btree node. I'd actually like to change this, but for now we rely on the -+ * bucket's gen for deleting btree nodes when we rewrite/split a node.) -+ * -+ * Anyways, btree nodes are big - big enough to be inefficient with a textbook -+ * btree implementation. 
-+ * -+ * The way this is solved is that btree nodes are internally log structured; we -+ * can append new keys to an existing btree node without rewriting it. This -+ * means each set of keys we write is sorted, but the node is not. -+ * -+ * We maintain this log structure in memory - keeping 1Mb of keys sorted would -+ * be expensive, and we have to distinguish between the keys we have written and -+ * the keys we haven't. So to do a lookup in a btree node, we have to search -+ * each sorted set. But we do merge written sets together lazily, so the cost of -+ * these extra searches is quite low (normally most of the keys in a btree node -+ * will be in one big set, and then there'll be one or two sets that are much -+ * smaller). -+ * -+ * This log structure makes bcache's btree more of a hybrid between a -+ * conventional btree and a compacting data structure, with some of the -+ * advantages of both. -+ * -+ * GARBAGE COLLECTION: -+ * -+ * We can't just invalidate any bucket - it might contain dirty data or -+ * metadata. If it once contained dirty data, other writes might overwrite it -+ * later, leaving no valid pointers into that bucket in the index. -+ * -+ * Thus, the primary purpose of garbage collection is to find buckets to reuse. -+ * It also counts how much valid data it each bucket currently contains, so that -+ * allocation can reuse buckets sooner when they've been mostly overwritten. -+ * -+ * It also does some things that are really internal to the btree -+ * implementation. If a btree node contains pointers that are stale by more than -+ * some threshold, it rewrites the btree node to avoid the bucket's generation -+ * wrapping around. It also merges adjacent btree nodes if they're empty enough. 
-+ * -+ * THE JOURNAL: -+ * -+ * Bcache's journal is not necessary for consistency; we always strictly -+ * order metadata writes so that the btree and everything else is consistent on -+ * disk in the event of an unclean shutdown, and in fact bcache had writeback -+ * caching (with recovery from unclean shutdown) before journalling was -+ * implemented. -+ * -+ * Rather, the journal is purely a performance optimization; we can't complete a -+ * write until we've updated the index on disk, otherwise the cache would be -+ * inconsistent in the event of an unclean shutdown. This means that without the -+ * journal, on random write workloads we constantly have to update all the leaf -+ * nodes in the btree, and those writes will be mostly empty (appending at most -+ * a few keys each) - highly inefficient in terms of amount of metadata writes, -+ * and it puts more strain on the various btree resorting/compacting code. -+ * -+ * The journal is just a log of keys we've inserted; on startup we just reinsert -+ * all the keys in the open journal entries. That means that when we're updating -+ * a node in the btree, we can wait until a 4k block of keys fills up before -+ * writing them out. -+ * -+ * For simplicity, we only journal updates to leaf nodes; updates to parent -+ * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth -+ * the complexity to deal with journalling them (in particular, journal replay) -+ * - updates to non leaf nodes just happen synchronously (see btree_split()). 
-+ */ -+ -+#undef pr_fmt -+#define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include "bcachefs_format.h" -+#include "fifo.h" -+#include "opts.h" -+#include "util.h" -+ -+#include -+ -+#define bch2_fs_init_fault(name) \ -+ dynamic_fault("bcachefs:bch_fs_init:" name) -+#define bch2_meta_read_fault(name) \ -+ dynamic_fault("bcachefs:meta:read:" name) -+#define bch2_meta_write_fault(name) \ -+ dynamic_fault("bcachefs:meta:write:" name) -+ -+#ifdef __KERNEL__ -+#define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name) -+#else -+#define bch2_fmt(_c, fmt) fmt "\n" -+#endif -+ -+#define bch_info(c, fmt, ...) \ -+ printk(KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) -+#define bch_notice(c, fmt, ...) \ -+ printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__) -+#define bch_warn(c, fmt, ...) \ -+ printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) -+#define bch_warn_ratelimited(c, fmt, ...) \ -+ printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) -+#define bch_err(c, fmt, ...) \ -+ printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) -+#define bch_err_ratelimited(c, fmt, ...) \ -+ printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) -+ -+#define bch_verbose(c, fmt, ...) \ -+do { \ -+ if ((c)->opts.verbose) \ -+ bch_info(c, fmt, ##__VA_ARGS__); \ -+} while (0) -+ -+#define pr_verbose_init(opts, fmt, ...) 
\ -+do { \ -+ if (opt_get(opts, verbose)) \ -+ pr_info(fmt, ##__VA_ARGS__); \ -+} while (0) -+ -+/* Parameters that are useful for debugging, but should always be compiled in: */ -+#define BCH_DEBUG_PARAMS_ALWAYS() \ -+ BCH_DEBUG_PARAM(key_merging_disabled, \ -+ "Disables merging of extents") \ -+ BCH_DEBUG_PARAM(btree_gc_always_rewrite, \ -+ "Causes mark and sweep to compact and rewrite every " \ -+ "btree node it traverses") \ -+ BCH_DEBUG_PARAM(btree_gc_rewrite_disabled, \ -+ "Disables rewriting of btree nodes during mark and sweep")\ -+ BCH_DEBUG_PARAM(btree_shrinker_disabled, \ -+ "Disables the shrinker callback for the btree node cache") -+ -+/* Parameters that should only be compiled in in debug mode: */ -+#define BCH_DEBUG_PARAMS_DEBUG() \ -+ BCH_DEBUG_PARAM(expensive_debug_checks, \ -+ "Enables various runtime debugging checks that " \ -+ "significantly affect performance") \ -+ BCH_DEBUG_PARAM(debug_check_iterators, \ -+ "Enables extra verification for btree iterators") \ -+ BCH_DEBUG_PARAM(debug_check_bkeys, \ -+ "Run bkey_debugcheck (primarily checking GC/allocation "\ -+ "information) when iterating over keys") \ -+ BCH_DEBUG_PARAM(verify_btree_ondisk, \ -+ "Reread btree nodes at various points to verify the " \ -+ "mergesort in the read path against modifications " \ -+ "done in memory") \ -+ BCH_DEBUG_PARAM(journal_seq_verify, \ -+ "Store the journal sequence number in the version " \ -+ "number of every btree key, and verify that btree " \ -+ "update ordering is preserved during recovery") \ -+ BCH_DEBUG_PARAM(inject_invalid_keys, \ -+ "Store the journal sequence number in the version " \ -+ "number of every btree key, and verify that btree " \ -+ "update ordering is preserved during recovery") \ -+ BCH_DEBUG_PARAM(test_alloc_startup, \ -+ "Force allocator startup to use the slowpath where it" \ -+ "can't find enough free buckets without invalidating" \ -+ "cached data") \ -+ BCH_DEBUG_PARAM(force_reconstruct_read, \ -+ "Force reads to use the 
reconstruct path, when reading" \ -+ "from erasure coded extents") \ -+ BCH_DEBUG_PARAM(test_restart_gc, \ -+ "Test restarting mark and sweep gc when bucket gens change") -+ -+#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG() -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALL() -+#else -+#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS() -+#endif -+ -+#define BCH_TIME_STATS() \ -+ x(btree_node_mem_alloc) \ -+ x(btree_node_split) \ -+ x(btree_node_sort) \ -+ x(btree_node_read) \ -+ x(btree_gc) \ -+ x(btree_lock_contended_read) \ -+ x(btree_lock_contended_intent) \ -+ x(btree_lock_contended_write) \ -+ x(data_write) \ -+ x(data_read) \ -+ x(data_promote) \ -+ x(journal_write) \ -+ x(journal_delay) \ -+ x(journal_flush_seq) \ -+ x(blocked_journal) \ -+ x(blocked_allocate) \ -+ x(blocked_allocate_open_bucket) -+ -+enum bch_time_stats { -+#define x(name) BCH_TIME_##name, -+ BCH_TIME_STATS() -+#undef x -+ BCH_TIME_STAT_NR -+}; -+ -+#include "alloc_types.h" -+#include "btree_types.h" -+#include "buckets_types.h" -+#include "clock_types.h" -+#include "ec_types.h" -+#include "journal_types.h" -+#include "keylist_types.h" -+#include "quota_types.h" -+#include "rebalance_types.h" -+#include "replicas_types.h" -+#include "super_types.h" -+ -+/* Number of nodes btree coalesce will try to coalesce at once */ -+#define GC_MERGE_NODES 4U -+ -+/* Maximum number of nodes we might need to allocate atomically: */ -+#define BTREE_RESERVE_MAX (BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1)) -+ -+/* Size of the freelist we allocate btree nodes from: */ -+#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 4) -+ -+#define BTREE_NODE_OPEN_BUCKET_RESERVE (BTREE_RESERVE_MAX * BCH_REPLICAS_MAX) -+ -+struct btree; -+ -+enum gc_phase { -+ GC_PHASE_NOT_RUNNING, -+ GC_PHASE_START, -+ GC_PHASE_SB, -+ -+ GC_PHASE_BTREE_EC, -+ GC_PHASE_BTREE_EXTENTS, -+ GC_PHASE_BTREE_INODES, -+ GC_PHASE_BTREE_DIRENTS, -+ GC_PHASE_BTREE_XATTRS, -+ 
GC_PHASE_BTREE_ALLOC, -+ GC_PHASE_BTREE_QUOTAS, -+ GC_PHASE_BTREE_REFLINK, -+ -+ GC_PHASE_PENDING_DELETE, -+ GC_PHASE_ALLOC, -+}; -+ -+struct gc_pos { -+ enum gc_phase phase; -+ struct bpos pos; -+ unsigned level; -+}; -+ -+struct io_count { -+ u64 sectors[2][BCH_DATA_NR]; -+}; -+ -+struct bch_dev { -+ struct kobject kobj; -+ struct percpu_ref ref; -+ struct completion ref_completion; -+ struct percpu_ref io_ref; -+ struct completion io_ref_completion; -+ -+ struct bch_fs *fs; -+ -+ u8 dev_idx; -+ /* -+ * Cached version of this device's member info from superblock -+ * Committed by bch2_write_super() -> bch_fs_mi_update() -+ */ -+ struct bch_member_cpu mi; -+ uuid_le uuid; -+ char name[BDEVNAME_SIZE]; -+ -+ struct bch_sb_handle disk_sb; -+ struct bch_sb *sb_read_scratch; -+ int sb_write_error; -+ -+ struct bch_devs_mask self; -+ -+ /* biosets used in cloned bios for writing multiple replicas */ -+ struct bio_set replica_set; -+ -+ /* -+ * Buckets: -+ * Per-bucket arrays are protected by c->mark_lock, bucket_lock and -+ * gc_lock, for device resize - holding any is sufficient for access: -+ * Or rcu_read_lock(), but only for ptr_stale(): -+ */ -+ struct bucket_array __rcu *buckets[2]; -+ unsigned long *buckets_nouse; -+ struct rw_semaphore bucket_lock; -+ -+ struct bch_dev_usage __percpu *usage[2]; -+ -+ /* Allocator: */ -+ struct task_struct __rcu *alloc_thread; -+ -+ /* -+ * free: Buckets that are ready to be used -+ * -+ * free_inc: Incoming buckets - these are buckets that currently have -+ * cached data in them, and we can't reuse them until after we write -+ * their new gen to disk. 
After prio_write() finishes writing the new -+ * gens/prios, they'll be moved to the free list (and possibly discarded -+ * in the process) -+ */ -+ alloc_fifo free[RESERVE_NR]; -+ alloc_fifo free_inc; -+ -+ open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT]; -+ open_bucket_idx_t open_buckets_partial_nr; -+ -+ size_t fifo_last_bucket; -+ -+ /* last calculated minimum prio */ -+ u16 max_last_bucket_io[2]; -+ -+ size_t inc_gen_needs_gc; -+ size_t inc_gen_really_needs_gc; -+ -+ /* -+ * XXX: this should be an enum for allocator state, so as to include -+ * error state -+ */ -+ enum { -+ ALLOCATOR_STOPPED, -+ ALLOCATOR_RUNNING, -+ ALLOCATOR_BLOCKED, -+ ALLOCATOR_BLOCKED_FULL, -+ } allocator_state; -+ -+ alloc_heap alloc_heap; -+ -+ /* Copying GC: */ -+ struct task_struct *copygc_thread; -+ copygc_heap copygc_heap; -+ struct bch_pd_controller copygc_pd; -+ struct write_point copygc_write_point; -+ u64 copygc_threshold; -+ -+ atomic64_t rebalance_work; -+ -+ struct journal_device journal; -+ -+ struct work_struct io_error_work; -+ -+ /* The rest of this all shows up in sysfs */ -+ atomic64_t cur_latency[2]; -+ struct time_stats io_latency[2]; -+ -+#define CONGESTED_MAX 1024 -+ atomic_t congested; -+ u64 congested_last; -+ -+ struct io_count __percpu *io_done; -+}; -+ -+enum { -+ /* startup: */ -+ BCH_FS_ALLOC_READ_DONE, -+ BCH_FS_ALLOC_CLEAN, -+ BCH_FS_ALLOCATOR_RUNNING, -+ BCH_FS_ALLOCATOR_STOPPING, -+ BCH_FS_INITIAL_GC_DONE, -+ BCH_FS_BTREE_INTERIOR_REPLAY_DONE, -+ BCH_FS_FSCK_DONE, -+ BCH_FS_STARTED, -+ BCH_FS_RW, -+ -+ /* shutdown: */ -+ BCH_FS_STOPPING, -+ BCH_FS_EMERGENCY_RO, -+ BCH_FS_WRITE_DISABLE_COMPLETE, -+ -+ /* errors: */ -+ BCH_FS_ERROR, -+ BCH_FS_ERRORS_FIXED, -+ -+ /* misc: */ -+ BCH_FS_BDEV_MOUNTED, -+ BCH_FS_FIXED_GENS, -+ BCH_FS_ALLOC_WRITTEN, -+ BCH_FS_REBUILD_REPLICAS, -+ BCH_FS_HOLD_BTREE_WRITES, -+}; -+ -+struct btree_debug { -+ unsigned id; -+ struct dentry *btree; -+ struct dentry *btree_format; -+ struct dentry *failed; -+}; -+ -+struct 
bch_fs_pcpu { -+ u64 sectors_available; -+}; -+ -+struct journal_seq_blacklist_table { -+ size_t nr; -+ struct journal_seq_blacklist_table_entry { -+ u64 start; -+ u64 end; -+ bool dirty; -+ } entries[0]; -+}; -+ -+struct journal_keys { -+ struct journal_key { -+ enum btree_id btree_id:8; -+ unsigned level:8; -+ struct bkey_i *k; -+ u32 journal_seq; -+ u32 journal_offset; -+ } *d; -+ size_t nr; -+ u64 journal_seq_base; -+}; -+ -+struct bch_fs { -+ struct closure cl; -+ -+ struct list_head list; -+ struct kobject kobj; -+ struct kobject internal; -+ struct kobject opts_dir; -+ struct kobject time_stats; -+ unsigned long flags; -+ -+ int minor; -+ struct device *chardev; -+ struct super_block *vfs_sb; -+ char name[40]; -+ -+ /* ro/rw, add/remove/resize devices: */ -+ struct rw_semaphore state_lock; -+ -+ /* Counts outstanding writes, for clean transition to read-only */ -+ struct percpu_ref writes; -+ struct work_struct read_only_work; -+ -+ struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX]; -+ -+ struct bch_replicas_cpu replicas; -+ struct bch_replicas_cpu replicas_gc; -+ struct mutex replicas_gc_lock; -+ -+ struct journal_entry_res replicas_journal_res; -+ -+ struct bch_disk_groups_cpu __rcu *disk_groups; -+ -+ struct bch_opts opts; -+ -+ /* Updated by bch2_sb_update():*/ -+ struct { -+ uuid_le uuid; -+ uuid_le user_uuid; -+ -+ u16 version; -+ u16 encoded_extent_max; -+ -+ u8 nr_devices; -+ u8 clean; -+ -+ u8 encryption_type; -+ -+ u64 time_base_lo; -+ u32 time_base_hi; -+ u32 time_precision; -+ u64 features; -+ u64 compat; -+ } sb; -+ -+ struct bch_sb_handle disk_sb; -+ -+ unsigned short block_bits; /* ilog2(block_size) */ -+ -+ u16 btree_foreground_merge_threshold; -+ -+ struct closure sb_write; -+ struct mutex sb_lock; -+ -+ /* BTREE CACHE */ -+ struct bio_set btree_bio; -+ -+ struct btree_root btree_roots[BTREE_ID_NR]; -+ struct mutex btree_root_lock; -+ -+ struct btree_cache btree_cache; -+ -+ /* -+ * Cache of allocated btree nodes - if we allocate a btree node 
and -+ * don't use it, if we free it that space can't be reused until going -+ * _all_ the way through the allocator (which exposes us to a livelock -+ * when allocating btree reserves fail halfway through) - instead, we -+ * can stick them here: -+ */ -+ struct btree_alloc btree_reserve_cache[BTREE_NODE_RESERVE * 2]; -+ unsigned btree_reserve_cache_nr; -+ struct mutex btree_reserve_cache_lock; -+ -+ mempool_t btree_interior_update_pool; -+ struct list_head btree_interior_update_list; -+ struct list_head btree_interior_updates_unwritten; -+ struct mutex btree_interior_update_lock; -+ struct closure_waitlist btree_interior_update_wait; -+ -+ struct workqueue_struct *btree_interior_update_worker; -+ struct work_struct btree_interior_update_work; -+ -+ /* btree_iter.c: */ -+ struct mutex btree_trans_lock; -+ struct list_head btree_trans_list; -+ mempool_t btree_iters_pool; -+ -+ struct btree_key_cache btree_key_cache; -+ -+ struct workqueue_struct *wq; -+ /* copygc needs its own workqueue for index updates.. 
*/ -+ struct workqueue_struct *copygc_wq; -+ struct workqueue_struct *journal_reclaim_wq; -+ -+ /* ALLOCATION */ -+ struct delayed_work pd_controllers_update; -+ unsigned pd_controllers_update_seconds; -+ -+ struct bch_devs_mask rw_devs[BCH_DATA_NR]; -+ -+ u64 capacity; /* sectors */ -+ -+ /* -+ * When capacity _decreases_ (due to a disk being removed), we -+ * increment capacity_gen - this invalidates outstanding reservations -+ * and forces them to be revalidated -+ */ -+ u32 capacity_gen; -+ unsigned bucket_size_max; -+ -+ atomic64_t sectors_available; -+ -+ struct bch_fs_pcpu __percpu *pcpu; -+ -+ struct percpu_rw_semaphore mark_lock; -+ -+ seqcount_t usage_lock; -+ struct bch_fs_usage *usage_base; -+ struct bch_fs_usage __percpu *usage[2]; -+ struct bch_fs_usage __percpu *usage_gc; -+ -+ /* single element mempool: */ -+ struct mutex usage_scratch_lock; -+ struct bch_fs_usage *usage_scratch; -+ -+ /* -+ * When we invalidate buckets, we use both the priority and the amount -+ * of good data to determine which buckets to reuse first - to weight -+ * those together consistently we keep track of the smallest nonzero -+ * priority of any bucket. 
-+ */ -+ struct bucket_clock bucket_clock[2]; -+ -+ struct io_clock io_clock[2]; -+ -+ /* JOURNAL SEQ BLACKLIST */ -+ struct journal_seq_blacklist_table * -+ journal_seq_blacklist_table; -+ struct work_struct journal_seq_blacklist_gc_work; -+ -+ /* ALLOCATOR */ -+ spinlock_t freelist_lock; -+ struct closure_waitlist freelist_wait; -+ u64 blocked_allocate; -+ u64 blocked_allocate_open_bucket; -+ open_bucket_idx_t open_buckets_freelist; -+ open_bucket_idx_t open_buckets_nr_free; -+ struct closure_waitlist open_buckets_wait; -+ struct open_bucket open_buckets[OPEN_BUCKETS_COUNT]; -+ -+ struct write_point btree_write_point; -+ struct write_point rebalance_write_point; -+ -+ struct write_point write_points[WRITE_POINT_MAX]; -+ struct hlist_head write_points_hash[WRITE_POINT_HASH_NR]; -+ struct mutex write_points_hash_lock; -+ unsigned write_points_nr; -+ -+ /* GARBAGE COLLECTION */ -+ struct task_struct *gc_thread; -+ atomic_t kick_gc; -+ unsigned long gc_count; -+ -+ /* -+ * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos] -+ * has been marked by GC. -+ * -+ * gc_cur_phase is a superset of btree_ids (BTREE_ID_EXTENTS etc.) -+ * -+ * Protected by gc_pos_lock. Only written to by GC thread, so GC thread -+ * can read without a lock. -+ */ -+ seqcount_t gc_pos_lock; -+ struct gc_pos gc_pos; -+ -+ /* -+ * The allocation code needs gc_mark in struct bucket to be correct, but -+ * it's not while a gc is in progress. 
-+ */ -+ struct rw_semaphore gc_lock; -+ -+ /* IO PATH */ -+ struct semaphore io_in_flight; -+ struct bio_set bio_read; -+ struct bio_set bio_read_split; -+ struct bio_set bio_write; -+ struct mutex bio_bounce_pages_lock; -+ mempool_t bio_bounce_pages; -+ struct rhashtable promote_table; -+ -+ mempool_t compression_bounce[2]; -+ mempool_t compress_workspace[BCH_COMPRESSION_TYPE_NR]; -+ mempool_t decompress_workspace; -+ ZSTD_parameters zstd_params; -+ -+ struct crypto_shash *sha256; -+ struct crypto_sync_skcipher *chacha20; -+ struct crypto_shash *poly1305; -+ -+ atomic64_t key_version; -+ -+ mempool_t large_bkey_pool; -+ -+ /* REBALANCE */ -+ struct bch_fs_rebalance rebalance; -+ -+ /* STRIPES: */ -+ GENRADIX(struct stripe) stripes[2]; -+ struct mutex ec_stripe_create_lock; -+ -+ ec_stripes_heap ec_stripes_heap; -+ spinlock_t ec_stripes_heap_lock; -+ -+ /* ERASURE CODING */ -+ struct list_head ec_new_stripe_list; -+ struct mutex ec_new_stripe_lock; -+ u64 ec_stripe_hint; -+ -+ struct bio_set ec_bioset; -+ -+ struct work_struct ec_stripe_delete_work; -+ struct llist_head ec_stripe_delete_list; -+ -+ /* REFLINK */ -+ u64 reflink_hint; -+ -+ /* VFS IO PATH - fs-io.c */ -+ struct bio_set writepage_bioset; -+ struct bio_set dio_write_bioset; -+ struct bio_set dio_read_bioset; -+ -+ struct bio_list btree_write_error_list; -+ struct work_struct btree_write_error_work; -+ spinlock_t btree_write_error_lock; -+ -+ /* ERRORS */ -+ struct list_head fsck_errors; -+ struct mutex fsck_error_lock; -+ bool fsck_alloc_err; -+ -+ /* QUOTAS */ -+ struct bch_memquota_type quotas[QTYP_NR]; -+ -+ /* DEBUG JUNK */ -+ struct dentry *debug; -+ struct btree_debug btree_debug[BTREE_ID_NR]; -+#ifdef CONFIG_BCACHEFS_DEBUG -+ struct btree *verify_data; -+ struct btree_node *verify_ondisk; -+ struct mutex verify_lock; -+#endif -+ -+ u64 unused_inode_hint; -+ -+ /* -+ * A btree node on disk could have too many bsets for an iterator to fit -+ * on the stack - have to dynamically allocate them -+ 
*/ -+ mempool_t fill_iter; -+ -+ mempool_t btree_bounce_pool; -+ -+ struct journal journal; -+ struct list_head journal_entries; -+ struct journal_keys journal_keys; -+ -+ u64 last_bucket_seq_cleanup; -+ -+ /* The rest of this all shows up in sysfs */ -+ atomic_long_t read_realloc_races; -+ atomic_long_t extent_migrate_done; -+ atomic_long_t extent_migrate_raced; -+ -+ unsigned btree_gc_periodic:1; -+ unsigned copy_gc_enabled:1; -+ bool promote_whole_extents; -+ -+#define BCH_DEBUG_PARAM(name, description) bool name; -+ BCH_DEBUG_PARAMS_ALL() -+#undef BCH_DEBUG_PARAM -+ -+ struct time_stats times[BCH_TIME_STAT_NR]; -+}; -+ -+static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages) -+{ -+#ifndef NO_BCACHEFS_FS -+ if (c->vfs_sb) -+ c->vfs_sb->s_bdi->ra_pages = ra_pages; -+#endif -+} -+ -+static inline unsigned bucket_bytes(const struct bch_dev *ca) -+{ -+ return ca->mi.bucket_size << 9; -+} -+ -+static inline unsigned block_bytes(const struct bch_fs *c) -+{ -+ return c->opts.block_size << 9; -+} -+ -+static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, u64 time) -+{ -+ return ns_to_timespec64(time * c->sb.time_precision + c->sb.time_base_lo); -+} -+ -+static inline s64 timespec_to_bch2_time(struct bch_fs *c, struct timespec64 ts) -+{ -+ s64 ns = timespec64_to_ns(&ts) - c->sb.time_base_lo; -+ -+ if (c->sb.time_precision == 1) -+ return ns; -+ -+ return div_s64(ns, c->sb.time_precision); -+} -+ -+static inline s64 bch2_current_time(struct bch_fs *c) -+{ -+ struct timespec64 now; -+ -+ ktime_get_coarse_real_ts64(&now); -+ return timespec_to_bch2_time(c, now); -+} -+ -+static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev) -+{ -+ return dev < c->sb.nr_devices && c->devs[dev]; -+} -+ -+#endif /* _BCACHEFS_H */ -diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h -new file mode 100644 -index 000000000000..f808e63a713d ---- /dev/null -+++ b/fs/bcachefs/bcachefs_format.h -@@ -0,0 +1,1666 @@ -+/* 
SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_FORMAT_H -+#define _BCACHEFS_FORMAT_H -+ -+/* -+ * bcachefs on disk data structures -+ * -+ * OVERVIEW: -+ * -+ * There are three main types of on disk data structures in bcachefs (this is -+ * reduced from 5 in bcache) -+ * -+ * - superblock -+ * - journal -+ * - btree -+ * -+ * The btree is the primary structure; most metadata exists as keys in the -+ * various btrees. There are only a small number of btrees, they're not -+ * sharded - we have one btree for extents, another for inodes, et cetera. -+ * -+ * SUPERBLOCK: -+ * -+ * The superblock contains the location of the journal, the list of devices in -+ * the filesystem, and in general any metadata we need in order to decide -+ * whether we can start a filesystem or prior to reading the journal/btree -+ * roots. -+ * -+ * The superblock is extensible, and most of the contents of the superblock are -+ * in variable length, type tagged fields; see struct bch_sb_field. -+ * -+ * Backup superblocks do not reside in a fixed location; also, superblocks do -+ * not have a fixed size. To locate backup superblocks we have struct -+ * bch_sb_layout; we store a copy of this inside every superblock, and also -+ * before the first superblock. -+ * -+ * JOURNAL: -+ * -+ * The journal primarily records btree updates in the order they occurred; -+ * journal replay consists of just iterating over all the keys in the open -+ * journal entries and re-inserting them into the btrees. -+ * -+ * The journal also contains entry types for the btree roots, and blacklisted -+ * journal sequence numbers (see journal_seq_blacklist.c). -+ * -+ * BTREE: -+ * -+ * bcachefs btrees are copy on write b+ trees, where nodes are big (typically -+ * 128k-256k) and log structured. We use struct btree_node for writing the first -+ * entry in a given node (offset 0), and struct btree_node_entry for all -+ * subsequent writes. 
-+ * -+ * After the header, btree node entries contain a list of keys in sorted order. -+ * Values are stored inline with the keys; since values are variable length (and -+ * keys effectively are variable length too, due to packing) we can't do random -+ * access without building up additional in memory tables in the btree node read -+ * path. -+ * -+ * BTREE KEYS (struct bkey): -+ * -+ * The various btrees share a common format for the key - so as to avoid -+ * switching in fastpath lookup/comparison code - but define their own -+ * structures for the key values. -+ * -+ * The size of a key/value pair is stored as a u8 in units of u64s, so the max -+ * size is just under 2k. The common part also contains a type tag for the -+ * value, and a format field indicating whether the key is packed or not (and -+ * also meant to allow adding new key fields in the future, if desired). -+ * -+ * bkeys, when stored within a btree node, may also be packed. In that case, the -+ * bkey_format in that node is used to unpack it. Packed bkeys mean that we can -+ * be generous with field sizes in the common part of the key format (64 bit -+ * inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost. 
-+ */ -+ -+#include -+#include -+#include -+#include -+ -+#define LE_BITMASK(_bits, name, type, field, offset, end) \ -+static const unsigned name##_OFFSET = offset; \ -+static const unsigned name##_BITS = (end - offset); \ -+static const __u##_bits name##_MAX = (1ULL << (end - offset)) - 1; \ -+ \ -+static inline __u64 name(const type *k) \ -+{ \ -+ return (__le##_bits##_to_cpu(k->field) >> offset) & \ -+ ~(~0ULL << (end - offset)); \ -+} \ -+ \ -+static inline void SET_##name(type *k, __u64 v) \ -+{ \ -+ __u##_bits new = __le##_bits##_to_cpu(k->field); \ -+ \ -+ new &= ~(~(~0ULL << (end - offset)) << offset); \ -+ new |= (v & ~(~0ULL << (end - offset))) << offset; \ -+ k->field = __cpu_to_le##_bits(new); \ -+} -+ -+#define LE16_BITMASK(n, t, f, o, e) LE_BITMASK(16, n, t, f, o, e) -+#define LE32_BITMASK(n, t, f, o, e) LE_BITMASK(32, n, t, f, o, e) -+#define LE64_BITMASK(n, t, f, o, e) LE_BITMASK(64, n, t, f, o, e) -+ -+struct bkey_format { -+ __u8 key_u64s; -+ __u8 nr_fields; -+ /* One unused slot for now: */ -+ __u8 bits_per_field[6]; -+ __le64 field_offset[6]; -+}; -+ -+/* Btree keys - all units are in sectors */ -+ -+struct bpos { -+ /* -+ * Word order matches machine byte order - btree code treats a bpos as a -+ * single large integer, for search/comparison purposes -+ * -+ * Note that wherever a bpos is embedded in another on disk data -+ * structure, it has to be byte swabbed when reading in metadata that -+ * wasn't written in native endian order: -+ */ -+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -+ __u32 snapshot; -+ __u64 offset; -+ __u64 inode; -+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -+ __u64 inode; -+ __u64 offset; /* Points to end of extent - sectors */ -+ __u32 snapshot; -+#else -+#error edit for your odd byteorder. 
-+#endif -+} __attribute__((packed, aligned(4))); -+ -+#define KEY_INODE_MAX ((__u64)~0ULL) -+#define KEY_OFFSET_MAX ((__u64)~0ULL) -+#define KEY_SNAPSHOT_MAX ((__u32)~0U) -+#define KEY_SIZE_MAX ((__u32)~0U) -+ -+static inline struct bpos POS(__u64 inode, __u64 offset) -+{ -+ struct bpos ret; -+ -+ ret.inode = inode; -+ ret.offset = offset; -+ ret.snapshot = 0; -+ -+ return ret; -+} -+ -+#define POS_MIN POS(0, 0) -+#define POS_MAX POS(KEY_INODE_MAX, KEY_OFFSET_MAX) -+ -+/* Empty placeholder struct, for container_of() */ -+struct bch_val { -+ __u64 __nothing[0]; -+}; -+ -+struct bversion { -+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -+ __u64 lo; -+ __u32 hi; -+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -+ __u32 hi; -+ __u64 lo; -+#endif -+} __attribute__((packed, aligned(4))); -+ -+struct bkey { -+ /* Size of combined key and value, in u64s */ -+ __u8 u64s; -+ -+ /* Format of key (0 for format local to btree node) */ -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u8 format:7, -+ needs_whiteout:1; -+#elif defined (__BIG_ENDIAN_BITFIELD) -+ __u8 needs_whiteout:1, -+ format:7; -+#else -+#error edit for your odd byteorder. 
-+#endif -+ -+ /* Type of the value */ -+ __u8 type; -+ -+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -+ __u8 pad[1]; -+ -+ struct bversion version; -+ __u32 size; /* extent size, in sectors */ -+ struct bpos p; -+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -+ struct bpos p; -+ __u32 size; /* extent size, in sectors */ -+ struct bversion version; -+ -+ __u8 pad[1]; -+#endif -+} __attribute__((packed, aligned(8))); -+ -+struct bkey_packed { -+ __u64 _data[0]; -+ -+ /* Size of combined key and value, in u64s */ -+ __u8 u64s; -+ -+ /* Format of key (0 for format local to btree node) */ -+ -+ /* -+ * XXX: next incompat on disk format change, switch format and -+ * needs_whiteout - bkey_packed() will be cheaper if format is the high -+ * bits of the bitfield -+ */ -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u8 format:7, -+ needs_whiteout:1; -+#elif defined (__BIG_ENDIAN_BITFIELD) -+ __u8 needs_whiteout:1, -+ format:7; -+#endif -+ -+ /* Type of the value */ -+ __u8 type; -+ __u8 key_start[0]; -+ -+ /* -+ * We copy bkeys with struct assignment in various places, and while -+ * that shouldn't be done with packed bkeys we can't disallow it in C, -+ * and it's legal to cast a bkey to a bkey_packed - so padding it out -+ * to the same size as struct bkey should hopefully be safest. 
-+ */ -+ __u8 pad[sizeof(struct bkey) - 3]; -+} __attribute__((packed, aligned(8))); -+ -+#define BKEY_U64s (sizeof(struct bkey) / sizeof(__u64)) -+#define BKEY_U64s_MAX U8_MAX -+#define BKEY_VAL_U64s_MAX (BKEY_U64s_MAX - BKEY_U64s) -+ -+#define KEY_PACKED_BITS_START 24 -+ -+#define KEY_FORMAT_LOCAL_BTREE 0 -+#define KEY_FORMAT_CURRENT 1 -+ -+enum bch_bkey_fields { -+ BKEY_FIELD_INODE, -+ BKEY_FIELD_OFFSET, -+ BKEY_FIELD_SNAPSHOT, -+ BKEY_FIELD_SIZE, -+ BKEY_FIELD_VERSION_HI, -+ BKEY_FIELD_VERSION_LO, -+ BKEY_NR_FIELDS, -+}; -+ -+#define bkey_format_field(name, field) \ -+ [BKEY_FIELD_##name] = (sizeof(((struct bkey *) NULL)->field) * 8) -+ -+#define BKEY_FORMAT_CURRENT \ -+((struct bkey_format) { \ -+ .key_u64s = BKEY_U64s, \ -+ .nr_fields = BKEY_NR_FIELDS, \ -+ .bits_per_field = { \ -+ bkey_format_field(INODE, p.inode), \ -+ bkey_format_field(OFFSET, p.offset), \ -+ bkey_format_field(SNAPSHOT, p.snapshot), \ -+ bkey_format_field(SIZE, size), \ -+ bkey_format_field(VERSION_HI, version.hi), \ -+ bkey_format_field(VERSION_LO, version.lo), \ -+ }, \ -+}) -+ -+/* bkey with inline value */ -+struct bkey_i { -+ __u64 _data[0]; -+ -+ union { -+ struct { -+ /* Size of combined key and value, in u64s */ -+ __u8 u64s; -+ }; -+ struct { -+ struct bkey k; -+ struct bch_val v; -+ }; -+ }; -+}; -+ -+#define KEY(_inode, _offset, _size) \ -+((struct bkey) { \ -+ .u64s = BKEY_U64s, \ -+ .format = KEY_FORMAT_CURRENT, \ -+ .p = POS(_inode, _offset), \ -+ .size = _size, \ -+}) -+ -+static inline void bkey_init(struct bkey *k) -+{ -+ *k = KEY(0, 0, 0); -+} -+ -+#define bkey_bytes(_k) ((_k)->u64s * sizeof(__u64)) -+ -+#define __BKEY_PADDED(key, pad) \ -+ struct { struct bkey_i key; __u64 key ## _pad[pad]; } -+ -+/* -+ * - DELETED keys are used internally to mark keys that should be ignored but -+ * override keys in composition order. Their version number is ignored. -+ * -+ * - DISCARDED keys indicate that the data is all 0s because it has been -+ * discarded. 
DISCARDs may have a version; if the version is nonzero the key -+ * will be persistent, otherwise the key will be dropped whenever the btree -+ * node is rewritten (like DELETED keys). -+ * -+ * - ERROR: any read of the data returns a read error, as the data was lost due -+ * to a failing device. Like DISCARDED keys, they can be removed (overridden) -+ * by new writes or cluster-wide GC. Node repair can also overwrite them with -+ * the same or a more recent version number, but not with an older version -+ * number. -+ * -+ * - WHITEOUT: for hash table btrees -+*/ -+#define BCH_BKEY_TYPES() \ -+ x(deleted, 0) \ -+ x(discard, 1) \ -+ x(error, 2) \ -+ x(cookie, 3) \ -+ x(whiteout, 4) \ -+ x(btree_ptr, 5) \ -+ x(extent, 6) \ -+ x(reservation, 7) \ -+ x(inode, 8) \ -+ x(inode_generation, 9) \ -+ x(dirent, 10) \ -+ x(xattr, 11) \ -+ x(alloc, 12) \ -+ x(quota, 13) \ -+ x(stripe, 14) \ -+ x(reflink_p, 15) \ -+ x(reflink_v, 16) \ -+ x(inline_data, 17) \ -+ x(btree_ptr_v2, 18) -+ -+enum bch_bkey_type { -+#define x(name, nr) KEY_TYPE_##name = nr, -+ BCH_BKEY_TYPES() -+#undef x -+ KEY_TYPE_MAX, -+}; -+ -+struct bch_cookie { -+ struct bch_val v; -+ __le64 cookie; -+}; -+ -+/* Extents */ -+ -+/* -+ * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally -+ * preceded by checksum/compression information (bch_extent_crc32 or -+ * bch_extent_crc64). -+ * -+ * One major determining factor in the format of extents is how we handle and -+ * represent extents that have been partially overwritten and thus trimmed: -+ * -+ * If an extent is not checksummed or compressed, when the extent is trimmed we -+ * don't have to remember the extent we originally allocated and wrote: we can -+ * merely adjust ptr->offset to point to the start of the data that is currently -+ * live. The size field in struct bkey records the current (live) size of the -+ * extent, and is also used to mean "size of region on disk that we point to" in -+ * this case. 
-+ * -+ * Thus an extent that is not checksummed or compressed will consist only of a -+ * list of bch_extent_ptrs, with none of the fields in -+ * bch_extent_crc32/bch_extent_crc64. -+ * -+ * When an extent is checksummed or compressed, it's not possible to read only -+ * the data that is currently live: we have to read the entire extent that was -+ * originally written, and then return only the part of the extent that is -+ * currently live. -+ * -+ * Thus, in addition to the current size of the extent in struct bkey, we need -+ * to store the size of the originally allocated space - this is the -+ * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also, -+ * when the extent is trimmed, instead of modifying the offset field of the -+ * pointer, we keep a second smaller offset field - "offset into the original -+ * extent of the currently live region". -+ * -+ * The other major determining factor is replication and data migration: -+ * -+ * Each pointer may have its own bch_extent_crc32/64. When doing a replicated -+ * write, we will initially write all the replicas in the same format, with the -+ * same checksum type and compression format - however, when copygc runs later (or -+ * tiering/cache promotion, anything that moves data), it is not in general -+ * going to rewrite all the pointers at once - one of the replicas may be in a -+ * bucket on one device that has very little fragmentation while another lives -+ * in a bucket that has become heavily fragmented, and thus is being rewritten -+ * sooner than the rest. -+ * -+ * Thus it will only move a subset of the pointers (or in the case of -+ * tiering/cache promotion perhaps add a single pointer without dropping any -+ * current pointers), and if the extent has been partially overwritten it must -+ * write only the currently live portion (or copygc would not be able to reduce -+ * fragmentation!) - which necessitates a different bch_extent_crc format for -+ * the new pointer. 
-+ * -+ * But in the interests of space efficiency, we don't want to store one -+ * bch_extent_crc for each pointer if we don't have to. -+ * -+ * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and -+ * bch_extent_ptrs appended arbitrarily one after the other. We determine the -+ * type of a given entry with a scheme similar to utf8 (except we're encoding a -+ * type, not a size), encoding the type in the position of the first set bit: -+ * -+ * bch_extent_crc32 - 0b1 -+ * bch_extent_ptr - 0b10 -+ * bch_extent_crc64 - 0b100 -+ * -+ * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and -+ * bch_extent_crc64 is the least constrained). -+ * -+ * Then, each bch_extent_crc32/64 applies to the pointers that follow after it, -+ * until the next bch_extent_crc32/64. -+ * -+ * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer -+ * is neither checksummed nor compressed. -+ */ -+ -+/* 128 bits, sufficient for cryptographic MACs: */ -+struct bch_csum { -+ __le64 lo; -+ __le64 hi; -+} __attribute__((packed, aligned(8))); -+ -+#define BCH_EXTENT_ENTRY_TYPES() \ -+ x(ptr, 0) \ -+ x(crc32, 1) \ -+ x(crc64, 2) \ -+ x(crc128, 3) \ -+ x(stripe_ptr, 4) -+#define BCH_EXTENT_ENTRY_MAX 5 -+ -+enum bch_extent_entry_type { -+#define x(f, n) BCH_EXTENT_ENTRY_##f = n, -+ BCH_EXTENT_ENTRY_TYPES() -+#undef x -+}; -+ -+/* Compressed/uncompressed size are stored biased by 1: */ -+struct bch_extent_crc32 { -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u32 type:2, -+ _compressed_size:7, -+ _uncompressed_size:7, -+ offset:7, -+ _unused:1, -+ csum_type:4, -+ compression_type:4; -+ __u32 csum; -+#elif defined (__BIG_ENDIAN_BITFIELD) -+ __u32 csum; -+ __u32 compression_type:4, -+ csum_type:4, -+ _unused:1, -+ offset:7, -+ _uncompressed_size:7, -+ _compressed_size:7, -+ type:2; -+#endif -+} __attribute__((packed, aligned(8))); -+ -+#define CRC32_SIZE_MAX (1U << 7) -+#define CRC32_NONCE_MAX 0 -+ -+struct bch_extent_crc64 { -+#if 
defined(__LITTLE_ENDIAN_BITFIELD) -+ __u64 type:3, -+ _compressed_size:9, -+ _uncompressed_size:9, -+ offset:9, -+ nonce:10, -+ csum_type:4, -+ compression_type:4, -+ csum_hi:16; -+#elif defined (__BIG_ENDIAN_BITFIELD) -+ __u64 csum_hi:16, -+ compression_type:4, -+ csum_type:4, -+ nonce:10, -+ offset:9, -+ _uncompressed_size:9, -+ _compressed_size:9, -+ type:3; -+#endif -+ __u64 csum_lo; -+} __attribute__((packed, aligned(8))); -+ -+#define CRC64_SIZE_MAX (1U << 9) -+#define CRC64_NONCE_MAX ((1U << 10) - 1) -+ -+struct bch_extent_crc128 { -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u64 type:4, -+ _compressed_size:13, -+ _uncompressed_size:13, -+ offset:13, -+ nonce:13, -+ csum_type:4, -+ compression_type:4; -+#elif defined (__BIG_ENDIAN_BITFIELD) -+ __u64 compression_type:4, -+ csum_type:4, -+ nonce:13, -+ offset:13, -+ _uncompressed_size:13, -+ _compressed_size:13, -+ type:4; -+#endif -+ struct bch_csum csum; -+} __attribute__((packed, aligned(8))); -+ -+#define CRC128_SIZE_MAX (1U << 13) -+#define CRC128_NONCE_MAX ((1U << 13) - 1) -+ -+/* -+ * @reservation - pointer hasn't been written to, just reserved -+ */ -+struct bch_extent_ptr { -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u64 type:1, -+ cached:1, -+ unused:1, -+ reservation:1, -+ offset:44, /* 8 petabytes */ -+ dev:8, -+ gen:8; -+#elif defined (__BIG_ENDIAN_BITFIELD) -+ __u64 gen:8, -+ dev:8, -+ offset:44, -+ reservation:1, -+ unused:1, -+ cached:1, -+ type:1; -+#endif -+} __attribute__((packed, aligned(8))); -+ -+struct bch_extent_stripe_ptr { -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u64 type:5, -+ block:8, -+ idx:51; -+#elif defined (__BIG_ENDIAN_BITFIELD) -+ __u64 idx:51, -+ block:8, -+ type:5; -+#endif -+}; -+ -+struct bch_extent_reservation { -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u64 type:6, -+ unused:22, -+ replicas:4, -+ generation:32; -+#elif defined (__BIG_ENDIAN_BITFIELD) -+ __u64 generation:32, -+ replicas:4, -+ unused:22, -+ type:6; -+#endif -+}; -+ -+union bch_extent_entry { -+#if 
__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64 -+ unsigned long type; -+#elif __BITS_PER_LONG == 32 -+ struct { -+ unsigned long pad; -+ unsigned long type; -+ }; -+#else -+#error edit for your odd byteorder. -+#endif -+ -+#define x(f, n) struct bch_extent_##f f; -+ BCH_EXTENT_ENTRY_TYPES() -+#undef x -+}; -+ -+struct bch_btree_ptr { -+ struct bch_val v; -+ -+ struct bch_extent_ptr start[0]; -+ __u64 _data[0]; -+} __attribute__((packed, aligned(8))); -+ -+struct bch_btree_ptr_v2 { -+ struct bch_val v; -+ -+ __u64 mem_ptr; -+ __le64 seq; -+ __le16 sectors_written; -+ /* In case we ever decide to do variable size btree nodes: */ -+ __le16 sectors; -+ struct bpos min_key; -+ struct bch_extent_ptr start[0]; -+ __u64 _data[0]; -+} __attribute__((packed, aligned(8))); -+ -+struct bch_extent { -+ struct bch_val v; -+ -+ union bch_extent_entry start[0]; -+ __u64 _data[0]; -+} __attribute__((packed, aligned(8))); -+ -+struct bch_reservation { -+ struct bch_val v; -+ -+ __le32 generation; -+ __u8 nr_replicas; -+ __u8 pad[3]; -+} __attribute__((packed, aligned(8))); -+ -+/* Maximum size (in u64s) a single pointer could be: */ -+#define BKEY_EXTENT_PTR_U64s_MAX\ -+ ((sizeof(struct bch_extent_crc128) + \ -+ sizeof(struct bch_extent_ptr)) / sizeof(u64)) -+ -+/* Maximum possible size of an entire extent value: */ -+#define BKEY_EXTENT_VAL_U64s_MAX \ -+ (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) -+ -+#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX) -+ -+/* * Maximum possible size of an entire extent, key + value: */ -+#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX) -+ -+/* Btree pointers don't carry around checksums: */ -+#define BKEY_BTREE_PTR_VAL_U64s_MAX \ -+ ((sizeof(struct bch_btree_ptr_v2) + \ -+ sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(u64)) -+#define BKEY_BTREE_PTR_U64s_MAX \ -+ (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) -+ -+/* Inodes */ -+ -+#define BLOCKDEV_INODE_MAX 4096 -+ 
-+#define BCACHEFS_ROOT_INO 4096 -+ -+struct bch_inode { -+ struct bch_val v; -+ -+ __le64 bi_hash_seed; -+ __le32 bi_flags; -+ __le16 bi_mode; -+ __u8 fields[0]; -+} __attribute__((packed, aligned(8))); -+ -+struct bch_inode_generation { -+ struct bch_val v; -+ -+ __le32 bi_generation; -+ __le32 pad; -+} __attribute__((packed, aligned(8))); -+ -+#define BCH_INODE_FIELDS() \ -+ x(bi_atime, 64) \ -+ x(bi_ctime, 64) \ -+ x(bi_mtime, 64) \ -+ x(bi_otime, 64) \ -+ x(bi_size, 64) \ -+ x(bi_sectors, 64) \ -+ x(bi_uid, 32) \ -+ x(bi_gid, 32) \ -+ x(bi_nlink, 32) \ -+ x(bi_generation, 32) \ -+ x(bi_dev, 32) \ -+ x(bi_data_checksum, 8) \ -+ x(bi_compression, 8) \ -+ x(bi_project, 32) \ -+ x(bi_background_compression, 8) \ -+ x(bi_data_replicas, 8) \ -+ x(bi_promote_target, 16) \ -+ x(bi_foreground_target, 16) \ -+ x(bi_background_target, 16) \ -+ x(bi_erasure_code, 16) \ -+ x(bi_fields_set, 16) -+ -+/* subset of BCH_INODE_FIELDS */ -+#define BCH_INODE_OPTS() \ -+ x(data_checksum, 8) \ -+ x(compression, 8) \ -+ x(project, 32) \ -+ x(background_compression, 8) \ -+ x(data_replicas, 8) \ -+ x(promote_target, 16) \ -+ x(foreground_target, 16) \ -+ x(background_target, 16) \ -+ x(erasure_code, 16) -+ -+enum inode_opt_id { -+#define x(name, ...) 
\ -+ Inode_opt_##name, -+ BCH_INODE_OPTS() -+#undef x -+ Inode_opt_nr, -+}; -+ -+enum { -+ /* -+ * User flags (get/settable with FS_IOC_*FLAGS, correspond to FS_*_FL -+ * flags) -+ */ -+ __BCH_INODE_SYNC = 0, -+ __BCH_INODE_IMMUTABLE = 1, -+ __BCH_INODE_APPEND = 2, -+ __BCH_INODE_NODUMP = 3, -+ __BCH_INODE_NOATIME = 4, -+ -+ __BCH_INODE_I_SIZE_DIRTY= 5, -+ __BCH_INODE_I_SECTORS_DIRTY= 6, -+ __BCH_INODE_UNLINKED = 7, -+ -+ /* bits 20+ reserved for packed fields below: */ -+}; -+ -+#define BCH_INODE_SYNC (1 << __BCH_INODE_SYNC) -+#define BCH_INODE_IMMUTABLE (1 << __BCH_INODE_IMMUTABLE) -+#define BCH_INODE_APPEND (1 << __BCH_INODE_APPEND) -+#define BCH_INODE_NODUMP (1 << __BCH_INODE_NODUMP) -+#define BCH_INODE_NOATIME (1 << __BCH_INODE_NOATIME) -+#define BCH_INODE_I_SIZE_DIRTY (1 << __BCH_INODE_I_SIZE_DIRTY) -+#define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY) -+#define BCH_INODE_UNLINKED (1 << __BCH_INODE_UNLINKED) -+ -+LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); -+LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 32); -+ -+/* Dirents */ -+ -+/* -+ * Dirents (and xattrs) have to implement string lookups; since our b-tree -+ * doesn't support arbitrary length strings for the key, we instead index by a -+ * 64 bit hash (currently truncated sha1) of the string, stored in the offset -+ * field of the key - using linear probing to resolve hash collisions. This also -+ * provides us with the readdir cookie posix requires. 
-+ * -+ * Linear probing requires us to use whiteouts for deletions, in the event of a -+ * collision: -+ */ -+ -+struct bch_dirent { -+ struct bch_val v; -+ -+ /* Target inode number: */ -+ __le64 d_inum; -+ -+ /* -+ * Copy of mode bits 12-15 from the target inode - so userspace can get -+ * the filetype without having to do a stat() -+ */ -+ __u8 d_type; -+ -+ __u8 d_name[]; -+} __attribute__((packed, aligned(8))); -+ -+#define BCH_NAME_MAX (U8_MAX * sizeof(u64) - \ -+ sizeof(struct bkey) - \ -+ offsetof(struct bch_dirent, d_name)) -+ -+ -+/* Xattrs */ -+ -+#define KEY_TYPE_XATTR_INDEX_USER 0 -+#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1 -+#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2 -+#define KEY_TYPE_XATTR_INDEX_TRUSTED 3 -+#define KEY_TYPE_XATTR_INDEX_SECURITY 4 -+ -+struct bch_xattr { -+ struct bch_val v; -+ __u8 x_type; -+ __u8 x_name_len; -+ __le16 x_val_len; -+ __u8 x_name[]; -+} __attribute__((packed, aligned(8))); -+ -+/* Bucket/allocation information: */ -+ -+struct bch_alloc { -+ struct bch_val v; -+ __u8 fields; -+ __u8 gen; -+ __u8 data[]; -+} __attribute__((packed, aligned(8))); -+ -+#define BCH_ALLOC_FIELDS() \ -+ x(read_time, 16) \ -+ x(write_time, 16) \ -+ x(data_type, 8) \ -+ x(dirty_sectors, 16) \ -+ x(cached_sectors, 16) \ -+ x(oldest_gen, 8) -+ -+enum { -+#define x(name, bytes) BCH_ALLOC_FIELD_##name, -+ BCH_ALLOC_FIELDS() -+#undef x -+ BCH_ALLOC_FIELD_NR -+}; -+ -+static const unsigned BCH_ALLOC_FIELD_BYTES[] = { -+#define x(name, bits) [BCH_ALLOC_FIELD_##name] = bits / 8, -+ BCH_ALLOC_FIELDS() -+#undef x -+}; -+ -+#define x(name, bits) + (bits / 8) -+static const unsigned BKEY_ALLOC_VAL_U64s_MAX = -+ DIV_ROUND_UP(offsetof(struct bch_alloc, data) -+ BCH_ALLOC_FIELDS(), sizeof(u64)); -+#undef x -+ -+#define BKEY_ALLOC_U64s_MAX (BKEY_U64s + BKEY_ALLOC_VAL_U64s_MAX) -+ -+/* Quotas: */ -+ -+enum quota_types { -+ QTYP_USR = 0, -+ QTYP_GRP = 1, -+ QTYP_PRJ = 2, -+ QTYP_NR = 3, -+}; -+ -+enum quota_counters { -+ Q_SPC = 0, -+ Q_INO = 1, -+ 
Q_COUNTERS = 2, -+}; -+ -+struct bch_quota_counter { -+ __le64 hardlimit; -+ __le64 softlimit; -+}; -+ -+struct bch_quota { -+ struct bch_val v; -+ struct bch_quota_counter c[Q_COUNTERS]; -+} __attribute__((packed, aligned(8))); -+ -+/* Erasure coding */ -+ -+struct bch_stripe { -+ struct bch_val v; -+ __le16 sectors; -+ __u8 algorithm; -+ __u8 nr_blocks; -+ __u8 nr_redundant; -+ -+ __u8 csum_granularity_bits; -+ __u8 csum_type; -+ __u8 pad; -+ -+ struct bch_extent_ptr ptrs[0]; -+} __attribute__((packed, aligned(8))); -+ -+/* Reflink: */ -+ -+struct bch_reflink_p { -+ struct bch_val v; -+ __le64 idx; -+ -+ __le32 reservation_generation; -+ __u8 nr_replicas; -+ __u8 pad[3]; -+}; -+ -+struct bch_reflink_v { -+ struct bch_val v; -+ __le64 refcount; -+ union bch_extent_entry start[0]; -+ __u64 _data[0]; -+}; -+ -+/* Inline data */ -+ -+struct bch_inline_data { -+ struct bch_val v; -+ u8 data[0]; -+}; -+ -+/* Optional/variable size superblock sections: */ -+ -+struct bch_sb_field { -+ __u64 _data[0]; -+ __le32 u64s; -+ __le32 type; -+}; -+ -+#define BCH_SB_FIELDS() \ -+ x(journal, 0) \ -+ x(members, 1) \ -+ x(crypt, 2) \ -+ x(replicas_v0, 3) \ -+ x(quota, 4) \ -+ x(disk_groups, 5) \ -+ x(clean, 6) \ -+ x(replicas, 7) \ -+ x(journal_seq_blacklist, 8) -+ -+enum bch_sb_field_type { -+#define x(f, nr) BCH_SB_FIELD_##f = nr, -+ BCH_SB_FIELDS() -+#undef x -+ BCH_SB_FIELD_NR -+}; -+ -+/* BCH_SB_FIELD_journal: */ -+ -+struct bch_sb_field_journal { -+ struct bch_sb_field field; -+ __le64 buckets[0]; -+}; -+ -+/* BCH_SB_FIELD_members: */ -+ -+#define BCH_MIN_NR_NBUCKETS (1 << 6) -+ -+struct bch_member { -+ uuid_le uuid; -+ __le64 nbuckets; /* device size */ -+ __le16 first_bucket; /* index of first bucket used */ -+ __le16 bucket_size; /* sectors */ -+ __le32 pad; -+ __le64 last_mount; /* time_t */ -+ -+ __le64 flags[2]; -+}; -+ -+LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags[0], 0, 4) -+/* 4-10 unused, was TIER, HAS_(META)DATA */ -+LE64_BITMASK(BCH_MEMBER_REPLACEMENT, 
struct bch_member, flags[0], 10, 14) -+LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15) -+LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20) -+LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28) -+LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags[0], 28, 30) -+ -+#define BCH_TIER_MAX 4U -+ -+#if 0 -+LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); -+LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40); -+#endif -+ -+enum bch_member_state { -+ BCH_MEMBER_STATE_RW = 0, -+ BCH_MEMBER_STATE_RO = 1, -+ BCH_MEMBER_STATE_FAILED = 2, -+ BCH_MEMBER_STATE_SPARE = 3, -+ BCH_MEMBER_STATE_NR = 4, -+}; -+ -+enum cache_replacement { -+ CACHE_REPLACEMENT_LRU = 0, -+ CACHE_REPLACEMENT_FIFO = 1, -+ CACHE_REPLACEMENT_RANDOM = 2, -+ CACHE_REPLACEMENT_NR = 3, -+}; -+ -+struct bch_sb_field_members { -+ struct bch_sb_field field; -+ struct bch_member members[0]; -+}; -+ -+/* BCH_SB_FIELD_crypt: */ -+ -+struct nonce { -+ __le32 d[4]; -+}; -+ -+struct bch_key { -+ __le64 key[4]; -+}; -+ -+#define BCH_KEY_MAGIC \ -+ (((u64) 'b' << 0)|((u64) 'c' << 8)| \ -+ ((u64) 'h' << 16)|((u64) '*' << 24)| \ -+ ((u64) '*' << 32)|((u64) 'k' << 40)| \ -+ ((u64) 'e' << 48)|((u64) 'y' << 56)) -+ -+struct bch_encrypted_key { -+ __le64 magic; -+ struct bch_key key; -+}; -+ -+/* -+ * If this field is present in the superblock, it stores an encryption key which -+ * is used encrypt all other data/metadata. The key will normally be encrypted -+ * with the key userspace provides, but if encryption has been turned off we'll -+ * just store the master key unencrypted in the superblock so we can access the -+ * previously encrypted data. 
-+ */ -+struct bch_sb_field_crypt { -+ struct bch_sb_field field; -+ -+ __le64 flags; -+ __le64 kdf_flags; -+ struct bch_encrypted_key key; -+}; -+ -+LE64_BITMASK(BCH_CRYPT_KDF_TYPE, struct bch_sb_field_crypt, flags, 0, 4); -+ -+enum bch_kdf_types { -+ BCH_KDF_SCRYPT = 0, -+ BCH_KDF_NR = 1, -+}; -+ -+/* stored as base 2 log of scrypt params: */ -+LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16); -+LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32); -+LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48); -+ -+/* BCH_SB_FIELD_replicas: */ -+ -+enum bch_data_type { -+ BCH_DATA_NONE = 0, -+ BCH_DATA_SB = 1, -+ BCH_DATA_JOURNAL = 2, -+ BCH_DATA_BTREE = 3, -+ BCH_DATA_USER = 4, -+ BCH_DATA_CACHED = 5, -+ BCH_DATA_NR = 6, -+}; -+ -+struct bch_replicas_entry_v0 { -+ __u8 data_type; -+ __u8 nr_devs; -+ __u8 devs[0]; -+} __attribute__((packed)); -+ -+struct bch_sb_field_replicas_v0 { -+ struct bch_sb_field field; -+ struct bch_replicas_entry_v0 entries[0]; -+} __attribute__((packed, aligned(8))); -+ -+struct bch_replicas_entry { -+ __u8 data_type; -+ __u8 nr_devs; -+ __u8 nr_required; -+ __u8 devs[0]; -+} __attribute__((packed)); -+ -+#define replicas_entry_bytes(_i) \ -+ (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs) -+ -+struct bch_sb_field_replicas { -+ struct bch_sb_field field; -+ struct bch_replicas_entry entries[0]; -+} __attribute__((packed, aligned(8))); -+ -+/* BCH_SB_FIELD_quota: */ -+ -+struct bch_sb_quota_counter { -+ __le32 timelimit; -+ __le32 warnlimit; -+}; -+ -+struct bch_sb_quota_type { -+ __le64 flags; -+ struct bch_sb_quota_counter c[Q_COUNTERS]; -+}; -+ -+struct bch_sb_field_quota { -+ struct bch_sb_field field; -+ struct bch_sb_quota_type q[QTYP_NR]; -+} __attribute__((packed, aligned(8))); -+ -+/* BCH_SB_FIELD_disk_groups: */ -+ -+#define BCH_SB_LABEL_SIZE 32 -+ -+struct bch_disk_group { -+ __u8 label[BCH_SB_LABEL_SIZE]; -+ __le64 flags[2]; -+} __attribute__((packed, 
aligned(8))); -+ -+LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1) -+LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6) -+LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24) -+ -+struct bch_sb_field_disk_groups { -+ struct bch_sb_field field; -+ struct bch_disk_group entries[0]; -+} __attribute__((packed, aligned(8))); -+ -+/* -+ * On clean shutdown, store btree roots and current journal sequence number in -+ * the superblock: -+ */ -+struct jset_entry { -+ __le16 u64s; -+ __u8 btree_id; -+ __u8 level; -+ __u8 type; /* designates what this jset holds */ -+ __u8 pad[3]; -+ -+ union { -+ struct bkey_i start[0]; -+ __u64 _data[0]; -+ }; -+}; -+ -+struct bch_sb_field_clean { -+ struct bch_sb_field field; -+ -+ __le32 flags; -+ __le16 read_clock; -+ __le16 write_clock; -+ __le64 journal_seq; -+ -+ union { -+ struct jset_entry start[0]; -+ __u64 _data[0]; -+ }; -+}; -+ -+struct journal_seq_blacklist_entry { -+ __le64 start; -+ __le64 end; -+}; -+ -+struct bch_sb_field_journal_seq_blacklist { -+ struct bch_sb_field field; -+ -+ union { -+ struct journal_seq_blacklist_entry start[0]; -+ __u64 _data[0]; -+ }; -+}; -+ -+/* Superblock: */ -+ -+/* -+ * New versioning scheme: -+ * One common version number for all on disk data structures - superblock, btree -+ * nodes, journal entries -+ */ -+#define BCH_JSET_VERSION_OLD 2 -+#define BCH_BSET_VERSION_OLD 3 -+ -+enum bcachefs_metadata_version { -+ bcachefs_metadata_version_min = 9, -+ bcachefs_metadata_version_new_versioning = 10, -+ bcachefs_metadata_version_bkey_renumber = 10, -+ bcachefs_metadata_version_inode_btree_change = 11, -+ bcachefs_metadata_version_max = 12, -+}; -+ -+#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) -+ -+#define BCH_SB_SECTOR 8 -+#define BCH_SB_MEMBERS_MAX 64 /* XXX kill */ -+ -+struct bch_sb_layout { -+ uuid_le magic; /* bcachefs superblock UUID */ -+ __u8 layout_type; -+ __u8 sb_max_size_bits; /* base 2 
of 512 byte sectors */ -+ __u8 nr_superblocks; -+ __u8 pad[5]; -+ __le64 sb_offset[61]; -+} __attribute__((packed, aligned(8))); -+ -+#define BCH_SB_LAYOUT_SECTOR 7 -+ -+/* -+ * @offset - sector where this sb was written -+ * @version - on disk format version -+ * @version_min - Oldest metadata version this filesystem contains; so we can -+ * safely drop compatibility code and refuse to mount filesystems -+ * we'd need it for -+ * @magic - identifies as a bcachefs superblock (BCACHE_MAGIC) -+ * @seq - incremented each time superblock is written -+ * @uuid - used for generating various magic numbers and identifying -+ * member devices, never changes -+ * @user_uuid - user visible UUID, may be changed -+ * @label - filesystem label -+ * @seq - identifies most recent superblock, incremented each time -+ * superblock is written -+ * @features - enabled incompatible features -+ */ -+struct bch_sb { -+ struct bch_csum csum; -+ __le16 version; -+ __le16 version_min; -+ __le16 pad[2]; -+ uuid_le magic; -+ uuid_le uuid; -+ uuid_le user_uuid; -+ __u8 label[BCH_SB_LABEL_SIZE]; -+ __le64 offset; -+ __le64 seq; -+ -+ __le16 block_size; -+ __u8 dev_idx; -+ __u8 nr_devices; -+ __le32 u64s; -+ -+ __le64 time_base_lo; -+ __le32 time_base_hi; -+ __le32 time_precision; -+ -+ __le64 flags[8]; -+ __le64 features[2]; -+ __le64 compat[2]; -+ -+ struct bch_sb_layout layout; -+ -+ union { -+ struct bch_sb_field start[0]; -+ __le64 _data[0]; -+ }; -+} __attribute__((packed, aligned(8))); -+ -+/* -+ * Flags: -+ * BCH_SB_INITALIZED - set on first mount -+ * BCH_SB_CLEAN - did we shut down cleanly? Just a hint, doesn't affect -+ * behaviour of mount/recovery path: -+ * BCH_SB_INODE_32BIT - limit inode numbers to 32 bits -+ * BCH_SB_128_BIT_MACS - 128 bit macs instead of 80 -+ * BCH_SB_ENCRYPTION_TYPE - if nonzero encryption is enabled; overrides -+ * DATA/META_CSUM_TYPE. 
Also indicates encryption -+ * algorithm in use, if/when we get more than one -+ */ -+ -+LE16_BITMASK(BCH_SB_BLOCK_SIZE, struct bch_sb, block_size, 0, 16); -+ -+LE64_BITMASK(BCH_SB_INITIALIZED, struct bch_sb, flags[0], 0, 1); -+LE64_BITMASK(BCH_SB_CLEAN, struct bch_sb, flags[0], 1, 2); -+LE64_BITMASK(BCH_SB_CSUM_TYPE, struct bch_sb, flags[0], 2, 8); -+LE64_BITMASK(BCH_SB_ERROR_ACTION, struct bch_sb, flags[0], 8, 12); -+ -+LE64_BITMASK(BCH_SB_BTREE_NODE_SIZE, struct bch_sb, flags[0], 12, 28); -+ -+LE64_BITMASK(BCH_SB_GC_RESERVE, struct bch_sb, flags[0], 28, 33); -+LE64_BITMASK(BCH_SB_ROOT_RESERVE, struct bch_sb, flags[0], 33, 40); -+ -+LE64_BITMASK(BCH_SB_META_CSUM_TYPE, struct bch_sb, flags[0], 40, 44); -+LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE, struct bch_sb, flags[0], 44, 48); -+ -+LE64_BITMASK(BCH_SB_META_REPLICAS_WANT, struct bch_sb, flags[0], 48, 52); -+LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT, struct bch_sb, flags[0], 52, 56); -+ -+LE64_BITMASK(BCH_SB_POSIX_ACL, struct bch_sb, flags[0], 56, 57); -+LE64_BITMASK(BCH_SB_USRQUOTA, struct bch_sb, flags[0], 57, 58); -+LE64_BITMASK(BCH_SB_GRPQUOTA, struct bch_sb, flags[0], 58, 59); -+LE64_BITMASK(BCH_SB_PRJQUOTA, struct bch_sb, flags[0], 59, 60); -+ -+LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61); -+ -+LE64_BITMASK(BCH_SB_REFLINK, struct bch_sb, flags[0], 61, 62); -+ -+/* 61-64 unused */ -+ -+LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4); -+LE64_BITMASK(BCH_SB_COMPRESSION_TYPE, struct bch_sb, flags[1], 4, 8); -+LE64_BITMASK(BCH_SB_INODE_32BIT, struct bch_sb, flags[1], 8, 9); -+ -+LE64_BITMASK(BCH_SB_128_BIT_MACS, struct bch_sb, flags[1], 9, 10); -+LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE, struct bch_sb, flags[1], 10, 14); -+ -+/* -+ * Max size of an extent that may require bouncing to read or write -+ * (checksummed, compressed): 64k -+ */ -+LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS, -+ struct bch_sb, flags[1], 14, 20); -+ -+LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 
20, 24); -+LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28); -+ -+LE64_BITMASK(BCH_SB_PROMOTE_TARGET, struct bch_sb, flags[1], 28, 40); -+LE64_BITMASK(BCH_SB_FOREGROUND_TARGET, struct bch_sb, flags[1], 40, 52); -+LE64_BITMASK(BCH_SB_BACKGROUND_TARGET, struct bch_sb, flags[1], 52, 64); -+ -+LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE, -+ struct bch_sb, flags[2], 0, 4); -+LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64); -+ -+LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); -+ -+/* -+ * Features: -+ * -+ * journal_seq_blacklist_v3: gates BCH_SB_FIELD_journal_seq_blacklist -+ * reflink: gates KEY_TYPE_reflink -+ * inline_data: gates KEY_TYPE_inline_data -+ * new_siphash: gates BCH_STR_HASH_SIPHASH -+ * new_extent_overwrite: gates BTREE_NODE_NEW_EXTENT_OVERWRITE -+ */ -+#define BCH_SB_FEATURES() \ -+ x(lz4, 0) \ -+ x(gzip, 1) \ -+ x(zstd, 2) \ -+ x(atomic_nlink, 3) \ -+ x(ec, 4) \ -+ x(journal_seq_blacklist_v3, 5) \ -+ x(reflink, 6) \ -+ x(new_siphash, 7) \ -+ x(inline_data, 8) \ -+ x(new_extent_overwrite, 9) \ -+ x(incompressible, 10) \ -+ x(btree_ptr_v2, 11) \ -+ x(extents_above_btree_updates, 12) \ -+ x(btree_updates_journalled, 13) -+ -+#define BCH_SB_FEATURES_ALL \ -+ ((1ULL << BCH_FEATURE_new_siphash)| \ -+ (1ULL << BCH_FEATURE_new_extent_overwrite)| \ -+ (1ULL << BCH_FEATURE_btree_ptr_v2)| \ -+ (1ULL << BCH_FEATURE_extents_above_btree_updates)) -+ -+enum bch_sb_feature { -+#define x(f, n) BCH_FEATURE_##f, -+ BCH_SB_FEATURES() -+#undef x -+ BCH_FEATURE_NR, -+}; -+ -+enum bch_sb_compat { -+ BCH_COMPAT_FEAT_ALLOC_INFO = 0, -+ BCH_COMPAT_FEAT_ALLOC_METADATA = 1, -+}; -+ -+/* options: */ -+ -+#define BCH_REPLICAS_MAX 4U -+ -+enum bch_error_actions { -+ BCH_ON_ERROR_CONTINUE = 0, -+ BCH_ON_ERROR_RO = 1, -+ BCH_ON_ERROR_PANIC = 2, -+ BCH_NR_ERROR_ACTIONS = 3, -+}; -+ -+enum bch_str_hash_type { -+ BCH_STR_HASH_CRC32C = 0, -+ BCH_STR_HASH_CRC64 = 1, -+ BCH_STR_HASH_SIPHASH_OLD = 2, -+ BCH_STR_HASH_SIPHASH 
= 3, -+ BCH_STR_HASH_NR = 4, -+}; -+ -+enum bch_str_hash_opts { -+ BCH_STR_HASH_OPT_CRC32C = 0, -+ BCH_STR_HASH_OPT_CRC64 = 1, -+ BCH_STR_HASH_OPT_SIPHASH = 2, -+ BCH_STR_HASH_OPT_NR = 3, -+}; -+ -+enum bch_csum_type { -+ BCH_CSUM_NONE = 0, -+ BCH_CSUM_CRC32C_NONZERO = 1, -+ BCH_CSUM_CRC64_NONZERO = 2, -+ BCH_CSUM_CHACHA20_POLY1305_80 = 3, -+ BCH_CSUM_CHACHA20_POLY1305_128 = 4, -+ BCH_CSUM_CRC32C = 5, -+ BCH_CSUM_CRC64 = 6, -+ BCH_CSUM_NR = 7, -+}; -+ -+static const unsigned bch_crc_bytes[] = { -+ [BCH_CSUM_NONE] = 0, -+ [BCH_CSUM_CRC32C_NONZERO] = 4, -+ [BCH_CSUM_CRC32C] = 4, -+ [BCH_CSUM_CRC64_NONZERO] = 8, -+ [BCH_CSUM_CRC64] = 8, -+ [BCH_CSUM_CHACHA20_POLY1305_80] = 10, -+ [BCH_CSUM_CHACHA20_POLY1305_128] = 16, -+}; -+ -+static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) -+{ -+ switch (type) { -+ case BCH_CSUM_CHACHA20_POLY1305_80: -+ case BCH_CSUM_CHACHA20_POLY1305_128: -+ return true; -+ default: -+ return false; -+ } -+} -+ -+enum bch_csum_opts { -+ BCH_CSUM_OPT_NONE = 0, -+ BCH_CSUM_OPT_CRC32C = 1, -+ BCH_CSUM_OPT_CRC64 = 2, -+ BCH_CSUM_OPT_NR = 3, -+}; -+ -+#define BCH_COMPRESSION_TYPES() \ -+ x(none, 0) \ -+ x(lz4_old, 1) \ -+ x(gzip, 2) \ -+ x(lz4, 3) \ -+ x(zstd, 4) \ -+ x(incompressible, 5) -+ -+enum bch_compression_type { -+#define x(t, n) BCH_COMPRESSION_TYPE_##t, -+ BCH_COMPRESSION_TYPES() -+#undef x -+ BCH_COMPRESSION_TYPE_NR -+}; -+ -+#define BCH_COMPRESSION_OPTS() \ -+ x(none, 0) \ -+ x(lz4, 1) \ -+ x(gzip, 2) \ -+ x(zstd, 3) -+ -+enum bch_compression_opts { -+#define x(t, n) BCH_COMPRESSION_OPT_##t, -+ BCH_COMPRESSION_OPTS() -+#undef x -+ BCH_COMPRESSION_OPT_NR -+}; -+ -+/* -+ * Magic numbers -+ * -+ * The various other data structures have their own magic numbers, which are -+ * xored with the first part of the cache set's UUID -+ */ -+ -+#define BCACHE_MAGIC \ -+ UUID_LE(0xf67385c6, 0x1a4e, 0xca45, \ -+ 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81) -+ -+#define BCACHEFS_STATFS_MAGIC 0xca451a4e -+ -+#define JSET_MAGIC 
__cpu_to_le64(0x245235c1a3625032ULL) -+#define BSET_MAGIC __cpu_to_le64(0x90135c78b99e07f5ULL) -+ -+static inline __le64 __bch2_sb_magic(struct bch_sb *sb) -+{ -+ __le64 ret; -+ memcpy(&ret, &sb->uuid, sizeof(ret)); -+ return ret; -+} -+ -+static inline __u64 __jset_magic(struct bch_sb *sb) -+{ -+ return __le64_to_cpu(__bch2_sb_magic(sb) ^ JSET_MAGIC); -+} -+ -+static inline __u64 __bset_magic(struct bch_sb *sb) -+{ -+ return __le64_to_cpu(__bch2_sb_magic(sb) ^ BSET_MAGIC); -+} -+ -+/* Journal */ -+ -+#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64)) -+ -+#define BCH_JSET_ENTRY_TYPES() \ -+ x(btree_keys, 0) \ -+ x(btree_root, 1) \ -+ x(prio_ptrs, 2) \ -+ x(blacklist, 3) \ -+ x(blacklist_v2, 4) \ -+ x(usage, 5) \ -+ x(data_usage, 6) -+ -+enum { -+#define x(f, nr) BCH_JSET_ENTRY_##f = nr, -+ BCH_JSET_ENTRY_TYPES() -+#undef x -+ BCH_JSET_ENTRY_NR -+}; -+ -+/* -+ * Journal sequence numbers can be blacklisted: bsets record the max sequence -+ * number of all the journal entries they contain updates for, so that on -+ * recovery we can ignore those bsets that contain index updates newer that what -+ * made it into the journal. -+ * -+ * This means that we can't reuse that journal_seq - we have to skip it, and -+ * then record that we skipped it so that the next time we crash and recover we -+ * don't think there was a missing journal entry. 
-+ */ -+struct jset_entry_blacklist { -+ struct jset_entry entry; -+ __le64 seq; -+}; -+ -+struct jset_entry_blacklist_v2 { -+ struct jset_entry entry; -+ __le64 start; -+ __le64 end; -+}; -+ -+enum { -+ FS_USAGE_RESERVED = 0, -+ FS_USAGE_INODES = 1, -+ FS_USAGE_KEY_VERSION = 2, -+ FS_USAGE_NR = 3 -+}; -+ -+struct jset_entry_usage { -+ struct jset_entry entry; -+ __le64 v; -+} __attribute__((packed)); -+ -+struct jset_entry_data_usage { -+ struct jset_entry entry; -+ __le64 v; -+ struct bch_replicas_entry r; -+} __attribute__((packed)); -+ -+/* -+ * On disk format for a journal entry: -+ * seq is monotonically increasing; every journal entry has its own unique -+ * sequence number. -+ * -+ * last_seq is the oldest journal entry that still has keys the btree hasn't -+ * flushed to disk yet. -+ * -+ * version is for on disk format changes. -+ */ -+struct jset { -+ struct bch_csum csum; -+ -+ __le64 magic; -+ __le64 seq; -+ __le32 version; -+ __le32 flags; -+ -+ __le32 u64s; /* size of d[] in u64s */ -+ -+ __u8 encrypted_start[0]; -+ -+ __le16 read_clock; -+ __le16 write_clock; -+ -+ /* Sequence number of oldest dirty journal entry */ -+ __le64 last_seq; -+ -+ -+ union { -+ struct jset_entry start[0]; -+ __u64 _data[0]; -+ }; -+} __attribute__((packed, aligned(8))); -+ -+LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4); -+LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); -+ -+#define BCH_JOURNAL_BUCKETS_MIN 8 -+ -+/* Btree: */ -+ -+#define BCH_BTREE_IDS() \ -+ x(EXTENTS, 0, "extents") \ -+ x(INODES, 1, "inodes") \ -+ x(DIRENTS, 2, "dirents") \ -+ x(XATTRS, 3, "xattrs") \ -+ x(ALLOC, 4, "alloc") \ -+ x(QUOTAS, 5, "quotas") \ -+ x(EC, 6, "stripes") \ -+ x(REFLINK, 7, "reflink") -+ -+enum btree_id { -+#define x(kwd, val, name) BTREE_ID_##kwd = val, -+ BCH_BTREE_IDS() -+#undef x -+ BTREE_ID_NR -+}; -+ -+#define BTREE_MAX_DEPTH 4U -+ -+/* Btree nodes */ -+ -+/* -+ * Btree nodes -+ * -+ * On disk a btree node is a list/log of these; within each set the keys are 
-+ * sorted -+ */ -+struct bset { -+ __le64 seq; -+ -+ /* -+ * Highest journal entry this bset contains keys for. -+ * If on recovery we don't see that journal entry, this bset is ignored: -+ * this allows us to preserve the order of all index updates after a -+ * crash, since the journal records a total order of all index updates -+ * and anything that didn't make it to the journal doesn't get used. -+ */ -+ __le64 journal_seq; -+ -+ __le32 flags; -+ __le16 version; -+ __le16 u64s; /* count of d[] in u64s */ -+ -+ union { -+ struct bkey_packed start[0]; -+ __u64 _data[0]; -+ }; -+} __attribute__((packed, aligned(8))); -+ -+LE32_BITMASK(BSET_CSUM_TYPE, struct bset, flags, 0, 4); -+ -+LE32_BITMASK(BSET_BIG_ENDIAN, struct bset, flags, 4, 5); -+LE32_BITMASK(BSET_SEPARATE_WHITEOUTS, -+ struct bset, flags, 5, 6); -+ -+struct btree_node { -+ struct bch_csum csum; -+ __le64 magic; -+ -+ /* this flags field is encrypted, unlike bset->flags: */ -+ __le64 flags; -+ -+ /* Closed interval: */ -+ struct bpos min_key; -+ struct bpos max_key; -+ struct bch_extent_ptr ptr; -+ struct bkey_format format; -+ -+ union { -+ struct bset keys; -+ struct { -+ __u8 pad[22]; -+ __le16 u64s; -+ __u64 _data[0]; -+ -+ }; -+ }; -+} __attribute__((packed, aligned(8))); -+ -+LE64_BITMASK(BTREE_NODE_ID, struct btree_node, flags, 0, 4); -+LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8); -+LE64_BITMASK(BTREE_NODE_NEW_EXTENT_OVERWRITE, -+ struct btree_node, flags, 8, 9); -+/* 9-32 unused */ -+LE64_BITMASK(BTREE_NODE_SEQ, struct btree_node, flags, 32, 64); -+ -+struct btree_node_entry { -+ struct bch_csum csum; -+ -+ union { -+ struct bset keys; -+ struct { -+ __u8 pad[22]; -+ __le16 u64s; -+ __u64 _data[0]; -+ -+ }; -+ }; -+} __attribute__((packed, aligned(8))); -+ -+#endif /* _BCACHEFS_FORMAT_H */ -diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h -new file mode 100644 -index 000000000000..d71157a3e073 ---- /dev/null -+++ b/fs/bcachefs/bcachefs_ioctl.h -@@ -0,0 
+1,332 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_IOCTL_H -+#define _BCACHEFS_IOCTL_H -+ -+#include -+#include -+#include "bcachefs_format.h" -+ -+/* -+ * Flags common to multiple ioctls: -+ */ -+#define BCH_FORCE_IF_DATA_LOST (1 << 0) -+#define BCH_FORCE_IF_METADATA_LOST (1 << 1) -+#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2) -+#define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3) -+ -+#define BCH_FORCE_IF_DEGRADED \ -+ (BCH_FORCE_IF_DATA_DEGRADED| \ -+ BCH_FORCE_IF_METADATA_DEGRADED) -+ -+/* -+ * If cleared, ioctl that refer to a device pass it as a pointer to a pathname -+ * (e.g. /dev/sda1); if set, the dev field is the device's index within the -+ * filesystem: -+ */ -+#define BCH_BY_INDEX (1 << 4) -+ -+/* -+ * For BCH_IOCTL_READ_SUPER: get superblock of a specific device, not filesystem -+ * wide superblock: -+ */ -+#define BCH_READ_DEV (1 << 5) -+ -+/* global control dev: */ -+ -+/* These are currently broken, and probably unnecessary: */ -+#if 0 -+#define BCH_IOCTL_ASSEMBLE _IOW(0xbc, 1, struct bch_ioctl_assemble) -+#define BCH_IOCTL_INCREMENTAL _IOW(0xbc, 2, struct bch_ioctl_incremental) -+ -+struct bch_ioctl_assemble { -+ __u32 flags; -+ __u32 nr_devs; -+ __u64 pad; -+ __u64 devs[]; -+}; -+ -+struct bch_ioctl_incremental { -+ __u32 flags; -+ __u64 pad; -+ __u64 dev; -+}; -+#endif -+ -+/* filesystem ioctls: */ -+ -+#define BCH_IOCTL_QUERY_UUID _IOR(0xbc, 1, struct bch_ioctl_query_uuid) -+ -+/* These only make sense when we also have incremental assembly */ -+#if 0 -+#define BCH_IOCTL_START _IOW(0xbc, 2, struct bch_ioctl_start) -+#define BCH_IOCTL_STOP _IO(0xbc, 3) -+#endif -+ -+#define BCH_IOCTL_DISK_ADD _IOW(0xbc, 4, struct bch_ioctl_disk) -+#define BCH_IOCTL_DISK_REMOVE _IOW(0xbc, 5, struct bch_ioctl_disk) -+#define BCH_IOCTL_DISK_ONLINE _IOW(0xbc, 6, struct bch_ioctl_disk) -+#define BCH_IOCTL_DISK_OFFLINE _IOW(0xbc, 7, struct bch_ioctl_disk) -+#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc, 8, struct bch_ioctl_disk_set_state) -+#define 
BCH_IOCTL_DATA _IOW(0xbc, 10, struct bch_ioctl_data) -+#define BCH_IOCTL_FS_USAGE _IOWR(0xbc, 11, struct bch_ioctl_fs_usage) -+#define BCH_IOCTL_DEV_USAGE _IOWR(0xbc, 11, struct bch_ioctl_dev_usage) -+#define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super) -+#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx) -+#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize) -+ -+/* ioctl below act on a particular file, not the filesystem as a whole: */ -+ -+#define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *) -+ -+/* -+ * BCH_IOCTL_QUERY_UUID: get filesystem UUID -+ * -+ * Returns user visible UUID, not internal UUID (which may not ever be changed); -+ * the filesystem's sysfs directory may be found under /sys/fs/bcachefs with -+ * this UUID. -+ */ -+struct bch_ioctl_query_uuid { -+ uuid_le uuid; -+}; -+ -+#if 0 -+struct bch_ioctl_start { -+ __u32 flags; -+ __u32 pad; -+}; -+#endif -+ -+/* -+ * BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem -+ * -+ * The specified device must not be open or in use. On success, the new device -+ * will be an online member of the filesystem just like any other member. -+ * -+ * The device must first be prepared by userspace by formatting with a bcachefs -+ * superblock, which is only used for passing in superblock options/parameters -+ * for that device (in struct bch_member). The new device's superblock should -+ * not claim to be a member of any existing filesystem - UUIDs on it will be -+ * ignored. -+ */ -+ -+/* -+ * BCH_IOCTL_DISK_REMOVE: permanently remove a member device from a filesystem -+ * -+ * Any data present on @dev will be permanently deleted, and @dev will be -+ * removed from its slot in the filesystem's list of member devices. The device -+ * may be either offline or offline. 
-+ * -+ * Will fail removing @dev would leave us with insufficient read write devices -+ * or degraded/unavailable data, unless the approprate BCH_FORCE_IF_* flags are -+ * set. -+ */ -+ -+/* -+ * BCH_IOCTL_DISK_ONLINE: given a disk that is already a member of a filesystem -+ * but is not open (e.g. because we started in degraded mode), bring it online -+ * -+ * all existing data on @dev will be available once the device is online, -+ * exactly as if @dev was present when the filesystem was first mounted -+ */ -+ -+/* -+ * BCH_IOCTL_DISK_OFFLINE: offline a disk, causing the kernel to close that -+ * block device, without removing it from the filesystem (so it can be brought -+ * back online later) -+ * -+ * Data present on @dev will be unavailable while @dev is offline (unless -+ * replicated), but will still be intact and untouched if @dev is brought back -+ * online -+ * -+ * Will fail (similarly to BCH_IOCTL_DISK_SET_STATE) if offlining @dev would -+ * leave us with insufficient read write devices or degraded/unavailable data, -+ * unless the approprate BCH_FORCE_IF_* flags are set. -+ */ -+ -+struct bch_ioctl_disk { -+ __u32 flags; -+ __u32 pad; -+ __u64 dev; -+}; -+ -+/* -+ * BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem -+ * -+ * @new_state - one of the bch_member_state states (rw, ro, failed, -+ * spare) -+ * -+ * Will refuse to change member state if we would then have insufficient devices -+ * to write to, or if it would result in degraded data (when @new_state is -+ * failed or spare) unless the appropriate BCH_FORCE_IF_* flags are set. -+ */ -+struct bch_ioctl_disk_set_state { -+ __u32 flags; -+ __u8 new_state; -+ __u8 pad[3]; -+ __u64 dev; -+}; -+ -+enum bch_data_ops { -+ BCH_DATA_OP_SCRUB = 0, -+ BCH_DATA_OP_REREPLICATE = 1, -+ BCH_DATA_OP_MIGRATE = 2, -+ BCH_DATA_OP_NR = 3, -+}; -+ -+/* -+ * BCH_IOCTL_DATA: operations that walk and manipulate filesystem data (e.g. -+ * scrub, rereplicate, migrate). 
-+ * -+ * This ioctl kicks off a job in the background, and returns a file descriptor. -+ * Reading from the file descriptor returns a struct bch_ioctl_data_event, -+ * indicating current progress, and closing the file descriptor will stop the -+ * job. The file descriptor is O_CLOEXEC. -+ */ -+struct bch_ioctl_data { -+ __u32 op; -+ __u32 flags; -+ -+ struct bpos start; -+ struct bpos end; -+ -+ union { -+ struct { -+ __u32 dev; -+ __u32 pad; -+ } migrate; -+ struct { -+ __u64 pad[8]; -+ }; -+ }; -+} __attribute__((packed, aligned(8))); -+ -+enum bch_data_event { -+ BCH_DATA_EVENT_PROGRESS = 0, -+ /* XXX: add an event for reporting errors */ -+ BCH_DATA_EVENT_NR = 1, -+}; -+ -+struct bch_ioctl_data_progress { -+ __u8 data_type; -+ __u8 btree_id; -+ __u8 pad[2]; -+ struct bpos pos; -+ -+ __u64 sectors_done; -+ __u64 sectors_total; -+} __attribute__((packed, aligned(8))); -+ -+struct bch_ioctl_data_event { -+ __u8 type; -+ __u8 pad[7]; -+ union { -+ struct bch_ioctl_data_progress p; -+ __u64 pad2[15]; -+ }; -+} __attribute__((packed, aligned(8))); -+ -+struct bch_replicas_usage { -+ __u64 sectors; -+ struct bch_replicas_entry r; -+} __attribute__((packed)); -+ -+static inline struct bch_replicas_usage * -+replicas_usage_next(struct bch_replicas_usage *u) -+{ -+ return (void *) u + replicas_entry_bytes(&u->r) + 8; -+} -+ -+/* -+ * BCH_IOCTL_FS_USAGE: query filesystem disk space usage -+ * -+ * Returns disk space usage broken out by data type, number of replicas, and -+ * by component device -+ * -+ * @replica_entries_bytes - size, in bytes, allocated for replica usage entries -+ * -+ * On success, @replica_entries_bytes will be changed to indicate the number of -+ * bytes actually used. 
-+ * -+ * Returns -ERANGE if @replica_entries_bytes was too small -+ */ -+struct bch_ioctl_fs_usage { -+ __u64 capacity; -+ __u64 used; -+ __u64 online_reserved; -+ __u64 persistent_reserved[BCH_REPLICAS_MAX]; -+ -+ __u32 replica_entries_bytes; -+ __u32 pad; -+ -+ struct bch_replicas_usage replicas[0]; -+}; -+ -+/* -+ * BCH_IOCTL_DEV_USAGE: query device disk space usage -+ * -+ * Returns disk space usage broken out by data type - both by buckets and -+ * sectors. -+ */ -+struct bch_ioctl_dev_usage { -+ __u64 dev; -+ __u32 flags; -+ __u8 state; -+ __u8 pad[7]; -+ -+ __u32 bucket_size; -+ __u64 nr_buckets; -+ __u64 available_buckets; -+ -+ __u64 buckets[BCH_DATA_NR]; -+ __u64 sectors[BCH_DATA_NR]; -+ -+ __u64 ec_buckets; -+ __u64 ec_sectors; -+}; -+ -+/* -+ * BCH_IOCTL_READ_SUPER: read filesystem superblock -+ * -+ * Equivalent to reading the superblock directly from the block device, except -+ * avoids racing with the kernel writing the superblock or having to figure out -+ * which block device to read -+ * -+ * @sb - buffer to read into -+ * @size - size of userspace allocated buffer -+ * @dev - device to read superblock for, if BCH_READ_DEV flag is -+ * specified -+ * -+ * Returns -ERANGE if buffer provided is too small -+ */ -+struct bch_ioctl_read_super { -+ __u32 flags; -+ __u32 pad; -+ __u64 dev; -+ __u64 size; -+ __u64 sb; -+}; -+ -+/* -+ * BCH_IOCTL_DISK_GET_IDX: give a path to a block device, query filesystem to -+ * determine if disk is a (online) member - if so, returns device's index -+ * -+ * Returns -ENOENT if not found -+ */ -+struct bch_ioctl_disk_get_idx { -+ __u64 dev; -+}; -+ -+/* -+ * BCH_IOCTL_DISK_RESIZE: resize filesystem on a device -+ * -+ * @dev - member to resize -+ * @nbuckets - new number of buckets -+ */ -+struct bch_ioctl_disk_resize { -+ __u32 flags; -+ __u32 pad; -+ __u64 dev; -+ __u64 nbuckets; -+}; -+ -+#endif /* _BCACHEFS_IOCTL_H */ -diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c -new file mode 100644 -index 
000000000000..4d0c9129cd4a ---- /dev/null -+++ b/fs/bcachefs/bkey.c -@@ -0,0 +1,1154 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "bkey.h" -+#include "bkey_methods.h" -+#include "bset.h" -+#include "util.h" -+ -+#undef EBUG_ON -+ -+#ifdef DEBUG_BKEYS -+#define EBUG_ON(cond) BUG_ON(cond) -+#else -+#define EBUG_ON(cond) -+#endif -+ -+const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT; -+ -+struct bkey __bch2_bkey_unpack_key(const struct bkey_format *, -+ const struct bkey_packed *); -+ -+void bch2_to_binary(char *out, const u64 *p, unsigned nr_bits) -+{ -+ unsigned bit = high_bit_offset, done = 0; -+ -+ while (1) { -+ while (bit < 64) { -+ if (done && !(done % 8)) -+ *out++ = ' '; -+ *out++ = *p & (1ULL << (63 - bit)) ? '1' : '0'; -+ bit++; -+ done++; -+ if (done == nr_bits) { -+ *out++ = '\0'; -+ return; -+ } -+ } -+ -+ p = next_word(p); -+ bit = 0; -+ } -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ -+static void bch2_bkey_pack_verify(const struct bkey_packed *packed, -+ const struct bkey *unpacked, -+ const struct bkey_format *format) -+{ -+ struct bkey tmp; -+ -+ BUG_ON(bkeyp_val_u64s(format, packed) != -+ bkey_val_u64s(unpacked)); -+ -+ BUG_ON(packed->u64s < bkeyp_key_u64s(format, packed)); -+ -+ tmp = __bch2_bkey_unpack_key(format, packed); -+ -+ if (memcmp(&tmp, unpacked, sizeof(struct bkey))) { -+ char buf1[160], buf2[160]; -+ char buf3[160], buf4[160]; -+ -+ bch2_bkey_to_text(&PBUF(buf1), unpacked); -+ bch2_bkey_to_text(&PBUF(buf2), &tmp); -+ bch2_to_binary(buf3, (void *) unpacked, 80); -+ bch2_to_binary(buf4, high_word(format, packed), 80); -+ -+ panic("keys differ: format u64s %u fields %u %u %u %u %u\n%s\n%s\n%s\n%s\n", -+ format->key_u64s, -+ format->bits_per_field[0], -+ format->bits_per_field[1], -+ format->bits_per_field[2], -+ format->bits_per_field[3], -+ format->bits_per_field[4], -+ buf1, buf2, buf3, buf4); -+ } -+} -+ -+#else -+static inline void bch2_bkey_pack_verify(const struct bkey_packed 
*packed, -+ const struct bkey *unpacked, -+ const struct bkey_format *format) {} -+#endif -+ -+struct pack_state { -+ const struct bkey_format *format; -+ unsigned bits; /* bits remaining in current word */ -+ u64 w; /* current word */ -+ u64 *p; /* pointer to next word */ -+}; -+ -+__always_inline -+static struct pack_state pack_state_init(const struct bkey_format *format, -+ struct bkey_packed *k) -+{ -+ u64 *p = high_word(format, k); -+ -+ return (struct pack_state) { -+ .format = format, -+ .bits = 64 - high_bit_offset, -+ .w = 0, -+ .p = p, -+ }; -+} -+ -+__always_inline -+static void pack_state_finish(struct pack_state *state, -+ struct bkey_packed *k) -+{ -+ EBUG_ON(state->p < k->_data); -+ EBUG_ON(state->p >= k->_data + state->format->key_u64s); -+ -+ *state->p = state->w; -+} -+ -+struct unpack_state { -+ const struct bkey_format *format; -+ unsigned bits; /* bits remaining in current word */ -+ u64 w; /* current word */ -+ const u64 *p; /* pointer to next word */ -+}; -+ -+__always_inline -+static struct unpack_state unpack_state_init(const struct bkey_format *format, -+ const struct bkey_packed *k) -+{ -+ const u64 *p = high_word(format, k); -+ -+ return (struct unpack_state) { -+ .format = format, -+ .bits = 64 - high_bit_offset, -+ .w = *p << high_bit_offset, -+ .p = p, -+ }; -+} -+ -+__always_inline -+static u64 get_inc_field(struct unpack_state *state, unsigned field) -+{ -+ unsigned bits = state->format->bits_per_field[field]; -+ u64 v = 0, offset = le64_to_cpu(state->format->field_offset[field]); -+ -+ if (bits >= state->bits) { -+ v = state->w >> (64 - bits); -+ bits -= state->bits; -+ -+ state->p = next_word(state->p); -+ state->w = *state->p; -+ state->bits = 64; -+ } -+ -+ /* avoid shift by 64 if bits is 0 - bits is never 64 here: */ -+ v |= (state->w >> 1) >> (63 - bits); -+ state->w <<= bits; -+ state->bits -= bits; -+ -+ return v + offset; -+} -+ -+__always_inline -+static bool set_inc_field(struct pack_state *state, unsigned field, u64 v) 
-+{ -+ unsigned bits = state->format->bits_per_field[field]; -+ u64 offset = le64_to_cpu(state->format->field_offset[field]); -+ -+ if (v < offset) -+ return false; -+ -+ v -= offset; -+ -+ if (fls64(v) > bits) -+ return false; -+ -+ if (bits > state->bits) { -+ bits -= state->bits; -+ /* avoid shift by 64 if bits is 0 - bits is never 64 here: */ -+ state->w |= (v >> 1) >> (bits - 1); -+ -+ *state->p = state->w; -+ state->p = next_word(state->p); -+ state->w = 0; -+ state->bits = 64; -+ } -+ -+ state->bits -= bits; -+ state->w |= v << state->bits; -+ -+ return true; -+} -+ -+/* -+ * Note: does NOT set out->format (we don't know what it should be here!) -+ * -+ * Also: doesn't work on extents - it doesn't preserve the invariant that -+ * if k is packed bkey_start_pos(k) will successfully pack -+ */ -+static bool bch2_bkey_transform_key(const struct bkey_format *out_f, -+ struct bkey_packed *out, -+ const struct bkey_format *in_f, -+ const struct bkey_packed *in) -+{ -+ struct pack_state out_s = pack_state_init(out_f, out); -+ struct unpack_state in_s = unpack_state_init(in_f, in); -+ unsigned i; -+ -+ out->_data[0] = 0; -+ -+ for (i = 0; i < BKEY_NR_FIELDS; i++) -+ if (!set_inc_field(&out_s, i, get_inc_field(&in_s, i))) -+ return false; -+ -+ /* Can't happen because the val would be too big to unpack: */ -+ EBUG_ON(in->u64s - in_f->key_u64s + out_f->key_u64s > U8_MAX); -+ -+ pack_state_finish(&out_s, out); -+ out->u64s = out_f->key_u64s + in->u64s - in_f->key_u64s; -+ out->needs_whiteout = in->needs_whiteout; -+ out->type = in->type; -+ -+ return true; -+} -+ -+bool bch2_bkey_transform(const struct bkey_format *out_f, -+ struct bkey_packed *out, -+ const struct bkey_format *in_f, -+ const struct bkey_packed *in) -+{ -+ if (!bch2_bkey_transform_key(out_f, out, in_f, in)) -+ return false; -+ -+ memcpy_u64s((u64 *) out + out_f->key_u64s, -+ (u64 *) in + in_f->key_u64s, -+ (in->u64s - in_f->key_u64s)); -+ return true; -+} -+ -+#define bkey_fields() \ -+ 
x(BKEY_FIELD_INODE, p.inode) \ -+ x(BKEY_FIELD_OFFSET, p.offset) \ -+ x(BKEY_FIELD_SNAPSHOT, p.snapshot) \ -+ x(BKEY_FIELD_SIZE, size) \ -+ x(BKEY_FIELD_VERSION_HI, version.hi) \ -+ x(BKEY_FIELD_VERSION_LO, version.lo) -+ -+struct bkey __bch2_bkey_unpack_key(const struct bkey_format *format, -+ const struct bkey_packed *in) -+{ -+ struct unpack_state state = unpack_state_init(format, in); -+ struct bkey out; -+ -+ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); -+ EBUG_ON(in->u64s < format->key_u64s); -+ EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE); -+ EBUG_ON(in->u64s - format->key_u64s + BKEY_U64s > U8_MAX); -+ -+ out.u64s = BKEY_U64s + in->u64s - format->key_u64s; -+ out.format = KEY_FORMAT_CURRENT; -+ out.needs_whiteout = in->needs_whiteout; -+ out.type = in->type; -+ out.pad[0] = 0; -+ -+#define x(id, field) out.field = get_inc_field(&state, id); -+ bkey_fields() -+#undef x -+ -+ return out; -+} -+ -+#ifndef HAVE_BCACHEFS_COMPILED_UNPACK -+struct bpos __bkey_unpack_pos(const struct bkey_format *format, -+ const struct bkey_packed *in) -+{ -+ struct unpack_state state = unpack_state_init(format, in); -+ struct bpos out; -+ -+ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); -+ EBUG_ON(in->u64s < format->key_u64s); -+ EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE); -+ -+ out.inode = get_inc_field(&state, BKEY_FIELD_INODE); -+ out.offset = get_inc_field(&state, BKEY_FIELD_OFFSET); -+ out.snapshot = get_inc_field(&state, BKEY_FIELD_SNAPSHOT); -+ -+ return out; -+} -+#endif -+ -+/** -+ * bch2_bkey_pack_key -- pack just the key, not the value -+ */ -+bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in, -+ const struct bkey_format *format) -+{ -+ struct pack_state state = pack_state_init(format, out); -+ -+ EBUG_ON((void *) in == (void *) out); -+ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); -+ EBUG_ON(in->format != KEY_FORMAT_CURRENT); -+ -+ out->_data[0] = 0; -+ -+#define x(id, field) if (!set_inc_field(&state, id, in->field)) return false; -+ 
bkey_fields() -+#undef x -+ -+ /* -+ * Extents - we have to guarantee that if an extent is packed, a trimmed -+ * version will also pack: -+ */ -+ if (bkey_start_offset(in) < -+ le64_to_cpu(format->field_offset[BKEY_FIELD_OFFSET])) -+ return false; -+ -+ pack_state_finish(&state, out); -+ out->u64s = format->key_u64s + in->u64s - BKEY_U64s; -+ out->format = KEY_FORMAT_LOCAL_BTREE; -+ out->needs_whiteout = in->needs_whiteout; -+ out->type = in->type; -+ -+ bch2_bkey_pack_verify(out, in, format); -+ return true; -+} -+ -+/** -+ * bch2_bkey_unpack -- unpack the key and the value -+ */ -+void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst, -+ const struct bkey_packed *src) -+{ -+ __bkey_unpack_key(b, &dst->k, src); -+ -+ memcpy_u64s(&dst->v, -+ bkeyp_val(&b->format, src), -+ bkeyp_val_u64s(&b->format, src)); -+} -+ -+/** -+ * bch2_bkey_pack -- pack the key and the value -+ */ -+bool bch2_bkey_pack(struct bkey_packed *out, const struct bkey_i *in, -+ const struct bkey_format *format) -+{ -+ struct bkey_packed tmp; -+ -+ if (!bch2_bkey_pack_key(&tmp, &in->k, format)) -+ return false; -+ -+ memmove_u64s((u64 *) out + format->key_u64s, -+ &in->v, -+ bkey_val_u64s(&in->k)); -+ memcpy_u64s(out, &tmp, format->key_u64s); -+ -+ return true; -+} -+ -+__always_inline -+static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v) -+{ -+ unsigned bits = state->format->bits_per_field[field]; -+ u64 offset = le64_to_cpu(state->format->field_offset[field]); -+ bool ret = true; -+ -+ EBUG_ON(v < offset); -+ v -= offset; -+ -+ if (fls64(v) > bits) { -+ v = ~(~0ULL << bits); -+ ret = false; -+ } -+ -+ if (bits > state->bits) { -+ bits -= state->bits; -+ state->w |= (v >> 1) >> (bits - 1); -+ -+ *state->p = state->w; -+ state->p = next_word(state->p); -+ state->w = 0; -+ state->bits = 64; -+ } -+ -+ state->bits -= bits; -+ state->w |= v << state->bits; -+ -+ return ret; -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+static bool bkey_packed_successor(struct 
bkey_packed *out, -+ const struct btree *b, -+ struct bkey_packed k) -+{ -+ const struct bkey_format *f = &b->format; -+ unsigned nr_key_bits = b->nr_key_bits; -+ unsigned first_bit, offset; -+ u64 *p; -+ -+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); -+ -+ if (!nr_key_bits) -+ return false; -+ -+ *out = k; -+ -+ first_bit = high_bit_offset + nr_key_bits - 1; -+ p = nth_word(high_word(f, out), first_bit >> 6); -+ offset = 63 - (first_bit & 63); -+ -+ while (nr_key_bits) { -+ unsigned bits = min(64 - offset, nr_key_bits); -+ u64 mask = (~0ULL >> (64 - bits)) << offset; -+ -+ if ((*p & mask) != mask) { -+ *p += 1ULL << offset; -+ EBUG_ON(bkey_cmp_packed(b, out, &k) <= 0); -+ return true; -+ } -+ -+ *p &= ~mask; -+ p = prev_word(p); -+ nr_key_bits -= bits; -+ offset = 0; -+ } -+ -+ return false; -+} -+#endif -+ -+/* -+ * Returns a packed key that compares <= in -+ * -+ * This is used in bset_search_tree(), where we need a packed pos in order to be -+ * able to compare against the keys in the auxiliary search tree - and it's -+ * legal to use a packed pos that isn't equivalent to the original pos, -+ * _provided_ it compares <= to the original pos. 
-+ */ -+enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out, -+ struct bpos in, -+ const struct btree *b) -+{ -+ const struct bkey_format *f = &b->format; -+ struct pack_state state = pack_state_init(f, out); -+#ifdef CONFIG_BCACHEFS_DEBUG -+ struct bpos orig = in; -+#endif -+ bool exact = true; -+ -+ out->_data[0] = 0; -+ -+ if (unlikely(in.snapshot < -+ le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]))) { -+ if (!in.offset-- && -+ !in.inode--) -+ return BKEY_PACK_POS_FAIL; -+ in.snapshot = KEY_SNAPSHOT_MAX; -+ exact = false; -+ } -+ -+ if (unlikely(in.offset < -+ le64_to_cpu(f->field_offset[BKEY_FIELD_OFFSET]))) { -+ if (!in.inode--) -+ return BKEY_PACK_POS_FAIL; -+ in.offset = KEY_OFFSET_MAX; -+ in.snapshot = KEY_SNAPSHOT_MAX; -+ exact = false; -+ } -+ -+ if (unlikely(in.inode < -+ le64_to_cpu(f->field_offset[BKEY_FIELD_INODE]))) -+ return BKEY_PACK_POS_FAIL; -+ -+ if (!set_inc_field_lossy(&state, BKEY_FIELD_INODE, in.inode)) { -+ in.offset = KEY_OFFSET_MAX; -+ in.snapshot = KEY_SNAPSHOT_MAX; -+ exact = false; -+ } -+ -+ if (!set_inc_field_lossy(&state, BKEY_FIELD_OFFSET, in.offset)) { -+ in.snapshot = KEY_SNAPSHOT_MAX; -+ exact = false; -+ } -+ -+ if (!set_inc_field_lossy(&state, BKEY_FIELD_SNAPSHOT, in.snapshot)) -+ exact = false; -+ -+ pack_state_finish(&state, out); -+ out->u64s = f->key_u64s; -+ out->format = KEY_FORMAT_LOCAL_BTREE; -+ out->type = KEY_TYPE_deleted; -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ if (exact) { -+ BUG_ON(bkey_cmp_left_packed(b, out, &orig)); -+ } else { -+ struct bkey_packed successor; -+ -+ BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0); -+ BUG_ON(bkey_packed_successor(&successor, b, *out) && -+ bkey_cmp_left_packed(b, &successor, &orig) < 0); -+ } -+#endif -+ -+ return exact ? 
BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER; -+} -+ -+void bch2_bkey_format_init(struct bkey_format_state *s) -+{ -+ unsigned i; -+ -+ for (i = 0; i < ARRAY_SIZE(s->field_min); i++) -+ s->field_min[i] = U64_MAX; -+ -+ for (i = 0; i < ARRAY_SIZE(s->field_max); i++) -+ s->field_max[i] = 0; -+ -+ /* Make sure we can store a size of 0: */ -+ s->field_min[BKEY_FIELD_SIZE] = 0; -+} -+ -+static void __bkey_format_add(struct bkey_format_state *s, -+ unsigned field, u64 v) -+{ -+ s->field_min[field] = min(s->field_min[field], v); -+ s->field_max[field] = max(s->field_max[field], v); -+} -+ -+/* -+ * Changes @format so that @k can be successfully packed with @format -+ */ -+void bch2_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k) -+{ -+#define x(id, field) __bkey_format_add(s, id, k->field); -+ bkey_fields() -+#undef x -+ __bkey_format_add(s, BKEY_FIELD_OFFSET, bkey_start_offset(k)); -+} -+ -+void bch2_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p) -+{ -+ unsigned field = 0; -+ -+ __bkey_format_add(s, field++, p.inode); -+ __bkey_format_add(s, field++, p.offset); -+ __bkey_format_add(s, field++, p.snapshot); -+} -+ -+/* -+ * We don't want it to be possible for the packed format to represent fields -+ * bigger than a u64... that will cause confusion and issues (like with -+ * bkey_packed_successor()) -+ */ -+static void set_format_field(struct bkey_format *f, enum bch_bkey_fields i, -+ unsigned bits, u64 offset) -+{ -+ offset = bits == 64 ? 
0 : min(offset, U64_MAX - ((1ULL << bits) - 1)); -+ -+ f->bits_per_field[i] = bits; -+ f->field_offset[i] = cpu_to_le64(offset); -+} -+ -+struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s) -+{ -+ unsigned i, bits = KEY_PACKED_BITS_START; -+ struct bkey_format ret = { -+ .nr_fields = BKEY_NR_FIELDS, -+ }; -+ -+ for (i = 0; i < ARRAY_SIZE(s->field_min); i++) { -+ s->field_min[i] = min(s->field_min[i], s->field_max[i]); -+ -+ set_format_field(&ret, i, -+ fls64(s->field_max[i] - s->field_min[i]), -+ s->field_min[i]); -+ -+ bits += ret.bits_per_field[i]; -+ } -+ -+ /* allow for extent merging: */ -+ if (ret.bits_per_field[BKEY_FIELD_SIZE]) { -+ ret.bits_per_field[BKEY_FIELD_SIZE] += 4; -+ bits += 4; -+ } -+ -+ ret.key_u64s = DIV_ROUND_UP(bits, 64); -+ -+ /* if we have enough spare bits, round fields up to nearest byte */ -+ bits = ret.key_u64s * 64 - bits; -+ -+ for (i = 0; i < ARRAY_SIZE(ret.bits_per_field); i++) { -+ unsigned r = round_up(ret.bits_per_field[i], 8) - -+ ret.bits_per_field[i]; -+ -+ if (r <= bits) { -+ set_format_field(&ret, i, -+ ret.bits_per_field[i] + r, -+ le64_to_cpu(ret.field_offset[i])); -+ bits -= r; -+ } -+ } -+ -+ EBUG_ON(bch2_bkey_format_validate(&ret)); -+ return ret; -+} -+ -+const char *bch2_bkey_format_validate(struct bkey_format *f) -+{ -+ unsigned i, bits = KEY_PACKED_BITS_START; -+ -+ if (f->nr_fields != BKEY_NR_FIELDS) -+ return "incorrect number of fields"; -+ -+ for (i = 0; i < f->nr_fields; i++) { -+ u64 field_offset = le64_to_cpu(f->field_offset[i]); -+ -+ if (f->bits_per_field[i] > 64) -+ return "field too large"; -+ -+ if (field_offset && -+ (f->bits_per_field[i] == 64 || -+ (field_offset + ((1ULL << f->bits_per_field[i]) - 1) < -+ field_offset))) -+ return "offset + bits overflow"; -+ -+ bits += f->bits_per_field[i]; -+ } -+ -+ if (f->key_u64s != DIV_ROUND_UP(bits, 64)) -+ return "incorrect key_u64s"; -+ -+ return NULL; -+} -+ -+/* -+ * Most significant differing bit -+ * Bits are indexed from 0 - return is 
[0, nr_key_bits) -+ */ -+__pure -+unsigned bch2_bkey_greatest_differing_bit(const struct btree *b, -+ const struct bkey_packed *l_k, -+ const struct bkey_packed *r_k) -+{ -+ const u64 *l = high_word(&b->format, l_k); -+ const u64 *r = high_word(&b->format, r_k); -+ unsigned nr_key_bits = b->nr_key_bits; -+ unsigned word_bits = 64 - high_bit_offset; -+ u64 l_v, r_v; -+ -+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format)); -+ -+ /* for big endian, skip past header */ -+ l_v = *l & (~0ULL >> high_bit_offset); -+ r_v = *r & (~0ULL >> high_bit_offset); -+ -+ while (nr_key_bits) { -+ if (nr_key_bits < word_bits) { -+ l_v >>= word_bits - nr_key_bits; -+ r_v >>= word_bits - nr_key_bits; -+ nr_key_bits = 0; -+ } else { -+ nr_key_bits -= word_bits; -+ } -+ -+ if (l_v != r_v) -+ return fls64(l_v ^ r_v) - 1 + nr_key_bits; -+ -+ l = next_word(l); -+ r = next_word(r); -+ -+ l_v = *l; -+ r_v = *r; -+ word_bits = 64; -+ } -+ -+ return 0; -+} -+ -+/* -+ * First set bit -+ * Bits are indexed from 0 - return is [0, nr_key_bits) -+ */ -+__pure -+unsigned bch2_bkey_ffs(const struct btree *b, const struct bkey_packed *k) -+{ -+ const u64 *p = high_word(&b->format, k); -+ unsigned nr_key_bits = b->nr_key_bits; -+ unsigned ret = 0, offset; -+ -+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format)); -+ -+ offset = nr_key_bits; -+ while (offset > 64) { -+ p = next_word(p); -+ offset -= 64; -+ } -+ -+ offset = 64 - offset; -+ -+ while (nr_key_bits) { -+ unsigned bits = nr_key_bits + offset < 64 -+ ? 
nr_key_bits -+ : 64 - offset; -+ -+ u64 mask = (~0ULL >> (64 - bits)) << offset; -+ -+ if (*p & mask) -+ return ret + __ffs64(*p & mask) - offset; -+ -+ p = prev_word(p); -+ nr_key_bits -= bits; -+ ret += bits; -+ offset = 0; -+ } -+ -+ return 0; -+} -+ -+#ifdef CONFIG_X86_64 -+ -+static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, -+ unsigned nr_key_bits) -+{ -+ long d0, d1, d2, d3; -+ int cmp; -+ -+ /* we shouldn't need asm for this, but gcc is being retarded: */ -+ -+ asm(".intel_syntax noprefix;" -+ "xor eax, eax;" -+ "xor edx, edx;" -+ "1:;" -+ "mov r8, [rdi];" -+ "mov r9, [rsi];" -+ "sub ecx, 64;" -+ "jl 2f;" -+ -+ "cmp r8, r9;" -+ "jnz 3f;" -+ -+ "lea rdi, [rdi - 8];" -+ "lea rsi, [rsi - 8];" -+ "jmp 1b;" -+ -+ "2:;" -+ "not ecx;" -+ "shr r8, 1;" -+ "shr r9, 1;" -+ "shr r8, cl;" -+ "shr r9, cl;" -+ "cmp r8, r9;" -+ -+ "3:\n" -+ "seta al;" -+ "setb dl;" -+ "sub eax, edx;" -+ ".att_syntax prefix;" -+ : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp) -+ : "0" (l), "1" (r), "3" (nr_key_bits) -+ : "r8", "r9", "cc", "memory"); -+ -+ return cmp; -+} -+ -+#define I(_x) (*(out)++ = (_x)) -+#define I1(i0) I(i0) -+#define I2(i0, i1) (I1(i0), I(i1)) -+#define I3(i0, i1, i2) (I2(i0, i1), I(i2)) -+#define I4(i0, i1, i2, i3) (I3(i0, i1, i2), I(i3)) -+#define I5(i0, i1, i2, i3, i4) (I4(i0, i1, i2, i3), I(i4)) -+ -+static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out, -+ enum bch_bkey_fields field, -+ unsigned dst_offset, unsigned dst_size, -+ bool *eax_zeroed) -+{ -+ unsigned bits = format->bits_per_field[field]; -+ u64 offset = le64_to_cpu(format->field_offset[field]); -+ unsigned i, byte, bit_offset, align, shl, shr; -+ -+ if (!bits && !offset) { -+ if (!*eax_zeroed) { -+ /* xor eax, eax */ -+ I2(0x31, 0xc0); -+ } -+ -+ *eax_zeroed = true; -+ goto set_field; -+ } -+ -+ if (!bits) { -+ /* just return offset: */ -+ -+ switch (dst_size) { -+ case 8: -+ if (offset > S32_MAX) { -+ /* mov [rdi + dst_offset], offset */ -+ I3(0xc7, 0x47, 
dst_offset); -+ memcpy(out, &offset, 4); -+ out += 4; -+ -+ I3(0xc7, 0x47, dst_offset + 4); -+ memcpy(out, (void *) &offset + 4, 4); -+ out += 4; -+ } else { -+ /* mov [rdi + dst_offset], offset */ -+ /* sign extended */ -+ I4(0x48, 0xc7, 0x47, dst_offset); -+ memcpy(out, &offset, 4); -+ out += 4; -+ } -+ break; -+ case 4: -+ /* mov [rdi + dst_offset], offset */ -+ I3(0xc7, 0x47, dst_offset); -+ memcpy(out, &offset, 4); -+ out += 4; -+ break; -+ default: -+ BUG(); -+ } -+ -+ return out; -+ } -+ -+ bit_offset = format->key_u64s * 64; -+ for (i = 0; i <= field; i++) -+ bit_offset -= format->bits_per_field[i]; -+ -+ byte = bit_offset / 8; -+ bit_offset -= byte * 8; -+ -+ *eax_zeroed = false; -+ -+ if (bit_offset == 0 && bits == 8) { -+ /* movzx eax, BYTE PTR [rsi + imm8] */ -+ I4(0x0f, 0xb6, 0x46, byte); -+ } else if (bit_offset == 0 && bits == 16) { -+ /* movzx eax, WORD PTR [rsi + imm8] */ -+ I4(0x0f, 0xb7, 0x46, byte); -+ } else if (bit_offset + bits <= 32) { -+ align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3); -+ byte -= align; -+ bit_offset += align * 8; -+ -+ BUG_ON(bit_offset + bits > 32); -+ -+ /* mov eax, [rsi + imm8] */ -+ I3(0x8b, 0x46, byte); -+ -+ if (bit_offset) { -+ /* shr eax, imm8 */ -+ I3(0xc1, 0xe8, bit_offset); -+ } -+ -+ if (bit_offset + bits < 32) { -+ unsigned mask = ~0U >> (32 - bits); -+ -+ /* and eax, imm32 */ -+ I1(0x25); -+ memcpy(out, &mask, 4); -+ out += 4; -+ } -+ } else if (bit_offset + bits <= 64) { -+ align = min(8 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 7); -+ byte -= align; -+ bit_offset += align * 8; -+ -+ BUG_ON(bit_offset + bits > 64); -+ -+ /* mov rax, [rsi + imm8] */ -+ I4(0x48, 0x8b, 0x46, byte); -+ -+ shl = 64 - bit_offset - bits; -+ shr = bit_offset + shl; -+ -+ if (shl) { -+ /* shl rax, imm8 */ -+ I4(0x48, 0xc1, 0xe0, shl); -+ } -+ -+ if (shr) { -+ /* shr rax, imm8 */ -+ I4(0x48, 0xc1, 0xe8, shr); -+ } -+ } else { -+ align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3); -+ byte -= align; -+ 
bit_offset += align * 8; -+ -+ BUG_ON(bit_offset + bits > 96); -+ -+ /* mov rax, [rsi + byte] */ -+ I4(0x48, 0x8b, 0x46, byte); -+ -+ /* mov edx, [rsi + byte + 8] */ -+ I3(0x8b, 0x56, byte + 8); -+ -+ /* bits from next word: */ -+ shr = bit_offset + bits - 64; -+ BUG_ON(shr > bit_offset); -+ -+ /* shr rax, bit_offset */ -+ I4(0x48, 0xc1, 0xe8, shr); -+ -+ /* shl rdx, imm8 */ -+ I4(0x48, 0xc1, 0xe2, 64 - shr); -+ -+ /* or rax, rdx */ -+ I3(0x48, 0x09, 0xd0); -+ -+ shr = bit_offset - shr; -+ -+ if (shr) { -+ /* shr rax, imm8 */ -+ I4(0x48, 0xc1, 0xe8, shr); -+ } -+ } -+ -+ /* rax += offset: */ -+ if (offset > S32_MAX) { -+ /* mov rdx, imm64 */ -+ I2(0x48, 0xba); -+ memcpy(out, &offset, 8); -+ out += 8; -+ /* add %rdx, %rax */ -+ I3(0x48, 0x01, 0xd0); -+ } else if (offset + (~0ULL >> (64 - bits)) > U32_MAX) { -+ /* add rax, imm32 */ -+ I2(0x48, 0x05); -+ memcpy(out, &offset, 4); -+ out += 4; -+ } else if (offset) { -+ /* add eax, imm32 */ -+ I1(0x05); -+ memcpy(out, &offset, 4); -+ out += 4; -+ } -+set_field: -+ switch (dst_size) { -+ case 8: -+ /* mov [rdi + dst_offset], rax */ -+ I4(0x48, 0x89, 0x47, dst_offset); -+ break; -+ case 4: -+ /* mov [rdi + dst_offset], eax */ -+ I3(0x89, 0x47, dst_offset); -+ break; -+ default: -+ BUG(); -+ } -+ -+ return out; -+} -+ -+int bch2_compile_bkey_format(const struct bkey_format *format, void *_out) -+{ -+ bool eax_zeroed = false; -+ u8 *out = _out; -+ -+ /* -+ * rdi: dst - unpacked key -+ * rsi: src - packed key -+ */ -+ -+ /* k->u64s, k->format, k->type */ -+ -+ /* mov eax, [rsi] */ -+ I2(0x8b, 0x06); -+ -+ /* add eax, BKEY_U64s - format->key_u64s */ -+ I5(0x05, BKEY_U64s - format->key_u64s, KEY_FORMAT_CURRENT, 0, 0); -+ -+ /* and eax, imm32: mask out k->pad: */ -+ I5(0x25, 0xff, 0xff, 0xff, 0); -+ -+ /* mov [rdi], eax */ -+ I2(0x89, 0x07); -+ -+#define x(id, field) \ -+ out = compile_bkey_field(format, out, id, \ -+ offsetof(struct bkey, field), \ -+ sizeof(((struct bkey *) NULL)->field), \ -+ &eax_zeroed); -+ bkey_fields() 
-+#undef x -+ -+ /* retq */ -+ I1(0xc3); -+ -+ return (void *) out - _out; -+} -+ -+#else -+static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, -+ unsigned nr_key_bits) -+{ -+ u64 l_v, r_v; -+ -+ if (!nr_key_bits) -+ return 0; -+ -+ /* for big endian, skip past header */ -+ nr_key_bits += high_bit_offset; -+ l_v = *l & (~0ULL >> high_bit_offset); -+ r_v = *r & (~0ULL >> high_bit_offset); -+ -+ while (1) { -+ if (nr_key_bits < 64) { -+ l_v >>= 64 - nr_key_bits; -+ r_v >>= 64 - nr_key_bits; -+ nr_key_bits = 0; -+ } else { -+ nr_key_bits -= 64; -+ } -+ -+ if (!nr_key_bits || l_v != r_v) -+ break; -+ -+ l = next_word(l); -+ r = next_word(r); -+ -+ l_v = *l; -+ r_v = *r; -+ } -+ -+ return cmp_int(l_v, r_v); -+} -+#endif -+ -+__pure -+int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l, -+ const struct bkey_packed *r, -+ const struct btree *b) -+{ -+ const struct bkey_format *f = &b->format; -+ int ret; -+ -+ EBUG_ON(!bkey_packed(l) || !bkey_packed(r)); -+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); -+ -+ ret = __bkey_cmp_bits(high_word(f, l), -+ high_word(f, r), -+ b->nr_key_bits); -+ -+ EBUG_ON(ret != bkey_cmp(bkey_unpack_pos(b, l), -+ bkey_unpack_pos(b, r))); -+ return ret; -+} -+ -+__pure __flatten -+int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b, -+ const struct bkey_packed *l, -+ const struct bpos *r) -+{ -+ return bkey_cmp(bkey_unpack_pos_format_checked(b, l), *r); -+} -+ -+__pure __flatten -+int __bch2_bkey_cmp_packed(const struct bkey_packed *l, -+ const struct bkey_packed *r, -+ const struct btree *b) -+{ -+ struct bkey unpacked; -+ -+ if (likely(bkey_packed(l) && bkey_packed(r))) -+ return __bch2_bkey_cmp_packed_format_checked(l, r, b); -+ -+ if (bkey_packed(l)) { -+ __bkey_unpack_key_format_checked(b, &unpacked, l); -+ l = (void*) &unpacked; -+ } else if (bkey_packed(r)) { -+ __bkey_unpack_key_format_checked(b, &unpacked, r); -+ r = (void*) &unpacked; -+ } -+ -+ return bkey_cmp(((struct bkey *) l)->p, 
((struct bkey *) r)->p); -+} -+ -+__pure __flatten -+int __bch2_bkey_cmp_left_packed(const struct btree *b, -+ const struct bkey_packed *l, -+ const struct bpos *r) -+{ -+ const struct bkey *l_unpacked; -+ -+ return unlikely(l_unpacked = packed_to_bkey_c(l)) -+ ? bkey_cmp(l_unpacked->p, *r) -+ : __bch2_bkey_cmp_left_packed_format_checked(b, l, r); -+} -+ -+void bch2_bpos_swab(struct bpos *p) -+{ -+ u8 *l = (u8 *) p; -+ u8 *h = ((u8 *) &p[1]) - 1; -+ -+ while (l < h) { -+ swap(*l, *h); -+ l++; -+ --h; -+ } -+} -+ -+void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k) -+{ -+ const struct bkey_format *f = bkey_packed(k) ? _f : &bch2_bkey_format_current; -+ u8 *l = k->key_start; -+ u8 *h = (u8 *) (k->_data + f->key_u64s) - 1; -+ -+ while (l < h) { -+ swap(*l, *h); -+ l++; -+ --h; -+ } -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_bkey_pack_test(void) -+{ -+ struct bkey t = KEY(4134ULL, 1250629070527416633ULL, 0); -+ struct bkey_packed p; -+ -+ struct bkey_format test_format = { -+ .key_u64s = 2, -+ .nr_fields = BKEY_NR_FIELDS, -+ .bits_per_field = { -+ 13, -+ 64, -+ }, -+ }; -+ -+ struct unpack_state in_s = -+ unpack_state_init(&bch2_bkey_format_current, (void *) &t); -+ struct pack_state out_s = pack_state_init(&test_format, &p); -+ unsigned i; -+ -+ for (i = 0; i < out_s.format->nr_fields; i++) { -+ u64 a, v = get_inc_field(&in_s, i); -+ -+ switch (i) { -+#define x(id, field) case id: a = t.field; break; -+ bkey_fields() -+#undef x -+ default: -+ BUG(); -+ } -+ -+ if (a != v) -+ panic("got %llu actual %llu i %u\n", v, a, i); -+ -+ if (!set_inc_field(&out_s, i, v)) -+ panic("failed at %u\n", i); -+ } -+ -+ BUG_ON(!bch2_bkey_pack_key(&p, &t, &test_format)); -+} -+#endif -diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h -new file mode 100644 -index 000000000000..cbcfbd26bc58 ---- /dev/null -+++ b/fs/bcachefs/bkey.h -@@ -0,0 +1,605 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BKEY_H -+#define _BCACHEFS_BKEY_H -+ 
-+#include -+#include "bcachefs_format.h" -+ -+#include "util.h" -+#include "vstructs.h" -+ -+#ifdef CONFIG_X86_64 -+#define HAVE_BCACHEFS_COMPILED_UNPACK 1 -+#endif -+ -+void bch2_to_binary(char *, const u64 *, unsigned); -+ -+/* bkey with split value, const */ -+struct bkey_s_c { -+ const struct bkey *k; -+ const struct bch_val *v; -+}; -+ -+/* bkey with split value */ -+struct bkey_s { -+ union { -+ struct { -+ struct bkey *k; -+ struct bch_val *v; -+ }; -+ struct bkey_s_c s_c; -+ }; -+}; -+ -+#define bkey_next(_k) vstruct_next(_k) -+ -+static inline struct bkey_packed *bkey_next_skip_noops(struct bkey_packed *k, -+ struct bkey_packed *end) -+{ -+ k = bkey_next(k); -+ -+ while (k != end && !k->u64s) -+ k = (void *) ((u64 *) k + 1); -+ return k; -+} -+ -+#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) -+ -+static inline size_t bkey_val_bytes(const struct bkey *k) -+{ -+ return bkey_val_u64s(k) * sizeof(u64); -+} -+ -+static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s) -+{ -+ k->u64s = BKEY_U64s + val_u64s; -+} -+ -+static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) -+{ -+ k->u64s = BKEY_U64s + DIV_ROUND_UP(bytes, sizeof(u64)); -+} -+ -+#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k))) -+ -+#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted) -+ -+#define bkey_whiteout(_k) \ -+ ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_discard) -+ -+#define bkey_packed_typecheck(_k) \ -+({ \ -+ BUILD_BUG_ON(!type_is(_k, struct bkey *) && \ -+ !type_is(_k, struct bkey_packed *)); \ -+ type_is(_k, struct bkey_packed *); \ -+}) -+ -+enum bkey_lr_packed { -+ BKEY_PACKED_BOTH, -+ BKEY_PACKED_RIGHT, -+ BKEY_PACKED_LEFT, -+ BKEY_PACKED_NONE, -+}; -+ -+#define bkey_lr_packed_typecheck(_l, _r) \ -+ (!bkey_packed_typecheck(_l) + ((!bkey_packed_typecheck(_r)) << 1)) -+ -+#define bkey_lr_packed(_l, _r) \ -+ ((_l)->format + ((_r)->format << 1)) -+ -+#define bkey_copy(_dst, _src) \ -+do { \ -+ 
BUILD_BUG_ON(!type_is(_dst, struct bkey_i *) && \ -+ !type_is(_dst, struct bkey_packed *)); \ -+ BUILD_BUG_ON(!type_is(_src, struct bkey_i *) && \ -+ !type_is(_src, struct bkey_packed *)); \ -+ EBUG_ON((u64 *) (_dst) > (u64 *) (_src) && \ -+ (u64 *) (_dst) < (u64 *) (_src) + \ -+ ((struct bkey *) (_src))->u64s); \ -+ \ -+ memcpy_u64s_small((_dst), (_src), \ -+ ((struct bkey *) (_src))->u64s); \ -+} while (0) -+ -+struct btree; -+ -+struct bkey_format_state { -+ u64 field_min[BKEY_NR_FIELDS]; -+ u64 field_max[BKEY_NR_FIELDS]; -+}; -+ -+void bch2_bkey_format_init(struct bkey_format_state *); -+void bch2_bkey_format_add_key(struct bkey_format_state *, const struct bkey *); -+void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos); -+struct bkey_format bch2_bkey_format_done(struct bkey_format_state *); -+const char *bch2_bkey_format_validate(struct bkey_format *); -+ -+__pure -+unsigned bch2_bkey_greatest_differing_bit(const struct btree *, -+ const struct bkey_packed *, -+ const struct bkey_packed *); -+__pure -+unsigned bch2_bkey_ffs(const struct btree *, const struct bkey_packed *); -+ -+__pure -+int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *, -+ const struct bkey_packed *, -+ const struct btree *); -+ -+__pure -+int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *, -+ const struct bkey_packed *, -+ const struct bpos *); -+ -+__pure -+int __bch2_bkey_cmp_packed(const struct bkey_packed *, -+ const struct bkey_packed *, -+ const struct btree *); -+ -+__pure -+int __bch2_bkey_cmp_left_packed(const struct btree *, -+ const struct bkey_packed *, -+ const struct bpos *); -+ -+static inline __pure -+int bkey_cmp_left_packed(const struct btree *b, -+ const struct bkey_packed *l, const struct bpos *r) -+{ -+ return __bch2_bkey_cmp_left_packed(b, l, r); -+} -+ -+/* -+ * we prefer to pass bpos by ref, but it's often enough terribly convenient to -+ * pass it by by val... 
as much as I hate c++, const ref would be nice here: -+ */ -+__pure __flatten -+static inline int bkey_cmp_left_packed_byval(const struct btree *b, -+ const struct bkey_packed *l, -+ struct bpos r) -+{ -+ return bkey_cmp_left_packed(b, l, &r); -+} -+ -+/* -+ * If @_l or @_r are struct bkey * (not bkey_packed *), uses type information to -+ * skip dispatching on k->format: -+ */ -+#define bkey_cmp_packed(_b, _l, _r) \ -+({ \ -+ int _cmp; \ -+ \ -+ switch (bkey_lr_packed_typecheck(_l, _r)) { \ -+ case BKEY_PACKED_NONE: \ -+ _cmp = bkey_cmp(((struct bkey *) (_l))->p, \ -+ ((struct bkey *) (_r))->p); \ -+ break; \ -+ case BKEY_PACKED_LEFT: \ -+ _cmp = bkey_cmp_left_packed((_b), \ -+ (struct bkey_packed *) (_l), \ -+ &((struct bkey *) (_r))->p); \ -+ break; \ -+ case BKEY_PACKED_RIGHT: \ -+ _cmp = -bkey_cmp_left_packed((_b), \ -+ (struct bkey_packed *) (_r), \ -+ &((struct bkey *) (_l))->p); \ -+ break; \ -+ case BKEY_PACKED_BOTH: \ -+ _cmp = __bch2_bkey_cmp_packed((void *) (_l), \ -+ (void *) (_r), (_b)); \ -+ break; \ -+ } \ -+ _cmp; \ -+}) -+ -+#if 1 -+static __always_inline int bkey_cmp(struct bpos l, struct bpos r) -+{ -+ if (l.inode != r.inode) -+ return l.inode < r.inode ? -1 : 1; -+ if (l.offset != r.offset) -+ return l.offset < r.offset ? -1 : 1; -+ if (l.snapshot != r.snapshot) -+ return l.snapshot < r.snapshot ? -1 : 1; -+ return 0; -+} -+#else -+int bkey_cmp(struct bpos l, struct bpos r); -+#endif -+ -+static inline struct bpos bpos_min(struct bpos l, struct bpos r) -+{ -+ return bkey_cmp(l, r) < 0 ? 
l : r; -+} -+ -+void bch2_bpos_swab(struct bpos *); -+void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); -+ -+static __always_inline int bversion_cmp(struct bversion l, struct bversion r) -+{ -+ return cmp_int(l.hi, r.hi) ?: -+ cmp_int(l.lo, r.lo); -+} -+ -+#define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 }) -+#define MAX_VERSION ((struct bversion) { .hi = ~0, .lo = ~0ULL }) -+ -+static __always_inline int bversion_zero(struct bversion v) -+{ -+ return !bversion_cmp(v, ZERO_VERSION); -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+/* statement expressions confusing unlikely()? */ -+#define bkey_packed(_k) \ -+ ({ EBUG_ON((_k)->format > KEY_FORMAT_CURRENT); \ -+ (_k)->format != KEY_FORMAT_CURRENT; }) -+#else -+#define bkey_packed(_k) ((_k)->format != KEY_FORMAT_CURRENT) -+#endif -+ -+/* -+ * It's safe to treat an unpacked bkey as a packed one, but not the reverse -+ */ -+static inline struct bkey_packed *bkey_to_packed(struct bkey_i *k) -+{ -+ return (struct bkey_packed *) k; -+} -+ -+static inline const struct bkey_packed *bkey_to_packed_c(const struct bkey_i *k) -+{ -+ return (const struct bkey_packed *) k; -+} -+ -+static inline struct bkey_i *packed_to_bkey(struct bkey_packed *k) -+{ -+ return bkey_packed(k) ? NULL : (struct bkey_i *) k; -+} -+ -+static inline const struct bkey *packed_to_bkey_c(const struct bkey_packed *k) -+{ -+ return bkey_packed(k) ? 
NULL : (const struct bkey *) k; -+} -+ -+static inline unsigned bkey_format_key_bits(const struct bkey_format *format) -+{ -+ return format->bits_per_field[BKEY_FIELD_INODE] + -+ format->bits_per_field[BKEY_FIELD_OFFSET] + -+ format->bits_per_field[BKEY_FIELD_SNAPSHOT]; -+} -+ -+static inline struct bpos bkey_successor(struct bpos p) -+{ -+ struct bpos ret = p; -+ -+ if (!++ret.offset) -+ BUG_ON(!++ret.inode); -+ -+ return ret; -+} -+ -+static inline struct bpos bkey_predecessor(struct bpos p) -+{ -+ struct bpos ret = p; -+ -+ if (!ret.offset--) -+ BUG_ON(!ret.inode--); -+ -+ return ret; -+} -+ -+static inline u64 bkey_start_offset(const struct bkey *k) -+{ -+ return k->p.offset - k->size; -+} -+ -+static inline struct bpos bkey_start_pos(const struct bkey *k) -+{ -+ return (struct bpos) { -+ .inode = k->p.inode, -+ .offset = bkey_start_offset(k), -+ .snapshot = k->p.snapshot, -+ }; -+} -+ -+/* Packed helpers */ -+ -+static inline unsigned bkeyp_key_u64s(const struct bkey_format *format, -+ const struct bkey_packed *k) -+{ -+ unsigned ret = bkey_packed(k) ? 
format->key_u64s : BKEY_U64s; -+ -+ EBUG_ON(k->u64s < ret); -+ return ret; -+} -+ -+static inline unsigned bkeyp_key_bytes(const struct bkey_format *format, -+ const struct bkey_packed *k) -+{ -+ return bkeyp_key_u64s(format, k) * sizeof(u64); -+} -+ -+static inline unsigned bkeyp_val_u64s(const struct bkey_format *format, -+ const struct bkey_packed *k) -+{ -+ return k->u64s - bkeyp_key_u64s(format, k); -+} -+ -+static inline size_t bkeyp_val_bytes(const struct bkey_format *format, -+ const struct bkey_packed *k) -+{ -+ return bkeyp_val_u64s(format, k) * sizeof(u64); -+} -+ -+static inline void set_bkeyp_val_u64s(const struct bkey_format *format, -+ struct bkey_packed *k, unsigned val_u64s) -+{ -+ k->u64s = bkeyp_key_u64s(format, k) + val_u64s; -+} -+ -+#define bkeyp_val(_format, _k) \ -+ ((struct bch_val *) ((_k)->_data + bkeyp_key_u64s(_format, _k))) -+ -+extern const struct bkey_format bch2_bkey_format_current; -+ -+bool bch2_bkey_transform(const struct bkey_format *, -+ struct bkey_packed *, -+ const struct bkey_format *, -+ const struct bkey_packed *); -+ -+struct bkey __bch2_bkey_unpack_key(const struct bkey_format *, -+ const struct bkey_packed *); -+ -+#ifndef HAVE_BCACHEFS_COMPILED_UNPACK -+struct bpos __bkey_unpack_pos(const struct bkey_format *, -+ const struct bkey_packed *); -+#endif -+ -+bool bch2_bkey_pack_key(struct bkey_packed *, const struct bkey *, -+ const struct bkey_format *); -+ -+enum bkey_pack_pos_ret { -+ BKEY_PACK_POS_EXACT, -+ BKEY_PACK_POS_SMALLER, -+ BKEY_PACK_POS_FAIL, -+}; -+ -+enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *, struct bpos, -+ const struct btree *); -+ -+static inline bool bkey_pack_pos(struct bkey_packed *out, struct bpos in, -+ const struct btree *b) -+{ -+ return bch2_bkey_pack_pos_lossy(out, in, b) == BKEY_PACK_POS_EXACT; -+} -+ -+void bch2_bkey_unpack(const struct btree *, struct bkey_i *, -+ const struct bkey_packed *); -+bool bch2_bkey_pack(struct bkey_packed *, const struct bkey_i *, -+ 
const struct bkey_format *); -+ -+static inline u64 bkey_field_max(const struct bkey_format *f, -+ enum bch_bkey_fields nr) -+{ -+ return f->bits_per_field[nr] < 64 -+ ? (le64_to_cpu(f->field_offset[nr]) + -+ ~(~0ULL << f->bits_per_field[nr])) -+ : U64_MAX; -+} -+ -+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK -+ -+int bch2_compile_bkey_format(const struct bkey_format *, void *); -+ -+#else -+ -+static inline int bch2_compile_bkey_format(const struct bkey_format *format, -+ void *out) { return 0; } -+ -+#endif -+ -+static inline void bkey_reassemble(struct bkey_i *dst, -+ struct bkey_s_c src) -+{ -+ dst->k = *src.k; -+ memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k)); -+} -+ -+#define bkey_s_null ((struct bkey_s) { .k = NULL }) -+#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL }) -+ -+#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) }) -+#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) }) -+ -+static inline struct bkey_s bkey_to_s(struct bkey *k) -+{ -+ return (struct bkey_s) { .k = k, .v = NULL }; -+} -+ -+static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k) -+{ -+ return (struct bkey_s_c) { .k = k, .v = NULL }; -+} -+ -+static inline struct bkey_s bkey_i_to_s(struct bkey_i *k) -+{ -+ return (struct bkey_s) { .k = &k->k, .v = &k->v }; -+} -+ -+static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k) -+{ -+ return (struct bkey_s_c) { .k = &k->k, .v = &k->v }; -+} -+ -+/* -+ * For a given type of value (e.g. struct bch_extent), generates the types for -+ * bkey + bch_extent - inline, split, split const - and also all the conversion -+ * functions, which also check that the value is of the correct type. -+ * -+ * We use anonymous unions for upcasting - e.g. converting from e.g. a -+ * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion -+ * functions. 
-+ */ -+#define BKEY_VAL_ACCESSORS(name) \ -+struct bkey_i_##name { \ -+ union { \ -+ struct bkey k; \ -+ struct bkey_i k_i; \ -+ }; \ -+ struct bch_##name v; \ -+}; \ -+ \ -+struct bkey_s_c_##name { \ -+ union { \ -+ struct { \ -+ const struct bkey *k; \ -+ const struct bch_##name *v; \ -+ }; \ -+ struct bkey_s_c s_c; \ -+ }; \ -+}; \ -+ \ -+struct bkey_s_##name { \ -+ union { \ -+ struct { \ -+ struct bkey *k; \ -+ struct bch_##name *v; \ -+ }; \ -+ struct bkey_s_c_##name c; \ -+ struct bkey_s s; \ -+ struct bkey_s_c s_c; \ -+ }; \ -+}; \ -+ \ -+static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \ -+{ \ -+ EBUG_ON(k->k.type != KEY_TYPE_##name); \ -+ return container_of(&k->k, struct bkey_i_##name, k); \ -+} \ -+ \ -+static inline const struct bkey_i_##name * \ -+bkey_i_to_##name##_c(const struct bkey_i *k) \ -+{ \ -+ EBUG_ON(k->k.type != KEY_TYPE_##name); \ -+ return container_of(&k->k, struct bkey_i_##name, k); \ -+} \ -+ \ -+static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \ -+{ \ -+ EBUG_ON(k.k->type != KEY_TYPE_##name); \ -+ return (struct bkey_s_##name) { \ -+ .k = k.k, \ -+ .v = container_of(k.v, struct bch_##name, v), \ -+ }; \ -+} \ -+ \ -+static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\ -+{ \ -+ EBUG_ON(k.k->type != KEY_TYPE_##name); \ -+ return (struct bkey_s_c_##name) { \ -+ .k = k.k, \ -+ .v = container_of(k.v, struct bch_##name, v), \ -+ }; \ -+} \ -+ \ -+static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\ -+{ \ -+ return (struct bkey_s_##name) { \ -+ .k = &k->k, \ -+ .v = &k->v, \ -+ }; \ -+} \ -+ \ -+static inline struct bkey_s_c_##name \ -+name##_i_to_s_c(const struct bkey_i_##name *k) \ -+{ \ -+ return (struct bkey_s_c_##name) { \ -+ .k = &k->k, \ -+ .v = &k->v, \ -+ }; \ -+} \ -+ \ -+static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \ -+{ \ -+ EBUG_ON(k->k.type != KEY_TYPE_##name); \ -+ return (struct bkey_s_##name) { \ -+ .k = &k->k, 
\ -+ .v = container_of(&k->v, struct bch_##name, v), \ -+ }; \ -+} \ -+ \ -+static inline struct bkey_s_c_##name \ -+bkey_i_to_s_c_##name(const struct bkey_i *k) \ -+{ \ -+ EBUG_ON(k->k.type != KEY_TYPE_##name); \ -+ return (struct bkey_s_c_##name) { \ -+ .k = &k->k, \ -+ .v = container_of(&k->v, struct bch_##name, v), \ -+ }; \ -+} \ -+ \ -+static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ -+{ \ -+ struct bkey_i_##name *k = \ -+ container_of(&_k->k, struct bkey_i_##name, k); \ -+ \ -+ bkey_init(&k->k); \ -+ memset(&k->v, 0, sizeof(k->v)); \ -+ k->k.type = KEY_TYPE_##name; \ -+ set_bkey_val_bytes(&k->k, sizeof(k->v)); \ -+ \ -+ return k; \ -+} -+ -+BKEY_VAL_ACCESSORS(cookie); -+BKEY_VAL_ACCESSORS(btree_ptr); -+BKEY_VAL_ACCESSORS(extent); -+BKEY_VAL_ACCESSORS(reservation); -+BKEY_VAL_ACCESSORS(inode); -+BKEY_VAL_ACCESSORS(inode_generation); -+BKEY_VAL_ACCESSORS(dirent); -+BKEY_VAL_ACCESSORS(xattr); -+BKEY_VAL_ACCESSORS(alloc); -+BKEY_VAL_ACCESSORS(quota); -+BKEY_VAL_ACCESSORS(stripe); -+BKEY_VAL_ACCESSORS(reflink_p); -+BKEY_VAL_ACCESSORS(reflink_v); -+BKEY_VAL_ACCESSORS(inline_data); -+BKEY_VAL_ACCESSORS(btree_ptr_v2); -+ -+/* byte order helpers */ -+ -+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -+ -+static inline unsigned high_word_offset(const struct bkey_format *f) -+{ -+ return f->key_u64s - 1; -+} -+ -+#define high_bit_offset 0 -+#define nth_word(p, n) ((p) - (n)) -+ -+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -+ -+static inline unsigned high_word_offset(const struct bkey_format *f) -+{ -+ return 0; -+} -+ -+#define high_bit_offset KEY_PACKED_BITS_START -+#define nth_word(p, n) ((p) + (n)) -+ -+#else -+#error edit for your odd byteorder. 
-+#endif -+ -+#define high_word(f, k) ((k)->_data + high_word_offset(f)) -+#define next_word(p) nth_word(p, 1) -+#define prev_word(p) nth_word(p, -1) -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_bkey_pack_test(void); -+#else -+static inline void bch2_bkey_pack_test(void) {} -+#endif -+ -+#endif /* _BCACHEFS_BKEY_H */ -diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c -new file mode 100644 -index 000000000000..36e0c5152b47 ---- /dev/null -+++ b/fs/bcachefs/bkey_methods.c -@@ -0,0 +1,353 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "bkey_methods.h" -+#include "btree_types.h" -+#include "alloc_background.h" -+#include "dirent.h" -+#include "ec.h" -+#include "error.h" -+#include "extents.h" -+#include "inode.h" -+#include "quota.h" -+#include "reflink.h" -+#include "xattr.h" -+ -+const char * const bch2_bkey_types[] = { -+#define x(name, nr) #name, -+ BCH_BKEY_TYPES() -+#undef x -+ NULL -+}; -+ -+static const char *deleted_key_invalid(const struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ return NULL; -+} -+ -+#define bch2_bkey_ops_deleted (struct bkey_ops) { \ -+ .key_invalid = deleted_key_invalid, \ -+} -+ -+#define bch2_bkey_ops_discard (struct bkey_ops) { \ -+ .key_invalid = deleted_key_invalid, \ -+} -+ -+static const char *empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k) -+{ -+ if (bkey_val_bytes(k.k)) -+ return "value size should be zero"; -+ -+ return NULL; -+} -+ -+#define bch2_bkey_ops_error (struct bkey_ops) { \ -+ .key_invalid = empty_val_key_invalid, \ -+} -+ -+static const char *key_type_cookie_invalid(const struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ if (bkey_val_bytes(k.k) != sizeof(struct bch_cookie)) -+ return "incorrect value size"; -+ -+ return NULL; -+} -+ -+#define bch2_bkey_ops_cookie (struct bkey_ops) { \ -+ .key_invalid = key_type_cookie_invalid, \ -+} -+ -+#define bch2_bkey_ops_whiteout (struct bkey_ops) { \ -+ .key_invalid = empty_val_key_invalid, \ -+} -+ -+static const 
char *key_type_inline_data_invalid(const struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ return NULL; -+} -+ -+static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ pr_buf(out, "(%zu bytes)", bkey_val_bytes(k.k)); -+} -+ -+#define bch2_bkey_ops_inline_data (struct bkey_ops) { \ -+ .key_invalid = key_type_inline_data_invalid, \ -+ .val_to_text = key_type_inline_data_to_text, \ -+} -+ -+static const struct bkey_ops bch2_bkey_ops[] = { -+#define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name, -+ BCH_BKEY_TYPES() -+#undef x -+}; -+ -+const char *bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k) -+{ -+ if (k.k->type >= KEY_TYPE_MAX) -+ return "invalid type"; -+ -+ return bch2_bkey_ops[k.k->type].key_invalid(c, k); -+} -+ -+const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, -+ enum btree_node_type type) -+{ -+ if (k.k->u64s < BKEY_U64s) -+ return "u64s too small"; -+ -+ if (type == BKEY_TYPE_BTREE && -+ bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) -+ return "value too big"; -+ -+ if (btree_node_type_is_extents(type)) { -+ if ((k.k->size == 0) != bkey_deleted(k.k)) -+ return "bad size field"; -+ -+ if (k.k->size > k.k->p.offset) -+ return "size greater than offset"; -+ } else { -+ if (k.k->size) -+ return "nonzero size field"; -+ } -+ -+ if (k.k->p.snapshot) -+ return "nonzero snapshot"; -+ -+ if (type != BKEY_TYPE_BTREE && -+ !bkey_cmp(k.k->p, POS_MAX)) -+ return "POS_MAX key"; -+ -+ return NULL; -+} -+ -+const char *bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, -+ enum btree_node_type type) -+{ -+ return __bch2_bkey_invalid(c, k, type) ?: -+ bch2_bkey_val_invalid(c, k); -+} -+ -+const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k) -+{ -+ if (bkey_cmp(k.k->p, b->data->min_key) < 0) -+ return "key before start of btree node"; -+ -+ if (bkey_cmp(k.k->p, b->data->max_key) > 0) -+ return "key past end of btree node"; -+ -+ return NULL; -+} -+ -+void 
bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) -+{ -+ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; -+ const char *invalid; -+ -+ BUG_ON(!k.k->u64s); -+ -+ invalid = bch2_bkey_invalid(c, k, btree_node_type(b)) ?: -+ bch2_bkey_in_btree_node(b, k); -+ if (invalid) { -+ char buf[160]; -+ -+ bch2_bkey_val_to_text(&PBUF(buf), c, k); -+ bch2_fs_inconsistent(c, "invalid bkey %s: %s", buf, invalid); -+ return; -+ } -+ -+ if (ops->key_debugcheck) -+ ops->key_debugcheck(c, k); -+} -+ -+void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) -+{ -+ if (!bkey_cmp(pos, POS_MIN)) -+ pr_buf(out, "POS_MIN"); -+ else if (!bkey_cmp(pos, POS_MAX)) -+ pr_buf(out, "POS_MAX"); -+ else -+ pr_buf(out, "%llu:%llu", pos.inode, pos.offset); -+} -+ -+void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) -+{ -+ if (k) { -+ pr_buf(out, "u64s %u type %s ", k->u64s, -+ bch2_bkey_types[k->type]); -+ -+ bch2_bpos_to_text(out, k->p); -+ -+ pr_buf(out, " snap %u len %u ver %llu", -+ k->p.snapshot, k->size, k->version.lo); -+ } else { -+ pr_buf(out, "(null)"); -+ } -+} -+ -+void bch2_val_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; -+ -+ if (likely(ops->val_to_text)) -+ ops->val_to_text(out, c, k); -+} -+ -+void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ bch2_bkey_to_text(out, k.k); -+ -+ if (k.k) { -+ pr_buf(out, ": "); -+ bch2_val_to_text(out, c, k); -+ } -+} -+ -+void bch2_bkey_swab_val(struct bkey_s k) -+{ -+ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; -+ -+ if (ops->swab) -+ ops->swab(k); -+} -+ -+bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k) -+{ -+ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; -+ -+ return ops->key_normalize -+ ? 
ops->key_normalize(c, k) -+ : false; -+} -+ -+enum merge_result bch2_bkey_merge(struct bch_fs *c, -+ struct bkey_s l, struct bkey_s r) -+{ -+ const struct bkey_ops *ops = &bch2_bkey_ops[l.k->type]; -+ enum merge_result ret; -+ -+ if (key_merging_disabled(c) || -+ !ops->key_merge || -+ l.k->type != r.k->type || -+ bversion_cmp(l.k->version, r.k->version) || -+ bkey_cmp(l.k->p, bkey_start_pos(r.k))) -+ return BCH_MERGE_NOMERGE; -+ -+ ret = ops->key_merge(c, l, r); -+ -+ if (ret != BCH_MERGE_NOMERGE) -+ l.k->needs_whiteout |= r.k->needs_whiteout; -+ return ret; -+} -+ -+static const struct old_bkey_type { -+ u8 btree_node_type; -+ u8 old; -+ u8 new; -+} bkey_renumber_table[] = { -+ {BKEY_TYPE_BTREE, 128, KEY_TYPE_btree_ptr }, -+ {BKEY_TYPE_EXTENTS, 128, KEY_TYPE_extent }, -+ {BKEY_TYPE_EXTENTS, 129, KEY_TYPE_extent }, -+ {BKEY_TYPE_EXTENTS, 130, KEY_TYPE_reservation }, -+ {BKEY_TYPE_INODES, 128, KEY_TYPE_inode }, -+ {BKEY_TYPE_INODES, 130, KEY_TYPE_inode_generation }, -+ {BKEY_TYPE_DIRENTS, 128, KEY_TYPE_dirent }, -+ {BKEY_TYPE_DIRENTS, 129, KEY_TYPE_whiteout }, -+ {BKEY_TYPE_XATTRS, 128, KEY_TYPE_xattr }, -+ {BKEY_TYPE_XATTRS, 129, KEY_TYPE_whiteout }, -+ {BKEY_TYPE_ALLOC, 128, KEY_TYPE_alloc }, -+ {BKEY_TYPE_QUOTAS, 128, KEY_TYPE_quota }, -+}; -+ -+void bch2_bkey_renumber(enum btree_node_type btree_node_type, -+ struct bkey_packed *k, -+ int write) -+{ -+ const struct old_bkey_type *i; -+ -+ for (i = bkey_renumber_table; -+ i < bkey_renumber_table + ARRAY_SIZE(bkey_renumber_table); -+ i++) -+ if (btree_node_type == i->btree_node_type && -+ k->type == (write ? i->new : i->old)) { -+ k->type = write ? 
i->old : i->new; -+ break; -+ } -+} -+ -+void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, -+ unsigned version, unsigned big_endian, -+ int write, -+ struct bkey_format *f, -+ struct bkey_packed *k) -+{ -+ const struct bkey_ops *ops; -+ struct bkey uk; -+ struct bkey_s u; -+ int i; -+ -+ /* -+ * Do these operations in reverse order in the write path: -+ */ -+ -+ for (i = 0; i < 4; i++) -+ switch (!write ? i : 3 - i) { -+ case 0: -+ if (big_endian != CPU_BIG_ENDIAN) -+ bch2_bkey_swab_key(f, k); -+ break; -+ case 1: -+ if (version < bcachefs_metadata_version_bkey_renumber) -+ bch2_bkey_renumber(__btree_node_type(level, btree_id), k, write); -+ break; -+ case 2: -+ if (version < bcachefs_metadata_version_inode_btree_change && -+ btree_id == BTREE_ID_INODES) { -+ if (!bkey_packed(k)) { -+ struct bkey_i *u = packed_to_bkey(k); -+ swap(u->k.p.inode, u->k.p.offset); -+ } else if (f->bits_per_field[BKEY_FIELD_INODE] && -+ f->bits_per_field[BKEY_FIELD_OFFSET]) { -+ struct bkey_format tmp = *f, *in = f, *out = &tmp; -+ -+ swap(tmp.bits_per_field[BKEY_FIELD_INODE], -+ tmp.bits_per_field[BKEY_FIELD_OFFSET]); -+ swap(tmp.field_offset[BKEY_FIELD_INODE], -+ tmp.field_offset[BKEY_FIELD_OFFSET]); -+ -+ if (!write) -+ swap(in, out); -+ -+ uk = __bch2_bkey_unpack_key(in, k); -+ swap(uk.p.inode, uk.p.offset); -+ BUG_ON(!bch2_bkey_pack_key(k, &uk, out)); -+ } -+ } -+ break; -+ case 3: -+ if (!bkey_packed(k)) { -+ u = bkey_i_to_s(packed_to_bkey(k)); -+ } else { -+ uk = __bch2_bkey_unpack_key(f, k); -+ u.k = &uk; -+ u.v = bkeyp_val(f, k); -+ } -+ -+ if (big_endian != CPU_BIG_ENDIAN) -+ bch2_bkey_swab_val(u); -+ -+ ops = &bch2_bkey_ops[k->type]; -+ -+ if (ops->compat) -+ ops->compat(btree_id, version, big_endian, write, u); -+ break; -+ default: -+ BUG(); -+ } -+} -diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h -new file mode 100644 -index 000000000000..0bca725ae3b8 ---- /dev/null -+++ b/fs/bcachefs/bkey_methods.h -@@ -0,0 +1,82 @@ -+/* 
SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BKEY_METHODS_H -+#define _BCACHEFS_BKEY_METHODS_H -+ -+#include "bkey.h" -+ -+struct bch_fs; -+struct btree; -+struct bkey; -+enum btree_node_type; -+ -+extern const char * const bch2_bkey_types[]; -+ -+enum merge_result { -+ BCH_MERGE_NOMERGE, -+ -+ /* -+ * The keys were mergeable, but would have overflowed size - so instead -+ * l was changed to the maximum size, and both keys were modified: -+ */ -+ BCH_MERGE_PARTIAL, -+ BCH_MERGE_MERGE, -+}; -+ -+struct bkey_ops { -+ /* Returns reason for being invalid if invalid, else NULL: */ -+ const char * (*key_invalid)(const struct bch_fs *, -+ struct bkey_s_c); -+ void (*key_debugcheck)(struct bch_fs *, struct bkey_s_c); -+ void (*val_to_text)(struct printbuf *, struct bch_fs *, -+ struct bkey_s_c); -+ void (*swab)(struct bkey_s); -+ bool (*key_normalize)(struct bch_fs *, struct bkey_s); -+ enum merge_result (*key_merge)(struct bch_fs *, -+ struct bkey_s, struct bkey_s); -+ void (*compat)(enum btree_id id, unsigned version, -+ unsigned big_endian, int write, -+ struct bkey_s); -+}; -+ -+const char *bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c); -+const char *__bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, -+ enum btree_node_type); -+const char *bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, -+ enum btree_node_type); -+const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c); -+ -+void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c); -+ -+void bch2_bpos_to_text(struct printbuf *, struct bpos); -+void bch2_bkey_to_text(struct printbuf *, const struct bkey *); -+void bch2_val_to_text(struct printbuf *, struct bch_fs *, -+ struct bkey_s_c); -+void bch2_bkey_val_to_text(struct printbuf *, struct bch_fs *, -+ struct bkey_s_c); -+ -+void bch2_bkey_swab_val(struct bkey_s); -+ -+bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s); -+ -+enum merge_result bch2_bkey_merge(struct bch_fs *, -+ struct bkey_s, struct 
bkey_s); -+ -+void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int); -+ -+void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned, -+ int, struct bkey_format *, struct bkey_packed *); -+ -+static inline void bch2_bkey_compat(unsigned level, enum btree_id btree_id, -+ unsigned version, unsigned big_endian, -+ int write, -+ struct bkey_format *f, -+ struct bkey_packed *k) -+{ -+ if (version < bcachefs_metadata_version_current || -+ big_endian != CPU_BIG_ENDIAN) -+ __bch2_bkey_compat(level, btree_id, version, -+ big_endian, write, f, k); -+ -+} -+ -+#endif /* _BCACHEFS_BKEY_METHODS_H */ -diff --git a/fs/bcachefs/bkey_on_stack.h b/fs/bcachefs/bkey_on_stack.h -new file mode 100644 -index 000000000000..f607a0cb37ed ---- /dev/null -+++ b/fs/bcachefs/bkey_on_stack.h -@@ -0,0 +1,43 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BKEY_ON_STACK_H -+#define _BCACHEFS_BKEY_ON_STACK_H -+ -+#include "bcachefs.h" -+ -+struct bkey_on_stack { -+ struct bkey_i *k; -+ u64 onstack[12]; -+}; -+ -+static inline void bkey_on_stack_realloc(struct bkey_on_stack *s, -+ struct bch_fs *c, unsigned u64s) -+{ -+ if (s->k == (void *) s->onstack && -+ u64s > ARRAY_SIZE(s->onstack)) { -+ s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS); -+ memcpy(s->k, s->onstack, sizeof(s->onstack)); -+ } -+} -+ -+static inline void bkey_on_stack_reassemble(struct bkey_on_stack *s, -+ struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ bkey_on_stack_realloc(s, c, k.k->u64s); -+ bkey_reassemble(s->k, k); -+} -+ -+static inline void bkey_on_stack_init(struct bkey_on_stack *s) -+{ -+ s->k = (void *) s->onstack; -+} -+ -+static inline void bkey_on_stack_exit(struct bkey_on_stack *s, -+ struct bch_fs *c) -+{ -+ if (s->k != (void *) s->onstack) -+ mempool_free(s->k, &c->large_bkey_pool); -+ s->k = NULL; -+} -+ -+#endif /* _BCACHEFS_BKEY_ON_STACK_H */ -diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c -new file mode 100644 -index 000000000000..839e78d1dc35 ---- 
/dev/null -+++ b/fs/bcachefs/bkey_sort.c -@@ -0,0 +1,515 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "bkey_on_stack.h" -+#include "bkey_sort.h" -+#include "bset.h" -+#include "extents.h" -+ -+typedef int (*sort_cmp_fn)(struct btree *, -+ struct bkey_packed *, -+ struct bkey_packed *); -+ -+static inline bool sort_iter_end(struct sort_iter *iter) -+{ -+ return !iter->used; -+} -+ -+static inline void __sort_iter_sift(struct sort_iter *iter, -+ unsigned from, -+ sort_cmp_fn cmp) -+{ -+ unsigned i; -+ -+ for (i = from; -+ i + 1 < iter->used && -+ cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0; -+ i++) -+ swap(iter->data[i], iter->data[i + 1]); -+} -+ -+static inline void sort_iter_sift(struct sort_iter *iter, sort_cmp_fn cmp) -+{ -+ -+ __sort_iter_sift(iter, 0, cmp); -+} -+ -+static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp) -+{ -+ unsigned i = iter->used; -+ -+ while (i--) -+ __sort_iter_sift(iter, i, cmp); -+} -+ -+static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter) -+{ -+ return !sort_iter_end(iter) ? 
iter->data->k : NULL; -+} -+ -+static inline void __sort_iter_advance(struct sort_iter *iter, -+ unsigned idx, sort_cmp_fn cmp) -+{ -+ struct sort_iter_set *i = iter->data + idx; -+ -+ BUG_ON(idx >= iter->used); -+ -+ i->k = bkey_next_skip_noops(i->k, i->end); -+ -+ BUG_ON(i->k > i->end); -+ -+ if (i->k == i->end) -+ array_remove_item(iter->data, iter->used, idx); -+ else -+ __sort_iter_sift(iter, idx, cmp); -+} -+ -+static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) -+{ -+ __sort_iter_advance(iter, 0, cmp); -+} -+ -+static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter, -+ sort_cmp_fn cmp) -+{ -+ struct bkey_packed *ret = sort_iter_peek(iter); -+ -+ if (ret) -+ sort_iter_advance(iter, cmp); -+ -+ return ret; -+} -+ -+/* -+ * If keys compare equal, compare by pointer order: -+ */ -+static inline int key_sort_fix_overlapping_cmp(struct btree *b, -+ struct bkey_packed *l, -+ struct bkey_packed *r) -+{ -+ return bkey_cmp_packed(b, l, r) ?: -+ cmp_int((unsigned long) l, (unsigned long) r); -+} -+ -+static inline bool should_drop_next_key(struct sort_iter *iter) -+{ -+ /* -+ * key_sort_cmp() ensures that when keys compare equal the older key -+ * comes first; so if l->k compares equal to r->k then l->k is older -+ * and should be dropped. 
-+ */ -+ return iter->used >= 2 && -+ !bkey_cmp_packed(iter->b, -+ iter->data[0].k, -+ iter->data[1].k); -+} -+ -+struct btree_nr_keys -+bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, -+ struct sort_iter *iter) -+{ -+ struct bkey_packed *out = dst->start; -+ struct bkey_packed *k; -+ struct btree_nr_keys nr; -+ -+ memset(&nr, 0, sizeof(nr)); -+ -+ sort_iter_sort(iter, key_sort_fix_overlapping_cmp); -+ -+ while ((k = sort_iter_peek(iter))) { -+ if (!bkey_whiteout(k) && -+ !should_drop_next_key(iter)) { -+ bkey_copy(out, k); -+ btree_keys_account_key_add(&nr, 0, out); -+ out = bkey_next(out); -+ } -+ -+ sort_iter_advance(iter, key_sort_fix_overlapping_cmp); -+ } -+ -+ dst->u64s = cpu_to_le16((u64 *) out - dst->_data); -+ return nr; -+} -+ -+static void extent_sort_append(struct bch_fs *c, -+ struct bkey_format *f, -+ struct btree_nr_keys *nr, -+ struct bkey_packed **out, -+ struct bkey_s k) -+{ -+ if (!bkey_whiteout(k.k)) { -+ if (!bch2_bkey_pack_key(*out, k.k, f)) -+ memcpy_u64s_small(*out, k.k, BKEY_U64s); -+ -+ memcpy_u64s_small(bkeyp_val(f, *out), k.v, bkey_val_u64s(k.k)); -+ -+ btree_keys_account_key_add(nr, 0, *out); -+ *out = bkey_next(*out); -+ } -+} -+ -+/* Sort + repack in a new format: */ -+struct btree_nr_keys -+bch2_sort_repack(struct bset *dst, struct btree *src, -+ struct btree_node_iter *src_iter, -+ struct bkey_format *out_f, -+ bool filter_whiteouts) -+{ -+ struct bkey_format *in_f = &src->format; -+ struct bkey_packed *in, *out = vstruct_last(dst); -+ struct btree_nr_keys nr; -+ -+ memset(&nr, 0, sizeof(nr)); -+ -+ while ((in = bch2_btree_node_iter_next_all(src_iter, src))) { -+ if (filter_whiteouts && bkey_whiteout(in)) -+ continue; -+ -+ if (bch2_bkey_transform(out_f, out, bkey_packed(in) -+ ? 
in_f : &bch2_bkey_format_current, in)) -+ out->format = KEY_FORMAT_LOCAL_BTREE; -+ else -+ bch2_bkey_unpack(src, (void *) out, in); -+ -+ btree_keys_account_key_add(&nr, 0, out); -+ out = bkey_next(out); -+ } -+ -+ dst->u64s = cpu_to_le16((u64 *) out - dst->_data); -+ return nr; -+} -+ -+/* Sort, repack, and call bch2_bkey_normalize() to drop stale pointers: */ -+struct btree_nr_keys -+bch2_sort_repack_merge(struct bch_fs *c, -+ struct bset *dst, struct btree *src, -+ struct btree_node_iter *iter, -+ struct bkey_format *out_f, -+ bool filter_whiteouts) -+{ -+ struct bkey_packed *out = vstruct_last(dst), *k_packed; -+ struct bkey_on_stack k; -+ struct btree_nr_keys nr; -+ -+ memset(&nr, 0, sizeof(nr)); -+ bkey_on_stack_init(&k); -+ -+ while ((k_packed = bch2_btree_node_iter_next_all(iter, src))) { -+ if (filter_whiteouts && bkey_whiteout(k_packed)) -+ continue; -+ -+ /* -+ * NOTE: -+ * bch2_bkey_normalize may modify the key we pass it (dropping -+ * stale pointers) and we don't have a write lock on the src -+ * node; we have to make a copy of the entire key before calling -+ * normalize -+ */ -+ bkey_on_stack_realloc(&k, c, k_packed->u64s + BKEY_U64s); -+ bch2_bkey_unpack(src, k.k, k_packed); -+ -+ if (filter_whiteouts && -+ bch2_bkey_normalize(c, bkey_i_to_s(k.k))) -+ continue; -+ -+ extent_sort_append(c, out_f, &nr, &out, bkey_i_to_s(k.k)); -+ } -+ -+ dst->u64s = cpu_to_le16((u64 *) out - dst->_data); -+ bkey_on_stack_exit(&k, c); -+ return nr; -+} -+ -+static inline int sort_keys_cmp(struct btree *b, -+ struct bkey_packed *l, -+ struct bkey_packed *r) -+{ -+ return bkey_cmp_packed(b, l, r) ?: -+ (int) bkey_deleted(r) - (int) bkey_deleted(l) ?: -+ (int) l->needs_whiteout - (int) r->needs_whiteout; -+} -+ -+unsigned bch2_sort_keys(struct bkey_packed *dst, -+ struct sort_iter *iter, -+ bool filter_whiteouts) -+{ -+ const struct bkey_format *f = &iter->b->format; -+ struct bkey_packed *in, *next, *out = dst; -+ -+ sort_iter_sort(iter, sort_keys_cmp); -+ -+ while ((in 
= sort_iter_next(iter, sort_keys_cmp))) { -+ bool needs_whiteout = false; -+ -+ if (bkey_whiteout(in) && -+ (filter_whiteouts || !in->needs_whiteout)) -+ continue; -+ -+ while ((next = sort_iter_peek(iter)) && -+ !bkey_cmp_packed(iter->b, in, next)) { -+ BUG_ON(in->needs_whiteout && -+ next->needs_whiteout); -+ needs_whiteout |= in->needs_whiteout; -+ in = sort_iter_next(iter, sort_keys_cmp); -+ } -+ -+ if (bkey_whiteout(in)) { -+ memcpy_u64s(out, in, bkeyp_key_u64s(f, in)); -+ set_bkeyp_val_u64s(f, out, 0); -+ } else { -+ bkey_copy(out, in); -+ } -+ out->needs_whiteout |= needs_whiteout; -+ out = bkey_next(out); -+ } -+ -+ return (u64 *) out - (u64 *) dst; -+} -+ -+/* Compat code for btree_node_old_extent_overwrite: */ -+ -+/* -+ * If keys compare equal, compare by pointer order: -+ * -+ * Necessary for sort_fix_overlapping() - if there are multiple keys that -+ * compare equal in different sets, we have to process them newest to oldest. -+ */ -+static inline int extent_sort_fix_overlapping_cmp(struct btree *b, -+ struct bkey_packed *l, -+ struct bkey_packed *r) -+{ -+ struct bkey ul = bkey_unpack_key(b, l); -+ struct bkey ur = bkey_unpack_key(b, r); -+ -+ return bkey_cmp(bkey_start_pos(&ul), -+ bkey_start_pos(&ur)) ?: -+ cmp_int((unsigned long) r, (unsigned long) l); -+} -+ -+/* -+ * The algorithm in extent_sort_fix_overlapping() relies on keys in the same -+ * bset being ordered by start offset - but 0 size whiteouts (which are always -+ * KEY_TYPE_deleted) break this ordering, so we need to skip over them: -+ */ -+static void extent_iter_advance(struct sort_iter *iter, unsigned idx) -+{ -+ struct sort_iter_set *i = iter->data + idx; -+ -+ do { -+ i->k = bkey_next_skip_noops(i->k, i->end); -+ } while (i->k != i->end && bkey_deleted(i->k)); -+ -+ if (i->k == i->end) -+ array_remove_item(iter->data, iter->used, idx); -+ else -+ __sort_iter_sift(iter, idx, extent_sort_fix_overlapping_cmp); -+} -+ -+struct btree_nr_keys -+bch2_extent_sort_fix_overlapping(struct 
bch_fs *c, struct bset *dst, -+ struct sort_iter *iter) -+{ -+ struct btree *b = iter->b; -+ struct bkey_format *f = &b->format; -+ struct sort_iter_set *_l = iter->data, *_r = iter->data + 1; -+ struct bkey_packed *out = dst->start; -+ struct bkey l_unpacked, r_unpacked; -+ struct bkey_s l, r; -+ struct btree_nr_keys nr; -+ struct bkey_on_stack split; -+ unsigned i; -+ -+ memset(&nr, 0, sizeof(nr)); -+ bkey_on_stack_init(&split); -+ -+ sort_iter_sort(iter, extent_sort_fix_overlapping_cmp); -+ for (i = 0; i < iter->used;) { -+ if (bkey_deleted(iter->data[i].k)) -+ __sort_iter_advance(iter, i, -+ extent_sort_fix_overlapping_cmp); -+ else -+ i++; -+ } -+ -+ while (!sort_iter_end(iter)) { -+ l = __bkey_disassemble(b, _l->k, &l_unpacked); -+ -+ if (iter->used == 1) { -+ extent_sort_append(c, f, &nr, &out, l); -+ extent_iter_advance(iter, 0); -+ continue; -+ } -+ -+ r = __bkey_disassemble(b, _r->k, &r_unpacked); -+ -+ /* If current key and next key don't overlap, just append */ -+ if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) { -+ extent_sort_append(c, f, &nr, &out, l); -+ extent_iter_advance(iter, 0); -+ continue; -+ } -+ -+ /* Skip 0 size keys */ -+ if (!r.k->size) { -+ extent_iter_advance(iter, 1); -+ continue; -+ } -+ -+ /* -+ * overlap: keep the newer key and trim the older key so they -+ * don't overlap. comparing pointers tells us which one is -+ * newer, since the bsets are appended one after the other. 
-+ */ -+ -+ /* can't happen because of comparison func */ -+ BUG_ON(_l->k < _r->k && -+ !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k))); -+ -+ if (_l->k > _r->k) { -+ /* l wins, trim r */ -+ if (bkey_cmp(l.k->p, r.k->p) >= 0) { -+ extent_iter_advance(iter, 1); -+ } else { -+ bch2_cut_front_s(l.k->p, r); -+ extent_save(b, _r->k, r.k); -+ __sort_iter_sift(iter, 1, -+ extent_sort_fix_overlapping_cmp); -+ } -+ } else if (bkey_cmp(l.k->p, r.k->p) > 0) { -+ -+ /* -+ * r wins, but it overlaps in the middle of l - split l: -+ */ -+ bkey_on_stack_reassemble(&split, c, l.s_c); -+ bch2_cut_back(bkey_start_pos(r.k), split.k); -+ -+ bch2_cut_front_s(r.k->p, l); -+ extent_save(b, _l->k, l.k); -+ -+ __sort_iter_sift(iter, 0, -+ extent_sort_fix_overlapping_cmp); -+ -+ extent_sort_append(c, f, &nr, &out, -+ bkey_i_to_s(split.k)); -+ } else { -+ bch2_cut_back_s(bkey_start_pos(r.k), l); -+ extent_save(b, _l->k, l.k); -+ } -+ } -+ -+ dst->u64s = cpu_to_le16((u64 *) out - dst->_data); -+ -+ bkey_on_stack_exit(&split, c); -+ return nr; -+} -+ -+static inline int sort_extents_cmp(struct btree *b, -+ struct bkey_packed *l, -+ struct bkey_packed *r) -+{ -+ return bkey_cmp_packed(b, l, r) ?: -+ (int) bkey_deleted(l) - (int) bkey_deleted(r); -+} -+ -+unsigned bch2_sort_extents(struct bkey_packed *dst, -+ struct sort_iter *iter, -+ bool filter_whiteouts) -+{ -+ struct bkey_packed *in, *out = dst; -+ -+ sort_iter_sort(iter, sort_extents_cmp); -+ -+ while ((in = sort_iter_next(iter, sort_extents_cmp))) { -+ if (bkey_deleted(in)) -+ continue; -+ -+ if (bkey_whiteout(in) && -+ (filter_whiteouts || !in->needs_whiteout)) -+ continue; -+ -+ bkey_copy(out, in); -+ out = bkey_next(out); -+ } -+ -+ return (u64 *) out - (u64 *) dst; -+} -+ -+static inline int sort_extent_whiteouts_cmp(struct btree *b, -+ struct bkey_packed *l, -+ struct bkey_packed *r) -+{ -+ struct bkey ul = bkey_unpack_key(b, l); -+ struct bkey ur = bkey_unpack_key(b, r); -+ -+ return bkey_cmp(bkey_start_pos(&ul), 
bkey_start_pos(&ur)); -+} -+ -+unsigned bch2_sort_extent_whiteouts(struct bkey_packed *dst, -+ struct sort_iter *iter) -+{ -+ const struct bkey_format *f = &iter->b->format; -+ struct bkey_packed *in, *out = dst; -+ struct bkey_i l, r; -+ bool prev = false, l_packed = false; -+ u64 max_packed_size = bkey_field_max(f, BKEY_FIELD_SIZE); -+ u64 max_packed_offset = bkey_field_max(f, BKEY_FIELD_OFFSET); -+ u64 new_size; -+ -+ max_packed_size = min_t(u64, max_packed_size, KEY_SIZE_MAX); -+ -+ sort_iter_sort(iter, sort_extent_whiteouts_cmp); -+ -+ while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) { -+ if (bkey_deleted(in)) -+ continue; -+ -+ EBUG_ON(bkeyp_val_u64s(f, in)); -+ EBUG_ON(in->type != KEY_TYPE_discard); -+ -+ r.k = bkey_unpack_key(iter->b, in); -+ -+ if (prev && -+ bkey_cmp(l.k.p, bkey_start_pos(&r.k)) >= 0) { -+ if (bkey_cmp(l.k.p, r.k.p) >= 0) -+ continue; -+ -+ new_size = l_packed -+ ? min(max_packed_size, max_packed_offset - -+ bkey_start_offset(&l.k)) -+ : KEY_SIZE_MAX; -+ -+ new_size = min(new_size, r.k.p.offset - -+ bkey_start_offset(&l.k)); -+ -+ BUG_ON(new_size < l.k.size); -+ -+ bch2_key_resize(&l.k, new_size); -+ -+ if (bkey_cmp(l.k.p, r.k.p) >= 0) -+ continue; -+ -+ bch2_cut_front(l.k.p, &r); -+ } -+ -+ if (prev) { -+ if (!bch2_bkey_pack(out, &l, f)) { -+ BUG_ON(l_packed); -+ bkey_copy(out, &l); -+ } -+ out = bkey_next(out); -+ } -+ -+ l = r; -+ prev = true; -+ l_packed = bkey_packed(in); -+ } -+ -+ if (prev) { -+ if (!bch2_bkey_pack(out, &l, f)) { -+ BUG_ON(l_packed); -+ bkey_copy(out, &l); -+ } -+ out = bkey_next(out); -+ } -+ -+ return (u64 *) out - (u64 *) dst; -+} -diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h -new file mode 100644 -index 000000000000..458a051fdac5 ---- /dev/null -+++ b/fs/bcachefs/bkey_sort.h -@@ -0,0 +1,57 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BKEY_SORT_H -+#define _BCACHEFS_BKEY_SORT_H -+ -+struct sort_iter { -+ struct btree *b; -+ unsigned used; -+ unsigned size; -+ -+ 
struct sort_iter_set { -+ struct bkey_packed *k, *end; -+ } data[MAX_BSETS + 1]; -+}; -+ -+static inline void sort_iter_init(struct sort_iter *iter, struct btree *b) -+{ -+ iter->b = b; -+ iter->used = 0; -+ iter->size = ARRAY_SIZE(iter->data); -+} -+ -+static inline void sort_iter_add(struct sort_iter *iter, -+ struct bkey_packed *k, -+ struct bkey_packed *end) -+{ -+ BUG_ON(iter->used >= iter->size); -+ -+ if (k != end) -+ iter->data[iter->used++] = (struct sort_iter_set) { k, end }; -+} -+ -+struct btree_nr_keys -+bch2_key_sort_fix_overlapping(struct bch_fs *, struct bset *, -+ struct sort_iter *); -+struct btree_nr_keys -+bch2_extent_sort_fix_overlapping(struct bch_fs *, struct bset *, -+ struct sort_iter *); -+ -+struct btree_nr_keys -+bch2_sort_repack(struct bset *, struct btree *, -+ struct btree_node_iter *, -+ struct bkey_format *, bool); -+struct btree_nr_keys -+bch2_sort_repack_merge(struct bch_fs *, -+ struct bset *, struct btree *, -+ struct btree_node_iter *, -+ struct bkey_format *, bool); -+ -+unsigned bch2_sort_keys(struct bkey_packed *, -+ struct sort_iter *, bool); -+unsigned bch2_sort_extents(struct bkey_packed *, -+ struct sort_iter *, bool); -+ -+unsigned bch2_sort_extent_whiteouts(struct bkey_packed *, -+ struct sort_iter *); -+ -+#endif /* _BCACHEFS_BKEY_SORT_H */ -diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c -new file mode 100644 -index 000000000000..6fc91e6a35e8 ---- /dev/null -+++ b/fs/bcachefs/bset.c -@@ -0,0 +1,1803 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Code for working with individual keys, and sorted sets of keys with in a -+ * btree node -+ * -+ * Copyright 2012 Google, Inc. -+ */ -+ -+#include "bcachefs.h" -+#include "btree_cache.h" -+#include "bset.h" -+#include "eytzinger.h" -+#include "util.h" -+ -+#include -+#include -+#include -+#include -+ -+/* hack.. 
*/ -+#include "alloc_types.h" -+#include -+ -+static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *, -+ struct btree *); -+ -+static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter) -+{ -+ unsigned n = ARRAY_SIZE(iter->data); -+ -+ while (n && __btree_node_iter_set_end(iter, n - 1)) -+ --n; -+ -+ return n; -+} -+ -+struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k) -+{ -+ unsigned offset = __btree_node_key_to_offset(b, k); -+ struct bset_tree *t; -+ -+ for_each_bset(b, t) -+ if (offset <= t->end_offset) { -+ EBUG_ON(offset < btree_bkey_first_offset(t)); -+ return t; -+ } -+ -+ BUG(); -+} -+ -+/* -+ * There are never duplicate live keys in the btree - but including keys that -+ * have been flagged as deleted (and will be cleaned up later) we _will_ see -+ * duplicates. -+ * -+ * Thus the sort order is: usual key comparison first, but for keys that compare -+ * equal the deleted key(s) come first, and the (at most one) live version comes -+ * last. -+ * -+ * The main reason for this is insertion: to handle overwrites, we first iterate -+ * over keys that compare equal to our insert key, and then insert immediately -+ * prior to the first key greater than the key we're inserting - our insert -+ * position will be after all keys that compare equal to our insert key, which -+ * by the time we actually do the insert will all be deleted. 
-+ */ -+ -+void bch2_dump_bset(struct bch_fs *c, struct btree *b, -+ struct bset *i, unsigned set) -+{ -+ struct bkey_packed *_k, *_n; -+ struct bkey uk, n; -+ struct bkey_s_c k; -+ char buf[200]; -+ -+ if (!i->u64s) -+ return; -+ -+ for (_k = i->start; -+ _k < vstruct_last(i); -+ _k = _n) { -+ _n = bkey_next_skip_noops(_k, vstruct_last(i)); -+ -+ k = bkey_disassemble(b, _k, &uk); -+ if (c) -+ bch2_bkey_val_to_text(&PBUF(buf), c, k); -+ else -+ bch2_bkey_to_text(&PBUF(buf), k.k); -+ printk(KERN_ERR "block %u key %5zu: %s\n", set, -+ _k->_data - i->_data, buf); -+ -+ if (_n == vstruct_last(i)) -+ continue; -+ -+ n = bkey_unpack_key(b, _n); -+ -+ if (bkey_cmp(bkey_start_pos(&n), k.k->p) < 0) { -+ printk(KERN_ERR "Key skipped backwards\n"); -+ continue; -+ } -+ -+ if (!bkey_deleted(k.k) && -+ !bkey_cmp(n.p, k.k->p)) -+ printk(KERN_ERR "Duplicate keys\n"); -+ } -+} -+ -+void bch2_dump_btree_node(struct bch_fs *c, struct btree *b) -+{ -+ struct bset_tree *t; -+ -+ console_lock(); -+ for_each_bset(b, t) -+ bch2_dump_bset(c, b, bset(b, t), t - b->set); -+ console_unlock(); -+} -+ -+void bch2_dump_btree_node_iter(struct btree *b, -+ struct btree_node_iter *iter) -+{ -+ struct btree_node_iter_set *set; -+ -+ printk(KERN_ERR "btree node iter with %u/%u sets:\n", -+ __btree_node_iter_used(iter), b->nsets); -+ -+ btree_node_iter_for_each(iter, set) { -+ struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); -+ struct bset_tree *t = bch2_bkey_to_bset(b, k); -+ struct bkey uk = bkey_unpack_key(b, k); -+ char buf[100]; -+ -+ bch2_bkey_to_text(&PBUF(buf), &uk); -+ printk(KERN_ERR "set %zu key %u: %s\n", -+ t - b->set, set->k, buf); -+ } -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ -+void __bch2_verify_btree_nr_keys(struct btree *b) -+{ -+ struct bset_tree *t; -+ struct bkey_packed *k; -+ struct btree_nr_keys nr = { 0 }; -+ -+ for_each_bset(b, t) -+ bset_tree_for_each_key(b, t, k) -+ if (!bkey_whiteout(k)) -+ btree_keys_account_key_add(&nr, t - b->set, k); -+ -+ 
BUG_ON(memcmp(&nr, &b->nr, sizeof(nr))); -+} -+ -+static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, -+ struct btree *b) -+{ -+ struct btree_node_iter iter = *_iter; -+ const struct bkey_packed *k, *n; -+ -+ k = bch2_btree_node_iter_peek_all(&iter, b); -+ __bch2_btree_node_iter_advance(&iter, b); -+ n = bch2_btree_node_iter_peek_all(&iter, b); -+ -+ bkey_unpack_key(b, k); -+ -+ if (n && -+ bkey_iter_cmp(b, k, n) > 0) { -+ struct btree_node_iter_set *set; -+ struct bkey ku = bkey_unpack_key(b, k); -+ struct bkey nu = bkey_unpack_key(b, n); -+ char buf1[80], buf2[80]; -+ -+ bch2_dump_btree_node(NULL, b); -+ bch2_bkey_to_text(&PBUF(buf1), &ku); -+ bch2_bkey_to_text(&PBUF(buf2), &nu); -+ printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n", -+ buf1, buf2); -+ printk(KERN_ERR "iter was:"); -+ -+ btree_node_iter_for_each(_iter, set) { -+ struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); -+ struct bset_tree *t = bch2_bkey_to_bset(b, k); -+ printk(" [%zi %zi]", t - b->set, -+ k->_data - bset(b, t)->_data); -+ } -+ panic("\n"); -+ } -+} -+ -+void bch2_btree_node_iter_verify(struct btree_node_iter *iter, -+ struct btree *b) -+{ -+ struct btree_node_iter_set *set, *s2; -+ struct bkey_packed *k, *p; -+ struct bset_tree *t; -+ -+ if (bch2_btree_node_iter_end(iter)) -+ return; -+ -+ /* Verify no duplicates: */ -+ btree_node_iter_for_each(iter, set) -+ btree_node_iter_for_each(iter, s2) -+ BUG_ON(set != s2 && set->end == s2->end); -+ -+ /* Verify that set->end is correct: */ -+ btree_node_iter_for_each(iter, set) { -+ for_each_bset(b, t) -+ if (set->end == t->end_offset) -+ goto found; -+ BUG(); -+found: -+ BUG_ON(set->k < btree_bkey_first_offset(t) || -+ set->k >= t->end_offset); -+ } -+ -+ /* Verify iterator is sorted: */ -+ btree_node_iter_for_each(iter, set) -+ BUG_ON(set != iter->data && -+ btree_node_iter_cmp(b, set[-1], set[0]) > 0); -+ -+ k = bch2_btree_node_iter_peek_all(iter, b); -+ -+ for_each_bset(b, t) { -+ if (iter->data[0].end 
== t->end_offset) -+ continue; -+ -+ p = bch2_bkey_prev_all(b, t, -+ bch2_btree_node_iter_bset_pos(iter, b, t)); -+ -+ BUG_ON(p && bkey_iter_cmp(b, k, p) < 0); -+ } -+} -+ -+void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, -+ struct bkey_packed *insert, unsigned clobber_u64s) -+{ -+ struct bset_tree *t = bch2_bkey_to_bset(b, where); -+ struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where); -+ struct bkey_packed *next = (void *) (where->_data + clobber_u64s); -+#if 0 -+ BUG_ON(prev && -+ bkey_iter_cmp(b, prev, insert) > 0); -+#else -+ if (prev && -+ bkey_iter_cmp(b, prev, insert) > 0) { -+ struct bkey k1 = bkey_unpack_key(b, prev); -+ struct bkey k2 = bkey_unpack_key(b, insert); -+ char buf1[100]; -+ char buf2[100]; -+ -+ bch2_dump_btree_node(NULL, b); -+ bch2_bkey_to_text(&PBUF(buf1), &k1); -+ bch2_bkey_to_text(&PBUF(buf2), &k2); -+ -+ panic("prev > insert:\n" -+ "prev key %s\n" -+ "insert key %s\n", -+ buf1, buf2); -+ } -+#endif -+#if 0 -+ BUG_ON(next != btree_bkey_last(b, t) && -+ bkey_iter_cmp(b, insert, next) > 0); -+#else -+ if (next != btree_bkey_last(b, t) && -+ bkey_iter_cmp(b, insert, next) > 0) { -+ struct bkey k1 = bkey_unpack_key(b, insert); -+ struct bkey k2 = bkey_unpack_key(b, next); -+ char buf1[100]; -+ char buf2[100]; -+ -+ bch2_dump_btree_node(NULL, b); -+ bch2_bkey_to_text(&PBUF(buf1), &k1); -+ bch2_bkey_to_text(&PBUF(buf2), &k2); -+ -+ panic("insert > next:\n" -+ "insert key %s\n" -+ "next key %s\n", -+ buf1, buf2); -+ } -+#endif -+} -+ -+#else -+ -+static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, -+ struct btree *b) {} -+ -+#endif -+ -+/* Auxiliary search trees */ -+ -+#define BFLOAT_FAILED_UNPACKED U8_MAX -+#define BFLOAT_FAILED U8_MAX -+ -+struct bkey_float { -+ u8 exponent; -+ u8 key_offset; -+ u16 mantissa; -+}; -+#define BKEY_MANTISSA_BITS 16 -+ -+static unsigned bkey_float_byte_offset(unsigned idx) -+{ -+ return idx * sizeof(struct bkey_float); -+} -+ -+struct ro_aux_tree { -+ 
struct bkey_float f[0]; -+}; -+ -+struct rw_aux_tree { -+ u16 offset; -+ struct bpos k; -+}; -+ -+/* -+ * BSET_CACHELINE was originally intended to match the hardware cacheline size - -+ * it used to be 64, but I realized the lookup code would touch slightly less -+ * memory if it was 128. -+ * -+ * It definites the number of bytes (in struct bset) per struct bkey_float in -+ * the auxiliar search tree - when we're done searching the bset_float tree we -+ * have this many bytes left that we do a linear search over. -+ * -+ * Since (after level 5) every level of the bset_tree is on a new cacheline, -+ * we're touching one fewer cacheline in the bset tree in exchange for one more -+ * cacheline in the linear search - but the linear search might stop before it -+ * gets to the second cacheline. -+ */ -+ -+#define BSET_CACHELINE 128 -+ -+/* Space required for the btree node keys */ -+static inline size_t btree_keys_bytes(struct btree *b) -+{ -+ return PAGE_SIZE << b->page_order; -+} -+ -+static inline size_t btree_keys_cachelines(struct btree *b) -+{ -+ return btree_keys_bytes(b) / BSET_CACHELINE; -+} -+ -+static inline size_t btree_aux_data_bytes(struct btree *b) -+{ -+ return btree_keys_cachelines(b) * 8; -+} -+ -+static inline size_t btree_aux_data_u64s(struct btree *b) -+{ -+ return btree_aux_data_bytes(b) / sizeof(u64); -+} -+ -+static unsigned bset_aux_tree_buf_end(const struct bset_tree *t) -+{ -+ BUG_ON(t->aux_data_offset == U16_MAX); -+ -+ switch (bset_aux_tree_type(t)) { -+ case BSET_NO_AUX_TREE: -+ return t->aux_data_offset; -+ case BSET_RO_AUX_TREE: -+ return t->aux_data_offset + -+ DIV_ROUND_UP(t->size * sizeof(struct bkey_float) + -+ t->size * sizeof(u8), 8); -+ case BSET_RW_AUX_TREE: -+ return t->aux_data_offset + -+ DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8); -+ default: -+ BUG(); -+ } -+} -+ -+static unsigned bset_aux_tree_buf_start(const struct btree *b, -+ const struct bset_tree *t) -+{ -+ return t == b->set -+ ? 
DIV_ROUND_UP(b->unpack_fn_len, 8) -+ : bset_aux_tree_buf_end(t - 1); -+} -+ -+static void *__aux_tree_base(const struct btree *b, -+ const struct bset_tree *t) -+{ -+ return b->aux_data + t->aux_data_offset * 8; -+} -+ -+static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b, -+ const struct bset_tree *t) -+{ -+ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); -+ -+ return __aux_tree_base(b, t); -+} -+ -+static u8 *ro_aux_tree_prev(const struct btree *b, -+ const struct bset_tree *t) -+{ -+ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); -+ -+ return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size); -+} -+ -+static struct bkey_float *bkey_float(const struct btree *b, -+ const struct bset_tree *t, -+ unsigned idx) -+{ -+ return ro_aux_tree_base(b, t)->f + idx; -+} -+ -+static void bset_aux_tree_verify(struct btree *b) -+{ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ struct bset_tree *t; -+ -+ for_each_bset(b, t) { -+ if (t->aux_data_offset == U16_MAX) -+ continue; -+ -+ BUG_ON(t != b->set && -+ t[-1].aux_data_offset == U16_MAX); -+ -+ BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t)); -+ BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b)); -+ BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b)); -+ } -+#endif -+} -+ -+/* Memory allocation */ -+ -+void bch2_btree_keys_free(struct btree *b) -+{ -+ vfree(b->aux_data); -+ b->aux_data = NULL; -+} -+ -+#ifndef PAGE_KERNEL_EXEC -+# define PAGE_KERNEL_EXEC PAGE_KERNEL -+#endif -+ -+int bch2_btree_keys_alloc(struct btree *b, unsigned page_order, gfp_t gfp) -+{ -+ b->page_order = page_order; -+ b->aux_data = __vmalloc(btree_aux_data_bytes(b), gfp, -+ PAGE_KERNEL_EXEC); -+ if (!b->aux_data) -+ return -ENOMEM; -+ -+ return 0; -+} -+ -+void bch2_btree_keys_init(struct btree *b, bool *expensive_debug_checks) -+{ -+ unsigned i; -+ -+ b->nsets = 0; -+ memset(&b->nr, 0, sizeof(b->nr)); -+#ifdef CONFIG_BCACHEFS_DEBUG -+ b->expensive_debug_checks = expensive_debug_checks; -+#endif -+ for (i = 0; i < 
MAX_BSETS; i++) -+ b->set[i].data_offset = U16_MAX; -+ -+ bch2_bset_set_no_aux_tree(b, b->set); -+} -+ -+/* Binary tree stuff for auxiliary search trees */ -+ -+/* -+ * Cacheline/offset <-> bkey pointer arithmetic: -+ * -+ * t->tree is a binary search tree in an array; each node corresponds to a key -+ * in one cacheline in t->set (BSET_CACHELINE bytes). -+ * -+ * This means we don't have to store the full index of the key that a node in -+ * the binary tree points to; eytzinger1_to_inorder() gives us the cacheline, and -+ * then bkey_float->m gives us the offset within that cacheline, in units of 8 -+ * bytes. -+ * -+ * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to -+ * make this work. -+ * -+ * To construct the bfloat for an arbitrary key we need to know what the key -+ * immediately preceding it is: we have to check if the two keys differ in the -+ * bits we're going to store in bkey_float->mantissa. t->prev[j] stores the size -+ * of the previous key so we can walk backwards to it from t->tree[j]'s key. 
-+ */ -+ -+static inline void *bset_cacheline(const struct btree *b, -+ const struct bset_tree *t, -+ unsigned cacheline) -+{ -+ return (void *) round_down((unsigned long) btree_bkey_first(b, t), -+ L1_CACHE_BYTES) + -+ cacheline * BSET_CACHELINE; -+} -+ -+static struct bkey_packed *cacheline_to_bkey(const struct btree *b, -+ const struct bset_tree *t, -+ unsigned cacheline, -+ unsigned offset) -+{ -+ return bset_cacheline(b, t, cacheline) + offset * 8; -+} -+ -+static unsigned bkey_to_cacheline(const struct btree *b, -+ const struct bset_tree *t, -+ const struct bkey_packed *k) -+{ -+ return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE; -+} -+ -+static ssize_t __bkey_to_cacheline_offset(const struct btree *b, -+ const struct bset_tree *t, -+ unsigned cacheline, -+ const struct bkey_packed *k) -+{ -+ return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline); -+} -+ -+static unsigned bkey_to_cacheline_offset(const struct btree *b, -+ const struct bset_tree *t, -+ unsigned cacheline, -+ const struct bkey_packed *k) -+{ -+ size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k); -+ -+ EBUG_ON(m > U8_MAX); -+ return m; -+} -+ -+static inline struct bkey_packed *tree_to_bkey(const struct btree *b, -+ const struct bset_tree *t, -+ unsigned j) -+{ -+ return cacheline_to_bkey(b, t, -+ __eytzinger1_to_inorder(j, t->size, t->extra), -+ bkey_float(b, t, j)->key_offset); -+} -+ -+static struct bkey_packed *tree_to_prev_bkey(const struct btree *b, -+ const struct bset_tree *t, -+ unsigned j) -+{ -+ unsigned prev_u64s = ro_aux_tree_prev(b, t)[j]; -+ -+ return (void *) (tree_to_bkey(b, t, j)->_data - prev_u64s); -+} -+ -+static struct rw_aux_tree *rw_aux_tree(const struct btree *b, -+ const struct bset_tree *t) -+{ -+ EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); -+ -+ return __aux_tree_base(b, t); -+} -+ -+/* -+ * For the write set - the one we're currently inserting keys into - we don't -+ * maintain a full search tree, we just keep a simple lookup table in 
t->prev. -+ */ -+static struct bkey_packed *rw_aux_to_bkey(const struct btree *b, -+ struct bset_tree *t, -+ unsigned j) -+{ -+ return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset); -+} -+ -+static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t, -+ unsigned j, struct bkey_packed *k) -+{ -+ EBUG_ON(k >= btree_bkey_last(b, t)); -+ -+ rw_aux_tree(b, t)[j] = (struct rw_aux_tree) { -+ .offset = __btree_node_key_to_offset(b, k), -+ .k = bkey_unpack_pos(b, k), -+ }; -+} -+ -+static void bch2_bset_verify_rw_aux_tree(struct btree *b, -+ struct bset_tree *t) -+{ -+ struct bkey_packed *k = btree_bkey_first(b, t); -+ unsigned j = 0; -+ -+ if (!btree_keys_expensive_checks(b)) -+ return; -+ -+ BUG_ON(bset_has_ro_aux_tree(t)); -+ -+ if (!bset_has_rw_aux_tree(t)) -+ return; -+ -+ BUG_ON(t->size < 1); -+ BUG_ON(rw_aux_to_bkey(b, t, j) != k); -+ -+ goto start; -+ while (1) { -+ if (rw_aux_to_bkey(b, t, j) == k) { -+ BUG_ON(bkey_cmp(rw_aux_tree(b, t)[j].k, -+ bkey_unpack_pos(b, k))); -+start: -+ if (++j == t->size) -+ break; -+ -+ BUG_ON(rw_aux_tree(b, t)[j].offset <= -+ rw_aux_tree(b, t)[j - 1].offset); -+ } -+ -+ k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); -+ BUG_ON(k >= btree_bkey_last(b, t)); -+ } -+} -+ -+/* returns idx of first entry >= offset: */ -+static unsigned rw_aux_tree_bsearch(struct btree *b, -+ struct bset_tree *t, -+ unsigned offset) -+{ -+ unsigned bset_offs = offset - btree_bkey_first_offset(t); -+ unsigned bset_u64s = t->end_offset - btree_bkey_first_offset(t); -+ unsigned idx = bset_u64s ? 
bset_offs * t->size / bset_u64s : 0; -+ -+ EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); -+ EBUG_ON(!t->size); -+ EBUG_ON(idx > t->size); -+ -+ while (idx < t->size && -+ rw_aux_tree(b, t)[idx].offset < offset) -+ idx++; -+ -+ while (idx && -+ rw_aux_tree(b, t)[idx - 1].offset >= offset) -+ idx--; -+ -+ EBUG_ON(idx < t->size && -+ rw_aux_tree(b, t)[idx].offset < offset); -+ EBUG_ON(idx && rw_aux_tree(b, t)[idx - 1].offset >= offset); -+ EBUG_ON(idx + 1 < t->size && -+ rw_aux_tree(b, t)[idx].offset == -+ rw_aux_tree(b, t)[idx + 1].offset); -+ -+ return idx; -+} -+ -+static inline unsigned bkey_mantissa(const struct bkey_packed *k, -+ const struct bkey_float *f, -+ unsigned idx) -+{ -+ u64 v; -+ -+ EBUG_ON(!bkey_packed(k)); -+ -+ v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3))); -+ -+ /* -+ * In little endian, we're shifting off low bits (and then the bits we -+ * want are at the low end), in big endian we're shifting off high bits -+ * (and then the bits we want are at the high end, so we shift them -+ * back down): -+ */ -+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -+ v >>= f->exponent & 7; -+#else -+ v >>= 64 - (f->exponent & 7) - BKEY_MANTISSA_BITS; -+#endif -+ return (u16) v; -+} -+ -+static void make_bfloat(struct btree *b, struct bset_tree *t, -+ unsigned j, -+ struct bkey_packed *min_key, -+ struct bkey_packed *max_key) -+{ -+ struct bkey_float *f = bkey_float(b, t, j); -+ struct bkey_packed *m = tree_to_bkey(b, t, j); -+ struct bkey_packed *l, *r; -+ unsigned mantissa; -+ int shift, exponent, high_bit; -+ -+ if (is_power_of_2(j)) { -+ l = min_key; -+ -+ if (!l->u64s) { -+ if (!bkey_pack_pos(l, b->data->min_key, b)) { -+ struct bkey_i tmp; -+ -+ bkey_init(&tmp.k); -+ tmp.k.p = b->data->min_key; -+ bkey_copy(l, &tmp); -+ } -+ } -+ } else { -+ l = tree_to_prev_bkey(b, t, j >> ffs(j)); -+ -+ EBUG_ON(m < l); -+ } -+ -+ if (is_power_of_2(j + 1)) { -+ r = max_key; -+ -+ if (!r->u64s) { -+ if (!bkey_pack_pos(r, t->max_key, b)) { -+ struct 
bkey_i tmp; -+ -+ bkey_init(&tmp.k); -+ tmp.k.p = t->max_key; -+ bkey_copy(r, &tmp); -+ } -+ } -+ } else { -+ r = tree_to_bkey(b, t, j >> (ffz(j) + 1)); -+ -+ EBUG_ON(m > r); -+ } -+ -+ /* -+ * for failed bfloats, the lookup code falls back to comparing against -+ * the original key. -+ */ -+ -+ if (!bkey_packed(l) || !bkey_packed(r) || !bkey_packed(m) || -+ !b->nr_key_bits) { -+ f->exponent = BFLOAT_FAILED_UNPACKED; -+ return; -+ } -+ -+ /* -+ * The greatest differing bit of l and r is the first bit we must -+ * include in the bfloat mantissa we're creating in order to do -+ * comparisons - that bit always becomes the high bit of -+ * bfloat->mantissa, and thus the exponent we're calculating here is -+ * the position of what will become the low bit in bfloat->mantissa: -+ * -+ * Note that this may be negative - we may be running off the low end -+ * of the key: we handle this later: -+ */ -+ high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r), -+ min_t(unsigned, BKEY_MANTISSA_BITS, b->nr_key_bits) - 1); -+ exponent = high_bit - (BKEY_MANTISSA_BITS - 1); -+ -+ /* -+ * Then we calculate the actual shift value, from the start of the key -+ * (k->_data), to get the key bits starting at exponent: -+ */ -+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -+ shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent; -+ -+ EBUG_ON(shift + BKEY_MANTISSA_BITS > b->format.key_u64s * 64); -+#else -+ shift = high_bit_offset + -+ b->nr_key_bits - -+ exponent - -+ BKEY_MANTISSA_BITS; -+ -+ EBUG_ON(shift < KEY_PACKED_BITS_START); -+#endif -+ EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED); -+ -+ f->exponent = shift; -+ mantissa = bkey_mantissa(m, f, j); -+ -+ /* -+ * If we've got garbage bits, set them to all 1s - it's legal for the -+ * bfloat to compare larger than the original key, but not smaller: -+ */ -+ if (exponent < 0) -+ mantissa |= ~(~0U << -exponent); -+ -+ f->mantissa = mantissa; -+} -+ -+/* bytes remaining - only valid for last bset: */ -+static unsigned 
__bset_tree_capacity(struct btree *b, struct bset_tree *t) -+{ -+ bset_aux_tree_verify(b); -+ -+ return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64); -+} -+ -+static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t) -+{ -+ return __bset_tree_capacity(b, t) / -+ (sizeof(struct bkey_float) + sizeof(u8)); -+} -+ -+static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t) -+{ -+ return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree); -+} -+ -+static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) -+{ -+ struct bkey_packed *k; -+ -+ t->size = 1; -+ t->extra = BSET_RW_AUX_TREE_VAL; -+ rw_aux_tree(b, t)[0].offset = -+ __btree_node_key_to_offset(b, btree_bkey_first(b, t)); -+ -+ bset_tree_for_each_key(b, t, k) { -+ if (t->size == bset_rw_tree_capacity(b, t)) -+ break; -+ -+ if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) > -+ L1_CACHE_BYTES) -+ rw_aux_tree_set(b, t, t->size++, k); -+ } -+} -+ -+static void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) -+{ -+ struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t); -+ struct bkey_packed min_key, max_key; -+ unsigned j, cacheline = 1; -+ -+ /* signal to make_bfloat() that they're uninitialized: */ -+ min_key.u64s = max_key.u64s = 0; -+ -+ t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)), -+ bset_ro_tree_capacity(b, t)); -+retry: -+ if (t->size < 2) { -+ t->size = 0; -+ t->extra = BSET_NO_AUX_TREE_VAL; -+ return; -+ } -+ -+ t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1; -+ -+ /* First we figure out where the first key in each cacheline is */ -+ eytzinger1_for_each(j, t->size) { -+ while (bkey_to_cacheline(b, t, k) < cacheline) -+ prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); -+ -+ if (k >= btree_bkey_last(b, t)) { -+ /* XXX: this path sucks */ -+ t->size--; -+ goto retry; -+ } -+ -+ ro_aux_tree_prev(b, t)[j] = prev->u64s; -+ bkey_float(b, t, j)->key_offset = -+ 
bkey_to_cacheline_offset(b, t, cacheline++, k); -+ -+ EBUG_ON(tree_to_prev_bkey(b, t, j) != prev); -+ EBUG_ON(tree_to_bkey(b, t, j) != k); -+ } -+ -+ while (k != btree_bkey_last(b, t)) -+ prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); -+ -+ t->max_key = bkey_unpack_pos(b, prev); -+ -+ /* Then we build the tree */ -+ eytzinger1_for_each(j, t->size) -+ make_bfloat(b, t, j, &min_key, &max_key); -+} -+ -+static void bset_alloc_tree(struct btree *b, struct bset_tree *t) -+{ -+ struct bset_tree *i; -+ -+ for (i = b->set; i != t; i++) -+ BUG_ON(bset_has_rw_aux_tree(i)); -+ -+ bch2_bset_set_no_aux_tree(b, t); -+ -+ /* round up to next cacheline: */ -+ t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t), -+ SMP_CACHE_BYTES / sizeof(u64)); -+ -+ bset_aux_tree_verify(b); -+} -+ -+void bch2_bset_build_aux_tree(struct btree *b, struct bset_tree *t, -+ bool writeable) -+{ -+ if (writeable -+ ? bset_has_rw_aux_tree(t) -+ : bset_has_ro_aux_tree(t)) -+ return; -+ -+ bset_alloc_tree(b, t); -+ -+ if (!__bset_tree_capacity(b, t)) -+ return; -+ -+ if (writeable) -+ __build_rw_aux_tree(b, t); -+ else -+ __build_ro_aux_tree(b, t); -+ -+ bset_aux_tree_verify(b); -+} -+ -+void bch2_bset_init_first(struct btree *b, struct bset *i) -+{ -+ struct bset_tree *t; -+ -+ BUG_ON(b->nsets); -+ -+ memset(i, 0, sizeof(*i)); -+ get_random_bytes(&i->seq, sizeof(i->seq)); -+ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); -+ -+ t = &b->set[b->nsets++]; -+ set_btree_bset(b, t, i); -+} -+ -+void bch2_bset_init_next(struct bch_fs *c, struct btree *b, -+ struct btree_node_entry *bne) -+{ -+ struct bset *i = &bne->keys; -+ struct bset_tree *t; -+ -+ BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c)); -+ BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b))); -+ BUG_ON(b->nsets >= MAX_BSETS); -+ -+ memset(i, 0, sizeof(*i)); -+ i->seq = btree_bset_first(b)->seq; -+ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); -+ -+ t = &b->set[b->nsets++]; -+ set_btree_bset(b, t, i); -+} -+ -+/* -+ * 
find _some_ key in the same bset as @k that precedes @k - not necessarily the -+ * immediate predecessor: -+ */ -+static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t, -+ struct bkey_packed *k) -+{ -+ struct bkey_packed *p; -+ unsigned offset; -+ int j; -+ -+ EBUG_ON(k < btree_bkey_first(b, t) || -+ k > btree_bkey_last(b, t)); -+ -+ if (k == btree_bkey_first(b, t)) -+ return NULL; -+ -+ switch (bset_aux_tree_type(t)) { -+ case BSET_NO_AUX_TREE: -+ p = btree_bkey_first(b, t); -+ break; -+ case BSET_RO_AUX_TREE: -+ j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k)); -+ -+ do { -+ p = j ? tree_to_bkey(b, t, -+ __inorder_to_eytzinger1(j--, -+ t->size, t->extra)) -+ : btree_bkey_first(b, t); -+ } while (p >= k); -+ break; -+ case BSET_RW_AUX_TREE: -+ offset = __btree_node_key_to_offset(b, k); -+ j = rw_aux_tree_bsearch(b, t, offset); -+ p = j ? rw_aux_to_bkey(b, t, j - 1) -+ : btree_bkey_first(b, t); -+ break; -+ } -+ -+ return p; -+} -+ -+struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, -+ struct bset_tree *t, -+ struct bkey_packed *k, -+ unsigned min_key_type) -+{ -+ struct bkey_packed *p, *i, *ret = NULL, *orig_k = k; -+ -+ while ((p = __bkey_prev(b, t, k)) && !ret) { -+ for (i = p; i != k; i = bkey_next_skip_noops(i, k)) -+ if (i->type >= min_key_type) -+ ret = i; -+ -+ k = p; -+ } -+ -+ if (btree_keys_expensive_checks(b)) { -+ BUG_ON(ret >= orig_k); -+ -+ for (i = ret -+ ? 
bkey_next_skip_noops(ret, orig_k) -+ : btree_bkey_first(b, t); -+ i != orig_k; -+ i = bkey_next_skip_noops(i, orig_k)) -+ BUG_ON(i->type >= min_key_type); -+ } -+ -+ return ret; -+} -+ -+/* Insert */ -+ -+static void rw_aux_tree_fix_invalidated_key(struct btree *b, -+ struct bset_tree *t, -+ struct bkey_packed *k) -+{ -+ unsigned offset = __btree_node_key_to_offset(b, k); -+ unsigned j = rw_aux_tree_bsearch(b, t, offset); -+ -+ if (j < t->size && -+ rw_aux_tree(b, t)[j].offset == offset) -+ rw_aux_tree_set(b, t, j, k); -+ -+ bch2_bset_verify_rw_aux_tree(b, t); -+} -+ -+static void ro_aux_tree_fix_invalidated_key(struct btree *b, -+ struct bset_tree *t, -+ struct bkey_packed *k) -+{ -+ struct bkey_packed min_key, max_key; -+ unsigned inorder, j; -+ -+ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); -+ -+ /* signal to make_bfloat() that they're uninitialized: */ -+ min_key.u64s = max_key.u64s = 0; -+ -+ if (bkey_next_skip_noops(k, btree_bkey_last(b, t)) == btree_bkey_last(b, t)) { -+ t->max_key = bkey_unpack_pos(b, k); -+ -+ for (j = 1; j < t->size; j = j * 2 + 1) -+ make_bfloat(b, t, j, &min_key, &max_key); -+ } -+ -+ inorder = bkey_to_cacheline(b, t, k); -+ -+ if (inorder && -+ inorder < t->size) { -+ j = __inorder_to_eytzinger1(inorder, t->size, t->extra); -+ -+ if (k == tree_to_bkey(b, t, j)) { -+ /* Fix the node this key corresponds to */ -+ make_bfloat(b, t, j, &min_key, &max_key); -+ -+ /* Children for which this key is the right boundary */ -+ for (j = eytzinger1_left_child(j); -+ j < t->size; -+ j = eytzinger1_right_child(j)) -+ make_bfloat(b, t, j, &min_key, &max_key); -+ } -+ } -+ -+ if (inorder + 1 < t->size) { -+ j = __inorder_to_eytzinger1(inorder + 1, t->size, t->extra); -+ -+ if (k == tree_to_prev_bkey(b, t, j)) { -+ make_bfloat(b, t, j, &min_key, &max_key); -+ -+ /* Children for which this key is the left boundary */ -+ for (j = eytzinger1_right_child(j); -+ j < t->size; -+ j = eytzinger1_left_child(j)) -+ make_bfloat(b, t, j, &min_key, 
&max_key); -+ } -+ } -+} -+ -+/** -+ * bch2_bset_fix_invalidated_key() - given an existing key @k that has been -+ * modified, fix any auxiliary search tree by remaking all the nodes in the -+ * auxiliary search tree that @k corresponds to -+ */ -+void bch2_bset_fix_invalidated_key(struct btree *b, struct bkey_packed *k) -+{ -+ struct bset_tree *t = bch2_bkey_to_bset(b, k); -+ -+ switch (bset_aux_tree_type(t)) { -+ case BSET_NO_AUX_TREE: -+ break; -+ case BSET_RO_AUX_TREE: -+ ro_aux_tree_fix_invalidated_key(b, t, k); -+ break; -+ case BSET_RW_AUX_TREE: -+ rw_aux_tree_fix_invalidated_key(b, t, k); -+ break; -+ } -+} -+ -+static void bch2_bset_fix_lookup_table(struct btree *b, -+ struct bset_tree *t, -+ struct bkey_packed *_where, -+ unsigned clobber_u64s, -+ unsigned new_u64s) -+{ -+ int shift = new_u64s - clobber_u64s; -+ unsigned l, j, where = __btree_node_key_to_offset(b, _where); -+ -+ EBUG_ON(bset_has_ro_aux_tree(t)); -+ -+ if (!bset_has_rw_aux_tree(t)) -+ return; -+ -+ /* returns first entry >= where */ -+ l = rw_aux_tree_bsearch(b, t, where); -+ -+ if (!l) /* never delete first entry */ -+ l++; -+ else if (l < t->size && -+ where < t->end_offset && -+ rw_aux_tree(b, t)[l].offset == where) -+ rw_aux_tree_set(b, t, l++, _where); -+ -+ /* l now > where */ -+ -+ for (j = l; -+ j < t->size && -+ rw_aux_tree(b, t)[j].offset < where + clobber_u64s; -+ j++) -+ ; -+ -+ if (j < t->size && -+ rw_aux_tree(b, t)[j].offset + shift == -+ rw_aux_tree(b, t)[l - 1].offset) -+ j++; -+ -+ memmove(&rw_aux_tree(b, t)[l], -+ &rw_aux_tree(b, t)[j], -+ (void *) &rw_aux_tree(b, t)[t->size] - -+ (void *) &rw_aux_tree(b, t)[j]); -+ t->size -= j - l; -+ -+ for (j = l; j < t->size; j++) -+ rw_aux_tree(b, t)[j].offset += shift; -+ -+ EBUG_ON(l < t->size && -+ rw_aux_tree(b, t)[l].offset == -+ rw_aux_tree(b, t)[l - 1].offset); -+ -+ if (t->size < bset_rw_tree_capacity(b, t) && -+ (l < t->size -+ ? 
rw_aux_tree(b, t)[l].offset -+ : t->end_offset) - -+ rw_aux_tree(b, t)[l - 1].offset > -+ L1_CACHE_BYTES / sizeof(u64)) { -+ struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1); -+ struct bkey_packed *end = l < t->size -+ ? rw_aux_to_bkey(b, t, l) -+ : btree_bkey_last(b, t); -+ struct bkey_packed *k = start; -+ -+ while (1) { -+ k = bkey_next_skip_noops(k, end); -+ if (k == end) -+ break; -+ -+ if ((void *) k - (void *) start >= L1_CACHE_BYTES) { -+ memmove(&rw_aux_tree(b, t)[l + 1], -+ &rw_aux_tree(b, t)[l], -+ (void *) &rw_aux_tree(b, t)[t->size] - -+ (void *) &rw_aux_tree(b, t)[l]); -+ t->size++; -+ rw_aux_tree_set(b, t, l, k); -+ break; -+ } -+ } -+ } -+ -+ bch2_bset_verify_rw_aux_tree(b, t); -+ bset_aux_tree_verify(b); -+} -+ -+void bch2_bset_insert(struct btree *b, -+ struct btree_node_iter *iter, -+ struct bkey_packed *where, -+ struct bkey_i *insert, -+ unsigned clobber_u64s) -+{ -+ struct bkey_format *f = &b->format; -+ struct bset_tree *t = bset_tree_last(b); -+ struct bkey_packed packed, *src = bkey_to_packed(insert); -+ -+ bch2_bset_verify_rw_aux_tree(b, t); -+ bch2_verify_insert_pos(b, where, bkey_to_packed(insert), clobber_u64s); -+ -+ if (bch2_bkey_pack_key(&packed, &insert->k, f)) -+ src = &packed; -+ -+ if (!bkey_whiteout(&insert->k)) -+ btree_keys_account_key_add(&b->nr, t - b->set, src); -+ -+ if (src->u64s != clobber_u64s) { -+ u64 *src_p = where->_data + clobber_u64s; -+ u64 *dst_p = where->_data + src->u64s; -+ -+ EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) < -+ (int) clobber_u64s - src->u64s); -+ -+ memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); -+ le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s); -+ set_btree_bset_end(b, t); -+ } -+ -+ memcpy_u64s(where, src, -+ bkeyp_key_u64s(f, src)); -+ memcpy_u64s(bkeyp_val(f, where), &insert->v, -+ bkeyp_val_u64s(f, src)); -+ -+ if (src->u64s != clobber_u64s) -+ bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s); -+ -+ bch2_verify_btree_nr_keys(b); -+} -+ 
-+void bch2_bset_delete(struct btree *b, -+ struct bkey_packed *where, -+ unsigned clobber_u64s) -+{ -+ struct bset_tree *t = bset_tree_last(b); -+ u64 *src_p = where->_data + clobber_u64s; -+ u64 *dst_p = where->_data; -+ -+ bch2_bset_verify_rw_aux_tree(b, t); -+ -+ EBUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s); -+ -+ memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); -+ le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s); -+ set_btree_bset_end(b, t); -+ -+ bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, 0); -+} -+ -+/* Lookup */ -+ -+__flatten -+static struct bkey_packed *bset_search_write_set(const struct btree *b, -+ struct bset_tree *t, -+ struct bpos *search, -+ const struct bkey_packed *packed_search) -+{ -+ unsigned l = 0, r = t->size; -+ -+ while (l + 1 != r) { -+ unsigned m = (l + r) >> 1; -+ -+ if (bkey_cmp(rw_aux_tree(b, t)[m].k, *search) < 0) -+ l = m; -+ else -+ r = m; -+ } -+ -+ return rw_aux_to_bkey(b, t, l); -+} -+ -+static inline void prefetch_four_cachelines(void *p) -+{ -+#ifdef CONFIG_X86_64 -+ asm(".intel_syntax noprefix;" -+ "prefetcht0 [%0 - 127 + 64 * 0];" -+ "prefetcht0 [%0 - 127 + 64 * 1];" -+ "prefetcht0 [%0 - 127 + 64 * 2];" -+ "prefetcht0 [%0 - 127 + 64 * 3];" -+ ".att_syntax prefix;" -+ : -+ : "r" (p + 127)); -+#else -+ prefetch(p + L1_CACHE_BYTES * 0); -+ prefetch(p + L1_CACHE_BYTES * 1); -+ prefetch(p + L1_CACHE_BYTES * 2); -+ prefetch(p + L1_CACHE_BYTES * 3); -+#endif -+} -+ -+static inline bool bkey_mantissa_bits_dropped(const struct btree *b, -+ const struct bkey_float *f, -+ unsigned idx) -+{ -+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -+ unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits; -+ -+ return f->exponent > key_bits_start; -+#else -+ unsigned key_bits_end = high_bit_offset + b->nr_key_bits; -+ -+ return f->exponent + BKEY_MANTISSA_BITS < key_bits_end; -+#endif -+} -+ -+__flatten -+static struct bkey_packed *bset_search_tree(const struct btree *b, -+ struct bset_tree *t, 
-+ struct bpos *search, -+ const struct bkey_packed *packed_search) -+{ -+ struct ro_aux_tree *base = ro_aux_tree_base(b, t); -+ struct bkey_float *f; -+ struct bkey_packed *k; -+ unsigned inorder, n = 1, l, r; -+ int cmp; -+ -+ do { -+ if (likely(n << 4 < t->size)) -+ prefetch(&base->f[n << 4]); -+ -+ f = &base->f[n]; -+ -+ if (!unlikely(packed_search)) -+ goto slowpath; -+ if (unlikely(f->exponent >= BFLOAT_FAILED)) -+ goto slowpath; -+ -+ l = f->mantissa; -+ r = bkey_mantissa(packed_search, f, n); -+ -+ if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f, n)) -+ goto slowpath; -+ -+ n = n * 2 + (l < r); -+ continue; -+slowpath: -+ k = tree_to_bkey(b, t, n); -+ cmp = bkey_cmp_p_or_unp(b, k, packed_search, search); -+ if (!cmp) -+ return k; -+ -+ n = n * 2 + (cmp < 0); -+ } while (n < t->size); -+ -+ inorder = __eytzinger1_to_inorder(n >> 1, t->size, t->extra); -+ -+ /* -+ * n would have been the node we recursed to - the low bit tells us if -+ * we recursed left or recursed right. -+ */ -+ if (likely(!(n & 1))) { -+ --inorder; -+ if (unlikely(!inorder)) -+ return btree_bkey_first(b, t); -+ -+ f = &base->f[eytzinger1_prev(n >> 1, t->size)]; -+ } -+ -+ return cacheline_to_bkey(b, t, inorder, f->key_offset); -+} -+ -+static __always_inline __flatten -+struct bkey_packed *__bch2_bset_search(struct btree *b, -+ struct bset_tree *t, -+ struct bpos *search, -+ const struct bkey_packed *lossy_packed_search) -+{ -+ -+ /* -+ * First, we search for a cacheline, then lastly we do a linear search -+ * within that cacheline. -+ * -+ * To search for the cacheline, there's three different possibilities: -+ * * The set is too small to have a search tree, so we just do a linear -+ * search over the whole set. -+ * * The set is the one we're currently inserting into; keeping a full -+ * auxiliary search tree up to date would be too expensive, so we -+ * use a much simpler lookup table to do a binary search - -+ * bset_search_write_set(). 
-+ * * Or we use the auxiliary search tree we constructed earlier - -+ * bset_search_tree() -+ */ -+ -+ switch (bset_aux_tree_type(t)) { -+ case BSET_NO_AUX_TREE: -+ return btree_bkey_first(b, t); -+ case BSET_RW_AUX_TREE: -+ return bset_search_write_set(b, t, search, lossy_packed_search); -+ case BSET_RO_AUX_TREE: -+ /* -+ * Each node in the auxiliary search tree covers a certain range -+ * of bits, and keys above and below the set it covers might -+ * differ outside those bits - so we have to special case the -+ * start and end - handle that here: -+ */ -+ -+ if (bkey_cmp(*search, t->max_key) > 0) -+ return btree_bkey_last(b, t); -+ -+ return bset_search_tree(b, t, search, lossy_packed_search); -+ default: -+ unreachable(); -+ } -+} -+ -+static __always_inline __flatten -+struct bkey_packed *bch2_bset_search_linear(struct btree *b, -+ struct bset_tree *t, -+ struct bpos *search, -+ struct bkey_packed *packed_search, -+ const struct bkey_packed *lossy_packed_search, -+ struct bkey_packed *m) -+{ -+ if (lossy_packed_search) -+ while (m != btree_bkey_last(b, t) && -+ bkey_iter_cmp_p_or_unp(b, m, -+ lossy_packed_search, search) < 0) -+ m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); -+ -+ if (!packed_search) -+ while (m != btree_bkey_last(b, t) && -+ bkey_iter_pos_cmp(b, m, search) < 0) -+ m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); -+ -+ if (btree_keys_expensive_checks(b)) { -+ struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); -+ -+ BUG_ON(prev && -+ bkey_iter_cmp_p_or_unp(b, prev, -+ packed_search, search) >= 0); -+ } -+ -+ return m; -+} -+ -+/* -+ * Returns the first key greater than or equal to @search -+ */ -+static __always_inline __flatten -+struct bkey_packed *bch2_bset_search(struct btree *b, -+ struct bset_tree *t, -+ struct bpos *search, -+ struct bkey_packed *packed_search, -+ const struct bkey_packed *lossy_packed_search) -+{ -+ struct bkey_packed *m = __bch2_bset_search(b, t, search, -+ lossy_packed_search); -+ -+ return 
bch2_bset_search_linear(b, t, search, -+ packed_search, lossy_packed_search, m); -+} -+ -+/* Btree node iterator */ -+ -+static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter, -+ struct btree *b, -+ const struct bkey_packed *k, -+ const struct bkey_packed *end) -+{ -+ if (k != end) { -+ struct btree_node_iter_set *pos; -+ -+ btree_node_iter_for_each(iter, pos) -+ ; -+ -+ BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data)); -+ *pos = (struct btree_node_iter_set) { -+ __btree_node_key_to_offset(b, k), -+ __btree_node_key_to_offset(b, end) -+ }; -+ } -+} -+ -+void bch2_btree_node_iter_push(struct btree_node_iter *iter, -+ struct btree *b, -+ const struct bkey_packed *k, -+ const struct bkey_packed *end) -+{ -+ __bch2_btree_node_iter_push(iter, b, k, end); -+ bch2_btree_node_iter_sort(iter, b); -+} -+ -+noinline __flatten __attribute__((cold)) -+static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, -+ struct btree *b, struct bpos *search) -+{ -+ struct bset_tree *t; -+ -+ trace_bkey_pack_pos_fail(search); -+ -+ for_each_bset(b, t) -+ __bch2_btree_node_iter_push(iter, b, -+ bch2_bset_search(b, t, search, NULL, NULL), -+ btree_bkey_last(b, t)); -+ -+ bch2_btree_node_iter_sort(iter, b); -+} -+ -+/** -+ * bch_btree_node_iter_init - initialize a btree node iterator, starting from a -+ * given position -+ * -+ * Main entry point to the lookup code for individual btree nodes: -+ * -+ * NOTE: -+ * -+ * When you don't filter out deleted keys, btree nodes _do_ contain duplicate -+ * keys. This doesn't matter for most code, but it does matter for lookups. -+ * -+ * Some adjacent keys with a string of equal keys: -+ * i j k k k k l m -+ * -+ * If you search for k, the lookup code isn't guaranteed to return you any -+ * specific k. The lookup code is conceptually doing a binary search and -+ * iterating backwards is very expensive so if the pivot happens to land at the -+ * last k that's what you'll get. 
-+ * -+ * This works out ok, but it's something to be aware of: -+ * -+ * - For non extents, we guarantee that the live key comes last - see -+ * btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't -+ * see will only be deleted keys you don't care about. -+ * -+ * - For extents, deleted keys sort last (see the comment at the top of this -+ * file). But when you're searching for extents, you actually want the first -+ * key strictly greater than your search key - an extent that compares equal -+ * to the search key is going to have 0 sectors after the search key. -+ * -+ * But this does mean that we can't just search for -+ * bkey_successor(start_of_range) to get the first extent that overlaps with -+ * the range we want - if we're unlucky and there's an extent that ends -+ * exactly where we searched, then there could be a deleted key at the same -+ * position and we'd get that when we search instead of the preceding extent -+ * we needed. -+ * -+ * So we've got to search for start_of_range, then after the lookup iterate -+ * past any extents that compare equal to the position we searched for. 
-+ */ -+__flatten -+void bch2_btree_node_iter_init(struct btree_node_iter *iter, -+ struct btree *b, struct bpos *search) -+{ -+ struct bkey_packed p, *packed_search = NULL; -+ struct btree_node_iter_set *pos = iter->data; -+ struct bkey_packed *k[MAX_BSETS]; -+ unsigned i; -+ -+ EBUG_ON(bkey_cmp(*search, b->data->min_key) < 0); -+ bset_aux_tree_verify(b); -+ -+ memset(iter, 0, sizeof(*iter)); -+ -+ switch (bch2_bkey_pack_pos_lossy(&p, *search, b)) { -+ case BKEY_PACK_POS_EXACT: -+ packed_search = &p; -+ break; -+ case BKEY_PACK_POS_SMALLER: -+ packed_search = NULL; -+ break; -+ case BKEY_PACK_POS_FAIL: -+ btree_node_iter_init_pack_failed(iter, b, search); -+ return; -+ } -+ -+ for (i = 0; i < b->nsets; i++) { -+ k[i] = __bch2_bset_search(b, b->set + i, search, &p); -+ prefetch_four_cachelines(k[i]); -+ } -+ -+ for (i = 0; i < b->nsets; i++) { -+ struct bset_tree *t = b->set + i; -+ struct bkey_packed *end = btree_bkey_last(b, t); -+ -+ k[i] = bch2_bset_search_linear(b, t, search, -+ packed_search, &p, k[i]); -+ if (k[i] != end) -+ *pos++ = (struct btree_node_iter_set) { -+ __btree_node_key_to_offset(b, k[i]), -+ __btree_node_key_to_offset(b, end) -+ }; -+ } -+ -+ bch2_btree_node_iter_sort(iter, b); -+} -+ -+void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter, -+ struct btree *b) -+{ -+ struct bset_tree *t; -+ -+ memset(iter, 0, sizeof(*iter)); -+ -+ for_each_bset(b, t) -+ __bch2_btree_node_iter_push(iter, b, -+ btree_bkey_first(b, t), -+ btree_bkey_last(b, t)); -+ bch2_btree_node_iter_sort(iter, b); -+} -+ -+struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter, -+ struct btree *b, -+ struct bset_tree *t) -+{ -+ struct btree_node_iter_set *set; -+ -+ btree_node_iter_for_each(iter, set) -+ if (set->end == t->end_offset) -+ return __btree_node_offset_to_key(b, set->k); -+ -+ return btree_bkey_last(b, t); -+} -+ -+static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter, -+ struct btree *b, -+ unsigned 
first) -+{ -+ bool ret; -+ -+ if ((ret = (btree_node_iter_cmp(b, -+ iter->data[first], -+ iter->data[first + 1]) > 0))) -+ swap(iter->data[first], iter->data[first + 1]); -+ return ret; -+} -+ -+void bch2_btree_node_iter_sort(struct btree_node_iter *iter, -+ struct btree *b) -+{ -+ /* unrolled bubble sort: */ -+ -+ if (!__btree_node_iter_set_end(iter, 2)) { -+ btree_node_iter_sort_two(iter, b, 0); -+ btree_node_iter_sort_two(iter, b, 1); -+ } -+ -+ if (!__btree_node_iter_set_end(iter, 1)) -+ btree_node_iter_sort_two(iter, b, 0); -+} -+ -+void bch2_btree_node_iter_set_drop(struct btree_node_iter *iter, -+ struct btree_node_iter_set *set) -+{ -+ struct btree_node_iter_set *last = -+ iter->data + ARRAY_SIZE(iter->data) - 1; -+ -+ memmove(&set[0], &set[1], (void *) last - (void *) set); -+ *last = (struct btree_node_iter_set) { 0, 0 }; -+} -+ -+static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, -+ struct btree *b) -+{ -+ iter->data->k += __bch2_btree_node_iter_peek_all(iter, b)->u64s; -+ -+ EBUG_ON(iter->data->k > iter->data->end); -+ -+ while (!__btree_node_iter_set_end(iter, 0) && -+ !__bch2_btree_node_iter_peek_all(iter, b)->u64s) -+ iter->data->k++; -+ -+ if (unlikely(__btree_node_iter_set_end(iter, 0))) { -+ bch2_btree_node_iter_set_drop(iter, iter->data); -+ return; -+ } -+ -+ if (__btree_node_iter_set_end(iter, 1)) -+ return; -+ -+ if (!btree_node_iter_sort_two(iter, b, 0)) -+ return; -+ -+ if (__btree_node_iter_set_end(iter, 2)) -+ return; -+ -+ btree_node_iter_sort_two(iter, b, 1); -+} -+ -+void bch2_btree_node_iter_advance(struct btree_node_iter *iter, -+ struct btree *b) -+{ -+ if (btree_keys_expensive_checks(b)) { -+ bch2_btree_node_iter_verify(iter, b); -+ bch2_btree_node_iter_next_check(iter, b); -+ } -+ -+ __bch2_btree_node_iter_advance(iter, b); -+} -+ -+/* -+ * Expensive: -+ */ -+struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, -+ struct btree *b) -+{ -+ struct bkey_packed *k, *prev = NULL; -+ 
struct btree_node_iter_set *set; -+ struct bset_tree *t; -+ unsigned end = 0; -+ -+ if (btree_keys_expensive_checks(b)) -+ bch2_btree_node_iter_verify(iter, b); -+ -+ for_each_bset(b, t) { -+ k = bch2_bkey_prev_all(b, t, -+ bch2_btree_node_iter_bset_pos(iter, b, t)); -+ if (k && -+ (!prev || bkey_iter_cmp(b, k, prev) > 0)) { -+ prev = k; -+ end = t->end_offset; -+ } -+ } -+ -+ if (!prev) -+ return NULL; -+ -+ /* -+ * We're manually memmoving instead of just calling sort() to ensure the -+ * prev we picked ends up in slot 0 - sort won't necessarily put it -+ * there because of duplicate deleted keys: -+ */ -+ btree_node_iter_for_each(iter, set) -+ if (set->end == end) -+ goto found; -+ -+ BUG_ON(set != &iter->data[__btree_node_iter_used(iter)]); -+found: -+ BUG_ON(set >= iter->data + ARRAY_SIZE(iter->data)); -+ -+ memmove(&iter->data[1], -+ &iter->data[0], -+ (void *) set - (void *) &iter->data[0]); -+ -+ iter->data[0].k = __btree_node_key_to_offset(b, prev); -+ iter->data[0].end = end; -+ -+ if (btree_keys_expensive_checks(b)) -+ bch2_btree_node_iter_verify(iter, b); -+ return prev; -+} -+ -+struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *iter, -+ struct btree *b, -+ unsigned min_key_type) -+{ -+ struct bkey_packed *prev; -+ -+ do { -+ prev = bch2_btree_node_iter_prev_all(iter, b); -+ } while (prev && prev->type < min_key_type); -+ -+ return prev; -+} -+ -+struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter, -+ struct btree *b, -+ struct bkey *u) -+{ -+ struct bkey_packed *k = bch2_btree_node_iter_peek(iter, b); -+ -+ return k ? 
bkey_disassemble(b, k, u) : bkey_s_c_null; -+} -+ -+/* Mergesort */ -+ -+void bch2_btree_keys_stats(struct btree *b, struct bset_stats *stats) -+{ -+ struct bset_tree *t; -+ -+ for_each_bset(b, t) { -+ enum bset_aux_tree_type type = bset_aux_tree_type(t); -+ size_t j; -+ -+ stats->sets[type].nr++; -+ stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) * -+ sizeof(u64); -+ -+ if (bset_has_ro_aux_tree(t)) { -+ stats->floats += t->size - 1; -+ -+ for (j = 1; j < t->size; j++) -+ stats->failed += -+ bkey_float(b, t, j)->exponent == -+ BFLOAT_FAILED; -+ } -+ } -+} -+ -+void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, -+ struct bkey_packed *k) -+{ -+ struct bset_tree *t = bch2_bkey_to_bset(b, k); -+ struct bkey uk; -+ unsigned j, inorder; -+ -+ if (out->pos != out->end) -+ *out->pos = '\0'; -+ -+ if (!bset_has_ro_aux_tree(t)) -+ return; -+ -+ inorder = bkey_to_cacheline(b, t, k); -+ if (!inorder || inorder >= t->size) -+ return; -+ -+ j = __inorder_to_eytzinger1(inorder, t->size, t->extra); -+ if (k != tree_to_bkey(b, t, j)) -+ return; -+ -+ switch (bkey_float(b, t, j)->exponent) { -+ case BFLOAT_FAILED: -+ uk = bkey_unpack_key(b, k); -+ pr_buf(out, -+ " failed unpacked at depth %u\n" -+ "\t%llu:%llu\n", -+ ilog2(j), -+ uk.p.inode, uk.p.offset); -+ break; -+ } -+} -diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h -new file mode 100644 -index 000000000000..652ffed4adfb ---- /dev/null -+++ b/fs/bcachefs/bset.h -@@ -0,0 +1,631 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BSET_H -+#define _BCACHEFS_BSET_H -+ -+#include -+#include -+ -+#include "bcachefs_format.h" -+#include "bkey.h" -+#include "bkey_methods.h" -+#include "btree_types.h" -+#include "util.h" /* for time_stats */ -+#include "vstructs.h" -+ -+/* -+ * BKEYS: -+ * -+ * A bkey contains a key, a size field, a variable number of pointers, and some -+ * ancillary flag bits. -+ * -+ * We use two different functions for validating bkeys, bkey_invalid and -+ * bkey_deleted(). 
-+ * -+ * The one exception to the rule that ptr_invalid() filters out invalid keys is -+ * that it also filters out keys of size 0 - these are keys that have been -+ * completely overwritten. It'd be safe to delete these in memory while leaving -+ * them on disk, just unnecessary work - so we filter them out when resorting -+ * instead. -+ * -+ * We can't filter out stale keys when we're resorting, because garbage -+ * collection needs to find them to ensure bucket gens don't wrap around - -+ * unless we're rewriting the btree node those stale keys still exist on disk. -+ * -+ * We also implement functions here for removing some number of sectors from the -+ * front or the back of a bkey - this is mainly used for fixing overlapping -+ * extents, by removing the overlapping sectors from the older key. -+ * -+ * BSETS: -+ * -+ * A bset is an array of bkeys laid out contiguously in memory in sorted order, -+ * along with a header. A btree node is made up of a number of these, written at -+ * different times. -+ * -+ * There could be many of them on disk, but we never allow there to be more than -+ * 4 in memory - we lazily resort as needed. -+ * -+ * We implement code here for creating and maintaining auxiliary search trees -+ * (described below) for searching an individial bset, and on top of that we -+ * implement a btree iterator. -+ * -+ * BTREE ITERATOR: -+ * -+ * Most of the code in bcache doesn't care about an individual bset - it needs -+ * to search entire btree nodes and iterate over them in sorted order. -+ * -+ * The btree iterator code serves both functions; it iterates through the keys -+ * in a btree node in sorted order, starting from either keys after a specific -+ * point (if you pass it a search key) or the start of the btree node. -+ * -+ * AUXILIARY SEARCH TREES: -+ * -+ * Since keys are variable length, we can't use a binary search on a bset - we -+ * wouldn't be able to find the start of the next key. 
But binary searches are -+ * slow anyways, due to terrible cache behaviour; bcache originally used binary -+ * searches and that code topped out at under 50k lookups/second. -+ * -+ * So we need to construct some sort of lookup table. Since we only insert keys -+ * into the last (unwritten) set, most of the keys within a given btree node are -+ * usually in sets that are mostly constant. We use two different types of -+ * lookup tables to take advantage of this. -+ * -+ * Both lookup tables share in common that they don't index every key in the -+ * set; they index one key every BSET_CACHELINE bytes, and then a linear search -+ * is used for the rest. -+ * -+ * For sets that have been written to disk and are no longer being inserted -+ * into, we construct a binary search tree in an array - traversing a binary -+ * search tree in an array gives excellent locality of reference and is very -+ * fast, since both children of any node are adjacent to each other in memory -+ * (and their grandchildren, and great grandchildren...) - this means -+ * prefetching can be used to great effect. -+ * -+ * It's quite useful performance wise to keep these nodes small - not just -+ * because they're more likely to be in L2, but also because we can prefetch -+ * more nodes on a single cacheline and thus prefetch more iterations in advance -+ * when traversing this tree. -+ * -+ * Nodes in the auxiliary search tree must contain both a key to compare against -+ * (we don't want to fetch the key from the set, that would defeat the purpose), -+ * and a pointer to the key. We use a few tricks to compress both of these. -+ * -+ * To compress the pointer, we take advantage of the fact that one node in the -+ * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have -+ * a function (to_inorder()) that takes the index of a node in a binary tree and -+ * returns what its index would be in an inorder traversal, so we only have to -+ * store the low bits of the offset. 
-+ * -+ * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To -+ * compress that, we take advantage of the fact that when we're traversing the -+ * search tree at every iteration we know that both our search key and the key -+ * we're looking for lie within some range - bounded by our previous -+ * comparisons. (We special case the start of a search so that this is true even -+ * at the root of the tree). -+ * -+ * So we know the key we're looking for is between a and b, and a and b don't -+ * differ higher than bit 50, we don't need to check anything higher than bit -+ * 50. -+ * -+ * We don't usually need the rest of the bits, either; we only need enough bits -+ * to partition the key range we're currently checking. Consider key n - the -+ * key our auxiliary search tree node corresponds to, and key p, the key -+ * immediately preceding n. The lowest bit we need to store in the auxiliary -+ * search tree is the highest bit that differs between n and p. -+ * -+ * Note that this could be bit 0 - we might sometimes need all 80 bits to do the -+ * comparison. But we'd really like our nodes in the auxiliary search tree to be -+ * of fixed size. -+ * -+ * The solution is to make them fixed size, and when we're constructing a node -+ * check if p and n differed in the bits we needed them to. If they don't we -+ * flag that node, and when doing lookups we fallback to comparing against the -+ * real key. As long as this doesn't happen to often (and it seems to reliably -+ * happen a bit less than 1% of the time), we win - even on failures, that key -+ * is then more likely to be in cache than if we were doing binary searches all -+ * the way, since we're touching so much less memory. -+ * -+ * The keys in the auxiliary search tree are stored in (software) floating -+ * point, with an exponent and a mantissa. 
The exponent needs to be big enough -+ * to address all the bits in the original key, but the number of bits in the -+ * mantissa is somewhat arbitrary; more bits just gets us fewer failures. -+ * -+ * We need 7 bits for the exponent and 3 bits for the key's offset (since keys -+ * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes. -+ * We need one node per 128 bytes in the btree node, which means the auxiliary -+ * search trees take up 3% as much memory as the btree itself. -+ * -+ * Constructing these auxiliary search trees is moderately expensive, and we -+ * don't want to be constantly rebuilding the search tree for the last set -+ * whenever we insert another key into it. For the unwritten set, we use a much -+ * simpler lookup table - it's just a flat array, so index i in the lookup table -+ * corresponds to the i range of BSET_CACHELINE bytes in the set. Indexing -+ * within each byte range works the same as with the auxiliary search trees. -+ * -+ * These are much easier to keep up to date when we insert a key - we do it -+ * somewhat lazily; when we shift a key up we usually just increment the pointer -+ * to it, only when it would overflow do we go to the trouble of finding the -+ * first key in that range of bytes again. 
-+ */ -+ -+extern bool bch2_expensive_debug_checks; -+ -+static inline bool btree_keys_expensive_checks(const struct btree *b) -+{ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ return bch2_expensive_debug_checks || *b->expensive_debug_checks; -+#else -+ return false; -+#endif -+} -+ -+enum bset_aux_tree_type { -+ BSET_NO_AUX_TREE, -+ BSET_RO_AUX_TREE, -+ BSET_RW_AUX_TREE, -+}; -+ -+#define BSET_TREE_NR_TYPES 3 -+ -+#define BSET_NO_AUX_TREE_VAL (U16_MAX) -+#define BSET_RW_AUX_TREE_VAL (U16_MAX - 1) -+ -+static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree *t) -+{ -+ switch (t->extra) { -+ case BSET_NO_AUX_TREE_VAL: -+ EBUG_ON(t->size); -+ return BSET_NO_AUX_TREE; -+ case BSET_RW_AUX_TREE_VAL: -+ EBUG_ON(!t->size); -+ return BSET_RW_AUX_TREE; -+ default: -+ EBUG_ON(!t->size); -+ return BSET_RO_AUX_TREE; -+ } -+} -+ -+typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *); -+ -+static inline void -+__bkey_unpack_key_format_checked(const struct btree *b, -+ struct bkey *dst, -+ const struct bkey_packed *src) -+{ -+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK -+ { -+ compiled_unpack_fn unpack_fn = b->aux_data; -+ unpack_fn(dst, src); -+ -+ if (btree_keys_expensive_checks(b)) { -+ struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); -+ -+ BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); -+ } -+ } -+#else -+ *dst = __bch2_bkey_unpack_key(&b->format, src); -+#endif -+} -+ -+static inline struct bkey -+bkey_unpack_key_format_checked(const struct btree *b, -+ const struct bkey_packed *src) -+{ -+ struct bkey dst; -+ -+ __bkey_unpack_key_format_checked(b, &dst, src); -+ return dst; -+} -+ -+static inline void __bkey_unpack_key(const struct btree *b, -+ struct bkey *dst, -+ const struct bkey_packed *src) -+{ -+ if (likely(bkey_packed(src))) -+ __bkey_unpack_key_format_checked(b, dst, src); -+ else -+ *dst = *packed_to_bkey_c(src); -+} -+ -+/** -+ * bkey_unpack_key -- unpack just the key, not the value -+ */ -+static inline struct bkey 
bkey_unpack_key(const struct btree *b, -+ const struct bkey_packed *src) -+{ -+ return likely(bkey_packed(src)) -+ ? bkey_unpack_key_format_checked(b, src) -+ : *packed_to_bkey_c(src); -+} -+ -+static inline struct bpos -+bkey_unpack_pos_format_checked(const struct btree *b, -+ const struct bkey_packed *src) -+{ -+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK -+ return bkey_unpack_key_format_checked(b, src).p; -+#else -+ return __bkey_unpack_pos(&b->format, src); -+#endif -+} -+ -+static inline struct bpos bkey_unpack_pos(const struct btree *b, -+ const struct bkey_packed *src) -+{ -+ return likely(bkey_packed(src)) -+ ? bkey_unpack_pos_format_checked(b, src) -+ : packed_to_bkey_c(src)->p; -+} -+ -+/* Disassembled bkeys */ -+ -+static inline struct bkey_s_c bkey_disassemble(struct btree *b, -+ const struct bkey_packed *k, -+ struct bkey *u) -+{ -+ __bkey_unpack_key(b, u, k); -+ -+ return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), }; -+} -+ -+/* non const version: */ -+static inline struct bkey_s __bkey_disassemble(struct btree *b, -+ struct bkey_packed *k, -+ struct bkey *u) -+{ -+ __bkey_unpack_key(b, u, k); -+ -+ return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), }; -+} -+ -+#define for_each_bset(_b, _t) \ -+ for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) -+ -+#define bset_tree_for_each_key(_b, _t, _k) \ -+ for (_k = btree_bkey_first(_b, _t); \ -+ _k != btree_bkey_last(_b, _t); \ -+ _k = bkey_next_skip_noops(_k, btree_bkey_last(_b, _t))) -+ -+static inline bool bset_has_ro_aux_tree(struct bset_tree *t) -+{ -+ return bset_aux_tree_type(t) == BSET_RO_AUX_TREE; -+} -+ -+static inline bool bset_has_rw_aux_tree(struct bset_tree *t) -+{ -+ return bset_aux_tree_type(t) == BSET_RW_AUX_TREE; -+} -+ -+static inline void bch2_bset_set_no_aux_tree(struct btree *b, -+ struct bset_tree *t) -+{ -+ BUG_ON(t < b->set); -+ -+ for (; t < b->set + ARRAY_SIZE(b->set); t++) { -+ t->size = 0; -+ t->extra = BSET_NO_AUX_TREE_VAL; -+ t->aux_data_offset = U16_MAX; -+ } 
-+} -+ -+static inline void btree_node_set_format(struct btree *b, -+ struct bkey_format f) -+{ -+ int len; -+ -+ b->format = f; -+ b->nr_key_bits = bkey_format_key_bits(&f); -+ -+ len = bch2_compile_bkey_format(&b->format, b->aux_data); -+ BUG_ON(len < 0 || len > U8_MAX); -+ -+ b->unpack_fn_len = len; -+ -+ bch2_bset_set_no_aux_tree(b, b->set); -+} -+ -+static inline struct bset *bset_next_set(struct btree *b, -+ unsigned block_bytes) -+{ -+ struct bset *i = btree_bset_last(b); -+ -+ EBUG_ON(!is_power_of_2(block_bytes)); -+ -+ return ((void *) i) + round_up(vstruct_bytes(i), block_bytes); -+} -+ -+void bch2_btree_keys_free(struct btree *); -+int bch2_btree_keys_alloc(struct btree *, unsigned, gfp_t); -+void bch2_btree_keys_init(struct btree *, bool *); -+ -+void bch2_bset_init_first(struct btree *, struct bset *); -+void bch2_bset_init_next(struct bch_fs *, struct btree *, -+ struct btree_node_entry *); -+void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool); -+void bch2_bset_fix_invalidated_key(struct btree *, struct bkey_packed *); -+ -+void bch2_bset_insert(struct btree *, struct btree_node_iter *, -+ struct bkey_packed *, struct bkey_i *, unsigned); -+void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned); -+ -+/* Bkey utility code */ -+ -+/* packed or unpacked */ -+static inline int bkey_cmp_p_or_unp(const struct btree *b, -+ const struct bkey_packed *l, -+ const struct bkey_packed *r_packed, -+ const struct bpos *r) -+{ -+ EBUG_ON(r_packed && !bkey_packed(r_packed)); -+ -+ if (unlikely(!bkey_packed(l))) -+ return bkey_cmp(packed_to_bkey_c(l)->p, *r); -+ -+ if (likely(r_packed)) -+ return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b); -+ -+ return __bch2_bkey_cmp_left_packed_format_checked(b, l, r); -+} -+ -+struct bset_tree *bch2_bkey_to_bset(struct btree *, struct bkey_packed *); -+ -+struct bkey_packed *bch2_bkey_prev_filter(struct btree *, struct bset_tree *, -+ struct bkey_packed *, unsigned); -+ -+static inline 
struct bkey_packed * -+bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, struct bkey_packed *k) -+{ -+ return bch2_bkey_prev_filter(b, t, k, 0); -+} -+ -+static inline struct bkey_packed * -+bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k) -+{ -+ return bch2_bkey_prev_filter(b, t, k, KEY_TYPE_discard + 1); -+} -+ -+enum bch_extent_overlap { -+ BCH_EXTENT_OVERLAP_ALL = 0, -+ BCH_EXTENT_OVERLAP_BACK = 1, -+ BCH_EXTENT_OVERLAP_FRONT = 2, -+ BCH_EXTENT_OVERLAP_MIDDLE = 3, -+}; -+ -+/* Returns how k overlaps with m */ -+static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k, -+ const struct bkey *m) -+{ -+ int cmp1 = bkey_cmp(k->p, m->p) < 0; -+ int cmp2 = bkey_cmp(bkey_start_pos(k), -+ bkey_start_pos(m)) > 0; -+ -+ return (cmp1 << 1) + cmp2; -+} -+ -+/* Btree key iteration */ -+ -+void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *, -+ const struct bkey_packed *, -+ const struct bkey_packed *); -+void bch2_btree_node_iter_init(struct btree_node_iter *, struct btree *, -+ struct bpos *); -+void bch2_btree_node_iter_init_from_start(struct btree_node_iter *, -+ struct btree *); -+struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *, -+ struct btree *, -+ struct bset_tree *); -+ -+void bch2_btree_node_iter_sort(struct btree_node_iter *, struct btree *); -+void bch2_btree_node_iter_set_drop(struct btree_node_iter *, -+ struct btree_node_iter_set *); -+void bch2_btree_node_iter_advance(struct btree_node_iter *, struct btree *); -+ -+#define btree_node_iter_for_each(_iter, _set) \ -+ for (_set = (_iter)->data; \ -+ _set < (_iter)->data + ARRAY_SIZE((_iter)->data) && \ -+ (_set)->k != (_set)->end; \ -+ _set++) -+ -+static inline bool __btree_node_iter_set_end(struct btree_node_iter *iter, -+ unsigned i) -+{ -+ return iter->data[i].k == iter->data[i].end; -+} -+ -+static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter) -+{ -+ return 
__btree_node_iter_set_end(iter, 0); -+} -+ -+/* -+ * When keys compare equal, deleted keys compare first: -+ * -+ * XXX: only need to compare pointers for keys that are both within a -+ * btree_node_iterator - we need to break ties for prev() to work correctly -+ */ -+static inline int bkey_iter_cmp(const struct btree *b, -+ const struct bkey_packed *l, -+ const struct bkey_packed *r) -+{ -+ return bkey_cmp_packed(b, l, r) -+ ?: (int) bkey_deleted(r) - (int) bkey_deleted(l) -+ ?: cmp_int(l, r); -+} -+ -+static inline int btree_node_iter_cmp(const struct btree *b, -+ struct btree_node_iter_set l, -+ struct btree_node_iter_set r) -+{ -+ return bkey_iter_cmp(b, -+ __btree_node_offset_to_key(b, l.k), -+ __btree_node_offset_to_key(b, r.k)); -+} -+ -+/* These assume r (the search key) is not a deleted key: */ -+static inline int bkey_iter_pos_cmp(const struct btree *b, -+ const struct bkey_packed *l, -+ const struct bpos *r) -+{ -+ return bkey_cmp_left_packed(b, l, r) -+ ?: -((int) bkey_deleted(l)); -+} -+ -+static inline int bkey_iter_cmp_p_or_unp(const struct btree *b, -+ const struct bkey_packed *l, -+ const struct bkey_packed *r_packed, -+ const struct bpos *r) -+{ -+ return bkey_cmp_p_or_unp(b, l, r_packed, r) -+ ?: -((int) bkey_deleted(l)); -+} -+ -+static inline struct bkey_packed * -+__bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, -+ struct btree *b) -+{ -+ return __btree_node_offset_to_key(b, iter->data->k); -+} -+ -+static inline struct bkey_packed * -+bch2_btree_node_iter_peek_filter(struct btree_node_iter *iter, -+ struct btree *b, -+ unsigned min_key_type) -+{ -+ while (!bch2_btree_node_iter_end(iter)) { -+ struct bkey_packed *k = __bch2_btree_node_iter_peek_all(iter, b); -+ -+ if (k->type >= min_key_type) -+ return k; -+ -+ bch2_btree_node_iter_advance(iter, b); -+ } -+ -+ return NULL; -+} -+ -+static inline struct bkey_packed * -+bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, -+ struct btree *b) -+{ -+ return 
bch2_btree_node_iter_peek_filter(iter, b, 0); -+} -+ -+static inline struct bkey_packed * -+bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b) -+{ -+ return bch2_btree_node_iter_peek_filter(iter, b, KEY_TYPE_discard + 1); -+} -+ -+static inline struct bkey_packed * -+bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b) -+{ -+ struct bkey_packed *ret = bch2_btree_node_iter_peek_all(iter, b); -+ -+ if (ret) -+ bch2_btree_node_iter_advance(iter, b); -+ -+ return ret; -+} -+ -+struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *, -+ struct btree *); -+struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *, -+ struct btree *, unsigned); -+ -+static inline struct bkey_packed * -+bch2_btree_node_iter_prev(struct btree_node_iter *iter, struct btree *b) -+{ -+ return bch2_btree_node_iter_prev_filter(iter, b, KEY_TYPE_discard + 1); -+} -+ -+struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *, -+ struct btree *, -+ struct bkey *); -+ -+#define for_each_btree_node_key_unpack(b, k, iter, unpacked) \ -+ for (bch2_btree_node_iter_init_from_start((iter), (b)); \ -+ (k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\ -+ bch2_btree_node_iter_advance(iter, b)) -+ -+/* Accounting: */ -+ -+static inline void btree_keys_account_key(struct btree_nr_keys *n, -+ unsigned bset, -+ struct bkey_packed *k, -+ int sign) -+{ -+ n->live_u64s += k->u64s * sign; -+ n->bset_u64s[bset] += k->u64s * sign; -+ -+ if (bkey_packed(k)) -+ n->packed_keys += sign; -+ else -+ n->unpacked_keys += sign; -+} -+ -+static inline void btree_keys_account_val_delta(struct btree *b, -+ struct bkey_packed *k, -+ int delta) -+{ -+ struct bset_tree *t = bch2_bkey_to_bset(b, k); -+ -+ b->nr.live_u64s += delta; -+ b->nr.bset_u64s[t - b->set] += delta; -+} -+ -+#define btree_keys_account_key_add(_nr, _bset_idx, _k) \ -+ btree_keys_account_key(_nr, _bset_idx, _k, 1) -+#define 
btree_keys_account_key_drop(_nr, _bset_idx, _k) \ -+ btree_keys_account_key(_nr, _bset_idx, _k, -1) -+ -+#define btree_account_key_add(_b, _k) \ -+ btree_keys_account_key(&(_b)->nr, \ -+ bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, 1) -+#define btree_account_key_drop(_b, _k) \ -+ btree_keys_account_key(&(_b)->nr, \ -+ bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, -1) -+ -+struct bset_stats { -+ struct { -+ size_t nr, bytes; -+ } sets[BSET_TREE_NR_TYPES]; -+ -+ size_t floats; -+ size_t failed; -+}; -+ -+void bch2_btree_keys_stats(struct btree *, struct bset_stats *); -+void bch2_bfloat_to_text(struct printbuf *, struct btree *, -+ struct bkey_packed *); -+ -+/* Debug stuff */ -+ -+void bch2_dump_bset(struct bch_fs *, struct btree *, struct bset *, unsigned); -+void bch2_dump_btree_node(struct bch_fs *, struct btree *); -+void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *); -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ -+void __bch2_verify_btree_nr_keys(struct btree *); -+void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *); -+void bch2_verify_insert_pos(struct btree *, struct bkey_packed *, -+ struct bkey_packed *, unsigned); -+ -+#else -+ -+static inline void __bch2_verify_btree_nr_keys(struct btree *b) {} -+static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter, -+ struct btree *b) {} -+static inline void bch2_verify_insert_pos(struct btree *b, -+ struct bkey_packed *where, -+ struct bkey_packed *insert, -+ unsigned clobber_u64s) {} -+#endif -+ -+static inline void bch2_verify_btree_nr_keys(struct btree *b) -+{ -+ if (btree_keys_expensive_checks(b)) -+ __bch2_verify_btree_nr_keys(b); -+} -+ -+#endif /* _BCACHEFS_BSET_H */ -diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c -new file mode 100644 -index 000000000000..d3addd3a8964 ---- /dev/null -+++ b/fs/bcachefs/btree_cache.c -@@ -0,0 +1,1054 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "btree_cache.h" -+#include 
"btree_io.h" -+#include "btree_iter.h" -+#include "btree_locking.h" -+#include "debug.h" -+ -+#include -+#include -+#include -+ -+const char * const bch2_btree_ids[] = { -+#define x(kwd, val, name) name, -+ BCH_BTREE_IDS() -+#undef x -+ NULL -+}; -+ -+void bch2_recalc_btree_reserve(struct bch_fs *c) -+{ -+ unsigned i, reserve = 16; -+ -+ if (!c->btree_roots[0].b) -+ reserve += 8; -+ -+ for (i = 0; i < BTREE_ID_NR; i++) -+ if (c->btree_roots[i].b) -+ reserve += min_t(unsigned, 1, -+ c->btree_roots[i].b->c.level) * 8; -+ -+ c->btree_cache.reserve = reserve; -+} -+ -+static inline unsigned btree_cache_can_free(struct btree_cache *bc) -+{ -+ return max_t(int, 0, bc->used - bc->reserve); -+} -+ -+static void __btree_node_data_free(struct bch_fs *c, struct btree *b) -+{ -+ EBUG_ON(btree_node_write_in_flight(b)); -+ -+ kvpfree(b->data, btree_bytes(c)); -+ b->data = NULL; -+ bch2_btree_keys_free(b); -+} -+ -+static void btree_node_data_free(struct bch_fs *c, struct btree *b) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ -+ __btree_node_data_free(c, b); -+ bc->used--; -+ list_move(&b->list, &bc->freed); -+} -+ -+static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, -+ const void *obj) -+{ -+ const struct btree *b = obj; -+ const u64 *v = arg->key; -+ -+ return b->hash_val == *v ? 
0 : 1; -+} -+ -+static const struct rhashtable_params bch_btree_cache_params = { -+ .head_offset = offsetof(struct btree, hash), -+ .key_offset = offsetof(struct btree, hash_val), -+ .key_len = sizeof(u64), -+ .obj_cmpfn = bch2_btree_cache_cmp_fn, -+}; -+ -+static int __btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) -+{ -+ BUG_ON(b->data || b->aux_data); -+ -+ b->data = kvpmalloc(btree_bytes(c), gfp); -+ if (!b->data) -+ return -ENOMEM; -+ -+ if (bch2_btree_keys_alloc(b, btree_page_order(c), gfp)) { -+ kvpfree(b->data, btree_bytes(c)); -+ b->data = NULL; -+ return -ENOMEM; -+ } -+ -+ return 0; -+} -+ -+static void btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ -+ if (!__btree_node_data_alloc(c, b, gfp)) { -+ bc->used++; -+ list_move(&b->list, &bc->freeable); -+ } else { -+ list_move(&b->list, &bc->freed); -+ } -+} -+ -+static struct btree *btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) -+{ -+ struct btree *b = kzalloc(sizeof(struct btree), gfp); -+ if (!b) -+ return NULL; -+ -+ bkey_btree_ptr_init(&b->key); -+ six_lock_init(&b->c.lock); -+ INIT_LIST_HEAD(&b->list); -+ INIT_LIST_HEAD(&b->write_blocked); -+ -+ btree_node_data_alloc(c, b, gfp); -+ return b->data ? 
b : NULL; -+} -+ -+/* Btree in memory cache - hash table */ -+ -+void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) -+{ -+ rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); -+ -+ /* Cause future lookups for this node to fail: */ -+ b->hash_val = 0; -+} -+ -+int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) -+{ -+ BUG_ON(b->hash_val); -+ b->hash_val = btree_ptr_hash_val(&b->key); -+ -+ return rhashtable_lookup_insert_fast(&bc->table, &b->hash, -+ bch_btree_cache_params); -+} -+ -+int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, -+ unsigned level, enum btree_id id) -+{ -+ int ret; -+ -+ b->c.level = level; -+ b->c.btree_id = id; -+ -+ mutex_lock(&bc->lock); -+ ret = __bch2_btree_node_hash_insert(bc, b); -+ if (!ret) -+ list_add(&b->list, &bc->live); -+ mutex_unlock(&bc->lock); -+ -+ return ret; -+} -+ -+__flatten -+static inline struct btree *btree_cache_find(struct btree_cache *bc, -+ const struct bkey_i *k) -+{ -+ u64 v = btree_ptr_hash_val(k); -+ -+ return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params); -+} -+ -+/* -+ * this version is for btree nodes that have already been freed (we're not -+ * reaping a real btree node) -+ */ -+static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ int ret = 0; -+ -+ lockdep_assert_held(&bc->lock); -+ -+ if (!six_trylock_intent(&b->c.lock)) -+ return -ENOMEM; -+ -+ if (!six_trylock_write(&b->c.lock)) -+ goto out_unlock_intent; -+ -+ if (btree_node_noevict(b)) -+ goto out_unlock; -+ -+ if (!btree_node_may_write(b)) -+ goto out_unlock; -+ -+ if (btree_node_dirty(b) && -+ test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) -+ goto out_unlock; -+ -+ if (btree_node_dirty(b) || -+ btree_node_write_in_flight(b) || -+ btree_node_read_in_flight(b)) { -+ if (!flush) -+ goto out_unlock; -+ -+ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, -+ 
TASK_UNINTERRUPTIBLE); -+ -+ /* -+ * Using the underscore version because we don't want to compact -+ * bsets after the write, since this node is about to be evicted -+ * - unless btree verify mode is enabled, since it runs out of -+ * the post write cleanup: -+ */ -+ if (verify_btree_ondisk(c)) -+ bch2_btree_node_write(c, b, SIX_LOCK_intent); -+ else -+ __bch2_btree_node_write(c, b, SIX_LOCK_read); -+ -+ /* wait for any in flight btree write */ -+ btree_node_wait_on_io(b); -+ } -+out: -+ if (b->hash_val && !ret) -+ trace_btree_node_reap(c, b); -+ return ret; -+out_unlock: -+ six_unlock_write(&b->c.lock); -+out_unlock_intent: -+ six_unlock_intent(&b->c.lock); -+ ret = -ENOMEM; -+ goto out; -+} -+ -+static int btree_node_reclaim(struct bch_fs *c, struct btree *b) -+{ -+ return __btree_node_reclaim(c, b, false); -+} -+ -+static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b) -+{ -+ return __btree_node_reclaim(c, b, true); -+} -+ -+static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, -+ struct shrink_control *sc) -+{ -+ struct bch_fs *c = container_of(shrink, struct bch_fs, -+ btree_cache.shrink); -+ struct btree_cache *bc = &c->btree_cache; -+ struct btree *b, *t; -+ unsigned long nr = sc->nr_to_scan; -+ unsigned long can_free; -+ unsigned long touched = 0; -+ unsigned long freed = 0; -+ unsigned i; -+ -+ if (btree_shrinker_disabled(c)) -+ return SHRINK_STOP; -+ -+ /* Return -1 if we can't do anything right now */ -+ if (sc->gfp_mask & __GFP_FS) -+ mutex_lock(&bc->lock); -+ else if (!mutex_trylock(&bc->lock)) -+ return -1; -+ -+ /* -+ * It's _really_ critical that we don't free too many btree nodes - we -+ * have to always leave ourselves a reserve. 
The reserve is how we -+ * guarantee that allocating memory for a new btree node can always -+ * succeed, so that inserting keys into the btree can always succeed and -+ * IO can always make forward progress: -+ */ -+ nr /= btree_pages(c); -+ can_free = btree_cache_can_free(bc); -+ nr = min_t(unsigned long, nr, can_free); -+ -+ i = 0; -+ list_for_each_entry_safe(b, t, &bc->freeable, list) { -+ touched++; -+ -+ if (freed >= nr) -+ break; -+ -+ if (++i > 3 && -+ !btree_node_reclaim(c, b)) { -+ btree_node_data_free(c, b); -+ six_unlock_write(&b->c.lock); -+ six_unlock_intent(&b->c.lock); -+ freed++; -+ } -+ } -+restart: -+ list_for_each_entry_safe(b, t, &bc->live, list) { -+ touched++; -+ -+ if (freed >= nr) { -+ /* Save position */ -+ if (&t->list != &bc->live) -+ list_move_tail(&bc->live, &t->list); -+ break; -+ } -+ -+ if (!btree_node_accessed(b) && -+ !btree_node_reclaim(c, b)) { -+ /* can't call bch2_btree_node_hash_remove under lock */ -+ freed++; -+ if (&t->list != &bc->live) -+ list_move_tail(&bc->live, &t->list); -+ -+ btree_node_data_free(c, b); -+ mutex_unlock(&bc->lock); -+ -+ bch2_btree_node_hash_remove(bc, b); -+ six_unlock_write(&b->c.lock); -+ six_unlock_intent(&b->c.lock); -+ -+ if (freed >= nr) -+ goto out; -+ -+ if (sc->gfp_mask & __GFP_FS) -+ mutex_lock(&bc->lock); -+ else if (!mutex_trylock(&bc->lock)) -+ goto out; -+ goto restart; -+ } else -+ clear_btree_node_accessed(b); -+ } -+ -+ mutex_unlock(&bc->lock); -+out: -+ return (unsigned long) freed * btree_pages(c); -+} -+ -+static unsigned long bch2_btree_cache_count(struct shrinker *shrink, -+ struct shrink_control *sc) -+{ -+ struct bch_fs *c = container_of(shrink, struct bch_fs, -+ btree_cache.shrink); -+ struct btree_cache *bc = &c->btree_cache; -+ -+ if (btree_shrinker_disabled(c)) -+ return 0; -+ -+ return btree_cache_can_free(bc) * btree_pages(c); -+} -+ -+void bch2_fs_btree_cache_exit(struct bch_fs *c) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ struct btree *b; -+ unsigned i; -+ 
-+ if (bc->shrink.list.next) -+ unregister_shrinker(&bc->shrink); -+ -+ mutex_lock(&bc->lock); -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ if (c->verify_data) -+ list_move(&c->verify_data->list, &bc->live); -+ -+ kvpfree(c->verify_ondisk, btree_bytes(c)); -+#endif -+ -+ for (i = 0; i < BTREE_ID_NR; i++) -+ if (c->btree_roots[i].b) -+ list_add(&c->btree_roots[i].b->list, &bc->live); -+ -+ list_splice(&bc->freeable, &bc->live); -+ -+ while (!list_empty(&bc->live)) { -+ b = list_first_entry(&bc->live, struct btree, list); -+ -+ BUG_ON(btree_node_read_in_flight(b) || -+ btree_node_write_in_flight(b)); -+ -+ if (btree_node_dirty(b)) -+ bch2_btree_complete_write(c, b, btree_current_write(b)); -+ clear_btree_node_dirty(b); -+ -+ btree_node_data_free(c, b); -+ } -+ -+ while (!list_empty(&bc->freed)) { -+ b = list_first_entry(&bc->freed, struct btree, list); -+ list_del(&b->list); -+ kfree(b); -+ } -+ -+ mutex_unlock(&bc->lock); -+ -+ if (bc->table_init_done) -+ rhashtable_destroy(&bc->table); -+} -+ -+int bch2_fs_btree_cache_init(struct bch_fs *c) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ unsigned i; -+ int ret = 0; -+ -+ pr_verbose_init(c->opts, ""); -+ -+ ret = rhashtable_init(&bc->table, &bch_btree_cache_params); -+ if (ret) -+ goto out; -+ -+ bc->table_init_done = true; -+ -+ bch2_recalc_btree_reserve(c); -+ -+ for (i = 0; i < bc->reserve; i++) -+ if (!btree_node_mem_alloc(c, GFP_KERNEL)) { -+ ret = -ENOMEM; -+ goto out; -+ } -+ -+ list_splice_init(&bc->live, &bc->freeable); -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ mutex_init(&c->verify_lock); -+ -+ c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL); -+ if (!c->verify_ondisk) { -+ ret = -ENOMEM; -+ goto out; -+ } -+ -+ c->verify_data = btree_node_mem_alloc(c, GFP_KERNEL); -+ if (!c->verify_data) { -+ ret = -ENOMEM; -+ goto out; -+ } -+ -+ list_del_init(&c->verify_data->list); -+#endif -+ -+ bc->shrink.count_objects = bch2_btree_cache_count; -+ bc->shrink.scan_objects = bch2_btree_cache_scan; -+ bc->shrink.seeks = 
4; -+ bc->shrink.batch = btree_pages(c) * 2; -+ register_shrinker(&bc->shrink); -+out: -+ pr_verbose_init(c->opts, "ret %i", ret); -+ return ret; -+} -+ -+void bch2_fs_btree_cache_init_early(struct btree_cache *bc) -+{ -+ mutex_init(&bc->lock); -+ INIT_LIST_HEAD(&bc->live); -+ INIT_LIST_HEAD(&bc->freeable); -+ INIT_LIST_HEAD(&bc->freed); -+} -+ -+/* -+ * We can only have one thread cannibalizing other cached btree nodes at a time, -+ * or we'll deadlock. We use an open coded mutex to ensure that, which a -+ * cannibalize_bucket() will take. This means every time we unlock the root of -+ * the btree, we need to release this lock if we have it held. -+ */ -+void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ -+ if (bc->alloc_lock == current) { -+ trace_btree_node_cannibalize_unlock(c); -+ bc->alloc_lock = NULL; -+ closure_wake_up(&bc->alloc_wait); -+ } -+} -+ -+int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ struct task_struct *old; -+ -+ old = cmpxchg(&bc->alloc_lock, NULL, current); -+ if (old == NULL || old == current) -+ goto success; -+ -+ if (!cl) { -+ trace_btree_node_cannibalize_lock_fail(c); -+ return -ENOMEM; -+ } -+ -+ closure_wait(&bc->alloc_wait, cl); -+ -+ /* Try again, after adding ourselves to waitlist */ -+ old = cmpxchg(&bc->alloc_lock, NULL, current); -+ if (old == NULL || old == current) { -+ /* We raced */ -+ closure_wake_up(&bc->alloc_wait); -+ goto success; -+ } -+ -+ trace_btree_node_cannibalize_lock_fail(c); -+ return -EAGAIN; -+ -+success: -+ trace_btree_node_cannibalize_lock(c); -+ return 0; -+} -+ -+static struct btree *btree_node_cannibalize(struct bch_fs *c) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ struct btree *b; -+ -+ list_for_each_entry_reverse(b, &bc->live, list) -+ if (!btree_node_reclaim(c, b)) -+ return b; -+ -+ while (1) { -+ list_for_each_entry_reverse(b, &bc->live, list) -+ if 
(!btree_node_write_and_reclaim(c, b)) -+ return b; -+ -+ /* -+ * Rare case: all nodes were intent-locked. -+ * Just busy-wait. -+ */ -+ WARN_ONCE(1, "btree cache cannibalize failed\n"); -+ cond_resched(); -+ } -+} -+ -+struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ struct btree *b; -+ u64 start_time = local_clock(); -+ unsigned flags; -+ -+ flags = memalloc_nofs_save(); -+ mutex_lock(&bc->lock); -+ -+ /* -+ * btree_free() doesn't free memory; it sticks the node on the end of -+ * the list. Check if there's any freed nodes there: -+ */ -+ list_for_each_entry(b, &bc->freeable, list) -+ if (!btree_node_reclaim(c, b)) -+ goto got_node; -+ -+ /* -+ * We never free struct btree itself, just the memory that holds the on -+ * disk node. Check the freed list before allocating a new one: -+ */ -+ list_for_each_entry(b, &bc->freed, list) -+ if (!btree_node_reclaim(c, b)) -+ goto got_node; -+ -+ b = NULL; -+got_node: -+ if (b) -+ list_del_init(&b->list); -+ mutex_unlock(&bc->lock); -+ -+ if (!b) { -+ b = kzalloc(sizeof(struct btree), GFP_KERNEL); -+ if (!b) -+ goto err; -+ -+ bkey_btree_ptr_init(&b->key); -+ six_lock_init(&b->c.lock); -+ INIT_LIST_HEAD(&b->list); -+ INIT_LIST_HEAD(&b->write_blocked); -+ -+ BUG_ON(!six_trylock_intent(&b->c.lock)); -+ BUG_ON(!six_trylock_write(&b->c.lock)); -+ } -+ -+ if (!b->data) { -+ if (__btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL)) -+ goto err; -+ -+ mutex_lock(&bc->lock); -+ bc->used++; -+ mutex_unlock(&bc->lock); -+ } -+ -+ BUG_ON(btree_node_hashed(b)); -+ BUG_ON(btree_node_write_in_flight(b)); -+out: -+ b->flags = 0; -+ b->written = 0; -+ b->nsets = 0; -+ b->sib_u64s[0] = 0; -+ b->sib_u64s[1] = 0; -+ b->whiteout_u64s = 0; -+ bch2_btree_keys_init(b, &c->expensive_debug_checks); -+ -+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], -+ start_time); -+ -+ memalloc_nofs_restore(flags); -+ return b; -+err: -+ mutex_lock(&bc->lock); -+ -+ if (b) { -+ 
list_add(&b->list, &bc->freed); -+ six_unlock_write(&b->c.lock); -+ six_unlock_intent(&b->c.lock); -+ } -+ -+ /* Try to cannibalize another cached btree node: */ -+ if (bc->alloc_lock == current) { -+ b = btree_node_cannibalize(c); -+ list_del_init(&b->list); -+ mutex_unlock(&bc->lock); -+ -+ bch2_btree_node_hash_remove(bc, b); -+ -+ trace_btree_node_cannibalize(c); -+ goto out; -+ } -+ -+ mutex_unlock(&bc->lock); -+ memalloc_nofs_restore(flags); -+ return ERR_PTR(-ENOMEM); -+} -+ -+/* Slowpath, don't want it inlined into btree_iter_traverse() */ -+static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, -+ struct btree_iter *iter, -+ const struct bkey_i *k, -+ enum btree_id btree_id, -+ unsigned level, -+ enum six_lock_type lock_type, -+ bool sync) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ struct btree *b; -+ -+ BUG_ON(level + 1 >= BTREE_MAX_DEPTH); -+ /* -+ * Parent node must be locked, else we could read in a btree node that's -+ * been freed: -+ */ -+ if (iter && !bch2_btree_node_relock(iter, level + 1)) -+ return ERR_PTR(-EINTR); -+ -+ b = bch2_btree_node_mem_alloc(c); -+ if (IS_ERR(b)) -+ return b; -+ -+ bkey_copy(&b->key, k); -+ if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) { -+ /* raced with another fill: */ -+ -+ /* mark as unhashed... 
*/ -+ b->hash_val = 0; -+ -+ mutex_lock(&bc->lock); -+ list_add(&b->list, &bc->freeable); -+ mutex_unlock(&bc->lock); -+ -+ six_unlock_write(&b->c.lock); -+ six_unlock_intent(&b->c.lock); -+ return NULL; -+ } -+ -+ /* -+ * Unlock before doing IO: -+ * -+ * XXX: ideally should be dropping all btree node locks here -+ */ -+ if (iter && btree_node_read_locked(iter, level + 1)) -+ btree_node_unlock(iter, level + 1); -+ -+ bch2_btree_node_read(c, b, sync); -+ -+ six_unlock_write(&b->c.lock); -+ -+ if (!sync) { -+ six_unlock_intent(&b->c.lock); -+ return NULL; -+ } -+ -+ if (lock_type == SIX_LOCK_read) -+ six_lock_downgrade(&b->c.lock); -+ -+ return b; -+} -+ -+static int lock_node_check_fn(struct six_lock *lock, void *p) -+{ -+ struct btree *b = container_of(lock, struct btree, c.lock); -+ const struct bkey_i *k = p; -+ -+ return b->hash_val == btree_ptr_hash_val(k) ? 0 : -1; -+} -+ -+/** -+ * bch_btree_node_get - find a btree node in the cache and lock it, reading it -+ * in from disk if necessary. -+ * -+ * If IO is necessary and running under generic_make_request, returns -EAGAIN. -+ * -+ * The btree node will have either a read or a write lock held, depending on -+ * the @write parameter. 
-+ */ -+struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter, -+ const struct bkey_i *k, unsigned level, -+ enum six_lock_type lock_type) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ struct btree *b; -+ struct bset_tree *t; -+ -+ EBUG_ON(level >= BTREE_MAX_DEPTH); -+ -+ b = btree_node_mem_ptr(k); -+ if (b) -+ goto lock_node; -+retry: -+ b = btree_cache_find(bc, k); -+ if (unlikely(!b)) { -+ /* -+ * We must have the parent locked to call bch2_btree_node_fill(), -+ * else we could read in a btree node from disk that's been -+ * freed: -+ */ -+ b = bch2_btree_node_fill(c, iter, k, iter->btree_id, -+ level, lock_type, true); -+ -+ /* We raced and found the btree node in the cache */ -+ if (!b) -+ goto retry; -+ -+ if (IS_ERR(b)) -+ return b; -+ } else { -+lock_node: -+ /* -+ * There's a potential deadlock with splits and insertions into -+ * interior nodes we have to avoid: -+ * -+ * The other thread might be holding an intent lock on the node -+ * we want, and they want to update its parent node so they're -+ * going to upgrade their intent lock on the parent node to a -+ * write lock. -+ * -+ * But if we're holding a read lock on the parent, and we're -+ * trying to get the intent lock they're holding, we deadlock. -+ * -+ * So to avoid this we drop the read locks on parent nodes when -+ * we're starting to take intent locks - and handle the race. -+ * -+ * The race is that they might be about to free the node we -+ * want, and dropping our read lock on the parent node lets them -+ * update the parent marking the node we want as freed, and then -+ * free it: -+ * -+ * To guard against this, btree nodes are evicted from the cache -+ * when they're freed - and b->hash_val is zeroed out, which we -+ * check for after we lock the node. 
-+ * -+ * Then, bch2_btree_node_relock() on the parent will fail - because -+ * the parent was modified, when the pointer to the node we want -+ * was removed - and we'll bail out: -+ */ -+ if (btree_node_read_locked(iter, level + 1)) -+ btree_node_unlock(iter, level + 1); -+ -+ if (!btree_node_lock(b, k->k.p, level, iter, lock_type, -+ lock_node_check_fn, (void *) k)) { -+ if (b->hash_val != btree_ptr_hash_val(k)) -+ goto retry; -+ return ERR_PTR(-EINTR); -+ } -+ -+ if (unlikely(b->hash_val != btree_ptr_hash_val(k) || -+ b->c.level != level || -+ race_fault())) { -+ six_unlock_type(&b->c.lock, lock_type); -+ if (bch2_btree_node_relock(iter, level + 1)) -+ goto retry; -+ -+ trace_trans_restart_btree_node_reused(iter->trans->ip); -+ return ERR_PTR(-EINTR); -+ } -+ } -+ -+ /* XXX: waiting on IO with btree locks held: */ -+ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, -+ TASK_UNINTERRUPTIBLE); -+ -+ prefetch(b->aux_data); -+ -+ for_each_bset(b, t) { -+ void *p = (u64 *) b->aux_data + t->aux_data_offset; -+ -+ prefetch(p + L1_CACHE_BYTES * 0); -+ prefetch(p + L1_CACHE_BYTES * 1); -+ prefetch(p + L1_CACHE_BYTES * 2); -+ } -+ -+ /* avoid atomic set bit if it's not needed: */ -+ if (!btree_node_accessed(b)) -+ set_btree_node_accessed(b); -+ -+ if (unlikely(btree_node_read_error(b))) { -+ six_unlock_type(&b->c.lock, lock_type); -+ return ERR_PTR(-EIO); -+ } -+ -+ EBUG_ON(b->c.btree_id != iter->btree_id || -+ BTREE_NODE_LEVEL(b->data) != level || -+ bkey_cmp(b->data->max_key, k->k.p)); -+ -+ return b; -+} -+ -+struct btree *bch2_btree_node_get_noiter(struct bch_fs *c, -+ const struct bkey_i *k, -+ enum btree_id btree_id, -+ unsigned level) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ struct btree *b; -+ struct bset_tree *t; -+ int ret; -+ -+ EBUG_ON(level >= BTREE_MAX_DEPTH); -+ -+ b = btree_node_mem_ptr(k); -+ if (b) -+ goto lock_node; -+retry: -+ b = btree_cache_find(bc, k); -+ if (unlikely(!b)) { -+ b = bch2_btree_node_fill(c, NULL, k, btree_id, -+ level, 
SIX_LOCK_read, true); -+ -+ /* We raced and found the btree node in the cache */ -+ if (!b) -+ goto retry; -+ -+ if (IS_ERR(b)) -+ return b; -+ } else { -+lock_node: -+ ret = six_lock_read(&b->c.lock, lock_node_check_fn, (void *) k); -+ if (ret) -+ goto retry; -+ -+ if (unlikely(b->hash_val != btree_ptr_hash_val(k) || -+ b->c.btree_id != btree_id || -+ b->c.level != level)) { -+ six_unlock_read(&b->c.lock); -+ goto retry; -+ } -+ } -+ -+ /* XXX: waiting on IO with btree locks held: */ -+ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, -+ TASK_UNINTERRUPTIBLE); -+ -+ prefetch(b->aux_data); -+ -+ for_each_bset(b, t) { -+ void *p = (u64 *) b->aux_data + t->aux_data_offset; -+ -+ prefetch(p + L1_CACHE_BYTES * 0); -+ prefetch(p + L1_CACHE_BYTES * 1); -+ prefetch(p + L1_CACHE_BYTES * 2); -+ } -+ -+ /* avoid atomic set bit if it's not needed: */ -+ if (!btree_node_accessed(b)) -+ set_btree_node_accessed(b); -+ -+ if (unlikely(btree_node_read_error(b))) { -+ six_unlock_read(&b->c.lock); -+ return ERR_PTR(-EIO); -+ } -+ -+ EBUG_ON(b->c.btree_id != btree_id || -+ BTREE_NODE_LEVEL(b->data) != level || -+ bkey_cmp(b->data->max_key, k->k.p)); -+ -+ return b; -+} -+ -+struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, -+ struct btree_iter *iter, -+ struct btree *b, -+ enum btree_node_sibling sib) -+{ -+ struct btree_trans *trans = iter->trans; -+ struct btree *parent; -+ struct btree_node_iter node_iter; -+ struct bkey_packed *k; -+ BKEY_PADDED(k) tmp; -+ struct btree *ret = NULL; -+ unsigned level = b->c.level; -+ -+ parent = btree_iter_node(iter, level + 1); -+ if (!parent) -+ return NULL; -+ -+ /* -+ * There's a corner case where a btree_iter might have a node locked -+ * that is just outside its current pos - when -+ * bch2_btree_iter_set_pos_same_leaf() gets to the end of the node. 
-+ * -+ * But the lock ordering checks in __bch2_btree_node_lock() go off of -+ * iter->pos, not the node's key: so if the iterator is marked as -+ * needing to be traversed, we risk deadlock if we don't bail out here: -+ */ -+ if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE) -+ return ERR_PTR(-EINTR); -+ -+ if (!bch2_btree_node_relock(iter, level + 1)) { -+ ret = ERR_PTR(-EINTR); -+ goto out; -+ } -+ -+ node_iter = iter->l[parent->c.level].iter; -+ -+ k = bch2_btree_node_iter_peek_all(&node_iter, parent); -+ BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p)); -+ -+ k = sib == btree_prev_sib -+ ? bch2_btree_node_iter_prev(&node_iter, parent) -+ : (bch2_btree_node_iter_advance(&node_iter, parent), -+ bch2_btree_node_iter_peek(&node_iter, parent)); -+ if (!k) -+ goto out; -+ -+ bch2_bkey_unpack(parent, &tmp.k, k); -+ -+ ret = bch2_btree_node_get(c, iter, &tmp.k, level, -+ SIX_LOCK_intent); -+ -+ if (PTR_ERR_OR_ZERO(ret) == -EINTR && !trans->nounlock) { -+ struct btree_iter *linked; -+ -+ if (!bch2_btree_node_relock(iter, level + 1)) -+ goto out; -+ -+ /* -+ * We might have got -EINTR because trylock failed, and we're -+ * holding other locks that would cause us to deadlock: -+ */ -+ trans_for_each_iter(trans, linked) -+ if (btree_iter_cmp(iter, linked) < 0) -+ __bch2_btree_iter_unlock(linked); -+ -+ if (sib == btree_prev_sib) -+ btree_node_unlock(iter, level); -+ -+ ret = bch2_btree_node_get(c, iter, &tmp.k, level, -+ SIX_LOCK_intent); -+ -+ /* -+ * before btree_iter_relock() calls btree_iter_verify_locks(): -+ */ -+ if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED) -+ btree_node_unlock(iter, level + 1); -+ -+ if (!bch2_btree_node_relock(iter, level)) { -+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); -+ -+ if (!IS_ERR(ret)) { -+ six_unlock_intent(&ret->c.lock); -+ ret = ERR_PTR(-EINTR); -+ } -+ } -+ -+ bch2_trans_relock(trans); -+ } -+out: -+ if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED) -+ btree_node_unlock(iter, level + 1); -+ -+ 
if (PTR_ERR_OR_ZERO(ret) == -EINTR) -+ bch2_btree_iter_upgrade(iter, level + 2); -+ -+ BUG_ON(!IS_ERR(ret) && !btree_node_locked(iter, level)); -+ -+ if (!IS_ERR_OR_NULL(ret)) { -+ struct btree *n1 = ret, *n2 = b; -+ -+ if (sib != btree_prev_sib) -+ swap(n1, n2); -+ -+ BUG_ON(bkey_cmp(bkey_successor(n1->key.k.p), -+ n2->data->min_key)); -+ } -+ -+ bch2_btree_trans_verify_locks(trans); -+ -+ return ret; -+} -+ -+void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, -+ const struct bkey_i *k, unsigned level) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ struct btree *b; -+ -+ BUG_ON(!btree_node_locked(iter, level + 1)); -+ BUG_ON(level >= BTREE_MAX_DEPTH); -+ -+ b = btree_cache_find(bc, k); -+ if (b) -+ return; -+ -+ bch2_btree_node_fill(c, iter, k, iter->btree_id, -+ level, SIX_LOCK_read, false); -+} -+ -+void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, -+ struct btree *b) -+{ -+ const struct bkey_format *f = &b->format; -+ struct bset_stats stats; -+ -+ memset(&stats, 0, sizeof(stats)); -+ -+ bch2_btree_keys_stats(b, &stats); -+ -+ pr_buf(out, -+ "l %u %llu:%llu - %llu:%llu:\n" -+ " ptrs: ", -+ b->c.level, -+ b->data->min_key.inode, -+ b->data->min_key.offset, -+ b->data->max_key.inode, -+ b->data->max_key.offset); -+ bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key)); -+ pr_buf(out, "\n" -+ " format: u64s %u fields %u %u %u %u %u\n" -+ " unpack fn len: %u\n" -+ " bytes used %zu/%zu (%zu%% full)\n" -+ " sib u64s: %u, %u (merge threshold %zu)\n" -+ " nr packed keys %u\n" -+ " nr unpacked keys %u\n" -+ " floats %zu\n" -+ " failed unpacked %zu\n", -+ f->key_u64s, -+ f->bits_per_field[0], -+ f->bits_per_field[1], -+ f->bits_per_field[2], -+ f->bits_per_field[3], -+ f->bits_per_field[4], -+ b->unpack_fn_len, -+ b->nr.live_u64s * sizeof(u64), -+ btree_bytes(c) - sizeof(struct btree_node), -+ b->nr.live_u64s * 100 / btree_max_u64s(c), -+ b->sib_u64s[0], -+ b->sib_u64s[1], -+ BTREE_FOREGROUND_MERGE_THRESHOLD(c), -+ 
b->nr.packed_keys, -+ b->nr.unpacked_keys, -+ stats.floats, -+ stats.failed); -+} -diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h -new file mode 100644 -index 000000000000..2160012c734f ---- /dev/null -+++ b/fs/bcachefs/btree_cache.h -@@ -0,0 +1,109 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BTREE_CACHE_H -+#define _BCACHEFS_BTREE_CACHE_H -+ -+#include "bcachefs.h" -+#include "btree_types.h" -+ -+struct btree_iter; -+ -+extern const char * const bch2_btree_ids[]; -+ -+void bch2_recalc_btree_reserve(struct bch_fs *); -+ -+void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *); -+int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *); -+int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *, -+ unsigned, enum btree_id); -+ -+void bch2_btree_cache_cannibalize_unlock(struct bch_fs *); -+int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *); -+ -+struct btree *bch2_btree_node_mem_alloc(struct bch_fs *); -+ -+struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *, -+ const struct bkey_i *, unsigned, -+ enum six_lock_type); -+ -+struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *, -+ enum btree_id, unsigned); -+ -+struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *, -+ struct btree *, enum btree_node_sibling); -+ -+void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *, -+ const struct bkey_i *, unsigned); -+ -+void bch2_fs_btree_cache_exit(struct bch_fs *); -+int bch2_fs_btree_cache_init(struct bch_fs *); -+void bch2_fs_btree_cache_init_early(struct btree_cache *); -+ -+static inline u64 btree_ptr_hash_val(const struct bkey_i *k) -+{ -+ switch (k->k.type) { -+ case KEY_TYPE_btree_ptr: -+ return *((u64 *) bkey_i_to_btree_ptr_c(k)->v.start); -+ case KEY_TYPE_btree_ptr_v2: -+ return bkey_i_to_btree_ptr_v2_c(k)->v.seq; -+ default: -+ return 0; -+ } -+} -+ -+static inline struct btree 
*btree_node_mem_ptr(const struct bkey_i *k) -+{ -+ return k->k.type == KEY_TYPE_btree_ptr_v2 -+ ? (void *)(unsigned long)bkey_i_to_btree_ptr_v2_c(k)->v.mem_ptr -+ : NULL; -+} -+ -+/* is btree node in hash table? */ -+static inline bool btree_node_hashed(struct btree *b) -+{ -+ return b->hash_val != 0; -+} -+ -+#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \ -+ for ((_tbl) = rht_dereference_rcu((_c)->btree_cache.table.tbl, \ -+ &(_c)->btree_cache.table), \ -+ _iter = 0; _iter < (_tbl)->size; _iter++) \ -+ rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash) -+ -+static inline size_t btree_bytes(struct bch_fs *c) -+{ -+ return c->opts.btree_node_size << 9; -+} -+ -+static inline size_t btree_max_u64s(struct bch_fs *c) -+{ -+ return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64); -+} -+ -+static inline size_t btree_page_order(struct bch_fs *c) -+{ -+ return get_order(btree_bytes(c)); -+} -+ -+static inline size_t btree_pages(struct bch_fs *c) -+{ -+ return 1 << btree_page_order(c); -+} -+ -+static inline unsigned btree_blocks(struct bch_fs *c) -+{ -+ return c->opts.btree_node_size >> c->block_bits; -+} -+ -+#define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 2 / 3) -+ -+#define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3) -+#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \ -+ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \ -+ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) << 2)) -+ -+#define btree_node_root(_c, _b) ((_c)->btree_roots[(_b)->c.btree_id].b) -+ -+void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, -+ struct btree *); -+ -+#endif /* _BCACHEFS_BTREE_CACHE_H */ -diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c -new file mode 100644 -index 000000000000..8771ef1f07cc ---- /dev/null -+++ b/fs/bcachefs/btree_gc.c -@@ -0,0 +1,1388 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Copyright (C) 2010 Kent Overstreet -+ * Copyright (C) 2014 Datera Inc. 
-+ */ -+ -+#include "bcachefs.h" -+#include "alloc_background.h" -+#include "alloc_foreground.h" -+#include "bkey_methods.h" -+#include "btree_locking.h" -+#include "btree_update_interior.h" -+#include "btree_io.h" -+#include "btree_gc.h" -+#include "buckets.h" -+#include "clock.h" -+#include "debug.h" -+#include "ec.h" -+#include "error.h" -+#include "extents.h" -+#include "journal.h" -+#include "keylist.h" -+#include "move.h" -+#include "recovery.h" -+#include "replicas.h" -+#include "super-io.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) -+{ -+ write_seqcount_begin(&c->gc_pos_lock); -+ c->gc_pos = new_pos; -+ write_seqcount_end(&c->gc_pos_lock); -+} -+ -+static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) -+{ -+ BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0); -+ __gc_pos_set(c, new_pos); -+} -+ -+static int bch2_gc_check_topology(struct bch_fs *c, -+ struct bkey_s_c k, -+ struct bpos *expected_start, -+ struct bpos expected_end, -+ bool is_last) -+{ -+ int ret = 0; -+ -+ if (k.k->type == KEY_TYPE_btree_ptr_v2) { -+ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); -+ -+ if (fsck_err_on(bkey_cmp(*expected_start, bp.v->min_key), c, -+ "btree node with incorrect min_key: got %llu:%llu, should be %llu:%llu", -+ bp.v->min_key.inode, -+ bp.v->min_key.offset, -+ expected_start->inode, -+ expected_start->offset)) { -+ BUG(); -+ } -+ } -+ -+ *expected_start = bkey_cmp(k.k->p, POS_MAX) -+ ? 
bkey_successor(k.k->p) -+ : k.k->p; -+ -+ if (fsck_err_on(is_last && -+ bkey_cmp(k.k->p, expected_end), c, -+ "btree node with incorrect max_key: got %llu:%llu, should be %llu:%llu", -+ k.k->p.inode, -+ k.k->p.offset, -+ expected_end.inode, -+ expected_end.offset)) { -+ BUG(); -+ } -+fsck_err: -+ return ret; -+} -+ -+/* marking of btree keys/nodes: */ -+ -+static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, -+ u8 *max_stale, bool initial) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const struct bch_extent_ptr *ptr; -+ unsigned flags = -+ BTREE_TRIGGER_GC| -+ (initial ? BTREE_TRIGGER_NOATOMIC : 0); -+ int ret = 0; -+ -+ if (initial) { -+ BUG_ON(journal_seq_verify(c) && -+ k.k->version.lo > journal_cur_seq(&c->journal)); -+ -+ /* XXX change to fsck check */ -+ if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c, -+ "key version number higher than recorded: %llu > %llu", -+ k.k->version.lo, -+ atomic64_read(&c->key_version))) -+ atomic64_set(&c->key_version, k.k->version.lo); -+ -+ if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || -+ fsck_err_on(!bch2_bkey_replicas_marked(c, k, false), c, -+ "superblock not marked as containing replicas (type %u)", -+ k.k->type)) { -+ ret = bch2_mark_bkey_replicas(c, k); -+ if (ret) -+ return ret; -+ } -+ -+ bkey_for_each_ptr(ptrs, ptr) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ struct bucket *g = PTR_BUCKET(ca, ptr, true); -+ struct bucket *g2 = PTR_BUCKET(ca, ptr, false); -+ -+ if (mustfix_fsck_err_on(!g->gen_valid, c, -+ "bucket %u:%zu data type %s ptr gen %u missing in alloc btree", -+ ptr->dev, PTR_BUCKET_NR(ca, ptr), -+ bch2_data_types[ptr_data_type(k.k, ptr)], -+ ptr->gen)) { -+ g2->_mark.gen = g->_mark.gen = ptr->gen; -+ g2->gen_valid = g->gen_valid = true; -+ } -+ -+ if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c, -+ "bucket %u:%zu data type %s ptr gen in the future: %u > %u", -+ ptr->dev, PTR_BUCKET_NR(ca, ptr), -+ 
bch2_data_types[ptr_data_type(k.k, ptr)], -+ ptr->gen, g->mark.gen)) { -+ g2->_mark.gen = g->_mark.gen = ptr->gen; -+ g2->gen_valid = g->gen_valid = true; -+ g2->_mark.data_type = 0; -+ g2->_mark.dirty_sectors = 0; -+ g2->_mark.cached_sectors = 0; -+ set_bit(BCH_FS_FIXED_GENS, &c->flags); -+ } -+ } -+ } -+ -+ bkey_for_each_ptr(ptrs, ptr) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ struct bucket *g = PTR_BUCKET(ca, ptr, true); -+ -+ if (gen_after(g->oldest_gen, ptr->gen)) -+ g->oldest_gen = ptr->gen; -+ -+ *max_stale = max(*max_stale, ptr_stale(ca, ptr)); -+ } -+ -+ bch2_mark_key(c, k, 0, k.k->size, NULL, 0, flags); -+fsck_err: -+ return ret; -+} -+ -+static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, -+ bool initial) -+{ -+ struct bpos next_node_start = b->data->min_key; -+ struct btree_node_iter iter; -+ struct bkey unpacked; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ *max_stale = 0; -+ -+ if (!btree_node_type_needs_gc(btree_node_type(b))) -+ return 0; -+ -+ bch2_btree_node_iter_init_from_start(&iter, b); -+ -+ while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { -+ bch2_bkey_debugcheck(c, b, k); -+ -+ ret = bch2_gc_mark_key(c, k, max_stale, initial); -+ if (ret) -+ break; -+ -+ bch2_btree_node_iter_advance(&iter, b); -+ -+ if (b->c.level) { -+ ret = bch2_gc_check_topology(c, k, -+ &next_node_start, -+ b->data->max_key, -+ bch2_btree_node_iter_end(&iter)); -+ if (ret) -+ break; -+ } -+ } -+ -+ return ret; -+} -+ -+static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, -+ bool initial, bool metadata_only) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct btree *b; -+ unsigned depth = metadata_only ? 1 -+ : expensive_debug_checks(c) ? 0 -+ : !btree_node_type_needs_gc(btree_id) ? 
1 -+ : 0; -+ u8 max_stale = 0; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); -+ -+ __for_each_btree_node(&trans, iter, btree_id, POS_MIN, -+ 0, depth, BTREE_ITER_PREFETCH, b) { -+ bch2_verify_btree_nr_keys(b); -+ -+ gc_pos_set(c, gc_pos_btree_node(b)); -+ -+ ret = btree_gc_mark_node(c, b, &max_stale, initial); -+ if (ret) -+ break; -+ -+ if (!initial) { -+ if (max_stale > 64) -+ bch2_btree_node_rewrite(c, iter, -+ b->data->keys.seq, -+ BTREE_INSERT_USE_RESERVE| -+ BTREE_INSERT_NOWAIT| -+ BTREE_INSERT_GC_LOCK_HELD); -+ else if (!btree_gc_rewrite_disabled(c) && -+ (btree_gc_always_rewrite(c) || max_stale > 16)) -+ bch2_btree_node_rewrite(c, iter, -+ b->data->keys.seq, -+ BTREE_INSERT_NOWAIT| -+ BTREE_INSERT_GC_LOCK_HELD); -+ } -+ -+ bch2_trans_cond_resched(&trans); -+ } -+ ret = bch2_trans_exit(&trans) ?: ret; -+ if (ret) -+ return ret; -+ -+ mutex_lock(&c->btree_root_lock); -+ b = c->btree_roots[btree_id].b; -+ if (!btree_node_fake(b)) -+ ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key), -+ &max_stale, initial); -+ gc_pos_set(c, gc_pos_btree_root(b->c.btree_id)); -+ mutex_unlock(&c->btree_root_lock); -+ -+ return ret; -+} -+ -+static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, -+ struct journal_keys *journal_keys, -+ unsigned target_depth) -+{ -+ struct btree_and_journal_iter iter; -+ struct bkey_s_c k; -+ struct bpos next_node_start = b->data->min_key; -+ u8 max_stale = 0; -+ int ret = 0; -+ -+ bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b); -+ -+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { -+ bch2_bkey_debugcheck(c, b, k); -+ -+ BUG_ON(bkey_cmp(k.k->p, b->data->min_key) < 0); -+ BUG_ON(bkey_cmp(k.k->p, b->data->max_key) > 0); -+ -+ ret = bch2_gc_mark_key(c, k, &max_stale, true); -+ if (ret) -+ break; -+ -+ if (b->c.level) { -+ struct btree *child; -+ BKEY_PADDED(k) tmp; -+ -+ bkey_reassemble(&tmp.k, k); -+ k = bkey_i_to_s_c(&tmp.k); -+ -+ 
bch2_btree_and_journal_iter_advance(&iter); -+ -+ ret = bch2_gc_check_topology(c, k, -+ &next_node_start, -+ b->data->max_key, -+ !bch2_btree_and_journal_iter_peek(&iter).k); -+ if (ret) -+ break; -+ -+ if (b->c.level > target_depth) { -+ child = bch2_btree_node_get_noiter(c, &tmp.k, -+ b->c.btree_id, b->c.level - 1); -+ ret = PTR_ERR_OR_ZERO(child); -+ if (ret) -+ break; -+ -+ ret = bch2_gc_btree_init_recurse(c, child, -+ journal_keys, target_depth); -+ six_unlock_read(&child->c.lock); -+ -+ if (ret) -+ break; -+ } -+ } else { -+ bch2_btree_and_journal_iter_advance(&iter); -+ } -+ } -+ -+ return ret; -+} -+ -+static int bch2_gc_btree_init(struct bch_fs *c, -+ struct journal_keys *journal_keys, -+ enum btree_id btree_id, -+ bool metadata_only) -+{ -+ struct btree *b; -+ unsigned target_depth = metadata_only ? 1 -+ : expensive_debug_checks(c) ? 0 -+ : !btree_node_type_needs_gc(btree_id) ? 1 -+ : 0; -+ u8 max_stale = 0; -+ int ret = 0; -+ -+ b = c->btree_roots[btree_id].b; -+ -+ if (btree_node_fake(b)) -+ return 0; -+ -+ six_lock_read(&b->c.lock, NULL, NULL); -+ if (fsck_err_on(bkey_cmp(b->data->min_key, POS_MIN), c, -+ "btree root with incorrect min_key: %llu:%llu", -+ b->data->min_key.inode, -+ b->data->min_key.offset)) { -+ BUG(); -+ } -+ -+ if (fsck_err_on(bkey_cmp(b->data->max_key, POS_MAX), c, -+ "btree root with incorrect min_key: %llu:%llu", -+ b->data->max_key.inode, -+ b->data->max_key.offset)) { -+ BUG(); -+ } -+ -+ if (b->c.level >= target_depth) -+ ret = bch2_gc_btree_init_recurse(c, b, -+ journal_keys, target_depth); -+ -+ if (!ret) -+ ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key), -+ &max_stale, true); -+fsck_err: -+ six_unlock_read(&b->c.lock); -+ -+ return ret; -+} -+ -+static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) -+{ -+ return (int) btree_id_to_gc_phase(l) - -+ (int) btree_id_to_gc_phase(r); -+} -+ -+static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, -+ bool initial, bool metadata_only) -+{ 
-+ enum btree_id ids[BTREE_ID_NR]; -+ unsigned i; -+ -+ for (i = 0; i < BTREE_ID_NR; i++) -+ ids[i] = i; -+ bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); -+ -+ for (i = 0; i < BTREE_ID_NR; i++) { -+ enum btree_id id = ids[i]; -+ int ret = initial -+ ? bch2_gc_btree_init(c, journal_keys, -+ id, metadata_only) -+ : bch2_gc_btree(c, id, initial, metadata_only); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca, -+ u64 start, u64 end, -+ enum bch_data_type type, -+ unsigned flags) -+{ -+ u64 b = sector_to_bucket(ca, start); -+ -+ do { -+ unsigned sectors = -+ min_t(u64, bucket_to_sector(ca, b + 1), end) - start; -+ -+ bch2_mark_metadata_bucket(c, ca, b, type, sectors, -+ gc_phase(GC_PHASE_SB), flags); -+ b++; -+ start += sectors; -+ } while (start < end); -+} -+ -+void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, -+ unsigned flags) -+{ -+ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; -+ unsigned i; -+ u64 b; -+ -+ /* -+ * This conditional is kind of gross, but we may be called from the -+ * device add path, before the new device has actually been added to the -+ * running filesystem: -+ */ -+ if (c) { -+ lockdep_assert_held(&c->sb_lock); -+ percpu_down_read(&c->mark_lock); -+ } -+ -+ for (i = 0; i < layout->nr_superblocks; i++) { -+ u64 offset = le64_to_cpu(layout->sb_offset[i]); -+ -+ if (offset == BCH_SB_SECTOR) -+ mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR, -+ BCH_DATA_SB, flags); -+ -+ mark_metadata_sectors(c, ca, offset, -+ offset + (1 << layout->sb_max_size_bits), -+ BCH_DATA_SB, flags); -+ } -+ -+ for (i = 0; i < ca->journal.nr; i++) { -+ b = ca->journal.buckets[i]; -+ bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_JOURNAL, -+ ca->mi.bucket_size, -+ gc_phase(GC_PHASE_SB), flags); -+ } -+ -+ if (c) -+ percpu_up_read(&c->mark_lock); -+} -+ -+static void bch2_mark_superblocks(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ unsigned i; -+ -+ 
mutex_lock(&c->sb_lock); -+ gc_pos_set(c, gc_phase(GC_PHASE_SB)); -+ -+ for_each_online_member(ca, c, i) -+ bch2_mark_dev_superblock(c, ca, BTREE_TRIGGER_GC); -+ mutex_unlock(&c->sb_lock); -+} -+ -+#if 0 -+/* Also see bch2_pending_btree_node_free_insert_done() */ -+static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) -+{ -+ struct btree_update *as; -+ struct pending_btree_node_free *d; -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE)); -+ -+ for_each_pending_btree_node_free(c, as, d) -+ if (d->index_update_done) -+ bch2_mark_key(c, bkey_i_to_s_c(&d->key), -+ 0, 0, NULL, 0, -+ BTREE_TRIGGER_GC); -+ -+ mutex_unlock(&c->btree_interior_update_lock); -+} -+#endif -+ -+static void bch2_mark_allocator_buckets(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ struct open_bucket *ob; -+ size_t i, j, iter; -+ unsigned ci; -+ -+ percpu_down_read(&c->mark_lock); -+ -+ spin_lock(&c->freelist_lock); -+ gc_pos_set(c, gc_pos_alloc(c, NULL)); -+ -+ for_each_member_device(ca, c, ci) { -+ fifo_for_each_entry(i, &ca->free_inc, iter) -+ bch2_mark_alloc_bucket(c, ca, i, true, -+ gc_pos_alloc(c, NULL), -+ BTREE_TRIGGER_GC); -+ -+ -+ -+ for (j = 0; j < RESERVE_NR; j++) -+ fifo_for_each_entry(i, &ca->free[j], iter) -+ bch2_mark_alloc_bucket(c, ca, i, true, -+ gc_pos_alloc(c, NULL), -+ BTREE_TRIGGER_GC); -+ } -+ -+ spin_unlock(&c->freelist_lock); -+ -+ for (ob = c->open_buckets; -+ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); -+ ob++) { -+ spin_lock(&ob->lock); -+ if (ob->valid) { -+ gc_pos_set(c, gc_pos_alloc(c, ob)); -+ ca = bch_dev_bkey_exists(c, ob->ptr.dev); -+ bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true, -+ gc_pos_alloc(c, ob), -+ BTREE_TRIGGER_GC); -+ } -+ spin_unlock(&ob->lock); -+ } -+ -+ percpu_up_read(&c->mark_lock); -+} -+ -+static void bch2_gc_free(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ unsigned i; -+ -+ genradix_free(&c->stripes[1]); -+ -+ for_each_member_device(ca, c, i) { -+ 
kvpfree(rcu_dereference_protected(ca->buckets[1], 1), -+ sizeof(struct bucket_array) + -+ ca->mi.nbuckets * sizeof(struct bucket)); -+ ca->buckets[1] = NULL; -+ -+ free_percpu(ca->usage[1]); -+ ca->usage[1] = NULL; -+ } -+ -+ free_percpu(c->usage_gc); -+ c->usage_gc = NULL; -+} -+ -+static int bch2_gc_done(struct bch_fs *c, -+ bool initial, bool metadata_only) -+{ -+ struct bch_dev *ca; -+ bool verify = !metadata_only && -+ (!initial || -+ (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO))); -+ unsigned i; -+ int ret = 0; -+ -+#define copy_field(_f, _msg, ...) \ -+ if (dst->_f != src->_f) { \ -+ if (verify) \ -+ fsck_err(c, _msg ": got %llu, should be %llu" \ -+ , ##__VA_ARGS__, dst->_f, src->_f); \ -+ dst->_f = src->_f; \ -+ } -+#define copy_stripe_field(_f, _msg, ...) \ -+ if (dst->_f != src->_f) { \ -+ if (verify) \ -+ fsck_err(c, "stripe %zu has wrong "_msg \ -+ ": got %u, should be %u", \ -+ dst_iter.pos, ##__VA_ARGS__, \ -+ dst->_f, src->_f); \ -+ dst->_f = src->_f; \ -+ dst->dirty = true; \ -+ } -+#define copy_bucket_field(_f) \ -+ if (dst->b[b].mark._f != src->b[b].mark._f) { \ -+ if (verify) \ -+ fsck_err(c, "bucket %u:%zu gen %u data type %s has wrong " #_f \ -+ ": got %u, should be %u", i, b, \ -+ dst->b[b].mark.gen, \ -+ bch2_data_types[dst->b[b].mark.data_type],\ -+ dst->b[b].mark._f, src->b[b].mark._f); \ -+ dst->b[b]._mark._f = src->b[b].mark._f; \ -+ } -+#define copy_dev_field(_f, _msg, ...) \ -+ copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__) -+#define copy_fs_field(_f, _msg, ...) 
\ -+ copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) -+ -+ if (!metadata_only) { -+ struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0); -+ struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0); -+ struct stripe *dst, *src; -+ unsigned i; -+ -+ c->ec_stripes_heap.used = 0; -+ -+ while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) && -+ (src = genradix_iter_peek(&src_iter, &c->stripes[1]))) { -+ BUG_ON(src_iter.pos != dst_iter.pos); -+ -+ copy_stripe_field(alive, "alive"); -+ copy_stripe_field(sectors, "sectors"); -+ copy_stripe_field(algorithm, "algorithm"); -+ copy_stripe_field(nr_blocks, "nr_blocks"); -+ copy_stripe_field(nr_redundant, "nr_redundant"); -+ copy_stripe_field(blocks_nonempty, -+ "blocks_nonempty"); -+ -+ for (i = 0; i < ARRAY_SIZE(dst->block_sectors); i++) -+ copy_stripe_field(block_sectors[i], -+ "block_sectors[%u]", i); -+ -+ if (dst->alive) -+ bch2_stripes_heap_insert(c, dst, dst_iter.pos); -+ -+ genradix_iter_advance(&dst_iter, &c->stripes[0]); -+ genradix_iter_advance(&src_iter, &c->stripes[1]); -+ } -+ } -+ -+ for_each_member_device(ca, c, i) { -+ struct bucket_array *dst = __bucket_array(ca, 0); -+ struct bucket_array *src = __bucket_array(ca, 1); -+ size_t b; -+ -+ for (b = 0; b < src->nbuckets; b++) { -+ copy_bucket_field(gen); -+ copy_bucket_field(data_type); -+ copy_bucket_field(owned_by_allocator); -+ copy_bucket_field(stripe); -+ copy_bucket_field(dirty_sectors); -+ copy_bucket_field(cached_sectors); -+ -+ dst->b[b].oldest_gen = src->b[b].oldest_gen; -+ } -+ }; -+ -+ bch2_fs_usage_acc_to_base(c, 0); -+ bch2_fs_usage_acc_to_base(c, 1); -+ -+ bch2_dev_usage_from_buckets(c); -+ -+ { -+ unsigned nr = fs_usage_u64s(c); -+ struct bch_fs_usage *dst = c->usage_base; -+ struct bch_fs_usage *src = (void *) -+ bch2_acc_percpu_u64s((void *) c->usage_gc, nr); -+ -+ copy_fs_field(hidden, "hidden"); -+ copy_fs_field(btree, "btree"); -+ -+ if (!metadata_only) { -+ copy_fs_field(data, "data"); -+ 
copy_fs_field(cached, "cached"); -+ copy_fs_field(reserved, "reserved"); -+ copy_fs_field(nr_inodes,"nr_inodes"); -+ -+ for (i = 0; i < BCH_REPLICAS_MAX; i++) -+ copy_fs_field(persistent_reserved[i], -+ "persistent_reserved[%i]", i); -+ } -+ -+ for (i = 0; i < c->replicas.nr; i++) { -+ struct bch_replicas_entry *e = -+ cpu_replicas_entry(&c->replicas, i); -+ char buf[80]; -+ -+ if (metadata_only && -+ (e->data_type == BCH_DATA_USER || -+ e->data_type == BCH_DATA_CACHED)) -+ continue; -+ -+ bch2_replicas_entry_to_text(&PBUF(buf), e); -+ -+ copy_fs_field(replicas[i], "%s", buf); -+ } -+ } -+ -+#undef copy_fs_field -+#undef copy_dev_field -+#undef copy_bucket_field -+#undef copy_stripe_field -+#undef copy_field -+fsck_err: -+ return ret; -+} -+ -+static int bch2_gc_start(struct bch_fs *c, -+ bool metadata_only) -+{ -+ struct bch_dev *ca; -+ unsigned i; -+ int ret; -+ -+ BUG_ON(c->usage_gc); -+ -+ c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64), -+ sizeof(u64), GFP_KERNEL); -+ if (!c->usage_gc) { -+ bch_err(c, "error allocating c->usage_gc"); -+ return -ENOMEM; -+ } -+ -+ for_each_member_device(ca, c, i) { -+ BUG_ON(ca->buckets[1]); -+ BUG_ON(ca->usage[1]); -+ -+ ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) + -+ ca->mi.nbuckets * sizeof(struct bucket), -+ GFP_KERNEL|__GFP_ZERO); -+ if (!ca->buckets[1]) { -+ percpu_ref_put(&ca->ref); -+ bch_err(c, "error allocating ca->buckets[gc]"); -+ return -ENOMEM; -+ } -+ -+ ca->usage[1] = alloc_percpu(struct bch_dev_usage); -+ if (!ca->usage[1]) { -+ bch_err(c, "error allocating ca->usage[gc]"); -+ percpu_ref_put(&ca->ref); -+ return -ENOMEM; -+ } -+ } -+ -+ ret = bch2_ec_mem_alloc(c, true); -+ if (ret) { -+ bch_err(c, "error allocating ec gc mem"); -+ return ret; -+ } -+ -+ percpu_down_write(&c->mark_lock); -+ -+ /* -+ * indicate to stripe code that we need to allocate for the gc stripes -+ * radix tree, too -+ */ -+ gc_pos_set(c, gc_phase(GC_PHASE_START)); -+ -+ for_each_member_device(ca, c, i) { -+ 
struct bucket_array *dst = __bucket_array(ca, 1); -+ struct bucket_array *src = __bucket_array(ca, 0); -+ size_t b; -+ -+ dst->first_bucket = src->first_bucket; -+ dst->nbuckets = src->nbuckets; -+ -+ for (b = 0; b < src->nbuckets; b++) { -+ struct bucket *d = &dst->b[b]; -+ struct bucket *s = &src->b[b]; -+ -+ d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen; -+ d->gen_valid = s->gen_valid; -+ -+ if (metadata_only && -+ (s->mark.data_type == BCH_DATA_USER || -+ s->mark.data_type == BCH_DATA_CACHED)) { -+ d->_mark = s->mark; -+ d->_mark.owned_by_allocator = 0; -+ } -+ } -+ }; -+ -+ percpu_up_write(&c->mark_lock); -+ -+ return 0; -+} -+ -+/** -+ * bch2_gc - walk _all_ references to buckets, and recompute them: -+ * -+ * Order matters here: -+ * - Concurrent GC relies on the fact that we have a total ordering for -+ * everything that GC walks - see gc_will_visit_node(), -+ * gc_will_visit_root() -+ * -+ * - also, references move around in the course of index updates and -+ * various other crap: everything needs to agree on the ordering -+ * references are allowed to move around in - e.g., we're allowed to -+ * start with a reference owned by an open_bucket (the allocator) and -+ * move it to the btree, but not the reverse. 
-+ * -+ * This is necessary to ensure that gc doesn't miss references that -+ * move around - if references move backwards in the ordering GC -+ * uses, GC could skip past them -+ */ -+int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys, -+ bool initial, bool metadata_only) -+{ -+ struct bch_dev *ca; -+ u64 start_time = local_clock(); -+ unsigned i, iter = 0; -+ int ret; -+ -+ lockdep_assert_held(&c->state_lock); -+ trace_gc_start(c); -+ -+ down_write(&c->gc_lock); -+ -+ /* flush interior btree updates: */ -+ closure_wait_event(&c->btree_interior_update_wait, -+ !bch2_btree_interior_updates_nr_pending(c)); -+again: -+ ret = bch2_gc_start(c, metadata_only); -+ if (ret) -+ goto out; -+ -+ bch2_mark_superblocks(c); -+ -+ ret = bch2_gc_btrees(c, journal_keys, initial, metadata_only); -+ if (ret) -+ goto out; -+ -+#if 0 -+ bch2_mark_pending_btree_node_frees(c); -+#endif -+ bch2_mark_allocator_buckets(c); -+ -+ c->gc_count++; -+out: -+ if (!ret && -+ (test_bit(BCH_FS_FIXED_GENS, &c->flags) || -+ (!iter && test_restart_gc(c)))) { -+ /* -+ * XXX: make sure gens we fixed got saved -+ */ -+ if (iter++ <= 2) { -+ bch_info(c, "Fixed gens, restarting mark and sweep:"); -+ clear_bit(BCH_FS_FIXED_GENS, &c->flags); -+ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); -+ -+ percpu_down_write(&c->mark_lock); -+ bch2_gc_free(c); -+ percpu_up_write(&c->mark_lock); -+ /* flush fsck errors, reset counters */ -+ bch2_flush_fsck_errs(c); -+ -+ goto again; -+ } -+ -+ bch_info(c, "Unable to fix bucket gens, looping"); -+ ret = -EINVAL; -+ } -+ -+ if (!ret) { -+ bch2_journal_block(&c->journal); -+ -+ percpu_down_write(&c->mark_lock); -+ ret = bch2_gc_done(c, initial, metadata_only); -+ -+ bch2_journal_unblock(&c->journal); -+ } else { -+ percpu_down_write(&c->mark_lock); -+ } -+ -+ /* Indicates that gc is no longer in progress: */ -+ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); -+ -+ bch2_gc_free(c); -+ percpu_up_write(&c->mark_lock); -+ -+ up_write(&c->gc_lock); -+ -+ 
trace_gc_end(c); -+ bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); -+ -+ /* -+ * Wake up allocator in case it was waiting for buckets -+ * because of not being able to inc gens -+ */ -+ for_each_member_device(ca, c, i) -+ bch2_wake_allocator(ca); -+ -+ /* -+ * At startup, allocations can happen directly instead of via the -+ * allocator thread - issue wakeup in case they blocked on gc_lock: -+ */ -+ closure_wake_up(&c->freelist_wait); -+ return ret; -+} -+ -+/* -+ * For recalculating oldest gen, we only need to walk keys in leaf nodes; btree -+ * node pointers currently never have cached pointers that can become stale: -+ */ -+static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id id) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, id, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const struct bch_extent_ptr *ptr; -+ -+ percpu_down_read(&c->mark_lock); -+ bkey_for_each_ptr(ptrs, ptr) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ struct bucket *g = PTR_BUCKET(ca, ptr, false); -+ -+ if (gen_after(g->gc_gen, ptr->gen)) -+ g->gc_gen = ptr->gen; -+ -+ if (gen_after(g->mark.gen, ptr->gen) > 32) { -+ /* rewrite btree node */ -+ -+ } -+ } -+ percpu_up_read(&c->mark_lock); -+ } -+ -+ bch2_trans_exit(&trans); -+ return ret; -+} -+ -+int bch2_gc_gens(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ struct bucket_array *buckets; -+ struct bucket *g; -+ unsigned i; -+ int ret; -+ -+ /* -+ * Ideally we would be using state_lock and not gc_lock here, but that -+ * introduces a deadlock in the RO path - we currently take the state -+ * lock at the start of going RO, thus the gc thread may get stuck: -+ */ -+ down_read(&c->gc_lock); -+ -+ for_each_member_device(ca, c, i) { -+ down_read(&ca->bucket_lock); -+ buckets = bucket_array(ca); -+ -+ for_each_bucket(g, buckets) -+ 
g->gc_gen = g->mark.gen; -+ up_read(&ca->bucket_lock); -+ } -+ -+ for (i = 0; i < BTREE_ID_NR; i++) -+ if (btree_node_type_needs_gc(i)) { -+ ret = bch2_gc_btree_gens(c, i); -+ if (ret) -+ goto err; -+ } -+ -+ for_each_member_device(ca, c, i) { -+ down_read(&ca->bucket_lock); -+ buckets = bucket_array(ca); -+ -+ for_each_bucket(g, buckets) -+ g->oldest_gen = g->gc_gen; -+ up_read(&ca->bucket_lock); -+ } -+err: -+ up_read(&c->gc_lock); -+ return ret; -+} -+ -+/* Btree coalescing */ -+ -+static void recalc_packed_keys(struct btree *b) -+{ -+ struct bset *i = btree_bset_first(b); -+ struct bkey_packed *k; -+ -+ memset(&b->nr, 0, sizeof(b->nr)); -+ -+ BUG_ON(b->nsets != 1); -+ -+ vstruct_for_each(i, k) -+ btree_keys_account_key_add(&b->nr, 0, k); -+} -+ -+static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, -+ struct btree *old_nodes[GC_MERGE_NODES]) -+{ -+ struct btree *parent = btree_node_parent(iter, old_nodes[0]); -+ unsigned i, nr_old_nodes, nr_new_nodes, u64s = 0; -+ unsigned blocks = btree_blocks(c) * 2 / 3; -+ struct btree *new_nodes[GC_MERGE_NODES]; -+ struct btree_update *as; -+ struct keylist keylist; -+ struct bkey_format_state format_state; -+ struct bkey_format new_format; -+ -+ memset(new_nodes, 0, sizeof(new_nodes)); -+ bch2_keylist_init(&keylist, NULL); -+ -+ /* Count keys that are not deleted */ -+ for (i = 0; i < GC_MERGE_NODES && old_nodes[i]; i++) -+ u64s += old_nodes[i]->nr.live_u64s; -+ -+ nr_old_nodes = nr_new_nodes = i; -+ -+ /* Check if all keys in @old_nodes could fit in one fewer node */ -+ if (nr_old_nodes <= 1 || -+ __vstruct_blocks(struct btree_node, c->block_bits, -+ DIV_ROUND_UP(u64s, nr_old_nodes - 1)) > blocks) -+ return; -+ -+ /* Find a format that all keys in @old_nodes can pack into */ -+ bch2_bkey_format_init(&format_state); -+ -+ for (i = 0; i < nr_old_nodes; i++) -+ __bch2_btree_calc_format(&format_state, old_nodes[i]); -+ -+ new_format = bch2_bkey_format_done(&format_state); -+ -+ /* Check if repacking 
would make any nodes too big to fit */ -+ for (i = 0; i < nr_old_nodes; i++) -+ if (!bch2_btree_node_format_fits(c, old_nodes[i], &new_format)) { -+ trace_btree_gc_coalesce_fail(c, -+ BTREE_GC_COALESCE_FAIL_FORMAT_FITS); -+ return; -+ } -+ -+ if (bch2_keylist_realloc(&keylist, NULL, 0, -+ (BKEY_U64s + BKEY_EXTENT_U64s_MAX) * nr_old_nodes)) { -+ trace_btree_gc_coalesce_fail(c, -+ BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC); -+ return; -+ } -+ -+ as = bch2_btree_update_start(iter->trans, iter->btree_id, -+ btree_update_reserve_required(c, parent) + nr_old_nodes, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_USE_RESERVE, -+ NULL); -+ if (IS_ERR(as)) { -+ trace_btree_gc_coalesce_fail(c, -+ BTREE_GC_COALESCE_FAIL_RESERVE_GET); -+ bch2_keylist_free(&keylist, NULL); -+ return; -+ } -+ -+ trace_btree_gc_coalesce(c, old_nodes[0]); -+ -+ for (i = 0; i < nr_old_nodes; i++) -+ bch2_btree_interior_update_will_free_node(as, old_nodes[i]); -+ -+ /* Repack everything with @new_format and sort down to one bset */ -+ for (i = 0; i < nr_old_nodes; i++) -+ new_nodes[i] = -+ __bch2_btree_node_alloc_replacement(as, old_nodes[i], -+ new_format); -+ -+ /* -+ * Conceptually we concatenate the nodes together and slice them -+ * up at different boundaries. 
-+ */ -+ for (i = nr_new_nodes - 1; i > 0; --i) { -+ struct btree *n1 = new_nodes[i]; -+ struct btree *n2 = new_nodes[i - 1]; -+ -+ struct bset *s1 = btree_bset_first(n1); -+ struct bset *s2 = btree_bset_first(n2); -+ struct bkey_packed *k, *last = NULL; -+ -+ /* Calculate how many keys from @n2 we could fit inside @n1 */ -+ u64s = 0; -+ -+ for (k = s2->start; -+ k < vstruct_last(s2) && -+ vstruct_blocks_plus(n1->data, c->block_bits, -+ u64s + k->u64s) <= blocks; -+ k = bkey_next_skip_noops(k, vstruct_last(s2))) { -+ last = k; -+ u64s += k->u64s; -+ } -+ -+ if (u64s == le16_to_cpu(s2->u64s)) { -+ /* n2 fits entirely in n1 */ -+ n1->key.k.p = n1->data->max_key = n2->data->max_key; -+ -+ memcpy_u64s(vstruct_last(s1), -+ s2->start, -+ le16_to_cpu(s2->u64s)); -+ le16_add_cpu(&s1->u64s, le16_to_cpu(s2->u64s)); -+ -+ set_btree_bset_end(n1, n1->set); -+ -+ six_unlock_write(&n2->c.lock); -+ bch2_btree_node_free_never_inserted(c, n2); -+ six_unlock_intent(&n2->c.lock); -+ -+ memmove(new_nodes + i - 1, -+ new_nodes + i, -+ sizeof(new_nodes[0]) * (nr_new_nodes - i)); -+ new_nodes[--nr_new_nodes] = NULL; -+ } else if (u64s) { -+ /* move part of n2 into n1 */ -+ n1->key.k.p = n1->data->max_key = -+ bkey_unpack_pos(n1, last); -+ -+ n2->data->min_key = bkey_successor(n1->data->max_key); -+ -+ memcpy_u64s(vstruct_last(s1), -+ s2->start, u64s); -+ le16_add_cpu(&s1->u64s, u64s); -+ -+ memmove(s2->start, -+ vstruct_idx(s2, u64s), -+ (le16_to_cpu(s2->u64s) - u64s) * sizeof(u64)); -+ s2->u64s = cpu_to_le16(le16_to_cpu(s2->u64s) - u64s); -+ -+ set_btree_bset_end(n1, n1->set); -+ set_btree_bset_end(n2, n2->set); -+ } -+ } -+ -+ for (i = 0; i < nr_new_nodes; i++) { -+ struct btree *n = new_nodes[i]; -+ -+ recalc_packed_keys(n); -+ btree_node_reset_sib_u64s(n); -+ -+ bch2_btree_build_aux_trees(n); -+ -+ bch2_btree_update_add_new_node(as, n); -+ six_unlock_write(&n->c.lock); -+ -+ bch2_btree_node_write(c, n, SIX_LOCK_intent); -+ } -+ -+ /* -+ * The keys for the old nodes get deleted. 
We don't want to insert keys -+ * that compare equal to the keys for the new nodes we'll also be -+ * inserting - we can't because keys on a keylist must be strictly -+ * greater than the previous keys, and we also don't need to since the -+ * key for the new node will serve the same purpose (overwriting the key -+ * for the old node). -+ */ -+ for (i = 0; i < nr_old_nodes; i++) { -+ struct bkey_i delete; -+ unsigned j; -+ -+ for (j = 0; j < nr_new_nodes; j++) -+ if (!bkey_cmp(old_nodes[i]->key.k.p, -+ new_nodes[j]->key.k.p)) -+ goto next; -+ -+ bkey_init(&delete.k); -+ delete.k.p = old_nodes[i]->key.k.p; -+ bch2_keylist_add_in_order(&keylist, &delete); -+next: -+ i = i; -+ } -+ -+ /* -+ * Keys for the new nodes get inserted: bch2_btree_insert_keys() only -+ * does the lookup once and thus expects the keys to be in sorted order -+ * so we have to make sure the new keys are correctly ordered with -+ * respect to the deleted keys added in the previous loop -+ */ -+ for (i = 0; i < nr_new_nodes; i++) -+ bch2_keylist_add_in_order(&keylist, &new_nodes[i]->key); -+ -+ /* Insert the newly coalesced nodes */ -+ bch2_btree_insert_node(as, parent, iter, &keylist, 0); -+ -+ BUG_ON(!bch2_keylist_empty(&keylist)); -+ -+ BUG_ON(iter->l[old_nodes[0]->c.level].b != old_nodes[0]); -+ -+ bch2_btree_iter_node_replace(iter, new_nodes[0]); -+ -+ for (i = 0; i < nr_new_nodes; i++) -+ bch2_btree_update_get_open_buckets(as, new_nodes[i]); -+ -+ /* Free the old nodes and update our sliding window */ -+ for (i = 0; i < nr_old_nodes; i++) { -+ bch2_btree_node_free_inmem(c, old_nodes[i], iter); -+ -+ /* -+ * the index update might have triggered a split, in which case -+ * the nodes we coalesced - the new nodes we just created - -+ * might not be sibling nodes anymore - don't add them to the -+ * sliding window (except the first): -+ */ -+ if (!i) { -+ old_nodes[i] = new_nodes[i]; -+ } else { -+ old_nodes[i] = NULL; -+ } -+ } -+ -+ for (i = 0; i < nr_new_nodes; i++) -+ 
six_unlock_intent(&new_nodes[i]->c.lock); -+ -+ bch2_btree_update_done(as); -+ bch2_keylist_free(&keylist, NULL); -+} -+ -+static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct btree *b; -+ bool kthread = (current->flags & PF_KTHREAD) != 0; -+ unsigned i; -+ -+ /* Sliding window of adjacent btree nodes */ -+ struct btree *merge[GC_MERGE_NODES]; -+ u32 lock_seq[GC_MERGE_NODES]; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ /* -+ * XXX: We don't have a good way of positively matching on sibling nodes -+ * that have the same parent - this code works by handling the cases -+ * where they might not have the same parent, and is thus fragile. Ugh. -+ * -+ * Perhaps redo this to use multiple linked iterators? -+ */ -+ memset(merge, 0, sizeof(merge)); -+ -+ __for_each_btree_node(&trans, iter, btree_id, POS_MIN, -+ BTREE_MAX_DEPTH, 0, -+ BTREE_ITER_PREFETCH, b) { -+ memmove(merge + 1, merge, -+ sizeof(merge) - sizeof(merge[0])); -+ memmove(lock_seq + 1, lock_seq, -+ sizeof(lock_seq) - sizeof(lock_seq[0])); -+ -+ merge[0] = b; -+ -+ for (i = 1; i < GC_MERGE_NODES; i++) { -+ if (!merge[i] || -+ !six_relock_intent(&merge[i]->c.lock, lock_seq[i])) -+ break; -+ -+ if (merge[i]->c.level != merge[0]->c.level) { -+ six_unlock_intent(&merge[i]->c.lock); -+ break; -+ } -+ } -+ memset(merge + i, 0, (GC_MERGE_NODES - i) * sizeof(merge[0])); -+ -+ bch2_coalesce_nodes(c, iter, merge); -+ -+ for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) { -+ lock_seq[i] = merge[i]->c.lock.state.seq; -+ six_unlock_intent(&merge[i]->c.lock); -+ } -+ -+ lock_seq[0] = merge[0]->c.lock.state.seq; -+ -+ if (kthread && kthread_should_stop()) { -+ bch2_trans_exit(&trans); -+ return -ESHUTDOWN; -+ } -+ -+ bch2_trans_cond_resched(&trans); -+ -+ /* -+ * If the parent node wasn't relocked, it might have been split -+ * and the nodes in our sliding window might not have the same -+ * parent anymore - blow away the sliding 
window: -+ */ -+ if (btree_iter_node(iter, iter->level + 1) && -+ !btree_node_intent_locked(iter, iter->level + 1)) -+ memset(merge + 1, 0, -+ (GC_MERGE_NODES - 1) * sizeof(merge[0])); -+ } -+ return bch2_trans_exit(&trans); -+} -+ -+/** -+ * bch_coalesce - coalesce adjacent nodes with low occupancy -+ */ -+void bch2_coalesce(struct bch_fs *c) -+{ -+ enum btree_id id; -+ -+ down_read(&c->gc_lock); -+ trace_gc_coalesce_start(c); -+ -+ for (id = 0; id < BTREE_ID_NR; id++) { -+ int ret = c->btree_roots[id].b -+ ? bch2_coalesce_btree(c, id) -+ : 0; -+ -+ if (ret) { -+ if (ret != -ESHUTDOWN) -+ bch_err(c, "btree coalescing failed: %d", ret); -+ return; -+ } -+ } -+ -+ trace_gc_coalesce_end(c); -+ up_read(&c->gc_lock); -+} -+ -+static int bch2_gc_thread(void *arg) -+{ -+ struct bch_fs *c = arg; -+ struct io_clock *clock = &c->io_clock[WRITE]; -+ unsigned long last = atomic_long_read(&clock->now); -+ unsigned last_kick = atomic_read(&c->kick_gc); -+ int ret; -+ -+ set_freezable(); -+ -+ while (1) { -+ while (1) { -+ set_current_state(TASK_INTERRUPTIBLE); -+ -+ if (kthread_should_stop()) { -+ __set_current_state(TASK_RUNNING); -+ return 0; -+ } -+ -+ if (atomic_read(&c->kick_gc) != last_kick) -+ break; -+ -+ if (c->btree_gc_periodic) { -+ unsigned long next = last + c->capacity / 16; -+ -+ if (atomic_long_read(&clock->now) >= next) -+ break; -+ -+ bch2_io_clock_schedule_timeout(clock, next); -+ } else { -+ schedule(); -+ } -+ -+ try_to_freeze(); -+ } -+ __set_current_state(TASK_RUNNING); -+ -+ last = atomic_long_read(&clock->now); -+ last_kick = atomic_read(&c->kick_gc); -+ -+ /* -+ * Full gc is currently incompatible with btree key cache: -+ */ -+#if 0 -+ ret = bch2_gc(c, NULL, false, false); -+#else -+ ret = bch2_gc_gens(c); -+#endif -+ if (ret) -+ bch_err(c, "btree gc failed: %i", ret); -+ -+ debug_check_no_locks_held(); -+ } -+ -+ return 0; -+} -+ -+void bch2_gc_thread_stop(struct bch_fs *c) -+{ -+ struct task_struct *p; -+ -+ p = c->gc_thread; -+ c->gc_thread = NULL; 
-+ -+ if (p) { -+ kthread_stop(p); -+ put_task_struct(p); -+ } -+} -+ -+int bch2_gc_thread_start(struct bch_fs *c) -+{ -+ struct task_struct *p; -+ -+ BUG_ON(c->gc_thread); -+ -+ p = kthread_create(bch2_gc_thread, c, "bch_gc"); -+ if (IS_ERR(p)) -+ return PTR_ERR(p); -+ -+ get_task_struct(p); -+ c->gc_thread = p; -+ wake_up_process(p); -+ return 0; -+} -diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h -new file mode 100644 -index 000000000000..3694a3df62a8 ---- /dev/null -+++ b/fs/bcachefs/btree_gc.h -@@ -0,0 +1,121 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BTREE_GC_H -+#define _BCACHEFS_BTREE_GC_H -+ -+#include "btree_types.h" -+ -+void bch2_coalesce(struct bch_fs *); -+ -+struct journal_keys; -+int bch2_gc(struct bch_fs *, struct journal_keys *, bool, bool); -+int bch2_gc_gens(struct bch_fs *); -+void bch2_gc_thread_stop(struct bch_fs *); -+int bch2_gc_thread_start(struct bch_fs *); -+void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned); -+ -+/* -+ * For concurrent mark and sweep (with other index updates), we define a total -+ * ordering of _all_ references GC walks: -+ * -+ * Note that some references will have the same GC position as others - e.g. -+ * everything within the same btree node; in those cases we're relying on -+ * whatever locking exists for where those references live, i.e. the write lock -+ * on a btree node. -+ * -+ * That locking is also required to ensure GC doesn't pass the updater in -+ * between the updater adding/removing the reference and updating the GC marks; -+ * without that, we would at best double count sometimes. -+ * -+ * That part is important - whenever calling bch2_mark_pointers(), a lock _must_ -+ * be held that prevents GC from passing the position the updater is at. -+ * -+ * (What about the start of gc, when we're clearing all the marks? 
GC clears the -+ * mark with the gc pos seqlock held, and bch_mark_bucket checks against the gc -+ * position inside its cmpxchg loop, so crap magically works). -+ */ -+ -+/* Position of (the start of) a gc phase: */ -+static inline struct gc_pos gc_phase(enum gc_phase phase) -+{ -+ return (struct gc_pos) { -+ .phase = phase, -+ .pos = POS_MIN, -+ .level = 0, -+ }; -+} -+ -+static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) -+{ -+ if (l.phase != r.phase) -+ return l.phase < r.phase ? -1 : 1; -+ if (bkey_cmp(l.pos, r.pos)) -+ return bkey_cmp(l.pos, r.pos); -+ if (l.level != r.level) -+ return l.level < r.level ? -1 : 1; -+ return 0; -+} -+ -+static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id) -+{ -+ switch (id) { -+#define x(n, v, s) case BTREE_ID_##n: return GC_PHASE_BTREE_##n; -+ BCH_BTREE_IDS() -+#undef x -+ default: -+ BUG(); -+ } -+} -+ -+static inline struct gc_pos gc_pos_btree(enum btree_id id, -+ struct bpos pos, unsigned level) -+{ -+ return (struct gc_pos) { -+ .phase = btree_id_to_gc_phase(id), -+ .pos = pos, -+ .level = level, -+ }; -+} -+ -+/* -+ * GC position of the pointers within a btree node: note, _not_ for &b->key -+ * itself, that lives in the parent node: -+ */ -+static inline struct gc_pos gc_pos_btree_node(struct btree *b) -+{ -+ return gc_pos_btree(b->c.btree_id, b->key.k.p, b->c.level); -+} -+ -+/* -+ * GC position of the pointer to a btree root: we don't use -+ * gc_pos_pointer_to_btree_node() here to avoid a potential race with -+ * btree_split() increasing the tree depth - the new root will have level > the -+ * old root and thus have a greater gc position than the old root, but that -+ * would be incorrect since once gc has marked the root it's not coming back. 
-+ */ -+static inline struct gc_pos gc_pos_btree_root(enum btree_id id) -+{ -+ return gc_pos_btree(id, POS_MAX, BTREE_MAX_DEPTH); -+} -+ -+static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *ob) -+{ -+ return (struct gc_pos) { -+ .phase = GC_PHASE_ALLOC, -+ .pos = POS(ob ? ob - c->open_buckets : 0, 0), -+ }; -+} -+ -+static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) -+{ -+ unsigned seq; -+ bool ret; -+ -+ do { -+ seq = read_seqcount_begin(&c->gc_pos_lock); -+ ret = gc_pos_cmp(pos, c->gc_pos) <= 0; -+ } while (read_seqcount_retry(&c->gc_pos_lock, seq)); -+ -+ return ret; -+} -+ -+#endif /* _BCACHEFS_BTREE_GC_H */ -diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c -new file mode 100644 -index 000000000000..d2c28eb75bde ---- /dev/null -+++ b/fs/bcachefs/btree_io.c -@@ -0,0 +1,1868 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "bkey_methods.h" -+#include "bkey_sort.h" -+#include "btree_cache.h" -+#include "btree_io.h" -+#include "btree_iter.h" -+#include "btree_locking.h" -+#include "btree_update.h" -+#include "btree_update_interior.h" -+#include "buckets.h" -+#include "checksum.h" -+#include "debug.h" -+#include "error.h" -+#include "extents.h" -+#include "io.h" -+#include "journal_reclaim.h" -+#include "journal_seq_blacklist.h" -+#include "super-io.h" -+ -+#include -+#include -+ -+static void verify_no_dups(struct btree *b, -+ struct bkey_packed *start, -+ struct bkey_packed *end, -+ bool extents) -+{ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ struct bkey_packed *k, *p; -+ -+ if (start == end) -+ return; -+ -+ for (p = start, k = bkey_next_skip_noops(start, end); -+ k != end; -+ p = k, k = bkey_next_skip_noops(k, end)) { -+ struct bkey l = bkey_unpack_key(b, p); -+ struct bkey r = bkey_unpack_key(b, k); -+ -+ BUG_ON(extents -+ ? 
bkey_cmp(l.p, bkey_start_pos(&r)) > 0 -+ : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0); -+ //BUG_ON(bkey_cmp_packed(&b->format, p, k) >= 0); -+ } -+#endif -+} -+ -+static void set_needs_whiteout(struct bset *i, int v) -+{ -+ struct bkey_packed *k; -+ -+ for (k = i->start; -+ k != vstruct_last(i); -+ k = bkey_next_skip_noops(k, vstruct_last(i))) -+ k->needs_whiteout = v; -+} -+ -+static void btree_bounce_free(struct bch_fs *c, unsigned order, -+ bool used_mempool, void *p) -+{ -+ if (used_mempool) -+ mempool_free(p, &c->btree_bounce_pool); -+ else -+ vpfree(p, PAGE_SIZE << order); -+} -+ -+static void *btree_bounce_alloc(struct bch_fs *c, unsigned order, -+ bool *used_mempool) -+{ -+ unsigned flags = memalloc_nofs_save(); -+ void *p; -+ -+ BUG_ON(order > btree_page_order(c)); -+ -+ *used_mempool = false; -+ p = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT, order); -+ if (!p) { -+ *used_mempool = true; -+ p = mempool_alloc(&c->btree_bounce_pool, GFP_NOIO); -+ } -+ memalloc_nofs_restore(flags); -+ return p; -+} -+ -+static void sort_bkey_ptrs(const struct btree *bt, -+ struct bkey_packed **ptrs, unsigned nr) -+{ -+ unsigned n = nr, a = nr / 2, b, c, d; -+ -+ if (!a) -+ return; -+ -+ /* Heap sort: see lib/sort.c: */ -+ while (1) { -+ if (a) -+ a--; -+ else if (--n) -+ swap(ptrs[0], ptrs[n]); -+ else -+ break; -+ -+ for (b = a; c = 2 * b + 1, (d = c + 1) < n;) -+ b = bkey_cmp_packed(bt, -+ ptrs[c], -+ ptrs[d]) >= 0 ? 
c : d; -+ if (d == n) -+ b = c; -+ -+ while (b != a && -+ bkey_cmp_packed(bt, -+ ptrs[a], -+ ptrs[b]) >= 0) -+ b = (b - 1) / 2; -+ c = b; -+ while (b != a) { -+ b = (b - 1) / 2; -+ swap(ptrs[b], ptrs[c]); -+ } -+ } -+} -+ -+static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) -+{ -+ struct bkey_packed *new_whiteouts, **ptrs, **ptrs_end, *k; -+ bool used_mempool = false; -+ unsigned order; -+ -+ if (!b->whiteout_u64s) -+ return; -+ -+ order = get_order(b->whiteout_u64s * sizeof(u64)); -+ -+ new_whiteouts = btree_bounce_alloc(c, order, &used_mempool); -+ -+ ptrs = ptrs_end = ((void *) new_whiteouts + (PAGE_SIZE << order)); -+ -+ for (k = unwritten_whiteouts_start(c, b); -+ k != unwritten_whiteouts_end(c, b); -+ k = bkey_next(k)) -+ *--ptrs = k; -+ -+ sort_bkey_ptrs(b, ptrs, ptrs_end - ptrs); -+ -+ k = new_whiteouts; -+ -+ while (ptrs != ptrs_end) { -+ bkey_copy(k, *ptrs); -+ k = bkey_next(k); -+ ptrs++; -+ } -+ -+ verify_no_dups(b, new_whiteouts, -+ (void *) ((u64 *) new_whiteouts + b->whiteout_u64s), -+ btree_node_old_extent_overwrite(b)); -+ -+ memcpy_u64s(unwritten_whiteouts_start(c, b), -+ new_whiteouts, b->whiteout_u64s); -+ -+ btree_bounce_free(c, order, used_mempool, new_whiteouts); -+} -+ -+static bool should_compact_bset(struct btree *b, struct bset_tree *t, -+ bool compacting, enum compact_mode mode) -+{ -+ if (!bset_dead_u64s(b, t)) -+ return false; -+ -+ switch (mode) { -+ case COMPACT_LAZY: -+ return should_compact_bset_lazy(b, t) || -+ (compacting && !bset_written(b, bset(b, t))); -+ case COMPACT_ALL: -+ return true; -+ default: -+ BUG(); -+ } -+} -+ -+static bool bch2_compact_extent_whiteouts(struct bch_fs *c, -+ struct btree *b, -+ enum compact_mode mode) -+{ -+ const struct bkey_format *f = &b->format; -+ struct bset_tree *t; -+ struct bkey_packed *whiteouts = NULL; -+ struct bkey_packed *u_start, *u_pos; -+ struct sort_iter sort_iter; -+ unsigned order, whiteout_u64s = 0, u64s; -+ bool used_mempool, compacting = false; -+ -+ 
BUG_ON(!btree_node_is_extents(b)); -+ -+ for_each_bset(b, t) -+ if (should_compact_bset(b, t, whiteout_u64s != 0, mode)) -+ whiteout_u64s += bset_dead_u64s(b, t); -+ -+ if (!whiteout_u64s) -+ return false; -+ -+ bch2_sort_whiteouts(c, b); -+ -+ sort_iter_init(&sort_iter, b); -+ -+ whiteout_u64s += b->whiteout_u64s; -+ order = get_order(whiteout_u64s * sizeof(u64)); -+ -+ whiteouts = btree_bounce_alloc(c, order, &used_mempool); -+ u_start = u_pos = whiteouts; -+ -+ memcpy_u64s(u_pos, unwritten_whiteouts_start(c, b), -+ b->whiteout_u64s); -+ u_pos = (void *) u_pos + b->whiteout_u64s * sizeof(u64); -+ -+ sort_iter_add(&sort_iter, u_start, u_pos); -+ -+ for_each_bset(b, t) { -+ struct bset *i = bset(b, t); -+ struct bkey_packed *k, *n, *out, *start, *end; -+ struct btree_node_entry *src = NULL, *dst = NULL; -+ -+ if (t != b->set && !bset_written(b, i)) { -+ src = container_of(i, struct btree_node_entry, keys); -+ dst = max(write_block(b), -+ (void *) btree_bkey_last(b, t - 1)); -+ } -+ -+ if (src != dst) -+ compacting = true; -+ -+ if (!should_compact_bset(b, t, compacting, mode)) { -+ if (src != dst) { -+ memmove(dst, src, sizeof(*src) + -+ le16_to_cpu(src->keys.u64s) * -+ sizeof(u64)); -+ i = &dst->keys; -+ set_btree_bset(b, t, i); -+ } -+ continue; -+ } -+ -+ compacting = true; -+ u_start = u_pos; -+ start = i->start; -+ end = vstruct_last(i); -+ -+ if (src != dst) { -+ memmove(dst, src, sizeof(*src)); -+ i = &dst->keys; -+ set_btree_bset(b, t, i); -+ } -+ -+ out = i->start; -+ -+ for (k = start; k != end; k = n) { -+ n = bkey_next_skip_noops(k, end); -+ -+ if (bkey_deleted(k)) -+ continue; -+ -+ BUG_ON(bkey_whiteout(k) && -+ k->needs_whiteout && -+ bkey_written(b, k)); -+ -+ if (bkey_whiteout(k) && !k->needs_whiteout) -+ continue; -+ -+ if (bkey_whiteout(k)) { -+ memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k)); -+ set_bkeyp_val_u64s(f, u_pos, 0); -+ u_pos = bkey_next(u_pos); -+ } else { -+ bkey_copy(out, k); -+ out = bkey_next(out); -+ } -+ } -+ -+ 
sort_iter_add(&sort_iter, u_start, u_pos); -+ -+ i->u64s = cpu_to_le16((u64 *) out - i->_data); -+ set_btree_bset_end(b, t); -+ bch2_bset_set_no_aux_tree(b, t); -+ } -+ -+ b->whiteout_u64s = (u64 *) u_pos - (u64 *) whiteouts; -+ -+ BUG_ON((void *) unwritten_whiteouts_start(c, b) < -+ (void *) btree_bkey_last(b, bset_tree_last(b))); -+ -+ u64s = bch2_sort_extent_whiteouts(unwritten_whiteouts_start(c, b), -+ &sort_iter); -+ -+ BUG_ON(u64s > b->whiteout_u64s); -+ BUG_ON(u_pos != whiteouts && !u64s); -+ -+ if (u64s != b->whiteout_u64s) { -+ void *src = unwritten_whiteouts_start(c, b); -+ -+ b->whiteout_u64s = u64s; -+ memmove_u64s_up(unwritten_whiteouts_start(c, b), src, u64s); -+ } -+ -+ verify_no_dups(b, -+ unwritten_whiteouts_start(c, b), -+ unwritten_whiteouts_end(c, b), -+ true); -+ -+ btree_bounce_free(c, order, used_mempool, whiteouts); -+ -+ bch2_btree_build_aux_trees(b); -+ -+ bch_btree_keys_u64s_remaining(c, b); -+ bch2_verify_btree_nr_keys(b); -+ -+ return true; -+} -+ -+static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) -+{ -+ struct bset_tree *t; -+ bool ret = false; -+ -+ for_each_bset(b, t) { -+ struct bset *i = bset(b, t); -+ struct bkey_packed *k, *n, *out, *start, *end; -+ struct btree_node_entry *src = NULL, *dst = NULL; -+ -+ if (t != b->set && !bset_written(b, i)) { -+ src = container_of(i, struct btree_node_entry, keys); -+ dst = max(write_block(b), -+ (void *) btree_bkey_last(b, t - 1)); -+ } -+ -+ if (src != dst) -+ ret = true; -+ -+ if (!should_compact_bset(b, t, ret, mode)) { -+ if (src != dst) { -+ memmove(dst, src, sizeof(*src) + -+ le16_to_cpu(src->keys.u64s) * -+ sizeof(u64)); -+ i = &dst->keys; -+ set_btree_bset(b, t, i); -+ } -+ continue; -+ } -+ -+ start = btree_bkey_first(b, t); -+ end = btree_bkey_last(b, t); -+ -+ if (src != dst) { -+ memmove(dst, src, sizeof(*src)); -+ i = &dst->keys; -+ set_btree_bset(b, t, i); -+ } -+ -+ out = i->start; -+ -+ for (k = start; k != end; k = n) { -+ n = bkey_next_skip_noops(k, 
end); -+ -+ if (!bkey_whiteout(k)) { -+ bkey_copy(out, k); -+ out = bkey_next(out); -+ } else { -+ BUG_ON(k->needs_whiteout); -+ } -+ } -+ -+ i->u64s = cpu_to_le16((u64 *) out - i->_data); -+ set_btree_bset_end(b, t); -+ bch2_bset_set_no_aux_tree(b, t); -+ ret = true; -+ } -+ -+ bch2_verify_btree_nr_keys(b); -+ -+ bch2_btree_build_aux_trees(b); -+ -+ return ret; -+} -+ -+bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, -+ enum compact_mode mode) -+{ -+ return !btree_node_old_extent_overwrite(b) -+ ? bch2_drop_whiteouts(b, mode) -+ : bch2_compact_extent_whiteouts(c, b, mode); -+} -+ -+static void btree_node_sort(struct bch_fs *c, struct btree *b, -+ struct btree_iter *iter, -+ unsigned start_idx, -+ unsigned end_idx, -+ bool filter_whiteouts) -+{ -+ struct btree_node *out; -+ struct sort_iter sort_iter; -+ struct bset_tree *t; -+ struct bset *start_bset = bset(b, &b->set[start_idx]); -+ bool used_mempool = false; -+ u64 start_time, seq = 0; -+ unsigned i, u64s = 0, order, shift = end_idx - start_idx - 1; -+ bool sorting_entire_node = start_idx == 0 && -+ end_idx == b->nsets; -+ -+ sort_iter_init(&sort_iter, b); -+ -+ for (t = b->set + start_idx; -+ t < b->set + end_idx; -+ t++) { -+ u64s += le16_to_cpu(bset(b, t)->u64s); -+ sort_iter_add(&sort_iter, -+ btree_bkey_first(b, t), -+ btree_bkey_last(b, t)); -+ } -+ -+ order = sorting_entire_node -+ ? btree_page_order(c) -+ : get_order(__vstruct_bytes(struct btree_node, u64s)); -+ -+ out = btree_bounce_alloc(c, order, &used_mempool); -+ -+ start_time = local_clock(); -+ -+ if (btree_node_old_extent_overwrite(b)) -+ filter_whiteouts = bset_written(b, start_bset); -+ -+ u64s = (btree_node_old_extent_overwrite(b) -+ ? 
bch2_sort_extents -+ : bch2_sort_keys)(out->keys.start, -+ &sort_iter, -+ filter_whiteouts); -+ -+ out->keys.u64s = cpu_to_le16(u64s); -+ -+ BUG_ON(vstruct_end(&out->keys) > (void *) out + (PAGE_SIZE << order)); -+ -+ if (sorting_entire_node) -+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], -+ start_time); -+ -+ /* Make sure we preserve bset journal_seq: */ -+ for (t = b->set + start_idx; t < b->set + end_idx; t++) -+ seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq)); -+ start_bset->journal_seq = cpu_to_le64(seq); -+ -+ if (sorting_entire_node) { -+ unsigned u64s = le16_to_cpu(out->keys.u64s); -+ -+ BUG_ON(order != btree_page_order(c)); -+ -+ /* -+ * Our temporary buffer is the same size as the btree node's -+ * buffer, we can just swap buffers instead of doing a big -+ * memcpy() -+ */ -+ *out = *b->data; -+ out->keys.u64s = cpu_to_le16(u64s); -+ swap(out, b->data); -+ set_btree_bset(b, b->set, &b->data->keys); -+ } else { -+ start_bset->u64s = out->keys.u64s; -+ memcpy_u64s(start_bset->start, -+ out->keys.start, -+ le16_to_cpu(out->keys.u64s)); -+ } -+ -+ for (i = start_idx + 1; i < end_idx; i++) -+ b->nr.bset_u64s[start_idx] += -+ b->nr.bset_u64s[i]; -+ -+ b->nsets -= shift; -+ -+ for (i = start_idx + 1; i < b->nsets; i++) { -+ b->nr.bset_u64s[i] = b->nr.bset_u64s[i + shift]; -+ b->set[i] = b->set[i + shift]; -+ } -+ -+ for (i = b->nsets; i < MAX_BSETS; i++) -+ b->nr.bset_u64s[i] = 0; -+ -+ set_btree_bset_end(b, &b->set[start_idx]); -+ bch2_bset_set_no_aux_tree(b, &b->set[start_idx]); -+ -+ btree_bounce_free(c, order, used_mempool, out); -+ -+ bch2_verify_btree_nr_keys(b); -+} -+ -+void bch2_btree_sort_into(struct bch_fs *c, -+ struct btree *dst, -+ struct btree *src) -+{ -+ struct btree_nr_keys nr; -+ struct btree_node_iter src_iter; -+ u64 start_time = local_clock(); -+ -+ BUG_ON(dst->nsets != 1); -+ -+ bch2_bset_set_no_aux_tree(dst, dst->set); -+ -+ bch2_btree_node_iter_init_from_start(&src_iter, src); -+ -+ if (btree_node_is_extents(src)) -+ 
nr = bch2_sort_repack_merge(c, btree_bset_first(dst), -+ src, &src_iter, -+ &dst->format, -+ true); -+ else -+ nr = bch2_sort_repack(btree_bset_first(dst), -+ src, &src_iter, -+ &dst->format, -+ true); -+ -+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], -+ start_time); -+ -+ set_btree_bset_end(dst, dst->set); -+ -+ dst->nr.live_u64s += nr.live_u64s; -+ dst->nr.bset_u64s[0] += nr.bset_u64s[0]; -+ dst->nr.packed_keys += nr.packed_keys; -+ dst->nr.unpacked_keys += nr.unpacked_keys; -+ -+ bch2_verify_btree_nr_keys(dst); -+} -+ -+#define SORT_CRIT (4096 / sizeof(u64)) -+ -+/* -+ * We're about to add another bset to the btree node, so if there's currently -+ * too many bsets - sort some of them together: -+ */ -+static bool btree_node_compact(struct bch_fs *c, struct btree *b, -+ struct btree_iter *iter) -+{ -+ unsigned unwritten_idx; -+ bool ret = false; -+ -+ for (unwritten_idx = 0; -+ unwritten_idx < b->nsets; -+ unwritten_idx++) -+ if (!bset_written(b, bset(b, &b->set[unwritten_idx]))) -+ break; -+ -+ if (b->nsets - unwritten_idx > 1) { -+ btree_node_sort(c, b, iter, unwritten_idx, -+ b->nsets, false); -+ ret = true; -+ } -+ -+ if (unwritten_idx > 1) { -+ btree_node_sort(c, b, iter, 0, unwritten_idx, false); -+ ret = true; -+ } -+ -+ return ret; -+} -+ -+void bch2_btree_build_aux_trees(struct btree *b) -+{ -+ struct bset_tree *t; -+ -+ for_each_bset(b, t) -+ bch2_bset_build_aux_tree(b, t, -+ !bset_written(b, bset(b, t)) && -+ t == bset_tree_last(b)); -+} -+ -+/* -+ * @bch_btree_init_next - initialize a new (unwritten) bset that can then be -+ * inserted into -+ * -+ * Safe to call if there already is an unwritten bset - will only add a new bset -+ * if @b doesn't already have one. -+ * -+ * Returns true if we sorted (i.e. 
invalidated iterators -+ */ -+void bch2_btree_init_next(struct bch_fs *c, struct btree *b, -+ struct btree_iter *iter) -+{ -+ struct btree_node_entry *bne; -+ bool did_sort; -+ -+ EBUG_ON(!(b->c.lock.state.seq & 1)); -+ EBUG_ON(iter && iter->l[b->c.level].b != b); -+ -+ did_sort = btree_node_compact(c, b, iter); -+ -+ bne = want_new_bset(c, b); -+ if (bne) -+ bch2_bset_init_next(c, b, bne); -+ -+ bch2_btree_build_aux_trees(b); -+ -+ if (iter && did_sort) -+ bch2_btree_iter_reinit_node(iter, b); -+} -+ -+static struct nonce btree_nonce(struct bset *i, unsigned offset) -+{ -+ return (struct nonce) {{ -+ [0] = cpu_to_le32(offset), -+ [1] = ((__le32 *) &i->seq)[0], -+ [2] = ((__le32 *) &i->seq)[1], -+ [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE, -+ }}; -+} -+ -+static void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) -+{ -+ struct nonce nonce = btree_nonce(i, offset); -+ -+ if (!offset) { -+ struct btree_node *bn = container_of(i, struct btree_node, keys); -+ unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; -+ -+ bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags, -+ bytes); -+ -+ nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE)); -+ } -+ -+ bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, -+ vstruct_end(i) - (void *) i->_data); -+} -+ -+static void btree_err_msg(struct printbuf *out, struct bch_fs *c, -+ struct btree *b, struct bset *i, -+ unsigned offset, int write) -+{ -+ pr_buf(out, "error validating btree node %sat btree %u level %u/%u\n" -+ "pos ", -+ write ? 
"before write " : "", -+ b->c.btree_id, b->c.level, -+ c->btree_roots[b->c.btree_id].level); -+ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); -+ -+ pr_buf(out, " node offset %u", b->written); -+ if (i) -+ pr_buf(out, " bset u64s %u", le16_to_cpu(i->u64s)); -+} -+ -+enum btree_err_type { -+ BTREE_ERR_FIXABLE, -+ BTREE_ERR_WANT_RETRY, -+ BTREE_ERR_MUST_RETRY, -+ BTREE_ERR_FATAL, -+}; -+ -+enum btree_validate_ret { -+ BTREE_RETRY_READ = 64, -+}; -+ -+#define btree_err(type, c, b, i, msg, ...) \ -+({ \ -+ __label__ out; \ -+ char _buf[300]; \ -+ struct printbuf out = PBUF(_buf); \ -+ \ -+ btree_err_msg(&out, c, b, i, b->written, write); \ -+ pr_buf(&out, ": " msg, ##__VA_ARGS__); \ -+ \ -+ if (type == BTREE_ERR_FIXABLE && \ -+ write == READ && \ -+ !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \ -+ mustfix_fsck_err(c, "%s", _buf); \ -+ goto out; \ -+ } \ -+ \ -+ switch (write) { \ -+ case READ: \ -+ bch_err(c, "%s", _buf); \ -+ \ -+ switch (type) { \ -+ case BTREE_ERR_FIXABLE: \ -+ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ -+ goto fsck_err; \ -+ case BTREE_ERR_WANT_RETRY: \ -+ if (have_retry) { \ -+ ret = BTREE_RETRY_READ; \ -+ goto fsck_err; \ -+ } \ -+ break; \ -+ case BTREE_ERR_MUST_RETRY: \ -+ ret = BTREE_RETRY_READ; \ -+ goto fsck_err; \ -+ case BTREE_ERR_FATAL: \ -+ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ -+ goto fsck_err; \ -+ } \ -+ break; \ -+ case WRITE: \ -+ bch_err(c, "corrupt metadata before write: %s", _buf); \ -+ \ -+ if (bch2_fs_inconsistent(c)) { \ -+ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ -+ goto fsck_err; \ -+ } \ -+ break; \ -+ } \ -+out: \ -+ true; \ -+}) -+ -+#define btree_err_on(cond, ...) ((cond) ? 
btree_err(__VA_ARGS__) : false) -+ -+static int validate_bset(struct bch_fs *c, struct btree *b, -+ struct bset *i, unsigned sectors, -+ int write, bool have_retry) -+{ -+ unsigned version = le16_to_cpu(i->version); -+ const char *err; -+ int ret = 0; -+ -+ btree_err_on((version != BCH_BSET_VERSION_OLD && -+ version < bcachefs_metadata_version_min) || -+ version >= bcachefs_metadata_version_max, -+ BTREE_ERR_FATAL, c, b, i, -+ "unsupported bset version"); -+ -+ if (btree_err_on(b->written + sectors > c->opts.btree_node_size, -+ BTREE_ERR_FIXABLE, c, b, i, -+ "bset past end of btree node")) { -+ i->u64s = 0; -+ return 0; -+ } -+ -+ btree_err_on(b->written && !i->u64s, -+ BTREE_ERR_FIXABLE, c, b, i, -+ "empty bset"); -+ -+ if (!b->written) { -+ struct btree_node *bn = -+ container_of(i, struct btree_node, keys); -+ /* These indicate that we read the wrong btree node: */ -+ -+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { -+ struct bch_btree_ptr_v2 *bp = -+ &bkey_i_to_btree_ptr_v2(&b->key)->v; -+ -+ /* XXX endianness */ -+ btree_err_on(bp->seq != bn->keys.seq, -+ BTREE_ERR_MUST_RETRY, c, b, NULL, -+ "incorrect sequence number (wrong btree node)"); -+ } -+ -+ btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id, -+ BTREE_ERR_MUST_RETRY, c, b, i, -+ "incorrect btree id"); -+ -+ btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level, -+ BTREE_ERR_MUST_RETRY, c, b, i, -+ "incorrect level"); -+ -+ if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) { -+ u64 *p = (u64 *) &bn->ptr; -+ -+ *p = swab64(*p); -+ } -+ -+ if (!write) -+ compat_btree_node(b->c.level, b->c.btree_id, version, -+ BSET_BIG_ENDIAN(i), write, bn); -+ -+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { -+ struct bch_btree_ptr_v2 *bp = -+ &bkey_i_to_btree_ptr_v2(&b->key)->v; -+ -+ btree_err_on(bkey_cmp(b->data->min_key, bp->min_key), -+ BTREE_ERR_MUST_RETRY, c, b, NULL, -+ "incorrect min_key: got %llu:%llu should be %llu:%llu", -+ b->data->min_key.inode, -+ b->data->min_key.offset, -+ bp->min_key.inode, -+ bp->min_key.offset); -+ } 
-+ -+ btree_err_on(bkey_cmp(bn->max_key, b->key.k.p), -+ BTREE_ERR_MUST_RETRY, c, b, i, -+ "incorrect max key"); -+ -+ if (write) -+ compat_btree_node(b->c.level, b->c.btree_id, version, -+ BSET_BIG_ENDIAN(i), write, bn); -+ -+ /* XXX: ideally we would be validating min_key too */ -+#if 0 -+ /* -+ * not correct anymore, due to btree node write error -+ * handling -+ * -+ * need to add bn->seq to btree keys and verify -+ * against that -+ */ -+ btree_err_on(!extent_contains_ptr(bkey_i_to_s_c_extent(&b->key), -+ bn->ptr), -+ BTREE_ERR_FATAL, c, b, i, -+ "incorrect backpointer"); -+#endif -+ err = bch2_bkey_format_validate(&bn->format); -+ btree_err_on(err, -+ BTREE_ERR_FATAL, c, b, i, -+ "invalid bkey format: %s", err); -+ -+ compat_bformat(b->c.level, b->c.btree_id, version, -+ BSET_BIG_ENDIAN(i), write, -+ &bn->format); -+ } -+fsck_err: -+ return ret; -+} -+ -+static int validate_bset_keys(struct bch_fs *c, struct btree *b, -+ struct bset *i, unsigned *whiteout_u64s, -+ int write, bool have_retry) -+{ -+ unsigned version = le16_to_cpu(i->version); -+ struct bkey_packed *k, *prev = NULL; -+ bool seen_non_whiteout = false; -+ int ret = 0; -+ -+ if (!BSET_SEPARATE_WHITEOUTS(i)) { -+ seen_non_whiteout = true; -+ *whiteout_u64s = 0; -+ } -+ -+ for (k = i->start; -+ k != vstruct_last(i);) { -+ struct bkey_s u; -+ struct bkey tmp; -+ const char *invalid; -+ -+ if (btree_err_on(bkey_next(k) > vstruct_last(i), -+ BTREE_ERR_FIXABLE, c, b, i, -+ "key extends past end of bset")) { -+ i->u64s = cpu_to_le16((u64 *) k - i->_data); -+ break; -+ } -+ -+ if (btree_err_on(k->format > KEY_FORMAT_CURRENT, -+ BTREE_ERR_FIXABLE, c, b, i, -+ "invalid bkey format %u", k->format)) { -+ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); -+ memmove_u64s_down(k, bkey_next(k), -+ (u64 *) vstruct_end(i) - (u64 *) k); -+ continue; -+ } -+ -+ /* XXX: validate k->u64s */ -+ if (!write) -+ bch2_bkey_compat(b->c.level, b->c.btree_id, version, -+ BSET_BIG_ENDIAN(i), write, -+ &b->format, k); -+ -+ 
u = __bkey_disassemble(b, k, &tmp); -+ -+ invalid = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b)) ?: -+ bch2_bkey_in_btree_node(b, u.s_c) ?: -+ (write ? bch2_bkey_val_invalid(c, u.s_c) : NULL); -+ if (invalid) { -+ char buf[160]; -+ -+ bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); -+ btree_err(BTREE_ERR_FIXABLE, c, b, i, -+ "invalid bkey:\n%s\n%s", invalid, buf); -+ -+ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); -+ memmove_u64s_down(k, bkey_next(k), -+ (u64 *) vstruct_end(i) - (u64 *) k); -+ continue; -+ } -+ -+ if (write) -+ bch2_bkey_compat(b->c.level, b->c.btree_id, version, -+ BSET_BIG_ENDIAN(i), write, -+ &b->format, k); -+ -+ /* -+ * with the separate whiteouts thing (used for extents), the -+ * second set of keys actually can have whiteouts too, so we -+ * can't solely go off bkey_whiteout()... -+ */ -+ -+ if (!seen_non_whiteout && -+ (!bkey_whiteout(k) || -+ (prev && bkey_iter_cmp(b, prev, k) > 0))) { -+ *whiteout_u64s = k->_data - i->_data; -+ seen_non_whiteout = true; -+ } else if (prev && bkey_iter_cmp(b, prev, k) > 0) { -+ char buf1[80]; -+ char buf2[80]; -+ struct bkey up = bkey_unpack_key(b, prev); -+ -+ bch2_bkey_to_text(&PBUF(buf1), &up); -+ bch2_bkey_to_text(&PBUF(buf2), u.k); -+ -+ bch2_dump_bset(c, b, i, 0); -+ btree_err(BTREE_ERR_FATAL, c, b, i, -+ "keys out of order: %s > %s", -+ buf1, buf2); -+ /* XXX: repair this */ -+ } -+ -+ prev = k; -+ k = bkey_next_skip_noops(k, vstruct_last(i)); -+ } -+fsck_err: -+ return ret; -+} -+ -+int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry) -+{ -+ struct btree_node_entry *bne; -+ struct sort_iter *iter; -+ struct btree_node *sorted; -+ struct bkey_packed *k; -+ struct bch_extent_ptr *ptr; -+ struct bset *i; -+ bool used_mempool, blacklisted; -+ unsigned u64s; -+ int ret, retry_read = 0, write = READ; -+ -+ iter = mempool_alloc(&c->fill_iter, GFP_NOIO); -+ sort_iter_init(iter, b); -+ iter->size = (btree_blocks(c) + 1) * 2; -+ -+ if (bch2_meta_read_fault("btree")) 
-+ btree_err(BTREE_ERR_MUST_RETRY, c, b, NULL, -+ "dynamic fault"); -+ -+ btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c), -+ BTREE_ERR_MUST_RETRY, c, b, NULL, -+ "bad magic"); -+ -+ btree_err_on(!b->data->keys.seq, -+ BTREE_ERR_MUST_RETRY, c, b, NULL, -+ "bad btree header"); -+ -+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { -+ struct bch_btree_ptr_v2 *bp = -+ &bkey_i_to_btree_ptr_v2(&b->key)->v; -+ -+ btree_err_on(b->data->keys.seq != bp->seq, -+ BTREE_ERR_MUST_RETRY, c, b, NULL, -+ "got wrong btree node (seq %llx want %llx)", -+ b->data->keys.seq, bp->seq); -+ } -+ -+ while (b->written < c->opts.btree_node_size) { -+ unsigned sectors, whiteout_u64s = 0; -+ struct nonce nonce; -+ struct bch_csum csum; -+ bool first = !b->written; -+ -+ if (!b->written) { -+ i = &b->data->keys; -+ -+ btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), -+ BTREE_ERR_WANT_RETRY, c, b, i, -+ "unknown checksum type"); -+ -+ nonce = btree_nonce(i, b->written << 9); -+ csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); -+ -+ btree_err_on(bch2_crc_cmp(csum, b->data->csum), -+ BTREE_ERR_WANT_RETRY, c, b, i, -+ "invalid checksum"); -+ -+ bset_encrypt(c, i, b->written << 9); -+ -+ if (btree_node_is_extents(b) && -+ !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) { -+ set_btree_node_old_extent_overwrite(b); -+ set_btree_node_need_rewrite(b); -+ } -+ -+ sectors = vstruct_sectors(b->data, c->block_bits); -+ } else { -+ bne = write_block(b); -+ i = &bne->keys; -+ -+ if (i->seq != b->data->keys.seq) -+ break; -+ -+ btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), -+ BTREE_ERR_WANT_RETRY, c, b, i, -+ "unknown checksum type"); -+ -+ nonce = btree_nonce(i, b->written << 9); -+ csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); -+ -+ btree_err_on(bch2_crc_cmp(csum, bne->csum), -+ BTREE_ERR_WANT_RETRY, c, b, i, -+ "invalid checksum"); -+ -+ bset_encrypt(c, i, b->written << 9); -+ -+ sectors = vstruct_sectors(bne, c->block_bits); -+ } -+ -+ ret = 
validate_bset(c, b, i, sectors, -+ READ, have_retry); -+ if (ret) -+ goto fsck_err; -+ -+ if (!b->written) -+ btree_node_set_format(b, b->data->format); -+ -+ ret = validate_bset_keys(c, b, i, &whiteout_u64s, -+ READ, have_retry); -+ if (ret) -+ goto fsck_err; -+ -+ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); -+ -+ b->written += sectors; -+ -+ blacklisted = bch2_journal_seq_is_blacklisted(c, -+ le64_to_cpu(i->journal_seq), -+ true); -+ -+ btree_err_on(blacklisted && first, -+ BTREE_ERR_FIXABLE, c, b, i, -+ "first btree node bset has blacklisted journal seq"); -+ if (blacklisted && !first) -+ continue; -+ -+ sort_iter_add(iter, i->start, -+ vstruct_idx(i, whiteout_u64s)); -+ -+ sort_iter_add(iter, -+ vstruct_idx(i, whiteout_u64s), -+ vstruct_last(i)); -+ } -+ -+ for (bne = write_block(b); -+ bset_byte_offset(b, bne) < btree_bytes(c); -+ bne = (void *) bne + block_bytes(c)) -+ btree_err_on(bne->keys.seq == b->data->keys.seq, -+ BTREE_ERR_WANT_RETRY, c, b, NULL, -+ "found bset signature after last bset"); -+ -+ sorted = btree_bounce_alloc(c, btree_page_order(c), &used_mempool); -+ sorted->keys.u64s = 0; -+ -+ set_btree_bset(b, b->set, &b->data->keys); -+ -+ b->nr = (btree_node_old_extent_overwrite(b) -+ ? 
bch2_extent_sort_fix_overlapping -+ : bch2_key_sort_fix_overlapping)(c, &sorted->keys, iter); -+ -+ u64s = le16_to_cpu(sorted->keys.u64s); -+ *sorted = *b->data; -+ sorted->keys.u64s = cpu_to_le16(u64s); -+ swap(sorted, b->data); -+ set_btree_bset(b, b->set, &b->data->keys); -+ b->nsets = 1; -+ -+ BUG_ON(b->nr.live_u64s != u64s); -+ -+ btree_bounce_free(c, btree_page_order(c), used_mempool, sorted); -+ -+ i = &b->data->keys; -+ for (k = i->start; k != vstruct_last(i);) { -+ struct bkey tmp; -+ struct bkey_s u = __bkey_disassemble(b, k, &tmp); -+ const char *invalid = bch2_bkey_val_invalid(c, u.s_c); -+ -+ if (invalid || -+ (inject_invalid_keys(c) && -+ !bversion_cmp(u.k->version, MAX_VERSION))) { -+ char buf[160]; -+ -+ bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); -+ btree_err(BTREE_ERR_FIXABLE, c, b, i, -+ "invalid bkey %s: %s", buf, invalid); -+ -+ btree_keys_account_key_drop(&b->nr, 0, k); -+ -+ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); -+ memmove_u64s_down(k, bkey_next(k), -+ (u64 *) vstruct_end(i) - (u64 *) k); -+ set_btree_bset_end(b, b->set); -+ continue; -+ } -+ -+ if (u.k->type == KEY_TYPE_btree_ptr_v2) { -+ struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u); -+ -+ bp.v->mem_ptr = 0; -+ } -+ -+ k = bkey_next_skip_noops(k, vstruct_last(i)); -+ } -+ -+ bch2_bset_build_aux_tree(b, b->set, false); -+ -+ set_needs_whiteout(btree_bset_first(b), true); -+ -+ btree_node_reset_sib_u64s(b); -+ -+ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ -+ if (ca->mi.state != BCH_MEMBER_STATE_RW) -+ set_btree_node_need_rewrite(b); -+ } -+out: -+ mempool_free(iter, &c->fill_iter); -+ return retry_read; -+fsck_err: -+ if (ret == BTREE_RETRY_READ) { -+ retry_read = 1; -+ } else { -+ bch2_inconsistent_error(c); -+ set_btree_node_read_error(b); -+ } -+ goto out; -+} -+ -+static void btree_node_read_work(struct work_struct *work) -+{ -+ struct btree_read_bio *rb = -+ container_of(work, 
struct btree_read_bio, work); -+ struct bch_fs *c = rb->c; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); -+ struct btree *b = rb->bio.bi_private; -+ struct bio *bio = &rb->bio; -+ struct bch_io_failures failed = { .nr = 0 }; -+ bool can_retry; -+ -+ goto start; -+ while (1) { -+ bch_info(c, "retrying read"); -+ ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); -+ rb->have_ioref = bch2_dev_get_ioref(ca, READ); -+ bio_reset(bio); -+ bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META; -+ bio->bi_iter.bi_sector = rb->pick.ptr.offset; -+ bio->bi_iter.bi_size = btree_bytes(c); -+ -+ if (rb->have_ioref) { -+ bio_set_dev(bio, ca->disk_sb.bdev); -+ submit_bio_wait(bio); -+ } else { -+ bio->bi_status = BLK_STS_REMOVED; -+ } -+start: -+ bch2_dev_io_err_on(bio->bi_status, ca, "btree read: %s", -+ blk_status_to_str(bio->bi_status)); -+ if (rb->have_ioref) -+ percpu_ref_put(&ca->io_ref); -+ rb->have_ioref = false; -+ -+ bch2_mark_io_failure(&failed, &rb->pick); -+ -+ can_retry = bch2_bkey_pick_read_device(c, -+ bkey_i_to_s_c(&b->key), -+ &failed, &rb->pick) > 0; -+ -+ if (!bio->bi_status && -+ !bch2_btree_node_read_done(c, b, can_retry)) -+ break; -+ -+ if (!can_retry) { -+ set_btree_node_read_error(b); -+ break; -+ } -+ } -+ -+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], -+ rb->start_time); -+ bio_put(&rb->bio); -+ clear_btree_node_read_in_flight(b); -+ wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); -+} -+ -+static void btree_node_read_endio(struct bio *bio) -+{ -+ struct btree_read_bio *rb = -+ container_of(bio, struct btree_read_bio, bio); -+ struct bch_fs *c = rb->c; -+ -+ if (rb->have_ioref) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); -+ bch2_latency_acct(ca, rb->start_time, READ); -+ } -+ -+ queue_work(system_unbound_wq, &rb->work); -+} -+ -+void bch2_btree_node_read(struct bch_fs *c, struct btree *b, -+ bool sync) -+{ -+ struct extent_ptr_decoded pick; -+ struct btree_read_bio *rb; -+ struct bch_dev *ca; -+ struct bio 
*bio; -+ int ret; -+ -+ trace_btree_read(c, b); -+ -+ ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), -+ NULL, &pick); -+ if (bch2_fs_fatal_err_on(ret <= 0, c, -+ "btree node read error: no device to read from")) { -+ set_btree_node_read_error(b); -+ return; -+ } -+ -+ ca = bch_dev_bkey_exists(c, pick.ptr.dev); -+ -+ bio = bio_alloc_bioset(GFP_NOIO, buf_pages(b->data, -+ btree_bytes(c)), -+ &c->btree_bio); -+ rb = container_of(bio, struct btree_read_bio, bio); -+ rb->c = c; -+ rb->start_time = local_clock(); -+ rb->have_ioref = bch2_dev_get_ioref(ca, READ); -+ rb->pick = pick; -+ INIT_WORK(&rb->work, btree_node_read_work); -+ bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META; -+ bio->bi_iter.bi_sector = pick.ptr.offset; -+ bio->bi_end_io = btree_node_read_endio; -+ bio->bi_private = b; -+ bch2_bio_map(bio, b->data, btree_bytes(c)); -+ -+ set_btree_node_read_in_flight(b); -+ -+ if (rb->have_ioref) { -+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_BTREE], -+ bio_sectors(bio)); -+ bio_set_dev(bio, ca->disk_sb.bdev); -+ -+ if (sync) { -+ submit_bio_wait(bio); -+ -+ bio->bi_private = b; -+ btree_node_read_work(&rb->work); -+ } else { -+ submit_bio(bio); -+ } -+ } else { -+ bio->bi_status = BLK_STS_REMOVED; -+ -+ if (sync) -+ btree_node_read_work(&rb->work); -+ else -+ queue_work(system_unbound_wq, &rb->work); -+ -+ } -+} -+ -+int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, -+ const struct bkey_i *k, unsigned level) -+{ -+ struct closure cl; -+ struct btree *b; -+ int ret; -+ -+ closure_init_stack(&cl); -+ -+ do { -+ ret = bch2_btree_cache_cannibalize_lock(c, &cl); -+ closure_sync(&cl); -+ } while (ret); -+ -+ b = bch2_btree_node_mem_alloc(c); -+ bch2_btree_cache_cannibalize_unlock(c); -+ -+ BUG_ON(IS_ERR(b)); -+ -+ bkey_copy(&b->key, k); -+ BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id)); -+ -+ bch2_btree_node_read(c, b, true); -+ -+ if (btree_node_read_error(b)) { -+ bch2_btree_node_hash_remove(&c->btree_cache, b); -+ -+ 
mutex_lock(&c->btree_cache.lock); -+ list_move(&b->list, &c->btree_cache.freeable); -+ mutex_unlock(&c->btree_cache.lock); -+ -+ ret = -EIO; -+ goto err; -+ } -+ -+ bch2_btree_set_root_for_read(c, b); -+err: -+ six_unlock_write(&b->c.lock); -+ six_unlock_intent(&b->c.lock); -+ -+ return ret; -+} -+ -+void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, -+ struct btree_write *w) -+{ -+ unsigned long old, new, v = READ_ONCE(b->will_make_reachable); -+ -+ do { -+ old = new = v; -+ if (!(old & 1)) -+ break; -+ -+ new &= ~1UL; -+ } while ((v = cmpxchg(&b->will_make_reachable, old, new)) != old); -+ -+ if (old & 1) -+ closure_put(&((struct btree_update *) new)->cl); -+ -+ bch2_journal_pin_drop(&c->journal, &w->journal); -+} -+ -+static void btree_node_write_done(struct bch_fs *c, struct btree *b) -+{ -+ struct btree_write *w = btree_prev_write(b); -+ -+ bch2_btree_complete_write(c, b, w); -+ btree_node_io_unlock(b); -+} -+ -+static void bch2_btree_node_write_error(struct bch_fs *c, -+ struct btree_write_bio *wbio) -+{ -+ struct btree *b = wbio->wbio.bio.bi_private; -+ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; -+ struct bch_extent_ptr *ptr; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_trans_get_node_iter(&trans, b->c.btree_id, b->key.k.p, -+ BTREE_MAX_DEPTH, b->c.level, 0); -+retry: -+ ret = bch2_btree_iter_traverse(iter); -+ if (ret) -+ goto err; -+ -+ /* has node been freed? 
*/ -+ if (iter->l[b->c.level].b != b) { -+ /* node has been freed: */ -+ BUG_ON(!btree_node_dying(b)); -+ goto out; -+ } -+ -+ BUG_ON(!btree_node_hashed(b)); -+ -+ bkey_copy(&tmp.k, &b->key); -+ -+ bch2_bkey_drop_ptrs(bkey_i_to_s(&tmp.k), ptr, -+ bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); -+ -+ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&tmp.k))) -+ goto err; -+ -+ ret = bch2_btree_node_update_key(c, iter, b, &tmp.k); -+ if (ret == -EINTR) -+ goto retry; -+ if (ret) -+ goto err; -+out: -+ bch2_trans_exit(&trans); -+ bio_put(&wbio->wbio.bio); -+ btree_node_write_done(c, b); -+ return; -+err: -+ set_btree_node_noevict(b); -+ bch2_fs_fatal_error(c, "fatal error writing btree node"); -+ goto out; -+} -+ -+void bch2_btree_write_error_work(struct work_struct *work) -+{ -+ struct bch_fs *c = container_of(work, struct bch_fs, -+ btree_write_error_work); -+ struct bio *bio; -+ -+ while (1) { -+ spin_lock_irq(&c->btree_write_error_lock); -+ bio = bio_list_pop(&c->btree_write_error_list); -+ spin_unlock_irq(&c->btree_write_error_lock); -+ -+ if (!bio) -+ break; -+ -+ bch2_btree_node_write_error(c, -+ container_of(bio, struct btree_write_bio, wbio.bio)); -+ } -+} -+ -+static void btree_node_write_work(struct work_struct *work) -+{ -+ struct btree_write_bio *wbio = -+ container_of(work, struct btree_write_bio, work); -+ struct bch_fs *c = wbio->wbio.c; -+ struct btree *b = wbio->wbio.bio.bi_private; -+ -+ btree_bounce_free(c, -+ wbio->wbio.order, -+ wbio->wbio.used_mempool, -+ wbio->data); -+ -+ if (wbio->wbio.failed.nr) { -+ unsigned long flags; -+ -+ spin_lock_irqsave(&c->btree_write_error_lock, flags); -+ bio_list_add(&c->btree_write_error_list, &wbio->wbio.bio); -+ spin_unlock_irqrestore(&c->btree_write_error_lock, flags); -+ -+ queue_work(c->wq, &c->btree_write_error_work); -+ return; -+ } -+ -+ bio_put(&wbio->wbio.bio); -+ btree_node_write_done(c, b); -+} -+ -+static void btree_node_write_endio(struct bio *bio) -+{ -+ struct bch_write_bio *wbio = to_wbio(bio); -+ 
struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL; -+ struct bch_write_bio *orig = parent ?: wbio; -+ struct bch_fs *c = wbio->c; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); -+ unsigned long flags; -+ -+ if (wbio->have_ioref) -+ bch2_latency_acct(ca, wbio->submit_time, WRITE); -+ -+ if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write: %s", -+ blk_status_to_str(bio->bi_status)) || -+ bch2_meta_write_fault("btree")) { -+ spin_lock_irqsave(&c->btree_write_error_lock, flags); -+ bch2_dev_list_add_dev(&orig->failed, wbio->dev); -+ spin_unlock_irqrestore(&c->btree_write_error_lock, flags); -+ } -+ -+ if (wbio->have_ioref) -+ percpu_ref_put(&ca->io_ref); -+ -+ if (parent) { -+ bio_put(bio); -+ bio_endio(&parent->bio); -+ } else { -+ struct btree_write_bio *wb = -+ container_of(orig, struct btree_write_bio, wbio); -+ -+ INIT_WORK(&wb->work, btree_node_write_work); -+ queue_work(system_unbound_wq, &wb->work); -+ } -+} -+ -+static int validate_bset_for_write(struct bch_fs *c, struct btree *b, -+ struct bset *i, unsigned sectors) -+{ -+ unsigned whiteout_u64s = 0; -+ int ret; -+ -+ if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_BTREE)) -+ return -1; -+ -+ ret = validate_bset(c, b, i, sectors, WRITE, false) ?: -+ validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false); -+ if (ret) -+ bch2_inconsistent_error(c); -+ -+ return ret; -+} -+ -+void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, -+ enum six_lock_type lock_type_held) -+{ -+ struct btree_write_bio *wbio; -+ struct bset_tree *t; -+ struct bset *i; -+ struct btree_node *bn = NULL; -+ struct btree_node_entry *bne = NULL; -+ BKEY_PADDED(key) k; -+ struct bch_extent_ptr *ptr; -+ struct sort_iter sort_iter; -+ struct nonce nonce; -+ unsigned bytes_to_write, sectors_to_write, order, bytes, u64s; -+ u64 seq = 0; -+ bool used_mempool; -+ unsigned long old, new; -+ bool validate_before_checksum = false; -+ void *data; -+ -+ if (test_bit(BCH_FS_HOLD_BTREE_WRITES, 
&c->flags)) -+ return; -+ -+ /* -+ * We may only have a read lock on the btree node - the dirty bit is our -+ * "lock" against racing with other threads that may be trying to start -+ * a write, we do a write iff we clear the dirty bit. Since setting the -+ * dirty bit requires a write lock, we can't race with other threads -+ * redirtying it: -+ */ -+ do { -+ old = new = READ_ONCE(b->flags); -+ -+ if (!(old & (1 << BTREE_NODE_dirty))) -+ return; -+ -+ if (!btree_node_may_write(b)) -+ return; -+ -+ if (old & (1 << BTREE_NODE_write_in_flight)) { -+ btree_node_wait_on_io(b); -+ continue; -+ } -+ -+ new &= ~(1 << BTREE_NODE_dirty); -+ new &= ~(1 << BTREE_NODE_need_write); -+ new |= (1 << BTREE_NODE_write_in_flight); -+ new |= (1 << BTREE_NODE_just_written); -+ new ^= (1 << BTREE_NODE_write_idx); -+ } while (cmpxchg_acquire(&b->flags, old, new) != old); -+ -+ BUG_ON(btree_node_fake(b)); -+ BUG_ON((b->will_make_reachable != 0) != !b->written); -+ -+ BUG_ON(b->written >= c->opts.btree_node_size); -+ BUG_ON(b->written & (c->opts.block_size - 1)); -+ BUG_ON(bset_written(b, btree_bset_last(b))); -+ BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c)); -+ BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format))); -+ -+ bch2_sort_whiteouts(c, b); -+ -+ sort_iter_init(&sort_iter, b); -+ -+ bytes = !b->written -+ ? 
sizeof(struct btree_node) -+ : sizeof(struct btree_node_entry); -+ -+ bytes += b->whiteout_u64s * sizeof(u64); -+ -+ for_each_bset(b, t) { -+ i = bset(b, t); -+ -+ if (bset_written(b, i)) -+ continue; -+ -+ bytes += le16_to_cpu(i->u64s) * sizeof(u64); -+ sort_iter_add(&sort_iter, -+ btree_bkey_first(b, t), -+ btree_bkey_last(b, t)); -+ seq = max(seq, le64_to_cpu(i->journal_seq)); -+ } -+ -+ order = get_order(bytes); -+ data = btree_bounce_alloc(c, order, &used_mempool); -+ -+ if (!b->written) { -+ bn = data; -+ *bn = *b->data; -+ i = &bn->keys; -+ } else { -+ bne = data; -+ bne->keys = b->data->keys; -+ i = &bne->keys; -+ } -+ -+ i->journal_seq = cpu_to_le64(seq); -+ i->u64s = 0; -+ -+ if (!btree_node_old_extent_overwrite(b)) { -+ sort_iter_add(&sort_iter, -+ unwritten_whiteouts_start(c, b), -+ unwritten_whiteouts_end(c, b)); -+ SET_BSET_SEPARATE_WHITEOUTS(i, false); -+ } else { -+ memcpy_u64s(i->start, -+ unwritten_whiteouts_start(c, b), -+ b->whiteout_u64s); -+ i->u64s = cpu_to_le16(b->whiteout_u64s); -+ SET_BSET_SEPARATE_WHITEOUTS(i, true); -+ } -+ -+ b->whiteout_u64s = 0; -+ -+ u64s = btree_node_old_extent_overwrite(b) -+ ? bch2_sort_extents(vstruct_last(i), &sort_iter, false) -+ : bch2_sort_keys(i->start, &sort_iter, false); -+ le16_add_cpu(&i->u64s, u64s); -+ -+ set_needs_whiteout(i, false); -+ -+ /* do we have data to write? */ -+ if (b->written && !i->u64s) -+ goto nowrite; -+ -+ bytes_to_write = vstruct_end(i) - data; -+ sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9; -+ -+ memset(data + bytes_to_write, 0, -+ (sectors_to_write << 9) - bytes_to_write); -+ -+ BUG_ON(b->written + sectors_to_write > c->opts.btree_node_size); -+ BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN); -+ BUG_ON(i->seq != b->data->keys.seq); -+ -+ i->version = c->sb.version < bcachefs_metadata_version_new_versioning -+ ? 
cpu_to_le16(BCH_BSET_VERSION_OLD) -+ : cpu_to_le16(c->sb.version); -+ SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c)); -+ -+ if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i))) -+ validate_before_checksum = true; -+ -+ /* validate_bset will be modifying: */ -+ if (le16_to_cpu(i->version) < bcachefs_metadata_version_max) -+ validate_before_checksum = true; -+ -+ /* if we're going to be encrypting, check metadata validity first: */ -+ if (validate_before_checksum && -+ validate_bset_for_write(c, b, i, sectors_to_write)) -+ goto err; -+ -+ bset_encrypt(c, i, b->written << 9); -+ -+ nonce = btree_nonce(i, b->written << 9); -+ -+ if (bn) -+ bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn); -+ else -+ bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); -+ -+ /* if we're not encrypting, check metadata after checksumming: */ -+ if (!validate_before_checksum && -+ validate_bset_for_write(c, b, i, sectors_to_write)) -+ goto err; -+ -+ /* -+ * We handle btree write errors by immediately halting the journal - -+ * after we've done that, we can't issue any subsequent btree writes -+ * because they might have pointers to new nodes that failed to write. 
-+ * -+ * Furthermore, there's no point in doing any more btree writes because -+ * with the journal stopped, we're never going to update the journal to -+ * reflect that those writes were done and the data flushed from the -+ * journal: -+ * -+ * Also on journal error, the pending write may have updates that were -+ * never journalled (interior nodes, see btree_update_nodes_written()) - -+ * it's critical that we don't do the write in that case otherwise we -+ * will have updates visible that weren't in the journal: -+ * -+ * Make sure to update b->written so bch2_btree_init_next() doesn't -+ * break: -+ */ -+ if (bch2_journal_error(&c->journal) || -+ c->opts.nochanges) -+ goto err; -+ -+ trace_btree_write(b, bytes_to_write, sectors_to_write); -+ -+ wbio = container_of(bio_alloc_bioset(GFP_NOIO, -+ buf_pages(data, sectors_to_write << 9), -+ &c->btree_bio), -+ struct btree_write_bio, wbio.bio); -+ wbio_init(&wbio->wbio.bio); -+ wbio->data = data; -+ wbio->wbio.order = order; -+ wbio->wbio.used_mempool = used_mempool; -+ wbio->wbio.bio.bi_opf = REQ_OP_WRITE|REQ_META; -+ wbio->wbio.bio.bi_end_io = btree_node_write_endio; -+ wbio->wbio.bio.bi_private = b; -+ -+ bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9); -+ -+ /* -+ * If we're appending to a leaf node, we don't technically need FUA - -+ * this write just needs to be persisted before the next journal write, -+ * which will be marked FLUSH|FUA. -+ * -+ * Similarly if we're writing a new btree root - the pointer is going to -+ * be in the next journal entry. -+ * -+ * But if we're writing a new btree node (that isn't a root) or -+ * appending to a non leaf btree node, we need either FUA or a flush -+ * when we write the parent with the new pointer. FUA is cheaper than a -+ * flush, and writes appending to leaf nodes aren't blocking anything so -+ * just make all btree node writes FUA to keep things sane. 
-+ */ -+ -+ bkey_copy(&k.key, &b->key); -+ -+ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&k.key)), ptr) -+ ptr->offset += b->written; -+ -+ b->written += sectors_to_write; -+ -+ /* XXX: submitting IO with btree locks held: */ -+ bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_BTREE, &k.key); -+ return; -+err: -+ set_btree_node_noevict(b); -+ b->written += sectors_to_write; -+nowrite: -+ btree_bounce_free(c, order, used_mempool, data); -+ btree_node_write_done(c, b); -+} -+ -+/* -+ * Work that must be done with write lock held: -+ */ -+bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) -+{ -+ bool invalidated_iter = false; -+ struct btree_node_entry *bne; -+ struct bset_tree *t; -+ -+ if (!btree_node_just_written(b)) -+ return false; -+ -+ BUG_ON(b->whiteout_u64s); -+ -+ clear_btree_node_just_written(b); -+ -+ /* -+ * Note: immediately after write, bset_written() doesn't work - the -+ * amount of data we had to write after compaction might have been -+ * smaller than the offset of the last bset. 
-+ * -+ * However, we know that all bsets have been written here, as long as -+ * we're still holding the write lock: -+ */ -+ -+ /* -+ * XXX: decide if we really want to unconditionally sort down to a -+ * single bset: -+ */ -+ if (b->nsets > 1) { -+ btree_node_sort(c, b, NULL, 0, b->nsets, true); -+ invalidated_iter = true; -+ } else { -+ invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL); -+ } -+ -+ for_each_bset(b, t) -+ set_needs_whiteout(bset(b, t), true); -+ -+ bch2_btree_verify(c, b); -+ -+ /* -+ * If later we don't unconditionally sort down to a single bset, we have -+ * to ensure this is still true: -+ */ -+ BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b)); -+ -+ bne = want_new_bset(c, b); -+ if (bne) -+ bch2_bset_init_next(c, b, bne); -+ -+ bch2_btree_build_aux_trees(b); -+ -+ return invalidated_iter; -+} -+ -+/* -+ * Use this one if the node is intent locked: -+ */ -+void bch2_btree_node_write(struct bch_fs *c, struct btree *b, -+ enum six_lock_type lock_type_held) -+{ -+ BUG_ON(lock_type_held == SIX_LOCK_write); -+ -+ if (lock_type_held == SIX_LOCK_intent || -+ six_lock_tryupgrade(&b->c.lock)) { -+ __bch2_btree_node_write(c, b, SIX_LOCK_intent); -+ -+ /* don't cycle lock unnecessarily: */ -+ if (btree_node_just_written(b) && -+ six_trylock_write(&b->c.lock)) { -+ bch2_btree_post_write_cleanup(c, b); -+ six_unlock_write(&b->c.lock); -+ } -+ -+ if (lock_type_held == SIX_LOCK_read) -+ six_lock_downgrade(&b->c.lock); -+ } else { -+ __bch2_btree_node_write(c, b, SIX_LOCK_read); -+ } -+} -+ -+static void __bch2_btree_flush_all(struct bch_fs *c, unsigned flag) -+{ -+ struct bucket_table *tbl; -+ struct rhash_head *pos; -+ struct btree *b; -+ unsigned i; -+restart: -+ rcu_read_lock(); -+ for_each_cached_btree(b, c, tbl, i, pos) -+ if (test_bit(flag, &b->flags)) { -+ rcu_read_unlock(); -+ wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE); -+ goto restart; -+ -+ } -+ rcu_read_unlock(); -+} -+ -+void 
bch2_btree_flush_all_reads(struct bch_fs *c) -+{ -+ __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight); -+} -+ -+void bch2_btree_flush_all_writes(struct bch_fs *c) -+{ -+ __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); -+} -+ -+void bch2_btree_verify_flushed(struct bch_fs *c) -+{ -+ struct bucket_table *tbl; -+ struct rhash_head *pos; -+ struct btree *b; -+ unsigned i; -+ -+ rcu_read_lock(); -+ for_each_cached_btree(b, c, tbl, i, pos) { -+ unsigned long flags = READ_ONCE(b->flags); -+ -+ BUG_ON((flags & (1 << BTREE_NODE_dirty)) || -+ (flags & (1 << BTREE_NODE_write_in_flight))); -+ } -+ rcu_read_unlock(); -+} -+ -+ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf) -+{ -+ struct printbuf out = _PBUF(buf, PAGE_SIZE); -+ struct bucket_table *tbl; -+ struct rhash_head *pos; -+ struct btree *b; -+ unsigned i; -+ -+ rcu_read_lock(); -+ for_each_cached_btree(b, c, tbl, i, pos) { -+ unsigned long flags = READ_ONCE(b->flags); -+ -+ if (!(flags & (1 << BTREE_NODE_dirty))) -+ continue; -+ -+ pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu\n", -+ b, -+ (flags & (1 << BTREE_NODE_dirty)) != 0, -+ (flags & (1 << BTREE_NODE_need_write)) != 0, -+ b->c.level, -+ b->written, -+ !list_empty_careful(&b->write_blocked), -+ b->will_make_reachable != 0, -+ b->will_make_reachable & 1); -+ } -+ rcu_read_unlock(); -+ -+ return out.pos - buf; -+} -diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h -new file mode 100644 -index 000000000000..f3d7ec749b61 ---- /dev/null -+++ b/fs/bcachefs/btree_io.h -@@ -0,0 +1,190 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BTREE_IO_H -+#define _BCACHEFS_BTREE_IO_H -+ -+#include "bkey_methods.h" -+#include "bset.h" -+#include "btree_locking.h" -+#include "extents.h" -+#include "io_types.h" -+ -+struct bch_fs; -+struct btree_write; -+struct btree; -+struct btree_iter; -+ -+struct btree_read_bio { -+ struct bch_fs *c; -+ u64 start_time; -+ unsigned have_ioref:1; -+ struct extent_ptr_decoded pick; -+ struct 
work_struct work; -+ struct bio bio; -+}; -+ -+struct btree_write_bio { -+ void *data; -+ struct work_struct work; -+ struct bch_write_bio wbio; -+}; -+ -+static inline void btree_node_io_unlock(struct btree *b) -+{ -+ EBUG_ON(!btree_node_write_in_flight(b)); -+ clear_btree_node_write_in_flight(b); -+ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); -+} -+ -+static inline void btree_node_io_lock(struct btree *b) -+{ -+ wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight, -+ TASK_UNINTERRUPTIBLE); -+} -+ -+static inline void btree_node_wait_on_io(struct btree *b) -+{ -+ wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, -+ TASK_UNINTERRUPTIBLE); -+} -+ -+static inline bool btree_node_may_write(struct btree *b) -+{ -+ return list_empty_careful(&b->write_blocked) && -+ (!b->written || !b->will_make_reachable); -+} -+ -+enum compact_mode { -+ COMPACT_LAZY, -+ COMPACT_ALL, -+}; -+ -+bool bch2_compact_whiteouts(struct bch_fs *, struct btree *, -+ enum compact_mode); -+ -+static inline bool should_compact_bset_lazy(struct btree *b, -+ struct bset_tree *t) -+{ -+ unsigned total_u64s = bset_u64s(t); -+ unsigned dead_u64s = bset_dead_u64s(b, t); -+ -+ return dead_u64s > 64 && dead_u64s * 3 > total_u64s; -+} -+ -+static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b) -+{ -+ struct bset_tree *t; -+ -+ for_each_bset(b, t) -+ if (should_compact_bset_lazy(b, t)) -+ return bch2_compact_whiteouts(c, b, COMPACT_LAZY); -+ -+ return false; -+} -+ -+void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *); -+ -+void bch2_btree_build_aux_trees(struct btree *); -+void bch2_btree_init_next(struct bch_fs *, struct btree *, -+ struct btree_iter *); -+ -+int bch2_btree_node_read_done(struct bch_fs *, struct btree *, bool); -+void bch2_btree_node_read(struct bch_fs *, struct btree *, bool); -+int bch2_btree_root_read(struct bch_fs *, enum btree_id, -+ const struct bkey_i *, unsigned); -+ -+void bch2_btree_complete_write(struct bch_fs 
*, struct btree *, -+ struct btree_write *); -+void bch2_btree_write_error_work(struct work_struct *); -+ -+void __bch2_btree_node_write(struct bch_fs *, struct btree *, -+ enum six_lock_type); -+bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); -+ -+void bch2_btree_node_write(struct bch_fs *, struct btree *, -+ enum six_lock_type); -+ -+static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b, -+ enum six_lock_type lock_held) -+{ -+ while (b->written && -+ btree_node_need_write(b) && -+ btree_node_may_write(b)) { -+ if (!btree_node_write_in_flight(b)) { -+ bch2_btree_node_write(c, b, lock_held); -+ break; -+ } -+ -+ six_unlock_type(&b->c.lock, lock_held); -+ btree_node_wait_on_io(b); -+ btree_node_lock_type(c, b, lock_held); -+ } -+} -+ -+#define bch2_btree_node_write_cond(_c, _b, cond) \ -+do { \ -+ unsigned long old, new, v = READ_ONCE((_b)->flags); \ -+ \ -+ do { \ -+ old = new = v; \ -+ \ -+ if (!(old & (1 << BTREE_NODE_dirty)) || !(cond)) \ -+ break; \ -+ \ -+ new |= (1 << BTREE_NODE_need_write); \ -+ } while ((v = cmpxchg(&(_b)->flags, old, new)) != old); \ -+ \ -+ btree_node_write_if_need(_c, _b, SIX_LOCK_read); \ -+} while (0) -+ -+void bch2_btree_flush_all_reads(struct bch_fs *); -+void bch2_btree_flush_all_writes(struct bch_fs *); -+void bch2_btree_verify_flushed(struct bch_fs *); -+ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *, char *); -+ -+static inline void compat_bformat(unsigned level, enum btree_id btree_id, -+ unsigned version, unsigned big_endian, -+ int write, struct bkey_format *f) -+{ -+ if (version < bcachefs_metadata_version_inode_btree_change && -+ btree_id == BTREE_ID_INODES) { -+ swap(f->bits_per_field[BKEY_FIELD_INODE], -+ f->bits_per_field[BKEY_FIELD_OFFSET]); -+ swap(f->field_offset[BKEY_FIELD_INODE], -+ f->field_offset[BKEY_FIELD_OFFSET]); -+ } -+} -+ -+static inline void compat_bpos(unsigned level, enum btree_id btree_id, -+ unsigned version, unsigned big_endian, -+ int write, struct 
bpos *p) -+{ -+ if (big_endian != CPU_BIG_ENDIAN) -+ bch2_bpos_swab(p); -+ -+ if (version < bcachefs_metadata_version_inode_btree_change && -+ btree_id == BTREE_ID_INODES) -+ swap(p->inode, p->offset); -+} -+ -+static inline void compat_btree_node(unsigned level, enum btree_id btree_id, -+ unsigned version, unsigned big_endian, -+ int write, -+ struct btree_node *bn) -+{ -+ if (version < bcachefs_metadata_version_inode_btree_change && -+ btree_node_type_is_extents(btree_id) && -+ bkey_cmp(bn->min_key, POS_MIN) && -+ write) -+ bn->min_key = bkey_predecessor(bn->min_key); -+ -+ compat_bpos(level, btree_id, version, big_endian, write, &bn->min_key); -+ compat_bpos(level, btree_id, version, big_endian, write, &bn->max_key); -+ -+ if (version < bcachefs_metadata_version_inode_btree_change && -+ btree_node_type_is_extents(btree_id) && -+ bkey_cmp(bn->min_key, POS_MIN) && -+ !write) -+ bn->min_key = bkey_successor(bn->min_key); -+} -+ -+#endif /* _BCACHEFS_BTREE_IO_H */ -diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c -new file mode 100644 -index 000000000000..6fab76c3220c ---- /dev/null -+++ b/fs/bcachefs/btree_iter.c -@@ -0,0 +1,2445 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "bkey_methods.h" -+#include "btree_cache.h" -+#include "btree_iter.h" -+#include "btree_key_cache.h" -+#include "btree_locking.h" -+#include "btree_update.h" -+#include "debug.h" -+#include "extents.h" -+#include "journal.h" -+ -+#include -+#include -+ -+static inline bool is_btree_node(struct btree_iter *iter, unsigned l) -+{ -+ return l < BTREE_MAX_DEPTH && -+ (unsigned long) iter->l[l].b >= 128; -+} -+ -+static inline struct bpos btree_iter_search_key(struct btree_iter *iter) -+{ -+ struct bpos pos = iter->pos; -+ -+ if ((iter->flags & BTREE_ITER_IS_EXTENTS) && -+ bkey_cmp(pos, POS_MAX)) -+ pos = bkey_successor(pos); -+ return pos; -+} -+ -+static inline bool btree_iter_pos_before_node(struct btree_iter *iter, -+ struct btree *b) -+{ -+ return 
bkey_cmp(btree_iter_search_key(iter), b->data->min_key) < 0; -+} -+ -+static inline bool btree_iter_pos_after_node(struct btree_iter *iter, -+ struct btree *b) -+{ -+ return bkey_cmp(b->key.k.p, btree_iter_search_key(iter)) < 0; -+} -+ -+static inline bool btree_iter_pos_in_node(struct btree_iter *iter, -+ struct btree *b) -+{ -+ return iter->btree_id == b->c.btree_id && -+ !btree_iter_pos_before_node(iter, b) && -+ !btree_iter_pos_after_node(iter, b); -+} -+ -+/* Btree node locking: */ -+ -+void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter) -+{ -+ bch2_btree_node_unlock_write_inlined(b, iter); -+} -+ -+void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) -+{ -+ struct btree_iter *linked; -+ unsigned readers = 0; -+ -+ EBUG_ON(!btree_node_intent_locked(iter, b->c.level)); -+ -+ trans_for_each_iter(iter->trans, linked) -+ if (linked->l[b->c.level].b == b && -+ btree_node_read_locked(linked, b->c.level)) -+ readers++; -+ -+ /* -+ * Must drop our read locks before calling six_lock_write() - -+ * six_unlock() won't do wakeups until the reader count -+ * goes to 0, and it's safe because we have the node intent -+ * locked: -+ */ -+ atomic64_sub(__SIX_VAL(read_lock, readers), -+ &b->c.lock.state.counter); -+ btree_node_lock_type(iter->trans->c, b, SIX_LOCK_write); -+ atomic64_add(__SIX_VAL(read_lock, readers), -+ &b->c.lock.state.counter); -+} -+ -+bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level) -+{ -+ struct btree *b = btree_iter_node(iter, level); -+ int want = __btree_lock_want(iter, level); -+ -+ if (!is_btree_node(iter, level)) -+ return false; -+ -+ if (race_fault()) -+ return false; -+ -+ if (six_relock_type(&b->c.lock, want, iter->l[level].lock_seq) || -+ (btree_node_lock_seq_matches(iter, b, level) && -+ btree_node_lock_increment(iter->trans, b, level, want))) { -+ mark_btree_node_locked(iter, level, want); -+ return true; -+ } else { -+ return false; -+ } -+} -+ -+static bool 
bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level) -+{ -+ struct btree *b = iter->l[level].b; -+ -+ EBUG_ON(btree_lock_want(iter, level) != BTREE_NODE_INTENT_LOCKED); -+ -+ if (!is_btree_node(iter, level)) -+ return false; -+ -+ if (btree_node_intent_locked(iter, level)) -+ return true; -+ -+ if (race_fault()) -+ return false; -+ -+ if (btree_node_locked(iter, level) -+ ? six_lock_tryupgrade(&b->c.lock) -+ : six_relock_type(&b->c.lock, SIX_LOCK_intent, iter->l[level].lock_seq)) -+ goto success; -+ -+ if (btree_node_lock_seq_matches(iter, b, level) && -+ btree_node_lock_increment(iter->trans, b, level, BTREE_NODE_INTENT_LOCKED)) { -+ btree_node_unlock(iter, level); -+ goto success; -+ } -+ -+ return false; -+success: -+ mark_btree_node_intent_locked(iter, level); -+ return true; -+} -+ -+static inline bool btree_iter_get_locks(struct btree_iter *iter, -+ bool upgrade, bool trace) -+{ -+ unsigned l = iter->level; -+ int fail_idx = -1; -+ -+ do { -+ if (!btree_iter_node(iter, l)) -+ break; -+ -+ if (!(upgrade -+ ? bch2_btree_node_upgrade(iter, l) -+ : bch2_btree_node_relock(iter, l))) { -+ if (trace) -+ (upgrade -+ ? trace_node_upgrade_fail -+ : trace_node_relock_fail)(l, iter->l[l].lock_seq, -+ is_btree_node(iter, l) -+ ? 0 -+ : (unsigned long) iter->l[l].b, -+ is_btree_node(iter, l) -+ ? 
iter->l[l].b->c.lock.state.seq -+ : 0); -+ -+ fail_idx = l; -+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); -+ } -+ -+ l++; -+ } while (l < iter->locks_want); -+ -+ /* -+ * When we fail to get a lock, we have to ensure that any child nodes -+ * can't be relocked so bch2_btree_iter_traverse has to walk back up to -+ * the node that we failed to relock: -+ */ -+ while (fail_idx >= 0) { -+ btree_node_unlock(iter, fail_idx); -+ iter->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS; -+ --fail_idx; -+ } -+ -+ if (iter->uptodate == BTREE_ITER_NEED_RELOCK) -+ iter->uptodate = BTREE_ITER_NEED_PEEK; -+ -+ bch2_btree_trans_verify_locks(iter->trans); -+ -+ return iter->uptodate < BTREE_ITER_NEED_RELOCK; -+} -+ -+static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b, -+ enum btree_iter_type type) -+{ -+ return type != BTREE_ITER_CACHED -+ ? container_of(_b, struct btree, c)->key.k.p -+ : container_of(_b, struct bkey_cached, c)->key.pos; -+} -+ -+/* Slowpath: */ -+bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, -+ unsigned level, struct btree_iter *iter, -+ enum six_lock_type type, -+ six_lock_should_sleep_fn should_sleep_fn, -+ void *p) -+{ -+ struct btree_trans *trans = iter->trans; -+ struct btree_iter *linked; -+ u64 start_time = local_clock(); -+ bool ret = true; -+ -+ /* Check if it's safe to block: */ -+ trans_for_each_iter(trans, linked) { -+ if (!linked->nodes_locked) -+ continue; -+ -+ /* -+ * Can't block taking an intent lock if we have _any_ nodes read -+ * locked: -+ * -+ * - Our read lock blocks another thread with an intent lock on -+ * the same node from getting a write lock, and thus from -+ * dropping its intent lock -+ * -+ * - And the other thread may have multiple nodes intent locked: -+ * both the node we want to intent lock, and the node we -+ * already have read locked - deadlock: -+ */ -+ if (type == SIX_LOCK_intent && -+ linked->nodes_locked != linked->nodes_intent_locked) { -+ if (!(trans->nounlock)) { -+ 
linked->locks_want = max_t(unsigned, -+ linked->locks_want, -+ __fls(linked->nodes_locked) + 1); -+ if (!btree_iter_get_locks(linked, true, false)) -+ ret = false; -+ } else { -+ ret = false; -+ } -+ } -+ -+ /* -+ * Interior nodes must be locked before their descendants: if -+ * another iterator has possible descendants locked of the node -+ * we're about to lock, it must have the ancestors locked too: -+ */ -+ if (linked->btree_id == iter->btree_id && -+ level > __fls(linked->nodes_locked)) { -+ if (!(trans->nounlock)) { -+ linked->locks_want = -+ max(level + 1, max_t(unsigned, -+ linked->locks_want, -+ iter->locks_want)); -+ if (!btree_iter_get_locks(linked, true, false)) -+ ret = false; -+ } else { -+ ret = false; -+ } -+ } -+ -+ /* Must lock btree nodes in key order: */ -+ if ((cmp_int(iter->btree_id, linked->btree_id) ?: -+ -cmp_int(btree_iter_type(iter), btree_iter_type(linked))) < 0) -+ ret = false; -+ -+ if (iter->btree_id == linked->btree_id && -+ btree_node_locked(linked, level) && -+ bkey_cmp(pos, btree_node_pos((void *) linked->l[level].b, -+ btree_iter_type(linked))) <= 0) -+ ret = false; -+ -+ /* -+ * Recheck if this is a node we already have locked - since one -+ * of the get_locks() calls might've successfully -+ * upgraded/relocked it: -+ */ -+ if (linked->l[level].b == b && -+ btree_node_locked_type(linked, level) >= type) { -+ six_lock_increment(&b->c.lock, type); -+ return true; -+ } -+ } -+ -+ if (unlikely(!ret)) { -+ trace_trans_restart_would_deadlock(iter->trans->ip); -+ return false; -+ } -+ -+ if (six_trylock_type(&b->c.lock, type)) -+ return true; -+ -+ if (six_lock_type(&b->c.lock, type, should_sleep_fn, p)) -+ return false; -+ -+ bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)], -+ start_time); -+ return true; -+} -+ -+/* Btree iterator locking: */ -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+static void bch2_btree_iter_verify_locks(struct btree_iter *iter) -+{ -+ unsigned l; -+ -+ if (!(iter->trans->iters_linked & (1ULL << 
iter->idx))) { -+ BUG_ON(iter->nodes_locked); -+ return; -+ } -+ -+ for (l = 0; is_btree_node(iter, l); l++) { -+ if (iter->uptodate >= BTREE_ITER_NEED_RELOCK && -+ !btree_node_locked(iter, l)) -+ continue; -+ -+ BUG_ON(btree_lock_want(iter, l) != -+ btree_node_locked_type(iter, l)); -+ } -+} -+ -+void bch2_btree_trans_verify_locks(struct btree_trans *trans) -+{ -+ struct btree_iter *iter; -+ -+ trans_for_each_iter_all(trans, iter) -+ bch2_btree_iter_verify_locks(iter); -+} -+#else -+static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {} -+#endif -+ -+__flatten -+bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace) -+{ -+ return btree_iter_get_locks(iter, false, trace); -+} -+ -+bool __bch2_btree_iter_upgrade(struct btree_iter *iter, -+ unsigned new_locks_want) -+{ -+ struct btree_iter *linked; -+ -+ EBUG_ON(iter->locks_want >= new_locks_want); -+ -+ iter->locks_want = new_locks_want; -+ -+ if (btree_iter_get_locks(iter, true, true)) -+ return true; -+ -+ /* -+ * Ancestor nodes must be locked before child nodes, so set locks_want -+ * on iterators that might lock ancestors before us to avoid getting -+ * -EINTR later: -+ */ -+ trans_for_each_iter(iter->trans, linked) -+ if (linked != iter && -+ linked->btree_id == iter->btree_id && -+ linked->locks_want < new_locks_want) { -+ linked->locks_want = new_locks_want; -+ btree_iter_get_locks(linked, true, false); -+ } -+ -+ return false; -+} -+ -+bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *iter, -+ unsigned new_locks_want) -+{ -+ unsigned l = iter->level; -+ -+ EBUG_ON(iter->locks_want >= new_locks_want); -+ -+ iter->locks_want = new_locks_want; -+ -+ do { -+ if (!btree_iter_node(iter, l)) -+ break; -+ -+ if (!bch2_btree_node_upgrade(iter, l)) { -+ iter->locks_want = l; -+ return false; -+ } -+ -+ l++; -+ } while (l < iter->locks_want); -+ -+ return true; -+} -+ -+void __bch2_btree_iter_downgrade(struct btree_iter *iter, -+ unsigned downgrade_to) -+{ -+ unsigned l, 
new_locks_want = downgrade_to ?: -+ (iter->flags & BTREE_ITER_INTENT ? 1 : 0); -+ -+ if (iter->locks_want < downgrade_to) { -+ iter->locks_want = new_locks_want; -+ -+ while (iter->nodes_locked && -+ (l = __fls(iter->nodes_locked)) >= iter->locks_want) { -+ if (l > iter->level) { -+ btree_node_unlock(iter, l); -+ } else { -+ if (btree_node_intent_locked(iter, l)) { -+ six_lock_downgrade(&iter->l[l].b->c.lock); -+ iter->nodes_intent_locked ^= 1 << l; -+ } -+ break; -+ } -+ } -+ } -+ -+ bch2_btree_trans_verify_locks(iter->trans); -+} -+ -+void bch2_trans_downgrade(struct btree_trans *trans) -+{ -+ struct btree_iter *iter; -+ -+ trans_for_each_iter(trans, iter) -+ bch2_btree_iter_downgrade(iter); -+} -+ -+/* Btree transaction locking: */ -+ -+bool bch2_trans_relock(struct btree_trans *trans) -+{ -+ struct btree_iter *iter; -+ bool ret = true; -+ -+ trans_for_each_iter(trans, iter) -+ if (iter->uptodate == BTREE_ITER_NEED_RELOCK) -+ ret &= bch2_btree_iter_relock(iter, true); -+ -+ return ret; -+} -+ -+void bch2_trans_unlock(struct btree_trans *trans) -+{ -+ struct btree_iter *iter; -+ -+ trans_for_each_iter(trans, iter) -+ __bch2_btree_iter_unlock(iter); -+} -+ -+/* Btree iterator: */ -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ -+static void bch2_btree_iter_verify_cached(struct btree_iter *iter) -+{ -+ struct bkey_cached *ck; -+ bool locked = btree_node_locked(iter, 0); -+ -+ if (!bch2_btree_node_relock(iter, 0)) -+ return; -+ -+ ck = (void *) iter->l[0].b; -+ BUG_ON(ck->key.btree_id != iter->btree_id || -+ bkey_cmp(ck->key.pos, iter->pos)); -+ -+ if (!locked) -+ btree_node_unlock(iter, 0); -+} -+ -+static void bch2_btree_iter_verify_level(struct btree_iter *iter, -+ unsigned level) -+{ -+ struct bpos pos = btree_iter_search_key(iter); -+ struct btree_iter_level *l = &iter->l[level]; -+ struct btree_node_iter tmp = l->iter; -+ bool locked = btree_node_locked(iter, level); -+ struct bkey_packed *p, *k; -+ char buf1[100], buf2[100]; -+ const char *msg; -+ -+ if 
(!debug_check_iterators(iter->trans->c)) -+ return; -+ -+ if (btree_iter_type(iter) == BTREE_ITER_CACHED) { -+ if (!level) -+ bch2_btree_iter_verify_cached(iter); -+ return; -+ } -+ -+ BUG_ON(iter->level < iter->min_depth); -+ -+ if (!btree_iter_node(iter, level)) -+ return; -+ -+ if (!bch2_btree_node_relock(iter, level)) -+ return; -+ -+ /* -+ * Ideally this invariant would always be true, and hopefully in the -+ * future it will be, but for now set_pos_same_leaf() breaks it: -+ */ -+ BUG_ON(iter->uptodate < BTREE_ITER_NEED_TRAVERSE && -+ !btree_iter_pos_in_node(iter, l->b)); -+ -+ /* -+ * node iterators don't use leaf node iterator: -+ */ -+ if (btree_iter_type(iter) == BTREE_ITER_NODES && -+ level <= iter->min_depth) -+ goto unlock; -+ -+ bch2_btree_node_iter_verify(&l->iter, l->b); -+ -+ /* -+ * For interior nodes, the iterator will have skipped past -+ * deleted keys: -+ * -+ * For extents, the iterator may have skipped past deleted keys (but not -+ * whiteouts) -+ */ -+ p = level || btree_node_type_is_extents(iter->btree_id) -+ ? bch2_btree_node_iter_prev_filter(&tmp, l->b, KEY_TYPE_discard) -+ : bch2_btree_node_iter_prev_all(&tmp, l->b); -+ k = bch2_btree_node_iter_peek_all(&l->iter, l->b); -+ -+ if (p && bkey_iter_pos_cmp(l->b, p, &pos) >= 0) { -+ msg = "before"; -+ goto err; -+ } -+ -+ if (k && bkey_iter_pos_cmp(l->b, k, &pos) < 0) { -+ msg = "after"; -+ goto err; -+ } -+unlock: -+ if (!locked) -+ btree_node_unlock(iter, level); -+ return; -+err: -+ strcpy(buf1, "(none)"); -+ strcpy(buf2, "(none)"); -+ -+ if (p) { -+ struct bkey uk = bkey_unpack_key(l->b, p); -+ bch2_bkey_to_text(&PBUF(buf1), &uk); -+ } -+ -+ if (k) { -+ struct bkey uk = bkey_unpack_key(l->b, k); -+ bch2_bkey_to_text(&PBUF(buf2), &uk); -+ } -+ -+ panic("iterator should be %s key at level %u:\n" -+ "iter pos %s %llu:%llu\n" -+ "prev key %s\n" -+ "cur key %s\n", -+ msg, level, -+ iter->flags & BTREE_ITER_IS_EXTENTS ? 
">" : "=>", -+ iter->pos.inode, iter->pos.offset, -+ buf1, buf2); -+} -+ -+static void bch2_btree_iter_verify(struct btree_iter *iter) -+{ -+ unsigned i; -+ -+ bch2_btree_trans_verify_locks(iter->trans); -+ -+ for (i = 0; i < BTREE_MAX_DEPTH; i++) -+ bch2_btree_iter_verify_level(iter, i); -+} -+ -+void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b) -+{ -+ struct btree_iter *iter; -+ -+ if (!debug_check_iterators(trans->c)) -+ return; -+ -+ trans_for_each_iter_with_node(trans, b, iter) -+ bch2_btree_iter_verify_level(iter, b->c.level); -+} -+ -+#else -+ -+static inline void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned l) {} -+static inline void bch2_btree_iter_verify(struct btree_iter *iter) {} -+ -+#endif -+ -+static void btree_node_iter_set_set_pos(struct btree_node_iter *iter, -+ struct btree *b, -+ struct bset_tree *t, -+ struct bkey_packed *k) -+{ -+ struct btree_node_iter_set *set; -+ -+ btree_node_iter_for_each(iter, set) -+ if (set->end == t->end_offset) { -+ set->k = __btree_node_key_to_offset(b, k); -+ bch2_btree_node_iter_sort(iter, b); -+ return; -+ } -+ -+ bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t)); -+} -+ -+static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter, -+ struct btree *b, -+ struct bkey_packed *where) -+{ -+ struct btree_iter_level *l = &iter->l[b->c.level]; -+ struct bpos pos = btree_iter_search_key(iter); -+ -+ if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b)) -+ return; -+ -+ if (bkey_iter_pos_cmp(l->b, where, &pos) < 0) -+ bch2_btree_node_iter_advance(&l->iter, l->b); -+ -+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); -+} -+ -+void bch2_btree_iter_fix_key_modified(struct btree_iter *iter, -+ struct btree *b, -+ struct bkey_packed *where) -+{ -+ struct btree_iter *linked; -+ -+ trans_for_each_iter_with_node(iter->trans, b, linked) { -+ __bch2_btree_iter_fix_key_modified(linked, b, where); -+ bch2_btree_iter_verify_level(linked, b->c.level); -+ } 
-+} -+ -+static void __bch2_btree_node_iter_fix(struct btree_iter *iter, -+ struct btree *b, -+ struct btree_node_iter *node_iter, -+ struct bset_tree *t, -+ struct bkey_packed *where, -+ unsigned clobber_u64s, -+ unsigned new_u64s) -+{ -+ const struct bkey_packed *end = btree_bkey_last(b, t); -+ struct btree_node_iter_set *set; -+ unsigned offset = __btree_node_key_to_offset(b, where); -+ int shift = new_u64s - clobber_u64s; -+ unsigned old_end = t->end_offset - shift; -+ unsigned orig_iter_pos = node_iter->data[0].k; -+ bool iter_current_key_modified = -+ orig_iter_pos >= offset && -+ orig_iter_pos <= offset + clobber_u64s; -+ struct bpos iter_pos = btree_iter_search_key(iter); -+ -+ btree_node_iter_for_each(node_iter, set) -+ if (set->end == old_end) -+ goto found; -+ -+ /* didn't find the bset in the iterator - might have to readd it: */ -+ if (new_u64s && -+ bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) { -+ bch2_btree_node_iter_push(node_iter, b, where, end); -+ goto fixup_done; -+ } else { -+ /* Iterator is after key that changed */ -+ return; -+ } -+found: -+ set->end = t->end_offset; -+ -+ /* Iterator hasn't gotten to the key that changed yet: */ -+ if (set->k < offset) -+ return; -+ -+ if (new_u64s && -+ bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) { -+ set->k = offset; -+ } else if (set->k < offset + clobber_u64s) { -+ set->k = offset + new_u64s; -+ if (set->k == set->end) -+ bch2_btree_node_iter_set_drop(node_iter, set); -+ } else { -+ /* Iterator is after key that changed */ -+ set->k = (int) set->k + shift; -+ return; -+ } -+ -+ bch2_btree_node_iter_sort(node_iter, b); -+fixup_done: -+ if (node_iter->data[0].k != orig_iter_pos) -+ iter_current_key_modified = true; -+ -+ /* -+ * When a new key is added, and the node iterator now points to that -+ * key, the iterator might have skipped past deleted keys that should -+ * come after the key the iterator now points to. 
We have to rewind to -+ * before those deleted keys - otherwise -+ * bch2_btree_node_iter_prev_all() breaks: -+ */ -+ if (!bch2_btree_node_iter_end(node_iter) && -+ iter_current_key_modified && -+ (b->c.level || -+ btree_node_type_is_extents(iter->btree_id))) { -+ struct bset_tree *t; -+ struct bkey_packed *k, *k2, *p; -+ -+ k = bch2_btree_node_iter_peek_all(node_iter, b); -+ -+ for_each_bset(b, t) { -+ bool set_pos = false; -+ -+ if (node_iter->data[0].end == t->end_offset) -+ continue; -+ -+ k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t); -+ -+ while ((p = bch2_bkey_prev_all(b, t, k2)) && -+ bkey_iter_cmp(b, k, p) < 0) { -+ k2 = p; -+ set_pos = true; -+ } -+ -+ if (set_pos) -+ btree_node_iter_set_set_pos(node_iter, -+ b, t, k2); -+ } -+ } -+ -+ if (!b->c.level && -+ node_iter == &iter->l[0].iter && -+ iter_current_key_modified) -+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); -+} -+ -+void bch2_btree_node_iter_fix(struct btree_iter *iter, -+ struct btree *b, -+ struct btree_node_iter *node_iter, -+ struct bkey_packed *where, -+ unsigned clobber_u64s, -+ unsigned new_u64s) -+{ -+ struct bset_tree *t = bch2_bkey_to_bset(b, where); -+ struct btree_iter *linked; -+ -+ if (node_iter != &iter->l[b->c.level].iter) { -+ __bch2_btree_node_iter_fix(iter, b, node_iter, t, -+ where, clobber_u64s, new_u64s); -+ -+ if (debug_check_iterators(iter->trans->c)) -+ bch2_btree_node_iter_verify(node_iter, b); -+ } -+ -+ trans_for_each_iter_with_node(iter->trans, b, linked) { -+ __bch2_btree_node_iter_fix(linked, b, -+ &linked->l[b->c.level].iter, t, -+ where, clobber_u64s, new_u64s); -+ bch2_btree_iter_verify_level(linked, b->c.level); -+ } -+} -+ -+static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter, -+ struct btree_iter_level *l, -+ struct bkey *u, -+ struct bkey_packed *k) -+{ -+ struct bkey_s_c ret; -+ -+ if (unlikely(!k)) { -+ /* -+ * signal to bch2_btree_iter_peek_slot() that we're currently at -+ * a hole -+ */ -+ u->type = KEY_TYPE_deleted; -+ 
return bkey_s_c_null; -+ } -+ -+ ret = bkey_disassemble(l->b, k, u); -+ -+ if (debug_check_bkeys(iter->trans->c)) -+ bch2_bkey_debugcheck(iter->trans->c, l->b, ret); -+ -+ return ret; -+} -+ -+/* peek_all() doesn't skip deleted keys */ -+static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *iter, -+ struct btree_iter_level *l, -+ struct bkey *u) -+{ -+ return __btree_iter_unpack(iter, l, u, -+ bch2_btree_node_iter_peek_all(&l->iter, l->b)); -+} -+ -+static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, -+ struct btree_iter_level *l) -+{ -+ return __btree_iter_unpack(iter, l, &iter->k, -+ bch2_btree_node_iter_peek(&l->iter, l->b)); -+} -+ -+static inline struct bkey_s_c __btree_iter_prev(struct btree_iter *iter, -+ struct btree_iter_level *l) -+{ -+ return __btree_iter_unpack(iter, l, &iter->k, -+ bch2_btree_node_iter_prev(&l->iter, l->b)); -+} -+ -+static inline bool btree_iter_advance_to_pos(struct btree_iter *iter, -+ struct btree_iter_level *l, -+ int max_advance) -+{ -+ struct bpos pos = btree_iter_search_key(iter); -+ struct bkey_packed *k; -+ int nr_advanced = 0; -+ -+ while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) && -+ bkey_iter_pos_cmp(l->b, k, &pos) < 0) { -+ if (max_advance > 0 && nr_advanced >= max_advance) -+ return false; -+ -+ bch2_btree_node_iter_advance(&l->iter, l->b); -+ nr_advanced++; -+ } -+ -+ return true; -+} -+ -+/* -+ * Verify that iterator for parent node points to child node: -+ */ -+static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) -+{ -+ struct btree_iter_level *l; -+ unsigned plevel; -+ bool parent_locked; -+ struct bkey_packed *k; -+ -+ if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) -+ return; -+ -+ plevel = b->c.level + 1; -+ if (!btree_iter_node(iter, plevel)) -+ return; -+ -+ parent_locked = btree_node_locked(iter, plevel); -+ -+ if (!bch2_btree_node_relock(iter, plevel)) -+ return; -+ -+ l = &iter->l[plevel]; -+ k = bch2_btree_node_iter_peek_all(&l->iter, 
l->b); -+ if (!k || -+ bkey_deleted(k) || -+ bkey_cmp_left_packed(l->b, k, &b->key.k.p)) { -+ char buf[100]; -+ struct bkey uk = bkey_unpack_key(b, k); -+ -+ bch2_bkey_to_text(&PBUF(buf), &uk); -+ panic("parent iter doesn't point to new node:\n%s\n%llu:%llu\n", -+ buf, b->key.k.p.inode, b->key.k.p.offset); -+ } -+ -+ if (!parent_locked) -+ btree_node_unlock(iter, b->c.level + 1); -+} -+ -+static inline void __btree_iter_init(struct btree_iter *iter, -+ unsigned level) -+{ -+ struct bpos pos = btree_iter_search_key(iter); -+ struct btree_iter_level *l = &iter->l[level]; -+ -+ bch2_btree_node_iter_init(&l->iter, l->b, &pos); -+ -+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); -+} -+ -+static inline void btree_iter_node_set(struct btree_iter *iter, -+ struct btree *b) -+{ -+ BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED); -+ -+ btree_iter_verify_new_node(iter, b); -+ -+ EBUG_ON(!btree_iter_pos_in_node(iter, b)); -+ EBUG_ON(b->c.lock.state.seq & 1); -+ -+ iter->l[b->c.level].lock_seq = b->c.lock.state.seq; -+ iter->l[b->c.level].b = b; -+ __btree_iter_init(iter, b->c.level); -+} -+ -+/* -+ * A btree node is being replaced - update the iterator to point to the new -+ * node: -+ */ -+void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b) -+{ -+ enum btree_node_locked_type t; -+ struct btree_iter *linked; -+ -+ trans_for_each_iter(iter->trans, linked) -+ if (btree_iter_type(linked) != BTREE_ITER_CACHED && -+ btree_iter_pos_in_node(linked, b)) { -+ /* -+ * bch2_btree_iter_node_drop() has already been called - -+ * the old node we're replacing has already been -+ * unlocked and the pointer invalidated -+ */ -+ BUG_ON(btree_node_locked(linked, b->c.level)); -+ -+ t = btree_lock_want(linked, b->c.level); -+ if (t != BTREE_NODE_UNLOCKED) { -+ six_lock_increment(&b->c.lock, t); -+ mark_btree_node_locked(linked, b->c.level, t); -+ } -+ -+ btree_iter_node_set(linked, b); -+ } -+} -+ -+void bch2_btree_iter_node_drop(struct btree_iter *iter, struct 
btree *b) -+{ -+ struct btree_iter *linked; -+ unsigned level = b->c.level; -+ -+ trans_for_each_iter(iter->trans, linked) -+ if (linked->l[level].b == b) { -+ __btree_node_unlock(linked, level); -+ linked->l[level].b = BTREE_ITER_NO_NODE_DROP; -+ } -+} -+ -+/* -+ * A btree node has been modified in such a way as to invalidate iterators - fix -+ * them: -+ */ -+void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b) -+{ -+ struct btree_iter *linked; -+ -+ trans_for_each_iter_with_node(iter->trans, b, linked) -+ __btree_iter_init(linked, b->c.level); -+} -+ -+static int lock_root_check_fn(struct six_lock *lock, void *p) -+{ -+ struct btree *b = container_of(lock, struct btree, c.lock); -+ struct btree **rootp = p; -+ -+ return b == *rootp ? 0 : -1; -+} -+ -+static inline int btree_iter_lock_root(struct btree_iter *iter, -+ unsigned depth_want) -+{ -+ struct bch_fs *c = iter->trans->c; -+ struct btree *b, **rootp = &c->btree_roots[iter->btree_id].b; -+ enum six_lock_type lock_type; -+ unsigned i; -+ -+ EBUG_ON(iter->nodes_locked); -+ -+ while (1) { -+ b = READ_ONCE(*rootp); -+ iter->level = READ_ONCE(b->c.level); -+ -+ if (unlikely(iter->level < depth_want)) { -+ /* -+ * the root is at a lower depth than the depth we want: -+ * got to the end of the btree, or we're walking nodes -+ * greater than some depth and there are no nodes >= -+ * that depth -+ */ -+ iter->level = depth_want; -+ for (i = iter->level; i < BTREE_MAX_DEPTH; i++) -+ iter->l[i].b = NULL; -+ return 1; -+ } -+ -+ lock_type = __btree_lock_want(iter, iter->level); -+ if (unlikely(!btree_node_lock(b, POS_MAX, iter->level, -+ iter, lock_type, -+ lock_root_check_fn, rootp))) -+ return -EINTR; -+ -+ if (likely(b == READ_ONCE(*rootp) && -+ b->c.level == iter->level && -+ !race_fault())) { -+ for (i = 0; i < iter->level; i++) -+ iter->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT; -+ iter->l[iter->level].b = b; -+ for (i = iter->level + 1; i < BTREE_MAX_DEPTH; i++) -+ iter->l[i].b = NULL; -+ -+ 
mark_btree_node_locked(iter, iter->level, lock_type); -+ btree_iter_node_set(iter, b); -+ return 0; -+ } -+ -+ six_unlock_type(&b->c.lock, lock_type); -+ } -+} -+ -+noinline -+static void btree_iter_prefetch(struct btree_iter *iter) -+{ -+ struct bch_fs *c = iter->trans->c; -+ struct btree_iter_level *l = &iter->l[iter->level]; -+ struct btree_node_iter node_iter = l->iter; -+ struct bkey_packed *k; -+ BKEY_PADDED(k) tmp; -+ unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) -+ ? (iter->level > 1 ? 0 : 2) -+ : (iter->level > 1 ? 1 : 16); -+ bool was_locked = btree_node_locked(iter, iter->level); -+ -+ while (nr) { -+ if (!bch2_btree_node_relock(iter, iter->level)) -+ return; -+ -+ bch2_btree_node_iter_advance(&node_iter, l->b); -+ k = bch2_btree_node_iter_peek(&node_iter, l->b); -+ if (!k) -+ break; -+ -+ bch2_bkey_unpack(l->b, &tmp.k, k); -+ bch2_btree_node_prefetch(c, iter, &tmp.k, iter->level - 1); -+ } -+ -+ if (!was_locked) -+ btree_node_unlock(iter, iter->level); -+} -+ -+static noinline void btree_node_mem_ptr_set(struct btree_iter *iter, -+ unsigned plevel, struct btree *b) -+{ -+ struct btree_iter_level *l = &iter->l[plevel]; -+ bool locked = btree_node_locked(iter, plevel); -+ struct bkey_packed *k; -+ struct bch_btree_ptr_v2 *bp; -+ -+ if (!bch2_btree_node_relock(iter, plevel)) -+ return; -+ -+ k = bch2_btree_node_iter_peek_all(&l->iter, l->b); -+ BUG_ON(k->type != KEY_TYPE_btree_ptr_v2); -+ -+ bp = (void *) bkeyp_val(&l->b->format, k); -+ bp->mem_ptr = (unsigned long)b; -+ -+ if (!locked) -+ btree_node_unlock(iter, plevel); -+} -+ -+static __always_inline int btree_iter_down(struct btree_iter *iter) -+{ -+ struct bch_fs *c = iter->trans->c; -+ struct btree_iter_level *l = &iter->l[iter->level]; -+ struct btree *b; -+ unsigned level = iter->level - 1; -+ enum six_lock_type lock_type = __btree_lock_want(iter, level); -+ BKEY_PADDED(k) tmp; -+ -+ EBUG_ON(!btree_node_locked(iter, iter->level)); -+ -+ bch2_bkey_unpack(l->b, &tmp.k, -+ 
bch2_btree_node_iter_peek(&l->iter, l->b)); -+ -+ b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type); -+ if (unlikely(IS_ERR(b))) -+ return PTR_ERR(b); -+ -+ mark_btree_node_locked(iter, level, lock_type); -+ btree_iter_node_set(iter, b); -+ -+ if (tmp.k.k.type == KEY_TYPE_btree_ptr_v2 && -+ unlikely(b != btree_node_mem_ptr(&tmp.k))) -+ btree_node_mem_ptr_set(iter, level + 1, b); -+ -+ if (iter->flags & BTREE_ITER_PREFETCH) -+ btree_iter_prefetch(iter); -+ -+ iter->level = level; -+ -+ return 0; -+} -+ -+static void btree_iter_up(struct btree_iter *iter) -+{ -+ btree_node_unlock(iter, iter->level++); -+} -+ -+static int btree_iter_traverse_one(struct btree_iter *); -+ -+static int __btree_iter_traverse_all(struct btree_trans *trans, int ret) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter *iter; -+ u8 sorted[BTREE_ITER_MAX]; -+ unsigned i, nr_sorted = 0; -+ -+ if (trans->in_traverse_all) -+ return -EINTR; -+ -+ trans->in_traverse_all = true; -+retry_all: -+ nr_sorted = 0; -+ -+ trans_for_each_iter(trans, iter) -+ sorted[nr_sorted++] = iter->idx; -+ -+#define btree_iter_cmp_by_idx(_l, _r) \ -+ btree_iter_cmp(&trans->iters[_l], &trans->iters[_r]) -+ -+ bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx); -+#undef btree_iter_cmp_by_idx -+ bch2_trans_unlock(trans); -+ -+ if (unlikely(ret == -ENOMEM)) { -+ struct closure cl; -+ -+ closure_init_stack(&cl); -+ -+ do { -+ ret = bch2_btree_cache_cannibalize_lock(c, &cl); -+ closure_sync(&cl); -+ } while (ret); -+ } -+ -+ if (unlikely(ret == -EIO)) { -+ trans->error = true; -+ goto out; -+ } -+ -+ BUG_ON(ret && ret != -EINTR); -+ -+ /* Now, redo traversals in correct order: */ -+ for (i = 0; i < nr_sorted; i++) { -+ unsigned idx = sorted[i]; -+ -+ /* -+ * sucessfully traversing one iterator can cause another to be -+ * unlinked, in btree_key_cache_fill() -+ */ -+ if (!(trans->iters_linked & (1ULL << idx))) -+ continue; -+ -+ ret = btree_iter_traverse_one(&trans->iters[idx]); -+ if (ret) -+ goto retry_all; 
-+ } -+ -+ if (hweight64(trans->iters_live) > 1) -+ ret = -EINTR; -+ else -+ trans_for_each_iter(trans, iter) -+ if (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) { -+ ret = -EINTR; -+ break; -+ } -+out: -+ bch2_btree_cache_cannibalize_unlock(c); -+ -+ trans->in_traverse_all = false; -+ return ret; -+} -+ -+int bch2_btree_iter_traverse_all(struct btree_trans *trans) -+{ -+ return __btree_iter_traverse_all(trans, 0); -+} -+ -+static inline bool btree_iter_good_node(struct btree_iter *iter, -+ unsigned l, int check_pos) -+{ -+ if (!is_btree_node(iter, l) || -+ !bch2_btree_node_relock(iter, l)) -+ return false; -+ -+ if (check_pos <= 0 && btree_iter_pos_before_node(iter, iter->l[l].b)) -+ return false; -+ if (check_pos >= 0 && btree_iter_pos_after_node(iter, iter->l[l].b)) -+ return false; -+ return true; -+} -+ -+static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter, -+ int check_pos) -+{ -+ unsigned l = iter->level; -+ -+ while (btree_iter_node(iter, l) && -+ !btree_iter_good_node(iter, l, check_pos)) { -+ btree_node_unlock(iter, l); -+ iter->l[l].b = BTREE_ITER_NO_NODE_UP; -+ l++; -+ } -+ -+ return l; -+} -+ -+/* -+ * This is the main state machine for walking down the btree - walks down to a -+ * specified depth -+ * -+ * Returns 0 on success, -EIO on error (error reading in a btree node). -+ * -+ * On error, caller (peek_node()/peek_key()) must return NULL; the error is -+ * stashed in the iterator and returned from bch2_trans_exit(). 
-+ */ -+static int btree_iter_traverse_one(struct btree_iter *iter) -+{ -+ unsigned depth_want = iter->level; -+ -+ /* -+ * if we need interior nodes locked, call btree_iter_relock() to make -+ * sure we walk back up enough that we lock them: -+ */ -+ if (iter->uptodate == BTREE_ITER_NEED_RELOCK || -+ iter->locks_want > 1) -+ bch2_btree_iter_relock(iter, false); -+ -+ if (btree_iter_type(iter) == BTREE_ITER_CACHED) -+ return bch2_btree_iter_traverse_cached(iter); -+ -+ if (iter->uptodate < BTREE_ITER_NEED_RELOCK) -+ return 0; -+ -+ if (unlikely(iter->level >= BTREE_MAX_DEPTH)) -+ return 0; -+ -+ /* -+ * XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos -+ * here unnecessary -+ */ -+ iter->level = btree_iter_up_until_good_node(iter, 0); -+ -+ /* -+ * If we've got a btree node locked (i.e. we aren't about to relock the -+ * root) - advance its node iterator if necessary: -+ * -+ * XXX correctly using BTREE_ITER_UPTODATE should make this unnecessary -+ */ -+ if (is_btree_node(iter, iter->level)) { -+ BUG_ON(!btree_iter_pos_in_node(iter, iter->l[iter->level].b)); -+ -+ btree_iter_advance_to_pos(iter, &iter->l[iter->level], -1); -+ } -+ -+ /* -+ * Note: iter->nodes[iter->level] may be temporarily NULL here - that -+ * would indicate to other code that we got to the end of the btree, -+ * here it indicates that relocking the root failed - it's critical that -+ * btree_iter_lock_root() comes next and that it can't fail -+ */ -+ while (iter->level > depth_want) { -+ int ret = btree_iter_node(iter, iter->level) -+ ? 
btree_iter_down(iter) -+ : btree_iter_lock_root(iter, depth_want); -+ if (unlikely(ret)) { -+ if (ret == 1) -+ return 0; -+ -+ iter->level = depth_want; -+ -+ if (ret == -EIO) { -+ iter->flags |= BTREE_ITER_ERROR; -+ iter->l[iter->level].b = -+ BTREE_ITER_NO_NODE_ERROR; -+ } else { -+ iter->l[iter->level].b = -+ BTREE_ITER_NO_NODE_DOWN; -+ } -+ return ret; -+ } -+ } -+ -+ iter->uptodate = BTREE_ITER_NEED_PEEK; -+ -+ bch2_btree_iter_verify(iter); -+ return 0; -+} -+ -+int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) -+{ -+ struct btree_trans *trans = iter->trans; -+ int ret; -+ -+ ret = bch2_trans_cond_resched(trans) ?: -+ btree_iter_traverse_one(iter); -+ if (unlikely(ret)) -+ ret = __btree_iter_traverse_all(trans, ret); -+ -+ return ret; -+} -+ -+static inline void bch2_btree_iter_checks(struct btree_iter *iter) -+{ -+ enum btree_iter_type type = btree_iter_type(iter); -+ -+ EBUG_ON(iter->btree_id >= BTREE_ID_NR); -+ -+ BUG_ON((type == BTREE_ITER_KEYS || -+ type == BTREE_ITER_CACHED) && -+ (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 || -+ bkey_cmp(iter->pos, iter->k.p) > 0)); -+ -+ bch2_btree_iter_verify_locks(iter); -+ bch2_btree_iter_verify_level(iter, iter->level); -+} -+ -+/* Iterate across nodes (leaf and interior nodes) */ -+ -+struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) -+{ -+ struct btree *b; -+ int ret; -+ -+ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES); -+ bch2_btree_iter_checks(iter); -+ -+ if (iter->uptodate == BTREE_ITER_UPTODATE) -+ return iter->l[iter->level].b; -+ -+ ret = bch2_btree_iter_traverse(iter); -+ if (ret) -+ return NULL; -+ -+ b = btree_iter_node(iter, iter->level); -+ if (!b) -+ return NULL; -+ -+ BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0); -+ -+ iter->pos = b->key.k.p; -+ iter->uptodate = BTREE_ITER_UPTODATE; -+ -+ bch2_btree_iter_verify(iter); -+ -+ return b; -+} -+ -+struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) -+{ -+ struct btree *b; -+ int ret; -+ -+ 
EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES); -+ bch2_btree_iter_checks(iter); -+ -+ /* already got to end? */ -+ if (!btree_iter_node(iter, iter->level)) -+ return NULL; -+ -+ bch2_trans_cond_resched(iter->trans); -+ -+ btree_iter_up(iter); -+ -+ if (!bch2_btree_node_relock(iter, iter->level)) -+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); -+ -+ ret = bch2_btree_iter_traverse(iter); -+ if (ret) -+ return NULL; -+ -+ /* got to end? */ -+ b = btree_iter_node(iter, iter->level); -+ if (!b) -+ return NULL; -+ -+ if (bkey_cmp(iter->pos, b->key.k.p) < 0) { -+ /* -+ * Haven't gotten to the end of the parent node: go back down to -+ * the next child node -+ */ -+ -+ /* -+ * We don't really want to be unlocking here except we can't -+ * directly tell btree_iter_traverse() "traverse to this level" -+ * except by setting iter->level, so we have to unlock so we -+ * don't screw up our lock invariants: -+ */ -+ if (btree_node_read_locked(iter, iter->level)) -+ btree_node_unlock(iter, iter->level); -+ -+ iter->pos = bkey_successor(iter->pos); -+ iter->level = iter->min_depth; -+ -+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); -+ ret = bch2_btree_iter_traverse(iter); -+ if (ret) -+ return NULL; -+ -+ b = iter->l[iter->level].b; -+ } -+ -+ iter->pos = b->key.k.p; -+ iter->uptodate = BTREE_ITER_UPTODATE; -+ -+ bch2_btree_iter_verify(iter); -+ -+ return b; -+} -+ -+/* Iterate across keys (in leaf nodes only) */ -+ -+void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_pos) -+{ -+ struct btree_iter_level *l = &iter->l[0]; -+ -+ EBUG_ON(iter->level != 0); -+ EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0); -+ EBUG_ON(!btree_node_locked(iter, 0)); -+ EBUG_ON(bkey_cmp(new_pos, l->b->key.k.p) > 0); -+ -+ bkey_init(&iter->k); -+ iter->k.p = iter->pos = new_pos; -+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); -+ -+ btree_iter_advance_to_pos(iter, l, -1); -+ -+ /* -+ * XXX: -+ * keeping a node locked that's outside (even just outside) 
iter->pos -+ * breaks __bch2_btree_node_lock(). This seems to only affect -+ * bch2_btree_node_get_sibling so for now it's fixed there, but we -+ * should try to get rid of this corner case. -+ * -+ * (this behaviour is currently needed for BTREE_INSERT_NOUNLOCK) -+ */ -+ -+ if (bch2_btree_node_iter_end(&l->iter) && -+ btree_iter_pos_after_node(iter, l->b)) -+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); -+} -+ -+static void btree_iter_pos_changed(struct btree_iter *iter, int cmp) -+{ -+ unsigned l = iter->level; -+ -+ if (!cmp) -+ goto out; -+ -+ if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) { -+ btree_node_unlock(iter, 0); -+ iter->l[0].b = BTREE_ITER_NO_NODE_UP; -+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); -+ return; -+ } -+ -+ l = btree_iter_up_until_good_node(iter, cmp); -+ -+ if (btree_iter_node(iter, l)) { -+ /* -+ * We might have to skip over many keys, or just a few: try -+ * advancing the node iterator, and if we have to skip over too -+ * many keys just reinit it (or if we're rewinding, since that -+ * is expensive). -+ */ -+ if (cmp < 0 || -+ !btree_iter_advance_to_pos(iter, &iter->l[l], 8)) -+ __btree_iter_init(iter, l); -+ -+ /* Don't leave it locked if we're not supposed to: */ -+ if (btree_lock_want(iter, l) == BTREE_NODE_UNLOCKED) -+ btree_node_unlock(iter, l); -+ } -+out: -+ if (l != iter->level) -+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); -+ else -+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); -+} -+ -+void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos, -+ bool strictly_greater) -+{ -+ struct bpos old = btree_iter_search_key(iter); -+ int cmp; -+ -+ iter->flags &= ~BTREE_ITER_IS_EXTENTS; -+ iter->flags |= strictly_greater ? 
BTREE_ITER_IS_EXTENTS : 0; -+ -+ bkey_init(&iter->k); -+ iter->k.p = iter->pos = new_pos; -+ -+ cmp = bkey_cmp(btree_iter_search_key(iter), old); -+ -+ btree_iter_pos_changed(iter, cmp); -+} -+ -+void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) -+{ -+ int cmp = bkey_cmp(new_pos, iter->pos); -+ -+ bkey_init(&iter->k); -+ iter->k.p = iter->pos = new_pos; -+ -+ btree_iter_pos_changed(iter, cmp); -+} -+ -+static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) -+{ -+ struct btree_iter_level *l = &iter->l[0]; -+ bool ret; -+ -+ bkey_init(&iter->k); -+ iter->k.p = iter->pos = l->b->key.k.p; -+ -+ ret = bkey_cmp(iter->pos, POS_MAX) != 0; -+ if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) -+ iter->k.p = iter->pos = bkey_successor(iter->pos); -+ -+ btree_iter_pos_changed(iter, 1); -+ return ret; -+} -+ -+static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) -+{ -+ struct btree_iter_level *l = &iter->l[0]; -+ bool ret; -+ -+ bkey_init(&iter->k); -+ iter->k.p = iter->pos = l->b->data->min_key; -+ iter->uptodate = BTREE_ITER_NEED_TRAVERSE; -+ -+ ret = bkey_cmp(iter->pos, POS_MIN) != 0; -+ if (ret) { -+ iter->k.p = iter->pos = bkey_predecessor(iter->pos); -+ -+ if (iter->flags & BTREE_ITER_IS_EXTENTS) -+ iter->k.p = iter->pos = bkey_predecessor(iter->pos); -+ } -+ -+ btree_iter_pos_changed(iter, -1); -+ return ret; -+} -+ -+/** -+ * btree_iter_peek_uptodate - given an iterator that is uptodate, return the key -+ * it currently points to -+ */ -+static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter) -+{ -+ struct btree_iter_level *l = &iter->l[0]; -+ struct bkey_s_c ret = { .k = &iter->k }; -+ -+ if (!bkey_deleted(&iter->k)) { -+ struct bkey_packed *_k = -+ __bch2_btree_node_iter_peek_all(&l->iter, l->b); -+ -+ ret.v = bkeyp_val(&l->b->format, _k); -+ -+ if (debug_check_iterators(iter->trans->c)) { -+ struct bkey k = bkey_unpack_key(l->b, _k); -+ -+ BUG_ON(memcmp(&k, &iter->k, 
sizeof(k))); -+ } -+ -+ if (debug_check_bkeys(iter->trans->c)) -+ bch2_bkey_debugcheck(iter->trans->c, l->b, ret); -+ } -+ -+ return ret; -+} -+ -+/** -+ * bch2_btree_iter_peek: returns first key greater than or equal to iterator's -+ * current position -+ */ -+struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) -+{ -+ struct btree_iter_level *l = &iter->l[0]; -+ struct bkey_s_c k; -+ int ret; -+ -+ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); -+ bch2_btree_iter_checks(iter); -+ -+ if (iter->uptodate == BTREE_ITER_UPTODATE && -+ !bkey_deleted(&iter->k)) -+ return btree_iter_peek_uptodate(iter); -+ -+ while (1) { -+ ret = bch2_btree_iter_traverse(iter); -+ if (unlikely(ret)) -+ return bkey_s_c_err(ret); -+ -+ k = __btree_iter_peek(iter, l); -+ if (likely(k.k)) -+ break; -+ -+ if (!btree_iter_set_pos_to_next_leaf(iter)) -+ return bkey_s_c_null; -+ } -+ -+ /* -+ * iter->pos should always be equal to the key we just -+ * returned - except extents can straddle iter->pos: -+ */ -+ if (!(iter->flags & BTREE_ITER_IS_EXTENTS) || -+ bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) -+ iter->pos = bkey_start_pos(k.k); -+ -+ iter->uptodate = BTREE_ITER_UPTODATE; -+ -+ bch2_btree_iter_verify_level(iter, 0); -+ return k; -+} -+ -+/** -+ * bch2_btree_iter_next: returns first key greater than iterator's current -+ * position -+ */ -+struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) -+{ -+ if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) -+ return bkey_s_c_null; -+ -+ bch2_btree_iter_set_pos(iter, -+ (iter->flags & BTREE_ITER_IS_EXTENTS) -+ ? 
iter->k.p -+ : bkey_successor(iter->k.p)); -+ -+ return bch2_btree_iter_peek(iter); -+} -+ -+static struct bkey_s_c __btree_trans_updates_peek(struct btree_iter *iter) -+{ -+ struct bpos pos = btree_iter_search_key(iter); -+ struct btree_trans *trans = iter->trans; -+ struct btree_insert_entry *i; -+ -+ trans_for_each_update2(trans, i) -+ if ((cmp_int(iter->btree_id, i->iter->btree_id) ?: -+ bkey_cmp(pos, i->k->k.p)) <= 0) -+ break; -+ -+ return i < trans->updates2 + trans->nr_updates2 && -+ iter->btree_id == i->iter->btree_id -+ ? bkey_i_to_s_c(i->k) -+ : bkey_s_c_null; -+} -+ -+static struct bkey_s_c __bch2_btree_iter_peek_with_updates(struct btree_iter *iter) -+{ -+ struct btree_iter_level *l = &iter->l[0]; -+ struct bkey_s_c k = __btree_iter_peek(iter, l); -+ struct bkey_s_c u = __btree_trans_updates_peek(iter); -+ -+ if (k.k && (!u.k || bkey_cmp(k.k->p, u.k->p) < 0)) -+ return k; -+ if (u.k && bkey_cmp(u.k->p, l->b->key.k.p) <= 0) { -+ iter->k = *u.k; -+ return u; -+ } -+ return bkey_s_c_null; -+} -+ -+struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) -+{ -+ struct bkey_s_c k; -+ int ret; -+ -+ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); -+ bch2_btree_iter_checks(iter); -+ -+ while (1) { -+ ret = bch2_btree_iter_traverse(iter); -+ if (unlikely(ret)) -+ return bkey_s_c_err(ret); -+ -+ k = __bch2_btree_iter_peek_with_updates(iter); -+ -+ if (k.k && bkey_deleted(k.k)) { -+ bch2_btree_iter_set_pos(iter, -+ (iter->flags & BTREE_ITER_IS_EXTENTS) -+ ? 
iter->k.p -+ : bkey_successor(iter->k.p)); -+ continue; -+ } -+ -+ if (likely(k.k)) -+ break; -+ -+ if (!btree_iter_set_pos_to_next_leaf(iter)) -+ return bkey_s_c_null; -+ } -+ -+ /* -+ * iter->pos should always be equal to the key we just -+ * returned - except extents can straddle iter->pos: -+ */ -+ if (!(iter->flags & BTREE_ITER_IS_EXTENTS) || -+ bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) -+ iter->pos = bkey_start_pos(k.k); -+ -+ iter->uptodate = BTREE_ITER_UPTODATE; -+ return k; -+} -+ -+struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *iter) -+{ -+ if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) -+ return bkey_s_c_null; -+ -+ bch2_btree_iter_set_pos(iter, -+ (iter->flags & BTREE_ITER_IS_EXTENTS) -+ ? iter->k.p -+ : bkey_successor(iter->k.p)); -+ -+ return bch2_btree_iter_peek_with_updates(iter); -+} -+ -+/** -+ * bch2_btree_iter_peek_prev: returns first key less than or equal to -+ * iterator's current position -+ */ -+struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) -+{ -+ struct bpos pos = iter->pos; -+ struct btree_iter_level *l = &iter->l[0]; -+ struct bkey_s_c k; -+ int ret; -+ -+ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); -+ bch2_btree_iter_checks(iter); -+ -+ if (iter->uptodate == BTREE_ITER_UPTODATE && -+ !bkey_deleted(&iter->k)) -+ return btree_iter_peek_uptodate(iter); -+ -+ while (1) { -+ ret = bch2_btree_iter_traverse(iter); -+ if (unlikely(ret)) -+ return bkey_s_c_err(ret); -+ -+ k = __btree_iter_peek(iter, l); -+ if (!k.k || bkey_cmp(bkey_start_pos(k.k), pos) > 0) -+ k = __btree_iter_prev(iter, l); -+ -+ if (likely(k.k)) -+ break; -+ -+ if (!btree_iter_set_pos_to_prev_leaf(iter)) -+ return bkey_s_c_null; -+ } -+ -+ EBUG_ON(bkey_cmp(bkey_start_pos(k.k), pos) > 0); -+ iter->pos = bkey_start_pos(k.k); -+ iter->uptodate = BTREE_ITER_UPTODATE; -+ return k; -+} -+ -+/** -+ * bch2_btree_iter_prev: returns first key less than iterator's current -+ * position -+ */ -+struct bkey_s_c bch2_btree_iter_prev(struct 
btree_iter *iter) -+{ -+ struct bpos pos = bkey_start_pos(&iter->k); -+ -+ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); -+ bch2_btree_iter_checks(iter); -+ -+ if (unlikely(!bkey_cmp(pos, POS_MIN))) -+ return bkey_s_c_null; -+ -+ bch2_btree_iter_set_pos(iter, bkey_predecessor(pos)); -+ -+ return bch2_btree_iter_peek_prev(iter); -+} -+ -+static inline struct bkey_s_c -+__bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) -+{ -+ struct btree_iter_level *l = &iter->l[0]; -+ struct btree_node_iter node_iter; -+ struct bkey_s_c k; -+ struct bkey n; -+ int ret; -+ -+ /* keys & holes can't span inode numbers: */ -+ if (iter->pos.offset == KEY_OFFSET_MAX) { -+ if (iter->pos.inode == KEY_INODE_MAX) -+ return bkey_s_c_null; -+ -+ bch2_btree_iter_set_pos(iter, bkey_successor(iter->pos)); -+ -+ ret = bch2_btree_iter_traverse(iter); -+ if (unlikely(ret)) -+ return bkey_s_c_err(ret); -+ } -+ -+ /* -+ * iterator is now at the correct position for inserting at iter->pos, -+ * but we need to keep iterating until we find the first non whiteout so -+ * we know how big a hole we have, if any: -+ */ -+ -+ node_iter = l->iter; -+ k = __btree_iter_unpack(iter, l, &iter->k, -+ bch2_btree_node_iter_peek(&node_iter, l->b)); -+ -+ if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) { -+ /* -+ * We're not setting iter->uptodate because the node iterator -+ * doesn't necessarily point at the key we're returning: -+ */ -+ -+ EBUG_ON(bkey_cmp(k.k->p, iter->pos) <= 0); -+ bch2_btree_iter_verify_level(iter, 0); -+ return k; -+ } -+ -+ /* hole */ -+ -+ if (!k.k) -+ k.k = &l->b->key.k; -+ -+ bkey_init(&n); -+ n.p = iter->pos; -+ bch2_key_resize(&n, -+ min_t(u64, KEY_SIZE_MAX, -+ (k.k->p.inode == n.p.inode -+ ? 
bkey_start_offset(k.k) -+ : KEY_OFFSET_MAX) - -+ n.p.offset)); -+ -+ EBUG_ON(!n.size); -+ -+ iter->k = n; -+ iter->uptodate = BTREE_ITER_UPTODATE; -+ -+ bch2_btree_iter_verify_level(iter, 0); -+ return (struct bkey_s_c) { &iter->k, NULL }; -+} -+ -+struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) -+{ -+ struct btree_iter_level *l = &iter->l[0]; -+ struct bkey_s_c k; -+ int ret; -+ -+ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); -+ bch2_btree_iter_checks(iter); -+ -+ if (iter->uptodate == BTREE_ITER_UPTODATE) -+ return btree_iter_peek_uptodate(iter); -+ -+ ret = bch2_btree_iter_traverse(iter); -+ if (unlikely(ret)) -+ return bkey_s_c_err(ret); -+ -+ if (iter->flags & BTREE_ITER_IS_EXTENTS) -+ return __bch2_btree_iter_peek_slot_extents(iter); -+ -+ k = __btree_iter_peek_all(iter, l, &iter->k); -+ -+ EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0); -+ -+ if (!k.k || bkey_cmp(iter->pos, k.k->p)) { -+ /* hole */ -+ bkey_init(&iter->k); -+ iter->k.p = iter->pos; -+ k = (struct bkey_s_c) { &iter->k, NULL }; -+ } -+ -+ iter->uptodate = BTREE_ITER_UPTODATE; -+ bch2_btree_iter_verify_level(iter, 0); -+ return k; -+} -+ -+struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) -+{ -+ if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) -+ return bkey_s_c_null; -+ -+ bch2_btree_iter_set_pos(iter, -+ (iter->flags & BTREE_ITER_IS_EXTENTS) -+ ? 
iter->k.p -+ : bkey_successor(iter->k.p)); -+ -+ return bch2_btree_iter_peek_slot(iter); -+} -+ -+struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *iter) -+{ -+ struct bkey_cached *ck; -+ int ret; -+ -+ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_CACHED); -+ bch2_btree_iter_checks(iter); -+ -+ ret = bch2_btree_iter_traverse(iter); -+ if (unlikely(ret)) -+ return bkey_s_c_err(ret); -+ -+ ck = (void *) iter->l[0].b; -+ -+ EBUG_ON(iter->btree_id != ck->key.btree_id || -+ bkey_cmp(iter->pos, ck->key.pos)); -+ BUG_ON(!ck->valid); -+ -+ return bkey_i_to_s_c(ck->k); -+} -+ -+static inline void bch2_btree_iter_init(struct btree_trans *trans, -+ struct btree_iter *iter, enum btree_id btree_id, -+ struct bpos pos, unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ unsigned i; -+ -+ if (btree_node_type_is_extents(btree_id) && -+ !(flags & BTREE_ITER_NODES)) -+ flags |= BTREE_ITER_IS_EXTENTS; -+ -+ iter->trans = trans; -+ iter->pos = pos; -+ bkey_init(&iter->k); -+ iter->k.p = pos; -+ iter->flags = flags; -+ iter->uptodate = BTREE_ITER_NEED_TRAVERSE; -+ iter->btree_id = btree_id; -+ iter->level = 0; -+ iter->min_depth = 0; -+ iter->locks_want = flags & BTREE_ITER_INTENT ? 
1 : 0; -+ iter->nodes_locked = 0; -+ iter->nodes_intent_locked = 0; -+ for (i = 0; i < ARRAY_SIZE(iter->l); i++) -+ iter->l[i].b = BTREE_ITER_NO_NODE_INIT; -+ -+ prefetch(c->btree_roots[btree_id].b); -+} -+ -+/* new transactional stuff: */ -+ -+static inline void __bch2_trans_iter_free(struct btree_trans *trans, -+ unsigned idx) -+{ -+ __bch2_btree_iter_unlock(&trans->iters[idx]); -+ trans->iters_linked &= ~(1ULL << idx); -+ trans->iters_live &= ~(1ULL << idx); -+ trans->iters_touched &= ~(1ULL << idx); -+} -+ -+int bch2_trans_iter_put(struct btree_trans *trans, -+ struct btree_iter *iter) -+{ -+ int ret; -+ -+ if (IS_ERR_OR_NULL(iter)) -+ return 0; -+ -+ BUG_ON(trans->iters + iter->idx != iter); -+ -+ ret = btree_iter_err(iter); -+ -+ if (!(trans->iters_touched & (1ULL << iter->idx)) && -+ !(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) -+ __bch2_trans_iter_free(trans, iter->idx); -+ -+ trans->iters_live &= ~(1ULL << iter->idx); -+ return ret; -+} -+ -+int bch2_trans_iter_free(struct btree_trans *trans, -+ struct btree_iter *iter) -+{ -+ if (IS_ERR_OR_NULL(iter)) -+ return 0; -+ -+ trans->iters_touched &= ~(1ULL << iter->idx); -+ -+ return bch2_trans_iter_put(trans, iter); -+} -+ -+static int bch2_trans_realloc_iters(struct btree_trans *trans, -+ unsigned new_size) -+{ -+ void *p, *new_iters, *new_updates, *new_updates2; -+ size_t iters_bytes; -+ size_t updates_bytes; -+ -+ new_size = roundup_pow_of_two(new_size); -+ -+ BUG_ON(new_size > BTREE_ITER_MAX); -+ -+ if (new_size <= trans->size) -+ return 0; -+ -+ BUG_ON(trans->used_mempool); -+ -+ bch2_trans_unlock(trans); -+ -+ iters_bytes = sizeof(struct btree_iter) * new_size; -+ updates_bytes = sizeof(struct btree_insert_entry) * new_size; -+ -+ p = kmalloc(iters_bytes + -+ updates_bytes + -+ updates_bytes, GFP_NOFS); -+ if (p) -+ goto success; -+ -+ p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); -+ new_size = BTREE_ITER_MAX; -+ -+ trans->used_mempool = true; -+success: -+ new_iters = p; p += iters_bytes; 
-+ new_updates = p; p += updates_bytes; -+ new_updates2 = p; p += updates_bytes; -+ -+ memcpy(new_iters, trans->iters, -+ sizeof(struct btree_iter) * trans->nr_iters); -+ memcpy(new_updates, trans->updates, -+ sizeof(struct btree_insert_entry) * trans->nr_updates); -+ memcpy(new_updates2, trans->updates2, -+ sizeof(struct btree_insert_entry) * trans->nr_updates2); -+ -+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) -+ memset(trans->iters, POISON_FREE, -+ sizeof(struct btree_iter) * trans->nr_iters + -+ sizeof(struct btree_insert_entry) * trans->nr_iters); -+ -+ if (trans->iters != trans->iters_onstack) -+ kfree(trans->iters); -+ -+ trans->iters = new_iters; -+ trans->updates = new_updates; -+ trans->updates2 = new_updates2; -+ trans->size = new_size; -+ -+ if (trans->iters_live) { -+ trace_trans_restart_iters_realloced(trans->ip, trans->size); -+ return -EINTR; -+ } -+ -+ return 0; -+} -+ -+static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) -+{ -+ unsigned idx = __ffs64(~trans->iters_linked); -+ -+ if (idx < trans->nr_iters) -+ goto got_slot; -+ -+ if (trans->nr_iters == trans->size) { -+ int ret; -+ -+ if (trans->nr_iters >= BTREE_ITER_MAX) { -+ struct btree_iter *iter; -+ -+ trans_for_each_iter(trans, iter) { -+ pr_err("iter: btree %s pos %llu:%llu%s%s%s %ps", -+ bch2_btree_ids[iter->btree_id], -+ iter->pos.inode, -+ iter->pos.offset, -+ (trans->iters_live & (1ULL << iter->idx)) ? " live" : "", -+ (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", -+ iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? 
" keep" : "", -+ (void *) iter->ip_allocated); -+ } -+ -+ panic("trans iter oveflow\n"); -+ } -+ -+ ret = bch2_trans_realloc_iters(trans, trans->size * 2); -+ if (ret) -+ return ERR_PTR(ret); -+ } -+ -+ idx = trans->nr_iters++; -+ BUG_ON(trans->nr_iters > trans->size); -+ -+ trans->iters[idx].idx = idx; -+got_slot: -+ BUG_ON(trans->iters_linked & (1ULL << idx)); -+ trans->iters_linked |= 1ULL << idx; -+ trans->iters[idx].flags = 0; -+ return &trans->iters[idx]; -+} -+ -+static inline void btree_iter_copy(struct btree_iter *dst, -+ struct btree_iter *src) -+{ -+ unsigned i, idx = dst->idx; -+ -+ *dst = *src; -+ dst->idx = idx; -+ dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; -+ -+ for (i = 0; i < BTREE_MAX_DEPTH; i++) -+ if (btree_node_locked(dst, i)) -+ six_lock_increment(&dst->l[i].b->c.lock, -+ __btree_lock_want(dst, i)); -+ -+ dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; -+ dst->flags &= ~BTREE_ITER_SET_POS_AFTER_COMMIT; -+} -+ -+static inline struct bpos bpos_diff(struct bpos l, struct bpos r) -+{ -+ if (bkey_cmp(l, r) > 0) -+ swap(l, r); -+ -+ return POS(r.inode - l.inode, r.offset - l.offset); -+} -+ -+static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, -+ unsigned btree_id, struct bpos pos, -+ unsigned flags) -+{ -+ struct btree_iter *iter, *best = NULL; -+ -+ BUG_ON(trans->nr_iters > BTREE_ITER_MAX); -+ -+ trans_for_each_iter(trans, iter) { -+ if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE)) -+ continue; -+ -+ if (iter->btree_id != btree_id) -+ continue; -+ -+ if (best && -+ bkey_cmp(bpos_diff(best->pos, pos), -+ bpos_diff(iter->pos, pos)) < 0) -+ continue; -+ -+ best = iter; -+ } -+ -+ if (!best) { -+ iter = btree_trans_iter_alloc(trans); -+ if (IS_ERR(iter)) -+ return iter; -+ -+ bch2_btree_iter_init(trans, iter, btree_id, pos, flags); -+ } else if ((trans->iters_live & (1ULL << best->idx)) || -+ (best->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) { -+ iter = btree_trans_iter_alloc(trans); -+ if (IS_ERR(iter)) -+ return iter; -+ -+ 
btree_iter_copy(iter, best); -+ } else { -+ iter = best; -+ } -+ -+ iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; -+ iter->flags &= ~BTREE_ITER_USER_FLAGS; -+ iter->flags |= flags & BTREE_ITER_USER_FLAGS; -+ -+ if (iter->flags & BTREE_ITER_INTENT) -+ bch2_btree_iter_upgrade(iter, 1); -+ else -+ bch2_btree_iter_downgrade(iter); -+ -+ BUG_ON(iter->btree_id != btree_id); -+ BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE); -+ BUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); -+ BUG_ON(iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT); -+ BUG_ON(trans->iters_live & (1ULL << iter->idx)); -+ -+ trans->iters_live |= 1ULL << iter->idx; -+ trans->iters_touched |= 1ULL << iter->idx; -+ -+ return iter; -+} -+ -+struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, -+ enum btree_id btree_id, -+ struct bpos pos, unsigned flags) -+{ -+ struct btree_iter *iter = -+ __btree_trans_get_iter(trans, btree_id, pos, flags); -+ -+ if (!IS_ERR(iter)) -+ __bch2_btree_iter_set_pos(iter, pos, -+ btree_node_type_is_extents(btree_id)); -+ return iter; -+} -+ -+struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, -+ enum btree_id btree_id, -+ struct bpos pos, -+ unsigned locks_want, -+ unsigned depth, -+ unsigned flags) -+{ -+ struct btree_iter *iter = -+ __btree_trans_get_iter(trans, btree_id, pos, -+ flags|BTREE_ITER_NODES); -+ unsigned i; -+ -+ BUG_ON(IS_ERR(iter)); -+ BUG_ON(bkey_cmp(iter->pos, pos)); -+ -+ iter->locks_want = locks_want; -+ iter->level = depth; -+ iter->min_depth = depth; -+ -+ for (i = 0; i < ARRAY_SIZE(iter->l); i++) -+ iter->l[i].b = NULL; -+ iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT; -+ -+ return iter; -+} -+ -+struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans, -+ struct btree_iter *src) -+{ -+ struct btree_iter *iter; -+ -+ iter = btree_trans_iter_alloc(trans); -+ if (IS_ERR(iter)) -+ return iter; -+ -+ btree_iter_copy(iter, src); -+ -+ trans->iters_live |= 1ULL << iter->idx; -+ /* -+ * We don't need to preserve 
this iter since it's cheap to copy it -+ * again - this will cause trans_iter_put() to free it right away: -+ */ -+ trans->iters_touched &= ~(1ULL << iter->idx); -+ -+ return iter; -+} -+ -+static int bch2_trans_preload_mem(struct btree_trans *trans, size_t size) -+{ -+ if (size > trans->mem_bytes) { -+ size_t old_bytes = trans->mem_bytes; -+ size_t new_bytes = roundup_pow_of_two(size); -+ void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS); -+ -+ if (!new_mem) -+ return -ENOMEM; -+ -+ trans->mem = new_mem; -+ trans->mem_bytes = new_bytes; -+ -+ if (old_bytes) { -+ trace_trans_restart_mem_realloced(trans->ip, new_bytes); -+ return -EINTR; -+ } -+ } -+ -+ return 0; -+} -+ -+void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) -+{ -+ void *p; -+ int ret; -+ -+ ret = bch2_trans_preload_mem(trans, trans->mem_top + size); -+ if (ret) -+ return ERR_PTR(ret); -+ -+ p = trans->mem + trans->mem_top; -+ trans->mem_top += size; -+ return p; -+} -+ -+inline void bch2_trans_unlink_iters(struct btree_trans *trans) -+{ -+ u64 iters = trans->iters_linked & -+ ~trans->iters_touched & -+ ~trans->iters_live; -+ -+ while (iters) { -+ unsigned idx = __ffs64(iters); -+ -+ iters &= ~(1ULL << idx); -+ __bch2_trans_iter_free(trans, idx); -+ } -+} -+ -+void bch2_trans_reset(struct btree_trans *trans, unsigned flags) -+{ -+ struct btree_iter *iter; -+ -+ trans_for_each_iter(trans, iter) -+ iter->flags &= ~(BTREE_ITER_KEEP_UNTIL_COMMIT| -+ BTREE_ITER_SET_POS_AFTER_COMMIT); -+ -+ bch2_trans_unlink_iters(trans); -+ -+ trans->iters_touched &= trans->iters_live; -+ -+ trans->need_reset = 0; -+ trans->nr_updates = 0; -+ trans->nr_updates2 = 0; -+ trans->mem_top = 0; -+ -+ trans->extra_journal_entries = NULL; -+ trans->extra_journal_entry_u64s = 0; -+ -+ if (trans->fs_usage_deltas) { -+ trans->fs_usage_deltas->used = 0; -+ memset(&trans->fs_usage_deltas->memset_start, 0, -+ (void *) &trans->fs_usage_deltas->memset_end - -+ (void *) &trans->fs_usage_deltas->memset_start); -+ } -+ 
-+ if (!(flags & TRANS_RESET_NOTRAVERSE)) -+ bch2_btree_iter_traverse_all(trans); -+} -+ -+void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, -+ unsigned expected_nr_iters, -+ size_t expected_mem_bytes) -+{ -+ memset(trans, 0, offsetof(struct btree_trans, iters_onstack)); -+ -+ /* -+ * reallocating iterators currently completely breaks -+ * bch2_trans_iter_put(): -+ */ -+ expected_nr_iters = BTREE_ITER_MAX; -+ -+ trans->c = c; -+ trans->ip = _RET_IP_; -+ trans->size = ARRAY_SIZE(trans->iters_onstack); -+ trans->iters = trans->iters_onstack; -+ trans->updates = trans->updates_onstack; -+ trans->updates2 = trans->updates2_onstack; -+ trans->fs_usage_deltas = NULL; -+ -+ if (expected_nr_iters > trans->size) -+ bch2_trans_realloc_iters(trans, expected_nr_iters); -+ -+ if (expected_mem_bytes) -+ bch2_trans_preload_mem(trans, expected_mem_bytes); -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ trans->pid = current->pid; -+ mutex_lock(&c->btree_trans_lock); -+ list_add(&trans->list, &c->btree_trans_list); -+ mutex_unlock(&c->btree_trans_lock); -+#endif -+} -+ -+int bch2_trans_exit(struct btree_trans *trans) -+{ -+ bch2_trans_unlock(trans); -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ mutex_lock(&trans->c->btree_trans_lock); -+ list_del(&trans->list); -+ mutex_unlock(&trans->c->btree_trans_lock); -+#endif -+ -+ bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); -+ -+ kfree(trans->fs_usage_deltas); -+ kfree(trans->mem); -+ if (trans->used_mempool) -+ mempool_free(trans->iters, &trans->c->btree_iters_pool); -+ else if (trans->iters != trans->iters_onstack) -+ kfree(trans->iters); -+ trans->mem = (void *) 0x1; -+ trans->iters = (void *) 0x1; -+ -+ return trans->error ? 
-EIO : 0; -+} -+ -+static void bch2_btree_iter_node_to_text(struct printbuf *out, -+ struct btree_bkey_cached_common *_b, -+ enum btree_iter_type type) -+{ -+ pr_buf(out, " %px l=%u %s:", -+ _b, _b->level, bch2_btree_ids[_b->btree_id]); -+ bch2_bpos_to_text(out, btree_node_pos(_b, type)); -+} -+ -+void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ struct btree_trans *trans; -+ struct btree_iter *iter; -+ struct btree *b; -+ unsigned l; -+ -+ mutex_lock(&c->btree_trans_lock); -+ list_for_each_entry(trans, &c->btree_trans_list, list) { -+ pr_buf(out, "%i %px %ps\n", trans->pid, trans, (void *) trans->ip); -+ -+ trans_for_each_iter(trans, iter) { -+ if (!iter->nodes_locked) -+ continue; -+ -+ pr_buf(out, " iter %u %s:", -+ iter->idx, -+ bch2_btree_ids[iter->btree_id]); -+ bch2_bpos_to_text(out, iter->pos); -+ pr_buf(out, "\n"); -+ -+ for (l = 0; l < BTREE_MAX_DEPTH; l++) { -+ if (btree_node_locked(iter, l)) { -+ pr_buf(out, " %s l=%u ", -+ btree_node_intent_locked(iter, l) ? 
"i" : "r", l); -+ bch2_btree_iter_node_to_text(out, -+ (void *) iter->l[l].b, -+ btree_iter_type(iter)); -+ pr_buf(out, "\n"); -+ } -+ } -+ } -+ -+ b = READ_ONCE(trans->locking); -+ if (b) { -+ pr_buf(out, " locking iter %u l=%u %s:", -+ trans->locking_iter_idx, -+ trans->locking_level, -+ bch2_btree_ids[trans->locking_btree_id]); -+ bch2_bpos_to_text(out, trans->locking_pos); -+ -+ -+ pr_buf(out, " node "); -+ bch2_btree_iter_node_to_text(out, -+ (void *) b, -+ btree_iter_type(&trans->iters[trans->locking_iter_idx])); -+ pr_buf(out, "\n"); -+ } -+ } -+ mutex_unlock(&c->btree_trans_lock); -+#endif -+} -+ -+void bch2_fs_btree_iter_exit(struct bch_fs *c) -+{ -+ mempool_exit(&c->btree_iters_pool); -+} -+ -+int bch2_fs_btree_iter_init(struct bch_fs *c) -+{ -+ unsigned nr = BTREE_ITER_MAX; -+ -+ INIT_LIST_HEAD(&c->btree_trans_list); -+ mutex_init(&c->btree_trans_lock); -+ -+ return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, -+ sizeof(struct btree_iter) * nr + -+ sizeof(struct btree_insert_entry) * nr + -+ sizeof(struct btree_insert_entry) * nr); -+} -diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h -new file mode 100644 -index 000000000000..bd9ec3ec9a92 ---- /dev/null -+++ b/fs/bcachefs/btree_iter.h -@@ -0,0 +1,314 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BTREE_ITER_H -+#define _BCACHEFS_BTREE_ITER_H -+ -+#include "bset.h" -+#include "btree_types.h" -+ -+static inline void btree_iter_set_dirty(struct btree_iter *iter, -+ enum btree_iter_uptodate u) -+{ -+ iter->uptodate = max_t(unsigned, iter->uptodate, u); -+} -+ -+static inline struct btree *btree_iter_node(struct btree_iter *iter, -+ unsigned level) -+{ -+ return level < BTREE_MAX_DEPTH ? 
iter->l[level].b : NULL; -+} -+ -+static inline bool btree_node_lock_seq_matches(const struct btree_iter *iter, -+ const struct btree *b, unsigned level) -+{ -+ /* -+ * We don't compare the low bits of the lock sequence numbers because -+ * @iter might have taken a write lock on @b, and we don't want to skip -+ * the linked iterator if the sequence numbers were equal before taking -+ * that write lock. The lock sequence number is incremented by taking -+ * and releasing write locks and is even when unlocked: -+ */ -+ return iter->l[level].lock_seq >> 1 == b->c.lock.state.seq >> 1; -+} -+ -+static inline struct btree *btree_node_parent(struct btree_iter *iter, -+ struct btree *b) -+{ -+ return btree_iter_node(iter, b->c.level + 1); -+} -+ -+static inline bool btree_trans_has_multiple_iters(const struct btree_trans *trans) -+{ -+ return hweight64(trans->iters_linked) > 1; -+} -+ -+static inline int btree_iter_err(const struct btree_iter *iter) -+{ -+ return iter->flags & BTREE_ITER_ERROR ? 
-EIO : 0; -+} -+ -+/* Iterate over iters within a transaction: */ -+ -+#define trans_for_each_iter_all(_trans, _iter) \ -+ for (_iter = (_trans)->iters; \ -+ _iter < (_trans)->iters + (_trans)->nr_iters; \ -+ _iter++) -+ -+static inline struct btree_iter * -+__trans_next_iter(struct btree_trans *trans, unsigned idx) -+{ -+ EBUG_ON(idx < trans->nr_iters && trans->iters[idx].idx != idx); -+ -+ for (; idx < trans->nr_iters; idx++) -+ if (trans->iters_linked & (1ULL << idx)) -+ return &trans->iters[idx]; -+ -+ return NULL; -+} -+ -+#define trans_for_each_iter(_trans, _iter) \ -+ for (_iter = __trans_next_iter((_trans), 0); \ -+ (_iter); \ -+ _iter = __trans_next_iter((_trans), (_iter)->idx + 1)) -+ -+static inline bool __iter_has_node(const struct btree_iter *iter, -+ const struct btree *b) -+{ -+ return iter->l[b->c.level].b == b && -+ btree_node_lock_seq_matches(iter, b, b->c.level); -+} -+ -+static inline struct btree_iter * -+__trans_next_iter_with_node(struct btree_trans *trans, struct btree *b, -+ unsigned idx) -+{ -+ struct btree_iter *iter = __trans_next_iter(trans, idx); -+ -+ while (iter && !__iter_has_node(iter, b)) -+ iter = __trans_next_iter(trans, iter->idx + 1); -+ -+ return iter; -+} -+ -+#define trans_for_each_iter_with_node(_trans, _b, _iter) \ -+ for (_iter = __trans_next_iter_with_node((_trans), (_b), 0); \ -+ (_iter); \ -+ _iter = __trans_next_iter_with_node((_trans), (_b), \ -+ (_iter)->idx + 1)) -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_btree_trans_verify_iters(struct btree_trans *, struct btree *); -+void bch2_btree_trans_verify_locks(struct btree_trans *); -+#else -+static inline void bch2_btree_trans_verify_iters(struct btree_trans *trans, -+ struct btree *b) {} -+static inline void bch2_btree_trans_verify_locks(struct btree_trans *iter) {} -+#endif -+ -+void bch2_btree_iter_fix_key_modified(struct btree_iter *, struct btree *, -+ struct bkey_packed *); -+void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *, -+ struct 
btree_node_iter *, struct bkey_packed *, -+ unsigned, unsigned); -+ -+bool bch2_btree_iter_relock(struct btree_iter *, bool); -+bool bch2_trans_relock(struct btree_trans *); -+void bch2_trans_unlock(struct btree_trans *); -+ -+bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned); -+bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *, unsigned); -+ -+static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter, -+ unsigned new_locks_want) -+{ -+ new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); -+ -+ return iter->locks_want < new_locks_want -+ ? (!iter->trans->nounlock -+ ? __bch2_btree_iter_upgrade(iter, new_locks_want) -+ : __bch2_btree_iter_upgrade_nounlock(iter, new_locks_want)) -+ : iter->uptodate <= BTREE_ITER_NEED_PEEK; -+} -+ -+void __bch2_btree_iter_downgrade(struct btree_iter *, unsigned); -+ -+static inline void bch2_btree_iter_downgrade(struct btree_iter *iter) -+{ -+ if (iter->locks_want > (iter->flags & BTREE_ITER_INTENT) ? 1 : 0) -+ __bch2_btree_iter_downgrade(iter, 0); -+} -+ -+void bch2_trans_downgrade(struct btree_trans *); -+ -+void bch2_btree_iter_node_replace(struct btree_iter *, struct btree *); -+void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *); -+ -+void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *); -+ -+int __must_check __bch2_btree_iter_traverse(struct btree_iter *); -+ -+static inline int __must_check -+bch2_btree_iter_traverse(struct btree_iter *iter) -+{ -+ return iter->uptodate >= BTREE_ITER_NEED_RELOCK -+ ? 
__bch2_btree_iter_traverse(iter) -+ : 0; -+} -+ -+int bch2_btree_iter_traverse_all(struct btree_trans *); -+ -+struct btree *bch2_btree_iter_peek_node(struct btree_iter *); -+struct btree *bch2_btree_iter_next_node(struct btree_iter *); -+ -+struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *); -+struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); -+ -+struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *); -+struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *); -+ -+struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *); -+struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); -+ -+struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); -+struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *); -+ -+struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *); -+ -+void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos); -+void __bch2_btree_iter_set_pos(struct btree_iter *, struct bpos, bool); -+void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos); -+ -+static inline int btree_iter_cmp(const struct btree_iter *l, -+ const struct btree_iter *r) -+{ -+ return cmp_int(l->btree_id, r->btree_id) ?: -+ -cmp_int(btree_iter_type(l), btree_iter_type(r)) ?: -+ bkey_cmp(l->pos, r->pos); -+} -+ -+/* -+ * Unlocks before scheduling -+ * Note: does not revalidate iterator -+ */ -+static inline int bch2_trans_cond_resched(struct btree_trans *trans) -+{ -+ if (need_resched() || race_fault()) { -+ bch2_trans_unlock(trans); -+ schedule(); -+ return bch2_trans_relock(trans) ? 
0 : -EINTR; -+ } else { -+ return 0; -+ } -+} -+ -+#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \ -+ _locks_want, _depth, _flags, _b) \ -+ for (iter = bch2_trans_get_node_iter((_trans), (_btree_id), \ -+ _start, _locks_want, _depth, _flags), \ -+ _b = bch2_btree_iter_peek_node(_iter); \ -+ (_b); \ -+ (_b) = bch2_btree_iter_next_node(_iter)) -+ -+#define for_each_btree_node(_trans, _iter, _btree_id, _start, \ -+ _flags, _b) \ -+ __for_each_btree_node(_trans, _iter, _btree_id, _start, \ -+ 0, 0, _flags, _b) -+ -+static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, -+ unsigned flags) -+{ -+ if ((flags & BTREE_ITER_TYPE) == BTREE_ITER_CACHED) -+ return bch2_btree_iter_peek_cached(iter); -+ else -+ return flags & BTREE_ITER_SLOTS -+ ? bch2_btree_iter_peek_slot(iter) -+ : bch2_btree_iter_peek(iter); -+} -+ -+static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter, -+ unsigned flags) -+{ -+ return flags & BTREE_ITER_SLOTS -+ ? bch2_btree_iter_next_slot(iter) -+ : bch2_btree_iter_next(iter); -+} -+ -+static inline int bkey_err(struct bkey_s_c k) -+{ -+ return PTR_ERR_OR_ZERO(k.k); -+} -+ -+#define for_each_btree_key(_trans, _iter, _btree_id, \ -+ _start, _flags, _k, _ret) \ -+ for ((_ret) = PTR_ERR_OR_ZERO((_iter) = \ -+ bch2_trans_get_iter((_trans), (_btree_id), \ -+ (_start), (_flags))) ?: \ -+ PTR_ERR_OR_ZERO(((_k) = \ -+ __bch2_btree_iter_peek(_iter, _flags)).k); \ -+ !_ret && (_k).k; \ -+ (_ret) = PTR_ERR_OR_ZERO(((_k) = \ -+ __bch2_btree_iter_next(_iter, _flags)).k)) -+ -+#define for_each_btree_key_continue(_iter, _flags, _k, _ret) \ -+ for ((_k) = __bch2_btree_iter_peek(_iter, _flags); \ -+ !((_ret) = bkey_err(_k)) && (_k).k; \ -+ (_k) = __bch2_btree_iter_next(_iter, _flags)) -+ -+/* new multiple iterator interface: */ -+ -+int bch2_trans_iter_put(struct btree_trans *, struct btree_iter *); -+int bch2_trans_iter_free(struct btree_trans *, struct btree_iter *); -+ -+void bch2_trans_unlink_iters(struct 
btree_trans *); -+ -+struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id, -+ struct bpos, unsigned); -+ -+static inline struct btree_iter * -+bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id, -+ struct bpos pos, unsigned flags) -+{ -+ struct btree_iter *iter = -+ __bch2_trans_get_iter(trans, btree_id, pos, flags); -+ -+ if (!IS_ERR(iter)) -+ iter->ip_allocated = _THIS_IP_; -+ return iter; -+} -+ -+struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *, -+ struct btree_iter *); -+static inline struct btree_iter * -+bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src) -+{ -+ struct btree_iter *iter = -+ __bch2_trans_copy_iter(trans, src); -+ -+ if (!IS_ERR(iter)) -+ iter->ip_allocated = _THIS_IP_; -+ return iter; -+ -+} -+ -+struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *, -+ enum btree_id, struct bpos, -+ unsigned, unsigned, unsigned); -+ -+#define TRANS_RESET_NOTRAVERSE (1 << 0) -+ -+void bch2_trans_reset(struct btree_trans *, unsigned); -+ -+static inline void bch2_trans_begin(struct btree_trans *trans) -+{ -+ return bch2_trans_reset(trans, 0); -+} -+ -+void *bch2_trans_kmalloc(struct btree_trans *, size_t); -+void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t); -+int bch2_trans_exit(struct btree_trans *); -+ -+void bch2_btree_trans_to_text(struct printbuf *, struct bch_fs *); -+ -+void bch2_fs_btree_iter_exit(struct bch_fs *); -+int bch2_fs_btree_iter_init(struct bch_fs *); -+ -+#endif /* _BCACHEFS_BTREE_ITER_H */ -diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c -new file mode 100644 -index 000000000000..d73cc8ddadac ---- /dev/null -+++ b/fs/bcachefs/btree_key_cache.c -@@ -0,0 +1,519 @@ -+ -+#include "bcachefs.h" -+#include "btree_cache.h" -+#include "btree_iter.h" -+#include "btree_key_cache.h" -+#include "btree_locking.h" -+#include "btree_update.h" -+#include "error.h" -+#include "journal.h" -+#include 
"journal_reclaim.h" -+ -+#include -+ -+static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, -+ const void *obj) -+{ -+ const struct bkey_cached *ck = obj; -+ const struct bkey_cached_key *key = arg->key; -+ -+ return cmp_int(ck->key.btree_id, key->btree_id) ?: -+ bkey_cmp(ck->key.pos, key->pos); -+} -+ -+static const struct rhashtable_params bch2_btree_key_cache_params = { -+ .head_offset = offsetof(struct bkey_cached, hash), -+ .key_offset = offsetof(struct bkey_cached, key), -+ .key_len = sizeof(struct bkey_cached_key), -+ .obj_cmpfn = bch2_btree_key_cache_cmp_fn, -+}; -+ -+__flatten -+static inline struct bkey_cached * -+btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos) -+{ -+ struct bkey_cached_key key = { -+ .btree_id = btree_id, -+ .pos = pos, -+ }; -+ -+ return rhashtable_lookup_fast(&c->btree_key_cache.table, &key, -+ bch2_btree_key_cache_params); -+} -+ -+static bool bkey_cached_lock_for_evict(struct bkey_cached *ck) -+{ -+ if (!six_trylock_intent(&ck->c.lock)) -+ return false; -+ -+ if (!six_trylock_write(&ck->c.lock)) { -+ six_unlock_intent(&ck->c.lock); -+ return false; -+ } -+ -+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { -+ six_unlock_write(&ck->c.lock); -+ six_unlock_intent(&ck->c.lock); -+ return false; -+ } -+ -+ return true; -+} -+ -+static void bkey_cached_evict(struct btree_key_cache *c, -+ struct bkey_cached *ck) -+{ -+ BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash, -+ bch2_btree_key_cache_params)); -+ memset(&ck->key, ~0, sizeof(ck->key)); -+} -+ -+static void bkey_cached_free(struct btree_key_cache *c, -+ struct bkey_cached *ck) -+{ -+ list_move(&ck->list, &c->freed); -+ -+ kfree(ck->k); -+ ck->k = NULL; -+ ck->u64s = 0; -+ -+ six_unlock_write(&ck->c.lock); -+ six_unlock_intent(&ck->c.lock); -+} -+ -+static struct bkey_cached * -+bkey_cached_alloc(struct btree_key_cache *c) -+{ -+ struct bkey_cached *ck; -+ -+ list_for_each_entry(ck, &c->freed, list) -+ if 
(bkey_cached_lock_for_evict(ck)) -+ return ck; -+ -+ list_for_each_entry(ck, &c->clean, list) -+ if (bkey_cached_lock_for_evict(ck)) { -+ bkey_cached_evict(c, ck); -+ return ck; -+ } -+ -+ ck = kzalloc(sizeof(*ck), GFP_NOFS); -+ if (!ck) -+ return NULL; -+ -+ INIT_LIST_HEAD(&ck->list); -+ six_lock_init(&ck->c.lock); -+ BUG_ON(!six_trylock_intent(&ck->c.lock)); -+ BUG_ON(!six_trylock_write(&ck->c.lock)); -+ -+ return ck; -+} -+ -+static struct bkey_cached * -+btree_key_cache_create(struct btree_key_cache *c, -+ enum btree_id btree_id, -+ struct bpos pos) -+{ -+ struct bkey_cached *ck; -+ -+ ck = bkey_cached_alloc(c); -+ if (!ck) -+ return ERR_PTR(-ENOMEM); -+ -+ ck->c.level = 0; -+ ck->c.btree_id = btree_id; -+ ck->key.btree_id = btree_id; -+ ck->key.pos = pos; -+ ck->valid = false; -+ -+ BUG_ON(ck->flags); -+ -+ if (rhashtable_lookup_insert_fast(&c->table, -+ &ck->hash, -+ bch2_btree_key_cache_params)) { -+ /* We raced with another fill: */ -+ bkey_cached_free(c, ck); -+ return NULL; -+ } -+ -+ list_move(&ck->list, &c->clean); -+ six_unlock_write(&ck->c.lock); -+ -+ return ck; -+} -+ -+static int btree_key_cache_fill(struct btree_trans *trans, -+ struct btree_iter *ck_iter, -+ struct bkey_cached *ck) -+{ -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ unsigned new_u64s = 0; -+ struct bkey_i *new_k = NULL; -+ int ret; -+ -+ iter = bch2_trans_get_iter(trans, ck->key.btree_id, -+ ck->key.pos, BTREE_ITER_SLOTS); -+ if (IS_ERR(iter)) -+ return PTR_ERR(iter); -+ -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) { -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+ } -+ -+ if (!bch2_btree_node_relock(ck_iter, 0)) { -+ bch2_trans_iter_put(trans, iter); -+ trace_transaction_restart_ip(trans->ip, _THIS_IP_); -+ return -EINTR; -+ } -+ -+ if (k.k->u64s > ck->u64s) { -+ new_u64s = roundup_pow_of_two(k.k->u64s); -+ new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS); -+ if (!new_k) { -+ bch2_trans_iter_put(trans, iter); -+ return -ENOMEM; -+ } -+ } 
-+ -+ bch2_btree_node_lock_write(ck_iter->l[0].b, ck_iter); -+ if (new_k) { -+ kfree(ck->k); -+ ck->u64s = new_u64s; -+ ck->k = new_k; -+ } -+ -+ bkey_reassemble(ck->k, k); -+ ck->valid = true; -+ bch2_btree_node_unlock_write(ck_iter->l[0].b, ck_iter); -+ -+ /* We're not likely to need this iterator again: */ -+ bch2_trans_iter_free(trans, iter); -+ -+ return 0; -+} -+ -+static int bkey_cached_check_fn(struct six_lock *lock, void *p) -+{ -+ struct bkey_cached *ck = container_of(lock, struct bkey_cached, c.lock); -+ const struct btree_iter *iter = p; -+ -+ return ck->key.btree_id == iter->btree_id && -+ !bkey_cmp(ck->key.pos, iter->pos) ? 0 : -1; -+} -+ -+int bch2_btree_iter_traverse_cached(struct btree_iter *iter) -+{ -+ struct btree_trans *trans = iter->trans; -+ struct bch_fs *c = trans->c; -+ struct bkey_cached *ck; -+ int ret = 0; -+ -+ BUG_ON(iter->level); -+ -+ if (btree_node_locked(iter, 0)) { -+ ck = (void *) iter->l[0].b; -+ goto fill; -+ } -+retry: -+ ck = btree_key_cache_find(c, iter->btree_id, iter->pos); -+ if (!ck) { -+ if (iter->flags & BTREE_ITER_CACHED_NOCREATE) { -+ iter->l[0].b = NULL; -+ return 0; -+ } -+ -+ mutex_lock(&c->btree_key_cache.lock); -+ ck = btree_key_cache_create(&c->btree_key_cache, -+ iter->btree_id, iter->pos); -+ mutex_unlock(&c->btree_key_cache.lock); -+ -+ ret = PTR_ERR_OR_ZERO(ck); -+ if (ret) -+ goto err; -+ if (!ck) -+ goto retry; -+ -+ mark_btree_node_locked(iter, 0, SIX_LOCK_intent); -+ iter->locks_want = 1; -+ } else { -+ enum six_lock_type lock_want = __btree_lock_want(iter, 0); -+ -+ if (!btree_node_lock((void *) ck, iter->pos, 0, iter, lock_want, -+ bkey_cached_check_fn, iter)) { -+ if (ck->key.btree_id != iter->btree_id || -+ bkey_cmp(ck->key.pos, iter->pos)) { -+ goto retry; -+ } -+ -+ trace_transaction_restart_ip(trans->ip, _THIS_IP_); -+ ret = -EINTR; -+ goto err; -+ } -+ -+ if (ck->key.btree_id != iter->btree_id || -+ bkey_cmp(ck->key.pos, iter->pos)) { -+ six_unlock_type(&ck->c.lock, lock_want); -+ goto retry; 
-+ } -+ -+ mark_btree_node_locked(iter, 0, lock_want); -+ } -+ -+ iter->l[0].lock_seq = ck->c.lock.state.seq; -+ iter->l[0].b = (void *) ck; -+fill: -+ if (!ck->valid && !(iter->flags & BTREE_ITER_CACHED_NOFILL)) { -+ if (!btree_node_intent_locked(iter, 0)) -+ bch2_btree_iter_upgrade(iter, 1); -+ if (!btree_node_intent_locked(iter, 0)) { -+ trace_transaction_restart_ip(trans->ip, _THIS_IP_); -+ ret = -EINTR; -+ goto err; -+ } -+ -+ ret = btree_key_cache_fill(trans, iter, ck); -+ if (ret) -+ goto err; -+ } -+ -+ iter->uptodate = BTREE_ITER_NEED_PEEK; -+ bch2_btree_iter_downgrade(iter); -+ return ret; -+err: -+ if (ret != -EINTR) { -+ btree_node_unlock(iter, 0); -+ iter->flags |= BTREE_ITER_ERROR; -+ iter->l[0].b = BTREE_ITER_NO_NODE_ERROR; -+ } -+ return ret; -+} -+ -+static int btree_key_cache_flush_pos(struct btree_trans *trans, -+ struct bkey_cached_key key, -+ u64 journal_seq, -+ bool evict) -+{ -+ struct bch_fs *c = trans->c; -+ struct journal *j = &c->journal; -+ struct btree_iter *c_iter = NULL, *b_iter = NULL; -+ struct bkey_cached *ck; -+ int ret; -+ -+ b_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos, -+ BTREE_ITER_SLOTS| -+ BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(b_iter); -+ if (ret) -+ goto out; -+ -+ c_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos, -+ BTREE_ITER_CACHED| -+ BTREE_ITER_CACHED_NOFILL| -+ BTREE_ITER_CACHED_NOCREATE| -+ BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(c_iter); -+ if (ret) -+ goto out; -+retry: -+ ret = bch2_btree_iter_traverse(c_iter); -+ if (ret) -+ goto err; -+ -+ ck = (void *) c_iter->l[0].b; -+ if (!ck || -+ (journal_seq && ck->journal.seq != journal_seq)) -+ goto out; -+ -+ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { -+ if (!evict) -+ goto out; -+ goto evict; -+ } -+ -+ ret = bch2_btree_iter_traverse(b_iter) ?: -+ bch2_trans_update(trans, b_iter, ck->k, BTREE_TRIGGER_NORUN) ?: -+ bch2_trans_commit(trans, NULL, NULL, -+ BTREE_INSERT_NOUNLOCK| -+ BTREE_INSERT_NOCHECK_RW| -+ BTREE_INSERT_NOFAIL| -+ 
BTREE_INSERT_USE_RESERVE| -+ BTREE_INSERT_USE_ALLOC_RESERVE| -+ BTREE_INSERT_JOURNAL_RESERVED| -+ BTREE_INSERT_JOURNAL_RECLAIM); -+err: -+ if (ret == -EINTR) -+ goto retry; -+ -+ BUG_ON(ret && !bch2_journal_error(j)); -+ -+ if (ret) -+ goto out; -+ -+ bch2_journal_pin_drop(j, &ck->journal); -+ bch2_journal_preres_put(j, &ck->res); -+ clear_bit(BKEY_CACHED_DIRTY, &ck->flags); -+ -+ if (!evict) { -+ mutex_lock(&c->btree_key_cache.lock); -+ list_move_tail(&ck->list, &c->btree_key_cache.clean); -+ mutex_unlock(&c->btree_key_cache.lock); -+ } else { -+evict: -+ BUG_ON(!btree_node_intent_locked(c_iter, 0)); -+ -+ mark_btree_node_unlocked(c_iter, 0); -+ c_iter->l[0].b = NULL; -+ -+ six_lock_write(&ck->c.lock, NULL, NULL); -+ -+ mutex_lock(&c->btree_key_cache.lock); -+ bkey_cached_evict(&c->btree_key_cache, ck); -+ bkey_cached_free(&c->btree_key_cache, ck); -+ mutex_unlock(&c->btree_key_cache.lock); -+ } -+out: -+ bch2_trans_iter_put(trans, b_iter); -+ bch2_trans_iter_put(trans, c_iter); -+ return ret; -+} -+ -+static void btree_key_cache_journal_flush(struct journal *j, -+ struct journal_entry_pin *pin, -+ u64 seq) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct bkey_cached *ck = -+ container_of(pin, struct bkey_cached, journal); -+ struct bkey_cached_key key; -+ struct btree_trans trans; -+ -+ six_lock_read(&ck->c.lock, NULL, NULL); -+ key = READ_ONCE(ck->key); -+ -+ if (ck->journal.seq != seq || -+ !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { -+ six_unlock_read(&ck->c.lock); -+ return; -+ } -+ six_unlock_read(&ck->c.lock); -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ btree_key_cache_flush_pos(&trans, key, seq, false); -+ bch2_trans_exit(&trans); -+} -+ -+/* -+ * Flush and evict a key from the key cache: -+ */ -+int bch2_btree_key_cache_flush(struct btree_trans *trans, -+ enum btree_id id, struct bpos pos) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_cached_key key = { id, pos }; -+ -+ /* Fastpath - assume it won't be found: */ -+ if 
(!btree_key_cache_find(c, id, pos)) -+ return 0; -+ -+ return btree_key_cache_flush_pos(trans, key, 0, true); -+} -+ -+bool bch2_btree_insert_key_cached(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_i *insert) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_cached *ck = (void *) iter->l[0].b; -+ -+ BUG_ON(insert->u64s > ck->u64s); -+ -+ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { -+ int difference; -+ -+ BUG_ON(jset_u64s(insert->u64s) > trans->journal_preres.u64s); -+ -+ difference = jset_u64s(insert->u64s) - ck->res.u64s; -+ if (difference > 0) { -+ trans->journal_preres.u64s -= difference; -+ ck->res.u64s += difference; -+ } -+ } -+ -+ bkey_copy(ck->k, insert); -+ ck->valid = true; -+ -+ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { -+ mutex_lock(&c->btree_key_cache.lock); -+ list_del_init(&ck->list); -+ -+ set_bit(BKEY_CACHED_DIRTY, &ck->flags); -+ mutex_unlock(&c->btree_key_cache.lock); -+ } -+ -+ bch2_journal_pin_update(&c->journal, trans->journal_res.seq, -+ &ck->journal, btree_key_cache_journal_flush); -+ return true; -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_btree_key_cache_verify_clean(struct btree_trans *trans, -+ enum btree_id id, struct bpos pos) -+{ -+ BUG_ON(btree_key_cache_find(trans->c, id, pos)); -+} -+#endif -+ -+void bch2_fs_btree_key_cache_exit(struct btree_key_cache *c) -+{ -+ struct bkey_cached *ck, *n; -+ -+ mutex_lock(&c->lock); -+ list_for_each_entry_safe(ck, n, &c->clean, list) { -+ kfree(ck->k); -+ kfree(ck); -+ } -+ list_for_each_entry_safe(ck, n, &c->freed, list) -+ kfree(ck); -+ mutex_unlock(&c->lock); -+ -+ rhashtable_destroy(&c->table); -+} -+ -+void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) -+{ -+ mutex_init(&c->lock); -+ INIT_LIST_HEAD(&c->freed); -+ INIT_LIST_HEAD(&c->clean); -+} -+ -+int bch2_fs_btree_key_cache_init(struct btree_key_cache *c) -+{ -+ return rhashtable_init(&c->table, &bch2_btree_key_cache_params); -+} -+ -+void 
bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) -+{ -+ struct bucket_table *tbl; -+ struct bkey_cached *ck; -+ struct rhash_head *pos; -+ size_t i; -+ -+ mutex_lock(&c->lock); -+ tbl = rht_dereference_rcu(c->table.tbl, &c->table); -+ -+ for (i = 0; i < tbl->size; i++) { -+ rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { -+ pr_buf(out, "%s:", -+ bch2_btree_ids[ck->key.btree_id]); -+ bch2_bpos_to_text(out, ck->key.pos); -+ -+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) -+ pr_buf(out, " journal seq %llu", ck->journal.seq); -+ pr_buf(out, "\n"); -+ } -+ } -+ mutex_unlock(&c->lock); -+} -diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h -new file mode 100644 -index 000000000000..b1756c6c622c ---- /dev/null -+++ b/fs/bcachefs/btree_key_cache.h -@@ -0,0 +1,25 @@ -+#ifndef _BCACHEFS_BTREE_KEY_CACHE_H -+#define _BCACHEFS_BTREE_KEY_CACHE_H -+ -+int bch2_btree_iter_traverse_cached(struct btree_iter *); -+ -+bool bch2_btree_insert_key_cached(struct btree_trans *, -+ struct btree_iter *, struct bkey_i *); -+int bch2_btree_key_cache_flush(struct btree_trans *, -+ enum btree_id, struct bpos); -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_btree_key_cache_verify_clean(struct btree_trans *, -+ enum btree_id, struct bpos); -+#else -+static inline void -+bch2_btree_key_cache_verify_clean(struct btree_trans *trans, -+ enum btree_id id, struct bpos pos) {} -+#endif -+ -+void bch2_fs_btree_key_cache_exit(struct btree_key_cache *); -+void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *); -+int bch2_fs_btree_key_cache_init(struct btree_key_cache *); -+ -+void bch2_btree_key_cache_to_text(struct printbuf *, struct btree_key_cache *); -+ -+#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */ -diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h -new file mode 100644 -index 000000000000..81fbf3e18647 ---- /dev/null -+++ b/fs/bcachefs/btree_locking.h -@@ -0,0 +1,257 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef 
_BCACHEFS_BTREE_LOCKING_H -+#define _BCACHEFS_BTREE_LOCKING_H -+ -+/* -+ * Only for internal btree use: -+ * -+ * The btree iterator tracks what locks it wants to take, and what locks it -+ * currently has - here we have wrappers for locking/unlocking btree nodes and -+ * updating the iterator state -+ */ -+ -+#include -+ -+#include "btree_iter.h" -+ -+/* matches six lock types */ -+enum btree_node_locked_type { -+ BTREE_NODE_UNLOCKED = -1, -+ BTREE_NODE_READ_LOCKED = SIX_LOCK_read, -+ BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent, -+}; -+ -+static inline int btree_node_locked_type(struct btree_iter *iter, -+ unsigned level) -+{ -+ /* -+ * We're relying on the fact that if nodes_intent_locked is set -+ * nodes_locked must be set as well, so that we can compute without -+ * branches: -+ */ -+ return BTREE_NODE_UNLOCKED + -+ ((iter->nodes_locked >> level) & 1) + -+ ((iter->nodes_intent_locked >> level) & 1); -+} -+ -+static inline bool btree_node_intent_locked(struct btree_iter *iter, -+ unsigned level) -+{ -+ return btree_node_locked_type(iter, level) == BTREE_NODE_INTENT_LOCKED; -+} -+ -+static inline bool btree_node_read_locked(struct btree_iter *iter, -+ unsigned level) -+{ -+ return btree_node_locked_type(iter, level) == BTREE_NODE_READ_LOCKED; -+} -+ -+static inline bool btree_node_locked(struct btree_iter *iter, unsigned level) -+{ -+ return iter->nodes_locked & (1 << level); -+} -+ -+static inline void mark_btree_node_unlocked(struct btree_iter *iter, -+ unsigned level) -+{ -+ iter->nodes_locked &= ~(1 << level); -+ iter->nodes_intent_locked &= ~(1 << level); -+} -+ -+static inline void mark_btree_node_locked(struct btree_iter *iter, -+ unsigned level, -+ enum six_lock_type type) -+{ -+ /* relying on this to avoid a branch */ -+ BUILD_BUG_ON(SIX_LOCK_read != 0); -+ BUILD_BUG_ON(SIX_LOCK_intent != 1); -+ -+ iter->nodes_locked |= 1 << level; -+ iter->nodes_intent_locked |= type << level; -+} -+ -+static inline void mark_btree_node_intent_locked(struct btree_iter 
*iter, -+ unsigned level) -+{ -+ mark_btree_node_locked(iter, level, SIX_LOCK_intent); -+} -+ -+static inline enum six_lock_type __btree_lock_want(struct btree_iter *iter, int level) -+{ -+ return level < iter->locks_want -+ ? SIX_LOCK_intent -+ : SIX_LOCK_read; -+} -+ -+static inline enum btree_node_locked_type -+btree_lock_want(struct btree_iter *iter, int level) -+{ -+ if (level < iter->level) -+ return BTREE_NODE_UNLOCKED; -+ if (level < iter->locks_want) -+ return BTREE_NODE_INTENT_LOCKED; -+ if (level == iter->level) -+ return BTREE_NODE_READ_LOCKED; -+ return BTREE_NODE_UNLOCKED; -+} -+ -+static inline void __btree_node_unlock(struct btree_iter *iter, unsigned level) -+{ -+ int lock_type = btree_node_locked_type(iter, level); -+ -+ EBUG_ON(level >= BTREE_MAX_DEPTH); -+ -+ if (lock_type != BTREE_NODE_UNLOCKED) -+ six_unlock_type(&iter->l[level].b->c.lock, lock_type); -+ mark_btree_node_unlocked(iter, level); -+} -+ -+static inline void btree_node_unlock(struct btree_iter *iter, unsigned level) -+{ -+ EBUG_ON(!level && iter->trans->nounlock); -+ -+ __btree_node_unlock(iter, level); -+} -+ -+static inline void __bch2_btree_iter_unlock(struct btree_iter *iter) -+{ -+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); -+ -+ while (iter->nodes_locked) -+ btree_node_unlock(iter, __ffs(iter->nodes_locked)); -+} -+ -+static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type) -+{ -+ switch (type) { -+ case SIX_LOCK_read: -+ return BCH_TIME_btree_lock_contended_read; -+ case SIX_LOCK_intent: -+ return BCH_TIME_btree_lock_contended_intent; -+ case SIX_LOCK_write: -+ return BCH_TIME_btree_lock_contended_write; -+ default: -+ BUG(); -+ } -+} -+ -+/* -+ * wrapper around six locks that just traces lock contended time -+ */ -+static inline void __btree_node_lock_type(struct bch_fs *c, struct btree *b, -+ enum six_lock_type type) -+{ -+ u64 start_time = local_clock(); -+ -+ six_lock_type(&b->c.lock, type, NULL, NULL); -+ 
bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time); -+} -+ -+static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b, -+ enum six_lock_type type) -+{ -+ if (!six_trylock_type(&b->c.lock, type)) -+ __btree_node_lock_type(c, b, type); -+} -+ -+/* -+ * Lock a btree node if we already have it locked on one of our linked -+ * iterators: -+ */ -+static inline bool btree_node_lock_increment(struct btree_trans *trans, -+ struct btree *b, unsigned level, -+ enum btree_node_locked_type want) -+{ -+ struct btree_iter *iter; -+ -+ trans_for_each_iter(trans, iter) -+ if (iter->l[level].b == b && -+ btree_node_locked_type(iter, level) >= want) { -+ six_lock_increment(&b->c.lock, want); -+ return true; -+ } -+ -+ return false; -+} -+ -+bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned, -+ struct btree_iter *, enum six_lock_type, -+ six_lock_should_sleep_fn, void *); -+ -+static inline bool btree_node_lock(struct btree *b, -+ struct bpos pos, unsigned level, -+ struct btree_iter *iter, -+ enum six_lock_type type, -+ six_lock_should_sleep_fn should_sleep_fn, void *p) -+{ -+ struct btree_trans *trans = iter->trans; -+ bool ret; -+ -+ EBUG_ON(level >= BTREE_MAX_DEPTH); -+ EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx))); -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ trans->locking = b; -+ trans->locking_iter_idx = iter->idx; -+ trans->locking_pos = pos; -+ trans->locking_btree_id = iter->btree_id; -+ trans->locking_level = level; -+#endif -+ ret = likely(six_trylock_type(&b->c.lock, type)) || -+ btree_node_lock_increment(trans, b, level, type) || -+ __bch2_btree_node_lock(b, pos, level, iter, type, -+ should_sleep_fn, p); -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ trans->locking = NULL; -+#endif -+ return ret; -+} -+ -+bool __bch2_btree_node_relock(struct btree_iter *, unsigned); -+ -+static inline bool bch2_btree_node_relock(struct btree_iter *iter, -+ unsigned level) -+{ -+ EBUG_ON(btree_node_locked(iter, level) && -+ 
btree_node_locked_type(iter, level) != -+ __btree_lock_want(iter, level)); -+ -+ return likely(btree_node_locked(iter, level)) || -+ __bch2_btree_node_relock(iter, level); -+} -+ -+/* -+ * Updates the saved lock sequence number, so that bch2_btree_node_relock() will -+ * succeed: -+ */ -+static inline void -+bch2_btree_node_unlock_write_inlined(struct btree *b, struct btree_iter *iter) -+{ -+ struct btree_iter *linked; -+ -+ EBUG_ON(iter->l[b->c.level].b != b); -+ EBUG_ON(iter->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq); -+ -+ trans_for_each_iter_with_node(iter->trans, b, linked) -+ linked->l[b->c.level].lock_seq += 2; -+ -+ six_unlock_write(&b->c.lock); -+} -+ -+void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *); -+ -+void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *); -+ -+static inline void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) -+{ -+ EBUG_ON(iter->l[b->c.level].b != b); -+ EBUG_ON(iter->l[b->c.level].lock_seq != b->c.lock.state.seq); -+ -+ if (unlikely(!six_trylock_write(&b->c.lock))) -+ __bch2_btree_node_lock_write(b, iter); -+} -+ -+#endif /* _BCACHEFS_BTREE_LOCKING_H */ -+ -+ -diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h -new file mode 100644 -index 000000000000..98611b1da1ed ---- /dev/null -+++ b/fs/bcachefs/btree_types.h -@@ -0,0 +1,666 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BTREE_TYPES_H -+#define _BCACHEFS_BTREE_TYPES_H -+ -+#include -+#include -+#include -+ -+#include "bkey_methods.h" -+#include "buckets_types.h" -+#include "journal_types.h" -+ -+struct open_bucket; -+struct btree_update; -+struct btree_trans; -+ -+#define MAX_BSETS 3U -+ -+struct btree_nr_keys { -+ -+ /* -+ * Amount of live metadata (i.e. 
size of node after a compaction) in -+ * units of u64s -+ */ -+ u16 live_u64s; -+ u16 bset_u64s[MAX_BSETS]; -+ -+ /* live keys only: */ -+ u16 packed_keys; -+ u16 unpacked_keys; -+}; -+ -+struct bset_tree { -+ /* -+ * We construct a binary tree in an array as if the array -+ * started at 1, so that things line up on the same cachelines -+ * better: see comments in bset.c at cacheline_to_bkey() for -+ * details -+ */ -+ -+ /* size of the binary tree and prev array */ -+ u16 size; -+ -+ /* function of size - precalculated for to_inorder() */ -+ u16 extra; -+ -+ u16 data_offset; -+ u16 aux_data_offset; -+ u16 end_offset; -+ -+ struct bpos max_key; -+}; -+ -+struct btree_write { -+ struct journal_entry_pin journal; -+}; -+ -+struct btree_alloc { -+ struct open_buckets ob; -+ BKEY_PADDED(k); -+}; -+ -+struct btree_bkey_cached_common { -+ struct six_lock lock; -+ u8 level; -+ u8 btree_id; -+}; -+ -+struct btree { -+ struct btree_bkey_cached_common c; -+ -+ struct rhash_head hash; -+ u64 hash_val; -+ -+ unsigned long flags; -+ u16 written; -+ u8 nsets; -+ u8 nr_key_bits; -+ -+ struct bkey_format format; -+ -+ struct btree_node *data; -+ void *aux_data; -+ -+ /* -+ * Sets of sorted keys - the real btree node - plus a binary search tree -+ * -+ * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point -+ * to the memory we have allocated for this btree node. Additionally, -+ * set[0]->data points to the entire btree node as it exists on disk. -+ */ -+ struct bset_tree set[MAX_BSETS]; -+ -+ struct btree_nr_keys nr; -+ u16 sib_u64s[2]; -+ u16 whiteout_u64s; -+ u8 page_order; -+ u8 unpack_fn_len; -+ -+ /* -+ * XXX: add a delete sequence number, so when bch2_btree_node_relock() -+ * fails because the lock sequence number has changed - i.e. 
the -+ * contents were modified - we can still relock the node if it's still -+ * the one we want, without redoing the traversal -+ */ -+ -+ /* -+ * For asynchronous splits/interior node updates: -+ * When we do a split, we allocate new child nodes and update the parent -+ * node to point to them: we update the parent in memory immediately, -+ * but then we must wait until the children have been written out before -+ * the update to the parent can be written - this is a list of the -+ * btree_updates that are blocking this node from being -+ * written: -+ */ -+ struct list_head write_blocked; -+ -+ /* -+ * Also for asynchronous splits/interior node updates: -+ * If a btree node isn't reachable yet, we don't want to kick off -+ * another write - because that write also won't yet be reachable and -+ * marking it as completed before it's reachable would be incorrect: -+ */ -+ unsigned long will_make_reachable; -+ -+ struct open_buckets ob; -+ -+ /* lru list */ -+ struct list_head list; -+ -+ struct btree_write writes[2]; -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ bool *expensive_debug_checks; -+#endif -+ -+ /* Key/pointer for this btree node */ -+ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); -+}; -+ -+struct btree_cache { -+ struct rhashtable table; -+ bool table_init_done; -+ /* -+ * We never free a struct btree, except on shutdown - we just put it on -+ * the btree_cache_freed list and reuse it later. This simplifies the -+ * code, and it doesn't cost us much memory as the memory usage is -+ * dominated by buffers that hold the actual btree node data and those -+ * can be freed - and the number of struct btrees allocated is -+ * effectively bounded. -+ * -+ * btree_cache_freeable effectively is a small cache - we use it because -+ * high order page allocations can be rather expensive, and it's quite -+ * common to delete and allocate btree nodes in quick succession. It -+ * should never grow past ~2-3 nodes in practice. 
-+ */ -+ struct mutex lock; -+ struct list_head live; -+ struct list_head freeable; -+ struct list_head freed; -+ -+ /* Number of elements in live + freeable lists */ -+ unsigned used; -+ unsigned reserve; -+ struct shrinker shrink; -+ -+ /* -+ * If we need to allocate memory for a new btree node and that -+ * allocation fails, we can cannibalize another node in the btree cache -+ * to satisfy the allocation - lock to guarantee only one thread does -+ * this at a time: -+ */ -+ struct task_struct *alloc_lock; -+ struct closure_waitlist alloc_wait; -+}; -+ -+struct btree_node_iter { -+ struct btree_node_iter_set { -+ u16 k, end; -+ } data[MAX_BSETS]; -+}; -+ -+enum btree_iter_type { -+ BTREE_ITER_KEYS, -+ BTREE_ITER_NODES, -+ BTREE_ITER_CACHED, -+}; -+ -+#define BTREE_ITER_TYPE ((1 << 2) - 1) -+ -+/* -+ * Iterate over all possible positions, synthesizing deleted keys for holes: -+ */ -+#define BTREE_ITER_SLOTS (1 << 2) -+/* -+ * Indicates that intent locks should be taken on leaf nodes, because we expect -+ * to be doing updates: -+ */ -+#define BTREE_ITER_INTENT (1 << 3) -+/* -+ * Causes the btree iterator code to prefetch additional btree nodes from disk: -+ */ -+#define BTREE_ITER_PREFETCH (1 << 4) -+/* -+ * Indicates that this iterator should not be reused until transaction commit, -+ * either because a pending update references it or because the update depends -+ * on that particular key being locked (e.g. 
by the str_hash code, for hash -+ * table consistency) -+ */ -+#define BTREE_ITER_KEEP_UNTIL_COMMIT (1 << 5) -+/* -+ * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for -+ * @pos or the first key strictly greater than @pos -+ */ -+#define BTREE_ITER_IS_EXTENTS (1 << 6) -+#define BTREE_ITER_ERROR (1 << 7) -+#define BTREE_ITER_SET_POS_AFTER_COMMIT (1 << 8) -+#define BTREE_ITER_CACHED_NOFILL (1 << 9) -+#define BTREE_ITER_CACHED_NOCREATE (1 << 10) -+ -+#define BTREE_ITER_USER_FLAGS \ -+ (BTREE_ITER_SLOTS \ -+ |BTREE_ITER_INTENT \ -+ |BTREE_ITER_PREFETCH \ -+ |BTREE_ITER_CACHED_NOFILL \ -+ |BTREE_ITER_CACHED_NOCREATE) -+ -+enum btree_iter_uptodate { -+ BTREE_ITER_UPTODATE = 0, -+ BTREE_ITER_NEED_PEEK = 1, -+ BTREE_ITER_NEED_RELOCK = 2, -+ BTREE_ITER_NEED_TRAVERSE = 3, -+}; -+ -+#define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1) -+#define BTREE_ITER_NO_NODE_DROP ((struct btree *) 2) -+#define BTREE_ITER_NO_NODE_LOCK_ROOT ((struct btree *) 3) -+#define BTREE_ITER_NO_NODE_UP ((struct btree *) 4) -+#define BTREE_ITER_NO_NODE_DOWN ((struct btree *) 5) -+#define BTREE_ITER_NO_NODE_INIT ((struct btree *) 6) -+#define BTREE_ITER_NO_NODE_ERROR ((struct btree *) 7) -+ -+/* -+ * @pos - iterator's current position -+ * @level - current btree depth -+ * @locks_want - btree level below which we start taking intent locks -+ * @nodes_locked - bitmask indicating which nodes in @nodes are locked -+ * @nodes_intent_locked - bitmask indicating which locks are intent locks -+ */ -+struct btree_iter { -+ struct btree_trans *trans; -+ struct bpos pos; -+ struct bpos pos_after_commit; -+ -+ u16 flags; -+ u8 idx; -+ -+ enum btree_id btree_id:4; -+ enum btree_iter_uptodate uptodate:4; -+ unsigned level:4, -+ min_depth:4, -+ locks_want:4, -+ nodes_locked:4, -+ nodes_intent_locked:4; -+ -+ struct btree_iter_level { -+ struct btree *b; -+ struct btree_node_iter iter; -+ u32 lock_seq; -+ } l[BTREE_MAX_DEPTH]; -+ -+ /* -+ * Current unpacked key - so that 
bch2_btree_iter_next()/ -+ * bch2_btree_iter_next_slot() can correctly advance pos. -+ */ -+ struct bkey k; -+ unsigned long ip_allocated; -+}; -+ -+static inline enum btree_iter_type -+btree_iter_type(const struct btree_iter *iter) -+{ -+ return iter->flags & BTREE_ITER_TYPE; -+} -+ -+static inline struct btree_iter_level *iter_l(struct btree_iter *iter) -+{ -+ return iter->l + iter->level; -+} -+ -+struct btree_key_cache { -+ struct mutex lock; -+ struct rhashtable table; -+ struct list_head freed; -+ struct list_head clean; -+}; -+ -+struct bkey_cached_key { -+ u32 btree_id; -+ struct bpos pos; -+} __attribute__((packed, aligned(4))); -+ -+#define BKEY_CACHED_DIRTY 0 -+ -+struct bkey_cached { -+ struct btree_bkey_cached_common c; -+ -+ unsigned long flags; -+ u8 u64s; -+ bool valid; -+ struct bkey_cached_key key; -+ -+ struct rhash_head hash; -+ struct list_head list; -+ -+ struct journal_preres res; -+ struct journal_entry_pin journal; -+ -+ struct bkey_i *k; -+}; -+ -+struct btree_insert_entry { -+ unsigned trigger_flags; -+ unsigned trans_triggers_run:1; -+ struct bkey_i *k; -+ struct btree_iter *iter; -+}; -+ -+#ifndef CONFIG_LOCKDEP -+#define BTREE_ITER_MAX 64 -+#else -+#define BTREE_ITER_MAX 32 -+#endif -+ -+struct btree_trans { -+ struct bch_fs *c; -+#ifdef CONFIG_BCACHEFS_DEBUG -+ struct list_head list; -+ struct btree *locking; -+ unsigned locking_iter_idx; -+ struct bpos locking_pos; -+ u8 locking_btree_id; -+ u8 locking_level; -+ pid_t pid; -+#endif -+ unsigned long ip; -+ -+ u64 iters_linked; -+ u64 iters_live; -+ u64 iters_touched; -+ -+ u8 nr_iters; -+ u8 nr_updates; -+ u8 nr_updates2; -+ u8 size; -+ unsigned used_mempool:1; -+ unsigned error:1; -+ unsigned nounlock:1; -+ unsigned need_reset:1; -+ unsigned in_traverse_all:1; -+ -+ unsigned mem_top; -+ unsigned mem_bytes; -+ void *mem; -+ -+ struct btree_iter *iters; -+ struct btree_insert_entry *updates; -+ struct btree_insert_entry *updates2; -+ -+ /* update path: */ -+ struct jset_entry 
*extra_journal_entries; -+ unsigned extra_journal_entry_u64s; -+ struct journal_entry_pin *journal_pin; -+ -+ struct journal_res journal_res; -+ struct journal_preres journal_preres; -+ u64 *journal_seq; -+ struct disk_reservation *disk_res; -+ unsigned flags; -+ unsigned journal_u64s; -+ unsigned journal_preres_u64s; -+ struct replicas_delta_list *fs_usage_deltas; -+ -+ struct btree_iter iters_onstack[2]; -+ struct btree_insert_entry updates_onstack[2]; -+ struct btree_insert_entry updates2_onstack[2]; -+}; -+ -+#define BTREE_FLAG(flag) \ -+static inline bool btree_node_ ## flag(struct btree *b) \ -+{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ -+ \ -+static inline void set_btree_node_ ## flag(struct btree *b) \ -+{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \ -+ \ -+static inline void clear_btree_node_ ## flag(struct btree *b) \ -+{ clear_bit(BTREE_NODE_ ## flag, &b->flags); } -+ -+enum btree_flags { -+ BTREE_NODE_read_in_flight, -+ BTREE_NODE_read_error, -+ BTREE_NODE_dirty, -+ BTREE_NODE_need_write, -+ BTREE_NODE_noevict, -+ BTREE_NODE_write_idx, -+ BTREE_NODE_accessed, -+ BTREE_NODE_write_in_flight, -+ BTREE_NODE_just_written, -+ BTREE_NODE_dying, -+ BTREE_NODE_fake, -+ BTREE_NODE_old_extent_overwrite, -+ BTREE_NODE_need_rewrite, -+}; -+ -+BTREE_FLAG(read_in_flight); -+BTREE_FLAG(read_error); -+BTREE_FLAG(dirty); -+BTREE_FLAG(need_write); -+BTREE_FLAG(noevict); -+BTREE_FLAG(write_idx); -+BTREE_FLAG(accessed); -+BTREE_FLAG(write_in_flight); -+BTREE_FLAG(just_written); -+BTREE_FLAG(dying); -+BTREE_FLAG(fake); -+BTREE_FLAG(old_extent_overwrite); -+BTREE_FLAG(need_rewrite); -+ -+static inline struct btree_write *btree_current_write(struct btree *b) -+{ -+ return b->writes + btree_node_write_idx(b); -+} -+ -+static inline struct btree_write *btree_prev_write(struct btree *b) -+{ -+ return b->writes + (btree_node_write_idx(b) ^ 1); -+} -+ -+static inline struct bset_tree *bset_tree_last(struct btree *b) -+{ -+ EBUG_ON(!b->nsets); -+ return b->set + b->nsets 
- 1; -+} -+ -+static inline void * -+__btree_node_offset_to_ptr(const struct btree *b, u16 offset) -+{ -+ return (void *) ((u64 *) b->data + 1 + offset); -+} -+ -+static inline u16 -+__btree_node_ptr_to_offset(const struct btree *b, const void *p) -+{ -+ u16 ret = (u64 *) p - 1 - (u64 *) b->data; -+ -+ EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p); -+ return ret; -+} -+ -+static inline struct bset *bset(const struct btree *b, -+ const struct bset_tree *t) -+{ -+ return __btree_node_offset_to_ptr(b, t->data_offset); -+} -+ -+static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t) -+{ -+ t->end_offset = -+ __btree_node_ptr_to_offset(b, vstruct_last(bset(b, t))); -+} -+ -+static inline void set_btree_bset(struct btree *b, struct bset_tree *t, -+ const struct bset *i) -+{ -+ t->data_offset = __btree_node_ptr_to_offset(b, i); -+ set_btree_bset_end(b, t); -+} -+ -+static inline struct bset *btree_bset_first(struct btree *b) -+{ -+ return bset(b, b->set); -+} -+ -+static inline struct bset *btree_bset_last(struct btree *b) -+{ -+ return bset(b, bset_tree_last(b)); -+} -+ -+static inline u16 -+__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k) -+{ -+ return __btree_node_ptr_to_offset(b, k); -+} -+ -+static inline struct bkey_packed * -+__btree_node_offset_to_key(const struct btree *b, u16 k) -+{ -+ return __btree_node_offset_to_ptr(b, k); -+} -+ -+static inline unsigned btree_bkey_first_offset(const struct bset_tree *t) -+{ -+ return t->data_offset + offsetof(struct bset, _data) / sizeof(u64); -+} -+ -+#define btree_bkey_first(_b, _t) \ -+({ \ -+ EBUG_ON(bset(_b, _t)->start != \ -+ __btree_node_offset_to_key(_b, btree_bkey_first_offset(_t)));\ -+ \ -+ bset(_b, _t)->start; \ -+}) -+ -+#define btree_bkey_last(_b, _t) \ -+({ \ -+ EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) != \ -+ vstruct_last(bset(_b, _t))); \ -+ \ -+ __btree_node_offset_to_key(_b, (_t)->end_offset); \ -+}) -+ -+static inline unsigned 
bset_u64s(struct bset_tree *t) -+{ -+ return t->end_offset - t->data_offset - -+ sizeof(struct bset) / sizeof(u64); -+} -+ -+static inline unsigned bset_dead_u64s(struct btree *b, struct bset_tree *t) -+{ -+ return bset_u64s(t) - b->nr.bset_u64s[t - b->set]; -+} -+ -+static inline unsigned bset_byte_offset(struct btree *b, void *i) -+{ -+ return i - (void *) b->data; -+} -+ -+enum btree_node_type { -+#define x(kwd, val, name) BKEY_TYPE_##kwd = val, -+ BCH_BTREE_IDS() -+#undef x -+ BKEY_TYPE_BTREE, -+}; -+ -+/* Type of a key in btree @id at level @level: */ -+static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id) -+{ -+ return level ? BKEY_TYPE_BTREE : (enum btree_node_type) id; -+} -+ -+/* Type of keys @b contains: */ -+static inline enum btree_node_type btree_node_type(struct btree *b) -+{ -+ return __btree_node_type(b->c.level, b->c.btree_id); -+} -+ -+static inline bool btree_node_type_is_extents(enum btree_node_type type) -+{ -+ switch (type) { -+ case BKEY_TYPE_EXTENTS: -+ case BKEY_TYPE_REFLINK: -+ return true; -+ default: -+ return false; -+ } -+} -+ -+static inline bool btree_node_is_extents(struct btree *b) -+{ -+ return btree_node_type_is_extents(btree_node_type(b)); -+} -+ -+static inline enum btree_node_type btree_iter_key_type(struct btree_iter *iter) -+{ -+ return __btree_node_type(iter->level, iter->btree_id); -+} -+ -+static inline bool btree_iter_is_extents(struct btree_iter *iter) -+{ -+ return btree_node_type_is_extents(btree_iter_key_type(iter)); -+} -+ -+#define BTREE_NODE_TYPE_HAS_TRIGGERS \ -+ ((1U << BKEY_TYPE_EXTENTS)| \ -+ (1U << BKEY_TYPE_ALLOC)| \ -+ (1U << BKEY_TYPE_INODES)| \ -+ (1U << BKEY_TYPE_REFLINK)| \ -+ (1U << BKEY_TYPE_EC)| \ -+ (1U << BKEY_TYPE_BTREE)) -+ -+#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ -+ ((1U << BKEY_TYPE_EXTENTS)| \ -+ (1U << BKEY_TYPE_INODES)| \ -+ (1U << BKEY_TYPE_REFLINK)) -+ -+enum btree_trigger_flags { -+ __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ -+ 
__BTREE_TRIGGER_NOOVERWRITES, /* Don't run triggers on overwrites */ -+ -+ __BTREE_TRIGGER_INSERT, -+ __BTREE_TRIGGER_OVERWRITE, -+ __BTREE_TRIGGER_OVERWRITE_SPLIT, -+ -+ __BTREE_TRIGGER_GC, -+ __BTREE_TRIGGER_BUCKET_INVALIDATE, -+ __BTREE_TRIGGER_ALLOC_READ, -+ __BTREE_TRIGGER_NOATOMIC, -+}; -+ -+#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) -+#define BTREE_TRIGGER_NOOVERWRITES (1U << __BTREE_TRIGGER_NOOVERWRITES) -+ -+#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT) -+#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE) -+#define BTREE_TRIGGER_OVERWRITE_SPLIT (1U << __BTREE_TRIGGER_OVERWRITE_SPLIT) -+ -+#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) -+#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) -+#define BTREE_TRIGGER_ALLOC_READ (1U << __BTREE_TRIGGER_ALLOC_READ) -+#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC) -+ -+static inline bool btree_node_type_needs_gc(enum btree_node_type type) -+{ -+ return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type); -+} -+ -+struct btree_root { -+ struct btree *b; -+ -+ /* On disk root - see async splits: */ -+ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); -+ u8 level; -+ u8 alive; -+ s8 error; -+}; -+ -+/* -+ * Optional hook that will be called just prior to a btree node update, when -+ * we're holding the write lock and we know what key is about to be overwritten: -+ */ -+ -+enum btree_insert_ret { -+ BTREE_INSERT_OK, -+ /* leaf node needs to be split */ -+ BTREE_INSERT_BTREE_NODE_FULL, -+ BTREE_INSERT_ENOSPC, -+ BTREE_INSERT_NEED_MARK_REPLICAS, -+ BTREE_INSERT_NEED_JOURNAL_RES, -+}; -+ -+enum btree_gc_coalesce_fail_reason { -+ BTREE_GC_COALESCE_FAIL_RESERVE_GET, -+ BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC, -+ BTREE_GC_COALESCE_FAIL_FORMAT_FITS, -+}; -+ -+enum btree_node_sibling { -+ btree_prev_sib, -+ btree_next_sib, -+}; -+ -+typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *, -+ struct btree *, -+ struct 
btree_node_iter *); -+ -+#endif /* _BCACHEFS_BTREE_TYPES_H */ -diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h -new file mode 100644 -index 000000000000..e0b1bde37484 ---- /dev/null -+++ b/fs/bcachefs/btree_update.h -@@ -0,0 +1,144 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BTREE_UPDATE_H -+#define _BCACHEFS_BTREE_UPDATE_H -+ -+#include "btree_iter.h" -+#include "journal.h" -+ -+struct bch_fs; -+struct btree; -+ -+void bch2_btree_node_lock_for_insert(struct bch_fs *, struct btree *, -+ struct btree_iter *); -+bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *, -+ struct btree_node_iter *, struct bkey_i *); -+void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); -+ -+enum btree_insert_flags { -+ __BTREE_INSERT_NOUNLOCK, -+ __BTREE_INSERT_NOFAIL, -+ __BTREE_INSERT_NOCHECK_RW, -+ __BTREE_INSERT_LAZY_RW, -+ __BTREE_INSERT_USE_RESERVE, -+ __BTREE_INSERT_USE_ALLOC_RESERVE, -+ __BTREE_INSERT_JOURNAL_REPLAY, -+ __BTREE_INSERT_JOURNAL_RESERVED, -+ __BTREE_INSERT_JOURNAL_RECLAIM, -+ __BTREE_INSERT_NOWAIT, -+ __BTREE_INSERT_GC_LOCK_HELD, -+ __BCH_HASH_SET_MUST_CREATE, -+ __BCH_HASH_SET_MUST_REPLACE, -+}; -+ -+/* -+ * Don't drop locks _after_ successfully updating btree: -+ */ -+#define BTREE_INSERT_NOUNLOCK (1 << __BTREE_INSERT_NOUNLOCK) -+ -+/* Don't check for -ENOSPC: */ -+#define BTREE_INSERT_NOFAIL (1 << __BTREE_INSERT_NOFAIL) -+ -+#define BTREE_INSERT_NOCHECK_RW (1 << __BTREE_INSERT_NOCHECK_RW) -+#define BTREE_INSERT_LAZY_RW (1 << __BTREE_INSERT_LAZY_RW) -+ -+/* for copygc, or when merging btree nodes */ -+#define BTREE_INSERT_USE_RESERVE (1 << __BTREE_INSERT_USE_RESERVE) -+#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << __BTREE_INSERT_USE_ALLOC_RESERVE) -+ -+/* Insert is for journal replay - don't get journal reservations: */ -+#define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY) -+ -+/* Indicates that we have pre-reserved space in the journal: */ -+#define 
BTREE_INSERT_JOURNAL_RESERVED (1 << __BTREE_INSERT_JOURNAL_RESERVED) -+ -+/* Insert is being called from journal reclaim path: */ -+#define BTREE_INSERT_JOURNAL_RECLAIM (1 << __BTREE_INSERT_JOURNAL_RECLAIM) -+ -+/* Don't block on allocation failure (for new btree nodes: */ -+#define BTREE_INSERT_NOWAIT (1 << __BTREE_INSERT_NOWAIT) -+#define BTREE_INSERT_GC_LOCK_HELD (1 << __BTREE_INSERT_GC_LOCK_HELD) -+ -+#define BCH_HASH_SET_MUST_CREATE (1 << __BCH_HASH_SET_MUST_CREATE) -+#define BCH_HASH_SET_MUST_REPLACE (1 << __BCH_HASH_SET_MUST_REPLACE) -+ -+int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); -+ -+int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *); -+int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, -+ struct disk_reservation *, u64 *, int flags); -+ -+int bch2_btree_delete_at_range(struct btree_trans *, struct btree_iter *, -+ struct bpos, u64 *); -+int bch2_btree_delete_range(struct bch_fs *, enum btree_id, -+ struct bpos, struct bpos, u64 *); -+ -+int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *, -+ __le64, unsigned); -+int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *, -+ struct btree *, struct bkey_i *); -+ -+int bch2_trans_update(struct btree_trans *, struct btree_iter *, -+ struct bkey_i *, enum btree_trigger_flags); -+int __bch2_trans_commit(struct btree_trans *); -+ -+/** -+ * bch2_trans_commit - insert keys at given iterator positions -+ * -+ * This is main entry point for btree updates. -+ * -+ * Return values: -+ * -EINTR: locking changed, this function should be called again. 
-+ * -EROFS: filesystem read only -+ * -EIO: journal or btree node IO error -+ */ -+static inline int bch2_trans_commit(struct btree_trans *trans, -+ struct disk_reservation *disk_res, -+ u64 *journal_seq, -+ unsigned flags) -+{ -+ trans->disk_res = disk_res; -+ trans->journal_seq = journal_seq; -+ trans->flags = flags; -+ -+ return __bch2_trans_commit(trans); -+} -+ -+#define __bch2_trans_do(_trans, _disk_res, _journal_seq, _flags, _do) \ -+({ \ -+ int _ret; \ -+ \ -+ while (1) { \ -+ _ret = (_do) ?: bch2_trans_commit(_trans, (_disk_res), \ -+ (_journal_seq), (_flags)); \ -+ if (_ret != -EINTR) \ -+ break; \ -+ bch2_trans_reset(_trans, 0); \ -+ } \ -+ \ -+ _ret; \ -+}) -+ -+#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \ -+({ \ -+ struct btree_trans trans; \ -+ int _ret, _ret2; \ -+ \ -+ bch2_trans_init(&trans, (_c), 0, 0); \ -+ _ret = __bch2_trans_do(&trans, _disk_res, _journal_seq, _flags, \ -+ _do); \ -+ _ret2 = bch2_trans_exit(&trans); \ -+ \ -+ _ret ?: _ret2; \ -+}) -+ -+#define trans_for_each_update(_trans, _i) \ -+ for ((_i) = (_trans)->updates; \ -+ (_i) < (_trans)->updates + (_trans)->nr_updates; \ -+ (_i)++) -+ -+#define trans_for_each_update2(_trans, _i) \ -+ for ((_i) = (_trans)->updates2; \ -+ (_i) < (_trans)->updates2 + (_trans)->nr_updates2; \ -+ (_i)++) -+ -+#endif /* _BCACHEFS_BTREE_UPDATE_H */ -diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c -new file mode 100644 -index 000000000000..b41916f93c9b ---- /dev/null -+++ b/fs/bcachefs/btree_update_interior.c -@@ -0,0 +1,2076 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "alloc_foreground.h" -+#include "bkey_methods.h" -+#include "btree_cache.h" -+#include "btree_gc.h" -+#include "btree_update.h" -+#include "btree_update_interior.h" -+#include "btree_io.h" -+#include "btree_iter.h" -+#include "btree_locking.h" -+#include "buckets.h" -+#include "extents.h" -+#include "journal.h" -+#include "journal_reclaim.h" 
-+#include "keylist.h" -+#include "replicas.h" -+#include "super-io.h" -+ -+#include -+#include -+ -+/* Debug code: */ -+ -+/* -+ * Verify that child nodes correctly span parent node's range: -+ */ -+static void btree_node_interior_verify(struct btree *b) -+{ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ struct bpos next_node = b->data->min_key; -+ struct btree_node_iter iter; -+ struct bkey_s_c k; -+ struct bkey_s_c_btree_ptr_v2 bp; -+ struct bkey unpacked; -+ -+ BUG_ON(!b->c.level); -+ -+ bch2_btree_node_iter_init_from_start(&iter, b); -+ -+ while (1) { -+ k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked); -+ if (k.k->type != KEY_TYPE_btree_ptr_v2) -+ break; -+ bp = bkey_s_c_to_btree_ptr_v2(k); -+ -+ BUG_ON(bkey_cmp(next_node, bp.v->min_key)); -+ -+ bch2_btree_node_iter_advance(&iter, b); -+ -+ if (bch2_btree_node_iter_end(&iter)) { -+ BUG_ON(bkey_cmp(k.k->p, b->key.k.p)); -+ break; -+ } -+ -+ next_node = bkey_successor(k.k->p); -+ } -+#endif -+} -+ -+/* Calculate ideal packed bkey format for new btree nodes: */ -+ -+void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b) -+{ -+ struct bkey_packed *k; -+ struct bset_tree *t; -+ struct bkey uk; -+ -+ bch2_bkey_format_add_pos(s, b->data->min_key); -+ -+ for_each_bset(b, t) -+ bset_tree_for_each_key(b, t, k) -+ if (!bkey_whiteout(k)) { -+ uk = bkey_unpack_key(b, k); -+ bch2_bkey_format_add_key(s, &uk); -+ } -+} -+ -+static struct bkey_format bch2_btree_calc_format(struct btree *b) -+{ -+ struct bkey_format_state s; -+ -+ bch2_bkey_format_init(&s); -+ __bch2_btree_calc_format(&s, b); -+ -+ return bch2_bkey_format_done(&s); -+} -+ -+static size_t btree_node_u64s_with_format(struct btree *b, -+ struct bkey_format *new_f) -+{ -+ struct bkey_format *old_f = &b->format; -+ -+ /* stupid integer promotion rules */ -+ ssize_t delta = -+ (((int) new_f->key_u64s - old_f->key_u64s) * -+ (int) b->nr.packed_keys) + -+ (((int) new_f->key_u64s - BKEY_U64s) * -+ (int) b->nr.unpacked_keys); -+ -+ BUG_ON(delta + 
b->nr.live_u64s < 0); -+ -+ return b->nr.live_u64s + delta; -+} -+ -+/** -+ * btree_node_format_fits - check if we could rewrite node with a new format -+ * -+ * This assumes all keys can pack with the new format -- it just checks if -+ * the re-packed keys would fit inside the node itself. -+ */ -+bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b, -+ struct bkey_format *new_f) -+{ -+ size_t u64s = btree_node_u64s_with_format(b, new_f); -+ -+ return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c); -+} -+ -+/* Btree node freeing/allocation: */ -+ -+static void __btree_node_free(struct bch_fs *c, struct btree *b) -+{ -+ trace_btree_node_free(c, b); -+ -+ BUG_ON(btree_node_dirty(b)); -+ BUG_ON(btree_node_need_write(b)); -+ BUG_ON(b == btree_node_root(c, b)); -+ BUG_ON(b->ob.nr); -+ BUG_ON(!list_empty(&b->write_blocked)); -+ BUG_ON(b->will_make_reachable); -+ -+ clear_btree_node_noevict(b); -+ -+ bch2_btree_node_hash_remove(&c->btree_cache, b); -+ -+ six_lock_wakeup_all(&b->c.lock); -+ -+ mutex_lock(&c->btree_cache.lock); -+ list_move(&b->list, &c->btree_cache.freeable); -+ mutex_unlock(&c->btree_cache.lock); -+} -+ -+void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b) -+{ -+ struct open_buckets ob = b->ob; -+ -+ b->ob.nr = 0; -+ -+ clear_btree_node_dirty(b); -+ -+ btree_node_lock_type(c, b, SIX_LOCK_write); -+ __btree_node_free(c, b); -+ six_unlock_write(&b->c.lock); -+ -+ bch2_open_buckets_put(c, &ob); -+} -+ -+void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b, -+ struct btree_iter *iter) -+{ -+ struct btree_iter *linked; -+ -+ trans_for_each_iter(iter->trans, linked) -+ BUG_ON(linked->l[b->c.level].b == b); -+ -+ six_lock_write(&b->c.lock, NULL, NULL); -+ __btree_node_free(c, b); -+ six_unlock_write(&b->c.lock); -+ six_unlock_intent(&b->c.lock); -+} -+ -+static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, -+ struct disk_reservation *res, -+ struct closure *cl, -+ unsigned flags) -+{ -+ 
struct write_point *wp; -+ struct btree *b; -+ BKEY_PADDED(k) tmp; -+ struct open_buckets ob = { .nr = 0 }; -+ struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; -+ unsigned nr_reserve; -+ enum alloc_reserve alloc_reserve; -+ -+ if (flags & BTREE_INSERT_USE_ALLOC_RESERVE) { -+ nr_reserve = 0; -+ alloc_reserve = RESERVE_ALLOC; -+ } else if (flags & BTREE_INSERT_USE_RESERVE) { -+ nr_reserve = BTREE_NODE_RESERVE / 2; -+ alloc_reserve = RESERVE_BTREE; -+ } else { -+ nr_reserve = BTREE_NODE_RESERVE; -+ alloc_reserve = RESERVE_NONE; -+ } -+ -+ mutex_lock(&c->btree_reserve_cache_lock); -+ if (c->btree_reserve_cache_nr > nr_reserve) { -+ struct btree_alloc *a = -+ &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; -+ -+ ob = a->ob; -+ bkey_copy(&tmp.k, &a->k); -+ mutex_unlock(&c->btree_reserve_cache_lock); -+ goto mem_alloc; -+ } -+ mutex_unlock(&c->btree_reserve_cache_lock); -+ -+retry: -+ wp = bch2_alloc_sectors_start(c, c->opts.foreground_target, 0, -+ writepoint_ptr(&c->btree_write_point), -+ &devs_have, -+ res->nr_replicas, -+ c->opts.metadata_replicas_required, -+ alloc_reserve, 0, cl); -+ if (IS_ERR(wp)) -+ return ERR_CAST(wp); -+ -+ if (wp->sectors_free < c->opts.btree_node_size) { -+ struct open_bucket *ob; -+ unsigned i; -+ -+ open_bucket_for_each(c, &wp->ptrs, ob, i) -+ if (ob->sectors_free < c->opts.btree_node_size) -+ ob->sectors_free = 0; -+ -+ bch2_alloc_sectors_done(c, wp); -+ goto retry; -+ } -+ -+ if (c->sb.features & (1ULL << BCH_FEATURE_btree_ptr_v2)) -+ bkey_btree_ptr_v2_init(&tmp.k); -+ else -+ bkey_btree_ptr_init(&tmp.k); -+ -+ bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, c->opts.btree_node_size); -+ -+ bch2_open_bucket_get(c, wp, &ob); -+ bch2_alloc_sectors_done(c, wp); -+mem_alloc: -+ b = bch2_btree_node_mem_alloc(c); -+ -+ /* we hold cannibalize_lock: */ -+ BUG_ON(IS_ERR(b)); -+ BUG_ON(b->ob.nr); -+ -+ bkey_copy(&b->key, &tmp.k); -+ b->ob = ob; -+ -+ return b; -+} -+ -+static struct btree *bch2_btree_node_alloc(struct btree_update 
*as, unsigned level) -+{ -+ struct bch_fs *c = as->c; -+ struct btree *b; -+ int ret; -+ -+ BUG_ON(level >= BTREE_MAX_DEPTH); -+ BUG_ON(!as->nr_prealloc_nodes); -+ -+ b = as->prealloc_nodes[--as->nr_prealloc_nodes]; -+ -+ set_btree_node_accessed(b); -+ set_btree_node_dirty(b); -+ set_btree_node_need_write(b); -+ -+ bch2_bset_init_first(b, &b->data->keys); -+ b->c.level = level; -+ b->c.btree_id = as->btree_id; -+ -+ memset(&b->nr, 0, sizeof(b->nr)); -+ b->data->magic = cpu_to_le64(bset_magic(c)); -+ b->data->flags = 0; -+ SET_BTREE_NODE_ID(b->data, as->btree_id); -+ SET_BTREE_NODE_LEVEL(b->data, level); -+ b->data->ptr = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)).start->ptr; -+ -+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { -+ struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key); -+ -+ bp->v.mem_ptr = 0; -+ bp->v.seq = b->data->keys.seq; -+ bp->v.sectors_written = 0; -+ bp->v.sectors = cpu_to_le16(c->opts.btree_node_size); -+ } -+ -+ if (c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite)) -+ SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true); -+ -+ if (btree_node_is_extents(b) && -+ !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) { -+ set_btree_node_old_extent_overwrite(b); -+ set_btree_node_need_rewrite(b); -+ } -+ -+ bch2_btree_build_aux_trees(b); -+ -+ ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id); -+ BUG_ON(ret); -+ -+ trace_btree_node_alloc(c, b); -+ return b; -+} -+ -+static void btree_set_min(struct btree *b, struct bpos pos) -+{ -+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) -+ bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos; -+ b->data->min_key = pos; -+} -+ -+static void btree_set_max(struct btree *b, struct bpos pos) -+{ -+ b->key.k.p = pos; -+ b->data->max_key = pos; -+} -+ -+struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as, -+ struct btree *b, -+ struct bkey_format format) -+{ -+ struct btree *n; -+ -+ n = bch2_btree_node_alloc(as, b->c.level); -+ -+ SET_BTREE_NODE_SEQ(n->data, 
BTREE_NODE_SEQ(b->data) + 1); -+ -+ btree_set_min(n, b->data->min_key); -+ btree_set_max(n, b->data->max_key); -+ -+ n->data->format = format; -+ btree_node_set_format(n, format); -+ -+ bch2_btree_sort_into(as->c, n, b); -+ -+ btree_node_reset_sib_u64s(n); -+ -+ n->key.k.p = b->key.k.p; -+ return n; -+} -+ -+static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as, -+ struct btree *b) -+{ -+ struct bkey_format new_f = bch2_btree_calc_format(b); -+ -+ /* -+ * The keys might expand with the new format - if they wouldn't fit in -+ * the btree node anymore, use the old format for now: -+ */ -+ if (!bch2_btree_node_format_fits(as->c, b, &new_f)) -+ new_f = b->format; -+ -+ return __bch2_btree_node_alloc_replacement(as, b, new_f); -+} -+ -+static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level) -+{ -+ struct btree *b = bch2_btree_node_alloc(as, level); -+ -+ btree_set_min(b, POS_MIN); -+ btree_set_max(b, POS_MAX); -+ b->data->format = bch2_btree_calc_format(b); -+ -+ btree_node_set_format(b, b->data->format); -+ bch2_btree_build_aux_trees(b); -+ -+ bch2_btree_update_add_new_node(as, b); -+ six_unlock_write(&b->c.lock); -+ -+ return b; -+} -+ -+static void bch2_btree_reserve_put(struct btree_update *as) -+{ -+ struct bch_fs *c = as->c; -+ -+ mutex_lock(&c->btree_reserve_cache_lock); -+ -+ while (as->nr_prealloc_nodes) { -+ struct btree *b = as->prealloc_nodes[--as->nr_prealloc_nodes]; -+ -+ six_unlock_write(&b->c.lock); -+ -+ if (c->btree_reserve_cache_nr < -+ ARRAY_SIZE(c->btree_reserve_cache)) { -+ struct btree_alloc *a = -+ &c->btree_reserve_cache[c->btree_reserve_cache_nr++]; -+ -+ a->ob = b->ob; -+ b->ob.nr = 0; -+ bkey_copy(&a->k, &b->key); -+ } else { -+ bch2_open_buckets_put(c, &b->ob); -+ } -+ -+ btree_node_lock_type(c, b, SIX_LOCK_write); -+ __btree_node_free(c, b); -+ six_unlock_write(&b->c.lock); -+ -+ six_unlock_intent(&b->c.lock); -+ } -+ -+ mutex_unlock(&c->btree_reserve_cache_lock); -+} -+ -+static int 
bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes, -+ unsigned flags, struct closure *cl) -+{ -+ struct bch_fs *c = as->c; -+ struct btree *b; -+ int ret; -+ -+ BUG_ON(nr_nodes > BTREE_RESERVE_MAX); -+ -+ /* -+ * Protects reaping from the btree node cache and using the btree node -+ * open bucket reserve: -+ */ -+ ret = bch2_btree_cache_cannibalize_lock(c, cl); -+ if (ret) -+ return ret; -+ -+ while (as->nr_prealloc_nodes < nr_nodes) { -+ b = __bch2_btree_node_alloc(c, &as->disk_res, -+ flags & BTREE_INSERT_NOWAIT -+ ? NULL : cl, flags); -+ if (IS_ERR(b)) { -+ ret = PTR_ERR(b); -+ goto err_free; -+ } -+ -+ ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&b->key)); -+ if (ret) -+ goto err_free; -+ -+ as->prealloc_nodes[as->nr_prealloc_nodes++] = b; -+ } -+ -+ bch2_btree_cache_cannibalize_unlock(c); -+ return 0; -+err_free: -+ bch2_btree_cache_cannibalize_unlock(c); -+ trace_btree_reserve_get_fail(c, nr_nodes, cl); -+ return ret; -+} -+ -+/* Asynchronous interior node update machinery */ -+ -+static void bch2_btree_update_free(struct btree_update *as) -+{ -+ struct bch_fs *c = as->c; -+ -+ bch2_journal_preres_put(&c->journal, &as->journal_preres); -+ -+ bch2_journal_pin_drop(&c->journal, &as->journal); -+ bch2_journal_pin_flush(&c->journal, &as->journal); -+ bch2_disk_reservation_put(c, &as->disk_res); -+ bch2_btree_reserve_put(as); -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ list_del(&as->unwritten_list); -+ list_del(&as->list); -+ mutex_unlock(&c->btree_interior_update_lock); -+ -+ closure_debug_destroy(&as->cl); -+ mempool_free(as, &c->btree_interior_update_pool); -+ -+ closure_wake_up(&c->btree_interior_update_wait); -+} -+ -+static void btree_update_will_delete_key(struct btree_update *as, -+ struct bkey_i *k) -+{ -+ BUG_ON(bch2_keylist_u64s(&as->old_keys) + k->k.u64s > -+ ARRAY_SIZE(as->_old_keys)); -+ bch2_keylist_add(&as->old_keys, k); -+} -+ -+static void btree_update_will_add_key(struct btree_update *as, -+ struct bkey_i *k) -+{ -+ 
BUG_ON(bch2_keylist_u64s(&as->new_keys) + k->k.u64s > -+ ARRAY_SIZE(as->_new_keys)); -+ bch2_keylist_add(&as->new_keys, k); -+} -+ -+/* -+ * The transactional part of an interior btree node update, where we journal the -+ * update we did to the interior node and update alloc info: -+ */ -+static int btree_update_nodes_written_trans(struct btree_trans *trans, -+ struct btree_update *as) -+{ -+ struct bkey_i *k; -+ int ret; -+ -+ trans->extra_journal_entries = (void *) &as->journal_entries[0]; -+ trans->extra_journal_entry_u64s = as->journal_u64s; -+ trans->journal_pin = &as->journal; -+ -+ for_each_keylist_key(&as->new_keys, k) { -+ ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(k), -+ 0, 0, BTREE_TRIGGER_INSERT); -+ if (ret) -+ return ret; -+ } -+ -+ for_each_keylist_key(&as->old_keys, k) { -+ ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(k), -+ 0, 0, BTREE_TRIGGER_OVERWRITE); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+static void btree_update_nodes_written(struct btree_update *as) -+{ -+ struct bch_fs *c = as->c; -+ struct btree *b = as->b; -+ u64 journal_seq = 0; -+ unsigned i; -+ int ret; -+ -+ /* -+ * We did an update to a parent node where the pointers we added pointed -+ * to child nodes that weren't written yet: now, the child nodes have -+ * been written so we can write out the update to the interior node. -+ */ -+ -+ /* -+ * We can't call into journal reclaim here: we'd block on the journal -+ * reclaim lock, but we may need to release the open buckets we have -+ * pinned in order for other btree updates to make forward progress, and -+ * journal reclaim does btree updates when flushing bkey_cached entries, -+ * which may require allocations as well. 
-+ */ -+ ret = bch2_trans_do(c, &as->disk_res, &journal_seq, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_USE_RESERVE| -+ BTREE_INSERT_USE_ALLOC_RESERVE| -+ BTREE_INSERT_NOCHECK_RW| -+ BTREE_INSERT_JOURNAL_RECLAIM| -+ BTREE_INSERT_JOURNAL_RESERVED, -+ btree_update_nodes_written_trans(&trans, as)); -+ BUG_ON(ret && !bch2_journal_error(&c->journal)); -+ -+ if (b) { -+ /* -+ * @b is the node we did the final insert into: -+ * -+ * On failure to get a journal reservation, we still have to -+ * unblock the write and allow most of the write path to happen -+ * so that shutdown works, but the i->journal_seq mechanism -+ * won't work to prevent the btree write from being visible (we -+ * didn't get a journal sequence number) - instead -+ * __bch2_btree_node_write() doesn't do the actual write if -+ * we're in journal error state: -+ */ -+ -+ btree_node_lock_type(c, b, SIX_LOCK_intent); -+ btree_node_lock_type(c, b, SIX_LOCK_write); -+ mutex_lock(&c->btree_interior_update_lock); -+ -+ list_del(&as->write_blocked_list); -+ -+ if (!ret && as->b == b) { -+ struct bset *i = btree_bset_last(b); -+ -+ BUG_ON(!b->c.level); -+ BUG_ON(!btree_node_dirty(b)); -+ -+ i->journal_seq = cpu_to_le64( -+ max(journal_seq, -+ le64_to_cpu(i->journal_seq))); -+ -+ bch2_btree_add_journal_pin(c, b, journal_seq); -+ } -+ -+ mutex_unlock(&c->btree_interior_update_lock); -+ six_unlock_write(&b->c.lock); -+ -+ btree_node_write_if_need(c, b, SIX_LOCK_intent); -+ six_unlock_intent(&b->c.lock); -+ } -+ -+ bch2_journal_pin_drop(&c->journal, &as->journal); -+ -+ bch2_journal_preres_put(&c->journal, &as->journal_preres); -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ for (i = 0; i < as->nr_new_nodes; i++) { -+ b = as->new_nodes[i]; -+ -+ BUG_ON(b->will_make_reachable != (unsigned long) as); -+ b->will_make_reachable = 0; -+ } -+ mutex_unlock(&c->btree_interior_update_lock); -+ -+ for (i = 0; i < as->nr_new_nodes; i++) { -+ b = as->new_nodes[i]; -+ -+ btree_node_lock_type(c, b, SIX_LOCK_read); -+ 
btree_node_write_if_need(c, b, SIX_LOCK_read); -+ six_unlock_read(&b->c.lock); -+ } -+ -+ for (i = 0; i < as->nr_open_buckets; i++) -+ bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]); -+ -+ bch2_btree_update_free(as); -+} -+ -+static void btree_interior_update_work(struct work_struct *work) -+{ -+ struct bch_fs *c = -+ container_of(work, struct bch_fs, btree_interior_update_work); -+ struct btree_update *as; -+ -+ while (1) { -+ mutex_lock(&c->btree_interior_update_lock); -+ as = list_first_entry_or_null(&c->btree_interior_updates_unwritten, -+ struct btree_update, unwritten_list); -+ if (as && !as->nodes_written) -+ as = NULL; -+ mutex_unlock(&c->btree_interior_update_lock); -+ -+ if (!as) -+ break; -+ -+ btree_update_nodes_written(as); -+ } -+} -+ -+static void btree_update_set_nodes_written(struct closure *cl) -+{ -+ struct btree_update *as = container_of(cl, struct btree_update, cl); -+ struct bch_fs *c = as->c; -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ as->nodes_written = true; -+ mutex_unlock(&c->btree_interior_update_lock); -+ -+ queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work); -+} -+ -+/* -+ * We're updating @b with pointers to nodes that haven't finished writing yet: -+ * block @b from being written until @as completes -+ */ -+static void btree_update_updated_node(struct btree_update *as, struct btree *b) -+{ -+ struct bch_fs *c = as->c; -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); -+ -+ BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); -+ BUG_ON(!btree_node_dirty(b)); -+ -+ as->mode = BTREE_INTERIOR_UPDATING_NODE; -+ as->b = b; -+ list_add(&as->write_blocked_list, &b->write_blocked); -+ -+ mutex_unlock(&c->btree_interior_update_lock); -+} -+ -+static void btree_update_reparent(struct btree_update *as, -+ struct btree_update *child) -+{ -+ struct bch_fs *c = as->c; -+ -+ lockdep_assert_held(&c->btree_interior_update_lock); -+ 
-+ child->b = NULL; -+ child->mode = BTREE_INTERIOR_UPDATING_AS; -+ -+ /* -+ * When we write a new btree root, we have to drop our journal pin -+ * _before_ the new nodes are technically reachable; see -+ * btree_update_nodes_written(). -+ * -+ * This goes for journal pins that are recursively blocked on us - so, -+ * just transfer the journal pin to the new interior update so -+ * btree_update_nodes_written() can drop it. -+ */ -+ bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL); -+ bch2_journal_pin_drop(&c->journal, &child->journal); -+} -+ -+static void btree_update_updated_root(struct btree_update *as, struct btree *b) -+{ -+ struct bkey_i *insert = &b->key; -+ struct bch_fs *c = as->c; -+ -+ BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); -+ -+ BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > -+ ARRAY_SIZE(as->journal_entries)); -+ -+ as->journal_u64s += -+ journal_entry_set((void *) &as->journal_entries[as->journal_u64s], -+ BCH_JSET_ENTRY_btree_root, -+ b->c.btree_id, b->c.level, -+ insert, insert->k.u64s); -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); -+ -+ as->mode = BTREE_INTERIOR_UPDATING_ROOT; -+ mutex_unlock(&c->btree_interior_update_lock); -+} -+ -+/* -+ * bch2_btree_update_add_new_node: -+ * -+ * This causes @as to wait on @b to be written, before it gets to -+ * bch2_btree_update_nodes_written -+ * -+ * Additionally, it sets b->will_make_reachable to prevent any additional writes -+ * to @b from happening besides the first until @b is reachable on disk -+ * -+ * And it adds @b to the list of @as's new nodes, so that we can update sector -+ * counts in bch2_btree_update_nodes_written: -+ */ -+void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b) -+{ -+ struct bch_fs *c = as->c; -+ -+ closure_get(&as->cl); -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes)); -+ 
BUG_ON(b->will_make_reachable); -+ -+ as->new_nodes[as->nr_new_nodes++] = b; -+ b->will_make_reachable = 1UL|(unsigned long) as; -+ -+ mutex_unlock(&c->btree_interior_update_lock); -+ -+ btree_update_will_add_key(as, &b->key); -+} -+ -+/* -+ * returns true if @b was a new node -+ */ -+static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b) -+{ -+ struct btree_update *as; -+ unsigned long v; -+ unsigned i; -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ /* -+ * When b->will_make_reachable != 0, it owns a ref on as->cl that's -+ * dropped when it gets written by bch2_btree_complete_write - the -+ * xchg() is for synchronization with bch2_btree_complete_write: -+ */ -+ v = xchg(&b->will_make_reachable, 0); -+ as = (struct btree_update *) (v & ~1UL); -+ -+ if (!as) { -+ mutex_unlock(&c->btree_interior_update_lock); -+ return; -+ } -+ -+ for (i = 0; i < as->nr_new_nodes; i++) -+ if (as->new_nodes[i] == b) -+ goto found; -+ -+ BUG(); -+found: -+ array_remove_item(as->new_nodes, as->nr_new_nodes, i); -+ mutex_unlock(&c->btree_interior_update_lock); -+ -+ if (v & 1) -+ closure_put(&as->cl); -+} -+ -+void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b) -+{ -+ while (b->ob.nr) -+ as->open_buckets[as->nr_open_buckets++] = -+ b->ob.v[--b->ob.nr]; -+} -+ -+/* -+ * @b is being split/rewritten: it may have pointers to not-yet-written btree -+ * nodes and thus outstanding btree_updates - redirect @b's -+ * btree_updates to point to this btree_update: -+ */ -+void bch2_btree_interior_update_will_free_node(struct btree_update *as, -+ struct btree *b) -+{ -+ struct bch_fs *c = as->c; -+ struct btree_update *p, *n; -+ struct btree_write *w; -+ -+ set_btree_node_dying(b); -+ -+ if (btree_node_fake(b)) -+ return; -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ -+ /* -+ * Does this node have any btree_update operations preventing -+ * it from being written? 
-+ * -+ * If so, redirect them to point to this btree_update: we can -+ * write out our new nodes, but we won't make them visible until those -+ * operations complete -+ */ -+ list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) { -+ list_del_init(&p->write_blocked_list); -+ btree_update_reparent(as, p); -+ -+ /* -+ * for flush_held_btree_writes() waiting on updates to flush or -+ * nodes to be writeable: -+ */ -+ closure_wake_up(&c->btree_interior_update_wait); -+ } -+ -+ clear_btree_node_dirty(b); -+ clear_btree_node_need_write(b); -+ -+ /* -+ * Does this node have unwritten data that has a pin on the journal? -+ * -+ * If so, transfer that pin to the btree_update operation - -+ * note that if we're freeing multiple nodes, we only need to keep the -+ * oldest pin of any of the nodes we're freeing. We'll release the pin -+ * when the new nodes are persistent and reachable on disk: -+ */ -+ w = btree_current_write(b); -+ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL); -+ bch2_journal_pin_drop(&c->journal, &w->journal); -+ -+ w = btree_prev_write(b); -+ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL); -+ bch2_journal_pin_drop(&c->journal, &w->journal); -+ -+ mutex_unlock(&c->btree_interior_update_lock); -+ -+ /* -+ * Is this a node that isn't reachable on disk yet? 
-+ * -+ * Nodes that aren't reachable yet have writes blocked until they're -+ * reachable - now that we've cancelled any pending writes and moved -+ * things waiting on that write to wait on this update, we can drop this -+ * node from the list of nodes that the other update is making -+ * reachable, prior to freeing it: -+ */ -+ btree_update_drop_new_node(c, b); -+ -+ btree_update_will_delete_key(as, &b->key); -+} -+ -+void bch2_btree_update_done(struct btree_update *as) -+{ -+ BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE); -+ -+ bch2_btree_reserve_put(as); -+ -+ continue_at(&as->cl, btree_update_set_nodes_written, system_freezable_wq); -+} -+ -+struct btree_update * -+bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, -+ unsigned nr_nodes, unsigned flags, -+ struct closure *cl) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_update *as; -+ int disk_res_flags = (flags & BTREE_INSERT_NOFAIL) -+ ? BCH_DISK_RESERVATION_NOFAIL : 0; -+ int journal_flags = (flags & BTREE_INSERT_JOURNAL_RESERVED) -+ ? 
JOURNAL_RES_GET_RECLAIM : 0; -+ int ret = 0; -+ -+ /* -+ * This check isn't necessary for correctness - it's just to potentially -+ * prevent us from doing a lot of work that'll end up being wasted: -+ */ -+ ret = bch2_journal_error(&c->journal); -+ if (ret) -+ return ERR_PTR(ret); -+ -+ as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO); -+ memset(as, 0, sizeof(*as)); -+ closure_init(&as->cl, NULL); -+ as->c = c; -+ as->mode = BTREE_INTERIOR_NO_UPDATE; -+ as->btree_id = id; -+ INIT_LIST_HEAD(&as->list); -+ INIT_LIST_HEAD(&as->unwritten_list); -+ INIT_LIST_HEAD(&as->write_blocked_list); -+ bch2_keylist_init(&as->old_keys, as->_old_keys); -+ bch2_keylist_init(&as->new_keys, as->_new_keys); -+ bch2_keylist_init(&as->parent_keys, as->inline_keys); -+ -+ ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, -+ BTREE_UPDATE_JOURNAL_RES, -+ journal_flags|JOURNAL_RES_GET_NONBLOCK); -+ if (ret == -EAGAIN) { -+ if (flags & BTREE_INSERT_NOUNLOCK) -+ return ERR_PTR(-EINTR); -+ -+ bch2_trans_unlock(trans); -+ -+ ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, -+ BTREE_UPDATE_JOURNAL_RES, -+ journal_flags); -+ if (ret) -+ return ERR_PTR(ret); -+ -+ if (!bch2_trans_relock(trans)) { -+ ret = -EINTR; -+ goto err; -+ } -+ } -+ -+ ret = bch2_disk_reservation_get(c, &as->disk_res, -+ nr_nodes * c->opts.btree_node_size, -+ c->opts.metadata_replicas, -+ disk_res_flags); -+ if (ret) -+ goto err; -+ -+ ret = bch2_btree_reserve_get(as, nr_nodes, flags, cl); -+ if (ret) -+ goto err; -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ list_add_tail(&as->list, &c->btree_interior_update_list); -+ mutex_unlock(&c->btree_interior_update_lock); -+ -+ return as; -+err: -+ bch2_btree_update_free(as); -+ return ERR_PTR(ret); -+} -+ -+/* Btree root updates: */ -+ -+static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) -+{ -+ /* Root nodes cannot be reaped */ -+ mutex_lock(&c->btree_cache.lock); -+ list_del_init(&b->list); -+ 
mutex_unlock(&c->btree_cache.lock); -+ -+ mutex_lock(&c->btree_root_lock); -+ BUG_ON(btree_node_root(c, b) && -+ (b->c.level < btree_node_root(c, b)->c.level || -+ !btree_node_dying(btree_node_root(c, b)))); -+ -+ btree_node_root(c, b) = b; -+ mutex_unlock(&c->btree_root_lock); -+ -+ bch2_recalc_btree_reserve(c); -+} -+ -+/** -+ * bch_btree_set_root - update the root in memory and on disk -+ * -+ * To ensure forward progress, the current task must not be holding any -+ * btree node write locks. However, you must hold an intent lock on the -+ * old root. -+ * -+ * Note: This allocates a journal entry but doesn't add any keys to -+ * it. All the btree roots are part of every journal write, so there -+ * is nothing new to be done. This just guarantees that there is a -+ * journal write. -+ */ -+static void bch2_btree_set_root(struct btree_update *as, struct btree *b, -+ struct btree_iter *iter) -+{ -+ struct bch_fs *c = as->c; -+ struct btree *old; -+ -+ trace_btree_set_root(c, b); -+ BUG_ON(!b->written && -+ !test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)); -+ -+ old = btree_node_root(c, b); -+ -+ /* -+ * Ensure no one is using the old root while we switch to the -+ * new root: -+ */ -+ bch2_btree_node_lock_write(old, iter); -+ -+ bch2_btree_set_root_inmem(c, b); -+ -+ btree_update_updated_root(as, b); -+ -+ /* -+ * Unlock old root after new root is visible: -+ * -+ * The new root isn't persistent, but that's ok: we still have -+ * an intent lock on the new root, and any updates that would -+ * depend on the new root would have to update the new root. 
-+ */ -+ bch2_btree_node_unlock_write(old, iter); -+} -+ -+/* Interior node updates: */ -+ -+static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b, -+ struct btree_iter *iter, -+ struct bkey_i *insert, -+ struct btree_node_iter *node_iter) -+{ -+ struct bkey_packed *k; -+ -+ BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > -+ ARRAY_SIZE(as->journal_entries)); -+ -+ as->journal_u64s += -+ journal_entry_set((void *) &as->journal_entries[as->journal_u64s], -+ BCH_JSET_ENTRY_btree_keys, -+ b->c.btree_id, b->c.level, -+ insert, insert->k.u64s); -+ -+ while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && -+ bkey_iter_pos_cmp(b, k, &insert->k.p) < 0) -+ bch2_btree_node_iter_advance(node_iter, b); -+ -+ bch2_btree_bset_insert_key(iter, b, node_iter, insert); -+ set_btree_node_dirty(b); -+ set_btree_node_need_write(b); -+} -+ -+/* -+ * Move keys from n1 (original replacement node, now lower node) to n2 (higher -+ * node) -+ */ -+static struct btree *__btree_split_node(struct btree_update *as, -+ struct btree *n1, -+ struct btree_iter *iter) -+{ -+ size_t nr_packed = 0, nr_unpacked = 0; -+ struct btree *n2; -+ struct bset *set1, *set2; -+ struct bkey_packed *k, *prev = NULL; -+ -+ n2 = bch2_btree_node_alloc(as, n1->c.level); -+ bch2_btree_update_add_new_node(as, n2); -+ -+ n2->data->max_key = n1->data->max_key; -+ n2->data->format = n1->format; -+ SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data)); -+ n2->key.k.p = n1->key.k.p; -+ -+ btree_node_set_format(n2, n2->data->format); -+ -+ set1 = btree_bset_first(n1); -+ set2 = btree_bset_first(n2); -+ -+ /* -+ * Has to be a linear search because we don't have an auxiliary -+ * search tree yet -+ */ -+ k = set1->start; -+ while (1) { -+ struct bkey_packed *n = bkey_next_skip_noops(k, vstruct_last(set1)); -+ -+ if (n == vstruct_last(set1)) -+ break; -+ if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5) -+ break; -+ -+ if (bkey_packed(k)) -+ nr_packed++; -+ else -+ 
nr_unpacked++; -+ -+ prev = k; -+ k = n; -+ } -+ -+ BUG_ON(!prev); -+ -+ btree_set_max(n1, bkey_unpack_pos(n1, prev)); -+ btree_set_min(n2, bkey_successor(n1->key.k.p)); -+ -+ set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k); -+ set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s)); -+ -+ set_btree_bset_end(n1, n1->set); -+ set_btree_bset_end(n2, n2->set); -+ -+ n2->nr.live_u64s = le16_to_cpu(set2->u64s); -+ n2->nr.bset_u64s[0] = le16_to_cpu(set2->u64s); -+ n2->nr.packed_keys = n1->nr.packed_keys - nr_packed; -+ n2->nr.unpacked_keys = n1->nr.unpacked_keys - nr_unpacked; -+ -+ n1->nr.live_u64s = le16_to_cpu(set1->u64s); -+ n1->nr.bset_u64s[0] = le16_to_cpu(set1->u64s); -+ n1->nr.packed_keys = nr_packed; -+ n1->nr.unpacked_keys = nr_unpacked; -+ -+ BUG_ON(!set1->u64s); -+ BUG_ON(!set2->u64s); -+ -+ memcpy_u64s(set2->start, -+ vstruct_end(set1), -+ le16_to_cpu(set2->u64s)); -+ -+ btree_node_reset_sib_u64s(n1); -+ btree_node_reset_sib_u64s(n2); -+ -+ bch2_verify_btree_nr_keys(n1); -+ bch2_verify_btree_nr_keys(n2); -+ -+ if (n1->c.level) { -+ btree_node_interior_verify(n1); -+ btree_node_interior_verify(n2); -+ } -+ -+ return n2; -+} -+ -+/* -+ * For updates to interior nodes, we've got to do the insert before we split -+ * because the stuff we're inserting has to be inserted atomically. Post split, -+ * the keys might have to go in different nodes and the split would no longer be -+ * atomic. 
-+ * -+ * Worse, if the insert is from btree node coalescing, if we do the insert after -+ * we do the split (and pick the pivot) - the pivot we pick might be between -+ * nodes that were coalesced, and thus in the middle of a child node post -+ * coalescing: -+ */ -+static void btree_split_insert_keys(struct btree_update *as, struct btree *b, -+ struct btree_iter *iter, -+ struct keylist *keys) -+{ -+ struct btree_node_iter node_iter; -+ struct bkey_i *k = bch2_keylist_front(keys); -+ struct bkey_packed *src, *dst, *n; -+ struct bset *i; -+ -+ BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE); -+ -+ bch2_btree_node_iter_init(&node_iter, b, &k->k.p); -+ -+ while (!bch2_keylist_empty(keys)) { -+ k = bch2_keylist_front(keys); -+ -+ bch2_insert_fixup_btree_ptr(as, b, iter, k, &node_iter); -+ bch2_keylist_pop_front(keys); -+ } -+ -+ /* -+ * We can't tolerate whiteouts here - with whiteouts there can be -+ * duplicate keys, and it would be rather bad if we picked a duplicate -+ * for the pivot: -+ */ -+ i = btree_bset_first(b); -+ src = dst = i->start; -+ while (src != vstruct_last(i)) { -+ n = bkey_next_skip_noops(src, vstruct_last(i)); -+ if (!bkey_deleted(src)) { -+ memmove_u64s_down(dst, src, src->u64s); -+ dst = bkey_next(dst); -+ } -+ src = n; -+ } -+ -+ i->u64s = cpu_to_le16((u64 *) dst - i->_data); -+ set_btree_bset_end(b, b->set); -+ -+ BUG_ON(b->nsets != 1 || -+ b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s)); -+ -+ btree_node_interior_verify(b); -+} -+ -+static void btree_split(struct btree_update *as, struct btree *b, -+ struct btree_iter *iter, struct keylist *keys, -+ unsigned flags) -+{ -+ struct bch_fs *c = as->c; -+ struct btree *parent = btree_node_parent(iter, b); -+ struct btree *n1, *n2 = NULL, *n3 = NULL; -+ u64 start_time = local_clock(); -+ -+ BUG_ON(!parent && (b != btree_node_root(c, b))); -+ BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level)); -+ -+ bch2_btree_interior_update_will_free_node(as, b); -+ -+ n1 = 
bch2_btree_node_alloc_replacement(as, b); -+ bch2_btree_update_add_new_node(as, n1); -+ -+ if (keys) -+ btree_split_insert_keys(as, n1, iter, keys); -+ -+ if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) { -+ trace_btree_split(c, b); -+ -+ n2 = __btree_split_node(as, n1, iter); -+ -+ bch2_btree_build_aux_trees(n2); -+ bch2_btree_build_aux_trees(n1); -+ six_unlock_write(&n2->c.lock); -+ six_unlock_write(&n1->c.lock); -+ -+ bch2_btree_node_write(c, n2, SIX_LOCK_intent); -+ -+ /* -+ * Note that on recursive parent_keys == keys, so we -+ * can't start adding new keys to parent_keys before emptying it -+ * out (which we did with btree_split_insert_keys() above) -+ */ -+ bch2_keylist_add(&as->parent_keys, &n1->key); -+ bch2_keylist_add(&as->parent_keys, &n2->key); -+ -+ if (!parent) { -+ /* Depth increases, make a new root */ -+ n3 = __btree_root_alloc(as, b->c.level + 1); -+ -+ n3->sib_u64s[0] = U16_MAX; -+ n3->sib_u64s[1] = U16_MAX; -+ -+ btree_split_insert_keys(as, n3, iter, &as->parent_keys); -+ -+ bch2_btree_node_write(c, n3, SIX_LOCK_intent); -+ } -+ } else { -+ trace_btree_compact(c, b); -+ -+ bch2_btree_build_aux_trees(n1); -+ six_unlock_write(&n1->c.lock); -+ -+ if (parent) -+ bch2_keylist_add(&as->parent_keys, &n1->key); -+ } -+ -+ bch2_btree_node_write(c, n1, SIX_LOCK_intent); -+ -+ /* New nodes all written, now make them visible: */ -+ -+ if (parent) { -+ /* Split a non root node */ -+ bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); -+ } else if (n3) { -+ bch2_btree_set_root(as, n3, iter); -+ } else { -+ /* Root filled up but didn't need to be split */ -+ bch2_btree_set_root(as, n1, iter); -+ } -+ -+ bch2_btree_update_get_open_buckets(as, n1); -+ if (n2) -+ bch2_btree_update_get_open_buckets(as, n2); -+ if (n3) -+ bch2_btree_update_get_open_buckets(as, n3); -+ -+ /* Successful split, update the iterator to point to the new nodes: */ -+ -+ six_lock_increment(&b->c.lock, SIX_LOCK_intent); -+ bch2_btree_iter_node_drop(iter, b); -+ if 
(n3) -+ bch2_btree_iter_node_replace(iter, n3); -+ if (n2) -+ bch2_btree_iter_node_replace(iter, n2); -+ bch2_btree_iter_node_replace(iter, n1); -+ -+ /* -+ * The old node must be freed (in memory) _before_ unlocking the new -+ * nodes - else another thread could re-acquire a read lock on the old -+ * node after another thread has locked and updated the new node, thus -+ * seeing stale data: -+ */ -+ bch2_btree_node_free_inmem(c, b, iter); -+ -+ if (n3) -+ six_unlock_intent(&n3->c.lock); -+ if (n2) -+ six_unlock_intent(&n2->c.lock); -+ six_unlock_intent(&n1->c.lock); -+ -+ bch2_btree_trans_verify_locks(iter->trans); -+ -+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_split], -+ start_time); -+} -+ -+static void -+bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, -+ struct btree_iter *iter, struct keylist *keys) -+{ -+ struct btree_iter *linked; -+ struct btree_node_iter node_iter; -+ struct bkey_i *insert = bch2_keylist_front(keys); -+ struct bkey_packed *k; -+ -+ /* Don't screw up @iter's position: */ -+ node_iter = iter->l[b->c.level].iter; -+ -+ /* -+ * btree_split(), btree_gc_coalesce() will insert keys before -+ * the iterator's current position - they know the keys go in -+ * the node the iterator points to: -+ */ -+ while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) && -+ (bkey_cmp_packed(b, k, &insert->k) >= 0)) -+ ; -+ -+ for_each_keylist_key(keys, insert) -+ bch2_insert_fixup_btree_ptr(as, b, iter, insert, &node_iter); -+ -+ btree_update_updated_node(as, b); -+ -+ trans_for_each_iter_with_node(iter->trans, b, linked) -+ bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); -+ -+ bch2_btree_trans_verify_iters(iter->trans, b); -+} -+ -+/** -+ * bch_btree_insert_node - insert bkeys into a given btree node -+ * -+ * @iter: btree iterator -+ * @keys: list of keys to insert -+ * @hook: insert callback -+ * @persistent: if not null, @persistent will wait on journal write -+ * -+ * Inserts as many keys as it can into a 
given btree node, splitting it if full. -+ * If a split occurred, this function will return early. This can only happen -+ * for leaf nodes -- inserts into interior nodes have to be atomic. -+ */ -+void bch2_btree_insert_node(struct btree_update *as, struct btree *b, -+ struct btree_iter *iter, struct keylist *keys, -+ unsigned flags) -+{ -+ struct bch_fs *c = as->c; -+ int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); -+ int old_live_u64s = b->nr.live_u64s; -+ int live_u64s_added, u64s_added; -+ -+ BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level)); -+ BUG_ON(!b->c.level); -+ BUG_ON(!as || as->b); -+ bch2_verify_keylist_sorted(keys); -+ -+ if (as->must_rewrite) -+ goto split; -+ -+ bch2_btree_node_lock_for_insert(c, b, iter); -+ -+ if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) { -+ bch2_btree_node_unlock_write(b, iter); -+ goto split; -+ } -+ -+ bch2_btree_insert_keys_interior(as, b, iter, keys); -+ -+ live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; -+ u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s; -+ -+ if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) -+ b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); -+ if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) -+ b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); -+ -+ if (u64s_added > live_u64s_added && -+ bch2_maybe_compact_whiteouts(c, b)) -+ bch2_btree_iter_reinit_node(iter, b); -+ -+ bch2_btree_node_unlock_write(b, iter); -+ -+ btree_node_interior_verify(b); -+ -+ /* -+ * when called from the btree_split path the new nodes aren't added to -+ * the btree iterator yet, so the merge path's unlock/wait/relock dance -+ * won't work: -+ */ -+ bch2_foreground_maybe_merge(c, iter, b->c.level, -+ flags|BTREE_INSERT_NOUNLOCK); -+ return; -+split: -+ btree_split(as, b, iter, keys, flags); -+} -+ -+int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, -+ unsigned flags) -+{ -+ struct btree_trans 
*trans = iter->trans; -+ struct btree *b = iter_l(iter)->b; -+ struct btree_update *as; -+ struct closure cl; -+ int ret = 0; -+ struct btree_insert_entry *i; -+ -+ /* -+ * We already have a disk reservation and open buckets pinned; this -+ * allocation must not block: -+ */ -+ trans_for_each_update(trans, i) -+ if (btree_node_type_needs_gc(i->iter->btree_id)) -+ flags |= BTREE_INSERT_USE_RESERVE; -+ -+ closure_init_stack(&cl); -+ -+ /* Hack, because gc and splitting nodes doesn't mix yet: */ -+ if (!(flags & BTREE_INSERT_GC_LOCK_HELD) && -+ !down_read_trylock(&c->gc_lock)) { -+ if (flags & BTREE_INSERT_NOUNLOCK) { -+ trace_transaction_restart_ip(trans->ip, _THIS_IP_); -+ return -EINTR; -+ } -+ -+ bch2_trans_unlock(trans); -+ down_read(&c->gc_lock); -+ -+ if (!bch2_trans_relock(trans)) -+ ret = -EINTR; -+ } -+ -+ /* -+ * XXX: figure out how far we might need to split, -+ * instead of locking/reserving all the way to the root: -+ */ -+ if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { -+ trace_trans_restart_iter_upgrade(trans->ip); -+ ret = -EINTR; -+ goto out; -+ } -+ -+ as = bch2_btree_update_start(trans, iter->btree_id, -+ btree_update_reserve_required(c, b), flags, -+ !(flags & BTREE_INSERT_NOUNLOCK) ? 
&cl : NULL); -+ if (IS_ERR(as)) { -+ ret = PTR_ERR(as); -+ if (ret == -EAGAIN) { -+ BUG_ON(flags & BTREE_INSERT_NOUNLOCK); -+ bch2_trans_unlock(trans); -+ ret = -EINTR; -+ -+ trace_transaction_restart_ip(trans->ip, _THIS_IP_); -+ } -+ goto out; -+ } -+ -+ btree_split(as, b, iter, NULL, flags); -+ bch2_btree_update_done(as); -+ -+ /* -+ * We haven't successfully inserted yet, so don't downgrade all the way -+ * back to read locks; -+ */ -+ __bch2_btree_iter_downgrade(iter, 1); -+out: -+ if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) -+ up_read(&c->gc_lock); -+ closure_sync(&cl); -+ return ret; -+} -+ -+void __bch2_foreground_maybe_merge(struct bch_fs *c, -+ struct btree_iter *iter, -+ unsigned level, -+ unsigned flags, -+ enum btree_node_sibling sib) -+{ -+ struct btree_trans *trans = iter->trans; -+ struct btree_update *as; -+ struct bkey_format_state new_s; -+ struct bkey_format new_f; -+ struct bkey_i delete; -+ struct btree *b, *m, *n, *prev, *next, *parent; -+ struct closure cl; -+ size_t sib_u64s; -+ int ret = 0; -+ -+ BUG_ON(!btree_node_locked(iter, level)); -+ -+ closure_init_stack(&cl); -+retry: -+ BUG_ON(!btree_node_locked(iter, level)); -+ -+ b = iter->l[level].b; -+ -+ parent = btree_node_parent(iter, b); -+ if (!parent) -+ goto out; -+ -+ if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) -+ goto out; -+ -+ /* XXX: can't be holding read locks */ -+ m = bch2_btree_node_get_sibling(c, iter, b, sib); -+ if (IS_ERR(m)) { -+ ret = PTR_ERR(m); -+ goto err; -+ } -+ -+ /* NULL means no sibling: */ -+ if (!m) { -+ b->sib_u64s[sib] = U16_MAX; -+ goto out; -+ } -+ -+ if (sib == btree_prev_sib) { -+ prev = m; -+ next = b; -+ } else { -+ prev = b; -+ next = m; -+ } -+ -+ bch2_bkey_format_init(&new_s); -+ __bch2_btree_calc_format(&new_s, b); -+ __bch2_btree_calc_format(&new_s, m); -+ new_f = bch2_bkey_format_done(&new_s); -+ -+ sib_u64s = btree_node_u64s_with_format(b, &new_f) + -+ btree_node_u64s_with_format(m, &new_f); -+ -+ if (sib_u64s > 
BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) { -+ sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c); -+ sib_u64s /= 2; -+ sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c); -+ } -+ -+ sib_u64s = min(sib_u64s, btree_max_u64s(c)); -+ b->sib_u64s[sib] = sib_u64s; -+ -+ if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) { -+ six_unlock_intent(&m->c.lock); -+ goto out; -+ } -+ -+ /* We're changing btree topology, doesn't mix with gc: */ -+ if (!(flags & BTREE_INSERT_GC_LOCK_HELD) && -+ !down_read_trylock(&c->gc_lock)) -+ goto err_cycle_gc_lock; -+ -+ if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { -+ ret = -EINTR; -+ goto err_unlock; -+ } -+ -+ as = bch2_btree_update_start(trans, iter->btree_id, -+ btree_update_reserve_required(c, parent) + 1, -+ flags| -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_USE_RESERVE, -+ !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL); -+ if (IS_ERR(as)) { -+ ret = PTR_ERR(as); -+ goto err_unlock; -+ } -+ -+ trace_btree_merge(c, b); -+ -+ bch2_btree_interior_update_will_free_node(as, b); -+ bch2_btree_interior_update_will_free_node(as, m); -+ -+ n = bch2_btree_node_alloc(as, b->c.level); -+ bch2_btree_update_add_new_node(as, n); -+ -+ btree_set_min(n, prev->data->min_key); -+ btree_set_max(n, next->data->max_key); -+ n->data->format = new_f; -+ -+ btree_node_set_format(n, new_f); -+ -+ bch2_btree_sort_into(c, n, prev); -+ bch2_btree_sort_into(c, n, next); -+ -+ bch2_btree_build_aux_trees(n); -+ six_unlock_write(&n->c.lock); -+ -+ bkey_init(&delete.k); -+ delete.k.p = prev->key.k.p; -+ bch2_keylist_add(&as->parent_keys, &delete); -+ bch2_keylist_add(&as->parent_keys, &n->key); -+ -+ bch2_btree_node_write(c, n, SIX_LOCK_intent); -+ -+ bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); -+ -+ bch2_btree_update_get_open_buckets(as, n); -+ -+ six_lock_increment(&b->c.lock, SIX_LOCK_intent); -+ bch2_btree_iter_node_drop(iter, b); -+ bch2_btree_iter_node_drop(iter, m); -+ -+ bch2_btree_iter_node_replace(iter, n); -+ -+ 
bch2_btree_trans_verify_iters(trans, n); -+ -+ bch2_btree_node_free_inmem(c, b, iter); -+ bch2_btree_node_free_inmem(c, m, iter); -+ -+ six_unlock_intent(&n->c.lock); -+ -+ bch2_btree_update_done(as); -+ -+ if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) -+ up_read(&c->gc_lock); -+out: -+ bch2_btree_trans_verify_locks(trans); -+ -+ /* -+ * Don't downgrade locks here: we're called after successful insert, -+ * and the caller will downgrade locks after a successful insert -+ * anyways (in case e.g. a split was required first) -+ * -+ * And we're also called when inserting into interior nodes in the -+ * split path, and downgrading to read locks in there is potentially -+ * confusing: -+ */ -+ closure_sync(&cl); -+ return; -+ -+err_cycle_gc_lock: -+ six_unlock_intent(&m->c.lock); -+ -+ if (flags & BTREE_INSERT_NOUNLOCK) -+ goto out; -+ -+ bch2_trans_unlock(trans); -+ -+ down_read(&c->gc_lock); -+ up_read(&c->gc_lock); -+ ret = -EINTR; -+ goto err; -+ -+err_unlock: -+ six_unlock_intent(&m->c.lock); -+ if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) -+ up_read(&c->gc_lock); -+err: -+ BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK)); -+ -+ if ((ret == -EAGAIN || ret == -EINTR) && -+ !(flags & BTREE_INSERT_NOUNLOCK)) { -+ bch2_trans_unlock(trans); -+ closure_sync(&cl); -+ ret = bch2_btree_iter_traverse(iter); -+ if (ret) -+ goto out; -+ -+ goto retry; -+ } -+ -+ goto out; -+} -+ -+static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, -+ struct btree *b, unsigned flags, -+ struct closure *cl) -+{ -+ struct btree *n, *parent = btree_node_parent(iter, b); -+ struct btree_update *as; -+ -+ as = bch2_btree_update_start(iter->trans, iter->btree_id, -+ (parent -+ ? 
btree_update_reserve_required(c, parent) -+ : 0) + 1, -+ flags, cl); -+ if (IS_ERR(as)) { -+ trace_btree_gc_rewrite_node_fail(c, b); -+ return PTR_ERR(as); -+ } -+ -+ bch2_btree_interior_update_will_free_node(as, b); -+ -+ n = bch2_btree_node_alloc_replacement(as, b); -+ bch2_btree_update_add_new_node(as, n); -+ -+ bch2_btree_build_aux_trees(n); -+ six_unlock_write(&n->c.lock); -+ -+ trace_btree_gc_rewrite_node(c, b); -+ -+ bch2_btree_node_write(c, n, SIX_LOCK_intent); -+ -+ if (parent) { -+ bch2_keylist_add(&as->parent_keys, &n->key); -+ bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); -+ } else { -+ bch2_btree_set_root(as, n, iter); -+ } -+ -+ bch2_btree_update_get_open_buckets(as, n); -+ -+ six_lock_increment(&b->c.lock, SIX_LOCK_intent); -+ bch2_btree_iter_node_drop(iter, b); -+ bch2_btree_iter_node_replace(iter, n); -+ bch2_btree_node_free_inmem(c, b, iter); -+ six_unlock_intent(&n->c.lock); -+ -+ bch2_btree_update_done(as); -+ return 0; -+} -+ -+/** -+ * bch_btree_node_rewrite - Rewrite/move a btree node -+ * -+ * Returns 0 on success, -EINTR or -EAGAIN on failure (i.e. 
-+ * btree_check_reserve() has to wait) -+ */ -+int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, -+ __le64 seq, unsigned flags) -+{ -+ struct btree_trans *trans = iter->trans; -+ struct closure cl; -+ struct btree *b; -+ int ret; -+ -+ flags |= BTREE_INSERT_NOFAIL; -+ -+ closure_init_stack(&cl); -+ -+ bch2_btree_iter_upgrade(iter, U8_MAX); -+ -+ if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) { -+ if (!down_read_trylock(&c->gc_lock)) { -+ bch2_trans_unlock(trans); -+ down_read(&c->gc_lock); -+ } -+ } -+ -+ while (1) { -+ ret = bch2_btree_iter_traverse(iter); -+ if (ret) -+ break; -+ -+ b = bch2_btree_iter_peek_node(iter); -+ if (!b || b->data->keys.seq != seq) -+ break; -+ -+ ret = __btree_node_rewrite(c, iter, b, flags, &cl); -+ if (ret != -EAGAIN && -+ ret != -EINTR) -+ break; -+ -+ bch2_trans_unlock(trans); -+ closure_sync(&cl); -+ } -+ -+ bch2_btree_iter_downgrade(iter); -+ -+ if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) -+ up_read(&c->gc_lock); -+ -+ closure_sync(&cl); -+ return ret; -+} -+ -+static void __bch2_btree_node_update_key(struct bch_fs *c, -+ struct btree_update *as, -+ struct btree_iter *iter, -+ struct btree *b, struct btree *new_hash, -+ struct bkey_i *new_key) -+{ -+ struct btree *parent; -+ int ret; -+ -+ btree_update_will_delete_key(as, &b->key); -+ btree_update_will_add_key(as, new_key); -+ -+ parent = btree_node_parent(iter, b); -+ if (parent) { -+ if (new_hash) { -+ bkey_copy(&new_hash->key, new_key); -+ ret = bch2_btree_node_hash_insert(&c->btree_cache, -+ new_hash, b->c.level, b->c.btree_id); -+ BUG_ON(ret); -+ } -+ -+ bch2_keylist_add(&as->parent_keys, new_key); -+ bch2_btree_insert_node(as, parent, iter, &as->parent_keys, 0); -+ -+ if (new_hash) { -+ mutex_lock(&c->btree_cache.lock); -+ bch2_btree_node_hash_remove(&c->btree_cache, new_hash); -+ -+ bch2_btree_node_hash_remove(&c->btree_cache, b); -+ -+ bkey_copy(&b->key, new_key); -+ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); -+ BUG_ON(ret); -+ 
mutex_unlock(&c->btree_cache.lock); -+ } else { -+ bkey_copy(&b->key, new_key); -+ } -+ } else { -+ BUG_ON(btree_node_root(c, b) != b); -+ -+ bch2_btree_node_lock_write(b, iter); -+ bkey_copy(&b->key, new_key); -+ -+ if (btree_ptr_hash_val(&b->key) != b->hash_val) { -+ mutex_lock(&c->btree_cache.lock); -+ bch2_btree_node_hash_remove(&c->btree_cache, b); -+ -+ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); -+ BUG_ON(ret); -+ mutex_unlock(&c->btree_cache.lock); -+ } -+ -+ btree_update_updated_root(as, b); -+ bch2_btree_node_unlock_write(b, iter); -+ } -+ -+ bch2_btree_update_done(as); -+} -+ -+int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, -+ struct btree *b, -+ struct bkey_i *new_key) -+{ -+ struct btree *parent = btree_node_parent(iter, b); -+ struct btree_update *as = NULL; -+ struct btree *new_hash = NULL; -+ struct closure cl; -+ int ret; -+ -+ closure_init_stack(&cl); -+ -+ if (!bch2_btree_iter_upgrade(iter, U8_MAX)) -+ return -EINTR; -+ -+ if (!down_read_trylock(&c->gc_lock)) { -+ bch2_trans_unlock(iter->trans); -+ down_read(&c->gc_lock); -+ -+ if (!bch2_trans_relock(iter->trans)) { -+ ret = -EINTR; -+ goto err; -+ } -+ } -+ -+ /* -+ * check btree_ptr_hash_val() after @b is locked by -+ * btree_iter_traverse(): -+ */ -+ if (btree_ptr_hash_val(new_key) != b->hash_val) { -+ /* bch2_btree_reserve_get will unlock */ -+ ret = bch2_btree_cache_cannibalize_lock(c, &cl); -+ if (ret) { -+ bch2_trans_unlock(iter->trans); -+ up_read(&c->gc_lock); -+ closure_sync(&cl); -+ down_read(&c->gc_lock); -+ -+ if (!bch2_trans_relock(iter->trans)) { -+ ret = -EINTR; -+ goto err; -+ } -+ } -+ -+ new_hash = bch2_btree_node_mem_alloc(c); -+ } -+ -+ as = bch2_btree_update_start(iter->trans, iter->btree_id, -+ parent ? 
btree_update_reserve_required(c, parent) : 0, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_USE_RESERVE| -+ BTREE_INSERT_USE_ALLOC_RESERVE, -+ &cl); -+ -+ if (IS_ERR(as)) { -+ ret = PTR_ERR(as); -+ if (ret == -EAGAIN) -+ ret = -EINTR; -+ -+ if (ret != -EINTR) -+ goto err; -+ -+ bch2_trans_unlock(iter->trans); -+ up_read(&c->gc_lock); -+ closure_sync(&cl); -+ down_read(&c->gc_lock); -+ -+ if (!bch2_trans_relock(iter->trans)) -+ goto err; -+ } -+ -+ ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(new_key)); -+ if (ret) -+ goto err_free_update; -+ -+ __bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key); -+ -+ bch2_btree_iter_downgrade(iter); -+err: -+ if (new_hash) { -+ mutex_lock(&c->btree_cache.lock); -+ list_move(&new_hash->list, &c->btree_cache.freeable); -+ mutex_unlock(&c->btree_cache.lock); -+ -+ six_unlock_write(&new_hash->c.lock); -+ six_unlock_intent(&new_hash->c.lock); -+ } -+ up_read(&c->gc_lock); -+ closure_sync(&cl); -+ return ret; -+err_free_update: -+ bch2_btree_update_free(as); -+ goto err; -+} -+ -+/* Init code: */ -+ -+/* -+ * Only for filesystem bringup, when first reading the btree roots or allocating -+ * btree roots when initializing a new filesystem: -+ */ -+void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b) -+{ -+ BUG_ON(btree_node_root(c, b)); -+ -+ bch2_btree_set_root_inmem(c, b); -+} -+ -+void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) -+{ -+ struct closure cl; -+ struct btree *b; -+ int ret; -+ -+ closure_init_stack(&cl); -+ -+ do { -+ ret = bch2_btree_cache_cannibalize_lock(c, &cl); -+ closure_sync(&cl); -+ } while (ret); -+ -+ b = bch2_btree_node_mem_alloc(c); -+ bch2_btree_cache_cannibalize_unlock(c); -+ -+ set_btree_node_fake(b); -+ set_btree_node_need_rewrite(b); -+ b->c.level = 0; -+ b->c.btree_id = id; -+ -+ bkey_btree_ptr_init(&b->key); -+ b->key.k.p = POS_MAX; -+ *((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id; -+ -+ bch2_bset_init_first(b, &b->data->keys); -+ 
bch2_btree_build_aux_trees(b); -+ -+ b->data->flags = 0; -+ btree_set_min(b, POS_MIN); -+ btree_set_max(b, POS_MAX); -+ b->data->format = bch2_btree_calc_format(b); -+ btree_node_set_format(b, b->data->format); -+ -+ ret = bch2_btree_node_hash_insert(&c->btree_cache, b, -+ b->c.level, b->c.btree_id); -+ BUG_ON(ret); -+ -+ bch2_btree_set_root_inmem(c, b); -+ -+ six_unlock_write(&b->c.lock); -+ six_unlock_intent(&b->c.lock); -+} -+ -+ssize_t bch2_btree_updates_print(struct bch_fs *c, char *buf) -+{ -+ struct printbuf out = _PBUF(buf, PAGE_SIZE); -+ struct btree_update *as; -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ list_for_each_entry(as, &c->btree_interior_update_list, list) -+ pr_buf(&out, "%p m %u w %u r %u j %llu\n", -+ as, -+ as->mode, -+ as->nodes_written, -+ atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK, -+ as->journal.seq); -+ mutex_unlock(&c->btree_interior_update_lock); -+ -+ return out.pos - buf; -+} -+ -+size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c) -+{ -+ size_t ret = 0; -+ struct list_head *i; -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ list_for_each(i, &c->btree_interior_update_list) -+ ret++; -+ mutex_unlock(&c->btree_interior_update_lock); -+ -+ return ret; -+} -+ -+void bch2_journal_entries_to_btree_roots(struct bch_fs *c, struct jset *jset) -+{ -+ struct btree_root *r; -+ struct jset_entry *entry; -+ -+ mutex_lock(&c->btree_root_lock); -+ -+ vstruct_for_each(jset, entry) -+ if (entry->type == BCH_JSET_ENTRY_btree_root) { -+ r = &c->btree_roots[entry->btree_id]; -+ r->level = entry->level; -+ r->alive = true; -+ bkey_copy(&r->key, &entry->start[0]); -+ } -+ -+ mutex_unlock(&c->btree_root_lock); -+} -+ -+struct jset_entry * -+bch2_btree_roots_to_journal_entries(struct bch_fs *c, -+ struct jset_entry *start, -+ struct jset_entry *end) -+{ -+ struct jset_entry *entry; -+ unsigned long have = 0; -+ unsigned i; -+ -+ for (entry = start; entry < end; entry = vstruct_next(entry)) -+ if (entry->type == 
BCH_JSET_ENTRY_btree_root) -+ __set_bit(entry->btree_id, &have); -+ -+ mutex_lock(&c->btree_root_lock); -+ -+ for (i = 0; i < BTREE_ID_NR; i++) -+ if (c->btree_roots[i].alive && !test_bit(i, &have)) { -+ journal_entry_set(end, -+ BCH_JSET_ENTRY_btree_root, -+ i, c->btree_roots[i].level, -+ &c->btree_roots[i].key, -+ c->btree_roots[i].key.u64s); -+ end = vstruct_next(end); -+ } -+ -+ mutex_unlock(&c->btree_root_lock); -+ -+ return end; -+} -+ -+void bch2_fs_btree_interior_update_exit(struct bch_fs *c) -+{ -+ if (c->btree_interior_update_worker) -+ destroy_workqueue(c->btree_interior_update_worker); -+ mempool_exit(&c->btree_interior_update_pool); -+} -+ -+int bch2_fs_btree_interior_update_init(struct bch_fs *c) -+{ -+ mutex_init(&c->btree_reserve_cache_lock); -+ INIT_LIST_HEAD(&c->btree_interior_update_list); -+ INIT_LIST_HEAD(&c->btree_interior_updates_unwritten); -+ mutex_init(&c->btree_interior_update_lock); -+ INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work); -+ -+ c->btree_interior_update_worker = -+ alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1); -+ if (!c->btree_interior_update_worker) -+ return -ENOMEM; -+ -+ return mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, -+ sizeof(struct btree_update)); -+} -diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h -new file mode 100644 -index 000000000000..4a5b9dcfbdd0 ---- /dev/null -+++ b/fs/bcachefs/btree_update_interior.h -@@ -0,0 +1,331 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H -+#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H -+ -+#include "btree_cache.h" -+#include "btree_locking.h" -+#include "btree_update.h" -+ -+void __bch2_btree_calc_format(struct bkey_format_state *, struct btree *); -+bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *, -+ struct bkey_format *); -+ -+#define BTREE_UPDATE_NODES_MAX ((BTREE_MAX_DEPTH - 2) * 2 + GC_MERGE_NODES) -+ -+#define 
BTREE_UPDATE_JOURNAL_RES (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1)) -+ -+/* -+ * Tracks an in progress split/rewrite of a btree node and the update to the -+ * parent node: -+ * -+ * When we split/rewrite a node, we do all the updates in memory without -+ * waiting for any writes to complete - we allocate the new node(s) and update -+ * the parent node, possibly recursively up to the root. -+ * -+ * The end result is that we have one or more new nodes being written - -+ * possibly several, if there were multiple splits - and then a write (updating -+ * an interior node) which will make all these new nodes visible. -+ * -+ * Additionally, as we split/rewrite nodes we free the old nodes - but the old -+ * nodes can't be freed (their space on disk can't be reclaimed) until the -+ * update to the interior node that makes the new node visible completes - -+ * until then, the old nodes are still reachable on disk. -+ * -+ */ -+struct btree_update { -+ struct closure cl; -+ struct bch_fs *c; -+ -+ struct list_head list; -+ struct list_head unwritten_list; -+ -+ /* What kind of update are we doing? */ -+ enum { -+ BTREE_INTERIOR_NO_UPDATE, -+ BTREE_INTERIOR_UPDATING_NODE, -+ BTREE_INTERIOR_UPDATING_ROOT, -+ BTREE_INTERIOR_UPDATING_AS, -+ } mode; -+ -+ unsigned must_rewrite:1; -+ unsigned nodes_written:1; -+ -+ enum btree_id btree_id; -+ -+ struct disk_reservation disk_res; -+ struct journal_preres journal_preres; -+ -+ /* -+ * BTREE_INTERIOR_UPDATING_NODE: -+ * The update that made the new nodes visible was a regular update to an -+ * existing interior node - @b. 
We can't write out the update to @b -+ * until the new nodes we created are finished writing, so we block @b -+ * from writing by putting this btree_interior update on the -+ * @b->write_blocked list with @write_blocked_list: -+ */ -+ struct btree *b; -+ struct list_head write_blocked_list; -+ -+ /* -+ * We may be freeing nodes that were dirty, and thus had journal entries -+ * pinned: we need to transfer the oldest of those pins to the -+ * btree_update operation, and release it when the new node(s) -+ * are all persistent and reachable: -+ */ -+ struct journal_entry_pin journal; -+ -+ /* Preallocated nodes we reserve when we start the update: */ -+ struct btree *prealloc_nodes[BTREE_UPDATE_NODES_MAX]; -+ unsigned nr_prealloc_nodes; -+ -+ /* Nodes being freed: */ -+ struct keylist old_keys; -+ u64 _old_keys[BTREE_UPDATE_NODES_MAX * -+ BKEY_BTREE_PTR_VAL_U64s_MAX]; -+ -+ /* Nodes being added: */ -+ struct keylist new_keys; -+ u64 _new_keys[BTREE_UPDATE_NODES_MAX * -+ BKEY_BTREE_PTR_VAL_U64s_MAX]; -+ -+ /* New nodes, that will be made reachable by this update: */ -+ struct btree *new_nodes[BTREE_UPDATE_NODES_MAX]; -+ unsigned nr_new_nodes; -+ -+ open_bucket_idx_t open_buckets[BTREE_UPDATE_NODES_MAX * -+ BCH_REPLICAS_MAX]; -+ open_bucket_idx_t nr_open_buckets; -+ -+ unsigned journal_u64s; -+ u64 journal_entries[BTREE_UPDATE_JOURNAL_RES]; -+ -+ /* Only here to reduce stack usage on recursive splits: */ -+ struct keylist parent_keys; -+ /* -+ * Enough room for btree_split's keys without realloc - btree node -+ * pointers never have crc/compression info, so we only need to acount -+ * for the pointers for three keys -+ */ -+ u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3]; -+}; -+ -+void bch2_btree_node_free_inmem(struct bch_fs *, struct btree *, -+ struct btree_iter *); -+void bch2_btree_node_free_never_inserted(struct bch_fs *, struct btree *); -+ -+void bch2_btree_update_get_open_buckets(struct btree_update *, struct btree *); -+ -+struct btree 
*__bch2_btree_node_alloc_replacement(struct btree_update *, -+ struct btree *, -+ struct bkey_format); -+ -+void bch2_btree_update_done(struct btree_update *); -+struct btree_update * -+bch2_btree_update_start(struct btree_trans *, enum btree_id, unsigned, -+ unsigned, struct closure *); -+ -+void bch2_btree_interior_update_will_free_node(struct btree_update *, -+ struct btree *); -+void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); -+ -+void bch2_btree_insert_node(struct btree_update *, struct btree *, -+ struct btree_iter *, struct keylist *, -+ unsigned); -+int bch2_btree_split_leaf(struct bch_fs *, struct btree_iter *, unsigned); -+ -+void __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *, -+ unsigned, unsigned, enum btree_node_sibling); -+ -+static inline void bch2_foreground_maybe_merge_sibling(struct bch_fs *c, -+ struct btree_iter *iter, -+ unsigned level, unsigned flags, -+ enum btree_node_sibling sib) -+{ -+ struct btree *b; -+ -+ if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE) -+ return; -+ -+ if (!bch2_btree_node_relock(iter, level)) -+ return; -+ -+ b = iter->l[level].b; -+ if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold) -+ return; -+ -+ __bch2_foreground_maybe_merge(c, iter, level, flags, sib); -+} -+ -+static inline void bch2_foreground_maybe_merge(struct bch_fs *c, -+ struct btree_iter *iter, -+ unsigned level, -+ unsigned flags) -+{ -+ bch2_foreground_maybe_merge_sibling(c, iter, level, flags, -+ btree_prev_sib); -+ bch2_foreground_maybe_merge_sibling(c, iter, level, flags, -+ btree_next_sib); -+} -+ -+void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *); -+void bch2_btree_root_alloc(struct bch_fs *, enum btree_id); -+ -+static inline unsigned btree_update_reserve_required(struct bch_fs *c, -+ struct btree *b) -+{ -+ unsigned depth = btree_node_root(c, b)->c.level + 1; -+ -+ /* -+ * Number of nodes we might have to allocate in a worst case btree -+ * split operation - we split all 
the way up to the root, then allocate -+ * a new root, unless we're already at max depth: -+ */ -+ if (depth < BTREE_MAX_DEPTH) -+ return (depth - b->c.level) * 2 + 1; -+ else -+ return (depth - b->c.level) * 2 - 1; -+} -+ -+static inline void btree_node_reset_sib_u64s(struct btree *b) -+{ -+ b->sib_u64s[0] = b->nr.live_u64s; -+ b->sib_u64s[1] = b->nr.live_u64s; -+} -+ -+static inline void *btree_data_end(struct bch_fs *c, struct btree *b) -+{ -+ return (void *) b->data + btree_bytes(c); -+} -+ -+static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c, -+ struct btree *b) -+{ -+ return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s); -+} -+ -+static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c, -+ struct btree *b) -+{ -+ return btree_data_end(c, b); -+} -+ -+static inline void *write_block(struct btree *b) -+{ -+ return (void *) b->data + (b->written << 9); -+} -+ -+static inline bool __btree_addr_written(struct btree *b, void *p) -+{ -+ return p < write_block(b); -+} -+ -+static inline bool bset_written(struct btree *b, struct bset *i) -+{ -+ return __btree_addr_written(b, i); -+} -+ -+static inline bool bkey_written(struct btree *b, struct bkey_packed *k) -+{ -+ return __btree_addr_written(b, k); -+} -+ -+static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c, -+ struct btree *b, -+ void *end) -+{ -+ ssize_t used = bset_byte_offset(b, end) / sizeof(u64) + -+ b->whiteout_u64s; -+ ssize_t total = c->opts.btree_node_size << 6; -+ -+ return total - used; -+} -+ -+static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c, -+ struct btree *b) -+{ -+ ssize_t remaining = __bch_btree_u64s_remaining(c, b, -+ btree_bkey_last(b, bset_tree_last(b))); -+ -+ BUG_ON(remaining < 0); -+ -+ if (bset_written(b, btree_bset_last(b))) -+ return 0; -+ -+ return remaining; -+} -+ -+static inline unsigned btree_write_set_buffer(struct btree *b) -+{ -+ /* -+ * Could buffer up larger amounts of keys for btrees with 
larger keys, -+ * pending benchmarking: -+ */ -+ return 4 << 10; -+} -+ -+static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, -+ struct btree *b) -+{ -+ struct bset_tree *t = bset_tree_last(b); -+ struct btree_node_entry *bne = max(write_block(b), -+ (void *) btree_bkey_last(b, bset_tree_last(b))); -+ ssize_t remaining_space = -+ __bch_btree_u64s_remaining(c, b, &bne->keys.start[0]); -+ -+ if (unlikely(bset_written(b, bset(b, t)))) { -+ if (remaining_space > (ssize_t) (block_bytes(c) >> 3)) -+ return bne; -+ } else { -+ if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) && -+ remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3)) -+ return bne; -+ } -+ -+ return NULL; -+} -+ -+static inline void push_whiteout(struct bch_fs *c, struct btree *b, -+ struct bpos pos) -+{ -+ struct bkey_packed k; -+ -+ BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s); -+ -+ if (!bkey_pack_pos(&k, pos, b)) { -+ struct bkey *u = (void *) &k; -+ -+ bkey_init(u); -+ u->p = pos; -+ } -+ -+ k.needs_whiteout = true; -+ -+ b->whiteout_u64s += k.u64s; -+ bkey_copy(unwritten_whiteouts_start(c, b), &k); -+} -+ -+/* -+ * write lock must be held on @b (else the dirty bset that we were going to -+ * insert into could be written out from under us) -+ */ -+static inline bool bch2_btree_node_insert_fits(struct bch_fs *c, -+ struct btree *b, unsigned u64s) -+{ -+ if (unlikely(btree_node_fake(b))) -+ return false; -+ -+ return u64s <= bch_btree_keys_u64s_remaining(c, b); -+} -+ -+ssize_t bch2_btree_updates_print(struct bch_fs *, char *); -+ -+size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *); -+ -+void bch2_journal_entries_to_btree_roots(struct bch_fs *, struct jset *); -+struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *, -+ struct jset_entry *, struct jset_entry *); -+ -+void bch2_fs_btree_interior_update_exit(struct bch_fs *); -+int bch2_fs_btree_interior_update_init(struct bch_fs *); -+ -+#endif /* 
_BCACHEFS_BTREE_UPDATE_INTERIOR_H */ -diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c -new file mode 100644 -index 000000000000..cf4105e83eda ---- /dev/null -+++ b/fs/bcachefs/btree_update_leaf.c -@@ -0,0 +1,1174 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "btree_update.h" -+#include "btree_update_interior.h" -+#include "btree_gc.h" -+#include "btree_io.h" -+#include "btree_iter.h" -+#include "btree_key_cache.h" -+#include "btree_locking.h" -+#include "buckets.h" -+#include "debug.h" -+#include "error.h" -+#include "extent_update.h" -+#include "journal.h" -+#include "journal_reclaim.h" -+#include "keylist.h" -+#include "replicas.h" -+ -+#include -+#include -+#include -+ -+static inline bool same_leaf_as_prev(struct btree_trans *trans, -+ struct btree_insert_entry *i) -+{ -+ return i != trans->updates2 && -+ iter_l(i[0].iter)->b == iter_l(i[-1].iter)->b; -+} -+ -+inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, -+ struct btree_iter *iter) -+{ -+ bch2_btree_node_lock_write(b, iter); -+ -+ if (btree_iter_type(iter) == BTREE_ITER_CACHED) -+ return; -+ -+ if (unlikely(btree_node_just_written(b)) && -+ bch2_btree_post_write_cleanup(c, b)) -+ bch2_btree_iter_reinit_node(iter, b); -+ -+ /* -+ * If the last bset has been written, or if it's gotten too big - start -+ * a new bset to insert into: -+ */ -+ if (want_new_bset(c, b)) -+ bch2_btree_init_next(c, b, iter); -+} -+ -+/* Inserting into a given leaf node (last stage of insert): */ -+ -+/* Handle overwrites and do insert, for non extents: */ -+bool bch2_btree_bset_insert_key(struct btree_iter *iter, -+ struct btree *b, -+ struct btree_node_iter *node_iter, -+ struct bkey_i *insert) -+{ -+ struct bkey_packed *k; -+ unsigned clobber_u64s = 0, new_u64s = 0; -+ -+ EBUG_ON(btree_node_just_written(b)); -+ EBUG_ON(bset_written(b, btree_bset_last(b))); -+ EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); -+ 
EBUG_ON(bkey_cmp(b->data->min_key, POS_MIN) && -+ bkey_cmp(bkey_start_pos(&insert->k), -+ bkey_predecessor(b->data->min_key)) < 0); -+ EBUG_ON(bkey_cmp(insert->k.p, b->data->min_key) < 0); -+ EBUG_ON(bkey_cmp(insert->k.p, b->data->max_key) > 0); -+ EBUG_ON(insert->k.u64s > -+ bch_btree_keys_u64s_remaining(iter->trans->c, b)); -+ EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); -+ -+ k = bch2_btree_node_iter_peek_all(node_iter, b); -+ if (k && bkey_cmp_packed(b, k, &insert->k)) -+ k = NULL; -+ -+ /* @k is the key being overwritten/deleted, if any: */ -+ EBUG_ON(k && bkey_whiteout(k)); -+ -+ /* Deleting, but not found? nothing to do: */ -+ if (bkey_whiteout(&insert->k) && !k) -+ return false; -+ -+ if (bkey_whiteout(&insert->k)) { -+ /* Deleting: */ -+ btree_account_key_drop(b, k); -+ k->type = KEY_TYPE_deleted; -+ -+ if (k->needs_whiteout) -+ push_whiteout(iter->trans->c, b, insert->k.p); -+ k->needs_whiteout = false; -+ -+ if (k >= btree_bset_last(b)->start) { -+ clobber_u64s = k->u64s; -+ bch2_bset_delete(b, k, clobber_u64s); -+ goto fix_iter; -+ } else { -+ bch2_btree_iter_fix_key_modified(iter, b, k); -+ } -+ -+ return true; -+ } -+ -+ if (k) { -+ /* Overwriting: */ -+ btree_account_key_drop(b, k); -+ k->type = KEY_TYPE_deleted; -+ -+ insert->k.needs_whiteout = k->needs_whiteout; -+ k->needs_whiteout = false; -+ -+ if (k >= btree_bset_last(b)->start) { -+ clobber_u64s = k->u64s; -+ goto overwrite; -+ } else { -+ bch2_btree_iter_fix_key_modified(iter, b, k); -+ } -+ } -+ -+ k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b)); -+overwrite: -+ bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); -+ new_u64s = k->u64s; -+fix_iter: -+ if (clobber_u64s != new_u64s) -+ bch2_btree_node_iter_fix(iter, b, node_iter, k, -+ clobber_u64s, new_u64s); -+ return true; -+} -+ -+static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, -+ unsigned i, u64 seq) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct 
btree_write *w = container_of(pin, struct btree_write, journal); -+ struct btree *b = container_of(w, struct btree, writes[i]); -+ -+ btree_node_lock_type(c, b, SIX_LOCK_read); -+ bch2_btree_node_write_cond(c, b, -+ (btree_current_write(b) == w && w->journal.seq == seq)); -+ six_unlock_read(&b->c.lock); -+} -+ -+static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) -+{ -+ return __btree_node_flush(j, pin, 0, seq); -+} -+ -+static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) -+{ -+ return __btree_node_flush(j, pin, 1, seq); -+} -+ -+inline void bch2_btree_add_journal_pin(struct bch_fs *c, -+ struct btree *b, u64 seq) -+{ -+ struct btree_write *w = btree_current_write(b); -+ -+ bch2_journal_pin_add(&c->journal, seq, &w->journal, -+ btree_node_write_idx(b) == 0 -+ ? btree_node_flush0 -+ : btree_node_flush1); -+} -+ -+/** -+ * btree_insert_key - insert a key one key into a leaf node -+ */ -+static bool btree_insert_key_leaf(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_i *insert) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree *b = iter_l(iter)->b; -+ struct bset_tree *t = bset_tree_last(b); -+ struct bset *i = bset(b, t); -+ int old_u64s = bset_u64s(t); -+ int old_live_u64s = b->nr.live_u64s; -+ int live_u64s_added, u64s_added; -+ -+ EBUG_ON(!iter->level && -+ !test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)); -+ -+ if (unlikely(!bch2_btree_bset_insert_key(iter, b, -+ &iter_l(iter)->iter, insert))) -+ return false; -+ -+ i->journal_seq = cpu_to_le64(max(trans->journal_res.seq, -+ le64_to_cpu(i->journal_seq))); -+ -+ bch2_btree_add_journal_pin(c, b, trans->journal_res.seq); -+ -+ if (unlikely(!btree_node_dirty(b))) -+ set_btree_node_dirty(b); -+ -+ live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; -+ u64s_added = (int) bset_u64s(t) - old_u64s; -+ -+ if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) -+ b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + 
live_u64s_added); -+ if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) -+ b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); -+ -+ if (u64s_added > live_u64s_added && -+ bch2_maybe_compact_whiteouts(c, b)) -+ bch2_btree_iter_reinit_node(iter, b); -+ -+ trace_btree_insert_key(c, b, insert); -+ return true; -+} -+ -+/* Cached btree updates: */ -+ -+/* Normal update interface: */ -+ -+static inline void btree_insert_entry_checks(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_i *insert) -+{ -+ struct bch_fs *c = trans->c; -+ -+ BUG_ON(bkey_cmp(insert->k.p, iter->pos)); -+ BUG_ON(debug_check_bkeys(c) && -+ bch2_bkey_invalid(c, bkey_i_to_s_c(insert), -+ __btree_node_type(iter->level, iter->btree_id))); -+} -+ -+static noinline int -+bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s) -+{ -+ struct bch_fs *c = trans->c; -+ int ret; -+ -+ bch2_trans_unlock(trans); -+ -+ ret = bch2_journal_preres_get(&c->journal, -+ &trans->journal_preres, u64s, 0); -+ if (ret) -+ return ret; -+ -+ if (!bch2_trans_relock(trans)) { -+ trace_trans_restart_journal_preres_get(trans->ip); -+ return -EINTR; -+ } -+ -+ return 0; -+} -+ -+static inline int bch2_trans_journal_res_get(struct btree_trans *trans, -+ unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ int ret; -+ -+ if (trans->flags & BTREE_INSERT_JOURNAL_RESERVED) -+ flags |= JOURNAL_RES_GET_RESERVED; -+ -+ ret = bch2_journal_res_get(&c->journal, &trans->journal_res, -+ trans->journal_u64s, flags); -+ -+ return ret == -EAGAIN ? 
BTREE_INSERT_NEED_JOURNAL_RES : ret; -+} -+ -+static enum btree_insert_ret -+btree_key_can_insert(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_i *insert, -+ unsigned u64s) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree *b = iter_l(iter)->b; -+ -+ if (unlikely(btree_node_need_rewrite(b)) || -+ unlikely(u64s > bch_btree_keys_u64s_remaining(c, b))) -+ return BTREE_INSERT_BTREE_NODE_FULL; -+ -+ return BTREE_INSERT_OK; -+} -+ -+static enum btree_insert_ret -+btree_key_can_insert_cached(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_i *insert, -+ unsigned u64s) -+{ -+ struct bkey_cached *ck = (void *) iter->l[0].b; -+ unsigned new_u64s; -+ struct bkey_i *new_k; -+ -+ BUG_ON(iter->level); -+ -+ if (u64s <= ck->u64s) -+ return BTREE_INSERT_OK; -+ -+ new_u64s = roundup_pow_of_two(u64s); -+ new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS); -+ if (!new_k) -+ return -ENOMEM; -+ -+ ck->u64s = new_u64s; -+ ck->k = new_k; -+ return BTREE_INSERT_OK; -+} -+ -+static inline void do_btree_insert_one(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_i *insert) -+{ -+ struct bch_fs *c = trans->c; -+ struct journal *j = &c->journal; -+ bool did_work; -+ -+ EBUG_ON(trans->journal_res.ref != -+ !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); -+ -+ insert->k.needs_whiteout = false; -+ -+ did_work = (btree_iter_type(iter) != BTREE_ITER_CACHED) -+ ? 
btree_insert_key_leaf(trans, iter, insert) -+ : bch2_btree_insert_key_cached(trans, iter, insert); -+ if (!did_work) -+ return; -+ -+ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { -+ bch2_journal_add_keys(j, &trans->journal_res, -+ iter->btree_id, insert); -+ -+ bch2_journal_set_has_inode(j, &trans->journal_res, -+ insert->k.p.inode); -+ -+ if (trans->journal_seq) -+ *trans->journal_seq = trans->journal_res.seq; -+ } -+} -+ -+static inline bool iter_has_trans_triggers(struct btree_iter *iter) -+{ -+ return BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << iter->btree_id); -+} -+ -+static inline bool iter_has_nontrans_triggers(struct btree_iter *iter) -+{ -+ return (BTREE_NODE_TYPE_HAS_TRIGGERS & -+ ~BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS) & -+ (1U << iter->btree_id); -+} -+ -+static noinline void bch2_btree_iter_unlock_noinline(struct btree_iter *iter) -+{ -+ __bch2_btree_iter_unlock(iter); -+} -+ -+static noinline void bch2_trans_mark_gc(struct btree_trans *trans) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_insert_entry *i; -+ -+ trans_for_each_update(trans, i) { -+ /* -+ * XXX: synchronization of cached update triggers with gc -+ */ -+ BUG_ON(btree_iter_type(i->iter) == BTREE_ITER_CACHED); -+ -+ if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) -+ bch2_mark_update(trans, i->iter, i->k, NULL, -+ i->trigger_flags|BTREE_TRIGGER_GC); -+ } -+} -+ -+static inline int -+bch2_trans_commit_write_locked(struct btree_trans *trans, -+ struct btree_insert_entry **stopped_at) -+{ -+ struct bch_fs *c = trans->c; -+ struct bch_fs_usage *fs_usage = NULL; -+ struct btree_insert_entry *i; -+ unsigned u64s = 0; -+ bool marking = false; -+ int ret; -+ -+ if (race_fault()) { -+ trace_trans_restart_fault_inject(trans->ip); -+ return -EINTR; -+ } -+ -+ /* -+ * Check if the insert will fit in the leaf node with the write lock -+ * held, otherwise another thread could write the node changing the -+ * amount of space available: -+ */ -+ -+ 
prefetch(&trans->c->journal.flags); -+ -+ trans_for_each_update2(trans, i) { -+ /* Multiple inserts might go to same leaf: */ -+ if (!same_leaf_as_prev(trans, i)) -+ u64s = 0; -+ -+ u64s += i->k->k.u64s; -+ ret = btree_iter_type(i->iter) != BTREE_ITER_CACHED -+ ? btree_key_can_insert(trans, i->iter, i->k, u64s) -+ : btree_key_can_insert_cached(trans, i->iter, i->k, u64s); -+ if (ret) { -+ *stopped_at = i; -+ return ret; -+ } -+ -+ if (btree_node_type_needs_gc(i->iter->btree_id)) -+ marking = true; -+ } -+ -+ if (marking) { -+ percpu_down_read(&c->mark_lock); -+ fs_usage = bch2_fs_usage_scratch_get(c); -+ } -+ -+ /* -+ * Don't get journal reservation until after we know insert will -+ * succeed: -+ */ -+ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { -+ ret = bch2_trans_journal_res_get(trans, -+ JOURNAL_RES_GET_NONBLOCK); -+ if (ret) -+ goto err; -+ } else { -+ trans->journal_res.seq = c->journal.replay_journal_seq; -+ } -+ -+ if (unlikely(trans->extra_journal_entry_u64s)) { -+ memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), -+ trans->extra_journal_entries, -+ trans->extra_journal_entry_u64s); -+ -+ trans->journal_res.offset += trans->extra_journal_entry_u64s; -+ trans->journal_res.u64s -= trans->extra_journal_entry_u64s; -+ } -+ -+ /* -+ * Not allowed to fail after we've gotten our journal reservation - we -+ * have to use it: -+ */ -+ -+ if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { -+ if (journal_seq_verify(c)) -+ trans_for_each_update2(trans, i) -+ i->k->k.version.lo = trans->journal_res.seq; -+ else if (inject_invalid_keys(c)) -+ trans_for_each_update2(trans, i) -+ i->k->k.version = MAX_VERSION; -+ } -+ -+ /* Must be called under mark_lock: */ -+ if (marking && trans->fs_usage_deltas && -+ bch2_replicas_delta_list_apply(c, fs_usage, -+ trans->fs_usage_deltas)) { -+ ret = BTREE_INSERT_NEED_MARK_REPLICAS; -+ goto err; -+ } -+ -+ trans_for_each_update(trans, i) -+ if (iter_has_nontrans_triggers(i->iter)) -+ 
bch2_mark_update(trans, i->iter, i->k, -+ fs_usage, i->trigger_flags); -+ -+ if (marking) -+ bch2_trans_fs_usage_apply(trans, fs_usage); -+ -+ if (unlikely(c->gc_pos.phase)) -+ bch2_trans_mark_gc(trans); -+ -+ trans_for_each_update2(trans, i) -+ do_btree_insert_one(trans, i->iter, i->k); -+err: -+ if (marking) { -+ bch2_fs_usage_scratch_put(c, fs_usage); -+ percpu_up_read(&c->mark_lock); -+ } -+ -+ return ret; -+} -+ -+/* -+ * Get journal reservation, take write locks, and attempt to do btree update(s): -+ */ -+static inline int do_bch2_trans_commit(struct btree_trans *trans, -+ struct btree_insert_entry **stopped_at) -+{ -+ struct btree_insert_entry *i; -+ struct btree_iter *iter; -+ int ret; -+ -+ trans_for_each_update2(trans, i) -+ BUG_ON(!btree_node_intent_locked(i->iter, i->iter->level)); -+ -+ ret = bch2_journal_preres_get(&trans->c->journal, -+ &trans->journal_preres, trans->journal_preres_u64s, -+ JOURNAL_RES_GET_NONBLOCK| -+ ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) -+ ? JOURNAL_RES_GET_RECLAIM : 0)); -+ if (unlikely(ret == -EAGAIN)) -+ ret = bch2_trans_journal_preres_get_cold(trans, -+ trans->journal_preres_u64s); -+ if (unlikely(ret)) -+ return ret; -+ -+ /* -+ * Can't be holding any read locks when we go to take write locks: -+ * -+ * note - this must be done after bch2_trans_journal_preres_get_cold() -+ * or anything else that might call bch2_trans_relock(), since that -+ * would just retake the read locks: -+ */ -+ trans_for_each_iter(trans, iter) { -+ if (iter->nodes_locked != iter->nodes_intent_locked) { -+ EBUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); -+ EBUG_ON(trans->iters_live & (1ULL << iter->idx)); -+ bch2_btree_iter_unlock_noinline(iter); -+ } -+ } -+ -+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) -+ trans_for_each_update2(trans, i) -+ btree_insert_entry_checks(trans, i->iter, i->k); -+ bch2_btree_trans_verify_locks(trans); -+ -+ trans_for_each_update2(trans, i) -+ if (!same_leaf_as_prev(trans, i)) -+ 
bch2_btree_node_lock_for_insert(trans->c, -+ iter_l(i->iter)->b, i->iter); -+ -+ ret = bch2_trans_commit_write_locked(trans, stopped_at); -+ -+ trans_for_each_update2(trans, i) -+ if (!same_leaf_as_prev(trans, i)) -+ bch2_btree_node_unlock_write_inlined(iter_l(i->iter)->b, -+ i->iter); -+ -+ if (!ret && trans->journal_pin) -+ bch2_journal_pin_add(&trans->c->journal, trans->journal_res.seq, -+ trans->journal_pin, NULL); -+ -+ /* -+ * Drop journal reservation after dropping write locks, since dropping -+ * the journal reservation may kick off a journal write: -+ */ -+ bch2_journal_res_put(&trans->c->journal, &trans->journal_res); -+ -+ if (unlikely(ret)) -+ return ret; -+ -+ if (trans->flags & BTREE_INSERT_NOUNLOCK) -+ trans->nounlock = true; -+ -+ trans_for_each_update2(trans, i) -+ if (btree_iter_type(i->iter) != BTREE_ITER_CACHED && -+ !same_leaf_as_prev(trans, i)) -+ bch2_foreground_maybe_merge(trans->c, i->iter, -+ 0, trans->flags); -+ -+ trans->nounlock = false; -+ -+ bch2_trans_downgrade(trans); -+ -+ return 0; -+} -+ -+static noinline -+int bch2_trans_commit_error(struct btree_trans *trans, -+ struct btree_insert_entry *i, -+ int ret) -+{ -+ struct bch_fs *c = trans->c; -+ unsigned flags = trans->flags; -+ -+ /* -+ * BTREE_INSERT_NOUNLOCK means don't unlock _after_ successful btree -+ * update; if we haven't done anything yet it doesn't apply -+ */ -+ flags &= ~BTREE_INSERT_NOUNLOCK; -+ -+ switch (ret) { -+ case BTREE_INSERT_BTREE_NODE_FULL: -+ ret = bch2_btree_split_leaf(c, i->iter, flags); -+ -+ /* -+ * if the split succeeded without dropping locks the insert will -+ * still be atomic (what the caller peeked() and is overwriting -+ * won't have changed) -+ */ -+#if 0 -+ /* -+ * XXX: -+ * split -> btree node merging (of parent node) might still drop -+ * locks when we're not passing it BTREE_INSERT_NOUNLOCK -+ * -+ * we don't want to pass BTREE_INSERT_NOUNLOCK to split as that -+ * will inhibit merging - but we don't have a reliable way yet -+ * (do we?) 
of checking if we dropped locks in this path -+ */ -+ if (!ret) -+ goto retry; -+#endif -+ -+ /* -+ * don't care if we got ENOSPC because we told split it -+ * couldn't block: -+ */ -+ if (!ret || -+ ret == -EINTR || -+ (flags & BTREE_INSERT_NOUNLOCK)) { -+ trace_trans_restart_btree_node_split(trans->ip); -+ ret = -EINTR; -+ } -+ break; -+ case BTREE_INSERT_ENOSPC: -+ ret = -ENOSPC; -+ break; -+ case BTREE_INSERT_NEED_MARK_REPLICAS: -+ bch2_trans_unlock(trans); -+ -+ trans_for_each_update(trans, i) { -+ ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(i->k)); -+ if (ret) -+ return ret; -+ } -+ -+ if (bch2_trans_relock(trans)) -+ return 0; -+ -+ trace_trans_restart_mark_replicas(trans->ip); -+ ret = -EINTR; -+ break; -+ case BTREE_INSERT_NEED_JOURNAL_RES: -+ bch2_trans_unlock(trans); -+ -+ ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_CHECK); -+ if (ret) -+ return ret; -+ -+ if (bch2_trans_relock(trans)) -+ return 0; -+ -+ trace_trans_restart_journal_res_get(trans->ip); -+ ret = -EINTR; -+ break; -+ default: -+ BUG_ON(ret >= 0); -+ break; -+ } -+ -+ if (ret == -EINTR) { -+ int ret2 = bch2_btree_iter_traverse_all(trans); -+ -+ if (ret2) { -+ trace_trans_restart_traverse(trans->ip); -+ return ret2; -+ } -+ -+ trace_trans_restart_atomic(trans->ip); -+ } -+ -+ return ret; -+} -+ -+static noinline int -+bch2_trans_commit_get_rw_cold(struct btree_trans *trans) -+{ -+ struct bch_fs *c = trans->c; -+ int ret; -+ -+ if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW))) -+ return -EROFS; -+ -+ bch2_trans_unlock(trans); -+ -+ ret = bch2_fs_read_write_early(c); -+ if (ret) -+ return ret; -+ -+ percpu_ref_get(&c->writes); -+ return 0; -+} -+ -+static void bch2_trans_update2(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_i *insert) -+{ -+ struct btree_insert_entry *i, n = (struct btree_insert_entry) { -+ .iter = iter, .k = insert -+ }; -+ -+ btree_insert_entry_checks(trans, n.iter, n.k); -+ -+ BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); -+ -+ 
EBUG_ON(trans->nr_updates2 >= trans->nr_iters); -+ -+ iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; -+ -+ trans_for_each_update2(trans, i) { -+ if (btree_iter_cmp(n.iter, i->iter) == 0) { -+ *i = n; -+ return; -+ } -+ -+ if (btree_iter_cmp(n.iter, i->iter) <= 0) -+ break; -+ } -+ -+ array_insert_item(trans->updates2, trans->nr_updates2, -+ i - trans->updates2, n); -+} -+ -+static int extent_update_to_keys(struct btree_trans *trans, -+ struct btree_iter *orig_iter, -+ struct bkey_i *insert) -+{ -+ struct btree_iter *iter; -+ int ret; -+ -+ ret = bch2_extent_can_insert(trans, orig_iter, insert); -+ if (ret) -+ return ret; -+ -+ if (bkey_deleted(&insert->k)) -+ return 0; -+ -+ iter = bch2_trans_copy_iter(trans, orig_iter); -+ if (IS_ERR(iter)) -+ return PTR_ERR(iter); -+ -+ iter->flags |= BTREE_ITER_INTENT; -+ __bch2_btree_iter_set_pos(iter, insert->k.p, false); -+ bch2_trans_update2(trans, iter, insert); -+ bch2_trans_iter_put(trans, iter); -+ return 0; -+} -+ -+static int extent_handle_overwrites(struct btree_trans *trans, -+ enum btree_id btree_id, -+ struct bpos start, struct bpos end) -+{ -+ struct btree_iter *iter = NULL, *update_iter; -+ struct bkey_i *update; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ iter = bch2_trans_get_iter(trans, btree_id, start, BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(iter); -+ if (ret) -+ return ret; -+ -+ k = bch2_btree_iter_peek_with_updates(iter); -+ -+ while (k.k && !(ret = bkey_err(k))) { -+ if (bkey_cmp(end, bkey_start_pos(k.k)) <= 0) -+ break; -+ -+ if (bkey_cmp(bkey_start_pos(k.k), start) < 0) { -+ update_iter = bch2_trans_copy_iter(trans, iter); -+ if ((ret = PTR_ERR_OR_ZERO(update_iter))) -+ goto err; -+ -+ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); -+ if ((ret = PTR_ERR_OR_ZERO(update))) -+ goto err; -+ -+ bkey_reassemble(update, k); -+ bch2_cut_back(start, update); -+ -+ __bch2_btree_iter_set_pos(update_iter, update->k.p, false); -+ bch2_trans_update2(trans, update_iter, update); -+ bch2_trans_iter_put(trans, 
update_iter); -+ } -+ -+ if (bkey_cmp(k.k->p, end) > 0) { -+ update_iter = bch2_trans_copy_iter(trans, iter); -+ if ((ret = PTR_ERR_OR_ZERO(update_iter))) -+ goto err; -+ -+ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); -+ if ((ret = PTR_ERR_OR_ZERO(update))) -+ goto err; -+ -+ bkey_reassemble(update, k); -+ bch2_cut_front(end, update); -+ -+ __bch2_btree_iter_set_pos(update_iter, update->k.p, false); -+ bch2_trans_update2(trans, update_iter, update); -+ bch2_trans_iter_put(trans, update_iter); -+ } else { -+ update_iter = bch2_trans_copy_iter(trans, iter); -+ if ((ret = PTR_ERR_OR_ZERO(update_iter))) -+ goto err; -+ -+ update = bch2_trans_kmalloc(trans, sizeof(struct bkey)); -+ if ((ret = PTR_ERR_OR_ZERO(update))) -+ goto err; -+ -+ update->k = *k.k; -+ set_bkey_val_u64s(&update->k, 0); -+ update->k.type = KEY_TYPE_deleted; -+ update->k.size = 0; -+ -+ __bch2_btree_iter_set_pos(update_iter, update->k.p, false); -+ bch2_trans_update2(trans, update_iter, update); -+ bch2_trans_iter_put(trans, update_iter); -+ } -+ -+ k = bch2_btree_iter_next_with_updates(iter); -+ } -+err: -+ if (!IS_ERR_OR_NULL(iter)) -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+int __bch2_trans_commit(struct btree_trans *trans) -+{ -+ struct btree_insert_entry *i = NULL; -+ struct btree_iter *iter; -+ bool trans_trigger_run; -+ unsigned u64s; -+ int ret = 0; -+ -+ BUG_ON(trans->need_reset); -+ -+ if (!trans->nr_updates) -+ goto out_noupdates; -+ -+ if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) -+ lockdep_assert_held(&trans->c->gc_lock); -+ -+ memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); -+ -+ trans->journal_u64s = trans->extra_journal_entry_u64s; -+ trans->journal_preres_u64s = 0; -+ -+ if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) && -+ unlikely(!percpu_ref_tryget(&trans->c->writes))) { -+ ret = bch2_trans_commit_get_rw_cold(trans); -+ if (ret) -+ return ret; -+ } -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ trans_for_each_update(trans, i) -+ if 
(btree_iter_type(i->iter) != BTREE_ITER_CACHED && -+ !(i->trigger_flags & BTREE_TRIGGER_NORUN)) -+ bch2_btree_key_cache_verify_clean(trans, -+ i->iter->btree_id, i->iter->pos); -+#endif -+ -+ /* -+ * Running triggers will append more updates to the list of updates as -+ * we're walking it: -+ */ -+ do { -+ trans_trigger_run = false; -+ -+ trans_for_each_update(trans, i) { -+ if (unlikely(i->iter->uptodate > BTREE_ITER_NEED_PEEK && -+ (ret = bch2_btree_iter_traverse(i->iter)))) { -+ trace_trans_restart_traverse(trans->ip); -+ goto out; -+ } -+ -+ /* -+ * We're not using bch2_btree_iter_upgrade here because -+ * we know trans->nounlock can't be set: -+ */ -+ if (unlikely(i->iter->locks_want < 1 && -+ !__bch2_btree_iter_upgrade(i->iter, 1))) { -+ trace_trans_restart_upgrade(trans->ip); -+ ret = -EINTR; -+ goto out; -+ } -+ -+ if (iter_has_trans_triggers(i->iter) && -+ !i->trans_triggers_run) { -+ i->trans_triggers_run = true; -+ trans_trigger_run = true; -+ -+ ret = bch2_trans_mark_update(trans, i->iter, i->k, -+ i->trigger_flags); -+ if (unlikely(ret)) { -+ if (ret == -EINTR) -+ trace_trans_restart_mark(trans->ip); -+ goto out; -+ } -+ } -+ } -+ } while (trans_trigger_run); -+ -+ /* Turn extents updates into keys: */ -+ trans_for_each_update(trans, i) -+ if (i->iter->flags & BTREE_ITER_IS_EXTENTS) { -+ struct bpos start = bkey_start_pos(&i->k->k); -+ -+ while (i + 1 < trans->updates + trans->nr_updates && -+ i[0].iter->btree_id == i[1].iter->btree_id && -+ !bkey_cmp(i[0].k->k.p, bkey_start_pos(&i[1].k->k))) -+ i++; -+ -+ ret = extent_handle_overwrites(trans, i->iter->btree_id, -+ start, i->k->k.p); -+ if (ret) -+ goto out; -+ } -+ -+ trans_for_each_update(trans, i) { -+ if (i->iter->flags & BTREE_ITER_IS_EXTENTS) { -+ ret = extent_update_to_keys(trans, i->iter, i->k); -+ if (ret) -+ goto out; -+ } else { -+ bch2_trans_update2(trans, i->iter, i->k); -+ } -+ } -+ -+ trans_for_each_update2(trans, i) { -+ BUG_ON(i->iter->uptodate > BTREE_ITER_NEED_PEEK); -+ 
BUG_ON(i->iter->locks_want < 1); -+ -+ u64s = jset_u64s(i->k->k.u64s); -+ if (btree_iter_type(i->iter) == BTREE_ITER_CACHED && -+ likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) -+ trans->journal_preres_u64s += u64s; -+ trans->journal_u64s += u64s; -+ } -+retry: -+ memset(&trans->journal_res, 0, sizeof(trans->journal_res)); -+ -+ ret = do_bch2_trans_commit(trans, &i); -+ -+ /* make sure we didn't drop or screw up locks: */ -+ bch2_btree_trans_verify_locks(trans); -+ -+ if (ret) -+ goto err; -+ -+ trans_for_each_iter(trans, iter) -+ if ((trans->iters_live & (1ULL << iter->idx)) && -+ (iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT)) { -+ if (trans->flags & BTREE_INSERT_NOUNLOCK) -+ bch2_btree_iter_set_pos_same_leaf(iter, iter->pos_after_commit); -+ else -+ bch2_btree_iter_set_pos(iter, iter->pos_after_commit); -+ } -+out: -+ bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); -+ -+ if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) -+ percpu_ref_put(&trans->c->writes); -+out_noupdates: -+ bch2_trans_reset(trans, !ret ? TRANS_RESET_NOTRAVERSE : 0); -+ -+ return ret; -+err: -+ ret = bch2_trans_commit_error(trans, i, ret); -+ if (ret) -+ goto out; -+ -+ goto retry; -+} -+ -+int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, -+ struct bkey_i *k, enum btree_trigger_flags flags) -+{ -+ struct btree_insert_entry *i, n = (struct btree_insert_entry) { -+ .trigger_flags = flags, .iter = iter, .k = k -+ }; -+ -+ EBUG_ON(bkey_cmp(iter->pos, -+ (iter->flags & BTREE_ITER_IS_EXTENTS) -+ ? 
bkey_start_pos(&k->k) -+ : k->k.p)); -+ -+ iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; -+ -+ if (btree_node_type_is_extents(iter->btree_id)) { -+ iter->pos_after_commit = k->k.p; -+ iter->flags |= BTREE_ITER_SET_POS_AFTER_COMMIT; -+ } -+ -+ /* -+ * Pending updates are kept sorted: first, find position of new update: -+ */ -+ trans_for_each_update(trans, i) -+ if (btree_iter_cmp(iter, i->iter) <= 0) -+ break; -+ -+ /* -+ * Now delete/trim any updates the new update overwrites: -+ */ -+ if (i > trans->updates && -+ i[-1].iter->btree_id == iter->btree_id && -+ bkey_cmp(iter->pos, i[-1].k->k.p) < 0) -+ bch2_cut_back(n.iter->pos, i[-1].k); -+ -+ while (i < trans->updates + trans->nr_updates && -+ iter->btree_id == i->iter->btree_id && -+ bkey_cmp(n.k->k.p, i->k->k.p) >= 0) -+ array_remove_item(trans->updates, trans->nr_updates, -+ i - trans->updates); -+ -+ if (i < trans->updates + trans->nr_updates && -+ iter->btree_id == i->iter->btree_id && -+ bkey_cmp(n.k->k.p, i->iter->pos) > 0) { -+ /* -+ * When we have an extent that overwrites the start of another -+ * update, trimming that extent will mean the iterator's -+ * position has to change since the iterator position has to -+ * match the extent's start pos - but we don't want to change -+ * the iterator pos if some other code is using it, so we may -+ * need to clone it: -+ */ -+ if (trans->iters_live & (1ULL << i->iter->idx)) { -+ i->iter = bch2_trans_copy_iter(trans, i->iter); -+ if (IS_ERR(i->iter)) { -+ trans->need_reset = true; -+ return PTR_ERR(i->iter); -+ } -+ -+ i->iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; -+ bch2_trans_iter_put(trans, i->iter); -+ } -+ -+ bch2_cut_front(n.k->k.p, i->k); -+ bch2_btree_iter_set_pos(i->iter, n.k->k.p); -+ } -+ -+ EBUG_ON(trans->nr_updates >= trans->nr_iters); -+ -+ array_insert_item(trans->updates, trans->nr_updates, -+ i - trans->updates, n); -+ return 0; -+} -+ -+int __bch2_btree_insert(struct btree_trans *trans, -+ enum btree_id id, struct bkey_i *k) -+{ -+ struct 
btree_iter *iter; -+ int ret; -+ -+ iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k), -+ BTREE_ITER_INTENT); -+ if (IS_ERR(iter)) -+ return PTR_ERR(iter); -+ -+ ret = bch2_btree_iter_traverse(iter) ?: -+ bch2_trans_update(trans, iter, k, 0); -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+/** -+ * bch2_btree_insert - insert keys into the extent btree -+ * @c: pointer to struct bch_fs -+ * @id: btree to insert into -+ * @insert_keys: list of keys to insert -+ * @hook: insert callback -+ */ -+int bch2_btree_insert(struct bch_fs *c, enum btree_id id, -+ struct bkey_i *k, -+ struct disk_reservation *disk_res, -+ u64 *journal_seq, int flags) -+{ -+ return bch2_trans_do(c, disk_res, journal_seq, flags, -+ __bch2_btree_insert(&trans, id, k)); -+} -+ -+int bch2_btree_delete_at_range(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bpos end, -+ u64 *journal_seq) -+{ -+ struct bkey_s_c k; -+ int ret = 0; -+retry: -+ while ((k = bch2_btree_iter_peek(iter)).k && -+ !(ret = bkey_err(k)) && -+ bkey_cmp(iter->pos, end) < 0) { -+ struct bkey_i delete; -+ -+ bch2_trans_begin(trans); -+ -+ bkey_init(&delete.k); -+ -+ /* -+ * For extents, iter.pos won't necessarily be the same as -+ * bkey_start_pos(k.k) (for non extents they always will be the -+ * same). It's important that we delete starting from iter.pos -+ * because the range we want to delete could start in the middle -+ * of k. -+ * -+ * (bch2_btree_iter_peek() does guarantee that iter.pos >= -+ * bkey_start_pos(k.k)). 
-+ */ -+ delete.k.p = iter->pos; -+ -+ if (btree_node_type_is_extents(iter->btree_id)) { -+ unsigned max_sectors = -+ KEY_SIZE_MAX & (~0 << trans->c->block_bits); -+ -+ /* create the biggest key we can */ -+ bch2_key_resize(&delete.k, max_sectors); -+ bch2_cut_back(end, &delete); -+ -+ ret = bch2_extent_trim_atomic(&delete, iter); -+ if (ret) -+ break; -+ } -+ -+ bch2_trans_update(trans, iter, &delete, 0); -+ ret = bch2_trans_commit(trans, NULL, journal_seq, -+ BTREE_INSERT_NOFAIL); -+ if (ret) -+ break; -+ -+ bch2_trans_cond_resched(trans); -+ } -+ -+ if (ret == -EINTR) { -+ ret = 0; -+ goto retry; -+ } -+ -+ return ret; -+ -+} -+ -+int bch2_btree_delete_at(struct btree_trans *trans, -+ struct btree_iter *iter, unsigned flags) -+{ -+ struct bkey_i k; -+ -+ bkey_init(&k.k); -+ k.k.p = iter->pos; -+ -+ bch2_trans_update(trans, iter, &k, 0); -+ return bch2_trans_commit(trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_USE_RESERVE|flags); -+} -+ -+/* -+ * bch_btree_delete_range - delete everything within a given range -+ * -+ * Range is a half open interval - [start, end) -+ */ -+int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, -+ struct bpos start, struct bpos end, -+ u64 *journal_seq) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ int ret = 0; -+ -+ /* -+ * XXX: whether we need mem/more iters depends on whether this btree id -+ * has triggers -+ */ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512); -+ -+ iter = bch2_trans_get_iter(&trans, id, start, BTREE_ITER_INTENT); -+ -+ ret = bch2_btree_delete_at_range(&trans, iter, end, journal_seq); -+ ret = bch2_trans_exit(&trans) ?: ret; -+ -+ BUG_ON(ret == -EINTR); -+ return ret; -+} -diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c -new file mode 100644 -index 000000000000..0ec194b93c71 ---- /dev/null -+++ b/fs/bcachefs/buckets.c -@@ -0,0 +1,2126 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Code for manipulating bucket marks for garbage collection. 
-+ * -+ * Copyright 2014 Datera, Inc. -+ * -+ * Bucket states: -+ * - free bucket: mark == 0 -+ * The bucket contains no data and will not be read -+ * -+ * - allocator bucket: owned_by_allocator == 1 -+ * The bucket is on a free list, or it is an open bucket -+ * -+ * - cached bucket: owned_by_allocator == 0 && -+ * dirty_sectors == 0 && -+ * cached_sectors > 0 -+ * The bucket contains data but may be safely discarded as there are -+ * enough replicas of the data on other cache devices, or it has been -+ * written back to the backing device -+ * -+ * - dirty bucket: owned_by_allocator == 0 && -+ * dirty_sectors > 0 -+ * The bucket contains data that we must not discard (either only copy, -+ * or one of the 'main copies' for data requiring multiple replicas) -+ * -+ * - metadata bucket: owned_by_allocator == 0 && is_metadata == 1 -+ * This is a btree node, journal or gen/prio bucket -+ * -+ * Lifecycle: -+ * -+ * bucket invalidated => bucket on freelist => open bucket => -+ * [dirty bucket =>] cached bucket => bucket invalidated => ... -+ * -+ * Note that cache promotion can skip the dirty bucket step, as data -+ * is copied from a deeper tier to a shallower tier, onto a cached -+ * bucket. -+ * Note also that a cached bucket can spontaneously become dirty -- -+ * see below. -+ * -+ * Only a traversal of the key space can determine whether a bucket is -+ * truly dirty or cached. 
-+ * -+ * Transitions: -+ * -+ * - free => allocator: bucket was invalidated -+ * - cached => allocator: bucket was invalidated -+ * -+ * - allocator => dirty: open bucket was filled up -+ * - allocator => cached: open bucket was filled up -+ * - allocator => metadata: metadata was allocated -+ * -+ * - dirty => cached: dirty sectors were copied to a deeper tier -+ * - dirty => free: dirty sectors were overwritten or moved (copy gc) -+ * - cached => free: cached sectors were overwritten -+ * -+ * - metadata => free: metadata was freed -+ * -+ * Oddities: -+ * - cached => dirty: a device was removed so formerly replicated data -+ * is no longer sufficiently replicated -+ * - free => cached: cannot happen -+ * - free => dirty: cannot happen -+ * - free => metadata: cannot happen -+ */ -+ -+#include "bcachefs.h" -+#include "alloc_background.h" -+#include "bset.h" -+#include "btree_gc.h" -+#include "btree_update.h" -+#include "buckets.h" -+#include "ec.h" -+#include "error.h" -+#include "movinggc.h" -+#include "replicas.h" -+ -+#include -+#include -+ -+/* -+ * Clear journal_seq_valid for buckets for which it's not needed, to prevent -+ * wraparound: -+ */ -+void bch2_bucket_seq_cleanup(struct bch_fs *c) -+{ -+ u64 journal_seq = atomic64_read(&c->journal.seq); -+ u16 last_seq_ondisk = c->journal.last_seq_ondisk; -+ struct bch_dev *ca; -+ struct bucket_array *buckets; -+ struct bucket *g; -+ struct bucket_mark m; -+ unsigned i; -+ -+ if (journal_seq - c->last_bucket_seq_cleanup < -+ (1U << (BUCKET_JOURNAL_SEQ_BITS - 2))) -+ return; -+ -+ c->last_bucket_seq_cleanup = journal_seq; -+ -+ for_each_member_device(ca, c, i) { -+ down_read(&ca->bucket_lock); -+ buckets = bucket_array(ca); -+ -+ for_each_bucket(g, buckets) { -+ bucket_cmpxchg(g, m, ({ -+ if (!m.journal_seq_valid || -+ bucket_needs_journal_commit(m, last_seq_ondisk)) -+ break; -+ -+ m.journal_seq_valid = 0; -+ })); -+ } -+ up_read(&ca->bucket_lock); -+ } -+} -+ -+void bch2_fs_usage_initialize(struct bch_fs *c) -+{ 
-+ struct bch_fs_usage *usage; -+ unsigned i; -+ -+ percpu_down_write(&c->mark_lock); -+ usage = c->usage_base; -+ -+ bch2_fs_usage_acc_to_base(c, 0); -+ bch2_fs_usage_acc_to_base(c, 1); -+ -+ for (i = 0; i < BCH_REPLICAS_MAX; i++) -+ usage->reserved += usage->persistent_reserved[i]; -+ -+ for (i = 0; i < c->replicas.nr; i++) { -+ struct bch_replicas_entry *e = -+ cpu_replicas_entry(&c->replicas, i); -+ -+ switch (e->data_type) { -+ case BCH_DATA_BTREE: -+ usage->btree += usage->replicas[i]; -+ break; -+ case BCH_DATA_USER: -+ usage->data += usage->replicas[i]; -+ break; -+ case BCH_DATA_CACHED: -+ usage->cached += usage->replicas[i]; -+ break; -+ } -+ } -+ -+ percpu_up_write(&c->mark_lock); -+} -+ -+void bch2_fs_usage_scratch_put(struct bch_fs *c, struct bch_fs_usage *fs_usage) -+{ -+ if (fs_usage == c->usage_scratch) -+ mutex_unlock(&c->usage_scratch_lock); -+ else -+ kfree(fs_usage); -+} -+ -+struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *c) -+{ -+ struct bch_fs_usage *ret; -+ unsigned bytes = fs_usage_u64s(c) * sizeof(u64); -+ -+ ret = kzalloc(bytes, GFP_NOWAIT|__GFP_NOWARN); -+ if (ret) -+ return ret; -+ -+ if (mutex_trylock(&c->usage_scratch_lock)) -+ goto out_pool; -+ -+ ret = kzalloc(bytes, GFP_NOFS); -+ if (ret) -+ return ret; -+ -+ mutex_lock(&c->usage_scratch_lock); -+out_pool: -+ ret = c->usage_scratch; -+ memset(ret, 0, bytes); -+ return ret; -+} -+ -+struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct bch_dev_usage ret; -+ -+ memset(&ret, 0, sizeof(ret)); -+ acc_u64s_percpu((u64 *) &ret, -+ (u64 __percpu *) ca->usage[0], -+ sizeof(ret) / sizeof(u64)); -+ -+ return ret; -+} -+ -+static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, -+ unsigned journal_seq, -+ bool gc) -+{ -+ return this_cpu_ptr(gc -+ ? 
c->usage_gc -+ : c->usage[journal_seq & 1]); -+} -+ -+u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) -+{ -+ ssize_t offset = v - (u64 *) c->usage_base; -+ unsigned seq; -+ u64 ret; -+ -+ BUG_ON(offset < 0 || offset >= fs_usage_u64s(c)); -+ percpu_rwsem_assert_held(&c->mark_lock); -+ -+ do { -+ seq = read_seqcount_begin(&c->usage_lock); -+ ret = *v + -+ percpu_u64_get((u64 __percpu *) c->usage[0] + offset) + -+ percpu_u64_get((u64 __percpu *) c->usage[1] + offset); -+ } while (read_seqcount_retry(&c->usage_lock, seq)); -+ -+ return ret; -+} -+ -+struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c) -+{ -+ struct bch_fs_usage *ret; -+ unsigned seq, v, u64s = fs_usage_u64s(c); -+retry: -+ ret = kmalloc(u64s * sizeof(u64), GFP_NOFS); -+ if (unlikely(!ret)) -+ return NULL; -+ -+ percpu_down_read(&c->mark_lock); -+ -+ v = fs_usage_u64s(c); -+ if (unlikely(u64s != v)) { -+ u64s = v; -+ percpu_up_read(&c->mark_lock); -+ kfree(ret); -+ goto retry; -+ } -+ -+ do { -+ seq = read_seqcount_begin(&c->usage_lock); -+ memcpy(ret, c->usage_base, u64s * sizeof(u64)); -+ acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[0], u64s); -+ acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[1], u64s); -+ } while (read_seqcount_retry(&c->usage_lock, seq)); -+ -+ return ret; -+} -+ -+void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) -+{ -+ unsigned u64s = fs_usage_u64s(c); -+ -+ BUG_ON(idx >= 2); -+ -+ write_seqcount_begin(&c->usage_lock); -+ -+ acc_u64s_percpu((u64 *) c->usage_base, -+ (u64 __percpu *) c->usage[idx], u64s); -+ percpu_memset(c->usage[idx], 0, u64s * sizeof(u64)); -+ -+ write_seqcount_end(&c->usage_lock); -+} -+ -+void bch2_fs_usage_to_text(struct printbuf *out, -+ struct bch_fs *c, -+ struct bch_fs_usage *fs_usage) -+{ -+ unsigned i; -+ -+ pr_buf(out, "capacity:\t\t\t%llu\n", c->capacity); -+ -+ pr_buf(out, "hidden:\t\t\t\t%llu\n", -+ fs_usage->hidden); -+ pr_buf(out, "data:\t\t\t\t%llu\n", -+ fs_usage->data); -+ pr_buf(out, 
"cached:\t\t\t\t%llu\n", -+ fs_usage->cached); -+ pr_buf(out, "reserved:\t\t\t%llu\n", -+ fs_usage->reserved); -+ pr_buf(out, "nr_inodes:\t\t\t%llu\n", -+ fs_usage->nr_inodes); -+ pr_buf(out, "online reserved:\t\t%llu\n", -+ fs_usage->online_reserved); -+ -+ for (i = 0; -+ i < ARRAY_SIZE(fs_usage->persistent_reserved); -+ i++) { -+ pr_buf(out, "%u replicas:\n", i + 1); -+ pr_buf(out, "\treserved:\t\t%llu\n", -+ fs_usage->persistent_reserved[i]); -+ } -+ -+ for (i = 0; i < c->replicas.nr; i++) { -+ struct bch_replicas_entry *e = -+ cpu_replicas_entry(&c->replicas, i); -+ -+ pr_buf(out, "\t"); -+ bch2_replicas_entry_to_text(out, e); -+ pr_buf(out, ":\t%llu\n", fs_usage->replicas[i]); -+ } -+} -+ -+#define RESERVE_FACTOR 6 -+ -+static u64 reserve_factor(u64 r) -+{ -+ return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR); -+} -+ -+static u64 avail_factor(u64 r) -+{ -+ return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1); -+} -+ -+u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage *fs_usage) -+{ -+ return min(fs_usage->hidden + -+ fs_usage->btree + -+ fs_usage->data + -+ reserve_factor(fs_usage->reserved + -+ fs_usage->online_reserved), -+ c->capacity); -+} -+ -+static struct bch_fs_usage_short -+__bch2_fs_usage_read_short(struct bch_fs *c) -+{ -+ struct bch_fs_usage_short ret; -+ u64 data, reserved; -+ -+ ret.capacity = c->capacity - -+ bch2_fs_usage_read_one(c, &c->usage_base->hidden); -+ -+ data = bch2_fs_usage_read_one(c, &c->usage_base->data) + -+ bch2_fs_usage_read_one(c, &c->usage_base->btree); -+ reserved = bch2_fs_usage_read_one(c, &c->usage_base->reserved) + -+ bch2_fs_usage_read_one(c, &c->usage_base->online_reserved); -+ -+ ret.used = min(ret.capacity, data + reserve_factor(reserved)); -+ ret.free = ret.capacity - ret.used; -+ -+ ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->nr_inodes); -+ -+ return ret; -+} -+ -+struct bch_fs_usage_short -+bch2_fs_usage_read_short(struct bch_fs *c) -+{ -+ struct bch_fs_usage_short 
ret; -+ -+ percpu_down_read(&c->mark_lock); -+ ret = __bch2_fs_usage_read_short(c); -+ percpu_up_read(&c->mark_lock); -+ -+ return ret; -+} -+ -+static inline int is_unavailable_bucket(struct bucket_mark m) -+{ -+ return !is_available_bucket(m); -+} -+ -+static inline int is_fragmented_bucket(struct bucket_mark m, -+ struct bch_dev *ca) -+{ -+ if (!m.owned_by_allocator && -+ m.data_type == BCH_DATA_USER && -+ bucket_sectors_used(m)) -+ return max_t(int, 0, (int) ca->mi.bucket_size - -+ bucket_sectors_used(m)); -+ return 0; -+} -+ -+static inline int bucket_stripe_sectors(struct bucket_mark m) -+{ -+ return m.stripe ? m.dirty_sectors : 0; -+} -+ -+static inline enum bch_data_type bucket_type(struct bucket_mark m) -+{ -+ return m.cached_sectors && !m.dirty_sectors -+ ? BCH_DATA_CACHED -+ : m.data_type; -+} -+ -+static bool bucket_became_unavailable(struct bucket_mark old, -+ struct bucket_mark new) -+{ -+ return is_available_bucket(old) && -+ !is_available_bucket(new); -+} -+ -+int bch2_fs_usage_apply(struct bch_fs *c, -+ struct bch_fs_usage *fs_usage, -+ struct disk_reservation *disk_res, -+ unsigned journal_seq) -+{ -+ s64 added = fs_usage->data + fs_usage->reserved; -+ s64 should_not_have_added; -+ int ret = 0; -+ -+ percpu_rwsem_assert_held(&c->mark_lock); -+ -+ /* -+ * Not allowed to reduce sectors_available except by getting a -+ * reservation: -+ */ -+ should_not_have_added = added - (s64) (disk_res ? 
disk_res->sectors : 0); -+ if (WARN_ONCE(should_not_have_added > 0, -+ "disk usage increased by %lli without a reservation", -+ should_not_have_added)) { -+ atomic64_sub(should_not_have_added, &c->sectors_available); -+ added -= should_not_have_added; -+ ret = -1; -+ } -+ -+ if (added > 0) { -+ disk_res->sectors -= added; -+ fs_usage->online_reserved -= added; -+ } -+ -+ preempt_disable(); -+ acc_u64s((u64 *) fs_usage_ptr(c, journal_seq, false), -+ (u64 *) fs_usage, fs_usage_u64s(c)); -+ preempt_enable(); -+ -+ return ret; -+} -+ -+static inline void account_bucket(struct bch_fs_usage *fs_usage, -+ struct bch_dev_usage *dev_usage, -+ enum bch_data_type type, -+ int nr, s64 size) -+{ -+ if (type == BCH_DATA_SB || type == BCH_DATA_JOURNAL) -+ fs_usage->hidden += size; -+ -+ dev_usage->buckets[type] += nr; -+} -+ -+static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, -+ struct bch_fs_usage *fs_usage, -+ struct bucket_mark old, struct bucket_mark new, -+ bool gc) -+{ -+ struct bch_dev_usage *u; -+ -+ percpu_rwsem_assert_held(&c->mark_lock); -+ -+ preempt_disable(); -+ u = this_cpu_ptr(ca->usage[gc]); -+ -+ if (bucket_type(old)) -+ account_bucket(fs_usage, u, bucket_type(old), -+ -1, -ca->mi.bucket_size); -+ -+ if (bucket_type(new)) -+ account_bucket(fs_usage, u, bucket_type(new), -+ 1, ca->mi.bucket_size); -+ -+ u->buckets_alloc += -+ (int) new.owned_by_allocator - (int) old.owned_by_allocator; -+ u->buckets_unavailable += -+ is_unavailable_bucket(new) - is_unavailable_bucket(old); -+ -+ u->buckets_ec += (int) new.stripe - (int) old.stripe; -+ u->sectors_ec += bucket_stripe_sectors(new) - -+ bucket_stripe_sectors(old); -+ -+ u->sectors[old.data_type] -= old.dirty_sectors; -+ u->sectors[new.data_type] += new.dirty_sectors; -+ u->sectors[BCH_DATA_CACHED] += -+ (int) new.cached_sectors - (int) old.cached_sectors; -+ u->sectors_fragmented += -+ is_fragmented_bucket(new, ca) - is_fragmented_bucket(old, ca); -+ preempt_enable(); -+ -+ if 
(!is_available_bucket(old) && is_available_bucket(new)) -+ bch2_wake_allocator(ca); -+} -+ -+void bch2_dev_usage_from_buckets(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ struct bucket_mark old = { .v.counter = 0 }; -+ struct bucket_array *buckets; -+ struct bucket *g; -+ unsigned i; -+ int cpu; -+ -+ c->usage_base->hidden = 0; -+ -+ for_each_member_device(ca, c, i) { -+ for_each_possible_cpu(cpu) -+ memset(per_cpu_ptr(ca->usage[0], cpu), 0, -+ sizeof(*ca->usage[0])); -+ -+ buckets = bucket_array(ca); -+ -+ for_each_bucket(g, buckets) -+ bch2_dev_usage_update(c, ca, c->usage_base, -+ old, g->mark, false); -+ } -+} -+ -+static inline int update_replicas(struct bch_fs *c, -+ struct bch_fs_usage *fs_usage, -+ struct bch_replicas_entry *r, -+ s64 sectors) -+{ -+ int idx = bch2_replicas_entry_idx(c, r); -+ -+ if (idx < 0) -+ return -1; -+ -+ if (!fs_usage) -+ return 0; -+ -+ switch (r->data_type) { -+ case BCH_DATA_BTREE: -+ fs_usage->btree += sectors; -+ break; -+ case BCH_DATA_USER: -+ fs_usage->data += sectors; -+ break; -+ case BCH_DATA_CACHED: -+ fs_usage->cached += sectors; -+ break; -+ } -+ fs_usage->replicas[idx] += sectors; -+ return 0; -+} -+ -+static inline void update_cached_sectors(struct bch_fs *c, -+ struct bch_fs_usage *fs_usage, -+ unsigned dev, s64 sectors) -+{ -+ struct bch_replicas_padded r; -+ -+ bch2_replicas_entry_cached(&r.e, dev); -+ -+ update_replicas(c, fs_usage, &r.e, sectors); -+} -+ -+static struct replicas_delta_list * -+replicas_deltas_realloc(struct btree_trans *trans, unsigned more) -+{ -+ struct replicas_delta_list *d = trans->fs_usage_deltas; -+ unsigned new_size = d ? 
(d->size + more) * 2 : 128; -+ -+ if (!d || d->used + more > d->size) { -+ d = krealloc(d, sizeof(*d) + new_size, GFP_NOIO|__GFP_ZERO); -+ BUG_ON(!d); -+ -+ d->size = new_size; -+ trans->fs_usage_deltas = d; -+ } -+ return d; -+} -+ -+static inline void update_replicas_list(struct btree_trans *trans, -+ struct bch_replicas_entry *r, -+ s64 sectors) -+{ -+ struct replicas_delta_list *d; -+ struct replicas_delta *n; -+ unsigned b; -+ -+ if (!sectors) -+ return; -+ -+ b = replicas_entry_bytes(r) + 8; -+ d = replicas_deltas_realloc(trans, b); -+ -+ n = (void *) d->d + d->used; -+ n->delta = sectors; -+ memcpy(&n->r, r, replicas_entry_bytes(r)); -+ d->used += b; -+} -+ -+static inline void update_cached_sectors_list(struct btree_trans *trans, -+ unsigned dev, s64 sectors) -+{ -+ struct bch_replicas_padded r; -+ -+ bch2_replicas_entry_cached(&r.e, dev); -+ -+ update_replicas_list(trans, &r.e, sectors); -+} -+ -+static inline struct replicas_delta * -+replicas_delta_next(struct replicas_delta *d) -+{ -+ return (void *) d + replicas_entry_bytes(&d->r) + 8; -+} -+ -+int bch2_replicas_delta_list_apply(struct bch_fs *c, -+ struct bch_fs_usage *fs_usage, -+ struct replicas_delta_list *r) -+{ -+ struct replicas_delta *d = r->d; -+ struct replicas_delta *top = (void *) r->d + r->used; -+ unsigned i; -+ -+ for (d = r->d; d != top; d = replicas_delta_next(d)) -+ if (update_replicas(c, fs_usage, &d->r, d->delta)) { -+ top = d; -+ goto unwind; -+ } -+ -+ if (!fs_usage) -+ return 0; -+ -+ fs_usage->nr_inodes += r->nr_inodes; -+ -+ for (i = 0; i < BCH_REPLICAS_MAX; i++) { -+ fs_usage->reserved += r->persistent_reserved[i]; -+ fs_usage->persistent_reserved[i] += r->persistent_reserved[i]; -+ } -+ -+ return 0; -+unwind: -+ for (d = r->d; d != top; d = replicas_delta_next(d)) -+ update_replicas(c, fs_usage, &d->r, -d->delta); -+ return -1; -+} -+ -+#define do_mark_fn(fn, c, pos, flags, ...) 
\ -+({ \ -+ int gc, ret = 0; \ -+ \ -+ percpu_rwsem_assert_held(&c->mark_lock); \ -+ \ -+ for (gc = 0; gc < 2 && !ret; gc++) \ -+ if (!gc == !(flags & BTREE_TRIGGER_GC) || \ -+ (gc && gc_visited(c, pos))) \ -+ ret = fn(c, __VA_ARGS__, gc); \ -+ ret; \ -+}) -+ -+static int __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, -+ size_t b, struct bucket_mark *ret, -+ bool gc) -+{ -+ struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc); -+ struct bucket *g = __bucket(ca, b, gc); -+ struct bucket_mark old, new; -+ -+ old = bucket_cmpxchg(g, new, ({ -+ BUG_ON(!is_available_bucket(new)); -+ -+ new.owned_by_allocator = true; -+ new.data_type = 0; -+ new.cached_sectors = 0; -+ new.dirty_sectors = 0; -+ new.gen++; -+ })); -+ -+ bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); -+ -+ if (old.cached_sectors) -+ update_cached_sectors(c, fs_usage, ca->dev_idx, -+ -((s64) old.cached_sectors)); -+ -+ if (!gc) -+ *ret = old; -+ return 0; -+} -+ -+void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, -+ size_t b, struct bucket_mark *old) -+{ -+ do_mark_fn(__bch2_invalidate_bucket, c, gc_phase(GC_PHASE_START), 0, -+ ca, b, old); -+ -+ if (!old->owned_by_allocator && old->cached_sectors) -+ trace_invalidate(ca, bucket_to_sector(ca, b), -+ old->cached_sectors); -+} -+ -+static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, -+ size_t b, bool owned_by_allocator, -+ bool gc) -+{ -+ struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc); -+ struct bucket *g = __bucket(ca, b, gc); -+ struct bucket_mark old, new; -+ -+ old = bucket_cmpxchg(g, new, ({ -+ new.owned_by_allocator = owned_by_allocator; -+ })); -+ -+ bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); -+ -+ BUG_ON(!gc && -+ !owned_by_allocator && !old.owned_by_allocator); -+ -+ return 0; -+} -+ -+void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, -+ size_t b, bool owned_by_allocator, -+ struct gc_pos pos, unsigned flags) -+{ -+ preempt_disable(); -+ -+ 
do_mark_fn(__bch2_mark_alloc_bucket, c, pos, flags, -+ ca, b, owned_by_allocator); -+ -+ preempt_enable(); -+} -+ -+static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, -+ struct bch_fs_usage *fs_usage, -+ u64 journal_seq, unsigned flags) -+{ -+ bool gc = flags & BTREE_TRIGGER_GC; -+ struct bkey_alloc_unpacked u; -+ struct bch_dev *ca; -+ struct bucket *g; -+ struct bucket_mark old, m; -+ -+ /* -+ * alloc btree is read in by bch2_alloc_read, not gc: -+ */ -+ if ((flags & BTREE_TRIGGER_GC) && -+ !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) -+ return 0; -+ -+ ca = bch_dev_bkey_exists(c, k.k->p.inode); -+ -+ if (k.k->p.offset >= ca->mi.nbuckets) -+ return 0; -+ -+ g = __bucket(ca, k.k->p.offset, gc); -+ u = bch2_alloc_unpack(k); -+ -+ old = bucket_cmpxchg(g, m, ({ -+ m.gen = u.gen; -+ m.data_type = u.data_type; -+ m.dirty_sectors = u.dirty_sectors; -+ m.cached_sectors = u.cached_sectors; -+ -+ if (journal_seq) { -+ m.journal_seq_valid = 1; -+ m.journal_seq = journal_seq; -+ } -+ })); -+ -+ if (!(flags & BTREE_TRIGGER_ALLOC_READ)) -+ bch2_dev_usage_update(c, ca, fs_usage, old, m, gc); -+ -+ g->io_time[READ] = u.read_time; -+ g->io_time[WRITE] = u.write_time; -+ g->oldest_gen = u.oldest_gen; -+ g->gen_valid = 1; -+ -+ /* -+ * need to know if we're getting called from the invalidate path or -+ * not: -+ */ -+ -+ if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && -+ old.cached_sectors) { -+ update_cached_sectors(c, fs_usage, ca->dev_idx, -+ -old.cached_sectors); -+ trace_invalidate(ca, bucket_to_sector(ca, k.k->p.offset), -+ old.cached_sectors); -+ } -+ -+ return 0; -+} -+ -+#define checked_add(a, b) \ -+({ \ -+ unsigned _res = (unsigned) (a) + (b); \ -+ bool overflow = _res > U16_MAX; \ -+ if (overflow) \ -+ _res = U16_MAX; \ -+ (a) = _res; \ -+ overflow; \ -+}) -+ -+static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, -+ size_t b, enum bch_data_type data_type, -+ unsigned sectors, bool gc) -+{ -+ struct bucket *g = __bucket(ca, b, gc); -+ 
struct bucket_mark old, new; -+ bool overflow; -+ -+ BUG_ON(data_type != BCH_DATA_SB && -+ data_type != BCH_DATA_JOURNAL); -+ -+ old = bucket_cmpxchg(g, new, ({ -+ new.data_type = data_type; -+ overflow = checked_add(new.dirty_sectors, sectors); -+ })); -+ -+ bch2_fs_inconsistent_on(old.data_type && -+ old.data_type != data_type, c, -+ "different types of data in same bucket: %s, %s", -+ bch2_data_types[old.data_type], -+ bch2_data_types[data_type]); -+ -+ bch2_fs_inconsistent_on(overflow, c, -+ "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > U16_MAX", -+ ca->dev_idx, b, new.gen, -+ bch2_data_types[old.data_type ?: data_type], -+ old.dirty_sectors, sectors); -+ -+ if (c) -+ bch2_dev_usage_update(c, ca, fs_usage_ptr(c, 0, gc), -+ old, new, gc); -+ -+ return 0; -+} -+ -+void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, -+ size_t b, enum bch_data_type type, -+ unsigned sectors, struct gc_pos pos, -+ unsigned flags) -+{ -+ BUG_ON(type != BCH_DATA_SB && -+ type != BCH_DATA_JOURNAL); -+ -+ preempt_disable(); -+ -+ if (likely(c)) { -+ do_mark_fn(__bch2_mark_metadata_bucket, c, pos, flags, -+ ca, b, type, sectors); -+ } else { -+ __bch2_mark_metadata_bucket(c, ca, b, type, sectors, 0); -+ } -+ -+ preempt_enable(); -+} -+ -+static s64 disk_sectors_scaled(unsigned n, unsigned d, unsigned sectors) -+{ -+ return DIV_ROUND_UP(sectors * n, d); -+} -+ -+static s64 __ptr_disk_sectors_delta(unsigned old_size, -+ unsigned offset, s64 delta, -+ unsigned flags, -+ unsigned n, unsigned d) -+{ -+ BUG_ON(!n || !d); -+ -+ if (flags & BTREE_TRIGGER_OVERWRITE_SPLIT) { -+ BUG_ON(offset + -delta > old_size); -+ -+ return -disk_sectors_scaled(n, d, old_size) + -+ disk_sectors_scaled(n, d, offset) + -+ disk_sectors_scaled(n, d, old_size - offset + delta); -+ } else if (flags & BTREE_TRIGGER_OVERWRITE) { -+ BUG_ON(offset + -delta > old_size); -+ -+ return -disk_sectors_scaled(n, d, old_size) + -+ disk_sectors_scaled(n, d, old_size + delta); -+ } else { -+ 
return disk_sectors_scaled(n, d, delta); -+ } -+} -+ -+static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p, -+ unsigned offset, s64 delta, -+ unsigned flags) -+{ -+ return __ptr_disk_sectors_delta(p.crc.live_size, -+ offset, delta, flags, -+ p.crc.compressed_size, -+ p.crc.uncompressed_size); -+} -+ -+static void bucket_set_stripe(struct bch_fs *c, -+ const struct bch_stripe *v, -+ struct bch_fs_usage *fs_usage, -+ u64 journal_seq, -+ unsigned flags) -+{ -+ bool enabled = !(flags & BTREE_TRIGGER_OVERWRITE); -+ bool gc = flags & BTREE_TRIGGER_GC; -+ unsigned i; -+ -+ for (i = 0; i < v->nr_blocks; i++) { -+ const struct bch_extent_ptr *ptr = v->ptrs + i; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ struct bucket *g = PTR_BUCKET(ca, ptr, gc); -+ struct bucket_mark new, old; -+ -+ old = bucket_cmpxchg(g, new, ({ -+ new.stripe = enabled; -+ if (journal_seq) { -+ new.journal_seq_valid = 1; -+ new.journal_seq = journal_seq; -+ } -+ })); -+ -+ bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); -+ -+ /* -+ * XXX write repair code for these, flag stripe as possibly bad -+ */ -+ if (old.gen != ptr->gen) -+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, -+ "stripe with stale pointer"); -+#if 0 -+ /* -+ * We'd like to check for these, but these checks don't work -+ * yet: -+ */ -+ if (old.stripe && enabled) -+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, -+ "multiple stripes using same bucket"); -+ -+ if (!old.stripe && !enabled) -+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, -+ "deleting stripe but bucket not marked as stripe bucket"); -+#endif -+ } -+} -+ -+static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k, -+ struct extent_ptr_decoded p, -+ s64 sectors, enum bch_data_type ptr_data_type, -+ u8 bucket_gen, u8 *bucket_data_type, -+ u16 *dirty_sectors, u16 *cached_sectors) -+{ -+ u16 *dst_sectors = !p.ptr.cached -+ ? 
dirty_sectors -+ : cached_sectors; -+ u16 orig_sectors = *dst_sectors; -+ char buf[200]; -+ -+ if (gen_after(p.ptr.gen, bucket_gen)) { -+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, -+ "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" -+ "while marking %s", -+ p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), -+ bucket_gen, -+ bch2_data_types[*bucket_data_type ?: ptr_data_type], -+ p.ptr.gen, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); -+ return -EIO; -+ } -+ -+ if (gen_cmp(bucket_gen, p.ptr.gen) >= 96U) { -+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, -+ "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" -+ "while marking %s", -+ p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), -+ bucket_gen, -+ bch2_data_types[*bucket_data_type ?: ptr_data_type], -+ p.ptr.gen, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); -+ return -EIO; -+ } -+ -+ if (bucket_gen != p.ptr.gen && !p.ptr.cached) { -+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, -+ "bucket %u:%zu gen %u data type %s: stale dirty ptr (gen %u)\n" -+ "while marking %s", -+ p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), -+ bucket_gen, -+ bch2_data_types[*bucket_data_type ?: ptr_data_type], -+ p.ptr.gen, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); -+ return -EIO; -+ } -+ -+ if (bucket_gen != p.ptr.gen) -+ return 1; -+ -+ if (*bucket_data_type && *bucket_data_type != ptr_data_type) { -+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, -+ "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" -+ "while marking %s", -+ p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), -+ bucket_gen, -+ bch2_data_types[*bucket_data_type], -+ bch2_data_types[ptr_data_type], -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); -+ return -EIO; -+ } -+ -+ if (checked_add(*dst_sectors, sectors)) { -+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, -+ "bucket %u:%zu gen %u data type %s 
sector count overflow: %u + %lli > U16_MAX\n" -+ "while marking %s", -+ p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), -+ bucket_gen, -+ bch2_data_types[*bucket_data_type ?: ptr_data_type], -+ orig_sectors, sectors, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); -+ return -EIO; -+ } -+ -+ *bucket_data_type = *dirty_sectors || *cached_sectors -+ ? ptr_data_type : 0; -+ return 0; -+} -+ -+static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k, -+ struct extent_ptr_decoded p, -+ s64 sectors, enum bch_data_type data_type, -+ struct bch_fs_usage *fs_usage, -+ u64 journal_seq, unsigned flags) -+{ -+ bool gc = flags & BTREE_TRIGGER_GC; -+ struct bucket_mark old, new; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); -+ struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc); -+ u8 bucket_data_type; -+ u64 v; -+ int ret; -+ -+ v = atomic64_read(&g->_mark.v); -+ do { -+ new.v.counter = old.v.counter = v; -+ bucket_data_type = new.data_type; -+ -+ ret = __mark_pointer(c, k, p, sectors, data_type, new.gen, -+ &bucket_data_type, -+ &new.dirty_sectors, -+ &new.cached_sectors); -+ if (ret) -+ return ret; -+ -+ new.data_type = bucket_data_type; -+ -+ if (journal_seq) { -+ new.journal_seq_valid = 1; -+ new.journal_seq = journal_seq; -+ } -+ -+ if (flags & BTREE_TRIGGER_NOATOMIC) { -+ g->_mark = new; -+ break; -+ } -+ } while ((v = atomic64_cmpxchg(&g->_mark.v, -+ old.v.counter, -+ new.v.counter)) != old.v.counter); -+ -+ bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); -+ -+ BUG_ON(!gc && bucket_became_unavailable(old, new)); -+ -+ return 0; -+} -+ -+static int bch2_mark_stripe_ptr(struct bch_fs *c, -+ struct bch_extent_stripe_ptr p, -+ enum bch_data_type data_type, -+ struct bch_fs_usage *fs_usage, -+ s64 sectors, unsigned flags, -+ struct bch_replicas_padded *r, -+ unsigned *nr_data, -+ unsigned *nr_parity) -+{ -+ bool gc = flags & BTREE_TRIGGER_GC; -+ struct stripe *m; -+ unsigned old, new; -+ int blocks_nonempty_delta; -+ -+ m = 
genradix_ptr(&c->stripes[gc], p.idx); -+ -+ spin_lock(&c->ec_stripes_heap_lock); -+ -+ if (!m || !m->alive) { -+ spin_unlock(&c->ec_stripes_heap_lock); -+ bch_err_ratelimited(c, "pointer to nonexistent stripe %llu", -+ (u64) p.idx); -+ return -EIO; -+ } -+ -+ BUG_ON(m->r.e.data_type != data_type); -+ -+ *nr_data = m->nr_blocks - m->nr_redundant; -+ *nr_parity = m->nr_redundant; -+ *r = m->r; -+ -+ old = m->block_sectors[p.block]; -+ m->block_sectors[p.block] += sectors; -+ new = m->block_sectors[p.block]; -+ -+ blocks_nonempty_delta = (int) !!new - (int) !!old; -+ if (blocks_nonempty_delta) { -+ m->blocks_nonempty += blocks_nonempty_delta; -+ -+ if (!gc) -+ bch2_stripes_heap_update(c, m, p.idx); -+ } -+ -+ m->dirty = true; -+ -+ spin_unlock(&c->ec_stripes_heap_lock); -+ -+ return 0; -+} -+ -+static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, -+ unsigned offset, s64 sectors, -+ enum bch_data_type data_type, -+ struct bch_fs_usage *fs_usage, -+ unsigned journal_seq, unsigned flags) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ struct bch_replicas_padded r; -+ s64 dirty_sectors = 0; -+ bool stale; -+ int ret; -+ -+ r.e.data_type = data_type; -+ r.e.nr_devs = 0; -+ r.e.nr_required = 1; -+ -+ BUG_ON(!sectors); -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ s64 disk_sectors = data_type == BCH_DATA_BTREE -+ ? 
sectors -+ : ptr_disk_sectors_delta(p, offset, sectors, flags); -+ -+ ret = bch2_mark_pointer(c, k, p, disk_sectors, data_type, -+ fs_usage, journal_seq, flags); -+ if (ret < 0) -+ return ret; -+ -+ stale = ret > 0; -+ -+ if (p.ptr.cached) { -+ if (!stale) -+ update_cached_sectors(c, fs_usage, p.ptr.dev, -+ disk_sectors); -+ } else if (!p.has_ec) { -+ dirty_sectors += disk_sectors; -+ r.e.devs[r.e.nr_devs++] = p.ptr.dev; -+ } else { -+ struct bch_replicas_padded ec_r; -+ unsigned nr_data, nr_parity; -+ s64 parity_sectors; -+ -+ ret = bch2_mark_stripe_ptr(c, p.ec, data_type, -+ fs_usage, disk_sectors, flags, -+ &ec_r, &nr_data, &nr_parity); -+ if (ret) -+ return ret; -+ -+ parity_sectors = -+ __ptr_disk_sectors_delta(p.crc.live_size, -+ offset, sectors, flags, -+ p.crc.compressed_size * nr_parity, -+ p.crc.uncompressed_size * nr_data); -+ -+ update_replicas(c, fs_usage, &ec_r.e, -+ disk_sectors + parity_sectors); -+ -+ /* -+ * There may be other dirty pointers in this extent, but -+ * if so they're not required for mounting if we have an -+ * erasure coded pointer in this extent: -+ */ -+ r.e.nr_required = 0; -+ } -+ } -+ -+ if (r.e.nr_devs) -+ update_replicas(c, fs_usage, &r.e, dirty_sectors); -+ -+ return 0; -+} -+ -+static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, -+ struct bch_fs_usage *fs_usage, -+ u64 journal_seq, unsigned flags) -+{ -+ bool gc = flags & BTREE_TRIGGER_GC; -+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); -+ size_t idx = s.k->p.offset; -+ struct stripe *m = genradix_ptr(&c->stripes[gc], idx); -+ unsigned i; -+ -+ spin_lock(&c->ec_stripes_heap_lock); -+ -+ if (!m || ((flags & BTREE_TRIGGER_OVERWRITE) && !m->alive)) { -+ spin_unlock(&c->ec_stripes_heap_lock); -+ bch_err_ratelimited(c, "error marking nonexistent stripe %zu", -+ idx); -+ return -1; -+ } -+ -+ if (!(flags & BTREE_TRIGGER_OVERWRITE)) { -+ m->sectors = le16_to_cpu(s.v->sectors); -+ m->algorithm = s.v->algorithm; -+ m->nr_blocks = s.v->nr_blocks; -+ m->nr_redundant = 
s.v->nr_redundant; -+ -+ bch2_bkey_to_replicas(&m->r.e, k); -+ -+ /* -+ * XXX: account for stripes somehow here -+ */ -+#if 0 -+ update_replicas(c, fs_usage, &m->r.e, stripe_sectors); -+#endif -+ -+ /* gc recalculates these fields: */ -+ if (!(flags & BTREE_TRIGGER_GC)) { -+ for (i = 0; i < s.v->nr_blocks; i++) { -+ m->block_sectors[i] = -+ stripe_blockcount_get(s.v, i); -+ m->blocks_nonempty += !!m->block_sectors[i]; -+ } -+ } -+ -+ if (!gc) -+ bch2_stripes_heap_update(c, m, idx); -+ m->alive = true; -+ } else { -+ if (!gc) -+ bch2_stripes_heap_del(c, m, idx); -+ memset(m, 0, sizeof(*m)); -+ } -+ -+ spin_unlock(&c->ec_stripes_heap_lock); -+ -+ bucket_set_stripe(c, s.v, fs_usage, 0, flags); -+ return 0; -+} -+ -+static int bch2_mark_key_locked(struct bch_fs *c, -+ struct bkey_s_c k, -+ unsigned offset, s64 sectors, -+ struct bch_fs_usage *fs_usage, -+ u64 journal_seq, unsigned flags) -+{ -+ int ret = 0; -+ -+ preempt_disable(); -+ -+ if (!fs_usage || (flags & BTREE_TRIGGER_GC)) -+ fs_usage = fs_usage_ptr(c, journal_seq, -+ flags & BTREE_TRIGGER_GC); -+ -+ switch (k.k->type) { -+ case KEY_TYPE_alloc: -+ ret = bch2_mark_alloc(c, k, fs_usage, journal_seq, flags); -+ break; -+ case KEY_TYPE_btree_ptr: -+ case KEY_TYPE_btree_ptr_v2: -+ sectors = !(flags & BTREE_TRIGGER_OVERWRITE) -+ ? 
c->opts.btree_node_size -+ : -c->opts.btree_node_size; -+ -+ ret = bch2_mark_extent(c, k, offset, sectors, BCH_DATA_BTREE, -+ fs_usage, journal_seq, flags); -+ break; -+ case KEY_TYPE_extent: -+ case KEY_TYPE_reflink_v: -+ ret = bch2_mark_extent(c, k, offset, sectors, BCH_DATA_USER, -+ fs_usage, journal_seq, flags); -+ break; -+ case KEY_TYPE_stripe: -+ ret = bch2_mark_stripe(c, k, fs_usage, journal_seq, flags); -+ break; -+ case KEY_TYPE_inode: -+ if (!(flags & BTREE_TRIGGER_OVERWRITE)) -+ fs_usage->nr_inodes++; -+ else -+ fs_usage->nr_inodes--; -+ break; -+ case KEY_TYPE_reservation: { -+ unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; -+ -+ sectors *= replicas; -+ replicas = clamp_t(unsigned, replicas, 1, -+ ARRAY_SIZE(fs_usage->persistent_reserved)); -+ -+ fs_usage->reserved += sectors; -+ fs_usage->persistent_reserved[replicas - 1] += sectors; -+ break; -+ } -+ } -+ -+ preempt_enable(); -+ -+ return ret; -+} -+ -+int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, -+ unsigned offset, s64 sectors, -+ struct bch_fs_usage *fs_usage, -+ u64 journal_seq, unsigned flags) -+{ -+ int ret; -+ -+ percpu_down_read(&c->mark_lock); -+ ret = bch2_mark_key_locked(c, k, offset, sectors, -+ fs_usage, journal_seq, flags); -+ percpu_up_read(&c->mark_lock); -+ -+ return ret; -+} -+ -+inline int bch2_mark_overwrite(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c old, -+ struct bkey_i *new, -+ struct bch_fs_usage *fs_usage, -+ unsigned flags, -+ bool is_extents) -+{ -+ struct bch_fs *c = trans->c; -+ unsigned offset = 0; -+ s64 sectors = -((s64) old.k->size); -+ -+ flags |= BTREE_TRIGGER_OVERWRITE; -+ -+ if (is_extents -+ ? 
bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0 -+ : bkey_cmp(new->k.p, old.k->p)) -+ return 0; -+ -+ if (is_extents) { -+ switch (bch2_extent_overlap(&new->k, old.k)) { -+ case BCH_EXTENT_OVERLAP_ALL: -+ offset = 0; -+ sectors = -((s64) old.k->size); -+ break; -+ case BCH_EXTENT_OVERLAP_BACK: -+ offset = bkey_start_offset(&new->k) - -+ bkey_start_offset(old.k); -+ sectors = bkey_start_offset(&new->k) - -+ old.k->p.offset; -+ break; -+ case BCH_EXTENT_OVERLAP_FRONT: -+ offset = 0; -+ sectors = bkey_start_offset(old.k) - -+ new->k.p.offset; -+ break; -+ case BCH_EXTENT_OVERLAP_MIDDLE: -+ offset = bkey_start_offset(&new->k) - -+ bkey_start_offset(old.k); -+ sectors = -((s64) new->k.size); -+ flags |= BTREE_TRIGGER_OVERWRITE_SPLIT; -+ break; -+ } -+ -+ BUG_ON(sectors >= 0); -+ } -+ -+ return bch2_mark_key_locked(c, old, offset, sectors, fs_usage, -+ trans->journal_res.seq, flags) ?: 1; -+} -+ -+int bch2_mark_update(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_i *insert, -+ struct bch_fs_usage *fs_usage, -+ unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree *b = iter_l(iter)->b; -+ struct btree_node_iter node_iter = iter_l(iter)->iter; -+ struct bkey_packed *_k; -+ int ret = 0; -+ -+ if (unlikely(flags & BTREE_TRIGGER_NORUN)) -+ return 0; -+ -+ if (!btree_node_type_needs_gc(iter->btree_id)) -+ return 0; -+ -+ bch2_mark_key_locked(c, bkey_i_to_s_c(insert), -+ 0, insert->k.size, -+ fs_usage, trans->journal_res.seq, -+ BTREE_TRIGGER_INSERT|flags); -+ -+ if (unlikely(flags & BTREE_TRIGGER_NOOVERWRITES)) -+ return 0; -+ -+ /* -+ * For non extents, we only mark the new key, not the key being -+ * overwritten - unless we're actually deleting: -+ */ -+ if ((iter->btree_id == BTREE_ID_ALLOC || -+ iter->btree_id == BTREE_ID_EC) && -+ !bkey_deleted(&insert->k)) -+ return 0; -+ -+ while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { -+ struct bkey unpacked; -+ struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); -+ -+ ret = 
bch2_mark_overwrite(trans, iter, k, insert, -+ fs_usage, flags, -+ btree_node_type_is_extents(iter->btree_id)); -+ if (ret <= 0) -+ break; -+ -+ bch2_btree_node_iter_advance(&node_iter, b); -+ } -+ -+ return ret; -+} -+ -+void bch2_trans_fs_usage_apply(struct btree_trans *trans, -+ struct bch_fs_usage *fs_usage) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_insert_entry *i; -+ static int warned_disk_usage = 0; -+ u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; -+ char buf[200]; -+ -+ if (!bch2_fs_usage_apply(c, fs_usage, trans->disk_res, -+ trans->journal_res.seq) || -+ warned_disk_usage || -+ xchg(&warned_disk_usage, 1)) -+ return; -+ -+ bch_err(c, "disk usage increased more than %llu sectors reserved", -+ disk_res_sectors); -+ -+ trans_for_each_update(trans, i) { -+ pr_err("while inserting"); -+ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); -+ pr_err("%s", buf); -+ pr_err("overlapping with"); -+ -+ if (btree_iter_type(i->iter) != BTREE_ITER_CACHED) { -+ struct btree *b = iter_l(i->iter)->b; -+ struct btree_node_iter node_iter = iter_l(i->iter)->iter; -+ struct bkey_packed *_k; -+ -+ while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { -+ struct bkey unpacked; -+ struct bkey_s_c k; -+ -+ pr_info("_k %px format %u", _k, _k->format); -+ k = bkey_disassemble(b, _k, &unpacked); -+ -+ if (btree_node_is_extents(b) -+ ? 
bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) <= 0 -+ : bkey_cmp(i->k->k.p, k.k->p)) -+ break; -+ -+ bch2_bkey_val_to_text(&PBUF(buf), c, k); -+ pr_err("%s", buf); -+ -+ bch2_btree_node_iter_advance(&node_iter, b); -+ } -+ } else { -+ struct bkey_cached *ck = (void *) i->iter->l[0].b; -+ -+ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(ck->k)); -+ pr_err("%s", buf); -+ } -+ } -+} -+ -+/* trans_mark: */ -+ -+static struct btree_iter *trans_get_update(struct btree_trans *trans, -+ enum btree_id btree_id, struct bpos pos, -+ struct bkey_s_c *k) -+{ -+ struct btree_insert_entry *i; -+ -+ trans_for_each_update(trans, i) -+ if (i->iter->btree_id == btree_id && -+ (btree_node_type_is_extents(btree_id) -+ ? bkey_cmp(pos, bkey_start_pos(&i->k->k)) >= 0 && -+ bkey_cmp(pos, i->k->k.p) < 0 -+ : !bkey_cmp(pos, i->iter->pos))) { -+ *k = bkey_i_to_s_c(i->k); -+ return i->iter; -+ } -+ -+ return NULL; -+} -+ -+static int trans_get_key(struct btree_trans *trans, -+ enum btree_id btree_id, struct bpos pos, -+ struct btree_iter **iter, -+ struct bkey_s_c *k) -+{ -+ unsigned flags = btree_id != BTREE_ID_ALLOC -+ ? 
BTREE_ITER_SLOTS -+ : BTREE_ITER_CACHED; -+ int ret; -+ -+ *iter = trans_get_update(trans, btree_id, pos, k); -+ if (*iter) -+ return 1; -+ -+ *iter = bch2_trans_get_iter(trans, btree_id, pos, -+ flags|BTREE_ITER_INTENT); -+ if (IS_ERR(*iter)) -+ return PTR_ERR(*iter); -+ -+ *k = __bch2_btree_iter_peek(*iter, flags); -+ ret = bkey_err(*k); -+ if (ret) -+ bch2_trans_iter_put(trans, *iter); -+ return ret; -+} -+ -+static int bch2_trans_mark_pointer(struct btree_trans *trans, -+ struct bkey_s_c k, struct extent_ptr_decoded p, -+ s64 sectors, enum bch_data_type data_type) -+{ -+ struct bch_fs *c = trans->c; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); -+ struct bpos pos = POS(p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr)); -+ struct btree_iter *iter; -+ struct bkey_s_c k_a; -+ struct bkey_alloc_unpacked u; -+ struct bkey_i_alloc *a; -+ struct bucket *g; -+ int ret; -+ -+ iter = trans_get_update(trans, BTREE_ID_ALLOC, pos, &k_a); -+ if (iter) { -+ u = bch2_alloc_unpack(k_a); -+ } else { -+ iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, pos, -+ BTREE_ITER_CACHED| -+ BTREE_ITER_CACHED_NOFILL| -+ BTREE_ITER_INTENT); -+ if (IS_ERR(iter)) -+ return PTR_ERR(iter); -+ -+ ret = bch2_btree_iter_traverse(iter); -+ if (ret) -+ goto out; -+ -+ percpu_down_read(&c->mark_lock); -+ g = bucket(ca, pos.offset); -+ u = alloc_mem_to_key(g, READ_ONCE(g->mark)); -+ percpu_up_read(&c->mark_lock); -+ } -+ -+ ret = __mark_pointer(c, k, p, sectors, data_type, u.gen, &u.data_type, -+ &u.dirty_sectors, &u.cached_sectors); -+ if (ret) -+ goto out; -+ -+ a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); -+ ret = PTR_ERR_OR_ZERO(a); -+ if (ret) -+ goto out; -+ -+ bkey_alloc_init(&a->k_i); -+ a->k.p = pos; -+ bch2_alloc_pack(a, u); -+ bch2_trans_update(trans, iter, &a->k_i, 0); -+out: -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, -+ struct bch_extent_stripe_ptr p, -+ s64 sectors, enum bch_data_type 
data_type, -+ struct bch_replicas_padded *r, -+ unsigned *nr_data, -+ unsigned *nr_parity) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct bkey_i_stripe *s; -+ int ret = 0; -+ -+ ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.idx), &iter, &k); -+ if (ret < 0) -+ return ret; -+ -+ if (k.k->type != KEY_TYPE_stripe) { -+ bch2_fs_inconsistent(c, -+ "pointer to nonexistent stripe %llu", -+ (u64) p.idx); -+ ret = -EIO; -+ goto out; -+ } -+ -+ s = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); -+ ret = PTR_ERR_OR_ZERO(s); -+ if (ret) -+ goto out; -+ -+ bkey_reassemble(&s->k_i, k); -+ -+ stripe_blockcount_set(&s->v, p.block, -+ stripe_blockcount_get(&s->v, p.block) + -+ sectors); -+ -+ *nr_data = s->v.nr_blocks - s->v.nr_redundant; -+ *nr_parity = s->v.nr_redundant; -+ bch2_bkey_to_replicas(&r->e, bkey_i_to_s_c(&s->k_i)); -+ bch2_trans_update(trans, iter, &s->k_i, 0); -+out: -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+static int bch2_trans_mark_extent(struct btree_trans *trans, -+ struct bkey_s_c k, unsigned offset, -+ s64 sectors, unsigned flags, -+ enum bch_data_type data_type) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ struct bch_replicas_padded r; -+ s64 dirty_sectors = 0; -+ bool stale; -+ int ret; -+ -+ r.e.data_type = data_type; -+ r.e.nr_devs = 0; -+ r.e.nr_required = 1; -+ -+ BUG_ON(!sectors); -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ s64 disk_sectors = data_type == BCH_DATA_BTREE -+ ? 
sectors -+ : ptr_disk_sectors_delta(p, offset, sectors, flags); -+ -+ ret = bch2_trans_mark_pointer(trans, k, p, disk_sectors, -+ data_type); -+ if (ret < 0) -+ return ret; -+ -+ stale = ret > 0; -+ -+ if (p.ptr.cached) { -+ if (!stale) -+ update_cached_sectors_list(trans, p.ptr.dev, -+ disk_sectors); -+ } else if (!p.has_ec) { -+ dirty_sectors += disk_sectors; -+ r.e.devs[r.e.nr_devs++] = p.ptr.dev; -+ } else { -+ struct bch_replicas_padded ec_r; -+ unsigned nr_data, nr_parity; -+ s64 parity_sectors; -+ -+ ret = bch2_trans_mark_stripe_ptr(trans, p.ec, -+ disk_sectors, data_type, -+ &ec_r, &nr_data, &nr_parity); -+ if (ret) -+ return ret; -+ -+ parity_sectors = -+ __ptr_disk_sectors_delta(p.crc.live_size, -+ offset, sectors, flags, -+ p.crc.compressed_size * nr_parity, -+ p.crc.uncompressed_size * nr_data); -+ -+ update_replicas_list(trans, &ec_r.e, -+ disk_sectors + parity_sectors); -+ -+ r.e.nr_required = 0; -+ } -+ } -+ -+ if (r.e.nr_devs) -+ update_replicas_list(trans, &r.e, dirty_sectors); -+ -+ return 0; -+} -+ -+static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, -+ struct bkey_s_c_reflink_p p, -+ u64 idx, unsigned sectors, -+ unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct bkey_i_reflink_v *r_v; -+ s64 ret; -+ -+ ret = trans_get_key(trans, BTREE_ID_REFLINK, -+ POS(0, idx), &iter, &k); -+ if (ret < 0) -+ return ret; -+ -+ if (k.k->type != KEY_TYPE_reflink_v) { -+ bch2_fs_inconsistent(c, -+ "%llu:%llu len %u points to nonexistent indirect extent %llu", -+ p.k->p.inode, p.k->p.offset, p.k->size, idx); -+ ret = -EIO; -+ goto err; -+ } -+ -+ if ((flags & BTREE_TRIGGER_OVERWRITE) && -+ (bkey_start_offset(k.k) < idx || -+ k.k->p.offset > idx + sectors)) -+ goto out; -+ -+ sectors = k.k->p.offset - idx; -+ -+ r_v = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); -+ ret = PTR_ERR_OR_ZERO(r_v); -+ if (ret) -+ goto err; -+ -+ bkey_reassemble(&r_v->k_i, k); -+ -+ le64_add_cpu(&r_v->v.refcount, -+ 
!(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1); -+ -+ if (!r_v->v.refcount) { -+ r_v->k.type = KEY_TYPE_deleted; -+ set_bkey_val_u64s(&r_v->k, 0); -+ } -+ -+ bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); -+ BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); -+ -+ bch2_trans_update(trans, iter, &r_v->k_i, 0); -+out: -+ ret = sectors; -+err: -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+static int bch2_trans_mark_reflink_p(struct btree_trans *trans, -+ struct bkey_s_c_reflink_p p, unsigned offset, -+ s64 sectors, unsigned flags) -+{ -+ u64 idx = le64_to_cpu(p.v->idx) + offset; -+ s64 ret = 0; -+ -+ sectors = abs(sectors); -+ BUG_ON(offset + sectors > p.k->size); -+ -+ while (sectors) { -+ ret = __bch2_trans_mark_reflink_p(trans, p, idx, sectors, flags); -+ if (ret < 0) -+ break; -+ -+ idx += ret; -+ sectors = max_t(s64, 0LL, sectors - ret); -+ ret = 0; -+ } -+ -+ return ret; -+} -+ -+int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, -+ unsigned offset, s64 sectors, unsigned flags) -+{ -+ struct replicas_delta_list *d; -+ struct bch_fs *c = trans->c; -+ -+ switch (k.k->type) { -+ case KEY_TYPE_btree_ptr: -+ case KEY_TYPE_btree_ptr_v2: -+ sectors = !(flags & BTREE_TRIGGER_OVERWRITE) -+ ? 
c->opts.btree_node_size -+ : -c->opts.btree_node_size; -+ -+ return bch2_trans_mark_extent(trans, k, offset, sectors, -+ flags, BCH_DATA_BTREE); -+ case KEY_TYPE_extent: -+ case KEY_TYPE_reflink_v: -+ return bch2_trans_mark_extent(trans, k, offset, sectors, -+ flags, BCH_DATA_USER); -+ case KEY_TYPE_inode: -+ d = replicas_deltas_realloc(trans, 0); -+ -+ if (!(flags & BTREE_TRIGGER_OVERWRITE)) -+ d->nr_inodes++; -+ else -+ d->nr_inodes--; -+ return 0; -+ case KEY_TYPE_reservation: { -+ unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; -+ -+ d = replicas_deltas_realloc(trans, 0); -+ -+ sectors *= replicas; -+ replicas = clamp_t(unsigned, replicas, 1, -+ ARRAY_SIZE(d->persistent_reserved)); -+ -+ d->persistent_reserved[replicas - 1] += sectors; -+ return 0; -+ } -+ case KEY_TYPE_reflink_p: -+ return bch2_trans_mark_reflink_p(trans, -+ bkey_s_c_to_reflink_p(k), -+ offset, sectors, flags); -+ default: -+ return 0; -+ } -+} -+ -+int bch2_trans_mark_update(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_i *insert, -+ unsigned flags) -+{ -+ struct btree *b = iter_l(iter)->b; -+ struct btree_node_iter node_iter = iter_l(iter)->iter; -+ struct bkey_packed *_k; -+ int ret; -+ -+ if (unlikely(flags & BTREE_TRIGGER_NORUN)) -+ return 0; -+ -+ if (!btree_node_type_needs_gc(iter->btree_id)) -+ return 0; -+ -+ ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(insert), -+ 0, insert->k.size, BTREE_TRIGGER_INSERT); -+ if (ret) -+ return ret; -+ -+ if (unlikely(flags & BTREE_TRIGGER_NOOVERWRITES)) -+ return 0; -+ -+ if (btree_iter_type(iter) == BTREE_ITER_CACHED) { -+ struct bkey_cached *ck = (void *) iter->l[0].b; -+ -+ return bch2_trans_mark_key(trans, bkey_i_to_s_c(ck->k), -+ 0, 0, BTREE_TRIGGER_OVERWRITE); -+ } -+ -+ while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { -+ struct bkey unpacked; -+ struct bkey_s_c k; -+ unsigned offset = 0; -+ s64 sectors = 0; -+ unsigned flags = BTREE_TRIGGER_OVERWRITE; -+ -+ k = bkey_disassemble(b, _k, 
&unpacked); -+ -+ if (btree_node_is_extents(b) -+ ? bkey_cmp(insert->k.p, bkey_start_pos(k.k)) <= 0 -+ : bkey_cmp(insert->k.p, k.k->p)) -+ break; -+ -+ if (btree_node_is_extents(b)) { -+ switch (bch2_extent_overlap(&insert->k, k.k)) { -+ case BCH_EXTENT_OVERLAP_ALL: -+ offset = 0; -+ sectors = -((s64) k.k->size); -+ break; -+ case BCH_EXTENT_OVERLAP_BACK: -+ offset = bkey_start_offset(&insert->k) - -+ bkey_start_offset(k.k); -+ sectors = bkey_start_offset(&insert->k) - -+ k.k->p.offset; -+ break; -+ case BCH_EXTENT_OVERLAP_FRONT: -+ offset = 0; -+ sectors = bkey_start_offset(k.k) - -+ insert->k.p.offset; -+ break; -+ case BCH_EXTENT_OVERLAP_MIDDLE: -+ offset = bkey_start_offset(&insert->k) - -+ bkey_start_offset(k.k); -+ sectors = -((s64) insert->k.size); -+ flags |= BTREE_TRIGGER_OVERWRITE_SPLIT; -+ break; -+ } -+ -+ BUG_ON(sectors >= 0); -+ } -+ -+ ret = bch2_trans_mark_key(trans, k, offset, sectors, flags); -+ if (ret) -+ return ret; -+ -+ bch2_btree_node_iter_advance(&node_iter, b); -+ } -+ -+ return 0; -+} -+ -+/* Disk reservations: */ -+ -+static u64 bch2_recalc_sectors_available(struct bch_fs *c) -+{ -+ percpu_u64_set(&c->pcpu->sectors_available, 0); -+ -+ return avail_factor(__bch2_fs_usage_read_short(c).free); -+} -+ -+void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) -+{ -+ percpu_down_read(&c->mark_lock); -+ this_cpu_sub(c->usage[0]->online_reserved, -+ res->sectors); -+ percpu_up_read(&c->mark_lock); -+ -+ res->sectors = 0; -+} -+ -+#define SECTORS_CACHE 1024 -+ -+int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, -+ unsigned sectors, int flags) -+{ -+ struct bch_fs_pcpu *pcpu; -+ u64 old, v, get; -+ s64 sectors_available; -+ int ret; -+ -+ percpu_down_read(&c->mark_lock); -+ preempt_disable(); -+ pcpu = this_cpu_ptr(c->pcpu); -+ -+ if (sectors <= pcpu->sectors_available) -+ goto out; -+ -+ v = atomic64_read(&c->sectors_available); -+ do { -+ old = v; -+ get = min((u64) sectors + SECTORS_CACHE, 
old); -+ -+ if (get < sectors) { -+ preempt_enable(); -+ percpu_up_read(&c->mark_lock); -+ goto recalculate; -+ } -+ } while ((v = atomic64_cmpxchg(&c->sectors_available, -+ old, old - get)) != old); -+ -+ pcpu->sectors_available += get; -+ -+out: -+ pcpu->sectors_available -= sectors; -+ this_cpu_add(c->usage[0]->online_reserved, sectors); -+ res->sectors += sectors; -+ -+ preempt_enable(); -+ percpu_up_read(&c->mark_lock); -+ return 0; -+ -+recalculate: -+ percpu_down_write(&c->mark_lock); -+ -+ sectors_available = bch2_recalc_sectors_available(c); -+ -+ if (sectors <= sectors_available || -+ (flags & BCH_DISK_RESERVATION_NOFAIL)) { -+ atomic64_set(&c->sectors_available, -+ max_t(s64, 0, sectors_available - sectors)); -+ this_cpu_add(c->usage[0]->online_reserved, sectors); -+ res->sectors += sectors; -+ ret = 0; -+ } else { -+ atomic64_set(&c->sectors_available, sectors_available); -+ ret = -ENOSPC; -+ } -+ -+ percpu_up_write(&c->mark_lock); -+ -+ return ret; -+} -+ -+/* Startup/shutdown: */ -+ -+static void buckets_free_rcu(struct rcu_head *rcu) -+{ -+ struct bucket_array *buckets = -+ container_of(rcu, struct bucket_array, rcu); -+ -+ kvpfree(buckets, -+ sizeof(struct bucket_array) + -+ buckets->nbuckets * sizeof(struct bucket)); -+} -+ -+int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) -+{ -+ struct bucket_array *buckets = NULL, *old_buckets = NULL; -+ unsigned long *buckets_nouse = NULL; -+ alloc_fifo free[RESERVE_NR]; -+ alloc_fifo free_inc; -+ alloc_heap alloc_heap; -+ copygc_heap copygc_heap; -+ -+ size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, -+ ca->mi.bucket_size / c->opts.btree_node_size); -+ /* XXX: these should be tunable */ -+ size_t reserve_none = max_t(size_t, 1, nbuckets >> 9); -+ size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 7); -+ size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12), -+ btree_reserve * 2); -+ bool resize = ca->buckets[0] != NULL, -+ start_copygc = ca->copygc_thread != NULL; -+ 
int ret = -ENOMEM; -+ unsigned i; -+ -+ memset(&free, 0, sizeof(free)); -+ memset(&free_inc, 0, sizeof(free_inc)); -+ memset(&alloc_heap, 0, sizeof(alloc_heap)); -+ memset(©gc_heap, 0, sizeof(copygc_heap)); -+ -+ if (!(buckets = kvpmalloc(sizeof(struct bucket_array) + -+ nbuckets * sizeof(struct bucket), -+ GFP_KERNEL|__GFP_ZERO)) || -+ !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * -+ sizeof(unsigned long), -+ GFP_KERNEL|__GFP_ZERO)) || -+ !init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) || -+ !init_fifo(&free[RESERVE_MOVINGGC], -+ copygc_reserve, GFP_KERNEL) || -+ !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) || -+ !init_fifo(&free_inc, free_inc_nr, GFP_KERNEL) || -+ !init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL) || -+ !init_heap(©gc_heap, copygc_reserve, GFP_KERNEL)) -+ goto err; -+ -+ buckets->first_bucket = ca->mi.first_bucket; -+ buckets->nbuckets = nbuckets; -+ -+ bch2_copygc_stop(ca); -+ -+ if (resize) { -+ down_write(&c->gc_lock); -+ down_write(&ca->bucket_lock); -+ percpu_down_write(&c->mark_lock); -+ } -+ -+ old_buckets = bucket_array(ca); -+ -+ if (resize) { -+ size_t n = min(buckets->nbuckets, old_buckets->nbuckets); -+ -+ memcpy(buckets->b, -+ old_buckets->b, -+ n * sizeof(struct bucket)); -+ memcpy(buckets_nouse, -+ ca->buckets_nouse, -+ BITS_TO_LONGS(n) * sizeof(unsigned long)); -+ } -+ -+ rcu_assign_pointer(ca->buckets[0], buckets); -+ buckets = old_buckets; -+ -+ swap(ca->buckets_nouse, buckets_nouse); -+ -+ if (resize) { -+ percpu_up_write(&c->mark_lock); -+ up_write(&c->gc_lock); -+ } -+ -+ spin_lock(&c->freelist_lock); -+ for (i = 0; i < RESERVE_NR; i++) { -+ fifo_move(&free[i], &ca->free[i]); -+ swap(ca->free[i], free[i]); -+ } -+ fifo_move(&free_inc, &ca->free_inc); -+ swap(ca->free_inc, free_inc); -+ spin_unlock(&c->freelist_lock); -+ -+ /* with gc lock held, alloc_heap can't be in use: */ -+ swap(ca->alloc_heap, alloc_heap); -+ -+ /* and we shut down copygc: */ -+ swap(ca->copygc_heap, 
copygc_heap); -+ -+ nbuckets = ca->mi.nbuckets; -+ -+ if (resize) -+ up_write(&ca->bucket_lock); -+ -+ if (start_copygc && -+ bch2_copygc_start(c, ca)) -+ bch_err(ca, "error restarting copygc thread"); -+ -+ ret = 0; -+err: -+ free_heap(©gc_heap); -+ free_heap(&alloc_heap); -+ free_fifo(&free_inc); -+ for (i = 0; i < RESERVE_NR; i++) -+ free_fifo(&free[i]); -+ kvpfree(buckets_nouse, -+ BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); -+ if (buckets) -+ call_rcu(&old_buckets->rcu, buckets_free_rcu); -+ -+ return ret; -+} -+ -+void bch2_dev_buckets_free(struct bch_dev *ca) -+{ -+ unsigned i; -+ -+ free_heap(&ca->copygc_heap); -+ free_heap(&ca->alloc_heap); -+ free_fifo(&ca->free_inc); -+ for (i = 0; i < RESERVE_NR; i++) -+ free_fifo(&ca->free[i]); -+ kvpfree(ca->buckets_nouse, -+ BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); -+ kvpfree(rcu_dereference_protected(ca->buckets[0], 1), -+ sizeof(struct bucket_array) + -+ ca->mi.nbuckets * sizeof(struct bucket)); -+ -+ free_percpu(ca->usage[0]); -+} -+ -+int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) -+{ -+ if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage))) -+ return -ENOMEM; -+ -+ return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);; -+} -diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h -new file mode 100644 -index 000000000000..97265fe90e96 ---- /dev/null -+++ b/fs/bcachefs/buckets.h -@@ -0,0 +1,327 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Code for manipulating bucket marks for garbage collection. -+ * -+ * Copyright 2014 Datera, Inc. 
-+ */ -+ -+#ifndef _BUCKETS_H -+#define _BUCKETS_H -+ -+#include "buckets_types.h" -+#include "super.h" -+ -+#define for_each_bucket(_b, _buckets) \ -+ for (_b = (_buckets)->b + (_buckets)->first_bucket; \ -+ _b < (_buckets)->b + (_buckets)->nbuckets; _b++) -+ -+#define bucket_cmpxchg(g, new, expr) \ -+({ \ -+ struct bucket *_g = g; \ -+ u64 _v = atomic64_read(&(g)->_mark.v); \ -+ struct bucket_mark _old; \ -+ \ -+ do { \ -+ (new).v.counter = _old.v.counter = _v; \ -+ expr; \ -+ } while ((_v = atomic64_cmpxchg(&(_g)->_mark.v, \ -+ _old.v.counter, \ -+ (new).v.counter)) != _old.v.counter);\ -+ _old; \ -+}) -+ -+static inline struct bucket_array *__bucket_array(struct bch_dev *ca, -+ bool gc) -+{ -+ return rcu_dereference_check(ca->buckets[gc], -+ !ca->fs || -+ percpu_rwsem_is_held(&ca->fs->mark_lock) || -+ lockdep_is_held(&ca->fs->gc_lock) || -+ lockdep_is_held(&ca->bucket_lock)); -+} -+ -+static inline struct bucket_array *bucket_array(struct bch_dev *ca) -+{ -+ return __bucket_array(ca, false); -+} -+ -+static inline struct bucket *__bucket(struct bch_dev *ca, size_t b, bool gc) -+{ -+ struct bucket_array *buckets = __bucket_array(ca, gc); -+ -+ BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets); -+ return buckets->b + b; -+} -+ -+static inline struct bucket *bucket(struct bch_dev *ca, size_t b) -+{ -+ return __bucket(ca, b, false); -+} -+ -+static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca, -+ size_t b, int rw) -+{ -+ bucket(ca, b)->io_time[rw] = c->bucket_clock[rw].hand; -+} -+ -+static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw) -+{ -+ return c->bucket_clock[rw].hand - g->io_time[rw]; -+} -+ -+/* -+ * bucket_gc_gen() returns the difference between the bucket's current gen and -+ * the oldest gen of any pointer into that bucket in the btree. 
-+ */ -+ -+static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b) -+{ -+ struct bucket *g = bucket(ca, b); -+ -+ return g->mark.gen - g->oldest_gen; -+} -+ -+static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, -+ const struct bch_extent_ptr *ptr) -+{ -+ return sector_to_bucket(ca, ptr->offset); -+} -+ -+static inline struct bucket *PTR_BUCKET(struct bch_dev *ca, -+ const struct bch_extent_ptr *ptr, -+ bool gc) -+{ -+ return __bucket(ca, PTR_BUCKET_NR(ca, ptr), gc); -+} -+ -+static inline enum bch_data_type ptr_data_type(const struct bkey *k, -+ const struct bch_extent_ptr *ptr) -+{ -+ if (k->type == KEY_TYPE_btree_ptr || -+ k->type == KEY_TYPE_btree_ptr_v2) -+ return BCH_DATA_BTREE; -+ -+ return ptr->cached ? BCH_DATA_CACHED : BCH_DATA_USER; -+} -+ -+static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca, -+ const struct bch_extent_ptr *ptr) -+{ -+ struct bucket_mark m; -+ -+ rcu_read_lock(); -+ m = READ_ONCE(PTR_BUCKET(ca, ptr, 0)->mark); -+ rcu_read_unlock(); -+ -+ return m; -+} -+ -+static inline int gen_cmp(u8 a, u8 b) -+{ -+ return (s8) (a - b); -+} -+ -+static inline int gen_after(u8 a, u8 b) -+{ -+ int r = gen_cmp(a, b); -+ -+ return r > 0 ? r : 0; -+} -+ -+/** -+ * ptr_stale() - check if a pointer points into a bucket that has been -+ * invalidated. -+ */ -+static inline u8 ptr_stale(struct bch_dev *ca, -+ const struct bch_extent_ptr *ptr) -+{ -+ return gen_after(ptr_bucket_mark(ca, ptr).gen, ptr->gen); -+} -+ -+static inline s64 __ptr_disk_sectors(struct extent_ptr_decoded p, -+ unsigned live_size) -+{ -+ return live_size && p.crc.compression_type -+ ? 
max(1U, DIV_ROUND_UP(live_size * p.crc.compressed_size, -+ p.crc.uncompressed_size)) -+ : live_size; -+} -+ -+static inline s64 ptr_disk_sectors(struct extent_ptr_decoded p) -+{ -+ return __ptr_disk_sectors(p, p.crc.live_size); -+} -+ -+/* bucket gc marks */ -+ -+static inline unsigned bucket_sectors_used(struct bucket_mark mark) -+{ -+ return mark.dirty_sectors + mark.cached_sectors; -+} -+ -+static inline bool bucket_unused(struct bucket_mark mark) -+{ -+ return !mark.owned_by_allocator && -+ !mark.data_type && -+ !bucket_sectors_used(mark); -+} -+ -+static inline bool is_available_bucket(struct bucket_mark mark) -+{ -+ return (!mark.owned_by_allocator && -+ !mark.dirty_sectors && -+ !mark.stripe); -+} -+ -+static inline bool bucket_needs_journal_commit(struct bucket_mark m, -+ u16 last_seq_ondisk) -+{ -+ return m.journal_seq_valid && -+ ((s16) m.journal_seq - (s16) last_seq_ondisk > 0); -+} -+ -+/* Device usage: */ -+ -+struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *); -+ -+void bch2_dev_usage_from_buckets(struct bch_fs *); -+ -+static inline u64 __dev_buckets_available(struct bch_dev *ca, -+ struct bch_dev_usage stats) -+{ -+ u64 total = ca->mi.nbuckets - ca->mi.first_bucket; -+ -+ if (WARN_ONCE(stats.buckets_unavailable > total, -+ "buckets_unavailable overflow (%llu > %llu)\n", -+ stats.buckets_unavailable, total)) -+ return 0; -+ -+ return total - stats.buckets_unavailable; -+} -+ -+/* -+ * Number of reclaimable buckets - only for use by the allocator thread: -+ */ -+static inline u64 dev_buckets_available(struct bch_fs *c, struct bch_dev *ca) -+{ -+ return __dev_buckets_available(ca, bch2_dev_usage_read(c, ca)); -+} -+ -+static inline u64 __dev_buckets_free(struct bch_dev *ca, -+ struct bch_dev_usage stats) -+{ -+ return __dev_buckets_available(ca, stats) + -+ fifo_used(&ca->free[RESERVE_NONE]) + -+ fifo_used(&ca->free_inc); -+} -+ -+static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca) -+{ -+ return 
__dev_buckets_free(ca, bch2_dev_usage_read(c, ca)); -+} -+ -+/* Filesystem usage: */ -+ -+static inline unsigned fs_usage_u64s(struct bch_fs *c) -+{ -+ -+ return sizeof(struct bch_fs_usage) / sizeof(u64) + -+ READ_ONCE(c->replicas.nr); -+} -+ -+void bch2_fs_usage_scratch_put(struct bch_fs *, struct bch_fs_usage *); -+struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *); -+ -+u64 bch2_fs_usage_read_one(struct bch_fs *, u64 *); -+ -+struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *); -+ -+void bch2_fs_usage_acc_to_base(struct bch_fs *, unsigned); -+ -+void bch2_fs_usage_to_text(struct printbuf *, -+ struct bch_fs *, struct bch_fs_usage *); -+ -+u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage *); -+ -+struct bch_fs_usage_short -+bch2_fs_usage_read_short(struct bch_fs *); -+ -+/* key/bucket marking: */ -+ -+void bch2_bucket_seq_cleanup(struct bch_fs *); -+void bch2_fs_usage_initialize(struct bch_fs *); -+ -+void bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *, -+ size_t, struct bucket_mark *); -+void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, -+ size_t, bool, struct gc_pos, unsigned); -+void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, -+ size_t, enum bch_data_type, unsigned, -+ struct gc_pos, unsigned); -+ -+int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, s64, -+ struct bch_fs_usage *, u64, unsigned); -+int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, -+ struct disk_reservation *, unsigned); -+ -+int bch2_mark_overwrite(struct btree_trans *, struct btree_iter *, -+ struct bkey_s_c, struct bkey_i *, -+ struct bch_fs_usage *, unsigned, bool); -+int bch2_mark_update(struct btree_trans *, struct btree_iter *, -+ struct bkey_i *, struct bch_fs_usage *, unsigned); -+ -+int bch2_replicas_delta_list_apply(struct bch_fs *, -+ struct bch_fs_usage *, -+ struct replicas_delta_list *); -+int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, -+ unsigned, s64, unsigned); 
-+int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter, -+ struct bkey_i *insert, unsigned); -+void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage *); -+ -+/* disk reservations: */ -+ -+void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *); -+ -+static inline void bch2_disk_reservation_put(struct bch_fs *c, -+ struct disk_reservation *res) -+{ -+ if (res->sectors) -+ __bch2_disk_reservation_put(c, res); -+} -+ -+#define BCH_DISK_RESERVATION_NOFAIL (1 << 0) -+ -+int bch2_disk_reservation_add(struct bch_fs *, -+ struct disk_reservation *, -+ unsigned, int); -+ -+static inline struct disk_reservation -+bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas) -+{ -+ return (struct disk_reservation) { -+ .sectors = 0, -+#if 0 -+ /* not used yet: */ -+ .gen = c->capacity_gen, -+#endif -+ .nr_replicas = nr_replicas, -+ }; -+} -+ -+static inline int bch2_disk_reservation_get(struct bch_fs *c, -+ struct disk_reservation *res, -+ unsigned sectors, -+ unsigned nr_replicas, -+ int flags) -+{ -+ *res = bch2_disk_reservation_init(c, nr_replicas); -+ -+ return bch2_disk_reservation_add(c, res, sectors * nr_replicas, flags); -+} -+ -+int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64); -+void bch2_dev_buckets_free(struct bch_dev *); -+int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *); -+ -+#endif /* _BUCKETS_H */ -diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h -new file mode 100644 -index 000000000000..53f22726893d ---- /dev/null -+++ b/fs/bcachefs/buckets_types.h -@@ -0,0 +1,133 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BUCKETS_TYPES_H -+#define _BUCKETS_TYPES_H -+ -+#include "bcachefs_format.h" -+#include "util.h" -+ -+#define BUCKET_JOURNAL_SEQ_BITS 16 -+ -+struct bucket_mark { -+ union { -+ atomic64_t v; -+ -+ struct { -+ u8 gen; -+ u8 data_type:3, -+ owned_by_allocator:1, -+ journal_seq_valid:1, -+ stripe:1; -+ u16 dirty_sectors; -+ u16 
cached_sectors; -+ -+ /* -+ * low bits of journal sequence number when this bucket was most -+ * recently modified: if journal_seq_valid is set, this bucket can't be -+ * reused until the journal sequence number written to disk is >= the -+ * bucket's journal sequence number: -+ */ -+ u16 journal_seq; -+ }; -+ }; -+}; -+ -+struct bucket { -+ union { -+ struct bucket_mark _mark; -+ const struct bucket_mark mark; -+ }; -+ -+ u16 io_time[2]; -+ u8 oldest_gen; -+ u8 gc_gen; -+ unsigned gen_valid:1; -+}; -+ -+struct bucket_array { -+ struct rcu_head rcu; -+ u16 first_bucket; -+ size_t nbuckets; -+ struct bucket b[]; -+}; -+ -+struct bch_dev_usage { -+ u64 buckets[BCH_DATA_NR]; -+ u64 buckets_alloc; -+ u64 buckets_unavailable; -+ -+ /* _compressed_ sectors: */ -+ u64 sectors[BCH_DATA_NR]; -+ u64 sectors_fragmented; -+ -+ u64 buckets_ec; -+ u64 sectors_ec; -+}; -+ -+struct bch_fs_usage { -+ /* all fields are in units of 512 byte sectors: */ -+ -+ u64 online_reserved; -+ -+ /* fields after online_reserved are cleared/recalculated by gc: */ -+ u64 gc_start[0]; -+ -+ u64 hidden; -+ u64 btree; -+ u64 data; -+ u64 cached; -+ u64 reserved; -+ u64 nr_inodes; -+ -+ /* XXX: add stats for compression ratio */ -+#if 0 -+ u64 uncompressed; -+ u64 compressed; -+#endif -+ -+ /* broken out: */ -+ -+ u64 persistent_reserved[BCH_REPLICAS_MAX]; -+ u64 replicas[]; -+}; -+ -+struct bch_fs_usage_short { -+ u64 capacity; -+ u64 used; -+ u64 free; -+ u64 nr_inodes; -+}; -+ -+struct replicas_delta { -+ s64 delta; -+ struct bch_replicas_entry r; -+} __packed; -+ -+struct replicas_delta_list { -+ unsigned size; -+ unsigned used; -+ -+ struct {} memset_start; -+ u64 nr_inodes; -+ u64 persistent_reserved[BCH_REPLICAS_MAX]; -+ struct {} memset_end; -+ struct replicas_delta d[0]; -+}; -+ -+/* -+ * A reservation for space on disk: -+ */ -+struct disk_reservation { -+ u64 sectors; -+ u32 gen; -+ unsigned nr_replicas; -+}; -+ -+struct copygc_heap_entry { -+ u8 gen; -+ u32 sectors; -+ u64 offset; -+}; -+ 
-+typedef HEAP(struct copygc_heap_entry) copygc_heap; -+ -+#endif /* _BUCKETS_TYPES_H */ -diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c -new file mode 100644 -index 000000000000..3af521947502 ---- /dev/null -+++ b/fs/bcachefs/chardev.c -@@ -0,0 +1,704 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#ifndef NO_BCACHEFS_CHARDEV -+ -+#include "bcachefs.h" -+#include "bcachefs_ioctl.h" -+#include "buckets.h" -+#include "chardev.h" -+#include "move.h" -+#include "replicas.h" -+#include "super.h" -+#include "super-io.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* returns with ref on ca->ref */ -+static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev, -+ unsigned flags) -+{ -+ struct bch_dev *ca; -+ -+ if (flags & BCH_BY_INDEX) { -+ if (dev >= c->sb.nr_devices) -+ return ERR_PTR(-EINVAL); -+ -+ rcu_read_lock(); -+ ca = rcu_dereference(c->devs[dev]); -+ if (ca) -+ percpu_ref_get(&ca->ref); -+ rcu_read_unlock(); -+ -+ if (!ca) -+ return ERR_PTR(-EINVAL); -+ } else { -+ char *path; -+ -+ path = strndup_user((const char __user *) -+ (unsigned long) dev, PATH_MAX); -+ if (IS_ERR(path)) -+ return ERR_CAST(path); -+ -+ ca = bch2_dev_lookup(c, path); -+ kfree(path); -+ } -+ -+ return ca; -+} -+ -+#if 0 -+static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg) -+{ -+ struct bch_ioctl_assemble arg; -+ struct bch_fs *c; -+ u64 *user_devs = NULL; -+ char **devs = NULL; -+ unsigned i; -+ int ret = -EFAULT; -+ -+ if (copy_from_user(&arg, user_arg, sizeof(arg))) -+ return -EFAULT; -+ -+ if (arg.flags || arg.pad) -+ return -EINVAL; -+ -+ user_devs = kmalloc_array(arg.nr_devs, sizeof(u64), GFP_KERNEL); -+ if (!user_devs) -+ return -ENOMEM; -+ -+ devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL); -+ -+ if (copy_from_user(user_devs, user_arg->devs, -+ sizeof(u64) * arg.nr_devs)) -+ goto err; -+ -+ for (i = 0; i < arg.nr_devs; i++) { -+ devs[i] = 
strndup_user((const char __user *)(unsigned long) -+ user_devs[i], -+ PATH_MAX); -+ if (!devs[i]) { -+ ret = -ENOMEM; -+ goto err; -+ } -+ } -+ -+ c = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty()); -+ ret = PTR_ERR_OR_ZERO(c); -+ if (!ret) -+ closure_put(&c->cl); -+err: -+ if (devs) -+ for (i = 0; i < arg.nr_devs; i++) -+ kfree(devs[i]); -+ kfree(devs); -+ return ret; -+} -+ -+static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg) -+{ -+ struct bch_ioctl_incremental arg; -+ const char *err; -+ char *path; -+ -+ if (copy_from_user(&arg, user_arg, sizeof(arg))) -+ return -EFAULT; -+ -+ if (arg.flags || arg.pad) -+ return -EINVAL; -+ -+ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); -+ if (!path) -+ return -ENOMEM; -+ -+ err = bch2_fs_open_incremental(path); -+ kfree(path); -+ -+ if (err) { -+ pr_err("Could not register bcachefs devices: %s", err); -+ return -EINVAL; -+ } -+ -+ return 0; -+} -+#endif -+ -+static long bch2_global_ioctl(unsigned cmd, void __user *arg) -+{ -+ switch (cmd) { -+#if 0 -+ case BCH_IOCTL_ASSEMBLE: -+ return bch2_ioctl_assemble(arg); -+ case BCH_IOCTL_INCREMENTAL: -+ return bch2_ioctl_incremental(arg); -+#endif -+ default: -+ return -ENOTTY; -+ } -+} -+ -+static long bch2_ioctl_query_uuid(struct bch_fs *c, -+ struct bch_ioctl_query_uuid __user *user_arg) -+{ -+ return copy_to_user(&user_arg->uuid, -+ &c->sb.user_uuid, -+ sizeof(c->sb.user_uuid)); -+} -+ -+#if 0 -+static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg) -+{ -+ if (arg.flags || arg.pad) -+ return -EINVAL; -+ -+ return bch2_fs_start(c); -+} -+ -+static long bch2_ioctl_stop(struct bch_fs *c) -+{ -+ bch2_fs_stop(c); -+ return 0; -+} -+#endif -+ -+static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg) -+{ -+ char *path; -+ int ret; -+ -+ if (arg.flags || arg.pad) -+ return -EINVAL; -+ -+ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); -+ if (!path) -+ 
return -ENOMEM; -+ -+ ret = bch2_dev_add(c, path); -+ kfree(path); -+ -+ return ret; -+} -+ -+static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg) -+{ -+ struct bch_dev *ca; -+ -+ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| -+ BCH_FORCE_IF_METADATA_LOST| -+ BCH_FORCE_IF_DEGRADED| -+ BCH_BY_INDEX)) || -+ arg.pad) -+ return -EINVAL; -+ -+ ca = bch2_device_lookup(c, arg.dev, arg.flags); -+ if (IS_ERR(ca)) -+ return PTR_ERR(ca); -+ -+ return bch2_dev_remove(c, ca, arg.flags); -+} -+ -+static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg) -+{ -+ char *path; -+ int ret; -+ -+ if (arg.flags || arg.pad) -+ return -EINVAL; -+ -+ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); -+ if (!path) -+ return -ENOMEM; -+ -+ ret = bch2_dev_online(c, path); -+ kfree(path); -+ return ret; -+} -+ -+static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg) -+{ -+ struct bch_dev *ca; -+ int ret; -+ -+ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| -+ BCH_FORCE_IF_METADATA_LOST| -+ BCH_FORCE_IF_DEGRADED| -+ BCH_BY_INDEX)) || -+ arg.pad) -+ return -EINVAL; -+ -+ ca = bch2_device_lookup(c, arg.dev, arg.flags); -+ if (IS_ERR(ca)) -+ return PTR_ERR(ca); -+ -+ ret = bch2_dev_offline(c, ca, arg.flags); -+ percpu_ref_put(&ca->ref); -+ return ret; -+} -+ -+static long bch2_ioctl_disk_set_state(struct bch_fs *c, -+ struct bch_ioctl_disk_set_state arg) -+{ -+ struct bch_dev *ca; -+ int ret; -+ -+ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| -+ BCH_FORCE_IF_METADATA_LOST| -+ BCH_FORCE_IF_DEGRADED| -+ BCH_BY_INDEX)) || -+ arg.pad[0] || arg.pad[1] || arg.pad[2]) -+ return -EINVAL; -+ -+ ca = bch2_device_lookup(c, arg.dev, arg.flags); -+ if (IS_ERR(ca)) -+ return PTR_ERR(ca); -+ -+ ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags); -+ -+ percpu_ref_put(&ca->ref); -+ return ret; -+} -+ -+struct bch_data_ctx { -+ struct bch_fs *c; -+ struct bch_ioctl_data arg; -+ struct bch_move_stats stats; -+ -+ int 
ret; -+ -+ struct task_struct *thread; -+}; -+ -+static int bch2_data_thread(void *arg) -+{ -+ struct bch_data_ctx *ctx = arg; -+ -+ ctx->ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg); -+ -+ ctx->stats.data_type = U8_MAX; -+ return 0; -+} -+ -+static int bch2_data_job_release(struct inode *inode, struct file *file) -+{ -+ struct bch_data_ctx *ctx = file->private_data; -+ -+ kthread_stop(ctx->thread); -+ put_task_struct(ctx->thread); -+ kfree(ctx); -+ return 0; -+} -+ -+static ssize_t bch2_data_job_read(struct file *file, char __user *buf, -+ size_t len, loff_t *ppos) -+{ -+ struct bch_data_ctx *ctx = file->private_data; -+ struct bch_fs *c = ctx->c; -+ struct bch_ioctl_data_event e = { -+ .type = BCH_DATA_EVENT_PROGRESS, -+ .p.data_type = ctx->stats.data_type, -+ .p.btree_id = ctx->stats.btree_id, -+ .p.pos = ctx->stats.pos, -+ .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), -+ .p.sectors_total = bch2_fs_usage_read_short(c).used, -+ }; -+ -+ if (len < sizeof(e)) -+ return -EINVAL; -+ -+ return copy_to_user(buf, &e, sizeof(e)) ?: sizeof(e); -+} -+ -+static const struct file_operations bcachefs_data_ops = { -+ .release = bch2_data_job_release, -+ .read = bch2_data_job_read, -+ .llseek = no_llseek, -+}; -+ -+static long bch2_ioctl_data(struct bch_fs *c, -+ struct bch_ioctl_data arg) -+{ -+ struct bch_data_ctx *ctx = NULL; -+ struct file *file = NULL; -+ unsigned flags = O_RDONLY|O_CLOEXEC|O_NONBLOCK; -+ int ret, fd = -1; -+ -+ if (arg.op >= BCH_DATA_OP_NR || arg.flags) -+ return -EINVAL; -+ -+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); -+ if (!ctx) -+ return -ENOMEM; -+ -+ ctx->c = c; -+ ctx->arg = arg; -+ -+ ctx->thread = kthread_create(bch2_data_thread, ctx, "[bcachefs]"); -+ if (IS_ERR(ctx->thread)) { -+ ret = PTR_ERR(ctx->thread); -+ goto err; -+ } -+ -+ ret = get_unused_fd_flags(flags); -+ if (ret < 0) -+ goto err; -+ fd = ret; -+ -+ file = anon_inode_getfile("[bcachefs]", &bcachefs_data_ops, ctx, flags); -+ if (IS_ERR(file)) { -+ ret = 
PTR_ERR(file); -+ goto err; -+ } -+ -+ fd_install(fd, file); -+ -+ get_task_struct(ctx->thread); -+ wake_up_process(ctx->thread); -+ -+ return fd; -+err: -+ if (fd >= 0) -+ put_unused_fd(fd); -+ if (!IS_ERR_OR_NULL(ctx->thread)) -+ kthread_stop(ctx->thread); -+ kfree(ctx); -+ return ret; -+} -+ -+static long bch2_ioctl_fs_usage(struct bch_fs *c, -+ struct bch_ioctl_fs_usage __user *user_arg) -+{ -+ struct bch_ioctl_fs_usage *arg = NULL; -+ struct bch_replicas_usage *dst_e, *dst_end; -+ struct bch_fs_usage *src; -+ u32 replica_entries_bytes; -+ unsigned i; -+ int ret = 0; -+ -+ if (!test_bit(BCH_FS_STARTED, &c->flags)) -+ return -EINVAL; -+ -+ if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes)) -+ return -EFAULT; -+ -+ arg = kzalloc(sizeof(*arg) + replica_entries_bytes, GFP_KERNEL); -+ if (!arg) -+ return -ENOMEM; -+ -+ src = bch2_fs_usage_read(c); -+ if (!src) { -+ ret = -ENOMEM; -+ goto err; -+ } -+ -+ arg->capacity = c->capacity; -+ arg->used = bch2_fs_sectors_used(c, src); -+ arg->online_reserved = src->online_reserved; -+ -+ for (i = 0; i < BCH_REPLICAS_MAX; i++) -+ arg->persistent_reserved[i] = src->persistent_reserved[i]; -+ -+ dst_e = arg->replicas; -+ dst_end = (void *) arg->replicas + replica_entries_bytes; -+ -+ for (i = 0; i < c->replicas.nr; i++) { -+ struct bch_replicas_entry *src_e = -+ cpu_replicas_entry(&c->replicas, i); -+ -+ if (replicas_usage_next(dst_e) > dst_end) { -+ ret = -ERANGE; -+ break; -+ } -+ -+ dst_e->sectors = src->replicas[i]; -+ dst_e->r = *src_e; -+ -+ /* recheck after setting nr_devs: */ -+ if (replicas_usage_next(dst_e) > dst_end) { -+ ret = -ERANGE; -+ break; -+ } -+ -+ memcpy(dst_e->r.devs, src_e->devs, src_e->nr_devs); -+ -+ dst_e = replicas_usage_next(dst_e); -+ } -+ -+ arg->replica_entries_bytes = (void *) dst_e - (void *) arg->replicas; -+ -+ percpu_up_read(&c->mark_lock); -+ kfree(src); -+ -+ if (!ret) -+ ret = copy_to_user(user_arg, arg, -+ sizeof(*arg) + arg->replica_entries_bytes); -+err: -+ 
kfree(arg); -+ return ret; -+} -+ -+static long bch2_ioctl_dev_usage(struct bch_fs *c, -+ struct bch_ioctl_dev_usage __user *user_arg) -+{ -+ struct bch_ioctl_dev_usage arg; -+ struct bch_dev_usage src; -+ struct bch_dev *ca; -+ unsigned i; -+ -+ if (!test_bit(BCH_FS_STARTED, &c->flags)) -+ return -EINVAL; -+ -+ if (copy_from_user(&arg, user_arg, sizeof(arg))) -+ return -EFAULT; -+ -+ if ((arg.flags & ~BCH_BY_INDEX) || -+ arg.pad[0] || -+ arg.pad[1] || -+ arg.pad[2]) -+ return -EINVAL; -+ -+ ca = bch2_device_lookup(c, arg.dev, arg.flags); -+ if (IS_ERR(ca)) -+ return PTR_ERR(ca); -+ -+ src = bch2_dev_usage_read(c, ca); -+ -+ arg.state = ca->mi.state; -+ arg.bucket_size = ca->mi.bucket_size; -+ arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; -+ arg.available_buckets = arg.nr_buckets - src.buckets_unavailable; -+ arg.ec_buckets = src.buckets_ec; -+ arg.ec_sectors = src.sectors_ec; -+ -+ for (i = 0; i < BCH_DATA_NR; i++) { -+ arg.buckets[i] = src.buckets[i]; -+ arg.sectors[i] = src.sectors[i]; -+ } -+ -+ percpu_ref_put(&ca->ref); -+ -+ return copy_to_user(user_arg, &arg, sizeof(arg)); -+} -+ -+static long bch2_ioctl_read_super(struct bch_fs *c, -+ struct bch_ioctl_read_super arg) -+{ -+ struct bch_dev *ca = NULL; -+ struct bch_sb *sb; -+ int ret = 0; -+ -+ if ((arg.flags & ~(BCH_BY_INDEX|BCH_READ_DEV)) || -+ arg.pad) -+ return -EINVAL; -+ -+ mutex_lock(&c->sb_lock); -+ -+ if (arg.flags & BCH_READ_DEV) { -+ ca = bch2_device_lookup(c, arg.dev, arg.flags); -+ -+ if (IS_ERR(ca)) { -+ ret = PTR_ERR(ca); -+ goto err; -+ } -+ -+ sb = ca->disk_sb.sb; -+ } else { -+ sb = c->disk_sb.sb; -+ } -+ -+ if (vstruct_bytes(sb) > arg.size) { -+ ret = -ERANGE; -+ goto err; -+ } -+ -+ ret = copy_to_user((void __user *)(unsigned long)arg.sb, -+ sb, vstruct_bytes(sb)); -+err: -+ if (ca) -+ percpu_ref_put(&ca->ref); -+ mutex_unlock(&c->sb_lock); -+ return ret; -+} -+ -+static long bch2_ioctl_disk_get_idx(struct bch_fs *c, -+ struct bch_ioctl_disk_get_idx arg) -+{ -+ dev_t dev = 
huge_decode_dev(arg.dev); -+ struct bch_dev *ca; -+ unsigned i; -+ -+ for_each_online_member(ca, c, i) -+ if (ca->disk_sb.bdev->bd_dev == dev) { -+ percpu_ref_put(&ca->io_ref); -+ return i; -+ } -+ -+ return -ENOENT; -+} -+ -+static long bch2_ioctl_disk_resize(struct bch_fs *c, -+ struct bch_ioctl_disk_resize arg) -+{ -+ struct bch_dev *ca; -+ int ret; -+ -+ if ((arg.flags & ~BCH_BY_INDEX) || -+ arg.pad) -+ return -EINVAL; -+ -+ ca = bch2_device_lookup(c, arg.dev, arg.flags); -+ if (IS_ERR(ca)) -+ return PTR_ERR(ca); -+ -+ ret = bch2_dev_resize(c, ca, arg.nbuckets); -+ -+ percpu_ref_put(&ca->ref); -+ return ret; -+} -+ -+#define BCH_IOCTL(_name, _argtype) \ -+do { \ -+ _argtype i; \ -+ \ -+ if (copy_from_user(&i, arg, sizeof(i))) \ -+ return -EFAULT; \ -+ return bch2_ioctl_##_name(c, i); \ -+} while (0) -+ -+long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) -+{ -+ /* ioctls that don't require admin cap: */ -+ switch (cmd) { -+ case BCH_IOCTL_QUERY_UUID: -+ return bch2_ioctl_query_uuid(c, arg); -+ case BCH_IOCTL_FS_USAGE: -+ return bch2_ioctl_fs_usage(c, arg); -+ case BCH_IOCTL_DEV_USAGE: -+ return bch2_ioctl_dev_usage(c, arg); -+ } -+ -+ if (!capable(CAP_SYS_ADMIN)) -+ return -EPERM; -+ -+ switch (cmd) { -+#if 0 -+ case BCH_IOCTL_START: -+ BCH_IOCTL(start, struct bch_ioctl_start); -+ case BCH_IOCTL_STOP: -+ return bch2_ioctl_stop(c); -+#endif -+ case BCH_IOCTL_READ_SUPER: -+ BCH_IOCTL(read_super, struct bch_ioctl_read_super); -+ case BCH_IOCTL_DISK_GET_IDX: -+ BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx); -+ } -+ -+ if (!test_bit(BCH_FS_STARTED, &c->flags)) -+ return -EINVAL; -+ -+ /* ioctls that do require admin cap: */ -+ switch (cmd) { -+ case BCH_IOCTL_DISK_ADD: -+ BCH_IOCTL(disk_add, struct bch_ioctl_disk); -+ case BCH_IOCTL_DISK_REMOVE: -+ BCH_IOCTL(disk_remove, struct bch_ioctl_disk); -+ case BCH_IOCTL_DISK_ONLINE: -+ BCH_IOCTL(disk_online, struct bch_ioctl_disk); -+ case BCH_IOCTL_DISK_OFFLINE: -+ BCH_IOCTL(disk_offline, struct 
bch_ioctl_disk); -+ case BCH_IOCTL_DISK_SET_STATE: -+ BCH_IOCTL(disk_set_state, struct bch_ioctl_disk_set_state); -+ case BCH_IOCTL_DATA: -+ BCH_IOCTL(data, struct bch_ioctl_data); -+ case BCH_IOCTL_DISK_RESIZE: -+ BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize); -+ -+ default: -+ return -ENOTTY; -+ } -+} -+ -+static DEFINE_IDR(bch_chardev_minor); -+ -+static long bch2_chardev_ioctl(struct file *filp, unsigned cmd, unsigned long v) -+{ -+ unsigned minor = iminor(file_inode(filp)); -+ struct bch_fs *c = minor < U8_MAX ? idr_find(&bch_chardev_minor, minor) : NULL; -+ void __user *arg = (void __user *) v; -+ -+ return c -+ ? bch2_fs_ioctl(c, cmd, arg) -+ : bch2_global_ioctl(cmd, arg); -+} -+ -+static const struct file_operations bch_chardev_fops = { -+ .owner = THIS_MODULE, -+ .unlocked_ioctl = bch2_chardev_ioctl, -+ .open = nonseekable_open, -+}; -+ -+static int bch_chardev_major; -+static struct class *bch_chardev_class; -+static struct device *bch_chardev; -+ -+void bch2_fs_chardev_exit(struct bch_fs *c) -+{ -+ if (!IS_ERR_OR_NULL(c->chardev)) -+ device_unregister(c->chardev); -+ if (c->minor >= 0) -+ idr_remove(&bch_chardev_minor, c->minor); -+} -+ -+int bch2_fs_chardev_init(struct bch_fs *c) -+{ -+ c->minor = idr_alloc(&bch_chardev_minor, c, 0, 0, GFP_KERNEL); -+ if (c->minor < 0) -+ return c->minor; -+ -+ c->chardev = device_create(bch_chardev_class, NULL, -+ MKDEV(bch_chardev_major, c->minor), c, -+ "bcachefs%u-ctl", c->minor); -+ if (IS_ERR(c->chardev)) -+ return PTR_ERR(c->chardev); -+ -+ return 0; -+} -+ -+void bch2_chardev_exit(void) -+{ -+ if (!IS_ERR_OR_NULL(bch_chardev_class)) -+ device_destroy(bch_chardev_class, -+ MKDEV(bch_chardev_major, U8_MAX)); -+ if (!IS_ERR_OR_NULL(bch_chardev_class)) -+ class_destroy(bch_chardev_class); -+ if (bch_chardev_major > 0) -+ unregister_chrdev(bch_chardev_major, "bcachefs"); -+} -+ -+int __init bch2_chardev_init(void) -+{ -+ bch_chardev_major = register_chrdev(0, "bcachefs-ctl", &bch_chardev_fops); -+ if 
(bch_chardev_major < 0) -+ return bch_chardev_major; -+ -+ bch_chardev_class = class_create(THIS_MODULE, "bcachefs"); -+ if (IS_ERR(bch_chardev_class)) -+ return PTR_ERR(bch_chardev_class); -+ -+ bch_chardev = device_create(bch_chardev_class, NULL, -+ MKDEV(bch_chardev_major, U8_MAX), -+ NULL, "bcachefs-ctl"); -+ if (IS_ERR(bch_chardev)) -+ return PTR_ERR(bch_chardev); -+ -+ return 0; -+} -+ -+#endif /* NO_BCACHEFS_CHARDEV */ -diff --git a/fs/bcachefs/chardev.h b/fs/bcachefs/chardev.h -new file mode 100644 -index 000000000000..3a4890d39ff9 ---- /dev/null -+++ b/fs/bcachefs/chardev.h -@@ -0,0 +1,31 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_CHARDEV_H -+#define _BCACHEFS_CHARDEV_H -+ -+#ifndef NO_BCACHEFS_FS -+ -+long bch2_fs_ioctl(struct bch_fs *, unsigned, void __user *); -+ -+void bch2_fs_chardev_exit(struct bch_fs *); -+int bch2_fs_chardev_init(struct bch_fs *); -+ -+void bch2_chardev_exit(void); -+int __init bch2_chardev_init(void); -+ -+#else -+ -+static inline long bch2_fs_ioctl(struct bch_fs *c, -+ unsigned cmd, void __user * arg) -+{ -+ return -ENOSYS; -+} -+ -+static inline void bch2_fs_chardev_exit(struct bch_fs *c) {} -+static inline int bch2_fs_chardev_init(struct bch_fs *c) { return 0; } -+ -+static inline void bch2_chardev_exit(void) {} -+static inline int __init bch2_chardev_init(void) { return 0; } -+ -+#endif /* NO_BCACHEFS_FS */ -+ -+#endif /* _BCACHEFS_CHARDEV_H */ -diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c -new file mode 100644 -index 000000000000..3d88719ba86c ---- /dev/null -+++ b/fs/bcachefs/checksum.c -@@ -0,0 +1,618 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "checksum.h" -+#include "super.h" -+#include "super-io.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+static u64 bch2_checksum_init(unsigned type) -+{ -+ switch (type) { -+ case BCH_CSUM_NONE: -+ return 0; -+ case 
BCH_CSUM_CRC32C_NONZERO: -+ return U32_MAX; -+ case BCH_CSUM_CRC64_NONZERO: -+ return U64_MAX; -+ case BCH_CSUM_CRC32C: -+ return 0; -+ case BCH_CSUM_CRC64: -+ return 0; -+ default: -+ BUG(); -+ } -+} -+ -+static u64 bch2_checksum_final(unsigned type, u64 crc) -+{ -+ switch (type) { -+ case BCH_CSUM_NONE: -+ return 0; -+ case BCH_CSUM_CRC32C_NONZERO: -+ return crc ^ U32_MAX; -+ case BCH_CSUM_CRC64_NONZERO: -+ return crc ^ U64_MAX; -+ case BCH_CSUM_CRC32C: -+ return crc; -+ case BCH_CSUM_CRC64: -+ return crc; -+ default: -+ BUG(); -+ } -+} -+ -+static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t len) -+{ -+ switch (type) { -+ case BCH_CSUM_NONE: -+ return 0; -+ case BCH_CSUM_CRC32C_NONZERO: -+ case BCH_CSUM_CRC32C: -+ return crc32c(crc, data, len); -+ case BCH_CSUM_CRC64_NONZERO: -+ case BCH_CSUM_CRC64: -+ return crc64_be(crc, data, len); -+ default: -+ BUG(); -+ } -+} -+ -+static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm, -+ struct nonce nonce, -+ struct scatterlist *sg, size_t len) -+{ -+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); -+ int ret; -+ -+ skcipher_request_set_sync_tfm(req, tfm); -+ skcipher_request_set_crypt(req, sg, sg, len, nonce.d); -+ -+ ret = crypto_skcipher_encrypt(req); -+ BUG_ON(ret); -+} -+ -+static inline void do_encrypt(struct crypto_sync_skcipher *tfm, -+ struct nonce nonce, -+ void *buf, size_t len) -+{ -+ struct scatterlist sg; -+ -+ sg_init_one(&sg, buf, len); -+ do_encrypt_sg(tfm, nonce, &sg, len); -+} -+ -+int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, -+ void *buf, size_t len) -+{ -+ struct crypto_sync_skcipher *chacha20 = -+ crypto_alloc_sync_skcipher("chacha20", 0, 0); -+ int ret; -+ -+ if (!chacha20) { -+ pr_err("error requesting chacha20 module: %li", PTR_ERR(chacha20)); -+ return PTR_ERR(chacha20); -+ } -+ -+ ret = crypto_skcipher_setkey(&chacha20->base, -+ (void *) key, sizeof(*key)); -+ if (ret) { -+ pr_err("crypto_skcipher_setkey() error: %i", ret); -+ goto err; 
-+ } -+ -+ do_encrypt(chacha20, nonce, buf, len); -+err: -+ crypto_free_sync_skcipher(chacha20); -+ return ret; -+} -+ -+static void gen_poly_key(struct bch_fs *c, struct shash_desc *desc, -+ struct nonce nonce) -+{ -+ u8 key[POLY1305_KEY_SIZE]; -+ -+ nonce.d[3] ^= BCH_NONCE_POLY; -+ -+ memset(key, 0, sizeof(key)); -+ do_encrypt(c->chacha20, nonce, key, sizeof(key)); -+ -+ desc->tfm = c->poly1305; -+ crypto_shash_init(desc); -+ crypto_shash_update(desc, key, sizeof(key)); -+} -+ -+struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, -+ struct nonce nonce, const void *data, size_t len) -+{ -+ switch (type) { -+ case BCH_CSUM_NONE: -+ case BCH_CSUM_CRC32C_NONZERO: -+ case BCH_CSUM_CRC64_NONZERO: -+ case BCH_CSUM_CRC32C: -+ case BCH_CSUM_CRC64: { -+ u64 crc = bch2_checksum_init(type); -+ -+ crc = bch2_checksum_update(type, crc, data, len); -+ crc = bch2_checksum_final(type, crc); -+ -+ return (struct bch_csum) { .lo = cpu_to_le64(crc) }; -+ } -+ -+ case BCH_CSUM_CHACHA20_POLY1305_80: -+ case BCH_CSUM_CHACHA20_POLY1305_128: { -+ SHASH_DESC_ON_STACK(desc, c->poly1305); -+ u8 digest[POLY1305_DIGEST_SIZE]; -+ struct bch_csum ret = { 0 }; -+ -+ gen_poly_key(c, desc, nonce); -+ -+ crypto_shash_update(desc, data, len); -+ crypto_shash_final(desc, digest); -+ -+ memcpy(&ret, digest, bch_crc_bytes[type]); -+ return ret; -+ } -+ default: -+ BUG(); -+ } -+} -+ -+void bch2_encrypt(struct bch_fs *c, unsigned type, -+ struct nonce nonce, void *data, size_t len) -+{ -+ if (!bch2_csum_type_is_encryption(type)) -+ return; -+ -+ do_encrypt(c->chacha20, nonce, data, len); -+} -+ -+static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, -+ struct nonce nonce, struct bio *bio, -+ struct bvec_iter *iter) -+{ -+ struct bio_vec bv; -+ -+ switch (type) { -+ case BCH_CSUM_NONE: -+ return (struct bch_csum) { 0 }; -+ case BCH_CSUM_CRC32C_NONZERO: -+ case BCH_CSUM_CRC64_NONZERO: -+ case BCH_CSUM_CRC32C: -+ case BCH_CSUM_CRC64: { -+ u64 crc = bch2_checksum_init(type); 
-+ -+#ifdef CONFIG_HIGHMEM -+ __bio_for_each_segment(bv, bio, *iter, *iter) { -+ void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; -+ crc = bch2_checksum_update(type, -+ crc, p, bv.bv_len); -+ kunmap_atomic(p); -+ } -+#else -+ __bio_for_each_bvec(bv, bio, *iter, *iter) -+ crc = bch2_checksum_update(type, crc, -+ page_address(bv.bv_page) + bv.bv_offset, -+ bv.bv_len); -+#endif -+ crc = bch2_checksum_final(type, crc); -+ return (struct bch_csum) { .lo = cpu_to_le64(crc) }; -+ } -+ -+ case BCH_CSUM_CHACHA20_POLY1305_80: -+ case BCH_CSUM_CHACHA20_POLY1305_128: { -+ SHASH_DESC_ON_STACK(desc, c->poly1305); -+ u8 digest[POLY1305_DIGEST_SIZE]; -+ struct bch_csum ret = { 0 }; -+ -+ gen_poly_key(c, desc, nonce); -+ -+#ifdef CONFIG_HIGHMEM -+ __bio_for_each_segment(bv, bio, *iter, *iter) { -+ void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; -+ -+ crypto_shash_update(desc, p, bv.bv_len); -+ kunmap_atomic(p); -+ } -+#else -+ __bio_for_each_bvec(bv, bio, *iter, *iter) -+ crypto_shash_update(desc, -+ page_address(bv.bv_page) + bv.bv_offset, -+ bv.bv_len); -+#endif -+ crypto_shash_final(desc, digest); -+ -+ memcpy(&ret, digest, bch_crc_bytes[type]); -+ return ret; -+ } -+ default: -+ BUG(); -+ } -+} -+ -+struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type, -+ struct nonce nonce, struct bio *bio) -+{ -+ struct bvec_iter iter = bio->bi_iter; -+ -+ return __bch2_checksum_bio(c, type, nonce, bio, &iter); -+} -+ -+void bch2_encrypt_bio(struct bch_fs *c, unsigned type, -+ struct nonce nonce, struct bio *bio) -+{ -+ struct bio_vec bv; -+ struct bvec_iter iter; -+ struct scatterlist sgl[16], *sg = sgl; -+ size_t bytes = 0; -+ -+ if (!bch2_csum_type_is_encryption(type)) -+ return; -+ -+ sg_init_table(sgl, ARRAY_SIZE(sgl)); -+ -+ bio_for_each_segment(bv, bio, iter) { -+ if (sg == sgl + ARRAY_SIZE(sgl)) { -+ sg_mark_end(sg - 1); -+ do_encrypt_sg(c->chacha20, nonce, sgl, bytes); -+ -+ nonce = nonce_add(nonce, bytes); -+ bytes = 0; -+ -+ sg_init_table(sgl, ARRAY_SIZE(sgl)); -+ 
sg = sgl; -+ } -+ -+ sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset); -+ bytes += bv.bv_len; -+ } -+ -+ sg_mark_end(sg - 1); -+ do_encrypt_sg(c->chacha20, nonce, sgl, bytes); -+} -+ -+struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a, -+ struct bch_csum b, size_t b_len) -+{ -+ BUG_ON(!bch2_checksum_mergeable(type)); -+ -+ while (b_len) { -+ unsigned b = min_t(unsigned, b_len, PAGE_SIZE); -+ -+ a.lo = bch2_checksum_update(type, a.lo, -+ page_address(ZERO_PAGE(0)), b); -+ b_len -= b; -+ } -+ -+ a.lo ^= b.lo; -+ a.hi ^= b.hi; -+ return a; -+} -+ -+int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, -+ struct bversion version, -+ struct bch_extent_crc_unpacked crc_old, -+ struct bch_extent_crc_unpacked *crc_a, -+ struct bch_extent_crc_unpacked *crc_b, -+ unsigned len_a, unsigned len_b, -+ unsigned new_csum_type) -+{ -+ struct bvec_iter iter = bio->bi_iter; -+ struct nonce nonce = extent_nonce(version, crc_old); -+ struct bch_csum merged = { 0 }; -+ struct crc_split { -+ struct bch_extent_crc_unpacked *crc; -+ unsigned len; -+ unsigned csum_type; -+ struct bch_csum csum; -+ } splits[3] = { -+ { crc_a, len_a, new_csum_type }, -+ { crc_b, len_b, new_csum_type }, -+ { NULL, bio_sectors(bio) - len_a - len_b, new_csum_type }, -+ }, *i; -+ bool mergeable = crc_old.csum_type == new_csum_type && -+ bch2_checksum_mergeable(new_csum_type); -+ unsigned crc_nonce = crc_old.nonce; -+ -+ BUG_ON(len_a + len_b > bio_sectors(bio)); -+ BUG_ON(crc_old.uncompressed_size != bio_sectors(bio)); -+ BUG_ON(crc_is_compressed(crc_old)); -+ BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) != -+ bch2_csum_type_is_encryption(new_csum_type)); -+ -+ for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { -+ iter.bi_size = i->len << 9; -+ if (mergeable || i->crc) -+ i->csum = __bch2_checksum_bio(c, i->csum_type, -+ nonce, bio, &iter); -+ else -+ bio_advance_iter(bio, &iter, i->len << 9); -+ nonce = nonce_add(nonce, i->len << 9); -+ } -+ -+ if (mergeable) -+ for 
(i = splits; i < splits + ARRAY_SIZE(splits); i++) -+ merged = bch2_checksum_merge(new_csum_type, merged, -+ i->csum, i->len << 9); -+ else -+ merged = bch2_checksum_bio(c, crc_old.csum_type, -+ extent_nonce(version, crc_old), bio); -+ -+ if (bch2_crc_cmp(merged, crc_old.csum)) -+ return -EIO; -+ -+ for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { -+ if (i->crc) -+ *i->crc = (struct bch_extent_crc_unpacked) { -+ .csum_type = i->csum_type, -+ .compression_type = crc_old.compression_type, -+ .compressed_size = i->len, -+ .uncompressed_size = i->len, -+ .offset = 0, -+ .live_size = i->len, -+ .nonce = crc_nonce, -+ .csum = i->csum, -+ }; -+ -+ if (bch2_csum_type_is_encryption(new_csum_type)) -+ crc_nonce += i->len; -+ } -+ -+ return 0; -+} -+ -+#ifdef __KERNEL__ -+int bch2_request_key(struct bch_sb *sb, struct bch_key *key) -+{ -+ char key_description[60]; -+ struct key *keyring_key; -+ const struct user_key_payload *ukp; -+ int ret; -+ -+ snprintf(key_description, sizeof(key_description), -+ "bcachefs:%pUb", &sb->user_uuid); -+ -+ keyring_key = request_key(&key_type_logon, key_description, NULL); -+ if (IS_ERR(keyring_key)) -+ return PTR_ERR(keyring_key); -+ -+ down_read(&keyring_key->sem); -+ ukp = dereference_key_locked(keyring_key); -+ if (ukp->datalen == sizeof(*key)) { -+ memcpy(key, ukp->data, ukp->datalen); -+ ret = 0; -+ } else { -+ ret = -EINVAL; -+ } -+ up_read(&keyring_key->sem); -+ key_put(keyring_key); -+ -+ return ret; -+} -+#else -+#include -+#include -+ -+int bch2_request_key(struct bch_sb *sb, struct bch_key *key) -+{ -+ key_serial_t key_id; -+ char key_description[60]; -+ char uuid[40]; -+ -+ uuid_unparse_lower(sb->user_uuid.b, uuid); -+ sprintf(key_description, "bcachefs:%s", uuid); -+ -+ key_id = request_key("user", key_description, NULL, -+ KEY_SPEC_USER_KEYRING); -+ if (key_id < 0) -+ return -errno; -+ -+ if (keyctl_read(key_id, (void *) key, sizeof(*key)) != sizeof(*key)) -+ return -1; -+ -+ return 0; -+} -+#endif -+ -+int 
bch2_decrypt_sb_key(struct bch_fs *c, -+ struct bch_sb_field_crypt *crypt, -+ struct bch_key *key) -+{ -+ struct bch_encrypted_key sb_key = crypt->key; -+ struct bch_key user_key; -+ int ret = 0; -+ -+ /* is key encrypted? */ -+ if (!bch2_key_is_encrypted(&sb_key)) -+ goto out; -+ -+ ret = bch2_request_key(c->disk_sb.sb, &user_key); -+ if (ret) { -+ bch_err(c, "error requesting encryption key: %i", ret); -+ goto err; -+ } -+ -+ /* decrypt real key: */ -+ ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c), -+ &sb_key, sizeof(sb_key)); -+ if (ret) -+ goto err; -+ -+ if (bch2_key_is_encrypted(&sb_key)) { -+ bch_err(c, "incorrect encryption key"); -+ ret = -EINVAL; -+ goto err; -+ } -+out: -+ *key = sb_key.key; -+err: -+ memzero_explicit(&sb_key, sizeof(sb_key)); -+ memzero_explicit(&user_key, sizeof(user_key)); -+ return ret; -+} -+ -+static int bch2_alloc_ciphers(struct bch_fs *c) -+{ -+ if (!c->chacha20) -+ c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0); -+ if (IS_ERR(c->chacha20)) { -+ bch_err(c, "error requesting chacha20 module: %li", -+ PTR_ERR(c->chacha20)); -+ return PTR_ERR(c->chacha20); -+ } -+ -+ if (!c->poly1305) -+ c->poly1305 = crypto_alloc_shash("poly1305", 0, 0); -+ if (IS_ERR(c->poly1305)) { -+ bch_err(c, "error requesting poly1305 module: %li", -+ PTR_ERR(c->poly1305)); -+ return PTR_ERR(c->poly1305); -+ } -+ -+ return 0; -+} -+ -+int bch2_disable_encryption(struct bch_fs *c) -+{ -+ struct bch_sb_field_crypt *crypt; -+ struct bch_key key; -+ int ret = -EINVAL; -+ -+ mutex_lock(&c->sb_lock); -+ -+ crypt = bch2_sb_get_crypt(c->disk_sb.sb); -+ if (!crypt) -+ goto out; -+ -+ /* is key encrypted? 
*/ -+ ret = 0; -+ if (bch2_key_is_encrypted(&crypt->key)) -+ goto out; -+ -+ ret = bch2_decrypt_sb_key(c, crypt, &key); -+ if (ret) -+ goto out; -+ -+ crypt->key.magic = BCH_KEY_MAGIC; -+ crypt->key.key = key; -+ -+ SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0); -+ bch2_write_super(c); -+out: -+ mutex_unlock(&c->sb_lock); -+ -+ return ret; -+} -+ -+int bch2_enable_encryption(struct bch_fs *c, bool keyed) -+{ -+ struct bch_encrypted_key key; -+ struct bch_key user_key; -+ struct bch_sb_field_crypt *crypt; -+ int ret = -EINVAL; -+ -+ mutex_lock(&c->sb_lock); -+ -+ /* Do we already have an encryption key? */ -+ if (bch2_sb_get_crypt(c->disk_sb.sb)) -+ goto err; -+ -+ ret = bch2_alloc_ciphers(c); -+ if (ret) -+ goto err; -+ -+ key.magic = BCH_KEY_MAGIC; -+ get_random_bytes(&key.key, sizeof(key.key)); -+ -+ if (keyed) { -+ ret = bch2_request_key(c->disk_sb.sb, &user_key); -+ if (ret) { -+ bch_err(c, "error requesting encryption key: %i", ret); -+ goto err; -+ } -+ -+ ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c), -+ &key, sizeof(key)); -+ if (ret) -+ goto err; -+ } -+ -+ ret = crypto_skcipher_setkey(&c->chacha20->base, -+ (void *) &key.key, sizeof(key.key)); -+ if (ret) -+ goto err; -+ -+ crypt = bch2_sb_resize_crypt(&c->disk_sb, sizeof(*crypt) / sizeof(u64)); -+ if (!crypt) { -+ ret = -ENOMEM; /* XXX this technically could be -ENOSPC */ -+ goto err; -+ } -+ -+ crypt->key = key; -+ -+ /* write superblock */ -+ SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 1); -+ bch2_write_super(c); -+err: -+ mutex_unlock(&c->sb_lock); -+ memzero_explicit(&user_key, sizeof(user_key)); -+ memzero_explicit(&key, sizeof(key)); -+ return ret; -+} -+ -+void bch2_fs_encryption_exit(struct bch_fs *c) -+{ -+ if (!IS_ERR_OR_NULL(c->poly1305)) -+ crypto_free_shash(c->poly1305); -+ if (!IS_ERR_OR_NULL(c->chacha20)) -+ crypto_free_sync_skcipher(c->chacha20); -+ if (!IS_ERR_OR_NULL(c->sha256)) -+ crypto_free_shash(c->sha256); -+} -+ -+int bch2_fs_encryption_init(struct bch_fs *c) -+{ -+ 
struct bch_sb_field_crypt *crypt; -+ struct bch_key key; -+ int ret = 0; -+ -+ pr_verbose_init(c->opts, ""); -+ -+ c->sha256 = crypto_alloc_shash("sha256", 0, 0); -+ if (IS_ERR(c->sha256)) { -+ bch_err(c, "error requesting sha256 module"); -+ ret = PTR_ERR(c->sha256); -+ goto out; -+ } -+ -+ crypt = bch2_sb_get_crypt(c->disk_sb.sb); -+ if (!crypt) -+ goto out; -+ -+ ret = bch2_alloc_ciphers(c); -+ if (ret) -+ goto out; -+ -+ ret = bch2_decrypt_sb_key(c, crypt, &key); -+ if (ret) -+ goto out; -+ -+ ret = crypto_skcipher_setkey(&c->chacha20->base, -+ (void *) &key.key, sizeof(key.key)); -+ if (ret) -+ goto out; -+out: -+ memzero_explicit(&key, sizeof(key)); -+ pr_verbose_init(c->opts, "ret %i", ret); -+ return ret; -+} -diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h -new file mode 100644 -index 000000000000..24dee8039d57 ---- /dev/null -+++ b/fs/bcachefs/checksum.h -@@ -0,0 +1,202 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_CHECKSUM_H -+#define _BCACHEFS_CHECKSUM_H -+ -+#include "bcachefs.h" -+#include "extents_types.h" -+#include "super-io.h" -+ -+#include -+#include -+ -+static inline bool bch2_checksum_mergeable(unsigned type) -+{ -+ -+ switch (type) { -+ case BCH_CSUM_NONE: -+ case BCH_CSUM_CRC32C: -+ case BCH_CSUM_CRC64: -+ return true; -+ default: -+ return false; -+ } -+} -+ -+struct bch_csum bch2_checksum_merge(unsigned, struct bch_csum, -+ struct bch_csum, size_t); -+ -+#define BCH_NONCE_EXTENT cpu_to_le32(1 << 28) -+#define BCH_NONCE_BTREE cpu_to_le32(2 << 28) -+#define BCH_NONCE_JOURNAL cpu_to_le32(3 << 28) -+#define BCH_NONCE_PRIO cpu_to_le32(4 << 28) -+#define BCH_NONCE_POLY cpu_to_le32(1 << 31) -+ -+struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce, -+ const void *, size_t); -+ -+/* -+ * This is used for various on disk data structures - bch_sb, prio_set, bset, -+ * jset: The checksum is _always_ the first field of these structs -+ */ -+#define csum_vstruct(_c, _type, _nonce, _i) \ -+({ \ -+ const 
void *start = ((const void *) (_i)) + sizeof((_i)->csum); \ -+ const void *end = vstruct_end(_i); \ -+ \ -+ bch2_checksum(_c, _type, _nonce, start, end - start); \ -+}) -+ -+int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t); -+int bch2_request_key(struct bch_sb *, struct bch_key *); -+ -+void bch2_encrypt(struct bch_fs *, unsigned, struct nonce, -+ void *data, size_t); -+ -+struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned, -+ struct nonce, struct bio *); -+ -+int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion, -+ struct bch_extent_crc_unpacked, -+ struct bch_extent_crc_unpacked *, -+ struct bch_extent_crc_unpacked *, -+ unsigned, unsigned, unsigned); -+ -+void bch2_encrypt_bio(struct bch_fs *, unsigned, -+ struct nonce, struct bio *); -+ -+int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *, -+ struct bch_key *); -+ -+int bch2_disable_encryption(struct bch_fs *); -+int bch2_enable_encryption(struct bch_fs *, bool); -+ -+void bch2_fs_encryption_exit(struct bch_fs *); -+int bch2_fs_encryption_init(struct bch_fs *); -+ -+static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type, -+ bool data) -+{ -+ switch (type) { -+ case BCH_CSUM_OPT_NONE: -+ return BCH_CSUM_NONE; -+ case BCH_CSUM_OPT_CRC32C: -+ return data ? BCH_CSUM_CRC32C : BCH_CSUM_CRC32C_NONZERO; -+ case BCH_CSUM_OPT_CRC64: -+ return data ? BCH_CSUM_CRC64 : BCH_CSUM_CRC64_NONZERO; -+ default: -+ BUG(); -+ } -+} -+ -+static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c, -+ unsigned opt) -+{ -+ if (c->sb.encryption_type) -+ return c->opts.wide_macs -+ ? 
BCH_CSUM_CHACHA20_POLY1305_128 -+ : BCH_CSUM_CHACHA20_POLY1305_80; -+ -+ return bch2_csum_opt_to_type(opt, true); -+} -+ -+static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c) -+{ -+ if (c->sb.encryption_type) -+ return BCH_CSUM_CHACHA20_POLY1305_128; -+ -+ return bch2_csum_opt_to_type(c->opts.metadata_checksum, false); -+} -+ -+static const unsigned bch2_compression_opt_to_type[] = { -+#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t, -+ BCH_COMPRESSION_OPTS() -+#undef x -+}; -+ -+static inline bool bch2_checksum_type_valid(const struct bch_fs *c, -+ unsigned type) -+{ -+ if (type >= BCH_CSUM_NR) -+ return false; -+ -+ if (bch2_csum_type_is_encryption(type) && !c->chacha20) -+ return false; -+ -+ return true; -+} -+ -+/* returns true if not equal */ -+static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r) -+{ -+ /* -+ * XXX: need some way of preventing the compiler from optimizing this -+ * into a form that isn't constant time.. -+ */ -+ return ((l.lo ^ r.lo) | (l.hi ^ r.hi)) != 0; -+} -+ -+/* for skipping ahead and encrypting/decrypting at an offset: */ -+static inline struct nonce nonce_add(struct nonce nonce, unsigned offset) -+{ -+ EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1)); -+ -+ le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE); -+ return nonce; -+} -+ -+static inline struct nonce null_nonce(void) -+{ -+ struct nonce ret; -+ -+ memset(&ret, 0, sizeof(ret)); -+ return ret; -+} -+ -+static inline struct nonce extent_nonce(struct bversion version, -+ struct bch_extent_crc_unpacked crc) -+{ -+ unsigned compression_type = crc_is_compressed(crc) -+ ? crc.compression_type -+ : 0; -+ unsigned size = compression_type ? 
crc.uncompressed_size : 0; -+ struct nonce nonce = (struct nonce) {{ -+ [0] = cpu_to_le32(size << 22), -+ [1] = cpu_to_le32(version.lo), -+ [2] = cpu_to_le32(version.lo >> 32), -+ [3] = cpu_to_le32(version.hi| -+ (compression_type << 24))^BCH_NONCE_EXTENT, -+ }}; -+ -+ return nonce_add(nonce, crc.nonce << 9); -+} -+ -+static inline bool bch2_key_is_encrypted(struct bch_encrypted_key *key) -+{ -+ return le64_to_cpu(key->magic) != BCH_KEY_MAGIC; -+} -+ -+static inline struct nonce __bch2_sb_key_nonce(struct bch_sb *sb) -+{ -+ __le64 magic = __bch2_sb_magic(sb); -+ -+ return (struct nonce) {{ -+ [0] = 0, -+ [1] = 0, -+ [2] = ((__le32 *) &magic)[0], -+ [3] = ((__le32 *) &magic)[1], -+ }}; -+} -+ -+static inline struct nonce bch2_sb_key_nonce(struct bch_fs *c) -+{ -+ __le64 magic = bch2_sb_magic(c); -+ -+ return (struct nonce) {{ -+ [0] = 0, -+ [1] = 0, -+ [2] = ((__le32 *) &magic)[0], -+ [3] = ((__le32 *) &magic)[1], -+ }}; -+} -+ -+#endif /* _BCACHEFS_CHECKSUM_H */ -diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c -new file mode 100644 -index 000000000000..a9f5d5696622 ---- /dev/null -+++ b/fs/bcachefs/clock.c -@@ -0,0 +1,194 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "clock.h" -+ -+#include -+#include -+#include -+ -+static inline long io_timer_cmp(io_timer_heap *h, -+ struct io_timer *l, -+ struct io_timer *r) -+{ -+ return l->expire - r->expire; -+} -+ -+void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) -+{ -+ size_t i; -+ -+ spin_lock(&clock->timer_lock); -+ -+ if (time_after_eq((unsigned long) atomic_long_read(&clock->now), -+ timer->expire)) { -+ spin_unlock(&clock->timer_lock); -+ timer->fn(timer); -+ return; -+ } -+ -+ for (i = 0; i < clock->timers.used; i++) -+ if (clock->timers.data[i] == timer) -+ goto out; -+ -+ BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp, NULL)); -+out: -+ spin_unlock(&clock->timer_lock); -+} -+ -+void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer) -+{ 
-+ size_t i; -+ -+ spin_lock(&clock->timer_lock); -+ -+ for (i = 0; i < clock->timers.used; i++) -+ if (clock->timers.data[i] == timer) { -+ heap_del(&clock->timers, i, io_timer_cmp, NULL); -+ break; -+ } -+ -+ spin_unlock(&clock->timer_lock); -+} -+ -+struct io_clock_wait { -+ struct io_timer io_timer; -+ struct timer_list cpu_timer; -+ struct task_struct *task; -+ int expired; -+}; -+ -+static void io_clock_wait_fn(struct io_timer *timer) -+{ -+ struct io_clock_wait *wait = container_of(timer, -+ struct io_clock_wait, io_timer); -+ -+ wait->expired = 1; -+ wake_up_process(wait->task); -+} -+ -+static void io_clock_cpu_timeout(struct timer_list *timer) -+{ -+ struct io_clock_wait *wait = container_of(timer, -+ struct io_clock_wait, cpu_timer); -+ -+ wait->expired = 1; -+ wake_up_process(wait->task); -+} -+ -+void bch2_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until) -+{ -+ struct io_clock_wait wait; -+ -+ /* XXX: calculate sleep time rigorously */ -+ wait.io_timer.expire = until; -+ wait.io_timer.fn = io_clock_wait_fn; -+ wait.task = current; -+ wait.expired = 0; -+ bch2_io_timer_add(clock, &wait.io_timer); -+ -+ schedule(); -+ -+ bch2_io_timer_del(clock, &wait.io_timer); -+} -+ -+void bch2_kthread_io_clock_wait(struct io_clock *clock, -+ unsigned long io_until, -+ unsigned long cpu_timeout) -+{ -+ bool kthread = (current->flags & PF_KTHREAD) != 0; -+ struct io_clock_wait wait; -+ -+ wait.io_timer.expire = io_until; -+ wait.io_timer.fn = io_clock_wait_fn; -+ wait.task = current; -+ wait.expired = 0; -+ bch2_io_timer_add(clock, &wait.io_timer); -+ -+ timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0); -+ -+ if (cpu_timeout != MAX_SCHEDULE_TIMEOUT) -+ mod_timer(&wait.cpu_timer, cpu_timeout + jiffies); -+ -+ while (1) { -+ set_current_state(TASK_INTERRUPTIBLE); -+ if (kthread && kthread_should_stop()) -+ break; -+ -+ if (wait.expired) -+ break; -+ -+ schedule(); -+ try_to_freeze(); -+ } -+ -+ __set_current_state(TASK_RUNNING); -+ 
del_singleshot_timer_sync(&wait.cpu_timer); -+ destroy_timer_on_stack(&wait.cpu_timer); -+ bch2_io_timer_del(clock, &wait.io_timer); -+} -+ -+static struct io_timer *get_expired_timer(struct io_clock *clock, -+ unsigned long now) -+{ -+ struct io_timer *ret = NULL; -+ -+ spin_lock(&clock->timer_lock); -+ -+ if (clock->timers.used && -+ time_after_eq(now, clock->timers.data[0]->expire)) -+ heap_pop(&clock->timers, ret, io_timer_cmp, NULL); -+ -+ spin_unlock(&clock->timer_lock); -+ -+ return ret; -+} -+ -+void __bch2_increment_clock(struct io_clock *clock, unsigned sectors) -+{ -+ struct io_timer *timer; -+ unsigned long now = atomic_long_add_return(sectors, &clock->now); -+ -+ while ((timer = get_expired_timer(clock, now))) -+ timer->fn(timer); -+} -+ -+ssize_t bch2_io_timers_show(struct io_clock *clock, char *buf) -+{ -+ struct printbuf out = _PBUF(buf, PAGE_SIZE); -+ unsigned long now; -+ unsigned i; -+ -+ spin_lock(&clock->timer_lock); -+ now = atomic_long_read(&clock->now); -+ -+ for (i = 0; i < clock->timers.used; i++) -+ pr_buf(&out, "%ps:\t%li\n", -+ clock->timers.data[i]->fn, -+ clock->timers.data[i]->expire - now); -+ spin_unlock(&clock->timer_lock); -+ -+ return out.pos - buf; -+} -+ -+void bch2_io_clock_exit(struct io_clock *clock) -+{ -+ free_heap(&clock->timers); -+ free_percpu(clock->pcpu_buf); -+} -+ -+int bch2_io_clock_init(struct io_clock *clock) -+{ -+ atomic_long_set(&clock->now, 0); -+ spin_lock_init(&clock->timer_lock); -+ -+ clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus(); -+ -+ clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf); -+ if (!clock->pcpu_buf) -+ return -ENOMEM; -+ -+ if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL)) -+ return -ENOMEM; -+ -+ return 0; -+} -diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h -new file mode 100644 -index 000000000000..da50afe206cc ---- /dev/null -+++ b/fs/bcachefs/clock.h -@@ -0,0 +1,38 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_CLOCK_H -+#define 
_BCACHEFS_CLOCK_H -+ -+void bch2_io_timer_add(struct io_clock *, struct io_timer *); -+void bch2_io_timer_del(struct io_clock *, struct io_timer *); -+void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long, -+ unsigned long); -+ -+void __bch2_increment_clock(struct io_clock *, unsigned); -+ -+static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors, -+ int rw) -+{ -+ struct io_clock *clock = &c->io_clock[rw]; -+ -+ if (unlikely(this_cpu_add_return(*clock->pcpu_buf, sectors) >= -+ IO_CLOCK_PCPU_SECTORS)) -+ __bch2_increment_clock(clock, this_cpu_xchg(*clock->pcpu_buf, 0)); -+} -+ -+void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long); -+ -+#define bch2_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\ -+({ \ -+ long __ret = timeout; \ -+ might_sleep(); \ -+ if (!___wait_cond_timeout(condition)) \ -+ __ret = __wait_event_timeout(wq, condition, timeout); \ -+ __ret; \ -+}) -+ -+ssize_t bch2_io_timers_show(struct io_clock *, char *); -+ -+void bch2_io_clock_exit(struct io_clock *); -+int bch2_io_clock_init(struct io_clock *); -+ -+#endif /* _BCACHEFS_CLOCK_H */ -diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h -new file mode 100644 -index 000000000000..92c740a47565 ---- /dev/null -+++ b/fs/bcachefs/clock_types.h -@@ -0,0 +1,37 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_CLOCK_TYPES_H -+#define _BCACHEFS_CLOCK_TYPES_H -+ -+#include "util.h" -+ -+#define NR_IO_TIMERS (BCH_SB_MEMBERS_MAX * 3) -+ -+/* -+ * Clocks/timers in units of sectors of IO: -+ * -+ * Note - they use percpu batching, so they're only approximate. 
-+ */ -+ -+struct io_timer; -+typedef void (*io_timer_fn)(struct io_timer *); -+ -+struct io_timer { -+ io_timer_fn fn; -+ unsigned long expire; -+}; -+ -+/* Amount to buffer up on a percpu counter */ -+#define IO_CLOCK_PCPU_SECTORS 128 -+ -+typedef HEAP(struct io_timer *) io_timer_heap; -+ -+struct io_clock { -+ atomic_long_t now; -+ u16 __percpu *pcpu_buf; -+ unsigned max_slop; -+ -+ spinlock_t timer_lock; -+ io_timer_heap timers; -+}; -+ -+#endif /* _BCACHEFS_CLOCK_TYPES_H */ -diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c -new file mode 100644 -index 000000000000..3d75527d2d81 ---- /dev/null -+++ b/fs/bcachefs/compress.c -@@ -0,0 +1,633 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "checksum.h" -+#include "compress.h" -+#include "extents.h" -+#include "io.h" -+#include "super-io.h" -+ -+#include -+#include -+#include -+#include -+ -+/* Bounce buffer: */ -+struct bbuf { -+ void *b; -+ enum { -+ BB_NONE, -+ BB_VMAP, -+ BB_KMALLOC, -+ BB_MEMPOOL, -+ } type; -+ int rw; -+}; -+ -+static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw) -+{ -+ void *b; -+ -+ BUG_ON(size > c->sb.encoded_extent_max << 9); -+ -+ b = kmalloc(size, GFP_NOIO|__GFP_NOWARN); -+ if (b) -+ return (struct bbuf) { .b = b, .type = BB_KMALLOC, .rw = rw }; -+ -+ b = mempool_alloc(&c->compression_bounce[rw], GFP_NOIO); -+ if (b) -+ return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw }; -+ -+ BUG(); -+} -+ -+static bool bio_phys_contig(struct bio *bio, struct bvec_iter start) -+{ -+ struct bio_vec bv; -+ struct bvec_iter iter; -+ void *expected_start = NULL; -+ -+ __bio_for_each_bvec(bv, bio, iter, start) { -+ if (expected_start && -+ expected_start != page_address(bv.bv_page) + bv.bv_offset) -+ return false; -+ -+ expected_start = page_address(bv.bv_page) + -+ bv.bv_offset + bv.bv_len; -+ } -+ -+ return true; -+} -+ -+static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, -+ struct bvec_iter start, int rw) -+{ 
-+ struct bbuf ret; -+ struct bio_vec bv; -+ struct bvec_iter iter; -+ unsigned nr_pages = 0, flags; -+ struct page *stack_pages[16]; -+ struct page **pages = NULL; -+ void *data; -+ -+ BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max); -+ -+ if (!IS_ENABLED(CONFIG_HIGHMEM) && -+ bio_phys_contig(bio, start)) -+ return (struct bbuf) { -+ .b = page_address(bio_iter_page(bio, start)) + -+ bio_iter_offset(bio, start), -+ .type = BB_NONE, .rw = rw -+ }; -+ -+ /* check if we can map the pages contiguously: */ -+ __bio_for_each_segment(bv, bio, iter, start) { -+ if (iter.bi_size != start.bi_size && -+ bv.bv_offset) -+ goto bounce; -+ -+ if (bv.bv_len < iter.bi_size && -+ bv.bv_offset + bv.bv_len < PAGE_SIZE) -+ goto bounce; -+ -+ nr_pages++; -+ } -+ -+ BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr_pages); -+ -+ pages = nr_pages > ARRAY_SIZE(stack_pages) -+ ? kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOIO) -+ : stack_pages; -+ if (!pages) -+ goto bounce; -+ -+ nr_pages = 0; -+ __bio_for_each_segment(bv, bio, iter, start) -+ pages[nr_pages++] = bv.bv_page; -+ -+ flags = memalloc_nofs_save(); -+ data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); -+ memalloc_nofs_restore(flags); -+ -+ if (pages != stack_pages) -+ kfree(pages); -+ -+ if (data) -+ return (struct bbuf) { -+ .b = data + bio_iter_offset(bio, start), -+ .type = BB_VMAP, .rw = rw -+ }; -+bounce: -+ ret = __bounce_alloc(c, start.bi_size, rw); -+ -+ if (rw == READ) -+ memcpy_from_bio(ret.b, bio, start); -+ -+ return ret; -+} -+ -+static struct bbuf bio_map_or_bounce(struct bch_fs *c, struct bio *bio, int rw) -+{ -+ return __bio_map_or_bounce(c, bio, bio->bi_iter, rw); -+} -+ -+static void bio_unmap_or_unbounce(struct bch_fs *c, struct bbuf buf) -+{ -+ switch (buf.type) { -+ case BB_NONE: -+ break; -+ case BB_VMAP: -+ vunmap((void *) ((unsigned long) buf.b & PAGE_MASK)); -+ break; -+ case BB_KMALLOC: -+ kfree(buf.b); -+ break; -+ case BB_MEMPOOL: -+ mempool_free(buf.b, 
&c->compression_bounce[buf.rw]); -+ break; -+ } -+} -+ -+static inline void zlib_set_workspace(z_stream *strm, void *workspace) -+{ -+#ifdef __KERNEL__ -+ strm->workspace = workspace; -+#endif -+} -+ -+static int __bio_uncompress(struct bch_fs *c, struct bio *src, -+ void *dst_data, struct bch_extent_crc_unpacked crc) -+{ -+ struct bbuf src_data = { NULL }; -+ size_t src_len = src->bi_iter.bi_size; -+ size_t dst_len = crc.uncompressed_size << 9; -+ void *workspace; -+ int ret; -+ -+ src_data = bio_map_or_bounce(c, src, READ); -+ -+ switch (crc.compression_type) { -+ case BCH_COMPRESSION_TYPE_lz4_old: -+ case BCH_COMPRESSION_TYPE_lz4: -+ ret = LZ4_decompress_safe_partial(src_data.b, dst_data, -+ src_len, dst_len, dst_len); -+ if (ret != dst_len) -+ goto err; -+ break; -+ case BCH_COMPRESSION_TYPE_gzip: { -+ z_stream strm = { -+ .next_in = src_data.b, -+ .avail_in = src_len, -+ .next_out = dst_data, -+ .avail_out = dst_len, -+ }; -+ -+ workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO); -+ -+ zlib_set_workspace(&strm, workspace); -+ zlib_inflateInit2(&strm, -MAX_WBITS); -+ ret = zlib_inflate(&strm, Z_FINISH); -+ -+ mempool_free(workspace, &c->decompress_workspace); -+ -+ if (ret != Z_STREAM_END) -+ goto err; -+ break; -+ } -+ case BCH_COMPRESSION_TYPE_zstd: { -+ ZSTD_DCtx *ctx; -+ size_t real_src_len = le32_to_cpup(src_data.b); -+ -+ if (real_src_len > src_len - 4) -+ goto err; -+ -+ workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO); -+ ctx = ZSTD_initDCtx(workspace, ZSTD_DCtxWorkspaceBound()); -+ -+ ret = ZSTD_decompressDCtx(ctx, -+ dst_data, dst_len, -+ src_data.b + 4, real_src_len); -+ -+ mempool_free(workspace, &c->decompress_workspace); -+ -+ if (ret != dst_len) -+ goto err; -+ break; -+ } -+ default: -+ BUG(); -+ } -+ ret = 0; -+out: -+ bio_unmap_or_unbounce(c, src_data); -+ return ret; -+err: -+ ret = -EIO; -+ goto out; -+} -+ -+int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, -+ struct bch_extent_crc_unpacked *crc) -+{ 
-+ struct bbuf data = { NULL }; -+ size_t dst_len = crc->uncompressed_size << 9; -+ -+ /* bio must own its pages: */ -+ BUG_ON(!bio->bi_vcnt); -+ BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs); -+ -+ if (crc->uncompressed_size > c->sb.encoded_extent_max || -+ crc->compressed_size > c->sb.encoded_extent_max) { -+ bch_err(c, "error rewriting existing data: extent too big"); -+ return -EIO; -+ } -+ -+ data = __bounce_alloc(c, dst_len, WRITE); -+ -+ if (__bio_uncompress(c, bio, data.b, *crc)) { -+ bch_err(c, "error rewriting existing data: decompression error"); -+ bio_unmap_or_unbounce(c, data); -+ return -EIO; -+ } -+ -+ /* -+ * XXX: don't have a good way to assert that the bio was allocated with -+ * enough space, we depend on bch2_move_extent doing the right thing -+ */ -+ bio->bi_iter.bi_size = crc->live_size << 9; -+ -+ memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9)); -+ -+ crc->csum_type = 0; -+ crc->compression_type = 0; -+ crc->compressed_size = crc->live_size; -+ crc->uncompressed_size = crc->live_size; -+ crc->offset = 0; -+ crc->csum = (struct bch_csum) { 0, 0 }; -+ -+ bio_unmap_or_unbounce(c, data); -+ return 0; -+} -+ -+int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, -+ struct bio *dst, struct bvec_iter dst_iter, -+ struct bch_extent_crc_unpacked crc) -+{ -+ struct bbuf dst_data = { NULL }; -+ size_t dst_len = crc.uncompressed_size << 9; -+ int ret = -ENOMEM; -+ -+ if (crc.uncompressed_size > c->sb.encoded_extent_max || -+ crc.compressed_size > c->sb.encoded_extent_max) -+ return -EIO; -+ -+ dst_data = dst_len == dst_iter.bi_size -+ ? 
__bio_map_or_bounce(c, dst, dst_iter, WRITE) -+ : __bounce_alloc(c, dst_len, WRITE); -+ -+ ret = __bio_uncompress(c, src, dst_data.b, crc); -+ if (ret) -+ goto err; -+ -+ if (dst_data.type != BB_NONE && -+ dst_data.type != BB_VMAP) -+ memcpy_to_bio(dst, dst_iter, dst_data.b + (crc.offset << 9)); -+err: -+ bio_unmap_or_unbounce(c, dst_data); -+ return ret; -+} -+ -+static int attempt_compress(struct bch_fs *c, -+ void *workspace, -+ void *dst, size_t dst_len, -+ void *src, size_t src_len, -+ enum bch_compression_type compression_type) -+{ -+ switch (compression_type) { -+ case BCH_COMPRESSION_TYPE_lz4: { -+ int len = src_len; -+ int ret = LZ4_compress_destSize( -+ src, dst, -+ &len, dst_len, -+ workspace); -+ -+ if (len < src_len) -+ return -len; -+ -+ return ret; -+ } -+ case BCH_COMPRESSION_TYPE_gzip: { -+ z_stream strm = { -+ .next_in = src, -+ .avail_in = src_len, -+ .next_out = dst, -+ .avail_out = dst_len, -+ }; -+ -+ zlib_set_workspace(&strm, workspace); -+ zlib_deflateInit2(&strm, Z_DEFAULT_COMPRESSION, -+ Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL, -+ Z_DEFAULT_STRATEGY); -+ -+ if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END) -+ return 0; -+ -+ if (zlib_deflateEnd(&strm) != Z_OK) -+ return 0; -+ -+ return strm.total_out; -+ } -+ case BCH_COMPRESSION_TYPE_zstd: { -+ ZSTD_CCtx *ctx = ZSTD_initCCtx(workspace, -+ ZSTD_CCtxWorkspaceBound(c->zstd_params.cParams)); -+ -+ size_t len = ZSTD_compressCCtx(ctx, -+ dst + 4, dst_len - 4, -+ src, src_len, -+ c->zstd_params); -+ if (ZSTD_isError(len)) -+ return 0; -+ -+ *((__le32 *) dst) = cpu_to_le32(len); -+ return len + 4; -+ } -+ default: -+ BUG(); -+ } -+} -+ -+static unsigned __bio_compress(struct bch_fs *c, -+ struct bio *dst, size_t *dst_len, -+ struct bio *src, size_t *src_len, -+ enum bch_compression_type compression_type) -+{ -+ struct bbuf src_data = { NULL }, dst_data = { NULL }; -+ void *workspace; -+ unsigned pad; -+ int ret = 0; -+ -+ BUG_ON(compression_type >= BCH_COMPRESSION_TYPE_NR); -+ 
BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type])); -+ -+ /* If it's only one block, don't bother trying to compress: */ -+ if (bio_sectors(src) <= c->opts.block_size) -+ return 0; -+ -+ dst_data = bio_map_or_bounce(c, dst, WRITE); -+ src_data = bio_map_or_bounce(c, src, READ); -+ -+ workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOIO); -+ -+ *src_len = src->bi_iter.bi_size; -+ *dst_len = dst->bi_iter.bi_size; -+ -+ /* -+ * XXX: this algorithm sucks when the compression code doesn't tell us -+ * how much would fit, like LZ4 does: -+ */ -+ while (1) { -+ if (*src_len <= block_bytes(c)) { -+ ret = -1; -+ break; -+ } -+ -+ ret = attempt_compress(c, workspace, -+ dst_data.b, *dst_len, -+ src_data.b, *src_len, -+ compression_type); -+ if (ret > 0) { -+ *dst_len = ret; -+ ret = 0; -+ break; -+ } -+ -+ /* Didn't fit: should we retry with a smaller amount? */ -+ if (*src_len <= *dst_len) { -+ ret = -1; -+ break; -+ } -+ -+ /* -+ * If ret is negative, it's a hint as to how much data would fit -+ */ -+ BUG_ON(-ret >= *src_len); -+ -+ if (ret < 0) -+ *src_len = -ret; -+ else -+ *src_len -= (*src_len - *dst_len) / 2; -+ *src_len = round_down(*src_len, block_bytes(c)); -+ } -+ -+ mempool_free(workspace, &c->compress_workspace[compression_type]); -+ -+ if (ret) -+ goto err; -+ -+ /* Didn't get smaller: */ -+ if (round_up(*dst_len, block_bytes(c)) >= *src_len) -+ goto err; -+ -+ pad = round_up(*dst_len, block_bytes(c)) - *dst_len; -+ -+ memset(dst_data.b + *dst_len, 0, pad); -+ *dst_len += pad; -+ -+ if (dst_data.type != BB_NONE && -+ dst_data.type != BB_VMAP) -+ memcpy_to_bio(dst, dst->bi_iter, dst_data.b); -+ -+ BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size); -+ BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size); -+ BUG_ON(*dst_len & (block_bytes(c) - 1)); -+ BUG_ON(*src_len & (block_bytes(c) - 1)); -+out: -+ bio_unmap_or_unbounce(c, src_data); -+ bio_unmap_or_unbounce(c, dst_data); -+ return compression_type; -+err: -+ 
compression_type = BCH_COMPRESSION_TYPE_incompressible; -+ goto out; -+} -+ -+unsigned bch2_bio_compress(struct bch_fs *c, -+ struct bio *dst, size_t *dst_len, -+ struct bio *src, size_t *src_len, -+ unsigned compression_type) -+{ -+ unsigned orig_dst = dst->bi_iter.bi_size; -+ unsigned orig_src = src->bi_iter.bi_size; -+ -+ /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */ -+ src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size, -+ c->sb.encoded_extent_max << 9); -+ /* Don't generate a bigger output than input: */ -+ dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); -+ -+ if (compression_type == BCH_COMPRESSION_TYPE_lz4_old) -+ compression_type = BCH_COMPRESSION_TYPE_lz4; -+ -+ compression_type = -+ __bio_compress(c, dst, dst_len, src, src_len, compression_type); -+ -+ dst->bi_iter.bi_size = orig_dst; -+ src->bi_iter.bi_size = orig_src; -+ return compression_type; -+} -+ -+static int __bch2_fs_compress_init(struct bch_fs *, u64); -+ -+#define BCH_FEATURE_none 0 -+ -+static const unsigned bch2_compression_opt_to_feature[] = { -+#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t, -+ BCH_COMPRESSION_OPTS() -+#undef x -+}; -+ -+#undef BCH_FEATURE_none -+ -+static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f) -+{ -+ int ret = 0; -+ -+ if ((c->sb.features & f) == f) -+ return 0; -+ -+ mutex_lock(&c->sb_lock); -+ -+ if ((c->sb.features & f) == f) { -+ mutex_unlock(&c->sb_lock); -+ return 0; -+ } -+ -+ ret = __bch2_fs_compress_init(c, c->sb.features|f); -+ if (ret) { -+ mutex_unlock(&c->sb_lock); -+ return ret; -+ } -+ -+ c->disk_sb.sb->features[0] |= cpu_to_le64(f); -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ return 0; -+} -+ -+int bch2_check_set_has_compressed_data(struct bch_fs *c, -+ unsigned compression_type) -+{ -+ BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature)); -+ -+ return compression_type -+ ? 
__bch2_check_set_has_compressed_data(c, -+ 1ULL << bch2_compression_opt_to_feature[compression_type]) -+ : 0; -+} -+ -+void bch2_fs_compress_exit(struct bch_fs *c) -+{ -+ unsigned i; -+ -+ mempool_exit(&c->decompress_workspace); -+ for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++) -+ mempool_exit(&c->compress_workspace[i]); -+ mempool_exit(&c->compression_bounce[WRITE]); -+ mempool_exit(&c->compression_bounce[READ]); -+} -+ -+static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) -+{ -+ size_t max_extent = c->sb.encoded_extent_max << 9; -+ size_t decompress_workspace_size = 0; -+ bool decompress_workspace_needed; -+ ZSTD_parameters params = ZSTD_getParams(0, max_extent, 0); -+ struct { -+ unsigned feature; -+ unsigned type; -+ size_t compress_workspace; -+ size_t decompress_workspace; -+ } compression_types[] = { -+ { BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4, LZ4_MEM_COMPRESS, 0 }, -+ { BCH_FEATURE_gzip, BCH_COMPRESSION_TYPE_gzip, -+ zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), -+ zlib_inflate_workspacesize(), }, -+ { BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd, -+ ZSTD_CCtxWorkspaceBound(params.cParams), -+ ZSTD_DCtxWorkspaceBound() }, -+ }, *i; -+ int ret = 0; -+ -+ pr_verbose_init(c->opts, ""); -+ -+ c->zstd_params = params; -+ -+ for (i = compression_types; -+ i < compression_types + ARRAY_SIZE(compression_types); -+ i++) -+ if (features & (1 << i->feature)) -+ goto have_compressed; -+ -+ goto out; -+have_compressed: -+ -+ if (!mempool_initialized(&c->compression_bounce[READ])) { -+ ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[READ], -+ 1, max_extent); -+ if (ret) -+ goto out; -+ } -+ -+ if (!mempool_initialized(&c->compression_bounce[WRITE])) { -+ ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE], -+ 1, max_extent); -+ if (ret) -+ goto out; -+ } -+ -+ for (i = compression_types; -+ i < compression_types + ARRAY_SIZE(compression_types); -+ i++) { -+ decompress_workspace_size = -+ 
max(decompress_workspace_size, i->decompress_workspace); -+ -+ if (!(features & (1 << i->feature))) -+ continue; -+ -+ if (i->decompress_workspace) -+ decompress_workspace_needed = true; -+ -+ if (mempool_initialized(&c->compress_workspace[i->type])) -+ continue; -+ -+ ret = mempool_init_kvpmalloc_pool( -+ &c->compress_workspace[i->type], -+ 1, i->compress_workspace); -+ if (ret) -+ goto out; -+ } -+ -+ if (!mempool_initialized(&c->decompress_workspace)) { -+ ret = mempool_init_kvpmalloc_pool( -+ &c->decompress_workspace, -+ 1, decompress_workspace_size); -+ if (ret) -+ goto out; -+ } -+out: -+ pr_verbose_init(c->opts, "ret %i", ret); -+ return ret; -+} -+ -+int bch2_fs_compress_init(struct bch_fs *c) -+{ -+ u64 f = c->sb.features; -+ -+ if (c->opts.compression) -+ f |= 1ULL << bch2_compression_opt_to_feature[c->opts.compression]; -+ -+ if (c->opts.background_compression) -+ f |= 1ULL << bch2_compression_opt_to_feature[c->opts.background_compression]; -+ -+ return __bch2_fs_compress_init(c, f); -+ -+} -diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h -new file mode 100644 -index 000000000000..4bab1f61b3b5 ---- /dev/null -+++ b/fs/bcachefs/compress.h -@@ -0,0 +1,18 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_COMPRESS_H -+#define _BCACHEFS_COMPRESS_H -+ -+#include "extents_types.h" -+ -+int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *, -+ struct bch_extent_crc_unpacked *); -+int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, -+ struct bvec_iter, struct bch_extent_crc_unpacked); -+unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *, -+ struct bio *, size_t *, unsigned); -+ -+int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned); -+void bch2_fs_compress_exit(struct bch_fs *); -+int bch2_fs_compress_init(struct bch_fs *); -+ -+#endif /* _BCACHEFS_COMPRESS_H */ -diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c -new file mode 100644 -index 000000000000..aa10591a3b1a ---- 
/dev/null -+++ b/fs/bcachefs/debug.c -@@ -0,0 +1,432 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Assorted bcachefs debug code -+ * -+ * Copyright 2010, 2011 Kent Overstreet -+ * Copyright 2012 Google, Inc. -+ */ -+ -+#include "bcachefs.h" -+#include "bkey_methods.h" -+#include "btree_cache.h" -+#include "btree_io.h" -+#include "btree_iter.h" -+#include "btree_update.h" -+#include "buckets.h" -+#include "debug.h" -+#include "error.h" -+#include "extents.h" -+#include "fsck.h" -+#include "inode.h" -+#include "io.h" -+#include "super.h" -+ -+#include -+#include -+#include -+#include -+#include -+ -+static struct dentry *bch_debug; -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ -+void __bch2_btree_verify(struct bch_fs *c, struct btree *b) -+{ -+ struct btree *v = c->verify_data; -+ struct btree_node *n_ondisk, *n_sorted, *n_inmemory; -+ struct bset *sorted, *inmemory; -+ struct extent_ptr_decoded pick; -+ struct bch_dev *ca; -+ struct bio *bio; -+ -+ if (c->opts.nochanges) -+ return; -+ -+ btree_node_io_lock(b); -+ mutex_lock(&c->verify_lock); -+ -+ n_ondisk = c->verify_ondisk; -+ n_sorted = c->verify_data->data; -+ n_inmemory = b->data; -+ -+ bkey_copy(&v->key, &b->key); -+ v->written = 0; -+ v->c.level = b->c.level; -+ v->c.btree_id = b->c.btree_id; -+ bch2_btree_keys_init(v, &c->expensive_debug_checks); -+ -+ if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), -+ NULL, &pick) <= 0) -+ return; -+ -+ ca = bch_dev_bkey_exists(c, pick.ptr.dev); -+ if (!bch2_dev_get_ioref(ca, READ)) -+ return; -+ -+ bio = bio_alloc_bioset(GFP_NOIO, -+ buf_pages(n_sorted, btree_bytes(c)), -+ &c->btree_bio); -+ bio_set_dev(bio, ca->disk_sb.bdev); -+ bio->bi_opf = REQ_OP_READ|REQ_META; -+ bio->bi_iter.bi_sector = pick.ptr.offset; -+ bch2_bio_map(bio, n_sorted, btree_bytes(c)); -+ -+ submit_bio_wait(bio); -+ -+ bio_put(bio); -+ percpu_ref_put(&ca->io_ref); -+ -+ memcpy(n_ondisk, n_sorted, btree_bytes(c)); -+ -+ if (bch2_btree_node_read_done(c, v, false)) -+ goto out; -+ -+ n_sorted = 
c->verify_data->data; -+ sorted = &n_sorted->keys; -+ inmemory = &n_inmemory->keys; -+ -+ if (inmemory->u64s != sorted->u64s || -+ memcmp(inmemory->start, -+ sorted->start, -+ vstruct_end(inmemory) - (void *) inmemory->start)) { -+ unsigned offset = 0, sectors; -+ struct bset *i; -+ unsigned j; -+ -+ console_lock(); -+ -+ printk(KERN_ERR "*** in memory:\n"); -+ bch2_dump_bset(c, b, inmemory, 0); -+ -+ printk(KERN_ERR "*** read back in:\n"); -+ bch2_dump_bset(c, v, sorted, 0); -+ -+ while (offset < b->written) { -+ if (!offset ) { -+ i = &n_ondisk->keys; -+ sectors = vstruct_blocks(n_ondisk, c->block_bits) << -+ c->block_bits; -+ } else { -+ struct btree_node_entry *bne = -+ (void *) n_ondisk + (offset << 9); -+ i = &bne->keys; -+ -+ sectors = vstruct_blocks(bne, c->block_bits) << -+ c->block_bits; -+ } -+ -+ printk(KERN_ERR "*** on disk block %u:\n", offset); -+ bch2_dump_bset(c, b, i, offset); -+ -+ offset += sectors; -+ } -+ -+ printk(KERN_ERR "*** block %u/%u not written\n", -+ offset >> c->block_bits, btree_blocks(c)); -+ -+ for (j = 0; j < le16_to_cpu(inmemory->u64s); j++) -+ if (inmemory->_data[j] != sorted->_data[j]) -+ break; -+ -+ printk(KERN_ERR "b->written %u\n", b->written); -+ -+ console_unlock(); -+ panic("verify failed at %u\n", j); -+ } -+out: -+ mutex_unlock(&c->verify_lock); -+ btree_node_io_unlock(b); -+} -+ -+#endif -+ -+#ifdef CONFIG_DEBUG_FS -+ -+/* XXX: bch_fs refcounting */ -+ -+struct dump_iter { -+ struct bpos from; -+ struct bch_fs *c; -+ enum btree_id id; -+ -+ char buf[PAGE_SIZE]; -+ size_t bytes; /* what's currently in buf */ -+ -+ char __user *ubuf; /* destination user buffer */ -+ size_t size; /* size of requested read */ -+ ssize_t ret; /* bytes read so far */ -+}; -+ -+static int flush_buf(struct dump_iter *i) -+{ -+ if (i->bytes) { -+ size_t bytes = min(i->bytes, i->size); -+ int err = copy_to_user(i->ubuf, i->buf, bytes); -+ -+ if (err) -+ return err; -+ -+ i->ret += bytes; -+ i->ubuf += bytes; -+ i->size -= bytes; -+ i->bytes -= 
bytes; -+ memmove(i->buf, i->buf + bytes, i->bytes); -+ } -+ -+ return 0; -+} -+ -+static int bch2_dump_open(struct inode *inode, struct file *file) -+{ -+ struct btree_debug *bd = inode->i_private; -+ struct dump_iter *i; -+ -+ i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); -+ if (!i) -+ return -ENOMEM; -+ -+ file->private_data = i; -+ i->from = POS_MIN; -+ i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]); -+ i->id = bd->id; -+ -+ return 0; -+} -+ -+static int bch2_dump_release(struct inode *inode, struct file *file) -+{ -+ kfree(file->private_data); -+ return 0; -+} -+ -+static ssize_t bch2_read_btree(struct file *file, char __user *buf, -+ size_t size, loff_t *ppos) -+{ -+ struct dump_iter *i = file->private_data; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int err; -+ -+ i->ubuf = buf; -+ i->size = size; -+ i->ret = 0; -+ -+ err = flush_buf(i); -+ if (err) -+ return err; -+ -+ if (!i->size) -+ return i->ret; -+ -+ bch2_trans_init(&trans, i->c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH); -+ k = bch2_btree_iter_peek(iter); -+ -+ while (k.k && !(err = bkey_err(k))) { -+ bch2_bkey_val_to_text(&PBUF(i->buf), i->c, k); -+ i->bytes = strlen(i->buf); -+ BUG_ON(i->bytes >= PAGE_SIZE); -+ i->buf[i->bytes] = '\n'; -+ i->bytes++; -+ -+ k = bch2_btree_iter_next(iter); -+ i->from = iter->pos; -+ -+ err = flush_buf(i); -+ if (err) -+ break; -+ -+ if (!i->size) -+ break; -+ } -+ bch2_trans_exit(&trans); -+ -+ return err < 0 ? 
err : i->ret; -+} -+ -+static const struct file_operations btree_debug_ops = { -+ .owner = THIS_MODULE, -+ .open = bch2_dump_open, -+ .release = bch2_dump_release, -+ .read = bch2_read_btree, -+}; -+ -+static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, -+ size_t size, loff_t *ppos) -+{ -+ struct dump_iter *i = file->private_data; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct btree *b; -+ int err; -+ -+ i->ubuf = buf; -+ i->size = size; -+ i->ret = 0; -+ -+ err = flush_buf(i); -+ if (err) -+ return err; -+ -+ if (!i->size || !bkey_cmp(POS_MAX, i->from)) -+ return i->ret; -+ -+ bch2_trans_init(&trans, i->c, 0, 0); -+ -+ for_each_btree_node(&trans, iter, i->id, i->from, 0, b) { -+ bch2_btree_node_to_text(&PBUF(i->buf), i->c, b); -+ i->bytes = strlen(i->buf); -+ err = flush_buf(i); -+ if (err) -+ break; -+ -+ /* -+ * can't easily correctly restart a btree node traversal across -+ * all nodes, meh -+ */ -+ i->from = bkey_cmp(POS_MAX, b->key.k.p) -+ ? bkey_successor(b->key.k.p) -+ : b->key.k.p; -+ -+ if (!i->size) -+ break; -+ } -+ bch2_trans_exit(&trans); -+ -+ return err < 0 ? 
err : i->ret; -+} -+ -+static const struct file_operations btree_format_debug_ops = { -+ .owner = THIS_MODULE, -+ .open = bch2_dump_open, -+ .release = bch2_dump_release, -+ .read = bch2_read_btree_formats, -+}; -+ -+static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, -+ size_t size, loff_t *ppos) -+{ -+ struct dump_iter *i = file->private_data; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct btree *prev_node = NULL; -+ int err; -+ -+ i->ubuf = buf; -+ i->size = size; -+ i->ret = 0; -+ -+ err = flush_buf(i); -+ if (err) -+ return err; -+ -+ if (!i->size) -+ return i->ret; -+ -+ bch2_trans_init(&trans, i->c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH); -+ -+ while ((k = bch2_btree_iter_peek(iter)).k && -+ !(err = bkey_err(k))) { -+ struct btree_iter_level *l = &iter->l[0]; -+ struct bkey_packed *_k = -+ bch2_btree_node_iter_peek(&l->iter, l->b); -+ -+ if (l->b != prev_node) { -+ bch2_btree_node_to_text(&PBUF(i->buf), i->c, l->b); -+ i->bytes = strlen(i->buf); -+ err = flush_buf(i); -+ if (err) -+ break; -+ } -+ prev_node = l->b; -+ -+ bch2_bfloat_to_text(&PBUF(i->buf), l->b, _k); -+ i->bytes = strlen(i->buf); -+ err = flush_buf(i); -+ if (err) -+ break; -+ -+ bch2_btree_iter_next(iter); -+ i->from = iter->pos; -+ -+ err = flush_buf(i); -+ if (err) -+ break; -+ -+ if (!i->size) -+ break; -+ } -+ bch2_trans_exit(&trans); -+ -+ return err < 0 ? 
err : i->ret; -+} -+ -+static const struct file_operations bfloat_failed_debug_ops = { -+ .owner = THIS_MODULE, -+ .open = bch2_dump_open, -+ .release = bch2_dump_release, -+ .read = bch2_read_bfloat_failed, -+}; -+ -+void bch2_fs_debug_exit(struct bch_fs *c) -+{ -+ if (!IS_ERR_OR_NULL(c->debug)) -+ debugfs_remove_recursive(c->debug); -+} -+ -+void bch2_fs_debug_init(struct bch_fs *c) -+{ -+ struct btree_debug *bd; -+ char name[100]; -+ -+ if (IS_ERR_OR_NULL(bch_debug)) -+ return; -+ -+ snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b); -+ c->debug = debugfs_create_dir(name, bch_debug); -+ if (IS_ERR_OR_NULL(c->debug)) -+ return; -+ -+ for (bd = c->btree_debug; -+ bd < c->btree_debug + ARRAY_SIZE(c->btree_debug); -+ bd++) { -+ bd->id = bd - c->btree_debug; -+ bd->btree = debugfs_create_file(bch2_btree_ids[bd->id], -+ 0400, c->debug, bd, -+ &btree_debug_ops); -+ -+ snprintf(name, sizeof(name), "%s-formats", -+ bch2_btree_ids[bd->id]); -+ -+ bd->btree_format = debugfs_create_file(name, 0400, c->debug, bd, -+ &btree_format_debug_ops); -+ -+ snprintf(name, sizeof(name), "%s-bfloat-failed", -+ bch2_btree_ids[bd->id]); -+ -+ bd->failed = debugfs_create_file(name, 0400, c->debug, bd, -+ &bfloat_failed_debug_ops); -+ } -+} -+ -+#endif -+ -+void bch2_debug_exit(void) -+{ -+ if (!IS_ERR_OR_NULL(bch_debug)) -+ debugfs_remove_recursive(bch_debug); -+} -+ -+int __init bch2_debug_init(void) -+{ -+ int ret = 0; -+ -+ bch_debug = debugfs_create_dir("bcachefs", NULL); -+ return ret; -+} -diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h -new file mode 100644 -index 000000000000..56c2d1ab5f63 ---- /dev/null -+++ b/fs/bcachefs/debug.h -@@ -0,0 +1,63 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_DEBUG_H -+#define _BCACHEFS_DEBUG_H -+ -+#include "bcachefs.h" -+ -+struct bio; -+struct btree; -+struct bch_fs; -+ -+#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name; -+BCH_DEBUG_PARAMS() -+#undef BCH_DEBUG_PARAM -+ -+#define BCH_DEBUG_PARAM(name, 
description) \ -+ static inline bool name(struct bch_fs *c) \ -+ { return bch2_##name || c->name; } -+BCH_DEBUG_PARAMS_ALWAYS() -+#undef BCH_DEBUG_PARAM -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ -+#define BCH_DEBUG_PARAM(name, description) \ -+ static inline bool name(struct bch_fs *c) \ -+ { return bch2_##name || c->name; } -+BCH_DEBUG_PARAMS_DEBUG() -+#undef BCH_DEBUG_PARAM -+ -+void __bch2_btree_verify(struct bch_fs *, struct btree *); -+ -+#define bypass_torture_test(d) ((d)->bypass_torture_test) -+ -+#else /* DEBUG */ -+ -+#define BCH_DEBUG_PARAM(name, description) \ -+ static inline bool name(struct bch_fs *c) { return false; } -+BCH_DEBUG_PARAMS_DEBUG() -+#undef BCH_DEBUG_PARAM -+ -+static inline void __bch2_btree_verify(struct bch_fs *c, struct btree *b) {} -+ -+#define bypass_torture_test(d) 0 -+ -+#endif -+ -+static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b) -+{ -+ if (verify_btree_ondisk(c)) -+ __bch2_btree_verify(c, b); -+} -+ -+#ifdef CONFIG_DEBUG_FS -+void bch2_fs_debug_exit(struct bch_fs *); -+void bch2_fs_debug_init(struct bch_fs *); -+#else -+static inline void bch2_fs_debug_exit(struct bch_fs *c) {} -+static inline void bch2_fs_debug_init(struct bch_fs *c) {} -+#endif -+ -+void bch2_debug_exit(void); -+int bch2_debug_init(void); -+ -+#endif /* _BCACHEFS_DEBUG_H */ -diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c -new file mode 100644 -index 000000000000..f34bfda8ab0d ---- /dev/null -+++ b/fs/bcachefs/dirent.c -@@ -0,0 +1,385 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "bkey_methods.h" -+#include "btree_update.h" -+#include "extents.h" -+#include "dirent.h" -+#include "fs.h" -+#include "keylist.h" -+#include "str_hash.h" -+ -+#include -+ -+unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) -+{ -+ unsigned len = bkey_val_bytes(d.k) - -+ offsetof(struct bch_dirent, d_name); -+ -+ return strnlen(d.v->d_name, len); -+} -+ -+static u64 bch2_dirent_hash(const struct bch_hash_info *info, 
-+ const struct qstr *name) -+{ -+ struct bch_str_hash_ctx ctx; -+ -+ bch2_str_hash_init(&ctx, info); -+ bch2_str_hash_update(&ctx, info, name->name, name->len); -+ -+ /* [0,2) reserved for dots */ -+ return max_t(u64, bch2_str_hash_end(&ctx, info), 2); -+} -+ -+static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key) -+{ -+ return bch2_dirent_hash(info, key); -+} -+ -+static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) -+{ -+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); -+ struct qstr name = QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d)); -+ -+ return bch2_dirent_hash(info, &name); -+} -+ -+static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r) -+{ -+ struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); -+ int len = bch2_dirent_name_bytes(l); -+ const struct qstr *r = _r; -+ -+ return len - r->len ?: memcmp(l.v->d_name, r->name, len); -+} -+ -+static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) -+{ -+ struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); -+ struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r); -+ int l_len = bch2_dirent_name_bytes(l); -+ int r_len = bch2_dirent_name_bytes(r); -+ -+ return l_len - r_len ?: memcmp(l.v->d_name, r.v->d_name, l_len); -+} -+ -+const struct bch_hash_desc bch2_dirent_hash_desc = { -+ .btree_id = BTREE_ID_DIRENTS, -+ .key_type = KEY_TYPE_dirent, -+ .hash_key = dirent_hash_key, -+ .hash_bkey = dirent_hash_bkey, -+ .cmp_key = dirent_cmp_key, -+ .cmp_bkey = dirent_cmp_bkey, -+}; -+ -+const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); -+ unsigned len; -+ -+ if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent)) -+ return "value too small"; -+ -+ len = bch2_dirent_name_bytes(d); -+ if (!len) -+ return "empty name"; -+ -+ /* -+ * older versions of bcachefs were buggy and creating dirent -+ * keys that were bigger than necessary: -+ */ -+ if (bkey_val_u64s(k.k) > 
dirent_val_u64s(len + 7)) -+ return "value too big"; -+ -+ if (len > BCH_NAME_MAX) -+ return "dirent name too big"; -+ -+ return NULL; -+} -+ -+void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); -+ -+ bch_scnmemcpy(out, d.v->d_name, -+ bch2_dirent_name_bytes(d)); -+ pr_buf(out, " -> %llu type %u", d.v->d_inum, d.v->d_type); -+} -+ -+static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, -+ u8 type, const struct qstr *name, u64 dst) -+{ -+ struct bkey_i_dirent *dirent; -+ unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len); -+ -+ if (name->len > BCH_NAME_MAX) -+ return ERR_PTR(-ENAMETOOLONG); -+ -+ BUG_ON(u64s > U8_MAX); -+ -+ dirent = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); -+ if (IS_ERR(dirent)) -+ return dirent; -+ -+ bkey_dirent_init(&dirent->k_i); -+ dirent->k.u64s = u64s; -+ dirent->v.d_inum = cpu_to_le64(dst); -+ dirent->v.d_type = type; -+ -+ memcpy(dirent->v.d_name, name->name, name->len); -+ memset(dirent->v.d_name + name->len, 0, -+ bkey_val_bytes(&dirent->k) - -+ offsetof(struct bch_dirent, d_name) - -+ name->len); -+ -+ EBUG_ON(bch2_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len); -+ -+ return dirent; -+} -+ -+int bch2_dirent_create(struct btree_trans *trans, -+ u64 dir_inum, const struct bch_hash_info *hash_info, -+ u8 type, const struct qstr *name, u64 dst_inum, -+ int flags) -+{ -+ struct bkey_i_dirent *dirent; -+ int ret; -+ -+ dirent = dirent_create_key(trans, type, name, dst_inum); -+ ret = PTR_ERR_OR_ZERO(dirent); -+ if (ret) -+ return ret; -+ -+ return bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, -+ dir_inum, &dirent->k_i, flags); -+} -+ -+static void dirent_copy_target(struct bkey_i_dirent *dst, -+ struct bkey_s_c_dirent src) -+{ -+ dst->v.d_inum = src.v->d_inum; -+ dst->v.d_type = src.v->d_type; -+} -+ -+int bch2_dirent_rename(struct btree_trans *trans, -+ u64 src_dir, struct bch_hash_info *src_hash, -+ 
u64 dst_dir, struct bch_hash_info *dst_hash, -+ const struct qstr *src_name, u64 *src_inum, -+ const struct qstr *dst_name, u64 *dst_inum, -+ enum bch_rename_mode mode) -+{ -+ struct btree_iter *src_iter = NULL, *dst_iter = NULL; -+ struct bkey_s_c old_src, old_dst; -+ struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; -+ struct bpos dst_pos = -+ POS(dst_dir, bch2_dirent_hash(dst_hash, dst_name)); -+ int ret = 0; -+ -+ *src_inum = *dst_inum = 0; -+ -+ /* -+ * Lookup dst: -+ * -+ * Note that in BCH_RENAME mode, we're _not_ checking if -+ * the target already exists - we're relying on the VFS -+ * to do that check for us for correctness: -+ */ -+ dst_iter = mode == BCH_RENAME -+ ? bch2_hash_hole(trans, bch2_dirent_hash_desc, -+ dst_hash, dst_dir, dst_name) -+ : bch2_hash_lookup(trans, bch2_dirent_hash_desc, -+ dst_hash, dst_dir, dst_name, -+ BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(dst_iter); -+ if (ret) -+ goto out; -+ -+ old_dst = bch2_btree_iter_peek_slot(dst_iter); -+ -+ if (mode != BCH_RENAME) -+ *dst_inum = le64_to_cpu(bkey_s_c_to_dirent(old_dst).v->d_inum); -+ -+ /* Lookup src: */ -+ src_iter = bch2_hash_lookup(trans, bch2_dirent_hash_desc, -+ src_hash, src_dir, src_name, -+ BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(src_iter); -+ if (ret) -+ goto out; -+ -+ old_src = bch2_btree_iter_peek_slot(src_iter); -+ *src_inum = le64_to_cpu(bkey_s_c_to_dirent(old_src).v->d_inum); -+ -+ /* Create new dst key: */ -+ new_dst = dirent_create_key(trans, 0, dst_name, 0); -+ ret = PTR_ERR_OR_ZERO(new_dst); -+ if (ret) -+ goto out; -+ -+ dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); -+ new_dst->k.p = dst_iter->pos; -+ -+ /* Create new src key: */ -+ if (mode == BCH_RENAME_EXCHANGE) { -+ new_src = dirent_create_key(trans, 0, src_name, 0); -+ ret = PTR_ERR_OR_ZERO(new_src); -+ if (ret) -+ goto out; -+ -+ dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst)); -+ new_src->k.p = src_iter->pos; -+ } else { -+ new_src = bch2_trans_kmalloc(trans, sizeof(struct 
bkey_i)); -+ ret = PTR_ERR_OR_ZERO(new_src); -+ if (ret) -+ goto out; -+ -+ bkey_init(&new_src->k); -+ new_src->k.p = src_iter->pos; -+ -+ if (bkey_cmp(dst_pos, src_iter->pos) <= 0 && -+ bkey_cmp(src_iter->pos, dst_iter->pos) < 0) { -+ /* -+ * We have a hash collision for the new dst key, -+ * and new_src - the key we're deleting - is between -+ * new_dst's hashed slot and the slot we're going to be -+ * inserting it into - oops. This will break the hash -+ * table if we don't deal with it: -+ */ -+ if (mode == BCH_RENAME) { -+ /* -+ * If we're not overwriting, we can just insert -+ * new_dst at the src position: -+ */ -+ new_dst->k.p = src_iter->pos; -+ bch2_trans_update(trans, src_iter, -+ &new_dst->k_i, 0); -+ goto out; -+ } else { -+ /* If we're overwriting, we can't insert new_dst -+ * at a different slot because it has to -+ * overwrite old_dst - just make sure to use a -+ * whiteout when deleting src: -+ */ -+ new_src->k.type = KEY_TYPE_whiteout; -+ } -+ } else { -+ /* Check if we need a whiteout to delete src: */ -+ ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc, -+ src_hash, src_iter); -+ if (ret < 0) -+ goto out; -+ -+ if (ret) -+ new_src->k.type = KEY_TYPE_whiteout; -+ } -+ } -+ -+ bch2_trans_update(trans, src_iter, &new_src->k_i, 0); -+ bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0); -+out: -+ bch2_trans_iter_put(trans, src_iter); -+ bch2_trans_iter_put(trans, dst_iter); -+ return ret; -+} -+ -+int bch2_dirent_delete_at(struct btree_trans *trans, -+ const struct bch_hash_info *hash_info, -+ struct btree_iter *iter) -+{ -+ return bch2_hash_delete_at(trans, bch2_dirent_hash_desc, -+ hash_info, iter); -+} -+ -+struct btree_iter * -+__bch2_dirent_lookup_trans(struct btree_trans *trans, u64 dir_inum, -+ const struct bch_hash_info *hash_info, -+ const struct qstr *name, unsigned flags) -+{ -+ return bch2_hash_lookup(trans, bch2_dirent_hash_desc, -+ hash_info, dir_inum, name, flags); -+} -+ -+u64 bch2_dirent_lookup(struct bch_fs *c, u64 
dir_inum, -+ const struct bch_hash_info *hash_info, -+ const struct qstr *name) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ u64 inum = 0; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = __bch2_dirent_lookup_trans(&trans, dir_inum, -+ hash_info, name, 0); -+ if (IS_ERR(iter)) { -+ BUG_ON(PTR_ERR(iter) == -EINTR); -+ goto out; -+ } -+ -+ k = bch2_btree_iter_peek_slot(iter); -+ inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); -+out: -+ bch2_trans_exit(&trans); -+ return inum; -+} -+ -+int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) -+{ -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ for_each_btree_key(trans, iter, BTREE_ID_DIRENTS, -+ POS(dir_inum, 0), 0, k, ret) { -+ if (k.k->p.inode > dir_inum) -+ break; -+ -+ if (k.k->type == KEY_TYPE_dirent) { -+ ret = -ENOTEMPTY; -+ break; -+ } -+ } -+ bch2_trans_iter_put(trans, iter); -+ -+ return ret; -+} -+ -+int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct bkey_s_c_dirent dirent; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, -+ POS(inum, ctx->pos), 0, k, ret) { -+ if (k.k->p.inode > inum) -+ break; -+ -+ if (k.k->type != KEY_TYPE_dirent) -+ continue; -+ -+ dirent = bkey_s_c_to_dirent(k); -+ -+ /* -+ * XXX: dir_emit() can fault and block, while we're holding -+ * locks -+ */ -+ ctx->pos = dirent.k->p.offset; -+ if (!dir_emit(ctx, dirent.v->d_name, -+ bch2_dirent_name_bytes(dirent), -+ le64_to_cpu(dirent.v->d_inum), -+ dirent.v->d_type)) -+ break; -+ ctx->pos = dirent.k->p.offset + 1; -+ } -+ ret = bch2_trans_exit(&trans) ?: ret; -+ -+ return ret; -+} -diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h -new file mode 100644 -index 000000000000..34769371dd13 ---- /dev/null -+++ b/fs/bcachefs/dirent.h -@@ -0,0 +1,63 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ 
-+#ifndef _BCACHEFS_DIRENT_H -+#define _BCACHEFS_DIRENT_H -+ -+#include "str_hash.h" -+ -+extern const struct bch_hash_desc bch2_dirent_hash_desc; -+ -+const char *bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c); -+void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+ -+#define bch2_bkey_ops_dirent (struct bkey_ops) { \ -+ .key_invalid = bch2_dirent_invalid, \ -+ .val_to_text = bch2_dirent_to_text, \ -+} -+ -+struct qstr; -+struct file; -+struct dir_context; -+struct bch_fs; -+struct bch_hash_info; -+struct bch_inode_info; -+ -+unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent); -+ -+static inline unsigned dirent_val_u64s(unsigned len) -+{ -+ return DIV_ROUND_UP(offsetof(struct bch_dirent, d_name) + len, -+ sizeof(u64)); -+} -+ -+int bch2_dirent_create(struct btree_trans *, u64, -+ const struct bch_hash_info *, u8, -+ const struct qstr *, u64, int); -+ -+int bch2_dirent_delete_at(struct btree_trans *, -+ const struct bch_hash_info *, -+ struct btree_iter *); -+ -+enum bch_rename_mode { -+ BCH_RENAME, -+ BCH_RENAME_OVERWRITE, -+ BCH_RENAME_EXCHANGE, -+}; -+ -+int bch2_dirent_rename(struct btree_trans *, -+ u64, struct bch_hash_info *, -+ u64, struct bch_hash_info *, -+ const struct qstr *, u64 *, -+ const struct qstr *, u64 *, -+ enum bch_rename_mode); -+ -+struct btree_iter * -+__bch2_dirent_lookup_trans(struct btree_trans *, u64, -+ const struct bch_hash_info *, -+ const struct qstr *, unsigned); -+u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *, -+ const struct qstr *); -+ -+int bch2_empty_dir_trans(struct btree_trans *, u64); -+int bch2_readdir(struct bch_fs *, u64, struct dir_context *); -+ -+#endif /* _BCACHEFS_DIRENT_H */ -diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c -new file mode 100644 -index 000000000000..4a4ec8f46108 ---- /dev/null -+++ b/fs/bcachefs/disk_groups.c -@@ -0,0 +1,481 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include 
"disk_groups.h" -+#include "super-io.h" -+ -+#include -+ -+static int group_cmp(const void *_l, const void *_r) -+{ -+ const struct bch_disk_group *l = _l; -+ const struct bch_disk_group *r = _r; -+ -+ return ((BCH_GROUP_DELETED(l) > BCH_GROUP_DELETED(r)) - -+ (BCH_GROUP_DELETED(l) < BCH_GROUP_DELETED(r))) ?: -+ ((BCH_GROUP_PARENT(l) > BCH_GROUP_PARENT(r)) - -+ (BCH_GROUP_PARENT(l) < BCH_GROUP_PARENT(r))) ?: -+ strncmp(l->label, r->label, sizeof(l->label)); -+} -+ -+static const char *bch2_sb_disk_groups_validate(struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_disk_groups *groups = -+ field_to_type(f, disk_groups); -+ struct bch_disk_group *g, *sorted = NULL; -+ struct bch_sb_field_members *mi; -+ struct bch_member *m; -+ unsigned i, nr_groups, len; -+ const char *err = NULL; -+ -+ mi = bch2_sb_get_members(sb); -+ groups = bch2_sb_get_disk_groups(sb); -+ nr_groups = disk_groups_nr(groups); -+ -+ for (m = mi->members; -+ m < mi->members + sb->nr_devices; -+ m++) { -+ unsigned g; -+ -+ if (!BCH_MEMBER_GROUP(m)) -+ continue; -+ -+ g = BCH_MEMBER_GROUP(m) - 1; -+ -+ if (g >= nr_groups || -+ BCH_GROUP_DELETED(&groups->entries[g])) -+ return "disk has invalid group"; -+ } -+ -+ if (!nr_groups) -+ return NULL; -+ -+ for (g = groups->entries; -+ g < groups->entries + nr_groups; -+ g++) { -+ if (BCH_GROUP_DELETED(g)) -+ continue; -+ -+ len = strnlen(g->label, sizeof(g->label)); -+ if (!len) { -+ err = "group with empty label"; -+ goto err; -+ } -+ } -+ -+ sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL); -+ if (!sorted) -+ return "cannot allocate memory"; -+ -+ memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted)); -+ sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL); -+ -+ for (i = 0; i + 1 < nr_groups; i++) -+ if (!BCH_GROUP_DELETED(sorted + i) && -+ !group_cmp(sorted + i, sorted + i + 1)) { -+ err = "duplicate groups"; -+ goto err; -+ } -+ -+ err = NULL; -+err: -+ kfree(sorted); -+ return err; -+} -+ -+static void 
bch2_sb_disk_groups_to_text(struct printbuf *out, -+ struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_disk_groups *groups = -+ field_to_type(f, disk_groups); -+ struct bch_disk_group *g; -+ unsigned nr_groups = disk_groups_nr(groups); -+ -+ for (g = groups->entries; -+ g < groups->entries + nr_groups; -+ g++) { -+ if (g != groups->entries) -+ pr_buf(out, " "); -+ -+ if (BCH_GROUP_DELETED(g)) -+ pr_buf(out, "[deleted]"); -+ else -+ pr_buf(out, "[parent %llu name %s]", -+ BCH_GROUP_PARENT(g), g->label); -+ } -+} -+ -+const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = { -+ .validate = bch2_sb_disk_groups_validate, -+ .to_text = bch2_sb_disk_groups_to_text -+}; -+ -+int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) -+{ -+ struct bch_sb_field_members *mi; -+ struct bch_sb_field_disk_groups *groups; -+ struct bch_disk_groups_cpu *cpu_g, *old_g; -+ unsigned i, g, nr_groups; -+ -+ lockdep_assert_held(&c->sb_lock); -+ -+ mi = bch2_sb_get_members(c->disk_sb.sb); -+ groups = bch2_sb_get_disk_groups(c->disk_sb.sb); -+ nr_groups = disk_groups_nr(groups); -+ -+ if (!groups) -+ return 0; -+ -+ cpu_g = kzalloc(sizeof(*cpu_g) + -+ sizeof(cpu_g->entries[0]) * nr_groups, GFP_KERNEL); -+ if (!cpu_g) -+ return -ENOMEM; -+ -+ cpu_g->nr = nr_groups; -+ -+ for (i = 0; i < nr_groups; i++) { -+ struct bch_disk_group *src = &groups->entries[i]; -+ struct bch_disk_group_cpu *dst = &cpu_g->entries[i]; -+ -+ dst->deleted = BCH_GROUP_DELETED(src); -+ dst->parent = BCH_GROUP_PARENT(src); -+ } -+ -+ for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { -+ struct bch_member *m = mi->members + i; -+ struct bch_disk_group_cpu *dst = -+ &cpu_g->entries[BCH_MEMBER_GROUP(m)]; -+ -+ if (!bch2_member_exists(m)) -+ continue; -+ -+ g = BCH_MEMBER_GROUP(m); -+ while (g) { -+ dst = &cpu_g->entries[g - 1]; -+ __set_bit(i, dst->devs.d); -+ g = dst->parent; -+ } -+ } -+ -+ old_g = rcu_dereference_protected(c->disk_groups, -+ lockdep_is_held(&c->sb_lock)); -+ 
rcu_assign_pointer(c->disk_groups, cpu_g); -+ if (old_g) -+ kfree_rcu(old_g, rcu); -+ -+ return 0; -+} -+ -+const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target) -+{ -+ struct target t = target_decode(target); -+ -+ switch (t.type) { -+ case TARGET_NULL: -+ return NULL; -+ case TARGET_DEV: { -+ struct bch_dev *ca = t.dev < c->sb.nr_devices -+ ? rcu_dereference(c->devs[t.dev]) -+ : NULL; -+ return ca ? &ca->self : NULL; -+ } -+ case TARGET_GROUP: { -+ struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); -+ -+ return t.group < g->nr && !g->entries[t.group].deleted -+ ? &g->entries[t.group].devs -+ : NULL; -+ } -+ default: -+ BUG(); -+ } -+} -+ -+bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target) -+{ -+ struct target t = target_decode(target); -+ -+ switch (t.type) { -+ case TARGET_NULL: -+ return false; -+ case TARGET_DEV: -+ return dev == t.dev; -+ case TARGET_GROUP: { -+ struct bch_disk_groups_cpu *g; -+ const struct bch_devs_mask *m; -+ bool ret; -+ -+ rcu_read_lock(); -+ g = rcu_dereference(c->disk_groups); -+ m = t.group < g->nr && !g->entries[t.group].deleted -+ ? &g->entries[t.group].devs -+ : NULL; -+ -+ ret = m ? 
test_bit(dev, m->d) : false; -+ rcu_read_unlock(); -+ -+ return ret; -+ } -+ default: -+ BUG(); -+ } -+} -+ -+static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups, -+ unsigned parent, -+ const char *name, unsigned namelen) -+{ -+ unsigned i, nr_groups = disk_groups_nr(groups); -+ -+ if (!namelen || namelen > BCH_SB_LABEL_SIZE) -+ return -EINVAL; -+ -+ for (i = 0; i < nr_groups; i++) { -+ struct bch_disk_group *g = groups->entries + i; -+ -+ if (BCH_GROUP_DELETED(g)) -+ continue; -+ -+ if (!BCH_GROUP_DELETED(g) && -+ BCH_GROUP_PARENT(g) == parent && -+ strnlen(g->label, sizeof(g->label)) == namelen && -+ !memcmp(name, g->label, namelen)) -+ return i; -+ } -+ -+ return -1; -+} -+ -+static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent, -+ const char *name, unsigned namelen) -+{ -+ struct bch_sb_field_disk_groups *groups = -+ bch2_sb_get_disk_groups(sb->sb); -+ unsigned i, nr_groups = disk_groups_nr(groups); -+ struct bch_disk_group *g; -+ -+ if (!namelen || namelen > BCH_SB_LABEL_SIZE) -+ return -EINVAL; -+ -+ for (i = 0; -+ i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]); -+ i++) -+ ; -+ -+ if (i == nr_groups) { -+ unsigned u64s = -+ (sizeof(struct bch_sb_field_disk_groups) + -+ sizeof(struct bch_disk_group) * (nr_groups + 1)) / -+ sizeof(u64); -+ -+ groups = bch2_sb_resize_disk_groups(sb, u64s); -+ if (!groups) -+ return -ENOSPC; -+ -+ nr_groups = disk_groups_nr(groups); -+ } -+ -+ BUG_ON(i >= nr_groups); -+ -+ g = &groups->entries[i]; -+ -+ memcpy(g->label, name, namelen); -+ if (namelen < sizeof(g->label)) -+ g->label[namelen] = '\0'; -+ SET_BCH_GROUP_DELETED(g, 0); -+ SET_BCH_GROUP_PARENT(g, parent); -+ SET_BCH_GROUP_DATA_ALLOWED(g, ~0); -+ -+ return i; -+} -+ -+int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name) -+{ -+ struct bch_sb_field_disk_groups *groups = -+ bch2_sb_get_disk_groups(sb->sb); -+ int v = -1; -+ -+ do { -+ const char *next = strchrnul(name, '.'); -+ unsigned len = next - name; 
-+ -+ if (*next == '.') -+ next++; -+ -+ v = __bch2_disk_group_find(groups, v + 1, name, len); -+ name = next; -+ } while (*name && v >= 0); -+ -+ return v; -+} -+ -+int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name) -+{ -+ struct bch_sb_field_disk_groups *groups; -+ unsigned parent = 0; -+ int v = -1; -+ -+ do { -+ const char *next = strchrnul(name, '.'); -+ unsigned len = next - name; -+ -+ if (*next == '.') -+ next++; -+ -+ groups = bch2_sb_get_disk_groups(sb->sb); -+ -+ v = __bch2_disk_group_find(groups, parent, name, len); -+ if (v < 0) -+ v = __bch2_disk_group_add(sb, parent, name, len); -+ if (v < 0) -+ return v; -+ -+ parent = v + 1; -+ name = next; -+ } while (*name && v >= 0); -+ -+ return v; -+} -+ -+void bch2_disk_path_to_text(struct printbuf *out, -+ struct bch_sb_handle *sb, -+ unsigned v) -+{ -+ struct bch_sb_field_disk_groups *groups = -+ bch2_sb_get_disk_groups(sb->sb); -+ struct bch_disk_group *g; -+ unsigned nr = 0; -+ u16 path[32]; -+ -+ while (1) { -+ if (nr == ARRAY_SIZE(path)) -+ goto inval; -+ -+ if (v >= disk_groups_nr(groups)) -+ goto inval; -+ -+ g = groups->entries + v; -+ -+ if (BCH_GROUP_DELETED(g)) -+ goto inval; -+ -+ path[nr++] = v; -+ -+ if (!BCH_GROUP_PARENT(g)) -+ break; -+ -+ v = BCH_GROUP_PARENT(g) - 1; -+ } -+ -+ while (nr) { -+ v = path[--nr]; -+ g = groups->entries + v; -+ -+ bch_scnmemcpy(out, g->label, -+ strnlen(g->label, sizeof(g->label))); -+ -+ if (nr) -+ pr_buf(out, "."); -+ } -+ return; -+inval: -+ pr_buf(out, "invalid group %u", v); -+} -+ -+int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) -+{ -+ struct bch_member *mi; -+ int v = -1; -+ -+ mutex_lock(&c->sb_lock); -+ -+ if (!strlen(name) || !strcmp(name, "none")) -+ goto write_sb; -+ -+ v = bch2_disk_path_find_or_create(&c->disk_sb, name); -+ if (v < 0) { -+ mutex_unlock(&c->sb_lock); -+ return v; -+ } -+ -+write_sb: -+ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; -+ SET_BCH_MEMBER_GROUP(mi, 
v + 1); -+ -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ return 0; -+} -+ -+int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v) -+{ -+ struct bch_dev *ca; -+ int g; -+ -+ if (!strlen(buf) || !strcmp(buf, "none")) { -+ *v = 0; -+ return 0; -+ } -+ -+ /* Is it a device? */ -+ ca = bch2_dev_lookup(c, buf); -+ if (!IS_ERR(ca)) { -+ *v = dev_to_target(ca->dev_idx); -+ percpu_ref_put(&ca->ref); -+ return 0; -+ } -+ -+ mutex_lock(&c->sb_lock); -+ g = bch2_disk_path_find(&c->disk_sb, buf); -+ mutex_unlock(&c->sb_lock); -+ -+ if (g >= 0) { -+ *v = group_to_target(g); -+ return 0; -+ } -+ -+ return -EINVAL; -+} -+ -+void bch2_opt_target_to_text(struct printbuf *out, struct bch_fs *c, u64 v) -+{ -+ struct target t = target_decode(v); -+ -+ switch (t.type) { -+ case TARGET_NULL: -+ pr_buf(out, "none"); -+ break; -+ case TARGET_DEV: { -+ struct bch_dev *ca; -+ -+ rcu_read_lock(); -+ ca = t.dev < c->sb.nr_devices -+ ? rcu_dereference(c->devs[t.dev]) -+ : NULL; -+ -+ if (ca && percpu_ref_tryget(&ca->io_ref)) { -+ char b[BDEVNAME_SIZE]; -+ -+ pr_buf(out, "/dev/%s", -+ bdevname(ca->disk_sb.bdev, b)); -+ percpu_ref_put(&ca->io_ref); -+ } else if (ca) { -+ pr_buf(out, "offline device %u", t.dev); -+ } else { -+ pr_buf(out, "invalid device %u", t.dev); -+ } -+ -+ rcu_read_unlock(); -+ break; -+ } -+ case TARGET_GROUP: -+ mutex_lock(&c->sb_lock); -+ bch2_disk_path_to_text(out, &c->disk_sb, t.group); -+ mutex_unlock(&c->sb_lock); -+ break; -+ default: -+ BUG(); -+ } -+} -diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h -new file mode 100644 -index 000000000000..c8e0c37a5e1a ---- /dev/null -+++ b/fs/bcachefs/disk_groups.h -@@ -0,0 +1,88 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_DISK_GROUPS_H -+#define _BCACHEFS_DISK_GROUPS_H -+ -+extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups; -+ -+static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups) -+{ -+ return groups -+ ? 
(vstruct_end(&groups->field) - -+ (void *) &groups->entries[0]) / sizeof(struct bch_disk_group) -+ : 0; -+} -+ -+struct target { -+ enum { -+ TARGET_NULL, -+ TARGET_DEV, -+ TARGET_GROUP, -+ } type; -+ union { -+ unsigned dev; -+ unsigned group; -+ }; -+}; -+ -+#define TARGET_DEV_START 1 -+#define TARGET_GROUP_START (256 + TARGET_DEV_START) -+ -+static inline u16 dev_to_target(unsigned dev) -+{ -+ return TARGET_DEV_START + dev; -+} -+ -+static inline u16 group_to_target(unsigned group) -+{ -+ return TARGET_GROUP_START + group; -+} -+ -+static inline struct target target_decode(unsigned target) -+{ -+ if (target >= TARGET_GROUP_START) -+ return (struct target) { -+ .type = TARGET_GROUP, -+ .group = target - TARGET_GROUP_START -+ }; -+ -+ if (target >= TARGET_DEV_START) -+ return (struct target) { -+ .type = TARGET_DEV, -+ .group = target - TARGET_DEV_START -+ }; -+ -+ return (struct target) { .type = TARGET_NULL }; -+} -+ -+const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned); -+ -+static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c, -+ enum bch_data_type data_type, -+ u16 target) -+{ -+ struct bch_devs_mask devs = c->rw_devs[data_type]; -+ const struct bch_devs_mask *t = bch2_target_to_mask(c, target); -+ -+ if (t) -+ bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX); -+ return devs; -+} -+ -+bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned); -+ -+int bch2_disk_path_find(struct bch_sb_handle *, const char *); -+int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *); -+void bch2_disk_path_to_text(struct printbuf *, struct bch_sb_handle *, -+ unsigned); -+ -+int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *); -+void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, u64); -+ -+int bch2_sb_disk_groups_to_cpu(struct bch_fs *); -+ -+int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *); -+ -+const char *bch2_sb_validate_disk_groups(struct bch_sb *, -+ struct 
bch_sb_field *); -+ -+#endif /* _BCACHEFS_DISK_GROUPS_H */ -diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c -new file mode 100644 -index 000000000000..8c7e9cb74888 ---- /dev/null -+++ b/fs/bcachefs/ec.c -@@ -0,0 +1,1368 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+/* erasure coding */ -+ -+#include "bcachefs.h" -+#include "alloc_foreground.h" -+#include "bkey_on_stack.h" -+#include "bset.h" -+#include "btree_gc.h" -+#include "btree_update.h" -+#include "buckets.h" -+#include "disk_groups.h" -+#include "ec.h" -+#include "error.h" -+#include "io.h" -+#include "keylist.h" -+#include "recovery.h" -+#include "super-io.h" -+#include "util.h" -+ -+#include -+ -+#ifdef __KERNEL__ -+ -+#include -+#include -+ -+static void raid5_recov(unsigned disks, unsigned failed_idx, -+ size_t size, void **data) -+{ -+ unsigned i = 2, nr; -+ -+ BUG_ON(failed_idx >= disks); -+ -+ swap(data[0], data[failed_idx]); -+ memcpy(data[0], data[1], size); -+ -+ while (i < disks) { -+ nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS); -+ xor_blocks(nr, size, data[0], data + i); -+ i += nr; -+ } -+ -+ swap(data[0], data[failed_idx]); -+} -+ -+static void raid_gen(int nd, int np, size_t size, void **v) -+{ -+ if (np >= 1) -+ raid5_recov(nd + np, nd, size, v); -+ if (np >= 2) -+ raid6_call.gen_syndrome(nd + np, size, v); -+ BUG_ON(np > 2); -+} -+ -+static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v) -+{ -+ switch (nr) { -+ case 0: -+ break; -+ case 1: -+ if (ir[0] < nd + 1) -+ raid5_recov(nd + 1, ir[0], size, v); -+ else -+ raid6_call.gen_syndrome(nd + np, size, v); -+ break; -+ case 2: -+ if (ir[1] < nd) { -+ /* data+data failure. 
*/ -+ raid6_2data_recov(nd + np, size, ir[0], ir[1], v); -+ } else if (ir[0] < nd) { -+ /* data + p/q failure */ -+ -+ if (ir[1] == nd) /* data + p failure */ -+ raid6_datap_recov(nd + np, size, ir[0], v); -+ else { /* data + q failure */ -+ raid5_recov(nd + 1, ir[0], size, v); -+ raid6_call.gen_syndrome(nd + np, size, v); -+ } -+ } else { -+ raid_gen(nd, np, size, v); -+ } -+ break; -+ default: -+ BUG(); -+ } -+} -+ -+#else -+ -+#include -+ -+#endif -+ -+struct ec_bio { -+ struct bch_dev *ca; -+ struct ec_stripe_buf *buf; -+ size_t idx; -+ struct bio bio; -+}; -+ -+/* Stripes btree keys: */ -+ -+const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k) -+{ -+ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; -+ -+ if (k.k->p.inode) -+ return "invalid stripe key"; -+ -+ if (bkey_val_bytes(k.k) < sizeof(*s)) -+ return "incorrect value size"; -+ -+ if (bkey_val_bytes(k.k) < sizeof(*s) || -+ bkey_val_u64s(k.k) < stripe_val_u64s(s)) -+ return "incorrect value size"; -+ -+ return bch2_bkey_ptrs_invalid(c, k); -+} -+ -+void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; -+ unsigned i; -+ -+ pr_buf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u", -+ s->algorithm, -+ le16_to_cpu(s->sectors), -+ s->nr_blocks - s->nr_redundant, -+ s->nr_redundant, -+ s->csum_type, -+ 1U << s->csum_granularity_bits); -+ -+ for (i = 0; i < s->nr_blocks; i++) -+ pr_buf(out, " %u:%llu:%u", s->ptrs[i].dev, -+ (u64) s->ptrs[i].offset, -+ stripe_blockcount_get(s, i)); -+} -+ -+static int ptr_matches_stripe(struct bch_fs *c, -+ struct bch_stripe *v, -+ const struct bch_extent_ptr *ptr) -+{ -+ unsigned i; -+ -+ for (i = 0; i < v->nr_blocks - v->nr_redundant; i++) { -+ const struct bch_extent_ptr *ptr2 = v->ptrs + i; -+ -+ if (ptr->dev == ptr2->dev && -+ ptr->gen == ptr2->gen && -+ ptr->offset >= ptr2->offset && -+ ptr->offset < ptr2->offset + le16_to_cpu(v->sectors)) -+ return 
i; -+ } -+ -+ return -1; -+} -+ -+static int extent_matches_stripe(struct bch_fs *c, -+ struct bch_stripe *v, -+ struct bkey_s_c k) -+{ -+ -+ switch (k.k->type) { -+ case KEY_TYPE_extent: { -+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); -+ const struct bch_extent_ptr *ptr; -+ int idx; -+ -+ extent_for_each_ptr(e, ptr) { -+ idx = ptr_matches_stripe(c, v, ptr); -+ if (idx >= 0) -+ return idx; -+ } -+ break; -+ } -+ } -+ -+ return -1; -+} -+ -+static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) -+{ -+ switch (k.k->type) { -+ case KEY_TYPE_extent: { -+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); -+ const union bch_extent_entry *entry; -+ -+ extent_for_each_entry(e, entry) -+ if (extent_entry_type(entry) == -+ BCH_EXTENT_ENTRY_stripe_ptr && -+ entry->stripe_ptr.idx == idx) -+ return true; -+ -+ break; -+ } -+ } -+ -+ return false; -+} -+ -+static void ec_stripe_key_init(struct bch_fs *c, -+ struct bkey_i_stripe *s, -+ struct open_buckets *blocks, -+ struct open_buckets *parity, -+ unsigned stripe_size) -+{ -+ struct open_bucket *ob; -+ unsigned i, u64s; -+ -+ bkey_stripe_init(&s->k_i); -+ s->v.sectors = cpu_to_le16(stripe_size); -+ s->v.algorithm = 0; -+ s->v.nr_blocks = parity->nr + blocks->nr; -+ s->v.nr_redundant = parity->nr; -+ s->v.csum_granularity_bits = ilog2(c->sb.encoded_extent_max); -+ s->v.csum_type = BCH_CSUM_CRC32C; -+ s->v.pad = 0; -+ -+ open_bucket_for_each(c, blocks, ob, i) -+ s->v.ptrs[i] = ob->ptr; -+ -+ open_bucket_for_each(c, parity, ob, i) -+ s->v.ptrs[blocks->nr + i] = ob->ptr; -+ -+ while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) { -+ BUG_ON(1 << s->v.csum_granularity_bits >= -+ le16_to_cpu(s->v.sectors) || -+ s->v.csum_granularity_bits == U8_MAX); -+ s->v.csum_granularity_bits++; -+ } -+ -+ set_bkey_val_u64s(&s->k, u64s); -+} -+ -+/* Checksumming: */ -+ -+static void ec_generate_checksums(struct ec_stripe_buf *buf) -+{ -+ struct bch_stripe *v = &buf->key.v; -+ unsigned csum_granularity = 1 << 
v->csum_granularity_bits; -+ unsigned csums_per_device = stripe_csums_per_device(v); -+ unsigned csum_bytes = bch_crc_bytes[v->csum_type]; -+ unsigned i, j; -+ -+ if (!csum_bytes) -+ return; -+ -+ BUG_ON(buf->offset); -+ BUG_ON(buf->size != le16_to_cpu(v->sectors)); -+ -+ for (i = 0; i < v->nr_blocks; i++) { -+ for (j = 0; j < csums_per_device; j++) { -+ unsigned offset = j << v->csum_granularity_bits; -+ unsigned len = min(csum_granularity, buf->size - offset); -+ -+ struct bch_csum csum = -+ bch2_checksum(NULL, v->csum_type, -+ null_nonce(), -+ buf->data[i] + (offset << 9), -+ len << 9); -+ -+ memcpy(stripe_csum(v, i, j), &csum, csum_bytes); -+ } -+ } -+} -+ -+static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) -+{ -+ struct bch_stripe *v = &buf->key.v; -+ unsigned csum_granularity = 1 << v->csum_granularity_bits; -+ unsigned csum_bytes = bch_crc_bytes[v->csum_type]; -+ unsigned i; -+ -+ if (!csum_bytes) -+ return; -+ -+ for (i = 0; i < v->nr_blocks; i++) { -+ unsigned offset = buf->offset; -+ unsigned end = buf->offset + buf->size; -+ -+ if (!test_bit(i, buf->valid)) -+ continue; -+ -+ while (offset < end) { -+ unsigned j = offset >> v->csum_granularity_bits; -+ unsigned len = min(csum_granularity, end - offset); -+ struct bch_csum csum; -+ -+ BUG_ON(offset & (csum_granularity - 1)); -+ BUG_ON(offset + len != le16_to_cpu(v->sectors) && -+ ((offset + len) & (csum_granularity - 1))); -+ -+ csum = bch2_checksum(NULL, v->csum_type, -+ null_nonce(), -+ buf->data[i] + ((offset - buf->offset) << 9), -+ len << 9); -+ -+ if (memcmp(stripe_csum(v, i, j), &csum, csum_bytes)) { -+ __bcache_io_error(c, -+ "checksum error while doing reconstruct read (%u:%u)", -+ i, j); -+ clear_bit(i, buf->valid); -+ break; -+ } -+ -+ offset += len; -+ } -+ } -+} -+ -+/* Erasure coding: */ -+ -+static void ec_generate_ec(struct ec_stripe_buf *buf) -+{ -+ struct bch_stripe *v = &buf->key.v; -+ unsigned nr_data = v->nr_blocks - v->nr_redundant; -+ unsigned bytes = 
le16_to_cpu(v->sectors) << 9; -+ -+ raid_gen(nr_data, v->nr_redundant, bytes, buf->data); -+} -+ -+static unsigned __ec_nr_failed(struct ec_stripe_buf *buf, unsigned nr) -+{ -+ return nr - bitmap_weight(buf->valid, nr); -+} -+ -+static unsigned ec_nr_failed(struct ec_stripe_buf *buf) -+{ -+ return __ec_nr_failed(buf, buf->key.v.nr_blocks); -+} -+ -+static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) -+{ -+ struct bch_stripe *v = &buf->key.v; -+ unsigned i, failed[EC_STRIPE_MAX], nr_failed = 0; -+ unsigned nr_data = v->nr_blocks - v->nr_redundant; -+ unsigned bytes = buf->size << 9; -+ -+ if (ec_nr_failed(buf) > v->nr_redundant) { -+ __bcache_io_error(c, -+ "error doing reconstruct read: unable to read enough blocks"); -+ return -1; -+ } -+ -+ for (i = 0; i < nr_data; i++) -+ if (!test_bit(i, buf->valid)) -+ failed[nr_failed++] = i; -+ -+ raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data); -+ return 0; -+} -+ -+/* IO: */ -+ -+static void ec_block_endio(struct bio *bio) -+{ -+ struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio); -+ struct bch_dev *ca = ec_bio->ca; -+ struct closure *cl = bio->bi_private; -+ -+ if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s: %s", -+ bio_data_dir(bio) ? 
"write" : "read", -+ blk_status_to_str(bio->bi_status))) -+ clear_bit(ec_bio->idx, ec_bio->buf->valid); -+ -+ bio_put(&ec_bio->bio); -+ percpu_ref_put(&ca->io_ref); -+ closure_put(cl); -+} -+ -+static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, -+ unsigned rw, unsigned idx, struct closure *cl) -+{ -+ struct bch_stripe *v = &buf->key.v; -+ unsigned offset = 0, bytes = buf->size << 9; -+ struct bch_extent_ptr *ptr = &v->ptrs[idx]; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ -+ if (!bch2_dev_get_ioref(ca, rw)) { -+ clear_bit(idx, buf->valid); -+ return; -+ } -+ -+ while (offset < bytes) { -+ unsigned nr_iovecs = min_t(size_t, BIO_MAX_PAGES, -+ DIV_ROUND_UP(bytes, PAGE_SIZE)); -+ unsigned b = min_t(size_t, bytes - offset, -+ nr_iovecs << PAGE_SHIFT); -+ struct ec_bio *ec_bio; -+ -+ ec_bio = container_of(bio_alloc_bioset(GFP_KERNEL, nr_iovecs, -+ &c->ec_bioset), -+ struct ec_bio, bio); -+ -+ ec_bio->ca = ca; -+ ec_bio->buf = buf; -+ ec_bio->idx = idx; -+ -+ bio_set_dev(&ec_bio->bio, ca->disk_sb.bdev); -+ bio_set_op_attrs(&ec_bio->bio, rw, 0); -+ -+ ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9); -+ ec_bio->bio.bi_end_io = ec_block_endio; -+ ec_bio->bio.bi_private = cl; -+ -+ bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b); -+ -+ closure_get(cl); -+ percpu_ref_get(&ca->io_ref); -+ -+ submit_bio(&ec_bio->bio); -+ -+ offset += b; -+ } -+ -+ percpu_ref_put(&ca->io_ref); -+} -+ -+/* recovery read path: */ -+int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct ec_stripe_buf *buf; -+ struct closure cl; -+ struct bkey_s_c k; -+ struct bch_stripe *v; -+ unsigned stripe_idx; -+ unsigned offset, end; -+ unsigned i, nr_data, csum_granularity; -+ int ret = 0, idx; -+ -+ closure_init_stack(&cl); -+ -+ BUG_ON(!rbio->pick.has_ec); -+ -+ stripe_idx = rbio->pick.ec.idx; -+ -+ buf = kzalloc(sizeof(*buf), GFP_NOIO); -+ if (!buf) -+ 
return -ENOMEM; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, -+ POS(0, stripe_idx), -+ BTREE_ITER_SLOTS); -+ k = bch2_btree_iter_peek_slot(iter); -+ if (bkey_err(k) || k.k->type != KEY_TYPE_stripe) { -+ __bcache_io_error(c, -+ "error doing reconstruct read: stripe not found"); -+ kfree(buf); -+ return bch2_trans_exit(&trans) ?: -EIO; -+ } -+ -+ bkey_reassemble(&buf->key.k_i, k); -+ bch2_trans_exit(&trans); -+ -+ v = &buf->key.v; -+ -+ nr_data = v->nr_blocks - v->nr_redundant; -+ -+ idx = ptr_matches_stripe(c, v, &rbio->pick.ptr); -+ BUG_ON(idx < 0); -+ -+ csum_granularity = 1U << v->csum_granularity_bits; -+ -+ offset = rbio->bio.bi_iter.bi_sector - v->ptrs[idx].offset; -+ end = offset + bio_sectors(&rbio->bio); -+ -+ BUG_ON(end > le16_to_cpu(v->sectors)); -+ -+ buf->offset = round_down(offset, csum_granularity); -+ buf->size = min_t(unsigned, le16_to_cpu(v->sectors), -+ round_up(end, csum_granularity)) - buf->offset; -+ -+ for (i = 0; i < v->nr_blocks; i++) { -+ buf->data[i] = kmalloc(buf->size << 9, GFP_NOIO); -+ if (!buf->data[i]) { -+ ret = -ENOMEM; -+ goto err; -+ } -+ } -+ -+ memset(buf->valid, 0xFF, sizeof(buf->valid)); -+ -+ for (i = 0; i < v->nr_blocks; i++) { -+ struct bch_extent_ptr *ptr = v->ptrs + i; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ -+ if (ptr_stale(ca, ptr)) { -+ __bcache_io_error(c, -+ "error doing reconstruct read: stale pointer"); -+ clear_bit(i, buf->valid); -+ continue; -+ } -+ -+ ec_block_io(c, buf, REQ_OP_READ, i, &cl); -+ } -+ -+ closure_sync(&cl); -+ -+ if (ec_nr_failed(buf) > v->nr_redundant) { -+ __bcache_io_error(c, -+ "error doing reconstruct read: unable to read enough blocks"); -+ ret = -EIO; -+ goto err; -+ } -+ -+ ec_validate_checksums(c, buf); -+ -+ ret = ec_do_recov(c, buf); -+ if (ret) -+ goto err; -+ -+ memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter, -+ buf->data[idx] + ((offset - buf->offset) << 9)); -+err: -+ for (i = 0; i < v->nr_blocks; i++) -+ 
kfree(buf->data[i]); -+ kfree(buf); -+ return ret; -+} -+ -+/* stripe bucket accounting: */ -+ -+static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) -+{ -+ ec_stripes_heap n, *h = &c->ec_stripes_heap; -+ -+ if (idx >= h->size) { -+ if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp)) -+ return -ENOMEM; -+ -+ spin_lock(&c->ec_stripes_heap_lock); -+ if (n.size > h->size) { -+ memcpy(n.data, h->data, h->used * sizeof(h->data[0])); -+ n.used = h->used; -+ swap(*h, n); -+ } -+ spin_unlock(&c->ec_stripes_heap_lock); -+ -+ free_heap(&n); -+ } -+ -+ if (!genradix_ptr_alloc(&c->stripes[0], idx, gfp)) -+ return -ENOMEM; -+ -+ if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING && -+ !genradix_ptr_alloc(&c->stripes[1], idx, gfp)) -+ return -ENOMEM; -+ -+ return 0; -+} -+ -+static int ec_stripe_mem_alloc(struct bch_fs *c, -+ struct btree_iter *iter) -+{ -+ size_t idx = iter->pos.offset; -+ int ret = 0; -+ -+ if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT|__GFP_NOWARN)) -+ return ret; -+ -+ bch2_trans_unlock(iter->trans); -+ ret = -EINTR; -+ -+ if (!__ec_stripe_mem_alloc(c, idx, GFP_KERNEL)) -+ return ret; -+ -+ return -ENOMEM; -+} -+ -+static ssize_t stripe_idx_to_delete(struct bch_fs *c) -+{ -+ ec_stripes_heap *h = &c->ec_stripes_heap; -+ -+ return h->used && h->data[0].blocks_nonempty == 0 -+ ? 
h->data[0].idx : -1; -+} -+ -+static inline int ec_stripes_heap_cmp(ec_stripes_heap *h, -+ struct ec_stripe_heap_entry l, -+ struct ec_stripe_heap_entry r) -+{ -+ return ((l.blocks_nonempty > r.blocks_nonempty) - -+ (l.blocks_nonempty < r.blocks_nonempty)); -+} -+ -+static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h, -+ size_t i) -+{ -+ struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap); -+ -+ genradix_ptr(&c->stripes[0], h->data[i].idx)->heap_idx = i; -+} -+ -+static void heap_verify_backpointer(struct bch_fs *c, size_t idx) -+{ -+ ec_stripes_heap *h = &c->ec_stripes_heap; -+ struct stripe *m = genradix_ptr(&c->stripes[0], idx); -+ -+ BUG_ON(!m->alive); -+ BUG_ON(m->heap_idx >= h->used); -+ BUG_ON(h->data[m->heap_idx].idx != idx); -+} -+ -+void bch2_stripes_heap_update(struct bch_fs *c, -+ struct stripe *m, size_t idx) -+{ -+ ec_stripes_heap *h = &c->ec_stripes_heap; -+ size_t i; -+ -+ if (m->alive) { -+ heap_verify_backpointer(c, idx); -+ -+ h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty; -+ -+ i = m->heap_idx; -+ heap_sift_up(h, i, ec_stripes_heap_cmp, -+ ec_stripes_heap_set_backpointer); -+ heap_sift_down(h, i, ec_stripes_heap_cmp, -+ ec_stripes_heap_set_backpointer); -+ -+ heap_verify_backpointer(c, idx); -+ } else { -+ bch2_stripes_heap_insert(c, m, idx); -+ } -+ -+ if (stripe_idx_to_delete(c) >= 0 && -+ !percpu_ref_is_dying(&c->writes)) -+ schedule_work(&c->ec_stripe_delete_work); -+} -+ -+void bch2_stripes_heap_del(struct bch_fs *c, -+ struct stripe *m, size_t idx) -+{ -+ heap_verify_backpointer(c, idx); -+ -+ m->alive = false; -+ heap_del(&c->ec_stripes_heap, m->heap_idx, -+ ec_stripes_heap_cmp, -+ ec_stripes_heap_set_backpointer); -+} -+ -+void bch2_stripes_heap_insert(struct bch_fs *c, -+ struct stripe *m, size_t idx) -+{ -+ BUG_ON(heap_full(&c->ec_stripes_heap)); -+ -+ heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) { -+ .idx = idx, -+ .blocks_nonempty = m->blocks_nonempty, -+ }), -+ 
ec_stripes_heap_cmp, -+ ec_stripes_heap_set_backpointer); -+ m->alive = true; -+ -+ heap_verify_backpointer(c, idx); -+} -+ -+/* stripe deletion */ -+ -+static int ec_stripe_delete(struct bch_fs *c, size_t idx) -+{ -+ return bch2_btree_delete_range(c, BTREE_ID_EC, -+ POS(0, idx), -+ POS(0, idx + 1), -+ NULL); -+} -+ -+static void ec_stripe_delete_work(struct work_struct *work) -+{ -+ struct bch_fs *c = -+ container_of(work, struct bch_fs, ec_stripe_delete_work); -+ ssize_t idx; -+ -+ down_read(&c->gc_lock); -+ mutex_lock(&c->ec_stripe_create_lock); -+ -+ while (1) { -+ spin_lock(&c->ec_stripes_heap_lock); -+ idx = stripe_idx_to_delete(c); -+ spin_unlock(&c->ec_stripes_heap_lock); -+ -+ if (idx < 0) -+ break; -+ -+ if (ec_stripe_delete(c, idx)) -+ break; -+ } -+ -+ mutex_unlock(&c->ec_stripe_create_lock); -+ up_read(&c->gc_lock); -+} -+ -+/* stripe creation: */ -+ -+static int ec_stripe_bkey_insert(struct bch_fs *c, -+ struct bkey_i_stripe *stripe) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct bpos start_pos = POS(0, c->ec_stripe_hint); -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_EC, start_pos, -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { -+ if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) { -+ if (start_pos.offset) { -+ start_pos = POS_MIN; -+ bch2_btree_iter_set_pos(iter, start_pos); -+ continue; -+ } -+ -+ ret = -ENOSPC; -+ break; -+ } -+ -+ if (bkey_deleted(k.k)) -+ goto found_slot; -+ } -+ -+ goto err; -+found_slot: -+ start_pos = iter->pos; -+ -+ ret = ec_stripe_mem_alloc(c, iter); -+ if (ret) -+ goto err; -+ -+ stripe->k.p = iter->pos; -+ -+ bch2_trans_update(&trans, iter, &stripe->k_i, 0); -+ -+ ret = bch2_trans_commit(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL); -+err: -+ bch2_trans_iter_put(&trans, iter); -+ -+ if (ret == -EINTR) -+ goto retry; -+ -+ c->ec_stripe_hint = ret ? 
start_pos.offset : start_pos.offset + 1; -+ bch2_trans_exit(&trans); -+ -+ return ret; -+} -+ -+static void extent_stripe_ptr_add(struct bkey_s_extent e, -+ struct ec_stripe_buf *s, -+ struct bch_extent_ptr *ptr, -+ unsigned block) -+{ -+ struct bch_extent_stripe_ptr *dst = (void *) ptr; -+ union bch_extent_entry *end = extent_entry_last(e); -+ -+ memmove_u64s_up(dst + 1, dst, (u64 *) end - (u64 *) dst); -+ e.k->u64s += sizeof(*dst) / sizeof(u64); -+ -+ *dst = (struct bch_extent_stripe_ptr) { -+ .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr, -+ .block = block, -+ .idx = s->key.k.p.offset, -+ }; -+} -+ -+static int ec_stripe_update_ptrs(struct bch_fs *c, -+ struct ec_stripe_buf *s, -+ struct bkey *pos) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct bkey_s_extent e; -+ struct bkey_on_stack sk; -+ int ret = 0, dev, idx; -+ -+ bkey_on_stack_init(&sk); -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, -+ bkey_start_pos(pos), -+ BTREE_ITER_INTENT); -+ -+ while ((k = bch2_btree_iter_peek(iter)).k && -+ !(ret = bkey_err(k)) && -+ bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) { -+ struct bch_extent_ptr *ptr, *ec_ptr = NULL; -+ -+ if (extent_has_stripe_ptr(k, s->key.k.p.offset)) { -+ bch2_btree_iter_next(iter); -+ continue; -+ } -+ -+ idx = extent_matches_stripe(c, &s->key.v, k); -+ if (idx < 0) { -+ bch2_btree_iter_next(iter); -+ continue; -+ } -+ -+ dev = s->key.v.ptrs[idx].dev; -+ -+ bkey_on_stack_reassemble(&sk, c, k); -+ e = bkey_i_to_s_extent(sk.k); -+ -+ extent_for_each_ptr(e, ptr) { -+ if (ptr->dev == dev) -+ ec_ptr = ptr; -+ else -+ ptr->cached = true; -+ } -+ -+ extent_stripe_ptr_add(e, s, ec_ptr, idx); -+ -+ bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); -+ bch2_trans_update(&trans, iter, sk.k, 0); -+ -+ ret = bch2_trans_commit(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_USE_RESERVE); -+ if (ret == -EINTR) -+ ret = 0; -+ if (ret) -+ break; -+ 
} -+ -+ bch2_trans_exit(&trans); -+ bkey_on_stack_exit(&sk, c); -+ -+ return ret; -+} -+ -+/* -+ * data buckets of new stripe all written: create the stripe -+ */ -+static void ec_stripe_create(struct ec_stripe_new *s) -+{ -+ struct bch_fs *c = s->c; -+ struct open_bucket *ob; -+ struct bkey_i *k; -+ struct bch_stripe *v = &s->stripe.key.v; -+ unsigned i, nr_data = v->nr_blocks - v->nr_redundant; -+ struct closure cl; -+ int ret; -+ -+ BUG_ON(s->h->s == s); -+ -+ closure_init_stack(&cl); -+ -+ if (s->err) { -+ bch_err(c, "error creating stripe: error writing data buckets"); -+ goto err; -+ } -+ -+ if (!percpu_ref_tryget(&c->writes)) -+ goto err; -+ -+ BUG_ON(bitmap_weight(s->blocks_allocated, -+ s->blocks.nr) != s->blocks.nr); -+ -+ ec_generate_ec(&s->stripe); -+ -+ ec_generate_checksums(&s->stripe); -+ -+ /* write p/q: */ -+ for (i = nr_data; i < v->nr_blocks; i++) -+ ec_block_io(c, &s->stripe, REQ_OP_WRITE, i, &cl); -+ -+ closure_sync(&cl); -+ -+ for (i = nr_data; i < v->nr_blocks; i++) -+ if (!test_bit(i, s->stripe.valid)) { -+ bch_err(c, "error creating stripe: error writing redundancy buckets"); -+ goto err_put_writes; -+ } -+ -+ mutex_lock(&c->ec_stripe_create_lock); -+ -+ ret = ec_stripe_bkey_insert(c, &s->stripe.key); -+ if (ret) { -+ bch_err(c, "error creating stripe: error creating stripe key"); -+ goto err_unlock; -+ } -+ -+ for_each_keylist_key(&s->keys, k) { -+ ret = ec_stripe_update_ptrs(c, &s->stripe, &k->k); -+ if (ret) -+ break; -+ } -+ -+err_unlock: -+ mutex_unlock(&c->ec_stripe_create_lock); -+err_put_writes: -+ percpu_ref_put(&c->writes); -+err: -+ open_bucket_for_each(c, &s->blocks, ob, i) { -+ ob->ec = NULL; -+ __bch2_open_bucket_put(c, ob); -+ } -+ -+ bch2_open_buckets_put(c, &s->parity); -+ -+ bch2_keylist_free(&s->keys, s->inline_keys); -+ -+ mutex_lock(&s->h->lock); -+ list_del(&s->list); -+ mutex_unlock(&s->h->lock); -+ -+ for (i = 0; i < s->stripe.key.v.nr_blocks; i++) -+ kvpfree(s->stripe.data[i], s->stripe.size << 9); -+ kfree(s); -+} 
-+ -+static struct ec_stripe_new *ec_stripe_set_pending(struct ec_stripe_head *h) -+{ -+ struct ec_stripe_new *s = h->s; -+ -+ list_add(&s->list, &h->stripes); -+ h->s = NULL; -+ -+ return s; -+} -+ -+static void ec_stripe_new_put(struct ec_stripe_new *s) -+{ -+ BUG_ON(atomic_read(&s->pin) <= 0); -+ if (atomic_dec_and_test(&s->pin)) -+ ec_stripe_create(s); -+} -+ -+/* have a full bucket - hand it off to be erasure coded: */ -+void bch2_ec_bucket_written(struct bch_fs *c, struct open_bucket *ob) -+{ -+ struct ec_stripe_new *s = ob->ec; -+ -+ if (ob->sectors_free) -+ s->err = -1; -+ -+ ec_stripe_new_put(s); -+} -+ -+void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) -+{ -+ struct ec_stripe_new *s = ob->ec; -+ -+ s->err = -EIO; -+} -+ -+void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) -+{ -+ struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); -+ struct bch_dev *ca; -+ unsigned offset; -+ -+ if (!ob) -+ return NULL; -+ -+ ca = bch_dev_bkey_exists(c, ob->ptr.dev); -+ offset = ca->mi.bucket_size - ob->sectors_free; -+ -+ return ob->ec->stripe.data[ob->ec_idx] + (offset << 9); -+} -+ -+void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp, -+ struct bpos pos, unsigned sectors) -+{ -+ struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); -+ struct ec_stripe_new *ec; -+ -+ if (!ob) -+ return; -+ -+ ec = ob->ec; -+ mutex_lock(&ec->lock); -+ -+ if (bch2_keylist_realloc(&ec->keys, ec->inline_keys, -+ ARRAY_SIZE(ec->inline_keys), -+ BKEY_U64s)) { -+ BUG(); -+ } -+ -+ bkey_init(&ec->keys.top->k); -+ ec->keys.top->k.p = pos; -+ bch2_key_resize(&ec->keys.top->k, sectors); -+ bch2_keylist_push(&ec->keys); -+ -+ mutex_unlock(&ec->lock); -+} -+ -+static int unsigned_cmp(const void *_l, const void *_r) -+{ -+ unsigned l = *((const unsigned *) _l); -+ unsigned r = *((const unsigned *) _r); -+ -+ return cmp_int(l, r); -+} -+ -+/* pick most common bucket size: */ -+static unsigned pick_blocksize(struct bch_fs *c, -+ struct 
bch_devs_mask *devs) -+{ -+ struct bch_dev *ca; -+ unsigned i, nr = 0, sizes[BCH_SB_MEMBERS_MAX]; -+ struct { -+ unsigned nr, size; -+ } cur = { 0, 0 }, best = { 0, 0 }; -+ -+ for_each_member_device_rcu(ca, c, i, devs) -+ sizes[nr++] = ca->mi.bucket_size; -+ -+ sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL); -+ -+ for (i = 0; i < nr; i++) { -+ if (sizes[i] != cur.size) { -+ if (cur.nr > best.nr) -+ best = cur; -+ -+ cur.nr = 0; -+ cur.size = sizes[i]; -+ } -+ -+ cur.nr++; -+ } -+ -+ if (cur.nr > best.nr) -+ best = cur; -+ -+ return best.size; -+} -+ -+int bch2_ec_stripe_new_alloc(struct bch_fs *c, struct ec_stripe_head *h) -+{ -+ struct ec_stripe_new *s; -+ unsigned i; -+ -+ BUG_ON(h->parity.nr != h->redundancy); -+ BUG_ON(!h->blocks.nr); -+ BUG_ON(h->parity.nr + h->blocks.nr > EC_STRIPE_MAX); -+ lockdep_assert_held(&h->lock); -+ -+ s = kzalloc(sizeof(*s), GFP_KERNEL); -+ if (!s) -+ return -ENOMEM; -+ -+ mutex_init(&s->lock); -+ atomic_set(&s->pin, 1); -+ s->c = c; -+ s->h = h; -+ s->blocks = h->blocks; -+ s->parity = h->parity; -+ -+ memset(&h->blocks, 0, sizeof(h->blocks)); -+ memset(&h->parity, 0, sizeof(h->parity)); -+ -+ bch2_keylist_init(&s->keys, s->inline_keys); -+ -+ s->stripe.offset = 0; -+ s->stripe.size = h->blocksize; -+ memset(s->stripe.valid, 0xFF, sizeof(s->stripe.valid)); -+ -+ ec_stripe_key_init(c, &s->stripe.key, -+ &s->blocks, &s->parity, -+ h->blocksize); -+ -+ for (i = 0; i < s->stripe.key.v.nr_blocks; i++) { -+ s->stripe.data[i] = kvpmalloc(s->stripe.size << 9, GFP_KERNEL); -+ if (!s->stripe.data[i]) -+ goto err; -+ } -+ -+ h->s = s; -+ -+ return 0; -+err: -+ for (i = 0; i < s->stripe.key.v.nr_blocks; i++) -+ kvpfree(s->stripe.data[i], s->stripe.size << 9); -+ kfree(s); -+ return -ENOMEM; -+} -+ -+static struct ec_stripe_head * -+ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, -+ unsigned algo, unsigned redundancy) -+{ -+ struct ec_stripe_head *h; -+ struct bch_dev *ca; -+ unsigned i; -+ -+ h = kzalloc(sizeof(*h), 
GFP_KERNEL); -+ if (!h) -+ return NULL; -+ -+ mutex_init(&h->lock); -+ mutex_lock(&h->lock); -+ INIT_LIST_HEAD(&h->stripes); -+ -+ h->target = target; -+ h->algo = algo; -+ h->redundancy = redundancy; -+ -+ rcu_read_lock(); -+ h->devs = target_rw_devs(c, BCH_DATA_USER, target); -+ -+ for_each_member_device_rcu(ca, c, i, &h->devs) -+ if (!ca->mi.durability) -+ __clear_bit(i, h->devs.d); -+ -+ h->blocksize = pick_blocksize(c, &h->devs); -+ -+ for_each_member_device_rcu(ca, c, i, &h->devs) -+ if (ca->mi.bucket_size == h->blocksize) -+ h->nr_active_devs++; -+ -+ rcu_read_unlock(); -+ list_add(&h->list, &c->ec_new_stripe_list); -+ return h; -+} -+ -+void bch2_ec_stripe_head_put(struct ec_stripe_head *h) -+{ -+ struct ec_stripe_new *s = NULL; -+ -+ if (h->s && -+ bitmap_weight(h->s->blocks_allocated, -+ h->s->blocks.nr) == h->s->blocks.nr) -+ s = ec_stripe_set_pending(h); -+ -+ mutex_unlock(&h->lock); -+ -+ if (s) -+ ec_stripe_new_put(s); -+} -+ -+struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, -+ unsigned target, -+ unsigned algo, -+ unsigned redundancy) -+{ -+ struct ec_stripe_head *h; -+ -+ if (!redundancy) -+ return NULL; -+ -+ mutex_lock(&c->ec_new_stripe_lock); -+ list_for_each_entry(h, &c->ec_new_stripe_list, list) -+ if (h->target == target && -+ h->algo == algo && -+ h->redundancy == redundancy) { -+ mutex_lock(&h->lock); -+ goto found; -+ } -+ -+ h = ec_new_stripe_head_alloc(c, target, algo, redundancy); -+found: -+ mutex_unlock(&c->ec_new_stripe_lock); -+ return h; -+} -+ -+void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct ec_stripe_head *h; -+ struct open_bucket *ob; -+ unsigned i; -+ -+ mutex_lock(&c->ec_new_stripe_lock); -+ list_for_each_entry(h, &c->ec_new_stripe_list, list) { -+ struct ec_stripe_new *s = NULL; -+ -+ mutex_lock(&h->lock); -+ bch2_open_buckets_stop_dev(c, ca, &h->blocks); -+ bch2_open_buckets_stop_dev(c, ca, &h->parity); -+ -+ if (!h->s) -+ goto unlock; -+ -+ open_bucket_for_each(c, &h->s->blocks, 
ob, i) -+ if (ob->ptr.dev == ca->dev_idx) -+ goto found; -+ open_bucket_for_each(c, &h->s->parity, ob, i) -+ if (ob->ptr.dev == ca->dev_idx) -+ goto found; -+ goto unlock; -+found: -+ h->s->err = -1; -+ s = ec_stripe_set_pending(h); -+unlock: -+ mutex_unlock(&h->lock); -+ -+ if (s) -+ ec_stripe_new_put(s); -+ } -+ mutex_unlock(&c->ec_new_stripe_lock); -+} -+ -+static int __bch2_stripe_write_key(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct stripe *m, -+ size_t idx, -+ struct bkey_i_stripe *new_key) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_s_c k; -+ unsigned i; -+ int ret; -+ -+ bch2_btree_iter_set_pos(iter, POS(0, idx)); -+ -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) -+ return ret; -+ -+ if (k.k->type != KEY_TYPE_stripe) -+ return -EIO; -+ -+ bkey_reassemble(&new_key->k_i, k); -+ -+ spin_lock(&c->ec_stripes_heap_lock); -+ -+ for (i = 0; i < new_key->v.nr_blocks; i++) -+ stripe_blockcount_set(&new_key->v, i, -+ m->block_sectors[i]); -+ m->dirty = false; -+ -+ spin_unlock(&c->ec_stripes_heap_lock); -+ -+ bch2_trans_update(trans, iter, &new_key->k_i, 0); -+ return 0; -+} -+ -+int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct genradix_iter giter; -+ struct bkey_i_stripe *new_key; -+ struct stripe *m; -+ int ret = 0; -+ -+ new_key = kmalloc(255 * sizeof(u64), GFP_KERNEL); -+ BUG_ON(!new_key); -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN, -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); -+ -+ genradix_for_each(&c->stripes[0], giter, m) { -+ if (!m->dirty) -+ continue; -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL|flags, -+ __bch2_stripe_write_key(&trans, iter, m, -+ giter.pos, new_key)); -+ -+ if (ret) -+ break; -+ -+ *wrote = true; -+ } -+ -+ bch2_trans_exit(&trans); -+ -+ kfree(new_key); -+ -+ return ret; -+} -+ -+static int 
bch2_stripes_read_fn(struct bch_fs *c, enum btree_id id, -+ unsigned level, struct bkey_s_c k) -+{ -+ int ret = 0; -+ -+ if (k.k->type == KEY_TYPE_stripe) -+ ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?: -+ bch2_mark_key(c, k, 0, 0, NULL, 0, -+ BTREE_TRIGGER_ALLOC_READ| -+ BTREE_TRIGGER_NOATOMIC); -+ -+ return ret; -+} -+ -+int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) -+{ -+ int ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_EC, -+ NULL, bch2_stripes_read_fn); -+ if (ret) -+ bch_err(c, "error reading stripes: %i", ret); -+ -+ return ret; -+} -+ -+int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ size_t i, idx = 0; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, U64_MAX), 0); -+ -+ k = bch2_btree_iter_prev(iter); -+ if (!IS_ERR_OR_NULL(k.k)) -+ idx = k.k->p.offset + 1; -+ ret = bch2_trans_exit(&trans); -+ if (ret) -+ return ret; -+ -+ if (!idx) -+ return 0; -+ -+ if (!gc && -+ !init_heap(&c->ec_stripes_heap, roundup_pow_of_two(idx), -+ GFP_KERNEL)) -+ return -ENOMEM; -+#if 0 -+ ret = genradix_prealloc(&c->stripes[gc], idx, GFP_KERNEL); -+#else -+ for (i = 0; i < idx; i++) -+ if (!genradix_ptr_alloc(&c->stripes[gc], i, GFP_KERNEL)) -+ return -ENOMEM; -+#endif -+ return 0; -+} -+ -+void bch2_fs_ec_exit(struct bch_fs *c) -+{ -+ struct ec_stripe_head *h; -+ -+ while (1) { -+ mutex_lock(&c->ec_new_stripe_lock); -+ h = list_first_entry_or_null(&c->ec_new_stripe_list, -+ struct ec_stripe_head, list); -+ if (h) -+ list_del(&h->list); -+ mutex_unlock(&c->ec_new_stripe_lock); -+ if (!h) -+ break; -+ -+ BUG_ON(h->s); -+ BUG_ON(!list_empty(&h->stripes)); -+ kfree(h); -+ } -+ -+ free_heap(&c->ec_stripes_heap); -+ genradix_free(&c->stripes[0]); -+ bioset_exit(&c->ec_bioset); -+} -+ -+int bch2_fs_ec_init(struct bch_fs *c) -+{ -+ INIT_WORK(&c->ec_stripe_delete_work, 
ec_stripe_delete_work); -+ -+ return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio), -+ BIOSET_NEED_BVECS); -+} -diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h -new file mode 100644 -index 000000000000..4dfaac034886 ---- /dev/null -+++ b/fs/bcachefs/ec.h -@@ -0,0 +1,163 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_EC_H -+#define _BCACHEFS_EC_H -+ -+#include "ec_types.h" -+#include "keylist_types.h" -+ -+const char *bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c); -+void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, -+ struct bkey_s_c); -+ -+#define bch2_bkey_ops_stripe (struct bkey_ops) { \ -+ .key_invalid = bch2_stripe_invalid, \ -+ .val_to_text = bch2_stripe_to_text, \ -+ .swab = bch2_ptr_swab, \ -+} -+ -+static inline unsigned stripe_csums_per_device(const struct bch_stripe *s) -+{ -+ return DIV_ROUND_UP(le16_to_cpu(s->sectors), -+ 1 << s->csum_granularity_bits); -+} -+ -+static inline unsigned stripe_csum_offset(const struct bch_stripe *s, -+ unsigned dev, unsigned csum_idx) -+{ -+ unsigned csum_bytes = bch_crc_bytes[s->csum_type]; -+ -+ return sizeof(struct bch_stripe) + -+ sizeof(struct bch_extent_ptr) * s->nr_blocks + -+ (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes; -+} -+ -+static inline unsigned stripe_blockcount_offset(const struct bch_stripe *s, -+ unsigned idx) -+{ -+ return stripe_csum_offset(s, s->nr_blocks, 0) + -+ sizeof(u16) * idx; -+} -+ -+static inline unsigned stripe_blockcount_get(const struct bch_stripe *s, -+ unsigned idx) -+{ -+ return le16_to_cpup((void *) s + stripe_blockcount_offset(s, idx)); -+} -+ -+static inline void stripe_blockcount_set(struct bch_stripe *s, -+ unsigned idx, unsigned v) -+{ -+ __le16 *p = (void *) s + stripe_blockcount_offset(s, idx); -+ -+ *p = cpu_to_le16(v); -+} -+ -+static inline unsigned stripe_val_u64s(const struct bch_stripe *s) -+{ -+ return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks), -+ sizeof(u64)); -+} -+ -+static inline void 
*stripe_csum(struct bch_stripe *s, -+ unsigned dev, unsigned csum_idx) -+{ -+ return (void *) s + stripe_csum_offset(s, dev, csum_idx); -+} -+ -+struct bch_read_bio; -+ -+struct ec_stripe_buf { -+ /* might not be buffering the entire stripe: */ -+ unsigned offset; -+ unsigned size; -+ unsigned long valid[BITS_TO_LONGS(EC_STRIPE_MAX)]; -+ -+ void *data[EC_STRIPE_MAX]; -+ -+ union { -+ struct bkey_i_stripe key; -+ u64 pad[255]; -+ }; -+}; -+ -+struct ec_stripe_head; -+ -+struct ec_stripe_new { -+ struct bch_fs *c; -+ struct ec_stripe_head *h; -+ struct mutex lock; -+ struct list_head list; -+ -+ /* counts in flight writes, stripe is created when pin == 0 */ -+ atomic_t pin; -+ -+ int err; -+ -+ unsigned long blocks_allocated[BITS_TO_LONGS(EC_STRIPE_MAX)]; -+ -+ struct open_buckets blocks; -+ struct open_buckets parity; -+ -+ struct keylist keys; -+ u64 inline_keys[BKEY_U64s * 8]; -+ -+ struct ec_stripe_buf stripe; -+}; -+ -+struct ec_stripe_head { -+ struct list_head list; -+ struct mutex lock; -+ -+ struct list_head stripes; -+ -+ unsigned target; -+ unsigned algo; -+ unsigned redundancy; -+ -+ struct bch_devs_mask devs; -+ unsigned nr_active_devs; -+ -+ unsigned blocksize; -+ -+ struct dev_stripe_state block_stripe; -+ struct dev_stripe_state parity_stripe; -+ -+ struct open_buckets blocks; -+ struct open_buckets parity; -+ -+ struct ec_stripe_new *s; -+}; -+ -+int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *); -+ -+void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); -+void bch2_ec_add_backpointer(struct bch_fs *, struct write_point *, -+ struct bpos, unsigned); -+ -+void bch2_ec_bucket_written(struct bch_fs *, struct open_bucket *); -+void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); -+ -+int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); -+ -+void bch2_ec_stripe_head_put(struct ec_stripe_head *); -+struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, unsigned, -+ unsigned, unsigned); 
-+ -+void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t); -+void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t); -+void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t); -+ -+void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); -+ -+void bch2_ec_flush_new_stripes(struct bch_fs *); -+ -+struct journal_keys; -+int bch2_stripes_read(struct bch_fs *, struct journal_keys *); -+int bch2_stripes_write(struct bch_fs *, unsigned, bool *); -+ -+int bch2_ec_mem_alloc(struct bch_fs *, bool); -+ -+void bch2_fs_ec_exit(struct bch_fs *); -+int bch2_fs_ec_init(struct bch_fs *); -+ -+#endif /* _BCACHEFS_EC_H */ -diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h -new file mode 100644 -index 000000000000..5c3f77c8aac7 ---- /dev/null -+++ b/fs/bcachefs/ec_types.h -@@ -0,0 +1,38 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_EC_TYPES_H -+#define _BCACHEFS_EC_TYPES_H -+ -+#include -+ -+#define EC_STRIPE_MAX 16 -+ -+struct bch_replicas_padded { -+ struct bch_replicas_entry e; -+ u8 pad[EC_STRIPE_MAX]; -+}; -+ -+struct stripe { -+ size_t heap_idx; -+ -+ u16 sectors; -+ u8 algorithm; -+ -+ u8 nr_blocks; -+ u8 nr_redundant; -+ -+ unsigned alive:1; -+ unsigned dirty:1; -+ u8 blocks_nonempty; -+ u16 block_sectors[EC_STRIPE_MAX]; -+ -+ struct bch_replicas_padded r; -+}; -+ -+struct ec_stripe_heap_entry { -+ size_t idx; -+ unsigned blocks_nonempty; -+}; -+ -+typedef HEAP(struct ec_stripe_heap_entry) ec_stripes_heap; -+ -+#endif /* _BCACHEFS_EC_TYPES_H */ -diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c -new file mode 100644 -index 000000000000..cd46706fb6f5 ---- /dev/null -+++ b/fs/bcachefs/error.c -@@ -0,0 +1,172 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "error.h" -+#include "io.h" -+#include "super.h" -+ -+#define FSCK_ERR_RATELIMIT_NR 10 -+ -+bool bch2_inconsistent_error(struct bch_fs *c) -+{ -+ set_bit(BCH_FS_ERROR, &c->flags); -+ -+ switch (c->opts.errors) { -+ 
case BCH_ON_ERROR_CONTINUE: -+ return false; -+ case BCH_ON_ERROR_RO: -+ if (bch2_fs_emergency_read_only(c)) -+ bch_err(c, "emergency read only"); -+ return true; -+ case BCH_ON_ERROR_PANIC: -+ panic(bch2_fmt(c, "panic after error")); -+ return true; -+ default: -+ BUG(); -+ } -+} -+ -+void bch2_fatal_error(struct bch_fs *c) -+{ -+ if (bch2_fs_emergency_read_only(c)) -+ bch_err(c, "emergency read only"); -+} -+ -+void bch2_io_error_work(struct work_struct *work) -+{ -+ struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work); -+ struct bch_fs *c = ca->fs; -+ bool dev; -+ -+ down_write(&c->state_lock); -+ dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_RO, -+ BCH_FORCE_IF_DEGRADED); -+ if (dev -+ ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_RO, -+ BCH_FORCE_IF_DEGRADED) -+ : bch2_fs_emergency_read_only(c)) -+ bch_err(ca, -+ "too many IO errors, setting %s RO", -+ dev ? "device" : "filesystem"); -+ up_write(&c->state_lock); -+} -+ -+void bch2_io_error(struct bch_dev *ca) -+{ -+ //queue_work(system_long_wq, &ca->io_error_work); -+} -+ -+#ifdef __KERNEL__ -+#define ask_yn() false -+#else -+#include "tools-util.h" -+#endif -+ -+enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags, -+ const char *fmt, ...) -+{ -+ struct fsck_err_state *s = NULL; -+ va_list args; -+ bool fix = false, print = true, suppressing = false; -+ char _buf[sizeof(s->buf)], *buf = _buf; -+ -+ if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) { -+ va_start(args, fmt); -+ vprintk(fmt, args); -+ va_end(args); -+ -+ return bch2_inconsistent_error(c) -+ ? 
FSCK_ERR_EXIT -+ : FSCK_ERR_FIX; -+ } -+ -+ mutex_lock(&c->fsck_error_lock); -+ -+ list_for_each_entry(s, &c->fsck_errors, list) -+ if (s->fmt == fmt) -+ goto found; -+ -+ s = kzalloc(sizeof(*s), GFP_NOFS); -+ if (!s) { -+ if (!c->fsck_alloc_err) -+ bch_err(c, "kmalloc err, cannot ratelimit fsck errs"); -+ c->fsck_alloc_err = true; -+ buf = _buf; -+ goto print; -+ } -+ -+ INIT_LIST_HEAD(&s->list); -+ s->fmt = fmt; -+found: -+ list_move(&s->list, &c->fsck_errors); -+ s->nr++; -+ if (c->opts.ratelimit_errors && -+ s->nr >= FSCK_ERR_RATELIMIT_NR) { -+ if (s->nr == FSCK_ERR_RATELIMIT_NR) -+ suppressing = true; -+ else -+ print = false; -+ } -+ buf = s->buf; -+print: -+ va_start(args, fmt); -+ vscnprintf(buf, sizeof(_buf), fmt, args); -+ va_end(args); -+ -+ if (c->opts.fix_errors == FSCK_OPT_EXIT) { -+ bch_err(c, "%s, exiting", buf); -+ } else if (flags & FSCK_CAN_FIX) { -+ if (c->opts.fix_errors == FSCK_OPT_ASK) { -+ printk(KERN_ERR "%s: fix?", buf); -+ fix = ask_yn(); -+ } else if (c->opts.fix_errors == FSCK_OPT_YES || -+ (c->opts.nochanges && -+ !(flags & FSCK_CAN_IGNORE))) { -+ if (print) -+ bch_err(c, "%s, fixing", buf); -+ fix = true; -+ } else { -+ if (print) -+ bch_err(c, "%s, not fixing", buf); -+ fix = false; -+ } -+ } else if (flags & FSCK_NEED_FSCK) { -+ if (print) -+ bch_err(c, "%s (run fsck to correct)", buf); -+ } else { -+ if (print) -+ bch_err(c, "%s (repair unimplemented)", buf); -+ } -+ -+ if (suppressing) -+ bch_err(c, "Ratelimiting new instances of previous error"); -+ -+ mutex_unlock(&c->fsck_error_lock); -+ -+ if (fix) { -+ set_bit(BCH_FS_ERRORS_FIXED, &c->flags); -+ return FSCK_ERR_FIX; -+ } else { -+ set_bit(BCH_FS_ERROR, &c->flags); -+ return c->opts.fix_errors == FSCK_OPT_EXIT || -+ !(flags & FSCK_CAN_IGNORE) -+ ? 
FSCK_ERR_EXIT -+ : FSCK_ERR_IGNORE; -+ } -+} -+ -+void bch2_flush_fsck_errs(struct bch_fs *c) -+{ -+ struct fsck_err_state *s, *n; -+ -+ mutex_lock(&c->fsck_error_lock); -+ -+ list_for_each_entry_safe(s, n, &c->fsck_errors, list) { -+ if (s->ratelimited) -+ bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->buf); -+ -+ list_del(&s->list); -+ kfree(s); -+ } -+ -+ mutex_unlock(&c->fsck_error_lock); -+} -diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h -new file mode 100644 -index 000000000000..94b53312fbbd ---- /dev/null -+++ b/fs/bcachefs/error.h -@@ -0,0 +1,211 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_ERROR_H -+#define _BCACHEFS_ERROR_H -+ -+#include -+#include -+ -+struct bch_dev; -+struct bch_fs; -+struct work_struct; -+ -+/* -+ * XXX: separate out errors that indicate on disk data is inconsistent, and flag -+ * superblock as such -+ */ -+ -+/* Error messages: */ -+ -+/* -+ * Inconsistency errors: The on disk data is inconsistent. If these occur during -+ * initial recovery, they don't indicate a bug in the running code - we walk all -+ * the metadata before modifying anything. If they occur at runtime, they -+ * indicate either a bug in the running code or (less likely) data is being -+ * silently corrupted under us. -+ * -+ * XXX: audit all inconsistent errors and make sure they're all recoverable, in -+ * BCH_ON_ERROR_CONTINUE mode -+ */ -+ -+bool bch2_inconsistent_error(struct bch_fs *); -+ -+#define bch2_fs_inconsistent(c, ...) \ -+({ \ -+ bch_err(c, __VA_ARGS__); \ -+ bch2_inconsistent_error(c); \ -+}) -+ -+#define bch2_fs_inconsistent_on(cond, c, ...) \ -+({ \ -+ int _ret = !!(cond); \ -+ \ -+ if (_ret) \ -+ bch2_fs_inconsistent(c, __VA_ARGS__); \ -+ _ret; \ -+}) -+ -+/* -+ * Later we might want to mark only the particular device inconsistent, not the -+ * entire filesystem: -+ */ -+ -+#define bch2_dev_inconsistent(ca, ...) 
\ -+do { \ -+ bch_err(ca, __VA_ARGS__); \ -+ bch2_inconsistent_error((ca)->fs); \ -+} while (0) -+ -+#define bch2_dev_inconsistent_on(cond, ca, ...) \ -+({ \ -+ int _ret = !!(cond); \ -+ \ -+ if (_ret) \ -+ bch2_dev_inconsistent(ca, __VA_ARGS__); \ -+ _ret; \ -+}) -+ -+/* -+ * Fsck errors: inconsistency errors we detect at mount time, and should ideally -+ * be able to repair: -+ */ -+ -+enum { -+ BCH_FSCK_OK = 0, -+ BCH_FSCK_ERRORS_NOT_FIXED = 1, -+ BCH_FSCK_REPAIR_UNIMPLEMENTED = 2, -+ BCH_FSCK_REPAIR_IMPOSSIBLE = 3, -+ BCH_FSCK_UNKNOWN_VERSION = 4, -+}; -+ -+enum fsck_err_opts { -+ FSCK_OPT_EXIT, -+ FSCK_OPT_YES, -+ FSCK_OPT_NO, -+ FSCK_OPT_ASK, -+}; -+ -+enum fsck_err_ret { -+ FSCK_ERR_IGNORE = 0, -+ FSCK_ERR_FIX = 1, -+ FSCK_ERR_EXIT = 2, -+}; -+ -+struct fsck_err_state { -+ struct list_head list; -+ const char *fmt; -+ u64 nr; -+ bool ratelimited; -+ char buf[512]; -+}; -+ -+#define FSCK_CAN_FIX (1 << 0) -+#define FSCK_CAN_IGNORE (1 << 1) -+#define FSCK_NEED_FSCK (1 << 2) -+ -+__printf(3, 4) __cold -+enum fsck_err_ret bch2_fsck_err(struct bch_fs *, -+ unsigned, const char *, ...); -+void bch2_flush_fsck_errs(struct bch_fs *); -+ -+#define __fsck_err(c, _flags, msg, ...) \ -+({ \ -+ int _fix = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__);\ -+ \ -+ if (_fix == FSCK_ERR_EXIT) { \ -+ bch_err(c, "Unable to continue, halting"); \ -+ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ -+ goto fsck_err; \ -+ } \ -+ \ -+ _fix; \ -+}) -+ -+/* These macros return true if error should be fixed: */ -+ -+/* XXX: mark in superblock that filesystem contains errors, if we ignore: */ -+ -+#define __fsck_err_on(cond, c, _flags, ...) \ -+ ((cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false) -+ -+#define need_fsck_err_on(cond, c, ...) \ -+ __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) -+ -+#define need_fsck_err(c, ...) \ -+ __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) -+ -+#define mustfix_fsck_err(c, ...) 
\ -+ __fsck_err(c, FSCK_CAN_FIX, ##__VA_ARGS__) -+ -+#define mustfix_fsck_err_on(cond, c, ...) \ -+ __fsck_err_on(cond, c, FSCK_CAN_FIX, ##__VA_ARGS__) -+ -+#define fsck_err(c, ...) \ -+ __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__) -+ -+#define fsck_err_on(cond, c, ...) \ -+ __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__) -+ -+/* -+ * Fatal errors: these don't indicate a bug, but we can't continue running in RW -+ * mode - pretty much just due to metadata IO errors: -+ */ -+ -+void bch2_fatal_error(struct bch_fs *); -+ -+#define bch2_fs_fatal_error(c, ...) \ -+do { \ -+ bch_err(c, __VA_ARGS__); \ -+ bch2_fatal_error(c); \ -+} while (0) -+ -+#define bch2_fs_fatal_err_on(cond, c, ...) \ -+({ \ -+ int _ret = !!(cond); \ -+ \ -+ if (_ret) \ -+ bch2_fs_fatal_error(c, __VA_ARGS__); \ -+ _ret; \ -+}) -+ -+/* -+ * IO errors: either recoverable metadata IO (because we have replicas), or data -+ * IO - we need to log it and print out a message, but we don't (necessarily) -+ * want to shut down the fs: -+ */ -+ -+void bch2_io_error_work(struct work_struct *); -+ -+/* Does the error handling without logging a message */ -+void bch2_io_error(struct bch_dev *); -+ -+/* Logs message and handles the error: */ -+#define bch2_dev_io_error(ca, fmt, ...) \ -+do { \ -+ printk_ratelimited(KERN_ERR bch2_fmt((ca)->fs, \ -+ "IO error on %s for " fmt), \ -+ (ca)->name, ##__VA_ARGS__); \ -+ bch2_io_error(ca); \ -+} while (0) -+ -+#define bch2_dev_io_err_on(cond, ca, ...) \ -+({ \ -+ bool _ret = (cond); \ -+ \ -+ if (_ret) \ -+ bch2_dev_io_error(ca, __VA_ARGS__); \ -+ _ret; \ -+}) -+ -+/* kill? */ -+ -+#define __bcache_io_error(c, fmt, ...) \ -+ printk_ratelimited(KERN_ERR bch2_fmt(c, \ -+ "IO error: " fmt), ##__VA_ARGS__) -+ -+#define bcache_io_error(c, bio, fmt, ...) 
\ -+do { \ -+ __bcache_io_error(c, fmt, ##__VA_ARGS__); \ -+ (bio)->bi_status = BLK_STS_IOERR; \ -+} while (0) -+ -+#endif /* _BCACHEFS_ERROR_H */ -diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c -new file mode 100644 -index 000000000000..fd011df3cb99 ---- /dev/null -+++ b/fs/bcachefs/extent_update.c -@@ -0,0 +1,229 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "bkey_on_stack.h" -+#include "btree_update.h" -+#include "btree_update_interior.h" -+#include "buckets.h" -+#include "debug.h" -+#include "extents.h" -+#include "extent_update.h" -+ -+/* -+ * This counts the number of iterators to the alloc & ec btrees we'll need -+ * inserting/removing this extent: -+ */ -+static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ unsigned ret = 0; -+ -+ bkey_extent_entry_for_each(ptrs, entry) { -+ switch (__extent_entry_type(entry)) { -+ case BCH_EXTENT_ENTRY_ptr: -+ case BCH_EXTENT_ENTRY_stripe_ptr: -+ ret++; -+ } -+ } -+ -+ return ret; -+} -+ -+static int count_iters_for_insert(struct btree_trans *trans, -+ struct bkey_s_c k, -+ unsigned offset, -+ struct bpos *end, -+ unsigned *nr_iters, -+ unsigned max_iters) -+{ -+ int ret = 0, ret2 = 0; -+ -+ if (*nr_iters >= max_iters) { -+ *end = bpos_min(*end, k.k->p); -+ ret = 1; -+ } -+ -+ switch (k.k->type) { -+ case KEY_TYPE_extent: -+ case KEY_TYPE_reflink_v: -+ *nr_iters += bch2_bkey_nr_alloc_ptrs(k); -+ -+ if (*nr_iters >= max_iters) { -+ *end = bpos_min(*end, k.k->p); -+ ret = 1; -+ } -+ -+ break; -+ case KEY_TYPE_reflink_p: { -+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); -+ u64 idx = le64_to_cpu(p.v->idx); -+ unsigned sectors = bpos_min(*end, p.k->p).offset - -+ bkey_start_offset(p.k); -+ struct btree_iter *iter; -+ struct bkey_s_c r_k; -+ -+ for_each_btree_key(trans, iter, -+ BTREE_ID_REFLINK, POS(0, idx + offset), -+ BTREE_ITER_SLOTS, r_k, ret2) { -+ if 
(bkey_cmp(bkey_start_pos(r_k.k), -+ POS(0, idx + sectors)) >= 0) -+ break; -+ -+ /* extent_update_to_keys(), for the reflink_v update */ -+ *nr_iters += 1; -+ -+ *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k); -+ -+ if (*nr_iters >= max_iters) { -+ struct bpos pos = bkey_start_pos(k.k); -+ pos.offset += min_t(u64, k.k->size, -+ r_k.k->p.offset - idx); -+ -+ *end = bpos_min(*end, pos); -+ ret = 1; -+ break; -+ } -+ } -+ -+ bch2_trans_iter_put(trans, iter); -+ break; -+ } -+ } -+ -+ return ret2 ?: ret; -+} -+ -+#define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3) -+ -+int bch2_extent_atomic_end(struct btree_iter *iter, -+ struct bkey_i *insert, -+ struct bpos *end) -+{ -+ struct btree_trans *trans = iter->trans; -+ struct btree *b; -+ struct btree_node_iter node_iter; -+ struct bkey_packed *_k; -+ unsigned nr_iters = 0; -+ int ret; -+ -+ ret = bch2_btree_iter_traverse(iter); -+ if (ret) -+ return ret; -+ -+ b = iter->l[0].b; -+ node_iter = iter->l[0].iter; -+ -+ BUG_ON(bkey_cmp(b->data->min_key, POS_MIN) && -+ bkey_cmp(bkey_start_pos(&insert->k), -+ bkey_predecessor(b->data->min_key)) < 0); -+ -+ *end = bpos_min(insert->k.p, b->key.k.p); -+ -+ /* extent_update_to_keys(): */ -+ nr_iters += 1; -+ -+ ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end, -+ &nr_iters, EXTENT_ITERS_MAX / 2); -+ if (ret < 0) -+ return ret; -+ -+ while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { -+ struct bkey unpacked; -+ struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); -+ unsigned offset = 0; -+ -+ if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0) -+ break; -+ -+ if (bkey_cmp(bkey_start_pos(&insert->k), -+ bkey_start_pos(k.k)) > 0) -+ offset = bkey_start_offset(&insert->k) - -+ bkey_start_offset(k.k); -+ -+ /* extent_handle_overwrites(): */ -+ switch (bch2_extent_overlap(&insert->k, k.k)) { -+ case BCH_EXTENT_OVERLAP_ALL: -+ case BCH_EXTENT_OVERLAP_FRONT: -+ nr_iters += 1; -+ break; -+ case BCH_EXTENT_OVERLAP_BACK: -+ case BCH_EXTENT_OVERLAP_MIDDLE: -+ nr_iters += 2; -+ 
break; -+ } -+ -+ ret = count_iters_for_insert(trans, k, offset, end, -+ &nr_iters, EXTENT_ITERS_MAX); -+ if (ret) -+ break; -+ -+ bch2_btree_node_iter_advance(&node_iter, b); -+ } -+ -+ return ret < 0 ? ret : 0; -+} -+ -+int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) -+{ -+ struct bpos end; -+ int ret; -+ -+ ret = bch2_extent_atomic_end(iter, k, &end); -+ if (ret) -+ return ret; -+ -+ bch2_cut_back(end, k); -+ return 0; -+} -+ -+int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) -+{ -+ struct bpos end; -+ int ret; -+ -+ ret = bch2_extent_atomic_end(iter, k, &end); -+ if (ret) -+ return ret; -+ -+ return !bkey_cmp(end, k->k.p); -+} -+ -+enum btree_insert_ret -+bch2_extent_can_insert(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_i *insert) -+{ -+ struct btree_iter_level *l = &iter->l[0]; -+ struct btree_node_iter node_iter = l->iter; -+ struct bkey_packed *_k; -+ struct bkey_s_c k; -+ struct bkey unpacked; -+ int sectors; -+ -+ _k = bch2_btree_node_iter_peek(&node_iter, l->b); -+ if (!_k) -+ return BTREE_INSERT_OK; -+ -+ k = bkey_disassemble(l->b, _k, &unpacked); -+ -+ /* Check if we're splitting a compressed extent: */ -+ -+ if (bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k)) > 0 && -+ bkey_cmp(insert->k.p, k.k->p) < 0 && -+ (sectors = bch2_bkey_sectors_compressed(k))) { -+ int flags = trans->flags & BTREE_INSERT_NOFAIL -+ ? 
BCH_DISK_RESERVATION_NOFAIL : 0; -+ -+ switch (bch2_disk_reservation_add(trans->c, trans->disk_res, -+ sectors, flags)) { -+ case 0: -+ break; -+ case -ENOSPC: -+ return BTREE_INSERT_ENOSPC; -+ default: -+ BUG(); -+ } -+ } -+ -+ return BTREE_INSERT_OK; -+} -diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h -new file mode 100644 -index 000000000000..38dc084627d2 ---- /dev/null -+++ b/fs/bcachefs/extent_update.h -@@ -0,0 +1,16 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_EXTENT_UPDATE_H -+#define _BCACHEFS_EXTENT_UPDATE_H -+ -+#include "bcachefs.h" -+ -+int bch2_extent_atomic_end(struct btree_iter *, struct bkey_i *, -+ struct bpos *); -+int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); -+int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *); -+ -+enum btree_insert_ret -+bch2_extent_can_insert(struct btree_trans *, struct btree_iter *, -+ struct bkey_i *); -+ -+#endif /* _BCACHEFS_EXTENT_UPDATE_H */ -diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c -new file mode 100644 -index 000000000000..251d4af773a5 ---- /dev/null -+++ b/fs/bcachefs/extents.c -@@ -0,0 +1,1268 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Copyright (C) 2010 Kent Overstreet -+ * -+ * Code for managing the extent btree and dynamically updating the writeback -+ * dirty sector count. 
-+ */ -+ -+#include "bcachefs.h" -+#include "bkey_methods.h" -+#include "btree_gc.h" -+#include "btree_io.h" -+#include "btree_iter.h" -+#include "buckets.h" -+#include "checksum.h" -+#include "debug.h" -+#include "disk_groups.h" -+#include "error.h" -+#include "extents.h" -+#include "inode.h" -+#include "journal.h" -+#include "replicas.h" -+#include "super.h" -+#include "super-io.h" -+#include "util.h" -+ -+#include -+ -+static unsigned bch2_crc_field_size_max[] = { -+ [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, -+ [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, -+ [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX, -+}; -+ -+static void bch2_extent_crc_pack(union bch_extent_crc *, -+ struct bch_extent_crc_unpacked, -+ enum bch_extent_entry_type); -+ -+static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f, -+ unsigned dev) -+{ -+ struct bch_dev_io_failures *i; -+ -+ for (i = f->devs; i < f->devs + f->nr; i++) -+ if (i->dev == dev) -+ return i; -+ -+ return NULL; -+} -+ -+void bch2_mark_io_failure(struct bch_io_failures *failed, -+ struct extent_ptr_decoded *p) -+{ -+ struct bch_dev_io_failures *f = dev_io_failures(failed, p->ptr.dev); -+ -+ if (!f) { -+ BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); -+ -+ f = &failed->devs[failed->nr++]; -+ f->dev = p->ptr.dev; -+ f->idx = p->idx; -+ f->nr_failed = 1; -+ f->nr_retries = 0; -+ } else if (p->idx != f->idx) { -+ f->idx = p->idx; -+ f->nr_failed = 1; -+ f->nr_retries = 0; -+ } else { -+ f->nr_failed++; -+ } -+} -+ -+/* -+ * returns true if p1 is better than p2: -+ */ -+static inline bool ptr_better(struct bch_fs *c, -+ const struct extent_ptr_decoded p1, -+ const struct extent_ptr_decoded p2) -+{ -+ if (likely(!p1.idx && !p2.idx)) { -+ struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev); -+ struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev); -+ -+ u64 l1 = atomic64_read(&dev1->cur_latency[READ]); -+ u64 l2 = atomic64_read(&dev2->cur_latency[READ]); -+ -+ /* Pick at random, biased in favor of 
the faster device: */ -+ -+ return bch2_rand_range(l1 + l2) > l1; -+ } -+ -+ if (force_reconstruct_read(c)) -+ return p1.idx > p2.idx; -+ -+ return p1.idx < p2.idx; -+} -+ -+/* -+ * This picks a non-stale pointer, preferably from a device other than @avoid. -+ * Avoid can be NULL, meaning pick any. If there are no non-stale pointers to -+ * other devices, it will still pick a pointer from avoid. -+ */ -+int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, -+ struct bch_io_failures *failed, -+ struct extent_ptr_decoded *pick) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ struct bch_dev_io_failures *f; -+ struct bch_dev *ca; -+ int ret = 0; -+ -+ if (k.k->type == KEY_TYPE_error) -+ return -EIO; -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ ca = bch_dev_bkey_exists(c, p.ptr.dev); -+ -+ /* -+ * If there are any dirty pointers it's an error if we can't -+ * read: -+ */ -+ if (!ret && !p.ptr.cached) -+ ret = -EIO; -+ -+ if (p.ptr.cached && ptr_stale(ca, &p.ptr)) -+ continue; -+ -+ f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL; -+ if (f) -+ p.idx = f->nr_failed < f->nr_retries -+ ? 
f->idx -+ : f->idx + 1; -+ -+ if (!p.idx && -+ !bch2_dev_is_readable(ca)) -+ p.idx++; -+ -+ if (force_reconstruct_read(c) && -+ !p.idx && p.has_ec) -+ p.idx++; -+ -+ if (p.idx >= (unsigned) p.has_ec + 1) -+ continue; -+ -+ if (ret > 0 && !ptr_better(c, p, *pick)) -+ continue; -+ -+ *pick = p; -+ ret = 1; -+ } -+ -+ return ret; -+} -+ -+/* KEY_TYPE_btree_ptr: */ -+ -+const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) -+{ -+ if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) -+ return "value too big"; -+ -+ return bch2_bkey_ptrs_invalid(c, k); -+} -+ -+void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const struct bch_extent_ptr *ptr; -+ const char *err; -+ char buf[160]; -+ struct bucket_mark mark; -+ struct bch_dev *ca; -+ -+ if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) -+ return; -+ -+ if (!percpu_down_read_trylock(&c->mark_lock)) -+ return; -+ -+ bch2_fs_inconsistent_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && -+ !bch2_bkey_replicas_marked_locked(c, k, false), c, -+ "btree key bad (replicas not marked in superblock):\n%s", -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); -+ -+ bkey_for_each_ptr(ptrs, ptr) { -+ ca = bch_dev_bkey_exists(c, ptr->dev); -+ -+ mark = ptr_bucket_mark(ca, ptr); -+ -+ err = "stale"; -+ if (gen_after(mark.gen, ptr->gen)) -+ goto err; -+ -+ err = "inconsistent"; -+ if (mark.data_type != BCH_DATA_BTREE || -+ mark.dirty_sectors < c->opts.btree_node_size) -+ goto err; -+ } -+out: -+ percpu_up_read(&c->mark_lock); -+ return; -+err: -+ bch2_fs_inconsistent(c, "%s btree pointer %s: bucket %zi gen %i mark %08x", -+ err, (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), -+ PTR_BUCKET_NR(ca, ptr), -+ mark.gen, (unsigned) mark.v.counter); -+ goto out; -+} -+ -+void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ bch2_bkey_ptrs_to_text(out, c, k); -+} -+ -+void 
bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); -+ -+ pr_buf(out, "seq %llx sectors %u written %u min_key ", -+ le64_to_cpu(bp.v->seq), -+ le16_to_cpu(bp.v->sectors), -+ le16_to_cpu(bp.v->sectors_written)); -+ -+ bch2_bpos_to_text(out, bp.v->min_key); -+ pr_buf(out, " "); -+ bch2_bkey_ptrs_to_text(out, c, k); -+} -+ -+void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version, -+ unsigned big_endian, int write, -+ struct bkey_s k) -+{ -+ struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(k); -+ -+ compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key); -+ -+ if (version < bcachefs_metadata_version_inode_btree_change && -+ btree_node_type_is_extents(btree_id) && -+ bkey_cmp(bp.v->min_key, POS_MIN)) -+ bp.v->min_key = write -+ ? bkey_predecessor(bp.v->min_key) -+ : bkey_successor(bp.v->min_key); -+} -+ -+/* KEY_TYPE_extent: */ -+ -+const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) -+{ -+ return bch2_bkey_ptrs_invalid(c, k); -+} -+ -+void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ char buf[160]; -+ -+ if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) || -+ !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) -+ return; -+ -+ if (!percpu_down_read_trylock(&c->mark_lock)) -+ return; -+ -+ bch2_fs_inconsistent_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && -+ !bch2_bkey_replicas_marked_locked(c, e.s_c, false), c, -+ "extent key bad (replicas not marked in superblock):\n%s", -+ (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf)); -+ -+ extent_for_each_ptr_decode(e, p, entry) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); -+ struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr); -+ unsigned stale = gen_after(mark.gen, p.ptr.gen); -+ unsigned 
disk_sectors = ptr_disk_sectors(p); -+ unsigned mark_sectors = p.ptr.cached -+ ? mark.cached_sectors -+ : mark.dirty_sectors; -+ -+ bch2_fs_inconsistent_on(stale && !p.ptr.cached, c, -+ "stale dirty pointer (ptr gen %u bucket %u", -+ p.ptr.gen, mark.gen); -+ -+ bch2_fs_inconsistent_on(stale > 96, c, -+ "key too stale: %i", stale); -+ -+ bch2_fs_inconsistent_on(!stale && -+ (mark.data_type != BCH_DATA_USER || -+ mark_sectors < disk_sectors), c, -+ "extent pointer not marked: %s:\n" -+ "type %u sectors %u < %u", -+ (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf), -+ mark.data_type, -+ mark_sectors, disk_sectors); -+ } -+ -+ percpu_up_read(&c->mark_lock); -+} -+ -+void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ bch2_bkey_ptrs_to_text(out, c, k); -+} -+ -+enum merge_result bch2_extent_merge(struct bch_fs *c, -+ struct bkey_s _l, struct bkey_s _r) -+{ -+ struct bkey_s_extent l = bkey_s_to_extent(_l); -+ struct bkey_s_extent r = bkey_s_to_extent(_r); -+ union bch_extent_entry *en_l = l.v->start; -+ union bch_extent_entry *en_r = r.v->start; -+ struct bch_extent_crc_unpacked crc_l, crc_r; -+ -+ if (bkey_val_u64s(l.k) != bkey_val_u64s(r.k)) -+ return BCH_MERGE_NOMERGE; -+ -+ crc_l = bch2_extent_crc_unpack(l.k, NULL); -+ -+ extent_for_each_entry(l, en_l) { -+ en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); -+ -+ if (extent_entry_type(en_l) != extent_entry_type(en_r)) -+ return BCH_MERGE_NOMERGE; -+ -+ switch (extent_entry_type(en_l)) { -+ case BCH_EXTENT_ENTRY_ptr: { -+ const struct bch_extent_ptr *lp = &en_l->ptr; -+ const struct bch_extent_ptr *rp = &en_r->ptr; -+ struct bch_dev *ca; -+ -+ if (lp->offset + crc_l.compressed_size != rp->offset || -+ lp->dev != rp->dev || -+ lp->gen != rp->gen) -+ return BCH_MERGE_NOMERGE; -+ -+ /* We don't allow extents to straddle buckets: */ -+ ca = bch_dev_bkey_exists(c, lp->dev); -+ -+ if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp)) -+ return BCH_MERGE_NOMERGE; -+ -+ break; -+ } 
-+ case BCH_EXTENT_ENTRY_stripe_ptr: -+ if (en_l->stripe_ptr.block != en_r->stripe_ptr.block || -+ en_l->stripe_ptr.idx != en_r->stripe_ptr.idx) -+ return BCH_MERGE_NOMERGE; -+ break; -+ case BCH_EXTENT_ENTRY_crc32: -+ case BCH_EXTENT_ENTRY_crc64: -+ case BCH_EXTENT_ENTRY_crc128: -+ crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); -+ crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); -+ -+ if (crc_l.csum_type != crc_r.csum_type || -+ crc_l.compression_type != crc_r.compression_type || -+ crc_l.nonce != crc_r.nonce) -+ return BCH_MERGE_NOMERGE; -+ -+ if (crc_l.offset + crc_l.live_size != crc_l.compressed_size || -+ crc_r.offset) -+ return BCH_MERGE_NOMERGE; -+ -+ if (!bch2_checksum_mergeable(crc_l.csum_type)) -+ return BCH_MERGE_NOMERGE; -+ -+ if (crc_is_compressed(crc_l)) -+ return BCH_MERGE_NOMERGE; -+ -+ if (crc_l.csum_type && -+ crc_l.uncompressed_size + -+ crc_r.uncompressed_size > c->sb.encoded_extent_max) -+ return BCH_MERGE_NOMERGE; -+ -+ if (crc_l.uncompressed_size + crc_r.uncompressed_size > -+ bch2_crc_field_size_max[extent_entry_type(en_l)]) -+ return BCH_MERGE_NOMERGE; -+ -+ break; -+ default: -+ return BCH_MERGE_NOMERGE; -+ } -+ } -+ -+ extent_for_each_entry(l, en_l) { -+ struct bch_extent_crc_unpacked crc_l, crc_r; -+ -+ en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); -+ -+ if (!extent_entry_is_crc(en_l)) -+ continue; -+ -+ crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); -+ crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); -+ -+ crc_l.csum = bch2_checksum_merge(crc_l.csum_type, -+ crc_l.csum, -+ crc_r.csum, -+ crc_r.uncompressed_size << 9); -+ -+ crc_l.uncompressed_size += crc_r.uncompressed_size; -+ crc_l.compressed_size += crc_r.compressed_size; -+ -+ bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, -+ extent_entry_type(en_l)); -+ } -+ -+ bch2_key_resize(l.k, l.k->size + r.k->size); -+ -+ return BCH_MERGE_MERGE; -+} -+ -+/* KEY_TYPE_reservation: */ -+ -+const char *bch2_reservation_invalid(const struct bch_fs *c, 
struct bkey_s_c k) -+{ -+ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); -+ -+ if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) -+ return "incorrect value size"; -+ -+ if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) -+ return "invalid nr_replicas"; -+ -+ return NULL; -+} -+ -+void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); -+ -+ pr_buf(out, "generation %u replicas %u", -+ le32_to_cpu(r.v->generation), -+ r.v->nr_replicas); -+} -+ -+enum merge_result bch2_reservation_merge(struct bch_fs *c, -+ struct bkey_s _l, struct bkey_s _r) -+{ -+ struct bkey_s_reservation l = bkey_s_to_reservation(_l); -+ struct bkey_s_reservation r = bkey_s_to_reservation(_r); -+ -+ if (l.v->generation != r.v->generation || -+ l.v->nr_replicas != r.v->nr_replicas) -+ return BCH_MERGE_NOMERGE; -+ -+ if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { -+ bch2_key_resize(l.k, KEY_SIZE_MAX); -+ bch2_cut_front_s(l.k->p, r.s); -+ return BCH_MERGE_PARTIAL; -+ } -+ -+ bch2_key_resize(l.k, l.k->size + r.k->size); -+ -+ return BCH_MERGE_MERGE; -+} -+ -+/* Extent checksum entries: */ -+ -+/* returns true if not equal */ -+static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, -+ struct bch_extent_crc_unpacked r) -+{ -+ return (l.csum_type != r.csum_type || -+ l.compression_type != r.compression_type || -+ l.compressed_size != r.compressed_size || -+ l.uncompressed_size != r.uncompressed_size || -+ l.offset != r.offset || -+ l.live_size != r.live_size || -+ l.nonce != r.nonce || -+ bch2_crc_cmp(l.csum, r.csum)); -+} -+ -+static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u, -+ struct bch_extent_crc_unpacked n) -+{ -+ return !crc_is_compressed(u) && -+ u.csum_type && -+ u.uncompressed_size > u.live_size && -+ bch2_csum_type_is_encryption(u.csum_type) == -+ bch2_csum_type_is_encryption(n.csum_type); -+} -+ -+bool 
bch2_can_narrow_extent_crcs(struct bkey_s_c k, -+ struct bch_extent_crc_unpacked n) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ struct bch_extent_crc_unpacked crc; -+ const union bch_extent_entry *i; -+ -+ if (!n.csum_type) -+ return false; -+ -+ bkey_for_each_crc(k.k, ptrs, crc, i) -+ if (can_narrow_crc(crc, n)) -+ return true; -+ -+ return false; -+} -+ -+/* -+ * We're writing another replica for this extent, so while we've got the data in -+ * memory we'll be computing a new checksum for the currently live data. -+ * -+ * If there are other replicas we aren't moving, and they are checksummed but -+ * not compressed, we can modify them to point to only the data that is -+ * currently live (so that readers won't have to bounce) while we've got the -+ * checksum we need: -+ */ -+bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n) -+{ -+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); -+ struct bch_extent_crc_unpacked u; -+ struct extent_ptr_decoded p; -+ union bch_extent_entry *i; -+ bool ret = false; -+ -+ /* Find a checksum entry that covers only live data: */ -+ if (!n.csum_type) { -+ bkey_for_each_crc(&k->k, ptrs, u, i) -+ if (!crc_is_compressed(u) && -+ u.csum_type && -+ u.live_size == u.uncompressed_size) { -+ n = u; -+ goto found; -+ } -+ return false; -+ } -+found: -+ BUG_ON(crc_is_compressed(n)); -+ BUG_ON(n.offset); -+ BUG_ON(n.live_size != k->k.size); -+ -+restart_narrow_pointers: -+ ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); -+ -+ bkey_for_each_ptr_decode(&k->k, ptrs, p, i) -+ if (can_narrow_crc(p.crc, n)) { -+ bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr); -+ p.ptr.offset += p.crc.offset; -+ p.crc = n; -+ bch2_extent_ptr_decoded_append(k, &p); -+ ret = true; -+ goto restart_narrow_pointers; -+ } -+ -+ return ret; -+} -+ -+static void bch2_extent_crc_pack(union bch_extent_crc *dst, -+ struct bch_extent_crc_unpacked src, -+ enum bch_extent_entry_type type) -+{ -+#define set_common_fields(_dst, _src) \ -+ 
_dst.type = 1 << type; \ -+ _dst.csum_type = _src.csum_type, \ -+ _dst.compression_type = _src.compression_type, \ -+ _dst._compressed_size = _src.compressed_size - 1, \ -+ _dst._uncompressed_size = _src.uncompressed_size - 1, \ -+ _dst.offset = _src.offset -+ -+ switch (type) { -+ case BCH_EXTENT_ENTRY_crc32: -+ set_common_fields(dst->crc32, src); -+ dst->crc32.csum = *((__le32 *) &src.csum.lo); -+ break; -+ case BCH_EXTENT_ENTRY_crc64: -+ set_common_fields(dst->crc64, src); -+ dst->crc64.nonce = src.nonce; -+ dst->crc64.csum_lo = src.csum.lo; -+ dst->crc64.csum_hi = *((__le16 *) &src.csum.hi); -+ break; -+ case BCH_EXTENT_ENTRY_crc128: -+ set_common_fields(dst->crc128, src); -+ dst->crc128.nonce = src.nonce; -+ dst->crc128.csum = src.csum; -+ break; -+ default: -+ BUG(); -+ } -+#undef set_common_fields -+} -+ -+void bch2_extent_crc_append(struct bkey_i *k, -+ struct bch_extent_crc_unpacked new) -+{ -+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); -+ union bch_extent_crc *crc = (void *) ptrs.end; -+ enum bch_extent_entry_type type; -+ -+ if (bch_crc_bytes[new.csum_type] <= 4 && -+ new.uncompressed_size <= CRC32_SIZE_MAX && -+ new.nonce <= CRC32_NONCE_MAX) -+ type = BCH_EXTENT_ENTRY_crc32; -+ else if (bch_crc_bytes[new.csum_type] <= 10 && -+ new.uncompressed_size <= CRC64_SIZE_MAX && -+ new.nonce <= CRC64_NONCE_MAX) -+ type = BCH_EXTENT_ENTRY_crc64; -+ else if (bch_crc_bytes[new.csum_type] <= 16 && -+ new.uncompressed_size <= CRC128_SIZE_MAX && -+ new.nonce <= CRC128_NONCE_MAX) -+ type = BCH_EXTENT_ENTRY_crc128; -+ else -+ BUG(); -+ -+ bch2_extent_crc_pack(crc, new, type); -+ -+ k->k.u64s += extent_entry_u64s(ptrs.end); -+ -+ EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX); -+} -+ -+/* Generic code for keys with pointers: */ -+ -+unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k) -+{ -+ return bch2_bkey_devs(k).nr; -+} -+ -+unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k) -+{ -+ return k.k->type == KEY_TYPE_reservation -+ ? 
bkey_s_c_to_reservation(k).v->nr_replicas -+ : bch2_bkey_dirty_devs(k).nr; -+} -+ -+unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k) -+{ -+ unsigned ret = 0; -+ -+ if (k.k->type == KEY_TYPE_reservation) { -+ ret = bkey_s_c_to_reservation(k).v->nr_replicas; -+ } else { -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) -+ ret += !p.ptr.cached && !crc_is_compressed(p.crc); -+ } -+ -+ return ret; -+} -+ -+unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ unsigned ret = 0; -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) -+ if (!p.ptr.cached && crc_is_compressed(p.crc)) -+ ret += p.crc.compressed_size; -+ -+ return ret; -+} -+ -+bool bch2_bkey_is_incompressible(struct bkey_s_c k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct bch_extent_crc_unpacked crc; -+ -+ bkey_for_each_crc(k.k, ptrs, crc, entry) -+ if (crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) -+ return true; -+ return false; -+} -+ -+bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, -+ unsigned nr_replicas) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bpos end = pos; -+ struct bkey_s_c k; -+ bool ret = true; -+ int err; -+ -+ end.offset += size; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, pos, -+ BTREE_ITER_SLOTS, k, err) { -+ if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) -+ break; -+ -+ if (nr_replicas > bch2_bkey_nr_ptrs_fully_allocated(k)) { -+ ret = false; -+ break; -+ } -+ } -+ bch2_trans_exit(&trans); -+ -+ return ret; -+} -+ -+static unsigned bch2_extent_ptr_durability(struct bch_fs *c, -+ struct extent_ptr_decoded p) -+{ -+ unsigned durability = 0; -+ struct 
bch_dev *ca; -+ -+ if (p.ptr.cached) -+ return 0; -+ -+ ca = bch_dev_bkey_exists(c, p.ptr.dev); -+ -+ if (ca->mi.state != BCH_MEMBER_STATE_FAILED) -+ durability = max_t(unsigned, durability, ca->mi.durability); -+ -+ if (p.has_ec) { -+ struct stripe *s = -+ genradix_ptr(&c->stripes[0], p.ec.idx); -+ -+ if (WARN_ON(!s)) -+ goto out; -+ -+ durability = max_t(unsigned, durability, s->nr_redundant); -+ } -+out: -+ return durability; -+} -+ -+unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ unsigned durability = 0; -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) -+ durability += bch2_extent_ptr_durability(c, p); -+ -+ return durability; -+} -+ -+void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k, -+ unsigned target, -+ unsigned nr_desired_replicas) -+{ -+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); -+ union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas; -+ -+ if (target && extra > 0) -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ int n = bch2_extent_ptr_durability(c, p); -+ -+ if (n && n <= extra && -+ !bch2_dev_in_target(c, p.ptr.dev, target)) { -+ entry->ptr.cached = true; -+ extra -= n; -+ } -+ } -+ -+ if (extra > 0) -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ int n = bch2_extent_ptr_durability(c, p); -+ -+ if (n && n <= extra) { -+ entry->ptr.cached = true; -+ extra -= n; -+ } -+ } -+} -+ -+void bch2_bkey_append_ptr(struct bkey_i *k, -+ struct bch_extent_ptr ptr) -+{ -+ EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev)); -+ -+ switch (k->k.type) { -+ case KEY_TYPE_btree_ptr: -+ case KEY_TYPE_btree_ptr_v2: -+ case KEY_TYPE_extent: -+ EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); -+ -+ ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; -+ -+ memcpy((void *) &k->v + bkey_val_bytes(&k->k), -+ &ptr, -+ 
sizeof(ptr)); -+ k->u64s++; -+ break; -+ default: -+ BUG(); -+ } -+} -+ -+static inline void __extent_entry_insert(struct bkey_i *k, -+ union bch_extent_entry *dst, -+ union bch_extent_entry *new) -+{ -+ union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); -+ -+ memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new), -+ dst, (u64 *) end - (u64 *) dst); -+ k->k.u64s += extent_entry_u64s(new); -+ memcpy(dst, new, extent_entry_bytes(new)); -+} -+ -+void bch2_extent_ptr_decoded_append(struct bkey_i *k, -+ struct extent_ptr_decoded *p) -+{ -+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); -+ struct bch_extent_crc_unpacked crc = -+ bch2_extent_crc_unpack(&k->k, NULL); -+ union bch_extent_entry *pos; -+ -+ if (!bch2_crc_unpacked_cmp(crc, p->crc)) { -+ pos = ptrs.start; -+ goto found; -+ } -+ -+ bkey_for_each_crc(&k->k, ptrs, crc, pos) -+ if (!bch2_crc_unpacked_cmp(crc, p->crc)) { -+ pos = extent_entry_next(pos); -+ goto found; -+ } -+ -+ bch2_extent_crc_append(k, p->crc); -+ pos = bkey_val_end(bkey_i_to_s(k)); -+found: -+ p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; -+ __extent_entry_insert(k, pos, to_entry(&p->ptr)); -+ -+ if (p->has_ec) { -+ p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr; -+ __extent_entry_insert(k, pos, to_entry(&p->ec)); -+ } -+} -+ -+static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, -+ union bch_extent_entry *entry) -+{ -+ union bch_extent_entry *i = ptrs.start; -+ -+ if (i == entry) -+ return NULL; -+ -+ while (extent_entry_next(i) != entry) -+ i = extent_entry_next(i); -+ return i; -+} -+ -+union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, -+ struct bch_extent_ptr *ptr) -+{ -+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); -+ union bch_extent_entry *dst, *src, *prev; -+ bool drop_crc = true; -+ -+ EBUG_ON(ptr < &ptrs.start->ptr || -+ ptr >= &ptrs.end->ptr); -+ EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); -+ -+ src = extent_entry_next(to_entry(ptr)); -+ if (src != ptrs.end && -+ 
!extent_entry_is_crc(src)) -+ drop_crc = false; -+ -+ dst = to_entry(ptr); -+ while ((prev = extent_entry_prev(ptrs, dst))) { -+ if (extent_entry_is_ptr(prev)) -+ break; -+ -+ if (extent_entry_is_crc(prev)) { -+ if (drop_crc) -+ dst = prev; -+ break; -+ } -+ -+ dst = prev; -+ } -+ -+ memmove_u64s_down(dst, src, -+ (u64 *) ptrs.end - (u64 *) src); -+ k.k->u64s -= (u64 *) src - (u64 *) dst; -+ -+ return dst; -+} -+ -+void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) -+{ -+ struct bch_extent_ptr *ptr; -+ -+ bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); -+} -+ -+const struct bch_extent_ptr * -+bch2_bkey_has_device(struct bkey_s_c k, unsigned dev) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const struct bch_extent_ptr *ptr; -+ -+ bkey_for_each_ptr(ptrs, ptr) -+ if (ptr->dev == dev) -+ return ptr; -+ -+ return NULL; -+} -+ -+bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const struct bch_extent_ptr *ptr; -+ -+ bkey_for_each_ptr(ptrs, ptr) -+ if (bch2_dev_in_target(c, ptr->dev, target) && -+ (!ptr->cached || -+ !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) -+ return true; -+ -+ return false; -+} -+ -+bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, -+ struct bch_extent_ptr m, u64 offset) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) -+ if (p.ptr.dev == m.dev && -+ p.ptr.gen == m.gen && -+ (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) == -+ (s64) m.offset - offset) -+ return true; -+ -+ return false; -+} -+ -+/* -+ * bch_extent_normalize - clean up an extent, dropping stale pointers etc. -+ * -+ * Returns true if @k should be dropped entirely -+ * -+ * For existing keys, only called when btree nodes are being rewritten, not when -+ * they're merely being compacted/resorted in memory. 
-+ */ -+bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) -+{ -+ struct bch_extent_ptr *ptr; -+ -+ bch2_bkey_drop_ptrs(k, ptr, -+ ptr->cached && -+ ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)); -+ -+ /* will only happen if all pointers were cached: */ -+ if (!bch2_bkey_nr_ptrs(k.s_c)) -+ k.k->type = KEY_TYPE_discard; -+ -+ return bkey_whiteout(k.k); -+} -+ -+void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct bch_extent_crc_unpacked crc; -+ const struct bch_extent_ptr *ptr; -+ const struct bch_extent_stripe_ptr *ec; -+ struct bch_dev *ca; -+ bool first = true; -+ -+ bkey_extent_entry_for_each(ptrs, entry) { -+ if (!first) -+ pr_buf(out, " "); -+ -+ switch (__extent_entry_type(entry)) { -+ case BCH_EXTENT_ENTRY_ptr: -+ ptr = entry_to_ptr(entry); -+ ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] -+ ? bch_dev_bkey_exists(c, ptr->dev) -+ : NULL; -+ -+ pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev, -+ (u64) ptr->offset, ptr->gen, -+ ptr->cached ? " cached" : "", -+ ca && ptr_stale(ca, ptr) -+ ? 
" stale" : ""); -+ break; -+ case BCH_EXTENT_ENTRY_crc32: -+ case BCH_EXTENT_ENTRY_crc64: -+ case BCH_EXTENT_ENTRY_crc128: -+ crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); -+ -+ pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %u compress %u", -+ crc.compressed_size, -+ crc.uncompressed_size, -+ crc.offset, crc.nonce, -+ crc.csum_type, -+ crc.compression_type); -+ break; -+ case BCH_EXTENT_ENTRY_stripe_ptr: -+ ec = &entry->stripe_ptr; -+ -+ pr_buf(out, "ec: idx %llu block %u", -+ (u64) ec->idx, ec->block); -+ break; -+ default: -+ pr_buf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); -+ return; -+ } -+ -+ first = false; -+ } -+} -+ -+static const char *extent_ptr_invalid(const struct bch_fs *c, -+ struct bkey_s_c k, -+ const struct bch_extent_ptr *ptr, -+ unsigned size_ondisk, -+ bool metadata) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const struct bch_extent_ptr *ptr2; -+ struct bch_dev *ca; -+ -+ if (!bch2_dev_exists2(c, ptr->dev)) -+ return "pointer to invalid device"; -+ -+ ca = bch_dev_bkey_exists(c, ptr->dev); -+ if (!ca) -+ return "pointer to invalid device"; -+ -+ bkey_for_each_ptr(ptrs, ptr2) -+ if (ptr != ptr2 && ptr->dev == ptr2->dev) -+ return "multiple pointers to same device"; -+ -+ if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets)) -+ return "offset past end of device"; -+ -+ if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) -+ return "offset before first bucket"; -+ -+ if (bucket_remainder(ca, ptr->offset) + -+ size_ondisk > ca->mi.bucket_size) -+ return "spans multiple buckets"; -+ -+ return NULL; -+} -+ -+const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct bch_extent_crc_unpacked crc; -+ unsigned size_ondisk = k.k->size; -+ const char *reason; -+ unsigned nonce = UINT_MAX; -+ -+ if (k.k->type == KEY_TYPE_btree_ptr) -+ size_ondisk = 
c->opts.btree_node_size; -+ if (k.k->type == KEY_TYPE_btree_ptr_v2) -+ size_ondisk = le16_to_cpu(bkey_s_c_to_btree_ptr_v2(k).v->sectors); -+ -+ bkey_extent_entry_for_each(ptrs, entry) { -+ if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) -+ return "invalid extent entry type"; -+ -+ if (k.k->type == KEY_TYPE_btree_ptr && -+ !extent_entry_is_ptr(entry)) -+ return "has non ptr field"; -+ -+ switch (extent_entry_type(entry)) { -+ case BCH_EXTENT_ENTRY_ptr: -+ reason = extent_ptr_invalid(c, k, &entry->ptr, -+ size_ondisk, false); -+ if (reason) -+ return reason; -+ break; -+ case BCH_EXTENT_ENTRY_crc32: -+ case BCH_EXTENT_ENTRY_crc64: -+ case BCH_EXTENT_ENTRY_crc128: -+ crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); -+ -+ if (crc.offset + crc.live_size > -+ crc.uncompressed_size) -+ return "checksum offset + key size > uncompressed size"; -+ -+ size_ondisk = crc.compressed_size; -+ -+ if (!bch2_checksum_type_valid(c, crc.csum_type)) -+ return "invalid checksum type"; -+ -+ if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR) -+ return "invalid compression type"; -+ -+ if (bch2_csum_type_is_encryption(crc.csum_type)) { -+ if (nonce == UINT_MAX) -+ nonce = crc.offset + crc.nonce; -+ else if (nonce != crc.offset + crc.nonce) -+ return "incorrect nonce"; -+ } -+ break; -+ case BCH_EXTENT_ENTRY_stripe_ptr: -+ break; -+ } -+ } -+ -+ return NULL; -+} -+ -+void bch2_ptr_swab(struct bkey_s k) -+{ -+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); -+ union bch_extent_entry *entry; -+ u64 *d; -+ -+ for (d = (u64 *) ptrs.start; -+ d != (u64 *) ptrs.end; -+ d++) -+ *d = swab64(*d); -+ -+ for (entry = ptrs.start; -+ entry < ptrs.end; -+ entry = extent_entry_next(entry)) { -+ switch (extent_entry_type(entry)) { -+ case BCH_EXTENT_ENTRY_ptr: -+ break; -+ case BCH_EXTENT_ENTRY_crc32: -+ entry->crc32.csum = swab32(entry->crc32.csum); -+ break; -+ case BCH_EXTENT_ENTRY_crc64: -+ entry->crc64.csum_hi = swab16(entry->crc64.csum_hi); -+ entry->crc64.csum_lo = 
swab64(entry->crc64.csum_lo); -+ break; -+ case BCH_EXTENT_ENTRY_crc128: -+ entry->crc128.csum.hi = (__force __le64) -+ swab64((__force u64) entry->crc128.csum.hi); -+ entry->crc128.csum.lo = (__force __le64) -+ swab64((__force u64) entry->crc128.csum.lo); -+ break; -+ case BCH_EXTENT_ENTRY_stripe_ptr: -+ break; -+ } -+ } -+} -+ -+/* Generic extent code: */ -+ -+int bch2_cut_front_s(struct bpos where, struct bkey_s k) -+{ -+ unsigned new_val_u64s = bkey_val_u64s(k.k); -+ int val_u64s_delta; -+ u64 sub; -+ -+ if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0) -+ return 0; -+ -+ EBUG_ON(bkey_cmp(where, k.k->p) > 0); -+ -+ sub = where.offset - bkey_start_offset(k.k); -+ -+ k.k->size -= sub; -+ -+ if (!k.k->size) { -+ k.k->type = KEY_TYPE_deleted; -+ new_val_u64s = 0; -+ } -+ -+ switch (k.k->type) { -+ case KEY_TYPE_extent: -+ case KEY_TYPE_reflink_v: { -+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); -+ union bch_extent_entry *entry; -+ bool seen_crc = false; -+ -+ bkey_extent_entry_for_each(ptrs, entry) { -+ switch (extent_entry_type(entry)) { -+ case BCH_EXTENT_ENTRY_ptr: -+ if (!seen_crc) -+ entry->ptr.offset += sub; -+ break; -+ case BCH_EXTENT_ENTRY_crc32: -+ entry->crc32.offset += sub; -+ break; -+ case BCH_EXTENT_ENTRY_crc64: -+ entry->crc64.offset += sub; -+ break; -+ case BCH_EXTENT_ENTRY_crc128: -+ entry->crc128.offset += sub; -+ break; -+ case BCH_EXTENT_ENTRY_stripe_ptr: -+ break; -+ } -+ -+ if (extent_entry_is_crc(entry)) -+ seen_crc = true; -+ } -+ -+ break; -+ } -+ case KEY_TYPE_reflink_p: { -+ struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k); -+ -+ le64_add_cpu(&p.v->idx, sub); -+ break; -+ } -+ case KEY_TYPE_inline_data: { -+ struct bkey_s_inline_data d = bkey_s_to_inline_data(k); -+ -+ sub = min_t(u64, sub << 9, bkey_val_bytes(d.k)); -+ -+ memmove(d.v->data, -+ d.v->data + sub, -+ bkey_val_bytes(d.k) - sub); -+ -+ new_val_u64s -= sub >> 3; -+ break; -+ } -+ } -+ -+ val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; -+ BUG_ON(val_u64s_delta < 0); -+ -+ 
set_bkey_val_u64s(k.k, new_val_u64s); -+ memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); -+ return -val_u64s_delta; -+} -+ -+int bch2_cut_back_s(struct bpos where, struct bkey_s k) -+{ -+ unsigned new_val_u64s = bkey_val_u64s(k.k); -+ int val_u64s_delta; -+ u64 len = 0; -+ -+ if (bkey_cmp(where, k.k->p) >= 0) -+ return 0; -+ -+ EBUG_ON(bkey_cmp(where, bkey_start_pos(k.k)) < 0); -+ -+ len = where.offset - bkey_start_offset(k.k); -+ -+ k.k->p = where; -+ k.k->size = len; -+ -+ if (!len) { -+ k.k->type = KEY_TYPE_deleted; -+ new_val_u64s = 0; -+ } -+ -+ switch (k.k->type) { -+ case KEY_TYPE_inline_data: -+ new_val_u64s = min(new_val_u64s, k.k->size << 6); -+ break; -+ } -+ -+ val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; -+ BUG_ON(val_u64s_delta < 0); -+ -+ set_bkey_val_u64s(k.k, new_val_u64s); -+ memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); -+ return -val_u64s_delta; -+} -diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h -new file mode 100644 -index 000000000000..29b15365d19c ---- /dev/null -+++ b/fs/bcachefs/extents.h -@@ -0,0 +1,603 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_EXTENTS_H -+#define _BCACHEFS_EXTENTS_H -+ -+#include "bcachefs.h" -+#include "bkey.h" -+#include "extents_types.h" -+ -+struct bch_fs; -+struct btree_trans; -+ -+/* extent entries: */ -+ -+#define extent_entry_last(_e) \ -+ ((typeof(&(_e).v->start[0])) bkey_val_end(_e)) -+ -+#define entry_to_ptr(_entry) \ -+({ \ -+ EBUG_ON((_entry) && !extent_entry_is_ptr(_entry)); \ -+ \ -+ __builtin_choose_expr( \ -+ type_is_exact(_entry, const union bch_extent_entry *), \ -+ (const struct bch_extent_ptr *) (_entry), \ -+ (struct bch_extent_ptr *) (_entry)); \ -+}) -+ -+/* downcast, preserves const */ -+#define to_entry(_entry) \ -+({ \ -+ BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) && \ -+ !type_is(_entry, struct bch_extent_ptr *) && \ -+ !type_is(_entry, struct bch_extent_stripe_ptr *)); \ -+ \ -+ __builtin_choose_expr( \ -+ 
(type_is_exact(_entry, const union bch_extent_crc *) || \ -+ type_is_exact(_entry, const struct bch_extent_ptr *) ||\ -+ type_is_exact(_entry, const struct bch_extent_stripe_ptr *)),\ -+ (const union bch_extent_entry *) (_entry), \ -+ (union bch_extent_entry *) (_entry)); \ -+}) -+ -+#define extent_entry_next(_entry) \ -+ ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry))) -+ -+static inline unsigned -+__extent_entry_type(const union bch_extent_entry *e) -+{ -+ return e->type ? __ffs(e->type) : BCH_EXTENT_ENTRY_MAX; -+} -+ -+static inline enum bch_extent_entry_type -+extent_entry_type(const union bch_extent_entry *e) -+{ -+ int ret = __ffs(e->type); -+ -+ EBUG_ON(ret < 0 || ret >= BCH_EXTENT_ENTRY_MAX); -+ -+ return ret; -+} -+ -+static inline size_t extent_entry_bytes(const union bch_extent_entry *entry) -+{ -+ switch (extent_entry_type(entry)) { -+#define x(f, n) \ -+ case BCH_EXTENT_ENTRY_##f: \ -+ return sizeof(struct bch_extent_##f); -+ BCH_EXTENT_ENTRY_TYPES() -+#undef x -+ default: -+ BUG(); -+ } -+} -+ -+static inline size_t extent_entry_u64s(const union bch_extent_entry *entry) -+{ -+ return extent_entry_bytes(entry) / sizeof(u64); -+} -+ -+static inline bool extent_entry_is_ptr(const union bch_extent_entry *e) -+{ -+ switch (extent_entry_type(e)) { -+ case BCH_EXTENT_ENTRY_ptr: -+ return true; -+ default: -+ return false; -+ } -+} -+ -+static inline bool extent_entry_is_crc(const union bch_extent_entry *e) -+{ -+ switch (extent_entry_type(e)) { -+ case BCH_EXTENT_ENTRY_crc32: -+ case BCH_EXTENT_ENTRY_crc64: -+ case BCH_EXTENT_ENTRY_crc128: -+ return true; -+ default: -+ return false; -+ } -+} -+ -+union bch_extent_crc { -+ u8 type; -+ struct bch_extent_crc32 crc32; -+ struct bch_extent_crc64 crc64; -+ struct bch_extent_crc128 crc128; -+}; -+ -+#define __entry_to_crc(_entry) \ -+ __builtin_choose_expr( \ -+ type_is_exact(_entry, const union bch_extent_entry *), \ -+ (const union bch_extent_crc *) (_entry), \ -+ (union bch_extent_crc *) 
(_entry)) -+ -+#define entry_to_crc(_entry) \ -+({ \ -+ EBUG_ON((_entry) && !extent_entry_is_crc(_entry)); \ -+ \ -+ __entry_to_crc(_entry); \ -+}) -+ -+static inline struct bch_extent_crc_unpacked -+bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) -+{ -+#define common_fields(_crc) \ -+ .csum_type = _crc.csum_type, \ -+ .compression_type = _crc.compression_type, \ -+ .compressed_size = _crc._compressed_size + 1, \ -+ .uncompressed_size = _crc._uncompressed_size + 1, \ -+ .offset = _crc.offset, \ -+ .live_size = k->size -+ -+ if (!crc) -+ return (struct bch_extent_crc_unpacked) { -+ .compressed_size = k->size, -+ .uncompressed_size = k->size, -+ .live_size = k->size, -+ }; -+ -+ switch (extent_entry_type(to_entry(crc))) { -+ case BCH_EXTENT_ENTRY_crc32: { -+ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { -+ common_fields(crc->crc32), -+ }; -+ -+ *((__le32 *) &ret.csum.lo) = crc->crc32.csum; -+ -+ memcpy(&ret.csum.lo, &crc->crc32.csum, -+ sizeof(crc->crc32.csum)); -+ -+ return ret; -+ } -+ case BCH_EXTENT_ENTRY_crc64: { -+ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { -+ common_fields(crc->crc64), -+ .nonce = crc->crc64.nonce, -+ .csum.lo = (__force __le64) crc->crc64.csum_lo, -+ }; -+ -+ *((__le16 *) &ret.csum.hi) = crc->crc64.csum_hi; -+ -+ return ret; -+ } -+ case BCH_EXTENT_ENTRY_crc128: { -+ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { -+ common_fields(crc->crc128), -+ .nonce = crc->crc128.nonce, -+ .csum = crc->crc128.csum, -+ }; -+ -+ return ret; -+ } -+ default: -+ BUG(); -+ } -+#undef common_fields -+} -+ -+static inline bool crc_is_compressed(struct bch_extent_crc_unpacked crc) -+{ -+ return (crc.compression_type != BCH_COMPRESSION_TYPE_none && -+ crc.compression_type != BCH_COMPRESSION_TYPE_incompressible); -+} -+ -+/* bkey_ptrs: generically over any key type that has ptrs */ -+ -+struct bkey_ptrs_c { -+ const union bch_extent_entry *start; -+ const 
union bch_extent_entry *end; -+}; -+ -+struct bkey_ptrs { -+ union bch_extent_entry *start; -+ union bch_extent_entry *end; -+}; -+ -+static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) -+{ -+ switch (k.k->type) { -+ case KEY_TYPE_btree_ptr: { -+ struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k); -+ return (struct bkey_ptrs_c) { -+ to_entry(&e.v->start[0]), -+ to_entry(extent_entry_last(e)) -+ }; -+ } -+ case KEY_TYPE_extent: { -+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); -+ return (struct bkey_ptrs_c) { -+ e.v->start, -+ extent_entry_last(e) -+ }; -+ } -+ case KEY_TYPE_stripe: { -+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); -+ return (struct bkey_ptrs_c) { -+ to_entry(&s.v->ptrs[0]), -+ to_entry(&s.v->ptrs[s.v->nr_blocks]), -+ }; -+ } -+ case KEY_TYPE_reflink_v: { -+ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); -+ -+ return (struct bkey_ptrs_c) { -+ r.v->start, -+ bkey_val_end(r), -+ }; -+ } -+ case KEY_TYPE_btree_ptr_v2: { -+ struct bkey_s_c_btree_ptr_v2 e = bkey_s_c_to_btree_ptr_v2(k); -+ return (struct bkey_ptrs_c) { -+ to_entry(&e.v->start[0]), -+ to_entry(extent_entry_last(e)) -+ }; -+ } -+ default: -+ return (struct bkey_ptrs_c) { NULL, NULL }; -+ } -+} -+ -+static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) -+{ -+ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c); -+ -+ return (struct bkey_ptrs) { -+ (void *) p.start, -+ (void *) p.end -+ }; -+} -+ -+#define __bkey_extent_entry_for_each_from(_start, _end, _entry) \ -+ for ((_entry) = (_start); \ -+ (_entry) < (_end); \ -+ (_entry) = extent_entry_next(_entry)) -+ -+#define __bkey_ptr_next(_ptr, _end) \ -+({ \ -+ typeof(_end) _entry; \ -+ \ -+ __bkey_extent_entry_for_each_from(to_entry(_ptr), _end, _entry) \ -+ if (extent_entry_is_ptr(_entry)) \ -+ break; \ -+ \ -+ _entry < (_end) ? 
entry_to_ptr(_entry) : NULL; \ -+}) -+ -+#define bkey_extent_entry_for_each_from(_p, _entry, _start) \ -+ __bkey_extent_entry_for_each_from(_start, (_p).end, _entry) -+ -+#define bkey_extent_entry_for_each(_p, _entry) \ -+ bkey_extent_entry_for_each_from(_p, _entry, _p.start) -+ -+#define __bkey_for_each_ptr(_start, _end, _ptr) \ -+ for ((_ptr) = (_start); \ -+ ((_ptr) = __bkey_ptr_next(_ptr, _end)); \ -+ (_ptr)++) -+ -+#define bkey_ptr_next(_p, _ptr) \ -+ __bkey_ptr_next(_ptr, (_p).end) -+ -+#define bkey_for_each_ptr(_p, _ptr) \ -+ __bkey_for_each_ptr(&(_p).start->ptr, (_p).end, _ptr) -+ -+#define __bkey_ptr_next_decode(_k, _end, _ptr, _entry) \ -+({ \ -+ __label__ out; \ -+ \ -+ (_ptr).idx = 0; \ -+ (_ptr).has_ec = false; \ -+ \ -+ __bkey_extent_entry_for_each_from(_entry, _end, _entry) \ -+ switch (extent_entry_type(_entry)) { \ -+ case BCH_EXTENT_ENTRY_ptr: \ -+ (_ptr).ptr = _entry->ptr; \ -+ goto out; \ -+ case BCH_EXTENT_ENTRY_crc32: \ -+ case BCH_EXTENT_ENTRY_crc64: \ -+ case BCH_EXTENT_ENTRY_crc128: \ -+ (_ptr).crc = bch2_extent_crc_unpack(_k, \ -+ entry_to_crc(_entry)); \ -+ break; \ -+ case BCH_EXTENT_ENTRY_stripe_ptr: \ -+ (_ptr).ec = _entry->stripe_ptr; \ -+ (_ptr).has_ec = true; \ -+ break; \ -+ } \ -+out: \ -+ _entry < (_end); \ -+}) -+ -+#define __bkey_for_each_ptr_decode(_k, _start, _end, _ptr, _entry) \ -+ for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL), \ -+ (_entry) = _start; \ -+ __bkey_ptr_next_decode(_k, _end, _ptr, _entry); \ -+ (_entry) = extent_entry_next(_entry)) -+ -+#define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry) \ -+ __bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \ -+ _ptr, _entry) -+ -+#define bkey_crc_next(_k, _start, _end, _crc, _iter) \ -+({ \ -+ __bkey_extent_entry_for_each_from(_iter, _end, _iter) \ -+ if (extent_entry_is_crc(_iter)) { \ -+ (_crc) = bch2_extent_crc_unpack(_k, \ -+ entry_to_crc(_iter)); \ -+ break; \ -+ } \ -+ \ -+ (_iter) < (_end); \ -+}) -+ -+#define __bkey_for_each_crc(_k, _start, _end, _crc, 
_iter) \ -+ for ((_crc) = bch2_extent_crc_unpack(_k, NULL), \ -+ (_iter) = (_start); \ -+ bkey_crc_next(_k, _start, _end, _crc, _iter); \ -+ (_iter) = extent_entry_next(_iter)) -+ -+#define bkey_for_each_crc(_k, _p, _crc, _iter) \ -+ __bkey_for_each_crc(_k, (_p).start, (_p).end, _crc, _iter) -+ -+/* Iterate over pointers in KEY_TYPE_extent: */ -+ -+#define extent_for_each_entry_from(_e, _entry, _start) \ -+ __bkey_extent_entry_for_each_from(_start, \ -+ extent_entry_last(_e),_entry) -+ -+#define extent_for_each_entry(_e, _entry) \ -+ extent_for_each_entry_from(_e, _entry, (_e).v->start) -+ -+#define extent_ptr_next(_e, _ptr) \ -+ __bkey_ptr_next(_ptr, extent_entry_last(_e)) -+ -+#define extent_for_each_ptr(_e, _ptr) \ -+ __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr) -+ -+#define extent_for_each_ptr_decode(_e, _ptr, _entry) \ -+ __bkey_for_each_ptr_decode((_e).k, (_e).v->start, \ -+ extent_entry_last(_e), _ptr, _entry) -+ -+/* utility code common to all keys with pointers: */ -+ -+void bch2_mark_io_failure(struct bch_io_failures *, -+ struct extent_ptr_decoded *); -+int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, -+ struct bch_io_failures *, -+ struct extent_ptr_decoded *); -+ -+/* KEY_TYPE_btree_ptr: */ -+ -+const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c); -+void bch2_btree_ptr_debugcheck(struct bch_fs *, struct bkey_s_c); -+void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, -+ struct bkey_s_c); -+ -+void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, -+ struct bkey_s_c); -+void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, -+ int, struct bkey_s); -+ -+#define bch2_bkey_ops_btree_ptr (struct bkey_ops) { \ -+ .key_invalid = bch2_btree_ptr_invalid, \ -+ .key_debugcheck = bch2_btree_ptr_debugcheck, \ -+ .val_to_text = bch2_btree_ptr_to_text, \ -+ .swab = bch2_ptr_swab, \ -+} -+ -+#define bch2_bkey_ops_btree_ptr_v2 (struct bkey_ops) { \ -+ .key_invalid = 
bch2_btree_ptr_invalid, \ -+ .key_debugcheck = bch2_btree_ptr_debugcheck, \ -+ .val_to_text = bch2_btree_ptr_v2_to_text, \ -+ .swab = bch2_ptr_swab, \ -+ .compat = bch2_btree_ptr_v2_compat, \ -+} -+ -+/* KEY_TYPE_extent: */ -+ -+const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c); -+void bch2_extent_debugcheck(struct bch_fs *, struct bkey_s_c); -+void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+enum merge_result bch2_extent_merge(struct bch_fs *, -+ struct bkey_s, struct bkey_s); -+ -+#define bch2_bkey_ops_extent (struct bkey_ops) { \ -+ .key_invalid = bch2_extent_invalid, \ -+ .key_debugcheck = bch2_extent_debugcheck, \ -+ .val_to_text = bch2_extent_to_text, \ -+ .swab = bch2_ptr_swab, \ -+ .key_normalize = bch2_extent_normalize, \ -+ .key_merge = bch2_extent_merge, \ -+} -+ -+/* KEY_TYPE_reservation: */ -+ -+const char *bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c); -+void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+enum merge_result bch2_reservation_merge(struct bch_fs *, -+ struct bkey_s, struct bkey_s); -+ -+#define bch2_bkey_ops_reservation (struct bkey_ops) { \ -+ .key_invalid = bch2_reservation_invalid, \ -+ .val_to_text = bch2_reservation_to_text, \ -+ .key_merge = bch2_reservation_merge, \ -+} -+ -+/* Extent checksum entries: */ -+ -+bool bch2_can_narrow_extent_crcs(struct bkey_s_c, -+ struct bch_extent_crc_unpacked); -+bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked); -+void bch2_extent_crc_append(struct bkey_i *, -+ struct bch_extent_crc_unpacked); -+ -+/* Generic code for keys with pointers: */ -+ -+static inline bool bkey_extent_is_direct_data(const struct bkey *k) -+{ -+ switch (k->type) { -+ case KEY_TYPE_btree_ptr: -+ case KEY_TYPE_btree_ptr_v2: -+ case KEY_TYPE_extent: -+ case KEY_TYPE_reflink_v: -+ return true; -+ default: -+ return false; -+ } -+} -+ -+static inline bool bkey_extent_is_data(const struct bkey *k) 
-+{ -+ return bkey_extent_is_direct_data(k) || -+ k->type == KEY_TYPE_inline_data || -+ k->type == KEY_TYPE_reflink_p; -+} -+ -+/* -+ * Should extent be counted under inode->i_sectors? -+ */ -+static inline bool bkey_extent_is_allocation(const struct bkey *k) -+{ -+ switch (k->type) { -+ case KEY_TYPE_extent: -+ case KEY_TYPE_reservation: -+ case KEY_TYPE_reflink_p: -+ case KEY_TYPE_reflink_v: -+ case KEY_TYPE_inline_data: -+ return true; -+ default: -+ return false; -+ } -+} -+ -+static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k) -+{ -+ struct bch_devs_list ret = (struct bch_devs_list) { 0 }; -+ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); -+ const struct bch_extent_ptr *ptr; -+ -+ bkey_for_each_ptr(p, ptr) -+ ret.devs[ret.nr++] = ptr->dev; -+ -+ return ret; -+} -+ -+static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k) -+{ -+ struct bch_devs_list ret = (struct bch_devs_list) { 0 }; -+ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); -+ const struct bch_extent_ptr *ptr; -+ -+ bkey_for_each_ptr(p, ptr) -+ if (!ptr->cached) -+ ret.devs[ret.nr++] = ptr->dev; -+ -+ return ret; -+} -+ -+static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) -+{ -+ struct bch_devs_list ret = (struct bch_devs_list) { 0 }; -+ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); -+ const struct bch_extent_ptr *ptr; -+ -+ bkey_for_each_ptr(p, ptr) -+ if (ptr->cached) -+ ret.devs[ret.nr++] = ptr->dev; -+ -+ return ret; -+} -+ -+unsigned bch2_bkey_nr_ptrs(struct bkey_s_c); -+unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); -+unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c); -+bool bch2_bkey_is_incompressible(struct bkey_s_c); -+unsigned bch2_bkey_sectors_compressed(struct bkey_s_c); -+bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned); -+unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); -+ -+void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s, -+ unsigned, unsigned); -+ -+void 
bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr); -+void bch2_extent_ptr_decoded_append(struct bkey_i *, -+ struct extent_ptr_decoded *); -+union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s, -+ struct bch_extent_ptr *); -+ -+#define bch2_bkey_drop_ptrs(_k, _ptr, _cond) \ -+do { \ -+ struct bkey_ptrs _ptrs = bch2_bkey_ptrs(_k); \ -+ \ -+ _ptr = &_ptrs.start->ptr; \ -+ \ -+ while ((_ptr = bkey_ptr_next(_ptrs, _ptr))) { \ -+ if (_cond) { \ -+ _ptr = (void *) bch2_bkey_drop_ptr(_k, _ptr); \ -+ _ptrs = bch2_bkey_ptrs(_k); \ -+ continue; \ -+ } \ -+ \ -+ (_ptr)++; \ -+ } \ -+} while (0) -+ -+void bch2_bkey_drop_device(struct bkey_s, unsigned); -+const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned); -+bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned); -+ -+bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, -+ struct bch_extent_ptr, u64); -+ -+bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); -+void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, -+ struct bkey_s_c); -+const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c); -+ -+void bch2_ptr_swab(struct bkey_s); -+ -+/* Generic extent code: */ -+ -+int bch2_cut_front_s(struct bpos, struct bkey_s); -+int bch2_cut_back_s(struct bpos, struct bkey_s); -+ -+static inline void bch2_cut_front(struct bpos where, struct bkey_i *k) -+{ -+ bch2_cut_front_s(where, bkey_i_to_s(k)); -+} -+ -+static inline void bch2_cut_back(struct bpos where, struct bkey_i *k) -+{ -+ bch2_cut_back_s(where, bkey_i_to_s(k)); -+} -+ -+/** -+ * bch_key_resize - adjust size of @k -+ * -+ * bkey_start_offset(k) will be preserved, modifies where the extent ends -+ */ -+static inline void bch2_key_resize(struct bkey *k, unsigned new_size) -+{ -+ k->p.offset -= k->size; -+ k->p.offset += new_size; -+ k->size = new_size; -+} -+ -+/* -+ * In extent_sort_fix_overlapping(), insert_fixup_extent(), -+ * extent_merge_inline() - we're modifying keys in 
place that are packed. To do -+ * that we have to unpack the key, modify the unpacked key - then this -+ * copies/repacks the unpacked to the original as necessary. -+ */ -+static inline void extent_save(struct btree *b, struct bkey_packed *dst, -+ struct bkey *src) -+{ -+ struct bkey_format *f = &b->format; -+ struct bkey_i *dst_unpacked; -+ -+ if ((dst_unpacked = packed_to_bkey(dst))) -+ dst_unpacked->k = *src; -+ else -+ BUG_ON(!bch2_bkey_pack_key(dst, src, f)); -+} -+ -+#endif /* _BCACHEFS_EXTENTS_H */ -diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h -new file mode 100644 -index 000000000000..43d6c341ecca ---- /dev/null -+++ b/fs/bcachefs/extents_types.h -@@ -0,0 +1,40 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_EXTENTS_TYPES_H -+#define _BCACHEFS_EXTENTS_TYPES_H -+ -+#include "bcachefs_format.h" -+ -+struct bch_extent_crc_unpacked { -+ u32 compressed_size; -+ u32 uncompressed_size; -+ u32 live_size; -+ -+ u8 csum_type; -+ u8 compression_type; -+ -+ u16 offset; -+ -+ u16 nonce; -+ -+ struct bch_csum csum; -+}; -+ -+struct extent_ptr_decoded { -+ unsigned idx; -+ bool has_ec; -+ struct bch_extent_crc_unpacked crc; -+ struct bch_extent_ptr ptr; -+ struct bch_extent_stripe_ptr ec; -+}; -+ -+struct bch_io_failures { -+ u8 nr; -+ struct bch_dev_io_failures { -+ u8 dev; -+ u8 idx; -+ u8 nr_failed; -+ u8 nr_retries; -+ } devs[BCH_REPLICAS_MAX]; -+}; -+ -+#endif /* _BCACHEFS_EXTENTS_TYPES_H */ -diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h -new file mode 100644 -index 000000000000..26d5cad7e6a5 ---- /dev/null -+++ b/fs/bcachefs/eytzinger.h -@@ -0,0 +1,285 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _EYTZINGER_H -+#define _EYTZINGER_H -+ -+#include -+#include -+ -+#include "util.h" -+ -+/* -+ * Traversal for trees in eytzinger layout - a full binary tree layed out in an -+ * array -+ */ -+ -+/* -+ * One based indexing version: -+ * -+ * With one based indexing each level of the tree starts at a power 
of two - -+ * good for cacheline alignment: -+ * -+ * Size parameter is treated as if we were using 0 based indexing, however: -+ * valid nodes, and inorder indices, are in the range [1..size) - that is, there -+ * are actually size - 1 elements -+ */ -+ -+static inline unsigned eytzinger1_child(unsigned i, unsigned child) -+{ -+ EBUG_ON(child > 1); -+ -+ return (i << 1) + child; -+} -+ -+static inline unsigned eytzinger1_left_child(unsigned i) -+{ -+ return eytzinger1_child(i, 0); -+} -+ -+static inline unsigned eytzinger1_right_child(unsigned i) -+{ -+ return eytzinger1_child(i, 1); -+} -+ -+static inline unsigned eytzinger1_first(unsigned size) -+{ -+ return rounddown_pow_of_two(size - 1); -+} -+ -+static inline unsigned eytzinger1_last(unsigned size) -+{ -+ return rounddown_pow_of_two(size) - 1; -+} -+ -+/* -+ * eytzinger1_next() and eytzinger1_prev() have the nice properties that -+ * -+ * eytzinger1_next(0) == eytzinger1_first()) -+ * eytzinger1_prev(0) == eytzinger1_last()) -+ * -+ * eytzinger1_prev(eytzinger1_first()) == 0 -+ * eytzinger1_next(eytzinger1_last()) == 0 -+ */ -+ -+static inline unsigned eytzinger1_next(unsigned i, unsigned size) -+{ -+ EBUG_ON(i >= size); -+ -+ if (eytzinger1_right_child(i) < size) { -+ i = eytzinger1_right_child(i); -+ -+ i <<= __fls(size) - __fls(i); -+ i >>= i >= size; -+ } else { -+ i >>= ffz(i) + 1; -+ } -+ -+ return i; -+} -+ -+static inline unsigned eytzinger1_prev(unsigned i, unsigned size) -+{ -+ EBUG_ON(i >= size); -+ -+ if (eytzinger1_left_child(i) < size) { -+ i = eytzinger1_left_child(i) + 1; -+ -+ i <<= __fls(size) - __fls(i); -+ i -= 1; -+ i >>= i >= size; -+ } else { -+ i >>= __ffs(i) + 1; -+ } -+ -+ return i; -+} -+ -+static inline unsigned eytzinger1_extra(unsigned size) -+{ -+ return (size - rounddown_pow_of_two(size - 1)) << 1; -+} -+ -+static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size, -+ unsigned extra) -+{ -+ unsigned b = __fls(i); -+ unsigned shift = __fls(size - 1) - b; -+ int s; 
-+ -+ EBUG_ON(!i || i >= size); -+ -+ i ^= 1U << b; -+ i <<= 1; -+ i |= 1; -+ i <<= shift; -+ -+ /* -+ * sign bit trick: -+ * -+ * if (i > extra) -+ * i -= (i - extra) >> 1; -+ */ -+ s = extra - i; -+ i += (s >> 1) & (s >> 31); -+ -+ return i; -+} -+ -+static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size, -+ unsigned extra) -+{ -+ unsigned shift; -+ int s; -+ -+ EBUG_ON(!i || i >= size); -+ -+ /* -+ * sign bit trick: -+ * -+ * if (i > extra) -+ * i += i - extra; -+ */ -+ s = extra - i; -+ i -= s & (s >> 31); -+ -+ shift = __ffs(i); -+ -+ i >>= shift + 1; -+ i |= 1U << (__fls(size - 1) - shift); -+ -+ return i; -+} -+ -+static inline unsigned eytzinger1_to_inorder(unsigned i, unsigned size) -+{ -+ return __eytzinger1_to_inorder(i, size, eytzinger1_extra(size)); -+} -+ -+static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size) -+{ -+ return __inorder_to_eytzinger1(i, size, eytzinger1_extra(size)); -+} -+ -+#define eytzinger1_for_each(_i, _size) \ -+ for ((_i) = eytzinger1_first((_size)); \ -+ (_i) != 0; \ -+ (_i) = eytzinger1_next((_i), (_size))) -+ -+/* Zero based indexing version: */ -+ -+static inline unsigned eytzinger0_child(unsigned i, unsigned child) -+{ -+ EBUG_ON(child > 1); -+ -+ return (i << 1) + 1 + child; -+} -+ -+static inline unsigned eytzinger0_left_child(unsigned i) -+{ -+ return eytzinger0_child(i, 0); -+} -+ -+static inline unsigned eytzinger0_right_child(unsigned i) -+{ -+ return eytzinger0_child(i, 1); -+} -+ -+static inline unsigned eytzinger0_first(unsigned size) -+{ -+ return eytzinger1_first(size + 1) - 1; -+} -+ -+static inline unsigned eytzinger0_last(unsigned size) -+{ -+ return eytzinger1_last(size + 1) - 1; -+} -+ -+static inline unsigned eytzinger0_next(unsigned i, unsigned size) -+{ -+ return eytzinger1_next(i + 1, size + 1) - 1; -+} -+ -+static inline unsigned eytzinger0_prev(unsigned i, unsigned size) -+{ -+ return eytzinger1_prev(i + 1, size + 1) - 1; -+} -+ -+static inline unsigned 
eytzinger0_extra(unsigned size) -+{ -+ return eytzinger1_extra(size + 1); -+} -+ -+static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size, -+ unsigned extra) -+{ -+ return __eytzinger1_to_inorder(i + 1, size + 1, extra) - 1; -+} -+ -+static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size, -+ unsigned extra) -+{ -+ return __inorder_to_eytzinger1(i + 1, size + 1, extra) - 1; -+} -+ -+static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size) -+{ -+ return __eytzinger0_to_inorder(i, size, eytzinger0_extra(size)); -+} -+ -+static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) -+{ -+ return __inorder_to_eytzinger0(i, size, eytzinger0_extra(size)); -+} -+ -+#define eytzinger0_for_each(_i, _size) \ -+ for ((_i) = eytzinger0_first((_size)); \ -+ (_i) != -1; \ -+ (_i) = eytzinger0_next((_i), (_size))) -+ -+typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size); -+ -+/* return greatest node <= @search, or -1 if not found */ -+static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, -+ eytzinger_cmp_fn cmp, const void *search) -+{ -+ unsigned i, n = 0; -+ -+ if (!nr) -+ return -1; -+ -+ do { -+ i = n; -+ n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0); -+ } while (n < nr); -+ -+ if (n & 1) { -+ /* @i was greater than @search, return previous node: */ -+ -+ if (i == eytzinger0_first(nr)) -+ return -1; -+ -+ return eytzinger0_prev(i, nr); -+ } else { -+ return i; -+ } -+} -+ -+#define eytzinger0_find(base, nr, size, _cmp, search) \ -+({ \ -+ void *_base = (base); \ -+ void *_search = (search); \ -+ size_t _nr = (nr); \ -+ size_t _size = (size); \ -+ size_t _i = 0; \ -+ int _res; \ -+ \ -+ while (_i < _nr && \ -+ (_res = _cmp(_search, _base + _i * _size, _size))) \ -+ _i = eytzinger0_child(_i, _res > 0); \ -+ _i; \ -+}) -+ -+void eytzinger0_sort(void *, size_t, size_t, -+ int (*cmp_func)(const void *, const void *, size_t), -+ void (*swap_func)(void *, void 
*, size_t)); -+ -+#endif /* _EYTZINGER_H */ -diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h -new file mode 100644 -index 000000000000..cdb272708a4b ---- /dev/null -+++ b/fs/bcachefs/fifo.h -@@ -0,0 +1,127 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_FIFO_H -+#define _BCACHEFS_FIFO_H -+ -+#include "util.h" -+ -+#define FIFO(type) \ -+struct { \ -+ size_t front, back, size, mask; \ -+ type *data; \ -+} -+ -+#define DECLARE_FIFO(type, name) FIFO(type) name -+ -+#define fifo_buf_size(fifo) \ -+ ((fifo)->size \ -+ ? roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0]) \ -+ : 0) -+ -+#define init_fifo(fifo, _size, _gfp) \ -+({ \ -+ (fifo)->front = (fifo)->back = 0; \ -+ (fifo)->size = (_size); \ -+ (fifo)->mask = (fifo)->size \ -+ ? roundup_pow_of_two((fifo)->size) - 1 \ -+ : 0; \ -+ (fifo)->data = kvpmalloc(fifo_buf_size(fifo), (_gfp)); \ -+}) -+ -+#define free_fifo(fifo) \ -+do { \ -+ kvpfree((fifo)->data, fifo_buf_size(fifo)); \ -+ (fifo)->data = NULL; \ -+} while (0) -+ -+#define fifo_swap(l, r) \ -+do { \ -+ swap((l)->front, (r)->front); \ -+ swap((l)->back, (r)->back); \ -+ swap((l)->size, (r)->size); \ -+ swap((l)->mask, (r)->mask); \ -+ swap((l)->data, (r)->data); \ -+} while (0) -+ -+#define fifo_move(dest, src) \ -+do { \ -+ typeof(*((dest)->data)) _t; \ -+ while (!fifo_full(dest) && \ -+ fifo_pop(src, _t)) \ -+ fifo_push(dest, _t); \ -+} while (0) -+ -+#define fifo_used(fifo) (((fifo)->back - (fifo)->front)) -+#define fifo_free(fifo) ((fifo)->size - fifo_used(fifo)) -+ -+#define fifo_empty(fifo) ((fifo)->front == (fifo)->back) -+#define fifo_full(fifo) (fifo_used(fifo) == (fifo)->size) -+ -+#define fifo_peek_front(fifo) ((fifo)->data[(fifo)->front & (fifo)->mask]) -+#define fifo_peek_back(fifo) ((fifo)->data[((fifo)->back - 1) & (fifo)->mask]) -+ -+#define fifo_entry_idx_abs(fifo, p) \ -+ ((((p) >= &fifo_peek_front(fifo) \ -+ ? 
(fifo)->front : (fifo)->back) & ~(fifo)->mask) + \ -+ (((p) - (fifo)->data))) -+ -+#define fifo_entry_idx(fifo, p) (((p) - &fifo_peek_front(fifo)) & (fifo)->mask) -+#define fifo_idx_entry(fifo, i) (fifo)->data[((fifo)->front + (i)) & (fifo)->mask] -+ -+#define fifo_push_back_ref(f) \ -+ (fifo_full((f)) ? NULL : &(f)->data[(f)->back++ & (f)->mask]) -+ -+#define fifo_push_front_ref(f) \ -+ (fifo_full((f)) ? NULL : &(f)->data[--(f)->front & (f)->mask]) -+ -+#define fifo_push_back(fifo, new) \ -+({ \ -+ typeof((fifo)->data) _r = fifo_push_back_ref(fifo); \ -+ if (_r) \ -+ *_r = (new); \ -+ _r != NULL; \ -+}) -+ -+#define fifo_push_front(fifo, new) \ -+({ \ -+ typeof((fifo)->data) _r = fifo_push_front_ref(fifo); \ -+ if (_r) \ -+ *_r = (new); \ -+ _r != NULL; \ -+}) -+ -+#define fifo_pop_front(fifo, i) \ -+({ \ -+ bool _r = !fifo_empty((fifo)); \ -+ if (_r) \ -+ (i) = (fifo)->data[(fifo)->front++ & (fifo)->mask]; \ -+ _r; \ -+}) -+ -+#define fifo_pop_back(fifo, i) \ -+({ \ -+ bool _r = !fifo_empty((fifo)); \ -+ if (_r) \ -+ (i) = (fifo)->data[--(fifo)->back & (fifo)->mask]; \ -+ _r; \ -+}) -+ -+#define fifo_push_ref(fifo) fifo_push_back_ref(fifo) -+#define fifo_push(fifo, i) fifo_push_back(fifo, (i)) -+#define fifo_pop(fifo, i) fifo_pop_front(fifo, (i)) -+#define fifo_peek(fifo) fifo_peek_front(fifo) -+ -+#define fifo_for_each_entry(_entry, _fifo, _iter) \ -+ for (typecheck(typeof((_fifo)->front), _iter), \ -+ (_iter) = (_fifo)->front; \ -+ ((_iter != (_fifo)->back) && \ -+ (_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true)); \ -+ (_iter)++) -+ -+#define fifo_for_each_entry_ptr(_ptr, _fifo, _iter) \ -+ for (typecheck(typeof((_fifo)->front), _iter), \ -+ (_iter) = (_fifo)->front; \ -+ ((_iter != (_fifo)->back) && \ -+ (_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true)); \ -+ (_iter)++) -+ -+#endif /* _BCACHEFS_FIFO_H */ -diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c -new file mode 100644 -index 000000000000..878419d40992 ---- /dev/null -+++ 
b/fs/bcachefs/fs-common.c -@@ -0,0 +1,317 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "acl.h" -+#include "btree_update.h" -+#include "dirent.h" -+#include "fs-common.h" -+#include "inode.h" -+#include "xattr.h" -+ -+#include -+ -+int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, -+ struct bch_inode_unpacked *dir_u, -+ struct bch_inode_unpacked *new_inode, -+ const struct qstr *name, -+ uid_t uid, gid_t gid, umode_t mode, dev_t rdev, -+ struct posix_acl *default_acl, -+ struct posix_acl *acl) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter *dir_iter = NULL; -+ struct bch_hash_info hash = bch2_hash_info_init(c, new_inode); -+ u64 now = bch2_current_time(trans->c); -+ int ret; -+ -+ dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(dir_iter); -+ if (ret) -+ goto err; -+ -+ bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); -+ -+ if (!name) -+ new_inode->bi_flags |= BCH_INODE_UNLINKED; -+ -+ ret = bch2_inode_create(trans, new_inode, -+ BLOCKDEV_INODE_MAX, 0, -+ &c->unused_inode_hint); -+ if (ret) -+ goto err; -+ -+ if (default_acl) { -+ ret = bch2_set_acl_trans(trans, new_inode, &hash, -+ default_acl, ACL_TYPE_DEFAULT); -+ if (ret) -+ goto err; -+ } -+ -+ if (acl) { -+ ret = bch2_set_acl_trans(trans, new_inode, &hash, -+ acl, ACL_TYPE_ACCESS); -+ if (ret) -+ goto err; -+ } -+ -+ if (name) { -+ struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u); -+ dir_u->bi_mtime = dir_u->bi_ctime = now; -+ -+ if (S_ISDIR(new_inode->bi_mode)) -+ dir_u->bi_nlink++; -+ -+ ret = bch2_inode_write(trans, dir_iter, dir_u); -+ if (ret) -+ goto err; -+ -+ ret = bch2_dirent_create(trans, dir_inum, &dir_hash, -+ mode_to_type(new_inode->bi_mode), -+ name, new_inode->bi_inum, -+ BCH_HASH_SET_MUST_CREATE); -+ if (ret) -+ goto err; -+ } -+err: -+ bch2_trans_iter_put(trans, dir_iter); -+ return ret; -+} -+ -+int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, -+ 
u64 inum, struct bch_inode_unpacked *dir_u, -+ struct bch_inode_unpacked *inode_u, const struct qstr *name) -+{ -+ struct btree_iter *dir_iter = NULL, *inode_iter = NULL; -+ struct bch_hash_info dir_hash; -+ u64 now = bch2_current_time(trans->c); -+ int ret; -+ -+ inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(inode_iter); -+ if (ret) -+ goto err; -+ -+ inode_u->bi_ctime = now; -+ bch2_inode_nlink_inc(inode_u); -+ -+ dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, 0); -+ ret = PTR_ERR_OR_ZERO(dir_iter); -+ if (ret) -+ goto err; -+ -+ dir_u->bi_mtime = dir_u->bi_ctime = now; -+ -+ dir_hash = bch2_hash_info_init(trans->c, dir_u); -+ -+ ret = bch2_dirent_create(trans, dir_inum, &dir_hash, -+ mode_to_type(inode_u->bi_mode), -+ name, inum, BCH_HASH_SET_MUST_CREATE) ?: -+ bch2_inode_write(trans, dir_iter, dir_u) ?: -+ bch2_inode_write(trans, inode_iter, inode_u); -+err: -+ bch2_trans_iter_put(trans, dir_iter); -+ bch2_trans_iter_put(trans, inode_iter); -+ return ret; -+} -+ -+int bch2_unlink_trans(struct btree_trans *trans, -+ u64 dir_inum, struct bch_inode_unpacked *dir_u, -+ struct bch_inode_unpacked *inode_u, -+ const struct qstr *name) -+{ -+ struct btree_iter *dir_iter = NULL, *dirent_iter = NULL, -+ *inode_iter = NULL; -+ struct bch_hash_info dir_hash; -+ u64 inum, now = bch2_current_time(trans->c); -+ struct bkey_s_c k; -+ int ret; -+ -+ dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(dir_iter); -+ if (ret) -+ goto err; -+ -+ dir_hash = bch2_hash_info_init(trans->c, dir_u); -+ -+ dirent_iter = __bch2_dirent_lookup_trans(trans, dir_inum, &dir_hash, -+ name, BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(dirent_iter); -+ if (ret) -+ goto err; -+ -+ k = bch2_btree_iter_peek_slot(dirent_iter); -+ inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); -+ -+ inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(inode_iter); -+ if 
(ret) -+ goto err; -+ -+ dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; -+ dir_u->bi_nlink -= S_ISDIR(inode_u->bi_mode); -+ bch2_inode_nlink_dec(inode_u); -+ -+ ret = (S_ISDIR(inode_u->bi_mode) -+ ? bch2_empty_dir_trans(trans, inum) -+ : 0) ?: -+ bch2_dirent_delete_at(trans, &dir_hash, dirent_iter) ?: -+ bch2_inode_write(trans, dir_iter, dir_u) ?: -+ bch2_inode_write(trans, inode_iter, inode_u); -+err: -+ bch2_trans_iter_put(trans, inode_iter); -+ bch2_trans_iter_put(trans, dirent_iter); -+ bch2_trans_iter_put(trans, dir_iter); -+ return ret; -+} -+ -+bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u, -+ struct bch_inode_unpacked *src_u) -+{ -+ u64 src, dst; -+ unsigned id; -+ bool ret = false; -+ -+ for (id = 0; id < Inode_opt_nr; id++) { -+ if (dst_u->bi_fields_set & (1 << id)) -+ continue; -+ -+ src = bch2_inode_opt_get(src_u, id); -+ dst = bch2_inode_opt_get(dst_u, id); -+ -+ if (src == dst) -+ continue; -+ -+ bch2_inode_opt_set(dst_u, id, src); -+ ret = true; -+ } -+ -+ return ret; -+} -+ -+int bch2_rename_trans(struct btree_trans *trans, -+ u64 src_dir, struct bch_inode_unpacked *src_dir_u, -+ u64 dst_dir, struct bch_inode_unpacked *dst_dir_u, -+ struct bch_inode_unpacked *src_inode_u, -+ struct bch_inode_unpacked *dst_inode_u, -+ const struct qstr *src_name, -+ const struct qstr *dst_name, -+ enum bch_rename_mode mode) -+{ -+ struct btree_iter *src_dir_iter = NULL, *dst_dir_iter = NULL; -+ struct btree_iter *src_inode_iter = NULL, *dst_inode_iter = NULL; -+ struct bch_hash_info src_hash, dst_hash; -+ u64 src_inode, dst_inode, now = bch2_current_time(trans->c); -+ int ret; -+ -+ src_dir_iter = bch2_inode_peek(trans, src_dir_u, src_dir, -+ BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(src_dir_iter); -+ if (ret) -+ goto err; -+ -+ src_hash = bch2_hash_info_init(trans->c, src_dir_u); -+ -+ if (dst_dir != src_dir) { -+ dst_dir_iter = bch2_inode_peek(trans, dst_dir_u, dst_dir, -+ BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(dst_dir_iter); -+ 
if (ret) -+ goto err; -+ -+ dst_hash = bch2_hash_info_init(trans->c, dst_dir_u); -+ } else { -+ dst_dir_u = src_dir_u; -+ dst_hash = src_hash; -+ } -+ -+ ret = bch2_dirent_rename(trans, -+ src_dir, &src_hash, -+ dst_dir, &dst_hash, -+ src_name, &src_inode, -+ dst_name, &dst_inode, -+ mode); -+ if (ret) -+ goto err; -+ -+ src_inode_iter = bch2_inode_peek(trans, src_inode_u, src_inode, -+ BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(src_inode_iter); -+ if (ret) -+ goto err; -+ -+ if (dst_inode) { -+ dst_inode_iter = bch2_inode_peek(trans, dst_inode_u, dst_inode, -+ BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(dst_inode_iter); -+ if (ret) -+ goto err; -+ } -+ -+ if (mode == BCH_RENAME_OVERWRITE) { -+ if (S_ISDIR(src_inode_u->bi_mode) != -+ S_ISDIR(dst_inode_u->bi_mode)) { -+ ret = -ENOTDIR; -+ goto err; -+ } -+ -+ if (S_ISDIR(dst_inode_u->bi_mode) && -+ bch2_empty_dir_trans(trans, dst_inode)) { -+ ret = -ENOTEMPTY; -+ goto err; -+ } -+ } -+ -+ if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) && -+ S_ISDIR(src_inode_u->bi_mode)) { -+ ret = -EXDEV; -+ goto err; -+ } -+ -+ if (mode == BCH_RENAME_EXCHANGE && -+ bch2_reinherit_attrs(dst_inode_u, src_dir_u) && -+ S_ISDIR(dst_inode_u->bi_mode)) { -+ ret = -EXDEV; -+ goto err; -+ } -+ -+ if (S_ISDIR(src_inode_u->bi_mode)) { -+ src_dir_u->bi_nlink--; -+ dst_dir_u->bi_nlink++; -+ } -+ -+ if (dst_inode && S_ISDIR(dst_inode_u->bi_mode)) { -+ dst_dir_u->bi_nlink--; -+ src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE; -+ } -+ -+ if (mode == BCH_RENAME_OVERWRITE) -+ bch2_inode_nlink_dec(dst_inode_u); -+ -+ src_dir_u->bi_mtime = now; -+ src_dir_u->bi_ctime = now; -+ -+ if (src_dir != dst_dir) { -+ dst_dir_u->bi_mtime = now; -+ dst_dir_u->bi_ctime = now; -+ } -+ -+ src_inode_u->bi_ctime = now; -+ -+ if (dst_inode) -+ dst_inode_u->bi_ctime = now; -+ -+ ret = bch2_inode_write(trans, src_dir_iter, src_dir_u) ?: -+ (src_dir != dst_dir -+ ? 
bch2_inode_write(trans, dst_dir_iter, dst_dir_u) -+ : 0 ) ?: -+ bch2_inode_write(trans, src_inode_iter, src_inode_u) ?: -+ (dst_inode -+ ? bch2_inode_write(trans, dst_inode_iter, dst_inode_u) -+ : 0 ); -+err: -+ bch2_trans_iter_put(trans, dst_inode_iter); -+ bch2_trans_iter_put(trans, src_inode_iter); -+ bch2_trans_iter_put(trans, dst_dir_iter); -+ bch2_trans_iter_put(trans, src_dir_iter); -+ return ret; -+} -diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h -new file mode 100644 -index 000000000000..2273b7961c9b ---- /dev/null -+++ b/fs/bcachefs/fs-common.h -@@ -0,0 +1,37 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_FS_COMMON_H -+#define _BCACHEFS_FS_COMMON_H -+ -+struct posix_acl; -+ -+int bch2_create_trans(struct btree_trans *, u64, -+ struct bch_inode_unpacked *, -+ struct bch_inode_unpacked *, -+ const struct qstr *, -+ uid_t, gid_t, umode_t, dev_t, -+ struct posix_acl *, -+ struct posix_acl *); -+ -+int bch2_link_trans(struct btree_trans *, u64, -+ u64, struct bch_inode_unpacked *, -+ struct bch_inode_unpacked *, -+ const struct qstr *); -+ -+int bch2_unlink_trans(struct btree_trans *, -+ u64, struct bch_inode_unpacked *, -+ struct bch_inode_unpacked *, -+ const struct qstr *); -+ -+int bch2_rename_trans(struct btree_trans *, -+ u64, struct bch_inode_unpacked *, -+ u64, struct bch_inode_unpacked *, -+ struct bch_inode_unpacked *, -+ struct bch_inode_unpacked *, -+ const struct qstr *, -+ const struct qstr *, -+ enum bch_rename_mode); -+ -+bool bch2_reinherit_attrs(struct bch_inode_unpacked *, -+ struct bch_inode_unpacked *); -+ -+#endif /* _BCACHEFS_FS_COMMON_H */ -diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c -new file mode 100644 -index 000000000000..ec78e7b52375 ---- /dev/null -+++ b/fs/bcachefs/fs-io.c -@@ -0,0 +1,3132 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#ifndef NO_BCACHEFS_FS -+ -+#include "bcachefs.h" -+#include "alloc_foreground.h" -+#include "bkey_on_stack.h" -+#include "btree_update.h" -+#include 
"buckets.h" -+#include "clock.h" -+#include "error.h" -+#include "extents.h" -+#include "extent_update.h" -+#include "fs.h" -+#include "fs-io.h" -+#include "fsck.h" -+#include "inode.h" -+#include "journal.h" -+#include "io.h" -+#include "keylist.h" -+#include "quota.h" -+#include "reflink.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+ -+struct quota_res { -+ u64 sectors; -+}; -+ -+struct bch_writepage_io { -+ struct closure cl; -+ struct bch_inode_info *inode; -+ -+ /* must be last: */ -+ struct bch_write_op op; -+}; -+ -+struct dio_write { -+ struct completion done; -+ struct kiocb *req; -+ struct mm_struct *mm; -+ unsigned loop:1, -+ sync:1, -+ free_iov:1; -+ struct quota_res quota_res; -+ u64 written; -+ -+ struct iov_iter iter; -+ struct iovec inline_vecs[2]; -+ -+ /* must be last: */ -+ struct bch_write_op op; -+}; -+ -+struct dio_read { -+ struct closure cl; -+ struct kiocb *req; -+ long ret; -+ struct bch_read_bio rbio; -+}; -+ -+/* pagecache_block must be held */ -+static int write_invalidate_inode_pages_range(struct address_space *mapping, -+ loff_t start, loff_t end) -+{ -+ int ret; -+ -+ /* -+ * XXX: the way this is currently implemented, we can spin if a process -+ * is continually redirtying a specific page -+ */ -+ do { -+ if (!mapping->nrpages && -+ !mapping->nrexceptional) -+ return 0; -+ -+ ret = filemap_write_and_wait_range(mapping, start, end); -+ if (ret) -+ break; -+ -+ if (!mapping->nrpages) -+ return 0; -+ -+ ret = invalidate_inode_pages2_range(mapping, -+ start >> PAGE_SHIFT, -+ end >> PAGE_SHIFT); -+ } while (ret == -EBUSY); -+ -+ return ret; -+} -+ -+/* quotas */ -+ -+#ifdef CONFIG_BCACHEFS_QUOTA -+ -+static void bch2_quota_reservation_put(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct quota_res *res) -+{ -+ if (!res->sectors) -+ return; -+ -+ mutex_lock(&inode->ei_quota_lock); -+ BUG_ON(res->sectors > inode->ei_quota_reserved); 
-+ -+ bch2_quota_acct(c, inode->ei_qid, Q_SPC, -+ -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC); -+ inode->ei_quota_reserved -= res->sectors; -+ mutex_unlock(&inode->ei_quota_lock); -+ -+ res->sectors = 0; -+} -+ -+static int bch2_quota_reservation_add(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct quota_res *res, -+ unsigned sectors, -+ bool check_enospc) -+{ -+ int ret; -+ -+ mutex_lock(&inode->ei_quota_lock); -+ ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, -+ check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK); -+ if (likely(!ret)) { -+ inode->ei_quota_reserved += sectors; -+ res->sectors += sectors; -+ } -+ mutex_unlock(&inode->ei_quota_lock); -+ -+ return ret; -+} -+ -+#else -+ -+static void bch2_quota_reservation_put(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct quota_res *res) -+{ -+} -+ -+static int bch2_quota_reservation_add(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct quota_res *res, -+ unsigned sectors, -+ bool check_enospc) -+{ -+ return 0; -+} -+ -+#endif -+ -+/* i_size updates: */ -+ -+struct inode_new_size { -+ loff_t new_size; -+ u64 now; -+ unsigned fields; -+}; -+ -+static int inode_set_size(struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, -+ void *p) -+{ -+ struct inode_new_size *s = p; -+ -+ bi->bi_size = s->new_size; -+ if (s->fields & ATTR_ATIME) -+ bi->bi_atime = s->now; -+ if (s->fields & ATTR_MTIME) -+ bi->bi_mtime = s->now; -+ if (s->fields & ATTR_CTIME) -+ bi->bi_ctime = s->now; -+ -+ return 0; -+} -+ -+int __must_check bch2_write_inode_size(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ loff_t new_size, unsigned fields) -+{ -+ struct inode_new_size s = { -+ .new_size = new_size, -+ .now = bch2_current_time(c), -+ .fields = fields, -+ }; -+ -+ return bch2_write_inode(c, inode, inode_set_size, &s, fields); -+} -+ -+static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, -+ struct quota_res *quota_res, s64 sectors) -+{ -+ if 
(!sectors) -+ return; -+ -+ mutex_lock(&inode->ei_quota_lock); -+#ifdef CONFIG_BCACHEFS_QUOTA -+ if (quota_res && sectors > 0) { -+ BUG_ON(sectors > quota_res->sectors); -+ BUG_ON(sectors > inode->ei_quota_reserved); -+ -+ quota_res->sectors -= sectors; -+ inode->ei_quota_reserved -= sectors; -+ } else { -+ bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN); -+ } -+#endif -+ inode->v.i_blocks += sectors; -+ mutex_unlock(&inode->ei_quota_lock); -+} -+ -+/* page state: */ -+ -+/* stored in page->private: */ -+ -+struct bch_page_sector { -+ /* Uncompressed, fully allocated replicas: */ -+ unsigned nr_replicas:3; -+ -+ /* Owns PAGE_SECTORS * replicas_reserved sized reservation: */ -+ unsigned replicas_reserved:3; -+ -+ /* i_sectors: */ -+ enum { -+ SECTOR_UNALLOCATED, -+ SECTOR_RESERVED, -+ SECTOR_DIRTY, -+ SECTOR_ALLOCATED, -+ } state:2; -+}; -+ -+struct bch_page_state { -+ spinlock_t lock; -+ atomic_t write_count; -+ struct bch_page_sector s[PAGE_SECTORS]; -+}; -+ -+static inline struct bch_page_state *__bch2_page_state(struct page *page) -+{ -+ return page_has_private(page) -+ ? 
(struct bch_page_state *) page_private(page) -+ : NULL; -+} -+ -+static inline struct bch_page_state *bch2_page_state(struct page *page) -+{ -+ EBUG_ON(!PageLocked(page)); -+ -+ return __bch2_page_state(page); -+} -+ -+/* for newly allocated pages: */ -+static void __bch2_page_state_release(struct page *page) -+{ -+ struct bch_page_state *s = __bch2_page_state(page); -+ -+ if (!s) -+ return; -+ -+ ClearPagePrivate(page); -+ set_page_private(page, 0); -+ put_page(page); -+ kfree(s); -+} -+ -+static void bch2_page_state_release(struct page *page) -+{ -+ struct bch_page_state *s = bch2_page_state(page); -+ -+ if (!s) -+ return; -+ -+ ClearPagePrivate(page); -+ set_page_private(page, 0); -+ put_page(page); -+ kfree(s); -+} -+ -+/* for newly allocated pages: */ -+static struct bch_page_state *__bch2_page_state_create(struct page *page, -+ gfp_t gfp) -+{ -+ struct bch_page_state *s; -+ -+ s = kzalloc(sizeof(*s), GFP_NOFS|gfp); -+ if (!s) -+ return NULL; -+ -+ spin_lock_init(&s->lock); -+ /* -+ * migrate_page_move_mapping() assumes that pages with private data -+ * have their count elevated by 1. -+ */ -+ get_page(page); -+ set_page_private(page, (unsigned long) s); -+ SetPagePrivate(page); -+ return s; -+} -+ -+static struct bch_page_state *bch2_page_state_create(struct page *page, -+ gfp_t gfp) -+{ -+ return bch2_page_state(page) ?: __bch2_page_state_create(page, gfp); -+} -+ -+static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) -+{ -+ /* XXX: this should not be open coded */ -+ return inode->ei_inode.bi_data_replicas -+ ? 
inode->ei_inode.bi_data_replicas - 1 -+ : c->opts.data_replicas; -+} -+ -+static inline unsigned sectors_to_reserve(struct bch_page_sector *s, -+ unsigned nr_replicas) -+{ -+ return max(0, (int) nr_replicas - -+ s->nr_replicas - -+ s->replicas_reserved); -+} -+ -+static int bch2_get_page_disk_reservation(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct page *page, bool check_enospc) -+{ -+ struct bch_page_state *s = bch2_page_state_create(page, 0); -+ unsigned nr_replicas = inode_nr_replicas(c, inode); -+ struct disk_reservation disk_res = { 0 }; -+ unsigned i, disk_res_sectors = 0; -+ int ret; -+ -+ if (!s) -+ return -ENOMEM; -+ -+ for (i = 0; i < ARRAY_SIZE(s->s); i++) -+ disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas); -+ -+ if (!disk_res_sectors) -+ return 0; -+ -+ ret = bch2_disk_reservation_get(c, &disk_res, -+ disk_res_sectors, 1, -+ !check_enospc -+ ? BCH_DISK_RESERVATION_NOFAIL -+ : 0); -+ if (unlikely(ret)) -+ return ret; -+ -+ for (i = 0; i < ARRAY_SIZE(s->s); i++) -+ s->s[i].replicas_reserved += -+ sectors_to_reserve(&s->s[i], nr_replicas); -+ -+ return 0; -+} -+ -+struct bch2_page_reservation { -+ struct disk_reservation disk; -+ struct quota_res quota; -+}; -+ -+static void bch2_page_reservation_init(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct bch2_page_reservation *res) -+{ -+ memset(res, 0, sizeof(*res)); -+ -+ res->disk.nr_replicas = inode_nr_replicas(c, inode); -+} -+ -+static void bch2_page_reservation_put(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct bch2_page_reservation *res) -+{ -+ bch2_disk_reservation_put(c, &res->disk); -+ bch2_quota_reservation_put(c, inode, &res->quota); -+} -+ -+static int bch2_page_reservation_get(struct bch_fs *c, -+ struct bch_inode_info *inode, struct page *page, -+ struct bch2_page_reservation *res, -+ unsigned offset, unsigned len, bool check_enospc) -+{ -+ struct bch_page_state *s = bch2_page_state_create(page, 0); -+ unsigned i, disk_sectors = 0, 
quota_sectors = 0; -+ int ret; -+ -+ if (!s) -+ return -ENOMEM; -+ -+ for (i = round_down(offset, block_bytes(c)) >> 9; -+ i < round_up(offset + len, block_bytes(c)) >> 9; -+ i++) { -+ disk_sectors += sectors_to_reserve(&s->s[i], -+ res->disk.nr_replicas); -+ quota_sectors += s->s[i].state == SECTOR_UNALLOCATED; -+ } -+ -+ if (disk_sectors) { -+ ret = bch2_disk_reservation_add(c, &res->disk, -+ disk_sectors, -+ !check_enospc -+ ? BCH_DISK_RESERVATION_NOFAIL -+ : 0); -+ if (unlikely(ret)) -+ return ret; -+ } -+ -+ if (quota_sectors) { -+ ret = bch2_quota_reservation_add(c, inode, &res->quota, -+ quota_sectors, -+ check_enospc); -+ if (unlikely(ret)) { -+ struct disk_reservation tmp = { -+ .sectors = disk_sectors -+ }; -+ -+ bch2_disk_reservation_put(c, &tmp); -+ res->disk.sectors -= disk_sectors; -+ return ret; -+ } -+ } -+ -+ return 0; -+} -+ -+static void bch2_clear_page_bits(struct page *page) -+{ -+ struct bch_inode_info *inode = to_bch_ei(page->mapping->host); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch_page_state *s = bch2_page_state(page); -+ struct disk_reservation disk_res = { 0 }; -+ int i, dirty_sectors = 0; -+ -+ if (!s) -+ return; -+ -+ EBUG_ON(!PageLocked(page)); -+ EBUG_ON(PageWriteback(page)); -+ -+ for (i = 0; i < ARRAY_SIZE(s->s); i++) { -+ disk_res.sectors += s->s[i].replicas_reserved; -+ s->s[i].replicas_reserved = 0; -+ -+ if (s->s[i].state == SECTOR_DIRTY) { -+ dirty_sectors++; -+ s->s[i].state = SECTOR_UNALLOCATED; -+ } -+ } -+ -+ bch2_disk_reservation_put(c, &disk_res); -+ -+ if (dirty_sectors) -+ i_sectors_acct(c, inode, NULL, -dirty_sectors); -+ -+ bch2_page_state_release(page); -+} -+ -+static void bch2_set_page_dirty(struct bch_fs *c, -+ struct bch_inode_info *inode, struct page *page, -+ struct bch2_page_reservation *res, -+ unsigned offset, unsigned len) -+{ -+ struct bch_page_state *s = bch2_page_state(page); -+ unsigned i, dirty_sectors = 0; -+ -+ WARN_ON((u64) page_offset(page) + offset + len > -+ round_up((u64) 
i_size_read(&inode->v), block_bytes(c))); -+ -+ spin_lock(&s->lock); -+ -+ for (i = round_down(offset, block_bytes(c)) >> 9; -+ i < round_up(offset + len, block_bytes(c)) >> 9; -+ i++) { -+ unsigned sectors = sectors_to_reserve(&s->s[i], -+ res->disk.nr_replicas); -+ -+ /* -+ * This can happen if we race with the error path in -+ * bch2_writepage_io_done(): -+ */ -+ sectors = min_t(unsigned, sectors, res->disk.sectors); -+ -+ s->s[i].replicas_reserved += sectors; -+ res->disk.sectors -= sectors; -+ -+ if (s->s[i].state == SECTOR_UNALLOCATED) -+ dirty_sectors++; -+ -+ s->s[i].state = max_t(unsigned, s->s[i].state, SECTOR_DIRTY); -+ } -+ -+ spin_unlock(&s->lock); -+ -+ if (dirty_sectors) -+ i_sectors_acct(c, inode, &res->quota, dirty_sectors); -+ -+ if (!PageDirty(page)) -+ __set_page_dirty_nobuffers(page); -+} -+ -+vm_fault_t bch2_page_fault(struct vm_fault *vmf) -+{ -+ struct file *file = vmf->vma->vm_file; -+ struct bch_inode_info *inode = file_bch_inode(file); -+ int ret; -+ -+ bch2_pagecache_add_get(&inode->ei_pagecache_lock); -+ ret = filemap_fault(vmf); -+ bch2_pagecache_add_put(&inode->ei_pagecache_lock); -+ -+ return ret; -+} -+ -+vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) -+{ -+ struct page *page = vmf->page; -+ struct file *file = vmf->vma->vm_file; -+ struct bch_inode_info *inode = file_bch_inode(file); -+ struct address_space *mapping = file->f_mapping; -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch2_page_reservation res; -+ unsigned len; -+ loff_t isize; -+ int ret = VM_FAULT_LOCKED; -+ -+ bch2_page_reservation_init(c, inode, &res); -+ -+ sb_start_pagefault(inode->v.i_sb); -+ file_update_time(file); -+ -+ /* -+ * Not strictly necessary, but helps avoid dio writes livelocking in -+ * write_invalidate_inode_pages_range() - can drop this if/when we get -+ * a write_invalidate_inode_pages_range() that works without dropping -+ * page lock before invalidating page -+ */ -+ bch2_pagecache_add_get(&inode->ei_pagecache_lock); -+ -+ 
lock_page(page); -+ isize = i_size_read(&inode->v); -+ -+ if (page->mapping != mapping || page_offset(page) >= isize) { -+ unlock_page(page); -+ ret = VM_FAULT_NOPAGE; -+ goto out; -+ } -+ -+ len = min_t(loff_t, PAGE_SIZE, isize - page_offset(page)); -+ -+ if (bch2_page_reservation_get(c, inode, page, &res, 0, len, true)) { -+ unlock_page(page); -+ ret = VM_FAULT_SIGBUS; -+ goto out; -+ } -+ -+ bch2_set_page_dirty(c, inode, page, &res, 0, len); -+ bch2_page_reservation_put(c, inode, &res); -+ -+ wait_for_stable_page(page); -+out: -+ bch2_pagecache_add_put(&inode->ei_pagecache_lock); -+ sb_end_pagefault(inode->v.i_sb); -+ -+ return ret; -+} -+ -+void bch2_invalidatepage(struct page *page, unsigned int offset, -+ unsigned int length) -+{ -+ if (offset || length < PAGE_SIZE) -+ return; -+ -+ bch2_clear_page_bits(page); -+} -+ -+int bch2_releasepage(struct page *page, gfp_t gfp_mask) -+{ -+ if (PageDirty(page)) -+ return 0; -+ -+ bch2_clear_page_bits(page); -+ return 1; -+} -+ -+#ifdef CONFIG_MIGRATION -+int bch2_migrate_page(struct address_space *mapping, struct page *newpage, -+ struct page *page, enum migrate_mode mode) -+{ -+ int ret; -+ -+ EBUG_ON(!PageLocked(page)); -+ EBUG_ON(!PageLocked(newpage)); -+ -+ ret = migrate_page_move_mapping(mapping, newpage, page, 0); -+ if (ret != MIGRATEPAGE_SUCCESS) -+ return ret; -+ -+ if (PagePrivate(page)) { -+ ClearPagePrivate(page); -+ get_page(newpage); -+ set_page_private(newpage, page_private(page)); -+ set_page_private(page, 0); -+ put_page(page); -+ SetPagePrivate(newpage); -+ } -+ -+ if (mode != MIGRATE_SYNC_NO_COPY) -+ migrate_page_copy(newpage, page); -+ else -+ migrate_page_states(newpage, page); -+ return MIGRATEPAGE_SUCCESS; -+} -+#endif -+ -+/* readpage(s): */ -+ -+static void bch2_readpages_end_io(struct bio *bio) -+{ -+ struct bvec_iter_all iter; -+ struct bio_vec *bv; -+ -+ bio_for_each_segment_all(bv, bio, iter) { -+ struct page *page = bv->bv_page; -+ -+ if (!bio->bi_status) { -+ SetPageUptodate(page); -+ } 
else { -+ ClearPageUptodate(page); -+ SetPageError(page); -+ } -+ unlock_page(page); -+ } -+ -+ bio_put(bio); -+} -+ -+static inline void page_state_init_for_read(struct page *page) -+{ -+ SetPagePrivate(page); -+ page->private = 0; -+} -+ -+struct readpages_iter { -+ struct address_space *mapping; -+ struct page **pages; -+ unsigned nr_pages; -+ unsigned nr_added; -+ unsigned idx; -+ pgoff_t offset; -+}; -+ -+static int readpages_iter_init(struct readpages_iter *iter, -+ struct address_space *mapping, -+ struct list_head *pages, unsigned nr_pages) -+{ -+ memset(iter, 0, sizeof(*iter)); -+ -+ iter->mapping = mapping; -+ iter->offset = list_last_entry(pages, struct page, lru)->index; -+ -+ iter->pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS); -+ if (!iter->pages) -+ return -ENOMEM; -+ -+ while (!list_empty(pages)) { -+ struct page *page = list_last_entry(pages, struct page, lru); -+ -+ __bch2_page_state_create(page, __GFP_NOFAIL); -+ -+ iter->pages[iter->nr_pages++] = page; -+ list_del(&page->lru); -+ } -+ -+ return 0; -+} -+ -+static inline struct page *readpage_iter_next(struct readpages_iter *iter) -+{ -+ struct page *page; -+ unsigned i; -+ int ret; -+ -+ BUG_ON(iter->idx > iter->nr_added); -+ BUG_ON(iter->nr_added > iter->nr_pages); -+ -+ if (iter->idx < iter->nr_added) -+ goto out; -+ -+ while (1) { -+ if (iter->idx == iter->nr_pages) -+ return NULL; -+ -+ ret = add_to_page_cache_lru_vec(iter->mapping, -+ iter->pages + iter->nr_added, -+ iter->nr_pages - iter->nr_added, -+ iter->offset + iter->nr_added, -+ GFP_NOFS); -+ if (ret > 0) -+ break; -+ -+ page = iter->pages[iter->nr_added]; -+ iter->idx++; -+ iter->nr_added++; -+ -+ __bch2_page_state_release(page); -+ put_page(page); -+ } -+ -+ iter->nr_added += ret; -+ -+ for (i = iter->idx; i < iter->nr_added; i++) -+ put_page(iter->pages[i]); -+out: -+ EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx); -+ -+ return iter->pages[iter->idx]; -+} -+ -+static void 
bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) -+{ -+ struct bvec_iter iter; -+ struct bio_vec bv; -+ unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v -+ ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k); -+ unsigned state = k.k->type == KEY_TYPE_reservation -+ ? SECTOR_RESERVED -+ : SECTOR_ALLOCATED; -+ -+ bio_for_each_segment(bv, bio, iter) { -+ struct bch_page_state *s = bch2_page_state(bv.bv_page); -+ unsigned i; -+ -+ for (i = bv.bv_offset >> 9; -+ i < (bv.bv_offset + bv.bv_len) >> 9; -+ i++) { -+ s->s[i].nr_replicas = nr_ptrs; -+ s->s[i].state = state; -+ } -+ } -+} -+ -+static bool extent_partial_reads_expensive(struct bkey_s_c k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ struct bch_extent_crc_unpacked crc; -+ const union bch_extent_entry *i; -+ -+ bkey_for_each_crc(k.k, ptrs, crc, i) -+ if (crc.csum_type || crc.compression_type) -+ return true; -+ return false; -+} -+ -+static void readpage_bio_extend(struct readpages_iter *iter, -+ struct bio *bio, -+ unsigned sectors_this_extent, -+ bool get_more) -+{ -+ while (bio_sectors(bio) < sectors_this_extent && -+ bio->bi_vcnt < bio->bi_max_vecs) { -+ pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTOR_SHIFT; -+ struct page *page = readpage_iter_next(iter); -+ int ret; -+ -+ if (page) { -+ if (iter->offset + iter->idx != page_offset) -+ break; -+ -+ iter->idx++; -+ } else { -+ if (!get_more) -+ break; -+ -+ page = xa_load(&iter->mapping->i_pages, page_offset); -+ if (page && !xa_is_value(page)) -+ break; -+ -+ page = __page_cache_alloc(readahead_gfp_mask(iter->mapping)); -+ if (!page) -+ break; -+ -+ if (!__bch2_page_state_create(page, 0)) { -+ put_page(page); -+ break; -+ } -+ -+ ret = add_to_page_cache_lru(page, iter->mapping, -+ page_offset, GFP_NOFS); -+ if (ret) { -+ __bch2_page_state_release(page); -+ put_page(page); -+ break; -+ } -+ -+ put_page(page); -+ } -+ -+ BUG_ON(!bio_add_page(bio, page, PAGE_SIZE, 0)); -+ } -+} -+ -+static void bchfs_read(struct btree_trans *trans, struct 
btree_iter *iter, -+ struct bch_read_bio *rbio, u64 inum, -+ struct readpages_iter *readpages_iter) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_on_stack sk; -+ int flags = BCH_READ_RETRY_IF_STALE| -+ BCH_READ_MAY_PROMOTE; -+ int ret = 0; -+ -+ rbio->c = c; -+ rbio->start_time = local_clock(); -+ -+ bkey_on_stack_init(&sk); -+retry: -+ while (1) { -+ struct bkey_s_c k; -+ unsigned bytes, sectors, offset_into_extent; -+ -+ bch2_btree_iter_set_pos(iter, -+ POS(inum, rbio->bio.bi_iter.bi_sector)); -+ -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) -+ break; -+ -+ bkey_on_stack_reassemble(&sk, c, k); -+ k = bkey_i_to_s_c(sk.k); -+ -+ offset_into_extent = iter->pos.offset - -+ bkey_start_offset(k.k); -+ sectors = k.k->size - offset_into_extent; -+ -+ ret = bch2_read_indirect_extent(trans, -+ &offset_into_extent, &sk); -+ if (ret) -+ break; -+ -+ sectors = min(sectors, k.k->size - offset_into_extent); -+ -+ bch2_trans_unlock(trans); -+ -+ if (readpages_iter) -+ readpage_bio_extend(readpages_iter, &rbio->bio, sectors, -+ extent_partial_reads_expensive(k)); -+ -+ bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; -+ swap(rbio->bio.bi_iter.bi_size, bytes); -+ -+ if (rbio->bio.bi_iter.bi_size == bytes) -+ flags |= BCH_READ_LAST_FRAGMENT; -+ -+ if (bkey_extent_is_allocation(k.k)) -+ bch2_add_page_sectors(&rbio->bio, k); -+ -+ bch2_read_extent(c, rbio, k, offset_into_extent, flags); -+ -+ if (flags & BCH_READ_LAST_FRAGMENT) -+ break; -+ -+ swap(rbio->bio.bi_iter.bi_size, bytes); -+ bio_advance(&rbio->bio, bytes); -+ } -+ -+ if (ret == -EINTR) -+ goto retry; -+ -+ if (ret) { -+ bcache_io_error(c, &rbio->bio, "btree IO error %i", ret); -+ bio_endio(&rbio->bio); -+ } -+ -+ bkey_on_stack_exit(&sk, c); -+} -+ -+int bch2_readpages(struct file *file, struct address_space *mapping, -+ struct list_head *pages, unsigned nr_pages) -+{ -+ struct bch_inode_info *inode = to_bch_ei(mapping->host); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct 
bch_io_opts opts = io_opts(c, &inode->ei_inode); -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct page *page; -+ struct readpages_iter readpages_iter; -+ int ret; -+ -+ ret = readpages_iter_init(&readpages_iter, mapping, pages, nr_pages); -+ BUG_ON(ret); -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, -+ BTREE_ITER_SLOTS); -+ -+ bch2_pagecache_add_get(&inode->ei_pagecache_lock); -+ -+ while ((page = readpage_iter_next(&readpages_iter))) { -+ pgoff_t index = readpages_iter.offset + readpages_iter.idx; -+ unsigned n = min_t(unsigned, -+ readpages_iter.nr_pages - -+ readpages_iter.idx, -+ BIO_MAX_PAGES); -+ struct bch_read_bio *rbio = -+ rbio_init(bio_alloc_bioset(GFP_NOFS, n, &c->bio_read), -+ opts); -+ -+ readpages_iter.idx++; -+ -+ bio_set_op_attrs(&rbio->bio, REQ_OP_READ, 0); -+ rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTOR_SHIFT; -+ rbio->bio.bi_end_io = bch2_readpages_end_io; -+ BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); -+ -+ bchfs_read(&trans, iter, rbio, inode->v.i_ino, -+ &readpages_iter); -+ } -+ -+ bch2_pagecache_add_put(&inode->ei_pagecache_lock); -+ -+ bch2_trans_exit(&trans); -+ kfree(readpages_iter.pages); -+ -+ return 0; -+} -+ -+static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, -+ u64 inum, struct page *page) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ -+ bch2_page_state_create(page, __GFP_NOFAIL); -+ -+ bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC); -+ rbio->bio.bi_iter.bi_sector = -+ (sector_t) page->index << PAGE_SECTOR_SHIFT; -+ BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, -+ BTREE_ITER_SLOTS); -+ -+ bchfs_read(&trans, iter, rbio, inum, NULL); -+ -+ bch2_trans_exit(&trans); -+} -+ -+int bch2_readpage(struct file *file, struct page *page) -+{ -+ struct bch_inode_info *inode = 
to_bch_ei(page->mapping->host); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch_io_opts opts = io_opts(c, &inode->ei_inode); -+ struct bch_read_bio *rbio; -+ -+ rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), opts); -+ rbio->bio.bi_end_io = bch2_readpages_end_io; -+ -+ __bchfs_readpage(c, rbio, inode->v.i_ino, page); -+ return 0; -+} -+ -+static void bch2_read_single_page_end_io(struct bio *bio) -+{ -+ complete(bio->bi_private); -+} -+ -+static int bch2_read_single_page(struct page *page, -+ struct address_space *mapping) -+{ -+ struct bch_inode_info *inode = to_bch_ei(mapping->host); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch_read_bio *rbio; -+ int ret; -+ DECLARE_COMPLETION_ONSTACK(done); -+ -+ rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), -+ io_opts(c, &inode->ei_inode)); -+ rbio->bio.bi_private = &done; -+ rbio->bio.bi_end_io = bch2_read_single_page_end_io; -+ -+ __bchfs_readpage(c, rbio, inode->v.i_ino, page); -+ wait_for_completion(&done); -+ -+ ret = blk_status_to_errno(rbio->bio.bi_status); -+ bio_put(&rbio->bio); -+ -+ if (ret < 0) -+ return ret; -+ -+ SetPageUptodate(page); -+ return 0; -+} -+ -+/* writepages: */ -+ -+struct bch_writepage_state { -+ struct bch_writepage_io *io; -+ struct bch_io_opts opts; -+}; -+ -+static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c, -+ struct bch_inode_info *inode) -+{ -+ return (struct bch_writepage_state) { -+ .opts = io_opts(c, &inode->ei_inode) -+ }; -+} -+ -+static void bch2_writepage_io_free(struct closure *cl) -+{ -+ struct bch_writepage_io *io = container_of(cl, -+ struct bch_writepage_io, cl); -+ -+ bio_put(&io->op.wbio.bio); -+} -+ -+static void bch2_writepage_io_done(struct closure *cl) -+{ -+ struct bch_writepage_io *io = container_of(cl, -+ struct bch_writepage_io, cl); -+ struct bch_fs *c = io->op.c; -+ struct bio *bio = &io->op.wbio.bio; -+ struct bvec_iter_all iter; -+ struct bio_vec *bvec; -+ unsigned i; -+ -+ 
if (io->op.error) { -+ bio_for_each_segment_all(bvec, bio, iter) { -+ struct bch_page_state *s; -+ -+ SetPageError(bvec->bv_page); -+ mapping_set_error(bvec->bv_page->mapping, -EIO); -+ -+ s = __bch2_page_state(bvec->bv_page); -+ spin_lock(&s->lock); -+ for (i = 0; i < PAGE_SECTORS; i++) -+ s->s[i].nr_replicas = 0; -+ spin_unlock(&s->lock); -+ } -+ } -+ -+ if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { -+ bio_for_each_segment_all(bvec, bio, iter) { -+ struct bch_page_state *s; -+ -+ s = __bch2_page_state(bvec->bv_page); -+ spin_lock(&s->lock); -+ for (i = 0; i < PAGE_SECTORS; i++) -+ s->s[i].nr_replicas = 0; -+ spin_unlock(&s->lock); -+ } -+ } -+ -+ /* -+ * racing with fallocate can cause us to add fewer sectors than -+ * expected - but we shouldn't add more sectors than expected: -+ */ -+ BUG_ON(io->op.i_sectors_delta > 0); -+ -+ /* -+ * (error (due to going RO) halfway through a page can screw that up -+ * slightly) -+ * XXX wtf? -+ BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS); -+ */ -+ -+ /* -+ * PageWriteback is effectively our ref on the inode - fixup i_blocks -+ * before calling end_page_writeback: -+ */ -+ i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); -+ -+ bio_for_each_segment_all(bvec, bio, iter) { -+ struct bch_page_state *s = __bch2_page_state(bvec->bv_page); -+ -+ if (atomic_dec_and_test(&s->write_count)) -+ end_page_writeback(bvec->bv_page); -+ } -+ -+ closure_return_with_destructor(&io->cl, bch2_writepage_io_free); -+} -+ -+static void bch2_writepage_do_io(struct bch_writepage_state *w) -+{ -+ struct bch_writepage_io *io = w->io; -+ -+ w->io = NULL; -+ closure_call(&io->op.cl, bch2_write, NULL, &io->cl); -+ continue_at(&io->cl, bch2_writepage_io_done, NULL); -+} -+ -+/* -+ * Get a bch_writepage_io and add @page to it - appending to an existing one if -+ * possible, else allocating a new one: -+ */ -+static void bch2_writepage_io_alloc(struct bch_fs *c, -+ struct writeback_control *wbc, -+ struct bch_writepage_state *w, -+ struct 
bch_inode_info *inode, -+ u64 sector, -+ unsigned nr_replicas) -+{ -+ struct bch_write_op *op; -+ -+ w->io = container_of(bio_alloc_bioset(GFP_NOFS, -+ BIO_MAX_PAGES, -+ &c->writepage_bioset), -+ struct bch_writepage_io, op.wbio.bio); -+ -+ closure_init(&w->io->cl, NULL); -+ w->io->inode = inode; -+ -+ op = &w->io->op; -+ bch2_write_op_init(op, c, w->opts); -+ op->target = w->opts.foreground_target; -+ op_journal_seq_set(op, &inode->ei_journal_seq); -+ op->nr_replicas = nr_replicas; -+ op->res.nr_replicas = nr_replicas; -+ op->write_point = writepoint_hashed(inode->ei_last_dirtied); -+ op->pos = POS(inode->v.i_ino, sector); -+ op->wbio.bio.bi_iter.bi_sector = sector; -+ op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); -+} -+ -+static int __bch2_writepage(struct page *page, -+ struct writeback_control *wbc, -+ void *data) -+{ -+ struct bch_inode_info *inode = to_bch_ei(page->mapping->host); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch_writepage_state *w = data; -+ struct bch_page_state *s, orig; -+ unsigned i, offset, nr_replicas_this_write = U32_MAX; -+ loff_t i_size = i_size_read(&inode->v); -+ pgoff_t end_index = i_size >> PAGE_SHIFT; -+ int ret; -+ -+ EBUG_ON(!PageUptodate(page)); -+ -+ /* Is the page fully inside i_size? */ -+ if (page->index < end_index) -+ goto do_io; -+ -+ /* Is the page fully outside i_size? (truncate in progress) */ -+ offset = i_size & (PAGE_SIZE - 1); -+ if (page->index > end_index || !offset) { -+ unlock_page(page); -+ return 0; -+ } -+ -+ /* -+ * The page straddles i_size. It must be zeroed out on each and every -+ * writepage invocation because it may be mmapped. "A file is mapped -+ * in multiples of the page size. For a file that is not a multiple of -+ * the page size, the remaining memory is zeroed when mapped, and -+ * writes to that region are not written out to the file." 
-+ */ -+ zero_user_segment(page, offset, PAGE_SIZE); -+do_io: -+ s = bch2_page_state_create(page, __GFP_NOFAIL); -+ -+ ret = bch2_get_page_disk_reservation(c, inode, page, true); -+ if (ret) { -+ SetPageError(page); -+ mapping_set_error(page->mapping, ret); -+ unlock_page(page); -+ return 0; -+ } -+ -+ /* Before unlocking the page, get copy of reservations: */ -+ orig = *s; -+ -+ for (i = 0; i < PAGE_SECTORS; i++) { -+ if (s->s[i].state < SECTOR_DIRTY) -+ continue; -+ -+ nr_replicas_this_write = -+ min_t(unsigned, nr_replicas_this_write, -+ s->s[i].nr_replicas + -+ s->s[i].replicas_reserved); -+ } -+ -+ for (i = 0; i < PAGE_SECTORS; i++) { -+ if (s->s[i].state < SECTOR_DIRTY) -+ continue; -+ -+ s->s[i].nr_replicas = w->opts.compression -+ ? 0 : nr_replicas_this_write; -+ -+ s->s[i].replicas_reserved = 0; -+ s->s[i].state = SECTOR_ALLOCATED; -+ } -+ -+ BUG_ON(atomic_read(&s->write_count)); -+ atomic_set(&s->write_count, 1); -+ -+ BUG_ON(PageWriteback(page)); -+ set_page_writeback(page); -+ -+ unlock_page(page); -+ -+ offset = 0; -+ while (1) { -+ unsigned sectors = 1, dirty_sectors = 0, reserved_sectors = 0; -+ u64 sector; -+ -+ while (offset < PAGE_SECTORS && -+ orig.s[offset].state < SECTOR_DIRTY) -+ offset++; -+ -+ if (offset == PAGE_SECTORS) -+ break; -+ -+ sector = ((u64) page->index << PAGE_SECTOR_SHIFT) + offset; -+ -+ while (offset + sectors < PAGE_SECTORS && -+ orig.s[offset + sectors].state >= SECTOR_DIRTY) -+ sectors++; -+ -+ for (i = offset; i < offset + sectors; i++) { -+ reserved_sectors += orig.s[i].replicas_reserved; -+ dirty_sectors += orig.s[i].state == SECTOR_DIRTY; -+ } -+ -+ if (w->io && -+ (w->io->op.res.nr_replicas != nr_replicas_this_write || -+ bio_full(&w->io->op.wbio.bio, PAGE_SIZE) || -+ w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >= -+ (BIO_MAX_PAGES * PAGE_SIZE) || -+ bio_end_sector(&w->io->op.wbio.bio) != sector)) -+ bch2_writepage_do_io(w); -+ -+ if (!w->io) -+ bch2_writepage_io_alloc(c, wbc, w, inode, sector, -+ 
nr_replicas_this_write); -+ -+ atomic_inc(&s->write_count); -+ -+ BUG_ON(inode != w->io->inode); -+ BUG_ON(!bio_add_page(&w->io->op.wbio.bio, page, -+ sectors << 9, offset << 9)); -+ -+ /* Check for writing past i_size: */ -+ WARN_ON((bio_end_sector(&w->io->op.wbio.bio) << 9) > -+ round_up(i_size, block_bytes(c))); -+ -+ w->io->op.res.sectors += reserved_sectors; -+ w->io->op.i_sectors_delta -= dirty_sectors; -+ w->io->op.new_i_size = i_size; -+ -+ offset += sectors; -+ } -+ -+ if (atomic_dec_and_test(&s->write_count)) -+ end_page_writeback(page); -+ -+ return 0; -+} -+ -+int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc) -+{ -+ struct bch_fs *c = mapping->host->i_sb->s_fs_info; -+ struct bch_writepage_state w = -+ bch_writepage_state_init(c, to_bch_ei(mapping->host)); -+ struct blk_plug plug; -+ int ret; -+ -+ blk_start_plug(&plug); -+ ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w); -+ if (w.io) -+ bch2_writepage_do_io(&w); -+ blk_finish_plug(&plug); -+ return ret; -+} -+ -+int bch2_writepage(struct page *page, struct writeback_control *wbc) -+{ -+ struct bch_fs *c = page->mapping->host->i_sb->s_fs_info; -+ struct bch_writepage_state w = -+ bch_writepage_state_init(c, to_bch_ei(page->mapping->host)); -+ int ret; -+ -+ ret = __bch2_writepage(page, wbc, &w); -+ if (w.io) -+ bch2_writepage_do_io(&w); -+ -+ return ret; -+} -+ -+/* buffered writes: */ -+ -+int bch2_write_begin(struct file *file, struct address_space *mapping, -+ loff_t pos, unsigned len, unsigned flags, -+ struct page **pagep, void **fsdata) -+{ -+ struct bch_inode_info *inode = to_bch_ei(mapping->host); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch2_page_reservation *res; -+ pgoff_t index = pos >> PAGE_SHIFT; -+ unsigned offset = pos & (PAGE_SIZE - 1); -+ struct page *page; -+ int ret = -ENOMEM; -+ -+ res = kmalloc(sizeof(*res), GFP_KERNEL); -+ if (!res) -+ return -ENOMEM; -+ -+ bch2_page_reservation_init(c, inode, res); -+ *fsdata = res; -+ 
-+ bch2_pagecache_add_get(&inode->ei_pagecache_lock); -+ -+ page = grab_cache_page_write_begin(mapping, index, flags); -+ if (!page) -+ goto err_unlock; -+ -+ if (PageUptodate(page)) -+ goto out; -+ -+ /* If we're writing entire page, don't need to read it in first: */ -+ if (len == PAGE_SIZE) -+ goto out; -+ -+ if (!offset && pos + len >= inode->v.i_size) { -+ zero_user_segment(page, len, PAGE_SIZE); -+ flush_dcache_page(page); -+ goto out; -+ } -+ -+ if (index > inode->v.i_size >> PAGE_SHIFT) { -+ zero_user_segments(page, 0, offset, offset + len, PAGE_SIZE); -+ flush_dcache_page(page); -+ goto out; -+ } -+readpage: -+ ret = bch2_read_single_page(page, mapping); -+ if (ret) -+ goto err; -+out: -+ ret = bch2_page_reservation_get(c, inode, page, res, -+ offset, len, true); -+ if (ret) { -+ if (!PageUptodate(page)) { -+ /* -+ * If the page hasn't been read in, we won't know if we -+ * actually need a reservation - we don't actually need -+ * to read here, we just need to check if the page is -+ * fully backed by uncompressed data: -+ */ -+ goto readpage; -+ } -+ -+ goto err; -+ } -+ -+ *pagep = page; -+ return 0; -+err: -+ unlock_page(page); -+ put_page(page); -+ *pagep = NULL; -+err_unlock: -+ bch2_pagecache_add_put(&inode->ei_pagecache_lock); -+ kfree(res); -+ *fsdata = NULL; -+ return ret; -+} -+ -+int bch2_write_end(struct file *file, struct address_space *mapping, -+ loff_t pos, unsigned len, unsigned copied, -+ struct page *page, void *fsdata) -+{ -+ struct bch_inode_info *inode = to_bch_ei(mapping->host); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch2_page_reservation *res = fsdata; -+ unsigned offset = pos & (PAGE_SIZE - 1); -+ -+ lockdep_assert_held(&inode->v.i_rwsem); -+ -+ if (unlikely(copied < len && !PageUptodate(page))) { -+ /* -+ * The page needs to be read in, but that would destroy -+ * our partial write - simplest thing is to just force -+ * userspace to redo the write: -+ */ -+ zero_user(page, 0, PAGE_SIZE); -+ 
flush_dcache_page(page); -+ copied = 0; -+ } -+ -+ spin_lock(&inode->v.i_lock); -+ if (pos + copied > inode->v.i_size) -+ i_size_write(&inode->v, pos + copied); -+ spin_unlock(&inode->v.i_lock); -+ -+ if (copied) { -+ if (!PageUptodate(page)) -+ SetPageUptodate(page); -+ -+ bch2_set_page_dirty(c, inode, page, res, offset, copied); -+ -+ inode->ei_last_dirtied = (unsigned long) current; -+ } -+ -+ unlock_page(page); -+ put_page(page); -+ bch2_pagecache_add_put(&inode->ei_pagecache_lock); -+ -+ bch2_page_reservation_put(c, inode, res); -+ kfree(res); -+ -+ return copied; -+} -+ -+#define WRITE_BATCH_PAGES 32 -+ -+static int __bch2_buffered_write(struct bch_inode_info *inode, -+ struct address_space *mapping, -+ struct iov_iter *iter, -+ loff_t pos, unsigned len) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct page *pages[WRITE_BATCH_PAGES]; -+ struct bch2_page_reservation res; -+ unsigned long index = pos >> PAGE_SHIFT; -+ unsigned offset = pos & (PAGE_SIZE - 1); -+ unsigned nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); -+ unsigned i, reserved = 0, set_dirty = 0; -+ unsigned copied = 0, nr_pages_copied = 0; -+ int ret = 0; -+ -+ BUG_ON(!len); -+ BUG_ON(nr_pages > ARRAY_SIZE(pages)); -+ -+ bch2_page_reservation_init(c, inode, &res); -+ -+ for (i = 0; i < nr_pages; i++) { -+ pages[i] = grab_cache_page_write_begin(mapping, index + i, 0); -+ if (!pages[i]) { -+ nr_pages = i; -+ if (!i) { -+ ret = -ENOMEM; -+ goto out; -+ } -+ len = min_t(unsigned, len, -+ nr_pages * PAGE_SIZE - offset); -+ break; -+ } -+ } -+ -+ if (offset && !PageUptodate(pages[0])) { -+ ret = bch2_read_single_page(pages[0], mapping); -+ if (ret) -+ goto out; -+ } -+ -+ if ((pos + len) & (PAGE_SIZE - 1) && -+ !PageUptodate(pages[nr_pages - 1])) { -+ if ((index + nr_pages - 1) << PAGE_SHIFT >= inode->v.i_size) { -+ zero_user(pages[nr_pages - 1], 0, PAGE_SIZE); -+ } else { -+ ret = bch2_read_single_page(pages[nr_pages - 1], mapping); -+ if (ret) -+ goto out; -+ } -+ } -+ -+ while 
(reserved < len) { -+ struct page *page = pages[(offset + reserved) >> PAGE_SHIFT]; -+ unsigned pg_offset = (offset + reserved) & (PAGE_SIZE - 1); -+ unsigned pg_len = min_t(unsigned, len - reserved, -+ PAGE_SIZE - pg_offset); -+retry_reservation: -+ ret = bch2_page_reservation_get(c, inode, page, &res, -+ pg_offset, pg_len, true); -+ -+ if (ret && !PageUptodate(page)) { -+ ret = bch2_read_single_page(page, mapping); -+ if (!ret) -+ goto retry_reservation; -+ } -+ -+ if (ret) -+ goto out; -+ -+ reserved += pg_len; -+ } -+ -+ if (mapping_writably_mapped(mapping)) -+ for (i = 0; i < nr_pages; i++) -+ flush_dcache_page(pages[i]); -+ -+ while (copied < len) { -+ struct page *page = pages[(offset + copied) >> PAGE_SHIFT]; -+ unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1); -+ unsigned pg_len = min_t(unsigned, len - copied, -+ PAGE_SIZE - pg_offset); -+ unsigned pg_copied = iov_iter_copy_from_user_atomic(page, -+ iter, pg_offset, pg_len); -+ -+ if (!pg_copied) -+ break; -+ -+ flush_dcache_page(page); -+ iov_iter_advance(iter, pg_copied); -+ copied += pg_copied; -+ } -+ -+ if (!copied) -+ goto out; -+ -+ if (copied < len && -+ ((offset + copied) & (PAGE_SIZE - 1))) { -+ struct page *page = pages[(offset + copied) >> PAGE_SHIFT]; -+ -+ if (!PageUptodate(page)) { -+ zero_user(page, 0, PAGE_SIZE); -+ copied -= (offset + copied) & (PAGE_SIZE - 1); -+ } -+ } -+ -+ spin_lock(&inode->v.i_lock); -+ if (pos + copied > inode->v.i_size) -+ i_size_write(&inode->v, pos + copied); -+ spin_unlock(&inode->v.i_lock); -+ -+ while (set_dirty < copied) { -+ struct page *page = pages[(offset + set_dirty) >> PAGE_SHIFT]; -+ unsigned pg_offset = (offset + set_dirty) & (PAGE_SIZE - 1); -+ unsigned pg_len = min_t(unsigned, copied - set_dirty, -+ PAGE_SIZE - pg_offset); -+ -+ if (!PageUptodate(page)) -+ SetPageUptodate(page); -+ -+ bch2_set_page_dirty(c, inode, page, &res, pg_offset, pg_len); -+ unlock_page(page); -+ put_page(page); -+ -+ set_dirty += pg_len; -+ } -+ -+ nr_pages_copied = 
DIV_ROUND_UP(offset + copied, PAGE_SIZE); -+ inode->ei_last_dirtied = (unsigned long) current; -+out: -+ for (i = nr_pages_copied; i < nr_pages; i++) { -+ unlock_page(pages[i]); -+ put_page(pages[i]); -+ } -+ -+ bch2_page_reservation_put(c, inode, &res); -+ -+ return copied ?: ret; -+} -+ -+static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) -+{ -+ struct file *file = iocb->ki_filp; -+ struct address_space *mapping = file->f_mapping; -+ struct bch_inode_info *inode = file_bch_inode(file); -+ loff_t pos = iocb->ki_pos; -+ ssize_t written = 0; -+ int ret = 0; -+ -+ bch2_pagecache_add_get(&inode->ei_pagecache_lock); -+ -+ do { -+ unsigned offset = pos & (PAGE_SIZE - 1); -+ unsigned bytes = min_t(unsigned long, iov_iter_count(iter), -+ PAGE_SIZE * WRITE_BATCH_PAGES - offset); -+again: -+ /* -+ * Bring in the user page that we will copy from _first_. -+ * Otherwise there's a nasty deadlock on copying from the -+ * same page as we're writing to, without it being marked -+ * up-to-date. -+ * -+ * Not only is this an optimisation, but it is also required -+ * to check that the address is actually valid, when atomic -+ * usercopies are used, below. -+ */ -+ if (unlikely(iov_iter_fault_in_readable(iter, bytes))) { -+ bytes = min_t(unsigned long, iov_iter_count(iter), -+ PAGE_SIZE - offset); -+ -+ if (unlikely(iov_iter_fault_in_readable(iter, bytes))) { -+ ret = -EFAULT; -+ break; -+ } -+ } -+ -+ if (unlikely(fatal_signal_pending(current))) { -+ ret = -EINTR; -+ break; -+ } -+ -+ ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); -+ if (unlikely(ret < 0)) -+ break; -+ -+ cond_resched(); -+ -+ if (unlikely(ret == 0)) { -+ /* -+ * If we were unable to copy any data at all, we must -+ * fall back to a single segment length write. -+ * -+ * If we didn't fallback here, we could livelock -+ * because not all segments in the iov can be copied at -+ * once without a pagefault. 
-+ */ -+ bytes = min_t(unsigned long, PAGE_SIZE - offset, -+ iov_iter_single_seg_count(iter)); -+ goto again; -+ } -+ pos += ret; -+ written += ret; -+ -+ balance_dirty_pages_ratelimited(mapping); -+ } while (iov_iter_count(iter)); -+ -+ bch2_pagecache_add_put(&inode->ei_pagecache_lock); -+ -+ return written ? written : ret; -+} -+ -+/* O_DIRECT reads */ -+ -+static void bch2_dio_read_complete(struct closure *cl) -+{ -+ struct dio_read *dio = container_of(cl, struct dio_read, cl); -+ -+ dio->req->ki_complete(dio->req, dio->ret, 0); -+ bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */ -+} -+ -+static void bch2_direct_IO_read_endio(struct bio *bio) -+{ -+ struct dio_read *dio = bio->bi_private; -+ -+ if (bio->bi_status) -+ dio->ret = blk_status_to_errno(bio->bi_status); -+ -+ closure_put(&dio->cl); -+} -+ -+static void bch2_direct_IO_read_split_endio(struct bio *bio) -+{ -+ bch2_direct_IO_read_endio(bio); -+ bio_check_pages_dirty(bio); /* transfers ownership */ -+} -+ -+static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) -+{ -+ struct file *file = req->ki_filp; -+ struct bch_inode_info *inode = file_bch_inode(file); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch_io_opts opts = io_opts(c, &inode->ei_inode); -+ struct dio_read *dio; -+ struct bio *bio; -+ loff_t offset = req->ki_pos; -+ bool sync = is_sync_kiocb(req); -+ size_t shorten; -+ ssize_t ret; -+ -+ if ((offset|iter->count) & (block_bytes(c) - 1)) -+ return -EINVAL; -+ -+ ret = min_t(loff_t, iter->count, -+ max_t(loff_t, 0, i_size_read(&inode->v) - offset)); -+ -+ if (!ret) -+ return ret; -+ -+ shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c)); -+ iter->count -= shorten; -+ -+ bio = bio_alloc_bioset(GFP_KERNEL, -+ iov_iter_npages(iter, BIO_MAX_PAGES), -+ &c->dio_read_bioset); -+ -+ bio->bi_end_io = bch2_direct_IO_read_endio; -+ -+ dio = container_of(bio, struct dio_read, rbio.bio); -+ closure_init(&dio->cl, NULL); -+ -+ /* -+ * this is a _really_ 
horrible hack just to avoid an atomic sub at the -+ * end: -+ */ -+ if (!sync) { -+ set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL); -+ atomic_set(&dio->cl.remaining, -+ CLOSURE_REMAINING_INITIALIZER - -+ CLOSURE_RUNNING + -+ CLOSURE_DESTRUCTOR); -+ } else { -+ atomic_set(&dio->cl.remaining, -+ CLOSURE_REMAINING_INITIALIZER + 1); -+ } -+ -+ dio->req = req; -+ dio->ret = ret; -+ -+ goto start; -+ while (iter->count) { -+ bio = bio_alloc_bioset(GFP_KERNEL, -+ iov_iter_npages(iter, BIO_MAX_PAGES), -+ &c->bio_read); -+ bio->bi_end_io = bch2_direct_IO_read_split_endio; -+start: -+ bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC); -+ bio->bi_iter.bi_sector = offset >> 9; -+ bio->bi_private = dio; -+ -+ ret = bio_iov_iter_get_pages(bio, iter); -+ if (ret < 0) { -+ /* XXX: fault inject this path */ -+ bio->bi_status = BLK_STS_RESOURCE; -+ bio_endio(bio); -+ break; -+ } -+ -+ offset += bio->bi_iter.bi_size; -+ bio_set_pages_dirty(bio); -+ -+ if (iter->count) -+ closure_get(&dio->cl); -+ -+ bch2_read(c, rbio_init(bio, opts), inode->v.i_ino); -+ } -+ -+ iter->count += shorten; -+ -+ if (sync) { -+ closure_sync(&dio->cl); -+ closure_debug_destroy(&dio->cl); -+ ret = dio->ret; -+ bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */ -+ return ret; -+ } else { -+ return -EIOCBQUEUED; -+ } -+} -+ -+ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) -+{ -+ struct file *file = iocb->ki_filp; -+ struct bch_inode_info *inode = file_bch_inode(file); -+ struct address_space *mapping = file->f_mapping; -+ size_t count = iov_iter_count(iter); -+ ssize_t ret; -+ -+ if (!count) -+ return 0; /* skip atime */ -+ -+ if (iocb->ki_flags & IOCB_DIRECT) { -+ struct blk_plug plug; -+ -+ ret = filemap_write_and_wait_range(mapping, -+ iocb->ki_pos, -+ iocb->ki_pos + count - 1); -+ if (ret < 0) -+ return ret; -+ -+ file_accessed(file); -+ -+ blk_start_plug(&plug); -+ ret = bch2_direct_IO_read(iocb, iter); -+ blk_finish_plug(&plug); -+ -+ if (ret >= 0) -+ iocb->ki_pos += 
ret; -+ } else { -+ bch2_pagecache_add_get(&inode->ei_pagecache_lock); -+ ret = generic_file_read_iter(iocb, iter); -+ bch2_pagecache_add_put(&inode->ei_pagecache_lock); -+ } -+ -+ return ret; -+} -+ -+/* O_DIRECT writes */ -+ -+static void bch2_dio_write_loop_async(struct bch_write_op *); -+ -+static long bch2_dio_write_loop(struct dio_write *dio) -+{ -+ bool kthread = (current->flags & PF_KTHREAD) != 0; -+ struct kiocb *req = dio->req; -+ struct address_space *mapping = req->ki_filp->f_mapping; -+ struct bch_inode_info *inode = file_bch_inode(req->ki_filp); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bio *bio = &dio->op.wbio.bio; -+ struct bvec_iter_all iter; -+ struct bio_vec *bv; -+ unsigned unaligned; -+ bool sync = dio->sync; -+ long ret; -+ -+ if (dio->loop) -+ goto loop; -+ -+ while (1) { -+ if (kthread) -+ use_mm(dio->mm); -+ BUG_ON(current->faults_disabled_mapping); -+ current->faults_disabled_mapping = mapping; -+ -+ ret = bio_iov_iter_get_pages(bio, &dio->iter); -+ -+ current->faults_disabled_mapping = NULL; -+ if (kthread) -+ unuse_mm(dio->mm); -+ -+ if (unlikely(ret < 0)) -+ goto err; -+ -+ unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1); -+ bio->bi_iter.bi_size -= unaligned; -+ iov_iter_revert(&dio->iter, unaligned); -+ -+ if (!bio->bi_iter.bi_size) { -+ /* -+ * bio_iov_iter_get_pages was only able to get < -+ * blocksize worth of pages: -+ */ -+ bio_for_each_segment_all(bv, bio, iter) -+ put_page(bv->bv_page); -+ ret = -EFAULT; -+ goto err; -+ } -+ -+ bch2_write_op_init(&dio->op, c, io_opts(c, &inode->ei_inode)); -+ dio->op.end_io = bch2_dio_write_loop_async; -+ dio->op.target = dio->op.opts.foreground_target; -+ op_journal_seq_set(&dio->op, &inode->ei_journal_seq); -+ dio->op.write_point = writepoint_hashed((unsigned long) current); -+ dio->op.nr_replicas = dio->op.opts.data_replicas; -+ dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); -+ -+ if ((req->ki_flags & IOCB_DSYNC) && -+ !c->opts.journal_flush_disabled) -+ 
dio->op.flags |= BCH_WRITE_FLUSH; -+ -+ ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), -+ dio->op.opts.data_replicas, 0); -+ if (unlikely(ret) && -+ !bch2_check_range_allocated(c, dio->op.pos, -+ bio_sectors(bio), dio->op.opts.data_replicas)) -+ goto err; -+ -+ task_io_account_write(bio->bi_iter.bi_size); -+ -+ if (!dio->sync && !dio->loop && dio->iter.count) { -+ struct iovec *iov = dio->inline_vecs; -+ -+ if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { -+ iov = kmalloc(dio->iter.nr_segs * sizeof(*iov), -+ GFP_KERNEL); -+ if (unlikely(!iov)) { -+ dio->sync = sync = true; -+ goto do_io; -+ } -+ -+ dio->free_iov = true; -+ } -+ -+ memcpy(iov, dio->iter.iov, dio->iter.nr_segs * sizeof(*iov)); -+ dio->iter.iov = iov; -+ } -+do_io: -+ dio->loop = true; -+ closure_call(&dio->op.cl, bch2_write, NULL, NULL); -+ -+ if (sync) -+ wait_for_completion(&dio->done); -+ else -+ return -EIOCBQUEUED; -+loop: -+ i_sectors_acct(c, inode, &dio->quota_res, -+ dio->op.i_sectors_delta); -+ req->ki_pos += (u64) dio->op.written << 9; -+ dio->written += dio->op.written; -+ -+ spin_lock(&inode->v.i_lock); -+ if (req->ki_pos > inode->v.i_size) -+ i_size_write(&inode->v, req->ki_pos); -+ spin_unlock(&inode->v.i_lock); -+ -+ bio_for_each_segment_all(bv, bio, iter) -+ put_page(bv->bv_page); -+ if (!dio->iter.count || dio->op.error) -+ break; -+ -+ bio_reset(bio); -+ reinit_completion(&dio->done); -+ } -+ -+ ret = dio->op.error ?: ((long) dio->written << 9); -+err: -+ bch2_pagecache_block_put(&inode->ei_pagecache_lock); -+ bch2_quota_reservation_put(c, inode, &dio->quota_res); -+ -+ if (dio->free_iov) -+ kfree(dio->iter.iov); -+ -+ bio_put(bio); -+ -+ /* inode->i_dio_count is our ref on inode and thus bch_fs */ -+ inode_dio_end(&inode->v); -+ -+ if (!sync) { -+ req->ki_complete(req, ret, 0); -+ ret = -EIOCBQUEUED; -+ } -+ return ret; -+} -+ -+static void bch2_dio_write_loop_async(struct bch_write_op *op) -+{ -+ struct dio_write *dio = container_of(op, struct dio_write, 
op); -+ -+ if (dio->sync) -+ complete(&dio->done); -+ else -+ bch2_dio_write_loop(dio); -+} -+ -+static noinline -+ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) -+{ -+ struct file *file = req->ki_filp; -+ struct address_space *mapping = file->f_mapping; -+ struct bch_inode_info *inode = file_bch_inode(file); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct dio_write *dio; -+ struct bio *bio; -+ bool locked = true, extending; -+ ssize_t ret; -+ -+ prefetch(&c->opts); -+ prefetch((void *) &c->opts + 64); -+ prefetch(&inode->ei_inode); -+ prefetch((void *) &inode->ei_inode + 64); -+ -+ inode_lock(&inode->v); -+ -+ ret = generic_write_checks(req, iter); -+ if (unlikely(ret <= 0)) -+ goto err; -+ -+ ret = file_remove_privs(file); -+ if (unlikely(ret)) -+ goto err; -+ -+ ret = file_update_time(file); -+ if (unlikely(ret)) -+ goto err; -+ -+ if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) -+ goto err; -+ -+ inode_dio_begin(&inode->v); -+ bch2_pagecache_block_get(&inode->ei_pagecache_lock); -+ -+ extending = req->ki_pos + iter->count > inode->v.i_size; -+ if (!extending) { -+ inode_unlock(&inode->v); -+ locked = false; -+ } -+ -+ bio = bio_alloc_bioset(GFP_KERNEL, -+ iov_iter_npages(iter, BIO_MAX_PAGES), -+ &c->dio_write_bioset); -+ dio = container_of(bio, struct dio_write, op.wbio.bio); -+ init_completion(&dio->done); -+ dio->req = req; -+ dio->mm = current->mm; -+ dio->loop = false; -+ dio->sync = is_sync_kiocb(req) || extending; -+ dio->free_iov = false; -+ dio->quota_res.sectors = 0; -+ dio->written = 0; -+ dio->iter = *iter; -+ -+ ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, -+ iter->count >> 9, true); -+ if (unlikely(ret)) -+ goto err_put_bio; -+ -+ ret = write_invalidate_inode_pages_range(mapping, -+ req->ki_pos, -+ req->ki_pos + iter->count - 1); -+ if (unlikely(ret)) -+ goto err_put_bio; -+ -+ ret = bch2_dio_write_loop(dio); -+err: -+ if (locked) -+ inode_unlock(&inode->v); -+ return ret; -+err_put_bio: 
-+ bch2_pagecache_block_put(&inode->ei_pagecache_lock); -+ bch2_quota_reservation_put(c, inode, &dio->quota_res); -+ bio_put(bio); -+ inode_dio_end(&inode->v); -+ goto err; -+} -+ -+ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) -+{ -+ struct file *file = iocb->ki_filp; -+ struct bch_inode_info *inode = file_bch_inode(file); -+ ssize_t ret; -+ -+ if (iocb->ki_flags & IOCB_DIRECT) -+ return bch2_direct_write(iocb, from); -+ -+ /* We can write back this queue in page reclaim */ -+ current->backing_dev_info = inode_to_bdi(&inode->v); -+ inode_lock(&inode->v); -+ -+ ret = generic_write_checks(iocb, from); -+ if (ret <= 0) -+ goto unlock; -+ -+ ret = file_remove_privs(file); -+ if (ret) -+ goto unlock; -+ -+ ret = file_update_time(file); -+ if (ret) -+ goto unlock; -+ -+ ret = bch2_buffered_write(iocb, from); -+ if (likely(ret > 0)) -+ iocb->ki_pos += ret; -+unlock: -+ inode_unlock(&inode->v); -+ current->backing_dev_info = NULL; -+ -+ if (ret > 0) -+ ret = generic_write_sync(iocb, ret); -+ -+ return ret; -+} -+ -+/* fsync: */ -+ -+int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) -+{ -+ struct bch_inode_info *inode = file_bch_inode(file); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ int ret, ret2; -+ -+ ret = file_write_and_wait_range(file, start, end); -+ if (ret) -+ return ret; -+ -+ if (datasync && !(inode->v.i_state & I_DIRTY_DATASYNC)) -+ goto out; -+ -+ ret = sync_inode_metadata(&inode->v, 1); -+ if (ret) -+ return ret; -+out: -+ if (!c->opts.journal_flush_disabled) -+ ret = bch2_journal_flush_seq(&c->journal, -+ inode->ei_journal_seq); -+ ret2 = file_check_and_advance_wb_err(file); -+ -+ return ret ?: ret2; -+} -+ -+/* truncate: */ -+ -+static inline int range_has_data(struct bch_fs *c, -+ struct bpos start, -+ struct bpos end) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, 
BTREE_ID_EXTENTS, start, 0, k, ret) { -+ if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) -+ break; -+ -+ if (bkey_extent_is_data(k.k)) { -+ ret = 1; -+ break; -+ } -+ } -+ -+ return bch2_trans_exit(&trans) ?: ret; -+} -+ -+static int __bch2_truncate_page(struct bch_inode_info *inode, -+ pgoff_t index, loff_t start, loff_t end) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct address_space *mapping = inode->v.i_mapping; -+ struct bch_page_state *s; -+ unsigned start_offset = start & (PAGE_SIZE - 1); -+ unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1; -+ unsigned i; -+ struct page *page; -+ int ret = 0; -+ -+ /* Page boundary? Nothing to do */ -+ if (!((index == start >> PAGE_SHIFT && start_offset) || -+ (index == end >> PAGE_SHIFT && end_offset != PAGE_SIZE))) -+ return 0; -+ -+ /* Above i_size? */ -+ if (index << PAGE_SHIFT >= inode->v.i_size) -+ return 0; -+ -+ page = find_lock_page(mapping, index); -+ if (!page) { -+ /* -+ * XXX: we're doing two index lookups when we end up reading the -+ * page -+ */ -+ ret = range_has_data(c, -+ POS(inode->v.i_ino, index << PAGE_SECTOR_SHIFT), -+ POS(inode->v.i_ino, (index + 1) << PAGE_SECTOR_SHIFT)); -+ if (ret <= 0) -+ return ret; -+ -+ page = find_or_create_page(mapping, index, GFP_KERNEL); -+ if (unlikely(!page)) { -+ ret = -ENOMEM; -+ goto out; -+ } -+ } -+ -+ s = bch2_page_state_create(page, 0); -+ if (!s) { -+ ret = -ENOMEM; -+ goto unlock; -+ } -+ -+ if (!PageUptodate(page)) { -+ ret = bch2_read_single_page(page, mapping); -+ if (ret) -+ goto unlock; -+ } -+ -+ if (index != start >> PAGE_SHIFT) -+ start_offset = 0; -+ if (index != end >> PAGE_SHIFT) -+ end_offset = PAGE_SIZE; -+ -+ for (i = round_up(start_offset, block_bytes(c)) >> 9; -+ i < round_down(end_offset, block_bytes(c)) >> 9; -+ i++) { -+ s->s[i].nr_replicas = 0; -+ s->s[i].state = SECTOR_UNALLOCATED; -+ } -+ -+ zero_user_segment(page, start_offset, end_offset); -+ -+ /* -+ * Bit of a hack - we don't want truncate to fail due to -ENOSPC. 
-+ * -+ * XXX: because we aren't currently tracking whether the page has actual -+ * data in it (vs. just 0s, or only partially written) this wrong. ick. -+ */ -+ ret = bch2_get_page_disk_reservation(c, inode, page, false); -+ BUG_ON(ret); -+ -+ __set_page_dirty_nobuffers(page); -+unlock: -+ unlock_page(page); -+ put_page(page); -+out: -+ return ret; -+} -+ -+static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from) -+{ -+ return __bch2_truncate_page(inode, from >> PAGE_SHIFT, -+ from, round_up(from, PAGE_SIZE)); -+} -+ -+static int bch2_extend(struct bch_inode_info *inode, -+ struct bch_inode_unpacked *inode_u, -+ struct iattr *iattr) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct address_space *mapping = inode->v.i_mapping; -+ int ret; -+ -+ /* -+ * sync appends: -+ * -+ * this has to be done _before_ extending i_size: -+ */ -+ ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX); -+ if (ret) -+ return ret; -+ -+ truncate_setsize(&inode->v, iattr->ia_size); -+ setattr_copy(&inode->v, iattr); -+ -+ mutex_lock(&inode->ei_update_lock); -+ ret = bch2_write_inode_size(c, inode, inode->v.i_size, -+ ATTR_MTIME|ATTR_CTIME); -+ mutex_unlock(&inode->ei_update_lock); -+ -+ return ret; -+} -+ -+static int bch2_truncate_finish_fn(struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, -+ void *p) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ -+ bi->bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; -+ bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); -+ return 0; -+} -+ -+static int bch2_truncate_start_fn(struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, void *p) -+{ -+ u64 *new_i_size = p; -+ -+ bi->bi_flags |= BCH_INODE_I_SIZE_DIRTY; -+ bi->bi_size = *new_i_size; -+ return 0; -+} -+ -+int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct address_space *mapping = inode->v.i_mapping; -+ struct bch_inode_unpacked inode_u; -+ struct 
btree_trans trans; -+ struct btree_iter *iter; -+ u64 new_i_size = iattr->ia_size; -+ s64 i_sectors_delta = 0; -+ int ret = 0; -+ -+ inode_dio_wait(&inode->v); -+ bch2_pagecache_block_get(&inode->ei_pagecache_lock); -+ -+ /* -+ * fetch current on disk i_size: inode is locked, i_size can only -+ * increase underneath us: -+ */ -+ bch2_trans_init(&trans, c, 0, 0); -+ iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, 0); -+ ret = PTR_ERR_OR_ZERO(iter); -+ bch2_trans_exit(&trans); -+ -+ if (ret) -+ goto err; -+ -+ /* -+ * check this before next assertion; on filesystem error our normal -+ * invariants are a bit broken (truncate has to truncate the page cache -+ * before the inode). -+ */ -+ ret = bch2_journal_error(&c->journal); -+ if (ret) -+ goto err; -+ -+ BUG_ON(inode->v.i_size < inode_u.bi_size); -+ -+ if (iattr->ia_size > inode->v.i_size) { -+ ret = bch2_extend(inode, &inode_u, iattr); -+ goto err; -+ } -+ -+ ret = bch2_truncate_page(inode, iattr->ia_size); -+ if (unlikely(ret)) -+ goto err; -+ -+ /* -+ * When extending, we're going to write the new i_size to disk -+ * immediately so we need to flush anything above the current on disk -+ * i_size first: -+ * -+ * Also, when extending we need to flush the page that i_size currently -+ * straddles - if it's mapped to userspace, we need to ensure that -+ * userspace has to redirty it and call .mkwrite -> set_page_dirty -+ * again to allocate the part of the page that was extended. 
-+ */ -+ if (iattr->ia_size > inode_u.bi_size) -+ ret = filemap_write_and_wait_range(mapping, -+ inode_u.bi_size, -+ iattr->ia_size - 1); -+ else if (iattr->ia_size & (PAGE_SIZE - 1)) -+ ret = filemap_write_and_wait_range(mapping, -+ round_down(iattr->ia_size, PAGE_SIZE), -+ iattr->ia_size - 1); -+ if (ret) -+ goto err; -+ -+ mutex_lock(&inode->ei_update_lock); -+ ret = bch2_write_inode(c, inode, bch2_truncate_start_fn, -+ &new_i_size, 0); -+ mutex_unlock(&inode->ei_update_lock); -+ -+ if (unlikely(ret)) -+ goto err; -+ -+ truncate_setsize(&inode->v, iattr->ia_size); -+ -+ ret = bch2_fpunch(c, inode->v.i_ino, -+ round_up(iattr->ia_size, block_bytes(c)) >> 9, -+ U64_MAX, &inode->ei_journal_seq, &i_sectors_delta); -+ i_sectors_acct(c, inode, NULL, i_sectors_delta); -+ -+ if (unlikely(ret)) -+ goto err; -+ -+ setattr_copy(&inode->v, iattr); -+ -+ mutex_lock(&inode->ei_update_lock); -+ ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL, -+ ATTR_MTIME|ATTR_CTIME); -+ mutex_unlock(&inode->ei_update_lock); -+err: -+ bch2_pagecache_block_put(&inode->ei_pagecache_lock); -+ return ret; -+} -+ -+/* fallocate: */ -+ -+static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ u64 discard_start = round_up(offset, block_bytes(c)) >> 9; -+ u64 discard_end = round_down(offset + len, block_bytes(c)) >> 9; -+ int ret = 0; -+ -+ inode_lock(&inode->v); -+ inode_dio_wait(&inode->v); -+ bch2_pagecache_block_get(&inode->ei_pagecache_lock); -+ -+ ret = __bch2_truncate_page(inode, -+ offset >> PAGE_SHIFT, -+ offset, offset + len); -+ if (unlikely(ret)) -+ goto err; -+ -+ if (offset >> PAGE_SHIFT != -+ (offset + len) >> PAGE_SHIFT) { -+ ret = __bch2_truncate_page(inode, -+ (offset + len) >> PAGE_SHIFT, -+ offset, offset + len); -+ if (unlikely(ret)) -+ goto err; -+ } -+ -+ truncate_pagecache_range(&inode->v, offset, offset + len - 1); -+ -+ if (discard_start < discard_end) { -+ s64 i_sectors_delta = 0; 
-+ -+ ret = bch2_fpunch(c, inode->v.i_ino, -+ discard_start, discard_end, -+ &inode->ei_journal_seq, -+ &i_sectors_delta); -+ i_sectors_acct(c, inode, NULL, i_sectors_delta); -+ } -+err: -+ bch2_pagecache_block_put(&inode->ei_pagecache_lock); -+ inode_unlock(&inode->v); -+ -+ return ret; -+} -+ -+static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, -+ loff_t offset, loff_t len, -+ bool insert) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct address_space *mapping = inode->v.i_mapping; -+ struct bkey_on_stack copy; -+ struct btree_trans trans; -+ struct btree_iter *src, *dst; -+ loff_t shift, new_size; -+ u64 src_start; -+ int ret; -+ -+ if ((offset | len) & (block_bytes(c) - 1)) -+ return -EINVAL; -+ -+ bkey_on_stack_init(©); -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256); -+ -+ /* -+ * We need i_mutex to keep the page cache consistent with the extents -+ * btree, and the btree consistent with i_size - we don't need outside -+ * locking for the extents btree itself, because we're using linked -+ * iterators -+ */ -+ inode_lock(&inode->v); -+ inode_dio_wait(&inode->v); -+ bch2_pagecache_block_get(&inode->ei_pagecache_lock); -+ -+ if (insert) { -+ ret = -EFBIG; -+ if (inode->v.i_sb->s_maxbytes - inode->v.i_size < len) -+ goto err; -+ -+ ret = -EINVAL; -+ if (offset >= inode->v.i_size) -+ goto err; -+ -+ src_start = U64_MAX; -+ shift = len; -+ } else { -+ ret = -EINVAL; -+ if (offset + len >= inode->v.i_size) -+ goto err; -+ -+ src_start = offset + len; -+ shift = -len; -+ } -+ -+ new_size = inode->v.i_size + shift; -+ -+ ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); -+ if (ret) -+ goto err; -+ -+ if (insert) { -+ i_size_write(&inode->v, new_size); -+ mutex_lock(&inode->ei_update_lock); -+ ret = bch2_write_inode_size(c, inode, new_size, -+ ATTR_MTIME|ATTR_CTIME); -+ mutex_unlock(&inode->ei_update_lock); -+ } else { -+ s64 i_sectors_delta = 0; -+ -+ ret = bch2_fpunch(c, inode->v.i_ino, -+ offset >> 9, (offset + 
len) >> 9, -+ &inode->ei_journal_seq, -+ &i_sectors_delta); -+ i_sectors_acct(c, inode, NULL, i_sectors_delta); -+ -+ if (ret) -+ goto err; -+ } -+ -+ src = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, -+ POS(inode->v.i_ino, src_start >> 9), -+ BTREE_ITER_INTENT); -+ BUG_ON(IS_ERR_OR_NULL(src)); -+ -+ dst = bch2_trans_copy_iter(&trans, src); -+ BUG_ON(IS_ERR_OR_NULL(dst)); -+ -+ while (1) { -+ struct disk_reservation disk_res = -+ bch2_disk_reservation_init(c, 0); -+ struct bkey_i delete; -+ struct bkey_s_c k; -+ struct bpos next_pos; -+ struct bpos move_pos = POS(inode->v.i_ino, offset >> 9); -+ struct bpos atomic_end; -+ unsigned trigger_flags = 0; -+ -+ k = insert -+ ? bch2_btree_iter_peek_prev(src) -+ : bch2_btree_iter_peek(src); -+ if ((ret = bkey_err(k))) -+ goto bkey_err; -+ -+ if (!k.k || k.k->p.inode != inode->v.i_ino) -+ break; -+ -+ BUG_ON(bkey_cmp(src->pos, bkey_start_pos(k.k))); -+ -+ if (insert && -+ bkey_cmp(k.k->p, POS(inode->v.i_ino, offset >> 9)) <= 0) -+ break; -+reassemble: -+ bkey_on_stack_reassemble(©, c, k); -+ -+ if (insert && -+ bkey_cmp(bkey_start_pos(k.k), move_pos) < 0) -+ bch2_cut_front(move_pos, copy.k); -+ -+ copy.k->k.p.offset += shift >> 9; -+ bch2_btree_iter_set_pos(dst, bkey_start_pos(©.k->k)); -+ -+ ret = bch2_extent_atomic_end(dst, copy.k, &atomic_end); -+ if (ret) -+ goto bkey_err; -+ -+ if (bkey_cmp(atomic_end, copy.k->k.p)) { -+ if (insert) { -+ move_pos = atomic_end; -+ move_pos.offset -= shift >> 9; -+ goto reassemble; -+ } else { -+ bch2_cut_back(atomic_end, copy.k); -+ } -+ } -+ -+ bkey_init(&delete.k); -+ delete.k.p = copy.k->k.p; -+ delete.k.size = copy.k->k.size; -+ delete.k.p.offset -= shift >> 9; -+ -+ next_pos = insert ? 
bkey_start_pos(&delete.k) : delete.k.p; -+ -+ if (copy.k->k.size == k.k->size) { -+ /* -+ * If we're moving the entire extent, we can skip -+ * running triggers: -+ */ -+ trigger_flags |= BTREE_TRIGGER_NORUN; -+ } else { -+ /* We might end up splitting compressed extents: */ -+ unsigned nr_ptrs = -+ bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy.k)); -+ -+ ret = bch2_disk_reservation_get(c, &disk_res, -+ copy.k->k.size, nr_ptrs, -+ BCH_DISK_RESERVATION_NOFAIL); -+ BUG_ON(ret); -+ } -+ -+ bch2_btree_iter_set_pos(src, bkey_start_pos(&delete.k)); -+ -+ ret = bch2_trans_update(&trans, src, &delete, trigger_flags) ?: -+ bch2_trans_update(&trans, dst, copy.k, trigger_flags) ?: -+ bch2_trans_commit(&trans, &disk_res, -+ &inode->ei_journal_seq, -+ BTREE_INSERT_NOFAIL); -+ bch2_disk_reservation_put(c, &disk_res); -+bkey_err: -+ if (!ret) -+ bch2_btree_iter_set_pos(src, next_pos); -+ -+ if (ret == -EINTR) -+ ret = 0; -+ if (ret) -+ goto err; -+ -+ bch2_trans_cond_resched(&trans); -+ } -+ bch2_trans_unlock(&trans); -+ -+ if (!insert) { -+ i_size_write(&inode->v, new_size); -+ mutex_lock(&inode->ei_update_lock); -+ ret = bch2_write_inode_size(c, inode, new_size, -+ ATTR_MTIME|ATTR_CTIME); -+ mutex_unlock(&inode->ei_update_lock); -+ } -+err: -+ bch2_trans_exit(&trans); -+ bkey_on_stack_exit(©, c); -+ bch2_pagecache_block_put(&inode->ei_pagecache_lock); -+ inode_unlock(&inode->v); -+ return ret; -+} -+ -+static long bchfs_fallocate(struct bch_inode_info *inode, int mode, -+ loff_t offset, loff_t len) -+{ -+ struct address_space *mapping = inode->v.i_mapping; -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bpos end_pos; -+ loff_t end = offset + len; -+ loff_t block_start = round_down(offset, block_bytes(c)); -+ loff_t block_end = round_up(end, block_bytes(c)); -+ unsigned sectors; -+ unsigned replicas = io_opts(c, &inode->ei_inode).data_replicas; -+ int ret; -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ 
-+ inode_lock(&inode->v); -+ inode_dio_wait(&inode->v); -+ bch2_pagecache_block_get(&inode->ei_pagecache_lock); -+ -+ if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) { -+ ret = inode_newsize_ok(&inode->v, end); -+ if (ret) -+ goto err; -+ } -+ -+ if (mode & FALLOC_FL_ZERO_RANGE) { -+ ret = __bch2_truncate_page(inode, -+ offset >> PAGE_SHIFT, -+ offset, end); -+ -+ if (!ret && -+ offset >> PAGE_SHIFT != end >> PAGE_SHIFT) -+ ret = __bch2_truncate_page(inode, -+ end >> PAGE_SHIFT, -+ offset, end); -+ -+ if (unlikely(ret)) -+ goto err; -+ -+ truncate_pagecache_range(&inode->v, offset, end - 1); -+ } -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, -+ POS(inode->v.i_ino, block_start >> 9), -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); -+ end_pos = POS(inode->v.i_ino, block_end >> 9); -+ -+ while (bkey_cmp(iter->pos, end_pos) < 0) { -+ s64 i_sectors_delta = 0; -+ struct disk_reservation disk_res = { 0 }; -+ struct quota_res quota_res = { 0 }; -+ struct bkey_i_reservation reservation; -+ struct bkey_s_c k; -+ -+ bch2_trans_begin(&trans); -+ -+ k = bch2_btree_iter_peek_slot(iter); -+ if ((ret = bkey_err(k))) -+ goto bkey_err; -+ -+ /* already reserved */ -+ if (k.k->type == KEY_TYPE_reservation && -+ bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) { -+ bch2_btree_iter_next_slot(iter); -+ continue; -+ } -+ -+ if (bkey_extent_is_data(k.k) && -+ !(mode & FALLOC_FL_ZERO_RANGE)) { -+ bch2_btree_iter_next_slot(iter); -+ continue; -+ } -+ -+ bkey_reservation_init(&reservation.k_i); -+ reservation.k.type = KEY_TYPE_reservation; -+ reservation.k.p = k.k->p; -+ reservation.k.size = k.k->size; -+ -+ bch2_cut_front(iter->pos, &reservation.k_i); -+ bch2_cut_back(end_pos, &reservation.k_i); -+ -+ sectors = reservation.k.size; -+ reservation.v.nr_replicas = bch2_bkey_nr_ptrs_allocated(k); -+ -+ if (!bkey_extent_is_allocation(k.k)) { -+ ret = bch2_quota_reservation_add(c, inode, -+ "a_res, -+ sectors, true); -+ if (unlikely(ret)) -+ goto bkey_err; -+ } -+ -+ if 
(reservation.v.nr_replicas < replicas || -+ bch2_bkey_sectors_compressed(k)) { -+ ret = bch2_disk_reservation_get(c, &disk_res, sectors, -+ replicas, 0); -+ if (unlikely(ret)) -+ goto bkey_err; -+ -+ reservation.v.nr_replicas = disk_res.nr_replicas; -+ } -+ -+ ret = bch2_extent_update(&trans, iter, &reservation.k_i, -+ &disk_res, &inode->ei_journal_seq, -+ 0, &i_sectors_delta); -+ i_sectors_acct(c, inode, "a_res, i_sectors_delta); -+bkey_err: -+ bch2_quota_reservation_put(c, inode, "a_res); -+ bch2_disk_reservation_put(c, &disk_res); -+ if (ret == -EINTR) -+ ret = 0; -+ if (ret) -+ goto err; -+ } -+ -+ /* -+ * Do we need to extend the file? -+ * -+ * If we zeroed up to the end of the file, we dropped whatever writes -+ * were going to write out the current i_size, so we have to extend -+ * manually even if FL_KEEP_SIZE was set: -+ */ -+ if (end >= inode->v.i_size && -+ (!(mode & FALLOC_FL_KEEP_SIZE) || -+ (mode & FALLOC_FL_ZERO_RANGE))) { -+ struct btree_iter *inode_iter; -+ struct bch_inode_unpacked inode_u; -+ -+ do { -+ bch2_trans_begin(&trans); -+ inode_iter = bch2_inode_peek(&trans, &inode_u, -+ inode->v.i_ino, 0); -+ ret = PTR_ERR_OR_ZERO(inode_iter); -+ } while (ret == -EINTR); -+ -+ bch2_trans_unlock(&trans); -+ -+ if (ret) -+ goto err; -+ -+ /* -+ * Sync existing appends before extending i_size, -+ * as in bch2_extend(): -+ */ -+ ret = filemap_write_and_wait_range(mapping, -+ inode_u.bi_size, S64_MAX); -+ if (ret) -+ goto err; -+ -+ if (mode & FALLOC_FL_KEEP_SIZE) -+ end = inode->v.i_size; -+ else -+ i_size_write(&inode->v, end); -+ -+ mutex_lock(&inode->ei_update_lock); -+ ret = bch2_write_inode_size(c, inode, end, 0); -+ mutex_unlock(&inode->ei_update_lock); -+ } -+err: -+ bch2_trans_exit(&trans); -+ bch2_pagecache_block_put(&inode->ei_pagecache_lock); -+ inode_unlock(&inode->v); -+ return ret; -+} -+ -+long bch2_fallocate_dispatch(struct file *file, int mode, -+ loff_t offset, loff_t len) -+{ -+ struct bch_inode_info *inode = file_bch_inode(file); -+ 
struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ long ret; -+ -+ if (!percpu_ref_tryget(&c->writes)) -+ return -EROFS; -+ -+ if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) -+ ret = bchfs_fallocate(inode, mode, offset, len); -+ else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) -+ ret = bchfs_fpunch(inode, offset, len); -+ else if (mode == FALLOC_FL_INSERT_RANGE) -+ ret = bchfs_fcollapse_finsert(inode, offset, len, true); -+ else if (mode == FALLOC_FL_COLLAPSE_RANGE) -+ ret = bchfs_fcollapse_finsert(inode, offset, len, false); -+ else -+ ret = -EOPNOTSUPP; -+ -+ percpu_ref_put(&c->writes); -+ -+ return ret; -+} -+ -+static void mark_range_unallocated(struct bch_inode_info *inode, -+ loff_t start, loff_t end) -+{ -+ pgoff_t index = start >> PAGE_SHIFT; -+ pgoff_t end_index = (end - 1) >> PAGE_SHIFT; -+ struct pagevec pvec; -+ -+ pagevec_init(&pvec); -+ -+ do { -+ unsigned nr_pages, i, j; -+ -+ nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping, -+ &index, end_index); -+ if (nr_pages == 0) -+ break; -+ -+ for (i = 0; i < nr_pages; i++) { -+ struct page *page = pvec.pages[i]; -+ struct bch_page_state *s; -+ -+ lock_page(page); -+ s = bch2_page_state(page); -+ -+ if (s) { -+ spin_lock(&s->lock); -+ for (j = 0; j < PAGE_SECTORS; j++) -+ s->s[j].nr_replicas = 0; -+ spin_unlock(&s->lock); -+ } -+ -+ unlock_page(page); -+ } -+ pagevec_release(&pvec); -+ } while (index <= end_index); -+} -+ -+loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, -+ struct file *file_dst, loff_t pos_dst, -+ loff_t len, unsigned remap_flags) -+{ -+ struct bch_inode_info *src = file_bch_inode(file_src); -+ struct bch_inode_info *dst = file_bch_inode(file_dst); -+ struct bch_fs *c = src->v.i_sb->s_fs_info; -+ s64 i_sectors_delta = 0; -+ u64 aligned_len; -+ loff_t ret = 0; -+ -+ if (!c->opts.reflink) -+ return -EOPNOTSUPP; -+ -+ if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY)) -+ return -EINVAL; -+ -+ if (remap_flags & REMAP_FILE_DEDUP) -+ 
return -EOPNOTSUPP; -+ -+ if ((pos_src & (block_bytes(c) - 1)) || -+ (pos_dst & (block_bytes(c) - 1))) -+ return -EINVAL; -+ -+ if (src == dst && -+ abs(pos_src - pos_dst) < len) -+ return -EINVAL; -+ -+ bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); -+ -+ file_update_time(file_dst); -+ -+ inode_dio_wait(&src->v); -+ inode_dio_wait(&dst->v); -+ -+ ret = generic_remap_file_range_prep(file_src, pos_src, -+ file_dst, pos_dst, -+ &len, remap_flags); -+ if (ret < 0 || len == 0) -+ goto err; -+ -+ aligned_len = round_up((u64) len, block_bytes(c)); -+ -+ ret = write_invalidate_inode_pages_range(dst->v.i_mapping, -+ pos_dst, pos_dst + len - 1); -+ if (ret) -+ goto err; -+ -+ mark_range_unallocated(src, pos_src, pos_src + aligned_len); -+ -+ ret = bch2_remap_range(c, -+ POS(dst->v.i_ino, pos_dst >> 9), -+ POS(src->v.i_ino, pos_src >> 9), -+ aligned_len >> 9, -+ &dst->ei_journal_seq, -+ pos_dst + len, &i_sectors_delta); -+ if (ret < 0) -+ goto err; -+ -+ /* -+ * due to alignment, we might have remapped slightly more than requsted -+ */ -+ ret = min((u64) ret << 9, (u64) len); -+ -+ /* XXX get a quota reservation */ -+ i_sectors_acct(c, dst, NULL, i_sectors_delta); -+ -+ spin_lock(&dst->v.i_lock); -+ if (pos_dst + ret > dst->v.i_size) -+ i_size_write(&dst->v, pos_dst + ret); -+ spin_unlock(&dst->v.i_lock); -+err: -+ bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); -+ -+ return ret; -+} -+ -+/* fseek: */ -+ -+static int page_data_offset(struct page *page, unsigned offset) -+{ -+ struct bch_page_state *s = bch2_page_state(page); -+ unsigned i; -+ -+ if (s) -+ for (i = offset >> 9; i < PAGE_SECTORS; i++) -+ if (s->s[i].state >= SECTOR_DIRTY) -+ return i << 9; -+ -+ return -1; -+} -+ -+static loff_t bch2_seek_pagecache_data(struct inode *vinode, -+ loff_t start_offset, -+ loff_t end_offset) -+{ -+ struct address_space *mapping = vinode->i_mapping; -+ struct page *page; -+ pgoff_t start_index = start_offset >> PAGE_SHIFT; -+ pgoff_t end_index = 
end_offset >> PAGE_SHIFT; -+ pgoff_t index = start_index; -+ loff_t ret; -+ int offset; -+ -+ while (index <= end_index) { -+ if (find_get_pages_range(mapping, &index, end_index, 1, &page)) { -+ lock_page(page); -+ -+ offset = page_data_offset(page, -+ page->index == start_index -+ ? start_offset & (PAGE_SIZE - 1) -+ : 0); -+ if (offset >= 0) { -+ ret = clamp(((loff_t) page->index << PAGE_SHIFT) + -+ offset, -+ start_offset, end_offset); -+ unlock_page(page); -+ put_page(page); -+ return ret; -+ } -+ -+ unlock_page(page); -+ put_page(page); -+ } else { -+ break; -+ } -+ } -+ -+ return end_offset; -+} -+ -+static loff_t bch2_seek_data(struct file *file, u64 offset) -+{ -+ struct bch_inode_info *inode = file_bch_inode(file); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ u64 isize, next_data = MAX_LFS_FILESIZE; -+ int ret; -+ -+ isize = i_size_read(&inode->v); -+ if (offset >= isize) -+ return -ENXIO; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, -+ POS(inode->v.i_ino, offset >> 9), 0, k, ret) { -+ if (k.k->p.inode != inode->v.i_ino) { -+ break; -+ } else if (bkey_extent_is_data(k.k)) { -+ next_data = max(offset, bkey_start_offset(k.k) << 9); -+ break; -+ } else if (k.k->p.offset >> 9 > isize) -+ break; -+ } -+ -+ ret = bch2_trans_exit(&trans) ?: ret; -+ if (ret) -+ return ret; -+ -+ if (next_data > offset) -+ next_data = bch2_seek_pagecache_data(&inode->v, -+ offset, next_data); -+ -+ if (next_data >= isize) -+ return -ENXIO; -+ -+ return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); -+} -+ -+static int __page_hole_offset(struct page *page, unsigned offset) -+{ -+ struct bch_page_state *s = bch2_page_state(page); -+ unsigned i; -+ -+ if (!s) -+ return 0; -+ -+ for (i = offset >> 9; i < PAGE_SECTORS; i++) -+ if (s->s[i].state < SECTOR_DIRTY) -+ return i << 9; -+ -+ return -1; -+} -+ -+static loff_t page_hole_offset(struct 
address_space *mapping, loff_t offset) -+{ -+ pgoff_t index = offset >> PAGE_SHIFT; -+ struct page *page; -+ int pg_offset; -+ loff_t ret = -1; -+ -+ page = find_lock_entry(mapping, index); -+ if (!page || xa_is_value(page)) -+ return offset; -+ -+ pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1)); -+ if (pg_offset >= 0) -+ ret = ((loff_t) index << PAGE_SHIFT) + pg_offset; -+ -+ unlock_page(page); -+ -+ return ret; -+} -+ -+static loff_t bch2_seek_pagecache_hole(struct inode *vinode, -+ loff_t start_offset, -+ loff_t end_offset) -+{ -+ struct address_space *mapping = vinode->i_mapping; -+ loff_t offset = start_offset, hole; -+ -+ while (offset < end_offset) { -+ hole = page_hole_offset(mapping, offset); -+ if (hole >= 0 && hole <= end_offset) -+ return max(start_offset, hole); -+ -+ offset += PAGE_SIZE; -+ offset &= PAGE_MASK; -+ } -+ -+ return end_offset; -+} -+ -+static loff_t bch2_seek_hole(struct file *file, u64 offset) -+{ -+ struct bch_inode_info *inode = file_bch_inode(file); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ u64 isize, next_hole = MAX_LFS_FILESIZE; -+ int ret; -+ -+ isize = i_size_read(&inode->v); -+ if (offset >= isize) -+ return -ENXIO; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, -+ POS(inode->v.i_ino, offset >> 9), -+ BTREE_ITER_SLOTS, k, ret) { -+ if (k.k->p.inode != inode->v.i_ino) { -+ next_hole = bch2_seek_pagecache_hole(&inode->v, -+ offset, MAX_LFS_FILESIZE); -+ break; -+ } else if (!bkey_extent_is_data(k.k)) { -+ next_hole = bch2_seek_pagecache_hole(&inode->v, -+ max(offset, bkey_start_offset(k.k) << 9), -+ k.k->p.offset << 9); -+ -+ if (next_hole < k.k->p.offset << 9) -+ break; -+ } else { -+ offset = max(offset, bkey_start_offset(k.k) << 9); -+ } -+ } -+ -+ ret = bch2_trans_exit(&trans) ?: ret; -+ if (ret) -+ return ret; -+ -+ if (next_hole > isize) -+ next_hole = isize; -+ -+ return 
vfs_setpos(file, next_hole, MAX_LFS_FILESIZE); -+} -+ -+loff_t bch2_llseek(struct file *file, loff_t offset, int whence) -+{ -+ switch (whence) { -+ case SEEK_SET: -+ case SEEK_CUR: -+ case SEEK_END: -+ return generic_file_llseek(file, offset, whence); -+ case SEEK_DATA: -+ return bch2_seek_data(file, offset); -+ case SEEK_HOLE: -+ return bch2_seek_hole(file, offset); -+ } -+ -+ return -EINVAL; -+} -+ -+void bch2_fs_fsio_exit(struct bch_fs *c) -+{ -+ bioset_exit(&c->dio_write_bioset); -+ bioset_exit(&c->dio_read_bioset); -+ bioset_exit(&c->writepage_bioset); -+} -+ -+int bch2_fs_fsio_init(struct bch_fs *c) -+{ -+ int ret = 0; -+ -+ pr_verbose_init(c->opts, ""); -+ -+ if (bioset_init(&c->writepage_bioset, -+ 4, offsetof(struct bch_writepage_io, op.wbio.bio), -+ BIOSET_NEED_BVECS) || -+ bioset_init(&c->dio_read_bioset, -+ 4, offsetof(struct dio_read, rbio.bio), -+ BIOSET_NEED_BVECS) || -+ bioset_init(&c->dio_write_bioset, -+ 4, offsetof(struct dio_write, op.wbio.bio), -+ BIOSET_NEED_BVECS)) -+ ret = -ENOMEM; -+ -+ pr_verbose_init(c->opts, "ret %i", ret); -+ return ret; -+} -+ -+#endif /* NO_BCACHEFS_FS */ -diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h -new file mode 100644 -index 000000000000..7063556d289b ---- /dev/null -+++ b/fs/bcachefs/fs-io.h -@@ -0,0 +1,57 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_FS_IO_H -+#define _BCACHEFS_FS_IO_H -+ -+#ifndef NO_BCACHEFS_FS -+ -+#include "buckets.h" -+#include "io_types.h" -+ -+#include -+ -+struct quota_res; -+ -+int __must_check bch2_write_inode_size(struct bch_fs *, -+ struct bch_inode_info *, -+ loff_t, unsigned); -+ -+int bch2_writepage(struct page *, struct writeback_control *); -+int bch2_readpage(struct file *, struct page *); -+ -+int bch2_writepages(struct address_space *, struct writeback_control *); -+int bch2_readpages(struct file *, struct address_space *, -+ struct list_head *, unsigned); -+ -+int bch2_write_begin(struct file *, struct address_space *, loff_t, -+ unsigned, 
unsigned, struct page **, void **); -+int bch2_write_end(struct file *, struct address_space *, loff_t, -+ unsigned, unsigned, struct page *, void *); -+ -+ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *); -+ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); -+ -+int bch2_fsync(struct file *, loff_t, loff_t, int); -+ -+int bch2_truncate(struct bch_inode_info *, struct iattr *); -+long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t); -+ -+loff_t bch2_remap_file_range(struct file *, loff_t, struct file *, -+ loff_t, loff_t, unsigned); -+ -+loff_t bch2_llseek(struct file *, loff_t, int); -+ -+vm_fault_t bch2_page_fault(struct vm_fault *); -+vm_fault_t bch2_page_mkwrite(struct vm_fault *); -+void bch2_invalidatepage(struct page *, unsigned int, unsigned int); -+int bch2_releasepage(struct page *, gfp_t); -+int bch2_migrate_page(struct address_space *, struct page *, -+ struct page *, enum migrate_mode); -+ -+void bch2_fs_fsio_exit(struct bch_fs *); -+int bch2_fs_fsio_init(struct bch_fs *); -+#else -+static inline void bch2_fs_fsio_exit(struct bch_fs *c) {} -+static inline int bch2_fs_fsio_init(struct bch_fs *c) { return 0; } -+#endif -+ -+#endif /* _BCACHEFS_FS_IO_H */ -diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c -new file mode 100644 -index 000000000000..031e6d931171 ---- /dev/null -+++ b/fs/bcachefs/fs-ioctl.c -@@ -0,0 +1,308 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#ifndef NO_BCACHEFS_FS -+ -+#include "bcachefs.h" -+#include "chardev.h" -+#include "dirent.h" -+#include "fs.h" -+#include "fs-common.h" -+#include "fs-ioctl.h" -+#include "quota.h" -+ -+#include -+#include -+ -+#define FS_IOC_GOINGDOWN _IOR('X', 125, __u32) -+ -+struct flags_set { -+ unsigned mask; -+ unsigned flags; -+ -+ unsigned projid; -+}; -+ -+static int bch2_inode_flags_set(struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, -+ void *p) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ /* -+ * We're relying on btree locking here 
for exclusion with other ioctl -+ * calls - use the flags in the btree (@bi), not inode->i_flags: -+ */ -+ struct flags_set *s = p; -+ unsigned newflags = s->flags; -+ unsigned oldflags = bi->bi_flags & s->mask; -+ -+ if (((newflags ^ oldflags) & (BCH_INODE_APPEND|BCH_INODE_IMMUTABLE)) && -+ !capable(CAP_LINUX_IMMUTABLE)) -+ return -EPERM; -+ -+ if (!S_ISREG(bi->bi_mode) && -+ !S_ISDIR(bi->bi_mode) && -+ (newflags & (BCH_INODE_NODUMP|BCH_INODE_NOATIME)) != newflags) -+ return -EINVAL; -+ -+ bi->bi_flags &= ~s->mask; -+ bi->bi_flags |= newflags; -+ -+ bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v)); -+ return 0; -+} -+ -+static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg) -+{ -+ unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags); -+ -+ return put_user(flags, arg); -+} -+ -+static int bch2_ioc_setflags(struct bch_fs *c, -+ struct file *file, -+ struct bch_inode_info *inode, -+ void __user *arg) -+{ -+ struct flags_set s = { .mask = map_defined(bch_flags_to_uflags) }; -+ unsigned uflags; -+ int ret; -+ -+ if (get_user(uflags, (int __user *) arg)) -+ return -EFAULT; -+ -+ s.flags = map_flags_rev(bch_flags_to_uflags, uflags); -+ if (uflags) -+ return -EOPNOTSUPP; -+ -+ ret = mnt_want_write_file(file); -+ if (ret) -+ return ret; -+ -+ inode_lock(&inode->v); -+ if (!inode_owner_or_capable(&inode->v)) { -+ ret = -EACCES; -+ goto setflags_out; -+ } -+ -+ mutex_lock(&inode->ei_update_lock); -+ ret = bch2_write_inode(c, inode, bch2_inode_flags_set, &s, -+ ATTR_CTIME); -+ mutex_unlock(&inode->ei_update_lock); -+ -+setflags_out: -+ inode_unlock(&inode->v); -+ mnt_drop_write_file(file); -+ return ret; -+} -+ -+static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode, -+ struct fsxattr __user *arg) -+{ -+ struct fsxattr fa = { 0 }; -+ -+ fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags); -+ fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ]; -+ -+ return copy_to_user(arg, &fa, sizeof(fa)); -+} -+ 
-+static int fssetxattr_inode_update_fn(struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, -+ void *p) -+{ -+ struct flags_set *s = p; -+ -+ if (s->projid != bi->bi_project) { -+ bi->bi_fields_set |= 1U << Inode_opt_project; -+ bi->bi_project = s->projid; -+ } -+ -+ return bch2_inode_flags_set(inode, bi, p); -+} -+ -+static int bch2_ioc_fssetxattr(struct bch_fs *c, -+ struct file *file, -+ struct bch_inode_info *inode, -+ struct fsxattr __user *arg) -+{ -+ struct flags_set s = { .mask = map_defined(bch_flags_to_xflags) }; -+ struct fsxattr fa; -+ int ret; -+ -+ if (copy_from_user(&fa, arg, sizeof(fa))) -+ return -EFAULT; -+ -+ s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags); -+ if (fa.fsx_xflags) -+ return -EOPNOTSUPP; -+ -+ if (fa.fsx_projid >= U32_MAX) -+ return -EINVAL; -+ -+ s.projid = fa.fsx_projid + 1; -+ -+ ret = mnt_want_write_file(file); -+ if (ret) -+ return ret; -+ -+ inode_lock(&inode->v); -+ if (!inode_owner_or_capable(&inode->v)) { -+ ret = -EACCES; -+ goto err; -+ } -+ -+ mutex_lock(&inode->ei_update_lock); -+ ret = bch2_set_projid(c, inode, s.projid); -+ if (ret) -+ goto err_unlock; -+ -+ ret = bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, -+ ATTR_CTIME); -+err_unlock: -+ mutex_unlock(&inode->ei_update_lock); -+err: -+ inode_unlock(&inode->v); -+ mnt_drop_write_file(file); -+ return ret; -+} -+ -+static int bch2_reinherit_attrs_fn(struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, -+ void *p) -+{ -+ struct bch_inode_info *dir = p; -+ -+ return !bch2_reinherit_attrs(bi, &dir->ei_inode); -+} -+ -+static int bch2_ioc_reinherit_attrs(struct bch_fs *c, -+ struct file *file, -+ struct bch_inode_info *src, -+ const char __user *name) -+{ -+ struct bch_inode_info *dst; -+ struct inode *vinode = NULL; -+ char *kname = NULL; -+ struct qstr qstr; -+ int ret = 0; -+ u64 inum; -+ -+ kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL); -+ if (!kname) -+ return -ENOMEM; -+ -+ ret = strncpy_from_user(kname, name, 
BCH_NAME_MAX); -+ if (unlikely(ret < 0)) -+ goto err1; -+ -+ qstr.len = ret; -+ qstr.name = kname; -+ -+ ret = -ENOENT; -+ inum = bch2_dirent_lookup(c, src->v.i_ino, -+ &src->ei_str_hash, -+ &qstr); -+ if (!inum) -+ goto err1; -+ -+ vinode = bch2_vfs_inode_get(c, inum); -+ ret = PTR_ERR_OR_ZERO(vinode); -+ if (ret) -+ goto err1; -+ -+ dst = to_bch_ei(vinode); -+ -+ ret = mnt_want_write_file(file); -+ if (ret) -+ goto err2; -+ -+ bch2_lock_inodes(INODE_UPDATE_LOCK, src, dst); -+ -+ if (inode_attr_changing(src, dst, Inode_opt_project)) { -+ ret = bch2_fs_quota_transfer(c, dst, -+ src->ei_qid, -+ 1 << QTYP_PRJ, -+ KEY_TYPE_QUOTA_PREALLOC); -+ if (ret) -+ goto err3; -+ } -+ -+ ret = bch2_write_inode(c, dst, bch2_reinherit_attrs_fn, src, 0); -+err3: -+ bch2_unlock_inodes(INODE_UPDATE_LOCK, src, dst); -+ -+ /* return true if we did work */ -+ if (ret >= 0) -+ ret = !ret; -+ -+ mnt_drop_write_file(file); -+err2: -+ iput(vinode); -+err1: -+ kfree(kname); -+ -+ return ret; -+} -+ -+long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) -+{ -+ struct bch_inode_info *inode = file_bch_inode(file); -+ struct super_block *sb = inode->v.i_sb; -+ struct bch_fs *c = sb->s_fs_info; -+ -+ switch (cmd) { -+ case FS_IOC_GETFLAGS: -+ return bch2_ioc_getflags(inode, (int __user *) arg); -+ -+ case FS_IOC_SETFLAGS: -+ return bch2_ioc_setflags(c, file, inode, (int __user *) arg); -+ -+ case FS_IOC_FSGETXATTR: -+ return bch2_ioc_fsgetxattr(inode, (void __user *) arg); -+ case FS_IOC_FSSETXATTR: -+ return bch2_ioc_fssetxattr(c, file, inode, -+ (void __user *) arg); -+ -+ case BCHFS_IOC_REINHERIT_ATTRS: -+ return bch2_ioc_reinherit_attrs(c, file, inode, -+ (void __user *) arg); -+ -+ case FS_IOC_GETVERSION: -+ return -ENOTTY; -+ case FS_IOC_SETVERSION: -+ return -ENOTTY; -+ -+ case FS_IOC_GOINGDOWN: -+ if (!capable(CAP_SYS_ADMIN)) -+ return -EPERM; -+ -+ down_write(&sb->s_umount); -+ sb->s_flags |= SB_RDONLY; -+ if (bch2_fs_emergency_read_only(c)) -+ bch_err(c, "emergency 
read only due to ioctl"); -+ up_write(&sb->s_umount); -+ return 0; -+ -+ default: -+ return bch2_fs_ioctl(c, cmd, (void __user *) arg); -+ } -+} -+ -+#ifdef CONFIG_COMPAT -+long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg) -+{ -+ /* These are just misnamed, they actually get/put from/to user an int */ -+ switch (cmd) { -+ case FS_IOC_GETFLAGS: -+ cmd = FS_IOC_GETFLAGS; -+ break; -+ case FS_IOC32_SETFLAGS: -+ cmd = FS_IOC_SETFLAGS; -+ break; -+ default: -+ return -ENOIOCTLCMD; -+ } -+ return bch2_fs_file_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); -+} -+#endif -+ -+#endif /* NO_BCACHEFS_FS */ -diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h -new file mode 100644 -index 000000000000..f201980ef2c3 ---- /dev/null -+++ b/fs/bcachefs/fs-ioctl.h -@@ -0,0 +1,81 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_FS_IOCTL_H -+#define _BCACHEFS_FS_IOCTL_H -+ -+/* Inode flags: */ -+ -+/* bcachefs inode flags -> vfs inode flags: */ -+static const unsigned bch_flags_to_vfs[] = { -+ [__BCH_INODE_SYNC] = S_SYNC, -+ [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE, -+ [__BCH_INODE_APPEND] = S_APPEND, -+ [__BCH_INODE_NOATIME] = S_NOATIME, -+}; -+ -+/* bcachefs inode flags -> FS_IOC_GETFLAGS: */ -+static const unsigned bch_flags_to_uflags[] = { -+ [__BCH_INODE_SYNC] = FS_SYNC_FL, -+ [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL, -+ [__BCH_INODE_APPEND] = FS_APPEND_FL, -+ [__BCH_INODE_NODUMP] = FS_NODUMP_FL, -+ [__BCH_INODE_NOATIME] = FS_NOATIME_FL, -+}; -+ -+/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ -+static const unsigned bch_flags_to_xflags[] = { -+ [__BCH_INODE_SYNC] = FS_XFLAG_SYNC, -+ [__BCH_INODE_IMMUTABLE] = FS_XFLAG_IMMUTABLE, -+ [__BCH_INODE_APPEND] = FS_XFLAG_APPEND, -+ [__BCH_INODE_NODUMP] = FS_XFLAG_NODUMP, -+ [__BCH_INODE_NOATIME] = FS_XFLAG_NOATIME, -+ //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT; -+}; -+ -+#define set_flags(_map, _in, _out) \ -+do { \ -+ unsigned _i; \ -+ \ -+ for (_i = 0; _i < 
ARRAY_SIZE(_map); _i++) \ -+ if ((_in) & (1 << _i)) \ -+ (_out) |= _map[_i]; \ -+ else \ -+ (_out) &= ~_map[_i]; \ -+} while (0) -+ -+#define map_flags(_map, _in) \ -+({ \ -+ unsigned _out = 0; \ -+ \ -+ set_flags(_map, _in, _out); \ -+ _out; \ -+}) -+ -+#define map_flags_rev(_map, _in) \ -+({ \ -+ unsigned _i, _out = 0; \ -+ \ -+ for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ -+ if ((_in) & _map[_i]) { \ -+ (_out) |= 1 << _i; \ -+ (_in) &= ~_map[_i]; \ -+ } \ -+ (_out); \ -+}) -+ -+#define map_defined(_map) \ -+({ \ -+ unsigned _in = ~0; \ -+ \ -+ map_flags_rev(_map, _in); \ -+}) -+ -+/* Set VFS inode flags from bcachefs inode: */ -+static inline void bch2_inode_flags_to_vfs(struct bch_inode_info *inode) -+{ -+ set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags); -+} -+ -+long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long); -+long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long); -+ -+#endif /* _BCACHEFS_FS_IOCTL_H */ -diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c -new file mode 100644 -index 000000000000..a47923d67f7a ---- /dev/null -+++ b/fs/bcachefs/fs.c -@@ -0,0 +1,1605 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#ifndef NO_BCACHEFS_FS -+ -+#include "bcachefs.h" -+#include "acl.h" -+#include "bkey_on_stack.h" -+#include "btree_update.h" -+#include "buckets.h" -+#include "chardev.h" -+#include "dirent.h" -+#include "extents.h" -+#include "fs.h" -+#include "fs-common.h" -+#include "fs-io.h" -+#include "fs-ioctl.h" -+#include "fsck.h" -+#include "inode.h" -+#include "io.h" -+#include "journal.h" -+#include "keylist.h" -+#include "quota.h" -+#include "super.h" -+#include "xattr.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+static struct kmem_cache *bch2_inode_cache; -+ -+static void bch2_vfs_inode_init(struct bch_fs *, -+ struct bch_inode_info *, -+ struct bch_inode_unpacked *); -+ -+static void journal_seq_copy(struct bch_inode_info *dst, -+ u64 journal_seq) -+{ -+ u64 
old, v = READ_ONCE(dst->ei_journal_seq); -+ -+ do { -+ old = v; -+ -+ if (old >= journal_seq) -+ break; -+ } while ((v = cmpxchg(&dst->ei_journal_seq, old, journal_seq)) != old); -+} -+ -+static void __pagecache_lock_put(struct pagecache_lock *lock, long i) -+{ -+ BUG_ON(atomic_long_read(&lock->v) == 0); -+ -+ if (atomic_long_sub_return_release(i, &lock->v) == 0) -+ wake_up_all(&lock->wait); -+} -+ -+static bool __pagecache_lock_tryget(struct pagecache_lock *lock, long i) -+{ -+ long v = atomic_long_read(&lock->v), old; -+ -+ do { -+ old = v; -+ -+ if (i > 0 ? v < 0 : v > 0) -+ return false; -+ } while ((v = atomic_long_cmpxchg_acquire(&lock->v, -+ old, old + i)) != old); -+ return true; -+} -+ -+static void __pagecache_lock_get(struct pagecache_lock *lock, long i) -+{ -+ wait_event(lock->wait, __pagecache_lock_tryget(lock, i)); -+} -+ -+void bch2_pagecache_add_put(struct pagecache_lock *lock) -+{ -+ __pagecache_lock_put(lock, 1); -+} -+ -+void bch2_pagecache_add_get(struct pagecache_lock *lock) -+{ -+ __pagecache_lock_get(lock, 1); -+} -+ -+void bch2_pagecache_block_put(struct pagecache_lock *lock) -+{ -+ __pagecache_lock_put(lock, -1); -+} -+ -+void bch2_pagecache_block_get(struct pagecache_lock *lock) -+{ -+ __pagecache_lock_get(lock, -1); -+} -+ -+void bch2_inode_update_after_write(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, -+ unsigned fields) -+{ -+ set_nlink(&inode->v, bch2_inode_nlink_get(bi)); -+ i_uid_write(&inode->v, bi->bi_uid); -+ i_gid_write(&inode->v, bi->bi_gid); -+ inode->v.i_mode = bi->bi_mode; -+ -+ if (fields & ATTR_ATIME) -+ inode->v.i_atime = bch2_time_to_timespec(c, bi->bi_atime); -+ if (fields & ATTR_MTIME) -+ inode->v.i_mtime = bch2_time_to_timespec(c, bi->bi_mtime); -+ if (fields & ATTR_CTIME) -+ inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime); -+ -+ inode->ei_inode = *bi; -+ -+ bch2_inode_flags_to_vfs(inode); -+} -+ -+int __must_check bch2_write_inode(struct bch_fs *c, -+ struct 
bch_inode_info *inode, -+ inode_set_fn set, -+ void *p, unsigned fields) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bch_inode_unpacked inode_u; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ -+ iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, -+ BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(iter) ?: -+ (set ? set(inode, &inode_u, p) : 0) ?: -+ bch2_inode_write(&trans, iter, &inode_u) ?: -+ bch2_trans_commit(&trans, NULL, -+ &inode->ei_journal_seq, -+ BTREE_INSERT_NOUNLOCK| -+ BTREE_INSERT_NOFAIL); -+ -+ /* -+ * the btree node lock protects inode->ei_inode, not ei_update_lock; -+ * this is important for inode updates via bchfs_write_index_update -+ */ -+ if (!ret) -+ bch2_inode_update_after_write(c, inode, &inode_u, fields); -+ -+ bch2_trans_iter_put(&trans, iter); -+ -+ if (ret == -EINTR) -+ goto retry; -+ -+ bch2_trans_exit(&trans); -+ return ret < 0 ? ret : 0; -+} -+ -+int bch2_fs_quota_transfer(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct bch_qid new_qid, -+ unsigned qtypes, -+ enum quota_acct_mode mode) -+{ -+ unsigned i; -+ int ret; -+ -+ qtypes &= enabled_qtypes(c); -+ -+ for (i = 0; i < QTYP_NR; i++) -+ if (new_qid.q[i] == inode->ei_qid.q[i]) -+ qtypes &= ~(1U << i); -+ -+ if (!qtypes) -+ return 0; -+ -+ mutex_lock(&inode->ei_quota_lock); -+ -+ ret = bch2_quota_transfer(c, qtypes, new_qid, -+ inode->ei_qid, -+ inode->v.i_blocks + -+ inode->ei_quota_reserved, -+ mode); -+ if (!ret) -+ for (i = 0; i < QTYP_NR; i++) -+ if (qtypes & (1 << i)) -+ inode->ei_qid.q[i] = new_qid.q[i]; -+ -+ mutex_unlock(&inode->ei_quota_lock); -+ -+ return ret; -+} -+ -+struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum) -+{ -+ struct bch_inode_unpacked inode_u; -+ struct bch_inode_info *inode; -+ int ret; -+ -+ inode = to_bch_ei(iget_locked(c->vfs_sb, inum)); -+ if (unlikely(!inode)) -+ return ERR_PTR(-ENOMEM); -+ if (!(inode->v.i_state & I_NEW)) -+ return &inode->v; -+ -+ ret = 
bch2_inode_find_by_inum(c, inum, &inode_u); -+ if (ret) { -+ iget_failed(&inode->v); -+ return ERR_PTR(ret); -+ } -+ -+ bch2_vfs_inode_init(c, inode, &inode_u); -+ -+ inode->ei_journal_seq = bch2_inode_journal_seq(&c->journal, inum); -+ -+ unlock_new_inode(&inode->v); -+ -+ return &inode->v; -+} -+ -+static struct bch_inode_info * -+__bch2_create(struct bch_inode_info *dir, struct dentry *dentry, -+ umode_t mode, dev_t rdev, bool tmpfile) -+{ -+ struct bch_fs *c = dir->v.i_sb->s_fs_info; -+ struct user_namespace *ns = dir->v.i_sb->s_user_ns; -+ struct btree_trans trans; -+ struct bch_inode_unpacked dir_u; -+ struct bch_inode_info *inode, *old; -+ struct bch_inode_unpacked inode_u; -+ struct posix_acl *default_acl = NULL, *acl = NULL; -+ u64 journal_seq = 0; -+ int ret; -+ -+ /* -+ * preallocate acls + vfs inode before btree transaction, so that -+ * nothing can fail after the transaction succeeds: -+ */ -+#ifdef CONFIG_BCACHEFS_POSIX_ACL -+ ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl); -+ if (ret) -+ return ERR_PTR(ret); -+#endif -+ inode = to_bch_ei(new_inode(c->vfs_sb)); -+ if (unlikely(!inode)) { -+ inode = ERR_PTR(-ENOMEM); -+ goto err; -+ } -+ -+ bch2_inode_init_early(c, &inode_u); -+ -+ if (!tmpfile) -+ mutex_lock(&dir->ei_update_lock); -+ -+ bch2_trans_init(&trans, c, 8, 1024); -+retry: -+ bch2_trans_begin(&trans); -+ -+ ret = bch2_create_trans(&trans, dir->v.i_ino, &dir_u, &inode_u, -+ !tmpfile ? 
&dentry->d_name : NULL, -+ from_kuid(ns, current_fsuid()), -+ from_kgid(ns, current_fsgid()), -+ mode, rdev, -+ default_acl, acl) ?: -+ bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, -+ KEY_TYPE_QUOTA_PREALLOC); -+ if (unlikely(ret)) -+ goto err_before_quota; -+ -+ ret = bch2_trans_commit(&trans, NULL, &journal_seq, -+ BTREE_INSERT_NOUNLOCK); -+ if (unlikely(ret)) { -+ bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, -+ KEY_TYPE_QUOTA_WARN); -+err_before_quota: -+ if (ret == -EINTR) -+ goto retry; -+ goto err_trans; -+ } -+ -+ if (!tmpfile) { -+ bch2_inode_update_after_write(c, dir, &dir_u, -+ ATTR_MTIME|ATTR_CTIME); -+ journal_seq_copy(dir, journal_seq); -+ mutex_unlock(&dir->ei_update_lock); -+ } -+ -+ bch2_vfs_inode_init(c, inode, &inode_u); -+ journal_seq_copy(inode, journal_seq); -+ -+ set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); -+ set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); -+ -+ /* -+ * we must insert the new inode into the inode cache before calling -+ * bch2_trans_exit() and dropping locks, else we could race with another -+ * thread pulling the inode in and modifying it: -+ */ -+ -+ old = to_bch_ei(insert_inode_locked2(&inode->v)); -+ if (unlikely(old)) { -+ /* -+ * We raced, another process pulled the new inode into cache -+ * before us: -+ */ -+ journal_seq_copy(old, journal_seq); -+ make_bad_inode(&inode->v); -+ iput(&inode->v); -+ -+ inode = old; -+ } else { -+ /* -+ * we really don't want insert_inode_locked2() to be setting -+ * I_NEW... 
-+ */ -+ unlock_new_inode(&inode->v); -+ } -+ -+ bch2_trans_exit(&trans); -+err: -+ posix_acl_release(default_acl); -+ posix_acl_release(acl); -+ return inode; -+err_trans: -+ if (!tmpfile) -+ mutex_unlock(&dir->ei_update_lock); -+ -+ bch2_trans_exit(&trans); -+ make_bad_inode(&inode->v); -+ iput(&inode->v); -+ inode = ERR_PTR(ret); -+ goto err; -+} -+ -+/* methods */ -+ -+static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, -+ unsigned int flags) -+{ -+ struct bch_fs *c = vdir->i_sb->s_fs_info; -+ struct bch_inode_info *dir = to_bch_ei(vdir); -+ struct inode *vinode = NULL; -+ u64 inum; -+ -+ inum = bch2_dirent_lookup(c, dir->v.i_ino, -+ &dir->ei_str_hash, -+ &dentry->d_name); -+ -+ if (inum) -+ vinode = bch2_vfs_inode_get(c, inum); -+ -+ return d_splice_alias(vinode, dentry); -+} -+ -+static int bch2_mknod(struct inode *vdir, struct dentry *dentry, -+ umode_t mode, dev_t rdev) -+{ -+ struct bch_inode_info *inode = -+ __bch2_create(to_bch_ei(vdir), dentry, mode, rdev, false); -+ -+ if (IS_ERR(inode)) -+ return PTR_ERR(inode); -+ -+ d_instantiate(dentry, &inode->v); -+ return 0; -+} -+ -+static int bch2_create(struct inode *vdir, struct dentry *dentry, -+ umode_t mode, bool excl) -+{ -+ return bch2_mknod(vdir, dentry, mode|S_IFREG, 0); -+} -+ -+static int __bch2_link(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct bch_inode_info *dir, -+ struct dentry *dentry) -+{ -+ struct btree_trans trans; -+ struct bch_inode_unpacked dir_u, inode_u; -+ int ret; -+ -+ mutex_lock(&inode->ei_update_lock); -+ bch2_trans_init(&trans, c, 4, 1024); -+ -+ do { -+ bch2_trans_begin(&trans); -+ ret = bch2_link_trans(&trans, -+ dir->v.i_ino, -+ inode->v.i_ino, &dir_u, &inode_u, -+ &dentry->d_name) ?: -+ bch2_trans_commit(&trans, NULL, -+ &inode->ei_journal_seq, -+ BTREE_INSERT_NOUNLOCK); -+ } while (ret == -EINTR); -+ -+ if (likely(!ret)) { -+ BUG_ON(inode_u.bi_inum != inode->v.i_ino); -+ -+ journal_seq_copy(inode, dir->ei_journal_seq); -+ 
bch2_inode_update_after_write(c, dir, &dir_u, -+ ATTR_MTIME|ATTR_CTIME); -+ bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME); -+ } -+ -+ bch2_trans_exit(&trans); -+ mutex_unlock(&inode->ei_update_lock); -+ return ret; -+} -+ -+static int bch2_link(struct dentry *old_dentry, struct inode *vdir, -+ struct dentry *dentry) -+{ -+ struct bch_fs *c = vdir->i_sb->s_fs_info; -+ struct bch_inode_info *dir = to_bch_ei(vdir); -+ struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode); -+ int ret; -+ -+ lockdep_assert_held(&inode->v.i_rwsem); -+ -+ ret = __bch2_link(c, inode, dir, dentry); -+ if (unlikely(ret)) -+ return ret; -+ -+ ihold(&inode->v); -+ d_instantiate(dentry, &inode->v); -+ return 0; -+} -+ -+static int bch2_unlink(struct inode *vdir, struct dentry *dentry) -+{ -+ struct bch_fs *c = vdir->i_sb->s_fs_info; -+ struct bch_inode_info *dir = to_bch_ei(vdir); -+ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); -+ struct bch_inode_unpacked dir_u, inode_u; -+ struct btree_trans trans; -+ int ret; -+ -+ bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); -+ bch2_trans_init(&trans, c, 4, 1024); -+ -+ do { -+ bch2_trans_begin(&trans); -+ -+ ret = bch2_unlink_trans(&trans, -+ dir->v.i_ino, &dir_u, -+ &inode_u, &dentry->d_name) ?: -+ bch2_trans_commit(&trans, NULL, -+ &dir->ei_journal_seq, -+ BTREE_INSERT_NOUNLOCK| -+ BTREE_INSERT_NOFAIL); -+ } while (ret == -EINTR); -+ -+ if (likely(!ret)) { -+ BUG_ON(inode_u.bi_inum != inode->v.i_ino); -+ -+ journal_seq_copy(inode, dir->ei_journal_seq); -+ bch2_inode_update_after_write(c, dir, &dir_u, -+ ATTR_MTIME|ATTR_CTIME); -+ bch2_inode_update_after_write(c, inode, &inode_u, -+ ATTR_MTIME); -+ } -+ -+ bch2_trans_exit(&trans); -+ bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); -+ -+ return ret; -+} -+ -+static int bch2_symlink(struct inode *vdir, struct dentry *dentry, -+ const char *symname) -+{ -+ struct bch_fs *c = vdir->i_sb->s_fs_info; -+ struct bch_inode_info *dir = to_bch_ei(vdir), *inode; -+ int 
ret; -+ -+ inode = __bch2_create(dir, dentry, S_IFLNK|S_IRWXUGO, 0, true); -+ if (unlikely(IS_ERR(inode))) -+ return PTR_ERR(inode); -+ -+ inode_lock(&inode->v); -+ ret = page_symlink(&inode->v, symname, strlen(symname) + 1); -+ inode_unlock(&inode->v); -+ -+ if (unlikely(ret)) -+ goto err; -+ -+ ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX); -+ if (unlikely(ret)) -+ goto err; -+ -+ journal_seq_copy(dir, inode->ei_journal_seq); -+ -+ ret = __bch2_link(c, inode, dir, dentry); -+ if (unlikely(ret)) -+ goto err; -+ -+ d_instantiate(dentry, &inode->v); -+ return 0; -+err: -+ iput(&inode->v); -+ return ret; -+} -+ -+static int bch2_mkdir(struct inode *vdir, struct dentry *dentry, umode_t mode) -+{ -+ return bch2_mknod(vdir, dentry, mode|S_IFDIR, 0); -+} -+ -+static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry, -+ struct inode *dst_vdir, struct dentry *dst_dentry, -+ unsigned flags) -+{ -+ struct bch_fs *c = src_vdir->i_sb->s_fs_info; -+ struct bch_inode_info *src_dir = to_bch_ei(src_vdir); -+ struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir); -+ struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode); -+ struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode); -+ struct bch_inode_unpacked dst_dir_u, src_dir_u; -+ struct bch_inode_unpacked src_inode_u, dst_inode_u; -+ struct btree_trans trans; -+ enum bch_rename_mode mode = flags & RENAME_EXCHANGE -+ ? BCH_RENAME_EXCHANGE -+ : dst_dentry->d_inode -+ ? 
BCH_RENAME_OVERWRITE : BCH_RENAME; -+ u64 journal_seq = 0; -+ int ret; -+ -+ if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE)) -+ return -EINVAL; -+ -+ if (mode == BCH_RENAME_OVERWRITE) { -+ ret = filemap_write_and_wait_range(src_inode->v.i_mapping, -+ 0, LLONG_MAX); -+ if (ret) -+ return ret; -+ } -+ -+ bch2_trans_init(&trans, c, 8, 2048); -+ -+ bch2_lock_inodes(INODE_UPDATE_LOCK, -+ src_dir, -+ dst_dir, -+ src_inode, -+ dst_inode); -+ -+ if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) { -+ ret = bch2_fs_quota_transfer(c, src_inode, -+ dst_dir->ei_qid, -+ 1 << QTYP_PRJ, -+ KEY_TYPE_QUOTA_PREALLOC); -+ if (ret) -+ goto err; -+ } -+ -+ if (mode == BCH_RENAME_EXCHANGE && -+ inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) { -+ ret = bch2_fs_quota_transfer(c, dst_inode, -+ src_dir->ei_qid, -+ 1 << QTYP_PRJ, -+ KEY_TYPE_QUOTA_PREALLOC); -+ if (ret) -+ goto err; -+ } -+ -+retry: -+ bch2_trans_begin(&trans); -+ ret = bch2_rename_trans(&trans, -+ src_dir->v.i_ino, &src_dir_u, -+ dst_dir->v.i_ino, &dst_dir_u, -+ &src_inode_u, -+ &dst_inode_u, -+ &src_dentry->d_name, -+ &dst_dentry->d_name, -+ mode) ?: -+ bch2_trans_commit(&trans, NULL, -+ &journal_seq, -+ BTREE_INSERT_NOUNLOCK); -+ if (ret == -EINTR) -+ goto retry; -+ if (unlikely(ret)) -+ goto err; -+ -+ BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum); -+ BUG_ON(dst_inode && -+ dst_inode->v.i_ino != dst_inode_u.bi_inum); -+ -+ bch2_inode_update_after_write(c, src_dir, &src_dir_u, -+ ATTR_MTIME|ATTR_CTIME); -+ journal_seq_copy(src_dir, journal_seq); -+ -+ if (src_dir != dst_dir) { -+ bch2_inode_update_after_write(c, dst_dir, &dst_dir_u, -+ ATTR_MTIME|ATTR_CTIME); -+ journal_seq_copy(dst_dir, journal_seq); -+ } -+ -+ bch2_inode_update_after_write(c, src_inode, &src_inode_u, -+ ATTR_CTIME); -+ journal_seq_copy(src_inode, journal_seq); -+ -+ if (dst_inode) { -+ bch2_inode_update_after_write(c, dst_inode, &dst_inode_u, -+ ATTR_CTIME); -+ journal_seq_copy(dst_inode, journal_seq); -+ } -+err: -+ 
bch2_trans_exit(&trans); -+ -+ bch2_fs_quota_transfer(c, src_inode, -+ bch_qid(&src_inode->ei_inode), -+ 1 << QTYP_PRJ, -+ KEY_TYPE_QUOTA_NOCHECK); -+ if (dst_inode) -+ bch2_fs_quota_transfer(c, dst_inode, -+ bch_qid(&dst_inode->ei_inode), -+ 1 << QTYP_PRJ, -+ KEY_TYPE_QUOTA_NOCHECK); -+ -+ bch2_unlock_inodes(INODE_UPDATE_LOCK, -+ src_dir, -+ dst_dir, -+ src_inode, -+ dst_inode); -+ -+ return ret; -+} -+ -+void bch2_setattr_copy(struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, -+ struct iattr *attr) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ unsigned int ia_valid = attr->ia_valid; -+ -+ if (ia_valid & ATTR_UID) -+ bi->bi_uid = from_kuid(c->vfs_sb->s_user_ns, attr->ia_uid); -+ if (ia_valid & ATTR_GID) -+ bi->bi_gid = from_kgid(c->vfs_sb->s_user_ns, attr->ia_gid); -+ -+ if (ia_valid & ATTR_ATIME) -+ bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime); -+ if (ia_valid & ATTR_MTIME) -+ bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime); -+ if (ia_valid & ATTR_CTIME) -+ bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime); -+ -+ if (ia_valid & ATTR_MODE) { -+ umode_t mode = attr->ia_mode; -+ kgid_t gid = ia_valid & ATTR_GID -+ ? 
attr->ia_gid -+ : inode->v.i_gid; -+ -+ if (!in_group_p(gid) && -+ !capable_wrt_inode_uidgid(&inode->v, CAP_FSETID)) -+ mode &= ~S_ISGID; -+ bi->bi_mode = mode; -+ } -+} -+ -+static int bch2_setattr_nonsize(struct bch_inode_info *inode, -+ struct iattr *attr) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch_qid qid; -+ struct btree_trans trans; -+ struct btree_iter *inode_iter; -+ struct bch_inode_unpacked inode_u; -+ struct posix_acl *acl = NULL; -+ int ret; -+ -+ mutex_lock(&inode->ei_update_lock); -+ -+ qid = inode->ei_qid; -+ -+ if (attr->ia_valid & ATTR_UID) -+ qid.q[QTYP_USR] = from_kuid(&init_user_ns, attr->ia_uid); -+ -+ if (attr->ia_valid & ATTR_GID) -+ qid.q[QTYP_GRP] = from_kgid(&init_user_ns, attr->ia_gid); -+ -+ ret = bch2_fs_quota_transfer(c, inode, qid, ~0, -+ KEY_TYPE_QUOTA_PREALLOC); -+ if (ret) -+ goto err; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ kfree(acl); -+ acl = NULL; -+ -+ inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, -+ BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(inode_iter); -+ if (ret) -+ goto btree_err; -+ -+ bch2_setattr_copy(inode, &inode_u, attr); -+ -+ if (attr->ia_valid & ATTR_MODE) { -+ ret = bch2_acl_chmod(&trans, inode, inode_u.bi_mode, &acl); -+ if (ret) -+ goto btree_err; -+ } -+ -+ ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?: -+ bch2_trans_commit(&trans, NULL, -+ &inode->ei_journal_seq, -+ BTREE_INSERT_NOUNLOCK| -+ BTREE_INSERT_NOFAIL); -+btree_err: -+ if (ret == -EINTR) -+ goto retry; -+ if (unlikely(ret)) -+ goto err_trans; -+ -+ bch2_inode_update_after_write(c, inode, &inode_u, attr->ia_valid); -+ -+ if (acl) -+ set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); -+err_trans: -+ bch2_trans_exit(&trans); -+err: -+ mutex_unlock(&inode->ei_update_lock); -+ -+ return ret; -+} -+ -+static int bch2_getattr(const struct path *path, struct kstat *stat, -+ u32 request_mask, unsigned query_flags) -+{ -+ struct bch_inode_info *inode = 
to_bch_ei(d_inode(path->dentry)); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ -+ stat->dev = inode->v.i_sb->s_dev; -+ stat->ino = inode->v.i_ino; -+ stat->mode = inode->v.i_mode; -+ stat->nlink = inode->v.i_nlink; -+ stat->uid = inode->v.i_uid; -+ stat->gid = inode->v.i_gid; -+ stat->rdev = inode->v.i_rdev; -+ stat->size = i_size_read(&inode->v); -+ stat->atime = inode->v.i_atime; -+ stat->mtime = inode->v.i_mtime; -+ stat->ctime = inode->v.i_ctime; -+ stat->blksize = block_bytes(c); -+ stat->blocks = inode->v.i_blocks; -+ -+ if (request_mask & STATX_BTIME) { -+ stat->result_mask |= STATX_BTIME; -+ stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime); -+ } -+ -+ if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE) -+ stat->attributes |= STATX_ATTR_IMMUTABLE; -+ stat->attributes_mask |= STATX_ATTR_IMMUTABLE; -+ -+ if (inode->ei_inode.bi_flags & BCH_INODE_APPEND) -+ stat->attributes |= STATX_ATTR_APPEND; -+ stat->attributes_mask |= STATX_ATTR_APPEND; -+ -+ if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP) -+ stat->attributes |= STATX_ATTR_NODUMP; -+ stat->attributes_mask |= STATX_ATTR_NODUMP; -+ -+ return 0; -+} -+ -+static int bch2_setattr(struct dentry *dentry, struct iattr *iattr) -+{ -+ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); -+ int ret; -+ -+ lockdep_assert_held(&inode->v.i_rwsem); -+ -+ ret = setattr_prepare(dentry, iattr); -+ if (ret) -+ return ret; -+ -+ return iattr->ia_valid & ATTR_SIZE -+ ? 
bch2_truncate(inode, iattr) -+ : bch2_setattr_nonsize(inode, iattr); -+} -+ -+static int bch2_tmpfile(struct inode *vdir, struct dentry *dentry, umode_t mode) -+{ -+ struct bch_inode_info *inode = -+ __bch2_create(to_bch_ei(vdir), dentry, mode, 0, true); -+ -+ if (IS_ERR(inode)) -+ return PTR_ERR(inode); -+ -+ d_mark_tmpfile(dentry, &inode->v); -+ d_instantiate(dentry, &inode->v); -+ return 0; -+} -+ -+static int bch2_fill_extent(struct bch_fs *c, -+ struct fiemap_extent_info *info, -+ struct bkey_s_c k, unsigned flags) -+{ -+ if (bkey_extent_is_data(k.k)) { -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ int ret; -+ -+ if (k.k->type == KEY_TYPE_reflink_v) -+ flags |= FIEMAP_EXTENT_SHARED; -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ int flags2 = 0; -+ u64 offset = p.ptr.offset; -+ -+ if (p.crc.compression_type) -+ flags2 |= FIEMAP_EXTENT_ENCODED; -+ else -+ offset += p.crc.offset; -+ -+ if ((offset & (c->opts.block_size - 1)) || -+ (k.k->size & (c->opts.block_size - 1))) -+ flags2 |= FIEMAP_EXTENT_NOT_ALIGNED; -+ -+ ret = fiemap_fill_next_extent(info, -+ bkey_start_offset(k.k) << 9, -+ offset << 9, -+ k.k->size << 9, flags|flags2); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+ } else if (k.k->type == KEY_TYPE_reservation) { -+ return fiemap_fill_next_extent(info, -+ bkey_start_offset(k.k) << 9, -+ 0, k.k->size << 9, -+ flags| -+ FIEMAP_EXTENT_DELALLOC| -+ FIEMAP_EXTENT_UNWRITTEN); -+ } else { -+ BUG(); -+ } -+} -+ -+static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, -+ u64 start, u64 len) -+{ -+ struct bch_fs *c = vinode->i_sb->s_fs_info; -+ struct bch_inode_info *ei = to_bch_ei(vinode); -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct bkey_on_stack cur, prev; -+ struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); -+ unsigned offset_into_extent, sectors; -+ bool have_extent = false; -+ int ret = 0; -+ 
-+ if (start + len < start) -+ return -EINVAL; -+ -+ bkey_on_stack_init(&cur); -+ bkey_on_stack_init(&prev); -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, -+ POS(ei->v.i_ino, start >> 9), 0); -+retry: -+ while ((k = bch2_btree_iter_peek(iter)).k && -+ !(ret = bkey_err(k)) && -+ bkey_cmp(iter->pos, end) < 0) { -+ if (!bkey_extent_is_data(k.k) && -+ k.k->type != KEY_TYPE_reservation) { -+ bch2_btree_iter_next(iter); -+ continue; -+ } -+ -+ bkey_on_stack_realloc(&cur, c, k.k->u64s); -+ bkey_on_stack_realloc(&prev, c, k.k->u64s); -+ bkey_reassemble(cur.k, k); -+ k = bkey_i_to_s_c(cur.k); -+ -+ offset_into_extent = iter->pos.offset - -+ bkey_start_offset(k.k); -+ sectors = k.k->size - offset_into_extent; -+ -+ ret = bch2_read_indirect_extent(&trans, -+ &offset_into_extent, &cur); -+ if (ret) -+ break; -+ -+ sectors = min(sectors, k.k->size - offset_into_extent); -+ -+ if (offset_into_extent) -+ bch2_cut_front(POS(k.k->p.inode, -+ bkey_start_offset(k.k) + -+ offset_into_extent), -+ cur.k); -+ bch2_key_resize(&cur.k->k, sectors); -+ cur.k->k.p = iter->pos; -+ cur.k->k.p.offset += cur.k->k.size; -+ -+ if (have_extent) { -+ ret = bch2_fill_extent(c, info, -+ bkey_i_to_s_c(prev.k), 0); -+ if (ret) -+ break; -+ } -+ -+ bkey_copy(prev.k, cur.k); -+ have_extent = true; -+ -+ if (k.k->type == KEY_TYPE_reflink_v) -+ bch2_btree_iter_set_pos(iter, k.k->p); -+ else -+ bch2_btree_iter_next(iter); -+ } -+ -+ if (ret == -EINTR) -+ goto retry; -+ -+ if (!ret && have_extent) -+ ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), -+ FIEMAP_EXTENT_LAST); -+ -+ ret = bch2_trans_exit(&trans) ?: ret; -+ bkey_on_stack_exit(&cur, c); -+ bkey_on_stack_exit(&prev, c); -+ return ret < 0 ? 
ret : 0; -+} -+ -+static const struct vm_operations_struct bch_vm_ops = { -+ .fault = bch2_page_fault, -+ .map_pages = filemap_map_pages, -+ .page_mkwrite = bch2_page_mkwrite, -+}; -+ -+static int bch2_mmap(struct file *file, struct vm_area_struct *vma) -+{ -+ file_accessed(file); -+ -+ vma->vm_ops = &bch_vm_ops; -+ return 0; -+} -+ -+/* Directories: */ -+ -+static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence) -+{ -+ return generic_file_llseek_size(file, offset, whence, -+ S64_MAX, S64_MAX); -+} -+ -+static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) -+{ -+ struct bch_inode_info *inode = file_bch_inode(file); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ -+ if (!dir_emit_dots(file, ctx)) -+ return 0; -+ -+ return bch2_readdir(c, inode->v.i_ino, ctx); -+} -+ -+static const struct file_operations bch_file_operations = { -+ .llseek = bch2_llseek, -+ .read_iter = bch2_read_iter, -+ .write_iter = bch2_write_iter, -+ .mmap = bch2_mmap, -+ .open = generic_file_open, -+ .fsync = bch2_fsync, -+ .splice_read = generic_file_splice_read, -+ /* -+ * Broken, on v5.3: -+ .splice_write = iter_file_splice_write, -+ */ -+ .fallocate = bch2_fallocate_dispatch, -+ .unlocked_ioctl = bch2_fs_file_ioctl, -+#ifdef CONFIG_COMPAT -+ .compat_ioctl = bch2_compat_fs_ioctl, -+#endif -+ .remap_file_range = bch2_remap_file_range, -+}; -+ -+static const struct inode_operations bch_file_inode_operations = { -+ .getattr = bch2_getattr, -+ .setattr = bch2_setattr, -+ .fiemap = bch2_fiemap, -+ .listxattr = bch2_xattr_list, -+#ifdef CONFIG_BCACHEFS_POSIX_ACL -+ .get_acl = bch2_get_acl, -+ .set_acl = bch2_set_acl, -+#endif -+}; -+ -+static const struct inode_operations bch_dir_inode_operations = { -+ .lookup = bch2_lookup, -+ .create = bch2_create, -+ .link = bch2_link, -+ .unlink = bch2_unlink, -+ .symlink = bch2_symlink, -+ .mkdir = bch2_mkdir, -+ .rmdir = bch2_unlink, -+ .mknod = bch2_mknod, -+ .rename = bch2_rename2, -+ .getattr = bch2_getattr, -+ 
.setattr = bch2_setattr, -+ .tmpfile = bch2_tmpfile, -+ .listxattr = bch2_xattr_list, -+#ifdef CONFIG_BCACHEFS_POSIX_ACL -+ .get_acl = bch2_get_acl, -+ .set_acl = bch2_set_acl, -+#endif -+}; -+ -+static const struct file_operations bch_dir_file_operations = { -+ .llseek = bch2_dir_llseek, -+ .read = generic_read_dir, -+ .iterate_shared = bch2_vfs_readdir, -+ .fsync = bch2_fsync, -+ .unlocked_ioctl = bch2_fs_file_ioctl, -+#ifdef CONFIG_COMPAT -+ .compat_ioctl = bch2_compat_fs_ioctl, -+#endif -+}; -+ -+static const struct inode_operations bch_symlink_inode_operations = { -+ .get_link = page_get_link, -+ .getattr = bch2_getattr, -+ .setattr = bch2_setattr, -+ .listxattr = bch2_xattr_list, -+#ifdef CONFIG_BCACHEFS_POSIX_ACL -+ .get_acl = bch2_get_acl, -+ .set_acl = bch2_set_acl, -+#endif -+}; -+ -+static const struct inode_operations bch_special_inode_operations = { -+ .getattr = bch2_getattr, -+ .setattr = bch2_setattr, -+ .listxattr = bch2_xattr_list, -+#ifdef CONFIG_BCACHEFS_POSIX_ACL -+ .get_acl = bch2_get_acl, -+ .set_acl = bch2_set_acl, -+#endif -+}; -+ -+static const struct address_space_operations bch_address_space_operations = { -+ .writepage = bch2_writepage, -+ .readpage = bch2_readpage, -+ .writepages = bch2_writepages, -+ .readpages = bch2_readpages, -+ .set_page_dirty = __set_page_dirty_nobuffers, -+ .write_begin = bch2_write_begin, -+ .write_end = bch2_write_end, -+ .invalidatepage = bch2_invalidatepage, -+ .releasepage = bch2_releasepage, -+ .direct_IO = noop_direct_IO, -+#ifdef CONFIG_MIGRATION -+ .migratepage = bch2_migrate_page, -+#endif -+ .error_remove_page = generic_error_remove_page, -+}; -+ -+static struct inode *bch2_nfs_get_inode(struct super_block *sb, -+ u64 ino, u32 generation) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ struct inode *vinode; -+ -+ if (ino < BCACHEFS_ROOT_INO) -+ return ERR_PTR(-ESTALE); -+ -+ vinode = bch2_vfs_inode_get(c, ino); -+ if (IS_ERR(vinode)) -+ return ERR_CAST(vinode); -+ if (generation && vinode->i_generation != 
generation) { -+ /* we didn't find the right inode.. */ -+ iput(vinode); -+ return ERR_PTR(-ESTALE); -+ } -+ return vinode; -+} -+ -+static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *fid, -+ int fh_len, int fh_type) -+{ -+ return generic_fh_to_dentry(sb, fid, fh_len, fh_type, -+ bch2_nfs_get_inode); -+} -+ -+static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *fid, -+ int fh_len, int fh_type) -+{ -+ return generic_fh_to_parent(sb, fid, fh_len, fh_type, -+ bch2_nfs_get_inode); -+} -+ -+static const struct export_operations bch_export_ops = { -+ .fh_to_dentry = bch2_fh_to_dentry, -+ .fh_to_parent = bch2_fh_to_parent, -+ //.get_parent = bch2_get_parent, -+}; -+ -+static void bch2_vfs_inode_init(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi) -+{ -+ bch2_inode_update_after_write(c, inode, bi, ~0); -+ -+ inode->v.i_blocks = bi->bi_sectors; -+ inode->v.i_ino = bi->bi_inum; -+ inode->v.i_rdev = bi->bi_dev; -+ inode->v.i_generation = bi->bi_generation; -+ inode->v.i_size = bi->bi_size; -+ -+ inode->ei_journal_seq = 0; -+ inode->ei_quota_reserved = 0; -+ inode->ei_str_hash = bch2_hash_info_init(c, bi); -+ inode->ei_qid = bch_qid(bi); -+ -+ inode->v.i_mapping->a_ops = &bch_address_space_operations; -+ -+ switch (inode->v.i_mode & S_IFMT) { -+ case S_IFREG: -+ inode->v.i_op = &bch_file_inode_operations; -+ inode->v.i_fop = &bch_file_operations; -+ break; -+ case S_IFDIR: -+ inode->v.i_op = &bch_dir_inode_operations; -+ inode->v.i_fop = &bch_dir_file_operations; -+ break; -+ case S_IFLNK: -+ inode_nohighmem(&inode->v); -+ inode->v.i_op = &bch_symlink_inode_operations; -+ break; -+ default: -+ init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev); -+ inode->v.i_op = &bch_special_inode_operations; -+ break; -+ } -+} -+ -+static struct inode *bch2_alloc_inode(struct super_block *sb) -+{ -+ struct bch_inode_info *inode; -+ -+ inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS); -+ if 
(!inode) -+ return NULL; -+ -+ inode_init_once(&inode->v); -+ mutex_init(&inode->ei_update_lock); -+ pagecache_lock_init(&inode->ei_pagecache_lock); -+ mutex_init(&inode->ei_quota_lock); -+ inode->ei_journal_seq = 0; -+ -+ return &inode->v; -+} -+ -+static void bch2_i_callback(struct rcu_head *head) -+{ -+ struct inode *vinode = container_of(head, struct inode, i_rcu); -+ struct bch_inode_info *inode = to_bch_ei(vinode); -+ -+ kmem_cache_free(bch2_inode_cache, inode); -+} -+ -+static void bch2_destroy_inode(struct inode *vinode) -+{ -+ call_rcu(&vinode->i_rcu, bch2_i_callback); -+} -+ -+static int inode_update_times_fn(struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, -+ void *p) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ -+ bi->bi_atime = timespec_to_bch2_time(c, inode->v.i_atime); -+ bi->bi_mtime = timespec_to_bch2_time(c, inode->v.i_mtime); -+ bi->bi_ctime = timespec_to_bch2_time(c, inode->v.i_ctime); -+ -+ return 0; -+} -+ -+static int bch2_vfs_write_inode(struct inode *vinode, -+ struct writeback_control *wbc) -+{ -+ struct bch_fs *c = vinode->i_sb->s_fs_info; -+ struct bch_inode_info *inode = to_bch_ei(vinode); -+ int ret; -+ -+ mutex_lock(&inode->ei_update_lock); -+ ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, -+ ATTR_ATIME|ATTR_MTIME|ATTR_CTIME); -+ mutex_unlock(&inode->ei_update_lock); -+ -+ return ret; -+} -+ -+static void bch2_evict_inode(struct inode *vinode) -+{ -+ struct bch_fs *c = vinode->i_sb->s_fs_info; -+ struct bch_inode_info *inode = to_bch_ei(vinode); -+ -+ truncate_inode_pages_final(&inode->v.i_data); -+ -+ clear_inode(&inode->v); -+ -+ BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved); -+ -+ if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) { -+ bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks), -+ KEY_TYPE_QUOTA_WARN); -+ bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, -+ KEY_TYPE_QUOTA_WARN); -+ bch2_inode_rm(c, inode->v.i_ino); -+ } -+} -+ -+static int 
bch2_statfs(struct dentry *dentry, struct kstatfs *buf) -+{ -+ struct super_block *sb = dentry->d_sb; -+ struct bch_fs *c = sb->s_fs_info; -+ struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c); -+ unsigned shift = sb->s_blocksize_bits - 9; -+ u64 fsid; -+ -+ buf->f_type = BCACHEFS_STATFS_MAGIC; -+ buf->f_bsize = sb->s_blocksize; -+ buf->f_blocks = usage.capacity >> shift; -+ buf->f_bfree = (usage.capacity - usage.used) >> shift; -+ buf->f_bavail = buf->f_bfree; -+ buf->f_files = usage.nr_inodes; -+ buf->f_ffree = U64_MAX; -+ -+ fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^ -+ le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64)); -+ buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; -+ buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; -+ buf->f_namelen = BCH_NAME_MAX; -+ -+ return 0; -+} -+ -+static int bch2_sync_fs(struct super_block *sb, int wait) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ -+ if (c->opts.journal_flush_disabled) -+ return 0; -+ -+ if (!wait) { -+ bch2_journal_flush_async(&c->journal, NULL); -+ return 0; -+ } -+ -+ return bch2_journal_flush(&c->journal); -+} -+ -+static struct bch_fs *bch2_path_to_fs(const char *dev) -+{ -+ struct bch_fs *c; -+ struct block_device *bdev = lookup_bdev(dev); -+ -+ if (IS_ERR(bdev)) -+ return ERR_CAST(bdev); -+ -+ c = bch2_bdev_to_fs(bdev); -+ bdput(bdev); -+ return c ?: ERR_PTR(-ENOENT); -+} -+ -+static struct bch_fs *__bch2_open_as_blockdevs(const char *dev_name, char * const *devs, -+ unsigned nr_devs, struct bch_opts opts) -+{ -+ struct bch_fs *c, *c1, *c2; -+ size_t i; -+ -+ if (!nr_devs) -+ return ERR_PTR(-EINVAL); -+ -+ c = bch2_fs_open(devs, nr_devs, opts); -+ -+ if (IS_ERR(c) && PTR_ERR(c) == -EBUSY) { -+ /* -+ * Already open? 
-+ * Look up each block device, make sure they all belong to a -+ * filesystem and they all belong to the _same_ filesystem -+ */ -+ -+ c1 = bch2_path_to_fs(devs[0]); -+ if (IS_ERR(c1)) -+ return c; -+ -+ for (i = 1; i < nr_devs; i++) { -+ c2 = bch2_path_to_fs(devs[i]); -+ if (!IS_ERR(c2)) -+ closure_put(&c2->cl); -+ -+ if (c1 != c2) { -+ closure_put(&c1->cl); -+ return c; -+ } -+ } -+ -+ c = c1; -+ } -+ -+ if (IS_ERR(c)) -+ return c; -+ -+ down_write(&c->state_lock); -+ -+ if (!test_bit(BCH_FS_STARTED, &c->flags)) { -+ up_write(&c->state_lock); -+ closure_put(&c->cl); -+ pr_err("err mounting %s: incomplete filesystem", dev_name); -+ return ERR_PTR(-EINVAL); -+ } -+ -+ up_write(&c->state_lock); -+ -+ set_bit(BCH_FS_BDEV_MOUNTED, &c->flags); -+ return c; -+} -+ -+static struct bch_fs *bch2_open_as_blockdevs(const char *_dev_name, -+ struct bch_opts opts) -+{ -+ char *dev_name = NULL, **devs = NULL, *s; -+ struct bch_fs *c = ERR_PTR(-ENOMEM); -+ size_t i, nr_devs = 0; -+ -+ dev_name = kstrdup(_dev_name, GFP_KERNEL); -+ if (!dev_name) -+ goto err; -+ -+ for (s = dev_name; s; s = strchr(s + 1, ':')) -+ nr_devs++; -+ -+ devs = kcalloc(nr_devs, sizeof(const char *), GFP_KERNEL); -+ if (!devs) -+ goto err; -+ -+ for (i = 0, s = dev_name; -+ s; -+ (s = strchr(s, ':')) && (*s++ = '\0')) -+ devs[i++] = s; -+ -+ c = __bch2_open_as_blockdevs(_dev_name, devs, nr_devs, opts); -+err: -+ kfree(devs); -+ kfree(dev_name); -+ return c; -+} -+ -+static int bch2_remount(struct super_block *sb, int *flags, char *data) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ struct bch_opts opts = bch2_opts_empty(); -+ int ret; -+ -+ opt_set(opts, read_only, (*flags & SB_RDONLY) != 0); -+ -+ ret = bch2_parse_mount_opts(&opts, data); -+ if (ret) -+ return ret; -+ -+ if (opts.read_only != c->opts.read_only) { -+ down_write(&c->state_lock); -+ -+ if (opts.read_only) { -+ bch2_fs_read_only(c); -+ -+ sb->s_flags |= SB_RDONLY; -+ } else { -+ ret = bch2_fs_read_write(c); -+ if (ret) { -+ bch_err(c, "error 
going rw: %i", ret); -+ up_write(&c->state_lock); -+ return -EINVAL; -+ } -+ -+ sb->s_flags &= ~SB_RDONLY; -+ } -+ -+ c->opts.read_only = opts.read_only; -+ -+ up_write(&c->state_lock); -+ } -+ -+ if (opts.errors >= 0) -+ c->opts.errors = opts.errors; -+ -+ return ret; -+} -+ -+static int bch2_show_options(struct seq_file *seq, struct dentry *root) -+{ -+ struct bch_fs *c = root->d_sb->s_fs_info; -+ enum bch_opt_id i; -+ char buf[512]; -+ -+ for (i = 0; i < bch2_opts_nr; i++) { -+ const struct bch_option *opt = &bch2_opt_table[i]; -+ u64 v = bch2_opt_get_by_id(&c->opts, i); -+ -+ if (!(opt->mode & OPT_MOUNT)) -+ continue; -+ -+ if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) -+ continue; -+ -+ bch2_opt_to_text(&PBUF(buf), c, opt, v, -+ OPT_SHOW_MOUNT_STYLE); -+ seq_putc(seq, ','); -+ seq_puts(seq, buf); -+ } -+ -+ return 0; -+ -+} -+ -+static const struct super_operations bch_super_operations = { -+ .alloc_inode = bch2_alloc_inode, -+ .destroy_inode = bch2_destroy_inode, -+ .write_inode = bch2_vfs_write_inode, -+ .evict_inode = bch2_evict_inode, -+ .sync_fs = bch2_sync_fs, -+ .statfs = bch2_statfs, -+ .show_options = bch2_show_options, -+ .remount_fs = bch2_remount, -+#if 0 -+ .put_super = bch2_put_super, -+ .freeze_fs = bch2_freeze, -+ .unfreeze_fs = bch2_unfreeze, -+#endif -+}; -+ -+static int bch2_test_super(struct super_block *s, void *data) -+{ -+ return s->s_fs_info == data; -+} -+ -+static int bch2_set_super(struct super_block *s, void *data) -+{ -+ s->s_fs_info = data; -+ return 0; -+} -+ -+static struct dentry *bch2_mount(struct file_system_type *fs_type, -+ int flags, const char *dev_name, void *data) -+{ -+ struct bch_fs *c; -+ struct bch_dev *ca; -+ struct super_block *sb; -+ struct inode *vinode; -+ struct bch_opts opts = bch2_opts_empty(); -+ unsigned i; -+ int ret; -+ -+ opt_set(opts, read_only, (flags & SB_RDONLY) != 0); -+ -+ ret = bch2_parse_mount_opts(&opts, data); -+ if (ret) -+ return ERR_PTR(ret); -+ -+ c = bch2_open_as_blockdevs(dev_name, 
opts); -+ if (IS_ERR(c)) -+ return ERR_CAST(c); -+ -+ sb = sget(fs_type, bch2_test_super, bch2_set_super, flags|SB_NOSEC, c); -+ if (IS_ERR(sb)) { -+ closure_put(&c->cl); -+ return ERR_CAST(sb); -+ } -+ -+ BUG_ON(sb->s_fs_info != c); -+ -+ if (sb->s_root) { -+ closure_put(&c->cl); -+ -+ if ((flags ^ sb->s_flags) & SB_RDONLY) { -+ ret = -EBUSY; -+ goto err_put_super; -+ } -+ goto out; -+ } -+ -+ sb->s_blocksize = block_bytes(c); -+ sb->s_blocksize_bits = ilog2(block_bytes(c)); -+ sb->s_maxbytes = MAX_LFS_FILESIZE; -+ sb->s_op = &bch_super_operations; -+ sb->s_export_op = &bch_export_ops; -+#ifdef CONFIG_BCACHEFS_QUOTA -+ sb->s_qcop = &bch2_quotactl_operations; -+ sb->s_quota_types = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ; -+#endif -+ sb->s_xattr = bch2_xattr_handlers; -+ sb->s_magic = BCACHEFS_STATFS_MAGIC; -+ sb->s_time_gran = c->sb.time_precision; -+ c->vfs_sb = sb; -+ strlcpy(sb->s_id, c->name, sizeof(sb->s_id)); -+ -+ ret = super_setup_bdi(sb); -+ if (ret) -+ goto err_put_super; -+ -+ sb->s_bdi->congested_fn = bch2_congested; -+ sb->s_bdi->congested_data = c; -+ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; -+ -+ for_each_online_member(ca, c, i) { -+ struct block_device *bdev = ca->disk_sb.bdev; -+ -+ /* XXX: create an anonymous device for multi device filesystems */ -+ sb->s_bdev = bdev; -+ sb->s_dev = bdev->bd_dev; -+ percpu_ref_put(&ca->io_ref); -+ break; -+ } -+ -+#ifdef CONFIG_BCACHEFS_POSIX_ACL -+ if (c->opts.acl) -+ sb->s_flags |= SB_POSIXACL; -+#endif -+ -+ vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_INO); -+ if (IS_ERR(vinode)) { -+ bch_err(c, "error mounting: error getting root inode %i", -+ (int) PTR_ERR(vinode)); -+ ret = PTR_ERR(vinode); -+ goto err_put_super; -+ } -+ -+ sb->s_root = d_make_root(vinode); -+ if (!sb->s_root) { -+ bch_err(c, "error mounting: error allocating root dentry"); -+ ret = -ENOMEM; -+ goto err_put_super; -+ } -+ -+ sb->s_flags |= SB_ACTIVE; -+out: -+ return dget(sb->s_root); -+ -+err_put_super: -+ 
deactivate_locked_super(sb); -+ return ERR_PTR(ret); -+} -+ -+static void bch2_kill_sb(struct super_block *sb) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ -+ generic_shutdown_super(sb); -+ -+ if (test_bit(BCH_FS_BDEV_MOUNTED, &c->flags)) -+ bch2_fs_stop(c); -+ else -+ closure_put(&c->cl); -+} -+ -+static struct file_system_type bcache_fs_type = { -+ .owner = THIS_MODULE, -+ .name = "bcachefs", -+ .mount = bch2_mount, -+ .kill_sb = bch2_kill_sb, -+ .fs_flags = FS_REQUIRES_DEV, -+}; -+ -+MODULE_ALIAS_FS("bcachefs"); -+ -+void bch2_vfs_exit(void) -+{ -+ unregister_filesystem(&bcache_fs_type); -+ if (bch2_inode_cache) -+ kmem_cache_destroy(bch2_inode_cache); -+} -+ -+int __init bch2_vfs_init(void) -+{ -+ int ret = -ENOMEM; -+ -+ bch2_inode_cache = KMEM_CACHE(bch_inode_info, 0); -+ if (!bch2_inode_cache) -+ goto err; -+ -+ ret = register_filesystem(&bcache_fs_type); -+ if (ret) -+ goto err; -+ -+ return 0; -+err: -+ bch2_vfs_exit(); -+ return ret; -+} -+ -+#endif /* NO_BCACHEFS_FS */ -diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h -new file mode 100644 -index 000000000000..eda903a45325 ---- /dev/null -+++ b/fs/bcachefs/fs.h -@@ -0,0 +1,174 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_FS_H -+#define _BCACHEFS_FS_H -+ -+#include "inode.h" -+#include "opts.h" -+#include "str_hash.h" -+#include "quota_types.h" -+ -+#include -+#include -+ -+/* -+ * Two-state lock - can be taken for add or block - both states are shared, -+ * like read side of rwsem, but conflict with other state: -+ */ -+struct pagecache_lock { -+ atomic_long_t v; -+ wait_queue_head_t wait; -+}; -+ -+static inline void pagecache_lock_init(struct pagecache_lock *lock) -+{ -+ atomic_long_set(&lock->v, 0); -+ init_waitqueue_head(&lock->wait); -+} -+ -+void bch2_pagecache_add_put(struct pagecache_lock *); -+void bch2_pagecache_add_get(struct pagecache_lock *); -+void bch2_pagecache_block_put(struct pagecache_lock *); -+void bch2_pagecache_block_get(struct pagecache_lock *); -+ -+struct 
bch_inode_info { -+ struct inode v; -+ -+ struct mutex ei_update_lock; -+ u64 ei_journal_seq; -+ u64 ei_quota_reserved; -+ unsigned long ei_last_dirtied; -+ -+ struct pagecache_lock ei_pagecache_lock; -+ -+ struct mutex ei_quota_lock; -+ struct bch_qid ei_qid; -+ -+ struct bch_hash_info ei_str_hash; -+ -+ /* copy of inode in btree: */ -+ struct bch_inode_unpacked ei_inode; -+}; -+ -+#define to_bch_ei(_inode) \ -+ container_of_or_null(_inode, struct bch_inode_info, v) -+ -+static inline int ptrcmp(void *l, void *r) -+{ -+ return cmp_int(l, r); -+} -+ -+enum bch_inode_lock_op { -+ INODE_LOCK = (1U << 0), -+ INODE_PAGECACHE_BLOCK = (1U << 1), -+ INODE_UPDATE_LOCK = (1U << 2), -+}; -+ -+#define bch2_lock_inodes(_locks, ...) \ -+do { \ -+ struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \ -+ unsigned i; \ -+ \ -+ bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \ -+ \ -+ for (i = 1; i < ARRAY_SIZE(a); i++) \ -+ if (a[i] != a[i - 1]) { \ -+ if ((_locks) & INODE_LOCK) \ -+ down_write_nested(&a[i]->v.i_rwsem, i); \ -+ if ((_locks) & INODE_PAGECACHE_BLOCK) \ -+ bch2_pagecache_block_get(&a[i]->ei_pagecache_lock);\ -+ if ((_locks) & INODE_UPDATE_LOCK) \ -+ mutex_lock_nested(&a[i]->ei_update_lock, i);\ -+ } \ -+} while (0) -+ -+#define bch2_unlock_inodes(_locks, ...) 
\ -+do { \ -+ struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \ -+ unsigned i; \ -+ \ -+ bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \ -+ \ -+ for (i = 1; i < ARRAY_SIZE(a); i++) \ -+ if (a[i] != a[i - 1]) { \ -+ if ((_locks) & INODE_LOCK) \ -+ up_write(&a[i]->v.i_rwsem); \ -+ if ((_locks) & INODE_PAGECACHE_BLOCK) \ -+ bch2_pagecache_block_put(&a[i]->ei_pagecache_lock);\ -+ if ((_locks) & INODE_UPDATE_LOCK) \ -+ mutex_unlock(&a[i]->ei_update_lock); \ -+ } \ -+} while (0) -+ -+static inline struct bch_inode_info *file_bch_inode(struct file *file) -+{ -+ return to_bch_ei(file_inode(file)); -+} -+ -+static inline bool inode_attr_changing(struct bch_inode_info *dir, -+ struct bch_inode_info *inode, -+ enum inode_opt_id id) -+{ -+ return !(inode->ei_inode.bi_fields_set & (1 << id)) && -+ bch2_inode_opt_get(&dir->ei_inode, id) != -+ bch2_inode_opt_get(&inode->ei_inode, id); -+} -+ -+static inline bool inode_attrs_changing(struct bch_inode_info *dir, -+ struct bch_inode_info *inode) -+{ -+ unsigned id; -+ -+ for (id = 0; id < Inode_opt_nr; id++) -+ if (inode_attr_changing(dir, inode, id)) -+ return true; -+ -+ return false; -+} -+ -+struct bch_inode_unpacked; -+ -+#ifndef NO_BCACHEFS_FS -+ -+int bch2_fs_quota_transfer(struct bch_fs *, -+ struct bch_inode_info *, -+ struct bch_qid, -+ unsigned, -+ enum quota_acct_mode); -+ -+static inline int bch2_set_projid(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ u32 projid) -+{ -+ struct bch_qid qid = inode->ei_qid; -+ -+ qid.q[QTYP_PRJ] = projid; -+ -+ return bch2_fs_quota_transfer(c, inode, qid, -+ 1 << QTYP_PRJ, -+ KEY_TYPE_QUOTA_PREALLOC); -+} -+ -+struct inode *bch2_vfs_inode_get(struct bch_fs *, u64); -+ -+/* returns 0 if we want to do the update, or error is passed up */ -+typedef int (*inode_set_fn)(struct bch_inode_info *, -+ struct bch_inode_unpacked *, void *); -+ -+void bch2_inode_update_after_write(struct bch_fs *, -+ struct bch_inode_info *, -+ struct bch_inode_unpacked *, -+ unsigned); -+int 
__must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *, -+ inode_set_fn, void *, unsigned); -+ -+void bch2_vfs_exit(void); -+int bch2_vfs_init(void); -+ -+#else -+ -+static inline void bch2_vfs_exit(void) {} -+static inline int bch2_vfs_init(void) { return 0; } -+ -+#endif /* NO_BCACHEFS_FS */ -+ -+#endif /* _BCACHEFS_FS_H */ -diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c -new file mode 100644 -index 000000000000..c6ca5968a2e0 ---- /dev/null -+++ b/fs/bcachefs/fsck.c -@@ -0,0 +1,1498 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "bkey_on_stack.h" -+#include "btree_update.h" -+#include "dirent.h" -+#include "error.h" -+#include "fs-common.h" -+#include "fsck.h" -+#include "inode.h" -+#include "keylist.h" -+#include "super.h" -+#include "xattr.h" -+ -+#include /* struct qstr */ -+#include -+ -+#define QSTR(n) { { { .len = strlen(n) } }, .name = n } -+ -+static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) -+{ -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ u64 sectors = 0; -+ int ret; -+ -+ for_each_btree_key(trans, iter, BTREE_ID_EXTENTS, -+ POS(inum, 0), 0, k, ret) { -+ if (k.k->p.inode != inum) -+ break; -+ -+ if (bkey_extent_is_allocation(k.k)) -+ sectors += k.k->size; -+ } -+ -+ bch2_trans_iter_free(trans, iter); -+ -+ return ret ?: sectors; -+} -+ -+static int __remove_dirent(struct btree_trans *trans, -+ struct bkey_s_c_dirent dirent) -+{ -+ struct bch_fs *c = trans->c; -+ struct qstr name; -+ struct bch_inode_unpacked dir_inode; -+ struct bch_hash_info dir_hash_info; -+ u64 dir_inum = dirent.k->p.inode; -+ int ret; -+ char *buf; -+ -+ name.len = bch2_dirent_name_bytes(dirent); -+ buf = bch2_trans_kmalloc(trans, name.len + 1); -+ if (IS_ERR(buf)) -+ return PTR_ERR(buf); -+ -+ memcpy(buf, dirent.v->d_name, name.len); -+ buf[name.len] = '\0'; -+ name.name = buf; -+ -+ ret = bch2_inode_find_by_inum_trans(trans, dir_inum, &dir_inode); -+ if (ret && ret != -EINTR) -+ bch_err(c, 
"remove_dirent: err %i looking up directory inode", ret); -+ if (ret) -+ return ret; -+ -+ dir_hash_info = bch2_hash_info_init(c, &dir_inode); -+ -+ ret = bch2_hash_delete(trans, bch2_dirent_hash_desc, -+ &dir_hash_info, dir_inum, &name); -+ if (ret && ret != -EINTR) -+ bch_err(c, "remove_dirent: err %i deleting dirent", ret); -+ if (ret) -+ return ret; -+ -+ return 0; -+} -+ -+static int remove_dirent(struct btree_trans *trans, -+ struct bkey_s_c_dirent dirent) -+{ -+ return __bch2_trans_do(trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW, -+ __remove_dirent(trans, dirent)); -+} -+ -+static int reattach_inode(struct bch_fs *c, -+ struct bch_inode_unpacked *lostfound_inode, -+ u64 inum) -+{ -+ struct bch_inode_unpacked dir_u, inode_u; -+ char name_buf[20]; -+ struct qstr name; -+ int ret; -+ -+ snprintf(name_buf, sizeof(name_buf), "%llu", inum); -+ name = (struct qstr) QSTR(name_buf); -+ -+ ret = bch2_trans_do(c, NULL, NULL, -+ BTREE_INSERT_LAZY_RW, -+ bch2_link_trans(&trans, lostfound_inode->bi_inum, -+ inum, &dir_u, &inode_u, &name)); -+ if (ret) -+ bch_err(c, "error %i reattaching inode %llu", ret, inum); -+ -+ return ret; -+} -+ -+struct inode_walker { -+ bool first_this_inode; -+ bool have_inode; -+ u64 cur_inum; -+ struct bch_inode_unpacked inode; -+}; -+ -+static struct inode_walker inode_walker_init(void) -+{ -+ return (struct inode_walker) { -+ .cur_inum = -1, -+ .have_inode = false, -+ }; -+} -+ -+static int walk_inode(struct btree_trans *trans, -+ struct inode_walker *w, u64 inum) -+{ -+ if (inum != w->cur_inum) { -+ int ret = bch2_inode_find_by_inum_trans(trans, inum, -+ &w->inode); -+ -+ if (ret && ret != -ENOENT) -+ return ret; -+ -+ w->have_inode = !ret; -+ w->cur_inum = inum; -+ w->first_this_inode = true; -+ } else { -+ w->first_this_inode = false; -+ } -+ -+ return 0; -+} -+ -+struct hash_check { -+ struct bch_hash_info info; -+ -+ /* start of current chain of hash collisions: */ -+ struct btree_iter *chain; -+ -+ /* next offset 
in current chain of hash collisions: */ -+ u64 chain_end; -+}; -+ -+static void hash_check_init(struct hash_check *h) -+{ -+ h->chain = NULL; -+ h->chain_end = 0; -+} -+ -+static void hash_stop_chain(struct btree_trans *trans, -+ struct hash_check *h) -+{ -+ if (h->chain) -+ bch2_trans_iter_free(trans, h->chain); -+ h->chain = NULL; -+} -+ -+static void hash_check_set_inode(struct btree_trans *trans, -+ struct hash_check *h, -+ const struct bch_inode_unpacked *bi) -+{ -+ h->info = bch2_hash_info_init(trans->c, bi); -+ hash_stop_chain(trans, h); -+} -+ -+static int hash_redo_key(const struct bch_hash_desc desc, -+ struct btree_trans *trans, struct hash_check *h, -+ struct btree_iter *k_iter, struct bkey_s_c k, -+ u64 hashed) -+{ -+ struct bkey_i delete; -+ struct bkey_i *tmp; -+ -+ tmp = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); -+ if (IS_ERR(tmp)) -+ return PTR_ERR(tmp); -+ -+ bkey_reassemble(tmp, k); -+ -+ bkey_init(&delete.k); -+ delete.k.p = k_iter->pos; -+ bch2_trans_update(trans, k_iter, &delete, 0); -+ -+ return bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode, -+ tmp, BCH_HASH_SET_MUST_CREATE); -+} -+ -+static int fsck_hash_delete_at(struct btree_trans *trans, -+ const struct bch_hash_desc desc, -+ struct bch_hash_info *info, -+ struct btree_iter *iter) -+{ -+ int ret; -+retry: -+ ret = bch2_hash_delete_at(trans, desc, info, iter) ?: -+ bch2_trans_commit(trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW); -+ if (ret == -EINTR) { -+ ret = bch2_btree_iter_traverse(iter); -+ if (!ret) -+ goto retry; -+ } -+ -+ return ret; -+} -+ -+static int hash_check_duplicates(struct btree_trans *trans, -+ const struct bch_hash_desc desc, struct hash_check *h, -+ struct btree_iter *k_iter, struct bkey_s_c k) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter *iter; -+ struct bkey_s_c k2; -+ char buf[200]; -+ int ret = 0; -+ -+ if (!bkey_cmp(h->chain->pos, k_iter->pos)) -+ return 0; -+ -+ iter = bch2_trans_copy_iter(trans, h->chain); -+ 
BUG_ON(IS_ERR(iter)); -+ -+ for_each_btree_key_continue(iter, 0, k2, ret) { -+ if (bkey_cmp(k2.k->p, k.k->p) >= 0) -+ break; -+ -+ if (fsck_err_on(k2.k->type == desc.key_type && -+ !desc.cmp_bkey(k, k2), c, -+ "duplicate hash table keys:\n%s", -+ (bch2_bkey_val_to_text(&PBUF(buf), c, -+ k), buf))) { -+ ret = fsck_hash_delete_at(trans, desc, &h->info, k_iter); -+ if (ret) -+ return ret; -+ ret = 1; -+ break; -+ } -+ } -+fsck_err: -+ bch2_trans_iter_free(trans, iter); -+ return ret; -+} -+ -+static void hash_set_chain_start(struct btree_trans *trans, -+ const struct bch_hash_desc desc, -+ struct hash_check *h, -+ struct btree_iter *k_iter, struct bkey_s_c k) -+{ -+ bool hole = (k.k->type != KEY_TYPE_whiteout && -+ k.k->type != desc.key_type); -+ -+ if (hole || k.k->p.offset > h->chain_end + 1) -+ hash_stop_chain(trans, h); -+ -+ if (!hole) { -+ if (!h->chain) { -+ h->chain = bch2_trans_copy_iter(trans, k_iter); -+ BUG_ON(IS_ERR(h->chain)); -+ } -+ -+ h->chain_end = k.k->p.offset; -+ } -+} -+ -+static bool key_has_correct_hash(struct btree_trans *trans, -+ const struct bch_hash_desc desc, -+ struct hash_check *h, -+ struct btree_iter *k_iter, struct bkey_s_c k) -+{ -+ u64 hash; -+ -+ hash_set_chain_start(trans, desc, h, k_iter, k); -+ -+ if (k.k->type != desc.key_type) -+ return true; -+ -+ hash = desc.hash_bkey(&h->info, k); -+ -+ return hash >= h->chain->pos.offset && -+ hash <= k.k->p.offset; -+} -+ -+static int hash_check_key(struct btree_trans *trans, -+ const struct bch_hash_desc desc, struct hash_check *h, -+ struct btree_iter *k_iter, struct bkey_s_c k) -+{ -+ struct bch_fs *c = trans->c; -+ char buf[200]; -+ u64 hashed; -+ int ret = 0; -+ -+ hash_set_chain_start(trans, desc, h, k_iter, k); -+ -+ if (k.k->type != desc.key_type) -+ return 0; -+ -+ hashed = desc.hash_bkey(&h->info, k); -+ -+ if (fsck_err_on(hashed < h->chain->pos.offset || -+ hashed > k.k->p.offset, c, -+ "hash table key at wrong offset: btree %u, %llu, " -+ "hashed to %llu chain starts at 
%llu\n%s", -+ desc.btree_id, k.k->p.offset, -+ hashed, h->chain->pos.offset, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) { -+ ret = __bch2_trans_do(trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, -+ hash_redo_key(desc, trans, h, k_iter, k, hashed)); -+ if (ret) { -+ bch_err(c, "hash_redo_key err %i", ret); -+ return ret; -+ } -+ return 1; -+ } -+ -+ ret = hash_check_duplicates(trans, desc, h, k_iter, k); -+fsck_err: -+ return ret; -+} -+ -+static int check_dirent_hash(struct btree_trans *trans, struct hash_check *h, -+ struct btree_iter *iter, struct bkey_s_c *k) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_i_dirent *d = NULL; -+ int ret = -EINVAL; -+ char buf[200]; -+ unsigned len; -+ u64 hash; -+ -+ if (key_has_correct_hash(trans, bch2_dirent_hash_desc, h, iter, *k)) -+ return 0; -+ -+ len = bch2_dirent_name_bytes(bkey_s_c_to_dirent(*k)); -+ BUG_ON(!len); -+ -+ memcpy(buf, bkey_s_c_to_dirent(*k).v->d_name, len); -+ buf[len] = '\0'; -+ -+ d = kmalloc(bkey_bytes(k->k), GFP_KERNEL); -+ if (!d) { -+ bch_err(c, "memory allocation failure"); -+ return -ENOMEM; -+ } -+ -+ bkey_reassemble(&d->k_i, *k); -+ -+ do { -+ --len; -+ if (!len) -+ goto err_redo; -+ -+ d->k.u64s = BKEY_U64s + dirent_val_u64s(len); -+ -+ BUG_ON(bkey_val_bytes(&d->k) < -+ offsetof(struct bch_dirent, d_name) + len); -+ -+ memset(d->v.d_name + len, 0, -+ bkey_val_bytes(&d->k) - -+ offsetof(struct bch_dirent, d_name) - len); -+ -+ hash = bch2_dirent_hash_desc.hash_bkey(&h->info, -+ bkey_i_to_s_c(&d->k_i)); -+ } while (hash < h->chain->pos.offset || -+ hash > k->k->p.offset); -+ -+ if (fsck_err(c, "dirent with junk at end, was %s (%zu) now %s (%u)", -+ buf, strlen(buf), d->v.d_name, len)) { -+ ret = __bch2_trans_do(trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW, -+ (bch2_trans_update(trans, iter, &d->k_i, 0), 0)); -+ if (ret) -+ goto err; -+ -+ *k = bch2_btree_iter_peek(iter); -+ -+ BUG_ON(k->k->type != KEY_TYPE_dirent); -+ } -+err: -+fsck_err: -+ 
kfree(d); -+ return ret; -+err_redo: -+ hash = bch2_dirent_hash_desc.hash_bkey(&h->info, *k); -+ -+ if (fsck_err(c, "cannot fix dirent by removing trailing garbage %s (%zu)\n" -+ "hash table key at wrong offset: btree %u, offset %llu, " -+ "hashed to %llu chain starts at %llu\n%s", -+ buf, strlen(buf), BTREE_ID_DIRENTS, -+ k->k->p.offset, hash, h->chain->pos.offset, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, -+ *k), buf))) { -+ ret = __bch2_trans_do(trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, -+ hash_redo_key(bch2_dirent_hash_desc, trans, -+ h, iter, *k, hash)); -+ if (ret) -+ bch_err(c, "hash_redo_key err %i", ret); -+ else -+ ret = 1; -+ } -+ -+ goto err; -+} -+ -+static int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size) -+{ -+ return bch2_btree_delete_range(c, BTREE_ID_EXTENTS, -+ POS(inode_nr, round_up(new_size, block_bytes(c)) >> 9), -+ POS(inode_nr + 1, 0), NULL); -+} -+ -+static int bch2_fix_overlapping_extent(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c k, struct bpos cut_at) -+{ -+ struct btree_iter *u_iter; -+ struct bkey_i *u; -+ int ret; -+ -+ u = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); -+ ret = PTR_ERR_OR_ZERO(u); -+ if (ret) -+ return ret; -+ -+ bkey_reassemble(u, k); -+ bch2_cut_front(cut_at, u); -+ -+ u_iter = bch2_trans_copy_iter(trans, iter); -+ ret = PTR_ERR_OR_ZERO(u_iter); -+ if (ret) -+ return ret; -+ -+ /* -+ * We don't want to go through the -+ * extent_handle_overwrites path: -+ */ -+ __bch2_btree_iter_set_pos(u_iter, u->k.p, false); -+ -+ /* -+ * XXX: this is going to leave disk space -+ * accounting slightly wrong -+ */ -+ ret = bch2_trans_update(trans, u_iter, u, 0); -+ bch2_trans_iter_put(trans, u_iter); -+ return ret; -+} -+ -+/* -+ * Walk extents: verify that extents have a corresponding S_ISREG inode, and -+ * that i_size an i_sectors are consistent -+ */ -+noinline_for_stack -+static int check_extents(struct bch_fs *c) -+{ -+ struct inode_walker w = 
inode_walker_init(); -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct bkey_on_stack prev; -+ u64 i_sectors; -+ int ret = 0; -+ -+ bkey_on_stack_init(&prev); -+ prev.k->k = KEY(0, 0, 0); -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ bch_verbose(c, "checking extents"); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, -+ POS(BCACHEFS_ROOT_INO, 0), -+ BTREE_ITER_INTENT); -+retry: -+ for_each_btree_key_continue(iter, 0, k, ret) { -+ if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { -+ char buf1[200]; -+ char buf2[200]; -+ -+ bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k)); -+ bch2_bkey_val_to_text(&PBUF(buf2), c, k); -+ -+ if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) { -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW, -+ bch2_fix_overlapping_extent(&trans, -+ iter, k, prev.k->k.p)); -+ if (ret) -+ goto err; -+ } -+ } -+ bkey_on_stack_reassemble(&prev, c, k); -+ -+ ret = walk_inode(&trans, &w, k.k->p.inode); -+ if (ret) -+ break; -+ -+ if (fsck_err_on(!w.have_inode, c, -+ "extent type %u for missing inode %llu", -+ k.k->type, k.k->p.inode) || -+ fsck_err_on(w.have_inode && -+ !S_ISREG(w.inode.bi_mode) && !S_ISLNK(w.inode.bi_mode), c, -+ "extent type %u for non regular file, inode %llu mode %o", -+ k.k->type, k.k->p.inode, w.inode.bi_mode)) { -+ bch2_trans_unlock(&trans); -+ -+ ret = bch2_inode_truncate(c, k.k->p.inode, 0); -+ if (ret) -+ goto err; -+ continue; -+ } -+ -+ if (fsck_err_on(w.first_this_inode && -+ w.have_inode && -+ !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) && -+ w.inode.bi_sectors != -+ (i_sectors = bch2_count_inode_sectors(&trans, w.cur_inum)), -+ c, "inode %llu has incorrect i_sectors: got %llu, should be %llu", -+ w.inode.bi_inum, -+ w.inode.bi_sectors, i_sectors)) { -+ struct bkey_inode_buf p; -+ -+ w.inode.bi_sectors = i_sectors; -+ -+ bch2_trans_unlock(&trans); -+ -+ bch2_inode_pack(&p, &w.inode); -+ -+ ret = 
bch2_btree_insert(c, BTREE_ID_INODES, -+ &p.inode.k_i, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW); -+ if (ret) { -+ bch_err(c, "error in fsck: error %i updating inode", ret); -+ goto err; -+ } -+ -+ /* revalidate iterator: */ -+ k = bch2_btree_iter_peek(iter); -+ } -+ -+ if (fsck_err_on(w.have_inode && -+ !(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && -+ k.k->type != KEY_TYPE_reservation && -+ k.k->p.offset > round_up(w.inode.bi_size, block_bytes(c)) >> 9, c, -+ "extent type %u offset %llu past end of inode %llu, i_size %llu", -+ k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) { -+ bch2_trans_unlock(&trans); -+ -+ ret = bch2_inode_truncate(c, k.k->p.inode, -+ w.inode.bi_size); -+ if (ret) -+ goto err; -+ continue; -+ } -+ } -+err: -+fsck_err: -+ if (ret == -EINTR) -+ goto retry; -+ bkey_on_stack_exit(&prev, c); -+ return bch2_trans_exit(&trans) ?: ret; -+} -+ -+/* -+ * Walk dirents: verify that they all have a corresponding S_ISDIR inode, -+ * validate d_type -+ */ -+noinline_for_stack -+static int check_dirents(struct bch_fs *c) -+{ -+ struct inode_walker w = inode_walker_init(); -+ struct hash_check h; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ unsigned name_len; -+ char buf[200]; -+ int ret = 0; -+ -+ bch_verbose(c, "checking dirents"); -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ hash_check_init(&h); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, -+ POS(BCACHEFS_ROOT_INO, 0), 0); -+retry: -+ for_each_btree_key_continue(iter, 0, k, ret) { -+ struct bkey_s_c_dirent d; -+ struct bch_inode_unpacked target; -+ bool have_target; -+ u64 d_inum; -+ -+ ret = walk_inode(&trans, &w, k.k->p.inode); -+ if (ret) -+ break; -+ -+ if (fsck_err_on(!w.have_inode, c, -+ "dirent in nonexisting directory:\n%s", -+ (bch2_bkey_val_to_text(&PBUF(buf), c, -+ k), buf)) || -+ fsck_err_on(!S_ISDIR(w.inode.bi_mode), c, -+ "dirent in non directory inode type %u:\n%s", -+ 
mode_to_type(w.inode.bi_mode), -+ (bch2_bkey_val_to_text(&PBUF(buf), c, -+ k), buf))) { -+ ret = bch2_btree_delete_at(&trans, iter, 0); -+ if (ret) -+ goto err; -+ continue; -+ } -+ -+ if (w.first_this_inode && w.have_inode) -+ hash_check_set_inode(&trans, &h, &w.inode); -+ -+ ret = check_dirent_hash(&trans, &h, iter, &k); -+ if (ret > 0) { -+ ret = 0; -+ continue; -+ } -+ if (ret) -+ goto fsck_err; -+ -+ if (ret) -+ goto fsck_err; -+ -+ if (k.k->type != KEY_TYPE_dirent) -+ continue; -+ -+ d = bkey_s_c_to_dirent(k); -+ d_inum = le64_to_cpu(d.v->d_inum); -+ -+ name_len = bch2_dirent_name_bytes(d); -+ -+ if (fsck_err_on(!name_len, c, "empty dirent") || -+ fsck_err_on(name_len == 1 && -+ !memcmp(d.v->d_name, ".", 1), c, -+ ". dirent") || -+ fsck_err_on(name_len == 2 && -+ !memcmp(d.v->d_name, "..", 2), c, -+ ".. dirent") || -+ fsck_err_on(name_len == 2 && -+ !memcmp(d.v->d_name, "..", 2), c, -+ ".. dirent") || -+ fsck_err_on(memchr(d.v->d_name, '/', name_len), c, -+ "dirent name has invalid chars")) { -+ ret = remove_dirent(&trans, d); -+ if (ret) -+ goto err; -+ continue; -+ } -+ -+ if (fsck_err_on(d_inum == d.k->p.inode, c, -+ "dirent points to own directory:\n%s", -+ (bch2_bkey_val_to_text(&PBUF(buf), c, -+ k), buf))) { -+ ret = remove_dirent(&trans, d); -+ if (ret) -+ goto err; -+ continue; -+ } -+ -+ ret = bch2_inode_find_by_inum_trans(&trans, d_inum, &target); -+ if (ret && ret != -ENOENT) -+ break; -+ -+ have_target = !ret; -+ ret = 0; -+ -+ if (fsck_err_on(!have_target, c, -+ "dirent points to missing inode:\n%s", -+ (bch2_bkey_val_to_text(&PBUF(buf), c, -+ k), buf))) { -+ ret = remove_dirent(&trans, d); -+ if (ret) -+ goto err; -+ continue; -+ } -+ -+ if (fsck_err_on(have_target && -+ d.v->d_type != -+ mode_to_type(target.bi_mode), c, -+ "incorrect d_type: should be %u:\n%s", -+ mode_to_type(target.bi_mode), -+ (bch2_bkey_val_to_text(&PBUF(buf), c, -+ k), buf))) { -+ struct bkey_i_dirent *n; -+ -+ n = kmalloc(bkey_bytes(d.k), GFP_KERNEL); -+ if (!n) { -+ ret 
= -ENOMEM; -+ goto err; -+ } -+ -+ bkey_reassemble(&n->k_i, d.s_c); -+ n->v.d_type = mode_to_type(target.bi_mode); -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW, -+ (bch2_trans_update(&trans, iter, &n->k_i, 0), 0)); -+ kfree(n); -+ if (ret) -+ goto err; -+ -+ } -+ } -+ -+ hash_stop_chain(&trans, &h); -+err: -+fsck_err: -+ if (ret == -EINTR) -+ goto retry; -+ -+ return bch2_trans_exit(&trans) ?: ret; -+} -+ -+/* -+ * Walk xattrs: verify that they all have a corresponding inode -+ */ -+noinline_for_stack -+static int check_xattrs(struct bch_fs *c) -+{ -+ struct inode_walker w = inode_walker_init(); -+ struct hash_check h; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ bch_verbose(c, "checking xattrs"); -+ -+ hash_check_init(&h); -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, -+ POS(BCACHEFS_ROOT_INO, 0), 0); -+retry: -+ for_each_btree_key_continue(iter, 0, k, ret) { -+ ret = walk_inode(&trans, &w, k.k->p.inode); -+ if (ret) -+ break; -+ -+ if (fsck_err_on(!w.have_inode, c, -+ "xattr for missing inode %llu", -+ k.k->p.inode)) { -+ ret = bch2_btree_delete_at(&trans, iter, 0); -+ if (ret) -+ goto err; -+ continue; -+ } -+ -+ if (w.first_this_inode && w.have_inode) -+ hash_check_set_inode(&trans, &h, &w.inode); -+ -+ ret = hash_check_key(&trans, bch2_xattr_hash_desc, -+ &h, iter, k); -+ if (ret) -+ goto fsck_err; -+ } -+err: -+fsck_err: -+ if (ret == -EINTR) -+ goto retry; -+ return bch2_trans_exit(&trans) ?: ret; -+} -+ -+/* Get root directory, create if it doesn't exist: */ -+static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode) -+{ -+ struct bkey_inode_buf packed; -+ int ret; -+ -+ bch_verbose(c, "checking root directory"); -+ -+ ret = bch2_inode_find_by_inum(c, BCACHEFS_ROOT_INO, root_inode); -+ if (ret && ret != -ENOENT) -+ return ret; -+ -+ if (fsck_err_on(ret, c, "root 
directory missing")) -+ goto create_root; -+ -+ if (fsck_err_on(!S_ISDIR(root_inode->bi_mode), c, -+ "root inode not a directory")) -+ goto create_root; -+ -+ return 0; -+fsck_err: -+ return ret; -+create_root: -+ bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|0755, -+ 0, NULL); -+ root_inode->bi_inum = BCACHEFS_ROOT_INO; -+ -+ bch2_inode_pack(&packed, root_inode); -+ -+ return bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, -+ NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW); -+} -+ -+/* Get lost+found, create if it doesn't exist: */ -+static int check_lostfound(struct bch_fs *c, -+ struct bch_inode_unpacked *root_inode, -+ struct bch_inode_unpacked *lostfound_inode) -+{ -+ struct qstr lostfound = QSTR("lost+found"); -+ struct bch_hash_info root_hash_info = -+ bch2_hash_info_init(c, root_inode); -+ u64 inum; -+ int ret; -+ -+ bch_verbose(c, "checking lost+found"); -+ -+ inum = bch2_dirent_lookup(c, BCACHEFS_ROOT_INO, &root_hash_info, -+ &lostfound); -+ if (!inum) { -+ bch_notice(c, "creating lost+found"); -+ goto create_lostfound; -+ } -+ -+ ret = bch2_inode_find_by_inum(c, inum, lostfound_inode); -+ if (ret && ret != -ENOENT) -+ return ret; -+ -+ if (fsck_err_on(ret, c, "lost+found missing")) -+ goto create_lostfound; -+ -+ if (fsck_err_on(!S_ISDIR(lostfound_inode->bi_mode), c, -+ "lost+found inode not a directory")) -+ goto create_lostfound; -+ -+ return 0; -+fsck_err: -+ return ret; -+create_lostfound: -+ bch2_inode_init_early(c, lostfound_inode); -+ -+ ret = bch2_trans_do(c, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW, -+ bch2_create_trans(&trans, -+ BCACHEFS_ROOT_INO, root_inode, -+ lostfound_inode, &lostfound, -+ 0, 0, S_IFDIR|0700, 0, NULL, NULL)); -+ if (ret) -+ bch_err(c, "error creating lost+found: %i", ret); -+ -+ return ret; -+} -+ -+struct inode_bitmap { -+ unsigned long *bits; -+ size_t size; -+}; -+ -+static inline bool inode_bitmap_test(struct inode_bitmap *b, size_t nr) -+{ -+ return nr < b->size ? 
test_bit(nr, b->bits) : false; -+} -+ -+static inline int inode_bitmap_set(struct inode_bitmap *b, size_t nr) -+{ -+ if (nr >= b->size) { -+ size_t new_size = max_t(size_t, max_t(size_t, -+ PAGE_SIZE * 8, -+ b->size * 2), -+ nr + 1); -+ void *n; -+ -+ new_size = roundup_pow_of_two(new_size); -+ n = krealloc(b->bits, new_size / 8, GFP_KERNEL|__GFP_ZERO); -+ if (!n) { -+ return -ENOMEM; -+ } -+ -+ b->bits = n; -+ b->size = new_size; -+ } -+ -+ __set_bit(nr, b->bits); -+ return 0; -+} -+ -+struct pathbuf { -+ size_t nr; -+ size_t size; -+ -+ struct pathbuf_entry { -+ u64 inum; -+ u64 offset; -+ } *entries; -+}; -+ -+static int path_down(struct pathbuf *p, u64 inum) -+{ -+ if (p->nr == p->size) { -+ size_t new_size = max_t(size_t, 256UL, p->size * 2); -+ void *n = krealloc(p->entries, -+ new_size * sizeof(p->entries[0]), -+ GFP_KERNEL); -+ if (!n) -+ return -ENOMEM; -+ -+ p->entries = n; -+ p->size = new_size; -+ }; -+ -+ p->entries[p->nr++] = (struct pathbuf_entry) { -+ .inum = inum, -+ .offset = 0, -+ }; -+ return 0; -+} -+ -+noinline_for_stack -+static int check_directory_structure(struct bch_fs *c, -+ struct bch_inode_unpacked *lostfound_inode) -+{ -+ struct inode_bitmap dirs_done = { NULL, 0 }; -+ struct pathbuf path = { 0, 0, NULL }; -+ struct pathbuf_entry *e; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct bkey_s_c_dirent dirent; -+ bool had_unreachable; -+ u64 d_inum; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ bch_verbose(c, "checking directory structure"); -+ -+ /* DFS: */ -+restart_dfs: -+ had_unreachable = false; -+ -+ ret = inode_bitmap_set(&dirs_done, BCACHEFS_ROOT_INO); -+ if (ret) { -+ bch_err(c, "memory allocation failure in inode_bitmap_set()"); -+ goto err; -+ } -+ -+ ret = path_down(&path, BCACHEFS_ROOT_INO); -+ if (ret) -+ goto err; -+ -+ while (path.nr) { -+next: -+ e = &path.entries[path.nr - 1]; -+ -+ if (e->offset == U64_MAX) -+ goto up; -+ -+ for_each_btree_key(&trans, 
iter, BTREE_ID_DIRENTS, -+ POS(e->inum, e->offset + 1), 0, k, ret) { -+ if (k.k->p.inode != e->inum) -+ break; -+ -+ e->offset = k.k->p.offset; -+ -+ if (k.k->type != KEY_TYPE_dirent) -+ continue; -+ -+ dirent = bkey_s_c_to_dirent(k); -+ -+ if (dirent.v->d_type != DT_DIR) -+ continue; -+ -+ d_inum = le64_to_cpu(dirent.v->d_inum); -+ -+ if (fsck_err_on(inode_bitmap_test(&dirs_done, d_inum), c, -+ "directory %llu has multiple hardlinks", -+ d_inum)) { -+ ret = remove_dirent(&trans, dirent); -+ if (ret) -+ goto err; -+ continue; -+ } -+ -+ ret = inode_bitmap_set(&dirs_done, d_inum); -+ if (ret) { -+ bch_err(c, "memory allocation failure in inode_bitmap_set()"); -+ goto err; -+ } -+ -+ ret = path_down(&path, d_inum); -+ if (ret) { -+ goto err; -+ } -+ -+ ret = bch2_trans_iter_free(&trans, iter); -+ if (ret) { -+ bch_err(c, "btree error %i in fsck", ret); -+ goto err; -+ } -+ goto next; -+ } -+ ret = bch2_trans_iter_free(&trans, iter) ?: ret; -+ if (ret) { -+ bch_err(c, "btree error %i in fsck", ret); -+ goto err; -+ } -+up: -+ path.nr--; -+ } -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS_MIN, 0); -+retry: -+ for_each_btree_key_continue(iter, 0, k, ret) { -+ if (k.k->type != KEY_TYPE_inode) -+ continue; -+ -+ if (!S_ISDIR(le16_to_cpu(bkey_s_c_to_inode(k).v->bi_mode))) -+ continue; -+ -+ ret = bch2_empty_dir_trans(&trans, k.k->p.inode); -+ if (ret == -EINTR) -+ goto retry; -+ if (!ret) -+ continue; -+ -+ if (fsck_err_on(!inode_bitmap_test(&dirs_done, k.k->p.offset), c, -+ "unreachable directory found (inum %llu)", -+ k.k->p.offset)) { -+ bch2_trans_unlock(&trans); -+ -+ ret = reattach_inode(c, lostfound_inode, k.k->p.offset); -+ if (ret) { -+ goto err; -+ } -+ -+ had_unreachable = true; -+ } -+ } -+ bch2_trans_iter_free(&trans, iter); -+ if (ret) -+ goto err; -+ -+ if (had_unreachable) { -+ bch_info(c, "reattached unreachable directories, restarting pass to check for loops"); -+ kfree(dirs_done.bits); -+ kfree(path.entries); -+ memset(&dirs_done, 0, 
sizeof(dirs_done)); -+ memset(&path, 0, sizeof(path)); -+ goto restart_dfs; -+ } -+err: -+fsck_err: -+ ret = bch2_trans_exit(&trans) ?: ret; -+ kfree(dirs_done.bits); -+ kfree(path.entries); -+ return ret; -+} -+ -+struct nlink { -+ u32 count; -+ u32 dir_count; -+}; -+ -+typedef GENRADIX(struct nlink) nlink_table; -+ -+static void inc_link(struct bch_fs *c, nlink_table *links, -+ u64 range_start, u64 *range_end, -+ u64 inum, bool dir) -+{ -+ struct nlink *link; -+ -+ if (inum < range_start || inum >= *range_end) -+ return; -+ -+ link = genradix_ptr_alloc(links, inum - range_start, GFP_KERNEL); -+ if (!link) { -+ bch_verbose(c, "allocation failed during fsck - will need another pass"); -+ *range_end = inum; -+ return; -+ } -+ -+ if (dir) -+ link->dir_count++; -+ else -+ link->count++; -+} -+ -+noinline_for_stack -+static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, -+ u64 range_start, u64 *range_end) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct bkey_s_c_dirent d; -+ u64 d_inum; -+ int ret; -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ inc_link(c, links, range_start, range_end, BCACHEFS_ROOT_INO, false); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, 0, k, ret) { -+ switch (k.k->type) { -+ case KEY_TYPE_dirent: -+ d = bkey_s_c_to_dirent(k); -+ d_inum = le64_to_cpu(d.v->d_inum); -+ -+ if (d.v->d_type == DT_DIR) -+ inc_link(c, links, range_start, range_end, -+ d.k->p.inode, true); -+ -+ inc_link(c, links, range_start, range_end, -+ d_inum, false); -+ -+ break; -+ } -+ -+ bch2_trans_cond_resched(&trans); -+ } -+ ret = bch2_trans_exit(&trans) ?: ret; -+ if (ret) -+ bch_err(c, "error in fsck: btree error %i while walking dirents", ret); -+ -+ return ret; -+} -+ -+static int check_inode_nlink(struct bch_fs *c, -+ struct bch_inode_unpacked *lostfound_inode, -+ struct bch_inode_unpacked *u, -+ struct nlink *link, -+ bool *do_update) -+{ -+ u32 i_nlink = bch2_inode_nlink_get(u); 
-+ u32 real_i_nlink = -+ link->count * nlink_bias(u->bi_mode) + -+ link->dir_count; -+ int ret = 0; -+ -+ /* -+ * These should have been caught/fixed by earlier passes, we don't -+ * repair them here: -+ */ -+ if (S_ISDIR(u->bi_mode) && link->count > 1) { -+ need_fsck_err(c, "directory %llu with multiple hardlinks: %u", -+ u->bi_inum, link->count); -+ return 0; -+ } -+ -+ if (S_ISDIR(u->bi_mode) && !link->count) { -+ need_fsck_err(c, "unreachable directory found (inum %llu)", -+ u->bi_inum); -+ return 0; -+ } -+ -+ if (!S_ISDIR(u->bi_mode) && link->dir_count) { -+ need_fsck_err(c, "non directory with subdirectories (inum %llu)", -+ u->bi_inum); -+ return 0; -+ } -+ -+ if (!link->count && -+ !(u->bi_flags & BCH_INODE_UNLINKED) && -+ (c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { -+ if (fsck_err(c, "unreachable inode %llu not marked as unlinked (type %u)", -+ u->bi_inum, mode_to_type(u->bi_mode)) == -+ FSCK_ERR_IGNORE) -+ return 0; -+ -+ ret = reattach_inode(c, lostfound_inode, u->bi_inum); -+ if (ret) -+ return ret; -+ -+ link->count = 1; -+ real_i_nlink = nlink_bias(u->bi_mode) + link->dir_count; -+ goto set_i_nlink; -+ } -+ -+ if (i_nlink < link->count) { -+ if (fsck_err(c, "inode %llu i_link too small (%u < %u, type %i)", -+ u->bi_inum, i_nlink, link->count, -+ mode_to_type(u->bi_mode)) == FSCK_ERR_IGNORE) -+ return 0; -+ goto set_i_nlink; -+ } -+ -+ if (i_nlink != real_i_nlink && -+ c->sb.clean) { -+ if (fsck_err(c, "filesystem marked clean, " -+ "but inode %llu has wrong i_nlink " -+ "(type %u i_nlink %u, should be %u)", -+ u->bi_inum, mode_to_type(u->bi_mode), -+ i_nlink, real_i_nlink) == FSCK_ERR_IGNORE) -+ return 0; -+ goto set_i_nlink; -+ } -+ -+ if (i_nlink != real_i_nlink && -+ (c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { -+ if (fsck_err(c, "inode %llu has wrong i_nlink " -+ "(type %u i_nlink %u, should be %u)", -+ u->bi_inum, mode_to_type(u->bi_mode), -+ i_nlink, real_i_nlink) == FSCK_ERR_IGNORE) -+ return 0; -+ goto set_i_nlink; -+ } -+ 
-+ if (real_i_nlink && i_nlink != real_i_nlink) -+ bch_verbose(c, "setting inode %llu nlink from %u to %u", -+ u->bi_inum, i_nlink, real_i_nlink); -+set_i_nlink: -+ if (i_nlink != real_i_nlink) { -+ bch2_inode_nlink_set(u, real_i_nlink); -+ *do_update = true; -+ } -+fsck_err: -+ return ret; -+} -+ -+static int check_inode(struct btree_trans *trans, -+ struct bch_inode_unpacked *lostfound_inode, -+ struct btree_iter *iter, -+ struct bkey_s_c_inode inode, -+ struct nlink *link) -+{ -+ struct bch_fs *c = trans->c; -+ struct bch_inode_unpacked u; -+ bool do_update = false; -+ int ret = 0; -+ -+ ret = bch2_inode_unpack(inode, &u); -+ -+ bch2_trans_unlock(trans); -+ -+ if (bch2_fs_inconsistent_on(ret, c, -+ "error unpacking inode %llu in fsck", -+ inode.k->p.inode)) -+ return ret; -+ -+ if (link) { -+ ret = check_inode_nlink(c, lostfound_inode, &u, link, -+ &do_update); -+ if (ret) -+ return ret; -+ } -+ -+ if (u.bi_flags & BCH_INODE_UNLINKED && -+ (!c->sb.clean || -+ fsck_err(c, "filesystem marked clean, but inode %llu unlinked", -+ u.bi_inum))) { -+ bch_verbose(c, "deleting inode %llu", u.bi_inum); -+ -+ ret = bch2_inode_rm(c, u.bi_inum); -+ if (ret) -+ bch_err(c, "error in fsck: error %i while deleting inode", ret); -+ return ret; -+ } -+ -+ if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY && -+ (!c->sb.clean || -+ fsck_err(c, "filesystem marked clean, but inode %llu has i_size dirty", -+ u.bi_inum))) { -+ bch_verbose(c, "truncating inode %llu", u.bi_inum); -+ -+ /* -+ * XXX: need to truncate partial blocks too here - or ideally -+ * just switch units to bytes and that issue goes away -+ */ -+ -+ ret = bch2_inode_truncate(c, u.bi_inum, u.bi_size); -+ if (ret) { -+ bch_err(c, "error in fsck: error %i truncating inode", ret); -+ return ret; -+ } -+ -+ /* -+ * We truncated without our normal sector accounting hook, just -+ * make sure we recalculate it: -+ */ -+ u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY; -+ -+ u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; -+ do_update = true; -+ } -+ -+ 
if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY && -+ (!c->sb.clean || -+ fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty", -+ u.bi_inum))) { -+ s64 sectors; -+ -+ bch_verbose(c, "recounting sectors for inode %llu", -+ u.bi_inum); -+ -+ sectors = bch2_count_inode_sectors(trans, u.bi_inum); -+ if (sectors < 0) { -+ bch_err(c, "error in fsck: error %i recounting inode sectors", -+ (int) sectors); -+ return sectors; -+ } -+ -+ u.bi_sectors = sectors; -+ u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY; -+ do_update = true; -+ } -+ -+ if (do_update) { -+ struct bkey_inode_buf p; -+ -+ bch2_inode_pack(&p, &u); -+ -+ ret = __bch2_trans_do(trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW, -+ (bch2_trans_update(trans, iter, &p.inode.k_i, 0), 0)); -+ if (ret) -+ bch_err(c, "error in fsck: error %i " -+ "updating inode", ret); -+ } -+fsck_err: -+ return ret; -+} -+ -+noinline_for_stack -+static int bch2_gc_walk_inodes(struct bch_fs *c, -+ struct bch_inode_unpacked *lostfound_inode, -+ nlink_table *links, -+ u64 range_start, u64 range_end) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct nlink *link, zero_links = { 0, 0 }; -+ struct genradix_iter nlinks_iter; -+ int ret = 0, ret2 = 0; -+ u64 nlinks_pos; -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, -+ POS(0, range_start), 0); -+ nlinks_iter = genradix_iter_init(links, 0); -+ -+ while ((k = bch2_btree_iter_peek(iter)).k && -+ !(ret2 = bkey_err(k))) { -+peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); -+ -+ if (!link && (!k.k || iter->pos.offset >= range_end)) -+ break; -+ -+ nlinks_pos = range_start + nlinks_iter.pos; -+ if (iter->pos.offset > nlinks_pos) { -+ /* Should have been caught by dirents pass: */ -+ need_fsck_err_on(link && link->count, c, -+ "missing inode %llu (nlink %u)", -+ nlinks_pos, link->count); -+ genradix_iter_advance(&nlinks_iter, links); -+ goto 
peek_nlinks; -+ } -+ -+ if (iter->pos.offset < nlinks_pos || !link) -+ link = &zero_links; -+ -+ if (k.k && k.k->type == KEY_TYPE_inode) { -+ ret = check_inode(&trans, lostfound_inode, iter, -+ bkey_s_c_to_inode(k), link); -+ BUG_ON(ret == -EINTR); -+ if (ret) -+ break; -+ } else { -+ /* Should have been caught by dirents pass: */ -+ need_fsck_err_on(link->count, c, -+ "missing inode %llu (nlink %u)", -+ nlinks_pos, link->count); -+ } -+ -+ if (nlinks_pos == iter->pos.offset) -+ genradix_iter_advance(&nlinks_iter, links); -+ -+ bch2_btree_iter_next(iter); -+ bch2_trans_cond_resched(&trans); -+ } -+fsck_err: -+ bch2_trans_exit(&trans); -+ -+ if (ret2) -+ bch_err(c, "error in fsck: btree error %i while walking inodes", ret2); -+ -+ return ret ?: ret2; -+} -+ -+noinline_for_stack -+static int check_inode_nlinks(struct bch_fs *c, -+ struct bch_inode_unpacked *lostfound_inode) -+{ -+ nlink_table links; -+ u64 this_iter_range_start, next_iter_range_start = 0; -+ int ret = 0; -+ -+ bch_verbose(c, "checking inode nlinks"); -+ -+ genradix_init(&links); -+ -+ do { -+ this_iter_range_start = next_iter_range_start; -+ next_iter_range_start = U64_MAX; -+ -+ ret = bch2_gc_walk_dirents(c, &links, -+ this_iter_range_start, -+ &next_iter_range_start); -+ if (ret) -+ break; -+ -+ ret = bch2_gc_walk_inodes(c, lostfound_inode, &links, -+ this_iter_range_start, -+ next_iter_range_start); -+ if (ret) -+ break; -+ -+ genradix_free(&links); -+ } while (next_iter_range_start != U64_MAX); -+ -+ genradix_free(&links); -+ -+ return ret; -+} -+ -+/* -+ * Checks for inconsistencies that shouldn't happen, unless we have a bug. 
-+ * Doesn't fix them yet, mainly because they haven't yet been observed: -+ */ -+int bch2_fsck_full(struct bch_fs *c) -+{ -+ struct bch_inode_unpacked root_inode, lostfound_inode; -+ -+ return check_extents(c) ?: -+ check_dirents(c) ?: -+ check_xattrs(c) ?: -+ check_root(c, &root_inode) ?: -+ check_lostfound(c, &root_inode, &lostfound_inode) ?: -+ check_directory_structure(c, &lostfound_inode) ?: -+ check_inode_nlinks(c, &lostfound_inode); -+} -+ -+int bch2_fsck_inode_nlink(struct bch_fs *c) -+{ -+ struct bch_inode_unpacked root_inode, lostfound_inode; -+ -+ return check_root(c, &root_inode) ?: -+ check_lostfound(c, &root_inode, &lostfound_inode) ?: -+ check_inode_nlinks(c, &lostfound_inode); -+} -+ -+int bch2_fsck_walk_inodes_only(struct bch_fs *c) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct bkey_s_c_inode inode; -+ int ret; -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN, 0, k, ret) { -+ if (k.k->type != KEY_TYPE_inode) -+ continue; -+ -+ inode = bkey_s_c_to_inode(k); -+ -+ if (inode.v->bi_flags & -+ (BCH_INODE_I_SIZE_DIRTY| -+ BCH_INODE_I_SECTORS_DIRTY| -+ BCH_INODE_UNLINKED)) { -+ ret = check_inode(&trans, NULL, iter, inode, NULL); -+ BUG_ON(ret == -EINTR); -+ if (ret) -+ break; -+ } -+ } -+ BUG_ON(ret == -EINTR); -+ -+ return bch2_trans_exit(&trans) ?: ret; -+} -diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h -new file mode 100644 -index 000000000000..9e4af02bde1e ---- /dev/null -+++ b/fs/bcachefs/fsck.h -@@ -0,0 +1,9 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_FSCK_H -+#define _BCACHEFS_FSCK_H -+ -+int bch2_fsck_full(struct bch_fs *); -+int bch2_fsck_inode_nlink(struct bch_fs *); -+int bch2_fsck_walk_inodes_only(struct bch_fs *); -+ -+#endif /* _BCACHEFS_FSCK_H */ -diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c -new file mode 100644 -index 000000000000..7d20f082ad45 ---- /dev/null -+++ b/fs/bcachefs/inode.c 
-@@ -0,0 +1,554 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "bkey_methods.h" -+#include "btree_update.h" -+#include "error.h" -+#include "extents.h" -+#include "inode.h" -+#include "str_hash.h" -+ -+#include -+ -+#include -+ -+const char * const bch2_inode_opts[] = { -+#define x(name, ...) #name, -+ BCH_INODE_OPTS() -+#undef x -+ NULL, -+}; -+ -+static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; -+static const u8 bits_table[8] = { -+ 1 * 8 - 1, -+ 2 * 8 - 2, -+ 3 * 8 - 3, -+ 4 * 8 - 4, -+ 6 * 8 - 5, -+ 8 * 8 - 6, -+ 10 * 8 - 7, -+ 13 * 8 - 8, -+}; -+ -+static int inode_encode_field(u8 *out, u8 *end, u64 hi, u64 lo) -+{ -+ __be64 in[2] = { cpu_to_be64(hi), cpu_to_be64(lo), }; -+ unsigned shift, bytes, bits = likely(!hi) -+ ? fls64(lo) -+ : fls64(hi) + 64; -+ -+ for (shift = 1; shift <= 8; shift++) -+ if (bits < bits_table[shift - 1]) -+ goto got_shift; -+ -+ BUG(); -+got_shift: -+ bytes = byte_table[shift - 1]; -+ -+ BUG_ON(out + bytes > end); -+ -+ memcpy(out, (u8 *) in + 16 - bytes, bytes); -+ *out |= (1 << 8) >> shift; -+ -+ return bytes; -+} -+ -+static int inode_decode_field(const u8 *in, const u8 *end, -+ u64 out[2], unsigned *out_bits) -+{ -+ __be64 be[2] = { 0, 0 }; -+ unsigned bytes, shift; -+ u8 *p; -+ -+ if (in >= end) -+ return -1; -+ -+ if (!*in) -+ return -1; -+ -+ /* -+ * position of highest set bit indicates number of bytes: -+ * shift = number of bits to remove in high byte: -+ */ -+ shift = 8 - __fls(*in); /* 1 <= shift <= 8 */ -+ bytes = byte_table[shift - 1]; -+ -+ if (in + bytes > end) -+ return -1; -+ -+ p = (u8 *) be + 16 - bytes; -+ memcpy(p, in, bytes); -+ *p ^= (1 << 8) >> shift; -+ -+ out[0] = be64_to_cpu(be[0]); -+ out[1] = be64_to_cpu(be[1]); -+ *out_bits = out[0] ? 
64 + fls64(out[0]) : fls64(out[1]); -+ -+ return bytes; -+} -+ -+void bch2_inode_pack(struct bkey_inode_buf *packed, -+ const struct bch_inode_unpacked *inode) -+{ -+ u8 *out = packed->inode.v.fields; -+ u8 *end = (void *) &packed[1]; -+ u8 *last_nonzero_field = out; -+ unsigned nr_fields = 0, last_nonzero_fieldnr = 0; -+ unsigned bytes; -+ -+ bkey_inode_init(&packed->inode.k_i); -+ packed->inode.k.p.offset = inode->bi_inum; -+ packed->inode.v.bi_hash_seed = inode->bi_hash_seed; -+ packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags); -+ packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode); -+ -+#define x(_name, _bits) \ -+ out += inode_encode_field(out, end, 0, inode->_name); \ -+ nr_fields++; \ -+ \ -+ if (inode->_name) { \ -+ last_nonzero_field = out; \ -+ last_nonzero_fieldnr = nr_fields; \ -+ } -+ -+ BCH_INODE_FIELDS() -+#undef x -+ -+ out = last_nonzero_field; -+ nr_fields = last_nonzero_fieldnr; -+ -+ bytes = out - (u8 *) &packed->inode.v; -+ set_bkey_val_bytes(&packed->inode.k, bytes); -+ memset_u64s_tail(&packed->inode.v, 0, bytes); -+ -+ SET_INODE_NR_FIELDS(&packed->inode.v, nr_fields); -+ -+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { -+ struct bch_inode_unpacked unpacked; -+ -+ int ret = bch2_inode_unpack(inode_i_to_s_c(&packed->inode), -+ &unpacked); -+ BUG_ON(ret); -+ BUG_ON(unpacked.bi_inum != inode->bi_inum); -+ BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed); -+ BUG_ON(unpacked.bi_mode != inode->bi_mode); -+ -+#define x(_name, _bits) BUG_ON(unpacked._name != inode->_name); -+ BCH_INODE_FIELDS() -+#undef x -+ } -+} -+ -+int bch2_inode_unpack(struct bkey_s_c_inode inode, -+ struct bch_inode_unpacked *unpacked) -+{ -+ const u8 *in = inode.v->fields; -+ const u8 *end = (void *) inode.v + bkey_val_bytes(inode.k); -+ u64 field[2]; -+ unsigned fieldnr = 0, field_bits; -+ int ret; -+ -+ unpacked->bi_inum = inode.k->p.offset; -+ unpacked->bi_hash_seed = inode.v->bi_hash_seed; -+ unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); -+ unpacked->bi_mode 
= le16_to_cpu(inode.v->bi_mode); -+ -+#define x(_name, _bits) \ -+ if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \ -+ memset(&unpacked->_name, 0, \ -+ sizeof(*unpacked) - \ -+ offsetof(struct bch_inode_unpacked, _name)); \ -+ return 0; \ -+ } \ -+ \ -+ ret = inode_decode_field(in, end, field, &field_bits); \ -+ if (ret < 0) \ -+ return ret; \ -+ \ -+ if (field_bits > sizeof(unpacked->_name) * 8) \ -+ return -1; \ -+ \ -+ unpacked->_name = field[1]; \ -+ in += ret; -+ -+ BCH_INODE_FIELDS() -+#undef x -+ -+ /* XXX: signal if there were more fields than expected? */ -+ -+ return 0; -+} -+ -+struct btree_iter *bch2_inode_peek(struct btree_trans *trans, -+ struct bch_inode_unpacked *inode, -+ u64 inum, unsigned flags) -+{ -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(0, inum), -+ BTREE_ITER_SLOTS|flags); -+ if (IS_ERR(iter)) -+ return iter; -+ -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ ret = k.k->type == KEY_TYPE_inode ? 
0 : -EIO; -+ if (ret) -+ goto err; -+ -+ ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode); -+ if (ret) -+ goto err; -+ -+ return iter; -+err: -+ bch2_trans_iter_put(trans, iter); -+ return ERR_PTR(ret); -+} -+ -+int bch2_inode_write(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bch_inode_unpacked *inode) -+{ -+ struct bkey_inode_buf *inode_p; -+ -+ inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); -+ if (IS_ERR(inode_p)) -+ return PTR_ERR(inode_p); -+ -+ bch2_inode_pack(inode_p, inode); -+ bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); -+ return 0; -+} -+ -+const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); -+ struct bch_inode_unpacked unpacked; -+ -+ if (k.k->p.inode) -+ return "nonzero k.p.inode"; -+ -+ if (bkey_val_bytes(k.k) < sizeof(struct bch_inode)) -+ return "incorrect value size"; -+ -+ if (k.k->p.offset < BLOCKDEV_INODE_MAX) -+ return "fs inode in blockdev range"; -+ -+ if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) -+ return "invalid str hash type"; -+ -+ if (bch2_inode_unpack(inode, &unpacked)) -+ return "invalid variable length fields"; -+ -+ if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) -+ return "invalid data checksum type"; -+ -+ if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) -+ return "invalid data checksum type"; -+ -+ if ((unpacked.bi_flags & BCH_INODE_UNLINKED) && -+ unpacked.bi_nlink != 0) -+ return "flagged as unlinked but bi_nlink != 0"; -+ -+ return NULL; -+} -+ -+void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); -+ struct bch_inode_unpacked unpacked; -+ -+ if (bch2_inode_unpack(inode, &unpacked)) { -+ pr_buf(out, "(unpack error)"); -+ return; -+ } -+ -+#define x(_name, _bits) \ -+ pr_buf(out, #_name ": %llu ", (u64) unpacked._name); -+ BCH_INODE_FIELDS() -+#undef x -+} -+ -+const char 
*bch2_inode_generation_invalid(const struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ if (k.k->p.inode) -+ return "nonzero k.p.inode"; -+ -+ if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation)) -+ return "incorrect value size"; -+ -+ return NULL; -+} -+ -+void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_inode_generation gen = bkey_s_c_to_inode_generation(k); -+ -+ pr_buf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation)); -+} -+ -+void bch2_inode_init_early(struct bch_fs *c, -+ struct bch_inode_unpacked *inode_u) -+{ -+ enum bch_str_hash_type str_hash = -+ bch2_str_hash_opt_to_type(c, c->opts.str_hash); -+ -+ memset(inode_u, 0, sizeof(*inode_u)); -+ -+ /* ick */ -+ inode_u->bi_flags |= str_hash << INODE_STR_HASH_OFFSET; -+ get_random_bytes(&inode_u->bi_hash_seed, -+ sizeof(inode_u->bi_hash_seed)); -+} -+ -+void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now, -+ uid_t uid, gid_t gid, umode_t mode, dev_t rdev, -+ struct bch_inode_unpacked *parent) -+{ -+ inode_u->bi_mode = mode; -+ inode_u->bi_uid = uid; -+ inode_u->bi_gid = gid; -+ inode_u->bi_dev = rdev; -+ inode_u->bi_atime = now; -+ inode_u->bi_mtime = now; -+ inode_u->bi_ctime = now; -+ inode_u->bi_otime = now; -+ -+ if (parent && parent->bi_mode & S_ISGID) { -+ inode_u->bi_gid = parent->bi_gid; -+ if (S_ISDIR(mode)) -+ inode_u->bi_mode |= S_ISGID; -+ } -+ -+ if (parent) { -+#define x(_name, ...) 
inode_u->bi_##_name = parent->bi_##_name; -+ BCH_INODE_OPTS() -+#undef x -+ } -+} -+ -+void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, -+ uid_t uid, gid_t gid, umode_t mode, dev_t rdev, -+ struct bch_inode_unpacked *parent) -+{ -+ bch2_inode_init_early(c, inode_u); -+ bch2_inode_init_late(inode_u, bch2_current_time(c), -+ uid, gid, mode, rdev, parent); -+} -+ -+static inline u32 bkey_generation(struct bkey_s_c k) -+{ -+ switch (k.k->type) { -+ case KEY_TYPE_inode: -+ BUG(); -+ case KEY_TYPE_inode_generation: -+ return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation); -+ default: -+ return 0; -+ } -+} -+ -+int bch2_inode_create(struct btree_trans *trans, -+ struct bch_inode_unpacked *inode_u, -+ u64 min, u64 max, u64 *hint) -+{ -+ struct bkey_inode_buf *inode_p; -+ struct btree_iter *iter = NULL; -+ struct bkey_s_c k; -+ u64 start; -+ int ret; -+ -+ if (!max) -+ max = ULLONG_MAX; -+ -+ if (trans->c->opts.inodes_32bit) -+ max = min_t(u64, max, U32_MAX); -+ -+ start = READ_ONCE(*hint); -+ -+ if (start >= max || start < min) -+ start = min; -+ -+ inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); -+ if (IS_ERR(inode_p)) -+ return PTR_ERR(inode_p); -+again: -+ for_each_btree_key(trans, iter, BTREE_ID_INODES, POS(0, start), -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { -+ if (bkey_cmp(iter->pos, POS(0, max)) > 0) -+ break; -+ -+ if (k.k->type != KEY_TYPE_inode) -+ goto found_slot; -+ } -+ -+ bch2_trans_iter_put(trans, iter); -+ -+ if (ret) -+ return ret; -+ -+ if (start != min) { -+ /* Retry from start */ -+ start = min; -+ goto again; -+ } -+ -+ return -ENOSPC; -+found_slot: -+ *hint = k.k->p.offset; -+ inode_u->bi_inum = k.k->p.offset; -+ inode_u->bi_generation = bkey_generation(k); -+ -+ bch2_inode_pack(inode_p, inode_u); -+ bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); -+ bch2_trans_iter_put(trans, iter); -+ return 0; -+} -+ -+int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) -+{ -+ struct btree_trans trans; -+ 
struct btree_iter *iter; -+ struct bkey_i_inode_generation delete; -+ struct bpos start = POS(inode_nr, 0); -+ struct bpos end = POS(inode_nr + 1, 0); -+ int ret; -+ -+ /* -+ * If this was a directory, there shouldn't be any real dirents left - -+ * but there could be whiteouts (from hash collisions) that we should -+ * delete: -+ * -+ * XXX: the dirent could ideally would delete whiteouts when they're no -+ * longer needed -+ */ -+ ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS, -+ start, end, NULL) ?: -+ bch2_btree_delete_range(c, BTREE_ID_XATTRS, -+ start, end, NULL) ?: -+ bch2_btree_delete_range(c, BTREE_ID_DIRENTS, -+ start, end, NULL); -+ if (ret) -+ return ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr), -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); -+ do { -+ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); -+ u32 bi_generation = 0; -+ -+ ret = bkey_err(k); -+ if (ret) -+ break; -+ -+ bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, c, -+ "inode %llu not found when deleting", -+ inode_nr); -+ -+ switch (k.k->type) { -+ case KEY_TYPE_inode: { -+ struct bch_inode_unpacked inode_u; -+ -+ if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u)) -+ bi_generation = inode_u.bi_generation + 1; -+ break; -+ } -+ case KEY_TYPE_inode_generation: { -+ struct bkey_s_c_inode_generation g = -+ bkey_s_c_to_inode_generation(k); -+ bi_generation = le32_to_cpu(g.v->bi_generation); -+ break; -+ } -+ } -+ -+ if (!bi_generation) { -+ bkey_init(&delete.k); -+ delete.k.p.offset = inode_nr; -+ } else { -+ bkey_inode_generation_init(&delete.k_i); -+ delete.k.p.offset = inode_nr; -+ delete.v.bi_generation = cpu_to_le32(bi_generation); -+ } -+ -+ bch2_trans_update(&trans, iter, &delete.k_i, 0); -+ -+ ret = bch2_trans_commit(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL); -+ } while (ret == -EINTR); -+ -+ bch2_trans_exit(&trans); -+ return ret; -+} -+ -+int bch2_inode_find_by_inum_trans(struct btree_trans *trans, 
u64 inode_nr, -+ struct bch_inode_unpacked *inode) -+{ -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, -+ POS(0, inode_nr), BTREE_ITER_SLOTS); -+ if (IS_ERR(iter)) -+ return PTR_ERR(iter); -+ -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ ret = k.k->type == KEY_TYPE_inode -+ ? bch2_inode_unpack(bkey_s_c_to_inode(k), inode) -+ : -ENOENT; -+err: -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, -+ struct bch_inode_unpacked *inode) -+{ -+ return bch2_trans_do(c, NULL, NULL, 0, -+ bch2_inode_find_by_inum_trans(&trans, inode_nr, inode)); -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_inode_pack_test(void) -+{ -+ struct bch_inode_unpacked *u, test_inodes[] = { -+ { -+ .bi_atime = U64_MAX, -+ .bi_ctime = U64_MAX, -+ .bi_mtime = U64_MAX, -+ .bi_otime = U64_MAX, -+ .bi_size = U64_MAX, -+ .bi_sectors = U64_MAX, -+ .bi_uid = U32_MAX, -+ .bi_gid = U32_MAX, -+ .bi_nlink = U32_MAX, -+ .bi_generation = U32_MAX, -+ .bi_dev = U32_MAX, -+ }, -+ }; -+ -+ for (u = test_inodes; -+ u < test_inodes + ARRAY_SIZE(test_inodes); -+ u++) { -+ struct bkey_inode_buf p; -+ -+ bch2_inode_pack(&p, u); -+ } -+} -+#endif -diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h -new file mode 100644 -index 000000000000..bb759a46dc41 ---- /dev/null -+++ b/fs/bcachefs/inode.h -@@ -0,0 +1,177 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_INODE_H -+#define _BCACHEFS_INODE_H -+ -+#include "opts.h" -+ -+extern const char * const bch2_inode_opts[]; -+ -+const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c); -+void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+ -+#define bch2_bkey_ops_inode (struct bkey_ops) { \ -+ .key_invalid = bch2_inode_invalid, \ -+ .val_to_text = bch2_inode_to_text, \ -+} -+ -+const char *bch2_inode_generation_invalid(const struct 
bch_fs *, -+ struct bkey_s_c); -+void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, -+ struct bkey_s_c); -+ -+#define bch2_bkey_ops_inode_generation (struct bkey_ops) { \ -+ .key_invalid = bch2_inode_generation_invalid, \ -+ .val_to_text = bch2_inode_generation_to_text, \ -+} -+ -+struct bch_inode_unpacked { -+ u64 bi_inum; -+ __le64 bi_hash_seed; -+ u32 bi_flags; -+ u16 bi_mode; -+ -+#define x(_name, _bits) u##_bits _name; -+ BCH_INODE_FIELDS() -+#undef x -+}; -+ -+struct bkey_inode_buf { -+ struct bkey_i_inode inode; -+ -+#define x(_name, _bits) + 8 + _bits / 8 -+ u8 _pad[0 + BCH_INODE_FIELDS()]; -+#undef x -+} __attribute__((packed, aligned(8))); -+ -+void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *); -+int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *); -+ -+struct btree_iter *bch2_inode_peek(struct btree_trans *, -+ struct bch_inode_unpacked *, u64, unsigned); -+int bch2_inode_write(struct btree_trans *, struct btree_iter *, -+ struct bch_inode_unpacked *); -+ -+void bch2_inode_init_early(struct bch_fs *, -+ struct bch_inode_unpacked *); -+void bch2_inode_init_late(struct bch_inode_unpacked *, u64, -+ uid_t, gid_t, umode_t, dev_t, -+ struct bch_inode_unpacked *); -+void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, -+ uid_t, gid_t, umode_t, dev_t, -+ struct bch_inode_unpacked *); -+ -+int bch2_inode_create(struct btree_trans *, -+ struct bch_inode_unpacked *, -+ u64, u64, u64 *); -+ -+int bch2_inode_rm(struct bch_fs *, u64); -+ -+int bch2_inode_find_by_inum_trans(struct btree_trans *, u64, -+ struct bch_inode_unpacked *); -+int bch2_inode_find_by_inum(struct bch_fs *, u64, struct bch_inode_unpacked *); -+ -+static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode) -+{ -+ struct bch_io_opts ret = { 0 }; -+ -+#define x(_name, _bits) \ -+ if (inode->bi_##_name) \ -+ opt_set(ret, _name, inode->bi_##_name - 1); -+ BCH_INODE_OPTS() -+#undef x -+ 
return ret; -+} -+ -+static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode, -+ enum inode_opt_id id, u64 v) -+{ -+ switch (id) { -+#define x(_name, ...) \ -+ case Inode_opt_##_name: \ -+ inode->bi_##_name = v; \ -+ break; -+ BCH_INODE_OPTS() -+#undef x -+ default: -+ BUG(); -+ } -+} -+ -+static inline u64 bch2_inode_opt_get(struct bch_inode_unpacked *inode, -+ enum inode_opt_id id) -+{ -+ switch (id) { -+#define x(_name, ...) \ -+ case Inode_opt_##_name: \ -+ return inode->bi_##_name; -+ BCH_INODE_OPTS() -+#undef x -+ default: -+ BUG(); -+ } -+} -+ -+static inline struct bch_io_opts -+io_opts(struct bch_fs *c, struct bch_inode_unpacked *inode) -+{ -+ struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts); -+ -+ bch2_io_opts_apply(&opts, bch2_inode_opts_get(inode)); -+ return opts; -+} -+ -+static inline u8 mode_to_type(umode_t mode) -+{ -+ return (mode >> 12) & 15; -+} -+ -+/* i_nlink: */ -+ -+static inline unsigned nlink_bias(umode_t mode) -+{ -+ return S_ISDIR(mode) ? 2 : 1; -+} -+ -+static inline void bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) -+{ -+ if (bi->bi_flags & BCH_INODE_UNLINKED) -+ bi->bi_flags &= ~BCH_INODE_UNLINKED; -+ else -+ bi->bi_nlink++; -+} -+ -+static inline void bch2_inode_nlink_dec(struct bch_inode_unpacked *bi) -+{ -+ BUG_ON(bi->bi_flags & BCH_INODE_UNLINKED); -+ if (bi->bi_nlink) -+ bi->bi_nlink--; -+ else -+ bi->bi_flags |= BCH_INODE_UNLINKED; -+} -+ -+static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi) -+{ -+ return bi->bi_flags & BCH_INODE_UNLINKED -+ ? 
0 -+ : bi->bi_nlink + nlink_bias(bi->bi_mode); -+} -+ -+static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, -+ unsigned nlink) -+{ -+ if (nlink) { -+ bi->bi_nlink = nlink - nlink_bias(bi->bi_mode); -+ bi->bi_flags &= ~BCH_INODE_UNLINKED; -+ } else { -+ bi->bi_nlink = 0; -+ bi->bi_flags |= BCH_INODE_UNLINKED; -+ } -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_inode_pack_test(void); -+#else -+static inline void bch2_inode_pack_test(void) {} -+#endif -+ -+#endif /* _BCACHEFS_INODE_H */ -diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c -new file mode 100644 -index 000000000000..8d608c900525 ---- /dev/null -+++ b/fs/bcachefs/io.c -@@ -0,0 +1,2355 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Some low level IO code, and hacks for various block layer limitations -+ * -+ * Copyright 2010, 2011 Kent Overstreet -+ * Copyright 2012 Google, Inc. -+ */ -+ -+#include "bcachefs.h" -+#include "alloc_foreground.h" -+#include "bkey_on_stack.h" -+#include "bset.h" -+#include "btree_update.h" -+#include "buckets.h" -+#include "checksum.h" -+#include "compress.h" -+#include "clock.h" -+#include "debug.h" -+#include "disk_groups.h" -+#include "ec.h" -+#include "error.h" -+#include "extent_update.h" -+#include "inode.h" -+#include "io.h" -+#include "journal.h" -+#include "keylist.h" -+#include "move.h" -+#include "rebalance.h" -+#include "super.h" -+#include "super-io.h" -+ -+#include -+#include -+ -+#include -+ -+static bool bch2_target_congested(struct bch_fs *c, u16 target) -+{ -+ const struct bch_devs_mask *devs; -+ unsigned d, nr = 0, total = 0; -+ u64 now = local_clock(), last; -+ s64 congested; -+ struct bch_dev *ca; -+ -+ if (!target) -+ return false; -+ -+ rcu_read_lock(); -+ devs = bch2_target_to_mask(c, target); -+ for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) { -+ ca = rcu_dereference(c->devs[d]); -+ if (!ca) -+ continue; -+ -+ congested = atomic_read(&ca->congested); -+ last = READ_ONCE(ca->congested_last); -+ if (time_after64(now, last)) -+ 
congested -= (now - last) >> 12; -+ -+ total += max(congested, 0LL); -+ nr++; -+ } -+ rcu_read_unlock(); -+ -+ return bch2_rand_range(nr * CONGESTED_MAX) < total; -+} -+ -+static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency, -+ u64 now, int rw) -+{ -+ u64 latency_capable = -+ ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m; -+ /* ideally we'd be taking into account the device's variance here: */ -+ u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3); -+ s64 latency_over = io_latency - latency_threshold; -+ -+ if (latency_threshold && latency_over > 0) { -+ /* -+ * bump up congested by approximately latency_over * 4 / -+ * latency_threshold - we don't need much accuracy here so don't -+ * bother with the divide: -+ */ -+ if (atomic_read(&ca->congested) < CONGESTED_MAX) -+ atomic_add(latency_over >> -+ max_t(int, ilog2(latency_threshold) - 2, 0), -+ &ca->congested); -+ -+ ca->congested_last = now; -+ } else if (atomic_read(&ca->congested) > 0) { -+ atomic_dec(&ca->congested); -+ } -+} -+ -+void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) -+{ -+ atomic64_t *latency = &ca->cur_latency[rw]; -+ u64 now = local_clock(); -+ u64 io_latency = time_after64(now, submit_time) -+ ? 
now - submit_time -+ : 0; -+ u64 old, new, v = atomic64_read(latency); -+ -+ do { -+ old = v; -+ -+ /* -+ * If the io latency was reasonably close to the current -+ * latency, skip doing the update and atomic operation - most of -+ * the time: -+ */ -+ if (abs((int) (old - io_latency)) < (old >> 1) && -+ now & ~(~0 << 5)) -+ break; -+ -+ new = ewma_add(old, io_latency, 5); -+ } while ((v = atomic64_cmpxchg(latency, old, new)) != old); -+ -+ bch2_congested_acct(ca, io_latency, now, rw); -+ -+ __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now); -+} -+ -+/* Allocate, free from mempool: */ -+ -+void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) -+{ -+ struct bvec_iter_all iter; -+ struct bio_vec *bv; -+ -+ bio_for_each_segment_all(bv, bio, iter) -+ if (bv->bv_page != ZERO_PAGE(0)) -+ mempool_free(bv->bv_page, &c->bio_bounce_pages); -+ bio->bi_vcnt = 0; -+} -+ -+static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool) -+{ -+ struct page *page; -+ -+ if (likely(!*using_mempool)) { -+ page = alloc_page(GFP_NOIO); -+ if (unlikely(!page)) { -+ mutex_lock(&c->bio_bounce_pages_lock); -+ *using_mempool = true; -+ goto pool_alloc; -+ -+ } -+ } else { -+pool_alloc: -+ page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO); -+ } -+ -+ return page; -+} -+ -+void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, -+ size_t size) -+{ -+ bool using_mempool = false; -+ -+ while (size) { -+ struct page *page = __bio_alloc_page_pool(c, &using_mempool); -+ unsigned len = min(PAGE_SIZE, size); -+ -+ BUG_ON(!bio_add_page(bio, page, len, 0)); -+ size -= len; -+ } -+ -+ if (using_mempool) -+ mutex_unlock(&c->bio_bounce_pages_lock); -+} -+ -+/* Extent update path: */ -+ -+static int sum_sector_overwrites(struct btree_trans *trans, -+ struct btree_iter *extent_iter, -+ struct bkey_i *new, -+ bool may_allocate, -+ bool *maybe_extending, -+ s64 *delta) -+{ -+ struct btree_iter *iter; -+ struct bkey_s_c old; -+ int ret = 0; -+ -+ 
*maybe_extending = true; -+ *delta = 0; -+ -+ iter = bch2_trans_copy_iter(trans, extent_iter); -+ if (IS_ERR(iter)) -+ return PTR_ERR(iter); -+ -+ for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) { -+ if (!may_allocate && -+ bch2_bkey_nr_ptrs_fully_allocated(old) < -+ bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new))) { -+ ret = -ENOSPC; -+ break; -+ } -+ -+ *delta += (min(new->k.p.offset, -+ old.k->p.offset) - -+ max(bkey_start_offset(&new->k), -+ bkey_start_offset(old.k))) * -+ (bkey_extent_is_allocation(&new->k) - -+ bkey_extent_is_allocation(old.k)); -+ -+ if (bkey_cmp(old.k->p, new->k.p) >= 0) { -+ /* -+ * Check if there's already data above where we're -+ * going to be writing to - this means we're definitely -+ * not extending the file: -+ * -+ * Note that it's not sufficient to check if there's -+ * data up to the sector offset we're going to be -+ * writing to, because i_size could be up to one block -+ * less: -+ */ -+ if (!bkey_cmp(old.k->p, new->k.p)) -+ old = bch2_btree_iter_next(iter); -+ -+ if (old.k && !bkey_err(old) && -+ old.k->p.inode == extent_iter->pos.inode && -+ bkey_extent_is_data(old.k)) -+ *maybe_extending = false; -+ -+ break; -+ } -+ } -+ -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+int bch2_extent_update(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_i *k, -+ struct disk_reservation *disk_res, -+ u64 *journal_seq, -+ u64 new_i_size, -+ s64 *i_sectors_delta) -+{ -+ /* this must live until after bch2_trans_commit(): */ -+ struct bkey_inode_buf inode_p; -+ bool extending = false; -+ s64 delta = 0; -+ int ret; -+ -+ ret = bch2_extent_trim_atomic(k, iter); -+ if (ret) -+ return ret; -+ -+ ret = sum_sector_overwrites(trans, iter, k, -+ disk_res && disk_res->sectors != 0, -+ &extending, &delta); -+ if (ret) -+ return ret; -+ -+ new_i_size = extending -+ ? 
min(k->k.p.offset << 9, new_i_size) -+ : 0; -+ -+ if (delta || new_i_size) { -+ struct btree_iter *inode_iter; -+ struct bch_inode_unpacked inode_u; -+ -+ inode_iter = bch2_inode_peek(trans, &inode_u, -+ k->k.p.inode, BTREE_ITER_INTENT); -+ if (IS_ERR(inode_iter)) -+ return PTR_ERR(inode_iter); -+ -+ /* -+ * XXX: -+ * writeback can race a bit with truncate, because truncate -+ * first updates the inode then truncates the pagecache. This is -+ * ugly, but lets us preserve the invariant that the in memory -+ * i_size is always >= the on disk i_size. -+ * -+ BUG_ON(new_i_size > inode_u.bi_size && -+ (inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY)); -+ */ -+ BUG_ON(new_i_size > inode_u.bi_size && !extending); -+ -+ if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && -+ new_i_size > inode_u.bi_size) -+ inode_u.bi_size = new_i_size; -+ else -+ new_i_size = 0; -+ -+ inode_u.bi_sectors += delta; -+ -+ if (delta || new_i_size) { -+ bch2_inode_pack(&inode_p, &inode_u); -+ bch2_trans_update(trans, inode_iter, -+ &inode_p.inode.k_i, 0); -+ } -+ -+ bch2_trans_iter_put(trans, inode_iter); -+ } -+ -+ bch2_trans_update(trans, iter, k, 0); -+ -+ ret = bch2_trans_commit(trans, disk_res, journal_seq, -+ BTREE_INSERT_NOCHECK_RW| -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_USE_RESERVE); -+ if (!ret && i_sectors_delta) -+ *i_sectors_delta += delta; -+ -+ return ret; -+} -+ -+int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, -+ struct bpos end, u64 *journal_seq, -+ s64 *i_sectors_delta) -+{ -+ struct bch_fs *c = trans->c; -+ unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); -+ struct bkey_s_c k; -+ int ret = 0, ret2 = 0; -+ -+ while ((k = bch2_btree_iter_peek(iter)).k && -+ bkey_cmp(iter->pos, end) < 0) { -+ struct disk_reservation disk_res = -+ bch2_disk_reservation_init(c, 0); -+ struct bkey_i delete; -+ -+ bch2_trans_begin(trans); -+ -+ ret = bkey_err(k); -+ if (ret) -+ goto btree_err; -+ -+ bkey_init(&delete.k); -+ delete.k.p = iter->pos; -+ -+ /* create the 
biggest key we can */ -+ bch2_key_resize(&delete.k, max_sectors); -+ bch2_cut_back(end, &delete); -+ -+ ret = bch2_extent_update(trans, iter, &delete, -+ &disk_res, journal_seq, -+ 0, i_sectors_delta); -+ bch2_disk_reservation_put(c, &disk_res); -+btree_err: -+ if (ret == -EINTR) { -+ ret2 = ret; -+ ret = 0; -+ } -+ if (ret) -+ break; -+ } -+ -+ if (bkey_cmp(iter->pos, end) > 0) { -+ bch2_btree_iter_set_pos(iter, end); -+ ret = bch2_btree_iter_traverse(iter); -+ } -+ -+ return ret ?: ret2; -+} -+ -+int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end, -+ u64 *journal_seq, s64 *i_sectors_delta) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, -+ POS(inum, start), -+ BTREE_ITER_INTENT); -+ -+ ret = bch2_fpunch_at(&trans, iter, POS(inum, end), -+ journal_seq, i_sectors_delta); -+ bch2_trans_exit(&trans); -+ -+ if (ret == -EINTR) -+ ret = 0; -+ -+ return ret; -+} -+ -+int bch2_write_index_default(struct bch_write_op *op) -+{ -+ struct bch_fs *c = op->c; -+ struct bkey_on_stack sk; -+ struct keylist *keys = &op->insert_keys; -+ struct bkey_i *k = bch2_keylist_front(keys); -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ int ret; -+ -+ bkey_on_stack_init(&sk); -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, -+ bkey_start_pos(&k->k), -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); -+ -+ do { -+ bch2_trans_begin(&trans); -+ -+ k = bch2_keylist_front(keys); -+ -+ bkey_on_stack_realloc(&sk, c, k->k.u64s); -+ bkey_copy(sk.k, k); -+ bch2_cut_front(iter->pos, sk.k); -+ -+ ret = bch2_extent_update(&trans, iter, sk.k, -+ &op->res, op_journal_seq(op), -+ op->new_i_size, &op->i_sectors_delta); -+ if (ret == -EINTR) -+ continue; -+ if (ret) -+ break; -+ -+ if (bkey_cmp(iter->pos, k->k.p) >= 0) -+ bch2_keylist_pop_front(keys); -+ } while (!bch2_keylist_empty(keys)); -+ 
-+ bch2_trans_exit(&trans); -+ bkey_on_stack_exit(&sk, c); -+ -+ return ret; -+} -+ -+/* Writes */ -+ -+void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, -+ enum bch_data_type type, -+ const struct bkey_i *k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); -+ const struct bch_extent_ptr *ptr; -+ struct bch_write_bio *n; -+ struct bch_dev *ca; -+ -+ BUG_ON(c->opts.nochanges); -+ -+ bkey_for_each_ptr(ptrs, ptr) { -+ BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX || -+ !c->devs[ptr->dev]); -+ -+ ca = bch_dev_bkey_exists(c, ptr->dev); -+ -+ if (to_entry(ptr + 1) < ptrs.end) { -+ n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO, -+ &ca->replica_set)); -+ -+ n->bio.bi_end_io = wbio->bio.bi_end_io; -+ n->bio.bi_private = wbio->bio.bi_private; -+ n->parent = wbio; -+ n->split = true; -+ n->bounce = false; -+ n->put_bio = true; -+ n->bio.bi_opf = wbio->bio.bi_opf; -+ bio_inc_remaining(&wbio->bio); -+ } else { -+ n = wbio; -+ n->split = false; -+ } -+ -+ n->c = c; -+ n->dev = ptr->dev; -+ n->have_ioref = bch2_dev_get_ioref(ca, WRITE); -+ n->submit_time = local_clock(); -+ n->bio.bi_iter.bi_sector = ptr->offset; -+ -+ if (!journal_flushes_device(ca)) -+ n->bio.bi_opf |= REQ_FUA; -+ -+ if (likely(n->have_ioref)) { -+ this_cpu_add(ca->io_done->sectors[WRITE][type], -+ bio_sectors(&n->bio)); -+ -+ bio_set_dev(&n->bio, ca->disk_sb.bdev); -+ submit_bio(&n->bio); -+ } else { -+ n->bio.bi_status = BLK_STS_REMOVED; -+ bio_endio(&n->bio); -+ } -+ } -+} -+ -+static void __bch2_write(struct closure *); -+ -+static void bch2_write_done(struct closure *cl) -+{ -+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); -+ struct bch_fs *c = op->c; -+ -+ if (!op->error && (op->flags & BCH_WRITE_FLUSH)) -+ op->error = bch2_journal_error(&c->journal); -+ -+ bch2_disk_reservation_put(c, &op->res); -+ percpu_ref_put(&c->writes); -+ bch2_keylist_free(&op->insert_keys, op->inline_keys); -+ -+ bch2_time_stats_update(&c->times[BCH_TIME_data_write], 
op->start_time); -+ -+ if (!(op->flags & BCH_WRITE_FROM_INTERNAL)) -+ up(&c->io_in_flight); -+ -+ if (op->end_io) { -+ EBUG_ON(cl->parent); -+ closure_debug_destroy(cl); -+ op->end_io(op); -+ } else { -+ closure_return(cl); -+ } -+} -+ -+/** -+ * bch_write_index - after a write, update index to point to new data -+ */ -+static void __bch2_write_index(struct bch_write_op *op) -+{ -+ struct bch_fs *c = op->c; -+ struct keylist *keys = &op->insert_keys; -+ struct bch_extent_ptr *ptr; -+ struct bkey_i *src, *dst = keys->keys, *n, *k; -+ unsigned dev; -+ int ret; -+ -+ for (src = keys->keys; src != keys->top; src = n) { -+ n = bkey_next(src); -+ -+ if (bkey_extent_is_direct_data(&src->k)) { -+ bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr, -+ test_bit(ptr->dev, op->failed.d)); -+ -+ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) { -+ ret = -EIO; -+ goto err; -+ } -+ } -+ -+ if (dst != src) -+ memmove_u64s_down(dst, src, src->u64s); -+ dst = bkey_next(dst); -+ } -+ -+ keys->top = dst; -+ -+ /* -+ * probably not the ideal place to hook this in, but I don't -+ * particularly want to plumb io_opts all the way through the btree -+ * update stack right now -+ */ -+ for_each_keylist_key(keys, k) { -+ bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts); -+ -+ if (bch2_bkey_is_incompressible(bkey_i_to_s_c(k))) -+ bch2_check_set_feature(op->c, BCH_FEATURE_incompressible); -+ -+ } -+ -+ if (!bch2_keylist_empty(keys)) { -+ u64 sectors_start = keylist_sectors(keys); -+ int ret = op->index_update_fn(op); -+ -+ BUG_ON(ret == -EINTR); -+ BUG_ON(keylist_sectors(keys) && !ret); -+ -+ op->written += sectors_start - keylist_sectors(keys); -+ -+ if (ret) { -+ __bcache_io_error(c, "btree IO error %i", ret); -+ op->error = ret; -+ } -+ } -+out: -+ /* If some a bucket wasn't written, we can't erasure code it: */ -+ for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX) -+ bch2_open_bucket_write_error(c, &op->open_buckets, dev); -+ -+ bch2_open_buckets_put(c, &op->open_buckets); -+ return; -+err: 
-+ keys->top = keys->keys; -+ op->error = ret; -+ goto out; -+} -+ -+static void bch2_write_index(struct closure *cl) -+{ -+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); -+ struct bch_fs *c = op->c; -+ -+ __bch2_write_index(op); -+ -+ if (!(op->flags & BCH_WRITE_DONE)) { -+ continue_at(cl, __bch2_write, index_update_wq(op)); -+ } else if (!op->error && (op->flags & BCH_WRITE_FLUSH)) { -+ bch2_journal_flush_seq_async(&c->journal, -+ *op_journal_seq(op), -+ cl); -+ continue_at(cl, bch2_write_done, index_update_wq(op)); -+ } else { -+ continue_at_nobarrier(cl, bch2_write_done, NULL); -+ } -+} -+ -+static void bch2_write_endio(struct bio *bio) -+{ -+ struct closure *cl = bio->bi_private; -+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); -+ struct bch_write_bio *wbio = to_wbio(bio); -+ struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL; -+ struct bch_fs *c = wbio->c; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); -+ -+ if (bch2_dev_io_err_on(bio->bi_status, ca, "data write: %s", -+ blk_status_to_str(bio->bi_status))) -+ set_bit(wbio->dev, op->failed.d); -+ -+ if (wbio->have_ioref) { -+ bch2_latency_acct(ca, wbio->submit_time, WRITE); -+ percpu_ref_put(&ca->io_ref); -+ } -+ -+ if (wbio->bounce) -+ bch2_bio_free_pages_pool(c, bio); -+ -+ if (wbio->put_bio) -+ bio_put(bio); -+ -+ if (parent) -+ bio_endio(&parent->bio); -+ else if (!(op->flags & BCH_WRITE_SKIP_CLOSURE_PUT)) -+ closure_put(cl); -+ else -+ continue_at_nobarrier(cl, bch2_write_index, index_update_wq(op)); -+} -+ -+static void init_append_extent(struct bch_write_op *op, -+ struct write_point *wp, -+ struct bversion version, -+ struct bch_extent_crc_unpacked crc) -+{ -+ struct bch_fs *c = op->c; -+ struct bkey_i_extent *e; -+ struct open_bucket *ob; -+ unsigned i; -+ -+ BUG_ON(crc.compressed_size > wp->sectors_free); -+ wp->sectors_free -= crc.compressed_size; -+ op->pos.offset += crc.uncompressed_size; -+ -+ e = 
bkey_extent_init(op->insert_keys.top); -+ e->k.p = op->pos; -+ e->k.size = crc.uncompressed_size; -+ e->k.version = version; -+ -+ if (crc.csum_type || -+ crc.compression_type || -+ crc.nonce) -+ bch2_extent_crc_append(&e->k_i, crc); -+ -+ open_bucket_for_each(c, &wp->ptrs, ob, i) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); -+ union bch_extent_entry *end = -+ bkey_val_end(bkey_i_to_s(&e->k_i)); -+ -+ end->ptr = ob->ptr; -+ end->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; -+ end->ptr.cached = !ca->mi.durability || -+ (op->flags & BCH_WRITE_CACHED) != 0; -+ end->ptr.offset += ca->mi.bucket_size - ob->sectors_free; -+ -+ e->k.u64s++; -+ -+ BUG_ON(crc.compressed_size > ob->sectors_free); -+ ob->sectors_free -= crc.compressed_size; -+ } -+ -+ bch2_keylist_push(&op->insert_keys); -+} -+ -+static struct bio *bch2_write_bio_alloc(struct bch_fs *c, -+ struct write_point *wp, -+ struct bio *src, -+ bool *page_alloc_failed, -+ void *buf) -+{ -+ struct bch_write_bio *wbio; -+ struct bio *bio; -+ unsigned output_available = -+ min(wp->sectors_free << 9, src->bi_iter.bi_size); -+ unsigned pages = DIV_ROUND_UP(output_available + -+ (buf -+ ? 
((unsigned long) buf & (PAGE_SIZE - 1)) -+ : 0), PAGE_SIZE); -+ -+ bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write); -+ wbio = wbio_init(bio); -+ wbio->put_bio = true; -+ /* copy WRITE_SYNC flag */ -+ wbio->bio.bi_opf = src->bi_opf; -+ -+ if (buf) { -+ bch2_bio_map(bio, buf, output_available); -+ return bio; -+ } -+ -+ wbio->bounce = true; -+ -+ /* -+ * We can't use mempool for more than c->sb.encoded_extent_max -+ * worth of pages, but we'd like to allocate more if we can: -+ */ -+ bch2_bio_alloc_pages_pool(c, bio, -+ min_t(unsigned, output_available, -+ c->sb.encoded_extent_max << 9)); -+ -+ if (bio->bi_iter.bi_size < output_available) -+ *page_alloc_failed = -+ bch2_bio_alloc_pages(bio, -+ output_available - -+ bio->bi_iter.bi_size, -+ GFP_NOFS) != 0; -+ -+ return bio; -+} -+ -+static int bch2_write_rechecksum(struct bch_fs *c, -+ struct bch_write_op *op, -+ unsigned new_csum_type) -+{ -+ struct bio *bio = &op->wbio.bio; -+ struct bch_extent_crc_unpacked new_crc; -+ int ret; -+ -+ /* bch2_rechecksum_bio() can't encrypt or decrypt data: */ -+ -+ if (bch2_csum_type_is_encryption(op->crc.csum_type) != -+ bch2_csum_type_is_encryption(new_csum_type)) -+ new_csum_type = op->crc.csum_type; -+ -+ ret = bch2_rechecksum_bio(c, bio, op->version, op->crc, -+ NULL, &new_crc, -+ op->crc.offset, op->crc.live_size, -+ new_csum_type); -+ if (ret) -+ return ret; -+ -+ bio_advance(bio, op->crc.offset << 9); -+ bio->bi_iter.bi_size = op->crc.live_size << 9; -+ op->crc = new_crc; -+ return 0; -+} -+ -+static int bch2_write_decrypt(struct bch_write_op *op) -+{ -+ struct bch_fs *c = op->c; -+ struct nonce nonce = extent_nonce(op->version, op->crc); -+ struct bch_csum csum; -+ -+ if (!bch2_csum_type_is_encryption(op->crc.csum_type)) -+ return 0; -+ -+ /* -+ * If we need to decrypt data in the write path, we'll no longer be able -+ * to verify the existing checksum (poly1305 mac, in this case) after -+ * it's decrypted - this is the last point we'll be able to reverify the -+ * 
checksum: -+ */ -+ csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); -+ if (bch2_crc_cmp(op->crc.csum, csum)) -+ return -EIO; -+ -+ bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); -+ op->crc.csum_type = 0; -+ op->crc.csum = (struct bch_csum) { 0, 0 }; -+ return 0; -+} -+ -+static enum prep_encoded_ret { -+ PREP_ENCODED_OK, -+ PREP_ENCODED_ERR, -+ PREP_ENCODED_CHECKSUM_ERR, -+ PREP_ENCODED_DO_WRITE, -+} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp) -+{ -+ struct bch_fs *c = op->c; -+ struct bio *bio = &op->wbio.bio; -+ -+ if (!(op->flags & BCH_WRITE_DATA_ENCODED)) -+ return PREP_ENCODED_OK; -+ -+ BUG_ON(bio_sectors(bio) != op->crc.compressed_size); -+ -+ /* Can we just write the entire extent as is? */ -+ if (op->crc.uncompressed_size == op->crc.live_size && -+ op->crc.compressed_size <= wp->sectors_free && -+ (op->crc.compression_type == op->compression_type || -+ op->incompressible)) { -+ if (!crc_is_compressed(op->crc) && -+ op->csum_type != op->crc.csum_type && -+ bch2_write_rechecksum(c, op, op->csum_type)) -+ return PREP_ENCODED_CHECKSUM_ERR; -+ -+ return PREP_ENCODED_DO_WRITE; -+ } -+ -+ /* -+ * If the data is compressed and we couldn't write the entire extent as -+ * is, we have to decompress it: -+ */ -+ if (crc_is_compressed(op->crc)) { -+ struct bch_csum csum; -+ -+ if (bch2_write_decrypt(op)) -+ return PREP_ENCODED_CHECKSUM_ERR; -+ -+ /* Last point we can still verify checksum: */ -+ csum = bch2_checksum_bio(c, op->crc.csum_type, -+ extent_nonce(op->version, op->crc), -+ bio); -+ if (bch2_crc_cmp(op->crc.csum, csum)) -+ return PREP_ENCODED_CHECKSUM_ERR; -+ -+ if (bch2_bio_uncompress_inplace(c, bio, &op->crc)) -+ return PREP_ENCODED_ERR; -+ } -+ -+ /* -+ * No longer have compressed data after this point - data might be -+ * encrypted: -+ */ -+ -+ /* -+ * If the data is checksummed and we're only writing a subset, -+ * rechecksum and adjust bio to point to currently live data: -+ */ -+ if 
((op->crc.live_size != op->crc.uncompressed_size || -+ op->crc.csum_type != op->csum_type) && -+ bch2_write_rechecksum(c, op, op->csum_type)) -+ return PREP_ENCODED_CHECKSUM_ERR; -+ -+ /* -+ * If we want to compress the data, it has to be decrypted: -+ */ -+ if ((op->compression_type || -+ bch2_csum_type_is_encryption(op->crc.csum_type) != -+ bch2_csum_type_is_encryption(op->csum_type)) && -+ bch2_write_decrypt(op)) -+ return PREP_ENCODED_CHECKSUM_ERR; -+ -+ return PREP_ENCODED_OK; -+} -+ -+static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, -+ struct bio **_dst) -+{ -+ struct bch_fs *c = op->c; -+ struct bio *src = &op->wbio.bio, *dst = src; -+ struct bvec_iter saved_iter; -+ void *ec_buf; -+ struct bpos ec_pos = op->pos; -+ unsigned total_output = 0, total_input = 0; -+ bool bounce = false; -+ bool page_alloc_failed = false; -+ int ret, more = 0; -+ -+ BUG_ON(!bio_sectors(src)); -+ -+ ec_buf = bch2_writepoint_ec_buf(c, wp); -+ -+ switch (bch2_write_prep_encoded_data(op, wp)) { -+ case PREP_ENCODED_OK: -+ break; -+ case PREP_ENCODED_ERR: -+ ret = -EIO; -+ goto err; -+ case PREP_ENCODED_CHECKSUM_ERR: -+ BUG(); -+ goto csum_err; -+ case PREP_ENCODED_DO_WRITE: -+ /* XXX look for bug here */ -+ if (ec_buf) { -+ dst = bch2_write_bio_alloc(c, wp, src, -+ &page_alloc_failed, -+ ec_buf); -+ bio_copy_data(dst, src); -+ bounce = true; -+ } -+ init_append_extent(op, wp, op->version, op->crc); -+ goto do_write; -+ } -+ -+ if (ec_buf || -+ op->compression_type || -+ (op->csum_type && -+ !(op->flags & BCH_WRITE_PAGES_STABLE)) || -+ (bch2_csum_type_is_encryption(op->csum_type) && -+ !(op->flags & BCH_WRITE_PAGES_OWNED))) { -+ dst = bch2_write_bio_alloc(c, wp, src, -+ &page_alloc_failed, -+ ec_buf); -+ bounce = true; -+ } -+ -+ saved_iter = dst->bi_iter; -+ -+ do { -+ struct bch_extent_crc_unpacked crc = -+ (struct bch_extent_crc_unpacked) { 0 }; -+ struct bversion version = op->version; -+ size_t dst_len, src_len; -+ -+ if (page_alloc_failed && -+ 
bio_sectors(dst) < wp->sectors_free && -+ bio_sectors(dst) < c->sb.encoded_extent_max) -+ break; -+ -+ BUG_ON(op->compression_type && -+ (op->flags & BCH_WRITE_DATA_ENCODED) && -+ bch2_csum_type_is_encryption(op->crc.csum_type)); -+ BUG_ON(op->compression_type && !bounce); -+ -+ crc.compression_type = op->incompressible -+ ? BCH_COMPRESSION_TYPE_incompressible -+ : op->compression_type -+ ? bch2_bio_compress(c, dst, &dst_len, src, &src_len, -+ op->compression_type) -+ : 0; -+ if (!crc_is_compressed(crc)) { -+ dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); -+ dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9); -+ -+ if (op->csum_type) -+ dst_len = min_t(unsigned, dst_len, -+ c->sb.encoded_extent_max << 9); -+ -+ if (bounce) { -+ swap(dst->bi_iter.bi_size, dst_len); -+ bio_copy_data(dst, src); -+ swap(dst->bi_iter.bi_size, dst_len); -+ } -+ -+ src_len = dst_len; -+ } -+ -+ BUG_ON(!src_len || !dst_len); -+ -+ if (bch2_csum_type_is_encryption(op->csum_type)) { -+ if (bversion_zero(version)) { -+ version.lo = atomic64_inc_return(&c->key_version); -+ } else { -+ crc.nonce = op->nonce; -+ op->nonce += src_len >> 9; -+ } -+ } -+ -+ if ((op->flags & BCH_WRITE_DATA_ENCODED) && -+ !crc_is_compressed(crc) && -+ bch2_csum_type_is_encryption(op->crc.csum_type) == -+ bch2_csum_type_is_encryption(op->csum_type)) { -+ /* -+ * Note: when we're using rechecksum(), we need to be -+ * checksumming @src because it has all the data our -+ * existing checksum covers - if we bounced (because we -+ * were trying to compress), @dst will only have the -+ * part of the data the new checksum will cover. -+ * -+ * But normally we want to be checksumming post bounce, -+ * because part of the reason for bouncing is so the -+ * data can't be modified (by userspace) while it's in -+ * flight. 
-+ */ -+ if (bch2_rechecksum_bio(c, src, version, op->crc, -+ &crc, &op->crc, -+ src_len >> 9, -+ bio_sectors(src) - (src_len >> 9), -+ op->csum_type)) -+ goto csum_err; -+ } else { -+ if ((op->flags & BCH_WRITE_DATA_ENCODED) && -+ bch2_rechecksum_bio(c, src, version, op->crc, -+ NULL, &op->crc, -+ src_len >> 9, -+ bio_sectors(src) - (src_len >> 9), -+ op->crc.csum_type)) -+ goto csum_err; -+ -+ crc.compressed_size = dst_len >> 9; -+ crc.uncompressed_size = src_len >> 9; -+ crc.live_size = src_len >> 9; -+ -+ swap(dst->bi_iter.bi_size, dst_len); -+ bch2_encrypt_bio(c, op->csum_type, -+ extent_nonce(version, crc), dst); -+ crc.csum = bch2_checksum_bio(c, op->csum_type, -+ extent_nonce(version, crc), dst); -+ crc.csum_type = op->csum_type; -+ swap(dst->bi_iter.bi_size, dst_len); -+ } -+ -+ init_append_extent(op, wp, version, crc); -+ -+ if (dst != src) -+ bio_advance(dst, dst_len); -+ bio_advance(src, src_len); -+ total_output += dst_len; -+ total_input += src_len; -+ } while (dst->bi_iter.bi_size && -+ src->bi_iter.bi_size && -+ wp->sectors_free && -+ !bch2_keylist_realloc(&op->insert_keys, -+ op->inline_keys, -+ ARRAY_SIZE(op->inline_keys), -+ BKEY_EXTENT_U64s_MAX)); -+ -+ more = src->bi_iter.bi_size != 0; -+ -+ dst->bi_iter = saved_iter; -+ -+ if (dst == src && more) { -+ BUG_ON(total_output != total_input); -+ -+ dst = bio_split(src, total_input >> 9, -+ GFP_NOIO, &c->bio_write); -+ wbio_init(dst)->put_bio = true; -+ /* copy WRITE_SYNC flag */ -+ dst->bi_opf = src->bi_opf; -+ } -+ -+ dst->bi_iter.bi_size = total_output; -+do_write: -+ /* might have done a realloc... 
*/ -+ bch2_ec_add_backpointer(c, wp, ec_pos, total_input >> 9); -+ -+ *_dst = dst; -+ return more; -+csum_err: -+ bch_err(c, "error verifying existing checksum while " -+ "rewriting existing data (memory corruption?)"); -+ ret = -EIO; -+err: -+ if (to_wbio(dst)->bounce) -+ bch2_bio_free_pages_pool(c, dst); -+ if (to_wbio(dst)->put_bio) -+ bio_put(dst); -+ -+ return ret; -+} -+ -+static void __bch2_write(struct closure *cl) -+{ -+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); -+ struct bch_fs *c = op->c; -+ struct write_point *wp; -+ struct bio *bio; -+ bool skip_put = true; -+ int ret; -+again: -+ memset(&op->failed, 0, sizeof(op->failed)); -+ -+ do { -+ struct bkey_i *key_to_write; -+ unsigned key_to_write_offset = op->insert_keys.top_p - -+ op->insert_keys.keys_p; -+ -+ /* +1 for possible cache device: */ -+ if (op->open_buckets.nr + op->nr_replicas + 1 > -+ ARRAY_SIZE(op->open_buckets.v)) -+ goto flush_io; -+ -+ if (bch2_keylist_realloc(&op->insert_keys, -+ op->inline_keys, -+ ARRAY_SIZE(op->inline_keys), -+ BKEY_EXTENT_U64s_MAX)) -+ goto flush_io; -+ -+ if ((op->flags & BCH_WRITE_FROM_INTERNAL) && -+ percpu_ref_is_dying(&c->writes)) { -+ ret = -EROFS; -+ goto err; -+ } -+ -+ wp = bch2_alloc_sectors_start(c, -+ op->target, -+ op->opts.erasure_code, -+ op->write_point, -+ &op->devs_have, -+ op->nr_replicas, -+ op->nr_replicas_required, -+ op->alloc_reserve, -+ op->flags, -+ (op->flags & BCH_WRITE_ALLOC_NOWAIT) ? 
NULL : cl); -+ EBUG_ON(!wp); -+ -+ if (unlikely(IS_ERR(wp))) { -+ if (unlikely(PTR_ERR(wp) != -EAGAIN)) { -+ ret = PTR_ERR(wp); -+ goto err; -+ } -+ -+ goto flush_io; -+ } -+ -+ bch2_open_bucket_get(c, wp, &op->open_buckets); -+ ret = bch2_write_extent(op, wp, &bio); -+ bch2_alloc_sectors_done(c, wp); -+ -+ if (ret < 0) -+ goto err; -+ -+ if (ret) { -+ skip_put = false; -+ } else { -+ /* -+ * for the skip_put optimization this has to be set -+ * before we submit the bio: -+ */ -+ op->flags |= BCH_WRITE_DONE; -+ } -+ -+ bio->bi_end_io = bch2_write_endio; -+ bio->bi_private = &op->cl; -+ bio->bi_opf |= REQ_OP_WRITE; -+ -+ if (!skip_put) -+ closure_get(bio->bi_private); -+ else -+ op->flags |= BCH_WRITE_SKIP_CLOSURE_PUT; -+ -+ key_to_write = (void *) (op->insert_keys.keys_p + -+ key_to_write_offset); -+ -+ bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_USER, -+ key_to_write); -+ } while (ret); -+ -+ if (!skip_put) -+ continue_at(cl, bch2_write_index, index_update_wq(op)); -+ return; -+err: -+ op->error = ret; -+ op->flags |= BCH_WRITE_DONE; -+ -+ continue_at(cl, bch2_write_index, index_update_wq(op)); -+ return; -+flush_io: -+ /* -+ * If the write can't all be submitted at once, we generally want to -+ * block synchronously as that signals backpressure to the caller. 
-+ * -+ * However, if we're running out of a workqueue, we can't block here -+ * because we'll be blocking other work items from completing: -+ */ -+ if (current->flags & PF_WQ_WORKER) { -+ continue_at(cl, bch2_write_index, index_update_wq(op)); -+ return; -+ } -+ -+ closure_sync(cl); -+ -+ if (!bch2_keylist_empty(&op->insert_keys)) { -+ __bch2_write_index(op); -+ -+ if (op->error) { -+ op->flags |= BCH_WRITE_DONE; -+ continue_at_nobarrier(cl, bch2_write_done, NULL); -+ return; -+ } -+ } -+ -+ goto again; -+} -+ -+static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) -+{ -+ struct closure *cl = &op->cl; -+ struct bio *bio = &op->wbio.bio; -+ struct bvec_iter iter; -+ struct bkey_i_inline_data *id; -+ unsigned sectors; -+ int ret; -+ -+ bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); -+ -+ ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys, -+ ARRAY_SIZE(op->inline_keys), -+ BKEY_U64s + DIV_ROUND_UP(data_len, 8)); -+ if (ret) { -+ op->error = ret; -+ goto err; -+ } -+ -+ sectors = bio_sectors(bio); -+ op->pos.offset += sectors; -+ -+ id = bkey_inline_data_init(op->insert_keys.top); -+ id->k.p = op->pos; -+ id->k.version = op->version; -+ id->k.size = sectors; -+ -+ iter = bio->bi_iter; -+ iter.bi_size = data_len; -+ memcpy_from_bio(id->v.data, bio, iter); -+ -+ while (data_len & 7) -+ id->v.data[data_len++] = '\0'; -+ set_bkey_val_bytes(&id->k, data_len); -+ bch2_keylist_push(&op->insert_keys); -+ -+ op->flags |= BCH_WRITE_WROTE_DATA_INLINE; -+ op->flags |= BCH_WRITE_DONE; -+ -+ continue_at_nobarrier(cl, bch2_write_index, NULL); -+ return; -+err: -+ bch2_write_done(&op->cl); -+} -+ -+/** -+ * bch_write - handle a write to a cache device or flash only volume -+ * -+ * This is the starting point for any data to end up in a cache device; it could -+ * be from a normal write, or a writeback write, or a write to a flash only -+ * volume - it's also used by the moving garbage collector to compact data in -+ * mostly empty 
buckets. -+ * -+ * It first writes the data to the cache, creating a list of keys to be inserted -+ * (if the data won't fit in a single open bucket, there will be multiple keys); -+ * after the data is written it calls bch_journal, and after the keys have been -+ * added to the next journal write they're inserted into the btree. -+ * -+ * If op->discard is true, instead of inserting the data it invalidates the -+ * region of the cache represented by op->bio and op->inode. -+ */ -+void bch2_write(struct closure *cl) -+{ -+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); -+ struct bio *bio = &op->wbio.bio; -+ struct bch_fs *c = op->c; -+ unsigned data_len; -+ -+ BUG_ON(!op->nr_replicas); -+ BUG_ON(!op->write_point.v); -+ BUG_ON(!bkey_cmp(op->pos, POS_MAX)); -+ -+ op->start_time = local_clock(); -+ bch2_keylist_init(&op->insert_keys, op->inline_keys); -+ wbio_init(bio)->put_bio = false; -+ -+ if (bio_sectors(bio) & (c->opts.block_size - 1)) { -+ __bcache_io_error(c, "misaligned write"); -+ op->error = -EIO; -+ goto err; -+ } -+ -+ if (c->opts.nochanges || -+ !percpu_ref_tryget(&c->writes)) { -+ if (!(op->flags & BCH_WRITE_FROM_INTERNAL)) -+ __bcache_io_error(c, "read only"); -+ op->error = -EROFS; -+ goto err; -+ } -+ -+ /* -+ * Can't ratelimit copygc - we'd deadlock: -+ */ -+ if (!(op->flags & BCH_WRITE_FROM_INTERNAL)) -+ down(&c->io_in_flight); -+ -+ bch2_increment_clock(c, bio_sectors(bio), WRITE); -+ -+ data_len = min_t(u64, bio->bi_iter.bi_size, -+ op->new_i_size - (op->pos.offset << 9)); -+ -+ if (c->opts.inline_data && -+ data_len <= min(block_bytes(c) / 2, 1024U)) { -+ bch2_write_data_inline(op, data_len); -+ return; -+ } -+ -+ continue_at_nobarrier(cl, __bch2_write, NULL); -+ return; -+err: -+ bch2_disk_reservation_put(c, &op->res); -+ -+ if (op->end_io) { -+ EBUG_ON(cl->parent); -+ closure_debug_destroy(cl); -+ op->end_io(op); -+ } else { -+ closure_return(cl); -+ } -+} -+ -+/* Cache promotion on read */ -+ -+struct promote_op { -+ 
struct closure cl; -+ struct rcu_head rcu; -+ u64 start_time; -+ -+ struct rhash_head hash; -+ struct bpos pos; -+ -+ struct migrate_write write; -+ struct bio_vec bi_inline_vecs[0]; /* must be last */ -+}; -+ -+static const struct rhashtable_params bch_promote_params = { -+ .head_offset = offsetof(struct promote_op, hash), -+ .key_offset = offsetof(struct promote_op, pos), -+ .key_len = sizeof(struct bpos), -+}; -+ -+static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k, -+ struct bpos pos, -+ struct bch_io_opts opts, -+ unsigned flags) -+{ -+ if (!(flags & BCH_READ_MAY_PROMOTE)) -+ return false; -+ -+ if (!opts.promote_target) -+ return false; -+ -+ if (bch2_bkey_has_target(c, k, opts.promote_target)) -+ return false; -+ -+ if (bch2_target_congested(c, opts.promote_target)) { -+ /* XXX trace this */ -+ return false; -+ } -+ -+ if (rhashtable_lookup_fast(&c->promote_table, &pos, -+ bch_promote_params)) -+ return false; -+ -+ return true; -+} -+ -+static void promote_free(struct bch_fs *c, struct promote_op *op) -+{ -+ int ret; -+ -+ ret = rhashtable_remove_fast(&c->promote_table, &op->hash, -+ bch_promote_params); -+ BUG_ON(ret); -+ percpu_ref_put(&c->writes); -+ kfree_rcu(op, rcu); -+} -+ -+static void promote_done(struct closure *cl) -+{ -+ struct promote_op *op = -+ container_of(cl, struct promote_op, cl); -+ struct bch_fs *c = op->write.op.c; -+ -+ bch2_time_stats_update(&c->times[BCH_TIME_data_promote], -+ op->start_time); -+ -+ bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio); -+ promote_free(c, op); -+} -+ -+static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) -+{ -+ struct bch_fs *c = rbio->c; -+ struct closure *cl = &op->cl; -+ struct bio *bio = &op->write.op.wbio.bio; -+ -+ trace_promote(&rbio->bio); -+ -+ /* we now own pages: */ -+ BUG_ON(!rbio->bounce); -+ BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs); -+ -+ memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, -+ sizeof(struct bio_vec) * rbio->bio.bi_vcnt); -+ 
swap(bio->bi_vcnt, rbio->bio.bi_vcnt); -+ -+ bch2_migrate_read_done(&op->write, rbio); -+ -+ closure_init(cl, NULL); -+ closure_call(&op->write.op.cl, bch2_write, c->wq, cl); -+ closure_return_with_destructor(cl, promote_done); -+} -+ -+static struct promote_op *__promote_alloc(struct bch_fs *c, -+ enum btree_id btree_id, -+ struct bkey_s_c k, -+ struct bpos pos, -+ struct extent_ptr_decoded *pick, -+ struct bch_io_opts opts, -+ unsigned sectors, -+ struct bch_read_bio **rbio) -+{ -+ struct promote_op *op = NULL; -+ struct bio *bio; -+ unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); -+ int ret; -+ -+ if (!percpu_ref_tryget(&c->writes)) -+ return NULL; -+ -+ op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO); -+ if (!op) -+ goto err; -+ -+ op->start_time = local_clock(); -+ op->pos = pos; -+ -+ /* -+ * We don't use the mempool here because extents that aren't -+ * checksummed or compressed can be too big for the mempool: -+ */ -+ *rbio = kzalloc(sizeof(struct bch_read_bio) + -+ sizeof(struct bio_vec) * pages, -+ GFP_NOIO); -+ if (!*rbio) -+ goto err; -+ -+ rbio_init(&(*rbio)->bio, opts); -+ bio_init(&(*rbio)->bio, (*rbio)->bio.bi_inline_vecs, pages); -+ -+ if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, -+ GFP_NOIO)) -+ goto err; -+ -+ (*rbio)->bounce = true; -+ (*rbio)->split = true; -+ (*rbio)->kmalloc = true; -+ -+ if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, -+ bch_promote_params)) -+ goto err; -+ -+ bio = &op->write.op.wbio.bio; -+ bio_init(bio, bio->bi_inline_vecs, pages); -+ -+ ret = bch2_migrate_write_init(c, &op->write, -+ writepoint_hashed((unsigned long) current), -+ opts, -+ DATA_PROMOTE, -+ (struct data_opts) { -+ .target = opts.promote_target -+ }, -+ btree_id, k); -+ BUG_ON(ret); -+ -+ return op; -+err: -+ if (*rbio) -+ bio_free_pages(&(*rbio)->bio); -+ kfree(*rbio); -+ *rbio = NULL; -+ kfree(op); -+ percpu_ref_put(&c->writes); -+ return NULL; -+} -+ -+noinline -+static struct promote_op 
*promote_alloc(struct bch_fs *c, -+ struct bvec_iter iter, -+ struct bkey_s_c k, -+ struct extent_ptr_decoded *pick, -+ struct bch_io_opts opts, -+ unsigned flags, -+ struct bch_read_bio **rbio, -+ bool *bounce, -+ bool *read_full) -+{ -+ bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents); -+ /* data might have to be decompressed in the write path: */ -+ unsigned sectors = promote_full -+ ? max(pick->crc.compressed_size, pick->crc.live_size) -+ : bvec_iter_sectors(iter); -+ struct bpos pos = promote_full -+ ? bkey_start_pos(k.k) -+ : POS(k.k->p.inode, iter.bi_sector); -+ struct promote_op *promote; -+ -+ if (!should_promote(c, k, pos, opts, flags)) -+ return NULL; -+ -+ promote = __promote_alloc(c, -+ k.k->type == KEY_TYPE_reflink_v -+ ? BTREE_ID_REFLINK -+ : BTREE_ID_EXTENTS, -+ k, pos, pick, opts, sectors, rbio); -+ if (!promote) -+ return NULL; -+ -+ *bounce = true; -+ *read_full = promote_full; -+ return promote; -+} -+ -+/* Read */ -+ -+#define READ_RETRY_AVOID 1 -+#define READ_RETRY 2 -+#define READ_ERR 3 -+ -+enum rbio_context { -+ RBIO_CONTEXT_NULL, -+ RBIO_CONTEXT_HIGHPRI, -+ RBIO_CONTEXT_UNBOUND, -+}; -+ -+static inline struct bch_read_bio * -+bch2_rbio_parent(struct bch_read_bio *rbio) -+{ -+ return rbio->split ? 
rbio->parent : rbio; -+} -+ -+__always_inline -+static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn, -+ enum rbio_context context, -+ struct workqueue_struct *wq) -+{ -+ if (context <= rbio->context) { -+ fn(&rbio->work); -+ } else { -+ rbio->work.func = fn; -+ rbio->context = context; -+ queue_work(wq, &rbio->work); -+ } -+} -+ -+static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) -+{ -+ BUG_ON(rbio->bounce && !rbio->split); -+ -+ if (rbio->promote) -+ promote_free(rbio->c, rbio->promote); -+ rbio->promote = NULL; -+ -+ if (rbio->bounce) -+ bch2_bio_free_pages_pool(rbio->c, &rbio->bio); -+ -+ if (rbio->split) { -+ struct bch_read_bio *parent = rbio->parent; -+ -+ if (rbio->kmalloc) -+ kfree(rbio); -+ else -+ bio_put(&rbio->bio); -+ -+ rbio = parent; -+ } -+ -+ return rbio; -+} -+ -+/* -+ * Only called on a top level bch_read_bio to complete an entire read request, -+ * not a split: -+ */ -+static void bch2_rbio_done(struct bch_read_bio *rbio) -+{ -+ if (rbio->start_time) -+ bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read], -+ rbio->start_time); -+ bio_endio(&rbio->bio); -+} -+ -+static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, -+ struct bvec_iter bvec_iter, u64 inode, -+ struct bch_io_failures *failed, -+ unsigned flags) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_on_stack sk; -+ struct bkey_s_c k; -+ int ret; -+ -+ flags &= ~BCH_READ_LAST_FRAGMENT; -+ flags |= BCH_READ_MUST_CLONE; -+ -+ bkey_on_stack_init(&sk); -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, -+ rbio->pos, BTREE_ITER_SLOTS); -+retry: -+ rbio->bio.bi_status = 0; -+ -+ k = bch2_btree_iter_peek_slot(iter); -+ if (bkey_err(k)) -+ goto err; -+ -+ bkey_on_stack_reassemble(&sk, c, k); -+ k = bkey_i_to_s_c(sk.k); -+ bch2_trans_unlock(&trans); -+ -+ if (!bch2_bkey_matches_ptr(c, k, -+ rbio->pick.ptr, -+ rbio->pos.offset - -+ 
rbio->pick.crc.offset)) { -+ /* extent we wanted to read no longer exists: */ -+ rbio->hole = true; -+ goto out; -+ } -+ -+ ret = __bch2_read_extent(c, rbio, bvec_iter, k, 0, failed, flags); -+ if (ret == READ_RETRY) -+ goto retry; -+ if (ret) -+ goto err; -+out: -+ bch2_rbio_done(rbio); -+ bch2_trans_exit(&trans); -+ bkey_on_stack_exit(&sk, c); -+ return; -+err: -+ rbio->bio.bi_status = BLK_STS_IOERR; -+ goto out; -+} -+ -+static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio, -+ struct bvec_iter bvec_iter, u64 inode, -+ struct bch_io_failures *failed, unsigned flags) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_on_stack sk; -+ struct bkey_s_c k; -+ int ret; -+ -+ flags &= ~BCH_READ_LAST_FRAGMENT; -+ flags |= BCH_READ_MUST_CLONE; -+ -+ bkey_on_stack_init(&sk); -+ bch2_trans_init(&trans, c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, -+ POS(inode, bvec_iter.bi_sector), -+ BTREE_ITER_SLOTS, k, ret) { -+ unsigned bytes, sectors, offset_into_extent; -+ -+ bkey_on_stack_reassemble(&sk, c, k); -+ k = bkey_i_to_s_c(sk.k); -+ -+ offset_into_extent = iter->pos.offset - -+ bkey_start_offset(k.k); -+ sectors = k.k->size - offset_into_extent; -+ -+ ret = bch2_read_indirect_extent(&trans, -+ &offset_into_extent, &sk); -+ if (ret) -+ break; -+ -+ sectors = min(sectors, k.k->size - offset_into_extent); -+ -+ bch2_trans_unlock(&trans); -+ -+ bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; -+ swap(bvec_iter.bi_size, bytes); -+ -+ ret = __bch2_read_extent(c, rbio, bvec_iter, k, -+ offset_into_extent, failed, flags); -+ switch (ret) { -+ case READ_RETRY: -+ goto retry; -+ case READ_ERR: -+ goto err; -+ }; -+ -+ if (bytes == bvec_iter.bi_size) -+ goto out; -+ -+ swap(bvec_iter.bi_size, bytes); -+ bio_advance_iter(&rbio->bio, &bvec_iter, bytes); -+ } -+ -+ if (ret == -EINTR) -+ goto retry; -+ /* -+ * If we get here, it better have been because there was an error -+ * 
reading a btree node -+ */ -+ BUG_ON(!ret); -+ __bcache_io_error(c, "btree IO error: %i", ret); -+err: -+ rbio->bio.bi_status = BLK_STS_IOERR; -+out: -+ bch2_trans_exit(&trans); -+ bkey_on_stack_exit(&sk, c); -+ bch2_rbio_done(rbio); -+} -+ -+static void bch2_rbio_retry(struct work_struct *work) -+{ -+ struct bch_read_bio *rbio = -+ container_of(work, struct bch_read_bio, work); -+ struct bch_fs *c = rbio->c; -+ struct bvec_iter iter = rbio->bvec_iter; -+ unsigned flags = rbio->flags; -+ u64 inode = rbio->pos.inode; -+ struct bch_io_failures failed = { .nr = 0 }; -+ -+ trace_read_retry(&rbio->bio); -+ -+ if (rbio->retry == READ_RETRY_AVOID) -+ bch2_mark_io_failure(&failed, &rbio->pick); -+ -+ rbio->bio.bi_status = 0; -+ -+ rbio = bch2_rbio_free(rbio); -+ -+ flags |= BCH_READ_IN_RETRY; -+ flags &= ~BCH_READ_MAY_PROMOTE; -+ -+ if (flags & BCH_READ_NODECODE) -+ bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags); -+ else -+ bch2_read_retry(c, rbio, iter, inode, &failed, flags); -+} -+ -+static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, -+ blk_status_t error) -+{ -+ rbio->retry = retry; -+ -+ if (rbio->flags & BCH_READ_IN_RETRY) -+ return; -+ -+ if (retry == READ_ERR) { -+ rbio = bch2_rbio_free(rbio); -+ -+ rbio->bio.bi_status = error; -+ bch2_rbio_done(rbio); -+ } else { -+ bch2_rbio_punt(rbio, bch2_rbio_retry, -+ RBIO_CONTEXT_UNBOUND, system_unbound_wq); -+ } -+} -+ -+static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, -+ struct bch_read_bio *rbio) -+{ -+ struct bch_fs *c = rbio->c; -+ u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset; -+ struct bch_extent_crc_unpacked new_crc; -+ struct btree_iter *iter = NULL; -+ struct bkey_i *new; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ if (crc_is_compressed(rbio->pick.crc)) -+ return 0; -+ -+ iter = bch2_trans_get_iter(trans, BTREE_ID_EXTENTS, rbio->pos, -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); -+ if ((ret = PTR_ERR_OR_ZERO(iter))) -+ goto out; -+ -+ k = 
bch2_btree_iter_peek_slot(iter); -+ if ((ret = bkey_err(k))) -+ goto out; -+ -+ /* -+ * going to be temporarily appending another checksum entry: -+ */ -+ new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + -+ BKEY_EXTENT_U64s_MAX * 8); -+ if ((ret = PTR_ERR_OR_ZERO(new))) -+ goto out; -+ -+ bkey_reassemble(new, k); -+ k = bkey_i_to_s_c(new); -+ -+ if (bversion_cmp(k.k->version, rbio->version) || -+ !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) -+ goto out; -+ -+ /* Extent was merged? */ -+ if (bkey_start_offset(k.k) < data_offset || -+ k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size) -+ goto out; -+ -+ if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version, -+ rbio->pick.crc, NULL, &new_crc, -+ bkey_start_offset(k.k) - data_offset, k.k->size, -+ rbio->pick.crc.csum_type)) { -+ bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); -+ ret = 0; -+ goto out; -+ } -+ -+ if (!bch2_bkey_narrow_crcs(new, new_crc)) -+ goto out; -+ -+ bch2_trans_update(trans, iter, new, 0); -+out: -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) -+{ -+ bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL, -+ __bch2_rbio_narrow_crcs(&trans, rbio)); -+} -+ -+/* Inner part that may run in process context */ -+static void __bch2_read_endio(struct work_struct *work) -+{ -+ struct bch_read_bio *rbio = -+ container_of(work, struct bch_read_bio, work); -+ struct bch_fs *c = rbio->c; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); -+ struct bio *src = &rbio->bio; -+ struct bio *dst = &bch2_rbio_parent(rbio)->bio; -+ struct bvec_iter dst_iter = rbio->bvec_iter; -+ struct bch_extent_crc_unpacked crc = rbio->pick.crc; -+ struct nonce nonce = extent_nonce(rbio->version, crc); -+ struct bch_csum csum; -+ -+ /* Reset iterator for checksumming and copying bounced data: */ -+ if (rbio->bounce) { -+ src->bi_iter.bi_size = crc.compressed_size 
<< 9; -+ src->bi_iter.bi_idx = 0; -+ src->bi_iter.bi_bvec_done = 0; -+ } else { -+ src->bi_iter = rbio->bvec_iter; -+ } -+ -+ csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); -+ if (bch2_crc_cmp(csum, rbio->pick.crc.csum)) -+ goto csum_err; -+ -+ if (unlikely(rbio->narrow_crcs)) -+ bch2_rbio_narrow_crcs(rbio); -+ -+ if (rbio->flags & BCH_READ_NODECODE) -+ goto nodecode; -+ -+ /* Adjust crc to point to subset of data we want: */ -+ crc.offset += rbio->offset_into_extent; -+ crc.live_size = bvec_iter_sectors(rbio->bvec_iter); -+ -+ if (crc_is_compressed(crc)) { -+ bch2_encrypt_bio(c, crc.csum_type, nonce, src); -+ if (bch2_bio_uncompress(c, src, dst, dst_iter, crc)) -+ goto decompression_err; -+ } else { -+ /* don't need to decrypt the entire bio: */ -+ nonce = nonce_add(nonce, crc.offset << 9); -+ bio_advance(src, crc.offset << 9); -+ -+ BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); -+ src->bi_iter.bi_size = dst_iter.bi_size; -+ -+ bch2_encrypt_bio(c, crc.csum_type, nonce, src); -+ -+ if (rbio->bounce) { -+ struct bvec_iter src_iter = src->bi_iter; -+ bio_copy_data_iter(dst, &dst_iter, src, &src_iter); -+ } -+ } -+ -+ if (rbio->promote) { -+ /* -+ * Re encrypt data we decrypted, so it's consistent with -+ * rbio->crc: -+ */ -+ bch2_encrypt_bio(c, crc.csum_type, nonce, src); -+ promote_start(rbio->promote, rbio); -+ rbio->promote = NULL; -+ } -+nodecode: -+ if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) { -+ rbio = bch2_rbio_free(rbio); -+ bch2_rbio_done(rbio); -+ } -+ return; -+csum_err: -+ /* -+ * Checksum error: if the bio wasn't bounced, we may have been -+ * reading into buffers owned by userspace (that userspace can -+ * scribble over) - retry the read, bouncing it this time: -+ */ -+ if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { -+ rbio->flags |= BCH_READ_MUST_BOUNCE; -+ bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); -+ return; -+ } -+ -+ bch2_dev_io_error(ca, -+ "data checksum error, inode %llu offset %llu: expected %0llx:%0llx 
got %0llx:%0llx (type %u)", -+ rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector, -+ rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, -+ csum.hi, csum.lo, crc.csum_type); -+ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); -+ return; -+decompression_err: -+ __bcache_io_error(c, "decompression error, inode %llu offset %llu", -+ rbio->pos.inode, -+ (u64) rbio->bvec_iter.bi_sector); -+ bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); -+ return; -+} -+ -+static void bch2_read_endio(struct bio *bio) -+{ -+ struct bch_read_bio *rbio = -+ container_of(bio, struct bch_read_bio, bio); -+ struct bch_fs *c = rbio->c; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); -+ struct workqueue_struct *wq = NULL; -+ enum rbio_context context = RBIO_CONTEXT_NULL; -+ -+ if (rbio->have_ioref) { -+ bch2_latency_acct(ca, rbio->submit_time, READ); -+ percpu_ref_put(&ca->io_ref); -+ } -+ -+ if (!rbio->split) -+ rbio->bio.bi_end_io = rbio->end_io; -+ -+ if (bch2_dev_io_err_on(bio->bi_status, ca, "data read; %s", -+ blk_status_to_str(bio->bi_status))) { -+ bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); -+ return; -+ } -+ -+ if (rbio->pick.ptr.cached && -+ (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || -+ ptr_stale(ca, &rbio->pick.ptr))) { -+ atomic_long_inc(&c->read_realloc_races); -+ -+ if (rbio->flags & BCH_READ_RETRY_IF_STALE) -+ bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); -+ else -+ bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN); -+ return; -+ } -+ -+ if (rbio->narrow_crcs || -+ crc_is_compressed(rbio->pick.crc) || -+ bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) -+ context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq; -+ else if (rbio->pick.crc.csum_type) -+ context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq; -+ -+ bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); -+} -+ -+int __bch2_read_indirect_extent(struct btree_trans *trans, -+ unsigned *offset_into_extent, -+ struct bkey_on_stack *orig_k) -+{ -+ struct 
btree_iter *iter; -+ struct bkey_s_c k; -+ u64 reflink_offset; -+ int ret; -+ -+ reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) + -+ *offset_into_extent; -+ -+ iter = bch2_trans_get_iter(trans, BTREE_ID_REFLINK, -+ POS(0, reflink_offset), -+ BTREE_ITER_SLOTS); -+ ret = PTR_ERR_OR_ZERO(iter); -+ if (ret) -+ return ret; -+ -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ if (k.k->type != KEY_TYPE_reflink_v) { -+ __bcache_io_error(trans->c, -+ "pointer to nonexistent indirect extent"); -+ ret = -EIO; -+ goto err; -+ } -+ -+ *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); -+ bkey_on_stack_reassemble(orig_k, trans->c, k); -+err: -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, -+ struct bvec_iter iter, struct bkey_s_c k, -+ unsigned offset_into_extent, -+ struct bch_io_failures *failed, unsigned flags) -+{ -+ struct extent_ptr_decoded pick; -+ struct bch_read_bio *rbio = NULL; -+ struct bch_dev *ca; -+ struct promote_op *promote = NULL; -+ bool bounce = false, read_full = false, narrow_crcs = false; -+ struct bpos pos = bkey_start_pos(k.k); -+ int pick_ret; -+ -+ if (k.k->type == KEY_TYPE_inline_data) { -+ struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k); -+ unsigned bytes = min_t(unsigned, iter.bi_size, -+ bkey_val_bytes(d.k)); -+ -+ swap(iter.bi_size, bytes); -+ memcpy_to_bio(&orig->bio, iter, d.v->data); -+ swap(iter.bi_size, bytes); -+ bio_advance_iter(&orig->bio, &iter, bytes); -+ zero_fill_bio_iter(&orig->bio, iter); -+ goto out_read_done; -+ } -+ -+ pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); -+ -+ /* hole or reservation - just zero fill: */ -+ if (!pick_ret) -+ goto hole; -+ -+ if (pick_ret < 0) { -+ __bcache_io_error(c, "no device to read from"); -+ goto err; -+ } -+ -+ if (pick_ret > 0) -+ ca = bch_dev_bkey_exists(c, pick.ptr.dev); -+ -+ if (flags & BCH_READ_NODECODE) { -+ /* 
-+ * can happen if we retry, and the extent we were going to read -+ * has been merged in the meantime: -+ */ -+ if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) -+ goto hole; -+ -+ iter.bi_size = pick.crc.compressed_size << 9; -+ goto get_bio; -+ } -+ -+ if (!(flags & BCH_READ_LAST_FRAGMENT) || -+ bio_flagged(&orig->bio, BIO_CHAIN)) -+ flags |= BCH_READ_MUST_CLONE; -+ -+ narrow_crcs = !(flags & BCH_READ_IN_RETRY) && -+ bch2_can_narrow_extent_crcs(k, pick.crc); -+ -+ if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) -+ flags |= BCH_READ_MUST_BOUNCE; -+ -+ EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); -+ -+ if (crc_is_compressed(pick.crc) || -+ (pick.crc.csum_type != BCH_CSUM_NONE && -+ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || -+ (bch2_csum_type_is_encryption(pick.crc.csum_type) && -+ (flags & BCH_READ_USER_MAPPED)) || -+ (flags & BCH_READ_MUST_BOUNCE)))) { -+ read_full = true; -+ bounce = true; -+ } -+ -+ if (orig->opts.promote_target) -+ promote = promote_alloc(c, iter, k, &pick, orig->opts, flags, -+ &rbio, &bounce, &read_full); -+ -+ if (!read_full) { -+ EBUG_ON(crc_is_compressed(pick.crc)); -+ EBUG_ON(pick.crc.csum_type && -+ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || -+ bvec_iter_sectors(iter) != pick.crc.live_size || -+ pick.crc.offset || -+ offset_into_extent)); -+ -+ pos.offset += offset_into_extent; -+ pick.ptr.offset += pick.crc.offset + -+ offset_into_extent; -+ offset_into_extent = 0; -+ pick.crc.compressed_size = bvec_iter_sectors(iter); -+ pick.crc.uncompressed_size = bvec_iter_sectors(iter); -+ pick.crc.offset = 0; -+ pick.crc.live_size = bvec_iter_sectors(iter); -+ offset_into_extent = 0; -+ } -+get_bio: -+ if (rbio) { -+ /* -+ * promote already allocated bounce rbio: -+ * promote needs to allocate a bio big enough for uncompressing -+ * data in the write path, but we're not going to use it all -+ * here: -+ */ -+ EBUG_ON(rbio->bio.bi_iter.bi_size < -+ pick.crc.compressed_size << 
9); -+ rbio->bio.bi_iter.bi_size = -+ pick.crc.compressed_size << 9; -+ } else if (bounce) { -+ unsigned sectors = pick.crc.compressed_size; -+ -+ rbio = rbio_init(bio_alloc_bioset(GFP_NOIO, -+ DIV_ROUND_UP(sectors, PAGE_SECTORS), -+ &c->bio_read_split), -+ orig->opts); -+ -+ bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); -+ rbio->bounce = true; -+ rbio->split = true; -+ } else if (flags & BCH_READ_MUST_CLONE) { -+ /* -+ * Have to clone if there were any splits, due to error -+ * reporting issues (if a split errored, and retrying didn't -+ * work, when it reports the error to its parent (us) we don't -+ * know if the error was from our bio, and we should retry, or -+ * from the whole bio, in which case we don't want to retry and -+ * lose the error) -+ */ -+ rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO, -+ &c->bio_read_split), -+ orig->opts); -+ rbio->bio.bi_iter = iter; -+ rbio->split = true; -+ } else { -+ rbio = orig; -+ rbio->bio.bi_iter = iter; -+ EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); -+ } -+ -+ EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); -+ -+ rbio->c = c; -+ rbio->submit_time = local_clock(); -+ if (rbio->split) -+ rbio->parent = orig; -+ else -+ rbio->end_io = orig->bio.bi_end_io; -+ rbio->bvec_iter = iter; -+ rbio->offset_into_extent= offset_into_extent; -+ rbio->flags = flags; -+ rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ); -+ rbio->narrow_crcs = narrow_crcs; -+ rbio->hole = 0; -+ rbio->retry = 0; -+ rbio->context = 0; -+ /* XXX: only initialize this if needed */ -+ rbio->devs_have = bch2_bkey_devs(k); -+ rbio->pick = pick; -+ rbio->pos = pos; -+ rbio->version = k.k->version; -+ rbio->promote = promote; -+ INIT_WORK(&rbio->work, NULL); -+ -+ rbio->bio.bi_opf = orig->bio.bi_opf; -+ rbio->bio.bi_iter.bi_sector = pick.ptr.offset; -+ rbio->bio.bi_end_io = bch2_read_endio; -+ -+ if (rbio->bounce) -+ trace_read_bounce(&rbio->bio); -+ -+ bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); -+ -+ 
rcu_read_lock(); -+ bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ); -+ rcu_read_unlock(); -+ -+ if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { -+ bio_inc_remaining(&orig->bio); -+ trace_read_split(&orig->bio); -+ } -+ -+ if (!rbio->pick.idx) { -+ if (!rbio->have_ioref) { -+ __bcache_io_error(c, "no device to read from"); -+ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); -+ goto out; -+ } -+ -+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER], -+ bio_sectors(&rbio->bio)); -+ bio_set_dev(&rbio->bio, ca->disk_sb.bdev); -+ -+ if (likely(!(flags & BCH_READ_IN_RETRY))) -+ submit_bio(&rbio->bio); -+ else -+ submit_bio_wait(&rbio->bio); -+ } else { -+ /* Attempting reconstruct read: */ -+ if (bch2_ec_read_extent(c, rbio)) { -+ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); -+ goto out; -+ } -+ -+ if (likely(!(flags & BCH_READ_IN_RETRY))) -+ bio_endio(&rbio->bio); -+ } -+out: -+ if (likely(!(flags & BCH_READ_IN_RETRY))) { -+ return 0; -+ } else { -+ int ret; -+ -+ rbio->context = RBIO_CONTEXT_UNBOUND; -+ bch2_read_endio(&rbio->bio); -+ -+ ret = rbio->retry; -+ rbio = bch2_rbio_free(rbio); -+ -+ if (ret == READ_RETRY_AVOID) { -+ bch2_mark_io_failure(failed, &pick); -+ ret = READ_RETRY; -+ } -+ -+ return ret; -+ } -+ -+err: -+ if (flags & BCH_READ_IN_RETRY) -+ return READ_ERR; -+ -+ orig->bio.bi_status = BLK_STS_IOERR; -+ goto out_read_done; -+ -+hole: -+ /* -+ * won't normally happen in the BCH_READ_NODECODE -+ * (bch2_move_extent()) path, but if we retry and the extent we wanted -+ * to read no longer exists we have to signal that: -+ */ -+ if (flags & BCH_READ_NODECODE) -+ orig->hole = true; -+ -+ zero_fill_bio_iter(&orig->bio, iter); -+out_read_done: -+ if (flags & BCH_READ_LAST_FRAGMENT) -+ bch2_rbio_done(orig); -+ return 0; -+} -+ -+void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_on_stack sk; -+ struct bkey_s_c k; -+ 
unsigned flags = BCH_READ_RETRY_IF_STALE| -+ BCH_READ_MAY_PROMOTE| -+ BCH_READ_USER_MAPPED; -+ int ret; -+ -+ BUG_ON(rbio->_state); -+ BUG_ON(flags & BCH_READ_NODECODE); -+ BUG_ON(flags & BCH_READ_IN_RETRY); -+ -+ rbio->c = c; -+ rbio->start_time = local_clock(); -+ -+ bkey_on_stack_init(&sk); -+ bch2_trans_init(&trans, c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, -+ POS(inode, rbio->bio.bi_iter.bi_sector), -+ BTREE_ITER_SLOTS); -+ while (1) { -+ unsigned bytes, sectors, offset_into_extent; -+ -+ bch2_btree_iter_set_pos(iter, -+ POS(inode, rbio->bio.bi_iter.bi_sector)); -+ -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ offset_into_extent = iter->pos.offset - -+ bkey_start_offset(k.k); -+ sectors = k.k->size - offset_into_extent; -+ -+ bkey_on_stack_reassemble(&sk, c, k); -+ k = bkey_i_to_s_c(sk.k); -+ -+ ret = bch2_read_indirect_extent(&trans, -+ &offset_into_extent, &sk); -+ if (ret) -+ goto err; -+ -+ /* -+ * With indirect extents, the amount of data to read is the min -+ * of the original extent and the indirect extent: -+ */ -+ sectors = min(sectors, k.k->size - offset_into_extent); -+ -+ /* -+ * Unlock the iterator while the btree node's lock is still in -+ * cache, before doing the IO: -+ */ -+ bch2_trans_unlock(&trans); -+ -+ bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; -+ swap(rbio->bio.bi_iter.bi_size, bytes); -+ -+ if (rbio->bio.bi_iter.bi_size == bytes) -+ flags |= BCH_READ_LAST_FRAGMENT; -+ -+ bch2_read_extent(c, rbio, k, offset_into_extent, flags); -+ -+ if (flags & BCH_READ_LAST_FRAGMENT) -+ break; -+ -+ swap(rbio->bio.bi_iter.bi_size, bytes); -+ bio_advance(&rbio->bio, bytes); -+ } -+out: -+ bch2_trans_exit(&trans); -+ bkey_on_stack_exit(&sk, c); -+ return; -+err: -+ if (ret == -EINTR) -+ goto retry; -+ -+ bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret); -+ bch2_rbio_done(rbio); -+ goto out; -+} -+ -+void bch2_fs_io_exit(struct 
bch_fs *c) -+{ -+ if (c->promote_table.tbl) -+ rhashtable_destroy(&c->promote_table); -+ mempool_exit(&c->bio_bounce_pages); -+ bioset_exit(&c->bio_write); -+ bioset_exit(&c->bio_read_split); -+ bioset_exit(&c->bio_read); -+} -+ -+int bch2_fs_io_init(struct bch_fs *c) -+{ -+ if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), -+ BIOSET_NEED_BVECS) || -+ bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), -+ BIOSET_NEED_BVECS) || -+ bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), -+ BIOSET_NEED_BVECS) || -+ mempool_init_page_pool(&c->bio_bounce_pages, -+ max_t(unsigned, -+ c->opts.btree_node_size, -+ c->sb.encoded_extent_max) / -+ PAGE_SECTORS, 0) || -+ rhashtable_init(&c->promote_table, &bch_promote_params)) -+ return -ENOMEM; -+ -+ return 0; -+} -diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h -new file mode 100644 -index 000000000000..0ad293bd6295 ---- /dev/null -+++ b/fs/bcachefs/io.h -@@ -0,0 +1,167 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_IO_H -+#define _BCACHEFS_IO_H -+ -+#include "checksum.h" -+#include "bkey_on_stack.h" -+#include "io_types.h" -+ -+#define to_wbio(_bio) \ -+ container_of((_bio), struct bch_write_bio, bio) -+ -+#define to_rbio(_bio) \ -+ container_of((_bio), struct bch_read_bio, bio) -+ -+void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); -+void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); -+ -+void bch2_latency_acct(struct bch_dev *, u64, int); -+ -+void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, -+ enum bch_data_type, const struct bkey_i *); -+ -+#define BLK_STS_REMOVED ((__force blk_status_t)128) -+ -+enum bch_write_flags { -+ BCH_WRITE_ALLOC_NOWAIT = (1 << 0), -+ BCH_WRITE_CACHED = (1 << 1), -+ BCH_WRITE_FLUSH = (1 << 2), -+ BCH_WRITE_DATA_ENCODED = (1 << 3), -+ BCH_WRITE_PAGES_STABLE = (1 << 4), -+ BCH_WRITE_PAGES_OWNED = (1 << 5), -+ BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6), -+ 
BCH_WRITE_WROTE_DATA_INLINE = (1 << 7), -+ BCH_WRITE_FROM_INTERNAL = (1 << 8), -+ -+ /* Internal: */ -+ BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 9), -+ BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 10), -+ BCH_WRITE_DONE = (1 << 11), -+}; -+ -+static inline u64 *op_journal_seq(struct bch_write_op *op) -+{ -+ return (op->flags & BCH_WRITE_JOURNAL_SEQ_PTR) -+ ? op->journal_seq_p : &op->journal_seq; -+} -+ -+static inline void op_journal_seq_set(struct bch_write_op *op, u64 *journal_seq) -+{ -+ op->journal_seq_p = journal_seq; -+ op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR; -+} -+ -+static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) -+{ -+ return op->alloc_reserve == RESERVE_MOVINGGC -+ ? op->c->copygc_wq -+ : op->c->wq; -+} -+ -+int bch2_extent_update(struct btree_trans *, struct btree_iter *, -+ struct bkey_i *, struct disk_reservation *, -+ u64 *, u64, s64 *); -+int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, -+ struct bpos, u64 *, s64 *); -+int bch2_fpunch(struct bch_fs *c, u64, u64, u64, u64 *, s64 *); -+ -+int bch2_write_index_default(struct bch_write_op *); -+ -+static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, -+ struct bch_io_opts opts) -+{ -+ op->c = c; -+ op->end_io = NULL; -+ op->flags = 0; -+ op->written = 0; -+ op->error = 0; -+ op->csum_type = bch2_data_checksum_type(c, opts.data_checksum); -+ op->compression_type = bch2_compression_opt_to_type[opts.compression]; -+ op->nr_replicas = 0; -+ op->nr_replicas_required = c->opts.data_replicas_required; -+ op->alloc_reserve = RESERVE_NONE; -+ op->incompressible = 0; -+ op->open_buckets.nr = 0; -+ op->devs_have.nr = 0; -+ op->target = 0; -+ op->opts = opts; -+ op->pos = POS_MAX; -+ op->version = ZERO_VERSION; -+ op->write_point = (struct write_point_specifier) { 0 }; -+ op->res = (struct disk_reservation) { 0 }; -+ op->journal_seq = 0; -+ op->new_i_size = U64_MAX; -+ op->i_sectors_delta = 0; -+ op->index_update_fn = bch2_write_index_default; -+} -+ -+void 
bch2_write(struct closure *); -+ -+static inline struct bch_write_bio *wbio_init(struct bio *bio) -+{ -+ struct bch_write_bio *wbio = to_wbio(bio); -+ -+ memset(wbio, 0, offsetof(struct bch_write_bio, bio)); -+ return wbio; -+} -+ -+struct bch_devs_mask; -+struct cache_promote_op; -+struct extent_ptr_decoded; -+ -+int __bch2_read_indirect_extent(struct btree_trans *, unsigned *, -+ struct bkey_on_stack *); -+ -+static inline int bch2_read_indirect_extent(struct btree_trans *trans, -+ unsigned *offset_into_extent, -+ struct bkey_on_stack *k) -+{ -+ return k->k->k.type == KEY_TYPE_reflink_p -+ ? __bch2_read_indirect_extent(trans, offset_into_extent, k) -+ : 0; -+} -+ -+enum bch_read_flags { -+ BCH_READ_RETRY_IF_STALE = 1 << 0, -+ BCH_READ_MAY_PROMOTE = 1 << 1, -+ BCH_READ_USER_MAPPED = 1 << 2, -+ BCH_READ_NODECODE = 1 << 3, -+ BCH_READ_LAST_FRAGMENT = 1 << 4, -+ -+ /* internal: */ -+ BCH_READ_MUST_BOUNCE = 1 << 5, -+ BCH_READ_MUST_CLONE = 1 << 6, -+ BCH_READ_IN_RETRY = 1 << 7, -+}; -+ -+int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, -+ struct bvec_iter, struct bkey_s_c, unsigned, -+ struct bch_io_failures *, unsigned); -+ -+static inline void bch2_read_extent(struct bch_fs *c, -+ struct bch_read_bio *rbio, -+ struct bkey_s_c k, -+ unsigned offset_into_extent, -+ unsigned flags) -+{ -+ __bch2_read_extent(c, rbio, rbio->bio.bi_iter, k, -+ offset_into_extent, NULL, flags); -+} -+ -+void bch2_read(struct bch_fs *, struct bch_read_bio *, u64); -+ -+static inline struct bch_read_bio *rbio_init(struct bio *bio, -+ struct bch_io_opts opts) -+{ -+ struct bch_read_bio *rbio = to_rbio(bio); -+ -+ rbio->_state = 0; -+ rbio->promote = NULL; -+ rbio->opts = opts; -+ return rbio; -+} -+ -+void bch2_fs_io_exit(struct bch_fs *); -+int bch2_fs_io_init(struct bch_fs *); -+ -+#endif /* _BCACHEFS_IO_H */ -diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h -new file mode 100644 -index 000000000000..684e4c9a5d98 ---- /dev/null -+++ b/fs/bcachefs/io_types.h -@@ 
-0,0 +1,149 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_IO_TYPES_H -+#define _BCACHEFS_IO_TYPES_H -+ -+#include "alloc_types.h" -+#include "btree_types.h" -+#include "buckets_types.h" -+#include "extents_types.h" -+#include "keylist_types.h" -+#include "opts.h" -+#include "super_types.h" -+ -+#include -+#include -+ -+struct bch_read_bio { -+ struct bch_fs *c; -+ u64 start_time; -+ u64 submit_time; -+ -+ /* -+ * Reads will often have to be split, and if the extent being read from -+ * was checksummed or compressed we'll also have to allocate bounce -+ * buffers and copy the data back into the original bio. -+ * -+ * If we didn't have to split, we have to save and restore the original -+ * bi_end_io - @split below indicates which: -+ */ -+ union { -+ struct bch_read_bio *parent; -+ bio_end_io_t *end_io; -+ }; -+ -+ /* -+ * Saved copy of bio->bi_iter, from submission time - allows us to -+ * resubmit on IO error, and also to copy data back to the original bio -+ * when we're bouncing: -+ */ -+ struct bvec_iter bvec_iter; -+ -+ unsigned offset_into_extent; -+ -+ u16 flags; -+ union { -+ struct { -+ u16 bounce:1, -+ split:1, -+ kmalloc:1, -+ have_ioref:1, -+ narrow_crcs:1, -+ hole:1, -+ retry:2, -+ context:2; -+ }; -+ u16 _state; -+ }; -+ -+ struct bch_devs_list devs_have; -+ -+ struct extent_ptr_decoded pick; -+ /* start pos of data we read (may not be pos of data we want) */ -+ struct bpos pos; -+ struct bversion version; -+ -+ struct promote_op *promote; -+ -+ struct bch_io_opts opts; -+ -+ struct work_struct work; -+ -+ struct bio bio; -+}; -+ -+struct bch_write_bio { -+ struct bch_fs *c; -+ struct bch_write_bio *parent; -+ -+ u64 submit_time; -+ -+ struct bch_devs_list failed; -+ u8 order; -+ u8 dev; -+ -+ unsigned split:1, -+ bounce:1, -+ put_bio:1, -+ have_ioref:1, -+ used_mempool:1; -+ -+ struct bio bio; -+}; -+ -+struct bch_write_op { -+ struct closure cl; -+ struct bch_fs *c; -+ void (*end_io)(struct bch_write_op *); -+ u64 start_time; -+ 
-+ unsigned written; /* sectors */ -+ u16 flags; -+ s16 error; /* dio write path expects it to hold -ERESTARTSYS... */ -+ -+ unsigned csum_type:4; -+ unsigned compression_type:4; -+ unsigned nr_replicas:4; -+ unsigned nr_replicas_required:4; -+ unsigned alloc_reserve:3; -+ unsigned incompressible:1; -+ -+ struct bch_devs_list devs_have; -+ u16 target; -+ u16 nonce; -+ struct bch_io_opts opts; -+ -+ struct bpos pos; -+ struct bversion version; -+ -+ /* For BCH_WRITE_DATA_ENCODED: */ -+ struct bch_extent_crc_unpacked crc; -+ -+ struct write_point_specifier write_point; -+ -+ struct disk_reservation res; -+ -+ struct open_buckets open_buckets; -+ -+ /* -+ * If caller wants to flush but hasn't passed us a journal_seq ptr, we -+ * still need to stash the journal_seq somewhere: -+ */ -+ union { -+ u64 *journal_seq_p; -+ u64 journal_seq; -+ }; -+ u64 new_i_size; -+ s64 i_sectors_delta; -+ -+ int (*index_update_fn)(struct bch_write_op *); -+ -+ struct bch_devs_mask failed; -+ -+ struct keylist insert_keys; -+ u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2]; -+ -+ /* Must be last: */ -+ struct bch_write_bio wbio; -+}; -+ -+#endif /* _BCACHEFS_IO_TYPES_H */ -diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c -new file mode 100644 -index 000000000000..b4f7b61ba9ac ---- /dev/null -+++ b/fs/bcachefs/journal.c -@@ -0,0 +1,1254 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * bcachefs journalling code, for btree insertions -+ * -+ * Copyright 2012 Google, Inc. 
-+ */ -+ -+#include "bcachefs.h" -+#include "alloc_foreground.h" -+#include "bkey_methods.h" -+#include "btree_gc.h" -+#include "buckets.h" -+#include "journal.h" -+#include "journal_io.h" -+#include "journal_reclaim.h" -+#include "journal_seq_blacklist.h" -+#include "super-io.h" -+ -+#include -+ -+static bool __journal_entry_is_open(union journal_res_state state) -+{ -+ return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; -+} -+ -+static bool journal_entry_is_open(struct journal *j) -+{ -+ return __journal_entry_is_open(j->reservations); -+} -+ -+static void journal_pin_new_entry(struct journal *j, int count) -+{ -+ struct journal_entry_pin_list *p; -+ -+ /* -+ * The fifo_push() needs to happen at the same time as j->seq is -+ * incremented for journal_last_seq() to be calculated correctly -+ */ -+ atomic64_inc(&j->seq); -+ p = fifo_push_ref(&j->pin); -+ -+ INIT_LIST_HEAD(&p->list); -+ INIT_LIST_HEAD(&p->flushed); -+ atomic_set(&p->count, count); -+ p->devs.nr = 0; -+} -+ -+static void bch2_journal_buf_init(struct journal *j) -+{ -+ struct journal_buf *buf = journal_cur_buf(j); -+ -+ memset(buf->has_inode, 0, sizeof(buf->has_inode)); -+ -+ memset(buf->data, 0, sizeof(*buf->data)); -+ buf->data->seq = cpu_to_le64(journal_cur_seq(j)); -+ buf->data->u64s = 0; -+} -+ -+void bch2_journal_halt(struct journal *j) -+{ -+ union journal_res_state old, new; -+ u64 v = atomic64_read(&j->reservations.counter); -+ -+ do { -+ old.v = new.v = v; -+ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) -+ return; -+ -+ new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL; -+ } while ((v = atomic64_cmpxchg(&j->reservations.counter, -+ old.v, new.v)) != old.v); -+ -+ journal_wake(j); -+ closure_wake_up(&journal_cur_buf(j)->wait); -+} -+ -+/* journal entry close/open: */ -+ -+void __bch2_journal_buf_put(struct journal *j, bool need_write_just_set) -+{ -+ if (!need_write_just_set && -+ test_bit(JOURNAL_NEED_WRITE, &j->flags)) -+ bch2_time_stats_update(j->delay_time, -+ 
j->need_write_time); -+ -+ clear_bit(JOURNAL_NEED_WRITE, &j->flags); -+ -+ closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL); -+} -+ -+/* -+ * Returns true if journal entry is now closed: -+ */ -+static bool __journal_entry_close(struct journal *j) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct journal_buf *buf = journal_cur_buf(j); -+ union journal_res_state old, new; -+ u64 v = atomic64_read(&j->reservations.counter); -+ bool set_need_write = false; -+ unsigned sectors; -+ -+ lockdep_assert_held(&j->lock); -+ -+ do { -+ old.v = new.v = v; -+ if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL) -+ return true; -+ -+ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) { -+ /* this entry will never be written: */ -+ closure_wake_up(&buf->wait); -+ return true; -+ } -+ -+ if (!test_bit(JOURNAL_NEED_WRITE, &j->flags)) { -+ set_bit(JOURNAL_NEED_WRITE, &j->flags); -+ j->need_write_time = local_clock(); -+ set_need_write = true; -+ } -+ -+ if (new.prev_buf_unwritten) -+ return false; -+ -+ new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL; -+ new.idx++; -+ new.prev_buf_unwritten = 1; -+ -+ BUG_ON(journal_state_count(new, new.idx)); -+ } while ((v = atomic64_cmpxchg(&j->reservations.counter, -+ old.v, new.v)) != old.v); -+ -+ buf->data->u64s = cpu_to_le32(old.cur_entry_offset); -+ -+ sectors = vstruct_blocks_plus(buf->data, c->block_bits, -+ buf->u64s_reserved) << c->block_bits; -+ BUG_ON(sectors > buf->sectors); -+ buf->sectors = sectors; -+ -+ bkey_extent_init(&buf->key); -+ -+ /* -+ * We have to set last_seq here, _before_ opening a new journal entry: -+ * -+ * A threads may replace an old pin with a new pin on their current -+ * journal reservation - the expectation being that the journal will -+ * contain either what the old pin protected or what the new pin -+ * protects. 
-+ * -+ * After the old pin is dropped journal_last_seq() won't include the old -+ * pin, so we can only write the updated last_seq on the entry that -+ * contains whatever the new pin protects. -+ * -+ * Restated, we can _not_ update last_seq for a given entry if there -+ * could be a newer entry open with reservations/pins that have been -+ * taken against it. -+ * -+ * Hence, we want update/set last_seq on the current journal entry right -+ * before we open a new one: -+ */ -+ buf->data->last_seq = cpu_to_le64(journal_last_seq(j)); -+ -+ if (journal_entry_empty(buf->data)) -+ clear_bit(JOURNAL_NOT_EMPTY, &j->flags); -+ else -+ set_bit(JOURNAL_NOT_EMPTY, &j->flags); -+ -+ journal_pin_new_entry(j, 1); -+ -+ bch2_journal_buf_init(j); -+ -+ cancel_delayed_work(&j->write_work); -+ -+ bch2_journal_space_available(j); -+ -+ bch2_journal_buf_put(j, old.idx, set_need_write); -+ return true; -+} -+ -+static bool journal_entry_close(struct journal *j) -+{ -+ bool ret; -+ -+ spin_lock(&j->lock); -+ ret = __journal_entry_close(j); -+ spin_unlock(&j->lock); -+ -+ return ret; -+} -+ -+/* -+ * should _only_ called from journal_res_get() - when we actually want a -+ * journal reservation - journal entry is open means journal is dirty: -+ * -+ * returns: -+ * 0: success -+ * -ENOSPC: journal currently full, must invoke reclaim -+ * -EAGAIN: journal blocked, must wait -+ * -EROFS: insufficient rw devices or journal error -+ */ -+static int journal_entry_open(struct journal *j) -+{ -+ struct journal_buf *buf = journal_cur_buf(j); -+ union journal_res_state old, new; -+ int u64s; -+ u64 v; -+ -+ lockdep_assert_held(&j->lock); -+ BUG_ON(journal_entry_is_open(j)); -+ -+ if (j->blocked) -+ return -EAGAIN; -+ -+ if (j->cur_entry_error) -+ return j->cur_entry_error; -+ -+ BUG_ON(!j->cur_entry_sectors); -+ -+ buf->u64s_reserved = j->entry_u64s_reserved; -+ buf->disk_sectors = j->cur_entry_sectors; -+ buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9); -+ -+ u64s = (int) 
(buf->sectors << 9) / sizeof(u64) - -+ journal_entry_overhead(j); -+ u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); -+ -+ if (u64s <= le32_to_cpu(buf->data->u64s)) -+ return -ENOSPC; -+ -+ /* -+ * Must be set before marking the journal entry as open: -+ */ -+ j->cur_entry_u64s = u64s; -+ -+ v = atomic64_read(&j->reservations.counter); -+ do { -+ old.v = new.v = v; -+ -+ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) -+ return -EROFS; -+ -+ /* Handle any already added entries */ -+ new.cur_entry_offset = le32_to_cpu(buf->data->u64s); -+ -+ EBUG_ON(journal_state_count(new, new.idx)); -+ journal_state_inc(&new); -+ } while ((v = atomic64_cmpxchg(&j->reservations.counter, -+ old.v, new.v)) != old.v); -+ -+ if (j->res_get_blocked_start) -+ bch2_time_stats_update(j->blocked_time, -+ j->res_get_blocked_start); -+ j->res_get_blocked_start = 0; -+ -+ mod_delayed_work(system_freezable_wq, -+ &j->write_work, -+ msecs_to_jiffies(j->write_delay_ms)); -+ journal_wake(j); -+ return 0; -+} -+ -+static bool journal_quiesced(struct journal *j) -+{ -+ union journal_res_state state = READ_ONCE(j->reservations); -+ bool ret = !state.prev_buf_unwritten && !__journal_entry_is_open(state); -+ -+ if (!ret) -+ journal_entry_close(j); -+ return ret; -+} -+ -+static void journal_quiesce(struct journal *j) -+{ -+ wait_event(j->wait, journal_quiesced(j)); -+} -+ -+static void journal_write_work(struct work_struct *work) -+{ -+ struct journal *j = container_of(work, struct journal, write_work.work); -+ -+ journal_entry_close(j); -+} -+ -+/* -+ * Given an inode number, if that inode number has data in the journal that -+ * hasn't yet been flushed, return the journal sequence number that needs to be -+ * flushed: -+ */ -+u64 bch2_inode_journal_seq(struct journal *j, u64 inode) -+{ -+ size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8)); -+ u64 seq = 0; -+ -+ if (!test_bit(h, j->buf[0].has_inode) && -+ !test_bit(h, j->buf[1].has_inode)) -+ return 0; -+ -+ 
spin_lock(&j->lock); -+ if (test_bit(h, journal_cur_buf(j)->has_inode)) -+ seq = journal_cur_seq(j); -+ else if (test_bit(h, journal_prev_buf(j)->has_inode)) -+ seq = journal_cur_seq(j) - 1; -+ spin_unlock(&j->lock); -+ -+ return seq; -+} -+ -+static int __journal_res_get(struct journal *j, struct journal_res *res, -+ unsigned flags) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct journal_buf *buf; -+ bool can_discard; -+ int ret; -+retry: -+ if (journal_res_get_fast(j, res, flags)) -+ return 0; -+ -+ if (bch2_journal_error(j)) -+ return -EROFS; -+ -+ spin_lock(&j->lock); -+ -+ /* -+ * Recheck after taking the lock, so we don't race with another thread -+ * that just did journal_entry_open() and call journal_entry_close() -+ * unnecessarily -+ */ -+ if (journal_res_get_fast(j, res, flags)) { -+ spin_unlock(&j->lock); -+ return 0; -+ } -+ -+ if (!(flags & JOURNAL_RES_GET_RESERVED) && -+ !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { -+ /* -+ * Don't want to close current journal entry, just need to -+ * invoke reclaim: -+ */ -+ ret = -ENOSPC; -+ goto unlock; -+ } -+ -+ /* -+ * If we couldn't get a reservation because the current buf filled up, -+ * and we had room for a bigger entry on disk, signal that we want to -+ * realloc the journal bufs: -+ */ -+ buf = journal_cur_buf(j); -+ if (journal_entry_is_open(j) && -+ buf->buf_size >> 9 < buf->disk_sectors && -+ buf->buf_size < JOURNAL_ENTRY_SIZE_MAX) -+ j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); -+ -+ if (journal_entry_is_open(j) && -+ !__journal_entry_close(j)) { -+ /* -+ * We failed to get a reservation on the current open journal -+ * entry because it's full, and we can't close it because -+ * there's still a previous one in flight: -+ */ -+ trace_journal_entry_full(c); -+ ret = -EAGAIN; -+ } else { -+ ret = journal_entry_open(j); -+ } -+unlock: -+ if ((ret == -EAGAIN || ret == -ENOSPC) && -+ !j->res_get_blocked_start) -+ j->res_get_blocked_start = local_clock() 
?: 1; -+ -+ can_discard = j->can_discard; -+ spin_unlock(&j->lock); -+ -+ if (!ret) -+ goto retry; -+ -+ if (ret == -ENOSPC) { -+ WARN_ONCE(!can_discard && (flags & JOURNAL_RES_GET_RESERVED), -+ "JOURNAL_RES_GET_RESERVED set but journal full"); -+ -+ /* -+ * Journal is full - can't rely on reclaim from work item due to -+ * freezing: -+ */ -+ trace_journal_full(c); -+ -+ if (!(flags & JOURNAL_RES_GET_NONBLOCK)) { -+ if (can_discard) { -+ bch2_journal_do_discards(j); -+ goto retry; -+ } -+ -+ if (mutex_trylock(&j->reclaim_lock)) { -+ bch2_journal_reclaim(j); -+ mutex_unlock(&j->reclaim_lock); -+ } -+ } -+ -+ ret = -EAGAIN; -+ } -+ -+ return ret; -+} -+ -+/* -+ * Essentially the entry function to the journaling code. When bcachefs is doing -+ * a btree insert, it calls this function to get the current journal write. -+ * Journal write is the structure used set up journal writes. The calling -+ * function will then add its keys to the structure, queuing them for the next -+ * write. -+ * -+ * To ensure forward progress, the current task must not be holding any -+ * btree node write locks. 
-+ */ -+int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, -+ unsigned flags) -+{ -+ int ret; -+ -+ closure_wait_event(&j->async_wait, -+ (ret = __journal_res_get(j, res, flags)) != -EAGAIN || -+ (flags & JOURNAL_RES_GET_NONBLOCK)); -+ return ret; -+} -+ -+/* journal_preres: */ -+ -+static bool journal_preres_available(struct journal *j, -+ struct journal_preres *res, -+ unsigned new_u64s, -+ unsigned flags) -+{ -+ bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags); -+ -+ if (!ret) -+ bch2_journal_reclaim_work(&j->reclaim_work.work); -+ -+ return ret; -+} -+ -+int __bch2_journal_preres_get(struct journal *j, -+ struct journal_preres *res, -+ unsigned new_u64s, -+ unsigned flags) -+{ -+ int ret; -+ -+ closure_wait_event(&j->preres_wait, -+ (ret = bch2_journal_error(j)) || -+ journal_preres_available(j, res, new_u64s, flags)); -+ return ret; -+} -+ -+/* journal_entry_res: */ -+ -+void bch2_journal_entry_res_resize(struct journal *j, -+ struct journal_entry_res *res, -+ unsigned new_u64s) -+{ -+ union journal_res_state state; -+ int d = new_u64s - res->u64s; -+ -+ spin_lock(&j->lock); -+ -+ j->entry_u64s_reserved += d; -+ if (d <= 0) -+ goto out; -+ -+ j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d); -+ smp_mb(); -+ state = READ_ONCE(j->reservations); -+ -+ if (state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL && -+ state.cur_entry_offset > j->cur_entry_u64s) { -+ j->cur_entry_u64s += d; -+ /* -+ * Not enough room in current journal entry, have to flush it: -+ */ -+ __journal_entry_close(j); -+ } else { -+ journal_cur_buf(j)->u64s_reserved += d; -+ } -+out: -+ spin_unlock(&j->lock); -+ res->u64s += d; -+} -+ -+/* journal flushing: */ -+ -+u64 bch2_journal_last_unwritten_seq(struct journal *j) -+{ -+ u64 seq; -+ -+ spin_lock(&j->lock); -+ seq = journal_cur_seq(j); -+ if (j->reservations.prev_buf_unwritten) -+ seq--; -+ spin_unlock(&j->lock); -+ -+ return seq; -+} -+ -+/** -+ * bch2_journal_open_seq_async - try to open a 
new journal entry if @seq isn't -+ * open yet, or wait if we cannot -+ * -+ * used by the btree interior update machinery, when it needs to write a new -+ * btree root - every journal entry contains the roots of all the btrees, so it -+ * doesn't need to bother with getting a journal reservation -+ */ -+int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *cl) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ int ret; -+ -+ spin_lock(&j->lock); -+ -+ /* -+ * Can't try to open more than one sequence number ahead: -+ */ -+ BUG_ON(journal_cur_seq(j) < seq && !journal_entry_is_open(j)); -+ -+ if (journal_cur_seq(j) > seq || -+ journal_entry_is_open(j)) { -+ spin_unlock(&j->lock); -+ return 0; -+ } -+ -+ if (journal_cur_seq(j) < seq && -+ !__journal_entry_close(j)) { -+ /* haven't finished writing out the previous one: */ -+ trace_journal_entry_full(c); -+ ret = -EAGAIN; -+ } else { -+ BUG_ON(journal_cur_seq(j) != seq); -+ -+ ret = journal_entry_open(j); -+ } -+ -+ if ((ret == -EAGAIN || ret == -ENOSPC) && -+ !j->res_get_blocked_start) -+ j->res_get_blocked_start = local_clock() ?: 1; -+ -+ if (ret == -EAGAIN || ret == -ENOSPC) -+ closure_wait(&j->async_wait, cl); -+ -+ spin_unlock(&j->lock); -+ -+ if (ret == -ENOSPC) { -+ trace_journal_full(c); -+ bch2_journal_reclaim_work(&j->reclaim_work.work); -+ ret = -EAGAIN; -+ } -+ -+ return ret; -+} -+ -+static int journal_seq_error(struct journal *j, u64 seq) -+{ -+ union journal_res_state state = READ_ONCE(j->reservations); -+ -+ if (seq == journal_cur_seq(j)) -+ return bch2_journal_error(j); -+ -+ if (seq + 1 == journal_cur_seq(j) && -+ !state.prev_buf_unwritten && -+ seq > j->seq_ondisk) -+ return -EIO; -+ -+ return 0; -+} -+ -+static inline struct journal_buf * -+journal_seq_to_buf(struct journal *j, u64 seq) -+{ -+ /* seq should be for a journal entry that has been opened: */ -+ BUG_ON(seq > journal_cur_seq(j)); -+ BUG_ON(seq == journal_cur_seq(j) && -+ 
j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL); -+ -+ if (seq == journal_cur_seq(j)) -+ return journal_cur_buf(j); -+ if (seq + 1 == journal_cur_seq(j) && -+ j->reservations.prev_buf_unwritten) -+ return journal_prev_buf(j); -+ return NULL; -+} -+ -+/** -+ * bch2_journal_wait_on_seq - wait for a journal entry to be written -+ * -+ * does _not_ cause @seq to be written immediately - if there is no other -+ * activity to cause the relevant journal entry to be filled up or flushed it -+ * can wait for an arbitrary amount of time (up to @j->write_delay_ms, which is -+ * configurable). -+ */ -+void bch2_journal_wait_on_seq(struct journal *j, u64 seq, -+ struct closure *parent) -+{ -+ struct journal_buf *buf; -+ -+ spin_lock(&j->lock); -+ -+ if ((buf = journal_seq_to_buf(j, seq))) { -+ if (!closure_wait(&buf->wait, parent)) -+ BUG(); -+ -+ if (seq == journal_cur_seq(j)) { -+ smp_mb(); -+ if (bch2_journal_error(j)) -+ closure_wake_up(&buf->wait); -+ } -+ } -+ -+ spin_unlock(&j->lock); -+} -+ -+/** -+ * bch2_journal_flush_seq_async - wait for a journal entry to be written -+ * -+ * like bch2_journal_wait_on_seq, except that it triggers a write immediately if -+ * necessary -+ */ -+void bch2_journal_flush_seq_async(struct journal *j, u64 seq, -+ struct closure *parent) -+{ -+ struct journal_buf *buf; -+ -+ spin_lock(&j->lock); -+ -+ if (parent && -+ (buf = journal_seq_to_buf(j, seq))) -+ if (!closure_wait(&buf->wait, parent)) -+ BUG(); -+ -+ if (seq == journal_cur_seq(j)) -+ __journal_entry_close(j); -+ spin_unlock(&j->lock); -+} -+ -+static int journal_seq_flushed(struct journal *j, u64 seq) -+{ -+ int ret; -+ -+ spin_lock(&j->lock); -+ ret = seq <= j->seq_ondisk ? 
1 : journal_seq_error(j, seq); -+ -+ if (seq == journal_cur_seq(j)) -+ __journal_entry_close(j); -+ spin_unlock(&j->lock); -+ -+ return ret; -+} -+ -+int bch2_journal_flush_seq(struct journal *j, u64 seq) -+{ -+ u64 start_time = local_clock(); -+ int ret, ret2; -+ -+ ret = wait_event_killable(j->wait, (ret2 = journal_seq_flushed(j, seq))); -+ -+ bch2_time_stats_update(j->flush_seq_time, start_time); -+ -+ return ret ?: ret2 < 0 ? ret2 : 0; -+} -+ -+/** -+ * bch2_journal_meta_async - force a journal entry to be written -+ */ -+void bch2_journal_meta_async(struct journal *j, struct closure *parent) -+{ -+ struct journal_res res; -+ -+ memset(&res, 0, sizeof(res)); -+ -+ bch2_journal_res_get(j, &res, jset_u64s(0), 0); -+ bch2_journal_res_put(j, &res); -+ -+ bch2_journal_flush_seq_async(j, res.seq, parent); -+} -+ -+int bch2_journal_meta(struct journal *j) -+{ -+ struct journal_res res; -+ int ret; -+ -+ memset(&res, 0, sizeof(res)); -+ -+ ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); -+ if (ret) -+ return ret; -+ -+ bch2_journal_res_put(j, &res); -+ -+ return bch2_journal_flush_seq(j, res.seq); -+} -+ -+/* -+ * bch2_journal_flush_async - if there is an open journal entry, or a journal -+ * still being written, write it and wait for the write to complete -+ */ -+void bch2_journal_flush_async(struct journal *j, struct closure *parent) -+{ -+ u64 seq, journal_seq; -+ -+ spin_lock(&j->lock); -+ journal_seq = journal_cur_seq(j); -+ -+ if (journal_entry_is_open(j)) { -+ seq = journal_seq; -+ } else if (journal_seq) { -+ seq = journal_seq - 1; -+ } else { -+ spin_unlock(&j->lock); -+ return; -+ } -+ spin_unlock(&j->lock); -+ -+ bch2_journal_flush_seq_async(j, seq, parent); -+} -+ -+int bch2_journal_flush(struct journal *j) -+{ -+ u64 seq, journal_seq; -+ -+ spin_lock(&j->lock); -+ journal_seq = journal_cur_seq(j); -+ -+ if (journal_entry_is_open(j)) { -+ seq = journal_seq; -+ } else if (journal_seq) { -+ seq = journal_seq - 1; -+ } else { -+ spin_unlock(&j->lock); -+ 
return 0; -+ } -+ spin_unlock(&j->lock); -+ -+ return bch2_journal_flush_seq(j, seq); -+} -+ -+/* block/unlock the journal: */ -+ -+void bch2_journal_unblock(struct journal *j) -+{ -+ spin_lock(&j->lock); -+ j->blocked--; -+ spin_unlock(&j->lock); -+ -+ journal_wake(j); -+} -+ -+void bch2_journal_block(struct journal *j) -+{ -+ spin_lock(&j->lock); -+ j->blocked++; -+ spin_unlock(&j->lock); -+ -+ journal_quiesce(j); -+} -+ -+/* allocate journal on a device: */ -+ -+static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, -+ bool new_fs, struct closure *cl) -+{ -+ struct bch_fs *c = ca->fs; -+ struct journal_device *ja = &ca->journal; -+ struct bch_sb_field_journal *journal_buckets; -+ u64 *new_bucket_seq = NULL, *new_buckets = NULL; -+ int ret = 0; -+ -+ /* don't handle reducing nr of buckets yet: */ -+ if (nr <= ja->nr) -+ return 0; -+ -+ ret = -ENOMEM; -+ new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL); -+ new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL); -+ if (!new_buckets || !new_bucket_seq) -+ goto err; -+ -+ journal_buckets = bch2_sb_resize_journal(&ca->disk_sb, -+ nr + sizeof(*journal_buckets) / sizeof(u64)); -+ if (!journal_buckets) -+ goto err; -+ -+ /* -+ * We may be called from the device add path, before the new device has -+ * actually been added to the running filesystem: -+ */ -+ if (c) -+ spin_lock(&c->journal.lock); -+ -+ memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); -+ memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64)); -+ swap(new_buckets, ja->buckets); -+ swap(new_bucket_seq, ja->bucket_seq); -+ -+ if (c) -+ spin_unlock(&c->journal.lock); -+ -+ while (ja->nr < nr) { -+ struct open_bucket *ob = NULL; -+ unsigned pos; -+ long bucket; -+ -+ if (new_fs) { -+ bucket = bch2_bucket_alloc_new_fs(ca); -+ if (bucket < 0) { -+ ret = -ENOSPC; -+ goto err; -+ } -+ } else { -+ ob = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, -+ false, cl); -+ if (IS_ERR(ob)) { -+ ret = cl ? 
-EAGAIN : -ENOSPC; -+ goto err; -+ } -+ -+ bucket = sector_to_bucket(ca, ob->ptr.offset); -+ } -+ -+ if (c) { -+ percpu_down_read(&c->mark_lock); -+ spin_lock(&c->journal.lock); -+ } -+ -+ pos = ja->nr ? (ja->cur_idx + 1) % ja->nr : 0; -+ __array_insert_item(ja->buckets, ja->nr, pos); -+ __array_insert_item(ja->bucket_seq, ja->nr, pos); -+ __array_insert_item(journal_buckets->buckets, ja->nr, pos); -+ ja->nr++; -+ -+ ja->buckets[pos] = bucket; -+ ja->bucket_seq[pos] = 0; -+ journal_buckets->buckets[pos] = cpu_to_le64(bucket); -+ -+ if (pos <= ja->discard_idx) -+ ja->discard_idx = (ja->discard_idx + 1) % ja->nr; -+ if (pos <= ja->dirty_idx_ondisk) -+ ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; -+ if (pos <= ja->dirty_idx) -+ ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; -+ if (pos <= ja->cur_idx) -+ ja->cur_idx = (ja->cur_idx + 1) % ja->nr; -+ -+ bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL, -+ ca->mi.bucket_size, -+ gc_phase(GC_PHASE_SB), -+ 0); -+ -+ if (c) { -+ spin_unlock(&c->journal.lock); -+ percpu_up_read(&c->mark_lock); -+ } -+ -+ if (!new_fs) -+ bch2_open_bucket_put(c, ob); -+ } -+ -+ ret = 0; -+err: -+ kfree(new_bucket_seq); -+ kfree(new_buckets); -+ -+ return ret; -+} -+ -+/* -+ * Allocate more journal space at runtime - not currently making use if it, but -+ * the code works: -+ */ -+int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, -+ unsigned nr) -+{ -+ struct journal_device *ja = &ca->journal; -+ struct closure cl; -+ unsigned current_nr; -+ int ret; -+ -+ closure_init_stack(&cl); -+ -+ do { -+ struct disk_reservation disk_res = { 0, 0 }; -+ -+ closure_sync(&cl); -+ -+ mutex_lock(&c->sb_lock); -+ current_nr = ja->nr; -+ -+ /* -+ * note: journal buckets aren't really counted as _sectors_ used yet, so -+ * we don't need the disk reservation to avoid the BUG_ON() in buckets.c -+ * when space used goes up without a reservation - but we do need the -+ * reservation to ensure we'll actually be able to 
allocate: -+ */ -+ -+ if (bch2_disk_reservation_get(c, &disk_res, -+ bucket_to_sector(ca, nr - ja->nr), 1, 0)) { -+ mutex_unlock(&c->sb_lock); -+ return -ENOSPC; -+ } -+ -+ ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl); -+ -+ bch2_disk_reservation_put(c, &disk_res); -+ -+ if (ja->nr != current_nr) -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ } while (ret == -EAGAIN); -+ -+ return ret; -+} -+ -+int bch2_dev_journal_alloc(struct bch_dev *ca) -+{ -+ unsigned nr; -+ -+ if (dynamic_fault("bcachefs:add:journal_alloc")) -+ return -ENOMEM; -+ -+ /* -+ * clamp journal size to 1024 buckets or 512MB (in sectors), whichever -+ * is smaller: -+ */ -+ nr = clamp_t(unsigned, ca->mi.nbuckets >> 8, -+ BCH_JOURNAL_BUCKETS_MIN, -+ min(1 << 10, -+ (1 << 20) / ca->mi.bucket_size)); -+ -+ return __bch2_set_nr_journal_buckets(ca, nr, true, NULL); -+} -+ -+/* startup/shutdown: */ -+ -+static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) -+{ -+ union journal_res_state state; -+ struct journal_buf *w; -+ bool ret; -+ -+ spin_lock(&j->lock); -+ state = READ_ONCE(j->reservations); -+ w = j->buf + !state.idx; -+ -+ ret = state.prev_buf_unwritten && -+ bch2_bkey_has_device(bkey_i_to_s_c(&w->key), dev_idx); -+ spin_unlock(&j->lock); -+ -+ return ret; -+} -+ -+void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca) -+{ -+ wait_event(j->wait, !bch2_journal_writing_to_device(j, ca->dev_idx)); -+} -+ -+void bch2_fs_journal_stop(struct journal *j) -+{ -+ bch2_journal_flush_all_pins(j); -+ -+ wait_event(j->wait, journal_entry_close(j)); -+ -+ /* do we need to write another journal entry? 
*/ -+ if (test_bit(JOURNAL_NOT_EMPTY, &j->flags)) -+ bch2_journal_meta(j); -+ -+ journal_quiesce(j); -+ -+ BUG_ON(!bch2_journal_error(j) && -+ test_bit(JOURNAL_NOT_EMPTY, &j->flags)); -+ -+ cancel_delayed_work_sync(&j->write_work); -+ cancel_delayed_work_sync(&j->reclaim_work); -+} -+ -+int bch2_fs_journal_start(struct journal *j, u64 cur_seq, -+ struct list_head *journal_entries) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct journal_entry_pin_list *p; -+ struct journal_replay *i; -+ u64 last_seq = cur_seq, nr, seq; -+ -+ if (!list_empty(journal_entries)) -+ last_seq = le64_to_cpu(list_last_entry(journal_entries, -+ struct journal_replay, list)->j.last_seq); -+ -+ nr = cur_seq - last_seq; -+ -+ if (nr + 1 > j->pin.size) { -+ free_fifo(&j->pin); -+ init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL); -+ if (!j->pin.data) { -+ bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); -+ return -ENOMEM; -+ } -+ } -+ -+ j->replay_journal_seq = last_seq; -+ j->replay_journal_seq_end = cur_seq; -+ j->last_seq_ondisk = last_seq; -+ j->pin.front = last_seq; -+ j->pin.back = cur_seq; -+ atomic64_set(&j->seq, cur_seq - 1); -+ -+ fifo_for_each_entry_ptr(p, &j->pin, seq) { -+ INIT_LIST_HEAD(&p->list); -+ INIT_LIST_HEAD(&p->flushed); -+ atomic_set(&p->count, 1); -+ p->devs.nr = 0; -+ } -+ -+ list_for_each_entry(i, journal_entries, list) { -+ seq = le64_to_cpu(i->j.seq); -+ BUG_ON(seq >= cur_seq); -+ -+ if (seq < last_seq) -+ continue; -+ -+ journal_seq_pin(j, seq)->devs = i->devs; -+ } -+ -+ spin_lock(&j->lock); -+ -+ set_bit(JOURNAL_STARTED, &j->flags); -+ -+ journal_pin_new_entry(j, 1); -+ bch2_journal_buf_init(j); -+ -+ c->last_bucket_seq_cleanup = journal_cur_seq(j); -+ -+ bch2_journal_space_available(j); -+ spin_unlock(&j->lock); -+ -+ return 0; -+} -+ -+/* init/exit: */ -+ -+void bch2_dev_journal_exit(struct bch_dev *ca) -+{ -+ kfree(ca->journal.bio); -+ kfree(ca->journal.buckets); -+ kfree(ca->journal.bucket_seq); -+ -+ 
ca->journal.bio = NULL; -+ ca->journal.buckets = NULL; -+ ca->journal.bucket_seq = NULL; -+} -+ -+int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) -+{ -+ struct journal_device *ja = &ca->journal; -+ struct bch_sb_field_journal *journal_buckets = -+ bch2_sb_get_journal(sb); -+ unsigned i; -+ -+ ja->nr = bch2_nr_journal_buckets(journal_buckets); -+ -+ ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); -+ if (!ja->bucket_seq) -+ return -ENOMEM; -+ -+ ca->journal.bio = bio_kmalloc(GFP_KERNEL, -+ DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE)); -+ if (!ca->journal.bio) -+ return -ENOMEM; -+ -+ ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); -+ if (!ja->buckets) -+ return -ENOMEM; -+ -+ for (i = 0; i < ja->nr; i++) -+ ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]); -+ -+ return 0; -+} -+ -+void bch2_fs_journal_exit(struct journal *j) -+{ -+ kvpfree(j->buf[1].data, j->buf[1].buf_size); -+ kvpfree(j->buf[0].data, j->buf[0].buf_size); -+ free_fifo(&j->pin); -+} -+ -+int bch2_fs_journal_init(struct journal *j) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ static struct lock_class_key res_key; -+ int ret = 0; -+ -+ pr_verbose_init(c->opts, ""); -+ -+ spin_lock_init(&j->lock); -+ spin_lock_init(&j->err_lock); -+ init_waitqueue_head(&j->wait); -+ INIT_DELAYED_WORK(&j->write_work, journal_write_work); -+ INIT_DELAYED_WORK(&j->reclaim_work, bch2_journal_reclaim_work); -+ init_waitqueue_head(&j->pin_flush_wait); -+ mutex_init(&j->reclaim_lock); -+ mutex_init(&j->discard_lock); -+ -+ lockdep_init_map(&j->res_map, "journal res", &res_key, 0); -+ -+ j->buf[0].buf_size = JOURNAL_ENTRY_SIZE_MIN; -+ j->buf[1].buf_size = JOURNAL_ENTRY_SIZE_MIN; -+ j->write_delay_ms = 1000; -+ j->reclaim_delay_ms = 100; -+ -+ /* Btree roots: */ -+ j->entry_u64s_reserved += -+ BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX); -+ -+ atomic64_set(&j->reservations.counter, -+ ((union journal_res_state) -+ { .cur_entry_offset = 
JOURNAL_ENTRY_CLOSED_VAL }).v); -+ -+ if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || -+ !(j->buf[0].data = kvpmalloc(j->buf[0].buf_size, GFP_KERNEL)) || -+ !(j->buf[1].data = kvpmalloc(j->buf[1].buf_size, GFP_KERNEL))) { -+ ret = -ENOMEM; -+ goto out; -+ } -+ -+ j->pin.front = j->pin.back = 1; -+out: -+ pr_verbose_init(c->opts, "ret %i", ret); -+ return ret; -+} -+ -+/* debug: */ -+ -+ssize_t bch2_journal_print_debug(struct journal *j, char *buf) -+{ -+ struct printbuf out = _PBUF(buf, PAGE_SIZE); -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ union journal_res_state s; -+ struct bch_dev *ca; -+ unsigned iter; -+ -+ rcu_read_lock(); -+ spin_lock(&j->lock); -+ s = READ_ONCE(j->reservations); -+ -+ pr_buf(&out, -+ "active journal entries:\t%llu\n" -+ "seq:\t\t\t%llu\n" -+ "last_seq:\t\t%llu\n" -+ "last_seq_ondisk:\t%llu\n" -+ "prereserved:\t\t%u/%u\n" -+ "current entry sectors:\t%u\n" -+ "current entry:\t\t", -+ fifo_used(&j->pin), -+ journal_cur_seq(j), -+ journal_last_seq(j), -+ j->last_seq_ondisk, -+ j->prereserved.reserved, -+ j->prereserved.remaining, -+ j->cur_entry_sectors); -+ -+ switch (s.cur_entry_offset) { -+ case JOURNAL_ENTRY_ERROR_VAL: -+ pr_buf(&out, "error\n"); -+ break; -+ case JOURNAL_ENTRY_CLOSED_VAL: -+ pr_buf(&out, "closed\n"); -+ break; -+ default: -+ pr_buf(&out, "%u/%u\n", -+ s.cur_entry_offset, -+ j->cur_entry_u64s); -+ break; -+ } -+ -+ pr_buf(&out, -+ "current entry refs:\t%u\n" -+ "prev entry unwritten:\t", -+ journal_state_count(s, s.idx)); -+ -+ if (s.prev_buf_unwritten) -+ pr_buf(&out, "yes, ref %u sectors %u\n", -+ journal_state_count(s, !s.idx), -+ journal_prev_buf(j)->sectors); -+ else -+ pr_buf(&out, "no\n"); -+ -+ pr_buf(&out, -+ "need write:\t\t%i\n" -+ "replay done:\t\t%i\n", -+ test_bit(JOURNAL_NEED_WRITE, &j->flags), -+ test_bit(JOURNAL_REPLAY_DONE, &j->flags)); -+ -+ for_each_member_device_rcu(ca, c, iter, -+ &c->rw_devs[BCH_DATA_JOURNAL]) { -+ struct journal_device *ja = &ca->journal; -+ -+ if 
(!ja->nr) -+ continue; -+ -+ pr_buf(&out, -+ "dev %u:\n" -+ "\tnr\t\t%u\n" -+ "\tavailable\t%u:%u\n" -+ "\tdiscard_idx\t\t%u\n" -+ "\tdirty_idx_ondisk\t%u (seq %llu)\n" -+ "\tdirty_idx\t\t%u (seq %llu)\n" -+ "\tcur_idx\t\t%u (seq %llu)\n", -+ iter, ja->nr, -+ bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), -+ ja->sectors_free, -+ ja->discard_idx, -+ ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk], -+ ja->dirty_idx, ja->bucket_seq[ja->dirty_idx], -+ ja->cur_idx, ja->bucket_seq[ja->cur_idx]); -+ } -+ -+ spin_unlock(&j->lock); -+ rcu_read_unlock(); -+ -+ return out.pos - buf; -+} -+ -+ssize_t bch2_journal_print_pins(struct journal *j, char *buf) -+{ -+ struct printbuf out = _PBUF(buf, PAGE_SIZE); -+ struct journal_entry_pin_list *pin_list; -+ struct journal_entry_pin *pin; -+ u64 i; -+ -+ spin_lock(&j->lock); -+ fifo_for_each_entry_ptr(pin_list, &j->pin, i) { -+ pr_buf(&out, "%llu: count %u\n", -+ i, atomic_read(&pin_list->count)); -+ -+ list_for_each_entry(pin, &pin_list->list, list) -+ pr_buf(&out, "\t%px %ps\n", -+ pin, pin->flush); -+ -+ if (!list_empty(&pin_list->flushed)) -+ pr_buf(&out, "flushed:\n"); -+ -+ list_for_each_entry(pin, &pin_list->flushed, list) -+ pr_buf(&out, "\t%px %ps\n", -+ pin, pin->flush); -+ } -+ spin_unlock(&j->lock); -+ -+ return out.pos - buf; -+} -diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h -new file mode 100644 -index 000000000000..30de6d96188e ---- /dev/null -+++ b/fs/bcachefs/journal.h -@@ -0,0 +1,519 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_JOURNAL_H -+#define _BCACHEFS_JOURNAL_H -+ -+/* -+ * THE JOURNAL: -+ * -+ * The primary purpose of the journal is to log updates (insertions) to the -+ * b-tree, to avoid having to do synchronous updates to the b-tree on disk. 
-+ * -+ * Without the journal, the b-tree is always internally consistent on -+ * disk - and in fact, in the earliest incarnations bcache didn't have a journal -+ * but did handle unclean shutdowns by doing all index updates synchronously -+ * (with coalescing). -+ * -+ * Updates to interior nodes still happen synchronously and without the journal -+ * (for simplicity) - this may change eventually but updates to interior nodes -+ * are rare enough it's not a huge priority. -+ * -+ * This means the journal is relatively separate from the b-tree; it consists of -+ * just a list of keys and journal replay consists of just redoing those -+ * insertions in same order that they appear in the journal. -+ * -+ * PERSISTENCE: -+ * -+ * For synchronous updates (where we're waiting on the index update to hit -+ * disk), the journal entry will be written out immediately (or as soon as -+ * possible, if the write for the previous journal entry was still in flight). -+ * -+ * Synchronous updates are specified by passing a closure (@flush_cl) to -+ * bch2_btree_insert() or bch_btree_insert_node(), which then pass that parameter -+ * down to the journalling code. That closure will will wait on the journal -+ * write to complete (via closure_wait()). -+ * -+ * If the index update wasn't synchronous, the journal entry will be -+ * written out after 10 ms have elapsed, by default (the delay_ms field -+ * in struct journal). -+ * -+ * JOURNAL ENTRIES: -+ * -+ * A journal entry is variable size (struct jset), it's got a fixed length -+ * header and then a variable number of struct jset_entry entries. -+ * -+ * Journal entries are identified by monotonically increasing 64 bit sequence -+ * numbers - jset->seq; other places in the code refer to this sequence number. -+ * -+ * A jset_entry entry contains one or more bkeys (which is what gets inserted -+ * into the b-tree). 
We need a container to indicate which b-tree the key is -+ * for; also, the roots of the various b-trees are stored in jset_entry entries -+ * (one for each b-tree) - this lets us add new b-tree types without changing -+ * the on disk format. -+ * -+ * We also keep some things in the journal header that are logically part of the -+ * superblock - all the things that are frequently updated. This is for future -+ * bcache on raw flash support; the superblock (which will become another -+ * journal) can't be moved or wear leveled, so it contains just enough -+ * information to find the main journal, and the superblock only has to be -+ * rewritten when we want to move/wear level the main journal. -+ * -+ * JOURNAL LAYOUT ON DISK: -+ * -+ * The journal is written to a ringbuffer of buckets (which is kept in the -+ * superblock); the individual buckets are not necessarily contiguous on disk -+ * which means that journal entries are not allowed to span buckets, but also -+ * that we can resize the journal at runtime if desired (unimplemented). -+ * -+ * The journal buckets exist in the same pool as all the other buckets that are -+ * managed by the allocator and garbage collection - garbage collection marks -+ * the journal buckets as metadata buckets. -+ * -+ * OPEN/DIRTY JOURNAL ENTRIES: -+ * -+ * Open/dirty journal entries are journal entries that contain b-tree updates -+ * that have not yet been written out to the b-tree on disk. We have to track -+ * which journal entries are dirty, and we also have to avoid wrapping around -+ * the journal and overwriting old but still dirty journal entries with new -+ * journal entries. -+ * -+ * On disk, this is represented with the "last_seq" field of struct jset; -+ * last_seq is the first sequence number that journal replay has to replay. 
-+ * -+ * To avoid overwriting dirty journal entries on disk, we keep a mapping (in -+ * journal_device->seq) of for each journal bucket, the highest sequence number -+ * any journal entry it contains. Then, by comparing that against last_seq we -+ * can determine whether that journal bucket contains dirty journal entries or -+ * not. -+ * -+ * To track which journal entries are dirty, we maintain a fifo of refcounts -+ * (where each entry corresponds to a specific sequence number) - when a ref -+ * goes to 0, that journal entry is no longer dirty. -+ * -+ * Journalling of index updates is done at the same time as the b-tree itself is -+ * being modified (see btree_insert_key()); when we add the key to the journal -+ * the pending b-tree write takes a ref on the journal entry the key was added -+ * to. If a pending b-tree write would need to take refs on multiple dirty -+ * journal entries, it only keeps the ref on the oldest one (since a newer -+ * journal entry will still be replayed if an older entry was dirty). -+ * -+ * JOURNAL FILLING UP: -+ * -+ * There are two ways the journal could fill up; either we could run out of -+ * space to write to, or we could have too many open journal entries and run out -+ * of room in the fifo of refcounts. Since those refcounts are decremented -+ * without any locking we can't safely resize that fifo, so we handle it the -+ * same way. -+ * -+ * If the journal fills up, we start flushing dirty btree nodes until we can -+ * allocate space for a journal write again - preferentially flushing btree -+ * nodes that are pinning the oldest journal entries first. 
-+ */ -+ -+#include -+ -+#include "journal_types.h" -+ -+struct bch_fs; -+ -+static inline void journal_wake(struct journal *j) -+{ -+ wake_up(&j->wait); -+ closure_wake_up(&j->async_wait); -+ closure_wake_up(&j->preres_wait); -+} -+ -+static inline struct journal_buf *journal_cur_buf(struct journal *j) -+{ -+ return j->buf + j->reservations.idx; -+} -+ -+static inline struct journal_buf *journal_prev_buf(struct journal *j) -+{ -+ return j->buf + !j->reservations.idx; -+} -+ -+/* Sequence number of oldest dirty journal entry */ -+ -+static inline u64 journal_last_seq(struct journal *j) -+{ -+ return j->pin.front; -+} -+ -+static inline u64 journal_cur_seq(struct journal *j) -+{ -+ BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); -+ -+ return j->pin.back - 1; -+} -+ -+u64 bch2_inode_journal_seq(struct journal *, u64); -+ -+static inline int journal_state_count(union journal_res_state s, int idx) -+{ -+ return idx == 0 ? s.buf0_count : s.buf1_count; -+} -+ -+static inline void journal_state_inc(union journal_res_state *s) -+{ -+ s->buf0_count += s->idx == 0; -+ s->buf1_count += s->idx == 1; -+} -+ -+static inline void bch2_journal_set_has_inode(struct journal *j, -+ struct journal_res *res, -+ u64 inum) -+{ -+ struct journal_buf *buf = &j->buf[res->idx]; -+ unsigned long bit = hash_64(inum, ilog2(sizeof(buf->has_inode) * 8)); -+ -+ /* avoid atomic op if possible */ -+ if (unlikely(!test_bit(bit, buf->has_inode))) -+ set_bit(bit, buf->has_inode); -+} -+ -+/* -+ * Amount of space that will be taken up by some keys in the journal (i.e. 
-+ * including the jset header) -+ */ -+static inline unsigned jset_u64s(unsigned u64s) -+{ -+ return u64s + sizeof(struct jset_entry) / sizeof(u64); -+} -+ -+static inline int journal_entry_overhead(struct journal *j) -+{ -+ return sizeof(struct jset) / sizeof(u64) + j->entry_u64s_reserved; -+} -+ -+static inline struct jset_entry * -+bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s) -+{ -+ struct jset *jset = buf->data; -+ struct jset_entry *entry = vstruct_idx(jset, le32_to_cpu(jset->u64s)); -+ -+ memset(entry, 0, sizeof(*entry)); -+ entry->u64s = cpu_to_le16(u64s); -+ -+ le32_add_cpu(&jset->u64s, jset_u64s(u64s)); -+ -+ return entry; -+} -+ -+static inline struct jset_entry * -+journal_res_entry(struct journal *j, struct journal_res *res) -+{ -+ return vstruct_idx(j->buf[res->idx].data, res->offset); -+} -+ -+static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type, -+ enum btree_id id, unsigned level, -+ const void *data, unsigned u64s) -+{ -+ memset(entry, 0, sizeof(*entry)); -+ entry->u64s = cpu_to_le16(u64s); -+ entry->type = type; -+ entry->btree_id = id; -+ entry->level = level; -+ memcpy_u64s_small(entry->_data, data, u64s); -+ -+ return jset_u64s(u64s); -+} -+ -+static inline void bch2_journal_add_entry(struct journal *j, struct journal_res *res, -+ unsigned type, enum btree_id id, -+ unsigned level, -+ const void *data, unsigned u64s) -+{ -+ unsigned actual = journal_entry_set(journal_res_entry(j, res), -+ type, id, level, data, u64s); -+ -+ EBUG_ON(!res->ref); -+ EBUG_ON(actual > res->u64s); -+ -+ res->offset += actual; -+ res->u64s -= actual; -+} -+ -+static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res, -+ enum btree_id id, const struct bkey_i *k) -+{ -+ bch2_journal_add_entry(j, res, BCH_JSET_ENTRY_btree_keys, -+ id, 0, k, k->k.u64s); -+} -+ -+static inline bool journal_entry_empty(struct jset *j) -+{ -+ struct jset_entry *i; -+ -+ if (j->seq != j->last_seq) -+ return 
false; -+ -+ vstruct_for_each(j, i) -+ if (i->type == BCH_JSET_ENTRY_btree_keys && i->u64s) -+ return false; -+ return true; -+} -+ -+void __bch2_journal_buf_put(struct journal *, bool); -+ -+static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, -+ bool need_write_just_set) -+{ -+ union journal_res_state s; -+ -+ s.v = atomic64_sub_return(((union journal_res_state) { -+ .buf0_count = idx == 0, -+ .buf1_count = idx == 1, -+ }).v, &j->reservations.counter); -+ if (!journal_state_count(s, idx)) { -+ EBUG_ON(s.idx == idx || !s.prev_buf_unwritten); -+ __bch2_journal_buf_put(j, need_write_just_set); -+ } -+} -+ -+/* -+ * This function releases the journal write structure so other threads can -+ * then proceed to add their keys as well. -+ */ -+static inline void bch2_journal_res_put(struct journal *j, -+ struct journal_res *res) -+{ -+ if (!res->ref) -+ return; -+ -+ lock_release(&j->res_map, _THIS_IP_); -+ -+ while (res->u64s) -+ bch2_journal_add_entry(j, res, -+ BCH_JSET_ENTRY_btree_keys, -+ 0, 0, NULL, 0); -+ -+ bch2_journal_buf_put(j, res->idx, false); -+ -+ res->ref = 0; -+} -+ -+int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, -+ unsigned); -+ -+#define JOURNAL_RES_GET_NONBLOCK (1 << 0) -+#define JOURNAL_RES_GET_CHECK (1 << 1) -+#define JOURNAL_RES_GET_RESERVED (1 << 2) -+#define JOURNAL_RES_GET_RECLAIM (1 << 3) -+ -+static inline int journal_res_get_fast(struct journal *j, -+ struct journal_res *res, -+ unsigned flags) -+{ -+ union journal_res_state old, new; -+ u64 v = atomic64_read(&j->reservations.counter); -+ -+ do { -+ old.v = new.v = v; -+ -+ /* -+ * Check if there is still room in the current journal -+ * entry: -+ */ -+ if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s) -+ return 0; -+ -+ EBUG_ON(!journal_state_count(new, new.idx)); -+ -+ if (!(flags & JOURNAL_RES_GET_RESERVED) && -+ !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) -+ return 0; -+ -+ if (flags & JOURNAL_RES_GET_CHECK) -+ return 1; -+ -+ 
new.cur_entry_offset += res->u64s; -+ journal_state_inc(&new); -+ } while ((v = atomic64_cmpxchg(&j->reservations.counter, -+ old.v, new.v)) != old.v); -+ -+ res->ref = true; -+ res->idx = old.idx; -+ res->offset = old.cur_entry_offset; -+ res->seq = le64_to_cpu(j->buf[old.idx].data->seq); -+ return 1; -+} -+ -+static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res, -+ unsigned u64s, unsigned flags) -+{ -+ int ret; -+ -+ EBUG_ON(res->ref); -+ EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); -+ -+ res->u64s = u64s; -+ -+ if (journal_res_get_fast(j, res, flags)) -+ goto out; -+ -+ ret = bch2_journal_res_get_slowpath(j, res, flags); -+ if (ret) -+ return ret; -+out: -+ if (!(flags & JOURNAL_RES_GET_CHECK)) { -+ lock_acquire_shared(&j->res_map, 0, -+ (flags & JOURNAL_RES_GET_NONBLOCK) != 0, -+ NULL, _THIS_IP_); -+ EBUG_ON(!res->ref); -+ } -+ return 0; -+} -+ -+/* journal_preres: */ -+ -+static inline bool journal_check_may_get_unreserved(struct journal *j) -+{ -+ union journal_preres_state s = READ_ONCE(j->prereserved); -+ bool ret = s.reserved <= s.remaining && -+ fifo_free(&j->pin) > 8; -+ -+ lockdep_assert_held(&j->lock); -+ -+ if (ret != test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { -+ if (ret) { -+ set_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags); -+ journal_wake(j); -+ } else { -+ clear_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags); -+ } -+ } -+ return ret; -+} -+ -+static inline void bch2_journal_preres_put(struct journal *j, -+ struct journal_preres *res) -+{ -+ union journal_preres_state s = { .reserved = res->u64s }; -+ -+ if (!res->u64s) -+ return; -+ -+ s.v = atomic64_sub_return(s.v, &j->prereserved.counter); -+ res->u64s = 0; -+ closure_wake_up(&j->preres_wait); -+ -+ if (s.reserved <= s.remaining && -+ !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { -+ spin_lock(&j->lock); -+ journal_check_may_get_unreserved(j); -+ spin_unlock(&j->lock); -+ } -+} -+ -+int __bch2_journal_preres_get(struct journal *, -+ struct journal_preres *, 
unsigned, unsigned); -+ -+static inline int bch2_journal_preres_get_fast(struct journal *j, -+ struct journal_preres *res, -+ unsigned new_u64s, -+ unsigned flags) -+{ -+ int d = new_u64s - res->u64s; -+ union journal_preres_state old, new; -+ u64 v = atomic64_read(&j->prereserved.counter); -+ -+ do { -+ old.v = new.v = v; -+ -+ new.reserved += d; -+ -+ /* -+ * If we're being called from the journal reclaim path, we have -+ * to unconditionally give out the pre-reservation, there's -+ * nothing else sensible we can do - otherwise we'd recurse back -+ * into the reclaim path and deadlock: -+ */ -+ -+ if (!(flags & JOURNAL_RES_GET_RECLAIM) && -+ new.reserved > new.remaining) -+ return 0; -+ } while ((v = atomic64_cmpxchg(&j->prereserved.counter, -+ old.v, new.v)) != old.v); -+ -+ res->u64s += d; -+ return 1; -+} -+ -+static inline int bch2_journal_preres_get(struct journal *j, -+ struct journal_preres *res, -+ unsigned new_u64s, -+ unsigned flags) -+{ -+ if (new_u64s <= res->u64s) -+ return 0; -+ -+ if (bch2_journal_preres_get_fast(j, res, new_u64s, flags)) -+ return 0; -+ -+ if (flags & JOURNAL_RES_GET_NONBLOCK) -+ return -EAGAIN; -+ -+ return __bch2_journal_preres_get(j, res, new_u64s, flags); -+} -+ -+/* journal_entry_res: */ -+ -+void bch2_journal_entry_res_resize(struct journal *, -+ struct journal_entry_res *, -+ unsigned); -+ -+u64 bch2_journal_last_unwritten_seq(struct journal *); -+int bch2_journal_open_seq_async(struct journal *, u64, struct closure *); -+ -+void bch2_journal_wait_on_seq(struct journal *, u64, struct closure *); -+void bch2_journal_flush_seq_async(struct journal *, u64, struct closure *); -+void bch2_journal_flush_async(struct journal *, struct closure *); -+void bch2_journal_meta_async(struct journal *, struct closure *); -+ -+int bch2_journal_flush_seq(struct journal *, u64); -+int bch2_journal_flush(struct journal *); -+int bch2_journal_meta(struct journal *); -+ -+void bch2_journal_halt(struct journal *); -+ -+static inline int 
bch2_journal_error(struct journal *j) -+{ -+ return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL -+ ? -EIO : 0; -+} -+ -+struct bch_dev; -+ -+static inline bool journal_flushes_device(struct bch_dev *ca) -+{ -+ return true; -+} -+ -+static inline void bch2_journal_set_replay_done(struct journal *j) -+{ -+ BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); -+ set_bit(JOURNAL_REPLAY_DONE, &j->flags); -+} -+ -+void bch2_journal_unblock(struct journal *); -+void bch2_journal_block(struct journal *); -+ -+ssize_t bch2_journal_print_debug(struct journal *, char *); -+ssize_t bch2_journal_print_pins(struct journal *, char *); -+ -+int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, -+ unsigned nr); -+int bch2_dev_journal_alloc(struct bch_dev *); -+ -+void bch2_dev_journal_stop(struct journal *, struct bch_dev *); -+ -+void bch2_fs_journal_stop(struct journal *); -+int bch2_fs_journal_start(struct journal *, u64, struct list_head *); -+ -+void bch2_dev_journal_exit(struct bch_dev *); -+int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *); -+void bch2_fs_journal_exit(struct journal *); -+int bch2_fs_journal_init(struct journal *); -+ -+#endif /* _BCACHEFS_JOURNAL_H */ -diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c -new file mode 100644 -index 000000000000..c298c2b7721d ---- /dev/null -+++ b/fs/bcachefs/journal_io.c -@@ -0,0 +1,1150 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "alloc_foreground.h" -+#include "btree_io.h" -+#include "btree_update_interior.h" -+#include "buckets.h" -+#include "checksum.h" -+#include "error.h" -+#include "journal.h" -+#include "journal_io.h" -+#include "journal_reclaim.h" -+#include "replicas.h" -+ -+#include -+ -+struct journal_list { -+ struct closure cl; -+ struct mutex lock; -+ struct list_head *head; -+ int ret; -+}; -+ -+#define JOURNAL_ENTRY_ADD_OK 0 -+#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5 -+ -+/* -+ * Given a journal entry we just read, add it to the 
list of journal entries to -+ * be replayed: -+ */ -+static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, -+ struct journal_list *jlist, struct jset *j) -+{ -+ struct journal_replay *i, *pos; -+ struct list_head *where; -+ size_t bytes = vstruct_bytes(j); -+ __le64 last_seq; -+ int ret; -+ -+ last_seq = !list_empty(jlist->head) -+ ? list_last_entry(jlist->head, struct journal_replay, -+ list)->j.last_seq -+ : 0; -+ -+ if (!c->opts.read_entire_journal) { -+ /* Is this entry older than the range we need? */ -+ if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) { -+ ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE; -+ goto out; -+ } -+ -+ /* Drop entries we don't need anymore */ -+ list_for_each_entry_safe(i, pos, jlist->head, list) { -+ if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq)) -+ break; -+ list_del(&i->list); -+ kvpfree(i, offsetof(struct journal_replay, j) + -+ vstruct_bytes(&i->j)); -+ } -+ } -+ -+ list_for_each_entry_reverse(i, jlist->head, list) { -+ /* Duplicate? */ -+ if (le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) { -+ fsck_err_on(bytes != vstruct_bytes(&i->j) || -+ memcmp(j, &i->j, bytes), c, -+ "found duplicate but non identical journal entries (seq %llu)", -+ le64_to_cpu(j->seq)); -+ goto found; -+ } -+ -+ if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) { -+ where = &i->list; -+ goto add; -+ } -+ } -+ -+ where = jlist->head; -+add: -+ i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); -+ if (!i) { -+ ret = -ENOMEM; -+ goto out; -+ } -+ -+ list_add(&i->list, where); -+ i->devs.nr = 0; -+ memcpy(&i->j, j, bytes); -+found: -+ if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx)) -+ bch2_dev_list_add_dev(&i->devs, ca->dev_idx); -+ else -+ fsck_err_on(1, c, "duplicate journal entries on same device"); -+ ret = JOURNAL_ENTRY_ADD_OK; -+out: -+fsck_err: -+ return ret; -+} -+ -+static struct nonce journal_nonce(const struct jset *jset) -+{ -+ return (struct nonce) {{ -+ [0] = 0, -+ [1] = ((__le32 *) &jset->seq)[0], -+ [2] = ((__le32 
*) &jset->seq)[1], -+ [3] = BCH_NONCE_JOURNAL, -+ }}; -+} -+ -+/* this fills in a range with empty jset_entries: */ -+static void journal_entry_null_range(void *start, void *end) -+{ -+ struct jset_entry *entry; -+ -+ for (entry = start; entry != end; entry = vstruct_next(entry)) -+ memset(entry, 0, sizeof(*entry)); -+} -+ -+#define JOURNAL_ENTRY_REREAD 5 -+#define JOURNAL_ENTRY_NONE 6 -+#define JOURNAL_ENTRY_BAD 7 -+ -+#define journal_entry_err(c, msg, ...) \ -+({ \ -+ switch (write) { \ -+ case READ: \ -+ mustfix_fsck_err(c, msg, ##__VA_ARGS__); \ -+ break; \ -+ case WRITE: \ -+ bch_err(c, "corrupt metadata before write:\n" \ -+ msg, ##__VA_ARGS__); \ -+ if (bch2_fs_inconsistent(c)) { \ -+ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ -+ goto fsck_err; \ -+ } \ -+ break; \ -+ } \ -+ true; \ -+}) -+ -+#define journal_entry_err_on(cond, c, msg, ...) \ -+ ((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false) -+ -+static int journal_validate_key(struct bch_fs *c, struct jset *jset, -+ struct jset_entry *entry, -+ unsigned level, enum btree_id btree_id, -+ struct bkey_i *k, -+ const char *type, int write) -+{ -+ void *next = vstruct_next(entry); -+ const char *invalid; -+ unsigned version = le32_to_cpu(jset->version); -+ int ret = 0; -+ -+ if (journal_entry_err_on(!k->k.u64s, c, -+ "invalid %s in journal: k->u64s 0", type)) { -+ entry->u64s = cpu_to_le16((u64 *) k - entry->_data); -+ journal_entry_null_range(vstruct_next(entry), next); -+ return 0; -+ } -+ -+ if (journal_entry_err_on((void *) bkey_next(k) > -+ (void *) vstruct_next(entry), c, -+ "invalid %s in journal: extends past end of journal entry", -+ type)) { -+ entry->u64s = cpu_to_le16((u64 *) k - entry->_data); -+ journal_entry_null_range(vstruct_next(entry), next); -+ return 0; -+ } -+ -+ if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c, -+ "invalid %s in journal: bad format %u", -+ type, k->k.format)) { -+ le16_add_cpu(&entry->u64s, -k->k.u64s); -+ memmove(k, bkey_next(k), next - (void *) 
bkey_next(k)); -+ journal_entry_null_range(vstruct_next(entry), next); -+ return 0; -+ } -+ -+ if (!write) -+ bch2_bkey_compat(level, btree_id, version, -+ JSET_BIG_ENDIAN(jset), write, -+ NULL, bkey_to_packed(k)); -+ -+ invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k), -+ __btree_node_type(level, btree_id)); -+ if (invalid) { -+ char buf[160]; -+ -+ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k)); -+ mustfix_fsck_err(c, "invalid %s in journal: %s\n%s", -+ type, invalid, buf); -+ -+ le16_add_cpu(&entry->u64s, -k->k.u64s); -+ memmove(k, bkey_next(k), next - (void *) bkey_next(k)); -+ journal_entry_null_range(vstruct_next(entry), next); -+ return 0; -+ } -+ -+ if (write) -+ bch2_bkey_compat(level, btree_id, version, -+ JSET_BIG_ENDIAN(jset), write, -+ NULL, bkey_to_packed(k)); -+fsck_err: -+ return ret; -+} -+ -+static int journal_entry_validate_btree_keys(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ int write) -+{ -+ struct bkey_i *k; -+ -+ vstruct_for_each(entry, k) { -+ int ret = journal_validate_key(c, jset, entry, -+ entry->level, -+ entry->btree_id, -+ k, "key", write); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+static int journal_entry_validate_btree_root(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ int write) -+{ -+ struct bkey_i *k = entry->start; -+ int ret = 0; -+ -+ if (journal_entry_err_on(!entry->u64s || -+ le16_to_cpu(entry->u64s) != k->k.u64s, c, -+ "invalid btree root journal entry: wrong number of keys")) { -+ void *next = vstruct_next(entry); -+ /* -+ * we don't want to null out this jset_entry, -+ * just the contents, so that later we can tell -+ * we were _supposed_ to have a btree root -+ */ -+ entry->u64s = 0; -+ journal_entry_null_range(vstruct_next(entry), next); -+ return 0; -+ } -+ -+ return journal_validate_key(c, jset, entry, 1, entry->btree_id, k, -+ "btree root", write); -+fsck_err: -+ return ret; -+} -+ -+static int journal_entry_validate_prio_ptrs(struct bch_fs 
*c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ int write) -+{ -+ /* obsolete, don't care: */ -+ return 0; -+} -+ -+static int journal_entry_validate_blacklist(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ int write) -+{ -+ int ret = 0; -+ -+ if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, c, -+ "invalid journal seq blacklist entry: bad size")) { -+ journal_entry_null_range(entry, vstruct_next(entry)); -+ } -+fsck_err: -+ return ret; -+} -+ -+static int journal_entry_validate_blacklist_v2(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ int write) -+{ -+ struct jset_entry_blacklist_v2 *bl_entry; -+ int ret = 0; -+ -+ if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, c, -+ "invalid journal seq blacklist entry: bad size")) { -+ journal_entry_null_range(entry, vstruct_next(entry)); -+ goto out; -+ } -+ -+ bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); -+ -+ if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > -+ le64_to_cpu(bl_entry->end), c, -+ "invalid journal seq blacklist entry: start > end")) { -+ journal_entry_null_range(entry, vstruct_next(entry)); -+ } -+out: -+fsck_err: -+ return ret; -+} -+ -+static int journal_entry_validate_usage(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ int write) -+{ -+ struct jset_entry_usage *u = -+ container_of(entry, struct jset_entry_usage, entry); -+ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); -+ int ret = 0; -+ -+ if (journal_entry_err_on(bytes < sizeof(*u), -+ c, -+ "invalid journal entry usage: bad size")) { -+ journal_entry_null_range(entry, vstruct_next(entry)); -+ return ret; -+ } -+ -+fsck_err: -+ return ret; -+} -+ -+static int journal_entry_validate_data_usage(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ int write) -+{ -+ struct jset_entry_data_usage *u = -+ container_of(entry, struct jset_entry_data_usage, entry); -+ unsigned bytes = 
jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); -+ int ret = 0; -+ -+ if (journal_entry_err_on(bytes < sizeof(*u) || -+ bytes < sizeof(*u) + u->r.nr_devs, -+ c, -+ "invalid journal entry usage: bad size")) { -+ journal_entry_null_range(entry, vstruct_next(entry)); -+ return ret; -+ } -+ -+fsck_err: -+ return ret; -+} -+ -+struct jset_entry_ops { -+ int (*validate)(struct bch_fs *, struct jset *, -+ struct jset_entry *, int); -+}; -+ -+static const struct jset_entry_ops bch2_jset_entry_ops[] = { -+#define x(f, nr) \ -+ [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \ -+ .validate = journal_entry_validate_##f, \ -+ }, -+ BCH_JSET_ENTRY_TYPES() -+#undef x -+}; -+ -+static int journal_entry_validate(struct bch_fs *c, struct jset *jset, -+ struct jset_entry *entry, int write) -+{ -+ return entry->type < BCH_JSET_ENTRY_NR -+ ? bch2_jset_entry_ops[entry->type].validate(c, jset, -+ entry, write) -+ : 0; -+} -+ -+static int jset_validate_entries(struct bch_fs *c, struct jset *jset, -+ int write) -+{ -+ struct jset_entry *entry; -+ int ret = 0; -+ -+ vstruct_for_each(jset, entry) { -+ if (journal_entry_err_on(vstruct_next(entry) > -+ vstruct_last(jset), c, -+ "journal entry extends past end of jset")) { -+ jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); -+ break; -+ } -+ -+ ret = journal_entry_validate(c, jset, entry, write); -+ if (ret) -+ break; -+ } -+fsck_err: -+ return ret; -+} -+ -+static int jset_validate(struct bch_fs *c, -+ struct jset *jset, u64 sector, -+ unsigned bucket_sectors_left, -+ unsigned sectors_read, -+ int write) -+{ -+ size_t bytes = vstruct_bytes(jset); -+ struct bch_csum csum; -+ unsigned version; -+ int ret = 0; -+ -+ if (le64_to_cpu(jset->magic) != jset_magic(c)) -+ return JOURNAL_ENTRY_NONE; -+ -+ version = le32_to_cpu(jset->version); -+ if ((version != BCH_JSET_VERSION_OLD && -+ version < bcachefs_metadata_version_min) || -+ version >= bcachefs_metadata_version_max) { -+ bch_err(c, "unknown journal entry version %u", jset->version); 
-+ return BCH_FSCK_UNKNOWN_VERSION; -+ } -+ -+ if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c, -+ "journal entry too big (%zu bytes), sector %lluu", -+ bytes, sector)) { -+ /* XXX: note we might have missing journal entries */ -+ return JOURNAL_ENTRY_BAD; -+ } -+ -+ if (bytes > sectors_read << 9) -+ return JOURNAL_ENTRY_REREAD; -+ -+ if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c, -+ "journal entry with unknown csum type %llu sector %lluu", -+ JSET_CSUM_TYPE(jset), sector)) -+ return JOURNAL_ENTRY_BAD; -+ -+ csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); -+ if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c, -+ "journal checksum bad, sector %llu", sector)) { -+ /* XXX: retry IO, when we start retrying checksum errors */ -+ /* XXX: note we might have missing journal entries */ -+ return JOURNAL_ENTRY_BAD; -+ } -+ -+ bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), -+ jset->encrypted_start, -+ vstruct_end(jset) - (void *) jset->encrypted_start); -+ -+ if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c, -+ "invalid journal entry: last_seq > seq")) -+ jset->last_seq = jset->seq; -+ -+ return 0; -+fsck_err: -+ return ret; -+} -+ -+struct journal_read_buf { -+ void *data; -+ size_t size; -+}; -+ -+static int journal_read_buf_realloc(struct journal_read_buf *b, -+ size_t new_size) -+{ -+ void *n; -+ -+ /* the bios are sized for this many pages, max: */ -+ if (new_size > JOURNAL_ENTRY_SIZE_MAX) -+ return -ENOMEM; -+ -+ new_size = roundup_pow_of_two(new_size); -+ n = kvpmalloc(new_size, GFP_KERNEL); -+ if (!n) -+ return -ENOMEM; -+ -+ kvpfree(b->data, b->size); -+ b->data = n; -+ b->size = new_size; -+ return 0; -+} -+ -+static int journal_read_bucket(struct bch_dev *ca, -+ struct journal_read_buf *buf, -+ struct journal_list *jlist, -+ unsigned bucket) -+{ -+ struct bch_fs *c = ca->fs; -+ struct journal_device *ja = &ca->journal; -+ struct jset *j = NULL; -+ 
unsigned sectors, sectors_read = 0; -+ u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), -+ end = offset + ca->mi.bucket_size; -+ bool saw_bad = false; -+ int ret = 0; -+ -+ pr_debug("reading %u", bucket); -+ -+ while (offset < end) { -+ if (!sectors_read) { -+ struct bio *bio; -+reread: -+ sectors_read = min_t(unsigned, -+ end - offset, buf->size >> 9); -+ -+ bio = bio_kmalloc(GFP_KERNEL, -+ buf_pages(buf->data, -+ sectors_read << 9)); -+ bio_set_dev(bio, ca->disk_sb.bdev); -+ bio->bi_iter.bi_sector = offset; -+ bio_set_op_attrs(bio, REQ_OP_READ, 0); -+ bch2_bio_map(bio, buf->data, sectors_read << 9); -+ -+ ret = submit_bio_wait(bio); -+ bio_put(bio); -+ -+ if (bch2_dev_io_err_on(ret, ca, -+ "journal read from sector %llu", -+ offset) || -+ bch2_meta_read_fault("journal")) -+ return -EIO; -+ -+ j = buf->data; -+ } -+ -+ ret = jset_validate(c, j, offset, -+ end - offset, sectors_read, -+ READ); -+ switch (ret) { -+ case BCH_FSCK_OK: -+ break; -+ case JOURNAL_ENTRY_REREAD: -+ if (vstruct_bytes(j) > buf->size) { -+ ret = journal_read_buf_realloc(buf, -+ vstruct_bytes(j)); -+ if (ret) -+ return ret; -+ } -+ goto reread; -+ case JOURNAL_ENTRY_NONE: -+ if (!saw_bad) -+ return 0; -+ sectors = c->opts.block_size; -+ goto next_block; -+ case JOURNAL_ENTRY_BAD: -+ saw_bad = true; -+ sectors = c->opts.block_size; -+ goto next_block; -+ default: -+ return ret; -+ } -+ -+ /* -+ * This happens sometimes if we don't have discards on - -+ * when we've partially overwritten a bucket with new -+ * journal entries. 
We don't need the rest of the -+ * bucket: -+ */ -+ if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) -+ return 0; -+ -+ ja->bucket_seq[bucket] = le64_to_cpu(j->seq); -+ -+ mutex_lock(&jlist->lock); -+ ret = journal_entry_add(c, ca, jlist, j); -+ mutex_unlock(&jlist->lock); -+ -+ switch (ret) { -+ case JOURNAL_ENTRY_ADD_OK: -+ break; -+ case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: -+ break; -+ default: -+ return ret; -+ } -+ -+ sectors = vstruct_sectors(j, c->block_bits); -+next_block: -+ pr_debug("next"); -+ offset += sectors; -+ sectors_read -= sectors; -+ j = ((void *) j) + (sectors << 9); -+ } -+ -+ return 0; -+} -+ -+static void bch2_journal_read_device(struct closure *cl) -+{ -+ struct journal_device *ja = -+ container_of(cl, struct journal_device, read); -+ struct bch_dev *ca = container_of(ja, struct bch_dev, journal); -+ struct journal_list *jlist = -+ container_of(cl->parent, struct journal_list, cl); -+ struct journal_read_buf buf = { NULL, 0 }; -+ u64 min_seq = U64_MAX; -+ unsigned i; -+ int ret; -+ -+ if (!ja->nr) -+ goto out; -+ -+ ret = journal_read_buf_realloc(&buf, PAGE_SIZE); -+ if (ret) -+ goto err; -+ -+ pr_debug("%u journal buckets", ja->nr); -+ -+ for (i = 0; i < ja->nr; i++) { -+ ret = journal_read_bucket(ca, &buf, jlist, i); -+ if (ret) -+ goto err; -+ } -+ -+ /* Find the journal bucket with the highest sequence number: */ -+ for (i = 0; i < ja->nr; i++) { -+ if (ja->bucket_seq[i] > ja->bucket_seq[ja->cur_idx]) -+ ja->cur_idx = i; -+ -+ min_seq = min(ja->bucket_seq[i], min_seq); -+ } -+ -+ /* -+ * If there's duplicate journal entries in multiple buckets (which -+ * definitely isn't supposed to happen, but...) 
- make sure to start -+ * cur_idx at the last of those buckets, so we don't deadlock trying to -+ * allocate -+ */ -+ while (ja->bucket_seq[ja->cur_idx] > min_seq && -+ ja->bucket_seq[ja->cur_idx] > -+ ja->bucket_seq[(ja->cur_idx + 1) % ja->nr]) -+ ja->cur_idx = (ja->cur_idx + 1) % ja->nr; -+ -+ ja->sectors_free = 0; -+ -+ /* -+ * Set dirty_idx to indicate the entire journal is full and needs to be -+ * reclaimed - journal reclaim will immediately reclaim whatever isn't -+ * pinned when it first runs: -+ */ -+ ja->discard_idx = ja->dirty_idx_ondisk = -+ ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; -+out: -+ kvpfree(buf.data, buf.size); -+ percpu_ref_put(&ca->io_ref); -+ closure_return(cl); -+ return; -+err: -+ mutex_lock(&jlist->lock); -+ jlist->ret = ret; -+ mutex_unlock(&jlist->lock); -+ goto out; -+} -+ -+int bch2_journal_read(struct bch_fs *c, struct list_head *list) -+{ -+ struct journal_list jlist; -+ struct journal_replay *i; -+ struct bch_dev *ca; -+ unsigned iter; -+ size_t keys = 0, entries = 0; -+ bool degraded = false; -+ int ret = 0; -+ -+ closure_init_stack(&jlist.cl); -+ mutex_init(&jlist.lock); -+ jlist.head = list; -+ jlist.ret = 0; -+ -+ for_each_member_device(ca, c, iter) { -+ if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && -+ !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_JOURNAL))) -+ continue; -+ -+ if ((ca->mi.state == BCH_MEMBER_STATE_RW || -+ ca->mi.state == BCH_MEMBER_STATE_RO) && -+ percpu_ref_tryget(&ca->io_ref)) -+ closure_call(&ca->journal.read, -+ bch2_journal_read_device, -+ system_unbound_wq, -+ &jlist.cl); -+ else -+ degraded = true; -+ } -+ -+ closure_sync(&jlist.cl); -+ -+ if (jlist.ret) -+ return jlist.ret; -+ -+ list_for_each_entry(i, list, list) { -+ struct jset_entry *entry; -+ struct bkey_i *k, *_n; -+ struct bch_replicas_padded replicas; -+ char buf[80]; -+ -+ ret = jset_validate_entries(c, &i->j, READ); -+ if (ret) -+ goto fsck_err; -+ -+ /* -+ * If we're mounting in degraded mode - if we didn't read all -+ * the devices 
- this is wrong: -+ */ -+ -+ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, i->devs); -+ -+ if (!degraded && -+ (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || -+ fsck_err_on(!bch2_replicas_marked(c, &replicas.e, false), c, -+ "superblock not marked as containing replicas %s", -+ (bch2_replicas_entry_to_text(&PBUF(buf), -+ &replicas.e), buf)))) { -+ ret = bch2_mark_replicas(c, &replicas.e); -+ if (ret) -+ return ret; -+ } -+ -+ for_each_jset_key(k, _n, entry, &i->j) -+ keys++; -+ entries++; -+ } -+ -+ if (!list_empty(list)) { -+ i = list_last_entry(list, struct journal_replay, list); -+ -+ bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu", -+ keys, entries, le64_to_cpu(i->j.seq)); -+ } -+fsck_err: -+ return ret; -+} -+ -+/* journal write: */ -+ -+static void __journal_write_alloc(struct journal *j, -+ struct journal_buf *w, -+ struct dev_alloc_list *devs_sorted, -+ unsigned sectors, -+ unsigned *replicas, -+ unsigned replicas_want) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct journal_device *ja; -+ struct bch_dev *ca; -+ unsigned i; -+ -+ if (*replicas >= replicas_want) -+ return; -+ -+ for (i = 0; i < devs_sorted->nr; i++) { -+ ca = rcu_dereference(c->devs[devs_sorted->devs[i]]); -+ if (!ca) -+ continue; -+ -+ ja = &ca->journal; -+ -+ /* -+ * Check that we can use this device, and aren't already using -+ * it: -+ */ -+ if (!ca->mi.durability || -+ ca->mi.state != BCH_MEMBER_STATE_RW || -+ !ja->nr || -+ bch2_bkey_has_device(bkey_i_to_s_c(&w->key), -+ ca->dev_idx) || -+ sectors > ja->sectors_free) -+ continue; -+ -+ bch2_dev_stripe_increment(c, ca, &j->wp.stripe); -+ -+ bch2_bkey_append_ptr(&w->key, -+ (struct bch_extent_ptr) { -+ .offset = bucket_to_sector(ca, -+ ja->buckets[ja->cur_idx]) + -+ ca->mi.bucket_size - -+ ja->sectors_free, -+ .dev = ca->dev_idx, -+ }); -+ -+ ja->sectors_free -= sectors; -+ ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); -+ -+ *replicas += ca->mi.durability; -+ -+ if 
(*replicas >= replicas_want) -+ break; -+ } -+} -+ -+/** -+ * journal_next_bucket - move on to the next journal bucket if possible -+ */ -+static int journal_write_alloc(struct journal *j, struct journal_buf *w, -+ unsigned sectors) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct journal_device *ja; -+ struct bch_dev *ca; -+ struct dev_alloc_list devs_sorted; -+ unsigned i, replicas = 0, replicas_want = -+ READ_ONCE(c->opts.metadata_replicas); -+ -+ rcu_read_lock(); -+ -+ devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, -+ &c->rw_devs[BCH_DATA_JOURNAL]); -+ -+ __journal_write_alloc(j, w, &devs_sorted, -+ sectors, &replicas, replicas_want); -+ -+ if (replicas >= replicas_want) -+ goto done; -+ -+ for (i = 0; i < devs_sorted.nr; i++) { -+ ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); -+ if (!ca) -+ continue; -+ -+ ja = &ca->journal; -+ -+ if (sectors > ja->sectors_free && -+ sectors <= ca->mi.bucket_size && -+ bch2_journal_dev_buckets_available(j, ja, -+ journal_space_discarded)) { -+ ja->cur_idx = (ja->cur_idx + 1) % ja->nr; -+ ja->sectors_free = ca->mi.bucket_size; -+ -+ /* -+ * ja->bucket_seq[ja->cur_idx] must always have -+ * something sensible: -+ */ -+ ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); -+ } -+ } -+ -+ __journal_write_alloc(j, w, &devs_sorted, -+ sectors, &replicas, replicas_want); -+done: -+ rcu_read_unlock(); -+ -+ return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS; -+} -+ -+static void journal_write_compact(struct jset *jset) -+{ -+ struct jset_entry *i, *next, *prev = NULL; -+ -+ /* -+ * Simple compaction, dropping empty jset_entries (from journal -+ * reservations that weren't fully used) and merging jset_entries that -+ * can be. 
-+ * -+ * If we wanted to be really fancy here, we could sort all the keys in -+ * the jset and drop keys that were overwritten - probably not worth it: -+ */ -+ vstruct_for_each_safe(jset, i, next) { -+ unsigned u64s = le16_to_cpu(i->u64s); -+ -+ /* Empty entry: */ -+ if (!u64s) -+ continue; -+ -+ /* Can we merge with previous entry? */ -+ if (prev && -+ i->btree_id == prev->btree_id && -+ i->level == prev->level && -+ i->type == prev->type && -+ i->type == BCH_JSET_ENTRY_btree_keys && -+ le16_to_cpu(prev->u64s) + u64s <= U16_MAX) { -+ memmove_u64s_down(vstruct_next(prev), -+ i->_data, -+ u64s); -+ le16_add_cpu(&prev->u64s, u64s); -+ continue; -+ } -+ -+ /* Couldn't merge, move i into new position (after prev): */ -+ prev = prev ? vstruct_next(prev) : jset->start; -+ if (i != prev) -+ memmove_u64s_down(prev, i, jset_u64s(u64s)); -+ } -+ -+ prev = prev ? vstruct_next(prev) : jset->start; -+ jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); -+} -+ -+static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) -+{ -+ /* we aren't holding j->lock: */ -+ unsigned new_size = READ_ONCE(j->buf_size_want); -+ void *new_buf; -+ -+ if (buf->buf_size >= new_size) -+ return; -+ -+ new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN); -+ if (!new_buf) -+ return; -+ -+ memcpy(new_buf, buf->data, buf->buf_size); -+ kvpfree(buf->data, buf->buf_size); -+ buf->data = new_buf; -+ buf->buf_size = new_size; -+} -+ -+static void journal_write_done(struct closure *cl) -+{ -+ struct journal *j = container_of(cl, struct journal, io); -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct journal_buf *w = journal_prev_buf(j); -+ struct bch_devs_list devs = -+ bch2_bkey_devs(bkey_i_to_s_c(&w->key)); -+ struct bch_replicas_padded replicas; -+ u64 seq = le64_to_cpu(w->data->seq); -+ u64 last_seq = le64_to_cpu(w->data->last_seq); -+ -+ bch2_time_stats_update(j->write_time, j->write_start_time); -+ -+ if (!devs.nr) { -+ bch_err(c, "unable to write journal to 
sufficient devices"); -+ goto err; -+ } -+ -+ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, devs); -+ -+ if (bch2_mark_replicas(c, &replicas.e)) -+ goto err; -+ -+ spin_lock(&j->lock); -+ if (seq >= j->pin.front) -+ journal_seq_pin(j, seq)->devs = devs; -+ -+ j->seq_ondisk = seq; -+ j->last_seq_ondisk = last_seq; -+ bch2_journal_space_available(j); -+ -+ /* -+ * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard -+ * more buckets: -+ * -+ * Must come before signaling write completion, for -+ * bch2_fs_journal_stop(): -+ */ -+ mod_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0); -+out: -+ /* also must come before signalling write completion: */ -+ closure_debug_destroy(cl); -+ -+ BUG_ON(!j->reservations.prev_buf_unwritten); -+ atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v, -+ &j->reservations.counter); -+ -+ closure_wake_up(&w->wait); -+ journal_wake(j); -+ -+ if (test_bit(JOURNAL_NEED_WRITE, &j->flags)) -+ mod_delayed_work(system_freezable_wq, &j->write_work, 0); -+ spin_unlock(&j->lock); -+ return; -+err: -+ bch2_fatal_error(c); -+ spin_lock(&j->lock); -+ goto out; -+} -+ -+static void journal_write_endio(struct bio *bio) -+{ -+ struct bch_dev *ca = bio->bi_private; -+ struct journal *j = &ca->fs->journal; -+ -+ if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write: %s", -+ blk_status_to_str(bio->bi_status)) || -+ bch2_meta_write_fault("journal")) { -+ struct journal_buf *w = journal_prev_buf(j); -+ unsigned long flags; -+ -+ spin_lock_irqsave(&j->err_lock, flags); -+ bch2_bkey_drop_device(bkey_i_to_s(&w->key), ca->dev_idx); -+ spin_unlock_irqrestore(&j->err_lock, flags); -+ } -+ -+ closure_put(&j->io); -+ percpu_ref_put(&ca->io_ref); -+} -+ -+void bch2_journal_write(struct closure *cl) -+{ -+ struct journal *j = container_of(cl, struct journal, io); -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct bch_dev *ca; -+ struct journal_buf *w = journal_prev_buf(j); -+ struct 
jset_entry *start, *end; -+ struct jset *jset; -+ struct bio *bio; -+ struct bch_extent_ptr *ptr; -+ bool validate_before_checksum = false; -+ unsigned i, sectors, bytes, u64s; -+ int ret; -+ -+ bch2_journal_pin_put(j, le64_to_cpu(w->data->seq)); -+ -+ journal_buf_realloc(j, w); -+ jset = w->data; -+ -+ j->write_start_time = local_clock(); -+ -+ /* -+ * New btree roots are set by journalling them; when the journal entry -+ * gets written we have to propagate them to c->btree_roots -+ * -+ * But, every journal entry we write has to contain all the btree roots -+ * (at least for now); so after we copy btree roots to c->btree_roots we -+ * have to get any missing btree roots and add them to this journal -+ * entry: -+ */ -+ -+ bch2_journal_entries_to_btree_roots(c, jset); -+ -+ start = end = vstruct_last(jset); -+ -+ end = bch2_btree_roots_to_journal_entries(c, jset->start, end); -+ -+ end = bch2_journal_super_entries_add_common(c, end, -+ le64_to_cpu(jset->seq)); -+ u64s = (u64 *) end - (u64 *) start; -+ BUG_ON(u64s > j->entry_u64s_reserved); -+ -+ le32_add_cpu(&jset->u64s, u64s); -+ BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors); -+ -+ journal_write_compact(jset); -+ -+ jset->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); -+ jset->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); -+ jset->magic = cpu_to_le64(jset_magic(c)); -+ -+ jset->version = c->sb.version < bcachefs_metadata_version_new_versioning -+ ? 
cpu_to_le32(BCH_JSET_VERSION_OLD) -+ : cpu_to_le32(c->sb.version); -+ -+ SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); -+ SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); -+ -+ if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) -+ validate_before_checksum = true; -+ -+ if (le32_to_cpu(jset->version) < bcachefs_metadata_version_max) -+ validate_before_checksum = true; -+ -+ if (validate_before_checksum && -+ jset_validate_entries(c, jset, WRITE)) -+ goto err; -+ -+ bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), -+ jset->encrypted_start, -+ vstruct_end(jset) - (void *) jset->encrypted_start); -+ -+ jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), -+ journal_nonce(jset), jset); -+ -+ if (!validate_before_checksum && -+ jset_validate_entries(c, jset, WRITE)) -+ goto err; -+ -+ sectors = vstruct_sectors(jset, c->block_bits); -+ BUG_ON(sectors > w->sectors); -+ -+ bytes = vstruct_bytes(jset); -+ memset((void *) jset + bytes, 0, (sectors << 9) - bytes); -+ -+retry_alloc: -+ spin_lock(&j->lock); -+ ret = journal_write_alloc(j, w, sectors); -+ -+ if (ret && j->can_discard) { -+ spin_unlock(&j->lock); -+ bch2_journal_do_discards(j); -+ goto retry_alloc; -+ } -+ -+ /* -+ * write is allocated, no longer need to account for it in -+ * bch2_journal_space_available(): -+ */ -+ w->sectors = 0; -+ -+ /* -+ * journal entry has been compacted and allocated, recalculate space -+ * available: -+ */ -+ bch2_journal_space_available(j); -+ spin_unlock(&j->lock); -+ -+ if (ret) { -+ bch_err(c, "Unable to allocate journal write"); -+ bch2_fatal_error(c); -+ continue_at(cl, journal_write_done, system_highpri_wq); -+ return; -+ } -+ -+ /* -+ * XXX: we really should just disable the entire journal in nochanges -+ * mode -+ */ -+ if (c->opts.nochanges) -+ goto no_io; -+ -+ extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { -+ ca = bch_dev_bkey_exists(c, ptr->dev); -+ if (!percpu_ref_tryget(&ca->io_ref)) { -+ /* XXX: fix this */ -+ bch_err(c, "missing device for journal 
write\n"); -+ continue; -+ } -+ -+ this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_JOURNAL], -+ sectors); -+ -+ bio = ca->journal.bio; -+ bio_reset(bio); -+ bio_set_dev(bio, ca->disk_sb.bdev); -+ bio->bi_iter.bi_sector = ptr->offset; -+ bio->bi_end_io = journal_write_endio; -+ bio->bi_private = ca; -+ bio_set_op_attrs(bio, REQ_OP_WRITE, -+ REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA); -+ bch2_bio_map(bio, jset, sectors << 9); -+ -+ trace_journal_write(bio); -+ closure_bio_submit(bio, cl); -+ -+ ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq); -+ } -+ -+ for_each_rw_member(ca, c, i) -+ if (journal_flushes_device(ca) && -+ !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) { -+ percpu_ref_get(&ca->io_ref); -+ -+ bio = ca->journal.bio; -+ bio_reset(bio); -+ bio_set_dev(bio, ca->disk_sb.bdev); -+ bio->bi_opf = REQ_OP_FLUSH; -+ bio->bi_end_io = journal_write_endio; -+ bio->bi_private = ca; -+ closure_bio_submit(bio, cl); -+ } -+ -+no_io: -+ bch2_bucket_seq_cleanup(c); -+ -+ continue_at(cl, journal_write_done, system_highpri_wq); -+ return; -+err: -+ bch2_inconsistent_error(c); -+ continue_at(cl, journal_write_done, system_highpri_wq); -+} -diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h -new file mode 100644 -index 000000000000..72e575f360af ---- /dev/null -+++ b/fs/bcachefs/journal_io.h -@@ -0,0 +1,42 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_JOURNAL_IO_H -+#define _BCACHEFS_JOURNAL_IO_H -+ -+/* -+ * Only used for holding the journal entries we read in btree_journal_read() -+ * during cache_registration -+ */ -+struct journal_replay { -+ struct list_head list; -+ struct bch_devs_list devs; -+ /* must be last: */ -+ struct jset j; -+}; -+ -+static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, -+ struct jset_entry *entry, unsigned type) -+{ -+ while (entry < vstruct_last(jset)) { -+ if (entry->type == type) -+ return entry; -+ -+ entry = vstruct_next(entry); -+ } -+ -+ return NULL; -+} -+ 
-+#define for_each_jset_entry_type(entry, jset, type) \ -+ for (entry = (jset)->start; \ -+ (entry = __jset_entry_type_next(jset, entry, type)); \ -+ entry = vstruct_next(entry)) -+ -+#define for_each_jset_key(k, _n, entry, jset) \ -+ for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \ -+ vstruct_for_each_safe(entry, k, _n) -+ -+int bch2_journal_read(struct bch_fs *, struct list_head *); -+ -+void bch2_journal_write(struct closure *); -+ -+#endif /* _BCACHEFS_JOURNAL_IO_H */ -diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c -new file mode 100644 -index 000000000000..4811ab9f879e ---- /dev/null -+++ b/fs/bcachefs/journal_reclaim.c -@@ -0,0 +1,644 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "journal.h" -+#include "journal_io.h" -+#include "journal_reclaim.h" -+#include "replicas.h" -+#include "super.h" -+ -+/* Free space calculations: */ -+ -+static unsigned journal_space_from(struct journal_device *ja, -+ enum journal_space_from from) -+{ -+ switch (from) { -+ case journal_space_discarded: -+ return ja->discard_idx; -+ case journal_space_clean_ondisk: -+ return ja->dirty_idx_ondisk; -+ case journal_space_clean: -+ return ja->dirty_idx; -+ default: -+ BUG(); -+ } -+} -+ -+unsigned bch2_journal_dev_buckets_available(struct journal *j, -+ struct journal_device *ja, -+ enum journal_space_from from) -+{ -+ unsigned available = (journal_space_from(ja, from) - -+ ja->cur_idx - 1 + ja->nr) % ja->nr; -+ -+ /* -+ * Don't use the last bucket unless writing the new last_seq -+ * will make another bucket available: -+ */ -+ if (available && ja->dirty_idx_ondisk == ja->dirty_idx) -+ --available; -+ -+ return available; -+} -+ -+static void journal_set_remaining(struct journal *j, unsigned u64s_remaining) -+{ -+ union journal_preres_state old, new; -+ u64 v = atomic64_read(&j->prereserved.counter); -+ -+ do { -+ old.v = new.v = v; -+ new.remaining = u64s_remaining; -+ } while ((v = 
atomic64_cmpxchg(&j->prereserved.counter, -+ old.v, new.v)) != old.v); -+} -+ -+static struct journal_space { -+ unsigned next_entry; -+ unsigned remaining; -+} __journal_space_available(struct journal *j, unsigned nr_devs_want, -+ enum journal_space_from from) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct bch_dev *ca; -+ unsigned sectors_next_entry = UINT_MAX; -+ unsigned sectors_total = UINT_MAX; -+ unsigned i, nr_devs = 0; -+ unsigned unwritten_sectors = j->reservations.prev_buf_unwritten -+ ? journal_prev_buf(j)->sectors -+ : 0; -+ -+ rcu_read_lock(); -+ for_each_member_device_rcu(ca, c, i, -+ &c->rw_devs[BCH_DATA_JOURNAL]) { -+ struct journal_device *ja = &ca->journal; -+ unsigned buckets_this_device, sectors_this_device; -+ -+ if (!ja->nr) -+ continue; -+ -+ buckets_this_device = bch2_journal_dev_buckets_available(j, ja, from); -+ sectors_this_device = ja->sectors_free; -+ -+ /* -+ * We that we don't allocate the space for a journal entry -+ * until we write it out - thus, account for it here: -+ */ -+ if (unwritten_sectors >= sectors_this_device) { -+ if (!buckets_this_device) -+ continue; -+ -+ buckets_this_device--; -+ sectors_this_device = ca->mi.bucket_size; -+ } -+ -+ sectors_this_device -= unwritten_sectors; -+ -+ if (sectors_this_device < ca->mi.bucket_size && -+ buckets_this_device) { -+ buckets_this_device--; -+ sectors_this_device = ca->mi.bucket_size; -+ } -+ -+ if (!sectors_this_device) -+ continue; -+ -+ sectors_next_entry = min(sectors_next_entry, -+ sectors_this_device); -+ -+ sectors_total = min(sectors_total, -+ buckets_this_device * ca->mi.bucket_size + -+ sectors_this_device); -+ -+ nr_devs++; -+ } -+ rcu_read_unlock(); -+ -+ if (nr_devs < nr_devs_want) -+ return (struct journal_space) { 0, 0 }; -+ -+ return (struct journal_space) { -+ .next_entry = sectors_next_entry, -+ .remaining = max_t(int, 0, sectors_total - sectors_next_entry), -+ }; -+} -+ -+void bch2_journal_space_available(struct journal *j) -+{ -+ 
struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct bch_dev *ca; -+ struct journal_space discarded, clean_ondisk, clean; -+ unsigned overhead, u64s_remaining = 0; -+ unsigned max_entry_size = min(j->buf[0].buf_size >> 9, -+ j->buf[1].buf_size >> 9); -+ unsigned i, nr_online = 0, nr_devs_want; -+ bool can_discard = false; -+ int ret = 0; -+ -+ lockdep_assert_held(&j->lock); -+ -+ rcu_read_lock(); -+ for_each_member_device_rcu(ca, c, i, -+ &c->rw_devs[BCH_DATA_JOURNAL]) { -+ struct journal_device *ja = &ca->journal; -+ -+ if (!ja->nr) -+ continue; -+ -+ /* Advance dirty_idx past buckets whose newest entry is older than journal_last_seq(j): */ while (ja->dirty_idx != ja->cur_idx && -+ ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j)) -+ ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; -+ -+ /* Likewise for dirty_idx_ondisk, measured against j->last_seq_ondisk: */ while (ja->dirty_idx_ondisk != ja->dirty_idx && -+ ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk) -+ ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; -+ -+ if (ja->discard_idx != ja->dirty_idx_ondisk) -+ can_discard = true; -+ -+ max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size); -+ nr_online++; -+ } -+ rcu_read_unlock(); -+ -+ j->can_discard = can_discard; -+ -+ if (nr_online < c->opts.metadata_replicas_required) { -+ ret = -EROFS; -+ goto out; -+ } -+ -+ if (!fifo_free(&j->pin)) { -+ ret = -ENOSPC; -+ goto out; -+ } -+ -+ nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas); -+ -+ discarded = __journal_space_available(j, nr_devs_want, journal_space_discarded); -+ clean_ondisk = __journal_space_available(j, nr_devs_want, journal_space_clean_ondisk); -+ clean = __journal_space_available(j, nr_devs_want, journal_space_clean); -+ -+ if (!discarded.next_entry) -+ ret = -ENOSPC; -+ -+ overhead = DIV_ROUND_UP(clean.remaining, max_entry_size) * -+ journal_entry_overhead(j); -+ /* sectors -> u64s (64 u64s per 512 byte sector): */ u64s_remaining = clean.remaining << 6; -+ u64s_remaining = max_t(int, 0, u64s_remaining - overhead); -+ u64s_remaining /= 4; -+out: -+ /* on the error paths above, discarded is not initialized and is not read here: */ j->cur_entry_sectors = !ret ? 
discarded.next_entry : 0; -+ j->cur_entry_error = ret; -+ journal_set_remaining(j, u64s_remaining); -+ journal_check_may_get_unreserved(j); -+ -+ if (!ret) -+ journal_wake(j); -+} -+ -+/* Discards - last part of journal reclaim: */ -+ -+/* True while ja->discard_idx has not caught up with ja->dirty_idx_ondisk (sampled under j->lock) */ static bool should_discard_bucket(struct journal *j, struct journal_device *ja) -+{ -+ bool ret; -+ -+ spin_lock(&j->lock); -+ ret = ja->discard_idx != ja->dirty_idx_ondisk; -+ spin_unlock(&j->lock); -+ -+ return ret; -+} -+ -+/* -+ * Advance ja->discard_idx as long as it points to buckets that are no longer -+ * dirty, issuing discards if necessary: -+ */ -+void bch2_journal_do_discards(struct journal *j) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct bch_dev *ca; -+ unsigned iter; -+ -+ mutex_lock(&j->discard_lock); -+ -+ for_each_rw_member(ca, c, iter) { -+ struct journal_device *ja = &ca->journal; -+ -+ while (should_discard_bucket(j, ja)) { -+ /* only issue a discard if it is enabled for this member and its queue supports it: */ if (ca->mi.discard && -+ blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) -+ blkdev_issue_discard(ca->disk_sb.bdev, -+ bucket_to_sector(ca, -+ ja->buckets[ja->discard_idx]), -+ ca->mi.bucket_size, GFP_NOIO, 0); -+ -+ spin_lock(&j->lock); -+ ja->discard_idx = (ja->discard_idx + 1) % ja->nr; -+ -+ /* discard_idx moved, so recompute available space; bch2_journal_space_available() asserts j->lock is held: */ bch2_journal_space_available(j); -+ spin_unlock(&j->lock); -+ } -+ } -+ -+ mutex_unlock(&j->discard_lock); -+} -+ -+/* -+ * Journal entry pinning - machinery for holding a reference on a given journal -+ * entry, holding it open to ensure it gets replayed during recovery: -+ */ -+ -+static void bch2_journal_reclaim_fast(struct journal *j) -+{ -+ struct journal_entry_pin_list temp; -+ bool popped = false; -+ -+ lockdep_assert_held(&j->lock); -+ -+ /* -+ * Unpin journal entries whose reference counts reached zero, meaning -+ * all btree nodes got written out -+ */ -+ while (!fifo_empty(&j->pin) && -+ !atomic_read(&fifo_peek_front(&j->pin).count)) { -+ BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list)); -+ BUG_ON(!fifo_pop(&j->pin, temp)); -+ popped = true; -+ } -+ 
-+ if (popped) -+ bch2_journal_space_available(j); -+} -+ -+/* Drop one reference on the pin list for @seq; when the count reaches zero, pop fully released entries off the front of the pin fifo */ void bch2_journal_pin_put(struct journal *j, u64 seq) -+{ -+ struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); -+ -+ if (atomic_dec_and_test(&pin_list->count)) { -+ spin_lock(&j->lock); -+ bch2_journal_reclaim_fast(j); -+ spin_unlock(&j->lock); -+ } -+} -+ -+/* Deactivate @pin and drop its reference on its pin list; does nothing if @pin is not active */ static inline void __journal_pin_drop(struct journal *j, -+ struct journal_entry_pin *pin) -+{ -+ struct journal_entry_pin_list *pin_list; -+ -+ if (!journal_pin_active(pin)) -+ return; -+ -+ pin_list = journal_seq_pin(j, pin->seq); -+ pin->seq = 0; -+ list_del_init(&pin->list); -+ -+ /* -+ * Unpinning a journal entry may make journal_next_bucket() succeed, if -+ * writing a new last_seq will now make another bucket available: -+ */ -+ if (atomic_dec_and_test(&pin_list->count) && -+ pin_list == &fifo_peek_front(&j->pin)) -+ bch2_journal_reclaim_fast(j); -+ else if (fifo_used(&j->pin) == 1 && -+ atomic_read(&pin_list->count) == 1) -+ journal_wake(j); -+} -+ -+void bch2_journal_pin_drop(struct journal *j, -+ struct journal_entry_pin *pin) -+{ -+ spin_lock(&j->lock); -+ __journal_pin_drop(j, pin); -+ spin_unlock(&j->lock); -+} -+ -+/* Move @pin onto the pin list for @seq, dropping whatever it pinned before; pins with a flush_fn go on ->list, pins without one go on ->flushed */ static void bch2_journal_pin_add_locked(struct journal *j, u64 seq, -+ struct journal_entry_pin *pin, -+ journal_pin_flush_fn flush_fn) -+{ -+ struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); -+ -+ __journal_pin_drop(j, pin); -+ -+ BUG_ON(!atomic_read(&pin_list->count) && seq == journal_last_seq(j)); -+ -+ atomic_inc(&pin_list->count); -+ pin->seq = seq; -+ pin->flush = flush_fn; -+ -+ list_add(&pin->list, flush_fn ? 
&pin_list->list : &pin_list->flushed); -+} -+ -+void __bch2_journal_pin_add(struct journal *j, u64 seq, -+ struct journal_entry_pin *pin, -+ journal_pin_flush_fn flush_fn) -+{ -+ spin_lock(&j->lock); -+ bch2_journal_pin_add_locked(j, seq, pin, flush_fn); -+ spin_unlock(&j->lock); -+ -+ /* -+ * If the journal is currently full, we might want to call flush_fn -+ * immediately: -+ */ -+ journal_wake(j); -+} -+ -+/* Re-pin @pin at @seq unless it is already active at an older seq; if it already pins @seq, move it back onto the list of pins still to be flushed */ void bch2_journal_pin_update(struct journal *j, u64 seq, -+ struct journal_entry_pin *pin, -+ journal_pin_flush_fn flush_fn) -+{ -+ if (journal_pin_active(pin) && pin->seq < seq) -+ return; -+ -+ spin_lock(&j->lock); -+ -+ if (pin->seq != seq) { -+ bch2_journal_pin_add_locked(j, seq, pin, flush_fn); -+ } else { -+ struct journal_entry_pin_list *pin_list = -+ journal_seq_pin(j, seq); -+ -+ /* -+ * If the pin is already pinning the right sequence number, it -+ * still might've already been flushed: -+ */ -+ list_move(&pin->list, &pin_list->list); -+ } -+ -+ spin_unlock(&j->lock); -+ -+ /* -+ * If the journal is currently full, we might want to call flush_fn -+ * immediately: -+ */ -+ journal_wake(j); -+} -+ -+/* Pin @dst at @src's sequence number if @src is active and @dst is either inactive or pinning a newer seq */ void bch2_journal_pin_copy(struct journal *j, -+ struct journal_entry_pin *dst, -+ struct journal_entry_pin *src, -+ journal_pin_flush_fn flush_fn) -+{ -+ spin_lock(&j->lock); -+ -+ if (journal_pin_active(src) && -+ (!journal_pin_active(dst) || src->seq < dst->seq)) -+ bch2_journal_pin_add_locked(j, src->seq, dst, flush_fn); -+ -+ spin_unlock(&j->lock); -+} -+ -+/** -+ * bch2_journal_pin_flush: ensure journal pin callback is no longer running -+ */ -+void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin) -+{ -+ /* the pin must already have been dropped by the caller: */ BUG_ON(journal_pin_active(pin)); -+ -+ wait_event(j->pin_flush_wait, j->flush_in_progress != pin); -+} -+ -+/* -+ * Journal reclaim: flush references to open journal entries to reclaim space in -+ * the journal -+ * -+ * May be done by the journal code in the background as needed to free up space -+ * for more journal entries, 
or as part of doing a clean shutdown, or to migrate -+ * data off of a specific device: -+ */ -+ -+/* Pick the first pin, in fifo order, that still has a flush callback pending on a pin list with seq <= @max_seq; moves it to the ->flushed list and records it in j->flush_in_progress. Returns NULL if there is none, or if JOURNAL_RECLAIM_STARTED is not set. On success *seq is the sequence number the pin was found at. */ static struct journal_entry_pin * -+journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq) -+{ -+ struct journal_entry_pin_list *pin_list; -+ struct journal_entry_pin *ret = NULL; -+ -+ if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)) -+ return NULL; -+ -+ spin_lock(&j->lock); -+ -+ fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) -+ if (*seq > max_seq || -+ (ret = list_first_entry_or_null(&pin_list->list, -+ struct journal_entry_pin, list))) -+ break; -+ -+ if (ret) { -+ list_move(&ret->list, &pin_list->flushed); -+ BUG_ON(j->flush_in_progress); -+ j->flush_in_progress = ret; -+ j->last_flushed = jiffies; -+ } -+ -+ spin_unlock(&j->lock); -+ -+ return ret; -+} -+ -+/* Flushes pins up to seq_to_flush, plus the first min_nr pins regardless of their seq; returns true if we did work */ -+static bool journal_flush_pins(struct journal *j, u64 seq_to_flush, -+ unsigned min_nr) -+{ -+ struct journal_entry_pin *pin; -+ bool ret = false; -+ u64 seq; -+ -+ lockdep_assert_held(&j->reclaim_lock); -+ -+ while ((pin = journal_get_next_pin(j, min_nr -+ ? U64_MAX : seq_to_flush, &seq))) { -+ if (min_nr) -+ min_nr--; -+ -+ pin->flush(j, pin, seq); -+ -+ /* the callback has returned: clear the marker and wake anyone waiting in bch2_journal_pin_flush() */ BUG_ON(j->flush_in_progress != pin); -+ j->flush_in_progress = NULL; -+ wake_up(&j->pin_flush_wait); -+ ret = true; -+ } -+ -+ return ret; -+} -+ -+/** -+ * bch2_journal_reclaim - free up journal buckets -+ * -+ * Background journal reclaim writes out btree nodes. It should be run -+ * early enough so that we never completely run out of journal buckets. 
-+ * -+ * High watermarks for triggering background reclaim: -+ * - FIFO has fewer than 512 entries left -+ * - fewer than 25% journal buckets free -+ * -+ * Background reclaim runs until low watermarks are reached: -+ * - FIFO has more than 1024 entries left -+ * - more than 50% journal buckets free -+ * -+ * As long as a reclaim can complete in the time it takes to fill up -+ * 512 journal entries or 25% of all journal buckets, then -+ * journal_next_bucket() should not stall. -+ */ -+void bch2_journal_reclaim(struct journal *j) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct bch_dev *ca; -+ unsigned iter, min_nr = 0; -+ u64 seq_to_flush = 0; -+ -+ /* NOTE(review): the watermarks in the comment above this function (512/1024 fifo entries, 25%/50% of buckets) do not appear in the code below, which targets half of each device's buckets and half of the pin fifo - confirm and update the comment */ lockdep_assert_held(&j->reclaim_lock); -+ -+ bch2_journal_do_discards(j); -+ -+ spin_lock(&j->lock); -+ -+ for_each_rw_member(ca, c, iter) { -+ struct journal_device *ja = &ca->journal; -+ unsigned nr_buckets, bucket_to_flush; -+ -+ if (!ja->nr) -+ continue; -+ -+ /* Try to keep the journal at most half full: */ -+ nr_buckets = ja->nr / 2; -+ -+ /* And include pre-reservations: */ -+ nr_buckets += DIV_ROUND_UP(j->prereserved.reserved, -+ (ca->mi.bucket_size << 6) - -+ journal_entry_overhead(j)); -+ -+ nr_buckets = min(nr_buckets, ja->nr); -+ -+ bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr; -+ seq_to_flush = max(seq_to_flush, -+ ja->bucket_seq[bucket_to_flush]); -+ } -+ -+ /* Also flush if the pin fifo is more than half full */ -+ seq_to_flush = max_t(s64, seq_to_flush, -+ (s64) journal_cur_seq(j) - -+ (j->pin.size >> 1)); -+ spin_unlock(&j->lock); -+ -+ /* -+ * If it's been longer than j->reclaim_delay_ms since we last flushed, -+ * make sure to flush at least one journal pin: -+ */ -+ if (time_after(jiffies, j->last_flushed + -+ msecs_to_jiffies(j->reclaim_delay_ms))) -+ min_nr = 1; -+ -+ /* reserved is more than half of prereserved.remaining: flush at least one pin, and at least up to journal_last_seq(j) */ if (j->prereserved.reserved * 2 > j->prereserved.remaining) { -+ seq_to_flush = max(seq_to_flush, journal_last_seq(j)); -+ min_nr = 1; -+ } -+ -+ journal_flush_pins(j, seq_to_flush, min_nr); -+ 
-+ if (!bch2_journal_error(j)) -+ queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, -+ msecs_to_jiffies(j->reclaim_delay_ms)); -+} -+ -+void bch2_journal_reclaim_work(struct work_struct *work) -+{ -+ struct journal *j = container_of(to_delayed_work(work), -+ struct journal, reclaim_work); -+ -+ mutex_lock(&j->reclaim_lock); -+ bch2_journal_reclaim(j); -+ mutex_unlock(&j->reclaim_lock); -+} -+ -+static int journal_flush_done(struct journal *j, u64 seq_to_flush, -+ bool *did_work) -+{ -+ int ret; -+ -+ ret = bch2_journal_error(j); -+ if (ret) -+ return ret; -+ -+ mutex_lock(&j->reclaim_lock); -+ -+ *did_work = journal_flush_pins(j, seq_to_flush, 0); -+ -+ spin_lock(&j->lock); -+ /* -+ * If journal replay hasn't completed, the unreplayed journal entries -+ * hold refs on their corresponding sequence numbers -+ */ -+ ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) || -+ journal_last_seq(j) > seq_to_flush || -+ (fifo_used(&j->pin) == 1 && -+ atomic_read(&fifo_peek_front(&j->pin).count) == 1); -+ -+ spin_unlock(&j->lock); -+ mutex_unlock(&j->reclaim_lock); -+ -+ return ret; -+} -+ -+bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) -+{ -+ bool did_work = false; -+ -+ if (!test_bit(JOURNAL_STARTED, &j->flags)) -+ return false; -+ -+ closure_wait_event(&j->async_wait, -+ journal_flush_done(j, seq_to_flush, &did_work)); -+ -+ return did_work; -+} -+ -+int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct journal_entry_pin_list *p; -+ u64 iter, seq = 0; -+ int ret = 0; -+ -+ spin_lock(&j->lock); -+ fifo_for_each_entry_ptr(p, &j->pin, iter) -+ if (dev_idx >= 0 -+ ? 
bch2_dev_list_has_dev(p->devs, dev_idx) -+ : p->devs.nr < c->opts.metadata_replicas) -+ seq = iter; -+ spin_unlock(&j->lock); -+ -+ bch2_journal_flush_pins(j, seq); -+ -+ ret = bch2_journal_error(j); -+ if (ret) -+ return ret; -+ -+ mutex_lock(&c->replicas_gc_lock); -+ bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL); -+ -+ seq = 0; -+ -+ /* j->lock is dropped around bch2_mark_replicas(), so seq is clamped to journal_last_seq(j) again on every iteration: */ spin_lock(&j->lock); -+ while (!ret && seq < j->pin.back) { -+ struct bch_replicas_padded replicas; -+ -+ seq = max(seq, journal_last_seq(j)); -+ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, -+ journal_seq_pin(j, seq)->devs); -+ seq++; -+ -+ spin_unlock(&j->lock); -+ ret = bch2_mark_replicas(c, &replicas.e); -+ spin_lock(&j->lock); -+ } -+ spin_unlock(&j->lock); -+ -+ ret = bch2_replicas_gc_end(c, ret); -+ mutex_unlock(&c->replicas_gc_lock); -+ -+ return ret; -+} -diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h -new file mode 100644 -index 000000000000..8128907a7623 ---- /dev/null -+++ b/fs/bcachefs/journal_reclaim.h -@@ -0,0 +1,69 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_JOURNAL_RECLAIM_H -+#define _BCACHEFS_JOURNAL_RECLAIM_H -+ -+#define JOURNAL_PIN (32 * 1024) -+ -+enum journal_space_from { -+ journal_space_discarded, -+ journal_space_clean_ondisk, -+ journal_space_clean, -+}; -+ -+unsigned bch2_journal_dev_buckets_available(struct journal *, -+ struct journal_device *, -+ enum journal_space_from); -+void bch2_journal_space_available(struct journal *); -+ -+/* A pin is active iff its seq is nonzero */ static inline bool journal_pin_active(struct journal_entry_pin *pin) -+{ -+ return pin->seq != 0; -+} -+ -+/* Pin list for @seq; @seq must lie in [pin.front, pin.back) */ static inline struct journal_entry_pin_list * -+journal_seq_pin(struct journal *j, u64 seq) -+{ -+ EBUG_ON(seq < j->pin.front || seq >= j->pin.back); -+ -+ return &j->pin.data[seq & j->pin.mask]; -+} -+ -+void bch2_journal_pin_put(struct journal *, u64); -+void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); -+ -+void __bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin 
*, -+ journal_pin_flush_fn); -+ -+/* Fast path: only call __bch2_journal_pin_add() (which takes j->lock) if @pin is inactive or currently pins a newer seq than @seq */ static inline void bch2_journal_pin_add(struct journal *j, u64 seq, -+ struct journal_entry_pin *pin, -+ journal_pin_flush_fn flush_fn) -+{ -+ if (unlikely(!journal_pin_active(pin) || pin->seq > seq)) -+ __bch2_journal_pin_add(j, seq, pin, flush_fn); -+} -+ -+void bch2_journal_pin_update(struct journal *, u64, -+ struct journal_entry_pin *, -+ journal_pin_flush_fn); -+ -+void bch2_journal_pin_copy(struct journal *, -+ struct journal_entry_pin *, -+ struct journal_entry_pin *, -+ journal_pin_flush_fn); -+ -+void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *); -+ -+void bch2_journal_do_discards(struct journal *); -+void bch2_journal_reclaim(struct journal *); -+void bch2_journal_reclaim_work(struct work_struct *); -+ -+bool bch2_journal_flush_pins(struct journal *, u64); -+ -+/* Flush every pin, whatever its sequence number */ static inline bool bch2_journal_flush_all_pins(struct journal *j) -+{ -+ return bch2_journal_flush_pins(j, U64_MAX); -+} -+ -+int bch2_journal_flush_device_pins(struct journal *, int); -+ -+#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */ -diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c -new file mode 100644 -index 000000000000..a21de0088753 ---- /dev/null -+++ b/fs/bcachefs/journal_seq_blacklist.c -@@ -0,0 +1,318 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "btree_iter.h" -+#include "eytzinger.h" -+#include "journal_seq_blacklist.h" -+#include "super-io.h" -+ -+/* -+ * journal_seq_blacklist machinery: -+ * -+ * To guarantee order of btree updates after a crash, we need to detect when a -+ * btree node entry (bset) is newer than the newest journal entry that was -+ * successfully written, and ignore it - effectively ignoring any btree updates -+ * that didn't make it into the journal. 
-+ * -+ * If we didn't do this, we might have two btree nodes, a and b, both with -+ * updates that weren't written to the journal yet: if b was updated after a, -+ * but b was flushed and not a - oops; on recovery we'll find that the updates -+ * to b happened, but not the updates to a that happened before it. -+ * -+ * Ignoring bsets that are newer than the newest journal entry is always safe, -+ * because everything they contain will also have been journalled - and must -+ * still be present in the journal on disk until a journal entry has been -+ * written _after_ that bset was written. -+ * -+ * To accomplish this, bsets record the newest journal sequence number they -+ * contain updates for; then, on startup, the btree code queries the journal -+ * code to ask "Is this sequence number newer than the newest journal entry? If -+ * so, ignore it." -+ * -+ * When this happens, we must blacklist that journal sequence number: the -+ * journal must not write any entries with that sequence number, and it must -+ * record that it was blacklisted so that a) on recovery we don't think we have -+ * missing journal entries and b) so that the btree code continues to ignore -+ * that bset, until that btree node is rewritten. -+ */ -+ -+/* Number of entries in the blacklist superblock field @bl; 0 if @bl is NULL */ static unsigned -+blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl) -+{ -+ return bl -+ ? 
((vstruct_end(&bl->field) - (void *) &bl->start[0]) / -+ sizeof(struct journal_seq_blacklist_entry)) -+ : 0; -+} -+ -+/* Size, in u64s, of a blacklist superblock field holding @nr entries */ -+static unsigned sb_blacklist_u64s(unsigned nr) -+{ -+ struct bch_sb_field_journal_seq_blacklist *bl; -+ -+ return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64); -+} -+ -+/* -+ * If entry @i overlaps or abuts entry @i + 1, fold entry @i + 1 into entry @i -+ * and shrink the superblock field by one entry. The caller must ensure that -+ * @i + 1 is a valid index. Returns the field pointer, as returned by the -+ * resize when a merge happened. -+ */ -+static struct bch_sb_field_journal_seq_blacklist * -+blacklist_entry_try_merge(struct bch_fs *c, -+ struct bch_sb_field_journal_seq_blacklist *bl, -+ unsigned i) -+{ -+ unsigned nr = blacklist_nr_entries(bl); -+ -+ if (le64_to_cpu(bl->start[i].end) >= -+ le64_to_cpu(bl->start[i + 1].start)) { -+ /* -+ * The merged range must cover both entries - entry i may -+ * already extend past the end of entry i + 1: -+ */ -+ if (le64_to_cpu(bl->start[i + 1].end) > -+ le64_to_cpu(bl->start[i].end)) -+ bl->start[i].end = bl->start[i + 1].end; -+ -+ /* -+ * Drop entry i + 1 by shifting the entries after it down; -+ * the merged entry i must be left in place: -+ */ -+ --nr; -+ memmove(&bl->start[i + 1], -+ &bl->start[i + 2], -+ sizeof(bl->start[0]) * (nr - i - 1)); -+ -+ bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, -+ sb_blacklist_u64s(nr)); -+ BUG_ON(!bl); -+ } -+ -+ return bl; -+} -+ -+/* -+ * Blacklist the range @start..@end in the superblock: an existing entry that -+ * the new range fully covers is widened and merged with its neighbours, -+ * otherwise a new entry is appended; then the superblock is written. -+ * Returns 0 if an identical entry already exists, -ENOMEM if the field cannot -+ * be resized, otherwise the result of bch2_write_super(). -+ */ -+int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) -+{ -+ struct bch_sb_field_journal_seq_blacklist *bl; -+ unsigned i, nr; -+ int ret = 0; -+ -+ mutex_lock(&c->sb_lock); -+ bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); -+ nr = blacklist_nr_entries(bl); -+ -+ if (bl) { -+ for (i = 0; i < nr; i++) { -+ struct journal_seq_blacklist_entry *e = -+ bl->start + i; -+ -+ if (start == le64_to_cpu(e->start) && -+ end == le64_to_cpu(e->end)) -+ goto out; -+ -+ if (start <= le64_to_cpu(e->start) && -+ end >= le64_to_cpu(e->end)) { -+ e->start = cpu_to_le64(start); -+ e->end = cpu_to_le64(end); -+ -+ if (i + 1 < nr) -+ bl = blacklist_entry_try_merge(c, -+ bl, i); -+ if (i) -+ bl = blacklist_entry_try_merge(c, -+ bl, i - 1); -+ goto out_write_sb; -+ } -+ } -+ } -+ -+ bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, -+ sb_blacklist_u64s(nr + 1)); -+ if (!bl) { -+ ret = -ENOMEM; -+ goto out; -+ } -+ -+ bl->start[nr].start = cpu_to_le64(start); -+ bl->start[nr].end = cpu_to_le64(end); -+out_write_sb: -+ c->disk_sb.sb->features[0] |= -+ 1ULL << BCH_FEATURE_journal_seq_blacklist_v3; -+ -+ ret = 
bch2_write_super(c); -+out: -+ mutex_unlock(&c->sb_lock); -+ -+ return ret; -+} -+ -+/* Orders table entries by their start sequence number; @size is unused */ static int journal_seq_blacklist_table_cmp(const void *_l, -+ const void *_r, size_t size) -+{ -+ const struct journal_seq_blacklist_table_entry *l = _l; -+ const struct journal_seq_blacklist_table_entry *r = _r; -+ -+ return cmp_int(l->start, r->start); -+} -+ -+/* Returns true if @seq lies in a blacklisted range (start <= seq < end) of the in-memory table; if @dirty, also marks the matching entry dirty, which is what bch2_blacklist_entries_gc() uses to decide which entries to keep. Returns false if there is no table. */ bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq, -+ bool dirty) -+{ -+ struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; -+ struct journal_seq_blacklist_table_entry search = { .start = seq }; -+ int idx; -+ -+ if (!t) -+ return false; -+ -+ idx = eytzinger0_find_le(t->entries, t->nr, -+ sizeof(t->entries[0]), -+ journal_seq_blacklist_table_cmp, -+ &search); -+ if (idx < 0) -+ return false; -+ -+ BUG_ON(t->entries[idx].start > seq); -+ -+ if (seq >= t->entries[idx].end) -+ return false; -+ -+ if (dirty) -+ t->entries[idx].dirty = true; -+ return true; -+} -+ -+/* Build the in-memory lookup table from the superblock blacklist field and sort it with eytzinger0_sort(); does nothing if the field is absent. Returns 0 or -ENOMEM. */ int bch2_blacklist_table_initialize(struct bch_fs *c) -+{ -+ struct bch_sb_field_journal_seq_blacklist *bl = -+ bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); -+ struct journal_seq_blacklist_table *t; -+ unsigned i, nr = blacklist_nr_entries(bl); -+ -+ BUG_ON(c->journal_seq_blacklist_table); -+ -+ if (!bl) -+ return 0; -+ -+ t = kzalloc(sizeof(*t) + sizeof(t->entries[0]) * nr, -+ GFP_KERNEL); -+ if (!t) -+ return -ENOMEM; -+ -+ t->nr = nr; -+ -+ for (i = 0; i < nr; i++) { -+ t->entries[i].start = le64_to_cpu(bl->start[i].start); -+ t->entries[i].end = le64_to_cpu(bl->start[i].end); -+ } -+ -+ eytzinger0_sort(t->entries, -+ t->nr, -+ sizeof(t->entries[0]), -+ journal_seq_blacklist_table_cmp, -+ NULL); -+ -+ c->journal_seq_blacklist_table = t; -+ return 0; -+} -+ -+static const char * -+bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_journal_seq_blacklist *bl = -+ field_to_type(f, journal_seq_blacklist); -+ struct journal_seq_blacklist_entry *i; -+ unsigned nr = 
blacklist_nr_entries(bl); -+ -+ for (i = bl->start; i < bl->start + nr; i++) { -+ /* every entry must be a non-empty range: */ if (le64_to_cpu(i->start) >= -+ le64_to_cpu(i->end)) -+ return "entry start >= end"; -+ -+ /* and must end no later than the next entry starts: */ if (i + 1 < bl->start + nr && -+ le64_to_cpu(i[0].end) > -+ le64_to_cpu(i[1].start)) -+ return "entries out of order"; -+ } -+ -+ return NULL; -+} -+ -+/* Prints the entries as space separated "start-end" pairs */ static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out, -+ struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_journal_seq_blacklist *bl = -+ field_to_type(f, journal_seq_blacklist); -+ struct journal_seq_blacklist_entry *i; -+ unsigned nr = blacklist_nr_entries(bl); -+ -+ for (i = bl->start; i < bl->start + nr; i++) { -+ if (i != bl->start) -+ pr_buf(out, " "); -+ -+ pr_buf(out, "%llu-%llu", -+ le64_to_cpu(i->start), -+ le64_to_cpu(i->end)); -+ } -+} -+ -+const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = { -+ .validate = bch2_sb_journal_seq_blacklist_validate, -+ .to_text = bch2_sb_journal_seq_blacklist_to_text -+}; -+ -+void bch2_blacklist_entries_gc(struct work_struct *work) -+{ -+ struct bch_fs *c = container_of(work, struct bch_fs, -+ journal_seq_blacklist_gc_work); -+ struct journal_seq_blacklist_table *t; -+ struct bch_sb_field_journal_seq_blacklist *bl; -+ struct journal_seq_blacklist_entry *src, *dst; -+ struct btree_trans trans; -+ unsigned i, nr, new_nr; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ /* NOTE(review): presumably walking every btree node is what gets still-needed blacklist entries marked dirty via bch2_journal_seq_is_blacklisted() - confirm against the btree read path */ for (i = 0; i < BTREE_ID_NR; i++) { -+ struct btree_iter *iter; -+ struct btree *b; -+ -+ for_each_btree_node(&trans, iter, i, POS_MIN, -+ BTREE_ITER_PREFETCH, b) -+ if (test_bit(BCH_FS_STOPPING, &c->flags)) { -+ bch2_trans_exit(&trans); -+ return; -+ } -+ bch2_trans_iter_free(&trans, iter); -+ } -+ -+ ret = bch2_trans_exit(&trans); -+ if (ret) -+ return; -+ -+ mutex_lock(&c->sb_lock); -+ bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); -+ if (!bl) -+ goto out; -+ -+ nr = blacklist_nr_entries(bl); -+ dst = bl->start; -+ -+ t = c->journal_seq_blacklist_table; -+ 
BUG_ON(nr != t->nr); -+ -+ /* Walk the superblock entries and the table side by side (the BUG_ON()s assert that both walks see the same entries) and compact the superblock array down to the entries marked dirty: */ for (src = bl->start, i = eytzinger0_first(t->nr); -+ src < bl->start + nr; -+ src++, i = eytzinger0_next(i, nr)) { -+ BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); -+ BUG_ON(t->entries[i].end != le64_to_cpu(src->end)); -+ -+ if (t->entries[i].dirty) -+ *dst++ = *src; -+ } -+ -+ new_nr = dst - bl->start; -+ -+ bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr); -+ -+ if (new_nr != nr) { -+ bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, -+ new_nr ? sb_blacklist_u64s(new_nr) : 0); -+ BUG_ON(new_nr && !bl); -+ -+ /* no entries left: clear the feature bit as well */ if (!new_nr) -+ c->disk_sb.sb->features[0] &= -+ ~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3); -+ -+ bch2_write_super(c); -+ } -+out: -+ mutex_unlock(&c->sb_lock); -+} -diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h -new file mode 100644 -index 000000000000..03f4b97247fd ---- /dev/null -+++ b/fs/bcachefs/journal_seq_blacklist.h -@@ -0,0 +1,13 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H -+#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H -+ -+bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool); -+int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64); -+int bch2_blacklist_table_initialize(struct bch_fs *); -+ -+extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist; -+ -+void bch2_blacklist_entries_gc(struct work_struct *); -+ -+#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */ -diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h -new file mode 100644 -index 000000000000..154b51b891d3 ---- /dev/null -+++ b/fs/bcachefs/journal_types.h -@@ -0,0 +1,277 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_JOURNAL_TYPES_H -+#define _BCACHEFS_JOURNAL_TYPES_H -+ -+#include -+#include -+ -+#include "alloc_types.h" -+#include "super_types.h" -+#include "fifo.h" -+ -+struct journal_res; -+ -+/* -+ * We put two of these in struct journal; we use them for writes to the 
-+ * journal that are being staged or in flight. -+ */ -+struct journal_buf { -+ struct jset *data; -+ -+ BKEY_PADDED(key); -+ -+ struct closure_waitlist wait; -+ -+ unsigned buf_size; /* size in bytes of @data */ -+ unsigned sectors; /* maximum size for current entry */ -+ unsigned disk_sectors; /* maximum size entry could have been, if -+ buf_size was bigger */ -+ unsigned u64s_reserved; -+ /* bloom filter: */ -+ unsigned long has_inode[1024 / sizeof(unsigned long)]; -+}; -+ -+/* -+ * Something that makes a journal entry dirty - i.e. a btree node that has to be -+ * flushed: -+ */ -+ -+struct journal_entry_pin_list { -+ struct list_head list; /* pins that still have a flush callback to run */ -+ struct list_head flushed; /* pins without a callback, or already picked for flushing */ -+ atomic_t count; /* references held on this journal entry */ -+ struct bch_devs_list devs; -+}; -+ -+struct journal; -+struct journal_entry_pin; -+typedef void (*journal_pin_flush_fn)(struct journal *j, -+ struct journal_entry_pin *, u64); -+ -+struct journal_entry_pin { -+ struct list_head list; -+ journal_pin_flush_fn flush; -+ u64 seq; /* sequence number pinned; 0 means the pin is not active */ -+}; -+ -+struct journal_res { -+ bool ref; -+ u8 idx; -+ u16 u64s; -+ u32 offset; -+ u64 seq; -+}; -+ -+/* -+ * For reserving space in the journal prior to getting a reservation on a -+ * particular journal entry: -+ */ -+struct journal_preres { -+ unsigned u64s; -+}; -+ -+union journal_res_state { -+ struct { -+ atomic64_t counter; -+ }; -+ -+ struct { -+ u64 v; -+ }; -+ -+ struct { -+ u64 cur_entry_offset:20, -+ idx:1, -+ prev_buf_unwritten:1, -+ buf0_count:21, -+ buf1_count:21; -+ }; -+}; -+ -+union journal_preres_state { -+ struct { -+ atomic64_t counter; -+ }; -+ -+ struct { -+ u64 v; -+ }; -+ -+ struct { -+ u32 reserved; -+ u32 remaining; -+ }; -+}; -+ -+/* bytes: */ -+#define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */ -+#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */ -+ -+/* -+ * We stash some journal state as sentinel values in cur_entry_offset: -+ * note - cur_entry_offset is in units of u64s -+ */ -+#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1) -+ -+#define JOURNAL_ENTRY_CLOSED_VAL 
(JOURNAL_ENTRY_OFFSET_MAX - 1) -+#define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX) -+ -+/* -+ * JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP, -+ * either because something's waiting on the write to complete or because it's -+ * been dirty too long and the timer's expired. -+ */ -+ -+/* Bit numbers for journal->flags (used with test_bit() and friends): */ enum { -+ JOURNAL_REPLAY_DONE, -+ JOURNAL_STARTED, -+ JOURNAL_RECLAIM_STARTED, -+ JOURNAL_NEED_WRITE, -+ JOURNAL_NOT_EMPTY, -+ JOURNAL_MAY_GET_UNRESERVED, -+}; -+ -+/* Embedded in struct bch_fs */ -+struct journal { -+ /* Fastpath stuff up front: */ -+ -+ unsigned long flags; /* JOURNAL_* bits from the enum above */ -+ -+ union journal_res_state reservations; -+ -+ /* Max size of current journal entry */ -+ unsigned cur_entry_u64s; -+ unsigned cur_entry_sectors; -+ -+ /* -+ * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if -+ * insufficient devices: -+ */ -+ int cur_entry_error; -+ -+ union journal_preres_state prereserved; -+ -+ /* Reserved space in journal entry to be used just prior to write */ -+ unsigned entry_u64s_reserved; -+ -+ unsigned buf_size_want; -+ -+ /* -+ * Two journal entries -- one is currently open for new entries, the -+ * other is possibly being written out. -+ */ -+ struct journal_buf buf[2]; -+ -+ spinlock_t lock; -+ -+ /* if nonzero, we may not open a new journal entry: */ -+ unsigned blocked; -+ -+ /* Used when waiting because the journal was full */ -+ wait_queue_head_t wait; -+ struct closure_waitlist async_wait; -+ struct closure_waitlist preres_wait; -+ -+ struct closure io; -+ struct delayed_work write_work; -+ -+ /* Sequence number of most recent journal entry (last entry in @pin) */ -+ atomic64_t seq; -+ -+ /* seq, last_seq from the most recent journal entry successfully written */ -+ u64 seq_ondisk; -+ u64 last_seq_ondisk; -+ -+ /* -+ * FIFO of journal entries whose btree updates have not yet been -+ * written out. -+ * -+ * Each entry is a reference count. The position in the FIFO is the -+ * entry's sequence number relative to @seq. 
-+ * -+ * The journal entry itself holds a reference count, put when the -+ * journal entry is written out. Each btree node modified by the journal -+ * entry also holds a reference count, put when the btree node is -+ * written. -+ * -+ * When a reference count reaches zero, the journal entry is no longer -+ * needed. When all journal entries in the oldest journal bucket are no -+ * longer needed, the bucket can be discarded and reused. -+ */ -+ struct { -+ u64 front, back, size, mask; -+ struct journal_entry_pin_list *data; -+ } pin; -+ -+ u64 replay_journal_seq; -+ u64 replay_journal_seq_end; -+ -+ struct write_point wp; -+ spinlock_t err_lock; -+ -+ struct delayed_work reclaim_work; -+ struct mutex reclaim_lock; -+ unsigned long last_flushed; /* jiffies when a pin was last picked for flushing */ -+ struct journal_entry_pin *flush_in_progress; /* pin whose flush callback is running, or NULL */ -+ wait_queue_head_t pin_flush_wait; /* woken when flush_in_progress is cleared */ -+ -+ /* protects advancing ja->discard_idx: */ -+ struct mutex discard_lock; -+ bool can_discard; /* some device has discard_idx != dirty_idx_ondisk */ -+ -+ unsigned write_delay_ms; -+ unsigned reclaim_delay_ms; -+ -+ u64 res_get_blocked_start; -+ u64 need_write_time; -+ u64 write_start_time; -+ -+ struct time_stats *write_time; -+ struct time_stats *delay_time; -+ struct time_stats *blocked_time; -+ struct time_stats *flush_seq_time; -+ -+#ifdef CONFIG_DEBUG_LOCK_ALLOC -+ struct lockdep_map res_map; -+#endif -+}; -+ -+/* -+ * Embedded in struct bch_dev. First three fields refer to the array of journal -+ * buckets, in bch_sb. -+ */ -+struct journal_device { -+ /* -+ * For each journal bucket, contains the max sequence number of the -+ * journal writes it contains - so we know when a bucket can be reused. 
-+ */ -+ u64 *bucket_seq; -+ -+ unsigned sectors_free; -+ -+ /* -+ * discard_idx <= dirty_idx_ondisk <= dirty_idx <= cur_idx: -+ */ -+ unsigned discard_idx; /* Next bucket to discard */ -+ unsigned dirty_idx_ondisk; /* first bucket not yet clean with respect to j->last_seq_ondisk */ -+ unsigned dirty_idx; /* first bucket not yet clean with respect to journal_last_seq() */ -+ unsigned cur_idx; /* Journal bucket we're currently writing to */ -+ unsigned nr; /* number of journal buckets; the indices above wrap modulo nr */ -+ -+ u64 *buckets; /* bucket number of each journal bucket, indexed like bucket_seq */ -+ -+ /* Bio for journal reads/writes to this device */ -+ struct bio *bio; -+ -+ /* for bch_journal_read_device */ -+ struct closure read; -+}; -+ -+/* -+ * journal_entry_res - reserve space in every journal entry: -+ */ -+struct journal_entry_res { -+ unsigned u64s; -+}; -+ -+#endif /* _BCACHEFS_JOURNAL_TYPES_H */ -diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c -new file mode 100644 -index 000000000000..864dfaa67b7a ---- /dev/null -+++ b/fs/bcachefs/keylist.c -@@ -0,0 +1,67 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "keylist.h" -+ -+int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s, -+ size_t nr_inline_u64s, size_t new_u64s) -+{ -+ size_t oldsize = bch2_keylist_u64s(l); -+ size_t newsize = oldsize + new_u64s; -+ u64 *old_buf = l->keys_p == inline_u64s ? 
NULL : l->keys_p; -+ u64 *new_keys; -+ -+ newsize = roundup_pow_of_two(newsize); -+ -+ if (newsize <= nr_inline_u64s || -+ (old_buf && roundup_pow_of_two(oldsize) == newsize)) -+ return 0; -+ -+ new_keys = krealloc(old_buf, sizeof(u64) * newsize, GFP_NOIO); -+ if (!new_keys) -+ return -ENOMEM; -+ -+ if (!old_buf) -+ memcpy_u64s(new_keys, inline_u64s, oldsize); -+ -+ l->keys_p = new_keys; -+ l->top_p = new_keys + oldsize; -+ -+ return 0; -+} -+ -+void bch2_keylist_add_in_order(struct keylist *l, struct bkey_i *insert) -+{ -+ struct bkey_i *where; -+ -+ for_each_keylist_key(l, where) -+ if (bkey_cmp(insert->k.p, where->k.p) < 0) -+ break; -+ -+ memmove_u64s_up((u64 *) where + insert->k.u64s, -+ where, -+ ((u64 *) l->top) - ((u64 *) where)); -+ -+ l->top_p += insert->k.u64s; -+ bkey_copy(where, insert); -+} -+ -+void bch2_keylist_pop_front(struct keylist *l) -+{ -+ l->top_p -= bch2_keylist_front(l)->k.u64s; -+ -+ memmove_u64s_down(l->keys, -+ bkey_next(l->keys), -+ bch2_keylist_u64s(l)); -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_verify_keylist_sorted(struct keylist *l) -+{ -+ struct bkey_i *k; -+ -+ for_each_keylist_key(l, k) -+ BUG_ON(bkey_next(k) != l->top && -+ bkey_cmp(k->k.p, bkey_next(k)->k.p) >= 0); -+} -+#endif -diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h -new file mode 100644 -index 000000000000..195799bb20bc ---- /dev/null -+++ b/fs/bcachefs/keylist.h -@@ -0,0 +1,76 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_KEYLIST_H -+#define _BCACHEFS_KEYLIST_H -+ -+#include "keylist_types.h" -+ -+int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t); -+void bch2_keylist_add_in_order(struct keylist *, struct bkey_i *); -+void bch2_keylist_pop_front(struct keylist *); -+ -+static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys) -+{ -+ l->top_p = l->keys_p = inline_keys; -+} -+ -+static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys) -+{ -+ if (l->keys_p != inline_keys) -+ 
kfree(l->keys_p); -+ bch2_keylist_init(l, inline_keys); -+} -+ -+static inline void bch2_keylist_push(struct keylist *l) -+{ -+ l->top = bkey_next(l->top); -+} -+ -+static inline void bch2_keylist_add(struct keylist *l, const struct bkey_i *k) -+{ -+ bkey_copy(l->top, k); -+ bch2_keylist_push(l); -+} -+ -+static inline bool bch2_keylist_empty(struct keylist *l) -+{ -+ return l->top == l->keys; -+} -+ -+static inline size_t bch2_keylist_u64s(struct keylist *l) -+{ -+ return l->top_p - l->keys_p; -+} -+ -+static inline size_t bch2_keylist_bytes(struct keylist *l) -+{ -+ return bch2_keylist_u64s(l) * sizeof(u64); -+} -+ -+static inline struct bkey_i *bch2_keylist_front(struct keylist *l) -+{ -+ return l->keys; -+} -+ -+#define for_each_keylist_key(_keylist, _k) \ -+ for (_k = (_keylist)->keys; \ -+ _k != (_keylist)->top; \ -+ _k = bkey_next(_k)) -+ -+static inline u64 keylist_sectors(struct keylist *keys) -+{ -+ struct bkey_i *k; -+ u64 ret = 0; -+ -+ for_each_keylist_key(keys, k) -+ ret += k->k.size; -+ -+ return ret; -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_verify_keylist_sorted(struct keylist *); -+#else -+static inline void bch2_verify_keylist_sorted(struct keylist *l) {} -+#endif -+ -+#endif /* _BCACHEFS_KEYLIST_H */ -diff --git a/fs/bcachefs/keylist_types.h b/fs/bcachefs/keylist_types.h -new file mode 100644 -index 000000000000..4b3ff7d8a875 ---- /dev/null -+++ b/fs/bcachefs/keylist_types.h -@@ -0,0 +1,16 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_KEYLIST_TYPES_H -+#define _BCACHEFS_KEYLIST_TYPES_H -+ -+struct keylist { -+ union { -+ struct bkey_i *keys; -+ u64 *keys_p; -+ }; -+ union { -+ struct bkey_i *top; -+ u64 *top_p; -+ }; -+}; -+ -+#endif /* _BCACHEFS_KEYLIST_TYPES_H */ -diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c -new file mode 100644 -index 000000000000..96c8690adc5b ---- /dev/null -+++ b/fs/bcachefs/migrate.c -@@ -0,0 +1,170 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Code for moving data off a 
device. -+ */ -+ -+#include "bcachefs.h" -+#include "bkey_on_stack.h" -+#include "btree_update.h" -+#include "btree_update_interior.h" -+#include "buckets.h" -+#include "extents.h" -+#include "io.h" -+#include "journal.h" -+#include "keylist.h" -+#include "migrate.h" -+#include "move.h" -+#include "replicas.h" -+#include "super-io.h" -+ -+static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, -+ unsigned dev_idx, int flags, bool metadata) -+{ -+ unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas; -+ unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST; -+ unsigned degraded = metadata ? BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED; -+ unsigned nr_good; -+ -+ bch2_bkey_drop_device(k, dev_idx); -+ -+ nr_good = bch2_bkey_durability(c, k.s_c); -+ if ((!nr_good && !(flags & lost)) || -+ (nr_good < replicas && !(flags & degraded))) -+ return -EINVAL; -+ -+ return 0; -+} -+ -+static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags, -+ enum btree_id btree_id) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct bkey_on_stack sk; -+ int ret = 0; -+ -+ bkey_on_stack_init(&sk); -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN, -+ BTREE_ITER_PREFETCH); -+ -+ while ((k = bch2_btree_iter_peek(iter)).k && -+ !(ret = bkey_err(k))) { -+ if (!bch2_bkey_has_device(k, dev_idx)) { -+ bch2_btree_iter_next(iter); -+ continue; -+ } -+ -+ bkey_on_stack_reassemble(&sk, c, k); -+ -+ ret = drop_dev_ptrs(c, bkey_i_to_s(sk.k), -+ dev_idx, flags, false); -+ if (ret) -+ break; -+ -+ /* -+ * If the new extent no longer has any pointers, bch2_extent_normalize() -+ * will do the appropriate thing with it (turning it into a -+ * KEY_TYPE_error key, or just a discard if it was a cached extent) -+ */ -+ bch2_extent_normalize(c, bkey_i_to_s(sk.k)); -+ -+ bch2_btree_iter_set_pos(iter, 
bkey_start_pos(&sk.k->k)); -+ -+ bch2_trans_update(&trans, iter, sk.k, 0); -+ -+ ret = bch2_trans_commit(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL); -+ -+ /* -+ * don't want to leave ret == -EINTR, since if we raced and -+ * something else overwrote the key we could spuriously return -+ * -EINTR below: -+ */ -+ if (ret == -EINTR) -+ ret = 0; -+ if (ret) -+ break; -+ } -+ -+ ret = bch2_trans_exit(&trans) ?: ret; -+ bkey_on_stack_exit(&sk, c); -+ -+ BUG_ON(ret == -EINTR); -+ -+ return ret; -+} -+ -+static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) -+{ -+ return __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_EXTENTS) ?: -+ __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_REFLINK); -+} -+ -+static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct closure cl; -+ struct btree *b; -+ unsigned id; -+ int ret; -+ -+ /* don't handle this yet: */ -+ if (flags & BCH_FORCE_IF_METADATA_LOST) -+ return -EINVAL; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ closure_init_stack(&cl); -+ -+ for (id = 0; id < BTREE_ID_NR; id++) { -+ for_each_btree_node(&trans, iter, id, POS_MIN, -+ BTREE_ITER_PREFETCH, b) { -+ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; -+retry: -+ if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key), -+ dev_idx)) -+ continue; -+ -+ bkey_copy(&tmp.k, &b->key); -+ -+ ret = drop_dev_ptrs(c, bkey_i_to_s(&tmp.k), -+ dev_idx, flags, true); -+ if (ret) { -+ bch_err(c, "Cannot drop device without losing data"); -+ goto err; -+ } -+ -+ ret = bch2_btree_node_update_key(c, iter, b, &tmp.k); -+ if (ret == -EINTR) { -+ b = bch2_btree_iter_peek_node(iter); -+ goto retry; -+ } -+ if (ret) { -+ bch_err(c, "Error updating btree node key: %i", ret); -+ goto err; -+ } -+ } -+ bch2_trans_iter_free(&trans, iter); -+ } -+ -+ /* flush relevant btree updates */ -+ closure_wait_event(&c->btree_interior_update_wait, -+ 
!bch2_btree_interior_updates_nr_pending(c)); -+ -+ ret = 0; -+err: -+ ret = bch2_trans_exit(&trans) ?: ret; -+ -+ BUG_ON(ret == -EINTR); -+ -+ return ret; -+} -+ -+int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags) -+{ -+ return bch2_dev_usrdata_drop(c, dev_idx, flags) ?: -+ bch2_dev_metadata_drop(c, dev_idx, flags); -+} -diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h -new file mode 100644 -index 000000000000..027efaa0d575 ---- /dev/null -+++ b/fs/bcachefs/migrate.h -@@ -0,0 +1,7 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_MIGRATE_H -+#define _BCACHEFS_MIGRATE_H -+ -+int bch2_dev_data_drop(struct bch_fs *, unsigned, int); -+ -+#endif /* _BCACHEFS_MIGRATE_H */ -diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c -new file mode 100644 -index 000000000000..b42350f9e9fb ---- /dev/null -+++ b/fs/bcachefs/move.c -@@ -0,0 +1,815 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "alloc_foreground.h" -+#include "bkey_on_stack.h" -+#include "btree_gc.h" -+#include "btree_update.h" -+#include "btree_update_interior.h" -+#include "buckets.h" -+#include "disk_groups.h" -+#include "inode.h" -+#include "io.h" -+#include "journal_reclaim.h" -+#include "move.h" -+#include "replicas.h" -+#include "super-io.h" -+#include "keylist.h" -+ -+#include -+#include -+ -+#include -+ -+#define SECTORS_IN_FLIGHT_PER_DEVICE 2048 -+ -+struct moving_io { -+ struct list_head list; -+ struct closure cl; -+ bool read_completed; -+ -+ unsigned read_sectors; -+ unsigned write_sectors; -+ -+ struct bch_read_bio rbio; -+ -+ struct migrate_write write; -+ /* Must be last since it is variable size */ -+ struct bio_vec bi_inline_vecs[0]; -+}; -+ -+struct moving_context { -+ /* Closure for waiting on all reads and writes to complete */ -+ struct closure cl; -+ -+ struct bch_move_stats *stats; -+ -+ struct list_head reads; -+ -+ /* in flight sectors: */ -+ atomic_t read_sectors; -+ atomic_t write_sectors; -+ -+ wait_queue_head_t 
wait; -+}; -+ -+static int bch2_migrate_index_update(struct bch_write_op *op) -+{ -+ struct bch_fs *c = op->c; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct migrate_write *m = -+ container_of(op, struct migrate_write, op); -+ struct keylist *keys = &op->insert_keys; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ iter = bch2_trans_get_iter(&trans, m->btree_id, -+ bkey_start_pos(&bch2_keylist_front(keys)->k), -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); -+ -+ while (1) { -+ struct bkey_s_c k; -+ struct bkey_i *insert; -+ struct bkey_i_extent *new; -+ BKEY_PADDED(k) _new, _insert; -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ bool did_work = false; -+ int nr; -+ -+ bch2_trans_reset(&trans, 0); -+ -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) { -+ if (ret == -EINTR) -+ continue; -+ break; -+ } -+ -+ new = bkey_i_to_extent(bch2_keylist_front(keys)); -+ -+ if (bversion_cmp(k.k->version, new->k.version) || -+ !bch2_bkey_matches_ptr(c, k, m->ptr, m->offset)) -+ goto nomatch; -+ -+ if (m->data_cmd == DATA_REWRITE && -+ !bch2_bkey_has_device(k, m->data_opts.rewrite_dev)) -+ goto nomatch; -+ -+ bkey_reassemble(&_insert.k, k); -+ insert = &_insert.k; -+ -+ bkey_copy(&_new.k, bch2_keylist_front(keys)); -+ new = bkey_i_to_extent(&_new.k); -+ bch2_cut_front(iter->pos, &new->k_i); -+ -+ bch2_cut_front(iter->pos, insert); -+ bch2_cut_back(new->k.p, insert); -+ bch2_cut_back(insert->k.p, &new->k_i); -+ -+ if (m->data_cmd == DATA_REWRITE) -+ bch2_bkey_drop_device(bkey_i_to_s(insert), -+ m->data_opts.rewrite_dev); -+ -+ extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) { -+ if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) { -+ /* -+ * raced with another move op? 
extent already -+ * has a pointer to the device we just wrote -+ * data to -+ */ -+ continue; -+ } -+ -+ bch2_extent_ptr_decoded_append(insert, &p); -+ did_work = true; -+ } -+ -+ if (!did_work) -+ goto nomatch; -+ -+ bch2_bkey_narrow_crcs(insert, -+ (struct bch_extent_crc_unpacked) { 0 }); -+ bch2_extent_normalize(c, bkey_i_to_s(insert)); -+ bch2_bkey_mark_replicas_cached(c, bkey_i_to_s(insert), -+ op->opts.background_target, -+ op->opts.data_replicas); -+ -+ /* -+ * If we're not fully overwriting @k, and it's compressed, we -+ * need a reservation for all the pointers in @insert -+ */ -+ nr = bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(insert)) - -+ m->nr_ptrs_reserved; -+ -+ if (insert->k.size < k.k->size && -+ bch2_bkey_sectors_compressed(k) && -+ nr > 0) { -+ ret = bch2_disk_reservation_add(c, &op->res, -+ keylist_sectors(keys) * nr, 0); -+ if (ret) -+ goto out; -+ -+ m->nr_ptrs_reserved += nr; -+ goto next; -+ } -+ -+ bch2_trans_update(&trans, iter, insert, 0); -+ -+ ret = bch2_trans_commit(&trans, &op->res, -+ op_journal_seq(op), -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_USE_RESERVE| -+ m->data_opts.btree_insert_flags); -+ if (!ret) -+ atomic_long_inc(&c->extent_migrate_done); -+ if (ret == -EINTR) -+ ret = 0; -+ if (ret) -+ break; -+next: -+ while (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) >= 0) { -+ bch2_keylist_pop_front(keys); -+ if (bch2_keylist_empty(keys)) -+ goto out; -+ } -+ continue; -+nomatch: -+ if (m->ctxt) { -+ BUG_ON(k.k->p.offset <= iter->pos.offset); -+ atomic64_inc(&m->ctxt->stats->keys_raced); -+ atomic64_add(k.k->p.offset - iter->pos.offset, -+ &m->ctxt->stats->sectors_raced); -+ } -+ atomic_long_inc(&c->extent_migrate_raced); -+ trace_move_race(&new->k); -+ bch2_btree_iter_next_slot(iter); -+ goto next; -+ } -+out: -+ bch2_trans_exit(&trans); -+ BUG_ON(ret == -EINTR); -+ return ret; -+} -+ -+void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio) -+{ -+ /* write bio must own pages: */ -+ 
BUG_ON(!m->op.wbio.bio.bi_vcnt); -+ -+ m->ptr = rbio->pick.ptr; -+ m->offset = rbio->pos.offset - rbio->pick.crc.offset; -+ m->op.devs_have = rbio->devs_have; -+ m->op.pos = rbio->pos; -+ m->op.version = rbio->version; -+ m->op.crc = rbio->pick.crc; -+ m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9; -+ -+ if (bch2_csum_type_is_encryption(m->op.crc.csum_type)) { -+ m->op.nonce = m->op.crc.nonce + m->op.crc.offset; -+ m->op.csum_type = m->op.crc.csum_type; -+ } -+ -+ if (m->data_cmd == DATA_REWRITE) -+ bch2_dev_list_drop_dev(&m->op.devs_have, m->data_opts.rewrite_dev); -+} -+ -+int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, -+ struct write_point_specifier wp, -+ struct bch_io_opts io_opts, -+ enum data_cmd data_cmd, -+ struct data_opts data_opts, -+ enum btree_id btree_id, -+ struct bkey_s_c k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ int ret; -+ -+ m->btree_id = btree_id; -+ m->data_cmd = data_cmd; -+ m->data_opts = data_opts; -+ m->nr_ptrs_reserved = 0; -+ -+ bch2_write_op_init(&m->op, c, io_opts); -+ -+ if (!bch2_bkey_is_incompressible(k)) -+ m->op.compression_type = -+ bch2_compression_opt_to_type[io_opts.background_compression ?: -+ io_opts.compression]; -+ else -+ m->op.incompressible = true; -+ -+ m->op.target = data_opts.target, -+ m->op.write_point = wp; -+ -+ if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) -+ m->op.alloc_reserve = RESERVE_MOVINGGC; -+ -+ m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS| -+ BCH_WRITE_PAGES_STABLE| -+ BCH_WRITE_PAGES_OWNED| -+ BCH_WRITE_DATA_ENCODED| -+ BCH_WRITE_FROM_INTERNAL; -+ -+ m->op.nr_replicas = 1; -+ m->op.nr_replicas_required = 1; -+ m->op.index_update_fn = bch2_migrate_index_update; -+ -+ switch (data_cmd) { -+ case DATA_ADD_REPLICAS: { -+ /* -+ * DATA_ADD_REPLICAS is used for moving data to a different -+ * device in the background, and due to compression the new copy -+ * 
might take up more space than the old copy: -+ */ -+#if 0 -+ int nr = (int) io_opts.data_replicas - -+ bch2_bkey_nr_ptrs_allocated(k); -+#endif -+ int nr = (int) io_opts.data_replicas; -+ -+ if (nr > 0) { -+ m->op.nr_replicas = m->nr_ptrs_reserved = nr; -+ -+ ret = bch2_disk_reservation_get(c, &m->op.res, -+ k.k->size, m->op.nr_replicas, 0); -+ if (ret) -+ return ret; -+ } -+ break; -+ } -+ case DATA_REWRITE: { -+ unsigned compressed_sectors = 0; -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) -+ if (!p.ptr.cached && -+ crc_is_compressed(p.crc) && -+ bch2_dev_in_target(c, p.ptr.dev, data_opts.target)) -+ compressed_sectors += p.crc.compressed_size; -+ -+ if (compressed_sectors) { -+ ret = bch2_disk_reservation_add(c, &m->op.res, -+ compressed_sectors, -+ BCH_DISK_RESERVATION_NOFAIL); -+ if (ret) -+ return ret; -+ } -+ break; -+ } -+ case DATA_PROMOTE: -+ m->op.flags |= BCH_WRITE_ALLOC_NOWAIT; -+ m->op.flags |= BCH_WRITE_CACHED; -+ break; -+ default: -+ BUG(); -+ } -+ -+ return 0; -+} -+ -+static void move_free(struct closure *cl) -+{ -+ struct moving_io *io = container_of(cl, struct moving_io, cl); -+ struct moving_context *ctxt = io->write.ctxt; -+ struct bvec_iter_all iter; -+ struct bio_vec *bv; -+ -+ bch2_disk_reservation_put(io->write.op.c, &io->write.op.res); -+ -+ bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter) -+ if (bv->bv_page) -+ __free_page(bv->bv_page); -+ -+ wake_up(&ctxt->wait); -+ -+ kfree(io); -+} -+ -+static void move_write_done(struct closure *cl) -+{ -+ struct moving_io *io = container_of(cl, struct moving_io, cl); -+ -+ atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors); -+ closure_return_with_destructor(cl, move_free); -+} -+ -+static void move_write(struct closure *cl) -+{ -+ struct moving_io *io = container_of(cl, struct moving_io, cl); -+ -+ if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) { -+ closure_return_with_destructor(cl, move_free); -+ return; -+ } -+ -+ bch2_migrate_read_done(&io->write, &io->rbio); 
-+ -+ atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); -+ closure_call(&io->write.op.cl, bch2_write, NULL, cl); -+ continue_at(cl, move_write_done, NULL); -+} -+ -+static inline struct moving_io *next_pending_write(struct moving_context *ctxt) -+{ -+ struct moving_io *io = -+ list_first_entry_or_null(&ctxt->reads, struct moving_io, list); -+ -+ return io && io->read_completed ? io : NULL; -+} -+ -+static void move_read_endio(struct bio *bio) -+{ -+ struct moving_io *io = container_of(bio, struct moving_io, rbio.bio); -+ struct moving_context *ctxt = io->write.ctxt; -+ -+ atomic_sub(io->read_sectors, &ctxt->read_sectors); -+ io->read_completed = true; -+ -+ if (next_pending_write(ctxt)) -+ wake_up(&ctxt->wait); -+ -+ closure_put(&ctxt->cl); -+} -+ -+static void do_pending_writes(struct moving_context *ctxt) -+{ -+ struct moving_io *io; -+ -+ while ((io = next_pending_write(ctxt))) { -+ list_del(&io->list); -+ closure_call(&io->cl, move_write, NULL, &ctxt->cl); -+ } -+} -+ -+#define move_ctxt_wait_event(_ctxt, _cond) \ -+do { \ -+ do_pending_writes(_ctxt); \ -+ \ -+ if (_cond) \ -+ break; \ -+ __wait_event((_ctxt)->wait, \ -+ next_pending_write(_ctxt) || (_cond)); \ -+} while (1) -+ -+static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) -+{ -+ unsigned sectors_pending = atomic_read(&ctxt->write_sectors); -+ -+ move_ctxt_wait_event(ctxt, -+ !atomic_read(&ctxt->write_sectors) || -+ atomic_read(&ctxt->write_sectors) != sectors_pending); -+} -+ -+static int bch2_move_extent(struct bch_fs *c, -+ struct moving_context *ctxt, -+ struct write_point_specifier wp, -+ struct bch_io_opts io_opts, -+ enum btree_id btree_id, -+ struct bkey_s_c k, -+ enum data_cmd data_cmd, -+ struct data_opts data_opts) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ struct moving_io *io; -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ unsigned sectors = k.k->size, pages; -+ int ret = -ENOMEM; -+ -+ move_ctxt_wait_event(ctxt, -+ 
atomic_read(&ctxt->write_sectors) < -+ SECTORS_IN_FLIGHT_PER_DEVICE); -+ -+ move_ctxt_wait_event(ctxt, -+ atomic_read(&ctxt->read_sectors) < -+ SECTORS_IN_FLIGHT_PER_DEVICE); -+ -+ /* write path might have to decompress data: */ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) -+ sectors = max_t(unsigned, sectors, p.crc.uncompressed_size); -+ -+ pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); -+ io = kzalloc(sizeof(struct moving_io) + -+ sizeof(struct bio_vec) * pages, GFP_KERNEL); -+ if (!io) -+ goto err; -+ -+ io->write.ctxt = ctxt; -+ io->read_sectors = k.k->size; -+ io->write_sectors = k.k->size; -+ -+ bio_init(&io->write.op.wbio.bio, io->bi_inline_vecs, pages); -+ bio_set_prio(&io->write.op.wbio.bio, -+ IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); -+ -+ if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9, -+ GFP_KERNEL)) -+ goto err_free; -+ -+ io->rbio.c = c; -+ io->rbio.opts = io_opts; -+ bio_init(&io->rbio.bio, io->bi_inline_vecs, pages); -+ io->rbio.bio.bi_vcnt = pages; -+ bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); -+ io->rbio.bio.bi_iter.bi_size = sectors << 9; -+ -+ bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0); -+ io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); -+ io->rbio.bio.bi_end_io = move_read_endio; -+ -+ ret = bch2_migrate_write_init(c, &io->write, wp, io_opts, -+ data_cmd, data_opts, btree_id, k); -+ if (ret) -+ goto err_free_pages; -+ -+ atomic64_inc(&ctxt->stats->keys_moved); -+ atomic64_add(k.k->size, &ctxt->stats->sectors_moved); -+ -+ trace_move_extent(k.k); -+ -+ atomic_add(io->read_sectors, &ctxt->read_sectors); -+ list_add_tail(&io->list, &ctxt->reads); -+ -+ /* -+ * dropped by move_read_endio() - guards against use after free of -+ * ctxt when doing wakeup -+ */ -+ closure_get(&ctxt->cl); -+ bch2_read_extent(c, &io->rbio, k, 0, -+ BCH_READ_NODECODE| -+ BCH_READ_LAST_FRAGMENT); -+ return 0; -+err_free_pages: -+ bio_free_pages(&io->write.op.wbio.bio); -+err_free: -+ kfree(io); -+err: -+ 
trace_move_alloc_fail(k.k); -+ return ret; -+} -+ -+static int __bch2_move_data(struct bch_fs *c, -+ struct moving_context *ctxt, -+ struct bch_ratelimit *rate, -+ struct write_point_specifier wp, -+ struct bpos start, -+ struct bpos end, -+ move_pred_fn pred, void *arg, -+ struct bch_move_stats *stats, -+ enum btree_id btree_id) -+{ -+ bool kthread = (current->flags & PF_KTHREAD) != 0; -+ struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); -+ struct bkey_on_stack sk; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct data_opts data_opts; -+ enum data_cmd data_cmd; -+ u64 delay, cur_inum = U64_MAX; -+ int ret = 0, ret2; -+ -+ bkey_on_stack_init(&sk); -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ stats->data_type = BCH_DATA_USER; -+ stats->btree_id = btree_id; -+ stats->pos = POS_MIN; -+ -+ iter = bch2_trans_get_iter(&trans, btree_id, start, -+ BTREE_ITER_PREFETCH); -+ -+ if (rate) -+ bch2_ratelimit_reset(rate); -+ -+ while (1) { -+ do { -+ delay = rate ? 
bch2_ratelimit_delay(rate) : 0; -+ -+ if (delay) { -+ bch2_trans_unlock(&trans); -+ set_current_state(TASK_INTERRUPTIBLE); -+ } -+ -+ if (kthread && (ret = kthread_should_stop())) { -+ __set_current_state(TASK_RUNNING); -+ goto out; -+ } -+ -+ if (delay) -+ schedule_timeout(delay); -+ -+ if (unlikely(freezing(current))) { -+ bch2_trans_unlock(&trans); -+ move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads)); -+ try_to_freeze(); -+ } -+ } while (delay); -+peek: -+ k = bch2_btree_iter_peek(iter); -+ -+ stats->pos = iter->pos; -+ -+ if (!k.k) -+ break; -+ ret = bkey_err(k); -+ if (ret) -+ break; -+ if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) -+ break; -+ -+ if (!bkey_extent_is_direct_data(k.k)) -+ goto next_nondata; -+ -+ if (btree_id == BTREE_ID_EXTENTS && -+ cur_inum != k.k->p.inode) { -+ struct bch_inode_unpacked inode; -+ -+ /* don't hold btree locks while looking up inode: */ -+ bch2_trans_unlock(&trans); -+ -+ io_opts = bch2_opts_to_inode_opts(c->opts); -+ if (!bch2_inode_find_by_inum(c, k.k->p.inode, &inode)) -+ bch2_io_opts_apply(&io_opts, bch2_inode_opts_get(&inode)); -+ cur_inum = k.k->p.inode; -+ goto peek; -+ } -+ -+ switch ((data_cmd = pred(c, arg, k, &io_opts, &data_opts))) { -+ case DATA_SKIP: -+ goto next; -+ case DATA_SCRUB: -+ BUG(); -+ case DATA_ADD_REPLICAS: -+ case DATA_REWRITE: -+ case DATA_PROMOTE: -+ break; -+ default: -+ BUG(); -+ } -+ -+ /* unlock before doing IO: */ -+ bkey_on_stack_reassemble(&sk, c, k); -+ k = bkey_i_to_s_c(sk.k); -+ bch2_trans_unlock(&trans); -+ -+ ret2 = bch2_move_extent(c, ctxt, wp, io_opts, btree_id, k, -+ data_cmd, data_opts); -+ if (ret2) { -+ if (ret2 == -ENOMEM) { -+ /* memory allocation failure, wait for some IO to finish */ -+ bch2_move_ctxt_wait_for_io(ctxt); -+ continue; -+ } -+ -+ /* XXX signal failure */ -+ goto next; -+ } -+ -+ if (rate) -+ bch2_ratelimit_increment(rate, k.k->size); -+next: -+ atomic64_add(k.k->size * bch2_bkey_nr_ptrs_allocated(k), -+ &stats->sectors_seen); -+next_nondata: -+ 
bch2_btree_iter_next(iter); -+ bch2_trans_cond_resched(&trans); -+ } -+out: -+ ret = bch2_trans_exit(&trans) ?: ret; -+ bkey_on_stack_exit(&sk, c); -+ -+ return ret; -+} -+ -+int bch2_move_data(struct bch_fs *c, -+ struct bch_ratelimit *rate, -+ struct write_point_specifier wp, -+ struct bpos start, -+ struct bpos end, -+ move_pred_fn pred, void *arg, -+ struct bch_move_stats *stats) -+{ -+ struct moving_context ctxt = { .stats = stats }; -+ int ret; -+ -+ closure_init_stack(&ctxt.cl); -+ INIT_LIST_HEAD(&ctxt.reads); -+ init_waitqueue_head(&ctxt.wait); -+ -+ stats->data_type = BCH_DATA_USER; -+ -+ ret = __bch2_move_data(c, &ctxt, rate, wp, start, end, -+ pred, arg, stats, BTREE_ID_EXTENTS) ?: -+ __bch2_move_data(c, &ctxt, rate, wp, start, end, -+ pred, arg, stats, BTREE_ID_REFLINK); -+ -+ move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads)); -+ closure_sync(&ctxt.cl); -+ -+ EBUG_ON(atomic_read(&ctxt.write_sectors)); -+ -+ trace_move_data(c, -+ atomic64_read(&stats->sectors_moved), -+ atomic64_read(&stats->keys_moved)); -+ -+ return ret; -+} -+ -+static int bch2_move_btree(struct bch_fs *c, -+ move_pred_fn pred, -+ void *arg, -+ struct bch_move_stats *stats) -+{ -+ struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct btree *b; -+ unsigned id; -+ struct data_opts data_opts; -+ enum data_cmd cmd; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ stats->data_type = BCH_DATA_BTREE; -+ -+ for (id = 0; id < BTREE_ID_NR; id++) { -+ stats->btree_id = id; -+ -+ for_each_btree_node(&trans, iter, id, POS_MIN, -+ BTREE_ITER_PREFETCH, b) { -+ stats->pos = iter->pos; -+ -+ switch ((cmd = pred(c, arg, -+ bkey_i_to_s_c(&b->key), -+ &io_opts, &data_opts))) { -+ case DATA_SKIP: -+ goto next; -+ case DATA_SCRUB: -+ BUG(); -+ case DATA_ADD_REPLICAS: -+ case DATA_REWRITE: -+ break; -+ default: -+ BUG(); -+ } -+ -+ ret = bch2_btree_node_rewrite(c, iter, -+ b->data->keys.seq, 0) ?: ret; -+next: -+ 
bch2_trans_cond_resched(&trans); -+ } -+ -+ ret = bch2_trans_iter_free(&trans, iter) ?: ret; -+ } -+ -+ bch2_trans_exit(&trans); -+ -+ return ret; -+} -+ -+#if 0 -+static enum data_cmd scrub_pred(struct bch_fs *c, void *arg, -+ struct bkey_s_c k, -+ struct bch_io_opts *io_opts, -+ struct data_opts *data_opts) -+{ -+ return DATA_SCRUB; -+} -+#endif -+ -+static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg, -+ struct bkey_s_c k, -+ struct bch_io_opts *io_opts, -+ struct data_opts *data_opts) -+{ -+ unsigned nr_good = bch2_bkey_durability(c, k); -+ unsigned replicas = 0; -+ -+ switch (k.k->type) { -+ case KEY_TYPE_btree_ptr: -+ replicas = c->opts.metadata_replicas; -+ break; -+ case KEY_TYPE_extent: -+ replicas = io_opts->data_replicas; -+ break; -+ } -+ -+ if (!nr_good || nr_good >= replicas) -+ return DATA_SKIP; -+ -+ data_opts->target = 0; -+ data_opts->btree_insert_flags = 0; -+ return DATA_ADD_REPLICAS; -+} -+ -+static enum data_cmd migrate_pred(struct bch_fs *c, void *arg, -+ struct bkey_s_c k, -+ struct bch_io_opts *io_opts, -+ struct data_opts *data_opts) -+{ -+ struct bch_ioctl_data *op = arg; -+ -+ if (!bch2_bkey_has_device(k, op->migrate.dev)) -+ return DATA_SKIP; -+ -+ data_opts->target = 0; -+ data_opts->btree_insert_flags = 0; -+ data_opts->rewrite_dev = op->migrate.dev; -+ return DATA_REWRITE; -+} -+ -+int bch2_data_job(struct bch_fs *c, -+ struct bch_move_stats *stats, -+ struct bch_ioctl_data op) -+{ -+ int ret = 0; -+ -+ switch (op.op) { -+ case BCH_DATA_OP_REREPLICATE: -+ stats->data_type = BCH_DATA_JOURNAL; -+ ret = bch2_journal_flush_device_pins(&c->journal, -1); -+ -+ ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret; -+ -+ closure_wait_event(&c->btree_interior_update_wait, -+ !bch2_btree_interior_updates_nr_pending(c)); -+ -+ ret = bch2_replicas_gc2(c) ?: ret; -+ -+ ret = bch2_move_data(c, NULL, -+ writepoint_hashed((unsigned long) current), -+ op.start, -+ op.end, -+ rereplicate_pred, c, stats) ?: ret; -+ ret = 
bch2_replicas_gc2(c) ?: ret; -+ break; -+ case BCH_DATA_OP_MIGRATE: -+ if (op.migrate.dev >= c->sb.nr_devices) -+ return -EINVAL; -+ -+ stats->data_type = BCH_DATA_JOURNAL; -+ ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); -+ -+ ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret; -+ ret = bch2_replicas_gc2(c) ?: ret; -+ -+ ret = bch2_move_data(c, NULL, -+ writepoint_hashed((unsigned long) current), -+ op.start, -+ op.end, -+ migrate_pred, &op, stats) ?: ret; -+ ret = bch2_replicas_gc2(c) ?: ret; -+ break; -+ default: -+ ret = -EINVAL; -+ } -+ -+ return ret; -+} -diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h -new file mode 100644 -index 000000000000..0acd1720d4f8 ---- /dev/null -+++ b/fs/bcachefs/move.h -@@ -0,0 +1,64 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_MOVE_H -+#define _BCACHEFS_MOVE_H -+ -+#include "btree_iter.h" -+#include "buckets.h" -+#include "io_types.h" -+#include "move_types.h" -+ -+struct bch_read_bio; -+struct moving_context; -+ -+enum data_cmd { -+ DATA_SKIP, -+ DATA_SCRUB, -+ DATA_ADD_REPLICAS, -+ DATA_REWRITE, -+ DATA_PROMOTE, -+}; -+ -+struct data_opts { -+ u16 target; -+ unsigned rewrite_dev; -+ int btree_insert_flags; -+}; -+ -+struct migrate_write { -+ enum btree_id btree_id; -+ enum data_cmd data_cmd; -+ struct data_opts data_opts; -+ -+ unsigned nr_ptrs_reserved; -+ -+ struct moving_context *ctxt; -+ -+ /* what we read: */ -+ struct bch_extent_ptr ptr; -+ u64 offset; -+ -+ struct bch_write_op op; -+}; -+ -+void bch2_migrate_read_done(struct migrate_write *, struct bch_read_bio *); -+int bch2_migrate_write_init(struct bch_fs *, struct migrate_write *, -+ struct write_point_specifier, -+ struct bch_io_opts, -+ enum data_cmd, struct data_opts, -+ enum btree_id, struct bkey_s_c); -+ -+typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *, -+ struct bkey_s_c, -+ struct bch_io_opts *, struct data_opts *); -+ -+int bch2_move_data(struct bch_fs *, struct bch_ratelimit *, -+ struct 
write_point_specifier, -+ struct bpos, struct bpos, -+ move_pred_fn, void *, -+ struct bch_move_stats *); -+ -+int bch2_data_job(struct bch_fs *, -+ struct bch_move_stats *, -+ struct bch_ioctl_data); -+ -+#endif /* _BCACHEFS_MOVE_H */ -diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h -new file mode 100644 -index 000000000000..fc0de165af9f ---- /dev/null -+++ b/fs/bcachefs/move_types.h -@@ -0,0 +1,17 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_MOVE_TYPES_H -+#define _BCACHEFS_MOVE_TYPES_H -+ -+struct bch_move_stats { -+ enum bch_data_type data_type; -+ enum btree_id btree_id; -+ struct bpos pos; -+ -+ atomic64_t keys_moved; -+ atomic64_t keys_raced; -+ atomic64_t sectors_moved; -+ atomic64_t sectors_seen; -+ atomic64_t sectors_raced; -+}; -+ -+#endif /* _BCACHEFS_MOVE_TYPES_H */ -diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c -new file mode 100644 -index 000000000000..0a87cd7405dd ---- /dev/null -+++ b/fs/bcachefs/movinggc.c -@@ -0,0 +1,322 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Moving/copying garbage collector -+ * -+ * Copyright 2012 Google, Inc. -+ */ -+ -+#include "bcachefs.h" -+#include "alloc_foreground.h" -+#include "btree_iter.h" -+#include "btree_update.h" -+#include "buckets.h" -+#include "clock.h" -+#include "disk_groups.h" -+#include "extents.h" -+#include "eytzinger.h" -+#include "io.h" -+#include "keylist.h" -+#include "move.h" -+#include "movinggc.h" -+#include "super-io.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* -+ * We can't use the entire copygc reserve in one iteration of copygc: we may -+ * need the buckets we're freeing up to go back into the copygc reserve to make -+ * forward progress, but if the copygc reserve is full they'll be available for -+ * any allocation - and it's possible that in a given iteration, we free up most -+ * of the buckets we're going to free before we allocate most of the buckets -+ * we're going to allocate. 
-+ * -+ * If we only use half of the reserve per iteration, then in steady state we'll -+ * always have room in the reserve for the buckets we're going to need in the -+ * next iteration: -+ */ -+#define COPYGC_BUCKETS_PER_ITER(ca) \ -+ ((ca)->free[RESERVE_MOVINGGC].size / 2) -+ -+/* -+ * Max sectors to move per iteration: Have to take into account internal -+ * fragmentation from the multiple write points for each generation: -+ */ -+#define COPYGC_SECTORS_PER_ITER(ca) \ -+ ((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca)) -+ -+static inline int sectors_used_cmp(copygc_heap *heap, -+ struct copygc_heap_entry l, -+ struct copygc_heap_entry r) -+{ -+ return cmp_int(l.sectors, r.sectors); -+} -+ -+static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) -+{ -+ const struct copygc_heap_entry *l = _l; -+ const struct copygc_heap_entry *r = _r; -+ -+ return cmp_int(l->offset, r->offset); -+} -+ -+static bool __copygc_pred(struct bch_dev *ca, -+ struct bkey_s_c k) -+{ -+ copygc_heap *h = &ca->copygc_heap; -+ const struct bch_extent_ptr *ptr = -+ bch2_bkey_has_device(k, ca->dev_idx); -+ -+ if (ptr) { -+ struct copygc_heap_entry search = { .offset = ptr->offset }; -+ -+ ssize_t i = eytzinger0_find_le(h->data, h->used, -+ sizeof(h->data[0]), -+ bucket_offset_cmp, &search); -+#if 0 -+ /* eytzinger search verify code: */ -+ ssize_t j = -1, k; -+ -+ for (k = 0; k < h->used; k++) -+ if (h->data[k].offset <= ptr->offset && -+ (j < 0 || h->data[k].offset > h->data[j].offset)) -+ j = k; -+ -+ BUG_ON(i != j); -+#endif -+ return (i >= 0 && -+ ptr->offset < h->data[i].offset + ca->mi.bucket_size && -+ ptr->gen == h->data[i].gen); -+ } -+ -+ return false; -+} -+ -+static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, -+ struct bkey_s_c k, -+ struct bch_io_opts *io_opts, -+ struct data_opts *data_opts) -+{ -+ struct bch_dev *ca = arg; -+ -+ if (!__copygc_pred(ca, k)) -+ return DATA_SKIP; -+ -+ data_opts->target = dev_to_target(ca->dev_idx); -+ 
data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE; -+ data_opts->rewrite_dev = ca->dev_idx; -+ return DATA_REWRITE; -+} -+ -+static bool have_copygc_reserve(struct bch_dev *ca) -+{ -+ bool ret; -+ -+ spin_lock(&ca->fs->freelist_lock); -+ ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) || -+ ca->allocator_state != ALLOCATOR_RUNNING; -+ spin_unlock(&ca->fs->freelist_lock); -+ -+ return ret; -+} -+ -+static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) -+{ -+ copygc_heap *h = &ca->copygc_heap; -+ struct copygc_heap_entry e, *i; -+ struct bucket_array *buckets; -+ struct bch_move_stats move_stats; -+ u64 sectors_to_move = 0, sectors_not_moved = 0; -+ u64 buckets_to_move, buckets_not_moved = 0; -+ size_t b; -+ int ret; -+ -+ memset(&move_stats, 0, sizeof(move_stats)); -+ closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca)); -+ -+ /* -+ * Find buckets with lowest sector counts, skipping completely -+ * empty buckets, by building a maxheap sorted by sector count, -+ * and repeatedly replacing the maximum element until all -+ * buckets have been visited. 
-+ */ -+ h->used = 0; -+ -+ /* -+ * We need bucket marks to be up to date - gc can't be recalculating -+ * them: -+ */ -+ down_read(&c->gc_lock); -+ down_read(&ca->bucket_lock); -+ buckets = bucket_array(ca); -+ -+ for (b = buckets->first_bucket; b < buckets->nbuckets; b++) { -+ struct bucket_mark m = READ_ONCE(buckets->b[b].mark); -+ struct copygc_heap_entry e; -+ -+ if (m.owned_by_allocator || -+ m.data_type != BCH_DATA_USER || -+ !bucket_sectors_used(m) || -+ bucket_sectors_used(m) >= ca->mi.bucket_size) -+ continue; -+ -+ e = (struct copygc_heap_entry) { -+ .gen = m.gen, -+ .sectors = bucket_sectors_used(m), -+ .offset = bucket_to_sector(ca, b), -+ }; -+ heap_add_or_replace(h, e, -sectors_used_cmp, NULL); -+ } -+ up_read(&ca->bucket_lock); -+ up_read(&c->gc_lock); -+ -+ for (i = h->data; i < h->data + h->used; i++) -+ sectors_to_move += i->sectors; -+ -+ while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) { -+ BUG_ON(!heap_pop(h, e, -sectors_used_cmp, NULL)); -+ sectors_to_move -= e.sectors; -+ } -+ -+ buckets_to_move = h->used; -+ -+ if (!buckets_to_move) -+ return; -+ -+ eytzinger0_sort(h->data, h->used, -+ sizeof(h->data[0]), -+ bucket_offset_cmp, NULL); -+ -+ ret = bch2_move_data(c, &ca->copygc_pd.rate, -+ writepoint_ptr(&ca->copygc_write_point), -+ POS_MIN, POS_MAX, -+ copygc_pred, ca, -+ &move_stats); -+ -+ down_read(&ca->bucket_lock); -+ buckets = bucket_array(ca); -+ for (i = h->data; i < h->data + h->used; i++) { -+ size_t b = sector_to_bucket(ca, i->offset); -+ struct bucket_mark m = READ_ONCE(buckets->b[b].mark); -+ -+ if (i->gen == m.gen && bucket_sectors_used(m)) { -+ sectors_not_moved += bucket_sectors_used(m); -+ buckets_not_moved++; -+ } -+ } -+ up_read(&ca->bucket_lock); -+ -+ if (sectors_not_moved && !ret) -+ bch_warn_ratelimited(c, -+ "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved (move stats: moved %llu sectors, raced %llu keys, %llu sectors)", -+ sectors_not_moved, sectors_to_move, -+ buckets_not_moved, 
buckets_to_move, -+ atomic64_read(&move_stats.sectors_moved), -+ atomic64_read(&move_stats.keys_raced), -+ atomic64_read(&move_stats.sectors_raced)); -+ -+ trace_copygc(ca, -+ atomic64_read(&move_stats.sectors_moved), sectors_not_moved, -+ buckets_to_move, buckets_not_moved); -+} -+ -+/* -+ * Copygc runs when the amount of fragmented data is above some arbitrary -+ * threshold: -+ * -+ * The threshold at the limit - when the device is full - is the amount of space -+ * we reserved in bch2_recalc_capacity; we can't have more than that amount of -+ * disk space stranded due to fragmentation and store everything we have -+ * promised to store. -+ * -+ * But we don't want to be running copygc unnecessarily when the device still -+ * has plenty of free space - rather, we want copygc to smoothly run every so -+ * often and continually reduce the amount of fragmented space as the device -+ * fills up. So, we increase the threshold by half the current free space. -+ */ -+unsigned long bch2_copygc_wait_amount(struct bch_dev *ca) -+{ -+ struct bch_fs *c = ca->fs; -+ struct bch_dev_usage usage = bch2_dev_usage_read(c, ca); -+ u64 fragmented_allowed = ca->copygc_threshold + -+ ((__dev_buckets_available(ca, usage) * ca->mi.bucket_size) >> 1); -+ -+ return max_t(s64, 0, fragmented_allowed - usage.sectors_fragmented); -+} -+ -+static int bch2_copygc_thread(void *arg) -+{ -+ struct bch_dev *ca = arg; -+ struct bch_fs *c = ca->fs; -+ struct io_clock *clock = &c->io_clock[WRITE]; -+ unsigned long last, wait; -+ -+ set_freezable(); -+ -+ while (!kthread_should_stop()) { -+ if (kthread_wait_freezable(c->copy_gc_enabled)) -+ break; -+ -+ last = atomic_long_read(&clock->now); -+ wait = bch2_copygc_wait_amount(ca); -+ -+ if (wait > clock->max_slop) { -+ bch2_kthread_io_clock_wait(clock, last + wait, -+ MAX_SCHEDULE_TIMEOUT); -+ continue; -+ } -+ -+ bch2_copygc(c, ca); -+ } -+ -+ return 0; -+} -+ -+void bch2_copygc_stop(struct bch_dev *ca) -+{ -+ ca->copygc_pd.rate.rate = UINT_MAX; -+ 
bch2_ratelimit_reset(&ca->copygc_pd.rate); -+ -+ if (ca->copygc_thread) { -+ kthread_stop(ca->copygc_thread); -+ put_task_struct(ca->copygc_thread); -+ } -+ ca->copygc_thread = NULL; -+} -+ -+int bch2_copygc_start(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct task_struct *t; -+ -+ if (ca->copygc_thread) -+ return 0; -+ -+ if (c->opts.nochanges) -+ return 0; -+ -+ if (bch2_fs_init_fault("copygc_start")) -+ return -ENOMEM; -+ -+ t = kthread_create(bch2_copygc_thread, ca, -+ "bch_copygc[%s]", ca->name); -+ if (IS_ERR(t)) -+ return PTR_ERR(t); -+ -+ get_task_struct(t); -+ -+ ca->copygc_thread = t; -+ wake_up_process(ca->copygc_thread); -+ -+ return 0; -+} -+ -+void bch2_dev_copygc_init(struct bch_dev *ca) -+{ -+ bch2_pd_controller_init(&ca->copygc_pd); -+ ca->copygc_pd.d_term = 0; -+} -diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h -new file mode 100644 -index 000000000000..dcd479632cf1 ---- /dev/null -+++ b/fs/bcachefs/movinggc.h -@@ -0,0 +1,9 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_MOVINGGC_H -+#define _BCACHEFS_MOVINGGC_H -+ -+void bch2_copygc_stop(struct bch_dev *); -+int bch2_copygc_start(struct bch_fs *, struct bch_dev *); -+void bch2_dev_copygc_init(struct bch_dev *); -+ -+#endif /* _BCACHEFS_MOVINGGC_H */ -diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c -new file mode 100644 -index 000000000000..94d6c044a27d ---- /dev/null -+++ b/fs/bcachefs/opts.c -@@ -0,0 +1,440 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include -+ -+#include "bcachefs.h" -+#include "compress.h" -+#include "disk_groups.h" -+#include "opts.h" -+#include "super-io.h" -+#include "util.h" -+ -+const char * const bch2_error_actions[] = { -+ "continue", -+ "remount-ro", -+ "panic", -+ NULL -+}; -+ -+const char * const bch2_sb_features[] = { -+#define x(f, n) #f, -+ BCH_SB_FEATURES() -+#undef x -+ NULL -+}; -+ -+const char * const bch2_csum_opts[] = { -+ "none", -+ "crc32c", -+ "crc64", -+ NULL -+}; -+ -+const char * const bch2_compression_opts[] 
= { -+#define x(t, n) #t, -+ BCH_COMPRESSION_OPTS() -+#undef x -+ NULL -+}; -+ -+const char * const bch2_str_hash_types[] = { -+ "crc32c", -+ "crc64", -+ "siphash", -+ NULL -+}; -+ -+const char * const bch2_data_types[] = { -+ "none", -+ "sb", -+ "journal", -+ "btree", -+ "data", -+ "cached", -+ NULL -+}; -+ -+const char * const bch2_cache_replacement_policies[] = { -+ "lru", -+ "fifo", -+ "random", -+ NULL -+}; -+ -+/* Default is -1; we skip past it for struct cached_dev's cache mode */ -+const char * const bch2_cache_modes[] = { -+ "default", -+ "writethrough", -+ "writeback", -+ "writearound", -+ "none", -+ NULL -+}; -+ -+const char * const bch2_dev_state[] = { -+ "readwrite", -+ "readonly", -+ "failed", -+ "spare", -+ NULL -+}; -+ -+void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) -+{ -+#define x(_name, ...) \ -+ if (opt_defined(src, _name)) \ -+ opt_set(*dst, _name, src._name); -+ -+ BCH_OPTS() -+#undef x -+} -+ -+bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id) -+{ -+ switch (id) { -+#define x(_name, ...) \ -+ case Opt_##_name: \ -+ return opt_defined(*opts, _name); -+ BCH_OPTS() -+#undef x -+ default: -+ BUG(); -+ } -+} -+ -+u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id) -+{ -+ switch (id) { -+#define x(_name, ...) \ -+ case Opt_##_name: \ -+ return opts->_name; -+ BCH_OPTS() -+#undef x -+ default: -+ BUG(); -+ } -+} -+ -+void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v) -+{ -+ switch (id) { -+#define x(_name, ...) \ -+ case Opt_##_name: \ -+ opt_set(*opts, _name, v); \ -+ break; -+ BCH_OPTS() -+#undef x -+ default: -+ BUG(); -+ } -+} -+ -+/* -+ * Initial options from superblock - here we don't want any options undefined, -+ * any options the superblock doesn't specify are set to 0: -+ */ -+struct bch_opts bch2_opts_from_sb(struct bch_sb *sb) -+{ -+ struct bch_opts opts = bch2_opts_empty(); -+ -+#define x(_name, _bits, _mode, _type, _sb_opt, ...) 
\ -+ if (_sb_opt != NO_SB_OPT) \ -+ opt_set(opts, _name, _sb_opt(sb)); -+ BCH_OPTS() -+#undef x -+ -+ return opts; -+} -+ -+const struct bch_option bch2_opt_table[] = { -+#define OPT_BOOL() .type = BCH_OPT_BOOL -+#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, .min = _min, .max = _max -+#define OPT_SECTORS(_min, _max) .type = BCH_OPT_SECTORS, .min = _min, .max = _max -+#define OPT_STR(_choices) .type = BCH_OPT_STR, .choices = _choices -+#define OPT_FN(_fn) .type = BCH_OPT_FN, \ -+ .parse = _fn##_parse, \ -+ .to_text = _fn##_to_text -+ -+#define x(_name, _bits, _mode, _type, _sb_opt, _default, _hint, _help) \ -+ [Opt_##_name] = { \ -+ .attr = { \ -+ .name = #_name, \ -+ .mode = (_mode) & OPT_RUNTIME ? 0644 : 0444, \ -+ }, \ -+ .mode = _mode, \ -+ .hint = _hint, \ -+ .help = _help, \ -+ .set_sb = SET_##_sb_opt, \ -+ _type \ -+ }, -+ -+ BCH_OPTS() -+#undef x -+}; -+ -+int bch2_opt_lookup(const char *name) -+{ -+ const struct bch_option *i; -+ -+ for (i = bch2_opt_table; -+ i < bch2_opt_table + ARRAY_SIZE(bch2_opt_table); -+ i++) -+ if (!strcmp(name, i->attr.name)) -+ return i - bch2_opt_table; -+ -+ return -1; -+} -+ -+struct synonym { -+ const char *s1, *s2; -+}; -+ -+static const struct synonym bch_opt_synonyms[] = { -+ { "quota", "usrquota" }, -+}; -+ -+static int bch2_mount_opt_lookup(const char *name) -+{ -+ const struct synonym *i; -+ -+ for (i = bch_opt_synonyms; -+ i < bch_opt_synonyms + ARRAY_SIZE(bch_opt_synonyms); -+ i++) -+ if (!strcmp(name, i->s1)) -+ name = i->s2; -+ -+ return bch2_opt_lookup(name); -+} -+ -+int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt, -+ const char *val, u64 *res) -+{ -+ ssize_t ret; -+ -+ switch (opt->type) { -+ case BCH_OPT_BOOL: -+ ret = kstrtou64(val, 10, res); -+ if (ret < 0) -+ return ret; -+ -+ if (*res > 1) -+ return -ERANGE; -+ break; -+ case BCH_OPT_UINT: -+ ret = kstrtou64(val, 10, res); -+ if (ret < 0) -+ return ret; -+ -+ if (*res < opt->min || *res >= opt->max) -+ return -ERANGE; -+ break; -+ case 
BCH_OPT_SECTORS: -+ ret = bch2_strtou64_h(val, res); -+ if (ret < 0) -+ return ret; -+ -+ if (*res & 511) -+ return -EINVAL; -+ -+ *res >>= 9; -+ -+ if (*res < opt->min || *res >= opt->max) -+ return -ERANGE; -+ break; -+ case BCH_OPT_STR: -+ ret = match_string(opt->choices, -1, val); -+ if (ret < 0) -+ return ret; -+ -+ *res = ret; -+ break; -+ case BCH_OPT_FN: -+ if (!c) -+ return -EINVAL; -+ -+ return opt->parse(c, val, res); -+ } -+ -+ return 0; -+} -+ -+void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c, -+ const struct bch_option *opt, u64 v, -+ unsigned flags) -+{ -+ if (flags & OPT_SHOW_MOUNT_STYLE) { -+ if (opt->type == BCH_OPT_BOOL) { -+ pr_buf(out, "%s%s", -+ v ? "" : "no", -+ opt->attr.name); -+ return; -+ } -+ -+ pr_buf(out, "%s=", opt->attr.name); -+ } -+ -+ switch (opt->type) { -+ case BCH_OPT_BOOL: -+ case BCH_OPT_UINT: -+ pr_buf(out, "%lli", v); -+ break; -+ case BCH_OPT_SECTORS: -+ bch2_hprint(out, v); -+ break; -+ case BCH_OPT_STR: -+ if (flags & OPT_SHOW_FULL_LIST) -+ bch2_string_opt_to_text(out, opt->choices, v); -+ else -+ pr_buf(out, opt->choices[v]); -+ break; -+ case BCH_OPT_FN: -+ opt->to_text(out, c, v); -+ break; -+ default: -+ BUG(); -+ } -+} -+ -+int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v) -+{ -+ int ret = 0; -+ -+ switch (id) { -+ case Opt_compression: -+ case Opt_background_compression: -+ ret = bch2_check_set_has_compressed_data(c, v); -+ break; -+ case Opt_erasure_code: -+ if (v) -+ bch2_check_set_feature(c, BCH_FEATURE_ec); -+ break; -+ } -+ -+ return ret; -+} -+ -+int bch2_opts_check_may_set(struct bch_fs *c) -+{ -+ unsigned i; -+ int ret; -+ -+ for (i = 0; i < bch2_opts_nr; i++) { -+ ret = bch2_opt_check_may_set(c, i, -+ bch2_opt_get_by_id(&c->opts, i)); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+int bch2_parse_mount_opts(struct bch_opts *opts, char *options) -+{ -+ char *opt, *name, *val; -+ int ret, id; -+ u64 v; -+ -+ while ((opt = strsep(&options, ",")) != NULL) { -+ name = 
strsep(&opt, "="); -+ val = opt; -+ -+ if (val) { -+ id = bch2_mount_opt_lookup(name); -+ if (id < 0) -+ goto bad_opt; -+ -+ ret = bch2_opt_parse(NULL, &bch2_opt_table[id], val, &v); -+ if (ret < 0) -+ goto bad_val; -+ } else { -+ id = bch2_mount_opt_lookup(name); -+ v = 1; -+ -+ if (id < 0 && -+ !strncmp("no", name, 2)) { -+ id = bch2_mount_opt_lookup(name + 2); -+ v = 0; -+ } -+ -+ if (id < 0) -+ goto bad_opt; -+ -+ if (bch2_opt_table[id].type != BCH_OPT_BOOL) -+ goto no_val; -+ } -+ -+ if (!(bch2_opt_table[id].mode & OPT_MOUNT)) -+ goto bad_opt; -+ -+ if (id == Opt_acl && -+ !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL)) -+ goto bad_opt; -+ -+ if ((id == Opt_usrquota || -+ id == Opt_grpquota) && -+ !IS_ENABLED(CONFIG_BCACHEFS_QUOTA)) -+ goto bad_opt; -+ -+ bch2_opt_set_by_id(opts, id, v); -+ } -+ -+ return 0; -+bad_opt: -+ pr_err("Bad mount option %s", name); -+ return -1; -+bad_val: -+ pr_err("Invalid value %s for mount option %s", val, name); -+ return -1; -+no_val: -+ pr_err("Mount option %s requires a value", name); -+ return -1; -+} -+ -+/* io opts: */ -+ -+struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src) -+{ -+ struct bch_io_opts ret = { 0 }; -+#define x(_name, _bits) \ -+ if (opt_defined(src, _name)) \ -+ opt_set(ret, _name, src._name); -+ BCH_INODE_OPTS() -+#undef x -+ return ret; -+} -+ -+struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts src) -+{ -+ struct bch_opts ret = { 0 }; -+#define x(_name, _bits) \ -+ if (opt_defined(src, _name)) \ -+ opt_set(ret, _name, src._name); -+ BCH_INODE_OPTS() -+#undef x -+ return ret; -+} -+ -+void bch2_io_opts_apply(struct bch_io_opts *dst, struct bch_io_opts src) -+{ -+#define x(_name, _bits) \ -+ if (opt_defined(src, _name)) \ -+ opt_set(*dst, _name, src._name); -+ BCH_INODE_OPTS() -+#undef x -+} -+ -+bool bch2_opt_is_inode_opt(enum bch_opt_id id) -+{ -+ static const enum bch_opt_id inode_opt_list[] = { -+#define x(_name, _bits) Opt_##_name, -+ BCH_INODE_OPTS() -+#undef x -+ }; -+ unsigned i; -+ -+ 
for (i = 0; i < ARRAY_SIZE(inode_opt_list); i++) -+ if (inode_opt_list[i] == id) -+ return true; -+ -+ return false; -+} -diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h -new file mode 100644 -index 000000000000..3b051e7a8f1d ---- /dev/null -+++ b/fs/bcachefs/opts.h -@@ -0,0 +1,435 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_OPTS_H -+#define _BCACHEFS_OPTS_H -+ -+#include -+#include -+#include -+#include -+#include "bcachefs_format.h" -+ -+extern const char * const bch2_error_actions[]; -+extern const char * const bch2_sb_features[]; -+extern const char * const bch2_csum_opts[]; -+extern const char * const bch2_compression_opts[]; -+extern const char * const bch2_str_hash_types[]; -+extern const char * const bch2_data_types[]; -+extern const char * const bch2_cache_replacement_policies[]; -+extern const char * const bch2_cache_modes[]; -+extern const char * const bch2_dev_state[]; -+ -+/* -+ * Mount options; we also store defaults in the superblock. -+ * -+ * Also exposed via sysfs: if an option is writeable, and it's also stored in -+ * the superblock, changing it via sysfs (currently? might change this) also -+ * updates the superblock. -+ * -+ * We store options as signed integers, where -1 means undefined. This means we -+ * can pass the mount options to bch2_fs_alloc() as a whole struct, and then only -+ * apply the options from that struct that are defined. 
-+ */ -+ -+/* dummy option, for options that aren't stored in the superblock */ -+LE64_BITMASK(NO_SB_OPT, struct bch_sb, flags[0], 0, 0); -+ -+/* When can be set: */ -+enum opt_mode { -+ OPT_FORMAT = (1 << 0), -+ OPT_MOUNT = (1 << 1), -+ OPT_RUNTIME = (1 << 2), -+ OPT_INODE = (1 << 3), -+ OPT_DEVICE = (1 << 4), -+}; -+ -+enum opt_type { -+ BCH_OPT_BOOL, -+ BCH_OPT_UINT, -+ BCH_OPT_SECTORS, -+ BCH_OPT_STR, -+ BCH_OPT_FN, -+}; -+ -+/** -+ * x(name, shortopt, type, in mem type, mode, sb_opt) -+ * -+ * @name - name of mount option, sysfs attribute, and struct bch_opts -+ * member -+ * -+ * @mode - when opt may be set -+ * -+ * @sb_option - name of corresponding superblock option -+ * -+ * @type - one of OPT_BOOL, OPT_UINT, OPT_STR -+ */ -+ -+/* -+ * XXX: add fields for -+ * - default value -+ * - helptext -+ */ -+ -+#ifdef __KERNEL__ -+#define RATELIMIT_ERRORS true -+#else -+#define RATELIMIT_ERRORS false -+#endif -+ -+#define BCH_OPTS() \ -+ x(block_size, u16, \ -+ OPT_FORMAT, \ -+ OPT_SECTORS(1, 128), \ -+ BCH_SB_BLOCK_SIZE, 8, \ -+ "size", NULL) \ -+ x(btree_node_size, u16, \ -+ OPT_FORMAT, \ -+ OPT_SECTORS(1, 128), \ -+ BCH_SB_BTREE_NODE_SIZE, 512, \ -+ "size", "Btree node size, default 256k") \ -+ x(errors, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_STR(bch2_error_actions), \ -+ BCH_SB_ERROR_ACTION, BCH_ON_ERROR_RO, \ -+ NULL, "Action to take on filesystem error") \ -+ x(metadata_replicas, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_UINT(1, BCH_REPLICAS_MAX), \ -+ BCH_SB_META_REPLICAS_WANT, 1, \ -+ "#", "Number of metadata replicas") \ -+ x(data_replicas, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ -+ OPT_UINT(1, BCH_REPLICAS_MAX), \ -+ BCH_SB_DATA_REPLICAS_WANT, 1, \ -+ "#", "Number of data replicas") \ -+ x(metadata_replicas_required, u8, \ -+ OPT_FORMAT|OPT_MOUNT, \ -+ OPT_UINT(1, BCH_REPLICAS_MAX), \ -+ BCH_SB_META_REPLICAS_REQ, 1, \ -+ "#", NULL) \ -+ x(data_replicas_required, u8, \ -+ OPT_FORMAT|OPT_MOUNT, \ -+ OPT_UINT(1, 
BCH_REPLICAS_MAX), \ -+ BCH_SB_DATA_REPLICAS_REQ, 1, \ -+ "#", NULL) \ -+ x(metadata_checksum, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_STR(bch2_csum_opts), \ -+ BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_CRC32C, \ -+ NULL, NULL) \ -+ x(data_checksum, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ -+ OPT_STR(bch2_csum_opts), \ -+ BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_CRC32C, \ -+ NULL, NULL) \ -+ x(compression, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ -+ OPT_STR(bch2_compression_opts), \ -+ BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_none, \ -+ NULL, NULL) \ -+ x(background_compression, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ -+ OPT_STR(bch2_compression_opts), \ -+ BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none, \ -+ NULL, NULL) \ -+ x(str_hash, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_STR(bch2_str_hash_types), \ -+ BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_SIPHASH, \ -+ NULL, "Hash function for directory entries and xattrs")\ -+ x(foreground_target, u16, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ -+ OPT_FN(bch2_opt_target), \ -+ BCH_SB_FOREGROUND_TARGET, 0, \ -+ "(target)", "Device or disk group for foreground writes") \ -+ x(background_target, u16, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ -+ OPT_FN(bch2_opt_target), \ -+ BCH_SB_BACKGROUND_TARGET, 0, \ -+ "(target)", "Device or disk group to move data to in the background")\ -+ x(promote_target, u16, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ -+ OPT_FN(bch2_opt_target), \ -+ BCH_SB_PROMOTE_TARGET, 0, \ -+ "(target)", "Device or disk group to promote data to on read")\ -+ x(erasure_code, u16, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ -+ OPT_BOOL(), \ -+ BCH_SB_ERASURE_CODE, false, \ -+ NULL, "Enable erasure coding (DO NOT USE YET)") \ -+ x(inodes_32bit, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_BOOL(), \ -+ BCH_SB_INODE_32BIT, false, \ -+ NULL, "Constrain inode numbers to 32 bits") \ -+ x(gc_reserve_percent, 
u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_UINT(5, 21), \ -+ BCH_SB_GC_RESERVE, 8, \ -+ "%", "Percentage of disk space to reserve for copygc")\ -+ x(gc_reserve_bytes, u64, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_SECTORS(0, U64_MAX), \ -+ BCH_SB_GC_RESERVE_BYTES, 0, \ -+ "%", "Amount of disk space to reserve for copygc\n" \ -+ "Takes precedence over gc_reserve_percent if set")\ -+ x(root_reserve_percent, u8, \ -+ OPT_FORMAT|OPT_MOUNT, \ -+ OPT_UINT(0, 100), \ -+ BCH_SB_ROOT_RESERVE, 0, \ -+ "%", "Percentage of disk space to reserve for superuser")\ -+ x(wide_macs, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_BOOL(), \ -+ BCH_SB_128_BIT_MACS, false, \ -+ NULL, "Store full 128 bits of cryptographic MACs, instead of 80")\ -+ x(inline_data, u8, \ -+ OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Enable inline data extents") \ -+ x(acl, u8, \ -+ OPT_FORMAT|OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ BCH_SB_POSIX_ACL, true, \ -+ NULL, "Enable POSIX acls") \ -+ x(usrquota, u8, \ -+ OPT_FORMAT|OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ BCH_SB_USRQUOTA, false, \ -+ NULL, "Enable user quotas") \ -+ x(grpquota, u8, \ -+ OPT_FORMAT|OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ BCH_SB_GRPQUOTA, false, \ -+ NULL, "Enable group quotas") \ -+ x(prjquota, u8, \ -+ OPT_FORMAT|OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ BCH_SB_PRJQUOTA, false, \ -+ NULL, "Enable project quotas") \ -+ x(reflink, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_BOOL(), \ -+ BCH_SB_REFLINK, true, \ -+ NULL, "Enable reflink support") \ -+ x(degraded, u8, \ -+ OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Allow mounting in degraded mode") \ -+ x(discard, u8, \ -+ OPT_MOUNT|OPT_DEVICE, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Enable discard/TRIM support") \ -+ x(verbose, u8, \ -+ OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Extra debugging information during mount/recovery")\ -+ x(journal_flush_disabled, u8, \ -+ OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_BOOL(), 
\ -+ NO_SB_OPT, false, \ -+ NULL, "Disable journal flush on sync/fsync\n" \ -+ "If enabled, writes can be lost, but only since the\n"\ -+ "last journal write (default 1 second)") \ -+ x(fsck, u8, \ -+ OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Run fsck on mount") \ -+ x(fix_errors, u8, \ -+ OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Fix errors during fsck without asking") \ -+ x(ratelimit_errors, u8, \ -+ OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, RATELIMIT_ERRORS, \ -+ NULL, "Ratelimit error messages during fsck") \ -+ x(nochanges, u8, \ -+ OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Super read only mode - no writes at all will be issued,\n"\ -+ "even if we have to replay the journal") \ -+ x(norecovery, u8, \ -+ OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Don't replay the journal") \ -+ x(keep_journal, u8, \ -+ OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Don't free journal entries/keys after startup")\ -+ x(read_entire_journal, u8, \ -+ 0, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Read all journal entries, not just dirty ones")\ -+ x(noexcl, u8, \ -+ OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Don't open device in exclusive mode") \ -+ x(sb, u64, \ -+ OPT_MOUNT, \ -+ OPT_UINT(0, S64_MAX), \ -+ NO_SB_OPT, BCH_SB_SECTOR, \ -+ "offset", "Sector offset of superblock") \ -+ x(read_only, u8, \ -+ 0, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, NULL) \ -+ x(nostart, u8, \ -+ 0, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Don\'t start filesystem, only open devices") \ -+ x(reconstruct_alloc, u8, \ -+ OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Reconstruct alloc btree") \ -+ x(version_upgrade, u8, \ -+ OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Set superblock to latest version,\n" \ -+ "allowing any new features to be used") \ -+ x(project, u8, \ -+ OPT_INODE, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, 
false, \ -+ NULL, NULL) \ -+ x(fs_size, u64, \ -+ OPT_DEVICE, \ -+ OPT_SECTORS(0, S64_MAX), \ -+ NO_SB_OPT, 0, \ -+ "size", "Size of filesystem on device") \ -+ x(bucket, u32, \ -+ OPT_DEVICE, \ -+ OPT_SECTORS(0, S64_MAX), \ -+ NO_SB_OPT, 0, \ -+ "size", "Size of filesystem on device") \ -+ x(durability, u8, \ -+ OPT_DEVICE, \ -+ OPT_UINT(0, BCH_REPLICAS_MAX), \ -+ NO_SB_OPT, 1, \ -+ "n", "Data written to this device will be considered\n"\ -+ "to have already been replicated n times") -+ -+struct bch_opts { -+#define x(_name, _bits, ...) unsigned _name##_defined:1; -+ BCH_OPTS() -+#undef x -+ -+#define x(_name, _bits, ...) _bits _name; -+ BCH_OPTS() -+#undef x -+}; -+ -+static const struct bch_opts bch2_opts_default = { -+#define x(_name, _bits, _mode, _type, _sb_opt, _default, ...) \ -+ ._name##_defined = true, \ -+ ._name = _default, \ -+ -+ BCH_OPTS() -+#undef x -+}; -+ -+#define opt_defined(_opts, _name) ((_opts)._name##_defined) -+ -+#define opt_get(_opts, _name) \ -+ (opt_defined(_opts, _name) ? (_opts)._name : bch2_opts_default._name) -+ -+#define opt_set(_opts, _name, _v) \ -+do { \ -+ (_opts)._name##_defined = true; \ -+ (_opts)._name = _v; \ -+} while (0) -+ -+static inline struct bch_opts bch2_opts_empty(void) -+{ -+ return (struct bch_opts) { 0 }; -+} -+ -+void bch2_opts_apply(struct bch_opts *, struct bch_opts); -+ -+enum bch_opt_id { -+#define x(_name, ...) 
Opt_##_name, -+ BCH_OPTS() -+#undef x -+ bch2_opts_nr -+}; -+ -+struct bch_fs; -+struct printbuf; -+ -+struct bch_option { -+ struct attribute attr; -+ void (*set_sb)(struct bch_sb *, u64); -+ enum opt_mode mode; -+ enum opt_type type; -+ -+ union { -+ struct { -+ u64 min, max; -+ }; -+ struct { -+ const char * const *choices; -+ }; -+ struct { -+ int (*parse)(struct bch_fs *, const char *, u64 *); -+ void (*to_text)(struct printbuf *, struct bch_fs *, u64); -+ }; -+ }; -+ -+ const char *hint; -+ const char *help; -+ -+}; -+ -+extern const struct bch_option bch2_opt_table[]; -+ -+bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id); -+u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id); -+void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64); -+ -+struct bch_opts bch2_opts_from_sb(struct bch_sb *); -+ -+int bch2_opt_lookup(const char *); -+int bch2_opt_parse(struct bch_fs *, const struct bch_option *, const char *, u64 *); -+ -+#define OPT_SHOW_FULL_LIST (1 << 0) -+#define OPT_SHOW_MOUNT_STYLE (1 << 1) -+ -+void bch2_opt_to_text(struct printbuf *, struct bch_fs *, -+ const struct bch_option *, u64, unsigned); -+ -+int bch2_opt_check_may_set(struct bch_fs *, int, u64); -+int bch2_opts_check_may_set(struct bch_fs *); -+int bch2_parse_mount_opts(struct bch_opts *, char *); -+ -+/* inode opts: */ -+ -+struct bch_io_opts { -+#define x(_name, _bits) unsigned _name##_defined:1; -+ BCH_INODE_OPTS() -+#undef x -+ -+#define x(_name, _bits) u##_bits _name; -+ BCH_INODE_OPTS() -+#undef x -+}; -+ -+struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); -+struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts); -+void bch2_io_opts_apply(struct bch_io_opts *, struct bch_io_opts); -+bool bch2_opt_is_inode_opt(enum bch_opt_id); -+ -+#endif /* _BCACHEFS_OPTS_H */ -diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c -new file mode 100644 -index 000000000000..d3032a46e7f3 ---- /dev/null -+++ b/fs/bcachefs/quota.c -@@ -0,0 +1,783 @@ 
-+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "btree_update.h" -+#include "inode.h" -+#include "quota.h" -+#include "super-io.h" -+ -+static const char *bch2_sb_validate_quota(struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_quota *q = field_to_type(f, quota); -+ -+ if (vstruct_bytes(&q->field) != sizeof(*q)) -+ return "invalid field quota: wrong size"; -+ -+ return NULL; -+} -+ -+const struct bch_sb_field_ops bch_sb_field_ops_quota = { -+ .validate = bch2_sb_validate_quota, -+}; -+ -+const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k) -+{ -+ if (k.k->p.inode >= QTYP_NR) -+ return "invalid quota type"; -+ -+ if (bkey_val_bytes(k.k) != sizeof(struct bch_quota)) -+ return "incorrect value size"; -+ -+ return NULL; -+} -+ -+static const char * const bch2_quota_counters[] = { -+ "space", -+ "inodes", -+}; -+ -+void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_quota dq = bkey_s_c_to_quota(k); -+ unsigned i; -+ -+ for (i = 0; i < Q_COUNTERS; i++) -+ pr_buf(out, "%s hardlimit %llu softlimit %llu", -+ bch2_quota_counters[i], -+ le64_to_cpu(dq.v->c[i].hardlimit), -+ le64_to_cpu(dq.v->c[i].softlimit)); -+} -+ -+#ifdef CONFIG_BCACHEFS_QUOTA -+ -+#include -+#include -+#include -+ -+static inline unsigned __next_qtype(unsigned i, unsigned qtypes) -+{ -+ qtypes >>= i; -+ return qtypes ? 
i + __ffs(qtypes) : QTYP_NR; -+} -+ -+#define for_each_set_qtype(_c, _i, _q, _qtypes) \ -+ for (_i = 0; \ -+ (_i = __next_qtype(_i, _qtypes), \ -+ _q = &(_c)->quotas[_i], \ -+ _i < QTYP_NR); \ -+ _i++) -+ -+static bool ignore_hardlimit(struct bch_memquota_type *q) -+{ -+ if (capable(CAP_SYS_RESOURCE)) -+ return true; -+#if 0 -+ struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type]; -+ -+ return capable(CAP_SYS_RESOURCE) && -+ (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD || -+ !(info->dqi_flags & DQF_ROOT_SQUASH)); -+#endif -+ return false; -+} -+ -+enum quota_msg { -+ SOFTWARN, /* Softlimit reached */ -+ SOFTLONGWARN, /* Grace time expired */ -+ HARDWARN, /* Hardlimit reached */ -+ -+ HARDBELOW, /* Usage got below inode hardlimit */ -+ SOFTBELOW, /* Usage got below inode softlimit */ -+}; -+ -+static int quota_nl[][Q_COUNTERS] = { -+ [HARDWARN][Q_SPC] = QUOTA_NL_BHARDWARN, -+ [SOFTLONGWARN][Q_SPC] = QUOTA_NL_BSOFTLONGWARN, -+ [SOFTWARN][Q_SPC] = QUOTA_NL_BSOFTWARN, -+ [HARDBELOW][Q_SPC] = QUOTA_NL_BHARDBELOW, -+ [SOFTBELOW][Q_SPC] = QUOTA_NL_BSOFTBELOW, -+ -+ [HARDWARN][Q_INO] = QUOTA_NL_IHARDWARN, -+ [SOFTLONGWARN][Q_INO] = QUOTA_NL_ISOFTLONGWARN, -+ [SOFTWARN][Q_INO] = QUOTA_NL_ISOFTWARN, -+ [HARDBELOW][Q_INO] = QUOTA_NL_IHARDBELOW, -+ [SOFTBELOW][Q_INO] = QUOTA_NL_ISOFTBELOW, -+}; -+ -+struct quota_msgs { -+ u8 nr; -+ struct { -+ u8 qtype; -+ u8 msg; -+ } m[QTYP_NR * Q_COUNTERS]; -+}; -+ -+static void prepare_msg(unsigned qtype, -+ enum quota_counters counter, -+ struct quota_msgs *msgs, -+ enum quota_msg msg_type) -+{ -+ BUG_ON(msgs->nr >= ARRAY_SIZE(msgs->m)); -+ -+ msgs->m[msgs->nr].qtype = qtype; -+ msgs->m[msgs->nr].msg = quota_nl[msg_type][counter]; -+ msgs->nr++; -+} -+ -+static void prepare_warning(struct memquota_counter *qc, -+ unsigned qtype, -+ enum quota_counters counter, -+ struct quota_msgs *msgs, -+ enum quota_msg msg_type) -+{ -+ if (qc->warning_issued & (1 << msg_type)) -+ return; -+ -+ prepare_msg(qtype, counter, msgs, 
msg_type); -+} -+ -+static void flush_warnings(struct bch_qid qid, -+ struct super_block *sb, -+ struct quota_msgs *msgs) -+{ -+ unsigned i; -+ -+ for (i = 0; i < msgs->nr; i++) -+ quota_send_warning(make_kqid(&init_user_ns, msgs->m[i].qtype, qid.q[i]), -+ sb->s_dev, msgs->m[i].msg); -+} -+ -+static int bch2_quota_check_limit(struct bch_fs *c, -+ unsigned qtype, -+ struct bch_memquota *mq, -+ struct quota_msgs *msgs, -+ enum quota_counters counter, -+ s64 v, -+ enum quota_acct_mode mode) -+{ -+ struct bch_memquota_type *q = &c->quotas[qtype]; -+ struct memquota_counter *qc = &mq->c[counter]; -+ u64 n = qc->v + v; -+ -+ BUG_ON((s64) n < 0); -+ -+ if (mode == KEY_TYPE_QUOTA_NOCHECK) -+ return 0; -+ -+ if (v <= 0) { -+ if (n < qc->hardlimit && -+ (qc->warning_issued & (1 << HARDWARN))) { -+ qc->warning_issued &= ~(1 << HARDWARN); -+ prepare_msg(qtype, counter, msgs, HARDBELOW); -+ } -+ -+ if (n < qc->softlimit && -+ (qc->warning_issued & (1 << SOFTWARN))) { -+ qc->warning_issued &= ~(1 << SOFTWARN); -+ prepare_msg(qtype, counter, msgs, SOFTBELOW); -+ } -+ -+ qc->warning_issued = 0; -+ return 0; -+ } -+ -+ if (qc->hardlimit && -+ qc->hardlimit < n && -+ !ignore_hardlimit(q)) { -+ if (mode == KEY_TYPE_QUOTA_PREALLOC) -+ return -EDQUOT; -+ -+ prepare_warning(qc, qtype, counter, msgs, HARDWARN); -+ } -+ -+ if (qc->softlimit && -+ qc->softlimit < n && -+ qc->timer && -+ ktime_get_real_seconds() >= qc->timer && -+ !ignore_hardlimit(q)) { -+ if (mode == KEY_TYPE_QUOTA_PREALLOC) -+ return -EDQUOT; -+ -+ prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN); -+ } -+ -+ if (qc->softlimit && -+ qc->softlimit < n && -+ qc->timer == 0) { -+ if (mode == KEY_TYPE_QUOTA_PREALLOC) -+ return -EDQUOT; -+ -+ prepare_warning(qc, qtype, counter, msgs, SOFTWARN); -+ -+ /* XXX is this the right one? 
*/ -+ qc->timer = ktime_get_real_seconds() + -+ q->limits[counter].warnlimit; -+ } -+ -+ return 0; -+} -+ -+int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid, -+ enum quota_counters counter, s64 v, -+ enum quota_acct_mode mode) -+{ -+ unsigned qtypes = enabled_qtypes(c); -+ struct bch_memquota_type *q; -+ struct bch_memquota *mq[QTYP_NR]; -+ struct quota_msgs msgs; -+ unsigned i; -+ int ret = 0; -+ -+ memset(&msgs, 0, sizeof(msgs)); -+ -+ for_each_set_qtype(c, i, q, qtypes) -+ mutex_lock_nested(&q->lock, i); -+ -+ for_each_set_qtype(c, i, q, qtypes) { -+ mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_NOFS); -+ if (!mq[i]) { -+ ret = -ENOMEM; -+ goto err; -+ } -+ -+ ret = bch2_quota_check_limit(c, i, mq[i], &msgs, counter, v, mode); -+ if (ret) -+ goto err; -+ } -+ -+ for_each_set_qtype(c, i, q, qtypes) -+ mq[i]->c[counter].v += v; -+err: -+ for_each_set_qtype(c, i, q, qtypes) -+ mutex_unlock(&q->lock); -+ -+ flush_warnings(qid, c->vfs_sb, &msgs); -+ -+ return ret; -+} -+ -+static void __bch2_quota_transfer(struct bch_memquota *src_q, -+ struct bch_memquota *dst_q, -+ enum quota_counters counter, s64 v) -+{ -+ BUG_ON(v > src_q->c[counter].v); -+ BUG_ON(v + dst_q->c[counter].v < v); -+ -+ src_q->c[counter].v -= v; -+ dst_q->c[counter].v += v; -+} -+ -+int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, -+ struct bch_qid dst, -+ struct bch_qid src, u64 space, -+ enum quota_acct_mode mode) -+{ -+ struct bch_memquota_type *q; -+ struct bch_memquota *src_q[3], *dst_q[3]; -+ struct quota_msgs msgs; -+ unsigned i; -+ int ret = 0; -+ -+ qtypes &= enabled_qtypes(c); -+ -+ memset(&msgs, 0, sizeof(msgs)); -+ -+ for_each_set_qtype(c, i, q, qtypes) -+ mutex_lock_nested(&q->lock, i); -+ -+ for_each_set_qtype(c, i, q, qtypes) { -+ src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_NOFS); -+ dst_q[i] = genradix_ptr_alloc(&q->table, dst.q[i], GFP_NOFS); -+ -+ if (!src_q[i] || !dst_q[i]) { -+ ret = -ENOMEM; -+ goto err; -+ } -+ -+ ret = 
bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_SPC, -+ dst_q[i]->c[Q_SPC].v + space, -+ mode); -+ if (ret) -+ goto err; -+ -+ ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_INO, -+ dst_q[i]->c[Q_INO].v + 1, -+ mode); -+ if (ret) -+ goto err; -+ } -+ -+ for_each_set_qtype(c, i, q, qtypes) { -+ __bch2_quota_transfer(src_q[i], dst_q[i], Q_SPC, space); -+ __bch2_quota_transfer(src_q[i], dst_q[i], Q_INO, 1); -+ } -+ -+err: -+ for_each_set_qtype(c, i, q, qtypes) -+ mutex_unlock(&q->lock); -+ -+ flush_warnings(dst, c->vfs_sb, &msgs); -+ -+ return ret; -+} -+ -+static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bkey_s_c_quota dq; -+ struct bch_memquota_type *q; -+ struct bch_memquota *mq; -+ unsigned i; -+ -+ BUG_ON(k.k->p.inode >= QTYP_NR); -+ -+ switch (k.k->type) { -+ case KEY_TYPE_quota: -+ dq = bkey_s_c_to_quota(k); -+ q = &c->quotas[k.k->p.inode]; -+ -+ mutex_lock(&q->lock); -+ mq = genradix_ptr_alloc(&q->table, k.k->p.offset, GFP_KERNEL); -+ if (!mq) { -+ mutex_unlock(&q->lock); -+ return -ENOMEM; -+ } -+ -+ for (i = 0; i < Q_COUNTERS; i++) { -+ mq->c[i].hardlimit = le64_to_cpu(dq.v->c[i].hardlimit); -+ mq->c[i].softlimit = le64_to_cpu(dq.v->c[i].softlimit); -+ } -+ -+ mutex_unlock(&q->lock); -+ } -+ -+ return 0; -+} -+ -+static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_QUOTAS, POS(type, 0), -+ BTREE_ITER_PREFETCH, k, ret) { -+ if (k.k->p.inode != type) -+ break; -+ -+ ret = __bch2_quota_set(c, k); -+ if (ret) -+ break; -+ } -+ -+ return bch2_trans_exit(&trans) ?: ret; -+} -+ -+void bch2_fs_quota_exit(struct bch_fs *c) -+{ -+ unsigned i; -+ -+ for (i = 0; i < ARRAY_SIZE(c->quotas); i++) -+ genradix_free(&c->quotas[i].table); -+} -+ -+void bch2_fs_quota_init(struct bch_fs *c) -+{ -+ unsigned i; -+ -+ for (i = 0; i < 
ARRAY_SIZE(c->quotas); i++) -+ mutex_init(&c->quotas[i].lock); -+} -+ -+static void bch2_sb_quota_read(struct bch_fs *c) -+{ -+ struct bch_sb_field_quota *sb_quota; -+ unsigned i, j; -+ -+ sb_quota = bch2_sb_get_quota(c->disk_sb.sb); -+ if (!sb_quota) -+ return; -+ -+ for (i = 0; i < QTYP_NR; i++) { -+ struct bch_memquota_type *q = &c->quotas[i]; -+ -+ for (j = 0; j < Q_COUNTERS; j++) { -+ q->limits[j].timelimit = -+ le32_to_cpu(sb_quota->q[i].c[j].timelimit); -+ q->limits[j].warnlimit = -+ le32_to_cpu(sb_quota->q[i].c[j].warnlimit); -+ } -+ } -+} -+ -+int bch2_fs_quota_read(struct bch_fs *c) -+{ -+ unsigned i, qtypes = enabled_qtypes(c); -+ struct bch_memquota_type *q; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bch_inode_unpacked u; -+ struct bkey_s_c k; -+ int ret; -+ -+ mutex_lock(&c->sb_lock); -+ bch2_sb_quota_read(c); -+ mutex_unlock(&c->sb_lock); -+ -+ for_each_set_qtype(c, i, q, qtypes) { -+ ret = bch2_quota_init_type(c, i); -+ if (ret) -+ return ret; -+ } -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN, -+ BTREE_ITER_PREFETCH, k, ret) { -+ switch (k.k->type) { -+ case KEY_TYPE_inode: -+ ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u); -+ if (ret) -+ return ret; -+ -+ bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors, -+ KEY_TYPE_QUOTA_NOCHECK); -+ bch2_quota_acct(c, bch_qid(&u), Q_INO, 1, -+ KEY_TYPE_QUOTA_NOCHECK); -+ } -+ } -+ return bch2_trans_exit(&trans) ?: ret; -+} -+ -+/* Enable/disable/delete quotas for an entire filesystem: */ -+ -+static int bch2_quota_enable(struct super_block *sb, unsigned uflags) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ -+ if (sb->s_flags & SB_RDONLY) -+ return -EROFS; -+ -+ /* Accounting must be enabled at mount time: */ -+ if (uflags & (FS_QUOTA_UDQ_ACCT|FS_QUOTA_GDQ_ACCT|FS_QUOTA_PDQ_ACCT)) -+ return -EINVAL; -+ -+ /* Can't enable enforcement without accounting: */ -+ if ((uflags & FS_QUOTA_UDQ_ENFD) && !c->opts.usrquota) -+ return 
-EINVAL; -+ -+ if ((uflags & FS_QUOTA_GDQ_ENFD) && !c->opts.grpquota) -+ return -EINVAL; -+ -+ if (uflags & FS_QUOTA_PDQ_ENFD && !c->opts.prjquota) -+ return -EINVAL; -+ -+ mutex_lock(&c->sb_lock); -+ if (uflags & FS_QUOTA_UDQ_ENFD) -+ SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true); -+ -+ if (uflags & FS_QUOTA_GDQ_ENFD) -+ SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, true); -+ -+ if (uflags & FS_QUOTA_PDQ_ENFD) -+ SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true); -+ -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ return 0; -+} -+ -+static int bch2_quota_disable(struct super_block *sb, unsigned uflags) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ -+ if (sb->s_flags & SB_RDONLY) -+ return -EROFS; -+ -+ mutex_lock(&c->sb_lock); -+ if (uflags & FS_QUOTA_UDQ_ENFD) -+ SET_BCH_SB_USRQUOTA(c->disk_sb.sb, false); -+ -+ if (uflags & FS_QUOTA_GDQ_ENFD) -+ SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, false); -+ -+ if (uflags & FS_QUOTA_PDQ_ENFD) -+ SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, false); -+ -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ return 0; -+} -+ -+static int bch2_quota_remove(struct super_block *sb, unsigned uflags) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ int ret; -+ -+ if (sb->s_flags & SB_RDONLY) -+ return -EROFS; -+ -+ if (uflags & FS_USER_QUOTA) { -+ if (c->opts.usrquota) -+ return -EINVAL; -+ -+ ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, -+ POS(QTYP_USR, 0), -+ POS(QTYP_USR + 1, 0), -+ NULL); -+ if (ret) -+ return ret; -+ } -+ -+ if (uflags & FS_GROUP_QUOTA) { -+ if (c->opts.grpquota) -+ return -EINVAL; -+ -+ ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, -+ POS(QTYP_GRP, 0), -+ POS(QTYP_GRP + 1, 0), -+ NULL); -+ if (ret) -+ return ret; -+ } -+ -+ if (uflags & FS_PROJ_QUOTA) { -+ if (c->opts.prjquota) -+ return -EINVAL; -+ -+ ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, -+ POS(QTYP_PRJ, 0), -+ POS(QTYP_PRJ + 1, 0), -+ NULL); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+/* -+ * Return quota status information, such as enforcements, quota 
file inode -+ * numbers etc. -+ */ -+static int bch2_quota_get_state(struct super_block *sb, struct qc_state *state) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ unsigned qtypes = enabled_qtypes(c); -+ unsigned i; -+ -+ memset(state, 0, sizeof(*state)); -+ -+ for (i = 0; i < QTYP_NR; i++) { -+ state->s_state[i].flags |= QCI_SYSFILE; -+ -+ if (!(qtypes & (1 << i))) -+ continue; -+ -+ state->s_state[i].flags |= QCI_ACCT_ENABLED; -+ -+ state->s_state[i].spc_timelimit = c->quotas[i].limits[Q_SPC].timelimit; -+ state->s_state[i].spc_warnlimit = c->quotas[i].limits[Q_SPC].warnlimit; -+ -+ state->s_state[i].ino_timelimit = c->quotas[i].limits[Q_INO].timelimit; -+ state->s_state[i].ino_warnlimit = c->quotas[i].limits[Q_INO].warnlimit; -+ } -+ -+ return 0; -+} -+ -+/* -+ * Adjust quota timers & warnings -+ */ -+static int bch2_quota_set_info(struct super_block *sb, int type, -+ struct qc_info *info) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ struct bch_sb_field_quota *sb_quota; -+ struct bch_memquota_type *q; -+ -+ if (sb->s_flags & SB_RDONLY) -+ return -EROFS; -+ -+ if (type >= QTYP_NR) -+ return -EINVAL; -+ -+ if (!((1 << type) & enabled_qtypes(c))) -+ return -ESRCH; -+ -+ if (info->i_fieldmask & -+ ~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS)) -+ return -EINVAL; -+ -+ q = &c->quotas[type]; -+ -+ mutex_lock(&c->sb_lock); -+ sb_quota = bch2_sb_get_quota(c->disk_sb.sb); -+ if (!sb_quota) { -+ sb_quota = bch2_sb_resize_quota(&c->disk_sb, -+ sizeof(*sb_quota) / sizeof(u64)); -+ if (!sb_quota) -+ return -ENOSPC; -+ } -+ -+ if (info->i_fieldmask & QC_SPC_TIMER) -+ sb_quota->q[type].c[Q_SPC].timelimit = -+ cpu_to_le32(info->i_spc_timelimit); -+ -+ if (info->i_fieldmask & QC_SPC_WARNS) -+ sb_quota->q[type].c[Q_SPC].warnlimit = -+ cpu_to_le32(info->i_spc_warnlimit); -+ -+ if (info->i_fieldmask & QC_INO_TIMER) -+ sb_quota->q[type].c[Q_INO].timelimit = -+ cpu_to_le32(info->i_ino_timelimit); -+ -+ if (info->i_fieldmask & QC_INO_WARNS) -+ sb_quota->q[type].c[Q_INO].warnlimit 
= -+ cpu_to_le32(info->i_ino_warnlimit); -+ -+ bch2_sb_quota_read(c); -+ -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ return 0; -+} -+ -+/* Get/set individual quotas: */ -+ -+static void __bch2_quota_get(struct qc_dqblk *dst, struct bch_memquota *src) -+{ -+ dst->d_space = src->c[Q_SPC].v << 9; -+ dst->d_spc_hardlimit = src->c[Q_SPC].hardlimit << 9; -+ dst->d_spc_softlimit = src->c[Q_SPC].softlimit << 9; -+ dst->d_spc_timer = src->c[Q_SPC].timer; -+ dst->d_spc_warns = src->c[Q_SPC].warns; -+ -+ dst->d_ino_count = src->c[Q_INO].v; -+ dst->d_ino_hardlimit = src->c[Q_INO].hardlimit; -+ dst->d_ino_softlimit = src->c[Q_INO].softlimit; -+ dst->d_ino_timer = src->c[Q_INO].timer; -+ dst->d_ino_warns = src->c[Q_INO].warns; -+} -+ -+static int bch2_get_quota(struct super_block *sb, struct kqid kqid, -+ struct qc_dqblk *qdq) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ struct bch_memquota_type *q = &c->quotas[kqid.type]; -+ qid_t qid = from_kqid(&init_user_ns, kqid); -+ struct bch_memquota *mq; -+ -+ memset(qdq, 0, sizeof(*qdq)); -+ -+ mutex_lock(&q->lock); -+ mq = genradix_ptr(&q->table, qid); -+ if (mq) -+ __bch2_quota_get(qdq, mq); -+ mutex_unlock(&q->lock); -+ -+ return 0; -+} -+ -+static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid, -+ struct qc_dqblk *qdq) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ struct bch_memquota_type *q = &c->quotas[kqid->type]; -+ qid_t qid = from_kqid(&init_user_ns, *kqid); -+ struct genradix_iter iter; -+ struct bch_memquota *mq; -+ int ret = 0; -+ -+ mutex_lock(&q->lock); -+ -+ genradix_for_each_from(&q->table, iter, mq, qid) -+ if (memcmp(mq, page_address(ZERO_PAGE(0)), sizeof(*mq))) { -+ __bch2_quota_get(qdq, mq); -+ *kqid = make_kqid(current_user_ns(), kqid->type, iter.pos); -+ goto found; -+ } -+ -+ ret = -ENOENT; -+found: -+ mutex_unlock(&q->lock); -+ return ret; -+} -+ -+static int bch2_set_quota_trans(struct btree_trans *trans, -+ struct bkey_i_quota *new_quota, -+ struct qc_dqblk *qdq) -+{ -+ struct 
btree_iter *iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ iter = bch2_trans_get_iter(trans, BTREE_ID_QUOTAS, new_quota->k.p, -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); -+ k = bch2_btree_iter_peek_slot(iter); -+ -+ ret = bkey_err(k); -+ if (unlikely(ret)) -+ return ret; -+ -+ if (k.k->type == KEY_TYPE_quota) -+ new_quota->v = *bkey_s_c_to_quota(k).v; -+ -+ if (qdq->d_fieldmask & QC_SPC_SOFT) -+ new_quota->v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit >> 9); -+ if (qdq->d_fieldmask & QC_SPC_HARD) -+ new_quota->v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit >> 9); -+ -+ if (qdq->d_fieldmask & QC_INO_SOFT) -+ new_quota->v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit); -+ if (qdq->d_fieldmask & QC_INO_HARD) -+ new_quota->v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit); -+ -+ return bch2_trans_update(trans, iter, &new_quota->k_i, 0); -+} -+ -+static int bch2_set_quota(struct super_block *sb, struct kqid qid, -+ struct qc_dqblk *qdq) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ struct btree_trans trans; -+ struct bkey_i_quota new_quota; -+ int ret; -+ -+ if (sb->s_flags & SB_RDONLY) -+ return -EROFS; -+ -+ bkey_quota_init(&new_quota.k_i); -+ new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid)); -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOUNLOCK, -+ bch2_set_quota_trans(&trans, &new_quota, qdq)) ?: -+ __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i)); -+ -+ bch2_trans_exit(&trans); -+ -+ return ret; -+} -+ -+const struct quotactl_ops bch2_quotactl_operations = { -+ .quota_enable = bch2_quota_enable, -+ .quota_disable = bch2_quota_disable, -+ .rm_xquota = bch2_quota_remove, -+ -+ .get_state = bch2_quota_get_state, -+ .set_info = bch2_quota_set_info, -+ -+ .get_dqblk = bch2_get_quota, -+ .get_nextdqblk = bch2_get_next_quota, -+ .set_dqblk = bch2_set_quota, -+}; -+ -+#endif /* CONFIG_BCACHEFS_QUOTA */ -diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h -new file mode 100644 
-index 000000000000..51e4f9713ef0 ---- /dev/null -+++ b/fs/bcachefs/quota.h -@@ -0,0 +1,71 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_QUOTA_H -+#define _BCACHEFS_QUOTA_H -+ -+#include "inode.h" -+#include "quota_types.h" -+ -+extern const struct bch_sb_field_ops bch_sb_field_ops_quota; -+ -+const char *bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c); -+void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+ -+#define bch2_bkey_ops_quota (struct bkey_ops) { \ -+ .key_invalid = bch2_quota_invalid, \ -+ .val_to_text = bch2_quota_to_text, \ -+} -+ -+static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u) -+{ -+ return (struct bch_qid) { -+ .q[QTYP_USR] = u->bi_uid, -+ .q[QTYP_GRP] = u->bi_gid, -+ .q[QTYP_PRJ] = u->bi_project ? u->bi_project - 1 : 0, -+ }; -+} -+ -+static inline unsigned enabled_qtypes(struct bch_fs *c) -+{ -+ return ((c->opts.usrquota << QTYP_USR)| -+ (c->opts.grpquota << QTYP_GRP)| -+ (c->opts.prjquota << QTYP_PRJ)); -+} -+ -+#ifdef CONFIG_BCACHEFS_QUOTA -+ -+int bch2_quota_acct(struct bch_fs *, struct bch_qid, enum quota_counters, -+ s64, enum quota_acct_mode); -+ -+int bch2_quota_transfer(struct bch_fs *, unsigned, struct bch_qid, -+ struct bch_qid, u64, enum quota_acct_mode); -+ -+void bch2_fs_quota_exit(struct bch_fs *); -+void bch2_fs_quota_init(struct bch_fs *); -+int bch2_fs_quota_read(struct bch_fs *); -+ -+extern const struct quotactl_ops bch2_quotactl_operations; -+ -+#else -+ -+static inline int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid, -+ enum quota_counters counter, s64 v, -+ enum quota_acct_mode mode) -+{ -+ return 0; -+} -+ -+static inline int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, -+ struct bch_qid dst, -+ struct bch_qid src, u64 space, -+ enum quota_acct_mode mode) -+{ -+ return 0; -+} -+ -+static inline void bch2_fs_quota_exit(struct bch_fs *c) {} -+static inline void bch2_fs_quota_init(struct bch_fs *c) {} -+static inline int 
bch2_fs_quota_read(struct bch_fs *c) { return 0; } -+ -+#endif -+ -+#endif /* _BCACHEFS_QUOTA_H */ -diff --git a/fs/bcachefs/quota_types.h b/fs/bcachefs/quota_types.h -new file mode 100644 -index 000000000000..6a136083d389 ---- /dev/null -+++ b/fs/bcachefs/quota_types.h -@@ -0,0 +1,43 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_QUOTA_TYPES_H -+#define _BCACHEFS_QUOTA_TYPES_H -+ -+#include -+ -+struct bch_qid { -+ u32 q[QTYP_NR]; -+}; -+ -+enum quota_acct_mode { -+ KEY_TYPE_QUOTA_PREALLOC, -+ KEY_TYPE_QUOTA_WARN, -+ KEY_TYPE_QUOTA_NOCHECK, -+}; -+ -+struct memquota_counter { -+ u64 v; -+ u64 hardlimit; -+ u64 softlimit; -+ s64 timer; -+ int warns; -+ int warning_issued; -+}; -+ -+struct bch_memquota { -+ struct memquota_counter c[Q_COUNTERS]; -+}; -+ -+typedef GENRADIX(struct bch_memquota) bch_memquota_table; -+ -+struct quota_limit { -+ u32 timelimit; -+ u32 warnlimit; -+}; -+ -+struct bch_memquota_type { -+ struct quota_limit limits[Q_COUNTERS]; -+ bch_memquota_table table; -+ struct mutex lock; -+}; -+ -+#endif /* _BCACHEFS_QUOTA_TYPES_H */ -diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c -new file mode 100644 -index 000000000000..e15a2b1dc5d0 ---- /dev/null -+++ b/fs/bcachefs/rebalance.c -@@ -0,0 +1,334 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "alloc_foreground.h" -+#include "btree_iter.h" -+#include "buckets.h" -+#include "clock.h" -+#include "disk_groups.h" -+#include "extents.h" -+#include "io.h" -+#include "move.h" -+#include "rebalance.h" -+#include "super-io.h" -+ -+#include -+#include -+#include -+#include -+ -+/* -+ * Check if an extent should be moved: -+ * returns -1 if it should not be moved, or -+ * device of pointer that should be moved, if known, or INT_MAX if unknown -+ */ -+static int __bch2_rebalance_pred(struct bch_fs *c, -+ struct bkey_s_c k, -+ struct bch_io_opts *io_opts) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ 
struct extent_ptr_decoded p; -+ -+ if (io_opts->background_compression && -+ !bch2_bkey_is_incompressible(k)) -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) -+ if (!p.ptr.cached && -+ p.crc.compression_type != -+ bch2_compression_opt_to_type[io_opts->background_compression]) -+ return p.ptr.dev; -+ -+ if (io_opts->background_target) -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) -+ if (!p.ptr.cached && -+ !bch2_dev_in_target(c, p.ptr.dev, io_opts->background_target)) -+ return p.ptr.dev; -+ -+ return -1; -+} -+ -+void bch2_rebalance_add_key(struct bch_fs *c, -+ struct bkey_s_c k, -+ struct bch_io_opts *io_opts) -+{ -+ atomic64_t *counter; -+ int dev; -+ -+ dev = __bch2_rebalance_pred(c, k, io_opts); -+ if (dev < 0) -+ return; -+ -+ counter = dev < INT_MAX -+ ? &bch_dev_bkey_exists(c, dev)->rebalance_work -+ : &c->rebalance.work_unknown_dev; -+ -+ if (atomic64_add_return(k.k->size, counter) == k.k->size) -+ rebalance_wakeup(c); -+} -+ -+static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg, -+ struct bkey_s_c k, -+ struct bch_io_opts *io_opts, -+ struct data_opts *data_opts) -+{ -+ if (__bch2_rebalance_pred(c, k, io_opts) >= 0) { -+ data_opts->target = io_opts->background_target; -+ data_opts->btree_insert_flags = 0; -+ return DATA_ADD_REPLICAS; -+ } else { -+ return DATA_SKIP; -+ } -+} -+ -+void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors) -+{ -+ if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) == -+ sectors) -+ rebalance_wakeup(c); -+} -+ -+struct rebalance_work { -+ int dev_most_full_idx; -+ unsigned dev_most_full_percent; -+ u64 dev_most_full_work; -+ u64 dev_most_full_capacity; -+ u64 total_work; -+}; -+ -+static void rebalance_work_accumulate(struct rebalance_work *w, -+ u64 dev_work, u64 unknown_dev, u64 capacity, int idx) -+{ -+ unsigned percent_full; -+ u64 work = dev_work + unknown_dev; -+ -+ if (work < dev_work || work < unknown_dev) -+ work = U64_MAX; -+ work = min(work, capacity); -+ -+ percent_full = 
div64_u64(work * 100, capacity); -+ -+ if (percent_full >= w->dev_most_full_percent) { -+ w->dev_most_full_idx = idx; -+ w->dev_most_full_percent = percent_full; -+ w->dev_most_full_work = work; -+ w->dev_most_full_capacity = capacity; -+ } -+ -+ if (w->total_work + dev_work >= w->total_work && -+ w->total_work + dev_work >= dev_work) -+ w->total_work += dev_work; -+} -+ -+static struct rebalance_work rebalance_work(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ struct rebalance_work ret = { .dev_most_full_idx = -1 }; -+ u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev); -+ unsigned i; -+ -+ for_each_online_member(ca, c, i) -+ rebalance_work_accumulate(&ret, -+ atomic64_read(&ca->rebalance_work), -+ unknown_dev, -+ bucket_to_sector(ca, ca->mi.nbuckets - -+ ca->mi.first_bucket), -+ i); -+ -+ rebalance_work_accumulate(&ret, -+ unknown_dev, 0, c->capacity, -1); -+ -+ return ret; -+} -+ -+static void rebalance_work_reset(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ unsigned i; -+ -+ for_each_online_member(ca, c, i) -+ atomic64_set(&ca->rebalance_work, 0); -+ -+ atomic64_set(&c->rebalance.work_unknown_dev, 0); -+} -+ -+static unsigned long curr_cputime(void) -+{ -+ u64 utime, stime; -+ -+ task_cputime_adjusted(current, &utime, &stime); -+ return nsecs_to_jiffies(utime + stime); -+} -+ -+static int bch2_rebalance_thread(void *arg) -+{ -+ struct bch_fs *c = arg; -+ struct bch_fs_rebalance *r = &c->rebalance; -+ struct io_clock *clock = &c->io_clock[WRITE]; -+ struct rebalance_work w, p; -+ unsigned long start, prev_start; -+ unsigned long prev_run_time, prev_run_cputime; -+ unsigned long cputime, prev_cputime; -+ unsigned long io_start; -+ long throttle; -+ -+ set_freezable(); -+ -+ io_start = atomic_long_read(&clock->now); -+ p = rebalance_work(c); -+ prev_start = jiffies; -+ prev_cputime = curr_cputime(); -+ -+ while (!kthread_wait_freezable(r->enabled)) { -+ cond_resched(); -+ -+ start = jiffies; -+ cputime = curr_cputime(); -+ -+ prev_run_time = 
start - prev_start; -+ prev_run_cputime = cputime - prev_cputime; -+ -+ w = rebalance_work(c); -+ BUG_ON(!w.dev_most_full_capacity); -+ -+ if (!w.total_work) { -+ r->state = REBALANCE_WAITING; -+ kthread_wait_freezable(rebalance_work(c).total_work); -+ continue; -+ } -+ -+ /* -+ * If there isn't much work to do, throttle cpu usage: -+ */ -+ throttle = prev_run_cputime * 100 / -+ max(1U, w.dev_most_full_percent) - -+ prev_run_time; -+ -+ if (w.dev_most_full_percent < 20 && throttle > 0) { -+ r->throttled_until_iotime = io_start + -+ div_u64(w.dev_most_full_capacity * -+ (20 - w.dev_most_full_percent), -+ 50); -+ -+ if (atomic_long_read(&clock->now) + clock->max_slop < -+ r->throttled_until_iotime) { -+ r->throttled_until_cputime = start + throttle; -+ r->state = REBALANCE_THROTTLED; -+ -+ bch2_kthread_io_clock_wait(clock, -+ r->throttled_until_iotime, -+ throttle); -+ continue; -+ } -+ } -+ -+ /* minimum 1 mb/sec: */ -+ r->pd.rate.rate = -+ max_t(u64, 1 << 11, -+ r->pd.rate.rate * -+ max(p.dev_most_full_percent, 1U) / -+ max(w.dev_most_full_percent, 1U)); -+ -+ io_start = atomic_long_read(&clock->now); -+ p = w; -+ prev_start = start; -+ prev_cputime = cputime; -+ -+ r->state = REBALANCE_RUNNING; -+ memset(&r->move_stats, 0, sizeof(r->move_stats)); -+ rebalance_work_reset(c); -+ -+ bch2_move_data(c, -+ /* ratelimiting disabled for now */ -+ NULL, /* &r->pd.rate, */ -+ writepoint_ptr(&c->rebalance_write_point), -+ POS_MIN, POS_MAX, -+ rebalance_pred, NULL, -+ &r->move_stats); -+ } -+ -+ return 0; -+} -+ -+ssize_t bch2_rebalance_work_show(struct bch_fs *c, char *buf) -+{ -+ struct printbuf out = _PBUF(buf, PAGE_SIZE); -+ struct bch_fs_rebalance *r = &c->rebalance; -+ struct rebalance_work w = rebalance_work(c); -+ char h1[21], h2[21]; -+ -+ bch2_hprint(&PBUF(h1), w.dev_most_full_work << 9); -+ bch2_hprint(&PBUF(h2), w.dev_most_full_capacity << 9); -+ pr_buf(&out, "fullest_dev (%i):\t%s/%s\n", -+ w.dev_most_full_idx, h1, h2); -+ -+ bch2_hprint(&PBUF(h1), w.total_work 
<< 9); -+ bch2_hprint(&PBUF(h2), c->capacity << 9); -+ pr_buf(&out, "total work:\t\t%s/%s\n", h1, h2); -+ -+ pr_buf(&out, "rate:\t\t\t%u\n", r->pd.rate.rate); -+ -+ switch (r->state) { -+ case REBALANCE_WAITING: -+ pr_buf(&out, "waiting\n"); -+ break; -+ case REBALANCE_THROTTLED: -+ bch2_hprint(&PBUF(h1), -+ (r->throttled_until_iotime - -+ atomic_long_read(&c->io_clock[WRITE].now)) << 9); -+ pr_buf(&out, "throttled for %lu sec or %s io\n", -+ (r->throttled_until_cputime - jiffies) / HZ, -+ h1); -+ break; -+ case REBALANCE_RUNNING: -+ pr_buf(&out, "running\n"); -+ pr_buf(&out, "pos %llu:%llu\n", -+ r->move_stats.pos.inode, -+ r->move_stats.pos.offset); -+ break; -+ } -+ -+ return out.pos - buf; -+} -+ -+void bch2_rebalance_stop(struct bch_fs *c) -+{ -+ struct task_struct *p; -+ -+ c->rebalance.pd.rate.rate = UINT_MAX; -+ bch2_ratelimit_reset(&c->rebalance.pd.rate); -+ -+ p = rcu_dereference_protected(c->rebalance.thread, 1); -+ c->rebalance.thread = NULL; -+ -+ if (p) { -+ /* for sychronizing with rebalance_wakeup() */ -+ synchronize_rcu(); -+ -+ kthread_stop(p); -+ put_task_struct(p); -+ } -+} -+ -+int bch2_rebalance_start(struct bch_fs *c) -+{ -+ struct task_struct *p; -+ -+ if (c->opts.nochanges) -+ return 0; -+ -+ p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance"); -+ if (IS_ERR(p)) -+ return PTR_ERR(p); -+ -+ get_task_struct(p); -+ rcu_assign_pointer(c->rebalance.thread, p); -+ wake_up_process(p); -+ return 0; -+} -+ -+void bch2_fs_rebalance_init(struct bch_fs *c) -+{ -+ bch2_pd_controller_init(&c->rebalance.pd); -+ -+ atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX); -+} -diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h -new file mode 100644 -index 000000000000..99e2a1fb6084 ---- /dev/null -+++ b/fs/bcachefs/rebalance.h -@@ -0,0 +1,28 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_REBALANCE_H -+#define _BCACHEFS_REBALANCE_H -+ -+#include "rebalance_types.h" -+ -+static inline void rebalance_wakeup(struct bch_fs 
*c) -+{ -+ struct task_struct *p; -+ -+ rcu_read_lock(); -+ p = rcu_dereference(c->rebalance.thread); -+ if (p) -+ wake_up_process(p); -+ rcu_read_unlock(); -+} -+ -+void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c, -+ struct bch_io_opts *); -+void bch2_rebalance_add_work(struct bch_fs *, u64); -+ -+ssize_t bch2_rebalance_work_show(struct bch_fs *, char *); -+ -+void bch2_rebalance_stop(struct bch_fs *); -+int bch2_rebalance_start(struct bch_fs *); -+void bch2_fs_rebalance_init(struct bch_fs *); -+ -+#endif /* _BCACHEFS_REBALANCE_H */ -diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h -new file mode 100644 -index 000000000000..192c6be20ced ---- /dev/null -+++ b/fs/bcachefs/rebalance_types.h -@@ -0,0 +1,27 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_REBALANCE_TYPES_H -+#define _BCACHEFS_REBALANCE_TYPES_H -+ -+#include "move_types.h" -+ -+enum rebalance_state { -+ REBALANCE_WAITING, -+ REBALANCE_THROTTLED, -+ REBALANCE_RUNNING, -+}; -+ -+struct bch_fs_rebalance { -+ struct task_struct __rcu *thread; -+ struct bch_pd_controller pd; -+ -+ atomic64_t work_unknown_dev; -+ -+ enum rebalance_state state; -+ unsigned long throttled_until_iotime; -+ unsigned long throttled_until_cputime; -+ struct bch_move_stats move_stats; -+ -+ unsigned enabled:1; -+}; -+ -+#endif /* _BCACHEFS_REBALANCE_TYPES_H */ -diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c -new file mode 100644 -index 000000000000..41b864dcdc39 ---- /dev/null -+++ b/fs/bcachefs/recovery.c -@@ -0,0 +1,1317 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "alloc_background.h" -+#include "btree_gc.h" -+#include "btree_update.h" -+#include "btree_update_interior.h" -+#include "btree_io.h" -+#include "buckets.h" -+#include "dirent.h" -+#include "ec.h" -+#include "error.h" -+#include "fs-common.h" -+#include "fsck.h" -+#include "journal_io.h" -+#include "journal_reclaim.h" -+#include "journal_seq_blacklist.h" -+#include 
"quota.h" -+#include "recovery.h" -+#include "replicas.h" -+#include "super-io.h" -+ -+#include -+#include -+ -+#define QSTR(n) { { { .len = strlen(n) } }, .name = n } -+ -+/* iterate over keys read from the journal: */ -+ -+static struct journal_key *journal_key_search(struct journal_keys *journal_keys, -+ enum btree_id id, unsigned level, -+ struct bpos pos) -+{ -+ size_t l = 0, r = journal_keys->nr, m; -+ -+ while (l < r) { -+ m = l + ((r - l) >> 1); -+ if ((cmp_int(id, journal_keys->d[m].btree_id) ?: -+ cmp_int(level, journal_keys->d[m].level) ?: -+ bkey_cmp(pos, journal_keys->d[m].k->k.p)) > 0) -+ l = m + 1; -+ else -+ r = m; -+ } -+ -+ BUG_ON(l < journal_keys->nr && -+ (cmp_int(id, journal_keys->d[l].btree_id) ?: -+ cmp_int(level, journal_keys->d[l].level) ?: -+ bkey_cmp(pos, journal_keys->d[l].k->k.p)) > 0); -+ -+ BUG_ON(l && -+ (cmp_int(id, journal_keys->d[l - 1].btree_id) ?: -+ cmp_int(level, journal_keys->d[l - 1].level) ?: -+ bkey_cmp(pos, journal_keys->d[l - 1].k->k.p)) <= 0); -+ -+ return l < journal_keys->nr ? journal_keys->d + l : NULL; -+} -+ -+static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter) -+{ -+ if (iter->k && -+ iter->k < iter->keys->d + iter->keys->nr && -+ iter->k->btree_id == iter->btree_id && -+ iter->k->level == iter->level) -+ return iter->k->k; -+ -+ iter->k = NULL; -+ return NULL; -+} -+ -+static void bch2_journal_iter_advance(struct journal_iter *iter) -+{ -+ if (iter->k) -+ iter->k++; -+} -+ -+static void bch2_journal_iter_init(struct journal_iter *iter, -+ struct journal_keys *journal_keys, -+ enum btree_id id, unsigned level, -+ struct bpos pos) -+{ -+ iter->btree_id = id; -+ iter->level = level; -+ iter->keys = journal_keys; -+ iter->k = journal_key_search(journal_keys, id, level, pos); -+} -+ -+static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) -+{ -+ return iter->btree -+ ? 
bch2_btree_iter_peek(iter->btree) -+ : bch2_btree_node_iter_peek_unpack(&iter->node_iter, -+ iter->b, &iter->unpacked); -+} -+ -+static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter) -+{ -+ if (iter->btree) -+ bch2_btree_iter_next(iter->btree); -+ else -+ bch2_btree_node_iter_advance(&iter->node_iter, iter->b); -+} -+ -+void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) -+{ -+ switch (iter->last) { -+ case none: -+ break; -+ case btree: -+ bch2_journal_iter_advance_btree(iter); -+ break; -+ case journal: -+ bch2_journal_iter_advance(&iter->journal); -+ break; -+ } -+ -+ iter->last = none; -+} -+ -+struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) -+{ -+ struct bkey_s_c ret; -+ -+ while (1) { -+ struct bkey_s_c btree_k = -+ bch2_journal_iter_peek_btree(iter); -+ struct bkey_s_c journal_k = -+ bkey_i_to_s_c(bch2_journal_iter_peek(&iter->journal)); -+ -+ if (btree_k.k && journal_k.k) { -+ int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p); -+ -+ if (!cmp) -+ bch2_journal_iter_advance_btree(iter); -+ -+ iter->last = cmp < 0 ? btree : journal; -+ } else if (btree_k.k) { -+ iter->last = btree; -+ } else if (journal_k.k) { -+ iter->last = journal; -+ } else { -+ iter->last = none; -+ return bkey_s_c_null; -+ } -+ -+ ret = iter->last == journal ? 
journal_k : btree_k; -+ -+ if (iter->b && -+ bkey_cmp(ret.k->p, iter->b->data->max_key) > 0) { -+ iter->journal.k = NULL; -+ iter->last = none; -+ return bkey_s_c_null; -+ } -+ -+ if (!bkey_deleted(ret.k)) -+ break; -+ -+ bch2_btree_and_journal_iter_advance(iter); -+ } -+ -+ return ret; -+} -+ -+struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *iter) -+{ -+ bch2_btree_and_journal_iter_advance(iter); -+ -+ return bch2_btree_and_journal_iter_peek(iter); -+} -+ -+void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *iter, -+ struct btree_trans *trans, -+ struct journal_keys *journal_keys, -+ enum btree_id id, struct bpos pos) -+{ -+ memset(iter, 0, sizeof(*iter)); -+ -+ iter->btree = bch2_trans_get_iter(trans, id, pos, 0); -+ bch2_journal_iter_init(&iter->journal, journal_keys, id, 0, pos); -+} -+ -+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, -+ struct journal_keys *journal_keys, -+ struct btree *b) -+{ -+ memset(iter, 0, sizeof(*iter)); -+ -+ iter->b = b; -+ bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b); -+ bch2_journal_iter_init(&iter->journal, journal_keys, -+ b->c.btree_id, b->c.level, b->data->min_key); -+} -+ -+/* Walk btree, overlaying keys from the journal: */ -+ -+static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b, -+ struct journal_keys *journal_keys, -+ enum btree_id btree_id, -+ btree_walk_node_fn node_fn, -+ btree_walk_key_fn key_fn) -+{ -+ struct btree_and_journal_iter iter; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b); -+ -+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { -+ ret = key_fn(c, btree_id, b->c.level, k); -+ if (ret) -+ break; -+ -+ if (b->c.level) { -+ struct btree *child; -+ BKEY_PADDED(k) tmp; -+ -+ bkey_reassemble(&tmp.k, k); -+ k = bkey_i_to_s_c(&tmp.k); -+ -+ bch2_btree_and_journal_iter_advance(&iter); -+ -+ if (b->c.level > 0) { 
-+ child = bch2_btree_node_get_noiter(c, &tmp.k, -+ b->c.btree_id, b->c.level - 1); -+ ret = PTR_ERR_OR_ZERO(child); -+ if (ret) -+ break; -+ -+ ret = (node_fn ? node_fn(c, b) : 0) ?: -+ bch2_btree_and_journal_walk_recurse(c, child, -+ journal_keys, btree_id, node_fn, key_fn); -+ six_unlock_read(&child->c.lock); -+ -+ if (ret) -+ break; -+ } -+ } else { -+ bch2_btree_and_journal_iter_advance(&iter); -+ } -+ } -+ -+ return ret; -+} -+ -+int bch2_btree_and_journal_walk(struct bch_fs *c, struct journal_keys *journal_keys, -+ enum btree_id btree_id, -+ btree_walk_node_fn node_fn, -+ btree_walk_key_fn key_fn) -+{ -+ struct btree *b = c->btree_roots[btree_id].b; -+ int ret = 0; -+ -+ if (btree_node_fake(b)) -+ return 0; -+ -+ six_lock_read(&b->c.lock, NULL, NULL); -+ ret = (node_fn ? node_fn(c, b) : 0) ?: -+ bch2_btree_and_journal_walk_recurse(c, b, journal_keys, btree_id, -+ node_fn, key_fn) ?: -+ key_fn(c, btree_id, b->c.level + 1, bkey_i_to_s_c(&b->key)); -+ six_unlock_read(&b->c.lock); -+ -+ return ret; -+} -+ -+/* sort and dedup all keys in the journal: */ -+ -+void bch2_journal_entries_free(struct list_head *list) -+{ -+ -+ while (!list_empty(list)) { -+ struct journal_replay *i = -+ list_first_entry(list, struct journal_replay, list); -+ list_del(&i->list); -+ kvpfree(i, offsetof(struct journal_replay, j) + -+ vstruct_bytes(&i->j)); -+ } -+} -+ -+/* -+ * When keys compare equal, oldest compares first: -+ */ -+static int journal_sort_key_cmp(const void *_l, const void *_r) -+{ -+ const struct journal_key *l = _l; -+ const struct journal_key *r = _r; -+ -+ return cmp_int(l->btree_id, r->btree_id) ?: -+ cmp_int(l->level, r->level) ?: -+ bkey_cmp(l->k->k.p, r->k->k.p) ?: -+ cmp_int(l->journal_seq, r->journal_seq) ?: -+ cmp_int(l->journal_offset, r->journal_offset); -+} -+ -+void bch2_journal_keys_free(struct journal_keys *keys) -+{ -+ kvfree(keys->d); -+ keys->d = NULL; -+ keys->nr = 0; -+} -+ -+static struct journal_keys journal_keys_sort(struct list_head 
*journal_entries) -+{ -+ struct journal_replay *p; -+ struct jset_entry *entry; -+ struct bkey_i *k, *_n; -+ struct journal_keys keys = { NULL }; -+ struct journal_key *src, *dst; -+ size_t nr_keys = 0; -+ -+ if (list_empty(journal_entries)) -+ return keys; -+ -+ keys.journal_seq_base = -+ le64_to_cpu(list_last_entry(journal_entries, -+ struct journal_replay, list)->j.last_seq); -+ -+ list_for_each_entry(p, journal_entries, list) { -+ if (le64_to_cpu(p->j.seq) < keys.journal_seq_base) -+ continue; -+ -+ for_each_jset_key(k, _n, entry, &p->j) -+ nr_keys++; -+ } -+ -+ -+ keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL); -+ if (!keys.d) -+ goto err; -+ -+ list_for_each_entry(p, journal_entries, list) { -+ if (le64_to_cpu(p->j.seq) < keys.journal_seq_base) -+ continue; -+ -+ for_each_jset_key(k, _n, entry, &p->j) -+ keys.d[keys.nr++] = (struct journal_key) { -+ .btree_id = entry->btree_id, -+ .level = entry->level, -+ .k = k, -+ .journal_seq = le64_to_cpu(p->j.seq) - -+ keys.journal_seq_base, -+ .journal_offset = k->_data - p->j._data, -+ }; -+ } -+ -+ sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_key_cmp, NULL); -+ -+ src = dst = keys.d; -+ while (src < keys.d + keys.nr) { -+ while (src + 1 < keys.d + keys.nr && -+ src[0].btree_id == src[1].btree_id && -+ src[0].level == src[1].level && -+ !bkey_cmp(src[0].k->k.p, src[1].k->k.p)) -+ src++; -+ -+ *dst++ = *src++; -+ } -+ -+ keys.nr = dst - keys.d; -+err: -+ return keys; -+} -+ -+/* journal replay: */ -+ -+static void replay_now_at(struct journal *j, u64 seq) -+{ -+ BUG_ON(seq < j->replay_journal_seq); -+ BUG_ON(seq > j->replay_journal_seq_end); -+ -+ while (j->replay_journal_seq < seq) -+ bch2_journal_pin_put(j, j->replay_journal_seq++); -+} -+ -+static int bch2_extent_replay_key(struct bch_fs *c, enum btree_id btree_id, -+ struct bkey_i *k) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter, *split_iter; -+ /* -+ * We might cause compressed extents to be split, so we need to pass in -+ * a 
disk_reservation: -+ */ -+ struct disk_reservation disk_res = -+ bch2_disk_reservation_init(c, 0); -+ struct bkey_i *split; -+ struct bpos atomic_end; -+ /* -+ * Some extents aren't equivalent - w.r.t. what the triggers do -+ * - if they're split: -+ */ -+ bool remark_if_split = bch2_bkey_sectors_compressed(bkey_i_to_s_c(k)) || -+ k->k.type == KEY_TYPE_reflink_p; -+ bool remark = false; -+ int ret; -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+retry: -+ bch2_trans_begin(&trans); -+ -+ iter = bch2_trans_get_iter(&trans, btree_id, -+ bkey_start_pos(&k->k), -+ BTREE_ITER_INTENT); -+ -+ do { -+ ret = bch2_btree_iter_traverse(iter); -+ if (ret) -+ goto err; -+ -+ atomic_end = bpos_min(k->k.p, iter->l[0].b->key.k.p); -+ -+ split = bch2_trans_kmalloc(&trans, bkey_bytes(&k->k)); -+ ret = PTR_ERR_OR_ZERO(split); -+ if (ret) -+ goto err; -+ -+ if (!remark && -+ remark_if_split && -+ bkey_cmp(atomic_end, k->k.p) < 0) { -+ ret = bch2_disk_reservation_add(c, &disk_res, -+ k->k.size * -+ bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(k)), -+ BCH_DISK_RESERVATION_NOFAIL); -+ BUG_ON(ret); -+ -+ remark = true; -+ } -+ -+ bkey_copy(split, k); -+ bch2_cut_front(iter->pos, split); -+ bch2_cut_back(atomic_end, split); -+ -+ split_iter = bch2_trans_copy_iter(&trans, iter); -+ ret = PTR_ERR_OR_ZERO(split_iter); -+ if (ret) -+ goto err; -+ -+ /* -+ * It's important that we don't go through the -+ * extent_handle_overwrites() and extent_update_to_keys() path -+ * here: journal replay is supposed to treat extents like -+ * regular keys -+ */ -+ __bch2_btree_iter_set_pos(split_iter, split->k.p, false); -+ bch2_trans_update(&trans, split_iter, split, !remark -+ ? 
BTREE_TRIGGER_NORUN -+ : BTREE_TRIGGER_NOOVERWRITES); -+ -+ bch2_btree_iter_set_pos(iter, split->k.p); -+ } while (bkey_cmp(iter->pos, k->k.p) < 0); -+ -+ if (remark) { -+ ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(k), -+ 0, -((s64) k->k.size), -+ BTREE_TRIGGER_OVERWRITE); -+ if (ret) -+ goto err; -+ } -+ -+ ret = bch2_trans_commit(&trans, &disk_res, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW| -+ BTREE_INSERT_JOURNAL_REPLAY); -+err: -+ if (ret == -EINTR) -+ goto retry; -+ -+ bch2_disk_reservation_put(c, &disk_res); -+ -+ return bch2_trans_exit(&trans) ?: ret; -+} -+ -+static int __bch2_journal_replay_key(struct btree_trans *trans, -+ enum btree_id id, unsigned level, -+ struct bkey_i *k) -+{ -+ struct btree_iter *iter; -+ int ret; -+ -+ iter = bch2_trans_get_node_iter(trans, id, k->k.p, -+ BTREE_MAX_DEPTH, level, -+ BTREE_ITER_INTENT); -+ if (IS_ERR(iter)) -+ return PTR_ERR(iter); -+ -+ /* -+ * iter->flags & BTREE_ITER_IS_EXTENTS triggers the update path to run -+ * extent_handle_overwrites() and extent_update_to_keys() - but we don't -+ * want that here, journal replay is supposed to treat extents like -+ * regular keys: -+ */ -+ __bch2_btree_iter_set_pos(iter, k->k.p, false); -+ -+ ret = bch2_btree_iter_traverse(iter) ?: -+ bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id id, -+ unsigned level, struct bkey_i *k) -+{ -+ return bch2_trans_do(c, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW| -+ BTREE_INSERT_JOURNAL_REPLAY, -+ __bch2_journal_replay_key(&trans, id, level, k)); -+} -+ -+static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k) -+{ -+ struct btree_iter *iter; -+ int ret; -+ -+ iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, k->k.p, -+ BTREE_ITER_CACHED| -+ BTREE_ITER_CACHED_NOFILL| -+ BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(iter) ?: -+ bch2_trans_update(trans, 
iter, k, BTREE_TRIGGER_NORUN); -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+static int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k) -+{ -+ return bch2_trans_do(c, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_USE_RESERVE| -+ BTREE_INSERT_LAZY_RW| -+ BTREE_INSERT_JOURNAL_REPLAY, -+ __bch2_alloc_replay_key(&trans, k)); -+} -+ -+static int journal_sort_seq_cmp(const void *_l, const void *_r) -+{ -+ const struct journal_key *l = _l; -+ const struct journal_key *r = _r; -+ -+ return cmp_int(r->level, l->level) ?: -+ cmp_int(l->journal_seq, r->journal_seq) ?: -+ cmp_int(l->btree_id, r->btree_id) ?: -+ bkey_cmp(l->k->k.p, r->k->k.p); -+} -+ -+static int bch2_journal_replay(struct bch_fs *c, -+ struct journal_keys keys) -+{ -+ struct journal *j = &c->journal; -+ struct journal_key *i; -+ u64 seq; -+ int ret; -+ -+ sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL); -+ -+ if (keys.nr) -+ replay_now_at(j, keys.journal_seq_base); -+ -+ seq = j->replay_journal_seq; -+ -+ /* -+ * First replay updates to the alloc btree - these will only update the -+ * btree key cache: -+ */ -+ for_each_journal_key(keys, i) { -+ cond_resched(); -+ -+ if (!i->level && i->btree_id == BTREE_ID_ALLOC) { -+ j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; -+ ret = bch2_alloc_replay_key(c, i->k); -+ if (ret) -+ goto err; -+ } -+ } -+ -+ /* -+ * Next replay updates to interior btree nodes: -+ */ -+ for_each_journal_key(keys, i) { -+ cond_resched(); -+ -+ if (i->level) { -+ j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; -+ ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k); -+ if (ret) -+ goto err; -+ } -+ } -+ -+ /* -+ * Now that the btree is in a consistent state, we can start journal -+ * reclaim (which will be flushing entries from the btree key cache back -+ * to the btree: -+ */ -+ set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); -+ set_bit(JOURNAL_RECLAIM_STARTED, &j->flags); -+ -+ 
j->replay_journal_seq = seq; -+ -+ /* -+ * Now replay leaf node updates: -+ */ -+ for_each_journal_key(keys, i) { -+ cond_resched(); -+ -+ if (i->level || i->btree_id == BTREE_ID_ALLOC) -+ continue; -+ -+ replay_now_at(j, keys.journal_seq_base + i->journal_seq); -+ -+ ret = i->k->k.size -+ ? bch2_extent_replay_key(c, i->btree_id, i->k) -+ : bch2_journal_replay_key(c, i->btree_id, i->level, i->k); -+ if (ret) -+ goto err; -+ } -+ -+ replay_now_at(j, j->replay_journal_seq_end); -+ j->replay_journal_seq = 0; -+ -+ bch2_journal_set_replay_done(j); -+ bch2_journal_flush_all_pins(j); -+ return bch2_journal_error(j); -+err: -+ bch_err(c, "journal replay: error %d while replaying key", ret); -+ return ret; -+} -+ -+static bool journal_empty(struct list_head *journal) -+{ -+ return list_empty(journal) || -+ journal_entry_empty(&list_last_entry(journal, -+ struct journal_replay, list)->j); -+} -+ -+static int -+verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c, -+ struct list_head *journal) -+{ -+ struct journal_replay *i = -+ list_last_entry(journal, struct journal_replay, list); -+ u64 start_seq = le64_to_cpu(i->j.last_seq); -+ u64 end_seq = le64_to_cpu(i->j.seq); -+ u64 seq = start_seq; -+ int ret = 0; -+ -+ list_for_each_entry(i, journal, list) { -+ if (le64_to_cpu(i->j.seq) < start_seq) -+ continue; -+ -+ fsck_err_on(seq != le64_to_cpu(i->j.seq), c, -+ "journal entries %llu-%llu missing! 
(replaying %llu-%llu)", -+ seq, le64_to_cpu(i->j.seq) - 1, -+ start_seq, end_seq); -+ -+ seq = le64_to_cpu(i->j.seq); -+ -+ fsck_err_on(bch2_journal_seq_is_blacklisted(c, seq, false), c, -+ "found blacklisted journal entry %llu", seq); -+ -+ do { -+ seq++; -+ } while (bch2_journal_seq_is_blacklisted(c, seq, false)); -+ } -+fsck_err: -+ return ret; -+} -+ -+/* journal replay early: */ -+ -+static int journal_replay_entry_early(struct bch_fs *c, -+ struct jset_entry *entry) -+{ -+ int ret = 0; -+ -+ switch (entry->type) { -+ case BCH_JSET_ENTRY_btree_root: { -+ struct btree_root *r; -+ -+ if (entry->btree_id >= BTREE_ID_NR) { -+ bch_err(c, "filesystem has unknown btree type %u", -+ entry->btree_id); -+ return -EINVAL; -+ } -+ -+ r = &c->btree_roots[entry->btree_id]; -+ -+ if (entry->u64s) { -+ r->level = entry->level; -+ bkey_copy(&r->key, &entry->start[0]); -+ r->error = 0; -+ } else { -+ r->error = -EIO; -+ } -+ r->alive = true; -+ break; -+ } -+ case BCH_JSET_ENTRY_usage: { -+ struct jset_entry_usage *u = -+ container_of(entry, struct jset_entry_usage, entry); -+ -+ switch (entry->btree_id) { -+ case FS_USAGE_RESERVED: -+ if (entry->level < BCH_REPLICAS_MAX) -+ c->usage_base->persistent_reserved[entry->level] = -+ le64_to_cpu(u->v); -+ break; -+ case FS_USAGE_INODES: -+ c->usage_base->nr_inodes = le64_to_cpu(u->v); -+ break; -+ case FS_USAGE_KEY_VERSION: -+ atomic64_set(&c->key_version, -+ le64_to_cpu(u->v)); -+ break; -+ } -+ -+ break; -+ } -+ case BCH_JSET_ENTRY_data_usage: { -+ struct jset_entry_data_usage *u = -+ container_of(entry, struct jset_entry_data_usage, entry); -+ ret = bch2_replicas_set_usage(c, &u->r, -+ le64_to_cpu(u->v)); -+ break; -+ } -+ case BCH_JSET_ENTRY_blacklist: { -+ struct jset_entry_blacklist *bl_entry = -+ container_of(entry, struct jset_entry_blacklist, entry); -+ -+ ret = bch2_journal_seq_blacklist_add(c, -+ le64_to_cpu(bl_entry->seq), -+ le64_to_cpu(bl_entry->seq) + 1); -+ break; -+ } -+ case BCH_JSET_ENTRY_blacklist_v2: { -+ struct 
jset_entry_blacklist_v2 *bl_entry = -+ container_of(entry, struct jset_entry_blacklist_v2, entry); -+ -+ ret = bch2_journal_seq_blacklist_add(c, -+ le64_to_cpu(bl_entry->start), -+ le64_to_cpu(bl_entry->end) + 1); -+ break; -+ } -+ } -+ -+ return ret; -+} -+ -+static int journal_replay_early(struct bch_fs *c, -+ struct bch_sb_field_clean *clean, -+ struct list_head *journal) -+{ -+ struct jset_entry *entry; -+ int ret; -+ -+ if (clean) { -+ c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock); -+ c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock); -+ -+ for (entry = clean->start; -+ entry != vstruct_end(&clean->field); -+ entry = vstruct_next(entry)) { -+ ret = journal_replay_entry_early(c, entry); -+ if (ret) -+ return ret; -+ } -+ } else { -+ struct journal_replay *i = -+ list_last_entry(journal, struct journal_replay, list); -+ -+ c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock); -+ c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock); -+ -+ list_for_each_entry(i, journal, list) -+ vstruct_for_each(&i->j, entry) { -+ ret = journal_replay_entry_early(c, entry); -+ if (ret) -+ return ret; -+ } -+ } -+ -+ bch2_fs_usage_initialize(c); -+ -+ return 0; -+} -+ -+/* sb clean section: */ -+ -+static struct bkey_i *btree_root_find(struct bch_fs *c, -+ struct bch_sb_field_clean *clean, -+ struct jset *j, -+ enum btree_id id, unsigned *level) -+{ -+ struct bkey_i *k; -+ struct jset_entry *entry, *start, *end; -+ -+ if (clean) { -+ start = clean->start; -+ end = vstruct_end(&clean->field); -+ } else { -+ start = j->start; -+ end = vstruct_last(j); -+ } -+ -+ for (entry = start; entry < end; entry = vstruct_next(entry)) -+ if (entry->type == BCH_JSET_ENTRY_btree_root && -+ entry->btree_id == id) -+ goto found; -+ -+ return NULL; -+found: -+ if (!entry->u64s) -+ return ERR_PTR(-EINVAL); -+ -+ k = entry->start; -+ *level = entry->level; -+ return k; -+} -+ -+static int verify_superblock_clean(struct bch_fs *c, -+ struct bch_sb_field_clean 
**cleanp, -+ struct jset *j) -+{ -+ unsigned i; -+ struct bch_sb_field_clean *clean = *cleanp; -+ int ret = 0; -+ -+ if (!c->sb.clean || !j) -+ return 0; -+ -+ if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, -+ "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", -+ le64_to_cpu(clean->journal_seq), -+ le64_to_cpu(j->seq))) { -+ kfree(clean); -+ *cleanp = NULL; -+ return 0; -+ } -+ -+ mustfix_fsck_err_on(j->read_clock != clean->read_clock, c, -+ "superblock read clock doesn't match journal after clean shutdown"); -+ mustfix_fsck_err_on(j->write_clock != clean->write_clock, c, -+ "superblock read clock doesn't match journal after clean shutdown"); -+ -+ for (i = 0; i < BTREE_ID_NR; i++) { -+ char buf1[200], buf2[200]; -+ struct bkey_i *k1, *k2; -+ unsigned l1 = 0, l2 = 0; -+ -+ k1 = btree_root_find(c, clean, NULL, i, &l1); -+ k2 = btree_root_find(c, NULL, j, i, &l2); -+ -+ if (!k1 && !k2) -+ continue; -+ -+ mustfix_fsck_err_on(!k1 || !k2 || -+ IS_ERR(k1) || -+ IS_ERR(k2) || -+ k1->k.u64s != k2->k.u64s || -+ memcmp(k1, k2, bkey_bytes(k1)) || -+ l1 != l2, c, -+ "superblock btree root %u doesn't match journal after clean shutdown\n" -+ "sb: l=%u %s\n" -+ "journal: l=%u %s\n", i, -+ l1, (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(k1)), buf1), -+ l2, (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(k2)), buf2)); -+ } -+fsck_err: -+ return ret; -+} -+ -+static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c) -+{ -+ struct bch_sb_field_clean *clean, *sb_clean; -+ int ret; -+ -+ mutex_lock(&c->sb_lock); -+ sb_clean = bch2_sb_get_clean(c->disk_sb.sb); -+ -+ if (fsck_err_on(!sb_clean, c, -+ "superblock marked clean but clean section not present")) { -+ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); -+ c->sb.clean = false; -+ mutex_unlock(&c->sb_lock); -+ return NULL; -+ } -+ -+ clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), -+ GFP_KERNEL); -+ if (!clean) { -+ mutex_unlock(&c->sb_lock); -+ return 
ERR_PTR(-ENOMEM); -+ } -+ -+ if (le16_to_cpu(c->disk_sb.sb->version) < -+ bcachefs_metadata_version_bkey_renumber) -+ bch2_sb_clean_renumber(clean, READ); -+ -+ mutex_unlock(&c->sb_lock); -+ -+ return clean; -+fsck_err: -+ mutex_unlock(&c->sb_lock); -+ return ERR_PTR(ret); -+} -+ -+static int read_btree_roots(struct bch_fs *c) -+{ -+ unsigned i; -+ int ret = 0; -+ -+ for (i = 0; i < BTREE_ID_NR; i++) { -+ struct btree_root *r = &c->btree_roots[i]; -+ -+ if (!r->alive) -+ continue; -+ -+ if (i == BTREE_ID_ALLOC && -+ c->opts.reconstruct_alloc) { -+ c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); -+ continue; -+ } -+ -+ -+ if (r->error) { -+ __fsck_err(c, i == BTREE_ID_ALLOC -+ ? FSCK_CAN_IGNORE : 0, -+ "invalid btree root %s", -+ bch2_btree_ids[i]); -+ if (i == BTREE_ID_ALLOC) -+ c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); -+ } -+ -+ ret = bch2_btree_root_read(c, i, &r->key, r->level); -+ if (ret) { -+ __fsck_err(c, i == BTREE_ID_ALLOC -+ ? FSCK_CAN_IGNORE : 0, -+ "error reading btree root %s", -+ bch2_btree_ids[i]); -+ if (i == BTREE_ID_ALLOC) -+ c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); -+ } -+ } -+ -+ for (i = 0; i < BTREE_ID_NR; i++) -+ if (!c->btree_roots[i].b) -+ bch2_btree_root_alloc(c, i); -+fsck_err: -+ return ret; -+} -+ -+int bch2_fs_recovery(struct bch_fs *c) -+{ -+ const char *err = "cannot allocate memory"; -+ struct bch_sb_field_clean *clean = NULL; -+ u64 journal_seq; -+ bool wrote = false, write_sb = false; -+ int ret; -+ -+ if (c->sb.clean) -+ clean = read_superblock_clean(c); -+ ret = PTR_ERR_OR_ZERO(clean); -+ if (ret) -+ goto err; -+ -+ if (c->sb.clean) -+ bch_info(c, "recovering from clean shutdown, journal seq %llu", -+ le64_to_cpu(clean->journal_seq)); -+ -+ if (!c->replicas.entries) { -+ bch_info(c, "building replicas info"); -+ set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); -+ } -+ -+ if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) { -+ struct jset *j; -+ -+ ret = bch2_journal_read(c, 
&c->journal_entries); -+ if (ret) -+ goto err; -+ -+ if (mustfix_fsck_err_on(c->sb.clean && !journal_empty(&c->journal_entries), c, -+ "filesystem marked clean but journal not empty")) { -+ c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); -+ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); -+ c->sb.clean = false; -+ } -+ -+ if (!c->sb.clean && list_empty(&c->journal_entries)) { -+ bch_err(c, "no journal entries found"); -+ ret = BCH_FSCK_REPAIR_IMPOSSIBLE; -+ goto err; -+ } -+ -+ c->journal_keys = journal_keys_sort(&c->journal_entries); -+ if (!c->journal_keys.d) { -+ ret = -ENOMEM; -+ goto err; -+ } -+ -+ j = &list_last_entry(&c->journal_entries, -+ struct journal_replay, list)->j; -+ -+ ret = verify_superblock_clean(c, &clean, j); -+ if (ret) -+ goto err; -+ -+ journal_seq = le64_to_cpu(j->seq) + 1; -+ } else { -+ journal_seq = le64_to_cpu(clean->journal_seq) + 1; -+ } -+ -+ if (!c->sb.clean && -+ !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) { -+ bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix"); -+ ret = -EINVAL; -+ goto err; -+ } -+ -+ ret = journal_replay_early(c, clean, &c->journal_entries); -+ if (ret) -+ goto err; -+ -+ if (!c->sb.clean) { -+ ret = bch2_journal_seq_blacklist_add(c, -+ journal_seq, -+ journal_seq + 4); -+ if (ret) { -+ bch_err(c, "error creating new journal seq blacklist entry"); -+ goto err; -+ } -+ -+ journal_seq += 4; -+ } -+ -+ ret = bch2_blacklist_table_initialize(c); -+ -+ if (!list_empty(&c->journal_entries)) { -+ ret = verify_journal_entries_not_blacklisted_or_missing(c, -+ &c->journal_entries); -+ if (ret) -+ goto err; -+ } -+ -+ ret = bch2_fs_journal_start(&c->journal, journal_seq, -+ &c->journal_entries); -+ if (ret) -+ goto err; -+ -+ ret = read_btree_roots(c); -+ if (ret) -+ goto err; -+ -+ bch_verbose(c, "starting alloc read"); -+ err = "error reading allocation information"; -+ ret = bch2_alloc_read(c, &c->journal_keys); -+ if (ret) -+ goto err; -+ 
bch_verbose(c, "alloc read done"); -+ -+ bch_verbose(c, "starting stripes_read"); -+ err = "error reading stripes"; -+ ret = bch2_stripes_read(c, &c->journal_keys); -+ if (ret) -+ goto err; -+ bch_verbose(c, "stripes_read done"); -+ -+ set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); -+ -+ if ((c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) && -+ !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA))) { -+ /* -+ * interior btree node updates aren't consistent with the -+ * journal; after an unclean shutdown we have to walk all -+ * pointers to metadata: -+ */ -+ bch_info(c, "starting metadata mark and sweep"); -+ err = "error in mark and sweep"; -+ ret = bch2_gc(c, &c->journal_keys, true, true); -+ if (ret) -+ goto err; -+ bch_verbose(c, "mark and sweep done"); -+ } -+ -+ if (c->opts.fsck || -+ !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) || -+ test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { -+ bch_info(c, "starting mark and sweep"); -+ err = "error in mark and sweep"; -+ ret = bch2_gc(c, &c->journal_keys, true, false); -+ if (ret) -+ goto err; -+ bch_verbose(c, "mark and sweep done"); -+ } -+ -+ clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); -+ set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); -+ -+ /* -+ * Skip past versions that might have possibly been used (as nonces), -+ * but hadn't had their pointers written: -+ */ -+ if (c->sb.encryption_type && !c->sb.clean) -+ atomic64_add(1 << 16, &c->key_version); -+ -+ if (c->opts.norecovery) -+ goto out; -+ -+ bch_verbose(c, "starting journal replay"); -+ err = "journal replay failed"; -+ ret = bch2_journal_replay(c, c->journal_keys); -+ if (ret) -+ goto err; -+ bch_verbose(c, "journal replay done"); -+ -+ if (!c->opts.nochanges) { -+ /* -+ * note that even when filesystem was clean there might be work -+ * to do here, if we ran gc (because of fsck) which recalculated -+ * oldest_gen: -+ */ -+ bch_verbose(c, "writing allocation info"); -+ err = "error writing out alloc info"; -+ ret = bch2_stripes_write(c, 
BTREE_INSERT_LAZY_RW, &wrote) ?: -+ bch2_alloc_write(c, BTREE_INSERT_LAZY_RW, &wrote); -+ if (ret) { -+ bch_err(c, "error writing alloc info"); -+ goto err; -+ } -+ bch_verbose(c, "alloc write done"); -+ -+ set_bit(BCH_FS_ALLOC_WRITTEN, &c->flags); -+ } -+ -+ if (!c->sb.clean) { -+ if (!(c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { -+ bch_info(c, "checking inode link counts"); -+ err = "error in recovery"; -+ ret = bch2_fsck_inode_nlink(c); -+ if (ret) -+ goto err; -+ bch_verbose(c, "check inodes done"); -+ -+ } else { -+ bch_verbose(c, "checking for deleted inodes"); -+ err = "error in recovery"; -+ ret = bch2_fsck_walk_inodes_only(c); -+ if (ret) -+ goto err; -+ bch_verbose(c, "check inodes done"); -+ } -+ } -+ -+ if (c->opts.fsck) { -+ bch_info(c, "starting fsck"); -+ err = "error in fsck"; -+ ret = bch2_fsck_full(c); -+ if (ret) -+ goto err; -+ bch_verbose(c, "fsck done"); -+ } -+ -+ if (enabled_qtypes(c)) { -+ bch_verbose(c, "reading quotas"); -+ ret = bch2_fs_quota_read(c); -+ if (ret) -+ goto err; -+ bch_verbose(c, "quotas done"); -+ } -+ -+ mutex_lock(&c->sb_lock); -+ if (c->opts.version_upgrade) { -+ if (c->sb.version < bcachefs_metadata_version_new_versioning) -+ c->disk_sb.sb->version_min = -+ le16_to_cpu(bcachefs_metadata_version_min); -+ c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current); -+ c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; -+ write_sb = true; -+ } -+ -+ if (!test_bit(BCH_FS_ERROR, &c->flags)) { -+ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; -+ write_sb = true; -+ } -+ -+ if (c->opts.fsck && -+ !test_bit(BCH_FS_ERROR, &c->flags)) { -+ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; -+ SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0); -+ write_sb = true; -+ } -+ -+ if (write_sb) -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ if (c->journal_seq_blacklist_table && -+ c->journal_seq_blacklist_table->nr > 128) -+ queue_work(system_long_wq, 
&c->journal_seq_blacklist_gc_work); -+out: -+ ret = 0; -+err: -+fsck_err: -+ set_bit(BCH_FS_FSCK_DONE, &c->flags); -+ bch2_flush_fsck_errs(c); -+ -+ if (!c->opts.keep_journal) { -+ bch2_journal_keys_free(&c->journal_keys); -+ bch2_journal_entries_free(&c->journal_entries); -+ } -+ kfree(clean); -+ if (ret) -+ bch_err(c, "Error in recovery: %s (%i)", err, ret); -+ else -+ bch_verbose(c, "ret %i", ret); -+ return ret; -+} -+ -+int bch2_fs_initialize(struct bch_fs *c) -+{ -+ struct bch_inode_unpacked root_inode, lostfound_inode; -+ struct bkey_inode_buf packed_inode; -+ struct qstr lostfound = QSTR("lost+found"); -+ const char *err = "cannot allocate memory"; -+ struct bch_dev *ca; -+ LIST_HEAD(journal); -+ unsigned i; -+ int ret; -+ -+ bch_notice(c, "initializing new filesystem"); -+ -+ mutex_lock(&c->sb_lock); -+ for_each_online_member(ca, c, i) -+ bch2_mark_dev_superblock(c, ca, 0); -+ mutex_unlock(&c->sb_lock); -+ -+ mutex_lock(&c->sb_lock); -+ c->disk_sb.sb->version = c->disk_sb.sb->version_min = -+ le16_to_cpu(bcachefs_metadata_version_current); -+ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; -+ c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; -+ -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); -+ set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); -+ -+ for (i = 0; i < BTREE_ID_NR; i++) -+ bch2_btree_root_alloc(c, i); -+ -+ set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); -+ set_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags); -+ -+ err = "unable to allocate journal buckets"; -+ for_each_online_member(ca, c, i) { -+ ret = bch2_dev_journal_alloc(ca); -+ if (ret) { -+ percpu_ref_put(&ca->io_ref); -+ goto err; -+ } -+ } -+ -+ /* -+ * journal_res_get() will crash if called before this has -+ * set up the journal.pin FIFO and journal.cur pointer: -+ */ -+ bch2_fs_journal_start(&c->journal, 1, &journal); -+ bch2_journal_set_replay_done(&c->journal); -+ -+ bch2_inode_init(c, &root_inode, 0, 0, 
-+ S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); -+ root_inode.bi_inum = BCACHEFS_ROOT_INO; -+ bch2_inode_pack(&packed_inode, &root_inode); -+ -+ err = "error creating root directory"; -+ ret = bch2_btree_insert(c, BTREE_ID_INODES, -+ &packed_inode.inode.k_i, -+ NULL, NULL, BTREE_INSERT_LAZY_RW); -+ if (ret) -+ goto err; -+ -+ bch2_inode_init_early(c, &lostfound_inode); -+ -+ err = "error creating lost+found"; -+ ret = bch2_trans_do(c, NULL, NULL, 0, -+ bch2_create_trans(&trans, BCACHEFS_ROOT_INO, -+ &root_inode, &lostfound_inode, -+ &lostfound, -+ 0, 0, S_IFDIR|0700, 0, -+ NULL, NULL)); -+ if (ret) -+ goto err; -+ -+ if (enabled_qtypes(c)) { -+ ret = bch2_fs_quota_read(c); -+ if (ret) -+ goto err; -+ } -+ -+ err = "error writing first journal entry"; -+ ret = bch2_journal_meta(&c->journal); -+ if (ret) -+ goto err; -+ -+ mutex_lock(&c->sb_lock); -+ SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); -+ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); -+ -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ return 0; -+err: -+ pr_err("Error initializing new filesystem: %s (%i)", err, ret); -+ return ret; -+} -diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h -new file mode 100644 -index 000000000000..a66827c9addf ---- /dev/null -+++ b/fs/bcachefs/recovery.h -@@ -0,0 +1,60 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_RECOVERY_H -+#define _BCACHEFS_RECOVERY_H -+ -+#define for_each_journal_key(keys, i) \ -+ for (i = (keys).d; i < (keys).d + (keys).nr; (i)++) -+ -+struct journal_iter { -+ enum btree_id btree_id; -+ unsigned level; -+ struct journal_keys *keys; -+ struct journal_key *k; -+}; -+ -+/* -+ * Iterate over keys in the btree, with keys from the journal overlaid on top: -+ */ -+ -+struct btree_and_journal_iter { -+ struct btree_iter *btree; -+ -+ struct btree *b; -+ struct btree_node_iter node_iter; -+ struct bkey unpacked; -+ -+ struct journal_iter journal; -+ -+ enum last_key_returned { -+ none, -+ btree, -+ journal, -+ } last; -+}; -+ -+void 
bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); -+struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); -+struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *); -+ -+void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *, -+ struct btree_trans *, -+ struct journal_keys *, -+ enum btree_id, struct bpos); -+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, -+ struct journal_keys *, -+ struct btree *); -+ -+typedef int (*btree_walk_node_fn)(struct bch_fs *c, struct btree *b); -+typedef int (*btree_walk_key_fn)(struct bch_fs *c, enum btree_id id, -+ unsigned level, struct bkey_s_c k); -+ -+int bch2_btree_and_journal_walk(struct bch_fs *, struct journal_keys *, enum btree_id, -+ btree_walk_node_fn, btree_walk_key_fn); -+ -+void bch2_journal_keys_free(struct journal_keys *); -+void bch2_journal_entries_free(struct list_head *); -+ -+int bch2_fs_recovery(struct bch_fs *); -+int bch2_fs_initialize(struct bch_fs *); -+ -+#endif /* _BCACHEFS_RECOVERY_H */ -diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c -new file mode 100644 -index 000000000000..3c473f1380a6 ---- /dev/null -+++ b/fs/bcachefs/reflink.c -@@ -0,0 +1,303 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "bkey_on_stack.h" -+#include "btree_update.h" -+#include "extents.h" -+#include "inode.h" -+#include "io.h" -+#include "reflink.h" -+ -+#include -+ -+/* reflink pointers */ -+ -+const char *bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); -+ -+ if (bkey_val_bytes(p.k) != sizeof(*p.v)) -+ return "incorrect value size"; -+ -+ return NULL; -+} -+ -+void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); -+ -+ pr_buf(out, "idx %llu", le64_to_cpu(p.v->idx)); -+} -+ -+enum merge_result 
bch2_reflink_p_merge(struct bch_fs *c, -+ struct bkey_s _l, struct bkey_s _r) -+{ -+ struct bkey_s_reflink_p l = bkey_s_to_reflink_p(_l); -+ struct bkey_s_reflink_p r = bkey_s_to_reflink_p(_r); -+ -+ if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx)) -+ return BCH_MERGE_NOMERGE; -+ -+ if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { -+ bch2_key_resize(l.k, KEY_SIZE_MAX); -+ bch2_cut_front_s(l.k->p, _r); -+ return BCH_MERGE_PARTIAL; -+ } -+ -+ bch2_key_resize(l.k, l.k->size + r.k->size); -+ -+ return BCH_MERGE_MERGE; -+} -+ -+/* indirect extents */ -+ -+const char *bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); -+ -+ if (bkey_val_bytes(r.k) < sizeof(*r.v)) -+ return "incorrect value size"; -+ -+ return bch2_bkey_ptrs_invalid(c, k); -+} -+ -+void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); -+ -+ pr_buf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount)); -+ -+ bch2_bkey_ptrs_to_text(out, c, k); -+} -+ -+static int bch2_make_extent_indirect(struct btree_trans *trans, -+ struct btree_iter *extent_iter, -+ struct bkey_i_extent *e) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter *reflink_iter; -+ struct bkey_s_c k; -+ struct bkey_i_reflink_v *r_v; -+ struct bkey_i_reflink_p *r_p; -+ int ret; -+ -+ for_each_btree_key(trans, reflink_iter, BTREE_ID_REFLINK, -+ POS(0, c->reflink_hint), -+ BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) { -+ if (reflink_iter->pos.inode) { -+ bch2_btree_iter_set_pos(reflink_iter, POS_MIN); -+ continue; -+ } -+ -+ if (bkey_deleted(k.k) && e->k.size <= k.k->size) -+ break; -+ } -+ -+ if (ret) -+ goto err; -+ -+ /* rewind iter to start of hole, if necessary: */ -+ bch2_btree_iter_set_pos(reflink_iter, bkey_start_pos(k.k)); -+ -+ r_v = bch2_trans_kmalloc(trans, sizeof(*r_v) + bkey_val_bytes(&e->k)); -+ ret = PTR_ERR_OR_ZERO(r_v); -+ if (ret) -+ 
goto err; -+ -+ bkey_reflink_v_init(&r_v->k_i); -+ r_v->k.p = reflink_iter->pos; -+ bch2_key_resize(&r_v->k, e->k.size); -+ r_v->k.version = e->k.version; -+ -+ set_bkey_val_u64s(&r_v->k, bkey_val_u64s(&r_v->k) + -+ bkey_val_u64s(&e->k)); -+ r_v->v.refcount = 0; -+ memcpy(r_v->v.start, e->v.start, bkey_val_bytes(&e->k)); -+ -+ bch2_trans_update(trans, reflink_iter, &r_v->k_i, 0); -+ -+ r_p = bch2_trans_kmalloc(trans, sizeof(*r_p)); -+ if (IS_ERR(r_p)) -+ return PTR_ERR(r_p); -+ -+ e->k.type = KEY_TYPE_reflink_p; -+ r_p = bkey_i_to_reflink_p(&e->k_i); -+ set_bkey_val_bytes(&r_p->k, sizeof(r_p->v)); -+ r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); -+ -+ bch2_trans_update(trans, extent_iter, &r_p->k_i, 0); -+err: -+ if (!IS_ERR(reflink_iter)) -+ c->reflink_hint = reflink_iter->pos.offset; -+ bch2_trans_iter_put(trans, reflink_iter); -+ -+ return ret; -+} -+ -+static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) -+{ -+ struct bkey_s_c k = bch2_btree_iter_peek(iter); -+ int ret; -+ -+ for_each_btree_key_continue(iter, 0, k, ret) { -+ if (bkey_cmp(iter->pos, end) >= 0) -+ return bkey_s_c_null; -+ -+ if (k.k->type == KEY_TYPE_extent || -+ k.k->type == KEY_TYPE_reflink_p) -+ break; -+ } -+ -+ return k; -+} -+ -+s64 bch2_remap_range(struct bch_fs *c, -+ struct bpos dst_start, struct bpos src_start, -+ u64 remap_sectors, u64 *journal_seq, -+ u64 new_i_size, s64 *i_sectors_delta) -+{ -+ struct btree_trans trans; -+ struct btree_iter *dst_iter, *src_iter; -+ struct bkey_s_c src_k; -+ BKEY_PADDED(k) new_dst; -+ struct bkey_on_stack new_src; -+ struct bpos dst_end = dst_start, src_end = src_start; -+ struct bpos dst_want, src_want; -+ u64 src_done, dst_done; -+ int ret = 0, ret2 = 0; -+ -+ if (!c->opts.reflink) -+ return -EOPNOTSUPP; -+ -+ if (!percpu_ref_tryget(&c->writes)) -+ return -EROFS; -+ -+ bch2_check_set_feature(c, BCH_FEATURE_reflink); -+ -+ dst_end.offset += remap_sectors; -+ src_end.offset += remap_sectors; -+ -+ 
bkey_on_stack_init(&new_src); -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); -+ -+ src_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start, -+ BTREE_ITER_INTENT); -+ dst_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, dst_start, -+ BTREE_ITER_INTENT); -+ -+ while (1) { -+ bch2_trans_begin(&trans); -+ -+ trans.mem_top = 0; -+ -+ if (fatal_signal_pending(current)) { -+ ret = -EINTR; -+ goto err; -+ } -+ -+ src_k = get_next_src(src_iter, src_end); -+ ret = bkey_err(src_k); -+ if (ret) -+ goto btree_err; -+ -+ src_done = bpos_min(src_iter->pos, src_end).offset - -+ src_start.offset; -+ dst_want = POS(dst_start.inode, dst_start.offset + src_done); -+ -+ if (bkey_cmp(dst_iter->pos, dst_want) < 0) { -+ ret = bch2_fpunch_at(&trans, dst_iter, dst_want, -+ journal_seq, i_sectors_delta); -+ if (ret) -+ goto btree_err; -+ continue; -+ } -+ -+ BUG_ON(bkey_cmp(dst_iter->pos, dst_want)); -+ -+ if (!bkey_cmp(dst_iter->pos, dst_end)) -+ break; -+ -+ if (src_k.k->type == KEY_TYPE_extent) { -+ bkey_on_stack_reassemble(&new_src, c, src_k); -+ src_k = bkey_i_to_s_c(new_src.k); -+ -+ bch2_cut_front(src_iter->pos, new_src.k); -+ bch2_cut_back(src_end, new_src.k); -+ -+ ret = bch2_make_extent_indirect(&trans, src_iter, -+ bkey_i_to_extent(new_src.k)); -+ if (ret) -+ goto btree_err; -+ -+ BUG_ON(src_k.k->type != KEY_TYPE_reflink_p); -+ } -+ -+ if (src_k.k->type == KEY_TYPE_reflink_p) { -+ struct bkey_s_c_reflink_p src_p = -+ bkey_s_c_to_reflink_p(src_k); -+ struct bkey_i_reflink_p *dst_p = -+ bkey_reflink_p_init(&new_dst.k); -+ -+ u64 offset = le64_to_cpu(src_p.v->idx) + -+ (src_iter->pos.offset - -+ bkey_start_offset(src_k.k)); -+ -+ dst_p->v.idx = cpu_to_le64(offset); -+ } else { -+ BUG(); -+ } -+ -+ new_dst.k.k.p = dst_iter->pos; -+ bch2_key_resize(&new_dst.k.k, -+ min(src_k.k->p.offset - src_iter->pos.offset, -+ dst_end.offset - dst_iter->pos.offset)); -+ -+ ret = bch2_extent_update(&trans, dst_iter, &new_dst.k, -+ NULL, journal_seq, -+ new_i_size, 
i_sectors_delta); -+ if (ret) -+ goto btree_err; -+ -+ dst_done = dst_iter->pos.offset - dst_start.offset; -+ src_want = POS(src_start.inode, src_start.offset + dst_done); -+ bch2_btree_iter_set_pos(src_iter, src_want); -+btree_err: -+ if (ret == -EINTR) -+ ret = 0; -+ if (ret) -+ goto err; -+ } -+ -+ BUG_ON(bkey_cmp(dst_iter->pos, dst_end)); -+err: -+ BUG_ON(bkey_cmp(dst_iter->pos, dst_end) > 0); -+ -+ dst_done = dst_iter->pos.offset - dst_start.offset; -+ new_i_size = min(dst_iter->pos.offset << 9, new_i_size); -+ -+ bch2_trans_begin(&trans); -+ -+ do { -+ struct bch_inode_unpacked inode_u; -+ struct btree_iter *inode_iter; -+ -+ inode_iter = bch2_inode_peek(&trans, &inode_u, -+ dst_start.inode, BTREE_ITER_INTENT); -+ ret2 = PTR_ERR_OR_ZERO(inode_iter); -+ -+ if (!ret2 && -+ inode_u.bi_size < new_i_size) { -+ inode_u.bi_size = new_i_size; -+ ret2 = bch2_inode_write(&trans, inode_iter, &inode_u) ?: -+ bch2_trans_commit(&trans, NULL, journal_seq, 0); -+ } -+ } while (ret2 == -EINTR); -+ -+ ret = bch2_trans_exit(&trans) ?: ret; -+ bkey_on_stack_exit(&new_src, c); -+ -+ percpu_ref_put(&c->writes); -+ -+ return dst_done ?: ret ?: ret2; -+} -diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h -new file mode 100644 -index 000000000000..5445c1cf0797 ---- /dev/null -+++ b/fs/bcachefs/reflink.h -@@ -0,0 +1,31 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_REFLINK_H -+#define _BCACHEFS_REFLINK_H -+ -+const char *bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c); -+void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, -+ struct bkey_s_c); -+enum merge_result bch2_reflink_p_merge(struct bch_fs *, -+ struct bkey_s, struct bkey_s); -+ -+#define bch2_bkey_ops_reflink_p (struct bkey_ops) { \ -+ .key_invalid = bch2_reflink_p_invalid, \ -+ .val_to_text = bch2_reflink_p_to_text, \ -+ .key_merge = bch2_reflink_p_merge, \ -+} -+ -+const char *bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c); -+void 
bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, -+ struct bkey_s_c); -+ -+ -+#define bch2_bkey_ops_reflink_v (struct bkey_ops) { \ -+ .key_invalid = bch2_reflink_v_invalid, \ -+ .val_to_text = bch2_reflink_v_to_text, \ -+ .swab = bch2_ptr_swab, \ -+} -+ -+s64 bch2_remap_range(struct bch_fs *, struct bpos, struct bpos, -+ u64, u64 *, u64, s64 *); -+ -+#endif /* _BCACHEFS_REFLINK_H */ -diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c -new file mode 100644 -index 000000000000..67a7128fd9af ---- /dev/null -+++ b/fs/bcachefs/replicas.c -@@ -0,0 +1,1084 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "buckets.h" -+#include "journal.h" -+#include "replicas.h" -+#include "super-io.h" -+ -+static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, -+ struct bch_replicas_cpu *); -+ -+/* Replicas tracking - in memory: */ -+ -+static inline int u8_cmp(u8 l, u8 r) -+{ -+ return cmp_int(l, r); -+} -+ -+static void verify_replicas_entry(struct bch_replicas_entry *e) -+{ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ unsigned i; -+ -+ BUG_ON(e->data_type >= BCH_DATA_NR); -+ BUG_ON(!e->nr_devs); -+ BUG_ON(e->nr_required > 1 && -+ e->nr_required >= e->nr_devs); -+ -+ for (i = 0; i + 1 < e->nr_devs; i++) -+ BUG_ON(e->devs[i] >= e->devs[i + 1]); -+#endif -+} -+ -+static void replicas_entry_sort(struct bch_replicas_entry *e) -+{ -+ bubble_sort(e->devs, e->nr_devs, u8_cmp); -+} -+ -+static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) -+{ -+ eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL); -+} -+ -+void bch2_replicas_entry_to_text(struct printbuf *out, -+ struct bch_replicas_entry *e) -+{ -+ unsigned i; -+ -+ pr_buf(out, "%s: %u/%u [", -+ bch2_data_types[e->data_type], -+ e->nr_required, -+ e->nr_devs); -+ -+ for (i = 0; i < e->nr_devs; i++) -+ pr_buf(out, i ? 
" %u" : "%u", e->devs[i]); -+ pr_buf(out, "]"); -+} -+ -+void bch2_cpu_replicas_to_text(struct printbuf *out, -+ struct bch_replicas_cpu *r) -+{ -+ struct bch_replicas_entry *e; -+ bool first = true; -+ -+ for_each_cpu_replicas_entry(r, e) { -+ if (!first) -+ pr_buf(out, " "); -+ first = false; -+ -+ bch2_replicas_entry_to_text(out, e); -+ } -+} -+ -+static void extent_to_replicas(struct bkey_s_c k, -+ struct bch_replicas_entry *r) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ -+ r->nr_required = 1; -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ if (p.ptr.cached) -+ continue; -+ -+ if (!p.has_ec) -+ r->devs[r->nr_devs++] = p.ptr.dev; -+ else -+ r->nr_required = 0; -+ } -+} -+ -+static void stripe_to_replicas(struct bkey_s_c k, -+ struct bch_replicas_entry *r) -+{ -+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); -+ const struct bch_extent_ptr *ptr; -+ -+ r->nr_required = s.v->nr_blocks - s.v->nr_redundant; -+ -+ for (ptr = s.v->ptrs; -+ ptr < s.v->ptrs + s.v->nr_blocks; -+ ptr++) -+ r->devs[r->nr_devs++] = ptr->dev; -+} -+ -+void bch2_bkey_to_replicas(struct bch_replicas_entry *e, -+ struct bkey_s_c k) -+{ -+ e->nr_devs = 0; -+ -+ switch (k.k->type) { -+ case KEY_TYPE_btree_ptr: -+ case KEY_TYPE_btree_ptr_v2: -+ e->data_type = BCH_DATA_BTREE; -+ extent_to_replicas(k, e); -+ break; -+ case KEY_TYPE_extent: -+ case KEY_TYPE_reflink_v: -+ e->data_type = BCH_DATA_USER; -+ extent_to_replicas(k, e); -+ break; -+ case KEY_TYPE_stripe: -+ e->data_type = BCH_DATA_USER; -+ stripe_to_replicas(k, e); -+ break; -+ } -+ -+ replicas_entry_sort(e); -+} -+ -+void bch2_devlist_to_replicas(struct bch_replicas_entry *e, -+ enum bch_data_type data_type, -+ struct bch_devs_list devs) -+{ -+ unsigned i; -+ -+ BUG_ON(!data_type || -+ data_type == BCH_DATA_SB || -+ data_type >= BCH_DATA_NR); -+ -+ e->data_type = data_type; -+ e->nr_devs = 0; -+ e->nr_required = 1; -+ -+ for (i = 0; i < devs.nr; 
i++) -+ e->devs[e->nr_devs++] = devs.devs[i]; -+ -+ replicas_entry_sort(e); -+} -+ -+static struct bch_replicas_cpu -+cpu_replicas_add_entry(struct bch_replicas_cpu *old, -+ struct bch_replicas_entry *new_entry) -+{ -+ unsigned i; -+ struct bch_replicas_cpu new = { -+ .nr = old->nr + 1, -+ .entry_size = max_t(unsigned, old->entry_size, -+ replicas_entry_bytes(new_entry)), -+ }; -+ -+ BUG_ON(!new_entry->data_type); -+ verify_replicas_entry(new_entry); -+ -+ new.entries = kcalloc(new.nr, new.entry_size, GFP_NOIO); -+ if (!new.entries) -+ return new; -+ -+ for (i = 0; i < old->nr; i++) -+ memcpy(cpu_replicas_entry(&new, i), -+ cpu_replicas_entry(old, i), -+ old->entry_size); -+ -+ memcpy(cpu_replicas_entry(&new, old->nr), -+ new_entry, -+ replicas_entry_bytes(new_entry)); -+ -+ bch2_cpu_replicas_sort(&new); -+ return new; -+} -+ -+static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, -+ struct bch_replicas_entry *search) -+{ -+ int idx, entry_size = replicas_entry_bytes(search); -+ -+ if (unlikely(entry_size > r->entry_size)) -+ return -1; -+ -+ verify_replicas_entry(search); -+ -+#define entry_cmp(_l, _r, size) memcmp(_l, _r, entry_size) -+ idx = eytzinger0_find(r->entries, r->nr, r->entry_size, -+ entry_cmp, search); -+#undef entry_cmp -+ -+ return idx < r->nr ? 
idx : -1; -+} -+ -+int bch2_replicas_entry_idx(struct bch_fs *c, -+ struct bch_replicas_entry *search) -+{ -+ replicas_entry_sort(search); -+ -+ return __replicas_entry_idx(&c->replicas, search); -+} -+ -+static bool __replicas_has_entry(struct bch_replicas_cpu *r, -+ struct bch_replicas_entry *search) -+{ -+ return __replicas_entry_idx(r, search) >= 0; -+} -+ -+static bool bch2_replicas_marked_locked(struct bch_fs *c, -+ struct bch_replicas_entry *search, -+ bool check_gc_replicas) -+{ -+ if (!search->nr_devs) -+ return true; -+ -+ verify_replicas_entry(search); -+ -+ return __replicas_has_entry(&c->replicas, search) && -+ (!check_gc_replicas || -+ likely((!c->replicas_gc.entries)) || -+ __replicas_has_entry(&c->replicas_gc, search)); -+} -+ -+bool bch2_replicas_marked(struct bch_fs *c, -+ struct bch_replicas_entry *search, -+ bool check_gc_replicas) -+{ -+ bool marked; -+ -+ percpu_down_read(&c->mark_lock); -+ marked = bch2_replicas_marked_locked(c, search, check_gc_replicas); -+ percpu_up_read(&c->mark_lock); -+ -+ return marked; -+} -+ -+static void __replicas_table_update(struct bch_fs_usage *dst, -+ struct bch_replicas_cpu *dst_r, -+ struct bch_fs_usage *src, -+ struct bch_replicas_cpu *src_r) -+{ -+ int src_idx, dst_idx; -+ -+ *dst = *src; -+ -+ for (src_idx = 0; src_idx < src_r->nr; src_idx++) { -+ if (!src->replicas[src_idx]) -+ continue; -+ -+ dst_idx = __replicas_entry_idx(dst_r, -+ cpu_replicas_entry(src_r, src_idx)); -+ BUG_ON(dst_idx < 0); -+ -+ dst->replicas[dst_idx] = src->replicas[src_idx]; -+ } -+} -+ -+static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p, -+ struct bch_replicas_cpu *dst_r, -+ struct bch_fs_usage __percpu *src_p, -+ struct bch_replicas_cpu *src_r) -+{ -+ unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr; -+ struct bch_fs_usage *dst, *src = (void *) -+ bch2_acc_percpu_u64s((void *) src_p, src_nr); -+ -+ preempt_disable(); -+ dst = this_cpu_ptr(dst_p); -+ preempt_enable(); -+ -+ 
__replicas_table_update(dst, dst_r, src, src_r); -+} -+ -+/* -+ * Resize filesystem accounting: -+ */ -+static int replicas_table_update(struct bch_fs *c, -+ struct bch_replicas_cpu *new_r) -+{ -+ struct bch_fs_usage __percpu *new_usage[2] = { NULL, NULL }; -+ struct bch_fs_usage *new_scratch = NULL; -+ struct bch_fs_usage __percpu *new_gc = NULL; -+ struct bch_fs_usage *new_base = NULL; -+ unsigned bytes = sizeof(struct bch_fs_usage) + -+ sizeof(u64) * new_r->nr; -+ int ret = -ENOMEM; -+ -+ if (!(new_base = kzalloc(bytes, GFP_NOIO)) || -+ !(new_usage[0] = __alloc_percpu_gfp(bytes, sizeof(u64), -+ GFP_NOIO)) || -+ !(new_usage[1] = __alloc_percpu_gfp(bytes, sizeof(u64), -+ GFP_NOIO)) || -+ !(new_scratch = kmalloc(bytes, GFP_NOIO)) || -+ (c->usage_gc && -+ !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO)))) { -+ bch_err(c, "error updating replicas table: memory allocation failure"); -+ goto err; -+ } -+ -+ if (c->usage_base) -+ __replicas_table_update(new_base, new_r, -+ c->usage_base, &c->replicas); -+ if (c->usage[0]) -+ __replicas_table_update_pcpu(new_usage[0], new_r, -+ c->usage[0], &c->replicas); -+ if (c->usage[1]) -+ __replicas_table_update_pcpu(new_usage[1], new_r, -+ c->usage[1], &c->replicas); -+ if (c->usage_gc) -+ __replicas_table_update_pcpu(new_gc, new_r, -+ c->usage_gc, &c->replicas); -+ -+ swap(c->usage_base, new_base); -+ swap(c->usage[0], new_usage[0]); -+ swap(c->usage[1], new_usage[1]); -+ swap(c->usage_scratch, new_scratch); -+ swap(c->usage_gc, new_gc); -+ swap(c->replicas, *new_r); -+ ret = 0; -+err: -+ free_percpu(new_gc); -+ kfree(new_scratch); -+ free_percpu(new_usage[1]); -+ free_percpu(new_usage[0]); -+ kfree(new_base); -+ return ret; -+} -+ -+static unsigned reserve_journal_replicas(struct bch_fs *c, -+ struct bch_replicas_cpu *r) -+{ -+ struct bch_replicas_entry *e; -+ unsigned journal_res_u64s = 0; -+ -+ /* nr_inodes: */ -+ journal_res_u64s += -+ DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)); -+ -+ /* 
key_version: */ -+ journal_res_u64s += -+ DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)); -+ -+ /* persistent_reserved: */ -+ journal_res_u64s += -+ DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)) * -+ BCH_REPLICAS_MAX; -+ -+ for_each_cpu_replicas_entry(r, e) -+ journal_res_u64s += -+ DIV_ROUND_UP(sizeof(struct jset_entry_data_usage) + -+ e->nr_devs, sizeof(u64)); -+ return journal_res_u64s; -+} -+ -+noinline -+static int bch2_mark_replicas_slowpath(struct bch_fs *c, -+ struct bch_replicas_entry *new_entry) -+{ -+ struct bch_replicas_cpu new_r, new_gc; -+ int ret = 0; -+ -+ verify_replicas_entry(new_entry); -+ -+ memset(&new_r, 0, sizeof(new_r)); -+ memset(&new_gc, 0, sizeof(new_gc)); -+ -+ mutex_lock(&c->sb_lock); -+ -+ if (c->replicas_gc.entries && -+ !__replicas_has_entry(&c->replicas_gc, new_entry)) { -+ new_gc = cpu_replicas_add_entry(&c->replicas_gc, new_entry); -+ if (!new_gc.entries) -+ goto err; -+ } -+ -+ if (!__replicas_has_entry(&c->replicas, new_entry)) { -+ new_r = cpu_replicas_add_entry(&c->replicas, new_entry); -+ if (!new_r.entries) -+ goto err; -+ -+ ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r); -+ if (ret) -+ goto err; -+ -+ bch2_journal_entry_res_resize(&c->journal, -+ &c->replicas_journal_res, -+ reserve_journal_replicas(c, &new_r)); -+ } -+ -+ if (!new_r.entries && -+ !new_gc.entries) -+ goto out; -+ -+ /* allocations done, now commit: */ -+ -+ if (new_r.entries) -+ bch2_write_super(c); -+ -+ /* don't update in memory replicas until changes are persistent */ -+ percpu_down_write(&c->mark_lock); -+ if (new_r.entries) -+ ret = replicas_table_update(c, &new_r); -+ if (new_gc.entries) -+ swap(new_gc, c->replicas_gc); -+ percpu_up_write(&c->mark_lock); -+out: -+ mutex_unlock(&c->sb_lock); -+ -+ kfree(new_r.entries); -+ kfree(new_gc.entries); -+ -+ return ret; -+err: -+ bch_err(c, "error adding replicas entry: memory allocation failure"); -+ ret = -ENOMEM; -+ goto out; -+} -+ -+int bch2_mark_replicas(struct bch_fs *c, -+ 
struct bch_replicas_entry *r) -+{ -+ return likely(bch2_replicas_marked(c, r, true)) -+ ? 0 -+ : bch2_mark_replicas_slowpath(c, r); -+} -+ -+bool bch2_bkey_replicas_marked_locked(struct bch_fs *c, -+ struct bkey_s_c k, -+ bool check_gc_replicas) -+{ -+ struct bch_replicas_padded search; -+ struct bch_devs_list cached = bch2_bkey_cached_devs(k); -+ unsigned i; -+ -+ for (i = 0; i < cached.nr; i++) { -+ bch2_replicas_entry_cached(&search.e, cached.devs[i]); -+ -+ if (!bch2_replicas_marked_locked(c, &search.e, -+ check_gc_replicas)) -+ return false; -+ } -+ -+ bch2_bkey_to_replicas(&search.e, k); -+ -+ return bch2_replicas_marked_locked(c, &search.e, check_gc_replicas); -+} -+ -+bool bch2_bkey_replicas_marked(struct bch_fs *c, -+ struct bkey_s_c k, -+ bool check_gc_replicas) -+{ -+ bool marked; -+ -+ percpu_down_read(&c->mark_lock); -+ marked = bch2_bkey_replicas_marked_locked(c, k, check_gc_replicas); -+ percpu_up_read(&c->mark_lock); -+ -+ return marked; -+} -+ -+int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bch_replicas_padded search; -+ struct bch_devs_list cached = bch2_bkey_cached_devs(k); -+ unsigned i; -+ int ret; -+ -+ for (i = 0; i < cached.nr; i++) { -+ bch2_replicas_entry_cached(&search.e, cached.devs[i]); -+ -+ ret = bch2_mark_replicas(c, &search.e); -+ if (ret) -+ return ret; -+ } -+ -+ bch2_bkey_to_replicas(&search.e, k); -+ -+ return bch2_mark_replicas(c, &search.e); -+} -+ -+int bch2_replicas_gc_end(struct bch_fs *c, int ret) -+{ -+ unsigned i; -+ -+ lockdep_assert_held(&c->replicas_gc_lock); -+ -+ mutex_lock(&c->sb_lock); -+ percpu_down_write(&c->mark_lock); -+ -+ /* -+ * this is kind of crappy; the replicas gc mechanism needs to be ripped -+ * out -+ */ -+ -+ for (i = 0; i < c->replicas.nr; i++) { -+ struct bch_replicas_entry *e = -+ cpu_replicas_entry(&c->replicas, i); -+ struct bch_replicas_cpu n; -+ -+ if (!__replicas_has_entry(&c->replicas_gc, e) && -+ (c->usage_base->replicas[i] || -+ 
percpu_u64_get(&c->usage[0]->replicas[i]) || -+ percpu_u64_get(&c->usage[1]->replicas[i]))) { -+ n = cpu_replicas_add_entry(&c->replicas_gc, e); -+ if (!n.entries) { -+ ret = -ENOSPC; -+ goto err; -+ } -+ -+ swap(n, c->replicas_gc); -+ kfree(n.entries); -+ } -+ } -+ -+ if (bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc)) { -+ ret = -ENOSPC; -+ goto err; -+ } -+ -+ ret = replicas_table_update(c, &c->replicas_gc); -+err: -+ kfree(c->replicas_gc.entries); -+ c->replicas_gc.entries = NULL; -+ -+ percpu_up_write(&c->mark_lock); -+ -+ if (!ret) -+ bch2_write_super(c); -+ -+ mutex_unlock(&c->sb_lock); -+ -+ return ret; -+} -+ -+int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) -+{ -+ struct bch_replicas_entry *e; -+ unsigned i = 0; -+ -+ lockdep_assert_held(&c->replicas_gc_lock); -+ -+ mutex_lock(&c->sb_lock); -+ BUG_ON(c->replicas_gc.entries); -+ -+ c->replicas_gc.nr = 0; -+ c->replicas_gc.entry_size = 0; -+ -+ for_each_cpu_replicas_entry(&c->replicas, e) -+ if (!((1 << e->data_type) & typemask)) { -+ c->replicas_gc.nr++; -+ c->replicas_gc.entry_size = -+ max_t(unsigned, c->replicas_gc.entry_size, -+ replicas_entry_bytes(e)); -+ } -+ -+ c->replicas_gc.entries = kcalloc(c->replicas_gc.nr, -+ c->replicas_gc.entry_size, -+ GFP_NOIO); -+ if (!c->replicas_gc.entries) { -+ mutex_unlock(&c->sb_lock); -+ bch_err(c, "error allocating c->replicas_gc"); -+ return -ENOMEM; -+ } -+ -+ for_each_cpu_replicas_entry(&c->replicas, e) -+ if (!((1 << e->data_type) & typemask)) -+ memcpy(cpu_replicas_entry(&c->replicas_gc, i++), -+ e, c->replicas_gc.entry_size); -+ -+ bch2_cpu_replicas_sort(&c->replicas_gc); -+ mutex_unlock(&c->sb_lock); -+ -+ return 0; -+} -+ -+int bch2_replicas_gc2(struct bch_fs *c) -+{ -+ struct bch_replicas_cpu new = { 0 }; -+ unsigned i, nr; -+ int ret = 0; -+ -+ bch2_journal_meta(&c->journal); -+retry: -+ nr = READ_ONCE(c->replicas.nr); -+ new.entry_size = READ_ONCE(c->replicas.entry_size); -+ new.entries = kcalloc(nr, new.entry_size, GFP_KERNEL); -+ 
if (!new.entries) { -+ bch_err(c, "error allocating c->replicas_gc"); -+ return -ENOMEM; -+ } -+ -+ mutex_lock(&c->sb_lock); -+ percpu_down_write(&c->mark_lock); -+ -+ if (nr != c->replicas.nr || -+ new.entry_size != c->replicas.entry_size) { -+ percpu_up_write(&c->mark_lock); -+ mutex_unlock(&c->sb_lock); -+ kfree(new.entries); -+ goto retry; -+ } -+ -+ for (i = 0; i < c->replicas.nr; i++) { -+ struct bch_replicas_entry *e = -+ cpu_replicas_entry(&c->replicas, i); -+ -+ if (e->data_type == BCH_DATA_JOURNAL || -+ c->usage_base->replicas[i] || -+ percpu_u64_get(&c->usage[0]->replicas[i]) || -+ percpu_u64_get(&c->usage[1]->replicas[i])) -+ memcpy(cpu_replicas_entry(&new, new.nr++), -+ e, new.entry_size); -+ } -+ -+ bch2_cpu_replicas_sort(&new); -+ -+ if (bch2_cpu_replicas_to_sb_replicas(c, &new)) { -+ ret = -ENOSPC; -+ goto err; -+ } -+ -+ ret = replicas_table_update(c, &new); -+err: -+ kfree(new.entries); -+ -+ percpu_up_write(&c->mark_lock); -+ -+ if (!ret) -+ bch2_write_super(c); -+ -+ mutex_unlock(&c->sb_lock); -+ -+ return ret; -+} -+ -+int bch2_replicas_set_usage(struct bch_fs *c, -+ struct bch_replicas_entry *r, -+ u64 sectors) -+{ -+ int ret, idx = bch2_replicas_entry_idx(c, r); -+ -+ if (idx < 0) { -+ struct bch_replicas_cpu n; -+ -+ n = cpu_replicas_add_entry(&c->replicas, r); -+ if (!n.entries) -+ return -ENOMEM; -+ -+ ret = replicas_table_update(c, &n); -+ if (ret) -+ return ret; -+ -+ kfree(n.entries); -+ -+ idx = bch2_replicas_entry_idx(c, r); -+ BUG_ON(ret < 0); -+ } -+ -+ c->usage_base->replicas[idx] = sectors; -+ -+ return 0; -+} -+ -+/* Replicas tracking - superblock: */ -+ -+static int -+__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, -+ struct bch_replicas_cpu *cpu_r) -+{ -+ struct bch_replicas_entry *e, *dst; -+ unsigned nr = 0, entry_size = 0, idx = 0; -+ -+ for_each_replicas_entry(sb_r, e) { -+ entry_size = max_t(unsigned, entry_size, -+ replicas_entry_bytes(e)); -+ nr++; -+ } -+ -+ cpu_r->entries = kcalloc(nr, 
entry_size, GFP_NOIO); -+ if (!cpu_r->entries) -+ return -ENOMEM; -+ -+ cpu_r->nr = nr; -+ cpu_r->entry_size = entry_size; -+ -+ for_each_replicas_entry(sb_r, e) { -+ dst = cpu_replicas_entry(cpu_r, idx++); -+ memcpy(dst, e, replicas_entry_bytes(e)); -+ replicas_entry_sort(dst); -+ } -+ -+ return 0; -+} -+ -+static int -+__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, -+ struct bch_replicas_cpu *cpu_r) -+{ -+ struct bch_replicas_entry_v0 *e; -+ unsigned nr = 0, entry_size = 0, idx = 0; -+ -+ for_each_replicas_entry(sb_r, e) { -+ entry_size = max_t(unsigned, entry_size, -+ replicas_entry_bytes(e)); -+ nr++; -+ } -+ -+ entry_size += sizeof(struct bch_replicas_entry) - -+ sizeof(struct bch_replicas_entry_v0); -+ -+ cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO); -+ if (!cpu_r->entries) -+ return -ENOMEM; -+ -+ cpu_r->nr = nr; -+ cpu_r->entry_size = entry_size; -+ -+ for_each_replicas_entry(sb_r, e) { -+ struct bch_replicas_entry *dst = -+ cpu_replicas_entry(cpu_r, idx++); -+ -+ dst->data_type = e->data_type; -+ dst->nr_devs = e->nr_devs; -+ dst->nr_required = 1; -+ memcpy(dst->devs, e->devs, e->nr_devs); -+ replicas_entry_sort(dst); -+ } -+ -+ return 0; -+} -+ -+int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) -+{ -+ struct bch_sb_field_replicas *sb_v1; -+ struct bch_sb_field_replicas_v0 *sb_v0; -+ struct bch_replicas_cpu new_r = { 0, 0, NULL }; -+ int ret = 0; -+ -+ if ((sb_v1 = bch2_sb_get_replicas(c->disk_sb.sb))) -+ ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r); -+ else if ((sb_v0 = bch2_sb_get_replicas_v0(c->disk_sb.sb))) -+ ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r); -+ -+ if (ret) -+ return -ENOMEM; -+ -+ bch2_cpu_replicas_sort(&new_r); -+ -+ percpu_down_write(&c->mark_lock); -+ -+ ret = replicas_table_update(c, &new_r); -+ percpu_up_write(&c->mark_lock); -+ -+ kfree(new_r.entries); -+ -+ return 0; -+} -+ -+static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c, -+ struct bch_replicas_cpu 
*r) -+{ -+ struct bch_sb_field_replicas_v0 *sb_r; -+ struct bch_replicas_entry_v0 *dst; -+ struct bch_replicas_entry *src; -+ size_t bytes; -+ -+ bytes = sizeof(struct bch_sb_field_replicas); -+ -+ for_each_cpu_replicas_entry(r, src) -+ bytes += replicas_entry_bytes(src) - 1; -+ -+ sb_r = bch2_sb_resize_replicas_v0(&c->disk_sb, -+ DIV_ROUND_UP(bytes, sizeof(u64))); -+ if (!sb_r) -+ return -ENOSPC; -+ -+ bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas); -+ sb_r = bch2_sb_get_replicas_v0(c->disk_sb.sb); -+ -+ memset(&sb_r->entries, 0, -+ vstruct_end(&sb_r->field) - -+ (void *) &sb_r->entries); -+ -+ dst = sb_r->entries; -+ for_each_cpu_replicas_entry(r, src) { -+ dst->data_type = src->data_type; -+ dst->nr_devs = src->nr_devs; -+ memcpy(dst->devs, src->devs, src->nr_devs); -+ -+ dst = replicas_entry_next(dst); -+ -+ BUG_ON((void *) dst > vstruct_end(&sb_r->field)); -+ } -+ -+ return 0; -+} -+ -+static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, -+ struct bch_replicas_cpu *r) -+{ -+ struct bch_sb_field_replicas *sb_r; -+ struct bch_replicas_entry *dst, *src; -+ bool need_v1 = false; -+ size_t bytes; -+ -+ bytes = sizeof(struct bch_sb_field_replicas); -+ -+ for_each_cpu_replicas_entry(r, src) { -+ bytes += replicas_entry_bytes(src); -+ if (src->nr_required != 1) -+ need_v1 = true; -+ } -+ -+ if (!need_v1) -+ return bch2_cpu_replicas_to_sb_replicas_v0(c, r); -+ -+ sb_r = bch2_sb_resize_replicas(&c->disk_sb, -+ DIV_ROUND_UP(bytes, sizeof(u64))); -+ if (!sb_r) -+ return -ENOSPC; -+ -+ bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0); -+ sb_r = bch2_sb_get_replicas(c->disk_sb.sb); -+ -+ memset(&sb_r->entries, 0, -+ vstruct_end(&sb_r->field) - -+ (void *) &sb_r->entries); -+ -+ dst = sb_r->entries; -+ for_each_cpu_replicas_entry(r, src) { -+ memcpy(dst, src, replicas_entry_bytes(src)); -+ -+ dst = replicas_entry_next(dst); -+ -+ BUG_ON((void *) dst > vstruct_end(&sb_r->field)); -+ } -+ -+ return 0; -+} -+ -+static const char 
*check_dup_replicas_entries(struct bch_replicas_cpu *cpu_r) -+{ -+ unsigned i; -+ -+ sort_cmp_size(cpu_r->entries, -+ cpu_r->nr, -+ cpu_r->entry_size, -+ memcmp, NULL); -+ -+ for (i = 0; i + 1 < cpu_r->nr; i++) { -+ struct bch_replicas_entry *l = -+ cpu_replicas_entry(cpu_r, i); -+ struct bch_replicas_entry *r = -+ cpu_replicas_entry(cpu_r, i + 1); -+ -+ BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0); -+ -+ if (!memcmp(l, r, cpu_r->entry_size)) -+ return "duplicate replicas entry"; -+ } -+ -+ return NULL; -+} -+ -+static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f) -+{ -+ struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); -+ struct bch_sb_field_members *mi = bch2_sb_get_members(sb); -+ struct bch_replicas_cpu cpu_r = { .entries = NULL }; -+ struct bch_replicas_entry *e; -+ const char *err; -+ unsigned i; -+ -+ for_each_replicas_entry(sb_r, e) { -+ err = "invalid replicas entry: invalid data type"; -+ if (e->data_type >= BCH_DATA_NR) -+ goto err; -+ -+ err = "invalid replicas entry: no devices"; -+ if (!e->nr_devs) -+ goto err; -+ -+ err = "invalid replicas entry: bad nr_required"; -+ if (e->nr_required > 1 && -+ e->nr_required >= e->nr_devs) -+ goto err; -+ -+ err = "invalid replicas entry: invalid device"; -+ for (i = 0; i < e->nr_devs; i++) -+ if (!bch2_dev_exists(sb, mi, e->devs[i])) -+ goto err; -+ } -+ -+ err = "cannot allocate memory"; -+ if (__bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r)) -+ goto err; -+ -+ err = check_dup_replicas_entries(&cpu_r); -+err: -+ kfree(cpu_r.entries); -+ return err; -+} -+ -+static void bch2_sb_replicas_to_text(struct printbuf *out, -+ struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_replicas *r = field_to_type(f, replicas); -+ struct bch_replicas_entry *e; -+ bool first = true; -+ -+ for_each_replicas_entry(r, e) { -+ if (!first) -+ pr_buf(out, " "); -+ first = false; -+ -+ bch2_replicas_entry_to_text(out, e); -+ } -+} -+ -+const struct bch_sb_field_ops 
bch_sb_field_ops_replicas = { -+ .validate = bch2_sb_validate_replicas, -+ .to_text = bch2_sb_replicas_to_text, -+}; -+ -+static const char *bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb_field *f) -+{ -+ struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); -+ struct bch_sb_field_members *mi = bch2_sb_get_members(sb); -+ struct bch_replicas_cpu cpu_r = { .entries = NULL }; -+ struct bch_replicas_entry_v0 *e; -+ const char *err; -+ unsigned i; -+ -+ for_each_replicas_entry_v0(sb_r, e) { -+ err = "invalid replicas entry: invalid data type"; -+ if (e->data_type >= BCH_DATA_NR) -+ goto err; -+ -+ err = "invalid replicas entry: no devices"; -+ if (!e->nr_devs) -+ goto err; -+ -+ err = "invalid replicas entry: invalid device"; -+ for (i = 0; i < e->nr_devs; i++) -+ if (!bch2_dev_exists(sb, mi, e->devs[i])) -+ goto err; -+ } -+ -+ err = "cannot allocate memory"; -+ if (__bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r)) -+ goto err; -+ -+ err = check_dup_replicas_entries(&cpu_r); -+err: -+ kfree(cpu_r.entries); -+ return err; -+} -+ -+const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { -+ .validate = bch2_sb_validate_replicas_v0, -+}; -+ -+/* Query replicas: */ -+ -+struct replicas_status __bch2_replicas_status(struct bch_fs *c, -+ struct bch_devs_mask online_devs) -+{ -+ struct bch_sb_field_members *mi; -+ struct bch_replicas_entry *e; -+ unsigned i, nr_online, nr_offline; -+ struct replicas_status ret; -+ -+ memset(&ret, 0, sizeof(ret)); -+ -+ for (i = 0; i < ARRAY_SIZE(ret.replicas); i++) -+ ret.replicas[i].redundancy = INT_MAX; -+ -+ mi = bch2_sb_get_members(c->disk_sb.sb); -+ -+ percpu_down_read(&c->mark_lock); -+ -+ for_each_cpu_replicas_entry(&c->replicas, e) { -+ if (e->data_type >= ARRAY_SIZE(ret.replicas)) -+ panic("e %p data_type %u\n", e, e->data_type); -+ -+ nr_online = nr_offline = 0; -+ -+ for (i = 0; i < e->nr_devs; i++) { -+ BUG_ON(!bch2_dev_exists(c->disk_sb.sb, mi, -+ e->devs[i])); -+ -+ if 
(test_bit(e->devs[i], online_devs.d)) -+ nr_online++; -+ else -+ nr_offline++; -+ } -+ -+ ret.replicas[e->data_type].redundancy = -+ min(ret.replicas[e->data_type].redundancy, -+ (int) nr_online - (int) e->nr_required); -+ -+ ret.replicas[e->data_type].nr_offline = -+ max(ret.replicas[e->data_type].nr_offline, -+ nr_offline); -+ } -+ -+ percpu_up_read(&c->mark_lock); -+ -+ for (i = 0; i < ARRAY_SIZE(ret.replicas); i++) -+ if (ret.replicas[i].redundancy == INT_MAX) -+ ret.replicas[i].redundancy = 0; -+ -+ return ret; -+} -+ -+struct replicas_status bch2_replicas_status(struct bch_fs *c) -+{ -+ return __bch2_replicas_status(c, bch2_online_devs(c)); -+} -+ -+static bool have_enough_devs(struct replicas_status s, -+ enum bch_data_type type, -+ bool force_if_degraded, -+ bool force_if_lost) -+{ -+ return (!s.replicas[type].nr_offline || force_if_degraded) && -+ (s.replicas[type].redundancy >= 0 || force_if_lost); -+} -+ -+bool bch2_have_enough_devs(struct replicas_status s, unsigned flags) -+{ -+ return (have_enough_devs(s, BCH_DATA_JOURNAL, -+ flags & BCH_FORCE_IF_METADATA_DEGRADED, -+ flags & BCH_FORCE_IF_METADATA_LOST) && -+ have_enough_devs(s, BCH_DATA_BTREE, -+ flags & BCH_FORCE_IF_METADATA_DEGRADED, -+ flags & BCH_FORCE_IF_METADATA_LOST) && -+ have_enough_devs(s, BCH_DATA_USER, -+ flags & BCH_FORCE_IF_DATA_DEGRADED, -+ flags & BCH_FORCE_IF_DATA_LOST)); -+} -+ -+int bch2_replicas_online(struct bch_fs *c, bool meta) -+{ -+ struct replicas_status s = bch2_replicas_status(c); -+ -+ return (meta -+ ? 
min(s.replicas[BCH_DATA_JOURNAL].redundancy, -+ s.replicas[BCH_DATA_BTREE].redundancy) -+ : s.replicas[BCH_DATA_USER].redundancy) + 1; -+} -+ -+unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct bch_replicas_entry *e; -+ unsigned i, ret = 0; -+ -+ percpu_down_read(&c->mark_lock); -+ -+ for_each_cpu_replicas_entry(&c->replicas, e) -+ for (i = 0; i < e->nr_devs; i++) -+ if (e->devs[i] == ca->dev_idx) -+ ret |= 1 << e->data_type; -+ -+ percpu_up_read(&c->mark_lock); -+ -+ return ret; -+} -+ -+int bch2_fs_replicas_init(struct bch_fs *c) -+{ -+ c->journal.entry_u64s_reserved += -+ reserve_journal_replicas(c, &c->replicas); -+ -+ return replicas_table_update(c, &c->replicas); -+} -diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h -new file mode 100644 -index 000000000000..8527d82841bb ---- /dev/null -+++ b/fs/bcachefs/replicas.h -@@ -0,0 +1,95 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_REPLICAS_H -+#define _BCACHEFS_REPLICAS_H -+ -+#include "eytzinger.h" -+#include "replicas_types.h" -+ -+void bch2_replicas_entry_to_text(struct printbuf *, -+ struct bch_replicas_entry *); -+void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *); -+ -+static inline struct bch_replicas_entry * -+cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i) -+{ -+ return (void *) r->entries + r->entry_size * i; -+} -+ -+int bch2_replicas_entry_idx(struct bch_fs *, -+ struct bch_replicas_entry *); -+ -+void bch2_devlist_to_replicas(struct bch_replicas_entry *, -+ enum bch_data_type, -+ struct bch_devs_list); -+bool bch2_replicas_marked(struct bch_fs *, -+ struct bch_replicas_entry *, bool); -+int bch2_mark_replicas(struct bch_fs *, -+ struct bch_replicas_entry *); -+ -+bool bch2_bkey_replicas_marked_locked(struct bch_fs *, -+ struct bkey_s_c, bool); -+void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c); -+bool bch2_bkey_replicas_marked(struct bch_fs *, -+ struct bkey_s_c, bool); -+int 
bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c); -+ -+static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e, -+ unsigned dev) -+{ -+ e->data_type = BCH_DATA_CACHED; -+ e->nr_devs = 1; -+ e->nr_required = 1; -+ e->devs[0] = dev; -+} -+ -+struct replicas_status { -+ struct { -+ int redundancy; -+ unsigned nr_offline; -+ } replicas[BCH_DATA_NR]; -+}; -+ -+struct replicas_status __bch2_replicas_status(struct bch_fs *, -+ struct bch_devs_mask); -+struct replicas_status bch2_replicas_status(struct bch_fs *); -+bool bch2_have_enough_devs(struct replicas_status, unsigned); -+ -+int bch2_replicas_online(struct bch_fs *, bool); -+unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); -+ -+int bch2_replicas_gc_end(struct bch_fs *, int); -+int bch2_replicas_gc_start(struct bch_fs *, unsigned); -+int bch2_replicas_gc2(struct bch_fs *); -+ -+int bch2_replicas_set_usage(struct bch_fs *, -+ struct bch_replicas_entry *, -+ u64); -+ -+#define for_each_cpu_replicas_entry(_r, _i) \ -+ for (_i = (_r)->entries; \ -+ (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\ -+ _i = (void *) (_i) + (_r)->entry_size) -+ -+/* iterate over superblock replicas - used by userspace tools: */ -+ -+#define replicas_entry_next(_i) \ -+ ((typeof(_i)) ((void *) (_i) + replicas_entry_bytes(_i))) -+ -+#define for_each_replicas_entry(_r, _i) \ -+ for (_i = (_r)->entries; \ -+ (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ -+ (_i) = replicas_entry_next(_i)) -+ -+#define for_each_replicas_entry_v0(_r, _i) \ -+ for (_i = (_r)->entries; \ -+ (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ -+ (_i) = replicas_entry_next(_i)) -+ -+int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *); -+ -+extern const struct bch_sb_field_ops bch_sb_field_ops_replicas; -+extern const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0; -+ -+int bch2_fs_replicas_init(struct bch_fs *); -+ -+#endif /* _BCACHEFS_REPLICAS_H */ -diff --git 
a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h -new file mode 100644 -index 000000000000..0535b1d3760e ---- /dev/null -+++ b/fs/bcachefs/replicas_types.h -@@ -0,0 +1,10 @@ -+#ifndef _BCACHEFS_REPLICAS_TYPES_H -+#define _BCACHEFS_REPLICAS_TYPES_H -+ -+struct bch_replicas_cpu { -+ unsigned nr; -+ unsigned entry_size; -+ struct bch_replicas_entry *entries; -+}; -+ -+#endif /* _BCACHEFS_REPLICAS_TYPES_H */ -diff --git a/fs/bcachefs/siphash.c b/fs/bcachefs/siphash.c -new file mode 100644 -index 000000000000..c062edb3fbc2 ---- /dev/null -+++ b/fs/bcachefs/siphash.c -@@ -0,0 +1,173 @@ -+// SPDX-License-Identifier: BSD-3-Clause -+/* $OpenBSD: siphash.c,v 1.3 2015/02/20 11:51:03 tedu Exp $ */ -+ -+/*- -+ * Copyright (c) 2013 Andre Oppermann -+ * All rights reserved. -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * 1. Redistributions of source code must retain the above copyright -+ * notice, this list of conditions and the following disclaimer. -+ * 2. Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in the -+ * documentation and/or other materials provided with the distribution. -+ * 3. The name of the author may not be used to endorse or promote -+ * products derived from this software without specific prior written -+ * permission. -+ * -+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND -+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE -+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -+ * SUCH DAMAGE. -+ */ -+ -+/* -+ * SipHash is a family of PRFs SipHash-c-d where the integer parameters c and d -+ * are the number of compression rounds and the number of finalization rounds. -+ * A compression round is identical to a finalization round and this round -+ * function is called SipRound. Given a 128-bit key k and a (possibly empty) -+ * byte string m, SipHash-c-d returns a 64-bit value SipHash-c-d(k; m). -+ * -+ * Implemented from the paper "SipHash: a fast short-input PRF", 2012.09.18, -+ * by Jean-Philippe Aumasson and Daniel J. 
Bernstein, -+ * Permanent Document ID b9a943a805fbfc6fde808af9fc0ecdfa -+ * https://131002.net/siphash/siphash.pdf -+ * https://131002.net/siphash/ -+ */ -+ -+#include -+#include -+#include -+#include -+ -+#include "siphash.h" -+ -+static void SipHash_Rounds(SIPHASH_CTX *ctx, int rounds) -+{ -+ while (rounds--) { -+ ctx->v[0] += ctx->v[1]; -+ ctx->v[2] += ctx->v[3]; -+ ctx->v[1] = rol64(ctx->v[1], 13); -+ ctx->v[3] = rol64(ctx->v[3], 16); -+ -+ ctx->v[1] ^= ctx->v[0]; -+ ctx->v[3] ^= ctx->v[2]; -+ ctx->v[0] = rol64(ctx->v[0], 32); -+ -+ ctx->v[2] += ctx->v[1]; -+ ctx->v[0] += ctx->v[3]; -+ ctx->v[1] = rol64(ctx->v[1], 17); -+ ctx->v[3] = rol64(ctx->v[3], 21); -+ -+ ctx->v[1] ^= ctx->v[2]; -+ ctx->v[3] ^= ctx->v[0]; -+ ctx->v[2] = rol64(ctx->v[2], 32); -+ } -+} -+ -+static void SipHash_CRounds(SIPHASH_CTX *ctx, const void *ptr, int rounds) -+{ -+ u64 m = get_unaligned_le64(ptr); -+ -+ ctx->v[3] ^= m; -+ SipHash_Rounds(ctx, rounds); -+ ctx->v[0] ^= m; -+} -+ -+void SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key) -+{ -+ u64 k0, k1; -+ -+ k0 = le64_to_cpu(key->k0); -+ k1 = le64_to_cpu(key->k1); -+ -+ ctx->v[0] = 0x736f6d6570736575ULL ^ k0; -+ ctx->v[1] = 0x646f72616e646f6dULL ^ k1; -+ ctx->v[2] = 0x6c7967656e657261ULL ^ k0; -+ ctx->v[3] = 0x7465646279746573ULL ^ k1; -+ -+ memset(ctx->buf, 0, sizeof(ctx->buf)); -+ ctx->bytes = 0; -+} -+ -+void SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf, -+ const void *src, size_t len) -+{ -+ const u8 *ptr = src; -+ size_t left, used; -+ -+ if (len == 0) -+ return; -+ -+ used = ctx->bytes % sizeof(ctx->buf); -+ ctx->bytes += len; -+ -+ if (used > 0) { -+ left = sizeof(ctx->buf) - used; -+ -+ if (len >= left) { -+ memcpy(&ctx->buf[used], ptr, left); -+ SipHash_CRounds(ctx, ctx->buf, rc); -+ len -= left; -+ ptr += left; -+ } else { -+ memcpy(&ctx->buf[used], ptr, len); -+ return; -+ } -+ } -+ -+ while (len >= sizeof(ctx->buf)) { -+ SipHash_CRounds(ctx, ptr, rc); -+ len -= sizeof(ctx->buf); -+ ptr += sizeof(ctx->buf); -+ } -+ -+ 
if (len > 0) -+ memcpy(&ctx->buf[used], ptr, len); -+} -+ -+void SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf) -+{ -+ u64 r; -+ -+ r = SipHash_End(ctx, rc, rf); -+ -+ *((__le64 *) dst) = cpu_to_le64(r); -+} -+ -+u64 SipHash_End(SIPHASH_CTX *ctx, int rc, int rf) -+{ -+ u64 r; -+ size_t left, used; -+ -+ used = ctx->bytes % sizeof(ctx->buf); -+ left = sizeof(ctx->buf) - used; -+ memset(&ctx->buf[used], 0, left - 1); -+ ctx->buf[7] = ctx->bytes; -+ -+ SipHash_CRounds(ctx, ctx->buf, rc); -+ ctx->v[2] ^= 0xff; -+ SipHash_Rounds(ctx, rf); -+ -+ r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]); -+ memset(ctx, 0, sizeof(*ctx)); -+ return (r); -+} -+ -+u64 SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len) -+{ -+ SIPHASH_CTX ctx; -+ -+ SipHash_Init(&ctx, key); -+ SipHash_Update(&ctx, rc, rf, src, len); -+ return SipHash_End(&ctx, rc, rf); -+} -diff --git a/fs/bcachefs/siphash.h b/fs/bcachefs/siphash.h -new file mode 100644 -index 000000000000..3dfaf34a43b2 ---- /dev/null -+++ b/fs/bcachefs/siphash.h -@@ -0,0 +1,87 @@ -+/* SPDX-License-Identifier: BSD-3-Clause */ -+/* $OpenBSD: siphash.h,v 1.5 2015/02/20 11:51:03 tedu Exp $ */ -+/*- -+ * Copyright (c) 2013 Andre Oppermann -+ * All rights reserved. -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * 1. Redistributions of source code must retain the above copyright -+ * notice, this list of conditions and the following disclaimer. -+ * 2. Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in the -+ * documentation and/or other materials provided with the distribution. -+ * 3. The name of the author may not be used to endorse or promote -+ * products derived from this software without specific prior written -+ * permission. 
-+ * -+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND -+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE -+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -+ * SUCH DAMAGE. -+ * -+ * $FreeBSD$ -+ */ -+ -+/* -+ * SipHash is a family of pseudorandom functions (a.k.a. keyed hash functions) -+ * optimized for speed on short messages returning a 64bit hash/digest value. -+ * -+ * The number of rounds is defined during the initialization: -+ * SipHash24_Init() for the fast and resonable strong version -+ * SipHash48_Init() for the strong version (half as fast) -+ * -+ * struct SIPHASH_CTX ctx; -+ * SipHash24_Init(&ctx); -+ * SipHash_SetKey(&ctx, "16bytes long key"); -+ * SipHash_Update(&ctx, pointer_to_string, length_of_string); -+ * SipHash_Final(output, &ctx); -+ */ -+ -+#ifndef _SIPHASH_H_ -+#define _SIPHASH_H_ -+ -+#include -+ -+#define SIPHASH_BLOCK_LENGTH 8 -+#define SIPHASH_KEY_LENGTH 16 -+#define SIPHASH_DIGEST_LENGTH 8 -+ -+typedef struct _SIPHASH_CTX { -+ u64 v[4]; -+ u8 buf[SIPHASH_BLOCK_LENGTH]; -+ u32 bytes; -+} SIPHASH_CTX; -+ -+typedef struct { -+ __le64 k0; -+ __le64 k1; -+} SIPHASH_KEY; -+ -+void SipHash_Init(SIPHASH_CTX *, const SIPHASH_KEY *); -+void SipHash_Update(SIPHASH_CTX *, int, int, const void *, size_t); -+u64 SipHash_End(SIPHASH_CTX *, int, int); -+void SipHash_Final(void *, SIPHASH_CTX *, int, int); -+u64 SipHash(const SIPHASH_KEY 
*, int, int, const void *, size_t); -+ -+#define SipHash24_Init(_c, _k) SipHash_Init((_c), (_k)) -+#define SipHash24_Update(_c, _p, _l) SipHash_Update((_c), 2, 4, (_p), (_l)) -+#define SipHash24_End(_d) SipHash_End((_d), 2, 4) -+#define SipHash24_Final(_d, _c) SipHash_Final((_d), (_c), 2, 4) -+#define SipHash24(_k, _p, _l) SipHash((_k), 2, 4, (_p), (_l)) -+ -+#define SipHash48_Init(_c, _k) SipHash_Init((_c), (_k)) -+#define SipHash48_Update(_c, _p, _l) SipHash_Update((_c), 4, 8, (_p), (_l)) -+#define SipHash48_End(_d) SipHash_End((_d), 4, 8) -+#define SipHash48_Final(_d, _c) SipHash_Final((_d), (_c), 4, 8) -+#define SipHash48(_k, _p, _l) SipHash((_k), 4, 8, (_p), (_l)) -+ -+#endif /* _SIPHASH_H_ */ -diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h -new file mode 100644 -index 000000000000..dea9b7252b88 ---- /dev/null -+++ b/fs/bcachefs/str_hash.h -@@ -0,0 +1,336 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_STR_HASH_H -+#define _BCACHEFS_STR_HASH_H -+ -+#include "btree_iter.h" -+#include "btree_update.h" -+#include "checksum.h" -+#include "error.h" -+#include "inode.h" -+#include "siphash.h" -+#include "super.h" -+ -+#include -+#include -+#include -+ -+static inline enum bch_str_hash_type -+bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) -+{ -+ switch (opt) { -+ case BCH_STR_HASH_OPT_CRC32C: -+ return BCH_STR_HASH_CRC32C; -+ case BCH_STR_HASH_OPT_CRC64: -+ return BCH_STR_HASH_CRC64; -+ case BCH_STR_HASH_OPT_SIPHASH: -+ return c->sb.features & (1ULL << BCH_FEATURE_new_siphash) -+ ? 
BCH_STR_HASH_SIPHASH -+ : BCH_STR_HASH_SIPHASH_OLD; -+ default: -+ BUG(); -+ } -+} -+ -+struct bch_hash_info { -+ u8 type; -+ union { -+ __le64 crc_key; -+ SIPHASH_KEY siphash_key; -+ }; -+}; -+ -+static inline struct bch_hash_info -+bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) -+{ -+ /* XXX ick */ -+ struct bch_hash_info info = { -+ .type = (bi->bi_flags >> INODE_STR_HASH_OFFSET) & -+ ~(~0U << INODE_STR_HASH_BITS), -+ .crc_key = bi->bi_hash_seed, -+ }; -+ -+ if (unlikely(info.type == BCH_STR_HASH_SIPHASH_OLD)) { -+ SHASH_DESC_ON_STACK(desc, c->sha256); -+ u8 digest[SHA256_DIGEST_SIZE]; -+ -+ desc->tfm = c->sha256; -+ -+ crypto_shash_digest(desc, (void *) &bi->bi_hash_seed, -+ sizeof(bi->bi_hash_seed), digest); -+ memcpy(&info.siphash_key, digest, sizeof(info.siphash_key)); -+ } -+ -+ return info; -+} -+ -+struct bch_str_hash_ctx { -+ union { -+ u32 crc32c; -+ u64 crc64; -+ SIPHASH_CTX siphash; -+ }; -+}; -+ -+static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx, -+ const struct bch_hash_info *info) -+{ -+ switch (info->type) { -+ case BCH_STR_HASH_CRC32C: -+ ctx->crc32c = crc32c(~0, &info->crc_key, sizeof(info->crc_key)); -+ break; -+ case BCH_STR_HASH_CRC64: -+ ctx->crc64 = crc64_be(~0, &info->crc_key, sizeof(info->crc_key)); -+ break; -+ case BCH_STR_HASH_SIPHASH_OLD: -+ case BCH_STR_HASH_SIPHASH: -+ SipHash24_Init(&ctx->siphash, &info->siphash_key); -+ break; -+ default: -+ BUG(); -+ } -+} -+ -+static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx, -+ const struct bch_hash_info *info, -+ const void *data, size_t len) -+{ -+ switch (info->type) { -+ case BCH_STR_HASH_CRC32C: -+ ctx->crc32c = crc32c(ctx->crc32c, data, len); -+ break; -+ case BCH_STR_HASH_CRC64: -+ ctx->crc64 = crc64_be(ctx->crc64, data, len); -+ break; -+ case BCH_STR_HASH_SIPHASH_OLD: -+ case BCH_STR_HASH_SIPHASH: -+ SipHash24_Update(&ctx->siphash, data, len); -+ break; -+ default: -+ BUG(); -+ } -+} -+ -+static inline u64 
bch2_str_hash_end(struct bch_str_hash_ctx *ctx, -+ const struct bch_hash_info *info) -+{ -+ switch (info->type) { -+ case BCH_STR_HASH_CRC32C: -+ return ctx->crc32c; -+ case BCH_STR_HASH_CRC64: -+ return ctx->crc64 >> 1; -+ case BCH_STR_HASH_SIPHASH_OLD: -+ case BCH_STR_HASH_SIPHASH: -+ return SipHash24_End(&ctx->siphash) >> 1; -+ default: -+ BUG(); -+ } -+} -+ -+struct bch_hash_desc { -+ enum btree_id btree_id; -+ u8 key_type; -+ -+ u64 (*hash_key)(const struct bch_hash_info *, const void *); -+ u64 (*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c); -+ bool (*cmp_key)(struct bkey_s_c, const void *); -+ bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c); -+}; -+ -+static __always_inline struct btree_iter * -+bch2_hash_lookup(struct btree_trans *trans, -+ const struct bch_hash_desc desc, -+ const struct bch_hash_info *info, -+ u64 inode, const void *key, -+ unsigned flags) -+{ -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ for_each_btree_key(trans, iter, desc.btree_id, -+ POS(inode, desc.hash_key(info, key)), -+ BTREE_ITER_SLOTS|flags, k, ret) { -+ if (iter->pos.inode != inode) -+ break; -+ -+ if (k.k->type == desc.key_type) { -+ if (!desc.cmp_key(k, key)) -+ return iter; -+ } else if (k.k->type == KEY_TYPE_whiteout) { -+ ; -+ } else { -+ /* hole, not found */ -+ break; -+ } -+ } -+ bch2_trans_iter_put(trans, iter); -+ -+ return ERR_PTR(ret ?: -ENOENT); -+} -+ -+static __always_inline struct btree_iter * -+bch2_hash_hole(struct btree_trans *trans, -+ const struct bch_hash_desc desc, -+ const struct bch_hash_info *info, -+ u64 inode, const void *key) -+{ -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ for_each_btree_key(trans, iter, desc.btree_id, -+ POS(inode, desc.hash_key(info, key)), -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { -+ if (iter->pos.inode != inode) -+ break; -+ -+ if (k.k->type != desc.key_type) -+ return iter; -+ } -+ -+ iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; -+ bch2_trans_iter_put(trans, 
iter); -+ -+ return ERR_PTR(ret ?: -ENOSPC); -+} -+ -+static __always_inline -+int bch2_hash_needs_whiteout(struct btree_trans *trans, -+ const struct bch_hash_desc desc, -+ const struct bch_hash_info *info, -+ struct btree_iter *start) -+{ -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ iter = bch2_trans_copy_iter(trans, start); -+ if (IS_ERR(iter)) -+ return PTR_ERR(iter); -+ -+ bch2_btree_iter_next_slot(iter); -+ -+ for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k, ret) { -+ if (k.k->type != desc.key_type && -+ k.k->type != KEY_TYPE_whiteout) -+ break; -+ -+ if (k.k->type == desc.key_type && -+ desc.hash_bkey(info, k) <= start->pos.offset) { -+ iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; -+ ret = 1; -+ break; -+ } -+ } -+ -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+static __always_inline -+int bch2_hash_set(struct btree_trans *trans, -+ const struct bch_hash_desc desc, -+ const struct bch_hash_info *info, -+ u64 inode, struct bkey_i *insert, int flags) -+{ -+ struct btree_iter *iter, *slot = NULL; -+ struct bkey_s_c k; -+ bool found = false; -+ int ret; -+ -+ for_each_btree_key(trans, iter, desc.btree_id, -+ POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))), -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { -+ if (iter->pos.inode != inode) -+ break; -+ -+ if (k.k->type == desc.key_type) { -+ if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert))) -+ goto found; -+ -+ /* hash collision: */ -+ continue; -+ } -+ -+ if (!slot && -+ !(flags & BCH_HASH_SET_MUST_REPLACE)) { -+ slot = bch2_trans_copy_iter(trans, iter); -+ if (IS_ERR(slot)) -+ return PTR_ERR(slot); -+ } -+ -+ if (k.k->type != KEY_TYPE_whiteout) -+ goto not_found; -+ } -+ -+ if (!ret) -+ ret = -ENOSPC; -+out: -+ bch2_trans_iter_put(trans, slot); -+ bch2_trans_iter_put(trans, iter); -+ -+ return ret; -+found: -+ found = true; -+not_found: -+ -+ if (!found && (flags & BCH_HASH_SET_MUST_REPLACE)) { -+ ret = -ENOENT; -+ } else if (found && (flags & 
BCH_HASH_SET_MUST_CREATE)) { -+ ret = -EEXIST; -+ } else { -+ if (!found && slot) -+ swap(iter, slot); -+ -+ insert->k.p = iter->pos; -+ bch2_trans_update(trans, iter, insert, 0); -+ } -+ -+ goto out; -+} -+ -+static __always_inline -+int bch2_hash_delete_at(struct btree_trans *trans, -+ const struct bch_hash_desc desc, -+ const struct bch_hash_info *info, -+ struct btree_iter *iter) -+{ -+ struct bkey_i *delete; -+ int ret; -+ -+ ret = bch2_hash_needs_whiteout(trans, desc, info, iter); -+ if (ret < 0) -+ return ret; -+ -+ delete = bch2_trans_kmalloc(trans, sizeof(*delete)); -+ if (IS_ERR(delete)) -+ return PTR_ERR(delete); -+ -+ bkey_init(&delete->k); -+ delete->k.p = iter->pos; -+ delete->k.type = ret ? KEY_TYPE_whiteout : KEY_TYPE_deleted; -+ -+ bch2_trans_update(trans, iter, delete, 0); -+ return 0; -+} -+ -+static __always_inline -+int bch2_hash_delete(struct btree_trans *trans, -+ const struct bch_hash_desc desc, -+ const struct bch_hash_info *info, -+ u64 inode, const void *key) -+{ -+ struct btree_iter *iter; -+ int ret; -+ -+ iter = bch2_hash_lookup(trans, desc, info, inode, key, -+ BTREE_ITER_INTENT); -+ if (IS_ERR(iter)) -+ return PTR_ERR(iter); -+ -+ ret = bch2_hash_delete_at(trans, desc, info, iter); -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+#endif /* _BCACHEFS_STR_HASH_H */ -diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c -new file mode 100644 -index 000000000000..9a221d3e1652 ---- /dev/null -+++ b/fs/bcachefs/super-io.c -@@ -0,0 +1,1158 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "btree_update_interior.h" -+#include "buckets.h" -+#include "checksum.h" -+#include "disk_groups.h" -+#include "ec.h" -+#include "error.h" -+#include "io.h" -+#include "journal.h" -+#include "journal_seq_blacklist.h" -+#include "replicas.h" -+#include "quota.h" -+#include "super-io.h" -+#include "super.h" -+#include "vstructs.h" -+ -+#include -+#include -+ -+const char * const bch2_sb_fields[] = { -+#define 
x(name, nr) #name, -+ BCH_SB_FIELDS() -+#undef x -+ NULL -+}; -+ -+static const char *bch2_sb_field_validate(struct bch_sb *, -+ struct bch_sb_field *); -+ -+struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb, -+ enum bch_sb_field_type type) -+{ -+ struct bch_sb_field *f; -+ -+ /* XXX: need locking around superblock to access optional fields */ -+ -+ vstruct_for_each(sb, f) -+ if (le32_to_cpu(f->type) == type) -+ return f; -+ return NULL; -+} -+ -+static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb, -+ struct bch_sb_field *f, -+ unsigned u64s) -+{ -+ unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0; -+ unsigned sb_u64s = le32_to_cpu(sb->sb->u64s) + u64s - old_u64s; -+ -+ BUG_ON(get_order(__vstruct_bytes(struct bch_sb, sb_u64s)) > -+ sb->page_order); -+ -+ if (!f && !u64s) { -+ /* nothing to do: */ -+ } else if (!f) { -+ f = vstruct_last(sb->sb); -+ memset(f, 0, sizeof(u64) * u64s); -+ f->u64s = cpu_to_le32(u64s); -+ f->type = 0; -+ } else { -+ void *src, *dst; -+ -+ src = vstruct_end(f); -+ -+ if (u64s) { -+ f->u64s = cpu_to_le32(u64s); -+ dst = vstruct_end(f); -+ } else { -+ dst = f; -+ } -+ -+ memmove(dst, src, vstruct_end(sb->sb) - src); -+ -+ if (dst > src) -+ memset(src, 0, dst - src); -+ } -+ -+ sb->sb->u64s = cpu_to_le32(sb_u64s); -+ -+ return u64s ? 
f : NULL; -+} -+ -+void bch2_sb_field_delete(struct bch_sb_handle *sb, -+ enum bch_sb_field_type type) -+{ -+ struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type); -+ -+ if (f) -+ __bch2_sb_field_resize(sb, f, 0); -+} -+ -+/* Superblock realloc/free: */ -+ -+void bch2_free_super(struct bch_sb_handle *sb) -+{ -+ if (sb->bio) -+ bio_put(sb->bio); -+ if (!IS_ERR_OR_NULL(sb->bdev)) -+ blkdev_put(sb->bdev, sb->mode); -+ -+ free_pages((unsigned long) sb->sb, sb->page_order); -+ memset(sb, 0, sizeof(*sb)); -+} -+ -+int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) -+{ -+ size_t new_bytes = __vstruct_bytes(struct bch_sb, u64s); -+ unsigned order = get_order(new_bytes); -+ struct bch_sb *new_sb; -+ struct bio *bio; -+ -+ if (sb->sb && sb->page_order >= order) -+ return 0; -+ -+ if (sb->have_layout) { -+ u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits; -+ -+ if (new_bytes > max_bytes) { -+ char buf[BDEVNAME_SIZE]; -+ -+ pr_err("%s: superblock too big: want %zu but have %llu", -+ bdevname(sb->bdev, buf), new_bytes, max_bytes); -+ return -ENOSPC; -+ } -+ } -+ -+ if (sb->page_order >= order && sb->sb) -+ return 0; -+ -+ if (dynamic_fault("bcachefs:add:super_realloc")) -+ return -ENOMEM; -+ -+ if (sb->have_bio) { -+ bio = bio_kmalloc(GFP_KERNEL, 1 << order); -+ if (!bio) -+ return -ENOMEM; -+ -+ if (sb->bio) -+ bio_put(sb->bio); -+ sb->bio = bio; -+ } -+ -+ new_sb = (void *) __get_free_pages(GFP_NOFS|__GFP_ZERO, order); -+ if (!new_sb) -+ return -ENOMEM; -+ -+ if (sb->sb) -+ memcpy(new_sb, sb->sb, PAGE_SIZE << sb->page_order); -+ -+ free_pages((unsigned long) sb->sb, sb->page_order); -+ sb->sb = new_sb; -+ -+ sb->page_order = order; -+ -+ return 0; -+} -+ -+struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb, -+ enum bch_sb_field_type type, -+ unsigned u64s) -+{ -+ struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type); -+ ssize_t old_u64s = f ? 
le32_to_cpu(f->u64s) : 0; -+ ssize_t d = -old_u64s + u64s; -+ -+ if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) -+ return NULL; -+ -+ if (sb->fs_sb) { -+ struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb); -+ struct bch_dev *ca; -+ unsigned i; -+ -+ lockdep_assert_held(&c->sb_lock); -+ -+ /* XXX: we're not checking that offline device have enough space */ -+ -+ for_each_online_member(ca, c, i) { -+ struct bch_sb_handle *sb = &ca->disk_sb; -+ -+ if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) { -+ percpu_ref_put(&ca->ref); -+ return NULL; -+ } -+ } -+ } -+ -+ f = bch2_sb_field_get(sb->sb, type); -+ f = __bch2_sb_field_resize(sb, f, u64s); -+ if (f) -+ f->type = cpu_to_le32(type); -+ return f; -+} -+ -+/* Superblock validate: */ -+ -+static inline void __bch2_sb_layout_size_assert(void) -+{ -+ BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512); -+} -+ -+static const char *validate_sb_layout(struct bch_sb_layout *layout) -+{ -+ u64 offset, prev_offset, max_sectors; -+ unsigned i; -+ -+ if (uuid_le_cmp(layout->magic, BCACHE_MAGIC)) -+ return "Not a bcachefs superblock layout"; -+ -+ if (layout->layout_type != 0) -+ return "Invalid superblock layout type"; -+ -+ if (!layout->nr_superblocks) -+ return "Invalid superblock layout: no superblocks"; -+ -+ if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) -+ return "Invalid superblock layout: too many superblocks"; -+ -+ max_sectors = 1 << layout->sb_max_size_bits; -+ -+ prev_offset = le64_to_cpu(layout->sb_offset[0]); -+ -+ for (i = 1; i < layout->nr_superblocks; i++) { -+ offset = le64_to_cpu(layout->sb_offset[i]); -+ -+ if (offset < prev_offset + max_sectors) -+ return "Invalid superblock layout: superblocks overlap"; -+ prev_offset = offset; -+ } -+ -+ return NULL; -+} -+ -+const char *bch2_sb_validate(struct bch_sb_handle *disk_sb) -+{ -+ struct bch_sb *sb = disk_sb->sb; -+ struct bch_sb_field *f; -+ struct bch_sb_field_members *mi; -+ const char *err; -+ u32 version, version_min; -+ 
u16 block_size; -+ -+ version = le16_to_cpu(sb->version); -+ version_min = version >= bcachefs_metadata_version_new_versioning -+ ? le16_to_cpu(sb->version_min) -+ : version; -+ -+ if (version >= bcachefs_metadata_version_max || -+ version_min < bcachefs_metadata_version_min) -+ return "Unsupported superblock version"; -+ -+ if (version_min > version) -+ return "Bad minimum version"; -+ -+ if (sb->features[1] || -+ (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) -+ return "Filesystem has incompatible features"; -+ -+ block_size = le16_to_cpu(sb->block_size); -+ -+ if (!is_power_of_2(block_size) || -+ block_size > PAGE_SECTORS) -+ return "Bad block size"; -+ -+ if (bch2_is_zero(sb->user_uuid.b, sizeof(uuid_le))) -+ return "Bad user UUID"; -+ -+ if (bch2_is_zero(sb->uuid.b, sizeof(uuid_le))) -+ return "Bad internal UUID"; -+ -+ if (!sb->nr_devices || -+ sb->nr_devices <= sb->dev_idx || -+ sb->nr_devices > BCH_SB_MEMBERS_MAX) -+ return "Bad number of member devices"; -+ -+ if (!BCH_SB_META_REPLICAS_WANT(sb) || -+ BCH_SB_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX) -+ return "Invalid number of metadata replicas"; -+ -+ if (!BCH_SB_META_REPLICAS_REQ(sb) || -+ BCH_SB_META_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX) -+ return "Invalid number of metadata replicas"; -+ -+ if (!BCH_SB_DATA_REPLICAS_WANT(sb) || -+ BCH_SB_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX) -+ return "Invalid number of data replicas"; -+ -+ if (!BCH_SB_DATA_REPLICAS_REQ(sb) || -+ BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX) -+ return "Invalid number of data replicas"; -+ -+ if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR) -+ return "Invalid metadata checksum type"; -+ -+ if (BCH_SB_DATA_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR) -+ return "Invalid metadata checksum type"; -+ -+ if (BCH_SB_COMPRESSION_TYPE(sb) >= BCH_COMPRESSION_OPT_NR) -+ return "Invalid compression type"; -+ -+ if (!BCH_SB_BTREE_NODE_SIZE(sb)) -+ return "Btree node size not set"; -+ -+ if (!is_power_of_2(BCH_SB_BTREE_NODE_SIZE(sb))) 
-+ return "Btree node size not a power of two"; -+ -+ if (BCH_SB_GC_RESERVE(sb) < 5) -+ return "gc reserve percentage too small"; -+ -+ if (!sb->time_precision || -+ le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) -+ return "invalid time precision"; -+ -+ /* validate layout */ -+ err = validate_sb_layout(&sb->layout); -+ if (err) -+ return err; -+ -+ vstruct_for_each(sb, f) { -+ if (!f->u64s) -+ return "Invalid superblock: invalid optional field"; -+ -+ if (vstruct_next(f) > vstruct_last(sb)) -+ return "Invalid superblock: invalid optional field"; -+ } -+ -+ /* members must be validated first: */ -+ mi = bch2_sb_get_members(sb); -+ if (!mi) -+ return "Invalid superblock: member info area missing"; -+ -+ err = bch2_sb_field_validate(sb, &mi->field); -+ if (err) -+ return err; -+ -+ vstruct_for_each(sb, f) { -+ if (le32_to_cpu(f->type) == BCH_SB_FIELD_members) -+ continue; -+ -+ err = bch2_sb_field_validate(sb, f); -+ if (err) -+ return err; -+ } -+ -+ return NULL; -+} -+ -+/* device open: */ -+ -+static void bch2_sb_update(struct bch_fs *c) -+{ -+ struct bch_sb *src = c->disk_sb.sb; -+ struct bch_sb_field_members *mi = bch2_sb_get_members(src); -+ struct bch_dev *ca; -+ unsigned i; -+ -+ lockdep_assert_held(&c->sb_lock); -+ -+ c->sb.uuid = src->uuid; -+ c->sb.user_uuid = src->user_uuid; -+ c->sb.version = le16_to_cpu(src->version); -+ c->sb.nr_devices = src->nr_devices; -+ c->sb.clean = BCH_SB_CLEAN(src); -+ c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); -+ c->sb.encoded_extent_max= 1 << BCH_SB_ENCODED_EXTENT_MAX_BITS(src); -+ c->sb.time_base_lo = le64_to_cpu(src->time_base_lo); -+ c->sb.time_base_hi = le32_to_cpu(src->time_base_hi); -+ c->sb.time_precision = le32_to_cpu(src->time_precision); -+ c->sb.features = le64_to_cpu(src->features[0]); -+ c->sb.compat = le64_to_cpu(src->compat[0]); -+ -+ for_each_member_device(ca, c, i) -+ ca->mi = bch2_mi_to_cpu(mi->members + i); -+} -+ -+/* doesn't copy member info */ -+static void __copy_super(struct bch_sb_handle 
*dst_handle, struct bch_sb *src) -+{ -+ struct bch_sb_field *src_f, *dst_f; -+ struct bch_sb *dst = dst_handle->sb; -+ unsigned i; -+ -+ dst->version = src->version; -+ dst->version_min = src->version_min; -+ dst->seq = src->seq; -+ dst->uuid = src->uuid; -+ dst->user_uuid = src->user_uuid; -+ memcpy(dst->label, src->label, sizeof(dst->label)); -+ -+ dst->block_size = src->block_size; -+ dst->nr_devices = src->nr_devices; -+ -+ dst->time_base_lo = src->time_base_lo; -+ dst->time_base_hi = src->time_base_hi; -+ dst->time_precision = src->time_precision; -+ -+ memcpy(dst->flags, src->flags, sizeof(dst->flags)); -+ memcpy(dst->features, src->features, sizeof(dst->features)); -+ memcpy(dst->compat, src->compat, sizeof(dst->compat)); -+ -+ for (i = 0; i < BCH_SB_FIELD_NR; i++) { -+ if (i == BCH_SB_FIELD_journal) -+ continue; -+ -+ src_f = bch2_sb_field_get(src, i); -+ dst_f = bch2_sb_field_get(dst, i); -+ dst_f = __bch2_sb_field_resize(dst_handle, dst_f, -+ src_f ? le32_to_cpu(src_f->u64s) : 0); -+ -+ if (src_f) -+ memcpy(dst_f, src_f, vstruct_bytes(src_f)); -+ } -+} -+ -+int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) -+{ -+ struct bch_sb_field_journal *journal_buckets = -+ bch2_sb_get_journal(src); -+ unsigned journal_u64s = journal_buckets -+ ? le32_to_cpu(journal_buckets->field.u64s) -+ : 0; -+ int ret; -+ -+ lockdep_assert_held(&c->sb_lock); -+ -+ ret = bch2_sb_realloc(&c->disk_sb, -+ le32_to_cpu(src->u64s) - journal_u64s); -+ if (ret) -+ return ret; -+ -+ __copy_super(&c->disk_sb, src); -+ -+ ret = bch2_sb_replicas_to_cpu_replicas(c); -+ if (ret) -+ return ret; -+ -+ ret = bch2_sb_disk_groups_to_cpu(c); -+ if (ret) -+ return ret; -+ -+ bch2_sb_update(c); -+ return 0; -+} -+ -+int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct bch_sb *src = c->disk_sb.sb, *dst = ca->disk_sb.sb; -+ struct bch_sb_field_journal *journal_buckets = -+ bch2_sb_get_journal(dst); -+ unsigned journal_u64s = journal_buckets -+ ? 
le32_to_cpu(journal_buckets->field.u64s) -+ : 0; -+ unsigned u64s = le32_to_cpu(src->u64s) + journal_u64s; -+ int ret; -+ -+ ret = bch2_sb_realloc(&ca->disk_sb, u64s); -+ if (ret) -+ return ret; -+ -+ __copy_super(&ca->disk_sb, src); -+ return 0; -+} -+ -+/* read superblock: */ -+ -+static const char *read_one_super(struct bch_sb_handle *sb, u64 offset) -+{ -+ struct bch_csum csum; -+ size_t bytes; -+reread: -+ bio_reset(sb->bio); -+ bio_set_dev(sb->bio, sb->bdev); -+ sb->bio->bi_iter.bi_sector = offset; -+ bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META); -+ bch2_bio_map(sb->bio, sb->sb, PAGE_SIZE << sb->page_order); -+ -+ if (submit_bio_wait(sb->bio)) -+ return "IO error"; -+ -+ if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC)) -+ return "Not a bcachefs superblock"; -+ -+ if (le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_min || -+ le16_to_cpu(sb->sb->version) >= bcachefs_metadata_version_max) -+ return "Unsupported superblock version"; -+ -+ bytes = vstruct_bytes(sb->sb); -+ -+ if (bytes > 512 << sb->sb->layout.sb_max_size_bits) -+ return "Bad superblock: too big"; -+ -+ if (get_order(bytes) > sb->page_order) { -+ if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s))) -+ return "cannot allocate memory"; -+ goto reread; -+ } -+ -+ if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) -+ return "unknown csum type"; -+ -+ /* XXX: verify MACs */ -+ csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb), -+ null_nonce(), sb->sb); -+ -+ if (bch2_crc_cmp(csum, sb->sb->csum)) -+ return "bad checksum reading superblock"; -+ -+ sb->seq = le64_to_cpu(sb->sb->seq); -+ -+ return NULL; -+} -+ -+int bch2_read_super(const char *path, struct bch_opts *opts, -+ struct bch_sb_handle *sb) -+{ -+ u64 offset = opt_get(*opts, sb); -+ struct bch_sb_layout layout; -+ const char *err; -+ __le64 *i; -+ int ret; -+ -+ pr_verbose_init(*opts, ""); -+ -+ memset(sb, 0, sizeof(*sb)); -+ sb->mode = FMODE_READ; -+ sb->have_bio = true; -+ -+ if (!opt_get(*opts, noexcl)) -+ sb->mode |= FMODE_EXCL; 
-+ -+ if (!opt_get(*opts, nochanges)) -+ sb->mode |= FMODE_WRITE; -+ -+ sb->bdev = blkdev_get_by_path(path, sb->mode, sb); -+ if (IS_ERR(sb->bdev) && -+ PTR_ERR(sb->bdev) == -EACCES && -+ opt_get(*opts, read_only)) { -+ sb->mode &= ~FMODE_WRITE; -+ -+ sb->bdev = blkdev_get_by_path(path, sb->mode, sb); -+ if (!IS_ERR(sb->bdev)) -+ opt_set(*opts, nochanges, true); -+ } -+ -+ if (IS_ERR(sb->bdev)) { -+ ret = PTR_ERR(sb->bdev); -+ goto out; -+ } -+ -+ err = "cannot allocate memory"; -+ ret = bch2_sb_realloc(sb, 0); -+ if (ret) -+ goto err; -+ -+ ret = -EFAULT; -+ err = "dynamic fault"; -+ if (bch2_fs_init_fault("read_super")) -+ goto err; -+ -+ ret = -EINVAL; -+ err = read_one_super(sb, offset); -+ if (!err) -+ goto got_super; -+ -+ if (opt_defined(*opts, sb)) -+ goto err; -+ -+ pr_err("error reading default superblock: %s", err); -+ -+ /* -+ * Error reading primary superblock - read location of backup -+ * superblocks: -+ */ -+ bio_reset(sb->bio); -+ bio_set_dev(sb->bio, sb->bdev); -+ sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR; -+ bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META); -+ /* -+ * use sb buffer to read layout, since sb buffer is page aligned but -+ * layout won't be: -+ */ -+ bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout)); -+ -+ err = "IO error"; -+ if (submit_bio_wait(sb->bio)) -+ goto err; -+ -+ memcpy(&layout, sb->sb, sizeof(layout)); -+ err = validate_sb_layout(&layout); -+ if (err) -+ goto err; -+ -+ for (i = layout.sb_offset; -+ i < layout.sb_offset + layout.nr_superblocks; i++) { -+ offset = le64_to_cpu(*i); -+ -+ if (offset == opt_get(*opts, sb)) -+ continue; -+ -+ err = read_one_super(sb, offset); -+ if (!err) -+ goto got_super; -+ } -+ -+ ret = -EINVAL; -+ goto err; -+ -+got_super: -+ err = "Superblock block size smaller than device block size"; -+ ret = -EINVAL; -+ if (le16_to_cpu(sb->sb->block_size) << 9 < -+ bdev_logical_block_size(sb->bdev)) -+ goto err; -+ -+ if (sb->mode & FMODE_WRITE) -+ 
bdev_get_queue(sb->bdev)->backing_dev_info->capabilities -+ |= BDI_CAP_STABLE_WRITES; -+ ret = 0; -+ sb->have_layout = true; -+out: -+ pr_verbose_init(*opts, "ret %i", ret); -+ return ret; -+err: -+ bch2_free_super(sb); -+ pr_err("error reading superblock: %s", err); -+ goto out; -+} -+ -+/* write superblock: */ -+ -+static void write_super_endio(struct bio *bio) -+{ -+ struct bch_dev *ca = bio->bi_private; -+ -+ /* XXX: return errors directly */ -+ -+ if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write: %s", -+ blk_status_to_str(bio->bi_status))) -+ ca->sb_write_error = 1; -+ -+ closure_put(&ca->fs->sb_write); -+ percpu_ref_put(&ca->io_ref); -+} -+ -+static void read_back_super(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct bch_sb *sb = ca->disk_sb.sb; -+ struct bio *bio = ca->disk_sb.bio; -+ -+ bio_reset(bio); -+ bio_set_dev(bio, ca->disk_sb.bdev); -+ bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]); -+ bio->bi_end_io = write_super_endio; -+ bio->bi_private = ca; -+ bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC|REQ_META); -+ bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE); -+ -+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_SB], -+ bio_sectors(bio)); -+ -+ percpu_ref_get(&ca->io_ref); -+ closure_bio_submit(bio, &c->sb_write); -+} -+ -+static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) -+{ -+ struct bch_sb *sb = ca->disk_sb.sb; -+ struct bio *bio = ca->disk_sb.bio; -+ -+ sb->offset = sb->layout.sb_offset[idx]; -+ -+ SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum); -+ sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb), -+ null_nonce(), sb); -+ -+ bio_reset(bio); -+ bio_set_dev(bio, ca->disk_sb.bdev); -+ bio->bi_iter.bi_sector = le64_to_cpu(sb->offset); -+ bio->bi_end_io = write_super_endio; -+ bio->bi_private = ca; -+ bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META); -+ bch2_bio_map(bio, sb, -+ roundup((size_t) vstruct_bytes(sb), -+ bdev_logical_block_size(ca->disk_sb.bdev))); -+ -+ 
this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_SB], -+ bio_sectors(bio)); -+ -+ percpu_ref_get(&ca->io_ref); -+ closure_bio_submit(bio, &c->sb_write); -+} -+ -+int bch2_write_super(struct bch_fs *c) -+{ -+ struct closure *cl = &c->sb_write; -+ struct bch_dev *ca; -+ unsigned i, sb = 0, nr_wrote; -+ const char *err; -+ struct bch_devs_mask sb_written; -+ bool wrote, can_mount_without_written, can_mount_with_written; -+ int ret = 0; -+ -+ lockdep_assert_held(&c->sb_lock); -+ -+ closure_init_stack(cl); -+ memset(&sb_written, 0, sizeof(sb_written)); -+ -+ le64_add_cpu(&c->disk_sb.sb->seq, 1); -+ -+ if (test_bit(BCH_FS_ERROR, &c->flags)) -+ SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1); -+ -+ for_each_online_member(ca, c, i) -+ bch2_sb_from_fs(c, ca); -+ -+ for_each_online_member(ca, c, i) { -+ err = bch2_sb_validate(&ca->disk_sb); -+ if (err) { -+ bch2_fs_inconsistent(c, "sb invalid before write: %s", err); -+ ret = -1; -+ goto out; -+ } -+ } -+ -+ if (c->opts.nochanges) -+ goto out; -+ -+ for_each_online_member(ca, c, i) { -+ __set_bit(ca->dev_idx, sb_written.d); -+ ca->sb_write_error = 0; -+ } -+ -+ for_each_online_member(ca, c, i) -+ read_back_super(c, ca); -+ closure_sync(cl); -+ -+ for_each_online_member(ca, c, i) { -+ if (!ca->sb_write_error && -+ ca->disk_sb.seq != -+ le64_to_cpu(ca->sb_read_scratch->seq)) { -+ bch2_fs_fatal_error(c, -+ "Superblock modified by another process"); -+ percpu_ref_put(&ca->io_ref); -+ ret = -EROFS; -+ goto out; -+ } -+ } -+ -+ do { -+ wrote = false; -+ for_each_online_member(ca, c, i) -+ if (!ca->sb_write_error && -+ sb < ca->disk_sb.sb->layout.nr_superblocks) { -+ write_one_super(c, ca, sb); -+ wrote = true; -+ } -+ closure_sync(cl); -+ sb++; -+ } while (wrote); -+ -+ for_each_online_member(ca, c, i) { -+ if (ca->sb_write_error) -+ __clear_bit(ca->dev_idx, sb_written.d); -+ else -+ ca->disk_sb.seq = le64_to_cpu(ca->disk_sb.sb->seq); -+ } -+ -+ nr_wrote = dev_mask_nr(&sb_written); -+ -+ can_mount_with_written = -+ 
bch2_have_enough_devs(__bch2_replicas_status(c, sb_written), -+ BCH_FORCE_IF_DEGRADED); -+ -+ for (i = 0; i < ARRAY_SIZE(sb_written.d); i++) -+ sb_written.d[i] = ~sb_written.d[i]; -+ -+ can_mount_without_written = -+ bch2_have_enough_devs(__bch2_replicas_status(c, sb_written), -+ BCH_FORCE_IF_DEGRADED); -+ -+ /* -+ * If we would be able to mount _without_ the devices we successfully -+ * wrote superblocks to, we weren't able to write to enough devices: -+ * -+ * Exception: if we can mount without the successes because we haven't -+ * written anything (new filesystem), we continue if we'd be able to -+ * mount with the devices we did successfully write to: -+ */ -+ if (bch2_fs_fatal_err_on(!nr_wrote || -+ (can_mount_without_written && -+ !can_mount_with_written), c, -+ "Unable to write superblock to sufficient devices")) -+ ret = -1; -+out: -+ /* Make new options visible after they're persistent: */ -+ bch2_sb_update(c); -+ return ret; -+} -+ -+void __bch2_check_set_feature(struct bch_fs *c, unsigned feat) -+{ -+ mutex_lock(&c->sb_lock); -+ if (!(c->sb.features & (1ULL << feat))) { -+ c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << feat); -+ -+ bch2_write_super(c); -+ } -+ mutex_unlock(&c->sb_lock); -+} -+ -+/* BCH_SB_FIELD_journal: */ -+ -+static int u64_cmp(const void *_l, const void *_r) -+{ -+ u64 l = *((const u64 *) _l), r = *((const u64 *) _r); -+ -+ return l < r ? -1 : l > r ? 
1 : 0; -+} -+ -+static const char *bch2_sb_validate_journal(struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_journal *journal = field_to_type(f, journal); -+ struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; -+ const char *err; -+ unsigned nr; -+ unsigned i; -+ u64 *b; -+ -+ journal = bch2_sb_get_journal(sb); -+ if (!journal) -+ return NULL; -+ -+ nr = bch2_nr_journal_buckets(journal); -+ if (!nr) -+ return NULL; -+ -+ b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL); -+ if (!b) -+ return "cannot allocate memory"; -+ -+ for (i = 0; i < nr; i++) -+ b[i] = le64_to_cpu(journal->buckets[i]); -+ -+ sort(b, nr, sizeof(u64), u64_cmp, NULL); -+ -+ err = "journal bucket at sector 0"; -+ if (!b[0]) -+ goto err; -+ -+ err = "journal bucket before first bucket"; -+ if (m && b[0] < le16_to_cpu(m->first_bucket)) -+ goto err; -+ -+ err = "journal bucket past end of device"; -+ if (m && b[nr - 1] >= le64_to_cpu(m->nbuckets)) -+ goto err; -+ -+ err = "duplicate journal buckets"; -+ for (i = 0; i + 1 < nr; i++) -+ if (b[i] == b[i + 1]) -+ goto err; -+ -+ err = NULL; -+err: -+ kfree(b); -+ return err; -+} -+ -+static const struct bch_sb_field_ops bch_sb_field_ops_journal = { -+ .validate = bch2_sb_validate_journal, -+}; -+ -+/* BCH_SB_FIELD_members: */ -+ -+static const char *bch2_sb_validate_members(struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_members *mi = field_to_type(f, members); -+ struct bch_member *m; -+ -+ if ((void *) (mi->members + sb->nr_devices) > -+ vstruct_end(&mi->field)) -+ return "Invalid superblock: bad member info"; -+ -+ for (m = mi->members; -+ m < mi->members + sb->nr_devices; -+ m++) { -+ if (!bch2_member_exists(m)) -+ continue; -+ -+ if (le64_to_cpu(m->nbuckets) > LONG_MAX) -+ return "Too many buckets"; -+ -+ if (le64_to_cpu(m->nbuckets) - -+ le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) -+ return "Not enough buckets"; -+ -+ if (le16_to_cpu(m->bucket_size) < -+ 
le16_to_cpu(sb->block_size)) -+ return "bucket size smaller than block size"; -+ -+ if (le16_to_cpu(m->bucket_size) < -+ BCH_SB_BTREE_NODE_SIZE(sb)) -+ return "bucket size smaller than btree node size"; -+ } -+ -+ return NULL; -+} -+ -+static const struct bch_sb_field_ops bch_sb_field_ops_members = { -+ .validate = bch2_sb_validate_members, -+}; -+ -+/* BCH_SB_FIELD_crypt: */ -+ -+static const char *bch2_sb_validate_crypt(struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); -+ -+ if (vstruct_bytes(&crypt->field) != sizeof(*crypt)) -+ return "invalid field crypt: wrong size"; -+ -+ if (BCH_CRYPT_KDF_TYPE(crypt)) -+ return "invalid field crypt: bad kdf type"; -+ -+ return NULL; -+} -+ -+static const struct bch_sb_field_ops bch_sb_field_ops_crypt = { -+ .validate = bch2_sb_validate_crypt, -+}; -+ -+/* BCH_SB_FIELD_clean: */ -+ -+void bch2_sb_clean_renumber(struct bch_sb_field_clean *clean, int write) -+{ -+ struct jset_entry *entry; -+ -+ for (entry = clean->start; -+ entry < (struct jset_entry *) vstruct_end(&clean->field); -+ entry = vstruct_next(entry)) -+ bch2_bkey_renumber(BKEY_TYPE_BTREE, bkey_to_packed(entry->start), write); -+} -+ -+int bch2_fs_mark_dirty(struct bch_fs *c) -+{ -+ int ret; -+ -+ /* -+ * Unconditionally write superblock, to verify it hasn't changed before -+ * we go rw: -+ */ -+ -+ mutex_lock(&c->sb_lock); -+ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); -+ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite; -+ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates; -+ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_btree_updates_journalled; -+ ret = bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ return ret; -+} -+ -+static void -+entry_init_u64s(struct jset_entry *entry, unsigned u64s) -+{ -+ memset(entry, 0, u64s * sizeof(u64)); -+ -+ /* -+ * The u64s field counts from the start of data, ignoring the shared -+ * fields. 
-+ */ -+ entry->u64s = u64s - 1; -+} -+ -+static void -+entry_init_size(struct jset_entry *entry, size_t size) -+{ -+ unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); -+ entry_init_u64s(entry, u64s); -+} -+ -+struct jset_entry * -+bch2_journal_super_entries_add_common(struct bch_fs *c, -+ struct jset_entry *entry, -+ u64 journal_seq) -+{ -+ unsigned i; -+ -+ percpu_down_write(&c->mark_lock); -+ -+ if (!journal_seq) { -+ bch2_fs_usage_acc_to_base(c, 0); -+ bch2_fs_usage_acc_to_base(c, 1); -+ } else { -+ bch2_fs_usage_acc_to_base(c, journal_seq & 1); -+ } -+ -+ { -+ struct jset_entry_usage *u = -+ container_of(entry, struct jset_entry_usage, entry); -+ -+ entry_init_size(entry, sizeof(*u)); -+ u->entry.type = BCH_JSET_ENTRY_usage; -+ u->entry.btree_id = FS_USAGE_INODES; -+ u->v = cpu_to_le64(c->usage_base->nr_inodes); -+ -+ entry = vstruct_next(entry); -+ } -+ -+ { -+ struct jset_entry_usage *u = -+ container_of(entry, struct jset_entry_usage, entry); -+ -+ entry_init_size(entry, sizeof(*u)); -+ u->entry.type = BCH_JSET_ENTRY_usage; -+ u->entry.btree_id = FS_USAGE_KEY_VERSION; -+ u->v = cpu_to_le64(atomic64_read(&c->key_version)); -+ -+ entry = vstruct_next(entry); -+ } -+ -+ for (i = 0; i < BCH_REPLICAS_MAX; i++) { -+ struct jset_entry_usage *u = -+ container_of(entry, struct jset_entry_usage, entry); -+ -+ entry_init_size(entry, sizeof(*u)); -+ u->entry.type = BCH_JSET_ENTRY_usage; -+ u->entry.btree_id = FS_USAGE_RESERVED; -+ u->entry.level = i; -+ u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); -+ -+ entry = vstruct_next(entry); -+ } -+ -+ for (i = 0; i < c->replicas.nr; i++) { -+ struct bch_replicas_entry *e = -+ cpu_replicas_entry(&c->replicas, i); -+ struct jset_entry_data_usage *u = -+ container_of(entry, struct jset_entry_data_usage, entry); -+ -+ entry_init_size(entry, sizeof(*u) + e->nr_devs); -+ u->entry.type = BCH_JSET_ENTRY_data_usage; -+ u->v = cpu_to_le64(c->usage_base->replicas[i]); -+ memcpy(&u->r, e, replicas_entry_bytes(e)); -+ -+ entry 
= vstruct_next(entry); -+ } -+ -+ percpu_up_write(&c->mark_lock); -+ -+ return entry; -+} -+ -+void bch2_fs_mark_clean(struct bch_fs *c) -+{ -+ struct bch_sb_field_clean *sb_clean; -+ struct jset_entry *entry; -+ unsigned u64s; -+ -+ mutex_lock(&c->sb_lock); -+ if (BCH_SB_CLEAN(c->disk_sb.sb)) -+ goto out; -+ -+ SET_BCH_SB_CLEAN(c->disk_sb.sb, true); -+ -+ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; -+ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA; -+ c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_extents_above_btree_updates); -+ c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_btree_updates_journalled); -+ -+ u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved; -+ -+ sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s); -+ if (!sb_clean) { -+ bch_err(c, "error resizing superblock while setting filesystem clean"); -+ goto out; -+ } -+ -+ sb_clean->flags = 0; -+ sb_clean->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); -+ sb_clean->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); -+ sb_clean->journal_seq = cpu_to_le64(journal_cur_seq(&c->journal) - 1); -+ -+ /* Trying to catch outstanding bug: */ -+ BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); -+ -+ entry = sb_clean->start; -+ entry = bch2_journal_super_entries_add_common(c, entry, 0); -+ entry = bch2_btree_roots_to_journal_entries(c, entry, entry); -+ BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); -+ -+ memset(entry, 0, -+ vstruct_end(&sb_clean->field) - (void *) entry); -+ -+ if (le16_to_cpu(c->disk_sb.sb->version) < -+ bcachefs_metadata_version_bkey_renumber) -+ bch2_sb_clean_renumber(sb_clean, WRITE); -+ -+ bch2_write_super(c); -+out: -+ mutex_unlock(&c->sb_lock); -+} -+ -+static const char *bch2_sb_validate_clean(struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_clean *clean = field_to_type(f, clean); -+ -+ if (vstruct_bytes(&clean->field) < sizeof(*clean)) -+ return "invalid field crypt: 
wrong size"; -+ -+ return NULL; -+} -+ -+static const struct bch_sb_field_ops bch_sb_field_ops_clean = { -+ .validate = bch2_sb_validate_clean, -+}; -+ -+static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { -+#define x(f, nr) \ -+ [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f, -+ BCH_SB_FIELDS() -+#undef x -+}; -+ -+static const char *bch2_sb_field_validate(struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ unsigned type = le32_to_cpu(f->type); -+ -+ return type < BCH_SB_FIELD_NR -+ ? bch2_sb_field_ops[type]->validate(sb, f) -+ : NULL; -+} -+ -+void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ unsigned type = le32_to_cpu(f->type); -+ const struct bch_sb_field_ops *ops = type < BCH_SB_FIELD_NR -+ ? bch2_sb_field_ops[type] : NULL; -+ -+ if (ops) -+ pr_buf(out, "%s", bch2_sb_fields[type]); -+ else -+ pr_buf(out, "(unknown field %u)", type); -+ -+ pr_buf(out, " (size %llu):", vstruct_bytes(f)); -+ -+ if (ops && ops->to_text) -+ bch2_sb_field_ops[type]->to_text(out, sb, f); -+} -diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h -new file mode 100644 -index 000000000000..7a068158efca ---- /dev/null -+++ b/fs/bcachefs/super-io.h -@@ -0,0 +1,137 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_SUPER_IO_H -+#define _BCACHEFS_SUPER_IO_H -+ -+#include "extents.h" -+#include "eytzinger.h" -+#include "super_types.h" -+#include "super.h" -+ -+#include -+ -+struct bch_sb_field *bch2_sb_field_get(struct bch_sb *, enum bch_sb_field_type); -+struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *, -+ enum bch_sb_field_type, unsigned); -+void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type); -+ -+#define field_to_type(_f, _name) \ -+ container_of_or_null(_f, struct bch_sb_field_##_name, field) -+ -+#define x(_name, _nr) \ -+static inline struct bch_sb_field_##_name * \ -+bch2_sb_get_##_name(struct bch_sb *sb) \ -+{ \ -+ return field_to_type(bch2_sb_field_get(sb, \ -+ 
BCH_SB_FIELD_##_name), _name); \ -+} \ -+ \ -+static inline struct bch_sb_field_##_name * \ -+bch2_sb_resize_##_name(struct bch_sb_handle *sb, unsigned u64s) \ -+{ \ -+ return field_to_type(bch2_sb_field_resize(sb, \ -+ BCH_SB_FIELD_##_name, u64s), _name); \ -+} -+ -+BCH_SB_FIELDS() -+#undef x -+ -+extern const char * const bch2_sb_fields[]; -+ -+struct bch_sb_field_ops { -+ const char * (*validate)(struct bch_sb *, struct bch_sb_field *); -+ void (*to_text)(struct printbuf *, struct bch_sb *, -+ struct bch_sb_field *); -+}; -+ -+static inline __le64 bch2_sb_magic(struct bch_fs *c) -+{ -+ __le64 ret; -+ memcpy(&ret, &c->sb.uuid, sizeof(ret)); -+ return ret; -+} -+ -+static inline __u64 jset_magic(struct bch_fs *c) -+{ -+ return __le64_to_cpu(bch2_sb_magic(c) ^ JSET_MAGIC); -+} -+ -+static inline __u64 bset_magic(struct bch_fs *c) -+{ -+ return __le64_to_cpu(bch2_sb_magic(c) ^ BSET_MAGIC); -+} -+ -+int bch2_sb_to_fs(struct bch_fs *, struct bch_sb *); -+int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *); -+ -+void bch2_free_super(struct bch_sb_handle *); -+int bch2_sb_realloc(struct bch_sb_handle *, unsigned); -+ -+const char *bch2_sb_validate(struct bch_sb_handle *); -+ -+int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); -+int bch2_write_super(struct bch_fs *); -+void __bch2_check_set_feature(struct bch_fs *, unsigned); -+ -+static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat) -+{ -+ if (!(c->sb.features & (1ULL << feat))) -+ __bch2_check_set_feature(c, feat); -+} -+ -+/* BCH_SB_FIELD_journal: */ -+ -+static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j) -+{ -+ return j -+ ? 
(__le64 *) vstruct_end(&j->field) - j->buckets -+ : 0; -+} -+ -+/* BCH_SB_FIELD_members: */ -+ -+static inline bool bch2_member_exists(struct bch_member *m) -+{ -+ return !bch2_is_zero(m->uuid.b, sizeof(uuid_le)); -+} -+ -+static inline bool bch2_dev_exists(struct bch_sb *sb, -+ struct bch_sb_field_members *mi, -+ unsigned dev) -+{ -+ return dev < sb->nr_devices && -+ bch2_member_exists(&mi->members[dev]); -+} -+ -+static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) -+{ -+ return (struct bch_member_cpu) { -+ .nbuckets = le64_to_cpu(mi->nbuckets), -+ .first_bucket = le16_to_cpu(mi->first_bucket), -+ .bucket_size = le16_to_cpu(mi->bucket_size), -+ .group = BCH_MEMBER_GROUP(mi), -+ .state = BCH_MEMBER_STATE(mi), -+ .replacement = BCH_MEMBER_REPLACEMENT(mi), -+ .discard = BCH_MEMBER_DISCARD(mi), -+ .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi), -+ .durability = BCH_MEMBER_DURABILITY(mi) -+ ? BCH_MEMBER_DURABILITY(mi) - 1 -+ : 1, -+ .valid = !bch2_is_zero(mi->uuid.b, sizeof(uuid_le)), -+ }; -+} -+ -+/* BCH_SB_FIELD_clean: */ -+ -+struct jset_entry * -+bch2_journal_super_entries_add_common(struct bch_fs *, -+ struct jset_entry *, u64); -+ -+void bch2_sb_clean_renumber(struct bch_sb_field_clean *, int); -+ -+int bch2_fs_mark_dirty(struct bch_fs *); -+void bch2_fs_mark_clean(struct bch_fs *); -+ -+void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, -+ struct bch_sb_field *); -+ -+#endif /* _BCACHEFS_SUPER_IO_H */ -diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c -new file mode 100644 -index 000000000000..0cdf285e4ffd ---- /dev/null -+++ b/fs/bcachefs/super.c -@@ -0,0 +1,2046 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * bcachefs setup/teardown code, and some metadata io - read a superblock and -+ * figure out what to do with it. -+ * -+ * Copyright 2010, 2011 Kent Overstreet -+ * Copyright 2012 Google, Inc. 
-+ */ -+ -+#include "bcachefs.h" -+#include "alloc_background.h" -+#include "alloc_foreground.h" -+#include "bkey_sort.h" -+#include "btree_cache.h" -+#include "btree_gc.h" -+#include "btree_key_cache.h" -+#include "btree_update_interior.h" -+#include "btree_io.h" -+#include "chardev.h" -+#include "checksum.h" -+#include "clock.h" -+#include "compress.h" -+#include "debug.h" -+#include "disk_groups.h" -+#include "ec.h" -+#include "error.h" -+#include "fs.h" -+#include "fs-io.h" -+#include "fsck.h" -+#include "inode.h" -+#include "io.h" -+#include "journal.h" -+#include "journal_reclaim.h" -+#include "journal_seq_blacklist.h" -+#include "move.h" -+#include "migrate.h" -+#include "movinggc.h" -+#include "quota.h" -+#include "rebalance.h" -+#include "recovery.h" -+#include "replicas.h" -+#include "super.h" -+#include "super-io.h" -+#include "sysfs.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+MODULE_LICENSE("GPL"); -+MODULE_AUTHOR("Kent Overstreet "); -+ -+#define KTYPE(type) \ -+struct kobj_type type ## _ktype = { \ -+ .release = type ## _release, \ -+ .sysfs_ops = &type ## _sysfs_ops, \ -+ .default_attrs = type ## _files \ -+} -+ -+static void bch2_fs_release(struct kobject *); -+static void bch2_dev_release(struct kobject *); -+ -+static void bch2_fs_internal_release(struct kobject *k) -+{ -+} -+ -+static void bch2_fs_opts_dir_release(struct kobject *k) -+{ -+} -+ -+static void bch2_fs_time_stats_release(struct kobject *k) -+{ -+} -+ -+static KTYPE(bch2_fs); -+static KTYPE(bch2_fs_internal); -+static KTYPE(bch2_fs_opts_dir); -+static KTYPE(bch2_fs_time_stats); -+static KTYPE(bch2_dev); -+ -+static struct kset *bcachefs_kset; -+static LIST_HEAD(bch_fs_list); -+static DEFINE_MUTEX(bch_fs_list_lock); -+ -+static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait); -+ -+static void bch2_dev_free(struct bch_dev *); -+static int bch2_dev_alloc(struct bch_fs *, 
unsigned); -+static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *); -+static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *); -+ -+struct bch_fs *bch2_bdev_to_fs(struct block_device *bdev) -+{ -+ struct bch_fs *c; -+ struct bch_dev *ca; -+ unsigned i; -+ -+ mutex_lock(&bch_fs_list_lock); -+ rcu_read_lock(); -+ -+ list_for_each_entry(c, &bch_fs_list, list) -+ for_each_member_device_rcu(ca, c, i, NULL) -+ if (ca->disk_sb.bdev == bdev) { -+ closure_get(&c->cl); -+ goto found; -+ } -+ c = NULL; -+found: -+ rcu_read_unlock(); -+ mutex_unlock(&bch_fs_list_lock); -+ -+ return c; -+} -+ -+static struct bch_fs *__bch2_uuid_to_fs(uuid_le uuid) -+{ -+ struct bch_fs *c; -+ -+ lockdep_assert_held(&bch_fs_list_lock); -+ -+ list_for_each_entry(c, &bch_fs_list, list) -+ if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid_le))) -+ return c; -+ -+ return NULL; -+} -+ -+struct bch_fs *bch2_uuid_to_fs(uuid_le uuid) -+{ -+ struct bch_fs *c; -+ -+ mutex_lock(&bch_fs_list_lock); -+ c = __bch2_uuid_to_fs(uuid); -+ if (c) -+ closure_get(&c->cl); -+ mutex_unlock(&bch_fs_list_lock); -+ -+ return c; -+} -+ -+int bch2_congested(void *data, int bdi_bits) -+{ -+ struct bch_fs *c = data; -+ struct backing_dev_info *bdi; -+ struct bch_dev *ca; -+ unsigned i; -+ int ret = 0; -+ -+ rcu_read_lock(); -+ if (bdi_bits & (1 << WB_sync_congested)) { -+ /* Reads - check all devices: */ -+ for_each_readable_member(ca, c, i) { -+ bdi = ca->disk_sb.bdev->bd_bdi; -+ -+ if (bdi_congested(bdi, bdi_bits)) { -+ ret = 1; -+ break; -+ } -+ } -+ } else { -+ unsigned target = READ_ONCE(c->opts.foreground_target); -+ const struct bch_devs_mask *devs = target -+ ? 
bch2_target_to_mask(c, target) -+ : &c->rw_devs[BCH_DATA_USER]; -+ -+ for_each_member_device_rcu(ca, c, i, devs) { -+ bdi = ca->disk_sb.bdev->bd_bdi; -+ -+ if (bdi_congested(bdi, bdi_bits)) { -+ ret = 1; -+ break; -+ } -+ } -+ } -+ rcu_read_unlock(); -+ -+ return ret; -+} -+ -+/* Filesystem RO/RW: */ -+ -+/* -+ * For startup/shutdown of RW stuff, the dependencies are: -+ * -+ * - foreground writes depend on copygc and rebalance (to free up space) -+ * -+ * - copygc and rebalance depend on mark and sweep gc (they actually probably -+ * don't because they either reserve ahead of time or don't block if -+ * allocations fail, but allocations can require mark and sweep gc to run -+ * because of generation number wraparound) -+ * -+ * - all of the above depends on the allocator threads -+ * -+ * - allocator depends on the journal (when it rewrites prios and gens) -+ */ -+ -+static void __bch2_fs_read_only(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ bool wrote = false; -+ unsigned i, clean_passes = 0; -+ int ret; -+ -+ bch2_rebalance_stop(c); -+ -+ for_each_member_device(ca, c, i) -+ bch2_copygc_stop(ca); -+ -+ bch2_gc_thread_stop(c); -+ -+ /* -+ * Flush journal before stopping allocators, because flushing journal -+ * blacklist entries involves allocating new btree nodes: -+ */ -+ bch2_journal_flush_all_pins(&c->journal); -+ -+ /* -+ * If the allocator threads didn't all start up, the btree updates to -+ * write out alloc info aren't going to work: -+ */ -+ if (!test_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags)) -+ goto nowrote_alloc; -+ -+ bch_verbose(c, "writing alloc info"); -+ /* -+ * This should normally just be writing the bucket read/write clocks: -+ */ -+ ret = bch2_stripes_write(c, BTREE_INSERT_NOCHECK_RW, &wrote) ?: -+ bch2_alloc_write(c, BTREE_INSERT_NOCHECK_RW, &wrote); -+ bch_verbose(c, "writing alloc info complete"); -+ -+ if (ret && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) -+ bch2_fs_inconsistent(c, "error writing out alloc info %i", ret); -+ -+ if (ret) 
-+ goto nowrote_alloc; -+ -+ bch_verbose(c, "flushing journal and stopping allocators"); -+ -+ bch2_journal_flush_all_pins(&c->journal); -+ set_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags); -+ -+ do { -+ clean_passes++; -+ -+ if (bch2_journal_flush_all_pins(&c->journal)) -+ clean_passes = 0; -+ -+ /* -+ * In flight interior btree updates will generate more journal -+ * updates and btree updates (alloc btree): -+ */ -+ if (bch2_btree_interior_updates_nr_pending(c)) { -+ closure_wait_event(&c->btree_interior_update_wait, -+ !bch2_btree_interior_updates_nr_pending(c)); -+ clean_passes = 0; -+ } -+ flush_work(&c->btree_interior_update_work); -+ -+ if (bch2_journal_flush_all_pins(&c->journal)) -+ clean_passes = 0; -+ } while (clean_passes < 2); -+ bch_verbose(c, "flushing journal and stopping allocators complete"); -+ -+ set_bit(BCH_FS_ALLOC_CLEAN, &c->flags); -+nowrote_alloc: -+ closure_wait_event(&c->btree_interior_update_wait, -+ !bch2_btree_interior_updates_nr_pending(c)); -+ flush_work(&c->btree_interior_update_work); -+ -+ for_each_member_device(ca, c, i) -+ bch2_dev_allocator_stop(ca); -+ -+ clear_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); -+ clear_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags); -+ -+ bch2_fs_journal_stop(&c->journal); -+ -+ /* -+ * the journal kicks off btree writes via reclaim - wait for in flight -+ * writes after stopping journal: -+ */ -+ if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) -+ bch2_btree_flush_all_writes(c); -+ else -+ bch2_btree_verify_flushed(c); -+ -+ /* -+ * After stopping journal: -+ */ -+ for_each_member_device(ca, c, i) -+ bch2_dev_allocator_remove(c, ca); -+} -+ -+static void bch2_writes_disabled(struct percpu_ref *writes) -+{ -+ struct bch_fs *c = container_of(writes, struct bch_fs, writes); -+ -+ set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); -+ wake_up(&bch_read_only_wait); -+} -+ -+void bch2_fs_read_only(struct bch_fs *c) -+{ -+ if (!test_bit(BCH_FS_RW, &c->flags)) { -+ cancel_delayed_work_sync(&c->journal.reclaim_work); -+ 
return; -+ } -+ -+ BUG_ON(test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); -+ -+ /* -+ * Block new foreground-end write operations from starting - any new -+ * writes will return -EROFS: -+ * -+ * (This is really blocking new _allocations_, writes to previously -+ * allocated space can still happen until stopping the allocator in -+ * bch2_dev_allocator_stop()). -+ */ -+ percpu_ref_kill(&c->writes); -+ -+ cancel_work_sync(&c->ec_stripe_delete_work); -+ cancel_delayed_work(&c->pd_controllers_update); -+ -+ /* -+ * If we're not doing an emergency shutdown, we want to wait on -+ * outstanding writes to complete so they don't see spurious errors due -+ * to shutting down the allocator: -+ * -+ * If we are doing an emergency shutdown outstanding writes may -+ * hang until we shutdown the allocator so we don't want to wait -+ * on outstanding writes before shutting everything down - but -+ * we do need to wait on them before returning and signalling -+ * that going RO is complete: -+ */ -+ wait_event(bch_read_only_wait, -+ test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) || -+ test_bit(BCH_FS_EMERGENCY_RO, &c->flags)); -+ -+ __bch2_fs_read_only(c); -+ -+ wait_event(bch_read_only_wait, -+ test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); -+ -+ clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); -+ -+ if (!bch2_journal_error(&c->journal) && -+ !test_bit(BCH_FS_ERROR, &c->flags) && -+ !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) && -+ test_bit(BCH_FS_STARTED, &c->flags) && -+ test_bit(BCH_FS_ALLOC_CLEAN, &c->flags) && -+ !c->opts.norecovery) { -+ bch_verbose(c, "marking filesystem clean"); -+ bch2_fs_mark_clean(c); -+ } -+ -+ clear_bit(BCH_FS_RW, &c->flags); -+} -+ -+static void bch2_fs_read_only_work(struct work_struct *work) -+{ -+ struct bch_fs *c = -+ container_of(work, struct bch_fs, read_only_work); -+ -+ down_write(&c->state_lock); -+ bch2_fs_read_only(c); -+ up_write(&c->state_lock); -+} -+ -+static void bch2_fs_read_only_async(struct bch_fs *c) -+{ -+ 
queue_work(system_long_wq, &c->read_only_work); -+} -+ -+bool bch2_fs_emergency_read_only(struct bch_fs *c) -+{ -+ bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags); -+ -+ bch2_fs_read_only_async(c); -+ bch2_journal_halt(&c->journal); -+ -+ wake_up(&bch_read_only_wait); -+ return ret; -+} -+ -+static int bch2_fs_read_write_late(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ unsigned i; -+ int ret; -+ -+ ret = bch2_gc_thread_start(c); -+ if (ret) { -+ bch_err(c, "error starting gc thread"); -+ return ret; -+ } -+ -+ for_each_rw_member(ca, c, i) { -+ ret = bch2_copygc_start(c, ca); -+ if (ret) { -+ bch_err(c, "error starting copygc threads"); -+ percpu_ref_put(&ca->io_ref); -+ return ret; -+ } -+ } -+ -+ ret = bch2_rebalance_start(c); -+ if (ret) { -+ bch_err(c, "error starting rebalance thread"); -+ return ret; -+ } -+ -+ schedule_delayed_work(&c->pd_controllers_update, 5 * HZ); -+ -+ schedule_work(&c->ec_stripe_delete_work); -+ -+ return 0; -+} -+ -+static int __bch2_fs_read_write(struct bch_fs *c, bool early) -+{ -+ struct bch_dev *ca; -+ unsigned i; -+ int ret; -+ -+ if (test_bit(BCH_FS_RW, &c->flags)) -+ return 0; -+ -+ /* -+ * nochanges is used for fsck -n mode - we have to allow going rw -+ * during recovery for that to work: -+ */ -+ if (c->opts.norecovery || -+ (c->opts.nochanges && -+ (!early || c->opts.read_only))) -+ return -EROFS; -+ -+ ret = bch2_fs_mark_dirty(c); -+ if (ret) -+ goto err; -+ -+ clear_bit(BCH_FS_ALLOC_CLEAN, &c->flags); -+ -+ for_each_rw_member(ca, c, i) -+ bch2_dev_allocator_add(c, ca); -+ bch2_recalc_capacity(c); -+ -+ for_each_rw_member(ca, c, i) { -+ ret = bch2_dev_allocator_start(ca); -+ if (ret) { -+ bch_err(c, "error starting allocator threads"); -+ percpu_ref_put(&ca->io_ref); -+ goto err; -+ } -+ } -+ -+ set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); -+ -+ if (!early) { -+ ret = bch2_fs_read_write_late(c); -+ if (ret) -+ goto err; -+ } -+ -+ percpu_ref_reinit(&c->writes); -+ set_bit(BCH_FS_RW, &c->flags); -+ -+ 
queue_delayed_work(c->journal_reclaim_wq, -+ &c->journal.reclaim_work, 0); -+ return 0; -+err: -+ __bch2_fs_read_only(c); -+ return ret; -+} -+ -+int bch2_fs_read_write(struct bch_fs *c) -+{ -+ return __bch2_fs_read_write(c, false); -+} -+ -+int bch2_fs_read_write_early(struct bch_fs *c) -+{ -+ lockdep_assert_held(&c->state_lock); -+ -+ return __bch2_fs_read_write(c, true); -+} -+ -+/* Filesystem startup/shutdown: */ -+ -+static void bch2_fs_free(struct bch_fs *c) -+{ -+ unsigned i; -+ -+ for (i = 0; i < BCH_TIME_STAT_NR; i++) -+ bch2_time_stats_exit(&c->times[i]); -+ -+ bch2_fs_quota_exit(c); -+ bch2_fs_fsio_exit(c); -+ bch2_fs_ec_exit(c); -+ bch2_fs_encryption_exit(c); -+ bch2_fs_io_exit(c); -+ bch2_fs_btree_interior_update_exit(c); -+ bch2_fs_btree_iter_exit(c); -+ bch2_fs_btree_key_cache_exit(&c->btree_key_cache); -+ bch2_fs_btree_cache_exit(c); -+ bch2_fs_journal_exit(&c->journal); -+ bch2_io_clock_exit(&c->io_clock[WRITE]); -+ bch2_io_clock_exit(&c->io_clock[READ]); -+ bch2_fs_compress_exit(c); -+ bch2_journal_keys_free(&c->journal_keys); -+ bch2_journal_entries_free(&c->journal_entries); -+ percpu_free_rwsem(&c->mark_lock); -+ kfree(c->usage_scratch); -+ free_percpu(c->usage[1]); -+ free_percpu(c->usage[0]); -+ kfree(c->usage_base); -+ free_percpu(c->pcpu); -+ mempool_exit(&c->large_bkey_pool); -+ mempool_exit(&c->btree_bounce_pool); -+ bioset_exit(&c->btree_bio); -+ mempool_exit(&c->fill_iter); -+ percpu_ref_exit(&c->writes); -+ kfree(c->replicas.entries); -+ kfree(c->replicas_gc.entries); -+ kfree(rcu_dereference_protected(c->disk_groups, 1)); -+ kfree(c->journal_seq_blacklist_table); -+ -+ if (c->journal_reclaim_wq) -+ destroy_workqueue(c->journal_reclaim_wq); -+ if (c->copygc_wq) -+ destroy_workqueue(c->copygc_wq); -+ if (c->wq) -+ destroy_workqueue(c->wq); -+ -+ free_pages((unsigned long) c->disk_sb.sb, -+ c->disk_sb.page_order); -+ kvpfree(c, sizeof(*c)); -+ module_put(THIS_MODULE); -+} -+ -+static void bch2_fs_release(struct kobject *kobj) -+{ -+ 
struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); -+ -+ bch2_fs_free(c); -+} -+ -+void bch2_fs_stop(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ unsigned i; -+ -+ bch_verbose(c, "shutting down"); -+ -+ set_bit(BCH_FS_STOPPING, &c->flags); -+ -+ cancel_work_sync(&c->journal_seq_blacklist_gc_work); -+ -+ down_write(&c->state_lock); -+ bch2_fs_read_only(c); -+ up_write(&c->state_lock); -+ -+ for_each_member_device(ca, c, i) -+ if (ca->kobj.state_in_sysfs && -+ ca->disk_sb.bdev) -+ sysfs_remove_link(&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj, -+ "bcachefs"); -+ -+ if (c->kobj.state_in_sysfs) -+ kobject_del(&c->kobj); -+ -+ bch2_fs_debug_exit(c); -+ bch2_fs_chardev_exit(c); -+ -+ kobject_put(&c->time_stats); -+ kobject_put(&c->opts_dir); -+ kobject_put(&c->internal); -+ -+ mutex_lock(&bch_fs_list_lock); -+ list_del(&c->list); -+ mutex_unlock(&bch_fs_list_lock); -+ -+ closure_sync(&c->cl); -+ closure_debug_destroy(&c->cl); -+ -+ /* btree prefetch might have kicked off reads in the background: */ -+ bch2_btree_flush_all_reads(c); -+ -+ for_each_member_device(ca, c, i) -+ cancel_work_sync(&ca->io_error_work); -+ -+ cancel_work_sync(&c->btree_write_error_work); -+ cancel_delayed_work_sync(&c->pd_controllers_update); -+ cancel_work_sync(&c->read_only_work); -+ -+ for (i = 0; i < c->sb.nr_devices; i++) -+ if (c->devs[i]) -+ bch2_dev_free(rcu_dereference_protected(c->devs[i], 1)); -+ -+ bch_verbose(c, "shutdown complete"); -+ -+ kobject_put(&c->kobj); -+} -+ -+static const char *bch2_fs_online(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ const char *err = NULL; -+ unsigned i; -+ int ret; -+ -+ lockdep_assert_held(&bch_fs_list_lock); -+ -+ if (!list_empty(&c->list)) -+ return NULL; -+ -+ if (__bch2_uuid_to_fs(c->sb.uuid)) -+ return "filesystem UUID already open"; -+ -+ ret = bch2_fs_chardev_init(c); -+ if (ret) -+ return "error creating character device"; -+ -+ bch2_fs_debug_init(c); -+ -+ if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) || -+ 
kobject_add(&c->internal, &c->kobj, "internal") || -+ kobject_add(&c->opts_dir, &c->kobj, "options") || -+ kobject_add(&c->time_stats, &c->kobj, "time_stats") || -+ bch2_opts_create_sysfs_files(&c->opts_dir)) -+ return "error creating sysfs objects"; -+ -+ down_write(&c->state_lock); -+ -+ err = "error creating sysfs objects"; -+ __for_each_member_device(ca, c, i, NULL) -+ if (bch2_dev_sysfs_online(c, ca)) -+ goto err; -+ -+ list_add(&c->list, &bch_fs_list); -+ err = NULL; -+err: -+ up_write(&c->state_lock); -+ return err; -+} -+ -+static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) -+{ -+ struct bch_sb_field_members *mi; -+ struct bch_fs *c; -+ unsigned i, iter_size; -+ const char *err; -+ -+ pr_verbose_init(opts, ""); -+ -+ c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); -+ if (!c) -+ goto out; -+ -+ __module_get(THIS_MODULE); -+ -+ c->minor = -1; -+ c->disk_sb.fs_sb = true; -+ -+ init_rwsem(&c->state_lock); -+ mutex_init(&c->sb_lock); -+ mutex_init(&c->replicas_gc_lock); -+ mutex_init(&c->btree_root_lock); -+ INIT_WORK(&c->read_only_work, bch2_fs_read_only_work); -+ -+ init_rwsem(&c->gc_lock); -+ -+ for (i = 0; i < BCH_TIME_STAT_NR; i++) -+ bch2_time_stats_init(&c->times[i]); -+ -+ bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); -+ bch2_fs_allocator_background_init(c); -+ bch2_fs_allocator_foreground_init(c); -+ bch2_fs_rebalance_init(c); -+ bch2_fs_quota_init(c); -+ -+ INIT_LIST_HEAD(&c->list); -+ -+ mutex_init(&c->usage_scratch_lock); -+ -+ mutex_init(&c->bio_bounce_pages_lock); -+ -+ bio_list_init(&c->btree_write_error_list); -+ spin_lock_init(&c->btree_write_error_lock); -+ INIT_WORK(&c->btree_write_error_work, bch2_btree_write_error_work); -+ -+ INIT_WORK(&c->journal_seq_blacklist_gc_work, -+ bch2_blacklist_entries_gc); -+ -+ INIT_LIST_HEAD(&c->journal_entries); -+ -+ INIT_LIST_HEAD(&c->fsck_errors); -+ mutex_init(&c->fsck_error_lock); -+ -+ INIT_LIST_HEAD(&c->ec_new_stripe_list); -+ 
mutex_init(&c->ec_new_stripe_lock); -+ mutex_init(&c->ec_stripe_create_lock); -+ spin_lock_init(&c->ec_stripes_heap_lock); -+ -+ seqcount_init(&c->gc_pos_lock); -+ -+ seqcount_init(&c->usage_lock); -+ -+ sema_init(&c->io_in_flight, 64); -+ -+ c->copy_gc_enabled = 1; -+ c->rebalance.enabled = 1; -+ c->promote_whole_extents = true; -+ -+ c->journal.write_time = &c->times[BCH_TIME_journal_write]; -+ c->journal.delay_time = &c->times[BCH_TIME_journal_delay]; -+ c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal]; -+ c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; -+ -+ bch2_fs_btree_cache_init_early(&c->btree_cache); -+ -+ if (percpu_init_rwsem(&c->mark_lock)) -+ goto err; -+ -+ mutex_lock(&c->sb_lock); -+ -+ if (bch2_sb_to_fs(c, sb)) { -+ mutex_unlock(&c->sb_lock); -+ goto err; -+ } -+ -+ mutex_unlock(&c->sb_lock); -+ -+ scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid); -+ -+ c->opts = bch2_opts_default; -+ bch2_opts_apply(&c->opts, bch2_opts_from_sb(sb)); -+ bch2_opts_apply(&c->opts, opts); -+ -+ c->block_bits = ilog2(c->opts.block_size); -+ c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c); -+ -+ if (bch2_fs_init_fault("fs_alloc")) -+ goto err; -+ -+ iter_size = sizeof(struct sort_iter) + -+ (btree_blocks(c) + 1) * 2 * -+ sizeof(struct sort_iter_set); -+ -+ if (!(c->wq = alloc_workqueue("bcachefs", -+ WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || -+ !(c->copygc_wq = alloc_workqueue("bcache_copygc", -+ WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || -+ !(c->journal_reclaim_wq = alloc_workqueue("bcache_journal", -+ WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || -+ percpu_ref_init(&c->writes, bch2_writes_disabled, -+ PERCPU_REF_INIT_DEAD, GFP_KERNEL) || -+ mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || -+ bioset_init(&c->btree_bio, 1, -+ max(offsetof(struct btree_read_bio, bio), -+ offsetof(struct btree_write_bio, wbio.bio)), -+ BIOSET_NEED_BVECS) || -+ !(c->pcpu = 
alloc_percpu(struct bch_fs_pcpu)) || -+ mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, -+ btree_bytes(c)) || -+ mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || -+ bch2_io_clock_init(&c->io_clock[READ]) || -+ bch2_io_clock_init(&c->io_clock[WRITE]) || -+ bch2_fs_journal_init(&c->journal) || -+ bch2_fs_replicas_init(c) || -+ bch2_fs_btree_cache_init(c) || -+ bch2_fs_btree_key_cache_init(&c->btree_key_cache) || -+ bch2_fs_btree_iter_init(c) || -+ bch2_fs_btree_interior_update_init(c) || -+ bch2_fs_io_init(c) || -+ bch2_fs_encryption_init(c) || -+ bch2_fs_compress_init(c) || -+ bch2_fs_ec_init(c) || -+ bch2_fs_fsio_init(c)) -+ goto err; -+ -+ mi = bch2_sb_get_members(c->disk_sb.sb); -+ for (i = 0; i < c->sb.nr_devices; i++) -+ if (bch2_dev_exists(c->disk_sb.sb, mi, i) && -+ bch2_dev_alloc(c, i)) -+ goto err; -+ -+ /* -+ * Now that all allocations have succeeded, init various refcounty -+ * things that let us shutdown: -+ */ -+ closure_init(&c->cl, NULL); -+ -+ c->kobj.kset = bcachefs_kset; -+ kobject_init(&c->kobj, &bch2_fs_ktype); -+ kobject_init(&c->internal, &bch2_fs_internal_ktype); -+ kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype); -+ kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype); -+ -+ mutex_lock(&bch_fs_list_lock); -+ err = bch2_fs_online(c); -+ mutex_unlock(&bch_fs_list_lock); -+ if (err) { -+ bch_err(c, "bch2_fs_online() error: %s", err); -+ goto err; -+ } -+out: -+ pr_verbose_init(opts, "ret %i", c ? 
0 : -ENOMEM); -+ return c; -+err: -+ bch2_fs_free(c); -+ c = NULL; -+ goto out; -+} -+ -+noinline_for_stack -+static void print_mount_opts(struct bch_fs *c) -+{ -+ enum bch_opt_id i; -+ char buf[512]; -+ struct printbuf p = PBUF(buf); -+ bool first = true; -+ -+ strcpy(buf, "(null)"); -+ -+ if (c->opts.read_only) { -+ pr_buf(&p, "ro"); -+ first = false; -+ } -+ -+ for (i = 0; i < bch2_opts_nr; i++) { -+ const struct bch_option *opt = &bch2_opt_table[i]; -+ u64 v = bch2_opt_get_by_id(&c->opts, i); -+ -+ if (!(opt->mode & OPT_MOUNT)) -+ continue; -+ -+ if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) -+ continue; -+ -+ if (!first) -+ pr_buf(&p, ","); -+ first = false; -+ bch2_opt_to_text(&p, c, opt, v, OPT_SHOW_MOUNT_STYLE); -+ } -+ -+ bch_info(c, "mounted with opts: %s", buf); -+} -+ -+int bch2_fs_start(struct bch_fs *c) -+{ -+ const char *err = "cannot allocate memory"; -+ struct bch_sb_field_members *mi; -+ struct bch_dev *ca; -+ time64_t now = ktime_get_real_seconds(); -+ unsigned i; -+ int ret = -EINVAL; -+ -+ down_write(&c->state_lock); -+ -+ BUG_ON(test_bit(BCH_FS_STARTED, &c->flags)); -+ -+ mutex_lock(&c->sb_lock); -+ -+ for_each_online_member(ca, c, i) -+ bch2_sb_from_fs(c, ca); -+ -+ mi = bch2_sb_get_members(c->disk_sb.sb); -+ for_each_online_member(ca, c, i) -+ mi->members[ca->dev_idx].last_mount = cpu_to_le64(now); -+ -+ mutex_unlock(&c->sb_lock); -+ -+ for_each_rw_member(ca, c, i) -+ bch2_dev_allocator_add(c, ca); -+ bch2_recalc_capacity(c); -+ -+ ret = BCH_SB_INITIALIZED(c->disk_sb.sb) -+ ? bch2_fs_recovery(c) -+ : bch2_fs_initialize(c); -+ if (ret) -+ goto err; -+ -+ ret = bch2_opts_check_may_set(c); -+ if (ret) -+ goto err; -+ -+ err = "dynamic fault"; -+ ret = -EINVAL; -+ if (bch2_fs_init_fault("fs_start")) -+ goto err; -+ -+ set_bit(BCH_FS_STARTED, &c->flags); -+ -+ if (c->opts.read_only || c->opts.nochanges) { -+ bch2_fs_read_only(c); -+ } else { -+ err = "error going read write"; -+ ret = !test_bit(BCH_FS_RW, &c->flags) -+ ? 
bch2_fs_read_write(c) -+ : bch2_fs_read_write_late(c); -+ if (ret) -+ goto err; -+ } -+ -+ print_mount_opts(c); -+ ret = 0; -+out: -+ up_write(&c->state_lock); -+ return ret; -+err: -+ switch (ret) { -+ case BCH_FSCK_ERRORS_NOT_FIXED: -+ bch_err(c, "filesystem contains errors: please report this to the developers"); -+ pr_cont("mount with -o fix_errors to repair\n"); -+ err = "fsck error"; -+ break; -+ case BCH_FSCK_REPAIR_UNIMPLEMENTED: -+ bch_err(c, "filesystem contains errors: please report this to the developers"); -+ pr_cont("repair unimplemented: inform the developers so that it can be added\n"); -+ err = "fsck error"; -+ break; -+ case BCH_FSCK_REPAIR_IMPOSSIBLE: -+ bch_err(c, "filesystem contains errors, but repair impossible"); -+ err = "fsck error"; -+ break; -+ case BCH_FSCK_UNKNOWN_VERSION: -+ err = "unknown metadata version";; -+ break; -+ case -ENOMEM: -+ err = "cannot allocate memory"; -+ break; -+ case -EIO: -+ err = "IO error"; -+ break; -+ } -+ -+ if (ret >= 0) -+ ret = -EIO; -+ goto out; -+} -+ -+static const char *bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) -+{ -+ struct bch_sb_field_members *sb_mi; -+ -+ sb_mi = bch2_sb_get_members(sb); -+ if (!sb_mi) -+ return "Invalid superblock: member info area missing"; -+ -+ if (le16_to_cpu(sb->block_size) != c->opts.block_size) -+ return "mismatched block size"; -+ -+ if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) < -+ BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb)) -+ return "new cache bucket size is too small"; -+ -+ return NULL; -+} -+ -+static const char *bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb) -+{ -+ struct bch_sb *newest = -+ le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? 
fs : sb; -+ struct bch_sb_field_members *mi = bch2_sb_get_members(newest); -+ -+ if (uuid_le_cmp(fs->uuid, sb->uuid)) -+ return "device not a member of filesystem"; -+ -+ if (!bch2_dev_exists(newest, mi, sb->dev_idx)) -+ return "device has been removed"; -+ -+ if (fs->block_size != sb->block_size) -+ return "mismatched block size"; -+ -+ return NULL; -+} -+ -+/* Device startup/shutdown: */ -+ -+static void bch2_dev_release(struct kobject *kobj) -+{ -+ struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); -+ -+ kfree(ca); -+} -+ -+static void bch2_dev_free(struct bch_dev *ca) -+{ -+ cancel_work_sync(&ca->io_error_work); -+ -+ if (ca->kobj.state_in_sysfs && -+ ca->disk_sb.bdev) -+ sysfs_remove_link(&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj, -+ "bcachefs"); -+ -+ if (ca->kobj.state_in_sysfs) -+ kobject_del(&ca->kobj); -+ -+ bch2_free_super(&ca->disk_sb); -+ bch2_dev_journal_exit(ca); -+ -+ free_percpu(ca->io_done); -+ bioset_exit(&ca->replica_set); -+ bch2_dev_buckets_free(ca); -+ free_page((unsigned long) ca->sb_read_scratch); -+ -+ bch2_time_stats_exit(&ca->io_latency[WRITE]); -+ bch2_time_stats_exit(&ca->io_latency[READ]); -+ -+ percpu_ref_exit(&ca->io_ref); -+ percpu_ref_exit(&ca->ref); -+ kobject_put(&ca->kobj); -+} -+ -+static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca) -+{ -+ -+ lockdep_assert_held(&c->state_lock); -+ -+ if (percpu_ref_is_zero(&ca->io_ref)) -+ return; -+ -+ __bch2_dev_read_only(c, ca); -+ -+ reinit_completion(&ca->io_ref_completion); -+ percpu_ref_kill(&ca->io_ref); -+ wait_for_completion(&ca->io_ref_completion); -+ -+ if (ca->kobj.state_in_sysfs) { -+ struct kobject *block = -+ &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj; -+ -+ sysfs_remove_link(block, "bcachefs"); -+ sysfs_remove_link(&ca->kobj, "block"); -+ } -+ -+ bch2_free_super(&ca->disk_sb); -+ bch2_dev_journal_exit(ca); -+} -+ -+static void bch2_dev_ref_complete(struct percpu_ref *ref) -+{ -+ struct bch_dev *ca = container_of(ref, struct bch_dev, ref); 
-+ -+ complete(&ca->ref_completion); -+} -+ -+static void bch2_dev_io_ref_complete(struct percpu_ref *ref) -+{ -+ struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref); -+ -+ complete(&ca->io_ref_completion); -+} -+ -+static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca) -+{ -+ int ret; -+ -+ if (!c->kobj.state_in_sysfs) -+ return 0; -+ -+ if (!ca->kobj.state_in_sysfs) { -+ ret = kobject_add(&ca->kobj, &c->kobj, -+ "dev-%u", ca->dev_idx); -+ if (ret) -+ return ret; -+ } -+ -+ if (ca->disk_sb.bdev) { -+ struct kobject *block = -+ &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj; -+ -+ ret = sysfs_create_link(block, &ca->kobj, "bcachefs"); -+ if (ret) -+ return ret; -+ ret = sysfs_create_link(&ca->kobj, block, "block"); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, -+ struct bch_member *member) -+{ -+ struct bch_dev *ca; -+ -+ ca = kzalloc(sizeof(*ca), GFP_KERNEL); -+ if (!ca) -+ return NULL; -+ -+ kobject_init(&ca->kobj, &bch2_dev_ktype); -+ init_completion(&ca->ref_completion); -+ init_completion(&ca->io_ref_completion); -+ -+ init_rwsem(&ca->bucket_lock); -+ -+ writepoint_init(&ca->copygc_write_point, BCH_DATA_USER); -+ -+ bch2_dev_copygc_init(ca); -+ -+ INIT_WORK(&ca->io_error_work, bch2_io_error_work); -+ -+ bch2_time_stats_init(&ca->io_latency[READ]); -+ bch2_time_stats_init(&ca->io_latency[WRITE]); -+ -+ ca->mi = bch2_mi_to_cpu(member); -+ ca->uuid = member->uuid; -+ -+ if (opt_defined(c->opts, discard)) -+ ca->mi.discard = opt_get(c->opts, discard); -+ -+ if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, -+ 0, GFP_KERNEL) || -+ percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, -+ PERCPU_REF_INIT_DEAD, GFP_KERNEL) || -+ !(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) || -+ bch2_dev_buckets_alloc(c, ca) || -+ bioset_init(&ca->replica_set, 4, -+ offsetof(struct bch_write_bio, bio), 0) || -+ !(ca->io_done = alloc_percpu(*ca->io_done))) -+ goto err; -+ 
-+ return ca; -+err: -+ bch2_dev_free(ca); -+ return NULL; -+} -+ -+static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca, -+ unsigned dev_idx) -+{ -+ ca->dev_idx = dev_idx; -+ __set_bit(ca->dev_idx, ca->self.d); -+ scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx); -+ -+ ca->fs = c; -+ rcu_assign_pointer(c->devs[ca->dev_idx], ca); -+ -+ if (bch2_dev_sysfs_online(c, ca)) -+ pr_warn("error creating sysfs objects"); -+} -+ -+static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) -+{ -+ struct bch_member *member = -+ bch2_sb_get_members(c->disk_sb.sb)->members + dev_idx; -+ struct bch_dev *ca = NULL; -+ int ret = 0; -+ -+ pr_verbose_init(c->opts, ""); -+ -+ if (bch2_fs_init_fault("dev_alloc")) -+ goto err; -+ -+ ca = __bch2_dev_alloc(c, member); -+ if (!ca) -+ goto err; -+ -+ bch2_dev_attach(c, ca, dev_idx); -+out: -+ pr_verbose_init(c->opts, "ret %i", ret); -+ return ret; -+err: -+ if (ca) -+ bch2_dev_free(ca); -+ ret = -ENOMEM; -+ goto out; -+} -+ -+static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) -+{ -+ unsigned ret; -+ -+ if (bch2_dev_is_online(ca)) { -+ bch_err(ca, "already have device online in slot %u", -+ sb->sb->dev_idx); -+ return -EINVAL; -+ } -+ -+ if (get_capacity(sb->bdev->bd_disk) < -+ ca->mi.bucket_size * ca->mi.nbuckets) { -+ bch_err(ca, "cannot online: device too small"); -+ return -EINVAL; -+ } -+ -+ BUG_ON(!percpu_ref_is_zero(&ca->io_ref)); -+ -+ if (get_capacity(sb->bdev->bd_disk) < -+ ca->mi.bucket_size * ca->mi.nbuckets) { -+ bch_err(ca, "device too small"); -+ return -EINVAL; -+ } -+ -+ ret = bch2_dev_journal_init(ca, sb->sb); -+ if (ret) -+ return ret; -+ -+ /* Commit: */ -+ ca->disk_sb = *sb; -+ if (sb->mode & FMODE_EXCL) -+ ca->disk_sb.bdev->bd_holder = ca; -+ memset(sb, 0, sizeof(*sb)); -+ -+ percpu_ref_reinit(&ca->io_ref); -+ -+ return 0; -+} -+ -+static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) -+{ -+ struct bch_dev *ca; -+ int ret; -+ -+ 
lockdep_assert_held(&c->state_lock); -+ -+ if (le64_to_cpu(sb->sb->seq) > -+ le64_to_cpu(c->disk_sb.sb->seq)) -+ bch2_sb_to_fs(c, sb->sb); -+ -+ BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices || -+ !c->devs[sb->sb->dev_idx]); -+ -+ ca = bch_dev_locked(c, sb->sb->dev_idx); -+ -+ ret = __bch2_dev_attach_bdev(ca, sb); -+ if (ret) -+ return ret; -+ -+ if (test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags) && -+ !percpu_u64_get(&ca->usage[0]->buckets[BCH_DATA_SB])) { -+ mutex_lock(&c->sb_lock); -+ bch2_mark_dev_superblock(ca->fs, ca, 0); -+ mutex_unlock(&c->sb_lock); -+ } -+ -+ bch2_dev_sysfs_online(c, ca); -+ -+ if (c->sb.nr_devices == 1) -+ bdevname(ca->disk_sb.bdev, c->name); -+ bdevname(ca->disk_sb.bdev, ca->name); -+ -+ rebalance_wakeup(c); -+ return 0; -+} -+ -+/* Device management: */ -+ -+/* -+ * Note: this function is also used by the error paths - when a particular -+ * device sees an error, we call it to determine whether we can just set the -+ * device RO, or - if this function returns false - we'll set the whole -+ * filesystem RO: -+ * -+ * XXX: maybe we should be more explicit about whether we're changing state -+ * because we got an error or what have you? -+ */ -+bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, -+ enum bch_member_state new_state, int flags) -+{ -+ struct bch_devs_mask new_online_devs; -+ struct replicas_status s; -+ struct bch_dev *ca2; -+ int i, nr_rw = 0, required; -+ -+ lockdep_assert_held(&c->state_lock); -+ -+ switch (new_state) { -+ case BCH_MEMBER_STATE_RW: -+ return true; -+ case BCH_MEMBER_STATE_RO: -+ if (ca->mi.state != BCH_MEMBER_STATE_RW) -+ return true; -+ -+ /* do we have enough devices to write to? */ -+ for_each_member_device(ca2, c, i) -+ if (ca2 != ca) -+ nr_rw += ca2->mi.state == BCH_MEMBER_STATE_RW; -+ -+ required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED) -+ ? c->opts.metadata_replicas -+ : c->opts.metadata_replicas_required, -+ !(flags & BCH_FORCE_IF_DATA_DEGRADED) -+ ? 
c->opts.data_replicas -+ : c->opts.data_replicas_required); -+ -+ return nr_rw >= required; -+ case BCH_MEMBER_STATE_FAILED: -+ case BCH_MEMBER_STATE_SPARE: -+ if (ca->mi.state != BCH_MEMBER_STATE_RW && -+ ca->mi.state != BCH_MEMBER_STATE_RO) -+ return true; -+ -+ /* do we have enough devices to read from? */ -+ new_online_devs = bch2_online_devs(c); -+ __clear_bit(ca->dev_idx, new_online_devs.d); -+ -+ s = __bch2_replicas_status(c, new_online_devs); -+ -+ return bch2_have_enough_devs(s, flags); -+ default: -+ BUG(); -+ } -+} -+ -+static bool bch2_fs_may_start(struct bch_fs *c) -+{ -+ struct replicas_status s; -+ struct bch_sb_field_members *mi; -+ struct bch_dev *ca; -+ unsigned i, flags = c->opts.degraded -+ ? BCH_FORCE_IF_DEGRADED -+ : 0; -+ -+ if (!c->opts.degraded) { -+ mutex_lock(&c->sb_lock); -+ mi = bch2_sb_get_members(c->disk_sb.sb); -+ -+ for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { -+ if (!bch2_dev_exists(c->disk_sb.sb, mi, i)) -+ continue; -+ -+ ca = bch_dev_locked(c, i); -+ -+ if (!bch2_dev_is_online(ca) && -+ (ca->mi.state == BCH_MEMBER_STATE_RW || -+ ca->mi.state == BCH_MEMBER_STATE_RO)) { -+ mutex_unlock(&c->sb_lock); -+ return false; -+ } -+ } -+ mutex_unlock(&c->sb_lock); -+ } -+ -+ s = bch2_replicas_status(c); -+ -+ return bch2_have_enough_devs(s, flags); -+} -+ -+static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) -+{ -+ bch2_copygc_stop(ca); -+ -+ /* -+ * The allocator thread itself allocates btree nodes, so stop it first: -+ */ -+ bch2_dev_allocator_stop(ca); -+ bch2_dev_allocator_remove(c, ca); -+ bch2_dev_journal_stop(&c->journal, ca); -+} -+ -+static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) -+{ -+ lockdep_assert_held(&c->state_lock); -+ -+ BUG_ON(ca->mi.state != BCH_MEMBER_STATE_RW); -+ -+ bch2_dev_allocator_add(c, ca); -+ bch2_recalc_capacity(c); -+ -+ if (bch2_dev_allocator_start(ca)) -+ return "error starting allocator thread"; -+ -+ if (bch2_copygc_start(c, ca)) -+ return "error 
starting copygc thread"; -+ -+ return NULL; -+} -+ -+int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, -+ enum bch_member_state new_state, int flags) -+{ -+ struct bch_sb_field_members *mi; -+ int ret = 0; -+ -+ if (ca->mi.state == new_state) -+ return 0; -+ -+ if (!bch2_dev_state_allowed(c, ca, new_state, flags)) -+ return -EINVAL; -+ -+ if (new_state != BCH_MEMBER_STATE_RW) -+ __bch2_dev_read_only(c, ca); -+ -+ bch_notice(ca, "%s", bch2_dev_state[new_state]); -+ -+ mutex_lock(&c->sb_lock); -+ mi = bch2_sb_get_members(c->disk_sb.sb); -+ SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx], new_state); -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ if (new_state == BCH_MEMBER_STATE_RW && -+ __bch2_dev_read_write(c, ca)) -+ ret = -ENOMEM; -+ -+ rebalance_wakeup(c); -+ -+ return ret; -+} -+ -+int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, -+ enum bch_member_state new_state, int flags) -+{ -+ int ret; -+ -+ down_write(&c->state_lock); -+ ret = __bch2_dev_set_state(c, ca, new_state, flags); -+ up_write(&c->state_lock); -+ -+ return ret; -+} -+ -+/* Device add/removal: */ -+ -+int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct btree_trans trans; -+ size_t i; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for (i = 0; i < ca->mi.nbuckets; i++) { -+ ret = bch2_btree_key_cache_flush(&trans, -+ BTREE_ID_ALLOC, POS(ca->dev_idx, i)); -+ if (ret) -+ break; -+ } -+ bch2_trans_exit(&trans); -+ -+ if (ret) -+ return ret; -+ -+ return bch2_btree_delete_range(c, BTREE_ID_ALLOC, -+ POS(ca->dev_idx, 0), -+ POS(ca->dev_idx + 1, 0), -+ NULL); -+} -+ -+int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) -+{ -+ struct bch_sb_field_members *mi; -+ unsigned dev_idx = ca->dev_idx, data; -+ int ret = -EINVAL; -+ -+ down_write(&c->state_lock); -+ -+ /* -+ * We consume a reference to ca->ref, regardless of whether we succeed -+ * or fail: -+ */ -+ percpu_ref_put(&ca->ref); -+ -+ if 
(!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) { -+ bch_err(ca, "Cannot remove without losing data"); -+ goto err; -+ } -+ -+ __bch2_dev_read_only(c, ca); -+ -+ ret = bch2_dev_data_drop(c, ca->dev_idx, flags); -+ if (ret) { -+ bch_err(ca, "Remove failed: error %i dropping data", ret); -+ goto err; -+ } -+ -+ ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); -+ if (ret) { -+ bch_err(ca, "Remove failed: error %i flushing journal", ret); -+ goto err; -+ } -+ -+ ret = bch2_dev_remove_alloc(c, ca); -+ if (ret) { -+ bch_err(ca, "Remove failed, error deleting alloc info"); -+ goto err; -+ } -+ -+ /* -+ * must flush all existing journal entries, they might have -+ * (overwritten) keys that point to the device we're removing: -+ */ -+ bch2_journal_flush_all_pins(&c->journal); -+ /* -+ * hack to ensure bch2_replicas_gc2() clears out entries to this device -+ */ -+ bch2_journal_meta(&c->journal); -+ ret = bch2_journal_error(&c->journal); -+ if (ret) { -+ bch_err(ca, "Remove failed, journal error"); -+ goto err; -+ } -+ -+ ret = bch2_replicas_gc2(c); -+ if (ret) { -+ bch_err(ca, "Remove failed: error %i from replicas gc", ret); -+ goto err; -+ } -+ -+ data = bch2_dev_has_data(c, ca); -+ if (data) { -+ char data_has_str[100]; -+ -+ bch2_flags_to_text(&PBUF(data_has_str), -+ bch2_data_types, data); -+ bch_err(ca, "Remove failed, still has data (%s)", data_has_str); -+ ret = -EBUSY; -+ goto err; -+ } -+ -+ __bch2_dev_offline(c, ca); -+ -+ mutex_lock(&c->sb_lock); -+ rcu_assign_pointer(c->devs[ca->dev_idx], NULL); -+ mutex_unlock(&c->sb_lock); -+ -+ percpu_ref_kill(&ca->ref); -+ wait_for_completion(&ca->ref_completion); -+ -+ bch2_dev_free(ca); -+ -+ /* -+ * Free this device's slot in the bch_member array - all pointers to -+ * this device must be gone: -+ */ -+ mutex_lock(&c->sb_lock); -+ mi = bch2_sb_get_members(c->disk_sb.sb); -+ memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid)); -+ -+ bch2_write_super(c); -+ -+ 
mutex_unlock(&c->sb_lock); -+ up_write(&c->state_lock); -+ return 0; -+err: -+ if (ca->mi.state == BCH_MEMBER_STATE_RW && -+ !percpu_ref_is_zero(&ca->io_ref)) -+ __bch2_dev_read_write(c, ca); -+ up_write(&c->state_lock); -+ return ret; -+} -+ -+static void dev_usage_clear(struct bch_dev *ca) -+{ -+ struct bucket_array *buckets; -+ -+ percpu_memset(ca->usage[0], 0, sizeof(*ca->usage[0])); -+ -+ down_read(&ca->bucket_lock); -+ buckets = bucket_array(ca); -+ -+ memset(buckets->b, 0, sizeof(buckets->b[0]) * buckets->nbuckets); -+ up_read(&ca->bucket_lock); -+} -+ -+/* Add new device to running filesystem: */ -+int bch2_dev_add(struct bch_fs *c, const char *path) -+{ -+ struct bch_opts opts = bch2_opts_empty(); -+ struct bch_sb_handle sb; -+ const char *err; -+ struct bch_dev *ca = NULL; -+ struct bch_sb_field_members *mi; -+ struct bch_member dev_mi; -+ unsigned dev_idx, nr_devices, u64s; -+ int ret; -+ -+ ret = bch2_read_super(path, &opts, &sb); -+ if (ret) -+ return ret; -+ -+ err = bch2_sb_validate(&sb); -+ if (err) -+ return -EINVAL; -+ -+ dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx]; -+ -+ err = bch2_dev_may_add(sb.sb, c); -+ if (err) -+ return -EINVAL; -+ -+ ca = __bch2_dev_alloc(c, &dev_mi); -+ if (!ca) { -+ bch2_free_super(&sb); -+ return -ENOMEM; -+ } -+ -+ ret = __bch2_dev_attach_bdev(ca, &sb); -+ if (ret) { -+ bch2_dev_free(ca); -+ return ret; -+ } -+ -+ /* -+ * We want to allocate journal on the new device before adding the new -+ * device to the filesystem because allocating after we attach requires -+ * spinning up the allocator thread, and the allocator thread requires -+ * doing btree writes, which if the existing devices are RO isn't going -+ * to work -+ * -+ * So we have to mark where the superblocks are, but marking allocated -+ * data normally updates the filesystem usage too, so we have to mark, -+ * allocate the journal, reset all the marks, then remark after we -+ * attach... 
-+ */ -+ bch2_mark_dev_superblock(ca->fs, ca, 0); -+ -+ err = "journal alloc failed"; -+ ret = bch2_dev_journal_alloc(ca); -+ if (ret) -+ goto err; -+ -+ dev_usage_clear(ca); -+ -+ down_write(&c->state_lock); -+ mutex_lock(&c->sb_lock); -+ -+ err = "insufficient space in new superblock"; -+ ret = bch2_sb_from_fs(c, ca); -+ if (ret) -+ goto err_unlock; -+ -+ mi = bch2_sb_get_members(ca->disk_sb.sb); -+ -+ if (!bch2_sb_resize_members(&ca->disk_sb, -+ le32_to_cpu(mi->field.u64s) + -+ sizeof(dev_mi) / sizeof(u64))) { -+ ret = -ENOSPC; -+ goto err_unlock; -+ } -+ -+ if (dynamic_fault("bcachefs:add:no_slot")) -+ goto no_slot; -+ -+ mi = bch2_sb_get_members(c->disk_sb.sb); -+ for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) -+ if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx)) -+ goto have_slot; -+no_slot: -+ err = "no slots available in superblock"; -+ ret = -ENOSPC; -+ goto err_unlock; -+ -+have_slot: -+ nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices); -+ u64s = (sizeof(struct bch_sb_field_members) + -+ sizeof(struct bch_member) * nr_devices) / sizeof(u64); -+ -+ err = "no space in superblock for member info"; -+ ret = -ENOSPC; -+ -+ mi = bch2_sb_resize_members(&c->disk_sb, u64s); -+ if (!mi) -+ goto err_unlock; -+ -+ /* success: */ -+ -+ mi->members[dev_idx] = dev_mi; -+ mi->members[dev_idx].last_mount = cpu_to_le64(ktime_get_real_seconds()); -+ c->disk_sb.sb->nr_devices = nr_devices; -+ -+ ca->disk_sb.sb->dev_idx = dev_idx; -+ bch2_dev_attach(c, ca, dev_idx); -+ -+ bch2_mark_dev_superblock(c, ca, 0); -+ -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ if (ca->mi.state == BCH_MEMBER_STATE_RW) { -+ err = __bch2_dev_read_write(c, ca); -+ if (err) -+ goto err_late; -+ } -+ -+ up_write(&c->state_lock); -+ return 0; -+ -+err_unlock: -+ mutex_unlock(&c->sb_lock); -+ up_write(&c->state_lock); -+err: -+ if (ca) -+ bch2_dev_free(ca); -+ bch2_free_super(&sb); -+ bch_err(c, "Unable to add device: %s", err); -+ return ret; -+err_late: -+ bch_err(c, 
"Error going rw after adding device: %s", err); -+ return -EINVAL; -+} -+ -+/* Hot add existing device to running filesystem: */ -+int bch2_dev_online(struct bch_fs *c, const char *path) -+{ -+ struct bch_opts opts = bch2_opts_empty(); -+ struct bch_sb_handle sb = { NULL }; -+ struct bch_sb_field_members *mi; -+ struct bch_dev *ca; -+ unsigned dev_idx; -+ const char *err; -+ int ret; -+ -+ down_write(&c->state_lock); -+ -+ ret = bch2_read_super(path, &opts, &sb); -+ if (ret) { -+ up_write(&c->state_lock); -+ return ret; -+ } -+ -+ dev_idx = sb.sb->dev_idx; -+ -+ err = bch2_dev_in_fs(c->disk_sb.sb, sb.sb); -+ if (err) -+ goto err; -+ -+ if (bch2_dev_attach_bdev(c, &sb)) { -+ err = "bch2_dev_attach_bdev() error"; -+ goto err; -+ } -+ -+ ca = bch_dev_locked(c, dev_idx); -+ if (ca->mi.state == BCH_MEMBER_STATE_RW) { -+ err = __bch2_dev_read_write(c, ca); -+ if (err) -+ goto err; -+ } -+ -+ mutex_lock(&c->sb_lock); -+ mi = bch2_sb_get_members(c->disk_sb.sb); -+ -+ mi->members[ca->dev_idx].last_mount = -+ cpu_to_le64(ktime_get_real_seconds()); -+ -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ up_write(&c->state_lock); -+ return 0; -+err: -+ up_write(&c->state_lock); -+ bch2_free_super(&sb); -+ bch_err(c, "error bringing %s online: %s", path, err); -+ return -EINVAL; -+} -+ -+int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) -+{ -+ down_write(&c->state_lock); -+ -+ if (!bch2_dev_is_online(ca)) { -+ bch_err(ca, "Already offline"); -+ up_write(&c->state_lock); -+ return 0; -+ } -+ -+ if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) { -+ bch_err(ca, "Cannot offline required disk"); -+ up_write(&c->state_lock); -+ return -EINVAL; -+ } -+ -+ __bch2_dev_offline(c, ca); -+ -+ up_write(&c->state_lock); -+ return 0; -+} -+ -+int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) -+{ -+ struct bch_member *mi; -+ int ret = 0; -+ -+ down_write(&c->state_lock); -+ -+ if (nbuckets < ca->mi.nbuckets) { -+ bch_err(ca, 
"Cannot shrink yet"); -+ ret = -EINVAL; -+ goto err; -+ } -+ -+ if (bch2_dev_is_online(ca) && -+ get_capacity(ca->disk_sb.bdev->bd_disk) < -+ ca->mi.bucket_size * nbuckets) { -+ bch_err(ca, "New size larger than device"); -+ ret = -EINVAL; -+ goto err; -+ } -+ -+ ret = bch2_dev_buckets_resize(c, ca, nbuckets); -+ if (ret) { -+ bch_err(ca, "Resize error: %i", ret); -+ goto err; -+ } -+ -+ mutex_lock(&c->sb_lock); -+ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; -+ mi->nbuckets = cpu_to_le64(nbuckets); -+ -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ bch2_recalc_capacity(c); -+err: -+ up_write(&c->state_lock); -+ return ret; -+} -+ -+/* return with ref on ca->ref: */ -+struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *path) -+{ -+ -+ struct block_device *bdev = lookup_bdev(path); -+ struct bch_dev *ca; -+ unsigned i; -+ -+ if (IS_ERR(bdev)) -+ return ERR_CAST(bdev); -+ -+ for_each_member_device(ca, c, i) -+ if (ca->disk_sb.bdev == bdev) -+ goto found; -+ -+ ca = ERR_PTR(-ENOENT); -+found: -+ bdput(bdev); -+ return ca; -+} -+ -+/* Filesystem open: */ -+ -+struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, -+ struct bch_opts opts) -+{ -+ struct bch_sb_handle *sb = NULL; -+ struct bch_fs *c = NULL; -+ unsigned i, best_sb = 0; -+ const char *err; -+ int ret = -ENOMEM; -+ -+ pr_verbose_init(opts, ""); -+ -+ if (!nr_devices) { -+ c = ERR_PTR(-EINVAL); -+ goto out2; -+ } -+ -+ if (!try_module_get(THIS_MODULE)) { -+ c = ERR_PTR(-ENODEV); -+ goto out2; -+ } -+ -+ sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL); -+ if (!sb) -+ goto err; -+ -+ for (i = 0; i < nr_devices; i++) { -+ ret = bch2_read_super(devices[i], &opts, &sb[i]); -+ if (ret) -+ goto err; -+ -+ err = bch2_sb_validate(&sb[i]); -+ if (err) -+ goto err_print; -+ } -+ -+ for (i = 1; i < nr_devices; i++) -+ if (le64_to_cpu(sb[i].sb->seq) > -+ le64_to_cpu(sb[best_sb].sb->seq)) -+ best_sb = i; -+ -+ for (i = 0; i < nr_devices; i++) { -+ err = 
bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb); -+ if (err) -+ goto err_print; -+ } -+ -+ ret = -ENOMEM; -+ c = bch2_fs_alloc(sb[best_sb].sb, opts); -+ if (!c) -+ goto err; -+ -+ err = "bch2_dev_online() error"; -+ down_write(&c->state_lock); -+ for (i = 0; i < nr_devices; i++) -+ if (bch2_dev_attach_bdev(c, &sb[i])) { -+ up_write(&c->state_lock); -+ goto err_print; -+ } -+ up_write(&c->state_lock); -+ -+ err = "insufficient devices"; -+ if (!bch2_fs_may_start(c)) -+ goto err_print; -+ -+ if (!c->opts.nostart) { -+ ret = bch2_fs_start(c); -+ if (ret) -+ goto err; -+ } -+out: -+ kfree(sb); -+ module_put(THIS_MODULE); -+out2: -+ pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c)); -+ return c; -+err_print: -+ pr_err("bch_fs_open err opening %s: %s", -+ devices[0], err); -+ ret = -EINVAL; -+err: -+ if (c) -+ bch2_fs_stop(c); -+ for (i = 0; i < nr_devices; i++) -+ bch2_free_super(&sb[i]); -+ c = ERR_PTR(ret); -+ goto out; -+} -+ -+static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb, -+ struct bch_opts opts) -+{ -+ const char *err; -+ struct bch_fs *c; -+ bool allocated_fs = false; -+ int ret; -+ -+ err = bch2_sb_validate(sb); -+ if (err) -+ return err; -+ -+ mutex_lock(&bch_fs_list_lock); -+ c = __bch2_uuid_to_fs(sb->sb->uuid); -+ if (c) { -+ closure_get(&c->cl); -+ -+ err = bch2_dev_in_fs(c->disk_sb.sb, sb->sb); -+ if (err) -+ goto err; -+ } else { -+ c = bch2_fs_alloc(sb->sb, opts); -+ err = "cannot allocate memory"; -+ if (!c) -+ goto err; -+ -+ allocated_fs = true; -+ } -+ -+ err = "bch2_dev_online() error"; -+ -+ mutex_lock(&c->sb_lock); -+ if (bch2_dev_attach_bdev(c, sb)) { -+ mutex_unlock(&c->sb_lock); -+ goto err; -+ } -+ mutex_unlock(&c->sb_lock); -+ -+ if (!c->opts.nostart && bch2_fs_may_start(c)) { -+ err = "error starting filesystem"; -+ ret = bch2_fs_start(c); -+ if (ret) -+ goto err; -+ } -+ -+ closure_put(&c->cl); -+ mutex_unlock(&bch_fs_list_lock); -+ -+ return NULL; -+err: -+ mutex_unlock(&bch_fs_list_lock); -+ -+ if (allocated_fs) -+ 
bch2_fs_stop(c); -+ else if (c) -+ closure_put(&c->cl); -+ -+ return err; -+} -+ -+const char *bch2_fs_open_incremental(const char *path) -+{ -+ struct bch_sb_handle sb; -+ struct bch_opts opts = bch2_opts_empty(); -+ const char *err; -+ -+ if (bch2_read_super(path, &opts, &sb)) -+ return "error reading superblock"; -+ -+ err = __bch2_fs_open_incremental(&sb, opts); -+ bch2_free_super(&sb); -+ -+ return err; -+} -+ -+/* Global interfaces/init */ -+ -+static void bcachefs_exit(void) -+{ -+ bch2_debug_exit(); -+ bch2_vfs_exit(); -+ bch2_chardev_exit(); -+ if (bcachefs_kset) -+ kset_unregister(bcachefs_kset); -+} -+ -+static int __init bcachefs_init(void) -+{ -+ bch2_bkey_pack_test(); -+ bch2_inode_pack_test(); -+ -+ if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) || -+ bch2_chardev_init() || -+ bch2_vfs_init() || -+ bch2_debug_init()) -+ goto err; -+ -+ return 0; -+err: -+ bcachefs_exit(); -+ return -ENOMEM; -+} -+ -+#define BCH_DEBUG_PARAM(name, description) \ -+ bool bch2_##name; \ -+ module_param_named(name, bch2_##name, bool, 0644); \ -+ MODULE_PARM_DESC(name, description); -+BCH_DEBUG_PARAMS() -+#undef BCH_DEBUG_PARAM -+ -+module_exit(bcachefs_exit); -+module_init(bcachefs_init); -diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h -new file mode 100644 -index 000000000000..4aa5dd7917cf ---- /dev/null -+++ b/fs/bcachefs/super.h -@@ -0,0 +1,231 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_SUPER_H -+#define _BCACHEFS_SUPER_H -+ -+#include "extents.h" -+ -+#include "bcachefs_ioctl.h" -+ -+#include -+ -+static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s) -+{ -+ return div_u64(s, ca->mi.bucket_size); -+} -+ -+static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b) -+{ -+ return ((sector_t) b) * ca->mi.bucket_size; -+} -+ -+static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s) -+{ -+ u32 remainder; -+ -+ div_u64_rem(s, ca->mi.bucket_size, &remainder); -+ 
return remainder; -+} -+ -+static inline bool bch2_dev_is_online(struct bch_dev *ca) -+{ -+ return !percpu_ref_is_zero(&ca->io_ref); -+} -+ -+static inline bool bch2_dev_is_readable(struct bch_dev *ca) -+{ -+ return bch2_dev_is_online(ca) && -+ ca->mi.state != BCH_MEMBER_STATE_FAILED; -+} -+ -+static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw) -+{ -+ if (!percpu_ref_tryget(&ca->io_ref)) -+ return false; -+ -+ if (ca->mi.state == BCH_MEMBER_STATE_RW || -+ (ca->mi.state == BCH_MEMBER_STATE_RO && rw == READ)) -+ return true; -+ -+ percpu_ref_put(&ca->io_ref); -+ return false; -+} -+ -+static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs) -+{ -+ return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX); -+} -+ -+static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs, -+ unsigned dev) -+{ -+ unsigned i; -+ -+ for (i = 0; i < devs.nr; i++) -+ if (devs.devs[i] == dev) -+ return true; -+ -+ return false; -+} -+ -+static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs, -+ unsigned dev) -+{ -+ unsigned i; -+ -+ for (i = 0; i < devs->nr; i++) -+ if (devs->devs[i] == dev) { -+ array_remove_item(devs->devs, devs->nr, i); -+ return; -+ } -+} -+ -+static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs, -+ unsigned dev) -+{ -+ BUG_ON(bch2_dev_list_has_dev(*devs, dev)); -+ BUG_ON(devs->nr >= BCH_REPLICAS_MAX); -+ devs->devs[devs->nr++] = dev; -+} -+ -+static inline struct bch_devs_list bch2_dev_list_single(unsigned dev) -+{ -+ return (struct bch_devs_list) { .nr = 1, .devs[0] = dev }; -+} -+ -+static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter, -+ const struct bch_devs_mask *mask) -+{ -+ struct bch_dev *ca = NULL; -+ -+ while ((*iter = mask -+ ? 
find_next_bit(mask->d, c->sb.nr_devices, *iter) -+ : *iter) < c->sb.nr_devices && -+ !(ca = rcu_dereference_check(c->devs[*iter], -+ lockdep_is_held(&c->state_lock)))) -+ (*iter)++; -+ -+ return ca; -+} -+ -+#define __for_each_member_device(ca, c, iter, mask) \ -+ for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++) -+ -+#define for_each_member_device_rcu(ca, c, iter, mask) \ -+ __for_each_member_device(ca, c, iter, mask) -+ -+static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter) -+{ -+ struct bch_dev *ca; -+ -+ rcu_read_lock(); -+ if ((ca = __bch2_next_dev(c, iter, NULL))) -+ percpu_ref_get(&ca->ref); -+ rcu_read_unlock(); -+ -+ return ca; -+} -+ -+/* -+ * If you break early, you must drop your ref on the current device -+ */ -+#define for_each_member_device(ca, c, iter) \ -+ for ((iter) = 0; \ -+ (ca = bch2_get_next_dev(c, &(iter))); \ -+ percpu_ref_put(&ca->ref), (iter)++) -+ -+static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, -+ unsigned *iter, -+ int state_mask) -+{ -+ struct bch_dev *ca; -+ -+ rcu_read_lock(); -+ while ((ca = __bch2_next_dev(c, iter, NULL)) && -+ (!((1 << ca->mi.state) & state_mask) || -+ !percpu_ref_tryget(&ca->io_ref))) -+ (*iter)++; -+ rcu_read_unlock(); -+ -+ return ca; -+} -+ -+#define __for_each_online_member(ca, c, iter, state_mask) \ -+ for ((iter) = 0; \ -+ (ca = bch2_get_next_online_dev(c, &(iter), state_mask)); \ -+ percpu_ref_put(&ca->io_ref), (iter)++) -+ -+#define for_each_online_member(ca, c, iter) \ -+ __for_each_online_member(ca, c, iter, ~0) -+ -+#define for_each_rw_member(ca, c, iter) \ -+ __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_RW) -+ -+#define for_each_readable_member(ca, c, iter) \ -+ __for_each_online_member(ca, c, iter, \ -+ (1 << BCH_MEMBER_STATE_RW)|(1 << BCH_MEMBER_STATE_RO)) -+ -+/* -+ * If a key exists that references a device, the device won't be going away and -+ * we can omit rcu_read_lock(): -+ */ -+static inline struct 
bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx) -+{ -+ EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); -+ -+ return rcu_dereference_check(c->devs[idx], 1); -+} -+ -+static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx) -+{ -+ EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); -+ -+ return rcu_dereference_protected(c->devs[idx], -+ lockdep_is_held(&c->sb_lock) || -+ lockdep_is_held(&c->state_lock)); -+} -+ -+/* XXX kill, move to struct bch_fs */ -+static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) -+{ -+ struct bch_devs_mask devs; -+ struct bch_dev *ca; -+ unsigned i; -+ -+ memset(&devs, 0, sizeof(devs)); -+ for_each_online_member(ca, c, i) -+ __set_bit(ca->dev_idx, devs.d); -+ return devs; -+} -+ -+struct bch_fs *bch2_bdev_to_fs(struct block_device *); -+struct bch_fs *bch2_uuid_to_fs(uuid_le); -+int bch2_congested(void *, int); -+ -+bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *, -+ enum bch_member_state, int); -+int __bch2_dev_set_state(struct bch_fs *, struct bch_dev *, -+ enum bch_member_state, int); -+int bch2_dev_set_state(struct bch_fs *, struct bch_dev *, -+ enum bch_member_state, int); -+ -+int bch2_dev_fail(struct bch_dev *, int); -+int bch2_dev_remove(struct bch_fs *, struct bch_dev *, int); -+int bch2_dev_add(struct bch_fs *, const char *); -+int bch2_dev_online(struct bch_fs *, const char *); -+int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int); -+int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64); -+struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *); -+ -+bool bch2_fs_emergency_read_only(struct bch_fs *); -+void bch2_fs_read_only(struct bch_fs *); -+ -+int bch2_fs_read_write(struct bch_fs *); -+int bch2_fs_read_write_early(struct bch_fs *); -+ -+void bch2_fs_stop(struct bch_fs *); -+ -+int bch2_fs_start(struct bch_fs *); -+struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts); -+const char *bch2_fs_open_incremental(const char 
*path); -+ -+#endif /* _BCACHEFS_SUPER_H */ -diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h -new file mode 100644 -index 000000000000..20406ebd6f5b ---- /dev/null -+++ b/fs/bcachefs/super_types.h -@@ -0,0 +1,51 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_SUPER_TYPES_H -+#define _BCACHEFS_SUPER_TYPES_H -+ -+struct bch_sb_handle { -+ struct bch_sb *sb; -+ struct block_device *bdev; -+ struct bio *bio; -+ unsigned page_order; -+ fmode_t mode; -+ unsigned have_layout:1; -+ unsigned have_bio:1; -+ unsigned fs_sb:1; -+ u64 seq; -+}; -+ -+struct bch_devs_mask { -+ unsigned long d[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)]; -+}; -+ -+struct bch_devs_list { -+ u8 nr; -+ u8 devs[BCH_REPLICAS_MAX + 1]; -+}; -+ -+struct bch_member_cpu { -+ u64 nbuckets; /* device size */ -+ u16 first_bucket; /* index of first bucket used */ -+ u16 bucket_size; /* sectors */ -+ u16 group; -+ u8 state; -+ u8 replacement; -+ u8 discard; -+ u8 data_allowed; -+ u8 durability; -+ u8 valid; -+}; -+ -+struct bch_disk_group_cpu { -+ bool deleted; -+ u16 parent; -+ struct bch_devs_mask devs; -+}; -+ -+struct bch_disk_groups_cpu { -+ struct rcu_head rcu; -+ unsigned nr; -+ struct bch_disk_group_cpu entries[]; -+}; -+ -+#endif /* _BCACHEFS_SUPER_TYPES_H */ -diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c -new file mode 100644 -index 000000000000..c169d282a1f9 ---- /dev/null -+++ b/fs/bcachefs/sysfs.c -@@ -0,0 +1,1091 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * bcache sysfs interfaces -+ * -+ * Copyright 2010, 2011 Kent Overstreet -+ * Copyright 2012 Google, Inc. 
-+ */ -+ -+#ifndef NO_BCACHEFS_SYSFS -+ -+#include "bcachefs.h" -+#include "alloc_background.h" -+#include "sysfs.h" -+#include "btree_cache.h" -+#include "btree_io.h" -+#include "btree_iter.h" -+#include "btree_key_cache.h" -+#include "btree_update.h" -+#include "btree_update_interior.h" -+#include "btree_gc.h" -+#include "buckets.h" -+#include "clock.h" -+#include "disk_groups.h" -+#include "ec.h" -+#include "inode.h" -+#include "journal.h" -+#include "keylist.h" -+#include "move.h" -+#include "opts.h" -+#include "rebalance.h" -+#include "replicas.h" -+#include "super-io.h" -+#include "tests.h" -+ -+#include -+#include -+#include -+ -+#include "util.h" -+ -+#define SYSFS_OPS(type) \ -+struct sysfs_ops type ## _sysfs_ops = { \ -+ .show = type ## _show, \ -+ .store = type ## _store \ -+} -+ -+#define SHOW(fn) \ -+static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\ -+ char *buf) \ -+ -+#define STORE(fn) \ -+static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\ -+ const char *buf, size_t size) \ -+ -+#define __sysfs_attribute(_name, _mode) \ -+ static struct attribute sysfs_##_name = \ -+ { .name = #_name, .mode = _mode } -+ -+#define write_attribute(n) __sysfs_attribute(n, S_IWUSR) -+#define read_attribute(n) __sysfs_attribute(n, S_IRUGO) -+#define rw_attribute(n) __sysfs_attribute(n, S_IRUGO|S_IWUSR) -+ -+#define sysfs_printf(file, fmt, ...) 
\ -+do { \ -+ if (attr == &sysfs_ ## file) \ -+ return scnprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__);\ -+} while (0) -+ -+#define sysfs_print(file, var) \ -+do { \ -+ if (attr == &sysfs_ ## file) \ -+ return snprint(buf, PAGE_SIZE, var); \ -+} while (0) -+ -+#define sysfs_hprint(file, val) \ -+do { \ -+ if (attr == &sysfs_ ## file) { \ -+ struct printbuf out = _PBUF(buf, PAGE_SIZE); \ -+ bch2_hprint(&out, val); \ -+ pr_buf(&out, "\n"); \ -+ return out.pos - buf; \ -+ } \ -+} while (0) -+ -+#define var_printf(_var, fmt) sysfs_printf(_var, fmt, var(_var)) -+#define var_print(_var) sysfs_print(_var, var(_var)) -+#define var_hprint(_var) sysfs_hprint(_var, var(_var)) -+ -+#define sysfs_strtoul(file, var) \ -+do { \ -+ if (attr == &sysfs_ ## file) \ -+ return strtoul_safe(buf, var) ?: (ssize_t) size; \ -+} while (0) -+ -+#define sysfs_strtoul_clamp(file, var, min, max) \ -+do { \ -+ if (attr == &sysfs_ ## file) \ -+ return strtoul_safe_clamp(buf, var, min, max) \ -+ ?: (ssize_t) size; \ -+} while (0) -+ -+#define strtoul_or_return(cp) \ -+({ \ -+ unsigned long _v; \ -+ int _r = kstrtoul(cp, 10, &_v); \ -+ if (_r) \ -+ return _r; \ -+ _v; \ -+}) -+ -+#define strtoul_restrict_or_return(cp, min, max) \ -+({ \ -+ unsigned long __v = 0; \ -+ int _r = strtoul_safe_restrict(cp, __v, min, max); \ -+ if (_r) \ -+ return _r; \ -+ __v; \ -+}) -+ -+#define strtoi_h_or_return(cp) \ -+({ \ -+ u64 _v; \ -+ int _r = strtoi_h(cp, &_v); \ -+ if (_r) \ -+ return _r; \ -+ _v; \ -+}) -+ -+#define sysfs_hatoi(file, var) \ -+do { \ -+ if (attr == &sysfs_ ## file) \ -+ return strtoi_h(buf, &var) ?: (ssize_t) size; \ -+} while (0) -+ -+write_attribute(trigger_journal_flush); -+write_attribute(trigger_btree_coalesce); -+write_attribute(trigger_gc); -+write_attribute(prune_cache); -+rw_attribute(btree_gc_periodic); -+ -+read_attribute(uuid); -+read_attribute(minor); -+read_attribute(bucket_size); -+read_attribute(block_size); -+read_attribute(btree_node_size); -+read_attribute(first_bucket); 
-+read_attribute(nbuckets); -+read_attribute(durability); -+read_attribute(iodone); -+ -+read_attribute(io_latency_read); -+read_attribute(io_latency_write); -+read_attribute(io_latency_stats_read); -+read_attribute(io_latency_stats_write); -+read_attribute(congested); -+ -+read_attribute(bucket_quantiles_last_read); -+read_attribute(bucket_quantiles_last_write); -+read_attribute(bucket_quantiles_fragmentation); -+read_attribute(bucket_quantiles_oldest_gen); -+ -+read_attribute(reserve_stats); -+read_attribute(btree_cache_size); -+read_attribute(compression_stats); -+read_attribute(journal_debug); -+read_attribute(journal_pins); -+read_attribute(btree_updates); -+read_attribute(dirty_btree_nodes); -+read_attribute(btree_key_cache); -+read_attribute(btree_transactions); -+ -+read_attribute(internal_uuid); -+ -+read_attribute(has_data); -+read_attribute(alloc_debug); -+write_attribute(wake_allocator); -+ -+read_attribute(read_realloc_races); -+read_attribute(extent_migrate_done); -+read_attribute(extent_migrate_raced); -+ -+rw_attribute(journal_write_delay_ms); -+rw_attribute(journal_reclaim_delay_ms); -+ -+rw_attribute(discard); -+rw_attribute(cache_replacement_policy); -+rw_attribute(label); -+ -+rw_attribute(copy_gc_enabled); -+sysfs_pd_controller_attribute(copy_gc); -+ -+rw_attribute(rebalance_enabled); -+sysfs_pd_controller_attribute(rebalance); -+read_attribute(rebalance_work); -+rw_attribute(promote_whole_extents); -+ -+read_attribute(new_stripes); -+ -+rw_attribute(pd_controllers_update_seconds); -+ -+read_attribute(meta_replicas_have); -+read_attribute(data_replicas_have); -+ -+read_attribute(io_timers_read); -+read_attribute(io_timers_write); -+ -+#ifdef CONFIG_BCACHEFS_TESTS -+write_attribute(perf_test); -+#endif /* CONFIG_BCACHEFS_TESTS */ -+ -+#define BCH_DEBUG_PARAM(name, description) \ -+ rw_attribute(name); -+ -+ BCH_DEBUG_PARAMS() -+#undef BCH_DEBUG_PARAM -+ -+#define x(_name) \ -+ static struct attribute sysfs_time_stat_##_name = \ -+ { .name = 
#_name, .mode = S_IRUGO }; -+ BCH_TIME_STATS() -+#undef x -+ -+static struct attribute sysfs_state_rw = { -+ .name = "state", -+ .mode = S_IRUGO -+}; -+ -+static size_t bch2_btree_cache_size(struct bch_fs *c) -+{ -+ size_t ret = 0; -+ struct btree *b; -+ -+ mutex_lock(&c->btree_cache.lock); -+ list_for_each_entry(b, &c->btree_cache.live, list) -+ ret += btree_bytes(c); -+ -+ mutex_unlock(&c->btree_cache.lock); -+ return ret; -+} -+ -+static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf) -+{ -+ struct printbuf out = _PBUF(buf, PAGE_SIZE); -+ struct bch_fs_usage *fs_usage = bch2_fs_usage_read(c); -+ -+ if (!fs_usage) -+ return -ENOMEM; -+ -+ bch2_fs_usage_to_text(&out, c, fs_usage); -+ -+ percpu_up_read(&c->mark_lock); -+ -+ kfree(fs_usage); -+ -+ return out.pos - buf; -+} -+ -+static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ u64 nr_uncompressed_extents = 0, uncompressed_sectors = 0, -+ nr_compressed_extents = 0, -+ compressed_sectors_compressed = 0, -+ compressed_sectors_uncompressed = 0; -+ int ret; -+ -+ if (!test_bit(BCH_FS_STARTED, &c->flags)) -+ return -EPERM; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, 0, k, ret) -+ if (k.k->type == KEY_TYPE_extent) { -+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ -+ extent_for_each_ptr_decode(e, p, entry) { -+ if (!crc_is_compressed(p.crc)) { -+ nr_uncompressed_extents++; -+ uncompressed_sectors += e.k->size; -+ } else { -+ nr_compressed_extents++; -+ compressed_sectors_compressed += -+ p.crc.compressed_size; -+ compressed_sectors_uncompressed += -+ p.crc.uncompressed_size; -+ } -+ -+ /* only looking at the first ptr */ -+ break; -+ } -+ } -+ -+ ret = bch2_trans_exit(&trans) ?: ret; -+ if (ret) -+ return ret; -+ -+ return scnprintf(buf, PAGE_SIZE, -+ "uncompressed data:\n" 
-+ " nr extents: %llu\n" -+ " size (bytes): %llu\n" -+ "compressed data:\n" -+ " nr extents: %llu\n" -+ " compressed size (bytes): %llu\n" -+ " uncompressed size (bytes): %llu\n", -+ nr_uncompressed_extents, -+ uncompressed_sectors << 9, -+ nr_compressed_extents, -+ compressed_sectors_compressed << 9, -+ compressed_sectors_uncompressed << 9); -+} -+ -+static ssize_t bch2_new_stripes(struct bch_fs *c, char *buf) -+{ -+ char *out = buf, *end = buf + PAGE_SIZE; -+ struct ec_stripe_head *h; -+ struct ec_stripe_new *s; -+ -+ mutex_lock(&c->ec_new_stripe_lock); -+ list_for_each_entry(h, &c->ec_new_stripe_list, list) { -+ out += scnprintf(out, end - out, -+ "target %u algo %u redundancy %u:\n", -+ h->target, h->algo, h->redundancy); -+ -+ if (h->s) -+ out += scnprintf(out, end - out, -+ "\tpending: blocks %u allocated %u\n", -+ h->s->blocks.nr, -+ bitmap_weight(h->s->blocks_allocated, -+ h->s->blocks.nr)); -+ -+ mutex_lock(&h->lock); -+ list_for_each_entry(s, &h->stripes, list) -+ out += scnprintf(out, end - out, -+ "\tin flight: blocks %u allocated %u pin %u\n", -+ s->blocks.nr, -+ bitmap_weight(s->blocks_allocated, -+ s->blocks.nr), -+ atomic_read(&s->pin)); -+ mutex_unlock(&h->lock); -+ -+ } -+ mutex_unlock(&c->ec_new_stripe_lock); -+ -+ return out - buf; -+} -+ -+SHOW(bch2_fs) -+{ -+ struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); -+ -+ sysfs_print(minor, c->minor); -+ sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); -+ -+ sysfs_print(journal_write_delay_ms, c->journal.write_delay_ms); -+ sysfs_print(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms); -+ -+ sysfs_print(block_size, block_bytes(c)); -+ sysfs_print(btree_node_size, btree_bytes(c)); -+ sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c)); -+ -+ sysfs_print(read_realloc_races, -+ atomic_long_read(&c->read_realloc_races)); -+ sysfs_print(extent_migrate_done, -+ atomic_long_read(&c->extent_migrate_done)); -+ sysfs_print(extent_migrate_raced, -+ 
atomic_long_read(&c->extent_migrate_raced)); -+ -+ sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic); -+ -+ sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); -+ -+ sysfs_print(pd_controllers_update_seconds, -+ c->pd_controllers_update_seconds); -+ -+ sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled); -+ sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */ -+ -+ if (attr == &sysfs_rebalance_work) -+ return bch2_rebalance_work_show(c, buf); -+ -+ sysfs_print(promote_whole_extents, c->promote_whole_extents); -+ -+ sysfs_printf(meta_replicas_have, "%i", bch2_replicas_online(c, true)); -+ sysfs_printf(data_replicas_have, "%i", bch2_replicas_online(c, false)); -+ -+ /* Debugging: */ -+ -+ if (attr == &sysfs_alloc_debug) -+ return show_fs_alloc_debug(c, buf); -+ -+ if (attr == &sysfs_journal_debug) -+ return bch2_journal_print_debug(&c->journal, buf); -+ -+ if (attr == &sysfs_journal_pins) -+ return bch2_journal_print_pins(&c->journal, buf); -+ -+ if (attr == &sysfs_btree_updates) -+ return bch2_btree_updates_print(c, buf); -+ -+ if (attr == &sysfs_dirty_btree_nodes) -+ return bch2_dirty_btree_nodes_print(c, buf); -+ -+ if (attr == &sysfs_btree_key_cache) { -+ struct printbuf out = _PBUF(buf, PAGE_SIZE); -+ -+ bch2_btree_key_cache_to_text(&out, &c->btree_key_cache); -+ return out.pos - buf; -+ } -+ -+ if (attr == &sysfs_btree_transactions) { -+ struct printbuf out = _PBUF(buf, PAGE_SIZE); -+ -+ bch2_btree_trans_to_text(&out, c); -+ return out.pos - buf; -+ } -+ -+ if (attr == &sysfs_compression_stats) -+ return bch2_compression_stats(c, buf); -+ -+ if (attr == &sysfs_new_stripes) -+ return bch2_new_stripes(c, buf); -+ -+ if (attr == &sysfs_io_timers_read) -+ return bch2_io_timers_show(&c->io_clock[READ], buf); -+ if (attr == &sysfs_io_timers_write) -+ return bch2_io_timers_show(&c->io_clock[WRITE], buf); -+ -+#define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name); -+ BCH_DEBUG_PARAMS() -+#undef BCH_DEBUG_PARAM -+ 
-+ return 0; -+} -+ -+STORE(bch2_fs) -+{ -+ struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); -+ -+ sysfs_strtoul(journal_write_delay_ms, c->journal.write_delay_ms); -+ sysfs_strtoul(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms); -+ -+ if (attr == &sysfs_btree_gc_periodic) { -+ ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic) -+ ?: (ssize_t) size; -+ -+ wake_up_process(c->gc_thread); -+ return ret; -+ } -+ -+ if (attr == &sysfs_copy_gc_enabled) { -+ struct bch_dev *ca; -+ unsigned i; -+ ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled) -+ ?: (ssize_t) size; -+ -+ for_each_member_device(ca, c, i) -+ if (ca->copygc_thread) -+ wake_up_process(ca->copygc_thread); -+ return ret; -+ } -+ -+ if (attr == &sysfs_rebalance_enabled) { -+ ssize_t ret = strtoul_safe(buf, c->rebalance.enabled) -+ ?: (ssize_t) size; -+ -+ rebalance_wakeup(c); -+ return ret; -+ } -+ -+ sysfs_strtoul(pd_controllers_update_seconds, -+ c->pd_controllers_update_seconds); -+ sysfs_pd_controller_store(rebalance, &c->rebalance.pd); -+ -+ sysfs_strtoul(promote_whole_extents, c->promote_whole_extents); -+ -+ /* Debugging: */ -+ -+#define BCH_DEBUG_PARAM(name, description) sysfs_strtoul(name, c->name); -+ BCH_DEBUG_PARAMS() -+#undef BCH_DEBUG_PARAM -+ -+ if (!test_bit(BCH_FS_STARTED, &c->flags)) -+ return -EPERM; -+ -+ /* Debugging: */ -+ -+ if (attr == &sysfs_trigger_journal_flush) -+ bch2_journal_meta_async(&c->journal, NULL); -+ -+ if (attr == &sysfs_trigger_btree_coalesce) -+ bch2_coalesce(c); -+ -+ if (attr == &sysfs_trigger_gc) { -+ /* -+ * Full gc is currently incompatible with btree key cache: -+ */ -+#if 0 -+ down_read(&c->state_lock); -+ bch2_gc(c, NULL, false, false); -+ up_read(&c->state_lock); -+#else -+ bch2_gc_gens(c); -+#endif -+ } -+ -+ if (attr == &sysfs_prune_cache) { -+ struct shrink_control sc; -+ -+ sc.gfp_mask = GFP_KERNEL; -+ sc.nr_to_scan = strtoul_or_return(buf); -+ c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc); -+ } -+ -+#ifdef 
CONFIG_BCACHEFS_TESTS -+ if (attr == &sysfs_perf_test) { -+ char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; -+ char *test = strsep(&p, " \t\n"); -+ char *nr_str = strsep(&p, " \t\n"); -+ char *threads_str = strsep(&p, " \t\n"); -+ unsigned threads; -+ u64 nr; -+ int ret = -EINVAL; -+ -+ if (threads_str && -+ !(ret = kstrtouint(threads_str, 10, &threads)) && -+ !(ret = bch2_strtoull_h(nr_str, &nr))) -+ bch2_btree_perf_test(c, test, nr, threads); -+ else -+ size = ret; -+ kfree(tmp); -+ } -+#endif -+ return size; -+} -+SYSFS_OPS(bch2_fs); -+ -+struct attribute *bch2_fs_files[] = { -+ &sysfs_minor, -+ &sysfs_block_size, -+ &sysfs_btree_node_size, -+ &sysfs_btree_cache_size, -+ -+ &sysfs_meta_replicas_have, -+ &sysfs_data_replicas_have, -+ -+ &sysfs_journal_write_delay_ms, -+ &sysfs_journal_reclaim_delay_ms, -+ -+ &sysfs_promote_whole_extents, -+ -+ &sysfs_compression_stats, -+ -+#ifdef CONFIG_BCACHEFS_TESTS -+ &sysfs_perf_test, -+#endif -+ NULL -+}; -+ -+/* internal dir - just a wrapper */ -+ -+SHOW(bch2_fs_internal) -+{ -+ struct bch_fs *c = container_of(kobj, struct bch_fs, internal); -+ return bch2_fs_show(&c->kobj, attr, buf); -+} -+ -+STORE(bch2_fs_internal) -+{ -+ struct bch_fs *c = container_of(kobj, struct bch_fs, internal); -+ return bch2_fs_store(&c->kobj, attr, buf, size); -+} -+SYSFS_OPS(bch2_fs_internal); -+ -+struct attribute *bch2_fs_internal_files[] = { -+ &sysfs_alloc_debug, -+ &sysfs_journal_debug, -+ &sysfs_journal_pins, -+ &sysfs_btree_updates, -+ &sysfs_dirty_btree_nodes, -+ &sysfs_btree_key_cache, -+ &sysfs_btree_transactions, -+ -+ &sysfs_read_realloc_races, -+ &sysfs_extent_migrate_done, -+ &sysfs_extent_migrate_raced, -+ -+ &sysfs_trigger_journal_flush, -+ &sysfs_trigger_btree_coalesce, -+ &sysfs_trigger_gc, -+ &sysfs_prune_cache, -+ -+ &sysfs_copy_gc_enabled, -+ -+ &sysfs_rebalance_enabled, -+ &sysfs_rebalance_work, -+ sysfs_pd_controller_files(rebalance), -+ -+ &sysfs_new_stripes, -+ -+ &sysfs_io_timers_read, -+ &sysfs_io_timers_write, -+ -+ 
&sysfs_internal_uuid, -+ -+#define BCH_DEBUG_PARAM(name, description) &sysfs_##name, -+ BCH_DEBUG_PARAMS() -+#undef BCH_DEBUG_PARAM -+ -+ NULL -+}; -+ -+/* options */ -+ -+SHOW(bch2_fs_opts_dir) -+{ -+ struct printbuf out = _PBUF(buf, PAGE_SIZE); -+ struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); -+ const struct bch_option *opt = container_of(attr, struct bch_option, attr); -+ int id = opt - bch2_opt_table; -+ u64 v = bch2_opt_get_by_id(&c->opts, id); -+ -+ bch2_opt_to_text(&out, c, opt, v, OPT_SHOW_FULL_LIST); -+ pr_buf(&out, "\n"); -+ -+ return out.pos - buf; -+} -+ -+STORE(bch2_fs_opts_dir) -+{ -+ struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); -+ const struct bch_option *opt = container_of(attr, struct bch_option, attr); -+ int ret, id = opt - bch2_opt_table; -+ char *tmp; -+ u64 v; -+ -+ tmp = kstrdup(buf, GFP_KERNEL); -+ if (!tmp) -+ return -ENOMEM; -+ -+ ret = bch2_opt_parse(c, opt, strim(tmp), &v); -+ kfree(tmp); -+ -+ if (ret < 0) -+ return ret; -+ -+ ret = bch2_opt_check_may_set(c, id, v); -+ if (ret < 0) -+ return ret; -+ -+ if (opt->set_sb != SET_NO_SB_OPT) { -+ mutex_lock(&c->sb_lock); -+ opt->set_sb(c->disk_sb.sb, v); -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ } -+ -+ bch2_opt_set_by_id(&c->opts, id, v); -+ -+ if ((id == Opt_background_target || -+ id == Opt_background_compression) && v) { -+ bch2_rebalance_add_work(c, S64_MAX); -+ rebalance_wakeup(c); -+ } -+ -+ return size; -+} -+SYSFS_OPS(bch2_fs_opts_dir); -+ -+struct attribute *bch2_fs_opts_dir_files[] = { NULL }; -+ -+int bch2_opts_create_sysfs_files(struct kobject *kobj) -+{ -+ const struct bch_option *i; -+ int ret; -+ -+ for (i = bch2_opt_table; -+ i < bch2_opt_table + bch2_opts_nr; -+ i++) { -+ if (!(i->mode & (OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME))) -+ continue; -+ -+ ret = sysfs_create_file(kobj, &i->attr); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+/* time stats */ -+ -+SHOW(bch2_fs_time_stats) -+{ -+ struct bch_fs *c = 
container_of(kobj, struct bch_fs, time_stats); -+ -+#define x(name) \ -+ if (attr == &sysfs_time_stat_##name) \ -+ return bch2_time_stats_print(&c->times[BCH_TIME_##name],\ -+ buf, PAGE_SIZE); -+ BCH_TIME_STATS() -+#undef x -+ -+ return 0; -+} -+ -+STORE(bch2_fs_time_stats) -+{ -+ return size; -+} -+SYSFS_OPS(bch2_fs_time_stats); -+ -+struct attribute *bch2_fs_time_stats_files[] = { -+#define x(name) \ -+ &sysfs_time_stat_##name, -+ BCH_TIME_STATS() -+#undef x -+ NULL -+}; -+ -+typedef unsigned (bucket_map_fn)(struct bch_fs *, struct bch_dev *, -+ size_t, void *); -+ -+static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca, -+ size_t b, void *private) -+{ -+ int rw = (private ? 1 : 0); -+ -+ return bucket_last_io(c, bucket(ca, b), rw); -+} -+ -+static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca, -+ size_t b, void *private) -+{ -+ struct bucket *g = bucket(ca, b); -+ return bucket_sectors_used(g->mark); -+} -+ -+static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca, -+ size_t b, void *private) -+{ -+ return bucket_gc_gen(ca, b); -+} -+ -+static int unsigned_cmp(const void *_l, const void *_r) -+{ -+ const unsigned *l = _l; -+ const unsigned *r = _r; -+ -+ return cmp_int(*l, *r); -+} -+ -+static ssize_t show_quantiles(struct bch_fs *c, struct bch_dev *ca, -+ char *buf, bucket_map_fn *fn, void *private) -+{ -+ size_t i, n; -+ /* Compute 31 quantiles */ -+ unsigned q[31], *p; -+ ssize_t ret = 0; -+ -+ down_read(&ca->bucket_lock); -+ n = ca->mi.nbuckets; -+ -+ p = vzalloc(n * sizeof(unsigned)); -+ if (!p) { -+ up_read(&ca->bucket_lock); -+ return -ENOMEM; -+ } -+ -+ for (i = ca->mi.first_bucket; i < n; i++) -+ p[i] = fn(c, ca, i, private); -+ -+ sort(p, n, sizeof(unsigned), unsigned_cmp, NULL); -+ up_read(&ca->bucket_lock); -+ -+ while (n && -+ !p[n - 1]) -+ --n; -+ -+ for (i = 0; i < ARRAY_SIZE(q); i++) -+ q[i] = p[n * (i + 1) / (ARRAY_SIZE(q) + 1)]; -+ -+ vfree(p); -+ -+ for (i = 0; i < ARRAY_SIZE(q); i++) 
-+ ret += scnprintf(buf + ret, PAGE_SIZE - ret, -+ "%u ", q[i]); -+ buf[ret - 1] = '\n'; -+ -+ return ret; -+} -+ -+static ssize_t show_reserve_stats(struct bch_dev *ca, char *buf) -+{ -+ struct printbuf out = _PBUF(buf, PAGE_SIZE); -+ enum alloc_reserve i; -+ -+ spin_lock(&ca->fs->freelist_lock); -+ -+ pr_buf(&out, "free_inc:\t%zu\t%zu\n", -+ fifo_used(&ca->free_inc), -+ ca->free_inc.size); -+ -+ for (i = 0; i < RESERVE_NR; i++) -+ pr_buf(&out, "free[%u]:\t%zu\t%zu\n", i, -+ fifo_used(&ca->free[i]), -+ ca->free[i].size); -+ -+ spin_unlock(&ca->fs->freelist_lock); -+ -+ return out.pos - buf; -+} -+ -+static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) -+{ -+ struct bch_fs *c = ca->fs; -+ struct bch_dev_usage stats = bch2_dev_usage_read(c, ca); -+ unsigned i, nr[BCH_DATA_NR]; -+ -+ memset(nr, 0, sizeof(nr)); -+ -+ for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++) -+ nr[c->open_buckets[i].type]++; -+ -+ return scnprintf(buf, PAGE_SIZE, -+ "free_inc: %zu/%zu\n" -+ "free[RESERVE_BTREE]: %zu/%zu\n" -+ "free[RESERVE_MOVINGGC]: %zu/%zu\n" -+ "free[RESERVE_NONE]: %zu/%zu\n" -+ "buckets:\n" -+ " capacity: %llu\n" -+ " alloc: %llu\n" -+ " sb: %llu\n" -+ " journal: %llu\n" -+ " meta: %llu\n" -+ " user: %llu\n" -+ " cached: %llu\n" -+ " erasure coded: %llu\n" -+ " available: %lli\n" -+ "sectors:\n" -+ " sb: %llu\n" -+ " journal: %llu\n" -+ " meta: %llu\n" -+ " user: %llu\n" -+ " cached: %llu\n" -+ " erasure coded: %llu\n" -+ " fragmented: %llu\n" -+ " copygc threshold: %llu\n" -+ "freelist_wait: %s\n" -+ "open buckets: %u/%u (reserved %u)\n" -+ "open_buckets_wait: %s\n" -+ "open_buckets_btree: %u\n" -+ "open_buckets_user: %u\n" -+ "btree reserve cache: %u\n", -+ fifo_used(&ca->free_inc), ca->free_inc.size, -+ fifo_used(&ca->free[RESERVE_BTREE]), ca->free[RESERVE_BTREE].size, -+ fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, -+ fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size, -+ ca->mi.nbuckets - ca->mi.first_bucket, -+ 
stats.buckets_alloc, -+ stats.buckets[BCH_DATA_SB], -+ stats.buckets[BCH_DATA_JOURNAL], -+ stats.buckets[BCH_DATA_BTREE], -+ stats.buckets[BCH_DATA_USER], -+ stats.buckets[BCH_DATA_CACHED], -+ stats.buckets_ec, -+ ca->mi.nbuckets - ca->mi.first_bucket - stats.buckets_unavailable, -+ stats.sectors[BCH_DATA_SB], -+ stats.sectors[BCH_DATA_JOURNAL], -+ stats.sectors[BCH_DATA_BTREE], -+ stats.sectors[BCH_DATA_USER], -+ stats.sectors[BCH_DATA_CACHED], -+ stats.sectors_ec, -+ stats.sectors_fragmented, -+ ca->copygc_threshold, -+ c->freelist_wait.list.first ? "waiting" : "empty", -+ c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, -+ BTREE_NODE_OPEN_BUCKET_RESERVE, -+ c->open_buckets_wait.list.first ? "waiting" : "empty", -+ nr[BCH_DATA_BTREE], -+ nr[BCH_DATA_USER], -+ c->btree_reserve_cache_nr); -+} -+ -+static const char * const bch2_rw[] = { -+ "read", -+ "write", -+ NULL -+}; -+ -+static ssize_t show_dev_iodone(struct bch_dev *ca, char *buf) -+{ -+ struct printbuf out = _PBUF(buf, PAGE_SIZE); -+ int rw, i; -+ -+ for (rw = 0; rw < 2; rw++) { -+ pr_buf(&out, "%s:\n", bch2_rw[rw]); -+ -+ for (i = 1; i < BCH_DATA_NR; i++) -+ pr_buf(&out, "%-12s:%12llu\n", -+ bch2_data_types[i], -+ percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9); -+ } -+ -+ return out.pos - buf; -+} -+ -+SHOW(bch2_dev) -+{ -+ struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); -+ struct bch_fs *c = ca->fs; -+ struct printbuf out = _PBUF(buf, PAGE_SIZE); -+ -+ sysfs_printf(uuid, "%pU\n", ca->uuid.b); -+ -+ sysfs_print(bucket_size, bucket_bytes(ca)); -+ sysfs_print(block_size, block_bytes(c)); -+ sysfs_print(first_bucket, ca->mi.first_bucket); -+ sysfs_print(nbuckets, ca->mi.nbuckets); -+ sysfs_print(durability, ca->mi.durability); -+ sysfs_print(discard, ca->mi.discard); -+ -+ if (attr == &sysfs_label) { -+ if (ca->mi.group) { -+ mutex_lock(&c->sb_lock); -+ bch2_disk_path_to_text(&out, &c->disk_sb, -+ ca->mi.group - 1); -+ mutex_unlock(&c->sb_lock); -+ } -+ -+ pr_buf(&out, "\n"); -+ return out.pos 
- buf; -+ } -+ -+ if (attr == &sysfs_has_data) { -+ bch2_flags_to_text(&out, bch2_data_types, -+ bch2_dev_has_data(c, ca)); -+ pr_buf(&out, "\n"); -+ return out.pos - buf; -+ } -+ -+ sysfs_pd_controller_show(copy_gc, &ca->copygc_pd); -+ -+ if (attr == &sysfs_cache_replacement_policy) { -+ bch2_string_opt_to_text(&out, -+ bch2_cache_replacement_policies, -+ ca->mi.replacement); -+ pr_buf(&out, "\n"); -+ return out.pos - buf; -+ } -+ -+ if (attr == &sysfs_state_rw) { -+ bch2_string_opt_to_text(&out, bch2_dev_state, -+ ca->mi.state); -+ pr_buf(&out, "\n"); -+ return out.pos - buf; -+ } -+ -+ if (attr == &sysfs_iodone) -+ return show_dev_iodone(ca, buf); -+ -+ sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ])); -+ sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE])); -+ -+ if (attr == &sysfs_io_latency_stats_read) -+ return bch2_time_stats_print(&ca->io_latency[READ], buf, PAGE_SIZE); -+ if (attr == &sysfs_io_latency_stats_write) -+ return bch2_time_stats_print(&ca->io_latency[WRITE], buf, PAGE_SIZE); -+ -+ sysfs_printf(congested, "%u%%", -+ clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) -+ * 100 / CONGESTED_MAX); -+ -+ if (attr == &sysfs_bucket_quantiles_last_read) -+ return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 0); -+ if (attr == &sysfs_bucket_quantiles_last_write) -+ return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 1); -+ if (attr == &sysfs_bucket_quantiles_fragmentation) -+ return show_quantiles(c, ca, buf, bucket_sectors_used_fn, NULL); -+ if (attr == &sysfs_bucket_quantiles_oldest_gen) -+ return show_quantiles(c, ca, buf, bucket_oldest_gen_fn, NULL); -+ -+ if (attr == &sysfs_reserve_stats) -+ return show_reserve_stats(ca, buf); -+ if (attr == &sysfs_alloc_debug) -+ return show_dev_alloc_debug(ca, buf); -+ -+ return 0; -+} -+ -+STORE(bch2_dev) -+{ -+ struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); -+ struct bch_fs *c = ca->fs; -+ struct bch_member *mi; -+ -+ 
sysfs_pd_controller_store(copy_gc, &ca->copygc_pd); -+ -+ if (attr == &sysfs_discard) { -+ bool v = strtoul_or_return(buf); -+ -+ mutex_lock(&c->sb_lock); -+ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; -+ -+ if (v != BCH_MEMBER_DISCARD(mi)) { -+ SET_BCH_MEMBER_DISCARD(mi, v); -+ bch2_write_super(c); -+ } -+ mutex_unlock(&c->sb_lock); -+ } -+ -+ if (attr == &sysfs_cache_replacement_policy) { -+ ssize_t v = __sysfs_match_string(bch2_cache_replacement_policies, -1, buf); -+ -+ if (v < 0) -+ return v; -+ -+ mutex_lock(&c->sb_lock); -+ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; -+ -+ if ((unsigned) v != BCH_MEMBER_REPLACEMENT(mi)) { -+ SET_BCH_MEMBER_REPLACEMENT(mi, v); -+ bch2_write_super(c); -+ } -+ mutex_unlock(&c->sb_lock); -+ } -+ -+ if (attr == &sysfs_label) { -+ char *tmp; -+ int ret; -+ -+ tmp = kstrdup(buf, GFP_KERNEL); -+ if (!tmp) -+ return -ENOMEM; -+ -+ ret = bch2_dev_group_set(c, ca, strim(tmp)); -+ kfree(tmp); -+ if (ret) -+ return ret; -+ } -+ -+ if (attr == &sysfs_wake_allocator) -+ bch2_wake_allocator(ca); -+ -+ return size; -+} -+SYSFS_OPS(bch2_dev); -+ -+struct attribute *bch2_dev_files[] = { -+ &sysfs_uuid, -+ &sysfs_bucket_size, -+ &sysfs_block_size, -+ &sysfs_first_bucket, -+ &sysfs_nbuckets, -+ &sysfs_durability, -+ -+ /* settings: */ -+ &sysfs_discard, -+ &sysfs_cache_replacement_policy, -+ &sysfs_state_rw, -+ &sysfs_label, -+ -+ &sysfs_has_data, -+ &sysfs_iodone, -+ -+ &sysfs_io_latency_read, -+ &sysfs_io_latency_write, -+ &sysfs_io_latency_stats_read, -+ &sysfs_io_latency_stats_write, -+ &sysfs_congested, -+ -+ /* alloc info - other stats: */ -+ &sysfs_bucket_quantiles_last_read, -+ &sysfs_bucket_quantiles_last_write, -+ &sysfs_bucket_quantiles_fragmentation, -+ &sysfs_bucket_quantiles_oldest_gen, -+ -+ &sysfs_reserve_stats, -+ -+ /* debug: */ -+ &sysfs_alloc_debug, -+ &sysfs_wake_allocator, -+ -+ sysfs_pd_controller_files(copy_gc), -+ NULL -+}; -+ -+#endif /* _BCACHEFS_SYSFS_H_ */ -diff --git 
a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h -new file mode 100644 -index 000000000000..525fd05d91f7 ---- /dev/null -+++ b/fs/bcachefs/sysfs.h -@@ -0,0 +1,44 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_SYSFS_H_ -+#define _BCACHEFS_SYSFS_H_ -+ -+#include -+ -+#ifndef NO_BCACHEFS_SYSFS -+ -+struct attribute; -+struct sysfs_ops; -+ -+extern struct attribute *bch2_fs_files[]; -+extern struct attribute *bch2_fs_internal_files[]; -+extern struct attribute *bch2_fs_opts_dir_files[]; -+extern struct attribute *bch2_fs_time_stats_files[]; -+extern struct attribute *bch2_dev_files[]; -+ -+extern struct sysfs_ops bch2_fs_sysfs_ops; -+extern struct sysfs_ops bch2_fs_internal_sysfs_ops; -+extern struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; -+extern struct sysfs_ops bch2_fs_time_stats_sysfs_ops; -+extern struct sysfs_ops bch2_dev_sysfs_ops; -+ -+int bch2_opts_create_sysfs_files(struct kobject *); -+ -+#else -+ -+static struct attribute *bch2_fs_files[] = {}; -+static struct attribute *bch2_fs_internal_files[] = {}; -+static struct attribute *bch2_fs_opts_dir_files[] = {}; -+static struct attribute *bch2_fs_time_stats_files[] = {}; -+static struct attribute *bch2_dev_files[] = {}; -+ -+static const struct sysfs_ops bch2_fs_sysfs_ops; -+static const struct sysfs_ops bch2_fs_internal_sysfs_ops; -+static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; -+static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops; -+static const struct sysfs_ops bch2_dev_sysfs_ops; -+ -+static inline int bch2_opts_create_sysfs_files(struct kobject *kobj) { return 0; } -+ -+#endif /* NO_BCACHEFS_SYSFS */ -+ -+#endif /* _BCACHEFS_SYSFS_H_ */ -diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c -new file mode 100644 -index 000000000000..4dcace650416 ---- /dev/null -+++ b/fs/bcachefs/tests.c -@@ -0,0 +1,725 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#ifdef CONFIG_BCACHEFS_TESTS -+ -+#include "bcachefs.h" -+#include "btree_update.h" -+#include "journal_reclaim.h" -+#include 
"tests.h" -+ -+#include "linux/kthread.h" -+#include "linux/random.h" -+ -+static void delete_test_keys(struct bch_fs *c) -+{ -+ int ret; -+ -+ ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS, -+ POS(0, 0), POS(0, U64_MAX), -+ NULL); -+ BUG_ON(ret); -+ -+ ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS, -+ POS(0, 0), POS(0, U64_MAX), -+ NULL); -+ BUG_ON(ret); -+} -+ -+/* unit tests */ -+ -+static void test_delete(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_i_cookie k; -+ int ret; -+ -+ bkey_cookie_init(&k.k_i); -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, k.k.p, -+ BTREE_ITER_INTENT); -+ -+ ret = bch2_btree_iter_traverse(iter); -+ BUG_ON(ret); -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, -+ bch2_trans_update(&trans, iter, &k.k_i, 0)); -+ BUG_ON(ret); -+ -+ pr_info("deleting once"); -+ ret = bch2_btree_delete_at(&trans, iter, 0); -+ BUG_ON(ret); -+ -+ pr_info("deleting twice"); -+ ret = bch2_btree_delete_at(&trans, iter, 0); -+ BUG_ON(ret); -+ -+ bch2_trans_exit(&trans); -+} -+ -+static void test_delete_written(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_i_cookie k; -+ int ret; -+ -+ bkey_cookie_init(&k.k_i); -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, k.k.p, -+ BTREE_ITER_INTENT); -+ -+ ret = bch2_btree_iter_traverse(iter); -+ BUG_ON(ret); -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, -+ bch2_trans_update(&trans, iter, &k.k_i, 0)); -+ BUG_ON(ret); -+ -+ bch2_journal_flush_all_pins(&c->journal); -+ -+ ret = bch2_btree_delete_at(&trans, iter, 0); -+ BUG_ON(ret); -+ -+ bch2_trans_exit(&trans); -+} -+ -+static void test_iterate(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ u64 i; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ delete_test_keys(c); -+ -+ 
pr_info("inserting test keys"); -+ -+ for (i = 0; i < nr; i++) { -+ struct bkey_i_cookie k; -+ -+ bkey_cookie_init(&k.k_i); -+ k.k.p.offset = i; -+ -+ ret = bch2_btree_insert(c, BTREE_ID_XATTRS, &k.k_i, -+ NULL, NULL, 0); -+ BUG_ON(ret); -+ } -+ -+ pr_info("iterating forwards"); -+ -+ i = 0; -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, -+ POS_MIN, 0, k, ret) { -+ if (k.k->p.inode) -+ break; -+ -+ BUG_ON(k.k->p.offset != i++); -+ } -+ -+ BUG_ON(i != nr); -+ -+ pr_info("iterating backwards"); -+ -+ while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(iter)).k)) -+ BUG_ON(k.k->p.offset != --i); -+ -+ BUG_ON(i); -+ -+ bch2_trans_exit(&trans); -+} -+ -+static void test_iterate_extents(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ u64 i; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ delete_test_keys(c); -+ -+ pr_info("inserting test extents"); -+ -+ for (i = 0; i < nr; i += 8) { -+ struct bkey_i_cookie k; -+ -+ bkey_cookie_init(&k.k_i); -+ k.k.p.offset = i + 8; -+ k.k.size = 8; -+ -+ ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, -+ NULL, NULL, 0); -+ BUG_ON(ret); -+ } -+ -+ pr_info("iterating forwards"); -+ -+ i = 0; -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, -+ POS_MIN, 0, k, ret) { -+ BUG_ON(bkey_start_offset(k.k) != i); -+ i = k.k->p.offset; -+ } -+ -+ BUG_ON(i != nr); -+ -+ pr_info("iterating backwards"); -+ -+ while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(iter)).k)) { -+ BUG_ON(k.k->p.offset != i); -+ i = bkey_start_offset(k.k); -+ } -+ -+ BUG_ON(i); -+ -+ bch2_trans_exit(&trans); -+} -+ -+static void test_iterate_slots(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ u64 i; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ delete_test_keys(c); -+ -+ pr_info("inserting test keys"); -+ -+ for (i = 0; i < nr; i++) { -+ struct bkey_i_cookie k; -+ -+ bkey_cookie_init(&k.k_i); -+ k.k.p.offset = 
i * 2; -+ -+ ret = bch2_btree_insert(c, BTREE_ID_XATTRS, &k.k_i, -+ NULL, NULL, 0); -+ BUG_ON(ret); -+ } -+ -+ pr_info("iterating forwards"); -+ -+ i = 0; -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, -+ 0, k, ret) { -+ if (k.k->p.inode) -+ break; -+ -+ BUG_ON(k.k->p.offset != i); -+ i += 2; -+ } -+ bch2_trans_iter_free(&trans, iter); -+ -+ BUG_ON(i != nr * 2); -+ -+ pr_info("iterating forwards by slots"); -+ -+ i = 0; -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, -+ BTREE_ITER_SLOTS, k, ret) { -+ BUG_ON(k.k->p.offset != i); -+ BUG_ON(bkey_deleted(k.k) != (i & 1)); -+ -+ i++; -+ if (i == nr * 2) -+ break; -+ } -+ -+ bch2_trans_exit(&trans); -+} -+ -+static void test_iterate_slots_extents(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ u64 i; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ delete_test_keys(c); -+ -+ pr_info("inserting test keys"); -+ -+ for (i = 0; i < nr; i += 16) { -+ struct bkey_i_cookie k; -+ -+ bkey_cookie_init(&k.k_i); -+ k.k.p.offset = i + 16; -+ k.k.size = 8; -+ -+ ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, -+ NULL, NULL, 0); -+ BUG_ON(ret); -+ } -+ -+ pr_info("iterating forwards"); -+ -+ i = 0; -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, -+ 0, k, ret) { -+ BUG_ON(bkey_start_offset(k.k) != i + 8); -+ BUG_ON(k.k->size != 8); -+ i += 16; -+ } -+ bch2_trans_iter_free(&trans, iter); -+ -+ BUG_ON(i != nr); -+ -+ pr_info("iterating forwards by slots"); -+ -+ i = 0; -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, -+ BTREE_ITER_SLOTS, k, ret) { -+ BUG_ON(bkey_deleted(k.k) != !(i % 16)); -+ -+ BUG_ON(bkey_start_offset(k.k) != i); -+ BUG_ON(k.k->size != 8); -+ i = k.k->p.offset; -+ -+ if (i == nr) -+ break; -+ } -+ -+ bch2_trans_exit(&trans); -+} -+ -+/* -+ * XXX: we really want to make sure we've got a btree with depth > 0 for these -+ * tests -+ */ -+static void test_peek_end(struct bch_fs 
*c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, POS_MIN, 0); -+ -+ k = bch2_btree_iter_peek(iter); -+ BUG_ON(k.k); -+ -+ k = bch2_btree_iter_peek(iter); -+ BUG_ON(k.k); -+ -+ bch2_trans_exit(&trans); -+} -+ -+static void test_peek_end_extents(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, 0); -+ -+ k = bch2_btree_iter_peek(iter); -+ BUG_ON(k.k); -+ -+ k = bch2_btree_iter_peek(iter); -+ BUG_ON(k.k); -+ -+ bch2_trans_exit(&trans); -+} -+ -+/* extent unit tests */ -+ -+u64 test_version; -+ -+static void insert_test_extent(struct bch_fs *c, -+ u64 start, u64 end) -+{ -+ struct bkey_i_cookie k; -+ int ret; -+ -+ //pr_info("inserting %llu-%llu v %llu", start, end, test_version); -+ -+ bkey_cookie_init(&k.k_i); -+ k.k_i.k.p.offset = end; -+ k.k_i.k.size = end - start; -+ k.k_i.k.version.lo = test_version++; -+ -+ ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, -+ NULL, NULL, 0); -+ BUG_ON(ret); -+} -+ -+static void __test_extent_overwrite(struct bch_fs *c, -+ u64 e1_start, u64 e1_end, -+ u64 e2_start, u64 e2_end) -+{ -+ insert_test_extent(c, e1_start, e1_end); -+ insert_test_extent(c, e2_start, e2_end); -+ -+ delete_test_keys(c); -+} -+ -+static void test_extent_overwrite_front(struct bch_fs *c, u64 nr) -+{ -+ __test_extent_overwrite(c, 0, 64, 0, 32); -+ __test_extent_overwrite(c, 8, 64, 0, 32); -+} -+ -+static void test_extent_overwrite_back(struct bch_fs *c, u64 nr) -+{ -+ __test_extent_overwrite(c, 0, 64, 32, 64); -+ __test_extent_overwrite(c, 0, 64, 32, 72); -+} -+ -+static void test_extent_overwrite_middle(struct bch_fs *c, u64 nr) -+{ -+ __test_extent_overwrite(c, 0, 64, 32, 40); -+} -+ -+static void test_extent_overwrite_all(struct bch_fs *c, u64 nr) 
-+{ -+ __test_extent_overwrite(c, 32, 64, 0, 64); -+ __test_extent_overwrite(c, 32, 64, 0, 128); -+ __test_extent_overwrite(c, 32, 64, 32, 64); -+ __test_extent_overwrite(c, 32, 64, 32, 128); -+} -+ -+/* perf tests */ -+ -+static u64 test_rand(void) -+{ -+ u64 v; -+#if 0 -+ v = prandom_u32(); -+#else -+ prandom_bytes(&v, sizeof(v)); -+#endif -+ return v; -+} -+ -+static void rand_insert(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct bkey_i_cookie k; -+ int ret; -+ u64 i; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for (i = 0; i < nr; i++) { -+ bkey_cookie_init(&k.k_i); -+ k.k.p.offset = test_rand(); -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, -+ __bch2_btree_insert(&trans, BTREE_ID_XATTRS, &k.k_i)); -+ -+ BUG_ON(ret); -+ } -+ -+ bch2_trans_exit(&trans); -+} -+ -+static void rand_lookup(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ u64 i; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for (i = 0; i < nr; i++) { -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, -+ POS(0, test_rand()), 0); -+ -+ k = bch2_btree_iter_peek(iter); -+ bch2_trans_iter_free(&trans, iter); -+ } -+ -+ bch2_trans_exit(&trans); -+} -+ -+static void rand_mixed(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int ret; -+ u64 i; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for (i = 0; i < nr; i++) { -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, -+ POS(0, test_rand()), 0); -+ -+ k = bch2_btree_iter_peek(iter); -+ -+ if (!(i & 3) && k.k) { -+ struct bkey_i_cookie k; -+ -+ bkey_cookie_init(&k.k_i); -+ k.k.p = iter->pos; -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, -+ bch2_trans_update(&trans, iter, &k.k_i, 0)); -+ -+ BUG_ON(ret); -+ } -+ -+ bch2_trans_iter_free(&trans, iter); -+ } -+ -+ bch2_trans_exit(&trans); -+} -+ -+static int __do_delete(struct btree_trans *trans, struct bpos pos) -+{ -+ struct btree_iter *iter; -+ 
struct bkey_i delete; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ iter = bch2_trans_get_iter(trans, BTREE_ID_XATTRS, pos, -+ BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(iter); -+ if (ret) -+ goto err; -+ -+ k = bch2_btree_iter_peek(iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ bkey_init(&delete.k); -+ delete.k.p = k.k->p; -+ -+ bch2_trans_update(trans, iter, &delete, 0); -+err: -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+static void rand_delete(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ int ret; -+ u64 i; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for (i = 0; i < nr; i++) { -+ struct bpos pos = POS(0, test_rand()); -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, -+ __do_delete(&trans, pos)); -+ BUG_ON(ret); -+ } -+ -+ bch2_trans_exit(&trans); -+} -+ -+static void seq_insert(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct bkey_i_cookie insert; -+ int ret; -+ u64 i = 0; -+ -+ bkey_cookie_init(&insert.k_i); -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { -+ insert.k.p = iter->pos; -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, -+ bch2_trans_update(&trans, iter, &insert.k_i, 0)); -+ -+ BUG_ON(ret); -+ -+ if (++i == nr) -+ break; -+ } -+ bch2_trans_exit(&trans); -+} -+ -+static void seq_lookup(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, 0, k, ret) -+ ; -+ bch2_trans_exit(&trans); -+} -+ -+static void seq_overwrite(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, -+ BTREE_ITER_INTENT, 
k, ret) { -+ struct bkey_i_cookie u; -+ -+ bkey_reassemble(&u.k_i, k); -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, -+ bch2_trans_update(&trans, iter, &u.k_i, 0)); -+ -+ BUG_ON(ret); -+ } -+ bch2_trans_exit(&trans); -+} -+ -+static void seq_delete(struct bch_fs *c, u64 nr) -+{ -+ int ret; -+ -+ ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS, -+ POS(0, 0), POS(0, U64_MAX), -+ NULL); -+ BUG_ON(ret); -+} -+ -+typedef void (*perf_test_fn)(struct bch_fs *, u64); -+ -+struct test_job { -+ struct bch_fs *c; -+ u64 nr; -+ unsigned nr_threads; -+ perf_test_fn fn; -+ -+ atomic_t ready; -+ wait_queue_head_t ready_wait; -+ -+ atomic_t done; -+ struct completion done_completion; -+ -+ u64 start; -+ u64 finish; -+}; -+ -+static int btree_perf_test_thread(void *data) -+{ -+ struct test_job *j = data; -+ -+ if (atomic_dec_and_test(&j->ready)) { -+ wake_up(&j->ready_wait); -+ j->start = sched_clock(); -+ } else { -+ wait_event(j->ready_wait, !atomic_read(&j->ready)); -+ } -+ -+ j->fn(j->c, j->nr / j->nr_threads); -+ -+ if (atomic_dec_and_test(&j->done)) { -+ j->finish = sched_clock(); -+ complete(&j->done_completion); -+ } -+ -+ return 0; -+} -+ -+void bch2_btree_perf_test(struct bch_fs *c, const char *testname, -+ u64 nr, unsigned nr_threads) -+{ -+ struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads }; -+ char name_buf[20], nr_buf[20], per_sec_buf[20]; -+ unsigned i; -+ u64 time; -+ -+ atomic_set(&j.ready, nr_threads); -+ init_waitqueue_head(&j.ready_wait); -+ -+ atomic_set(&j.done, nr_threads); -+ init_completion(&j.done_completion); -+ -+#define perf_test(_test) \ -+ if (!strcmp(testname, #_test)) j.fn = _test -+ -+ perf_test(rand_insert); -+ perf_test(rand_lookup); -+ perf_test(rand_mixed); -+ perf_test(rand_delete); -+ -+ perf_test(seq_insert); -+ perf_test(seq_lookup); -+ perf_test(seq_overwrite); -+ perf_test(seq_delete); -+ -+ /* a unit test, not a perf test: */ -+ perf_test(test_delete); -+ perf_test(test_delete_written); -+ perf_test(test_iterate); 
-+ perf_test(test_iterate_extents); -+ perf_test(test_iterate_slots); -+ perf_test(test_iterate_slots_extents); -+ perf_test(test_peek_end); -+ perf_test(test_peek_end_extents); -+ -+ perf_test(test_extent_overwrite_front); -+ perf_test(test_extent_overwrite_back); -+ perf_test(test_extent_overwrite_middle); -+ perf_test(test_extent_overwrite_all); -+ -+ if (!j.fn) { -+ pr_err("unknown test %s", testname); -+ return; -+ } -+ -+ //pr_info("running test %s:", testname); -+ -+ if (nr_threads == 1) -+ btree_perf_test_thread(&j); -+ else -+ for (i = 0; i < nr_threads; i++) -+ kthread_run(btree_perf_test_thread, &j, -+ "bcachefs perf test[%u]", i); -+ -+ while (wait_for_completion_interruptible(&j.done_completion)) -+ ; -+ -+ time = j.finish - j.start; -+ -+ scnprintf(name_buf, sizeof(name_buf), "%s:", testname); -+ bch2_hprint(&PBUF(nr_buf), nr); -+ bch2_hprint(&PBUF(per_sec_buf), nr * NSEC_PER_SEC / time); -+ printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n", -+ name_buf, nr_buf, nr_threads, -+ time / NSEC_PER_SEC, -+ time * nr_threads / nr, -+ per_sec_buf); -+} -+ -+#endif /* CONFIG_BCACHEFS_TESTS */ -diff --git a/fs/bcachefs/tests.h b/fs/bcachefs/tests.h -new file mode 100644 -index 000000000000..551d0764225e ---- /dev/null -+++ b/fs/bcachefs/tests.h -@@ -0,0 +1,15 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_TEST_H -+#define _BCACHEFS_TEST_H -+ -+struct bch_fs; -+ -+#ifdef CONFIG_BCACHEFS_TESTS -+ -+void bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned); -+ -+#else -+ -+#endif /* CONFIG_BCACHEFS_TESTS */ -+ -+#endif /* _BCACHEFS_TEST_H */ -diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c -new file mode 100644 -index 000000000000..59e8dfa3d245 ---- /dev/null -+++ b/fs/bcachefs/trace.c -@@ -0,0 +1,12 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "alloc_types.h" -+#include "buckets.h" -+#include "btree_types.h" -+#include "keylist.h" -+ -+#include 
-+#include "keylist.h" -+ -+#define CREATE_TRACE_POINTS -+#include -diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c -new file mode 100644 -index 000000000000..e69d03d1109f ---- /dev/null -+++ b/fs/bcachefs/util.c -@@ -0,0 +1,910 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * random utiility code, for bcache but in theory not specific to bcache -+ * -+ * Copyright 2010, 2011 Kent Overstreet -+ * Copyright 2012 Google, Inc. -+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include "eytzinger.h" -+#include "util.h" -+ -+static const char si_units[] = "?kMGTPEZY"; -+ -+static int __bch2_strtoh(const char *cp, u64 *res, -+ u64 t_max, bool t_signed) -+{ -+ bool positive = *cp != '-'; -+ unsigned u; -+ u64 v = 0; -+ -+ if (*cp == '+' || *cp == '-') -+ cp++; -+ -+ if (!isdigit(*cp)) -+ return -EINVAL; -+ -+ do { -+ if (v > U64_MAX / 10) -+ return -ERANGE; -+ v *= 10; -+ if (v > U64_MAX - (*cp - '0')) -+ return -ERANGE; -+ v += *cp - '0'; -+ cp++; -+ } while (isdigit(*cp)); -+ -+ for (u = 1; u < strlen(si_units); u++) -+ if (*cp == si_units[u]) { -+ cp++; -+ goto got_unit; -+ } -+ u = 0; -+got_unit: -+ if (*cp == '\n') -+ cp++; -+ if (*cp) -+ return -EINVAL; -+ -+ if (fls64(v) + u * 10 > 64) -+ return -ERANGE; -+ -+ v <<= u * 10; -+ -+ if (positive) { -+ if (v > t_max) -+ return -ERANGE; -+ } else { -+ if (v && !t_signed) -+ return -ERANGE; -+ -+ if (v > t_max + 1) -+ return -ERANGE; -+ v = -v; -+ } -+ -+ *res = v; -+ return 0; -+} -+ -+#define STRTO_H(name, type) \ -+int bch2_ ## name ## _h(const char *cp, type *res) \ -+{ \ -+ u64 v; \ -+ int ret = __bch2_strtoh(cp, &v, ANYSINT_MAX(type), \ -+ ANYSINT_MAX(type) != ((type) ~0ULL)); \ -+ *res = v; \ -+ return ret; \ -+} -+ -+STRTO_H(strtoint, int) -+STRTO_H(strtouint, unsigned int) -+STRTO_H(strtoll, long long) -+STRTO_H(strtoull, unsigned long long) -+STRTO_H(strtou64, u64) -+ -+void 
bch2_hprint(struct printbuf *buf, s64 v) -+{ -+ int u, t = 0; -+ -+ for (u = 0; v >= 1024 || v <= -1024; u++) { -+ t = v & ~(~0U << 10); -+ v >>= 10; -+ } -+ -+ pr_buf(buf, "%lli", v); -+ -+ /* -+ * 103 is magic: t is in the range [-1023, 1023] and we want -+ * to turn it into [-9, 9] -+ */ -+ if (u && v < 100 && v > -100) -+ pr_buf(buf, ".%i", t / 103); -+ if (u) -+ pr_buf(buf, "%c", si_units[u]); -+} -+ -+void bch2_string_opt_to_text(struct printbuf *out, -+ const char * const list[], -+ size_t selected) -+{ -+ size_t i; -+ -+ for (i = 0; list[i]; i++) -+ pr_buf(out, i == selected ? "[%s] " : "%s ", list[i]); -+} -+ -+void bch2_flags_to_text(struct printbuf *out, -+ const char * const list[], u64 flags) -+{ -+ unsigned bit, nr = 0; -+ bool first = true; -+ -+ if (out->pos != out->end) -+ *out->pos = '\0'; -+ -+ while (list[nr]) -+ nr++; -+ -+ while (flags && (bit = __ffs(flags)) < nr) { -+ if (!first) -+ pr_buf(out, ","); -+ first = false; -+ pr_buf(out, "%s", list[bit]); -+ flags ^= 1 << bit; -+ } -+} -+ -+u64 bch2_read_flag_list(char *opt, const char * const list[]) -+{ -+ u64 ret = 0; -+ char *p, *s, *d = kstrndup(opt, PAGE_SIZE - 1, GFP_KERNEL); -+ -+ if (!d) -+ return -ENOMEM; -+ -+ s = strim(d); -+ -+ while ((p = strsep(&s, ","))) { -+ int flag = match_string(list, -1, p); -+ if (flag < 0) { -+ ret = -1; -+ break; -+ } -+ -+ ret |= 1 << flag; -+ } -+ -+ kfree(d); -+ -+ return ret; -+} -+ -+bool bch2_is_zero(const void *_p, size_t n) -+{ -+ const char *p = _p; -+ size_t i; -+ -+ for (i = 0; i < n; i++) -+ if (p[i]) -+ return false; -+ return true; -+} -+ -+static void bch2_quantiles_update(struct quantiles *q, u64 v) -+{ -+ unsigned i = 0; -+ -+ while (i < ARRAY_SIZE(q->entries)) { -+ struct quantile_entry *e = q->entries + i; -+ -+ if (unlikely(!e->step)) { -+ e->m = v; -+ e->step = max_t(unsigned, v / 2, 1024); -+ } else if (e->m > v) { -+ e->m = e->m >= e->step -+ ? e->m - e->step -+ : 0; -+ } else if (e->m < v) { -+ e->m = e->m + e->step > e->m -+ ? 
e->m + e->step -+ : U32_MAX; -+ } -+ -+ if ((e->m > v ? e->m - v : v - e->m) < e->step) -+ e->step = max_t(unsigned, e->step / 2, 1); -+ -+ if (v >= e->m) -+ break; -+ -+ i = eytzinger0_child(i, v > e->m); -+ } -+} -+ -+/* time stats: */ -+ -+static void bch2_time_stats_update_one(struct time_stats *stats, -+ u64 start, u64 end) -+{ -+ u64 duration, freq; -+ -+ duration = time_after64(end, start) -+ ? end - start : 0; -+ freq = time_after64(end, stats->last_event) -+ ? end - stats->last_event : 0; -+ -+ stats->count++; -+ -+ stats->average_duration = stats->average_duration -+ ? ewma_add(stats->average_duration, duration, 6) -+ : duration; -+ -+ stats->average_frequency = stats->average_frequency -+ ? ewma_add(stats->average_frequency, freq, 6) -+ : freq; -+ -+ stats->max_duration = max(stats->max_duration, duration); -+ -+ stats->last_event = end; -+ -+ bch2_quantiles_update(&stats->quantiles, duration); -+} -+ -+void __bch2_time_stats_update(struct time_stats *stats, u64 start, u64 end) -+{ -+ unsigned long flags; -+ -+ if (!stats->buffer) { -+ spin_lock_irqsave(&stats->lock, flags); -+ bch2_time_stats_update_one(stats, start, end); -+ -+ if (stats->average_frequency < 32 && -+ stats->count > 1024) -+ stats->buffer = -+ alloc_percpu_gfp(struct time_stat_buffer, -+ GFP_ATOMIC); -+ spin_unlock_irqrestore(&stats->lock, flags); -+ } else { -+ struct time_stat_buffer_entry *i; -+ struct time_stat_buffer *b; -+ -+ preempt_disable(); -+ b = this_cpu_ptr(stats->buffer); -+ -+ BUG_ON(b->nr >= ARRAY_SIZE(b->entries)); -+ b->entries[b->nr++] = (struct time_stat_buffer_entry) { -+ .start = start, -+ .end = end -+ }; -+ -+ if (b->nr == ARRAY_SIZE(b->entries)) { -+ spin_lock_irqsave(&stats->lock, flags); -+ for (i = b->entries; -+ i < b->entries + ARRAY_SIZE(b->entries); -+ i++) -+ bch2_time_stats_update_one(stats, i->start, i->end); -+ spin_unlock_irqrestore(&stats->lock, flags); -+ -+ b->nr = 0; -+ } -+ -+ preempt_enable(); -+ } -+} -+ -+static const struct time_unit { -+ 
const char *name; -+ u32 nsecs; -+} time_units[] = { -+ { "ns", 1 }, -+ { "us", NSEC_PER_USEC }, -+ { "ms", NSEC_PER_MSEC }, -+ { "sec", NSEC_PER_SEC }, -+}; -+ -+static const struct time_unit *pick_time_units(u64 ns) -+{ -+ const struct time_unit *u; -+ -+ for (u = time_units; -+ u + 1 < time_units + ARRAY_SIZE(time_units) && -+ ns >= u[1].nsecs << 1; -+ u++) -+ ; -+ -+ return u; -+} -+ -+static void pr_time_units(struct printbuf *out, u64 ns) -+{ -+ const struct time_unit *u = pick_time_units(ns); -+ -+ pr_buf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); -+} -+ -+size_t bch2_time_stats_print(struct time_stats *stats, char *buf, size_t len) -+{ -+ struct printbuf out = _PBUF(buf, len); -+ const struct time_unit *u; -+ u64 freq = READ_ONCE(stats->average_frequency); -+ u64 q, last_q = 0; -+ int i; -+ -+ pr_buf(&out, "count:\t\t%llu\n", -+ stats->count); -+ pr_buf(&out, "rate:\t\t%llu/sec\n", -+ freq ? div64_u64(NSEC_PER_SEC, freq) : 0); -+ -+ pr_buf(&out, "frequency:\t"); -+ pr_time_units(&out, freq); -+ -+ pr_buf(&out, "\navg duration:\t"); -+ pr_time_units(&out, stats->average_duration); -+ -+ pr_buf(&out, "\nmax duration:\t"); -+ pr_time_units(&out, stats->max_duration); -+ -+ i = eytzinger0_first(NR_QUANTILES); -+ u = pick_time_units(stats->quantiles.entries[i].m); -+ -+ pr_buf(&out, "\nquantiles (%s):\t", u->name); -+ eytzinger0_for_each(i, NR_QUANTILES) { -+ bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; -+ -+ q = max(stats->quantiles.entries[i].m, last_q); -+ pr_buf(&out, "%llu%s", -+ div_u64(q, u->nsecs), -+ is_last ? 
"\n" : " "); -+ last_q = q; -+ } -+ -+ return out.pos - buf; -+} -+ -+void bch2_time_stats_exit(struct time_stats *stats) -+{ -+ free_percpu(stats->buffer); -+} -+ -+void bch2_time_stats_init(struct time_stats *stats) -+{ -+ memset(stats, 0, sizeof(*stats)); -+ spin_lock_init(&stats->lock); -+} -+ -+/* ratelimit: */ -+ -+/** -+ * bch2_ratelimit_delay() - return how long to delay until the next time to do -+ * some work -+ * -+ * @d - the struct bch_ratelimit to update -+ * -+ * Returns the amount of time to delay by, in jiffies -+ */ -+u64 bch2_ratelimit_delay(struct bch_ratelimit *d) -+{ -+ u64 now = local_clock(); -+ -+ return time_after64(d->next, now) -+ ? nsecs_to_jiffies(d->next - now) -+ : 0; -+} -+ -+/** -+ * bch2_ratelimit_increment() - increment @d by the amount of work done -+ * -+ * @d - the struct bch_ratelimit to update -+ * @done - the amount of work done, in arbitrary units -+ */ -+void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done) -+{ -+ u64 now = local_clock(); -+ -+ d->next += div_u64(done * NSEC_PER_SEC, d->rate); -+ -+ if (time_before64(now + NSEC_PER_SEC, d->next)) -+ d->next = now + NSEC_PER_SEC; -+ -+ if (time_after64(now - NSEC_PER_SEC * 2, d->next)) -+ d->next = now - NSEC_PER_SEC * 2; -+} -+ -+/* pd controller: */ -+ -+/* -+ * Updates pd_controller. Attempts to scale inputed values to units per second. -+ * @target: desired value -+ * @actual: current value -+ * -+ * @sign: 1 or -1; 1 if increasing the rate makes actual go up, -1 if increasing -+ * it makes actual go down. 
-+ */ -+void bch2_pd_controller_update(struct bch_pd_controller *pd, -+ s64 target, s64 actual, int sign) -+{ -+ s64 proportional, derivative, change; -+ -+ unsigned long seconds_since_update = (jiffies - pd->last_update) / HZ; -+ -+ if (seconds_since_update == 0) -+ return; -+ -+ pd->last_update = jiffies; -+ -+ proportional = actual - target; -+ proportional *= seconds_since_update; -+ proportional = div_s64(proportional, pd->p_term_inverse); -+ -+ derivative = actual - pd->last_actual; -+ derivative = div_s64(derivative, seconds_since_update); -+ derivative = ewma_add(pd->smoothed_derivative, derivative, -+ (pd->d_term / seconds_since_update) ?: 1); -+ derivative = derivative * pd->d_term; -+ derivative = div_s64(derivative, pd->p_term_inverse); -+ -+ change = proportional + derivative; -+ -+ /* Don't increase rate if not keeping up */ -+ if (change > 0 && -+ pd->backpressure && -+ time_after64(local_clock(), -+ pd->rate.next + NSEC_PER_MSEC)) -+ change = 0; -+ -+ change *= (sign * -1); -+ -+ pd->rate.rate = clamp_t(s64, (s64) pd->rate.rate + change, -+ 1, UINT_MAX); -+ -+ pd->last_actual = actual; -+ pd->last_derivative = derivative; -+ pd->last_proportional = proportional; -+ pd->last_change = change; -+ pd->last_target = target; -+} -+ -+void bch2_pd_controller_init(struct bch_pd_controller *pd) -+{ -+ pd->rate.rate = 1024; -+ pd->last_update = jiffies; -+ pd->p_term_inverse = 6000; -+ pd->d_term = 30; -+ pd->d_smooth = pd->d_term; -+ pd->backpressure = 1; -+} -+ -+size_t bch2_pd_controller_print_debug(struct bch_pd_controller *pd, char *buf) -+{ -+ /* 2^64 - 1 is 20 digits, plus null byte */ -+ char rate[21]; -+ char actual[21]; -+ char target[21]; -+ char proportional[21]; -+ char derivative[21]; -+ char change[21]; -+ s64 next_io; -+ -+ bch2_hprint(&PBUF(rate), pd->rate.rate); -+ bch2_hprint(&PBUF(actual), pd->last_actual); -+ bch2_hprint(&PBUF(target), pd->last_target); -+ bch2_hprint(&PBUF(proportional), pd->last_proportional); -+ 
bch2_hprint(&PBUF(derivative), pd->last_derivative); -+ bch2_hprint(&PBUF(change), pd->last_change); -+ -+ next_io = div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC); -+ -+ return sprintf(buf, -+ "rate:\t\t%s/sec\n" -+ "target:\t\t%s\n" -+ "actual:\t\t%s\n" -+ "proportional:\t%s\n" -+ "derivative:\t%s\n" -+ "change:\t\t%s/sec\n" -+ "next io:\t%llims\n", -+ rate, target, actual, proportional, -+ derivative, change, next_io); -+} -+ -+/* misc: */ -+ -+void bch2_bio_map(struct bio *bio, void *base, size_t size) -+{ -+ while (size) { -+ struct page *page = is_vmalloc_addr(base) -+ ? vmalloc_to_page(base) -+ : virt_to_page(base); -+ unsigned offset = offset_in_page(base); -+ unsigned len = min_t(size_t, PAGE_SIZE - offset, size); -+ -+ BUG_ON(!bio_add_page(bio, page, len, offset)); -+ size -= len; -+ base += len; -+ } -+} -+ -+int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask) -+{ -+ while (size) { -+ struct page *page = alloc_page(gfp_mask); -+ unsigned len = min(PAGE_SIZE, size); -+ -+ if (!page) -+ return -ENOMEM; -+ -+ BUG_ON(!bio_add_page(bio, page, len, 0)); -+ size -= len; -+ } -+ -+ return 0; -+} -+ -+size_t bch2_rand_range(size_t max) -+{ -+ size_t rand; -+ -+ if (!max) -+ return 0; -+ -+ do { -+ rand = get_random_long(); -+ rand &= roundup_pow_of_two(max) - 1; -+ } while (rand >= max); -+ -+ return rand; -+} -+ -+void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src) -+{ -+ struct bio_vec bv; -+ struct bvec_iter iter; -+ -+ __bio_for_each_segment(bv, dst, iter, dst_iter) { -+ void *dstp = kmap_atomic(bv.bv_page); -+ memcpy(dstp + bv.bv_offset, src, bv.bv_len); -+ kunmap_atomic(dstp); -+ -+ src += bv.bv_len; -+ } -+} -+ -+void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) -+{ -+ struct bio_vec bv; -+ struct bvec_iter iter; -+ -+ __bio_for_each_segment(bv, src, iter, src_iter) { -+ void *srcp = kmap_atomic(bv.bv_page); -+ memcpy(dst, srcp + bv.bv_offset, bv.bv_len); -+ 
kunmap_atomic(srcp); -+ -+ dst += bv.bv_len; -+ } -+} -+ -+void bch_scnmemcpy(struct printbuf *out, -+ const char *src, size_t len) -+{ -+ size_t n = printbuf_remaining(out); -+ -+ if (n) { -+ n = min(n - 1, len); -+ memcpy(out->pos, src, n); -+ out->pos += n; -+ *out->pos = '\0'; -+ } -+} -+ -+#include "eytzinger.h" -+ -+static int alignment_ok(const void *base, size_t align) -+{ -+ return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || -+ ((unsigned long)base & (align - 1)) == 0; -+} -+ -+static void u32_swap(void *a, void *b, size_t size) -+{ -+ u32 t = *(u32 *)a; -+ *(u32 *)a = *(u32 *)b; -+ *(u32 *)b = t; -+} -+ -+static void u64_swap(void *a, void *b, size_t size) -+{ -+ u64 t = *(u64 *)a; -+ *(u64 *)a = *(u64 *)b; -+ *(u64 *)b = t; -+} -+ -+static void generic_swap(void *a, void *b, size_t size) -+{ -+ char t; -+ -+ do { -+ t = *(char *)a; -+ *(char *)a++ = *(char *)b; -+ *(char *)b++ = t; -+ } while (--size > 0); -+} -+ -+static inline int do_cmp(void *base, size_t n, size_t size, -+ int (*cmp_func)(const void *, const void *, size_t), -+ size_t l, size_t r) -+{ -+ return cmp_func(base + inorder_to_eytzinger0(l, n) * size, -+ base + inorder_to_eytzinger0(r, n) * size, -+ size); -+} -+ -+static inline void do_swap(void *base, size_t n, size_t size, -+ void (*swap_func)(void *, void *, size_t), -+ size_t l, size_t r) -+{ -+ swap_func(base + inorder_to_eytzinger0(l, n) * size, -+ base + inorder_to_eytzinger0(r, n) * size, -+ size); -+} -+ -+void eytzinger0_sort(void *base, size_t n, size_t size, -+ int (*cmp_func)(const void *, const void *, size_t), -+ void (*swap_func)(void *, void *, size_t)) -+{ -+ int i, c, r; -+ -+ if (!swap_func) { -+ if (size == 4 && alignment_ok(base, 4)) -+ swap_func = u32_swap; -+ else if (size == 8 && alignment_ok(base, 8)) -+ swap_func = u64_swap; -+ else -+ swap_func = generic_swap; -+ } -+ -+ /* heapify */ -+ for (i = n / 2 - 1; i >= 0; --i) { -+ for (r = i; r * 2 + 1 < n; r = c) { -+ c = r * 2 + 1; -+ -+ if (c + 1 < n && 
-+ do_cmp(base, n, size, cmp_func, c, c + 1) < 0) -+ c++; -+ -+ if (do_cmp(base, n, size, cmp_func, r, c) >= 0) -+ break; -+ -+ do_swap(base, n, size, swap_func, r, c); -+ } -+ } -+ -+ /* sort */ -+ for (i = n - 1; i > 0; --i) { -+ do_swap(base, n, size, swap_func, 0, i); -+ -+ for (r = 0; r * 2 + 1 < i; r = c) { -+ c = r * 2 + 1; -+ -+ if (c + 1 < i && -+ do_cmp(base, n, size, cmp_func, c, c + 1) < 0) -+ c++; -+ -+ if (do_cmp(base, n, size, cmp_func, r, c) >= 0) -+ break; -+ -+ do_swap(base, n, size, swap_func, r, c); -+ } -+ } -+} -+ -+void sort_cmp_size(void *base, size_t num, size_t size, -+ int (*cmp_func)(const void *, const void *, size_t), -+ void (*swap_func)(void *, void *, size_t size)) -+{ -+ /* pre-scale counters for performance */ -+ int i = (num/2 - 1) * size, n = num * size, c, r; -+ -+ if (!swap_func) { -+ if (size == 4 && alignment_ok(base, 4)) -+ swap_func = u32_swap; -+ else if (size == 8 && alignment_ok(base, 8)) -+ swap_func = u64_swap; -+ else -+ swap_func = generic_swap; -+ } -+ -+ /* heapify */ -+ for ( ; i >= 0; i -= size) { -+ for (r = i; r * 2 + size < n; r = c) { -+ c = r * 2 + size; -+ if (c < n - size && -+ cmp_func(base + c, base + c + size, size) < 0) -+ c += size; -+ if (cmp_func(base + r, base + c, size) >= 0) -+ break; -+ swap_func(base + r, base + c, size); -+ } -+ } -+ -+ /* sort */ -+ for (i = n - size; i > 0; i -= size) { -+ swap_func(base, base + i, size); -+ for (r = 0; r * 2 + size < i; r = c) { -+ c = r * 2 + size; -+ if (c < i - size && -+ cmp_func(base + c, base + c + size, size) < 0) -+ c += size; -+ if (cmp_func(base + r, base + c, size) >= 0) -+ break; -+ swap_func(base + r, base + c, size); -+ } -+ } -+} -+ -+static void mempool_free_vp(void *element, void *pool_data) -+{ -+ size_t size = (size_t) pool_data; -+ -+ vpfree(element, size); -+} -+ -+static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data) -+{ -+ size_t size = (size_t) pool_data; -+ -+ return vpmalloc(size, gfp_mask); -+} -+ -+int 
mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size) -+{ -+ return size < PAGE_SIZE -+ ? mempool_init_kmalloc_pool(pool, min_nr, size) -+ : mempool_init(pool, min_nr, mempool_alloc_vp, -+ mempool_free_vp, (void *) size); -+} -+ -+#if 0 -+void eytzinger1_test(void) -+{ -+ unsigned inorder, eytz, size; -+ -+ pr_info("1 based eytzinger test:"); -+ -+ for (size = 2; -+ size < 65536; -+ size++) { -+ unsigned extra = eytzinger1_extra(size); -+ -+ if (!(size % 4096)) -+ pr_info("tree size %u", size); -+ -+ BUG_ON(eytzinger1_prev(0, size) != eytzinger1_last(size)); -+ BUG_ON(eytzinger1_next(0, size) != eytzinger1_first(size)); -+ -+ BUG_ON(eytzinger1_prev(eytzinger1_first(size), size) != 0); -+ BUG_ON(eytzinger1_next(eytzinger1_last(size), size) != 0); -+ -+ inorder = 1; -+ eytzinger1_for_each(eytz, size) { -+ BUG_ON(__inorder_to_eytzinger1(inorder, size, extra) != eytz); -+ BUG_ON(__eytzinger1_to_inorder(eytz, size, extra) != inorder); -+ BUG_ON(eytz != eytzinger1_last(size) && -+ eytzinger1_prev(eytzinger1_next(eytz, size), size) != eytz); -+ -+ inorder++; -+ } -+ } -+} -+ -+void eytzinger0_test(void) -+{ -+ -+ unsigned inorder, eytz, size; -+ -+ pr_info("0 based eytzinger test:"); -+ -+ for (size = 1; -+ size < 65536; -+ size++) { -+ unsigned extra = eytzinger0_extra(size); -+ -+ if (!(size % 4096)) -+ pr_info("tree size %u", size); -+ -+ BUG_ON(eytzinger0_prev(-1, size) != eytzinger0_last(size)); -+ BUG_ON(eytzinger0_next(-1, size) != eytzinger0_first(size)); -+ -+ BUG_ON(eytzinger0_prev(eytzinger0_first(size), size) != -1); -+ BUG_ON(eytzinger0_next(eytzinger0_last(size), size) != -1); -+ -+ inorder = 0; -+ eytzinger0_for_each(eytz, size) { -+ BUG_ON(__inorder_to_eytzinger0(inorder, size, extra) != eytz); -+ BUG_ON(__eytzinger0_to_inorder(eytz, size, extra) != inorder); -+ BUG_ON(eytz != eytzinger0_last(size) && -+ eytzinger0_prev(eytzinger0_next(eytz, size), size) != eytz); -+ -+ inorder++; -+ } -+ } -+} -+ -+static inline int cmp_u16(const void *_l, 
const void *_r, size_t size) -+{ -+ const u16 *l = _l, *r = _r; -+ -+ return (*l > *r) - (*r - *l); -+} -+ -+static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search) -+{ -+ int i, c1 = -1, c2 = -1; -+ ssize_t r; -+ -+ r = eytzinger0_find_le(test_array, nr, -+ sizeof(test_array[0]), -+ cmp_u16, &search); -+ if (r >= 0) -+ c1 = test_array[r]; -+ -+ for (i = 0; i < nr; i++) -+ if (test_array[i] <= search && test_array[i] > c2) -+ c2 = test_array[i]; -+ -+ if (c1 != c2) { -+ eytzinger0_for_each(i, nr) -+ pr_info("[%3u] = %12u", i, test_array[i]); -+ pr_info("find_le(%2u) -> [%2zi] = %2i should be %2i", -+ i, r, c1, c2); -+ } -+} -+ -+void eytzinger0_find_test(void) -+{ -+ unsigned i, nr, allocated = 1 << 12; -+ u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL); -+ -+ for (nr = 1; nr < allocated; nr++) { -+ pr_info("testing %u elems", nr); -+ -+ get_random_bytes(test_array, nr * sizeof(test_array[0])); -+ eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL); -+ -+ /* verify array is sorted correctly: */ -+ eytzinger0_for_each(i, nr) -+ BUG_ON(i != eytzinger0_last(nr) && -+ test_array[i] > test_array[eytzinger0_next(i, nr)]); -+ -+ for (i = 0; i < U16_MAX; i += 1 << 12) -+ eytzinger0_find_test_val(test_array, nr, i); -+ -+ for (i = 0; i < nr; i++) { -+ eytzinger0_find_test_val(test_array, nr, test_array[i] - 1); -+ eytzinger0_find_test_val(test_array, nr, test_array[i]); -+ eytzinger0_find_test_val(test_array, nr, test_array[i] + 1); -+ } -+ } -+ -+ kfree(test_array); -+} -+#endif -+ -+/* -+ * Accumulate percpu counters onto one cpu's copy - only valid when access -+ * against any percpu counter is guarded against -+ */ -+u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr) -+{ -+ u64 *ret; -+ int cpu; -+ -+ preempt_disable(); -+ ret = this_cpu_ptr(p); -+ preempt_enable(); -+ -+ for_each_possible_cpu(cpu) { -+ u64 *i = per_cpu_ptr(p, cpu); -+ -+ if (i != ret) { -+ acc_u64s(ret, i, nr); -+ memset(i, 0, 
nr * sizeof(u64)); -+ } -+ } -+ -+ return ret; -+} -diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h -new file mode 100644 -index 000000000000..0128daba5970 ---- /dev/null -+++ b/fs/bcachefs/util.h -@@ -0,0 +1,761 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_UTIL_H -+#define _BCACHEFS_UTIL_H -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#define PAGE_SECTOR_SHIFT (PAGE_SHIFT - 9) -+#define PAGE_SECTORS (1UL << PAGE_SECTOR_SHIFT) -+ -+struct closure; -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ -+#define EBUG_ON(cond) BUG_ON(cond) -+#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) -+#define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) -+#define atomic_sub_bug(i, v) BUG_ON(atomic_sub_return(i, v) < 0) -+#define atomic_add_bug(i, v) BUG_ON(atomic_add_return(i, v) < 0) -+#define atomic_long_dec_bug(v) BUG_ON(atomic_long_dec_return(v) < 0) -+#define atomic_long_sub_bug(i, v) BUG_ON(atomic_long_sub_return(i, v) < 0) -+#define atomic64_dec_bug(v) BUG_ON(atomic64_dec_return(v) < 0) -+#define atomic64_inc_bug(v, i) BUG_ON(atomic64_inc_return(v) <= i) -+#define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0) -+#define atomic64_add_bug(i, v) BUG_ON(atomic64_add_return(i, v) < 0) -+ -+#define memcpy(dst, src, len) \ -+({ \ -+ void *_dst = (dst); \ -+ const void *_src = (src); \ -+ size_t _len = (len); \ -+ \ -+ BUG_ON(!((void *) (_dst) >= (void *) (_src) + (_len) || \ -+ (void *) (_dst) + (_len) <= (void *) (_src))); \ -+ memcpy(_dst, _src, _len); \ -+}) -+ -+#else /* DEBUG */ -+ -+#define EBUG_ON(cond) -+#define atomic_dec_bug(v) atomic_dec(v) -+#define atomic_inc_bug(v, i) atomic_inc(v) -+#define atomic_sub_bug(i, v) atomic_sub(i, v) -+#define atomic_add_bug(i, v) atomic_add(i, v) -+#define atomic_long_dec_bug(v) atomic_long_dec(v) -+#define atomic_long_sub_bug(i, v) atomic_long_sub(i, v) -+#define 
atomic64_dec_bug(v) atomic64_dec(v) -+#define atomic64_inc_bug(v, i) atomic64_inc(v) -+#define atomic64_sub_bug(i, v) atomic64_sub(i, v) -+#define atomic64_add_bug(i, v) atomic64_add(i, v) -+ -+#endif -+ -+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -+#define CPU_BIG_ENDIAN 0 -+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -+#define CPU_BIG_ENDIAN 1 -+#endif -+ -+/* type hackery */ -+ -+#define type_is_exact(_val, _type) \ -+ __builtin_types_compatible_p(typeof(_val), _type) -+ -+#define type_is(_val, _type) \ -+ (__builtin_types_compatible_p(typeof(_val), _type) || \ -+ __builtin_types_compatible_p(typeof(_val), const _type)) -+ -+/* Userspace doesn't align allocations as nicely as the kernel allocators: */ -+static inline size_t buf_pages(void *p, size_t len) -+{ -+ return DIV_ROUND_UP(len + -+ ((unsigned long) p & (PAGE_SIZE - 1)), -+ PAGE_SIZE); -+} -+ -+static inline void vpfree(void *p, size_t size) -+{ -+ if (is_vmalloc_addr(p)) -+ vfree(p); -+ else -+ free_pages((unsigned long) p, get_order(size)); -+} -+ -+static inline void *vpmalloc(size_t size, gfp_t gfp_mask) -+{ -+ return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN, -+ get_order(size)) ?: -+ __vmalloc(size, gfp_mask, PAGE_KERNEL); -+} -+ -+static inline void kvpfree(void *p, size_t size) -+{ -+ if (size < PAGE_SIZE) -+ kfree(p); -+ else -+ vpfree(p, size); -+} -+ -+static inline void *kvpmalloc(size_t size, gfp_t gfp_mask) -+{ -+ return size < PAGE_SIZE -+ ? 
kmalloc(size, gfp_mask) -+ : vpmalloc(size, gfp_mask); -+} -+ -+int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t); -+ -+#define HEAP(type) \ -+struct { \ -+ size_t size, used; \ -+ type *data; \ -+} -+ -+#define DECLARE_HEAP(type, name) HEAP(type) name -+ -+#define init_heap(heap, _size, gfp) \ -+({ \ -+ (heap)->used = 0; \ -+ (heap)->size = (_size); \ -+ (heap)->data = kvpmalloc((heap)->size * sizeof((heap)->data[0]),\ -+ (gfp)); \ -+}) -+ -+#define free_heap(heap) \ -+do { \ -+ kvpfree((heap)->data, (heap)->size * sizeof((heap)->data[0])); \ -+ (heap)->data = NULL; \ -+} while (0) -+ -+#define heap_set_backpointer(h, i, _fn) \ -+do { \ -+ void (*fn)(typeof(h), size_t) = _fn; \ -+ if (fn) \ -+ fn(h, i); \ -+} while (0) -+ -+#define heap_swap(h, i, j, set_backpointer) \ -+do { \ -+ swap((h)->data[i], (h)->data[j]); \ -+ heap_set_backpointer(h, i, set_backpointer); \ -+ heap_set_backpointer(h, j, set_backpointer); \ -+} while (0) -+ -+#define heap_peek(h) \ -+({ \ -+ EBUG_ON(!(h)->used); \ -+ (h)->data[0]; \ -+}) -+ -+#define heap_full(h) ((h)->used == (h)->size) -+ -+#define heap_sift_down(h, i, cmp, set_backpointer) \ -+do { \ -+ size_t _c, _j = i; \ -+ \ -+ for (; _j * 2 + 1 < (h)->used; _j = _c) { \ -+ _c = _j * 2 + 1; \ -+ if (_c + 1 < (h)->used && \ -+ cmp(h, (h)->data[_c], (h)->data[_c + 1]) >= 0) \ -+ _c++; \ -+ \ -+ if (cmp(h, (h)->data[_c], (h)->data[_j]) >= 0) \ -+ break; \ -+ heap_swap(h, _c, _j, set_backpointer); \ -+ } \ -+} while (0) -+ -+#define heap_sift_up(h, i, cmp, set_backpointer) \ -+do { \ -+ while (i) { \ -+ size_t p = (i - 1) / 2; \ -+ if (cmp(h, (h)->data[i], (h)->data[p]) >= 0) \ -+ break; \ -+ heap_swap(h, i, p, set_backpointer); \ -+ i = p; \ -+ } \ -+} while (0) -+ -+#define __heap_add(h, d, cmp, set_backpointer) \ -+({ \ -+ size_t _i = (h)->used++; \ -+ (h)->data[_i] = d; \ -+ heap_set_backpointer(h, _i, set_backpointer); \ -+ \ -+ heap_sift_up(h, _i, cmp, set_backpointer); \ -+ _i; \ -+}) -+ -+#define heap_add(h, d, cmp, 
set_backpointer) \ -+({ \ -+ bool _r = !heap_full(h); \ -+ if (_r) \ -+ __heap_add(h, d, cmp, set_backpointer); \ -+ _r; \ -+}) -+ -+#define heap_add_or_replace(h, new, cmp, set_backpointer) \ -+do { \ -+ if (!heap_add(h, new, cmp, set_backpointer) && \ -+ cmp(h, new, heap_peek(h)) >= 0) { \ -+ (h)->data[0] = new; \ -+ heap_set_backpointer(h, 0, set_backpointer); \ -+ heap_sift_down(h, 0, cmp, set_backpointer); \ -+ } \ -+} while (0) -+ -+#define heap_del(h, i, cmp, set_backpointer) \ -+do { \ -+ size_t _i = (i); \ -+ \ -+ BUG_ON(_i >= (h)->used); \ -+ (h)->used--; \ -+ heap_swap(h, _i, (h)->used, set_backpointer); \ -+ heap_sift_up(h, _i, cmp, set_backpointer); \ -+ heap_sift_down(h, _i, cmp, set_backpointer); \ -+} while (0) -+ -+#define heap_pop(h, d, cmp, set_backpointer) \ -+({ \ -+ bool _r = (h)->used; \ -+ if (_r) { \ -+ (d) = (h)->data[0]; \ -+ heap_del(h, 0, cmp, set_backpointer); \ -+ } \ -+ _r; \ -+}) -+ -+#define heap_resort(heap, cmp, set_backpointer) \ -+do { \ -+ ssize_t _i; \ -+ for (_i = (ssize_t) (heap)->used / 2 - 1; _i >= 0; --_i) \ -+ heap_sift_down(heap, _i, cmp, set_backpointer); \ -+} while (0) -+ -+#define ANYSINT_MAX(t) \ -+ ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) -+ -+struct printbuf { -+ char *pos; -+ char *end; -+}; -+ -+static inline size_t printbuf_remaining(struct printbuf *buf) -+{ -+ return buf->end - buf->pos; -+} -+ -+#define _PBUF(_buf, _len) \ -+ ((struct printbuf) { \ -+ .pos = _buf, \ -+ .end = _buf + _len, \ -+ }) -+ -+#define PBUF(_buf) _PBUF(_buf, sizeof(_buf)) -+ -+#define pr_buf(_out, ...) 
\ -+do { \ -+ (_out)->pos += scnprintf((_out)->pos, printbuf_remaining(_out), \ -+ __VA_ARGS__); \ -+} while (0) -+ -+void bch_scnmemcpy(struct printbuf *, const char *, size_t); -+ -+int bch2_strtoint_h(const char *, int *); -+int bch2_strtouint_h(const char *, unsigned int *); -+int bch2_strtoll_h(const char *, long long *); -+int bch2_strtoull_h(const char *, unsigned long long *); -+int bch2_strtou64_h(const char *, u64 *); -+ -+static inline int bch2_strtol_h(const char *cp, long *res) -+{ -+#if BITS_PER_LONG == 32 -+ return bch2_strtoint_h(cp, (int *) res); -+#else -+ return bch2_strtoll_h(cp, (long long *) res); -+#endif -+} -+ -+static inline int bch2_strtoul_h(const char *cp, long *res) -+{ -+#if BITS_PER_LONG == 32 -+ return bch2_strtouint_h(cp, (unsigned int *) res); -+#else -+ return bch2_strtoull_h(cp, (unsigned long long *) res); -+#endif -+} -+ -+#define strtoi_h(cp, res) \ -+ ( type_is(*res, int) ? bch2_strtoint_h(cp, (void *) res)\ -+ : type_is(*res, long) ? bch2_strtol_h(cp, (void *) res)\ -+ : type_is(*res, long long) ? bch2_strtoll_h(cp, (void *) res)\ -+ : type_is(*res, unsigned) ? bch2_strtouint_h(cp, (void *) res)\ -+ : type_is(*res, unsigned long) ? bch2_strtoul_h(cp, (void *) res)\ -+ : type_is(*res, unsigned long long) ? bch2_strtoull_h(cp, (void *) res)\ -+ : -EINVAL) -+ -+#define strtoul_safe(cp, var) \ -+({ \ -+ unsigned long _v; \ -+ int _r = kstrtoul(cp, 10, &_v); \ -+ if (!_r) \ -+ var = _v; \ -+ _r; \ -+}) -+ -+#define strtoul_safe_clamp(cp, var, min, max) \ -+({ \ -+ unsigned long _v; \ -+ int _r = kstrtoul(cp, 10, &_v); \ -+ if (!_r) \ -+ var = clamp_t(typeof(var), _v, min, max); \ -+ _r; \ -+}) -+ -+#define strtoul_safe_restrict(cp, var, min, max) \ -+({ \ -+ unsigned long _v; \ -+ int _r = kstrtoul(cp, 10, &_v); \ -+ if (!_r && _v >= min && _v <= max) \ -+ var = _v; \ -+ else \ -+ _r = -EINVAL; \ -+ _r; \ -+}) -+ -+#define snprint(buf, size, var) \ -+ snprintf(buf, size, \ -+ type_is(var, int) ? 
"%i\n" \ -+ : type_is(var, unsigned) ? "%u\n" \ -+ : type_is(var, long) ? "%li\n" \ -+ : type_is(var, unsigned long) ? "%lu\n" \ -+ : type_is(var, s64) ? "%lli\n" \ -+ : type_is(var, u64) ? "%llu\n" \ -+ : type_is(var, char *) ? "%s\n" \ -+ : "%i\n", var) -+ -+void bch2_hprint(struct printbuf *, s64); -+ -+bool bch2_is_zero(const void *, size_t); -+ -+void bch2_string_opt_to_text(struct printbuf *, -+ const char * const [], size_t); -+ -+void bch2_flags_to_text(struct printbuf *, const char * const[], u64); -+u64 bch2_read_flag_list(char *, const char * const[]); -+ -+#define NR_QUANTILES 15 -+#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) -+#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES) -+#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES) -+ -+struct quantiles { -+ struct quantile_entry { -+ u64 m; -+ u64 step; -+ } entries[NR_QUANTILES]; -+}; -+ -+struct time_stat_buffer { -+ unsigned nr; -+ struct time_stat_buffer_entry { -+ u64 start; -+ u64 end; -+ } entries[32]; -+}; -+ -+struct time_stats { -+ spinlock_t lock; -+ u64 count; -+ /* all fields are in nanoseconds */ -+ u64 average_duration; -+ u64 average_frequency; -+ u64 max_duration; -+ u64 last_event; -+ struct quantiles quantiles; -+ -+ struct time_stat_buffer __percpu *buffer; -+}; -+ -+void __bch2_time_stats_update(struct time_stats *stats, u64, u64); -+ -+static inline void bch2_time_stats_update(struct time_stats *stats, u64 start) -+{ -+ __bch2_time_stats_update(stats, start, local_clock()); -+} -+ -+size_t bch2_time_stats_print(struct time_stats *, char *, size_t); -+ -+void bch2_time_stats_exit(struct time_stats *); -+void bch2_time_stats_init(struct time_stats *); -+ -+#define ewma_add(ewma, val, weight) \ -+({ \ -+ typeof(ewma) _ewma = (ewma); \ -+ typeof(weight) _weight = (weight); \ -+ \ -+ (((_ewma << _weight) - _ewma) + (val)) >> _weight; \ -+}) -+ -+struct bch_ratelimit { -+ /* Next time we want to do some work, in nanoseconds */ -+ u64 next; -+ -+ /* -+ * Rate at which 
we want to do work, in units per nanosecond -+ * The units here correspond to the units passed to -+ * bch2_ratelimit_increment() -+ */ -+ unsigned rate; -+}; -+ -+static inline void bch2_ratelimit_reset(struct bch_ratelimit *d) -+{ -+ d->next = local_clock(); -+} -+ -+u64 bch2_ratelimit_delay(struct bch_ratelimit *); -+void bch2_ratelimit_increment(struct bch_ratelimit *, u64); -+ -+struct bch_pd_controller { -+ struct bch_ratelimit rate; -+ unsigned long last_update; -+ -+ s64 last_actual; -+ s64 smoothed_derivative; -+ -+ unsigned p_term_inverse; -+ unsigned d_smooth; -+ unsigned d_term; -+ -+ /* for exporting to sysfs (no effect on behavior) */ -+ s64 last_derivative; -+ s64 last_proportional; -+ s64 last_change; -+ s64 last_target; -+ -+ /* If true, the rate will not increase if bch2_ratelimit_delay() -+ * is not being called often enough. */ -+ bool backpressure; -+}; -+ -+void bch2_pd_controller_update(struct bch_pd_controller *, s64, s64, int); -+void bch2_pd_controller_init(struct bch_pd_controller *); -+size_t bch2_pd_controller_print_debug(struct bch_pd_controller *, char *); -+ -+#define sysfs_pd_controller_attribute(name) \ -+ rw_attribute(name##_rate); \ -+ rw_attribute(name##_rate_bytes); \ -+ rw_attribute(name##_rate_d_term); \ -+ rw_attribute(name##_rate_p_term_inverse); \ -+ read_attribute(name##_rate_debug) -+ -+#define sysfs_pd_controller_files(name) \ -+ &sysfs_##name##_rate, \ -+ &sysfs_##name##_rate_bytes, \ -+ &sysfs_##name##_rate_d_term, \ -+ &sysfs_##name##_rate_p_term_inverse, \ -+ &sysfs_##name##_rate_debug -+ -+#define sysfs_pd_controller_show(name, var) \ -+do { \ -+ sysfs_hprint(name##_rate, (var)->rate.rate); \ -+ sysfs_print(name##_rate_bytes, (var)->rate.rate); \ -+ sysfs_print(name##_rate_d_term, (var)->d_term); \ -+ sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \ -+ \ -+ if (attr == &sysfs_##name##_rate_debug) \ -+ return bch2_pd_controller_print_debug(var, buf); \ -+} while (0) -+ -+#define 
sysfs_pd_controller_store(name, var) \ -+do { \ -+ sysfs_strtoul_clamp(name##_rate, \ -+ (var)->rate.rate, 1, UINT_MAX); \ -+ sysfs_strtoul_clamp(name##_rate_bytes, \ -+ (var)->rate.rate, 1, UINT_MAX); \ -+ sysfs_strtoul(name##_rate_d_term, (var)->d_term); \ -+ sysfs_strtoul_clamp(name##_rate_p_term_inverse, \ -+ (var)->p_term_inverse, 1, INT_MAX); \ -+} while (0) -+ -+#define container_of_or_null(ptr, type, member) \ -+({ \ -+ typeof(ptr) _ptr = ptr; \ -+ _ptr ? container_of(_ptr, type, member) : NULL; \ -+}) -+ -+/* Does linear interpolation between powers of two */ -+static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) -+{ -+ unsigned fract = x & ~(~0 << fract_bits); -+ -+ x >>= fract_bits; -+ x = 1 << x; -+ x += (x * fract) >> fract_bits; -+ -+ return x; -+} -+ -+void bch2_bio_map(struct bio *bio, void *base, size_t); -+int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t); -+ -+static inline sector_t bdev_sectors(struct block_device *bdev) -+{ -+ return bdev->bd_inode->i_size >> 9; -+} -+ -+#define closure_bio_submit(bio, cl) \ -+do { \ -+ closure_get(cl); \ -+ submit_bio(bio); \ -+} while (0) -+ -+#define kthread_wait_freezable(cond) \ -+({ \ -+ int _ret = 0; \ -+ while (1) { \ -+ set_current_state(TASK_INTERRUPTIBLE); \ -+ if (kthread_should_stop()) { \ -+ _ret = -1; \ -+ break; \ -+ } \ -+ \ -+ if (cond) \ -+ break; \ -+ \ -+ schedule(); \ -+ try_to_freeze(); \ -+ } \ -+ set_current_state(TASK_RUNNING); \ -+ _ret; \ -+}) -+ -+size_t bch2_rand_range(size_t); -+ -+void memcpy_to_bio(struct bio *, struct bvec_iter, const void *); -+void memcpy_from_bio(void *, struct bio *, struct bvec_iter); -+ -+static inline void memcpy_u64s_small(void *dst, const void *src, -+ unsigned u64s) -+{ -+ u64 *d = dst; -+ const u64 *s = src; -+ -+ while (u64s--) -+ *d++ = *s++; -+} -+ -+static inline void __memcpy_u64s(void *dst, const void *src, -+ unsigned u64s) -+{ -+#ifdef CONFIG_X86_64 -+ long d0, d1, d2; -+ asm volatile("rep ; movsq" -+ : "=&c" (d0), "=&D" 
(d1), "=&S" (d2) -+ : "0" (u64s), "1" (dst), "2" (src) -+ : "memory"); -+#else -+ u64 *d = dst; -+ const u64 *s = src; -+ -+ while (u64s--) -+ *d++ = *s++; -+#endif -+} -+ -+static inline void memcpy_u64s(void *dst, const void *src, -+ unsigned u64s) -+{ -+ EBUG_ON(!(dst >= src + u64s * sizeof(u64) || -+ dst + u64s * sizeof(u64) <= src)); -+ -+ __memcpy_u64s(dst, src, u64s); -+} -+ -+static inline void __memmove_u64s_down(void *dst, const void *src, -+ unsigned u64s) -+{ -+ __memcpy_u64s(dst, src, u64s); -+} -+ -+static inline void memmove_u64s_down(void *dst, const void *src, -+ unsigned u64s) -+{ -+ EBUG_ON(dst > src); -+ -+ __memmove_u64s_down(dst, src, u64s); -+} -+ -+static inline void __memmove_u64s_up_small(void *_dst, const void *_src, -+ unsigned u64s) -+{ -+ u64 *dst = (u64 *) _dst + u64s; -+ u64 *src = (u64 *) _src + u64s; -+ -+ while (u64s--) -+ *--dst = *--src; -+} -+ -+static inline void memmove_u64s_up_small(void *dst, const void *src, -+ unsigned u64s) -+{ -+ EBUG_ON(dst < src); -+ -+ __memmove_u64s_up_small(dst, src, u64s); -+} -+ -+static inline void __memmove_u64s_up(void *_dst, const void *_src, -+ unsigned u64s) -+{ -+ u64 *dst = (u64 *) _dst + u64s - 1; -+ u64 *src = (u64 *) _src + u64s - 1; -+ -+#ifdef CONFIG_X86_64 -+ long d0, d1, d2; -+ asm volatile("std ;\n" -+ "rep ; movsq\n" -+ "cld ;\n" -+ : "=&c" (d0), "=&D" (d1), "=&S" (d2) -+ : "0" (u64s), "1" (dst), "2" (src) -+ : "memory"); -+#else -+ while (u64s--) -+ *dst-- = *src--; -+#endif -+} -+ -+static inline void memmove_u64s_up(void *dst, const void *src, -+ unsigned u64s) -+{ -+ EBUG_ON(dst < src); -+ -+ __memmove_u64s_up(dst, src, u64s); -+} -+ -+static inline void memmove_u64s(void *dst, const void *src, -+ unsigned u64s) -+{ -+ if (dst < src) -+ __memmove_u64s_down(dst, src, u64s); -+ else -+ __memmove_u64s_up(dst, src, u64s); -+} -+ -+/* Set the last few bytes up to a u64 boundary given an offset into a buffer. 
*/ -+static inline void memset_u64s_tail(void *s, int c, unsigned bytes) -+{ -+ unsigned rem = round_up(bytes, sizeof(u64)) - bytes; -+ -+ memset(s + bytes, c, rem); -+} -+ -+void sort_cmp_size(void *base, size_t num, size_t size, -+ int (*cmp_func)(const void *, const void *, size_t), -+ void (*swap_func)(void *, void *, size_t)); -+ -+/* just the memmove, doesn't update @_nr */ -+#define __array_insert_item(_array, _nr, _pos) \ -+ memmove(&(_array)[(_pos) + 1], \ -+ &(_array)[(_pos)], \ -+ sizeof((_array)[0]) * ((_nr) - (_pos))) -+ -+#define array_insert_item(_array, _nr, _pos, _new_item) \ -+do { \ -+ __array_insert_item(_array, _nr, _pos); \ -+ (_nr)++; \ -+ (_array)[(_pos)] = (_new_item); \ -+} while (0) -+ -+#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \ -+do { \ -+ (_nr) -= (_nr_to_remove); \ -+ memmove(&(_array)[(_pos)], \ -+ &(_array)[(_pos) + (_nr_to_remove)], \ -+ sizeof((_array)[0]) * ((_nr) - (_pos))); \ -+} while (0) -+ -+#define array_remove_item(_array, _nr, _pos) \ -+ array_remove_items(_array, _nr, _pos, 1) -+ -+#define bubble_sort(_base, _nr, _cmp) \ -+do { \ -+ ssize_t _i, _end; \ -+ bool _swapped = true; \ -+ \ -+ for (_end = (ssize_t) (_nr) - 1; _end > 0 && _swapped; --_end) {\ -+ _swapped = false; \ -+ for (_i = 0; _i < _end; _i++) \ -+ if (_cmp((_base)[_i], (_base)[_i + 1]) > 0) { \ -+ swap((_base)[_i], (_base)[_i + 1]); \ -+ _swapped = true; \ -+ } \ -+ } \ -+} while (0) -+ -+static inline u64 percpu_u64_get(u64 __percpu *src) -+{ -+ u64 ret = 0; -+ int cpu; -+ -+ for_each_possible_cpu(cpu) -+ ret += *per_cpu_ptr(src, cpu); -+ return ret; -+} -+ -+static inline void percpu_u64_set(u64 __percpu *dst, u64 src) -+{ -+ int cpu; -+ -+ for_each_possible_cpu(cpu) -+ *per_cpu_ptr(dst, cpu) = 0; -+ -+ preempt_disable(); -+ *this_cpu_ptr(dst) = src; -+ preempt_enable(); -+} -+ -+static inline void acc_u64s(u64 *acc, const u64 *src, unsigned nr) -+{ -+ unsigned i; -+ -+ for (i = 0; i < nr; i++) -+ acc[i] += src[i]; -+} -+ -+static 
inline void acc_u64s_percpu(u64 *acc, const u64 __percpu *src, -+ unsigned nr) -+{ -+ int cpu; -+ -+ for_each_possible_cpu(cpu) -+ acc_u64s(acc, per_cpu_ptr(src, cpu), nr); -+} -+ -+static inline void percpu_memset(void __percpu *p, int c, size_t bytes) -+{ -+ int cpu; -+ -+ for_each_possible_cpu(cpu) -+ memset(per_cpu_ptr(p, cpu), c, bytes); -+} -+ -+u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned); -+ -+#define cmp_int(l, r) ((l > r) - (l < r)) -+ -+#endif /* _BCACHEFS_UTIL_H */ -diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h -new file mode 100644 -index 000000000000..c099cdc0605f ---- /dev/null -+++ b/fs/bcachefs/vstructs.h -@@ -0,0 +1,63 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _VSTRUCTS_H -+#define _VSTRUCTS_H -+ -+#include "util.h" -+ -+/* -+ * NOTE: we can't differentiate between __le64 and u64 with type_is - this -+ * assumes u64 is little endian: -+ */ -+#define __vstruct_u64s(_s) \ -+({ \ -+ ( type_is((_s)->u64s, u64) ? le64_to_cpu((__force __le64) (_s)->u64s) \ -+ : type_is((_s)->u64s, u32) ? le32_to_cpu((__force __le32) (_s)->u64s) \ -+ : type_is((_s)->u64s, u16) ? 
le16_to_cpu((__force __le16) (_s)->u64s) \ -+ : ((__force u8) ((_s)->u64s))); \ -+}) -+ -+#define __vstruct_bytes(_type, _u64s) \ -+({ \ -+ BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64)); \ -+ \ -+ (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \ -+}) -+ -+#define vstruct_bytes(_s) \ -+ __vstruct_bytes(typeof(*(_s)), __vstruct_u64s(_s)) -+ -+#define __vstruct_blocks(_type, _sector_block_bits, _u64s) \ -+ (round_up(__vstruct_bytes(_type, _u64s), \ -+ 512 << (_sector_block_bits)) >> (9 + (_sector_block_bits))) -+ -+#define vstruct_blocks(_s, _sector_block_bits) \ -+ __vstruct_blocks(typeof(*(_s)), _sector_block_bits, __vstruct_u64s(_s)) -+ -+#define vstruct_blocks_plus(_s, _sector_block_bits, _u64s) \ -+ __vstruct_blocks(typeof(*(_s)), _sector_block_bits, \ -+ __vstruct_u64s(_s) + (_u64s)) -+ -+#define vstruct_sectors(_s, _sector_block_bits) \ -+ (round_up(vstruct_bytes(_s), 512 << (_sector_block_bits)) >> 9) -+ -+#define vstruct_next(_s) \ -+ ((typeof(_s)) ((_s)->_data + __vstruct_u64s(_s))) -+#define vstruct_last(_s) \ -+ ((typeof(&(_s)->start[0])) ((_s)->_data + __vstruct_u64s(_s))) -+#define vstruct_end(_s) \ -+ ((void *) ((_s)->_data + __vstruct_u64s(_s))) -+ -+#define vstruct_for_each(_s, _i) \ -+ for (_i = (_s)->start; \ -+ _i < vstruct_last(_s); \ -+ _i = vstruct_next(_i)) -+ -+#define vstruct_for_each_safe(_s, _i, _t) \ -+ for (_i = (_s)->start; \ -+ _i < vstruct_last(_s) && (_t = vstruct_next(_i), true); \ -+ _i = _t) -+ -+#define vstruct_idx(_s, _idx) \ -+ ((typeof(&(_s)->start[0])) ((_s)->_data + (_idx))) -+ -+#endif /* _VSTRUCTS_H */ -diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c -new file mode 100644 -index 000000000000..725a6f3ef8ce ---- /dev/null -+++ b/fs/bcachefs/xattr.c -@@ -0,0 +1,582 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "bkey_methods.h" -+#include "btree_update.h" -+#include "extents.h" -+#include "fs.h" -+#include "rebalance.h" -+#include "str_hash.h" -+#include "xattr.h" -+ -+#include 
-+#include -+#include -+ -+static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned); -+ -+static u64 bch2_xattr_hash(const struct bch_hash_info *info, -+ const struct xattr_search_key *key) -+{ -+ struct bch_str_hash_ctx ctx; -+ -+ bch2_str_hash_init(&ctx, info); -+ bch2_str_hash_update(&ctx, info, &key->type, sizeof(key->type)); -+ bch2_str_hash_update(&ctx, info, key->name.name, key->name.len); -+ -+ return bch2_str_hash_end(&ctx, info); -+} -+ -+static u64 xattr_hash_key(const struct bch_hash_info *info, const void *key) -+{ -+ return bch2_xattr_hash(info, key); -+} -+ -+static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) -+{ -+ struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k); -+ -+ return bch2_xattr_hash(info, -+ &X_SEARCH(x.v->x_type, x.v->x_name, x.v->x_name_len)); -+} -+ -+static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r) -+{ -+ struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l); -+ const struct xattr_search_key *r = _r; -+ -+ return l.v->x_type != r->type || -+ l.v->x_name_len != r->name.len || -+ memcmp(l.v->x_name, r->name.name, r->name.len); -+} -+ -+static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) -+{ -+ struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l); -+ struct bkey_s_c_xattr r = bkey_s_c_to_xattr(_r); -+ -+ return l.v->x_type != r.v->x_type || -+ l.v->x_name_len != r.v->x_name_len || -+ memcmp(l.v->x_name, r.v->x_name, r.v->x_name_len); -+} -+ -+const struct bch_hash_desc bch2_xattr_hash_desc = { -+ .btree_id = BTREE_ID_XATTRS, -+ .key_type = KEY_TYPE_xattr, -+ .hash_key = xattr_hash_key, -+ .hash_bkey = xattr_hash_bkey, -+ .cmp_key = xattr_cmp_key, -+ .cmp_bkey = xattr_cmp_bkey, -+}; -+ -+const char *bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k) -+{ -+ const struct xattr_handler *handler; -+ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); -+ -+ if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr)) -+ return "value too small"; -+ -+ if (bkey_val_u64s(k.k) < -+ 
xattr_val_u64s(xattr.v->x_name_len, -+ le16_to_cpu(xattr.v->x_val_len))) -+ return "value too small"; -+ -+ if (bkey_val_u64s(k.k) > -+ xattr_val_u64s(xattr.v->x_name_len, -+ le16_to_cpu(xattr.v->x_val_len) + 4)) -+ return "value too big"; -+ -+ handler = bch2_xattr_type_to_handler(xattr.v->x_type); -+ if (!handler) -+ return "invalid type"; -+ -+ if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len)) -+ return "xattr name has invalid characters"; -+ -+ return NULL; -+} -+ -+void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ const struct xattr_handler *handler; -+ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); -+ -+ handler = bch2_xattr_type_to_handler(xattr.v->x_type); -+ if (handler && handler->prefix) -+ pr_buf(out, "%s", handler->prefix); -+ else if (handler) -+ pr_buf(out, "(type %u)", xattr.v->x_type); -+ else -+ pr_buf(out, "(unknown type %u)", xattr.v->x_type); -+ -+ bch_scnmemcpy(out, xattr.v->x_name, -+ xattr.v->x_name_len); -+ pr_buf(out, ":"); -+ bch_scnmemcpy(out, xattr_val(xattr.v), -+ le16_to_cpu(xattr.v->x_val_len)); -+} -+ -+int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, -+ const char *name, void *buffer, size_t size, int type) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c_xattr xattr; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, -+ &inode->ei_str_hash, inode->v.i_ino, -+ &X_SEARCH(type, name, strlen(name)), -+ 0); -+ if (IS_ERR(iter)) { -+ bch2_trans_exit(&trans); -+ BUG_ON(PTR_ERR(iter) == -EINTR); -+ -+ return PTR_ERR(iter) == -ENOENT ? 
-ENODATA : PTR_ERR(iter); -+ } -+ -+ xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); -+ ret = le16_to_cpu(xattr.v->x_val_len); -+ if (buffer) { -+ if (ret > size) -+ ret = -ERANGE; -+ else -+ memcpy(buffer, xattr_val(xattr.v), ret); -+ } -+ -+ bch2_trans_exit(&trans); -+ return ret; -+} -+ -+int bch2_xattr_set(struct btree_trans *trans, u64 inum, -+ const struct bch_hash_info *hash_info, -+ const char *name, const void *value, size_t size, -+ int type, int flags) -+{ -+ int ret; -+ -+ if (value) { -+ struct bkey_i_xattr *xattr; -+ unsigned namelen = strlen(name); -+ unsigned u64s = BKEY_U64s + -+ xattr_val_u64s(namelen, size); -+ -+ if (u64s > U8_MAX) -+ return -ERANGE; -+ -+ xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); -+ if (IS_ERR(xattr)) -+ return PTR_ERR(xattr); -+ -+ bkey_xattr_init(&xattr->k_i); -+ xattr->k.u64s = u64s; -+ xattr->v.x_type = type; -+ xattr->v.x_name_len = namelen; -+ xattr->v.x_val_len = cpu_to_le16(size); -+ memcpy(xattr->v.x_name, name, namelen); -+ memcpy(xattr_val(&xattr->v), value, size); -+ -+ ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, -+ inum, &xattr->k_i, -+ (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)| -+ (flags & XATTR_REPLACE ? BCH_HASH_SET_MUST_REPLACE : 0)); -+ } else { -+ struct xattr_search_key search = -+ X_SEARCH(type, name, strlen(name)); -+ -+ ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, -+ hash_info, inum, &search); -+ } -+ -+ if (ret == -ENOENT) -+ ret = flags & XATTR_REPLACE ? 
-ENODATA : 0; -+ -+ return ret; -+} -+ -+struct xattr_buf { -+ char *buf; -+ size_t len; -+ size_t used; -+}; -+ -+static int __bch2_xattr_emit(const char *prefix, -+ const char *name, size_t name_len, -+ struct xattr_buf *buf) -+{ -+ const size_t prefix_len = strlen(prefix); -+ const size_t total_len = prefix_len + name_len + 1; -+ -+ if (buf->buf) { -+ if (buf->used + total_len > buf->len) -+ return -ERANGE; -+ -+ memcpy(buf->buf + buf->used, prefix, prefix_len); -+ memcpy(buf->buf + buf->used + prefix_len, -+ name, name_len); -+ buf->buf[buf->used + prefix_len + name_len] = '\0'; -+ } -+ -+ buf->used += total_len; -+ return 0; -+} -+ -+static int bch2_xattr_emit(struct dentry *dentry, -+ const struct bch_xattr *xattr, -+ struct xattr_buf *buf) -+{ -+ const struct xattr_handler *handler = -+ bch2_xattr_type_to_handler(xattr->x_type); -+ -+ return handler && (!handler->list || handler->list(dentry)) -+ ? __bch2_xattr_emit(handler->prefix ?: handler->name, -+ xattr->x_name, xattr->x_name_len, buf) -+ : 0; -+} -+ -+static int bch2_xattr_list_bcachefs(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct xattr_buf *buf, -+ bool all) -+{ -+ const char *prefix = all ? "bcachefs_effective." 
: "bcachefs."; -+ unsigned id; -+ int ret = 0; -+ u64 v; -+ -+ for (id = 0; id < Inode_opt_nr; id++) { -+ v = bch2_inode_opt_get(&inode->ei_inode, id); -+ if (!v) -+ continue; -+ -+ if (!all && -+ !(inode->ei_inode.bi_fields_set & (1 << id))) -+ continue; -+ -+ ret = __bch2_xattr_emit(prefix, bch2_inode_opts[id], -+ strlen(bch2_inode_opts[id]), buf); -+ if (ret) -+ break; -+ } -+ -+ return ret; -+} -+ -+ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) -+{ -+ struct bch_fs *c = dentry->d_sb->s_fs_info; -+ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct xattr_buf buf = { .buf = buffer, .len = buffer_size }; -+ u64 inum = dentry->d_inode->i_ino; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, -+ POS(inum, 0), 0, k, ret) { -+ BUG_ON(k.k->p.inode < inum); -+ -+ if (k.k->p.inode > inum) -+ break; -+ -+ if (k.k->type != KEY_TYPE_xattr) -+ continue; -+ -+ ret = bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf); -+ if (ret) -+ break; -+ } -+ ret = bch2_trans_exit(&trans) ?: ret; -+ -+ if (ret) -+ return ret; -+ -+ ret = bch2_xattr_list_bcachefs(c, inode, &buf, false); -+ if (ret) -+ return ret; -+ -+ ret = bch2_xattr_list_bcachefs(c, inode, &buf, true); -+ if (ret) -+ return ret; -+ -+ return buf.used; -+} -+ -+static int bch2_xattr_get_handler(const struct xattr_handler *handler, -+ struct dentry *dentry, struct inode *vinode, -+ const char *name, void *buffer, size_t size) -+{ -+ struct bch_inode_info *inode = to_bch_ei(vinode); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ -+ return bch2_xattr_get(c, inode, name, buffer, size, handler->flags); -+} -+ -+static int bch2_xattr_set_handler(const struct xattr_handler *handler, -+ struct dentry *dentry, struct inode *vinode, -+ const char *name, const void *value, -+ size_t size, int flags) -+{ -+ struct bch_inode_info *inode = 
to_bch_ei(vinode); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ -+ return bch2_trans_do(c, NULL, &inode->ei_journal_seq, 0, -+ bch2_xattr_set(&trans, inode->v.i_ino, -+ &inode->ei_str_hash, -+ name, value, size, -+ handler->flags, flags)); -+} -+ -+static const struct xattr_handler bch_xattr_user_handler = { -+ .prefix = XATTR_USER_PREFIX, -+ .get = bch2_xattr_get_handler, -+ .set = bch2_xattr_set_handler, -+ .flags = KEY_TYPE_XATTR_INDEX_USER, -+}; -+ -+static bool bch2_xattr_trusted_list(struct dentry *dentry) -+{ -+ return capable(CAP_SYS_ADMIN); -+} -+ -+static const struct xattr_handler bch_xattr_trusted_handler = { -+ .prefix = XATTR_TRUSTED_PREFIX, -+ .list = bch2_xattr_trusted_list, -+ .get = bch2_xattr_get_handler, -+ .set = bch2_xattr_set_handler, -+ .flags = KEY_TYPE_XATTR_INDEX_TRUSTED, -+}; -+ -+static const struct xattr_handler bch_xattr_security_handler = { -+ .prefix = XATTR_SECURITY_PREFIX, -+ .get = bch2_xattr_get_handler, -+ .set = bch2_xattr_set_handler, -+ .flags = KEY_TYPE_XATTR_INDEX_SECURITY, -+}; -+ -+#ifndef NO_BCACHEFS_FS -+ -+static int opt_to_inode_opt(int id) -+{ -+ switch (id) { -+#define x(name, ...) 
\ -+ case Opt_##name: return Inode_opt_##name; -+ BCH_INODE_OPTS() -+#undef x -+ default: -+ return -1; -+ } -+} -+ -+static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler, -+ struct dentry *dentry, struct inode *vinode, -+ const char *name, void *buffer, size_t size, -+ bool all) -+{ -+ struct bch_inode_info *inode = to_bch_ei(vinode); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch_opts opts = -+ bch2_inode_opts_to_opts(bch2_inode_opts_get(&inode->ei_inode)); -+ const struct bch_option *opt; -+ int id, inode_opt_id; -+ char buf[512]; -+ struct printbuf out = PBUF(buf); -+ unsigned val_len; -+ u64 v; -+ -+ id = bch2_opt_lookup(name); -+ if (id < 0 || !bch2_opt_is_inode_opt(id)) -+ return -EINVAL; -+ -+ inode_opt_id = opt_to_inode_opt(id); -+ if (inode_opt_id < 0) -+ return -EINVAL; -+ -+ opt = bch2_opt_table + id; -+ -+ if (!bch2_opt_defined_by_id(&opts, id)) -+ return -ENODATA; -+ -+ if (!all && -+ !(inode->ei_inode.bi_fields_set & (1 << inode_opt_id))) -+ return -ENODATA; -+ -+ v = bch2_opt_get_by_id(&opts, id); -+ bch2_opt_to_text(&out, c, opt, v, 0); -+ -+ val_len = out.pos - buf; -+ -+ if (buffer && val_len > size) -+ return -ERANGE; -+ -+ if (buffer) -+ memcpy(buffer, buf, val_len); -+ return val_len; -+} -+ -+static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler, -+ struct dentry *dentry, struct inode *vinode, -+ const char *name, void *buffer, size_t size) -+{ -+ return __bch2_xattr_bcachefs_get(handler, dentry, vinode, -+ name, buffer, size, false); -+} -+ -+struct inode_opt_set { -+ int id; -+ u64 v; -+ bool defined; -+}; -+ -+static int inode_opt_set_fn(struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, -+ void *p) -+{ -+ struct inode_opt_set *s = p; -+ -+ if (s->defined) -+ bi->bi_fields_set |= 1U << s->id; -+ else -+ bi->bi_fields_set &= ~(1U << s->id); -+ -+ bch2_inode_opt_set(bi, s->id, s->v); -+ -+ return 0; -+} -+ -+static int bch2_xattr_bcachefs_set(const struct xattr_handler 
*handler, -+ struct dentry *dentry, struct inode *vinode, -+ const char *name, const void *value, -+ size_t size, int flags) -+{ -+ struct bch_inode_info *inode = to_bch_ei(vinode); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ const struct bch_option *opt; -+ char *buf; -+ struct inode_opt_set s; -+ int opt_id, inode_opt_id, ret; -+ -+ opt_id = bch2_opt_lookup(name); -+ if (opt_id < 0) -+ return -EINVAL; -+ -+ opt = bch2_opt_table + opt_id; -+ -+ inode_opt_id = opt_to_inode_opt(opt_id); -+ if (inode_opt_id < 0) -+ return -EINVAL; -+ -+ s.id = inode_opt_id; -+ -+ if (value) { -+ u64 v = 0; -+ -+ buf = kmalloc(size + 1, GFP_KERNEL); -+ if (!buf) -+ return -ENOMEM; -+ memcpy(buf, value, size); -+ buf[size] = '\0'; -+ -+ ret = bch2_opt_parse(c, opt, buf, &v); -+ kfree(buf); -+ -+ if (ret < 0) -+ return ret; -+ -+ ret = bch2_opt_check_may_set(c, opt_id, v); -+ if (ret < 0) -+ return ret; -+ -+ s.v = v + 1; -+ s.defined = true; -+ } else { -+ if (!IS_ROOT(dentry)) { -+ struct bch_inode_info *dir = -+ to_bch_ei(d_inode(dentry->d_parent)); -+ -+ s.v = bch2_inode_opt_get(&dir->ei_inode, inode_opt_id); -+ } else { -+ s.v = 0; -+ } -+ -+ s.defined = false; -+ } -+ -+ mutex_lock(&inode->ei_update_lock); -+ if (inode_opt_id == Inode_opt_project) { -+ ret = bch2_set_projid(c, inode, s.v); -+ if (ret) -+ goto err; -+ } -+ -+ ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0); -+err: -+ mutex_unlock(&inode->ei_update_lock); -+ -+ if (value && -+ (opt_id == Opt_background_compression || -+ opt_id == Opt_background_target)) -+ bch2_rebalance_add_work(c, inode->v.i_blocks); -+ -+ return ret; -+} -+ -+static const struct xattr_handler bch_xattr_bcachefs_handler = { -+ .prefix = "bcachefs.", -+ .get = bch2_xattr_bcachefs_get, -+ .set = bch2_xattr_bcachefs_set, -+}; -+ -+static int bch2_xattr_bcachefs_get_effective( -+ const struct xattr_handler *handler, -+ struct dentry *dentry, struct inode *vinode, -+ const char *name, void *buffer, size_t size) -+{ -+ return 
__bch2_xattr_bcachefs_get(handler, dentry, vinode, -+ name, buffer, size, true); -+} -+ -+static const struct xattr_handler bch_xattr_bcachefs_effective_handler = { -+ .prefix = "bcachefs_effective.", -+ .get = bch2_xattr_bcachefs_get_effective, -+ .set = bch2_xattr_bcachefs_set, -+}; -+ -+#endif /* NO_BCACHEFS_FS */ -+ -+const struct xattr_handler *bch2_xattr_handlers[] = { -+ &bch_xattr_user_handler, -+ &posix_acl_access_xattr_handler, -+ &posix_acl_default_xattr_handler, -+ &bch_xattr_trusted_handler, -+ &bch_xattr_security_handler, -+#ifndef NO_BCACHEFS_FS -+ &bch_xattr_bcachefs_handler, -+ &bch_xattr_bcachefs_effective_handler, -+#endif -+ NULL -+}; -+ -+static const struct xattr_handler *bch_xattr_handler_map[] = { -+ [KEY_TYPE_XATTR_INDEX_USER] = &bch_xattr_user_handler, -+ [KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS] = -+ &posix_acl_access_xattr_handler, -+ [KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT] = -+ &posix_acl_default_xattr_handler, -+ [KEY_TYPE_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler, -+ [KEY_TYPE_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler, -+}; -+ -+static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned type) -+{ -+ return type < ARRAY_SIZE(bch_xattr_handler_map) -+ ? 
bch_xattr_handler_map[type] -+ : NULL; -+} -diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h -new file mode 100644 -index 000000000000..4151065ab853 ---- /dev/null -+++ b/fs/bcachefs/xattr.h -@@ -0,0 +1,49 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_XATTR_H -+#define _BCACHEFS_XATTR_H -+ -+#include "str_hash.h" -+ -+extern const struct bch_hash_desc bch2_xattr_hash_desc; -+ -+const char *bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c); -+void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+ -+#define bch2_bkey_ops_xattr (struct bkey_ops) { \ -+ .key_invalid = bch2_xattr_invalid, \ -+ .val_to_text = bch2_xattr_to_text, \ -+} -+ -+static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len) -+{ -+ return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name) + -+ name_len + val_len, sizeof(u64)); -+} -+ -+#define xattr_val(_xattr) \ -+ ((void *) (_xattr)->x_name + (_xattr)->x_name_len) -+ -+struct xattr_search_key { -+ u8 type; -+ struct qstr name; -+}; -+ -+#define X_SEARCH(_type, _name, _len) ((struct xattr_search_key) \ -+ { .type = _type, .name = QSTR_INIT(_name, _len) }) -+ -+struct dentry; -+struct xattr_handler; -+struct bch_hash_info; -+struct bch_inode_info; -+ -+int bch2_xattr_get(struct bch_fs *, struct bch_inode_info *, -+ const char *, void *, size_t, int); -+ -+int bch2_xattr_set(struct btree_trans *, u64, const struct bch_hash_info *, -+ const char *, const void *, size_t, int, int); -+ -+ssize_t bch2_xattr_list(struct dentry *, char *, size_t); -+ -+extern const struct xattr_handler *bch2_xattr_handlers[]; -+ -+#endif /* _BCACHEFS_XATTR_H */ -diff --git a/fs/cifs/file.c b/fs/cifs/file.c -index 75ddce8ef456..31d4aff3bbe5 100644 ---- a/fs/cifs/file.c -+++ b/fs/cifs/file.c -@@ -4299,20 +4299,12 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list, - - page = lru_to_page(page_list); - -- /* -- * Lock the page and put it in the cache. 
Since no one else -- * should have access to this page, we're safe to simply set -- * PG_locked without checking it first. -- */ -- __SetPageLocked(page); -- rc = add_to_page_cache_locked(page, mapping, -- page->index, gfp); -+ rc = add_to_page_cache(page, mapping, -+ page->index, gfp); - - /* give up if we can't stick it in the cache */ -- if (rc) { -- __ClearPageLocked(page); -+ if (rc) - return rc; -- } - - /* move first page to the tmplist */ - *offset = (loff_t)page->index << PAGE_SHIFT; -@@ -4331,11 +4323,8 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list, - if (*bytes + PAGE_SIZE > rsize) - break; - -- __SetPageLocked(page); -- if (add_to_page_cache_locked(page, mapping, page->index, gfp)) { -- __ClearPageLocked(page); -+ if (add_to_page_cache(page, mapping, page->index, gfp)) - break; -- } - list_move_tail(&page->lru, tmplist); - (*bytes) += PAGE_SIZE; - expected_index++; -diff --git a/fs/dcache.c b/fs/dcache.c -index b280e07e162b..7a73f5bf9c76 100644 ---- a/fs/dcache.c -+++ b/fs/dcache.c -@@ -3113,9 +3113,8 @@ void d_genocide(struct dentry *parent) - - EXPORT_SYMBOL(d_genocide); - --void d_tmpfile(struct dentry *dentry, struct inode *inode) -+void d_mark_tmpfile(struct dentry *dentry, struct inode *inode) - { -- inode_dec_link_count(inode); - BUG_ON(dentry->d_name.name != dentry->d_iname || - !hlist_unhashed(&dentry->d_u.d_alias) || - !d_unlinked(dentry)); -@@ -3125,6 +3124,13 @@ void d_tmpfile(struct dentry *dentry, struct inode *inode) - (unsigned long long)inode->i_ino); - spin_unlock(&dentry->d_lock); - spin_unlock(&dentry->d_parent->d_lock); -+} -+EXPORT_SYMBOL(d_mark_tmpfile); -+ -+void d_tmpfile(struct dentry *dentry, struct inode *inode) -+{ -+ inode_dec_link_count(inode); -+ d_mark_tmpfile(dentry, inode); - d_instantiate(dentry, inode); - } - EXPORT_SYMBOL(d_tmpfile); -diff --git a/fs/inode.c b/fs/inode.c -index 93d9252a00ab..f2b6d24f3456 100644 ---- a/fs/inode.c -+++ b/fs/inode.c -@@ -1503,6 +1503,46 @@ int 
insert_inode_locked(struct inode *inode) - } - EXPORT_SYMBOL(insert_inode_locked); - -+struct inode *insert_inode_locked2(struct inode *inode) -+{ -+ struct super_block *sb = inode->i_sb; -+ ino_t ino = inode->i_ino; -+ struct hlist_head *head = inode_hashtable + hash(sb, ino); -+ -+ while (1) { -+ struct inode *old = NULL; -+ spin_lock(&inode_hash_lock); -+ hlist_for_each_entry(old, head, i_hash) { -+ if (old->i_ino != ino) -+ continue; -+ if (old->i_sb != sb) -+ continue; -+ spin_lock(&old->i_lock); -+ if (old->i_state & (I_FREEING|I_WILL_FREE)) { -+ spin_unlock(&old->i_lock); -+ continue; -+ } -+ break; -+ } -+ if (likely(!old)) { -+ spin_lock(&inode->i_lock); -+ inode->i_state |= I_NEW | I_CREATING; -+ hlist_add_head(&inode->i_hash, head); -+ spin_unlock(&inode->i_lock); -+ spin_unlock(&inode_hash_lock); -+ return NULL; -+ } -+ __iget(old); -+ spin_unlock(&old->i_lock); -+ spin_unlock(&inode_hash_lock); -+ wait_on_inode(old); -+ if (unlikely(!inode_unhashed(old))) -+ return old; -+ iput(old); -+ } -+} -+EXPORT_SYMBOL(insert_inode_locked2); -+ - int insert_inode_locked4(struct inode *inode, unsigned long hashval, - int (*test)(struct inode *, void *), void *data) - { -diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h -index 71e387a5fe90..e916f046fed4 100644 ---- a/include/asm-generic/vmlinux.lds.h -+++ b/include/asm-generic/vmlinux.lds.h -@@ -323,6 +323,10 @@ - __start___verbose = .; \ - KEEP(*(__verbose)) \ - __stop___verbose = .; \ -+ . 
= ALIGN(8); \ -+ __start___faults = .; \ -+ *(__faults) \ -+ __stop___faults = .; \ - LIKELY_PROFILE() \ - BRANCH_PROFILE() \ - TRACE_PRINTKS() \ -diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h -index 32868fbedc9e..2979f9082a98 100644 ---- a/include/linux/blkdev.h -+++ b/include/linux/blkdev.h -@@ -889,6 +889,7 @@ extern const char *blk_op_str(unsigned int op); - - int blk_status_to_errno(blk_status_t status); - blk_status_t errno_to_blk_status(int errno); -+const char *blk_status_to_str(blk_status_t status); - - int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin); - -diff --git a/include/linux/closure.h b/include/linux/closure.h -new file mode 100644 -index 000000000000..abacb91c3565 ---- /dev/null -+++ b/include/linux/closure.h -@@ -0,0 +1,404 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _LINUX_CLOSURE_H -+#define _LINUX_CLOSURE_H -+ -+#include -+#include -+#include -+#include -+ -+/* -+ * Closure is perhaps the most overused and abused term in computer science, but -+ * since I've been unable to come up with anything better you're stuck with it -+ * again. -+ * -+ * What are closures? -+ * -+ * They embed a refcount. The basic idea is they count "things that are in -+ * progress" - in flight bios, some other thread that's doing something else - -+ * anything you might want to wait on. -+ * -+ * The refcount may be manipulated with closure_get() and closure_put(). -+ * closure_put() is where many of the interesting things happen, when it causes -+ * the refcount to go to 0. -+ * -+ * Closures can be used to wait on things both synchronously and asynchronously, -+ * and synchronous and asynchronous use can be mixed without restriction. To -+ * wait synchronously, use closure_sync() - you will sleep until your closure's -+ * refcount hits 1. 
-+ * -+ * To wait asynchronously, use -+ * continue_at(cl, next_function, workqueue); -+ * -+ * passing it, as you might expect, the function to run when nothing is pending -+ * and the workqueue to run that function out of. -+ * -+ * continue_at() also, critically, requires a 'return' immediately following the -+ * location where this macro is referenced, to return to the calling function. -+ * There's good reason for this. -+ * -+ * To use safely closures asynchronously, they must always have a refcount while -+ * they are running owned by the thread that is running them. Otherwise, suppose -+ * you submit some bios and wish to have a function run when they all complete: -+ * -+ * foo_endio(struct bio *bio) -+ * { -+ * closure_put(cl); -+ * } -+ * -+ * closure_init(cl); -+ * -+ * do_stuff(); -+ * closure_get(cl); -+ * bio1->bi_endio = foo_endio; -+ * bio_submit(bio1); -+ * -+ * do_more_stuff(); -+ * closure_get(cl); -+ * bio2->bi_endio = foo_endio; -+ * bio_submit(bio2); -+ * -+ * continue_at(cl, complete_some_read, system_wq); -+ * -+ * If closure's refcount started at 0, complete_some_read() could run before the -+ * second bio was submitted - which is almost always not what you want! More -+ * importantly, it wouldn't be possible to say whether the original thread or -+ * complete_some_read()'s thread owned the closure - and whatever state it was -+ * associated with! -+ * -+ * So, closure_init() initializes a closure's refcount to 1 - and when a -+ * closure_fn is run, the refcount will be reset to 1 first. -+ * -+ * Then, the rule is - if you got the refcount with closure_get(), release it -+ * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount -+ * on a closure because you called closure_init() or you were run out of a -+ * closure - _always_ use continue_at(). Doing so consistently will help -+ * eliminate an entire class of particularly pernicious races. 
-+ * -+ * Lastly, you might have a wait list dedicated to a specific event, and have no -+ * need for specifying the condition - you just want to wait until someone runs -+ * closure_wake_up() on the appropriate wait list. In that case, just use -+ * closure_wait(). It will return either true or false, depending on whether the -+ * closure was already on a wait list or not - a closure can only be on one wait -+ * list at a time. -+ * -+ * Parents: -+ * -+ * closure_init() takes two arguments - it takes the closure to initialize, and -+ * a (possibly null) parent. -+ * -+ * If parent is non null, the new closure will have a refcount for its lifetime; -+ * a closure is considered to be "finished" when its refcount hits 0 and the -+ * function to run is null. Hence -+ * -+ * continue_at(cl, NULL, NULL); -+ * -+ * returns up the (spaghetti) stack of closures, precisely like normal return -+ * returns up the C stack. continue_at() with non null fn is better thought of -+ * as doing a tail call. -+ * -+ * All this implies that a closure should typically be embedded in a particular -+ * struct (which its refcount will normally control the lifetime of), and that -+ * struct can very much be thought of as a stack frame. -+ */ -+ -+struct closure; -+struct closure_syncer; -+typedef void (closure_fn) (struct closure *); -+extern struct dentry *bcache_debug; -+ -+struct closure_waitlist { -+ struct llist_head list; -+}; -+ -+enum closure_state { -+ /* -+ * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by -+ * the thread that owns the closure, and cleared by the thread that's -+ * waking up the closure. -+ * -+ * The rest are for debugging and don't affect behaviour: -+ * -+ * CLOSURE_RUNNING: Set when a closure is running (i.e. by -+ * closure_init() and when closure_put() runs then next function), and -+ * must be cleared before remaining hits 0. Primarily to help guard -+ * against incorrect usage and accidentally transferring references. 
-+ * continue_at() and closure_return() clear it for you, if you're doing -+ * something unusual you can use closure_set_dead() which also helps -+ * annotate where references are being transferred. -+ */ -+ -+ CLOSURE_BITS_START = (1U << 26), -+ CLOSURE_DESTRUCTOR = (1U << 26), -+ CLOSURE_WAITING = (1U << 28), -+ CLOSURE_RUNNING = (1U << 30), -+}; -+ -+#define CLOSURE_GUARD_MASK \ -+ ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1) -+ -+#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) -+#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) -+ -+struct closure { -+ union { -+ struct { -+ struct workqueue_struct *wq; -+ struct closure_syncer *s; -+ struct llist_node list; -+ closure_fn *fn; -+ }; -+ struct work_struct work; -+ }; -+ -+ struct closure *parent; -+ -+ atomic_t remaining; -+ -+#ifdef CONFIG_DEBUG_CLOSURES -+#define CLOSURE_MAGIC_DEAD 0xc054dead -+#define CLOSURE_MAGIC_ALIVE 0xc054a11e -+ -+ unsigned int magic; -+ struct list_head all; -+ unsigned long ip; -+ unsigned long waiting_on; -+#endif -+}; -+ -+void closure_sub(struct closure *cl, int v); -+void closure_put(struct closure *cl); -+void __closure_wake_up(struct closure_waitlist *list); -+bool closure_wait(struct closure_waitlist *list, struct closure *cl); -+void __closure_sync(struct closure *cl); -+ -+/** -+ * closure_sync - sleep until a closure a closure has nothing left to wait on -+ * -+ * Sleeps until the refcount hits 1 - the thread that's running the closure owns -+ * the last refcount. 
-+ */ -+static inline void closure_sync(struct closure *cl) -+{ -+ if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1) -+ __closure_sync(cl); -+} -+ -+#ifdef CONFIG_DEBUG_CLOSURES -+ -+void closure_debug_create(struct closure *cl); -+void closure_debug_destroy(struct closure *cl); -+ -+#else -+ -+static inline void closure_debug_create(struct closure *cl) {} -+static inline void closure_debug_destroy(struct closure *cl) {} -+ -+#endif -+ -+static inline void closure_set_ip(struct closure *cl) -+{ -+#ifdef CONFIG_DEBUG_CLOSURES -+ cl->ip = _THIS_IP_; -+#endif -+} -+ -+static inline void closure_set_ret_ip(struct closure *cl) -+{ -+#ifdef CONFIG_DEBUG_CLOSURES -+ cl->ip = _RET_IP_; -+#endif -+} -+ -+static inline void closure_set_waiting(struct closure *cl, unsigned long f) -+{ -+#ifdef CONFIG_DEBUG_CLOSURES -+ cl->waiting_on = f; -+#endif -+} -+ -+static inline void closure_set_stopped(struct closure *cl) -+{ -+ atomic_sub(CLOSURE_RUNNING, &cl->remaining); -+} -+ -+static inline void set_closure_fn(struct closure *cl, closure_fn *fn, -+ struct workqueue_struct *wq) -+{ -+ closure_set_ip(cl); -+ cl->fn = fn; -+ cl->wq = wq; -+ /* between atomic_dec() in closure_put() */ -+ smp_mb__before_atomic(); -+} -+ -+static inline void closure_queue(struct closure *cl) -+{ -+ struct workqueue_struct *wq = cl->wq; -+ /** -+ * Changes made to closure, work_struct, or a couple of other structs -+ * may cause work.func not pointing to the right location. 
-+ */ -+ BUILD_BUG_ON(offsetof(struct closure, fn) -+ != offsetof(struct work_struct, func)); -+ -+ if (wq) { -+ INIT_WORK(&cl->work, cl->work.func); -+ queue_work(wq, &cl->work); -+ } else -+ cl->fn(cl); -+} -+ -+/** -+ * closure_get - increment a closure's refcount -+ */ -+static inline void closure_get(struct closure *cl) -+{ -+#ifdef CONFIG_DEBUG_CLOSURES -+ BUG_ON((atomic_inc_return(&cl->remaining) & -+ CLOSURE_REMAINING_MASK) <= 1); -+#else -+ atomic_inc(&cl->remaining); -+#endif -+} -+ -+/** -+ * closure_init - Initialize a closure, setting the refcount to 1 -+ * @cl: closure to initialize -+ * @parent: parent of the new closure. cl will take a refcount on it for its -+ * lifetime; may be NULL. -+ */ -+static inline void closure_init(struct closure *cl, struct closure *parent) -+{ -+ cl->fn = NULL; -+ cl->parent = parent; -+ if (parent) -+ closure_get(parent); -+ -+ atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); -+ -+ closure_debug_create(cl); -+ closure_set_ip(cl); -+} -+ -+static inline void closure_init_stack(struct closure *cl) -+{ -+ memset(cl, 0, sizeof(struct closure)); -+ atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); -+} -+ -+/** -+ * closure_wake_up - wake up all closures on a wait list, -+ * with memory barrier -+ */ -+static inline void closure_wake_up(struct closure_waitlist *list) -+{ -+ /* Memory barrier for the wait list */ -+ smp_mb(); -+ __closure_wake_up(list); -+} -+ -+/** -+ * continue_at - jump to another function with barrier -+ * -+ * After @cl is no longer waiting on anything (i.e. all outstanding refs have -+ * been dropped with closure_put()), it will resume execution at @fn running out -+ * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly). -+ * -+ * This is because after calling continue_at() you no longer have a ref on @cl, -+ * and whatever @cl owns may be freed out from under you - a running closure fn -+ * has a ref on its own closure which continue_at() drops. 
-+ * -+ * Note you are expected to immediately return after using this macro. -+ */ -+#define continue_at(_cl, _fn, _wq) \ -+do { \ -+ set_closure_fn(_cl, _fn, _wq); \ -+ closure_sub(_cl, CLOSURE_RUNNING + 1); \ -+} while (0) -+ -+/** -+ * closure_return - finish execution of a closure -+ * -+ * This is used to indicate that @cl is finished: when all outstanding refs on -+ * @cl have been dropped @cl's ref on its parent closure (as passed to -+ * closure_init()) will be dropped, if one was specified - thus this can be -+ * thought of as returning to the parent closure. -+ */ -+#define closure_return(_cl) continue_at((_cl), NULL, NULL) -+ -+/** -+ * continue_at_nobarrier - jump to another function without barrier -+ * -+ * Causes @fn to be executed out of @cl, in @wq context (or called directly if -+ * @wq is NULL). -+ * -+ * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn, -+ * thus it's not safe to touch anything protected by @cl after a -+ * continue_at_nobarrier(). -+ */ -+#define continue_at_nobarrier(_cl, _fn, _wq) \ -+do { \ -+ closure_set_ip(_cl); \ -+ if (_wq) { \ -+ INIT_WORK(&(_cl)->work, (void *) _fn); \ -+ queue_work((_wq), &(_cl)->work); \ -+ } else { \ -+ (_fn)(_cl); \ -+ } \ -+} while (0) -+ -+/** -+ * closure_return_with_destructor - finish execution of a closure, -+ * with destructor -+ * -+ * Works like closure_return(), except @destructor will be called when all -+ * outstanding refs on @cl have been dropped; @destructor may be used to safely -+ * free the memory occupied by @cl, and it is called with the ref on the parent -+ * closure still held - so @destructor could safely return an item to a -+ * freelist protected by @cl's parent. 
-+ */ -+#define closure_return_with_destructor(_cl, _destructor) \ -+do { \ -+ set_closure_fn(_cl, _destructor, NULL); \ -+ closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \ -+} while (0) -+ -+/** -+ * closure_call - execute @fn out of a new, uninitialized closure -+ * -+ * Typically used when running out of one closure, and we want to run @fn -+ * asynchronously out of a new closure - @parent will then wait for @cl to -+ * finish. -+ */ -+static inline void closure_call(struct closure *cl, closure_fn fn, -+ struct workqueue_struct *wq, -+ struct closure *parent) -+{ -+ closure_init(cl, parent); -+ continue_at_nobarrier(cl, fn, wq); -+} -+ -+#define __closure_wait_event(waitlist, _cond) \ -+do { \ -+ struct closure cl; \ -+ \ -+ closure_init_stack(&cl); \ -+ \ -+ while (1) { \ -+ closure_wait(waitlist, &cl); \ -+ if (_cond) \ -+ break; \ -+ closure_sync(&cl); \ -+ } \ -+ closure_wake_up(waitlist); \ -+ closure_sync(&cl); \ -+} while (0) -+ -+#define closure_wait_event(waitlist, _cond) \ -+do { \ -+ if (!(_cond)) \ -+ __closure_wait_event(waitlist, _cond); \ -+} while (0) -+ -+#endif /* _LINUX_CLOSURE_H */ -diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h -index cdf016596659..d3ab422fd4bf 100644 ---- a/include/linux/compiler_attributes.h -+++ b/include/linux/compiler_attributes.h -@@ -270,4 +270,9 @@ - */ - #define __weak __attribute__((__weak__)) - -+/* -+ * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-flatten-function-attribute -+ */ -+#define __flatten __attribute__((flatten)) -+ - #endif /* __LINUX_COMPILER_ATTRIBUTES_H */ -diff --git a/include/linux/dcache.h b/include/linux/dcache.h -index c1488cc84fd9..3d6c4102ecc1 100644 ---- a/include/linux/dcache.h -+++ b/include/linux/dcache.h -@@ -254,6 +254,7 @@ extern struct dentry * d_make_root(struct inode *); - /* - the ramfs-type tree */ - extern void d_genocide(struct dentry *); - -+extern void d_mark_tmpfile(struct dentry *, struct 
inode *); - extern void d_tmpfile(struct dentry *, struct inode *); - - extern struct dentry *d_find_alias(struct inode *); -diff --git a/include/linux/dynamic_fault.h b/include/linux/dynamic_fault.h -new file mode 100644 -index 000000000000..6e7bb56ae8b4 ---- /dev/null -+++ b/include/linux/dynamic_fault.h -@@ -0,0 +1,117 @@ -+#ifndef _DYNAMIC_FAULT_H -+#define _DYNAMIC_FAULT_H -+ -+#include -+#include -+#include -+ -+enum dfault_enabled { -+ DFAULT_DISABLED, -+ DFAULT_ENABLED, -+ DFAULT_ONESHOT, -+}; -+ -+union dfault_state { -+ struct { -+ unsigned enabled:2; -+ unsigned count:30; -+ }; -+ -+ struct { -+ unsigned v; -+ }; -+}; -+ -+/* -+ * An instance of this structure is created in a special -+ * ELF section at every dynamic fault callsite. At runtime, -+ * the special section is treated as an array of these. -+ */ -+struct _dfault { -+ const char *modname; -+ const char *function; -+ const char *filename; -+ const char *class; -+ -+ const u16 line; -+ -+ unsigned frequency; -+ union dfault_state state; -+ -+ struct static_key enabled; -+} __aligned(8); -+ -+ -+#ifdef CONFIG_DYNAMIC_FAULT -+ -+int dfault_add_module(struct _dfault *tab, unsigned int n, const char *mod); -+int dfault_remove_module(char *mod_name); -+bool __dynamic_fault_enabled(struct _dfault *); -+ -+#define dynamic_fault(_class) \ -+({ \ -+ static struct _dfault descriptor \ -+ __used __aligned(8) __attribute__((section("__faults"))) = { \ -+ .modname = KBUILD_MODNAME, \ -+ .function = __func__, \ -+ .filename = __FILE__, \ -+ .line = __LINE__, \ -+ .class = _class, \ -+ }; \ -+ \ -+ static_key_false(&descriptor.enabled) && \ -+ __dynamic_fault_enabled(&descriptor); \ -+}) -+ -+#define memory_fault() dynamic_fault("memory") -+#define race_fault() dynamic_fault("race") -+ -+#define kmalloc(...) \ -+ (memory_fault() ? NULL : kmalloc(__VA_ARGS__)) -+#define kzalloc(...) \ -+ (memory_fault() ? NULL : kzalloc(__VA_ARGS__)) -+#define krealloc(...) \ -+ (memory_fault() ? 
NULL : krealloc(__VA_ARGS__)) -+ -+#define mempool_alloc(pool, gfp_mask) \ -+ ((!gfpflags_allow_blocking(gfp_mask) && memory_fault()) \ -+ ? NULL : mempool_alloc(pool, gfp_mask)) -+ -+#define __get_free_pages(...) \ -+ (memory_fault() ? 0 : __get_free_pages(__VA_ARGS__)) -+#define alloc_pages_node(...) \ -+ (memory_fault() ? NULL : alloc_pages_node(__VA_ARGS__)) -+#define alloc_pages_nodemask(...) \ -+ (memory_fault() ? NULL : alloc_pages_nodemask(__VA_ARGS__)) -+ -+#define bio_alloc_bioset(gfp_mask, ...) \ -+ ((!gfpflags_allow_blocking(gfp_mask) && memory_fault()) \ -+ ? NULL : bio_alloc_bioset(gfp_mask, __VA_ARGS__)) -+ -+#define bio_clone(bio, gfp_mask) \ -+ ((!gfpflags_allow_blocking(gfp_mask) && memory_fault()) \ -+ ? NULL : bio_clone(bio, gfp_mask)) -+ -+#define bio_clone_bioset(bio, gfp_mask, bs) \ -+ ((!gfpflags_allow_blocking(gfp_mask) && memory_fault()) \ -+ ? NULL : bio_clone_bioset(bio, gfp_mask, bs)) -+ -+#define bio_kmalloc(...) \ -+ (memory_fault() ? NULL : bio_kmalloc(__VA_ARGS__)) -+#define bio_clone_kmalloc(...) \ -+ (memory_fault() ? NULL : bio_clone_kmalloc(__VA_ARGS__)) -+ -+#define bio_iov_iter_get_pages(...) \ -+ (memory_fault() ? 
-ENOMEM : bio_iov_iter_get_pages(__VA_ARGS__)) -+ -+#else /* CONFIG_DYNAMIC_FAULT */ -+ -+#define dfault_add_module(tab, n, modname) 0 -+#define dfault_remove_module(mod) 0 -+#define dynamic_fault(_class) 0 -+#define memory_fault() 0 -+#define race_fault() 0 -+ -+#endif /* CONFIG_DYNAMIC_FAULT */ -+ -+#endif -diff --git a/include/linux/fs.h b/include/linux/fs.h -index 45cc10cdf6dd..51f2268a3eaa 100644 ---- a/include/linux/fs.h -+++ b/include/linux/fs.h -@@ -3072,6 +3072,7 @@ extern struct inode *find_inode_nowait(struct super_block *, - void *data); - extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *); - extern int insert_inode_locked(struct inode *); -+extern struct inode *insert_inode_locked2(struct inode *); - #ifdef CONFIG_DEBUG_LOCK_ALLOC - extern void lockdep_annotate_inode_mutex_key(struct inode *inode); - #else -diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h -index a8f7bd8ea1c6..2b41ba4377ec 100644 ---- a/include/linux/pagemap.h -+++ b/include/linux/pagemap.h -@@ -605,32 +605,21 @@ static inline int fault_in_pages_readable(const char __user *uaddr, int size) - return 0; - } - --int add_to_page_cache_locked(struct page *page, struct address_space *mapping, -- pgoff_t index, gfp_t gfp_mask); -+int add_to_page_cache(struct page *page, struct address_space *mapping, -+ pgoff_t index, gfp_t gfp_mask); - int add_to_page_cache_lru(struct page *page, struct address_space *mapping, - pgoff_t index, gfp_t gfp_mask); -+int add_to_page_cache_lru_vec(struct address_space *mapping, -+ struct page **pages, -+ unsigned nr_pages, -+ pgoff_t offset, gfp_t gfp_mask); -+ - extern void delete_from_page_cache(struct page *page); - extern void __delete_from_page_cache(struct page *page, void *shadow); - int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); - void delete_from_page_cache_batch(struct address_space *mapping, - struct pagevec *pvec); - --/* -- * Like 
add_to_page_cache_locked, but used to add newly allocated pages: -- * the page is new, so we can just run __SetPageLocked() against it. -- */ --static inline int add_to_page_cache(struct page *page, -- struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) --{ -- int error; -- -- __SetPageLocked(page); -- error = add_to_page_cache_locked(page, mapping, offset, gfp_mask); -- if (unlikely(error)) -- __ClearPageLocked(page); -- return error; --} -- - static inline unsigned long dir_pages(struct inode *inode) - { - return (unsigned long)(inode->i_size + PAGE_SIZE - 1) >> -diff --git a/include/linux/sched.h b/include/linux/sched.h -index 4418f5cb8324..3f99f17a095b 100644 ---- a/include/linux/sched.h -+++ b/include/linux/sched.h -@@ -45,6 +45,7 @@ struct io_context; - struct mempolicy; - struct nameidata; - struct nsproxy; -+struct pagecache_lock; - struct perf_event_context; - struct pid_namespace; - struct pipe_inode_info; -@@ -734,6 +735,7 @@ struct task_struct { - - struct mm_struct *mm; - struct mm_struct *active_mm; -+ struct address_space *faults_disabled_mapping; - - /* Per-thread vma caching: */ - struct vmacache vmacache; -diff --git a/include/linux/six.h b/include/linux/six.h -new file mode 100644 -index 000000000000..a16e94f482e9 ---- /dev/null -+++ b/include/linux/six.h -@@ -0,0 +1,197 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+ -+#ifndef _LINUX_SIX_H -+#define _LINUX_SIX_H -+ -+/* -+ * Shared/intent/exclusive locks: sleepable read/write locks, much like rw -+ * semaphores, except with a third intermediate state, intent. 
Basic operations -+ * are: -+ * -+ * six_lock_read(&foo->lock); -+ * six_unlock_read(&foo->lock); -+ * -+ * six_lock_intent(&foo->lock); -+ * six_unlock_intent(&foo->lock); -+ * -+ * six_lock_write(&foo->lock); -+ * six_unlock_write(&foo->lock); -+ * -+ * Intent locks block other intent locks, but do not block read locks, and you -+ * must have an intent lock held before taking a write lock, like so: -+ * -+ * six_lock_intent(&foo->lock); -+ * six_lock_write(&foo->lock); -+ * six_unlock_write(&foo->lock); -+ * six_unlock_intent(&foo->lock); -+ * -+ * Other operations: -+ * -+ * six_trylock_read() -+ * six_trylock_intent() -+ * six_trylock_write() -+ * -+ * six_lock_downgrade(): convert from intent to read -+ * six_lock_tryupgrade(): attempt to convert from read to intent -+ * -+ * Locks also embed a sequence number, which is incremented when the lock is -+ * locked or unlocked for write. The current sequence number can be grabbed -+ * while a lock is held from lock->state.seq; then, if you drop the lock you can -+ * use six_relock_(read|intent_write)(lock, seq) to attempt to retake the lock -+ * iff it hasn't been locked for write in the meantime. -+ * -+ * There are also operations that take the lock type as a parameter, where the -+ * type is one of SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write: -+ * -+ * six_lock_type(lock, type) -+ * six_unlock_type(lock, type) -+ * six_relock(lock, type, seq) -+ * six_trylock_type(lock, type) -+ * six_trylock_convert(lock, from, to) -+ * -+ * A lock may be held multiple types by the same thread (for read or intent, -+ * not write). However, the six locks code does _not_ implement the actual -+ * recursive checks itself though - rather, if your code (e.g. 
btree iterator -+ * code) knows that the current thread already has a lock held, and for the -+ * correct type, six_lock_increment() may be used to bump up the counter for -+ * that type - the only effect is that one more call to unlock will be required -+ * before the lock is unlocked. -+ */ -+ -+#include -+#include -+#include -+#include -+ -+#define SIX_LOCK_SEPARATE_LOCKFNS -+ -+union six_lock_state { -+ struct { -+ atomic64_t counter; -+ }; -+ -+ struct { -+ u64 v; -+ }; -+ -+ struct { -+ /* for waitlist_bitnr() */ -+ unsigned long l; -+ }; -+ -+ struct { -+ unsigned read_lock:28; -+ unsigned intent_lock:1; -+ unsigned waiters:3; -+ /* -+ * seq works much like in seqlocks: it's incremented every time -+ * we lock and unlock for write. -+ * -+ * If it's odd write lock is held, even unlocked. -+ * -+ * Thus readers can unlock, and then lock again later iff it -+ * hasn't been modified in the meantime. -+ */ -+ u32 seq; -+ }; -+}; -+ -+enum six_lock_type { -+ SIX_LOCK_read, -+ SIX_LOCK_intent, -+ SIX_LOCK_write, -+}; -+ -+struct six_lock { -+ union six_lock_state state; -+ unsigned intent_lock_recurse; -+ struct task_struct *owner; -+ struct optimistic_spin_queue osq; -+ -+ raw_spinlock_t wait_lock; -+ struct list_head wait_list[2]; -+#ifdef CONFIG_DEBUG_LOCK_ALLOC -+ struct lockdep_map dep_map; -+#endif -+}; -+ -+typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *); -+ -+static __always_inline void __six_lock_init(struct six_lock *lock, -+ const char *name, -+ struct lock_class_key *key) -+{ -+ atomic64_set(&lock->state.counter, 0); -+ raw_spin_lock_init(&lock->wait_lock); -+ INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_read]); -+ INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_intent]); -+#ifdef CONFIG_DEBUG_LOCK_ALLOC -+ debug_check_no_locks_freed((void *) lock, sizeof(*lock)); -+ lockdep_init_map(&lock->dep_map, name, key, 0); -+#endif -+} -+ -+#define six_lock_init(lock) \ -+do { \ -+ static struct lock_class_key __key; \ -+ \ -+ 
__six_lock_init((lock), #lock, &__key); \ -+} while (0) -+ -+#define __SIX_VAL(field, _v) (((union six_lock_state) { .field = _v }).v) -+ -+#define __SIX_LOCK(type) \ -+bool six_trylock_##type(struct six_lock *); \ -+bool six_relock_##type(struct six_lock *, u32); \ -+int six_lock_##type(struct six_lock *, six_lock_should_sleep_fn, void *);\ -+void six_unlock_##type(struct six_lock *); -+ -+__SIX_LOCK(read) -+__SIX_LOCK(intent) -+__SIX_LOCK(write) -+#undef __SIX_LOCK -+ -+#define SIX_LOCK_DISPATCH(type, fn, ...) \ -+ switch (type) { \ -+ case SIX_LOCK_read: \ -+ return fn##_read(__VA_ARGS__); \ -+ case SIX_LOCK_intent: \ -+ return fn##_intent(__VA_ARGS__); \ -+ case SIX_LOCK_write: \ -+ return fn##_write(__VA_ARGS__); \ -+ default: \ -+ BUG(); \ -+ } -+ -+static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type) -+{ -+ SIX_LOCK_DISPATCH(type, six_trylock, lock); -+} -+ -+static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type, -+ unsigned seq) -+{ -+ SIX_LOCK_DISPATCH(type, six_relock, lock, seq); -+} -+ -+static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type, -+ six_lock_should_sleep_fn should_sleep_fn, void *p) -+{ -+ SIX_LOCK_DISPATCH(type, six_lock, lock, should_sleep_fn, p); -+} -+ -+static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type) -+{ -+ SIX_LOCK_DISPATCH(type, six_unlock, lock); -+} -+ -+void six_lock_downgrade(struct six_lock *); -+bool six_lock_tryupgrade(struct six_lock *); -+bool six_trylock_convert(struct six_lock *, enum six_lock_type, -+ enum six_lock_type); -+ -+void six_lock_increment(struct six_lock *, enum six_lock_type); -+ -+void six_lock_wakeup_all(struct six_lock *); -+ -+#endif /* _LINUX_SIX_H */ -diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h -new file mode 100644 -index 000000000000..bafbccafae30 ---- /dev/null -+++ b/include/trace/events/bcachefs.h -@@ -0,0 +1,664 @@ -+/* SPDX-License-Identifier: 
GPL-2.0 */ -+#undef TRACE_SYSTEM -+#define TRACE_SYSTEM bcachefs -+ -+#if !defined(_TRACE_BCACHE_H) || defined(TRACE_HEADER_MULTI_READ) -+#define _TRACE_BCACHE_H -+ -+#include -+ -+DECLARE_EVENT_CLASS(bpos, -+ TP_PROTO(struct bpos *p), -+ TP_ARGS(p), -+ -+ TP_STRUCT__entry( -+ __field(u64, inode ) -+ __field(u64, offset ) -+ ), -+ -+ TP_fast_assign( -+ __entry->inode = p->inode; -+ __entry->offset = p->offset; -+ ), -+ -+ TP_printk("%llu:%llu", __entry->inode, __entry->offset) -+); -+ -+DECLARE_EVENT_CLASS(bkey, -+ TP_PROTO(const struct bkey *k), -+ TP_ARGS(k), -+ -+ TP_STRUCT__entry( -+ __field(u64, inode ) -+ __field(u64, offset ) -+ __field(u32, size ) -+ ), -+ -+ TP_fast_assign( -+ __entry->inode = k->p.inode; -+ __entry->offset = k->p.offset; -+ __entry->size = k->size; -+ ), -+ -+ TP_printk("%llu:%llu len %u", __entry->inode, -+ __entry->offset, __entry->size) -+); -+ -+DECLARE_EVENT_CLASS(bch_fs, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c), -+ -+ TP_STRUCT__entry( -+ __array(char, uuid, 16 ) -+ ), -+ -+ TP_fast_assign( -+ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); -+ ), -+ -+ TP_printk("%pU", __entry->uuid) -+); -+ -+DECLARE_EVENT_CLASS(bio, -+ TP_PROTO(struct bio *bio), -+ TP_ARGS(bio), -+ -+ TP_STRUCT__entry( -+ __field(dev_t, dev ) -+ __field(sector_t, sector ) -+ __field(unsigned int, nr_sector ) -+ __array(char, rwbs, 6 ) -+ ), -+ -+ TP_fast_assign( -+ __entry->dev = bio->bi_disk ? 
bio_dev(bio) : 0; -+ __entry->sector = bio->bi_iter.bi_sector; -+ __entry->nr_sector = bio->bi_iter.bi_size >> 9; -+ blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); -+ ), -+ -+ TP_printk("%d,%d %s %llu + %u", -+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, -+ (unsigned long long)__entry->sector, __entry->nr_sector) -+); -+ -+/* io.c: */ -+ -+DEFINE_EVENT(bio, read_split, -+ TP_PROTO(struct bio *bio), -+ TP_ARGS(bio) -+); -+ -+DEFINE_EVENT(bio, read_bounce, -+ TP_PROTO(struct bio *bio), -+ TP_ARGS(bio) -+); -+ -+DEFINE_EVENT(bio, read_retry, -+ TP_PROTO(struct bio *bio), -+ TP_ARGS(bio) -+); -+ -+DEFINE_EVENT(bio, promote, -+ TP_PROTO(struct bio *bio), -+ TP_ARGS(bio) -+); -+ -+/* Journal */ -+ -+DEFINE_EVENT(bch_fs, journal_full, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+DEFINE_EVENT(bch_fs, journal_entry_full, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+DEFINE_EVENT(bio, journal_write, -+ TP_PROTO(struct bio *bio), -+ TP_ARGS(bio) -+); -+ -+/* bset.c: */ -+ -+DEFINE_EVENT(bpos, bkey_pack_pos_fail, -+ TP_PROTO(struct bpos *p), -+ TP_ARGS(p) -+); -+ -+/* Btree */ -+ -+DECLARE_EVENT_CLASS(btree_node, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b), -+ -+ TP_STRUCT__entry( -+ __array(char, uuid, 16 ) -+ __field(u8, level ) -+ __field(u8, id ) -+ __field(u64, inode ) -+ __field(u64, offset ) -+ ), -+ -+ TP_fast_assign( -+ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); -+ __entry->level = b->c.level; -+ __entry->id = b->c.btree_id; -+ __entry->inode = b->key.k.p.inode; -+ __entry->offset = b->key.k.p.offset; -+ ), -+ -+ TP_printk("%pU %u id %u %llu:%llu", -+ __entry->uuid, __entry->level, __entry->id, -+ __entry->inode, __entry->offset) -+); -+ -+DEFINE_EVENT(btree_node, btree_read, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+TRACE_EVENT(btree_write, -+ TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors), -+ TP_ARGS(b, bytes, sectors), -+ -+ TP_STRUCT__entry( -+ 
__field(enum btree_node_type, type) -+ __field(unsigned, bytes ) -+ __field(unsigned, sectors ) -+ ), -+ -+ TP_fast_assign( -+ __entry->type = btree_node_type(b); -+ __entry->bytes = bytes; -+ __entry->sectors = sectors; -+ ), -+ -+ TP_printk("bkey type %u bytes %u sectors %u", -+ __entry->type , __entry->bytes, __entry->sectors) -+); -+ -+DEFINE_EVENT(btree_node, btree_node_alloc, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+DEFINE_EVENT(btree_node, btree_node_free, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+DEFINE_EVENT(btree_node, btree_node_reap, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+DECLARE_EVENT_CLASS(btree_node_cannibalize_lock, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c), -+ -+ TP_STRUCT__entry( -+ __array(char, uuid, 16 ) -+ ), -+ -+ TP_fast_assign( -+ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); -+ ), -+ -+ TP_printk("%pU", __entry->uuid) -+); -+ -+DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize_lock_fail, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize_lock, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+DEFINE_EVENT(bch_fs, btree_node_cannibalize_unlock, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+TRACE_EVENT(btree_reserve_get_fail, -+ TP_PROTO(struct bch_fs *c, size_t required, struct closure *cl), -+ TP_ARGS(c, required, cl), -+ -+ TP_STRUCT__entry( -+ __array(char, uuid, 16 ) -+ __field(size_t, required ) -+ __field(struct closure *, cl ) -+ ), -+ -+ TP_fast_assign( -+ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); -+ __entry->required = required; -+ __entry->cl = cl; -+ ), -+ -+ TP_printk("%pU required %zu by %p", __entry->uuid, -+ __entry->required, __entry->cl) -+); -+ -+TRACE_EVENT(btree_insert_key, -+ TP_PROTO(struct 
bch_fs *c, struct btree *b, struct bkey_i *k), -+ TP_ARGS(c, b, k), -+ -+ TP_STRUCT__entry( -+ __field(u8, id ) -+ __field(u64, inode ) -+ __field(u64, offset ) -+ __field(u32, size ) -+ ), -+ -+ TP_fast_assign( -+ __entry->id = b->c.btree_id; -+ __entry->inode = k->k.p.inode; -+ __entry->offset = k->k.p.offset; -+ __entry->size = k->k.size; -+ ), -+ -+ TP_printk("btree %u: %llu:%llu len %u", __entry->id, -+ __entry->inode, __entry->offset, __entry->size) -+); -+ -+DEFINE_EVENT(btree_node, btree_split, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+DEFINE_EVENT(btree_node, btree_compact, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+DEFINE_EVENT(btree_node, btree_merge, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+DEFINE_EVENT(btree_node, btree_set_root, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+/* Garbage collection */ -+ -+DEFINE_EVENT(btree_node, btree_gc_coalesce, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+TRACE_EVENT(btree_gc_coalesce_fail, -+ TP_PROTO(struct bch_fs *c, int reason), -+ TP_ARGS(c, reason), -+ -+ TP_STRUCT__entry( -+ __field(u8, reason ) -+ __array(char, uuid, 16 ) -+ ), -+ -+ TP_fast_assign( -+ __entry->reason = reason; -+ memcpy(__entry->uuid, c->disk_sb.sb->user_uuid.b, 16); -+ ), -+ -+ TP_printk("%pU: %u", __entry->uuid, __entry->reason) -+); -+ -+DEFINE_EVENT(btree_node, btree_gc_rewrite_node, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+DEFINE_EVENT(btree_node, btree_gc_rewrite_node_fail, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+DEFINE_EVENT(bch_fs, gc_start, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+DEFINE_EVENT(bch_fs, gc_end, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+DEFINE_EVENT(bch_fs, gc_coalesce_start, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+DEFINE_EVENT(bch_fs, gc_coalesce_end, 
-+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+DEFINE_EVENT(bch_fs, gc_cannot_inc_gens, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+/* Allocator */ -+ -+TRACE_EVENT(alloc_batch, -+ TP_PROTO(struct bch_dev *ca, size_t free, size_t total), -+ TP_ARGS(ca, free, total), -+ -+ TP_STRUCT__entry( -+ __array(char, uuid, 16 ) -+ __field(size_t, free ) -+ __field(size_t, total ) -+ ), -+ -+ TP_fast_assign( -+ memcpy(__entry->uuid, ca->uuid.b, 16); -+ __entry->free = free; -+ __entry->total = total; -+ ), -+ -+ TP_printk("%pU free %zu total %zu", -+ __entry->uuid, __entry->free, __entry->total) -+); -+ -+TRACE_EVENT(invalidate, -+ TP_PROTO(struct bch_dev *ca, u64 offset, unsigned sectors), -+ TP_ARGS(ca, offset, sectors), -+ -+ TP_STRUCT__entry( -+ __field(unsigned, sectors ) -+ __field(dev_t, dev ) -+ __field(__u64, offset ) -+ ), -+ -+ TP_fast_assign( -+ __entry->dev = ca->disk_sb.bdev->bd_dev; -+ __entry->offset = offset, -+ __entry->sectors = sectors; -+ ), -+ -+ TP_printk("invalidated %u sectors at %d,%d sector=%llu", -+ __entry->sectors, MAJOR(__entry->dev), -+ MINOR(__entry->dev), __entry->offset) -+); -+ -+DEFINE_EVENT(bch_fs, rescale_prios, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+DECLARE_EVENT_CLASS(bucket_alloc, -+ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), -+ TP_ARGS(ca, reserve), -+ -+ TP_STRUCT__entry( -+ __array(char, uuid, 16) -+ __field(enum alloc_reserve, reserve ) -+ ), -+ -+ TP_fast_assign( -+ memcpy(__entry->uuid, ca->uuid.b, 16); -+ __entry->reserve = reserve; -+ ), -+ -+ TP_printk("%pU reserve %d", __entry->uuid, __entry->reserve) -+); -+ -+DEFINE_EVENT(bucket_alloc, bucket_alloc, -+ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), -+ TP_ARGS(ca, reserve) -+); -+ -+DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, -+ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), -+ TP_ARGS(ca, reserve) -+); -+ -+DEFINE_EVENT(bucket_alloc, open_bucket_alloc_fail, -+ TP_PROTO(struct bch_dev *ca, enum 
alloc_reserve reserve), -+ TP_ARGS(ca, reserve) -+); -+ -+/* Moving IO */ -+ -+DEFINE_EVENT(bkey, move_extent, -+ TP_PROTO(const struct bkey *k), -+ TP_ARGS(k) -+); -+ -+DEFINE_EVENT(bkey, move_alloc_fail, -+ TP_PROTO(const struct bkey *k), -+ TP_ARGS(k) -+); -+ -+DEFINE_EVENT(bkey, move_race, -+ TP_PROTO(const struct bkey *k), -+ TP_ARGS(k) -+); -+ -+TRACE_EVENT(move_data, -+ TP_PROTO(struct bch_fs *c, u64 sectors_moved, -+ u64 keys_moved), -+ TP_ARGS(c, sectors_moved, keys_moved), -+ -+ TP_STRUCT__entry( -+ __array(char, uuid, 16 ) -+ __field(u64, sectors_moved ) -+ __field(u64, keys_moved ) -+ ), -+ -+ TP_fast_assign( -+ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); -+ __entry->sectors_moved = sectors_moved; -+ __entry->keys_moved = keys_moved; -+ ), -+ -+ TP_printk("%pU sectors_moved %llu keys_moved %llu", -+ __entry->uuid, __entry->sectors_moved, __entry->keys_moved) -+); -+ -+TRACE_EVENT(copygc, -+ TP_PROTO(struct bch_dev *ca, -+ u64 sectors_moved, u64 sectors_not_moved, -+ u64 buckets_moved, u64 buckets_not_moved), -+ TP_ARGS(ca, -+ sectors_moved, sectors_not_moved, -+ buckets_moved, buckets_not_moved), -+ -+ TP_STRUCT__entry( -+ __array(char, uuid, 16 ) -+ __field(u64, sectors_moved ) -+ __field(u64, sectors_not_moved ) -+ __field(u64, buckets_moved ) -+ __field(u64, buckets_not_moved ) -+ ), -+ -+ TP_fast_assign( -+ memcpy(__entry->uuid, ca->uuid.b, 16); -+ __entry->sectors_moved = sectors_moved; -+ __entry->sectors_not_moved = sectors_not_moved; -+ __entry->buckets_moved = buckets_moved; -+ __entry->buckets_not_moved = buckets_moved; -+ ), -+ -+ TP_printk("%pU sectors moved %llu remain %llu buckets moved %llu remain %llu", -+ __entry->uuid, -+ __entry->sectors_moved, __entry->sectors_not_moved, -+ __entry->buckets_moved, __entry->buckets_not_moved) -+); -+ -+TRACE_EVENT(transaction_restart_ip, -+ TP_PROTO(unsigned long caller, unsigned long ip), -+ TP_ARGS(caller, ip), -+ -+ TP_STRUCT__entry( -+ __field(unsigned long, caller ) -+ __field(unsigned long, ip 
) -+ ), -+ -+ TP_fast_assign( -+ __entry->caller = caller; -+ __entry->ip = ip; -+ ), -+ -+ TP_printk("%pF %pF", (void *) __entry->caller, (void *) __entry->ip) -+); -+ -+DECLARE_EVENT_CLASS(transaction_restart, -+ TP_PROTO(unsigned long ip), -+ TP_ARGS(ip), -+ -+ TP_STRUCT__entry( -+ __field(unsigned long, ip ) -+ ), -+ -+ TP_fast_assign( -+ __entry->ip = ip; -+ ), -+ -+ TP_printk("%pf", (void *) __entry->ip) -+); -+ -+DEFINE_EVENT(transaction_restart, trans_restart_btree_node_reused, -+ TP_PROTO(unsigned long ip), -+ TP_ARGS(ip) -+); -+ -+DEFINE_EVENT(transaction_restart, trans_restart_would_deadlock, -+ TP_PROTO(unsigned long ip), -+ TP_ARGS(ip) -+); -+ -+TRACE_EVENT(trans_restart_iters_realloced, -+ TP_PROTO(unsigned long ip, unsigned nr), -+ TP_ARGS(ip, nr), -+ -+ TP_STRUCT__entry( -+ __field(unsigned long, ip ) -+ __field(unsigned, nr ) -+ ), -+ -+ TP_fast_assign( -+ __entry->ip = ip; -+ __entry->nr = nr; -+ ), -+ -+ TP_printk("%pf nr %u", (void *) __entry->ip, __entry->nr) -+); -+ -+TRACE_EVENT(trans_restart_mem_realloced, -+ TP_PROTO(unsigned long ip, unsigned long bytes), -+ TP_ARGS(ip, bytes), -+ -+ TP_STRUCT__entry( -+ __field(unsigned long, ip ) -+ __field(unsigned long, bytes ) -+ ), -+ -+ TP_fast_assign( -+ __entry->ip = ip; -+ __entry->bytes = bytes; -+ ), -+ -+ TP_printk("%pf bytes %lu", (void *) __entry->ip, __entry->bytes) -+); -+ -+DEFINE_EVENT(transaction_restart, trans_restart_journal_res_get, -+ TP_PROTO(unsigned long ip), -+ TP_ARGS(ip) -+); -+ -+DEFINE_EVENT(transaction_restart, trans_restart_journal_preres_get, -+ TP_PROTO(unsigned long ip), -+ TP_ARGS(ip) -+); -+ -+DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas, -+ TP_PROTO(unsigned long ip), -+ TP_ARGS(ip) -+); -+ -+DEFINE_EVENT(transaction_restart, trans_restart_fault_inject, -+ TP_PROTO(unsigned long ip), -+ TP_ARGS(ip) -+); -+ -+DEFINE_EVENT(transaction_restart, trans_restart_btree_node_split, -+ TP_PROTO(unsigned long ip), -+ TP_ARGS(ip) -+); -+ 
-+DEFINE_EVENT(transaction_restart, trans_restart_mark, -+ TP_PROTO(unsigned long ip), -+ TP_ARGS(ip) -+); -+ -+DEFINE_EVENT(transaction_restart, trans_restart_upgrade, -+ TP_PROTO(unsigned long ip), -+ TP_ARGS(ip) -+); -+ -+DEFINE_EVENT(transaction_restart, trans_restart_iter_upgrade, -+ TP_PROTO(unsigned long ip), -+ TP_ARGS(ip) -+); -+ -+DEFINE_EVENT(transaction_restart, trans_restart_traverse, -+ TP_PROTO(unsigned long ip), -+ TP_ARGS(ip) -+); -+ -+DEFINE_EVENT(transaction_restart, trans_restart_atomic, -+ TP_PROTO(unsigned long ip), -+ TP_ARGS(ip) -+); -+ -+DECLARE_EVENT_CLASS(node_lock_fail, -+ TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), -+ TP_ARGS(level, iter_seq, node, node_seq), -+ -+ TP_STRUCT__entry( -+ __field(u32, level) -+ __field(u32, iter_seq) -+ __field(u32, node) -+ __field(u32, node_seq) -+ ), -+ -+ TP_fast_assign( -+ __entry->level = level; -+ __entry->iter_seq = iter_seq; -+ __entry->node = node; -+ __entry->node_seq = node_seq; -+ ), -+ -+ TP_printk("level %u iter seq %u node %u node seq %u", -+ __entry->level, __entry->iter_seq, -+ __entry->node, __entry->node_seq) -+); -+ -+DEFINE_EVENT(node_lock_fail, node_upgrade_fail, -+ TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), -+ TP_ARGS(level, iter_seq, node, node_seq) -+); -+ -+DEFINE_EVENT(node_lock_fail, node_relock_fail, -+ TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), -+ TP_ARGS(level, iter_seq, node, node_seq) -+); -+ -+#endif /* _TRACE_BCACHE_H */ -+ -+/* This part must be outside protection */ -+#include -diff --git a/init/init_task.c b/init/init_task.c -index bd403ed3e418..3035fffd976b 100644 ---- a/init/init_task.c -+++ b/init/init_task.c -@@ -76,6 +76,7 @@ struct task_struct init_task - .nr_cpus_allowed= NR_CPUS, - .mm = NULL, - .active_mm = &init_mm, -+ .faults_disabled_mapping = NULL, - .restart_block = { - .fn = do_no_restart_syscall, - }, -diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks -index 
3de8fd11873b..ab8aa082ce56 100644 ---- a/kernel/Kconfig.locks -+++ b/kernel/Kconfig.locks -@@ -259,3 +259,6 @@ config ARCH_HAS_MMIOWB - config MMIOWB - def_bool y if ARCH_HAS_MMIOWB - depends on SMP -+ -+config SIXLOCKS -+ bool -diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile -index 45452facff3b..6c8f7340c0a2 100644 ---- a/kernel/locking/Makefile -+++ b/kernel/locking/Makefile -@@ -29,3 +29,4 @@ obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o - obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o - obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o - obj-$(CONFIG_LOCK_EVENT_COUNTS) += lock_events.o -+obj-$(CONFIG_SIXLOCKS) += six.o -diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h -index baca699b94e9..4abb462d914d 100644 ---- a/kernel/locking/lockdep_internals.h -+++ b/kernel/locking/lockdep_internals.h -@@ -96,7 +96,7 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = - #else - #define MAX_LOCKDEP_ENTRIES 32768UL - --#define MAX_LOCKDEP_CHAINS_BITS 16 -+#define MAX_LOCKDEP_CHAINS_BITS 18 - - /* - * Stack-trace: tightly packed array of stack backtrace -@@ -114,7 +114,7 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = - - #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) - --#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) -+#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*10) - - extern struct list_head all_lock_classes; - extern struct lock_chain lock_chains[]; -diff --git a/kernel/locking/six.c b/kernel/locking/six.c -new file mode 100644 -index 000000000000..49d46ed2e18e ---- /dev/null -+++ b/kernel/locking/six.c -@@ -0,0 +1,553 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#ifdef DEBUG -+#define EBUG_ON(cond) BUG_ON(cond) -+#else -+#define EBUG_ON(cond) do {} while (0) -+#endif -+ -+#define six_acquire(l, t) lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_) -+#define six_release(l) lock_release(l, _RET_IP_) -+ 
-+struct six_lock_vals { -+ /* Value we add to the lock in order to take the lock: */ -+ u64 lock_val; -+ -+ /* If the lock has this value (used as a mask), taking the lock fails: */ -+ u64 lock_fail; -+ -+ /* Value we add to the lock in order to release the lock: */ -+ u64 unlock_val; -+ -+ /* Mask that indicates lock is held for this type: */ -+ u64 held_mask; -+ -+ /* Waitlist we wakeup when releasing the lock: */ -+ enum six_lock_type unlock_wakeup; -+}; -+ -+#define __SIX_LOCK_HELD_read __SIX_VAL(read_lock, ~0) -+#define __SIX_LOCK_HELD_intent __SIX_VAL(intent_lock, ~0) -+#define __SIX_LOCK_HELD_write __SIX_VAL(seq, 1) -+ -+#define LOCK_VALS { \ -+ [SIX_LOCK_read] = { \ -+ .lock_val = __SIX_VAL(read_lock, 1), \ -+ .lock_fail = __SIX_LOCK_HELD_write, \ -+ .unlock_val = -__SIX_VAL(read_lock, 1), \ -+ .held_mask = __SIX_LOCK_HELD_read, \ -+ .unlock_wakeup = SIX_LOCK_write, \ -+ }, \ -+ [SIX_LOCK_intent] = { \ -+ .lock_val = __SIX_VAL(intent_lock, 1), \ -+ .lock_fail = __SIX_LOCK_HELD_intent, \ -+ .unlock_val = -__SIX_VAL(intent_lock, 1), \ -+ .held_mask = __SIX_LOCK_HELD_intent, \ -+ .unlock_wakeup = SIX_LOCK_intent, \ -+ }, \ -+ [SIX_LOCK_write] = { \ -+ .lock_val = __SIX_VAL(seq, 1), \ -+ .lock_fail = __SIX_LOCK_HELD_read, \ -+ .unlock_val = __SIX_VAL(seq, 1), \ -+ .held_mask = __SIX_LOCK_HELD_write, \ -+ .unlock_wakeup = SIX_LOCK_read, \ -+ }, \ -+} -+ -+static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type, -+ union six_lock_state old) -+{ -+ if (type != SIX_LOCK_intent) -+ return; -+ -+ if (!old.intent_lock) { -+ EBUG_ON(lock->owner); -+ lock->owner = current; -+ } else { -+ EBUG_ON(lock->owner != current); -+ } -+} -+ -+static __always_inline bool do_six_trylock_type(struct six_lock *lock, -+ enum six_lock_type type) -+{ -+ const struct six_lock_vals l[] = LOCK_VALS; -+ union six_lock_state old; -+ u64 v = READ_ONCE(lock->state.v); -+ -+ EBUG_ON(type == SIX_LOCK_write && lock->owner != current); -+ -+ do { -+ old.v = v; -+ -+ 
EBUG_ON(type == SIX_LOCK_write && -+ ((old.v & __SIX_LOCK_HELD_write) || -+ !(old.v & __SIX_LOCK_HELD_intent))); -+ -+ if (old.v & l[type].lock_fail) -+ return false; -+ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, -+ old.v, -+ old.v + l[type].lock_val)) != old.v); -+ -+ six_set_owner(lock, type, old); -+ return true; -+} -+ -+__always_inline __flatten -+static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type) -+{ -+ if (!do_six_trylock_type(lock, type)) -+ return false; -+ -+ if (type != SIX_LOCK_write) -+ six_acquire(&lock->dep_map, 1); -+ return true; -+} -+ -+__always_inline __flatten -+static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type, -+ unsigned seq) -+{ -+ const struct six_lock_vals l[] = LOCK_VALS; -+ union six_lock_state old; -+ u64 v = READ_ONCE(lock->state.v); -+ -+ do { -+ old.v = v; -+ -+ if (old.seq != seq || old.v & l[type].lock_fail) -+ return false; -+ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, -+ old.v, -+ old.v + l[type].lock_val)) != old.v); -+ -+ six_set_owner(lock, type, old); -+ if (type != SIX_LOCK_write) -+ six_acquire(&lock->dep_map, 1); -+ return true; -+} -+ -+struct six_lock_waiter { -+ struct list_head list; -+ struct task_struct *task; -+}; -+ -+/* This is probably up there with the more evil things I've done */ -+#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l)) -+ -+#ifdef CONFIG_LOCK_SPIN_ON_OWNER -+ -+static inline int six_can_spin_on_owner(struct six_lock *lock) -+{ -+ struct task_struct *owner; -+ int retval = 1; -+ -+ if (need_resched()) -+ return 0; -+ -+ rcu_read_lock(); -+ owner = READ_ONCE(lock->owner); -+ if (owner) -+ retval = owner->on_cpu; -+ rcu_read_unlock(); -+ /* -+ * if lock->owner is not set, the mutex owner may have just acquired -+ * it and not set the owner yet or the mutex has been released. 
-+ */ -+ return retval; -+} -+ -+static inline bool six_spin_on_owner(struct six_lock *lock, -+ struct task_struct *owner) -+{ -+ bool ret = true; -+ -+ rcu_read_lock(); -+ while (lock->owner == owner) { -+ /* -+ * Ensure we emit the owner->on_cpu, dereference _after_ -+ * checking lock->owner still matches owner. If that fails, -+ * owner might point to freed memory. If it still matches, -+ * the rcu_read_lock() ensures the memory stays valid. -+ */ -+ barrier(); -+ -+ if (!owner->on_cpu || need_resched()) { -+ ret = false; -+ break; -+ } -+ -+ cpu_relax(); -+ } -+ rcu_read_unlock(); -+ -+ return ret; -+} -+ -+static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) -+{ -+ struct task_struct *task = current; -+ -+ if (type == SIX_LOCK_write) -+ return false; -+ -+ preempt_disable(); -+ if (!six_can_spin_on_owner(lock)) -+ goto fail; -+ -+ if (!osq_lock(&lock->osq)) -+ goto fail; -+ -+ while (1) { -+ struct task_struct *owner; -+ -+ /* -+ * If there's an owner, wait for it to either -+ * release the lock or go to sleep. -+ */ -+ owner = READ_ONCE(lock->owner); -+ if (owner && !six_spin_on_owner(lock, owner)) -+ break; -+ -+ if (do_six_trylock_type(lock, type)) { -+ osq_unlock(&lock->osq); -+ preempt_enable(); -+ return true; -+ } -+ -+ /* -+ * When there's no owner, we might have preempted between the -+ * owner acquiring the lock and setting the owner field. If -+ * we're an RT task that will live-lock because we won't let -+ * the owner complete. -+ */ -+ if (!owner && (need_resched() || rt_task(task))) -+ break; -+ -+ /* -+ * The cpu_relax() call is a compiler barrier which forces -+ * everything in this loop to be re-loaded. We don't need -+ * memory barriers as we'll eventually observe the right -+ * values at the cost of a few extra spins. 
-+ */ -+ cpu_relax(); -+ } -+ -+ osq_unlock(&lock->osq); -+fail: -+ preempt_enable(); -+ -+ /* -+ * If we fell out of the spin path because of need_resched(), -+ * reschedule now, before we try-lock again. This avoids getting -+ * scheduled out right after we obtained the lock. -+ */ -+ if (need_resched()) -+ schedule(); -+ -+ return false; -+} -+ -+#else /* CONFIG_LOCK_SPIN_ON_OWNER */ -+ -+static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) -+{ -+ return false; -+} -+ -+#endif -+ -+noinline -+static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type, -+ six_lock_should_sleep_fn should_sleep_fn, void *p) -+{ -+ const struct six_lock_vals l[] = LOCK_VALS; -+ union six_lock_state old, new; -+ struct six_lock_waiter wait; -+ int ret = 0; -+ u64 v; -+ -+ ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0; -+ if (ret) -+ return ret; -+ -+ if (six_optimistic_spin(lock, type)) -+ return 0; -+ -+ lock_contended(&lock->dep_map, _RET_IP_); -+ -+ INIT_LIST_HEAD(&wait.list); -+ wait.task = current; -+ -+ while (1) { -+ set_current_state(TASK_UNINTERRUPTIBLE); -+ if (type == SIX_LOCK_write) -+ EBUG_ON(lock->owner != current); -+ else if (list_empty_careful(&wait.list)) { -+ raw_spin_lock(&lock->wait_lock); -+ list_add_tail(&wait.list, &lock->wait_list[type]); -+ raw_spin_unlock(&lock->wait_lock); -+ } -+ -+ ret = should_sleep_fn ? 
should_sleep_fn(lock, p) : 0; -+ if (ret) -+ break; -+ -+ v = READ_ONCE(lock->state.v); -+ do { -+ new.v = old.v = v; -+ -+ if (!(old.v & l[type].lock_fail)) -+ new.v += l[type].lock_val; -+ else if (!(new.waiters & (1 << type))) -+ new.waiters |= 1 << type; -+ else -+ break; /* waiting bit already set */ -+ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, -+ old.v, new.v)) != old.v); -+ -+ if (!(old.v & l[type].lock_fail)) -+ break; -+ -+ schedule(); -+ } -+ -+ if (!ret) -+ six_set_owner(lock, type, old); -+ -+ __set_current_state(TASK_RUNNING); -+ -+ if (!list_empty_careful(&wait.list)) { -+ raw_spin_lock(&lock->wait_lock); -+ list_del_init(&wait.list); -+ raw_spin_unlock(&lock->wait_lock); -+ } -+ -+ return ret; -+} -+ -+__always_inline -+static int __six_lock_type(struct six_lock *lock, enum six_lock_type type, -+ six_lock_should_sleep_fn should_sleep_fn, void *p) -+{ -+ int ret; -+ -+ if (type != SIX_LOCK_write) -+ six_acquire(&lock->dep_map, 0); -+ -+ ret = do_six_trylock_type(lock, type) ? 
0 -+ : __six_lock_type_slowpath(lock, type, should_sleep_fn, p); -+ -+ if (ret && type != SIX_LOCK_write) -+ six_release(&lock->dep_map); -+ if (!ret) -+ lock_acquired(&lock->dep_map, _RET_IP_); -+ -+ return ret; -+} -+ -+static inline void six_lock_wakeup(struct six_lock *lock, -+ union six_lock_state state, -+ unsigned waitlist_id) -+{ -+ struct list_head *wait_list = &lock->wait_list[waitlist_id]; -+ struct six_lock_waiter *w, *next; -+ -+ if (waitlist_id == SIX_LOCK_write && state.read_lock) -+ return; -+ -+ if (!(state.waiters & (1 << waitlist_id))) -+ return; -+ -+ clear_bit(waitlist_bitnr(waitlist_id), -+ (unsigned long *) &lock->state.v); -+ -+ if (waitlist_id == SIX_LOCK_write) { -+ struct task_struct *p = READ_ONCE(lock->owner); -+ -+ if (p) -+ wake_up_process(p); -+ return; -+ } -+ -+ raw_spin_lock(&lock->wait_lock); -+ -+ list_for_each_entry_safe(w, next, wait_list, list) { -+ list_del_init(&w->list); -+ -+ if (wake_up_process(w->task) && -+ waitlist_id != SIX_LOCK_read) { -+ if (!list_empty(wait_list)) -+ set_bit(waitlist_bitnr(waitlist_id), -+ (unsigned long *) &lock->state.v); -+ break; -+ } -+ } -+ -+ raw_spin_unlock(&lock->wait_lock); -+} -+ -+__always_inline __flatten -+static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type) -+{ -+ const struct six_lock_vals l[] = LOCK_VALS; -+ union six_lock_state state; -+ -+ EBUG_ON(!(lock->state.v & l[type].held_mask)); -+ EBUG_ON(type == SIX_LOCK_write && -+ !(lock->state.v & __SIX_LOCK_HELD_intent)); -+ -+ if (type != SIX_LOCK_write) -+ six_release(&lock->dep_map); -+ -+ if (type == SIX_LOCK_intent) { -+ EBUG_ON(lock->owner != current); -+ -+ if (lock->intent_lock_recurse) { -+ --lock->intent_lock_recurse; -+ return; -+ } -+ -+ lock->owner = NULL; -+ } -+ -+ state.v = atomic64_add_return_release(l[type].unlock_val, -+ &lock->state.counter); -+ six_lock_wakeup(lock, state, l[type].unlock_wakeup); -+} -+ -+#define __SIX_LOCK(type) \ -+bool six_trylock_##type(struct six_lock *lock) \ -+{ \ 
-+ return __six_trylock_type(lock, SIX_LOCK_##type); \ -+} \ -+EXPORT_SYMBOL_GPL(six_trylock_##type); \ -+ \ -+bool six_relock_##type(struct six_lock *lock, u32 seq) \ -+{ \ -+ return __six_relock_type(lock, SIX_LOCK_##type, seq); \ -+} \ -+EXPORT_SYMBOL_GPL(six_relock_##type); \ -+ \ -+int six_lock_##type(struct six_lock *lock, \ -+ six_lock_should_sleep_fn should_sleep_fn, void *p) \ -+{ \ -+ return __six_lock_type(lock, SIX_LOCK_##type, should_sleep_fn, p);\ -+} \ -+EXPORT_SYMBOL_GPL(six_lock_##type); \ -+ \ -+void six_unlock_##type(struct six_lock *lock) \ -+{ \ -+ __six_unlock_type(lock, SIX_LOCK_##type); \ -+} \ -+EXPORT_SYMBOL_GPL(six_unlock_##type); -+ -+__SIX_LOCK(read) -+__SIX_LOCK(intent) -+__SIX_LOCK(write) -+ -+#undef __SIX_LOCK -+ -+/* Convert from intent to read: */ -+void six_lock_downgrade(struct six_lock *lock) -+{ -+ six_lock_increment(lock, SIX_LOCK_read); -+ six_unlock_intent(lock); -+} -+EXPORT_SYMBOL_GPL(six_lock_downgrade); -+ -+bool six_lock_tryupgrade(struct six_lock *lock) -+{ -+ const struct six_lock_vals l[] = LOCK_VALS; -+ union six_lock_state old, new; -+ u64 v = READ_ONCE(lock->state.v); -+ -+ do { -+ new.v = old.v = v; -+ -+ EBUG_ON(!(old.v & l[SIX_LOCK_read].held_mask)); -+ -+ new.v += l[SIX_LOCK_read].unlock_val; -+ -+ if (new.v & l[SIX_LOCK_intent].lock_fail) -+ return false; -+ -+ new.v += l[SIX_LOCK_intent].lock_val; -+ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, -+ old.v, new.v)) != old.v); -+ -+ six_set_owner(lock, SIX_LOCK_intent, old); -+ six_lock_wakeup(lock, new, l[SIX_LOCK_read].unlock_wakeup); -+ -+ return true; -+} -+EXPORT_SYMBOL_GPL(six_lock_tryupgrade); -+ -+bool six_trylock_convert(struct six_lock *lock, -+ enum six_lock_type from, -+ enum six_lock_type to) -+{ -+ EBUG_ON(to == SIX_LOCK_write || from == SIX_LOCK_write); -+ -+ if (to == from) -+ return true; -+ -+ if (to == SIX_LOCK_read) { -+ six_lock_downgrade(lock); -+ return true; -+ } else { -+ return six_lock_tryupgrade(lock); -+ } -+} 
-+EXPORT_SYMBOL_GPL(six_trylock_convert); -+ -+/* -+ * Increment read/intent lock count, assuming we already have it read or intent -+ * locked: -+ */ -+void six_lock_increment(struct six_lock *lock, enum six_lock_type type) -+{ -+ const struct six_lock_vals l[] = LOCK_VALS; -+ -+ EBUG_ON(type == SIX_LOCK_write); -+ six_acquire(&lock->dep_map, 0); -+ -+ /* XXX: assert already locked, and that we don't overflow: */ -+ -+ switch (type) { -+ case SIX_LOCK_read: -+ atomic64_add(l[type].lock_val, &lock->state.counter); -+ break; -+ case SIX_LOCK_intent: -+ lock->intent_lock_recurse++; -+ break; -+ case SIX_LOCK_write: -+ BUG(); -+ break; -+ } -+} -+EXPORT_SYMBOL_GPL(six_lock_increment); -+ -+void six_lock_wakeup_all(struct six_lock *lock) -+{ -+ struct six_lock_waiter *w; -+ -+ raw_spin_lock(&lock->wait_lock); -+ -+ list_for_each_entry(w, &lock->wait_list[0], list) -+ wake_up_process(w->task); -+ list_for_each_entry(w, &lock->wait_list[1], list) -+ wake_up_process(w->task); -+ -+ raw_spin_unlock(&lock->wait_lock); -+} -+EXPORT_SYMBOL_GPL(six_lock_wakeup_all); -diff --git a/lib/Kconfig b/lib/Kconfig -index 5d53f9609c25..a7024d19e000 100644 ---- a/lib/Kconfig -+++ b/lib/Kconfig -@@ -451,6 +451,9 @@ config ASSOCIATIVE_ARRAY - - for more information. - -+config CLOSURES -+ bool -+ - config HAS_IOMEM - bool - depends on !NO_IOMEM -diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug -index 21d9c5f6e7ec..aa82ecff7123 100644 ---- a/lib/Kconfig.debug -+++ b/lib/Kconfig.debug -@@ -1411,6 +1411,15 @@ config DEBUG_CREDENTIALS - - source "kernel/rcu/Kconfig.debug" - -+config DEBUG_CLOSURES -+ bool "Debug closures (bcache async widgits)" -+ depends on CLOSURES -+ select DEBUG_FS -+ help -+ Keeps all active closures in a linked list and provides a debugfs -+ interface to list them, which makes it possible to see asynchronous -+ operations that get stuck. 
-+ - config DEBUG_WQ_FORCE_RR_CPU - bool "Force round-robin CPU selection for unbound work items" - depends on DEBUG_KERNEL -@@ -1721,6 +1730,11 @@ config FAULT_INJECTION_STACKTRACE_FILTER - help - Provide stacktrace filter for fault-injection capabilities - -+config DYNAMIC_FAULT -+ bool "Enable dynamic fault support" -+ default n -+ depends on DEBUG_FS -+ - config ARCH_HAS_KCOV - bool - help -diff --git a/lib/Makefile b/lib/Makefile -index 685aee60de1d..74a60979c41c 100644 ---- a/lib/Makefile -+++ b/lib/Makefile -@@ -189,6 +189,8 @@ obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o - obj-$(CONFIG_DYNAMIC_DEBUG) += dynamic_debug.o - obj-$(CONFIG_SYMBOLIC_ERRNAME) += errname.o - -+obj-$(CONFIG_DYNAMIC_FAULT) += dynamic_fault.o -+ - obj-$(CONFIG_NLATTR) += nlattr.o - - obj-$(CONFIG_LRU_CACHE) += lru_cache.o -@@ -201,6 +203,8 @@ obj-$(CONFIG_ATOMIC64_SELFTEST) += atomic64_test.o - - obj-$(CONFIG_CPU_RMAP) += cpu_rmap.o - -+obj-$(CONFIG_CLOSURES) += closure.o -+ - obj-$(CONFIG_DQL) += dynamic_queue_limits.o - - obj-$(CONFIG_GLOB) += glob.o -diff --git a/lib/closure.c b/lib/closure.c -new file mode 100644 -index 000000000000..3e6366c26209 ---- /dev/null -+++ b/lib/closure.c -@@ -0,0 +1,214 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Asynchronous refcounty things -+ * -+ * Copyright 2010, 2011 Kent Overstreet -+ * Copyright 2012 Google, Inc. 
-+ */ -+ -+#include -+#include -+#include -+#include -+#include -+ -+static inline void closure_put_after_sub(struct closure *cl, int flags) -+{ -+ int r = flags & CLOSURE_REMAINING_MASK; -+ -+ BUG_ON(flags & CLOSURE_GUARD_MASK); -+ BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); -+ -+ if (!r) { -+ if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { -+ atomic_set(&cl->remaining, -+ CLOSURE_REMAINING_INITIALIZER); -+ closure_queue(cl); -+ } else { -+ struct closure *parent = cl->parent; -+ closure_fn *destructor = cl->fn; -+ -+ closure_debug_destroy(cl); -+ -+ if (destructor) -+ destructor(cl); -+ -+ if (parent) -+ closure_put(parent); -+ } -+ } -+} -+ -+/* For clearing flags with the same atomic op as a put */ -+void closure_sub(struct closure *cl, int v) -+{ -+ closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); -+} -+EXPORT_SYMBOL(closure_sub); -+ -+/* -+ * closure_put - decrement a closure's refcount -+ */ -+void closure_put(struct closure *cl) -+{ -+ closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); -+} -+EXPORT_SYMBOL(closure_put); -+ -+/* -+ * closure_wake_up - wake up all closures on a wait list, without memory barrier -+ */ -+void __closure_wake_up(struct closure_waitlist *wait_list) -+{ -+ struct llist_node *list; -+ struct closure *cl, *t; -+ struct llist_node *reverse = NULL; -+ -+ list = llist_del_all(&wait_list->list); -+ -+ /* We first reverse the list to preserve FIFO ordering and fairness */ -+ reverse = llist_reverse_order(list); -+ -+ /* Then do the wakeups */ -+ llist_for_each_entry_safe(cl, t, reverse, list) { -+ closure_set_waiting(cl, 0); -+ closure_sub(cl, CLOSURE_WAITING + 1); -+ } -+} -+EXPORT_SYMBOL(__closure_wake_up); -+ -+/** -+ * closure_wait - add a closure to a waitlist -+ * @waitlist: will own a ref on @cl, which will be released when -+ * closure_wake_up() is called on @waitlist. -+ * @cl: closure pointer. 
-+ * -+ */ -+bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) -+{ -+ if (atomic_read(&cl->remaining) & CLOSURE_WAITING) -+ return false; -+ -+ closure_set_waiting(cl, _RET_IP_); -+ atomic_add(CLOSURE_WAITING + 1, &cl->remaining); -+ llist_add(&cl->list, &waitlist->list); -+ -+ return true; -+} -+EXPORT_SYMBOL(closure_wait); -+ -+struct closure_syncer { -+ struct task_struct *task; -+ int done; -+}; -+ -+static void closure_sync_fn(struct closure *cl) -+{ -+ struct closure_syncer *s = cl->s; -+ struct task_struct *p; -+ -+ rcu_read_lock(); -+ p = READ_ONCE(s->task); -+ s->done = 1; -+ wake_up_process(p); -+ rcu_read_unlock(); -+} -+ -+void __sched __closure_sync(struct closure *cl) -+{ -+ struct closure_syncer s = { .task = current }; -+ -+ cl->s = &s; -+ continue_at(cl, closure_sync_fn, NULL); -+ -+ while (1) { -+ set_current_state(TASK_UNINTERRUPTIBLE); -+ if (s.done) -+ break; -+ schedule(); -+ } -+ -+ __set_current_state(TASK_RUNNING); -+} -+EXPORT_SYMBOL(__closure_sync); -+ -+#ifdef CONFIG_DEBUG_CLOSURES -+ -+static LIST_HEAD(closure_list); -+static DEFINE_SPINLOCK(closure_list_lock); -+ -+void closure_debug_create(struct closure *cl) -+{ -+ unsigned long flags; -+ -+ BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE); -+ cl->magic = CLOSURE_MAGIC_ALIVE; -+ -+ spin_lock_irqsave(&closure_list_lock, flags); -+ list_add(&cl->all, &closure_list); -+ spin_unlock_irqrestore(&closure_list_lock, flags); -+} -+EXPORT_SYMBOL(closure_debug_create); -+ -+void closure_debug_destroy(struct closure *cl) -+{ -+ unsigned long flags; -+ -+ BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE); -+ cl->magic = CLOSURE_MAGIC_DEAD; -+ -+ spin_lock_irqsave(&closure_list_lock, flags); -+ list_del(&cl->all); -+ spin_unlock_irqrestore(&closure_list_lock, flags); -+} -+EXPORT_SYMBOL(closure_debug_destroy); -+ -+static int debug_seq_show(struct seq_file *f, void *data) -+{ -+ struct closure *cl; -+ -+ spin_lock_irq(&closure_list_lock); -+ -+ list_for_each_entry(cl, &closure_list, all) { 
-+ int r = atomic_read(&cl->remaining); -+ -+ seq_printf(f, "%p: %pS -> %pS p %p r %i ", -+ cl, (void *) cl->ip, cl->fn, cl->parent, -+ r & CLOSURE_REMAINING_MASK); -+ -+ seq_printf(f, "%s%s\n", -+ test_bit(WORK_STRUCT_PENDING_BIT, -+ work_data_bits(&cl->work)) ? "Q" : "", -+ r & CLOSURE_RUNNING ? "R" : ""); -+ -+ if (r & CLOSURE_WAITING) -+ seq_printf(f, " W %pS\n", -+ (void *) cl->waiting_on); -+ -+ seq_puts(f, "\n"); -+ } -+ -+ spin_unlock_irq(&closure_list_lock); -+ return 0; -+} -+ -+static int debug_seq_open(struct inode *inode, struct file *file) -+{ -+ return single_open(file, debug_seq_show, NULL); -+} -+ -+static const struct file_operations debug_ops = { -+ .owner = THIS_MODULE, -+ .open = debug_seq_open, -+ .read = seq_read, -+ .release = single_release -+}; -+ -+static int __init closure_debug_init(void) -+{ -+ debugfs_create_file("closures", 0400, NULL, NULL, &debug_ops); -+ return 0; -+} -+late_initcall(closure_debug_init) -+ -+#endif -diff --git a/lib/dynamic_fault.c b/lib/dynamic_fault.c -new file mode 100644 -index 000000000000..75fc9a1b4bce ---- /dev/null -+++ b/lib/dynamic_fault.c -@@ -0,0 +1,760 @@ -+/* -+ * lib/dynamic_fault.c -+ * -+ * make dynamic_fault() calls runtime configurable based upon their -+ * source module. -+ * -+ * Copyright (C) 2011 Adam Berkan -+ * Based on dynamic_debug.c: -+ * Copyright (C) 2008 Jason Baron -+ * By Greg Banks -+ * Copyright (c) 2008 Silicon Graphics Inc. All Rights Reserved. 
-+ * -+ */ -+ -+#define pr_fmt(fmt) "dfault: " fmt "\n" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#undef kzalloc -+ -+extern struct _dfault __start___faults[]; -+extern struct _dfault __stop___faults[]; -+ -+struct dfault_table { -+ struct list_head link; -+ char *mod_name; -+ unsigned int num_dfaults; -+ struct _dfault *dfaults; -+}; -+ -+struct dfault_query { -+ const char *filename; -+ const char *module; -+ const char *function; -+ const char *class; -+ unsigned int first_line, last_line; -+ unsigned int first_index, last_index; -+ -+ unsigned match_line:1; -+ unsigned match_index:1; -+ -+ unsigned set_enabled:1; -+ unsigned enabled:2; -+ -+ unsigned set_frequency:1; -+ unsigned frequency; -+}; -+ -+struct dfault_iter { -+ struct dfault_table *table; -+ unsigned int idx; -+}; -+ -+static DEFINE_MUTEX(dfault_lock); -+static LIST_HEAD(dfault_tables); -+ -+bool __dynamic_fault_enabled(struct _dfault *df) -+{ -+ union dfault_state old, new; -+ unsigned v = df->state.v; -+ bool ret; -+ -+ do { -+ old.v = new.v = v; -+ -+ if (new.enabled == DFAULT_DISABLED) -+ return false; -+ -+ ret = df->frequency -+ ? ++new.count >= df->frequency -+ : true; -+ if (ret) -+ new.count = 0; -+ if (ret && new.enabled == DFAULT_ONESHOT) -+ new.enabled = DFAULT_DISABLED; -+ } while ((v = cmpxchg(&df->state.v, old.v, new.v)) != old.v); -+ -+ if (ret) -+ pr_debug("returned true for %s:%u", df->filename, df->line); -+ -+ return ret; -+} -+EXPORT_SYMBOL(__dynamic_fault_enabled); -+ -+/* Return the last part of a pathname */ -+static inline const char *basename(const char *path) -+{ -+ const char *tail = strrchr(path, '/'); -+ -+ return tail ? 
tail + 1 : path; -+} -+ -+/* format a string into buf[] which describes the _dfault's flags */ -+static char *dfault_describe_flags(struct _dfault *df, char *buf, size_t buflen) -+{ -+ switch (df->state.enabled) { -+ case DFAULT_DISABLED: -+ strlcpy(buf, "disabled", buflen); -+ break; -+ case DFAULT_ENABLED: -+ strlcpy(buf, "enabled", buflen); -+ break; -+ case DFAULT_ONESHOT: -+ strlcpy(buf, "oneshot", buflen); -+ break; -+ default: -+ BUG(); -+ } -+ -+ return buf; -+} -+ -+/* -+ * must be called with dfault_lock held -+ */ -+ -+/* -+ * Search the tables for _dfault's which match the given -+ * `query' and apply the `flags' and `mask' to them. Tells -+ * the user which dfault's were changed, or whether none -+ * were matched. -+ */ -+static int dfault_change(const struct dfault_query *query) -+{ -+ struct dfault_table *dt; -+ unsigned int nfound = 0; -+ unsigned i, index = 0; -+ char flagbuf[16]; -+ -+ /* search for matching dfaults */ -+ mutex_lock(&dfault_lock); -+ list_for_each_entry(dt, &dfault_tables, link) { -+ -+ /* match against the module name */ -+ if (query->module != NULL && -+ strcmp(query->module, dt->mod_name)) -+ continue; -+ -+ for (i = 0 ; i < dt->num_dfaults ; i++) { -+ struct _dfault *df = &dt->dfaults[i]; -+ -+ /* match against the source filename */ -+ if (query->filename != NULL && -+ strcmp(query->filename, df->filename) && -+ strcmp(query->filename, basename(df->filename))) -+ continue; -+ -+ /* match against the function */ -+ if (query->function != NULL && -+ strcmp(query->function, df->function)) -+ continue; -+ -+ /* match against the class */ -+ if (query->class) { -+ size_t len = strlen(query->class); -+ -+ if (strncmp(query->class, df->class, len)) -+ continue; -+ -+ if (df->class[len] && df->class[len] != ':') -+ continue; -+ } -+ -+ /* match against the line number range */ -+ if (query->match_line && -+ (df->line < query->first_line || -+ df->line > query->last_line)) -+ continue; -+ -+ /* match against the fault index */ -+ if 
(query->match_index && -+ (index < query->first_index || -+ index > query->last_index)) { -+ index++; -+ continue; -+ } -+ -+ if (query->set_enabled && -+ query->enabled != df->state.enabled) { -+ if (query->enabled != DFAULT_DISABLED) -+ static_key_slow_inc(&df->enabled); -+ else if (df->state.enabled != DFAULT_DISABLED) -+ static_key_slow_dec(&df->enabled); -+ -+ df->state.enabled = query->enabled; -+ } -+ -+ if (query->set_frequency) -+ df->frequency = query->frequency; -+ -+ pr_debug("changed %s:%d [%s]%s #%d %s", -+ df->filename, df->line, dt->mod_name, -+ df->function, index, -+ dfault_describe_flags(df, flagbuf, -+ sizeof(flagbuf))); -+ -+ index++; -+ nfound++; -+ } -+ } -+ mutex_unlock(&dfault_lock); -+ -+ pr_debug("dfault: %u matches", nfound); -+ -+ return nfound ? 0 : -ENOENT; -+} -+ -+/* -+ * Split the buffer `buf' into space-separated words. -+ * Handles simple " and ' quoting, i.e. without nested, -+ * embedded or escaped \". Return the number of words -+ * or <0 on error. -+ */ -+static int dfault_tokenize(char *buf, char *words[], int maxwords) -+{ -+ int nwords = 0; -+ -+ while (*buf) { -+ char *end; -+ -+ /* Skip leading whitespace */ -+ buf = skip_spaces(buf); -+ if (!*buf) -+ break; /* oh, it was trailing whitespace */ -+ -+ /* Run `end' over a word, either whitespace separated or quoted -+ */ -+ if (*buf == '"' || *buf == '\'') { -+ int quote = *buf++; -+ -+ for (end = buf ; *end && *end != quote ; end++) -+ ; -+ if (!*end) -+ return -EINVAL; /* unclosed quote */ -+ } else { -+ for (end = buf ; *end && !isspace(*end) ; end++) -+ ; -+ BUG_ON(end == buf); -+ } -+ /* Here `buf' is the start of the word, `end' is one past the -+ * end -+ */ -+ -+ if (nwords == maxwords) -+ return -EINVAL; /* ran out of words[] before bytes */ -+ if (*end) -+ *end++ = '\0'; /* terminate the word */ -+ words[nwords++] = buf; -+ buf = end; -+ } -+ -+ return nwords; -+} -+ -+/* -+ * Parse a range. 
-+ */ -+static inline int parse_range(char *str, -+ unsigned int *first, -+ unsigned int *last) -+{ -+ char *first_str = str; -+ char *last_str = strchr(first_str, '-'); -+ -+ if (last_str) -+ *last_str++ = '\0'; -+ -+ if (kstrtouint(first_str, 10, first)) -+ return -EINVAL; -+ -+ if (!last_str) -+ *last = *first; -+ else if (kstrtouint(last_str, 10, last)) -+ return -EINVAL; -+ -+ return 0; -+} -+ -+enum dfault_token { -+ TOK_INVALID, -+ -+ /* Queries */ -+ TOK_FUNC, -+ TOK_FILE, -+ TOK_LINE, -+ TOK_MODULE, -+ TOK_CLASS, -+ TOK_INDEX, -+ -+ /* Commands */ -+ TOK_DISABLE, -+ TOK_ENABLE, -+ TOK_ONESHOT, -+ TOK_FREQUENCY, -+}; -+ -+static const struct { -+ const char *str; -+ enum dfault_token tok; -+ unsigned args_required; -+} dfault_token_strs[] = { -+ { "func", TOK_FUNC, 1, }, -+ { "file", TOK_FILE, 1, }, -+ { "line", TOK_LINE, 1, }, -+ { "module", TOK_MODULE, 1, }, -+ { "class", TOK_CLASS, 1, }, -+ { "index", TOK_INDEX, 1, }, -+ { "disable", TOK_DISABLE, 0, }, -+ { "enable", TOK_ENABLE, 0, }, -+ { "oneshot", TOK_ONESHOT, 0, }, -+ { "frequency", TOK_FREQUENCY, 1, }, -+}; -+ -+static enum dfault_token str_to_token(const char *word, unsigned nr_words) -+{ -+ unsigned i; -+ -+ for (i = 0; i < ARRAY_SIZE(dfault_token_strs); i++) -+ if (!strcmp(word, dfault_token_strs[i].str)) { -+ if (nr_words < dfault_token_strs[i].args_required) { -+ pr_debug("insufficient arguments to \"%s\"", -+ word); -+ return TOK_INVALID; -+ } -+ -+ return dfault_token_strs[i].tok; -+ } -+ -+ pr_debug("unknown keyword \"%s\"", word); -+ -+ return TOK_INVALID; -+} -+ -+static int dfault_parse_command(struct dfault_query *query, -+ enum dfault_token tok, -+ char *words[], size_t nr_words) -+{ -+ unsigned i = 0; -+ int ret; -+ -+ switch (tok) { -+ case TOK_INVALID: -+ return -EINVAL; -+ case TOK_FUNC: -+ query->function = words[i++]; -+ case TOK_FILE: -+ query->filename = words[i++]; -+ return 1; -+ case TOK_LINE: -+ ret = parse_range(words[i++], -+ &query->first_line, -+ &query->last_line); -+ 
if (ret) -+ return ret; -+ query->match_line = true; -+ break; -+ case TOK_MODULE: -+ query->module = words[i++]; -+ break; -+ case TOK_CLASS: -+ query->class = words[i++]; -+ break; -+ case TOK_INDEX: -+ ret = parse_range(words[i++], -+ &query->first_index, -+ &query->last_index); -+ if (ret) -+ return ret; -+ query->match_index = true; -+ break; -+ case TOK_DISABLE: -+ query->set_enabled = true; -+ query->enabled = DFAULT_DISABLED; -+ break; -+ case TOK_ENABLE: -+ query->set_enabled = true; -+ query->enabled = DFAULT_ENABLED; -+ break; -+ case TOK_ONESHOT: -+ query->set_enabled = true; -+ query->enabled = DFAULT_ONESHOT; -+ break; -+ case TOK_FREQUENCY: -+ query->set_frequency = 1; -+ ret = kstrtouint(words[i++], 10, &query->frequency); -+ if (ret) -+ return ret; -+ -+ if (!query->set_enabled) { -+ query->set_enabled = 1; -+ query->enabled = DFAULT_ENABLED; -+ } -+ break; -+ } -+ -+ return i; -+} -+ -+/* -+ * Parse words[] as a dfault query specification, which is a series -+ * of (keyword, value) pairs chosen from these possibilities: -+ * -+ * func -+ * file -+ * file -+ * module -+ * line -+ * line - // where either may be empty -+ * index - // dynamic faults numbered from -+ * // to inside each matching function -+ */ -+static int dfault_parse_query(struct dfault_query *query, -+ char *words[], size_t nr_words) -+{ -+ unsigned i = 0; -+ -+ while (i < nr_words) { -+ const char *tok_str = words[i++]; -+ enum dfault_token tok = str_to_token(tok_str, nr_words - i); -+ int ret = dfault_parse_command(query, tok, words + i, -+ nr_words - i); -+ -+ if (ret < 0) -+ return ret; -+ i += ret; -+ BUG_ON(i > nr_words); -+ } -+ -+ return 0; -+} -+ -+/* -+ * File_ops->write method for /dynamic_fault/conrol. Gathers the -+ * command text from userspace, parses and executes it. 
-+ */ -+static ssize_t dfault_proc_write(struct file *file, const char __user *ubuf, -+ size_t len, loff_t *offp) -+{ -+ struct dfault_query query; -+#define MAXWORDS 9 -+ int nwords; -+ char *words[MAXWORDS]; -+ char tmpbuf[256]; -+ int ret; -+ -+ memset(&query, 0, sizeof(query)); -+ -+ if (len == 0) -+ return 0; -+ /* we don't check *offp -- multiple writes() are allowed */ -+ if (len > sizeof(tmpbuf)-1) -+ return -E2BIG; -+ if (copy_from_user(tmpbuf, ubuf, len)) -+ return -EFAULT; -+ tmpbuf[len] = '\0'; -+ -+ pr_debug("read %zu bytes from userspace", len); -+ -+ nwords = dfault_tokenize(tmpbuf, words, MAXWORDS); -+ if (nwords < 0) -+ return -EINVAL; -+ if (dfault_parse_query(&query, words, nwords)) -+ return -EINVAL; -+ -+ /* actually go and implement the change */ -+ ret = dfault_change(&query); -+ if (ret < 0) -+ return ret; -+ -+ *offp += len; -+ return len; -+} -+ -+/* Control file read code */ -+ -+/* -+ * Set the iterator to point to the first _dfault object -+ * and return a pointer to that first object. Returns -+ * NULL if there are no _dfaults at all. -+ */ -+static struct _dfault *dfault_iter_first(struct dfault_iter *iter) -+{ -+ if (list_empty(&dfault_tables)) { -+ iter->table = NULL; -+ iter->idx = 0; -+ return NULL; -+ } -+ iter->table = list_entry(dfault_tables.next, -+ struct dfault_table, link); -+ iter->idx = 0; -+ return &iter->table->dfaults[iter->idx]; -+} -+ -+/* -+ * Advance the iterator to point to the next _dfault -+ * object from the one the iterator currently points at, -+ * and returns a pointer to the new _dfault. Returns -+ * NULL if the iterator has seen all the _dfaults. 
-+ */ -+static struct _dfault *dfault_iter_next(struct dfault_iter *iter) -+{ -+ if (iter->table == NULL) -+ return NULL; -+ if (++iter->idx == iter->table->num_dfaults) { -+ /* iterate to next table */ -+ iter->idx = 0; -+ if (list_is_last(&iter->table->link, &dfault_tables)) { -+ iter->table = NULL; -+ return NULL; -+ } -+ iter->table = list_entry(iter->table->link.next, -+ struct dfault_table, link); -+ } -+ return &iter->table->dfaults[iter->idx]; -+} -+ -+/* -+ * Seq_ops start method. Called at the start of every -+ * read() call from userspace. Takes the dfault_lock and -+ * seeks the seq_file's iterator to the given position. -+ */ -+static void *dfault_proc_start(struct seq_file *m, loff_t *pos) -+{ -+ struct dfault_iter *iter = m->private; -+ struct _dfault *dp; -+ int n = *pos; -+ -+ mutex_lock(&dfault_lock); -+ -+ if (n < 0) -+ return NULL; -+ dp = dfault_iter_first(iter); -+ while (dp != NULL && --n >= 0) -+ dp = dfault_iter_next(iter); -+ return dp; -+} -+ -+/* -+ * Seq_ops next method. Called several times within a read() -+ * call from userspace, with dfault_lock held. Walks to the -+ * next _dfault object with a special case for the header line. -+ */ -+static void *dfault_proc_next(struct seq_file *m, void *p, loff_t *pos) -+{ -+ struct dfault_iter *iter = m->private; -+ struct _dfault *dp; -+ -+ if (p == SEQ_START_TOKEN) -+ dp = dfault_iter_first(iter); -+ else -+ dp = dfault_iter_next(iter); -+ ++*pos; -+ return dp; -+} -+ -+/* -+ * Seq_ops show method. Called several times within a read() -+ * call from userspace, with dfault_lock held. Formats the -+ * current _dfault as a single human-readable line, with a -+ * special case for the header line. 
-+ */ -+static int dfault_proc_show(struct seq_file *m, void *p) -+{ -+ struct dfault_iter *iter = m->private; -+ struct _dfault *df = p; -+ char flagsbuf[8]; -+ -+ seq_printf(m, "%s:%u class:%s module:%s func:%s %s \"\"\n", -+ df->filename, df->line, df->class, -+ iter->table->mod_name, df->function, -+ dfault_describe_flags(df, flagsbuf, sizeof(flagsbuf))); -+ -+ return 0; -+} -+ -+/* -+ * Seq_ops stop method. Called at the end of each read() -+ * call from userspace. Drops dfault_lock. -+ */ -+static void dfault_proc_stop(struct seq_file *m, void *p) -+{ -+ mutex_unlock(&dfault_lock); -+} -+ -+static const struct seq_operations dfault_proc_seqops = { -+ .start = dfault_proc_start, -+ .next = dfault_proc_next, -+ .show = dfault_proc_show, -+ .stop = dfault_proc_stop -+}; -+ -+/* -+ * File_ops->open method for /dynamic_fault/control. Does the seq_file -+ * setup dance, and also creates an iterator to walk the _dfaults. -+ * Note that we create a seq_file always, even for O_WRONLY files -+ * where it's not needed, as doing so simplifies the ->release method. -+ */ -+static int dfault_proc_open(struct inode *inode, struct file *file) -+{ -+ struct dfault_iter *iter; -+ int err; -+ -+ iter = kzalloc(sizeof(*iter), GFP_KERNEL); -+ if (iter == NULL) -+ return -ENOMEM; -+ -+ err = seq_open(file, &dfault_proc_seqops); -+ if (err) { -+ kfree(iter); -+ return err; -+ } -+ ((struct seq_file *) file->private_data)->private = iter; -+ return 0; -+} -+ -+static const struct file_operations dfault_proc_fops = { -+ .owner = THIS_MODULE, -+ .open = dfault_proc_open, -+ .read = seq_read, -+ .llseek = seq_lseek, -+ .release = seq_release_private, -+ .write = dfault_proc_write -+}; -+ -+/* -+ * Allocate a new dfault_table for the given module -+ * and add it to the global list. 
-+ */ -+int dfault_add_module(struct _dfault *tab, unsigned int n, -+ const char *name) -+{ -+ struct dfault_table *dt; -+ char *new_name; -+ const char *func = NULL; -+ int i; -+ -+ dt = kzalloc(sizeof(*dt), GFP_KERNEL); -+ if (dt == NULL) -+ return -ENOMEM; -+ new_name = kstrdup(name, GFP_KERNEL); -+ if (new_name == NULL) { -+ kfree(dt); -+ return -ENOMEM; -+ } -+ dt->mod_name = new_name; -+ dt->num_dfaults = n; -+ dt->dfaults = tab; -+ -+ mutex_lock(&dfault_lock); -+ list_add_tail(&dt->link, &dfault_tables); -+ mutex_unlock(&dfault_lock); -+ -+ /* __attribute__(("section")) emits things in reverse order */ -+ for (i = n - 1; i >= 0; i--) -+ if (!func || strcmp(tab[i].function, func)) -+ func = tab[i].function; -+ -+ return 0; -+} -+EXPORT_SYMBOL_GPL(dfault_add_module); -+ -+static void dfault_table_free(struct dfault_table *dt) -+{ -+ list_del_init(&dt->link); -+ kfree(dt->mod_name); -+ kfree(dt); -+} -+ -+/* -+ * Called in response to a module being unloaded. Removes -+ * any dfault_table's which point at the module. 
-+ */ -+int dfault_remove_module(char *mod_name) -+{ -+ struct dfault_table *dt, *nextdt; -+ int ret = -ENOENT; -+ -+ mutex_lock(&dfault_lock); -+ list_for_each_entry_safe(dt, nextdt, &dfault_tables, link) { -+ if (!strcmp(dt->mod_name, mod_name)) { -+ dfault_table_free(dt); -+ ret = 0; -+ } -+ } -+ mutex_unlock(&dfault_lock); -+ return ret; -+} -+EXPORT_SYMBOL_GPL(dfault_remove_module); -+ -+static void dfault_remove_all_tables(void) -+{ -+ mutex_lock(&dfault_lock); -+ while (!list_empty(&dfault_tables)) { -+ struct dfault_table *dt = list_entry(dfault_tables.next, -+ struct dfault_table, -+ link); -+ dfault_table_free(dt); -+ } -+ mutex_unlock(&dfault_lock); -+} -+ -+static int __init dynamic_fault_init(void) -+{ -+ struct dentry *dir, *file; -+ struct _dfault *iter, *iter_start; -+ const char *modname = NULL; -+ int ret = 0; -+ int n = 0; -+ -+ dir = debugfs_create_dir("dynamic_fault", NULL); -+ if (!dir) -+ return -ENOMEM; -+ file = debugfs_create_file("control", 0644, dir, NULL, -+ &dfault_proc_fops); -+ if (!file) { -+ debugfs_remove(dir); -+ return -ENOMEM; -+ } -+ if (__start___faults != __stop___faults) { -+ iter = __start___faults; -+ modname = iter->modname; -+ iter_start = iter; -+ for (; iter < __stop___faults; iter++) { -+ if (strcmp(modname, iter->modname)) { -+ ret = dfault_add_module(iter_start, n, modname); -+ if (ret) -+ goto out_free; -+ n = 0; -+ modname = iter->modname; -+ iter_start = iter; -+ } -+ n++; -+ } -+ ret = dfault_add_module(iter_start, n, modname); -+ } -+out_free: -+ if (ret) { -+ dfault_remove_all_tables(); -+ debugfs_remove(dir); -+ debugfs_remove(file); -+ } -+ return 0; -+} -+module_init(dynamic_fault_init); -diff --git a/mm/filemap.c b/mm/filemap.c -index 23a051a7ef0f..d39a3f28d6a9 100644 ---- a/mm/filemap.c -+++ b/mm/filemap.c -@@ -116,6 +116,69 @@ - * ->tasklist_lock (memory_failure, collect_procs_ao) - */ - -+static int page_cache_tree_insert_vec(struct page *pages[], -+ unsigned nr_pages, -+ struct address_space *mapping, 
-+ pgoff_t index, -+ gfp_t gfp_mask, -+ void *shadow[]) -+{ -+ XA_STATE(xas, &mapping->i_pages, index); -+ void *old; -+ int i = 0, error = 0; -+ -+ mapping_set_update(&xas, mapping); -+ -+ if (!nr_pages) -+ return 0; -+ -+ xa_lock_irq(&mapping->i_pages); -+ -+ while (1) { -+ old = xas_load(&xas); -+ if (old && !xa_is_value(old)) { -+ error = -EEXIST; -+ break; -+ } -+ -+ xas_store(&xas, pages[i]); -+ error = xas_error(&xas); -+ -+ if (error == -ENOMEM) { -+ xa_unlock_irq(&mapping->i_pages); -+ if (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK)) -+ error = 0; -+ xa_lock_irq(&mapping->i_pages); -+ -+ if (!error) -+ continue; -+ break; -+ } -+ -+ if (error) -+ break; -+ -+ if (shadow) -+ shadow[i] = old; -+ if (xa_is_value(old)) -+ mapping->nrexceptional--; -+ mapping->nrpages++; -+ -+ /* hugetlb pages do not participate in page cache accounting. */ -+ if (!PageHuge(pages[i])) -+ __inc_node_page_state(pages[i], NR_FILE_PAGES); -+ -+ if (++i == nr_pages) -+ break; -+ -+ xas_next(&xas); -+ } -+ -+ xa_unlock_irq(&mapping->i_pages); -+ -+ return i ?: error; -+} -+ - static void page_cache_delete(struct address_space *mapping, - struct page *page, void *shadow) - { -@@ -825,118 +888,154 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) - } - EXPORT_SYMBOL_GPL(replace_page_cache_page); - --static int __add_to_page_cache_locked(struct page *page, -- struct address_space *mapping, -- pgoff_t offset, gfp_t gfp_mask, -- void **shadowp) -+static int add_to_page_cache_vec(struct page **pages, unsigned nr_pages, -+ struct address_space *mapping, -+ pgoff_t index, gfp_t gfp_mask, -+ void *shadow[]) - { -- XA_STATE(xas, &mapping->i_pages, offset); -- int huge = PageHuge(page); - struct mem_cgroup *memcg; -- int error; -- void *old; -+ int i, nr_added = 0, error = 0; - -- VM_BUG_ON_PAGE(!PageLocked(page), page); -- VM_BUG_ON_PAGE(PageSwapBacked(page), page); -- mapping_set_update(&xas, mapping); -+ for (i = 0; i < nr_pages; i++) { -+ struct page *page 
= pages[i]; - -- if (!huge) { -- error = mem_cgroup_try_charge(page, current->mm, -- gfp_mask, &memcg, false); -- if (error) -- return error; -+ VM_BUG_ON_PAGE(PageSwapBacked(page), page); -+ VM_BUG_ON_PAGE(PageSwapCache(page), page); -+ -+ if (!PageHuge(page)) { -+ error = mem_cgroup_try_charge(page, current->mm, -+ gfp_mask, &memcg, false); -+ if (error) { -+ if (!i) -+ return error; -+ nr_pages = i; -+ break; -+ } -+ } -+ -+ __SetPageLocked(page); -+ get_page(page); -+ page->mapping = mapping; -+ page->index = index + i; - } - -- get_page(page); -- page->mapping = mapping; -- page->index = offset; -+ error = page_cache_tree_insert_vec(pages, nr_pages, mapping, -+ index, gfp_mask, shadow); -+ if (error > 0) { -+ nr_added = error; -+ error = 0; -+ } - -- do { -- xas_lock_irq(&xas); -- old = xas_load(&xas); -- if (old && !xa_is_value(old)) -- xas_set_err(&xas, -EEXIST); -- xas_store(&xas, page); -- if (xas_error(&xas)) -- goto unlock; -+ for (i = 0; i < nr_added; i++) { -+ struct page *page = pages[i]; - -- if (xa_is_value(old)) { -- mapping->nrexceptional--; -- if (shadowp) -- *shadowp = old; -- } -- mapping->nrpages++; -+ if (!PageHuge(page)) -+ mem_cgroup_commit_charge(page, memcg, false, false); - -- /* hugetlb pages do not participate in page cache accounting */ -- if (!huge) -- __inc_node_page_state(page, NR_FILE_PAGES); --unlock: -- xas_unlock_irq(&xas); -- } while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK)); -+ trace_mm_filemap_add_to_page_cache(page); -+ } - -- if (xas_error(&xas)) -- goto error; -+ for (i = nr_added; i < nr_pages; i++) { -+ struct page *page = pages[i]; - -- if (!huge) -- mem_cgroup_commit_charge(page, memcg, false, false); -- trace_mm_filemap_add_to_page_cache(page); -- return 0; --error: -- page->mapping = NULL; -- /* Leave page->index set: truncation relies upon it */ -- if (!huge) -- mem_cgroup_cancel_charge(page, memcg, false); -- put_page(page); -- return xas_error(&xas); -+ if (!PageHuge(page)) -+ mem_cgroup_cancel_charge(page, 
memcg, false); -+ -+ /* Leave page->index set: truncation relies upon it */ -+ page->mapping = NULL; -+ put_page(page); -+ __ClearPageLocked(page); -+ } -+ -+ return nr_added ?: error; - } --ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO); - - /** -- * add_to_page_cache_locked - add a locked page to the pagecache -+ * add_to_page_cache - add a newly allocated page to the pagecache - * @page: page to add - * @mapping: the page's address_space - * @offset: page index - * @gfp_mask: page allocation mode - * -- * This function is used to add a page to the pagecache. It must be locked. -- * This function does not add the page to the LRU. The caller must do that. -+ * This function is used to add a page to the pagecache. It must be newly -+ * allocated. This function does not add the page to the LRU. The caller must -+ * do that. - * - * Return: %0 on success, negative error code otherwise. - */ --int add_to_page_cache_locked(struct page *page, struct address_space *mapping, -- pgoff_t offset, gfp_t gfp_mask) -+int add_to_page_cache(struct page *page, struct address_space *mapping, -+ pgoff_t offset, gfp_t gfp_mask) - { -- return __add_to_page_cache_locked(page, mapping, offset, -- gfp_mask, NULL); -+ int ret = add_to_page_cache_vec(&page, 1, mapping, offset, -+ gfp_mask, NULL); -+ if (ret < 0) -+ return ret; -+ return 0; - } --EXPORT_SYMBOL(add_to_page_cache_locked); -+EXPORT_SYMBOL(add_to_page_cache); -+ALLOW_ERROR_INJECTION(add_to_page_cache, ERRNO); - --int add_to_page_cache_lru(struct page *page, struct address_space *mapping, -- pgoff_t offset, gfp_t gfp_mask) -+int add_to_page_cache_lru_vec(struct address_space *mapping, -+ struct page **pages, -+ unsigned nr_pages, -+ pgoff_t offset, gfp_t gfp_mask) - { -- void *shadow = NULL; -- int ret; -+ void *shadow_stack[8], **shadow = shadow_stack; -+ int i, ret = 0, err = 0, nr_added; -+ -+ if (nr_pages > ARRAY_SIZE(shadow_stack)) { -+ shadow = kmalloc_array(nr_pages, sizeof(void *), gfp_mask); -+ if (!shadow) -+ 
goto slowpath; -+ } -+ -+ for (i = 0; i < nr_pages; i++) -+ VM_BUG_ON_PAGE(PageActive(pages[i]), pages[i]); -+ -+ ret = add_to_page_cache_vec(pages, nr_pages, mapping, -+ offset, gfp_mask, shadow); -+ nr_added = ret > 0 ? ret : 0; -+ -+ /* -+ * The page might have been evicted from cache only recently, in which -+ * case it should be activated like any other repeatedly accessed page. -+ * The exception is pages getting rewritten; evicting other data from -+ * the working set, only to cache data that will get overwritten with -+ * something else, is a waste of memory. -+ */ -+ for (i = 0; i < nr_added; i++) { -+ struct page *page = pages[i]; -+ void *s = shadow[i]; - -- __SetPageLocked(page); -- ret = __add_to_page_cache_locked(page, mapping, offset, -- gfp_mask, &shadow); -- if (unlikely(ret)) -- __ClearPageLocked(page); -- else { -- /* -- * The page might have been evicted from cache only -- * recently, in which case it should be activated like -- * any other repeatedly accessed page. -- * The exception is pages getting rewritten; evicting other -- * data from the working set, only to cache data that will -- * get overwritten with something else, is a waste of memory. 
-- */ - WARN_ON_ONCE(PageActive(page)); -- if (!(gfp_mask & __GFP_WRITE) && shadow) -- workingset_refault(page, shadow); -+ if (!(gfp_mask & __GFP_WRITE) && s) -+ workingset_refault(page, s); - lru_cache_add(page); - } -+ -+ if (shadow != shadow_stack) -+ kfree(shadow); -+ - return ret; -+slowpath: -+ for (i = 0; i < nr_pages; i++) { -+ err = add_to_page_cache_lru(pages[i], mapping, -+ offset + i, gfp_mask); -+ if (err) -+ break; -+ } -+ -+ return i ?: err; -+} -+EXPORT_SYMBOL_GPL(add_to_page_cache_lru_vec); -+ -+int add_to_page_cache_lru(struct page *page, struct address_space *mapping, -+ pgoff_t offset, gfp_t gfp_mask) -+{ -+ int ret = add_to_page_cache_lru_vec(mapping, &page, 1, offset, gfp_mask); -+ if (ret < 0) -+ return ret; -+ return 0; - } - EXPORT_SYMBOL_GPL(add_to_page_cache_lru); - -@@ -1827,6 +1926,7 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, - - return ret; - } -+EXPORT_SYMBOL(find_get_pages_range); - - /** - * find_get_pages_contig - gang contiguous pagecache lookup -@@ -1975,6 +2075,222 @@ static void shrink_readahead_size_eio(struct file_ra_state *ra) - ra->ra_pages /= 4; - } - -+static struct page * -+generic_file_buffered_read_readpage(struct file *filp, -+ struct address_space *mapping, -+ struct page *page) -+{ -+ struct file_ra_state *ra = &filp->f_ra; -+ int error; -+ -+ /* -+ * A previous I/O error may have been due to temporary -+ * failures, eg. multipath errors. -+ * PG_error will be set again if readpage fails. -+ */ -+ ClearPageError(page); -+ /* Start the actual read. The read will unlock the page. */ -+ error = mapping->a_ops->readpage(filp, page); -+ -+ if (unlikely(error)) { -+ put_page(page); -+ return error != AOP_TRUNCATED_PAGE ? 
ERR_PTR(error) : NULL; -+ } -+ -+ if (!PageUptodate(page)) { -+ error = lock_page_killable(page); -+ if (unlikely(error)) { -+ put_page(page); -+ return ERR_PTR(error); -+ } -+ if (!PageUptodate(page)) { -+ if (page->mapping == NULL) { -+ /* -+ * invalidate_mapping_pages got it -+ */ -+ unlock_page(page); -+ put_page(page); -+ return NULL; -+ } -+ unlock_page(page); -+ shrink_readahead_size_eio(ra); -+ put_page(page); -+ return ERR_PTR(-EIO); -+ } -+ unlock_page(page); -+ } -+ -+ return page; -+} -+ -+static struct page * -+generic_file_buffered_read_pagenotuptodate(struct file *filp, -+ struct iov_iter *iter, -+ struct page *page, -+ loff_t pos, loff_t count) -+{ -+ struct address_space *mapping = filp->f_mapping; -+ struct inode *inode = mapping->host; -+ int error; -+ -+ /* -+ * See comment in do_read_cache_page on why -+ * wait_on_page_locked is used to avoid unnecessarily -+ * serialisations and why it's safe. -+ */ -+ error = wait_on_page_locked_killable(page); -+ if (unlikely(error)) { -+ put_page(page); -+ return ERR_PTR(error); -+ } -+ -+ if (PageUptodate(page)) -+ return page; -+ -+ if (inode->i_blkbits == PAGE_SHIFT || -+ !mapping->a_ops->is_partially_uptodate) -+ goto page_not_up_to_date; -+ /* pipes can't handle partially uptodate pages */ -+ if (unlikely(iov_iter_is_pipe(iter))) -+ goto page_not_up_to_date; -+ if (!trylock_page(page)) -+ goto page_not_up_to_date; -+ /* Did it get truncated before we got the lock? */ -+ if (!page->mapping) -+ goto page_not_up_to_date_locked; -+ -+ if (!mapping->a_ops->is_partially_uptodate(page, -+ pos & ~PAGE_MASK, count)) -+ goto page_not_up_to_date_locked; -+ unlock_page(page); -+ return page; -+ -+page_not_up_to_date: -+ /* Get exclusive access to the page ... */ -+ error = lock_page_killable(page); -+ if (unlikely(error)) { -+ put_page(page); -+ return ERR_PTR(error); -+ } -+ -+page_not_up_to_date_locked: -+ /* Did it get truncated before we got the lock? 
*/ -+ if (!page->mapping) { -+ unlock_page(page); -+ put_page(page); -+ return NULL; -+ } -+ -+ /* Did somebody else fill it already? */ -+ if (PageUptodate(page)) { -+ unlock_page(page); -+ return page; -+ } -+ -+ return generic_file_buffered_read_readpage(filp, mapping, page); -+} -+ -+static struct page * -+generic_file_buffered_read_no_cached_page(struct kiocb *iocb, -+ struct iov_iter *iter) -+{ -+ struct file *filp = iocb->ki_filp; -+ struct address_space *mapping = filp->f_mapping; -+ pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; -+ struct page *page; -+ int error; -+ -+ /* -+ * Ok, it wasn't cached, so we need to create a new -+ * page.. -+ */ -+ page = page_cache_alloc(mapping); -+ if (!page) -+ return ERR_PTR(-ENOMEM); -+ -+ error = add_to_page_cache_lru(page, mapping, index, -+ mapping_gfp_constraint(mapping, GFP_KERNEL)); -+ if (error) { -+ put_page(page); -+ return error != -EEXIST ? ERR_PTR(error) : NULL; -+ } -+ -+ return generic_file_buffered_read_readpage(filp, mapping, page); -+} -+ -+static int generic_file_buffered_read_get_pages(struct kiocb *iocb, -+ struct iov_iter *iter, -+ struct page **pages, -+ unsigned nr) -+{ -+ struct file *filp = iocb->ki_filp; -+ struct address_space *mapping = filp->f_mapping; -+ struct file_ra_state *ra = &filp->f_ra; -+ pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; -+ pgoff_t last_index = (iocb->ki_pos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT; -+ int i, j, ret, err = 0; -+ -+ nr = min_t(unsigned long, last_index - index, nr); -+find_page: -+ if (fatal_signal_pending(current)) -+ return -EINTR; -+ -+ ret = find_get_pages_contig(mapping, index, nr, pages); -+ if (ret) -+ goto got_pages; -+ -+ if (iocb->ki_flags & IOCB_NOWAIT) -+ return -EAGAIN; -+ -+ page_cache_sync_readahead(mapping, ra, filp, index, last_index - index); -+ -+ ret = find_get_pages_contig(mapping, index, nr, pages); -+ if (ret) -+ goto got_pages; -+ -+ pages[0] = generic_file_buffered_read_no_cached_page(iocb, iter); -+ err = 
PTR_ERR_OR_ZERO(pages[0]); -+ ret = !IS_ERR_OR_NULL(pages[0]); -+got_pages: -+ for (i = 0; i < ret; i++) { -+ struct page *page = pages[i]; -+ pgoff_t pg_index = index +i; -+ loff_t pg_pos = max(iocb->ki_pos, -+ (loff_t) pg_index << PAGE_SHIFT); -+ loff_t pg_count = iocb->ki_pos + iter->count - pg_pos; -+ -+ if (PageReadahead(page)) -+ page_cache_async_readahead(mapping, ra, filp, page, -+ pg_index, last_index - pg_index); -+ -+ if (!PageUptodate(page)) { -+ if (iocb->ki_flags & IOCB_NOWAIT) { -+ for (j = i; j < ret; j++) -+ put_page(pages[j]); -+ ret = i; -+ err = -EAGAIN; -+ break; -+ } -+ -+ page = generic_file_buffered_read_pagenotuptodate(filp, -+ iter, page, pg_pos, pg_count); -+ if (IS_ERR_OR_NULL(page)) { -+ for (j = i + 1; j < ret; j++) -+ put_page(pages[j]); -+ ret = i; -+ err = PTR_ERR_OR_ZERO(page); -+ break; -+ } -+ } -+ } -+ -+ if (likely(ret)) -+ return ret; -+ if (err) -+ return err; -+ goto find_page; -+} -+ - /** - * generic_file_buffered_read - generic file read routine - * @iocb: the iocb to read -@@ -1995,252 +2311,108 @@ static ssize_t generic_file_buffered_read(struct kiocb *iocb, - struct iov_iter *iter, ssize_t written) - { - struct file *filp = iocb->ki_filp; -+ struct file_ra_state *ra = &filp->f_ra; - struct address_space *mapping = filp->f_mapping; - struct inode *inode = mapping->host; -- struct file_ra_state *ra = &filp->f_ra; -- loff_t *ppos = &iocb->ki_pos; -- pgoff_t index; -- pgoff_t last_index; -- pgoff_t prev_index; -- unsigned long offset; /* offset into pagecache page */ -- unsigned int prev_offset; -- int error = 0; -- -- if (unlikely(*ppos >= inode->i_sb->s_maxbytes)) -+ size_t orig_count = iov_iter_count(iter); -+ struct page *page_array[8], **pages; -+ unsigned nr_pages = ARRAY_SIZE(page_array); -+ unsigned read_nr_pages = ((iocb->ki_pos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT) - -+ (iocb->ki_pos >> PAGE_SHIFT); -+ int i, pg_nr, error = 0; -+ bool writably_mapped; -+ loff_t isize, end_offset; -+ -+ if 
(unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes)) - return 0; - iov_iter_truncate(iter, inode->i_sb->s_maxbytes); - -- index = *ppos >> PAGE_SHIFT; -- prev_index = ra->prev_pos >> PAGE_SHIFT; -- prev_offset = ra->prev_pos & (PAGE_SIZE-1); -- last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT; -- offset = *ppos & ~PAGE_MASK; -- -- for (;;) { -- struct page *page; -- pgoff_t end_index; -- loff_t isize; -- unsigned long nr, ret; -+ if (read_nr_pages > nr_pages && -+ (pages = kmalloc_array(read_nr_pages, sizeof(void *), GFP_KERNEL))) -+ nr_pages = read_nr_pages; -+ else -+ pages = page_array; - -+ do { - cond_resched(); --find_page: -- if (fatal_signal_pending(current)) { -- error = -EINTR; -- goto out; -- } - -- page = find_get_page(mapping, index); -- if (!page) { -- if (iocb->ki_flags & IOCB_NOWAIT) -- goto would_block; -- page_cache_sync_readahead(mapping, -- ra, filp, -- index, last_index - index); -- page = find_get_page(mapping, index); -- if (unlikely(page == NULL)) -- goto no_cached_page; -- } -- if (PageReadahead(page)) { -- page_cache_async_readahead(mapping, -- ra, filp, page, -- index, last_index - index); -+ i = 0; -+ pg_nr = generic_file_buffered_read_get_pages(iocb, iter, -+ pages, nr_pages); -+ if (pg_nr < 0) { -+ error = pg_nr; -+ break; - } -- if (!PageUptodate(page)) { -- if (iocb->ki_flags & IOCB_NOWAIT) { -- put_page(page); -- goto would_block; -- } - -- /* -- * See comment in do_read_cache_page on why -- * wait_on_page_locked is used to avoid unnecessarily -- * serialisations and why it's safe. 
-- */ -- error = wait_on_page_locked_killable(page); -- if (unlikely(error)) -- goto readpage_error; -- if (PageUptodate(page)) -- goto page_ok; -- -- if (inode->i_blkbits == PAGE_SHIFT || -- !mapping->a_ops->is_partially_uptodate) -- goto page_not_up_to_date; -- /* pipes can't handle partially uptodate pages */ -- if (unlikely(iov_iter_is_pipe(iter))) -- goto page_not_up_to_date; -- if (!trylock_page(page)) -- goto page_not_up_to_date; -- /* Did it get truncated before we got the lock? */ -- if (!page->mapping) -- goto page_not_up_to_date_locked; -- if (!mapping->a_ops->is_partially_uptodate(page, -- offset, iter->count)) -- goto page_not_up_to_date_locked; -- unlock_page(page); -- } --page_ok: - /* -- * i_size must be checked after we know the page is Uptodate. -+ * i_size must be checked after we know the pages are Uptodate. - * - * Checking i_size after the check allows us to calculate - * the correct value for "nr", which means the zero-filled - * part of the page is not copied back to userspace (unless - * another truncate extends the file - this is desired though). - */ -- - isize = i_size_read(inode); -- end_index = (isize - 1) >> PAGE_SHIFT; -- if (unlikely(!isize || index > end_index)) { -- put_page(page); -- goto out; -- } -+ if (unlikely(iocb->ki_pos >= isize)) -+ goto put_pages; - -- /* nr is the maximum number of bytes to copy from this page */ -- nr = PAGE_SIZE; -- if (index == end_index) { -- nr = ((isize - 1) & ~PAGE_MASK) + 1; -- if (nr <= offset) { -- put_page(page); -- goto out; -- } -- } -- nr = nr - offset; -+ end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count); - -- /* If users can be writing to this page using arbitrary -- * virtual addresses, take care about potential aliasing -- * before reading the page on the kernel side. 
-- */ -- if (mapping_writably_mapped(mapping)) -- flush_dcache_page(page); -+ while ((iocb->ki_pos >> PAGE_SHIFT) + pg_nr > -+ (end_offset + PAGE_SIZE - 1) >> PAGE_SHIFT) -+ put_page(pages[--pg_nr]); - - /* -- * When a sequential read accesses a page several times, -- * only mark it as accessed the first time. -+ * Once we start copying data, we don't want to be touching any -+ * cachelines that might be contended: - */ -- if (prev_index != index || offset != prev_offset) -- mark_page_accessed(page); -- prev_index = index; -+ writably_mapped = mapping_writably_mapped(mapping); - - /* -- * Ok, we have the page, and it's up-to-date, so -- * now we can copy it to user space... -+ * When a sequential read accesses a page several times, only -+ * mark it as accessed the first time. - */ -+ if (iocb->ki_pos >> PAGE_SHIFT != -+ ra->prev_pos >> PAGE_SHIFT) -+ mark_page_accessed(pages[0]); -+ for (i = 1; i < pg_nr; i++) -+ mark_page_accessed(pages[i]); -+ -+ for (i = 0; i < pg_nr; i++) { -+ unsigned offset = iocb->ki_pos & ~PAGE_MASK; -+ unsigned bytes = min_t(loff_t, end_offset - iocb->ki_pos, -+ PAGE_SIZE - offset); -+ unsigned copied; - -- ret = copy_page_to_iter(page, offset, nr, iter); -- offset += ret; -- index += offset >> PAGE_SHIFT; -- offset &= ~PAGE_MASK; -- prev_offset = offset; -- -- put_page(page); -- written += ret; -- if (!iov_iter_count(iter)) -- goto out; -- if (ret < nr) { -- error = -EFAULT; -- goto out; -- } -- continue; -- --page_not_up_to_date: -- /* Get exclusive access to the page ... */ -- error = lock_page_killable(page); -- if (unlikely(error)) -- goto readpage_error; -- --page_not_up_to_date_locked: -- /* Did it get truncated before we got the lock? */ -- if (!page->mapping) { -- unlock_page(page); -- put_page(page); -- continue; -- } -+ /* -+ * If users can be writing to this page using arbitrary -+ * virtual addresses, take care about potential aliasing -+ * before reading the page on the kernel side. 
-+ */ -+ if (writably_mapped) -+ flush_dcache_page(pages[i]); - -- /* Did somebody else fill it already? */ -- if (PageUptodate(page)) { -- unlock_page(page); -- goto page_ok; -- } -+ copied = copy_page_to_iter(pages[i], offset, bytes, iter); - --readpage: -- /* -- * A previous I/O error may have been due to temporary -- * failures, eg. multipath errors. -- * PG_error will be set again if readpage fails. -- */ -- ClearPageError(page); -- /* Start the actual read. The read will unlock the page. */ -- error = mapping->a_ops->readpage(filp, page); -+ iocb->ki_pos += copied; -+ ra->prev_pos = iocb->ki_pos; - -- if (unlikely(error)) { -- if (error == AOP_TRUNCATED_PAGE) { -- put_page(page); -- error = 0; -- goto find_page; -- } -- goto readpage_error; -- } -- -- if (!PageUptodate(page)) { -- error = lock_page_killable(page); -- if (unlikely(error)) -- goto readpage_error; -- if (!PageUptodate(page)) { -- if (page->mapping == NULL) { -- /* -- * invalidate_mapping_pages got it -- */ -- unlock_page(page); -- put_page(page); -- goto find_page; -- } -- unlock_page(page); -- shrink_readahead_size_eio(ra); -- error = -EIO; -- goto readpage_error; -+ if (copied < bytes) { -+ error = -EFAULT; -+ break; - } -- unlock_page(page); - } -+put_pages: -+ for (i = 0; i < pg_nr; i++) -+ put_page(pages[i]); -+ } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); - -- goto page_ok; -- --readpage_error: -- /* UHHUH! A synchronous read error occurred. Report it */ -- put_page(page); -- goto out; -- --no_cached_page: -- /* -- * Ok, it wasn't cached, so we need to create a new -- * page.. 
-- */ -- page = page_cache_alloc(mapping); -- if (!page) { -- error = -ENOMEM; -- goto out; -- } -- error = add_to_page_cache_lru(page, mapping, index, -- mapping_gfp_constraint(mapping, GFP_KERNEL)); -- if (error) { -- put_page(page); -- if (error == -EEXIST) { -- error = 0; -- goto find_page; -- } -- goto out; -- } -- goto readpage; -- } -+ file_accessed(filp); -+ written += orig_count - iov_iter_count(iter); - --would_block: -- error = -EAGAIN; --out: -- ra->prev_pos = prev_index; -- ra->prev_pos <<= PAGE_SHIFT; -- ra->prev_pos |= prev_offset; -+ if (pages != page_array) -+ kfree(pages); - -- *ppos = ((loff_t)index << PAGE_SHIFT) + offset; -- file_accessed(filp); - return written ? written : error; - } - -diff --git a/mm/gup.c b/mm/gup.c -index 87a6a59fe667..6ecc36d28c04 100644 ---- a/mm/gup.c -+++ b/mm/gup.c -@@ -1093,6 +1093,13 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - } - cond_resched(); - -+ if (current->faults_disabled_mapping && -+ vma->vm_file && -+ vma->vm_file->f_mapping == current->faults_disabled_mapping) { -+ ret = -EFAULT; -+ goto out; -+ } -+ - page = follow_page_mask(vma, start, foll_flags, &ctx); - if (!page) { - ret = faultin_page(tsk, vma, start, &foll_flags, -diff --git a/mm/page-writeback.c b/mm/page-writeback.c -index 7326b54ab728..bdc2eb057b3b 100644 ---- a/mm/page-writeback.c -+++ b/mm/page-writeback.c -@@ -2467,20 +2467,19 @@ int __set_page_dirty_nobuffers(struct page *page) - lock_page_memcg(page); - if (!TestSetPageDirty(page)) { - struct address_space *mapping = page_mapping(page); -- unsigned long flags; - - if (!mapping) { - unlock_page_memcg(page); - return 1; - } - -- xa_lock_irqsave(&mapping->i_pages, flags); -+ xa_lock_irq(&mapping->i_pages); - BUG_ON(page_mapping(page) != mapping); - WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); - account_page_dirtied(page, mapping); - __xa_set_mark(&mapping->i_pages, page_index(page), - PAGECACHE_TAG_DIRTY); -- 
xa_unlock_irqrestore(&mapping->i_pages, flags); -+ xa_unlock_irq(&mapping->i_pages); - unlock_page_memcg(page); - - if (mapping->host) { diff --git a/linux57-tkg/linux57-tkg-patches/0009-glitched-bmq.patch b/linux57-tkg/linux57-tkg-patches/0009-glitched-bmq.patch deleted file mode 100644 index 38666e4..0000000 --- a/linux57-tkg/linux57-tkg-patches/0009-glitched-bmq.patch +++ /dev/null @@ -1,90 +0,0 @@ -From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 -From: Tk-Glitch -Date: Wed, 4 Jul 2018 04:30:08 +0200 -Subject: glitched - BMQ - -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 2a202a846757..1d9c7ed79b11 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -4,7 +4,7 @@ - - choice - prompt "Timer frequency" -- default HZ_250 -+ default HZ_500 - help - Allows the configuration of the timer frequency. It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -39,6 +39,13 @@ choice - on SMP and NUMA systems and exactly dividing by both PAL and - NTSC frame rates for video and multimedia work. - -+ config HZ_500 -+ bool "500 HZ" -+ help -+ 500 Hz is a balanced timer frequency. Provides fast interactivity -+ on desktops with great smoothness without increasing CPU power -+ consumption and sacrificing the battery life on laptops. -+ - config HZ_1000 - bool "1000 HZ" - help -@@ -52,6 +59,7 @@ config HZ - default 100 if HZ_100 - default 250 if HZ_250 - default 300 if HZ_300 -+ default 500 if HZ_500 - default 1000 if HZ_1000 - - config SCHED_HRTICK - -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 2a202a846757..1d9c7ed79b11 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -4,7 +4,7 @@ - - choice - prompt "Timer frequency" -- default HZ_500 -+ default HZ_750 - help - Allows the configuration of the timer frequency. 
It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -46,6 +46,13 @@ choice - on desktops with great smoothness without increasing CPU power - consumption and sacrificing the battery life on laptops. - -+ config HZ_750 -+ bool "750 HZ" -+ help -+ 750 Hz is a good timer frequency for desktops. Provides fast -+ interactivity with great smoothness without sacrificing too -+ much throughput. -+ - config HZ_1000 - bool "1000 HZ" - help -@@ -60,6 +67,7 @@ config HZ - default 250 if HZ_250 - default 300 if HZ_300 - default 500 if HZ_500 -+ default 750 if HZ_750 - default 1000 if HZ_1000 - - config SCHED_HRTICK - -diff --git a/mm/vmscan.c b/mm/vmscan.c -index 9270a4370d54..30d01e647417 100644 ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -159,7 +159,7 @@ struct scan_control { - /* - * From 0 .. 100. Higher means more swappy. - */ --int vm_swappiness = 60; -+int vm_swappiness = 20; - /* - * The total number of pages which are beyond the high watermark within all - * zones. 
diff --git a/linux57-tkg/linux57-tkg-patches/0009-glitched-ondemand-bmq.patch b/linux57-tkg/linux57-tkg-patches/0009-glitched-ondemand-bmq.patch deleted file mode 100644 index a926040..0000000 --- a/linux57-tkg/linux57-tkg-patches/0009-glitched-ondemand-bmq.patch +++ /dev/null @@ -1,18 +0,0 @@ -diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c -index 6b423eebfd5d..61e3271675d6 100644 ---- a/drivers/cpufreq/cpufreq_ondemand.c -+++ b/drivers/cpufreq/cpufreq_ondemand.c -@@ -21,10 +21,10 @@ - #include "cpufreq_ondemand.h" - - /* On-demand governor macros */ --#define DEF_FREQUENCY_UP_THRESHOLD (80) --#define DEF_SAMPLING_DOWN_FACTOR (1) -+#define DEF_FREQUENCY_UP_THRESHOLD (55) -+#define DEF_SAMPLING_DOWN_FACTOR (5) - #define MAX_SAMPLING_DOWN_FACTOR (100000) --#define MICRO_FREQUENCY_UP_THRESHOLD (95) -+#define MICRO_FREQUENCY_UP_THRESHOLD (63) - #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) - #define MIN_FREQUENCY_UP_THRESHOLD (1) - #define MAX_FREQUENCY_UP_THRESHOLD (100) diff --git a/linux57-tkg/linux57-tkg-patches/0009-prjc_v5.7-r3.patch b/linux57-tkg/linux57-tkg-patches/0009-prjc_v5.7-r3.patch deleted file mode 100644 index d95c1c6..0000000 --- a/linux57-tkg/linux57-tkg-patches/0009-prjc_v5.7-r3.patch +++ /dev/null @@ -1,7817 +0,0 @@ -diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index 5e2ce88d6eda..eda08ad54201 100644 ---- a/Documentation/admin-guide/kernel-parameters.txt -+++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -4445,6 +4445,12 @@ - - sbni= [NET] Granch SBNI12 leased line adapter - -+ sched_timeslice= -+ [KNL] Time slice in us for BMQ scheduler. -+ Format: (must be >= 1000) -+ Default: 4000 -+ See Documentation/scheduler/sched-BMQ.txt -+ - sched_debug [KNL] Enables verbose scheduler debug messages. - - schedstats= [KNL,X86] Enable or disable scheduled statistics. 
-diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst -index 0d427fd10941..e0e112c68fa5 100644 ---- a/Documentation/admin-guide/sysctl/kernel.rst -+++ b/Documentation/admin-guide/sysctl/kernel.rst -@@ -1230,3 +1230,13 @@ is 10 seconds. - - The softlockup threshold is (``2 * watchdog_thresh``). Setting this - tunable to zero will disable lockup detection altogether. -+ -+yield_type: -+=========== -+ -+BMQ CPU scheduler only. This determines what type of yield calls to -+sched_yield will perform. -+ -+ 0 - No yield. -+ 1 - Deboost and requeue task. (default) -+ 2 - Set run queue skip task. -diff --git a/Documentation/scheduler/sched-BMQ.txt b/Documentation/scheduler/sched-BMQ.txt -new file mode 100644 -index 000000000000..05c84eec0f31 ---- /dev/null -+++ b/Documentation/scheduler/sched-BMQ.txt -@@ -0,0 +1,110 @@ -+ BitMap queue CPU Scheduler -+ -------------------------- -+ -+CONTENT -+======== -+ -+ Background -+ Design -+ Overview -+ Task policy -+ Priority management -+ BitMap Queue -+ CPU Assignment and Migration -+ -+ -+Background -+========== -+ -+BitMap Queue CPU scheduler, referred to as BMQ from here on, is an evolution -+of previous Priority and Deadline based Skiplist multiple queue scheduler(PDS), -+and inspired by Zircon scheduler. The goal of it is to keep the scheduler code -+simple, while efficiency and scalable for interactive tasks, such as desktop, -+movie playback and gaming etc. -+ -+Design -+====== -+ -+Overview -+-------- -+ -+BMQ use per CPU run queue design, each CPU(logical) has it's own run queue, -+each CPU is responsible for scheduling the tasks that are putting into it's -+run queue. -+ -+The run queue is a set of priority queues. Note that these queues are fifo -+queue for non-rt tasks or priority queue for rt tasks in data structure. See -+BitMap Queue below for details. BMQ is optimized for non-rt tasks in the fact -+that most applications are non-rt tasks. 
No matter the queue is fifo or -+priority, In each queue is an ordered list of runnable tasks awaiting execution -+and the data structures are the same. When it is time for a new task to run, -+the scheduler simply looks the lowest numbered queueue that contains a task, -+and runs the first task from the head of that queue. And per CPU idle task is -+also in the run queue, so the scheduler can always find a task to run on from -+its run queue. -+ -+Each task will assigned the same timeslice(default 4ms) when it is picked to -+start running. Task will be reinserted at the end of the appropriate priority -+queue when it uses its whole timeslice. When the scheduler selects a new task -+from the priority queue it sets the CPU's preemption timer for the remainder of -+the previous timeslice. When that timer fires the scheduler will stop execution -+on that task, select another task and start over again. -+ -+If a task blocks waiting for a shared resource then it's taken out of its -+priority queue and is placed in a wait queue for the shared resource. When it -+is unblocked it will be reinserted in the appropriate priority queue of an -+eligible CPU. -+ -+Task policy -+----------- -+ -+BMQ supports DEADLINE, FIFO, RR, NORMAL, BATCH and IDLE task policy like the -+mainline CFS scheduler. But BMQ is heavy optimized for non-rt task, that's -+NORMAL/BATCH/IDLE policy tasks. Below is the implementation detail of each -+policy. -+ -+DEADLINE -+ It is squashed as priority 0 FIFO task. -+ -+FIFO/RR -+ All RT tasks share one single priority queue in BMQ run queue designed. The -+complexity of insert operation is O(n). BMQ is not designed for system runs -+with major rt policy tasks. -+ -+NORMAL/BATCH/IDLE -+ BATCH and IDLE tasks are treated as the same policy. They compete CPU with -+NORMAL policy tasks, but they just don't boost. To control the priority of -+NORMAL/BATCH/IDLE tasks, simply use nice level. -+ -+ISO -+ ISO policy is not supported in BMQ. 
Please use nice level -20 NORMAL policy -+task instead. -+ -+Priority management -+------------------- -+ -+RT tasks have priority from 0-99. For non-rt tasks, there are three different -+factors used to determine the effective priority of a task. The effective -+priority being what is used to determine which queue it will be in. -+ -+The first factor is simply the task’s static priority. Which is assigned from -+task's nice level, within [-20, 19] in userland's point of view and [0, 39] -+internally. -+ -+The second factor is the priority boost. This is a value bounded between -+[-MAX_PRIORITY_ADJ, MAX_PRIORITY_ADJ] used to offset the base priority, it is -+modified by the following cases: -+ -+*When a thread has used up its entire timeslice, always deboost its boost by -+increasing by one. -+*When a thread gives up cpu control(voluntary or non-voluntary) to reschedule, -+and its switch-in time(time after last switch and run) below the thredhold -+based on its priority boost, will boost its boost by decreasing by one buti is -+capped at 0 (won’t go negative). -+ -+The intent in this system is to ensure that interactive threads are serviced -+quickly. These are usually the threads that interact directly with the user -+and cause user-perceivable latency. These threads usually do little work and -+spend most of their time blocked awaiting another user event. So they get the -+priority boost from unblocking while background threads that do most of the -+processing receive the priority penalty for using their entire timeslice. 
-diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c -index f18d5067cd0f..fe489fc01c73 100644 ---- a/arch/powerpc/platforms/cell/spufs/sched.c -+++ b/arch/powerpc/platforms/cell/spufs/sched.c -@@ -51,11 +51,6 @@ static struct task_struct *spusched_task; - static struct timer_list spusched_timer; - static struct timer_list spuloadavg_timer; - --/* -- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0). -- */ --#define NORMAL_PRIO 120 -- - /* - * Frequency of the spu scheduler tick. By default we do one SPU scheduler - * tick for every 10 CPU scheduler ticks. -diff --git a/fs/proc/base.c b/fs/proc/base.c -index eb2255e95f62..62b8cedbccb6 100644 ---- a/fs/proc/base.c -+++ b/fs/proc/base.c -@@ -479,7 +479,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, - seq_puts(m, "0 0 0\n"); - else - seq_printf(m, "%llu %llu %lu\n", -- (unsigned long long)task->se.sum_exec_runtime, -+ (unsigned long long)tsk_seruntime(task), - (unsigned long long)task->sched_info.run_delay, - task->sched_info.pcount); - -diff --git a/include/asm-generic/resource.h b/include/asm-generic/resource.h -index 8874f681b056..59eb72bf7d5f 100644 ---- a/include/asm-generic/resource.h -+++ b/include/asm-generic/resource.h -@@ -23,7 +23,7 @@ - [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY }, \ - [RLIMIT_SIGPENDING] = { 0, 0 }, \ - [RLIMIT_MSGQUEUE] = { MQ_BYTES_MAX, MQ_BYTES_MAX }, \ -- [RLIMIT_NICE] = { 0, 0 }, \ -+ [RLIMIT_NICE] = { 30, 30 }, \ - [RLIMIT_RTPRIO] = { 0, 0 }, \ - [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY }, \ - } -diff --git a/include/linux/sched.h b/include/linux/sched.h -index 4418f5cb8324..1e8030513489 100644 ---- a/include/linux/sched.h -+++ b/include/linux/sched.h -@@ -652,13 +652,18 @@ struct task_struct { - unsigned int flags; - unsigned int ptrace; - --#ifdef CONFIG_SMP -+#if defined(CONFIG_SMP) - struct llist_node wake_entry; -+#endif -+#if defined(CONFIG_SMP) || 
defined(CONFIG_SCHED_ALT) - int on_cpu; -+#endif -+#ifdef CONFIG_SMP - #ifdef CONFIG_THREAD_INFO_IN_TASK - /* Current CPU: */ - unsigned int cpu; - #endif -+#ifndef CONFIG_SCHED_ALT - unsigned int wakee_flips; - unsigned long wakee_flip_decay_ts; - struct task_struct *last_wakee; -@@ -672,6 +677,7 @@ struct task_struct { - */ - int recent_used_cpu; - int wake_cpu; -+#endif /* !CONFIG_SCHED_ALT */ - #endif - int on_rq; - -@@ -680,13 +686,25 @@ struct task_struct { - int normal_prio; - unsigned int rt_priority; - -+#ifdef CONFIG_SCHED_ALT -+ u64 last_ran; -+ s64 time_slice; -+ int boost_prio; -+#ifdef CONFIG_SCHED_BMQ -+ int bmq_idx; -+ struct list_head bmq_node; -+#endif /* CONFIG_SCHED_BMQ */ -+ /* sched_clock time spent running */ -+ u64 sched_time; -+#else /* !CONFIG_SCHED_ALT */ - const struct sched_class *sched_class; - struct sched_entity se; - struct sched_rt_entity rt; -+ struct sched_dl_entity dl; -+#endif - #ifdef CONFIG_CGROUP_SCHED - struct task_group *sched_task_group; - #endif -- struct sched_dl_entity dl; - - #ifdef CONFIG_UCLAMP_TASK - /* Clamp values requested for a scheduling entity */ -@@ -1306,6 +1324,15 @@ struct task_struct { - */ - }; - -+#ifdef CONFIG_SCHED_ALT -+#define tsk_seruntime(t) ((t)->sched_time) -+/* replace the uncertian rt_timeout with 0UL */ -+#define tsk_rttimeout(t) (0UL) -+#else /* CFS */ -+#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) -+#define tsk_rttimeout(t) ((t)->rt.timeout) -+#endif /* !CONFIG_SCHED_ALT */ -+ - static inline struct pid *task_pid(struct task_struct *task) - { - return task->thread_pid; -diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h -index 1aff00b65f3c..da0306d2fedb 100644 ---- a/include/linux/sched/deadline.h -+++ b/include/linux/sched/deadline.h -@@ -1,5 +1,20 @@ - /* SPDX-License-Identifier: GPL-2.0 */ - -+#ifdef CONFIG_SCHED_ALT -+ -+#ifdef CONFIG_SCHED_BMQ -+#define __tsk_deadline(p) (0UL) -+ -+static inline int dl_task(struct task_struct *p) -+{ -+ return 0; -+} 
-+#endif -+ -+#else -+ -+#define __tsk_deadline(p) ((p)->dl.deadline) -+ - /* - * SCHED_DEADLINE tasks has negative priorities, reflecting - * the fact that any of them has higher prio than RT and -@@ -19,6 +34,7 @@ static inline int dl_task(struct task_struct *p) - { - return dl_prio(p->prio); - } -+#endif /* CONFIG_SCHED_ALT */ - - static inline bool dl_time_before(u64 a, u64 b) - { -diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h -index 7d64feafc408..ba6fd6a5b4b1 100644 ---- a/include/linux/sched/prio.h -+++ b/include/linux/sched/prio.h -@@ -20,11 +20,17 @@ - */ - - #define MAX_USER_RT_PRIO 100 -+ - #define MAX_RT_PRIO MAX_USER_RT_PRIO - - #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) - #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) - -+#ifdef CONFIG_SCHED_ALT -+/* +/- priority levels from the base priority */ -+#define MAX_PRIORITY_ADJ 4 -+#endif -+ - /* - * Convert user-nice values [ -20 ... 0 ... 19 ] - * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], -diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h -index e5af028c08b4..0a7565d0d3cf 100644 ---- a/include/linux/sched/rt.h -+++ b/include/linux/sched/rt.h -@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) - - if (policy == SCHED_FIFO || policy == SCHED_RR) - return true; -+#ifndef CONFIG_SCHED_ALT - if (policy == SCHED_DEADLINE) - return true; -+#endif - return false; - } - -diff --git a/init/Kconfig b/init/Kconfig -index 74a5ac65644f..4ef358fc7b51 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -689,9 +689,33 @@ config GENERIC_SCHED_CLOCK - - menu "Scheduler features" - -+menuconfig SCHED_ALT -+ bool "Alternative CPU Schedulers" -+ default y -+ help -+ This feature enable alternative CPU scheduler" -+ -+if SCHED_ALT -+ -+choice -+ prompt "Alternative CPU Scheduler" -+ default SCHED_BMQ -+ -+config SCHED_BMQ -+ bool "BMQ CPU scheduler" -+ help -+ The BitMap Queue CPU scheduler for excellent interactivity and -+ responsiveness on the 
desktop and solid scalability on normal -+ hardware and commodity servers. -+ -+endchoice -+ -+endif -+ - config UCLAMP_TASK - bool "Enable utilization clamping for RT/FAIR tasks" - depends on CPU_FREQ_GOV_SCHEDUTIL -+ depends on !SCHED_BMQ - help - This feature enables the scheduler to track the clamped utilization - of each CPU based on RUNNABLE tasks scheduled on that CPU. -@@ -777,6 +801,7 @@ config NUMA_BALANCING - depends on ARCH_SUPPORTS_NUMA_BALANCING - depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY - depends on SMP && NUMA && MIGRATION -+ depends on !SCHED_BMQ - help - This option adds support for automatic NUMA aware memory/task placement. - The mechanism is quite primitive and is based on migrating memory when -@@ -878,7 +903,7 @@ menuconfig CGROUP_SCHED - bandwidth allocation to such task groups. It uses cgroups to group - tasks. - --if CGROUP_SCHED -+if CGROUP_SCHED && !SCHED_BMQ - config FAIR_GROUP_SCHED - bool "Group scheduling for SCHED_OTHER" - depends on CGROUP_SCHED -@@ -1134,6 +1159,7 @@ config CHECKPOINT_RESTORE - - config SCHED_AUTOGROUP - bool "Automatic process group scheduling" -+ depends on !SCHED_BMQ - select CGROUPS - select CGROUP_SCHED - select FAIR_GROUP_SCHED -diff --git a/init/init_task.c b/init/init_task.c -index bd403ed3e418..737a814482d6 100644 ---- a/init/init_task.c -+++ b/init/init_task.c -@@ -67,9 +67,15 @@ struct task_struct init_task - .stack = init_stack, - .usage = REFCOUNT_INIT(2), - .flags = PF_KTHREAD, -+#ifdef CONFIG_SCHED_ALT -+ .prio = DEFAULT_PRIO + MAX_PRIORITY_ADJ, -+ .static_prio = DEFAULT_PRIO, -+ .normal_prio = DEFAULT_PRIO + MAX_PRIORITY_ADJ, -+#else - .prio = MAX_PRIO - 20, - .static_prio = MAX_PRIO - 20, - .normal_prio = MAX_PRIO - 20, -+#endif - .policy = SCHED_NORMAL, - .cpus_ptr = &init_task.cpus_mask, - .cpus_mask = CPU_MASK_ALL, -@@ -79,6 +85,14 @@ struct task_struct init_task - .restart_block = { - .fn = do_no_restart_syscall, - }, -+#ifdef CONFIG_SCHED_ALT -+ .boost_prio = 0, -+#ifdef CONFIG_SCHED_BMQ 
-+ .bmq_idx = 15, -+ .bmq_node = LIST_HEAD_INIT(init_task.bmq_node), -+#endif -+ .time_slice = HZ, -+#else - .se = { - .group_node = LIST_HEAD_INIT(init_task.se.group_node), - }, -@@ -86,6 +100,7 @@ struct task_struct init_task - .run_list = LIST_HEAD_INIT(init_task.rt.run_list), - .time_slice = RR_TIMESLICE, - }, -+#endif - .tasks = LIST_HEAD_INIT(init_task.tasks), - #ifdef CONFIG_SMP - .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), -diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c -index 729d3a5c772e..1e3dac9b6a43 100644 ---- a/kernel/cgroup/cpuset.c -+++ b/kernel/cgroup/cpuset.c -@@ -636,7 +636,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) - return ret; - } - --#ifdef CONFIG_SMP -+#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_ALT) - /* - * Helper routine for generate_sched_domains(). - * Do cpusets a, b have overlapping effective cpus_allowed masks? -@@ -1009,7 +1009,7 @@ static void rebuild_sched_domains_locked(void) - /* Have scheduler rebuild the domains */ - partition_and_rebuild_sched_domains(ndoms, doms, attr); - } --#else /* !CONFIG_SMP */ -+#else /* !CONFIG_SMP || CONFIG_SCHED_ALT */ - static void rebuild_sched_domains_locked(void) - { - } -diff --git a/kernel/delayacct.c b/kernel/delayacct.c -index 27725754ac99..769d773c7182 100644 ---- a/kernel/delayacct.c -+++ b/kernel/delayacct.c -@@ -106,7 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) - */ - t1 = tsk->sched_info.pcount; - t2 = tsk->sched_info.run_delay; -- t3 = tsk->se.sum_exec_runtime; -+ t3 = tsk_seruntime(tsk); - - d->cpu_count += t1; - -diff --git a/kernel/exit.c b/kernel/exit.c -index d56fe51bdf07..3aa2c1e822b0 100644 ---- a/kernel/exit.c -+++ b/kernel/exit.c -@@ -122,7 +122,7 @@ static void __exit_signal(struct task_struct *tsk) - sig->curr_target = next_thread(tsk); - } - -- add_device_randomness((const void*) &tsk->se.sum_exec_runtime, -+ add_device_randomness((const void*) 
&tsk_seruntime(tsk), - sizeof(unsigned long long)); - - /* -@@ -143,7 +143,7 @@ static void __exit_signal(struct task_struct *tsk) - sig->inblock += task_io_get_inblock(tsk); - sig->oublock += task_io_get_oublock(tsk); - task_io_accounting_add(&sig->ioac, &tsk->ioac); -- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; -+ sig->sum_sched_runtime += tsk_seruntime(tsk); - sig->nr_threads--; - __unhash_process(tsk, group_dead); - write_sequnlock(&sig->stats_lock); -diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c -index f6310f848f34..4176ad070bc9 100644 ---- a/kernel/livepatch/transition.c -+++ b/kernel/livepatch/transition.c -@@ -306,7 +306,11 @@ static bool klp_try_switch_task(struct task_struct *task) - */ - rq = task_rq_lock(task, &flags); - -+#ifdef CONFIG_SCHED_ALT -+ if (task_running(task) && task != current) { -+#else - if (task_running(rq, task) && task != current) { -+#endif - snprintf(err_buf, STACK_ERR_BUF_SIZE, - "%s: %s:%d is running\n", __func__, task->comm, - task->pid); -diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c -index c9f090d64f00..b5d0c7088021 100644 ---- a/kernel/locking/rtmutex.c -+++ b/kernel/locking/rtmutex.c -@@ -229,7 +229,7 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, - * Only use with rt_mutex_waiter_{less,equal}() - */ - #define task_to_waiter(p) \ -- &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline } -+ &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = __tsk_deadline(p) } - - static inline int - rt_mutex_waiter_less(struct rt_mutex_waiter *left, -@@ -238,6 +238,7 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left, - if (left->prio < right->prio) - return 1; - -+#ifndef CONFIG_SCHED_BMQ - /* - * If both waiters have dl_prio(), we check the deadlines of the - * associated tasks. 
-@@ -246,6 +247,7 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left, - */ - if (dl_prio(left->prio)) - return dl_time_before(left->deadline, right->deadline); -+#endif - - return 0; - } -@@ -257,6 +259,7 @@ rt_mutex_waiter_equal(struct rt_mutex_waiter *left, - if (left->prio != right->prio) - return 0; - -+#ifndef CONFIG_SCHED_BMQ - /* - * If both waiters have dl_prio(), we check the deadlines of the - * associated tasks. -@@ -265,6 +268,7 @@ rt_mutex_waiter_equal(struct rt_mutex_waiter *left, - */ - if (dl_prio(left->prio)) - return left->deadline == right->deadline; -+#endif - - return 1; - } -@@ -680,7 +684,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, - * the values of the node being removed. - */ - waiter->prio = task->prio; -- waiter->deadline = task->dl.deadline; -+ waiter->deadline = __tsk_deadline(task); - - rt_mutex_enqueue(lock, waiter); - -@@ -953,7 +957,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, - waiter->task = task; - waiter->lock = lock; - waiter->prio = task->prio; -- waiter->deadline = task->dl.deadline; -+ waiter->deadline = __tsk_deadline(task); - - /* Get the top priority waiter on the lock */ - if (rt_mutex_has_waiters(lock)) -diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile -index 21fb5a5662b5..1cad9ff599a4 100644 ---- a/kernel/sched/Makefile -+++ b/kernel/sched/Makefile -@@ -16,14 +16,20 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) - CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer - endif - --obj-y += core.o loadavg.o clock.o cputime.o --obj-y += idle.o fair.o rt.o deadline.o --obj-y += wait.o wait_bit.o swait.o completion.o -- --obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o -+ifdef CONFIG_SCHED_ALT -+obj-y += alt_core.o alt_debug.o -+else -+obj-y += core.o -+obj-y += fair.o rt.o deadline.o -+obj-$(CONFIG_SMP) += cpudeadline.o stop_task.o - obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o --obj-$(CONFIG_SCHEDSTATS) += stats.o - obj-$(CONFIG_SCHED_DEBUG) 
+= debug.o -+endif -+obj-y += loadavg.o clock.o cputime.o -+obj-y += idle.o -+obj-y += wait.o wait_bit.o swait.o completion.o -+obj-$(CONFIG_SMP) += cpupri.o pelt.o topology.o -+obj-$(CONFIG_SCHEDSTATS) += stats.o - obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o - obj-$(CONFIG_CPU_FREQ) += cpufreq.o - obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o -diff --git a/kernel/sched/alt_core.c b/kernel/sched/alt_core.c -new file mode 100644 -index 000000000000..48e5fac710bc ---- /dev/null -+++ b/kernel/sched/alt_core.c -@@ -0,0 +1,6057 @@ -+/* -+ * kernel/sched/alt_core.c -+ * -+ * Core alternative kernel scheduler code and related syscalls -+ * -+ * Copyright (C) 1991-2002 Linus Torvalds -+ * -+ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes -+ * a whole lot of those previous things. -+ * 2017-09-06 Priority and Deadline based Skip list multiple queue kernel -+ * scheduler by Alfred Chen. -+ * 2019-02-20 BMQ(BitMap Queue) kernel scheduler by Alfred Chen. -+ */ -+#include "sched.h" -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#include -+ -+#include "../workqueue_internal.h" -+#include "../../fs/io-wq.h" -+#include "../smpboot.h" -+ -+#include "pelt.h" -+ -+#define CREATE_TRACE_POINTS -+#include -+ -+/* rt_prio(prio) defined in include/linux/sched/rt.h */ -+#define rt_task(p) rt_prio((p)->prio) -+#define rt_policy(policy) ((policy) == SCHED_FIFO || (policy) == SCHED_RR) -+#define task_has_rt_policy(p) (rt_policy((p)->policy)) -+ -+#define STOP_PRIO (MAX_RT_PRIO - 1) -+ -+/* Default time slice is 4 in ms, can be set via kernel parameter "sched_timeslice" */ -+u64 sched_timeslice_ns __read_mostly = (4 * 1000 * 1000); -+ -+static int __init sched_timeslice(char *str) -+{ -+ int timeslice_us; -+ -+ get_option(&str, ×lice_us); -+ if (timeslice_us >= 1000) -+ sched_timeslice_ns = timeslice_us * 1000; -+ 
-+ return 0; -+} -+early_param("sched_timeslice", sched_timeslice); -+ -+/* Reschedule if less than this many μs left */ -+#define RESCHED_NS (100 * 1000) -+ -+/** -+ * sched_yield_type - Choose what sort of yield sched_yield will perform. -+ * 0: No yield. -+ * 1: Deboost and requeue task. (default) -+ * 2: Set rq skip task. -+ */ -+int sched_yield_type __read_mostly = 1; -+ -+#define rq_switch_time(rq) ((rq)->clock - (rq)->last_ts_switch) -+#define boost_threshold(p) (sched_timeslice_ns >>\ -+ (10 - MAX_PRIORITY_ADJ - (p)->boost_prio)) -+ -+static inline void boost_task(struct task_struct *p) -+{ -+ int limit; -+ -+ switch (p->policy) { -+ case SCHED_NORMAL: -+ limit = -MAX_PRIORITY_ADJ; -+ break; -+ case SCHED_BATCH: -+ case SCHED_IDLE: -+ limit = 0; -+ break; -+ default: -+ return; -+ } -+ -+ if (p->boost_prio > limit) -+ p->boost_prio--; -+} -+ -+static inline void deboost_task(struct task_struct *p) -+{ -+ if (p->boost_prio < MAX_PRIORITY_ADJ) -+ p->boost_prio++; -+} -+ -+#ifdef CONFIG_SMP -+static cpumask_t sched_rq_pending_mask ____cacheline_aligned_in_smp; -+ -+DEFINE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_CHK_LEVEL], sched_cpu_affinity_masks); -+DEFINE_PER_CPU(cpumask_t *, sched_cpu_affinity_end_mask); -+DEFINE_PER_CPU(cpumask_t *, sched_cpu_llc_mask); -+ -+#ifdef CONFIG_SCHED_SMT -+DEFINE_STATIC_KEY_FALSE(sched_smt_present); -+EXPORT_SYMBOL_GPL(sched_smt_present); -+#endif -+ -+/* -+ * Keep a unique ID per domain (we use the first CPUs number in the cpumask of -+ * the domain), this allows us to quickly tell if two cpus are in the same cache -+ * domain, see cpus_share_cache(). 
-+ */ -+DEFINE_PER_CPU(int, sd_llc_id); -+#endif /* CONFIG_SMP */ -+ -+static DEFINE_MUTEX(sched_hotcpu_mutex); -+ -+DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -+ -+#ifndef prepare_arch_switch -+# define prepare_arch_switch(next) do { } while (0) -+#endif -+#ifndef finish_arch_post_lock_switch -+# define finish_arch_post_lock_switch() do { } while (0) -+#endif -+ -+#define IDLE_WM (IDLE_TASK_SCHED_PRIO) -+ -+static cpumask_t sched_sg_idle_mask ____cacheline_aligned_in_smp; -+static cpumask_t sched_rq_watermark[SCHED_BITS] ____cacheline_aligned_in_smp; -+ -+static inline void update_sched_rq_watermark(struct rq *rq) -+{ -+ unsigned long watermark = find_first_bit(rq->queue.bitmap, SCHED_BITS); -+ unsigned long last_wm = rq->watermark; -+ unsigned long i; -+ int cpu; -+ -+ if (watermark == last_wm) -+ return; -+ -+ rq->watermark = watermark; -+ cpu = cpu_of(rq); -+ if (watermark < last_wm) { -+ for (i = watermark + 1; i <= last_wm; i++) -+ cpumask_andnot(&sched_rq_watermark[i], -+ &sched_rq_watermark[i], cpumask_of(cpu)); -+#ifdef CONFIG_SCHED_SMT -+ if (!static_branch_likely(&sched_smt_present)) -+ return; -+ if (IDLE_WM == last_wm) -+ cpumask_andnot(&sched_sg_idle_mask, -+ &sched_sg_idle_mask, cpu_smt_mask(cpu)); -+#endif -+ return; -+ } -+ /* last_wm < watermark */ -+ for (i = last_wm + 1; i <= watermark; i++) -+ cpumask_set_cpu(cpu, &sched_rq_watermark[i]); -+#ifdef CONFIG_SCHED_SMT -+ if (!static_branch_likely(&sched_smt_present)) -+ return; -+ if (IDLE_WM == watermark) { -+ cpumask_t tmp; -+ cpumask_and(&tmp, cpu_smt_mask(cpu), &sched_rq_watermark[IDLE_WM]); -+ if (cpumask_equal(&tmp, cpu_smt_mask(cpu))) -+ cpumask_or(&sched_sg_idle_mask, cpu_smt_mask(cpu), -+ &sched_sg_idle_mask); -+ } -+#endif -+} -+ -+static inline int task_sched_prio(struct task_struct *p) -+{ -+ return (p->prio < MAX_RT_PRIO)? 
p->prio : p->prio + p->boost_prio; -+} -+ -+#include "bmq_imp.h" -+ -+static inline struct task_struct *rq_runnable_task(struct rq *rq) -+{ -+ struct task_struct *next = sched_rq_first_task(rq); -+ -+ if (unlikely(next == rq->skip)) -+ next = sched_rq_next_task(next, rq); -+ -+ return next; -+} -+ -+/* -+ * Context: p->pi_lock -+ */ -+static inline struct rq -+*__task_access_lock(struct task_struct *p, raw_spinlock_t **plock) -+{ -+ struct rq *rq; -+ for (;;) { -+ rq = task_rq(p); -+ if (p->on_cpu || task_on_rq_queued(p)) { -+ raw_spin_lock(&rq->lock); -+ if (likely((p->on_cpu || task_on_rq_queued(p)) -+ && rq == task_rq(p))) { -+ *plock = &rq->lock; -+ return rq; -+ } -+ raw_spin_unlock(&rq->lock); -+ } else if (task_on_rq_migrating(p)) { -+ do { -+ cpu_relax(); -+ } while (unlikely(task_on_rq_migrating(p))); -+ } else { -+ *plock = NULL; -+ return rq; -+ } -+ } -+} -+ -+static inline void -+__task_access_unlock(struct task_struct *p, raw_spinlock_t *lock) -+{ -+ if (NULL != lock) -+ raw_spin_unlock(lock); -+} -+ -+static inline struct rq -+*task_access_lock_irqsave(struct task_struct *p, raw_spinlock_t **plock, -+ unsigned long *flags) -+{ -+ struct rq *rq; -+ for (;;) { -+ rq = task_rq(p); -+ if (p->on_cpu || task_on_rq_queued(p)) { -+ raw_spin_lock_irqsave(&rq->lock, *flags); -+ if (likely((p->on_cpu || task_on_rq_queued(p)) -+ && rq == task_rq(p))) { -+ *plock = &rq->lock; -+ return rq; -+ } -+ raw_spin_unlock_irqrestore(&rq->lock, *flags); -+ } else if (task_on_rq_migrating(p)) { -+ do { -+ cpu_relax(); -+ } while (unlikely(task_on_rq_migrating(p))); -+ } else { -+ raw_spin_lock_irqsave(&p->pi_lock, *flags); -+ if (likely(!p->on_cpu && !p->on_rq && -+ rq == task_rq(p))) { -+ *plock = &p->pi_lock; -+ return rq; -+ } -+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags); -+ } -+ } -+} -+ -+static inline void -+task_access_unlock_irqrestore(struct task_struct *p, raw_spinlock_t *lock, -+ unsigned long *flags) -+{ -+ raw_spin_unlock_irqrestore(lock, *flags); -+} -+ 
-+/* -+ * __task_rq_lock - lock the rq @p resides on. -+ */ -+struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ lockdep_assert_held(&p->pi_lock); -+ -+ for (;;) { -+ rq = task_rq(p); -+ raw_spin_lock(&rq->lock); -+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) -+ return rq; -+ raw_spin_unlock(&rq->lock); -+ -+ while (unlikely(task_on_rq_migrating(p))) -+ cpu_relax(); -+ } -+} -+ -+/* -+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. -+ */ -+struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(p->pi_lock) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ for (;;) { -+ raw_spin_lock_irqsave(&p->pi_lock, rf->flags); -+ rq = task_rq(p); -+ raw_spin_lock(&rq->lock); -+ /* -+ * move_queued_task() task_rq_lock() -+ * -+ * ACQUIRE (rq->lock) -+ * [S] ->on_rq = MIGRATING [L] rq = task_rq() -+ * WMB (__set_task_cpu()) ACQUIRE (rq->lock); -+ * [S] ->cpu = new_cpu [L] task_rq() -+ * [L] ->on_rq -+ * RELEASE (rq->lock) -+ * -+ * If we observe the old CPU in task_rq_lock(), the acquire of -+ * the old rq->lock will fully serialize against the stores. -+ * -+ * If we observe the new CPU in task_rq_lock(), the address -+ * dependency headed by '[L] rq = task_rq()' and the acquire -+ * will pair with the WMB to ensure we then also see migrating. 
-+ */ -+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { -+ return rq; -+ } -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); -+ -+ while (unlikely(task_on_rq_migrating(p))) -+ cpu_relax(); -+ } -+} -+ -+static inline void -+rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) -+ __acquires(rq->lock) -+{ -+ raw_spin_lock_irqsave(&rq->lock, rf->flags); -+} -+ -+static inline void -+rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock_irqrestore(&rq->lock, rf->flags); -+} -+ -+/* -+ * RQ-clock updating methods: -+ */ -+ -+static void update_rq_clock_task(struct rq *rq, s64 delta) -+{ -+/* -+ * In theory, the compile should just see 0 here, and optimize out the call -+ * to sched_rt_avg_update. But I don't trust it... -+ */ -+ s64 __maybe_unused steal = 0, irq_delta = 0; -+ -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; -+ -+ /* -+ * Since irq_time is only updated on {soft,}irq_exit, we might run into -+ * this case when a previous update_rq_clock() happened inside a -+ * {soft,}irq region. -+ * -+ * When this happens, we stop ->clock_task and only update the -+ * prev_irq_time stamp to account for the part that fit, so that a next -+ * update will consume the rest. This ensures ->clock_task is -+ * monotonic. -+ * -+ * It does however cause some slight miss-attribution of {soft,}irq -+ * time, a more accurate solution would be to update the irq_time using -+ * the current rq->clock timestamp, except that would require using -+ * atomic ops. 
-+ */ -+ if (irq_delta > delta) -+ irq_delta = delta; -+ -+ rq->prev_irq_time += irq_delta; -+ delta -= irq_delta; -+#endif -+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING -+ if (static_key_false((¶virt_steal_rq_enabled))) { -+ steal = paravirt_steal_clock(cpu_of(rq)); -+ steal -= rq->prev_steal_time_rq; -+ -+ if (unlikely(steal > delta)) -+ steal = delta; -+ -+ rq->prev_steal_time_rq += steal; -+ delta -= steal; -+ } -+#endif -+ -+ rq->clock_task += delta; -+ -+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ -+ if ((irq_delta + steal)) -+ update_irq_load_avg(rq, irq_delta + steal); -+#endif -+} -+ -+static inline void update_rq_clock(struct rq *rq) -+{ -+ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; -+ -+ if (unlikely(delta <= 0)) -+ return; -+ rq->clock += delta; -+ update_rq_clock_task(rq, delta); -+} -+ -+#ifdef CONFIG_NO_HZ_FULL -+/* -+ * Tick may be needed by tasks in the runqueue depending on their policy and -+ * requirements. If tick is needed, lets send the target an IPI to kick it out -+ * of nohz mode if necessary. 
-+ */ -+static inline void sched_update_tick_dependency(struct rq *rq) -+{ -+ int cpu; -+ -+ if (!tick_nohz_full_enabled()) -+ return; -+ -+ cpu = cpu_of(rq); -+ -+ if (!tick_nohz_full_cpu(cpu)) -+ return; -+ -+ if (rq->nr_running < 2) -+ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); -+ else -+ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); -+} -+#else /* !CONFIG_NO_HZ_FULL */ -+static inline void sched_update_tick_dependency(struct rq *rq) { } -+#endif -+ -+/* -+ * Add/Remove/Requeue task to/from the runqueue routines -+ * Context: rq->lock -+ */ -+static inline void dequeue_task(struct task_struct *p, struct rq *rq, int flags) -+{ -+ lockdep_assert_held(&rq->lock); -+ -+ WARN_ONCE(task_rq(p) != rq, "sched: dequeue task reside on cpu%d from cpu%d\n", -+ task_cpu(p), cpu_of(rq)); -+ -+ __SCHED_DEQUEUE_TASK(p, rq, flags, update_sched_rq_watermark(rq)); -+ --rq->nr_running; -+#ifdef CONFIG_SMP -+ if (1 == rq->nr_running) -+ cpumask_clear_cpu(cpu_of(rq), &sched_rq_pending_mask); -+#endif -+ -+ sched_update_tick_dependency(rq); -+} -+ -+static inline void enqueue_task(struct task_struct *p, struct rq *rq, int flags) -+{ -+ lockdep_assert_held(&rq->lock); -+ -+ WARN_ONCE(task_rq(p) != rq, "sched: enqueue task reside on cpu%d to cpu%d\n", -+ task_cpu(p), cpu_of(rq)); -+ -+ __SCHED_ENQUEUE_TASK(p, rq, flags); -+ update_sched_rq_watermark(rq); -+ ++rq->nr_running; -+#ifdef CONFIG_SMP -+ if (2 == rq->nr_running) -+ cpumask_set_cpu(cpu_of(rq), &sched_rq_pending_mask); -+#endif -+ -+ sched_update_tick_dependency(rq); -+ -+ /* -+ * If in_iowait is set, the code below may not trigger any cpufreq -+ * utilization updates, so do it here explicitly with the IOWAIT flag -+ * passed. 
-+ */ -+ if (p->in_iowait) -+ cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); -+} -+ -+static inline void requeue_task(struct task_struct *p, struct rq *rq) -+{ -+ lockdep_assert_held(&rq->lock); -+ WARN_ONCE(task_rq(p) != rq, "sched: cpu[%d] requeue task reside on cpu%d\n", -+ cpu_of(rq), task_cpu(p)); -+ -+ __requeue_task(p, rq); -+} -+ -+/* -+ * cmpxchg based fetch_or, macro so it works for different integer types -+ */ -+#define fetch_or(ptr, mask) \ -+ ({ \ -+ typeof(ptr) _ptr = (ptr); \ -+ typeof(mask) _mask = (mask); \ -+ typeof(*_ptr) _old, _val = *_ptr; \ -+ \ -+ for (;;) { \ -+ _old = cmpxchg(_ptr, _val, _val | _mask); \ -+ if (_old == _val) \ -+ break; \ -+ _val = _old; \ -+ } \ -+ _old; \ -+}) -+ -+#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) -+/* -+ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, -+ * this avoids any races wrt polling state changes and thereby avoids -+ * spurious IPIs. -+ */ -+static bool set_nr_and_not_polling(struct task_struct *p) -+{ -+ struct thread_info *ti = task_thread_info(p); -+ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); -+} -+ -+/* -+ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. -+ * -+ * If this returns true, then the idle task promises to call -+ * sched_ttwu_pending() and reschedule soon. 
-+ */ -+static bool set_nr_if_polling(struct task_struct *p) -+{ -+ struct thread_info *ti = task_thread_info(p); -+ typeof(ti->flags) old, val = READ_ONCE(ti->flags); -+ -+ for (;;) { -+ if (!(val & _TIF_POLLING_NRFLAG)) -+ return false; -+ if (val & _TIF_NEED_RESCHED) -+ return true; -+ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); -+ if (old == val) -+ break; -+ val = old; -+ } -+ return true; -+} -+ -+#else -+static bool set_nr_and_not_polling(struct task_struct *p) -+{ -+ set_tsk_need_resched(p); -+ return true; -+} -+ -+#ifdef CONFIG_SMP -+static bool set_nr_if_polling(struct task_struct *p) -+{ -+ return false; -+} -+#endif -+#endif -+ -+static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) -+{ -+ struct wake_q_node *node = &task->wake_q; -+ -+ /* -+ * Atomically grab the task, if ->wake_q is !nil already it means -+ * its already queued (either by us or someone else) and will get the -+ * wakeup due to that. -+ * -+ * In order to ensure that a pending wakeup will observe our pending -+ * state, even in the failed case, an explicit smp_mb() must be used. -+ */ -+ smp_mb__before_atomic(); -+ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) -+ return false; -+ -+ /* -+ * The head is context local, there can be no concurrency. -+ */ -+ *head->lastp = node; -+ head->lastp = &node->next; -+ return true; -+} -+ -+/** -+ * wake_q_add() - queue a wakeup for 'later' waking. -+ * @head: the wake_q_head to add @task to -+ * @task: the task to queue for 'later' wakeup -+ * -+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the -+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come -+ * instantly. -+ * -+ * This function must be used as-if it were wake_up_process(); IOW the task -+ * must be ready to be woken at this location. 
-+ */ -+void wake_q_add(struct wake_q_head *head, struct task_struct *task) -+{ -+ if (__wake_q_add(head, task)) -+ get_task_struct(task); -+} -+ -+/** -+ * wake_q_add_safe() - safely queue a wakeup for 'later' waking. -+ * @head: the wake_q_head to add @task to -+ * @task: the task to queue for 'later' wakeup -+ * -+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the -+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come -+ * instantly. -+ * -+ * This function must be used as-if it were wake_up_process(); IOW the task -+ * must be ready to be woken at this location. -+ * -+ * This function is essentially a task-safe equivalent to wake_q_add(). Callers -+ * that already hold reference to @task can call the 'safe' version and trust -+ * wake_q to do the right thing depending whether or not the @task is already -+ * queued for wakeup. -+ */ -+void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) -+{ -+ if (!__wake_q_add(head, task)) -+ put_task_struct(task); -+} -+ -+void wake_up_q(struct wake_q_head *head) -+{ -+ struct wake_q_node *node = head->first; -+ -+ while (node != WAKE_Q_TAIL) { -+ struct task_struct *task; -+ -+ task = container_of(node, struct task_struct, wake_q); -+ BUG_ON(!task); -+ /* task can safely be re-inserted now: */ -+ node = node->next; -+ task->wake_q.next = NULL; -+ -+ /* -+ * wake_up_process() executes a full barrier, which pairs with -+ * the queueing in wake_q_add() so as not to miss wakeups. -+ */ -+ wake_up_process(task); -+ put_task_struct(task); -+ } -+} -+ -+/* -+ * resched_curr - mark rq's current task 'to be rescheduled now'. -+ * -+ * On UP this means the setting of the need_resched flag, on SMP it -+ * might also involve a cross-CPU call to trigger the scheduler on -+ * the target CPU. 
-+ */ -+void resched_curr(struct rq *rq) -+{ -+ struct task_struct *curr = rq->curr; -+ int cpu; -+ -+ lockdep_assert_held(&rq->lock); -+ -+ if (test_tsk_need_resched(curr)) -+ return; -+ -+ cpu = cpu_of(rq); -+ if (cpu == smp_processor_id()) { -+ set_tsk_need_resched(curr); -+ set_preempt_need_resched(); -+ return; -+ } -+ -+ if (set_nr_and_not_polling(curr)) -+ smp_send_reschedule(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); -+} -+ -+void resched_cpu(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ if (cpu_online(cpu) || cpu == smp_processor_id()) -+ resched_curr(cpu_rq(cpu)); -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+} -+ -+#ifdef CONFIG_SMP -+#ifdef CONFIG_NO_HZ_COMMON -+void nohz_balance_enter_idle(int cpu) -+{ -+} -+ -+void select_nohz_load_balancer(int stop_tick) -+{ -+} -+ -+void set_cpu_sd_state_idle(void) {} -+ -+/* -+ * In the semi idle case, use the nearest busy CPU for migrating timers -+ * from an idle CPU. This is good for power-savings. -+ * -+ * We don't do similar optimization for completely idle system, as -+ * selecting an idle CPU will add more delays to the timers than intended -+ * (as that CPU's timer base may not be uptodate wrt jiffies etc). 
-+ */ -+int get_nohz_timer_target(void) -+{ -+ int i, cpu = smp_processor_id(), default_cpu = -1; -+ struct cpumask *mask; -+ -+ if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) { -+ if (!idle_cpu(cpu)) -+ return cpu; -+ default_cpu = cpu; -+ } -+ -+ for (mask = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); -+ mask < per_cpu(sched_cpu_affinity_end_mask, cpu); mask++) -+ for_each_cpu_and(i, mask, housekeeping_cpumask(HK_FLAG_TIMER)) -+ if (!idle_cpu(i)) -+ return i; -+ -+ if (default_cpu == -1) -+ default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER); -+ cpu = default_cpu; -+ -+ return cpu; -+} -+ -+/* -+ * When add_timer_on() enqueues a timer into the timer wheel of an -+ * idle CPU then this timer might expire before the next timer event -+ * which is scheduled to wake up that CPU. In case of a completely -+ * idle system the next event might even be infinite time into the -+ * future. wake_up_idle_cpu() ensures that the CPU is woken up and -+ * leaves the inner idle loop so the newly added timer is taken into -+ * account when the CPU goes back to idle and evaluates the timer -+ * wheel for the next timer event. -+ */ -+static inline void wake_up_idle_cpu(int cpu) -+{ -+ if (cpu == smp_processor_id()) -+ return; -+ -+ set_tsk_need_resched(cpu_rq(cpu)->idle); -+ smp_send_reschedule(cpu); -+} -+ -+static inline bool wake_up_full_nohz_cpu(int cpu) -+{ -+ /* -+ * We just need the target to call irq_exit() and re-evaluate -+ * the next tick. The nohz full kick at least implies that. -+ * If needed we can still optimize that later with an -+ * empty IRQ. 
-+ */ -+ if (tick_nohz_full_cpu(cpu)) { -+ if (cpu != smp_processor_id() || -+ tick_nohz_tick_stopped()) -+ tick_nohz_full_kick_cpu(cpu); -+ return true; -+ } -+ -+ return false; -+} -+ -+void wake_up_nohz_cpu(int cpu) -+{ -+ if (cpu_online(cpu) && !wake_up_full_nohz_cpu(cpu)) -+ wake_up_idle_cpu(cpu); -+} -+ -+static inline bool got_nohz_idle_kick(void) -+{ -+ int cpu = smp_processor_id(); -+ -+ /* TODO: need to support nohz_flag -+ if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK)) -+ return false; -+ */ -+ -+ if (idle_cpu(cpu) && !need_resched()) -+ return true; -+ -+ /* -+ * We can't run Idle Load Balance on this CPU for this time so we -+ * cancel it and clear NOHZ_BALANCE_KICK -+ */ -+ /* TODO: need to support nohz_flag -+ atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu)); -+ */ -+ return false; -+} -+ -+#else /* CONFIG_NO_HZ_COMMON */ -+ -+static inline bool got_nohz_idle_kick(void) -+{ -+ return false; -+} -+#endif /* CONFIG_NO_HZ_COMMON */ -+#endif /* CONFIG_SMP */ -+ -+static inline void check_preempt_curr(struct rq *rq) -+{ -+ if (sched_rq_first_task(rq) != rq->curr) -+ resched_curr(rq); -+} -+ -+#ifdef CONFIG_SCHED_HRTICK -+/* -+ * Use HR-timers to deliver accurate preemption points. -+ */ -+ -+static void hrtick_clear(struct rq *rq) -+{ -+ if (hrtimer_active(&rq->hrtick_timer)) -+ hrtimer_cancel(&rq->hrtick_timer); -+} -+ -+/* -+ * High-resolution timer tick. -+ * Runs from hardirq context with interrupts disabled. 
-+ */ -+static enum hrtimer_restart hrtick(struct hrtimer *timer) -+{ -+ struct rq *rq = container_of(timer, struct rq, hrtick_timer); -+ struct task_struct *p; -+ -+ WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); -+ -+ raw_spin_lock(&rq->lock); -+ p = rq->curr; -+ p->time_slice = 0; -+ resched_curr(rq); -+ raw_spin_unlock(&rq->lock); -+ -+ return HRTIMER_NORESTART; -+} -+ -+/* -+ * Use hrtick when: -+ * - enabled by features -+ * - hrtimer is actually high res -+ */ -+static inline int hrtick_enabled(struct rq *rq) -+{ -+ /** -+ * Alt schedule FW doesn't support sched_feat yet -+ if (!sched_feat(HRTICK)) -+ return 0; -+ */ -+ if (!cpu_active(cpu_of(rq))) -+ return 0; -+ return hrtimer_is_hres_active(&rq->hrtick_timer); -+} -+ -+#ifdef CONFIG_SMP -+ -+static void __hrtick_restart(struct rq *rq) -+{ -+ struct hrtimer *timer = &rq->hrtick_timer; -+ -+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); -+} -+ -+/* -+ * called from hardirq (IPI) context -+ */ -+static void __hrtick_start(void *arg) -+{ -+ struct rq *rq = arg; -+ -+ raw_spin_lock(&rq->lock); -+ __hrtick_restart(rq); -+ raw_spin_unlock(&rq->lock); -+} -+ -+/* -+ * Called to set the hrtick timer state. -+ * -+ * called with rq->lock held and irqs disabled -+ */ -+void hrtick_start(struct rq *rq, u64 delay) -+{ -+ struct hrtimer *timer = &rq->hrtick_timer; -+ ktime_t time; -+ s64 delta; -+ -+ /* -+ * Don't schedule slices shorter than 10000ns, that just -+ * doesn't make sense and can cause timer DoS. -+ */ -+ delta = max_t(s64, delay, 10000LL); -+ time = ktime_add_ns(timer->base->get_time(), delta); -+ -+ hrtimer_set_expires(timer, time); -+ -+ if (rq == this_rq()) -+ __hrtick_restart(rq); -+ else -+ smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); -+} -+ -+#else -+/* -+ * Called to set the hrtick timer state. 
-+ * -+ * called with rq->lock held and irqs disabled -+ */ -+void hrtick_start(struct rq *rq, u64 delay) -+{ -+ /* -+ * Don't schedule slices shorter than 10000ns, that just -+ * doesn't make sense. Rely on vruntime for fairness. -+ */ -+ delay = max_t(u64, delay, 10000LL); -+ hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), -+ HRTIMER_MODE_REL_PINNED_HARD); -+} -+#endif /* CONFIG_SMP */ -+ -+static void hrtick_rq_init(struct rq *rq) -+{ -+#ifdef CONFIG_SMP -+ rq->hrtick_csd.flags = 0; -+ rq->hrtick_csd.func = __hrtick_start; -+ rq->hrtick_csd.info = rq; -+#endif -+ -+ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); -+ rq->hrtick_timer.function = hrtick; -+} -+#else /* CONFIG_SCHED_HRTICK */ -+static inline int hrtick_enabled(struct rq *rq) -+{ -+ return 0; -+} -+ -+static inline void hrtick_clear(struct rq *rq) -+{ -+} -+ -+static inline void hrtick_rq_init(struct rq *rq) -+{ -+} -+#endif /* CONFIG_SCHED_HRTICK */ -+ -+static inline int normal_prio(struct task_struct *p) -+{ -+ if (task_has_rt_policy(p)) -+ return MAX_RT_PRIO - 1 - p->rt_priority; -+ -+ return p->static_prio + MAX_PRIORITY_ADJ; -+} -+ -+/* -+ * Calculate the current priority, i.e. the priority -+ * taken into account by the scheduler. This value might -+ * be boosted by RT tasks as it will be RT if the task got -+ * RT-boosted. If not then it returns p->normal_prio. -+ */ -+static int effective_prio(struct task_struct *p) -+{ -+ p->normal_prio = normal_prio(p); -+ /* -+ * If we are RT tasks or we were boosted to RT priority, -+ * keep the priority unchanged. Otherwise, update priority -+ * to the normal priority: -+ */ -+ if (!rt_prio(p->prio)) -+ return p->normal_prio; -+ return p->prio; -+} -+ -+/* -+ * activate_task - move a task to the runqueue. 
-+ * -+ * Context: rq->lock -+ */ -+static void activate_task(struct task_struct *p, struct rq *rq) -+{ -+ if (task_contributes_to_load(p)) -+ rq->nr_uninterruptible--; -+ enqueue_task(p, rq, ENQUEUE_WAKEUP); -+ p->on_rq = TASK_ON_RQ_QUEUED; -+ cpufreq_update_util(rq, 0); -+} -+ -+/* -+ * deactivate_task - remove a task from the runqueue. -+ * -+ * Context: rq->lock -+ */ -+static inline void deactivate_task(struct task_struct *p, struct rq *rq) -+{ -+ if (task_contributes_to_load(p)) -+ rq->nr_uninterruptible++; -+ dequeue_task(p, rq, DEQUEUE_SLEEP); -+ p->on_rq = 0; -+ cpufreq_update_util(rq, 0); -+} -+ -+static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) -+{ -+#ifdef CONFIG_SMP -+ /* -+ * After ->cpu is set up to a new value, task_access_lock(p, ...) can be -+ * successfully executed on another CPU. We must ensure that updates of -+ * per-task data have been completed by this moment. -+ */ -+ smp_wmb(); -+ -+#ifdef CONFIG_THREAD_INFO_IN_TASK -+ WRITE_ONCE(p->cpu, cpu); -+#else -+ WRITE_ONCE(task_thread_info(p)->cpu, cpu); -+#endif -+#endif -+} -+ -+#ifdef CONFIG_SMP -+void set_task_cpu(struct task_struct *p, unsigned int new_cpu) -+{ -+#ifdef CONFIG_SCHED_DEBUG -+ /* -+ * We should never call set_task_cpu() on a blocked task, -+ * ttwu() will sort out the placement. -+ */ -+ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && -+ !p->on_rq); -+#ifdef CONFIG_LOCKDEP -+ /* -+ * The caller should hold either p->pi_lock or rq->lock, when changing -+ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. -+ * -+ * sched_move_task() holds both and thus holding either pins the cgroup, -+ * see task_group(). -+ */ -+ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || -+ lockdep_is_held(&task_rq(p)->lock))); -+#endif -+ /* -+ * Clearly, migrating tasks to offline CPUs is a fairly daft thing. 
-+ */ -+ WARN_ON_ONCE(!cpu_online(new_cpu)); -+#endif -+ if (task_cpu(p) == new_cpu) -+ return; -+ trace_sched_migrate_task(p, new_cpu); -+ rseq_migrate(p); -+ perf_event_task_migrate(p); -+ -+ __set_task_cpu(p, new_cpu); -+} -+ -+static inline bool is_per_cpu_kthread(struct task_struct *p) -+{ -+ return ((p->flags & PF_KTHREAD) && (1 == p->nr_cpus_allowed)); -+} -+ -+/* -+ * Per-CPU kthreads are allowed to run on !active && online CPUs, see -+ * __set_cpus_allowed_ptr() and select_fallback_rq(). -+ */ -+static inline bool is_cpu_allowed(struct task_struct *p, int cpu) -+{ -+ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) -+ return false; -+ -+ if (is_per_cpu_kthread(p)) -+ return cpu_online(cpu); -+ -+ return cpu_active(cpu); -+} -+ -+/* -+ * This is how migration works: -+ * -+ * 1) we invoke migration_cpu_stop() on the target CPU using -+ * stop_one_cpu(). -+ * 2) stopper starts to run (implicitly forcing the migrated thread -+ * off the CPU) -+ * 3) it checks whether the migrated task is still in the wrong runqueue. -+ * 4) if it's in the wrong runqueue then the migration thread removes -+ * it and puts it into the right queue. -+ * 5) stopper completes and stop_one_cpu() returns and the migration -+ * is done. -+ */ -+ -+/* -+ * move_queued_task - move a queued task to new rq. -+ * -+ * Returns (locked) new rq. Old rq's lock is released. -+ */ -+static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int -+ new_cpu) -+{ -+ lockdep_assert_held(&rq->lock); -+ -+ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); -+ dequeue_task(p, rq, 0); -+ set_task_cpu(p, new_cpu); -+ raw_spin_unlock(&rq->lock); -+ -+ rq = cpu_rq(new_cpu); -+ -+ raw_spin_lock(&rq->lock); -+ BUG_ON(task_cpu(p) != new_cpu); -+ enqueue_task(p, rq, 0); -+ p->on_rq = TASK_ON_RQ_QUEUED; -+ check_preempt_curr(rq); -+ -+ return rq; -+} -+ -+struct migration_arg { -+ struct task_struct *task; -+ int dest_cpu; -+}; -+ -+/* -+ * Move (not current) task off this CPU, onto the destination CPU. 
We're doing -+ * this because either it can't run here any more (set_cpus_allowed() -+ * away from this CPU, or CPU going down), or because we're -+ * attempting to rebalance this task on exec (sched_exec). -+ * -+ * So we race with normal scheduler movements, but that's OK, as long -+ * as the task is no longer on this CPU. -+ */ -+static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int -+ dest_cpu) -+{ -+ /* Affinity changed (again). */ -+ if (!is_cpu_allowed(p, dest_cpu)) -+ return rq; -+ -+ update_rq_clock(rq); -+ return move_queued_task(rq, p, dest_cpu); -+} -+ -+/* -+ * migration_cpu_stop - this will be executed by a highprio stopper thread -+ * and performs thread migration by bumping thread off CPU then -+ * 'pushing' onto another runqueue. -+ */ -+static int migration_cpu_stop(void *data) -+{ -+ struct migration_arg *arg = data; -+ struct task_struct *p = arg->task; -+ struct rq *rq = this_rq(); -+ -+ /* -+ * The original target CPU might have gone down and we might -+ * be on another CPU but it doesn't matter. -+ */ -+ local_irq_disable(); -+ /* -+ * We need to explicitly wake pending tasks before running -+ * __migrate_task() such that we will not miss enforcing cpus_ptr -+ * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. -+ */ -+ sched_ttwu_pending(); -+ -+ raw_spin_lock(&p->pi_lock); -+ raw_spin_lock(&rq->lock); -+ /* -+ * If task_rq(p) != rq, it cannot be migrated here, because we're -+ * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because -+ * we're holding p->pi_lock. 
-+ */ -+ if (task_rq(p) == rq && task_on_rq_queued(p)) -+ rq = __migrate_task(rq, p, arg->dest_cpu); -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock(&p->pi_lock); -+ -+ local_irq_enable(); -+ return 0; -+} -+ -+static inline void -+set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ cpumask_copy(&p->cpus_mask, new_mask); -+ p->nr_cpus_allowed = cpumask_weight(new_mask); -+} -+ -+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ set_cpus_allowed_common(p, new_mask); -+} -+#endif -+ -+/** -+ * task_curr - is this task currently executing on a CPU? -+ * @p: the task in question. -+ * -+ * Return: 1 if the task is currently executing. 0 otherwise. -+ */ -+inline int task_curr(const struct task_struct *p) -+{ -+ return cpu_curr(task_cpu(p)) == p; -+} -+ -+#ifdef CONFIG_SMP -+/* -+ * wait_task_inactive - wait for a thread to unschedule. -+ * -+ * If @match_state is nonzero, it's the @p->state value just checked and -+ * not expected to change. If it changes, i.e. @p might have woken up, -+ * then return zero. When we succeed in waiting for @p to be off its CPU, -+ * we return a positive number (its total switch count). If a second call -+ * a short while later returns the same number, the caller can be sure that -+ * @p has remained unscheduled the whole time. -+ * -+ * The caller must ensure that the task *will* unschedule sometime soon, -+ * else this function might spin for a *long* time. This function can't -+ * be called with interrupts off, or it may introduce deadlock with -+ * smp_call_function() if an IPI is sent by the same process we are -+ * waiting to become inactive. 
-+ */ -+unsigned long wait_task_inactive(struct task_struct *p, long match_state) -+{ -+ unsigned long flags; -+ bool running, on_rq; -+ unsigned long ncsw; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ -+ for (;;) { -+ rq = task_rq(p); -+ -+ /* -+ * If the task is actively running on another CPU -+ * still, just relax and busy-wait without holding -+ * any locks. -+ * -+ * NOTE! Since we don't hold any locks, it's not -+ * even sure that "rq" stays as the right runqueue! -+ * But we don't care, since this will return false -+ * if the runqueue has changed and p is actually now -+ * running somewhere else! -+ */ -+ while (task_running(p) && p == rq->curr) { -+ if (match_state && unlikely(p->state != match_state)) -+ return 0; -+ cpu_relax(); -+ } -+ -+ /* -+ * Ok, time to look more closely! We need the rq -+ * lock now, to be *sure*. If we're wrong, we'll -+ * just go back and repeat. -+ */ -+ task_access_lock_irqsave(p, &lock, &flags); -+ trace_sched_wait_task(p); -+ running = task_running(p); -+ on_rq = p->on_rq; -+ ncsw = 0; -+ if (!match_state || p->state == match_state) -+ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ -+ task_access_unlock_irqrestore(p, lock, &flags); -+ -+ /* -+ * If it changed from the expected state, bail out now. -+ */ -+ if (unlikely(!ncsw)) -+ break; -+ -+ /* -+ * Was it really running after all now that we -+ * checked with the proper locks actually held? -+ * -+ * Oops. Go back and try again.. -+ */ -+ if (unlikely(running)) { -+ cpu_relax(); -+ continue; -+ } -+ -+ /* -+ * It's not enough that it's not actively running, -+ * it must be off the runqueue _entirely_, and not -+ * preempted! -+ * -+ * So if it was still runnable (but just not actively -+ * running right now), it's preempted, and we should -+ * yield - it could be a while. -+ */ -+ if (unlikely(on_rq)) { -+ ktime_t to = NSEC_PER_SEC / HZ; -+ -+ set_current_state(TASK_UNINTERRUPTIBLE); -+ schedule_hrtimeout(&to, HRTIMER_MODE_REL); -+ continue; -+ } -+ -+ /* -+ * Ahh, all good. 
It wasn't running, and it wasn't -+ * runnable, which means that it will never become -+ * running in the future either. We're all done! -+ */ -+ break; -+ } -+ -+ return ncsw; -+} -+ -+/*** -+ * kick_process - kick a running thread to enter/exit the kernel -+ * @p: the to-be-kicked thread -+ * -+ * Cause a process which is running on another CPU to enter -+ * kernel-mode, without any delay. (to get signals handled.) -+ * -+ * NOTE: this function doesn't have to take the runqueue lock, -+ * because all it wants to ensure is that the remote task enters -+ * the kernel. If the IPI races and the task has been migrated -+ * to another CPU then no harm is done and the purpose has been -+ * achieved as well. -+ */ -+void kick_process(struct task_struct *p) -+{ -+ int cpu; -+ -+ preempt_disable(); -+ cpu = task_cpu(p); -+ if ((cpu != smp_processor_id()) && task_curr(p)) -+ smp_send_reschedule(cpu); -+ preempt_enable(); -+} -+EXPORT_SYMBOL_GPL(kick_process); -+ -+/* -+ * ->cpus_ptr is protected by both rq->lock and p->pi_lock -+ * -+ * A few notes on cpu_active vs cpu_online: -+ * -+ * - cpu_active must be a subset of cpu_online -+ * -+ * - on CPU-up we allow per-CPU kthreads on the online && !active CPU, -+ * see __set_cpus_allowed_ptr(). At this point the newly online -+ * CPU isn't yet part of the sched domains, and balancing will not -+ * see it. -+ * -+ * - on cpu-down we clear cpu_active() to mask the sched domains and -+ * avoid the load balancer to place new tasks on the to be removed -+ * CPU. Existing tasks will remain running there and will be taken -+ * off. -+ * -+ * This means that fallback selection must not select !active CPUs. -+ * And can assume that any active CPU must be online. Conversely -+ * select_task_rq() below may allow selection of !active CPUs in order -+ * to satisfy the above rules. 
-+ */ -+static int select_fallback_rq(int cpu, struct task_struct *p) -+{ -+ int nid = cpu_to_node(cpu); -+ const struct cpumask *nodemask = NULL; -+ enum { cpuset, possible, fail } state = cpuset; -+ int dest_cpu; -+ -+ /* -+ * If the node that the CPU is on has been offlined, cpu_to_node() -+ * will return -1. There is no CPU on the node, and we should -+ * select the CPU on the other node. -+ */ -+ if (nid != -1) { -+ nodemask = cpumask_of_node(nid); -+ -+ /* Look for allowed, online CPU in same node. */ -+ for_each_cpu(dest_cpu, nodemask) { -+ if (!cpu_active(dest_cpu)) -+ continue; -+ if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) -+ return dest_cpu; -+ } -+ } -+ -+ for (;;) { -+ /* Any allowed, online CPU? */ -+ for_each_cpu(dest_cpu, p->cpus_ptr) { -+ if (!is_cpu_allowed(p, dest_cpu)) -+ continue; -+ goto out; -+ } -+ -+ /* No more Mr. Nice Guy. */ -+ switch (state) { -+ case cpuset: -+ if (IS_ENABLED(CONFIG_CPUSETS)) { -+ cpuset_cpus_allowed_fallback(p); -+ state = possible; -+ break; -+ } -+ /* Fall-through */ -+ case possible: -+ do_set_cpus_allowed(p, cpu_possible_mask); -+ state = fail; -+ break; -+ -+ case fail: -+ BUG(); -+ break; -+ } -+ } -+ -+out: -+ if (state != cpuset) { -+ /* -+ * Don't tell them about moving exiting tasks or -+ * kernel threads (both mm NULL), since they never -+ * leave kernel. 
-+ */ -+ if (p->mm && printk_ratelimit()) { -+ printk_deferred("process %d (%s) no longer affine to cpu%d\n", -+ task_pid_nr(p), p->comm, cpu); -+ } -+ } -+ -+ return dest_cpu; -+} -+ -+static inline int select_task_rq(struct task_struct *p) -+{ -+ cpumask_t chk_mask, tmp; -+ -+ if (unlikely(!cpumask_and(&chk_mask, p->cpus_ptr, cpu_online_mask))) -+ return select_fallback_rq(task_cpu(p), p); -+ -+ if ( -+#ifdef CONFIG_SCHED_SMT -+ cpumask_and(&tmp, &chk_mask, &sched_sg_idle_mask) || -+#endif -+ cpumask_and(&tmp, &chk_mask, &sched_rq_watermark[IDLE_WM]) || -+ cpumask_and(&tmp, &chk_mask, -+ &sched_rq_watermark[task_sched_prio(p) + 1])) -+ return best_mask_cpu(task_cpu(p), &tmp); -+ -+ return best_mask_cpu(task_cpu(p), &chk_mask); -+} -+ -+void sched_set_stop_task(int cpu, struct task_struct *stop) -+{ -+ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; -+ struct sched_param start_param = { .sched_priority = 0 }; -+ struct task_struct *old_stop = cpu_rq(cpu)->stop; -+ -+ if (stop) { -+ /* -+ * Make it appear like a SCHED_FIFO task, its something -+ * userspace knows about and won't get confused about. -+ * -+ * Also, it will make PI more or less work without too -+ * much confusion -- but then, stop work should not -+ * rely on PI working anyway. -+ */ -+ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); -+ } -+ -+ cpu_rq(cpu)->stop = stop; -+ -+ if (old_stop) { -+ /* -+ * Reset it back to a normal scheduling policy so that -+ * it can die in pieces. -+ */ -+ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); -+ } -+} -+ -+/* -+ * Change a given task's CPU affinity. Migrate the thread to a -+ * proper CPU and schedule it away if the CPU it's executing on -+ * is removed from the allowed bitmask. -+ * -+ * NOTE: the caller must have a valid reference to the task, the -+ * task must not exit() & deallocate itself prematurely. The -+ * call is not atomic; no spinlocks may be held. 
-+ */ -+static int __set_cpus_allowed_ptr(struct task_struct *p, -+ const struct cpumask *new_mask, bool check) -+{ -+ const struct cpumask *cpu_valid_mask = cpu_active_mask; -+ int dest_cpu; -+ unsigned long flags; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ int ret = 0; -+ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ rq = __task_access_lock(p, &lock); -+ -+ if (p->flags & PF_KTHREAD) { -+ /* -+ * Kernel threads are allowed on online && !active CPUs -+ */ -+ cpu_valid_mask = cpu_online_mask; -+ } -+ -+ /* -+ * Must re-check here, to close a race against __kthread_bind(), -+ * sched_setaffinity() is not guaranteed to observe the flag. -+ */ -+ if (check && (p->flags & PF_NO_SETAFFINITY)) { -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ if (cpumask_equal(&p->cpus_mask, new_mask)) -+ goto out; -+ -+ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); -+ if (dest_cpu >= nr_cpu_ids) { -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ do_set_cpus_allowed(p, new_mask); -+ -+ if (p->flags & PF_KTHREAD) { -+ /* -+ * For kernel threads that do indeed end up on online && -+ * !active we want to ensure they are strict per-CPU threads. -+ */ -+ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && -+ !cpumask_intersects(new_mask, cpu_active_mask) && -+ p->nr_cpus_allowed != 1); -+ } -+ -+ /* Can the task run on the task's current CPU? If so, we're done */ -+ if (cpumask_test_cpu(task_cpu(p), new_mask)) -+ goto out; -+ -+ if (task_running(p) || p->state == TASK_WAKING) { -+ struct migration_arg arg = { p, dest_cpu }; -+ -+ /* Need help from migration thread: drop lock and wait. */ -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); -+ return 0; -+ } -+ if (task_on_rq_queued(p)) { -+ /* -+ * OK, since we're going to drop the lock immediately -+ * afterwards anyway. 
-+ */ -+ update_rq_clock(rq); -+ rq = move_queued_task(rq, p, dest_cpu); -+ lock = &rq->lock; -+ } -+ -+out: -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+ return ret; -+} -+ -+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ return __set_cpus_allowed_ptr(p, new_mask, false); -+} -+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); -+ -+#else /* CONFIG_SMP */ -+ -+static inline int select_task_rq(struct task_struct *p) -+{ -+ return 0; -+} -+ -+static inline int -+__set_cpus_allowed_ptr(struct task_struct *p, -+ const struct cpumask *new_mask, bool check) -+{ -+ return set_cpus_allowed_ptr(p, new_mask); -+} -+ -+#endif /* CONFIG_SMP */ -+ -+static void -+ttwu_stat(struct task_struct *p, int cpu, int wake_flags) -+{ -+ struct rq *rq; -+ -+ if (!schedstat_enabled()) -+ return; -+ -+ rq= this_rq(); -+ -+#ifdef CONFIG_SMP -+ if (cpu == rq->cpu) -+ __schedstat_inc(rq->ttwu_local); -+ else { -+ /** Alt schedule FW ToDo: -+ * How to do ttwu_wake_remote -+ */ -+ } -+#endif /* CONFIG_SMP */ -+ -+ __schedstat_inc(rq->ttwu_count); -+} -+ -+/* -+ * Mark the task runnable and perform wakeup-preemption. 
-+ */ -+static inline void -+ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) -+{ -+ p->state = TASK_RUNNING; -+ trace_sched_wakeup(p); -+} -+ -+static inline void -+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) -+{ -+#ifdef CONFIG_SMP -+ if (p->sched_contributes_to_load) -+ rq->nr_uninterruptible--; -+#endif -+ -+ activate_task(p, rq); -+ ttwu_do_wakeup(rq, p, 0); -+} -+ -+static int ttwu_remote(struct task_struct *p, int wake_flags) -+{ -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ int ret = 0; -+ -+ rq = __task_access_lock(p, &lock); -+ if (task_on_rq_queued(p)) { -+ ttwu_do_wakeup(rq, p, wake_flags); -+ ret = 1; -+ } -+ __task_access_unlock(p, lock); -+ -+ return ret; -+} -+ -+#ifdef CONFIG_SMP -+void sched_ttwu_pending(void) -+{ -+ struct rq *rq = this_rq(); -+ struct llist_node *llist = llist_del_all(&rq->wake_list); -+ struct task_struct *p, *t; -+ struct rq_flags rf; -+ -+ if (!llist) -+ return; -+ -+ rq_lock_irqsave(rq, &rf); -+ update_rq_clock(rq); -+ -+ llist_for_each_entry_safe(p, t, llist, wake_entry) -+ ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0); -+ check_preempt_curr(rq); -+ -+ rq_unlock_irqrestore(rq, &rf); -+} -+ -+void scheduler_ipi(void) -+{ -+ /* -+ * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting -+ * TIF_NEED_RESCHED remotely (for the first time) will also send -+ * this IPI. -+ */ -+ preempt_fold_need_resched(); -+ -+ if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) -+ return; -+ -+ irq_enter(); -+ sched_ttwu_pending(); -+ -+ /* -+ * Check if someone kicked us for doing the nohz idle load balance. 
-+ */ -+ if (unlikely(got_nohz_idle_kick())) { -+ /* TODO need to kick off balance -+ this_rq()->idle_balance = 1; -+ raise_softirq_irqoff(SCHED_SOFTIRQ); -+ */ -+ } -+ irq_exit(); -+} -+ -+static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); -+ -+ if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { -+ if (!set_nr_if_polling(rq->idle)) -+ smp_send_reschedule(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); -+ } -+} -+ -+void wake_up_if_idle(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ rcu_read_lock(); -+ -+ if (!is_idle_task(rcu_dereference(rq->curr))) -+ goto out; -+ -+ if (set_nr_if_polling(rq->idle)) { -+ trace_sched_wake_idle_without_ipi(cpu); -+ } else { -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ if (is_idle_task(rq->curr)) -+ smp_send_reschedule(cpu); -+ /* Else CPU is not idle, do nothing here */ -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ } -+ -+out: -+ rcu_read_unlock(); -+} -+ -+bool cpus_share_cache(int this_cpu, int that_cpu) -+{ -+ return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); -+} -+#endif /* CONFIG_SMP */ -+ -+static inline void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+#if defined(CONFIG_SMP) -+ if (!cpus_share_cache(smp_processor_id(), cpu)) { -+ sched_clock_cpu(cpu); /* Sync clocks across CPUs */ -+ ttwu_queue_remote(p, cpu, wake_flags); -+ return; -+ } -+#endif -+ -+ raw_spin_lock(&rq->lock); -+ update_rq_clock(rq); -+ ttwu_do_activate(rq, p, wake_flags); -+ check_preempt_curr(rq); -+ raw_spin_unlock(&rq->lock); -+} -+ -+/* -+ * Notes on Program-Order guarantees on SMP systems. 
-+ * -+ * MIGRATION -+ * -+ * The basic program-order guarantee on SMP systems is that when a task [t] -+ * migrates, all its activity on its old CPU [c0] happens-before any subsequent -+ * execution on its new CPU [c1]. -+ * -+ * For migration (of runnable tasks) this is provided by the following means: -+ * -+ * A) UNLOCK of the rq(c0)->lock scheduling out task t -+ * B) migration for t is required to synchronize *both* rq(c0)->lock and -+ * rq(c1)->lock (if not at the same time, then in that order). -+ * C) LOCK of the rq(c1)->lock scheduling in task -+ * -+ * Transitivity guarantees that B happens after A and C after B. -+ * Note: we only require RCpc transitivity. -+ * Note: the CPU doing B need not be c0 or c1 -+ * -+ * Example: -+ * -+ * CPU0 CPU1 CPU2 -+ * -+ * LOCK rq(0)->lock -+ * sched-out X -+ * sched-in Y -+ * UNLOCK rq(0)->lock -+ * -+ * LOCK rq(0)->lock // orders against CPU0 -+ * dequeue X -+ * UNLOCK rq(0)->lock -+ * -+ * LOCK rq(1)->lock -+ * enqueue X -+ * UNLOCK rq(1)->lock -+ * -+ * LOCK rq(1)->lock // orders against CPU2 -+ * sched-out Z -+ * sched-in X -+ * UNLOCK rq(1)->lock -+ * -+ * -+ * BLOCKING -- aka. SLEEP + WAKEUP -+ * -+ * For blocking we (obviously) need to provide the same guarantee as for -+ * migration. However the means are completely different as there is no lock -+ * chain to provide order. 
Instead we do: -+ * -+ * 1) smp_store_release(X->on_cpu, 0) -+ * 2) smp_cond_load_acquire(!X->on_cpu) -+ * -+ * Example: -+ * -+ * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule) -+ * -+ * LOCK rq(0)->lock LOCK X->pi_lock -+ * dequeue X -+ * sched-out X -+ * smp_store_release(X->on_cpu, 0); -+ * -+ * smp_cond_load_acquire(&X->on_cpu, !VAL); -+ * X->state = WAKING -+ * set_task_cpu(X,2) -+ * -+ * LOCK rq(2)->lock -+ * enqueue X -+ * X->state = RUNNING -+ * UNLOCK rq(2)->lock -+ * -+ * LOCK rq(2)->lock // orders against CPU1 -+ * sched-out Z -+ * sched-in X -+ * UNLOCK rq(2)->lock -+ * -+ * UNLOCK X->pi_lock -+ * UNLOCK rq(0)->lock -+ * -+ * -+ * However; for wakeups there is a second guarantee we must provide, namely we -+ * must observe the state that lead to our wakeup. That is, not only must our -+ * task observe its own prior state, it must also observe the stores prior to -+ * its wakeup. -+ * -+ * This means that any means of doing remote wakeups must order the CPU doing -+ * the wakeup against the CPU the task is going to end up running on. This, -+ * however, is already required for the regular Program-Order guarantee above, -+ * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire). -+ * -+ */ -+ -+/*** -+ * try_to_wake_up - wake up a thread -+ * @p: the thread to be awakened -+ * @state: the mask of task states that can be woken -+ * @wake_flags: wake modifier flags (WF_*) -+ * -+ * Put it on the run-queue if it's not already there. The "current" -+ * thread is always on the run-queue (except when the actual -+ * re-schedule is in progress), and as such you're allowed to do -+ * the simpler "current->state = TASK_RUNNING" to mark yourself -+ * runnable without the overhead of this. -+ * -+ * Return: %true if @p was woken up, %false if it was already running. -+ * or @state didn't match @p's state. 
-+ */ -+static int try_to_wake_up(struct task_struct *p, unsigned int state, -+ int wake_flags) -+{ -+ unsigned long flags; -+ int cpu, success = 0; -+ -+ preempt_disable(); -+ if (p == current) { -+ /* -+ * We're waking current, this means 'p->on_rq' and 'task_cpu(p) -+ * == smp_processor_id()'. Together this means we can special -+ * case the whole 'p->on_rq && ttwu_remote()' case below -+ * without taking any locks. -+ * -+ * In particular: -+ * - we rely on Program-Order guarantees for all the ordering, -+ * - we're serialized against set_special_state() by virtue of -+ * it disabling IRQs (this allows not taking ->pi_lock). -+ */ -+ if (!(p->state & state)) -+ goto out; -+ -+ success = 1; -+ cpu = task_cpu(p); -+ trace_sched_waking(p); -+ p->state = TASK_RUNNING; -+ trace_sched_wakeup(p); -+ goto out; -+ } -+ -+ /* -+ * If we are going to wake up a thread waiting for CONDITION we -+ * need to ensure that CONDITION=1 done by the caller can not be -+ * reordered with p->state check below. This pairs with mb() in -+ * set_current_state() the waiting thread does. -+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ smp_mb__after_spinlock(); -+ if (!(p->state & state)) -+ goto unlock; -+ -+ trace_sched_waking(p); -+ -+ /* We're going to change ->state: */ -+ success = 1; -+ cpu = task_cpu(p); -+ -+ /* -+ * Ensure we load p->on_rq _after_ p->state, otherwise it would -+ * be possible to, falsely, observe p->on_rq == 0 and get stuck -+ * in smp_cond_load_acquire() below. -+ * -+ * sched_ttwu_pending() try_to_wake_up() -+ * STORE p->on_rq = 1 LOAD p->state -+ * UNLOCK rq->lock -+ * -+ * __schedule() (switch to task 'p') -+ * LOCK rq->lock smp_rmb(); -+ * smp_mb__after_spinlock(); -+ * UNLOCK rq->lock -+ * -+ * [task p] -+ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq -+ * -+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in -+ * __schedule(). See the comment for smp_mb__after_spinlock(). 
-+ */ -+ smp_rmb(); -+ if (p->on_rq && ttwu_remote(p, wake_flags)) -+ goto unlock; -+ -+#ifdef CONFIG_SMP -+ /* -+ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be -+ * possible to, falsely, observe p->on_cpu == 0. -+ * -+ * One must be running (->on_cpu == 1) in order to remove oneself -+ * from the runqueue. -+ * -+ * __schedule() (switch to task 'p') try_to_wake_up() -+ * STORE p->on_cpu = 1 LOAD p->on_rq -+ * UNLOCK rq->lock -+ * -+ * __schedule() (put 'p' to sleep) -+ * LOCK rq->lock smp_rmb(); -+ * smp_mb__after_spinlock(); -+ * STORE p->on_rq = 0 LOAD p->on_cpu -+ * -+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in -+ * __schedule(). See the comment for smp_mb__after_spinlock(). -+ */ -+ smp_rmb(); -+ -+ /* -+ * If the owning (remote) CPU is still in the middle of schedule() with -+ * this task as prev, wait until its done referencing the task. -+ * -+ * Pairs with the smp_store_release() in finish_task(). -+ * -+ * This ensures that tasks getting woken will be fully ordered against -+ * their previous state and preserve Program Order. 
-+ */ -+ smp_cond_load_acquire(&p->on_cpu, !VAL); -+ -+ p->sched_contributes_to_load = !!task_contributes_to_load(p); -+ p->state = TASK_WAKING; -+ -+ if (p->in_iowait) { -+ delayacct_blkio_end(p); -+ atomic_dec(&task_rq(p)->nr_iowait); -+ } -+ -+ if(this_rq()->clock_task - p->last_ran > sched_timeslice_ns) -+ boost_task(p); -+ -+ cpu = select_task_rq(p); -+ -+ if (cpu != task_cpu(p)) { -+ wake_flags |= WF_MIGRATED; -+ psi_ttwu_dequeue(p); -+ set_task_cpu(p, cpu); -+ } -+#else /* CONFIG_SMP */ -+ if (p->in_iowait) { -+ delayacct_blkio_end(p); -+ atomic_dec(&task_rq(p)->nr_iowait); -+ } -+#endif /* CONFIG_SMP */ -+ -+ ttwu_queue(p, cpu, wake_flags); -+unlock: -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+out: -+ if (success) -+ ttwu_stat(p, cpu, wake_flags); -+ preempt_enable(); -+ -+ return success; -+} -+ -+/** -+ * wake_up_process - Wake up a specific process -+ * @p: The process to be woken up. -+ * -+ * Attempt to wake up the nominated process and move it to the set of runnable -+ * processes. -+ * -+ * Return: 1 if the process was woken up, 0 if it was already running. -+ * -+ * This function executes a full memory barrier before accessing the task state. -+ */ -+int wake_up_process(struct task_struct *p) -+{ -+ return try_to_wake_up(p, TASK_NORMAL, 0); -+} -+EXPORT_SYMBOL(wake_up_process); -+ -+int wake_up_state(struct task_struct *p, unsigned int state) -+{ -+ return try_to_wake_up(p, state, 0); -+} -+ -+/* -+ * Perform scheduler related setup for a newly forked process p. -+ * p is forked by current. 
-+ * -+ * __sched_fork() is basic setup used by init_idle() too: -+ */ -+static inline void __sched_fork(unsigned long clone_flags, struct task_struct *p) -+{ -+ p->on_rq = 0; -+ p->on_cpu = 0; -+ p->utime = 0; -+ p->stime = 0; -+ p->sched_time = 0; -+ -+#ifdef CONFIG_PREEMPT_NOTIFIERS -+ INIT_HLIST_HEAD(&p->preempt_notifiers); -+#endif -+ -+#ifdef CONFIG_COMPACTION -+ p->capture_control = NULL; -+#endif -+} -+ -+/* -+ * fork()/clone()-time setup: -+ */ -+int sched_fork(unsigned long clone_flags, struct task_struct *p) -+{ -+ unsigned long flags; -+ struct rq *rq; -+ -+ __sched_fork(clone_flags, p); -+ /* -+ * We mark the process as NEW here. This guarantees that -+ * nobody will actually run it, and a signal or other external -+ * event cannot wake it up and insert it on the runqueue either. -+ */ -+ p->state = TASK_NEW; -+ -+ /* -+ * Make sure we do not leak PI boosting priority to the child. -+ */ -+ p->prio = current->normal_prio; -+ -+ /* -+ * Revert to default priority/policy on fork if requested. -+ */ -+ if (unlikely(p->sched_reset_on_fork)) { -+ if (task_has_rt_policy(p)) { -+ p->policy = SCHED_NORMAL; -+ p->static_prio = NICE_TO_PRIO(0); -+ p->rt_priority = 0; -+ } else if (PRIO_TO_NICE(p->static_prio) < 0) -+ p->static_prio = NICE_TO_PRIO(0); -+ -+ p->prio = p->normal_prio = normal_prio(p); -+ -+ /* -+ * We don't need the reset flag anymore after the fork. It has -+ * fulfilled its duty: -+ */ -+ p->sched_reset_on_fork = 0; -+ } -+ -+ p->boost_prio = (p->boost_prio < 0) ? -+ p->boost_prio + MAX_PRIORITY_ADJ : MAX_PRIORITY_ADJ; -+ /* -+ * The child is not yet in the pid-hash so no cgroup attach races, -+ * and the cgroup is pinned to this child due to cgroup_fork() -+ * is ran before sched_fork(). -+ * -+ * Silence PROVE_RCU. -+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ /* -+ * Share the timeslice between parent and child, thus the -+ * total amount of pending timeslices in the system doesn't change, -+ * resulting in more scheduling fairness. 
-+ */ -+ rq = this_rq(); -+ raw_spin_lock(&rq->lock); -+ rq->curr->time_slice /= 2; -+ p->time_slice = rq->curr->time_slice; -+#ifdef CONFIG_SCHED_HRTICK -+ hrtick_start(rq, rq->curr->time_slice); -+#endif -+ -+ if (p->time_slice < RESCHED_NS) { -+ p->time_slice = sched_timeslice_ns; -+ resched_curr(rq); -+ } -+ raw_spin_unlock(&rq->lock); -+ -+ /* -+ * We're setting the CPU for the first time, we don't migrate, -+ * so use __set_task_cpu(). -+ */ -+ __set_task_cpu(p, cpu_of(rq)); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+#ifdef CONFIG_SCHED_INFO -+ if (unlikely(sched_info_on())) -+ memset(&p->sched_info, 0, sizeof(p->sched_info)); -+#endif -+ init_task_preempt_count(p); -+ -+ return 0; -+} -+ -+#ifdef CONFIG_SCHEDSTATS -+ -+DEFINE_STATIC_KEY_FALSE(sched_schedstats); -+static bool __initdata __sched_schedstats = false; -+ -+static void set_schedstats(bool enabled) -+{ -+ if (enabled) -+ static_branch_enable(&sched_schedstats); -+ else -+ static_branch_disable(&sched_schedstats); -+} -+ -+void force_schedstat_enabled(void) -+{ -+ if (!schedstat_enabled()) { -+ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); -+ static_branch_enable(&sched_schedstats); -+ } -+} -+ -+static int __init setup_schedstats(char *str) -+{ -+ int ret = 0; -+ if (!str) -+ goto out; -+ -+ /* -+ * This code is called before jump labels have been set up, so we can't -+ * change the static branch directly just yet. Instead set a temporary -+ * variable so init_schedstats() can do it later. 
-+ */ -+ if (!strcmp(str, "enable")) { -+ __sched_schedstats = true; -+ ret = 1; -+ } else if (!strcmp(str, "disable")) { -+ __sched_schedstats = false; -+ ret = 1; -+ } -+out: -+ if (!ret) -+ pr_warn("Unable to parse schedstats=\n"); -+ -+ return ret; -+} -+__setup("schedstats=", setup_schedstats); -+ -+static void __init init_schedstats(void) -+{ -+ set_schedstats(__sched_schedstats); -+} -+ -+#ifdef CONFIG_PROC_SYSCTL -+int sysctl_schedstats(struct ctl_table *table, int write, -+ void __user *buffer, size_t *lenp, loff_t *ppos) -+{ -+ struct ctl_table t; -+ int err; -+ int state = static_branch_likely(&sched_schedstats); -+ -+ if (write && !capable(CAP_SYS_ADMIN)) -+ return -EPERM; -+ -+ t = *table; -+ t.data = &state; -+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); -+ if (err < 0) -+ return err; -+ if (write) -+ set_schedstats(state); -+ return err; -+} -+#endif /* CONFIG_PROC_SYSCTL */ -+#else /* !CONFIG_SCHEDSTATS */ -+static inline void init_schedstats(void) {} -+#endif /* CONFIG_SCHEDSTATS */ -+ -+/* -+ * wake_up_new_task - wake up a newly created task for the first time. -+ * -+ * This function will do some initial scheduler statistics housekeeping -+ * that must be done for every newly created context, then puts the task -+ * on the runqueue and wakes it. -+ */ -+void wake_up_new_task(struct task_struct *p) -+{ -+ unsigned long flags; -+ struct rq *rq; -+ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ -+ p->state = TASK_RUNNING; -+ -+ rq = cpu_rq(select_task_rq(p)); -+#ifdef CONFIG_SMP -+ /* -+ * Fork balancing, do it here and not earlier because: -+ * - cpus_ptr can change in the fork path -+ * - any previously selected CPU might disappear through hotplug -+ * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, -+ * as we're not fully set-up yet. 
-+ */ -+ __set_task_cpu(p, cpu_of(rq)); -+#endif -+ -+ raw_spin_lock(&rq->lock); -+ -+ update_rq_clock(rq); -+ activate_task(p, rq); -+ trace_sched_wakeup_new(p); -+ check_preempt_curr(rq); -+ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+} -+ -+#ifdef CONFIG_PREEMPT_NOTIFIERS -+ -+static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); -+ -+void preempt_notifier_inc(void) -+{ -+ static_branch_inc(&preempt_notifier_key); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_inc); -+ -+void preempt_notifier_dec(void) -+{ -+ static_branch_dec(&preempt_notifier_key); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_dec); -+ -+/** -+ * preempt_notifier_register - tell me when current is being preempted & rescheduled -+ * @notifier: notifier struct to register -+ */ -+void preempt_notifier_register(struct preempt_notifier *notifier) -+{ -+ if (!static_branch_unlikely(&preempt_notifier_key)) -+ WARN(1, "registering preempt_notifier while notifiers disabled\n"); -+ -+ hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_register); -+ -+/** -+ * preempt_notifier_unregister - no longer interested in preemption notifications -+ * @notifier: notifier struct to unregister -+ * -+ * This is *not* safe to call from within a preemption notifier. 
-+ */ -+void preempt_notifier_unregister(struct preempt_notifier *notifier) -+{ -+ hlist_del(¬ifier->link); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_unregister); -+ -+static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+ struct preempt_notifier *notifier; -+ -+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) -+ notifier->ops->sched_in(notifier, raw_smp_processor_id()); -+} -+ -+static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+ if (static_branch_unlikely(&preempt_notifier_key)) -+ __fire_sched_in_preempt_notifiers(curr); -+} -+ -+static void -+__fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+ struct preempt_notifier *notifier; -+ -+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) -+ notifier->ops->sched_out(notifier, next); -+} -+ -+static __always_inline void -+fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+ if (static_branch_unlikely(&preempt_notifier_key)) -+ __fire_sched_out_preempt_notifiers(curr, next); -+} -+ -+#else /* !CONFIG_PREEMPT_NOTIFIERS */ -+ -+static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+} -+ -+static inline void -+fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+} -+ -+#endif /* CONFIG_PREEMPT_NOTIFIERS */ -+ -+static inline void prepare_task(struct task_struct *next) -+{ -+ /* -+ * Claim the task as running, we do this before switching to it -+ * such that any running task will have this set. -+ */ -+ next->on_cpu = 1; -+} -+ -+static inline void finish_task(struct task_struct *prev) -+{ -+#ifdef CONFIG_SMP -+ /* -+ * After ->on_cpu is cleared, the task can be moved to a different CPU. -+ * We must ensure this doesn't happen until the switch is completely -+ * finished. -+ * -+ * In particular, the load of prev->state in finish_task_switch() must -+ * happen before this. 
-+ * -+ * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). -+ */ -+ smp_store_release(&prev->on_cpu, 0); -+#else -+ prev->on_cpu = 0; -+#endif -+} -+ -+static inline void -+prepare_lock_switch(struct rq *rq, struct task_struct *next) -+{ -+ /* -+ * Since the runqueue lock will be released by the next -+ * task (which is an invalid locking op but in the case -+ * of the scheduler it's an obvious special-case), so we -+ * do an early lockdep release here: -+ */ -+ spin_release(&rq->lock.dep_map, _THIS_IP_); -+#ifdef CONFIG_DEBUG_SPINLOCK -+ /* this is a valid case when another task releases the spinlock */ -+ rq->lock.owner = next; -+#endif -+} -+ -+static inline void finish_lock_switch(struct rq *rq) -+{ -+ /* -+ * If we are tracking spinlock dependencies then we have to -+ * fix up the runqueue lock - which gets 'carried over' from -+ * prev into current: -+ */ -+ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); -+ raw_spin_unlock_irq(&rq->lock); -+} -+ -+/** -+ * prepare_task_switch - prepare to switch tasks -+ * @rq: the runqueue preparing to switch -+ * @next: the task we are going to switch to. -+ * -+ * This is called with the rq lock held and interrupts off. It must -+ * be paired with a subsequent finish_task_switch after the context -+ * switch. -+ * -+ * prepare_task_switch sets up locking and calls architecture specific -+ * hooks. -+ */ -+static inline void -+prepare_task_switch(struct rq *rq, struct task_struct *prev, -+ struct task_struct *next) -+{ -+ kcov_prepare_switch(prev); -+ sched_info_switch(rq, prev, next); -+ perf_event_task_sched_out(prev, next); -+ rseq_preempt(prev); -+ fire_sched_out_preempt_notifiers(prev, next); -+ prepare_task(next); -+ prepare_arch_switch(next); -+} -+ -+/** -+ * finish_task_switch - clean up after a task-switch -+ * @rq: runqueue associated with task-switch -+ * @prev: the thread we just switched away from. 
-+ * -+ * finish_task_switch must be called after the context switch, paired -+ * with a prepare_task_switch call before the context switch. -+ * finish_task_switch will reconcile locking set up by prepare_task_switch, -+ * and do any other architecture-specific cleanup actions. -+ * -+ * Note that we may have delayed dropping an mm in context_switch(). If -+ * so, we finish that here outside of the runqueue lock. (Doing it -+ * with the lock held can cause deadlocks; see schedule() for -+ * details.) -+ * -+ * The context switch have flipped the stack from under us and restored the -+ * local variables which were saved when this task called schedule() in the -+ * past. prev == current is still correct but we need to recalculate this_rq -+ * because prev may have moved to another CPU. -+ */ -+static struct rq *finish_task_switch(struct task_struct *prev) -+ __releases(rq->lock) -+{ -+ struct rq *rq = this_rq(); -+ struct mm_struct *mm = rq->prev_mm; -+ long prev_state; -+ -+ /* -+ * The previous task will have left us with a preempt_count of 2 -+ * because it left us after: -+ * -+ * schedule() -+ * preempt_disable(); // 1 -+ * __schedule() -+ * raw_spin_lock_irq(&rq->lock) // 2 -+ * -+ * Also, see FORK_PREEMPT_COUNT. -+ */ -+ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, -+ "corrupted preempt_count: %s/%d/0x%x\n", -+ current->comm, current->pid, preempt_count())) -+ preempt_count_set(FORK_PREEMPT_COUNT); -+ -+ rq->prev_mm = NULL; -+ -+ /* -+ * A task struct has one reference for the use as "current". -+ * If a task dies, then it sets TASK_DEAD in tsk->state and calls -+ * schedule one last time. The schedule call will never return, and -+ * the scheduled task must drop that reference. -+ * -+ * We must observe prev->state before clearing prev->on_cpu (in -+ * finish_task), otherwise a concurrent wakeup can get prev -+ * running on another CPU and we could rave with its RUNNING -> DEAD -+ * transition, resulting in a double drop. 
-+ */ -+ prev_state = prev->state; -+ vtime_task_switch(prev); -+ perf_event_task_sched_in(prev, current); -+ finish_task(prev); -+ finish_lock_switch(rq); -+ finish_arch_post_lock_switch(); -+ kcov_finish_switch(current); -+ -+ fire_sched_in_preempt_notifiers(current); -+ /* -+ * When switching through a kernel thread, the loop in -+ * membarrier_{private,global}_expedited() may have observed that -+ * kernel thread and not issued an IPI. It is therefore possible to -+ * schedule between user->kernel->user threads without passing though -+ * switch_mm(). Membarrier requires a barrier after storing to -+ * rq->curr, before returning to userspace, so provide them here: -+ * -+ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly -+ * provided by mmdrop(), -+ * - a sync_core for SYNC_CORE. -+ */ -+ if (mm) { -+ membarrier_mm_sync_core_before_usermode(mm); -+ mmdrop(mm); -+ } -+ if (unlikely(prev_state == TASK_DEAD)) { -+ /* -+ * Remove function-return probe instances associated with this -+ * task and put them back on the free list. -+ */ -+ kprobe_flush_task(prev); -+ -+ /* Task is done with its stack. */ -+ put_task_stack(prev); -+ -+ put_task_struct_rcu_user(prev); -+ } -+ -+ tick_nohz_task_switch(); -+ return rq; -+} -+ -+/** -+ * schedule_tail - first thing a freshly forked thread must call. -+ * @prev: the thread we just switched away from. -+ */ -+asmlinkage __visible void schedule_tail(struct task_struct *prev) -+ __releases(rq->lock) -+{ -+ struct rq *rq; -+ -+ /* -+ * New tasks start with FORK_PREEMPT_COUNT, see there and -+ * finish_task_switch() for details. -+ * -+ * finish_task_switch() will drop rq->lock() and lower preempt_count -+ * and the preempt_enable() will end up enabling preemption (on -+ * PREEMPT_COUNT kernels). 
-+ */ -+ -+ rq = finish_task_switch(prev); -+ preempt_enable(); -+ -+ if (current->set_child_tid) -+ put_user(task_pid_vnr(current), current->set_child_tid); -+ -+ calculate_sigpending(); -+} -+ -+/* -+ * context_switch - switch to the new MM and the new thread's register state. -+ */ -+static __always_inline struct rq * -+context_switch(struct rq *rq, struct task_struct *prev, -+ struct task_struct *next) -+{ -+ prepare_task_switch(rq, prev, next); -+ -+ /* -+ * For paravirt, this is coupled with an exit in switch_to to -+ * combine the page table reload and the switch backend into -+ * one hypercall. -+ */ -+ arch_start_context_switch(prev); -+ -+ /* -+ * kernel -> kernel lazy + transfer active -+ * user -> kernel lazy + mmgrab() active -+ * -+ * kernel -> user switch + mmdrop() active -+ * user -> user switch -+ */ -+ if (!next->mm) { // to kernel -+ enter_lazy_tlb(prev->active_mm, next); -+ -+ next->active_mm = prev->active_mm; -+ if (prev->mm) // from user -+ mmgrab(prev->active_mm); -+ else -+ prev->active_mm = NULL; -+ } else { // to user -+ membarrier_switch_mm(rq, prev->active_mm, next->mm); -+ /* -+ * sys_membarrier() requires an smp_mb() between setting -+ * rq->curr / membarrier_switch_mm() and returning to userspace. -+ * -+ * The below provides this either through switch_mm(), or in -+ * case 'prev->active_mm == next->mm' through -+ * finish_task_switch()'s mmdrop(). -+ */ -+ switch_mm_irqs_off(prev->active_mm, next->mm, next); -+ -+ if (!prev->mm) { // from kernel -+ /* will mmdrop() in finish_task_switch(). */ -+ rq->prev_mm = prev->active_mm; -+ prev->active_mm = NULL; -+ } -+ } -+ -+ prepare_lock_switch(rq, next); -+ -+ /* Here we just switch the register state and the stack. 
*/ -+ switch_to(prev, next, prev); -+ barrier(); -+ -+ return finish_task_switch(prev); -+} -+ -+/* -+ * nr_running, nr_uninterruptible and nr_context_switches: -+ * -+ * externally visible scheduler statistics: current number of runnable -+ * threads, total number of context switches performed since bootup. -+ */ -+unsigned long nr_running(void) -+{ -+ unsigned long i, sum = 0; -+ -+ for_each_online_cpu(i) -+ sum += cpu_rq(i)->nr_running; -+ -+ return sum; -+} -+ -+/* -+ * Check if only the current task is running on the CPU. -+ * -+ * Caution: this function does not check that the caller has disabled -+ * preemption, thus the result might have a time-of-check-to-time-of-use -+ * race. The caller is responsible to use it correctly, for example: -+ * -+ * - from a non-preemptible section (of course) -+ * -+ * - from a thread that is bound to a single CPU -+ * -+ * - in a loop with very short iterations (e.g. a polling loop) -+ */ -+bool single_task_running(void) -+{ -+ return raw_rq()->nr_running == 1; -+} -+EXPORT_SYMBOL(single_task_running); -+ -+unsigned long long nr_context_switches(void) -+{ -+ int i; -+ unsigned long long sum = 0; -+ -+ for_each_possible_cpu(i) -+ sum += cpu_rq(i)->nr_switches; -+ -+ return sum; -+} -+ -+/* -+ * Consumers of these two interfaces, like for example the cpuidle menu -+ * governor, are using nonsensical data. Preferring shallow idle state selection -+ * for a CPU that has IO-wait which might not even end up running the task when -+ * it does become runnable. -+ */ -+ -+unsigned long nr_iowait_cpu(int cpu) -+{ -+ return atomic_read(&cpu_rq(cpu)->nr_iowait); -+} -+ -+/* -+ * IO-wait accounting, and how its mostly bollocks (on SMP). -+ * -+ * The idea behind IO-wait account is to account the idle time that we could -+ * have spend running if it were not for IO. That is, if we were to improve the -+ * storage performance, we'd have a proportional reduction in IO-wait time. 
-+ * -+ * This all works nicely on UP, where, when a task blocks on IO, we account -+ * idle time as IO-wait, because if the storage were faster, it could've been -+ * running and we'd not be idle. -+ * -+ * This has been extended to SMP, by doing the same for each CPU. This however -+ * is broken. -+ * -+ * Imagine for instance the case where two tasks block on one CPU, only the one -+ * CPU will have IO-wait accounted, while the other has regular idle. Even -+ * though, if the storage were faster, both could've ran at the same time, -+ * utilising both CPUs. -+ * -+ * This means, that when looking globally, the current IO-wait accounting on -+ * SMP is a lower bound, by reason of under accounting. -+ * -+ * Worse, since the numbers are provided per CPU, they are sometimes -+ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly -+ * associated with any one particular CPU, it can wake to another CPU than it -+ * blocked on. This means the per CPU IO-wait number is meaningless. -+ * -+ * Task CPU affinities can make all that even more 'interesting'. -+ */ -+ -+unsigned long nr_iowait(void) -+{ -+ unsigned long i, sum = 0; -+ -+ for_each_possible_cpu(i) -+ sum += nr_iowait_cpu(i); -+ -+ return sum; -+} -+ -+#ifdef CONFIG_SMP -+ -+/* -+ * sched_exec - execve() is a valuable balancing opportunity, because at -+ * this point the task has the smallest effective memory and cache -+ * footprint. 
-+ */ -+void sched_exec(void) -+{ -+ struct task_struct *p = current; -+ int dest_cpu; -+ -+ if (task_rq(p)->nr_running < 2) -+ return; -+ -+ dest_cpu = cpumask_any_and(p->cpus_ptr, &sched_rq_watermark[IDLE_WM]); -+ if ( dest_cpu < nr_cpu_ids) { -+#ifdef CONFIG_SCHED_SMT -+ int smt = cpumask_any_and(p->cpus_ptr, &sched_sg_idle_mask); -+ if (smt < nr_cpu_ids) -+ dest_cpu = smt; -+#endif -+ if (likely(cpu_active(dest_cpu))) { -+ struct migration_arg arg = { p, dest_cpu }; -+ -+ stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); -+ return; -+ } -+ } -+} -+ -+#endif -+ -+DEFINE_PER_CPU(struct kernel_stat, kstat); -+DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); -+ -+EXPORT_PER_CPU_SYMBOL(kstat); -+EXPORT_PER_CPU_SYMBOL(kernel_cpustat); -+ -+static inline void update_curr(struct rq *rq, struct task_struct *p) -+{ -+ s64 ns = rq->clock_task - p->last_ran; -+ -+ p->sched_time += ns; -+ account_group_exec_runtime(p, ns); -+ -+ p->time_slice -= ns; -+ p->last_ran = rq->clock_task; -+} -+ -+/* -+ * Return accounted runtime for the task. -+ * Return separately the current's pending runtime that have not been -+ * accounted yet. -+ */ -+unsigned long long task_sched_runtime(struct task_struct *p) -+{ -+ unsigned long flags; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ u64 ns; -+ -+#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) -+ /* -+ * 64-bit doesn't need locks to atomically read a 64-bit value. -+ * So we have a optimization chance when the task's delta_exec is 0. -+ * Reading ->on_cpu is racy, but this is ok. -+ * -+ * If we race with it leaving CPU, we'll take a lock. So we're correct. -+ * If we race with it entering CPU, unaccounted time is 0. This is -+ * indistinguishable from the read occurring a few cycles earlier. -+ * If we see ->on_cpu without ->on_rq, the task is leaving, and has -+ * been accounted, so we're correct here as well. 
-+ */ -+ if (!p->on_cpu || !task_on_rq_queued(p)) -+ return tsk_seruntime(p); -+#endif -+ -+ rq = task_access_lock_irqsave(p, &lock, &flags); -+ /* -+ * Must be ->curr _and_ ->on_rq. If dequeued, we would -+ * project cycles that may never be accounted to this -+ * thread, breaking clock_gettime(). -+ */ -+ if (p == rq->curr && task_on_rq_queued(p)) { -+ update_rq_clock(rq); -+ update_curr(rq, p); -+ } -+ ns = tsk_seruntime(p); -+ task_access_unlock_irqrestore(p, lock, &flags); -+ -+ return ns; -+} -+ -+DEFINE_PER_CPU(unsigned long, thermal_pressure); -+ -+void arch_set_thermal_pressure(struct cpumask *cpus, -+ unsigned long th_pressure) -+{ -+ int cpu; -+ -+ for_each_cpu(cpu, cpus) -+ WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure); -+} -+ -+/* This manages tasks that have run out of timeslice during a scheduler_tick */ -+static inline void scheduler_task_tick(struct rq *rq) -+{ -+ struct task_struct *p = rq->curr; -+ -+ if (is_idle_task(p)) -+ return; -+ -+ update_curr(rq, p); -+ cpufreq_update_util(rq, 0); -+ -+ /* -+ * Tasks have less than RESCHED_NS of time slice left they will be -+ * rescheduled. -+ */ -+ if (p->time_slice >= RESCHED_NS) -+ return; -+ set_tsk_need_resched(p); -+ set_preempt_need_resched(); -+} -+ -+/* -+ * This function gets called by the timer code, with HZ frequency. -+ * We call it with interrupts disabled. 
-+ */ -+void scheduler_tick(void) -+{ -+ int cpu __maybe_unused = smp_processor_id(); -+ struct rq *rq = cpu_rq(cpu); -+ -+ arch_scale_freq_tick(); -+ sched_clock_tick(); -+ -+ raw_spin_lock(&rq->lock); -+ update_rq_clock(rq); -+ -+ scheduler_task_tick(rq); -+ calc_global_load_tick(rq); -+ psi_task_tick(rq); -+ -+ rq->last_tick = rq->clock; -+ raw_spin_unlock(&rq->lock); -+ -+ perf_event_task_tick(); -+} -+ -+#ifdef CONFIG_SCHED_SMT -+static inline int active_load_balance_cpu_stop(void *data) -+{ -+ struct rq *rq = this_rq(); -+ struct task_struct *p = data; -+ cpumask_t tmp; -+ unsigned long flags; -+ -+ local_irq_save(flags); -+ -+ raw_spin_lock(&p->pi_lock); -+ raw_spin_lock(&rq->lock); -+ -+ rq->active_balance = 0; -+ /* _something_ may have changed the task, double check again */ -+ if (task_on_rq_queued(p) && task_rq(p) == rq && -+ cpumask_and(&tmp, p->cpus_ptr, &sched_sg_idle_mask)) { -+ int cpu = cpu_of(rq); -+ int dcpu = __best_mask_cpu(cpu, &tmp, -+ per_cpu(sched_cpu_llc_mask, cpu)); -+ rq = move_queued_task(rq, p, dcpu); -+ } -+ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock(&p->pi_lock); -+ -+ local_irq_restore(flags); -+ -+ return 0; -+} -+ -+/* sg_balance_trigger - trigger slibing group balance for @cpu */ -+static inline int sg_balance_trigger(const int cpu) -+{ -+ struct rq *rq= cpu_rq(cpu); -+ unsigned long flags; -+ struct task_struct *curr; -+ int res; -+ -+ if (!raw_spin_trylock_irqsave(&rq->lock, flags)) -+ return 0; -+ curr = rq->curr; -+ res = (!is_idle_task(curr)) && (1 == rq->nr_running) &&\ -+ cpumask_intersects(curr->cpus_ptr, &sched_sg_idle_mask) &&\ -+ (!rq->active_balance); -+ -+ if (res) -+ rq->active_balance = 1; -+ -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+ if (res) -+ stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, -+ curr, &rq->active_balance_work); -+ return res; -+} -+ -+/* -+ * sg_balance_check - slibing group balance check for run queue @rq -+ */ -+static inline void sg_balance_check(struct rq *rq) -+{ -+ 
cpumask_t chk; -+ int cpu; -+ -+ /* exit when no sg in idle */ -+ if (cpumask_empty(&sched_sg_idle_mask)) -+ return; -+ -+ cpu = cpu_of(rq); -+ /* -+ * Only cpu in slibing idle group will do the checking and then -+ * find potential cpus which can migrate the current running task -+ */ -+ if (cpumask_test_cpu(cpu, &sched_sg_idle_mask) && -+ cpumask_andnot(&chk, cpu_online_mask, &sched_rq_pending_mask) && -+ cpumask_andnot(&chk, &chk, &sched_rq_watermark[IDLE_WM])) { -+ int i, tried = 0; -+ -+ for_each_cpu_wrap(i, &chk, cpu) { -+ if (cpumask_subset(cpu_smt_mask(i), &chk)) { -+ if (sg_balance_trigger(i)) -+ return; -+ if (tried) -+ return; -+ tried++; -+ } -+ } -+ } -+} -+#endif /* CONFIG_SCHED_SMT */ -+ -+#ifdef CONFIG_NO_HZ_FULL -+ -+struct tick_work { -+ int cpu; -+ atomic_t state; -+ struct delayed_work work; -+}; -+/* Values for ->state, see diagram below. */ -+#define TICK_SCHED_REMOTE_OFFLINE 0 -+#define TICK_SCHED_REMOTE_OFFLINING 1 -+#define TICK_SCHED_REMOTE_RUNNING 2 -+ -+/* -+ * State diagram for ->state: -+ * -+ * -+ * TICK_SCHED_REMOTE_OFFLINE -+ * | ^ -+ * | | -+ * | | sched_tick_remote() -+ * | | -+ * | | -+ * +--TICK_SCHED_REMOTE_OFFLINING -+ * | ^ -+ * | | -+ * sched_tick_start() | | sched_tick_stop() -+ * | | -+ * V | -+ * TICK_SCHED_REMOTE_RUNNING -+ * -+ * -+ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() -+ * and sched_tick_start() are happy to leave the state in RUNNING. -+ */ -+ -+static struct tick_work __percpu *tick_work_cpu; -+ -+static void sched_tick_remote(struct work_struct *work) -+{ -+ struct delayed_work *dwork = to_delayed_work(work); -+ struct tick_work *twork = container_of(dwork, struct tick_work, work); -+ int cpu = twork->cpu; -+ struct rq *rq = cpu_rq(cpu); -+ struct task_struct *curr; -+ unsigned long flags; -+ u64 delta; -+ int os; -+ -+ /* -+ * Handle the tick only if it appears the remote CPU is running in full -+ * dynticks mode. 
The check is racy by nature, but missing a tick or -+ * having one too much is no big deal because the scheduler tick updates -+ * statistics and checks timeslices in a time-independent way, regardless -+ * of when exactly it is running. -+ */ -+ if (!tick_nohz_tick_stopped_cpu(cpu)) -+ goto out_requeue; -+ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ curr = rq->curr; -+ if (cpu_is_offline(cpu)) -+ goto out_unlock; -+ -+ update_rq_clock(rq); -+ if (!is_idle_task(curr)) { -+ /* -+ * Make sure the next tick runs within a reasonable -+ * amount of time. -+ */ -+ delta = rq_clock_task(rq) - curr->last_ran; -+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); -+ } -+ scheduler_task_tick(rq); -+ -+ calc_load_nohz_remote(rq); -+out_unlock: -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+out_requeue: -+ /* -+ * Run the remote tick once per second (1Hz). This arbitrary -+ * frequency is large enough to avoid overload but short enough -+ * to keep scheduler internal stats reasonably up to date. But -+ * first update state to reflect hotplug activity if required. 
-+ */ -+ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); -+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); -+ if (os == TICK_SCHED_REMOTE_RUNNING) -+ queue_delayed_work(system_unbound_wq, dwork, HZ); -+} -+ -+static void sched_tick_start(int cpu) -+{ -+ int os; -+ struct tick_work *twork; -+ -+ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) -+ return; -+ -+ WARN_ON_ONCE(!tick_work_cpu); -+ -+ twork = per_cpu_ptr(tick_work_cpu, cpu); -+ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); -+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); -+ if (os == TICK_SCHED_REMOTE_OFFLINE) { -+ twork->cpu = cpu; -+ INIT_DELAYED_WORK(&twork->work, sched_tick_remote); -+ queue_delayed_work(system_unbound_wq, &twork->work, HZ); -+ } -+} -+ -+#ifdef CONFIG_HOTPLUG_CPU -+static void sched_tick_stop(int cpu) -+{ -+ struct tick_work *twork; -+ -+ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) -+ return; -+ -+ WARN_ON_ONCE(!tick_work_cpu); -+ -+ twork = per_cpu_ptr(tick_work_cpu, cpu); -+ cancel_delayed_work_sync(&twork->work); -+} -+#endif /* CONFIG_HOTPLUG_CPU */ -+ -+int __init sched_tick_offload_init(void) -+{ -+ tick_work_cpu = alloc_percpu(struct tick_work); -+ BUG_ON(!tick_work_cpu); -+ return 0; -+} -+ -+#else /* !CONFIG_NO_HZ_FULL */ -+static inline void sched_tick_start(int cpu) { } -+static inline void sched_tick_stop(int cpu) { } -+#endif -+ -+#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ -+ defined(CONFIG_PREEMPT_TRACER)) -+/* -+ * If the value passed in is equal to the current preempt count -+ * then we just disabled preemption. Start timing the latency. -+ */ -+static inline void preempt_latency_start(int val) -+{ -+ if (preempt_count() == val) { -+ unsigned long ip = get_lock_parent_ip(); -+#ifdef CONFIG_DEBUG_PREEMPT -+ current->preempt_disable_ip = ip; -+#endif -+ trace_preempt_off(CALLER_ADDR0, ip); -+ } -+} -+ -+void preempt_count_add(int val) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Underflow? 
-+ */ -+ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) -+ return; -+#endif -+ __preempt_count_add(val); -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Spinlock count overflowing soon? -+ */ -+ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= -+ PREEMPT_MASK - 10); -+#endif -+ preempt_latency_start(val); -+} -+EXPORT_SYMBOL(preempt_count_add); -+NOKPROBE_SYMBOL(preempt_count_add); -+ -+/* -+ * If the value passed in equals to the current preempt count -+ * then we just enabled preemption. Stop timing the latency. -+ */ -+static inline void preempt_latency_stop(int val) -+{ -+ if (preempt_count() == val) -+ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); -+} -+ -+void preempt_count_sub(int val) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Underflow? -+ */ -+ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) -+ return; -+ /* -+ * Is the spinlock portion underflowing? -+ */ -+ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && -+ !(preempt_count() & PREEMPT_MASK))) -+ return; -+#endif -+ -+ preempt_latency_stop(val); -+ __preempt_count_sub(val); -+} -+EXPORT_SYMBOL(preempt_count_sub); -+NOKPROBE_SYMBOL(preempt_count_sub); -+ -+#else -+static inline void preempt_latency_start(int val) { } -+static inline void preempt_latency_stop(int val) { } -+#endif -+ -+static inline unsigned long get_preempt_disable_ip(struct task_struct *p) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ return p->preempt_disable_ip; -+#else -+ return 0; -+#endif -+} -+ -+/* -+ * Print scheduling while atomic bug: -+ */ -+static noinline void __schedule_bug(struct task_struct *prev) -+{ -+ /* Save this before calling printk(), since that will clobber it */ -+ unsigned long preempt_disable_ip = get_preempt_disable_ip(current); -+ -+ if (oops_in_progress) -+ return; -+ -+ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", -+ prev->comm, prev->pid, preempt_count()); -+ -+ debug_show_held_locks(prev); -+ print_modules(); -+ if (irqs_disabled()) -+ print_irqtrace_events(prev); -+ if 
(IS_ENABLED(CONFIG_DEBUG_PREEMPT) -+ && in_atomic_preempt_off()) { -+ pr_err("Preemption disabled at:"); -+ print_ip_sym(preempt_disable_ip); -+ pr_cont("\n"); -+ } -+ if (panic_on_warn) -+ panic("scheduling while atomic\n"); -+ -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+ -+/* -+ * Various schedule()-time debugging checks and statistics: -+ */ -+static inline void schedule_debug(struct task_struct *prev, bool preempt) -+{ -+#ifdef CONFIG_SCHED_STACK_END_CHECK -+ if (task_stack_end_corrupted(prev)) -+ panic("corrupted stack end detected inside scheduler\n"); -+#endif -+ -+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP -+ if (!preempt && prev->state && prev->non_block_count) { -+ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", -+ prev->comm, prev->pid, prev->non_block_count); -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+ } -+#endif -+ -+ if (unlikely(in_atomic_preempt_off())) { -+ __schedule_bug(prev); -+ preempt_count_set(PREEMPT_DISABLED); -+ } -+ rcu_sleep_check(); -+ -+ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); -+ -+ schedstat_inc(this_rq()->sched_count); -+} -+ -+#ifdef CONFIG_SMP -+ -+#define SCHED_RQ_NR_MIGRATION (32UL) -+/* -+ * Migrate pending tasks in @rq to @dest_cpu -+ * Will try to migrate mininal of half of @rq nr_running tasks and -+ * SCHED_RQ_NR_MIGRATION to @dest_cpu -+ */ -+static inline int -+migrate_pending_tasks(struct rq *rq, struct rq *dest_rq, const int dest_cpu) -+{ -+ struct task_struct *p, *skip = rq->curr; -+ int nr_migrated = 0; -+ int nr_tries = min(rq->nr_running / 2, SCHED_RQ_NR_MIGRATION); -+ -+ while (skip != rq->idle && nr_tries && -+ (p = sched_rq_next_task(skip, rq)) != rq->idle) { -+ skip = sched_rq_next_task(p, rq); -+ if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) { -+ __SCHED_DEQUEUE_TASK(p, rq, 0, ); -+ set_task_cpu(p, dest_cpu); -+ __SCHED_ENQUEUE_TASK(p, dest_rq, 0); -+ nr_migrated++; -+ } -+ nr_tries--; -+ } -+ -+ return nr_migrated; -+} -+ -+static 
inline int take_other_rq_tasks(struct rq *rq, int cpu) -+{ -+ struct cpumask *affinity_mask, *end_mask; -+ -+ if (unlikely(!rq->online)) -+ return 0; -+ -+ if (cpumask_empty(&sched_rq_pending_mask)) -+ return 0; -+ -+ affinity_mask = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); -+ end_mask = per_cpu(sched_cpu_affinity_end_mask, cpu); -+ do { -+ int i; -+ for_each_cpu_and(i, &sched_rq_pending_mask, affinity_mask) { -+ int nr_migrated; -+ struct rq *src_rq; -+ -+ src_rq = cpu_rq(i); -+ if (!do_raw_spin_trylock(&src_rq->lock)) -+ continue; -+ spin_acquire(&src_rq->lock.dep_map, -+ SINGLE_DEPTH_NESTING, 1, _RET_IP_); -+ -+ if ((nr_migrated = migrate_pending_tasks(src_rq, rq, cpu))) { -+ src_rq->nr_running -= nr_migrated; -+#ifdef CONFIG_SMP -+ if (src_rq->nr_running < 2) -+ cpumask_clear_cpu(i, &sched_rq_pending_mask); -+#endif -+ rq->nr_running += nr_migrated; -+#ifdef CONFIG_SMP -+ if (rq->nr_running > 1) -+ cpumask_set_cpu(cpu, &sched_rq_pending_mask); -+#endif -+ update_sched_rq_watermark(rq); -+ cpufreq_update_util(rq, 0); -+ -+ spin_release(&src_rq->lock.dep_map, _RET_IP_); -+ do_raw_spin_unlock(&src_rq->lock); -+ -+ return 1; -+ } -+ -+ spin_release(&src_rq->lock.dep_map, _RET_IP_); -+ do_raw_spin_unlock(&src_rq->lock); -+ } -+ } while (++affinity_mask < end_mask); -+ -+ return 0; -+} -+#endif -+ -+/* -+ * Timeslices below RESCHED_NS are considered as good as expired as there's no -+ * point rescheduling when there's so little time left. 
-+ */ -+static inline void check_curr(struct task_struct *p, struct rq *rq) -+{ -+ if (unlikely(rq->idle == p)) -+ return; -+ -+ update_curr(rq, p); -+ -+ if (p->time_slice < RESCHED_NS) { -+ p->time_slice = sched_timeslice_ns; -+ if (SCHED_FIFO != p->policy && task_on_rq_queued(p)) { -+ if (SCHED_RR != p->policy) -+ deboost_task(p); -+ requeue_task(p, rq); -+ } -+ } -+} -+ -+static inline struct task_struct * -+choose_next_task(struct rq *rq, int cpu, struct task_struct *prev) -+{ -+ struct task_struct *next; -+ -+ if (unlikely(rq->skip)) { -+ next = rq_runnable_task(rq); -+ if (next == rq->idle) { -+#ifdef CONFIG_SMP -+ if (!take_other_rq_tasks(rq, cpu)) { -+#endif -+ rq->skip = NULL; -+ schedstat_inc(rq->sched_goidle); -+ return next; -+#ifdef CONFIG_SMP -+ } -+ next = rq_runnable_task(rq); -+#endif -+ } -+ rq->skip = NULL; -+#ifdef CONFIG_HIGH_RES_TIMERS -+ hrtick_start(rq, next->time_slice); -+#endif -+ return next; -+ } -+ -+ next = sched_rq_first_task(rq); -+ if (next == rq->idle) { -+#ifdef CONFIG_SMP -+ if (!take_other_rq_tasks(rq, cpu)) { -+#endif -+ schedstat_inc(rq->sched_goidle); -+ return next; -+#ifdef CONFIG_SMP -+ } -+ next = sched_rq_first_task(rq); -+#endif -+ } -+#ifdef CONFIG_HIGH_RES_TIMERS -+ hrtick_start(rq, next->time_slice); -+#endif -+ return next; -+} -+ -+/* -+ * schedule() is the main scheduler function. -+ * -+ * The main means of driving the scheduler and thus entering this function are: -+ * -+ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. -+ * -+ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return -+ * paths. For example, see arch/x86/entry_64.S. -+ * -+ * To drive preemption between tasks, the scheduler sets the flag in timer -+ * interrupt handler scheduler_tick(). -+ * -+ * 3. Wakeups don't really cause entry into schedule(). They add a -+ * task to the run-queue and that's it. 
-+ * -+ * Now, if the new task added to the run-queue preempts the current -+ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets -+ * called on the nearest possible occasion: -+ * -+ * - If the kernel is preemptible (CONFIG_PREEMPTION=y): -+ * -+ * - in syscall or exception context, at the next outmost -+ * preempt_enable(). (this might be as soon as the wake_up()'s -+ * spin_unlock()!) -+ * -+ * - in IRQ context, return from interrupt-handler to -+ * preemptible context -+ * -+ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set) -+ * then at the next: -+ * -+ * - cond_resched() call -+ * - explicit schedule() call -+ * - return from syscall or exception to user-space -+ * - return from interrupt-handler to user-space -+ * -+ * WARNING: must be called with preemption disabled! -+ */ -+static void __sched notrace __schedule(bool preempt) -+{ -+ struct task_struct *prev, *next; -+ unsigned long *switch_count; -+ struct rq *rq; -+ int cpu; -+ -+ cpu = smp_processor_id(); -+ rq = cpu_rq(cpu); -+ prev = rq->curr; -+ -+ schedule_debug(prev, preempt); -+ -+ /* by passing sched_feat(HRTICK) checking which Alt schedule FW doesn't support */ -+ hrtick_clear(rq); -+ -+ local_irq_disable(); -+ rcu_note_context_switch(preempt); -+ -+ /* -+ * Make sure that signal_pending_state()->signal_pending() below -+ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) -+ * done by the caller to avoid the race with signal_wake_up(). -+ * -+ * The membarrier system call requires a full memory barrier -+ * after coming from user-space, before storing to rq->curr. 
-+ */ -+ raw_spin_lock(&rq->lock); -+ smp_mb__after_spinlock(); -+ -+ update_rq_clock(rq); -+ -+ switch_count = &prev->nivcsw; -+ if (!preempt && prev->state) { -+ if (signal_pending_state(prev->state, prev)) { -+ prev->state = TASK_RUNNING; -+ } else { -+ if (rq_switch_time(rq) < boost_threshold(prev)) -+ boost_task(prev); -+ deactivate_task(prev, rq); -+ -+ if (prev->in_iowait) { -+ atomic_inc(&rq->nr_iowait); -+ delayacct_blkio_start(); -+ } -+ } -+ switch_count = &prev->nvcsw; -+ } -+ -+ clear_tsk_need_resched(prev); -+ clear_preempt_need_resched(); -+ -+ check_curr(prev, rq); -+ -+ next = choose_next_task(rq, cpu, prev); -+ -+ if (likely(prev != next)) { -+ next->last_ran = rq->clock_task; -+ rq->last_ts_switch = rq->clock; -+ -+ rq->nr_switches++; -+ /* -+ * RCU users of rcu_dereference(rq->curr) may not see -+ * changes to task_struct made by pick_next_task(). -+ */ -+ RCU_INIT_POINTER(rq->curr, next); -+ /* -+ * The membarrier system call requires each architecture -+ * to have a full memory barrier after updating -+ * rq->curr, before returning to user-space. -+ * -+ * Here are the schemes providing that barrier on the -+ * various architectures: -+ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. -+ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. 
-+ * - finish_lock_switch() for weakly-ordered -+ * architectures where spin_unlock is a full barrier, -+ * - switch_to() for arm64 (weakly-ordered, spin_unlock -+ * is a RELEASE barrier), -+ */ -+ ++*switch_count; -+ -+ psi_sched_switch(prev, next, !task_on_rq_queued(prev)); -+ -+ trace_sched_switch(preempt, prev, next); -+ -+ /* Also unlocks the rq: */ -+ rq = context_switch(rq, prev, next); -+ } else -+ raw_spin_unlock_irq(&rq->lock); -+ -+#ifdef CONFIG_SCHED_SMT -+ sg_balance_check(rq); -+#endif -+} -+ -+void __noreturn do_task_dead(void) -+{ -+ /* Causes final put_task_struct in finish_task_switch(): */ -+ set_special_state(TASK_DEAD); -+ -+ /* Tell freezer to ignore us: */ -+ current->flags |= PF_NOFREEZE; -+ -+ __schedule(false); -+ BUG(); -+ -+ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ -+ for (;;) -+ cpu_relax(); -+} -+ -+static inline void sched_submit_work(struct task_struct *tsk) -+{ -+ if (!tsk->state) -+ return; -+ -+ /* -+ * If a worker went to sleep, notify and ask workqueue whether -+ * it wants to wake up a task to maintain concurrency. -+ * As this function is called inside the schedule() context, -+ * we disable preemption to avoid it calling schedule() again -+ * in the possible wakeup of a kworker and because wq_worker_sleeping() -+ * requires it. -+ */ -+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { -+ preempt_disable(); -+ if (tsk->flags & PF_WQ_WORKER) -+ wq_worker_sleeping(tsk); -+ else -+ io_wq_worker_sleeping(tsk); -+ preempt_enable_no_resched(); -+ } -+ -+ if (tsk_is_pi_blocked(tsk)) -+ return; -+ -+ /* -+ * If we are going to sleep and we have plugged IO queued, -+ * make sure to submit it to avoid deadlocks. 
-+ */ -+ if (blk_needs_flush_plug(tsk)) -+ blk_schedule_flush_plug(tsk); -+} -+ -+static void sched_update_worker(struct task_struct *tsk) -+{ -+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { -+ if (tsk->flags & PF_WQ_WORKER) -+ wq_worker_running(tsk); -+ else -+ io_wq_worker_running(tsk); -+ } -+} -+ -+asmlinkage __visible void __sched schedule(void) -+{ -+ struct task_struct *tsk = current; -+ -+ sched_submit_work(tsk); -+ do { -+ preempt_disable(); -+ __schedule(false); -+ sched_preempt_enable_no_resched(); -+ } while (need_resched()); -+ sched_update_worker(tsk); -+} -+EXPORT_SYMBOL(schedule); -+ -+/* -+ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted -+ * state (have scheduled out non-voluntarily) by making sure that all -+ * tasks have either left the run queue or have gone into user space. -+ * As idle tasks do not do either, they must not ever be preempted -+ * (schedule out non-voluntarily). -+ * -+ * schedule_idle() is similar to schedule_preempt_disable() except that it -+ * never enables preemption because it does not call sched_submit_work(). -+ */ -+void __sched schedule_idle(void) -+{ -+ /* -+ * As this skips calling sched_submit_work(), which the idle task does -+ * regardless because that function is a nop when the task is in a -+ * TASK_RUNNING state, make sure this isn't used someplace that the -+ * current task can be in any other state. Note, idle is always in the -+ * TASK_RUNNING state. -+ */ -+ WARN_ON_ONCE(current->state); -+ do { -+ __schedule(false); -+ } while (need_resched()); -+} -+ -+#ifdef CONFIG_CONTEXT_TRACKING -+asmlinkage __visible void __sched schedule_user(void) -+{ -+ /* -+ * If we come here after a random call to set_need_resched(), -+ * or we have been woken up remotely but the IPI has not yet arrived, -+ * we haven't yet exited the RCU idle mode. Do it here manually until -+ * we find a better solution. -+ * -+ * NB: There are buggy callers of this function. 
Ideally we -+ * should warn if prev_state != CONTEXT_USER, but that will trigger -+ * too frequently to make sense yet. -+ */ -+ enum ctx_state prev_state = exception_enter(); -+ schedule(); -+ exception_exit(prev_state); -+} -+#endif -+ -+/** -+ * schedule_preempt_disabled - called with preemption disabled -+ * -+ * Returns with preemption disabled. Note: preempt_count must be 1 -+ */ -+void __sched schedule_preempt_disabled(void) -+{ -+ sched_preempt_enable_no_resched(); -+ schedule(); -+ preempt_disable(); -+} -+ -+static void __sched notrace preempt_schedule_common(void) -+{ -+ do { -+ /* -+ * Because the function tracer can trace preempt_count_sub() -+ * and it also uses preempt_enable/disable_notrace(), if -+ * NEED_RESCHED is set, the preempt_enable_notrace() called -+ * by the function tracer will call this function again and -+ * cause infinite recursion. -+ * -+ * Preemption must be disabled here before the function -+ * tracer can trace. Break up preempt_disable() into two -+ * calls. One to disable preemption without fear of being -+ * traced. The other to still record the preemption latency, -+ * which can also be traced by the function tracer. -+ */ -+ preempt_disable_notrace(); -+ preempt_latency_start(1); -+ __schedule(true); -+ preempt_latency_stop(1); -+ preempt_enable_no_resched_notrace(); -+ -+ /* -+ * Check again in case we missed a preemption opportunity -+ * between schedule and now. -+ */ -+ } while (need_resched()); -+} -+ -+#ifdef CONFIG_PREEMPTION -+/* -+ * This is the entry point to schedule() from in-kernel preemption -+ * off of preempt_enable. -+ */ -+asmlinkage __visible void __sched notrace preempt_schedule(void) -+{ -+ /* -+ * If there is a non-zero preempt_count or interrupts are disabled, -+ * we do not want to preempt the current task. Just return.. 
-+ */ -+ if (likely(!preemptible())) -+ return; -+ -+ preempt_schedule_common(); -+} -+NOKPROBE_SYMBOL(preempt_schedule); -+EXPORT_SYMBOL(preempt_schedule); -+ -+/** -+ * preempt_schedule_notrace - preempt_schedule called by tracing -+ * -+ * The tracing infrastructure uses preempt_enable_notrace to prevent -+ * recursion and tracing preempt enabling caused by the tracing -+ * infrastructure itself. But as tracing can happen in areas coming -+ * from userspace or just about to enter userspace, a preempt enable -+ * can occur before user_exit() is called. This will cause the scheduler -+ * to be called when the system is still in usermode. -+ * -+ * To prevent this, the preempt_enable_notrace will use this function -+ * instead of preempt_schedule() to exit user context if needed before -+ * calling the scheduler. -+ */ -+asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) -+{ -+ enum ctx_state prev_ctx; -+ -+ if (likely(!preemptible())) -+ return; -+ -+ do { -+ /* -+ * Because the function tracer can trace preempt_count_sub() -+ * and it also uses preempt_enable/disable_notrace(), if -+ * NEED_RESCHED is set, the preempt_enable_notrace() called -+ * by the function tracer will call this function again and -+ * cause infinite recursion. -+ * -+ * Preemption must be disabled here before the function -+ * tracer can trace. Break up preempt_disable() into two -+ * calls. One to disable preemption without fear of being -+ * traced. The other to still record the preemption latency, -+ * which can also be traced by the function tracer. -+ */ -+ preempt_disable_notrace(); -+ preempt_latency_start(1); -+ /* -+ * Needs preempt disabled in case user_exit() is traced -+ * and the tracer calls preempt_enable_notrace() causing -+ * an infinite recursion. 
-+ */ -+ prev_ctx = exception_enter(); -+ __schedule(true); -+ exception_exit(prev_ctx); -+ -+ preempt_latency_stop(1); -+ preempt_enable_no_resched_notrace(); -+ } while (need_resched()); -+} -+EXPORT_SYMBOL_GPL(preempt_schedule_notrace); -+ -+#endif /* CONFIG_PREEMPTION */ -+ -+/* -+ * This is the entry point to schedule() from kernel preemption -+ * off of irq context. -+ * Note, that this is called and return with irqs disabled. This will -+ * protect us against recursive calling from irq. -+ */ -+asmlinkage __visible void __sched preempt_schedule_irq(void) -+{ -+ enum ctx_state prev_state; -+ -+ /* Catch callers which need to be fixed */ -+ BUG_ON(preempt_count() || !irqs_disabled()); -+ -+ prev_state = exception_enter(); -+ -+ do { -+ preempt_disable(); -+ local_irq_enable(); -+ __schedule(true); -+ local_irq_disable(); -+ sched_preempt_enable_no_resched(); -+ } while (need_resched()); -+ -+ exception_exit(prev_state); -+} -+ -+int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, -+ void *key) -+{ -+ return try_to_wake_up(curr->private, mode, wake_flags); -+} -+EXPORT_SYMBOL(default_wake_function); -+ -+static inline void check_task_changed(struct rq *rq, struct task_struct *p) -+{ -+ /* Trigger resched if task sched_prio has been modified. */ -+ if (task_on_rq_queued(p) && sched_task_need_requeue(p)) { -+ requeue_task(p, rq); -+ check_preempt_curr(rq); -+ } -+} -+ -+#ifdef CONFIG_RT_MUTEXES -+ -+static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) -+{ -+ if (pi_task) -+ prio = min(prio, pi_task->prio); -+ -+ return prio; -+} -+ -+static inline int rt_effective_prio(struct task_struct *p, int prio) -+{ -+ struct task_struct *pi_task = rt_mutex_get_top_task(p); -+ -+ return __rt_effective_prio(pi_task, prio); -+} -+ -+/* -+ * rt_mutex_setprio - set the current priority of a task -+ * @p: task to boost -+ * @pi_task: donor task -+ * -+ * This function changes the 'effective' priority of a task. 
It does -+ * not touch ->normal_prio like __setscheduler(). -+ * -+ * Used by the rt_mutex code to implement priority inheritance -+ * logic. Call site only calls if the priority of the task changed. -+ */ -+void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) -+{ -+ int prio; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ -+ /* XXX used to be waiter->prio, not waiter->task->prio */ -+ prio = __rt_effective_prio(pi_task, p->normal_prio); -+ -+ /* -+ * If nothing changed; bail early. -+ */ -+ if (p->pi_top_task == pi_task && prio == p->prio) -+ return; -+ -+ rq = __task_access_lock(p, &lock); -+ /* -+ * Set under pi_lock && rq->lock, such that the value can be used under -+ * either lock. -+ * -+ * Note that there is loads of tricky to make this pointer cache work -+ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to -+ * ensure a task is de-boosted (pi_task is set to NULL) before the -+ * task is allowed to run again (and can exit). This ensures the pointer -+ * points to a blocked task -- which guaratees the task is present. -+ */ -+ p->pi_top_task = pi_task; -+ -+ /* -+ * For FIFO/RR we only need to set prio, if that matches we're done. -+ */ -+ if (prio == p->prio) -+ goto out_unlock; -+ -+ /* -+ * Idle task boosting is a nono in general. There is one -+ * exception, when PREEMPT_RT and NOHZ is active: -+ * -+ * The idle task calls get_next_timer_interrupt() and holds -+ * the timer wheel base->lock on the CPU and another CPU wants -+ * to access the timer (probably to cancel it). We can safely -+ * ignore the boosting request, as the idle CPU runs this code -+ * with interrupts disabled and will complete the lock -+ * protected section without being interrupted. So there is no -+ * real need to boost. 
-+ */ -+ if (unlikely(p == rq->idle)) { -+ WARN_ON(p != rq->curr); -+ WARN_ON(p->pi_blocked_on); -+ goto out_unlock; -+ } -+ -+ trace_sched_pi_setprio(p, pi_task); -+ p->prio = prio; -+ -+ check_task_changed(rq, p); -+out_unlock: -+ __task_access_unlock(p, lock); -+} -+#else -+static inline int rt_effective_prio(struct task_struct *p, int prio) -+{ -+ return prio; -+} -+#endif -+ -+void set_user_nice(struct task_struct *p, long nice) -+{ -+ unsigned long flags; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ -+ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) -+ return; -+ /* -+ * We have to be careful, if called from sys_setpriority(), -+ * the task might be in the middle of scheduling on another CPU. -+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ rq = __task_access_lock(p, &lock); -+ -+ p->static_prio = NICE_TO_PRIO(nice); -+ /* -+ * The RT priorities are set via sched_setscheduler(), but we still -+ * allow the 'normal' nice value to be set - but as expected -+ * it wont have any effect on scheduling until the task is -+ * not SCHED_NORMAL/SCHED_BATCH: -+ */ -+ if (task_has_rt_policy(p)) -+ goto out_unlock; -+ -+ p->prio = effective_prio(p); -+ check_task_changed(rq, p); -+out_unlock: -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+} -+EXPORT_SYMBOL(set_user_nice); -+ -+/* -+ * can_nice - check if a task can reduce its nice value -+ * @p: task -+ * @nice: nice value -+ */ -+int can_nice(const struct task_struct *p, const int nice) -+{ -+ /* Convert nice value [19,-20] to rlimit style value [1,40] */ -+ int nice_rlim = nice_to_rlimit(nice); -+ -+ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || -+ capable(CAP_SYS_NICE)); -+} -+ -+#ifdef __ARCH_WANT_SYS_NICE -+ -+/* -+ * sys_nice - change the priority of the current process. -+ * @increment: priority increment -+ * -+ * sys_setpriority is a more generic, but much slower function that -+ * does similar things. 
-+ */ -+SYSCALL_DEFINE1(nice, int, increment) -+{ -+ long nice, retval; -+ -+ /* -+ * Setpriority might change our priority at the same moment. -+ * We don't have to worry. Conceptually one call occurs first -+ * and we have a single winner. -+ */ -+ -+ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); -+ nice = task_nice(current) + increment; -+ -+ nice = clamp_val(nice, MIN_NICE, MAX_NICE); -+ if (increment < 0 && !can_nice(current, nice)) -+ return -EPERM; -+ -+ retval = security_task_setnice(current, nice); -+ if (retval) -+ return retval; -+ -+ set_user_nice(current, nice); -+ return 0; -+} -+ -+#endif -+ -+/** -+ * task_prio - return the priority value of a given task. -+ * @p: the task in question. -+ * -+ * Return: The priority value as seen by users in /proc. -+ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes -+ * from 0(SCHED_ISO) up to 82 (nice +19 SCHED_IDLE). -+ */ -+int task_prio(const struct task_struct *p) -+{ -+ if (p->prio < MAX_RT_PRIO) -+ return (p->prio - MAX_RT_PRIO); -+ return (p->prio - MAX_RT_PRIO + p->boost_prio); -+} -+ -+/** -+ * idle_cpu - is a given CPU idle currently? -+ * @cpu: the processor in question. -+ * -+ * Return: 1 if the CPU is currently idle. 0 otherwise. -+ */ -+int idle_cpu(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ if (rq->curr != rq->idle) -+ return 0; -+ -+ if (rq->nr_running) -+ return 0; -+ -+#ifdef CONFIG_SMP -+ if (!llist_empty(&rq->wake_list)) -+ return 0; -+#endif -+ -+ return 1; -+} -+ -+/** -+ * idle_task - return the idle task for a given CPU. -+ * @cpu: the processor in question. -+ * -+ * Return: The idle task for the cpu @cpu. -+ */ -+struct task_struct *idle_task(int cpu) -+{ -+ return cpu_rq(cpu)->idle; -+} -+ -+/** -+ * find_process_by_pid - find a process with a matching PID value. -+ * @pid: the pid in question. -+ * -+ * The task of @pid, if found. %NULL otherwise. -+ */ -+static inline struct task_struct *find_process_by_pid(pid_t pid) -+{ -+ return pid ? 
find_task_by_vpid(pid) : current; -+} -+ -+/* -+ * sched_setparam() passes in -1 for its policy, to let the functions -+ * it calls know not to change it. -+ */ -+#define SETPARAM_POLICY -1 -+ -+static void __setscheduler_params(struct task_struct *p, -+ const struct sched_attr *attr) -+{ -+ int policy = attr->sched_policy; -+ -+ if (policy == SETPARAM_POLICY) -+ policy = p->policy; -+ -+ p->policy = policy; -+ -+ /* -+ * allow normal nice value to be set, but will not have any -+ * effect on scheduling until the task not SCHED_NORMAL/ -+ * SCHED_BATCH -+ */ -+ p->static_prio = NICE_TO_PRIO(attr->sched_nice); -+ -+ /* -+ * __sched_setscheduler() ensures attr->sched_priority == 0 when -+ * !rt_policy. Always setting this ensures that things like -+ * getparam()/getattr() don't report silly values for !rt tasks. -+ */ -+ p->rt_priority = attr->sched_priority; -+ p->normal_prio = normal_prio(p); -+} -+ -+/* Actually do priority change: must hold rq lock. */ -+static void __setscheduler(struct rq *rq, struct task_struct *p, -+ const struct sched_attr *attr, bool keep_boost) -+{ -+ __setscheduler_params(p, attr); -+ -+ /* -+ * Keep a potential priority boosting if called from -+ * sched_setscheduler(). 
-+ */ -+ p->prio = normal_prio(p); -+ if (keep_boost) -+ p->prio = rt_effective_prio(p, p->prio); -+} -+ -+/* -+ * check the target process has a UID that matches the current process's -+ */ -+static bool check_same_owner(struct task_struct *p) -+{ -+ const struct cred *cred = current_cred(), *pcred; -+ bool match; -+ -+ rcu_read_lock(); -+ pcred = __task_cred(p); -+ match = (uid_eq(cred->euid, pcred->euid) || -+ uid_eq(cred->euid, pcred->uid)); -+ rcu_read_unlock(); -+ return match; -+} -+ -+static int __sched_setscheduler(struct task_struct *p, -+ const struct sched_attr *attr, -+ bool user, bool pi) -+{ -+ const struct sched_attr dl_squash_attr = { -+ .size = sizeof(struct sched_attr), -+ .sched_policy = SCHED_FIFO, -+ .sched_nice = 0, -+ .sched_priority = 99, -+ }; -+ int newprio = MAX_RT_PRIO - 1 - attr->sched_priority; -+ int retval, oldpolicy = -1; -+ int policy = attr->sched_policy; -+ unsigned long flags; -+ struct rq *rq; -+ int reset_on_fork; -+ raw_spinlock_t *lock; -+ -+ /* The pi code expects interrupts enabled */ -+ BUG_ON(pi && in_interrupt()); -+ -+ /* -+ * Alt schedule FW supports SCHED_DEADLINE by squash it as prio 0 SCHED_FIFO -+ */ -+ if (unlikely(SCHED_DEADLINE == policy)) { -+ attr = &dl_squash_attr; -+ policy = attr->sched_policy; -+ newprio = MAX_RT_PRIO - 1 - attr->sched_priority; -+ } -+recheck: -+ /* Double check policy once rq lock held */ -+ if (policy < 0) { -+ reset_on_fork = p->sched_reset_on_fork; -+ policy = oldpolicy = p->policy; -+ } else { -+ reset_on_fork = !!(attr->sched_flags & SCHED_RESET_ON_FORK); -+ -+ if (policy > SCHED_IDLE) -+ return -EINVAL; -+ } -+ -+ if (attr->sched_flags & ~(SCHED_FLAG_ALL)) -+ return -EINVAL; -+ -+ /* -+ * Valid priorities for SCHED_FIFO and SCHED_RR are -+ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and -+ * SCHED_BATCH and SCHED_IDLE is 0. 
-+ */ -+ if (attr->sched_priority < 0 || -+ (p->mm && attr->sched_priority > MAX_USER_RT_PRIO - 1) || -+ (!p->mm && attr->sched_priority > MAX_RT_PRIO - 1)) -+ return -EINVAL; -+ if ((SCHED_RR == policy || SCHED_FIFO == policy) != -+ (attr->sched_priority != 0)) -+ return -EINVAL; -+ -+ /* -+ * Allow unprivileged RT tasks to decrease priority: -+ */ -+ if (user && !capable(CAP_SYS_NICE)) { -+ if (SCHED_FIFO == policy || SCHED_RR == policy) { -+ unsigned long rlim_rtprio = -+ task_rlimit(p, RLIMIT_RTPRIO); -+ -+ /* Can't set/change the rt policy */ -+ if (policy != p->policy && !rlim_rtprio) -+ return -EPERM; -+ -+ /* Can't increase priority */ -+ if (attr->sched_priority > p->rt_priority && -+ attr->sched_priority > rlim_rtprio) -+ return -EPERM; -+ } -+ -+ /* Can't change other user's priorities */ -+ if (!check_same_owner(p)) -+ return -EPERM; -+ -+ /* Normal users shall not reset the sched_reset_on_fork flag */ -+ if (p->sched_reset_on_fork && !reset_on_fork) -+ return -EPERM; -+ } -+ -+ if (user) { -+ retval = security_task_setscheduler(p); -+ if (retval) -+ return retval; -+ } -+ -+ if (pi) -+ cpuset_read_lock(); -+ -+ /* -+ * Make sure no PI-waiters arrive (or leave) while we are -+ * changing the priority of the task: -+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ -+ /* -+ * To be able to change p->policy safely, task_access_lock() -+ * must be called. -+ * IF use task_access_lock() here: -+ * For the task p which is not running, reading rq->stop is -+ * racy but acceptable as ->stop doesn't change much. -+ * An enhancemnet can be made to read rq->stop saftly. 
-+ */ -+ rq = __task_access_lock(p, &lock); -+ -+ /* -+ * Changing the policy of the stop threads its a very bad idea -+ */ -+ if (p == rq->stop) { -+ retval = -EINVAL; -+ goto unlock; -+ } -+ -+ /* -+ * If not changing anything there's no need to proceed further: -+ */ -+ if (unlikely(policy == p->policy)) { -+ if (rt_policy(policy) && attr->sched_priority != p->rt_priority) -+ goto change; -+ if (!rt_policy(policy) && -+ NICE_TO_PRIO(attr->sched_nice) != p->static_prio) -+ goto change; -+ -+ p->sched_reset_on_fork = reset_on_fork; -+ retval = 0; -+ goto unlock; -+ } -+change: -+ -+ /* Re-check policy now with rq lock held */ -+ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { -+ policy = oldpolicy = -1; -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ if (pi) -+ cpuset_read_unlock(); -+ goto recheck; -+ } -+ -+ p->sched_reset_on_fork = reset_on_fork; -+ -+ if (pi) { -+ /* -+ * Take priority boosted tasks into account. If the new -+ * effective priority is unchanged, we just store the new -+ * normal parameters and do not touch the scheduler class and -+ * the runqueue. This will be done when the task deboost -+ * itself. 
-+ */ -+ if (rt_effective_prio(p, newprio) == p->prio) { -+ __setscheduler_params(p, attr); -+ retval = 0; -+ goto unlock; -+ } -+ } -+ -+ __setscheduler(rq, p, attr, pi); -+ -+ check_task_changed(rq, p); -+ -+ /* Avoid rq from going away on us: */ -+ preempt_disable(); -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+ if (pi) { -+ cpuset_read_unlock(); -+ rt_mutex_adjust_pi(p); -+ } -+ -+ preempt_enable(); -+ -+ return 0; -+ -+unlock: -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ if (pi) -+ cpuset_read_unlock(); -+ return retval; -+} -+ -+static int _sched_setscheduler(struct task_struct *p, int policy, -+ const struct sched_param *param, bool check) -+{ -+ struct sched_attr attr = { -+ .sched_policy = policy, -+ .sched_priority = param->sched_priority, -+ .sched_nice = PRIO_TO_NICE(p->static_prio), -+ }; -+ -+ /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ -+ if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { -+ attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; -+ policy &= ~SCHED_RESET_ON_FORK; -+ attr.sched_policy = policy; -+ } -+ -+ return __sched_setscheduler(p, &attr, check, true); -+} -+ -+/** -+ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. -+ * @p: the task in question. -+ * @policy: new policy. -+ * @param: structure containing the new RT priority. -+ * -+ * Return: 0 on success. An error code otherwise. -+ * -+ * NOTE that the task may be already dead. 
-+ */ -+int sched_setscheduler(struct task_struct *p, int policy, -+ const struct sched_param *param) -+{ -+ return _sched_setscheduler(p, policy, param, true); -+} -+ -+EXPORT_SYMBOL_GPL(sched_setscheduler); -+ -+int sched_setattr(struct task_struct *p, const struct sched_attr *attr) -+{ -+ return __sched_setscheduler(p, attr, true, true); -+} -+EXPORT_SYMBOL_GPL(sched_setattr); -+ -+int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) -+{ -+ return __sched_setscheduler(p, attr, false, true); -+} -+ -+/** -+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. -+ * @p: the task in question. -+ * @policy: new policy. -+ * @param: structure containing the new RT priority. -+ * -+ * Just like sched_setscheduler, only don't bother checking if the -+ * current context has permission. For example, this is needed in -+ * stop_machine(): we create temporary high priority worker threads, -+ * but our caller might not have that capability. -+ * -+ * Return: 0 on success. An error code otherwise. -+ */ -+int sched_setscheduler_nocheck(struct task_struct *p, int policy, -+ const struct sched_param *param) -+{ -+ return _sched_setscheduler(p, policy, param, false); -+} -+EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); -+ -+static int -+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) -+{ -+ struct sched_param lparam; -+ struct task_struct *p; -+ int retval; -+ -+ if (!param || pid < 0) -+ return -EINVAL; -+ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) -+ return -EFAULT; -+ -+ rcu_read_lock(); -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (likely(p)) -+ get_task_struct(p); -+ rcu_read_unlock(); -+ -+ if (likely(p)) { -+ retval = sched_setscheduler(p, policy, &lparam); -+ put_task_struct(p); -+ } -+ -+ return retval; -+} -+ -+/* -+ * Mimics kernel/events/core.c perf_copy_attr(). 
-+ */ -+static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr) -+{ -+ u32 size; -+ int ret; -+ -+ /* Zero the full structure, so that a short copy will be nice: */ -+ memset(attr, 0, sizeof(*attr)); -+ -+ ret = get_user(size, &uattr->size); -+ if (ret) -+ return ret; -+ -+ /* ABI compatibility quirk: */ -+ if (!size) -+ size = SCHED_ATTR_SIZE_VER0; -+ -+ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) -+ goto err_size; -+ -+ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); -+ if (ret) { -+ if (ret == -E2BIG) -+ goto err_size; -+ return ret; -+ } -+ -+ /* -+ * XXX: Do we want to be lenient like existing syscalls; or do we want -+ * to be strict and return an error on out-of-bounds values? -+ */ -+ attr->sched_nice = clamp(attr->sched_nice, -20, 19); -+ -+ /* sched/core.c uses zero here but we already know ret is zero */ -+ return 0; -+ -+err_size: -+ put_user(sizeof(*attr), &uattr->size); -+ return -E2BIG; -+} -+ -+/** -+ * sys_sched_setscheduler - set/change the scheduler policy and RT priority -+ * @pid: the pid in question. -+ * @policy: new policy. -+ * -+ * Return: 0 on success. An error code otherwise. -+ * @param: structure containing the new RT priority. -+ */ -+SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) -+{ -+ if (policy < 0) -+ return -EINVAL; -+ -+ return do_sched_setscheduler(pid, policy, param); -+} -+ -+/** -+ * sys_sched_setparam - set/change the RT priority of a thread -+ * @pid: the pid in question. -+ * @param: structure containing the new RT priority. -+ * -+ * Return: 0 on success. An error code otherwise. -+ */ -+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) -+{ -+ return do_sched_setscheduler(pid, SETPARAM_POLICY, param); -+} -+ -+/** -+ * sys_sched_setattr - same as above, but with extended sched_attr -+ * @pid: the pid in question. -+ * @uattr: structure containing the extended parameters. 
-+ */ -+SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, -+ unsigned int, flags) -+{ -+ struct sched_attr attr; -+ struct task_struct *p; -+ int retval; -+ -+ if (!uattr || pid < 0 || flags) -+ return -EINVAL; -+ -+ retval = sched_copy_attr(uattr, &attr); -+ if (retval) -+ return retval; -+ -+ if ((int)attr.sched_policy < 0) -+ return -EINVAL; -+ -+ rcu_read_lock(); -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (p != NULL) -+ retval = sched_setattr(p, &attr); -+ rcu_read_unlock(); -+ -+ return retval; -+} -+ -+/** -+ * sys_sched_getscheduler - get the policy (scheduling class) of a thread -+ * @pid: the pid in question. -+ * -+ * Return: On success, the policy of the thread. Otherwise, a negative error -+ * code. -+ */ -+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) -+{ -+ struct task_struct *p; -+ int retval = -EINVAL; -+ -+ if (pid < 0) -+ goto out_nounlock; -+ -+ retval = -ESRCH; -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ if (p) { -+ retval = security_task_getscheduler(p); -+ if (!retval) -+ retval = p->policy; -+ } -+ rcu_read_unlock(); -+ -+out_nounlock: -+ return retval; -+} -+ -+/** -+ * sys_sched_getscheduler - get the RT priority of a thread -+ * @pid: the pid in question. -+ * @param: structure containing the RT priority. -+ * -+ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error -+ * code. 
-+ */ -+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) -+{ -+ struct sched_param lp = { .sched_priority = 0 }; -+ struct task_struct *p; -+ int retval = -EINVAL; -+ -+ if (!param || pid < 0) -+ goto out_nounlock; -+ -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ retval = -ESRCH; -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ if (task_has_rt_policy(p)) -+ lp.sched_priority = p->rt_priority; -+ rcu_read_unlock(); -+ -+ /* -+ * This one might sleep, we cannot do it with a spinlock held ... -+ */ -+ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; -+ -+out_nounlock: -+ return retval; -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+/* -+ * Copy the kernel size attribute structure (which might be larger -+ * than what user-space knows about) to user-space. -+ * -+ * Note that all cases are valid: user-space buffer can be larger or -+ * smaller than the kernel-space buffer. The usual case is that both -+ * have the same size. -+ */ -+static int -+sched_attr_copy_to_user(struct sched_attr __user *uattr, -+ struct sched_attr *kattr, -+ unsigned int usize) -+{ -+ unsigned int ksize = sizeof(*kattr); -+ -+ if (!access_ok(uattr, usize)) -+ return -EFAULT; -+ -+ /* -+ * sched_getattr() ABI forwards and backwards compatibility: -+ * -+ * If usize == ksize then we just copy everything to user-space and all is good. -+ * -+ * If usize < ksize then we only copy as much as user-space has space for, -+ * this keeps ABI compatibility as well. We skip the rest. -+ * -+ * If usize > ksize then user-space is using a newer version of the ABI, -+ * which part the kernel doesn't know about. Just ignore it - tooling can -+ * detect the kernel's knowledge of attributes from the attr->size value -+ * which is set to ksize in this case. 
-+ */ -+ kattr->size = min(usize, ksize); -+ -+ if (copy_to_user(uattr, kattr, kattr->size)) -+ return -EFAULT; -+ -+ return 0; -+} -+ -+/** -+ * sys_sched_getattr - similar to sched_getparam, but with sched_attr -+ * @pid: the pid in question. -+ * @uattr: structure containing the extended parameters. -+ * @usize: sizeof(attr) for fwd/bwd comp. -+ * @flags: for future extension. -+ */ -+SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, -+ unsigned int, usize, unsigned int, flags) -+{ -+ struct sched_attr kattr = { }; -+ struct task_struct *p; -+ int retval; -+ -+ if (!uattr || pid < 0 || usize > PAGE_SIZE || -+ usize < SCHED_ATTR_SIZE_VER0 || flags) -+ return -EINVAL; -+ -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ retval = -ESRCH; -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ kattr.sched_policy = p->policy; -+ if (p->sched_reset_on_fork) -+ kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; -+ if (task_has_rt_policy(p)) -+ kattr.sched_priority = p->rt_priority; -+ else -+ kattr.sched_nice = task_nice(p); -+ -+#ifdef CONFIG_UCLAMP_TASK -+ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; -+ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; -+#endif -+ -+ rcu_read_unlock(); -+ -+ return sched_attr_copy_to_user(uattr, &kattr, usize); -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) -+{ -+ cpumask_var_t cpus_allowed, new_mask; -+ struct task_struct *p; -+ int retval; -+ -+ get_online_cpus(); -+ rcu_read_lock(); -+ -+ p = find_process_by_pid(pid); -+ if (!p) { -+ rcu_read_unlock(); -+ put_online_cpus(); -+ return -ESRCH; -+ } -+ -+ /* Prevent p going away */ -+ get_task_struct(p); -+ rcu_read_unlock(); -+ -+ if (p->flags & PF_NO_SETAFFINITY) { -+ retval = -EINVAL; -+ goto out_put_task; -+ } -+ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { -+ retval = -ENOMEM; -+ 
goto out_put_task; -+ } -+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { -+ retval = -ENOMEM; -+ goto out_free_cpus_allowed; -+ } -+ retval = -EPERM; -+ if (!check_same_owner(p)) { -+ rcu_read_lock(); -+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { -+ rcu_read_unlock(); -+ goto out_unlock; -+ } -+ rcu_read_unlock(); -+ } -+ -+ retval = security_task_setscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ cpuset_cpus_allowed(p, cpus_allowed); -+ cpumask_and(new_mask, in_mask, cpus_allowed); -+again: -+ retval = __set_cpus_allowed_ptr(p, new_mask, true); -+ -+ if (!retval) { -+ cpuset_cpus_allowed(p, cpus_allowed); -+ if (!cpumask_subset(new_mask, cpus_allowed)) { -+ /* -+ * We must have raced with a concurrent cpuset -+ * update. Just reset the cpus_allowed to the -+ * cpuset's cpus_allowed -+ */ -+ cpumask_copy(new_mask, cpus_allowed); -+ goto again; -+ } -+ } -+out_unlock: -+ free_cpumask_var(new_mask); -+out_free_cpus_allowed: -+ free_cpumask_var(cpus_allowed); -+out_put_task: -+ put_task_struct(p); -+ put_online_cpus(); -+ return retval; -+} -+ -+static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, -+ struct cpumask *new_mask) -+{ -+ if (len < cpumask_size()) -+ cpumask_clear(new_mask); -+ else if (len > cpumask_size()) -+ len = cpumask_size(); -+ -+ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; -+} -+ -+/** -+ * sys_sched_setaffinity - set the CPU affinity of a process -+ * @pid: pid of the process -+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr -+ * @user_mask_ptr: user-space pointer to the new CPU mask -+ * -+ * Return: 0 on success. An error code otherwise. 
-+ */ -+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, -+ unsigned long __user *, user_mask_ptr) -+{ -+ cpumask_var_t new_mask; -+ int retval; -+ -+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) -+ return -ENOMEM; -+ -+ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); -+ if (retval == 0) -+ retval = sched_setaffinity(pid, new_mask); -+ free_cpumask_var(new_mask); -+ return retval; -+} -+ -+long sched_getaffinity(pid_t pid, cpumask_t *mask) -+{ -+ struct task_struct *p; -+ raw_spinlock_t *lock; -+ unsigned long flags; -+ int retval; -+ -+ rcu_read_lock(); -+ -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ task_access_lock_irqsave(p, &lock, &flags); -+ cpumask_and(mask, &p->cpus_mask, cpu_active_mask); -+ task_access_unlock_irqrestore(p, lock, &flags); -+ -+out_unlock: -+ rcu_read_unlock(); -+ -+ return retval; -+} -+ -+/** -+ * sys_sched_getaffinity - get the CPU affinity of a process -+ * @pid: pid of the process -+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr -+ * @user_mask_ptr: user-space pointer to hold the current CPU mask -+ * -+ * Return: size of CPU mask copied to user_mask_ptr on success. An -+ * error code otherwise. 
-+ */ -+SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, -+ unsigned long __user *, user_mask_ptr) -+{ -+ int ret; -+ cpumask_var_t mask; -+ -+ if ((len * BITS_PER_BYTE) < nr_cpu_ids) -+ return -EINVAL; -+ if (len & (sizeof(unsigned long)-1)) -+ return -EINVAL; -+ -+ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) -+ return -ENOMEM; -+ -+ ret = sched_getaffinity(pid, mask); -+ if (ret == 0) { -+ unsigned int retlen = min_t(size_t, len, cpumask_size()); -+ -+ if (copy_to_user(user_mask_ptr, mask, retlen)) -+ ret = -EFAULT; -+ else -+ ret = retlen; -+ } -+ free_cpumask_var(mask); -+ -+ return ret; -+} -+ -+/** -+ * sys_sched_yield - yield the current processor to other threads. -+ * -+ * This function yields the current CPU to other tasks. It does this by -+ * scheduling away the current task. If it still has the earliest deadline -+ * it will be scheduled again as the next task. -+ * -+ * Return: 0. -+ */ -+static void do_sched_yield(void) -+{ -+ struct rq *rq; -+ struct rq_flags rf; -+ -+ if (!sched_yield_type) -+ return; -+ -+ rq = this_rq_lock_irq(&rf); -+ -+ schedstat_inc(rq->yld_count); -+ -+ if (1 == sched_yield_type) { -+ if (!rt_task(current)) { -+ current->boost_prio = MAX_PRIORITY_ADJ; -+ requeue_task(current, rq); -+ } -+ } else if (2 == sched_yield_type) { -+ if (rq->nr_running > 1) -+ rq->skip = current; -+ } -+ -+ /* -+ * Since we are going to call schedule() anyway, there's -+ * no need to preempt or enable interrupts: -+ */ -+ preempt_disable(); -+ raw_spin_unlock(&rq->lock); -+ sched_preempt_enable_no_resched(); -+ -+ schedule(); -+} -+ -+SYSCALL_DEFINE0(sched_yield) -+{ -+ do_sched_yield(); -+ return 0; -+} -+ -+#ifndef CONFIG_PREEMPTION -+int __sched _cond_resched(void) -+{ -+ if (should_resched(0)) { -+ preempt_schedule_common(); -+ return 1; -+ } -+ rcu_all_qs(); -+ return 0; -+} -+EXPORT_SYMBOL(_cond_resched); -+#endif -+ -+/* -+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, -+ * call schedule, and on 
return reacquire the lock. -+ * -+ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level -+ * operations here to prevent schedule() from being called twice (once via -+ * spin_unlock(), once by hand). -+ */ -+int __cond_resched_lock(spinlock_t *lock) -+{ -+ int resched = should_resched(PREEMPT_LOCK_OFFSET); -+ int ret = 0; -+ -+ lockdep_assert_held(lock); -+ -+ if (spin_needbreak(lock) || resched) { -+ spin_unlock(lock); -+ if (resched) -+ preempt_schedule_common(); -+ else -+ cpu_relax(); -+ ret = 1; -+ spin_lock(lock); -+ } -+ return ret; -+} -+EXPORT_SYMBOL(__cond_resched_lock); -+ -+/** -+ * yield - yield the current processor to other threads. -+ * -+ * Do not ever use this function, there's a 99% chance you're doing it wrong. -+ * -+ * The scheduler is at all times free to pick the calling task as the most -+ * eligible task to run, if removing the yield() call from your code breaks -+ * it, its already broken. -+ * -+ * Typical broken usage is: -+ * -+ * while (!event) -+ * yield(); -+ * -+ * where one assumes that yield() will let 'the other' process run that will -+ * make event true. If the current task is a SCHED_FIFO task that will never -+ * happen. Never use yield() as a progress guarantee!! -+ * -+ * If you want to use yield() to wait for something, use wait_event(). -+ * If you want to use yield() to be 'nice' for others, use cond_resched(). -+ * If you still want to use yield(), do not! -+ */ -+void __sched yield(void) -+{ -+ set_current_state(TASK_RUNNING); -+ do_sched_yield(); -+} -+EXPORT_SYMBOL(yield); -+ -+/** -+ * yield_to - yield the current processor to another thread in -+ * your thread group, or accelerate that thread toward the -+ * processor it's on. -+ * @p: target task -+ * @preempt: whether task preemption is allowed or not -+ * -+ * It's the caller's job to ensure that the target task struct -+ * can't go away on us before we can do any checks. -+ * -+ * In Alt schedule FW, yield_to is not supported. 
-+ * -+ * Return: -+ * true (>0) if we indeed boosted the target task. -+ * false (0) if we failed to boost the target. -+ * -ESRCH if there's no task to yield to. -+ */ -+int __sched yield_to(struct task_struct *p, bool preempt) -+{ -+ return 0; -+} -+EXPORT_SYMBOL_GPL(yield_to); -+ -+int io_schedule_prepare(void) -+{ -+ int old_iowait = current->in_iowait; -+ -+ current->in_iowait = 1; -+ blk_schedule_flush_plug(current); -+ -+ return old_iowait; -+} -+ -+void io_schedule_finish(int token) -+{ -+ current->in_iowait = token; -+} -+ -+/* -+ * This task is about to go to sleep on IO. Increment rq->nr_iowait so -+ * that process accounting knows that this is a task in IO wait state. -+ * -+ * But don't do that if it is a deliberate, throttling IO wait (this task -+ * has set its backing_dev_info: the queue against which it should throttle) -+ */ -+ -+long __sched io_schedule_timeout(long timeout) -+{ -+ int token; -+ long ret; -+ -+ token = io_schedule_prepare(); -+ ret = schedule_timeout(timeout); -+ io_schedule_finish(token); -+ -+ return ret; -+} -+EXPORT_SYMBOL(io_schedule_timeout); -+ -+void __sched io_schedule(void) -+{ -+ int token; -+ -+ token = io_schedule_prepare(); -+ schedule(); -+ io_schedule_finish(token); -+} -+EXPORT_SYMBOL(io_schedule); -+ -+/** -+ * sys_sched_get_priority_max - return maximum RT priority. -+ * @policy: scheduling class. -+ * -+ * Return: On success, this syscall returns the maximum -+ * rt_priority that can be used by a given scheduling class. -+ * On failure, a negative error code is returned. -+ */ -+SYSCALL_DEFINE1(sched_get_priority_max, int, policy) -+{ -+ int ret = -EINVAL; -+ -+ switch (policy) { -+ case SCHED_FIFO: -+ case SCHED_RR: -+ ret = MAX_USER_RT_PRIO-1; -+ break; -+ case SCHED_NORMAL: -+ case SCHED_BATCH: -+ case SCHED_IDLE: -+ ret = 0; -+ break; -+ } -+ return ret; -+} -+ -+/** -+ * sys_sched_get_priority_min - return minimum RT priority. -+ * @policy: scheduling class. 
-+ * -+ * Return: On success, this syscall returns the minimum -+ * rt_priority that can be used by a given scheduling class. -+ * On failure, a negative error code is returned. -+ */ -+SYSCALL_DEFINE1(sched_get_priority_min, int, policy) -+{ -+ int ret = -EINVAL; -+ -+ switch (policy) { -+ case SCHED_FIFO: -+ case SCHED_RR: -+ ret = 1; -+ break; -+ case SCHED_NORMAL: -+ case SCHED_BATCH: -+ case SCHED_IDLE: -+ ret = 0; -+ break; -+ } -+ return ret; -+} -+ -+static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) -+{ -+ struct task_struct *p; -+ int retval; -+ -+ if (pid < 0) -+ return -EINVAL; -+ -+ retval = -ESRCH; -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ rcu_read_unlock(); -+ -+ *t = ns_to_timespec64(sched_timeslice_ns); -+ return 0; -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+/** -+ * sys_sched_rr_get_interval - return the default timeslice of a process. -+ * @pid: pid of the process. -+ * @interval: userspace pointer to the timeslice value. -+ * -+ * -+ * Return: On success, 0 and the timeslice is in @interval. Otherwise, -+ * an error code. 
-+ */ -+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, -+ struct __kernel_timespec __user *, interval) -+{ -+ struct timespec64 t; -+ int retval = sched_rr_get_interval(pid, &t); -+ -+ if (retval == 0) -+ retval = put_timespec64(&t, interval); -+ -+ return retval; -+} -+ -+#ifdef CONFIG_COMPAT_32BIT_TIME -+SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, -+ struct old_timespec32 __user *, interval) -+{ -+ struct timespec64 t; -+ int retval = sched_rr_get_interval(pid, &t); -+ -+ if (retval == 0) -+ retval = put_old_timespec32(&t, interval); -+ return retval; -+} -+#endif -+ -+void sched_show_task(struct task_struct *p) -+{ -+ unsigned long free = 0; -+ int ppid; -+ -+ if (!try_get_task_stack(p)) -+ return; -+ -+ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); -+ -+ if (p->state == TASK_RUNNING) -+ printk(KERN_CONT " running task "); -+#ifdef CONFIG_DEBUG_STACK_USAGE -+ free = stack_not_used(p); -+#endif -+ ppid = 0; -+ rcu_read_lock(); -+ if (pid_alive(p)) -+ ppid = task_pid_nr(rcu_dereference(p->real_parent)); -+ rcu_read_unlock(); -+ printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, -+ task_pid_nr(p), ppid, -+ (unsigned long)task_thread_info(p)->flags); -+ -+ print_worker_info(KERN_INFO, p); -+ show_stack(p, NULL); -+ put_task_stack(p); -+} -+EXPORT_SYMBOL_GPL(sched_show_task); -+ -+static inline bool -+state_filter_match(unsigned long state_filter, struct task_struct *p) -+{ -+ /* no filter, everything matches */ -+ if (!state_filter) -+ return true; -+ -+ /* filter, but doesn't match */ -+ if (!(p->state & state_filter)) -+ return false; -+ -+ /* -+ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows -+ * TASK_KILLABLE). 
-+ */ -+ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) -+ return false; -+ -+ return true; -+} -+ -+ -+void show_state_filter(unsigned long state_filter) -+{ -+ struct task_struct *g, *p; -+ -+#if BITS_PER_LONG == 32 -+ printk(KERN_INFO -+ " task PC stack pid father\n"); -+#else -+ printk(KERN_INFO -+ " task PC stack pid father\n"); -+#endif -+ rcu_read_lock(); -+ for_each_process_thread(g, p) { -+ /* -+ * reset the NMI-timeout, listing all files on a slow -+ * console might take a lot of time: -+ * Also, reset softlockup watchdogs on all CPUs, because -+ * another CPU might be blocked waiting for us to process -+ * an IPI. -+ */ -+ touch_nmi_watchdog(); -+ touch_all_softlockup_watchdogs(); -+ if (state_filter_match(state_filter, p)) -+ sched_show_task(p); -+ } -+ -+#ifdef CONFIG_SCHED_DEBUG -+ /* TODO: Alt schedule FW should support this -+ if (!state_filter) -+ sysrq_sched_debug_show(); -+ */ -+#endif -+ rcu_read_unlock(); -+ /* -+ * Only show locks if all tasks are dumped: -+ */ -+ if (!state_filter) -+ debug_show_all_locks(); -+} -+ -+void dump_cpu_task(int cpu) -+{ -+ pr_info("Task dump for CPU %d:\n", cpu); -+ sched_show_task(cpu_curr(cpu)); -+} -+ -+/** -+ * init_idle - set up an idle thread for a given CPU -+ * @idle: task in question -+ * @cpu: CPU the idle task belongs to -+ * -+ * NOTE: this function does not set the idle thread's NEED_RESCHED -+ * flag, to make booting more robust. 
-+ */ -+void init_idle(struct task_struct *idle, int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ __sched_fork(0, idle); -+ -+ raw_spin_lock_irqsave(&idle->pi_lock, flags); -+ raw_spin_lock(&rq->lock); -+ update_rq_clock(rq); -+ -+ idle->last_ran = rq->clock_task; -+ idle->state = TASK_RUNNING; -+ idle->flags |= PF_IDLE; -+ sched_queue_init_idle(rq, idle); -+ -+ kasan_unpoison_task_stack(idle); -+ -+#ifdef CONFIG_SMP -+ /* -+ * It's possible that init_idle() gets called multiple times on a task, -+ * in that case do_set_cpus_allowed() will not do the right thing. -+ * -+ * And since this is boot we can forgo the serialisation. -+ */ -+ set_cpus_allowed_common(idle, cpumask_of(cpu)); -+#endif -+ -+ /* Silence PROVE_RCU */ -+ rcu_read_lock(); -+ __set_task_cpu(idle, cpu); -+ rcu_read_unlock(); -+ -+ rq->idle = idle; -+ rcu_assign_pointer(rq->curr, idle); -+ idle->on_cpu = 1; -+ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&idle->pi_lock, flags); -+ -+ /* Set the preempt count _outside_ the spinlocks! */ -+ init_idle_preempt_count(idle, cpu); -+ -+ ftrace_graph_init_idle_task(idle, cpu); -+ vtime_init_idle(idle, cpu); -+#ifdef CONFIG_SMP -+ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); -+#endif -+} -+ -+#ifdef CONFIG_SMP -+ -+int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, -+ const struct cpumask __maybe_unused *trial) -+{ -+ return 1; -+} -+ -+int task_can_attach(struct task_struct *p, -+ const struct cpumask *cs_cpus_allowed) -+{ -+ int ret = 0; -+ -+ /* -+ * Kthreads which disallow setaffinity shouldn't be moved -+ * to a new cpuset; we don't want to change their CPU -+ * affinity and isolating such threads by their set of -+ * allowed nodes is unnecessary. Thus, cpusets are not -+ * applicable for such threads. This prevents checking for -+ * success of set_cpus_allowed_ptr() on all attached tasks -+ * before cpus_mask may be changed. 
-+ */ -+ if (p->flags & PF_NO_SETAFFINITY) -+ ret = -EINVAL; -+ -+ return ret; -+} -+ -+bool sched_smp_initialized __read_mostly; -+ -+#ifdef CONFIG_HOTPLUG_CPU -+/* -+ * Ensures that the idle task is using init_mm right before its CPU goes -+ * offline. -+ */ -+void idle_task_exit(void) -+{ -+ struct mm_struct *mm = current->active_mm; -+ -+ BUG_ON(current != this_rq()->idle); -+ -+ if (mm != &init_mm) { -+ switch_mm(mm, &init_mm, current); -+ finish_arch_post_lock_switch(); -+ } -+ -+ /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ -+} -+ -+/* -+ * Migrate all tasks from the rq, sleeping tasks will be migrated by -+ * try_to_wake_up()->select_task_rq(). -+ * -+ * Called with rq->lock held even though we'er in stop_machine() and -+ * there's no concurrency possible, we hold the required locks anyway -+ * because of lock validation efforts. -+ */ -+static void migrate_tasks(struct rq *dead_rq) -+{ -+ struct rq *rq = dead_rq; -+ struct task_struct *p, *stop = rq->stop; -+ int count = 0; -+ -+ /* -+ * Fudge the rq selection such that the below task selection loop -+ * doesn't get stuck on the currently eligible stop task. -+ * -+ * We're currently inside stop_machine() and the rq is either stuck -+ * in the stop_machine_cpu_stop() loop, or we're executing this code, -+ * either way we should never end up calling schedule() until we're -+ * done here. -+ */ -+ rq->stop = NULL; -+ -+ p = sched_rq_first_task(rq); -+ while (p != rq->idle) { -+ int dest_cpu; -+ -+ /* skip the running task */ -+ if (task_running(p) || 1 == p->nr_cpus_allowed) { -+ p = sched_rq_next_task(p, rq); -+ continue; -+ } -+ -+ /* -+ * Rules for changing task_struct::cpus_allowed are holding -+ * both pi_lock and rq->lock, such that holding either -+ * stabilizes the mask. -+ * -+ * Drop rq->lock is not quite as disastrous as it usually is -+ * because !cpu_active at this point, which means load-balance -+ * will not interfere. Also, stop-machine. 
-+ */ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_lock(&p->pi_lock); -+ raw_spin_lock(&rq->lock); -+ -+ /* -+ * Since we're inside stop-machine, _nothing_ should have -+ * changed the task, WARN if weird stuff happened, because in -+ * that case the above rq->lock drop is a fail too. -+ */ -+ if (WARN_ON(task_rq(p) != rq || !task_on_rq_queued(p))) { -+ raw_spin_unlock(&p->pi_lock); -+ p = sched_rq_next_task(p, rq); -+ continue; -+ } -+ -+ count++; -+ /* Find suitable destination for @next, with force if needed. */ -+ dest_cpu = select_fallback_rq(dead_rq->cpu, p); -+ rq = __migrate_task(rq, p, dest_cpu); -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock(&p->pi_lock); -+ -+ rq = dead_rq; -+ raw_spin_lock(&rq->lock); -+ /* Check queued task all over from the header again */ -+ p = sched_rq_first_task(rq); -+ } -+ -+ rq->stop = stop; -+} -+ -+static void set_rq_offline(struct rq *rq) -+{ -+ if (rq->online) -+ rq->online = false; -+} -+#endif /* CONFIG_HOTPLUG_CPU */ -+ -+static void set_rq_online(struct rq *rq) -+{ -+ if (!rq->online) -+ rq->online = true; -+} -+ -+/* -+ * used to mark begin/end of suspend/resume: -+ */ -+static int num_cpus_frozen; -+ -+/* -+ * Update cpusets according to cpu_active mask. If cpusets are -+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper -+ * around partition_sched_domains(). -+ * -+ * If we come here as part of a suspend/resume, don't touch cpusets because we -+ * want to restore it back to its original state upon resume anyway. -+ */ -+static void cpuset_cpu_active(void) -+{ -+ if (cpuhp_tasks_frozen) { -+ /* -+ * num_cpus_frozen tracks how many CPUs are involved in suspend -+ * resume sequence. As long as this is not the last online -+ * operation in the resume sequence, just build a single sched -+ * domain, ignoring cpusets. -+ */ -+ partition_sched_domains(1, NULL, NULL); -+ if (--num_cpus_frozen) -+ return; -+ /* -+ * This is the last CPU online operation. 
So fall through and -+ * restore the original sched domains by considering the -+ * cpuset configurations. -+ */ -+ cpuset_force_rebuild(); -+ } -+ -+ cpuset_update_active_cpus(); -+} -+ -+static int cpuset_cpu_inactive(unsigned int cpu) -+{ -+ if (!cpuhp_tasks_frozen) { -+ cpuset_update_active_cpus(); -+ } else { -+ num_cpus_frozen++; -+ partition_sched_domains(1, NULL, NULL); -+ } -+ return 0; -+} -+ -+int sched_cpu_activate(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+#ifdef CONFIG_SCHED_SMT -+ /* -+ * When going up, increment the number of cores with SMT present. -+ */ -+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) -+ static_branch_inc_cpuslocked(&sched_smt_present); -+#endif -+ set_cpu_active(cpu, true); -+ -+ if (sched_smp_initialized) -+ cpuset_cpu_active(); -+ -+ /* -+ * Put the rq online, if not already. This happens: -+ * -+ * 1) In the early boot process, because we build the real domains -+ * after all cpus have been brought up. -+ * -+ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the -+ * domains. -+ */ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ set_rq_online(rq); -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+ return 0; -+} -+ -+int sched_cpu_deactivate(unsigned int cpu) -+{ -+ int ret; -+ -+ set_cpu_active(cpu, false); -+ /* -+ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU -+ * users of this state to go away such that all new such users will -+ * observe it. -+ * -+ * Do sync before park smpboot threads to take care the rcu boost case. -+ */ -+ synchronize_rcu(); -+ -+#ifdef CONFIG_SCHED_SMT -+ /* -+ * When going down, decrement the number of cores with SMT present. 
-+ */ -+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) { -+ static_branch_dec_cpuslocked(&sched_smt_present); -+ if (!static_branch_likely(&sched_smt_present)) -+ cpumask_clear(&sched_sg_idle_mask); -+ } -+#endif -+ -+ if (!sched_smp_initialized) -+ return 0; -+ -+ ret = cpuset_cpu_inactive(cpu); -+ if (ret) { -+ set_cpu_active(cpu, true); -+ return ret; -+ } -+ return 0; -+} -+ -+static void sched_rq_cpu_starting(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ rq->calc_load_update = calc_load_update; -+} -+ -+int sched_cpu_starting(unsigned int cpu) -+{ -+ sched_rq_cpu_starting(cpu); -+ sched_tick_start(cpu); -+ return 0; -+} -+ -+#ifdef CONFIG_HOTPLUG_CPU -+int sched_cpu_dying(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ /* Handle pending wakeups and then migrate everything off */ -+ sched_ttwu_pending(); -+ -+ sched_tick_stop(cpu); -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ set_rq_offline(rq); -+ migrate_tasks(rq); -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+ hrtick_clear(rq); -+ return 0; -+} -+#endif -+ -+#ifdef CONFIG_SMP -+static void sched_init_topology_cpumask_early(void) -+{ -+ int cpu, level; -+ cpumask_t *tmp; -+ -+ for_each_possible_cpu(cpu) { -+ for (level = 0; level < NR_CPU_AFFINITY_CHK_LEVEL; level++) { -+ tmp = &(per_cpu(sched_cpu_affinity_masks, cpu)[level]); -+ cpumask_copy(tmp, cpu_possible_mask); -+ cpumask_clear_cpu(cpu, tmp); -+ } -+ per_cpu(sched_cpu_llc_mask, cpu) = -+ &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); -+ per_cpu(sched_cpu_affinity_end_mask, cpu) = -+ &(per_cpu(sched_cpu_affinity_masks, cpu)[1]); -+ /*per_cpu(sd_llc_id, cpu) = cpu;*/ -+ } -+} -+ -+#define TOPOLOGY_CPUMASK(name, mask, last) \ -+ if (cpumask_and(chk, chk, mask)) \ -+ printk(KERN_INFO "sched: cpu#%02d affinity mask: 0x%08lx - "#name,\ -+ cpu, (chk++)->bits[0]); \ -+ if (!last) \ -+ cpumask_complement(chk, mask) -+ -+static void sched_init_topology_cpumask(void) -+{ -+ int cpu; -+ cpumask_t *chk; -+ -+ 
for_each_online_cpu(cpu) { -+ /* take chance to reset time slice for idle tasks */ -+ cpu_rq(cpu)->idle->time_slice = sched_timeslice_ns; -+ -+ chk = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); -+ -+ cpumask_complement(chk, cpumask_of(cpu)); -+#ifdef CONFIG_SCHED_SMT -+ TOPOLOGY_CPUMASK(smt, topology_sibling_cpumask(cpu), false); -+#endif -+ per_cpu(sd_llc_id, cpu) = cpumask_first(cpu_coregroup_mask(cpu)); -+ per_cpu(sched_cpu_llc_mask, cpu) = chk; -+ TOPOLOGY_CPUMASK(coregroup, cpu_coregroup_mask(cpu), false); -+ -+ TOPOLOGY_CPUMASK(core, topology_core_cpumask(cpu), false); -+ -+ TOPOLOGY_CPUMASK(others, cpu_online_mask, true); -+ -+ per_cpu(sched_cpu_affinity_end_mask, cpu) = chk; -+ printk(KERN_INFO "sched: cpu#%02d llc_id = %d, llc_mask idx = %d\n", -+ cpu, per_cpu(sd_llc_id, cpu), -+ (int) (per_cpu(sched_cpu_llc_mask, cpu) - -+ &(per_cpu(sched_cpu_affinity_masks, cpu)[0]))); -+ } -+} -+#endif -+ -+void __init sched_init_smp(void) -+{ -+ /* Move init over to a non-isolated CPU */ -+ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) -+ BUG(); -+ -+ sched_init_topology_cpumask(); -+ -+ sched_smp_initialized = true; -+} -+#else -+void __init sched_init_smp(void) -+{ -+ cpu_rq(0)->idle->time_slice = sched_timeslice_ns; -+} -+#endif /* CONFIG_SMP */ -+ -+int in_sched_functions(unsigned long addr) -+{ -+ return in_lock_functions(addr) || -+ (addr >= (unsigned long)__sched_text_start -+ && addr < (unsigned long)__sched_text_end); -+} -+ -+#ifdef CONFIG_CGROUP_SCHED -+/* task group related information */ -+struct task_group { -+ struct cgroup_subsys_state css; -+ -+ struct rcu_head rcu; -+ struct list_head list; -+ -+ struct task_group *parent; -+ struct list_head siblings; -+ struct list_head children; -+}; -+ -+/* -+ * Default task group. -+ * Every task in system belongs to this group at bootup. 
-+ */ -+struct task_group root_task_group; -+LIST_HEAD(task_groups); -+ -+/* Cacheline aligned slab cache for task_group */ -+static struct kmem_cache *task_group_cache __read_mostly; -+#endif /* CONFIG_CGROUP_SCHED */ -+ -+void __init sched_init(void) -+{ -+ int i; -+ struct rq *rq; -+ -+ printk(KERN_INFO ALT_SCHED_VERSION_MSG); -+ -+ wait_bit_init(); -+ -+#ifdef CONFIG_SMP -+ for (i = 0; i < SCHED_BITS; i++) -+ cpumask_copy(&sched_rq_watermark[i], cpu_present_mask); -+#endif -+ -+#ifdef CONFIG_CGROUP_SCHED -+ task_group_cache = KMEM_CACHE(task_group, 0); -+ -+ list_add(&root_task_group.list, &task_groups); -+ INIT_LIST_HEAD(&root_task_group.children); -+ INIT_LIST_HEAD(&root_task_group.siblings); -+#endif /* CONFIG_CGROUP_SCHED */ -+ for_each_possible_cpu(i) { -+ rq = cpu_rq(i); -+ -+ sched_queue_init(rq); -+ rq->watermark = IDLE_WM; -+ rq->skip = NULL; -+ -+ raw_spin_lock_init(&rq->lock); -+ rq->nr_running = rq->nr_uninterruptible = 0; -+ rq->calc_load_active = 0; -+ rq->calc_load_update = jiffies + LOAD_FREQ; -+#ifdef CONFIG_SMP -+ rq->online = false; -+ rq->cpu = i; -+ -+#ifdef CONFIG_SCHED_SMT -+ rq->active_balance = 0; -+#endif -+#endif -+ rq->nr_switches = 0; -+ atomic_set(&rq->nr_iowait, 0); -+ hrtick_rq_init(rq); -+ } -+#ifdef CONFIG_SMP -+ /* Set rq->online for cpu 0 */ -+ cpu_rq(0)->online = true; -+#endif -+ /* -+ * The boot idle thread does lazy MMU switching as well: -+ */ -+ mmgrab(&init_mm); -+ enter_lazy_tlb(&init_mm, current); -+ -+ /* -+ * Make us the idle thread. Technically, schedule() should not be -+ * called from this thread, however somewhere below it might be, -+ * but because we are the idle thread, we just pick up running again -+ * when this runqueue becomes "idle". 
-+ */ -+ init_idle(current, smp_processor_id()); -+ -+ calc_load_update = jiffies + LOAD_FREQ; -+ -+#ifdef CONFIG_SMP -+ idle_thread_set_boot_cpu(); -+ -+ sched_init_topology_cpumask_early(); -+#endif /* SMP */ -+ -+ init_schedstats(); -+ -+ psi_init(); -+} -+ -+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP -+static inline int preempt_count_equals(int preempt_offset) -+{ -+ int nested = preempt_count() + rcu_preempt_depth(); -+ -+ return (nested == preempt_offset); -+} -+ -+void __might_sleep(const char *file, int line, int preempt_offset) -+{ -+ /* -+ * Blocking primitives will set (and therefore destroy) current->state, -+ * since we will exit with TASK_RUNNING make sure we enter with it, -+ * otherwise we will destroy state. -+ */ -+ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, -+ "do not call blocking ops when !TASK_RUNNING; " -+ "state=%lx set at [<%p>] %pS\n", -+ current->state, -+ (void *)current->task_state_change, -+ (void *)current->task_state_change); -+ -+ ___might_sleep(file, line, preempt_offset); -+} -+EXPORT_SYMBOL(__might_sleep); -+ -+void ___might_sleep(const char *file, int line, int preempt_offset) -+{ -+ /* Ratelimiting timestamp: */ -+ static unsigned long prev_jiffy; -+ -+ unsigned long preempt_disable_ip; -+ -+ /* WARN_ON_ONCE() by default, no rate limit required: */ -+ rcu_sleep_check(); -+ -+ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && -+ !is_idle_task(current) && !current->non_block_count) || -+ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || -+ oops_in_progress) -+ return; -+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) -+ return; -+ prev_jiffy = jiffies; -+ -+ /* Save this before calling printk(), since that will clobber it: */ -+ preempt_disable_ip = get_preempt_disable_ip(current); -+ -+ printk(KERN_ERR -+ "BUG: sleeping function called from invalid context at %s:%d\n", -+ file, line); -+ printk(KERN_ERR -+ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, 
name: %s\n", -+ in_atomic(), irqs_disabled(), current->non_block_count, -+ current->pid, current->comm); -+ -+ if (task_stack_end_corrupted(current)) -+ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); -+ -+ debug_show_held_locks(current); -+ if (irqs_disabled()) -+ print_irqtrace_events(current); -+#ifdef CONFIG_DEBUG_PREEMPT -+ if (!preempt_count_equals(preempt_offset)) { -+ pr_err("Preemption disabled at:"); -+ print_ip_sym(preempt_disable_ip); -+ pr_cont("\n"); -+ } -+#endif -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+EXPORT_SYMBOL(___might_sleep); -+ -+void __cant_sleep(const char *file, int line, int preempt_offset) -+{ -+ static unsigned long prev_jiffy; -+ -+ if (irqs_disabled()) -+ return; -+ -+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) -+ return; -+ -+ if (preempt_count() > preempt_offset) -+ return; -+ -+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) -+ return; -+ prev_jiffy = jiffies; -+ -+ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); -+ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", -+ in_atomic(), irqs_disabled(), -+ current->pid, current->comm); -+ -+ debug_show_held_locks(current); -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+EXPORT_SYMBOL_GPL(__cant_sleep); -+#endif -+ -+#ifdef CONFIG_MAGIC_SYSRQ -+void normalize_rt_tasks(void) -+{ -+ struct task_struct *g, *p; -+ struct sched_attr attr = { -+ .sched_policy = SCHED_NORMAL, -+ }; -+ -+ read_lock(&tasklist_lock); -+ for_each_process_thread(g, p) { -+ /* -+ * Only normalize user tasks: -+ */ -+ if (p->flags & PF_KTHREAD) -+ continue; -+ -+ if (!rt_task(p)) { -+ /* -+ * Renice negative nice level userspace -+ * tasks back to 0: -+ */ -+ if (task_nice(p) < 0) -+ set_user_nice(p, 0); -+ continue; -+ } -+ -+ __sched_setscheduler(p, &attr, false, false); -+ } -+ read_unlock(&tasklist_lock); -+} -+#endif /* CONFIG_MAGIC_SYSRQ */ -+ -+#if defined(CONFIG_IA64) || 
defined(CONFIG_KGDB_KDB) -+/* -+ * These functions are only useful for the IA64 MCA handling, or kdb. -+ * -+ * They can only be called when the whole system has been -+ * stopped - every CPU needs to be quiescent, and no scheduling -+ * activity can take place. Using them for anything else would -+ * be a serious bug, and as a result, they aren't even visible -+ * under any other configuration. -+ */ -+ -+/** -+ * curr_task - return the current task for a given CPU. -+ * @cpu: the processor in question. -+ * -+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! -+ * -+ * Return: The current task for @cpu. -+ */ -+struct task_struct *curr_task(int cpu) -+{ -+ return cpu_curr(cpu); -+} -+ -+#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ -+ -+#ifdef CONFIG_IA64 -+/** -+ * ia64_set_curr_task - set the current task for a given CPU. -+ * @cpu: the processor in question. -+ * @p: the task pointer to set. -+ * -+ * Description: This function must only be used when non-maskable interrupts -+ * are serviced on a separate stack. It allows the architecture to switch the -+ * notion of the current task on a CPU in a non-blocking manner. This function -+ * must be called with all CPU's synchronised, and interrupts disabled, the -+ * and caller must save the original value of the current task (see -+ * curr_task() above) and restore that value before reenabling interrupts and -+ * re-starting the system. -+ * -+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 
-+ */ -+void ia64_set_curr_task(int cpu, struct task_struct *p) -+{ -+ cpu_curr(cpu) = p; -+} -+ -+#endif -+ -+#ifdef CONFIG_CGROUP_SCHED -+static void sched_free_group(struct task_group *tg) -+{ -+ kmem_cache_free(task_group_cache, tg); -+} -+ -+/* allocate runqueue etc for a new task group */ -+struct task_group *sched_create_group(struct task_group *parent) -+{ -+ struct task_group *tg; -+ -+ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); -+ if (!tg) -+ return ERR_PTR(-ENOMEM); -+ -+ return tg; -+} -+ -+void sched_online_group(struct task_group *tg, struct task_group *parent) -+{ -+} -+ -+/* rcu callback to free various structures associated with a task group */ -+static void sched_free_group_rcu(struct rcu_head *rhp) -+{ -+ /* Now it should be safe to free those cfs_rqs */ -+ sched_free_group(container_of(rhp, struct task_group, rcu)); -+} -+ -+void sched_destroy_group(struct task_group *tg) -+{ -+ /* Wait for possible concurrent references to cfs_rqs complete */ -+ call_rcu(&tg->rcu, sched_free_group_rcu); -+} -+ -+void sched_offline_group(struct task_group *tg) -+{ -+} -+ -+static inline struct task_group *css_tg(struct cgroup_subsys_state *css) -+{ -+ return css ? 
container_of(css, struct task_group, css) : NULL; -+} -+ -+static struct cgroup_subsys_state * -+cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) -+{ -+ struct task_group *parent = css_tg(parent_css); -+ struct task_group *tg; -+ -+ if (!parent) { -+ /* This is early initialization for the top cgroup */ -+ return &root_task_group.css; -+ } -+ -+ tg = sched_create_group(parent); -+ if (IS_ERR(tg)) -+ return ERR_PTR(-ENOMEM); -+ return &tg->css; -+} -+ -+/* Expose task group only after completing cgroup initialization */ -+static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ struct task_group *parent = css_tg(css->parent); -+ -+ if (parent) -+ sched_online_group(tg, parent); -+ return 0; -+} -+ -+static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ -+ sched_offline_group(tg); -+} -+ -+static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ -+ /* -+ * Relies on the RCU grace period between css_released() and this. 
-+ */ -+ sched_free_group(tg); -+} -+ -+static void cpu_cgroup_fork(struct task_struct *task) -+{ -+} -+ -+static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) -+{ -+ return 0; -+} -+ -+static void cpu_cgroup_attach(struct cgroup_taskset *tset) -+{ -+} -+ -+static struct cftype cpu_legacy_files[] = { -+ { } /* Terminate */ -+}; -+ -+ -+static struct cftype cpu_files[] = { -+ { } /* terminate */ -+}; -+ -+static int cpu_extra_stat_show(struct seq_file *sf, -+ struct cgroup_subsys_state *css) -+{ -+ return 0; -+} -+ -+struct cgroup_subsys cpu_cgrp_subsys = { -+ .css_alloc = cpu_cgroup_css_alloc, -+ .css_online = cpu_cgroup_css_online, -+ .css_released = cpu_cgroup_css_released, -+ .css_free = cpu_cgroup_css_free, -+ .css_extra_stat_show = cpu_extra_stat_show, -+ .fork = cpu_cgroup_fork, -+ .can_attach = cpu_cgroup_can_attach, -+ .attach = cpu_cgroup_attach, -+ .legacy_cftypes = cpu_files, -+ .legacy_cftypes = cpu_legacy_files, -+ .dfl_cftypes = cpu_files, -+ .early_init = true, -+ .threaded = true, -+}; -+#endif /* CONFIG_CGROUP_SCHED */ -+ -+#undef CREATE_TRACE_POINTS -diff --git a/kernel/sched/alt_debug.c b/kernel/sched/alt_debug.c -new file mode 100644 -index 000000000000..835e6bb98dda ---- /dev/null -+++ b/kernel/sched/alt_debug.c -@@ -0,0 +1,31 @@ -+/* -+ * kernel/sched/alt_debug.c -+ * -+ * Print the BMQ debugging details -+ * -+ * Author: Alfred Chen -+ * Date : 2020 -+ */ -+#include "sched.h" -+ -+/* -+ * This allows printing both to /proc/sched_debug and -+ * to the console -+ */ -+#define SEQ_printf(m, x...) 
\ -+ do { \ -+ if (m) \ -+ seq_printf(m, x); \ -+ else \ -+ pr_cont(x); \ -+ } while (0) -+ -+void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, -+ struct seq_file *m) -+{ -+ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), -+ get_nr_threads(p)); -+} -+ -+void proc_sched_set_task(struct task_struct *p) -+{} -diff --git a/kernel/sched/alt_sched.h b/kernel/sched/alt_sched.h -new file mode 100644 -index 000000000000..2b66983cce42 ---- /dev/null -+++ b/kernel/sched/alt_sched.h -@@ -0,0 +1,527 @@ -+#ifndef ALT_SCHED_H -+#define ALT_SCHED_H -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#ifdef CONFIG_PARAVIRT -+# include -+#endif -+ -+#include "cpupri.h" -+ -+#ifdef CONFIG_SCHED_BMQ -+#include "bmq.h" -+#endif -+ -+/* task_struct::on_rq states: */ -+#define TASK_ON_RQ_QUEUED 1 -+#define TASK_ON_RQ_MIGRATING 2 -+ -+static inline int task_on_rq_queued(struct task_struct *p) -+{ -+ return p->on_rq == TASK_ON_RQ_QUEUED; -+} -+ -+static inline int task_on_rq_migrating(struct task_struct *p) -+{ -+ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; -+} -+ -+/* -+ * wake flags -+ */ -+#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ -+#define WF_FORK 0x02 /* child wakeup after fork */ -+#define WF_MIGRATED 0x04 /* internal use, task got migrated */ -+ -+/* -+ * This is the main, per-CPU runqueue data structure. -+ * This data should only be modified by the local cpu. 
-+ */ -+struct rq { -+ /* runqueue lock: */ -+ raw_spinlock_t lock; -+ -+ struct task_struct __rcu *curr; -+ struct task_struct *idle, *stop, *skip; -+ struct mm_struct *prev_mm; -+ -+#ifdef CONFIG_SCHED_BMQ -+ struct bmq queue; -+#endif -+ unsigned long watermark; -+ -+ /* switch count */ -+ u64 nr_switches; -+ -+ atomic_t nr_iowait; -+ -+#ifdef CONFIG_MEMBARRIER -+ int membarrier_state; -+#endif -+ -+#ifdef CONFIG_SMP -+ int cpu; /* cpu of this runqueue */ -+ bool online; -+ -+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ -+ struct sched_avg avg_irq; -+#endif -+ -+#ifdef CONFIG_SCHED_SMT -+ int active_balance; -+ struct cpu_stop_work active_balance_work; -+#endif -+#endif /* CONFIG_SMP */ -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+ u64 prev_irq_time; -+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ -+#ifdef CONFIG_PARAVIRT -+ u64 prev_steal_time; -+#endif /* CONFIG_PARAVIRT */ -+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING -+ u64 prev_steal_time_rq; -+#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ -+ -+ /* calc_load related fields */ -+ unsigned long calc_load_update; -+ long calc_load_active; -+ -+ u64 clock, last_tick; -+ u64 last_ts_switch; -+ u64 clock_task; -+ -+ unsigned long nr_running; -+ unsigned long nr_uninterruptible; -+ -+#ifdef CONFIG_SCHED_HRTICK -+#ifdef CONFIG_SMP -+ call_single_data_t hrtick_csd; -+#endif -+ struct hrtimer hrtick_timer; -+#endif -+ -+#ifdef CONFIG_SCHEDSTATS -+ -+ /* latency stats */ -+ struct sched_info rq_sched_info; -+ unsigned long long rq_cpu_time; -+ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? 
*/ -+ -+ /* sys_sched_yield() stats */ -+ unsigned int yld_count; -+ -+ /* schedule() stats */ -+ unsigned int sched_switch; -+ unsigned int sched_count; -+ unsigned int sched_goidle; -+ -+ /* try_to_wake_up() stats */ -+ unsigned int ttwu_count; -+ unsigned int ttwu_local; -+#endif /* CONFIG_SCHEDSTATS */ -+ -+#ifdef CONFIG_SMP -+ struct llist_head wake_list; -+#endif -+ -+#ifdef CONFIG_CPU_IDLE -+ /* Must be inspected within a rcu lock section */ -+ struct cpuidle_state *idle_state; -+#endif -+}; -+ -+extern unsigned long calc_load_update; -+extern atomic_long_t calc_load_tasks; -+ -+extern void calc_global_load_tick(struct rq *this_rq); -+extern long calc_load_fold_active(struct rq *this_rq, long adjust); -+ -+DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -+#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) -+#define this_rq() this_cpu_ptr(&runqueues) -+#define task_rq(p) cpu_rq(task_cpu(p)) -+#define cpu_curr(cpu) (cpu_rq(cpu)->curr) -+#define raw_rq() raw_cpu_ptr(&runqueues) -+ -+#ifdef CONFIG_SMP -+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) -+void register_sched_domain_sysctl(void); -+void unregister_sched_domain_sysctl(void); -+#else -+static inline void register_sched_domain_sysctl(void) -+{ -+} -+static inline void unregister_sched_domain_sysctl(void) -+{ -+} -+#endif -+ -+extern bool sched_smp_initialized; -+ -+enum { -+ BASE_CPU_AFFINITY_CHK_LEVEL = 1, -+#ifdef CONFIG_SCHED_SMT -+ SMT_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, -+#endif -+#ifdef CONFIG_SCHED_MC -+ MC_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, -+#endif -+ NR_CPU_AFFINITY_CHK_LEVEL -+}; -+ -+DECLARE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_CHK_LEVEL], sched_cpu_affinity_masks); -+ -+static inline int __best_mask_cpu(int cpu, const cpumask_t *cpumask, -+ const cpumask_t *mask) -+{ -+ while ((cpu = cpumask_any_and(cpumask, mask)) >= nr_cpu_ids) -+ mask++; -+ return cpu; -+} -+ -+static inline int best_mask_cpu(int cpu, const cpumask_t *cpumask) -+{ -+ return cpumask_test_cpu(cpu, cpumask)? 
cpu : -+ __best_mask_cpu(cpu, cpumask, &(per_cpu(sched_cpu_affinity_masks, cpu)[0])); -+} -+ -+extern void sched_ttwu_pending(void); -+#else /* !CONFIG_SMP */ -+static inline void sched_ttwu_pending(void) { } -+#endif /* CONFIG_SMP */ -+ -+#ifndef arch_scale_freq_tick -+static __always_inline -+void arch_scale_freq_tick(void) -+{ -+} -+#endif -+ -+#ifndef arch_scale_freq_capacity -+static __always_inline -+unsigned long arch_scale_freq_capacity(int cpu) -+{ -+ return SCHED_CAPACITY_SCALE; -+} -+#endif -+ -+static inline u64 __rq_clock_broken(struct rq *rq) -+{ -+ return READ_ONCE(rq->clock); -+} -+ -+static inline u64 rq_clock(struct rq *rq) -+{ -+ /* -+ * Relax lockdep_assert_held() checking as in VRQ, call to -+ * sched_info_xxxx() may not held rq->lock -+ * lockdep_assert_held(&rq->lock); -+ */ -+ return rq->clock; -+} -+ -+static inline u64 rq_clock_task(struct rq *rq) -+{ -+ /* -+ * Relax lockdep_assert_held() checking as in VRQ, call to -+ * sched_info_xxxx() may not held rq->lock -+ * lockdep_assert_held(&rq->lock); -+ */ -+ return rq->clock_task; -+} -+ -+/* -+ * {de,en}queue flags: -+ * -+ * DEQUEUE_SLEEP - task is no longer runnable -+ * ENQUEUE_WAKEUP - task just became runnable -+ * -+ */ -+ -+#define DEQUEUE_SLEEP 0x01 -+ -+#define ENQUEUE_WAKEUP 0x01 -+ -+ -+/* -+ * Below are scheduler API which using in other kernel code -+ * It use the dummy rq_flags -+ * ToDo : BMQ need to support these APIs for compatibility with mainline -+ * scheduler code. 
-+ */ -+struct rq_flags { -+ unsigned long flags; -+}; -+ -+struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(rq->lock); -+ -+struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(p->pi_lock) -+ __acquires(rq->lock); -+ -+static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock(&rq->lock); -+} -+ -+static inline void -+task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) -+ __releases(rq->lock) -+ __releases(p->pi_lock) -+{ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); -+} -+ -+static inline void -+rq_unlock_irq(struct rq *rq, struct rq_flags *rf) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock_irq(&rq->lock); -+} -+ -+static inline struct rq * -+this_rq_lock_irq(struct rq_flags *rf) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ local_irq_disable(); -+ rq = this_rq(); -+ raw_spin_lock(&rq->lock); -+ -+ return rq; -+} -+ -+static inline int task_current(struct rq *rq, struct task_struct *p) -+{ -+ return rq->curr == p; -+} -+ -+static inline bool task_running(struct task_struct *p) -+{ -+ return p->on_cpu; -+} -+ -+extern struct static_key_false sched_schedstats; -+ -+#ifdef CONFIG_CPU_IDLE -+static inline void idle_set_state(struct rq *rq, -+ struct cpuidle_state *idle_state) -+{ -+ rq->idle_state = idle_state; -+} -+ -+static inline struct cpuidle_state *idle_get_state(struct rq *rq) -+{ -+ WARN_ON(!rcu_read_lock_held()); -+ return rq->idle_state; -+} -+#else -+static inline void idle_set_state(struct rq *rq, -+ struct cpuidle_state *idle_state) -+{ -+} -+ -+static inline struct cpuidle_state *idle_get_state(struct rq *rq) -+{ -+ return NULL; -+} -+#endif -+ -+static inline int cpu_of(const struct rq *rq) -+{ -+#ifdef CONFIG_SMP -+ return rq->cpu; -+#else -+ return 0; -+#endif -+} -+ -+#include "stats.h" -+ -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+struct irqtime { -+ u64 total; -+ 
u64 tick_delta; -+ u64 irq_start_time; -+ struct u64_stats_sync sync; -+}; -+ -+DECLARE_PER_CPU(struct irqtime, cpu_irqtime); -+ -+/* -+ * Returns the irqtime minus the softirq time computed by ksoftirqd. -+ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime -+ * and never move forward. -+ */ -+static inline u64 irq_time_read(int cpu) -+{ -+ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); -+ unsigned int seq; -+ u64 total; -+ -+ do { -+ seq = __u64_stats_fetch_begin(&irqtime->sync); -+ total = irqtime->total; -+ } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); -+ -+ return total; -+} -+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ -+ -+#ifdef CONFIG_CPU_FREQ -+DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); -+ -+/** -+ * cpufreq_update_util - Take a note about CPU utilization changes. -+ * @rq: Runqueue to carry out the update for. -+ * @flags: Update reason flags. -+ * -+ * This function is called by the scheduler on the CPU whose utilization is -+ * being updated. -+ * -+ * It can only be called from RCU-sched read-side critical sections. -+ * -+ * The way cpufreq is currently arranged requires it to evaluate the CPU -+ * performance state (frequency/voltage) on a regular basis to prevent it from -+ * being stuck in a completely inadequate performance level for too long. -+ * That is not guaranteed to happen if the updates are only triggered from CFS -+ * and DL, though, because they may not be coming in if only RT tasks are -+ * active all the time (or there are RT tasks only). -+ * -+ * As a workaround for that issue, this function is called periodically by the -+ * RT sched class to trigger extra cpufreq updates to prevent it from stalling, -+ * but that really is a band-aid. Going forward it should be replaced with -+ * solutions targeted more specifically at RT tasks. 
-+ */ -+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) -+{ -+ struct update_util_data *data; -+ -+ data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); -+ if (data) -+ data->func(data, rq_clock(rq), flags); -+} -+#else -+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} -+#endif /* CONFIG_CPU_FREQ */ -+ -+#ifdef CONFIG_NO_HZ_FULL -+extern int __init sched_tick_offload_init(void); -+#else -+static inline int sched_tick_offload_init(void) { return 0; } -+#endif -+ -+#ifdef arch_scale_freq_capacity -+#ifndef arch_scale_freq_invariant -+#define arch_scale_freq_invariant() (true) -+#endif -+#else /* arch_scale_freq_capacity */ -+#define arch_scale_freq_invariant() (false) -+#endif -+ -+extern void schedule_idle(void); -+ -+/* -+ * !! For sched_setattr_nocheck() (kernel) only !! -+ * -+ * This is actually gross. :( -+ * -+ * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE -+ * tasks, but still be able to sleep. We need this on platforms that cannot -+ * atomically change clock frequency. Remove once fast switching will be -+ * available on such platforms. -+ * -+ * SUGOV stands for SchedUtil GOVernor. -+ */ -+#define SCHED_FLAG_SUGOV 0x10000000 -+ -+#ifdef CONFIG_MEMBARRIER -+/* -+ * The scheduler provides memory barriers required by membarrier between: -+ * - prior user-space memory accesses and store to rq->membarrier_state, -+ * - store to rq->membarrier_state and following user-space memory accesses. -+ * In the same way it provides those guarantees around store to rq->curr. 
-+ */ -+static inline void membarrier_switch_mm(struct rq *rq, -+ struct mm_struct *prev_mm, -+ struct mm_struct *next_mm) -+{ -+ int membarrier_state; -+ -+ if (prev_mm == next_mm) -+ return; -+ -+ membarrier_state = atomic_read(&next_mm->membarrier_state); -+ if (READ_ONCE(rq->membarrier_state) == membarrier_state) -+ return; -+ -+ WRITE_ONCE(rq->membarrier_state, membarrier_state); -+} -+#else -+static inline void membarrier_switch_mm(struct rq *rq, -+ struct mm_struct *prev_mm, -+ struct mm_struct *next_mm) -+{ -+} -+#endif -+ -+static inline int task_running_nice(struct task_struct *p) -+{ -+ return (p->prio + p->boost_prio > DEFAULT_PRIO + MAX_PRIORITY_ADJ); -+} -+ -+#ifdef CONFIG_NUMA -+extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); -+#else -+static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) -+{ -+ return nr_cpu_ids; -+} -+#endif -+ -+void swake_up_all_locked(struct swait_queue_head *q); -+void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); -+ -+#endif /* ALT_SCHED_H */ -diff --git a/kernel/sched/bmq.h b/kernel/sched/bmq.h -new file mode 100644 -index 000000000000..4ce30c30bd3e ---- /dev/null -+++ b/kernel/sched/bmq.h -@@ -0,0 +1,14 @@ -+#ifndef BMQ_H -+#define BMQ_H -+ -+/* bits: -+ * RT(0-99), Low prio adj range, nice width, high prio adj range, cpu idle task */ -+#define SCHED_BITS (MAX_RT_PRIO + NICE_WIDTH + 2 * MAX_PRIORITY_ADJ + 1) -+#define IDLE_TASK_SCHED_PRIO (SCHED_BITS - 1) -+ -+struct bmq { -+ DECLARE_BITMAP(bitmap, SCHED_BITS); -+ struct list_head heads[SCHED_BITS]; -+}; -+ -+#endif -diff --git a/kernel/sched/bmq_imp.h b/kernel/sched/bmq_imp.h -new file mode 100644 -index 000000000000..cb0fc0688a89 ---- /dev/null -+++ b/kernel/sched/bmq_imp.h -@@ -0,0 +1,86 @@ -+#define ALT_SCHED_VERSION_MSG "sched/bmq: BMQ CPU Scheduler 5.7-r3 by Alfred Chen.\n" -+ -+static inline void sched_queue_init(struct rq *rq) -+{ -+ struct bmq *q = &rq->queue; -+ int i; -+ -+ 
bitmap_zero(q->bitmap, SCHED_BITS); -+ for(i = 0; i < SCHED_BITS; i++) -+ INIT_LIST_HEAD(&q->heads[i]); -+} -+ -+static inline void sched_queue_init_idle(struct rq *rq, struct task_struct *idle) -+{ -+ struct bmq *q = &rq->queue; -+ -+ idle->bmq_idx = IDLE_TASK_SCHED_PRIO; -+ INIT_LIST_HEAD(&q->heads[idle->bmq_idx]); -+ list_add(&idle->bmq_node, &q->heads[idle->bmq_idx]); -+ set_bit(idle->bmq_idx, q->bitmap); -+} -+ -+/* -+ * This routine used in bmq scheduler only which assume the idle task in the bmq -+ */ -+static inline struct task_struct *sched_rq_first_task(struct rq *rq) -+{ -+ unsigned long idx = find_first_bit(rq->queue.bitmap, SCHED_BITS); -+ const struct list_head *head = &rq->queue.heads[idx]; -+ -+ return list_first_entry(head, struct task_struct, bmq_node); -+} -+ -+static inline struct task_struct * -+sched_rq_next_task(struct task_struct *p, struct rq *rq) -+{ -+ unsigned long idx = p->bmq_idx; -+ struct list_head *head = &rq->queue.heads[idx]; -+ -+ if (list_is_last(&p->bmq_node, head)) { -+ idx = find_next_bit(rq->queue.bitmap, SCHED_BITS, idx + 1); -+ head = &rq->queue.heads[idx]; -+ -+ return list_first_entry(head, struct task_struct, bmq_node); -+ } -+ -+ return list_next_entry(p, bmq_node); -+} -+ -+#define __SCHED_DEQUEUE_TASK(p, rq, flags, func) \ -+ psi_dequeue(p, flags & DEQUEUE_SLEEP); \ -+ sched_info_dequeued(rq, p); \ -+ \ -+ list_del(&p->bmq_node); \ -+ if (list_empty(&rq->queue.heads[p->bmq_idx])) { \ -+ clear_bit(p->bmq_idx, rq->queue.bitmap);\ -+ func; \ -+ } -+ -+#define __SCHED_ENQUEUE_TASK(p, rq, flags) \ -+ sched_info_queued(rq, p); \ -+ psi_enqueue(p, flags); \ -+ \ -+ p->bmq_idx = task_sched_prio(p); \ -+ list_add_tail(&p->bmq_node, &rq->queue.heads[p->bmq_idx]); \ -+ set_bit(p->bmq_idx, rq->queue.bitmap) -+ -+static inline void __requeue_task(struct task_struct *p, struct rq *rq) -+{ -+ int idx = task_sched_prio(p); -+ -+ list_del(&p->bmq_node); -+ list_add_tail(&p->bmq_node, &rq->queue.heads[idx]); -+ if (idx != p->bmq_idx) 
{ -+ if (list_empty(&rq->queue.heads[p->bmq_idx])) -+ clear_bit(p->bmq_idx, rq->queue.bitmap); -+ p->bmq_idx = idx; -+ set_bit(p->bmq_idx, rq->queue.bitmap); -+ update_sched_rq_watermark(rq); -+ } -+} -+ -+static inline bool sched_task_need_requeue(struct task_struct *p) -+{ -+ return (task_sched_prio(p) != p->bmq_idx); -+} -diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c -index 7fbaee24c824..0d7ad05b84fe 100644 ---- a/kernel/sched/cpufreq_schedutil.c -+++ b/kernel/sched/cpufreq_schedutil.c -@@ -183,6 +183,7 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, - return cpufreq_driver_resolve_freq(policy, freq); - } - -+#ifndef CONFIG_SCHED_ALT - /* - * This function computes an effective utilization for the given CPU, to be - * used for frequency selection given the linear relation: f = u * f_max. -@@ -300,6 +301,13 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) - - return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL); - } -+#else /* CONFIG_SCHED_ALT */ -+static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) -+{ -+ sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu); -+ return sg_cpu->max; -+} -+#endif - - /** - * sugov_iowait_reset() - Reset the IO boost status of a CPU. 
-@@ -443,7 +451,9 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } - */ - static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) - { -+#ifndef CONFIG_SCHED_ALT - if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) -+#endif - sg_policy->limits_changed = true; - } - -@@ -686,6 +696,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) - } - - ret = sched_setattr_nocheck(thread, &attr); -+ - if (ret) { - kthread_stop(thread); - pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__); -@@ -916,6 +927,7 @@ static int __init sugov_register(void) - core_initcall(sugov_register); - - #ifdef CONFIG_ENERGY_MODEL -+#ifndef CONFIG_SCHED_ALT - extern bool sched_energy_update; - extern struct mutex sched_energy_mutex; - -@@ -946,4 +958,10 @@ void sched_cpufreq_governor_change(struct cpufreq_policy *policy, - } - - } -+#else /* CONFIG_SCHED_ALT */ -+void sched_cpufreq_governor_change(struct cpufreq_policy *policy, -+ struct cpufreq_governor *old_gov) -+{ -+} -+#endif - #endif -diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c -index ff9435dee1df..0ee9967d2d74 100644 ---- a/kernel/sched/cputime.c -+++ b/kernel/sched/cputime.c -@@ -122,7 +122,7 @@ void account_user_time(struct task_struct *p, u64 cputime) - p->utime += cputime; - account_group_user_time(p, cputime); - -- index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; -+ index = task_running_nice(p) ? CPUTIME_NICE : CPUTIME_USER; - - /* Add user time to cpustat. */ - task_group_account_field(p, index, cputime); -@@ -146,7 +146,7 @@ void account_guest_time(struct task_struct *p, u64 cputime) - p->gtime += cputime; - - /* Add guest time to cpustat. 
*/ -- if (task_nice(p) > 0) { -+ if (task_running_nice(p)) { - cpustat[CPUTIME_NICE] += cputime; - cpustat[CPUTIME_GUEST_NICE] += cputime; - } else { -@@ -269,7 +269,7 @@ static inline u64 account_other_time(u64 max) - #ifdef CONFIG_64BIT - static inline u64 read_sum_exec_runtime(struct task_struct *t) - { -- return t->se.sum_exec_runtime; -+ return tsk_seruntime(t); - } - #else - static u64 read_sum_exec_runtime(struct task_struct *t) -@@ -279,7 +279,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t) - struct rq *rq; - - rq = task_rq_lock(t, &rf); -- ns = t->se.sum_exec_runtime; -+ ns = tsk_seruntime(t); - task_rq_unlock(rq, t, &rf); - - return ns; -@@ -658,7 +658,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, - void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) - { - struct task_cputime cputime = { -- .sum_exec_runtime = p->se.sum_exec_runtime, -+ .sum_exec_runtime = tsk_seruntime(p), - }; - - task_cputime(p, &cputime.utime, &cputime.stime); -diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c -index b743bf38f08f..472478a4f2a8 100644 ---- a/kernel/sched/idle.c -+++ b/kernel/sched/idle.c -@@ -361,6 +361,7 @@ void cpu_startup_entry(enum cpuhp_state state) - do_idle(); - } - -+#ifndef CONFIG_SCHED_ALT - /* - * idle-task scheduling class. 
- */ -@@ -481,3 +482,4 @@ const struct sched_class idle_sched_class = { - .switched_to = switched_to_idle, - .update_curr = update_curr_idle, - }; -+#endif -diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c -index b647d04d9c8b..f1983eb87f13 100644 ---- a/kernel/sched/pelt.c -+++ b/kernel/sched/pelt.c -@@ -250,6 +250,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load) - WRITE_ONCE(sa->util_avg, sa->util_sum / divider); - } - -+#ifndef CONFIG_SCHED_ALT - /* - * sched_entity: - * -@@ -367,6 +368,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) - - return 0; - } -+#endif - - #ifdef CONFIG_SCHED_THERMAL_PRESSURE - /* -diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h -index eb034d9f024d..49aa805750c5 100644 ---- a/kernel/sched/pelt.h -+++ b/kernel/sched/pelt.h -@@ -1,11 +1,13 @@ - #ifdef CONFIG_SMP - #include "sched-pelt.h" - -+#ifndef CONFIG_SCHED_ALT - int __update_load_avg_blocked_se(u64 now, struct sched_entity *se); - int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se); - int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); - int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); - int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); -+#endif - - #ifdef CONFIG_SCHED_THERMAL_PRESSURE - int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity); -@@ -37,6 +39,7 @@ update_irq_load_avg(struct rq *rq, u64 running) - } - #endif - -+#ifndef CONFIG_SCHED_ALT - /* - * When a task is dequeued, its estimated utilization should not be update if - * its util_avg has not been updated at least once. 
-@@ -157,9 +160,11 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) - return rq_clock_pelt(rq_of(cfs_rq)); - } - #endif -+#endif /* CONFIG_SCHED_ALT */ - - #else - -+#ifndef CONFIG_SCHED_ALT - static inline int - update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) - { -@@ -177,6 +182,7 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running) - { - return 0; - } -+#endif - - static inline int - update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index 1f58677a8f23..682e6b3802c1 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -2,6 +2,10 @@ - /* - * Scheduler internal types and methods: - */ -+#ifdef CONFIG_SCHED_ALT -+#include "alt_sched.h" -+#else -+ - #include - - #include -@@ -2548,3 +2552,9 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) - - void swake_up_all_locked(struct swait_queue_head *q); - void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); -+ -+static inline int task_running_nice(struct task_struct *p) -+{ -+ return (task_nice(p) > 0); -+} -+#endif /* !CONFIG_SCHED_ALT */ -diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c -index 750fb3c67eed..108422ebc7bf 100644 ---- a/kernel/sched/stats.c -+++ b/kernel/sched/stats.c -@@ -22,8 +22,10 @@ static int show_schedstat(struct seq_file *seq, void *v) - } else { - struct rq *rq; - #ifdef CONFIG_SMP -+#ifndef CONFIG_SCHED_ALT - struct sched_domain *sd; - int dcount = 0; -+#endif - #endif - cpu = (unsigned long)(v - 2); - rq = cpu_rq(cpu); -@@ -40,6 +42,7 @@ static int show_schedstat(struct seq_file *seq, void *v) - seq_printf(seq, "\n"); - - #ifdef CONFIG_SMP -+#ifndef CONFIG_SCHED_ALT - /* domain-specific stats */ - rcu_read_lock(); - for_each_domain(cpu, sd) { -@@ -68,6 +71,7 @@ static int show_schedstat(struct seq_file *seq, void *v) - sd->ttwu_move_balance); - } - rcu_read_unlock(); -+#endif - #endif - } - return 0; -diff --git a/kernel/sched/topology.c 
b/kernel/sched/topology.c -index 8344757bba6e..558ce8a70926 100644 ---- a/kernel/sched/topology.c -+++ b/kernel/sched/topology.c -@@ -4,6 +4,7 @@ - */ - #include "sched.h" - -+#ifndef CONFIG_SCHED_ALT - DEFINE_MUTEX(sched_domains_mutex); - - /* Protected by sched_domains_mutex: */ -@@ -1190,8 +1191,10 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) - */ - - static int default_relax_domain_level = -1; -+#endif /* CONFIG_SCHED_ALT */ - int sched_domain_level_max; - -+#ifndef CONFIG_SCHED_ALT - static int __init setup_relax_domain_level(char *str) - { - if (kstrtoint(str, 0, &default_relax_domain_level)) -@@ -1424,6 +1427,7 @@ sd_init(struct sched_domain_topology_level *tl, - - return sd; - } -+#endif /* CONFIG_SCHED_ALT */ - - /* - * Topology list, bottom-up. -@@ -1453,6 +1457,7 @@ void set_sched_topology(struct sched_domain_topology_level *tl) - sched_domain_topology = tl; - } - -+#ifndef CONFIG_SCHED_ALT - #ifdef CONFIG_NUMA - - static const struct cpumask *sd_numa_mask(int cpu) -@@ -2327,3 +2332,17 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], - partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); - mutex_unlock(&sched_domains_mutex); - } -+#else /* CONFIG_SCHED_ALT */ -+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], -+ struct sched_domain_attr *dattr_new) -+{} -+ -+#ifdef CONFIG_NUMA -+int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; -+ -+int sched_numa_find_closest(const struct cpumask *cpus, int cpu) -+{ -+ return best_mask_cpu(cpu, cpus); -+} -+#endif /* CONFIG_NUMA */ -+#endif -diff --git a/kernel/sysctl.c b/kernel/sysctl.c -index 8a176d8727a3..8e2ba49be0e1 100644 ---- a/kernel/sysctl.c -+++ b/kernel/sysctl.c -@@ -132,6 +132,10 @@ static unsigned long one_ul = 1; - static unsigned long long_max = LONG_MAX; - static int one_hundred = 100; - static int one_thousand = 1000; -+#ifdef CONFIG_SCHED_ALT -+static int __maybe_unused zero = 0; -+extern int 
sched_yield_type; -+#endif - #ifdef CONFIG_PRINTK - static int ten_thousand = 10000; - #endif -@@ -288,7 +292,7 @@ static struct ctl_table sysctl_base_table[] = { - { } - }; - --#ifdef CONFIG_SCHED_DEBUG -+#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_ALT) - static int min_sched_granularity_ns = 100000; /* 100 usecs */ - static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ - static int min_wakeup_granularity_ns; /* 0 usecs */ -@@ -305,6 +309,7 @@ static int max_extfrag_threshold = 1000; - #endif - - static struct ctl_table kern_table[] = { -+#ifndef CONFIG_SCHED_ALT - { - .procname = "sched_child_runs_first", - .data = &sysctl_sched_child_runs_first, -@@ -486,6 +491,7 @@ static struct ctl_table kern_table[] = { - .extra2 = SYSCTL_ONE, - }, - #endif -+#endif /* !CONFIG_SCHED_ALT */ - #ifdef CONFIG_PROVE_LOCKING - { - .procname = "prove_locking", -@@ -1049,6 +1055,17 @@ static struct ctl_table kern_table[] = { - .proc_handler = proc_dointvec, - }, - #endif -+#ifdef CONFIG_SCHED_ALT -+ { -+ .procname = "yield_type", -+ .data = &sched_yield_type, -+ .maxlen = sizeof (int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = &zero, -+ .extra2 = &two, -+ }, -+#endif - #if defined(CONFIG_S390) && defined(CONFIG_SMP) - { - .procname = "spin_retry", -diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c -index d89da1c7e005..a73adff9f309 100644 ---- a/kernel/time/hrtimer.c -+++ b/kernel/time/hrtimer.c -@@ -1923,8 +1923,10 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, - int ret = 0; - u64 slack; - -+#ifndef CONFIG_SCHED_ALT - slack = current->timer_slack_ns; - if (dl_task(current) || rt_task(current)) -+#endif - slack = 0; - - hrtimer_init_sleeper_on_stack(&t, clockid, mode); -diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c -index 2fd3b3fa68bf..e053bc56c019 100644 ---- a/kernel/time/posix-cpu-timers.c -+++ b/kernel/time/posix-cpu-timers.c -@@ -236,7 +236,7 @@ static void 
task_sample_cputime(struct task_struct *p, u64 *samples) - u64 stime, utime; - - task_cputime(p, &utime, &stime); -- store_samples(samples, stime, utime, p->se.sum_exec_runtime); -+ store_samples(samples, stime, utime, tsk_seruntime(p)); - } - - static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, -@@ -806,6 +806,7 @@ static void collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples, - } - } - -+#ifndef CONFIG_SCHED_ALT - static inline void check_dl_overrun(struct task_struct *tsk) - { - if (tsk->dl.dl_overrun) { -@@ -813,6 +814,7 @@ static inline void check_dl_overrun(struct task_struct *tsk) - __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); - } - } -+#endif - - static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard) - { -@@ -840,8 +842,10 @@ static void check_thread_timers(struct task_struct *tsk, - u64 samples[CPUCLOCK_MAX]; - unsigned long soft; - -+#ifndef CONFIG_SCHED_ALT - if (dl_task(tsk)) - check_dl_overrun(tsk); -+#endif - - if (expiry_cache_is_inactive(pct)) - return; -@@ -855,7 +859,7 @@ static void check_thread_timers(struct task_struct *tsk, - soft = task_rlimit(tsk, RLIMIT_RTTIME); - if (soft != RLIM_INFINITY) { - /* Task RT timeout is accounted in jiffies. RTTIME is usec */ -- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); -+ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ); - unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); - - /* At the hard limit, send SIGKILL. No further action. 
*/ -@@ -1091,8 +1095,10 @@ static inline bool fastpath_timer_check(struct task_struct *tsk) - return true; - } - -+#ifndef CONFIG_SCHED_ALT - if (dl_task(tsk) && tsk->dl.dl_overrun) - return true; -+#endif - - return false; - } -diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c -index b5e3496cf803..cfbae0a21cef 100644 ---- a/kernel/trace/trace_selftest.c -+++ b/kernel/trace/trace_selftest.c -@@ -1048,10 +1048,15 @@ static int trace_wakeup_test_thread(void *data) - { - /* Make this a -deadline thread */ - static const struct sched_attr attr = { -+#ifdef CONFIG_SCHED_ALT -+ /* No deadline on BMQ, use RR */ -+ .sched_policy = SCHED_RR, -+#else - .sched_policy = SCHED_DEADLINE, - .sched_runtime = 100000ULL, - .sched_deadline = 10000000ULL, - .sched_period = 10000000ULL -+#endif - }; - struct wakeup_test_data *x = data; - diff --git a/linux57-tkg/linux57-tkg-patches/0011-ZFS-fix.patch b/linux57-tkg/linux57-tkg-patches/0011-ZFS-fix.patch deleted file mode 100644 index af71d04..0000000 --- a/linux57-tkg/linux57-tkg-patches/0011-ZFS-fix.patch +++ /dev/null @@ -1,43 +0,0 @@ -From 1e010beda2896bdf3082fb37a3e49f8ce20e04d8 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= -Date: Thu, 2 May 2019 05:28:08 +0100 -Subject: [PATCH] x86/fpu: Export kernel_fpu_{begin,end}() with - EXPORT_SYMBOL_GPL -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -We need these symbols in zfs as the fpu implementation breaks userspace: - -https://github.com/zfsonlinux/zfs/issues/9346 -Signed-off-by: Jörg Thalheim ---- - arch/x86/kernel/fpu/core.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c -index 12c70840980e..352538b3bb5d 100644 ---- a/arch/x86/kernel/fpu/core.c -+++ b/arch/x86/kernel/fpu/core.c -@@ -102,7 +102,7 @@ void kernel_fpu_begin(void) - } - __cpu_invalidate_fpregs_state(); - } --EXPORT_SYMBOL_GPL(kernel_fpu_begin); 
-+EXPORT_SYMBOL(kernel_fpu_begin); - - void kernel_fpu_end(void) - { -@@ -111,7 +111,7 @@ void kernel_fpu_end(void) - this_cpu_write(in_kernel_fpu, false); - preempt_enable(); - } --EXPORT_SYMBOL_GPL(kernel_fpu_end); -+EXPORT_SYMBOL(kernel_fpu_end); - - /* - * Save the FPU state (mark it for reload if necessary): --- -2.23.0 - - diff --git a/linux57-tkg/linux57-tkg-patches/0012-linux-hardened.patch b/linux57-tkg/linux57-tkg-patches/0012-linux-hardened.patch deleted file mode 100644 index 6f20939..0000000 --- a/linux57-tkg/linux57-tkg-patches/0012-linux-hardened.patch +++ /dev/null @@ -1,2916 +0,0 @@ -diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index 5e2ce88d6eda..5cdeccf3459f 100644 ---- a/Documentation/admin-guide/kernel-parameters.txt -+++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -518,17 +518,6 @@ - nosocket -- Disable socket memory accounting. - nokmem -- Disable kernel memory accounting. - -- checkreqprot [SELINUX] Set initial checkreqprot flag value. -- Format: { "0" | "1" } -- See security/selinux/Kconfig help text. -- 0 -- check protection applied by kernel (includes -- any implied execute protection). -- 1 -- check protection requested by application. -- Default value is set via a kernel config option. -- Value can be changed at runtime via -- /sys/fs/selinux/checkreqprot. -- Setting checkreqprot to 1 is deprecated. -- - cio_ignore= [S390] - See Documentation/s390/common_io.rst for details. - clk_ignore_unused -@@ -3446,6 +3435,11 @@ - the specified number of seconds. This is to be used if - your oopses keep scrolling off the screen. - -+ extra_latent_entropy -+ Enable a very simple form of latent entropy extraction -+ from the first 4GB of memory as the bootmem allocator -+ passes the memory pages to the buddy allocator. -+ - pcbit= [HW,ISDN] - - pcd. 
[PARIDE] -diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst -index 0d427fd10941..e0042d797c38 100644 ---- a/Documentation/admin-guide/sysctl/kernel.rst -+++ b/Documentation/admin-guide/sysctl/kernel.rst -@@ -1167,6 +1167,26 @@ If a value outside of this range is written to ``threads-max`` an - ``EINVAL`` error occurs. - - -+tiocsti_restrict -+================ -+ -+This toggle indicates whether unprivileged users are prevented from -+using the ``TIOCSTI`` ioctl to inject commands into other processes -+which share a tty session. -+ -+When ``tiocsti_restrict`` is set to (0) there are no restrictions(accept -+the default restriction of only being able to injection commands into -+one's own tty). When ``tiocsti_restrict`` is set to (1), users must have -+``CAP_SYS_ADMIN`` to use the ``TIOCSTI`` ioctl. -+ -+When user namespaces are in use, the check for the capability -+``CAP_SYS_ADMIN`` is done against the user namespace that originally -+opened the tty. -+ -+The kernel config option ``CONFIG_SECURITY_TIOCSTI_RESTRICT`` sets the -+default value of ``tiocsti_restrict``. -+ -+ - unknown_nmi_panic - ================= - -diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt -index 9375324aa8e1..7bd9b330493c 100644 ---- a/Documentation/networking/ip-sysctl.txt -+++ b/Documentation/networking/ip-sysctl.txt -@@ -587,6 +587,23 @@ tcp_comp_sack_nr - INTEGER - - Default : 44 - -+tcp_simult_connect - BOOLEAN -+ Enable TCP simultaneous connect that adds a weakness in Linux's strict -+ implementation of TCP that allows two clients to connect to each other -+ without either entering a listening state. The weakness allows an attacker -+ to easily prevent a client from connecting to a known server provided the -+ source port for the connection is guessed correctly. 
-+ -+ As the weakness could be used to prevent an antivirus or IPS from fetching -+ updates, or prevent an SSL gateway from fetching a CRL, it should be -+ eliminated by disabling this option. Though Linux is one of few operating -+ systems supporting simultaneous connect, it has no legitimate use in -+ practice and is rarely supported by firewalls. -+ -+ Disabling this may break TCP STUNT which is used by some applications for -+ NAT traversal. -+ Default: Value of CONFIG_TCP_SIMULT_CONNECT_DEFAULT_ON -+ - tcp_slow_start_after_idle - BOOLEAN - If set, provide RFC2861 behavior and time out the congestion - window after an idle period. An idle period is defined at -diff --git a/arch/Kconfig b/arch/Kconfig -index 786a85d4ad40..78ae69e78a81 100644 ---- a/arch/Kconfig -+++ b/arch/Kconfig -@@ -671,7 +671,7 @@ config ARCH_MMAP_RND_BITS - int "Number of bits to use for ASLR of mmap base address" if EXPERT - range ARCH_MMAP_RND_BITS_MIN ARCH_MMAP_RND_BITS_MAX - default ARCH_MMAP_RND_BITS_DEFAULT if ARCH_MMAP_RND_BITS_DEFAULT -- default ARCH_MMAP_RND_BITS_MIN -+ default ARCH_MMAP_RND_BITS_MAX - depends on HAVE_ARCH_MMAP_RND_BITS - help - This value can be used to select the number of bits to use to -@@ -705,7 +705,7 @@ config ARCH_MMAP_RND_COMPAT_BITS - int "Number of bits to use for ASLR of mmap base address for compatible applications" if EXPERT - range ARCH_MMAP_RND_COMPAT_BITS_MIN ARCH_MMAP_RND_COMPAT_BITS_MAX - default ARCH_MMAP_RND_COMPAT_BITS_DEFAULT if ARCH_MMAP_RND_COMPAT_BITS_DEFAULT -- default ARCH_MMAP_RND_COMPAT_BITS_MIN -+ default ARCH_MMAP_RND_COMPAT_BITS_MAX - depends on HAVE_ARCH_MMAP_RND_COMPAT_BITS - help - This value can be used to select the number of bits to use to -diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig -index 5d513f461957..39abe5fd57fb 100644 ---- a/arch/arm64/Kconfig -+++ b/arch/arm64/Kconfig -@@ -1216,6 +1216,7 @@ config RODATA_FULL_DEFAULT_ENABLED - - config ARM64_SW_TTBR0_PAN - bool "Emulate Privileged Access Never using TTBR0_EL1 
switching" -+ default y - help - Enabling this option prevents the kernel from accessing - user-space memory directly by pointing TTBR0_EL1 to a reserved -@@ -1706,6 +1707,7 @@ config RANDOMIZE_BASE - bool "Randomize the address of the kernel image" - select ARM64_MODULE_PLTS if MODULES - select RELOCATABLE -+ default y - help - Randomizes the virtual address at which the kernel image is - loaded, as a security feature that deters exploit attempts -diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug -index a1efa246c9ed..ccacb3619b59 100644 ---- a/arch/arm64/Kconfig.debug -+++ b/arch/arm64/Kconfig.debug -@@ -26,6 +26,7 @@ config ARM64_RANDOMIZE_TEXT_OFFSET - config DEBUG_WX - bool "Warn on W+X mappings at boot" - select PTDUMP_CORE -+ default y - ---help--- - Generate a warning if any W+X mappings are found at boot. - -diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig -index 03d0189f7d68..d7c642f8f063 100644 ---- a/arch/arm64/configs/defconfig -+++ b/arch/arm64/configs/defconfig -@@ -1,4 +1,3 @@ --CONFIG_SYSVIPC=y - CONFIG_POSIX_MQUEUE=y - CONFIG_AUDIT=y - CONFIG_NO_HZ_IDLE=y -diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h -index b618017205a3..0a228dbcad65 100644 ---- a/arch/arm64/include/asm/elf.h -+++ b/arch/arm64/include/asm/elf.h -@@ -103,14 +103,10 @@ - - /* - * This is the base location for PIE (ET_DYN with INTERP) loads. On -- * 64-bit, this is above 4GB to leave the entire 32-bit address -+ * 64-bit, this is raised to 4GB to leave the entire 32-bit address - * space open for things that want to use the area for 32-bit pointers. 
- */ --#ifdef CONFIG_ARM64_FORCE_52BIT --#define ELF_ET_DYN_BASE (2 * TASK_SIZE_64 / 3) --#else --#define ELF_ET_DYN_BASE (2 * DEFAULT_MAP_WINDOW_64 / 3) --#endif /* CONFIG_ARM64_FORCE_52BIT */ -+#define ELF_ET_DYN_BASE 0x100000000UL - - #ifndef __ASSEMBLY__ - -@@ -164,10 +160,10 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm, - /* 1GB of VA */ - #ifdef CONFIG_COMPAT - #define STACK_RND_MASK (test_thread_flag(TIF_32BIT) ? \ -- 0x7ff >> (PAGE_SHIFT - 12) : \ -- 0x3ffff >> (PAGE_SHIFT - 12)) -+ ((1UL << mmap_rnd_compat_bits) - 1) >> (PAGE_SHIFT - 12) : \ -+ ((1UL << mmap_rnd_bits) - 1) >> (PAGE_SHIFT - 12)) - #else --#define STACK_RND_MASK (0x3ffff >> (PAGE_SHIFT - 12)) -+#define STACK_RND_MASK (((1UL << mmap_rnd_bits) - 1) >> (PAGE_SHIFT - 12)) - #endif - - #ifdef __AARCH64EB__ -diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig -index 2d3f963fd6f1..7b5923dd44e1 100644 ---- a/arch/x86/Kconfig -+++ b/arch/x86/Kconfig -@@ -1191,8 +1191,7 @@ config VM86 - default X86_LEGACY_VM86 - - config X86_16BIT -- bool "Enable support for 16-bit segments" if EXPERT -- default y -+ bool "Enable support for 16-bit segments" - depends on MODIFY_LDT_SYSCALL - ---help--- - This option is required by programs like Wine to run 16-bit -@@ -2329,7 +2328,7 @@ config COMPAT_VDSO - choice - prompt "vsyscall table for legacy applications" - depends on X86_64 -- default LEGACY_VSYSCALL_XONLY -+ default LEGACY_VSYSCALL_NONE - help - Legacy user code that does not know how to find the vDSO expects - to be able to issue three syscalls by calling fixed addresses in -@@ -2425,8 +2424,7 @@ config CMDLINE_OVERRIDE - be set to 'N' under normal conditions. 
- - config MODIFY_LDT_SYSCALL -- bool "Enable the LDT (local descriptor table)" if EXPERT -- default y -+ bool "Enable the LDT (local descriptor table)" - ---help--- - Linux can allow user programs to install a per-process x86 - Local Descriptor Table (LDT) using the modify_ldt(2) system -diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug -index 2e74690b028a..87c7294dd172 100644 ---- a/arch/x86/Kconfig.debug -+++ b/arch/x86/Kconfig.debug -@@ -75,6 +75,7 @@ config EFI_PGT_DUMP - config DEBUG_WX - bool "Warn on W+X mappings at boot" - select PTDUMP_CORE -+ default y - ---help--- - Generate a warning if any W+X mappings are found at boot. - -diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig -index 614961009075..06c473ba6b1a 100644 ---- a/arch/x86/configs/x86_64_defconfig -+++ b/arch/x86/configs/x86_64_defconfig -@@ -1,5 +1,4 @@ - # CONFIG_LOCALVERSION_AUTO is not set --CONFIG_SYSVIPC=y - CONFIG_POSIX_MQUEUE=y - CONFIG_BSD_PROCESS_ACCT=y - CONFIG_TASKSTATS=y -diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c -index 43428cc514c8..1b01bf6a6fe7 100644 ---- a/arch/x86/entry/vdso/vma.c -+++ b/arch/x86/entry/vdso/vma.c -@@ -316,55 +316,9 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) - } - - #ifdef CONFIG_X86_64 --/* -- * Put the vdso above the (randomized) stack with another randomized -- * offset. This way there is no hole in the middle of address space. -- * To save memory make sure it is still in the same PTE as the stack -- * top. This doesn't give that many random bits. -- * -- * Note that this algorithm is imperfect: the distribution of the vdso -- * start address within a PMD is biased toward the end. -- * -- * Only used for the 64-bit and x32 vdsos. -- */ --static unsigned long vdso_addr(unsigned long start, unsigned len) --{ -- unsigned long addr, end; -- unsigned offset; -- -- /* -- * Round up the start address. 
It can start out unaligned as a result -- * of stack start randomization. -- */ -- start = PAGE_ALIGN(start); -- -- /* Round the lowest possible end address up to a PMD boundary. */ -- end = (start + len + PMD_SIZE - 1) & PMD_MASK; -- if (end >= TASK_SIZE_MAX) -- end = TASK_SIZE_MAX; -- end -= len; -- -- if (end > start) { -- offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1); -- addr = start + (offset << PAGE_SHIFT); -- } else { -- addr = start; -- } -- -- /* -- * Forcibly align the final address in case we have a hardware -- * issue that requires alignment for performance reasons. -- */ -- addr = align_vdso_addr(addr); -- -- return addr; --} -- - static int map_vdso_randomized(const struct vdso_image *image) - { -- unsigned long addr = vdso_addr(current->mm->start_stack, image->size-image->sym_vvar_start); -- -- return map_vdso(image, addr); -+ return map_vdso(image, 0); - } - #endif - -diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h -index 69c0f892e310..f9f7a85bb71e 100644 ---- a/arch/x86/include/asm/elf.h -+++ b/arch/x86/include/asm/elf.h -@@ -248,11 +248,11 @@ extern int force_personality32; - - /* - * This is the base location for PIE (ET_DYN with INTERP) loads. On -- * 64-bit, this is above 4GB to leave the entire 32-bit address -+ * 64-bit, this is raised to 4GB to leave the entire 32-bit address - * space open for things that want to use the area for 32-bit pointers. - */ - #define ELF_ET_DYN_BASE (mmap_is_ia32() ? 0x000400000UL : \ -- (DEFAULT_MAP_WINDOW / 3 * 2)) -+ 0x100000000UL) - - /* This yields a mask that user programs can use to figure out what - instruction set this CPU supports. 
This could be done in user space, -@@ -312,8 +312,8 @@ extern bool mmap_address_hint_valid(unsigned long addr, unsigned long len); - - #ifdef CONFIG_X86_32 - --#define __STACK_RND_MASK(is32bit) (0x7ff) --#define STACK_RND_MASK (0x7ff) -+#define __STACK_RND_MASK(is32bit) ((1UL << mmap_rnd_bits) - 1) -+#define STACK_RND_MASK ((1UL << mmap_rnd_bits) - 1) - - #define ARCH_DLINFO ARCH_DLINFO_IA32 - -@@ -322,7 +322,11 @@ extern bool mmap_address_hint_valid(unsigned long addr, unsigned long len); - #else /* CONFIG_X86_32 */ - - /* 1GB for 64bit, 8MB for 32bit */ --#define __STACK_RND_MASK(is32bit) ((is32bit) ? 0x7ff : 0x3fffff) -+#ifdef CONFIG_COMPAT -+#define __STACK_RND_MASK(is32bit) ((is32bit) ? (1UL << mmap_rnd_compat_bits) - 1 : (1UL << mmap_rnd_bits) - 1) -+#else -+#define __STACK_RND_MASK(is32bit) ((1UL << mmap_rnd_bits) - 1) -+#endif - #define STACK_RND_MASK __STACK_RND_MASK(mmap_is_ia32()) - - #define ARCH_DLINFO \ -@@ -380,5 +384,4 @@ struct va_alignment { - } ____cacheline_aligned; - - extern struct va_alignment va_align; --extern unsigned long align_vdso_addr(unsigned long); - #endif /* _ASM_X86_ELF_H */ -diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h -index 6f66d841262d..b786e7cb395d 100644 ---- a/arch/x86/include/asm/tlbflush.h -+++ b/arch/x86/include/asm/tlbflush.h -@@ -295,6 +295,7 @@ static inline void cr4_set_bits_irqsoff(unsigned long mask) - unsigned long cr4; - - cr4 = this_cpu_read(cpu_tlbstate.cr4); -+ BUG_ON(cr4 != __read_cr4()); - if ((cr4 | mask) != cr4) - __cr4_set(cr4 | mask); - } -@@ -305,6 +306,7 @@ static inline void cr4_clear_bits_irqsoff(unsigned long mask) - unsigned long cr4; - - cr4 = this_cpu_read(cpu_tlbstate.cr4); -+ BUG_ON(cr4 != __read_cr4()); - if ((cr4 & ~mask) != cr4) - __cr4_set(cr4 & ~mask); - } -@@ -334,6 +336,7 @@ static inline void cr4_toggle_bits_irqsoff(unsigned long mask) - unsigned long cr4; - - cr4 = this_cpu_read(cpu_tlbstate.cr4); -+ BUG_ON(cr4 != __read_cr4()); - __cr4_set(cr4 ^ 
mask); - } - -@@ -440,6 +443,7 @@ static inline void __native_flush_tlb_global(void) - raw_local_irq_save(flags); - - cr4 = this_cpu_read(cpu_tlbstate.cr4); -+ BUG_ON(cr4 != __read_cr4()); - /* toggle PGE */ - native_write_cr4(cr4 ^ X86_CR4_PGE); - /* write old PGE again and flush TLBs */ -diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c -index 8f4533c1a4ec..632ef7ef4615 100644 ---- a/arch/x86/kernel/process.c -+++ b/arch/x86/kernel/process.c -@@ -42,6 +42,8 @@ - #include - #include - #include -+#include -+#include - - #include "process.h" - -@@ -907,7 +909,10 @@ unsigned long arch_align_stack(unsigned long sp) - - unsigned long arch_randomize_brk(struct mm_struct *mm) - { -- return randomize_page(mm->brk, 0x02000000); -+ if (mmap_is_ia32()) -+ return mm->brk + get_random_long() % SZ_32M + PAGE_SIZE; -+ else -+ return mm->brk + get_random_long() % SZ_1G + PAGE_SIZE; - } - - /* -diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c -index 504fa5425bce..e30ec4c750d1 100644 ---- a/arch/x86/kernel/sys_x86_64.c -+++ b/arch/x86/kernel/sys_x86_64.c -@@ -52,13 +52,6 @@ static unsigned long get_align_bits(void) - return va_align.bits & get_align_mask(); - } - --unsigned long align_vdso_addr(unsigned long addr) --{ -- unsigned long align_mask = get_align_mask(); -- addr = (addr + align_mask) & ~align_mask; -- return addr | get_align_bits(); --} -- - static int __init control_va_addr_alignment(char *str) - { - /* guard against enabling this on other CPU families */ -@@ -120,10 +113,7 @@ static void find_start_end(unsigned long addr, unsigned long flags, - } - - *begin = get_mmap_base(1); -- if (in_32bit_syscall()) -- *end = task_size_32bit(); -- else -- *end = task_size_64bit(addr > DEFAULT_MAP_WINDOW); -+ *end = get_mmap_base(0); - } - - unsigned long -@@ -200,7 +190,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, - - info.flags = VM_UNMAPPED_AREA_TOPDOWN; - info.length = len; -- info.low_limit = 
PAGE_SIZE; -+ info.low_limit = get_mmap_base(1); - info.high_limit = get_mmap_base(0); - - /* -diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c -index 4222a010057a..2c0c6b47b75b 100644 ---- a/arch/x86/mm/init_32.c -+++ b/arch/x86/mm/init_32.c -@@ -566,9 +566,9 @@ static void __init pagetable_init(void) - - #define DEFAULT_PTE_MASK ~(_PAGE_NX | _PAGE_GLOBAL) - /* Bits supported by the hardware: */ --pteval_t __supported_pte_mask __read_mostly = DEFAULT_PTE_MASK; -+pteval_t __supported_pte_mask __ro_after_init = DEFAULT_PTE_MASK; - /* Bits allowed in normal kernel mappings: */ --pteval_t __default_kernel_pte_mask __read_mostly = DEFAULT_PTE_MASK; -+pteval_t __default_kernel_pte_mask __ro_after_init = DEFAULT_PTE_MASK; - EXPORT_SYMBOL_GPL(__supported_pte_mask); - /* Used in PAGE_KERNEL_* macros which are reasonably used out-of-tree: */ - EXPORT_SYMBOL(__default_kernel_pte_mask); -diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c -index 8b5f73f5e207..83f76a72f684 100644 ---- a/arch/x86/mm/init_64.c -+++ b/arch/x86/mm/init_64.c -@@ -98,9 +98,9 @@ DEFINE_ENTRY(pte, pte, init) - */ - - /* Bits supported by the hardware: */ --pteval_t __supported_pte_mask __read_mostly = ~0; -+pteval_t __supported_pte_mask __ro_after_init = ~0; - /* Bits allowed in normal kernel mappings: */ --pteval_t __default_kernel_pte_mask __read_mostly = ~0; -+pteval_t __default_kernel_pte_mask __ro_after_init = ~0; - EXPORT_SYMBOL_GPL(__supported_pte_mask); - /* Used in PAGE_KERNEL_* macros which are reasonably used out-of-tree: */ - EXPORT_SYMBOL(__default_kernel_pte_mask); -diff --git a/block/blk-softirq.c b/block/blk-softirq.c -index 6e7ec87d49fa..d6ee3f8b3e74 100644 ---- a/block/blk-softirq.c -+++ b/block/blk-softirq.c -@@ -20,7 +20,7 @@ static DEFINE_PER_CPU(struct list_head, blk_cpu_done); - * Softirq action handler - move entries to local list and loop over them - * while passing them to the queue registered handler. 
- */ --static __latent_entropy void blk_done_softirq(struct softirq_action *h) -+static __latent_entropy void blk_done_softirq(void) - { - struct list_head *cpu_list, local_list; - -diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c -index e74c8fe2a5fd..ec43f04b1687 100644 ---- a/drivers/ata/libata-core.c -+++ b/drivers/ata/libata-core.c -@@ -4541,7 +4541,7 @@ void ata_qc_free(struct ata_queued_cmd *qc) - struct ata_port *ap; - unsigned int tag; - -- WARN_ON_ONCE(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ -+ BUG_ON(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ - ap = qc->ap; - - qc->flags = 0; -@@ -4558,7 +4558,7 @@ void __ata_qc_complete(struct ata_queued_cmd *qc) - struct ata_port *ap; - struct ata_link *link; - -- WARN_ON_ONCE(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ -+ BUG_ON(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ - WARN_ON_ONCE(!(qc->flags & ATA_QCFLAG_ACTIVE)); - ap = qc->ap; - link = qc->dev->link; -diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig -index d4665fe9ccd2..315576465ca0 100644 ---- a/drivers/char/Kconfig -+++ b/drivers/char/Kconfig -@@ -326,7 +326,6 @@ config NSC_GPIO - - config DEVMEM - bool "/dev/mem virtual device support" -- default y - help - Say Y here if you want to support the /dev/mem device. - The /dev/mem device is used to access areas of physical -@@ -390,7 +389,6 @@ config MAX_RAW_DEVS - config DEVPORT - bool "/dev/port character device" - depends on ISA || PCI -- default y - help - Say Y here if you want to support the /dev/port device. The /dev/port - device is similar to /dev/mem, but for I/O ports. 
-diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig -index 2dff93d7a501..f1da13f791cd 100644 ---- a/drivers/tty/Kconfig -+++ b/drivers/tty/Kconfig -@@ -122,7 +122,6 @@ config UNIX98_PTYS - - config LEGACY_PTYS - bool "Legacy (BSD) PTY support" -- default y - ---help--- - A pseudo terminal (PTY) is a software device consisting of two - halves: a master and a slave. The slave device behaves identical to -diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c -index 5a6f36b391d9..616d82a19160 100644 ---- a/drivers/tty/tty_io.c -+++ b/drivers/tty/tty_io.c -@@ -174,6 +174,7 @@ static void free_tty_struct(struct tty_struct *tty) - put_device(tty->dev); - kfree(tty->write_buf); - tty->magic = 0xDEADDEAD; -+ put_user_ns(tty->owner_user_ns); - kfree(tty); - } - -@@ -2179,11 +2180,19 @@ static int tty_fasync(int fd, struct file *filp, int on) - * FIXME: may race normal receive processing - */ - -+int tiocsti_restrict = IS_ENABLED(CONFIG_SECURITY_TIOCSTI_RESTRICT); -+ - static int tiocsti(struct tty_struct *tty, char __user *p) - { - char ch, mbz = 0; - struct tty_ldisc *ld; - -+ if (tiocsti_restrict && -+ !ns_capable(tty->owner_user_ns, CAP_SYS_ADMIN)) { -+ dev_warn_ratelimited(tty->dev, -+ "Denied TIOCSTI ioctl for non-privileged process\n"); -+ return -EPERM; -+ } - if ((current->signal->tty != tty) && !capable(CAP_SYS_ADMIN)) - return -EPERM; - if (get_user(ch, p)) -@@ -3009,6 +3018,7 @@ struct tty_struct *alloc_tty_struct(struct tty_driver *driver, int idx) - tty->index = idx; - tty_line_name(driver, idx, tty->name); - tty->dev = tty_get_device(tty); -+ tty->owner_user_ns = get_user_ns(current_user_ns()); - - return tty; - } -diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c -index fc748c731832..f745c9ee5885 100644 ---- a/drivers/usb/core/hub.c -+++ b/drivers/usb/core/hub.c -@@ -46,6 +46,8 @@ - #define USB_TP_TRANSMISSION_DELAY 40 /* ns */ - #define USB_TP_TRANSMISSION_DELAY_MAX 65535 /* ns */ - -+extern int deny_new_usb; -+ - /* Protect struct 
usb_device->state and ->children members - * Note: Both are also protected by ->dev.sem, except that ->state can - * change to USB_STATE_NOTATTACHED even when the semaphore isn't held. */ -@@ -5100,6 +5102,12 @@ static void hub_port_connect(struct usb_hub *hub, int port1, u16 portstatus, - goto done; - return; - } -+ -+ if (deny_new_usb) { -+ dev_err(&port_dev->dev, "denied insert of USB device on port %d\n", port1); -+ goto done; -+ } -+ - if (hub_is_superspeed(hub->hdev)) - unit_load = 150; - else -diff --git a/fs/exec.c b/fs/exec.c -index 2c465119affc..bf220ff8c019 100644 ---- a/fs/exec.c -+++ b/fs/exec.c -@@ -62,6 +62,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -274,6 +275,8 @@ static int __bprm_mm_init(struct linux_binprm *bprm) - mm->stack_vm = mm->total_vm = 1; - up_write(&mm->mmap_sem); - bprm->p = vma->vm_end - sizeof(void *); -+ if (randomize_va_space) -+ bprm->p ^= get_random_int() & ~PAGE_MASK; - return 0; - err: - up_write(&mm->mmap_sem); -diff --git a/fs/namei.c b/fs/namei.c -index a320371899cf..6cc595eed647 100644 ---- a/fs/namei.c -+++ b/fs/namei.c -@@ -918,10 +918,10 @@ static inline void put_link(struct nameidata *nd) - path_put(&last->link); - } - --int sysctl_protected_symlinks __read_mostly = 0; --int sysctl_protected_hardlinks __read_mostly = 0; --int sysctl_protected_fifos __read_mostly; --int sysctl_protected_regular __read_mostly; -+int sysctl_protected_symlinks __read_mostly = 1; -+int sysctl_protected_hardlinks __read_mostly = 1; -+int sysctl_protected_fifos __read_mostly = 2; -+int sysctl_protected_regular __read_mostly = 2; - - /** - * may_follow_link - Check symlink following for unsafe situations -diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig -index 88e1763e02f3..71820a515c91 100644 ---- a/fs/nfs/Kconfig -+++ b/fs/nfs/Kconfig -@@ -195,7 +195,6 @@ config NFS_DEBUG - bool - depends on NFS_FS && SUNRPC_DEBUG - select CRC32 -- default y - - config NFS_DISABLE_UDP_SUPPORT - bool "NFS: Disable NFS UDP 
protocol support" -diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig -index 27ef84d99f59..fb27f99a5e66 100644 ---- a/fs/proc/Kconfig -+++ b/fs/proc/Kconfig -@@ -41,7 +41,6 @@ config PROC_KCORE - config PROC_VMCORE - bool "/proc/vmcore support" - depends on PROC_FS && CRASH_DUMP -- default y - help - Exports the dump image of crashed kernel in ELF format. - -diff --git a/fs/stat.c b/fs/stat.c -index 030008796479..b1c2c0d5b874 100644 ---- a/fs/stat.c -+++ b/fs/stat.c -@@ -42,8 +42,13 @@ void generic_fillattr(struct inode *inode, struct kstat *stat) - stat->gid = inode->i_gid; - stat->rdev = inode->i_rdev; - stat->size = i_size_read(inode); -- stat->atime = inode->i_atime; -- stat->mtime = inode->i_mtime; -+ if (is_sidechannel_device(inode) && !capable_noaudit(CAP_MKNOD)) { -+ stat->atime = inode->i_ctime; -+ stat->mtime = inode->i_ctime; -+ } else { -+ stat->atime = inode->i_atime; -+ stat->mtime = inode->i_mtime; -+ } - stat->ctime = inode->i_ctime; - stat->blksize = i_blocksize(inode); - stat->blocks = inode->i_blocks; -@@ -79,9 +84,14 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, - if (IS_AUTOMOUNT(inode)) - stat->attributes |= STATX_ATTR_AUTOMOUNT; - -- if (inode->i_op->getattr) -- return inode->i_op->getattr(path, stat, request_mask, -- query_flags); -+ if (inode->i_op->getattr) { -+ int retval = inode->i_op->getattr(path, stat, request_mask, query_flags); -+ if (!retval && is_sidechannel_device(inode) && !capable_noaudit(CAP_MKNOD)) { -+ stat->atime = stat->ctime; -+ stat->mtime = stat->ctime; -+ } -+ return retval; -+ } - - generic_fillattr(inode, stat); - return 0; -diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c -index e39fdec8a0b0..08610405fdae 100644 ---- a/fs/userfaultfd.c -+++ b/fs/userfaultfd.c -@@ -28,7 +28,11 @@ - #include - #include - -+#ifdef CONFIG_USERFAULTFD_UNPRIVILEGED - int sysctl_unprivileged_userfaultfd __read_mostly = 1; -+#else -+int sysctl_unprivileged_userfaultfd __read_mostly; -+#endif - - static struct kmem_cache 
*userfaultfd_ctx_cachep __read_mostly; - -diff --git a/include/linux/cache.h b/include/linux/cache.h -index 750621e41d1c..e7157c18c62c 100644 ---- a/include/linux/cache.h -+++ b/include/linux/cache.h -@@ -31,6 +31,8 @@ - #define __ro_after_init __attribute__((__section__(".data..ro_after_init"))) - #endif - -+#define __read_only __ro_after_init -+ - #ifndef ____cacheline_aligned - #define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES))) - #endif -diff --git a/include/linux/capability.h b/include/linux/capability.h -index ecce0f43c73a..e46306dd4401 100644 ---- a/include/linux/capability.h -+++ b/include/linux/capability.h -@@ -208,6 +208,7 @@ extern bool has_capability_noaudit(struct task_struct *t, int cap); - extern bool has_ns_capability_noaudit(struct task_struct *t, - struct user_namespace *ns, int cap); - extern bool capable(int cap); -+extern bool capable_noaudit(int cap); - extern bool ns_capable(struct user_namespace *ns, int cap); - extern bool ns_capable_noaudit(struct user_namespace *ns, int cap); - extern bool ns_capable_setid(struct user_namespace *ns, int cap); -@@ -234,6 +235,10 @@ static inline bool capable(int cap) - { - return true; - } -+static inline bool capable_noaudit(int cap) -+{ -+ return true; -+} - static inline bool ns_capable(struct user_namespace *ns, int cap) - { - return true; -diff --git a/include/linux/fs.h b/include/linux/fs.h -index 45cc10cdf6dd..162d589f120a 100644 ---- a/include/linux/fs.h -+++ b/include/linux/fs.h -@@ -3659,4 +3659,15 @@ static inline int inode_drain_writes(struct inode *inode) - return filemap_write_and_wait(inode->i_mapping); - } - -+extern int device_sidechannel_restrict; -+ -+static inline bool is_sidechannel_device(const struct inode *inode) -+{ -+ umode_t mode; -+ if (!device_sidechannel_restrict) -+ return false; -+ mode = inode->i_mode; -+ return ((S_ISCHR(mode) || S_ISBLK(mode)) && (mode & (S_IROTH | S_IWOTH))); -+} -+ - #endif /* _LINUX_FS_H */ -diff --git 
a/include/linux/fsnotify.h b/include/linux/fsnotify.h -index 5ab28f6c7d26..6333478e581c 100644 ---- a/include/linux/fsnotify.h -+++ b/include/linux/fsnotify.h -@@ -65,6 +65,9 @@ static inline int fsnotify_file(struct file *file, __u32 mask) - struct inode *inode = file_inode(file); - int ret; - -+ if (mask & (FS_ACCESS | FS_MODIFY) && is_sidechannel_device(inode)) -+ return 0; -+ - if (file->f_mode & FMODE_NONOTIFY) - return 0; - -diff --git a/include/linux/gfp.h b/include/linux/gfp.h -index 4aba4c86c626..7d2bd45f35ed 100644 ---- a/include/linux/gfp.h -+++ b/include/linux/gfp.h -@@ -561,9 +561,9 @@ extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, - extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); - extern unsigned long get_zeroed_page(gfp_t gfp_mask); - --void *alloc_pages_exact(size_t size, gfp_t gfp_mask); -+void *alloc_pages_exact(size_t size, gfp_t gfp_mask) __attribute__((alloc_size(1))); - void free_pages_exact(void *virt, size_t size); --void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask); -+void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) __attribute__((alloc_size(2))); - - #define __get_free_page(gfp_mask) \ - __get_free_pages((gfp_mask), 0) -diff --git a/include/linux/highmem.h b/include/linux/highmem.h -index ea5cdbd8c2c3..805b84d6bbca 100644 ---- a/include/linux/highmem.h -+++ b/include/linux/highmem.h -@@ -215,6 +215,13 @@ static inline void clear_highpage(struct page *page) - kunmap_atomic(kaddr); - } - -+static inline void verify_zero_highpage(struct page *page) -+{ -+ void *kaddr = kmap_atomic(page); -+ BUG_ON(memchr_inv(kaddr, 0, PAGE_SIZE)); -+ kunmap_atomic(kaddr); -+} -+ - static inline void zero_user_segments(struct page *page, - unsigned start1, unsigned end1, - unsigned start2, unsigned end2) -diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h -index 80f637c3a6f3..0188c5fa11cb 100644 ---- a/include/linux/interrupt.h -+++ 
b/include/linux/interrupt.h -@@ -554,7 +554,7 @@ extern const char * const softirq_to_name[NR_SOFTIRQS]; - - struct softirq_action - { -- void (*action)(struct softirq_action *); -+ void (*action)(void); - }; - - asmlinkage void do_softirq(void); -@@ -569,7 +569,7 @@ static inline void do_softirq_own_stack(void) - } - #endif - --extern void open_softirq(int nr, void (*action)(struct softirq_action *)); -+extern void __init open_softirq(int nr, void (*action)(void)); - extern void softirq_init(void); - extern void __raise_softirq_irqoff(unsigned int nr); - -diff --git a/include/linux/kobject_ns.h b/include/linux/kobject_ns.h -index 069aa2ebef90..cb9e3637a620 100644 ---- a/include/linux/kobject_ns.h -+++ b/include/linux/kobject_ns.h -@@ -45,7 +45,7 @@ struct kobj_ns_type_operations { - void (*drop_ns)(void *); - }; - --int kobj_ns_type_register(const struct kobj_ns_type_operations *ops); -+int __init kobj_ns_type_register(const struct kobj_ns_type_operations *ops); - int kobj_ns_type_registered(enum kobj_ns_type type); - const struct kobj_ns_type_operations *kobj_child_ns_ops(struct kobject *parent); - const struct kobj_ns_type_operations *kobj_ns_ops(struct kobject *kobj); -diff --git a/include/linux/mm.h b/include/linux/mm.h -index 465e8ad671f8..57f78e2fcdac 100644 ---- a/include/linux/mm.h -+++ b/include/linux/mm.h -@@ -751,7 +751,7 @@ static inline int is_vmalloc_or_module_addr(const void *x) - } - #endif - --extern void *kvmalloc_node(size_t size, gfp_t flags, int node); -+extern void *kvmalloc_node(size_t size, gfp_t flags, int node) __attribute__((alloc_size(1))); - static inline void *kvmalloc(size_t size, gfp_t flags) - { - return kvmalloc_node(size, flags, NUMA_NO_NODE); -diff --git a/include/linux/percpu.h b/include/linux/percpu.h -index 5e76af742c80..9a6c682ec127 100644 ---- a/include/linux/percpu.h -+++ b/include/linux/percpu.h -@@ -123,7 +123,7 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size, - pcpu_fc_populate_pte_fn_t populate_pte_fn); 
- #endif - --extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align); -+extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align) __attribute__((alloc_size(1))); - extern bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr); - extern bool is_kernel_percpu_address(unsigned long addr); - -@@ -131,8 +131,8 @@ extern bool is_kernel_percpu_address(unsigned long addr); - extern void __init setup_per_cpu_areas(void); - #endif - --extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp); --extern void __percpu *__alloc_percpu(size_t size, size_t align); -+extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) __attribute__((alloc_size(1))); -+extern void __percpu *__alloc_percpu(size_t size, size_t align) __attribute__((alloc_size(1))); - extern void free_percpu(void __percpu *__pdata); - extern phys_addr_t per_cpu_ptr_to_phys(void *addr); - -diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h -index 9c3e7619c929..2976a90b927c 100644 ---- a/include/linux/perf_event.h -+++ b/include/linux/perf_event.h -@@ -1303,6 +1303,14 @@ static inline int perf_is_paranoid(void) - return sysctl_perf_event_paranoid > -1; - } - -+static inline int perf_allow_open(struct perf_event_attr *attr) -+{ -+ if (sysctl_perf_event_paranoid > 2 && !capable(CAP_SYS_ADMIN)) -+ return -EACCES; -+ -+ return security_perf_event_open(attr, PERF_SECURITY_OPEN); -+} -+ - static inline int perf_allow_kernel(struct perf_event_attr *attr) - { - if (sysctl_perf_event_paranoid > 1 && !capable(CAP_SYS_ADMIN)) -diff --git a/include/linux/slab.h b/include/linux/slab.h -index 6d454886bcaf..60e0df2ccc59 100644 ---- a/include/linux/slab.h -+++ b/include/linux/slab.h -@@ -184,7 +184,7 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *, struct mem_cgroup *); - /* - * Common kmalloc functions provided by all allocators - */ --void * __must_check krealloc(const void *, size_t, gfp_t); -+void * 
__must_check krealloc(const void *, size_t, gfp_t) __attribute((alloc_size(2))); - void kfree(const void *); - void kzfree(const void *); - size_t __ksize(const void *); -@@ -389,7 +389,7 @@ static __always_inline unsigned int kmalloc_index(size_t size) - } - #endif /* !CONFIG_SLOB */ - --void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc; -+void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc __attribute__((alloc_size(1))); - void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment __malloc; - void kmem_cache_free(struct kmem_cache *, void *); - -@@ -413,7 +413,7 @@ static __always_inline void kfree_bulk(size_t size, void **p) - } - - #ifdef CONFIG_NUMA --void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc; -+void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc __attribute__((alloc_size(1))); - void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment __malloc; - #else - static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node) -@@ -538,7 +538,7 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags) - * Try really hard to succeed the allocation but fail - * eventually. 
- */ --static __always_inline void *kmalloc(size_t size, gfp_t flags) -+static __always_inline __attribute__((alloc_size(1))) void *kmalloc(size_t size, gfp_t flags) - { - if (__builtin_constant_p(size)) { - #ifndef CONFIG_SLOB -@@ -560,7 +560,7 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags) - return __kmalloc(size, flags); - } - --static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) -+static __always_inline __attribute__((alloc_size(1))) void *kmalloc_node(size_t size, gfp_t flags, int node) - { - #ifndef CONFIG_SLOB - if (__builtin_constant_p(size) && -diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h -index d2153789bd9f..97da977d6060 100644 ---- a/include/linux/slub_def.h -+++ b/include/linux/slub_def.h -@@ -121,6 +121,11 @@ struct kmem_cache { - unsigned long random; - #endif - -+#ifdef CONFIG_SLAB_CANARY -+ unsigned long random_active; -+ unsigned long random_inactive; -+#endif -+ - #ifdef CONFIG_NUMA - /* - * Defragmentation by allocating from a remote node. 
-diff --git a/include/linux/string.h b/include/linux/string.h -index 9b7a0632e87a..5c2420dfe2e7 100644 ---- a/include/linux/string.h -+++ b/include/linux/string.h -@@ -271,6 +271,12 @@ void __read_overflow2(void) __compiletime_error("detected read beyond size of ob - void __read_overflow3(void) __compiletime_error("detected read beyond size of object passed as 3rd parameter"); - void __write_overflow(void) __compiletime_error("detected write beyond size of object passed as 1st parameter"); - -+#ifdef CONFIG_FORTIFY_SOURCE_STRICT_STRING -+#define __string_size(p) __builtin_object_size(p, 1) -+#else -+#define __string_size(p) __builtin_object_size(p, 0) -+#endif -+ - #if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE) - - #ifdef CONFIG_KASAN -@@ -299,7 +305,7 @@ extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) - - __FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size) - { -- size_t p_size = __builtin_object_size(p, 0); -+ size_t p_size = __string_size(p); - if (__builtin_constant_p(size) && p_size < size) - __write_overflow(); - if (p_size < size) -@@ -309,7 +315,7 @@ __FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size) - - __FORTIFY_INLINE char *strcat(char *p, const char *q) - { -- size_t p_size = __builtin_object_size(p, 0); -+ size_t p_size = __string_size(p); - if (p_size == (size_t)-1) - return __underlying_strcat(p, q); - if (strlcat(p, q, p_size) >= p_size) -@@ -320,7 +326,7 @@ __FORTIFY_INLINE char *strcat(char *p, const char *q) - __FORTIFY_INLINE __kernel_size_t strlen(const char *p) - { - __kernel_size_t ret; -- size_t p_size = __builtin_object_size(p, 0); -+ size_t p_size = __string_size(p); - - /* Work around gcc excess stack consumption issue */ - if (p_size == (size_t)-1 || -@@ -335,7 +341,7 @@ __FORTIFY_INLINE __kernel_size_t strlen(const char *p) - extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen); - 
__FORTIFY_INLINE __kernel_size_t strnlen(const char *p, __kernel_size_t maxlen) - { -- size_t p_size = __builtin_object_size(p, 0); -+ size_t p_size = __string_size(p); - __kernel_size_t ret = __real_strnlen(p, maxlen < p_size ? maxlen : p_size); - if (p_size <= ret && maxlen != ret) - fortify_panic(__func__); -@@ -347,8 +353,8 @@ extern size_t __real_strlcpy(char *, const char *, size_t) __RENAME(strlcpy); - __FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size) - { - size_t ret; -- size_t p_size = __builtin_object_size(p, 0); -- size_t q_size = __builtin_object_size(q, 0); -+ size_t p_size = __string_size(p); -+ size_t q_size = __string_size(q); - if (p_size == (size_t)-1 && q_size == (size_t)-1) - return __real_strlcpy(p, q, size); - ret = strlen(q); -@@ -368,8 +374,8 @@ __FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size) - __FORTIFY_INLINE char *strncat(char *p, const char *q, __kernel_size_t count) - { - size_t p_len, copy_len; -- size_t p_size = __builtin_object_size(p, 0); -- size_t q_size = __builtin_object_size(q, 0); -+ size_t p_size = __string_size(p); -+ size_t q_size = __string_size(q); - if (p_size == (size_t)-1 && q_size == (size_t)-1) - return __underlying_strncat(p, q, count); - p_len = strlen(p); -@@ -482,8 +488,8 @@ __FORTIFY_INLINE void *kmemdup(const void *p, size_t size, gfp_t gfp) - /* defined after fortified strlen and memcpy to reuse them */ - __FORTIFY_INLINE char *strcpy(char *p, const char *q) - { -- size_t p_size = __builtin_object_size(p, 0); -- size_t q_size = __builtin_object_size(q, 0); -+ size_t p_size = __string_size(p); -+ size_t q_size = __string_size(q); - if (p_size == (size_t)-1 && q_size == (size_t)-1) - return __underlying_strcpy(p, q); - memcpy(p, q, strlen(q) + 1); -diff --git a/include/linux/tty.h b/include/linux/tty.h -index a99e9b8e4e31..ee272abea5f9 100644 ---- a/include/linux/tty.h -+++ b/include/linux/tty.h -@@ -14,6 +14,7 @@ - #include - #include - #include -+#include - - - /* -@@ 
-338,6 +339,7 @@ struct tty_struct { - /* If the tty has a pending do_SAK, queue it here - akpm */ - struct work_struct SAK_work; - struct tty_port *port; -+ struct user_namespace *owner_user_ns; - } __randomize_layout; - - /* Each of a tty's open files has private_data pointing to tty_file_private */ -@@ -347,6 +349,8 @@ struct tty_file_private { - struct list_head list; - }; - -+extern int tiocsti_restrict; -+ - /* tty magic number */ - #define TTY_MAGIC 0x5401 - -diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h -index a95d3cc74d79..93c9cc5baa23 100644 ---- a/include/linux/vmalloc.h -+++ b/include/linux/vmalloc.h -@@ -102,20 +102,20 @@ static inline void vmalloc_init(void) - static inline unsigned long vmalloc_nr_pages(void) { return 0; } - #endif - --extern void *vmalloc(unsigned long size); --extern void *vzalloc(unsigned long size); --extern void *vmalloc_user(unsigned long size); --extern void *vmalloc_node(unsigned long size, int node); --extern void *vzalloc_node(unsigned long size, int node); --extern void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags); --extern void *vmalloc_exec(unsigned long size); --extern void *vmalloc_32(unsigned long size); --extern void *vmalloc_32_user(unsigned long size); --extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); -+extern void *vmalloc(unsigned long size) __attribute__((alloc_size(1))); -+extern void *vzalloc(unsigned long size) __attribute__((alloc_size(1))); -+extern void *vmalloc_user(unsigned long size) __attribute__((alloc_size(1))); -+extern void *vmalloc_node(unsigned long size, int node) __attribute__((alloc_size(1))); -+extern void *vzalloc_node(unsigned long size, int node) __attribute__((alloc_size(1))); -+extern void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags) __attribute__((alloc_size(1))); -+extern void *vmalloc_exec(unsigned long size) __attribute__((alloc_size(1))); -+extern void *vmalloc_32(unsigned long size) 
__attribute__((alloc_size(1))); -+extern void *vmalloc_32_user(unsigned long size) __attribute__((alloc_size(1))); -+extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) __attribute__((alloc_size(1))); - extern void *__vmalloc_node_range(unsigned long size, unsigned long align, - unsigned long start, unsigned long end, gfp_t gfp_mask, - pgprot_t prot, unsigned long vm_flags, int node, -- const void *caller); -+ const void *caller) __attribute__((alloc_size(1))); - #ifndef CONFIG_MMU - extern void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags); - static inline void *__vmalloc_node_flags_caller(unsigned long size, int node, -diff --git a/include/net/tcp.h b/include/net/tcp.h -index 6f8e60c6fbc7..fe971ed1978b 100644 ---- a/include/net/tcp.h -+++ b/include/net/tcp.h -@@ -244,6 +244,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); - /* sysctl variables for tcp */ - extern int sysctl_tcp_max_orphans; - extern long sysctl_tcp_mem[3]; -+extern int sysctl_tcp_simult_connect; - - #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ - #define TCP_RACK_STATIC_REO_WND 0x2 /* Use static RACK reo wnd */ -diff --git a/init/Kconfig b/init/Kconfig -index 74a5ac65644f..b0f67731c203 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -349,6 +349,7 @@ config USELIB - config AUDIT - bool "Auditing support" - depends on NET -+ default y - help - Enable auditing infrastructure that can be used with another - kernel subsystem, such as SELinux (which requires this for -@@ -1102,6 +1103,22 @@ config USER_NS - - If unsure, say N. - -+config USER_NS_UNPRIVILEGED -+ bool "Allow unprivileged users to create namespaces" -+ depends on USER_NS -+ default n -+ help -+ When disabled, unprivileged users will not be able to create -+ new namespaces. 
Allowing users to create their own namespaces -+ has been part of several recent local privilege escalation -+ exploits, so if you need user namespaces but are -+ paranoid^Wsecurity-conscious you want to disable this. -+ -+ This setting can be overridden at runtime via the -+ kernel.unprivileged_userns_clone sysctl. -+ -+ If unsure, say N. -+ - config PID_NS - bool "PID Namespaces" - default y -@@ -1515,8 +1532,7 @@ config SHMEM - which may be appropriate on small systems without swap. - - config AIO -- bool "Enable AIO support" if EXPERT -- default y -+ bool "Enable AIO support" - help - This option enables POSIX asynchronous I/O which may by used - by some high performance threaded applications. Disabling -@@ -1652,6 +1668,23 @@ config USERFAULTFD - Enable the userfaultfd() system call that allows to intercept and - handle page faults in userland. - -+config USERFAULTFD_UNPRIVILEGED -+ bool "Allow unprivileged users to use the userfaultfd syscall" -+ depends on USERFAULTFD -+ default n -+ help -+ When disabled, unprivileged users will not be able to use the userfaultfd -+ syscall. Userfaultfd provide attackers with a way to stall a kernel -+ thread in the middle of memory accesses from userspace by initiating an -+ access on an unmapped page. To avoid various heap grooming and heap -+ spraying techniques for exploiting use-after-free flaws this should be -+ disabled by default. -+ -+ This setting can be overridden at runtime via the -+ vm.unprivileged_userfaultfd sysctl. -+ -+ If unsure, say N. -+ - config ARCH_HAS_MEMBARRIER_CALLBACKS - bool - -@@ -1764,7 +1797,7 @@ config VM_EVENT_COUNTERS - - config SLUB_DEBUG - default y -- bool "Enable SLUB debugging support" if EXPERT -+ bool "Enable SLUB debugging support" - depends on SLUB && SYSFS - help - SLUB has extensive debug support features. 
Disabling these can -@@ -1788,7 +1821,6 @@ config SLUB_MEMCG_SYSFS_ON - - config COMPAT_BRK - bool "Disable heap randomization" -- default y - help - Randomizing heap placement makes heap exploits harder, but it - also breaks ancient binaries (including anything libc5 based). -@@ -1835,7 +1867,6 @@ endchoice - - config SLAB_MERGE_DEFAULT - bool "Allow slab caches to be merged" -- default y - help - For reduced kernel memory fragmentation, slab caches can be - merged when they share the same size and other characteristics. -@@ -1848,9 +1879,9 @@ config SLAB_MERGE_DEFAULT - command line. - - config SLAB_FREELIST_RANDOM -- default n - depends on SLAB || SLUB - bool "SLAB freelist randomization" -+ default y - help - Randomizes the freelist order used on creating new pages. This - security feature reduces the predictability of the kernel slab -@@ -1859,12 +1890,30 @@ config SLAB_FREELIST_RANDOM - config SLAB_FREELIST_HARDENED - bool "Harden slab freelist metadata" - depends on SLUB -+ default y - help - Many kernel heap attacks try to target slab cache metadata and - other infrastructure. This options makes minor performance - sacrifices to harden the kernel slab allocator against common - freelist exploit methods. - -+config SLAB_CANARY -+ depends on SLUB -+ depends on !SLAB_MERGE_DEFAULT -+ bool "SLAB canaries" -+ default y -+ help -+ Place canaries at the end of kernel slab allocations, sacrificing -+ some performance and memory usage for security. -+ -+ Canaries can detect some forms of heap corruption when allocations -+ are freed and as part of the HARDENED_USERCOPY feature. It provides -+ basic use-after-free detection for HARDENED_USERCOPY. -+ -+ Canaries absorb small overflows (rendering them harmless), mitigate -+ non-NUL terminated C string overflows on 64-bit via a guaranteed zero -+ byte and provide basic double-free detection. 
-+ - config SHUFFLE_PAGE_ALLOCATOR - bool "Page allocator randomization" - default SLAB_FREELIST_RANDOM && ACPI_NUMA -diff --git a/kernel/audit.c b/kernel/audit.c -index f711f424a28a..f15d1d41244c 100644 ---- a/kernel/audit.c -+++ b/kernel/audit.c -@@ -1642,6 +1642,9 @@ static int __init audit_enable(char *str) - - if (audit_default == AUDIT_OFF) - audit_initialized = AUDIT_DISABLED; -+ else if (!audit_ever_enabled) -+ audit_initialized = AUDIT_UNINITIALIZED; -+ - if (audit_set_enabled(audit_default)) - pr_err("audit: error setting audit state (%d)\n", - audit_default); -diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c -index 916f5132a984..296a07014999 100644 ---- a/kernel/bpf/core.c -+++ b/kernel/bpf/core.c -@@ -520,7 +520,7 @@ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp) - /* All BPF JIT sysctl knobs here. */ - int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); - int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); --int bpf_jit_harden __read_mostly; -+int bpf_jit_harden __read_mostly = 2; - long bpf_jit_limit __read_mostly; - - static void -diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c -index c8acc8f37583..ccf05cdfd932 100644 ---- a/kernel/bpf/syscall.c -+++ b/kernel/bpf/syscall.c -@@ -43,7 +43,7 @@ static DEFINE_SPINLOCK(prog_idr_lock); - static DEFINE_IDR(map_idr); - static DEFINE_SPINLOCK(map_idr_lock); - --int sysctl_unprivileged_bpf_disabled __read_mostly; -+int sysctl_unprivileged_bpf_disabled __read_mostly = 1; - - static const struct bpf_map_ops * const bpf_map_types[] = { - #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) -diff --git a/kernel/capability.c b/kernel/capability.c -index 1444f3954d75..8cc9dd7992f2 100644 ---- a/kernel/capability.c -+++ b/kernel/capability.c -@@ -449,6 +449,12 @@ bool capable(int cap) - return ns_capable(&init_user_ns, cap); - } - EXPORT_SYMBOL(capable); -+ -+bool capable_noaudit(int cap) -+{ -+ return ns_capable_noaudit(&init_user_ns, cap); -+} 
-+EXPORT_SYMBOL(capable_noaudit); - #endif /* CONFIG_MULTIUSER */ - - /** -diff --git a/kernel/events/core.c b/kernel/events/core.c -index 1dd91f960839..90a629557f9e 100644 ---- a/kernel/events/core.c -+++ b/kernel/events/core.c -@@ -406,8 +406,13 @@ static cpumask_var_t perf_online_mask; - * 0 - disallow raw tracepoint access for unpriv - * 1 - disallow cpu events for unpriv - * 2 - disallow kernel profiling for unpriv -+ * 3 - disallow all unpriv perf event use - */ -+#ifdef CONFIG_SECURITY_PERF_EVENTS_RESTRICT -+int sysctl_perf_event_paranoid __read_mostly = 3; -+#else - int sysctl_perf_event_paranoid __read_mostly = 2; -+#endif - - /* Minimum for 512 kiB + 1 user control page */ - int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ -@@ -11501,7 +11506,7 @@ SYSCALL_DEFINE5(perf_event_open, - return -EINVAL; - - /* Do we allow access to perf_event_open(2) ? */ -- err = security_perf_event_open(&attr, PERF_SECURITY_OPEN); -+ err = perf_allow_open(&attr); - if (err) - return err; - -diff --git a/kernel/fork.c b/kernel/fork.c -index 48ed22774efa..ec61454a18d5 100644 ---- a/kernel/fork.c -+++ b/kernel/fork.c -@@ -106,6 +106,11 @@ - - #define CREATE_TRACE_POINTS - #include -+#ifdef CONFIG_USER_NS -+extern int unprivileged_userns_clone; -+#else -+#define unprivileged_userns_clone 0 -+#endif - - /* - * Minimum number of threads to boot the kernel -@@ -1848,6 +1853,10 @@ static __latent_entropy struct task_struct *copy_process( - if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) - return ERR_PTR(-EINVAL); - -+ if ((clone_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) -+ if (!capable(CAP_SYS_ADMIN)) -+ return ERR_PTR(-EPERM); -+ - /* - * Thread groups must share signals as well, and detached threads - * can only be started up within the thread group. 
-@@ -2948,6 +2957,12 @@ int ksys_unshare(unsigned long unshare_flags) - if (unshare_flags & CLONE_NEWNS) - unshare_flags |= CLONE_FS; - -+ if ((unshare_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) { -+ err = -EPERM; -+ if (!capable(CAP_SYS_ADMIN)) -+ goto bad_unshare_out; -+ } -+ - err = check_unshare_flags(unshare_flags); - if (err) - goto bad_unshare_out; -diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c -index dd572ce7c747..95af139ac6ba 100644 ---- a/kernel/rcu/tiny.c -+++ b/kernel/rcu/tiny.c -@@ -100,7 +100,7 @@ static inline bool rcu_reclaim_tiny(struct rcu_head *head) - } - - /* Invoke the RCU callbacks whose grace period has elapsed. */ --static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) -+static __latent_entropy void rcu_process_callbacks(void) - { - struct rcu_head *next, *list; - unsigned long flags; -diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c -index d9a49cd6065a..9f63b28e3ebe 100644 ---- a/kernel/rcu/tree.c -+++ b/kernel/rcu/tree.c -@@ -2437,7 +2437,7 @@ static __latent_entropy void rcu_core(void) - trace_rcu_utilization(TPS("End RCU core")); - } - --static void rcu_core_si(struct softirq_action *h) -+static void rcu_core_si(void) - { - rcu_core(); - } -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 5725199b32dc..dfb99620cb41 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -10568,7 +10568,7 @@ int newidle_balance(struct rq *this_rq, struct rq_flags *rf) - * run_rebalance_domains is triggered when needed from the scheduler tick. - * Also triggered for nohz idle balancing (with nohz_balancing_kick set). - */ --static __latent_entropy void run_rebalance_domains(struct softirq_action *h) -+static __latent_entropy void run_rebalance_domains(void) - { - struct rq *this_rq = this_rq(); - enum cpu_idle_type idle = this_rq->idle_balance ? 
-diff --git a/kernel/softirq.c b/kernel/softirq.c -index a47c6dd57452..c12cb85a6504 100644 ---- a/kernel/softirq.c -+++ b/kernel/softirq.c -@@ -52,7 +52,7 @@ DEFINE_PER_CPU_ALIGNED(irq_cpustat_t, irq_stat); - EXPORT_PER_CPU_SYMBOL(irq_stat); - #endif - --static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; -+static struct softirq_action softirq_vec[NR_SOFTIRQS] __ro_after_init __aligned(PAGE_SIZE); - - DEFINE_PER_CPU(struct task_struct *, ksoftirqd); - -@@ -289,7 +289,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) - kstat_incr_softirqs_this_cpu(vec_nr); - - trace_softirq_entry(vec_nr); -- h->action(h); -+ h->action(); - trace_softirq_exit(vec_nr); - if (unlikely(prev_count != preempt_count())) { - pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n", -@@ -453,7 +453,7 @@ void __raise_softirq_irqoff(unsigned int nr) - or_softirq_pending(1UL << nr); - } - --void open_softirq(int nr, void (*action)(struct softirq_action *)) -+void __init open_softirq(int nr, void (*action)(void)) - { - softirq_vec[nr].action = action; - } -@@ -499,8 +499,7 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) - } - EXPORT_SYMBOL(__tasklet_hi_schedule); - --static void tasklet_action_common(struct softirq_action *a, -- struct tasklet_head *tl_head, -+static void tasklet_action_common(struct tasklet_head *tl_head, - unsigned int softirq_nr) - { - struct tasklet_struct *list; -@@ -537,14 +536,14 @@ static void tasklet_action_common(struct softirq_action *a, - } - } - --static __latent_entropy void tasklet_action(struct softirq_action *a) -+static __latent_entropy void tasklet_action(void) - { -- tasklet_action_common(a, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ); -+ tasklet_action_common(this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ); - } - --static __latent_entropy void tasklet_hi_action(struct softirq_action *a) -+static __latent_entropy void tasklet_hi_action(void) - { -- tasklet_action_common(a, 
this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ); -+ tasklet_action_common(this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ); - } - - void tasklet_init(struct tasklet_struct *t, -diff --git a/kernel/sysctl.c b/kernel/sysctl.c -index 8a176d8727a3..87bc1d26c376 100644 ---- a/kernel/sysctl.c -+++ b/kernel/sysctl.c -@@ -68,6 +68,7 @@ - #include - #include - #include -+#include - - #include "../lib/kstrtox.h" - -@@ -104,12 +105,19 @@ - #if defined(CONFIG_SYSCTL) - - /* External variables not in a header file. */ -+#if IS_ENABLED(CONFIG_USB) -+int deny_new_usb __read_mostly = 0; -+EXPORT_SYMBOL(deny_new_usb); -+#endif - extern int suid_dumpable; - #ifdef CONFIG_COREDUMP - extern int core_uses_pid; - extern char core_pattern[]; - extern unsigned int core_pipe_limit; - #endif -+#ifdef CONFIG_USER_NS -+extern int unprivileged_userns_clone; -+#endif - extern int pid_max; - extern int pid_max_min, pid_max_max; - extern int percpu_pagelist_fraction; -@@ -121,32 +129,32 @@ extern int sysctl_nr_trim_pages; - - /* Constants used for minimum and maximum */ - #ifdef CONFIG_LOCKUP_DETECTOR --static int sixty = 60; -+static int sixty __read_only = 60; - #endif - --static int __maybe_unused neg_one = -1; --static int __maybe_unused two = 2; --static int __maybe_unused four = 4; --static unsigned long zero_ul; --static unsigned long one_ul = 1; --static unsigned long long_max = LONG_MAX; --static int one_hundred = 100; --static int one_thousand = 1000; -+static int __maybe_unused neg_one __read_only = -1; -+static int __maybe_unused two __read_only = 2; -+static int __maybe_unused four __read_only = 4; -+static unsigned long zero_ul __read_only; -+static unsigned long one_ul __read_only = 1; -+static unsigned long long_max __read_only = LONG_MAX; -+static int one_hundred __read_only = 100; -+static int one_thousand __read_only = 1000; - #ifdef CONFIG_PRINTK --static int ten_thousand = 10000; -+static int ten_thousand __read_only = 10000; - #endif - #ifdef CONFIG_PERF_EVENTS --static int 
six_hundred_forty_kb = 640 * 1024; -+static int six_hundred_forty_kb __read_only = 640 * 1024; - #endif - - /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ --static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; -+static unsigned long dirty_bytes_min __read_only = 2 * PAGE_SIZE; - - /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ --static int maxolduid = 65535; --static int minolduid; -+static int maxolduid __read_only = 65535; -+static int minolduid __read_only; - --static int ngroups_max = NGROUPS_MAX; -+static int ngroups_max __read_only = NGROUPS_MAX; - static const int cap_last_cap = CAP_LAST_CAP; - - /* -@@ -154,9 +162,12 @@ static const int cap_last_cap = CAP_LAST_CAP; - * and hung_task_check_interval_secs - */ - #ifdef CONFIG_DETECT_HUNG_TASK --static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); -+static unsigned long hung_task_timeout_max __read_only = (LONG_MAX/HZ); - #endif - -+int device_sidechannel_restrict __read_mostly = 1; -+EXPORT_SYMBOL(device_sidechannel_restrict); -+ - #ifdef CONFIG_INOTIFY_USER - #include - #endif -@@ -289,19 +300,19 @@ static struct ctl_table sysctl_base_table[] = { - }; - - #ifdef CONFIG_SCHED_DEBUG --static int min_sched_granularity_ns = 100000; /* 100 usecs */ --static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ --static int min_wakeup_granularity_ns; /* 0 usecs */ --static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ -+static int min_sched_granularity_ns __read_only = 100000; /* 100 usecs */ -+static int max_sched_granularity_ns __read_only = NSEC_PER_SEC; /* 1 second */ -+static int min_wakeup_granularity_ns __read_only; /* 0 usecs */ -+static int max_wakeup_granularity_ns __read_only = NSEC_PER_SEC; /* 1 second */ - #ifdef CONFIG_SMP --static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; --static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; -+static int min_sched_tunable_scaling __read_only = 
SCHED_TUNABLESCALING_NONE; -+static int max_sched_tunable_scaling __read_only = SCHED_TUNABLESCALING_END-1; - #endif /* CONFIG_SMP */ - #endif /* CONFIG_SCHED_DEBUG */ - - #ifdef CONFIG_COMPACTION --static int min_extfrag_threshold; --static int max_extfrag_threshold = 1000; -+static int min_extfrag_threshold __read_only; -+static int max_extfrag_threshold __read_only = 1000; - #endif - - static struct ctl_table kern_table[] = { -@@ -534,6 +545,15 @@ static struct ctl_table kern_table[] = { - .proc_handler = proc_dointvec, - }, - #endif -+#ifdef CONFIG_USER_NS -+ { -+ .procname = "unprivileged_userns_clone", -+ .data = &unprivileged_userns_clone, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = proc_dointvec, -+ }, -+#endif - #ifdef CONFIG_PROC_SYSCTL - { - .procname = "tainted", -@@ -880,6 +900,37 @@ static struct ctl_table kern_table[] = { - .extra1 = SYSCTL_ZERO, - .extra2 = &two, - }, -+#endif -+#if defined CONFIG_TTY -+ { -+ .procname = "tiocsti_restrict", -+ .data = &tiocsti_restrict, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = proc_dointvec_minmax_sysadmin, -+ .extra1 = SYSCTL_ZERO, -+ .extra2 = SYSCTL_ONE, -+ }, -+#endif -+ { -+ .procname = "device_sidechannel_restrict", -+ .data = &device_sidechannel_restrict, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = proc_dointvec_minmax_sysadmin, -+ .extra1 = SYSCTL_ZERO, -+ .extra2 = SYSCTL_ONE, -+ }, -+#if IS_ENABLED(CONFIG_USB) -+ { -+ .procname = "deny_new_usb", -+ .data = &deny_new_usb, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = proc_dointvec_minmax_sysadmin, -+ .extra1 = SYSCTL_ZERO, -+ .extra2 = SYSCTL_ONE, -+ }, - #endif - { - .procname = "ngroups_max", -diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c -index d89da1c7e005..8e1003ef3ebb 100644 ---- a/kernel/time/hrtimer.c -+++ b/kernel/time/hrtimer.c -@@ -1588,7 +1588,7 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, - } - } - --static 
__latent_entropy void hrtimer_run_softirq(struct softirq_action *h) -+static __latent_entropy void hrtimer_run_softirq(void) - { - struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); - unsigned long flags; -diff --git a/kernel/time/timer.c b/kernel/time/timer.c -index a5221abb4594..636f4f9566fa 100644 ---- a/kernel/time/timer.c -+++ b/kernel/time/timer.c -@@ -1780,7 +1780,7 @@ static inline void __run_timers(struct timer_base *base) - /* - * This function runs timers and the timer-tq in bottom half context. - */ --static __latent_entropy void run_timer_softirq(struct softirq_action *h) -+static __latent_entropy void run_timer_softirq(void) - { - struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); - -diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c -index 8eadadc478f9..c36ecd19562c 100644 ---- a/kernel/user_namespace.c -+++ b/kernel/user_namespace.c -@@ -21,6 +21,13 @@ - #include - #include - -+/* sysctl */ -+#ifdef CONFIG_USER_NS_UNPRIVILEGED -+int unprivileged_userns_clone = 1; -+#else -+int unprivileged_userns_clone; -+#endif -+ - static struct kmem_cache *user_ns_cachep __read_mostly; - static DEFINE_MUTEX(userns_state_mutex); - -diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug -index 21d9c5f6e7ec..ab5ae07fa69a 100644 ---- a/lib/Kconfig.debug -+++ b/lib/Kconfig.debug -@@ -337,6 +337,9 @@ config SECTION_MISMATCH_WARN_ONLY - - If unsure, say Y. - -+config DEBUG_WRITABLE_FUNCTION_POINTERS_VERBOSE -+ bool "Enable verbose reporting of writable function pointers" -+ - # - # Select this config option from the architecture Kconfig, if it - # is preferred to always offer frame pointers as a config -@@ -798,6 +801,7 @@ menu "Debug Oops, Lockups and Hangs" - - config PANIC_ON_OOPS - bool "Panic on Oops" -+ default y - help - Say Y here to enable the kernel to panic when it oopses. 
This - has the same effect as setting oops=panic on the kernel command -@@ -807,7 +811,7 @@ config PANIC_ON_OOPS - anything erroneous after an oops which could result in data - corruption or other issues. - -- Say N if unsure. -+ Say Y if unsure. - - config PANIC_ON_OOPS_VALUE - int -@@ -1346,6 +1350,7 @@ menu "Debug kernel data structures" - config DEBUG_LIST - bool "Debug linked list manipulation" - depends on DEBUG_KERNEL || BUG_ON_DATA_CORRUPTION -+ default y - help - Enable this to turn on extended checks in the linked-list - walking routines. -@@ -1385,6 +1390,7 @@ config DEBUG_NOTIFIERS - config BUG_ON_DATA_CORRUPTION - bool "Trigger a BUG when data corruption is detected" - select DEBUG_LIST -+ default y - help - Select this option if the kernel should BUG when it encounters - data corruption in kernel memory structures when they get checked -@@ -1540,6 +1546,7 @@ config STRICT_DEVMEM - config IO_STRICT_DEVMEM - bool "Filter I/O access to /dev/mem" - depends on STRICT_DEVMEM -+ default y - help - If this option is disabled, you allow userspace (root) access to all - io-memory regardless of whether a driver is actively using that -diff --git a/lib/irq_poll.c b/lib/irq_poll.c -index 2f17b488d58e..b6e7996a0058 100644 ---- a/lib/irq_poll.c -+++ b/lib/irq_poll.c -@@ -75,7 +75,7 @@ void irq_poll_complete(struct irq_poll *iop) - } - EXPORT_SYMBOL(irq_poll_complete); - --static void __latent_entropy irq_poll_softirq(struct softirq_action *h) -+static void __latent_entropy irq_poll_softirq(void) - { - struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll); - int rearm = 0, budget = irq_poll_budget; -diff --git a/lib/kobject.c b/lib/kobject.c -index 83198cb37d8d..4a053b7aef42 100644 ---- a/lib/kobject.c -+++ b/lib/kobject.c -@@ -1009,9 +1009,9 @@ EXPORT_SYMBOL_GPL(kset_create_and_add); - - - static DEFINE_SPINLOCK(kobj_ns_type_lock); --static const struct kobj_ns_type_operations *kobj_ns_ops_tbl[KOBJ_NS_TYPES]; -+static const struct kobj_ns_type_operations 
*kobj_ns_ops_tbl[KOBJ_NS_TYPES] __ro_after_init; - --int kobj_ns_type_register(const struct kobj_ns_type_operations *ops) -+int __init kobj_ns_type_register(const struct kobj_ns_type_operations *ops) - { - enum kobj_ns_type type = ops->type; - int error; -diff --git a/lib/nlattr.c b/lib/nlattr.c -index cace9b307781..39ba1387045d 100644 ---- a/lib/nlattr.c -+++ b/lib/nlattr.c -@@ -571,6 +571,8 @@ int nla_memcpy(void *dest, const struct nlattr *src, int count) - { - int minlen = min_t(int, count, nla_len(src)); - -+ BUG_ON(minlen < 0); -+ - memcpy(dest, nla_data(src), minlen); - if (count > minlen) - memset(dest + minlen, 0, count - minlen); -diff --git a/lib/vsprintf.c b/lib/vsprintf.c -index 7c47ad52ce2f..d1e002579732 100644 ---- a/lib/vsprintf.c -+++ b/lib/vsprintf.c -@@ -817,7 +817,7 @@ static char *ptr_to_id(char *buf, char *end, const void *ptr, - return pointer_string(buf, end, (const void *)hashval, spec); - } - --int kptr_restrict __read_mostly; -+int kptr_restrict __read_mostly = 2; - - static noinline_for_stack - char *restricted_pointer(char *buf, char *end, const void *ptr, -diff --git a/mm/Kconfig b/mm/Kconfig -index c1acc34c1c35..06dd0aa41a1b 100644 ---- a/mm/Kconfig -+++ b/mm/Kconfig -@@ -320,7 +320,8 @@ config KSM - config DEFAULT_MMAP_MIN_ADDR - int "Low address space to protect from user allocation" - depends on MMU -- default 4096 -+ default 32768 if ARM || (ARM64 && COMPAT) -+ default 65536 - help - This is the portion of low virtual memory which should be protected - from userspace allocation. 
Keeping a user from writing to low pages -diff --git a/mm/mmap.c b/mm/mmap.c -index f609e9ec4a25..66297ff169d9 100644 ---- a/mm/mmap.c -+++ b/mm/mmap.c -@@ -231,6 +231,13 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) - - newbrk = PAGE_ALIGN(brk); - oldbrk = PAGE_ALIGN(mm->brk); -+ /* properly handle unaligned min_brk as an empty heap */ -+ if (min_brk & ~PAGE_MASK) { -+ if (brk == min_brk) -+ newbrk -= PAGE_SIZE; -+ if (mm->brk == min_brk) -+ oldbrk -= PAGE_SIZE; -+ } - if (oldbrk == newbrk) { - mm->brk = brk; - goto success; -diff --git a/mm/page_alloc.c b/mm/page_alloc.c -index d0c0d9364aa6..1f1a45afac2a 100644 ---- a/mm/page_alloc.c -+++ b/mm/page_alloc.c -@@ -68,6 +68,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -106,6 +107,15 @@ struct pcpu_drain { - static DEFINE_MUTEX(pcpu_drain_mutex); - static DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain); - -+bool __meminitdata extra_latent_entropy; -+ -+static int __init setup_extra_latent_entropy(char *str) -+{ -+ extra_latent_entropy = true; -+ return 0; -+} -+early_param("extra_latent_entropy", setup_extra_latent_entropy); -+ - #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY - volatile unsigned long latent_entropy __latent_entropy; - EXPORT_SYMBOL(latent_entropy); -@@ -1479,6 +1489,25 @@ static void __free_pages_ok(struct page *page, unsigned int order) - local_irq_restore(flags); - } - -+static void __init __gather_extra_latent_entropy(struct page *page, -+ unsigned int nr_pages) -+{ -+ if (extra_latent_entropy && !PageHighMem(page) && page_to_pfn(page) < 0x100000) { -+ unsigned long hash = 0; -+ size_t index, end = PAGE_SIZE * nr_pages / sizeof hash; -+ const unsigned long *data = lowmem_page_address(page); -+ -+ for (index = 0; index < end; index++) -+ hash ^= hash + data[index]; -+#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY -+ latent_entropy ^= hash; -+ add_device_randomness((const void *)&latent_entropy, sizeof(latent_entropy)); -+#else -+ add_device_randomness((const void *)&hash, 
sizeof(hash)); -+#endif -+ } -+} -+ - void __free_pages_core(struct page *page, unsigned int order) - { - unsigned int nr_pages = 1 << order; -@@ -1493,7 +1522,6 @@ void __free_pages_core(struct page *page, unsigned int order) - } - __ClearPageReserved(p); - set_page_count(p, 0); -- - atomic_long_add(nr_pages, &page_zone(page)->managed_pages); - set_page_refcounted(page); - __free_pages(page, order); -@@ -1544,6 +1572,7 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn, - { - if (early_page_uninitialised(pfn)) - return; -+ __gather_extra_latent_entropy(page, 1 << order); - __free_pages_core(page, order); - } - -@@ -1635,6 +1664,7 @@ static void __init deferred_free_range(unsigned long pfn, - if (nr_pages == pageblock_nr_pages && - (pfn & (pageblock_nr_pages - 1)) == 0) { - set_pageblock_migratetype(page, MIGRATE_MOVABLE); -+ __gather_extra_latent_entropy(page, 1 << pageblock_order); - __free_pages_core(page, pageblock_order); - return; - } -@@ -1642,6 +1672,7 @@ static void __init deferred_free_range(unsigned long pfn, - for (i = 0; i < nr_pages; i++, page++, pfn++) { - if ((pfn & (pageblock_nr_pages - 1)) == 0) - set_pageblock_migratetype(page, MIGRATE_MOVABLE); -+ __gather_extra_latent_entropy(page, 1); - __free_pages_core(page, 0); - } - } -@@ -2202,6 +2233,12 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags - { - post_alloc_hook(page, order, gfp_flags); - -+ if (IS_ENABLED(CONFIG_PAGE_SANITIZE_VERIFY) && want_init_on_free()) { -+ int i; -+ for (i = 0; i < (1 << order); i++) -+ verify_zero_highpage(page + i); -+ } -+ - if (!free_pages_prezeroed() && want_init_on_alloc(gfp_flags)) - kernel_init_free_pages(page, 1 << order); - -diff --git a/mm/slab.h b/mm/slab.h -index 74f7e09a7cfd..ce786e0af610 100644 ---- a/mm/slab.h -+++ b/mm/slab.h -@@ -472,9 +472,13 @@ static inline struct kmem_cache *virt_to_cache(const void *obj) - struct page *page; - - page = virt_to_head_page(obj); -+#ifdef 
CONFIG_BUG_ON_DATA_CORRUPTION -+ BUG_ON(!PageSlab(page)); -+#else - if (WARN_ONCE(!PageSlab(page), "%s: Object is not a Slab page!\n", - __func__)) - return NULL; -+#endif - return page->slab_cache; - } - -@@ -520,9 +524,14 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) - return s; - - cachep = virt_to_cache(x); -- WARN_ONCE(cachep && !slab_equal_or_root(cachep, s), -- "%s: Wrong slab cache. %s but object is from %s\n", -- __func__, s->name, cachep->name); -+ if (cachep && !slab_equal_or_root(cachep, s)) { -+#ifdef CONFIG_BUG_ON_DATA_CORRUPTION -+ BUG(); -+#else -+ WARN_ONCE(1, "%s: Wrong slab cache. %s but object is from %s\n", -+ __func__, s->name, cachep->name); -+#endif -+ } - return cachep; - } - -@@ -547,7 +556,7 @@ static inline size_t slab_ksize(const struct kmem_cache *s) - * back there or track user information then we can - * only use the space before that information. - */ -- if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) -+ if ((s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) || IS_ENABLED(CONFIG_SLAB_CANARY)) - return s->inuse; - /* - * Else we can use all the padding etc for the allocation -@@ -676,8 +685,10 @@ static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { } - static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c) - { - if (static_branch_unlikely(&init_on_alloc)) { -+#ifndef CONFIG_SLUB - if (c->ctor) - return false; -+#endif - if (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) - return flags & __GFP_ZERO; - return true; -@@ -687,9 +698,15 @@ static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c) - - static inline bool slab_want_init_on_free(struct kmem_cache *c) - { -- if (static_branch_unlikely(&init_on_free)) -- return !(c->ctor || -- (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON))); -+ if (static_branch_unlikely(&init_on_free)) { -+#ifndef CONFIG_SLUB -+ if (c->ctor) -+ return false; -+#endif -+ if (c->flags & 
(SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) -+ return false; -+ return true; -+ } - return false; - } - -diff --git a/mm/slab_common.c b/mm/slab_common.c -index 37d48a56431d..b8947336d0e1 100644 ---- a/mm/slab_common.c -+++ b/mm/slab_common.c -@@ -28,10 +28,10 @@ - - #include "slab.h" - --enum slab_state slab_state; -+enum slab_state slab_state __ro_after_init; - LIST_HEAD(slab_caches); - DEFINE_MUTEX(slab_mutex); --struct kmem_cache *kmem_cache; -+struct kmem_cache *kmem_cache __ro_after_init; - - #ifdef CONFIG_HARDENED_USERCOPY - bool usercopy_fallback __ro_after_init = -@@ -59,7 +59,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work, - /* - * Merge control. If this is set then no merging of slab caches will occur. - */ --static bool slab_nomerge = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT); -+static bool slab_nomerge __ro_after_init = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT); - - static int __init setup_slab_nomerge(char *str) - { -diff --git a/mm/slub.c b/mm/slub.c -index 660f4324c097..54c3291a7571 100644 ---- a/mm/slub.c -+++ b/mm/slub.c -@@ -123,6 +123,12 @@ static inline int kmem_cache_debug(struct kmem_cache *s) - #endif - } - -+static inline bool has_sanitize_verify(struct kmem_cache *s) -+{ -+ return IS_ENABLED(CONFIG_SLAB_SANITIZE_VERIFY) && -+ slab_want_init_on_free(s); -+} -+ - void *fixup_red_left(struct kmem_cache *s, void *p) - { - if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) -@@ -494,13 +500,13 @@ static inline void *restore_red_left(struct kmem_cache *s, void *p) - * Debug settings: - */ - #if defined(CONFIG_SLUB_DEBUG_ON) --static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS; -+static slab_flags_t slub_debug __ro_after_init = DEBUG_DEFAULT_FLAGS; - #else --static slab_flags_t slub_debug; -+static slab_flags_t slub_debug __ro_after_init; - #endif - --static char *slub_debug_slabs; --static int disable_higher_order_debug; -+static char *slub_debug_slabs __ro_after_init; -+static int disable_higher_order_debug __ro_after_init; - - /* - * slub is 
about to manipulate internal object metadata. This memory lies -@@ -571,6 +577,33 @@ static inline unsigned int get_info_end(struct kmem_cache *s) - return s->inuse; - } - -+#ifdef CONFIG_SLAB_CANARY -+static inline unsigned long *get_canary(struct kmem_cache *s, void *object) -+{ -+ return object + get_info_end(s); -+} -+ -+static inline unsigned long get_canary_value(const void *canary, unsigned long value) -+{ -+ return (value ^ (unsigned long)canary) & CANARY_MASK; -+} -+ -+static inline void set_canary(struct kmem_cache *s, void *object, unsigned long value) -+{ -+ unsigned long *canary = get_canary(s, object); -+ *canary = get_canary_value(canary, value); -+} -+ -+static inline void check_canary(struct kmem_cache *s, void *object, unsigned long value) -+{ -+ unsigned long *canary = get_canary(s, object); -+ BUG_ON(*canary != get_canary_value(canary, value)); -+} -+#else -+#define set_canary(s, object, value) -+#define check_canary(s, object, value) -+#endif -+ - static struct track *get_track(struct kmem_cache *s, void *object, - enum track_item alloc) - { -@@ -578,6 +611,9 @@ static struct track *get_track(struct kmem_cache *s, void *object, - - p = object + get_info_end(s); - -+ if (IS_ENABLED(CONFIG_SLAB_CANARY)) -+ p = (void *)p + sizeof(void *); -+ - return p + alloc; - } - -@@ -719,6 +755,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) - - off = get_info_end(s); - -+ if (IS_ENABLED(CONFIG_SLAB_CANARY)) -+ off += sizeof(void *); -+ - if (s->flags & SLAB_STORE_USER) - off += 2 * sizeof(struct track); - -@@ -827,8 +866,9 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, - * Meta data starts here. - * - * A. Free pointer (if we cannot overwrite object on free) -- * B. Tracking data for SLAB_STORE_USER -- * C. Padding to reach required alignment boundary or at mininum -+ * B. Canary for SLAB_CANARY -+ * C. Tracking data for SLAB_STORE_USER -+ * D. 
Padding to reach required alignment boundary or at mininum - * one word if debugging is on to be able to detect writes - * before the word boundary. - * -@@ -846,6 +886,9 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) - { - unsigned long off = get_info_end(s); /* The end of info */ - -+ if (IS_ENABLED(CONFIG_SLAB_CANARY)) -+ off += sizeof(void *); -+ - if (s->flags & SLAB_STORE_USER) - /* We also have user information there */ - off += 2 * sizeof(struct track); -@@ -1491,6 +1534,8 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, - object = next; - next = get_freepointer(s, object); - -+ check_canary(s, object, s->random_active); -+ - if (slab_want_init_on_free(s)) { - /* - * Clear the object and the metadata, but don't touch -@@ -1501,8 +1546,12 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, - : 0; - memset((char *)object + s->inuse, 0, - s->size - s->inuse - rsize); -- -+ if (!IS_ENABLED(CONFIG_SLAB_SANITIZE_VERIFY) && s->ctor) -+ s->ctor(object); - } -+ -+ set_canary(s, object, s->random_inactive); -+ - /* If object's reuse doesn't have to be delayed */ - if (!slab_free_hook(s, object)) { - /* Move object to the new freelist */ -@@ -1510,6 +1559,18 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, - *head = object; - if (!*tail) - *tail = object; -+ } else if (slab_want_init_on_free(s) && s->ctor) { -+ /* Objects that are put into quarantine by KASAN will -+ * still undergo free_consistency_checks() and thus -+ * need to show a valid freepointer to check_object(). -+ * -+ * Note that doing this for all caches (not just ctor -+ * ones, which have s->offset >= object_size)) causes a -+ * GPF, due to KASAN poisoning and the way -+ * set_freepointer() eventually dereferences the -+ * freepointer. 
-+ */ -+ set_freepointer(s, object, NULL); - } - } while (object != old_tail); - -@@ -1523,8 +1584,9 @@ static void *setup_object(struct kmem_cache *s, struct page *page, - void *object) - { - setup_object_debug(s, page, object); -+ set_canary(s, object, s->random_inactive); - object = kasan_init_slab_obj(s, object); -- if (unlikely(s->ctor)) { -+ if (unlikely(s->ctor) && !has_sanitize_verify(s)) { - kasan_unpoison_object_data(s, object); - s->ctor(object); - kasan_poison_object_data(s, object); -@@ -2818,8 +2880,28 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, - - maybe_wipe_obj_freeptr(s, object); - -- if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object) -+ if (has_sanitize_verify(s) && object) { -+ /* KASAN hasn't unpoisoned the object yet (this is done in the -+ * post-alloc hook), so let's do it temporarily. -+ */ -+ kasan_unpoison_object_data(s, object); -+ BUG_ON(memchr_inv(object, 0, s->object_size)); -+ if (s->ctor) -+ s->ctor(object); -+ kasan_poison_object_data(s, object); -+ } else if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object) { - memset(object, 0, s->object_size); -+ if (s->ctor) { -+ kasan_unpoison_object_data(s, object); -+ s->ctor(object); -+ kasan_poison_object_data(s, object); -+ } -+ } -+ -+ if (object) { -+ check_canary(s, object, s->random_inactive); -+ set_canary(s, object, s->random_active); -+ } - - slab_post_alloc_hook(s, gfpflags, 1, &object); - -@@ -3204,7 +3286,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, - void **p) - { - struct kmem_cache_cpu *c; -- int i; -+ int i, k; - - /* memcg and kmem_cache debug support */ - s = slab_pre_alloc_hook(s, flags); -@@ -3253,11 +3335,35 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, - local_irq_enable(); - - /* Clear memory outside IRQ disabled fastpath loop */ -- if (unlikely(slab_want_init_on_alloc(flags, s))) { -+ if (has_sanitize_verify(s)) { -+ int j; -+ -+ for (j = 0; j < i; j++) { -+ 
/* KASAN hasn't unpoisoned the object yet (this is done -+ * in the post-alloc hook), so let's do it temporarily. -+ */ -+ kasan_unpoison_object_data(s, p[j]); -+ BUG_ON(memchr_inv(p[j], 0, s->object_size)); -+ if (s->ctor) -+ s->ctor(p[j]); -+ kasan_poison_object_data(s, p[j]); -+ } -+ } else if (unlikely(slab_want_init_on_alloc(flags, s))) { - int j; - -- for (j = 0; j < i; j++) -+ for (j = 0; j < i; j++) { - memset(p[j], 0, s->object_size); -+ if (s->ctor) { -+ kasan_unpoison_object_data(s, p[j]); -+ s->ctor(p[j]); -+ kasan_poison_object_data(s, p[j]); -+ } -+ } -+ } -+ -+ for (k = 0; k < i; k++) { -+ check_canary(s, p[k], s->random_inactive); -+ set_canary(s, p[k], s->random_active); - } - - /* memcg and kmem_cache debug support */ -@@ -3291,9 +3397,9 @@ EXPORT_SYMBOL(kmem_cache_alloc_bulk); - * and increases the number of allocations possible without having to - * take the list_lock. - */ --static unsigned int slub_min_order; --static unsigned int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; --static unsigned int slub_min_objects; -+static unsigned int slub_min_order __ro_after_init; -+static unsigned int slub_max_order __ro_after_init = PAGE_ALLOC_COSTLY_ORDER; -+static unsigned int slub_min_objects __ro_after_init; - - /* - * Calculate the order of allocation given an slab object size. 
-@@ -3461,6 +3567,7 @@ static void early_kmem_cache_node_alloc(int node) - init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); - init_tracking(kmem_cache_node, n); - #endif -+ set_canary(kmem_cache_node, n, kmem_cache_node->random_active); - n = kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node), - GFP_KERNEL); - page->freelist = get_freepointer(kmem_cache_node, n); -@@ -3641,6 +3748,9 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) - s->offset = ALIGN(freepointer_area / 2, sizeof(void *)); - } - -+ if (IS_ENABLED(CONFIG_SLAB_CANARY)) -+ size += sizeof(void *); -+ - #ifdef CONFIG_SLUB_DEBUG - if (flags & SLAB_STORE_USER) - /* -@@ -3713,6 +3823,10 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) - #ifdef CONFIG_SLAB_FREELIST_HARDENED - s->random = get_random_long(); - #endif -+#ifdef CONFIG_SLAB_CANARY -+ s->random_active = get_random_long(); -+ s->random_inactive = get_random_long(); -+#endif - - if (!calculate_sizes(s, -1)) - goto error; -@@ -3988,6 +4102,8 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, - offset -= s->red_left_pad; - } - -+ check_canary(s, (void *)ptr - offset, s->random_active); -+ - /* Allow address range falling entirely within usercopy region. 
*/ - if (offset >= s->useroffset && - offset - s->useroffset <= s->usersize && -@@ -4021,7 +4137,11 @@ size_t __ksize(const void *object) - page = virt_to_head_page(object); - - if (unlikely(!PageSlab(page))) { -+#ifdef CONFIG_BUG_ON_DATA_CORRUPTION -+ BUG_ON(!PageCompound(page)); -+#else - WARN_ON(!PageCompound(page)); -+#endif - return page_size(page); - } - -@@ -4848,7 +4968,7 @@ enum slab_stat_type { - #define SO_TOTAL (1 << SL_TOTAL) - - #ifdef CONFIG_MEMCG --static bool memcg_sysfs_enabled = IS_ENABLED(CONFIG_SLUB_MEMCG_SYSFS_ON); -+static bool memcg_sysfs_enabled __ro_after_init = IS_ENABLED(CONFIG_SLUB_MEMCG_SYSFS_ON); - - static int __init setup_slub_memcg_sysfs(char *str) - { -diff --git a/mm/swap.c b/mm/swap.c -index bf9a79fed62d..3375d4cf4ee8 100644 ---- a/mm/swap.c -+++ b/mm/swap.c -@@ -94,6 +94,13 @@ static void __put_compound_page(struct page *page) - if (!PageHuge(page)) - __page_cache_release(page); - dtor = get_compound_page_dtor(page); -+ if (!PageHuge(page)) -+ BUG_ON(dtor != free_compound_page -+#ifdef CONFIG_TRANSPARENT_HUGEPAGE -+ && dtor != free_transhuge_page -+#endif -+ ); -+ - (*dtor)(page); - } - -diff --git a/mm/util.c b/mm/util.c -index dc1c877d5481..4872ec1b8858 100644 ---- a/mm/util.c -+++ b/mm/util.c -@@ -335,9 +335,9 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) - { - /* Is the current task 32bit ? 
*/ - if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task()) -- return randomize_page(mm->brk, SZ_32M); -+ return mm->brk + get_random_long() % SZ_32M + PAGE_SIZE; - -- return randomize_page(mm->brk, SZ_1G); -+ return mm->brk + get_random_long() % SZ_1G + PAGE_SIZE; - } - - unsigned long arch_mmap_rnd(void) -diff --git a/net/core/dev.c b/net/core/dev.c -index c9ee5d80d5ea..9904a4aefa8b 100644 ---- a/net/core/dev.c -+++ b/net/core/dev.c -@@ -4750,7 +4750,7 @@ int netif_rx_ni(struct sk_buff *skb) - } - EXPORT_SYMBOL(netif_rx_ni); - --static __latent_entropy void net_tx_action(struct softirq_action *h) -+static __latent_entropy void net_tx_action(void) - { - struct softnet_data *sd = this_cpu_ptr(&softnet_data); - -@@ -6622,7 +6622,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) - return work; - } - --static __latent_entropy void net_rx_action(struct softirq_action *h) -+static __latent_entropy void net_rx_action(void) - { - struct softnet_data *sd = this_cpu_ptr(&softnet_data); - unsigned long time_limit = jiffies + -diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig -index 25a8888826b8..7343a827e166 100644 ---- a/net/ipv4/Kconfig -+++ b/net/ipv4/Kconfig -@@ -267,6 +267,7 @@ config IP_PIMSM_V2 - - config SYN_COOKIES - bool "IP: TCP syncookie support" -+ default y - ---help--- - Normal TCP/IP networking is open to an attack known as "SYN - flooding". This denial-of-service attack prevents legitimate remote -@@ -739,3 +740,26 @@ config TCP_MD5SIG - on the Internet. - - If unsure, say N. -+ -+config TCP_SIMULT_CONNECT_DEFAULT_ON -+ bool "Enable TCP simultaneous connect" -+ help -+ Enable TCP simultaneous connect that adds a weakness in Linux's strict -+ implementation of TCP that allows two clients to connect to each other -+ without either entering a listening state. The weakness allows an -+ attacker to easily prevent a client from connecting to a known server -+ provided the source port for the connection is guessed correctly. 
-+ -+ As the weakness could be used to prevent an antivirus or IPS from -+ fetching updates, or prevent an SSL gateway from fetching a CRL, it -+ should be eliminated by disabling this option. Though Linux is one of -+ few operating systems supporting simultaneous connect, it has no -+ legitimate use in practice and is rarely supported by firewalls. -+ -+ Disabling this may break TCP STUNT which is used by some applications -+ for NAT traversal. -+ -+ This setting can be overridden at runtime via the -+ net.ipv4.tcp_simult_connect sysctl. -+ -+ If unsure, say N. -diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c -index 81b267e990a1..587dbfdbcf1a 100644 ---- a/net/ipv4/sysctl_net_ipv4.c -+++ b/net/ipv4/sysctl_net_ipv4.c -@@ -604,6 +604,15 @@ static struct ctl_table ipv4_table[] = { - .mode = 0644, - .proc_handler = proc_do_static_key, - }, -+ { -+ .procname = "tcp_simult_connect", -+ .data = &sysctl_tcp_simult_connect, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = proc_dointvec_minmax, -+ .extra1 = SYSCTL_ZERO, -+ .extra2 = SYSCTL_ONE, -+ }, - { } - }; - -diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c -index 1fa009999f57..43aa2340feb2 100644 ---- a/net/ipv4/tcp_input.c -+++ b/net/ipv4/tcp_input.c -@@ -82,6 +82,7 @@ - #include - - int sysctl_tcp_max_orphans __read_mostly = NR_FILE; -+int sysctl_tcp_simult_connect __read_mostly = IS_ENABLED(CONFIG_TCP_SIMULT_CONNECT_DEFAULT_ON); - - #define FLAG_DATA 0x01 /* Incoming frame contained data. */ - #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ -@@ -6064,7 +6065,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, - tcp_paws_reject(&tp->rx_opt, 0)) - goto discard_and_undo; - -- if (th->syn) { -+ if (th->syn && sysctl_tcp_simult_connect) { - /* We see SYN without ACK. It is attempt of - * simultaneous connect with crossed SYNs. - * Particularly, it can be connect to self. 
-diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost -index 33aaa572f686..447648fc48f4 100644 ---- a/scripts/Makefile.modpost -+++ b/scripts/Makefile.modpost -@@ -53,6 +53,7 @@ MODPOST = scripts/mod/modpost \ - $(if $(KBUILD_EXTMOD),$(addprefix -e ,$(KBUILD_EXTRA_SYMBOLS))) \ - $(if $(KBUILD_EXTMOD),-o $(modulesymfile)) \ - $(if $(CONFIG_SECTION_MISMATCH_WARN_ONLY),,-E) \ -+ $(if $(CONFIG_DEBUG_WRITABLE_FUNCTION_POINTERS_VERBOSE),-f) \ - $(if $(CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS)$(KBUILD_NSDEPS),-N) \ - $(if $(KBUILD_MODPOST_WARN),-w) - -diff --git a/scripts/gcc-plugins/Kconfig b/scripts/gcc-plugins/Kconfig -index 013ba3a57669..31ce967a1959 100644 ---- a/scripts/gcc-plugins/Kconfig -+++ b/scripts/gcc-plugins/Kconfig -@@ -53,6 +53,11 @@ config GCC_PLUGIN_LATENT_ENTROPY - is some slowdown of the boot process (about 0.5%) and fork and - irq processing. - -+ When extra_latent_entropy is passed on the kernel command line, -+ entropy will be extracted from up to the first 4GB of RAM while the -+ runtime memory allocator is being initialized. This costs even more -+ slowdown of the boot process. -+ - Note that entropy extracted this way is not cryptographically - secure! 
- -diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c -index 5c3c50c5ec52..b539cd7159be 100644 ---- a/scripts/mod/modpost.c -+++ b/scripts/mod/modpost.c -@@ -37,6 +37,8 @@ static int warn_unresolved = 0; - /* How a symbol is exported */ - static int sec_mismatch_count = 0; - static int sec_mismatch_fatal = 0; -+static int writable_fptr_count = 0; -+static int writable_fptr_verbose = 0; - /* ignore missing files */ - static int ignore_missing_files; - /* If set to 1, only warn (instead of error) about missing ns imports */ -@@ -1007,6 +1009,7 @@ enum mismatch { - ANY_EXIT_TO_ANY_INIT, - EXPORT_TO_INIT_EXIT, - EXTABLE_TO_NON_TEXT, -+ DATA_TO_TEXT - }; - - /** -@@ -1133,6 +1136,12 @@ static const struct sectioncheck sectioncheck[] = { - .good_tosec = {ALL_TEXT_SECTIONS , NULL}, - .mismatch = EXTABLE_TO_NON_TEXT, - .handler = extable_mismatch_handler, -+}, -+/* Do not reference code from writable data */ -+{ -+ .fromsec = { DATA_SECTIONS, NULL }, -+ .bad_tosec = { ALL_TEXT_SECTIONS, NULL }, -+ .mismatch = DATA_TO_TEXT - } - }; - -@@ -1320,10 +1329,10 @@ static Elf_Sym *find_elf_symbol(struct elf_info *elf, Elf64_Sword addr, - continue; - if (!is_valid_name(elf, sym)) - continue; -- if (sym->st_value == addr) -- return sym; - /* Find a symbol nearby - addr are maybe negative */ - d = sym->st_value - addr; -+ if (d == 0) -+ return sym; - if (d < 0) - d = addr - sym->st_value; - if (d < distance) { -@@ -1458,7 +1467,13 @@ static void report_sec_mismatch(const char *modname, - char *prl_from; - char *prl_to; - -- sec_mismatch_count++; -+ if (mismatch->mismatch == DATA_TO_TEXT) { -+ writable_fptr_count++; -+ if (!writable_fptr_verbose) -+ return; -+ } else { -+ sec_mismatch_count++; -+ } - - get_pretty_name(from_is_func, &from, &from_p); - get_pretty_name(to_is_func, &to, &to_p); -@@ -1580,6 +1595,12 @@ static void report_sec_mismatch(const char *modname, - fatal("There's a special handler for this mismatch type, " - "we should never get here."); - break; -+ case 
DATA_TO_TEXT: -+ fprintf(stderr, -+ "The %s %s:%s references\n" -+ "the %s %s:%s%s\n", -+ from, fromsec, fromsym, to, tosec, tosym, to_p); -+ break; - } - fprintf(stderr, "\n"); - } -@@ -2559,7 +2580,7 @@ int main(int argc, char **argv) - struct ext_sym_list *extsym_iter; - struct ext_sym_list *extsym_start = NULL; - -- while ((opt = getopt(argc, argv, "i:e:mnsT:o:awENd:")) != -1) { -+ while ((opt = getopt(argc, argv, "i:e:fmnsT:o:awENd:")) != -1) { - switch (opt) { - case 'i': - kernel_read = optarg; -@@ -2573,6 +2594,9 @@ int main(int argc, char **argv) - extsym_iter->file = optarg; - extsym_start = extsym_iter; - break; -+ case 'f': -+ writable_fptr_verbose = 1; -+ break; - case 'm': - modversions = 1; - break; -@@ -2676,6 +2700,11 @@ int main(int argc, char **argv) - } - - free(buf.p); -+ if (writable_fptr_count && !writable_fptr_verbose) -+ warn("modpost: Found %d writable function pointer%s.\n" -+ "To see full details build your kernel with:\n" -+ "'make CONFIG_DEBUG_WRITABLE_FUNCTION_POINTERS_VERBOSE=y'\n", -+ writable_fptr_count, (writable_fptr_count == 1 ? "" : "s")); - - return err; - } -diff --git a/security/Kconfig b/security/Kconfig -index cd3cc7da3a55..127b54aecf87 100644 ---- a/security/Kconfig -+++ b/security/Kconfig -@@ -9,7 +9,7 @@ source "security/keys/Kconfig" - - config SECURITY_DMESG_RESTRICT - bool "Restrict unprivileged access to the kernel syslog" -- default n -+ default y - help - This enforces restrictions on unprivileged users reading the kernel - syslog via dmesg(8). -@@ -19,10 +19,34 @@ config SECURITY_DMESG_RESTRICT - - If you are unsure how to answer this question, answer N. - -+config SECURITY_PERF_EVENTS_RESTRICT -+ bool "Restrict unprivileged use of performance events" -+ depends on PERF_EVENTS -+ default y -+ help -+ If you say Y here, the kernel.perf_event_paranoid sysctl -+ will be set to 3 by default, and no unprivileged use of the -+ perf_event_open syscall will be permitted unless it is -+ changed. 
-+ -+config SECURITY_TIOCSTI_RESTRICT -+ bool "Restrict unprivileged use of tiocsti command injection" -+ default y -+ help -+ This enforces restrictions on unprivileged users injecting commands -+ into other processes which share a tty session using the TIOCSTI -+ ioctl. This option makes TIOCSTI use require CAP_SYS_ADMIN. -+ -+ If this option is not selected, no restrictions will be enforced -+ unless the tiocsti_restrict sysctl is explicitly set to (1). -+ -+ If you are unsure how to answer this question, answer N. -+ - config SECURITY - bool "Enable different security models" - depends on SYSFS - depends on MULTIUSER -+ default y - help - This allows you to choose different security modules to be - configured into your kernel. -@@ -48,6 +72,7 @@ config SECURITYFS - config SECURITY_NETWORK - bool "Socket and Networking Security Hooks" - depends on SECURITY -+ default y - help - This enables the socket and networking security hooks. - If enabled, a security module can use these hooks to -@@ -154,6 +179,7 @@ config HARDENED_USERCOPY - bool "Harden memory copies between kernel and userspace" - depends on HAVE_HARDENED_USERCOPY_ALLOCATOR - imply STRICT_DEVMEM -+ default y - help - This option checks for obviously wrong memory regions when - copying memory to/from the kernel (via copy_to_user() and -@@ -166,7 +192,6 @@ config HARDENED_USERCOPY - config HARDENED_USERCOPY_FALLBACK - bool "Allow usercopy whitelist violations to fallback to object size" - depends on HARDENED_USERCOPY -- default y - help - This is a temporary option that allows missing usercopy whitelists - to be discovered via a WARN() to the kernel log, instead of -@@ -191,10 +216,21 @@ config HARDENED_USERCOPY_PAGESPAN - config FORTIFY_SOURCE - bool "Harden common str/mem functions against buffer overflows" - depends on ARCH_HAS_FORTIFY_SOURCE -+ default y - help - Detect overflows of buffers in common string and memory functions - where the compiler can determine and validate the buffer sizes. 
- -+config FORTIFY_SOURCE_STRICT_STRING -+ bool "Harden common functions against buffer overflows" -+ depends on FORTIFY_SOURCE -+ depends on EXPERT -+ help -+ Perform stricter overflow checks catching overflows within objects -+ for common C string functions rather than only between objects. -+ -+ This is not yet intended for production use, only bug finding. -+ - config STATIC_USERMODEHELPER - bool "Force all usermode helper calls through a single binary" - help -diff --git a/security/Kconfig.hardening b/security/Kconfig.hardening -index af4c979b38ee..001796a391e9 100644 ---- a/security/Kconfig.hardening -+++ b/security/Kconfig.hardening -@@ -169,6 +169,7 @@ config STACKLEAK_RUNTIME_DISABLE - - config INIT_ON_ALLOC_DEFAULT_ON - bool "Enable heap memory zeroing on allocation by default" -+ default yes - help - This has the effect of setting "init_on_alloc=1" on the kernel - command line. This can be disabled with "init_on_alloc=0". -@@ -181,6 +182,7 @@ config INIT_ON_ALLOC_DEFAULT_ON - - config INIT_ON_FREE_DEFAULT_ON - bool "Enable heap memory zeroing on free by default" -+ default yes - help - This has the effect of setting "init_on_free=1" on the kernel - command line. This can be disabled with "init_on_free=0". -@@ -196,6 +198,21 @@ config INIT_ON_FREE_DEFAULT_ON - touching "cold" memory areas. Most cases see 3-5% impact. Some - synthetic workloads have measured as high as 8%. - -+config PAGE_SANITIZE_VERIFY -+ bool "Verify sanitized pages" -+ default y -+ help -+ When init_on_free is enabled, verify that newly allocated pages -+ are zeroed to detect write-after-free bugs. -+ -+config SLAB_SANITIZE_VERIFY -+ default y -+ bool "Verify sanitized SLAB allocations" -+ depends on !KASAN -+ help -+ When init_on_free is enabled, verify that newly allocated slab -+ objects are zeroed to detect write-after-free bugs. 
-+ - endmenu - - endmenu -diff --git a/security/selinux/Kconfig b/security/selinux/Kconfig -index 9e921fc72538..ae851a826c26 100644 ---- a/security/selinux/Kconfig -+++ b/security/selinux/Kconfig -@@ -3,7 +3,7 @@ config SECURITY_SELINUX - bool "NSA SELinux Support" - depends on SECURITY_NETWORK && AUDIT && NET && INET - select NETWORK_SECMARK -- default n -+ default y - help - This selects NSA Security-Enhanced Linux (SELinux). - You will also need a policy configuration and a labeled filesystem. -@@ -70,29 +70,6 @@ config SECURITY_SELINUX_AVC_STATS - /sys/fs/selinux/avc/cache_stats, which may be monitored via - tools such as avcstat. - --config SECURITY_SELINUX_CHECKREQPROT_VALUE -- int "NSA SELinux checkreqprot default value" -- depends on SECURITY_SELINUX -- range 0 1 -- default 0 -- help -- This option sets the default value for the 'checkreqprot' flag -- that determines whether SELinux checks the protection requested -- by the application or the protection that will be applied by the -- kernel (including any implied execute for read-implies-exec) for -- mmap and mprotect calls. If this option is set to 0 (zero), -- SELinux will default to checking the protection that will be applied -- by the kernel. If this option is set to 1 (one), SELinux will -- default to checking the protection requested by the application. -- The checkreqprot flag may be changed from the default via the -- 'checkreqprot=' boot parameter. It may also be changed at runtime -- via /sys/fs/selinux/checkreqprot if authorized by policy. -- -- WARNING: this option is deprecated and will be removed in a future -- kernel release. -- -- If you are unsure how to answer this question, answer 0. 
-- - config SECURITY_SELINUX_SIDTAB_HASH_BITS - int "NSA SELinux sidtab hashtable size" - depends on SECURITY_SELINUX -diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c -index 4c037c2545c1..2437a1895baa 100644 ---- a/security/selinux/hooks.c -+++ b/security/selinux/hooks.c -@@ -135,21 +135,7 @@ static int __init selinux_enabled_setup(char *str) - __setup("selinux=", selinux_enabled_setup); - #endif - --static unsigned int selinux_checkreqprot_boot = -- CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE; -- --static int __init checkreqprot_setup(char *str) --{ -- unsigned long checkreqprot; -- -- if (!kstrtoul(str, 0, &checkreqprot)) { -- selinux_checkreqprot_boot = checkreqprot ? 1 : 0; -- if (checkreqprot) -- pr_warn("SELinux: checkreqprot set to 1 via kernel parameter. This is deprecated and will be rejected in a future kernel release.\n"); -- } -- return 1; --} --__setup("checkreqprot=", checkreqprot_setup); -+static const unsigned int selinux_checkreqprot_boot; - - /** - * selinux_secmark_enabled - Check to see if SECMARK is currently enabled -diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c -index 4781314c2510..7f068515d799 100644 ---- a/security/selinux/selinuxfs.c -+++ b/security/selinux/selinuxfs.c -@@ -641,7 +641,6 @@ static ssize_t sel_read_checkreqprot(struct file *filp, char __user *buf, - static ssize_t sel_write_checkreqprot(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) - { -- struct selinux_fs_info *fsi = file_inode(file)->i_sb->s_fs_info; - char *page; - ssize_t length; - unsigned int new_value; -@@ -665,18 +664,9 @@ static ssize_t sel_write_checkreqprot(struct file *file, const char __user *buf, - return PTR_ERR(page); - - length = -EINVAL; -- if (sscanf(page, "%u", &new_value) != 1) -+ if (sscanf(page, "%u", &new_value) != 1 || new_value) - goto out; - -- if (new_value) { -- char comm[sizeof(current->comm)]; -- -- memcpy(comm, current->comm, sizeof(comm)); -- pr_warn_once("SELinux: %s (%d) 
set checkreqprot to 1. This is deprecated and will be rejected in a future kernel release.\n", -- comm, current->pid); -- } -- -- fsi->state->checkreqprot = new_value ? 1 : 0; - length = count; - out: - kfree(page); -diff --git a/security/yama/Kconfig b/security/yama/Kconfig -index a810304123ca..b809050b25d2 100644 ---- a/security/yama/Kconfig -+++ b/security/yama/Kconfig -@@ -2,7 +2,7 @@ - config SECURITY_YAMA - bool "Yama support" - depends on SECURITY -- default n -+ default y - help - This selects Yama, which extends DAC support with additional - system-wide security settings beyond regular Linux discretionary diff --git a/linux57-tkg/linux57-tkg-patches/0012-misc-additions.patch b/linux57-tkg/linux57-tkg-patches/0012-misc-additions.patch deleted file mode 100644 index 33f5502..0000000 --- a/linux57-tkg/linux57-tkg-patches/0012-misc-additions.patch +++ /dev/null @@ -1,55 +0,0 @@ -diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig -index 0840d27381ea..73aba9a31064 100644 ---- a/drivers/tty/Kconfig -+++ b/drivers/tty/Kconfig -@@ -75,6 +75,19 @@ config VT_CONSOLE_SLEEP - def_bool y - depends on VT_CONSOLE && PM_SLEEP - -+config NR_TTY_DEVICES -+ int "Maximum tty device number" -+ depends on VT -+ range 12 63 -+ default 63 -+ ---help--- -+ This option is used to change the number of tty devices in /dev. -+ The default value is 63. The lowest number you can set is 12, -+ 63 is also the upper limit so we don't overrun the serial -+ consoles. -+ -+ If unsure, say 63. -+ - config HW_CONSOLE - bool - depends on VT && !UML -diff --git a/include/uapi/linux/vt.h b/include/uapi/linux/vt.h -index e9d39c48520a..3bceead8da40 100644 ---- a/include/uapi/linux/vt.h -+++ b/include/uapi/linux/vt.h -@@ -3,12 +3,25 @@ - #define _UAPI_LINUX_VT_H - - -+/* -+ * We will make this definition solely for the purpose of making packages -+ * such as splashutils build, because they can not understand that -+ * NR_TTY_DEVICES is defined in the kernel configuration. 
-+ */ -+#ifndef CONFIG_NR_TTY_DEVICES -+#define CONFIG_NR_TTY_DEVICES 63 -+#endif -+ - /* - * These constants are also useful for user-level apps (e.g., VC - * resizing). - */ - #define MIN_NR_CONSOLES 1 /* must be at least 1 */ --#define MAX_NR_CONSOLES 63 /* serial lines start at 64 */ -+/* -+ * NR_TTY_DEVICES: -+ * Value MUST be at least 12 and must never be higher then 63 -+ */ -+#define MAX_NR_CONSOLES CONFIG_NR_TTY_DEVICES /* serial lines start above this */ - /* Note: the ioctl VT_GETSTATE does not work for - consoles 16 and higher (since it returns a short) */ - diff --git a/linux58-tkg/PKGBUILD b/linux58-tkg/PKGBUILD deleted file mode 100644 index 1d3be7e..0000000 --- a/linux58-tkg/PKGBUILD +++ /dev/null @@ -1,285 +0,0 @@ -# Based on the file created for Arch Linux by: -# Tobias Powalowski -# Thomas Baechler - -# Contributor: Tk-Glitch - -plain ' .---.` `.---.' -plain ' `/syhhhyso- -osyhhhys/`' -plain ' .syNMdhNNhss/``.---.``/sshNNhdMNys.' -plain ' +sdMh.`+MNsssssssssssssssNM+`.hMds+' -plain ' :syNNdhNNhssssssssssssssshNNhdNNys:' -plain ' /ssyhhhysssssssssssssssssyhhhyss/' -plain ' .ossssssssssssssssssssssssssssso.' 
-plain ' :sssssssssssssssssssssssssssssssss:' -plain ' /sssssssssssssssssssssssssssssssssss/' -plain ' :sssssssssssssoosssssssoosssssssssssss:' -plain ' osssssssssssssoosssssssoossssssssssssso' -plain ' osssssssssssyyyyhhhhhhhyyyyssssssssssso' -plain ' /yyyyyyhhdmmmmNNNNNNNNNNNmmmmdhhyyyyyy/' -plain ' smmmNNNNNNNNNNNNNNNNNNNNNNNNNNNNNmmms' -plain ' /dNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNd/' -plain ' `:sdNNNNNNNNNNNNNNNNNNNNNNNNNds:`' -plain ' `-+shdNNNNNNNNNNNNNNNdhs+-`' -plain ' `.-:///////:-.`' - -_where="$PWD" # track basedir as different Arch based distros are moving srcdir around - -source "$_where"/customization.cfg # load default configuration from file -source "$_where"/linux*-tkg-config/prepare - -if [[ "$_sub" = rc* ]]; then - _srcpath="linux-${_basekernel}-${_sub}" -else - _srcpath="linux-${_basekernel}" -fi - -_tkg_initscript - -_distro="Arch" - -if [ -n "$_custom_pkgbase" ]; then - pkgbase="${_custom_pkgbase}" -else - pkgbase=linux"${_basever}"-tkg-"${_cpusched}" -fi -pkgname=("${pkgbase}" "${pkgbase}-headers") -pkgver="${_basekernel}"."${_sub}" -pkgrel=25 -pkgdesc='Linux-tkg' -arch=('x86_64') # no i686 in here -url="http://www.kernel.org/" -license=('GPL2') -makedepends=('xmlto' 'docbook-xsl' 'kmod' 'inetutils' 'bc' 'libelf' 'pahole' 'patchutils' 'flex' 'python-sphinx' 'python-sphinx_rtd_theme' 'graphviz' 'imagemagick' 'git') -optdepends=('schedtool') -options=('!strip' 'docs') -source=("https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-${_basekernel}.tar.xz" - "https://cdn.kernel.org/pub/linux/kernel/v5.x/patch-${pkgver}.xz" - "https://raw.githubusercontent.com/graysky2/kernel_gcc_patch/master/enable_additional_cpu_optimizations_for_gcc_v10.1%2B_kernel_v5.8%2B.patch" - 'config.x86_64' # stock Arch config - #'config_hardened.x86_64' # hardened Arch config - 90-cleanup.hook - cleanup - # ARCH Patches - 0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch - # TkG - 0002-clear-patches.patch - 0003-glitched-base.patch - 0003-glitched-cfs.patch - 
#0004-glitched-ondemand-muqss.patch - #0004-glitched-muqss.patch - #0004-5.8-ck1.patch - 0005-undead-glitched-ondemand-pds.patch - 0005-undead-glitched-pds.patch - 0005-v5.8_undead-pds099o.patch - 0005-glitched-pds.patch - 0006-add-acs-overrides_iommu.patch - 0007-v5.8-fsync.patch - 0008-5.8-bcachefs.patch - 0009-glitched-ondemand-bmq.patch - 0009-glitched-bmq.patch - 0009-prjc_v5.8-r3.patch - 0011-ZFS-fix.patch - #0012-linux-hardened.patch - 0012-misc-additions.patch -) -sha256sums=('e7f75186aa0642114af8f19d99559937300ca27acaf7451b36d4f9b0f85cf1f5' - '2ea49982bd10e4c880d49051535bd820e276dd3235c3c913b255aaaadc707e1d' - '5ab29eb64e57df83b395a29a6a4f89030d142feffbfbf73b3afc6d97a2a7fd12' - '181330a9cf4517abbbe29b93165bc859ad8ca14a43582f4e1d69aae2b5ecc2c9' - '1e15fc2ef3fa770217ecc63a220e5df2ddbcf3295eb4a021171e7edd4c6cc898' - '66a03c246037451a77b4d448565b1d7e9368270c7d02872fbd0b5d024ed0a997' - 'f6383abef027fd9a430fd33415355e0df492cdc3c90e9938bf2d98f4f63b32e6' - '35a7cde86fb94939c0f25a62b8c47f3de0dbd3c65f876f460b263181b3e92fc0' - 'b9ebe0ae69bc2b2091d6bfcf6c7875a87ea7969fcfa4e306c48d47a60f9ef4d6' - '7058e57fd68367b029adc77f2a82928f1433daaf02c8c279cb2d13556c8804d7' - '62496f9ca788996181ef145f96ad26291282fcc3fb95cdc04080dcf84365be33' - '7fd8e776209dac98627453fda754bdf9aff4a09f27cb0b3766d7983612eb3c74' - '31b172eb6a0c635a8d64cc1c2e8181d9f928ee991bd44f6e556d1713b815f8d9' - '87bca363416655bc865fcb2cc0d1532cb010a61d9b9f625e3c15cd12eeee3a59' - '19661ec0d39f9663452b34433214c755179894528bf73a42f6ba52ccf572832a' - 'cd225e86d72eaf6c31ef3d7b20df397f4cc44ddd04389850691292cdf292b204' - '86414a20225deec084e0e48b35552b3a4eef67f76755b32a10febb7b6308dcb7' - '9fad4a40449e09522899955762c8928ae17f4cdaa16e01239fd12592e9d58177' - '965a517a283f265a012545fbb5cc9e516efc9f6166d2aa1baf7293a32a1086b7' - 'f5dbff4833a2e3ca94c202e5197894d5f1006c689ff149355353e77d2e17c943' - '49262ce4a8089fa70275aad742fc914baa28d9c384f710c9a62f64796d13e104' - 
'98311deeb474b39e821cd1e64198793d5c4d797155b3b8bbcb1938b7f11e8d74') - -export KBUILD_BUILD_HOST=archlinux -export KBUILD_BUILD_USER=$pkgbase -export KBUILD_BUILD_TIMESTAMP="$(date -Ru${SOURCE_DATE_EPOCH:+d @$SOURCE_DATE_EPOCH})" - -prepare() { - rm -rf $pkgdir # Nuke the entire pkg folder so it'll get regenerated clean on next build - - ln -s "${_where}/customization.cfg" "${srcdir}" # workaround - - cd "${srcdir}/${_srcpath}" - - _tkg_srcprep -} - -build() { - cd "${srcdir}/${_srcpath}" - - # Use custom compiler paths if defined - if [ -n "${CUSTOM_GCC_PATH}" ]; then - PATH=${CUSTOM_GCC_PATH}/bin:${CUSTOM_GCC_PATH}/lib:${CUSTOM_GCC_PATH}/include:${PATH} - fi - - if [ "$_force_all_threads" = "true" ]; then - _force_all_threads="-j$((`nproc`*2))" - else - _force_all_threads="${MAKEFLAGS}" - fi - - # ccache - if [ "$_noccache" != "true" ] && pacman -Qq ccache &> /dev/null; then - export PATH="/usr/lib/ccache/bin/:$PATH" - export CCACHE_SLOPPINESS="file_macro,locale,time_macros" - export CCACHE_NOHASHDIR="true" - msg2 'ccache was found and will be used' - fi - - # document the TkG variables, excluding "_", "_EXT_CONFIG_PATH", and "_where". - declare -p | cut -d ' ' -f 3 | grep -P '^_(?!=|EXT_CONFIG_PATH|where)' > "${srcdir}/customization-full.cfg" - - # build! - _runtime=$( time ( schedtool -B -n 1 -e ionice -n 1 make ${_force_all_threads} LOCALVERSION= bzImage modules 2>&1 ) 3>&1 1>&2 2>&3 ) || _runtime=$( time ( make ${_force_all_threads} LOCALVERSION= bzImage modules 2>&1 ) 3>&1 1>&2 2>&3 ) -} - -hackbase() { - pkgdesc="The $pkgdesc kernel and modules" - depends=('coreutils' 'kmod' 'initramfs') - optdepends=('linux-docs: Kernel hackers manual - HTML documentation that comes with the Linux kernel.' - 'crda: to set the correct wireless channels of your country.' - 'linux-firmware: Firmware files for Linux' - 'modprobed-db: Keeps track of EVERY kernel module that has ever been probed. Useful for make localmodconfig.' 
- 'nvidia-tkg: NVIDIA drivers for all installed kernels - non-dkms version.' - 'nvidia-dkms-tkg: NVIDIA drivers for all installed kernels - dkms version.' - 'update-grub: Simple wrapper around grub-mkconfig.') - provides=("linux=${pkgver}" "${pkgbase}" VIRTUALBOX-GUEST-MODULES WIREGUARD-MODULE) - replaces=(virtualbox-guest-modules-arch wireguard-arch) - - cd "${srcdir}/${_srcpath}" - - # get kernel version - local _kernver="$(\033[1;0m \033[1;1m$1\033[1;0m" >&2 -} - -error() { - echo -e " \033[1;31m==> ERROR: $1\033[1;0m" >&2 -} - -warning() { - echo -e " \033[1;33m==> WARNING: $1\033[1;0m" >&2 -} - -plain() { - echo "$1" >&2 -} - -# Stop the script at any ecountered error -set -e - -_where=`pwd` -srcdir="$_where" - -source linux*-tkg-config/prepare - -_cpu_opt_patch_link="https://raw.githubusercontent.com/graysky2/kernel_gcc_patch/master/enable_additional_cpu_optimizations_for_gcc_v10.1%2B_kernel_v${_basekernel}%2B.patch" - -source customization.cfg - -if [ "$1" != "install" ] && [ "$1" != "config" ] && [ "$1" != "uninstall-help" ]; then - msg2 "Argument not recognised, options are: - - config : shallow clones the linux ${_basekernel}.x git tree into the folder linux-${_basekernel}, then applies on it the extra patches and prepares the .config file - by copying the one from the current linux system in /boot/config-`uname -r` and updates it. - - install : [RPM and DEB based distros only], does the config step, proceeds to compile, then prompts to install - - uninstall-help : [RPM and DEB based distros only], lists the installed kernels in this system, then gives a hint on how to uninstall them manually." - exit 0 -fi - -# Load external configuration file if present. Available variable values will overwrite customization.cfg ones. -if [ -e "$_EXT_CONFIG_PATH" ]; then - msg2 "External configuration file $_EXT_CONFIG_PATH will be used and will override customization.cfg values." 
- source "$_EXT_CONFIG_PATH" -fi - -_misc_adds="false" # We currently don't want this enabled on non-Arch - -if [ "$1" = "install" ] || [ "$1" = "config" ]; then - - if [ -z $_distro ] && [ "$1" = "install" ]; then - while true; do - echo "Which linux distribution are you running ?" - echo "if it's not on the list, chose the closest one to it: Fedora/Suse for RPM, Ubuntu/Debian for DEB" - echo " 1) Debian" - echo " 2) Fedora" - echo " 3) Suse" - echo " 4) Ubuntu" - read -p "[1-4]: " _distro_index - - if [ "$_distro_index" = "1" ]; then - _distro="Debian" - break - elif [ "$_distro_index" = "2" ]; then - _distro="Fedora" - break - elif [ "$_distro_index" = "3" ]; then - _distro="Suse" - break - elif [ "$_distro_index" = "4" ]; then - _distro="Ubuntu" - break - else - echo "Wrong index." - fi - done - fi - - if [[ $1 = "install" && "$_distro" != "Ubuntu" && "$_distro" != "Debian" && "$_distro" != "Fedora" && "$_distro" != "Suse" ]]; then - msg2 "Variable \"_distro\" in \"customization.cfg\" hasn't been set to \"Ubuntu\", \"Debian\", \"Fedora\" or \"Suse\"" - msg2 "This script can only install custom kernels for RPM and DEB based distros, though only those keywords are permitted. Exiting..." 
- exit 0 - fi - - if [ "$_distro" = "Ubuntu" ] || [ "$_distro" = "Debian" ]; then - msg2 "Installing dependencies" - sudo apt install git build-essential kernel-package fakeroot libncurses5-dev libssl-dev ccache bison flex qtbase5-dev -y - elif [ "$_distro" = "Fedora" ]; then - msg2 "Installing dependencies" - sudo dnf install fedpkg fedora-packager rpmdevtools ncurses-devel pesign grubby qt5-devel libXi-devel gcc-c++ git ccache flex bison elfutils-libelf-devel openssl-devel dwarves rpm-build -y - elif [ "$_distro" = "Suse" ]; then - msg2 "Installing dependencies" - sudo zypper install -y rpmdevtools ncurses-devel pesign libXi-devel gcc-c++ git ccache flex bison elfutils libelf-devel openssl-devel dwarves make patch bc rpm-build libqt5-qtbase-common-devel libqt5-qtbase-devel lz4 - fi - - # Force prepare script to avoid Arch specific commands if the user is using `config` - if [ "$1" = "config" ]; then - _distro="" - fi - - if [ -d linux-${_basekernel}.orig ]; then - rm -rf linux-${_basekernel}.orig - fi - - if [ -d linux-${_basekernel} ]; then - msg2 "Reseting files in linux-$_basekernel to their original state and getting latest updates" - cd "$_where"/linux-${_basekernel} - git checkout --force linux-$_basekernel.y - git clean -f -d -x - git pull - msg2 "Done" - cd "$_where" - else - msg2 "Shallow git cloning linux $_basekernel" - git clone --branch linux-$_basekernel.y --single-branch --depth=1 https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git linux-${_basekernel} - msg2 "Done" - fi - - # Define current kernel subversion - if [ -z $_kernel_subver ]; then - cd "$_where"/linux-${_basekernel} - _kernelverstr=`git describe` - _kernel_subver=${_kernelverstr:5} - cd "$_where" - fi - - - # Run init script that is also run in PKGBUILD, it will define some env vars that we will use - _tkg_initscript - - cd "$_where" - msg2 "Downloading Graysky2's CPU optimisations patch" - wget "$_cpu_opt_patch_link" - - # Follow Ubuntu install isntructions in 
https://wiki.ubuntu.com/KernelTeam/GitKernelBuild - - # cd in linux folder, copy Ubuntu's current config file, update with new params - cd "$_where"/linux-${_basekernel} - - msg2 "Copying current kernel's config and running make oldconfig..." - cp /boot/config-`uname -r` .config - if [ "$_distro" = "Debian" ]; then #Help Debian cert problem. - sed -i -e 's#CONFIG_SYSTEM_TRUSTED_KEYS="debian/certs/test-signing-certs.pem"#CONFIG_SYSTEM_TRUSTED_KEYS=""#g' .config - sed -i -e 's#CONFIG_SYSTEM_TRUSTED_KEYS="debian/certs/debian-uefi-certs.pem"#CONFIG_SYSTEM_TRUSTED_KEYS=""#g' .config - fi - yes '' | make oldconfig - msg2 "Done" - - # apply linux-tkg patching script - _tkg_srcprep - - msg2 "Configuration done." -fi - -if [ "$1" = "install" ]; then - - # Use custom compiler paths if defined - if [ -n "${CUSTOM_GCC_PATH}" ]; then - PATH=${CUSTOM_GCC_PATH}/bin:${CUSTOM_GCC_PATH}/lib:${CUSTOM_GCC_PATH}/include:${PATH} - fi - - if [ "$_force_all_threads" = "true" ]; then - _thread_num=`nproc` - else - _thread_num=`expr \`nproc\` / 4` - if [ "$_thread_num" = "0" ]; then - _thread_num=1 - fi - fi - - # ccache - if [ "$_noccache" != "true" ]; then - - if [ "$_distro" = "Ubuntu" ] || [ "$_distro" = "Debian" ]; then - export PATH="/usr/lib/ccache/bin/:$PATH" - elif [ "$_distro" = "Fedora" ] || [ "$_distro" = "Suse" ]; then - export PATH="/usr/lib64/ccache/:$PATH" - fi - - export CCACHE_SLOPPINESS="file_macro,locale,time_macros" - export CCACHE_NOHASHDIR="true" - msg2 'ccache was found and will be used' - - fi - - if [ -z $_kernel_localversion ]; then - _kernel_flavor="tkg-${_cpusched}" - else - _kernel_flavor="tkg-${_kernel_localversion}" - fi - - if [ "$_distro" = "Ubuntu" ] || [ "$_distro" = "Debian" ]; then - - if make -j ${_thread_num} deb-pkg LOCALVERSION=-${_kernel_flavor}; then - msg2 "Building successfully finished!" 
- - cd "$_where" - - # Create DEBS folder if it doesn't exist - mkdir -p DEBS - - # Move rpm files to RPMS folder inside the linux-tkg folder - mv "$_where"/*.deb "$_where"/DEBS/ - - read -p "Do you want to install the new Kernel ? y/[n]: " _install - if [[ $_install =~ [yY] ]] || [ $_install = "yes" ] || [ $_install = "Yes" ]; then - cd "$_where" - _kernelname=$_basekernel.$_kernel_subver-$_kernel_flavor - _headers_deb="linux-headers-${_kernelname}*.deb" - _image_deb="linux-image-${_kernelname}_*.deb" - _kernel_devel_deb="linux-libc-dev_${_kernelname}*.deb" - - cd DEBS - sudo dpkg -i $_headers_deb $_image_deb $_kernel_devel_deb - fi - fi - - elif [[ "$_distro" = "Fedora" || "$_distro" = "Suse" ]]; then - - # Replace dashes with underscores, it seems that it's being done by binrpm-pkg - # Se we can actually refer properly to the rpm files. - _kernel_flavor=${_kernel_flavor//-/_} - - if make -j ${_thread_num} rpm-pkg EXTRAVERSION="_${_kernel_flavor}"; then - msg2 "Building successfully finished!" - - cd "$_where" - - # Create RPMS folder if it doesn't exist - mkdir -p RPMS - - # Move rpm files to RPMS folder inside the linux-tkg folder - mv ~/rpmbuild/RPMS/x86_64/* "$_where"/RPMS/ - - #Clean up the original folder, unneeded and takes a lot of space - rm -rf ~/rpmbuild/ - - read -p "Do you want to install the new Kernel ? 
y/[n]: " _install - if [ "$_install" = "y" ] || [ "$_install" = "Y" ] || [ "$_install" = "yes" ] || [ "$_install" = "Yes" ]; then - - _kernelname=$_basekernel.${_kernel_subver}_$_kernel_flavor - _headers_rpm="kernel-headers-${_kernelname}*.rpm" - _kernel_rpm="kernel-${_kernelname}*.rpm" - _kernel_devel_rpm="kernel-devel-${_kernelname}*.rpm" - - cd RPMS - if [ "$_distro" = "Fedora" ]; then - sudo dnf install $_headers_rpm $_kernel_rpm $_kernel_devel_rpm - elif [ "$_distro" = "Suse" ]; then - msg2 "Some files from 'linux-glibc-devel' will be replaced by files from the custom kernel-hearders package" - msg2 "To revert back to the original kernel headers do 'sudo zypper install -f linux-glibc-devel'" - sudo zypper install --replacefiles --allow-unsigned-rpm $_headers_rpm $_kernel_rpm $_kernel_devel_rpm - fi - - msg2 "Install successful" - fi - fi - fi -fi - -if [ "$1" = "uninstall-help" ]; then - - cd "$_where" - msg2 "List of installed custom tkg kernels: " - - if [ "$_distro" = "Ubuntu" ]; then - dpkg -l "*tkg*" | grep "linux.*tkg" - dpkg -l "*linux-libc-dev*" | grep "linux.*tkg" - msg2 "To uninstall a version, you should remove the linux-image, linux-headers and linux-libc-dev associated to it (if installed), with: " - msg2 " sudo apt remove linux-image-VERSION linux-headers-VERSION linux-libc-dev-VERSION" - msg2 " where VERSION is displayed in the lists above, uninstall only versions that have \"tkg\" in its name" - elif [ "$_distro" = "Fedora" ]; then - dnf list --installed kernel* - msg2 "To uninstall a version, you should remove the kernel, kernel-headers and kernel-devel associated to it (if installed), with: " - msg2 " sudo dnf remove --noautoremove kernel-VERSION kernel-devel-VERSION kernel-headers-VERSION" - msg2 " where VERSION is displayed in the second column" - elif [ "$_distro" = "Suse" ]; then - zypper packages --installed-only | grep "kernel.*tkg" - msg2 "To uninstall a version, you should remove the kernel, kernel-headers and kernel-devel associated 
to it (if installed), with: " - msg2 " sudo zypper remove --no-clean-deps kernel-VERSION kernel-devel-VERSION kernel-headers-VERSION" - msg2 " where VERSION is displayed in the second to last column" - fi - -fi diff --git a/linux58-tkg/linux58-tkg-config/90-cleanup.hook b/linux58-tkg/linux58-tkg-config/90-cleanup.hook deleted file mode 100644 index 99f5221..0000000 --- a/linux58-tkg/linux58-tkg-config/90-cleanup.hook +++ /dev/null @@ -1,14 +0,0 @@ -[Trigger] -Type = File -Operation = Install -Operation = Upgrade -Operation = Remove -Target = usr/lib/modules/*/ -Target = !usr/lib/modules/*/?* - -[Action] -Description = Cleaning up... -When = PostTransaction -Exec = /usr/share/libalpm/scripts/cleanup -NeedsTargets - diff --git a/linux58-tkg/linux58-tkg-config/cleanup b/linux58-tkg/linux58-tkg-config/cleanup deleted file mode 100755 index c00c08d..0000000 --- a/linux58-tkg/linux58-tkg-config/cleanup +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -for _f in /usr/lib/modules/*tkg*; do - if [[ ! -e ${_f}/vmlinuz ]]; then - rm -rf "$_f" - fi -done - -# vim:set ft=sh sw=2 et: - diff --git a/linux58-tkg/linux58-tkg-config/config.x86_64 b/linux58-tkg/linux58-tkg-config/config.x86_64 deleted file mode 100644 index 5cd21b9..0000000 --- a/linux58-tkg/linux58-tkg-config/config.x86_64 +++ /dev/null @@ -1,11020 +0,0 @@ -# -# Automatically generated file; DO NOT EDIT. 
-# Linux/x86 5.8.6-arch1 Kernel Configuration -# -CONFIG_CC_VERSION_TEXT="gcc (GCC) 10.2.0" -CONFIG_CC_IS_GCC=y -CONFIG_GCC_VERSION=100200 -CONFIG_LD_VERSION=235000000 -CONFIG_CLANG_VERSION=0 -CONFIG_CC_CAN_LINK=y -CONFIG_CC_CAN_LINK_STATIC=y -CONFIG_CC_HAS_ASM_GOTO=y -CONFIG_CC_HAS_ASM_INLINE=y -CONFIG_IRQ_WORK=y -CONFIG_BUILDTIME_TABLE_SORT=y -CONFIG_THREAD_INFO_IN_TASK=y - -# -# General setup -# -CONFIG_INIT_ENV_ARG_LIMIT=32 -# CONFIG_COMPILE_TEST is not set -CONFIG_LOCALVERSION="" -CONFIG_LOCALVERSION_AUTO=y -CONFIG_BUILD_SALT="" -CONFIG_HAVE_KERNEL_GZIP=y -CONFIG_HAVE_KERNEL_BZIP2=y -CONFIG_HAVE_KERNEL_LZMA=y -CONFIG_HAVE_KERNEL_XZ=y -CONFIG_HAVE_KERNEL_LZO=y -CONFIG_HAVE_KERNEL_LZ4=y -# CONFIG_KERNEL_GZIP is not set -# CONFIG_KERNEL_BZIP2 is not set -# CONFIG_KERNEL_LZMA is not set -CONFIG_KERNEL_XZ=y -# CONFIG_KERNEL_LZO is not set -# CONFIG_KERNEL_LZ4 is not set -CONFIG_DEFAULT_INIT="" -CONFIG_DEFAULT_HOSTNAME="archlinux" -CONFIG_SWAP=y -CONFIG_SYSVIPC=y -CONFIG_SYSVIPC_SYSCTL=y -CONFIG_POSIX_MQUEUE=y -CONFIG_POSIX_MQUEUE_SYSCTL=y -CONFIG_WATCH_QUEUE=y -CONFIG_CROSS_MEMORY_ATTACH=y -# CONFIG_USELIB is not set -CONFIG_AUDIT=y -CONFIG_HAVE_ARCH_AUDITSYSCALL=y -CONFIG_AUDITSYSCALL=y - -# -# IRQ subsystem -# -CONFIG_GENERIC_IRQ_PROBE=y -CONFIG_GENERIC_IRQ_SHOW=y -CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK=y -CONFIG_GENERIC_PENDING_IRQ=y -CONFIG_GENERIC_IRQ_MIGRATION=y -CONFIG_HARDIRQS_SW_RESEND=y -CONFIG_GENERIC_IRQ_CHIP=y -CONFIG_IRQ_DOMAIN=y -CONFIG_IRQ_SIM=y -CONFIG_IRQ_DOMAIN_HIERARCHY=y -CONFIG_GENERIC_MSI_IRQ=y -CONFIG_GENERIC_MSI_IRQ_DOMAIN=y -CONFIG_IRQ_MSI_IOMMU=y -CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR=y -CONFIG_GENERIC_IRQ_RESERVATION_MODE=y -CONFIG_IRQ_FORCED_THREADING=y -CONFIG_SPARSE_IRQ=y -# CONFIG_GENERIC_IRQ_DEBUGFS is not set -# end of IRQ subsystem - -CONFIG_CLOCKSOURCE_WATCHDOG=y -CONFIG_ARCH_CLOCKSOURCE_INIT=y -CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE=y -CONFIG_GENERIC_TIME_VSYSCALL=y -CONFIG_GENERIC_CLOCKEVENTS=y 
-CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y -CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST=y -CONFIG_GENERIC_CMOS_UPDATE=y - -# -# Timers subsystem -# -CONFIG_TICK_ONESHOT=y -CONFIG_NO_HZ_COMMON=y -# CONFIG_HZ_PERIODIC is not set -CONFIG_NO_HZ_IDLE=y -# CONFIG_NO_HZ_FULL is not set -CONFIG_NO_HZ=y -CONFIG_HIGH_RES_TIMERS=y -# end of Timers subsystem - -# CONFIG_PREEMPT_NONE is not set -# CONFIG_PREEMPT_VOLUNTARY is not set -CONFIG_PREEMPT=y -CONFIG_PREEMPT_COUNT=y -CONFIG_PREEMPTION=y - -# -# CPU/Task time and stats accounting -# -CONFIG_TICK_CPU_ACCOUNTING=y -# CONFIG_VIRT_CPU_ACCOUNTING_GEN is not set -CONFIG_IRQ_TIME_ACCOUNTING=y -CONFIG_HAVE_SCHED_AVG_IRQ=y -# CONFIG_SCHED_THERMAL_PRESSURE is not set -CONFIG_BSD_PROCESS_ACCT=y -CONFIG_BSD_PROCESS_ACCT_V3=y -CONFIG_TASKSTATS=y -CONFIG_TASK_DELAY_ACCT=y -CONFIG_TASK_XACCT=y -CONFIG_TASK_IO_ACCOUNTING=y -CONFIG_PSI=y -# CONFIG_PSI_DEFAULT_DISABLED is not set -# end of CPU/Task time and stats accounting - -CONFIG_CPU_ISOLATION=y - -# -# RCU Subsystem -# -CONFIG_TREE_RCU=y -CONFIG_PREEMPT_RCU=y -CONFIG_RCU_EXPERT=y -CONFIG_SRCU=y -CONFIG_TREE_SRCU=y -CONFIG_TASKS_RCU_GENERIC=y -CONFIG_TASKS_RCU=y -CONFIG_TASKS_RUDE_RCU=y -CONFIG_RCU_STALL_COMMON=y -CONFIG_RCU_NEED_SEGCBLIST=y -CONFIG_RCU_FANOUT=64 -CONFIG_RCU_FANOUT_LEAF=16 -CONFIG_RCU_FAST_NO_HZ=y -CONFIG_RCU_BOOST=y -CONFIG_RCU_BOOST_DELAY=500 -# CONFIG_RCU_NOCB_CPU is not set -# CONFIG_TASKS_TRACE_RCU_READ_MB is not set -# end of RCU Subsystem - -CONFIG_BUILD_BIN2C=y -CONFIG_IKCONFIG=y -CONFIG_IKCONFIG_PROC=y -# CONFIG_IKHEADERS is not set -CONFIG_LOG_BUF_SHIFT=17 -CONFIG_LOG_CPU_MAX_BUF_SHIFT=12 -CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT=13 -CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y - -# -# Scheduler features -# -CONFIG_UCLAMP_TASK=y -CONFIG_UCLAMP_BUCKETS_COUNT=5 -# end of Scheduler features - -CONFIG_ARCH_SUPPORTS_NUMA_BALANCING=y -CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH=y -CONFIG_CC_HAS_INT128=y -CONFIG_ARCH_SUPPORTS_INT128=y -CONFIG_NUMA_BALANCING=y 
-CONFIG_NUMA_BALANCING_DEFAULT_ENABLED=y -CONFIG_CGROUPS=y -CONFIG_PAGE_COUNTER=y -CONFIG_MEMCG=y -CONFIG_MEMCG_SWAP=y -CONFIG_MEMCG_KMEM=y -CONFIG_BLK_CGROUP=y -CONFIG_CGROUP_WRITEBACK=y -CONFIG_CGROUP_SCHED=y -CONFIG_FAIR_GROUP_SCHED=y -CONFIG_CFS_BANDWIDTH=y -# CONFIG_RT_GROUP_SCHED is not set -CONFIG_UCLAMP_TASK_GROUP=y -CONFIG_CGROUP_PIDS=y -CONFIG_CGROUP_RDMA=y -CONFIG_CGROUP_FREEZER=y -CONFIG_CGROUP_HUGETLB=y -CONFIG_CPUSETS=y -CONFIG_PROC_PID_CPUSET=y -CONFIG_CGROUP_DEVICE=y -CONFIG_CGROUP_CPUACCT=y -CONFIG_CGROUP_PERF=y -CONFIG_CGROUP_BPF=y -# CONFIG_CGROUP_DEBUG is not set -CONFIG_SOCK_CGROUP_DATA=y -CONFIG_NAMESPACES=y -CONFIG_UTS_NS=y -CONFIG_TIME_NS=y -CONFIG_IPC_NS=y -CONFIG_USER_NS=y -CONFIG_USER_NS_UNPRIVILEGED=y -CONFIG_PID_NS=y -CONFIG_NET_NS=y -CONFIG_CHECKPOINT_RESTORE=y -CONFIG_SCHED_AUTOGROUP=y -# CONFIG_SYSFS_DEPRECATED is not set -CONFIG_RELAY=y -CONFIG_BLK_DEV_INITRD=y -CONFIG_INITRAMFS_SOURCE="" -CONFIG_RD_GZIP=y -CONFIG_RD_BZIP2=y -CONFIG_RD_LZMA=y -CONFIG_RD_XZ=y -CONFIG_RD_LZO=y -CONFIG_RD_LZ4=y -CONFIG_BOOT_CONFIG=y -CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y -# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set -CONFIG_SYSCTL=y -CONFIG_HAVE_UID16=y -CONFIG_SYSCTL_EXCEPTION_TRACE=y -CONFIG_HAVE_PCSPKR_PLATFORM=y -CONFIG_BPF=y -CONFIG_EXPERT=y -# CONFIG_UID16 is not set -CONFIG_MULTIUSER=y -CONFIG_SGETMASK_SYSCALL=y -# CONFIG_SYSFS_SYSCALL is not set -CONFIG_FHANDLE=y -CONFIG_POSIX_TIMERS=y -CONFIG_PRINTK=y -CONFIG_PRINTK_NMI=y -CONFIG_BUG=y -CONFIG_ELF_CORE=y -CONFIG_PCSPKR_PLATFORM=y -CONFIG_BASE_FULL=y -CONFIG_FUTEX=y -CONFIG_FUTEX_PI=y -CONFIG_EPOLL=y -CONFIG_SIGNALFD=y -CONFIG_TIMERFD=y -CONFIG_EVENTFD=y -CONFIG_SHMEM=y -CONFIG_AIO=y -CONFIG_IO_URING=y -CONFIG_ADVISE_SYSCALLS=y -CONFIG_HAVE_ARCH_USERFAULTFD_WP=y -CONFIG_MEMBARRIER=y -CONFIG_KALLSYMS=y -CONFIG_KALLSYMS_ALL=y -CONFIG_KALLSYMS_ABSOLUTE_PERCPU=y -CONFIG_KALLSYMS_BASE_RELATIVE=y -CONFIG_BPF_LSM=y -CONFIG_BPF_SYSCALL=y -CONFIG_ARCH_WANT_DEFAULT_BPF_JIT=y -CONFIG_BPF_JIT_ALWAYS_ON=y 
-CONFIG_BPF_JIT_DEFAULT_ON=y -CONFIG_USERFAULTFD=y -CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE=y -CONFIG_RSEQ=y -# CONFIG_DEBUG_RSEQ is not set -# CONFIG_EMBEDDED is not set -CONFIG_HAVE_PERF_EVENTS=y -# CONFIG_PC104 is not set - -# -# Kernel Performance Events And Counters -# -CONFIG_PERF_EVENTS=y -# CONFIG_DEBUG_PERF_USE_VMALLOC is not set -# end of Kernel Performance Events And Counters - -CONFIG_VM_EVENT_COUNTERS=y -CONFIG_SLUB_DEBUG=y -# CONFIG_SLUB_MEMCG_SYSFS_ON is not set -# CONFIG_COMPAT_BRK is not set -# CONFIG_SLAB is not set -CONFIG_SLUB=y -# CONFIG_SLOB is not set -CONFIG_SLAB_MERGE_DEFAULT=y -CONFIG_SLAB_FREELIST_RANDOM=y -CONFIG_SLAB_FREELIST_HARDENED=y -CONFIG_SHUFFLE_PAGE_ALLOCATOR=y -CONFIG_SLUB_CPU_PARTIAL=y -CONFIG_SYSTEM_DATA_VERIFICATION=y -CONFIG_PROFILING=y -CONFIG_TRACEPOINTS=y -# end of General setup - -CONFIG_64BIT=y -CONFIG_X86_64=y -CONFIG_X86=y -CONFIG_INSTRUCTION_DECODER=y -CONFIG_OUTPUT_FORMAT="elf64-x86-64" -CONFIG_LOCKDEP_SUPPORT=y -CONFIG_STACKTRACE_SUPPORT=y -CONFIG_MMU=y -CONFIG_ARCH_MMAP_RND_BITS_MIN=28 -CONFIG_ARCH_MMAP_RND_BITS_MAX=32 -CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN=8 -CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX=16 -CONFIG_GENERIC_ISA_DMA=y -CONFIG_GENERIC_BUG=y -CONFIG_GENERIC_BUG_RELATIVE_POINTERS=y -CONFIG_ARCH_MAY_HAVE_PC_FDC=y -CONFIG_GENERIC_CALIBRATE_DELAY=y -CONFIG_ARCH_HAS_CPU_RELAX=y -CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y -CONFIG_ARCH_HAS_FILTER_PGPROT=y -CONFIG_HAVE_SETUP_PER_CPU_AREA=y -CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y -CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y -CONFIG_ARCH_HIBERNATION_POSSIBLE=y -CONFIG_ARCH_SUSPEND_POSSIBLE=y -CONFIG_ARCH_WANT_GENERAL_HUGETLB=y -CONFIG_ZONE_DMA32=y -CONFIG_AUDIT_ARCH=y -CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y -CONFIG_HAVE_INTEL_TXT=y -CONFIG_X86_64_SMP=y -CONFIG_ARCH_SUPPORTS_UPROBES=y -CONFIG_FIX_EARLYCON_MEM=y -CONFIG_DYNAMIC_PHYSICAL_MASK=y -CONFIG_PGTABLE_LEVELS=5 -CONFIG_CC_HAS_SANE_STACKPROTECTOR=y - -# -# Processor type and features -# -CONFIG_ZONE_DMA=y -CONFIG_SMP=y 
-CONFIG_X86_FEATURE_NAMES=y -CONFIG_X86_X2APIC=y -CONFIG_X86_MPPARSE=y -# CONFIG_GOLDFISH is not set -CONFIG_RETPOLINE=y -CONFIG_X86_CPU_RESCTRL=y -# CONFIG_X86_EXTENDED_PLATFORM is not set -CONFIG_X86_INTEL_LPSS=y -CONFIG_X86_AMD_PLATFORM_DEVICE=y -CONFIG_IOSF_MBI=y -# CONFIG_IOSF_MBI_DEBUG is not set -CONFIG_X86_SUPPORTS_MEMORY_FAILURE=y -CONFIG_SCHED_OMIT_FRAME_POINTER=y -CONFIG_HYPERVISOR_GUEST=y -CONFIG_PARAVIRT=y -CONFIG_PARAVIRT_XXL=y -# CONFIG_PARAVIRT_DEBUG is not set -CONFIG_PARAVIRT_SPINLOCKS=y -CONFIG_X86_HV_CALLBACK_VECTOR=y -CONFIG_XEN=y -CONFIG_XEN_PV=y -CONFIG_XEN_PV_SMP=y -CONFIG_XEN_DOM0=y -CONFIG_XEN_PVHVM=y -CONFIG_XEN_PVHVM_SMP=y -CONFIG_XEN_512GB=y -CONFIG_XEN_SAVE_RESTORE=y -# CONFIG_XEN_DEBUG_FS is not set -CONFIG_XEN_PVH=y -CONFIG_KVM_GUEST=y -CONFIG_ARCH_CPUIDLE_HALTPOLL=y -CONFIG_PVH=y -CONFIG_PARAVIRT_TIME_ACCOUNTING=y -CONFIG_PARAVIRT_CLOCK=y -CONFIG_JAILHOUSE_GUEST=y -CONFIG_ACRN_GUEST=y -# CONFIG_MK8 is not set -# CONFIG_MPSC is not set -# CONFIG_MCORE2 is not set -# CONFIG_MATOM is not set -CONFIG_GENERIC_CPU=y -CONFIG_X86_INTERNODE_CACHE_SHIFT=6 -CONFIG_X86_L1_CACHE_SHIFT=6 -CONFIG_X86_TSC=y -CONFIG_X86_CMPXCHG64=y -CONFIG_X86_CMOV=y -CONFIG_X86_MINIMUM_CPU_FAMILY=64 -CONFIG_X86_DEBUGCTLMSR=y -CONFIG_IA32_FEAT_CTL=y -CONFIG_X86_VMX_FEATURE_NAMES=y -CONFIG_PROCESSOR_SELECT=y -CONFIG_CPU_SUP_INTEL=y -CONFIG_CPU_SUP_AMD=y -CONFIG_CPU_SUP_HYGON=y -CONFIG_CPU_SUP_CENTAUR=y -CONFIG_CPU_SUP_ZHAOXIN=y -CONFIG_HPET_TIMER=y -CONFIG_HPET_EMULATE_RTC=y -CONFIG_DMI=y -CONFIG_GART_IOMMU=y -# CONFIG_MAXSMP is not set -CONFIG_NR_CPUS_RANGE_BEGIN=2 -CONFIG_NR_CPUS_RANGE_END=512 -CONFIG_NR_CPUS_DEFAULT=64 -CONFIG_NR_CPUS=320 -CONFIG_SCHED_SMT=y -CONFIG_SCHED_MC=y -CONFIG_SCHED_MC_PRIO=y -CONFIG_X86_LOCAL_APIC=y -CONFIG_X86_IO_APIC=y -CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y -CONFIG_X86_MCE=y -# CONFIG_X86_MCELOG_LEGACY is not set -CONFIG_X86_MCE_INTEL=y -CONFIG_X86_MCE_AMD=y -CONFIG_X86_MCE_THRESHOLD=y -CONFIG_X86_MCE_INJECT=m 
-CONFIG_X86_THERMAL_VECTOR=y - -# -# Performance monitoring -# -CONFIG_PERF_EVENTS_INTEL_UNCORE=m -CONFIG_PERF_EVENTS_INTEL_RAPL=m -CONFIG_PERF_EVENTS_INTEL_CSTATE=m -CONFIG_PERF_EVENTS_AMD_POWER=m -# end of Performance monitoring - -CONFIG_X86_16BIT=y -CONFIG_X86_ESPFIX64=y -CONFIG_X86_VSYSCALL_EMULATION=y -CONFIG_X86_IOPL_IOPERM=y -CONFIG_I8K=m -CONFIG_MICROCODE=y -CONFIG_MICROCODE_INTEL=y -CONFIG_MICROCODE_AMD=y -CONFIG_MICROCODE_OLD_INTERFACE=y -CONFIG_X86_MSR=m -CONFIG_X86_CPUID=m -CONFIG_X86_5LEVEL=y -CONFIG_X86_DIRECT_GBPAGES=y -# CONFIG_X86_CPA_STATISTICS is not set -CONFIG_AMD_MEM_ENCRYPT=y -# CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT is not set -CONFIG_NUMA=y -CONFIG_AMD_NUMA=y -CONFIG_X86_64_ACPI_NUMA=y -# CONFIG_NUMA_EMU is not set -CONFIG_NODES_SHIFT=5 -CONFIG_ARCH_SPARSEMEM_ENABLE=y -CONFIG_ARCH_SPARSEMEM_DEFAULT=y -CONFIG_ARCH_SELECT_MEMORY_MODEL=y -CONFIG_ARCH_MEMORY_PROBE=y -CONFIG_ARCH_PROC_KCORE_TEXT=y -CONFIG_ILLEGAL_POINTER_VALUE=0xdead000000000000 -CONFIG_X86_PMEM_LEGACY_DEVICE=y -CONFIG_X86_PMEM_LEGACY=m -CONFIG_X86_CHECK_BIOS_CORRUPTION=y -CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y -CONFIG_X86_RESERVE_LOW=64 -CONFIG_MTRR=y -CONFIG_MTRR_SANITIZER=y -CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT=1 -CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT=0 -CONFIG_X86_PAT=y -CONFIG_ARCH_USES_PG_UNCACHED=y -CONFIG_ARCH_RANDOM=y -CONFIG_X86_SMAP=y -CONFIG_X86_UMIP=y -CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS=y -# CONFIG_X86_INTEL_TSX_MODE_OFF is not set -# CONFIG_X86_INTEL_TSX_MODE_ON is not set -CONFIG_X86_INTEL_TSX_MODE_AUTO=y -CONFIG_EFI=y -CONFIG_EFI_STUB=y -CONFIG_EFI_MIXED=y -CONFIG_SECCOMP=y -# CONFIG_HZ_100 is not set -# CONFIG_HZ_250 is not set -CONFIG_HZ_300=y -# CONFIG_HZ_1000 is not set -CONFIG_HZ=300 -CONFIG_SCHED_HRTICK=y -CONFIG_KEXEC=y -CONFIG_KEXEC_FILE=y -CONFIG_ARCH_HAS_KEXEC_PURGATORY=y -# CONFIG_KEXEC_SIG is not set -CONFIG_CRASH_DUMP=y -CONFIG_KEXEC_JUMP=y -CONFIG_PHYSICAL_START=0x1000000 -CONFIG_RELOCATABLE=y -CONFIG_RANDOMIZE_BASE=y 
-CONFIG_X86_NEED_RELOCS=y -CONFIG_PHYSICAL_ALIGN=0x200000 -CONFIG_DYNAMIC_MEMORY_LAYOUT=y -CONFIG_RANDOMIZE_MEMORY=y -CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING=0x1 -CONFIG_HOTPLUG_CPU=y -# CONFIG_BOOTPARAM_HOTPLUG_CPU0 is not set -# CONFIG_DEBUG_HOTPLUG_CPU0 is not set -# CONFIG_COMPAT_VDSO is not set -# CONFIG_LEGACY_VSYSCALL_EMULATE is not set -CONFIG_LEGACY_VSYSCALL_XONLY=y -# CONFIG_LEGACY_VSYSCALL_NONE is not set -# CONFIG_CMDLINE_BOOL is not set -CONFIG_MODIFY_LDT_SYSCALL=y -CONFIG_HAVE_LIVEPATCH=y -# CONFIG_LIVEPATCH is not set -# end of Processor type and features - -CONFIG_ARCH_HAS_ADD_PAGES=y -CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y -CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE=y -CONFIG_USE_PERCPU_NUMA_NODE_ID=y -CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK=y -CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION=y -CONFIG_ARCH_ENABLE_THP_MIGRATION=y - -# -# Power management and ACPI options -# -CONFIG_ARCH_HIBERNATION_HEADER=y -CONFIG_SUSPEND=y -CONFIG_SUSPEND_FREEZER=y -# CONFIG_SUSPEND_SKIP_SYNC is not set -CONFIG_HIBERNATE_CALLBACKS=y -CONFIG_HIBERNATION=y -CONFIG_HIBERNATION_SNAPSHOT_DEV=y -CONFIG_PM_STD_PARTITION="" -CONFIG_PM_SLEEP=y -CONFIG_PM_SLEEP_SMP=y -CONFIG_PM_AUTOSLEEP=y -CONFIG_PM_WAKELOCKS=y -CONFIG_PM_WAKELOCKS_LIMIT=100 -CONFIG_PM_WAKELOCKS_GC=y -CONFIG_PM=y -CONFIG_PM_DEBUG=y -CONFIG_PM_ADVANCED_DEBUG=y -# CONFIG_PM_TEST_SUSPEND is not set -CONFIG_PM_SLEEP_DEBUG=y -# CONFIG_DPM_WATCHDOG is not set -CONFIG_PM_TRACE=y -CONFIG_PM_TRACE_RTC=y -CONFIG_PM_CLK=y -CONFIG_PM_GENERIC_DOMAINS=y -CONFIG_WQ_POWER_EFFICIENT_DEFAULT=y -CONFIG_PM_GENERIC_DOMAINS_SLEEP=y -CONFIG_PM_GENERIC_DOMAINS_OF=y -CONFIG_ENERGY_MODEL=y -CONFIG_ARCH_SUPPORTS_ACPI=y -CONFIG_ACPI=y -CONFIG_ACPI_LEGACY_TABLES_LOOKUP=y -CONFIG_ARCH_MIGHT_HAVE_ACPI_PDC=y -CONFIG_ACPI_SYSTEM_POWER_STATES_SUPPORT=y -# CONFIG_ACPI_DEBUGGER is not set -CONFIG_ACPI_SPCR_TABLE=y -CONFIG_ACPI_LPIT=y -CONFIG_ACPI_SLEEP=y -# CONFIG_ACPI_PROCFS_POWER is not set -CONFIG_ACPI_REV_OVERRIDE_POSSIBLE=y -CONFIG_ACPI_EC_DEBUGFS=y 
-CONFIG_ACPI_AC=m -CONFIG_ACPI_BATTERY=m -CONFIG_ACPI_BUTTON=y -CONFIG_ACPI_VIDEO=y -CONFIG_ACPI_FAN=y -CONFIG_ACPI_TAD=m -CONFIG_ACPI_DOCK=y -CONFIG_ACPI_CPU_FREQ_PSS=y -CONFIG_ACPI_PROCESSOR_CSTATE=y -CONFIG_ACPI_PROCESSOR_IDLE=y -CONFIG_ACPI_CPPC_LIB=y -CONFIG_ACPI_PROCESSOR=y -CONFIG_ACPI_IPMI=m -CONFIG_ACPI_HOTPLUG_CPU=y -CONFIG_ACPI_PROCESSOR_AGGREGATOR=y -CONFIG_ACPI_THERMAL=y -CONFIG_ARCH_HAS_ACPI_TABLE_UPGRADE=y -CONFIG_ACPI_TABLE_UPGRADE=y -CONFIG_ACPI_DEBUG=y -CONFIG_ACPI_PCI_SLOT=y -CONFIG_ACPI_CONTAINER=y -CONFIG_ACPI_HOTPLUG_MEMORY=y -CONFIG_ACPI_HOTPLUG_IOAPIC=y -CONFIG_ACPI_SBS=m -CONFIG_ACPI_HED=y -CONFIG_ACPI_CUSTOM_METHOD=m -CONFIG_ACPI_BGRT=y -# CONFIG_ACPI_REDUCED_HARDWARE_ONLY is not set -CONFIG_ACPI_NFIT=m -# CONFIG_NFIT_SECURITY_DEBUG is not set -CONFIG_ACPI_NUMA=y -CONFIG_ACPI_HMAT=y -CONFIG_HAVE_ACPI_APEI=y -CONFIG_HAVE_ACPI_APEI_NMI=y -CONFIG_ACPI_APEI=y -CONFIG_ACPI_APEI_GHES=y -CONFIG_ACPI_APEI_PCIEAER=y -CONFIG_ACPI_APEI_MEMORY_FAILURE=y -CONFIG_ACPI_APEI_EINJ=m -CONFIG_ACPI_APEI_ERST_DEBUG=m -CONFIG_DPTF_POWER=m -CONFIG_ACPI_WATCHDOG=y -CONFIG_ACPI_EXTLOG=m -CONFIG_ACPI_ADXL=y -CONFIG_PMIC_OPREGION=y -CONFIG_BYTCRC_PMIC_OPREGION=y -CONFIG_CHTCRC_PMIC_OPREGION=y -CONFIG_XPOWER_PMIC_OPREGION=y -CONFIG_BXT_WC_PMIC_OPREGION=y -CONFIG_CHT_WC_PMIC_OPREGION=y -CONFIG_CHT_DC_TI_PMIC_OPREGION=y -CONFIG_ACPI_CONFIGFS=m -CONFIG_TPS68470_PMIC_OPREGION=y -CONFIG_X86_PM_TIMER=y -CONFIG_SFI=y - -# -# CPU Frequency scaling -# -CONFIG_CPU_FREQ=y -CONFIG_CPU_FREQ_GOV_ATTR_SET=y -CONFIG_CPU_FREQ_GOV_COMMON=y -CONFIG_CPU_FREQ_STAT=y -# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set -# CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set -# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set -# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set -# CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set -CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL=y -CONFIG_CPU_FREQ_GOV_PERFORMANCE=y -CONFIG_CPU_FREQ_GOV_POWERSAVE=m -CONFIG_CPU_FREQ_GOV_USERSPACE=m 
-CONFIG_CPU_FREQ_GOV_ONDEMAND=m -CONFIG_CPU_FREQ_GOV_CONSERVATIVE=m -CONFIG_CPU_FREQ_GOV_SCHEDUTIL=y - -# -# CPU frequency scaling drivers -# -CONFIG_CPUFREQ_DT=m -CONFIG_CPUFREQ_DT_PLATDEV=y -CONFIG_X86_INTEL_PSTATE=y -CONFIG_X86_PCC_CPUFREQ=m -CONFIG_X86_ACPI_CPUFREQ=m -CONFIG_X86_ACPI_CPUFREQ_CPB=y -CONFIG_X86_POWERNOW_K8=m -CONFIG_X86_AMD_FREQ_SENSITIVITY=m -# CONFIG_X86_SPEEDSTEP_CENTRINO is not set -CONFIG_X86_P4_CLOCKMOD=m - -# -# shared options -# -CONFIG_X86_SPEEDSTEP_LIB=m -# end of CPU Frequency scaling - -# -# CPU Idle -# -CONFIG_CPU_IDLE=y -CONFIG_CPU_IDLE_GOV_LADDER=y -CONFIG_CPU_IDLE_GOV_MENU=y -CONFIG_CPU_IDLE_GOV_TEO=y -CONFIG_CPU_IDLE_GOV_HALTPOLL=y -CONFIG_HALTPOLL_CPUIDLE=m -# end of CPU Idle - -CONFIG_INTEL_IDLE=y -# end of Power management and ACPI options - -# -# Bus options (PCI etc.) -# -CONFIG_PCI_DIRECT=y -CONFIG_PCI_MMCONFIG=y -CONFIG_PCI_XEN=y -CONFIG_MMCONF_FAM10H=y -# CONFIG_PCI_CNB20LE_QUIRK is not set -# CONFIG_ISA_BUS is not set -CONFIG_ISA_DMA_API=y -CONFIG_AMD_NB=y -# CONFIG_X86_SYSFB is not set -# end of Bus options (PCI etc.) 
- -# -# Binary Emulations -# -CONFIG_IA32_EMULATION=y -# CONFIG_X86_X32 is not set -CONFIG_COMPAT_32=y -CONFIG_COMPAT=y -CONFIG_COMPAT_FOR_U64_ALIGNMENT=y -CONFIG_SYSVIPC_COMPAT=y -# end of Binary Emulations - -# -# Firmware Drivers -# -CONFIG_EDD=m -# CONFIG_EDD_OFF is not set -CONFIG_FIRMWARE_MEMMAP=y -CONFIG_DMIID=y -CONFIG_DMI_SYSFS=m -CONFIG_DMI_SCAN_MACHINE_NON_EFI_FALLBACK=y -CONFIG_ISCSI_IBFT_FIND=y -CONFIG_ISCSI_IBFT=m -CONFIG_FW_CFG_SYSFS=m -# CONFIG_FW_CFG_SYSFS_CMDLINE is not set -CONFIG_GOOGLE_FIRMWARE=y -# CONFIG_GOOGLE_SMI is not set -CONFIG_GOOGLE_COREBOOT_TABLE=m -CONFIG_GOOGLE_MEMCONSOLE=m -# CONFIG_GOOGLE_MEMCONSOLE_X86_LEGACY is not set -CONFIG_GOOGLE_FRAMEBUFFER_COREBOOT=m -CONFIG_GOOGLE_MEMCONSOLE_COREBOOT=m -CONFIG_GOOGLE_VPD=m - -# -# EFI (Extensible Firmware Interface) Support -# -# CONFIG_EFI_VARS is not set -CONFIG_EFI_ESRT=y -CONFIG_EFI_RUNTIME_MAP=y -# CONFIG_EFI_FAKE_MEMMAP is not set -CONFIG_EFI_SOFT_RESERVE=y -CONFIG_EFI_RUNTIME_WRAPPERS=y -CONFIG_EFI_GENERIC_STUB_INITRD_CMDLINE_LOADER=y -CONFIG_EFI_CAPSULE_LOADER=m -# CONFIG_EFI_TEST is not set -CONFIG_APPLE_PROPERTIES=y -# CONFIG_RESET_ATTACK_MITIGATION is not set -CONFIG_EFI_RCI2_TABLE=y -# CONFIG_EFI_DISABLE_PCI_DMA is not set -# end of EFI (Extensible Firmware Interface) Support - -CONFIG_EFI_EMBEDDED_FIRMWARE=y -CONFIG_UEFI_CPER=y -CONFIG_UEFI_CPER_X86=y -CONFIG_EFI_DEV_PATH_PARSER=y -CONFIG_EFI_EARLYCON=y - -# -# Tegra firmware driver -# -# end of Tegra firmware driver -# end of Firmware Drivers - -CONFIG_HAVE_KVM=y -CONFIG_HAVE_KVM_IRQCHIP=y -CONFIG_HAVE_KVM_IRQFD=y -CONFIG_HAVE_KVM_IRQ_ROUTING=y -CONFIG_HAVE_KVM_EVENTFD=y -CONFIG_KVM_MMIO=y -CONFIG_KVM_ASYNC_PF=y -CONFIG_HAVE_KVM_MSI=y -CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT=y -CONFIG_KVM_VFIO=y -CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT=y -CONFIG_KVM_COMPAT=y -CONFIG_HAVE_KVM_IRQ_BYPASS=y -CONFIG_HAVE_KVM_NO_POLL=y -CONFIG_VIRTUALIZATION=y -CONFIG_KVM=m -CONFIG_KVM_WERROR=y -CONFIG_KVM_INTEL=m -CONFIG_KVM_AMD=m 
-CONFIG_KVM_AMD_SEV=y -CONFIG_KVM_MMU_AUDIT=y -CONFIG_AS_AVX512=y -CONFIG_AS_SHA1_NI=y -CONFIG_AS_SHA256_NI=y -CONFIG_AS_TPAUSE=y - -# -# General architecture-dependent options -# -CONFIG_CRASH_CORE=y -CONFIG_KEXEC_CORE=y -CONFIG_HOTPLUG_SMT=y -CONFIG_OPROFILE=m -# CONFIG_OPROFILE_EVENT_MULTIPLEX is not set -CONFIG_HAVE_OPROFILE=y -CONFIG_OPROFILE_NMI_TIMER=y -CONFIG_KPROBES=y -CONFIG_JUMP_LABEL=y -# CONFIG_STATIC_KEYS_SELFTEST is not set -CONFIG_OPTPROBES=y -CONFIG_KPROBES_ON_FTRACE=y -CONFIG_UPROBES=y -CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y -CONFIG_ARCH_USE_BUILTIN_BSWAP=y -CONFIG_KRETPROBES=y -CONFIG_USER_RETURN_NOTIFIER=y -CONFIG_HAVE_IOREMAP_PROT=y -CONFIG_HAVE_KPROBES=y -CONFIG_HAVE_KRETPROBES=y -CONFIG_HAVE_OPTPROBES=y -CONFIG_HAVE_KPROBES_ON_FTRACE=y -CONFIG_HAVE_FUNCTION_ERROR_INJECTION=y -CONFIG_HAVE_NMI=y -CONFIG_HAVE_ARCH_TRACEHOOK=y -CONFIG_HAVE_DMA_CONTIGUOUS=y -CONFIG_GENERIC_SMP_IDLE_THREAD=y -CONFIG_ARCH_HAS_FORTIFY_SOURCE=y -CONFIG_ARCH_HAS_SET_MEMORY=y -CONFIG_ARCH_HAS_SET_DIRECT_MAP=y -CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST=y -CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT=y -CONFIG_HAVE_ASM_MODVERSIONS=y -CONFIG_HAVE_REGS_AND_STACK_ACCESS_API=y -CONFIG_HAVE_RSEQ=y -CONFIG_HAVE_FUNCTION_ARG_ACCESS_API=y -CONFIG_HAVE_HW_BREAKPOINT=y -CONFIG_HAVE_MIXED_BREAKPOINTS_REGS=y -CONFIG_HAVE_USER_RETURN_NOTIFIER=y -CONFIG_HAVE_PERF_EVENTS_NMI=y -CONFIG_HAVE_HARDLOCKUP_DETECTOR_PERF=y -CONFIG_HAVE_PERF_REGS=y -CONFIG_HAVE_PERF_USER_STACK_DUMP=y -CONFIG_HAVE_ARCH_JUMP_LABEL=y -CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE=y -CONFIG_MMU_GATHER_TABLE_FREE=y -CONFIG_MMU_GATHER_RCU_TABLE_FREE=y -CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG=y -CONFIG_HAVE_ALIGNED_STRUCT_PAGE=y -CONFIG_HAVE_CMPXCHG_LOCAL=y -CONFIG_HAVE_CMPXCHG_DOUBLE=y -CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION=y -CONFIG_ARCH_WANT_OLD_COMPAT_IPC=y -CONFIG_HAVE_ARCH_SECCOMP_FILTER=y -CONFIG_SECCOMP_FILTER=y -CONFIG_HAVE_ARCH_STACKLEAK=y -CONFIG_HAVE_STACKPROTECTOR=y -CONFIG_CC_HAS_STACKPROTECTOR_NONE=y 
-CONFIG_STACKPROTECTOR=y -CONFIG_STACKPROTECTOR_STRONG=y -CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES=y -CONFIG_HAVE_CONTEXT_TRACKING=y -CONFIG_HAVE_VIRT_CPU_ACCOUNTING_GEN=y -CONFIG_HAVE_IRQ_TIME_ACCOUNTING=y -CONFIG_HAVE_MOVE_PMD=y -CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE=y -CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD=y -CONFIG_HAVE_ARCH_HUGE_VMAP=y -CONFIG_ARCH_WANT_HUGE_PMD_SHARE=y -CONFIG_HAVE_ARCH_SOFT_DIRTY=y -CONFIG_HAVE_MOD_ARCH_SPECIFIC=y -CONFIG_MODULES_USE_ELF_RELA=y -CONFIG_ARCH_HAS_ELF_RANDOMIZE=y -CONFIG_HAVE_ARCH_MMAP_RND_BITS=y -CONFIG_HAVE_EXIT_THREAD=y -CONFIG_ARCH_MMAP_RND_BITS=28 -CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS=y -CONFIG_ARCH_MMAP_RND_COMPAT_BITS=8 -CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES=y -CONFIG_HAVE_COPY_THREAD_TLS=y -CONFIG_HAVE_STACK_VALIDATION=y -CONFIG_HAVE_RELIABLE_STACKTRACE=y -CONFIG_ISA_BUS_API=y -CONFIG_OLD_SIGSUSPEND3=y -CONFIG_COMPAT_OLD_SIGACTION=y -CONFIG_COMPAT_32BIT_TIME=y -CONFIG_HAVE_ARCH_VMAP_STACK=y -CONFIG_VMAP_STACK=y -CONFIG_ARCH_HAS_STRICT_KERNEL_RWX=y -CONFIG_STRICT_KERNEL_RWX=y -CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y -CONFIG_STRICT_MODULE_RWX=y -CONFIG_HAVE_ARCH_PREL32_RELOCATIONS=y -CONFIG_ARCH_USE_MEMREMAP_PROT=y -CONFIG_LOCK_EVENT_COUNTS=y -CONFIG_ARCH_HAS_MEM_ENCRYPT=y - -# -# GCOV-based kernel profiling -# -# CONFIG_GCOV_KERNEL is not set -CONFIG_ARCH_HAS_GCOV_PROFILE_ALL=y -# end of GCOV-based kernel profiling - -CONFIG_HAVE_GCC_PLUGINS=y -CONFIG_GCC_PLUGINS=y -# CONFIG_GCC_PLUGIN_CYC_COMPLEXITY is not set -# CONFIG_GCC_PLUGIN_LATENT_ENTROPY is not set -# CONFIG_GCC_PLUGIN_RANDSTRUCT is not set -# end of General architecture-dependent options - -CONFIG_RT_MUTEXES=y -CONFIG_BASE_SMALL=0 -CONFIG_MODULE_SIG_FORMAT=y -CONFIG_MODULES=y -CONFIG_MODULE_FORCE_LOAD=y -CONFIG_MODULE_UNLOAD=y -CONFIG_MODULE_FORCE_UNLOAD=y -# CONFIG_MODVERSIONS is not set -CONFIG_MODULE_SRCVERSION_ALL=y -CONFIG_MODULE_SIG=y -# CONFIG_MODULE_SIG_FORCE is not set -CONFIG_MODULE_SIG_ALL=y -# CONFIG_MODULE_SIG_SHA1 is not set -# CONFIG_MODULE_SIG_SHA224 
is not set -# CONFIG_MODULE_SIG_SHA256 is not set -# CONFIG_MODULE_SIG_SHA384 is not set -CONFIG_MODULE_SIG_SHA512=y -CONFIG_MODULE_SIG_HASH="sha512" -CONFIG_MODULE_COMPRESS=y -# CONFIG_MODULE_COMPRESS_GZIP is not set -CONFIG_MODULE_COMPRESS_XZ=y -CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS=y -CONFIG_UNUSED_SYMBOLS=y -CONFIG_MODULES_TREE_LOOKUP=y -CONFIG_BLOCK=y -CONFIG_BLK_RQ_ALLOC_TIME=y -CONFIG_BLK_SCSI_REQUEST=y -CONFIG_BLK_CGROUP_RWSTAT=y -CONFIG_BLK_DEV_BSG=y -CONFIG_BLK_DEV_BSGLIB=y -CONFIG_BLK_DEV_INTEGRITY=y -CONFIG_BLK_DEV_INTEGRITY_T10=y -CONFIG_BLK_DEV_ZONED=y -CONFIG_BLK_DEV_THROTTLING=y -CONFIG_BLK_DEV_THROTTLING_LOW=y -# CONFIG_BLK_CMDLINE_PARSER is not set -CONFIG_BLK_WBT=y -CONFIG_BLK_CGROUP_IOLATENCY=y -CONFIG_BLK_CGROUP_IOCOST=y -CONFIG_BLK_WBT_MQ=y -CONFIG_BLK_DEBUG_FS=y -CONFIG_BLK_DEBUG_FS_ZONED=y -CONFIG_BLK_SED_OPAL=y -CONFIG_BLK_INLINE_ENCRYPTION=y -CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK=y - -# -# Partition Types -# -CONFIG_PARTITION_ADVANCED=y -# CONFIG_ACORN_PARTITION is not set -CONFIG_AIX_PARTITION=y -# CONFIG_OSF_PARTITION is not set -# CONFIG_AMIGA_PARTITION is not set -# CONFIG_ATARI_PARTITION is not set -CONFIG_MAC_PARTITION=y -CONFIG_MSDOS_PARTITION=y -CONFIG_BSD_DISKLABEL=y -CONFIG_MINIX_SUBPARTITION=y -CONFIG_SOLARIS_X86_PARTITION=y -# CONFIG_UNIXWARE_DISKLABEL is not set -CONFIG_LDM_PARTITION=y -# CONFIG_LDM_DEBUG is not set -# CONFIG_SGI_PARTITION is not set -# CONFIG_ULTRIX_PARTITION is not set -# CONFIG_SUN_PARTITION is not set -CONFIG_KARMA_PARTITION=y -CONFIG_EFI_PARTITION=y -# CONFIG_SYSV68_PARTITION is not set -# CONFIG_CMDLINE_PARTITION is not set -# end of Partition Types - -CONFIG_BLOCK_COMPAT=y -CONFIG_BLK_MQ_PCI=y -CONFIG_BLK_MQ_VIRTIO=y -CONFIG_BLK_MQ_RDMA=y -CONFIG_BLK_PM=y - -# -# IO Schedulers -# -CONFIG_MQ_IOSCHED_DEADLINE=y -CONFIG_MQ_IOSCHED_KYBER=y -CONFIG_IOSCHED_BFQ=y -CONFIG_BFQ_GROUP_IOSCHED=y -# CONFIG_BFQ_CGROUP_DEBUG is not set -# end of IO Schedulers - -CONFIG_PREEMPT_NOTIFIERS=y -CONFIG_PADATA=y 
-CONFIG_ASN1=y -CONFIG_UNINLINE_SPIN_UNLOCK=y -CONFIG_ARCH_SUPPORTS_ATOMIC_RMW=y -CONFIG_MUTEX_SPIN_ON_OWNER=y -CONFIG_RWSEM_SPIN_ON_OWNER=y -CONFIG_LOCK_SPIN_ON_OWNER=y -CONFIG_ARCH_USE_QUEUED_SPINLOCKS=y -CONFIG_QUEUED_SPINLOCKS=y -CONFIG_ARCH_USE_QUEUED_RWLOCKS=y -CONFIG_QUEUED_RWLOCKS=y -CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE=y -CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE=y -CONFIG_ARCH_HAS_SYSCALL_WRAPPER=y -CONFIG_FREEZER=y - -# -# Executable file formats -# -CONFIG_BINFMT_ELF=y -CONFIG_COMPAT_BINFMT_ELF=y -CONFIG_ELFCORE=y -CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y -CONFIG_BINFMT_SCRIPT=y -CONFIG_BINFMT_MISC=y -CONFIG_COREDUMP=y -# end of Executable file formats - -# -# Memory Management options -# -CONFIG_SELECT_MEMORY_MODEL=y -CONFIG_SPARSEMEM_MANUAL=y -CONFIG_SPARSEMEM=y -CONFIG_NEED_MULTIPLE_NODES=y -CONFIG_HAVE_MEMORY_PRESENT=y -CONFIG_SPARSEMEM_EXTREME=y -CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y -CONFIG_SPARSEMEM_VMEMMAP=y -CONFIG_HAVE_FAST_GUP=y -CONFIG_NUMA_KEEP_MEMINFO=y -CONFIG_MEMORY_ISOLATION=y -CONFIG_HAVE_BOOTMEM_INFO_NODE=y -CONFIG_MEMORY_HOTPLUG=y -CONFIG_MEMORY_HOTPLUG_SPARSE=y -CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y -CONFIG_MEMORY_HOTREMOVE=y -CONFIG_SPLIT_PTLOCK_CPUS=4 -CONFIG_MEMORY_BALLOON=y -CONFIG_BALLOON_COMPACTION=y -CONFIG_COMPACTION=y -CONFIG_PAGE_REPORTING=y -CONFIG_MIGRATION=y -CONFIG_CONTIG_ALLOC=y -CONFIG_PHYS_ADDR_T_64BIT=y -CONFIG_BOUNCE=y -CONFIG_VIRT_TO_BUS=y -CONFIG_MMU_NOTIFIER=y -CONFIG_KSM=y -CONFIG_DEFAULT_MMAP_MIN_ADDR=65536 -CONFIG_ARCH_SUPPORTS_MEMORY_FAILURE=y -CONFIG_MEMORY_FAILURE=y -CONFIG_HWPOISON_INJECT=m -CONFIG_TRANSPARENT_HUGEPAGE=y -# CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS is not set -CONFIG_TRANSPARENT_HUGEPAGE_MADVISE=y -CONFIG_ARCH_WANTS_THP_SWAP=y -CONFIG_THP_SWAP=y -CONFIG_CLEANCACHE=y -CONFIG_FRONTSWAP=y -# CONFIG_CMA is not set -CONFIG_MEM_SOFT_DIRTY=y -CONFIG_ZSWAP=y -# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_DEFLATE is not set -# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZO is not set -# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_842 
is not set -CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZ4=y -# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZ4HC is not set -# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_ZSTD is not set -CONFIG_ZSWAP_COMPRESSOR_DEFAULT="lz4" -# CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD is not set -CONFIG_ZSWAP_ZPOOL_DEFAULT_Z3FOLD=y -# CONFIG_ZSWAP_ZPOOL_DEFAULT_ZSMALLOC is not set -CONFIG_ZSWAP_ZPOOL_DEFAULT="z3fold" -CONFIG_ZSWAP_DEFAULT_ON=y -CONFIG_ZPOOL=y -CONFIG_ZBUD=y -CONFIG_Z3FOLD=y -CONFIG_ZSMALLOC=y -# CONFIG_ZSMALLOC_PGTABLE_MAPPING is not set -# CONFIG_ZSMALLOC_STAT is not set -CONFIG_GENERIC_EARLY_IOREMAP=y -# CONFIG_DEFERRED_STRUCT_PAGE_INIT is not set -# CONFIG_IDLE_PAGE_TRACKING is not set -CONFIG_ARCH_HAS_PTE_DEVMAP=y -CONFIG_ZONE_DEVICE=y -CONFIG_DEV_PAGEMAP_OPS=y -CONFIG_HMM_MIRROR=y -CONFIG_DEVICE_PRIVATE=y -CONFIG_FRAME_VECTOR=y -CONFIG_ARCH_USES_HIGH_VMA_FLAGS=y -CONFIG_ARCH_HAS_PKEYS=y -# CONFIG_PERCPU_STATS is not set -# CONFIG_GUP_BENCHMARK is not set -CONFIG_READ_ONLY_THP_FOR_FS=y -CONFIG_ARCH_HAS_PTE_SPECIAL=y -CONFIG_MAPPING_DIRTY_HELPERS=y -# end of Memory Management options - -CONFIG_NET=y -CONFIG_COMPAT_NETLINK_MESSAGES=y -CONFIG_NET_INGRESS=y -CONFIG_NET_EGRESS=y -CONFIG_NET_REDIRECT=y -CONFIG_SKB_EXTENSIONS=y - -# -# Networking options -# -CONFIG_PACKET=y -CONFIG_PACKET_DIAG=y -CONFIG_UNIX=y -CONFIG_UNIX_SCM=y -CONFIG_UNIX_DIAG=y -CONFIG_TLS=m -CONFIG_TLS_DEVICE=y -# CONFIG_TLS_TOE is not set -CONFIG_XFRM=y -CONFIG_XFRM_OFFLOAD=y -CONFIG_XFRM_ALGO=m -CONFIG_XFRM_USER=m -CONFIG_XFRM_INTERFACE=m -CONFIG_XFRM_SUB_POLICY=y -CONFIG_XFRM_MIGRATE=y -CONFIG_XFRM_STATISTICS=y -CONFIG_XFRM_AH=m -CONFIG_XFRM_ESP=m -CONFIG_XFRM_IPCOMP=m -CONFIG_NET_KEY=m -CONFIG_NET_KEY_MIGRATE=y -CONFIG_XFRM_ESPINTCP=y -CONFIG_SMC=m -CONFIG_SMC_DIAG=m -CONFIG_XDP_SOCKETS=y -CONFIG_XDP_SOCKETS_DIAG=y -CONFIG_INET=y -CONFIG_IP_MULTICAST=y -CONFIG_IP_ADVANCED_ROUTER=y -# CONFIG_IP_FIB_TRIE_STATS is not set -CONFIG_IP_MULTIPLE_TABLES=y -CONFIG_IP_ROUTE_MULTIPATH=y -CONFIG_IP_ROUTE_VERBOSE=y -CONFIG_IP_ROUTE_CLASSID=y -# 
CONFIG_IP_PNP is not set -CONFIG_NET_IPIP=m -CONFIG_NET_IPGRE_DEMUX=m -CONFIG_NET_IP_TUNNEL=m -CONFIG_NET_IPGRE=m -# CONFIG_NET_IPGRE_BROADCAST is not set -CONFIG_IP_MROUTE_COMMON=y -CONFIG_IP_MROUTE=y -CONFIG_IP_MROUTE_MULTIPLE_TABLES=y -CONFIG_IP_PIMSM_V1=y -CONFIG_IP_PIMSM_V2=y -CONFIG_SYN_COOKIES=y -CONFIG_NET_IPVTI=m -CONFIG_NET_UDP_TUNNEL=m -CONFIG_NET_FOU=m -CONFIG_NET_FOU_IP_TUNNELS=y -CONFIG_INET_AH=m -CONFIG_INET_ESP=m -CONFIG_INET_ESP_OFFLOAD=m -CONFIG_INET_ESPINTCP=y -CONFIG_INET_IPCOMP=m -CONFIG_INET_XFRM_TUNNEL=m -CONFIG_INET_TUNNEL=m -CONFIG_INET_DIAG=m -CONFIG_INET_TCP_DIAG=m -CONFIG_INET_UDP_DIAG=m -CONFIG_INET_RAW_DIAG=m -CONFIG_INET_DIAG_DESTROY=y -CONFIG_TCP_CONG_ADVANCED=y -CONFIG_TCP_CONG_BIC=m -CONFIG_TCP_CONG_CUBIC=y -CONFIG_TCP_CONG_WESTWOOD=m -CONFIG_TCP_CONG_HTCP=m -CONFIG_TCP_CONG_HSTCP=m -CONFIG_TCP_CONG_HYBLA=m -CONFIG_TCP_CONG_VEGAS=m -CONFIG_TCP_CONG_NV=m -CONFIG_TCP_CONG_SCALABLE=m -CONFIG_TCP_CONG_LP=m -CONFIG_TCP_CONG_VENO=m -CONFIG_TCP_CONG_YEAH=m -CONFIG_TCP_CONG_ILLINOIS=m -CONFIG_TCP_CONG_DCTCP=m -CONFIG_TCP_CONG_CDG=m -CONFIG_TCP_CONG_BBR=m -CONFIG_DEFAULT_CUBIC=y -# CONFIG_DEFAULT_RENO is not set -CONFIG_DEFAULT_TCP_CONG="cubic" -CONFIG_TCP_MD5SIG=y -CONFIG_IPV6=y -CONFIG_IPV6_ROUTER_PREF=y -CONFIG_IPV6_ROUTE_INFO=y -CONFIG_IPV6_OPTIMISTIC_DAD=y -CONFIG_INET6_AH=m -CONFIG_INET6_ESP=m -CONFIG_INET6_ESP_OFFLOAD=m -CONFIG_INET6_ESPINTCP=y -CONFIG_INET6_IPCOMP=m -CONFIG_IPV6_MIP6=m -CONFIG_IPV6_ILA=m -CONFIG_INET6_XFRM_TUNNEL=m -CONFIG_INET6_TUNNEL=m -CONFIG_IPV6_VTI=m -CONFIG_IPV6_SIT=m -CONFIG_IPV6_SIT_6RD=y -CONFIG_IPV6_NDISC_NODETYPE=y -CONFIG_IPV6_TUNNEL=m -CONFIG_IPV6_GRE=m -CONFIG_IPV6_FOU=m -CONFIG_IPV6_FOU_TUNNEL=m -CONFIG_IPV6_MULTIPLE_TABLES=y -CONFIG_IPV6_SUBTREES=y -CONFIG_IPV6_MROUTE=y -CONFIG_IPV6_MROUTE_MULTIPLE_TABLES=y -CONFIG_IPV6_PIMSM_V2=y -CONFIG_IPV6_SEG6_LWTUNNEL=y -CONFIG_IPV6_SEG6_HMAC=y -CONFIG_IPV6_SEG6_BPF=y -CONFIG_IPV6_RPL_LWTUNNEL=y -CONFIG_NETLABEL=y -CONFIG_MPTCP=y -CONFIG_MPTCP_IPV6=y -# 
CONFIG_MPTCP_HMAC_TEST is not set -CONFIG_NETWORK_SECMARK=y -CONFIG_NET_PTP_CLASSIFY=y -CONFIG_NETWORK_PHY_TIMESTAMPING=y -CONFIG_NETFILTER=y -CONFIG_NETFILTER_ADVANCED=y -CONFIG_BRIDGE_NETFILTER=m - -# -# Core Netfilter Configuration -# -CONFIG_NETFILTER_INGRESS=y -CONFIG_NETFILTER_NETLINK=m -CONFIG_NETFILTER_FAMILY_BRIDGE=y -CONFIG_NETFILTER_FAMILY_ARP=y -CONFIG_NETFILTER_NETLINK_ACCT=m -CONFIG_NETFILTER_NETLINK_QUEUE=m -CONFIG_NETFILTER_NETLINK_LOG=m -CONFIG_NETFILTER_NETLINK_OSF=m -CONFIG_NF_CONNTRACK=m -CONFIG_NF_LOG_COMMON=m -CONFIG_NF_LOG_NETDEV=m -CONFIG_NETFILTER_CONNCOUNT=m -CONFIG_NF_CONNTRACK_MARK=y -CONFIG_NF_CONNTRACK_SECMARK=y -CONFIG_NF_CONNTRACK_ZONES=y -CONFIG_NF_CONNTRACK_PROCFS=y -CONFIG_NF_CONNTRACK_EVENTS=y -CONFIG_NF_CONNTRACK_TIMEOUT=y -CONFIG_NF_CONNTRACK_TIMESTAMP=y -CONFIG_NF_CONNTRACK_LABELS=y -CONFIG_NF_CT_PROTO_DCCP=y -CONFIG_NF_CT_PROTO_GRE=y -CONFIG_NF_CT_PROTO_SCTP=y -CONFIG_NF_CT_PROTO_UDPLITE=y -CONFIG_NF_CONNTRACK_AMANDA=m -CONFIG_NF_CONNTRACK_FTP=m -CONFIG_NF_CONNTRACK_H323=m -CONFIG_NF_CONNTRACK_IRC=m -CONFIG_NF_CONNTRACK_BROADCAST=m -CONFIG_NF_CONNTRACK_NETBIOS_NS=m -CONFIG_NF_CONNTRACK_SNMP=m -CONFIG_NF_CONNTRACK_PPTP=m -CONFIG_NF_CONNTRACK_SANE=m -CONFIG_NF_CONNTRACK_SIP=m -CONFIG_NF_CONNTRACK_TFTP=m -CONFIG_NF_CT_NETLINK=m -CONFIG_NF_CT_NETLINK_TIMEOUT=m -CONFIG_NF_CT_NETLINK_HELPER=m -CONFIG_NETFILTER_NETLINK_GLUE_CT=y -CONFIG_NF_NAT=m -CONFIG_NF_NAT_AMANDA=m -CONFIG_NF_NAT_FTP=m -CONFIG_NF_NAT_IRC=m -CONFIG_NF_NAT_SIP=m -CONFIG_NF_NAT_TFTP=m -CONFIG_NF_NAT_REDIRECT=y -CONFIG_NF_NAT_MASQUERADE=y -CONFIG_NETFILTER_SYNPROXY=m -CONFIG_NF_TABLES=m -CONFIG_NF_TABLES_INET=y -CONFIG_NF_TABLES_NETDEV=y -CONFIG_NFT_NUMGEN=m -CONFIG_NFT_CT=m -CONFIG_NFT_FLOW_OFFLOAD=m -CONFIG_NFT_COUNTER=m -CONFIG_NFT_CONNLIMIT=m -CONFIG_NFT_LOG=m -CONFIG_NFT_LIMIT=m -CONFIG_NFT_MASQ=m -CONFIG_NFT_REDIR=m -CONFIG_NFT_NAT=m -CONFIG_NFT_TUNNEL=m -CONFIG_NFT_OBJREF=m -CONFIG_NFT_QUEUE=m -CONFIG_NFT_QUOTA=m -CONFIG_NFT_REJECT=m -CONFIG_NFT_REJECT_INET=m 
-CONFIG_NFT_COMPAT=m -CONFIG_NFT_HASH=m -CONFIG_NFT_FIB=m -CONFIG_NFT_FIB_INET=m -CONFIG_NFT_XFRM=m -CONFIG_NFT_SOCKET=m -CONFIG_NFT_OSF=m -CONFIG_NFT_TPROXY=m -CONFIG_NFT_SYNPROXY=m -CONFIG_NF_DUP_NETDEV=m -CONFIG_NFT_DUP_NETDEV=m -CONFIG_NFT_FWD_NETDEV=m -CONFIG_NFT_FIB_NETDEV=m -CONFIG_NF_FLOW_TABLE_INET=m -CONFIG_NF_FLOW_TABLE=m -CONFIG_NETFILTER_XTABLES=m - -# -# Xtables combined modules -# -CONFIG_NETFILTER_XT_MARK=m -CONFIG_NETFILTER_XT_CONNMARK=m -CONFIG_NETFILTER_XT_SET=m - -# -# Xtables targets -# -CONFIG_NETFILTER_XT_TARGET_AUDIT=m -CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m -CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m -CONFIG_NETFILTER_XT_TARGET_CONNMARK=m -CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m -CONFIG_NETFILTER_XT_TARGET_CT=m -CONFIG_NETFILTER_XT_TARGET_DSCP=m -CONFIG_NETFILTER_XT_TARGET_HL=m -CONFIG_NETFILTER_XT_TARGET_HMARK=m -CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m -CONFIG_NETFILTER_XT_TARGET_LED=m -CONFIG_NETFILTER_XT_TARGET_LOG=m -CONFIG_NETFILTER_XT_TARGET_MARK=m -CONFIG_NETFILTER_XT_NAT=m -CONFIG_NETFILTER_XT_TARGET_NETMAP=m -CONFIG_NETFILTER_XT_TARGET_NFLOG=m -CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m -CONFIG_NETFILTER_XT_TARGET_NOTRACK=m -CONFIG_NETFILTER_XT_TARGET_RATEEST=m -CONFIG_NETFILTER_XT_TARGET_REDIRECT=m -CONFIG_NETFILTER_XT_TARGET_MASQUERADE=m -CONFIG_NETFILTER_XT_TARGET_TEE=m -CONFIG_NETFILTER_XT_TARGET_TPROXY=m -CONFIG_NETFILTER_XT_TARGET_TRACE=m -CONFIG_NETFILTER_XT_TARGET_SECMARK=m -CONFIG_NETFILTER_XT_TARGET_TCPMSS=m -CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m - -# -# Xtables matches -# -CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m -CONFIG_NETFILTER_XT_MATCH_BPF=m -CONFIG_NETFILTER_XT_MATCH_CGROUP=m -CONFIG_NETFILTER_XT_MATCH_CLUSTER=m -CONFIG_NETFILTER_XT_MATCH_COMMENT=m -CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m -CONFIG_NETFILTER_XT_MATCH_CONNLABEL=m -CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m -CONFIG_NETFILTER_XT_MATCH_CONNMARK=m -CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m -CONFIG_NETFILTER_XT_MATCH_CPU=m -CONFIG_NETFILTER_XT_MATCH_DCCP=m 
-CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m -CONFIG_NETFILTER_XT_MATCH_DSCP=m -CONFIG_NETFILTER_XT_MATCH_ECN=m -CONFIG_NETFILTER_XT_MATCH_ESP=m -CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m -CONFIG_NETFILTER_XT_MATCH_HELPER=m -CONFIG_NETFILTER_XT_MATCH_HL=m -CONFIG_NETFILTER_XT_MATCH_IPCOMP=m -CONFIG_NETFILTER_XT_MATCH_IPRANGE=m -CONFIG_NETFILTER_XT_MATCH_IPVS=m -CONFIG_NETFILTER_XT_MATCH_L2TP=m -CONFIG_NETFILTER_XT_MATCH_LENGTH=m -CONFIG_NETFILTER_XT_MATCH_LIMIT=m -CONFIG_NETFILTER_XT_MATCH_MAC=m -CONFIG_NETFILTER_XT_MATCH_MARK=m -CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m -CONFIG_NETFILTER_XT_MATCH_NFACCT=m -CONFIG_NETFILTER_XT_MATCH_OSF=m -CONFIG_NETFILTER_XT_MATCH_OWNER=m -CONFIG_NETFILTER_XT_MATCH_POLICY=m -CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m -CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m -CONFIG_NETFILTER_XT_MATCH_QUOTA=m -CONFIG_NETFILTER_XT_MATCH_RATEEST=m -CONFIG_NETFILTER_XT_MATCH_REALM=m -CONFIG_NETFILTER_XT_MATCH_RECENT=m -CONFIG_NETFILTER_XT_MATCH_SCTP=m -CONFIG_NETFILTER_XT_MATCH_SOCKET=m -CONFIG_NETFILTER_XT_MATCH_STATE=m -CONFIG_NETFILTER_XT_MATCH_STATISTIC=m -CONFIG_NETFILTER_XT_MATCH_STRING=m -CONFIG_NETFILTER_XT_MATCH_TCPMSS=m -CONFIG_NETFILTER_XT_MATCH_TIME=m -CONFIG_NETFILTER_XT_MATCH_U32=m -# end of Core Netfilter Configuration - -CONFIG_IP_SET=m -CONFIG_IP_SET_MAX=256 -CONFIG_IP_SET_BITMAP_IP=m -CONFIG_IP_SET_BITMAP_IPMAC=m -CONFIG_IP_SET_BITMAP_PORT=m -CONFIG_IP_SET_HASH_IP=m -CONFIG_IP_SET_HASH_IPMARK=m -CONFIG_IP_SET_HASH_IPPORT=m -CONFIG_IP_SET_HASH_IPPORTIP=m -CONFIG_IP_SET_HASH_IPPORTNET=m -CONFIG_IP_SET_HASH_IPMAC=m -CONFIG_IP_SET_HASH_MAC=m -CONFIG_IP_SET_HASH_NETPORTNET=m -CONFIG_IP_SET_HASH_NET=m -CONFIG_IP_SET_HASH_NETNET=m -CONFIG_IP_SET_HASH_NETPORT=m -CONFIG_IP_SET_HASH_NETIFACE=m -CONFIG_IP_SET_LIST_SET=m -CONFIG_IP_VS=m -CONFIG_IP_VS_IPV6=y -# CONFIG_IP_VS_DEBUG is not set -CONFIG_IP_VS_TAB_BITS=15 - -# -# IPVS transport protocol load balancing support -# -CONFIG_IP_VS_PROTO_TCP=y -CONFIG_IP_VS_PROTO_UDP=y -CONFIG_IP_VS_PROTO_AH_ESP=y 
-CONFIG_IP_VS_PROTO_ESP=y -CONFIG_IP_VS_PROTO_AH=y -CONFIG_IP_VS_PROTO_SCTP=y - -# -# IPVS scheduler -# -CONFIG_IP_VS_RR=m -CONFIG_IP_VS_WRR=m -CONFIG_IP_VS_LC=m -CONFIG_IP_VS_WLC=m -CONFIG_IP_VS_FO=m -CONFIG_IP_VS_OVF=m -CONFIG_IP_VS_LBLC=m -CONFIG_IP_VS_LBLCR=m -CONFIG_IP_VS_DH=m -CONFIG_IP_VS_SH=m -CONFIG_IP_VS_MH=m -CONFIG_IP_VS_SED=m -CONFIG_IP_VS_NQ=m - -# -# IPVS SH scheduler -# -CONFIG_IP_VS_SH_TAB_BITS=8 - -# -# IPVS MH scheduler -# -CONFIG_IP_VS_MH_TAB_INDEX=12 - -# -# IPVS application helper -# -CONFIG_IP_VS_FTP=m -CONFIG_IP_VS_NFCT=y -CONFIG_IP_VS_PE_SIP=m - -# -# IP: Netfilter Configuration -# -CONFIG_NF_DEFRAG_IPV4=m -CONFIG_NF_SOCKET_IPV4=m -CONFIG_NF_TPROXY_IPV4=m -CONFIG_NF_TABLES_IPV4=y -CONFIG_NFT_REJECT_IPV4=m -CONFIG_NFT_DUP_IPV4=m -CONFIG_NFT_FIB_IPV4=m -CONFIG_NF_TABLES_ARP=y -CONFIG_NF_FLOW_TABLE_IPV4=m -CONFIG_NF_DUP_IPV4=m -CONFIG_NF_LOG_ARP=m -CONFIG_NF_LOG_IPV4=m -CONFIG_NF_REJECT_IPV4=m -CONFIG_NF_NAT_SNMP_BASIC=m -CONFIG_NF_NAT_PPTP=m -CONFIG_NF_NAT_H323=m -CONFIG_IP_NF_IPTABLES=m -CONFIG_IP_NF_MATCH_AH=m -CONFIG_IP_NF_MATCH_ECN=m -CONFIG_IP_NF_MATCH_RPFILTER=m -CONFIG_IP_NF_MATCH_TTL=m -CONFIG_IP_NF_FILTER=m -CONFIG_IP_NF_TARGET_REJECT=m -CONFIG_IP_NF_TARGET_SYNPROXY=m -CONFIG_IP_NF_NAT=m -CONFIG_IP_NF_TARGET_MASQUERADE=m -CONFIG_IP_NF_TARGET_NETMAP=m -CONFIG_IP_NF_TARGET_REDIRECT=m -CONFIG_IP_NF_MANGLE=m -CONFIG_IP_NF_TARGET_CLUSTERIP=m -CONFIG_IP_NF_TARGET_ECN=m -CONFIG_IP_NF_TARGET_TTL=m -CONFIG_IP_NF_RAW=m -CONFIG_IP_NF_SECURITY=m -CONFIG_IP_NF_ARPTABLES=m -CONFIG_IP_NF_ARPFILTER=m -CONFIG_IP_NF_ARP_MANGLE=m -# end of IP: Netfilter Configuration - -# -# IPv6: Netfilter Configuration -# -CONFIG_NF_SOCKET_IPV6=m -CONFIG_NF_TPROXY_IPV6=m -CONFIG_NF_TABLES_IPV6=y -CONFIG_NFT_REJECT_IPV6=m -CONFIG_NFT_DUP_IPV6=m -CONFIG_NFT_FIB_IPV6=m -CONFIG_NF_FLOW_TABLE_IPV6=m -CONFIG_NF_DUP_IPV6=m -CONFIG_NF_REJECT_IPV6=m -CONFIG_NF_LOG_IPV6=m -CONFIG_IP6_NF_IPTABLES=m -CONFIG_IP6_NF_MATCH_AH=m -CONFIG_IP6_NF_MATCH_EUI64=m 
-CONFIG_IP6_NF_MATCH_FRAG=m -CONFIG_IP6_NF_MATCH_OPTS=m -CONFIG_IP6_NF_MATCH_HL=m -CONFIG_IP6_NF_MATCH_IPV6HEADER=m -CONFIG_IP6_NF_MATCH_MH=m -CONFIG_IP6_NF_MATCH_RPFILTER=m -CONFIG_IP6_NF_MATCH_RT=m -CONFIG_IP6_NF_MATCH_SRH=m -CONFIG_IP6_NF_TARGET_HL=m -CONFIG_IP6_NF_FILTER=m -CONFIG_IP6_NF_TARGET_REJECT=m -CONFIG_IP6_NF_TARGET_SYNPROXY=m -CONFIG_IP6_NF_MANGLE=m -CONFIG_IP6_NF_RAW=m -CONFIG_IP6_NF_SECURITY=m -CONFIG_IP6_NF_NAT=m -CONFIG_IP6_NF_TARGET_MASQUERADE=m -CONFIG_IP6_NF_TARGET_NPT=m -# end of IPv6: Netfilter Configuration - -CONFIG_NF_DEFRAG_IPV6=m -CONFIG_NF_TABLES_BRIDGE=m -CONFIG_NFT_BRIDGE_META=m -CONFIG_NFT_BRIDGE_REJECT=m -CONFIG_NF_LOG_BRIDGE=m -CONFIG_NF_CONNTRACK_BRIDGE=m -CONFIG_BRIDGE_NF_EBTABLES=m -CONFIG_BRIDGE_EBT_BROUTE=m -CONFIG_BRIDGE_EBT_T_FILTER=m -CONFIG_BRIDGE_EBT_T_NAT=m -CONFIG_BRIDGE_EBT_802_3=m -CONFIG_BRIDGE_EBT_AMONG=m -CONFIG_BRIDGE_EBT_ARP=m -CONFIG_BRIDGE_EBT_IP=m -CONFIG_BRIDGE_EBT_IP6=m -CONFIG_BRIDGE_EBT_LIMIT=m -CONFIG_BRIDGE_EBT_MARK=m -CONFIG_BRIDGE_EBT_PKTTYPE=m -CONFIG_BRIDGE_EBT_STP=m -CONFIG_BRIDGE_EBT_VLAN=m -CONFIG_BRIDGE_EBT_ARPREPLY=m -CONFIG_BRIDGE_EBT_DNAT=m -CONFIG_BRIDGE_EBT_MARK_T=m -CONFIG_BRIDGE_EBT_REDIRECT=m -CONFIG_BRIDGE_EBT_SNAT=m -CONFIG_BRIDGE_EBT_LOG=m -CONFIG_BRIDGE_EBT_NFLOG=m -# CONFIG_BPFILTER is not set -CONFIG_IP_DCCP=m -CONFIG_INET_DCCP_DIAG=m - -# -# DCCP CCIDs Configuration -# -# CONFIG_IP_DCCP_CCID2_DEBUG is not set -CONFIG_IP_DCCP_CCID3=y -# CONFIG_IP_DCCP_CCID3_DEBUG is not set -CONFIG_IP_DCCP_TFRC_LIB=y -# end of DCCP CCIDs Configuration - -# -# DCCP Kernel Hacking -# -# CONFIG_IP_DCCP_DEBUG is not set -# end of DCCP Kernel Hacking - -CONFIG_IP_SCTP=m -# CONFIG_SCTP_DBG_OBJCNT is not set -# CONFIG_SCTP_DEFAULT_COOKIE_HMAC_MD5 is not set -CONFIG_SCTP_DEFAULT_COOKIE_HMAC_SHA1=y -# CONFIG_SCTP_DEFAULT_COOKIE_HMAC_NONE is not set -CONFIG_SCTP_COOKIE_HMAC_MD5=y -CONFIG_SCTP_COOKIE_HMAC_SHA1=y -CONFIG_INET_SCTP_DIAG=m -CONFIG_RDS=m -CONFIG_RDS_RDMA=m -CONFIG_RDS_TCP=m -# CONFIG_RDS_DEBUG is 
not set -CONFIG_TIPC=m -CONFIG_TIPC_MEDIA_IB=y -CONFIG_TIPC_MEDIA_UDP=y -CONFIG_TIPC_CRYPTO=y -CONFIG_TIPC_DIAG=m -CONFIG_ATM=m -CONFIG_ATM_CLIP=m -# CONFIG_ATM_CLIP_NO_ICMP is not set -CONFIG_ATM_LANE=m -CONFIG_ATM_MPOA=m -CONFIG_ATM_BR2684=m -# CONFIG_ATM_BR2684_IPFILTER is not set -CONFIG_L2TP=m -# CONFIG_L2TP_DEBUGFS is not set -CONFIG_L2TP_V3=y -CONFIG_L2TP_IP=m -CONFIG_L2TP_ETH=m -CONFIG_STP=m -CONFIG_GARP=m -CONFIG_MRP=m -CONFIG_BRIDGE=m -CONFIG_BRIDGE_IGMP_SNOOPING=y -CONFIG_BRIDGE_VLAN_FILTERING=y -CONFIG_BRIDGE_MRP=y -CONFIG_HAVE_NET_DSA=y -CONFIG_NET_DSA=m -CONFIG_NET_DSA_TAG_8021Q=m -CONFIG_NET_DSA_TAG_AR9331=m -CONFIG_NET_DSA_TAG_BRCM_COMMON=m -CONFIG_NET_DSA_TAG_BRCM=m -CONFIG_NET_DSA_TAG_BRCM_PREPEND=m -CONFIG_NET_DSA_TAG_GSWIP=m -CONFIG_NET_DSA_TAG_DSA=m -CONFIG_NET_DSA_TAG_EDSA=m -CONFIG_NET_DSA_TAG_MTK=m -CONFIG_NET_DSA_TAG_KSZ=m -CONFIG_NET_DSA_TAG_OCELOT=m -CONFIG_NET_DSA_TAG_QCA=m -CONFIG_NET_DSA_TAG_LAN9303=m -CONFIG_NET_DSA_TAG_SJA1105=m -CONFIG_NET_DSA_TAG_TRAILER=m -CONFIG_VLAN_8021Q=m -CONFIG_VLAN_8021Q_GVRP=y -CONFIG_VLAN_8021Q_MVRP=y -# CONFIG_DECNET is not set -CONFIG_LLC=m -CONFIG_LLC2=m -CONFIG_ATALK=m -CONFIG_DEV_APPLETALK=m -CONFIG_IPDDP=m -CONFIG_IPDDP_ENCAP=y -# CONFIG_X25 is not set -# CONFIG_LAPB is not set -CONFIG_PHONET=m -CONFIG_6LOWPAN=m -# CONFIG_6LOWPAN_DEBUGFS is not set -CONFIG_6LOWPAN_NHC=m -CONFIG_6LOWPAN_NHC_DEST=m -CONFIG_6LOWPAN_NHC_FRAGMENT=m -CONFIG_6LOWPAN_NHC_HOP=m -CONFIG_6LOWPAN_NHC_IPV6=m -CONFIG_6LOWPAN_NHC_MOBILITY=m -CONFIG_6LOWPAN_NHC_ROUTING=m -CONFIG_6LOWPAN_NHC_UDP=m -CONFIG_6LOWPAN_GHC_EXT_HDR_HOP=m -CONFIG_6LOWPAN_GHC_UDP=m -CONFIG_6LOWPAN_GHC_ICMPV6=m -CONFIG_6LOWPAN_GHC_EXT_HDR_DEST=m -CONFIG_6LOWPAN_GHC_EXT_HDR_FRAG=m -CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE=m -CONFIG_IEEE802154=m -CONFIG_IEEE802154_NL802154_EXPERIMENTAL=y -CONFIG_IEEE802154_SOCKET=m -CONFIG_IEEE802154_6LOWPAN=m -CONFIG_MAC802154=m -CONFIG_NET_SCHED=y - -# -# Queueing/Scheduling -# -CONFIG_NET_SCH_CBQ=m -CONFIG_NET_SCH_HTB=m 
-CONFIG_NET_SCH_HFSC=m -CONFIG_NET_SCH_ATM=m -CONFIG_NET_SCH_PRIO=m -CONFIG_NET_SCH_MULTIQ=m -CONFIG_NET_SCH_RED=m -CONFIG_NET_SCH_SFB=m -CONFIG_NET_SCH_SFQ=m -CONFIG_NET_SCH_TEQL=m -CONFIG_NET_SCH_TBF=m -CONFIG_NET_SCH_CBS=m -CONFIG_NET_SCH_ETF=m -CONFIG_NET_SCH_TAPRIO=m -CONFIG_NET_SCH_GRED=m -CONFIG_NET_SCH_DSMARK=m -CONFIG_NET_SCH_NETEM=m -CONFIG_NET_SCH_DRR=m -CONFIG_NET_SCH_MQPRIO=m -CONFIG_NET_SCH_SKBPRIO=m -CONFIG_NET_SCH_CHOKE=m -CONFIG_NET_SCH_QFQ=m -CONFIG_NET_SCH_CODEL=m -CONFIG_NET_SCH_FQ_CODEL=y -CONFIG_NET_SCH_CAKE=m -CONFIG_NET_SCH_FQ=m -CONFIG_NET_SCH_HHF=m -CONFIG_NET_SCH_PIE=m -CONFIG_NET_SCH_FQ_PIE=m -CONFIG_NET_SCH_INGRESS=m -CONFIG_NET_SCH_PLUG=m -CONFIG_NET_SCH_ETS=m -CONFIG_NET_SCH_DEFAULT=y -# CONFIG_DEFAULT_FQ is not set -# CONFIG_DEFAULT_CODEL is not set -CONFIG_DEFAULT_FQ_CODEL=y -# CONFIG_DEFAULT_SFQ is not set -# CONFIG_DEFAULT_PFIFO_FAST is not set -CONFIG_DEFAULT_NET_SCH="fq_codel" - -# -# Classification -# -CONFIG_NET_CLS=y -CONFIG_NET_CLS_BASIC=m -CONFIG_NET_CLS_TCINDEX=m -CONFIG_NET_CLS_ROUTE4=m -CONFIG_NET_CLS_FW=m -CONFIG_NET_CLS_U32=m -CONFIG_CLS_U32_PERF=y -CONFIG_CLS_U32_MARK=y -CONFIG_NET_CLS_RSVP=m -CONFIG_NET_CLS_RSVP6=m -CONFIG_NET_CLS_FLOW=m -CONFIG_NET_CLS_CGROUP=m -CONFIG_NET_CLS_BPF=m -CONFIG_NET_CLS_FLOWER=m -CONFIG_NET_CLS_MATCHALL=m -CONFIG_NET_EMATCH=y -CONFIG_NET_EMATCH_STACK=32 -CONFIG_NET_EMATCH_CMP=m -CONFIG_NET_EMATCH_NBYTE=m -CONFIG_NET_EMATCH_U32=m -CONFIG_NET_EMATCH_META=m -CONFIG_NET_EMATCH_TEXT=m -CONFIG_NET_EMATCH_CANID=m -CONFIG_NET_EMATCH_IPSET=m -CONFIG_NET_EMATCH_IPT=m -CONFIG_NET_CLS_ACT=y -CONFIG_NET_ACT_POLICE=m -CONFIG_NET_ACT_GACT=m -CONFIG_GACT_PROB=y -CONFIG_NET_ACT_MIRRED=m -CONFIG_NET_ACT_SAMPLE=m -CONFIG_NET_ACT_IPT=m -CONFIG_NET_ACT_NAT=m -CONFIG_NET_ACT_PEDIT=m -CONFIG_NET_ACT_SIMP=m -CONFIG_NET_ACT_SKBEDIT=m -CONFIG_NET_ACT_CSUM=m -CONFIG_NET_ACT_MPLS=m -CONFIG_NET_ACT_VLAN=m -CONFIG_NET_ACT_BPF=m -CONFIG_NET_ACT_CONNMARK=m -CONFIG_NET_ACT_CTINFO=m -CONFIG_NET_ACT_SKBMOD=m 
-CONFIG_NET_ACT_IFE=m -CONFIG_NET_ACT_TUNNEL_KEY=m -CONFIG_NET_ACT_CT=m -CONFIG_NET_ACT_GATE=m -CONFIG_NET_IFE_SKBMARK=m -CONFIG_NET_IFE_SKBPRIO=m -CONFIG_NET_IFE_SKBTCINDEX=m -CONFIG_NET_TC_SKB_EXT=y -CONFIG_NET_SCH_FIFO=y -CONFIG_DCB=y -CONFIG_DNS_RESOLVER=m -CONFIG_BATMAN_ADV=m -CONFIG_BATMAN_ADV_BATMAN_V=y -CONFIG_BATMAN_ADV_BLA=y -CONFIG_BATMAN_ADV_DAT=y -CONFIG_BATMAN_ADV_NC=y -CONFIG_BATMAN_ADV_MCAST=y -CONFIG_BATMAN_ADV_DEBUGFS=y -# CONFIG_BATMAN_ADV_DEBUG is not set -CONFIG_BATMAN_ADV_SYSFS=y -# CONFIG_BATMAN_ADV_TRACING is not set -CONFIG_OPENVSWITCH=m -CONFIG_OPENVSWITCH_GRE=m -CONFIG_OPENVSWITCH_VXLAN=m -CONFIG_OPENVSWITCH_GENEVE=m -CONFIG_VSOCKETS=m -CONFIG_VSOCKETS_DIAG=m -CONFIG_VSOCKETS_LOOPBACK=m -CONFIG_VMWARE_VMCI_VSOCKETS=m -CONFIG_VIRTIO_VSOCKETS=m -CONFIG_VIRTIO_VSOCKETS_COMMON=m -CONFIG_HYPERV_VSOCKETS=m -CONFIG_NETLINK_DIAG=m -CONFIG_MPLS=y -CONFIG_NET_MPLS_GSO=m -CONFIG_MPLS_ROUTING=m -CONFIG_MPLS_IPTUNNEL=m -CONFIG_NET_NSH=m -CONFIG_HSR=m -CONFIG_NET_SWITCHDEV=y -CONFIG_NET_L3_MASTER_DEV=y -CONFIG_QRTR=m -CONFIG_QRTR_SMD=m -CONFIG_QRTR_TUN=m -CONFIG_QRTR_MHI=m -CONFIG_NET_NCSI=y -CONFIG_NCSI_OEM_CMD_GET_MAC=y -CONFIG_RPS=y -CONFIG_RFS_ACCEL=y -CONFIG_XPS=y -CONFIG_CGROUP_NET_PRIO=y -CONFIG_CGROUP_NET_CLASSID=y -CONFIG_NET_RX_BUSY_POLL=y -CONFIG_BQL=y -CONFIG_BPF_JIT=y -CONFIG_BPF_STREAM_PARSER=y -CONFIG_NET_FLOW_LIMIT=y - -# -# Network testing -# -CONFIG_NET_PKTGEN=m -CONFIG_NET_DROP_MONITOR=y -# end of Network testing -# end of Networking options - -CONFIG_HAMRADIO=y - -# -# Packet Radio protocols -# -CONFIG_AX25=m -CONFIG_AX25_DAMA_SLAVE=y -CONFIG_NETROM=m -CONFIG_ROSE=m - -# -# AX.25 network device drivers -# -CONFIG_MKISS=m -CONFIG_6PACK=m -CONFIG_BPQETHER=m -CONFIG_BAYCOM_SER_FDX=m -CONFIG_BAYCOM_SER_HDX=m -CONFIG_BAYCOM_PAR=m -CONFIG_YAM=m -# end of AX.25 network device drivers - -CONFIG_CAN=m -CONFIG_CAN_RAW=m -CONFIG_CAN_BCM=m -CONFIG_CAN_GW=m -CONFIG_CAN_J1939=m - -# -# CAN Device Drivers -# -CONFIG_CAN_VCAN=m -CONFIG_CAN_VXCAN=m 
-CONFIG_CAN_SLCAN=m -CONFIG_CAN_DEV=m -CONFIG_CAN_CALC_BITTIMING=y -CONFIG_CAN_FLEXCAN=m -CONFIG_CAN_GRCAN=m -CONFIG_CAN_JANZ_ICAN3=m -CONFIG_CAN_KVASER_PCIEFD=m -CONFIG_CAN_C_CAN=m -CONFIG_CAN_C_CAN_PLATFORM=m -CONFIG_CAN_C_CAN_PCI=m -CONFIG_CAN_CC770=m -# CONFIG_CAN_CC770_ISA is not set -CONFIG_CAN_CC770_PLATFORM=m -CONFIG_CAN_IFI_CANFD=m -CONFIG_CAN_M_CAN=m -CONFIG_CAN_M_CAN_PLATFORM=m -CONFIG_CAN_M_CAN_TCAN4X5X=m -CONFIG_CAN_PEAK_PCIEFD=m -CONFIG_CAN_SJA1000=m -CONFIG_CAN_EMS_PCI=m -# CONFIG_CAN_EMS_PCMCIA is not set -CONFIG_CAN_F81601=m -CONFIG_CAN_KVASER_PCI=m -CONFIG_CAN_PEAK_PCI=m -CONFIG_CAN_PEAK_PCIEC=y -CONFIG_CAN_PEAK_PCMCIA=m -CONFIG_CAN_PLX_PCI=m -# CONFIG_CAN_SJA1000_ISA is not set -CONFIG_CAN_SJA1000_PLATFORM=m -CONFIG_CAN_SOFTING=m -CONFIG_CAN_SOFTING_CS=m - -# -# CAN SPI interfaces -# -CONFIG_CAN_HI311X=m -CONFIG_CAN_MCP251X=m -# end of CAN SPI interfaces - -# -# CAN USB interfaces -# -CONFIG_CAN_8DEV_USB=m -CONFIG_CAN_EMS_USB=m -CONFIG_CAN_ESD_USB2=m -CONFIG_CAN_GS_USB=m -CONFIG_CAN_KVASER_USB=m -CONFIG_CAN_MCBA_USB=m -CONFIG_CAN_PEAK_USB=m -CONFIG_CAN_UCAN=m -# end of CAN USB interfaces - -# CONFIG_CAN_DEBUG_DEVICES is not set -# end of CAN Device Drivers - -CONFIG_BT=m -CONFIG_BT_BREDR=y -CONFIG_BT_RFCOMM=m -CONFIG_BT_RFCOMM_TTY=y -CONFIG_BT_BNEP=m -CONFIG_BT_BNEP_MC_FILTER=y -CONFIG_BT_BNEP_PROTO_FILTER=y -CONFIG_BT_CMTP=m -CONFIG_BT_HIDP=m -CONFIG_BT_HS=y -CONFIG_BT_LE=y -CONFIG_BT_6LOWPAN=m -CONFIG_BT_LEDS=y -CONFIG_BT_MSFTEXT=y -CONFIG_BT_DEBUGFS=y -# CONFIG_BT_SELFTEST is not set - -# -# Bluetooth device drivers -# -CONFIG_BT_INTEL=m -CONFIG_BT_BCM=m -CONFIG_BT_RTL=m -CONFIG_BT_QCA=m -CONFIG_BT_HCIBTUSB=m -CONFIG_BT_HCIBTUSB_AUTOSUSPEND=y -CONFIG_BT_HCIBTUSB_BCM=y -CONFIG_BT_HCIBTUSB_MTK=y -CONFIG_BT_HCIBTUSB_RTL=y -CONFIG_BT_HCIBTSDIO=m -CONFIG_BT_HCIUART=m -CONFIG_BT_HCIUART_SERDEV=y -CONFIG_BT_HCIUART_H4=y -CONFIG_BT_HCIUART_NOKIA=m -CONFIG_BT_HCIUART_BCSP=y -CONFIG_BT_HCIUART_ATH3K=y -CONFIG_BT_HCIUART_LL=y -CONFIG_BT_HCIUART_3WIRE=y 
-CONFIG_BT_HCIUART_INTEL=y -CONFIG_BT_HCIUART_BCM=y -CONFIG_BT_HCIUART_RTL=y -CONFIG_BT_HCIUART_QCA=y -CONFIG_BT_HCIUART_AG6XX=y -CONFIG_BT_HCIUART_MRVL=y -CONFIG_BT_HCIBCM203X=m -CONFIG_BT_HCIBPA10X=m -CONFIG_BT_HCIBFUSB=m -CONFIG_BT_HCIDTL1=m -CONFIG_BT_HCIBT3C=m -CONFIG_BT_HCIBLUECARD=m -CONFIG_BT_HCIVHCI=m -CONFIG_BT_MRVL=m -CONFIG_BT_MRVL_SDIO=m -CONFIG_BT_ATH3K=m -CONFIG_BT_MTKSDIO=m -CONFIG_BT_MTKUART=m -CONFIG_BT_HCIRSI=m -# end of Bluetooth device drivers - -CONFIG_AF_RXRPC=m -CONFIG_AF_RXRPC_IPV6=y -# CONFIG_AF_RXRPC_INJECT_LOSS is not set -CONFIG_AF_RXRPC_DEBUG=y -CONFIG_RXKAD=y -CONFIG_AF_KCM=m -CONFIG_STREAM_PARSER=y -CONFIG_FIB_RULES=y -CONFIG_WIRELESS=y -CONFIG_WIRELESS_EXT=y -CONFIG_WEXT_CORE=y -CONFIG_WEXT_PROC=y -CONFIG_WEXT_SPY=y -CONFIG_WEXT_PRIV=y -CONFIG_CFG80211=m -# CONFIG_NL80211_TESTMODE is not set -# CONFIG_CFG80211_DEVELOPER_WARNINGS is not set -# CONFIG_CFG80211_CERTIFICATION_ONUS is not set -CONFIG_CFG80211_REQUIRE_SIGNED_REGDB=y -CONFIG_CFG80211_USE_KERNEL_REGDB_KEYS=y -CONFIG_CFG80211_DEFAULT_PS=y -CONFIG_CFG80211_DEBUGFS=y -CONFIG_CFG80211_CRDA_SUPPORT=y -CONFIG_CFG80211_WEXT=y -CONFIG_CFG80211_WEXT_EXPORT=y -CONFIG_LIB80211=m -CONFIG_LIB80211_CRYPT_WEP=m -CONFIG_LIB80211_CRYPT_CCMP=m -CONFIG_LIB80211_CRYPT_TKIP=m -# CONFIG_LIB80211_DEBUG is not set -CONFIG_MAC80211=m -CONFIG_MAC80211_HAS_RC=y -CONFIG_MAC80211_RC_MINSTREL=y -CONFIG_MAC80211_RC_DEFAULT_MINSTREL=y -CONFIG_MAC80211_RC_DEFAULT="minstrel_ht" -CONFIG_MAC80211_MESH=y -CONFIG_MAC80211_LEDS=y -CONFIG_MAC80211_DEBUGFS=y -# CONFIG_MAC80211_MESSAGE_TRACING is not set -# CONFIG_MAC80211_DEBUG_MENU is not set -CONFIG_MAC80211_STA_HASH_MAX_SIZE=0 -CONFIG_WIMAX=m -CONFIG_WIMAX_DEBUG_LEVEL=8 -CONFIG_RFKILL=m -CONFIG_RFKILL_LEDS=y -CONFIG_RFKILL_INPUT=y -CONFIG_RFKILL_GPIO=m -CONFIG_NET_9P=m -CONFIG_NET_9P_VIRTIO=m -CONFIG_NET_9P_XEN=m -CONFIG_NET_9P_RDMA=m -# CONFIG_NET_9P_DEBUG is not set -CONFIG_CAIF=m -# CONFIG_CAIF_DEBUG is not set -CONFIG_CAIF_NETDEV=m -CONFIG_CAIF_USB=m 
-CONFIG_CEPH_LIB=m -CONFIG_CEPH_LIB_PRETTYDEBUG=y -CONFIG_CEPH_LIB_USE_DNS_RESOLVER=y -CONFIG_NFC=m -CONFIG_NFC_DIGITAL=m -CONFIG_NFC_NCI=m -CONFIG_NFC_NCI_SPI=m -CONFIG_NFC_NCI_UART=m -CONFIG_NFC_HCI=m -CONFIG_NFC_SHDLC=y - -# -# Near Field Communication (NFC) devices -# -CONFIG_NFC_TRF7970A=m -CONFIG_NFC_MEI_PHY=m -CONFIG_NFC_SIM=m -CONFIG_NFC_PORT100=m -CONFIG_NFC_FDP=m -CONFIG_NFC_FDP_I2C=m -CONFIG_NFC_PN544=m -CONFIG_NFC_PN544_I2C=m -CONFIG_NFC_PN544_MEI=m -CONFIG_NFC_PN533=m -CONFIG_NFC_PN533_USB=m -CONFIG_NFC_PN533_I2C=m -CONFIG_NFC_PN532_UART=m -CONFIG_NFC_MICROREAD=m -CONFIG_NFC_MICROREAD_I2C=m -CONFIG_NFC_MICROREAD_MEI=m -CONFIG_NFC_MRVL=m -CONFIG_NFC_MRVL_USB=m -CONFIG_NFC_MRVL_UART=m -CONFIG_NFC_MRVL_I2C=m -CONFIG_NFC_MRVL_SPI=m -CONFIG_NFC_ST21NFCA=m -CONFIG_NFC_ST21NFCA_I2C=m -CONFIG_NFC_ST_NCI=m -CONFIG_NFC_ST_NCI_I2C=m -CONFIG_NFC_ST_NCI_SPI=m -CONFIG_NFC_NXP_NCI=m -CONFIG_NFC_NXP_NCI_I2C=m -CONFIG_NFC_S3FWRN5=m -CONFIG_NFC_S3FWRN5_I2C=m -CONFIG_NFC_ST95HF=m -# end of Near Field Communication (NFC) devices - -CONFIG_PSAMPLE=m -CONFIG_NET_IFE=m -CONFIG_LWTUNNEL=y -CONFIG_LWTUNNEL_BPF=y -CONFIG_DST_CACHE=y -CONFIG_GRO_CELLS=y -CONFIG_SOCK_VALIDATE_XMIT=y -CONFIG_NET_SOCK_MSG=y -CONFIG_NET_DEVLINK=y -CONFIG_PAGE_POOL=y -CONFIG_FAILOVER=m -CONFIG_HAVE_EBPF_JIT=y - -# -# Device Drivers -# -CONFIG_HAVE_EISA=y -# CONFIG_EISA is not set -CONFIG_HAVE_PCI=y -CONFIG_PCI=y -CONFIG_PCI_DOMAINS=y -CONFIG_PCIEPORTBUS=y -CONFIG_HOTPLUG_PCI_PCIE=y -CONFIG_PCIEAER=y -# CONFIG_PCIEAER_INJECT is not set -CONFIG_PCIE_ECRC=y -CONFIG_PCIEASPM=y -CONFIG_PCIEASPM_DEFAULT=y -# CONFIG_PCIEASPM_POWERSAVE is not set -# CONFIG_PCIEASPM_POWER_SUPERSAVE is not set -# CONFIG_PCIEASPM_PERFORMANCE is not set -CONFIG_PCIE_PME=y -CONFIG_PCIE_DPC=y -CONFIG_PCIE_PTM=y -# CONFIG_PCIE_BW is not set -CONFIG_PCIE_EDR=y -CONFIG_PCI_MSI=y -CONFIG_PCI_MSI_IRQ_DOMAIN=y -CONFIG_PCI_QUIRKS=y -# CONFIG_PCI_DEBUG is not set -CONFIG_PCI_REALLOC_ENABLE_AUTO=y -CONFIG_PCI_STUB=y -CONFIG_PCI_PF_STUB=m 
-CONFIG_XEN_PCIDEV_FRONTEND=m -CONFIG_PCI_ATS=y -CONFIG_PCI_ECAM=y -CONFIG_PCI_LOCKLESS_CONFIG=y -CONFIG_PCI_IOV=y -CONFIG_PCI_PRI=y -CONFIG_PCI_PASID=y -CONFIG_PCI_P2PDMA=y -CONFIG_PCI_LABEL=y -CONFIG_PCI_HYPERV=m -CONFIG_HOTPLUG_PCI=y -CONFIG_HOTPLUG_PCI_ACPI=y -CONFIG_HOTPLUG_PCI_ACPI_IBM=m -CONFIG_HOTPLUG_PCI_CPCI=y -CONFIG_HOTPLUG_PCI_CPCI_ZT5550=m -CONFIG_HOTPLUG_PCI_CPCI_GENERIC=m -CONFIG_HOTPLUG_PCI_SHPC=y - -# -# PCI controller drivers -# -CONFIG_PCI_FTPCI100=y -CONFIG_PCI_HOST_COMMON=y -CONFIG_PCI_HOST_GENERIC=y -CONFIG_PCIE_XILINX=y -CONFIG_VMD=m -CONFIG_PCI_HYPERV_INTERFACE=m - -# -# DesignWare PCI Core Support -# -CONFIG_PCIE_DW=y -CONFIG_PCIE_DW_HOST=y -CONFIG_PCIE_DW_EP=y -CONFIG_PCIE_DW_PLAT=y -CONFIG_PCIE_DW_PLAT_HOST=y -CONFIG_PCIE_DW_PLAT_EP=y -CONFIG_PCIE_INTEL_GW=y -CONFIG_PCI_MESON=y -# end of DesignWare PCI Core Support - -# -# Mobiveil PCIe Core Support -# -# end of Mobiveil PCIe Core Support - -# -# Cadence PCIe controllers support -# -CONFIG_PCIE_CADENCE=y -CONFIG_PCIE_CADENCE_HOST=y -CONFIG_PCIE_CADENCE_EP=y -CONFIG_PCIE_CADENCE_PLAT=y -CONFIG_PCIE_CADENCE_PLAT_HOST=y -CONFIG_PCIE_CADENCE_PLAT_EP=y -# end of Cadence PCIe controllers support -# end of PCI controller drivers - -# -# PCI Endpoint -# -CONFIG_PCI_ENDPOINT=y -CONFIG_PCI_ENDPOINT_CONFIGFS=y -# CONFIG_PCI_EPF_TEST is not set -# end of PCI Endpoint - -# -# PCI switch controller drivers -# -CONFIG_PCI_SW_SWITCHTEC=m -# end of PCI switch controller drivers - -CONFIG_PCCARD=m -CONFIG_PCMCIA=m -CONFIG_PCMCIA_LOAD_CIS=y -CONFIG_CARDBUS=y - -# -# PC-card bridges -# -CONFIG_YENTA=m -CONFIG_YENTA_O2=y -CONFIG_YENTA_RICOH=y -CONFIG_YENTA_TI=y -CONFIG_YENTA_ENE_TUNE=y -CONFIG_YENTA_TOSHIBA=y -CONFIG_PD6729=m -CONFIG_I82092=m -CONFIG_PCCARD_NONSTATIC=y -CONFIG_RAPIDIO=m -CONFIG_RAPIDIO_TSI721=m -CONFIG_RAPIDIO_DISC_TIMEOUT=30 -CONFIG_RAPIDIO_ENABLE_RX_TX_PORTS=y -CONFIG_RAPIDIO_DMA_ENGINE=y -# CONFIG_RAPIDIO_DEBUG is not set -CONFIG_RAPIDIO_ENUM_BASIC=m -CONFIG_RAPIDIO_CHMAN=m 
-CONFIG_RAPIDIO_MPORT_CDEV=m - -# -# RapidIO Switch drivers -# -CONFIG_RAPIDIO_TSI57X=m -CONFIG_RAPIDIO_CPS_XX=m -CONFIG_RAPIDIO_TSI568=m -CONFIG_RAPIDIO_CPS_GEN2=m -CONFIG_RAPIDIO_RXS_GEN3=m -# end of RapidIO Switch drivers - -# -# Generic Driver Options -# -# CONFIG_UEVENT_HELPER is not set -CONFIG_DEVTMPFS=y -CONFIG_DEVTMPFS_MOUNT=y -CONFIG_STANDALONE=y -CONFIG_PREVENT_FIRMWARE_BUILD=y - -# -# Firmware loader -# -CONFIG_FW_LOADER=y -CONFIG_FW_LOADER_PAGED_BUF=y -CONFIG_EXTRA_FIRMWARE="" -# CONFIG_FW_LOADER_USER_HELPER is not set -CONFIG_FW_LOADER_COMPRESS=y -CONFIG_FW_CACHE=y -# end of Firmware loader - -CONFIG_WANT_DEV_COREDUMP=y -CONFIG_ALLOW_DEV_COREDUMP=y -CONFIG_DEV_COREDUMP=y -# CONFIG_DEBUG_DRIVER is not set -# CONFIG_DEBUG_DEVRES is not set -# CONFIG_DEBUG_TEST_DRIVER_REMOVE is not set -CONFIG_HMEM_REPORTING=y -# CONFIG_TEST_ASYNC_DRIVER_PROBE is not set -CONFIG_SYS_HYPERVISOR=y -CONFIG_GENERIC_CPU_AUTOPROBE=y -CONFIG_GENERIC_CPU_VULNERABILITIES=y -CONFIG_REGMAP=y -CONFIG_REGMAP_I2C=y -CONFIG_REGMAP_SLIMBUS=m -CONFIG_REGMAP_SPI=y -CONFIG_REGMAP_SPMI=m -CONFIG_REGMAP_W1=m -CONFIG_REGMAP_MMIO=y -CONFIG_REGMAP_IRQ=y -CONFIG_REGMAP_SOUNDWIRE=m -CONFIG_REGMAP_SCCB=m -CONFIG_REGMAP_I3C=m -CONFIG_DMA_SHARED_BUFFER=y -# CONFIG_DMA_FENCE_TRACE is not set -# end of Generic Driver Options - -# -# Bus devices -# -CONFIG_MOXTET=m -CONFIG_SIMPLE_PM_BUS=y -CONFIG_MHI_BUS=m -# end of Bus devices - -CONFIG_CONNECTOR=y -CONFIG_PROC_EVENTS=y -CONFIG_GNSS=m -CONFIG_GNSS_SERIAL=m -CONFIG_GNSS_MTK_SERIAL=m -CONFIG_GNSS_SIRF_SERIAL=m -CONFIG_GNSS_UBX_SERIAL=m -CONFIG_MTD=m -CONFIG_MTD_TESTS=m - -# -# Partition parsers -# -CONFIG_MTD_AR7_PARTS=m -CONFIG_MTD_CMDLINE_PARTS=m -CONFIG_MTD_OF_PARTS=m -CONFIG_MTD_REDBOOT_PARTS=m -CONFIG_MTD_REDBOOT_DIRECTORY_BLOCK=-1 -# CONFIG_MTD_REDBOOT_PARTS_UNALLOCATED is not set -# CONFIG_MTD_REDBOOT_PARTS_READONLY is not set -# end of Partition parsers - -# -# User Modules And Translation Layers -# -CONFIG_MTD_BLKDEVS=m -CONFIG_MTD_BLOCK=m 
-CONFIG_MTD_BLOCK_RO=m -CONFIG_FTL=m -CONFIG_NFTL=m -CONFIG_NFTL_RW=y -CONFIG_INFTL=m -CONFIG_RFD_FTL=m -CONFIG_SSFDC=m -CONFIG_SM_FTL=m -CONFIG_MTD_OOPS=m -CONFIG_MTD_PSTORE=m -CONFIG_MTD_SWAP=m -CONFIG_MTD_PARTITIONED_MASTER=y - -# -# RAM/ROM/Flash chip drivers -# -CONFIG_MTD_CFI=m -CONFIG_MTD_JEDECPROBE=m -CONFIG_MTD_GEN_PROBE=m -# CONFIG_MTD_CFI_ADV_OPTIONS is not set -CONFIG_MTD_MAP_BANK_WIDTH_1=y -CONFIG_MTD_MAP_BANK_WIDTH_2=y -CONFIG_MTD_MAP_BANK_WIDTH_4=y -CONFIG_MTD_CFI_I1=y -CONFIG_MTD_CFI_I2=y -CONFIG_MTD_CFI_INTELEXT=m -CONFIG_MTD_CFI_AMDSTD=m -CONFIG_MTD_CFI_STAA=m -CONFIG_MTD_CFI_UTIL=m -CONFIG_MTD_RAM=m -CONFIG_MTD_ROM=m -CONFIG_MTD_ABSENT=m -# end of RAM/ROM/Flash chip drivers - -# -# Mapping drivers for chip access -# -CONFIG_MTD_COMPLEX_MAPPINGS=y -CONFIG_MTD_PHYSMAP=m -# CONFIG_MTD_PHYSMAP_COMPAT is not set -CONFIG_MTD_PHYSMAP_OF=y -CONFIG_MTD_PHYSMAP_VERSATILE=y -CONFIG_MTD_PHYSMAP_GEMINI=y -CONFIG_MTD_PHYSMAP_GPIO_ADDR=y -CONFIG_MTD_SBC_GXX=m -CONFIG_MTD_AMD76XROM=m -CONFIG_MTD_ICHXROM=m -CONFIG_MTD_ESB2ROM=m -CONFIG_MTD_CK804XROM=m -CONFIG_MTD_SCB2_FLASH=m -CONFIG_MTD_NETtel=m -CONFIG_MTD_L440GX=m -CONFIG_MTD_PCI=m -CONFIG_MTD_PCMCIA=m -# CONFIG_MTD_PCMCIA_ANONYMOUS is not set -CONFIG_MTD_INTEL_VR_NOR=m -CONFIG_MTD_PLATRAM=m -# end of Mapping drivers for chip access - -# -# Self-contained MTD device drivers -# -CONFIG_MTD_PMC551=m -# CONFIG_MTD_PMC551_BUGFIX is not set -# CONFIG_MTD_PMC551_DEBUG is not set -CONFIG_MTD_DATAFLASH=m -# CONFIG_MTD_DATAFLASH_WRITE_VERIFY is not set -CONFIG_MTD_DATAFLASH_OTP=y -CONFIG_MTD_MCHP23K256=m -CONFIG_MTD_SST25L=m -CONFIG_MTD_SLRAM=m -CONFIG_MTD_PHRAM=m -CONFIG_MTD_MTDRAM=m -CONFIG_MTDRAM_TOTAL_SIZE=4096 -CONFIG_MTDRAM_ERASE_SIZE=128 -CONFIG_MTD_BLOCK2MTD=m - -# -# Disk-On-Chip Device Drivers -# -CONFIG_MTD_DOCG3=m -CONFIG_BCH_CONST_M=14 -CONFIG_BCH_CONST_T=4 -# end of Self-contained MTD device drivers - -CONFIG_MTD_NAND_CORE=m -CONFIG_MTD_ONENAND=m -# CONFIG_MTD_ONENAND_VERIFY_WRITE is not set 
-CONFIG_MTD_ONENAND_GENERIC=m -CONFIG_MTD_ONENAND_OTP=y -CONFIG_MTD_ONENAND_2X_PROGRAM=y -CONFIG_MTD_NAND_ECC_SW_HAMMING=m -CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC=y -CONFIG_MTD_RAW_NAND=m -CONFIG_MTD_NAND_ECC_SW_BCH=y - -# -# Raw/parallel NAND flash controllers -# -CONFIG_MTD_NAND_DENALI=m -CONFIG_MTD_NAND_DENALI_PCI=m -CONFIG_MTD_NAND_DENALI_DT=m -CONFIG_MTD_NAND_CAFE=m -CONFIG_MTD_NAND_MXIC=m -CONFIG_MTD_NAND_GPIO=m -CONFIG_MTD_NAND_PLATFORM=m -CONFIG_MTD_NAND_CADENCE=m -CONFIG_MTD_NAND_ARASAN=m - -# -# Misc -# -CONFIG_MTD_SM_COMMON=m -CONFIG_MTD_NAND_NANDSIM=m -CONFIG_MTD_NAND_RICOH=m -CONFIG_MTD_NAND_DISKONCHIP=m -# CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADVANCED is not set -CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADDRESS=0 -CONFIG_MTD_NAND_DISKONCHIP_BBTWRITE=y -CONFIG_MTD_SPI_NAND=m - -# -# LPDDR & LPDDR2 PCM memory drivers -# -CONFIG_MTD_LPDDR=m -CONFIG_MTD_QINFO_PROBE=m -# end of LPDDR & LPDDR2 PCM memory drivers - -CONFIG_MTD_SPI_NOR=m -CONFIG_MTD_SPI_NOR_USE_4K_SECTORS=y -CONFIG_SPI_INTEL_SPI=m -CONFIG_SPI_INTEL_SPI_PCI=m -CONFIG_SPI_INTEL_SPI_PLATFORM=m -CONFIG_MTD_UBI=m -CONFIG_MTD_UBI_WL_THRESHOLD=4096 -CONFIG_MTD_UBI_BEB_LIMIT=20 -CONFIG_MTD_UBI_FASTMAP=y -CONFIG_MTD_UBI_GLUEBI=m -CONFIG_MTD_UBI_BLOCK=y -CONFIG_MTD_HYPERBUS=m -CONFIG_DTC=y -CONFIG_OF=y -# CONFIG_OF_UNITTEST is not set -CONFIG_OF_FLATTREE=y -CONFIG_OF_EARLY_FLATTREE=y -CONFIG_OF_KOBJ=y -CONFIG_OF_DYNAMIC=y -CONFIG_OF_ADDRESS=y -CONFIG_OF_IRQ=y -CONFIG_OF_NET=y -CONFIG_OF_MDIO=m -CONFIG_OF_RESERVED_MEM=y -CONFIG_OF_RESOLVE=y -CONFIG_OF_OVERLAY=y -CONFIG_ARCH_MIGHT_HAVE_PC_PARPORT=y -CONFIG_PARPORT=m -CONFIG_PARPORT_PC=m -CONFIG_PARPORT_SERIAL=m -CONFIG_PARPORT_PC_FIFO=y -CONFIG_PARPORT_PC_SUPERIO=y -CONFIG_PARPORT_PC_PCMCIA=m -CONFIG_PARPORT_AX88796=m -CONFIG_PARPORT_1284=y -CONFIG_PARPORT_NOT_PC=y -CONFIG_PNP=y -CONFIG_PNP_DEBUG_MESSAGES=y - -# -# Protocols -# -CONFIG_PNPACPI=y -CONFIG_BLK_DEV=y -# CONFIG_BLK_DEV_NULL_BLK is not set -CONFIG_BLK_DEV_FD=m -CONFIG_CDROM=m -# CONFIG_PARIDE is not set 
-CONFIG_BLK_DEV_PCIESSD_MTIP32XX=m -CONFIG_ZRAM=m -CONFIG_ZRAM_WRITEBACK=y -# CONFIG_ZRAM_MEMORY_TRACKING is not set -CONFIG_BLK_DEV_UMEM=m -CONFIG_BLK_DEV_LOOP=m -CONFIG_BLK_DEV_LOOP_MIN_COUNT=8 -CONFIG_BLK_DEV_CRYPTOLOOP=m -CONFIG_BLK_DEV_DRBD=m -# CONFIG_DRBD_FAULT_INJECTION is not set -CONFIG_BLK_DEV_NBD=m -CONFIG_BLK_DEV_SKD=m -CONFIG_BLK_DEV_SX8=m -CONFIG_BLK_DEV_RAM=m -CONFIG_BLK_DEV_RAM_COUNT=16 -CONFIG_BLK_DEV_RAM_SIZE=16384 -CONFIG_CDROM_PKTCDVD=m -CONFIG_CDROM_PKTCDVD_BUFFERS=8 -# CONFIG_CDROM_PKTCDVD_WCACHE is not set -CONFIG_ATA_OVER_ETH=m -CONFIG_XEN_BLKDEV_FRONTEND=m -CONFIG_XEN_BLKDEV_BACKEND=m -CONFIG_VIRTIO_BLK=m -CONFIG_BLK_DEV_RBD=m -CONFIG_BLK_DEV_RSXX=m -CONFIG_BLK_DEV_RNBD=y -CONFIG_BLK_DEV_RNBD_CLIENT=m -CONFIG_BLK_DEV_RNBD_SERVER=m - -# -# NVME Support -# -CONFIG_NVME_CORE=y -CONFIG_BLK_DEV_NVME=y -CONFIG_NVME_MULTIPATH=y -CONFIG_NVME_HWMON=y -CONFIG_NVME_FABRICS=m -CONFIG_NVME_RDMA=m -CONFIG_NVME_FC=m -CONFIG_NVME_TCP=m -CONFIG_NVME_TARGET=m -CONFIG_NVME_TARGET_LOOP=m -CONFIG_NVME_TARGET_RDMA=m -CONFIG_NVME_TARGET_FC=m -CONFIG_NVME_TARGET_FCLOOP=m -CONFIG_NVME_TARGET_TCP=m -# end of NVME Support - -# -# Misc devices -# -CONFIG_SENSORS_LIS3LV02D=m -CONFIG_AD525X_DPOT=m -CONFIG_AD525X_DPOT_I2C=m -CONFIG_AD525X_DPOT_SPI=m -# CONFIG_DUMMY_IRQ is not set -CONFIG_IBM_ASM=m -CONFIG_PHANTOM=m -CONFIG_TIFM_CORE=m -CONFIG_TIFM_7XX1=m -CONFIG_ICS932S401=m -CONFIG_ENCLOSURE_SERVICES=m -CONFIG_HP_ILO=m -CONFIG_APDS9802ALS=m -CONFIG_ISL29003=m -CONFIG_ISL29020=m -CONFIG_SENSORS_TSL2550=m -CONFIG_SENSORS_BH1770=m -CONFIG_SENSORS_APDS990X=m -CONFIG_HMC6352=m -CONFIG_DS1682=m -CONFIG_VMWARE_BALLOON=m -CONFIG_LATTICE_ECP3_CONFIG=m -# CONFIG_SRAM is not set -CONFIG_PCI_ENDPOINT_TEST=m -CONFIG_XILINX_SDFEC=m -CONFIG_MISC_RTSX=m -CONFIG_PVPANIC=m -CONFIG_C2PORT=m -CONFIG_C2PORT_DURAMAR_2150=m - -# -# EEPROM support -# -CONFIG_EEPROM_AT24=m -# CONFIG_EEPROM_AT25 is not set -CONFIG_EEPROM_LEGACY=m -CONFIG_EEPROM_MAX6875=m -CONFIG_EEPROM_93CX6=m -# 
CONFIG_EEPROM_93XX46 is not set -CONFIG_EEPROM_IDT_89HPESX=m -CONFIG_EEPROM_EE1004=m -# end of EEPROM support - -CONFIG_CB710_CORE=m -# CONFIG_CB710_DEBUG is not set -CONFIG_CB710_DEBUG_ASSUMPTIONS=y - -# -# Texas Instruments shared transport line discipline -# -CONFIG_TI_ST=m -# end of Texas Instruments shared transport line discipline - -CONFIG_SENSORS_LIS3_I2C=m -CONFIG_ALTERA_STAPL=m -CONFIG_INTEL_MEI=m -CONFIG_INTEL_MEI_ME=m -CONFIG_INTEL_MEI_TXE=m -CONFIG_INTEL_MEI_HDCP=m -CONFIG_VMWARE_VMCI=m - -# -# Intel MIC & related support -# -CONFIG_INTEL_MIC_BUS=m -CONFIG_SCIF_BUS=m -CONFIG_VOP_BUS=m -CONFIG_INTEL_MIC_HOST=m -CONFIG_INTEL_MIC_CARD=m -CONFIG_SCIF=m -CONFIG_MIC_COSM=m -CONFIG_VOP=m -# end of Intel MIC & related support - -CONFIG_GENWQE=m -CONFIG_GENWQE_PLATFORM_ERROR_RECOVERY=0 -CONFIG_ECHO=m -CONFIG_MISC_ALCOR_PCI=m -CONFIG_MISC_RTSX_PCI=m -CONFIG_MISC_RTSX_USB=m -CONFIG_HABANA_AI=m -CONFIG_UACCE=m -# end of Misc devices - -CONFIG_HAVE_IDE=y -# CONFIG_IDE is not set - -# -# SCSI device support -# -CONFIG_SCSI_MOD=y -CONFIG_RAID_ATTRS=m -CONFIG_SCSI=y -CONFIG_SCSI_DMA=y -CONFIG_SCSI_NETLINK=y -CONFIG_SCSI_PROC_FS=y - -# -# SCSI support type (disk, tape, CD-ROM) -# -CONFIG_BLK_DEV_SD=y -CONFIG_CHR_DEV_ST=m -CONFIG_BLK_DEV_SR=m -CONFIG_CHR_DEV_SG=m -CONFIG_CHR_DEV_SCH=m -CONFIG_SCSI_ENCLOSURE=m -CONFIG_SCSI_CONSTANTS=y -CONFIG_SCSI_LOGGING=y -CONFIG_SCSI_SCAN_ASYNC=y - -# -# SCSI Transports -# -CONFIG_SCSI_SPI_ATTRS=m -CONFIG_SCSI_FC_ATTRS=m -CONFIG_SCSI_ISCSI_ATTRS=m -CONFIG_SCSI_SAS_ATTRS=m -CONFIG_SCSI_SAS_LIBSAS=m -CONFIG_SCSI_SAS_ATA=y -CONFIG_SCSI_SAS_HOST_SMP=y -CONFIG_SCSI_SRP_ATTRS=m -# end of SCSI Transports - -CONFIG_SCSI_LOWLEVEL=y -CONFIG_ISCSI_TCP=m -CONFIG_ISCSI_BOOT_SYSFS=m -CONFIG_SCSI_CXGB3_ISCSI=m -CONFIG_SCSI_CXGB4_ISCSI=m -CONFIG_SCSI_BNX2_ISCSI=m -CONFIG_SCSI_BNX2X_FCOE=m -CONFIG_BE2ISCSI=m -CONFIG_BLK_DEV_3W_XXXX_RAID=m -CONFIG_SCSI_HPSA=m -CONFIG_SCSI_3W_9XXX=m -CONFIG_SCSI_3W_SAS=m -CONFIG_SCSI_ACARD=m -CONFIG_SCSI_AACRAID=m 
-CONFIG_SCSI_AIC7XXX=m -CONFIG_AIC7XXX_CMDS_PER_DEVICE=32 -CONFIG_AIC7XXX_RESET_DELAY_MS=15000 -CONFIG_AIC7XXX_DEBUG_ENABLE=y -CONFIG_AIC7XXX_DEBUG_MASK=0 -CONFIG_AIC7XXX_REG_PRETTY_PRINT=y -CONFIG_SCSI_AIC79XX=m -CONFIG_AIC79XX_CMDS_PER_DEVICE=32 -CONFIG_AIC79XX_RESET_DELAY_MS=15000 -CONFIG_AIC79XX_DEBUG_ENABLE=y -CONFIG_AIC79XX_DEBUG_MASK=0 -CONFIG_AIC79XX_REG_PRETTY_PRINT=y -CONFIG_SCSI_AIC94XX=m -CONFIG_AIC94XX_DEBUG=y -CONFIG_SCSI_MVSAS=m -CONFIG_SCSI_MVSAS_DEBUG=y -CONFIG_SCSI_MVSAS_TASKLET=y -CONFIG_SCSI_MVUMI=m -CONFIG_SCSI_DPT_I2O=m -CONFIG_SCSI_ADVANSYS=m -CONFIG_SCSI_ARCMSR=m -CONFIG_SCSI_ESAS2R=m -CONFIG_MEGARAID_NEWGEN=y -CONFIG_MEGARAID_MM=m -CONFIG_MEGARAID_MAILBOX=m -CONFIG_MEGARAID_LEGACY=m -CONFIG_MEGARAID_SAS=m -CONFIG_SCSI_MPT3SAS=m -CONFIG_SCSI_MPT2SAS_MAX_SGE=128 -CONFIG_SCSI_MPT3SAS_MAX_SGE=128 -CONFIG_SCSI_MPT2SAS=m -CONFIG_SCSI_SMARTPQI=m -CONFIG_SCSI_UFSHCD=m -CONFIG_SCSI_UFSHCD_PCI=m -# CONFIG_SCSI_UFS_DWC_TC_PCI is not set -CONFIG_SCSI_UFSHCD_PLATFORM=m -CONFIG_SCSI_UFS_CDNS_PLATFORM=m -# CONFIG_SCSI_UFS_DWC_TC_PLATFORM is not set -CONFIG_SCSI_UFS_BSG=y -CONFIG_SCSI_HPTIOP=m -CONFIG_SCSI_BUSLOGIC=m -CONFIG_SCSI_FLASHPOINT=y -CONFIG_SCSI_MYRB=m -CONFIG_SCSI_MYRS=m -CONFIG_VMWARE_PVSCSI=m -CONFIG_XEN_SCSI_FRONTEND=m -CONFIG_HYPERV_STORAGE=m -CONFIG_LIBFC=m -CONFIG_LIBFCOE=m -CONFIG_FCOE=m -CONFIG_FCOE_FNIC=m -CONFIG_SCSI_SNIC=m -# CONFIG_SCSI_SNIC_DEBUG_FS is not set -CONFIG_SCSI_DMX3191D=m -CONFIG_SCSI_FDOMAIN=m -CONFIG_SCSI_FDOMAIN_PCI=m -CONFIG_SCSI_GDTH=m -CONFIG_SCSI_ISCI=m -CONFIG_SCSI_IPS=m -CONFIG_SCSI_INITIO=m -CONFIG_SCSI_INIA100=m -CONFIG_SCSI_PPA=m -CONFIG_SCSI_IMM=m -# CONFIG_SCSI_IZIP_EPP16 is not set -# CONFIG_SCSI_IZIP_SLOW_CTR is not set -CONFIG_SCSI_STEX=m -CONFIG_SCSI_SYM53C8XX_2=m -CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=1 -CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16 -CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64 -CONFIG_SCSI_SYM53C8XX_MMIO=y -CONFIG_SCSI_IPR=m -CONFIG_SCSI_IPR_TRACE=y -CONFIG_SCSI_IPR_DUMP=y -CONFIG_SCSI_QLOGIC_1280=m 
-CONFIG_SCSI_QLA_FC=m -CONFIG_TCM_QLA2XXX=m -# CONFIG_TCM_QLA2XXX_DEBUG is not set -CONFIG_SCSI_QLA_ISCSI=m -CONFIG_QEDI=m -CONFIG_QEDF=m -CONFIG_SCSI_LPFC=m -# CONFIG_SCSI_LPFC_DEBUG_FS is not set -CONFIG_SCSI_DC395x=m -CONFIG_SCSI_AM53C974=m -CONFIG_SCSI_WD719X=m -CONFIG_SCSI_DEBUG=m -CONFIG_SCSI_PMCRAID=m -CONFIG_SCSI_PM8001=m -CONFIG_SCSI_BFA_FC=m -CONFIG_SCSI_VIRTIO=m -CONFIG_SCSI_CHELSIO_FCOE=m -CONFIG_SCSI_LOWLEVEL_PCMCIA=y -CONFIG_PCMCIA_AHA152X=m -CONFIG_PCMCIA_FDOMAIN=m -CONFIG_PCMCIA_QLOGIC=m -CONFIG_PCMCIA_SYM53C500=m -CONFIG_SCSI_DH=y -CONFIG_SCSI_DH_RDAC=m -CONFIG_SCSI_DH_HP_SW=m -CONFIG_SCSI_DH_EMC=m -CONFIG_SCSI_DH_ALUA=m -# end of SCSI device support - -CONFIG_ATA=y -CONFIG_SATA_HOST=y -CONFIG_PATA_TIMINGS=y -CONFIG_ATA_VERBOSE_ERROR=y -CONFIG_ATA_FORCE=y -CONFIG_ATA_ACPI=y -CONFIG_SATA_ZPODD=y -CONFIG_SATA_PMP=y - -# -# Controllers with non-SFF native interface -# -CONFIG_SATA_AHCI=y -CONFIG_SATA_MOBILE_LPM_POLICY=3 -CONFIG_SATA_AHCI_PLATFORM=m -CONFIG_AHCI_CEVA=m -CONFIG_AHCI_QORIQ=m -CONFIG_SATA_INIC162X=m -CONFIG_SATA_ACARD_AHCI=m -CONFIG_SATA_SIL24=m -CONFIG_ATA_SFF=y - -# -# SFF controllers with custom DMA interface -# -CONFIG_PDC_ADMA=m -CONFIG_SATA_QSTOR=m -CONFIG_SATA_SX4=m -CONFIG_ATA_BMDMA=y - -# -# SATA SFF controllers with BMDMA -# -CONFIG_ATA_PIIX=m -CONFIG_SATA_DWC=m -# CONFIG_SATA_DWC_OLD_DMA is not set -# CONFIG_SATA_DWC_DEBUG is not set -CONFIG_SATA_MV=m -CONFIG_SATA_NV=m -CONFIG_SATA_PROMISE=m -CONFIG_SATA_SIL=m -CONFIG_SATA_SIS=m -CONFIG_SATA_SVW=m -CONFIG_SATA_ULI=m -CONFIG_SATA_VIA=m -CONFIG_SATA_VITESSE=m - -# -# PATA SFF controllers with BMDMA -# -CONFIG_PATA_ALI=m -CONFIG_PATA_AMD=m -CONFIG_PATA_ARTOP=m -CONFIG_PATA_ATIIXP=m -CONFIG_PATA_ATP867X=m -CONFIG_PATA_CMD64X=m -CONFIG_PATA_CYPRESS=m -CONFIG_PATA_EFAR=m -CONFIG_PATA_HPT366=m -CONFIG_PATA_HPT37X=m -CONFIG_PATA_HPT3X2N=m -CONFIG_PATA_HPT3X3=m -CONFIG_PATA_HPT3X3_DMA=y -CONFIG_PATA_IT8213=m -CONFIG_PATA_IT821X=m -CONFIG_PATA_JMICRON=m -CONFIG_PATA_MARVELL=m 
-CONFIG_PATA_NETCELL=m -CONFIG_PATA_NINJA32=m -CONFIG_PATA_NS87415=m -CONFIG_PATA_OLDPIIX=m -CONFIG_PATA_OPTIDMA=m -CONFIG_PATA_PDC2027X=m -CONFIG_PATA_PDC_OLD=m -CONFIG_PATA_RADISYS=m -CONFIG_PATA_RDC=m -CONFIG_PATA_SCH=m -CONFIG_PATA_SERVERWORKS=m -CONFIG_PATA_SIL680=m -CONFIG_PATA_SIS=m -CONFIG_PATA_TOSHIBA=m -CONFIG_PATA_TRIFLEX=m -CONFIG_PATA_VIA=m -CONFIG_PATA_WINBOND=m - -# -# PIO-only SFF controllers -# -CONFIG_PATA_CMD640_PCI=m -CONFIG_PATA_MPIIX=m -CONFIG_PATA_NS87410=m -CONFIG_PATA_OPTI=m -CONFIG_PATA_PCMCIA=m -# CONFIG_PATA_PLATFORM is not set -CONFIG_PATA_RZ1000=m - -# -# Generic fallback / legacy drivers -# -CONFIG_PATA_ACPI=m -CONFIG_ATA_GENERIC=m -CONFIG_PATA_LEGACY=m -CONFIG_MD=y -CONFIG_BLK_DEV_MD=m -CONFIG_MD_LINEAR=m -CONFIG_MD_RAID0=m -CONFIG_MD_RAID1=m -CONFIG_MD_RAID10=m -CONFIG_MD_RAID456=m -CONFIG_MD_MULTIPATH=m -CONFIG_MD_FAULTY=m -CONFIG_MD_CLUSTER=m -CONFIG_BCACHE=m -# CONFIG_BCACHE_DEBUG is not set -# CONFIG_BCACHE_CLOSURES_DEBUG is not set -CONFIG_BCACHE_ASYNC_REGISTRAION=y -CONFIG_BLK_DEV_DM_BUILTIN=y -CONFIG_BLK_DEV_DM=m -CONFIG_DM_DEBUG=y -CONFIG_DM_BUFIO=m -# CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING is not set -CONFIG_DM_BIO_PRISON=m -CONFIG_DM_PERSISTENT_DATA=m -CONFIG_DM_UNSTRIPED=m -CONFIG_DM_CRYPT=m -CONFIG_DM_SNAPSHOT=m -CONFIG_DM_THIN_PROVISIONING=m -CONFIG_DM_CACHE=m -CONFIG_DM_CACHE_SMQ=m -CONFIG_DM_WRITECACHE=m -CONFIG_DM_EBS=m -CONFIG_DM_ERA=m -CONFIG_DM_CLONE=m -CONFIG_DM_MIRROR=m -CONFIG_DM_LOG_USERSPACE=m -CONFIG_DM_RAID=m -CONFIG_DM_ZERO=m -CONFIG_DM_MULTIPATH=m -CONFIG_DM_MULTIPATH_QL=m -CONFIG_DM_MULTIPATH_ST=m -CONFIG_DM_MULTIPATH_HST=m -CONFIG_DM_DELAY=m -CONFIG_DM_DUST=m -CONFIG_DM_UEVENT=y -CONFIG_DM_FLAKEY=m -CONFIG_DM_VERITY=m -CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG=y -CONFIG_DM_VERITY_FEC=y -CONFIG_DM_SWITCH=m -CONFIG_DM_LOG_WRITES=m -CONFIG_DM_INTEGRITY=m -CONFIG_DM_ZONED=m -CONFIG_TARGET_CORE=m -CONFIG_TCM_IBLOCK=m -CONFIG_TCM_FILEIO=m -CONFIG_TCM_PSCSI=m -CONFIG_TCM_USER2=m -CONFIG_LOOPBACK_TARGET=m 
-CONFIG_TCM_FC=m -CONFIG_ISCSI_TARGET=m -CONFIG_ISCSI_TARGET_CXGB4=m -CONFIG_SBP_TARGET=m -CONFIG_FUSION=y -CONFIG_FUSION_SPI=m -CONFIG_FUSION_FC=m -CONFIG_FUSION_SAS=m -CONFIG_FUSION_MAX_SGE=128 -CONFIG_FUSION_CTL=m -CONFIG_FUSION_LAN=m -# CONFIG_FUSION_LOGGING is not set - -# -# IEEE 1394 (FireWire) support -# -CONFIG_FIREWIRE=m -CONFIG_FIREWIRE_OHCI=m -CONFIG_FIREWIRE_SBP2=m -CONFIG_FIREWIRE_NET=m -CONFIG_FIREWIRE_NOSY=m -# end of IEEE 1394 (FireWire) support - -CONFIG_MACINTOSH_DRIVERS=y -CONFIG_MAC_EMUMOUSEBTN=m -CONFIG_NETDEVICES=y -CONFIG_MII=m -CONFIG_NET_CORE=y -CONFIG_BONDING=m -CONFIG_DUMMY=m -CONFIG_WIREGUARD=m -# CONFIG_WIREGUARD_DEBUG is not set -CONFIG_EQUALIZER=m -CONFIG_NET_FC=y -CONFIG_IFB=m -CONFIG_NET_TEAM=m -CONFIG_NET_TEAM_MODE_BROADCAST=m -CONFIG_NET_TEAM_MODE_ROUNDROBIN=m -CONFIG_NET_TEAM_MODE_RANDOM=m -CONFIG_NET_TEAM_MODE_ACTIVEBACKUP=m -CONFIG_NET_TEAM_MODE_LOADBALANCE=m -CONFIG_MACVLAN=m -CONFIG_MACVTAP=m -CONFIG_IPVLAN_L3S=y -CONFIG_IPVLAN=m -CONFIG_IPVTAP=m -CONFIG_VXLAN=m -CONFIG_GENEVE=m -CONFIG_BAREUDP=m -CONFIG_GTP=m -CONFIG_MACSEC=m -CONFIG_NETCONSOLE=m -CONFIG_NETCONSOLE_DYNAMIC=y -CONFIG_NETPOLL=y -CONFIG_NET_POLL_CONTROLLER=y -CONFIG_NTB_NETDEV=m -CONFIG_RIONET=m -CONFIG_RIONET_TX_SIZE=128 -CONFIG_RIONET_RX_SIZE=128 -CONFIG_TUN=m -CONFIG_TAP=m -# CONFIG_TUN_VNET_CROSS_LE is not set -CONFIG_VETH=m -CONFIG_VIRTIO_NET=m -CONFIG_NLMON=m -CONFIG_NET_VRF=m -CONFIG_VSOCKMON=m -CONFIG_SUNGEM_PHY=m -# CONFIG_ARCNET is not set -CONFIG_ATM_DRIVERS=y -# CONFIG_ATM_DUMMY is not set -CONFIG_ATM_TCP=m -CONFIG_ATM_LANAI=m -CONFIG_ATM_ENI=m -# CONFIG_ATM_ENI_DEBUG is not set -# CONFIG_ATM_ENI_TUNE_BURST is not set -CONFIG_ATM_FIRESTREAM=m -CONFIG_ATM_ZATM=m -# CONFIG_ATM_ZATM_DEBUG is not set -CONFIG_ATM_NICSTAR=m -# CONFIG_ATM_NICSTAR_USE_SUNI is not set -# CONFIG_ATM_NICSTAR_USE_IDT77105 is not set -CONFIG_ATM_IDT77252=m -# CONFIG_ATM_IDT77252_DEBUG is not set -# CONFIG_ATM_IDT77252_RCV_ALL is not set -CONFIG_ATM_IDT77252_USE_SUNI=y 
-CONFIG_ATM_AMBASSADOR=m -# CONFIG_ATM_AMBASSADOR_DEBUG is not set -CONFIG_ATM_HORIZON=m -# CONFIG_ATM_HORIZON_DEBUG is not set -CONFIG_ATM_IA=m -# CONFIG_ATM_IA_DEBUG is not set -CONFIG_ATM_FORE200E=m -CONFIG_ATM_FORE200E_USE_TASKLET=y -CONFIG_ATM_FORE200E_TX_RETRY=16 -CONFIG_ATM_FORE200E_DEBUG=0 -CONFIG_ATM_HE=m -CONFIG_ATM_HE_USE_SUNI=y -CONFIG_ATM_SOLOS=m -CONFIG_CAIF_DRIVERS=y -CONFIG_CAIF_TTY=m -CONFIG_CAIF_SPI_SLAVE=m -CONFIG_CAIF_SPI_SYNC=y -CONFIG_CAIF_HSI=m -CONFIG_CAIF_VIRTIO=m - -# -# Distributed Switch Architecture drivers -# -CONFIG_B53=m -# CONFIG_B53_SPI_DRIVER is not set -CONFIG_B53_MDIO_DRIVER=m -CONFIG_B53_MMAP_DRIVER=m -CONFIG_B53_SRAB_DRIVER=m -CONFIG_B53_SERDES=m -CONFIG_NET_DSA_BCM_SF2=m -CONFIG_NET_DSA_LOOP=m -CONFIG_NET_DSA_LANTIQ_GSWIP=m -CONFIG_NET_DSA_MT7530=m -CONFIG_NET_DSA_MV88E6060=m -CONFIG_NET_DSA_MICROCHIP_KSZ_COMMON=m -CONFIG_NET_DSA_MICROCHIP_KSZ9477=m -CONFIG_NET_DSA_MICROCHIP_KSZ9477_I2C=m -CONFIG_NET_DSA_MICROCHIP_KSZ9477_SPI=m -CONFIG_NET_DSA_MICROCHIP_KSZ8795=m -CONFIG_NET_DSA_MICROCHIP_KSZ8795_SPI=m -CONFIG_NET_DSA_MV88E6XXX=m -CONFIG_NET_DSA_MV88E6XXX_GLOBAL2=y -CONFIG_NET_DSA_MV88E6XXX_PTP=y -CONFIG_NET_DSA_AR9331=m -CONFIG_NET_DSA_SJA1105=m -CONFIG_NET_DSA_SJA1105_PTP=y -CONFIG_NET_DSA_SJA1105_TAS=y -CONFIG_NET_DSA_SJA1105_VL=y -CONFIG_NET_DSA_QCA8K=m -CONFIG_NET_DSA_REALTEK_SMI=m -CONFIG_NET_DSA_SMSC_LAN9303=m -CONFIG_NET_DSA_SMSC_LAN9303_I2C=m -CONFIG_NET_DSA_SMSC_LAN9303_MDIO=m -CONFIG_NET_DSA_VITESSE_VSC73XX=m -CONFIG_NET_DSA_VITESSE_VSC73XX_SPI=m -CONFIG_NET_DSA_VITESSE_VSC73XX_PLATFORM=m -# end of Distributed Switch Architecture drivers - -CONFIG_ETHERNET=y -CONFIG_MDIO=m -CONFIG_NET_VENDOR_3COM=y -CONFIG_PCMCIA_3C574=m -CONFIG_PCMCIA_3C589=m -CONFIG_VORTEX=m -CONFIG_TYPHOON=m -CONFIG_NET_VENDOR_ADAPTEC=y -CONFIG_ADAPTEC_STARFIRE=m -CONFIG_NET_VENDOR_AGERE=y -CONFIG_ET131X=m -CONFIG_NET_VENDOR_ALACRITECH=y -CONFIG_SLICOSS=m -CONFIG_NET_VENDOR_ALTEON=y -CONFIG_ACENIC=m -# CONFIG_ACENIC_OMIT_TIGON_I is not set 
-CONFIG_ALTERA_TSE=m -CONFIG_NET_VENDOR_AMAZON=y -CONFIG_ENA_ETHERNET=m -CONFIG_NET_VENDOR_AMD=y -CONFIG_AMD8111_ETH=m -CONFIG_PCNET32=m -CONFIG_PCMCIA_NMCLAN=m -CONFIG_AMD_XGBE=m -CONFIG_AMD_XGBE_DCB=y -CONFIG_AMD_XGBE_HAVE_ECC=y -CONFIG_NET_VENDOR_AQUANTIA=y -CONFIG_AQTION=m -CONFIG_NET_VENDOR_ARC=y -CONFIG_NET_VENDOR_ATHEROS=y -CONFIG_ATL2=m -CONFIG_ATL1=m -CONFIG_ATL1E=m -CONFIG_ATL1C=m -CONFIG_ALX=m -CONFIG_NET_VENDOR_AURORA=y -CONFIG_AURORA_NB8800=m -CONFIG_NET_VENDOR_BROADCOM=y -CONFIG_B44=m -CONFIG_B44_PCI_AUTOSELECT=y -CONFIG_B44_PCICORE_AUTOSELECT=y -CONFIG_B44_PCI=y -CONFIG_BCMGENET=m -CONFIG_BNX2=m -CONFIG_CNIC=m -CONFIG_TIGON3=m -CONFIG_TIGON3_HWMON=y -CONFIG_BNX2X=m -CONFIG_BNX2X_SRIOV=y -CONFIG_SYSTEMPORT=m -CONFIG_BNXT=m -CONFIG_BNXT_SRIOV=y -CONFIG_BNXT_FLOWER_OFFLOAD=y -CONFIG_BNXT_DCB=y -CONFIG_BNXT_HWMON=y -CONFIG_NET_VENDOR_BROCADE=y -CONFIG_BNA=m -CONFIG_NET_VENDOR_CADENCE=y -CONFIG_MACB=m -CONFIG_MACB_USE_HWSTAMP=y -CONFIG_MACB_PCI=m -CONFIG_NET_VENDOR_CAVIUM=y -CONFIG_THUNDER_NIC_PF=m -CONFIG_THUNDER_NIC_VF=m -CONFIG_THUNDER_NIC_BGX=m -CONFIG_THUNDER_NIC_RGX=m -CONFIG_CAVIUM_PTP=m -CONFIG_LIQUIDIO=m -CONFIG_LIQUIDIO_VF=m -CONFIG_NET_VENDOR_CHELSIO=y -CONFIG_CHELSIO_T1=m -CONFIG_CHELSIO_T1_1G=y -CONFIG_CHELSIO_T3=m -CONFIG_CHELSIO_T4=m -CONFIG_CHELSIO_T4_DCB=y -CONFIG_CHELSIO_T4_FCOE=y -CONFIG_CHELSIO_T4VF=m -CONFIG_CHELSIO_LIB=m -CONFIG_NET_VENDOR_CISCO=y -CONFIG_ENIC=m -CONFIG_NET_VENDOR_CORTINA=y -CONFIG_GEMINI_ETHERNET=m -CONFIG_CX_ECAT=m -CONFIG_DNET=m -CONFIG_NET_VENDOR_DEC=y -CONFIG_NET_TULIP=y -CONFIG_DE2104X=m -CONFIG_DE2104X_DSL=0 -CONFIG_TULIP=m -CONFIG_TULIP_MWI=y -CONFIG_TULIP_MMIO=y -CONFIG_TULIP_NAPI=y -CONFIG_TULIP_NAPI_HW_MITIGATION=y -CONFIG_DE4X5=m -CONFIG_WINBOND_840=m -CONFIG_DM9102=m -CONFIG_ULI526X=m -CONFIG_PCMCIA_XIRCOM=m -CONFIG_NET_VENDOR_DLINK=y -CONFIG_DL2K=m -CONFIG_SUNDANCE=m -# CONFIG_SUNDANCE_MMIO is not set -CONFIG_NET_VENDOR_EMULEX=y -CONFIG_BE2NET=m -CONFIG_BE2NET_HWMON=y -CONFIG_BE2NET_BE2=y 
-CONFIG_BE2NET_BE3=y -CONFIG_BE2NET_LANCER=y -CONFIG_BE2NET_SKYHAWK=y -CONFIG_NET_VENDOR_EZCHIP=y -CONFIG_EZCHIP_NPS_MANAGEMENT_ENET=m -CONFIG_NET_VENDOR_FUJITSU=y -CONFIG_PCMCIA_FMVJ18X=m -CONFIG_NET_VENDOR_GOOGLE=y -CONFIG_GVE=m -CONFIG_NET_VENDOR_HUAWEI=y -CONFIG_HINIC=m -CONFIG_NET_VENDOR_I825XX=y -CONFIG_NET_VENDOR_INTEL=y -CONFIG_E100=m -CONFIG_E1000=m -CONFIG_E1000E=m -CONFIG_E1000E_HWTS=y -CONFIG_IGB=m -CONFIG_IGB_HWMON=y -CONFIG_IGB_DCA=y -CONFIG_IGBVF=m -CONFIG_IXGB=m -CONFIG_IXGBE=m -CONFIG_IXGBE_HWMON=y -CONFIG_IXGBE_DCA=y -CONFIG_IXGBE_DCB=y -# CONFIG_IXGBE_IPSEC is not set -CONFIG_IXGBEVF=m -CONFIG_IXGBEVF_IPSEC=y -CONFIG_I40E=m -CONFIG_I40E_DCB=y -CONFIG_IAVF=m -CONFIG_I40EVF=m -CONFIG_ICE=m -CONFIG_FM10K=m -CONFIG_IGC=m -CONFIG_JME=m -CONFIG_NET_VENDOR_MARVELL=y -CONFIG_MVMDIO=m -CONFIG_SKGE=m -# CONFIG_SKGE_DEBUG is not set -CONFIG_SKGE_GENESIS=y -CONFIG_SKY2=m -# CONFIG_SKY2_DEBUG is not set -CONFIG_NET_VENDOR_MELLANOX=y -CONFIG_MLX4_EN=m -CONFIG_MLX4_EN_DCB=y -CONFIG_MLX4_CORE=m -CONFIG_MLX4_DEBUG=y -CONFIG_MLX4_CORE_GEN2=y -CONFIG_MLX5_CORE=m -CONFIG_MLX5_ACCEL=y -CONFIG_MLX5_FPGA=y -CONFIG_MLX5_CORE_EN=y -CONFIG_MLX5_EN_ARFS=y -CONFIG_MLX5_EN_RXNFC=y -CONFIG_MLX5_MPFS=y -CONFIG_MLX5_ESWITCH=y -CONFIG_MLX5_CLS_ACT=y -CONFIG_MLX5_TC_CT=y -CONFIG_MLX5_CORE_EN_DCB=y -CONFIG_MLX5_CORE_IPOIB=y -CONFIG_MLX5_FPGA_IPSEC=y -CONFIG_MLX5_EN_IPSEC=y -CONFIG_MLX5_FPGA_TLS=y -CONFIG_MLX5_TLS=y -CONFIG_MLX5_EN_TLS=y -CONFIG_MLX5_SW_STEERING=y -CONFIG_MLXSW_CORE=m -CONFIG_MLXSW_CORE_HWMON=y -CONFIG_MLXSW_CORE_THERMAL=y -CONFIG_MLXSW_PCI=m -CONFIG_MLXSW_I2C=m -CONFIG_MLXSW_SWITCHIB=m -CONFIG_MLXSW_SWITCHX2=m -CONFIG_MLXSW_SPECTRUM=m -CONFIG_MLXSW_SPECTRUM_DCB=y -CONFIG_MLXSW_MINIMAL=m -CONFIG_MLXFW=m -CONFIG_NET_VENDOR_MICREL=y -CONFIG_KS8842=m -CONFIG_KS8851=m -CONFIG_KS8851_MLL=m -CONFIG_KSZ884X_PCI=m -CONFIG_NET_VENDOR_MICROCHIP=y -CONFIG_ENC28J60=m -# CONFIG_ENC28J60_WRITEVERIFY is not set -CONFIG_ENCX24J600=m -CONFIG_LAN743X=m -CONFIG_NET_VENDOR_MICROSEMI=y 
-CONFIG_MSCC_OCELOT_SWITCH=m -CONFIG_MSCC_OCELOT_SWITCH_OCELOT=m -CONFIG_NET_VENDOR_MYRI=y -CONFIG_MYRI10GE=m -CONFIG_MYRI10GE_DCA=y -CONFIG_FEALNX=m -CONFIG_NET_VENDOR_NATSEMI=y -CONFIG_NATSEMI=m -CONFIG_NS83820=m -CONFIG_NET_VENDOR_NETERION=y -CONFIG_S2IO=m -CONFIG_VXGE=m -# CONFIG_VXGE_DEBUG_TRACE_ALL is not set -CONFIG_NET_VENDOR_NETRONOME=y -CONFIG_NFP=m -CONFIG_NFP_APP_FLOWER=y -CONFIG_NFP_APP_ABM_NIC=y -# CONFIG_NFP_DEBUG is not set -CONFIG_NET_VENDOR_NI=y -CONFIG_NI_XGE_MANAGEMENT_ENET=m -CONFIG_NET_VENDOR_8390=y -CONFIG_PCMCIA_AXNET=m -CONFIG_NE2K_PCI=m -CONFIG_PCMCIA_PCNET=m -CONFIG_NET_VENDOR_NVIDIA=y -CONFIG_FORCEDETH=m -CONFIG_NET_VENDOR_OKI=y -CONFIG_ETHOC=m -CONFIG_NET_VENDOR_PACKET_ENGINES=y -CONFIG_HAMACHI=m -CONFIG_YELLOWFIN=m -CONFIG_NET_VENDOR_PENSANDO=y -CONFIG_IONIC=m -CONFIG_NET_VENDOR_QLOGIC=y -CONFIG_QLA3XXX=m -CONFIG_QLCNIC=m -CONFIG_QLCNIC_SRIOV=y -CONFIG_QLCNIC_DCB=y -CONFIG_QLCNIC_HWMON=y -CONFIG_NETXEN_NIC=m -CONFIG_QED=m -CONFIG_QED_LL2=y -CONFIG_QED_SRIOV=y -CONFIG_QEDE=m -CONFIG_QED_RDMA=y -CONFIG_QED_ISCSI=y -CONFIG_QED_FCOE=y -CONFIG_QED_OOO=y -CONFIG_NET_VENDOR_QUALCOMM=y -CONFIG_QCA7000=m -CONFIG_QCA7000_SPI=m -CONFIG_QCA7000_UART=m -CONFIG_QCOM_EMAC=m -CONFIG_RMNET=m -CONFIG_NET_VENDOR_RDC=y -CONFIG_R6040=m -CONFIG_NET_VENDOR_REALTEK=y -CONFIG_ATP=m -CONFIG_8139CP=m -CONFIG_8139TOO=m -# CONFIG_8139TOO_PIO is not set -CONFIG_8139TOO_TUNE_TWISTER=y -CONFIG_8139TOO_8129=y -# CONFIG_8139_OLD_RX_RESET is not set -CONFIG_R8169=m -CONFIG_NET_VENDOR_RENESAS=y -CONFIG_NET_VENDOR_ROCKER=y -CONFIG_ROCKER=m -CONFIG_NET_VENDOR_SAMSUNG=y -CONFIG_SXGBE_ETH=m -CONFIG_NET_VENDOR_SEEQ=y -CONFIG_NET_VENDOR_SOLARFLARE=y -CONFIG_SFC=m -CONFIG_SFC_MTD=y -CONFIG_SFC_MCDI_MON=y -CONFIG_SFC_SRIOV=y -CONFIG_SFC_MCDI_LOGGING=y -CONFIG_SFC_FALCON=m -CONFIG_SFC_FALCON_MTD=y -CONFIG_NET_VENDOR_SILAN=y -CONFIG_SC92031=m -CONFIG_NET_VENDOR_SIS=y -CONFIG_SIS900=m -CONFIG_SIS190=m -CONFIG_NET_VENDOR_SMSC=y -CONFIG_PCMCIA_SMC91C92=m -CONFIG_EPIC100=m 
-CONFIG_SMSC911X=m -CONFIG_SMSC9420=m -CONFIG_NET_VENDOR_SOCIONEXT=y -CONFIG_NET_VENDOR_STMICRO=y -CONFIG_STMMAC_ETH=m -# CONFIG_STMMAC_SELFTESTS is not set -CONFIG_STMMAC_PLATFORM=m -CONFIG_DWMAC_DWC_QOS_ETH=m -CONFIG_DWMAC_GENERIC=m -CONFIG_DWMAC_INTEL=m -CONFIG_STMMAC_PCI=m -CONFIG_NET_VENDOR_SUN=y -CONFIG_HAPPYMEAL=m -CONFIG_SUNGEM=m -CONFIG_CASSINI=m -CONFIG_NIU=m -CONFIG_NET_VENDOR_SYNOPSYS=y -CONFIG_DWC_XLGMAC=m -CONFIG_DWC_XLGMAC_PCI=m -CONFIG_NET_VENDOR_TEHUTI=y -CONFIG_TEHUTI=m -CONFIG_NET_VENDOR_TI=y -# CONFIG_TI_CPSW_PHY_SEL is not set -CONFIG_TLAN=m -CONFIG_NET_VENDOR_VIA=y -CONFIG_VIA_RHINE=m -CONFIG_VIA_RHINE_MMIO=y -CONFIG_VIA_VELOCITY=m -CONFIG_NET_VENDOR_WIZNET=y -CONFIG_WIZNET_W5100=m -CONFIG_WIZNET_W5300=m -# CONFIG_WIZNET_BUS_DIRECT is not set -# CONFIG_WIZNET_BUS_INDIRECT is not set -CONFIG_WIZNET_BUS_ANY=y -CONFIG_WIZNET_W5100_SPI=m -CONFIG_NET_VENDOR_XILINX=y -CONFIG_XILINX_AXI_EMAC=m -CONFIG_XILINX_LL_TEMAC=m -CONFIG_NET_VENDOR_XIRCOM=y -CONFIG_PCMCIA_XIRC2PS=m -CONFIG_FDDI=m -CONFIG_DEFXX=m -CONFIG_DEFXX_MMIO=y -CONFIG_SKFP=m -# CONFIG_HIPPI is not set -CONFIG_NET_SB1000=m -CONFIG_MDIO_DEVICE=m -CONFIG_MDIO_BUS=m -CONFIG_MDIO_BCM_UNIMAC=m -CONFIG_MDIO_BITBANG=m -CONFIG_MDIO_BUS_MUX=m -CONFIG_MDIO_BUS_MUX_GPIO=m -CONFIG_MDIO_BUS_MUX_MMIOREG=m -CONFIG_MDIO_BUS_MUX_MULTIPLEXER=m -CONFIG_MDIO_CAVIUM=m -CONFIG_MDIO_GPIO=m -CONFIG_MDIO_HISI_FEMAC=m -CONFIG_MDIO_I2C=m -CONFIG_MDIO_IPQ4019=m -CONFIG_MDIO_IPQ8064=m -CONFIG_MDIO_MSCC_MIIM=m -CONFIG_MDIO_MVUSB=m -CONFIG_MDIO_OCTEON=m -CONFIG_MDIO_THUNDER=m -CONFIG_MDIO_XPCS=m -CONFIG_PHYLINK=m -CONFIG_PHYLIB=m -CONFIG_SWPHY=y -CONFIG_LED_TRIGGER_PHY=y - -# -# MII PHY device drivers -# -CONFIG_SFP=m -CONFIG_ADIN_PHY=m -CONFIG_AMD_PHY=m -CONFIG_AQUANTIA_PHY=m -CONFIG_AX88796B_PHY=m -CONFIG_BCM7XXX_PHY=m -CONFIG_BCM87XX_PHY=m -CONFIG_BCM_NET_PHYLIB=m -CONFIG_BROADCOM_PHY=m -CONFIG_BCM54140_PHY=m -CONFIG_BCM84881_PHY=m -CONFIG_CICADA_PHY=m -CONFIG_CORTINA_PHY=m -CONFIG_DAVICOM_PHY=m -CONFIG_DP83822_PHY=m 
-CONFIG_DP83TC811_PHY=m -CONFIG_DP83848_PHY=m -CONFIG_DP83867_PHY=m -CONFIG_DP83869_PHY=m -CONFIG_FIXED_PHY=m -CONFIG_ICPLUS_PHY=m -CONFIG_INTEL_XWAY_PHY=m -CONFIG_LSI_ET1011C_PHY=m -CONFIG_LXT_PHY=m -CONFIG_MARVELL_PHY=m -CONFIG_MARVELL_10G_PHY=m -CONFIG_MICREL_PHY=m -CONFIG_MICROCHIP_PHY=m -CONFIG_MICROCHIP_T1_PHY=m -CONFIG_MICROSEMI_PHY=m -CONFIG_NATIONAL_PHY=m -CONFIG_NXP_TJA11XX_PHY=m -CONFIG_AT803X_PHY=m -CONFIG_QSEMI_PHY=m -CONFIG_REALTEK_PHY=m -CONFIG_RENESAS_PHY=m -CONFIG_ROCKCHIP_PHY=m -CONFIG_SMSC_PHY=m -CONFIG_STE10XP=m -CONFIG_TERANETICS_PHY=m -CONFIG_VITESSE_PHY=m -CONFIG_XILINX_GMII2RGMII=m -CONFIG_MICREL_KS8995MA=m -CONFIG_PLIP=m -CONFIG_PPP=m -CONFIG_PPP_BSDCOMP=m -CONFIG_PPP_DEFLATE=m -CONFIG_PPP_FILTER=y -CONFIG_PPP_MPPE=m -CONFIG_PPP_MULTILINK=y -CONFIG_PPPOATM=m -CONFIG_PPPOE=m -CONFIG_PPTP=m -CONFIG_PPPOL2TP=m -CONFIG_PPP_ASYNC=m -CONFIG_PPP_SYNC_TTY=m -CONFIG_SLIP=m -CONFIG_SLHC=m -CONFIG_SLIP_COMPRESSED=y -CONFIG_SLIP_SMART=y -CONFIG_SLIP_MODE_SLIP6=y -CONFIG_USB_NET_DRIVERS=m -CONFIG_USB_CATC=m -CONFIG_USB_KAWETH=m -CONFIG_USB_PEGASUS=m -CONFIG_USB_RTL8150=m -CONFIG_USB_RTL8152=m -CONFIG_USB_LAN78XX=m -CONFIG_USB_USBNET=m -CONFIG_USB_NET_AX8817X=m -CONFIG_USB_NET_AX88179_178A=m -CONFIG_USB_NET_CDCETHER=m -CONFIG_USB_NET_CDC_EEM=m -CONFIG_USB_NET_CDC_NCM=m -CONFIG_USB_NET_HUAWEI_CDC_NCM=m -CONFIG_USB_NET_CDC_MBIM=m -CONFIG_USB_NET_DM9601=m -CONFIG_USB_NET_SR9700=m -CONFIG_USB_NET_SR9800=m -CONFIG_USB_NET_SMSC75XX=m -CONFIG_USB_NET_SMSC95XX=m -CONFIG_USB_NET_GL620A=m -CONFIG_USB_NET_NET1080=m -CONFIG_USB_NET_PLUSB=m -CONFIG_USB_NET_MCS7830=m -CONFIG_USB_NET_RNDIS_HOST=m -CONFIG_USB_NET_CDC_SUBSET_ENABLE=m -CONFIG_USB_NET_CDC_SUBSET=m -CONFIG_USB_ALI_M5632=y -CONFIG_USB_AN2720=y -CONFIG_USB_BELKIN=y -CONFIG_USB_ARMLINUX=y -CONFIG_USB_EPSON2888=y -CONFIG_USB_KC2190=y -CONFIG_USB_NET_ZAURUS=m -CONFIG_USB_NET_CX82310_ETH=m -CONFIG_USB_NET_KALMIA=m -CONFIG_USB_NET_QMI_WWAN=m -CONFIG_USB_HSO=m -CONFIG_USB_NET_INT51X1=m -CONFIG_USB_CDC_PHONET=m 
-CONFIG_USB_IPHETH=m -CONFIG_USB_SIERRA_NET=m -CONFIG_USB_VL600=m -CONFIG_USB_NET_CH9200=m -CONFIG_USB_NET_AQC111=m -CONFIG_WLAN=y -# CONFIG_WIRELESS_WDS is not set -CONFIG_WLAN_VENDOR_ADMTEK=y -CONFIG_ADM8211=m -CONFIG_ATH_COMMON=m -CONFIG_WLAN_VENDOR_ATH=y -# CONFIG_ATH_DEBUG is not set -CONFIG_ATH5K=m -CONFIG_ATH5K_DEBUG=y -CONFIG_ATH5K_TRACER=y -CONFIG_ATH5K_PCI=y -CONFIG_ATH9K_HW=m -CONFIG_ATH9K_COMMON=m -CONFIG_ATH9K_COMMON_DEBUG=y -CONFIG_ATH9K_BTCOEX_SUPPORT=y -CONFIG_ATH9K=m -CONFIG_ATH9K_PCI=y -CONFIG_ATH9K_AHB=y -CONFIG_ATH9K_DEBUGFS=y -CONFIG_ATH9K_STATION_STATISTICS=y -CONFIG_ATH9K_DYNACK=y -CONFIG_ATH9K_WOW=y -CONFIG_ATH9K_RFKILL=y -CONFIG_ATH9K_CHANNEL_CONTEXT=y -CONFIG_ATH9K_PCOEM=y -CONFIG_ATH9K_PCI_NO_EEPROM=m -CONFIG_ATH9K_HTC=m -CONFIG_ATH9K_HTC_DEBUGFS=y -CONFIG_ATH9K_HWRNG=y -CONFIG_ATH9K_COMMON_SPECTRAL=y -CONFIG_CARL9170=m -CONFIG_CARL9170_LEDS=y -CONFIG_CARL9170_DEBUGFS=y -CONFIG_CARL9170_WPC=y -# CONFIG_CARL9170_HWRNG is not set -CONFIG_ATH6KL=m -CONFIG_ATH6KL_SDIO=m -CONFIG_ATH6KL_USB=m -CONFIG_ATH6KL_DEBUG=y -CONFIG_ATH6KL_TRACING=y -CONFIG_AR5523=m -CONFIG_WIL6210=m -CONFIG_WIL6210_ISR_COR=y -CONFIG_WIL6210_TRACING=y -CONFIG_WIL6210_DEBUGFS=y -CONFIG_ATH10K=m -CONFIG_ATH10K_CE=y -CONFIG_ATH10K_PCI=m -CONFIG_ATH10K_AHB=y -CONFIG_ATH10K_SDIO=m -CONFIG_ATH10K_USB=m -CONFIG_ATH10K_DEBUG=y -CONFIG_ATH10K_DEBUGFS=y -CONFIG_ATH10K_SPECTRAL=y -CONFIG_ATH10K_TRACING=y -CONFIG_WCN36XX=m -CONFIG_WCN36XX_DEBUGFS=y -CONFIG_WLAN_VENDOR_ATMEL=y -CONFIG_ATMEL=m -CONFIG_PCI_ATMEL=m -CONFIG_PCMCIA_ATMEL=m -CONFIG_AT76C50X_USB=m -CONFIG_WLAN_VENDOR_BROADCOM=y -CONFIG_B43=m -CONFIG_B43_BCMA=y -CONFIG_B43_SSB=y -CONFIG_B43_BUSES_BCMA_AND_SSB=y -# CONFIG_B43_BUSES_BCMA is not set -# CONFIG_B43_BUSES_SSB is not set -CONFIG_B43_PCI_AUTOSELECT=y -CONFIG_B43_PCICORE_AUTOSELECT=y -CONFIG_B43_SDIO=y -CONFIG_B43_BCMA_PIO=y -CONFIG_B43_PIO=y -CONFIG_B43_PHY_G=y -CONFIG_B43_PHY_N=y -CONFIG_B43_PHY_LP=y -CONFIG_B43_PHY_HT=y -CONFIG_B43_LEDS=y -CONFIG_B43_HWRNG=y -# 
CONFIG_B43_DEBUG is not set -CONFIG_B43LEGACY=m -CONFIG_B43LEGACY_PCI_AUTOSELECT=y -CONFIG_B43LEGACY_PCICORE_AUTOSELECT=y -CONFIG_B43LEGACY_LEDS=y -CONFIG_B43LEGACY_HWRNG=y -CONFIG_B43LEGACY_DEBUG=y -CONFIG_B43LEGACY_DMA=y -CONFIG_B43LEGACY_PIO=y -CONFIG_B43LEGACY_DMA_AND_PIO_MODE=y -# CONFIG_B43LEGACY_DMA_MODE is not set -# CONFIG_B43LEGACY_PIO_MODE is not set -CONFIG_BRCMUTIL=m -CONFIG_BRCMSMAC=m -CONFIG_BRCMFMAC=m -CONFIG_BRCMFMAC_PROTO_BCDC=y -CONFIG_BRCMFMAC_PROTO_MSGBUF=y -CONFIG_BRCMFMAC_SDIO=y -CONFIG_BRCMFMAC_USB=y -CONFIG_BRCMFMAC_PCIE=y -CONFIG_BRCM_TRACING=y -CONFIG_BRCMDBG=y -CONFIG_WLAN_VENDOR_CISCO=y -CONFIG_AIRO=m -CONFIG_AIRO_CS=m -CONFIG_WLAN_VENDOR_INTEL=y -CONFIG_IPW2100=m -CONFIG_IPW2100_MONITOR=y -# CONFIG_IPW2100_DEBUG is not set -CONFIG_IPW2200=m -CONFIG_IPW2200_MONITOR=y -CONFIG_IPW2200_RADIOTAP=y -CONFIG_IPW2200_PROMISCUOUS=y -CONFIG_IPW2200_QOS=y -# CONFIG_IPW2200_DEBUG is not set -CONFIG_LIBIPW=m -# CONFIG_LIBIPW_DEBUG is not set -CONFIG_IWLEGACY=m -CONFIG_IWL4965=m -CONFIG_IWL3945=m - -# -# iwl3945 / iwl4965 Debugging Options -# -CONFIG_IWLEGACY_DEBUG=y -CONFIG_IWLEGACY_DEBUGFS=y -# end of iwl3945 / iwl4965 Debugging Options - -CONFIG_IWLWIFI=m -CONFIG_IWLWIFI_LEDS=y -CONFIG_IWLDVM=m -CONFIG_IWLMVM=m -CONFIG_IWLWIFI_OPMODE_MODULAR=y -# CONFIG_IWLWIFI_BCAST_FILTERING is not set - -# -# Debugging Options -# -CONFIG_IWLWIFI_DEBUG=y -CONFIG_IWLWIFI_DEBUGFS=y -CONFIG_IWLWIFI_DEVICE_TRACING=y -# end of Debugging Options - -CONFIG_WLAN_VENDOR_INTERSIL=y -CONFIG_HOSTAP=m -CONFIG_HOSTAP_FIRMWARE=y -CONFIG_HOSTAP_FIRMWARE_NVRAM=y -CONFIG_HOSTAP_PLX=m -CONFIG_HOSTAP_PCI=m -CONFIG_HOSTAP_CS=m -CONFIG_HERMES=m -CONFIG_HERMES_PRISM=y -CONFIG_HERMES_CACHE_FW_ON_INIT=y -CONFIG_PLX_HERMES=m -CONFIG_TMD_HERMES=m -CONFIG_NORTEL_HERMES=m -CONFIG_PCI_HERMES=m -CONFIG_PCMCIA_HERMES=m -CONFIG_PCMCIA_SPECTRUM=m -CONFIG_ORINOCO_USB=m -CONFIG_P54_COMMON=m -CONFIG_P54_USB=m -CONFIG_P54_PCI=m -CONFIG_P54_SPI=m -# CONFIG_P54_SPI_DEFAULT_EEPROM is not set 
-CONFIG_P54_LEDS=y -CONFIG_PRISM54=m -CONFIG_WLAN_VENDOR_MARVELL=y -CONFIG_LIBERTAS=m -CONFIG_LIBERTAS_USB=m -CONFIG_LIBERTAS_CS=m -CONFIG_LIBERTAS_SDIO=m -CONFIG_LIBERTAS_SPI=m -# CONFIG_LIBERTAS_DEBUG is not set -CONFIG_LIBERTAS_MESH=y -CONFIG_LIBERTAS_THINFIRM=m -# CONFIG_LIBERTAS_THINFIRM_DEBUG is not set -CONFIG_LIBERTAS_THINFIRM_USB=m -CONFIG_MWIFIEX=m -CONFIG_MWIFIEX_SDIO=m -CONFIG_MWIFIEX_PCIE=m -CONFIG_MWIFIEX_USB=m -CONFIG_MWL8K=m -CONFIG_WLAN_VENDOR_MEDIATEK=y -CONFIG_MT7601U=m -CONFIG_MT76_CORE=m -CONFIG_MT76_LEDS=y -CONFIG_MT76_USB=m -CONFIG_MT76x02_LIB=m -CONFIG_MT76x02_USB=m -CONFIG_MT76x0_COMMON=m -CONFIG_MT76x0U=m -CONFIG_MT76x0E=m -CONFIG_MT76x2_COMMON=m -CONFIG_MT76x2E=m -CONFIG_MT76x2U=m -CONFIG_MT7603E=m -CONFIG_MT7615_COMMON=m -CONFIG_MT7615E=m -CONFIG_MT7663U=m -CONFIG_MT7915E=m -CONFIG_WLAN_VENDOR_RALINK=y -CONFIG_RT2X00=m -CONFIG_RT2400PCI=m -CONFIG_RT2500PCI=m -CONFIG_RT61PCI=m -CONFIG_RT2800PCI=m -CONFIG_RT2800PCI_RT33XX=y -CONFIG_RT2800PCI_RT35XX=y -CONFIG_RT2800PCI_RT53XX=y -CONFIG_RT2800PCI_RT3290=y -CONFIG_RT2500USB=m -CONFIG_RT73USB=m -CONFIG_RT2800USB=m -CONFIG_RT2800USB_RT33XX=y -CONFIG_RT2800USB_RT35XX=y -CONFIG_RT2800USB_RT3573=y -CONFIG_RT2800USB_RT53XX=y -CONFIG_RT2800USB_RT55XX=y -CONFIG_RT2800USB_UNKNOWN=y -CONFIG_RT2800_LIB=m -CONFIG_RT2800_LIB_MMIO=m -CONFIG_RT2X00_LIB_MMIO=m -CONFIG_RT2X00_LIB_PCI=m -CONFIG_RT2X00_LIB_USB=m -CONFIG_RT2X00_LIB=m -CONFIG_RT2X00_LIB_FIRMWARE=y -CONFIG_RT2X00_LIB_CRYPTO=y -CONFIG_RT2X00_LIB_LEDS=y -CONFIG_RT2X00_LIB_DEBUGFS=y -# CONFIG_RT2X00_DEBUG is not set -CONFIG_WLAN_VENDOR_REALTEK=y -CONFIG_RTL8180=m -CONFIG_RTL8187=m -CONFIG_RTL8187_LEDS=y -CONFIG_RTL_CARDS=m -CONFIG_RTL8192CE=m -CONFIG_RTL8192SE=m -CONFIG_RTL8192DE=m -CONFIG_RTL8723AE=m -CONFIG_RTL8723BE=m -CONFIG_RTL8188EE=m -CONFIG_RTL8192EE=m -CONFIG_RTL8821AE=m -CONFIG_RTL8192CU=m -CONFIG_RTLWIFI=m -CONFIG_RTLWIFI_PCI=m -CONFIG_RTLWIFI_USB=m -CONFIG_RTLWIFI_DEBUG=y -CONFIG_RTL8192C_COMMON=m -CONFIG_RTL8723_COMMON=m 
-CONFIG_RTLBTCOEXIST=m -CONFIG_RTL8XXXU=m -CONFIG_RTL8XXXU_UNTESTED=y -CONFIG_RTW88=m -CONFIG_RTW88_CORE=m -CONFIG_RTW88_PCI=m -CONFIG_RTW88_8822B=m -CONFIG_RTW88_8822C=m -CONFIG_RTW88_8723D=m -CONFIG_RTW88_8822BE=m -CONFIG_RTW88_8822CE=m -CONFIG_RTW88_8723DE=m -CONFIG_RTW88_DEBUG=y -CONFIG_RTW88_DEBUGFS=y -CONFIG_WLAN_VENDOR_RSI=y -CONFIG_RSI_91X=m -CONFIG_RSI_DEBUGFS=y -CONFIG_RSI_SDIO=m -CONFIG_RSI_USB=m -CONFIG_RSI_COEX=y -CONFIG_WLAN_VENDOR_ST=y -CONFIG_CW1200=m -CONFIG_CW1200_WLAN_SDIO=m -CONFIG_CW1200_WLAN_SPI=m -CONFIG_WLAN_VENDOR_TI=y -CONFIG_WL1251=m -CONFIG_WL1251_SPI=m -CONFIG_WL1251_SDIO=m -CONFIG_WL12XX=m -CONFIG_WL18XX=m -CONFIG_WLCORE=m -CONFIG_WLCORE_SPI=m -CONFIG_WLCORE_SDIO=m -CONFIG_WILINK_PLATFORM_DATA=y -CONFIG_WLAN_VENDOR_ZYDAS=y -CONFIG_USB_ZD1201=m -CONFIG_ZD1211RW=m -# CONFIG_ZD1211RW_DEBUG is not set -CONFIG_WLAN_VENDOR_QUANTENNA=y -CONFIG_QTNFMAC=m -CONFIG_QTNFMAC_PCIE=m -CONFIG_PCMCIA_RAYCS=m -CONFIG_PCMCIA_WL3501=m -CONFIG_MAC80211_HWSIM=m -CONFIG_USB_NET_RNDIS_WLAN=m -CONFIG_VIRT_WIFI=m - -# -# WiMAX Wireless Broadband devices -# -CONFIG_WIMAX_I2400M=m -CONFIG_WIMAX_I2400M_USB=m -CONFIG_WIMAX_I2400M_DEBUG_LEVEL=8 -# end of WiMAX Wireless Broadband devices - -# CONFIG_WAN is not set -CONFIG_IEEE802154_DRIVERS=m -CONFIG_IEEE802154_FAKELB=m -CONFIG_IEEE802154_AT86RF230=m -# CONFIG_IEEE802154_AT86RF230_DEBUGFS is not set -CONFIG_IEEE802154_MRF24J40=m -CONFIG_IEEE802154_CC2520=m -CONFIG_IEEE802154_ATUSB=m -CONFIG_IEEE802154_ADF7242=m -CONFIG_IEEE802154_CA8210=m -# CONFIG_IEEE802154_CA8210_DEBUGFS is not set -CONFIG_IEEE802154_MCR20A=m -CONFIG_IEEE802154_HWSIM=m -CONFIG_XEN_NETDEV_FRONTEND=m -CONFIG_XEN_NETDEV_BACKEND=m -CONFIG_VMXNET3=m -CONFIG_FUJITSU_ES=m -CONFIG_USB4_NET=m -CONFIG_HYPERV_NET=m -CONFIG_NETDEVSIM=m -CONFIG_NET_FAILOVER=m -CONFIG_ISDN=y -CONFIG_ISDN_CAPI=y -CONFIG_CAPI_TRACE=y -CONFIG_ISDN_CAPI_MIDDLEWARE=y -CONFIG_MISDN=m -CONFIG_MISDN_DSP=m -CONFIG_MISDN_L1OIP=m - -# -# mISDN hardware drivers -# -CONFIG_MISDN_HFCPCI=m 
-CONFIG_MISDN_HFCMULTI=m -CONFIG_MISDN_HFCUSB=m -CONFIG_MISDN_AVMFRITZ=m -CONFIG_MISDN_SPEEDFAX=m -CONFIG_MISDN_INFINEON=m -CONFIG_MISDN_W6692=m -CONFIG_MISDN_NETJET=m -CONFIG_MISDN_HDLC=m -CONFIG_MISDN_IPAC=m -CONFIG_MISDN_ISAR=m -CONFIG_NVM=y -CONFIG_NVM_PBLK=m -# CONFIG_NVM_PBLK_DEBUG is not set - -# -# Input device support -# -CONFIG_INPUT=y -CONFIG_INPUT_LEDS=m -CONFIG_INPUT_FF_MEMLESS=m -CONFIG_INPUT_POLLDEV=m -CONFIG_INPUT_SPARSEKMAP=m -CONFIG_INPUT_MATRIXKMAP=m - -# -# Userland interfaces -# -CONFIG_INPUT_MOUSEDEV=m -CONFIG_INPUT_MOUSEDEV_PSAUX=y -CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024 -CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768 -CONFIG_INPUT_JOYDEV=m -CONFIG_INPUT_EVDEV=m -# CONFIG_INPUT_EVBUG is not set - -# -# Input Device Drivers -# -CONFIG_INPUT_KEYBOARD=y -CONFIG_KEYBOARD_ADC=m -CONFIG_KEYBOARD_ADP5520=m -CONFIG_KEYBOARD_ADP5588=m -CONFIG_KEYBOARD_ADP5589=m -CONFIG_KEYBOARD_APPLESPI=m -CONFIG_KEYBOARD_ATKBD=m -CONFIG_KEYBOARD_QT1050=m -CONFIG_KEYBOARD_QT1070=m -CONFIG_KEYBOARD_QT2160=m -CONFIG_KEYBOARD_DLINK_DIR685=m -CONFIG_KEYBOARD_LKKBD=m -CONFIG_KEYBOARD_GPIO=m -CONFIG_KEYBOARD_GPIO_POLLED=m -CONFIG_KEYBOARD_TCA6416=m -CONFIG_KEYBOARD_TCA8418=m -CONFIG_KEYBOARD_MATRIX=m -CONFIG_KEYBOARD_LM8323=m -CONFIG_KEYBOARD_LM8333=m -CONFIG_KEYBOARD_MAX7359=m -CONFIG_KEYBOARD_MCS=m -CONFIG_KEYBOARD_MPR121=m -CONFIG_KEYBOARD_NEWTON=m -CONFIG_KEYBOARD_OPENCORES=m -CONFIG_KEYBOARD_SAMSUNG=m -CONFIG_KEYBOARD_STOWAWAY=m -CONFIG_KEYBOARD_SUNKBD=m -CONFIG_KEYBOARD_STMPE=m -CONFIG_KEYBOARD_IQS62X=m -CONFIG_KEYBOARD_OMAP4=m -CONFIG_KEYBOARD_TC3589X=m -CONFIG_KEYBOARD_TM2_TOUCHKEY=m -CONFIG_KEYBOARD_TWL4030=m -CONFIG_KEYBOARD_XTKBD=m -CONFIG_KEYBOARD_CROS_EC=m -CONFIG_KEYBOARD_CAP11XX=m -CONFIG_KEYBOARD_BCM=m -CONFIG_KEYBOARD_MTK_PMIC=m -CONFIG_INPUT_MOUSE=y -CONFIG_MOUSE_PS2=m -CONFIG_MOUSE_PS2_ALPS=y -CONFIG_MOUSE_PS2_BYD=y -CONFIG_MOUSE_PS2_LOGIPS2PP=y -CONFIG_MOUSE_PS2_SYNAPTICS=y -CONFIG_MOUSE_PS2_SYNAPTICS_SMBUS=y -CONFIG_MOUSE_PS2_CYPRESS=y -CONFIG_MOUSE_PS2_LIFEBOOK=y 
-CONFIG_MOUSE_PS2_TRACKPOINT=y -CONFIG_MOUSE_PS2_ELANTECH=y -CONFIG_MOUSE_PS2_ELANTECH_SMBUS=y -CONFIG_MOUSE_PS2_SENTELIC=y -CONFIG_MOUSE_PS2_TOUCHKIT=y -CONFIG_MOUSE_PS2_FOCALTECH=y -CONFIG_MOUSE_PS2_VMMOUSE=y -CONFIG_MOUSE_PS2_SMBUS=y -CONFIG_MOUSE_SERIAL=m -CONFIG_MOUSE_APPLETOUCH=m -CONFIG_MOUSE_BCM5974=m -CONFIG_MOUSE_CYAPA=m -CONFIG_MOUSE_ELAN_I2C=m -CONFIG_MOUSE_ELAN_I2C_I2C=y -CONFIG_MOUSE_ELAN_I2C_SMBUS=y -CONFIG_MOUSE_VSXXXAA=m -CONFIG_MOUSE_GPIO=m -CONFIG_MOUSE_SYNAPTICS_I2C=m -CONFIG_MOUSE_SYNAPTICS_USB=m -CONFIG_INPUT_JOYSTICK=y -CONFIG_JOYSTICK_ANALOG=m -CONFIG_JOYSTICK_A3D=m -CONFIG_JOYSTICK_ADI=m -CONFIG_JOYSTICK_COBRA=m -CONFIG_JOYSTICK_GF2K=m -CONFIG_JOYSTICK_GRIP=m -CONFIG_JOYSTICK_GRIP_MP=m -CONFIG_JOYSTICK_GUILLEMOT=m -CONFIG_JOYSTICK_INTERACT=m -CONFIG_JOYSTICK_SIDEWINDER=m -CONFIG_JOYSTICK_TMDC=m -CONFIG_JOYSTICK_IFORCE=m -CONFIG_JOYSTICK_IFORCE_USB=m -CONFIG_JOYSTICK_IFORCE_232=m -CONFIG_JOYSTICK_WARRIOR=m -CONFIG_JOYSTICK_MAGELLAN=m -CONFIG_JOYSTICK_SPACEORB=m -CONFIG_JOYSTICK_SPACEBALL=m -CONFIG_JOYSTICK_STINGER=m -CONFIG_JOYSTICK_TWIDJOY=m -CONFIG_JOYSTICK_ZHENHUA=m -CONFIG_JOYSTICK_DB9=m -CONFIG_JOYSTICK_GAMECON=m -CONFIG_JOYSTICK_TURBOGRAFX=m -CONFIG_JOYSTICK_AS5011=m -CONFIG_JOYSTICK_JOYDUMP=m -CONFIG_JOYSTICK_XPAD=m -CONFIG_JOYSTICK_XPAD_FF=y -CONFIG_JOYSTICK_XPAD_LEDS=y -CONFIG_JOYSTICK_WALKERA0701=m -CONFIG_JOYSTICK_PSXPAD_SPI=m -CONFIG_JOYSTICK_PSXPAD_SPI_FF=y -CONFIG_JOYSTICK_PXRC=m -CONFIG_JOYSTICK_FSIA6B=m -CONFIG_INPUT_TABLET=y -CONFIG_TABLET_USB_ACECAD=m -CONFIG_TABLET_USB_AIPTEK=m -CONFIG_TABLET_USB_GTCO=m -CONFIG_TABLET_USB_HANWANG=m -CONFIG_TABLET_USB_KBTAB=m -CONFIG_TABLET_USB_PEGASUS=m -CONFIG_TABLET_SERIAL_WACOM4=m -CONFIG_INPUT_TOUCHSCREEN=y -CONFIG_TOUCHSCREEN_PROPERTIES=y -CONFIG_TOUCHSCREEN_88PM860X=m -CONFIG_TOUCHSCREEN_ADS7846=m -CONFIG_TOUCHSCREEN_AD7877=m -CONFIG_TOUCHSCREEN_AD7879=m -CONFIG_TOUCHSCREEN_AD7879_I2C=m -CONFIG_TOUCHSCREEN_AD7879_SPI=m -CONFIG_TOUCHSCREEN_ADC=m -CONFIG_TOUCHSCREEN_AR1021_I2C=m 
-CONFIG_TOUCHSCREEN_ATMEL_MXT=m -CONFIG_TOUCHSCREEN_ATMEL_MXT_T37=y -CONFIG_TOUCHSCREEN_AUO_PIXCIR=m -CONFIG_TOUCHSCREEN_BU21013=m -CONFIG_TOUCHSCREEN_BU21029=m -CONFIG_TOUCHSCREEN_CHIPONE_ICN8318=m -CONFIG_TOUCHSCREEN_CHIPONE_ICN8505=m -CONFIG_TOUCHSCREEN_CY8CTMA140=m -CONFIG_TOUCHSCREEN_CY8CTMG110=m -CONFIG_TOUCHSCREEN_CYTTSP_CORE=m -CONFIG_TOUCHSCREEN_CYTTSP_I2C=m -CONFIG_TOUCHSCREEN_CYTTSP_SPI=m -CONFIG_TOUCHSCREEN_CYTTSP4_CORE=m -CONFIG_TOUCHSCREEN_CYTTSP4_I2C=m -CONFIG_TOUCHSCREEN_CYTTSP4_SPI=m -CONFIG_TOUCHSCREEN_DA9034=m -CONFIG_TOUCHSCREEN_DA9052=m -CONFIG_TOUCHSCREEN_DYNAPRO=m -CONFIG_TOUCHSCREEN_HAMPSHIRE=m -CONFIG_TOUCHSCREEN_EETI=m -CONFIG_TOUCHSCREEN_EGALAX=m -CONFIG_TOUCHSCREEN_EGALAX_SERIAL=m -CONFIG_TOUCHSCREEN_EXC3000=m -CONFIG_TOUCHSCREEN_FUJITSU=m -CONFIG_TOUCHSCREEN_GOODIX=m -CONFIG_TOUCHSCREEN_HIDEEP=m -CONFIG_TOUCHSCREEN_ILI210X=m -CONFIG_TOUCHSCREEN_S6SY761=m -CONFIG_TOUCHSCREEN_GUNZE=m -CONFIG_TOUCHSCREEN_EKTF2127=m -CONFIG_TOUCHSCREEN_ELAN=m -CONFIG_TOUCHSCREEN_ELO=m -CONFIG_TOUCHSCREEN_WACOM_W8001=m -CONFIG_TOUCHSCREEN_WACOM_I2C=m -CONFIG_TOUCHSCREEN_MAX11801=m -CONFIG_TOUCHSCREEN_MCS5000=m -CONFIG_TOUCHSCREEN_MMS114=m -CONFIG_TOUCHSCREEN_MELFAS_MIP4=m -CONFIG_TOUCHSCREEN_MTOUCH=m -CONFIG_TOUCHSCREEN_IMX6UL_TSC=m -CONFIG_TOUCHSCREEN_INEXIO=m -CONFIG_TOUCHSCREEN_MK712=m -CONFIG_TOUCHSCREEN_PENMOUNT=m -CONFIG_TOUCHSCREEN_EDT_FT5X06=m -CONFIG_TOUCHSCREEN_TOUCHRIGHT=m -CONFIG_TOUCHSCREEN_TOUCHWIN=m -CONFIG_TOUCHSCREEN_TI_AM335X_TSC=m -CONFIG_TOUCHSCREEN_UCB1400=m -CONFIG_TOUCHSCREEN_PIXCIR=m -CONFIG_TOUCHSCREEN_WDT87XX_I2C=m -CONFIG_TOUCHSCREEN_WM831X=m -CONFIG_TOUCHSCREEN_WM97XX=m -CONFIG_TOUCHSCREEN_WM9705=y -CONFIG_TOUCHSCREEN_WM9712=y -CONFIG_TOUCHSCREEN_WM9713=y -CONFIG_TOUCHSCREEN_USB_COMPOSITE=m -CONFIG_TOUCHSCREEN_MC13783=m -CONFIG_TOUCHSCREEN_USB_EGALAX=y -CONFIG_TOUCHSCREEN_USB_PANJIT=y -CONFIG_TOUCHSCREEN_USB_3M=y -CONFIG_TOUCHSCREEN_USB_ITM=y -CONFIG_TOUCHSCREEN_USB_ETURBO=y -CONFIG_TOUCHSCREEN_USB_GUNZE=y 
-CONFIG_TOUCHSCREEN_USB_DMC_TSC10=y -CONFIG_TOUCHSCREEN_USB_IRTOUCH=y -CONFIG_TOUCHSCREEN_USB_IDEALTEK=y -CONFIG_TOUCHSCREEN_USB_GENERAL_TOUCH=y -CONFIG_TOUCHSCREEN_USB_GOTOP=y -CONFIG_TOUCHSCREEN_USB_JASTEC=y -CONFIG_TOUCHSCREEN_USB_ELO=y -CONFIG_TOUCHSCREEN_USB_E2I=y -CONFIG_TOUCHSCREEN_USB_ZYTRONIC=y -CONFIG_TOUCHSCREEN_USB_ETT_TC45USB=y -CONFIG_TOUCHSCREEN_USB_NEXIO=y -CONFIG_TOUCHSCREEN_USB_EASYTOUCH=y -CONFIG_TOUCHSCREEN_TOUCHIT213=m -CONFIG_TOUCHSCREEN_TSC_SERIO=m -CONFIG_TOUCHSCREEN_TSC200X_CORE=m -CONFIG_TOUCHSCREEN_TSC2004=m -CONFIG_TOUCHSCREEN_TSC2005=m -CONFIG_TOUCHSCREEN_TSC2007=m -CONFIG_TOUCHSCREEN_TSC2007_IIO=y -CONFIG_TOUCHSCREEN_PCAP=m -CONFIG_TOUCHSCREEN_RM_TS=m -CONFIG_TOUCHSCREEN_SILEAD=m -CONFIG_TOUCHSCREEN_SIS_I2C=m -CONFIG_TOUCHSCREEN_ST1232=m -CONFIG_TOUCHSCREEN_STMFTS=m -CONFIG_TOUCHSCREEN_STMPE=m -CONFIG_TOUCHSCREEN_SUR40=m -CONFIG_TOUCHSCREEN_SURFACE3_SPI=m -CONFIG_TOUCHSCREEN_SX8654=m -CONFIG_TOUCHSCREEN_TPS6507X=m -CONFIG_TOUCHSCREEN_ZET6223=m -CONFIG_TOUCHSCREEN_ZFORCE=m -CONFIG_TOUCHSCREEN_COLIBRI_VF50=m -CONFIG_TOUCHSCREEN_ROHM_BU21023=m -CONFIG_TOUCHSCREEN_IQS5XX=m -CONFIG_INPUT_MISC=y -CONFIG_INPUT_88PM860X_ONKEY=m -CONFIG_INPUT_88PM80X_ONKEY=m -CONFIG_INPUT_AD714X=m -CONFIG_INPUT_AD714X_I2C=m -CONFIG_INPUT_AD714X_SPI=m -CONFIG_INPUT_ARIZONA_HAPTICS=m -CONFIG_INPUT_ATMEL_CAPTOUCH=m -CONFIG_INPUT_BMA150=m -CONFIG_INPUT_E3X0_BUTTON=m -CONFIG_INPUT_PCSPKR=m -CONFIG_INPUT_MAX77650_ONKEY=m -CONFIG_INPUT_MAX77693_HAPTIC=m -CONFIG_INPUT_MAX8925_ONKEY=m -CONFIG_INPUT_MAX8997_HAPTIC=m -CONFIG_INPUT_MC13783_PWRBUTTON=m -CONFIG_INPUT_MMA8450=m -CONFIG_INPUT_APANEL=m -CONFIG_INPUT_GPIO_BEEPER=m -CONFIG_INPUT_GPIO_DECODER=m -CONFIG_INPUT_GPIO_VIBRA=m -CONFIG_INPUT_CPCAP_PWRBUTTON=m -CONFIG_INPUT_ATLAS_BTNS=m -CONFIG_INPUT_ATI_REMOTE2=m -CONFIG_INPUT_KEYSPAN_REMOTE=m -CONFIG_INPUT_KXTJ9=m -CONFIG_INPUT_POWERMATE=m -CONFIG_INPUT_YEALINK=m -CONFIG_INPUT_CM109=m -CONFIG_INPUT_REGULATOR_HAPTIC=m -CONFIG_INPUT_RETU_PWRBUTTON=m 
-CONFIG_INPUT_TPS65218_PWRBUTTON=m -CONFIG_INPUT_AXP20X_PEK=m -CONFIG_INPUT_TWL4030_PWRBUTTON=m -CONFIG_INPUT_TWL4030_VIBRA=m -CONFIG_INPUT_TWL6040_VIBRA=m -CONFIG_INPUT_UINPUT=m -CONFIG_INPUT_PALMAS_PWRBUTTON=m -CONFIG_INPUT_PCF50633_PMU=m -CONFIG_INPUT_PCF8574=m -CONFIG_INPUT_PWM_BEEPER=m -CONFIG_INPUT_PWM_VIBRA=m -CONFIG_INPUT_RK805_PWRKEY=m -CONFIG_INPUT_GPIO_ROTARY_ENCODER=m -CONFIG_INPUT_DA9052_ONKEY=m -CONFIG_INPUT_DA9055_ONKEY=m -CONFIG_INPUT_DA9063_ONKEY=m -CONFIG_INPUT_WM831X_ON=m -CONFIG_INPUT_PCAP=m -CONFIG_INPUT_ADXL34X=m -CONFIG_INPUT_ADXL34X_I2C=m -CONFIG_INPUT_ADXL34X_SPI=m -CONFIG_INPUT_IMS_PCU=m -CONFIG_INPUT_IQS269A=m -CONFIG_INPUT_CMA3000=m -CONFIG_INPUT_CMA3000_I2C=m -CONFIG_INPUT_XEN_KBDDEV_FRONTEND=m -CONFIG_INPUT_IDEAPAD_SLIDEBAR=m -CONFIG_INPUT_SOC_BUTTON_ARRAY=m -CONFIG_INPUT_DRV260X_HAPTICS=m -CONFIG_INPUT_DRV2665_HAPTICS=m -CONFIG_INPUT_DRV2667_HAPTICS=m -CONFIG_INPUT_RAVE_SP_PWRBUTTON=m -CONFIG_INPUT_STPMIC1_ONKEY=m -CONFIG_RMI4_CORE=m -CONFIG_RMI4_I2C=m -CONFIG_RMI4_SPI=m -CONFIG_RMI4_SMB=m -CONFIG_RMI4_F03=y -CONFIG_RMI4_F03_SERIO=m -CONFIG_RMI4_2D_SENSOR=y -CONFIG_RMI4_F11=y -CONFIG_RMI4_F12=y -CONFIG_RMI4_F30=y -CONFIG_RMI4_F34=y -# CONFIG_RMI4_F54 is not set -CONFIG_RMI4_F55=y - -# -# Hardware I/O ports -# -CONFIG_SERIO=m -CONFIG_ARCH_MIGHT_HAVE_PC_SERIO=y -CONFIG_SERIO_I8042=m -CONFIG_SERIO_SERPORT=m -CONFIG_SERIO_CT82C710=m -CONFIG_SERIO_PARKBD=m -CONFIG_SERIO_PCIPS2=m -CONFIG_SERIO_LIBPS2=m -CONFIG_SERIO_RAW=m -CONFIG_SERIO_ALTERA_PS2=m -CONFIG_SERIO_PS2MULT=m -CONFIG_SERIO_ARC_PS2=m -# CONFIG_SERIO_APBPS2 is not set -CONFIG_HYPERV_KEYBOARD=m -CONFIG_SERIO_GPIO_PS2=m -CONFIG_USERIO=m -CONFIG_GAMEPORT=m -CONFIG_GAMEPORT_NS558=m -CONFIG_GAMEPORT_L4=m -CONFIG_GAMEPORT_EMU10K1=m -CONFIG_GAMEPORT_FM801=m -# end of Hardware I/O ports -# end of Input device support - -# -# Character devices -# -CONFIG_TTY=y -CONFIG_VT=y -CONFIG_CONSOLE_TRANSLATIONS=y -CONFIG_VT_CONSOLE=y -CONFIG_VT_CONSOLE_SLEEP=y -CONFIG_HW_CONSOLE=y 
-CONFIG_VT_HW_CONSOLE_BINDING=y -CONFIG_UNIX98_PTYS=y -# CONFIG_LEGACY_PTYS is not set -CONFIG_LDISC_AUTOLOAD=y - -# -# Serial drivers -# -CONFIG_SERIAL_EARLYCON=y -CONFIG_SERIAL_8250=y -# CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set -CONFIG_SERIAL_8250_PNP=y -# CONFIG_SERIAL_8250_16550A_VARIANTS is not set -CONFIG_SERIAL_8250_FINTEK=y -CONFIG_SERIAL_8250_CONSOLE=y -CONFIG_SERIAL_8250_DMA=y -CONFIG_SERIAL_8250_PCI=y -CONFIG_SERIAL_8250_EXAR=m -CONFIG_SERIAL_8250_CS=m -CONFIG_SERIAL_8250_MEN_MCB=m -CONFIG_SERIAL_8250_NR_UARTS=32 -CONFIG_SERIAL_8250_RUNTIME_UARTS=4 -CONFIG_SERIAL_8250_EXTENDED=y -CONFIG_SERIAL_8250_MANY_PORTS=y -CONFIG_SERIAL_8250_ASPEED_VUART=m -CONFIG_SERIAL_8250_SHARE_IRQ=y -# CONFIG_SERIAL_8250_DETECT_IRQ is not set -CONFIG_SERIAL_8250_RSA=y -CONFIG_SERIAL_8250_DWLIB=y -CONFIG_SERIAL_8250_DW=m -CONFIG_SERIAL_8250_RT288X=y -CONFIG_SERIAL_8250_LPSS=y -CONFIG_SERIAL_8250_MID=y -CONFIG_SERIAL_OF_PLATFORM=m - -# -# Non-8250 serial port support -# -CONFIG_SERIAL_MAX3100=m -CONFIG_SERIAL_MAX310X=m -CONFIG_SERIAL_UARTLITE=m -CONFIG_SERIAL_UARTLITE_NR_UARTS=1 -CONFIG_SERIAL_CORE=y -CONFIG_SERIAL_CORE_CONSOLE=y -CONFIG_SERIAL_JSM=m -CONFIG_SERIAL_SIFIVE=m -CONFIG_SERIAL_LANTIQ=m -CONFIG_SERIAL_SCCNXP=m -CONFIG_SERIAL_SC16IS7XX_CORE=m -CONFIG_SERIAL_SC16IS7XX=m -CONFIG_SERIAL_SC16IS7XX_I2C=y -CONFIG_SERIAL_SC16IS7XX_SPI=y -CONFIG_SERIAL_ALTERA_JTAGUART=m -CONFIG_SERIAL_ALTERA_UART=m -CONFIG_SERIAL_ALTERA_UART_MAXPORTS=4 -CONFIG_SERIAL_ALTERA_UART_BAUDRATE=115200 -CONFIG_SERIAL_IFX6X60=m -CONFIG_SERIAL_XILINX_PS_UART=m -CONFIG_SERIAL_ARC=m -CONFIG_SERIAL_ARC_NR_PORTS=1 -CONFIG_SERIAL_RP2=m -CONFIG_SERIAL_RP2_NR_UARTS=32 -CONFIG_SERIAL_FSL_LPUART=m -CONFIG_SERIAL_FSL_LINFLEXUART=m -CONFIG_SERIAL_CONEXANT_DIGICOLOR=m -CONFIG_SERIAL_MEN_Z135=m -CONFIG_SERIAL_SPRD=m -# end of Serial drivers - -CONFIG_SERIAL_MCTRL_GPIO=y -CONFIG_SERIAL_NONSTANDARD=y -CONFIG_ROCKETPORT=m -CONFIG_CYCLADES=m -CONFIG_CYZ_INTR=y -CONFIG_MOXA_INTELLIO=m -CONFIG_MOXA_SMARTIO=m 
-CONFIG_SYNCLINK=m -CONFIG_SYNCLINKMP=m -CONFIG_SYNCLINK_GT=m -CONFIG_ISI=m -CONFIG_N_HDLC=m -CONFIG_N_GSM=m -CONFIG_NOZOMI=m -CONFIG_NULL_TTY=m -CONFIG_TRACE_ROUTER=m -CONFIG_TRACE_SINK=m -CONFIG_HVC_DRIVER=y -CONFIG_HVC_IRQ=y -CONFIG_HVC_XEN=y -CONFIG_HVC_XEN_FRONTEND=y -CONFIG_SERIAL_DEV_BUS=y -CONFIG_SERIAL_DEV_CTRL_TTYPORT=y -# CONFIG_TTY_PRINTK is not set -CONFIG_PRINTER=m -# CONFIG_LP_CONSOLE is not set -CONFIG_PPDEV=m -CONFIG_VIRTIO_CONSOLE=m -CONFIG_IPMI_HANDLER=m -CONFIG_IPMI_DMI_DECODE=y -CONFIG_IPMI_PLAT_DATA=y -# CONFIG_IPMI_PANIC_EVENT is not set -CONFIG_IPMI_DEVICE_INTERFACE=m -CONFIG_IPMI_SI=m -CONFIG_IPMI_SSIF=m -CONFIG_IPMI_WATCHDOG=m -CONFIG_IPMI_POWEROFF=m -CONFIG_IPMB_DEVICE_INTERFACE=m -CONFIG_HW_RANDOM=m -CONFIG_HW_RANDOM_TIMERIOMEM=m -CONFIG_HW_RANDOM_INTEL=m -CONFIG_HW_RANDOM_AMD=m -CONFIG_HW_RANDOM_VIA=m -CONFIG_HW_RANDOM_VIRTIO=m -CONFIG_HW_RANDOM_CCTRNG=m -CONFIG_APPLICOM=m - -# -# PCMCIA character devices -# -CONFIG_SYNCLINK_CS=m -CONFIG_CARDMAN_4000=m -CONFIG_CARDMAN_4040=m -CONFIG_SCR24X=m -CONFIG_IPWIRELESS=m -# end of PCMCIA character devices - -CONFIG_MWAVE=m -CONFIG_DEVMEM=y -# CONFIG_DEVKMEM is not set -CONFIG_NVRAM=m -CONFIG_RAW_DRIVER=m -CONFIG_MAX_RAW_DEVS=256 -CONFIG_DEVPORT=y -CONFIG_HPET=y -CONFIG_HPET_MMAP=y -CONFIG_HPET_MMAP_DEFAULT=y -CONFIG_HANGCHECK_TIMER=m -CONFIG_TCG_TPM=m -CONFIG_HW_RANDOM_TPM=y -CONFIG_TCG_TIS_CORE=m -CONFIG_TCG_TIS=m -CONFIG_TCG_TIS_SPI=m -CONFIG_TCG_TIS_SPI_CR50=y -CONFIG_TCG_TIS_I2C_ATMEL=m -CONFIG_TCG_TIS_I2C_INFINEON=m -CONFIG_TCG_TIS_I2C_NUVOTON=m -CONFIG_TCG_NSC=m -CONFIG_TCG_ATMEL=m -CONFIG_TCG_INFINEON=m -CONFIG_TCG_XEN=m -CONFIG_TCG_CRB=m -CONFIG_TCG_VTPM_PROXY=m -CONFIG_TCG_TIS_ST33ZP24=m -CONFIG_TCG_TIS_ST33ZP24_I2C=m -CONFIG_TCG_TIS_ST33ZP24_SPI=m -CONFIG_TELCLOCK=m -CONFIG_XILLYBUS=m -CONFIG_XILLYBUS_PCIE=m -CONFIG_XILLYBUS_OF=m -# end of Character devices - -# CONFIG_RANDOM_TRUST_CPU is not set -# CONFIG_RANDOM_TRUST_BOOTLOADER is not set - -# -# I2C support -# -CONFIG_I2C=y 
-CONFIG_ACPI_I2C_OPREGION=y -CONFIG_I2C_BOARDINFO=y -CONFIG_I2C_COMPAT=y -CONFIG_I2C_CHARDEV=m -CONFIG_I2C_MUX=m - -# -# Multiplexer I2C Chip support -# -CONFIG_I2C_ARB_GPIO_CHALLENGE=m -CONFIG_I2C_MUX_GPIO=m -CONFIG_I2C_MUX_GPMUX=m -CONFIG_I2C_MUX_LTC4306=m -CONFIG_I2C_MUX_PCA9541=m -CONFIG_I2C_MUX_PCA954x=m -CONFIG_I2C_MUX_PINCTRL=m -CONFIG_I2C_MUX_REG=m -CONFIG_I2C_DEMUX_PINCTRL=m -CONFIG_I2C_MUX_MLXCPLD=m -# end of Multiplexer I2C Chip support - -CONFIG_I2C_HELPER_AUTO=y -CONFIG_I2C_SMBUS=m -CONFIG_I2C_ALGOBIT=m -CONFIG_I2C_ALGOPCA=m - -# -# I2C Hardware Bus support -# - -# -# PC SMBus host controller drivers -# -CONFIG_I2C_ALI1535=m -CONFIG_I2C_ALI1563=m -CONFIG_I2C_ALI15X3=m -CONFIG_I2C_AMD756=m -CONFIG_I2C_AMD756_S4882=m -CONFIG_I2C_AMD8111=m -CONFIG_I2C_AMD_MP2=m -CONFIG_I2C_I801=m -CONFIG_I2C_ISCH=m -CONFIG_I2C_ISMT=m -CONFIG_I2C_PIIX4=m -CONFIG_I2C_CHT_WC=m -CONFIG_I2C_NFORCE2=m -CONFIG_I2C_NFORCE2_S4985=m -CONFIG_I2C_NVIDIA_GPU=m -CONFIG_I2C_SIS5595=m -CONFIG_I2C_SIS630=m -CONFIG_I2C_SIS96X=m -CONFIG_I2C_VIA=m -CONFIG_I2C_VIAPRO=m - -# -# ACPI drivers -# -CONFIG_I2C_SCMI=m - -# -# I2C system bus drivers (mostly embedded / system-on-chip) -# -CONFIG_I2C_CBUS_GPIO=m -CONFIG_I2C_DESIGNWARE_CORE=y -CONFIG_I2C_DESIGNWARE_SLAVE=y -CONFIG_I2C_DESIGNWARE_PLATFORM=y -CONFIG_I2C_DESIGNWARE_BAYTRAIL=y -CONFIG_I2C_DESIGNWARE_PCI=m -CONFIG_I2C_EMEV2=m -CONFIG_I2C_GPIO=m -# CONFIG_I2C_GPIO_FAULT_INJECTOR is not set -CONFIG_I2C_KEMPLD=m -CONFIG_I2C_OCORES=m -CONFIG_I2C_PCA_PLATFORM=m -CONFIG_I2C_RK3X=m -CONFIG_I2C_SIMTEC=m -CONFIG_I2C_XILINX=m - -# -# External I2C/SMBus adapter drivers -# -CONFIG_I2C_DIOLAN_U2C=m -CONFIG_I2C_DLN2=m -CONFIG_I2C_PARPORT=m -CONFIG_I2C_ROBOTFUZZ_OSIF=m -CONFIG_I2C_TAOS_EVM=m -CONFIG_I2C_TINY_USB=m -CONFIG_I2C_VIPERBOARD=m - -# -# Other I2C/SMBus bus drivers -# -CONFIG_I2C_MLXCPLD=m -CONFIG_I2C_CROS_EC_TUNNEL=m -CONFIG_I2C_FSI=m -# end of I2C Hardware Bus support - -CONFIG_I2C_STUB=m -CONFIG_I2C_SLAVE=y -CONFIG_I2C_SLAVE_EEPROM=m -# 
CONFIG_I2C_DEBUG_CORE is not set -# CONFIG_I2C_DEBUG_ALGO is not set -# CONFIG_I2C_DEBUG_BUS is not set -# end of I2C support - -CONFIG_I3C=m -CONFIG_CDNS_I3C_MASTER=m -CONFIG_DW_I3C_MASTER=m -CONFIG_SPI=y -# CONFIG_SPI_DEBUG is not set -CONFIG_SPI_MASTER=y -CONFIG_SPI_MEM=y - -# -# SPI Master Controller Drivers -# -CONFIG_SPI_ALTERA=m -CONFIG_SPI_AXI_SPI_ENGINE=m -CONFIG_SPI_BITBANG=m -CONFIG_SPI_BUTTERFLY=m -CONFIG_SPI_CADENCE=m -CONFIG_SPI_DESIGNWARE=m -CONFIG_SPI_DW_DMA=y -CONFIG_SPI_DW_PCI=m -CONFIG_SPI_DW_MMIO=m -CONFIG_SPI_DLN2=m -CONFIG_SPI_FSI=m -CONFIG_SPI_NXP_FLEXSPI=m -CONFIG_SPI_GPIO=m -CONFIG_SPI_LM70_LLP=m -CONFIG_SPI_FSL_LIB=m -CONFIG_SPI_FSL_SPI=m -CONFIG_SPI_OC_TINY=m -CONFIG_SPI_PXA2XX=m -CONFIG_SPI_PXA2XX_PCI=m -CONFIG_SPI_ROCKCHIP=m -CONFIG_SPI_SC18IS602=m -CONFIG_SPI_SIFIVE=m -CONFIG_SPI_MXIC=m -CONFIG_SPI_XCOMM=m -CONFIG_SPI_XILINX=m -CONFIG_SPI_ZYNQMP_GQSPI=m -CONFIG_SPI_AMD=m - -# -# SPI Multiplexer support -# -CONFIG_SPI_MUX=m - -# -# SPI Protocol Masters -# -CONFIG_SPI_SPIDEV=m -CONFIG_SPI_LOOPBACK_TEST=m -CONFIG_SPI_TLE62X0=m -CONFIG_SPI_SLAVE=y -CONFIG_SPI_SLAVE_TIME=m -CONFIG_SPI_SLAVE_SYSTEM_CONTROL=m -CONFIG_SPI_DYNAMIC=y -CONFIG_SPMI=m -CONFIG_HSI=m -CONFIG_HSI_BOARDINFO=y - -# -# HSI controllers -# - -# -# HSI clients -# -CONFIG_HSI_CHAR=m -CONFIG_PPS=y -# CONFIG_PPS_DEBUG is not set - -# -# PPS clients support -# -CONFIG_PPS_CLIENT_KTIMER=m -CONFIG_PPS_CLIENT_LDISC=m -CONFIG_PPS_CLIENT_PARPORT=m -CONFIG_PPS_CLIENT_GPIO=m - -# -# PPS generators support -# - -# -# PTP clock support -# -CONFIG_PTP_1588_CLOCK=y -CONFIG_DP83640_PHY=m -CONFIG_PTP_1588_CLOCK_INES=m -CONFIG_PTP_1588_CLOCK_KVM=m -CONFIG_PTP_1588_CLOCK_IDT82P33=m -CONFIG_PTP_1588_CLOCK_IDTCM=m -CONFIG_PTP_1588_CLOCK_VMW=m -# end of PTP clock support - -CONFIG_PINCTRL=y -CONFIG_GENERIC_PINCTRL_GROUPS=y -CONFIG_PINMUX=y -CONFIG_GENERIC_PINMUX_FUNCTIONS=y -CONFIG_PINCONF=y -CONFIG_GENERIC_PINCONF=y -# CONFIG_DEBUG_PINCTRL is not set -CONFIG_PINCTRL_AS3722=m 
-CONFIG_PINCTRL_AXP209=m -CONFIG_PINCTRL_AMD=m -CONFIG_PINCTRL_DA9062=m -CONFIG_PINCTRL_MCP23S08_I2C=m -CONFIG_PINCTRL_MCP23S08_SPI=m -CONFIG_PINCTRL_MCP23S08=m -CONFIG_PINCTRL_SINGLE=m -CONFIG_PINCTRL_SX150X=y -CONFIG_PINCTRL_STMFX=m -CONFIG_PINCTRL_MAX77620=m -CONFIG_PINCTRL_PALMAS=m -CONFIG_PINCTRL_RK805=m -CONFIG_PINCTRL_OCELOT=y -CONFIG_PINCTRL_BAYTRAIL=y -CONFIG_PINCTRL_CHERRYVIEW=y -CONFIG_PINCTRL_LYNXPOINT=y -CONFIG_PINCTRL_INTEL=y -CONFIG_PINCTRL_BROXTON=y -CONFIG_PINCTRL_CANNONLAKE=y -CONFIG_PINCTRL_CEDARFORK=y -CONFIG_PINCTRL_DENVERTON=y -CONFIG_PINCTRL_GEMINILAKE=y -CONFIG_PINCTRL_ICELAKE=y -CONFIG_PINCTRL_JASPERLAKE=y -CONFIG_PINCTRL_LEWISBURG=y -CONFIG_PINCTRL_SUNRISEPOINT=y -CONFIG_PINCTRL_TIGERLAKE=y -CONFIG_PINCTRL_LOCHNAGAR=m -CONFIG_PINCTRL_MADERA=m -CONFIG_PINCTRL_CS47L15=y -CONFIG_PINCTRL_CS47L35=y -CONFIG_PINCTRL_CS47L85=y -CONFIG_PINCTRL_CS47L90=y -CONFIG_PINCTRL_CS47L92=y -CONFIG_PINCTRL_EQUILIBRIUM=m -CONFIG_GPIOLIB=y -CONFIG_GPIOLIB_FASTPATH_LIMIT=512 -CONFIG_OF_GPIO=y -CONFIG_GPIO_ACPI=y -CONFIG_GPIOLIB_IRQCHIP=y -# CONFIG_DEBUG_GPIO is not set -CONFIG_GPIO_SYSFS=y -CONFIG_GPIO_GENERIC=y -CONFIG_GPIO_MAX730X=m - -# -# Memory mapped GPIO drivers -# -CONFIG_GPIO_74XX_MMIO=m -CONFIG_GPIO_ALTERA=m -CONFIG_GPIO_AMDPT=m -CONFIG_GPIO_CADENCE=m -CONFIG_GPIO_DWAPB=m -CONFIG_GPIO_EXAR=m -CONFIG_GPIO_FTGPIO010=y -CONFIG_GPIO_GENERIC_PLATFORM=m -CONFIG_GPIO_GRGPIO=m -CONFIG_GPIO_HLWD=m -CONFIG_GPIO_ICH=m -CONFIG_GPIO_LOGICVC=m -CONFIG_GPIO_MB86S7X=m -CONFIG_GPIO_MENZ127=m -CONFIG_GPIO_SAMA5D2_PIOBU=m -CONFIG_GPIO_SIFIVE=y -CONFIG_GPIO_SIOX=m -CONFIG_GPIO_SYSCON=m -CONFIG_GPIO_VX855=m -CONFIG_GPIO_WCD934X=m -CONFIG_GPIO_XILINX=m -CONFIG_GPIO_AMD_FCH=m -# end of Memory mapped GPIO drivers - -# -# Port-mapped I/O GPIO drivers -# -CONFIG_GPIO_F7188X=m -CONFIG_GPIO_IT87=m -CONFIG_GPIO_SCH=m -CONFIG_GPIO_SCH311X=m -CONFIG_GPIO_WINBOND=m -CONFIG_GPIO_WS16C48=m -# end of Port-mapped I/O GPIO drivers - -# -# I2C GPIO expanders -# -CONFIG_GPIO_ADP5588=m 
-CONFIG_GPIO_ADNP=m -CONFIG_GPIO_GW_PLD=m -CONFIG_GPIO_MAX7300=m -CONFIG_GPIO_MAX732X=m -CONFIG_GPIO_PCA953X=m -CONFIG_GPIO_PCA953X_IRQ=y -CONFIG_GPIO_PCF857X=m -CONFIG_GPIO_TPIC2810=m -# end of I2C GPIO expanders - -# -# MFD GPIO expanders -# -CONFIG_GPIO_ADP5520=m -CONFIG_GPIO_ARIZONA=m -CONFIG_GPIO_BD70528=m -CONFIG_GPIO_BD71828=m -CONFIG_GPIO_BD9571MWV=m -CONFIG_GPIO_CRYSTAL_COVE=m -CONFIG_GPIO_DA9052=m -CONFIG_GPIO_DA9055=m -CONFIG_GPIO_DLN2=m -CONFIG_GPIO_JANZ_TTL=m -CONFIG_GPIO_KEMPLD=m -CONFIG_GPIO_LP3943=m -CONFIG_GPIO_LP873X=m -CONFIG_GPIO_LP87565=m -CONFIG_GPIO_MADERA=m -CONFIG_GPIO_MAX77620=m -CONFIG_GPIO_MAX77650=m -CONFIG_GPIO_MSIC=y -CONFIG_GPIO_PALMAS=y -CONFIG_GPIO_RC5T583=y -CONFIG_GPIO_STMPE=y -CONFIG_GPIO_TC3589X=y -CONFIG_GPIO_TPS65086=m -CONFIG_GPIO_TPS65218=m -CONFIG_GPIO_TPS6586X=y -CONFIG_GPIO_TPS65910=y -CONFIG_GPIO_TPS65912=m -CONFIG_GPIO_TPS68470=y -CONFIG_GPIO_TQMX86=m -CONFIG_GPIO_TWL4030=m -CONFIG_GPIO_TWL6040=m -CONFIG_GPIO_UCB1400=m -CONFIG_GPIO_WHISKEY_COVE=m -CONFIG_GPIO_WM831X=m -CONFIG_GPIO_WM8350=m -CONFIG_GPIO_WM8994=m -# end of MFD GPIO expanders - -# -# PCI GPIO expanders -# -CONFIG_GPIO_AMD8111=m -CONFIG_GPIO_ML_IOH=m -CONFIG_GPIO_PCI_IDIO_16=m -CONFIG_GPIO_PCIE_IDIO_24=m -CONFIG_GPIO_RDC321X=m -CONFIG_GPIO_SODAVILLE=y -# end of PCI GPIO expanders - -# -# SPI GPIO expanders -# -CONFIG_GPIO_74X164=m -CONFIG_GPIO_MAX3191X=m -CONFIG_GPIO_MAX7301=m -CONFIG_GPIO_MC33880=m -CONFIG_GPIO_PISOSR=m -CONFIG_GPIO_XRA1403=m -CONFIG_GPIO_MOXTET=m -# end of SPI GPIO expanders - -# -# USB GPIO expanders -# -CONFIG_GPIO_VIPERBOARD=m -# end of USB GPIO expanders - -CONFIG_GPIO_AGGREGATOR=m -CONFIG_GPIO_MOCKUP=m -CONFIG_W1=m -CONFIG_W1_CON=y - -# -# 1-wire Bus Masters -# -CONFIG_W1_MASTER_MATROX=m -CONFIG_W1_MASTER_DS2490=m -CONFIG_W1_MASTER_DS2482=m -CONFIG_W1_MASTER_DS1WM=m -CONFIG_W1_MASTER_GPIO=m -CONFIG_W1_MASTER_SGI=m -# end of 1-wire Bus Masters - -# -# 1-wire Slaves -# -CONFIG_W1_SLAVE_THERM=m -CONFIG_W1_SLAVE_SMEM=m 
-CONFIG_W1_SLAVE_DS2405=m -CONFIG_W1_SLAVE_DS2408=m -# CONFIG_W1_SLAVE_DS2408_READBACK is not set -CONFIG_W1_SLAVE_DS2413=m -CONFIG_W1_SLAVE_DS2406=m -CONFIG_W1_SLAVE_DS2423=m -CONFIG_W1_SLAVE_DS2805=m -CONFIG_W1_SLAVE_DS2430=m -CONFIG_W1_SLAVE_DS2431=m -CONFIG_W1_SLAVE_DS2433=m -# CONFIG_W1_SLAVE_DS2433_CRC is not set -CONFIG_W1_SLAVE_DS2438=m -CONFIG_W1_SLAVE_DS250X=m -CONFIG_W1_SLAVE_DS2780=m -CONFIG_W1_SLAVE_DS2781=m -CONFIG_W1_SLAVE_DS28E04=m -CONFIG_W1_SLAVE_DS28E17=m -# end of 1-wire Slaves - -CONFIG_POWER_AVS=y -CONFIG_QCOM_CPR=m -CONFIG_POWER_RESET=y -CONFIG_POWER_RESET_AS3722=y -CONFIG_POWER_RESET_GPIO=y -CONFIG_POWER_RESET_GPIO_RESTART=y -CONFIG_POWER_RESET_LTC2952=y -CONFIG_POWER_RESET_MT6323=y -CONFIG_POWER_RESET_RESTART=y -CONFIG_POWER_RESET_SYSCON=y -CONFIG_POWER_RESET_SYSCON_POWEROFF=y -CONFIG_REBOOT_MODE=m -CONFIG_SYSCON_REBOOT_MODE=m -CONFIG_NVMEM_REBOOT_MODE=m -CONFIG_POWER_SUPPLY=y -# CONFIG_POWER_SUPPLY_DEBUG is not set -CONFIG_POWER_SUPPLY_HWMON=y -CONFIG_PDA_POWER=m -CONFIG_GENERIC_ADC_BATTERY=m -CONFIG_MAX8925_POWER=m -CONFIG_WM831X_BACKUP=m -CONFIG_WM831X_POWER=m -CONFIG_WM8350_POWER=m -CONFIG_TEST_POWER=m -CONFIG_BATTERY_88PM860X=m -CONFIG_CHARGER_ADP5061=m -CONFIG_BATTERY_ACT8945A=m -CONFIG_BATTERY_CPCAP=m -CONFIG_BATTERY_CW2015=m -CONFIG_BATTERY_DS2760=m -CONFIG_BATTERY_DS2780=m -CONFIG_BATTERY_DS2781=m -CONFIG_BATTERY_DS2782=m -CONFIG_BATTERY_LEGO_EV3=m -CONFIG_BATTERY_SBS=m -CONFIG_CHARGER_SBS=m -CONFIG_MANAGER_SBS=m -CONFIG_BATTERY_BQ27XXX=m -CONFIG_BATTERY_BQ27XXX_I2C=m -CONFIG_BATTERY_BQ27XXX_HDQ=m -# CONFIG_BATTERY_BQ27XXX_DT_UPDATES_NVM is not set -CONFIG_BATTERY_DA9030=m -CONFIG_BATTERY_DA9052=m -CONFIG_CHARGER_DA9150=m -CONFIG_BATTERY_DA9150=m -CONFIG_CHARGER_AXP20X=m -CONFIG_BATTERY_AXP20X=m -CONFIG_AXP20X_POWER=m -CONFIG_AXP288_CHARGER=m -CONFIG_AXP288_FUEL_GAUGE=m -CONFIG_BATTERY_MAX17040=m -CONFIG_BATTERY_MAX17042=m -CONFIG_BATTERY_MAX1721X=m -CONFIG_BATTERY_TWL4030_MADC=m -CONFIG_CHARGER_88PM860X=m 
-CONFIG_CHARGER_PCF50633=m -CONFIG_BATTERY_RX51=m -CONFIG_CHARGER_ISP1704=m -CONFIG_CHARGER_MAX8903=m -CONFIG_CHARGER_TWL4030=m -CONFIG_CHARGER_LP8727=m -CONFIG_CHARGER_LP8788=m -CONFIG_CHARGER_GPIO=m -CONFIG_CHARGER_MANAGER=y -CONFIG_CHARGER_LT3651=m -CONFIG_CHARGER_MAX14577=m -CONFIG_CHARGER_DETECTOR_MAX14656=m -CONFIG_CHARGER_MAX77650=m -CONFIG_CHARGER_MAX77693=m -CONFIG_CHARGER_MAX8997=m -CONFIG_CHARGER_MAX8998=m -CONFIG_CHARGER_MP2629=m -CONFIG_CHARGER_BQ2415X=m -CONFIG_CHARGER_BQ24190=m -CONFIG_CHARGER_BQ24257=m -CONFIG_CHARGER_BQ24735=m -CONFIG_CHARGER_BQ25890=m -CONFIG_CHARGER_SMB347=m -CONFIG_CHARGER_TPS65090=m -CONFIG_CHARGER_TPS65217=m -CONFIG_BATTERY_GAUGE_LTC2941=m -CONFIG_BATTERY_RT5033=m -CONFIG_CHARGER_RT9455=m -CONFIG_CHARGER_CROS_USBPD=m -CONFIG_CHARGER_UCS1002=m -CONFIG_CHARGER_BD70528=m -CONFIG_CHARGER_BD99954=m -CONFIG_CHARGER_WILCO=m -CONFIG_HWMON=y -CONFIG_HWMON_VID=m -# CONFIG_HWMON_DEBUG_CHIP is not set - -# -# Native drivers -# -CONFIG_SENSORS_ABITUGURU=m -CONFIG_SENSORS_ABITUGURU3=m -CONFIG_SENSORS_AD7314=m -CONFIG_SENSORS_AD7414=m -CONFIG_SENSORS_AD7418=m -CONFIG_SENSORS_ADM1021=m -CONFIG_SENSORS_ADM1025=m -CONFIG_SENSORS_ADM1026=m -CONFIG_SENSORS_ADM1029=m -CONFIG_SENSORS_ADM1031=m -CONFIG_SENSORS_ADM1177=m -CONFIG_SENSORS_ADM9240=m -CONFIG_SENSORS_ADT7X10=m -CONFIG_SENSORS_ADT7310=m -CONFIG_SENSORS_ADT7410=m -CONFIG_SENSORS_ADT7411=m -CONFIG_SENSORS_ADT7462=m -CONFIG_SENSORS_ADT7470=m -CONFIG_SENSORS_ADT7475=m -CONFIG_SENSORS_AS370=m -CONFIG_SENSORS_ASC7621=m -CONFIG_SENSORS_AXI_FAN_CONTROL=m -CONFIG_SENSORS_K8TEMP=m -CONFIG_SENSORS_K10TEMP=m -CONFIG_SENSORS_FAM15H_POWER=m -CONFIG_SENSORS_AMD_ENERGY=m -CONFIG_SENSORS_APPLESMC=m -CONFIG_SENSORS_ASB100=m -CONFIG_SENSORS_ASPEED=m -CONFIG_SENSORS_ATXP1=m -CONFIG_SENSORS_DRIVETEMP=m -CONFIG_SENSORS_DS620=m -CONFIG_SENSORS_DS1621=m -CONFIG_SENSORS_DELL_SMM=m -CONFIG_SENSORS_DA9052_ADC=m -CONFIG_SENSORS_DA9055=m -CONFIG_SENSORS_I5K_AMB=m -CONFIG_SENSORS_F71805F=m -CONFIG_SENSORS_F71882FG=m 
-CONFIG_SENSORS_F75375S=m -CONFIG_SENSORS_GSC=m -CONFIG_SENSORS_MC13783_ADC=m -CONFIG_SENSORS_FSCHMD=m -CONFIG_SENSORS_FTSTEUTATES=m -CONFIG_SENSORS_GL518SM=m -CONFIG_SENSORS_GL520SM=m -CONFIG_SENSORS_G760A=m -CONFIG_SENSORS_G762=m -CONFIG_SENSORS_GPIO_FAN=m -CONFIG_SENSORS_HIH6130=m -CONFIG_SENSORS_IBMAEM=m -CONFIG_SENSORS_IBMPEX=m -CONFIG_SENSORS_IIO_HWMON=m -CONFIG_SENSORS_I5500=m -CONFIG_SENSORS_CORETEMP=m -CONFIG_SENSORS_IT87=m -CONFIG_SENSORS_JC42=m -CONFIG_SENSORS_POWR1220=m -CONFIG_SENSORS_LINEAGE=m -CONFIG_SENSORS_LOCHNAGAR=m -CONFIG_SENSORS_LTC2945=m -CONFIG_SENSORS_LTC2947=m -CONFIG_SENSORS_LTC2947_I2C=m -CONFIG_SENSORS_LTC2947_SPI=m -CONFIG_SENSORS_LTC2990=m -CONFIG_SENSORS_LTC4151=m -CONFIG_SENSORS_LTC4215=m -CONFIG_SENSORS_LTC4222=m -CONFIG_SENSORS_LTC4245=m -CONFIG_SENSORS_LTC4260=m -CONFIG_SENSORS_LTC4261=m -CONFIG_SENSORS_MAX1111=m -CONFIG_SENSORS_MAX16065=m -CONFIG_SENSORS_MAX1619=m -CONFIG_SENSORS_MAX1668=m -CONFIG_SENSORS_MAX197=m -CONFIG_SENSORS_MAX31722=m -CONFIG_SENSORS_MAX31730=m -CONFIG_SENSORS_MAX6621=m -CONFIG_SENSORS_MAX6639=m -CONFIG_SENSORS_MAX6642=m -CONFIG_SENSORS_MAX6650=m -CONFIG_SENSORS_MAX6697=m -CONFIG_SENSORS_MAX31790=m -CONFIG_SENSORS_MCP3021=m -CONFIG_SENSORS_MLXREG_FAN=m -CONFIG_SENSORS_TC654=m -CONFIG_SENSORS_MENF21BMC_HWMON=m -CONFIG_SENSORS_ADCXX=m -CONFIG_SENSORS_LM63=m -CONFIG_SENSORS_LM70=m -CONFIG_SENSORS_LM73=m -CONFIG_SENSORS_LM75=m -CONFIG_SENSORS_LM77=m -CONFIG_SENSORS_LM78=m -CONFIG_SENSORS_LM80=m -CONFIG_SENSORS_LM83=m -CONFIG_SENSORS_LM85=m -CONFIG_SENSORS_LM87=m -CONFIG_SENSORS_LM90=m -CONFIG_SENSORS_LM92=m -CONFIG_SENSORS_LM93=m -CONFIG_SENSORS_LM95234=m -CONFIG_SENSORS_LM95241=m -CONFIG_SENSORS_LM95245=m -CONFIG_SENSORS_PC87360=m -CONFIG_SENSORS_PC87427=m -CONFIG_SENSORS_NTC_THERMISTOR=m -CONFIG_SENSORS_NCT6683=m -CONFIG_SENSORS_NCT6775=m -CONFIG_SENSORS_NCT7802=m -CONFIG_SENSORS_NCT7904=m -CONFIG_SENSORS_NPCM7XX=m -CONFIG_SENSORS_PCF8591=m -CONFIG_PMBUS=m -CONFIG_SENSORS_PMBUS=m -CONFIG_SENSORS_ADM1275=m 
-CONFIG_SENSORS_BEL_PFE=m -CONFIG_SENSORS_IBM_CFFPS=m -CONFIG_SENSORS_INSPUR_IPSPS=m -CONFIG_SENSORS_IR35221=m -CONFIG_SENSORS_IR38064=m -CONFIG_SENSORS_IRPS5401=m -CONFIG_SENSORS_ISL68137=m -CONFIG_SENSORS_LM25066=m -CONFIG_SENSORS_LTC2978=m -# CONFIG_SENSORS_LTC2978_REGULATOR is not set -CONFIG_SENSORS_LTC3815=m -CONFIG_SENSORS_MAX16064=m -CONFIG_SENSORS_MAX16601=m -CONFIG_SENSORS_MAX20730=m -CONFIG_SENSORS_MAX20751=m -CONFIG_SENSORS_MAX31785=m -CONFIG_SENSORS_MAX34440=m -CONFIG_SENSORS_MAX8688=m -CONFIG_SENSORS_PXE1610=m -CONFIG_SENSORS_TPS40422=m -CONFIG_SENSORS_TPS53679=m -CONFIG_SENSORS_UCD9000=m -CONFIG_SENSORS_UCD9200=m -CONFIG_SENSORS_XDPE122=m -CONFIG_SENSORS_ZL6100=m -CONFIG_SENSORS_PWM_FAN=m -CONFIG_SENSORS_SHT15=m -CONFIG_SENSORS_SHT21=m -CONFIG_SENSORS_SHT3x=m -CONFIG_SENSORS_SHTC1=m -CONFIG_SENSORS_SIS5595=m -CONFIG_SENSORS_DME1737=m -CONFIG_SENSORS_EMC1403=m -CONFIG_SENSORS_EMC2103=m -CONFIG_SENSORS_EMC6W201=m -CONFIG_SENSORS_SMSC47M1=m -CONFIG_SENSORS_SMSC47M192=m -CONFIG_SENSORS_SMSC47B397=m -CONFIG_SENSORS_SCH56XX_COMMON=m -CONFIG_SENSORS_SCH5627=m -CONFIG_SENSORS_SCH5636=m -CONFIG_SENSORS_STTS751=m -CONFIG_SENSORS_SMM665=m -CONFIG_SENSORS_ADC128D818=m -CONFIG_SENSORS_ADS7828=m -CONFIG_SENSORS_ADS7871=m -CONFIG_SENSORS_AMC6821=m -CONFIG_SENSORS_INA209=m -CONFIG_SENSORS_INA2XX=m -CONFIG_SENSORS_INA3221=m -CONFIG_SENSORS_TC74=m -CONFIG_SENSORS_THMC50=m -CONFIG_SENSORS_TMP102=m -CONFIG_SENSORS_TMP103=m -CONFIG_SENSORS_TMP108=m -CONFIG_SENSORS_TMP401=m -CONFIG_SENSORS_TMP421=m -CONFIG_SENSORS_TMP513=m -CONFIG_SENSORS_VIA_CPUTEMP=m -CONFIG_SENSORS_VIA686A=m -CONFIG_SENSORS_VT1211=m -CONFIG_SENSORS_VT8231=m -CONFIG_SENSORS_W83773G=m -CONFIG_SENSORS_W83781D=m -CONFIG_SENSORS_W83791D=m -CONFIG_SENSORS_W83792D=m -CONFIG_SENSORS_W83793=m -CONFIG_SENSORS_W83795=m -# CONFIG_SENSORS_W83795_FANCTRL is not set -CONFIG_SENSORS_W83L785TS=m -CONFIG_SENSORS_W83L786NG=m -CONFIG_SENSORS_W83627HF=m -CONFIG_SENSORS_W83627EHF=m -CONFIG_SENSORS_WM831X=m 
-CONFIG_SENSORS_WM8350=m -CONFIG_SENSORS_XGENE=m - -# -# ACPI drivers -# -CONFIG_SENSORS_ACPI_POWER=m -CONFIG_SENSORS_ATK0110=m -CONFIG_THERMAL=y -# CONFIG_THERMAL_STATISTICS is not set -CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS=100 -CONFIG_THERMAL_HWMON=y -CONFIG_THERMAL_OF=y -CONFIG_THERMAL_WRITABLE_TRIPS=y -CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE=y -# CONFIG_THERMAL_DEFAULT_GOV_FAIR_SHARE is not set -# CONFIG_THERMAL_DEFAULT_GOV_USER_SPACE is not set -# CONFIG_THERMAL_DEFAULT_GOV_POWER_ALLOCATOR is not set -CONFIG_THERMAL_GOV_FAIR_SHARE=y -CONFIG_THERMAL_GOV_STEP_WISE=y -CONFIG_THERMAL_GOV_BANG_BANG=y -CONFIG_THERMAL_GOV_USER_SPACE=y -CONFIG_THERMAL_GOV_POWER_ALLOCATOR=y -CONFIG_CPU_THERMAL=y -CONFIG_CPU_FREQ_THERMAL=y -CONFIG_CPU_IDLE_THERMAL=y -CONFIG_CLOCK_THERMAL=y -CONFIG_DEVFREQ_THERMAL=y -# CONFIG_THERMAL_EMULATION is not set -CONFIG_THERMAL_MMIO=m -CONFIG_MAX77620_THERMAL=m -CONFIG_DA9062_THERMAL=m - -# -# Intel thermal drivers -# -CONFIG_INTEL_POWERCLAMP=m -CONFIG_X86_PKG_TEMP_THERMAL=m -CONFIG_INTEL_SOC_DTS_IOSF_CORE=m -CONFIG_INTEL_SOC_DTS_THERMAL=m - -# -# ACPI INT340X thermal drivers -# -CONFIG_INT340X_THERMAL=m -CONFIG_ACPI_THERMAL_REL=m -CONFIG_INT3406_THERMAL=m -CONFIG_PROC_THERMAL_MMIO_RAPL=y -# end of ACPI INT340X thermal drivers - -CONFIG_INTEL_BXT_PMIC_THERMAL=m -CONFIG_INTEL_PCH_THERMAL=m -# end of Intel thermal drivers - -# CONFIG_TI_SOC_THERMAL is not set -CONFIG_GENERIC_ADC_THERMAL=m -CONFIG_WATCHDOG=y -CONFIG_WATCHDOG_CORE=y -# CONFIG_WATCHDOG_NOWAYOUT is not set -CONFIG_WATCHDOG_HANDLE_BOOT_ENABLED=y -CONFIG_WATCHDOG_OPEN_TIMEOUT=0 -CONFIG_WATCHDOG_SYSFS=y - -# -# Watchdog Pretimeout Governors -# -CONFIG_WATCHDOG_PRETIMEOUT_GOV=y -CONFIG_WATCHDOG_PRETIMEOUT_GOV_SEL=m -CONFIG_WATCHDOG_PRETIMEOUT_GOV_NOOP=m -CONFIG_WATCHDOG_PRETIMEOUT_GOV_PANIC=y -# CONFIG_WATCHDOG_PRETIMEOUT_DEFAULT_GOV_NOOP is not set -CONFIG_WATCHDOG_PRETIMEOUT_DEFAULT_GOV_PANIC=y - -# -# Watchdog Device Drivers -# -CONFIG_SOFT_WATCHDOG=m -# 
CONFIG_SOFT_WATCHDOG_PRETIMEOUT is not set -CONFIG_BD70528_WATCHDOG=m -CONFIG_DA9052_WATCHDOG=m -CONFIG_DA9055_WATCHDOG=m -CONFIG_DA9063_WATCHDOG=m -CONFIG_DA9062_WATCHDOG=m -CONFIG_GPIO_WATCHDOG=m -CONFIG_MENF21BMC_WATCHDOG=m -CONFIG_MENZ069_WATCHDOG=m -CONFIG_WDAT_WDT=m -CONFIG_WM831X_WATCHDOG=m -CONFIG_WM8350_WATCHDOG=m -CONFIG_XILINX_WATCHDOG=m -CONFIG_ZIIRAVE_WATCHDOG=m -CONFIG_RAVE_SP_WATCHDOG=m -CONFIG_MLX_WDT=m -CONFIG_CADENCE_WATCHDOG=m -CONFIG_DW_WATCHDOG=m -CONFIG_RN5T618_WATCHDOG=m -CONFIG_TWL4030_WATCHDOG=m -CONFIG_MAX63XX_WATCHDOG=m -CONFIG_MAX77620_WATCHDOG=m -CONFIG_RETU_WATCHDOG=m -CONFIG_STPMIC1_WATCHDOG=m -CONFIG_ACQUIRE_WDT=m -CONFIG_ADVANTECH_WDT=m -CONFIG_ALIM1535_WDT=m -CONFIG_ALIM7101_WDT=m -CONFIG_EBC_C384_WDT=m -CONFIG_F71808E_WDT=m -CONFIG_SP5100_TCO=m -CONFIG_SBC_FITPC2_WATCHDOG=m -CONFIG_EUROTECH_WDT=m -CONFIG_IB700_WDT=m -CONFIG_IBMASR=m -CONFIG_WAFER_WDT=m -CONFIG_I6300ESB_WDT=m -CONFIG_IE6XX_WDT=m -CONFIG_ITCO_WDT=m -CONFIG_ITCO_VENDOR_SUPPORT=y -CONFIG_IT8712F_WDT=m -CONFIG_IT87_WDT=m -CONFIG_HP_WATCHDOG=m -CONFIG_HPWDT_NMI_DECODING=y -CONFIG_KEMPLD_WDT=m -CONFIG_SC1200_WDT=m -CONFIG_PC87413_WDT=m -CONFIG_NV_TCO=m -CONFIG_60XX_WDT=m -CONFIG_CPU5_WDT=m -CONFIG_SMSC_SCH311X_WDT=m -CONFIG_SMSC37B787_WDT=m -CONFIG_TQMX86_WDT=m -CONFIG_VIA_WDT=m -CONFIG_W83627HF_WDT=m -CONFIG_W83877F_WDT=m -CONFIG_W83977F_WDT=m -CONFIG_MACHZ_WDT=m -CONFIG_SBC_EPX_C3_WATCHDOG=m -CONFIG_INTEL_MEI_WDT=m -CONFIG_NI903X_WDT=m -CONFIG_NIC7018_WDT=m -CONFIG_MEN_A21_WDT=m -CONFIG_XEN_WDT=m - -# -# PCI-based Watchdog Cards -# -CONFIG_PCIPCWATCHDOG=m -CONFIG_WDTPCI=m - -# -# USB-based Watchdog Cards -# -CONFIG_USBPCWATCHDOG=m -CONFIG_SSB_POSSIBLE=y -CONFIG_SSB=m -CONFIG_SSB_SPROM=y -CONFIG_SSB_BLOCKIO=y -CONFIG_SSB_PCIHOST_POSSIBLE=y -CONFIG_SSB_PCIHOST=y -CONFIG_SSB_B43_PCI_BRIDGE=y -CONFIG_SSB_PCMCIAHOST_POSSIBLE=y -CONFIG_SSB_PCMCIAHOST=y -CONFIG_SSB_SDIOHOST_POSSIBLE=y -CONFIG_SSB_SDIOHOST=y -CONFIG_SSB_DRIVER_PCICORE_POSSIBLE=y -CONFIG_SSB_DRIVER_PCICORE=y 
-CONFIG_SSB_DRIVER_GPIO=y -CONFIG_BCMA_POSSIBLE=y -CONFIG_BCMA=m -CONFIG_BCMA_BLOCKIO=y -CONFIG_BCMA_HOST_PCI_POSSIBLE=y -CONFIG_BCMA_HOST_PCI=y -# CONFIG_BCMA_HOST_SOC is not set -CONFIG_BCMA_DRIVER_PCI=y -CONFIG_BCMA_DRIVER_GMAC_CMN=y -CONFIG_BCMA_DRIVER_GPIO=y -# CONFIG_BCMA_DEBUG is not set - -# -# Multifunction device drivers -# -CONFIG_MFD_CORE=y -CONFIG_MFD_ACT8945A=m -CONFIG_MFD_AS3711=y -CONFIG_MFD_AS3722=m -CONFIG_PMIC_ADP5520=y -CONFIG_MFD_AAT2870_CORE=y -CONFIG_MFD_ATMEL_FLEXCOM=m -CONFIG_MFD_ATMEL_HLCDC=m -CONFIG_MFD_BCM590XX=m -CONFIG_MFD_BD9571MWV=m -CONFIG_MFD_AXP20X=m -CONFIG_MFD_AXP20X_I2C=m -CONFIG_MFD_CROS_EC_DEV=m -CONFIG_MFD_MADERA=m -CONFIG_MFD_MADERA_I2C=m -CONFIG_MFD_MADERA_SPI=m -CONFIG_MFD_CS47L15=y -CONFIG_MFD_CS47L35=y -CONFIG_MFD_CS47L85=y -CONFIG_MFD_CS47L90=y -CONFIG_MFD_CS47L92=y -CONFIG_PMIC_DA903X=y -CONFIG_PMIC_DA9052=y -CONFIG_MFD_DA9052_SPI=y -CONFIG_MFD_DA9052_I2C=y -CONFIG_MFD_DA9055=y -CONFIG_MFD_DA9062=m -CONFIG_MFD_DA9063=m -CONFIG_MFD_DA9150=m -CONFIG_MFD_DLN2=m -CONFIG_MFD_GATEWORKS_GSC=m -CONFIG_MFD_MC13XXX=m -CONFIG_MFD_MC13XXX_SPI=m -CONFIG_MFD_MC13XXX_I2C=m -CONFIG_MFD_MP2629=m -CONFIG_MFD_HI6421_PMIC=m -CONFIG_HTC_PASIC3=m -CONFIG_HTC_I2CPLD=y -CONFIG_MFD_INTEL_QUARK_I2C_GPIO=m -CONFIG_LPC_ICH=m -CONFIG_LPC_SCH=m -CONFIG_INTEL_SOC_PMIC=y -CONFIG_INTEL_SOC_PMIC_BXTWC=m -CONFIG_INTEL_SOC_PMIC_CHTWC=y -CONFIG_INTEL_SOC_PMIC_CHTDC_TI=m -CONFIG_INTEL_SOC_PMIC_MRFLD=m -CONFIG_MFD_INTEL_LPSS=m -CONFIG_MFD_INTEL_LPSS_ACPI=m -CONFIG_MFD_INTEL_LPSS_PCI=m -CONFIG_MFD_INTEL_MSIC=y -CONFIG_MFD_INTEL_PMC_BXT=m -CONFIG_MFD_IQS62X=m -CONFIG_MFD_JANZ_CMODIO=m -CONFIG_MFD_KEMPLD=m -CONFIG_MFD_88PM800=m -CONFIG_MFD_88PM805=m -CONFIG_MFD_88PM860X=y -CONFIG_MFD_MAX14577=m -CONFIG_MFD_MAX77620=y -CONFIG_MFD_MAX77650=m -CONFIG_MFD_MAX77686=m -CONFIG_MFD_MAX77693=m -CONFIG_MFD_MAX77843=y -CONFIG_MFD_MAX8907=m -CONFIG_MFD_MAX8925=y -CONFIG_MFD_MAX8997=y -CONFIG_MFD_MAX8998=y -CONFIG_MFD_MT6360=m -CONFIG_MFD_MT6397=m -CONFIG_MFD_MENF21BMC=m 
-CONFIG_EZX_PCAP=y -CONFIG_MFD_CPCAP=m -CONFIG_MFD_VIPERBOARD=m -CONFIG_MFD_RETU=m -CONFIG_MFD_PCF50633=m -CONFIG_PCF50633_ADC=m -CONFIG_PCF50633_GPIO=m -CONFIG_UCB1400_CORE=m -CONFIG_MFD_RDC321X=m -CONFIG_MFD_RT5033=m -CONFIG_MFD_RC5T583=y -CONFIG_MFD_RK808=m -CONFIG_MFD_RN5T618=m -CONFIG_MFD_SEC_CORE=y -CONFIG_MFD_SI476X_CORE=m -CONFIG_MFD_SM501=m -CONFIG_MFD_SM501_GPIO=y -CONFIG_MFD_SKY81452=m -CONFIG_MFD_SMSC=y -CONFIG_ABX500_CORE=y -CONFIG_AB3100_CORE=y -CONFIG_AB3100_OTP=y -CONFIG_MFD_STMPE=y - -# -# STMicroelectronics STMPE Interface Drivers -# -CONFIG_STMPE_I2C=y -CONFIG_STMPE_SPI=y -# end of STMicroelectronics STMPE Interface Drivers - -CONFIG_MFD_SYSCON=y -CONFIG_MFD_TI_AM335X_TSCADC=m -CONFIG_MFD_LP3943=m -CONFIG_MFD_LP8788=y -CONFIG_MFD_TI_LMU=m -CONFIG_MFD_PALMAS=y -CONFIG_TPS6105X=m -CONFIG_TPS65010=m -CONFIG_TPS6507X=m -CONFIG_MFD_TPS65086=m -CONFIG_MFD_TPS65090=y -CONFIG_MFD_TPS65217=m -CONFIG_MFD_TPS68470=y -CONFIG_MFD_TI_LP873X=m -CONFIG_MFD_TI_LP87565=m -CONFIG_MFD_TPS65218=m -CONFIG_MFD_TPS6586X=y -CONFIG_MFD_TPS65910=y -CONFIG_MFD_TPS65912=m -CONFIG_MFD_TPS65912_I2C=m -CONFIG_MFD_TPS65912_SPI=m -CONFIG_MFD_TPS80031=y -CONFIG_TWL4030_CORE=y -CONFIG_MFD_TWL4030_AUDIO=y -CONFIG_TWL6040_CORE=y -CONFIG_MFD_WL1273_CORE=m -CONFIG_MFD_LM3533=m -CONFIG_MFD_TC3589X=y -CONFIG_MFD_TQMX86=m -CONFIG_MFD_VX855=m -CONFIG_MFD_LOCHNAGAR=y -CONFIG_MFD_ARIZONA=y -CONFIG_MFD_ARIZONA_I2C=m -CONFIG_MFD_ARIZONA_SPI=m -CONFIG_MFD_CS47L24=y -CONFIG_MFD_WM5102=y -CONFIG_MFD_WM5110=y -CONFIG_MFD_WM8997=y -CONFIG_MFD_WM8998=y -CONFIG_MFD_WM8400=y -CONFIG_MFD_WM831X=y -CONFIG_MFD_WM831X_I2C=y -CONFIG_MFD_WM831X_SPI=y -CONFIG_MFD_WM8350=y -CONFIG_MFD_WM8350_I2C=y -CONFIG_MFD_WM8994=m -CONFIG_MFD_ROHM_BD718XX=m -CONFIG_MFD_ROHM_BD70528=m -CONFIG_MFD_ROHM_BD71828=m -CONFIG_MFD_STPMIC1=m -CONFIG_MFD_STMFX=m -CONFIG_MFD_WCD934X=m -CONFIG_RAVE_SP_CORE=m -# end of Multifunction device drivers - -CONFIG_REGULATOR=y -# CONFIG_REGULATOR_DEBUG is not set 
-CONFIG_REGULATOR_FIXED_VOLTAGE=m -CONFIG_REGULATOR_VIRTUAL_CONSUMER=m -CONFIG_REGULATOR_USERSPACE_CONSUMER=m -CONFIG_REGULATOR_88PG86X=m -CONFIG_REGULATOR_88PM800=m -CONFIG_REGULATOR_88PM8607=m -CONFIG_REGULATOR_ACT8865=m -CONFIG_REGULATOR_ACT8945A=m -CONFIG_REGULATOR_AD5398=m -CONFIG_REGULATOR_AAT2870=m -CONFIG_REGULATOR_AB3100=m -CONFIG_REGULATOR_ARIZONA_LDO1=m -CONFIG_REGULATOR_ARIZONA_MICSUPP=m -CONFIG_REGULATOR_AS3711=m -CONFIG_REGULATOR_AS3722=m -CONFIG_REGULATOR_AXP20X=m -CONFIG_REGULATOR_BCM590XX=m -CONFIG_REGULATOR_BD70528=m -CONFIG_REGULATOR_BD71828=m -CONFIG_REGULATOR_BD718XX=m -CONFIG_REGULATOR_BD9571MWV=m -CONFIG_REGULATOR_CPCAP=m -CONFIG_REGULATOR_DA903X=m -CONFIG_REGULATOR_DA9052=m -CONFIG_REGULATOR_DA9055=m -CONFIG_REGULATOR_DA9062=m -CONFIG_REGULATOR_DA9063=m -CONFIG_REGULATOR_DA9210=m -CONFIG_REGULATOR_DA9211=m -CONFIG_REGULATOR_FAN53555=m -CONFIG_REGULATOR_GPIO=m -CONFIG_REGULATOR_HI6421=m -CONFIG_REGULATOR_HI6421V530=m -CONFIG_REGULATOR_ISL9305=m -CONFIG_REGULATOR_ISL6271A=m -CONFIG_REGULATOR_LM363X=m -CONFIG_REGULATOR_LOCHNAGAR=m -CONFIG_REGULATOR_LP3971=m -CONFIG_REGULATOR_LP3972=m -CONFIG_REGULATOR_LP872X=m -CONFIG_REGULATOR_LP873X=m -CONFIG_REGULATOR_LP8755=m -CONFIG_REGULATOR_LP87565=m -CONFIG_REGULATOR_LP8788=m -CONFIG_REGULATOR_LTC3589=m -CONFIG_REGULATOR_LTC3676=m -CONFIG_REGULATOR_MAX14577=m -CONFIG_REGULATOR_MAX1586=m -CONFIG_REGULATOR_MAX77620=m -CONFIG_REGULATOR_MAX77650=m -CONFIG_REGULATOR_MAX8649=m -CONFIG_REGULATOR_MAX8660=m -CONFIG_REGULATOR_MAX8907=m -CONFIG_REGULATOR_MAX8925=m -CONFIG_REGULATOR_MAX8952=m -CONFIG_REGULATOR_MAX8973=m -CONFIG_REGULATOR_MAX8997=m -CONFIG_REGULATOR_MAX8998=m -CONFIG_REGULATOR_MAX77686=m -CONFIG_REGULATOR_MAX77693=m -CONFIG_REGULATOR_MAX77802=m -CONFIG_REGULATOR_MAX77826=m -CONFIG_REGULATOR_MC13XXX_CORE=m -CONFIG_REGULATOR_MC13783=m -CONFIG_REGULATOR_MC13892=m -CONFIG_REGULATOR_MCP16502=m -CONFIG_REGULATOR_MP5416=m -CONFIG_REGULATOR_MP8859=m -CONFIG_REGULATOR_MP886X=m -CONFIG_REGULATOR_MPQ7920=m 
-CONFIG_REGULATOR_MT6311=m -CONFIG_REGULATOR_MT6323=m -CONFIG_REGULATOR_MT6358=m -CONFIG_REGULATOR_MT6397=m -CONFIG_REGULATOR_PALMAS=m -CONFIG_REGULATOR_PCAP=m -CONFIG_REGULATOR_PCF50633=m -CONFIG_REGULATOR_PFUZE100=m -CONFIG_REGULATOR_PV88060=m -CONFIG_REGULATOR_PV88080=m -CONFIG_REGULATOR_PV88090=m -CONFIG_REGULATOR_PWM=m -CONFIG_REGULATOR_QCOM_SPMI=m -CONFIG_REGULATOR_RC5T583=m -CONFIG_REGULATOR_RK808=m -CONFIG_REGULATOR_RN5T618=m -CONFIG_REGULATOR_ROHM=m -CONFIG_REGULATOR_RT5033=m -CONFIG_REGULATOR_S2MPA01=m -CONFIG_REGULATOR_S2MPS11=m -CONFIG_REGULATOR_S5M8767=m -CONFIG_REGULATOR_SKY81452=m -CONFIG_REGULATOR_SLG51000=m -CONFIG_REGULATOR_STPMIC1=m -CONFIG_REGULATOR_SY8106A=m -CONFIG_REGULATOR_SY8824X=m -CONFIG_REGULATOR_TPS51632=m -CONFIG_REGULATOR_TPS6105X=m -CONFIG_REGULATOR_TPS62360=m -CONFIG_REGULATOR_TPS65023=m -CONFIG_REGULATOR_TPS6507X=m -CONFIG_REGULATOR_TPS65086=m -CONFIG_REGULATOR_TPS65090=m -CONFIG_REGULATOR_TPS65132=m -CONFIG_REGULATOR_TPS65217=m -CONFIG_REGULATOR_TPS65218=m -CONFIG_REGULATOR_TPS6524X=m -CONFIG_REGULATOR_TPS6586X=m -CONFIG_REGULATOR_TPS65910=m -CONFIG_REGULATOR_TPS65912=m -CONFIG_REGULATOR_TPS80031=m -CONFIG_REGULATOR_TWL4030=m -CONFIG_REGULATOR_VCTRL=m -CONFIG_REGULATOR_WM831X=m -CONFIG_REGULATOR_WM8350=m -CONFIG_REGULATOR_WM8400=m -CONFIG_REGULATOR_WM8994=m -CONFIG_RC_CORE=m -CONFIG_RC_MAP=m -CONFIG_LIRC=y -CONFIG_RC_DECODERS=y -CONFIG_IR_NEC_DECODER=m -CONFIG_IR_RC5_DECODER=m -CONFIG_IR_RC6_DECODER=m -CONFIG_IR_JVC_DECODER=m -CONFIG_IR_SONY_DECODER=m -CONFIG_IR_SANYO_DECODER=m -CONFIG_IR_SHARP_DECODER=m -CONFIG_IR_MCE_KBD_DECODER=m -CONFIG_IR_XMP_DECODER=m -CONFIG_IR_IMON_DECODER=m -CONFIG_IR_RCMM_DECODER=m -CONFIG_RC_DEVICES=y -CONFIG_RC_ATI_REMOTE=m -CONFIG_IR_ENE=m -CONFIG_IR_HIX5HD2=m -CONFIG_IR_IMON=m -CONFIG_IR_IMON_RAW=m -CONFIG_IR_MCEUSB=m -CONFIG_IR_ITE_CIR=m -CONFIG_IR_FINTEK=m -CONFIG_IR_NUVOTON=m -CONFIG_IR_REDRAT3=m -CONFIG_IR_SPI=m -CONFIG_IR_STREAMZAP=m -CONFIG_IR_WINBOND_CIR=m -CONFIG_IR_IGORPLUGUSB=m 
-CONFIG_IR_IGUANA=m -CONFIG_IR_TTUSBIR=m -CONFIG_RC_LOOPBACK=m -CONFIG_IR_GPIO_CIR=m -CONFIG_IR_GPIO_TX=m -CONFIG_IR_PWM_TX=m -CONFIG_IR_SERIAL=m -CONFIG_IR_SERIAL_TRANSMITTER=y -CONFIG_IR_SIR=m -CONFIG_RC_XBOX_DVD=m -CONFIG_CEC_CORE=m -CONFIG_CEC_NOTIFIER=y -CONFIG_CEC_PIN=y -CONFIG_MEDIA_CEC_RC=y -# CONFIG_CEC_PIN_ERROR_INJ is not set -CONFIG_MEDIA_CEC_SUPPORT=y -CONFIG_CEC_CROS_EC=m -CONFIG_CEC_GPIO=m -CONFIG_CEC_SECO=m -CONFIG_CEC_SECO_RC=y -CONFIG_USB_PULSE8_CEC=m -CONFIG_USB_RAINSHADOW_CEC=m -CONFIG_MEDIA_SUPPORT=m -# CONFIG_MEDIA_SUPPORT_FILTER is not set -CONFIG_MEDIA_SUBDRV_AUTOSELECT=y - -# -# Media device types -# -CONFIG_MEDIA_CAMERA_SUPPORT=y -CONFIG_MEDIA_ANALOG_TV_SUPPORT=y -CONFIG_MEDIA_DIGITAL_TV_SUPPORT=y -CONFIG_MEDIA_RADIO_SUPPORT=y -CONFIG_MEDIA_SDR_SUPPORT=y -CONFIG_MEDIA_PLATFORM_SUPPORT=y -CONFIG_MEDIA_TEST_SUPPORT=y -# end of Media device types - -# -# Media core support -# -CONFIG_VIDEO_DEV=m -CONFIG_MEDIA_CONTROLLER=y -CONFIG_DVB_CORE=m -# end of Media core support - -# -# Video4Linux options -# -CONFIG_VIDEO_V4L2=m -CONFIG_VIDEO_V4L2_I2C=y -CONFIG_VIDEO_V4L2_SUBDEV_API=y -# CONFIG_VIDEO_ADV_DEBUG is not set -# CONFIG_VIDEO_FIXED_MINOR_RANGES is not set -CONFIG_VIDEO_TUNER=m -CONFIG_V4L2_MEM2MEM_DEV=m -CONFIG_V4L2_FLASH_LED_CLASS=m -CONFIG_V4L2_FWNODE=m -CONFIG_VIDEOBUF_GEN=m -CONFIG_VIDEOBUF_DMA_SG=m -CONFIG_VIDEOBUF_VMALLOC=m -# end of Video4Linux options - -# -# Media controller options -# -CONFIG_MEDIA_CONTROLLER_DVB=y -CONFIG_MEDIA_CONTROLLER_REQUEST_API=y - -# -# Please notice that the enabled Media controller Request API is EXPERIMENTAL -# -# end of Media controller options - -# -# Digital TV options -# -CONFIG_DVB_MMAP=y -CONFIG_DVB_NET=y -CONFIG_DVB_MAX_ADAPTERS=16 -# CONFIG_DVB_DYNAMIC_MINORS is not set -# CONFIG_DVB_DEMUX_SECTION_LOSS_LOG is not set -# CONFIG_DVB_ULE_DEBUG is not set -# end of Digital TV options - -# -# Media drivers -# -CONFIG_TTPCI_EEPROM=m -CONFIG_MEDIA_USB_SUPPORT=y - -# -# Webcam devices -# 
-CONFIG_USB_VIDEO_CLASS=m -CONFIG_USB_VIDEO_CLASS_INPUT_EVDEV=y -CONFIG_USB_GSPCA=m -CONFIG_USB_M5602=m -CONFIG_USB_STV06XX=m -CONFIG_USB_GL860=m -CONFIG_USB_GSPCA_BENQ=m -CONFIG_USB_GSPCA_CONEX=m -CONFIG_USB_GSPCA_CPIA1=m -CONFIG_USB_GSPCA_DTCS033=m -CONFIG_USB_GSPCA_ETOMS=m -CONFIG_USB_GSPCA_FINEPIX=m -CONFIG_USB_GSPCA_JEILINJ=m -CONFIG_USB_GSPCA_JL2005BCD=m -CONFIG_USB_GSPCA_KINECT=m -CONFIG_USB_GSPCA_KONICA=m -CONFIG_USB_GSPCA_MARS=m -CONFIG_USB_GSPCA_MR97310A=m -CONFIG_USB_GSPCA_NW80X=m -CONFIG_USB_GSPCA_OV519=m -CONFIG_USB_GSPCA_OV534=m -CONFIG_USB_GSPCA_OV534_9=m -CONFIG_USB_GSPCA_PAC207=m -CONFIG_USB_GSPCA_PAC7302=m -CONFIG_USB_GSPCA_PAC7311=m -CONFIG_USB_GSPCA_SE401=m -CONFIG_USB_GSPCA_SN9C2028=m -CONFIG_USB_GSPCA_SN9C20X=m -CONFIG_USB_GSPCA_SONIXB=m -CONFIG_USB_GSPCA_SONIXJ=m -CONFIG_USB_GSPCA_SPCA500=m -CONFIG_USB_GSPCA_SPCA501=m -CONFIG_USB_GSPCA_SPCA505=m -CONFIG_USB_GSPCA_SPCA506=m -CONFIG_USB_GSPCA_SPCA508=m -CONFIG_USB_GSPCA_SPCA561=m -CONFIG_USB_GSPCA_SPCA1528=m -CONFIG_USB_GSPCA_SQ905=m -CONFIG_USB_GSPCA_SQ905C=m -CONFIG_USB_GSPCA_SQ930X=m -CONFIG_USB_GSPCA_STK014=m -CONFIG_USB_GSPCA_STK1135=m -CONFIG_USB_GSPCA_STV0680=m -CONFIG_USB_GSPCA_SUNPLUS=m -CONFIG_USB_GSPCA_T613=m -CONFIG_USB_GSPCA_TOPRO=m -CONFIG_USB_GSPCA_TOUPTEK=m -CONFIG_USB_GSPCA_TV8532=m -CONFIG_USB_GSPCA_VC032X=m -CONFIG_USB_GSPCA_VICAM=m -CONFIG_USB_GSPCA_XIRLINK_CIT=m -CONFIG_USB_GSPCA_ZC3XX=m -CONFIG_USB_PWC=m -# CONFIG_USB_PWC_DEBUG is not set -CONFIG_USB_PWC_INPUT_EVDEV=y -CONFIG_VIDEO_CPIA2=m -CONFIG_USB_ZR364XX=m -CONFIG_USB_STKWEBCAM=m -CONFIG_USB_S2255=m -CONFIG_VIDEO_USBTV=m - -# -# Analog TV USB devices -# -CONFIG_VIDEO_PVRUSB2=m -CONFIG_VIDEO_PVRUSB2_SYSFS=y -CONFIG_VIDEO_PVRUSB2_DVB=y -# CONFIG_VIDEO_PVRUSB2_DEBUGIFC is not set -CONFIG_VIDEO_HDPVR=m -CONFIG_VIDEO_STK1160_COMMON=m -CONFIG_VIDEO_STK1160=m -CONFIG_VIDEO_GO7007=m -CONFIG_VIDEO_GO7007_USB=m -CONFIG_VIDEO_GO7007_LOADER=m -CONFIG_VIDEO_GO7007_USB_S2250_BOARD=m - -# -# Analog/digital TV USB devices -# 
-CONFIG_VIDEO_AU0828=m -CONFIG_VIDEO_AU0828_V4L2=y -CONFIG_VIDEO_AU0828_RC=y -CONFIG_VIDEO_CX231XX=m -CONFIG_VIDEO_CX231XX_RC=y -CONFIG_VIDEO_CX231XX_ALSA=m -CONFIG_VIDEO_CX231XX_DVB=m -CONFIG_VIDEO_TM6000=m -CONFIG_VIDEO_TM6000_ALSA=m -CONFIG_VIDEO_TM6000_DVB=m - -# -# Digital TV USB devices -# -CONFIG_DVB_USB=m -# CONFIG_DVB_USB_DEBUG is not set -CONFIG_DVB_USB_DIB3000MC=m -CONFIG_DVB_USB_A800=m -CONFIG_DVB_USB_DIBUSB_MB=m -CONFIG_DVB_USB_DIBUSB_MB_FAULTY=y -CONFIG_DVB_USB_DIBUSB_MC=m -CONFIG_DVB_USB_DIB0700=m -CONFIG_DVB_USB_UMT_010=m -CONFIG_DVB_USB_CXUSB=m -CONFIG_DVB_USB_CXUSB_ANALOG=y -CONFIG_DVB_USB_M920X=m -CONFIG_DVB_USB_DIGITV=m -CONFIG_DVB_USB_VP7045=m -CONFIG_DVB_USB_VP702X=m -CONFIG_DVB_USB_GP8PSK=m -CONFIG_DVB_USB_NOVA_T_USB2=m -CONFIG_DVB_USB_TTUSB2=m -CONFIG_DVB_USB_DTT200U=m -CONFIG_DVB_USB_OPERA1=m -CONFIG_DVB_USB_AF9005=m -CONFIG_DVB_USB_AF9005_REMOTE=m -CONFIG_DVB_USB_PCTV452E=m -CONFIG_DVB_USB_DW2102=m -CONFIG_DVB_USB_CINERGY_T2=m -CONFIG_DVB_USB_DTV5100=m -CONFIG_DVB_USB_AZ6027=m -CONFIG_DVB_USB_TECHNISAT_USB2=m -CONFIG_DVB_USB_V2=m -CONFIG_DVB_USB_AF9015=m -CONFIG_DVB_USB_AF9035=m -CONFIG_DVB_USB_ANYSEE=m -CONFIG_DVB_USB_AU6610=m -CONFIG_DVB_USB_AZ6007=m -CONFIG_DVB_USB_CE6230=m -CONFIG_DVB_USB_EC168=m -CONFIG_DVB_USB_GL861=m -CONFIG_DVB_USB_LME2510=m -CONFIG_DVB_USB_MXL111SF=m -CONFIG_DVB_USB_RTL28XXU=m -CONFIG_DVB_USB_DVBSKY=m -CONFIG_DVB_USB_ZD1301=m -CONFIG_DVB_TTUSB_BUDGET=m -CONFIG_DVB_TTUSB_DEC=m -CONFIG_SMS_USB_DRV=m -CONFIG_DVB_B2C2_FLEXCOP_USB=m -# CONFIG_DVB_B2C2_FLEXCOP_USB_DEBUG is not set -CONFIG_DVB_AS102=m - -# -# Webcam, TV (analog/digital) USB devices -# -CONFIG_VIDEO_EM28XX=m -CONFIG_VIDEO_EM28XX_V4L2=m -CONFIG_VIDEO_EM28XX_ALSA=m -CONFIG_VIDEO_EM28XX_DVB=m -CONFIG_VIDEO_EM28XX_RC=m - -# -# Software defined radio USB devices -# -CONFIG_USB_AIRSPY=m -CONFIG_USB_HACKRF=m -CONFIG_USB_MSI2500=m -CONFIG_MEDIA_PCI_SUPPORT=y - -# -# Media capture support -# -CONFIG_VIDEO_MEYE=m -CONFIG_VIDEO_SOLO6X10=m -CONFIG_VIDEO_TW5864=m 
-CONFIG_VIDEO_TW68=m -CONFIG_VIDEO_TW686X=m - -# -# Media capture/analog TV support -# -CONFIG_VIDEO_IVTV=m -# CONFIG_VIDEO_IVTV_DEPRECATED_IOCTLS is not set -CONFIG_VIDEO_IVTV_ALSA=m -CONFIG_VIDEO_FB_IVTV=m -# CONFIG_VIDEO_FB_IVTV_FORCE_PAT is not set -CONFIG_VIDEO_HEXIUM_GEMINI=m -CONFIG_VIDEO_HEXIUM_ORION=m -CONFIG_VIDEO_MXB=m -CONFIG_VIDEO_DT3155=m - -# -# Media capture/analog/hybrid TV support -# -CONFIG_VIDEO_CX18=m -CONFIG_VIDEO_CX18_ALSA=m -CONFIG_VIDEO_CX23885=m -CONFIG_MEDIA_ALTERA_CI=m -CONFIG_VIDEO_CX25821=m -CONFIG_VIDEO_CX25821_ALSA=m -CONFIG_VIDEO_CX88=m -CONFIG_VIDEO_CX88_ALSA=m -CONFIG_VIDEO_CX88_BLACKBIRD=m -CONFIG_VIDEO_CX88_DVB=m -CONFIG_VIDEO_CX88_ENABLE_VP3054=y -CONFIG_VIDEO_CX88_VP3054=m -CONFIG_VIDEO_CX88_MPEG=m -CONFIG_VIDEO_BT848=m -CONFIG_DVB_BT8XX=m -CONFIG_VIDEO_SAA7134=m -CONFIG_VIDEO_SAA7134_ALSA=m -CONFIG_VIDEO_SAA7134_RC=y -CONFIG_VIDEO_SAA7134_DVB=m -CONFIG_VIDEO_SAA7134_GO7007=m -CONFIG_VIDEO_SAA7164=m - -# -# Media digital TV PCI Adapters -# -CONFIG_DVB_AV7110_IR=y -CONFIG_DVB_AV7110=m -CONFIG_DVB_AV7110_OSD=y -CONFIG_DVB_BUDGET_CORE=m -CONFIG_DVB_BUDGET=m -CONFIG_DVB_BUDGET_CI=m -CONFIG_DVB_BUDGET_AV=m -CONFIG_DVB_BUDGET_PATCH=m -CONFIG_DVB_B2C2_FLEXCOP_PCI=m -# CONFIG_DVB_B2C2_FLEXCOP_PCI_DEBUG is not set -CONFIG_DVB_PLUTO2=m -CONFIG_DVB_DM1105=m -CONFIG_DVB_PT1=m -CONFIG_DVB_PT3=m -CONFIG_MANTIS_CORE=m -CONFIG_DVB_MANTIS=m -CONFIG_DVB_HOPPER=m -CONFIG_DVB_NGENE=m -CONFIG_DVB_DDBRIDGE=m -# CONFIG_DVB_DDBRIDGE_MSIENABLE is not set -CONFIG_DVB_SMIPCIE=m -CONFIG_DVB_NETUP_UNIDVB=m -CONFIG_VIDEO_IPU3_CIO2=m -CONFIG_RADIO_ADAPTERS=y -CONFIG_RADIO_TEA575X=m -CONFIG_RADIO_SI470X=m -CONFIG_USB_SI470X=m -CONFIG_I2C_SI470X=m -CONFIG_RADIO_SI4713=m -CONFIG_USB_SI4713=m -CONFIG_PLATFORM_SI4713=m -CONFIG_I2C_SI4713=m -CONFIG_RADIO_SI476X=m -CONFIG_USB_MR800=m -CONFIG_USB_DSBR=m -CONFIG_RADIO_MAXIRADIO=m -CONFIG_RADIO_SHARK=m -CONFIG_RADIO_SHARK2=m -CONFIG_USB_KEENE=m -CONFIG_USB_RAREMONO=m -CONFIG_USB_MA901=m -CONFIG_RADIO_TEA5764=m 
-CONFIG_RADIO_SAA7706H=m -CONFIG_RADIO_TEF6862=m -CONFIG_RADIO_WL1273=m -CONFIG_RADIO_WL128X=m -CONFIG_MEDIA_COMMON_OPTIONS=y - -# -# common driver options -# -CONFIG_VIDEO_CX2341X=m -CONFIG_VIDEO_TVEEPROM=m -CONFIG_CYPRESS_FIRMWARE=m -CONFIG_VIDEOBUF2_CORE=m -CONFIG_VIDEOBUF2_V4L2=m -CONFIG_VIDEOBUF2_MEMOPS=m -CONFIG_VIDEOBUF2_DMA_CONTIG=m -CONFIG_VIDEOBUF2_VMALLOC=m -CONFIG_VIDEOBUF2_DMA_SG=m -CONFIG_VIDEOBUF2_DVB=m -CONFIG_DVB_B2C2_FLEXCOP=m -CONFIG_VIDEO_SAA7146=m -CONFIG_VIDEO_SAA7146_VV=m -CONFIG_SMS_SIANO_MDTV=m -CONFIG_SMS_SIANO_RC=y -# CONFIG_SMS_SIANO_DEBUGFS is not set -CONFIG_VIDEO_V4L2_TPG=m -CONFIG_V4L_PLATFORM_DRIVERS=y -CONFIG_VIDEO_CAFE_CCIC=m -CONFIG_VIDEO_CADENCE=y -CONFIG_VIDEO_CADENCE_CSI2RX=m -CONFIG_VIDEO_CADENCE_CSI2TX=m -CONFIG_VIDEO_ASPEED=m -CONFIG_VIDEO_MUX=m -CONFIG_VIDEO_XILINX=m -CONFIG_VIDEO_XILINX_TPG=m -CONFIG_VIDEO_XILINX_VTC=m -CONFIG_V4L_MEM2MEM_DRIVERS=y -CONFIG_VIDEO_MEM2MEM_DEINTERLACE=m -CONFIG_DVB_PLATFORM_DRIVERS=y -CONFIG_SDR_PLATFORM_DRIVERS=y - -# -# MMC/SDIO DVB adapters -# -CONFIG_SMS_SDIO_DRV=m -CONFIG_V4L_TEST_DRIVERS=y -CONFIG_VIDEO_VIMC=m -CONFIG_VIDEO_VIVID=m -CONFIG_VIDEO_VIVID_CEC=y -CONFIG_VIDEO_VIVID_MAX_DEVS=64 -CONFIG_VIDEO_VIM2M=m -CONFIG_VIDEO_VICODEC=m - -# -# FireWire (IEEE 1394) Adapters -# -CONFIG_DVB_FIREDTV=m -CONFIG_DVB_FIREDTV_INPUT=y -# end of Media drivers - -# -# Media ancillary drivers -# -CONFIG_MEDIA_ATTACH=y - -# -# IR I2C driver auto-selected by 'Autoselect ancillary drivers' -# -CONFIG_VIDEO_IR_I2C=m - -# -# Audio decoders, processors and mixers -# -CONFIG_VIDEO_TVAUDIO=m -CONFIG_VIDEO_TDA7432=m -CONFIG_VIDEO_TDA9840=m -CONFIG_VIDEO_TDA1997X=m -CONFIG_VIDEO_TEA6415C=m -CONFIG_VIDEO_TEA6420=m -CONFIG_VIDEO_MSP3400=m -CONFIG_VIDEO_CS3308=m -CONFIG_VIDEO_CS5345=m -CONFIG_VIDEO_CS53L32A=m -CONFIG_VIDEO_TLV320AIC23B=m -CONFIG_VIDEO_UDA1342=m -CONFIG_VIDEO_WM8775=m -CONFIG_VIDEO_WM8739=m -CONFIG_VIDEO_VP27SMPX=m -CONFIG_VIDEO_SONY_BTF_MPX=m -# end of Audio decoders, processors and mixers - -# 
-# RDS decoders -# -CONFIG_VIDEO_SAA6588=m -# end of RDS decoders - -# -# Video decoders -# -CONFIG_VIDEO_ADV7180=m -CONFIG_VIDEO_ADV7183=m -CONFIG_VIDEO_ADV748X=m -CONFIG_VIDEO_ADV7604=m -CONFIG_VIDEO_ADV7604_CEC=y -CONFIG_VIDEO_ADV7842=m -CONFIG_VIDEO_ADV7842_CEC=y -CONFIG_VIDEO_BT819=m -CONFIG_VIDEO_BT856=m -CONFIG_VIDEO_BT866=m -CONFIG_VIDEO_KS0127=m -CONFIG_VIDEO_ML86V7667=m -CONFIG_VIDEO_SAA7110=m -CONFIG_VIDEO_SAA711X=m -CONFIG_VIDEO_TC358743=m -CONFIG_VIDEO_TC358743_CEC=y -CONFIG_VIDEO_TVP514X=m -CONFIG_VIDEO_TVP5150=m -CONFIG_VIDEO_TVP7002=m -CONFIG_VIDEO_TW2804=m -CONFIG_VIDEO_TW9903=m -CONFIG_VIDEO_TW9906=m -CONFIG_VIDEO_TW9910=m -CONFIG_VIDEO_VPX3220=m - -# -# Video and audio decoders -# -CONFIG_VIDEO_SAA717X=m -CONFIG_VIDEO_CX25840=m -# end of Video decoders - -# -# Video encoders -# -CONFIG_VIDEO_SAA7127=m -CONFIG_VIDEO_SAA7185=m -CONFIG_VIDEO_ADV7170=m -CONFIG_VIDEO_ADV7175=m -CONFIG_VIDEO_ADV7343=m -CONFIG_VIDEO_ADV7393=m -CONFIG_VIDEO_AD9389B=m -CONFIG_VIDEO_AK881X=m -CONFIG_VIDEO_THS8200=m -# end of Video encoders - -# -# Video improvement chips -# -CONFIG_VIDEO_UPD64031A=m -CONFIG_VIDEO_UPD64083=m -# end of Video improvement chips - -# -# Audio/Video compression chips -# -CONFIG_VIDEO_SAA6752HS=m -# end of Audio/Video compression chips - -# -# SDR tuner chips -# -CONFIG_SDR_MAX2175=m -# end of SDR tuner chips - -# -# Miscellaneous helper chips -# -CONFIG_VIDEO_THS7303=m -CONFIG_VIDEO_M52790=m -CONFIG_VIDEO_I2C=m -CONFIG_VIDEO_ST_MIPID02=m -# end of Miscellaneous helper chips - -# -# Camera sensor devices -# -CONFIG_VIDEO_APTINA_PLL=m -CONFIG_VIDEO_SMIAPP_PLL=m -CONFIG_VIDEO_HI556=m -CONFIG_VIDEO_IMX214=m -CONFIG_VIDEO_IMX219=m -CONFIG_VIDEO_IMX258=m -CONFIG_VIDEO_IMX274=m -CONFIG_VIDEO_IMX290=m -CONFIG_VIDEO_IMX319=m -CONFIG_VIDEO_IMX355=m -CONFIG_VIDEO_OV2640=m -CONFIG_VIDEO_OV2659=m -CONFIG_VIDEO_OV2680=m -CONFIG_VIDEO_OV2685=m -CONFIG_VIDEO_OV2740=m -CONFIG_VIDEO_OV5640=m -CONFIG_VIDEO_OV5645=m -CONFIG_VIDEO_OV5647=m -CONFIG_VIDEO_OV6650=m 
-CONFIG_VIDEO_OV5670=m -CONFIG_VIDEO_OV5675=m -CONFIG_VIDEO_OV5695=m -CONFIG_VIDEO_OV7251=m -CONFIG_VIDEO_OV772X=m -CONFIG_VIDEO_OV7640=m -CONFIG_VIDEO_OV7670=m -CONFIG_VIDEO_OV7740=m -CONFIG_VIDEO_OV8856=m -CONFIG_VIDEO_OV9640=m -CONFIG_VIDEO_OV9650=m -CONFIG_VIDEO_OV13858=m -CONFIG_VIDEO_VS6624=m -CONFIG_VIDEO_MT9M001=m -CONFIG_VIDEO_MT9M032=m -CONFIG_VIDEO_MT9M111=m -CONFIG_VIDEO_MT9P031=m -CONFIG_VIDEO_MT9T001=m -CONFIG_VIDEO_MT9T112=m -CONFIG_VIDEO_MT9V011=m -CONFIG_VIDEO_MT9V032=m -CONFIG_VIDEO_MT9V111=m -CONFIG_VIDEO_SR030PC30=m -CONFIG_VIDEO_NOON010PC30=m -CONFIG_VIDEO_M5MOLS=m -CONFIG_VIDEO_RJ54N1=m -CONFIG_VIDEO_S5K6AA=m -CONFIG_VIDEO_S5K6A3=m -CONFIG_VIDEO_S5K4ECGX=m -CONFIG_VIDEO_S5K5BAF=m -CONFIG_VIDEO_SMIAPP=m -CONFIG_VIDEO_ET8EK8=m -CONFIG_VIDEO_S5C73M3=m -# end of Camera sensor devices - -# -# Lens drivers -# -CONFIG_VIDEO_AD5820=m -CONFIG_VIDEO_AK7375=m -CONFIG_VIDEO_DW9714=m -CONFIG_VIDEO_DW9807_VCM=m -# end of Lens drivers - -# -# Flash devices -# -CONFIG_VIDEO_ADP1653=m -CONFIG_VIDEO_LM3560=m -CONFIG_VIDEO_LM3646=m -# end of Flash devices - -# -# SPI helper chips -# -CONFIG_VIDEO_GS1662=m -# end of SPI helper chips - -# -# Media SPI Adapters -# -CONFIG_CXD2880_SPI_DRV=m -# end of Media SPI Adapters - -CONFIG_MEDIA_TUNER=m - -# -# Customize TV tuners -# -CONFIG_MEDIA_TUNER_SIMPLE=m -CONFIG_MEDIA_TUNER_TDA18250=m -CONFIG_MEDIA_TUNER_TDA8290=m -CONFIG_MEDIA_TUNER_TDA827X=m -CONFIG_MEDIA_TUNER_TDA18271=m -CONFIG_MEDIA_TUNER_TDA9887=m -CONFIG_MEDIA_TUNER_TEA5761=m -CONFIG_MEDIA_TUNER_TEA5767=m -CONFIG_MEDIA_TUNER_MSI001=m -CONFIG_MEDIA_TUNER_MT20XX=m -CONFIG_MEDIA_TUNER_MT2060=m -CONFIG_MEDIA_TUNER_MT2063=m -CONFIG_MEDIA_TUNER_MT2266=m -CONFIG_MEDIA_TUNER_MT2131=m -CONFIG_MEDIA_TUNER_QT1010=m -CONFIG_MEDIA_TUNER_XC2028=m -CONFIG_MEDIA_TUNER_XC5000=m -CONFIG_MEDIA_TUNER_XC4000=m -CONFIG_MEDIA_TUNER_MXL5005S=m -CONFIG_MEDIA_TUNER_MXL5007T=m -CONFIG_MEDIA_TUNER_MC44S803=m -CONFIG_MEDIA_TUNER_MAX2165=m -CONFIG_MEDIA_TUNER_TDA18218=m 
-CONFIG_MEDIA_TUNER_FC0011=m -CONFIG_MEDIA_TUNER_FC0012=m -CONFIG_MEDIA_TUNER_FC0013=m -CONFIG_MEDIA_TUNER_TDA18212=m -CONFIG_MEDIA_TUNER_E4000=m -CONFIG_MEDIA_TUNER_FC2580=m -CONFIG_MEDIA_TUNER_M88RS6000T=m -CONFIG_MEDIA_TUNER_TUA9001=m -CONFIG_MEDIA_TUNER_SI2157=m -CONFIG_MEDIA_TUNER_IT913X=m -CONFIG_MEDIA_TUNER_R820T=m -CONFIG_MEDIA_TUNER_MXL301RF=m -CONFIG_MEDIA_TUNER_QM1D1C0042=m -CONFIG_MEDIA_TUNER_QM1D1B0004=m -# end of Customize TV tuners - -# -# Customise DVB Frontends -# - -# -# Multistandard (satellite) frontends -# -CONFIG_DVB_STB0899=m -CONFIG_DVB_STB6100=m -CONFIG_DVB_STV090x=m -CONFIG_DVB_STV0910=m -CONFIG_DVB_STV6110x=m -CONFIG_DVB_STV6111=m -CONFIG_DVB_MXL5XX=m -CONFIG_DVB_M88DS3103=m - -# -# Multistandard (cable + terrestrial) frontends -# -CONFIG_DVB_DRXK=m -CONFIG_DVB_TDA18271C2DD=m -CONFIG_DVB_SI2165=m -CONFIG_DVB_MN88472=m -CONFIG_DVB_MN88473=m - -# -# DVB-S (satellite) frontends -# -CONFIG_DVB_CX24110=m -CONFIG_DVB_CX24123=m -CONFIG_DVB_MT312=m -CONFIG_DVB_ZL10036=m -CONFIG_DVB_ZL10039=m -CONFIG_DVB_S5H1420=m -CONFIG_DVB_STV0288=m -CONFIG_DVB_STB6000=m -CONFIG_DVB_STV0299=m -CONFIG_DVB_STV6110=m -CONFIG_DVB_STV0900=m -CONFIG_DVB_TDA8083=m -CONFIG_DVB_TDA10086=m -CONFIG_DVB_TDA8261=m -CONFIG_DVB_VES1X93=m -CONFIG_DVB_TUNER_ITD1000=m -CONFIG_DVB_TUNER_CX24113=m -CONFIG_DVB_TDA826X=m -CONFIG_DVB_TUA6100=m -CONFIG_DVB_CX24116=m -CONFIG_DVB_CX24117=m -CONFIG_DVB_CX24120=m -CONFIG_DVB_SI21XX=m -CONFIG_DVB_TS2020=m -CONFIG_DVB_DS3000=m -CONFIG_DVB_MB86A16=m -CONFIG_DVB_TDA10071=m - -# -# DVB-T (terrestrial) frontends -# -CONFIG_DVB_SP8870=m -CONFIG_DVB_SP887X=m -CONFIG_DVB_CX22700=m -CONFIG_DVB_CX22702=m -CONFIG_DVB_S5H1432=m -CONFIG_DVB_DRXD=m -CONFIG_DVB_L64781=m -CONFIG_DVB_TDA1004X=m -CONFIG_DVB_NXT6000=m -CONFIG_DVB_MT352=m -CONFIG_DVB_ZL10353=m -CONFIG_DVB_DIB3000MB=m -CONFIG_DVB_DIB3000MC=m -CONFIG_DVB_DIB7000M=m -CONFIG_DVB_DIB7000P=m -CONFIG_DVB_DIB9000=m -CONFIG_DVB_TDA10048=m -CONFIG_DVB_AF9013=m -CONFIG_DVB_EC100=m -CONFIG_DVB_STV0367=m 
-CONFIG_DVB_CXD2820R=m -CONFIG_DVB_CXD2841ER=m -CONFIG_DVB_RTL2830=m -CONFIG_DVB_RTL2832=m -CONFIG_DVB_RTL2832_SDR=m -CONFIG_DVB_SI2168=m -CONFIG_DVB_AS102_FE=m -CONFIG_DVB_ZD1301_DEMOD=m -CONFIG_DVB_GP8PSK_FE=m -CONFIG_DVB_CXD2880=m - -# -# DVB-C (cable) frontends -# -CONFIG_DVB_VES1820=m -CONFIG_DVB_TDA10021=m -CONFIG_DVB_TDA10023=m -CONFIG_DVB_STV0297=m - -# -# ATSC (North American/Korean Terrestrial/Cable DTV) frontends -# -CONFIG_DVB_NXT200X=m -CONFIG_DVB_OR51211=m -CONFIG_DVB_OR51132=m -CONFIG_DVB_BCM3510=m -CONFIG_DVB_LGDT330X=m -CONFIG_DVB_LGDT3305=m -CONFIG_DVB_LGDT3306A=m -CONFIG_DVB_LG2160=m -CONFIG_DVB_S5H1409=m -CONFIG_DVB_AU8522=m -CONFIG_DVB_AU8522_DTV=m -CONFIG_DVB_AU8522_V4L=m -CONFIG_DVB_S5H1411=m - -# -# ISDB-T (terrestrial) frontends -# -CONFIG_DVB_S921=m -CONFIG_DVB_DIB8000=m -CONFIG_DVB_MB86A20S=m - -# -# ISDB-S (satellite) & ISDB-T (terrestrial) frontends -# -CONFIG_DVB_TC90522=m -CONFIG_DVB_MN88443X=m - -# -# Digital terrestrial only tuners/PLL -# -CONFIG_DVB_PLL=m -CONFIG_DVB_TUNER_DIB0070=m -CONFIG_DVB_TUNER_DIB0090=m - -# -# SEC control devices for DVB-S -# -CONFIG_DVB_DRX39XYJ=m -CONFIG_DVB_LNBH25=m -CONFIG_DVB_LNBH29=m -CONFIG_DVB_LNBP21=m -CONFIG_DVB_LNBP22=m -CONFIG_DVB_ISL6405=m -CONFIG_DVB_ISL6421=m -CONFIG_DVB_ISL6423=m -CONFIG_DVB_A8293=m -CONFIG_DVB_LGS8GL5=m -CONFIG_DVB_LGS8GXX=m -CONFIG_DVB_ATBM8830=m -CONFIG_DVB_TDA665x=m -CONFIG_DVB_IX2505V=m -CONFIG_DVB_M88RS2000=m -CONFIG_DVB_AF9033=m -CONFIG_DVB_HORUS3A=m -CONFIG_DVB_ASCOT2E=m -CONFIG_DVB_HELENE=m - -# -# Common Interface (EN50221) controller drivers -# -CONFIG_DVB_CXD2099=m -CONFIG_DVB_SP2=m -# end of Customise DVB Frontends - -# -# Tools to develop new frontends -# -CONFIG_DVB_DUMMY_FE=m -# end of Media ancillary drivers - -# -# Graphics support -# -CONFIG_AGP=m -CONFIG_AGP_AMD64=m -CONFIG_AGP_INTEL=m -CONFIG_AGP_SIS=m -CONFIG_AGP_VIA=m -CONFIG_INTEL_GTT=m -CONFIG_VGA_ARB=y -CONFIG_VGA_ARB_MAX_GPUS=10 -CONFIG_VGA_SWITCHEROO=y -CONFIG_DRM=m -CONFIG_DRM_MIPI_DBI=m 
-CONFIG_DRM_MIPI_DSI=y -CONFIG_DRM_DP_AUX_CHARDEV=y -# CONFIG_DRM_DEBUG_SELFTEST is not set -CONFIG_DRM_KMS_HELPER=m -CONFIG_DRM_KMS_FB_HELPER=y -# CONFIG_DRM_DEBUG_DP_MST_TOPOLOGY_REFS is not set -CONFIG_DRM_FBDEV_EMULATION=y -CONFIG_DRM_FBDEV_OVERALLOC=100 -# CONFIG_DRM_FBDEV_LEAK_PHYS_SMEM is not set -CONFIG_DRM_LOAD_EDID_FIRMWARE=y -CONFIG_DRM_DP_CEC=y -CONFIG_DRM_TTM=m -CONFIG_DRM_TTM_DMA_PAGE_POOL=y -CONFIG_DRM_VRAM_HELPER=m -CONFIG_DRM_TTM_HELPER=m -CONFIG_DRM_GEM_CMA_HELPER=y -CONFIG_DRM_KMS_CMA_HELPER=y -CONFIG_DRM_GEM_SHMEM_HELPER=y -CONFIG_DRM_SCHED=m - -# -# I2C encoder or helper chips -# -CONFIG_DRM_I2C_CH7006=m -CONFIG_DRM_I2C_SIL164=m -CONFIG_DRM_I2C_NXP_TDA998X=m -CONFIG_DRM_I2C_NXP_TDA9950=m -# end of I2C encoder or helper chips - -# -# ARM devices -# -CONFIG_DRM_KOMEDA=m -# end of ARM devices - -CONFIG_DRM_RADEON=m -CONFIG_DRM_RADEON_USERPTR=y -CONFIG_DRM_AMDGPU=m -CONFIG_DRM_AMDGPU_SI=y -CONFIG_DRM_AMDGPU_CIK=y -CONFIG_DRM_AMDGPU_USERPTR=y -# CONFIG_DRM_AMDGPU_GART_DEBUGFS is not set - -# -# ACP (Audio CoProcessor) Configuration -# -CONFIG_DRM_AMD_ACP=y -# end of ACP (Audio CoProcessor) Configuration - -# -# Display Engine Configuration -# -CONFIG_DRM_AMD_DC=y -CONFIG_DRM_AMD_DC_DCN=y -CONFIG_DRM_AMD_DC_HDCP=y -# CONFIG_DEBUG_KERNEL_DC is not set -# end of Display Engine Configuration - -CONFIG_HSA_AMD=y -CONFIG_DRM_NOUVEAU=m -# CONFIG_NOUVEAU_LEGACY_CTX_SUPPORT is not set -CONFIG_NOUVEAU_DEBUG=5 -CONFIG_NOUVEAU_DEBUG_DEFAULT=3 -# CONFIG_NOUVEAU_DEBUG_MMU is not set -CONFIG_DRM_NOUVEAU_BACKLIGHT=y -CONFIG_DRM_NOUVEAU_SVM=y -CONFIG_DRM_I915=m -CONFIG_DRM_I915_FORCE_PROBE="*" -CONFIG_DRM_I915_CAPTURE_ERROR=y -CONFIG_DRM_I915_COMPRESS_ERROR=y -CONFIG_DRM_I915_USERPTR=y -CONFIG_DRM_I915_GVT=y -CONFIG_DRM_I915_GVT_KVMGT=m - -# -# drm/i915 Debugging -# -# CONFIG_DRM_I915_WERROR is not set -# CONFIG_DRM_I915_DEBUG is not set -# CONFIG_DRM_I915_DEBUG_MMIO is not set -# CONFIG_DRM_I915_SW_FENCE_DEBUG_OBJECTS is not set -# 
CONFIG_DRM_I915_SW_FENCE_CHECK_DAG is not set -# CONFIG_DRM_I915_DEBUG_GUC is not set -# CONFIG_DRM_I915_SELFTEST is not set -# CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS is not set -# CONFIG_DRM_I915_DEBUG_VBLANK_EVADE is not set -# CONFIG_DRM_I915_DEBUG_RUNTIME_PM is not set -# end of drm/i915 Debugging - -# -# drm/i915 Profile Guided Optimisation -# -CONFIG_DRM_I915_FENCE_TIMEOUT=10000 -CONFIG_DRM_I915_USERFAULT_AUTOSUSPEND=250 -CONFIG_DRM_I915_HEARTBEAT_INTERVAL=2500 -CONFIG_DRM_I915_PREEMPT_TIMEOUT=640 -CONFIG_DRM_I915_MAX_REQUEST_BUSYWAIT=8000 -CONFIG_DRM_I915_STOP_TIMEOUT=100 -CONFIG_DRM_I915_TIMESLICE_DURATION=1 -# end of drm/i915 Profile Guided Optimisation - -CONFIG_DRM_VGEM=m -CONFIG_DRM_VKMS=m -CONFIG_DRM_VMWGFX=m -CONFIG_DRM_VMWGFX_FBCON=y -CONFIG_DRM_GMA500=m -CONFIG_DRM_GMA600=y -CONFIG_DRM_GMA3600=y -CONFIG_DRM_UDL=m -CONFIG_DRM_AST=m -CONFIG_DRM_MGAG200=m -CONFIG_DRM_RCAR_DW_HDMI=m -CONFIG_DRM_RCAR_LVDS=m -CONFIG_DRM_QXL=m -CONFIG_DRM_BOCHS=m -CONFIG_DRM_VIRTIO_GPU=m -CONFIG_DRM_PANEL=y - -# -# Display Panels -# -CONFIG_DRM_PANEL_ARM_VERSATILE=m -CONFIG_DRM_PANEL_ASUS_Z00T_TM5P5_NT35596=m -CONFIG_DRM_PANEL_BOE_HIMAX8279D=m -CONFIG_DRM_PANEL_BOE_TV101WUM_NL6=m -CONFIG_DRM_PANEL_LVDS=m -CONFIG_DRM_PANEL_SIMPLE=m -CONFIG_DRM_PANEL_ELIDA_KD35T133=m -CONFIG_DRM_PANEL_FEIXIN_K101_IM2BA02=m -CONFIG_DRM_PANEL_FEIYANG_FY07024DI26A30D=m -CONFIG_DRM_PANEL_ILITEK_IL9322=m -CONFIG_DRM_PANEL_ILITEK_ILI9881C=m -CONFIG_DRM_PANEL_INNOLUX_P079ZCA=m -CONFIG_DRM_PANEL_JDI_LT070ME05000=m -CONFIG_DRM_PANEL_KINGDISPLAY_KD097D04=m -CONFIG_DRM_PANEL_LEADTEK_LTK050H3146W=m -CONFIG_DRM_PANEL_LEADTEK_LTK500HD1829=m -CONFIG_DRM_PANEL_SAMSUNG_LD9040=m -CONFIG_DRM_PANEL_LG_LB035Q02=m -CONFIG_DRM_PANEL_LG_LG4573=m -CONFIG_DRM_PANEL_NEC_NL8048HL11=m -CONFIG_DRM_PANEL_NOVATEK_NT35510=m -CONFIG_DRM_PANEL_NOVATEK_NT39016=m -CONFIG_DRM_PANEL_OLIMEX_LCD_OLINUXINO=m -CONFIG_DRM_PANEL_ORISETECH_OTM8009A=m -CONFIG_DRM_PANEL_OSD_OSD101T2587_53TS=m -CONFIG_DRM_PANEL_PANASONIC_VVX10F034N00=m 
-CONFIG_DRM_PANEL_RASPBERRYPI_TOUCHSCREEN=m -CONFIG_DRM_PANEL_RAYDIUM_RM67191=m -CONFIG_DRM_PANEL_RAYDIUM_RM68200=m -CONFIG_DRM_PANEL_ROCKTECH_JH057N00900=m -CONFIG_DRM_PANEL_RONBO_RB070D30=m -CONFIG_DRM_PANEL_SAMSUNG_S6D16D0=m -CONFIG_DRM_PANEL_SAMSUNG_S6E3HA2=m -CONFIG_DRM_PANEL_SAMSUNG_S6E63J0X03=m -CONFIG_DRM_PANEL_SAMSUNG_S6E63M0=m -CONFIG_DRM_PANEL_SAMSUNG_S6E88A0_AMS452EF01=m -CONFIG_DRM_PANEL_SAMSUNG_S6E8AA0=m -CONFIG_DRM_PANEL_SEIKO_43WVF1G=m -CONFIG_DRM_PANEL_SHARP_LQ101R1SX01=m -CONFIG_DRM_PANEL_SHARP_LS037V7DW01=m -CONFIG_DRM_PANEL_SHARP_LS043T1LE01=m -CONFIG_DRM_PANEL_SITRONIX_ST7701=m -CONFIG_DRM_PANEL_SITRONIX_ST7789V=m -CONFIG_DRM_PANEL_SONY_ACX424AKP=m -CONFIG_DRM_PANEL_SONY_ACX565AKM=m -CONFIG_DRM_PANEL_TPO_TD028TTEC1=m -CONFIG_DRM_PANEL_TPO_TD043MTEA1=m -CONFIG_DRM_PANEL_TPO_TPG110=m -CONFIG_DRM_PANEL_TRULY_NT35597_WQXGA=m -CONFIG_DRM_PANEL_VISIONOX_RM69299=m -CONFIG_DRM_PANEL_XINPENG_XPP055C272=m -# end of Display Panels - -CONFIG_DRM_BRIDGE=y -CONFIG_DRM_PANEL_BRIDGE=y - -# -# Display Interface Bridges -# -CONFIG_DRM_CDNS_DSI=m -CONFIG_DRM_CHRONTEL_CH7033=m -CONFIG_DRM_DISPLAY_CONNECTOR=m -CONFIG_DRM_LVDS_CODEC=m -CONFIG_DRM_MEGACHIPS_STDPXXXX_GE_B850V3_FW=m -CONFIG_DRM_NWL_MIPI_DSI=m -CONFIG_DRM_NXP_PTN3460=m -CONFIG_DRM_PARADE_PS8622=m -CONFIG_DRM_PARADE_PS8640=m -CONFIG_DRM_SIL_SII8620=m -CONFIG_DRM_SII902X=m -CONFIG_DRM_SII9234=m -CONFIG_DRM_SIMPLE_BRIDGE=m -CONFIG_DRM_THINE_THC63LVD1024=m -CONFIG_DRM_TOSHIBA_TC358764=m -CONFIG_DRM_TOSHIBA_TC358767=m -CONFIG_DRM_TOSHIBA_TC358768=m -CONFIG_DRM_TI_TFP410=m -CONFIG_DRM_TI_SN65DSI86=m -CONFIG_DRM_TI_TPD12S015=m -CONFIG_DRM_ANALOGIX_ANX6345=m -CONFIG_DRM_ANALOGIX_ANX78XX=m -CONFIG_DRM_ANALOGIX_DP=m -CONFIG_DRM_I2C_ADV7511=m -CONFIG_DRM_I2C_ADV7511_AUDIO=y -CONFIG_DRM_I2C_ADV7511_CEC=y -CONFIG_DRM_DW_HDMI=m -CONFIG_DRM_DW_HDMI_AHB_AUDIO=m -CONFIG_DRM_DW_HDMI_I2S_AUDIO=m -CONFIG_DRM_DW_HDMI_CEC=m -# end of Display Interface Bridges - -# CONFIG_DRM_ETNAVIV is not set -CONFIG_DRM_ARCPGU=m 
-CONFIG_DRM_MXS=y -CONFIG_DRM_MXSFB=m -CONFIG_DRM_CIRRUS_QEMU=m -CONFIG_DRM_GM12U320=m -CONFIG_TINYDRM_HX8357D=m -CONFIG_TINYDRM_ILI9225=m -CONFIG_TINYDRM_ILI9341=m -CONFIG_TINYDRM_ILI9486=m -CONFIG_TINYDRM_MI0283QT=m -CONFIG_TINYDRM_REPAPER=m -CONFIG_TINYDRM_ST7586=m -CONFIG_TINYDRM_ST7735R=m -CONFIG_DRM_XEN=y -CONFIG_DRM_XEN_FRONTEND=m -CONFIG_DRM_VBOXVIDEO=m -# CONFIG_DRM_LEGACY is not set -CONFIG_DRM_PANEL_ORIENTATION_QUIRKS=y - -# -# Frame buffer Devices -# -CONFIG_FB_CMDLINE=y -CONFIG_FB_NOTIFY=y -CONFIG_FB=y -CONFIG_FIRMWARE_EDID=y -CONFIG_FB_BOOT_VESA_SUPPORT=y -CONFIG_FB_CFB_FILLRECT=y -CONFIG_FB_CFB_COPYAREA=y -CONFIG_FB_CFB_IMAGEBLIT=y -CONFIG_FB_SYS_FILLRECT=m -CONFIG_FB_SYS_COPYAREA=m -CONFIG_FB_SYS_IMAGEBLIT=m -# CONFIG_FB_FOREIGN_ENDIAN is not set -CONFIG_FB_SYS_FOPS=m -CONFIG_FB_DEFERRED_IO=y -CONFIG_FB_BACKLIGHT=m -CONFIG_FB_MODE_HELPERS=y -CONFIG_FB_TILEBLITTING=y - -# -# Frame buffer hardware drivers -# -# CONFIG_FB_CIRRUS is not set -# CONFIG_FB_PM2 is not set -# CONFIG_FB_CYBER2000 is not set -# CONFIG_FB_ARC is not set -# CONFIG_FB_ASILIANT is not set -# CONFIG_FB_IMSTT is not set -# CONFIG_FB_VGA16 is not set -# CONFIG_FB_UVESA is not set -CONFIG_FB_VESA=y -CONFIG_FB_EFI=y -# CONFIG_FB_N411 is not set -# CONFIG_FB_HGA is not set -# CONFIG_FB_OPENCORES is not set -# CONFIG_FB_S1D13XXX is not set -# CONFIG_FB_NVIDIA is not set -# CONFIG_FB_RIVA is not set -# CONFIG_FB_I740 is not set -# CONFIG_FB_LE80578 is not set -# CONFIG_FB_INTEL is not set -# CONFIG_FB_MATROX is not set -# CONFIG_FB_RADEON is not set -# CONFIG_FB_ATY128 is not set -# CONFIG_FB_ATY is not set -# CONFIG_FB_S3 is not set -# CONFIG_FB_SAVAGE is not set -# CONFIG_FB_SIS is not set -# CONFIG_FB_VIA is not set -# CONFIG_FB_NEOMAGIC is not set -# CONFIG_FB_KYRO is not set -# CONFIG_FB_3DFX is not set -# CONFIG_FB_VOODOO1 is not set -# CONFIG_FB_VT8623 is not set -# CONFIG_FB_TRIDENT is not set -# CONFIG_FB_ARK is not set -# CONFIG_FB_PM3 is not set -# CONFIG_FB_CARMINE is not set 
-# CONFIG_FB_SM501 is not set -# CONFIG_FB_SMSCUFX is not set -# CONFIG_FB_UDL is not set -# CONFIG_FB_IBM_GXT4500 is not set -# CONFIG_FB_VIRTUAL is not set -CONFIG_XEN_FBDEV_FRONTEND=m -# CONFIG_FB_METRONOME is not set -# CONFIG_FB_MB862XX is not set -CONFIG_FB_HYPERV=m -CONFIG_FB_SIMPLE=y -# CONFIG_FB_SSD1307 is not set -# CONFIG_FB_SM712 is not set -# end of Frame buffer Devices - -# -# Backlight & LCD device support -# -CONFIG_LCD_CLASS_DEVICE=m -CONFIG_LCD_L4F00242T03=m -CONFIG_LCD_LMS283GF05=m -CONFIG_LCD_LTV350QV=m -CONFIG_LCD_ILI922X=m -CONFIG_LCD_ILI9320=m -CONFIG_LCD_TDO24M=m -CONFIG_LCD_VGG2432A4=m -CONFIG_LCD_PLATFORM=m -CONFIG_LCD_AMS369FG06=m -CONFIG_LCD_LMS501KF03=m -CONFIG_LCD_HX8357=m -CONFIG_LCD_OTM3225A=m -CONFIG_BACKLIGHT_CLASS_DEVICE=y -CONFIG_BACKLIGHT_GENERIC=m -CONFIG_BACKLIGHT_LM3533=m -CONFIG_BACKLIGHT_PWM=m -CONFIG_BACKLIGHT_DA903X=m -CONFIG_BACKLIGHT_DA9052=m -CONFIG_BACKLIGHT_MAX8925=m -CONFIG_BACKLIGHT_APPLE=m -CONFIG_BACKLIGHT_QCOM_WLED=m -CONFIG_BACKLIGHT_SAHARA=m -CONFIG_BACKLIGHT_WM831X=m -CONFIG_BACKLIGHT_ADP5520=m -CONFIG_BACKLIGHT_ADP8860=m -CONFIG_BACKLIGHT_ADP8870=m -CONFIG_BACKLIGHT_88PM860X=m -CONFIG_BACKLIGHT_PCF50633=m -CONFIG_BACKLIGHT_AAT2870=m -CONFIG_BACKLIGHT_LM3630A=m -CONFIG_BACKLIGHT_LM3639=m -CONFIG_BACKLIGHT_LP855X=m -CONFIG_BACKLIGHT_LP8788=m -CONFIG_BACKLIGHT_PANDORA=m -CONFIG_BACKLIGHT_SKY81452=m -CONFIG_BACKLIGHT_TPS65217=m -CONFIG_BACKLIGHT_AS3711=m -CONFIG_BACKLIGHT_GPIO=m -CONFIG_BACKLIGHT_LV5207LP=m -CONFIG_BACKLIGHT_BD6107=m -CONFIG_BACKLIGHT_ARCXCNN=m -CONFIG_BACKLIGHT_RAVE_SP=m -CONFIG_BACKLIGHT_LED=m -# end of Backlight & LCD device support - -CONFIG_VIDEOMODE_HELPERS=y -CONFIG_HDMI=y - -# -# Console display driver support -# -CONFIG_VGA_CONSOLE=y -CONFIG_VGACON_SOFT_SCROLLBACK=y -CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64 -# CONFIG_VGACON_SOFT_SCROLLBACK_PERSISTENT_ENABLE_BY_DEFAULT is not set -CONFIG_DUMMY_CONSOLE=y -CONFIG_DUMMY_CONSOLE_COLUMNS=80 -CONFIG_DUMMY_CONSOLE_ROWS=25 
-CONFIG_FRAMEBUFFER_CONSOLE=y -CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y -CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y -CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER=y -# end of Console display driver support - -# CONFIG_LOGO is not set -# end of Graphics support - -CONFIG_SOUND=m -CONFIG_SOUND_OSS_CORE=y -# CONFIG_SOUND_OSS_CORE_PRECLAIM is not set -CONFIG_SND=m -CONFIG_SND_TIMER=m -CONFIG_SND_PCM=m -CONFIG_SND_PCM_ELD=y -CONFIG_SND_PCM_IEC958=y -CONFIG_SND_DMAENGINE_PCM=m -CONFIG_SND_HWDEP=m -CONFIG_SND_SEQ_DEVICE=m -CONFIG_SND_RAWMIDI=m -CONFIG_SND_COMPRESS_OFFLOAD=m -CONFIG_SND_JACK=y -CONFIG_SND_JACK_INPUT_DEV=y -CONFIG_SND_OSSEMUL=y -CONFIG_SND_MIXER_OSS=m -CONFIG_SND_PCM_OSS=m -CONFIG_SND_PCM_OSS_PLUGINS=y -CONFIG_SND_PCM_TIMER=y -CONFIG_SND_HRTIMER=m -CONFIG_SND_DYNAMIC_MINORS=y -CONFIG_SND_MAX_CARDS=32 -# CONFIG_SND_SUPPORT_OLD_API is not set -CONFIG_SND_PROC_FS=y -CONFIG_SND_VERBOSE_PROCFS=y -CONFIG_SND_VERBOSE_PRINTK=y -CONFIG_SND_DEBUG=y -# CONFIG_SND_DEBUG_VERBOSE is not set -# CONFIG_SND_PCM_XRUN_DEBUG is not set -# CONFIG_SND_CTL_VALIDATION is not set -CONFIG_SND_VMASTER=y -CONFIG_SND_DMA_SGBUF=y -CONFIG_SND_SEQUENCER=m -CONFIG_SND_SEQ_DUMMY=m -CONFIG_SND_SEQUENCER_OSS=m -CONFIG_SND_SEQ_HRTIMER_DEFAULT=y -CONFIG_SND_SEQ_MIDI_EVENT=m -CONFIG_SND_SEQ_MIDI=m -CONFIG_SND_SEQ_MIDI_EMUL=m -CONFIG_SND_SEQ_VIRMIDI=m -CONFIG_SND_MPU401_UART=m -CONFIG_SND_OPL3_LIB=m -CONFIG_SND_OPL3_LIB_SEQ=m -CONFIG_SND_VX_LIB=m -CONFIG_SND_AC97_CODEC=m -CONFIG_SND_DRIVERS=y -# CONFIG_SND_PCSP is not set -CONFIG_SND_DUMMY=m -CONFIG_SND_ALOOP=m -CONFIG_SND_VIRMIDI=m -CONFIG_SND_MTPAV=m -CONFIG_SND_MTS64=m -CONFIG_SND_SERIAL_U16550=m -CONFIG_SND_MPU401=m -CONFIG_SND_PORTMAN2X4=m -CONFIG_SND_AC97_POWER_SAVE=y -CONFIG_SND_AC97_POWER_SAVE_DEFAULT=0 -CONFIG_SND_SB_COMMON=m -CONFIG_SND_PCI=y -CONFIG_SND_AD1889=m -CONFIG_SND_ALS300=m -CONFIG_SND_ALS4000=m -CONFIG_SND_ALI5451=m -CONFIG_SND_ASIHPI=m -CONFIG_SND_ATIIXP=m -CONFIG_SND_ATIIXP_MODEM=m -CONFIG_SND_AU8810=m -CONFIG_SND_AU8820=m 
-CONFIG_SND_AU8830=m -CONFIG_SND_AW2=m -CONFIG_SND_AZT3328=m -CONFIG_SND_BT87X=m -# CONFIG_SND_BT87X_OVERCLOCK is not set -CONFIG_SND_CA0106=m -CONFIG_SND_CMIPCI=m -CONFIG_SND_OXYGEN_LIB=m -CONFIG_SND_OXYGEN=m -CONFIG_SND_CS4281=m -CONFIG_SND_CS46XX=m -CONFIG_SND_CS46XX_NEW_DSP=y -CONFIG_SND_CTXFI=m -CONFIG_SND_DARLA20=m -CONFIG_SND_GINA20=m -CONFIG_SND_LAYLA20=m -CONFIG_SND_DARLA24=m -CONFIG_SND_GINA24=m -CONFIG_SND_LAYLA24=m -CONFIG_SND_MONA=m -CONFIG_SND_MIA=m -CONFIG_SND_ECHO3G=m -CONFIG_SND_INDIGO=m -CONFIG_SND_INDIGOIO=m -CONFIG_SND_INDIGODJ=m -CONFIG_SND_INDIGOIOX=m -CONFIG_SND_INDIGODJX=m -CONFIG_SND_EMU10K1=m -CONFIG_SND_EMU10K1_SEQ=m -CONFIG_SND_EMU10K1X=m -CONFIG_SND_ENS1370=m -CONFIG_SND_ENS1371=m -CONFIG_SND_ES1938=m -CONFIG_SND_ES1968=m -CONFIG_SND_ES1968_INPUT=y -CONFIG_SND_ES1968_RADIO=y -CONFIG_SND_FM801=m -CONFIG_SND_FM801_TEA575X_BOOL=y -CONFIG_SND_HDSP=m -CONFIG_SND_HDSPM=m -CONFIG_SND_ICE1712=m -CONFIG_SND_ICE1724=m -CONFIG_SND_INTEL8X0=m -CONFIG_SND_INTEL8X0M=m -CONFIG_SND_KORG1212=m -CONFIG_SND_LOLA=m -CONFIG_SND_LX6464ES=m -CONFIG_SND_MAESTRO3=m -CONFIG_SND_MAESTRO3_INPUT=y -CONFIG_SND_MIXART=m -CONFIG_SND_NM256=m -CONFIG_SND_PCXHR=m -CONFIG_SND_RIPTIDE=m -CONFIG_SND_RME32=m -CONFIG_SND_RME96=m -CONFIG_SND_RME9652=m -CONFIG_SND_SONICVIBES=m -CONFIG_SND_TRIDENT=m -CONFIG_SND_VIA82XX=m -CONFIG_SND_VIA82XX_MODEM=m -CONFIG_SND_VIRTUOSO=m -CONFIG_SND_VX222=m -CONFIG_SND_YMFPCI=m - -# -# HD-Audio -# -CONFIG_SND_HDA=m -CONFIG_SND_HDA_INTEL=m -CONFIG_SND_HDA_HWDEP=y -CONFIG_SND_HDA_RECONFIG=y -CONFIG_SND_HDA_INPUT_BEEP=y -CONFIG_SND_HDA_INPUT_BEEP_MODE=1 -CONFIG_SND_HDA_PATCH_LOADER=y -CONFIG_SND_HDA_CODEC_REALTEK=m -CONFIG_SND_HDA_CODEC_ANALOG=m -CONFIG_SND_HDA_CODEC_SIGMATEL=m -CONFIG_SND_HDA_CODEC_VIA=m -CONFIG_SND_HDA_CODEC_HDMI=m -CONFIG_SND_HDA_CODEC_CIRRUS=m -CONFIG_SND_HDA_CODEC_CONEXANT=m -CONFIG_SND_HDA_CODEC_CA0110=m -CONFIG_SND_HDA_CODEC_CA0132=m -CONFIG_SND_HDA_CODEC_CA0132_DSP=y -CONFIG_SND_HDA_CODEC_CMEDIA=m 
-CONFIG_SND_HDA_CODEC_SI3054=m -CONFIG_SND_HDA_GENERIC=m -CONFIG_SND_HDA_POWER_SAVE_DEFAULT=0 -# end of HD-Audio - -CONFIG_SND_HDA_CORE=m -CONFIG_SND_HDA_DSP_LOADER=y -CONFIG_SND_HDA_COMPONENT=y -CONFIG_SND_HDA_I915=y -CONFIG_SND_HDA_EXT_CORE=m -CONFIG_SND_HDA_PREALLOC_SIZE=0 -CONFIG_SND_INTEL_NHLT=y -CONFIG_SND_INTEL_DSP_CONFIG=m -CONFIG_SND_SPI=y -CONFIG_SND_USB=y -CONFIG_SND_USB_AUDIO=m -CONFIG_SND_USB_AUDIO_USE_MEDIA_CONTROLLER=y -CONFIG_SND_USB_UA101=m -CONFIG_SND_USB_USX2Y=m -CONFIG_SND_USB_CAIAQ=m -CONFIG_SND_USB_CAIAQ_INPUT=y -CONFIG_SND_USB_US122L=m -CONFIG_SND_USB_6FIRE=m -CONFIG_SND_USB_HIFACE=m -CONFIG_SND_BCD2000=m -CONFIG_SND_USB_LINE6=m -CONFIG_SND_USB_POD=m -CONFIG_SND_USB_PODHD=m -CONFIG_SND_USB_TONEPORT=m -CONFIG_SND_USB_VARIAX=m -CONFIG_SND_FIREWIRE=y -CONFIG_SND_FIREWIRE_LIB=m -CONFIG_SND_DICE=m -CONFIG_SND_OXFW=m -CONFIG_SND_ISIGHT=m -CONFIG_SND_FIREWORKS=m -CONFIG_SND_BEBOB=m -CONFIG_SND_FIREWIRE_DIGI00X=m -CONFIG_SND_FIREWIRE_TASCAM=m -CONFIG_SND_FIREWIRE_MOTU=m -CONFIG_SND_FIREFACE=m -CONFIG_SND_PCMCIA=y -CONFIG_SND_VXPOCKET=m -CONFIG_SND_PDAUDIOCF=m -CONFIG_SND_SOC=m -CONFIG_SND_SOC_AC97_BUS=y -CONFIG_SND_SOC_GENERIC_DMAENGINE_PCM=y -CONFIG_SND_SOC_COMPRESS=y -CONFIG_SND_SOC_TOPOLOGY=y -CONFIG_SND_SOC_ACPI=m -CONFIG_SND_SOC_AMD_ACP=m -CONFIG_SND_SOC_AMD_CZ_DA7219MX98357_MACH=m -CONFIG_SND_SOC_AMD_CZ_RT5645_MACH=m -CONFIG_SND_SOC_AMD_ACP3x=m -CONFIG_SND_SOC_AMD_RV_RT5682_MACH=m -CONFIG_SND_SOC_AMD_RENOIR=m -CONFIG_SND_SOC_AMD_RENOIR_MACH=m -CONFIG_SND_ATMEL_SOC=m -CONFIG_SND_SOC_MIKROE_PROTO=m -CONFIG_SND_BCM63XX_I2S_WHISTLER=m -CONFIG_SND_DESIGNWARE_I2S=m -CONFIG_SND_DESIGNWARE_PCM=y - -# -# SoC Audio for Freescale CPUs -# - -# -# Common SoC Audio options for Freescale CPUs: -# -# CONFIG_SND_SOC_FSL_ASRC is not set -# CONFIG_SND_SOC_FSL_SAI is not set -# CONFIG_SND_SOC_FSL_AUDMIX is not set -# CONFIG_SND_SOC_FSL_SSI is not set -# CONFIG_SND_SOC_FSL_SPDIF is not set -# CONFIG_SND_SOC_FSL_ESAI is not set -# CONFIG_SND_SOC_FSL_MICFIL is not 
set -# CONFIG_SND_SOC_IMX_AUDMUX is not set -# end of SoC Audio for Freescale CPUs - -CONFIG_SND_I2S_HI6210_I2S=m -CONFIG_SND_SOC_IMG=y -CONFIG_SND_SOC_IMG_I2S_IN=m -CONFIG_SND_SOC_IMG_I2S_OUT=m -CONFIG_SND_SOC_IMG_PARALLEL_OUT=m -CONFIG_SND_SOC_IMG_SPDIF_IN=m -CONFIG_SND_SOC_IMG_SPDIF_OUT=m -CONFIG_SND_SOC_IMG_PISTACHIO_INTERNAL_DAC=m -CONFIG_SND_SOC_INTEL_SST_TOPLEVEL=y -CONFIG_SND_SST_IPC=m -CONFIG_SND_SST_IPC_PCI=m -CONFIG_SND_SST_IPC_ACPI=m -CONFIG_SND_SOC_INTEL_SST_ACPI=m -CONFIG_SND_SOC_INTEL_SST=m -CONFIG_SND_SOC_INTEL_SST_FIRMWARE=m -CONFIG_SND_SOC_INTEL_HASWELL=m -CONFIG_SND_SST_ATOM_HIFI2_PLATFORM=m -CONFIG_SND_SST_ATOM_HIFI2_PLATFORM_PCI=m -CONFIG_SND_SST_ATOM_HIFI2_PLATFORM_ACPI=m -CONFIG_SND_SOC_INTEL_SKYLAKE=m -CONFIG_SND_SOC_INTEL_SKL=m -CONFIG_SND_SOC_INTEL_APL=m -CONFIG_SND_SOC_INTEL_KBL=m -CONFIG_SND_SOC_INTEL_GLK=m -CONFIG_SND_SOC_INTEL_CNL=m -CONFIG_SND_SOC_INTEL_CFL=m -CONFIG_SND_SOC_INTEL_CML_H=m -CONFIG_SND_SOC_INTEL_CML_LP=m -CONFIG_SND_SOC_INTEL_SKYLAKE_FAMILY=m -CONFIG_SND_SOC_INTEL_SKYLAKE_SSP_CLK=m -# CONFIG_SND_SOC_INTEL_SKYLAKE_HDAUDIO_CODEC is not set -CONFIG_SND_SOC_INTEL_SKYLAKE_COMMON=m -CONFIG_SND_SOC_ACPI_INTEL_MATCH=m -CONFIG_SND_SOC_INTEL_MACH=y -# CONFIG_SND_SOC_INTEL_USER_FRIENDLY_LONG_NAMES is not set -CONFIG_SND_SOC_INTEL_HASWELL_MACH=m -CONFIG_SND_SOC_INTEL_BDW_RT5650_MACH=m -CONFIG_SND_SOC_INTEL_BDW_RT5677_MACH=m -CONFIG_SND_SOC_INTEL_BROADWELL_MACH=m -CONFIG_SND_SOC_INTEL_BYTCR_RT5640_MACH=m -CONFIG_SND_SOC_INTEL_BYTCR_RT5651_MACH=m -CONFIG_SND_SOC_INTEL_CHT_BSW_RT5672_MACH=m -CONFIG_SND_SOC_INTEL_CHT_BSW_RT5645_MACH=m -CONFIG_SND_SOC_INTEL_CHT_BSW_MAX98090_TI_MACH=m -CONFIG_SND_SOC_INTEL_CHT_BSW_NAU8824_MACH=m -CONFIG_SND_SOC_INTEL_BYT_CHT_CX2072X_MACH=m -CONFIG_SND_SOC_INTEL_BYT_CHT_DA7213_MACH=m -CONFIG_SND_SOC_INTEL_BYT_CHT_ES8316_MACH=m -# CONFIG_SND_SOC_INTEL_BYT_CHT_NOCODEC_MACH is not set -CONFIG_SND_SOC_INTEL_SKL_RT286_MACH=m -CONFIG_SND_SOC_INTEL_SKL_NAU88L25_SSM4567_MACH=m 
-CONFIG_SND_SOC_INTEL_SKL_NAU88L25_MAX98357A_MACH=m -CONFIG_SND_SOC_INTEL_DA7219_MAX98357A_GENERIC=m -CONFIG_SND_SOC_INTEL_BXT_DA7219_MAX98357A_COMMON=m -CONFIG_SND_SOC_INTEL_BXT_DA7219_MAX98357A_MACH=m -CONFIG_SND_SOC_INTEL_BXT_RT298_MACH=m -CONFIG_SND_SOC_INTEL_SOF_WM8804_MACH=m -CONFIG_SND_SOC_INTEL_KBL_RT5663_MAX98927_MACH=m -CONFIG_SND_SOC_INTEL_KBL_RT5663_RT5514_MAX98927_MACH=m -CONFIG_SND_SOC_INTEL_KBL_DA7219_MAX98357A_MACH=m -CONFIG_SND_SOC_INTEL_KBL_DA7219_MAX98927_MACH=m -CONFIG_SND_SOC_INTEL_KBL_RT5660_MACH=m -CONFIG_SND_SOC_INTEL_GLK_DA7219_MAX98357A_MACH=m -CONFIG_SND_SOC_INTEL_GLK_RT5682_MAX98357A_MACH=m -CONFIG_SND_SOC_INTEL_SKL_HDA_DSP_GENERIC_MACH=m -CONFIG_SND_SOC_INTEL_SOF_RT5682_MACH=m -CONFIG_SND_SOC_INTEL_SOF_PCM512x_MACH=m -CONFIG_SND_SOC_INTEL_CML_LP_DA7219_MAX98357A_MACH=m -CONFIG_SND_SOC_INTEL_SOF_CML_RT1011_RT5682_MACH=m -CONFIG_SND_SOC_INTEL_SOF_DA7219_MAX98373_MACH=m -CONFIG_SND_SOC_INTEL_EHL_RT5660_MACH=m -CONFIG_SND_SOC_MTK_BTCVSD=m -CONFIG_SND_SOC_SOF_TOPLEVEL=y -CONFIG_SND_SOC_SOF_PCI=m -CONFIG_SND_SOC_SOF_ACPI=m -CONFIG_SND_SOC_SOF_OF=m -# CONFIG_SND_SOC_SOF_DEBUG_PROBES is not set -# CONFIG_SND_SOC_SOF_DEVELOPER_SUPPORT is not set -CONFIG_SND_SOC_SOF=m -CONFIG_SND_SOC_SOF_PROBE_WORK_QUEUE=y -CONFIG_SND_SOC_SOF_INTEL_TOPLEVEL=y -CONFIG_SND_SOC_SOF_INTEL_ACPI=m -CONFIG_SND_SOC_SOF_INTEL_PCI=m -CONFIG_SND_SOC_SOF_INTEL_HIFI_EP_IPC=m -CONFIG_SND_SOC_SOF_INTEL_ATOM_HIFI_EP=m -CONFIG_SND_SOC_SOF_INTEL_COMMON=m -CONFIG_SND_SOC_SOF_MERRIFIELD_SUPPORT=y -CONFIG_SND_SOC_SOF_MERRIFIELD=m -CONFIG_SND_SOC_SOF_APOLLOLAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_APOLLOLAKE=m -CONFIG_SND_SOC_SOF_GEMINILAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_GEMINILAKE=m -CONFIG_SND_SOC_SOF_CANNONLAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_CANNONLAKE=m -CONFIG_SND_SOC_SOF_COFFEELAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_COFFEELAKE=m -CONFIG_SND_SOC_SOF_ICELAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_ICELAKE=m -CONFIG_SND_SOC_SOF_COMETLAKE=m -CONFIG_SND_SOC_SOF_COMETLAKE_SUPPORT=y 
-CONFIG_SND_SOC_SOF_COMETLAKE_LP_SUPPORT=y -CONFIG_SND_SOC_SOF_TIGERLAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_TIGERLAKE=m -CONFIG_SND_SOC_SOF_ELKHARTLAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_ELKHARTLAKE=m -CONFIG_SND_SOC_SOF_JASPERLAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_JASPERLAKE=m -CONFIG_SND_SOC_SOF_HDA_COMMON=m -CONFIG_SND_SOC_SOF_HDA_LINK=y -CONFIG_SND_SOC_SOF_HDA_AUDIO_CODEC=y -# CONFIG_SND_SOC_SOF_HDA_ALWAYS_ENABLE_DMI_L1 is not set -CONFIG_SND_SOC_SOF_HDA_LINK_BASELINE=m -CONFIG_SND_SOC_SOF_HDA=m -CONFIG_SND_SOC_SOF_XTENSA=m - -# -# STMicroelectronics STM32 SOC audio support -# -# end of STMicroelectronics STM32 SOC audio support - -CONFIG_SND_SOC_XILINX_I2S=m -CONFIG_SND_SOC_XILINX_AUDIO_FORMATTER=m -CONFIG_SND_SOC_XILINX_SPDIF=m -CONFIG_SND_SOC_XTFPGA_I2S=m -CONFIG_ZX_TDM=m -CONFIG_SND_SOC_I2C_AND_SPI=m - -# -# CODEC drivers -# -CONFIG_SND_SOC_AC97_CODEC=m -CONFIG_SND_SOC_ADAU_UTILS=m -CONFIG_SND_SOC_ADAU1701=m -CONFIG_SND_SOC_ADAU17X1=m -CONFIG_SND_SOC_ADAU1761=m -CONFIG_SND_SOC_ADAU1761_I2C=m -CONFIG_SND_SOC_ADAU1761_SPI=m -CONFIG_SND_SOC_ADAU7002=m -CONFIG_SND_SOC_ADAU7118=m -CONFIG_SND_SOC_ADAU7118_HW=m -CONFIG_SND_SOC_ADAU7118_I2C=m -CONFIG_SND_SOC_AK4104=m -CONFIG_SND_SOC_AK4118=m -CONFIG_SND_SOC_AK4458=m -CONFIG_SND_SOC_AK4554=m -CONFIG_SND_SOC_AK4613=m -CONFIG_SND_SOC_AK4642=m -CONFIG_SND_SOC_AK5386=m -CONFIG_SND_SOC_AK5558=m -CONFIG_SND_SOC_ALC5623=m -CONFIG_SND_SOC_BD28623=m -# CONFIG_SND_SOC_BT_SCO is not set -CONFIG_SND_SOC_CPCAP=m -CONFIG_SND_SOC_CROS_EC_CODEC=m -CONFIG_SND_SOC_CS35L32=m -CONFIG_SND_SOC_CS35L33=m -CONFIG_SND_SOC_CS35L34=m -CONFIG_SND_SOC_CS35L35=m -CONFIG_SND_SOC_CS35L36=m -CONFIG_SND_SOC_CS42L42=m -CONFIG_SND_SOC_CS42L51=m -CONFIG_SND_SOC_CS42L51_I2C=m -CONFIG_SND_SOC_CS42L52=m -CONFIG_SND_SOC_CS42L56=m -CONFIG_SND_SOC_CS42L73=m -CONFIG_SND_SOC_CS4265=m -CONFIG_SND_SOC_CS4270=m -CONFIG_SND_SOC_CS4271=m -CONFIG_SND_SOC_CS4271_I2C=m -CONFIG_SND_SOC_CS4271_SPI=m -CONFIG_SND_SOC_CS42XX8=m -CONFIG_SND_SOC_CS42XX8_I2C=m -CONFIG_SND_SOC_CS43130=m 
-CONFIG_SND_SOC_CS4341=m -CONFIG_SND_SOC_CS4349=m -CONFIG_SND_SOC_CS53L30=m -CONFIG_SND_SOC_CX2072X=m -CONFIG_SND_SOC_DA7213=m -CONFIG_SND_SOC_DA7219=m -CONFIG_SND_SOC_DMIC=m -CONFIG_SND_SOC_HDMI_CODEC=m -CONFIG_SND_SOC_ES7134=m -CONFIG_SND_SOC_ES7241=m -CONFIG_SND_SOC_ES8316=m -CONFIG_SND_SOC_ES8328=m -CONFIG_SND_SOC_ES8328_I2C=m -CONFIG_SND_SOC_ES8328_SPI=m -CONFIG_SND_SOC_GTM601=m -CONFIG_SND_SOC_HDAC_HDMI=m -CONFIG_SND_SOC_HDAC_HDA=m -CONFIG_SND_SOC_INNO_RK3036=m -CONFIG_SND_SOC_LOCHNAGAR_SC=m -CONFIG_SND_SOC_MAX98088=m -CONFIG_SND_SOC_MAX98090=m -CONFIG_SND_SOC_MAX98357A=m -CONFIG_SND_SOC_MAX98504=m -CONFIG_SND_SOC_MAX9867=m -CONFIG_SND_SOC_MAX98927=m -CONFIG_SND_SOC_MAX98373=m -CONFIG_SND_SOC_MAX98390=m -CONFIG_SND_SOC_MAX9860=m -CONFIG_SND_SOC_MSM8916_WCD_ANALOG=m -CONFIG_SND_SOC_MSM8916_WCD_DIGITAL=m -CONFIG_SND_SOC_PCM1681=m -CONFIG_SND_SOC_PCM1789=m -CONFIG_SND_SOC_PCM1789_I2C=m -CONFIG_SND_SOC_PCM179X=m -CONFIG_SND_SOC_PCM179X_I2C=m -CONFIG_SND_SOC_PCM179X_SPI=m -CONFIG_SND_SOC_PCM186X=m -CONFIG_SND_SOC_PCM186X_I2C=m -CONFIG_SND_SOC_PCM186X_SPI=m -CONFIG_SND_SOC_PCM3060=m -CONFIG_SND_SOC_PCM3060_I2C=m -CONFIG_SND_SOC_PCM3060_SPI=m -CONFIG_SND_SOC_PCM3168A=m -CONFIG_SND_SOC_PCM3168A_I2C=m -CONFIG_SND_SOC_PCM3168A_SPI=m -CONFIG_SND_SOC_PCM512x=m -CONFIG_SND_SOC_PCM512x_I2C=m -CONFIG_SND_SOC_PCM512x_SPI=m -CONFIG_SND_SOC_RK3328=m -CONFIG_SND_SOC_RL6231=m -CONFIG_SND_SOC_RL6347A=m -CONFIG_SND_SOC_RT286=m -CONFIG_SND_SOC_RT298=m -CONFIG_SND_SOC_RT1011=m -CONFIG_SND_SOC_RT1015=m -CONFIG_SND_SOC_RT1308_SDW=m -CONFIG_SND_SOC_RT5514=m -CONFIG_SND_SOC_RT5514_SPI=m -CONFIG_SND_SOC_RT5616=m -CONFIG_SND_SOC_RT5631=m -CONFIG_SND_SOC_RT5640=m -CONFIG_SND_SOC_RT5645=m -CONFIG_SND_SOC_RT5651=m -CONFIG_SND_SOC_RT5660=m -CONFIG_SND_SOC_RT5663=m -CONFIG_SND_SOC_RT5670=m -CONFIG_SND_SOC_RT5677=m -CONFIG_SND_SOC_RT5677_SPI=m -CONFIG_SND_SOC_RT5682=m -CONFIG_SND_SOC_RT5682_I2C=m -CONFIG_SND_SOC_RT5682_SDW=m -CONFIG_SND_SOC_RT700=m -CONFIG_SND_SOC_RT700_SDW=m 
-CONFIG_SND_SOC_RT711=m -CONFIG_SND_SOC_RT711_SDW=m -CONFIG_SND_SOC_RT715=m -CONFIG_SND_SOC_RT715_SDW=m -CONFIG_SND_SOC_SGTL5000=m -CONFIG_SND_SOC_SI476X=m -CONFIG_SND_SOC_SIGMADSP=m -CONFIG_SND_SOC_SIGMADSP_I2C=m -CONFIG_SND_SOC_SIGMADSP_REGMAP=m -CONFIG_SND_SOC_SIMPLE_AMPLIFIER=m -CONFIG_SND_SOC_SIRF_AUDIO_CODEC=m -CONFIG_SND_SOC_SPDIF=m -CONFIG_SND_SOC_SSM2305=m -CONFIG_SND_SOC_SSM2602=m -CONFIG_SND_SOC_SSM2602_SPI=m -CONFIG_SND_SOC_SSM2602_I2C=m -CONFIG_SND_SOC_SSM4567=m -CONFIG_SND_SOC_STA32X=m -CONFIG_SND_SOC_STA350=m -CONFIG_SND_SOC_STI_SAS=m -CONFIG_SND_SOC_TAS2552=m -CONFIG_SND_SOC_TAS2562=m -CONFIG_SND_SOC_TAS2770=m -CONFIG_SND_SOC_TAS5086=m -CONFIG_SND_SOC_TAS571X=m -CONFIG_SND_SOC_TAS5720=m -CONFIG_SND_SOC_TAS6424=m -CONFIG_SND_SOC_TDA7419=m -CONFIG_SND_SOC_TFA9879=m -CONFIG_SND_SOC_TLV320AIC23=m -CONFIG_SND_SOC_TLV320AIC23_I2C=m -CONFIG_SND_SOC_TLV320AIC23_SPI=m -CONFIG_SND_SOC_TLV320AIC31XX=m -CONFIG_SND_SOC_TLV320AIC32X4=m -CONFIG_SND_SOC_TLV320AIC32X4_I2C=m -CONFIG_SND_SOC_TLV320AIC32X4_SPI=m -CONFIG_SND_SOC_TLV320AIC3X=m -CONFIG_SND_SOC_TLV320ADCX140=m -CONFIG_SND_SOC_TS3A227E=m -CONFIG_SND_SOC_TSCS42XX=m -CONFIG_SND_SOC_TSCS454=m -CONFIG_SND_SOC_UDA1334=m -CONFIG_SND_SOC_WCD9335=m -CONFIG_SND_SOC_WCD934X=m -CONFIG_SND_SOC_WM8510=m -CONFIG_SND_SOC_WM8523=m -CONFIG_SND_SOC_WM8524=m -CONFIG_SND_SOC_WM8580=m -CONFIG_SND_SOC_WM8711=m -CONFIG_SND_SOC_WM8728=m -CONFIG_SND_SOC_WM8731=m -CONFIG_SND_SOC_WM8737=m -CONFIG_SND_SOC_WM8741=m -CONFIG_SND_SOC_WM8750=m -CONFIG_SND_SOC_WM8753=m -CONFIG_SND_SOC_WM8770=m -CONFIG_SND_SOC_WM8776=m -CONFIG_SND_SOC_WM8782=m -CONFIG_SND_SOC_WM8804=m -CONFIG_SND_SOC_WM8804_I2C=m -CONFIG_SND_SOC_WM8804_SPI=m -CONFIG_SND_SOC_WM8903=m -CONFIG_SND_SOC_WM8904=m -CONFIG_SND_SOC_WM8960=m -CONFIG_SND_SOC_WM8962=m -CONFIG_SND_SOC_WM8974=m -CONFIG_SND_SOC_WM8978=m -CONFIG_SND_SOC_WM8985=m -CONFIG_SND_SOC_WSA881X=m -CONFIG_SND_SOC_ZL38060=m -CONFIG_SND_SOC_ZX_AUD96P22=m -CONFIG_SND_SOC_MAX9759=m -CONFIG_SND_SOC_MT6351=m 
-CONFIG_SND_SOC_MT6358=m -CONFIG_SND_SOC_MT6660=m -CONFIG_SND_SOC_NAU8540=m -CONFIG_SND_SOC_NAU8810=m -CONFIG_SND_SOC_NAU8822=m -CONFIG_SND_SOC_NAU8824=m -CONFIG_SND_SOC_NAU8825=m -CONFIG_SND_SOC_TPA6130A2=m -# end of CODEC drivers - -CONFIG_SND_SIMPLE_CARD_UTILS=m -CONFIG_SND_SIMPLE_CARD=m -CONFIG_SND_AUDIO_GRAPH_CARD=m -CONFIG_SND_X86=y -CONFIG_HDMI_LPE_AUDIO=m -CONFIG_SND_SYNTH_EMUX=m -CONFIG_SND_XEN_FRONTEND=m -CONFIG_AC97_BUS=m - -# -# HID support -# -CONFIG_HID=m -CONFIG_HID_BATTERY_STRENGTH=y -CONFIG_HIDRAW=y -CONFIG_UHID=m -CONFIG_HID_GENERIC=m - -# -# Special HID drivers -# -CONFIG_HID_A4TECH=m -CONFIG_HID_ACCUTOUCH=m -CONFIG_HID_ACRUX=m -CONFIG_HID_ACRUX_FF=y -CONFIG_HID_APPLE=m -CONFIG_HID_APPLEIR=m -CONFIG_HID_ASUS=m -CONFIG_HID_AUREAL=m -CONFIG_HID_BELKIN=m -CONFIG_HID_BETOP_FF=m -CONFIG_HID_BIGBEN_FF=m -CONFIG_HID_CHERRY=m -CONFIG_HID_CHICONY=m -CONFIG_HID_CORSAIR=m -CONFIG_HID_COUGAR=m -CONFIG_HID_MACALLY=m -CONFIG_HID_PRODIKEYS=m -CONFIG_HID_CMEDIA=m -CONFIG_HID_CP2112=m -CONFIG_HID_CREATIVE_SB0540=m -CONFIG_HID_CYPRESS=m -CONFIG_HID_DRAGONRISE=m -CONFIG_DRAGONRISE_FF=y -CONFIG_HID_EMS_FF=m -CONFIG_HID_ELAN=m -CONFIG_HID_ELECOM=m -CONFIG_HID_ELO=m -CONFIG_HID_EZKEY=m -CONFIG_HID_GEMBIRD=m -CONFIG_HID_GFRM=m -CONFIG_HID_GLORIOUS=m -CONFIG_HID_HOLTEK=m -CONFIG_HOLTEK_FF=y -CONFIG_HID_GOOGLE_HAMMER=m -CONFIG_HID_GT683R=m -CONFIG_HID_KEYTOUCH=m -CONFIG_HID_KYE=m -CONFIG_HID_UCLOGIC=m -CONFIG_HID_WALTOP=m -CONFIG_HID_VIEWSONIC=m -CONFIG_HID_GYRATION=m -CONFIG_HID_ICADE=m -CONFIG_HID_ITE=m -CONFIG_HID_JABRA=m -CONFIG_HID_TWINHAN=m -CONFIG_HID_KENSINGTON=m -CONFIG_HID_LCPOWER=m -CONFIG_HID_LED=m -CONFIG_HID_LENOVO=m -CONFIG_HID_LOGITECH=m -CONFIG_HID_LOGITECH_DJ=m -CONFIG_HID_LOGITECH_HIDPP=m -CONFIG_LOGITECH_FF=y -CONFIG_LOGIRUMBLEPAD2_FF=y -CONFIG_LOGIG940_FF=y -CONFIG_LOGIWHEELS_FF=y -CONFIG_HID_MAGICMOUSE=m -CONFIG_HID_MALTRON=m -CONFIG_HID_MAYFLASH=m -CONFIG_HID_REDRAGON=m -CONFIG_HID_MICROSOFT=m -CONFIG_HID_MONTEREY=m -CONFIG_HID_MULTITOUCH=m 
-CONFIG_HID_NTI=m -CONFIG_HID_NTRIG=m -CONFIG_HID_ORTEK=m -CONFIG_HID_PANTHERLORD=m -CONFIG_PANTHERLORD_FF=y -CONFIG_HID_PENMOUNT=m -CONFIG_HID_PETALYNX=m -CONFIG_HID_PICOLCD=m -CONFIG_HID_PICOLCD_FB=y -CONFIG_HID_PICOLCD_BACKLIGHT=y -CONFIG_HID_PICOLCD_LCD=y -CONFIG_HID_PICOLCD_LEDS=y -CONFIG_HID_PICOLCD_CIR=y -CONFIG_HID_PLANTRONICS=m -CONFIG_HID_PRIMAX=m -CONFIG_HID_RETRODE=m -CONFIG_HID_ROCCAT=m -CONFIG_HID_SAITEK=m -CONFIG_HID_SAMSUNG=m -CONFIG_HID_SONY=m -CONFIG_SONY_FF=y -CONFIG_HID_SPEEDLINK=m -CONFIG_HID_STEAM=m -CONFIG_HID_STEELSERIES=m -CONFIG_HID_SUNPLUS=m -CONFIG_HID_RMI=m -CONFIG_HID_GREENASIA=m -CONFIG_GREENASIA_FF=y -CONFIG_HID_HYPERV_MOUSE=m -CONFIG_HID_SMARTJOYPLUS=m -CONFIG_SMARTJOYPLUS_FF=y -CONFIG_HID_TIVO=m -CONFIG_HID_TOPSEED=m -CONFIG_HID_THINGM=m -CONFIG_HID_THRUSTMASTER=m -CONFIG_THRUSTMASTER_FF=y -CONFIG_HID_UDRAW_PS3=m -CONFIG_HID_U2FZERO=m -CONFIG_HID_WACOM=m -CONFIG_HID_WIIMOTE=m -CONFIG_HID_XINMO=m -CONFIG_HID_ZEROPLUS=m -CONFIG_ZEROPLUS_FF=y -CONFIG_HID_ZYDACRON=m -CONFIG_HID_SENSOR_HUB=m -# CONFIG_HID_SENSOR_CUSTOM_SENSOR is not set -CONFIG_HID_ALPS=m -CONFIG_HID_MCP2221=m -# end of Special HID drivers - -# -# USB HID support -# -CONFIG_USB_HID=m -CONFIG_HID_PID=y -CONFIG_USB_HIDDEV=y - -# -# USB HID Boot Protocol drivers -# -# CONFIG_USB_KBD is not set -# CONFIG_USB_MOUSE is not set -# end of USB HID Boot Protocol drivers -# end of USB HID support - -# -# I2C HID support -# -CONFIG_I2C_HID=m -# end of I2C HID support - -# -# Intel ISH HID support -# -CONFIG_INTEL_ISH_HID=m -CONFIG_INTEL_ISH_FIRMWARE_DOWNLOADER=m -# end of Intel ISH HID support -# end of HID support - -CONFIG_USB_OHCI_LITTLE_ENDIAN=y -CONFIG_USB_SUPPORT=y -CONFIG_USB_COMMON=y -CONFIG_USB_LED_TRIG=y -CONFIG_USB_ULPI_BUS=m -CONFIG_USB_CONN_GPIO=m -CONFIG_USB_ARCH_HAS_HCD=y -CONFIG_USB=y -CONFIG_USB_PCI=y -CONFIG_USB_ANNOUNCE_NEW_DEVICES=y - -# -# Miscellaneous USB options -# -CONFIG_USB_DEFAULT_PERSIST=y -CONFIG_USB_DYNAMIC_MINORS=y -# CONFIG_USB_OTG is not set -# 
CONFIG_USB_OTG_WHITELIST is not set -# CONFIG_USB_OTG_BLACKLIST_HUB is not set -CONFIG_USB_LEDS_TRIGGER_USBPORT=m -CONFIG_USB_AUTOSUSPEND_DELAY=2 -CONFIG_USB_MON=m - -# -# USB Host Controller Drivers -# -CONFIG_USB_C67X00_HCD=m -CONFIG_USB_XHCI_HCD=m -# CONFIG_USB_XHCI_DBGCAP is not set -CONFIG_USB_XHCI_PCI=m -CONFIG_USB_XHCI_PCI_RENESAS=m -CONFIG_USB_XHCI_PLATFORM=m -CONFIG_USB_EHCI_HCD=m -CONFIG_USB_EHCI_ROOT_HUB_TT=y -CONFIG_USB_EHCI_TT_NEWSCHED=y -CONFIG_USB_EHCI_PCI=m -CONFIG_USB_EHCI_FSL=m -CONFIG_USB_EHCI_HCD_PLATFORM=m -CONFIG_USB_OXU210HP_HCD=m -CONFIG_USB_ISP116X_HCD=m -CONFIG_USB_FOTG210_HCD=m -CONFIG_USB_MAX3421_HCD=m -CONFIG_USB_OHCI_HCD=m -CONFIG_USB_OHCI_HCD_PCI=m -# CONFIG_USB_OHCI_HCD_SSB is not set -CONFIG_USB_OHCI_HCD_PLATFORM=m -CONFIG_USB_UHCI_HCD=m -CONFIG_USB_U132_HCD=m -CONFIG_USB_SL811_HCD=m -# CONFIG_USB_SL811_HCD_ISO is not set -CONFIG_USB_SL811_CS=m -CONFIG_USB_R8A66597_HCD=m -CONFIG_USB_HCD_BCMA=m -CONFIG_USB_HCD_SSB=m -# CONFIG_USB_HCD_TEST_MODE is not set - -# -# USB Device Class drivers -# -CONFIG_USB_ACM=m -CONFIG_USB_PRINTER=m -CONFIG_USB_WDM=m -CONFIG_USB_TMC=m - -# -# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may -# - -# -# also be needed; see USB_STORAGE Help for more info -# -CONFIG_USB_STORAGE=m -# CONFIG_USB_STORAGE_DEBUG is not set -CONFIG_USB_STORAGE_REALTEK=m -CONFIG_REALTEK_AUTOPM=y -CONFIG_USB_STORAGE_DATAFAB=m -CONFIG_USB_STORAGE_FREECOM=m -CONFIG_USB_STORAGE_ISD200=m -CONFIG_USB_STORAGE_USBAT=m -CONFIG_USB_STORAGE_SDDR09=m -CONFIG_USB_STORAGE_SDDR55=m -CONFIG_USB_STORAGE_JUMPSHOT=m -CONFIG_USB_STORAGE_ALAUDA=m -CONFIG_USB_STORAGE_ONETOUCH=m -CONFIG_USB_STORAGE_KARMA=m -CONFIG_USB_STORAGE_CYPRESS_ATACB=m -CONFIG_USB_STORAGE_ENE_UB6250=m -CONFIG_USB_UAS=m - -# -# USB Imaging devices -# -CONFIG_USB_MDC800=m -CONFIG_USB_MICROTEK=m -CONFIG_USBIP_CORE=m -CONFIG_USBIP_VHCI_HCD=m -CONFIG_USBIP_VHCI_HC_PORTS=8 -CONFIG_USBIP_VHCI_NR_HCS=1 -CONFIG_USBIP_HOST=m -CONFIG_USBIP_VUDC=m -# CONFIG_USBIP_DEBUG is not set 
-CONFIG_USB_CDNS3=m -CONFIG_USB_CDNS3_GADGET=y -CONFIG_USB_CDNS3_HOST=y -CONFIG_USB_CDNS3_PCI_WRAP=m -CONFIG_USB_MUSB_HDRC=m -# CONFIG_USB_MUSB_HOST is not set -# CONFIG_USB_MUSB_GADGET is not set -CONFIG_USB_MUSB_DUAL_ROLE=y - -# -# Platform Glue Layer -# - -# -# MUSB DMA mode -# -# CONFIG_MUSB_PIO_ONLY is not set -CONFIG_USB_DWC3=m -CONFIG_USB_DWC3_ULPI=y -# CONFIG_USB_DWC3_HOST is not set -# CONFIG_USB_DWC3_GADGET is not set -CONFIG_USB_DWC3_DUAL_ROLE=y - -# -# Platform Glue Driver Support -# -CONFIG_USB_DWC3_PCI=m -CONFIG_USB_DWC3_HAPS=m -CONFIG_USB_DWC3_OF_SIMPLE=m -CONFIG_USB_DWC2=m -# CONFIG_USB_DWC2_HOST is not set - -# -# Gadget/Dual-role mode requires USB Gadget support to be enabled -# -# CONFIG_USB_DWC2_PERIPHERAL is not set -CONFIG_USB_DWC2_DUAL_ROLE=y -CONFIG_USB_DWC2_PCI=m -# CONFIG_USB_DWC2_DEBUG is not set -# CONFIG_USB_DWC2_TRACK_MISSED_SOFS is not set -CONFIG_USB_CHIPIDEA=m -CONFIG_USB_CHIPIDEA_UDC=y -CONFIG_USB_CHIPIDEA_HOST=y -CONFIG_USB_CHIPIDEA_PCI=m -CONFIG_USB_CHIPIDEA_MSM=m -CONFIG_USB_CHIPIDEA_IMX=m -CONFIG_USB_CHIPIDEA_GENERIC=m -CONFIG_USB_CHIPIDEA_TEGRA=m -CONFIG_USB_ISP1760=m -CONFIG_USB_ISP1760_HCD=y -CONFIG_USB_ISP1761_UDC=y -# CONFIG_USB_ISP1760_HOST_ROLE is not set -# CONFIG_USB_ISP1760_GADGET_ROLE is not set -CONFIG_USB_ISP1760_DUAL_ROLE=y - -# -# USB port drivers -# -CONFIG_USB_USS720=m -CONFIG_USB_SERIAL=y -CONFIG_USB_SERIAL_CONSOLE=y -CONFIG_USB_SERIAL_GENERIC=y -CONFIG_USB_SERIAL_SIMPLE=m -CONFIG_USB_SERIAL_AIRCABLE=m -CONFIG_USB_SERIAL_ARK3116=m -CONFIG_USB_SERIAL_BELKIN=m -CONFIG_USB_SERIAL_CH341=m -CONFIG_USB_SERIAL_WHITEHEAT=m -CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m -CONFIG_USB_SERIAL_CP210X=m -CONFIG_USB_SERIAL_CYPRESS_M8=m -CONFIG_USB_SERIAL_EMPEG=m -CONFIG_USB_SERIAL_FTDI_SIO=m -CONFIG_USB_SERIAL_VISOR=m -CONFIG_USB_SERIAL_IPAQ=m -CONFIG_USB_SERIAL_IR=m -CONFIG_USB_SERIAL_EDGEPORT=m -CONFIG_USB_SERIAL_EDGEPORT_TI=m -CONFIG_USB_SERIAL_F81232=m -CONFIG_USB_SERIAL_F8153X=m -CONFIG_USB_SERIAL_GARMIN=m -CONFIG_USB_SERIAL_IPW=m 
-CONFIG_USB_SERIAL_IUU=m -CONFIG_USB_SERIAL_KEYSPAN_PDA=m -CONFIG_USB_SERIAL_KEYSPAN=m -CONFIG_USB_SERIAL_KLSI=m -CONFIG_USB_SERIAL_KOBIL_SCT=m -CONFIG_USB_SERIAL_MCT_U232=m -CONFIG_USB_SERIAL_METRO=m -CONFIG_USB_SERIAL_MOS7720=m -CONFIG_USB_SERIAL_MOS7715_PARPORT=y -CONFIG_USB_SERIAL_MOS7840=m -CONFIG_USB_SERIAL_MXUPORT=m -CONFIG_USB_SERIAL_NAVMAN=m -CONFIG_USB_SERIAL_PL2303=m -CONFIG_USB_SERIAL_OTI6858=m -CONFIG_USB_SERIAL_QCAUX=m -CONFIG_USB_SERIAL_QUALCOMM=m -CONFIG_USB_SERIAL_SPCP8X5=m -CONFIG_USB_SERIAL_SAFE=m -# CONFIG_USB_SERIAL_SAFE_PADDED is not set -CONFIG_USB_SERIAL_SIERRAWIRELESS=m -CONFIG_USB_SERIAL_SYMBOL=m -CONFIG_USB_SERIAL_TI=m -CONFIG_USB_SERIAL_CYBERJACK=m -CONFIG_USB_SERIAL_XIRCOM=m -CONFIG_USB_SERIAL_WWAN=m -CONFIG_USB_SERIAL_OPTION=m -CONFIG_USB_SERIAL_OMNINET=m -CONFIG_USB_SERIAL_OPTICON=m -CONFIG_USB_SERIAL_XSENS_MT=m -CONFIG_USB_SERIAL_WISHBONE=m -CONFIG_USB_SERIAL_SSU100=m -CONFIG_USB_SERIAL_QT2=m -CONFIG_USB_SERIAL_UPD78F0730=m -CONFIG_USB_SERIAL_DEBUG=m - -# -# USB Miscellaneous drivers -# -CONFIG_USB_EMI62=m -CONFIG_USB_EMI26=m -CONFIG_USB_ADUTUX=m -CONFIG_USB_SEVSEG=m -CONFIG_USB_LEGOTOWER=m -CONFIG_USB_LCD=m -CONFIG_USB_CYPRESS_CY7C63=m -CONFIG_USB_CYTHERM=m -CONFIG_USB_IDMOUSE=m -CONFIG_USB_FTDI_ELAN=m -CONFIG_USB_APPLEDISPLAY=m -CONFIG_APPLE_MFI_FASTCHARGE=m -CONFIG_USB_SISUSBVGA=m -CONFIG_USB_SISUSBVGA_CON=y -CONFIG_USB_LD=m -CONFIG_USB_TRANCEVIBRATOR=m -CONFIG_USB_IOWARRIOR=m -CONFIG_USB_TEST=m -CONFIG_USB_EHSET_TEST_FIXTURE=m -CONFIG_USB_ISIGHTFW=m -CONFIG_USB_YUREX=m -CONFIG_USB_EZUSB_FX2=m -CONFIG_USB_HUB_USB251XB=m -CONFIG_USB_HSIC_USB3503=m -CONFIG_USB_HSIC_USB4604=m -CONFIG_USB_LINK_LAYER_TEST=m -CONFIG_USB_CHAOSKEY=m -CONFIG_USB_ATM=m -CONFIG_USB_SPEEDTOUCH=m -CONFIG_USB_CXACRU=m -CONFIG_USB_UEAGLEATM=m -CONFIG_USB_XUSBATM=m - -# -# USB Physical Layer drivers -# -CONFIG_USB_PHY=y -CONFIG_NOP_USB_XCEIV=m -CONFIG_USB_GPIO_VBUS=m -CONFIG_TAHVO_USB=m -# CONFIG_TAHVO_USB_HOST_BY_DEFAULT is not set -CONFIG_USB_ISP1301=m -# end 
of USB Physical Layer drivers - -CONFIG_USB_GADGET=m -# CONFIG_USB_GADGET_DEBUG is not set -# CONFIG_USB_GADGET_DEBUG_FILES is not set -# CONFIG_USB_GADGET_DEBUG_FS is not set -CONFIG_USB_GADGET_VBUS_DRAW=2 -CONFIG_USB_GADGET_STORAGE_NUM_BUFFERS=2 -CONFIG_U_SERIAL_CONSOLE=y - -# -# USB Peripheral Controller -# -CONFIG_USB_FOTG210_UDC=m -CONFIG_USB_GR_UDC=m -CONFIG_USB_R8A66597=m -CONFIG_USB_PXA27X=m -CONFIG_USB_MV_UDC=m -CONFIG_USB_MV_U3D=m -CONFIG_USB_SNP_CORE=m -CONFIG_USB_SNP_UDC_PLAT=m -CONFIG_USB_M66592=m -CONFIG_USB_BDC_UDC=m - -# -# Platform Support -# -CONFIG_USB_BDC_PCI=m -CONFIG_USB_AMD5536UDC=m -CONFIG_USB_NET2272=m -CONFIG_USB_NET2272_DMA=y -CONFIG_USB_NET2280=m -CONFIG_USB_GOKU=m -CONFIG_USB_EG20T=m -CONFIG_USB_GADGET_XILINX=m -CONFIG_USB_MAX3420_UDC=m -CONFIG_USB_DUMMY_HCD=m -# end of USB Peripheral Controller - -CONFIG_USB_LIBCOMPOSITE=m -CONFIG_USB_F_ACM=m -CONFIG_USB_F_SS_LB=m -CONFIG_USB_U_SERIAL=m -CONFIG_USB_U_ETHER=m -CONFIG_USB_U_AUDIO=m -CONFIG_USB_F_SERIAL=m -CONFIG_USB_F_OBEX=m -CONFIG_USB_F_NCM=m -CONFIG_USB_F_ECM=m -CONFIG_USB_F_PHONET=m -CONFIG_USB_F_EEM=m -CONFIG_USB_F_SUBSET=m -CONFIG_USB_F_RNDIS=m -CONFIG_USB_F_MASS_STORAGE=m -CONFIG_USB_F_FS=m -CONFIG_USB_F_UAC1=m -CONFIG_USB_F_UAC1_LEGACY=m -CONFIG_USB_F_UAC2=m -CONFIG_USB_F_UVC=m -CONFIG_USB_F_MIDI=m -CONFIG_USB_F_HID=m -CONFIG_USB_F_PRINTER=m -CONFIG_USB_F_TCM=m -CONFIG_USB_CONFIGFS=m -CONFIG_USB_CONFIGFS_SERIAL=y -CONFIG_USB_CONFIGFS_ACM=y -CONFIG_USB_CONFIGFS_OBEX=y -CONFIG_USB_CONFIGFS_NCM=y -CONFIG_USB_CONFIGFS_ECM=y -CONFIG_USB_CONFIGFS_ECM_SUBSET=y -CONFIG_USB_CONFIGFS_RNDIS=y -CONFIG_USB_CONFIGFS_EEM=y -CONFIG_USB_CONFIGFS_PHONET=y -CONFIG_USB_CONFIGFS_MASS_STORAGE=y -CONFIG_USB_CONFIGFS_F_LB_SS=y -CONFIG_USB_CONFIGFS_F_FS=y -CONFIG_USB_CONFIGFS_F_UAC1=y -CONFIG_USB_CONFIGFS_F_UAC1_LEGACY=y -CONFIG_USB_CONFIGFS_F_UAC2=y -CONFIG_USB_CONFIGFS_F_MIDI=y -CONFIG_USB_CONFIGFS_F_HID=y -CONFIG_USB_CONFIGFS_F_UVC=y -CONFIG_USB_CONFIGFS_F_PRINTER=y -CONFIG_USB_CONFIGFS_F_TCM=y - -# 
-# USB Gadget precomposed configurations -# -CONFIG_USB_ZERO=m -CONFIG_USB_AUDIO=m -# CONFIG_GADGET_UAC1 is not set -CONFIG_USB_ETH=m -CONFIG_USB_ETH_RNDIS=y -CONFIG_USB_ETH_EEM=y -CONFIG_USB_G_NCM=m -CONFIG_USB_GADGETFS=m -CONFIG_USB_FUNCTIONFS=m -CONFIG_USB_FUNCTIONFS_ETH=y -CONFIG_USB_FUNCTIONFS_RNDIS=y -CONFIG_USB_FUNCTIONFS_GENERIC=y -CONFIG_USB_MASS_STORAGE=m -CONFIG_USB_GADGET_TARGET=m -CONFIG_USB_G_SERIAL=m -CONFIG_USB_MIDI_GADGET=m -CONFIG_USB_G_PRINTER=m -CONFIG_USB_CDC_COMPOSITE=m -CONFIG_USB_G_NOKIA=m -CONFIG_USB_G_ACM_MS=m -CONFIG_USB_G_MULTI=m -CONFIG_USB_G_MULTI_RNDIS=y -CONFIG_USB_G_MULTI_CDC=y -CONFIG_USB_G_HID=m -CONFIG_USB_G_DBGP=m -# CONFIG_USB_G_DBGP_PRINTK is not set -CONFIG_USB_G_DBGP_SERIAL=y -CONFIG_USB_G_WEBCAM=m -CONFIG_USB_RAW_GADGET=m -# end of USB Gadget precomposed configurations - -CONFIG_TYPEC=m -CONFIG_TYPEC_TCPM=m -CONFIG_TYPEC_TCPCI=m -CONFIG_TYPEC_RT1711H=m -CONFIG_TYPEC_FUSB302=m -CONFIG_TYPEC_WCOVE=m -CONFIG_TYPEC_UCSI=m -CONFIG_UCSI_CCG=m -CONFIG_UCSI_ACPI=m -CONFIG_TYPEC_HD3SS3220=m -CONFIG_TYPEC_TPS6598X=m - -# -# USB Type-C Multiplexer/DeMultiplexer Switch support -# -CONFIG_TYPEC_MUX_PI3USB30532=m -CONFIG_TYPEC_MUX_INTEL_PMC=m -# end of USB Type-C Multiplexer/DeMultiplexer Switch support - -# -# USB Type-C Alternate Mode drivers -# -CONFIG_TYPEC_DP_ALTMODE=m -CONFIG_TYPEC_NVIDIA_ALTMODE=m -# end of USB Type-C Alternate Mode drivers - -CONFIG_USB_ROLE_SWITCH=m -CONFIG_USB_ROLES_INTEL_XHCI=m -CONFIG_MMC=m -CONFIG_PWRSEQ_EMMC=m -CONFIG_PWRSEQ_SD8787=m -CONFIG_PWRSEQ_SIMPLE=m -CONFIG_MMC_BLOCK=m -CONFIG_MMC_BLOCK_MINORS=8 -CONFIG_SDIO_UART=m -CONFIG_MMC_TEST=m - -# -# MMC/SD/SDIO Host Controller Drivers -# -# CONFIG_MMC_DEBUG is not set -CONFIG_MMC_SDHCI=m -CONFIG_MMC_SDHCI_IO_ACCESSORS=y -CONFIG_MMC_SDHCI_PCI=m -CONFIG_MMC_RICOH_MMC=y -CONFIG_MMC_SDHCI_ACPI=m -CONFIG_MMC_SDHCI_PLTFM=m -CONFIG_MMC_SDHCI_OF_ARASAN=m -CONFIG_MMC_SDHCI_OF_ASPEED=m -CONFIG_MMC_SDHCI_OF_AT91=m -CONFIG_MMC_SDHCI_OF_DWCMSHC=m 
-CONFIG_MMC_SDHCI_CADENCE=m -CONFIG_MMC_SDHCI_F_SDH30=m -CONFIG_MMC_SDHCI_MILBEAUT=m -CONFIG_MMC_WBSD=m -CONFIG_MMC_ALCOR=m -CONFIG_MMC_TIFM_SD=m -CONFIG_MMC_SPI=m -CONFIG_MMC_SDRICOH_CS=m -CONFIG_MMC_CB710=m -CONFIG_MMC_VIA_SDMMC=m -CONFIG_MMC_VUB300=m -CONFIG_MMC_USHC=m -CONFIG_MMC_USDHI6ROL0=m -CONFIG_MMC_REALTEK_PCI=m -CONFIG_MMC_REALTEK_USB=m -CONFIG_MMC_CQHCI=m -CONFIG_MMC_HSQ=m -CONFIG_MMC_TOSHIBA_PCI=m -CONFIG_MMC_MTK=m -CONFIG_MMC_SDHCI_XENON=m -CONFIG_MMC_SDHCI_OMAP=m -CONFIG_MMC_SDHCI_AM654=m -CONFIG_MMC_SDHCI_EXTERNAL_DMA=y -CONFIG_MEMSTICK=m -# CONFIG_MEMSTICK_DEBUG is not set - -# -# MemoryStick drivers -# -# CONFIG_MEMSTICK_UNSAFE_RESUME is not set -CONFIG_MSPRO_BLOCK=m -CONFIG_MS_BLOCK=m - -# -# MemoryStick Host Controller Drivers -# -CONFIG_MEMSTICK_TIFM_MS=m -CONFIG_MEMSTICK_JMICRON_38X=m -CONFIG_MEMSTICK_R592=m -CONFIG_MEMSTICK_REALTEK_PCI=m -CONFIG_MEMSTICK_REALTEK_USB=m -CONFIG_NEW_LEDS=y -CONFIG_LEDS_CLASS=y -CONFIG_LEDS_CLASS_FLASH=m -CONFIG_LEDS_BRIGHTNESS_HW_CHANGED=y - -# -# LED drivers -# -CONFIG_LEDS_88PM860X=m -CONFIG_LEDS_AAT1290=m -CONFIG_LEDS_AN30259A=m -CONFIG_LEDS_APU=m -CONFIG_LEDS_AS3645A=m -CONFIG_LEDS_AW2013=m -CONFIG_LEDS_BCM6328=m -CONFIG_LEDS_BCM6358=m -CONFIG_LEDS_CPCAP=m -CONFIG_LEDS_CR0014114=m -CONFIG_LEDS_EL15203000=m -CONFIG_LEDS_LM3530=m -CONFIG_LEDS_LM3532=m -CONFIG_LEDS_LM3533=m -CONFIG_LEDS_LM3642=m -CONFIG_LEDS_LM3692X=m -CONFIG_LEDS_LM3601X=m -CONFIG_LEDS_MT6323=m -CONFIG_LEDS_PCA9532=m -CONFIG_LEDS_PCA9532_GPIO=y -CONFIG_LEDS_GPIO=m -CONFIG_LEDS_LP3944=m -CONFIG_LEDS_LP3952=m -# CONFIG_LEDS_LP5521 is not set -# CONFIG_LEDS_LP5523 is not set -# CONFIG_LEDS_LP5562 is not set -# CONFIG_LEDS_LP8501 is not set -CONFIG_LEDS_LP8788=m -CONFIG_LEDS_LP8860=m -CONFIG_LEDS_CLEVO_MAIL=m -CONFIG_LEDS_PCA955X=m -CONFIG_LEDS_PCA955X_GPIO=y -CONFIG_LEDS_PCA963X=m -CONFIG_LEDS_WM831X_STATUS=m -CONFIG_LEDS_WM8350=m -CONFIG_LEDS_DA903X=m -CONFIG_LEDS_DA9052=m -CONFIG_LEDS_DAC124S085=m -CONFIG_LEDS_PWM=m -CONFIG_LEDS_REGULATOR=m 
-CONFIG_LEDS_BD2802=m -CONFIG_LEDS_INTEL_SS4200=m -CONFIG_LEDS_LT3593=m -CONFIG_LEDS_ADP5520=m -CONFIG_LEDS_MC13783=m -CONFIG_LEDS_TCA6507=m -CONFIG_LEDS_TLC591XX=m -CONFIG_LEDS_MAX77650=m -CONFIG_LEDS_MAX77693=m -CONFIG_LEDS_MAX8997=m -CONFIG_LEDS_LM355x=m -CONFIG_LEDS_MENF21BMC=m -CONFIG_LEDS_KTD2692=m -CONFIG_LEDS_IS31FL319X=m -CONFIG_LEDS_IS31FL32XX=m - -# -# LED driver for blink(1) USB RGB LED is under Special HID drivers (HID_THINGM) -# -CONFIG_LEDS_BLINKM=m -CONFIG_LEDS_SYSCON=y -CONFIG_LEDS_MLXCPLD=m -CONFIG_LEDS_MLXREG=m -CONFIG_LEDS_USER=m -CONFIG_LEDS_NIC78BX=m -CONFIG_LEDS_SPI_BYTE=m -CONFIG_LEDS_TI_LMU_COMMON=m -CONFIG_LEDS_LM3697=m -CONFIG_LEDS_LM36274=m -CONFIG_LEDS_TPS6105X=m -CONFIG_LEDS_SGM3140=m - -# -# LED Triggers -# -CONFIG_LEDS_TRIGGERS=y -CONFIG_LEDS_TRIGGER_TIMER=m -CONFIG_LEDS_TRIGGER_ONESHOT=m -CONFIG_LEDS_TRIGGER_DISK=y -CONFIG_LEDS_TRIGGER_MTD=y -CONFIG_LEDS_TRIGGER_HEARTBEAT=m -CONFIG_LEDS_TRIGGER_BACKLIGHT=m -CONFIG_LEDS_TRIGGER_CPU=y -CONFIG_LEDS_TRIGGER_ACTIVITY=m -CONFIG_LEDS_TRIGGER_GPIO=m -CONFIG_LEDS_TRIGGER_DEFAULT_ON=m - -# -# iptables trigger is under Netfilter config (LED target) -# -CONFIG_LEDS_TRIGGER_TRANSIENT=m -CONFIG_LEDS_TRIGGER_CAMERA=m -CONFIG_LEDS_TRIGGER_PANIC=y -CONFIG_LEDS_TRIGGER_NETDEV=m -CONFIG_LEDS_TRIGGER_PATTERN=m -CONFIG_LEDS_TRIGGER_AUDIO=m -CONFIG_ACCESSIBILITY=y -CONFIG_A11Y_BRAILLE_CONSOLE=y -CONFIG_INFINIBAND=m -CONFIG_INFINIBAND_USER_MAD=m -CONFIG_INFINIBAND_USER_ACCESS=m -# CONFIG_INFINIBAND_EXP_LEGACY_VERBS_NEW_UAPI is not set -CONFIG_INFINIBAND_USER_MEM=y -CONFIG_INFINIBAND_ON_DEMAND_PAGING=y -CONFIG_INFINIBAND_ADDR_TRANS=y -CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS=y -CONFIG_INFINIBAND_MTHCA=m -CONFIG_INFINIBAND_MTHCA_DEBUG=y -CONFIG_INFINIBAND_QIB=m -CONFIG_INFINIBAND_QIB_DCA=y -CONFIG_INFINIBAND_CXGB4=m -CONFIG_INFINIBAND_EFA=m -CONFIG_INFINIBAND_I40IW=m -CONFIG_MLX4_INFINIBAND=m -CONFIG_MLX5_INFINIBAND=m -CONFIG_INFINIBAND_OCRDMA=m -CONFIG_INFINIBAND_VMWARE_PVRDMA=m -CONFIG_INFINIBAND_USNIC=m 
-CONFIG_INFINIBAND_BNXT_RE=m -CONFIG_INFINIBAND_HFI1=m -# CONFIG_HFI1_DEBUG_SDMA_ORDER is not set -# CONFIG_SDMA_VERBOSITY is not set -CONFIG_INFINIBAND_QEDR=m -CONFIG_INFINIBAND_RDMAVT=m -CONFIG_RDMA_RXE=m -CONFIG_RDMA_SIW=m -CONFIG_INFINIBAND_IPOIB=m -CONFIG_INFINIBAND_IPOIB_CM=y -CONFIG_INFINIBAND_IPOIB_DEBUG=y -# CONFIG_INFINIBAND_IPOIB_DEBUG_DATA is not set -CONFIG_INFINIBAND_SRP=m -CONFIG_INFINIBAND_SRPT=m -CONFIG_INFINIBAND_ISER=m -CONFIG_INFINIBAND_ISERT=m -CONFIG_INFINIBAND_RTRS=m -CONFIG_INFINIBAND_RTRS_CLIENT=m -CONFIG_INFINIBAND_RTRS_SERVER=m -CONFIG_INFINIBAND_OPA_VNIC=m -CONFIG_EDAC_ATOMIC_SCRUB=y -CONFIG_EDAC_SUPPORT=y -CONFIG_EDAC=y -CONFIG_EDAC_LEGACY_SYSFS=y -# CONFIG_EDAC_DEBUG is not set -CONFIG_EDAC_DECODE_MCE=m -CONFIG_EDAC_GHES=y -CONFIG_EDAC_AMD64=m -# CONFIG_EDAC_AMD64_ERROR_INJECTION is not set -CONFIG_EDAC_E752X=m -CONFIG_EDAC_I82975X=m -CONFIG_EDAC_I3000=m -CONFIG_EDAC_I3200=m -CONFIG_EDAC_IE31200=m -CONFIG_EDAC_X38=m -CONFIG_EDAC_I5400=m -CONFIG_EDAC_I7CORE=m -CONFIG_EDAC_I5000=m -CONFIG_EDAC_I5100=m -CONFIG_EDAC_I7300=m -CONFIG_EDAC_SBRIDGE=m -CONFIG_EDAC_SKX=m -CONFIG_EDAC_I10NM=m -CONFIG_EDAC_PND2=m -CONFIG_RTC_LIB=y -CONFIG_RTC_MC146818_LIB=y -CONFIG_RTC_CLASS=y -CONFIG_RTC_HCTOSYS=y -CONFIG_RTC_HCTOSYS_DEVICE="rtc0" -CONFIG_RTC_SYSTOHC=y -CONFIG_RTC_SYSTOHC_DEVICE="rtc0" -# CONFIG_RTC_DEBUG is not set -CONFIG_RTC_NVMEM=y - -# -# RTC interfaces -# -CONFIG_RTC_INTF_SYSFS=y -CONFIG_RTC_INTF_PROC=y -CONFIG_RTC_INTF_DEV=y -CONFIG_RTC_INTF_DEV_UIE_EMUL=y -# CONFIG_RTC_DRV_TEST is not set - -# -# I2C RTC drivers -# -CONFIG_RTC_DRV_88PM860X=m -CONFIG_RTC_DRV_88PM80X=m -CONFIG_RTC_DRV_ABB5ZES3=m -CONFIG_RTC_DRV_ABEOZ9=m -CONFIG_RTC_DRV_ABX80X=m -CONFIG_RTC_DRV_AS3722=m -CONFIG_RTC_DRV_DS1307=m -CONFIG_RTC_DRV_DS1307_CENTURY=y -CONFIG_RTC_DRV_DS1374=m -CONFIG_RTC_DRV_DS1374_WDT=y -CONFIG_RTC_DRV_DS1672=m -CONFIG_RTC_DRV_HYM8563=m -CONFIG_RTC_DRV_LP8788=m -CONFIG_RTC_DRV_MAX6900=m -CONFIG_RTC_DRV_MAX8907=m -CONFIG_RTC_DRV_MAX8925=m 
-CONFIG_RTC_DRV_MAX8998=m -CONFIG_RTC_DRV_MAX8997=m -CONFIG_RTC_DRV_MAX77686=m -CONFIG_RTC_DRV_RK808=m -CONFIG_RTC_DRV_RS5C372=m -CONFIG_RTC_DRV_ISL1208=m -CONFIG_RTC_DRV_ISL12022=m -CONFIG_RTC_DRV_ISL12026=m -CONFIG_RTC_DRV_X1205=m -CONFIG_RTC_DRV_PCF8523=m -CONFIG_RTC_DRV_PCF85063=m -CONFIG_RTC_DRV_PCF85363=m -CONFIG_RTC_DRV_PCF8563=m -CONFIG_RTC_DRV_PCF8583=m -CONFIG_RTC_DRV_M41T80=m -CONFIG_RTC_DRV_M41T80_WDT=y -CONFIG_RTC_DRV_BD70528=m -CONFIG_RTC_DRV_BQ32K=m -CONFIG_RTC_DRV_TWL4030=m -CONFIG_RTC_DRV_PALMAS=m -CONFIG_RTC_DRV_TPS6586X=m -CONFIG_RTC_DRV_TPS65910=m -CONFIG_RTC_DRV_TPS80031=m -CONFIG_RTC_DRV_RC5T583=m -CONFIG_RTC_DRV_RC5T619=m -CONFIG_RTC_DRV_S35390A=m -CONFIG_RTC_DRV_FM3130=m -CONFIG_RTC_DRV_RX8010=m -CONFIG_RTC_DRV_RX8581=m -CONFIG_RTC_DRV_RX8025=m -CONFIG_RTC_DRV_EM3027=m -CONFIG_RTC_DRV_RV3028=m -CONFIG_RTC_DRV_RV8803=m -CONFIG_RTC_DRV_S5M=m -CONFIG_RTC_DRV_SD3078=m - -# -# SPI RTC drivers -# -CONFIG_RTC_DRV_M41T93=m -CONFIG_RTC_DRV_M41T94=m -CONFIG_RTC_DRV_DS1302=m -CONFIG_RTC_DRV_DS1305=m -CONFIG_RTC_DRV_DS1343=m -CONFIG_RTC_DRV_DS1347=m -CONFIG_RTC_DRV_DS1390=m -CONFIG_RTC_DRV_MAX6916=m -CONFIG_RTC_DRV_R9701=m -CONFIG_RTC_DRV_RX4581=m -CONFIG_RTC_DRV_RX6110=m -CONFIG_RTC_DRV_RS5C348=m -CONFIG_RTC_DRV_MAX6902=m -CONFIG_RTC_DRV_PCF2123=m -CONFIG_RTC_DRV_MCP795=m -CONFIG_RTC_I2C_AND_SPI=y - -# -# SPI and I2C RTC drivers -# -CONFIG_RTC_DRV_DS3232=m -CONFIG_RTC_DRV_DS3232_HWMON=y -CONFIG_RTC_DRV_PCF2127=m -CONFIG_RTC_DRV_RV3029C2=m -CONFIG_RTC_DRV_RV3029_HWMON=y - -# -# Platform RTC drivers -# -CONFIG_RTC_DRV_CMOS=y -CONFIG_RTC_DRV_DS1286=m -CONFIG_RTC_DRV_DS1511=m -CONFIG_RTC_DRV_DS1553=m -CONFIG_RTC_DRV_DS1685_FAMILY=m -CONFIG_RTC_DRV_DS1685=y -# CONFIG_RTC_DRV_DS1689 is not set -# CONFIG_RTC_DRV_DS17285 is not set -# CONFIG_RTC_DRV_DS17485 is not set -# CONFIG_RTC_DRV_DS17885 is not set -CONFIG_RTC_DRV_DS1742=m -CONFIG_RTC_DRV_DS2404=m -CONFIG_RTC_DRV_DA9052=m -CONFIG_RTC_DRV_DA9055=m -CONFIG_RTC_DRV_DA9063=m -CONFIG_RTC_DRV_STK17TA8=m 
-CONFIG_RTC_DRV_M48T86=m -CONFIG_RTC_DRV_M48T35=m -CONFIG_RTC_DRV_M48T59=m -CONFIG_RTC_DRV_MSM6242=m -CONFIG_RTC_DRV_BQ4802=m -CONFIG_RTC_DRV_RP5C01=m -CONFIG_RTC_DRV_V3020=m -CONFIG_RTC_DRV_WM831X=m -CONFIG_RTC_DRV_WM8350=m -CONFIG_RTC_DRV_PCF50633=m -CONFIG_RTC_DRV_AB3100=m -CONFIG_RTC_DRV_ZYNQMP=m -CONFIG_RTC_DRV_CROS_EC=m - -# -# on-CPU RTC drivers -# -CONFIG_RTC_DRV_CADENCE=m -CONFIG_RTC_DRV_FTRTC010=m -CONFIG_RTC_DRV_PCAP=m -CONFIG_RTC_DRV_MC13XXX=m -CONFIG_RTC_DRV_MT6397=m -CONFIG_RTC_DRV_R7301=m -CONFIG_RTC_DRV_CPCAP=m - -# -# HID Sensor RTC drivers -# -CONFIG_RTC_DRV_HID_SENSOR_TIME=m -CONFIG_RTC_DRV_WILCO_EC=m -CONFIG_DMADEVICES=y -# CONFIG_DMADEVICES_DEBUG is not set - -# -# DMA Devices -# -CONFIG_DMA_ENGINE=y -CONFIG_DMA_VIRTUAL_CHANNELS=y -CONFIG_DMA_ACPI=y -CONFIG_DMA_OF=y -CONFIG_ALTERA_MSGDMA=m -CONFIG_DW_AXI_DMAC=m -CONFIG_FSL_EDMA=m -CONFIG_INTEL_IDMA64=m -CONFIG_INTEL_IDXD=m -CONFIG_INTEL_IOATDMA=m -CONFIG_INTEL_MIC_X100_DMA=m -CONFIG_PLX_DMA=m -CONFIG_QCOM_HIDMA_MGMT=m -CONFIG_QCOM_HIDMA=m -CONFIG_DW_DMAC_CORE=y -CONFIG_DW_DMAC=y -CONFIG_DW_DMAC_PCI=y -CONFIG_DW_EDMA=m -CONFIG_DW_EDMA_PCIE=m -CONFIG_HSU_DMA=y -CONFIG_SF_PDMA=m - -# -# DMA Clients -# -CONFIG_ASYNC_TX_DMA=y -# CONFIG_DMATEST is not set -CONFIG_DMA_ENGINE_RAID=y - -# -# DMABUF options -# -CONFIG_SYNC_FILE=y -# CONFIG_SW_SYNC is not set -CONFIG_UDMABUF=y -# CONFIG_DMABUF_MOVE_NOTIFY is not set -# CONFIG_DMABUF_SELFTESTS is not set -CONFIG_DMABUF_HEAPS=y -CONFIG_DMABUF_HEAPS_SYSTEM=y -# end of DMABUF options - -CONFIG_DCA=m -CONFIG_AUXDISPLAY=y -CONFIG_HD44780=m -CONFIG_KS0108=m -CONFIG_KS0108_PORT=0x378 -CONFIG_KS0108_DELAY=2 -CONFIG_CFAG12864B=m -CONFIG_CFAG12864B_RATE=20 -CONFIG_IMG_ASCII_LCD=m -CONFIG_HT16K33=m -CONFIG_PARPORT_PANEL=m -CONFIG_PANEL_PARPORT=0 -CONFIG_PANEL_PROFILE=5 -# CONFIG_PANEL_CHANGE_MESSAGE is not set -# CONFIG_CHARLCD_BL_OFF is not set -# CONFIG_CHARLCD_BL_ON is not set -CONFIG_CHARLCD_BL_FLASH=y -CONFIG_PANEL=m -CONFIG_CHARLCD=m -CONFIG_UIO=m 
-CONFIG_UIO_CIF=m -CONFIG_UIO_PDRV_GENIRQ=m -CONFIG_UIO_DMEM_GENIRQ=m -CONFIG_UIO_AEC=m -CONFIG_UIO_SERCOS3=m -CONFIG_UIO_PCI_GENERIC=m -CONFIG_UIO_NETX=m -CONFIG_UIO_PRUSS=m -CONFIG_UIO_MF624=m -CONFIG_UIO_HV_GENERIC=m -CONFIG_VFIO_IOMMU_TYPE1=m -CONFIG_VFIO_VIRQFD=m -CONFIG_VFIO=m -# CONFIG_VFIO_NOIOMMU is not set -CONFIG_VFIO_PCI=m -CONFIG_VFIO_PCI_VGA=y -CONFIG_VFIO_PCI_MMAP=y -CONFIG_VFIO_PCI_INTX=y -CONFIG_VFIO_PCI_IGD=y -CONFIG_VFIO_MDEV=m -CONFIG_VFIO_MDEV_DEVICE=m -CONFIG_IRQ_BYPASS_MANAGER=m -CONFIG_VIRT_DRIVERS=y -CONFIG_VBOXGUEST=m -CONFIG_VIRTIO=y -CONFIG_VIRTIO_MENU=y -CONFIG_VIRTIO_PCI=m -CONFIG_VIRTIO_PCI_LEGACY=y -CONFIG_VIRTIO_VDPA=m -CONFIG_VIRTIO_PMEM=m -CONFIG_VIRTIO_BALLOON=m -CONFIG_VIRTIO_MEM=m -CONFIG_VIRTIO_INPUT=m -CONFIG_VIRTIO_MMIO=m -CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES=y -CONFIG_VDPA=m -CONFIG_VDPA_SIM=m -CONFIG_IFCVF=m -CONFIG_VHOST_IOTLB=m -CONFIG_VHOST_RING=m -CONFIG_VHOST=m -CONFIG_VHOST_MENU=y -CONFIG_VHOST_NET=m -CONFIG_VHOST_SCSI=m -CONFIG_VHOST_VSOCK=m -CONFIG_VHOST_VDPA=m -# CONFIG_VHOST_CROSS_ENDIAN_LEGACY is not set - -# -# Microsoft Hyper-V guest support -# -CONFIG_HYPERV=m -CONFIG_HYPERV_TIMER=y -CONFIG_HYPERV_UTILS=m -CONFIG_HYPERV_BALLOON=m -# end of Microsoft Hyper-V guest support - -# -# Xen driver support -# -CONFIG_XEN_BALLOON=y -CONFIG_XEN_BALLOON_MEMORY_HOTPLUG=y -CONFIG_XEN_BALLOON_MEMORY_HOTPLUG_LIMIT=512 -CONFIG_XEN_SCRUB_PAGES_DEFAULT=y -CONFIG_XEN_DEV_EVTCHN=m -CONFIG_XEN_BACKEND=y -CONFIG_XENFS=m -CONFIG_XEN_COMPAT_XENFS=y -CONFIG_XEN_SYS_HYPERVISOR=y -CONFIG_XEN_XENBUS_FRONTEND=y -CONFIG_XEN_GNTDEV=m -CONFIG_XEN_GNTDEV_DMABUF=y -CONFIG_XEN_GRANT_DEV_ALLOC=m -CONFIG_XEN_GRANT_DMA_ALLOC=y -CONFIG_SWIOTLB_XEN=y -CONFIG_XEN_PCIDEV_BACKEND=m -CONFIG_XEN_PVCALLS_FRONTEND=m -CONFIG_XEN_PVCALLS_BACKEND=y -CONFIG_XEN_SCSI_BACKEND=m -CONFIG_XEN_PRIVCMD=m -CONFIG_XEN_ACPI_PROCESSOR=m -CONFIG_XEN_MCE_LOG=y -CONFIG_XEN_HAVE_PVMMU=y -CONFIG_XEN_EFI=y -CONFIG_XEN_AUTO_XLATE=y -CONFIG_XEN_ACPI=y -CONFIG_XEN_SYMS=y 
-CONFIG_XEN_HAVE_VPMU=y -CONFIG_XEN_FRONT_PGDIR_SHBUF=m -# end of Xen driver support - -# CONFIG_GREYBUS is not set -CONFIG_STAGING=y -CONFIG_PRISM2_USB=m -CONFIG_COMEDI=m -# CONFIG_COMEDI_DEBUG is not set -CONFIG_COMEDI_DEFAULT_BUF_SIZE_KB=2048 -CONFIG_COMEDI_DEFAULT_BUF_MAXSIZE_KB=20480 -CONFIG_COMEDI_MISC_DRIVERS=y -CONFIG_COMEDI_BOND=m -CONFIG_COMEDI_TEST=m -CONFIG_COMEDI_PARPORT=m -# CONFIG_COMEDI_ISA_DRIVERS is not set -CONFIG_COMEDI_PCI_DRIVERS=m -CONFIG_COMEDI_8255_PCI=m -CONFIG_COMEDI_ADDI_WATCHDOG=m -CONFIG_COMEDI_ADDI_APCI_1032=m -CONFIG_COMEDI_ADDI_APCI_1500=m -CONFIG_COMEDI_ADDI_APCI_1516=m -CONFIG_COMEDI_ADDI_APCI_1564=m -CONFIG_COMEDI_ADDI_APCI_16XX=m -CONFIG_COMEDI_ADDI_APCI_2032=m -CONFIG_COMEDI_ADDI_APCI_2200=m -CONFIG_COMEDI_ADDI_APCI_3120=m -CONFIG_COMEDI_ADDI_APCI_3501=m -CONFIG_COMEDI_ADDI_APCI_3XXX=m -CONFIG_COMEDI_ADL_PCI6208=m -CONFIG_COMEDI_ADL_PCI7X3X=m -CONFIG_COMEDI_ADL_PCI8164=m -CONFIG_COMEDI_ADL_PCI9111=m -CONFIG_COMEDI_ADL_PCI9118=m -CONFIG_COMEDI_ADV_PCI1710=m -CONFIG_COMEDI_ADV_PCI1720=m -CONFIG_COMEDI_ADV_PCI1723=m -CONFIG_COMEDI_ADV_PCI1724=m -CONFIG_COMEDI_ADV_PCI1760=m -CONFIG_COMEDI_ADV_PCI_DIO=m -CONFIG_COMEDI_AMPLC_DIO200_PCI=m -CONFIG_COMEDI_AMPLC_PC236_PCI=m -CONFIG_COMEDI_AMPLC_PC263_PCI=m -CONFIG_COMEDI_AMPLC_PCI224=m -CONFIG_COMEDI_AMPLC_PCI230=m -CONFIG_COMEDI_CONTEC_PCI_DIO=m -CONFIG_COMEDI_DAS08_PCI=m -CONFIG_COMEDI_DT3000=m -CONFIG_COMEDI_DYNA_PCI10XX=m -CONFIG_COMEDI_GSC_HPDI=m -CONFIG_COMEDI_MF6X4=m -CONFIG_COMEDI_ICP_MULTI=m -CONFIG_COMEDI_DAQBOARD2000=m -CONFIG_COMEDI_JR3_PCI=m -CONFIG_COMEDI_KE_COUNTER=m -CONFIG_COMEDI_CB_PCIDAS64=m -CONFIG_COMEDI_CB_PCIDAS=m -CONFIG_COMEDI_CB_PCIDDA=m -CONFIG_COMEDI_CB_PCIMDAS=m -CONFIG_COMEDI_CB_PCIMDDA=m -CONFIG_COMEDI_ME4000=m -CONFIG_COMEDI_ME_DAQ=m -CONFIG_COMEDI_NI_6527=m -CONFIG_COMEDI_NI_65XX=m -CONFIG_COMEDI_NI_660X=m -CONFIG_COMEDI_NI_670X=m -CONFIG_COMEDI_NI_LABPC_PCI=m -CONFIG_COMEDI_NI_PCIDIO=m -CONFIG_COMEDI_NI_PCIMIO=m -CONFIG_COMEDI_RTD520=m 
-CONFIG_COMEDI_S626=m -CONFIG_COMEDI_MITE=m -CONFIG_COMEDI_NI_TIOCMD=m -CONFIG_COMEDI_PCMCIA_DRIVERS=m -CONFIG_COMEDI_CB_DAS16_CS=m -CONFIG_COMEDI_DAS08_CS=m -CONFIG_COMEDI_NI_DAQ_700_CS=m -CONFIG_COMEDI_NI_DAQ_DIO24_CS=m -CONFIG_COMEDI_NI_LABPC_CS=m -CONFIG_COMEDI_NI_MIO_CS=m -CONFIG_COMEDI_QUATECH_DAQP_CS=m -CONFIG_COMEDI_USB_DRIVERS=m -CONFIG_COMEDI_DT9812=m -CONFIG_COMEDI_NI_USB6501=m -CONFIG_COMEDI_USBDUX=m -CONFIG_COMEDI_USBDUXFAST=m -CONFIG_COMEDI_USBDUXSIGMA=m -CONFIG_COMEDI_VMK80XX=m -CONFIG_COMEDI_8254=m -CONFIG_COMEDI_8255=m -CONFIG_COMEDI_8255_SA=m -CONFIG_COMEDI_KCOMEDILIB=m -CONFIG_COMEDI_AMPLC_DIO200=m -CONFIG_COMEDI_AMPLC_PC236=m -CONFIG_COMEDI_DAS08=m -CONFIG_COMEDI_NI_LABPC=m -CONFIG_COMEDI_NI_TIO=m -CONFIG_COMEDI_NI_ROUTING=m -CONFIG_RTL8192U=m -CONFIG_RTLLIB=m -CONFIG_RTLLIB_CRYPTO_CCMP=m -CONFIG_RTLLIB_CRYPTO_TKIP=m -CONFIG_RTLLIB_CRYPTO_WEP=m -CONFIG_RTL8192E=m -CONFIG_RTL8723BS=m -CONFIG_R8712U=m -CONFIG_R8188EU=m -CONFIG_88EU_AP_MODE=y -CONFIG_RTS5208=m -CONFIG_VT6655=m -CONFIG_VT6656=m - -# -# IIO staging drivers -# - -# -# Accelerometers -# -CONFIG_ADIS16203=m -CONFIG_ADIS16240=m -# end of Accelerometers - -# -# Analog to digital converters -# -CONFIG_AD7816=m -CONFIG_AD7280=m -# end of Analog to digital converters - -# -# Analog digital bi-direction converters -# -CONFIG_ADT7316=m -CONFIG_ADT7316_SPI=m -CONFIG_ADT7316_I2C=m -# end of Analog digital bi-direction converters - -# -# Capacitance to digital converters -# -CONFIG_AD7150=m -CONFIG_AD7746=m -# end of Capacitance to digital converters - -# -# Direct Digital Synthesis -# -CONFIG_AD9832=m -CONFIG_AD9834=m -# end of Direct Digital Synthesis - -# -# Network Analyzer, Impedance Converters -# -CONFIG_AD5933=m -# end of Network Analyzer, Impedance Converters - -# -# Active energy metering IC -# -CONFIG_ADE7854=m -CONFIG_ADE7854_I2C=m -CONFIG_ADE7854_SPI=m -# end of Active energy metering IC - -# -# Resolver to digital converters -# -CONFIG_AD2S1210=m -# end of Resolver to digital 
converters -# end of IIO staging drivers - -# CONFIG_FB_SM750 is not set - -# -# Speakup console speech -# -CONFIG_SPEAKUP=m -CONFIG_SPEAKUP_SYNTH_ACNTSA=m -CONFIG_SPEAKUP_SYNTH_APOLLO=m -CONFIG_SPEAKUP_SYNTH_AUDPTR=m -CONFIG_SPEAKUP_SYNTH_BNS=m -CONFIG_SPEAKUP_SYNTH_DECTLK=m -CONFIG_SPEAKUP_SYNTH_DECEXT=m -CONFIG_SPEAKUP_SYNTH_LTLK=m -CONFIG_SPEAKUP_SYNTH_SOFT=m -CONFIG_SPEAKUP_SYNTH_SPKOUT=m -CONFIG_SPEAKUP_SYNTH_TXPRT=m -CONFIG_SPEAKUP_SYNTH_DUMMY=m -# end of Speakup console speech - -CONFIG_STAGING_MEDIA=y -CONFIG_INTEL_ATOMISP=y -CONFIG_VIDEO_ATOMISP=m -CONFIG_VIDEO_ATOMISP_ISP2401=y -CONFIG_VIDEO_ATOMISP_OV5693=m -CONFIG_VIDEO_ATOMISP_OV2722=m -CONFIG_VIDEO_ATOMISP_GC2235=m -CONFIG_VIDEO_ATOMISP_MSRLIST_HELPER=m -CONFIG_VIDEO_ATOMISP_MT9M114=m -CONFIG_VIDEO_ATOMISP_GC0310=m -CONFIG_VIDEO_ATOMISP_OV2680=m -CONFIG_VIDEO_ATOMISP_LM3554=m -CONFIG_VIDEO_IPU3_IMGU=m - -# -# soc_camera sensor drivers -# -CONFIG_VIDEO_USBVISION=m - -# -# Android -# -# end of Android - -CONFIG_STAGING_BOARD=y -CONFIG_LTE_GDM724X=m -CONFIG_FIREWIRE_SERIAL=m -CONFIG_FWTTY_MAX_TOTAL_PORTS=64 -CONFIG_FWTTY_MAX_CARD_PORTS=32 -CONFIG_GS_FPGABOOT=m -CONFIG_UNISYSSPAR=y -CONFIG_UNISYS_VISORNIC=m -CONFIG_UNISYS_VISORINPUT=m -CONFIG_UNISYS_VISORHBA=m -CONFIG_COMMON_CLK_XLNX_CLKWZRD=m -# CONFIG_FB_TFT is not set -CONFIG_WILC1000=m -CONFIG_WILC1000_SDIO=m -CONFIG_WILC1000_SPI=m -# CONFIG_WILC1000_HW_OOB_INTR is not set -CONFIG_MOST_COMPONENTS=m -CONFIG_MOST_CDEV=m -CONFIG_MOST_NET=m -CONFIG_MOST_SOUND=m -CONFIG_MOST_VIDEO=m -CONFIG_MOST_DIM2=m -CONFIG_MOST_I2C=m -CONFIG_MOST_USB=m -CONFIG_KS7010=m -CONFIG_PI433=m - -# -# Gasket devices -# -CONFIG_STAGING_GASKET_FRAMEWORK=m -CONFIG_STAGING_APEX_DRIVER=m -# end of Gasket devices - -CONFIG_XIL_AXIS_FIFO=m -CONFIG_FIELDBUS_DEV=m -CONFIG_HMS_ANYBUSS_BUS=m -CONFIG_ARCX_ANYBUS_CONTROLLER=m -CONFIG_HMS_PROFINET=m -CONFIG_KPC2000=y -CONFIG_KPC2000_CORE=m -CONFIG_KPC2000_SPI=m -CONFIG_KPC2000_I2C=m -CONFIG_KPC2000_DMA=m -CONFIG_QLGE=m -CONFIG_WFX=m 
-CONFIG_X86_PLATFORM_DEVICES=y -CONFIG_ACPI_WMI=m -CONFIG_WMI_BMOF=m -CONFIG_ALIENWARE_WMI=m -CONFIG_HUAWEI_WMI=m -CONFIG_INTEL_WMI_SBL_FW_UPDATE=m -CONFIG_INTEL_WMI_THUNDERBOLT=m -CONFIG_MXM_WMI=m -CONFIG_PEAQ_WMI=m -CONFIG_XIAOMI_WMI=m -CONFIG_ACERHDF=m -CONFIG_ACER_WIRELESS=m -CONFIG_ACER_WMI=m -CONFIG_APPLE_GMUX=m -CONFIG_ASUS_LAPTOP=m -CONFIG_ASUS_WIRELESS=m -CONFIG_ASUS_WMI=m -CONFIG_ASUS_NB_WMI=m -CONFIG_EEEPC_LAPTOP=m -CONFIG_EEEPC_WMI=m -CONFIG_DCDBAS=m -CONFIG_DELL_SMBIOS=m -CONFIG_DELL_SMBIOS_WMI=y -CONFIG_DELL_SMBIOS_SMM=y -CONFIG_DELL_LAPTOP=m -CONFIG_DELL_RBTN=m -# CONFIG_DELL_RBU is not set -CONFIG_DELL_SMO8800=m -CONFIG_DELL_WMI=m -CONFIG_DELL_WMI_DESCRIPTOR=m -CONFIG_DELL_WMI_AIO=m -CONFIG_DELL_WMI_LED=m -CONFIG_AMILO_RFKILL=m -CONFIG_FUJITSU_LAPTOP=m -CONFIG_FUJITSU_TABLET=m -CONFIG_GPD_POCKET_FAN=m -CONFIG_HP_ACCEL=m -CONFIG_HP_WIRELESS=m -CONFIG_HP_WMI=m -CONFIG_IBM_RTL=m -CONFIG_IDEAPAD_LAPTOP=m -CONFIG_SENSORS_HDAPS=m -CONFIG_THINKPAD_ACPI=m -CONFIG_THINKPAD_ACPI_ALSA_SUPPORT=y -# CONFIG_THINKPAD_ACPI_DEBUGFACILITIES is not set -# CONFIG_THINKPAD_ACPI_DEBUG is not set -# CONFIG_THINKPAD_ACPI_UNSAFE_LEDS is not set -CONFIG_THINKPAD_ACPI_VIDEO=y -CONFIG_THINKPAD_ACPI_HOTKEY_POLL=y -CONFIG_INTEL_CHT_INT33FE=m -CONFIG_INTEL_HID_EVENT=m -CONFIG_INTEL_INT0002_VGPIO=m -CONFIG_INTEL_MENLOW=m -CONFIG_INTEL_OAKTRAIL=m -CONFIG_INTEL_VBTN=m -CONFIG_SURFACE3_WMI=m -CONFIG_SURFACE_3_BUTTON=m -CONFIG_SURFACE_3_POWER_OPREGION=m -CONFIG_SURFACE_PRO3_BUTTON=m -CONFIG_MSI_LAPTOP=m -CONFIG_MSI_WMI=m -CONFIG_PCENGINES_APU2=m -CONFIG_SAMSUNG_LAPTOP=m -CONFIG_SAMSUNG_Q10=m -CONFIG_ACPI_TOSHIBA=m -CONFIG_TOSHIBA_BT_RFKILL=m -CONFIG_TOSHIBA_HAPS=m -CONFIG_TOSHIBA_WMI=m -CONFIG_ACPI_CMPC=m -CONFIG_COMPAL_LAPTOP=m -CONFIG_LG_LAPTOP=m -CONFIG_PANASONIC_LAPTOP=m -CONFIG_SONY_LAPTOP=m -CONFIG_SONYPI_COMPAT=y -CONFIG_SYSTEM76_ACPI=m -CONFIG_TOPSTAR_LAPTOP=m -CONFIG_I2C_MULTI_INSTANTIATE=m -CONFIG_MLX_PLATFORM=m -CONFIG_TOUCHSCREEN_DMI=y -CONFIG_INTEL_IPS=m 
-CONFIG_INTEL_RST=m -CONFIG_INTEL_SMARTCONNECT=m - -# -# Intel Speed Select Technology interface support -# -CONFIG_INTEL_SPEED_SELECT_INTERFACE=m -# end of Intel Speed Select Technology interface support - -CONFIG_INTEL_TURBO_MAX_3=y -CONFIG_INTEL_UNCORE_FREQ_CONTROL=m -CONFIG_INTEL_BXTWC_PMIC_TMU=m -CONFIG_INTEL_CHTDC_TI_PWRBTN=m -CONFIG_INTEL_MFLD_THERMAL=m -CONFIG_INTEL_MID_POWER_BUTTON=m -CONFIG_INTEL_MRFLD_PWRBTN=m -CONFIG_INTEL_PMC_CORE=y -CONFIG_INTEL_PUNIT_IPC=m -CONFIG_INTEL_SCU_IPC=y -CONFIG_INTEL_SCU=y -CONFIG_INTEL_SCU_PCI=y -CONFIG_INTEL_SCU_PLATFORM=m -CONFIG_INTEL_SCU_IPC_UTIL=m -CONFIG_INTEL_TELEMETRY=m -CONFIG_PMC_ATOM=y -CONFIG_MFD_CROS_EC=m -CONFIG_CHROME_PLATFORMS=y -CONFIG_CHROMEOS_LAPTOP=m -CONFIG_CHROMEOS_PSTORE=m -CONFIG_CHROMEOS_TBMC=m -CONFIG_CROS_EC=m -CONFIG_CROS_EC_I2C=m -CONFIG_CROS_EC_RPMSG=m -CONFIG_CROS_EC_ISHTP=m -CONFIG_CROS_EC_SPI=m -CONFIG_CROS_EC_LPC=m -CONFIG_CROS_EC_PROTO=y -CONFIG_CROS_KBD_LED_BACKLIGHT=m -CONFIG_CROS_EC_CHARDEV=m -CONFIG_CROS_EC_LIGHTBAR=m -CONFIG_CROS_EC_VBC=m -CONFIG_CROS_EC_DEBUGFS=m -CONFIG_CROS_EC_SENSORHUB=m -CONFIG_CROS_EC_SYSFS=m -CONFIG_CROS_EC_TYPEC=m -CONFIG_CROS_USBPD_LOGGER=m -CONFIG_CROS_USBPD_NOTIFY=m -CONFIG_WILCO_EC=m -CONFIG_WILCO_EC_DEBUGFS=m -CONFIG_WILCO_EC_EVENTS=m -CONFIG_WILCO_EC_TELEMETRY=m -CONFIG_MELLANOX_PLATFORM=y -CONFIG_MLXREG_HOTPLUG=m -CONFIG_MLXREG_IO=m -CONFIG_HAVE_CLK=y -CONFIG_CLKDEV_LOOKUP=y -CONFIG_HAVE_CLK_PREPARE=y -CONFIG_COMMON_CLK=y -CONFIG_COMMON_CLK_WM831X=m -CONFIG_CLK_HSDK=y -CONFIG_COMMON_CLK_MAX77686=m -CONFIG_COMMON_CLK_MAX9485=m -CONFIG_COMMON_CLK_RK808=m -CONFIG_COMMON_CLK_SI5341=m -CONFIG_COMMON_CLK_SI5351=m -CONFIG_COMMON_CLK_SI514=m -CONFIG_COMMON_CLK_SI544=m -CONFIG_COMMON_CLK_SI570=m -CONFIG_COMMON_CLK_CDCE706=m -CONFIG_COMMON_CLK_CDCE925=m -CONFIG_COMMON_CLK_CS2000_CP=m -CONFIG_COMMON_CLK_S2MPS11=m -CONFIG_CLK_TWL6040=m -CONFIG_COMMON_CLK_LOCHNAGAR=m -CONFIG_COMMON_CLK_PALMAS=m -CONFIG_COMMON_CLK_PWM=m -CONFIG_COMMON_CLK_VC5=m 
-CONFIG_COMMON_CLK_BD718XX=m -CONFIG_COMMON_CLK_FIXED_MMIO=y -CONFIG_CLK_LGM_CGU=y -CONFIG_HWSPINLOCK=y - -# -# Clock Source drivers -# -CONFIG_TIMER_OF=y -CONFIG_TIMER_PROBE=y -CONFIG_CLKEVT_I8253=y -CONFIG_I8253_LOCK=y -CONFIG_CLKBLD_I8253=y -CONFIG_CLKSRC_MMIO=y -CONFIG_MICROCHIP_PIT64B=y -# end of Clock Source drivers - -CONFIG_MAILBOX=y -CONFIG_PLATFORM_MHU=m -CONFIG_PCC=y -CONFIG_ALTERA_MBOX=m -CONFIG_MAILBOX_TEST=m -CONFIG_IOMMU_IOVA=y -CONFIG_IOASID=y -CONFIG_IOMMU_API=y -CONFIG_IOMMU_SUPPORT=y - -# -# Generic IOMMU Pagetable Support -# -# end of Generic IOMMU Pagetable Support - -# CONFIG_IOMMU_DEBUGFS is not set -# CONFIG_IOMMU_DEFAULT_PASSTHROUGH is not set -CONFIG_OF_IOMMU=y -CONFIG_IOMMU_DMA=y -CONFIG_AMD_IOMMU=y -CONFIG_AMD_IOMMU_V2=y -CONFIG_DMAR_TABLE=y -CONFIG_INTEL_IOMMU=y -CONFIG_INTEL_IOMMU_SVM=y -# CONFIG_INTEL_IOMMU_DEFAULT_ON is not set -CONFIG_INTEL_IOMMU_FLOPPY_WA=y -# CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON is not set -CONFIG_IRQ_REMAP=y -CONFIG_HYPERV_IOMMU=y - -# -# Remoteproc drivers -# -CONFIG_REMOTEPROC=y -# end of Remoteproc drivers - -# -# Rpmsg drivers -# -CONFIG_RPMSG=m -CONFIG_RPMSG_CHAR=m -CONFIG_RPMSG_QCOM_GLINK=m -CONFIG_RPMSG_QCOM_GLINK_RPM=m -CONFIG_RPMSG_VIRTIO=m -# end of Rpmsg drivers - -CONFIG_SOUNDWIRE=m - -# -# SoundWire Devices -# -CONFIG_SOUNDWIRE_CADENCE=m -CONFIG_SOUNDWIRE_INTEL=m -CONFIG_SOUNDWIRE_QCOM=m - -# -# SOC (System On Chip) specific Drivers -# - -# -# Amlogic SoC drivers -# -# end of Amlogic SoC drivers - -# -# Aspeed SoC drivers -# -# end of Aspeed SoC drivers - -# -# Broadcom SoC drivers -# -# end of Broadcom SoC drivers - -# -# NXP/Freescale QorIQ SoC drivers -# -# end of NXP/Freescale QorIQ SoC drivers - -# -# i.MX SoC drivers -# -# end of i.MX SoC drivers - -# -# Qualcomm SoC drivers -# -# end of Qualcomm SoC drivers - -CONFIG_SOC_TI=y - -# -# Xilinx SoC drivers -# -CONFIG_XILINX_VCU=m -# end of Xilinx SoC drivers -# end of SOC (System On Chip) specific Drivers - -CONFIG_PM_DEVFREQ=y - -# -# 
DEVFREQ Governors -# -CONFIG_DEVFREQ_GOV_SIMPLE_ONDEMAND=m -CONFIG_DEVFREQ_GOV_PERFORMANCE=m -CONFIG_DEVFREQ_GOV_POWERSAVE=m -CONFIG_DEVFREQ_GOV_USERSPACE=m -CONFIG_DEVFREQ_GOV_PASSIVE=m - -# -# DEVFREQ Drivers -# -CONFIG_PM_DEVFREQ_EVENT=y -CONFIG_EXTCON=y - -# -# Extcon Device Drivers -# -CONFIG_EXTCON_ADC_JACK=m -CONFIG_EXTCON_ARIZONA=m -CONFIG_EXTCON_AXP288=m -CONFIG_EXTCON_FSA9480=m -CONFIG_EXTCON_GPIO=m -CONFIG_EXTCON_INTEL_INT3496=m -CONFIG_EXTCON_INTEL_CHT_WC=m -CONFIG_EXTCON_INTEL_MRFLD=m -CONFIG_EXTCON_MAX14577=m -CONFIG_EXTCON_MAX3355=m -CONFIG_EXTCON_MAX77693=m -CONFIG_EXTCON_MAX77843=m -CONFIG_EXTCON_MAX8997=m -CONFIG_EXTCON_PALMAS=m -CONFIG_EXTCON_PTN5150=m -CONFIG_EXTCON_RT8973A=m -CONFIG_EXTCON_SM5502=m -CONFIG_EXTCON_USB_GPIO=m -CONFIG_EXTCON_USBC_CROS_EC=m -CONFIG_MEMORY=y -CONFIG_IIO=m -CONFIG_IIO_BUFFER=y -CONFIG_IIO_BUFFER_CB=m -CONFIG_IIO_BUFFER_DMA=m -CONFIG_IIO_BUFFER_DMAENGINE=m -CONFIG_IIO_BUFFER_HW_CONSUMER=m -CONFIG_IIO_KFIFO_BUF=m -CONFIG_IIO_TRIGGERED_BUFFER=m -CONFIG_IIO_CONFIGFS=m -CONFIG_IIO_TRIGGER=y -CONFIG_IIO_CONSUMERS_PER_TRIGGER=2 -CONFIG_IIO_SW_DEVICE=m -CONFIG_IIO_SW_TRIGGER=m -CONFIG_IIO_TRIGGERED_EVENT=m - -# -# Accelerometers -# -CONFIG_ADIS16201=m -CONFIG_ADIS16209=m -CONFIG_ADXL372=m -CONFIG_ADXL372_SPI=m -CONFIG_ADXL372_I2C=m -CONFIG_BMA220=m -CONFIG_BMA400=m -CONFIG_BMA400_I2C=m -CONFIG_BMC150_ACCEL=m -CONFIG_BMC150_ACCEL_I2C=m -CONFIG_BMC150_ACCEL_SPI=m -CONFIG_DA280=m -CONFIG_DA311=m -CONFIG_DMARD06=m -CONFIG_DMARD09=m -CONFIG_DMARD10=m -CONFIG_HID_SENSOR_ACCEL_3D=m -CONFIG_IIO_CROS_EC_ACCEL_LEGACY=m -CONFIG_IIO_ST_ACCEL_3AXIS=m -CONFIG_IIO_ST_ACCEL_I2C_3AXIS=m -CONFIG_IIO_ST_ACCEL_SPI_3AXIS=m -CONFIG_KXSD9=m -CONFIG_KXSD9_SPI=m -CONFIG_KXSD9_I2C=m -CONFIG_KXCJK1013=m -CONFIG_MC3230=m -CONFIG_MMA7455=m -CONFIG_MMA7455_I2C=m -CONFIG_MMA7455_SPI=m -CONFIG_MMA7660=m -CONFIG_MMA8452=m -CONFIG_MMA9551_CORE=m -CONFIG_MMA9551=m -CONFIG_MMA9553=m -CONFIG_MXC4005=m -CONFIG_MXC6255=m -CONFIG_SCA3000=m -CONFIG_STK8312=m 
-CONFIG_STK8BA50=m -# end of Accelerometers - -# -# Analog to digital converters -# -CONFIG_AD_SIGMA_DELTA=m -CONFIG_AD7091R5=m -CONFIG_AD7124=m -CONFIG_AD7192=m -CONFIG_AD7266=m -CONFIG_AD7291=m -CONFIG_AD7292=m -CONFIG_AD7298=m -CONFIG_AD7476=m -CONFIG_AD7606=m -CONFIG_AD7606_IFACE_PARALLEL=m -CONFIG_AD7606_IFACE_SPI=m -CONFIG_AD7766=m -CONFIG_AD7768_1=m -CONFIG_AD7780=m -CONFIG_AD7791=m -CONFIG_AD7793=m -CONFIG_AD7887=m -CONFIG_AD7923=m -CONFIG_AD7949=m -CONFIG_AD799X=m -CONFIG_AD9467=m -CONFIG_ADI_AXI_ADC=m -CONFIG_AXP20X_ADC=m -CONFIG_AXP288_ADC=m -CONFIG_CC10001_ADC=m -CONFIG_CPCAP_ADC=m -CONFIG_DA9150_GPADC=m -CONFIG_DLN2_ADC=m -CONFIG_ENVELOPE_DETECTOR=m -CONFIG_HI8435=m -CONFIG_HX711=m -CONFIG_INA2XX_ADC=m -CONFIG_INTEL_MRFLD_ADC=m -CONFIG_LP8788_ADC=m -CONFIG_LTC2471=m -CONFIG_LTC2485=m -CONFIG_LTC2496=m -CONFIG_LTC2497=m -CONFIG_MAX1027=m -CONFIG_MAX11100=m -CONFIG_MAX1118=m -CONFIG_MAX1241=m -CONFIG_MAX1363=m -CONFIG_MAX9611=m -CONFIG_MCP320X=m -CONFIG_MCP3422=m -CONFIG_MCP3911=m -CONFIG_MEN_Z188_ADC=m -CONFIG_MP2629_ADC=m -CONFIG_NAU7802=m -CONFIG_PALMAS_GPADC=m -CONFIG_QCOM_VADC_COMMON=m -CONFIG_QCOM_SPMI_IADC=m -CONFIG_QCOM_SPMI_VADC=m -CONFIG_QCOM_SPMI_ADC5=m -CONFIG_RN5T618_ADC=m -CONFIG_SD_ADC_MODULATOR=m -CONFIG_STMPE_ADC=m -CONFIG_TI_ADC081C=m -CONFIG_TI_ADC0832=m -CONFIG_TI_ADC084S021=m -CONFIG_TI_ADC12138=m -CONFIG_TI_ADC108S102=m -CONFIG_TI_ADC128S052=m -CONFIG_TI_ADC161S626=m -CONFIG_TI_ADS1015=m -CONFIG_TI_ADS7950=m -CONFIG_TI_ADS8344=m -CONFIG_TI_ADS8688=m -CONFIG_TI_ADS124S08=m -CONFIG_TI_AM335X_ADC=m -CONFIG_TI_TLC4541=m -CONFIG_TWL4030_MADC=m -CONFIG_TWL6030_GPADC=m -CONFIG_VF610_ADC=m -CONFIG_VIPERBOARD_ADC=m -CONFIG_XILINX_XADC=m -# end of Analog to digital converters - -# -# Analog Front Ends -# -CONFIG_IIO_RESCALE=m -# end of Analog Front Ends - -# -# Amplifiers -# -CONFIG_AD8366=m -CONFIG_HMC425=m -# end of Amplifiers - -# -# Chemical Sensors -# -CONFIG_ATLAS_PH_SENSOR=m -CONFIG_ATLAS_EZO_SENSOR=m -CONFIG_BME680=m 
-CONFIG_BME680_I2C=m -CONFIG_BME680_SPI=m -CONFIG_CCS811=m -CONFIG_IAQCORE=m -CONFIG_PMS7003=m -CONFIG_SENSIRION_SGP30=m -CONFIG_SPS30=m -CONFIG_VZ89X=m -# end of Chemical Sensors - -CONFIG_IIO_CROS_EC_SENSORS_CORE=m -CONFIG_IIO_CROS_EC_SENSORS=m -CONFIG_IIO_CROS_EC_SENSORS_LID_ANGLE=m - -# -# Hid Sensor IIO Common -# -CONFIG_HID_SENSOR_IIO_COMMON=m -CONFIG_HID_SENSOR_IIO_TRIGGER=m -# end of Hid Sensor IIO Common - -CONFIG_IIO_MS_SENSORS_I2C=m - -# -# SSP Sensor Common -# -CONFIG_IIO_SSP_SENSORS_COMMONS=m -CONFIG_IIO_SSP_SENSORHUB=m -# end of SSP Sensor Common - -CONFIG_IIO_ST_SENSORS_I2C=m -CONFIG_IIO_ST_SENSORS_SPI=m -CONFIG_IIO_ST_SENSORS_CORE=m - -# -# Digital to analog converters -# -CONFIG_AD5064=m -CONFIG_AD5360=m -CONFIG_AD5380=m -CONFIG_AD5421=m -CONFIG_AD5446=m -CONFIG_AD5449=m -CONFIG_AD5592R_BASE=m -CONFIG_AD5592R=m -CONFIG_AD5593R=m -CONFIG_AD5504=m -CONFIG_AD5624R_SPI=m -CONFIG_AD5686=m -CONFIG_AD5686_SPI=m -CONFIG_AD5696_I2C=m -CONFIG_AD5755=m -CONFIG_AD5758=m -CONFIG_AD5761=m -CONFIG_AD5764=m -CONFIG_AD5770R=m -CONFIG_AD5791=m -CONFIG_AD7303=m -CONFIG_AD8801=m -CONFIG_DPOT_DAC=m -CONFIG_DS4424=m -CONFIG_LTC1660=m -CONFIG_LTC2632=m -CONFIG_M62332=m -CONFIG_MAX517=m -CONFIG_MAX5821=m -CONFIG_MCP4725=m -CONFIG_MCP4922=m -CONFIG_TI_DAC082S085=m -CONFIG_TI_DAC5571=m -CONFIG_TI_DAC7311=m -CONFIG_TI_DAC7612=m -CONFIG_VF610_DAC=m -# end of Digital to analog converters - -# -# IIO dummy driver -# -# CONFIG_IIO_SIMPLE_DUMMY is not set -# end of IIO dummy driver - -# -# Frequency Synthesizers DDS/PLL -# - -# -# Clock Generator/Distribution -# -CONFIG_AD9523=m -# end of Clock Generator/Distribution - -# -# Phase-Locked Loop (PLL) frequency synthesizers -# -CONFIG_ADF4350=m -CONFIG_ADF4371=m -# end of Phase-Locked Loop (PLL) frequency synthesizers -# end of Frequency Synthesizers DDS/PLL - -# -# Digital gyroscope sensors -# -CONFIG_ADIS16080=m -CONFIG_ADIS16130=m -CONFIG_ADIS16136=m -CONFIG_ADIS16260=m -CONFIG_ADXRS450=m -CONFIG_BMG160=m -CONFIG_BMG160_I2C=m 
-CONFIG_BMG160_SPI=m -CONFIG_FXAS21002C=m -CONFIG_FXAS21002C_I2C=m -CONFIG_FXAS21002C_SPI=m -CONFIG_HID_SENSOR_GYRO_3D=m -CONFIG_MPU3050=m -CONFIG_MPU3050_I2C=m -CONFIG_IIO_ST_GYRO_3AXIS=m -CONFIG_IIO_ST_GYRO_I2C_3AXIS=m -CONFIG_IIO_ST_GYRO_SPI_3AXIS=m -CONFIG_ITG3200=m -# end of Digital gyroscope sensors - -# -# Health Sensors -# - -# -# Heart Rate Monitors -# -CONFIG_AFE4403=m -CONFIG_AFE4404=m -CONFIG_MAX30100=m -CONFIG_MAX30102=m -# end of Heart Rate Monitors -# end of Health Sensors - -# -# Humidity sensors -# -CONFIG_AM2315=m -CONFIG_DHT11=m -CONFIG_HDC100X=m -CONFIG_HID_SENSOR_HUMIDITY=m -CONFIG_HTS221=m -CONFIG_HTS221_I2C=m -CONFIG_HTS221_SPI=m -CONFIG_HTU21=m -CONFIG_SI7005=m -CONFIG_SI7020=m -# end of Humidity sensors - -# -# Inertial measurement units -# -CONFIG_ADIS16400=m -CONFIG_ADIS16460=m -CONFIG_ADIS16475=m -CONFIG_ADIS16480=m -CONFIG_BMI160=m -CONFIG_BMI160_I2C=m -CONFIG_BMI160_SPI=m -CONFIG_FXOS8700=m -CONFIG_FXOS8700_I2C=m -CONFIG_FXOS8700_SPI=m -CONFIG_KMX61=m -CONFIG_INV_MPU6050_IIO=m -CONFIG_INV_MPU6050_I2C=m -CONFIG_INV_MPU6050_SPI=m -CONFIG_IIO_ST_LSM6DSX=m -CONFIG_IIO_ST_LSM6DSX_I2C=m -CONFIG_IIO_ST_LSM6DSX_SPI=m -CONFIG_IIO_ST_LSM6DSX_I3C=m -# end of Inertial measurement units - -CONFIG_IIO_ADIS_LIB=m -CONFIG_IIO_ADIS_LIB_BUFFER=y - -# -# Light sensors -# -CONFIG_ACPI_ALS=m -CONFIG_ADJD_S311=m -CONFIG_ADUX1020=m -CONFIG_AL3010=m -CONFIG_AL3320A=m -CONFIG_APDS9300=m -CONFIG_APDS9960=m -CONFIG_BH1750=m -CONFIG_BH1780=m -CONFIG_CM32181=m -CONFIG_CM3232=m -CONFIG_CM3323=m -CONFIG_CM3605=m -CONFIG_CM36651=m -CONFIG_IIO_CROS_EC_LIGHT_PROX=m -CONFIG_GP2AP002=m -CONFIG_GP2AP020A00F=m -CONFIG_IQS621_ALS=m -CONFIG_SENSORS_ISL29018=m -CONFIG_SENSORS_ISL29028=m -CONFIG_ISL29125=m -CONFIG_HID_SENSOR_ALS=m -CONFIG_HID_SENSOR_PROX=m -CONFIG_JSA1212=m -CONFIG_RPR0521=m -CONFIG_SENSORS_LM3533=m -CONFIG_LTR501=m -CONFIG_LV0104CS=m -CONFIG_MAX44000=m -CONFIG_MAX44009=m -CONFIG_NOA1305=m -CONFIG_OPT3001=m -CONFIG_PA12203001=m -CONFIG_SI1133=m 
-CONFIG_SI1145=m -CONFIG_STK3310=m -CONFIG_ST_UVIS25=m -CONFIG_ST_UVIS25_I2C=m -CONFIG_ST_UVIS25_SPI=m -CONFIG_TCS3414=m -CONFIG_TCS3472=m -CONFIG_SENSORS_TSL2563=m -CONFIG_TSL2583=m -CONFIG_TSL2772=m -CONFIG_TSL4531=m -CONFIG_US5182D=m -CONFIG_VCNL4000=m -CONFIG_VCNL4035=m -CONFIG_VEML6030=m -CONFIG_VEML6070=m -CONFIG_VL6180=m -CONFIG_ZOPT2201=m -# end of Light sensors - -# -# Magnetometer sensors -# -CONFIG_AK8974=m -CONFIG_AK8975=m -CONFIG_AK09911=m -CONFIG_BMC150_MAGN=m -CONFIG_BMC150_MAGN_I2C=m -CONFIG_BMC150_MAGN_SPI=m -CONFIG_MAG3110=m -CONFIG_HID_SENSOR_MAGNETOMETER_3D=m -CONFIG_MMC35240=m -CONFIG_IIO_ST_MAGN_3AXIS=m -CONFIG_IIO_ST_MAGN_I2C_3AXIS=m -CONFIG_IIO_ST_MAGN_SPI_3AXIS=m -CONFIG_SENSORS_HMC5843=m -CONFIG_SENSORS_HMC5843_I2C=m -CONFIG_SENSORS_HMC5843_SPI=m -CONFIG_SENSORS_RM3100=m -CONFIG_SENSORS_RM3100_I2C=m -CONFIG_SENSORS_RM3100_SPI=m -# end of Magnetometer sensors - -# -# Multiplexers -# -CONFIG_IIO_MUX=m -# end of Multiplexers - -# -# Inclinometer sensors -# -CONFIG_HID_SENSOR_INCLINOMETER_3D=m -CONFIG_HID_SENSOR_DEVICE_ROTATION=m -# end of Inclinometer sensors - -# -# Triggers - standalone -# -CONFIG_IIO_HRTIMER_TRIGGER=m -CONFIG_IIO_INTERRUPT_TRIGGER=m -CONFIG_IIO_TIGHTLOOP_TRIGGER=m -CONFIG_IIO_SYSFS_TRIGGER=m -# end of Triggers - standalone - -# -# Linear and angular position sensors -# -CONFIG_IQS624_POS=m -# end of Linear and angular position sensors - -# -# Digital potentiometers -# -CONFIG_AD5272=m -CONFIG_DS1803=m -CONFIG_MAX5432=m -CONFIG_MAX5481=m -CONFIG_MAX5487=m -CONFIG_MCP4018=m -CONFIG_MCP4131=m -CONFIG_MCP4531=m -CONFIG_MCP41010=m -CONFIG_TPL0102=m -# end of Digital potentiometers - -# -# Digital potentiostats -# -CONFIG_LMP91000=m -# end of Digital potentiostats - -# -# Pressure sensors -# -CONFIG_ABP060MG=m -CONFIG_BMP280=m -CONFIG_BMP280_I2C=m -CONFIG_BMP280_SPI=m -CONFIG_IIO_CROS_EC_BARO=m -CONFIG_DLHL60D=m -CONFIG_DPS310=m -CONFIG_HID_SENSOR_PRESS=m -CONFIG_HP03=m -CONFIG_ICP10100=m -CONFIG_MPL115=m -CONFIG_MPL115_I2C=m 
-CONFIG_MPL115_SPI=m -CONFIG_MPL3115=m -CONFIG_MS5611=m -CONFIG_MS5611_I2C=m -CONFIG_MS5611_SPI=m -CONFIG_MS5637=m -CONFIG_IIO_ST_PRESS=m -CONFIG_IIO_ST_PRESS_I2C=m -CONFIG_IIO_ST_PRESS_SPI=m -CONFIG_T5403=m -CONFIG_HP206C=m -CONFIG_ZPA2326=m -CONFIG_ZPA2326_I2C=m -CONFIG_ZPA2326_SPI=m -# end of Pressure sensors - -# -# Lightning sensors -# -CONFIG_AS3935=m -# end of Lightning sensors - -# -# Proximity and distance sensors -# -CONFIG_ISL29501=m -CONFIG_LIDAR_LITE_V2=m -CONFIG_MB1232=m -CONFIG_PING=m -CONFIG_RFD77402=m -CONFIG_SRF04=m -CONFIG_SX9310=m -CONFIG_SX9500=m -CONFIG_SRF08=m -CONFIG_VCNL3020=m -CONFIG_VL53L0X_I2C=m -# end of Proximity and distance sensors - -# -# Resolver to digital converters -# -CONFIG_AD2S90=m -CONFIG_AD2S1200=m -# end of Resolver to digital converters - -# -# Temperature sensors -# -CONFIG_IQS620AT_TEMP=m -CONFIG_LTC2983=m -CONFIG_MAXIM_THERMOCOUPLE=m -CONFIG_HID_SENSOR_TEMP=m -CONFIG_MLX90614=m -CONFIG_MLX90632=m -CONFIG_TMP006=m -CONFIG_TMP007=m -CONFIG_TSYS01=m -CONFIG_TSYS02D=m -CONFIG_MAX31856=m -# end of Temperature sensors - -CONFIG_NTB=m -CONFIG_NTB_MSI=y -CONFIG_NTB_AMD=m -CONFIG_NTB_IDT=m -CONFIG_NTB_INTEL=m -CONFIG_NTB_SWITCHTEC=m -# CONFIG_NTB_PINGPONG is not set -# CONFIG_NTB_TOOL is not set -# CONFIG_NTB_PERF is not set -# CONFIG_NTB_MSI_TEST is not set -CONFIG_NTB_TRANSPORT=m -CONFIG_VME_BUS=y - -# -# VME Bridge Drivers -# -CONFIG_VME_CA91CX42=m -CONFIG_VME_TSI148=m -# CONFIG_VME_FAKE is not set - -# -# VME Board Drivers -# -CONFIG_VMIVME_7805=m - -# -# VME Device Drivers -# -CONFIG_VME_USER=m -CONFIG_PWM=y -CONFIG_PWM_SYSFS=y -# CONFIG_PWM_DEBUG is not set -CONFIG_PWM_ATMEL_HLCDC_PWM=m -CONFIG_PWM_CRC=y -CONFIG_PWM_CROS_EC=m -CONFIG_PWM_FSL_FTM=m -CONFIG_PWM_IQS620A=m -CONFIG_PWM_LP3943=m -CONFIG_PWM_LPSS=m -CONFIG_PWM_LPSS_PCI=m -CONFIG_PWM_LPSS_PLATFORM=m -CONFIG_PWM_PCA9685=m -CONFIG_PWM_STMPE=y -CONFIG_PWM_TWL=m -CONFIG_PWM_TWL_LED=m - -# -# IRQ chip support -# -CONFIG_IRQCHIP=y -CONFIG_AL_FIC=y -CONFIG_MADERA_IRQ=m 
-# end of IRQ chip support - -CONFIG_IPACK_BUS=m -CONFIG_BOARD_TPCI200=m -CONFIG_SERIAL_IPOCTAL=m -CONFIG_RESET_CONTROLLER=y -CONFIG_RESET_BRCMSTB_RESCAL=y -CONFIG_RESET_INTEL_GW=y -CONFIG_RESET_TI_SYSCON=m - -# -# PHY Subsystem -# -CONFIG_GENERIC_PHY=y -CONFIG_GENERIC_PHY_MIPI_DPHY=y -CONFIG_BCM_KONA_USB2_PHY=m -CONFIG_PHY_CADENCE_TORRENT=m -CONFIG_PHY_CADENCE_DPHY=m -CONFIG_PHY_CADENCE_SIERRA=m -CONFIG_PHY_CADENCE_SALVO=m -CONFIG_PHY_FSL_IMX8MQ_USB=m -CONFIG_PHY_MIXEL_MIPI_DPHY=m -CONFIG_PHY_PXA_28NM_HSIC=m -CONFIG_PHY_PXA_28NM_USB2=m -CONFIG_PHY_CPCAP_USB=m -CONFIG_PHY_MAPPHONE_MDM6600=m -CONFIG_PHY_OCELOT_SERDES=m -CONFIG_PHY_QCOM_USB_HS=m -CONFIG_PHY_QCOM_USB_HSIC=m -CONFIG_PHY_SAMSUNG_USB2=m -CONFIG_PHY_TUSB1210=m -CONFIG_PHY_INTEL_COMBO=y -CONFIG_PHY_INTEL_EMMC=m -# end of PHY Subsystem - -CONFIG_POWERCAP=y -CONFIG_INTEL_RAPL_CORE=m -CONFIG_INTEL_RAPL=m -CONFIG_IDLE_INJECT=y -CONFIG_MCB=m -CONFIG_MCB_PCI=m -CONFIG_MCB_LPC=m - -# -# Performance monitor support -# -# end of Performance monitor support - -CONFIG_RAS=y -CONFIG_RAS_CEC=y -# CONFIG_RAS_CEC_DEBUG is not set -CONFIG_USB4=m - -# -# Android -# -# CONFIG_ANDROID is not set -# end of Android - -CONFIG_LIBNVDIMM=y -CONFIG_BLK_DEV_PMEM=m -CONFIG_ND_BLK=m -CONFIG_ND_CLAIM=y -CONFIG_ND_BTT=m -CONFIG_BTT=y -CONFIG_ND_PFN=m -CONFIG_NVDIMM_PFN=y -CONFIG_NVDIMM_DAX=y -CONFIG_OF_PMEM=m -CONFIG_DAX_DRIVER=y -CONFIG_DAX=y -CONFIG_DEV_DAX=m -CONFIG_DEV_DAX_PMEM=m -CONFIG_DEV_DAX_HMEM=m -CONFIG_DEV_DAX_KMEM=m -CONFIG_DEV_DAX_PMEM_COMPAT=m -CONFIG_NVMEM=y -CONFIG_NVMEM_SYSFS=y -CONFIG_NVMEM_SPMI_SDAM=m -CONFIG_RAVE_SP_EEPROM=m - -# -# HW tracing support -# -CONFIG_STM=m -CONFIG_STM_PROTO_BASIC=m -CONFIG_STM_PROTO_SYS_T=m -# CONFIG_STM_DUMMY is not set -CONFIG_STM_SOURCE_CONSOLE=m -CONFIG_STM_SOURCE_HEARTBEAT=m -CONFIG_STM_SOURCE_FTRACE=m -CONFIG_INTEL_TH=m -CONFIG_INTEL_TH_PCI=m -CONFIG_INTEL_TH_ACPI=m -CONFIG_INTEL_TH_GTH=m -CONFIG_INTEL_TH_STH=m -CONFIG_INTEL_TH_MSU=m -CONFIG_INTEL_TH_PTI=m -# CONFIG_INTEL_TH_DEBUG 
is not set -# end of HW tracing support - -CONFIG_FPGA=m -CONFIG_ALTERA_PR_IP_CORE=m -CONFIG_ALTERA_PR_IP_CORE_PLAT=m -CONFIG_FPGA_MGR_ALTERA_PS_SPI=m -CONFIG_FPGA_MGR_ALTERA_CVP=m -CONFIG_FPGA_MGR_XILINX_SPI=m -CONFIG_FPGA_MGR_ICE40_SPI=m -CONFIG_FPGA_MGR_MACHXO2_SPI=m -CONFIG_FPGA_BRIDGE=m -CONFIG_ALTERA_FREEZE_BRIDGE=m -CONFIG_XILINX_PR_DECOUPLER=m -CONFIG_FPGA_REGION=m -CONFIG_OF_FPGA_REGION=m -CONFIG_FPGA_DFL=m -CONFIG_FPGA_DFL_FME=m -CONFIG_FPGA_DFL_FME_MGR=m -CONFIG_FPGA_DFL_FME_BRIDGE=m -CONFIG_FPGA_DFL_FME_REGION=m -CONFIG_FPGA_DFL_AFU=m -CONFIG_FPGA_DFL_PCI=m -CONFIG_FSI=m -CONFIG_FSI_NEW_DEV_NODE=y -CONFIG_FSI_MASTER_GPIO=m -CONFIG_FSI_MASTER_HUB=m -CONFIG_FSI_MASTER_ASPEED=m -CONFIG_FSI_SCOM=m -CONFIG_FSI_SBEFIFO=m -CONFIG_FSI_OCC=m -CONFIG_TEE=m - -# -# TEE drivers -# -CONFIG_AMDTEE=m -# end of TEE drivers - -CONFIG_MULTIPLEXER=m - -# -# Multiplexer drivers -# -CONFIG_MUX_ADG792A=m -CONFIG_MUX_ADGS1408=m -CONFIG_MUX_GPIO=m -CONFIG_MUX_MMIO=m -# end of Multiplexer drivers - -CONFIG_PM_OPP=y -CONFIG_UNISYS_VISORBUS=m -CONFIG_SIOX=m -CONFIG_SIOX_BUS_GPIO=m -CONFIG_SLIMBUS=m -CONFIG_SLIM_QCOM_CTRL=m -CONFIG_INTERCONNECT=y -CONFIG_COUNTER=m -CONFIG_FTM_QUADDEC=m -CONFIG_MOST=m -# end of Device Drivers - -# -# File systems -# -CONFIG_DCACHE_WORD_ACCESS=y -CONFIG_VALIDATE_FS_PARSER=y -CONFIG_FS_IOMAP=y -# CONFIG_EXT2_FS is not set -# CONFIG_EXT3_FS is not set -CONFIG_EXT4_FS=m -CONFIG_EXT4_USE_FOR_EXT2=y -CONFIG_EXT4_FS_POSIX_ACL=y -CONFIG_EXT4_FS_SECURITY=y -# CONFIG_EXT4_DEBUG is not set -CONFIG_JBD2=m -# CONFIG_JBD2_DEBUG is not set -CONFIG_FS_MBCACHE=m -CONFIG_REISERFS_FS=m -# CONFIG_REISERFS_CHECK is not set -CONFIG_REISERFS_PROC_INFO=y -CONFIG_REISERFS_FS_XATTR=y -CONFIG_REISERFS_FS_POSIX_ACL=y -CONFIG_REISERFS_FS_SECURITY=y -CONFIG_JFS_FS=m -CONFIG_JFS_POSIX_ACL=y -CONFIG_JFS_SECURITY=y -# CONFIG_JFS_DEBUG is not set -CONFIG_JFS_STATISTICS=y -CONFIG_XFS_FS=m -CONFIG_XFS_QUOTA=y -CONFIG_XFS_POSIX_ACL=y -CONFIG_XFS_RT=y -CONFIG_XFS_ONLINE_SCRUB=y 
-CONFIG_XFS_ONLINE_REPAIR=y -# CONFIG_XFS_WARN is not set -# CONFIG_XFS_DEBUG is not set -CONFIG_GFS2_FS=m -CONFIG_GFS2_FS_LOCKING_DLM=y -CONFIG_OCFS2_FS=m -CONFIG_OCFS2_FS_O2CB=m -CONFIG_OCFS2_FS_USERSPACE_CLUSTER=m -CONFIG_OCFS2_FS_STATS=y -CONFIG_OCFS2_DEBUG_MASKLOG=y -# CONFIG_OCFS2_DEBUG_FS is not set -CONFIG_BTRFS_FS=m -CONFIG_BTRFS_FS_POSIX_ACL=y -# CONFIG_BTRFS_FS_CHECK_INTEGRITY is not set -# CONFIG_BTRFS_FS_RUN_SANITY_TESTS is not set -# CONFIG_BTRFS_DEBUG is not set -# CONFIG_BTRFS_ASSERT is not set -# CONFIG_BTRFS_FS_REF_VERIFY is not set -CONFIG_NILFS2_FS=m -CONFIG_F2FS_FS=m -CONFIG_F2FS_STAT_FS=y -CONFIG_F2FS_FS_XATTR=y -CONFIG_F2FS_FS_POSIX_ACL=y -CONFIG_F2FS_FS_SECURITY=y -CONFIG_F2FS_CHECK_FS=y -# CONFIG_F2FS_IO_TRACE is not set -# CONFIG_F2FS_FAULT_INJECTION is not set -CONFIG_F2FS_FS_COMPRESSION=y -CONFIG_F2FS_FS_LZO=y -CONFIG_F2FS_FS_LZ4=y -CONFIG_F2FS_FS_ZSTD=y -CONFIG_F2FS_FS_LZORLE=y -CONFIG_ZONEFS_FS=m -CONFIG_FS_DAX=y -CONFIG_FS_DAX_PMD=y -CONFIG_FS_POSIX_ACL=y -CONFIG_EXPORTFS=y -CONFIG_EXPORTFS_BLOCK_OPS=y -CONFIG_FILE_LOCKING=y -# CONFIG_MANDATORY_FILE_LOCKING is not set -CONFIG_FS_ENCRYPTION=y -CONFIG_FS_ENCRYPTION_ALGS=m -CONFIG_FS_VERITY=y -# CONFIG_FS_VERITY_DEBUG is not set -CONFIG_FS_VERITY_BUILTIN_SIGNATURES=y -CONFIG_FSNOTIFY=y -CONFIG_DNOTIFY=y -CONFIG_INOTIFY_USER=y -CONFIG_FANOTIFY=y -CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y -CONFIG_QUOTA=y -CONFIG_QUOTA_NETLINK_INTERFACE=y -# CONFIG_PRINT_QUOTA_WARNING is not set -# CONFIG_QUOTA_DEBUG is not set -CONFIG_QUOTA_TREE=m -CONFIG_QFMT_V1=m -CONFIG_QFMT_V2=m -CONFIG_QUOTACTL=y -CONFIG_QUOTACTL_COMPAT=y -CONFIG_AUTOFS4_FS=y -CONFIG_AUTOFS_FS=y -CONFIG_FUSE_FS=m -CONFIG_CUSE=m -CONFIG_VIRTIO_FS=m -CONFIG_OVERLAY_FS=m -CONFIG_OVERLAY_FS_REDIRECT_DIR=y -# CONFIG_OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW is not set -CONFIG_OVERLAY_FS_INDEX=y -CONFIG_OVERLAY_FS_XINO_AUTO=y -CONFIG_OVERLAY_FS_METACOPY=y - -# -# Caches -# -CONFIG_FSCACHE=m -CONFIG_FSCACHE_STATS=y -CONFIG_FSCACHE_HISTOGRAM=y -# 
CONFIG_FSCACHE_DEBUG is not set -# CONFIG_FSCACHE_OBJECT_LIST is not set -CONFIG_CACHEFILES=m -# CONFIG_CACHEFILES_DEBUG is not set -# CONFIG_CACHEFILES_HISTOGRAM is not set -# end of Caches - -# -# CD-ROM/DVD Filesystems -# -CONFIG_ISO9660_FS=m -CONFIG_JOLIET=y -CONFIG_ZISOFS=y -CONFIG_UDF_FS=m -# end of CD-ROM/DVD Filesystems - -# -# DOS/FAT/EXFAT/NT Filesystems -# -CONFIG_FAT_FS=m -CONFIG_MSDOS_FS=m -CONFIG_VFAT_FS=m -CONFIG_FAT_DEFAULT_CODEPAGE=437 -CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1" -CONFIG_FAT_DEFAULT_UTF8=y -CONFIG_EXFAT_FS=m -CONFIG_EXFAT_DEFAULT_IOCHARSET="utf8" -CONFIG_NTFS_FS=m -# CONFIG_NTFS_DEBUG is not set -CONFIG_NTFS_RW=y -# end of DOS/FAT/EXFAT/NT Filesystems - -# -# Pseudo filesystems -# -CONFIG_PROC_FS=y -CONFIG_PROC_KCORE=y -CONFIG_PROC_VMCORE=y -CONFIG_PROC_VMCORE_DEVICE_DUMP=y -CONFIG_PROC_SYSCTL=y -CONFIG_PROC_PAGE_MONITOR=y -CONFIG_PROC_CHILDREN=y -CONFIG_PROC_PID_ARCH_STATUS=y -CONFIG_PROC_CPU_RESCTRL=y -CONFIG_KERNFS=y -CONFIG_SYSFS=y -CONFIG_TMPFS=y -CONFIG_TMPFS_POSIX_ACL=y -CONFIG_TMPFS_XATTR=y -CONFIG_HUGETLBFS=y -CONFIG_HUGETLB_PAGE=y -CONFIG_MEMFD_CREATE=y -CONFIG_ARCH_HAS_GIGANTIC_PAGE=y -CONFIG_CONFIGFS_FS=y -CONFIG_EFIVAR_FS=y -# end of Pseudo filesystems - -CONFIG_MISC_FILESYSTEMS=y -CONFIG_ORANGEFS_FS=m -# CONFIG_ADFS_FS is not set -CONFIG_AFFS_FS=m -CONFIG_ECRYPT_FS=m -# CONFIG_ECRYPT_FS_MESSAGING is not set -CONFIG_HFS_FS=m -CONFIG_HFSPLUS_FS=m -CONFIG_BEFS_FS=m -# CONFIG_BEFS_DEBUG is not set -# CONFIG_BFS_FS is not set -# CONFIG_EFS_FS is not set -CONFIG_JFFS2_FS=m -CONFIG_JFFS2_FS_DEBUG=0 -CONFIG_JFFS2_FS_WRITEBUFFER=y -# CONFIG_JFFS2_FS_WBUF_VERIFY is not set -CONFIG_JFFS2_SUMMARY=y -CONFIG_JFFS2_FS_XATTR=y -CONFIG_JFFS2_FS_POSIX_ACL=y -CONFIG_JFFS2_FS_SECURITY=y -# CONFIG_JFFS2_COMPRESSION_OPTIONS is not set -CONFIG_JFFS2_ZLIB=y -CONFIG_JFFS2_RTIME=y -CONFIG_UBIFS_FS=m -# CONFIG_UBIFS_FS_ADVANCED_COMPR is not set -CONFIG_UBIFS_FS_LZO=y -CONFIG_UBIFS_FS_ZLIB=y -CONFIG_UBIFS_FS_ZSTD=y -CONFIG_UBIFS_ATIME_SUPPORT=y 
-CONFIG_UBIFS_FS_XATTR=y -CONFIG_UBIFS_FS_SECURITY=y -CONFIG_UBIFS_FS_AUTHENTICATION=y -CONFIG_CRAMFS=m -CONFIG_CRAMFS_BLOCKDEV=y -CONFIG_CRAMFS_MTD=y -CONFIG_SQUASHFS=m -# CONFIG_SQUASHFS_FILE_CACHE is not set -CONFIG_SQUASHFS_FILE_DIRECT=y -# CONFIG_SQUASHFS_DECOMP_SINGLE is not set -CONFIG_SQUASHFS_DECOMP_MULTI=y -# CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU is not set -CONFIG_SQUASHFS_XATTR=y -CONFIG_SQUASHFS_ZLIB=y -CONFIG_SQUASHFS_LZ4=y -CONFIG_SQUASHFS_LZO=y -CONFIG_SQUASHFS_XZ=y -CONFIG_SQUASHFS_ZSTD=y -# CONFIG_SQUASHFS_4K_DEVBLK_SIZE is not set -# CONFIG_SQUASHFS_EMBEDDED is not set -CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE=3 -# CONFIG_VXFS_FS is not set -CONFIG_MINIX_FS=m -CONFIG_OMFS_FS=m -# CONFIG_HPFS_FS is not set -# CONFIG_QNX4FS_FS is not set -# CONFIG_QNX6FS_FS is not set -CONFIG_ROMFS_FS=m -CONFIG_ROMFS_BACKED_BY_BLOCK=y -# CONFIG_ROMFS_BACKED_BY_MTD is not set -# CONFIG_ROMFS_BACKED_BY_BOTH is not set -CONFIG_ROMFS_ON_BLOCK=y -CONFIG_PSTORE=y -CONFIG_PSTORE_DEFLATE_COMPRESS=m -CONFIG_PSTORE_LZO_COMPRESS=m -CONFIG_PSTORE_LZ4_COMPRESS=m -CONFIG_PSTORE_LZ4HC_COMPRESS=m -# CONFIG_PSTORE_842_COMPRESS is not set -CONFIG_PSTORE_ZSTD_COMPRESS=y -CONFIG_PSTORE_COMPRESS=y -# CONFIG_PSTORE_DEFLATE_COMPRESS_DEFAULT is not set -# CONFIG_PSTORE_LZO_COMPRESS_DEFAULT is not set -# CONFIG_PSTORE_LZ4_COMPRESS_DEFAULT is not set -# CONFIG_PSTORE_LZ4HC_COMPRESS_DEFAULT is not set -CONFIG_PSTORE_ZSTD_COMPRESS_DEFAULT=y -CONFIG_PSTORE_COMPRESS_DEFAULT="zstd" -# CONFIG_PSTORE_CONSOLE is not set -# CONFIG_PSTORE_PMSG is not set -# CONFIG_PSTORE_FTRACE is not set -CONFIG_PSTORE_RAM=y -CONFIG_PSTORE_ZONE=m -CONFIG_PSTORE_BLK=m -CONFIG_PSTORE_BLK_BLKDEV="" -CONFIG_PSTORE_BLK_KMSG_SIZE=64 -CONFIG_PSTORE_BLK_MAX_REASON=2 -# CONFIG_SYSV_FS is not set -CONFIG_UFS_FS=m -# CONFIG_UFS_FS_WRITE is not set -# CONFIG_UFS_DEBUG is not set -CONFIG_EROFS_FS=m -# CONFIG_EROFS_FS_DEBUG is not set -CONFIG_EROFS_FS_XATTR=y -CONFIG_EROFS_FS_POSIX_ACL=y -CONFIG_EROFS_FS_SECURITY=y 
-CONFIG_EROFS_FS_ZIP=y -CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT=2 -CONFIG_VBOXSF_FS=m -CONFIG_NETWORK_FILESYSTEMS=y -CONFIG_NFS_FS=m -CONFIG_NFS_V2=m -CONFIG_NFS_V3=m -CONFIG_NFS_V3_ACL=y -CONFIG_NFS_V4=m -CONFIG_NFS_SWAP=y -CONFIG_NFS_V4_1=y -CONFIG_NFS_V4_2=y -CONFIG_PNFS_FILE_LAYOUT=m -CONFIG_PNFS_BLOCK=m -CONFIG_PNFS_FLEXFILE_LAYOUT=m -CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN="kernel.org" -CONFIG_NFS_V4_1_MIGRATION=y -CONFIG_NFS_V4_SECURITY_LABEL=y -CONFIG_NFS_FSCACHE=y -# CONFIG_NFS_USE_LEGACY_DNS is not set -CONFIG_NFS_USE_KERNEL_DNS=y -CONFIG_NFS_DEBUG=y -# CONFIG_NFS_DISABLE_UDP_SUPPORT is not set -CONFIG_NFSD=m -CONFIG_NFSD_V2_ACL=y -CONFIG_NFSD_V3=y -CONFIG_NFSD_V3_ACL=y -CONFIG_NFSD_V4=y -CONFIG_NFSD_PNFS=y -CONFIG_NFSD_BLOCKLAYOUT=y -CONFIG_NFSD_SCSILAYOUT=y -# CONFIG_NFSD_FLEXFILELAYOUT is not set -CONFIG_NFSD_V4_SECURITY_LABEL=y -CONFIG_GRACE_PERIOD=m -CONFIG_LOCKD=m -CONFIG_LOCKD_V4=y -CONFIG_NFS_ACL_SUPPORT=m -CONFIG_NFS_COMMON=y -CONFIG_SUNRPC=m -CONFIG_SUNRPC_GSS=m -CONFIG_SUNRPC_BACKCHANNEL=y -CONFIG_SUNRPC_SWAP=y -CONFIG_RPCSEC_GSS_KRB5=m -CONFIG_SUNRPC_DISABLE_INSECURE_ENCTYPES=y -CONFIG_SUNRPC_DEBUG=y -CONFIG_SUNRPC_XPRT_RDMA=m -CONFIG_CEPH_FS=m -CONFIG_CEPH_FSCACHE=y -CONFIG_CEPH_FS_POSIX_ACL=y -CONFIG_CEPH_FS_SECURITY_LABEL=y -CONFIG_CIFS=m -# CONFIG_CIFS_STATS2 is not set -CONFIG_CIFS_ALLOW_INSECURE_LEGACY=y -# CONFIG_CIFS_WEAK_PW_HASH is not set -CONFIG_CIFS_UPCALL=y -CONFIG_CIFS_XATTR=y -CONFIG_CIFS_POSIX=y -CONFIG_CIFS_DEBUG=y -# CONFIG_CIFS_DEBUG2 is not set -# CONFIG_CIFS_DEBUG_DUMP_KEYS is not set -CONFIG_CIFS_DFS_UPCALL=y -# CONFIG_CIFS_SMB_DIRECT is not set -CONFIG_CIFS_FSCACHE=y -CONFIG_CODA_FS=m -CONFIG_AFS_FS=m -# CONFIG_AFS_DEBUG is not set -CONFIG_AFS_FSCACHE=y -# CONFIG_AFS_DEBUG_CURSOR is not set -CONFIG_9P_FS=m -CONFIG_9P_FSCACHE=y -CONFIG_9P_FS_POSIX_ACL=y -CONFIG_9P_FS_SECURITY=y -CONFIG_NLS=y -CONFIG_NLS_DEFAULT="utf8" -CONFIG_NLS_CODEPAGE_437=m -CONFIG_NLS_CODEPAGE_737=m -CONFIG_NLS_CODEPAGE_775=m -CONFIG_NLS_CODEPAGE_850=m 
-CONFIG_NLS_CODEPAGE_852=m -CONFIG_NLS_CODEPAGE_855=m -CONFIG_NLS_CODEPAGE_857=m -CONFIG_NLS_CODEPAGE_860=m -CONFIG_NLS_CODEPAGE_861=m -CONFIG_NLS_CODEPAGE_862=m -CONFIG_NLS_CODEPAGE_863=m -CONFIG_NLS_CODEPAGE_864=m -CONFIG_NLS_CODEPAGE_865=m -CONFIG_NLS_CODEPAGE_866=m -CONFIG_NLS_CODEPAGE_869=m -CONFIG_NLS_CODEPAGE_936=m -CONFIG_NLS_CODEPAGE_950=m -CONFIG_NLS_CODEPAGE_932=m -CONFIG_NLS_CODEPAGE_949=m -CONFIG_NLS_CODEPAGE_874=m -CONFIG_NLS_ISO8859_8=m -CONFIG_NLS_CODEPAGE_1250=m -CONFIG_NLS_CODEPAGE_1251=m -CONFIG_NLS_ASCII=m -CONFIG_NLS_ISO8859_1=m -CONFIG_NLS_ISO8859_2=m -CONFIG_NLS_ISO8859_3=m -CONFIG_NLS_ISO8859_4=m -CONFIG_NLS_ISO8859_5=m -CONFIG_NLS_ISO8859_6=m -CONFIG_NLS_ISO8859_7=m -CONFIG_NLS_ISO8859_9=m -CONFIG_NLS_ISO8859_13=m -CONFIG_NLS_ISO8859_14=m -CONFIG_NLS_ISO8859_15=m -CONFIG_NLS_KOI8_R=m -CONFIG_NLS_KOI8_U=m -CONFIG_NLS_MAC_ROMAN=m -CONFIG_NLS_MAC_CELTIC=m -CONFIG_NLS_MAC_CENTEURO=m -CONFIG_NLS_MAC_CROATIAN=m -CONFIG_NLS_MAC_CYRILLIC=m -CONFIG_NLS_MAC_GAELIC=m -CONFIG_NLS_MAC_GREEK=m -CONFIG_NLS_MAC_ICELAND=m -CONFIG_NLS_MAC_INUIT=m -CONFIG_NLS_MAC_ROMANIAN=m -CONFIG_NLS_MAC_TURKISH=m -CONFIG_NLS_UTF8=m -CONFIG_DLM=m -# CONFIG_DLM_DEBUG is not set -CONFIG_UNICODE=y -# CONFIG_UNICODE_NORMALIZATION_SELFTEST is not set -CONFIG_IO_WQ=y -# end of File systems - -# -# Security options -# -CONFIG_KEYS=y -CONFIG_KEYS_REQUEST_CACHE=y -CONFIG_PERSISTENT_KEYRINGS=y -CONFIG_TRUSTED_KEYS=m -CONFIG_ENCRYPTED_KEYS=m -CONFIG_KEY_DH_OPERATIONS=y -CONFIG_KEY_NOTIFICATIONS=y -# CONFIG_SECURITY_DMESG_RESTRICT is not set -CONFIG_SECURITY=y -CONFIG_SECURITYFS=y -CONFIG_SECURITY_NETWORK=y -CONFIG_PAGE_TABLE_ISOLATION=y -CONFIG_SECURITY_INFINIBAND=y -CONFIG_SECURITY_NETWORK_XFRM=y -CONFIG_SECURITY_PATH=y -# CONFIG_INTEL_TXT is not set -CONFIG_LSM_MMAP_MIN_ADDR=65536 -CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR=y -CONFIG_HARDENED_USERCOPY=y -CONFIG_HARDENED_USERCOPY_FALLBACK=y -# CONFIG_HARDENED_USERCOPY_PAGESPAN is not set -CONFIG_FORTIFY_SOURCE=y -# 
CONFIG_STATIC_USERMODEHELPER is not set -CONFIG_SECURITY_SELINUX=y -CONFIG_SECURITY_SELINUX_BOOTPARAM=y -# CONFIG_SECURITY_SELINUX_DISABLE is not set -CONFIG_SECURITY_SELINUX_DEVELOP=y -CONFIG_SECURITY_SELINUX_AVC_STATS=y -CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=0 -CONFIG_SECURITY_SELINUX_SIDTAB_HASH_BITS=9 -CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE=256 -CONFIG_SECURITY_SMACK=y -CONFIG_SECURITY_SMACK_BRINGUP=y -CONFIG_SECURITY_SMACK_NETFILTER=y -CONFIG_SECURITY_SMACK_APPEND_SIGNALS=y -CONFIG_SECURITY_TOMOYO=y -CONFIG_SECURITY_TOMOYO_MAX_ACCEPT_ENTRY=2048 -CONFIG_SECURITY_TOMOYO_MAX_AUDIT_LOG=1024 -# CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER is not set -CONFIG_SECURITY_TOMOYO_POLICY_LOADER="/sbin/tomoyo-init" -CONFIG_SECURITY_TOMOYO_ACTIVATION_TRIGGER="/sbin/init" -# CONFIG_SECURITY_TOMOYO_INSECURE_BUILTIN_SETTING is not set -CONFIG_SECURITY_APPARMOR=y -CONFIG_SECURITY_APPARMOR_HASH=y -CONFIG_SECURITY_APPARMOR_HASH_DEFAULT=y -# CONFIG_SECURITY_APPARMOR_DEBUG is not set -# CONFIG_SECURITY_LOADPIN is not set -CONFIG_SECURITY_YAMA=y -CONFIG_SECURITY_SAFESETID=y -CONFIG_SECURITY_LOCKDOWN_LSM=y -# CONFIG_SECURITY_LOCKDOWN_LSM_EARLY is not set -CONFIG_LOCK_DOWN_KERNEL_FORCE_NONE=y -# CONFIG_LOCK_DOWN_KERNEL_FORCE_INTEGRITY is not set -# CONFIG_LOCK_DOWN_KERNEL_FORCE_CONFIDENTIALITY is not set -# CONFIG_INTEGRITY is not set -# CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT is not set -# CONFIG_DEFAULT_SECURITY_SELINUX is not set -# CONFIG_DEFAULT_SECURITY_SMACK is not set -# CONFIG_DEFAULT_SECURITY_TOMOYO is not set -# CONFIG_DEFAULT_SECURITY_APPARMOR is not set -CONFIG_DEFAULT_SECURITY_DAC=y -CONFIG_LSM="lockdown,yama" - -# -# Kernel hardening options -# -CONFIG_GCC_PLUGIN_STRUCTLEAK=y - -# -# Memory initialization -# -# CONFIG_INIT_STACK_NONE is not set -# CONFIG_GCC_PLUGIN_STRUCTLEAK_USER is not set -# CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF is not set -CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL=y -# CONFIG_GCC_PLUGIN_STRUCTLEAK_VERBOSE is not set -# CONFIG_GCC_PLUGIN_STACKLEAK is 
not set -CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y -# CONFIG_INIT_ON_FREE_DEFAULT_ON is not set -# end of Memory initialization -# end of Kernel hardening options -# end of Security options - -CONFIG_XOR_BLOCKS=m -CONFIG_ASYNC_CORE=m -CONFIG_ASYNC_MEMCPY=m -CONFIG_ASYNC_XOR=m -CONFIG_ASYNC_PQ=m -CONFIG_ASYNC_RAID6_RECOV=m -CONFIG_CRYPTO=y - -# -# Crypto core or helper -# -CONFIG_CRYPTO_ALGAPI=y -CONFIG_CRYPTO_ALGAPI2=y -CONFIG_CRYPTO_AEAD=y -CONFIG_CRYPTO_AEAD2=y -CONFIG_CRYPTO_SKCIPHER=y -CONFIG_CRYPTO_SKCIPHER2=y -CONFIG_CRYPTO_HASH=y -CONFIG_CRYPTO_HASH2=y -CONFIG_CRYPTO_RNG=y -CONFIG_CRYPTO_RNG2=y -CONFIG_CRYPTO_RNG_DEFAULT=y -CONFIG_CRYPTO_AKCIPHER2=y -CONFIG_CRYPTO_AKCIPHER=y -CONFIG_CRYPTO_KPP2=y -CONFIG_CRYPTO_KPP=y -CONFIG_CRYPTO_ACOMP2=y -CONFIG_CRYPTO_MANAGER=y -CONFIG_CRYPTO_MANAGER2=y -CONFIG_CRYPTO_USER=m -CONFIG_CRYPTO_MANAGER_DISABLE_TESTS=y -CONFIG_CRYPTO_GF128MUL=y -CONFIG_CRYPTO_NULL=y -CONFIG_CRYPTO_NULL2=y -CONFIG_CRYPTO_PCRYPT=m -CONFIG_CRYPTO_CRYPTD=m -CONFIG_CRYPTO_AUTHENC=m -CONFIG_CRYPTO_TEST=m -CONFIG_CRYPTO_SIMD=m -CONFIG_CRYPTO_GLUE_HELPER_X86=m -CONFIG_CRYPTO_ENGINE=m - -# -# Public-key cryptography -# -CONFIG_CRYPTO_RSA=y -CONFIG_CRYPTO_DH=y -CONFIG_CRYPTO_ECC=m -CONFIG_CRYPTO_ECDH=m -CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_CURVE25519=m -CONFIG_CRYPTO_CURVE25519_X86=m - -# -# Authenticated Encryption with Associated Data -# -CONFIG_CRYPTO_CCM=m -CONFIG_CRYPTO_GCM=y -CONFIG_CRYPTO_CHACHA20POLY1305=m -CONFIG_CRYPTO_AEGIS128=m -CONFIG_CRYPTO_AEGIS128_AESNI_SSE2=m -CONFIG_CRYPTO_SEQIV=y -CONFIG_CRYPTO_ECHAINIV=m - -# -# Block modes -# -CONFIG_CRYPTO_CBC=m -CONFIG_CRYPTO_CFB=m -CONFIG_CRYPTO_CTR=y -CONFIG_CRYPTO_CTS=m -CONFIG_CRYPTO_ECB=m -CONFIG_CRYPTO_LRW=m -CONFIG_CRYPTO_OFB=m -CONFIG_CRYPTO_PCBC=m -CONFIG_CRYPTO_XTS=m -CONFIG_CRYPTO_KEYWRAP=m -CONFIG_CRYPTO_NHPOLY1305=m -CONFIG_CRYPTO_NHPOLY1305_SSE2=m -CONFIG_CRYPTO_NHPOLY1305_AVX2=m -CONFIG_CRYPTO_ADIANTUM=m -CONFIG_CRYPTO_ESSIV=m - -# -# Hash modes -# -CONFIG_CRYPTO_CMAC=m 
-CONFIG_CRYPTO_HMAC=y -CONFIG_CRYPTO_XCBC=m -CONFIG_CRYPTO_VMAC=m - -# -# Digest -# -CONFIG_CRYPTO_CRC32C=m -CONFIG_CRYPTO_CRC32C_INTEL=m -CONFIG_CRYPTO_CRC32=m -CONFIG_CRYPTO_CRC32_PCLMUL=m -CONFIG_CRYPTO_XXHASH=m -CONFIG_CRYPTO_BLAKE2B=m -CONFIG_CRYPTO_BLAKE2S=m -CONFIG_CRYPTO_BLAKE2S_X86=m -CONFIG_CRYPTO_CRCT10DIF=y -CONFIG_CRYPTO_CRCT10DIF_PCLMUL=m -CONFIG_CRYPTO_GHASH=y -CONFIG_CRYPTO_POLY1305=m -CONFIG_CRYPTO_POLY1305_X86_64=m -CONFIG_CRYPTO_MD4=m -CONFIG_CRYPTO_MD5=y -CONFIG_CRYPTO_MICHAEL_MIC=m -CONFIG_CRYPTO_RMD128=m -CONFIG_CRYPTO_RMD160=m -CONFIG_CRYPTO_RMD256=m -CONFIG_CRYPTO_RMD320=m -CONFIG_CRYPTO_SHA1=y -CONFIG_CRYPTO_SHA1_SSSE3=m -CONFIG_CRYPTO_SHA256_SSSE3=m -CONFIG_CRYPTO_SHA512_SSSE3=m -CONFIG_CRYPTO_SHA256=y -CONFIG_CRYPTO_SHA512=y -CONFIG_CRYPTO_SHA3=m -CONFIG_CRYPTO_SM3=m -CONFIG_CRYPTO_STREEBOG=m -CONFIG_CRYPTO_TGR192=m -CONFIG_CRYPTO_WP512=m -CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL=m - -# -# Ciphers -# -CONFIG_CRYPTO_AES=y -CONFIG_CRYPTO_AES_TI=m -CONFIG_CRYPTO_AES_NI_INTEL=m -CONFIG_CRYPTO_ANUBIS=m -CONFIG_CRYPTO_ARC4=m -CONFIG_CRYPTO_BLOWFISH=m -CONFIG_CRYPTO_BLOWFISH_COMMON=m -CONFIG_CRYPTO_BLOWFISH_X86_64=m -CONFIG_CRYPTO_CAMELLIA=m -CONFIG_CRYPTO_CAMELLIA_X86_64=m -CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64=m -CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64=m -CONFIG_CRYPTO_CAST_COMMON=m -CONFIG_CRYPTO_CAST5=m -CONFIG_CRYPTO_CAST5_AVX_X86_64=m -CONFIG_CRYPTO_CAST6=m -CONFIG_CRYPTO_CAST6_AVX_X86_64=m -CONFIG_CRYPTO_DES=m -CONFIG_CRYPTO_DES3_EDE_X86_64=m -CONFIG_CRYPTO_FCRYPT=m -CONFIG_CRYPTO_KHAZAD=m -CONFIG_CRYPTO_SALSA20=m -CONFIG_CRYPTO_CHACHA20=m -CONFIG_CRYPTO_CHACHA20_X86_64=m -CONFIG_CRYPTO_SEED=m -CONFIG_CRYPTO_SERPENT=m -CONFIG_CRYPTO_SERPENT_SSE2_X86_64=m -CONFIG_CRYPTO_SERPENT_AVX_X86_64=m -CONFIG_CRYPTO_SERPENT_AVX2_X86_64=m -CONFIG_CRYPTO_SM4=m -CONFIG_CRYPTO_TEA=m -CONFIG_CRYPTO_TWOFISH=m -CONFIG_CRYPTO_TWOFISH_COMMON=m -CONFIG_CRYPTO_TWOFISH_X86_64=m -CONFIG_CRYPTO_TWOFISH_X86_64_3WAY=m -CONFIG_CRYPTO_TWOFISH_AVX_X86_64=m - -# -# 
Compression -# -CONFIG_CRYPTO_DEFLATE=m -CONFIG_CRYPTO_LZO=m -CONFIG_CRYPTO_842=m -CONFIG_CRYPTO_LZ4=y -CONFIG_CRYPTO_LZ4HC=m -CONFIG_CRYPTO_ZSTD=y - -# -# Random Number Generation -# -CONFIG_CRYPTO_ANSI_CPRNG=m -CONFIG_CRYPTO_DRBG_MENU=y -CONFIG_CRYPTO_DRBG_HMAC=y -CONFIG_CRYPTO_DRBG_HASH=y -CONFIG_CRYPTO_DRBG_CTR=y -CONFIG_CRYPTO_DRBG=y -CONFIG_CRYPTO_JITTERENTROPY=y -CONFIG_CRYPTO_USER_API=m -CONFIG_CRYPTO_USER_API_HASH=m -CONFIG_CRYPTO_USER_API_SKCIPHER=m -CONFIG_CRYPTO_USER_API_RNG=m -CONFIG_CRYPTO_USER_API_AEAD=m -# CONFIG_CRYPTO_STATS is not set -CONFIG_CRYPTO_HASH_INFO=y - -# -# Crypto library routines -# -CONFIG_CRYPTO_LIB_AES=y -CONFIG_CRYPTO_LIB_ARC4=m -CONFIG_CRYPTO_ARCH_HAVE_LIB_BLAKE2S=m -CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC=m -CONFIG_CRYPTO_LIB_BLAKE2S=m -CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA=m -CONFIG_CRYPTO_LIB_CHACHA_GENERIC=m -CONFIG_CRYPTO_LIB_CHACHA=m -CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519=m -CONFIG_CRYPTO_LIB_CURVE25519_GENERIC=m -CONFIG_CRYPTO_LIB_CURVE25519=m -CONFIG_CRYPTO_LIB_DES=m -CONFIG_CRYPTO_LIB_POLY1305_RSIZE=11 -CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305=m -CONFIG_CRYPTO_LIB_POLY1305_GENERIC=m -CONFIG_CRYPTO_LIB_POLY1305=m -CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m -CONFIG_CRYPTO_LIB_SHA256=y -CONFIG_CRYPTO_HW=y -CONFIG_CRYPTO_DEV_PADLOCK=m -CONFIG_CRYPTO_DEV_PADLOCK_AES=m -CONFIG_CRYPTO_DEV_PADLOCK_SHA=m -CONFIG_CRYPTO_DEV_ATMEL_I2C=m -CONFIG_CRYPTO_DEV_ATMEL_ECC=m -CONFIG_CRYPTO_DEV_ATMEL_SHA204A=m -CONFIG_CRYPTO_DEV_CCP=y -CONFIG_CRYPTO_DEV_CCP_DD=m -CONFIG_CRYPTO_DEV_SP_CCP=y -CONFIG_CRYPTO_DEV_CCP_CRYPTO=m -CONFIG_CRYPTO_DEV_SP_PSP=y -CONFIG_CRYPTO_DEV_CCP_DEBUGFS=y -CONFIG_CRYPTO_DEV_QAT=m -CONFIG_CRYPTO_DEV_QAT_DH895xCC=m -CONFIG_CRYPTO_DEV_QAT_C3XXX=m -CONFIG_CRYPTO_DEV_QAT_C62X=m -CONFIG_CRYPTO_DEV_QAT_DH895xCCVF=m -CONFIG_CRYPTO_DEV_QAT_C3XXXVF=m -CONFIG_CRYPTO_DEV_QAT_C62XVF=m -CONFIG_CRYPTO_DEV_NITROX=m -CONFIG_CRYPTO_DEV_NITROX_CNN55XX=m -CONFIG_CRYPTO_DEV_CHELSIO=m -CONFIG_CHELSIO_IPSEC_INLINE=y -CONFIG_CHELSIO_TLS_DEVICE=y 
-CONFIG_CRYPTO_DEV_VIRTIO=m -CONFIG_CRYPTO_DEV_SAFEXCEL=m -CONFIG_CRYPTO_DEV_CCREE=m -CONFIG_CRYPTO_DEV_AMLOGIC_GXL=m -CONFIG_CRYPTO_DEV_AMLOGIC_GXL_DEBUG=y -CONFIG_ASYMMETRIC_KEY_TYPE=y -CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y -CONFIG_ASYMMETRIC_TPM_KEY_SUBTYPE=m -CONFIG_X509_CERTIFICATE_PARSER=y -CONFIG_PKCS8_PRIVATE_KEY_PARSER=m -CONFIG_TPM_KEY_PARSER=m -CONFIG_PKCS7_MESSAGE_PARSER=y -# CONFIG_PKCS7_TEST_KEY is not set -CONFIG_SIGNED_PE_FILE_VERIFICATION=y - -# -# Certificates for signature checking -# -CONFIG_MODULE_SIG_KEY="certs/signing_key.pem" -CONFIG_SYSTEM_TRUSTED_KEYRING=y -CONFIG_SYSTEM_TRUSTED_KEYS="" -# CONFIG_SYSTEM_EXTRA_CERTIFICATE is not set -CONFIG_SECONDARY_TRUSTED_KEYRING=y -CONFIG_SYSTEM_BLACKLIST_KEYRING=y -CONFIG_SYSTEM_BLACKLIST_HASH_LIST="" -# end of Certificates for signature checking - -CONFIG_BINARY_PRINTF=y - -# -# Library routines -# -CONFIG_RAID6_PQ=m -CONFIG_RAID6_PQ_BENCHMARK=y -CONFIG_LINEAR_RANGES=y -CONFIG_PACKING=y -CONFIG_BITREVERSE=y -CONFIG_GENERIC_STRNCPY_FROM_USER=y -CONFIG_GENERIC_STRNLEN_USER=y -CONFIG_GENERIC_NET_UTILS=y -CONFIG_GENERIC_FIND_FIRST_BIT=y -CONFIG_CORDIC=m -# CONFIG_PRIME_NUMBERS is not set -CONFIG_RATIONAL=y -CONFIG_GENERIC_PCI_IOMAP=y -CONFIG_GENERIC_IOMAP=y -CONFIG_ARCH_USE_CMPXCHG_LOCKREF=y -CONFIG_ARCH_HAS_FAST_MULTIPLIER=y -CONFIG_ARCH_USE_SYM_ANNOTATIONS=y -CONFIG_CRC_CCITT=y -CONFIG_CRC16=m -CONFIG_CRC_T10DIF=y -CONFIG_CRC_ITU_T=m -CONFIG_CRC32=y -# CONFIG_CRC32_SELFTEST is not set -CONFIG_CRC32_SLICEBY8=y -# CONFIG_CRC32_SLICEBY4 is not set -# CONFIG_CRC32_SARWATE is not set -# CONFIG_CRC32_BIT is not set -CONFIG_CRC64=m -CONFIG_CRC4=m -CONFIG_CRC7=m -CONFIG_LIBCRC32C=m -CONFIG_CRC8=m -CONFIG_XXHASH=y -# CONFIG_RANDOM32_SELFTEST is not set -CONFIG_842_COMPRESS=m -CONFIG_842_DECOMPRESS=m -CONFIG_ZLIB_INFLATE=y -CONFIG_ZLIB_DEFLATE=y -CONFIG_LZO_COMPRESS=y -CONFIG_LZO_DECOMPRESS=y -CONFIG_LZ4_COMPRESS=y -CONFIG_LZ4HC_COMPRESS=m -CONFIG_LZ4_DECOMPRESS=y -CONFIG_ZSTD_COMPRESS=y 
-CONFIG_ZSTD_DECOMPRESS=y -CONFIG_XZ_DEC=y -CONFIG_XZ_DEC_X86=y -CONFIG_XZ_DEC_POWERPC=y -CONFIG_XZ_DEC_IA64=y -CONFIG_XZ_DEC_ARM=y -CONFIG_XZ_DEC_ARMTHUMB=y -CONFIG_XZ_DEC_SPARC=y -CONFIG_XZ_DEC_BCJ=y -# CONFIG_XZ_DEC_TEST is not set -CONFIG_DECOMPRESS_GZIP=y -CONFIG_DECOMPRESS_BZIP2=y -CONFIG_DECOMPRESS_LZMA=y -CONFIG_DECOMPRESS_XZ=y -CONFIG_DECOMPRESS_LZO=y -CONFIG_DECOMPRESS_LZ4=y -CONFIG_GENERIC_ALLOCATOR=y -CONFIG_REED_SOLOMON=y -CONFIG_REED_SOLOMON_ENC8=y -CONFIG_REED_SOLOMON_DEC8=y -CONFIG_REED_SOLOMON_DEC16=y -CONFIG_BCH=m -CONFIG_TEXTSEARCH=y -CONFIG_TEXTSEARCH_KMP=m -CONFIG_TEXTSEARCH_BM=m -CONFIG_TEXTSEARCH_FSM=m -CONFIG_BTREE=y -CONFIG_INTERVAL_TREE=y -CONFIG_XARRAY_MULTI=y -CONFIG_ASSOCIATIVE_ARRAY=y -CONFIG_HAS_IOMEM=y -CONFIG_HAS_IOPORT_MAP=y -CONFIG_HAS_DMA=y -CONFIG_NEED_SG_DMA_LENGTH=y -CONFIG_NEED_DMA_MAP_STATE=y -CONFIG_ARCH_DMA_ADDR_T_64BIT=y -CONFIG_DMA_DECLARE_COHERENT=y -CONFIG_ARCH_HAS_FORCE_DMA_UNENCRYPTED=y -CONFIG_DMA_VIRT_OPS=y -CONFIG_SWIOTLB=y -CONFIG_DMA_COHERENT_POOL=y -# CONFIG_DMA_API_DEBUG is not set -CONFIG_SGL_ALLOC=y -CONFIG_IOMMU_HELPER=y -CONFIG_CHECK_SIGNATURE=y -CONFIG_CPU_RMAP=y -CONFIG_DQL=y -CONFIG_GLOB=y -# CONFIG_GLOB_SELFTEST is not set -CONFIG_NLATTR=y -CONFIG_LRU_CACHE=m -CONFIG_CLZ_TAB=y -CONFIG_IRQ_POLL=y -CONFIG_MPILIB=y -CONFIG_DIMLIB=y -CONFIG_LIBFDT=y -CONFIG_OID_REGISTRY=y -CONFIG_UCS2_STRING=y -CONFIG_HAVE_GENERIC_VDSO=y -CONFIG_GENERIC_GETTIMEOFDAY=y -CONFIG_GENERIC_VDSO_TIME_NS=y -CONFIG_FONT_SUPPORT=y -CONFIG_FONTS=y -# CONFIG_FONT_8x8 is not set -CONFIG_FONT_8x16=y -# CONFIG_FONT_6x11 is not set -# CONFIG_FONT_7x14 is not set -# CONFIG_FONT_PEARL_8x8 is not set -# CONFIG_FONT_ACORN_8x8 is not set -# CONFIG_FONT_MINI_4x6 is not set -# CONFIG_FONT_6x10 is not set -# CONFIG_FONT_10x18 is not set -# CONFIG_FONT_SUN8x16 is not set -# CONFIG_FONT_SUN12x22 is not set -CONFIG_FONT_TER16x32=y -CONFIG_SG_POOL=y -CONFIG_ARCH_HAS_PMEM_API=y -CONFIG_MEMREGION=y -CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE=y 
-CONFIG_ARCH_HAS_UACCESS_MCSAFE=y -CONFIG_ARCH_STACKWALK=y -CONFIG_SBITMAP=y -CONFIG_PARMAN=m -CONFIG_OBJAGG=m -# CONFIG_STRING_SELFTEST is not set -# end of Library routines - -# -# Kernel hacking -# - -# -# printk and dmesg options -# -CONFIG_PRINTK_TIME=y -# CONFIG_PRINTK_CALLER is not set -CONFIG_CONSOLE_LOGLEVEL_DEFAULT=4 -CONFIG_CONSOLE_LOGLEVEL_QUIET=1 -CONFIG_MESSAGE_LOGLEVEL_DEFAULT=4 -# CONFIG_BOOT_PRINTK_DELAY is not set -CONFIG_DYNAMIC_DEBUG=y -CONFIG_DYNAMIC_DEBUG_CORE=y -CONFIG_SYMBOLIC_ERRNAME=y -CONFIG_DEBUG_BUGVERBOSE=y -# end of printk and dmesg options - -# -# Compile-time checks and compiler options -# -CONFIG_DEBUG_INFO=y -# CONFIG_DEBUG_INFO_REDUCED is not set -# CONFIG_DEBUG_INFO_COMPRESSED is not set -# CONFIG_DEBUG_INFO_SPLIT is not set -CONFIG_DEBUG_INFO_DWARF4=y -CONFIG_DEBUG_INFO_BTF=y -# CONFIG_GDB_SCRIPTS is not set -# CONFIG_ENABLE_MUST_CHECK is not set -CONFIG_FRAME_WARN=2048 -CONFIG_STRIP_ASM_SYMS=y -# CONFIG_READABLE_ASM is not set -# CONFIG_HEADERS_INSTALL is not set -# CONFIG_DEBUG_SECTION_MISMATCH is not set -CONFIG_SECTION_MISMATCH_WARN_ONLY=y -CONFIG_STACK_VALIDATION=y -# CONFIG_DEBUG_FORCE_WEAK_PER_CPU is not set -# end of Compile-time checks and compiler options - -# -# Generic Kernel Debugging Instruments -# -CONFIG_MAGIC_SYSRQ=y -CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE=0x0 -CONFIG_MAGIC_SYSRQ_SERIAL=y -CONFIG_MAGIC_SYSRQ_SERIAL_SEQUENCE="" -CONFIG_DEBUG_FS=y -CONFIG_HAVE_ARCH_KGDB=y -# CONFIG_KGDB is not set -CONFIG_ARCH_HAS_UBSAN_SANITIZE_ALL=y -# CONFIG_UBSAN is not set -# end of Generic Kernel Debugging Instruments - -CONFIG_DEBUG_KERNEL=y -CONFIG_DEBUG_MISC=y - -# -# Memory Debugging -# -# CONFIG_PAGE_EXTENSION is not set -# CONFIG_DEBUG_PAGEALLOC is not set -# CONFIG_PAGE_OWNER is not set -CONFIG_PAGE_POISONING=y -CONFIG_PAGE_POISONING_NO_SANITY=y -CONFIG_PAGE_POISONING_ZERO=y -# CONFIG_DEBUG_PAGE_REF is not set -# CONFIG_DEBUG_RODATA_TEST is not set -CONFIG_ARCH_HAS_DEBUG_WX=y -CONFIG_DEBUG_WX=y -CONFIG_GENERIC_PTDUMP=y 
-CONFIG_PTDUMP_CORE=y -# CONFIG_PTDUMP_DEBUGFS is not set -# CONFIG_DEBUG_OBJECTS is not set -# CONFIG_SLUB_DEBUG_ON is not set -# CONFIG_SLUB_STATS is not set -CONFIG_HAVE_DEBUG_KMEMLEAK=y -# CONFIG_DEBUG_KMEMLEAK is not set -# CONFIG_DEBUG_STACK_USAGE is not set -CONFIG_SCHED_STACK_END_CHECK=y -CONFIG_ARCH_HAS_DEBUG_VM_PGTABLE=y -# CONFIG_DEBUG_VM is not set -# CONFIG_DEBUG_VM_PGTABLE is not set -CONFIG_ARCH_HAS_DEBUG_VIRTUAL=y -# CONFIG_DEBUG_VIRTUAL is not set -CONFIG_DEBUG_MEMORY_INIT=y -# CONFIG_DEBUG_PER_CPU_MAPS is not set -CONFIG_HAVE_ARCH_KASAN=y -CONFIG_HAVE_ARCH_KASAN_VMALLOC=y -CONFIG_CC_HAS_KASAN_GENERIC=y -CONFIG_CC_HAS_WORKING_NOSANITIZE_ADDRESS=y -# CONFIG_KASAN is not set -CONFIG_KASAN_STACK=1 -# end of Memory Debugging - -# CONFIG_DEBUG_SHIRQ is not set - -# -# Debug Oops, Lockups and Hangs -# -# CONFIG_PANIC_ON_OOPS is not set -CONFIG_PANIC_ON_OOPS_VALUE=0 -CONFIG_PANIC_TIMEOUT=0 -CONFIG_LOCKUP_DETECTOR=y -CONFIG_SOFTLOCKUP_DETECTOR=y -# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set -CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0 -CONFIG_HARDLOCKUP_DETECTOR_PERF=y -CONFIG_HARDLOCKUP_CHECK_TIMESTAMP=y -CONFIG_HARDLOCKUP_DETECTOR=y -# CONFIG_BOOTPARAM_HARDLOCKUP_PANIC is not set -CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE=0 -CONFIG_DETECT_HUNG_TASK=y -CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=120 -# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set -CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0 -# CONFIG_WQ_WATCHDOG is not set -# CONFIG_TEST_LOCKUP is not set -# end of Debug Oops, Lockups and Hangs - -# -# Scheduler Debugging -# -CONFIG_SCHED_DEBUG=y -CONFIG_SCHED_INFO=y -CONFIG_SCHEDSTATS=y -# end of Scheduler Debugging - -# CONFIG_DEBUG_TIMEKEEPING is not set -CONFIG_DEBUG_PREEMPT=y - -# -# Lock Debugging (spinlocks, mutexes, etc...) 
-# -CONFIG_LOCK_DEBUGGING_SUPPORT=y -# CONFIG_PROVE_LOCKING is not set -# CONFIG_LOCK_STAT is not set -# CONFIG_DEBUG_RT_MUTEXES is not set -# CONFIG_DEBUG_SPINLOCK is not set -# CONFIG_DEBUG_MUTEXES is not set -# CONFIG_DEBUG_WW_MUTEX_SLOWPATH is not set -# CONFIG_DEBUG_RWSEMS is not set -# CONFIG_DEBUG_LOCK_ALLOC is not set -# CONFIG_DEBUG_ATOMIC_SLEEP is not set -# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set -# CONFIG_LOCK_TORTURE_TEST is not set -# CONFIG_WW_MUTEX_SELFTEST is not set -# end of Lock Debugging (spinlocks, mutexes, etc...) - -CONFIG_STACKTRACE=y -# CONFIG_WARN_ALL_UNSEEDED_RANDOM is not set -# CONFIG_DEBUG_KOBJECT is not set - -# -# Debug kernel data structures -# -# CONFIG_DEBUG_LIST is not set -# CONFIG_DEBUG_PLIST is not set -# CONFIG_DEBUG_SG is not set -# CONFIG_DEBUG_NOTIFIERS is not set -# CONFIG_BUG_ON_DATA_CORRUPTION is not set -# end of Debug kernel data structures - -# CONFIG_DEBUG_CREDENTIALS is not set - -# -# RCU Debugging -# -# CONFIG_RCU_PERF_TEST is not set -# CONFIG_RCU_TORTURE_TEST is not set -CONFIG_RCU_CPU_STALL_TIMEOUT=60 -# CONFIG_RCU_TRACE is not set -# CONFIG_RCU_EQS_DEBUG is not set -# end of RCU Debugging - -# CONFIG_DEBUG_WQ_FORCE_RR_CPU is not set -# CONFIG_DEBUG_BLOCK_EXT_DEVT is not set -# CONFIG_CPU_HOTPLUG_STATE_CONTROL is not set -CONFIG_LATENCYTOP=y -CONFIG_USER_STACKTRACE_SUPPORT=y -CONFIG_NOP_TRACER=y -CONFIG_HAVE_FUNCTION_TRACER=y -CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y -CONFIG_HAVE_DYNAMIC_FTRACE=y -CONFIG_HAVE_DYNAMIC_FTRACE_WITH_REGS=y -CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS=y -CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y -CONFIG_HAVE_SYSCALL_TRACEPOINTS=y -CONFIG_HAVE_FENTRY=y -CONFIG_HAVE_C_RECORDMCOUNT=y -CONFIG_TRACER_MAX_TRACE=y -CONFIG_TRACE_CLOCK=y -CONFIG_RING_BUFFER=y -CONFIG_EVENT_TRACING=y -CONFIG_CONTEXT_SWITCH_TRACER=y -CONFIG_RING_BUFFER_ALLOW_SWAP=y -CONFIG_TRACING=y -CONFIG_GENERIC_TRACER=y -CONFIG_TRACING_SUPPORT=y -CONFIG_FTRACE=y -# CONFIG_BOOTTIME_TRACING is not set -CONFIG_FUNCTION_TRACER=y 
-CONFIG_FUNCTION_GRAPH_TRACER=y -CONFIG_DYNAMIC_FTRACE=y -CONFIG_DYNAMIC_FTRACE_WITH_REGS=y -CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS=y -CONFIG_FUNCTION_PROFILER=y -CONFIG_STACK_TRACER=y -# CONFIG_IRQSOFF_TRACER is not set -# CONFIG_PREEMPT_TRACER is not set -CONFIG_SCHED_TRACER=y -CONFIG_HWLAT_TRACER=y -CONFIG_MMIOTRACE=y -CONFIG_FTRACE_SYSCALLS=y -CONFIG_TRACER_SNAPSHOT=y -# CONFIG_TRACER_SNAPSHOT_PER_CPU_SWAP is not set -CONFIG_BRANCH_PROFILE_NONE=y -# CONFIG_PROFILE_ANNOTATED_BRANCHES is not set -CONFIG_BLK_DEV_IO_TRACE=y -CONFIG_KPROBE_EVENTS=y -# CONFIG_KPROBE_EVENTS_ON_NOTRACE is not set -CONFIG_UPROBE_EVENTS=y -CONFIG_BPF_EVENTS=y -CONFIG_DYNAMIC_EVENTS=y -CONFIG_PROBE_EVENTS=y -CONFIG_BPF_KPROBE_OVERRIDE=y -CONFIG_FTRACE_MCOUNT_RECORD=y -CONFIG_TRACING_MAP=y -CONFIG_SYNTH_EVENTS=y -CONFIG_HIST_TRIGGERS=y -# CONFIG_TRACE_EVENT_INJECT is not set -# CONFIG_TRACEPOINT_BENCHMARK is not set -# CONFIG_RING_BUFFER_BENCHMARK is not set -# CONFIG_TRACE_EVAL_MAP_FILE is not set -# CONFIG_FTRACE_STARTUP_TEST is not set -# CONFIG_RING_BUFFER_STARTUP_TEST is not set -# CONFIG_MMIOTRACE_TEST is not set -# CONFIG_PREEMPTIRQ_DELAY_TEST is not set -# CONFIG_SYNTH_EVENT_GEN_TEST is not set -# CONFIG_KPROBE_EVENT_GEN_TEST is not set -# CONFIG_HIST_TRIGGERS_DEBUG is not set -# CONFIG_PROVIDE_OHCI1394_DMA_INIT is not set -# CONFIG_SAMPLES is not set -CONFIG_HAVE_ARCH_KCSAN=y -CONFIG_ARCH_HAS_DEVMEM_IS_ALLOWED=y -CONFIG_STRICT_DEVMEM=y -CONFIG_IO_STRICT_DEVMEM=y - -# -# x86 Debugging -# -CONFIG_TRACE_IRQFLAGS_SUPPORT=y -# CONFIG_X86_VERBOSE_BOOTUP is not set -CONFIG_EARLY_PRINTK=y -# CONFIG_EARLY_PRINTK_DBGP is not set -# CONFIG_EARLY_PRINTK_USB_XDBC is not set -# CONFIG_EFI_PGT_DUMP is not set -# CONFIG_DEBUG_TLBFLUSH is not set -# CONFIG_IOMMU_DEBUG is not set -CONFIG_HAVE_MMIOTRACE_SUPPORT=y -# CONFIG_X86_DECODER_SELFTEST is not set -CONFIG_IO_DELAY_0X80=y -# CONFIG_IO_DELAY_0XED is not set -# CONFIG_IO_DELAY_UDELAY is not set -# CONFIG_IO_DELAY_NONE is not set 
-CONFIG_DEBUG_BOOT_PARAMS=y -# CONFIG_CPA_DEBUG is not set -# CONFIG_DEBUG_ENTRY is not set -# CONFIG_DEBUG_NMI_SELFTEST is not set -# CONFIG_X86_DEBUG_FPU is not set -# CONFIG_PUNIT_ATOM_DEBUG is not set -CONFIG_UNWINDER_ORC=y -# CONFIG_UNWINDER_FRAME_POINTER is not set -# CONFIG_UNWINDER_GUESS is not set -# end of x86 Debugging - -# -# Kernel Testing and Coverage -# -# CONFIG_KUNIT is not set -# CONFIG_NOTIFIER_ERROR_INJECTION is not set -CONFIG_FUNCTION_ERROR_INJECTION=y -# CONFIG_FAULT_INJECTION is not set -CONFIG_ARCH_HAS_KCOV=y -CONFIG_CC_HAS_SANCOV_TRACE_PC=y -# CONFIG_KCOV is not set -CONFIG_RUNTIME_TESTING_MENU=y -CONFIG_LKDTM=m -# CONFIG_TEST_LIST_SORT is not set -# CONFIG_TEST_MIN_HEAP is not set -# CONFIG_TEST_SORT is not set -# CONFIG_KPROBES_SANITY_TEST is not set -# CONFIG_BACKTRACE_SELF_TEST is not set -# CONFIG_RBTREE_TEST is not set -# CONFIG_REED_SOLOMON_TEST is not set -# CONFIG_INTERVAL_TREE_TEST is not set -# CONFIG_PERCPU_TEST is not set -# CONFIG_ATOMIC64_SELFTEST is not set -# CONFIG_ASYNC_RAID6_TEST is not set -# CONFIG_TEST_HEXDUMP is not set -# CONFIG_TEST_STRING_HELPERS is not set -# CONFIG_TEST_STRSCPY is not set -# CONFIG_TEST_KSTRTOX is not set -# CONFIG_TEST_PRINTF is not set -# CONFIG_TEST_BITMAP is not set -# CONFIG_TEST_BITFIELD is not set -# CONFIG_TEST_UUID is not set -# CONFIG_TEST_XARRAY is not set -# CONFIG_TEST_OVERFLOW is not set -# CONFIG_TEST_RHASHTABLE is not set -# CONFIG_TEST_HASH is not set -# CONFIG_TEST_IDA is not set -# CONFIG_TEST_PARMAN is not set -# CONFIG_TEST_LKM is not set -# CONFIG_TEST_BITOPS is not set -# CONFIG_TEST_VMALLOC is not set -# CONFIG_TEST_USER_COPY is not set -# CONFIG_TEST_BPF is not set -# CONFIG_TEST_BLACKHOLE_DEV is not set -# CONFIG_FIND_BIT_BENCHMARK is not set -# CONFIG_TEST_FIRMWARE is not set -# CONFIG_TEST_SYSCTL is not set -# CONFIG_TEST_UDELAY is not set -# CONFIG_TEST_STATIC_KEYS is not set -# CONFIG_TEST_KMOD is not set -# CONFIG_TEST_MEMCAT_P is not set -# CONFIG_TEST_OBJAGG is 
not set -# CONFIG_TEST_STACKINIT is not set -# CONFIG_TEST_MEMINIT is not set -# CONFIG_TEST_HMM is not set -# CONFIG_MEMTEST is not set -# CONFIG_HYPERV_TESTING is not set -# end of Kernel Testing and Coverage -# end of Kernel hacking diff --git a/linux58-tkg/linux58-tkg-config/generic-desktop-profile.cfg b/linux58-tkg/linux58-tkg-config/generic-desktop-profile.cfg deleted file mode 100644 index d14bf2e..0000000 --- a/linux58-tkg/linux58-tkg-config/generic-desktop-profile.cfg +++ /dev/null @@ -1,35 +0,0 @@ -# linux58-TkG config file -# Generic Desktop - - -#### KERNEL OPTIONS #### - -# Disable some non-module debugging - See PKGBUILD for the list -_debugdisable="false" - -# LEAVE AN EMPTY VALUE TO BE PROMPTED ABOUT FOLLOWING OPTIONS AT BUILD TIME - -# Set to "true" to disable FUNCTION_TRACER/GRAPH_TRACER, lowering overhead but limiting debugging and analyzing of kernel functions - Kernel default is "false" -_ftracedisable="false" - -# Set to "true" to disable NUMA, lowering overhead, but breaking CUDA/NvEnc on Nvidia equipped systems - Kernel default is "false" -_numadisable="false" - -# Set to "true" to use explicit preemption points to lower latency at the cost of a small throughput loss - Can give a nice perf boost in VMs - Kernel default is "false" -_voluntary_preempt="false" - -# A selection of patches from Zen/Liquorix kernel and additional tweaks for a better gaming experience (ZENIFY) - Default is "true" -_zenify="true" - -# compiler optimization level - 1. Optimize for performance (-O2); 2. Optimize harder (-O3); 3. 
Optimize for size (-Os) - Kernel default is "2" -_compileroptlevel="1" - -# Trust the CPU manufacturer to initialize Linux's CRNG (RANDOM_TRUST_CPU) - Kernel default is "false" -_random_trust_cpu="false" - -# CPU scheduler runqueue sharing - No sharing (RQ_NONE), SMT (hyperthread) siblings (RQ_SMT), Multicore siblings (RQ_MC), Symmetric Multi-Processing (RQ_SMP), NUMA (RQ_ALL) -# Valid values are "none", "smt", "mc", "mc-llc"(for zen), "smp", "all" - Kernel default is "mc" -_runqueue_sharing="mc" - -# Timer frequency - "500", "750" or "1000" - More options available in kernel config prompt when left empty depending on selected cpusched - Kernel default is "750" -_timer_freq="500" diff --git a/linux58-tkg/linux58-tkg-config/prepare b/linux58-tkg/linux58-tkg-config/prepare deleted file mode 100644 index 9a4672c..0000000 --- a/linux58-tkg/linux58-tkg-config/prepare +++ /dev/null @@ -1,1015 +0,0 @@ -#!/bin/bash - -_basever=58 -_basekernel=5.8 -_sub=16 - -_tkg_initscript() { - - cp "$_where"/linux"$_basever"-tkg-patches/* "$_where" # copy patches inside the PKGBUILD's dir to preserve makepkg sourcing and md5sum checking - cp "$_where"/linux"$_basever"-tkg-config/* "$_where" # copy config files and hooks inside the PKGBUILD's dir to preserve makepkg sourcing and md5sum checking - - # Load external configuration file if present. Available variable values will overwrite customization.cfg ones. - if [ -e "$_EXT_CONFIG_PATH" ]; then - source "$_EXT_CONFIG_PATH" && msg2 "External configuration file $_EXT_CONFIG_PATH will be used to override customization.cfg values." && msg2 "" - fi - - if [ -z "$_OPTIPROFILE" ] && [ ! -e "$_where"/cpuschedset ]; then - # Prompt about optimized configurations. Available variable values will overwrite customization.cfg/external config ones. - plain "Do you want to use a predefined optimized profile?" 
- read -rp "`echo $' > 1.Custom\n 2.Ryzen Desktop (Performance)\n 3.Other Desktop (Performance)\nchoice[1-3?]: '`" _OPTIPROFILE; - fi - if [ "$_OPTIPROFILE" = "2" ]; then - source "$_where"/ryzen-desktop-profile.cfg && msg2 "Ryzen Desktop (Performance) profile will be used." && msg2 "" - elif [ "$_OPTIPROFILE" = "3" ]; then - source "$_where"/generic-desktop-profile.cfg && msg2 "Generic Desktop (Performance) profile will be used." && msg2 "" - fi - - # source cpuschedset early if present - if [ -e "$_where"/cpuschedset ]; then - source "$_where"/cpuschedset - fi - - # CPU SCHED selector - if [ -z "$_cpusched" ] && [ ! -e "$_where"/cpuschedset ]; then - plain "What CPU sched variant do you want to build/install?" - read -rp "`echo $' > 1.Undead PDS (TkG)\n 2.Project C / PDS\n 3.Project C / BMQ\n 4.CFS\nchoice[1-3?]: '`" CONDITION; - if [ "$CONDITION" = "2" ]; then - echo "_cpusched=\"pds\"" > "$_where"/cpuschedset - elif [ "$CONDITION" = "3" ]; then - echo "_cpusched=\"bmq\"" > "$_where"/cpuschedset - elif [ "$CONDITION" = "4" ]; then - echo "_cpusched=\"cfs\"" > "$_where"/cpuschedset - else - echo "_cpusched=\"upds\"" > "$_where"/cpuschedset - fi - if [ -n "$_custom_pkgbase" ]; then - echo "_custom_pkgbase=\"${_custom_pkgbase}\"" >> "$_where"/cpuschedset - fi - elif [ "$_cpusched" = "upds" ]; then - echo "_cpusched=\"upds\"" > "$_where"/cpuschedset - elif [ "$_cpusched" = "pds" ]; then - echo "_cpusched=\"pds\"" > "$_where"/cpuschedset - elif [ "$_cpusched" = "cfs" ]; then - echo "_cpusched=\"cfs\"" > "$_where"/cpuschedset - elif [ "$_cpusched" = "bmq" ]; then - echo "_cpusched=\"bmq\"" > "$_where"/cpuschedset - else - if [ "$_nofallback" != "true" ]; then - warning "Something is wrong with your cpusched selection. Do you want to fallback to CFS (default)?" - read -rp "`echo $' > N/y : '`" _fallback; - fi - if [[ "$_fallback" =~ [yY] ]] || [ "$_nofallback" = "true" ]; then - echo "_cpusched=\"cfs\"" > "$_where"/cpuschedset - else - error "Exiting..." 
- exit 1 - fi - fi - - source "$_where"/cpuschedset -} - -user_patcher() { - # To patch the user because all your base are belong to us - local _patches=("$_where"/*."${_userpatch_ext}revert") - if [ ${#_patches[@]} -ge 2 ] || [ -e "${_patches}" ]; then - if [ "$_user_patches_no_confirm" != "true" ]; then - msg2 "Found ${#_patches[@]} 'to revert' userpatches for ${_userpatch_target}:" - printf '%s\n' "${_patches[@]}" - read -rp "Do you want to install it/them? - Be careful with that ;)"$'\n> N/y : ' _CONDITION; - fi - if [[ "$_CONDITION" =~ [yY] ]] || [ "$_user_patches_no_confirm" = "true" ]; then - for _f in "${_patches[@]}"; do - if [ -e "${_f}" ]; then - msg2 "######################################################" - msg2 "" - msg2 "Reverting your own ${_userpatch_target} patch ${_f}" - msg2 "" - msg2 "######################################################" - patch -Np1 -R < "${_f}" - echo "Reverted your own patch ${_f}" >> "$_where"/last_build_config.log - fi - done - fi - fi - - _patches=("$_where"/*."${_userpatch_ext}patch") - if [ ${#_patches[@]} -ge 2 ] || [ -e "${_patches}" ]; then - if [ "$_user_patches_no_confirm" != "true" ]; then - msg2 "Found ${#_patches[@]} userpatches for ${_userpatch_target}:" - printf '%s\n' "${_patches[@]}" - read -rp "Do you want to install it/them? - Be careful with that ;)"$'\n> N/y : ' _CONDITION; - fi - if [[ "$_CONDITION" =~ [yY] ]] || [ "$_user_patches_no_confirm" = "true" ]; then - for _f in "${_patches[@]}"; do - if [ -e "${_f}" ]; then - msg2 "######################################################" - msg2 "" - msg2 "Applying your own ${_userpatch_target} patch ${_f}" - msg2 "" - msg2 "######################################################" - patch -Np1 < "${_f}" - echo "Applied your own patch ${_f}" >> "$_where"/last_build_config.log - fi - done - fi - fi -} - -_tkg_srcprep() { - - if [ "${_distro}" = "Arch" ]; then - msg2 "Setting version..." 
- scripts/setlocalversion --save-scmversion - echo "-$pkgrel-tkg-${_cpusched}" > localversion.10-pkgrel - echo "" > localversion.20-pkgname - - # add upstream patch - msg2 "Patching from $_basekernel to $pkgver" - patch -p1 -i "$srcdir"/patch-"${pkgver}" - - # ARCH Patches - if [ "${_configfile}" = "config_hardened.x86_64" ] && [ "${_cpusched}" = "cfs" ]; then - msg2 "Using linux hardened patchset" - patch -Np1 -i "$srcdir"/0012-linux-hardened.patch - else - patch -Np1 -i "$srcdir"/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch - fi - fi - - # graysky's cpu opts - https://github.com/graysky2/kernel_gcc_patch - msg2 "Applying graysky's cpu opts patch" - if [ "${_distro}" = "Arch" ]; then - patch -Np1 -i "$srcdir"/enable_additional_cpu_optimizations_for_gcc_v10.1%2B_kernel_v5.8%2B.patch - else - patch -Np1 -i "$srcdir"/enable_additional_cpu_optimizations_for_gcc_v10.1+_kernel_v5.8+.patch - fi - - # TkG - msg2 "Applying clear linux patches" - patch -Np1 -i "$srcdir"/0002-clear-patches.patch - - msg2 "Applying glitched base patch" - patch -Np1 -i "$srcdir"/0003-glitched-base.patch - - if [ -z $_misc_adds ]; then - plain "Enable misc additions ? May contain temporary fixes pending upstream or changes that can break on non-Arch. 
" - read -rp "`echo $' > [Y]/n : '`" _interactive_misc_adds; - if [ "$_interactive_misc_adds" != "n" ] && [ "$_interactive_misc_adds" != "N" ]; then - _misc_adds="true" - fi - fi - - if [ "$_misc_adds" = "true" ]; then - msg2 "Applying misc additions patch" - patch -Np1 -i "$srcdir"/0012-misc-additions.patch - fi - - if [ "${_cpusched}" = "MuQSS" ]; then - # MuQSS - msg2 "Applying MuQSS base patch" - patch -Np1 -i "$srcdir"/0004-5.8-ck1.patch - - if [ "${_aggressive_ondemand}" = "true" ]; then - msg2 "Applying MuQSS agressive ondemand governor patch" - patch -Np1 -i "$srcdir"/0004-glitched-ondemand-muqss.patch - fi - - msg2 "Applying Glitched MuQSS patch" - patch -Np1 -i "$srcdir"/0004-glitched-muqss.patch - - elif [ "${_cpusched}" = "pds" ]; then - # PDS-mq - msg2 "Applying PDS base patch" - patch -Np1 -i "$srcdir"/0009-prjc_v5.8-r3.patch - - if [ "${_aggressive_ondemand}" = "true" ]; then - msg2 "Applying PDS agressive ondemand governor patch" - patch -Np1 -i "$srcdir"/0009-glitched-ondemand-bmq.patch - fi - - msg2 "Applying Glitched PDS patch" - patch -Np1 -i "$srcdir"/0005-glitched-pds.patch - - elif [ "${_cpusched}" = "upds" ]; then - # PDS-mq - msg2 "Applying PDS base patch" - patch -Np1 -i "$srcdir"/0005-v5.8_undead-pds099o.patch - - if [ "${_aggressive_ondemand}" = "true" ]; then - msg2 "Applying PDS agressive ondemand governor patch" - patch -Np1 -i "$srcdir"/0005-undead-glitched-ondemand-pds.patch - fi - - msg2 "Applying Glitched PDS patch" - patch -Np1 -i "$srcdir"/0005-undead-glitched-pds.patch - - elif [ "${_cpusched}" = "bmq" ]; then - # Project C / BMQ - msg2 "Applying Project C / BMQ base patch" - - patch -Np1 -i "$srcdir"/0009-prjc_v5.8-r3.patch - - if [ "${_aggressive_ondemand}" = "true" ]; then - msg2 "Applying BMQ agressive ondemand governor patch" - patch -Np1 -i "$srcdir"/0009-glitched-ondemand-bmq.patch - fi - - msg2 "Applying Glitched BMQ patch" - patch -Np1 -i "$srcdir"/0009-glitched-bmq.patch - - elif [ "${_cpusched}" = "cfs" ]; then - 
msg2 "Applying Glitched CFS patch" - patch -Np1 -i "$srcdir"/0003-glitched-cfs.patch - fi - - if [ "${_distro}" = "Arch" ]; then - if [ -z "${_configfile}" ]; then - _configfile="config.x86_64" - fi - - cat "${srcdir}/${_configfile}" > ./.config - fi - - - # Set some -tkg defaults - echo "# CONFIG_DYNAMIC_FAULT is not set" >> ./.config - sed -i -e 's/CONFIG_DEFAULT_FQ_CODEL=y/# CONFIG_DEFAULT_FQ_CODEL is not set/' ./.config - echo "CONFIG_DEFAULT_CAKE=y" >> ./.config - echo "CONFIG_NR_TTY_DEVICES=63" >> ./.config - echo "# CONFIG_NTP_PPS is not set" >> ./.config - sed -i -e 's/CONFIG_CRYPTO_LZ4=m/CONFIG_CRYPTO_LZ4=y/' ./.config - sed -i -e 's/CONFIG_CRYPTO_LZ4HC=m/CONFIG_CRYPTO_LZ4HC=y/' ./.config - sed -i -e 's/CONFIG_LZ4_COMPRESS=m/CONFIG_LZ4_COMPRESS=y/' ./.config - sed -i -e 's/CONFIG_LZ4HC_COMPRESS=m/CONFIG_LZ4HC_COMPRESS=y/' ./.config - sed -i -e 's/CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZO=y/# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZO is not set/' ./.config - sed -i -e 's/# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZ4 is not set/CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZ4=y/' ./.config - sed -i -e 's/CONFIG_ZSWAP_COMPRESSOR_DEFAULT="lzo"/CONFIG_ZSWAP_COMPRESSOR_DEFAULT="lz4"/' ./.config - #sed -i -e 's/CONFIG_RCU_BOOST_DELAY=500/CONFIG_RCU_BOOST_DELAY=0/' ./.config - sed -i -e 's/# CONFIG_CMDLINE_BOOL is not set/CONFIG_CMDLINE_BOOL=y/' ./.config - echo "CONFIG_CMDLINE=\"${_custom_commandline}\"" >> ./.config - echo "# CONFIG_CMDLINE_OVERRIDE is not set" >> ./.config - echo "# CONFIG_X86_P6_NOP is not set" >> ./.config - - # openrgb - echo "CONFIG_I2C_NCT6775=m" >> ./.config - - # ccache fix - if [ "$_noccache" != "true" ]; then - if { [ "$_distro" = "Arch" ] && pacman -Qq ccache &> /dev/null; } || { [ "$_distro" = "Ubuntu" ] && dpkg -l ccache > /dev/null; }; then - sed -i -e 's/CONFIG_GCC_PLUGINS=y/# CONFIG_GCC_PLUGINS is not set/' ./.config - fi - fi - # Skip dbg package creation on non-Arch - if [ "$_distro" != "Arch" ]; then - sed -i -e 's/CONFIG_DEBUG_INFO.*/CONFIG_DEBUG_INFO=n/' 
./.config - fi - - if [ "$_font_autoselect" != "false" ]; then - sed -i -e 's/CONFIG_FONT_TER16x32=y/# CONFIG_FONT_TER16x32 is not set\nCONFIG_FONT_AUTOSELECT=y/' ./.config - fi - - # Inject cpuopts options - echo "# CONFIG_MK8SSE3 is not set" >> ./.config - echo "# CONFIG_MK10 is not set" >> ./.config - echo "# CONFIG_MBARCELONA is not set" >> ./.config - echo "# CONFIG_MBOBCAT is not set" >> ./.config - echo "# CONFIG_MJAGUAR is not set" >> ./.config - echo "# CONFIG_MBULLDOZER is not set" >> ./.config - echo "# CONFIG_MPILEDRIVER is not set" >> ./.config - echo "# CONFIG_MSTEAMROLLER is not set" >> ./.config - echo "# CONFIG_MEXCAVATOR is not set" >> ./.config - echo "# CONFIG_MZEN is not set" >> ./.config - echo "# CONFIG_MZEN2 is not set" >> ./.config - echo "# CONFIG_MATOM is not set" >> ./.config - echo "# CONFIG_MNEHALEM is not set" >> ./.config - echo "# CONFIG_MWESTMERE is not set" >> ./.config - echo "# CONFIG_MSILVERMONT is not set" >> ./.config - echo "# CONFIG_MSANDYBRIDGE is not set" >> ./.config - echo "# CONFIG_MIVYBRIDGE is not set" >> ./.config - echo "# CONFIG_MHASWELL is not set" >> ./.config - echo "# CONFIG_MBROADWELL is not set" >> ./.config - echo "# CONFIG_MSKYLAKE is not set" >> ./.config - echo "# CONFIG_MSKYLAKEX is not set" >> ./.config - echo "# CONFIG_MCANNONLAKE is not set" >> ./.config - echo "# CONFIG_MICELAKE is not set" >> ./.config - echo "# CONFIG_MGOLDMONT is not set" >> ./.config - echo "# CONFIG_MGOLDMONTPLUS is not set" >> ./.config - echo "# CONFIG_MCASCADELAKE is not set" >> ./.config - echo "# CONFIG_MCOOPERLAKE is not set" >> ./.config - echo "# CONFIG_MTIGERLAKE is not set" >> ./.config - - # Disable some debugging - if [ "${_debugdisable}" = "true" ]; then - sed -i -e 's/CONFIG_SLUB_DEBUG=y/# CONFIG_SLUB_DEBUG is not set/' ./.config - sed -i -e 's/CONFIG_PM_DEBUG=y/# CONFIG_PM_DEBUG is not set/' ./.config - sed -i -e 's/CONFIG_PM_ADVANCED_DEBUG=y/# CONFIG_PM_ADVANCED_DEBUG is not set/' ./.config - sed -i -e 
's/CONFIG_PM_SLEEP_DEBUG=y/# CONFIG_PM_SLEEP_DEBUG is not set/' ./.config - sed -i -e 's/CONFIG_ACPI_DEBUG=y/# CONFIG_ACPI_DEBUG is not set/' ./.config - sed -i -e 's/CONFIG_SCHED_DEBUG=y/# CONFIG_SCHED_DEBUG is not set/' ./.config - sed -i -e 's/CONFIG_LATENCYTOP=y/# CONFIG_LATENCYTOP is not set/' ./.config - sed -i -e 's/CONFIG_DEBUG_PREEMPT=y/# CONFIG_DEBUG_PREEMPT is not set/' ./.config - fi - - if [ "${_cpusched}" = "MuQSS" ]; then - # MuQSS default config - echo "CONFIG_SCHED_MUQSS=y" >> ./.config - elif [ "${_cpusched}" = "upds" ]; then - # PDS default config - echo "CONFIG_SCHED_PDS=y" >> ./.config - elif [ "${_cpusched}" = "pds" ]; then - # PDS default config - echo "CONFIG_SCHED_ALT=y" >> ./.config - echo "CONFIG_SCHED_PDS=y" >> ./.config - echo "# CONFIG_SCHED_BMQ is not set" >> ./.config - elif [ "${_cpusched}" = "bmq" ]; then - # BMQ default config - echo "CONFIG_SCHED_ALT=y" >> ./.config - echo "CONFIG_SCHED_BMQ=y" >> ./.config - echo "# CONFIG_SCHED_PDS is not set" >> ./.config - fi - - if [ "${_cpusched}" = "MuQSS" ] || [ "${_cpusched}" = "pds" ] || [ "${_cpusched}" = "bmq" ] || [ "${_cpusched}" = "upds" ]; then - # Disable CFS - sed -i -e 's/CONFIG_FAIR_GROUP_SCHED=y/# CONFIG_FAIR_GROUP_SCHED is not set/' ./.config - sed -i -e 's/CONFIG_CFS_BANDWIDTH=y/# CONFIG_CFS_BANDWIDTH is not set/' ./.config - # sched yield type - if [ -n "$_sched_yield_type" ]; then - CONDITION0="$_sched_yield_type" - else - plain "" - plain "CPU sched_yield_type - Choose what sort of yield sched_yield will perform." - plain "" - plain "For PDS and MuQSS:" - plain "0: No yield." - plain "1: Yield only to better priority/deadline tasks." - plain "2: Expire timeslice and recalculate deadline." - plain "" - plain "For BMQ (experimental) - No recommended value yet, so try for yourself x) :" - plain "0: No yield." - plain "1: Deboost and requeue task. (default)" - plain "2: Set rq skip task." - if [ "${_cpusched}" = "MuQSS" ]; then - read -rp "`echo $'\n 0. 
Supposedly best option for gaming performance - could lead to stability issues on some (AMD) platforms when combined with MuQSS\n > 1. Default and recommended option for MuQSS - could lead to stability issues on some (Intel) platforms\n 2. Can be a good option with low rr_interval on MuQSS\n [0-2?]: '`" CONDITION0; - else - read -rp "`echo $'\n > 0. Recommended option for gaming on PDS - "tkg" default\n 1. Default, but can lead to stability issues on some platforms\n 2. Can be a good option with low rr_interval on MuQSS\n [0-2?]: '`" CONDITION0; - fi - fi - if [ "$CONDITION0" = "0" ]; then - if [ "${_cpusched}" = "bmq" ] || [ "${_cpusched}" = "pds" ]; then - sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 0;/' ./kernel/sched/alt_core.c - elif [ "${_cpusched}" = "upds" ]; then - sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 0;/' ./kernel/sched/pds.c - else - sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 0;/' ./kernel/sched/"${_cpusched}".c - fi - elif [ "$CONDITION0" = "1" ]; then - msg2 "Using default CPU sched yield type (1)" - elif [ "$CONDITION0" = "2" ]; then - if [ "${_cpusched}" = "bmq" ] || [ "${_cpusched}" = "pds" ]; then - sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 2;/' ./kernel/sched/alt_core.c - elif [ "${_cpusched}" = "upds" ]; then - sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 2;/' ./kernel/sched/pds.c - else - sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 2;/' ./kernel/sched/"${_cpusched}".c - fi - else - if [ "${_cpusched}" = "MuQSS" ]; then - msg2 "Using default CPU sched yield type (1)" - elif [ "${_cpusched}" = "bmq" ] || [ "${_cpusched}" = "pds" ]; then - sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 0;/' ./kernel/sched/alt_core.c - elif [ 
"${_cpusched}" = "upds" ]; then - sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 0;/' ./kernel/sched/pds.c - else - sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 0;/' ./kernel/sched/"${_cpusched}".c - fi - fi - fi - - # Round Robin interval - if [ "${_cpusched}" = "MuQSS" ] || [ "${_cpusched}" = "pds" ] || [ "${_cpusched}" = "bmq" ] || [ "${_cpusched}" = "upds" ]; then - if [ -n "$_rr_interval" ]; then - CONDITION1="$_rr_interval" - else - plain "" - plain "Round Robin interval is the longest duration two tasks with the same nice level will" - plain "be delayed for. When CPU time is requested by a task, it receives a time slice equal" - plain "to the rr_interval in addition to a virtual deadline. When using yield_type 2, a low" - plain "value can help offset the disadvantages of rescheduling a process that has yielded." - plain "" - plain "MuQSS default: 6ms" - plain "PDS default: 4ms" - plain "BMQ default: 2ms" - read -rp "`echo $'\n > 0.Keep defaults\n 1.2ms\n 2.4ms\n 3.6ms\n 4.8ms\n [0-4?]: '`" CONDITION1; - fi - if [ "$CONDITION1" = "1" ]; then - msg2 "Using 2ms rr_interval" - _rrvalue="2" - elif [ "$CONDITION1" = "2" ]; then - msg2 "Using 4ms rr_interval" - _rrvalue="4" - elif [ "$CONDITION1" = "3" ]; then - msg2 "Using 6ms rr_interval" - _rrvalue="6" - elif [ "$CONDITION1" = "4" ]; then - msg2 "Using 8ms rr_interval" - _rrvalue="8" - else - msg2 "Using default rr_interval" - _rrvalue="default" - fi - if [ "$_rrvalue" != "default" ]; then - if [ "${_cpusched}" = "MuQSS" ]; then - sed -i -e "s/int rr_interval __read_mostly = 6;/int rr_interval __read_mostly = ${_rrvalue};/" ./kernel/sched/"${_cpusched}".c - elif [ "${_cpusched}" = "upds" ]; then - sed -i -e "s/#define SCHED_DEFAULT_RR (4)/#define SCHED_DEFAULT_RR (${_rrvalue})/" ./kernel/sched/pds.c - elif [ "${_cpusched}" = "bmq" ] || [ "${_cpusched}" = "pds" ]; then - sed -i -e "s/u64 sched_timeslice_ns __read_mostly = (4 * 
1000 * 1000);/u64 sched_timeslice_ns __read_mostly = (${_rrvalue} * 1000 * 1000);/" ./kernel/sched/alt_core.c - fi - else - if [ "${_cpusched}" = "bmq" ] || [ "${_cpusched}" = "pds" ]; then - sed -i -e "s/u64 sched_timeslice_ns __read_mostly = (4 * 1000 * 1000);/u64 sched_timeslice_ns __read_mostly = (2 * 1000 * 1000);/" ./kernel/sched/alt_core.c - fi - fi - fi - - # zenify - if [ "$_zenify" = "true" ]; then - echo "CONFIG_ZENIFY=y" >> ./.config - elif [ "$_zenify" = "false" ]; then - echo "# CONFIG_ZENIFY is not set" >> ./.config - fi - - # compiler optimization level - if [ "$_compileroptlevel" = "1" ]; then - echo "# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 is not set" >> ./.config - elif [ "$_compileroptlevel" = "2" ]; then - sed -i -e 's/CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y/# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE is not set/' ./.config - echo "CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y" >> ./.config - elif [ "$_compileroptlevel" = "3" ]; then - sed -i -e 's/CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y/# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE is not set/' ./.config - sed -i -e 's/# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set/CONFIG_CC_OPTIMIZE_FOR_SIZE=y/' ./.config - echo "# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 is not set" >> ./.config - fi - - # cpu opt - if [ -n "$_processor_opt" ] && [ "$_processor_opt" != "native" ]; then - echo "# CONFIG_MNATIVE is not set" >> ./.config - fi - - if [ -n "$_processor_opt" ] && [ "$_processor_opt" != "generic" ]; then - sed -i -e 's/CONFIG_GENERIC_CPU=y/# CONFIG_GENERIC_CPU is not set/' ./.config - fi - - if [ "$_processor_opt" = "native" ]; then - echo "CONFIG_MNATIVE=y" >> ./.config - elif [ "$_processor_opt" = "k8" ]; then - sed -i -e 's/# CONFIG_MK8 is not set/CONFIG_MK8=y/' ./.config - elif [ "$_processor_opt" = "k8sse3" ]; then - sed -i -e 's/# CONFIG_MK8SSE3 is not set/CONFIG_MK8SSE3=y/' ./.config - elif [ "$_processor_opt" = "k10" ]; then - sed -i -e 's/# CONFIG_MK10 is not set/CONFIG_MK10=y/' ./.config - elif [ "$_processor_opt" = 
"barcelona" ]; then - sed -i -e 's/# CONFIG_MBARCELONA is not set/CONFIG_MBARCELONA=y/' ./.config - elif [ "$_processor_opt" = "bobcat" ]; then - sed -i -e 's/# CONFIG_MBOBCAT is not set/CONFIG_MBOBCAT=y/' ./.config - elif [ "$_processor_opt" = "jaguar" ]; then - sed -i -e 's/# CONFIG_MJAGUAR is not set/CONFIG_MJAGUAR=y/' ./.config - elif [ "$_processor_opt" = "bulldozer" ]; then - sed -i -e 's/# CONFIG_MBULLDOZER is not set/CONFIG_MBULLDOZER=y/' ./.config - elif [ "$_processor_opt" = "piledriver" ]; then - sed -i -e 's/# CONFIG_MPILEDRIVER is not set/CONFIG_MPILEDRIVER=y/' ./.config - elif [ "$_processor_opt" = "steamroller" ]; then - sed -i -e 's/# CONFIG_MSTEAMROLLER is not set/CONFIG_MSTEAMROLLER=y/' ./.config - elif [ "$_processor_opt" = "excavator" ]; then - sed -i -e 's/# CONFIG_MEXCAVATOR is not set/CONFIG_MEXCAVATOR=y/' ./.config - elif [ "$_processor_opt" = "zen" ]; then - sed -i -e 's/# CONFIG_MZEN is not set/CONFIG_MZEN=y/' ./.config - elif [ "$_processor_opt" = "zen2" ]; then - sed -i -e 's/# CONFIG_MZEN2 is not set/CONFIG_MZEN2=y/' ./.config - elif [ "$_processor_opt" = "mpsc" ]; then - sed -i -e 's/# CONFIG_MPSC is not set/CONFIG_MPSC=y/' ./.config - elif [ "$_processor_opt" = "atom" ]; then - sed -i -e 's/# CONFIG_MATOM is not set/CONFIG_MATOM=y/' ./.config - elif [ "$_processor_opt" = "core2" ]; then - sed -i -e 's/# CONFIG_MCORE2 is not set/CONFIG_MCORE2=y/' ./.config - elif [ "$_processor_opt" = "nehalem" ]; then - sed -i -e 's/# CONFIG_MNEHALEM is not set/CONFIG_MNEHALEM=y/' ./.config - elif [ "$_processor_opt" = "westmere" ]; then - sed -i -e 's/# CONFIG_MWESTMERE is not set/CONFIG_MWESTMERE=y/' ./.config - elif [ "$_processor_opt" = "silvermont" ]; then - sed -i -e 's/# CONFIG_MSILVERMONT is not set/CONFIG_MSILVERMONT=y/' ./.config - elif [ "$_processor_opt" = "sandybridge" ]; then - sed -i -e 's/# CONFIG_MSANDYBRIDGE is not set/CONFIG_MSANDYBRIDGE=y/' ./.config - elif [ "$_processor_opt" = "ivybridge" ]; then - sed -i -e 's/# 
CONFIG_MIVYBRIDGE is not set/CONFIG_MIVYBRIDGE=y/' ./.config - elif [ "$_processor_opt" = "haswell" ]; then - sed -i -e 's/# CONFIG_MHASWELL is not set/CONFIG_MHASWELL=y/' ./.config - elif [ "$_processor_opt" = "broadwell" ]; then - sed -i -e 's/# CONFIG_MBROADWELL is not set/CONFIG_MBROADWELL=y/' ./.config - elif [ "$_processor_opt" = "skylake" ]; then - sed -i -e 's/# CONFIG_MSKYLAKE is not set/CONFIG_MSKYLAKE=y/' ./.config - elif [ "$_processor_opt" = "skylakex" ]; then - sed -i -e 's/# CONFIG_MSKYLAKEX is not set/CONFIG_MSKYLAKEX=y/' ./.config - elif [ "$_processor_opt" = "cannonlake" ]; then - sed -i -e 's/# CONFIG_MCANNONLAKE is not set/CONFIG_MCANNONLAKE=y/' ./.config - elif [ "$_processor_opt" = "icelake" ]; then - sed -i -e 's/# CONFIG_MICELAKE is not set/CONFIG_MICELAKE=y/' ./.config - elif [ "$_processor_opt" = "goldmont" ]; then - sed -i -e 's/# CONFIG_MGOLDMONT is not set/CONFIG_MGOLDMONT=y/' ./.config - elif [ "$_processor_opt" = "goldmontplus" ]; then - sed -i -e 's/# CONFIG_MGOLDMONTPLUS is not set/CONFIG_MGOLDMONTPLUS=y/' ./.config - elif [ "$_processor_opt" = "cascadelake" ]; then - sed -i -e 's/# CONFIG_MCASCADELAKE is not set/CONFIG_MCASCADELAKE=y/' ./.config - elif [ "$_processor_opt" = "cooperlake" ]; then - sed -i -e 's/# CONFIG_MCOOPERLAKE is not set/CONFIG_MCOOPERLAKE=y/' ./.config - elif [ "$_processor_opt" = "tigerlake" ]; then - sed -i -e 's/# CONFIG_MTIGERLAKE is not set/CONFIG_MTIGERLAKE=y/' ./.config - fi - - # irq threading - if [ "$_irq_threading" = "true" ]; then - echo "CONFIG_FORCE_IRQ_THREADING=y" >> ./.config - elif [ "$_irq_threading" = "false" ]; then - echo "# CONFIG_FORCE_IRQ_THREADING is not set" >> ./.config - fi - - # smt nice - if [ "$_smt_nice" = "true" ]; then - echo "CONFIG_SMT_NICE=y" >> ./.config - elif [ "$_smt_nice" = "false" ]; then - echo "# CONFIG_SMT_NICE is not set" >> ./.config - fi - - # random trust cpu - if [ "$_random_trust_cpu" = "true" ]; then - sed -i -e 's/# CONFIG_RANDOM_TRUST_CPU is not 
set/CONFIG_RANDOM_TRUST_CPU=y/' ./.config - fi - - # rq sharing - if [ "$_runqueue_sharing" = "none" ]; then - echo -e "CONFIG_RQ_NONE=y\n# CONFIG_RQ_SMT is not set\n# CONFIG_RQ_MC is not set\n# CONFIG_RQ_MC_LLC is not set\n# CONFIG_RQ_SMP is not set\n# CONFIG_RQ_ALL is not set" >> ./.config - elif [ -z "$_runqueue_sharing" ] || [ "$_runqueue_sharing" = "smt" ]; then - echo -e "# CONFIG_RQ_NONE is not set\nCONFIG_RQ_SMT=y\n# CONFIG_RQ_MC is not set\n# CONFIG_RQ_MC_LLC is not set\n# CONFIG_RQ_SMP is not set\n# CONFIG_RQ_ALL is not set" >> ./.config - elif [ "$_runqueue_sharing" = "mc" ]; then - echo -e "# CONFIG_RQ_NONE is not set\n# CONFIG_RQ_SMT is not set\nCONFIG_RQ_MC=y\n# CONFIG_RQ_MC_LLC is not set\n# CONFIG_RQ_SMP is not set\n# CONFIG_RQ_ALL is not set" >> ./.config - elif [ "$_runqueue_sharing" = "smp" ]; then - echo -e "# CONFIG_RQ_NONE is not set\n# CONFIG_RQ_SMT is not set\n# CONFIG_RQ_MC is not set\n# CONFIG_RQ_MC_LLC is not set\nCONFIG_RQ_SMP=y\n# CONFIG_RQ_ALL is not set" >> ./.config - elif [ "$_runqueue_sharing" = "all" ]; then - echo -e "# CONFIG_RQ_NONE is not set\n# CONFIG_RQ_SMT is not set\n# CONFIG_RQ_MC is not set\n# CONFIG_RQ_MC_LLC is not set\n# CONFIG_RQ_SMP is not set\nCONFIG_RQ_ALL=y" >> ./.config - elif [ "$_runqueue_sharing" = "mc-llc" ]; then - echo -e "# CONFIG_RQ_NONE is not set\n# CONFIG_RQ_SMT is not set\n# CONFIG_RQ_MC is not set\nCONFIG_RQ_MC_LLC=y\n# CONFIG_RQ_SMP is not set\n# CONFIG_RQ_ALL is not set" >> ./.config - fi - - # timer freq - if [ -n "$_timer_freq" ] && [ "$_timer_freq" != "300" ]; then - sed -i -e 's/CONFIG_HZ_300=y/# CONFIG_HZ_300 is not set/' ./.config - sed -i -e 's/CONFIG_HZ_300_NODEF=y/# CONFIG_HZ_300_NODEF is not set/' ./.config - if [ "$_timer_freq" = "1000" ]; then - sed -i -e 's/# CONFIG_HZ_1000 is not set/CONFIG_HZ_1000=y/' ./.config - sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=1000/' ./.config - echo "# CONFIG_HZ_500 is not set" >> ./.config - echo "# CONFIG_HZ_500_NODEF is not set" >> ./.config - echo "# 
CONFIG_HZ_750 is not set" >> ./.config - echo "# CONFIG_HZ_750_NODEF is not set" >> ./.config - echo "CONFIG_HZ_1000_NODEF=y" >> ./.config - echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config - elif [ "$_timer_freq" = "750" ]; then - sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=750/' ./.config - echo "# CONFIG_HZ_500 is not set" >> ./.config - echo "# CONFIG_HZ_500_NODEF is not set" >> ./.config - echo "CONFIG_HZ_750=y" >> ./.config - echo "CONFIG_HZ_750_NODEF=y" >> ./.config - echo "# CONFIG_HZ_1000_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config - elif [ "$_timer_freq" = "500" ]; then - sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=500/' ./.config - echo "CONFIG_HZ_500=y" >> ./.config - echo "CONFIG_HZ_500_NODEF=y" >> ./.config - echo "# CONFIG_HZ_750 is not set" >> ./.config - echo "# CONFIG_HZ_750_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_1000_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config - elif [ "$_timer_freq" = "100" ]; then - sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=100/' ./.config - echo "# CONFIG_HZ_500 is not set" >> ./.config - echo "# CONFIG_HZ_750 is not set" >> ./.config - echo "# CONFIG_HZ_1000_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_750_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_500_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config - echo "CONFIG_HZ_100=y" >> ./.config - echo "CONFIG_HZ_100_NODEF=y" >> ./.config - fi - elif [ "${_cpusched}" = "MuQSS" ] && [ -z "$_timer_freq" ]; then - sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=100/' ./.config - echo "# CONFIG_HZ_500 is not set" >> ./.config - echo "# CONFIG_HZ_750 is not set" >> ./.config - echo "# CONFIG_HZ_1000_NODEF is not set" >> ./.config - echo "# 
CONFIG_HZ_750_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_500_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config - echo "CONFIG_HZ_100=y" >> ./.config - echo "CONFIG_HZ_100_NODEF=y" >> ./.config - else - sed -i -e 's/CONFIG_HZ_300=y/# CONFIG_HZ_300 is not set/' ./.config - sed -i -e 's/CONFIG_HZ_300_NODEF=y/# CONFIG_HZ_300_NODEF is not set/' ./.config - sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=500/' ./.config - echo "CONFIG_HZ_500=y" >> ./.config - echo "CONFIG_HZ_500_NODEF=y" >> ./.config - echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config - fi - - # default cpu gov - if [ "$_default_cpu_gov" = "performance" ]; then - sed -i -e 's/CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL=y/# CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL is not set/' ./.config - sed -i -e 's/# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set/CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y/' ./.config - elif [ "$_default_cpu_gov" = "ondemand" ]; then - sed -i -e 's/CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL=y/# CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL is not set/' ./.config - sed -i -e 's/# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set/CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y/' ./.config - fi - - # ACPI_CPUFREQ disablement - if [ "$_disable_acpi_cpufreq" = "true" ]; then - sed -i -e 's/CONFIG_X86_ACPI_CPUFREQ=m/# CONFIG_X86_ACPI_CPUFREQ is not set/' ./.config - fi - - # ftrace - if [ -z "$_ftracedisable" ]; then - plain "" - plain "Disable FUNCTION_TRACER/GRAPH_TRACER? Lowers overhead but limits debugging" - plain "and analyzing of kernel functions." 
- read -rp "`echo $' > N/y : '`" CONDITION2; - fi - if [[ "$CONDITION2" =~ [yY] ]] || [ "$_ftracedisable" = "true" ]; then - sed -i -e 's/CONFIG_FUNCTION_TRACER=y/# CONFIG_FUNCTION_TRACER is not set/' ./.config - sed -i -e 's/CONFIG_FUNCTION_GRAPH_TRACER=y/# CONFIG_FUNCTION_GRAPH_TRACER is not set/' ./.config - fi - - # disable numa - if [ -z "$_numadisable" ]; then - plain "" - plain "Disable NUMA? Lowers overhead, but breaks CUDA/NvEnc on Nvidia if disabled." - plain "https://bbs.archlinux.org/viewtopic.php?id=239174" - read -rp "`echo $' > N/y : '`" CONDITION3; - fi - if [[ "$CONDITION3" =~ [yY] ]] || [ "$_numadisable" = "true" ]; then - # disable NUMA since 99.9% of users do not have multiple CPUs but do have multiple cores in one CPU - sed -i -e 's/CONFIG_NUMA=y/# CONFIG_NUMA is not set/' \ - -i -e '/CONFIG_AMD_NUMA=y/d' \ - -i -e '/CONFIG_X86_64_ACPI_NUMA=y/d' \ - -i -e '/CONFIG_NODES_SPAN_OTHER_NODES=y/d' \ - -i -e '/# CONFIG_NUMA_EMU is not set/d' \ - -i -e '/CONFIG_NODES_SHIFT=6/d' \ - -i -e '/CONFIG_NEED_MULTIPLE_NODES=y/d' \ - -i -e '/CONFIG_USE_PERCPU_NUMA_NODE_ID=y/d' \ - -i -e '/CONFIG_ACPI_NUMA=y/d' ./.config - fi - - # tickless - if [ -z "$_tickless" ]; then - plain "" - plain "Use CattaRappa mode (Tickless/Dynticks) ?" - plain "Can give higher performances in many cases but lower consistency on some hardware." - plain "Just tickless idle can perform better with some platforms (mostly AMD) or CPU schedulers (mostly MuQSS)." 
- if [ "${_cpusched}" = "MuQSS" ]; then - read -rp "`echo $'\n 0.No, use periodic ticks\n 1.Yes, full tickless baby!\n > 2.Just tickless idle plz\n [0-2?]: '`" CONDITION4; - else - read -rp "`echo $'\n 0.No, use periodic ticks\n > 1.Yes, full tickless baby!\n 2.Just tickless idle plz\n [0-2?]: '`" CONDITION4; - fi - fi - if [ "$CONDITION4" = "0" ] || [ "$_tickless" = "0" ]; then - echo "# CONFIG_NO_HZ_FULL_NODEF is not set" >> ./.config - sed -i -e 's/# CONFIG_HZ_PERIODIC is not set/CONFIG_HZ_PERIODIC=y/' ./.config - sed -i -e 's/CONFIG_NO_HZ_IDLE=y/# CONFIG_NO_HZ_IDLE is not set/' ./.config - sed -i -e 's/CONFIG_NO_HZ_FULL=y/# CONFIG_NO_HZ_FULL is not set/' ./.config - sed -i -e 's/CONFIG_NO_HZ=y/# CONFIG_NO_HZ is not set/' ./.config - sed -i -e 's/CONFIG_NO_HZ_COMMON=y/# CONFIG_NO_HZ_COMMON is not set/' ./.config - elif [ "$CONDITION4" = "2" ] || [ "$_tickless" = "2" ]; then - echo "# CONFIG_NO_HZ_FULL_NODEF is not set" >> ./.config - sed -i -e 's/CONFIG_HZ_PERIODIC=y/# CONFIG_HZ_PERIODIC is not set/' ./.config - sed -i -e 's/# CONFIG_NO_HZ_IDLE is not set/CONFIG_NO_HZ_IDLE=y/' ./.config - sed -i -e 's/CONFIG_NO_HZ_FULL=y/# CONFIG_NO_HZ_FULL is not set/' ./.config - sed -i -e 's/# CONFIG_NO_HZ is not set/CONFIG_NO_HZ=y/' ./.config - sed -i -e 's/# CONFIG_NO_HZ_COMMON is not set/CONFIG_NO_HZ_COMMON=y/' ./.config - else - if [ "${_cpusched}" = "MuQSS" ]; then - echo "# CONFIG_NO_HZ_FULL_NODEF is not set" >> ./.config - sed -i -e 's/CONFIG_HZ_PERIODIC=y/# CONFIG_HZ_PERIODIC is not set/' ./.config - sed -i -e 's/# CONFIG_NO_HZ_IDLE is not set/CONFIG_NO_HZ_IDLE=y/' ./.config - sed -i -e 's/CONFIG_NO_HZ_FULL=y/# CONFIG_NO_HZ_FULL is not set/' ./.config - sed -i -e 's/# CONFIG_NO_HZ is not set/CONFIG_NO_HZ=y/' ./.config - sed -i -e 's/# CONFIG_NO_HZ_COMMON is not set/CONFIG_NO_HZ_COMMON=y/' ./.config - else - echo "CONFIG_NO_HZ_FULL_NODEF=y" >> ./.config - sed -i -e 's/CONFIG_HZ_PERIODIC=y/# CONFIG_HZ_PERIODIC is not set/' ./.config - sed -i -e 's/CONFIG_NO_HZ_IDLE=y/# 
CONFIG_NO_HZ_IDLE is not set/' ./.config - sed -i -e 's/# CONFIG_NO_HZ_FULL is not set/CONFIG_NO_HZ_FULL=y/' ./.config - sed -i -e 's/# CONFIG_NO_HZ is not set/CONFIG_NO_HZ=y/' ./.config - sed -i -e 's/# CONFIG_NO_HZ_COMMON is not set/CONFIG_NO_HZ_COMMON=y/' ./.config - echo "CONFIG_CONTEXT_TRACKING=y" >> ./.config - echo "# CONFIG_CONTEXT_TRACKING_FORCE is not set" >> ./.config - fi - fi - - # voluntary preempt - if [ -z "$_voluntary_preempt" ]; then - plain "" - plain "Use explicit preemption points?" - plain "It can improve latency on PDS (at the cost of throughput)" - plain "and improve throughput on other schedulers (at the cost of latency)" - read -rp "`echo $' > N/y : '`" CONDITION5; - fi - if [[ "$CONDITION5" =~ [yY] ]] || [ "$_voluntary_preempt" = "true" ]; then - sed -i -e 's/CONFIG_PREEMPT=y/# CONFIG_PREEMPT is not set/' ./.config - sed -i -e 's/CONFIG_PREEMPT_LL=y/# CONFIG_PREEMPT_LL is not set/' ./.config - sed -i -e 's/# CONFIG_PREEMPT_VOLUNTARY is not set/CONFIG_PREEMPT_VOLUNTARY=y/' ./.config - fi - - # Open Firmware support - if [ -z "$_OFenable" ]; then - plain "" - plain "Enable Device Tree and Open Firmware support?" - read -rp "`echo $' > N/y : '`" CONDITION6; - fi - if [[ "$CONDITION6" =~ [yY] ]] || [ "$_OFenable" = "true" ]; then - sed -i -e 's/# CONFIG_OF is not set/CONFIG_OF=y/' ./.config - fi - - # acs override - if [ -z "$_acs_override" ]; then - plain "" - plain "Use ACS override patch?" - plain "https://wiki.archlinux.org/index.php/PCI_passthrough_via_OVMF#Bypassing_the_IOMMU_groups_.28ACS_override_patch.29" - read -rp "`echo $' > N/y : '`" CONDITION7; - fi - if [[ "$CONDITION7" =~ [yY] ]] || [ "$_acs_override" = "true" ]; then - msg2 "Patching ACS override" - patch -Np1 -i "$srcdir"/0006-add-acs-overrides_iommu.patch - fi - - # bcachefs - if [ -z "$_bcachefs" ]; then - plain "" - plain "Add Bcache filesystem support? You'll have to install bcachefs-tools-git from AUR for utilities." 
- plain "https://bcachefs.org/" - read -rp "`echo $' > N/y : '`" CONDITION8; - fi - if [[ "$CONDITION8" =~ [yY] ]] || [ "$_bcachefs" = "true" ]; then - msg2 "Patching Bcache filesystem support override" - patch -Np1 -i "$srcdir"/0008-5.8-bcachefs.patch - echo "CONFIG_BCACHEFS_FS=m" >> ./.config - echo "CONFIG_BCACHEFS_QUOTA=y" >> ./.config - echo "CONFIG_BCACHEFS_POSIX_ACL=y" >> ./.config - echo "# CONFIG_BCACHEFS_DEBUG is not set" >> ./.config - echo "# CONFIG_BCACHEFS_TESTS is not set" >> ./.config - echo "# CONFIG_DEBUG_CLOSURES is not set" >> ./.config - fi - - # fsync support - if [ -z "$_fsync" ]; then - plain "" - plain "Enable support for fsync, an experimental replacement for esync in Valve Proton 4.11+" - plain "https://steamcommunity.com/games/221410/announcements/detail/2957094910196249305" - read -rp "`echo $' > N/y : '`" CONDITION9; - fi - if [[ "$CONDITION9" =~ [yY] ]] || [ "$_fsync" = "true" ]; then - msg2 "Patching Fsync support" - patch -Np1 -i "$srcdir"/0007-v5.8-fsync.patch - fi - - # ZFS fix - if [ -z "$_zfsfix" ]; then - plain "" - plain "Add back missing symbol for AES-NI/AVX support on ZFS" - plain "https://github.com/NixOS/nixpkgs/blob/master/pkgs/os-specific/linux/kernel/export_kernel_fpu_functions_5_3.patch" - read -rp "`echo $' > N/y : '`" CONDITION11; - fi - if [[ "$CONDITION11" =~ [yY] ]] || [ "$_zfsfix" = "true" ]; then - msg2 "Patching missing symbol for AES-NI/AVX support on ZFS" - patch -Np1 -i "$srcdir"/0011-ZFS-fix.patch - fi - - # Community patches - if [ -n "$_community_patches" ]; then - if [ ! -d "$_where/../../community-patches" ]; then - cd "$_where/../.." 
&& git clone https://github.com/Frogging-Family/community-patches.git && cd "${srcdir}/${_srcpath}" - fi - _community_patches=($_community_patches) - for _p in ${_community_patches[@]}; do - ln -s "$_where"/../../community-patches/linux"$_basever"-tkg/$_p "$_where"/ - done - fi - - # userpatches - if [ "$_user_patches" = "true" ]; then - _userpatch_target="linux-${_basekernel}" - _userpatch_ext="my" - user_patcher - fi - - # Community patches removal - for _p in ${_community_patches[@]}; do - rm -f "$_where"/$_p - done - - if [ "$_distro" = "Arch" ]; then - # don't run depmod on 'make install'. We'll do this ourselves in packaging - sed -i '2iexit 0' scripts/depmod.sh - - # get kernel version - make prepare - fi - - # modprobed-db - if [ -z "$_modprobeddb" ]; then - plain "" - plain "Use modprobed db to clean config from unneeded modules?" - plain "Speeds up compilation considerably. Requires root." - plain "https://wiki.archlinux.org/index.php/Modprobed-db" - plain "!!!! Make sure to have a well populated db !!!!" - read -rp "`echo $' > N/y : '`" CONDITIONMPDB; - fi - if [[ "$CONDITIONMPDB" =~ [yY] ]] || [ "$_modprobeddb" = "true" ]; then - sudo modprobed-db recall - yes "" | make localmodconfig - fi - - if [ true = "$_config_fragments" ]; then - local fragments=() - mapfile -d '' -t fragments < <(find "$_where"/ -type f -name "*.myfrag" -print0) - - if [ true = "$_config_fragments_no_confirm" ]; then - printf 'Using config fragment %s\n' "${fragments[@]#$_where/}" - else - for i in "${!fragments[@]}"; do - while true; do - read -r -p 'Found config fragment '"${fragments[$i]#$_where/}"', apply it? 
[y/N] ' CONDITIONMPDB - CONDITIONMPDB="$(printf '%s' "$CONDITIONMPDB" | tr '[:upper:]' '[:lower:]')" - case "$CONDITIONMPDB" in - y|yes) - break;; - n|no|'') - unset fragments[$i] - break;; - *) - echo 'Please answer with yes or no' - esac - done - done - fi - - if [ 0 -lt "${#fragments[@]}" ]; then - scripts/kconfig/merge_config.sh -m .config "${fragments[@]}" - fi - fi - - # menuconfig / nconfig - if [ -z "$_menunconfig" ]; then - plain "" - plain "*Optional* For advanced users - Do you want to use make menuconfig or nconfig" - plain "to configure the kernel before building it?" - plain "If you do, make sure your terminal is currently" - plain "at least 19 lines by 80 columns large or you'll get an error :D" - read -rp "`echo $' > 0. nope\n 1. menuconfig\n 2. nconfig\n 3. xconfig\n choice[0-3?]: '`" CONDITIONMNC; - _menunconfig="$CONDITIONMNC" - fi - if [ 1 = "$_menunconfig" ]; then - cp .config .config.orig - make menuconfig - elif [ 2 = "$_menunconfig" ]; then - cp .config .config.orig - make nconfig - elif [ 3 = "$_menunconfig" ]; then - cp .config .config.orig - make xconfig - else - # rewrite configuration - yes "" | make config >/dev/null - fi - if [ 1 = "$_menunconfig" ] || [ 2 = "$_menunconfig" ] || [ 3 = "$_menunconfig" ]; then - if [ -z "${_diffconfig}" ]; then - while true; do - read -r -p 'Generate a config fragment from your changes? [y/N] ' CONDITIONF - CONDITIONF="$(printf '%s' "$CONDITIONF" | tr '[:upper:]' '[:lower:]')" - case "$CONDITIONF" in - y|yes) - _diffconfig=true - break;; - n|no|'') - _diffconfig=false - break;; - *) - echo 'Please answer with yes or no' - esac - done - fi - if [ true = "$_diffconfig" ]; then - if [ -z "$_diffconfig_name" ]; then - IFS= read -r -p 'Filename for the config fragment [leave empty to not generate fragment]: ' _diffconfig_name - fi - if [ -z "$_diffconfig_name" ]; then - echo 'No file name given, not generating config fragment.' 
- else ( - prev_pwd="${PWD:-$(pwd)}" - cd "$_where" - "${prev_pwd}/scripts/diffconfig" -m "${prev_pwd}/.config.orig" "${prev_pwd}/.config" > "$_diffconfig_name" - ) fi - fi - rm .config.orig - fi - - if [ "$_distro" = "Arch" ]; then - make -s kernelrelease > version - msg2 "Prepared %s version %s" "$pkgbase" "$( -From: Serge Hallyn -Date: Fri, 31 May 2013 19:12:12 +0100 -Subject: [PATCH] add sysctl to disallow unprivileged CLONE_NEWUSER by default - -Signed-off-by: Serge Hallyn -[bwh: Remove unneeded binary sysctl bits] -Signed-off-by: Daniel Micay ---- - kernel/fork.c | 15 +++++++++++++++ - kernel/sysctl.c | 12 ++++++++++++ - kernel/user_namespace.c | 3 +++ - 3 files changed, 30 insertions(+) - -diff --git a/kernel/fork.c b/kernel/fork.c -index 07cc743698d3668e..4011d68a8ff9305c 100644 ---- a/kernel/fork.c -+++ b/kernel/fork.c -@@ -102,6 +102,11 @@ - - #define CREATE_TRACE_POINTS - #include -+#ifdef CONFIG_USER_NS -+extern int unprivileged_userns_clone; -+#else -+#define unprivileged_userns_clone 0 -+#endif - - /* - * Minimum number of threads to boot the kernel -@@ -1555,6 +1560,10 @@ static __latent_entropy struct task_struct *copy_process( - if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) - return ERR_PTR(-EINVAL); - -+ if ((clone_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) -+ if (!capable(CAP_SYS_ADMIN)) -+ return ERR_PTR(-EPERM); -+ - /* - * Thread groups must share signals as well, and detached threads - * can only be started up within the thread group. 
-@@ -2348,6 +2357,12 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) - if (unshare_flags & CLONE_NEWNS) - unshare_flags |= CLONE_FS; - -+ if ((unshare_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) { -+ err = -EPERM; -+ if (!capable(CAP_SYS_ADMIN)) -+ goto bad_unshare_out; -+ } -+ - err = check_unshare_flags(unshare_flags); - if (err) - goto bad_unshare_out; -diff --git a/kernel/sysctl.c b/kernel/sysctl.c -index b86520ed3fb60fbf..f7dab3760839f1a1 100644 ---- a/kernel/sysctl.c -+++ b/kernel/sysctl.c -@@ -105,6 +105,9 @@ extern int core_uses_pid; - - #if defined(CONFIG_SYSCTL) - -+#ifdef CONFIG_USER_NS -+extern int unprivileged_userns_clone; -+#endif - /* Constants used for minimum and maximum */ - #ifdef CONFIG_LOCKUP_DETECTOR - static int sixty = 60; -@@ -513,6 +516,15 @@ static struct ctl_table kern_table[] = { - .proc_handler = proc_dointvec, - }, - #endif -+#ifdef CONFIG_USER_NS -+ { -+ .procname = "unprivileged_userns_clone", -+ .data = &unprivileged_userns_clone, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = proc_dointvec, -+ }, -+#endif - #ifdef CONFIG_PROC_SYSCTL - { - .procname = "tainted", -diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c -index c490f1e4313b998a..dd03bd39d7bf194d 100644 ---- a/kernel/user_namespace.c -+++ b/kernel/user_namespace.c -@@ -24,6 +24,9 @@ - #include - #include - -+/* sysctl */ -+int unprivileged_userns_clone; -+ - static struct kmem_cache *user_ns_cachep __read_mostly; - static DEFINE_MUTEX(userns_state_mutex); - --- -2.15.1 - -From b5202296055dd333db4425120d3f93ef4e6a0573 Mon Sep 17 00:00:00 2001 -From: "Jan Alexander Steffens (heftig)" -Date: Thu, 7 Dec 2017 13:50:48 +0100 -Subject: ZEN: Add CONFIG for unprivileged_userns_clone - -This way our default behavior continues to match the vanilla kernel. 
---- - init/Kconfig | 16 ++++++++++++++++ - kernel/user_namespace.c | 4 ++++ - 2 files changed, 20 insertions(+) - -diff --git a/init/Kconfig b/init/Kconfig -index 4592bf7997c0..f3df02990aff 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -1004,6 +1004,22 @@ config USER_NS - - If unsure, say N. - -+config USER_NS_UNPRIVILEGED -+ bool "Allow unprivileged users to create namespaces" -+ default y -+ depends on USER_NS -+ help -+ When disabled, unprivileged users will not be able to create -+ new namespaces. Allowing users to create their own namespaces -+ has been part of several recent local privilege escalation -+ exploits, so if you need user namespaces but are -+ paranoid^Wsecurity-conscious you want to disable this. -+ -+ This setting can be overridden at runtime via the -+ kernel.unprivileged_userns_clone sysctl. -+ -+ If unsure, say Y. -+ - config PID_NS - bool "PID Namespaces" - default y -diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c -index 6b9dbc257e34..107b17f0d528 100644 ---- a/kernel/user_namespace.c -+++ b/kernel/user_namespace.c -@@ -27,7 +27,11 @@ - #include - - /* sysctl */ -+#ifdef CONFIG_USER_NS_UNPRIVILEGED -+int unprivileged_userns_clone = 1; -+#else - int unprivileged_userns_clone; -+#endif - - static struct kmem_cache *user_ns_cachep __read_mostly; - static DEFINE_MUTEX(userns_state_mutex); diff --git a/linux58-tkg/linux58-tkg-patches/0002-clear-patches.patch b/linux58-tkg/linux58-tkg-patches/0002-clear-patches.patch deleted file mode 100644 index 22a32f5..0000000 --- a/linux58-tkg/linux58-tkg-patches/0002-clear-patches.patch +++ /dev/null @@ -1,360 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Arjan van de Ven -Date: Mon, 14 Mar 2016 11:10:58 -0600 -Subject: [PATCH] pci pme wakeups - -Reduce wakeups for PME checks, which are a workaround for miswired -boards (sadly, too many of them) in laptops. 
---- - drivers/pci/pci.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c -index c9338f9..6974fbf 100644 ---- a/drivers/pci/pci.c -+++ b/drivers/pci/pci.c -@@ -62,7 +62,7 @@ struct pci_pme_device { - struct pci_dev *dev; - }; - --#define PME_TIMEOUT 1000 /* How long between PME checks */ -+#define PME_TIMEOUT 4000 /* How long between PME checks */ - - static void pci_dev_d3_sleep(struct pci_dev *dev) - { --- -https://clearlinux.org - -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Arjan van de Ven -Date: Sat, 19 Mar 2016 21:32:19 -0400 -Subject: [PATCH] intel_idle: tweak cpuidle cstates - -Increase target_residency in cpuidle cstate - -Tune intel_idle to be a bit less agressive; -Clear linux is cleaner in hygiene (wakupes) than the average linux, -so we can afford changing these in a way that increases -performance while keeping power efficiency ---- - drivers/idle/intel_idle.c | 44 +++++++++++++++++++-------------------- - 1 file changed, 22 insertions(+), 22 deletions(-) - -diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c -index f449584..c994d24 100644 ---- a/drivers/idle/intel_idle.c -+++ b/drivers/idle/intel_idle.c -@@ -531,7 +531,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { - .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, - .exit_latency = 10, -- .target_residency = 20, -+ .target_residency = 120, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -539,7 +539,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { - .desc = "MWAIT 0x10", - .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 33, -- .target_residency = 100, -+ .target_residency = 900, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -547,7 +547,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { - .desc = "MWAIT 0x20", - .flags = MWAIT2flg(0x20) | 
CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 133, -- .target_residency = 400, -+ .target_residency = 1000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -555,7 +555,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { - .desc = "MWAIT 0x32", - .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 166, -- .target_residency = 500, -+ .target_residency = 1500, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -563,7 +563,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { - .desc = "MWAIT 0x40", - .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 300, -- .target_residency = 900, -+ .target_residency = 2000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -571,7 +571,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { - .desc = "MWAIT 0x50", - .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 600, -- .target_residency = 1800, -+ .target_residency = 5000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -579,7 +579,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { - .desc = "MWAIT 0x60", - .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 2600, -- .target_residency = 7700, -+ .target_residency = 9000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -599,7 +599,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { - .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, - .exit_latency = 10, -- .target_residency = 20, -+ .target_residency = 120, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -607,7 +607,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { - .desc = "MWAIT 0x10", - .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 40, -- .target_residency = 100, -+ .target_residency = 1000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, 
}, - { -@@ -615,7 +615,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { - .desc = "MWAIT 0x20", - .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 133, -- .target_residency = 400, -+ .target_residency = 1000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -623,7 +623,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { - .desc = "MWAIT 0x32", - .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 166, -- .target_residency = 500, -+ .target_residency = 2000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -631,7 +631,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { - .desc = "MWAIT 0x40", - .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 300, -- .target_residency = 900, -+ .target_residency = 4000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -639,7 +639,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { - .desc = "MWAIT 0x50", - .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 600, -- .target_residency = 1800, -+ .target_residency = 7000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -647,7 +647,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { - .desc = "MWAIT 0x60", - .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 2600, -- .target_residency = 7700, -+ .target_residency = 9000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -668,7 +668,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { - .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, - .exit_latency = 10, -- .target_residency = 20, -+ .target_residency = 120, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -676,7 +676,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { - .desc = "MWAIT 0x10", - .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, - 
.exit_latency = 70, -- .target_residency = 100, -+ .target_residency = 1000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -684,7 +684,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { - .desc = "MWAIT 0x20", - .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 85, -- .target_residency = 200, -+ .target_residency = 600, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -692,7 +692,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { - .desc = "MWAIT 0x33", - .flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 124, -- .target_residency = 800, -+ .target_residency = 3000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -700,7 +700,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { - .desc = "MWAIT 0x40", - .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 200, -- .target_residency = 800, -+ .target_residency = 3200, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -708,7 +708,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { - .desc = "MWAIT 0x50", - .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 480, -- .target_residency = 5000, -+ .target_residency = 9000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -716,7 +716,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { - .desc = "MWAIT 0x60", - .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 890, -- .target_residency = 5000, -+ .target_residency = 9000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -737,7 +737,7 @@ static struct cpuidle_state skx_cstates[] __initdata = { - .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, - .exit_latency = 10, -- .target_residency = 20, -+ .target_residency = 300, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { --- 
-https://clearlinux.org - -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Arjan van de Ven -Date: Fri, 6 Jan 2017 15:34:09 +0000 -Subject: [PATCH] ipv4/tcp: allow the memory tuning for tcp to go a little - bigger than default - ---- - net/ipv4/tcp.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c -index 30c1142..4345075 100644 ---- a/net/ipv4/tcp.c -+++ b/net/ipv4/tcp.c -@@ -4201,8 +4201,8 @@ void __init tcp_init(void) - tcp_init_mem(); - /* Set per-socket limits to no more than 1/128 the pressure threshold */ - limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); -- max_wshare = min(4UL*1024*1024, limit); -- max_rshare = min(6UL*1024*1024, limit); -+ max_wshare = min(16UL*1024*1024, limit); -+ max_rshare = min(16UL*1024*1024, limit); - - init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; - init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; --- -https://clearlinux.org - -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Arjan van de Ven -Date: Sun, 18 Feb 2018 23:35:41 +0000 -Subject: [PATCH] locking: rwsem: spin faster - -tweak rwsem owner spinning a bit ---- - kernel/locking/rwsem.c | 4 +++- - 1 file changed, 3 insertions(+), 1 deletion(-) - -diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c -index f11b9bd..1bbfcc1 100644 ---- a/kernel/locking/rwsem.c -+++ b/kernel/locking/rwsem.c -@@ -717,6 +717,7 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) - struct task_struct *new, *owner; - unsigned long flags, new_flags; - enum owner_state state; -+ int i = 0; - - owner = rwsem_owner_flags(sem, &flags); - state = rwsem_owner_state(owner, flags, nonspinnable); -@@ -750,7 +751,8 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) - break; - } - -- cpu_relax(); -+ if (i++ > 1000) -+ cpu_relax(); - } - rcu_read_unlock(); - --- -https://clearlinux.org - -From 0000000000000000000000000000000000000000 Mon 
Sep 17 00:00:00 2001 -From: Arjan van de Ven -Date: Thu, 2 Jun 2016 23:36:32 -0500 -Subject: [PATCH] initialize ata before graphics - -ATA init is the long pole in the boot process, and its asynchronous. -move the graphics init after it so that ata and graphics initialize -in parallel ---- - drivers/Makefile | 15 ++++++++------- - 1 file changed, 8 insertions(+), 7 deletions(-) - -diff --git a/drivers/Makefile b/drivers/Makefile -index c0cd1b9..af1e2fb 100644 ---- a/drivers/Makefile -+++ b/drivers/Makefile -@@ -59,15 +59,8 @@ obj-y += char/ - # iommu/ comes before gpu as gpu are using iommu controllers - obj-y += iommu/ - --# gpu/ comes after char for AGP vs DRM startup and after iommu --obj-y += gpu/ -- - obj-$(CONFIG_CONNECTOR) += connector/ - --# i810fb and intelfb depend on char/agp/ --obj-$(CONFIG_FB_I810) += video/fbdev/i810/ --obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ -- - obj-$(CONFIG_PARPORT) += parport/ - obj-$(CONFIG_NVM) += lightnvm/ - obj-y += base/ block/ misc/ mfd/ nfc/ -@@ -80,6 +73,14 @@ obj-$(CONFIG_IDE) += ide/ - obj-y += scsi/ - obj-y += nvme/ - obj-$(CONFIG_ATA) += ata/ -+ -+# gpu/ comes after char for AGP vs DRM startup and after iommu -+obj-y += gpu/ -+ -+# i810fb and intelfb depend on char/agp/ -+obj-$(CONFIG_FB_I810) += video/fbdev/i810/ -+obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ -+ - obj-$(CONFIG_TARGET_CORE) += target/ - obj-$(CONFIG_MTD) += mtd/ - obj-$(CONFIG_SPI) += spi/ --- -https://clearlinux.org - diff --git a/linux58-tkg/linux58-tkg-patches/0003-glitched-base.patch b/linux58-tkg/linux58-tkg-patches/0003-glitched-base.patch deleted file mode 100644 index fb09b35..0000000 --- a/linux58-tkg/linux58-tkg-patches/0003-glitched-base.patch +++ /dev/null @@ -1,708 +0,0 @@ -From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 -From: Tk-Glitch -Date: Wed, 4 Jul 2018 04:30:08 +0200 -Subject: [PATCH 01/17] glitched - ---- - scripts/mkcompile_h | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git 
a/scripts/mkcompile_h b/scripts/mkcompile_h -index baf3ab8d9d49..854e32e6aec7 100755 ---- a/scripts/mkcompile_h -+++ b/scripts/mkcompile_h -@@ -41,8 +41,8 @@ else - fi - - UTS_VERSION="#$VERSION" --CONFIG_FLAGS="" --if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi -+CONFIG_FLAGS="TKG" -+if [ -n "$SMP" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS SMP"; fi - if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi - if [ -n "$PREEMPT_RT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT_RT"; fi - --- -2.28.0 - - -From c304f43d14e98d4bf1215fc10bc5012f554bdd8a Mon Sep 17 00:00:00 2001 -From: Alexandre Frade -Date: Mon, 29 Jan 2018 16:59:22 +0000 -Subject: [PATCH 02/17] dcache: cache_pressure = 50 decreases the rate at which - VFS caches are reclaimed - -Signed-off-by: Alexandre Frade ---- - fs/dcache.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/fs/dcache.c b/fs/dcache.c -index 361ea7ab30ea..0c5cf69b241a 100644 ---- a/fs/dcache.c -+++ b/fs/dcache.c -@@ -71,7 +71,7 @@ - * If no ancestor relationship: - * arbitrary, since it's serialized on rename_lock - */ --int sysctl_vfs_cache_pressure __read_mostly = 100; -+int sysctl_vfs_cache_pressure __read_mostly = 50; - EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); - - __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); --- -2.28.0 - - -From 28f32f59d9d55ac7ec3a20b79bdd02d2a0a5f7e1 Mon Sep 17 00:00:00 2001 -From: Alexandre Frade -Date: Mon, 29 Jan 2018 18:29:13 +0000 -Subject: [PATCH 03/17] sched/core: nr_migrate = 128 increases number of tasks - to iterate in a single balance run. - -Signed-off-by: Alexandre Frade ---- - kernel/sched/core.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index f788cd61df21..2bfbb4213707 100644 ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c -@@ -59,7 +59,7 @@ const_debug unsigned int sysctl_sched_features = - * Number of tasks to iterate in a single balance run. 
- * Limited because this is done with IRQs disabled. - */ --const_debug unsigned int sysctl_sched_nr_migrate = 32; -+const_debug unsigned int sysctl_sched_nr_migrate = 128; - - /* - * period over which we measure -rt task CPU usage in us. -@@ -71,9 +71,9 @@ __read_mostly int scheduler_running; - - /* - * part of the period that we allow rt tasks to run in us. -- * default: 0.95s -+ * XanMod default: 0.98s - */ --int sysctl_sched_rt_runtime = 950000; -+int sysctl_sched_rt_runtime = 980000; - - /* - * __task_rq_lock - lock the rq @p resides on. --- -2.28.0 - - -From acc49f33a10f61dc66c423888cbb883ba46710e4 Mon Sep 17 00:00:00 2001 -From: Alexandre Frade -Date: Mon, 29 Jan 2018 17:41:29 +0000 -Subject: [PATCH 04/17] scripts: disable the localversion "+" tag of a git repo - -Signed-off-by: Alexandre Frade ---- - scripts/setlocalversion | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/scripts/setlocalversion b/scripts/setlocalversion -index 20f2efd57b11..0552d8b9f582 100755 ---- a/scripts/setlocalversion -+++ b/scripts/setlocalversion -@@ -54,7 +54,7 @@ scm_version() - # If only the short version is requested, don't bother - # running further git commands - if $short; then -- echo "+" -+ # echo "+" - return - fi - # If we are past a tagged commit (like --- -2.28.0 - - -From 61fcb33fb0de8bc0f060e0a1ada38ed149217f4d Mon Sep 17 00:00:00 2001 -From: Oleksandr Natalenko -Date: Wed, 11 Dec 2019 11:46:19 +0100 -Subject: [PATCH 05/17] init/Kconfig: enable -O3 for all arches - -Building a kernel with -O3 may help in hunting bugs like [1] and thus -using this switch should not be restricted to one specific arch only. - -With that, lets expose it for everyone. 
- -[1] https://lore.kernel.org/lkml/673b885183fb64f1cbb3ed2387524077@natalenko.name/ - -Signed-off-by: Oleksandr Natalenko ---- - init/Kconfig | 1 - - 1 file changed, 1 deletion(-) - -diff --git a/init/Kconfig b/init/Kconfig -index 0498af567f70..3ae8678e1145 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -1278,7 +1278,6 @@ config CC_OPTIMIZE_FOR_PERFORMANCE - - config CC_OPTIMIZE_FOR_PERFORMANCE_O3 - bool "Optimize more for performance (-O3)" -- depends on ARC - help - Choosing this option will pass "-O3" to your compiler to optimize - the kernel yet more for performance. --- -2.28.0 - - -From 360c6833e07cc9fdef5746f6bc45bdbc7212288d Mon Sep 17 00:00:00 2001 -From: "Jan Alexander Steffens (heftig)" -Date: Fri, 26 Oct 2018 11:22:33 +0100 -Subject: [PATCH 06/17] infiniband: Fix __read_overflow2 error with -O3 - inlining - ---- - drivers/infiniband/core/addr.c | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c -index 3a98439bba83..6efc4f907f58 100644 ---- a/drivers/infiniband/core/addr.c -+++ b/drivers/infiniband/core/addr.c -@@ -820,6 +820,7 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, - union { - struct sockaddr_in _sockaddr_in; - struct sockaddr_in6 _sockaddr_in6; -+ struct sockaddr_ib _sockaddr_ib; - } sgid_addr, dgid_addr; - int ret; - --- -2.28.0 - - -From f85ed068b4d0e6c31edce8574a95757a60e58b87 Mon Sep 17 00:00:00 2001 -From: Etienne Juvigny -Date: Mon, 3 Sep 2018 17:36:25 +0200 -Subject: [PATCH 07/17] Zenify & stuff - ---- - init/Kconfig | 32 ++++++++++++++++++++++++++++++++ - kernel/sched/fair.c | 25 +++++++++++++++++++++++++ - mm/page-writeback.c | 8 ++++++++ - 3 files changed, 65 insertions(+) - -diff --git a/init/Kconfig b/init/Kconfig -index 3ae8678e1145..da708eed0f1e 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -92,6 +92,38 @@ config THREAD_INFO_IN_TASK - - menu "General setup" - -+config ZENIFY -+ bool "A selection of patches from Zen/Liquorix kernel and 
additional tweaks for a better gaming experience" -+ default y -+ help -+ Tunes the kernel for responsiveness at the cost of throughput and power usage. -+ -+ --- Virtual Memory Subsystem --------------------------- -+ -+ Mem dirty before bg writeback..: 10 % -> 20 % -+ Mem dirty before sync writeback: 20 % -> 50 % -+ -+ --- Block Layer ---------------------------------------- -+ -+ Queue depth...............: 128 -> 512 -+ Default MQ scheduler......: mq-deadline -> bfq -+ -+ --- CFS CPU Scheduler ---------------------------------- -+ -+ Scheduling latency.............: 6 -> 3 ms -+ Minimal granularity............: 0.75 -> 0.3 ms -+ Wakeup granularity.............: 1 -> 0.5 ms -+ CPU migration cost.............: 0.5 -> 0.25 ms -+ Bandwidth slice size...........: 5 -> 3 ms -+ Ondemand fine upscaling limit..: 95 % -> 85 % -+ -+ --- MuQSS CPU Scheduler -------------------------------- -+ -+ Scheduling interval............: 6 -> 3 ms -+ ISO task max realtime use......: 70 % -> 25 % -+ Ondemand coarse upscaling limit: 80 % -> 45 % -+ Ondemand fine upscaling limit..: 95 % -> 45 % -+ - config BROKEN - bool - -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 6b3b59cc51d6..2a0072192c3d 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -37,8 +37,13 @@ - * - * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) - */ -+#ifdef CONFIG_ZENIFY -+unsigned int sysctl_sched_latency = 3000000ULL; -+static unsigned int normalized_sysctl_sched_latency = 3000000ULL; -+#else - unsigned int sysctl_sched_latency = 6000000ULL; - static unsigned int normalized_sysctl_sched_latency = 6000000ULL; -+#endif - - /* - * The initial- and re-scaling of tunables is configurable -@@ -58,13 +63,22 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_L - * - * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) - */ -+#ifdef CONFIG_ZENIFY -+unsigned int sysctl_sched_min_granularity = 300000ULL; -+static unsigned int 
normalized_sysctl_sched_min_granularity = 300000ULL; -+#else - unsigned int sysctl_sched_min_granularity = 750000ULL; - static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; -+#endif - - /* - * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity - */ -+#ifdef CONFIG_ZENIFY -+static unsigned int sched_nr_latency = 10; -+#else - static unsigned int sched_nr_latency = 8; -+#endif - - /* - * After fork, child runs first. If set to 0 (default) then -@@ -81,10 +95,17 @@ unsigned int sysctl_sched_child_runs_first __read_mostly; - * - * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) - */ -+#ifdef CONFIG_ZENIFY -+unsigned int sysctl_sched_wakeup_granularity = 500000UL; -+static unsigned int normalized_sysctl_sched_wakeup_granularity = 500000UL; -+ -+const_debug unsigned int sysctl_sched_migration_cost = 50000UL; -+#else - unsigned int sysctl_sched_wakeup_granularity = 1000000UL; - static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; - - const_debug unsigned int sysctl_sched_migration_cost = 500000UL; -+#endif - - int sched_thermal_decay_shift; - static int __init setup_sched_thermal_decay_shift(char *str) -@@ -128,8 +149,12 @@ int __weak arch_asym_cpu_priority(int cpu) - * - * (default: 5 msec, units: microseconds) - */ -+#ifdef CONFIG_ZENIFY -+unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL; -+#else - unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; - #endif -+#endif - - static inline void update_load_add(struct load_weight *lw, unsigned long inc) - { -diff --git a/mm/page-writeback.c b/mm/page-writeback.c -index 28b3e7a67565..01a1aef2b9b1 100644 ---- a/mm/page-writeback.c -+++ b/mm/page-writeback.c -@@ -71,7 +71,11 @@ static long ratelimit_pages = 32; - /* - * Start background writeback (via writeback threads) at this percentage - */ -+#ifdef CONFIG_ZENIFY -+int dirty_background_ratio = 20; -+#else - int dirty_background_ratio = 10; -+#endif - - /* - * dirty_background_bytes starts at 0 
(disabled) so that it is a function of -@@ -88,7 +92,11 @@ int vm_highmem_is_dirtyable; - /* - * The generator of dirty data starts writeback at this percentage - */ -+#ifdef CONFIG_ZENIFY -+int vm_dirty_ratio = 50; -+#else - int vm_dirty_ratio = 20; -+#endif - - /* - * vm_dirty_bytes starts at 0 (disabled) so that it is a function of --- -2.28.0 - - -From e92e67143385cf285851e12aa8b7f083dd38dd24 Mon Sep 17 00:00:00 2001 -From: Steven Barrett -Date: Sun, 16 Jan 2011 18:57:32 -0600 -Subject: [PATCH 08/17] ZEN: Allow TCP YeAH as default congestion control - -4.4: In my tests YeAH dramatically slowed down transfers over a WLAN, - reducing throughput from ~65Mbps (CUBIC) to ~7MBps (YeAH) over 10 - seconds (netperf TCP_STREAM) including long stalls. - - Be careful when choosing this. ~heftig ---- - net/ipv4/Kconfig | 4 ++++ - 1 file changed, 4 insertions(+) - -diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig -index e64e59b536d3..bfb55ef7ebbe 100644 ---- a/net/ipv4/Kconfig -+++ b/net/ipv4/Kconfig -@@ -691,6 +691,9 @@ choice - config DEFAULT_VEGAS - bool "Vegas" if TCP_CONG_VEGAS=y - -+ config DEFAULT_YEAH -+ bool "YeAH" if TCP_CONG_YEAH=y -+ - config DEFAULT_VENO - bool "Veno" if TCP_CONG_VENO=y - -@@ -724,6 +727,7 @@ config DEFAULT_TCP_CONG - default "htcp" if DEFAULT_HTCP - default "hybla" if DEFAULT_HYBLA - default "vegas" if DEFAULT_VEGAS -+ default "yeah" if DEFAULT_YEAH - default "westwood" if DEFAULT_WESTWOOD - default "veno" if DEFAULT_VENO - default "reno" if DEFAULT_RENO --- -2.28.0 - - -From 76dbe7477bfde1b5e8bf29a71b5af7ab2be9b98e Mon Sep 17 00:00:00 2001 -From: Steven Barrett -Date: Wed, 28 Nov 2018 19:01:27 -0600 -Subject: [PATCH 09/17] zen: Use [defer+madvise] as default khugepaged defrag - strategy - -For some reason, the default strategy to respond to THP fault fallbacks -is still just madvise, meaning stall if the program wants transparent -hugepages, but don't trigger a background reclaim / compaction if THP -begins to fail allocations. 
This creates a snowball affect where we -still use the THP code paths, but we almost always fail once a system -has been active and busy for a while. - -The option "defer" was created for interactive systems where THP can -still improve performance. If we have to fallback to a regular page due -to an allocation failure or anything else, we will trigger a background -reclaim and compaction so future THP attempts succeed and previous -attempts eventually have their smaller pages combined without stalling -running applications. - -We still want madvise to stall applications that explicitely want THP, -so defer+madvise _does_ make a ton of sense. Make it the default for -interactive systems, especially if the kernel maintainer left -transparent hugepages on "always". - -Reasoning and details in the original patch: https://lwn.net/Articles/711248/ ---- - mm/huge_memory.c | 4 ++++ - 1 file changed, 4 insertions(+) - -diff --git a/mm/huge_memory.c b/mm/huge_memory.c -index 74300e337c3c..9277f22c10a7 100644 ---- a/mm/huge_memory.c -+++ b/mm/huge_memory.c -@@ -53,7 +53,11 @@ unsigned long transparent_hugepage_flags __read_mostly = - #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE - (1< -Date: Wed, 24 Oct 2018 16:58:52 -0300 -Subject: [PATCH 10/17] net/sched: allow configuring cake qdisc as default - -Signed-off-by: Alexandre Frade ---- - net/sched/Kconfig | 4 ++++ - 1 file changed, 4 insertions(+) - -diff --git a/net/sched/Kconfig b/net/sched/Kconfig -index 84badf00647e..6a922bca9f39 100644 ---- a/net/sched/Kconfig -+++ b/net/sched/Kconfig -@@ -471,6 +471,9 @@ choice - config DEFAULT_SFQ - bool "Stochastic Fair Queue" if NET_SCH_SFQ - -+ config DEFAULT_CAKE -+ bool "Common Applications Kept Enhanced" if NET_SCH_CAKE -+ - config DEFAULT_PFIFO_FAST - bool "Priority FIFO Fast" - endchoice -@@ -481,6 +484,7 @@ config DEFAULT_NET_SCH - default "fq" if DEFAULT_FQ - default "fq_codel" if DEFAULT_FQ_CODEL - default "sfq" if DEFAULT_SFQ -+ default "cake" if DEFAULT_CAKE - default 
"pfifo_fast" - endif - --- -2.28.0 - - -From 816ee502759e954304693813bd03d94986b28dba Mon Sep 17 00:00:00 2001 -From: Tk-Glitch -Date: Mon, 18 Feb 2019 17:40:57 +0100 -Subject: [PATCH 11/17] mm: Set watermark_scale_factor to 200 (from 10) - -Multiple users have reported it's helping reducing/eliminating stuttering -with DXVK. ---- - mm/page_alloc.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/mm/page_alloc.c b/mm/page_alloc.c -index 898ff44f2c7b..e72074034793 100644 ---- a/mm/page_alloc.c -+++ b/mm/page_alloc.c -@@ -330,7 +330,7 @@ int watermark_boost_factor __read_mostly; - #else - int watermark_boost_factor __read_mostly = 15000; - #endif --int watermark_scale_factor = 10; -+int watermark_scale_factor = 200; - - static unsigned long nr_kernel_pages __initdata; - static unsigned long nr_all_pages __initdata; --- -2.28.0 - - -From 90240bcd90a568878738e66c0d45bed3e38e347b Mon Sep 17 00:00:00 2001 -From: Tk-Glitch -Date: Fri, 19 Apr 2019 12:33:38 +0200 -Subject: [PATCH 12/17] Set vm.max_map_count to 262144 by default - -The value is still pretty low, and AMD64-ABI and ELF extended numbering -supports that, so we should be fine on modern x86 systems. - -This fixes crashes in some applications using more than 65535 vmas (also -affects some windows games running in wine, such as Star Citizen). ---- - include/linux/mm.h | 3 +-- - 1 file changed, 1 insertion(+), 2 deletions(-) - -diff --git a/include/linux/mm.h b/include/linux/mm.h -index bc05c3588aa3..b0cefe94920d 100644 ---- a/include/linux/mm.h -+++ b/include/linux/mm.h -@@ -190,8 +190,7 @@ static inline void __mm_zero_struct_page(struct page *page) - * not a hard limit any more. Although some userspace tools can be surprised by - * that. 
- */ --#define MAPCOUNT_ELF_CORE_MARGIN (5) --#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) -+#define DEFAULT_MAX_MAP_COUNT (262144) - - extern int sysctl_max_map_count; - --- -2.28.0 - - -From 3a34034dba5efe91bcec491efe8c66e8087f509b Mon Sep 17 00:00:00 2001 -From: Tk-Glitch -Date: Mon, 27 Jul 2020 00:19:18 +0200 -Subject: [PATCH 13/17] mm: bump DEFAULT_MAX_MAP_COUNT - -Some games such as Detroit: Become Human tend to be very crash prone with -lower values. ---- - include/linux/mm.h | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/include/linux/mm.h b/include/linux/mm.h -index b0cefe94920d..890165099b07 100644 ---- a/include/linux/mm.h -+++ b/include/linux/mm.h -@@ -190,7 +190,7 @@ static inline void __mm_zero_struct_page(struct page *page) - * not a hard limit any more. Although some userspace tools can be surprised by - * that. - */ --#define DEFAULT_MAX_MAP_COUNT (262144) -+#define DEFAULT_MAX_MAP_COUNT (524288) - - extern int sysctl_max_map_count; - --- -2.28.0 - - -From 977812938da7c7226415778c340832141d9278b7 Mon Sep 17 00:00:00 2001 -From: Alexandre Frade -Date: Mon, 25 Nov 2019 15:13:06 -0300 -Subject: [PATCH 14/17] elevator: set default scheduler to bfq for blk-mq - -Signed-off-by: Alexandre Frade ---- - block/elevator.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/block/elevator.c b/block/elevator.c -index 4eab3d70e880..79669aa39d79 100644 ---- a/block/elevator.c -+++ b/block/elevator.c -@@ -623,15 +623,15 @@ static inline bool elv_support_iosched(struct request_queue *q) - } - - /* -- * For single queue devices, default to using mq-deadline. If we have multiple -- * queues or mq-deadline is not available, default to "none". -+ * For single queue devices, default to using bfq. If we have multiple -+ * queues or bfq is not available, default to "none". 
- */ - static struct elevator_type *elevator_get_default(struct request_queue *q) - { - if (q->nr_hw_queues != 1) - return NULL; - -- return elevator_get(q, "mq-deadline", false); -+ return elevator_get(q, "bfq", false); - } - - /* --- -2.28.0 - - -From e2111bc5989131c675659d40e0cc4f214df2f990 Mon Sep 17 00:00:00 2001 -From: Alexandre Frade -Date: Fri, 10 May 2019 16:45:59 -0300 -Subject: [PATCH 15/17] block: set rq_affinity = 2 for full multithreading I/O - requests - -Signed-off-by: Alexandre Frade ---- - include/linux/blkdev.h | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - -diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h -index 28efe374a2e1..d4e5d35d2ece 100644 ---- a/include/linux/blkdev.h -+++ b/include/linux/blkdev.h -@@ -624,7 +624,8 @@ struct request_queue { - #define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */ - - #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ -- (1 << QUEUE_FLAG_SAME_COMP)) -+ (1 << QUEUE_FLAG_SAME_COMP) | \ -+ (1 << QUEUE_FLAG_SAME_FORCE)) - - void blk_queue_flag_set(unsigned int flag, struct request_queue *q); - void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); --- -2.28.0 - - -From 3c229f434aca65c4ca61772bc03c3e0370817b92 Mon Sep 17 00:00:00 2001 -From: Alexandre Frade -Date: Mon, 3 Aug 2020 17:05:04 +0000 -Subject: [PATCH 16/17] mm: set 2 megabytes for address_space-level file - read-ahead pages size - -Signed-off-by: Alexandre Frade ---- - include/linux/pagemap.h | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h -index cf2468da68e9..007dea784451 100644 ---- a/include/linux/pagemap.h -+++ b/include/linux/pagemap.h -@@ -655,7 +655,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); - void delete_from_page_cache_batch(struct address_space *mapping, - struct pagevec *pvec); - --#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) -+#define VM_READAHEAD_PAGES (SZ_2M 
/ PAGE_SIZE) - - void page_cache_sync_readahead(struct address_space *, struct file_ra_state *, - struct file *, pgoff_t index, unsigned long req_count); --- -2.28.0 - - -From 716f41cf6631f3a85834dcb67b4ce99185b6387f Mon Sep 17 00:00:00 2001 -From: Steven Barrett -Date: Wed, 15 Jan 2020 20:43:56 -0600 -Subject: [PATCH 17/17] ZEN: intel-pstate: Implement "enable" parameter - -If intel-pstate is compiled into the kernel, it will preempt the loading -of acpi-cpufreq so you can take advantage of hardware p-states without -any friction. - -However, intel-pstate is not completely superior to cpufreq's ondemand -for one reason. There's no concept of an up_threshold property. - -In ondemand, up_threshold essentially reduces the maximum utilization to -compare against, allowing you to hit max frequencies and turbo boost -from a much lower core utilization. - -With intel-pstate, you have the concept of minimum and maximum -performance, but no tunable that lets you define, maximum frequency -means 50% core utilization. For just this oversight, there's reasons -you may want ondemand. - -Lets support setting "enable" in kernel boot parameters. This lets -kernel maintainers include "intel_pstate=disable" statically in the -static boot parameters, but let users of the kernel override this -selection. 
---- - Documentation/admin-guide/kernel-parameters.txt | 3 +++ - drivers/cpufreq/intel_pstate.c | 2 ++ - 2 files changed, 5 insertions(+) - -diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index fb95fad81c79..3e92fee81e33 100644 ---- a/Documentation/admin-guide/kernel-parameters.txt -+++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -1857,6 +1857,9 @@ - disable - Do not enable intel_pstate as the default - scaling driver for the supported processors -+ enable -+ Enable intel_pstate in-case "disable" was passed -+ previously in the kernel boot parameters - passive - Use intel_pstate as a scaling driver, but configure it - to work with generic cpufreq governors (instead of -diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c -index 36a469150ff9..aee891c9b78a 100644 ---- a/drivers/cpufreq/intel_pstate.c -+++ b/drivers/cpufreq/intel_pstate.c -@@ -2845,6 +2845,8 @@ static int __init intel_pstate_setup(char *str) - pr_info("HWP disabled\n"); - no_hwp = 1; - } -+ if (!strcmp(str, "enable")) -+ no_load = 0; - if (!strcmp(str, "force")) - force_load = 1; - if (!strcmp(str, "hwp_only")) --- -2.28.0 - diff --git a/linux58-tkg/linux58-tkg-patches/0003-glitched-cfs.patch b/linux58-tkg/linux58-tkg-patches/0003-glitched-cfs.patch deleted file mode 100644 index 06b7f02..0000000 --- a/linux58-tkg/linux58-tkg-patches/0003-glitched-cfs.patch +++ /dev/null @@ -1,72 +0,0 @@ -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 2a202a846757..1d9c7ed79b11 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -4,7 +4,7 @@ - - choice - prompt "Timer frequency" -- default HZ_250 -+ default HZ_500 - help - Allows the configuration of the timer frequency. It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -39,6 +39,13 @@ choice - on SMP and NUMA systems and exactly dividing by both PAL and - NTSC frame rates for video and multimedia work. 
- -+ config HZ_500 -+ bool "500 HZ" -+ help -+ 500 Hz is a balanced timer frequency. Provides fast interactivity -+ on desktops with great smoothness without increasing CPU power -+ consumption and sacrificing the battery life on laptops. -+ - config HZ_1000 - bool "1000 HZ" - help -@@ -52,6 +59,7 @@ config HZ - default 100 if HZ_100 - default 250 if HZ_250 - default 300 if HZ_300 -+ default 500 if HZ_500 - default 1000 if HZ_1000 - - config SCHED_HRTICK - -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 2a202a846757..1d9c7ed79b11 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -4,7 +4,7 @@ - - choice - prompt "Timer frequency" -- default HZ_500 -+ default HZ_750 - help - Allows the configuration of the timer frequency. It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -46,6 +46,13 @@ choice - on desktops with great smoothness without increasing CPU power - consumption and sacrificing the battery life on laptops. - -+ config HZ_750 -+ bool "750 HZ" -+ help -+ 750 Hz is a good timer frequency for desktops. Provides fast -+ interactivity with great smoothness without sacrificing too -+ much throughput. 
-+ - config HZ_1000 - bool "1000 HZ" - help -@@ -60,6 +67,7 @@ config HZ - default 250 if HZ_250 - default 300 if HZ_300 - default 500 if HZ_500 -+ default 750 if HZ_750 - default 1000 if HZ_1000 - - config SCHED_HRTICK - diff --git a/linux58-tkg/linux58-tkg-patches/0005-glitched-pds.patch b/linux58-tkg/linux58-tkg-patches/0005-glitched-pds.patch deleted file mode 100644 index 4307c45..0000000 --- a/linux58-tkg/linux58-tkg-patches/0005-glitched-pds.patch +++ /dev/null @@ -1,90 +0,0 @@ -From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 -From: Tk-Glitch -Date: Wed, 4 Jul 2018 04:30:08 +0200 -Subject: glitched - PDS - -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 2a202a846757..1d9c7ed79b11 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -4,7 +4,7 @@ - - choice - prompt "Timer frequency" -- default HZ_250 -+ default HZ_500 - help - Allows the configuration of the timer frequency. It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -39,6 +39,13 @@ choice - on SMP and NUMA systems and exactly dividing by both PAL and - NTSC frame rates for video and multimedia work. - -+ config HZ_500 -+ bool "500 HZ" -+ help -+ 500 Hz is a balanced timer frequency. Provides fast interactivity -+ on desktops with great smoothness without increasing CPU power -+ consumption and sacrificing the battery life on laptops. -+ - config HZ_1000 - bool "1000 HZ" - help -@@ -52,6 +59,7 @@ config HZ - default 100 if HZ_100 - default 250 if HZ_250 - default 300 if HZ_300 -+ default 500 if HZ_500 - default 1000 if HZ_1000 - - config SCHED_HRTICK - -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 2a202a846757..1d9c7ed79b11 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -4,7 +4,7 @@ - - choice - prompt "Timer frequency" -- default HZ_500 -+ default HZ_750 - help - Allows the configuration of the timer frequency. 
It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -46,6 +46,13 @@ choice - on desktops with great smoothness without increasing CPU power - consumption and sacrificing the battery life on laptops. - -+ config HZ_750 -+ bool "750 HZ" -+ help -+ 750 Hz is a good timer frequency for desktops. Provides fast -+ interactivity with great smoothness without sacrificing too -+ much throughput. -+ - config HZ_1000 - bool "1000 HZ" - help -@@ -60,6 +67,7 @@ config HZ - default 250 if HZ_250 - default 300 if HZ_300 - default 500 if HZ_500 -+ default 750 if HZ_750 - default 1000 if HZ_1000 - - config SCHED_HRTICK - -diff --git a/mm/vmscan.c b/mm/vmscan.c -index 9270a4370d54..30d01e647417 100644 ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -159,7 +159,7 @@ struct scan_control { - /* - * From 0 .. 100. Higher means more swappy. - */ --int vm_swappiness = 60; -+int vm_swappiness = 20; - /* - * The total number of pages which are beyond the high watermark within all - * zones. 
diff --git a/linux58-tkg/linux58-tkg-patches/0005-undead-glitched-ondemand-pds.patch b/linux58-tkg/linux58-tkg-patches/0005-undead-glitched-ondemand-pds.patch deleted file mode 100644 index c1929e8..0000000 --- a/linux58-tkg/linux58-tkg-patches/0005-undead-glitched-ondemand-pds.patch +++ /dev/null @@ -1,18 +0,0 @@ -diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c -index 6b423eebfd5d..61e3271675d6 100644 ---- a/drivers/cpufreq/cpufreq_ondemand.c -+++ b/drivers/cpufreq/cpufreq_ondemand.c -@@ -21,10 +21,10 @@ - #include "cpufreq_ondemand.h" - - /* On-demand governor macros */ --#define DEF_FREQUENCY_UP_THRESHOLD (63) --#define DEF_SAMPLING_DOWN_FACTOR (1) -+#define DEF_FREQUENCY_UP_THRESHOLD (55) -+#define DEF_SAMPLING_DOWN_FACTOR (5) - #define MAX_SAMPLING_DOWN_FACTOR (100000) --#define MICRO_FREQUENCY_UP_THRESHOLD (95) -+#define MICRO_FREQUENCY_UP_THRESHOLD (63) - #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) - #define MIN_FREQUENCY_UP_THRESHOLD (1) - #define MAX_FREQUENCY_UP_THRESHOLD (100) diff --git a/linux58-tkg/linux58-tkg-patches/0005-undead-glitched-pds.patch b/linux58-tkg/linux58-tkg-patches/0005-undead-glitched-pds.patch deleted file mode 100644 index 23271f5..0000000 --- a/linux58-tkg/linux58-tkg-patches/0005-undead-glitched-pds.patch +++ /dev/null @@ -1,166 +0,0 @@ -From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 -From: Tk-Glitch -Date: Wed, 4 Jul 2018 04:30:08 +0200 -Subject: glitched - PDS - -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 2a202a846757..1d9c7ed79b11 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -4,7 +4,7 @@ - - choice - prompt "Timer frequency" -- default HZ_250 -+ default HZ_500 - help - Allows the configuration of the timer frequency. 
It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -39,6 +39,13 @@ choice - on SMP and NUMA systems and exactly dividing by both PAL and - NTSC frame rates for video and multimedia work. - -+ config HZ_500 -+ bool "500 HZ" -+ help -+ 500 Hz is a balanced timer frequency. Provides fast interactivity -+ on desktops with great smoothness without increasing CPU power -+ consumption and sacrificing the battery life on laptops. -+ - config HZ_1000 - bool "1000 HZ" - help -@@ -52,6 +59,7 @@ config HZ - default 100 if HZ_100 - default 250 if HZ_250 - default 300 if HZ_300 -+ default 500 if HZ_500 - default 1000 if HZ_1000 - - config SCHED_HRTICK - -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 2a202a846757..1d9c7ed79b11 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -4,7 +4,7 @@ - - choice - prompt "Timer frequency" -- default HZ_500 -+ default HZ_750 - help - Allows the configuration of the timer frequency. It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -46,6 +46,13 @@ choice - on desktops with great smoothness without increasing CPU power - consumption and sacrificing the battery life on laptops. - -+ config HZ_750 -+ bool "750 HZ" -+ help -+ 750 Hz is a good timer frequency for desktops. Provides fast -+ interactivity with great smoothness without sacrificing too -+ much throughput. -+ - config HZ_1000 - bool "1000 HZ" - help -@@ -60,6 +67,7 @@ config HZ - default 250 if HZ_250 - default 300 if HZ_300 - default 500 if HZ_500 -+ default 750 if HZ_750 - default 1000 if HZ_1000 - - config SCHED_HRTICK - -diff --git a/mm/vmscan.c b/mm/vmscan.c -index 9270a4370d54..30d01e647417 100644 ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -159,7 +159,7 @@ struct scan_control { - /* - * From 0 .. 100. Higher means more swappy. - */ --int vm_swappiness = 60; -+int vm_swappiness = 20; - /* - * The total number of pages which are beyond the high watermark within all - * zones. 
- -diff --git a/init/Kconfig b/init/Kconfig -index 11fd9b502d06..e9bc34d3019b 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -715,6 +715,7 @@ menu "Scheduler features" - config UCLAMP_TASK - bool "Enable utilization clamping for RT/FAIR tasks" - depends on CPU_FREQ_GOV_SCHEDUTIL -+ depends on !SCHED_PDS - help - This feature enables the scheduler to track the clamped utilization - of each CPU based on RUNNABLE tasks scheduled on that CPU. -@@ -948,7 +948,6 @@ config CGROUP_DEVICE - - config CGROUP_CPUACCT - bool "Simple CPU accounting controller" -- depends on !SCHED_PDS - help - Provides a simple controller for monitoring the - total CPU consumed by the tasks in a cgroup. -diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile -index b23231bae996..cab4e5c5b38e 100644 ---- a/kernel/sched/Makefile -+++ b/kernel/sched/Makefile -@@ -24,13 +24,13 @@ obj-y += fair.o rt.o deadline.o - obj-$(CONFIG_SMP) += cpudeadline.o topology.o stop_task.o - obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o - obj-$(CONFIG_SCHED_DEBUG) += debug.o --obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o - endif - obj-y += loadavg.o clock.o cputime.o - obj-y += idle.o - obj-y += wait.o wait_bit.o swait.o completion.o - obj-$(CONFIG_SMP) += cpupri.o pelt.o - obj-$(CONFIG_SCHEDSTATS) += stats.o -+obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o - obj-$(CONFIG_CPU_FREQ) += cpufreq.o - obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o - obj-$(CONFIG_MEMBARRIER) += membarrier.o - -diff --git a/kernel/sched/pds.c b/kernel/sched/pds.c -index 9281ad164..f09a609cf 100644 ---- a/kernel/sched/pds.c -+++ b/kernel/sched/pds.c -@@ -81,6 +81,18 @@ enum { - NR_CPU_AFFINITY_CHK_LEVEL - }; - -+/* -+ * This allows printing both to /proc/sched_debug and -+ * to the console -+ */ -+#define SEQ_printf(m, x...) 
\ -+ do { \ -+ if (m) \ -+ seq_printf(m, x); \ -+ else \ -+ pr_cont(x); \ -+ } while (0) -+ - static inline void print_scheduler_version(void) - { - printk(KERN_INFO "pds: PDS-mq CPU Scheduler 0.99o by Alfred Chen.\n"); -@@ -6353,7 +6365,10 @@ void ia64_set_curr_task(int cpu, struct task_struct *p) - #ifdef CONFIG_SCHED_DEBUG - void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, - struct seq_file *m) --{} -+{ -+ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), -+ get_nr_threads(p)); -+} - - void proc_sched_set_task(struct task_struct *p) - {} diff --git a/linux58-tkg/linux58-tkg-patches/0005-v5.8_undead-pds099o.patch b/linux58-tkg/linux58-tkg-patches/0005-v5.8_undead-pds099o.patch deleted file mode 100644 index 7cb7e91..0000000 --- a/linux58-tkg/linux58-tkg-patches/0005-v5.8_undead-pds099o.patch +++ /dev/null @@ -1,8530 +0,0 @@ -From 68f1a9541ef3185b1021e8e54d2712c7039418d7 Mon Sep 17 00:00:00 2001 -From: Tk-Glitch -Date: Mon, 15 Jun 2020 23:58:41 +0200 -Subject: PDS 099o, initial 5.8 rebase - - -diff --git a/Documentation/scheduler/sched-PDS-mq.txt b/Documentation/scheduler/sched-PDS-mq.txt -new file mode 100644 -index 000000000000..709e86f6487e ---- /dev/null -+++ b/Documentation/scheduler/sched-PDS-mq.txt -@@ -0,0 +1,56 @@ -+ Priority and Deadline based Skiplist multiple queue Scheduler -+ ------------------------------------------------------------- -+ -+CONTENT -+======== -+ -+ 0. Development -+ 1. Overview -+ 1.1 Design goal -+ 1.2 Design summary -+ 2. Design Detail -+ 2.1 Skip list implementation -+ 2.2 Task preempt -+ 2.3 Task policy, priority and deadline -+ 2.4 Task selection -+ 2.5 Run queue balance -+ 2.6 Task migration -+ -+ -+0. Development -+============== -+ -+Priority and Deadline based Skiplist multiple queue scheduler, referred to as -+PDS from here on, is developed upon the enhancement patchset VRQ(Variable Run -+Queue) for BFS(Brain Fuck Scheduler by Con Kolivas). 
PDS inherits the existing -+design from VRQ and inspired by the introduction of skiplist data structure -+to the scheduler by Con Kolivas. However, PDS is different from MuQSS(Multiple -+Queue Skiplist Scheduler, the successor after BFS) in many ways. -+ -+1. Overview -+=========== -+ -+1.1 Design goal -+--------------- -+ -+PDS is designed to make the cpu process scheduler code to be simple, but while -+efficiency and scalable. Be Simple, the scheduler code will be easy to be read -+and the behavious of scheduler will be easy to predict. Be efficiency, the -+scheduler shall be well balance the thoughput performance and task interactivity -+at the same time for different properties the tasks behave. Be scalable, the -+performance of the scheduler should be in good shape with the glowing of -+workload or with the growing of the cpu numbers. -+ -+1.2 Design summary -+------------------ -+ -+PDS is described as a multiple run queues cpu scheduler. Each cpu has its own -+run queue. A heavry customized skiplist is used as the backend data structure -+of the cpu run queue. Tasks in run queue is sorted by priority then virtual -+deadline(simplfy to just deadline from here on). In PDS, balance action among -+run queues are kept as less as possible to reduce the migration cost. Cpumask -+data structure is widely used in cpu affinity checking and cpu preemption/ -+selection to make PDS scalable with increasing cpu number. -+ -+ -+To be continued... -diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c -index f18d5067cd0f..fe489fc01c73 100644 ---- a/arch/powerpc/platforms/cell/spufs/sched.c -+++ b/arch/powerpc/platforms/cell/spufs/sched.c -@@ -51,11 +51,6 @@ static struct task_struct *spusched_task; - static struct timer_list spusched_timer; - static struct timer_list spuloadavg_timer; - --/* -- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0). 
-- */ --#define NORMAL_PRIO 120 -- - /* - * Frequency of the spu scheduler tick. By default we do one SPU scheduler - * tick for every 10 CPU scheduler ticks. -diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig -index 2d3f963fd6f1..5f41ead019b1 100644 ---- a/arch/x86/Kconfig -+++ b/arch/x86/Kconfig -@@ -1006,6 +1006,22 @@ config NR_CPUS - config SCHED_SMT - def_bool y if SMP - -+config SMT_NICE -+ bool "SMT (Hyperthreading) aware nice priority and policy support" -+ depends on SCHED_PDS && SCHED_SMT -+ default y -+ ---help--- -+ Enabling Hyperthreading on Intel CPUs decreases the effectiveness -+ of the use of 'nice' levels and different scheduling policies -+ (e.g. realtime) due to sharing of CPU power between hyperthreads. -+ SMT nice support makes each logical CPU aware of what is running on -+ its hyperthread siblings, maintaining appropriate distribution of -+ CPU according to nice levels and scheduling policies at the expense -+ of slightly increased overhead. -+ -+ If unsure say Y here. 
-+ -+ - config SCHED_MC - def_bool y - prompt "Multi-core scheduler support" -diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c -index 737ff3b9c2c0..b5bc5a1b6de7 100644 ---- a/drivers/cpufreq/cpufreq_conservative.c -+++ b/drivers/cpufreq/cpufreq_conservative.c -@@ -28,8 +28,8 @@ struct cs_dbs_tuners { - }; - - /* Conservative governor macros */ --#define DEF_FREQUENCY_UP_THRESHOLD (80) --#define DEF_FREQUENCY_DOWN_THRESHOLD (20) -+#define DEF_FREQUENCY_UP_THRESHOLD (63) -+#define DEF_FREQUENCY_DOWN_THRESHOLD (26) - #define DEF_FREQUENCY_STEP (5) - #define DEF_SAMPLING_DOWN_FACTOR (1) - #define MAX_SAMPLING_DOWN_FACTOR (10) -diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c -index 82a4d37ddecb..1130e0f5db72 100644 ---- a/drivers/cpufreq/cpufreq_ondemand.c -+++ b/drivers/cpufreq/cpufreq_ondemand.c -@@ -18,7 +18,7 @@ - #include "cpufreq_ondemand.h" - - /* On-demand governor macros */ --#define DEF_FREQUENCY_UP_THRESHOLD (80) -+#define DEF_FREQUENCY_UP_THRESHOLD (63) - #define DEF_SAMPLING_DOWN_FACTOR (1) - #define MAX_SAMPLING_DOWN_FACTOR (100000) - #define MICRO_FREQUENCY_UP_THRESHOLD (95) -@@ -127,7 +127,7 @@ static void dbs_freq_increase(struct cpufreq_policy *policy, unsigned int freq) - } - - /* -- * Every sampling_rate, we check, if current idle time is less than 20% -+ * Every sampling_rate, we check, if current idle time is less than 37% - * (default), then we try to increase frequency. Else, we adjust the frequency - * proportional to load. 
- */ -diff --git a/fs/proc/base.c b/fs/proc/base.c -index eb2255e95f62..62b8cedbccb6 100644 ---- a/fs/proc/base.c -+++ b/fs/proc/base.c -@@ -479,7 +479,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, - seq_puts(m, "0 0 0\n"); - else - seq_printf(m, "%llu %llu %lu\n", -- (unsigned long long)task->se.sum_exec_runtime, -+ (unsigned long long)tsk_seruntime(task), - (unsigned long long)task->sched_info.run_delay, - task->sched_info.pcount); - -diff --git a/include/linux/init_task.h b/include/linux/init_task.h -index 2c620d7ac432..1a7987c40c80 100644 ---- a/include/linux/init_task.h -+++ b/include/linux/init_task.h -@@ -36,7 +36,11 @@ extern struct cred init_cred; - #define INIT_PREV_CPUTIME(x) - #endif - -+#ifdef CONFIG_SCHED_PDS -+#define INIT_TASK_COMM "PDS" -+#else - #define INIT_TASK_COMM "swapper" -+#endif /* !CONFIG_SCHED_PDS */ - - /* Attach to the init_task data structure for proper alignment */ - #ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK -diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h -index fed6ba96c527..f03a5ee419a1 100644 ---- a/include/linux/jiffies.h -+++ b/include/linux/jiffies.h -@@ -169,7 +169,7 @@ static inline u64 get_jiffies_64(void) - * Have the 32 bit jiffies value wrap 5 minutes after boot - * so jiffies wrap bugs show up earlier. 
- */ --#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ)) -+#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-10*HZ)) - - /* - * Change timeval to jiffies, trying to avoid the -diff --git a/kernel/smp.c b/kernel/smp.c -index 4418f5cb8324..2b51afac5b06 100644 ---- a/kernel/smp.c -+++ b/kernel/smp.c -diff --git a/include/linux/sched.h b/include/linux/sched.h -index 4418f5cb8324..2b51afac5b06 100644 ---- a/include/linux/sched.h -+++ b/include/linux/sched.h -@@ -31,6 +31,7 @@ - #include - #include - #include -+#include - - /* task_struct member predeclarations (sorted alphabetically): */ - struct audit_context; -@@ -652,9 +653,13 @@ struct task_struct { - unsigned int flags; - unsigned int ptrace; - --#ifdef CONFIG_SMP -+#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_PDS) - int on_cpu; -+#endif -+#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_PDS) - struct __call_single_node wake_entry; -+#endif -+#ifdef CONFIG_SMP - #ifdef CONFIG_THREAD_INFO_IN_TASK - /* Current CPU: */ - unsigned int cpu; -@@ -663,6 +668,7 @@ struct task_struct { - unsigned long wakee_flip_decay_ts; - struct task_struct *last_wakee; - -+#ifndef CONFIG_SCHED_PDS - /* - * recent_used_cpu is initially set as the last CPU used by a task - * that wakes affine another task. Waker/wakee relationships can -@@ -671,6 +677,7 @@ struct task_struct { - * used CPU that may be idle. 
- */ - int recent_used_cpu; -+#endif /* CONFIG_SCHED_PDS */ - int wake_cpu; - #endif - int on_rq; -@@ -680,13 +687,27 @@ struct task_struct { - int normal_prio; - unsigned int rt_priority; - -+#ifdef CONFIG_SCHED_PDS -+ int time_slice; -+ u64 deadline; -+ /* skip list level */ -+ int sl_level; -+ /* skip list node */ -+ struct skiplist_node sl_node; -+ /* 8bits prio and 56bits deadline for quick processing */ -+ u64 priodl; -+ u64 last_ran; -+ /* sched_clock time spent running */ -+ u64 sched_time; -+#else /* CONFIG_SCHED_PDS */ - const struct sched_class *sched_class; - struct sched_entity se; - struct sched_rt_entity rt; -+ struct sched_dl_entity dl; -+#endif - #ifdef CONFIG_CGROUP_SCHED - struct task_group *sched_task_group; - #endif -- struct sched_dl_entity dl; - - #ifdef CONFIG_UCLAMP_TASK - /* Clamp values requested for a scheduling entity */ -@@ -1306,6 +1327,29 @@ struct task_struct { - */ - }; - -+#ifdef CONFIG_SCHED_PDS -+void cpu_scaling(int cpu); -+void cpu_nonscaling(int cpu); -+#define tsk_seruntime(t) ((t)->sched_time) -+/* replace the uncertian rt_timeout with 0UL */ -+#define tsk_rttimeout(t) (0UL) -+ -+#define task_running_idle(p) ((p)->prio == IDLE_PRIO) -+#else /* CFS */ -+extern int runqueue_is_locked(int cpu); -+static inline void cpu_scaling(int cpu) -+{ -+} -+ -+static inline void cpu_nonscaling(int cpu) -+{ -+} -+#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) -+#define tsk_rttimeout(t) ((t)->rt.timeout) -+ -+#define iso_task(p) (false) -+#endif /* CONFIG_SCHED_PDS */ -+ - static inline struct pid *task_pid(struct task_struct *task) - { - return task->thread_pid; -diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h -index 1aff00b65f3c..a5e5fc2c9170 100644 ---- a/include/linux/sched/deadline.h -+++ b/include/linux/sched/deadline.h -@@ -1,5 +1,22 @@ - /* SPDX-License-Identifier: GPL-2.0 */ - -+#ifdef CONFIG_SCHED_PDS -+ -+#define __tsk_deadline(p) ((p)->deadline) -+ -+static inline int dl_prio(int prio) -+{ -+ 
return 1; -+} -+ -+static inline int dl_task(struct task_struct *p) -+{ -+ return 1; -+} -+#else -+ -+#define __tsk_deadline(p) ((p)->dl.deadline) -+ - /* - * SCHED_DEADLINE tasks has negative priorities, reflecting - * the fact that any of them has higher prio than RT and -@@ -19,6 +36,7 @@ static inline int dl_task(struct task_struct *p) - { - return dl_prio(p->prio); - } -+#endif /* CONFIG_SCHED_PDS */ - - static inline bool dl_time_before(u64 a, u64 b) - { -diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h -index 7d64feafc408..fba04bb91492 100644 ---- a/include/linux/sched/prio.h -+++ b/include/linux/sched/prio.h -@@ -20,7 +20,18 @@ - */ - - #define MAX_USER_RT_PRIO 100 -+ -+#ifdef CONFIG_SCHED_PDS -+#define ISO_PRIO (MAX_USER_RT_PRIO) -+ -+#define MAX_RT_PRIO ((MAX_USER_RT_PRIO) + 1) -+ -+#define NORMAL_PRIO (MAX_RT_PRIO) -+#define IDLE_PRIO ((MAX_RT_PRIO) + 1) -+#define PRIO_LIMIT ((IDLE_PRIO) + 1) -+#else /* !CONFIG_SCHED_PDS */ - #define MAX_RT_PRIO MAX_USER_RT_PRIO -+#endif /* CONFIG_SCHED_PDS */ - - #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) - #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) -diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h -index e5af028c08b4..a96012e6f15e 100644 ---- a/include/linux/sched/rt.h -+++ b/include/linux/sched/rt.h -@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) - - if (policy == SCHED_FIFO || policy == SCHED_RR) - return true; -+#ifndef CONFIG_SCHED_PDS - if (policy == SCHED_DEADLINE) - return true; -+#endif - return false; - } - -diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h -index 38359071236a..90328ccd527f 100644 ---- a/include/linux/sched/task.h -+++ b/include/linux/sched/task.h -@@ -106,7 +106,7 @@ extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); - extern void free_task(struct task_struct *tsk); - - /* sched_exec is called by processes performing an exec */ --#ifdef CONFIG_SMP -+#if defined(CONFIG_SMP) && 
!defined(CONFIG_SCHED_PDS) - extern void sched_exec(void); - #else - #define sched_exec() {} -diff --git a/include/linux/skip_list.h b/include/linux/skip_list.h -new file mode 100644 -index 000000000000..713fedd8034f ---- /dev/null -+++ b/include/linux/skip_list.h -@@ -0,0 +1,177 @@ -+/* -+ Copyright (C) 2016 Alfred Chen. -+ -+ Code based on Con Kolivas's skip list implementation for BFS, and -+ which is based on example originally by William Pugh. -+ -+Skip Lists are a probabilistic alternative to balanced trees, as -+described in the June 1990 issue of CACM and were invented by -+William Pugh in 1987. -+ -+A couple of comments about this implementation: -+ -+This file only provides a infrastructure of skip list. -+ -+skiplist_node is embedded into container data structure, to get rid the -+dependency of kmalloc/kfree operation in scheduler code. -+ -+A customized search function should be defined using DEFINE_SKIPLIST_INSERT -+macro and be used for skip list insert operation. -+ -+Random Level is also not defined in this file, instead, it should be customized -+implemented and set to node->level then pass to the customized skiplist_insert -+function. -+ -+Levels start at zero and go up to (NUM_SKIPLIST_LEVEL -1) -+ -+NUM_SKIPLIST_LEVEL in this implementation is 8 instead of origin 16, -+considering that there will be 256 entries to enable the top level when using -+random level p=0.5, and that number is more than enough for a run queue usage -+in a scheduler usage. And it also help to reduce the memory usage of the -+embedded skip list node in task_struct to about 50%. -+ -+The insertion routine has been implemented so as to use the -+dirty hack described in the CACM paper: if a random level is -+generated that is more than the current maximum level, the -+current maximum level plus one is used instead. 
-+ -+BFS Notes: In this implementation of skiplists, there are bidirectional -+next/prev pointers and the insert function returns a pointer to the actual -+node the value is stored. The key here is chosen by the scheduler so as to -+sort tasks according to the priority list requirements and is no longer used -+by the scheduler after insertion. The scheduler lookup, however, occurs in -+O(1) time because it is always the first item in the level 0 linked list. -+Since the task struct stores a copy of the node pointer upon skiplist_insert, -+it can also remove it much faster than the original implementation with the -+aid of prev<->next pointer manipulation and no searching. -+*/ -+#ifndef _LINUX_SKIP_LIST_H -+#define _LINUX_SKIP_LIST_H -+ -+#include -+ -+#define NUM_SKIPLIST_LEVEL (8) -+ -+struct skiplist_node { -+ int level; /* Levels in this node */ -+ struct skiplist_node *next[NUM_SKIPLIST_LEVEL]; -+ struct skiplist_node *prev[NUM_SKIPLIST_LEVEL]; -+}; -+ -+#define SKIPLIST_NODE_INIT(name) { 0,\ -+ {&name, &name, &name, &name,\ -+ &name, &name, &name, &name},\ -+ {&name, &name, &name, &name,\ -+ &name, &name, &name, &name},\ -+ } -+ -+static inline void INIT_SKIPLIST_NODE(struct skiplist_node *node) -+{ -+ /* only level 0 ->next matters in skiplist_empty()*/ -+ WRITE_ONCE(node->next[0], node); -+} -+ -+/** -+ * FULL_INIT_SKIPLIST_NODE -- fully init a skiplist_node, expecially for header -+ * @node: the skip list node to be inited. -+ */ -+static inline void FULL_INIT_SKIPLIST_NODE(struct skiplist_node *node) -+{ -+ int i; -+ -+ node->level = 0; -+ for (i = 0; i < NUM_SKIPLIST_LEVEL; i++) { -+ WRITE_ONCE(node->next[i], node); -+ node->prev[i] = node; -+ } -+} -+ -+/** -+ * skiplist_empty - test whether a skip list is empty -+ * @head: the skip list to test. 
-+ */ -+static inline int skiplist_empty(const struct skiplist_node *head) -+{ -+ return READ_ONCE(head->next[0]) == head; -+} -+ -+/** -+ * skiplist_entry - get the struct for this entry -+ * @ptr: the &struct skiplist_node pointer. -+ * @type: the type of the struct this is embedded in. -+ * @member: the name of the skiplist_node within the struct. -+ */ -+#define skiplist_entry(ptr, type, member) \ -+ container_of(ptr, type, member) -+ -+/** -+ * DEFINE_SKIPLIST_INSERT_FUNC -- macro to define a customized skip list insert -+ * function, which takes two parameters, first one is the header node of the -+ * skip list, second one is the skip list node to be inserted -+ * @func_name: the customized skip list insert function name -+ * @search_func: the search function to be used, which takes two parameters, -+ * 1st one is the itrator of skiplist_node in the list, the 2nd is the skip list -+ * node to be inserted, the function should return true if search should be -+ * continued, otherwise return false. 
-+ * Returns 1 if @node is inserted as the first item of skip list at level zero, -+ * otherwise 0 -+ */ -+#define DEFINE_SKIPLIST_INSERT_FUNC(func_name, search_func)\ -+static inline int func_name(struct skiplist_node *head, struct skiplist_node *node)\ -+{\ -+ struct skiplist_node *update[NUM_SKIPLIST_LEVEL];\ -+ struct skiplist_node *p, *q;\ -+ int k = head->level;\ -+\ -+ p = head;\ -+ do {\ -+ while (q = p->next[k], q != head && search_func(q, node))\ -+ p = q;\ -+ update[k] = p;\ -+ } while (--k >= 0);\ -+\ -+ k = node->level;\ -+ if (unlikely(k > head->level)) {\ -+ node->level = k = ++head->level;\ -+ update[k] = head;\ -+ }\ -+\ -+ do {\ -+ p = update[k];\ -+ q = p->next[k];\ -+ node->next[k] = q;\ -+ p->next[k] = node;\ -+ node->prev[k] = p;\ -+ q->prev[k] = node;\ -+ } while (--k >= 0);\ -+\ -+ return (p == head);\ -+} -+ -+/** -+ * skiplist_del_init -- delete skip list node from a skip list and reset it's -+ * init state -+ * @head: the header node of the skip list to be deleted from. -+ * @node: the skip list node to be deleted, the caller need to ensure @node is -+ * in skip list which @head represent. 
-+ * Returns 1 if @node is the first item of skip level at level zero, otherwise 0 -+ */ -+static inline int -+skiplist_del_init(struct skiplist_node *head, struct skiplist_node *node) -+{ -+ int l, m = node->level; -+ -+ for (l = 0; l <= m; l++) { -+ node->prev[l]->next[l] = node->next[l]; -+ node->next[l]->prev[l] = node->prev[l]; -+ } -+ if (m == head->level && m > 0) { -+ while (head->next[m] == head && m > 0) -+ m--; -+ head->level = m; -+ } -+ INIT_SKIPLIST_NODE(node); -+ -+ return (node->prev[0] == head); -+} -+#endif /* _LINUX_SKIP_LIST_H */ -diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h -index 3bac0a8ceab2..d6d384ddb57d 100644 ---- a/include/uapi/linux/sched.h -+++ b/include/uapi/linux/sched.h -@@ -115,7 +115,10 @@ struct clone_args { - #define SCHED_FIFO 1 - #define SCHED_RR 2 - #define SCHED_BATCH 3 --/* SCHED_ISO: reserved but not implemented yet */ -+/* SCHED_ISO: Implemented in BFS/MuQSSPDS only */ -+ -+#define SCHED_ISO 4 -+ - #define SCHED_IDLE 5 - #define SCHED_DEADLINE 6 - -diff --git a/init/Kconfig b/init/Kconfig -index 74a5ac65644f..e4fd406b58dd 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -61,6 +61,21 @@ config THREAD_INFO_IN_TASK - - menu "General setup" - -+config SCHED_PDS -+ bool "PDS-mq cpu scheduler" -+ help -+ The Priority and Deadline based Skip list multiple queue CPU -+ Scheduler for excellent interactivity and responsiveness on the -+ desktop and solid scalability on normal hardware and commodity -+ servers. -+ -+ Currently incompatible with the Group CPU scheduler, and RCU TORTURE -+ TEST so these options are disabled. -+ -+ Say Y here. -+ default y -+ -+ - config BROKEN - bool - -@@ -777,6 +792,7 @@ config NUMA_BALANCING - depends on ARCH_SUPPORTS_NUMA_BALANCING - depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY - depends on SMP && NUMA && MIGRATION -+ depends on !SCHED_PDS - help - This option adds support for automatic NUMA aware memory/task placement. 
- The mechanism is quite primitive and is based on migrating memory when -@@ -878,7 +894,7 @@ menuconfig CGROUP_SCHED - bandwidth allocation to such task groups. It uses cgroups to group - tasks. - --if CGROUP_SCHED -+if CGROUP_SCHED && !SCHED_PDS - config FAIR_GROUP_SCHED - bool "Group scheduling for SCHED_OTHER" - depends on CGROUP_SCHED -@@ -1007,6 +1023,7 @@ config CGROUP_DEVICE - - config CGROUP_CPUACCT - bool "Simple CPU accounting controller" -+ depends on !SCHED_PDS - help - Provides a simple controller for monitoring the - total CPU consumed by the tasks in a cgroup. -@@ -1134,6 +1151,7 @@ config CHECKPOINT_RESTORE - - config SCHED_AUTOGROUP - bool "Automatic process group scheduling" -+ depends on !SCHED_PDS - select CGROUPS - select CGROUP_SCHED - select FAIR_GROUP_SCHED -diff --git a/init/init_task.c b/init/init_task.c -index bd403ed3e418..162d3deddd45 100644 ---- a/init/init_task.c -+++ b/init/init_task.c -@@ -59,6 +59,126 @@ struct task_struct init_task - __init_task_data - #endif - = { -+#ifdef CONFIG_SCHED_PDS -+#ifdef CONFIG_THREAD_INFO_IN_TASK -+ .thread_info = INIT_THREAD_INFO(init_task), -+ .stack_refcount = ATOMIC_INIT(1), -+#endif -+ .state = 0, -+ .stack = init_stack, -+ .usage = ATOMIC_INIT(2), -+ .flags = PF_KTHREAD, -+ .prio = NORMAL_PRIO, -+ .static_prio = MAX_PRIO - 20, -+ .normal_prio = NORMAL_PRIO, -+ .deadline = 0, /* PDS only */ -+ .policy = SCHED_NORMAL, -+ .cpus_ptr = &init_task.cpus_mask, -+ .cpus_mask = CPU_MASK_ALL, -+ .nr_cpus_allowed= NR_CPUS, -+ .mm = NULL, -+ .active_mm = &init_mm, -+ .restart_block = { -+ .fn = do_no_restart_syscall, -+ }, -+ .sl_level = 0, /* PDS only */ -+ .sl_node = SKIPLIST_NODE_INIT(init_task.sl_node), /* PDS only */ -+ .time_slice = HZ, /* PDS only */ -+ .tasks = LIST_HEAD_INIT(init_task.tasks), -+#ifdef CONFIG_SMP -+ .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), -+#endif -+#ifdef CONFIG_CGROUP_SCHED -+ .sched_task_group = &root_task_group, -+#endif -+ .ptraced = 
LIST_HEAD_INIT(init_task.ptraced), -+ .ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry), -+ .real_parent = &init_task, -+ .parent = &init_task, -+ .children = LIST_HEAD_INIT(init_task.children), -+ .sibling = LIST_HEAD_INIT(init_task.sibling), -+ .group_leader = &init_task, -+ RCU_POINTER_INITIALIZER(real_cred, &init_cred), -+ RCU_POINTER_INITIALIZER(cred, &init_cred), -+ .comm = INIT_TASK_COMM, -+ .thread = INIT_THREAD, -+ .fs = &init_fs, -+ .files = &init_files, -+ .signal = &init_signals, -+ .sighand = &init_sighand, -+ .nsproxy = &init_nsproxy, -+ .pending = { -+ .list = LIST_HEAD_INIT(init_task.pending.list), -+ .signal = {{0}} -+ }, -+ .blocked = {{0}}, -+ .alloc_lock = __SPIN_LOCK_UNLOCKED(init_task.alloc_lock), -+ .journal_info = NULL, -+ INIT_CPU_TIMERS(init_task) -+ .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock), -+ .timer_slack_ns = 50000, /* 50 usec default slack */ -+ .thread_pid = &init_struct_pid, -+ .thread_group = LIST_HEAD_INIT(init_task.thread_group), -+ .thread_node = LIST_HEAD_INIT(init_signals.thread_head), -+#ifdef CONFIG_AUDITSYSCALL -+ .loginuid = INVALID_UID, -+ .sessionid = AUDIT_SID_UNSET, -+#endif -+#ifdef CONFIG_PERF_EVENTS -+ .perf_event_mutex = __MUTEX_INITIALIZER(init_task.perf_event_mutex), -+ .perf_event_list = LIST_HEAD_INIT(init_task.perf_event_list), -+#endif -+#ifdef CONFIG_PREEMPT_RCU -+ .rcu_read_lock_nesting = 0, -+ .rcu_read_unlock_special.s = 0, -+ .rcu_node_entry = LIST_HEAD_INIT(init_task.rcu_node_entry), -+ .rcu_blocked_node = NULL, -+#endif -+#ifdef CONFIG_TASKS_RCU -+ .rcu_tasks_holdout = false, -+ .rcu_tasks_holdout_list = LIST_HEAD_INIT(init_task.rcu_tasks_holdout_list), -+ .rcu_tasks_idle_cpu = -1, -+#endif -+#ifdef CONFIG_CPUSETS -+ .mems_allowed_seq = SEQCNT_ZERO(init_task.mems_allowed_seq), -+#endif -+#ifdef CONFIG_RT_MUTEXES -+ .pi_waiters = RB_ROOT_CACHED, -+ .pi_top_task = NULL, -+#endif -+ INIT_PREV_CPUTIME(init_task) -+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -+ .vtime.seqcount = 
SEQCNT_ZERO(init_task.vtime_seqcount), -+ .vtime.starttime = 0, -+ .vtime.state = VTIME_SYS, -+#endif -+#ifdef CONFIG_NUMA_BALANCING -+ .numa_preferred_nid = -1, -+ .numa_group = NULL, -+ .numa_faults = NULL, -+#endif -+#ifdef CONFIG_KASAN -+ .kasan_depth = 1, -+#endif -+#ifdef CONFIG_TRACE_IRQFLAGS -+ .softirqs_enabled = 1, -+#endif -+#ifdef CONFIG_LOCKDEP -+ .lockdep_recursion = 0, -+#endif -+#ifdef CONFIG_FUNCTION_GRAPH_TRACER -+ .ret_stack = NULL, -+#endif -+#if defined(CONFIG_TRACING) && defined(CONFIG_PREEMPT) -+ .trace_recursion = 0, -+#endif -+#ifdef CONFIG_LIVEPATCH -+ .patch_state = KLP_UNDEFINED, -+#endif -+#ifdef CONFIG_SECURITY -+ .security = NULL, -+#endif -+#else /* CONFIG_SCHED_PDS */ - #ifdef CONFIG_THREAD_INFO_IN_TASK - .thread_info = INIT_THREAD_INFO(init_task), - .stack_refcount = REFCOUNT_INIT(1), -@@ -182,6 +302,7 @@ struct task_struct init_task - #ifdef CONFIG_SECURITY - .security = NULL, - #endif -+#endif /* CONFIG_SCHED_PDS */ - }; - EXPORT_SYMBOL(init_task); - -diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c -index 729d3a5c772e..10a7c52b90d5 100644 ---- a/kernel/cgroup/cpuset.c -+++ b/kernel/cgroup/cpuset.c -@@ -636,7 +636,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) - return ret; - } - --#ifdef CONFIG_SMP -+#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_PDS) - /* - * Helper routine for generate_sched_domains(). - * Do cpusets a, b have overlapping effective cpus_allowed masks? 
-@@ -1009,7 +1009,7 @@ static void rebuild_sched_domains_locked(void) - /* Have scheduler rebuild the domains */ - partition_and_rebuild_sched_domains(ndoms, doms, attr); - } --#else /* !CONFIG_SMP */ -+#else /* !CONFIG_SMP || CONFIG_SCHED_PDS */ - static void rebuild_sched_domains_locked(void) - { - } -diff --git a/kernel/delayacct.c b/kernel/delayacct.c -index 27725754ac99..769d773c7182 100644 ---- a/kernel/delayacct.c -+++ b/kernel/delayacct.c -@@ -106,7 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) - */ - t1 = tsk->sched_info.pcount; - t2 = tsk->sched_info.run_delay; -- t3 = tsk->se.sum_exec_runtime; -+ t3 = tsk_seruntime(tsk); - - d->cpu_count += t1; - -diff --git a/kernel/exit.c b/kernel/exit.c -index ce2a75bc0ade..f0f864bc1ab9 100644 ---- a/kernel/exit.c -+++ b/kernel/exit.c -@@ -122,7 +122,7 @@ static void __exit_signal(struct task_struct *tsk) - sig->curr_target = next_thread(tsk); - } - -- add_device_randomness((const void*) &tsk->se.sum_exec_runtime, -+ add_device_randomness((const void*) &tsk_seruntime(tsk), - sizeof(unsigned long long)); - - /* -@@ -143,7 +143,7 @@ static void __exit_signal(struct task_struct *tsk) - sig->inblock += task_io_get_inblock(tsk); - sig->oublock += task_io_get_oublock(tsk); - task_io_accounting_add(&sig->ioac, &tsk->ioac); -- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; -+ sig->sum_sched_runtime += tsk_seruntime(tsk); - sig->nr_threads--; - __unhash_process(tsk, group_dead); - write_sequnlock(&sig->stats_lock); -diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c -index f6310f848f34..b5de980c7d4e 100644 ---- a/kernel/livepatch/transition.c -+++ b/kernel/livepatch/transition.c -@@ -306,7 +306,11 @@ static bool klp_try_switch_task(struct task_struct *task) - */ - rq = task_rq_lock(task, &flags); - -+#ifdef CONFIG_SCHED_PDS -+ if (task_running(task) && task != current) { -+#else - if (task_running(rq, task) && task != current) { -+#endif - snprintf(err_buf, 
STACK_ERR_BUF_SIZE, - "%s: %s:%d is running\n", __func__, task->comm, - task->pid); -diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c -index c9f090d64f00..063d15a1ab8b 100644 ---- a/kernel/locking/rtmutex.c -+++ b/kernel/locking/rtmutex.c -@@ -229,7 +229,7 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, - * Only use with rt_mutex_waiter_{less,equal}() - */ - #define task_to_waiter(p) \ -- &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline } -+ &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = __tsk_deadline(p) } - - static inline int - rt_mutex_waiter_less(struct rt_mutex_waiter *left, -@@ -680,7 +680,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, - * the values of the node being removed. - */ - waiter->prio = task->prio; -- waiter->deadline = task->dl.deadline; -+ waiter->deadline = __tsk_deadline(task); - - rt_mutex_enqueue(lock, waiter); - -@@ -953,7 +953,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, - waiter->task = task; - waiter->lock = lock; - waiter->prio = task->prio; -- waiter->deadline = task->dl.deadline; -+ waiter->deadline = __tsk_deadline(task); - - /* Get the top priority waiter on the lock */ - if (rt_mutex_has_waiters(lock)) -diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile -index 21fb5a5662b5..8ebe4e33fb5f 100644 ---- a/kernel/sched/Makefile -+++ b/kernel/sched/Makefile -@@ -16,15 +16,21 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) - CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer - endif - --obj-y += core.o loadavg.o clock.o cputime.o --obj-y += idle.o fair.o rt.o deadline.o --obj-y += wait.o wait_bit.o swait.o completion.o -- --obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o -+ifdef CONFIG_SCHED_PDS -+obj-y += pds.o -+else -+obj-y += core.o -+obj-y += fair.o rt.o deadline.o -+obj-$(CONFIG_SMP) += cpudeadline.o topology.o stop_task.o - obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o 
--obj-$(CONFIG_SCHEDSTATS) += stats.o - obj-$(CONFIG_SCHED_DEBUG) += debug.o - obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o -+endif -+obj-y += loadavg.o clock.o cputime.o -+obj-y += idle.o -+obj-y += wait.o wait_bit.o swait.o completion.o -+obj-$(CONFIG_SMP) += cpupri.o pelt.o -+obj-$(CONFIG_SCHEDSTATS) += stats.o - obj-$(CONFIG_CPU_FREQ) += cpufreq.o - obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o - obj-$(CONFIG_MEMBARRIER) += membarrier.o -diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c -index 7fbaee24c824..28377ad56248 100644 ---- a/kernel/sched/cpufreq_schedutil.c -+++ b/kernel/sched/cpufreq_schedutil.c -@@ -183,6 +183,7 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, - return cpufreq_driver_resolve_freq(policy, freq); - } - -+#ifndef CONFIG_SCHED_PDS - /* - * This function computes an effective utilization for the given CPU, to be - * used for frequency selection given the linear relation: f = u * f_max. -@@ -300,6 +301,13 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) - - return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL); - } -+#else /* CONFIG_SCHED_PDS */ -+static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) -+{ -+ sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu); -+ return sg_cpu->max; -+} -+#endif - - /** - * sugov_iowait_reset() - Reset the IO boost status of a CPU. 
-@@ -443,7 +451,9 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } - */ - static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) - { -+#ifndef CONFIG_SCHED_PDS - if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) -+#endif - sg_policy->limits_changed = true; - } - -@@ -686,6 +696,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) - } - - ret = sched_setattr_nocheck(thread, &attr); -+ - if (ret) { - kthread_stop(thread); - pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__); -@@ -916,6 +927,7 @@ static int __init sugov_register(void) - core_initcall(sugov_register); - - #ifdef CONFIG_ENERGY_MODEL -+#ifndef CONFIG_SCHED_PDS - extern bool sched_energy_update; - extern struct mutex sched_energy_mutex; - -@@ -946,4 +958,10 @@ void sched_cpufreq_governor_change(struct cpufreq_policy *policy, - } - - } -+#else /* CONFIG_SCHED_PDS */ -+void sched_cpufreq_governor_change(struct cpufreq_policy *policy, -+ struct cpufreq_governor *old_gov) -+{ -+} -+#endif - #endif -diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c -index ff9435dee1df..1377ea3d1b76 100644 ---- a/kernel/sched/cputime.c -+++ b/kernel/sched/cputime.c -@@ -122,7 +122,12 @@ void account_user_time(struct task_struct *p, u64 cputime) - p->utime += cputime; - account_group_user_time(p, cputime); - -+#ifdef CONFIG_SCHED_PDS -+ index = (task_nice(p) > 0 || task_running_idle(p)) ? CPUTIME_NICE : -+ CPUTIME_USER; -+#else - index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; -+#endif - - /* Add user time to cpustat. */ - task_group_account_field(p, index, cputime); -@@ -146,7 +151,11 @@ void account_guest_time(struct task_struct *p, u64 cputime) - p->gtime += cputime; - - /* Add guest time to cpustat. 
*/ -+#ifdef CONFIG_SCHED_PDS -+ if (task_nice(p) > 0 || task_running_idle(p)) { -+#else - if (task_nice(p) > 0) { -+#endif - cpustat[CPUTIME_NICE] += cputime; - cpustat[CPUTIME_GUEST_NICE] += cputime; - } else { -@@ -269,7 +278,7 @@ static inline u64 account_other_time(u64 max) - #ifdef CONFIG_64BIT - static inline u64 read_sum_exec_runtime(struct task_struct *t) - { -- return t->se.sum_exec_runtime; -+ return tsk_seruntime(t); - } - #else - static u64 read_sum_exec_runtime(struct task_struct *t) -@@ -279,7 +288,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t) - struct rq *rq; - - rq = task_rq_lock(t, &rf); -- ns = t->se.sum_exec_runtime; -+ ns = tsk_seruntime(t); - task_rq_unlock(rq, t, &rf); - - return ns; -@@ -658,7 +667,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, - void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) - { - struct task_cputime cputime = { -- .sum_exec_runtime = p->se.sum_exec_runtime, -+ .sum_exec_runtime = tsk_seruntime(p), - }; - - task_cputime(p, &cputime.utime, &cputime.stime); -diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c -index b743bf38f08f..16e5754af1cf 100644 ---- a/kernel/sched/idle.c -+++ b/kernel/sched/idle.c -@@ -361,6 +361,7 @@ void cpu_startup_entry(enum cpuhp_state state) - do_idle(); - } - -+#ifndef CONFIG_SCHED_PDS - /* - * idle-task scheduling class. - */ -@@ -481,3 +482,4 @@ const struct sched_class idle_sched_class = { - .switched_to = switched_to_idle, - .update_curr = update_curr_idle, - }; -+#endif -diff --git a/kernel/sched/pds.c b/kernel/sched/pds.c -new file mode 100644 -index 000000000000..02d7d5a67c77 ---- /dev/null -+++ b/kernel/sched/pds.c -@@ -0,0 +1,6619 @@ -+/* -+ * kernel/sched/pds.c, was kernel/sched.c -+ * -+ * PDS-mq Core kernel scheduler code and related syscalls -+ * -+ * Copyright (C) 1991-2002 Linus Torvalds -+ * -+ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes -+ * a whole lot of those previous things. 
-+ * 2017-09-06 Priority and Deadline based Skip list multiple queue kernel -+ * scheduler by Alfred Chen. -+ */ -+#include "pds_sched.h" -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#include -+ -+#include "../workqueue_internal.h" -+#include "../../fs/io-wq.h" -+#include "../smpboot.h" -+ -+#include "pelt.h" -+#include "smp.h" -+ -+#define CREATE_TRACE_POINTS -+#include -+ -+ -+#define rt_prio(prio) ((prio) < MAX_RT_PRIO) -+#define rt_task(p) rt_prio((p)->prio) -+#define rt_policy(policy) ((policy) == SCHED_FIFO || \ -+ (policy) == SCHED_RR || \ -+ (policy) == SCHED_ISO) -+#define task_has_rt_policy(p) (rt_policy((p)->policy)) -+ -+#define idle_policy(policy) ((policy) == SCHED_IDLE) -+#define idleprio_task(p) unlikely(idle_policy((p)->policy)) -+ -+#define STOP_PRIO (MAX_RT_PRIO - 1) -+ -+/* -+ * Some helpers for converting to/from various scales. Use shifts to get -+ * approximate multiples of ten for less overhead. 
-+ */ -+#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) -+#define JIFFY_NS (1000000000 / HZ) -+#define HALF_JIFFY_NS (1000000000 / HZ / 2) -+#define HALF_JIFFY_US (1000000 / HZ / 2) -+#define MS_TO_NS(TIME) ((TIME) << 20) -+#define MS_TO_US(TIME) ((TIME) << 10) -+#define NS_TO_MS(TIME) ((TIME) >> 20) -+#define NS_TO_US(TIME) ((TIME) >> 10) -+#define US_TO_NS(TIME) ((TIME) << 10) -+ -+#define RESCHED_US (100) /* Reschedule if less than this many μs left */ -+ -+enum { -+ BASE_CPU_AFFINITY_CHK_LEVEL = 1, -+#ifdef CONFIG_SCHED_SMT -+ SMT_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, -+#endif -+#ifdef CONFIG_SCHED_MC -+ MC_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, -+#endif -+ NR_CPU_AFFINITY_CHK_LEVEL -+}; -+ -+static inline void print_scheduler_version(void) -+{ -+ printk(KERN_INFO "pds: PDS-mq CPU Scheduler 0.99o by Alfred Chen and kept alive artificially by Tk-Glitch.\n"); -+} -+ -+/* -+ * This is the time all tasks within the same priority round robin. -+ * Value is in ms and set to a minimum of 6ms. Scales with number of cpus. -+ * Tunable via /proc interface. 
-+ */ -+#define SCHED_DEFAULT_RR (4) -+int rr_interval __read_mostly = SCHED_DEFAULT_RR; -+ -+static int __init rr_interval_set(char *str) -+{ -+ u32 rr; -+ -+ pr_info("rr_interval: "); -+ if (kstrtouint(str, 0, &rr)) { -+ pr_cont("using default of %u, unable to parse %s\n", -+ rr_interval, str); -+ return 1; -+ } -+ -+ rr_interval = rr; -+ pr_cont("%d\n", rr_interval); -+ -+ return 1; -+} -+__setup("rr_interval=", rr_interval_set); -+ -+ -+static const u64 sched_prio2deadline[NICE_WIDTH] = { -+/* -20 */ 6291456, 6920601, 7612661, 8373927, 9211319, -+/* -15 */ 10132450, 11145695, 12260264, 13486290, 14834919, -+/* -10 */ 16318410, 17950251, 19745276, 21719803, 23891783, -+/* -5 */ 26280961, 28909057, 31799962, 34979958, 38477953, -+/* 0 */ 42325748, 46558322, 51214154, 56335569, 61969125, -+/* 5 */ 68166037, 74982640, 82480904, 90728994, 99801893, -+/* 10 */ 109782082, 120760290, 132836319, 146119950, 160731945, -+/* 15 */ 176805139, 194485652, 213934217, 235327638, 258860401 -+}; -+ -+/** -+ * sched_yield_type - Choose what sort of yield sched_yield will perform. -+ * 0: No yield. -+ * 1: Yield only to better priority/deadline tasks. (default) -+ * 2: Expire timeslice and recalculate deadline. -+ */ -+int sched_yield_type __read_mostly = 1; -+ -+/* -+ * The quota handed out to tasks of all priority levels when refilling their -+ * time_slice. 
-+ */
-+static inline int timeslice(void)
-+{
-+	/* rr_interval is in ms; MS_TO_US() is a shift by 10, not an exact x1000. */
-+	return MS_TO_US(rr_interval);
-+}
-+
-+#ifdef CONFIG_SMP
-+/*
-+ * Run queue levels. They index sched_rq_queued_masks[] and
-+ * sched_rq_pending_masks[] and the matching bitmaps declared below.
-+ */
-+enum {
-+SCHED_RQ_EMPTY		=	0,
-+SCHED_RQ_IDLE,
-+SCHED_RQ_NORMAL_0,
-+SCHED_RQ_NORMAL_1,
-+SCHED_RQ_NORMAL_2,
-+SCHED_RQ_NORMAL_3,
-+SCHED_RQ_NORMAL_4,
-+SCHED_RQ_NORMAL_5,
-+SCHED_RQ_NORMAL_6,
-+SCHED_RQ_NORMAL_7,
-+SCHED_RQ_ISO,
-+SCHED_RQ_RT,
-+NR_SCHED_RQ_QUEUED_LEVEL
-+};
-+
-+/* One cpumask per level, plus a bitmap with one bit per level. */
-+static cpumask_t sched_rq_queued_masks[NR_SCHED_RQ_QUEUED_LEVEL]
-+____cacheline_aligned_in_smp;
-+
-+static DECLARE_BITMAP(sched_rq_queued_masks_bitmap, NR_SCHED_RQ_QUEUED_LEVEL)
-+____cacheline_aligned_in_smp;
-+
-+static cpumask_t sched_rq_pending_masks[NR_SCHED_RQ_QUEUED_LEVEL]
-+____cacheline_aligned_in_smp;
-+
-+static DECLARE_BITMAP(sched_rq_pending_masks_bitmap, NR_SCHED_RQ_QUEUED_LEVEL)
-+____cacheline_aligned_in_smp;
-+
-+DEFINE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_CHK_LEVEL], sched_cpu_affinity_chk_masks);
-+DEFINE_PER_CPU(cpumask_t *, sched_cpu_llc_start_mask);
-+DEFINE_PER_CPU(cpumask_t *, sched_cpu_affinity_chk_end_masks);
-+
-+#ifdef CONFIG_SCHED_SMT
-+DEFINE_PER_CPU(int, sched_sibling_cpu);
-+DEFINE_STATIC_KEY_FALSE(sched_smt_present);
-+EXPORT_SYMBOL_GPL(sched_smt_present);
-+
-+static cpumask_t sched_cpu_sg_idle_mask ____cacheline_aligned_in_smp;
-+
-+#ifdef CONFIG_SMT_NICE
-+/*
-+ * Preemptible sibling group mask
-+ * CPUs whose sibling cpus are all running at PRIO_LIMIT or IDLE_PRIO
-+ */
-+static cpumask_t sched_cpu_psg_mask ____cacheline_aligned_in_smp;
-+/*
-+ * SMT suppressed mask
-+ * When a cpu is running task with NORMAL/ISO/RT policy, its sibling cpu
-+ * will be suppressed from running IDLE priority tasks.
-+ */ -+static cpumask_t sched_smt_supressed_mask ____cacheline_aligned_in_smp; -+#endif /* CONFIG_SMT_NICE */ -+#endif -+ -+static int sched_rq_prio[NR_CPUS] ____cacheline_aligned; -+ -+/* -+ * Keep a unique ID per domain (we use the first CPUs number in the cpumask of -+ * the domain), this allows us to quickly tell if two cpus are in the same cache -+ * domain, see cpus_share_cache(). -+ */ -+DEFINE_PER_CPU(int, sd_llc_id); -+ -+int __weak arch_sd_sibling_asym_packing(void) -+{ -+ return 0*SD_ASYM_PACKING; -+} -+#else -+struct rq *uprq; -+#endif /* CONFIG_SMP */ -+ -+static DEFINE_MUTEX(sched_hotcpu_mutex); -+ -+DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -+ -+#ifndef prepare_arch_switch -+# define prepare_arch_switch(next) do { } while (0) -+#endif -+#ifndef finish_arch_post_lock_switch -+# define finish_arch_post_lock_switch() do { } while (0) -+#endif -+ -+/* -+ * Context: p->pi_lock -+ */ -+static inline struct rq -+*__task_access_lock(struct task_struct *p, raw_spinlock_t **plock) -+{ -+ struct rq *rq; -+ for (;;) { -+ rq = task_rq(p); -+ if (p->on_cpu || task_on_rq_queued(p)) { -+ raw_spin_lock(&rq->lock); -+ if (likely((p->on_cpu || task_on_rq_queued(p)) -+ && rq == task_rq(p))) { -+ *plock = &rq->lock; -+ return rq; -+ } -+ raw_spin_unlock(&rq->lock); -+ } else if (task_on_rq_migrating(p)) { -+ do { -+ cpu_relax(); -+ } while (unlikely(task_on_rq_migrating(p))); -+ } else { -+ *plock = NULL; -+ return rq; -+ } -+ } -+} -+ -+static inline void -+__task_access_unlock(struct task_struct *p, raw_spinlock_t *lock) -+{ -+ if (NULL != lock) -+ raw_spin_unlock(lock); -+} -+ -+static inline struct rq -+*task_access_lock_irqsave(struct task_struct *p, raw_spinlock_t **plock, -+ unsigned long *flags) -+{ -+ struct rq *rq; -+ for (;;) { -+ rq = task_rq(p); -+ if (p->on_cpu || task_on_rq_queued(p)) { -+ raw_spin_lock_irqsave(&rq->lock, *flags); -+ if (likely((p->on_cpu || task_on_rq_queued(p)) -+ && rq == task_rq(p))) { -+ *plock = &rq->lock; -+ return rq; 
-+ } -+ raw_spin_unlock_irqrestore(&rq->lock, *flags); -+ } else if (task_on_rq_migrating(p)) { -+ do { -+ cpu_relax(); -+ } while (unlikely(task_on_rq_migrating(p))); -+ } else { -+ raw_spin_lock_irqsave(&p->pi_lock, *flags); -+ if (likely(!p->on_cpu && !p->on_rq && -+ rq == task_rq(p))) { -+ *plock = &p->pi_lock; -+ return rq; -+ } -+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags); -+ } -+ } -+} -+ -+static inline void -+task_access_unlock_irqrestore(struct task_struct *p, raw_spinlock_t *lock, -+ unsigned long *flags) -+{ -+ raw_spin_unlock_irqrestore(lock, *flags); -+} -+ -+/* -+ * __task_rq_lock - lock the rq @p resides on. -+ */ -+struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ lockdep_assert_held(&p->pi_lock); -+ -+ for (;;) { -+ rq = task_rq(p); -+ raw_spin_lock(&rq->lock); -+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) -+ return rq; -+ raw_spin_unlock(&rq->lock); -+ -+ while (unlikely(task_on_rq_migrating(p))) -+ cpu_relax(); -+ } -+} -+ -+/* -+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. -+ */ -+struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(p->pi_lock) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ for (;;) { -+ raw_spin_lock_irqsave(&p->pi_lock, rf->flags); -+ rq = task_rq(p); -+ raw_spin_lock(&rq->lock); -+ /* -+ * move_queued_task() task_rq_lock() -+ * -+ * ACQUIRE (rq->lock) -+ * [S] ->on_rq = MIGRATING [L] rq = task_rq() -+ * WMB (__set_task_cpu()) ACQUIRE (rq->lock); -+ * [S] ->cpu = new_cpu [L] task_rq() -+ * [L] ->on_rq -+ * RELEASE (rq->lock) -+ * -+ * If we observe the old CPU in task_rq_lock(), the acquire of -+ * the old rq->lock will fully serialize against the stores. -+ * -+ * If we observe the new CPU in task_rq_lock(), the address -+ * dependency headed by '[L] rq = task_rq()' and the acquire -+ * will pair with the WMB to ensure we then also see migrating. 
-+ */ -+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { -+ return rq; -+ } -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); -+ -+ while (unlikely(task_on_rq_migrating(p))) -+ cpu_relax(); -+ } -+} -+ -+/* -+ * RQ-clock updating methods: -+ */ -+ -+static void update_rq_clock_task(struct rq *rq, s64 delta) -+{ -+/* -+ * In theory, the compile should just see 0 here, and optimize out the call -+ * to sched_rt_avg_update. But I don't trust it... -+ */ -+ s64 __maybe_unused steal = 0, irq_delta = 0; -+ -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; -+ -+ /* -+ * Since irq_time is only updated on {soft,}irq_exit, we might run into -+ * this case when a previous update_rq_clock() happened inside a -+ * {soft,}irq region. -+ * -+ * When this happens, we stop ->clock_task and only update the -+ * prev_irq_time stamp to account for the part that fit, so that a next -+ * update will consume the rest. This ensures ->clock_task is -+ * monotonic. -+ * -+ * It does however cause some slight miss-attribution of {soft,}irq -+ * time, a more accurate solution would be to update the irq_time using -+ * the current rq->clock timestamp, except that would require using -+ * atomic ops. 
-+ */ -+ if (irq_delta > delta) -+ irq_delta = delta; -+ -+ rq->prev_irq_time += irq_delta; -+ delta -= irq_delta; -+#endif -+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING -+ if (static_key_false((¶virt_steal_rq_enabled))) { -+ steal = paravirt_steal_clock(cpu_of(rq)); -+ steal -= rq->prev_steal_time_rq; -+ -+ if (unlikely(steal > delta)) -+ steal = delta; -+ -+ rq->prev_steal_time_rq += steal; -+ -+ delta -= steal; -+ } -+#endif -+ -+ rq->clock_task += delta; -+ -+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ -+ if ((irq_delta + steal)) -+ update_irq_load_avg(rq, irq_delta + steal); -+#endif -+} -+ -+static inline void update_rq_clock(struct rq *rq) -+{ -+ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; -+ -+ if (unlikely(delta <= 0)) -+ return; -+ rq->clock += delta; -+ update_rq_clock_task(rq, delta); -+} -+ -+static inline void update_task_priodl(struct task_struct *p) -+{ -+ p->priodl = (((u64) (p->prio))<<56) | ((p->deadline)>>8); -+} -+ -+/* -+ * Deadline is "now" in niffies + (offset by priority). Setting the deadline -+ * is the key to everything. It distributes CPU fairly amongst tasks of the -+ * same nice value, it proportions CPU according to nice level, it means the -+ * task that last woke up the longest ago has the earliest deadline, thus -+ * ensuring that interactive tasks get low latency on wake up. The CPU -+ * proportion works out to the square of the virtual deadline difference, so -+ * this equation will give nice 19 3% CPU compared to nice 0. -+ */ -+static inline u64 task_deadline_diff(const struct task_struct *p) -+{ -+ return sched_prio2deadline[TASK_USER_PRIO(p)]; -+} -+ -+static inline u64 static_deadline_diff(int static_prio) -+{ -+ return sched_prio2deadline[USER_PRIO(static_prio)]; -+} -+ -+/* -+ * The time_slice is only refilled when it is empty and that is when we set a -+ * new deadline for non-rt tasks. 
-+ */ -+static inline void time_slice_expired(struct task_struct *p, struct rq *rq) -+{ -+ p->time_slice = timeslice(); -+ if (p->prio >= NORMAL_PRIO) -+ p->deadline = rq->clock + task_deadline_diff(p); -+ -+ update_task_priodl(p); -+} -+ -+static inline struct task_struct *rq_first_queued_task(struct rq *rq) -+{ -+ struct skiplist_node *node = rq->sl_header.next[0]; -+ -+ if (node == &rq->sl_header) -+ return rq->idle; -+ -+ return skiplist_entry(node, struct task_struct, sl_node); -+} -+ -+static inline struct task_struct *rq_second_queued_task(struct rq *rq) -+{ -+ struct skiplist_node *node = rq->sl_header.next[0]->next[0]; -+ -+ if (node == &rq->sl_header) -+ return rq->idle; -+ -+ return skiplist_entry(node, struct task_struct, sl_node); -+} -+ -+static inline int is_second_in_rq(struct task_struct *p, struct rq *rq) -+{ -+ return (p->sl_node.prev[0]->prev[0] == &rq->sl_header); -+} -+ -+static const int task_dl_hash_tbl[] = { -+/* 0 4 8 12 */ -+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, -+/* 16 20 24 28 */ -+ 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 5, 6, 7 -+}; -+ -+static inline int -+task_deadline_level(const struct task_struct *p, const struct rq *rq) -+{ -+ u64 delta = (rq->clock + sched_prio2deadline[39] - p->deadline) >> 23; -+ -+ delta = min((size_t)delta, ARRAY_SIZE(task_dl_hash_tbl) - 1); -+ return task_dl_hash_tbl[delta]; -+} -+ -+/* -+ * cmpxchg based fetch_or, macro so it works for different integer types -+ */ -+#define fetch_or(ptr, mask) \ -+ ({ \ -+ typeof(ptr) _ptr = (ptr); \ -+ typeof(mask) _mask = (mask); \ -+ typeof(*_ptr) _old, _val = *_ptr; \ -+ \ -+ for (;;) { \ -+ _old = cmpxchg(_ptr, _val, _val | _mask); \ -+ if (_old == _val) \ -+ break; \ -+ _val = _old; \ -+ } \ -+ _old; \ -+}) -+ -+#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) -+/* -+ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, -+ * this avoids any races wrt polling state changes and thereby avoids -+ * spurious IPIs. 
-+ */
-+static bool set_nr_and_not_polling(struct task_struct *p)
-+{
-+	struct thread_info *ti = task_thread_info(p);
-+	return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
-+}
-+
-+/*
-+ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
-+ *
-+ * If this returns true, then the idle task promises to call
-+ * flush_smp_call_function_from_idle() and reschedule soon.
-+ */
-+static bool set_nr_if_polling(struct task_struct *p)
-+{
-+	struct thread_info *ti = task_thread_info(p);
-+	typeof(ti->flags) old, val = READ_ONCE(ti->flags);
-+
-+	for (;;) {
-+		if (!(val & _TIF_POLLING_NRFLAG))
-+			return false;
-+		if (val & _TIF_NEED_RESCHED)
-+			return true;
-+		/* Retry with the freshly observed value if the flags changed. */
-+		old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
-+		if (old == val)
-+			break;
-+		val = old;
-+	}
-+	return true;
-+}
-+
-+#else
-+/* Without polling support: set the flag and always report "not polling". */
-+static bool set_nr_and_not_polling(struct task_struct *p)
-+{
-+	set_tsk_need_resched(p);
-+	return true;
-+}
-+
-+#ifdef CONFIG_SMP
-+static bool set_nr_if_polling(struct task_struct *p)
-+{
-+	return false;
-+}
-+#endif
-+#endif
-+
-+#ifdef CONFIG_SMP
-+#ifdef CONFIG_SMT_NICE
-+/*
-+ * Ask @cpu to reschedule, but only while its current task has priority
-+ * @priority. The rq lock is only try-locked; on contention nothing is done.
-+ */
-+static void resched_cpu_if_curr_is(int cpu, int priority)
-+{
-+	struct rq *rq = cpu_rq(cpu);
-+
-+	rcu_read_lock();
-+
-+	if (rcu_dereference(rq->curr)->prio != priority)
-+		goto out;
-+
-+	if (set_nr_if_polling(rq->idle)) {
-+		trace_sched_wake_idle_without_ipi(cpu);
-+	} else {
-+		if (!do_raw_spin_trylock(&rq->lock))
-+			goto out;
-+		spin_acquire(&rq->lock.dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_);
-+
-+		/* Re-check under the lock before sending the IPI. */
-+		if (priority == rq->curr->prio)
-+			smp_send_reschedule(cpu);
-+		/* Else CPU is not idle, do nothing here */
-+
-+		spin_release(&rq->lock.dep_map, _RET_IP_);
-+		do_raw_spin_unlock(&rq->lock);
-+	}
-+
-+out:
-+	rcu_read_unlock();
-+}
-+#endif /* CONFIG_SMT_NICE */
-+
-+/*
-+ * Move @cpu from level *@plevel to @level in @cpumasks and keep @bitmap in
-+ * step: the old level's bit is cleared once its mask is empty, the new
-+ * level's bit is set. Returns false (and changes nothing) when the level is
-+ * unchanged, true otherwise.
-+ */
-+static inline bool
-+__update_cpumasks_bitmap(int cpu, unsigned long *plevel, unsigned long level,
-+			 cpumask_t cpumasks[], unsigned long bitmap[])
-+{
-+	if (*plevel == level)
-+		return false;
-+
-+	cpumask_clear_cpu(cpu, cpumasks + *plevel);
-+	if (cpumask_empty(cpumasks + *plevel))
-+		clear_bit(*plevel, bitmap);
-+	cpumask_set_cpu(cpu, cpumasks + level);
-+	set_bit(level, bitmap);
-+
-+	*plevel = level;
-+
-+	return true;
-+}
-+
-+/* Translate @p's priority (and, for NORMAL_PRIO, its deadline) to a level. */
-+static inline int
-+task_running_policy_level(const struct task_struct *p, const struct rq *rq)
-+{
-+	int prio = p->prio;
-+
-+	if (NORMAL_PRIO == prio)
-+		return SCHED_RQ_NORMAL_0 + task_deadline_level(p, rq);
-+
-+	if (ISO_PRIO == prio)
-+		return SCHED_RQ_ISO;
-+	if (prio < MAX_RT_PRIO)
-+		return SCHED_RQ_RT;
-+	return PRIO_LIMIT - prio;
-+}
-+
-+/* Refresh the queued level only when the first queued task is NORMAL_PRIO. */
-+static inline void update_sched_rq_queued_masks_normal(struct rq *rq)
-+{
-+	struct task_struct *p = rq_first_queued_task(rq);
-+
-+	if (p->prio != NORMAL_PRIO)
-+		return;
-+
-+	__update_cpumasks_bitmap(cpu_of(rq), &rq->queued_level,
-+				 task_running_policy_level(p, rq),
-+				 &sched_rq_queued_masks[0],
-+				 &sched_rq_queued_masks_bitmap[0]);
-+}
-+
-+#ifdef CONFIG_SMT_NICE
-+/*
-+ * Set @cpu's SMT group in sched_cpu_psg_mask when every cpu of the group is
-+ * at level EMPTY or IDLE, otherwise clear the group from the mask.
-+ */
-+static inline void update_sched_cpu_psg_mask(const int cpu)
-+{
-+	cpumask_t tmp;
-+
-+	cpumask_or(&tmp, &sched_rq_queued_masks[SCHED_RQ_EMPTY],
-+		   &sched_rq_queued_masks[SCHED_RQ_IDLE]);
-+	cpumask_and(&tmp, &tmp, cpu_smt_mask(cpu));
-+	if (cpumask_equal(&tmp, cpu_smt_mask(cpu)))
-+		cpumask_or(&sched_cpu_psg_mask, &sched_cpu_psg_mask,
-+			   cpu_smt_mask(cpu));
-+	else
-+		cpumask_andnot(&sched_cpu_psg_mask, &sched_cpu_psg_mask,
-+			       cpu_smt_mask(cpu));
-+}
-+#endif
-+
-+/*
-+ * Recompute the queued level of @rq from its first queued task, record that
-+ * task's priority in sched_rq_prio[], and, when the level changed, update
-+ * the SMT bookkeeping masks.
-+ */
-+static inline void update_sched_rq_queued_masks(struct rq *rq)
-+{
-+	int cpu = cpu_of(rq);
-+	struct task_struct *p = rq_first_queued_task(rq);
-+	unsigned long level;
-+#ifdef CONFIG_SCHED_SMT
-+	unsigned long last_level = rq->queued_level;
-+#endif
-+
-+	level = task_running_policy_level(p, rq);
-+	sched_rq_prio[cpu] = p->prio;
-+
-+	if (!__update_cpumasks_bitmap(cpu, &rq->queued_level, level,
-+				      &sched_rq_queued_masks[0],
-+				      &sched_rq_queued_masks_bitmap[0]))
-+		return;
-+
-+#ifdef CONFIG_SCHED_SMT
-+	/* A cpu recorded as its own sibling needs no SMT bookkeeping. */
-+	if (cpu == per_cpu(sched_sibling_cpu, cpu))
-+		return;
-+
-+	if (SCHED_RQ_EMPTY == last_level) {
-+		cpumask_andnot(&sched_cpu_sg_idle_mask, &sched_cpu_sg_idle_mask,
-+			       cpu_smt_mask(cpu));
-+	} else if (SCHED_RQ_EMPTY == level) {
-+		cpumask_t tmp;
-+
-+		cpumask_and(&tmp, cpu_smt_mask(cpu),
-+			    &sched_rq_queued_masks[SCHED_RQ_EMPTY]);
-+		if (cpumask_equal(&tmp, cpu_smt_mask(cpu)))
-+			cpumask_or(&sched_cpu_sg_idle_mask, cpu_smt_mask(cpu),
-+				   &sched_cpu_sg_idle_mask);
-+	}
-+
-+#ifdef CONFIG_SMT_NICE
-+	if (level <= SCHED_RQ_IDLE && last_level > SCHED_RQ_IDLE) {
-+		cpumask_clear_cpu(per_cpu(sched_sibling_cpu, cpu),
-+				  &sched_smt_supressed_mask);
-+		update_sched_cpu_psg_mask(cpu);
-+		resched_cpu_if_curr_is(per_cpu(sched_sibling_cpu, cpu), PRIO_LIMIT);
-+	} else if (last_level <= SCHED_RQ_IDLE && level > SCHED_RQ_IDLE) {
-+		cpumask_set_cpu(per_cpu(sched_sibling_cpu, cpu),
-+				&sched_smt_supressed_mask);
-+		update_sched_cpu_psg_mask(cpu);
-+		resched_cpu_if_curr_is(per_cpu(sched_sibling_cpu, cpu), IDLE_PRIO);
-+	}
-+#endif /* CONFIG_SMT_NICE */
-+#endif
-+}
-+
-+/* Recompute the pending level of @rq from its second queued task. */
-+static inline void update_sched_rq_pending_masks(struct rq *rq)
-+{
-+	unsigned long level;
-+	struct task_struct *p = rq_second_queued_task(rq);
-+
-+	level = task_running_policy_level(p, rq);
-+
-+	__update_cpumasks_bitmap(cpu_of(rq), &rq->pending_level, level,
-+				 &sched_rq_pending_masks[0],
-+				 &sched_rq_pending_masks_bitmap[0]);
-+}
-+
-+#else /* CONFIG_SMP */
-+static inline void update_sched_rq_queued_masks(struct rq *rq) {}
-+static inline void update_sched_rq_queued_masks_normal(struct rq *rq) {}
-+static inline void update_sched_rq_pending_masks(struct rq *rq) {}
-+#endif
-+
-+#ifdef CONFIG_NO_HZ_FULL
-+/*
-+ * Tick may be needed by tasks in the runqueue depending on their policy and
-+ * requirements. If tick is needed, lets send the target an IPI to kick it out
-+ * of nohz mode if necessary.
-+ */ -+static inline void sched_update_tick_dependency(struct rq *rq) -+{ -+ int cpu; -+ -+ if (!tick_nohz_full_enabled()) -+ return; -+ -+ cpu = cpu_of(rq); -+ -+ if (!tick_nohz_full_cpu(cpu)) -+ return; -+ -+ if (rq->nr_running < 2) -+ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); -+ else -+ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); -+} -+#else /* !CONFIG_NO_HZ_FULL */ -+static inline void sched_update_tick_dependency(struct rq *rq) { } -+#endif -+ -+/* -+ * Removing from the runqueue. Deleting a task from the skip list is done -+ * via the stored node reference in the task struct and does not require a full -+ * look up. Thus it occurs in O(k) time where k is the "level" of the list the -+ * task was stored at - usually < 4, max 16. -+ * -+ * Context: rq->lock -+ */ -+static inline void dequeue_task(struct task_struct *p, struct rq *rq, int flags) -+{ -+ lockdep_assert_held(&rq->lock); -+ -+ WARN_ONCE(task_rq(p) != rq, "pds: dequeue task reside on cpu%d from cpu%d\n", -+ task_cpu(p), cpu_of(rq)); -+ if (skiplist_del_init(&rq->sl_header, &p->sl_node)) { -+ update_sched_rq_queued_masks(rq); -+ update_sched_rq_pending_masks(rq); -+ } else if (is_second_in_rq(p, rq)) -+ update_sched_rq_pending_masks(rq); -+ rq->nr_running--; -+ -+ sched_update_tick_dependency(rq); -+ psi_dequeue(p, flags & DEQUEUE_SLEEP); -+ -+ sched_info_dequeued(rq, p); -+} -+ -+/* -+ * To determine if it's safe for a task of SCHED_IDLE to actually run as -+ * an idle task, we ensure none of the following conditions are met. -+ */ -+static inline bool idleprio_suitable(struct task_struct *p) -+{ -+ return (!freezing(p) && !signal_pending(p) && -+ !(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING))); -+} -+ -+/* -+ * pds_skiplist_random_level -- Returns a pseudo-random level number for skip -+ * list node which is used in PDS run queue. 
-+ * -+ * In current implementation, based on testing, the first 8 bits in microseconds -+ * of niffies are suitable for random level population. -+ * find_first_bit() is used to satisfy p = 0.5 between each levels, and there -+ * should be platform hardware supported instruction(known as ctz/clz) to speed -+ * up this function. -+ * The skiplist level for a task is populated when task is created and doesn't -+ * change in task's life time. When task is being inserted into run queue, this -+ * skiplist level is set to task's sl_node->level, the skiplist insert function -+ * may change it based on current level of the skip lsit. -+ */ -+static inline int pds_skiplist_random_level(const struct task_struct *p) -+{ -+ long unsigned int randseed; -+ -+ /* -+ * 1. Some architectures don't have better than microsecond resolution -+ * so mask out ~microseconds as a factor of the random seed for skiplist -+ * insertion. -+ * 2. Use address of task structure pointer as another factor of the -+ * random seed for task burst forking scenario. -+ */ -+ randseed = (task_rq(p)->clock ^ (long unsigned int)p) >> 10; -+ -+ return find_first_bit(&randseed, NUM_SKIPLIST_LEVEL - 1); -+} -+ -+/** -+ * pds_skiplist_task_search -- search function used in PDS run queue skip list -+ * node insert operation. -+ * @it: iterator pointer to the node in the skip list -+ * @node: pointer to the skiplist_node to be inserted -+ * -+ * Returns true if key of @it is less or equal to key value of @node, otherwise -+ * false. -+ */ -+static inline bool -+pds_skiplist_task_search(struct skiplist_node *it, struct skiplist_node *node) -+{ -+ return (skiplist_entry(it, struct task_struct, sl_node)->priodl <= -+ skiplist_entry(node, struct task_struct, sl_node)->priodl); -+} -+ -+/* -+ * Define the skip list insert function for PDS -+ */ -+DEFINE_SKIPLIST_INSERT_FUNC(pds_skiplist_insert, pds_skiplist_task_search); -+ -+/* -+ * Adding task to the runqueue. 
-+ *
-+ * Context: rq->lock
-+ */
-+static inline void enqueue_task(struct task_struct *p, struct rq *rq, int flags)
-+{
-+	lockdep_assert_held(&rq->lock);
-+
-+	WARN_ONCE(task_rq(p) != rq, "pds: enqueue task reside on cpu%d to cpu%d\n",
-+		  task_cpu(p), cpu_of(rq));
-+
-+	/* Start from the task's stored level; the insert may adjust it. */
-+	p->sl_node.level = p->sl_level;
-+	if (pds_skiplist_insert(&rq->sl_header, &p->sl_node)) {
-+		update_sched_rq_queued_masks(rq);
-+		update_sched_rq_pending_masks(rq);
-+	} else if (is_second_in_rq(p, rq))
-+		update_sched_rq_pending_masks(rq);
-+	rq->nr_running++;
-+
-+	sched_update_tick_dependency(rq);
-+
-+	sched_info_queued(rq, p);
-+	psi_enqueue(p, flags);
-+
-+	/*
-+	 * If in_iowait is set, the code below may not trigger any cpufreq
-+	 * utilization updates, so do it here explicitly with the IOWAIT flag
-+	 * passed.
-+	 */
-+	if (p->in_iowait)
-+		cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);
-+}
-+
-+/*
-+ * Remove @p from @rq's skip list and insert it again, then refresh the
-+ * queued/pending masks if @p was, or now is, in the first or second position.
-+ *
-+ * Context: rq->lock
-+ */
-+static inline void requeue_task(struct task_struct *p, struct rq *rq)
-+{
-+	bool b_first, b_second;
-+
-+	lockdep_assert_held(&rq->lock);
-+
-+	WARN_ONCE(task_rq(p) != rq, "pds: cpu[%d] requeue task reside on cpu%d\n",
-+		  cpu_of(rq), task_cpu(p));
-+
-+	b_first = skiplist_del_init(&rq->sl_header, &p->sl_node);
-+	/* NOTE(review): sampled after the delete above - confirm this is intended. */
-+	b_second = is_second_in_rq(p, rq);
-+
-+	p->sl_node.level = p->sl_level;
-+	if (pds_skiplist_insert(&rq->sl_header, &p->sl_node) || b_first) {
-+		update_sched_rq_queued_masks(rq);
-+		update_sched_rq_pending_masks(rq);
-+	} else if (is_second_in_rq(p, rq) || b_second)
-+		update_sched_rq_pending_masks(rq);
-+}
-+
-+/*
-+ * resched_curr - mark rq's current task 'to be rescheduled now'.
-+ *
-+ * On UP this means the setting of the need_resched flag, on SMP it
-+ * might also involve a cross-CPU call to trigger the scheduler on
-+ * the target CPU.
-+ */ -+void resched_curr(struct rq *rq) -+{ -+ struct task_struct *curr = rq->curr; -+ int cpu; -+ -+ lockdep_assert_held(&rq->lock); -+ -+ if (test_tsk_need_resched(curr)) -+ return; -+ -+ cpu = cpu_of(rq); -+ if (cpu == smp_processor_id()) { -+ set_tsk_need_resched(curr); -+ set_preempt_need_resched(); -+ return; -+ } -+ -+ if (set_nr_and_not_polling(curr)) -+ smp_send_reschedule(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); -+} -+ -+static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) -+{ -+ struct task_struct *curr = rq->curr; -+ -+ if (curr->prio == PRIO_LIMIT) -+ resched_curr(rq); -+ -+ if (task_running_idle(p)) -+ return; -+ -+ if (p->priodl < curr->priodl) -+ resched_curr(rq); -+} -+ -+#ifdef CONFIG_SCHED_HRTICK -+/* -+ * Use HR-timers to deliver accurate preemption points. -+ */ -+ -+static void hrtick_clear(struct rq *rq) -+{ -+ if (hrtimer_active(&rq->hrtick_timer)) -+ hrtimer_cancel(&rq->hrtick_timer); -+} -+ -+/* -+ * High-resolution timer tick. -+ * Runs from hardirq context with interrupts disabled. 
-+ */
-+static enum hrtimer_restart hrtick(struct hrtimer *timer)
-+{
-+	struct rq *rq = container_of(timer, struct rq, hrtick_timer);
-+	struct task_struct *p;
-+
-+	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
-+
-+	raw_spin_lock(&rq->lock);
-+	p = rq->curr;
-+	/* Expire the current task's slice and ask for a reschedule. */
-+	p->time_slice = 0;
-+	resched_curr(rq);
-+	raw_spin_unlock(&rq->lock);
-+
-+	return HRTIMER_NORESTART;
-+}
-+
-+/*
-+ * Use hrtick when:
-+ *  - enabled by features
-+ *  - hrtimer is actually high res
-+ */
-+static inline int hrtick_enabled(struct rq *rq)
-+{
-+	/**
-+	 * PDS doesn't support sched_feat yet
-+	if (!sched_feat(HRTICK))
-+		return 0;
-+	*/
-+	if (!cpu_active(cpu_of(rq)))
-+		return 0;
-+	return hrtimer_is_hres_active(&rq->hrtick_timer);
-+}
-+
-+#ifdef CONFIG_SMP
-+
-+/* (Re)arm the rq's hrtick timer at its already programmed expiry. */
-+static void __hrtick_restart(struct rq *rq)
-+{
-+	struct hrtimer *timer = &rq->hrtick_timer;
-+
-+	hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD);
-+}
-+
-+/*
-+ * called from hardirq (IPI) context
-+ */
-+static void __hrtick_start(void *arg)
-+{
-+	struct rq *rq = arg;
-+
-+	raw_spin_lock(&rq->lock);
-+	__hrtick_restart(rq);
-+	raw_spin_unlock(&rq->lock);
-+}
-+
-+/*
-+ * Called to set the hrtick timer state.
-+ *
-+ * called with rq->lock held and irqs disabled
-+ */
-+void hrtick_start(struct rq *rq, u64 delay)
-+{
-+	struct hrtimer *timer = &rq->hrtick_timer;
-+	ktime_t time;
-+	s64 delta;
-+
-+	/*
-+	 * Don't schedule slices shorter than 10000ns, that just
-+	 * doesn't make sense and can cause timer DoS.
-+	 */
-+	delta = max_t(s64, delay, 10000LL);
-+	time = ktime_add_ns(timer->base->get_time(), delta);
-+
-+	hrtimer_set_expires(timer, time);
-+
-+	/* Arm directly on the local rq, otherwise via the rq's csd on its cpu. */
-+	if (rq == this_rq())
-+		__hrtick_restart(rq);
-+	else
-+		smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
-+}
-+
-+#else
-+/*
-+ * Called to set the hrtick timer state.
-+ * -+ * called with rq->lock held and irqs disabled -+ */ -+void hrtick_start(struct rq *rq, u64 delay) -+{ -+ /* -+ * Don't schedule slices shorter than 10000ns, that just -+ * doesn't make sense. Rely on vruntime for fairness. -+ */ -+ delay = max_t(u64, delay, 10000LL); -+ hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), -+ HRTIMER_MODE_REL_PINNED_HARD); -+} -+#endif /* CONFIG_SMP */ -+ -+static void hrtick_rq_init(struct rq *rq) -+{ -+#ifdef CONFIG_SMP -+ rq->hrtick_csd.flags = 0; -+ rq->hrtick_csd.func = __hrtick_start; -+ rq->hrtick_csd.info = rq; -+#endif -+ -+ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); -+ rq->hrtick_timer.function = hrtick; -+} -+ -+static inline int rq_dither(struct rq *rq) -+{ -+ if ((rq->clock - rq->last_tick > HALF_JIFFY_NS) || hrtick_enabled(rq)) -+ return 0; -+ -+ return HALF_JIFFY_NS; -+} -+ -+#else /* CONFIG_SCHED_HRTICK */ -+static inline int hrtick_enabled(struct rq *rq) -+{ -+ return 0; -+} -+ -+static inline void hrtick_clear(struct rq *rq) -+{ -+} -+ -+static inline void hrtick_rq_init(struct rq *rq) -+{ -+} -+ -+static inline int rq_dither(struct rq *rq) -+{ -+ return (rq->clock - rq->last_tick > HALF_JIFFY_NS)? 0:HALF_JIFFY_NS; -+} -+#endif /* CONFIG_SCHED_HRTICK */ -+ -+static inline int normal_prio(struct task_struct *p) -+{ -+ static const int policy_to_prio[] = { -+ NORMAL_PRIO, /* SCHED_NORMAL */ -+ 0, /* SCHED_FIFO */ -+ 0, /* SCHED_RR */ -+ IDLE_PRIO, /* SCHED_BATCH */ -+ ISO_PRIO, /* SCHED_ISO */ -+ IDLE_PRIO /* SCHED_IDLE */ -+ }; -+ -+ if (task_has_rt_policy(p)) -+ return MAX_RT_PRIO - 1 - p->rt_priority; -+ return policy_to_prio[p->policy]; -+} -+ -+/* -+ * Calculate the current priority, i.e. the priority -+ * taken into account by the scheduler. This value might -+ * be boosted by RT tasks as it will be RT if the task got -+ * RT-boosted. If not then it returns p->normal_prio. 
-+ */ -+static int effective_prio(struct task_struct *p) -+{ -+ p->normal_prio = normal_prio(p); -+ /* -+ * If we are RT tasks or we were boosted to RT priority, -+ * keep the priority unchanged. Otherwise, update priority -+ * to the normal priority: -+ */ -+ if (!rt_prio(p->prio)) -+ return p->normal_prio; -+ return p->prio; -+} -+ -+/* -+ * activate_task - move a task to the runqueue. -+ * -+ * Context: rq->lock -+ */ -+static void activate_task(struct task_struct *p, struct rq *rq) -+{ -+ if (task_contributes_to_load(p)) -+ rq->nr_uninterruptible--; -+ enqueue_task(p, rq, ENQUEUE_WAKEUP); -+ p->on_rq = 1; -+ cpufreq_update_this_cpu(rq, 0); -+} -+ -+/* -+ * deactivate_task - remove a task from the runqueue. -+ * -+ * Context: rq->lock -+ */ -+static inline void deactivate_task(struct task_struct *p, struct rq *rq) -+{ -+ if (task_contributes_to_load(p)) -+ rq->nr_uninterruptible++; -+ dequeue_task(p, rq, DEQUEUE_SLEEP); -+ p->on_rq = 0; -+ cpufreq_update_this_cpu(rq, 0); -+} -+ -+static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) -+{ -+#ifdef CONFIG_SMP -+ /* -+ * After ->cpu is set up to a new value, task_access_lock(p, ...) can be -+ * successfully executed on another CPU. We must ensure that updates of -+ * per-task data have been completed by this moment. -+ */ -+ smp_wmb(); -+ -+#ifdef CONFIG_THREAD_INFO_IN_TASK -+ WRITE_ONCE(p->cpu, cpu); -+#else -+ WRITE_ONCE(task_thread_info(p)->cpu, cpu); -+#endif -+#endif -+} -+ -+#ifdef CONFIG_SMP -+void set_task_cpu(struct task_struct *p, unsigned int new_cpu) -+{ -+#ifdef CONFIG_SCHED_DEBUG -+ /* -+ * We should never call set_task_cpu() on a blocked task, -+ * ttwu() will sort out the placement. -+ */ -+ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && -+ !p->on_rq); -+#ifdef CONFIG_LOCKDEP -+ /* -+ * The caller should hold either p->pi_lock or rq->lock, when changing -+ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. 
-+ * -+ * sched_move_task() holds both and thus holding either pins the cgroup, -+ * see task_group(). -+ */ -+ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || -+ lockdep_is_held(&task_rq(p)->lock))); -+#endif -+ /* -+ * Clearly, migrating tasks to offline CPUs is a fairly daft thing. -+ */ -+ WARN_ON_ONCE(!cpu_online(new_cpu)); -+#endif -+ if (task_cpu(p) == new_cpu) -+ return; -+ trace_sched_migrate_task(p, new_cpu); -+ rseq_migrate(p); -+ perf_event_task_migrate(p); -+ -+ __set_task_cpu(p, new_cpu); -+} -+ -+static inline bool is_per_cpu_kthread(struct task_struct *p) -+{ -+ return ((p->flags & PF_KTHREAD) && (1 == p->nr_cpus_allowed)); -+} -+ -+/* -+ * Per-CPU kthreads are allowed to run on !active && online CPUs, see -+ * __set_cpus_allowed_ptr() and select_fallback_rq(). -+ */ -+static inline bool is_cpu_allowed(struct task_struct *p, int cpu) -+{ -+ if (!cpumask_test_cpu(cpu, &p->cpus_mask)) -+ return false; -+ -+ if (is_per_cpu_kthread(p)) -+ return cpu_online(cpu); -+ -+ return cpu_active(cpu); -+} -+ -+/* -+ * This is how migration works: -+ * -+ * 1) we invoke migration_cpu_stop() on the target CPU using -+ * stop_one_cpu(). -+ * 2) stopper starts to run (implicitly forcing the migrated thread -+ * off the CPU) -+ * 3) it checks whether the migrated task is still in the wrong runqueue. -+ * 4) if it's in the wrong runqueue then the migration thread removes -+ * it and puts it into the right queue. -+ * 5) stopper completes and stop_one_cpu() returns and the migration -+ * is done. -+ */ -+ -+/* -+ * move_queued_task - move a queued task to new rq. -+ * -+ * Returns (locked) new rq. Old rq's lock is released. 
-+ */ -+static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int -+ new_cpu) -+{ -+ lockdep_assert_held(&rq->lock); -+ -+ p->on_rq = TASK_ON_RQ_MIGRATING; -+ dequeue_task(p, rq, 0); -+ set_task_cpu(p, new_cpu); -+ raw_spin_unlock(&rq->lock); -+ -+ rq = cpu_rq(new_cpu); -+ -+ raw_spin_lock(&rq->lock); -+ BUG_ON(task_cpu(p) != new_cpu); -+ enqueue_task(p, rq, 0); -+ p->on_rq = TASK_ON_RQ_QUEUED; -+ check_preempt_curr(rq, p); -+ -+ return rq; -+} -+ -+struct migration_arg { -+ struct task_struct *task; -+ int dest_cpu; -+}; -+ -+/* -+ * Move (not current) task off this CPU, onto the destination CPU. We're doing -+ * this because either it can't run here any more (set_cpus_allowed() -+ * away from this CPU, or CPU going down), or because we're -+ * attempting to rebalance this task on exec (sched_exec). -+ * -+ * So we race with normal scheduler movements, but that's OK, as long -+ * as the task is no longer on this CPU. -+ */ -+static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int -+ dest_cpu) -+{ -+ /* Affinity changed (again). */ -+ if (!is_cpu_allowed(p, dest_cpu)) -+ return rq; -+ -+ update_rq_clock(rq); -+ return move_queued_task(rq, p, dest_cpu); -+} -+ -+/* -+ * migration_cpu_stop - this will be executed by a highprio stopper thread -+ * and performs thread migration by bumping thread off CPU then -+ * 'pushing' onto another runqueue. -+ */ -+static int migration_cpu_stop(void *data) -+{ -+ struct migration_arg *arg = data; -+ struct task_struct *p = arg->task; -+ struct rq *rq = this_rq(); -+ -+ /* -+ * The original target CPU might have gone down and we might -+ * be on another CPU but it doesn't matter. -+ */ -+ local_irq_disable(); -+ -+ raw_spin_lock(&p->pi_lock); -+ raw_spin_lock(&rq->lock); -+ /* -+ * If task_rq(p) != rq, it cannot be migrated here, because we're -+ * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because -+ * we're holding p->pi_lock. 
-+ */ -+ if (task_rq(p) == rq) -+ if (task_on_rq_queued(p)) -+ rq = __migrate_task(rq, p, arg->dest_cpu); -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock(&p->pi_lock); -+ -+ local_irq_enable(); -+ return 0; -+} -+ -+static inline void -+set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ cpumask_copy(&p->cpus_mask, new_mask); -+ p->nr_cpus_allowed = cpumask_weight(new_mask); -+} -+ -+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ set_cpus_allowed_common(p, new_mask); -+} -+#endif -+ -+/* Enter with rq lock held. We know p is on the local CPU */ -+static inline void __set_tsk_resched(struct task_struct *p) -+{ -+ set_tsk_need_resched(p); -+ set_preempt_need_resched(); -+} -+ -+/** -+ * task_curr - is this task currently executing on a CPU? -+ * @p: the task in question. -+ * -+ * Return: 1 if the task is currently executing. 0 otherwise. -+ */ -+inline int task_curr(const struct task_struct *p) -+{ -+ return cpu_curr(task_cpu(p)) == p; -+} -+ -+#ifdef CONFIG_SMP -+/* -+ * wait_task_inactive - wait for a thread to unschedule. -+ * -+ * If @match_state is nonzero, it's the @p->state value just checked and -+ * not expected to change. If it changes, i.e. @p might have woken up, -+ * then return zero. When we succeed in waiting for @p to be off its CPU, -+ * we return a positive number (its total switch count). If a second call -+ * a short while later returns the same number, the caller can be sure that -+ * @p has remained unscheduled the whole time. -+ * -+ * The caller must ensure that the task *will* unschedule sometime soon, -+ * else this function might spin for a *long* time. This function can't -+ * be called with interrupts off, or it may introduce deadlock with -+ * smp_call_function() if an IPI is sent by the same process we are -+ * waiting to become inactive. 
-+ */ -+unsigned long wait_task_inactive(struct task_struct *p, long match_state) -+{ -+ unsigned long flags; -+ bool running, on_rq; -+ unsigned long ncsw; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ -+ for (;;) { -+ rq = task_rq(p); -+ -+ /* -+ * If the task is actively running on another CPU -+ * still, just relax and busy-wait without holding -+ * any locks. -+ * -+ * NOTE! Since we don't hold any locks, it's not -+ * even sure that "rq" stays as the right runqueue! -+ * But we don't care, since this will return false -+ * if the runqueue has changed and p is actually now -+ * running somewhere else! -+ */ -+ while (task_running(p) && p == rq->curr) { -+ if (match_state && unlikely(p->state != match_state)) -+ return 0; -+ cpu_relax(); -+ } -+ -+ /* -+ * Ok, time to look more closely! We need the rq -+ * lock now, to be *sure*. If we're wrong, we'll -+ * just go back and repeat. -+ */ -+ task_access_lock_irqsave(p, &lock, &flags); -+ trace_sched_wait_task(p); -+ running = task_running(p); -+ on_rq = p->on_rq; -+ ncsw = 0; -+ if (!match_state || p->state == match_state) -+ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ -+ task_access_unlock_irqrestore(p, lock, &flags); -+ -+ /* -+ * If it changed from the expected state, bail out now. -+ */ -+ if (unlikely(!ncsw)) -+ break; -+ -+ /* -+ * Was it really running after all now that we -+ * checked with the proper locks actually held? -+ * -+ * Oops. Go back and try again.. -+ */ -+ if (unlikely(running)) { -+ cpu_relax(); -+ continue; -+ } -+ -+ /* -+ * It's not enough that it's not actively running, -+ * it must be off the runqueue _entirely_, and not -+ * preempted! -+ * -+ * So if it was still runnable (but just not actively -+ * running right now), it's preempted, and we should -+ * yield - it could be a while. -+ */ -+ if (unlikely(on_rq)) { -+ ktime_t to = NSEC_PER_SEC / HZ; -+ -+ set_current_state(TASK_UNINTERRUPTIBLE); -+ schedule_hrtimeout(&to, HRTIMER_MODE_REL); -+ continue; -+ } -+ -+ /* -+ * Ahh, all good. 
It wasn't running, and it wasn't -+ * runnable, which means that it will never become -+ * running in the future either. We're all done! -+ */ -+ break; -+ } -+ -+ return ncsw; -+} -+ -+/*** -+ * kick_process - kick a running thread to enter/exit the kernel -+ * @p: the to-be-kicked thread -+ * -+ * Cause a process which is running on another CPU to enter -+ * kernel-mode, without any delay. (to get signals handled.) -+ * -+ * NOTE: this function doesn't have to take the runqueue lock, -+ * because all it wants to ensure is that the remote task enters -+ * the kernel. If the IPI races and the task has been migrated -+ * to another CPU then no harm is done and the purpose has been -+ * achieved as well. -+ */ -+void kick_process(struct task_struct *p) -+{ -+ int cpu; -+ -+ preempt_disable(); -+ cpu = task_cpu(p); -+ if ((cpu != smp_processor_id()) && task_curr(p)) -+ smp_send_reschedule(cpu); -+ preempt_enable(); -+} -+EXPORT_SYMBOL_GPL(kick_process); -+ -+/* -+ * ->cpus_mask is protected by both rq->lock and p->pi_lock -+ * -+ * A few notes on cpu_active vs cpu_online: -+ * -+ * - cpu_active must be a subset of cpu_online -+ * -+ * - on CPU-up we allow per-CPU kthreads on the online && !active CPU, -+ * see __set_cpus_allowed_ptr(). At this point the newly online -+ * CPU isn't yet part of the sched domains, and balancing will not -+ * see it. -+ * -+ * - on cpu-down we clear cpu_active() to mask the sched domains and -+ * avoid the load balancer to place new tasks on the to be removed -+ * CPU. Existing tasks will remain running there and will be taken -+ * off. -+ * -+ * This means that fallback selection must not select !active CPUs. -+ * And can assume that any active CPU must be online. Conversely -+ * select_task_rq() below may allow selection of !active CPUs in order -+ * to satisfy the above rules. 
-+ */ -+static int select_fallback_rq(int cpu, struct task_struct *p) -+{ -+ int nid = cpu_to_node(cpu); -+ const struct cpumask *nodemask = NULL; -+ enum { cpuset, possible, fail } state = cpuset; -+ int dest_cpu; -+ -+ /* -+ * If the node that the CPU is on has been offlined, cpu_to_node() -+ * will return -1. There is no CPU on the node, and we should -+ * select the CPU on the other node. -+ */ -+ if (nid != -1) { -+ nodemask = cpumask_of_node(nid); -+ -+ /* Look for allowed, online CPU in same node. */ -+ for_each_cpu(dest_cpu, nodemask) { -+ if (!cpu_active(dest_cpu)) -+ continue; -+ if (cpumask_test_cpu(dest_cpu, &p->cpus_mask)) -+ return dest_cpu; -+ } -+ } -+ -+ for (;;) { -+ /* Any allowed, online CPU? */ -+ for_each_cpu(dest_cpu, &p->cpus_mask) { -+ if (!is_cpu_allowed(p, dest_cpu)) -+ continue; -+ goto out; -+ } -+ -+ /* No more Mr. Nice Guy. */ -+ switch (state) { -+ case cpuset: -+ if (IS_ENABLED(CONFIG_CPUSETS)) { -+ cpuset_cpus_allowed_fallback(p); -+ state = possible; -+ break; -+ } -+ /* Fall-through */ -+ case possible: -+ do_set_cpus_allowed(p, cpu_possible_mask); -+ state = fail; -+ break; -+ -+ case fail: -+ BUG(); -+ break; -+ } -+ } -+ -+out: -+ if (state != cpuset) { -+ /* -+ * Don't tell them about moving exiting tasks or -+ * kernel threads (both mm NULL), since they never -+ * leave kernel. 
-+ */ -+ if (p->mm && printk_ratelimit()) { -+ printk_deferred("process %d (%s) no longer affine to cpu%d\n", -+ task_pid_nr(p), p->comm, cpu); -+ } -+ } -+ -+ return dest_cpu; -+} -+ -+static inline int best_mask_cpu(int cpu, const cpumask_t *cpumask) -+{ -+ cpumask_t *mask; -+ -+ if (cpumask_test_cpu(cpu, cpumask)) -+ return cpu; -+ -+ mask = &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[0]); -+ while ((cpu = cpumask_any_and(cpumask, mask)) >= nr_cpu_ids) -+ mask++; -+ -+ return cpu; -+} -+ -+/* -+ * task_preemptible_rq - return the rq which the given task can preempt on -+ * @p: task wants to preempt CPU -+ * @only_preempt_low_policy: indicate only preempt rq running low policy than @p -+ */ -+static inline int -+task_preemptible_rq_idle(struct task_struct *p, cpumask_t *chk_mask) -+{ -+ cpumask_t tmp; -+ -+#ifdef CONFIG_SCHED_SMT -+ if (cpumask_and(&tmp, chk_mask, &sched_cpu_sg_idle_mask)) -+ return best_mask_cpu(task_cpu(p), &tmp); -+#endif -+ -+#ifdef CONFIG_SMT_NICE -+ /* Only ttwu on cpu which is not smt supressed */ -+ if (cpumask_andnot(&tmp, chk_mask, &sched_smt_supressed_mask)) { -+ cpumask_t t; -+ if (cpumask_and(&t, &tmp, &sched_rq_queued_masks[SCHED_RQ_EMPTY])) -+ return best_mask_cpu(task_cpu(p), &t); -+ return best_mask_cpu(task_cpu(p), &tmp); -+ } -+#endif -+ -+ if (cpumask_and(&tmp, chk_mask, &sched_rq_queued_masks[SCHED_RQ_EMPTY])) -+ return best_mask_cpu(task_cpu(p), &tmp); -+ return best_mask_cpu(task_cpu(p), chk_mask); -+} -+ -+static inline int -+task_preemptible_rq(struct task_struct *p, cpumask_t *chk_mask, -+ int preempt_level) -+{ -+ cpumask_t tmp; -+ int level; -+ -+#ifdef CONFIG_SCHED_SMT -+#ifdef CONFIG_SMT_NICE -+ if (cpumask_and(&tmp, chk_mask, &sched_cpu_psg_mask)) -+ return best_mask_cpu(task_cpu(p), &tmp); -+#else -+ if (cpumask_and(&tmp, chk_mask, &sched_cpu_sg_idle_mask)) -+ return best_mask_cpu(task_cpu(p), &tmp); -+#endif -+#endif -+ -+ level = find_first_bit(sched_rq_queued_masks_bitmap, -+ NR_SCHED_RQ_QUEUED_LEVEL); -+ -+ 
while (level < preempt_level) { -+ if (cpumask_and(&tmp, chk_mask, &sched_rq_queued_masks[level])) -+ return best_mask_cpu(task_cpu(p), &tmp); -+ -+ level = find_next_bit(sched_rq_queued_masks_bitmap, -+ NR_SCHED_RQ_QUEUED_LEVEL, -+ level + 1); -+ } -+ -+ if (unlikely(SCHED_RQ_RT == level && -+ level == preempt_level && -+ cpumask_and(&tmp, chk_mask, -+ &sched_rq_queued_masks[SCHED_RQ_RT]))) { -+ unsigned int cpu; -+ -+ for_each_cpu (cpu, &tmp) -+ if (p->prio < sched_rq_prio[cpu]) -+ return cpu; -+ } -+ -+ return best_mask_cpu(task_cpu(p), chk_mask); -+} -+ -+static inline int select_task_rq(struct task_struct *p) -+{ -+ cpumask_t chk_mask; -+ -+ if (unlikely(!cpumask_and(&chk_mask, &p->cpus_mask, cpu_online_mask))) -+ return select_fallback_rq(task_cpu(p), p); -+ -+ /* Check IDLE tasks suitable to run normal priority */ -+ if (idleprio_task(p)) { -+ if (idleprio_suitable(p)) { -+ p->prio = p->normal_prio; -+ update_task_priodl(p); -+ return task_preemptible_rq_idle(p, &chk_mask); -+ } -+ p->prio = NORMAL_PRIO; -+ update_task_priodl(p); -+ } -+ -+ return task_preemptible_rq(p, &chk_mask, -+ task_running_policy_level(p, this_rq())); -+} -+#else /* CONFIG_SMP */ -+static inline int select_task_rq(struct task_struct *p) -+{ -+ return 0; -+} -+#endif /* CONFIG_SMP */ -+ -+static void -+ttwu_stat(struct task_struct *p, int cpu, int wake_flags) -+{ -+ struct rq *rq; -+ -+ if (!schedstat_enabled()) -+ return; -+ -+ rq= this_rq(); -+ -+#ifdef CONFIG_SMP -+ if (cpu == rq->cpu) -+ __schedstat_inc(rq->ttwu_local); -+ else { -+ /** PDS ToDo: -+ * How to do ttwu_wake_remote -+ */ -+ } -+#endif /* CONFIG_SMP */ -+ -+ __schedstat_inc(rq->ttwu_count); -+} -+ -+/* -+ * Mark the task runnable and perform wakeup-preemption. 
-+ */ -+static inline void -+ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) -+{ -+ p->state = TASK_RUNNING; -+ trace_sched_wakeup(p); -+} -+ -+static inline void -+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) -+{ -+#ifdef CONFIG_SMP -+ if (p->sched_contributes_to_load) -+ rq->nr_uninterruptible--; -+#endif -+ -+ activate_task(p, rq); -+ ttwu_do_wakeup(rq, p, 0); -+} -+ -+static int ttwu_remote(struct task_struct *p, int wake_flags) -+{ -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ int ret = 0; -+ -+ rq = __task_access_lock(p, &lock); -+ if (task_on_rq_queued(p)) { -+ ttwu_do_wakeup(rq, p, wake_flags); -+ ret = 1; -+ } -+ __task_access_unlock(p, lock); -+ -+ return ret; -+} -+ -+/* -+ * Notes on Program-Order guarantees on SMP systems. -+ * -+ * MIGRATION -+ * -+ * The basic program-order guarantee on SMP systems is that when a task [t] -+ * migrates, all its activity on its old CPU [c0] happens-before any subsequent -+ * execution on its new CPU [c1]. -+ * -+ * For migration (of runnable tasks) this is provided by the following means: -+ * -+ * A) UNLOCK of the rq(c0)->lock scheduling out task t -+ * B) migration for t is required to synchronize *both* rq(c0)->lock and -+ * rq(c1)->lock (if not at the same time, then in that order). -+ * C) LOCK of the rq(c1)->lock scheduling in task -+ * -+ * Transitivity guarantees that B happens after A and C after B. -+ * Note: we only require RCpc transitivity. -+ * Note: the CPU doing B need not be c0 or c1 -+ * -+ * Example: -+ * -+ * CPU0 CPU1 CPU2 -+ * -+ * LOCK rq(0)->lock -+ * sched-out X -+ * sched-in Y -+ * UNLOCK rq(0)->lock -+ * -+ * LOCK rq(0)->lock // orders against CPU0 -+ * dequeue X -+ * UNLOCK rq(0)->lock -+ * -+ * LOCK rq(1)->lock -+ * enqueue X -+ * UNLOCK rq(1)->lock -+ * -+ * LOCK rq(1)->lock // orders against CPU2 -+ * sched-out Z -+ * sched-in X -+ * UNLOCK rq(1)->lock -+ * -+ * -+ * BLOCKING -- aka. 
SLEEP + WAKEUP -+ * -+ * For blocking we (obviously) need to provide the same guarantee as for -+ * migration. However the means are completely different as there is no lock -+ * chain to provide order. Instead we do: -+ * -+ * 1) smp_store_release(X->on_cpu, 0) -+ * 2) smp_cond_load_acquire(!X->on_cpu) -+ * -+ * Example: -+ * -+ * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule) -+ * -+ * LOCK rq(0)->lock LOCK X->pi_lock -+ * dequeue X -+ * sched-out X -+ * smp_store_release(X->on_cpu, 0); -+ * -+ * smp_cond_load_acquire(&X->on_cpu, !VAL); -+ * X->state = WAKING -+ * set_task_cpu(X,2) -+ * -+ * LOCK rq(2)->lock -+ * enqueue X -+ * X->state = RUNNING -+ * UNLOCK rq(2)->lock -+ * -+ * LOCK rq(2)->lock // orders against CPU1 -+ * sched-out Z -+ * sched-in X -+ * UNLOCK rq(2)->lock -+ * -+ * UNLOCK X->pi_lock -+ * UNLOCK rq(0)->lock -+ * -+ * -+ * However; for wakeups there is a second guarantee we must provide, namely we -+ * must observe the state that led to our wakeup. That is, not only must our -+ * task observe its own prior state, it must also observe the stores prior to -+ * its wakeup. -+ * -+ * This means that any means of doing remote wakeups must order the CPU doing -+ * the wakeup against the CPU the task is going to end up running on. This, -+ * however, is already required for the regular Program-Order guarantee above, -+ * since the waking CPU is the one issuing the ACQUIRE (smp_cond_load_acquire). -+ * -+ */ -+ -+/*** -+ * try_to_wake_up - wake up a thread -+ * @p: the thread to be awakened -+ * @state: the mask of task states that can be woken -+ * @wake_flags: wake modifier flags (WF_*) -+ * -+ * Put it on the run-queue if it's not already there. The "current" -+ * thread is always on the run-queue (except when the actual -+ * re-schedule is in progress), and as such you're allowed to do -+ * the simpler "current->state = TASK_RUNNING" to mark yourself -+ * runnable without the overhead of this.
-+ * -+ * Return: %true if @p was woken up, %false if it was already running. -+ * or @state didn't match @p's state. -+ */ -+static int try_to_wake_up(struct task_struct *p, unsigned int state, -+ int wake_flags) -+{ -+ unsigned long flags; -+ struct rq *rq; -+ int cpu, success = 0; -+ -+ /* -+ * If we are going to wake up a thread waiting for CONDITION we -+ * need to ensure that CONDITION=1 done by the caller can not be -+ * reordered with p->state check below. This pairs with mb() in -+ * set_current_state() the waiting thread does. -+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ smp_mb__after_spinlock(); -+ if (!(p->state & state)) -+ goto out; -+ -+ trace_sched_waking(p); -+ -+ /* We're going to change ->state: */ -+ success = 1; -+ cpu = task_cpu(p); -+ -+ /* -+ * Ensure we load p->on_rq _after_ p->state, otherwise it would -+ * be possible to, falsely, observe p->on_rq == 0 and get stuck -+ * in smp_cond_load_acquire() below. -+ * -+ * flush_smp_call_function_from_idle() try_to_wake_up() -+ * STORE p->on_rq = 1 LOAD p->state -+ * UNLOCK rq->lock -+ * -+ * __schedule() (switch to task 'p') -+ * LOCK rq->lock smp_rmb(); -+ * smp_mb__after_spinlock(); -+ * UNLOCK rq->lock -+ * -+ * [task p] -+ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq -+ * -+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in -+ * __schedule(). See the comment for smp_mb__after_spinlock(). -+ */ -+ smp_rmb(); -+ if (p->on_rq && ttwu_remote(p, wake_flags)) -+ goto stat; -+ -+#ifdef CONFIG_SMP -+ /* -+ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be -+ * possible to, falsely, observe p->on_cpu == 0. -+ * -+ * One must be running (->on_cpu == 1) in order to remove oneself -+ * from the runqueue. 
-+ * -+ * __schedule() (switch to task 'p') try_to_wake_up() -+ * STORE p->on_cpu = 1 LOAD p->on_rq -+ * UNLOCK rq->lock -+ * -+ * __schedule() (put 'p' to sleep) -+ * LOCK rq->lock smp_rmb(); -+ * smp_mb__after_spinlock(); -+ * STORE p->on_rq = 0 LOAD p->on_cpu -+ * -+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in -+ * __schedule(). See the comment for smp_mb__after_spinlock(). -+ */ -+ smp_rmb(); -+ -+ /* -+ * If the owning (remote) CPU is still in the middle of schedule() with -+ * this task as prev, wait until its done referencing the task. -+ * -+ * Pairs with the smp_store_release() in finish_task(). -+ * -+ * This ensures that tasks getting woken will be fully ordered against -+ * their previous state and preserve Program Order. -+ */ -+ smp_cond_load_acquire(&p->on_cpu, !VAL); -+ -+ p->sched_contributes_to_load = !!task_contributes_to_load(p); -+ p->state = TASK_WAKING; -+ -+ if (p->in_iowait) { -+ delayacct_blkio_end(p); -+ atomic_dec(&task_rq(p)->nr_iowait); -+ } -+ -+ if (SCHED_ISO == p->policy && ISO_PRIO != p->prio) { -+ p->prio = ISO_PRIO; -+ p->deadline = 0UL; -+ update_task_priodl(p); -+ } -+ -+ cpu = select_task_rq(p); -+ -+ if (cpu != task_cpu(p)) { -+ wake_flags |= WF_MIGRATED; -+ psi_ttwu_dequeue(p); -+ set_task_cpu(p, cpu); -+ } -+#else /* CONFIG_SMP */ -+ if (p->in_iowait) { -+ delayacct_blkio_end(p); -+ atomic_dec(&task_rq(p)->nr_iowait); -+ } -+#endif -+ -+ rq = cpu_rq(cpu); -+ raw_spin_lock(&rq->lock); -+ -+ update_rq_clock(rq); -+ ttwu_do_activate(rq, p, wake_flags); -+ check_preempt_curr(rq, p); -+ -+ raw_spin_unlock(&rq->lock); -+ -+stat: -+ ttwu_stat(p, cpu, wake_flags); -+out: -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+ return success; -+} -+ -+/** -+ * try_invoke_on_locked_down_task - Invoke a function on task in fixed state -+ * @p: Process for which the function is to be invoked. -+ * @func: Function to invoke. -+ * @arg: Argument to function. 
-+ * -+ * If the specified task can be quickly locked into a definite state -+ * (either sleeping or on a given runqueue), arrange to keep it in that -+ * state while invoking @func(@arg). This function can use ->on_rq and -+ * task_curr() to work out what the state is, if required. Given that -+ * @func can be invoked with a runqueue lock held, it had better be quite -+ * lightweight. -+ * -+ * Returns: -+ * @false if the task slipped out from under the locks. -+ * @true if the task was locked onto a runqueue or is sleeping. -+ * However, @func can override this by returning @false. -+ */ -+bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg) -+{ -+ bool ret = false; -+ struct rq_flags rf; -+ struct rq *rq; -+ -+ lockdep_assert_irqs_enabled(); -+ raw_spin_lock_irq(&p->pi_lock); -+ if (p->on_rq) { -+ rq = __task_rq_lock(p, &rf); -+ if (task_rq(p) == rq) -+ ret = func(p, arg); -+ rq_unlock(rq, &rf); -+ } else { -+ switch (p->state) { -+ case TASK_RUNNING: -+ case TASK_WAKING: -+ break; -+ default: -+ smp_rmb(); // See smp_rmb() comment in try_to_wake_up(). -+ if (!p->on_rq) -+ ret = func(p, arg); -+ } -+ } -+ raw_spin_unlock_irq(&p->pi_lock); -+ return ret; -+} -+ -+/** -+ * wake_up_process - Wake up a specific process -+ * @p: The process to be woken up. -+ * -+ * Attempt to wake up the nominated process and move it to the set of runnable -+ * processes. -+ * -+ * Return: 1 if the process was woken up, 0 if it was already running. -+ * -+ * This function executes a full memory barrier before accessing the task state. -+ */ -+int wake_up_process(struct task_struct *p) -+{ -+ return try_to_wake_up(p, TASK_NORMAL, 0); -+} -+EXPORT_SYMBOL(wake_up_process); -+ -+int wake_up_state(struct task_struct *p, unsigned int state) -+{ -+ return try_to_wake_up(p, state, 0); -+} -+ -+/* -+ * Perform scheduler related setup for a newly forked process p. -+ * p is forked by current. 
-+ */ -+int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) -+{ -+ unsigned long flags; -+ int cpu = get_cpu(); -+ struct rq *rq = this_rq(); -+ -+#ifdef CONFIG_PREEMPT_NOTIFIERS -+ INIT_HLIST_HEAD(&p->preempt_notifiers); -+#endif -+ /* Should be reset in fork.c but done here for ease of PDS patching */ -+ p->on_cpu = -+ p->on_rq = -+ p->utime = -+ p->stime = -+ p->sched_time = 0; -+ -+ p->sl_level = pds_skiplist_random_level(p); -+ INIT_SKIPLIST_NODE(&p->sl_node); -+ -+#ifdef CONFIG_COMPACTION -+ p->capture_control = NULL; -+#endif -+ -+ /* -+ * We mark the process as NEW here. This guarantees that -+ * nobody will actually run it, and a signal or other external -+ * event cannot wake it up and insert it on the runqueue either. -+ */ -+ p->state = TASK_NEW; -+ -+ /* -+ * Make sure we do not leak PI boosting priority to the child. -+ */ -+ p->prio = current->normal_prio; -+ -+ /* -+ * Revert to default priority/policy on fork if requested. -+ */ -+ if (unlikely(p->sched_reset_on_fork)) { -+ if (task_has_rt_policy(p)) { -+ p->policy = SCHED_NORMAL; -+ p->static_prio = NICE_TO_PRIO(0); -+ p->rt_priority = 0; -+ } else if (PRIO_TO_NICE(p->static_prio) < 0) -+ p->static_prio = NICE_TO_PRIO(0); -+ -+ p->prio = p->normal_prio = normal_prio(p); -+ -+ /* -+ * We don't need the reset flag anymore after the fork. It has -+ * fulfilled its duty: -+ */ -+ p->sched_reset_on_fork = 0; -+ } -+ -+ /* -+ * Share the timeslice between parent and child, thus the -+ * total amount of pending timeslices in the system doesn't change, -+ * resulting in more scheduling fairness. 
-+ */ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ rq->curr->time_slice /= 2; -+ p->time_slice = rq->curr->time_slice; -+#ifdef CONFIG_SCHED_HRTICK -+ hrtick_start(rq, US_TO_NS(rq->curr->time_slice)); -+#endif -+ -+ if (p->time_slice < RESCHED_US) { -+ update_rq_clock(rq); -+ time_slice_expired(p, rq); -+ resched_curr(rq); -+ } else -+ update_task_priodl(p); -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+ /* -+ * The child is not yet in the pid-hash so no cgroup attach races, -+ * and the cgroup is pinned to this child due to cgroup_fork() -+ * is ran before sched_fork(). -+ * -+ * Silence PROVE_RCU. -+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ /* -+ * We're setting the CPU for the first time, we don't migrate, -+ * so use __set_task_cpu(). -+ */ -+ __set_task_cpu(p, cpu); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+#ifdef CONFIG_SCHED_INFO -+ if (unlikely(sched_info_on())) -+ memset(&p->sched_info, 0, sizeof(p->sched_info)); -+#endif -+ init_task_preempt_count(p); -+ -+ put_cpu(); -+ return 0; -+} -+ -+#ifdef CONFIG_SCHEDSTATS -+ -+DEFINE_STATIC_KEY_FALSE(sched_schedstats); -+static bool __initdata __sched_schedstats = false; -+ -+static void set_schedstats(bool enabled) -+{ -+ if (enabled) -+ static_branch_enable(&sched_schedstats); -+ else -+ static_branch_disable(&sched_schedstats); -+} -+ -+void force_schedstat_enabled(void) -+{ -+ if (!schedstat_enabled()) { -+ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); -+ static_branch_enable(&sched_schedstats); -+ } -+} -+ -+static int __init setup_schedstats(char *str) -+{ -+ int ret = 0; -+ if (!str) -+ goto out; -+ -+ /* -+ * This code is called before jump labels have been set up, so we can't -+ * change the static branch directly just yet. Instead set a temporary -+ * variable so init_schedstats() can do it later. 
-+ */ -+ if (!strcmp(str, "enable")) { -+ __sched_schedstats = true; -+ ret = 1; -+ } else if (!strcmp(str, "disable")) { -+ __sched_schedstats = false; -+ ret = 1; -+ } -+out: -+ if (!ret) -+ pr_warn("Unable to parse schedstats=\n"); -+ -+ return ret; -+} -+__setup("schedstats=", setup_schedstats); -+ -+static void __init init_schedstats(void) -+{ -+ set_schedstats(__sched_schedstats); -+} -+ -+#ifdef CONFIG_PROC_SYSCTL -+int sysctl_schedstats(struct ctl_table *table, int write, -+ void __user *buffer, size_t *lenp, loff_t *ppos) -+{ -+ struct ctl_table t; -+ int err; -+ int state = static_branch_likely(&sched_schedstats); -+ -+ if (write && !capable(CAP_SYS_ADMIN)) -+ return -EPERM; -+ -+ t = *table; -+ t.data = &state; -+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); -+ if (err < 0) -+ return err; -+ if (write) -+ set_schedstats(state); -+ return err; -+} -+#endif /* CONFIG_PROC_SYSCTL */ -+#else /* !CONFIG_SCHEDSTATS */ -+static inline void init_schedstats(void) {} -+#endif /* CONFIG_SCHEDSTATS */ -+ -+/* -+ * wake_up_new_task - wake up a newly created task for the first time. -+ * -+ * This function will do some initial scheduler statistics housekeeping -+ * that must be done for every newly created context, then puts the task -+ * on the runqueue and wakes it. -+ */ -+void wake_up_new_task(struct task_struct *p) -+{ -+ unsigned long flags; -+ struct rq *rq; -+ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ -+ p->state = TASK_RUNNING; -+ -+ rq = cpu_rq(select_task_rq(p)); -+#ifdef CONFIG_SMP -+ /* -+ * Fork balancing, do it here and not earlier because: -+ * - cpus_mask can change in the fork path -+ * - any previously selected CPU might disappear through hotplug -+ * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, -+ * as we're not fully set-up yet. 
-+ */ -+ __set_task_cpu(p, cpu_of(rq)); -+#endif -+ -+ raw_spin_lock(&rq->lock); -+ -+ update_rq_clock(rq); -+ activate_task(p, rq); -+ trace_sched_wakeup_new(p); -+ check_preempt_curr(rq, p); -+ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+} -+ -+#ifdef CONFIG_PREEMPT_NOTIFIERS -+ -+static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); -+ -+void preempt_notifier_inc(void) -+{ -+ static_branch_inc(&preempt_notifier_key); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_inc); -+ -+void preempt_notifier_dec(void) -+{ -+ static_branch_dec(&preempt_notifier_key); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_dec); -+ -+/** -+ * preempt_notifier_register - tell me when current is being preempted & rescheduled -+ * @notifier: notifier struct to register -+ */ -+void preempt_notifier_register(struct preempt_notifier *notifier) -+{ -+ if (!static_branch_unlikely(&preempt_notifier_key)) -+ WARN(1, "registering preempt_notifier while notifiers disabled\n"); -+ -+ hlist_add_head(&notifier->link, &current->preempt_notifiers); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_register); -+ -+/** -+ * preempt_notifier_unregister - no longer interested in preemption notifications -+ * @notifier: notifier struct to unregister -+ * -+ * This is *not* safe to call from within a preemption notifier.
-+ */ -+void preempt_notifier_unregister(struct preempt_notifier *notifier) -+{ -+ hlist_del(&notifier->link); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_unregister); -+ -+static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+ struct preempt_notifier *notifier; -+ -+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) -+ notifier->ops->sched_in(notifier, raw_smp_processor_id()); -+} -+ -+static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+ if (static_branch_unlikely(&preempt_notifier_key)) -+ __fire_sched_in_preempt_notifiers(curr); -+} -+ -+static void -+__fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+ struct preempt_notifier *notifier; -+ -+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) -+ notifier->ops->sched_out(notifier, next); -+} -+ -+static __always_inline void -+fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+ if (static_branch_unlikely(&preempt_notifier_key)) -+ __fire_sched_out_preempt_notifiers(curr, next); -+} -+ -+#else /* !CONFIG_PREEMPT_NOTIFIERS */ -+ -+static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+} -+ -+static inline void -+fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+} -+ -+#endif /* CONFIG_PREEMPT_NOTIFIERS */ -+ -+static inline void prepare_task(struct task_struct *next) -+{ -+ /* -+ * Claim the task as running, we do this before switching to it -+ * such that any running task will have this set. -+ */ -+ next->on_cpu = 1; -+} -+ -+static inline void finish_task(struct task_struct *prev) -+{ -+#ifdef CONFIG_SMP -+ /* -+ * After ->on_cpu is cleared, the task can be moved to a different CPU. -+ * We must ensure this doesn't happen until the switch is completely -+ * finished. -+ * -+ * In particular, the load of prev->state in finish_task_switch() must -+ * happen before this.
-+ * -+ * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). -+ */ -+ smp_store_release(&prev->on_cpu, 0); -+#else -+ prev->on_cpu = 0; -+#endif -+} -+ -+static inline void -+prepare_lock_switch(struct rq *rq, struct task_struct *next) -+{ -+ /* -+ * Since the runqueue lock will be released by the next -+ * task (which is an invalid locking op but in the case -+ * of the scheduler it's an obvious special-case), so we -+ * do an early lockdep release here: -+ */ -+ spin_release(&rq->lock.dep_map, _THIS_IP_); -+#ifdef CONFIG_DEBUG_SPINLOCK -+ /* this is a valid case when another task releases the spinlock */ -+ rq->lock.owner = next; -+#endif -+} -+ -+static inline void finish_lock_switch(struct rq *rq) -+{ -+ /* -+ * If we are tracking spinlock dependencies then we have to -+ * fix up the runqueue lock - which gets 'carried over' from -+ * prev into current: -+ */ -+ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); -+ raw_spin_unlock_irq(&rq->lock); -+} -+ -+/** -+ * prepare_task_switch - prepare to switch tasks -+ * @rq: the runqueue preparing to switch -+ * @next: the task we are going to switch to. -+ * -+ * This is called with the rq lock held and interrupts off. It must -+ * be paired with a subsequent finish_task_switch after the context -+ * switch. -+ * -+ * prepare_task_switch sets up locking and calls architecture specific -+ * hooks. -+ */ -+static inline void -+prepare_task_switch(struct rq *rq, struct task_struct *prev, -+ struct task_struct *next) -+{ -+ kcov_prepare_switch(prev); -+ sched_info_switch(rq, prev, next); -+ perf_event_task_sched_out(prev, next); -+ rseq_preempt(prev); -+ fire_sched_out_preempt_notifiers(prev, next); -+ prepare_task(next); -+ prepare_arch_switch(next); -+} -+ -+/** -+ * finish_task_switch - clean up after a task-switch -+ * @rq: runqueue associated with task-switch -+ * @prev: the thread we just switched away from. 
-+ * -+ * finish_task_switch must be called after the context switch, paired -+ * with a prepare_task_switch call before the context switch. -+ * finish_task_switch will reconcile locking set up by prepare_task_switch, -+ * and do any other architecture-specific cleanup actions. -+ * -+ * Note that we may have delayed dropping an mm in context_switch(). If -+ * so, we finish that here outside of the runqueue lock. (Doing it -+ * with the lock held can cause deadlocks; see schedule() for -+ * details.) -+ * -+ * The context switch have flipped the stack from under us and restored the -+ * local variables which were saved when this task called schedule() in the -+ * past. prev == current is still correct but we need to recalculate this_rq -+ * because prev may have moved to another CPU. -+ */ -+static struct rq *finish_task_switch(struct task_struct *prev) -+ __releases(rq->lock) -+{ -+ struct rq *rq = this_rq(); -+ struct mm_struct *mm = rq->prev_mm; -+ long prev_state; -+ -+ /* -+ * The previous task will have left us with a preempt_count of 2 -+ * because it left us after: -+ * -+ * schedule() -+ * preempt_disable(); // 1 -+ * __schedule() -+ * raw_spin_lock_irq(&rq->lock) // 2 -+ * -+ * Also, see FORK_PREEMPT_COUNT. -+ */ -+ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, -+ "corrupted preempt_count: %s/%d/0x%x\n", -+ current->comm, current->pid, preempt_count())) -+ preempt_count_set(FORK_PREEMPT_COUNT); -+ -+ rq->prev_mm = NULL; -+ -+ /* -+ * A task struct has one reference for the use as "current". -+ * If a task dies, then it sets TASK_DEAD in tsk->state and calls -+ * schedule one last time. The schedule call will never return, and -+ * the scheduled task must drop that reference. -+ * -+ * We must observe prev->state before clearing prev->on_cpu (in -+ * finish_task), otherwise a concurrent wakeup can get prev -+ * running on another CPU and we could rave with its RUNNING -> DEAD -+ * transition, resulting in a double drop. 
-+ */ -+ prev_state = prev->state; -+ vtime_task_switch(prev); -+ perf_event_task_sched_in(prev, current); -+ finish_task(prev); -+ finish_lock_switch(rq); -+ finish_arch_post_lock_switch(); -+ kcov_finish_switch(current); -+ -+ fire_sched_in_preempt_notifiers(current); -+ /* -+ * When switching through a kernel thread, the loop in -+ * membarrier_{private,global}_expedited() may have observed that -+ * kernel thread and not issued an IPI. It is therefore possible to -+ * schedule between user->kernel->user threads without passing though -+ * switch_mm(). Membarrier requires a barrier after storing to -+ * rq->curr, before returning to userspace, so provide them here: -+ * -+ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly -+ * provided by mmdrop(), -+ * - a sync_core for SYNC_CORE. -+ */ -+ if (mm) { -+ membarrier_mm_sync_core_before_usermode(mm); -+ mmdrop(mm); -+ } -+ if (unlikely(prev_state == TASK_DEAD)) { -+ /* -+ * Remove function-return probe instances associated with this -+ * task and put them back on the free list. -+ */ -+ kprobe_flush_task(prev); -+ -+ /* Task is done with its stack. */ -+ put_task_stack(prev); -+ -+ put_task_struct_rcu_user(prev); -+ } -+ -+ tick_nohz_task_switch(); -+ return rq; -+} -+ -+/** -+ * schedule_tail - first thing a freshly forked thread must call. -+ * @prev: the thread we just switched away from. -+ */ -+asmlinkage __visible void schedule_tail(struct task_struct *prev) -+ __releases(rq->lock) -+{ -+ struct rq *rq; -+ -+ /* -+ * New tasks start with FORK_PREEMPT_COUNT, see there and -+ * finish_task_switch() for details. -+ * -+ * finish_task_switch() will drop rq->lock() and lower preempt_count -+ * and the preempt_enable() will end up enabling preemption (on -+ * PREEMPT_COUNT kernels). 
-+ */ -+ -+ rq = finish_task_switch(prev); -+ preempt_enable(); -+ -+ if (current->set_child_tid) -+ put_user(task_pid_vnr(current), current->set_child_tid); -+ -+ calculate_sigpending(); -+} -+ -+/* -+ * context_switch - switch to the new MM and the new thread's register state. -+ */ -+static __always_inline struct rq * -+context_switch(struct rq *rq, struct task_struct *prev, -+ struct task_struct *next) -+{ -+ prepare_task_switch(rq, prev, next); -+ -+ /* -+ * For paravirt, this is coupled with an exit in switch_to to -+ * combine the page table reload and the switch backend into -+ * one hypercall. -+ */ -+ arch_start_context_switch(prev); -+ -+ /* -+ * kernel -> kernel lazy + transfer active -+ * user -> kernel lazy + mmgrab() active -+ * -+ * kernel -> user switch + mmdrop() active -+ * user -> user switch -+ */ -+ if (!next->mm) { // to kernel -+ enter_lazy_tlb(prev->active_mm, next); -+ -+ next->active_mm = prev->active_mm; -+ if (prev->mm) // from user -+ mmgrab(prev->active_mm); -+ else -+ prev->active_mm = NULL; -+ } else { // to user -+ membarrier_switch_mm(rq, prev->active_mm, next->mm); -+ /* -+ * sys_membarrier() requires an smp_mb() between setting -+ * rq->curr / membarrier_switch_mm() and returning to userspace. -+ * -+ * The below provides this either through switch_mm(), or in -+ * case 'prev->active_mm == next->mm' through -+ * finish_task_switch()'s mmdrop(). -+ */ -+ switch_mm_irqs_off(prev->active_mm, next->mm, next); -+ -+ if (!prev->mm) { // from kernel -+ /* will mmdrop() in finish_task_switch(). */ -+ rq->prev_mm = prev->active_mm; -+ prev->active_mm = NULL; -+ } -+ } -+ -+ prepare_lock_switch(rq, next); -+ -+ /* Here we just switch the register state and the stack. 
*/ -+ switch_to(prev, next, prev); -+ barrier(); -+ -+ return finish_task_switch(prev); -+} -+ -+/* -+ * nr_running, nr_uninterruptible and nr_context_switches: -+ * -+ * externally visible scheduler statistics: current number of runnable -+ * threads, total number of context switches performed since bootup. -+ */ -+unsigned long nr_running(void) -+{ -+ unsigned long i, sum = 0; -+ -+ for_each_online_cpu(i) -+ sum += cpu_rq(i)->nr_running; -+ -+ return sum; -+} -+ -+/* -+ * Check if only the current task is running on the CPU. -+ * -+ * Caution: this function does not check that the caller has disabled -+ * preemption, thus the result might have a time-of-check-to-time-of-use -+ * race. The caller is responsible to use it correctly, for example: -+ * -+ * - from a non-preemptible section (of course) -+ * -+ * - from a thread that is bound to a single CPU -+ * -+ * - in a loop with very short iterations (e.g. a polling loop) -+ */ -+bool single_task_running(void) -+{ -+ return raw_rq()->nr_running == 1; -+} -+EXPORT_SYMBOL(single_task_running); -+ -+unsigned long long nr_context_switches(void) -+{ -+ int i; -+ unsigned long long sum = 0; -+ -+ for_each_possible_cpu(i) -+ sum += cpu_rq(i)->nr_switches; -+ -+ return sum; -+} -+ -+/* -+ * Consumers of these two interfaces, like for example the cpuidle menu -+ * governor, are using nonsensical data. Preferring shallow idle state selection -+ * for a CPU that has IO-wait which might not even end up running the task when -+ * it does become runnable. -+ */ -+ -+unsigned long nr_iowait_cpu(int cpu) -+{ -+ return atomic_read(&cpu_rq(cpu)->nr_iowait); -+} -+ -+/* -+ * IO-wait accounting, and how its mostly bollocks (on SMP). -+ * -+ * The idea behind IO-wait account is to account the idle time that we could -+ * have spend running if it were not for IO. That is, if we were to improve the -+ * storage performance, we'd have a proportional reduction in IO-wait time. 
-+ * -+ * This all works nicely on UP, where, when a task blocks on IO, we account -+ * idle time as IO-wait, because if the storage were faster, it could've been -+ * running and we'd not be idle. -+ * -+ * This has been extended to SMP, by doing the same for each CPU. This however -+ * is broken. -+ * -+ * Imagine for instance the case where two tasks block on one CPU, only the one -+ * CPU will have IO-wait accounted, while the other has regular idle. Even -+ * though, if the storage were faster, both could've ran at the same time, -+ * utilising both CPUs. -+ * -+ * This means, that when looking globally, the current IO-wait accounting on -+ * SMP is a lower bound, by reason of under accounting. -+ * -+ * Worse, since the numbers are provided per CPU, they are sometimes -+ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly -+ * associated with any one particular CPU, it can wake to another CPU than it -+ * blocked on. This means the per CPU IO-wait number is meaningless. -+ * -+ * Task CPU affinities can make all that even more 'interesting'. -+ */ -+ -+unsigned long nr_iowait(void) -+{ -+ unsigned long i, sum = 0; -+ -+ for_each_possible_cpu(i) -+ sum += nr_iowait_cpu(i); -+ -+ return sum; -+} -+ -+DEFINE_PER_CPU(struct kernel_stat, kstat); -+DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); -+ -+EXPORT_PER_CPU_SYMBOL(kstat); -+EXPORT_PER_CPU_SYMBOL(kernel_cpustat); -+ -+static inline void pds_update_curr(struct rq *rq, struct task_struct *p) -+{ -+ s64 ns = rq->clock_task - p->last_ran; -+ -+ p->sched_time += ns; -+ account_group_exec_runtime(p, ns); -+ -+ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ -+ p->time_slice -= NS_TO_US(ns); -+ p->last_ran = rq->clock_task; -+} -+ -+/* -+ * Return accounted runtime for the task. -+ * Return separately the current's pending runtime that have not been -+ * accounted yet. 
-+ */ -+unsigned long long task_sched_runtime(struct task_struct *p) -+{ -+ unsigned long flags; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ u64 ns; -+ -+#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) -+ /* -+ * 64-bit doesn't need locks to atomically read a 64-bit value. -+ * So we have a optimization chance when the task's delta_exec is 0. -+ * Reading ->on_cpu is racy, but this is ok. -+ * -+ * If we race with it leaving CPU, we'll take a lock. So we're correct. -+ * If we race with it entering CPU, unaccounted time is 0. This is -+ * indistinguishable from the read occurring a few cycles earlier. -+ * If we see ->on_cpu without ->on_rq, the task is leaving, and has -+ * been accounted, so we're correct here as well. -+ */ -+ if (!p->on_cpu || !task_on_rq_queued(p)) -+ return tsk_seruntime(p); -+#endif -+ -+ rq = task_access_lock_irqsave(p, &lock, &flags); -+ /* -+ * Must be ->curr _and_ ->on_rq. If dequeued, we would -+ * project cycles that may never be accounted to this -+ * thread, breaking clock_gettime(). -+ */ -+ if (p == rq->curr && task_on_rq_queued(p)) { -+ update_rq_clock(rq); -+ pds_update_curr(rq, p); -+ } -+ ns = tsk_seruntime(p); -+ task_access_unlock_irqrestore(p, lock, &flags); -+ -+ return ns; -+} -+ -+/* This manages tasks that have run out of timeslice during a scheduler_tick */ -+static inline void pds_scheduler_task_tick(struct rq *rq) -+{ -+ struct task_struct *p = rq->curr; -+ -+ if (is_idle_task(p)) -+ return; -+ -+ pds_update_curr(rq, p); -+ -+ cpufreq_update_util(rq, 0); -+ -+ /* -+ * Tasks that were scheduled in the first half of a tick are not -+ * allowed to run into the 2nd half of the next tick if they will -+ * run out of time slice in the interim. Otherwise, if they have -+ * less than RESCHED_US μs of time slice left they will be rescheduled. -+ */ -+ if (p->time_slice - rq->dither >= RESCHED_US) -+ return; -+ -+ /** -+ * p->time_slice < RESCHED_US. 
We will modify task_struct under -+ * rq lock as p is rq->curr -+ */ -+ __set_tsk_resched(p); -+} -+ -+#ifdef CONFIG_SMP -+ -+#ifdef CONFIG_SCHED_SMT -+static int active_load_balance_cpu_stop(void *data) -+{ -+ struct rq *rq = this_rq(); -+ struct task_struct *p = data; -+ int cpu; -+ unsigned long flags; -+ -+ local_irq_save(flags); -+ -+ raw_spin_lock(&p->pi_lock); -+ raw_spin_lock(&rq->lock); -+ -+ rq->active_balance = 0; -+ /* -+ * _something_ may have changed the task, double check again -+ */ -+ if (task_on_rq_queued(p) && task_rq(p) == rq && -+ (cpu = cpumask_any_and(&p->cpus_mask, &sched_cpu_sg_idle_mask)) < nr_cpu_ids) -+ rq = __migrate_task(rq, p, cpu); -+ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock(&p->pi_lock); -+ -+ local_irq_restore(flags); -+ -+ return 0; -+} -+ -+/* pds_sg_balance_trigger - trigger slibing group balance for @cpu */ -+static void pds_sg_balance_trigger(const int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ struct task_struct *curr; -+ -+ if (!raw_spin_trylock_irqsave(&rq->lock, flags)) -+ return; -+ curr = rq->curr; -+ if (!is_idle_task(curr) && -+ cpumask_intersects(&curr->cpus_mask, &sched_cpu_sg_idle_mask)) { -+ int active_balance = 0; -+ -+ if (likely(!rq->active_balance)) { -+ rq->active_balance = 1; -+ active_balance = 1; -+ } -+ -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+ if (likely(active_balance)) -+ stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, -+ curr, &rq->active_balance_work); -+ } else -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+} -+ -+/* -+ * pds_sg_balance_check - slibing group balance check for run queue @rq -+ */ -+static inline void pds_sg_balance_check(const struct rq *rq) -+{ -+ cpumask_t chk; -+ int i; -+ -+ /* Only online cpu will do sg balance checking */ -+ if (unlikely(!rq->online)) -+ return; -+ -+ /* Only cpu in slibing idle group will do the checking */ -+ if (!cpumask_test_cpu(cpu_of(rq), &sched_cpu_sg_idle_mask)) -+ return; -+ -+ /* Find potential 
cpus which can migrate the currently running task */ -+ if (!cpumask_andnot(&chk, &sched_rq_pending_masks[SCHED_RQ_EMPTY], -+ &sched_rq_queued_masks[SCHED_RQ_EMPTY])) -+ return; -+ -+ for_each_cpu(i, &chk) { -+ /* skip the cpu which has idle slibing cpu */ -+ if (cpumask_test_cpu(per_cpu(sched_sibling_cpu, i), -+ &sched_rq_queued_masks[SCHED_RQ_EMPTY])) -+ continue; -+ pds_sg_balance_trigger(i); -+ } -+} -+DEFINE_PER_CPU(unsigned long, thermal_pressure); -+ -+void arch_set_thermal_pressure(struct cpumask *cpus, -+ unsigned long th_pressure) -+{ -+ int cpu; -+ -+ for_each_cpu(cpu, cpus) -+ WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure); -+} -+#endif /* CONFIG_SCHED_SMT */ -+#endif /* CONFIG_SMP */ -+ -+/* -+ * This function gets called by the timer code, with HZ frequency. -+ * We call it with interrupts disabled. -+ */ -+void scheduler_tick(void) -+{ -+ int cpu __maybe_unused = smp_processor_id(); -+ struct rq *rq = cpu_rq(cpu); -+ -+ arch_scale_freq_tick(); -+ sched_clock_tick(); -+ -+ raw_spin_lock(&rq->lock); -+ update_rq_clock(rq); -+ -+ pds_scheduler_task_tick(rq); -+ update_sched_rq_queued_masks_normal(rq); -+ calc_global_load_tick(rq); -+ psi_task_tick(rq); -+ -+ rq->last_tick = rq->clock; -+ raw_spin_unlock(&rq->lock); -+ -+ perf_event_task_tick(); -+} -+ -+#ifdef CONFIG_NO_HZ_FULL -+struct tick_work { -+ int cpu; -+ atomic_t state; -+ struct delayed_work work; -+}; -+/* Values for ->state, see diagram below. 
*/ -+#define TICK_SCHED_REMOTE_OFFLINE 0 -+#define TICK_SCHED_REMOTE_OFFLINING 1 -+#define TICK_SCHED_REMOTE_RUNNING 2 -+ -+/* -+ * State diagram for ->state: -+ * -+ * -+ * TICK_SCHED_REMOTE_OFFLINE -+ * | ^ -+ * | | -+ * | | sched_tick_remote() -+ * | | -+ * | | -+ * +--TICK_SCHED_REMOTE_OFFLINING -+ * | ^ -+ * | | -+ * sched_tick_start() | | sched_tick_stop() -+ * | | -+ * V | -+ * TICK_SCHED_REMOTE_RUNNING -+ * -+ * -+ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() -+ * and sched_tick_start() are happy to leave the state in RUNNING. -+ */ -+ -+static struct tick_work __percpu *tick_work_cpu; -+ -+static void sched_tick_remote(struct work_struct *work) -+{ -+ struct delayed_work *dwork = to_delayed_work(work); -+ struct tick_work *twork = container_of(dwork, struct tick_work, work); -+ int cpu = twork->cpu; -+ struct rq *rq = cpu_rq(cpu); -+ struct task_struct *curr; -+ unsigned long flags; -+ u64 delta; -+ int os; -+ -+ /* -+ * Handle the tick only if it appears the remote CPU is running in full -+ * dynticks mode. The check is racy by nature, but missing a tick or -+ * having one too much is no big deal because the scheduler tick updates -+ * statistics and checks timeslices in a time-independent way, regardless -+ * of when exactly it is running. -+ */ -+ if (!tick_nohz_tick_stopped_cpu(cpu)) -+ goto out_requeue; -+ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ curr = rq->curr; -+ if (cpu_is_offline(cpu)) -+ goto out_unlock; -+ -+ update_rq_clock(rq); -+ if (!is_idle_task(curr)) { -+ /* -+ * Make sure the next tick runs within a reasonable -+ * amount of time. -+ */ -+ delta = rq_clock_task(rq) - curr->last_ran; -+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); -+ } -+ pds_scheduler_task_tick(rq); -+ update_sched_rq_queued_masks_normal(rq); -+ calc_load_nohz_remote(rq); -+ -+out_unlock: -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+out_requeue: -+ /* -+ * Run the remote tick once per second (1Hz). 
This arbitrary -+ * frequency is large enough to avoid overload but short enough -+ * to keep scheduler internal stats reasonably up to date. But -+ * first update state to reflect hotplug activity if required. -+ */ -+ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); -+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); -+ if (os == TICK_SCHED_REMOTE_RUNNING) -+ queue_delayed_work(system_unbound_wq, dwork, HZ); -+} -+ -+static void sched_tick_start(int cpu) -+{ -+ int os; -+ struct tick_work *twork; -+ -+ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) -+ return; -+ -+ WARN_ON_ONCE(!tick_work_cpu); -+ -+ twork = per_cpu_ptr(tick_work_cpu, cpu); -+ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); -+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); -+ if (os == TICK_SCHED_REMOTE_OFFLINE) { -+ twork->cpu = cpu; -+ INIT_DELAYED_WORK(&twork->work, sched_tick_remote); -+ queue_delayed_work(system_unbound_wq, &twork->work, HZ); -+ } -+} -+ -+#ifdef CONFIG_HOTPLUG_CPU -+static void sched_tick_stop(int cpu) -+{ -+ struct tick_work *twork; -+ -+ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) -+ return; -+ -+ WARN_ON_ONCE(!tick_work_cpu); -+ -+ twork = per_cpu_ptr(tick_work_cpu, cpu); -+ cancel_delayed_work_sync(&twork->work); -+} -+#endif /* CONFIG_HOTPLUG_CPU */ -+ -+int __init sched_tick_offload_init(void) -+{ -+ tick_work_cpu = alloc_percpu(struct tick_work); -+ BUG_ON(!tick_work_cpu); -+ return 0; -+} -+ -+#else /* !CONFIG_NO_HZ_FULL */ -+static inline void sched_tick_start(int cpu) { } -+static inline void sched_tick_stop(int cpu) { } -+#endif -+ -+#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ -+ defined(CONFIG_PREEMPT_TRACER)) -+/* -+ * If the value passed in is equal to the current preempt count -+ * then we just disabled preemption. Start timing the latency. 
-+ */ -+static inline void preempt_latency_start(int val) -+{ -+ if (preempt_count() == val) { -+ unsigned long ip = get_lock_parent_ip(); -+#ifdef CONFIG_DEBUG_PREEMPT -+ current->preempt_disable_ip = ip; -+#endif -+ trace_preempt_off(CALLER_ADDR0, ip); -+ } -+} -+ -+void preempt_count_add(int val) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Underflow? -+ */ -+ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) -+ return; -+#endif -+ __preempt_count_add(val); -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Spinlock count overflowing soon? -+ */ -+ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= -+ PREEMPT_MASK - 10); -+#endif -+ preempt_latency_start(val); -+} -+EXPORT_SYMBOL(preempt_count_add); -+NOKPROBE_SYMBOL(preempt_count_add); -+ -+/* -+ * If the value passed in equals to the current preempt count -+ * then we just enabled preemption. Stop timing the latency. -+ */ -+static inline void preempt_latency_stop(int val) -+{ -+ if (preempt_count() == val) -+ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); -+} -+ -+void preempt_count_sub(int val) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Underflow? -+ */ -+ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) -+ return; -+ /* -+ * Is the spinlock portion underflowing? -+ */ -+ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && -+ !(preempt_count() & PREEMPT_MASK))) -+ return; -+#endif -+ -+ preempt_latency_stop(val); -+ __preempt_count_sub(val); -+} -+EXPORT_SYMBOL(preempt_count_sub); -+NOKPROBE_SYMBOL(preempt_count_sub); -+ -+#else -+static inline void preempt_latency_start(int val) { } -+static inline void preempt_latency_stop(int val) { } -+#endif -+ -+/* -+ * Timeslices below RESCHED_US are considered as good as expired as there's no -+ * point rescheduling when there's so little time left. 
SCHED_BATCH tasks -+ * have been flagged be not latency sensitive and likely to be fully CPU -+ * bound so every time they're rescheduled they have their time_slice -+ * refilled, but get a new later deadline to have little effect on -+ * SCHED_NORMAL tasks. -+ -+ */ -+static inline void check_deadline(struct task_struct *p, struct rq *rq) -+{ -+ if (rq->idle == p) -+ return; -+ -+ pds_update_curr(rq, p); -+ -+ if (p->time_slice < RESCHED_US) { -+ time_slice_expired(p, rq); -+ if (SCHED_ISO == p->policy && ISO_PRIO == p->prio) { -+ p->prio = NORMAL_PRIO; -+ p->deadline = rq->clock + task_deadline_diff(p); -+ update_task_priodl(p); -+ } -+ if (SCHED_FIFO != p->policy && task_on_rq_queued(p)) -+ requeue_task(p, rq); -+ } -+} -+ -+#ifdef CONFIG_SMP -+ -+#define SCHED_RQ_NR_MIGRATION (32UL) -+/* -+ * Migrate pending tasks in @rq to @dest_cpu -+ * Will try to migrate mininal of half of @rq nr_running tasks and -+ * SCHED_RQ_NR_MIGRATION to @dest_cpu -+ */ -+static inline int -+migrate_pending_tasks(struct rq *rq, struct rq *dest_rq, int filter_prio) -+{ -+ struct task_struct *p; -+ int dest_cpu = cpu_of(dest_rq); -+ int nr_migrated = 0; -+ int nr_tries = min((rq->nr_running + 1) / 2, SCHED_RQ_NR_MIGRATION); -+ struct skiplist_node *node = rq->sl_header.next[0]; -+ -+ while (nr_tries && node != &rq->sl_header) { -+ p = skiplist_entry(node, struct task_struct, sl_node); -+ node = node->next[0]; -+ -+ if (task_running(p)) -+ continue; -+ if (p->prio >= filter_prio) -+ break; -+ if (cpumask_test_cpu(dest_cpu, &p->cpus_mask)) { -+ dequeue_task(p, rq, 0); -+ set_task_cpu(p, dest_cpu); -+ enqueue_task(p, dest_rq, 0); -+ nr_migrated++; -+ } -+ nr_tries--; -+ /* make a jump */ -+ if (node == &rq->sl_header) -+ break; -+ node = node->next[0]; -+ } -+ -+ return nr_migrated; -+} -+ -+static inline int -+take_queued_task_cpumask(struct rq *rq, cpumask_t *chk_mask, int filter_prio) -+{ -+ int src_cpu; -+ -+ for_each_cpu(src_cpu, chk_mask) { -+ int nr_migrated; -+ struct rq *src_rq = 
cpu_rq(src_cpu); -+ -+ if (!do_raw_spin_trylock(&src_rq->lock)) { -+ if (PRIO_LIMIT == filter_prio) -+ continue; -+ return 0; -+ } -+ spin_acquire(&src_rq->lock.dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); -+ -+ update_rq_clock(src_rq); -+ if ((nr_migrated = migrate_pending_tasks(src_rq, rq, filter_prio))) -+ cpufreq_update_this_cpu(rq, 0); -+ -+ spin_release(&src_rq->lock.dep_map, _RET_IP_); -+ do_raw_spin_unlock(&src_rq->lock); -+ -+ if (nr_migrated || PRIO_LIMIT != filter_prio) -+ return nr_migrated; -+ } -+ return 0; -+} -+ -+static inline int take_other_rq_task(struct rq *rq, int cpu, int filter_prio) -+{ -+ struct cpumask *affinity_mask, *end; -+ struct cpumask chk; -+ -+ if (PRIO_LIMIT == filter_prio) { -+ cpumask_complement(&chk, &sched_rq_pending_masks[SCHED_RQ_EMPTY]); -+#ifdef CONFIG_SMT_NICE -+ { -+ /* also try to take IDLE priority tasks from smt supressed cpu */ -+ struct cpumask t; -+ if (cpumask_and(&t, &sched_smt_supressed_mask, -+ &sched_rq_queued_masks[SCHED_RQ_IDLE])) -+ cpumask_or(&chk, &chk, &t); -+ } -+#endif -+ } else if (NORMAL_PRIO == filter_prio) { -+ cpumask_or(&chk, &sched_rq_pending_masks[SCHED_RQ_RT], -+ &sched_rq_pending_masks[SCHED_RQ_ISO]); -+ } else if (IDLE_PRIO == filter_prio) { -+ cpumask_complement(&chk, &sched_rq_pending_masks[SCHED_RQ_EMPTY]); -+ cpumask_andnot(&chk, &chk, &sched_rq_pending_masks[SCHED_RQ_IDLE]); -+ } else -+ cpumask_copy(&chk, &sched_rq_pending_masks[SCHED_RQ_RT]); -+ -+ if (cpumask_empty(&chk)) -+ return 0; -+ -+ affinity_mask = per_cpu(sched_cpu_llc_start_mask, cpu); -+ end = per_cpu(sched_cpu_affinity_chk_end_masks, cpu); -+ do { -+ struct cpumask tmp; -+ -+ if (cpumask_and(&tmp, &chk, affinity_mask) && -+ take_queued_task_cpumask(rq, &tmp, filter_prio)) -+ return 1; -+ } while (++affinity_mask < end); -+ -+ return 0; -+} -+#endif -+ -+static inline struct task_struct * -+choose_next_task(struct rq *rq, int cpu, struct task_struct *prev) -+{ -+ struct task_struct *next = rq_first_queued_task(rq); -+ 
-+#ifdef CONFIG_SMT_NICE -+ if (cpumask_test_cpu(cpu, &sched_smt_supressed_mask)) { -+ if (next->prio >= IDLE_PRIO) { -+ if (rq->online && -+ take_other_rq_task(rq, cpu, IDLE_PRIO)) -+ return rq_first_queued_task(rq); -+ return rq->idle; -+ } -+ } -+#endif -+ -+#ifdef CONFIG_SMP -+ if (likely(rq->online)) -+ if (take_other_rq_task(rq, cpu, next->prio)) { -+ resched_curr(rq); -+ return rq_first_queued_task(rq); -+ } -+#endif -+ return next; -+} -+ -+static inline unsigned long get_preempt_disable_ip(struct task_struct *p) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ return p->preempt_disable_ip; -+#else -+ return 0; -+#endif -+} -+ -+/* -+ * Print scheduling while atomic bug: -+ */ -+static noinline void __schedule_bug(struct task_struct *prev) -+{ -+ /* Save this before calling printk(), since that will clobber it */ -+ unsigned long preempt_disable_ip = get_preempt_disable_ip(current); -+ -+ if (oops_in_progress) -+ return; -+ -+ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", -+ prev->comm, prev->pid, preempt_count()); -+ -+ debug_show_held_locks(prev); -+ print_modules(); -+ if (irqs_disabled()) -+ print_irqtrace_events(prev); -+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) -+ && in_atomic_preempt_off()) { -+ pr_err("Preemption disabled at:"); -+ print_ip_sym(KERN_ERR, preempt_disable_ip); -+ } -+ if (panic_on_warn) -+ panic("scheduling while atomic\n"); -+ -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+ -+/* -+ * Various schedule()-time debugging checks and statistics: -+ */ -+static inline void schedule_debug(struct task_struct *prev, bool preempt) -+{ -+#ifdef CONFIG_SCHED_STACK_END_CHECK -+ if (task_stack_end_corrupted(prev)) -+ panic("corrupted stack end detected inside scheduler\n"); -+#endif -+ -+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP -+ if (!preempt && prev->state && prev->non_block_count) { -+ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", -+ prev->comm, prev->pid, prev->non_block_count); -+ dump_stack(); -+ 
add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+ } -+#endif -+ -+ if (unlikely(in_atomic_preempt_off())) { -+ __schedule_bug(prev); -+ preempt_count_set(PREEMPT_DISABLED); -+ } -+ rcu_sleep_check(); -+ -+ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); -+ -+ schedstat_inc(this_rq()->sched_count); -+} -+ -+static inline void set_rq_task(struct rq *rq, struct task_struct *p) -+{ -+ p->last_ran = rq->clock_task; -+ -+#ifdef CONFIG_HIGH_RES_TIMERS -+ if (p != rq->idle) -+ hrtick_start(rq, US_TO_NS(p->time_slice)); -+#endif -+ /* update rq->dither */ -+ rq->dither = rq_dither(rq); -+} -+ -+/* -+ * schedule() is the main scheduler function. -+ * -+ * The main means of driving the scheduler and thus entering this function are: -+ * -+ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. -+ * -+ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return -+ * paths. For example, see arch/x86/entry_64.S. -+ * -+ * To drive preemption between tasks, the scheduler sets the flag in timer -+ * interrupt handler scheduler_tick(). -+ * -+ * 3. Wakeups don't really cause entry into schedule(). They add a -+ * task to the run-queue and that's it. -+ * -+ * Now, if the new task added to the run-queue preempts the current -+ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets -+ * called on the nearest possible occasion: -+ * -+ * - If the kernel is preemptible (CONFIG_PREEMPTION=y): -+ * -+ * - in syscall or exception context, at the next outmost -+ * preempt_enable(). (this might be as soon as the wake_up()'s -+ * spin_unlock()!) -+ * -+ * - in IRQ context, return from interrupt-handler to -+ * preemptible context -+ * -+ * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) -+ * then at the next: -+ * -+ * - cond_resched() call -+ * - explicit schedule() call -+ * - return from syscall or exception to user-space -+ * - return from interrupt-handler to user-space -+ * -+ * WARNING: must be called with preemption disabled! 
-+ */ -+static void __sched notrace __schedule(bool preempt) -+{ -+ struct task_struct *prev, *next; -+ unsigned long *switch_count; -+ struct rq *rq; -+ int cpu; -+ -+ cpu = smp_processor_id(); -+ rq = cpu_rq(cpu); -+ prev = rq->curr; -+ -+ schedule_debug(prev, preempt); -+ -+ /* by passing sched_feat(HRTICK) checking which PDS doesn't support */ -+ hrtick_clear(rq); -+ -+ local_irq_disable(); -+ rcu_note_context_switch(preempt); -+ -+ /* -+ * Make sure that signal_pending_state()->signal_pending() below -+ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) -+ * done by the caller to avoid the race with signal_wake_up(). -+ * -+ * The membarrier system call requires a full memory barrier -+ * after coming from user-space, before storing to rq->curr. -+ */ -+ raw_spin_lock(&rq->lock); -+ smp_mb__after_spinlock(); -+ -+ update_rq_clock(rq); -+ -+ switch_count = &prev->nivcsw; -+ if (!preempt && prev->state) { -+ if (signal_pending_state(prev->state, prev)) { -+ prev->state = TASK_RUNNING; -+ } else { -+ deactivate_task(prev, rq); -+ -+ if (prev->in_iowait) { -+ atomic_inc(&rq->nr_iowait); -+ delayacct_blkio_start(); -+ } -+ } -+ switch_count = &prev->nvcsw; -+ } -+ -+ clear_tsk_need_resched(prev); -+ clear_preempt_need_resched(); -+ -+ check_deadline(prev, rq); -+ -+ next = choose_next_task(rq, cpu, prev); -+ -+ set_rq_task(rq, next); -+ -+ if (prev != next) { -+ if (next->prio == PRIO_LIMIT) -+ schedstat_inc(rq->sched_goidle); -+ -+ /* -+ * RCU users of rcu_dereference(rq->curr) may not see -+ * changes to task_struct made by pick_next_task(). -+ */ -+ RCU_INIT_POINTER(rq->curr, next); -+ /* -+ * The membarrier system call requires each architecture -+ * to have a full memory barrier after updating -+ * rq->curr, before returning to user-space. -+ * -+ * Here are the schemes providing that barrier on the -+ * various architectures: -+ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. 
-+ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. -+ * - finish_lock_switch() for weakly-ordered -+ * architectures where spin_unlock is a full barrier, -+ * - switch_to() for arm64 (weakly-ordered, spin_unlock -+ * is a RELEASE barrier), -+ */ -+ ++*switch_count; -+ rq->nr_switches++; -+ -+ psi_sched_switch(prev, next, !task_on_rq_queued(prev)); -+ -+ trace_sched_switch(preempt, prev, next); -+ -+ /* Also unlocks the rq: */ -+ rq = context_switch(rq, prev, next); -+#ifdef CONFIG_SCHED_SMT -+ pds_sg_balance_check(rq); -+#endif -+ } else -+ raw_spin_unlock_irq(&rq->lock); -+} -+ -+void __noreturn do_task_dead(void) -+{ -+ /* Causes final put_task_struct in finish_task_switch(): */ -+ set_special_state(TASK_DEAD); -+ -+ /* Tell freezer to ignore us: */ -+ current->flags |= PF_NOFREEZE; -+ __schedule(false); -+ -+ BUG(); -+ -+ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ -+ for (;;) -+ cpu_relax(); -+} -+ -+static inline void sched_submit_work(struct task_struct *tsk) -+{ -+ if (!tsk->state || tsk_is_pi_blocked(tsk) || -+ signal_pending_state(tsk->state, tsk)) -+ return; -+ -+ /* -+ * If a worker went to sleep, notify and ask workqueue whether -+ * it wants to wake up a task to maintain concurrency. -+ * As this function is called inside the schedule() context, -+ * we disable preemption to avoid it calling schedule() again -+ * in the possible wakeup of a kworker. -+ */ -+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { -+ preempt_disable(); -+ if (tsk->flags & PF_WQ_WORKER) -+ wq_worker_sleeping(tsk); -+ else -+ io_wq_worker_sleeping(tsk); -+ preempt_enable_no_resched(); -+ } -+ -+ /* -+ * If we are going to sleep and we have plugged IO queued, -+ * make sure to submit it to avoid deadlocks. 
-+ */ -+ if (blk_needs_flush_plug(tsk)) -+ blk_schedule_flush_plug(tsk); -+} -+ -+static void sched_update_worker(struct task_struct *tsk) -+{ -+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { -+ if (tsk->flags & PF_WQ_WORKER) -+ wq_worker_running(tsk); -+ else -+ io_wq_worker_running(tsk); -+ } -+} -+ -+asmlinkage __visible void __sched schedule(void) -+{ -+ struct task_struct *tsk = current; -+ -+ sched_submit_work(tsk); -+ do { -+ preempt_disable(); -+ __schedule(false); -+ sched_preempt_enable_no_resched(); -+ } while (need_resched()); -+ sched_update_worker(tsk); -+} -+EXPORT_SYMBOL(schedule); -+ -+/* -+ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted -+ * state (have scheduled out non-voluntarily) by making sure that all -+ * tasks have either left the run queue or have gone into user space. -+ * As idle tasks do not do either, they must not ever be preempted -+ * (schedule out non-voluntarily). -+ * -+ * schedule_idle() is similar to schedule_preempt_disable() except that it -+ * never enables preemption because it does not call sched_submit_work(). -+ */ -+void __sched schedule_idle(void) -+{ -+ /* -+ * As this skips calling sched_submit_work(), which the idle task does -+ * regardless because that function is a nop when the task is in a -+ * TASK_RUNNING state, make sure this isn't used someplace that the -+ * current task can be in any other state. Note, idle is always in the -+ * TASK_RUNNING state. -+ */ -+ WARN_ON_ONCE(current->state); -+ do { -+ __schedule(false); -+ } while (need_resched()); -+} -+ -+#ifdef CONFIG_CONTEXT_TRACKING -+asmlinkage __visible void __sched schedule_user(void) -+{ -+ /* -+ * If we come here after a random call to set_need_resched(), -+ * or we have been woken up remotely but the IPI has not yet arrived, -+ * we haven't yet exited the RCU idle mode. Do it here manually until -+ * we find a better solution. -+ * -+ * NB: There are buggy callers of this function. 
Ideally we -+ * should warn if prev_state != CONTEXT_USER, but that will trigger -+ * too frequently to make sense yet. -+ */ -+ enum ctx_state prev_state = exception_enter(); -+ schedule(); -+ exception_exit(prev_state); -+} -+#endif -+ -+/** -+ * schedule_preempt_disabled - called with preemption disabled -+ * -+ * Returns with preemption disabled. Note: preempt_count must be 1 -+ */ -+void __sched schedule_preempt_disabled(void) -+{ -+ sched_preempt_enable_no_resched(); -+ schedule(); -+ preempt_disable(); -+} -+ -+static void __sched notrace preempt_schedule_common(void) -+{ -+ do { -+ /* -+ * Because the function tracer can trace preempt_count_sub() -+ * and it also uses preempt_enable/disable_notrace(), if -+ * NEED_RESCHED is set, the preempt_enable_notrace() called -+ * by the function tracer will call this function again and -+ * cause infinite recursion. -+ * -+ * Preemption must be disabled here before the function -+ * tracer can trace. Break up preempt_disable() into two -+ * calls. One to disable preemption without fear of being -+ * traced. The other to still record the preemption latency, -+ * which can also be traced by the function tracer. -+ */ -+ preempt_disable_notrace(); -+ preempt_latency_start(1); -+ __schedule(true); -+ preempt_latency_stop(1); -+ preempt_enable_no_resched_notrace(); -+ -+ /* -+ * Check again in case we missed a preemption opportunity -+ * between schedule and now. -+ */ -+ } while (need_resched()); -+} -+ -+#ifdef CONFIG_PREEMPTION -+/* -+ * This is the entry point to schedule() from in-kernel preemption -+ * off of preempt_enable. -+ */ -+asmlinkage __visible void __sched notrace preempt_schedule(void) -+{ -+ /* -+ * If there is a non-zero preempt_count or interrupts are disabled, -+ * we do not want to preempt the current task. Just return.. 
-+ */ -+ if (likely(!preemptible())) -+ return; -+ -+ preempt_schedule_common(); -+} -+NOKPROBE_SYMBOL(preempt_schedule); -+EXPORT_SYMBOL(preempt_schedule); -+ -+/** -+ * preempt_schedule_notrace - preempt_schedule called by tracing -+ * -+ * The tracing infrastructure uses preempt_enable_notrace to prevent -+ * recursion and tracing preempt enabling caused by the tracing -+ * infrastructure itself. But as tracing can happen in areas coming -+ * from userspace or just about to enter userspace, a preempt enable -+ * can occur before user_exit() is called. This will cause the scheduler -+ * to be called when the system is still in usermode. -+ * -+ * To prevent this, the preempt_enable_notrace will use this function -+ * instead of preempt_schedule() to exit user context if needed before -+ * calling the scheduler. -+ */ -+asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) -+{ -+ enum ctx_state prev_ctx; -+ -+ if (likely(!preemptible())) -+ return; -+ -+ do { -+ /* -+ * Because the function tracer can trace preempt_count_sub() -+ * and it also uses preempt_enable/disable_notrace(), if -+ * NEED_RESCHED is set, the preempt_enable_notrace() called -+ * by the function tracer will call this function again and -+ * cause infinite recursion. -+ * -+ * Preemption must be disabled here before the function -+ * tracer can trace. Break up preempt_disable() into two -+ * calls. One to disable preemption without fear of being -+ * traced. The other to still record the preemption latency, -+ * which can also be traced by the function tracer. -+ */ -+ preempt_disable_notrace(); -+ preempt_latency_start(1); -+ /* -+ * Needs preempt disabled in case user_exit() is traced -+ * and the tracer calls preempt_enable_notrace() causing -+ * an infinite recursion. 
-+ */ -+ prev_ctx = exception_enter(); -+ __schedule(true); -+ exception_exit(prev_ctx); -+ -+ preempt_latency_stop(1); -+ preempt_enable_no_resched_notrace(); -+ } while (need_resched()); -+} -+EXPORT_SYMBOL_GPL(preempt_schedule_notrace); -+ -+#endif /* CONFIG_PREEMPTION */ -+ -+/* -+ * This is the entry point to schedule() from kernel preemption -+ * off of irq context. -+ * Note, that this is called and return with irqs disabled. This will -+ * protect us against recursive calling from irq. -+ */ -+asmlinkage __visible void __sched preempt_schedule_irq(void) -+{ -+ enum ctx_state prev_state; -+ -+ /* Catch callers which need to be fixed */ -+ BUG_ON(preempt_count() || !irqs_disabled()); -+ -+ prev_state = exception_enter(); -+ -+ do { -+ preempt_disable(); -+ local_irq_enable(); -+ __schedule(true); -+ local_irq_disable(); -+ sched_preempt_enable_no_resched(); -+ } while (need_resched()); -+ -+ exception_exit(prev_state); -+} -+ -+int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, -+ void *key) -+{ -+ return try_to_wake_up(curr->private, mode, wake_flags); -+} -+EXPORT_SYMBOL(default_wake_function); -+ -+static inline void -+check_task_changed(struct rq *rq, struct task_struct *p) -+{ -+ /* -+ * Trigger changes when task priority/deadline modified. 
-+ */ -+ if (task_on_rq_queued(p)) { -+ struct task_struct *first; -+ -+ requeue_task(p, rq); -+ -+ /* Resched if first queued task not running and not IDLE */ -+ if ((first = rq_first_queued_task(rq)) != rq->curr && -+ !task_running_idle(first)) -+ resched_curr(rq); -+ } -+} -+ -+#ifdef CONFIG_RT_MUTEXES -+ -+static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) -+{ -+ if (pi_task) -+ prio = min(prio, pi_task->prio); -+ -+ return prio; -+} -+ -+static inline int rt_effective_prio(struct task_struct *p, int prio) -+{ -+ struct task_struct *pi_task = rt_mutex_get_top_task(p); -+ -+ return __rt_effective_prio(pi_task, prio); -+} -+ -+/* -+ * rt_mutex_setprio - set the current priority of a task -+ * @p: task to boost -+ * @pi_task: donor task -+ * -+ * This function changes the 'effective' priority of a task. It does -+ * not touch ->normal_prio like __setscheduler(). -+ * -+ * Used by the rt_mutex code to implement priority inheritance -+ * logic. Call site only calls if the priority of the task changed. -+ */ -+void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) -+{ -+ int prio; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ -+ /* XXX used to be waiter->prio, not waiter->task->prio */ -+ prio = __rt_effective_prio(pi_task, p->normal_prio); -+ -+ /* -+ * If nothing changed; bail early. -+ */ -+ if (p->pi_top_task == pi_task && prio == p->prio) -+ return; -+ -+ rq = __task_access_lock(p, &lock); -+ /* -+ * Set under pi_lock && rq->lock, such that the value can be used under -+ * either lock. -+ * -+ * Note that there is loads of tricky to make this pointer cache work -+ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to -+ * ensure a task is de-boosted (pi_task is set to NULL) before the -+ * task is allowed to run again (and can exit). This ensures the pointer -+ * points to a blocked task -- which guaratees the task is present. 
-+ */ -+ p->pi_top_task = pi_task; -+ -+ /* -+ * For FIFO/RR we only need to set prio, if that matches we're done. -+ */ -+ if (prio == p->prio) -+ goto out_unlock; -+ -+ /* -+ * Idle task boosting is a nono in general. There is one -+ * exception, when PREEMPT_RT and NOHZ is active: -+ * -+ * The idle task calls get_next_timer_interrupt() and holds -+ * the timer wheel base->lock on the CPU and another CPU wants -+ * to access the timer (probably to cancel it). We can safely -+ * ignore the boosting request, as the idle CPU runs this code -+ * with interrupts disabled and will complete the lock -+ * protected section without being interrupted. So there is no -+ * real need to boost. -+ */ -+ if (unlikely(p == rq->idle)) { -+ WARN_ON(p != rq->curr); -+ WARN_ON(p->pi_blocked_on); -+ goto out_unlock; -+ } -+ -+ trace_sched_pi_setprio(p, pi_task); -+ p->prio = prio; -+ update_task_priodl(p); -+ -+ check_task_changed(rq, p); -+ -+out_unlock: -+ __task_access_unlock(p, lock); -+} -+#else -+static inline int rt_effective_prio(struct task_struct *p, int prio) -+{ -+ return prio; -+} -+#endif -+ -+void set_user_nice(struct task_struct *p, long nice) -+{ -+ int new_static; -+ unsigned long flags; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ -+ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) -+ return; -+ new_static = NICE_TO_PRIO(nice); -+ /* -+ * We have to be careful, if called from sys_setpriority(), -+ * the task might be in the middle of scheduling on another CPU. -+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ rq = __task_access_lock(p, &lock); -+ -+ /* rq lock may not held!! 
*/ -+ update_rq_clock(rq); -+ -+ p->static_prio = new_static; -+ /* -+ * The RT priorities are set via sched_setscheduler(), but we still -+ * allow the 'normal' nice value to be set - but as expected -+ * it wont have any effect on scheduling until the task is -+ * not SCHED_NORMAL/SCHED_BATCH: -+ */ -+ if (task_has_rt_policy(p)) -+ goto out_unlock; -+ -+ p->deadline -= task_deadline_diff(p); -+ p->deadline += static_deadline_diff(new_static); -+ p->prio = effective_prio(p); -+ update_task_priodl(p); -+ -+ check_task_changed(rq, p); -+out_unlock: -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+} -+EXPORT_SYMBOL(set_user_nice); -+ -+/* -+ * can_nice - check if a task can reduce its nice value -+ * @p: task -+ * @nice: nice value -+ */ -+int can_nice(const struct task_struct *p, const int nice) -+{ -+ /* Convert nice value [19,-20] to rlimit style value [1,40] */ -+ int nice_rlim = nice_to_rlimit(nice); -+ -+ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || -+ capable(CAP_SYS_NICE)); -+} -+ -+#ifdef __ARCH_WANT_SYS_NICE -+ -+/* -+ * sys_nice - change the priority of the current process. -+ * @increment: priority increment -+ * -+ * sys_setpriority is a more generic, but much slower function that -+ * does similar things. -+ */ -+SYSCALL_DEFINE1(nice, int, increment) -+{ -+ long nice, retval; -+ -+ /* -+ * Setpriority might change our priority at the same moment. -+ * We don't have to worry. Conceptually one call occurs first -+ * and we have a single winner. -+ */ -+ -+ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); -+ nice = task_nice(current) + increment; -+ -+ nice = clamp_val(nice, MIN_NICE, MAX_NICE); -+ if (increment < 0 && !can_nice(current, nice)) -+ return -EPERM; -+ -+ retval = security_task_setnice(current, nice); -+ if (retval) -+ return retval; -+ -+ set_user_nice(current, nice); -+ return 0; -+} -+ -+#endif -+ -+/** -+ * task_prio - return the priority value of a given task. 
-+ * @p: the task in question. -+ * -+ * Return: The priority value as seen by users in /proc. -+ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes -+ * from 0(SCHED_ISO) up to 82 (nice +19 SCHED_IDLE). -+ */ -+int task_prio(const struct task_struct *p) -+{ -+ int level, prio = p->prio - MAX_RT_PRIO; -+ static const int level_to_nice_prio[] = {39, 33, 26, 20, 14, 7, 0, 0}; -+ -+ /* rt tasks */ -+ if (prio <= 0) -+ goto out; -+ -+ preempt_disable(); -+ level = task_deadline_level(p, this_rq()); -+ preempt_enable(); -+ prio += level_to_nice_prio[level]; -+ if (idleprio_task(p)) -+ prio += NICE_WIDTH; -+out: -+ return prio; -+} -+ -+/** -+ * idle_cpu - is a given CPU idle currently? -+ * @cpu: the processor in question. -+ * -+ * Return: 1 if the CPU is currently idle. 0 otherwise. -+ */ -+int idle_cpu(int cpu) -+{ -+ return cpu_curr(cpu) == cpu_rq(cpu)->idle; -+} -+ -+/** -+ * idle_task - return the idle task for a given CPU. -+ * @cpu: the processor in question. -+ * -+ * Return: The idle task for the cpu @cpu. -+ */ -+struct task_struct *idle_task(int cpu) -+{ -+ return cpu_rq(cpu)->idle; -+} -+ -+/** -+ * find_process_by_pid - find a process with a matching PID value. -+ * @pid: the pid in question. -+ * -+ * The task of @pid, if found. %NULL otherwise. -+ */ -+static inline struct task_struct *find_process_by_pid(pid_t pid) -+{ -+ return pid ? find_task_by_vpid(pid) : current; -+} -+ -+#ifdef CONFIG_SMP -+void sched_set_stop_task(int cpu, struct task_struct *stop) -+{ -+ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; -+ struct sched_param start_param = { .sched_priority = 0 }; -+ struct task_struct *old_stop = cpu_rq(cpu)->stop; -+ -+ if (stop) { -+ /* -+ * Make it appear like a SCHED_FIFO task, its something -+ * userspace knows about and won't get confused about. -+ * -+ * Also, it will make PI more or less work without too -+ * much confusion -- but then, stop work should not -+ * rely on PI working anyway. 
-+ */ -+ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); -+ } -+ -+ cpu_rq(cpu)->stop = stop; -+ -+ if (old_stop) { -+ /* -+ * Reset it back to a normal scheduling policy so that -+ * it can die in pieces. -+ */ -+ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); -+ } -+} -+ -+/* -+ * Change a given task's CPU affinity. Migrate the thread to a -+ * proper CPU and schedule it away if the CPU it's executing on -+ * is removed from the allowed bitmask. -+ * -+ * NOTE: the caller must have a valid reference to the task, the -+ * task must not exit() & deallocate itself prematurely. The -+ * call is not atomic; no spinlocks may be held. -+ */ -+static int __set_cpus_allowed_ptr(struct task_struct *p, -+ const struct cpumask *new_mask, bool check) -+{ -+ const struct cpumask *cpu_valid_mask = cpu_active_mask; -+ int dest_cpu; -+ unsigned long flags; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ int ret = 0; -+ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ rq = __task_access_lock(p, &lock); -+ -+ if (p->flags & PF_KTHREAD) { -+ /* -+ * Kernel threads are allowed on online && !active CPUs -+ */ -+ cpu_valid_mask = cpu_online_mask; -+ } -+ -+ /* -+ * Must re-check here, to close a race against __kthread_bind(), -+ * sched_setaffinity() is not guaranteed to observe the flag. -+ */ -+ if (check && (p->flags & PF_NO_SETAFFINITY)) { -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ if (cpumask_equal(&p->cpus_mask, new_mask)) -+ goto out; -+ -+ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); -+ if (dest_cpu >= nr_cpu_ids) { -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ do_set_cpus_allowed(p, new_mask); -+ -+ if (p->flags & PF_KTHREAD) { -+ /* -+ * For kernel threads that do indeed end up on online && -+ * !active we want to ensure they are strict per-CPU threads. 
-+ */ -+ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && -+ !cpumask_intersects(new_mask, cpu_active_mask) && -+ p->nr_cpus_allowed != 1); -+ } -+ -+ /* Can the task run on the task's current CPU? If so, we're done */ -+ if (cpumask_test_cpu(task_cpu(p), new_mask)) -+ goto out; -+ -+ if (task_running(p) || p->state == TASK_WAKING) { -+ struct migration_arg arg = { p, dest_cpu }; -+ -+ /* Need help from migration thread: drop lock and wait. */ -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); -+ return 0; -+ } -+ if (task_on_rq_queued(p)) { -+ /* -+ * OK, since we're going to drop the lock immediately -+ * afterwards anyway. -+ */ -+ update_rq_clock(rq); -+ rq = move_queued_task(rq, p, dest_cpu); -+ lock = &rq->lock; -+ } -+ -+out: -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+ return ret; -+} -+ -+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ return __set_cpus_allowed_ptr(p, new_mask, false); -+} -+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); -+ -+#else -+static inline int -+__set_cpus_allowed_ptr(struct task_struct *p, -+ const struct cpumask *new_mask, bool check) -+{ -+ return set_cpus_allowed_ptr(p, new_mask); -+} -+#endif -+ -+static u64 task_init_deadline(const struct task_struct *p) -+{ -+ return task_rq(p)->clock + task_deadline_diff(p); -+} -+ -+u64 (* task_init_deadline_func_tbl[])(const struct task_struct *p) = { -+ task_init_deadline, /* SCHED_NORMAL */ -+ NULL, /* SCHED_FIFO */ -+ NULL, /* SCHED_RR */ -+ task_init_deadline, /* SCHED_BATCH */ -+ NULL, /* SCHED_ISO */ -+ task_init_deadline /* SCHED_IDLE */ -+}; -+ -+/* -+ * sched_setparam() passes in -1 for its policy, to let the functions -+ * it calls know not to change it. 
-+ */ -+#define SETPARAM_POLICY -1 -+ -+static void __setscheduler_params(struct task_struct *p, -+ const struct sched_attr *attr) -+{ -+ int old_policy = p->policy; -+ int policy = attr->sched_policy; -+ -+ if (policy == SETPARAM_POLICY) -+ policy = p->policy; -+ -+ p->policy = policy; -+ -+ /* -+ * allow normal nice value to be set, but will not have any -+ * effect on scheduling until the task not SCHED_NORMAL/ -+ * SCHED_BATCH -+ */ -+ p->static_prio = NICE_TO_PRIO(attr->sched_nice); -+ -+ /* -+ * __sched_setscheduler() ensures attr->sched_priority == 0 when -+ * !rt_policy. Always setting this ensures that things like -+ * getparam()/getattr() don't report silly values for !rt tasks. -+ */ -+ p->rt_priority = attr->sched_priority; -+ p->normal_prio = normal_prio(p); -+ -+ if (old_policy != policy) -+ p->deadline = (task_init_deadline_func_tbl[p->policy])? -+ task_init_deadline_func_tbl[p->policy](p):0ULL; -+} -+ -+/* Actually do priority change: must hold rq lock. */ -+static void __setscheduler(struct rq *rq, struct task_struct *p, -+ const struct sched_attr *attr, bool keep_boost) -+{ -+ __setscheduler_params(p, attr); -+ -+ /* -+ * Keep a potential priority boosting if called from -+ * sched_setscheduler(). 
-+ */ -+ p->prio = normal_prio(p); -+ if (keep_boost) -+ p->prio = rt_effective_prio(p, p->prio); -+ update_task_priodl(p); -+} -+ -+/* -+ * check the target process has a UID that matches the current process's -+ */ -+static bool check_same_owner(struct task_struct *p) -+{ -+ const struct cred *cred = current_cred(), *pcred; -+ bool match; -+ -+ rcu_read_lock(); -+ pcred = __task_cred(p); -+ match = (uid_eq(cred->euid, pcred->euid) || -+ uid_eq(cred->euid, pcred->uid)); -+ rcu_read_unlock(); -+ return match; -+} -+ -+static int -+__sched_setscheduler(struct task_struct *p, -+ const struct sched_attr *attr, bool user, bool pi) -+{ -+ const struct sched_attr dl_squash_attr = { -+ .size = sizeof(struct sched_attr), -+ .sched_policy = SCHED_FIFO, -+ .sched_nice = 0, -+ .sched_priority = 99, -+ }; -+ int newprio = MAX_RT_PRIO - 1 - attr->sched_priority; -+ int retval, oldpolicy = -1; -+ int policy = attr->sched_policy; -+ unsigned long flags; -+ struct rq *rq; -+ int reset_on_fork; -+ raw_spinlock_t *lock; -+ -+ /* The pi code expects interrupts enabled */ -+ BUG_ON(pi && in_interrupt()); -+ -+ /* -+ * PDS supports SCHED_DEADLINE by squash it as prio 0 SCHED_FIFO -+ */ -+ if (unlikely(SCHED_DEADLINE == policy)) { -+ attr = &dl_squash_attr; -+ policy = attr->sched_policy; -+ newprio = MAX_RT_PRIO - 1 - attr->sched_priority; -+ } -+recheck: -+ /* Double check policy once rq lock held */ -+ if (policy < 0) { -+ reset_on_fork = p->sched_reset_on_fork; -+ policy = oldpolicy = p->policy; -+ } else { -+ reset_on_fork = !!(attr->sched_flags & SCHED_RESET_ON_FORK); -+ -+ if (policy > SCHED_IDLE) -+ return -EINVAL; -+ } -+ -+ if (attr->sched_flags & ~(SCHED_FLAG_ALL)) -+ return -EINVAL; -+ -+ /* -+ * Valid priorities for SCHED_FIFO and SCHED_RR are -+ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and -+ * SCHED_BATCH and SCHED_IDLE is 0. 
-+ */ -+ if (attr->sched_priority < 0 || -+ (p->mm && attr->sched_priority > MAX_USER_RT_PRIO - 1) || -+ (!p->mm && attr->sched_priority > MAX_RT_PRIO - 1)) -+ return -EINVAL; -+ if ((SCHED_RR == policy || SCHED_FIFO == policy) != -+ (attr->sched_priority != 0)) -+ return -EINVAL; -+ -+ /* -+ * Allow unprivileged RT tasks to decrease priority: -+ */ -+ if (user && !capable(CAP_SYS_NICE)) { -+ if (SCHED_FIFO == policy || SCHED_RR == policy) { -+ unsigned long rlim_rtprio = -+ task_rlimit(p, RLIMIT_RTPRIO); -+ -+ /* Can't set/change the rt policy */ -+ if (policy != p->policy && !rlim_rtprio) -+ return -EPERM; -+ -+ /* Can't increase priority */ -+ if (attr->sched_priority > p->rt_priority && -+ attr->sched_priority > rlim_rtprio) -+ return -EPERM; -+ } -+ -+ /* Can't change other user's priorities */ -+ if (!check_same_owner(p)) -+ return -EPERM; -+ -+ /* Normal users shall not reset the sched_reset_on_fork flag */ -+ if (p->sched_reset_on_fork && !reset_on_fork) -+ return -EPERM; -+ } -+ -+ if (user) { -+ retval = security_task_setscheduler(p); -+ if (retval) -+ return retval; -+ } -+ -+ if (pi) -+ cpuset_read_lock(); -+ -+ /* -+ * Make sure no PI-waiters arrive (or leave) while we are -+ * changing the priority of the task: -+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ -+ /* -+ * To be able to change p->policy safely, task_access_lock() -+ * must be called. -+ * IF use task_access_lock() here: -+ * For the task p which is not running, reading rq->stop is -+ * racy but acceptable as ->stop doesn't change much. -+ * An enhancemnet can be made to read rq->stop saftly. 
-+ */ -+ rq = __task_access_lock(p, &lock); -+ -+ /* -+ * Changing the policy of the stop threads its a very bad idea -+ */ -+ if (p == rq->stop) { -+ retval = -EINVAL; -+ goto unlock; -+ } -+ -+ /* -+ * If not changing anything there's no need to proceed further: -+ */ -+ if (unlikely(policy == p->policy)) { -+ if (rt_policy(policy) && attr->sched_priority != p->rt_priority) -+ goto change; -+ if (!rt_policy(policy) && -+ NICE_TO_PRIO(attr->sched_nice) != p->static_prio) -+ goto change; -+ -+ p->sched_reset_on_fork = reset_on_fork; -+ retval = 0; -+ goto unlock; -+ } -+change: -+ -+ /* Re-check policy now with rq lock held */ -+ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { -+ policy = oldpolicy = -1; -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ if (pi) -+ cpuset_read_unlock(); -+ goto recheck; -+ } -+ -+ p->sched_reset_on_fork = reset_on_fork; -+ -+ if (pi) { -+ /* -+ * Take priority boosted tasks into account. If the new -+ * effective priority is unchanged, we just store the new -+ * normal parameters and do not touch the scheduler class and -+ * the runqueue. This will be done when the task deboost -+ * itself. 
-+ */ -+ if (rt_effective_prio(p, newprio) == p->prio) { -+ __setscheduler_params(p, attr); -+ retval = 0; -+ goto unlock; -+ } -+ } -+ -+ __setscheduler(rq, p, attr, pi); -+ -+ check_task_changed(rq, p); -+ -+ /* Avoid rq from going away on us: */ -+ preempt_disable(); -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+ if (pi) { -+ cpuset_read_unlock(); -+ rt_mutex_adjust_pi(p); -+ } -+ -+ preempt_enable(); -+ -+ return 0; -+ -+unlock: -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ if (pi) -+ cpuset_read_unlock(); -+ return retval; -+} -+ -+static int _sched_setscheduler(struct task_struct *p, int policy, -+ const struct sched_param *param, bool check) -+{ -+ struct sched_attr attr = { -+ .sched_policy = policy, -+ .sched_priority = param->sched_priority, -+ .sched_nice = PRIO_TO_NICE(p->static_prio), -+ }; -+ -+ /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ -+ if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { -+ attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; -+ policy &= ~SCHED_RESET_ON_FORK; -+ attr.sched_policy = policy; -+ } -+ -+ return __sched_setscheduler(p, &attr, check, true); -+} -+ -+/** -+ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. -+ * @p: the task in question. -+ * @policy: new policy. -+ * @param: structure containing the new RT priority. -+ * -+ * Return: 0 on success. An error code otherwise. -+ * -+ * NOTE that the task may be already dead. 
-+ */ -+int sched_setscheduler(struct task_struct *p, int policy, -+ const struct sched_param *param) -+{ -+ return _sched_setscheduler(p, policy, param, true); -+} -+ -+EXPORT_SYMBOL_GPL(sched_setscheduler); -+ -+int sched_setattr(struct task_struct *p, const struct sched_attr *attr) -+{ -+ return __sched_setscheduler(p, attr, true, true); -+} -+EXPORT_SYMBOL_GPL(sched_setattr); -+ -+int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) -+{ -+ return __sched_setscheduler(p, attr, false, true); -+} -+ -+/** -+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. -+ * @p: the task in question. -+ * @policy: new policy. -+ * @param: structure containing the new RT priority. -+ * -+ * Just like sched_setscheduler, only don't bother checking if the -+ * current context has permission. For example, this is needed in -+ * stop_machine(): we create temporary high priority worker threads, -+ * but our caller might not have that capability. -+ * -+ * Return: 0 on success. An error code otherwise. -+ */ -+int sched_setscheduler_nocheck(struct task_struct *p, int policy, -+ const struct sched_param *param) -+{ -+ return _sched_setscheduler(p, policy, param, false); -+} -+EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); -+ -+static int -+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) -+{ -+ struct sched_param lparam; -+ struct task_struct *p; -+ int retval; -+ -+ if (!param || pid < 0) -+ return -EINVAL; -+ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) -+ return -EFAULT; -+ -+ rcu_read_lock(); -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (likely(p)) -+ get_task_struct(p); -+ rcu_read_unlock(); -+ -+ if (likely(p)) { -+ retval = sched_setscheduler(p, policy, &lparam); -+ put_task_struct(p); -+ } -+ -+ return retval; -+} -+ -+/* -+ * Mimics kernel/events/core.c perf_copy_attr(). 
-+ */ -+static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr) -+{ -+ u32 size; -+ int ret; -+ -+ /* Zero the full structure, so that a short copy will be nice: */ -+ memset(attr, 0, sizeof(*attr)); -+ -+ ret = get_user(size, &uattr->size); -+ if (ret) -+ return ret; -+ -+ /* ABI compatibility quirk: */ -+ if (!size) -+ size = SCHED_ATTR_SIZE_VER0; -+ -+ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) -+ goto err_size; -+ -+ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); -+ if (ret) { -+ if (ret == -E2BIG) -+ goto err_size; -+ return ret; -+ } -+ -+ /* -+ * XXX: Do we want to be lenient like existing syscalls; or do we want -+ * to be strict and return an error on out-of-bounds values? -+ */ -+ attr->sched_nice = clamp(attr->sched_nice, -20, 19); -+ -+ /* sched/core.c uses zero here but we already know ret is zero */ -+ return 0; -+ -+err_size: -+ put_user(sizeof(*attr), &uattr->size); -+ return -E2BIG; -+} -+ -+/** -+ * sys_sched_setscheduler - set/change the scheduler policy and RT priority -+ * @pid: the pid in question. -+ * @policy: new policy. -+ * -+ * Return: 0 on success. An error code otherwise. -+ * @param: structure containing the new RT priority. -+ */ -+SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) -+{ -+ if (policy < 0) -+ return -EINVAL; -+ -+ return do_sched_setscheduler(pid, policy, param); -+} -+ -+/** -+ * sys_sched_setparam - set/change the RT priority of a thread -+ * @pid: the pid in question. -+ * @param: structure containing the new RT priority. -+ * -+ * Return: 0 on success. An error code otherwise. -+ */ -+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) -+{ -+ return do_sched_setscheduler(pid, SETPARAM_POLICY, param); -+} -+ -+/** -+ * sys_sched_setattr - same as above, but with extended sched_attr -+ * @pid: the pid in question. -+ * @uattr: structure containing the extended parameters. 
-+ */ -+SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, -+ unsigned int, flags) -+{ -+ struct sched_attr attr; -+ struct task_struct *p; -+ int retval; -+ -+ if (!uattr || pid < 0 || flags) -+ return -EINVAL; -+ -+ retval = sched_copy_attr(uattr, &attr); -+ if (retval) -+ return retval; -+ -+ if ((int)attr.sched_policy < 0) -+ return -EINVAL; -+ -+ rcu_read_lock(); -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (p != NULL) -+ retval = sched_setattr(p, &attr); -+ rcu_read_unlock(); -+ -+ return retval; -+} -+ -+/** -+ * sys_sched_getscheduler - get the policy (scheduling class) of a thread -+ * @pid: the pid in question. -+ * -+ * Return: On success, the policy of the thread. Otherwise, a negative error -+ * code. -+ */ -+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) -+{ -+ struct task_struct *p; -+ int retval = -EINVAL; -+ -+ if (pid < 0) -+ goto out_nounlock; -+ -+ retval = -ESRCH; -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ if (p) { -+ retval = security_task_getscheduler(p); -+ if (!retval) -+ retval = p->policy; -+ } -+ rcu_read_unlock(); -+ -+out_nounlock: -+ return retval; -+} -+ -+/** -+ * sys_sched_getscheduler - get the RT priority of a thread -+ * @pid: the pid in question. -+ * @param: structure containing the RT priority. -+ * -+ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error -+ * code. 
-+ */ -+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) -+{ -+ struct sched_param lp = { .sched_priority = 0 }; -+ struct task_struct *p; -+ int retval = -EINVAL; -+ -+ if (!param || pid < 0) -+ goto out_nounlock; -+ -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ retval = -ESRCH; -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ if (task_has_rt_policy(p)) -+ lp.sched_priority = p->rt_priority; -+ rcu_read_unlock(); -+ -+ /* -+ * This one might sleep, we cannot do it with a spinlock held ... -+ */ -+ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; -+ -+out_nounlock: -+ return retval; -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+/* -+ * Copy the kernel size attribute structure (which might be larger -+ * than what user-space knows about) to user-space. -+ * -+ * Note that all cases are valid: user-space buffer can be larger or -+ * smaller than the kernel-space buffer. The usual case is that both -+ * have the same size. -+ */ -+static int -+sched_attr_copy_to_user(struct sched_attr __user *uattr, -+ struct sched_attr *kattr, -+ unsigned int usize) -+{ -+ unsigned int ksize = sizeof(*kattr); -+ -+ if (!access_ok(uattr, usize)) -+ return -EFAULT; -+ -+ /* -+ * sched_getattr() ABI forwards and backwards compatibility: -+ * -+ * If usize == ksize then we just copy everything to user-space and all is good. -+ * -+ * If usize < ksize then we only copy as much as user-space has space for, -+ * this keeps ABI compatibility as well. We skip the rest. -+ * -+ * If usize > ksize then user-space is using a newer version of the ABI, -+ * which part the kernel doesn't know about. Just ignore it - tooling can -+ * detect the kernel's knowledge of attributes from the attr->size value -+ * which is set to ksize in this case. 
-+ */ -+ kattr->size = min(usize, ksize); -+ -+ if (copy_to_user(uattr, kattr, kattr->size)) -+ return -EFAULT; -+ -+ return 0; -+} -+ -+/** -+ * sys_sched_getattr - similar to sched_getparam, but with sched_attr -+ * @pid: the pid in question. -+ * @uattr: structure containing the extended parameters. -+ * @usize: sizeof(attr) for fwd/bwd comp. -+ * @flags: for future extension. -+ */ -+SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, -+ unsigned int, usize, unsigned int, flags) -+{ -+ struct sched_attr kattr = { }; -+ struct task_struct *p; -+ int retval; -+ -+ if (!uattr || pid < 0 || usize > PAGE_SIZE || -+ usize < SCHED_ATTR_SIZE_VER0 || flags) -+ return -EINVAL; -+ -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ retval = -ESRCH; -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ kattr.sched_policy = p->policy; -+ if (rt_task(p)) -+ kattr.sched_priority = p->rt_priority; -+ else -+ kattr.sched_nice = task_nice(p); -+ -+#ifdef CONFIG_UCLAMP_TASK -+ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; -+ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; -+#endif -+ -+ rcu_read_unlock(); -+ -+ return sched_attr_copy_to_user(uattr, &kattr, usize); -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) -+{ -+ cpumask_var_t cpus_mask, new_mask; -+ struct task_struct *p; -+ int retval; -+ -+ get_online_cpus(); -+ rcu_read_lock(); -+ -+ p = find_process_by_pid(pid); -+ if (!p) { -+ rcu_read_unlock(); -+ put_online_cpus(); -+ return -ESRCH; -+ } -+ -+ /* Prevent p going away */ -+ get_task_struct(p); -+ rcu_read_unlock(); -+ -+ if (p->flags & PF_NO_SETAFFINITY) { -+ retval = -EINVAL; -+ goto out_put_task; -+ } -+ if (!alloc_cpumask_var(&cpus_mask, GFP_KERNEL)) { -+ retval = -ENOMEM; -+ goto out_put_task; -+ } -+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { -+ retval = -ENOMEM; -+ 
goto out_free_cpus_allowed; -+ } -+ retval = -EPERM; -+ if (!check_same_owner(p)) { -+ rcu_read_lock(); -+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { -+ rcu_read_unlock(); -+ goto out_unlock; -+ } -+ rcu_read_unlock(); -+ } -+ -+ retval = security_task_setscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ cpuset_cpus_allowed(p, cpus_mask); -+ cpumask_and(new_mask, in_mask, cpus_mask); -+again: -+ retval = __set_cpus_allowed_ptr(p, new_mask, true); -+ -+ if (!retval) { -+ cpuset_cpus_allowed(p, cpus_mask); -+ if (!cpumask_subset(new_mask, cpus_mask)) { -+ /* -+ * We must have raced with a concurrent cpuset -+ * update. Just reset the cpus_mask to the -+ * cpuset's cpus_mask -+ */ -+ cpumask_copy(new_mask, cpus_mask); -+ goto again; -+ } -+ } -+out_unlock: -+ free_cpumask_var(new_mask); -+out_free_cpus_allowed: -+ free_cpumask_var(cpus_mask); -+out_put_task: -+ put_task_struct(p); -+ put_online_cpus(); -+ return retval; -+} -+ -+static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, -+ struct cpumask *new_mask) -+{ -+ if (len < cpumask_size()) -+ cpumask_clear(new_mask); -+ else if (len > cpumask_size()) -+ len = cpumask_size(); -+ -+ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; -+} -+ -+/** -+ * sys_sched_setaffinity - set the CPU affinity of a process -+ * @pid: pid of the process -+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr -+ * @user_mask_ptr: user-space pointer to the new CPU mask -+ * -+ * Return: 0 on success. An error code otherwise. 
-+ */ -+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, -+ unsigned long __user *, user_mask_ptr) -+{ -+ cpumask_var_t new_mask; -+ int retval; -+ -+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) -+ return -ENOMEM; -+ -+ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); -+ if (retval == 0) -+ retval = sched_setaffinity(pid, new_mask); -+ free_cpumask_var(new_mask); -+ return retval; -+} -+ -+long sched_getaffinity(pid_t pid, cpumask_t *mask) -+{ -+ struct task_struct *p; -+ raw_spinlock_t *lock; -+ unsigned long flags; -+ int retval; -+ -+ rcu_read_lock(); -+ -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ task_access_lock_irqsave(p, &lock, &flags); -+ cpumask_and(mask, &p->cpus_mask, cpu_active_mask); -+ task_access_unlock_irqrestore(p, lock, &flags); -+ -+out_unlock: -+ rcu_read_unlock(); -+ -+ return retval; -+} -+ -+/** -+ * sys_sched_getaffinity - get the CPU affinity of a process -+ * @pid: pid of the process -+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr -+ * @user_mask_ptr: user-space pointer to hold the current CPU mask -+ * -+ * Return: size of CPU mask copied to user_mask_ptr on success. An -+ * error code otherwise. 
-+ */ -+SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, -+ unsigned long __user *, user_mask_ptr) -+{ -+ int ret; -+ cpumask_var_t mask; -+ -+ if ((len * BITS_PER_BYTE) < nr_cpu_ids) -+ return -EINVAL; -+ if (len & (sizeof(unsigned long)-1)) -+ return -EINVAL; -+ -+ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) -+ return -ENOMEM; -+ -+ ret = sched_getaffinity(pid, mask); -+ if (ret == 0) { -+ unsigned int retlen = min_t(size_t, len, cpumask_size()); -+ -+ if (copy_to_user(user_mask_ptr, mask, retlen)) -+ ret = -EFAULT; -+ else -+ ret = retlen; -+ } -+ free_cpumask_var(mask); -+ -+ return ret; -+} -+ -+/** -+ * sys_sched_yield - yield the current processor to other threads. -+ * -+ * This function yields the current CPU to other tasks. It does this by -+ * scheduling away the current task. If it still has the earliest deadline -+ * it will be scheduled again as the next task. -+ * -+ * Return: 0. -+ */ -+static void do_sched_yield(void) -+{ -+ struct rq *rq; -+ struct rq_flags rf; -+ -+ if (!sched_yield_type) -+ return; -+ -+ rq = this_rq_lock_irq(&rf); -+ -+ if (sched_yield_type > 1) { -+ time_slice_expired(current, rq); -+ requeue_task(current, rq); -+ } -+ schedstat_inc(rq->yld_count); -+ -+ /* -+ * Since we are going to call schedule() anyway, there's -+ * no need to preempt or enable interrupts: -+ */ -+ preempt_disable(); -+ raw_spin_unlock(&rq->lock); -+ sched_preempt_enable_no_resched(); -+ -+ schedule(); -+} -+ -+SYSCALL_DEFINE0(sched_yield) -+{ -+ do_sched_yield(); -+ return 0; -+} -+ -+#ifndef CONFIG_PREEMPTION -+int __sched _cond_resched(void) -+{ -+ if (should_resched(0)) { -+ preempt_schedule_common(); -+ return 1; -+ } -+ rcu_all_qs(); -+ return 0; -+} -+EXPORT_SYMBOL(_cond_resched); -+#endif -+ -+/* -+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, -+ * call schedule, and on return reacquire the lock. -+ * -+ * This works OK both with and without CONFIG_PREEMPTION. 
We do strange low-level -+ * operations here to prevent schedule() from being called twice (once via -+ * spin_unlock(), once by hand). -+ */ -+int __cond_resched_lock(spinlock_t *lock) -+{ -+ int resched = should_resched(PREEMPT_LOCK_OFFSET); -+ int ret = 0; -+ -+ lockdep_assert_held(lock); -+ -+ if (spin_needbreak(lock) || resched) { -+ spin_unlock(lock); -+ if (resched) -+ preempt_schedule_common(); -+ else -+ cpu_relax(); -+ ret = 1; -+ spin_lock(lock); -+ } -+ return ret; -+} -+EXPORT_SYMBOL(__cond_resched_lock); -+ -+/** -+ * yield - yield the current processor to other threads. -+ * -+ * Do not ever use this function, there's a 99% chance you're doing it wrong. -+ * -+ * The scheduler is at all times free to pick the calling task as the most -+ * eligible task to run, if removing the yield() call from your code breaks -+ * it, its already broken. -+ * -+ * Typical broken usage is: -+ * -+ * while (!event) -+ * yield(); -+ * -+ * where one assumes that yield() will let 'the other' process run that will -+ * make event true. If the current task is a SCHED_FIFO task that will never -+ * happen. Never use yield() as a progress guarantee!! -+ * -+ * If you want to use yield() to wait for something, use wait_event(). -+ * If you want to use yield() to be 'nice' for others, use cond_resched(). -+ * If you still want to use yield(), do not! -+ */ -+void __sched yield(void) -+{ -+ set_current_state(TASK_RUNNING); -+ do_sched_yield(); -+} -+EXPORT_SYMBOL(yield); -+ -+/** -+ * yield_to - yield the current processor to another thread in -+ * your thread group, or accelerate that thread toward the -+ * processor it's on. -+ * @p: target task -+ * @preempt: whether task preemption is allowed or not -+ * -+ * It's the caller's job to ensure that the target task struct -+ * can't go away on us before we can do any checks. -+ * -+ * In PDS, yield_to is not supported. -+ * -+ * Return: -+ * true (>0) if we indeed boosted the target task. 
-+ * false (0) if we failed to boost the target. -+ * -ESRCH if there's no task to yield to. -+ */ -+int __sched yield_to(struct task_struct *p, bool preempt) -+{ -+ return 0; -+} -+EXPORT_SYMBOL_GPL(yield_to); -+ -+int io_schedule_prepare(void) -+{ -+ int old_iowait = current->in_iowait; -+ -+ current->in_iowait = 1; -+ blk_schedule_flush_plug(current); -+ -+ return old_iowait; -+} -+ -+void io_schedule_finish(int token) -+{ -+ current->in_iowait = token; -+} -+ -+/* -+ * This task is about to go to sleep on IO. Increment rq->nr_iowait so -+ * that process accounting knows that this is a task in IO wait state. -+ * -+ * But don't do that if it is a deliberate, throttling IO wait (this task -+ * has set its backing_dev_info: the queue against which it should throttle) -+ */ -+ -+long __sched io_schedule_timeout(long timeout) -+{ -+ int token; -+ long ret; -+ -+ token = io_schedule_prepare(); -+ ret = schedule_timeout(timeout); -+ io_schedule_finish(token); -+ -+ return ret; -+} -+EXPORT_SYMBOL(io_schedule_timeout); -+ -+void io_schedule(void) -+{ -+ int token; -+ -+ token = io_schedule_prepare(); -+ schedule(); -+ io_schedule_finish(token); -+} -+EXPORT_SYMBOL(io_schedule); -+ -+/** -+ * sys_sched_get_priority_max - return maximum RT priority. -+ * @policy: scheduling class. -+ * -+ * Return: On success, this syscall returns the maximum -+ * rt_priority that can be used by a given scheduling class. -+ * On failure, a negative error code is returned. -+ */ -+SYSCALL_DEFINE1(sched_get_priority_max, int, policy) -+{ -+ int ret = -EINVAL; -+ -+ switch (policy) { -+ case SCHED_FIFO: -+ case SCHED_RR: -+ ret = MAX_USER_RT_PRIO-1; -+ break; -+ case SCHED_NORMAL: -+ case SCHED_BATCH: -+ case SCHED_ISO: -+ case SCHED_IDLE: -+ ret = 0; -+ break; -+ } -+ return ret; -+} -+ -+/** -+ * sys_sched_get_priority_min - return minimum RT priority. -+ * @policy: scheduling class. 
-+ * -+ * Return: On success, this syscall returns the minimum -+ * rt_priority that can be used by a given scheduling class. -+ * On failure, a negative error code is returned. -+ */ -+SYSCALL_DEFINE1(sched_get_priority_min, int, policy) -+{ -+ int ret = -EINVAL; -+ -+ switch (policy) { -+ case SCHED_FIFO: -+ case SCHED_RR: -+ ret = 1; -+ break; -+ case SCHED_NORMAL: -+ case SCHED_BATCH: -+ case SCHED_ISO: -+ case SCHED_IDLE: -+ ret = 0; -+ break; -+ } -+ return ret; -+} -+ -+static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) -+{ -+ struct task_struct *p; -+ int retval; -+ -+ if (pid < 0) -+ return -EINVAL; -+ -+ retval = -ESRCH; -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ rcu_read_unlock(); -+ -+ *t = ns_to_timespec64(MS_TO_NS(rr_interval)); -+ return 0; -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+/** -+ * sys_sched_rr_get_interval - return the default timeslice of a process. -+ * @pid: pid of the process. -+ * @interval: userspace pointer to the timeslice value. -+ * -+ * -+ * Return: On success, 0 and the timeslice is in @interval. Otherwise, -+ * an error code. 
-+ */ -+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, -+ struct __kernel_timespec __user *, interval) -+{ -+ struct timespec64 t; -+ int retval = sched_rr_get_interval(pid, &t); -+ -+ if (retval == 0) -+ retval = put_timespec64(&t, interval); -+ -+ return retval; -+} -+ -+#ifdef CONFIG_COMPAT_32BIT_TIME -+SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, -+ struct old_timespec32 __user *, interval) -+{ -+ struct timespec64 t; -+ int retval = sched_rr_get_interval(pid, &t); -+ -+ if (retval == 0) -+ retval = put_old_timespec32(&t, interval); -+ return retval; -+} -+#endif -+ -+void sched_show_task(struct task_struct *p) -+{ -+ unsigned long free = 0; -+ int ppid; -+ -+ if (!try_get_task_stack(p)) -+ return; -+ -+ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); -+ -+ if (p->state == TASK_RUNNING) -+ printk(KERN_CONT " running task "); -+#ifdef CONFIG_DEBUG_STACK_USAGE -+ free = stack_not_used(p); -+#endif -+ ppid = 0; -+ rcu_read_lock(); -+ if (pid_alive(p)) -+ ppid = task_pid_nr(rcu_dereference(p->real_parent)); -+ rcu_read_unlock(); -+ printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, -+ task_pid_nr(p), ppid, -+ (unsigned long)task_thread_info(p)->flags); -+ -+ print_worker_info(KERN_INFO, p); -+ show_stack(p, NULL, KERN_INFO); -+ put_task_stack(p); -+} -+EXPORT_SYMBOL_GPL(sched_show_task); -+ -+static inline bool -+state_filter_match(unsigned long state_filter, struct task_struct *p) -+{ -+ /* no filter, everything matches */ -+ if (!state_filter) -+ return true; -+ -+ /* filter, but doesn't match */ -+ if (!(p->state & state_filter)) -+ return false; -+ -+ /* -+ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows -+ * TASK_KILLABLE). 
-+ */ -+ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) -+ return false; -+ -+ return true; -+} -+ -+ -+void show_state_filter(unsigned long state_filter) -+{ -+ struct task_struct *g, *p; -+ -+#if BITS_PER_LONG == 32 -+ printk(KERN_INFO -+ " task PC stack pid father\n"); -+#else -+ printk(KERN_INFO -+ " task PC stack pid father\n"); -+#endif -+ rcu_read_lock(); -+ for_each_process_thread(g, p) { -+ /* -+ * reset the NMI-timeout, listing all files on a slow -+ * console might take a lot of time: -+ * Also, reset softlockup watchdogs on all CPUs, because -+ * another CPU might be blocked waiting for us to process -+ * an IPI. -+ */ -+ touch_nmi_watchdog(); -+ touch_all_softlockup_watchdogs(); -+ if (state_filter_match(state_filter, p)) -+ sched_show_task(p); -+ } -+ -+#ifdef CONFIG_SCHED_DEBUG -+ /* PDS TODO: should support this -+ if (!state_filter) -+ sysrq_sched_debug_show(); -+ */ -+#endif -+ rcu_read_unlock(); -+ /* -+ * Only show locks if all tasks are dumped: -+ */ -+ if (!state_filter) -+ debug_show_all_locks(); -+} -+ -+void dump_cpu_task(int cpu) -+{ -+ pr_info("Task dump for CPU %d:\n", cpu); -+ sched_show_task(cpu_curr(cpu)); -+} -+ -+/** -+ * init_idle - set up an idle thread for a given CPU -+ * @idle: task in question -+ * @cpu: cpu the idle task belongs to -+ * -+ * NOTE: this function does not set the idle thread's NEED_RESCHED -+ * flag, to make booting more robust. 
-+ */ -+void init_idle(struct task_struct *idle, int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ raw_spin_lock_irqsave(&idle->pi_lock, flags); -+ raw_spin_lock(&rq->lock); -+ update_rq_clock(rq); -+ -+ idle->last_ran = rq->clock_task; -+ idle->state = TASK_RUNNING; -+ idle->flags |= PF_IDLE; -+ /* Setting prio to illegal value shouldn't matter when never queued */ -+ idle->prio = PRIO_LIMIT; -+ idle->deadline = rq_clock(rq) + task_deadline_diff(idle); -+ update_task_priodl(idle); -+ -+ kasan_unpoison_task_stack(idle); -+ -+#ifdef CONFIG_SMP -+ /* -+ * It's possible that init_idle() gets called multiple times on a task, -+ * in that case do_set_cpus_allowed() will not do the right thing. -+ * -+ * And since this is boot we can forgo the serialisation. -+ */ -+ set_cpus_allowed_common(idle, cpumask_of(cpu)); -+#endif -+ -+ /* Silence PROVE_RCU */ -+ rcu_read_lock(); -+ __set_task_cpu(idle, cpu); -+ rcu_read_unlock(); -+ -+ rq->idle = idle; -+ rcu_assign_pointer(rq->curr, idle); -+ idle->on_cpu = 1; -+ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&idle->pi_lock, flags); -+ -+ /* Set the preempt count _outside_ the spinlocks! */ -+ init_idle_preempt_count(idle, cpu); -+ -+ ftrace_graph_init_idle_task(idle, cpu); -+ vtime_init_idle(idle, cpu); -+#ifdef CONFIG_SMP -+ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); -+#endif -+} -+ -+void resched_cpu(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ if (cpu_online(cpu) || cpu == smp_processor_id()) -+ resched_curr(cpu_rq(cpu)); -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+} -+ -+static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) -+{ -+ struct wake_q_node *node = &task->wake_q; -+ -+ /* -+ * Atomically grab the task, if ->wake_q is !nil already it means -+ * its already queued (either by us or someone else) and will get the -+ * wakeup due to that. 
-+ * -+ * In order to ensure that a pending wakeup will observe our pending -+ * state, even in the failed case, an explicit smp_mb() must be used. -+ */ -+ smp_mb__before_atomic(); -+ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) -+ return false; -+ -+ /* -+ * The head is context local, there can be no concurrency. -+ */ -+ *head->lastp = node; -+ head->lastp = &node->next; -+ return true; -+} -+ -+/** -+ * wake_q_add() - queue a wakeup for 'later' waking. -+ * @head: the wake_q_head to add @task to -+ * @task: the task to queue for 'later' wakeup -+ * -+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the -+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come -+ * instantly. -+ * -+ * This function must be used as-if it were wake_up_process(); IOW the task -+ * must be ready to be woken at this location. -+ */ -+void wake_q_add(struct wake_q_head *head, struct task_struct *task) -+{ -+ if (__wake_q_add(head, task)) -+ get_task_struct(task); -+} -+ -+/** -+ * wake_q_add_safe() - safely queue a wakeup for 'later' waking. -+ * @head: the wake_q_head to add @task to -+ * @task: the task to queue for 'later' wakeup -+ * -+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the -+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come -+ * instantly. -+ * -+ * This function must be used as-if it were wake_up_process(); IOW the task -+ * must be ready to be woken at this location. -+ * -+ * This function is essentially a task-safe equivalent to wake_q_add(). Callers -+ * that already hold reference to @task can call the 'safe' version and trust -+ * wake_q to do the right thing depending whether or not the @task is already -+ * queued for wakeup. 
-+ */ -+void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) -+{ -+ if (!__wake_q_add(head, task)) -+ put_task_struct(task); -+} -+ -+void wake_up_q(struct wake_q_head *head) -+{ -+ struct wake_q_node *node = head->first; -+ -+ while (node != WAKE_Q_TAIL) { -+ struct task_struct *task; -+ -+ task = container_of(node, struct task_struct, wake_q); -+ BUG_ON(!task); -+ /* task can safely be re-inserted now: */ -+ node = node->next; -+ task->wake_q.next = NULL; -+ -+ /* -+ * wake_up_process() executes a full barrier, which pairs with -+ * the queueing in wake_q_add() so as not to miss wakeups. -+ */ -+ wake_up_process(task); -+ put_task_struct(task); -+ } -+} -+ -+#ifdef CONFIG_SMP -+ -+int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, -+ const struct cpumask __maybe_unused *trial) -+{ -+ return 1; -+} -+ -+int task_can_attach(struct task_struct *p, -+ const struct cpumask *cs_cpus_allowed) -+{ -+ int ret = 0; -+ -+ /* -+ * Kthreads which disallow setaffinity shouldn't be moved -+ * to a new cpuset; we don't want to change their CPU -+ * affinity and isolating such threads by their set of -+ * allowed nodes is unnecessary. Thus, cpusets are not -+ * applicable for such threads. This prevents checking for -+ * success of set_cpus_allowed_ptr() on all attached tasks -+ * before cpus_mask may be changed. -+ */ -+ if (p->flags & PF_NO_SETAFFINITY) -+ ret = -EINVAL; -+ -+ return ret; -+} -+ -+static bool sched_smp_initialized __read_mostly; -+ -+#ifdef CONFIG_NO_HZ_COMMON -+void nohz_balance_enter_idle(int cpu) -+{ -+} -+ -+void select_nohz_load_balancer(int stop_tick) -+{ -+} -+ -+void set_cpu_sd_state_idle(void) {} -+ -+/* -+ * In the semi idle case, use the nearest busy CPU for migrating timers -+ * from an idle CPU. This is good for power-savings. 
-+ * -+ * We don't do similar optimization for completely idle system, as -+ * selecting an idle CPU will add more delays to the timers than intended -+ * (as that CPU's timer base may not be uptodate wrt jiffies etc). -+ */ -+int get_nohz_timer_target(void) -+{ -+ int i, cpu = smp_processor_id(), default_cpu = -1; -+ struct cpumask *mask; -+ -+ if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) { -+ if (!idle_cpu(cpu)) -+ return cpu; -+ default_cpu = cpu; -+ } -+ -+ for (mask = &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[0]); -+ mask < per_cpu(sched_cpu_affinity_chk_end_masks, cpu); mask++) -+ for_each_cpu_and(i, mask, housekeeping_cpumask(HK_FLAG_TIMER)) -+ if (!idle_cpu(i)) -+ return i; -+ -+ if (default_cpu == -1) -+ default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER); -+ cpu = default_cpu; -+ -+ return cpu; -+} -+ -+/* -+ * When add_timer_on() enqueues a timer into the timer wheel of an -+ * idle CPU then this timer might expire before the next timer event -+ * which is scheduled to wake up that CPU. In case of a completely -+ * idle system the next event might even be infinite time into the -+ * future. wake_up_idle_cpu() ensures that the CPU is woken up and -+ * leaves the inner idle loop so the newly added timer is taken into -+ * account when the CPU goes back to idle and evaluates the timer -+ * wheel for the next timer event. -+ */ -+void wake_up_idle_cpu(int cpu) -+{ -+ if (cpu == smp_processor_id()) -+ return; -+ -+ set_tsk_need_resched(cpu_rq(cpu)->idle); -+ smp_send_reschedule(cpu); -+} -+ -+void wake_up_nohz_cpu(int cpu) -+{ -+ wake_up_idle_cpu(cpu); -+} -+#endif /* CONFIG_NO_HZ_COMMON */ -+ -+#ifdef CONFIG_HOTPLUG_CPU -+/* -+ * Ensures that the idle task is using init_mm right before its CPU goes -+ * offline. 
-+ */ -+void idle_task_exit(void) -+{ -+ struct mm_struct *mm = current->active_mm; -+ -+ BUG_ON(current != this_rq()->idle); -+ -+ if (mm != &init_mm) { -+ switch_mm(mm, &init_mm, current); -+ finish_arch_post_lock_switch(); -+ } -+ -+ /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ -+} -+ -+/* -+ * Migrate all tasks from the rq, sleeping tasks will be migrated by -+ * try_to_wake_up()->select_task_rq(). -+ * -+ * Called with rq->lock held even though we'er in stop_machine() and -+ * there's no concurrency possible, we hold the required locks anyway -+ * because of lock validation efforts. -+ */ -+static void migrate_tasks(struct rq *dead_rq) -+{ -+ struct rq *rq = dead_rq; -+ struct task_struct *p, *stop = rq->stop; -+ struct skiplist_node *node; -+ int count = 0; -+ -+ /* -+ * Fudge the rq selection such that the below task selection loop -+ * doesn't get stuck on the currently eligible stop task. -+ * -+ * We're currently inside stop_machine() and the rq is either stuck -+ * in the stop_machine_cpu_stop() loop, or we're executing this code, -+ * either way we should never end up calling schedule() until we're -+ * done here. -+ */ -+ rq->stop = NULL; -+ -+ node = &rq->sl_header; -+ while ((node = node->next[0]) != &rq->sl_header) { -+ int dest_cpu; -+ -+ p = skiplist_entry(node, struct task_struct, sl_node); -+ -+ /* skip the running task */ -+ if (task_running(p)) -+ continue; -+ -+ /* -+ * Rules for changing task_struct::cpus_mask are holding -+ * both pi_lock and rq->lock, such that holding either -+ * stabilizes the mask. -+ * -+ * Drop rq->lock is not quite as disastrous as it usually is -+ * because !cpu_active at this point, which means load-balance -+ * will not interfere. Also, stop-machine. 
-+ */ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_lock(&p->pi_lock); -+ raw_spin_lock(&rq->lock); -+ -+ /* -+ * Since we're inside stop-machine, _nothing_ should have -+ * changed the task, WARN if weird stuff happened, because in -+ * that case the above rq->lock drop is a fail too. -+ */ -+ if (WARN_ON(task_rq(p) != rq || !task_on_rq_queued(p))) { -+ raw_spin_unlock(&p->pi_lock); -+ continue; -+ } -+ -+ count++; -+ /* Find suitable destination for @next, with force if needed. */ -+ dest_cpu = select_fallback_rq(dead_rq->cpu, p); -+ -+ rq = __migrate_task(rq, p, dest_cpu); -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock(&p->pi_lock); -+ -+ rq = dead_rq; -+ raw_spin_lock(&rq->lock); -+ /* Check queued task all over from the header again */ -+ node = &rq->sl_header; -+ } -+ -+ rq->stop = stop; -+} -+ -+static void set_rq_offline(struct rq *rq) -+{ -+ if (rq->online) -+ rq->online = false; -+} -+#endif /* CONFIG_HOTPLUG_CPU */ -+ -+static void set_rq_online(struct rq *rq) -+{ -+ if (!rq->online) -+ rq->online = true; -+} -+ -+#ifdef CONFIG_SCHED_DEBUG -+ -+static __read_mostly int sched_debug_enabled; -+ -+static int __init sched_debug_setup(char *str) -+{ -+ sched_debug_enabled = 1; -+ -+ return 0; -+} -+early_param("sched_debug", sched_debug_setup); -+ -+static inline bool sched_debug(void) -+{ -+ return sched_debug_enabled; -+} -+#else /* !CONFIG_SCHED_DEBUG */ -+static inline bool sched_debug(void) -+{ -+ return false; -+} -+#endif /* CONFIG_SCHED_DEBUG */ -+ -+#ifdef CONFIG_SMP -+void send_call_function_single_ipi(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ if (!set_nr_if_polling(rq->idle)) -+ arch_send_call_function_single_ipi(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); -+} -+ -+void sched_ttwu_pending(void *arg) -+{ -+ struct llist_node *llist = arg; -+ struct rq *rq = this_rq(); -+ struct task_struct *p, *t; -+ struct rq_flags rf; -+ -+ if (!llist) -+ return; -+ -+ /* -+ * rq::ttwu_pending racy indication of out-standing wakeups. 
-+ * Races such that false-negatives are possible, since they -+ * are shorter lived that false-positives would be. -+ */ -+ WRITE_ONCE(rq->ttwu_pending, 0); -+ -+ rq_lock_irqsave(rq, &rf); -+ update_rq_clock(rq); -+ -+ /*llist_for_each_entry_safe(p, t, llist, wake_entry) -+ ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);*/ -+ -+ rq_unlock_irqrestore(rq, &rf); -+} -+ -+void wake_up_if_idle(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ rcu_read_lock(); -+ -+ if (!is_idle_task(rcu_dereference(rq->curr))) -+ goto out; -+ -+ if (set_nr_if_polling(rq->idle)) { -+ trace_sched_wake_idle_without_ipi(cpu); -+ } else { -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ if (is_idle_task(rq->curr)) -+ smp_send_reschedule(cpu); -+ /* Else CPU is not idle, do nothing here */ -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ } -+ -+out: -+ rcu_read_unlock(); -+} -+ -+bool cpus_share_cache(int this_cpu, int that_cpu) -+{ -+ return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); -+} -+#endif /* CONFIG_SMP */ -+ -+/* -+ * Topology list, bottom-up. 
-+ */ -+static struct sched_domain_topology_level default_topology[] = { -+#ifdef CONFIG_SCHED_SMT -+ { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, -+#endif -+#ifdef CONFIG_SCHED_MC -+ { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, -+#endif -+ { cpu_cpu_mask, SD_INIT_NAME(DIE) }, -+ { NULL, }, -+}; -+ -+static struct sched_domain_topology_level *sched_domain_topology = -+ default_topology; -+ -+#define for_each_sd_topology(tl) \ -+ for (tl = sched_domain_topology; tl->mask; tl++) -+ -+void set_sched_topology(struct sched_domain_topology_level *tl) -+{ -+ if (WARN_ON_ONCE(sched_smp_initialized)) -+ return; -+ -+ sched_domain_topology = tl; -+} -+ -+/* -+ * Initializers for schedule domains -+ * Non-inlined to reduce accumulated stack pressure in build_sched_domains() -+ */ -+ -+int sched_domain_level_max; -+ -+/* -+ * Partition sched domains as specified by the 'ndoms_new' -+ * cpumasks in the array doms_new[] of cpumasks. This compares -+ * doms_new[] to the current sched domain partitioning, doms_cur[]. -+ * It destroys each deleted domain and builds each new domain. -+ * -+ * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. -+ * The masks don't intersect (don't overlap.) We should setup one -+ * sched domain for each mask. CPUs not in any of the cpumasks will -+ * not be load balanced. If the same cpumask appears both in the -+ * current 'doms_cur' domains and in the new 'doms_new', we can leave -+ * it as it is. -+ * -+ * The passed in 'doms_new' should be allocated using -+ * alloc_sched_domains. This routine takes ownership of it and will -+ * free_sched_domains it when done with it. If the caller failed the -+ * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, -+ * and partition_sched_domains() will fallback to the single partition -+ * 'fallback_doms', it also forces the domains to be rebuilt. -+ * -+ * If doms_new == NULL it will be replaced with cpu_online_mask. 
-+ * ndoms_new == 0 is a special case for destroying existing domains, -+ * and it will not create the default domain. -+ * -+ * Call with hotplug lock held -+ */ -+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], -+ struct sched_domain_attr *dattr_new) -+{ -+ /** -+ * PDS doesn't depend on sched domains, but just keep this api -+ */ -+} -+ -+/* -+ * used to mark begin/end of suspend/resume: -+ */ -+static int num_cpus_frozen; -+ -+#ifdef CONFIG_NUMA -+int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; -+ -+/* -+ * sched_numa_find_closest() - given the NUMA topology, find the cpu -+ * closest to @cpu from @cpumask. -+ * cpumask: cpumask to find a cpu from -+ * cpu: cpu to be close to -+ * -+ * returns: cpu, or nr_cpu_ids when nothing found. -+ */ -+int sched_numa_find_closest(const struct cpumask *cpus, int cpu) -+{ -+ return best_mask_cpu(cpu, cpus); -+} -+#endif /* CONFIG_NUMA */ -+ -+/* -+ * Update cpusets according to cpu_active mask. If cpusets are -+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper -+ * around partition_sched_domains(). -+ * -+ * If we come here as part of a suspend/resume, don't touch cpusets because we -+ * want to restore it back to its original state upon resume anyway. -+ */ -+static void cpuset_cpu_active(void) -+{ -+ if (cpuhp_tasks_frozen) { -+ /* -+ * num_cpus_frozen tracks how many CPUs are involved in suspend -+ * resume sequence. As long as this is not the last online -+ * operation in the resume sequence, just build a single sched -+ * domain, ignoring cpusets. -+ */ -+ partition_sched_domains(1, NULL, NULL); -+ if (--num_cpus_frozen) -+ return; -+ /* -+ * This is the last CPU online operation. So fall through and -+ * restore the original sched domains by considering the -+ * cpuset configurations. 
-+ */ -+ cpuset_force_rebuild(); -+ } -+ -+ cpuset_update_active_cpus(); -+} -+ -+static int cpuset_cpu_inactive(unsigned int cpu) -+{ -+ if (!cpuhp_tasks_frozen) { -+ cpuset_update_active_cpus(); -+ } else { -+ num_cpus_frozen++; -+ partition_sched_domains(1, NULL, NULL); -+ } -+ return 0; -+} -+ -+int sched_cpu_activate(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+#ifdef CONFIG_SCHED_SMT -+ /* -+ * When going up, increment the number of cores with SMT present. -+ */ -+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) -+ static_branch_inc_cpuslocked(&sched_smt_present); -+#endif -+ set_cpu_active(cpu, true); -+ -+ if (sched_smp_initialized) -+ cpuset_cpu_active(); -+ -+ /* -+ * Put the rq online, if not already. This happens: -+ * -+ * 1) In the early boot process, because we build the real domains -+ * after all cpus have been brought up. -+ * -+ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the -+ * domains. -+ */ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ set_rq_online(rq); -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+ return 0; -+} -+ -+int sched_cpu_deactivate(unsigned int cpu) -+{ -+ int ret; -+ -+ set_cpu_active(cpu, false); -+ /* -+ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU -+ * users of this state to go away such that all new such users will -+ * observe it. -+ * -+ * Do sync before park smpboot threads to take care the rcu boost case. -+ */ -+ synchronize_rcu(); -+ -+#ifdef CONFIG_SCHED_SMT -+ /* -+ * When going down, decrement the number of cores with SMT present. 
-+ */ -+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) -+ static_branch_dec_cpuslocked(&sched_smt_present); -+#endif -+ -+ if (!sched_smp_initialized) -+ return 0; -+ -+ ret = cpuset_cpu_inactive(cpu); -+ if (ret) { -+ set_cpu_active(cpu, true); -+ return ret; -+ } -+ return 0; -+} -+ -+static void sched_rq_cpu_starting(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ rq->calc_load_update = calc_load_update; -+} -+ -+int sched_cpu_starting(unsigned int cpu) -+{ -+ sched_rq_cpu_starting(cpu); -+ sched_tick_start(cpu); -+ return 0; -+} -+ -+#ifdef CONFIG_HOTPLUG_CPU -+int sched_cpu_dying(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ sched_tick_stop(cpu); -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ set_rq_offline(rq); -+ migrate_tasks(rq); -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+ hrtick_clear(rq); -+ return 0; -+} -+#endif -+ -+#ifdef CONFIG_SMP -+static void sched_init_topology_cpumask_early(void) -+{ -+ int cpu, level; -+ cpumask_t *tmp; -+ -+ for_each_possible_cpu(cpu) { -+ for (level = 0; level < NR_CPU_AFFINITY_CHK_LEVEL; level++) { -+ tmp = &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[level]); -+ cpumask_copy(tmp, cpu_possible_mask); -+ cpumask_clear_cpu(cpu, tmp); -+ } -+ per_cpu(sched_cpu_llc_start_mask, cpu) = -+ &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[0]); -+ per_cpu(sched_cpu_affinity_chk_end_masks, cpu) = -+ &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[1]); -+ } -+} -+ -+static void sched_init_topology_cpumask(void) -+{ -+ int cpu; -+ cpumask_t *chk; -+ -+ for_each_online_cpu(cpu) { -+ chk = &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[0]); -+ -+#ifdef CONFIG_SCHED_SMT -+ cpumask_setall(chk); -+ cpumask_clear_cpu(cpu, chk); -+ if (cpumask_and(chk, chk, topology_sibling_cpumask(cpu))) { -+ per_cpu(sched_sibling_cpu, cpu) = cpumask_first(chk); -+ printk(KERN_INFO "pds: cpu #%d affinity check mask - smt 0x%08lx", -+ cpu, (chk++)->bits[0]); -+ } -+#endif -+#ifdef CONFIG_SCHED_MC -+ 
cpumask_setall(chk); -+ cpumask_clear_cpu(cpu, chk); -+ if (cpumask_and(chk, chk, cpu_coregroup_mask(cpu))) { -+ per_cpu(sched_cpu_llc_start_mask, cpu) = chk; -+ printk(KERN_INFO "pds: cpu #%d affinity check mask - coregroup 0x%08lx", -+ cpu, (chk++)->bits[0]); -+ } -+ cpumask_complement(chk, cpu_coregroup_mask(cpu)); -+ -+ /** -+ * Set up sd_llc_id per CPU -+ */ -+ per_cpu(sd_llc_id, cpu) = -+ cpumask_first(cpu_coregroup_mask(cpu)); -+#else -+ per_cpu(sd_llc_id, cpu) = -+ cpumask_first(topology_core_cpumask(cpu)); -+ -+ per_cpu(sched_cpu_llc_start_mask, cpu) = chk; -+ -+ cpumask_setall(chk); -+ cpumask_clear_cpu(cpu, chk); -+#endif /* NOT CONFIG_SCHED_MC */ -+ if (cpumask_and(chk, chk, topology_core_cpumask(cpu))) -+ printk(KERN_INFO "pds: cpu #%d affinity check mask - core 0x%08lx", -+ cpu, (chk++)->bits[0]); -+ cpumask_complement(chk, topology_core_cpumask(cpu)); -+ -+ if (cpumask_and(chk, chk, cpu_online_mask)) -+ printk(KERN_INFO "pds: cpu #%d affinity check mask - others 0x%08lx", -+ cpu, (chk++)->bits[0]); -+ -+ per_cpu(sched_cpu_affinity_chk_end_masks, cpu) = chk; -+ } -+} -+#endif -+ -+void __init sched_init_smp(void) -+{ -+ /* Move init over to a non-isolated CPU */ -+ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) -+ BUG(); -+ -+ cpumask_copy(&sched_rq_queued_masks[SCHED_RQ_EMPTY], cpu_online_mask); -+ -+ sched_init_topology_cpumask(); -+ -+ sched_smp_initialized = true; -+} -+#else -+void __init sched_init_smp(void) -+{ -+} -+#endif /* CONFIG_SMP */ -+ -+int in_sched_functions(unsigned long addr) -+{ -+ return in_lock_functions(addr) || -+ (addr >= (unsigned long)__sched_text_start -+ && addr < (unsigned long)__sched_text_end); -+} -+ -+#ifdef CONFIG_CGROUP_SCHED -+/* task group related information */ -+struct task_group { -+ struct cgroup_subsys_state css; -+ -+ struct rcu_head rcu; -+ struct list_head list; -+ -+ struct task_group *parent; -+ struct list_head siblings; -+ struct list_head children; -+}; -+ -+/* -+ * 
Default task group. -+ * Every task in system belongs to this group at bootup. -+ */ -+struct task_group root_task_group; -+LIST_HEAD(task_groups); -+ -+/* Cacheline aligned slab cache for task_group */ -+static struct kmem_cache *task_group_cache __read_mostly; -+#endif /* CONFIG_CGROUP_SCHED */ -+ -+void __init sched_init(void) -+{ -+ int i; -+ struct rq *rq; -+ -+ print_scheduler_version(); -+ -+ wait_bit_init(); -+ -+#ifdef CONFIG_SMP -+ for (i = 0; i < NR_SCHED_RQ_QUEUED_LEVEL; i++) -+ cpumask_clear(&sched_rq_queued_masks[i]); -+ cpumask_setall(&sched_rq_queued_masks[SCHED_RQ_EMPTY]); -+ set_bit(SCHED_RQ_EMPTY, sched_rq_queued_masks_bitmap); -+ -+ cpumask_setall(&sched_rq_pending_masks[SCHED_RQ_EMPTY]); -+ set_bit(SCHED_RQ_EMPTY, sched_rq_pending_masks_bitmap); -+#else -+ uprq = &per_cpu(runqueues, 0); -+#endif -+ -+#ifdef CONFIG_CGROUP_SCHED -+ task_group_cache = KMEM_CACHE(task_group, 0); -+ -+ list_add(&root_task_group.list, &task_groups); -+ INIT_LIST_HEAD(&root_task_group.children); -+ INIT_LIST_HEAD(&root_task_group.siblings); -+#endif /* CONFIG_CGROUP_SCHED */ -+ for_each_possible_cpu(i) { -+ rq = cpu_rq(i); -+ FULL_INIT_SKIPLIST_NODE(&rq->sl_header); -+ raw_spin_lock_init(&rq->lock); -+ rq->dither = 0; -+ rq->nr_running = rq->nr_uninterruptible = 0; -+ rq->calc_load_active = 0; -+ rq->calc_load_update = jiffies + LOAD_FREQ; -+#ifdef CONFIG_SMP -+ rq->online = false; -+ rq->cpu = i; -+ -+ rq->queued_level = SCHED_RQ_EMPTY; -+ rq->pending_level = SCHED_RQ_EMPTY; -+#ifdef CONFIG_SCHED_SMT -+ per_cpu(sched_sibling_cpu, i) = i; -+ rq->active_balance = 0; -+#endif -+#endif -+ rq->nr_switches = 0; -+ atomic_set(&rq->nr_iowait, 0); -+ hrtick_rq_init(rq); -+ } -+#ifdef CONFIG_SMP -+ /* Set rq->online for cpu 0 */ -+ cpu_rq(0)->online = true; -+#endif -+ -+ /* -+ * The boot idle thread does lazy MMU switching as well: -+ */ -+ mmgrab(&init_mm); -+ enter_lazy_tlb(&init_mm, current); -+ -+ /* -+ * Make us the idle thread. 
Technically, schedule() should not be -+ * called from this thread, however somewhere below it might be, -+ * but because we are the idle thread, we just pick up running again -+ * when this runqueue becomes "idle". -+ */ -+ init_idle(current, smp_processor_id()); -+ -+ calc_load_update = jiffies + LOAD_FREQ; -+ -+#ifdef CONFIG_SMP -+ idle_thread_set_boot_cpu(); -+ -+ sched_init_topology_cpumask_early(); -+#endif /* SMP */ -+ -+ init_schedstats(); -+ -+ psi_init(); -+} -+ -+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP -+static inline int preempt_count_equals(int preempt_offset) -+{ -+ int nested = preempt_count() + rcu_preempt_depth(); -+ -+ return (nested == preempt_offset); -+} -+ -+void __might_sleep(const char *file, int line, int preempt_offset) -+{ -+ /* -+ * Blocking primitives will set (and therefore destroy) current->state, -+ * since we will exit with TASK_RUNNING make sure we enter with it, -+ * otherwise we will destroy state. -+ */ -+ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, -+ "do not call blocking ops when !TASK_RUNNING; " -+ "state=%lx set at [<%p>] %pS\n", -+ current->state, -+ (void *)current->task_state_change, -+ (void *)current->task_state_change); -+ -+ ___might_sleep(file, line, preempt_offset); -+} -+EXPORT_SYMBOL(__might_sleep); -+ -+void ___might_sleep(const char *file, int line, int preempt_offset) -+{ -+ /* Ratelimiting timestamp: */ -+ static unsigned long prev_jiffy; -+ -+ unsigned long preempt_disable_ip; -+ -+ /* WARN_ON_ONCE() by default, no rate limit required: */ -+ rcu_sleep_check(); -+ -+ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && -+ !is_idle_task(current) && !current->non_block_count) || -+ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || -+ oops_in_progress) -+ return; -+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) -+ return; -+ prev_jiffy = jiffies; -+ -+ /* Save this before calling printk(), since that will clobber it: */ -+ preempt_disable_ip = 
get_preempt_disable_ip(current); -+ -+ printk(KERN_ERR -+ "BUG: sleeping function called from invalid context at %s:%d\n", -+ file, line); -+ printk(KERN_ERR -+ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", -+ in_atomic(), irqs_disabled(), current->non_block_count, -+ current->pid, current->comm); -+ -+ if (task_stack_end_corrupted(current)) -+ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); -+ -+ debug_show_held_locks(current); -+ if (irqs_disabled()) -+ print_irqtrace_events(current); -+#ifdef CONFIG_DEBUG_PREEMPT -+ if (!preempt_count_equals(preempt_offset)) { -+ pr_err("Preemption disabled at:"); -+ print_ip_sym(KERN_ERR, preempt_disable_ip); -+ } -+#endif -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+EXPORT_SYMBOL(___might_sleep); -+ -+void __cant_sleep(const char *file, int line, int preempt_offset) -+{ -+ static unsigned long prev_jiffy; -+ -+ if (irqs_disabled()) -+ return; -+ -+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) -+ return; -+ -+ if (preempt_count() > preempt_offset) -+ return; -+ -+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) -+ return; -+ prev_jiffy = jiffies; -+ -+ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); -+ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", -+ in_atomic(), irqs_disabled(), -+ current->pid, current->comm); -+ -+ debug_show_held_locks(current); -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+EXPORT_SYMBOL_GPL(__cant_sleep); -+#endif -+ -+#ifdef CONFIG_MAGIC_SYSRQ -+void normalize_rt_tasks(void) -+{ -+ struct task_struct *g, *p; -+ struct sched_attr attr = { -+ .sched_policy = SCHED_NORMAL, -+ }; -+ -+ read_lock(&tasklist_lock); -+ for_each_process_thread(g, p) { -+ /* -+ * Only normalize user tasks: -+ */ -+ if (p->flags & PF_KTHREAD) -+ continue; -+ -+ if (!rt_task(p)) { -+ /* -+ * Renice negative nice level userspace -+ * tasks back to 0: -+ */ -+ if (task_nice(p) < 0) -+ 
set_user_nice(p, 0); -+ continue; -+ } -+ -+ __sched_setscheduler(p, &attr, false, false); -+ } -+ read_unlock(&tasklist_lock); -+} -+#endif /* CONFIG_MAGIC_SYSRQ */ -+ -+#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) -+/* -+ * These functions are only useful for the IA64 MCA handling, or kdb. -+ * -+ * They can only be called when the whole system has been -+ * stopped - every CPU needs to be quiescent, and no scheduling -+ * activity can take place. Using them for anything else would -+ * be a serious bug, and as a result, they aren't even visible -+ * under any other configuration. -+ */ -+ -+/** -+ * curr_task - return the current task for a given CPU. -+ * @cpu: the processor in question. -+ * -+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! -+ * -+ * Return: The current task for @cpu. -+ */ -+struct task_struct *curr_task(int cpu) -+{ -+ return cpu_curr(cpu); -+} -+ -+#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ -+ -+#ifdef CONFIG_IA64 -+/** -+ * ia64_set_curr_task - set the current task for a given CPU. -+ * @cpu: the processor in question. -+ * @p: the task pointer to set. -+ * -+ * Description: This function must only be used when non-maskable interrupts -+ * are serviced on a separate stack. It allows the architecture to switch the -+ * notion of the current task on a CPU in a non-blocking manner. This function -+ * must be called with all CPU's synchronised, and interrupts disabled, the -+ * and caller must save the original value of the current task (see -+ * curr_task() above) and restore that value before reenabling interrupts and -+ * re-starting the system. -+ * -+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 
-+ */ -+void ia64_set_curr_task(int cpu, struct task_struct *p) -+{ -+ cpu_curr(cpu) = p; -+} -+ -+#endif -+ -+#ifdef CONFIG_SCHED_DEBUG -+void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, -+ struct seq_file *m) -+{} -+ -+void proc_sched_set_task(struct task_struct *p) -+{} -+#endif -+ -+#ifdef CONFIG_CGROUP_SCHED -+static void sched_free_group(struct task_group *tg) -+{ -+ kmem_cache_free(task_group_cache, tg); -+} -+ -+/* allocate runqueue etc for a new task group */ -+struct task_group *sched_create_group(struct task_group *parent) -+{ -+ struct task_group *tg; -+ -+ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); -+ if (!tg) -+ return ERR_PTR(-ENOMEM); -+ -+ return tg; -+} -+ -+void sched_online_group(struct task_group *tg, struct task_group *parent) -+{ -+} -+ -+/* rcu callback to free various structures associated with a task group */ -+static void sched_free_group_rcu(struct rcu_head *rhp) -+{ -+ /* Now it should be safe to free those cfs_rqs */ -+ sched_free_group(container_of(rhp, struct task_group, rcu)); -+} -+ -+void sched_destroy_group(struct task_group *tg) -+{ -+ /* Wait for possible concurrent references to cfs_rqs complete */ -+ call_rcu(&tg->rcu, sched_free_group_rcu); -+} -+ -+void sched_offline_group(struct task_group *tg) -+{ -+} -+ -+static inline struct task_group *css_tg(struct cgroup_subsys_state *css) -+{ -+ return css ? 
container_of(css, struct task_group, css) : NULL; -+} -+ -+static struct cgroup_subsys_state * -+cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) -+{ -+ struct task_group *parent = css_tg(parent_css); -+ struct task_group *tg; -+ -+ if (!parent) { -+ /* This is early initialization for the top cgroup */ -+ return &root_task_group.css; -+ } -+ -+ tg = sched_create_group(parent); -+ if (IS_ERR(tg)) -+ return ERR_PTR(-ENOMEM); -+ return &tg->css; -+} -+ -+/* Expose task group only after completing cgroup initialization */ -+static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ struct task_group *parent = css_tg(css->parent); -+ -+ if (parent) -+ sched_online_group(tg, parent); -+ return 0; -+} -+ -+static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ -+ sched_offline_group(tg); -+} -+ -+static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ -+ /* -+ * Relies on the RCU grace period between css_released() and this. 
-+ */ -+ sched_free_group(tg); -+} -+ -+static void cpu_cgroup_fork(struct task_struct *task) -+{ -+} -+ -+static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) -+{ -+ return 0; -+} -+ -+static void cpu_cgroup_attach(struct cgroup_taskset *tset) -+{ -+} -+ -+static struct cftype cpu_legacy_files[] = { -+ { } /* Terminate */ -+}; -+ -+static struct cftype cpu_files[] = { -+ { } /* terminate */ -+}; -+ -+static int cpu_extra_stat_show(struct seq_file *sf, -+ struct cgroup_subsys_state *css) -+{ -+ return 0; -+} -+ -+struct cgroup_subsys cpu_cgrp_subsys = { -+ .css_alloc = cpu_cgroup_css_alloc, -+ .css_online = cpu_cgroup_css_online, -+ .css_released = cpu_cgroup_css_released, -+ .css_free = cpu_cgroup_css_free, -+ .css_extra_stat_show = cpu_extra_stat_show, -+ .fork = cpu_cgroup_fork, -+ .can_attach = cpu_cgroup_can_attach, -+ .attach = cpu_cgroup_attach, -+ .legacy_cftypes = cpu_files, -+ .legacy_cftypes = cpu_legacy_files, -+ .dfl_cftypes = cpu_files, -+ .early_init = true, -+ .threaded = true, -+}; -+#endif /* CONFIG_CGROUP_SCHED */ -+ -+#undef CREATE_TRACE_POINTS -diff --git a/kernel/sched/pds_sched.h b/kernel/sched/pds_sched.h -new file mode 100644 -index 000000000000..6c3361f06087 ---- /dev/null -+++ b/kernel/sched/pds_sched.h -@@ -0,0 +1,577 @@ -+#ifndef PDS_SCHED_H -+#define PDS_SCHED_H -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#ifdef CONFIG_PARAVIRT -+# include -+#endif -+ -+#include "cpupri.h" -+ -+/* task_struct::on_rq states: */ -+#define TASK_ON_RQ_QUEUED 1 -+#define TASK_ON_RQ_MIGRATING 2 -+ -+static inline int task_on_rq_queued(struct task_struct *p) -+{ -+ return p->on_rq == TASK_ON_RQ_QUEUED; -+} 
-+ -+static inline int task_on_rq_migrating(struct task_struct *p) -+{ -+ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; -+} -+ -+/* -+ * wake flags -+ */ -+#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ -+#define WF_FORK 0x02 /* child wakeup after fork */ -+#define WF_MIGRATED 0x04 /* internal use, task got migrated */ -+ -+/* -+ * rq::clock_update_flags bits -+ */ -+#define RQCF_REQ_SKIP 0x01 -+#define RQCF_ACT_SKIP 0x02 -+#define RQCF_UPDATED 0x04 -+ -+/* -+ * This is the main, per-CPU runqueue data structure. -+ * This data should only be modified by the local cpu. -+ */ -+struct rq { -+ /* runqueue lock: */ -+ raw_spinlock_t lock; -+ -+ struct task_struct __rcu *curr; -+ struct task_struct *idle, *stop; -+ struct mm_struct *prev_mm; -+ -+ struct skiplist_node sl_header; -+ -+ /* switch count */ -+ u64 nr_switches; -+ -+ atomic_t nr_iowait; -+ -+#ifdef CONFIG_MEMBARRIER -+ int membarrier_state; -+#endif -+ -+#ifdef CONFIG_SMP -+ int cpu; /* cpu of this runqueue */ -+ bool online; -+ unsigned int ttwu_pending; -+ unsigned int clock_update_flags; -+ -+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ -+ struct sched_avg avg_irq; -+#endif -+#ifdef CONFIG_SCHED_THERMAL_PRESSURE -+ struct sched_avg avg_thermal; -+#endif -+ -+ unsigned long queued_level; -+ unsigned long pending_level; -+ -+#ifdef CONFIG_SCHED_SMT -+ int active_balance; -+ struct cpu_stop_work active_balance_work; -+#endif -+#endif /* CONFIG_SMP */ -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+ u64 prev_irq_time; -+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ -+#ifdef CONFIG_PARAVIRT -+ u64 prev_steal_time; -+#endif /* CONFIG_PARAVIRT */ -+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING -+ u64 prev_steal_time_rq; -+#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ -+ -+ /* calc_load related fields */ -+ unsigned long calc_load_update; -+ long calc_load_active; -+ -+ u64 clock, last_tick; -+ u64 clock_task; -+ int dither; -+ -+ unsigned long nr_running; -+ unsigned long nr_uninterruptible; -+ -+#ifdef CONFIG_SCHED_HRTICK 
-+#ifdef CONFIG_SMP -+ call_single_data_t hrtick_csd; -+#endif -+ struct hrtimer hrtick_timer; -+#endif -+ -+#ifdef CONFIG_SCHEDSTATS -+ -+ /* latency stats */ -+ struct sched_info rq_sched_info; -+ unsigned long long rq_cpu_time; -+ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ -+ -+ /* sys_sched_yield() stats */ -+ unsigned int yld_count; -+ -+ /* schedule() stats */ -+ unsigned int sched_switch; -+ unsigned int sched_count; -+ unsigned int sched_goidle; -+ -+ /* try_to_wake_up() stats */ -+ unsigned int ttwu_count; -+ unsigned int ttwu_local; -+#endif /* CONFIG_SCHEDSTATS */ -+#ifdef CONFIG_CPU_IDLE -+ /* Must be inspected within a rcu lock section */ -+ struct cpuidle_state *idle_state; -+#endif -+}; -+ -+#define task_contributes_to_load(task) ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \ -+ (task->flags & PF_FROZEN) == 0 && \ -+ (task->state & TASK_NOLOAD) == 0) -+ -+extern unsigned long calc_load_update; -+extern atomic_long_t calc_load_tasks; -+ -+extern void calc_global_load_tick(struct rq *this_rq); -+extern long calc_load_fold_active(struct rq *this_rq, long adjust); -+ -+#ifndef CONFIG_SMP -+extern struct rq *uprq; -+#define cpu_rq(cpu) (uprq) -+#define this_rq() (uprq) -+#define raw_rq() (uprq) -+#define task_rq(p) (uprq) -+#define cpu_curr(cpu) ((uprq)->curr) -+#else /* CONFIG_SMP */ -+DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -+#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) -+#define this_rq() this_cpu_ptr(&runqueues) -+#define raw_rq() raw_cpu_ptr(&runqueues) -+#define task_rq(p) cpu_rq(task_cpu(p)) -+#define cpu_curr(cpu) (cpu_rq(cpu)->curr) -+ -+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) -+void register_sched_domain_sysctl(void); -+void unregister_sched_domain_sysctl(void); -+#else -+static inline void register_sched_domain_sysctl(void) -+{ -+} -+static inline void unregister_sched_domain_sysctl(void) -+{ -+} -+#endif -+ -+#endif /* CONFIG_SMP */ -+ -+#ifndef arch_scale_freq_tick -+static 
__always_inline -+void arch_scale_freq_tick(void) -+{ -+} -+#endif -+ -+#ifndef arch_scale_freq_capacity -+static __always_inline -+unsigned long arch_scale_freq_capacity(int cpu) -+{ -+ return SCHED_CAPACITY_SCALE; -+} -+#endif -+ -+static inline u64 __rq_clock_broken(struct rq *rq) -+{ -+ return READ_ONCE(rq->clock); -+} -+ -+static inline u64 rq_clock(struct rq *rq) -+{ -+ /* -+ * Relax lockdep_assert_held() checking as in VRQ, call to -+ * sched_info_xxxx() may not held rq->lock -+ * lockdep_assert_held(&rq->lock); -+ */ -+ return rq->clock; -+} -+ -+static inline u64 rq_clock_task(struct rq *rq) -+{ -+ /* -+ * Relax lockdep_assert_held() checking as in VRQ, call to -+ * sched_info_xxxx() may not held rq->lock -+ * lockdep_assert_held(&rq->lock); -+ */ -+ return rq->clock_task; -+} -+ -+/** -+ * By default the decay is the default pelt decay period. -+ * The decay shift can change the decay period in -+ * multiples of 32. -+ * Decay shift Decay period(ms) -+ * 0 32 -+ * 1 64 -+ * 2 128 -+ * 3 256 -+ * 4 512 -+ */ -+extern int sched_thermal_decay_shift; -+ -+static inline u64 rq_clock_thermal(struct rq *rq) -+{ -+ return rq_clock_task(rq) >> sched_thermal_decay_shift; -+} -+ -+/* -+ * {de,en}queue flags: -+ * -+ * DEQUEUE_SLEEP - task is no longer runnable -+ * ENQUEUE_WAKEUP - task just became runnable -+ * -+ */ -+ -+#define DEQUEUE_SLEEP 0x01 -+ -+#define ENQUEUE_WAKEUP 0x01 -+ -+ -+/* -+ * Below are scheduler API which using in other kernel code -+ * It use the dummy rq_flags -+ * ToDo : PDS need to support these APIs for compatibility with mainline -+ * scheduler code. 
-+ */ -+struct rq_flags { -+ unsigned long flags; -+ struct pin_cookie cookie; -+ unsigned int clock_update_flags; -+}; -+ -+struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(rq->lock); -+ -+struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(p->pi_lock) -+ __acquires(rq->lock); -+ -+static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock(&rq->lock); -+} -+ -+static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) -+{ -+ rf->cookie = lockdep_pin_lock(&rq->lock); -+ -+#ifdef CONFIG_SCHED_DEBUG -+ rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); -+ rf->clock_update_flags = 0; -+#endif -+} -+ -+static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) -+{ -+#ifdef CONFIG_SCHED_DEBUG -+ if (rq->clock_update_flags > RQCF_ACT_SKIP) -+ rf->clock_update_flags = RQCF_UPDATED; -+#endif -+ -+ lockdep_unpin_lock(&rq->lock, rf->cookie); -+} -+ -+static inline void -+task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) -+ __releases(rq->lock) -+ __releases(p->pi_lock) -+{ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); -+} -+ -+static inline void -+rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) -+ __acquires(rq->lock) -+{ -+ raw_spin_lock_irqsave(&rq->lock, rf->flags); -+ rq_pin_lock(rq, rf); -+} -+ -+static inline void -+rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) -+ __releases(rq->lock) -+{ -+ rq_unpin_lock(rq, rf); -+ raw_spin_unlock_irqrestore(&rq->lock, rf->flags); -+} -+ -+static inline void -+rq_unlock_irq(struct rq *rq, struct rq_flags *rf) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock_irq(&rq->lock); -+} -+ -+static inline void -+rq_unlock(struct rq *rq, struct rq_flags *rf) -+ __releases(rq->lock) -+{ -+ rq_unpin_lock(rq, rf); -+ raw_spin_unlock(&rq->lock); -+} -+ -+static inline struct rq * -+this_rq_lock_irq(struct rq_flags *rf) -+ 
__acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ local_irq_disable(); -+ rq = this_rq(); -+ raw_spin_lock(&rq->lock); -+ -+ return rq; -+} -+ -+static inline int task_current(struct rq *rq, struct task_struct *p) -+{ -+ return rq->curr == p; -+} -+ -+static inline bool task_running(struct task_struct *p) -+{ -+ return p->on_cpu; -+} -+ -+extern struct static_key_false sched_schedstats; -+ -+extern void flush_smp_call_function_from_idle(void); -+ -+#ifdef CONFIG_CPU_IDLE -+static inline void idle_set_state(struct rq *rq, -+ struct cpuidle_state *idle_state) -+{ -+ rq->idle_state = idle_state; -+} -+ -+static inline struct cpuidle_state *idle_get_state(struct rq *rq) -+{ -+ WARN_ON(!rcu_read_lock_held()); -+ return rq->idle_state; -+} -+#else -+static inline void idle_set_state(struct rq *rq, -+ struct cpuidle_state *idle_state) -+{ -+} -+ -+static inline struct cpuidle_state *idle_get_state(struct rq *rq) -+{ -+ return NULL; -+} -+#endif -+ -+static inline int cpu_of(const struct rq *rq) -+{ -+#ifdef CONFIG_SMP -+ return rq->cpu; -+#else -+ return 0; -+#endif -+} -+ -+#include "stats.h" -+ -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+struct irqtime { -+ u64 total; -+ u64 tick_delta; -+ u64 irq_start_time; -+ struct u64_stats_sync sync; -+}; -+ -+DECLARE_PER_CPU(struct irqtime, cpu_irqtime); -+ -+/* -+ * Returns the irqtime minus the softirq time computed by ksoftirqd. -+ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime -+ * and never move forward. 
-+ */ -+static inline u64 irq_time_read(int cpu) -+{ -+ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); -+ unsigned int seq; -+ u64 total; -+ -+ do { -+ seq = __u64_stats_fetch_begin(&irqtime->sync); -+ total = irqtime->total; -+ } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); -+ -+ return total; -+} -+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ -+ -+#ifdef CONFIG_CPU_FREQ -+DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); -+ -+/** -+ * cpufreq_update_util - Take a note about CPU utilization changes. -+ * @rq: Runqueue to carry out the update for. -+ * @flags: Update reason flags. -+ * -+ * This function is called by the scheduler on the CPU whose utilization is -+ * being updated. -+ * -+ * It can only be called from RCU-sched read-side critical sections. -+ * -+ * The way cpufreq is currently arranged requires it to evaluate the CPU -+ * performance state (frequency/voltage) on a regular basis to prevent it from -+ * being stuck in a completely inadequate performance level for too long. -+ * That is not guaranteed to happen if the updates are only triggered from CFS -+ * and DL, though, because they may not be coming in if only RT tasks are -+ * active all the time (or there are RT tasks only). -+ * -+ * As a workaround for that issue, this function is called periodically by the -+ * RT sched class to trigger extra cpufreq updates to prevent it from stalling, -+ * but that really is a band-aid. Going forward it should be replaced with -+ * solutions targeted more specifically at RT tasks. 
-+ */ -+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) -+{ -+ struct update_util_data *data; -+ -+ data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); -+ if (data) -+ data->func(data, rq_clock(rq), flags); -+} -+ -+static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) -+{ -+ if (cpu_of(rq) == smp_processor_id()) -+ cpufreq_update_util(rq, flags); -+} -+#else -+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} -+static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {} -+#endif /* CONFIG_CPU_FREQ */ -+ -+#ifdef CONFIG_NO_HZ_FULL -+extern int __init sched_tick_offload_init(void); -+#else -+static inline int sched_tick_offload_init(void) { return 0; } -+#endif -+ -+#ifdef arch_scale_freq_capacity -+#ifndef arch_scale_freq_invariant -+#define arch_scale_freq_invariant() (true) -+#endif -+#else /* arch_scale_freq_capacity */ -+#define arch_scale_freq_invariant() (false) -+#endif -+ -+extern void schedule_idle(void); -+ -+/* -+ * !! For sched_setattr_nocheck() (kernel) only !! -+ * -+ * This is actually gross. :( -+ * -+ * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE -+ * tasks, but still be able to sleep. We need this on platforms that cannot -+ * atomically change clock frequency. Remove once fast switching will be -+ * available on such platforms. -+ * -+ * SUGOV stands for SchedUtil GOVernor. -+ */ -+#define SCHED_FLAG_SUGOV 0x10000000 -+ -+#ifdef CONFIG_MEMBARRIER -+/* -+ * The scheduler provides memory barriers required by membarrier between: -+ * - prior user-space memory accesses and store to rq->membarrier_state, -+ * - store to rq->membarrier_state and following user-space memory accesses. -+ * In the same way it provides those guarantees around store to rq->curr. 
-+ */ -+static inline void membarrier_switch_mm(struct rq *rq, -+ struct mm_struct *prev_mm, -+ struct mm_struct *next_mm) -+{ -+ int membarrier_state; -+ -+ if (prev_mm == next_mm) -+ return; -+ -+ membarrier_state = atomic_read(&next_mm->membarrier_state); -+ if (READ_ONCE(rq->membarrier_state) == membarrier_state) -+ return; -+ -+ WRITE_ONCE(rq->membarrier_state, membarrier_state); -+} -+#else -+static inline void membarrier_switch_mm(struct rq *rq, -+ struct mm_struct *prev_mm, -+ struct mm_struct *next_mm) -+{ -+} -+#endif -+ -+#ifdef CONFIG_NUMA -+extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); -+#else -+static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) -+{ -+ return nr_cpu_ids; -+} -+#endif -+ -+void swake_up_all_locked(struct swait_queue_head *q); -+void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); -+ -+#endif /* PDS_SCHED_H */ -diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c -index b647d04d9c8b..05b6cfd91842 100644 ---- a/kernel/sched/pelt.c -+++ b/kernel/sched/pelt.c -@@ -250,6 +250,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load) - WRITE_ONCE(sa->util_avg, sa->util_sum / divider); - } - -+#ifndef CONFIG_SCHED_PDS - /* - * sched_entity: - * -@@ -367,6 +368,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) - - return 0; - } -+#endif - - #ifdef CONFIG_SCHED_THERMAL_PRESSURE - /* -diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h -index eb034d9f024d..a074572f2976 100644 ---- a/kernel/sched/pelt.h -+++ b/kernel/sched/pelt.h -@@ -1,11 +1,13 @@ - #ifdef CONFIG_SMP - #include "sched-pelt.h" - -+#ifndef CONFIG_SCHED_PDS - int __update_load_avg_blocked_se(u64 now, struct sched_entity *se); - int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se); - int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); - int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); - int update_dl_rq_load_avg(u64 now, 
struct rq *rq, int running); -+#endif - - #ifdef CONFIG_SCHED_THERMAL_PRESSURE - int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity); -@@ -37,6 +39,7 @@ update_irq_load_avg(struct rq *rq, u64 running) - } - #endif - -+#ifndef CONFIG_SCHED_PDS - /* - * When a task is dequeued, its estimated utilization should not be update if - * its util_avg has not been updated at least once. -@@ -157,9 +160,11 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) - return rq_clock_pelt(rq_of(cfs_rq)); - } - #endif -+#endif /* CONFIG_SCHED_PDS */ - - #else - -+#ifndef CONFIG_SCHED_PDS - static inline int - update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) - { -@@ -188,6 +193,7 @@ static inline u64 thermal_load_avg(struct rq *rq) - { - return 0; - } -+#endif - - static inline int - update_irq_load_avg(struct rq *rq, u64 running) -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index db3a57675ccf..5a8060bd2343 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -2,6 +2,10 @@ - /* - * Scheduler internal types and methods: - */ -+#ifdef CONFIG_SCHED_PDS -+#include "pds_sched.h" -+#else -+ - #include - - #include -@@ -2546,3 +2550,5 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) - - void swake_up_all_locked(struct swait_queue_head *q); - void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); -+ -+#endif /* !CONFIG_SCHED_PDS */ -diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c -index 750fb3c67eed..45bd43942575 100644 ---- a/kernel/sched/stats.c -+++ b/kernel/sched/stats.c -@@ -22,8 +22,10 @@ static int show_schedstat(struct seq_file *seq, void *v) - } else { - struct rq *rq; - #ifdef CONFIG_SMP -+#ifndef CONFIG_SCHED_PDS - struct sched_domain *sd; - int dcount = 0; -+#endif - #endif - cpu = (unsigned long)(v - 2); - rq = cpu_rq(cpu); -@@ -40,6 +42,7 @@ static int show_schedstat(struct seq_file *seq, void *v) - seq_printf(seq, "\n"); - - #ifdef CONFIG_SMP -+#ifndef CONFIG_SCHED_PDS - /* 
domain-specific stats */ - rcu_read_lock(); - for_each_domain(cpu, sd) { -@@ -68,6 +71,7 @@ static int show_schedstat(struct seq_file *seq, void *v) - sd->ttwu_move_balance); - } - rcu_read_unlock(); -+#endif - #endif - } - return 0; -diff --git a/kernel/sysctl.c b/kernel/sysctl.c -index 8a176d8727a3..b9dde576b576 100644 ---- a/kernel/sysctl.c -+++ b/kernel/sysctl.c -@@ -130,9 +130,13 @@ static int __maybe_unused four = 4; - static unsigned long zero_ul; - static unsigned long one_ul = 1; - static unsigned long long_max = LONG_MAX; --static int one_hundred = 100; --static int two_hundred = 200; --static int one_thousand = 1000; -+static int __read_mostly one_hundred = 100; -+static int __read_mostly two_hundred = 200; -+static int __read_mostly one_thousand = 1000; -+#ifdef CONFIG_SCHED_PDS -+extern int rr_interval; -+extern int sched_yield_type; -+#endif - #ifdef CONFIG_PRINTK - static int ten_thousand = 10000; - #endif -@@ -288,7 +292,7 @@ static struct ctl_table sysctl_base_table[] = { - { } - }; - --#ifdef CONFIG_SCHED_DEBUG -+#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_PDS) - static int min_sched_granularity_ns = 100000; /* 100 usecs */ - static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ - static int min_wakeup_granularity_ns; /* 0 usecs */ -@@ -305,6 +309,7 @@ static int max_extfrag_threshold = 1000; - #endif - - static struct ctl_table kern_table[] = { -+#ifndef CONFIG_SCHED_PDS - { - .procname = "sched_child_runs_first", - .data = &sysctl_sched_child_runs_first, -@@ -486,6 +491,7 @@ static struct ctl_table kern_table[] = { - .extra2 = SYSCTL_ONE, - }, - #endif -+#endif /* !CONFIG_SCHED_PDS */ - #ifdef CONFIG_PROVE_LOCKING - { - .procname = "prove_locking", -@@ -1049,6 +1055,26 @@ static struct ctl_table kern_table[] = { - .proc_handler = proc_dointvec, - }, - #endif -+#ifdef CONFIG_SCHED_PDS -+ { -+ .procname = "rr_interval", -+ .data = &rr_interval, -+ .maxlen = sizeof (int), -+ .mode = 0644, -+ .proc_handler = 
&proc_dointvec_minmax, -+ .extra1 = SYSCTL_ONE, -+ .extra2 = &one_thousand, -+ }, -+ { -+ .procname = "yield_type", -+ .data = &sched_yield_type, -+ .maxlen = sizeof (int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = SYSCTL_ZERO, -+ .extra2 = &two, -+ }, -+#endif - #if defined(CONFIG_S390) && defined(CONFIG_SMP) - { - .procname = "spin_retry", -diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c -index 2fd3b3fa68bf..6f3b08afdd4c 100644 ---- a/kernel/time/posix-cpu-timers.c -+++ b/kernel/time/posix-cpu-timers.c -@@ -236,7 +236,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples) - u64 stime, utime; - - task_cputime(p, &utime, &stime); -- store_samples(samples, stime, utime, p->se.sum_exec_runtime); -+ store_samples(samples, stime, utime, tsk_seruntime(p)); - } - - static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, -@@ -806,6 +806,7 @@ static void collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples, - } - } - -+#ifndef CONFIG_SCHED_PDS - static inline void check_dl_overrun(struct task_struct *tsk) - { - if (tsk->dl.dl_overrun) { -@@ -813,6 +814,7 @@ static inline void check_dl_overrun(struct task_struct *tsk) - __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); - } - } -+#endif - - static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard) - { -@@ -840,8 +842,10 @@ static void check_thread_timers(struct task_struct *tsk, - u64 samples[CPUCLOCK_MAX]; - unsigned long soft; - -+#ifndef CONFIG_SCHED_PDS - if (dl_task(tsk)) - check_dl_overrun(tsk); -+#endif - - if (expiry_cache_is_inactive(pct)) - return; -@@ -855,7 +859,7 @@ static void check_thread_timers(struct task_struct *tsk, - soft = task_rlimit(tsk, RLIMIT_RTTIME); - if (soft != RLIM_INFINITY) { - /* Task RT timeout is accounted in jiffies. 
RTTIME is usec */ -- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); -+ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ); - unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); - - /* At the hard limit, send SIGKILL. No further action. */ -@@ -1091,8 +1095,10 @@ static inline bool fastpath_timer_check(struct task_struct *tsk) - return true; - } - -+#ifndef CONFIG_SCHED_PDS - if (dl_task(tsk) && tsk->dl.dl_overrun) - return true; -+#endif - - return false; - } -diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c -index b5e3496cf803..0816db0b9c16 100644 ---- a/kernel/trace/trace_selftest.c -+++ b/kernel/trace/trace_selftest.c -@@ -1048,10 +1048,15 @@ static int trace_wakeup_test_thread(void *data) - { - /* Make this a -deadline thread */ - static const struct sched_attr attr = { -+#ifdef CONFIG_SCHED_PDS -+ /* No deadline on BFS, use RR */ -+ .sched_policy = SCHED_RR, -+#else - .sched_policy = SCHED_DEADLINE, - .sched_runtime = 100000ULL, - .sched_deadline = 10000000ULL, - .sched_period = 10000000ULL -+#endif - }; - struct wakeup_test_data *x = data; - diff --git a/linux58-tkg/linux58-tkg-patches/0006-add-acs-overrides_iommu.patch b/linux58-tkg/linux58-tkg-patches/0006-add-acs-overrides_iommu.patch deleted file mode 100644 index d1303a5..0000000 --- a/linux58-tkg/linux58-tkg-patches/0006-add-acs-overrides_iommu.patch +++ /dev/null @@ -1,193 +0,0 @@ -From cdeab384f48dd9c88e2dff2e9ad8d57dca1a1b1c Mon Sep 17 00:00:00 2001 -From: Mark Weiman -Date: Sun, 12 Aug 2018 11:36:21 -0400 -Subject: [PATCH] pci: Enable overrides for missing ACS capabilities - -This an updated version of Alex Williamson's patch from: -https://lkml.org/lkml/2013/5/30/513 - -Original commit message follows: - -PCIe ACS (Access Control Services) is the PCIe 2.0+ feature that -allows us to control whether transactions are allowed to be redirected -in various subnodes of a PCIe topology. 
For instance, if two -endpoints are below a root port or downsteam switch port, the -downstream port may optionally redirect transactions between the -devices, bypassing upstream devices. The same can happen internally -on multifunction devices. The transaction may never be visible to the -upstream devices. - -One upstream device that we particularly care about is the IOMMU. If -a redirection occurs in the topology below the IOMMU, then the IOMMU -cannot provide isolation between devices. This is why the PCIe spec -encourages topologies to include ACS support. Without it, we have to -assume peer-to-peer DMA within a hierarchy can bypass IOMMU isolation. - -Unfortunately, far too many topologies do not support ACS to make this -a steadfast requirement. Even the latest chipsets from Intel are only -sporadically supporting ACS. We have trouble getting interconnect -vendors to include the PCIe spec required PCIe capability, let alone -suggested features. - -Therefore, we need to add some flexibility. The pcie_acs_override= -boot option lets users opt-in specific devices or sets of devices to -assume ACS support. The "downstream" option assumes full ACS support -on root ports and downstream switch ports. The "multifunction" -option assumes the subset of ACS features available on multifunction -endpoints and upstream switch ports are supported. The "id:nnnn:nnnn" -option enables ACS support on devices matching the provided vendor -and device IDs, allowing more strategic ACS overrides. These options -may be combined in any order. A maximum of 16 id specific overrides -are available. It's suggested to use the most limited set of options -necessary to avoid completely disabling ACS across the topology. -Note to hardware vendors, we have facilities to permanently quirk -specific devices which enforce isolation but not provide an ACS -capability. Please contact me to have your devices added and save -your customers the hassle of this boot option. 
- -Signed-off-by: Mark Weiman ---- - .../admin-guide/kernel-parameters.txt | 9 ++ - drivers/pci/quirks.c | 101 ++++++++++++++++++ - 2 files changed, 110 insertions(+) - -diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index aefd358a5ca3..173b3596fd9e 100644 ---- a/Documentation/admin-guide/kernel-parameters.txt -+++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -3190,6 +3190,15 @@ - nomsi [MSI] If the PCI_MSI kernel config parameter is - enabled, this kernel boot option can be used to - disable the use of MSI interrupts system-wide. -+ pcie_acs_override = -+ [PCIE] Override missing PCIe ACS support for: -+ downstream -+ All downstream ports - full ACS capabilities -+ multifunction -+ All multifunction devices - multifunction ACS subset -+ id:nnnn:nnnn -+ Specific device - full ACS capabilities -+ Specified as vid:did (vendor/device ID) in hex - noioapicquirk [APIC] Disable all boot interrupt quirks. - Safety option to keep boot IRQs enabled. This - should never be necessary. 
-diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c -index 4700d24e5d55..8f7a3d7fd9c1 100644 ---- a/drivers/pci/quirks.c -+++ b/drivers/pci/quirks.c -@@ -3372,6 +3372,106 @@ static void quirk_no_bus_reset(struct pci_dev *dev) - dev->dev_flags |= PCI_DEV_FLAGS_NO_BUS_RESET; - } - -+static bool acs_on_downstream; -+static bool acs_on_multifunction; -+ -+#define NUM_ACS_IDS 16 -+struct acs_on_id { -+ unsigned short vendor; -+ unsigned short device; -+}; -+static struct acs_on_id acs_on_ids[NUM_ACS_IDS]; -+static u8 max_acs_id; -+ -+static __init int pcie_acs_override_setup(char *p) -+{ -+ if (!p) -+ return -EINVAL; -+ -+ while (*p) { -+ if (!strncmp(p, "downstream", 10)) -+ acs_on_downstream = true; -+ if (!strncmp(p, "multifunction", 13)) -+ acs_on_multifunction = true; -+ if (!strncmp(p, "id:", 3)) { -+ char opt[5]; -+ int ret; -+ long val; -+ -+ if (max_acs_id >= NUM_ACS_IDS - 1) { -+ pr_warn("Out of PCIe ACS override slots (%d)\n", -+ NUM_ACS_IDS); -+ goto next; -+ } -+ -+ p += 3; -+ snprintf(opt, 5, "%s", p); -+ ret = kstrtol(opt, 16, &val); -+ if (ret) { -+ pr_warn("PCIe ACS ID parse error %d\n", ret); -+ goto next; -+ } -+ acs_on_ids[max_acs_id].vendor = val; -+ -+ p += strcspn(p, ":"); -+ if (*p != ':') { -+ pr_warn("PCIe ACS invalid ID\n"); -+ goto next; -+ } -+ -+ p++; -+ snprintf(opt, 5, "%s", p); -+ ret = kstrtol(opt, 16, &val); -+ if (ret) { -+ pr_warn("PCIe ACS ID parse error %d\n", ret); -+ goto next; -+ } -+ acs_on_ids[max_acs_id].device = val; -+ max_acs_id++; -+ } -+next: -+ p += strcspn(p, ","); -+ if (*p == ',') -+ p++; -+ } -+ -+ if (acs_on_downstream || acs_on_multifunction || max_acs_id) -+ pr_warn("Warning: PCIe ACS overrides enabled; This may allow non-IOMMU protected peer-to-peer DMA\n"); -+ -+ return 0; -+} -+early_param("pcie_acs_override", pcie_acs_override_setup); -+ -+static int pcie_acs_overrides(struct pci_dev *dev, u16 acs_flags) -+{ -+ int i; -+ -+ /* Never override ACS for legacy devices or devices with ACS caps */ -+ if 
(!pci_is_pcie(dev) || -+ pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ACS)) -+ return -ENOTTY; -+ -+ for (i = 0; i < max_acs_id; i++) -+ if (acs_on_ids[i].vendor == dev->vendor && -+ acs_on_ids[i].device == dev->device) -+ return 1; -+ -+ switch (pci_pcie_type(dev)) { -+ case PCI_EXP_TYPE_DOWNSTREAM: -+ case PCI_EXP_TYPE_ROOT_PORT: -+ if (acs_on_downstream) -+ return 1; -+ break; -+ case PCI_EXP_TYPE_ENDPOINT: -+ case PCI_EXP_TYPE_UPSTREAM: -+ case PCI_EXP_TYPE_LEG_END: -+ case PCI_EXP_TYPE_RC_END: -+ if (acs_on_multifunction && dev->multifunction) -+ return 1; -+ } -+ -+ return -ENOTTY; -+} - /* - * Some Atheros AR9xxx and QCA988x chips do not behave after a bus reset. - * The device will throw a Link Down error on AER-capable systems and -@@ -4513,6 +4613,7 @@ static const struct pci_dev_acs_enabled { - { PCI_VENDOR_ID_ZHAOXIN, 0x9083, pci_quirk_mf_endpoint_acs }, - /* Zhaoxin Root/Downstream Ports */ - { PCI_VENDOR_ID_ZHAOXIN, PCI_ANY_ID, pci_quirk_zhaoxin_pcie_ports_acs }, -+ { PCI_ANY_ID, PCI_ANY_ID, pcie_acs_overrides }, - { 0 } - }; - - diff --git a/linux58-tkg/linux58-tkg-patches/0007-v5.8-fsync.patch b/linux58-tkg/linux58-tkg-patches/0007-v5.8-fsync.patch deleted file mode 100644 index 01c86d8..0000000 --- a/linux58-tkg/linux58-tkg-patches/0007-v5.8-fsync.patch +++ /dev/null @@ -1,908 +0,0 @@ -From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 -From: Tk-Glitch -Date: Mon, 20 Apr 2020 14:09:11 +0200 -Subject: Import Fsync v3 patchset - Squashed from https://gitlab.collabora.com/tonyk/linux/-/commits/futex-proton-v3 - -diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h -index a89eb0accd5e2ee527be1e3e11b1117ff5bf94b4..580001e89c6caed57dd8b3cb491d65dce846caff 100644 ---- a/include/uapi/linux/futex.h -+++ b/include/uapi/linux/futex.h -@@ -21,6 +21,7 @@ - #define FUTEX_WAKE_BITSET 10 - #define FUTEX_WAIT_REQUEUE_PI 11 - #define FUTEX_CMP_REQUEUE_PI 12 -+#define FUTEX_WAIT_MULTIPLE 13 - - #define FUTEX_PRIVATE_FLAG 128 - 
#define FUTEX_CLOCK_REALTIME 256 -@@ -40,6 +41,8 @@ - FUTEX_PRIVATE_FLAG) - #define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \ - FUTEX_PRIVATE_FLAG) -+#define FUTEX_WAIT_MULTIPLE_PRIVATE (FUTEX_WAIT_MULTIPLE | \ -+ FUTEX_PRIVATE_FLAG) - - /* - * Support for robust futexes: the kernel cleans up held futexes at -@@ -150,4 +153,21 @@ struct robust_list_head { - (((op & 0xf) << 28) | ((cmp & 0xf) << 24) \ - | ((oparg & 0xfff) << 12) | (cmparg & 0xfff)) - -+/* -+ * Maximum number of multiple futexes to wait for -+ */ -+#define FUTEX_MULTIPLE_MAX_COUNT 128 -+ -+/** -+ * struct futex_wait_block - Block of futexes to be waited for -+ * @uaddr: User address of the futex -+ * @val: Futex value expected by userspace -+ * @bitset: Bitset for the optional bitmasked wakeup -+ */ -+struct futex_wait_block { -+ __u32 __user *uaddr; -+ __u32 val; -+ __u32 bitset; -+}; -+ - #endif /* _UAPI_LINUX_FUTEX_H */ -diff --git a/kernel/futex.c b/kernel/futex.c -index 0cf84c8664f207c574325b899ef2e57f01295a94..58cf9eb2b851b4858e29b5ef4114a29a92e676ba 100644 ---- a/kernel/futex.c -+++ b/kernel/futex.c -@@ -215,6 +215,8 @@ struct futex_pi_state { - * @rt_waiter: rt_waiter storage for use with requeue_pi - * @requeue_pi_key: the requeue_pi target futex key - * @bitset: bitset for the optional bitmasked wakeup -+ * @uaddr: userspace address of futex -+ * @uval: expected futex's value - * - * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so - * we can wake only the relevant ones (hashed queues may be shared). 
-@@ -237,6 +239,8 @@ struct futex_q { - struct rt_mutex_waiter *rt_waiter; - union futex_key *requeue_pi_key; - u32 bitset; -+ u32 __user *uaddr; -+ u32 uval; - } __randomize_layout; - - static const struct futex_q futex_q_init = { -@@ -2420,6 +2424,29 @@ static int unqueue_me(struct futex_q *q) - return ret; - } - -+/** -+ * unqueue_multiple() - Remove several futexes from their futex_hash_bucket -+ * @q: The list of futexes to unqueue -+ * @count: Number of futexes in the list -+ * -+ * Helper to unqueue a list of futexes. This can't fail. -+ * -+ * Return: -+ * - >=0 - Index of the last futex that was awoken; -+ * - -1 - If no futex was awoken -+ */ -+static int unqueue_multiple(struct futex_q *q, int count) -+{ -+ int ret = -1; -+ int i; -+ -+ for (i = 0; i < count; i++) { -+ if (!unqueue_me(&q[i])) -+ ret = i; -+ } -+ return ret; -+} -+ - /* - * PI futexes can not be requeued and must remove themself from the - * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry -@@ -2783,6 +2810,211 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, - return ret; - } - -+/** -+ * futex_wait_multiple_setup() - Prepare to wait and enqueue multiple futexes -+ * @qs: The corresponding futex list -+ * @count: The size of the lists -+ * @flags: Futex flags (FLAGS_SHARED, etc.) -+ * @awaken: Index of the last awoken futex -+ * -+ * Prepare multiple futexes in a single step and enqueue them. This may fail if -+ * the futex list is invalid or if any futex was already awoken. On success the -+ * task is ready to interruptible sleep. 
-+ * -+ * Return: -+ * - 1 - One of the futexes was awaken by another thread -+ * - 0 - Success -+ * - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL -+ */ -+static int futex_wait_multiple_setup(struct futex_q *qs, int count, -+ unsigned int flags, int *awaken) -+{ -+ struct futex_hash_bucket *hb; -+ int ret, i; -+ u32 uval; -+ -+ /* -+ * Enqueuing multiple futexes is tricky, because we need to -+ * enqueue each futex in the list before dealing with the next -+ * one to avoid deadlocking on the hash bucket. But, before -+ * enqueuing, we need to make sure that current->state is -+ * TASK_INTERRUPTIBLE, so we don't absorb any awake events, which -+ * cannot be done before the get_futex_key of the next key, -+ * because it calls get_user_pages, which can sleep. Thus, we -+ * fetch the list of futexes keys in two steps, by first pinning -+ * all the memory keys in the futex key, and only then we read -+ * each key and queue the corresponding futex. -+ */ -+retry: -+ for (i = 0; i < count; i++) { -+ qs[i].key = FUTEX_KEY_INIT; -+ ret = get_futex_key(qs[i].uaddr, flags & FLAGS_SHARED, -+ &qs[i].key, FUTEX_READ); -+ if (unlikely(ret)) { -+ for (--i; i >= 0; i--) -+ put_futex_key(&qs[i].key); -+ return ret; -+ } -+ } -+ -+ set_current_state(TASK_INTERRUPTIBLE); -+ -+ for (i = 0; i < count; i++) { -+ struct futex_q *q = &qs[i]; -+ -+ hb = queue_lock(q); -+ -+ ret = get_futex_value_locked(&uval, q->uaddr); -+ if (ret) { -+ /* -+ * We need to try to handle the fault, which -+ * cannot be done without sleep, so we need to -+ * undo all the work already done, to make sure -+ * we don't miss any wake ups. Therefore, clean -+ * up, handle the fault and retry from the -+ * beginning. -+ */ -+ queue_unlock(hb); -+ -+ /* -+ * Keys 0..(i-1) are implicitly put -+ * on unqueue_multiple. -+ */ -+ put_futex_key(&q->key); -+ -+ *awaken = unqueue_multiple(qs, i); -+ -+ __set_current_state(TASK_RUNNING); -+ -+ /* -+ * On a real fault, prioritize the error even if -+ * some other futex was awoken. 
Userspace gave -+ * us a bad address, -EFAULT them. -+ */ -+ ret = get_user(uval, q->uaddr); -+ if (ret) -+ return ret; -+ -+ /* -+ * Even if the page fault was handled, If -+ * something was already awaken, we can safely -+ * give up and succeed to give a hint for userspace to -+ * acquire the right futex faster. -+ */ -+ if (*awaken >= 0) -+ return 1; -+ -+ goto retry; -+ } -+ -+ if (uval != q->uval) { -+ queue_unlock(hb); -+ -+ put_futex_key(&qs[i].key); -+ -+ /* -+ * If something was already awaken, we can -+ * safely ignore the error and succeed. -+ */ -+ *awaken = unqueue_multiple(qs, i); -+ __set_current_state(TASK_RUNNING); -+ if (*awaken >= 0) -+ return 1; -+ -+ return -EWOULDBLOCK; -+ } -+ -+ /* -+ * The bucket lock can't be held while dealing with the -+ * next futex. Queue each futex at this moment so hb can -+ * be unlocked. -+ */ -+ queue_me(&qs[i], hb); -+ } -+ return 0; -+} -+ -+/** -+ * futex_wait_multiple() - Prepare to wait on and enqueue several futexes -+ * @qs: The list of futexes to wait on -+ * @op: Operation code from futex's syscall -+ * @count: The number of objects -+ * @abs_time: Timeout before giving up and returning to userspace -+ * -+ * Entry point for the FUTEX_WAIT_MULTIPLE futex operation, this function -+ * sleeps on a group of futexes and returns on the first futex that -+ * triggered, or after the timeout has elapsed. 
-+ * -+ * Return: -+ * - >=0 - Hint to the futex that was awoken -+ * - <0 - On error -+ */ -+static int futex_wait_multiple(struct futex_q *qs, int op, -+ u32 count, ktime_t *abs_time) -+{ -+ struct hrtimer_sleeper timeout, *to; -+ int ret, flags = 0, hint = 0; -+ unsigned int i; -+ -+ if (!(op & FUTEX_PRIVATE_FLAG)) -+ flags |= FLAGS_SHARED; -+ -+ if (op & FUTEX_CLOCK_REALTIME) -+ flags |= FLAGS_CLOCKRT; -+ -+ to = futex_setup_timer(abs_time, &timeout, flags, 0); -+ while (1) { -+ ret = futex_wait_multiple_setup(qs, count, flags, &hint); -+ if (ret) { -+ if (ret > 0) { -+ /* A futex was awaken during setup */ -+ ret = hint; -+ } -+ break; -+ } -+ -+ if (to) -+ hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); -+ -+ /* -+ * Avoid sleeping if another thread already tried to -+ * wake us. -+ */ -+ for (i = 0; i < count; i++) { -+ if (plist_node_empty(&qs[i].list)) -+ break; -+ } -+ -+ if (i == count && (!to || to->task)) -+ freezable_schedule(); -+ -+ ret = unqueue_multiple(qs, count); -+ -+ __set_current_state(TASK_RUNNING); -+ -+ if (ret >= 0) -+ break; -+ if (to && !to->task) { -+ ret = -ETIMEDOUT; -+ break; -+ } else if (signal_pending(current)) { -+ ret = -ERESTARTSYS; -+ break; -+ } -+ /* -+ * The final case is a spurious wakeup, for -+ * which just retry. 
-+ */ -+ } -+ -+ if (to) { -+ hrtimer_cancel(&to->timer); -+ destroy_hrtimer_on_stack(&to->timer); -+ } -+ -+ return ret; -+} -+ - static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, - ktime_t *abs_time, u32 bitset) - { -@@ -3907,6 +4139,43 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, - return -ENOSYS; - } - -+/** -+ * futex_read_wait_block - Read an array of futex_wait_block from userspace -+ * @uaddr: Userspace address of the block -+ * @count: Number of blocks to be read -+ * -+ * This function creates and allocate an array of futex_q (we zero it to -+ * initialize the fields) and then, for each futex_wait_block element from -+ * userspace, fill a futex_q element with proper values. -+ */ -+inline struct futex_q *futex_read_wait_block(u32 __user *uaddr, u32 count) -+{ -+ unsigned int i; -+ struct futex_q *qs; -+ struct futex_wait_block fwb; -+ struct futex_wait_block __user *entry = -+ (struct futex_wait_block __user *)uaddr; -+ -+ if (!count || count > FUTEX_MULTIPLE_MAX_COUNT) -+ return ERR_PTR(-EINVAL); -+ -+ qs = kcalloc(count, sizeof(*qs), GFP_KERNEL); -+ if (!qs) -+ return ERR_PTR(-ENOMEM); -+ -+ for (i = 0; i < count; i++) { -+ if (copy_from_user(&fwb, &entry[i], sizeof(fwb))) { -+ kfree(qs); -+ return ERR_PTR(-EFAULT); -+ } -+ -+ qs[i].uaddr = fwb.uaddr; -+ qs[i].uval = fwb.val; -+ qs[i].bitset = fwb.bitset; -+ } -+ -+ return qs; -+} - - SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, - struct __kernel_timespec __user *, utime, u32 __user *, uaddr2, -@@ -3919,7 +4188,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, - - if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || - cmd == FUTEX_WAIT_BITSET || -- cmd == FUTEX_WAIT_REQUEUE_PI)) { -+ cmd == FUTEX_WAIT_REQUEUE_PI || -+ cmd == FUTEX_WAIT_MULTIPLE)) { - if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) - return -EFAULT; - if (get_timespec64(&ts, utime)) -@@ -3940,6 +4210,25 @@ SYSCALL_DEFINE6(futex, u32 
__user *, uaddr, int, op, u32, val, - cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) - val2 = (u32) (unsigned long) utime; - -+ if (cmd == FUTEX_WAIT_MULTIPLE) { -+ int ret; -+ struct futex_q *qs; -+ -+#ifdef CONFIG_X86_X32 -+ if (unlikely(in_x32_syscall())) -+ return -ENOSYS; -+#endif -+ qs = futex_read_wait_block(uaddr, val); -+ -+ if (IS_ERR(qs)) -+ return PTR_ERR(qs); -+ -+ ret = futex_wait_multiple(qs, op, val, tp); -+ kfree(qs); -+ -+ return ret; -+ } -+ - return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); - } - -@@ -4102,6 +4391,57 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, - #endif /* CONFIG_COMPAT */ - - #ifdef CONFIG_COMPAT_32BIT_TIME -+/** -+ * struct compat_futex_wait_block - Block of futexes to be waited for -+ * @uaddr: User address of the futex (compatible pointer) -+ * @val: Futex value expected by userspace -+ * @bitset: Bitset for the optional bitmasked wakeup -+ */ -+struct compat_futex_wait_block { -+ compat_uptr_t uaddr; -+ __u32 val; -+ __u32 bitset; -+}; -+ -+/** -+ * compat_futex_read_wait_block - Read an array of futex_wait_block from -+ * userspace -+ * @uaddr: Userspace address of the block -+ * @count: Number of blocks to be read -+ * -+ * This function does the same as futex_read_wait_block(), except that it -+ * converts the pointer to the futex from the compat version to the regular one. 
-+ */ -+inline struct futex_q *compat_futex_read_wait_block(u32 __user *uaddr, -+ u32 count) -+{ -+ unsigned int i; -+ struct futex_q *qs; -+ struct compat_futex_wait_block fwb; -+ struct compat_futex_wait_block __user *entry = -+ (struct compat_futex_wait_block __user *)uaddr; -+ -+ if (!count || count > FUTEX_MULTIPLE_MAX_COUNT) -+ return ERR_PTR(-EINVAL); -+ -+ qs = kcalloc(count, sizeof(*qs), GFP_KERNEL); -+ if (!qs) -+ return ERR_PTR(-ENOMEM); -+ -+ for (i = 0; i < count; i++) { -+ if (copy_from_user(&fwb, &entry[i], sizeof(fwb))) { -+ kfree(qs); -+ return ERR_PTR(-EFAULT); -+ } -+ -+ qs[i].uaddr = compat_ptr(fwb.uaddr); -+ qs[i].uval = fwb.val; -+ qs[i].bitset = fwb.bitset; -+ } -+ -+ return qs; -+} -+ - SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, - struct old_timespec32 __user *, utime, u32 __user *, uaddr2, - u32, val3) -@@ -4113,7 +4453,8 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, - - if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || - cmd == FUTEX_WAIT_BITSET || -- cmd == FUTEX_WAIT_REQUEUE_PI)) { -+ cmd == FUTEX_WAIT_REQUEUE_PI || -+ cmd == FUTEX_WAIT_MULTIPLE)) { - if (get_old_timespec32(&ts, utime)) - return -EFAULT; - if (!timespec64_valid(&ts)) -@@ -4128,6 +4469,19 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, - cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) - val2 = (int) (unsigned long) utime; - -+ if (cmd == FUTEX_WAIT_MULTIPLE) { -+ int ret; -+ struct futex_q *qs = compat_futex_read_wait_block(uaddr, val); -+ -+ if (IS_ERR(qs)) -+ return PTR_ERR(qs); -+ -+ ret = futex_wait_multiple(qs, op, val, tp); -+ kfree(qs); -+ -+ return ret; -+ } -+ - return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); - } - #endif /* CONFIG_COMPAT_32BIT_TIME */ -diff --git a/tools/testing/selftests/futex/functional/futex_wait_timeout.c b/tools/testing/selftests/futex/functional/futex_wait_timeout.c -index 
ee55e6d389a3f053194435342c4e471dc7cf8786..2a63e1c2cfb6407a5988233217cff2e52787bc66 100644 ---- a/tools/testing/selftests/futex/functional/futex_wait_timeout.c -+++ b/tools/testing/selftests/futex/functional/futex_wait_timeout.c -@@ -11,6 +11,7 @@ - * - * HISTORY - * 2009-Nov-6: Initial version by Darren Hart -+ * 2019-Dec-13: Add WAIT_MULTIPLE test by Krisman - * - *****************************************************************************/ - -@@ -41,6 +42,8 @@ int main(int argc, char *argv[]) - { - futex_t f1 = FUTEX_INITIALIZER; - struct timespec to; -+ time_t secs; -+ struct futex_wait_block fwb = {&f1, f1, 0}; - int res, ret = RET_PASS; - int c; - -@@ -65,7 +68,7 @@ int main(int argc, char *argv[]) - } - - ksft_print_header(); -- ksft_set_plan(1); -+ ksft_set_plan(2); - ksft_print_msg("%s: Block on a futex and wait for timeout\n", - basename(argv[0])); - ksft_print_msg("\tArguments: timeout=%ldns\n", timeout_ns); -@@ -79,8 +82,39 @@ int main(int argc, char *argv[]) - if (!res || errno != ETIMEDOUT) { - fail("futex_wait returned %d\n", ret < 0 ? errno : ret); - ret = RET_FAIL; -+ } else -+ ksft_test_result_pass("futex_wait timeout succeeds\n"); -+ -+ info("Calling futex_wait_multiple on f1: %u @ %p\n", f1, &f1); -+ -+ /* Setup absolute time */ -+ ret = clock_gettime(CLOCK_REALTIME, &to); -+ secs = (to.tv_nsec + timeout_ns) / 1000000000; -+ to.tv_nsec = ((int64_t)to.tv_nsec + timeout_ns) % 1000000000; -+ to.tv_sec += secs; -+ info("to.tv_sec = %ld\n", to.tv_sec); -+ info("to.tv_nsec = %ld\n", to.tv_nsec); -+ -+ res = futex_wait_multiple(&fwb, 1, &to, -+ FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME); -+ -+#ifdef __ILP32__ -+ if (res == -1 && errno == ENOSYS) { -+ ksft_test_result_skip("futex_wait_multiple not supported at x32\n"); -+ } else { -+ ksft_test_result_fail("futex_wait_multiple returned %d\n", -+ res < 0 ? 
errno : res); -+ ret = RET_FAIL; - } -+#else -+ if (!res || errno != ETIMEDOUT) { -+ ksft_test_result_fail("futex_wait_multiple returned %d\n", -+ res < 0 ? errno : res); -+ ret = RET_FAIL; -+ } else -+ ksft_test_result_pass("futex_wait_multiple timeout succeeds\n"); -+#endif /* __ILP32__ */ - -- print_result(TEST_NAME, ret); -+ ksft_print_cnts(); - return ret; - } -diff --git a/tools/testing/selftests/futex/include/futextest.h b/tools/testing/selftests/futex/include/futextest.h -index ddbcfc9b7bac4aebb5bac2f249e26ecfd948aa84..bb103bef4557012ef9a389ca74c868e4476a8a31 100644 ---- a/tools/testing/selftests/futex/include/futextest.h -+++ b/tools/testing/selftests/futex/include/futextest.h -@@ -38,6 +38,14 @@ typedef volatile u_int32_t futex_t; - #ifndef FUTEX_CMP_REQUEUE_PI - #define FUTEX_CMP_REQUEUE_PI 12 - #endif -+#ifndef FUTEX_WAIT_MULTIPLE -+#define FUTEX_WAIT_MULTIPLE 13 -+struct futex_wait_block { -+ futex_t *uaddr; -+ futex_t val; -+ __u32 bitset; -+}; -+#endif - #ifndef FUTEX_WAIT_REQUEUE_PI_PRIVATE - #define FUTEX_WAIT_REQUEUE_PI_PRIVATE (FUTEX_WAIT_REQUEUE_PI | \ - FUTEX_PRIVATE_FLAG) -@@ -80,6 +88,20 @@ futex_wait(futex_t *uaddr, futex_t val, struct timespec *timeout, int opflags) - return futex(uaddr, FUTEX_WAIT, val, timeout, NULL, 0, opflags); - } - -+/** -+ * futex_wait_multiple() - block on several futexes with optional timeout -+ * @fwb: wait block user space address -+ * @count: number of entities at fwb -+ * @timeout: absolute timeout -+ */ -+static inline int -+futex_wait_multiple(struct futex_wait_block *fwb, int count, -+ struct timespec *timeout, int opflags) -+{ -+ return futex(fwb, FUTEX_WAIT_MULTIPLE, count, timeout, NULL, 0, -+ opflags); -+} -+ - /** - * futex_wake() - wake one or more tasks blocked on uaddr - * @nr_wake: wake up to this many tasks -diff --git a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c -index 
0ae390ff816449c88d0bb655a26eb014382c2b4f..bcbac042992d447e0bc9ef5fefe94e875de310f2 100644 ---- a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c -+++ b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c -@@ -12,6 +12,7 @@ - * - * HISTORY - * 2009-Nov-14: Initial version by Gowrishankar -+ * 2019-Dec-13: Add WAIT_MULTIPLE test by Krisman - * - *****************************************************************************/ - -@@ -40,6 +41,7 @@ int main(int argc, char *argv[]) - { - struct timespec to = {.tv_sec = 0, .tv_nsec = timeout_ns}; - futex_t f1 = FUTEX_INITIALIZER; -+ struct futex_wait_block fwb = {&f1, f1+1, 0}; - int res, ret = RET_PASS; - int c; - -@@ -61,7 +63,7 @@ int main(int argc, char *argv[]) - } - - ksft_print_header(); -- ksft_set_plan(1); -+ ksft_set_plan(2); - ksft_print_msg("%s: Test the unexpected futex value in FUTEX_WAIT\n", - basename(argv[0])); - -@@ -71,8 +73,30 @@ int main(int argc, char *argv[]) - fail("futex_wait returned: %d %s\n", - res ? errno : res, res ? strerror(errno) : ""); - ret = RET_FAIL; -+ } else -+ ksft_test_result_pass("futex_wait wouldblock succeeds\n"); -+ -+ info("Calling futex_wait_multiple on f1: %u @ %p with val=%u\n", -+ f1, &f1, f1+1); -+ res = futex_wait_multiple(&fwb, 1, NULL, FUTEX_PRIVATE_FLAG); -+ -+#ifdef __ILP32__ -+ if (res != -1 || errno != ENOSYS) { -+ ksft_test_result_fail("futex_wait_multiple returned %d\n", -+ res < 0 ? errno : res); -+ ret = RET_FAIL; -+ } else { -+ ksft_test_result_skip("futex_wait_multiple not supported at x32\n"); -+ } -+#else -+ if (!res || errno != EWOULDBLOCK) { -+ ksft_test_result_fail("futex_wait_multiple returned %d\n", -+ res < 0 ? 
errno : res); -+ ret = RET_FAIL; - } -+ ksft_test_result_pass("futex_wait_multiple wouldblock succeeds\n"); -+#endif /* __ILP32__ */ - -- print_result(TEST_NAME, ret); -+ ksft_print_cnts(); - return ret; - } -diff --git a/tools/testing/selftests/futex/functional/.gitignore b/tools/testing/selftests/futex/functional/.gitignore -index a09f570619023750f558c84004aff166b4337d72..4660128a545edb04a17cc6bd9760931c1386122f 100644 ---- a/tools/testing/selftests/futex/functional/.gitignore -+++ b/tools/testing/selftests/futex/functional/.gitignore -@@ -5,3 +5,4 @@ futex_wait_private_mapped_file - futex_wait_timeout - futex_wait_uninitialized_heap - futex_wait_wouldblock -+futex_wait_multiple -diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile -index 30996306cabcfe89a47977643e529b122893bb7e..75f9fface11fa3c90c1bdb9a49b3ea51291afd58 100644 ---- a/tools/testing/selftests/futex/functional/Makefile -+++ b/tools/testing/selftests/futex/functional/Makefile -@@ -14,7 +14,8 @@ TEST_GEN_FILES := \ - futex_requeue_pi_signal_restart \ - futex_requeue_pi_mismatched_ops \ - futex_wait_uninitialized_heap \ -- futex_wait_private_mapped_file -+ futex_wait_private_mapped_file \ -+ futex_wait_multiple - - TEST_PROGS := run.sh - -diff --git a/tools/testing/selftests/futex/functional/futex_wait_multiple.c b/tools/testing/selftests/futex/functional/futex_wait_multiple.c -new file mode 100644 -index 0000000000000000000000000000000000000000..b48422e79f42edba1653bb0bd2a4c4fd98d2d48d ---- /dev/null -+++ b/tools/testing/selftests/futex/functional/futex_wait_multiple.c -@@ -0,0 +1,173 @@ -+// SPDX-License-Identifier: GPL-2.0-or-later -+/****************************************************************************** -+ * -+ * Copyright © Collabora, Ltd., 2019 -+ * -+ * DESCRIPTION -+ * Test basic semantics of FUTEX_WAIT_MULTIPLE -+ * -+ * AUTHOR -+ * Gabriel Krisman Bertazi -+ * -+ * HISTORY -+ * 2019-Dec-13: Initial version by Krisman -+ * 
-+ *****************************************************************************/ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "futextest.h" -+#include "logging.h" -+ -+#define TEST_NAME "futex-wait-multiple" -+#define timeout_ns 100000 -+#define MAX_COUNT 128 -+#define WAKE_WAIT_US 3000000 -+ -+int ret = RET_PASS; -+char *progname; -+futex_t f[MAX_COUNT] = {0}; -+struct futex_wait_block fwb[MAX_COUNT]; -+ -+void usage(char *prog) -+{ -+ printf("Usage: %s\n", prog); -+ printf(" -c Use color\n"); -+ printf(" -h Display this help message\n"); -+ printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", -+ VQUIET, VCRITICAL, VINFO); -+} -+ -+void test_count_overflow(void) -+{ -+ futex_t f = FUTEX_INITIALIZER; -+ struct futex_wait_block fwb[MAX_COUNT+1]; -+ int res, i; -+ -+ ksft_print_msg("%s: Test a too big number of futexes\n", progname); -+ -+ for (i = 0; i < MAX_COUNT+1; i++) { -+ fwb[i].uaddr = &f; -+ fwb[i].val = f; -+ fwb[i].bitset = 0; -+ } -+ -+ res = futex_wait_multiple(fwb, MAX_COUNT+1, NULL, FUTEX_PRIVATE_FLAG); -+ -+#ifdef __ILP32__ -+ if (res != -1 || errno != ENOSYS) { -+ ksft_test_result_fail("futex_wait_multiple returned %d\n", -+ res < 0 ? errno : res); -+ ret = RET_FAIL; -+ } else { -+ ksft_test_result_skip("futex_wait_multiple not supported at x32\n"); -+ } -+#else -+ if (res != -1 || errno != EINVAL) { -+ ksft_test_result_fail("futex_wait_multiple returned %d\n", -+ res < 0 ? errno : res); -+ ret = RET_FAIL; -+ } else { -+ ksft_test_result_pass("futex_wait_multiple count overflow succeed\n"); -+ } -+ -+#endif /* __ILP32__ */ -+} -+ -+void *waiterfn(void *arg) -+{ -+ int res; -+ -+ res = futex_wait_multiple(fwb, MAX_COUNT, NULL, FUTEX_PRIVATE_FLAG); -+ -+#ifdef __ILP32__ -+ if (res != -1 || errno != ENOSYS) { -+ ksft_test_result_fail("futex_wait_multiple returned %d\n", -+ res < 0 ? 
errno : res); -+ ret = RET_FAIL; -+ } else { -+ ksft_test_result_skip("futex_wait_multiple not supported at x32\n"); -+ } -+#else -+ if (res < 0) -+ ksft_print_msg("waiter failed %d\n", res); -+ -+ info("futex_wait_multiple: Got hint futex %d was freed\n", res); -+#endif /* __ILP32__ */ -+ -+ return NULL; -+} -+ -+void test_fwb_wakeup(void) -+{ -+ int res, i; -+ pthread_t waiter; -+ -+ ksft_print_msg("%s: Test wake up in a list of futex\n", progname); -+ -+ for (i = 0; i < MAX_COUNT; i++) { -+ fwb[i].uaddr = &f[i]; -+ fwb[i].val = f[i]; -+ fwb[i].bitset = 0xffffffff; -+ } -+ -+ res = pthread_create(&waiter, NULL, waiterfn, NULL); -+ if (res) { -+ ksft_test_result_fail("Creating waiting thread failed"); -+ ksft_exit_fail(); -+ } -+ -+ usleep(WAKE_WAIT_US); -+ res = futex_wake(&(f[MAX_COUNT-1]), 1, FUTEX_PRIVATE_FLAG); -+ if (res != 1) { -+ ksft_test_result_fail("Failed to wake thread res=%d\n", res); -+ ksft_exit_fail(); -+ } -+ -+ pthread_join(waiter, NULL); -+ ksft_test_result_pass("%s succeed\n", __func__); -+} -+ -+int main(int argc, char *argv[]) -+{ -+ int c; -+ -+ while ((c = getopt(argc, argv, "cht:v:")) != -1) { -+ switch (c) { -+ case 'c': -+ log_color(1); -+ break; -+ case 'h': -+ usage(basename(argv[0])); -+ exit(0); -+ case 'v': -+ log_verbosity(atoi(optarg)); -+ break; -+ default: -+ usage(basename(argv[0])); -+ exit(1); -+ } -+ } -+ -+ progname = basename(argv[0]); -+ -+ ksft_print_header(); -+ ksft_set_plan(2); -+ -+ test_count_overflow(); -+ -+#ifdef __ILP32__ -+ // if it's a 32x binary, there's no futex to wakeup -+ ksft_test_result_skip("futex_wait_multiple not supported at x32\n"); -+#else -+ test_fwb_wakeup(); -+#endif /* __ILP32__ */ -+ -+ ksft_print_cnts(); -+ return ret; -+} -diff --git a/tools/testing/selftests/futex/functional/run.sh b/tools/testing/selftests/futex/functional/run.sh -index 1acb6ace1680e8f3d6b3ee2dc528c19ddfdb018e..a8be94f28ff78b4879d2d19bca5d9b0fcb26c1f8 100755 ---- a/tools/testing/selftests/futex/functional/run.sh -+++ 
b/tools/testing/selftests/futex/functional/run.sh -@@ -73,3 +73,6 @@ echo - echo - ./futex_wait_uninitialized_heap $COLOR - ./futex_wait_private_mapped_file $COLOR -+ -+echo -+./futex_wait_multiple $COLOR -diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h -index 580001e89c6caed57dd8b3cb491d65dce846caff..a3e760886b8e7e74285fdcf2caaaa6f66ad16675 100644 ---- a/include/uapi/linux/futex.h -+++ b/include/uapi/linux/futex.h -@@ -21,7 +21,7 @@ - #define FUTEX_WAKE_BITSET 10 - #define FUTEX_WAIT_REQUEUE_PI 11 - #define FUTEX_CMP_REQUEUE_PI 12 --#define FUTEX_WAIT_MULTIPLE 13 -+#define FUTEX_WAIT_MULTIPLE 31 - - #define FUTEX_PRIVATE_FLAG 128 - #define FUTEX_CLOCK_REALTIME 256 -diff --git a/kernel/futex.c b/kernel/futex.c -index 58cf9eb2b851b4858e29b5ef4114a29a92e676ba..e0bb628a5e1988dcc9ae5442a4259edc229d578d 100644 ---- a/kernel/futex.c -+++ b/kernel/futex.c -@@ -4198,7 +4198,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, - return -EINVAL; - - t = timespec64_to_ktime(ts); -- if (cmd == FUTEX_WAIT) -+ if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) - t = ktime_add_safe(ktime_get(), t); - tp = &t; - } -@@ -4399,6 +4399,7 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, - */ - struct compat_futex_wait_block { - compat_uptr_t uaddr; -+ __u32 pad; - __u32 val; - __u32 bitset; - }; -@@ -4461,7 +4462,7 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, - return -EINVAL; - - t = timespec64_to_ktime(ts); -- if (cmd == FUTEX_WAIT) -+ if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) - t = ktime_add_safe(ktime_get(), t); - tp = &t; - } diff --git a/linux58-tkg/linux58-tkg-patches/0008-5.8-bcachefs.patch b/linux58-tkg/linux58-tkg-patches/0008-5.8-bcachefs.patch deleted file mode 100644 index 69cd9f9..0000000 --- a/linux58-tkg/linux58-tkg-patches/0008-5.8-bcachefs.patch +++ /dev/null @@ -1,70598 +0,0 @@ -diff --git a/block/bio.c b/block/bio.c -index a7366c02c9b5..9a5a289757f9 100644 ---- a/block/bio.c -+++ 
b/block/bio.c -@@ -1316,6 +1316,7 @@ void bio_set_pages_dirty(struct bio *bio) - set_page_dirty_lock(bvec->bv_page); - } - } -+EXPORT_SYMBOL_GPL(bio_set_pages_dirty); - - /* - * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. -@@ -1375,6 +1376,7 @@ void bio_check_pages_dirty(struct bio *bio) - spin_unlock_irqrestore(&bio_dirty_lock, flags); - schedule_work(&bio_dirty_work); - } -+EXPORT_SYMBOL_GPL(bio_check_pages_dirty); - - static inline bool bio_remaining_done(struct bio *bio) - { -diff --git a/block/blk-core.c b/block/blk-core.c -index 03252af8c82c..71907944fa78 100644 ---- a/block/blk-core.c -+++ b/block/blk-core.c -@@ -215,18 +215,23 @@ int blk_status_to_errno(blk_status_t status) - } - EXPORT_SYMBOL_GPL(blk_status_to_errno); - --static void print_req_error(struct request *req, blk_status_t status, -- const char *caller) -+const char *blk_status_to_str(blk_status_t status) - { - int idx = (__force int)status; - - if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) -- return; -+ return "(invalid error)"; -+ return blk_errors[idx].name; -+} -+EXPORT_SYMBOL_GPL(blk_status_to_str); - -+static void print_req_error(struct request *req, blk_status_t status, -+ const char *caller) -+{ - printk_ratelimited(KERN_ERR - "%s: %s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x " - "phys_seg %u prio class %u\n", -- caller, blk_errors[idx].name, -+ caller, blk_status_to_str(status), - req->rq_disk ? req->rq_disk->disk_name : "?", - blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)), - req->cmd_flags & ~REQ_OP_MASK, -diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig -index bf7dd96db9b3..14274562f6e1 100644 ---- a/drivers/md/bcache/Kconfig -+++ b/drivers/md/bcache/Kconfig -@@ -3,6 +3,7 @@ - config BCACHE - tristate "Block device as cache" - select CRC64 -+ select CLOSURES - help - Allows a block device to be used as cache for other devices; uses - a btree for indexing and the layout is optimized for SSDs. 
-@@ -18,15 +19,6 @@ config BCACHE_DEBUG - Enables extra debugging tools, allows expensive runtime checks to be - turned on. - --config BCACHE_CLOSURES_DEBUG -- bool "Debug closures" -- depends on BCACHE -- select DEBUG_FS -- help -- Keeps all active closures in a linked list and provides a debugfs -- interface to list them, which makes it possible to see asynchronous -- operations that get stuck. -- - config BCACHE_ASYNC_REGISTRAION - bool "Asynchronous device registration (EXPERIMENTAL)" - depends on BCACHE -diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile -index fd714628da6a..0fb1b6009da3 100644 ---- a/drivers/md/bcache/Makefile -+++ b/drivers/md/bcache/Makefile -@@ -2,6 +2,6 @@ - - obj-$(CONFIG_BCACHE) += bcache.o - --bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\ -- io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ -+bcache-y := alloc.o bset.o btree.o debug.o extents.o io.o\ -+ journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ - util.o writeback.o -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index 221e0191b687..4e82115c5524 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -180,6 +180,7 @@ - - #include - #include -+#include - #include - #include - #include -@@ -192,7 +193,6 @@ - - #include "bset.h" - #include "util.h" --#include "closure.h" - - struct bucket { - atomic_t pin; -diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c -deleted file mode 100644 -index 0164a1fe94a9..000000000000 ---- a/drivers/md/bcache/closure.c -+++ /dev/null -@@ -1,217 +0,0 @@ --// SPDX-License-Identifier: GPL-2.0 --/* -- * Asynchronous refcounty things -- * -- * Copyright 2010, 2011 Kent Overstreet -- * Copyright 2012 Google, Inc. 
-- */ -- --#include --#include --#include --#include -- --#include "closure.h" -- --static inline void closure_put_after_sub(struct closure *cl, int flags) --{ -- int r = flags & CLOSURE_REMAINING_MASK; -- -- BUG_ON(flags & CLOSURE_GUARD_MASK); -- BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); -- -- if (!r) { -- if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { -- atomic_set(&cl->remaining, -- CLOSURE_REMAINING_INITIALIZER); -- closure_queue(cl); -- } else { -- struct closure *parent = cl->parent; -- closure_fn *destructor = cl->fn; -- -- closure_debug_destroy(cl); -- -- if (destructor) -- destructor(cl); -- -- if (parent) -- closure_put(parent); -- } -- } --} -- --/* For clearing flags with the same atomic op as a put */ --void closure_sub(struct closure *cl, int v) --{ -- closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); --} -- --/* -- * closure_put - decrement a closure's refcount -- */ --void closure_put(struct closure *cl) --{ -- closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); --} -- --/* -- * closure_wake_up - wake up all closures on a wait list, without memory barrier -- */ --void __closure_wake_up(struct closure_waitlist *wait_list) --{ -- struct llist_node *list; -- struct closure *cl, *t; -- struct llist_node *reverse = NULL; -- -- list = llist_del_all(&wait_list->list); -- -- /* We first reverse the list to preserve FIFO ordering and fairness */ -- reverse = llist_reverse_order(list); -- -- /* Then do the wakeups */ -- llist_for_each_entry_safe(cl, t, reverse, list) { -- closure_set_waiting(cl, 0); -- closure_sub(cl, CLOSURE_WAITING + 1); -- } --} -- --/** -- * closure_wait - add a closure to a waitlist -- * @waitlist: will own a ref on @cl, which will be released when -- * closure_wake_up() is called on @waitlist. -- * @cl: closure pointer. 
-- * -- */ --bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) --{ -- if (atomic_read(&cl->remaining) & CLOSURE_WAITING) -- return false; -- -- closure_set_waiting(cl, _RET_IP_); -- atomic_add(CLOSURE_WAITING + 1, &cl->remaining); -- llist_add(&cl->list, &waitlist->list); -- -- return true; --} -- --struct closure_syncer { -- struct task_struct *task; -- int done; --}; -- --static void closure_sync_fn(struct closure *cl) --{ -- struct closure_syncer *s = cl->s; -- struct task_struct *p; -- -- rcu_read_lock(); -- p = READ_ONCE(s->task); -- s->done = 1; -- wake_up_process(p); -- rcu_read_unlock(); --} -- --void __sched __closure_sync(struct closure *cl) --{ -- struct closure_syncer s = { .task = current }; -- -- cl->s = &s; -- continue_at(cl, closure_sync_fn, NULL); -- -- while (1) { -- set_current_state(TASK_UNINTERRUPTIBLE); -- if (s.done) -- break; -- schedule(); -- } -- -- __set_current_state(TASK_RUNNING); --} -- --#ifdef CONFIG_BCACHE_CLOSURES_DEBUG -- --static LIST_HEAD(closure_list); --static DEFINE_SPINLOCK(closure_list_lock); -- --void closure_debug_create(struct closure *cl) --{ -- unsigned long flags; -- -- BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE); -- cl->magic = CLOSURE_MAGIC_ALIVE; -- -- spin_lock_irqsave(&closure_list_lock, flags); -- list_add(&cl->all, &closure_list); -- spin_unlock_irqrestore(&closure_list_lock, flags); --} -- --void closure_debug_destroy(struct closure *cl) --{ -- unsigned long flags; -- -- BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE); -- cl->magic = CLOSURE_MAGIC_DEAD; -- -- spin_lock_irqsave(&closure_list_lock, flags); -- list_del(&cl->all); -- spin_unlock_irqrestore(&closure_list_lock, flags); --} -- --static struct dentry *closure_debug; -- --static int debug_seq_show(struct seq_file *f, void *data) --{ -- struct closure *cl; -- -- spin_lock_irq(&closure_list_lock); -- -- list_for_each_entry(cl, &closure_list, all) { -- int r = atomic_read(&cl->remaining); -- -- seq_printf(f, "%p: %pS -> %pS p %p r %i ", -- 
cl, (void *) cl->ip, cl->fn, cl->parent, -- r & CLOSURE_REMAINING_MASK); -- -- seq_printf(f, "%s%s\n", -- test_bit(WORK_STRUCT_PENDING_BIT, -- work_data_bits(&cl->work)) ? "Q" : "", -- r & CLOSURE_RUNNING ? "R" : ""); -- -- if (r & CLOSURE_WAITING) -- seq_printf(f, " W %pS\n", -- (void *) cl->waiting_on); -- -- seq_printf(f, "\n"); -- } -- -- spin_unlock_irq(&closure_list_lock); -- return 0; --} -- --static int debug_seq_open(struct inode *inode, struct file *file) --{ -- return single_open(file, debug_seq_show, NULL); --} -- --static const struct file_operations debug_ops = { -- .owner = THIS_MODULE, -- .open = debug_seq_open, -- .read = seq_read, -- .release = single_release --}; -- --void __init closure_debug_init(void) --{ -- if (!IS_ERR_OR_NULL(bcache_debug)) -- /* -- * it is unnecessary to check return value of -- * debugfs_create_file(), we should not care -- * about this. -- */ -- closure_debug = debugfs_create_file( -- "closures", 0400, bcache_debug, NULL, &debug_ops); --} --#endif -- --MODULE_AUTHOR("Kent Overstreet "); --MODULE_LICENSE("GPL"); -diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h -deleted file mode 100644 -index c88cdc4ae4ec..000000000000 ---- a/drivers/md/bcache/closure.h -+++ /dev/null -@@ -1,378 +0,0 @@ --/* SPDX-License-Identifier: GPL-2.0 */ --#ifndef _LINUX_CLOSURE_H --#define _LINUX_CLOSURE_H -- --#include --#include --#include --#include -- --/* -- * Closure is perhaps the most overused and abused term in computer science, but -- * since I've been unable to come up with anything better you're stuck with it -- * again. -- * -- * What are closures? -- * -- * They embed a refcount. The basic idea is they count "things that are in -- * progress" - in flight bios, some other thread that's doing something else - -- * anything you might want to wait on. -- * -- * The refcount may be manipulated with closure_get() and closure_put(). 
-- * closure_put() is where many of the interesting things happen, when it causes -- * the refcount to go to 0. -- * -- * Closures can be used to wait on things both synchronously and asynchronously, -- * and synchronous and asynchronous use can be mixed without restriction. To -- * wait synchronously, use closure_sync() - you will sleep until your closure's -- * refcount hits 1. -- * -- * To wait asynchronously, use -- * continue_at(cl, next_function, workqueue); -- * -- * passing it, as you might expect, the function to run when nothing is pending -- * and the workqueue to run that function out of. -- * -- * continue_at() also, critically, requires a 'return' immediately following the -- * location where this macro is referenced, to return to the calling function. -- * There's good reason for this. -- * -- * To use safely closures asynchronously, they must always have a refcount while -- * they are running owned by the thread that is running them. Otherwise, suppose -- * you submit some bios and wish to have a function run when they all complete: -- * -- * foo_endio(struct bio *bio) -- * { -- * closure_put(cl); -- * } -- * -- * closure_init(cl); -- * -- * do_stuff(); -- * closure_get(cl); -- * bio1->bi_endio = foo_endio; -- * bio_submit(bio1); -- * -- * do_more_stuff(); -- * closure_get(cl); -- * bio2->bi_endio = foo_endio; -- * bio_submit(bio2); -- * -- * continue_at(cl, complete_some_read, system_wq); -- * -- * If closure's refcount started at 0, complete_some_read() could run before the -- * second bio was submitted - which is almost always not what you want! More -- * importantly, it wouldn't be possible to say whether the original thread or -- * complete_some_read()'s thread owned the closure - and whatever state it was -- * associated with! -- * -- * So, closure_init() initializes a closure's refcount to 1 - and when a -- * closure_fn is run, the refcount will be reset to 1 first. 
-- * -- * Then, the rule is - if you got the refcount with closure_get(), release it -- * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount -- * on a closure because you called closure_init() or you were run out of a -- * closure - _always_ use continue_at(). Doing so consistently will help -- * eliminate an entire class of particularly pernicious races. -- * -- * Lastly, you might have a wait list dedicated to a specific event, and have no -- * need for specifying the condition - you just want to wait until someone runs -- * closure_wake_up() on the appropriate wait list. In that case, just use -- * closure_wait(). It will return either true or false, depending on whether the -- * closure was already on a wait list or not - a closure can only be on one wait -- * list at a time. -- * -- * Parents: -- * -- * closure_init() takes two arguments - it takes the closure to initialize, and -- * a (possibly null) parent. -- * -- * If parent is non null, the new closure will have a refcount for its lifetime; -- * a closure is considered to be "finished" when its refcount hits 0 and the -- * function to run is null. Hence -- * -- * continue_at(cl, NULL, NULL); -- * -- * returns up the (spaghetti) stack of closures, precisely like normal return -- * returns up the C stack. continue_at() with non null fn is better thought of -- * as doing a tail call. -- * -- * All this implies that a closure should typically be embedded in a particular -- * struct (which its refcount will normally control the lifetime of), and that -- * struct can very much be thought of as a stack frame. -- */ -- --struct closure; --struct closure_syncer; --typedef void (closure_fn) (struct closure *); --extern struct dentry *bcache_debug; -- --struct closure_waitlist { -- struct llist_head list; --}; -- --enum closure_state { -- /* -- * CLOSURE_WAITING: Set iff the closure is on a waitlist. 
Must be set by -- * the thread that owns the closure, and cleared by the thread that's -- * waking up the closure. -- * -- * The rest are for debugging and don't affect behaviour: -- * -- * CLOSURE_RUNNING: Set when a closure is running (i.e. by -- * closure_init() and when closure_put() runs then next function), and -- * must be cleared before remaining hits 0. Primarily to help guard -- * against incorrect usage and accidentally transferring references. -- * continue_at() and closure_return() clear it for you, if you're doing -- * something unusual you can use closure_set_dead() which also helps -- * annotate where references are being transferred. -- */ -- -- CLOSURE_BITS_START = (1U << 26), -- CLOSURE_DESTRUCTOR = (1U << 26), -- CLOSURE_WAITING = (1U << 28), -- CLOSURE_RUNNING = (1U << 30), --}; -- --#define CLOSURE_GUARD_MASK \ -- ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1) -- --#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) --#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) -- --struct closure { -- union { -- struct { -- struct workqueue_struct *wq; -- struct closure_syncer *s; -- struct llist_node list; -- closure_fn *fn; -- }; -- struct work_struct work; -- }; -- -- struct closure *parent; -- -- atomic_t remaining; -- --#ifdef CONFIG_BCACHE_CLOSURES_DEBUG --#define CLOSURE_MAGIC_DEAD 0xc054dead --#define CLOSURE_MAGIC_ALIVE 0xc054a11e -- -- unsigned int magic; -- struct list_head all; -- unsigned long ip; -- unsigned long waiting_on; --#endif --}; -- --void closure_sub(struct closure *cl, int v); --void closure_put(struct closure *cl); --void __closure_wake_up(struct closure_waitlist *list); --bool closure_wait(struct closure_waitlist *list, struct closure *cl); --void __closure_sync(struct closure *cl); -- --/** -- * closure_sync - sleep until a closure a closure has nothing left to wait on -- * -- * Sleeps until the refcount hits 1 - the thread that's running the closure owns -- * the last refcount. 
-- */ --static inline void closure_sync(struct closure *cl) --{ -- if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1) -- __closure_sync(cl); --} -- --#ifdef CONFIG_BCACHE_CLOSURES_DEBUG -- --void closure_debug_init(void); --void closure_debug_create(struct closure *cl); --void closure_debug_destroy(struct closure *cl); -- --#else -- --static inline void closure_debug_init(void) {} --static inline void closure_debug_create(struct closure *cl) {} --static inline void closure_debug_destroy(struct closure *cl) {} -- --#endif -- --static inline void closure_set_ip(struct closure *cl) --{ --#ifdef CONFIG_BCACHE_CLOSURES_DEBUG -- cl->ip = _THIS_IP_; --#endif --} -- --static inline void closure_set_ret_ip(struct closure *cl) --{ --#ifdef CONFIG_BCACHE_CLOSURES_DEBUG -- cl->ip = _RET_IP_; --#endif --} -- --static inline void closure_set_waiting(struct closure *cl, unsigned long f) --{ --#ifdef CONFIG_BCACHE_CLOSURES_DEBUG -- cl->waiting_on = f; --#endif --} -- --static inline void closure_set_stopped(struct closure *cl) --{ -- atomic_sub(CLOSURE_RUNNING, &cl->remaining); --} -- --static inline void set_closure_fn(struct closure *cl, closure_fn *fn, -- struct workqueue_struct *wq) --{ -- closure_set_ip(cl); -- cl->fn = fn; -- cl->wq = wq; -- /* between atomic_dec() in closure_put() */ -- smp_mb__before_atomic(); --} -- --static inline void closure_queue(struct closure *cl) --{ -- struct workqueue_struct *wq = cl->wq; -- /** -- * Changes made to closure, work_struct, or a couple of other structs -- * may cause work.func not pointing to the right location. 
-- */ -- BUILD_BUG_ON(offsetof(struct closure, fn) -- != offsetof(struct work_struct, func)); -- if (wq) { -- INIT_WORK(&cl->work, cl->work.func); -- BUG_ON(!queue_work(wq, &cl->work)); -- } else -- cl->fn(cl); --} -- --/** -- * closure_get - increment a closure's refcount -- */ --static inline void closure_get(struct closure *cl) --{ --#ifdef CONFIG_BCACHE_CLOSURES_DEBUG -- BUG_ON((atomic_inc_return(&cl->remaining) & -- CLOSURE_REMAINING_MASK) <= 1); --#else -- atomic_inc(&cl->remaining); --#endif --} -- --/** -- * closure_init - Initialize a closure, setting the refcount to 1 -- * @cl: closure to initialize -- * @parent: parent of the new closure. cl will take a refcount on it for its -- * lifetime; may be NULL. -- */ --static inline void closure_init(struct closure *cl, struct closure *parent) --{ -- memset(cl, 0, sizeof(struct closure)); -- cl->parent = parent; -- if (parent) -- closure_get(parent); -- -- atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); -- -- closure_debug_create(cl); -- closure_set_ip(cl); --} -- --static inline void closure_init_stack(struct closure *cl) --{ -- memset(cl, 0, sizeof(struct closure)); -- atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); --} -- --/** -- * closure_wake_up - wake up all closures on a wait list, -- * with memory barrier -- */ --static inline void closure_wake_up(struct closure_waitlist *list) --{ -- /* Memory barrier for the wait list */ -- smp_mb(); -- __closure_wake_up(list); --} -- --/** -- * continue_at - jump to another function with barrier -- * -- * After @cl is no longer waiting on anything (i.e. all outstanding refs have -- * been dropped with closure_put()), it will resume execution at @fn running out -- * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly). 
-- * -- * This is because after calling continue_at() you no longer have a ref on @cl, -- * and whatever @cl owns may be freed out from under you - a running closure fn -- * has a ref on its own closure which continue_at() drops. -- * -- * Note you are expected to immediately return after using this macro. -- */ --#define continue_at(_cl, _fn, _wq) \ --do { \ -- set_closure_fn(_cl, _fn, _wq); \ -- closure_sub(_cl, CLOSURE_RUNNING + 1); \ --} while (0) -- --/** -- * closure_return - finish execution of a closure -- * -- * This is used to indicate that @cl is finished: when all outstanding refs on -- * @cl have been dropped @cl's ref on its parent closure (as passed to -- * closure_init()) will be dropped, if one was specified - thus this can be -- * thought of as returning to the parent closure. -- */ --#define closure_return(_cl) continue_at((_cl), NULL, NULL) -- --/** -- * continue_at_nobarrier - jump to another function without barrier -- * -- * Causes @fn to be executed out of @cl, in @wq context (or called directly if -- * @wq is NULL). -- * -- * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn, -- * thus it's not safe to touch anything protected by @cl after a -- * continue_at_nobarrier(). -- */ --#define continue_at_nobarrier(_cl, _fn, _wq) \ --do { \ -- set_closure_fn(_cl, _fn, _wq); \ -- closure_queue(_cl); \ --} while (0) -- --/** -- * closure_return_with_destructor - finish execution of a closure, -- * with destructor -- * -- * Works like closure_return(), except @destructor will be called when all -- * outstanding refs on @cl have been dropped; @destructor may be used to safely -- * free the memory occupied by @cl, and it is called with the ref on the parent -- * closure still held - so @destructor could safely return an item to a -- * freelist protected by @cl's parent. 
-- */ --#define closure_return_with_destructor(_cl, _destructor) \ --do { \ -- set_closure_fn(_cl, _destructor, NULL); \ -- closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \ --} while (0) -- --/** -- * closure_call - execute @fn out of a new, uninitialized closure -- * -- * Typically used when running out of one closure, and we want to run @fn -- * asynchronously out of a new closure - @parent will then wait for @cl to -- * finish. -- */ --static inline void closure_call(struct closure *cl, closure_fn fn, -- struct workqueue_struct *wq, -- struct closure *parent) --{ -- closure_init(cl, parent); -- continue_at_nobarrier(cl, fn, wq); --} -- --#endif /* _LINUX_CLOSURE_H */ -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 2014016f9a60..331febeabade 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -2819,7 +2819,6 @@ static int __init bcache_init(void) - goto err; - - bch_debug_init(); -- closure_debug_init(); - - bcache_is_reboot = false; - -diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h -index c029f7443190..59093f9f1793 100644 ---- a/drivers/md/bcache/util.h -+++ b/drivers/md/bcache/util.h -@@ -4,6 +4,7 @@ - #define _BCACHE_UTIL_H - - #include -+#include - #include - #include - #include -@@ -13,8 +14,6 @@ - #include - #include - --#include "closure.h" -- - #define PAGE_SECTORS (PAGE_SIZE / 512) - - struct closure; -diff --git a/fs/Kconfig b/fs/Kconfig -index a88aa3af73c1..18e1627b95f9 100644 ---- a/fs/Kconfig -+++ b/fs/Kconfig -@@ -40,6 +40,7 @@ source "fs/ocfs2/Kconfig" - source "fs/btrfs/Kconfig" - source "fs/nilfs2/Kconfig" - source "fs/f2fs/Kconfig" -+source "fs/bcachefs/Kconfig" - source "fs/zonefs/Kconfig" - - config FS_DAX -diff --git a/fs/Makefile b/fs/Makefile -index 2ce5112b02c8..8e926e6bf48f 100644 ---- a/fs/Makefile -+++ b/fs/Makefile -@@ -130,6 +130,7 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/ - obj-$(CONFIG_BTRFS_FS) += btrfs/ - obj-$(CONFIG_GFS2_FS) += gfs2/ - 
obj-$(CONFIG_F2FS_FS) += f2fs/ -+obj-$(CONFIG_BCACHEFS_FS) += bcachefs/ - obj-$(CONFIG_CEPH_FS) += ceph/ - obj-$(CONFIG_PSTORE) += pstore/ - obj-$(CONFIG_EFIVAR_FS) += efivarfs/ -diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig -new file mode 100644 -index 000000000000..10abddae6a80 ---- /dev/null -+++ b/fs/bcachefs/Kconfig -@@ -0,0 +1,50 @@ -+ -+config BCACHEFS_FS -+ tristate "bcachefs filesystem support" -+ depends on BLOCK -+ select EXPORTFS -+ select CLOSURES -+ select LIBCRC32C -+ select CRC64 -+ select FS_POSIX_ACL -+ select LZ4_COMPRESS -+ select LZ4_DECOMPRESS -+ select ZLIB_DEFLATE -+ select ZLIB_INFLATE -+ select ZSTD_COMPRESS -+ select ZSTD_DECOMPRESS -+ select CRYPTO_SHA256 -+ select CRYPTO_CHACHA20 -+ select CRYPTO_POLY1305 -+ select KEYS -+ select SIXLOCKS -+ select RAID6_PQ -+ select XOR_BLOCKS -+ ---help--- -+ The bcachefs filesystem - a modern, copy on write filesystem, with -+ support for multiple devices, compression, checksumming, etc. -+ -+config BCACHEFS_QUOTA -+ bool "bcachefs quota support" -+ depends on BCACHEFS_FS -+ select QUOTACTL -+ -+config BCACHEFS_POSIX_ACL -+ bool "bcachefs POSIX ACL support" -+ depends on BCACHEFS_FS -+ select FS_POSIX_ACL -+ -+config BCACHEFS_DEBUG -+ bool "bcachefs debugging" -+ depends on BCACHEFS_FS -+ ---help--- -+ Enables many extra debugging checks and assertions. -+ -+ The resulting code will be significantly slower than normal; you -+ probably shouldn't select this option unless you're a developer. 
-+ -+config BCACHEFS_TESTS -+ bool "bcachefs unit and performance tests" -+ depends on BCACHEFS_FS -+ ---help--- -+ Include some unit and performance tests for the core btree code -diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile -new file mode 100644 -index 000000000000..d85ced62c0dd ---- /dev/null -+++ b/fs/bcachefs/Makefile -@@ -0,0 +1,59 @@ -+ -+obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o -+ -+bcachefs-y := \ -+ acl.o \ -+ alloc_background.o \ -+ alloc_foreground.o \ -+ bkey.o \ -+ bkey_methods.o \ -+ bkey_sort.o \ -+ bset.o \ -+ btree_cache.o \ -+ btree_gc.o \ -+ btree_io.o \ -+ btree_iter.o \ -+ btree_key_cache.o \ -+ btree_update_interior.o \ -+ btree_update_leaf.o \ -+ buckets.o \ -+ chardev.o \ -+ checksum.o \ -+ clock.o \ -+ compress.o \ -+ debug.o \ -+ dirent.o \ -+ disk_groups.o \ -+ ec.o \ -+ error.o \ -+ extents.o \ -+ extent_update.o \ -+ fs.o \ -+ fs-common.o \ -+ fs-ioctl.o \ -+ fs-io.o \ -+ fsck.o \ -+ inode.o \ -+ io.o \ -+ journal.o \ -+ journal_io.o \ -+ journal_reclaim.o \ -+ journal_seq_blacklist.o \ -+ keylist.o \ -+ migrate.o \ -+ move.o \ -+ movinggc.o \ -+ opts.o \ -+ quota.o \ -+ rebalance.o \ -+ recovery.o \ -+ reflink.o \ -+ replicas.o \ -+ siphash.o \ -+ super.o \ -+ super-io.o \ -+ sysfs.o \ -+ tests.o \ -+ trace.o \ -+ util.o \ -+ xattr.o -diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c -new file mode 100644 -index 000000000000..76c98ddbf628 ---- /dev/null -+++ b/fs/bcachefs/acl.c -@@ -0,0 +1,388 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#ifdef CONFIG_BCACHEFS_POSIX_ACL -+ -+#include "bcachefs.h" -+ -+#include -+#include -+#include -+#include -+#include -+ -+#include "acl.h" -+#include "fs.h" -+#include "xattr.h" -+ -+static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long) -+{ -+ return sizeof(bch_acl_header) + -+ sizeof(bch_acl_entry_short) * nr_short + -+ sizeof(bch_acl_entry) * nr_long; -+} -+ -+static inline int acl_to_xattr_type(int type) -+{ -+ switch (type) { -+ case ACL_TYPE_ACCESS: -+ return 
KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS; -+ case ACL_TYPE_DEFAULT: -+ return KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT; -+ default: -+ BUG(); -+ } -+} -+ -+/* -+ * Convert from filesystem to in-memory representation. -+ */ -+static struct posix_acl *bch2_acl_from_disk(const void *value, size_t size) -+{ -+ const void *p, *end = value + size; -+ struct posix_acl *acl; -+ struct posix_acl_entry *out; -+ unsigned count = 0; -+ -+ if (!value) -+ return NULL; -+ if (size < sizeof(bch_acl_header)) -+ goto invalid; -+ if (((bch_acl_header *)value)->a_version != -+ cpu_to_le32(BCH_ACL_VERSION)) -+ goto invalid; -+ -+ p = value + sizeof(bch_acl_header); -+ while (p < end) { -+ const bch_acl_entry *entry = p; -+ -+ if (p + sizeof(bch_acl_entry_short) > end) -+ goto invalid; -+ -+ switch (le16_to_cpu(entry->e_tag)) { -+ case ACL_USER_OBJ: -+ case ACL_GROUP_OBJ: -+ case ACL_MASK: -+ case ACL_OTHER: -+ p += sizeof(bch_acl_entry_short); -+ break; -+ case ACL_USER: -+ case ACL_GROUP: -+ p += sizeof(bch_acl_entry); -+ break; -+ default: -+ goto invalid; -+ } -+ -+ count++; -+ } -+ -+ if (p > end) -+ goto invalid; -+ -+ if (!count) -+ return NULL; -+ -+ acl = posix_acl_alloc(count, GFP_KERNEL); -+ if (!acl) -+ return ERR_PTR(-ENOMEM); -+ -+ out = acl->a_entries; -+ -+ p = value + sizeof(bch_acl_header); -+ while (p < end) { -+ const bch_acl_entry *in = p; -+ -+ out->e_tag = le16_to_cpu(in->e_tag); -+ out->e_perm = le16_to_cpu(in->e_perm); -+ -+ switch (out->e_tag) { -+ case ACL_USER_OBJ: -+ case ACL_GROUP_OBJ: -+ case ACL_MASK: -+ case ACL_OTHER: -+ p += sizeof(bch_acl_entry_short); -+ break; -+ case ACL_USER: -+ out->e_uid = make_kuid(&init_user_ns, -+ le32_to_cpu(in->e_id)); -+ p += sizeof(bch_acl_entry); -+ break; -+ case ACL_GROUP: -+ out->e_gid = make_kgid(&init_user_ns, -+ le32_to_cpu(in->e_id)); -+ p += sizeof(bch_acl_entry); -+ break; -+ } -+ -+ out++; -+ } -+ -+ BUG_ON(out != acl->a_entries + acl->a_count); -+ -+ return acl; -+invalid: -+ pr_err("invalid acl entry"); -+ return 
ERR_PTR(-EINVAL); -+} -+ -+#define acl_for_each_entry(acl, acl_e) \ -+ for (acl_e = acl->a_entries; \ -+ acl_e < acl->a_entries + acl->a_count; \ -+ acl_e++) -+ -+/* -+ * Convert from in-memory to filesystem representation. -+ */ -+static struct bkey_i_xattr * -+bch2_acl_to_xattr(struct btree_trans *trans, -+ const struct posix_acl *acl, -+ int type) -+{ -+ struct bkey_i_xattr *xattr; -+ bch_acl_header *acl_header; -+ const struct posix_acl_entry *acl_e; -+ void *outptr; -+ unsigned nr_short = 0, nr_long = 0, acl_len, u64s; -+ -+ acl_for_each_entry(acl, acl_e) { -+ switch (acl_e->e_tag) { -+ case ACL_USER: -+ case ACL_GROUP: -+ nr_long++; -+ break; -+ case ACL_USER_OBJ: -+ case ACL_GROUP_OBJ: -+ case ACL_MASK: -+ case ACL_OTHER: -+ nr_short++; -+ break; -+ default: -+ return ERR_PTR(-EINVAL); -+ } -+ } -+ -+ acl_len = bch2_acl_size(nr_short, nr_long); -+ u64s = BKEY_U64s + xattr_val_u64s(0, acl_len); -+ -+ if (u64s > U8_MAX) -+ return ERR_PTR(-E2BIG); -+ -+ xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); -+ if (IS_ERR(xattr)) -+ return xattr; -+ -+ bkey_xattr_init(&xattr->k_i); -+ xattr->k.u64s = u64s; -+ xattr->v.x_type = acl_to_xattr_type(type); -+ xattr->v.x_name_len = 0, -+ xattr->v.x_val_len = cpu_to_le16(acl_len); -+ -+ acl_header = xattr_val(&xattr->v); -+ acl_header->a_version = cpu_to_le32(BCH_ACL_VERSION); -+ -+ outptr = (void *) acl_header + sizeof(*acl_header); -+ -+ acl_for_each_entry(acl, acl_e) { -+ bch_acl_entry *entry = outptr; -+ -+ entry->e_tag = cpu_to_le16(acl_e->e_tag); -+ entry->e_perm = cpu_to_le16(acl_e->e_perm); -+ switch (acl_e->e_tag) { -+ case ACL_USER: -+ entry->e_id = cpu_to_le32( -+ from_kuid(&init_user_ns, acl_e->e_uid)); -+ outptr += sizeof(bch_acl_entry); -+ break; -+ case ACL_GROUP: -+ entry->e_id = cpu_to_le32( -+ from_kgid(&init_user_ns, acl_e->e_gid)); -+ outptr += sizeof(bch_acl_entry); -+ break; -+ -+ case ACL_USER_OBJ: -+ case ACL_GROUP_OBJ: -+ case ACL_MASK: -+ case ACL_OTHER: -+ outptr += 
sizeof(bch_acl_entry_short); -+ break; -+ } -+ } -+ -+ BUG_ON(outptr != xattr_val(&xattr->v) + acl_len); -+ -+ return xattr; -+} -+ -+struct posix_acl *bch2_get_acl(struct inode *vinode, int type) -+{ -+ struct bch_inode_info *inode = to_bch_ei(vinode); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c_xattr xattr; -+ struct posix_acl *acl = NULL; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ -+ iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, -+ &inode->ei_str_hash, inode->v.i_ino, -+ &X_SEARCH(acl_to_xattr_type(type), "", 0), -+ 0); -+ if (IS_ERR(iter)) { -+ if (PTR_ERR(iter) == -EINTR) -+ goto retry; -+ -+ if (PTR_ERR(iter) != -ENOENT) -+ acl = ERR_CAST(iter); -+ goto out; -+ } -+ -+ xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); -+ -+ acl = bch2_acl_from_disk(xattr_val(xattr.v), -+ le16_to_cpu(xattr.v->x_val_len)); -+ -+ if (!IS_ERR(acl)) -+ set_cached_acl(&inode->v, type, acl); -+out: -+ bch2_trans_exit(&trans); -+ return acl; -+} -+ -+int bch2_set_acl_trans(struct btree_trans *trans, -+ struct bch_inode_unpacked *inode_u, -+ const struct bch_hash_info *hash_info, -+ struct posix_acl *acl, int type) -+{ -+ int ret; -+ -+ if (type == ACL_TYPE_DEFAULT && -+ !S_ISDIR(inode_u->bi_mode)) -+ return acl ? -EACCES : 0; -+ -+ if (acl) { -+ struct bkey_i_xattr *xattr = -+ bch2_acl_to_xattr(trans, acl, type); -+ if (IS_ERR(xattr)) -+ return PTR_ERR(xattr); -+ -+ ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, -+ inode_u->bi_inum, &xattr->k_i, 0); -+ } else { -+ struct xattr_search_key search = -+ X_SEARCH(acl_to_xattr_type(type), "", 0); -+ -+ ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, hash_info, -+ inode_u->bi_inum, &search); -+ } -+ -+ return ret == -ENOENT ? 
0 : ret; -+} -+ -+int bch2_set_acl(struct inode *vinode, struct posix_acl *_acl, int type) -+{ -+ struct bch_inode_info *inode = to_bch_ei(vinode); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct btree_trans trans; -+ struct btree_iter *inode_iter; -+ struct bch_inode_unpacked inode_u; -+ struct posix_acl *acl; -+ umode_t mode; -+ int ret; -+ -+ mutex_lock(&inode->ei_update_lock); -+ bch2_trans_init(&trans, c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ acl = _acl; -+ -+ inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, -+ BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(inode_iter); -+ if (ret) -+ goto btree_err; -+ -+ mode = inode_u.bi_mode; -+ -+ if (type == ACL_TYPE_ACCESS) { -+ ret = posix_acl_update_mode(&inode->v, &mode, &acl); -+ if (ret) -+ goto err; -+ } -+ -+ ret = bch2_set_acl_trans(&trans, &inode_u, -+ &inode->ei_str_hash, -+ acl, type); -+ if (ret) -+ goto btree_err; -+ -+ inode_u.bi_ctime = bch2_current_time(c); -+ inode_u.bi_mode = mode; -+ -+ ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?: -+ bch2_trans_commit(&trans, NULL, -+ &inode->ei_journal_seq, -+ BTREE_INSERT_NOUNLOCK); -+btree_err: -+ if (ret == -EINTR) -+ goto retry; -+ if (unlikely(ret)) -+ goto err; -+ -+ bch2_inode_update_after_write(c, inode, &inode_u, -+ ATTR_CTIME|ATTR_MODE); -+ -+ set_cached_acl(&inode->v, type, acl); -+err: -+ bch2_trans_exit(&trans); -+ mutex_unlock(&inode->ei_update_lock); -+ -+ return ret; -+} -+ -+int bch2_acl_chmod(struct btree_trans *trans, -+ struct bch_inode_info *inode, -+ umode_t mode, -+ struct posix_acl **new_acl) -+{ -+ struct btree_iter *iter; -+ struct bkey_s_c_xattr xattr; -+ struct bkey_i_xattr *new; -+ struct posix_acl *acl; -+ int ret = 0; -+ -+ iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc, -+ &inode->ei_str_hash, inode->v.i_ino, -+ &X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0), -+ BTREE_ITER_INTENT); -+ if (IS_ERR(iter)) -+ return PTR_ERR(iter) != -ENOENT ? 
PTR_ERR(iter) : 0; -+ -+ xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); -+ -+ acl = bch2_acl_from_disk(xattr_val(xattr.v), -+ le16_to_cpu(xattr.v->x_val_len)); -+ if (IS_ERR_OR_NULL(acl)) -+ return PTR_ERR(acl); -+ -+ ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode); -+ if (ret) -+ goto err; -+ -+ new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS); -+ if (IS_ERR(new)) { -+ ret = PTR_ERR(new); -+ goto err; -+ } -+ -+ new->k.p = iter->pos; -+ bch2_trans_update(trans, iter, &new->k_i, 0); -+ *new_acl = acl; -+ acl = NULL; -+err: -+ kfree(acl); -+ return ret; -+} -+ -+#endif /* CONFIG_BCACHEFS_POSIX_ACL */ -diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h -new file mode 100644 -index 000000000000..cb62d502a7ff ---- /dev/null -+++ b/fs/bcachefs/acl.h -@@ -0,0 +1,59 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_ACL_H -+#define _BCACHEFS_ACL_H -+ -+struct bch_inode_unpacked; -+struct bch_hash_info; -+struct bch_inode_info; -+struct posix_acl; -+ -+#ifdef CONFIG_BCACHEFS_POSIX_ACL -+ -+#define BCH_ACL_VERSION 0x0001 -+ -+typedef struct { -+ __le16 e_tag; -+ __le16 e_perm; -+ __le32 e_id; -+} bch_acl_entry; -+ -+typedef struct { -+ __le16 e_tag; -+ __le16 e_perm; -+} bch_acl_entry_short; -+ -+typedef struct { -+ __le32 a_version; -+} bch_acl_header; -+ -+struct posix_acl *bch2_get_acl(struct inode *, int); -+ -+int bch2_set_acl_trans(struct btree_trans *, -+ struct bch_inode_unpacked *, -+ const struct bch_hash_info *, -+ struct posix_acl *, int); -+int bch2_set_acl(struct inode *, struct posix_acl *, int); -+int bch2_acl_chmod(struct btree_trans *, struct bch_inode_info *, -+ umode_t, struct posix_acl **); -+ -+#else -+ -+static inline int bch2_set_acl_trans(struct btree_trans *trans, -+ struct bch_inode_unpacked *inode_u, -+ const struct bch_hash_info *hash_info, -+ struct posix_acl *acl, int type) -+{ -+ return 0; -+} -+ -+static inline int bch2_acl_chmod(struct btree_trans *trans, -+ struct bch_inode_info *inode, -+ umode_t mode, -+ 
struct posix_acl **new_acl) -+{ -+ return 0; -+} -+ -+#endif /* CONFIG_BCACHEFS_POSIX_ACL */ -+ -+#endif /* _BCACHEFS_ACL_H */ -diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c -new file mode 100644 -index 000000000000..9aa0b42b26b6 ---- /dev/null -+++ b/fs/bcachefs/alloc_background.c -@@ -0,0 +1,1436 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "alloc_background.h" -+#include "alloc_foreground.h" -+#include "btree_cache.h" -+#include "btree_io.h" -+#include "btree_key_cache.h" -+#include "btree_update.h" -+#include "btree_update_interior.h" -+#include "btree_gc.h" -+#include "buckets.h" -+#include "clock.h" -+#include "debug.h" -+#include "ec.h" -+#include "error.h" -+#include "recovery.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+static const char * const bch2_alloc_field_names[] = { -+#define x(name, bytes) #name, -+ BCH_ALLOC_FIELDS() -+#undef x -+ NULL -+}; -+ -+static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int); -+ -+/* Ratelimiting/PD controllers */ -+ -+static void pd_controllers_update(struct work_struct *work) -+{ -+ struct bch_fs *c = container_of(to_delayed_work(work), -+ struct bch_fs, -+ pd_controllers_update); -+ struct bch_dev *ca; -+ s64 free = 0, fragmented = 0; -+ unsigned i; -+ -+ for_each_member_device(ca, c, i) { -+ struct bch_dev_usage stats = bch2_dev_usage_read(ca); -+ -+ free += bucket_to_sector(ca, -+ __dev_buckets_free(ca, stats)) << 9; -+ /* -+ * Bytes of internal fragmentation, which can be -+ * reclaimed by copy GC -+ */ -+ fragmented += max_t(s64, 0, (bucket_to_sector(ca, -+ stats.buckets[BCH_DATA_user] + -+ stats.buckets[BCH_DATA_cached]) - -+ (stats.sectors[BCH_DATA_user] + -+ stats.sectors[BCH_DATA_cached])) << 9); -+ } -+ -+ bch2_pd_controller_update(&c->copygc_pd, free, fragmented, -1); -+ schedule_delayed_work(&c->pd_controllers_update, -+ c->pd_controllers_update_seconds * HZ); -+} -+ -+/* 
Persistent alloc info: */ -+ -+static inline u64 get_alloc_field(const struct bch_alloc *a, -+ const void **p, unsigned field) -+{ -+ unsigned bytes = BCH_ALLOC_FIELD_BYTES[field]; -+ u64 v; -+ -+ if (!(a->fields & (1 << field))) -+ return 0; -+ -+ switch (bytes) { -+ case 1: -+ v = *((const u8 *) *p); -+ break; -+ case 2: -+ v = le16_to_cpup(*p); -+ break; -+ case 4: -+ v = le32_to_cpup(*p); -+ break; -+ case 8: -+ v = le64_to_cpup(*p); -+ break; -+ default: -+ BUG(); -+ } -+ -+ *p += bytes; -+ return v; -+} -+ -+static inline void put_alloc_field(struct bkey_i_alloc *a, void **p, -+ unsigned field, u64 v) -+{ -+ unsigned bytes = BCH_ALLOC_FIELD_BYTES[field]; -+ -+ if (!v) -+ return; -+ -+ a->v.fields |= 1 << field; -+ -+ switch (bytes) { -+ case 1: -+ *((u8 *) *p) = v; -+ break; -+ case 2: -+ *((__le16 *) *p) = cpu_to_le16(v); -+ break; -+ case 4: -+ *((__le32 *) *p) = cpu_to_le32(v); -+ break; -+ case 8: -+ *((__le64 *) *p) = cpu_to_le64(v); -+ break; -+ default: -+ BUG(); -+ } -+ -+ *p += bytes; -+} -+ -+struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) -+{ -+ struct bkey_alloc_unpacked ret = { .gen = 0 }; -+ -+ if (k.k->type == KEY_TYPE_alloc) { -+ const struct bch_alloc *a = bkey_s_c_to_alloc(k).v; -+ const void *d = a->data; -+ unsigned idx = 0; -+ -+ ret.gen = a->gen; -+ -+#define x(_name, _bits) ret._name = get_alloc_field(a, &d, idx++); -+ BCH_ALLOC_FIELDS() -+#undef x -+ } -+ return ret; -+} -+ -+void bch2_alloc_pack(struct bkey_i_alloc *dst, -+ const struct bkey_alloc_unpacked src) -+{ -+ unsigned idx = 0; -+ void *d = dst->v.data; -+ unsigned bytes; -+ -+ dst->v.fields = 0; -+ dst->v.gen = src.gen; -+ -+#define x(_name, _bits) put_alloc_field(dst, &d, idx++, src._name); -+ BCH_ALLOC_FIELDS() -+#undef x -+ -+ bytes = (void *) d - (void *) &dst->v; -+ set_bkey_val_bytes(&dst->k, bytes); -+ memset_u64s_tail(&dst->v, 0, bytes); -+} -+ -+static unsigned bch_alloc_val_u64s(const struct bch_alloc *a) -+{ -+ unsigned i, bytes = offsetof(struct 
bch_alloc, data); -+ -+ for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_FIELD_BYTES); i++) -+ if (a->fields & (1 << i)) -+ bytes += BCH_ALLOC_FIELD_BYTES[i]; -+ -+ return DIV_ROUND_UP(bytes, sizeof(u64)); -+} -+ -+const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); -+ -+ if (k.k->p.inode >= c->sb.nr_devices || -+ !c->devs[k.k->p.inode]) -+ return "invalid device"; -+ -+ /* allow for unknown fields */ -+ if (bkey_val_u64s(a.k) < bch_alloc_val_u64s(a.v)) -+ return "incorrect value size"; -+ -+ return NULL; -+} -+ -+void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); -+ const void *d = a.v->data; -+ unsigned i; -+ -+ pr_buf(out, "gen %u", a.v->gen); -+ -+ for (i = 0; i < BCH_ALLOC_FIELD_NR; i++) -+ if (a.v->fields & (1 << i)) -+ pr_buf(out, " %s %llu", -+ bch2_alloc_field_names[i], -+ get_alloc_field(a.v, &d, i)); -+} -+ -+static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id, -+ unsigned level, struct bkey_s_c k) -+{ -+ if (!level) -+ bch2_mark_key(c, k, 0, 0, NULL, 0, -+ BTREE_TRIGGER_ALLOC_READ| -+ BTREE_TRIGGER_NOATOMIC); -+ -+ return 0; -+} -+ -+int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) -+{ -+ struct bch_dev *ca; -+ unsigned i; -+ int ret = 0; -+ -+ ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_ALLOC, -+ NULL, bch2_alloc_read_fn); -+ if (ret) { -+ bch_err(c, "error reading alloc info: %i", ret); -+ return ret; -+ } -+ -+ percpu_down_write(&c->mark_lock); -+ bch2_dev_usage_from_buckets(c); -+ percpu_up_write(&c->mark_lock); -+ -+ mutex_lock(&c->bucket_clock[READ].lock); -+ for_each_member_device(ca, c, i) { -+ down_read(&ca->bucket_lock); -+ bch2_recalc_oldest_io(c, ca, READ); -+ up_read(&ca->bucket_lock); -+ } -+ mutex_unlock(&c->bucket_clock[READ].lock); -+ -+ mutex_lock(&c->bucket_clock[WRITE].lock); -+ for_each_member_device(ca, c, i) { -+ 
down_read(&ca->bucket_lock); -+ bch2_recalc_oldest_io(c, ca, WRITE); -+ up_read(&ca->bucket_lock); -+ } -+ mutex_unlock(&c->bucket_clock[WRITE].lock); -+ -+ return 0; -+} -+ -+enum alloc_write_ret { -+ ALLOC_WROTE, -+ ALLOC_NOWROTE, -+ ALLOC_END, -+}; -+ -+static int bch2_alloc_write_key(struct btree_trans *trans, -+ struct btree_iter *iter, -+ unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_s_c k; -+ struct bch_dev *ca; -+ struct bucket_array *ba; -+ struct bucket *g; -+ struct bucket_mark m; -+ struct bkey_alloc_unpacked old_u, new_u; -+ __BKEY_PADDED(k, 8) alloc_key; /* hack: */ -+ struct bkey_i_alloc *a; -+ int ret; -+retry: -+ bch2_trans_begin(trans); -+ -+ ret = bch2_btree_key_cache_flush(trans, -+ BTREE_ID_ALLOC, iter->pos); -+ if (ret) -+ goto err; -+ -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ old_u = bch2_alloc_unpack(k); -+ -+ if (iter->pos.inode >= c->sb.nr_devices || -+ !c->devs[iter->pos.inode]) -+ return ALLOC_END; -+ -+ percpu_down_read(&c->mark_lock); -+ ca = bch_dev_bkey_exists(c, iter->pos.inode); -+ ba = bucket_array(ca); -+ -+ if (iter->pos.offset >= ba->nbuckets) { -+ percpu_up_read(&c->mark_lock); -+ return ALLOC_END; -+ } -+ -+ g = &ba->b[iter->pos.offset]; -+ m = READ_ONCE(g->mark); -+ new_u = alloc_mem_to_key(g, m); -+ percpu_up_read(&c->mark_lock); -+ -+ if (!bkey_alloc_unpacked_cmp(old_u, new_u)) -+ return ALLOC_NOWROTE; -+ -+ a = bkey_alloc_init(&alloc_key.k); -+ a->k.p = iter->pos; -+ bch2_alloc_pack(a, new_u); -+ -+ bch2_trans_update(trans, iter, &a->k_i, -+ BTREE_TRIGGER_NORUN); -+ ret = bch2_trans_commit(trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_USE_RESERVE| -+ flags); -+err: -+ if (ret == -EINTR) -+ goto retry; -+ return ret; -+} -+ -+int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bch_dev *ca; -+ unsigned i; -+ int ret = 0; -+ -+ 
BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS_MIN, -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); -+ -+ for_each_rw_member(ca, c, i) { -+ unsigned first_bucket; -+ -+ percpu_down_read(&c->mark_lock); -+ first_bucket = bucket_array(ca)->first_bucket; -+ percpu_up_read(&c->mark_lock); -+ -+ bch2_btree_iter_set_pos(iter, POS(i, first_bucket)); -+ -+ while (1) { -+ bch2_trans_cond_resched(&trans); -+ -+ ret = bch2_alloc_write_key(&trans, iter, flags); -+ if (ret < 0 || ret == ALLOC_END) -+ break; -+ if (ret == ALLOC_WROTE) -+ *wrote = true; -+ bch2_btree_iter_next_slot(iter); -+ } -+ -+ if (ret < 0) { -+ percpu_ref_put(&ca->io_ref); -+ break; -+ } -+ } -+ -+ bch2_trans_exit(&trans); -+ -+ return ret < 0 ? ret : 0; -+} -+ -+/* Bucket IO clocks: */ -+ -+static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw) -+{ -+ struct bucket_clock *clock = &c->bucket_clock[rw]; -+ struct bucket_array *buckets = bucket_array(ca); -+ struct bucket *g; -+ u16 max_last_io = 0; -+ unsigned i; -+ -+ lockdep_assert_held(&c->bucket_clock[rw].lock); -+ -+ /* Recalculate max_last_io for this device: */ -+ for_each_bucket(g, buckets) -+ max_last_io = max(max_last_io, bucket_last_io(c, g, rw)); -+ -+ ca->max_last_bucket_io[rw] = max_last_io; -+ -+ /* Recalculate global max_last_io: */ -+ max_last_io = 0; -+ -+ for_each_member_device(ca, c, i) -+ max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]); -+ -+ clock->max_last_io = max_last_io; -+} -+ -+static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw) -+{ -+ struct bucket_clock *clock = &c->bucket_clock[rw]; -+ struct bucket_array *buckets; -+ struct bch_dev *ca; -+ struct bucket *g; -+ unsigned i; -+ -+ trace_rescale_prios(c); -+ -+ for_each_member_device(ca, c, i) { -+ down_read(&ca->bucket_lock); -+ buckets = bucket_array(ca); -+ -+ for_each_bucket(g, buckets) -+ g->io_time[rw] = clock->hand - -+ 
bucket_last_io(c, g, rw) / 2; -+ -+ bch2_recalc_oldest_io(c, ca, rw); -+ -+ up_read(&ca->bucket_lock); -+ } -+} -+ -+static inline u64 bucket_clock_freq(u64 capacity) -+{ -+ return max(capacity >> 10, 2028ULL); -+} -+ -+static void bch2_inc_clock_hand(struct io_timer *timer) -+{ -+ struct bucket_clock *clock = container_of(timer, -+ struct bucket_clock, rescale); -+ struct bch_fs *c = container_of(clock, -+ struct bch_fs, bucket_clock[clock->rw]); -+ struct bch_dev *ca; -+ u64 capacity; -+ unsigned i; -+ -+ mutex_lock(&clock->lock); -+ -+ /* if clock cannot be advanced more, rescale prio */ -+ if (clock->max_last_io >= U16_MAX - 2) -+ bch2_rescale_bucket_io_times(c, clock->rw); -+ -+ BUG_ON(clock->max_last_io >= U16_MAX - 2); -+ -+ for_each_member_device(ca, c, i) -+ ca->max_last_bucket_io[clock->rw]++; -+ clock->max_last_io++; -+ clock->hand++; -+ -+ mutex_unlock(&clock->lock); -+ -+ capacity = READ_ONCE(c->capacity); -+ -+ if (!capacity) -+ return; -+ -+ /* -+ * we only increment when 0.1% of the filesystem capacity has been read -+ * or written too, this determines if it's time -+ * -+ * XXX: we shouldn't really be going off of the capacity of devices in -+ * RW mode (that will be 0 when we're RO, yet we can still service -+ * reads) -+ */ -+ timer->expire += bucket_clock_freq(capacity); -+ -+ bch2_io_timer_add(&c->io_clock[clock->rw], timer); -+} -+ -+static void bch2_bucket_clock_init(struct bch_fs *c, int rw) -+{ -+ struct bucket_clock *clock = &c->bucket_clock[rw]; -+ -+ clock->hand = 1; -+ clock->rw = rw; -+ clock->rescale.fn = bch2_inc_clock_hand; -+ clock->rescale.expire = bucket_clock_freq(c->capacity); -+ mutex_init(&clock->lock); -+} -+ -+/* Background allocator thread: */ -+ -+/* -+ * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens -+ * (marking them as invalidated on disk), then optionally issues discard -+ * commands to the newly free buckets, then puts them on the various freelists. 
-+ */ -+ -+#define BUCKET_GC_GEN_MAX 96U -+ -+/** -+ * wait_buckets_available - wait on reclaimable buckets -+ * -+ * If there aren't enough available buckets to fill up free_inc, wait until -+ * there are. -+ */ -+static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) -+{ -+ unsigned long gc_count = c->gc_count; -+ u64 available; -+ int ret = 0; -+ -+ ca->allocator_state = ALLOCATOR_BLOCKED; -+ closure_wake_up(&c->freelist_wait); -+ -+ while (1) { -+ set_current_state(TASK_INTERRUPTIBLE); -+ if (kthread_should_stop()) { -+ ret = 1; -+ break; -+ } -+ -+ if (gc_count != c->gc_count) -+ ca->inc_gen_really_needs_gc = 0; -+ -+ available = max_t(s64, 0, dev_buckets_available(ca) - -+ ca->inc_gen_really_needs_gc); -+ -+ if (available > fifo_free(&ca->free_inc) || -+ (available && -+ (!fifo_full(&ca->free[RESERVE_BTREE]) || -+ !fifo_full(&ca->free[RESERVE_MOVINGGC])))) -+ break; -+ -+ up_read(&c->gc_lock); -+ schedule(); -+ try_to_freeze(); -+ down_read(&c->gc_lock); -+ } -+ -+ __set_current_state(TASK_RUNNING); -+ ca->allocator_state = ALLOCATOR_RUNNING; -+ closure_wake_up(&c->freelist_wait); -+ -+ return ret; -+} -+ -+static bool bch2_can_invalidate_bucket(struct bch_dev *ca, -+ size_t bucket, -+ struct bucket_mark mark) -+{ -+ u8 gc_gen; -+ -+ if (!is_available_bucket(mark)) -+ return false; -+ -+ if (ca->buckets_nouse && -+ test_bit(bucket, ca->buckets_nouse)) -+ return false; -+ -+ gc_gen = bucket_gc_gen(ca, bucket); -+ -+ if (gc_gen >= BUCKET_GC_GEN_MAX / 2) -+ ca->inc_gen_needs_gc++; -+ -+ if (gc_gen >= BUCKET_GC_GEN_MAX) -+ ca->inc_gen_really_needs_gc++; -+ -+ return gc_gen < BUCKET_GC_GEN_MAX; -+} -+ -+/* -+ * Determines what order we're going to reuse buckets, smallest bucket_key() -+ * first. -+ * -+ * -+ * - We take into account the read prio of the bucket, which gives us an -+ * indication of how hot the data is -- we scale the prio so that the prio -+ * farthest from the clock is worth 1/8th of the closest. 
-+ * -+ * - The number of sectors of cached data in the bucket, which gives us an -+ * indication of the cost in cache misses this eviction will cause. -+ * -+ * - If hotness * sectors used compares equal, we pick the bucket with the -+ * smallest bucket_gc_gen() - since incrementing the same bucket's generation -+ * number repeatedly forces us to run mark and sweep gc to avoid generation -+ * number wraparound. -+ */ -+ -+static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca, -+ size_t b, struct bucket_mark m) -+{ -+ unsigned last_io = bucket_last_io(c, bucket(ca, b), READ); -+ unsigned max_last_io = ca->max_last_bucket_io[READ]; -+ -+ /* -+ * Time since last read, scaled to [0, 8) where larger value indicates -+ * more recently read data: -+ */ -+ unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io; -+ -+ /* How much we want to keep the data in this bucket: */ -+ unsigned long data_wantness = -+ (hotness + 1) * bucket_sectors_used(m); -+ -+ unsigned long needs_journal_commit = -+ bucket_needs_journal_commit(m, c->journal.last_seq_ondisk); -+ -+ return (data_wantness << 9) | -+ (needs_journal_commit << 8) | -+ (bucket_gc_gen(ca, b) / 16); -+} -+ -+static inline int bucket_alloc_cmp(alloc_heap *h, -+ struct alloc_heap_entry l, -+ struct alloc_heap_entry r) -+{ -+ return cmp_int(l.key, r.key) ?: -+ cmp_int(r.nr, l.nr) ?: -+ cmp_int(l.bucket, r.bucket); -+} -+ -+static inline int bucket_idx_cmp(const void *_l, const void *_r) -+{ -+ const struct alloc_heap_entry *l = _l, *r = _r; -+ -+ return cmp_int(l->bucket, r->bucket); -+} -+ -+static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct bucket_array *buckets; -+ struct alloc_heap_entry e = { 0 }; -+ size_t b, i, nr = 0; -+ -+ ca->alloc_heap.used = 0; -+ -+ mutex_lock(&c->bucket_clock[READ].lock); -+ down_read(&ca->bucket_lock); -+ -+ buckets = bucket_array(ca); -+ -+ bch2_recalc_oldest_io(c, ca, READ); -+ -+ /* -+ * Find buckets with lowest read 
priority, by building a maxheap sorted -+ * by read priority and repeatedly replacing the maximum element until -+ * all buckets have been visited. -+ */ -+ for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) { -+ struct bucket_mark m = READ_ONCE(buckets->b[b].mark); -+ unsigned long key = bucket_sort_key(c, ca, b, m); -+ -+ if (!bch2_can_invalidate_bucket(ca, b, m)) -+ continue; -+ -+ if (e.nr && e.bucket + e.nr == b && e.key == key) { -+ e.nr++; -+ } else { -+ if (e.nr) -+ heap_add_or_replace(&ca->alloc_heap, e, -+ -bucket_alloc_cmp, NULL); -+ -+ e = (struct alloc_heap_entry) { -+ .bucket = b, -+ .nr = 1, -+ .key = key, -+ }; -+ } -+ -+ cond_resched(); -+ } -+ -+ if (e.nr) -+ heap_add_or_replace(&ca->alloc_heap, e, -+ -bucket_alloc_cmp, NULL); -+ -+ for (i = 0; i < ca->alloc_heap.used; i++) -+ nr += ca->alloc_heap.data[i].nr; -+ -+ while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) { -+ nr -= ca->alloc_heap.data[0].nr; -+ heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp, NULL); -+ } -+ -+ up_read(&ca->bucket_lock); -+ mutex_unlock(&c->bucket_clock[READ].lock); -+} -+ -+static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct bucket_array *buckets = bucket_array(ca); -+ struct bucket_mark m; -+ size_t b, start; -+ -+ if (ca->fifo_last_bucket < ca->mi.first_bucket || -+ ca->fifo_last_bucket >= ca->mi.nbuckets) -+ ca->fifo_last_bucket = ca->mi.first_bucket; -+ -+ start = ca->fifo_last_bucket; -+ -+ do { -+ ca->fifo_last_bucket++; -+ if (ca->fifo_last_bucket == ca->mi.nbuckets) -+ ca->fifo_last_bucket = ca->mi.first_bucket; -+ -+ b = ca->fifo_last_bucket; -+ m = READ_ONCE(buckets->b[b].mark); -+ -+ if (bch2_can_invalidate_bucket(ca, b, m)) { -+ struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; -+ -+ heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); -+ if (heap_full(&ca->alloc_heap)) -+ break; -+ } -+ -+ cond_resched(); -+ } while (ca->fifo_last_bucket != start); -+} -+ -+static void 
find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct bucket_array *buckets = bucket_array(ca); -+ struct bucket_mark m; -+ size_t checked, i; -+ -+ for (checked = 0; -+ checked < ca->mi.nbuckets / 2; -+ checked++) { -+ size_t b = bch2_rand_range(ca->mi.nbuckets - -+ ca->mi.first_bucket) + -+ ca->mi.first_bucket; -+ -+ m = READ_ONCE(buckets->b[b].mark); -+ -+ if (bch2_can_invalidate_bucket(ca, b, m)) { -+ struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; -+ -+ heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); -+ if (heap_full(&ca->alloc_heap)) -+ break; -+ } -+ -+ cond_resched(); -+ } -+ -+ sort(ca->alloc_heap.data, -+ ca->alloc_heap.used, -+ sizeof(ca->alloc_heap.data[0]), -+ bucket_idx_cmp, NULL); -+ -+ /* remove duplicates: */ -+ for (i = 0; i + 1 < ca->alloc_heap.used; i++) -+ if (ca->alloc_heap.data[i].bucket == -+ ca->alloc_heap.data[i + 1].bucket) -+ ca->alloc_heap.data[i].nr = 0; -+} -+ -+static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) -+{ -+ size_t i, nr = 0; -+ -+ ca->inc_gen_needs_gc = 0; -+ -+ switch (ca->mi.replacement) { -+ case CACHE_REPLACEMENT_LRU: -+ find_reclaimable_buckets_lru(c, ca); -+ break; -+ case CACHE_REPLACEMENT_FIFO: -+ find_reclaimable_buckets_fifo(c, ca); -+ break; -+ case CACHE_REPLACEMENT_RANDOM: -+ find_reclaimable_buckets_random(c, ca); -+ break; -+ } -+ -+ heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL); -+ -+ for (i = 0; i < ca->alloc_heap.used; i++) -+ nr += ca->alloc_heap.data[i].nr; -+ -+ return nr; -+} -+ -+static inline long next_alloc_bucket(struct bch_dev *ca) -+{ -+ struct alloc_heap_entry e, *top = ca->alloc_heap.data; -+ -+ while (ca->alloc_heap.used) { -+ if (top->nr) { -+ size_t b = top->bucket; -+ -+ top->bucket++; -+ top->nr--; -+ return b; -+ } -+ -+ heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); -+ } -+ -+ return -1; -+} -+ -+/* -+ * returns sequence number of most recent journal entry that updated this -+ * bucket: -+ */ -+static u64 
bucket_journal_seq(struct bch_fs *c, struct bucket_mark m) -+{ -+ if (m.journal_seq_valid) { -+ u64 journal_seq = atomic64_read(&c->journal.seq); -+ u64 bucket_seq = journal_seq; -+ -+ bucket_seq &= ~((u64) U16_MAX); -+ bucket_seq |= m.journal_seq; -+ -+ if (bucket_seq > journal_seq) -+ bucket_seq -= 1 << 16; -+ -+ return bucket_seq; -+ } else { -+ return 0; -+ } -+} -+ -+static int bch2_invalidate_one_bucket2(struct btree_trans *trans, -+ struct bch_dev *ca, -+ struct btree_iter *iter, -+ u64 *journal_seq, unsigned flags) -+{ -+#if 0 -+ __BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key; -+#else -+ /* hack: */ -+ __BKEY_PADDED(k, 8) alloc_key; -+#endif -+ struct bch_fs *c = trans->c; -+ struct bkey_i_alloc *a; -+ struct bkey_alloc_unpacked u; -+ struct bucket *g; -+ struct bucket_mark m; -+ bool invalidating_cached_data; -+ size_t b; -+ int ret = 0; -+ -+ BUG_ON(!ca->alloc_heap.used || -+ !ca->alloc_heap.data[0].nr); -+ b = ca->alloc_heap.data[0].bucket; -+ -+ /* first, put on free_inc and mark as owned by allocator: */ -+ percpu_down_read(&c->mark_lock); -+ spin_lock(&c->freelist_lock); -+ -+ verify_not_on_freelist(c, ca, b); -+ -+ BUG_ON(!fifo_push(&ca->free_inc, b)); -+ -+ g = bucket(ca, b); -+ m = READ_ONCE(g->mark); -+ -+ invalidating_cached_data = m.cached_sectors != 0; -+ -+ /* -+ * If we're not invalidating cached data, we only increment the bucket -+ * gen in memory here, the incremented gen will be updated in the btree -+ * by bch2_trans_mark_pointer(): -+ */ -+ -+ if (!invalidating_cached_data) -+ bch2_invalidate_bucket(c, ca, b, &m); -+ else -+ bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0); -+ -+ spin_unlock(&c->freelist_lock); -+ percpu_up_read(&c->mark_lock); -+ -+ if (!invalidating_cached_data) -+ goto out; -+ -+ /* -+ * If the read-only path is trying to shut down, we can't be generating -+ * new btree updates: -+ */ -+ if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags)) { -+ ret = 1; -+ goto out; -+ } -+ -+ 
BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); -+ -+ bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b)); -+retry: -+ ret = bch2_btree_iter_traverse(iter); -+ if (ret) -+ return ret; -+ -+ percpu_down_read(&c->mark_lock); -+ g = bucket(ca, iter->pos.offset); -+ m = READ_ONCE(g->mark); -+ u = alloc_mem_to_key(g, m); -+ -+ percpu_up_read(&c->mark_lock); -+ -+ invalidating_cached_data = u.cached_sectors != 0; -+ -+ u.gen++; -+ u.data_type = 0; -+ u.dirty_sectors = 0; -+ u.cached_sectors = 0; -+ u.read_time = c->bucket_clock[READ].hand; -+ u.write_time = c->bucket_clock[WRITE].hand; -+ -+ a = bkey_alloc_init(&alloc_key.k); -+ a->k.p = iter->pos; -+ bch2_alloc_pack(a, u); -+ -+ bch2_trans_update(trans, iter, &a->k_i, -+ BTREE_TRIGGER_BUCKET_INVALIDATE); -+ -+ /* -+ * XXX: -+ * when using deferred btree updates, we have journal reclaim doing -+ * btree updates and thus requiring the allocator to make forward -+ * progress, and here the allocator is requiring space in the journal - -+ * so we need a journal pre-reservation: -+ */ -+ ret = bch2_trans_commit(trans, NULL, -+ invalidating_cached_data ? journal_seq : NULL, -+ BTREE_INSERT_NOUNLOCK| -+ BTREE_INSERT_NOCHECK_RW| -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_USE_RESERVE| -+ BTREE_INSERT_USE_ALLOC_RESERVE| -+ flags); -+ if (ret == -EINTR) -+ goto retry; -+out: -+ if (!ret) { -+ /* remove from alloc_heap: */ -+ struct alloc_heap_entry e, *top = ca->alloc_heap.data; -+ -+ top->bucket++; -+ top->nr--; -+ -+ if (!top->nr) -+ heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); -+ -+ /* -+ * Make sure we flush the last journal entry that updated this -+ * bucket (i.e. 
deleting the last reference) before writing to -+ * this bucket again: -+ */ -+ *journal_seq = max(*journal_seq, bucket_journal_seq(c, m)); -+ } else { -+ size_t b2; -+ -+ /* remove from free_inc: */ -+ percpu_down_read(&c->mark_lock); -+ spin_lock(&c->freelist_lock); -+ -+ bch2_mark_alloc_bucket(c, ca, b, false, -+ gc_pos_alloc(c, NULL), 0); -+ -+ BUG_ON(!fifo_pop_back(&ca->free_inc, b2)); -+ BUG_ON(b != b2); -+ -+ spin_unlock(&c->freelist_lock); -+ percpu_up_read(&c->mark_lock); -+ } -+ -+ return ret < 0 ? ret : 0; -+} -+ -+/* -+ * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc: -+ */ -+static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ u64 journal_seq = 0; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, -+ POS(ca->dev_idx, 0), -+ BTREE_ITER_CACHED| -+ BTREE_ITER_CACHED_NOFILL| -+ BTREE_ITER_INTENT); -+ -+ /* Only use nowait if we've already invalidated at least one bucket: */ -+ while (!ret && -+ !fifo_full(&ca->free_inc) && -+ ca->alloc_heap.used) -+ ret = bch2_invalidate_one_bucket2(&trans, ca, iter, &journal_seq, -+ BTREE_INSERT_GC_LOCK_HELD| -+ (!fifo_empty(&ca->free_inc) -+ ? 
BTREE_INSERT_NOWAIT : 0)); -+ -+ bch2_trans_exit(&trans); -+ -+ /* If we used NOWAIT, don't return the error: */ -+ if (!fifo_empty(&ca->free_inc)) -+ ret = 0; -+ if (ret) { -+ bch_err(ca, "error invalidating buckets: %i", ret); -+ return ret; -+ } -+ -+ if (journal_seq) -+ ret = bch2_journal_flush_seq(&c->journal, journal_seq); -+ if (ret) { -+ bch_err(ca, "journal error: %i", ret); -+ return ret; -+ } -+ -+ return 0; -+} -+ -+static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket) -+{ -+ unsigned i; -+ int ret = 0; -+ -+ while (1) { -+ set_current_state(TASK_INTERRUPTIBLE); -+ -+ spin_lock(&c->freelist_lock); -+ for (i = 0; i < RESERVE_NR; i++) { -+ -+ /* -+ * Don't strand buckets on the copygc freelist until -+ * after recovery is finished: -+ */ -+ if (!test_bit(BCH_FS_STARTED, &c->flags) && -+ i == RESERVE_MOVINGGC) -+ continue; -+ -+ if (fifo_push(&ca->free[i], bucket)) { -+ fifo_pop(&ca->free_inc, bucket); -+ -+ closure_wake_up(&c->freelist_wait); -+ ca->allocator_state = ALLOCATOR_RUNNING; -+ -+ spin_unlock(&c->freelist_lock); -+ goto out; -+ } -+ } -+ -+ if (ca->allocator_state != ALLOCATOR_BLOCKED_FULL) { -+ ca->allocator_state = ALLOCATOR_BLOCKED_FULL; -+ closure_wake_up(&c->freelist_wait); -+ } -+ -+ spin_unlock(&c->freelist_lock); -+ -+ if ((current->flags & PF_KTHREAD) && -+ kthread_should_stop()) { -+ ret = 1; -+ break; -+ } -+ -+ schedule(); -+ try_to_freeze(); -+ } -+out: -+ __set_current_state(TASK_RUNNING); -+ return ret; -+} -+ -+/* -+ * Pulls buckets off free_inc, discards them (if enabled), then adds them to -+ * freelists, waiting until there's room if necessary: -+ */ -+static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca) -+{ -+ while (!fifo_empty(&ca->free_inc)) { -+ size_t bucket = fifo_peek(&ca->free_inc); -+ -+ if (ca->mi.discard && -+ blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) -+ blkdev_issue_discard(ca->disk_sb.bdev, -+ bucket_to_sector(ca, bucket), -+ 
ca->mi.bucket_size, GFP_NOIO, 0); -+ -+ if (push_invalidated_bucket(c, ca, bucket)) -+ return 1; -+ } -+ -+ return 0; -+} -+ -+/** -+ * bch_allocator_thread - move buckets from free_inc to reserves -+ * -+ * The free_inc FIFO is populated by find_reclaimable_buckets(), and -+ * the reserves are depleted by bucket allocation. When we run out -+ * of free_inc, try to invalidate some buckets and write out -+ * prios and gens. -+ */ -+static int bch2_allocator_thread(void *arg) -+{ -+ struct bch_dev *ca = arg; -+ struct bch_fs *c = ca->fs; -+ size_t nr; -+ int ret; -+ -+ set_freezable(); -+ ca->allocator_state = ALLOCATOR_RUNNING; -+ -+ while (1) { -+ cond_resched(); -+ if (kthread_should_stop()) -+ break; -+ -+ pr_debug("discarding %zu invalidated buckets", -+ fifo_used(&ca->free_inc)); -+ -+ ret = discard_invalidated_buckets(c, ca); -+ if (ret) -+ goto stop; -+ -+ down_read(&c->gc_lock); -+ -+ ret = bch2_invalidate_buckets(c, ca); -+ if (ret) { -+ up_read(&c->gc_lock); -+ goto stop; -+ } -+ -+ if (!fifo_empty(&ca->free_inc)) { -+ up_read(&c->gc_lock); -+ continue; -+ } -+ -+ pr_debug("free_inc now empty"); -+ -+ do { -+ /* -+ * Find some buckets that we can invalidate, either -+ * they're completely unused, or only contain clean data -+ * that's been written back to the backing device or -+ * another cache tier -+ */ -+ -+ pr_debug("scanning for reclaimable buckets"); -+ -+ nr = find_reclaimable_buckets(c, ca); -+ -+ pr_debug("found %zu buckets", nr); -+ -+ trace_alloc_batch(ca, nr, ca->alloc_heap.size); -+ -+ if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) || -+ ca->inc_gen_really_needs_gc) && -+ c->gc_thread) { -+ atomic_inc(&c->kick_gc); -+ wake_up_process(c->gc_thread); -+ } -+ -+ /* -+ * If we found any buckets, we have to invalidate them -+ * before we scan for more - but if we didn't find very -+ * many we may want to wait on more buckets being -+ * available so we don't spin: -+ */ -+ if (!nr || -+ (nr < ALLOC_SCAN_BATCH(ca) && -+ 
!fifo_empty(&ca->free[RESERVE_NONE]))) { -+ ret = wait_buckets_available(c, ca); -+ if (ret) { -+ up_read(&c->gc_lock); -+ goto stop; -+ } -+ } -+ } while (!nr); -+ -+ up_read(&c->gc_lock); -+ -+ pr_debug("%zu buckets to invalidate", nr); -+ -+ /* -+ * alloc_heap is now full of newly-invalidated buckets: next, -+ * write out the new bucket gens: -+ */ -+ } -+ -+stop: -+ pr_debug("alloc thread stopping (ret %i)", ret); -+ ca->allocator_state = ALLOCATOR_STOPPED; -+ closure_wake_up(&c->freelist_wait); -+ return 0; -+} -+ -+/* Startup/shutdown (ro/rw): */ -+ -+void bch2_recalc_capacity(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ u64 capacity = 0, reserved_sectors = 0, gc_reserve, copygc_threshold = 0; -+ unsigned bucket_size_max = 0; -+ unsigned long ra_pages = 0; -+ unsigned i, j; -+ -+ lockdep_assert_held(&c->state_lock); -+ -+ for_each_online_member(ca, c, i) { -+ struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_bdi; -+ -+ ra_pages += bdi->ra_pages; -+ } -+ -+ bch2_set_ra_pages(c, ra_pages); -+ -+ for_each_rw_member(ca, c, i) { -+ u64 dev_reserve = 0; -+ -+ /* -+ * We need to reserve buckets (from the number -+ * of currently available buckets) against -+ * foreground writes so that mainly copygc can -+ * make forward progress. -+ * -+ * We need enough to refill the various reserves -+ * from scratch - copygc will use its entire -+ * reserve all at once, then run against when -+ * its reserve is refilled (from the formerly -+ * available buckets). -+ * -+ * This reserve is just used when considering if -+ * allocations for foreground writes must wait - -+ * not -ENOSPC calculations. 
-+ */ -+ for (j = 0; j < RESERVE_NONE; j++) -+ dev_reserve += ca->free[j].size; -+ -+ dev_reserve += 1; /* btree write point */ -+ dev_reserve += 1; /* copygc write point */ -+ dev_reserve += 1; /* rebalance write point */ -+ -+ dev_reserve *= ca->mi.bucket_size; -+ -+ copygc_threshold += dev_reserve; -+ -+ capacity += bucket_to_sector(ca, ca->mi.nbuckets - -+ ca->mi.first_bucket); -+ -+ reserved_sectors += dev_reserve * 2; -+ -+ bucket_size_max = max_t(unsigned, bucket_size_max, -+ ca->mi.bucket_size); -+ } -+ -+ gc_reserve = c->opts.gc_reserve_bytes -+ ? c->opts.gc_reserve_bytes >> 9 -+ : div64_u64(capacity * c->opts.gc_reserve_percent, 100); -+ -+ reserved_sectors = max(gc_reserve, reserved_sectors); -+ -+ reserved_sectors = min(reserved_sectors, capacity); -+ -+ c->copygc_threshold = copygc_threshold; -+ c->capacity = capacity - reserved_sectors; -+ -+ c->bucket_size_max = bucket_size_max; -+ -+ if (c->capacity) { -+ bch2_io_timer_add(&c->io_clock[READ], -+ &c->bucket_clock[READ].rescale); -+ bch2_io_timer_add(&c->io_clock[WRITE], -+ &c->bucket_clock[WRITE].rescale); -+ } else { -+ bch2_io_timer_del(&c->io_clock[READ], -+ &c->bucket_clock[READ].rescale); -+ bch2_io_timer_del(&c->io_clock[WRITE], -+ &c->bucket_clock[WRITE].rescale); -+ } -+ -+ /* Wake up case someone was waiting for buckets */ -+ closure_wake_up(&c->freelist_wait); -+} -+ -+static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct open_bucket *ob; -+ bool ret = false; -+ -+ for (ob = c->open_buckets; -+ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); -+ ob++) { -+ spin_lock(&ob->lock); -+ if (ob->valid && !ob->on_partial_list && -+ ob->ptr.dev == ca->dev_idx) -+ ret = true; -+ spin_unlock(&ob->lock); -+ } -+ -+ return ret; -+} -+ -+/* device goes ro: */ -+void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) -+{ -+ unsigned i; -+ -+ BUG_ON(ca->alloc_thread); -+ -+ /* First, remove device from allocation groups: */ -+ -+ for (i = 0; i < 
ARRAY_SIZE(c->rw_devs); i++) -+ clear_bit(ca->dev_idx, c->rw_devs[i].d); -+ -+ /* -+ * Capacity is calculated based off of devices in allocation groups: -+ */ -+ bch2_recalc_capacity(c); -+ -+ /* Next, close write points that point to this device... */ -+ for (i = 0; i < ARRAY_SIZE(c->write_points); i++) -+ bch2_writepoint_stop(c, ca, &c->write_points[i]); -+ -+ bch2_writepoint_stop(c, ca, &c->copygc_write_point); -+ bch2_writepoint_stop(c, ca, &c->rebalance_write_point); -+ bch2_writepoint_stop(c, ca, &c->btree_write_point); -+ -+ mutex_lock(&c->btree_reserve_cache_lock); -+ while (c->btree_reserve_cache_nr) { -+ struct btree_alloc *a = -+ &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; -+ -+ bch2_open_buckets_put(c, &a->ob); -+ } -+ mutex_unlock(&c->btree_reserve_cache_lock); -+ -+ while (1) { -+ struct open_bucket *ob; -+ -+ spin_lock(&c->freelist_lock); -+ if (!ca->open_buckets_partial_nr) { -+ spin_unlock(&c->freelist_lock); -+ break; -+ } -+ ob = c->open_buckets + -+ ca->open_buckets_partial[--ca->open_buckets_partial_nr]; -+ ob->on_partial_list = false; -+ spin_unlock(&c->freelist_lock); -+ -+ bch2_open_bucket_put(c, ob); -+ } -+ -+ bch2_ec_stop_dev(c, ca); -+ -+ /* -+ * Wake up threads that were blocked on allocation, so they can notice -+ * the device can no longer be removed and the capacity has changed: -+ */ -+ closure_wake_up(&c->freelist_wait); -+ -+ /* -+ * journal_res_get() can block waiting for free space in the journal - -+ * it needs to notice there may not be devices to allocate from anymore: -+ */ -+ wake_up(&c->journal.wait); -+ -+ /* Now wait for any in flight writes: */ -+ -+ closure_wait_event(&c->open_buckets_wait, -+ !bch2_dev_has_open_write_point(c, ca)); -+} -+ -+/* device goes rw: */ -+void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) -+{ -+ unsigned i; -+ -+ for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) -+ if (ca->mi.data_allowed & (1 << i)) -+ set_bit(ca->dev_idx, c->rw_devs[i].d); -+} -+ -+void 
bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca) -+{ -+ if (ca->alloc_thread) -+ closure_wait_event(&c->freelist_wait, -+ ca->allocator_state != ALLOCATOR_RUNNING); -+} -+ -+/* stop allocator thread: */ -+void bch2_dev_allocator_stop(struct bch_dev *ca) -+{ -+ struct task_struct *p; -+ -+ p = rcu_dereference_protected(ca->alloc_thread, 1); -+ ca->alloc_thread = NULL; -+ -+ /* -+ * We need an rcu barrier between setting ca->alloc_thread = NULL and -+ * the thread shutting down to avoid bch2_wake_allocator() racing: -+ * -+ * XXX: it would be better to have the rcu barrier be asynchronous -+ * instead of blocking us here -+ */ -+ synchronize_rcu(); -+ -+ if (p) { -+ kthread_stop(p); -+ put_task_struct(p); -+ } -+} -+ -+/* start allocator thread: */ -+int bch2_dev_allocator_start(struct bch_dev *ca) -+{ -+ struct task_struct *p; -+ -+ /* -+ * allocator thread already started? -+ */ -+ if (ca->alloc_thread) -+ return 0; -+ -+ p = kthread_create(bch2_allocator_thread, ca, -+ "bch_alloc[%s]", ca->name); -+ if (IS_ERR(p)) -+ return PTR_ERR(p); -+ -+ get_task_struct(p); -+ rcu_assign_pointer(ca->alloc_thread, p); -+ wake_up_process(p); -+ return 0; -+} -+ -+void bch2_fs_allocator_background_init(struct bch_fs *c) -+{ -+ spin_lock_init(&c->freelist_lock); -+ bch2_bucket_clock_init(c, READ); -+ bch2_bucket_clock_init(c, WRITE); -+ -+ c->pd_controllers_update_seconds = 5; -+ INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update); -+} -diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h -new file mode 100644 -index 000000000000..f6b9f27f0713 ---- /dev/null -+++ b/fs/bcachefs/alloc_background.h -@@ -0,0 +1,97 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_ALLOC_BACKGROUND_H -+#define _BCACHEFS_ALLOC_BACKGROUND_H -+ -+#include "bcachefs.h" -+#include "alloc_types.h" -+#include "debug.h" -+ -+struct bkey_alloc_unpacked { -+ u8 gen; -+#define x(_name, _bits) u##_bits _name; -+ BCH_ALLOC_FIELDS() -+#undef x -+}; 
-+ -+/* returns true if not equal */ -+static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l, -+ struct bkey_alloc_unpacked r) -+{ -+ return l.gen != r.gen -+#define x(_name, _bits) || l._name != r._name -+ BCH_ALLOC_FIELDS() -+#undef x -+ ; -+} -+ -+struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c); -+void bch2_alloc_pack(struct bkey_i_alloc *, -+ const struct bkey_alloc_unpacked); -+ -+static inline struct bkey_alloc_unpacked -+alloc_mem_to_key(struct bucket *g, struct bucket_mark m) -+{ -+ return (struct bkey_alloc_unpacked) { -+ .gen = m.gen, -+ .oldest_gen = g->oldest_gen, -+ .data_type = m.data_type, -+ .dirty_sectors = m.dirty_sectors, -+ .cached_sectors = m.cached_sectors, -+ .read_time = g->io_time[READ], -+ .write_time = g->io_time[WRITE], -+ }; -+} -+ -+#define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) -+ -+const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c); -+void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+ -+#define bch2_bkey_ops_alloc (struct bkey_ops) { \ -+ .key_invalid = bch2_alloc_invalid, \ -+ .val_to_text = bch2_alloc_to_text, \ -+} -+ -+struct journal_keys; -+int bch2_alloc_read(struct bch_fs *, struct journal_keys *); -+ -+static inline void bch2_wake_allocator(struct bch_dev *ca) -+{ -+ struct task_struct *p; -+ -+ rcu_read_lock(); -+ p = rcu_dereference(ca->alloc_thread); -+ if (p) -+ wake_up_process(p); -+ rcu_read_unlock(); -+} -+ -+static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca, -+ size_t bucket) -+{ -+ if (expensive_debug_checks(c)) { -+ size_t iter; -+ long i; -+ unsigned j; -+ -+ for (j = 0; j < RESERVE_NR; j++) -+ fifo_for_each_entry(i, &ca->free[j], iter) -+ BUG_ON(i == bucket); -+ fifo_for_each_entry(i, &ca->free_inc, iter) -+ BUG_ON(i == bucket); -+ } -+} -+ -+void bch2_recalc_capacity(struct bch_fs *); -+ -+void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); -+void 
bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); -+ -+void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *); -+void bch2_dev_allocator_stop(struct bch_dev *); -+int bch2_dev_allocator_start(struct bch_dev *); -+ -+int bch2_alloc_write(struct bch_fs *, unsigned, bool *); -+void bch2_fs_allocator_background_init(struct bch_fs *); -+ -+#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ -diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c -new file mode 100644 -index 000000000000..4a048828869b ---- /dev/null -+++ b/fs/bcachefs/alloc_foreground.c -@@ -0,0 +1,992 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Primary bucket allocation code -+ * -+ * Copyright 2012 Google, Inc. -+ * -+ * Allocation in bcache is done in terms of buckets: -+ * -+ * Each bucket has associated an 8 bit gen; this gen corresponds to the gen in -+ * btree pointers - they must match for the pointer to be considered valid. -+ * -+ * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a -+ * bucket simply by incrementing its gen. -+ * -+ * The gens (along with the priorities; it's really the gens are important but -+ * the code is named as if it's the priorities) are written in an arbitrary list -+ * of buckets on disk, with a pointer to them in the journal header. -+ * -+ * When we invalidate a bucket, we have to write its new gen to disk and wait -+ * for that write to complete before we use it - otherwise after a crash we -+ * could have pointers that appeared to be good but pointed to data that had -+ * been overwritten. -+ * -+ * Since the gens and priorities are all stored contiguously on disk, we can -+ * batch this up: We fill up the free_inc list with freshly invalidated buckets, -+ * call prio_write(), and when prio_write() finishes we pull buckets off the -+ * free_inc list and optionally discard them. 
-+ * -+ * free_inc isn't the only freelist - if it was, we'd often have to sleep while -+ * priorities and gens were being written before we could allocate. c->free is a -+ * smaller freelist, and buckets on that list are always ready to be used. -+ * -+ * If we've got discards enabled, that happens when a bucket moves from the -+ * free_inc list to the free list. -+ * -+ * It's important to ensure that gens don't wrap around - with respect to -+ * either the oldest gen in the btree or the gen on disk. This is quite -+ * difficult to do in practice, but we explicitly guard against it anyways - if -+ * a bucket is in danger of wrapping around we simply skip invalidating it that -+ * time around, and we garbage collect or rewrite the priorities sooner than we -+ * would have otherwise. -+ * -+ * bch2_bucket_alloc() allocates a single bucket from a specific device. -+ * -+ * bch2_bucket_alloc_set() allocates one or more buckets from different devices -+ * in a given filesystem. -+ * -+ * invalidate_buckets() drives all the processes described above. It's called -+ * from bch2_bucket_alloc() and a few other places that need to make sure free -+ * buckets are ready. -+ * -+ * invalidate_buckets_(lru|fifo)() find buckets that are available to be -+ * invalidated, and then invalidate them and stick them on the free_inc list - -+ * in either lru or fifo order. -+ */ -+ -+#include "bcachefs.h" -+#include "alloc_background.h" -+#include "alloc_foreground.h" -+#include "btree_gc.h" -+#include "buckets.h" -+#include "clock.h" -+#include "debug.h" -+#include "disk_groups.h" -+#include "ec.h" -+#include "io.h" -+ -+#include -+#include -+#include -+#include -+ -+/* -+ * Open buckets represent a bucket that's currently being allocated from. 
They -+ * serve two purposes: -+ * -+ * - They track buckets that have been partially allocated, allowing for -+ * sub-bucket sized allocations - they're used by the sector allocator below -+ * -+ * - They provide a reference to the buckets they own that mark and sweep GC -+ * can find, until the new allocation has a pointer to it inserted into the -+ * btree -+ * -+ * When allocating some space with the sector allocator, the allocation comes -+ * with a reference to an open bucket - the caller is required to put that -+ * reference _after_ doing the index update that makes its allocation reachable. -+ */ -+ -+void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) -+{ -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); -+ -+ if (ob->ec) { -+ bch2_ec_bucket_written(c, ob); -+ return; -+ } -+ -+ percpu_down_read(&c->mark_lock); -+ spin_lock(&ob->lock); -+ -+ bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), -+ false, gc_pos_alloc(c, ob), 0); -+ ob->valid = false; -+ ob->type = 0; -+ -+ spin_unlock(&ob->lock); -+ percpu_up_read(&c->mark_lock); -+ -+ spin_lock(&c->freelist_lock); -+ ob->freelist = c->open_buckets_freelist; -+ c->open_buckets_freelist = ob - c->open_buckets; -+ c->open_buckets_nr_free++; -+ spin_unlock(&c->freelist_lock); -+ -+ closure_wake_up(&c->open_buckets_wait); -+} -+ -+void bch2_open_bucket_write_error(struct bch_fs *c, -+ struct open_buckets *obs, -+ unsigned dev) -+{ -+ struct open_bucket *ob; -+ unsigned i; -+ -+ open_bucket_for_each(c, obs, ob, i) -+ if (ob->ptr.dev == dev && -+ ob->ec) -+ bch2_ec_bucket_cancel(c, ob); -+} -+ -+static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) -+{ -+ struct open_bucket *ob; -+ -+ BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free); -+ -+ ob = c->open_buckets + c->open_buckets_freelist; -+ c->open_buckets_freelist = ob->freelist; -+ atomic_set(&ob->pin, 1); -+ ob->type = 0; -+ -+ c->open_buckets_nr_free--; -+ return ob; -+} -+ -+static void 
open_bucket_free_unused(struct bch_fs *c, -+ struct write_point *wp, -+ struct open_bucket *ob) -+{ -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); -+ bool may_realloc = wp->type == BCH_DATA_user; -+ -+ BUG_ON(ca->open_buckets_partial_nr > -+ ARRAY_SIZE(ca->open_buckets_partial)); -+ -+ if (ca->open_buckets_partial_nr < -+ ARRAY_SIZE(ca->open_buckets_partial) && -+ may_realloc) { -+ spin_lock(&c->freelist_lock); -+ ob->on_partial_list = true; -+ ca->open_buckets_partial[ca->open_buckets_partial_nr++] = -+ ob - c->open_buckets; -+ spin_unlock(&c->freelist_lock); -+ -+ closure_wake_up(&c->open_buckets_wait); -+ closure_wake_up(&c->freelist_wait); -+ } else { -+ bch2_open_bucket_put(c, ob); -+ } -+} -+ -+static void verify_not_stale(struct bch_fs *c, const struct open_buckets *obs) -+{ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ struct open_bucket *ob; -+ unsigned i; -+ -+ open_bucket_for_each(c, obs, ob, i) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); -+ -+ BUG_ON(ptr_stale(ca, &ob->ptr)); -+ } -+#endif -+} -+ -+/* _only_ for allocating the journal on a new device: */ -+long bch2_bucket_alloc_new_fs(struct bch_dev *ca) -+{ -+ struct bucket_array *buckets; -+ ssize_t b; -+ -+ rcu_read_lock(); -+ buckets = bucket_array(ca); -+ -+ for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) -+ if (is_available_bucket(buckets->b[b].mark)) -+ goto success; -+ b = -1; -+success: -+ rcu_read_unlock(); -+ return b; -+} -+ -+static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) -+{ -+ switch (reserve) { -+ case RESERVE_ALLOC: -+ return 0; -+ case RESERVE_BTREE: -+ return OPEN_BUCKETS_COUNT / 4; -+ default: -+ return OPEN_BUCKETS_COUNT / 2; -+ } -+} -+ -+/** -+ * bch_bucket_alloc - allocate a single bucket from a specific device -+ * -+ * Returns index of bucket on success, 0 on failure -+ * */ -+struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, -+ enum alloc_reserve reserve, -+ bool may_alloc_partial, -+ struct 
closure *cl) -+{ -+ struct bucket_array *buckets; -+ struct open_bucket *ob; -+ long bucket = 0; -+ -+ spin_lock(&c->freelist_lock); -+ -+ if (may_alloc_partial) { -+ int i; -+ -+ for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) { -+ ob = c->open_buckets + ca->open_buckets_partial[i]; -+ -+ if (reserve <= ob->alloc_reserve) { -+ array_remove_item(ca->open_buckets_partial, -+ ca->open_buckets_partial_nr, -+ i); -+ ob->on_partial_list = false; -+ ob->alloc_reserve = reserve; -+ spin_unlock(&c->freelist_lock); -+ return ob; -+ } -+ } -+ } -+ -+ if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) { -+ if (cl) -+ closure_wait(&c->open_buckets_wait, cl); -+ -+ if (!c->blocked_allocate_open_bucket) -+ c->blocked_allocate_open_bucket = local_clock(); -+ -+ spin_unlock(&c->freelist_lock); -+ trace_open_bucket_alloc_fail(ca, reserve); -+ return ERR_PTR(-OPEN_BUCKETS_EMPTY); -+ } -+ -+ if (likely(fifo_pop(&ca->free[RESERVE_NONE], bucket))) -+ goto out; -+ -+ switch (reserve) { -+ case RESERVE_ALLOC: -+ if (fifo_pop(&ca->free[RESERVE_BTREE], bucket)) -+ goto out; -+ break; -+ case RESERVE_BTREE: -+ if (fifo_used(&ca->free[RESERVE_BTREE]) * 2 >= -+ ca->free[RESERVE_BTREE].size && -+ fifo_pop(&ca->free[RESERVE_BTREE], bucket)) -+ goto out; -+ break; -+ case RESERVE_MOVINGGC: -+ if (fifo_pop(&ca->free[RESERVE_MOVINGGC], bucket)) -+ goto out; -+ break; -+ default: -+ break; -+ } -+ -+ if (cl) -+ closure_wait(&c->freelist_wait, cl); -+ -+ if (!c->blocked_allocate) -+ c->blocked_allocate = local_clock(); -+ -+ spin_unlock(&c->freelist_lock); -+ -+ trace_bucket_alloc_fail(ca, reserve); -+ return ERR_PTR(-FREELIST_EMPTY); -+out: -+ verify_not_on_freelist(c, ca, bucket); -+ -+ ob = bch2_open_bucket_alloc(c); -+ -+ spin_lock(&ob->lock); -+ buckets = bucket_array(ca); -+ -+ ob->valid = true; -+ ob->sectors_free = ca->mi.bucket_size; -+ ob->alloc_reserve = reserve; -+ ob->ptr = (struct bch_extent_ptr) { -+ .type = 1 << BCH_EXTENT_ENTRY_ptr, -+ .gen = 
buckets->b[bucket].mark.gen, -+ .offset = bucket_to_sector(ca, bucket), -+ .dev = ca->dev_idx, -+ }; -+ -+ bucket_io_clock_reset(c, ca, bucket, READ); -+ bucket_io_clock_reset(c, ca, bucket, WRITE); -+ spin_unlock(&ob->lock); -+ -+ if (c->blocked_allocate_open_bucket) { -+ bch2_time_stats_update( -+ &c->times[BCH_TIME_blocked_allocate_open_bucket], -+ c->blocked_allocate_open_bucket); -+ c->blocked_allocate_open_bucket = 0; -+ } -+ -+ if (c->blocked_allocate) { -+ bch2_time_stats_update( -+ &c->times[BCH_TIME_blocked_allocate], -+ c->blocked_allocate); -+ c->blocked_allocate = 0; -+ } -+ -+ spin_unlock(&c->freelist_lock); -+ -+ bch2_wake_allocator(ca); -+ -+ trace_bucket_alloc(ca, reserve); -+ return ob; -+} -+ -+static int __dev_stripe_cmp(struct dev_stripe_state *stripe, -+ unsigned l, unsigned r) -+{ -+ return ((stripe->next_alloc[l] > stripe->next_alloc[r]) - -+ (stripe->next_alloc[l] < stripe->next_alloc[r])); -+} -+ -+#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r) -+ -+struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, -+ struct dev_stripe_state *stripe, -+ struct bch_devs_mask *devs) -+{ -+ struct dev_alloc_list ret = { .nr = 0 }; -+ unsigned i; -+ -+ for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX) -+ ret.devs[ret.nr++] = i; -+ -+ bubble_sort(ret.devs, ret.nr, dev_stripe_cmp); -+ return ret; -+} -+ -+void bch2_dev_stripe_increment(struct bch_dev *ca, -+ struct dev_stripe_state *stripe) -+{ -+ u64 *v = stripe->next_alloc + ca->dev_idx; -+ u64 free_space = dev_buckets_free(ca); -+ u64 free_space_inv = free_space -+ ? div64_u64(1ULL << 48, free_space) -+ : 1ULL << 48; -+ u64 scale = *v / 4; -+ -+ if (*v + free_space_inv >= *v) -+ *v += free_space_inv; -+ else -+ *v = U64_MAX; -+ -+ for (v = stripe->next_alloc; -+ v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) -+ *v = *v < scale ? 
0 : *v - scale; -+} -+ -+#define BUCKET_MAY_ALLOC_PARTIAL (1 << 0) -+#define BUCKET_ALLOC_USE_DURABILITY (1 << 1) -+ -+static void add_new_bucket(struct bch_fs *c, -+ struct open_buckets *ptrs, -+ struct bch_devs_mask *devs_may_alloc, -+ unsigned *nr_effective, -+ bool *have_cache, -+ unsigned flags, -+ struct open_bucket *ob) -+{ -+ unsigned durability = -+ bch_dev_bkey_exists(c, ob->ptr.dev)->mi.durability; -+ -+ __clear_bit(ob->ptr.dev, devs_may_alloc->d); -+ *nr_effective += (flags & BUCKET_ALLOC_USE_DURABILITY) -+ ? durability : 1; -+ *have_cache |= !durability; -+ -+ ob_push(c, ptrs, ob); -+} -+ -+enum bucket_alloc_ret -+bch2_bucket_alloc_set(struct bch_fs *c, -+ struct open_buckets *ptrs, -+ struct dev_stripe_state *stripe, -+ struct bch_devs_mask *devs_may_alloc, -+ unsigned nr_replicas, -+ unsigned *nr_effective, -+ bool *have_cache, -+ enum alloc_reserve reserve, -+ unsigned flags, -+ struct closure *cl) -+{ -+ struct dev_alloc_list devs_sorted = -+ bch2_dev_alloc_list(c, stripe, devs_may_alloc); -+ struct bch_dev *ca; -+ enum bucket_alloc_ret ret = INSUFFICIENT_DEVICES; -+ unsigned i; -+ -+ BUG_ON(*nr_effective >= nr_replicas); -+ -+ for (i = 0; i < devs_sorted.nr; i++) { -+ struct open_bucket *ob; -+ -+ ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); -+ if (!ca) -+ continue; -+ -+ if (!ca->mi.durability && *have_cache) -+ continue; -+ -+ ob = bch2_bucket_alloc(c, ca, reserve, -+ flags & BUCKET_MAY_ALLOC_PARTIAL, cl); -+ if (IS_ERR(ob)) { -+ ret = -PTR_ERR(ob); -+ -+ if (cl) -+ return ret; -+ continue; -+ } -+ -+ add_new_bucket(c, ptrs, devs_may_alloc, -+ nr_effective, have_cache, flags, ob); -+ -+ bch2_dev_stripe_increment(ca, stripe); -+ -+ if (*nr_effective >= nr_replicas) -+ return ALLOC_SUCCESS; -+ } -+ -+ return ret; -+} -+ -+/* Allocate from stripes: */ -+ -+/* -+ * if we can't allocate a new stripe because there are already too many -+ * partially filled stripes, force allocating from an existing stripe even when -+ * it's to a device we 
don't want: -+ */ -+ -+static void bucket_alloc_from_stripe(struct bch_fs *c, -+ struct open_buckets *ptrs, -+ struct write_point *wp, -+ struct bch_devs_mask *devs_may_alloc, -+ u16 target, -+ unsigned erasure_code, -+ unsigned nr_replicas, -+ unsigned *nr_effective, -+ bool *have_cache, -+ unsigned flags) -+{ -+ struct dev_alloc_list devs_sorted; -+ struct ec_stripe_head *h; -+ struct open_bucket *ob; -+ struct bch_dev *ca; -+ unsigned i, ec_idx; -+ -+ if (!erasure_code) -+ return; -+ -+ if (nr_replicas < 2) -+ return; -+ -+ if (ec_open_bucket(c, ptrs)) -+ return; -+ -+ h = bch2_ec_stripe_head_get(c, target, 0, nr_replicas - 1); -+ if (!h) -+ return; -+ -+ devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); -+ -+ for (i = 0; i < devs_sorted.nr; i++) -+ open_bucket_for_each(c, &h->s->blocks, ob, ec_idx) -+ if (ob->ptr.dev == devs_sorted.devs[i] && -+ !test_and_set_bit(h->s->data_block_idx[ec_idx], -+ h->s->blocks_allocated)) -+ goto got_bucket; -+ goto out_put_head; -+got_bucket: -+ ca = bch_dev_bkey_exists(c, ob->ptr.dev); -+ -+ ob->ec_idx = h->s->data_block_idx[ec_idx]; -+ ob->ec = h->s; -+ -+ add_new_bucket(c, ptrs, devs_may_alloc, -+ nr_effective, have_cache, flags, ob); -+ atomic_inc(&h->s->pin); -+out_put_head: -+ bch2_ec_stripe_head_put(c, h); -+} -+ -+/* Sector allocator */ -+ -+static void get_buckets_from_writepoint(struct bch_fs *c, -+ struct open_buckets *ptrs, -+ struct write_point *wp, -+ struct bch_devs_mask *devs_may_alloc, -+ unsigned nr_replicas, -+ unsigned *nr_effective, -+ bool *have_cache, -+ unsigned flags, -+ bool need_ec) -+{ -+ struct open_buckets ptrs_skip = { .nr = 0 }; -+ struct open_bucket *ob; -+ unsigned i; -+ -+ open_bucket_for_each(c, &wp->ptrs, ob, i) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); -+ -+ if (*nr_effective < nr_replicas && -+ test_bit(ob->ptr.dev, devs_may_alloc->d) && -+ (ca->mi.durability || -+ (wp->type == BCH_DATA_user && !*have_cache)) && -+ (ob->ec || !need_ec)) { -+ 
add_new_bucket(c, ptrs, devs_may_alloc, -+ nr_effective, have_cache, -+ flags, ob); -+ } else { -+ ob_push(c, &ptrs_skip, ob); -+ } -+ } -+ wp->ptrs = ptrs_skip; -+} -+ -+static enum bucket_alloc_ret -+open_bucket_add_buckets(struct bch_fs *c, -+ struct open_buckets *ptrs, -+ struct write_point *wp, -+ struct bch_devs_list *devs_have, -+ u16 target, -+ unsigned erasure_code, -+ unsigned nr_replicas, -+ unsigned *nr_effective, -+ bool *have_cache, -+ enum alloc_reserve reserve, -+ unsigned flags, -+ struct closure *_cl) -+{ -+ struct bch_devs_mask devs; -+ struct open_bucket *ob; -+ struct closure *cl = NULL; -+ enum bucket_alloc_ret ret; -+ unsigned i; -+ -+ rcu_read_lock(); -+ devs = target_rw_devs(c, wp->type, target); -+ rcu_read_unlock(); -+ -+ /* Don't allocate from devices we already have pointers to: */ -+ for (i = 0; i < devs_have->nr; i++) -+ __clear_bit(devs_have->devs[i], devs.d); -+ -+ open_bucket_for_each(c, ptrs, ob, i) -+ __clear_bit(ob->ptr.dev, devs.d); -+ -+ if (erasure_code) { -+ if (!ec_open_bucket(c, ptrs)) { -+ get_buckets_from_writepoint(c, ptrs, wp, &devs, -+ nr_replicas, nr_effective, -+ have_cache, flags, true); -+ if (*nr_effective >= nr_replicas) -+ return 0; -+ } -+ -+ if (!ec_open_bucket(c, ptrs)) { -+ bucket_alloc_from_stripe(c, ptrs, wp, &devs, -+ target, erasure_code, -+ nr_replicas, nr_effective, -+ have_cache, flags); -+ if (*nr_effective >= nr_replicas) -+ return 0; -+ } -+ } -+ -+ get_buckets_from_writepoint(c, ptrs, wp, &devs, -+ nr_replicas, nr_effective, -+ have_cache, flags, false); -+ if (*nr_effective >= nr_replicas) -+ return 0; -+ -+ percpu_down_read(&c->mark_lock); -+ rcu_read_lock(); -+ -+retry_blocking: -+ /* -+ * Try nonblocking first, so that if one device is full we'll try from -+ * other devices: -+ */ -+ ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs, -+ nr_replicas, nr_effective, have_cache, -+ reserve, flags, cl); -+ if (ret && ret != INSUFFICIENT_DEVICES && !cl && _cl) { -+ cl = _cl; -+ goto 
retry_blocking; -+ } -+ -+ rcu_read_unlock(); -+ percpu_up_read(&c->mark_lock); -+ -+ return ret; -+} -+ -+void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca, -+ struct open_buckets *obs) -+{ -+ struct open_buckets ptrs = { .nr = 0 }; -+ struct open_bucket *ob, *ob2; -+ unsigned i, j; -+ -+ open_bucket_for_each(c, obs, ob, i) { -+ bool drop = !ca || ob->ptr.dev == ca->dev_idx; -+ -+ if (!drop && ob->ec) { -+ mutex_lock(&ob->ec->lock); -+ open_bucket_for_each(c, &ob->ec->blocks, ob2, j) -+ drop |= ob2->ptr.dev == ca->dev_idx; -+ open_bucket_for_each(c, &ob->ec->parity, ob2, j) -+ drop |= ob2->ptr.dev == ca->dev_idx; -+ mutex_unlock(&ob->ec->lock); -+ } -+ -+ if (drop) -+ bch2_open_bucket_put(c, ob); -+ else -+ ob_push(c, &ptrs, ob); -+ } -+ -+ *obs = ptrs; -+} -+ -+void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca, -+ struct write_point *wp) -+{ -+ mutex_lock(&wp->lock); -+ bch2_open_buckets_stop_dev(c, ca, &wp->ptrs); -+ mutex_unlock(&wp->lock); -+} -+ -+static inline struct hlist_head *writepoint_hash(struct bch_fs *c, -+ unsigned long write_point) -+{ -+ unsigned hash = -+ hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash))); -+ -+ return &c->write_points_hash[hash]; -+} -+ -+static struct write_point *__writepoint_find(struct hlist_head *head, -+ unsigned long write_point) -+{ -+ struct write_point *wp; -+ -+ hlist_for_each_entry_rcu(wp, head, node) -+ if (wp->write_point == write_point) -+ return wp; -+ -+ return NULL; -+} -+ -+static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor) -+{ -+ u64 stranded = c->write_points_nr * c->bucket_size_max; -+ u64 free = bch2_fs_usage_read_short(c).free; -+ -+ return stranded * factor > free; -+} -+ -+static bool try_increase_writepoints(struct bch_fs *c) -+{ -+ struct write_point *wp; -+ -+ if (c->write_points_nr == ARRAY_SIZE(c->write_points) || -+ too_many_writepoints(c, 32)) -+ return false; -+ -+ wp = c->write_points + c->write_points_nr++; -+ 
hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point)); -+ return true; -+} -+ -+static bool try_decrease_writepoints(struct bch_fs *c, -+ unsigned old_nr) -+{ -+ struct write_point *wp; -+ -+ mutex_lock(&c->write_points_hash_lock); -+ if (c->write_points_nr < old_nr) { -+ mutex_unlock(&c->write_points_hash_lock); -+ return true; -+ } -+ -+ if (c->write_points_nr == 1 || -+ !too_many_writepoints(c, 8)) { -+ mutex_unlock(&c->write_points_hash_lock); -+ return false; -+ } -+ -+ wp = c->write_points + --c->write_points_nr; -+ -+ hlist_del_rcu(&wp->node); -+ mutex_unlock(&c->write_points_hash_lock); -+ -+ bch2_writepoint_stop(c, NULL, wp); -+ return true; -+} -+ -+static struct write_point *writepoint_find(struct bch_fs *c, -+ unsigned long write_point) -+{ -+ struct write_point *wp, *oldest; -+ struct hlist_head *head; -+ -+ if (!(write_point & 1UL)) { -+ wp = (struct write_point *) write_point; -+ mutex_lock(&wp->lock); -+ return wp; -+ } -+ -+ head = writepoint_hash(c, write_point); -+restart_find: -+ wp = __writepoint_find(head, write_point); -+ if (wp) { -+lock_wp: -+ mutex_lock(&wp->lock); -+ if (wp->write_point == write_point) -+ goto out; -+ mutex_unlock(&wp->lock); -+ goto restart_find; -+ } -+restart_find_oldest: -+ oldest = NULL; -+ for (wp = c->write_points; -+ wp < c->write_points + c->write_points_nr; wp++) -+ if (!oldest || time_before64(wp->last_used, oldest->last_used)) -+ oldest = wp; -+ -+ mutex_lock(&oldest->lock); -+ mutex_lock(&c->write_points_hash_lock); -+ if (oldest >= c->write_points + c->write_points_nr || -+ try_increase_writepoints(c)) { -+ mutex_unlock(&c->write_points_hash_lock); -+ mutex_unlock(&oldest->lock); -+ goto restart_find_oldest; -+ } -+ -+ wp = __writepoint_find(head, write_point); -+ if (wp && wp != oldest) { -+ mutex_unlock(&c->write_points_hash_lock); -+ mutex_unlock(&oldest->lock); -+ goto lock_wp; -+ } -+ -+ wp = oldest; -+ hlist_del_rcu(&wp->node); -+ wp->write_point = write_point; -+ 
hlist_add_head_rcu(&wp->node, head); -+ mutex_unlock(&c->write_points_hash_lock); -+out: -+ wp->last_used = sched_clock(); -+ return wp; -+} -+ -+/* -+ * Get us an open_bucket we can allocate from, return with it locked: -+ */ -+struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, -+ unsigned target, -+ unsigned erasure_code, -+ struct write_point_specifier write_point, -+ struct bch_devs_list *devs_have, -+ unsigned nr_replicas, -+ unsigned nr_replicas_required, -+ enum alloc_reserve reserve, -+ unsigned flags, -+ struct closure *cl) -+{ -+ struct write_point *wp; -+ struct open_bucket *ob; -+ struct open_buckets ptrs; -+ unsigned nr_effective, write_points_nr; -+ unsigned ob_flags = 0; -+ bool have_cache; -+ enum bucket_alloc_ret ret; -+ int i; -+ -+ if (!(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) -+ ob_flags |= BUCKET_ALLOC_USE_DURABILITY; -+ -+ BUG_ON(!nr_replicas || !nr_replicas_required); -+retry: -+ ptrs.nr = 0; -+ nr_effective = 0; -+ write_points_nr = c->write_points_nr; -+ have_cache = false; -+ -+ wp = writepoint_find(c, write_point.v); -+ -+ if (wp->type == BCH_DATA_user) -+ ob_flags |= BUCKET_MAY_ALLOC_PARTIAL; -+ -+ /* metadata may not allocate on cache devices: */ -+ if (wp->type != BCH_DATA_user) -+ have_cache = true; -+ -+ if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { -+ ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, -+ target, erasure_code, -+ nr_replicas, &nr_effective, -+ &have_cache, reserve, -+ ob_flags, cl); -+ } else { -+ ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, -+ target, erasure_code, -+ nr_replicas, &nr_effective, -+ &have_cache, reserve, -+ ob_flags, NULL); -+ if (!ret) -+ goto alloc_done; -+ -+ ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, -+ 0, erasure_code, -+ nr_replicas, &nr_effective, -+ &have_cache, reserve, -+ ob_flags, cl); -+ } -+alloc_done: -+ BUG_ON(!ret && nr_effective < nr_replicas); -+ -+ if (erasure_code && !ec_open_bucket(c, &ptrs)) -+ pr_debug("failed to get ec bucket: 
ret %u", ret); -+ -+ if (ret == INSUFFICIENT_DEVICES && -+ nr_effective >= nr_replicas_required) -+ ret = 0; -+ -+ if (ret) -+ goto err; -+ -+ /* Free buckets we didn't use: */ -+ open_bucket_for_each(c, &wp->ptrs, ob, i) -+ open_bucket_free_unused(c, wp, ob); -+ -+ wp->ptrs = ptrs; -+ -+ wp->sectors_free = UINT_MAX; -+ -+ open_bucket_for_each(c, &wp->ptrs, ob, i) -+ wp->sectors_free = min(wp->sectors_free, ob->sectors_free); -+ -+ BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); -+ -+ verify_not_stale(c, &wp->ptrs); -+ -+ return wp; -+err: -+ open_bucket_for_each(c, &wp->ptrs, ob, i) -+ if (ptrs.nr < ARRAY_SIZE(ptrs.v)) -+ ob_push(c, &ptrs, ob); -+ else -+ open_bucket_free_unused(c, wp, ob); -+ wp->ptrs = ptrs; -+ -+ mutex_unlock(&wp->lock); -+ -+ if (ret == FREELIST_EMPTY && -+ try_decrease_writepoints(c, write_points_nr)) -+ goto retry; -+ -+ switch (ret) { -+ case OPEN_BUCKETS_EMPTY: -+ case FREELIST_EMPTY: -+ return cl ? ERR_PTR(-EAGAIN) : ERR_PTR(-ENOSPC); -+ case INSUFFICIENT_DEVICES: -+ return ERR_PTR(-EROFS); -+ default: -+ BUG(); -+ } -+} -+ -+/* -+ * Append pointers to the space we just allocated to @k, and mark @sectors space -+ * as allocated out of @ob -+ */ -+void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, -+ struct bkey_i *k, unsigned sectors) -+ -+{ -+ struct open_bucket *ob; -+ unsigned i; -+ -+ BUG_ON(sectors > wp->sectors_free); -+ wp->sectors_free -= sectors; -+ -+ open_bucket_for_each(c, &wp->ptrs, ob, i) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); -+ struct bch_extent_ptr tmp = ob->ptr; -+ -+ tmp.cached = !ca->mi.durability && -+ wp->type == BCH_DATA_user; -+ -+ tmp.offset += ca->mi.bucket_size - ob->sectors_free; -+ bch2_bkey_append_ptr(k, tmp); -+ -+ BUG_ON(sectors > ob->sectors_free); -+ ob->sectors_free -= sectors; -+ } -+} -+ -+/* -+ * Append pointers to the space we just allocated to @k, and mark @sectors space -+ * as allocated out of @ob -+ */ -+void 
bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp) -+{ -+ struct open_buckets ptrs = { .nr = 0 }, keep = { .nr = 0 }; -+ struct open_bucket *ob; -+ unsigned i; -+ -+ open_bucket_for_each(c, &wp->ptrs, ob, i) -+ ob_push(c, !ob->sectors_free ? &ptrs : &keep, ob); -+ wp->ptrs = keep; -+ -+ mutex_unlock(&wp->lock); -+ -+ bch2_open_buckets_put(c, &ptrs); -+} -+ -+static inline void writepoint_init(struct write_point *wp, -+ enum bch_data_type type) -+{ -+ mutex_init(&wp->lock); -+ wp->type = type; -+} -+ -+void bch2_fs_allocator_foreground_init(struct bch_fs *c) -+{ -+ struct open_bucket *ob; -+ struct write_point *wp; -+ -+ mutex_init(&c->write_points_hash_lock); -+ c->write_points_nr = ARRAY_SIZE(c->write_points); -+ -+ /* open bucket 0 is a sentinal NULL: */ -+ spin_lock_init(&c->open_buckets[0].lock); -+ -+ for (ob = c->open_buckets + 1; -+ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { -+ spin_lock_init(&ob->lock); -+ c->open_buckets_nr_free++; -+ -+ ob->freelist = c->open_buckets_freelist; -+ c->open_buckets_freelist = ob - c->open_buckets; -+ } -+ -+ writepoint_init(&c->btree_write_point, BCH_DATA_btree); -+ writepoint_init(&c->rebalance_write_point, BCH_DATA_user); -+ writepoint_init(&c->copygc_write_point, BCH_DATA_user); -+ -+ for (wp = c->write_points; -+ wp < c->write_points + c->write_points_nr; wp++) { -+ writepoint_init(wp, BCH_DATA_user); -+ -+ wp->last_used = sched_clock(); -+ wp->write_point = (unsigned long) wp; -+ hlist_add_head_rcu(&wp->node, -+ writepoint_hash(c, wp->write_point)); -+ } -+} -diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h -new file mode 100644 -index 000000000000..c658295cb8e0 ---- /dev/null -+++ b/fs/bcachefs/alloc_foreground.h -@@ -0,0 +1,138 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_ALLOC_FOREGROUND_H -+#define _BCACHEFS_ALLOC_FOREGROUND_H -+ -+#include "bcachefs.h" -+#include "alloc_types.h" -+ -+#include -+ -+struct bkey; -+struct bch_dev; -+struct 
bch_fs; -+struct bch_devs_List; -+ -+enum bucket_alloc_ret { -+ ALLOC_SUCCESS, -+ OPEN_BUCKETS_EMPTY, -+ FREELIST_EMPTY, /* Allocator thread not keeping up */ -+ INSUFFICIENT_DEVICES, -+}; -+ -+struct dev_alloc_list { -+ unsigned nr; -+ u8 devs[BCH_SB_MEMBERS_MAX]; -+}; -+ -+struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, -+ struct dev_stripe_state *, -+ struct bch_devs_mask *); -+void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *); -+ -+long bch2_bucket_alloc_new_fs(struct bch_dev *); -+ -+struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, -+ enum alloc_reserve, bool, -+ struct closure *); -+ -+static inline void ob_push(struct bch_fs *c, struct open_buckets *obs, -+ struct open_bucket *ob) -+{ -+ BUG_ON(obs->nr >= ARRAY_SIZE(obs->v)); -+ -+ obs->v[obs->nr++] = ob - c->open_buckets; -+} -+ -+#define open_bucket_for_each(_c, _obs, _ob, _i) \ -+ for ((_i) = 0; \ -+ (_i) < (_obs)->nr && \ -+ ((_ob) = (_c)->open_buckets + (_obs)->v[_i], true); \ -+ (_i)++) -+ -+static inline struct open_bucket *ec_open_bucket(struct bch_fs *c, -+ struct open_buckets *obs) -+{ -+ struct open_bucket *ob; -+ unsigned i; -+ -+ open_bucket_for_each(c, obs, ob, i) -+ if (ob->ec) -+ return ob; -+ -+ return NULL; -+} -+ -+void bch2_open_bucket_write_error(struct bch_fs *, -+ struct open_buckets *, unsigned); -+ -+void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *); -+ -+static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) -+{ -+ if (atomic_dec_and_test(&ob->pin)) -+ __bch2_open_bucket_put(c, ob); -+} -+ -+static inline void bch2_open_buckets_put(struct bch_fs *c, -+ struct open_buckets *ptrs) -+{ -+ struct open_bucket *ob; -+ unsigned i; -+ -+ open_bucket_for_each(c, ptrs, ob, i) -+ bch2_open_bucket_put(c, ob); -+ ptrs->nr = 0; -+} -+ -+static inline void bch2_open_bucket_get(struct bch_fs *c, -+ struct write_point *wp, -+ struct open_buckets *ptrs) -+{ -+ struct open_bucket *ob; -+ unsigned 
i; -+ -+ open_bucket_for_each(c, &wp->ptrs, ob, i) { -+ ob->type = wp->type; -+ atomic_inc(&ob->pin); -+ ob_push(c, ptrs, ob); -+ } -+} -+ -+enum bucket_alloc_ret -+bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *, -+ struct dev_stripe_state *, struct bch_devs_mask *, -+ unsigned, unsigned *, bool *, enum alloc_reserve, -+ unsigned, struct closure *); -+ -+struct write_point *bch2_alloc_sectors_start(struct bch_fs *, -+ unsigned, unsigned, -+ struct write_point_specifier, -+ struct bch_devs_list *, -+ unsigned, unsigned, -+ enum alloc_reserve, -+ unsigned, -+ struct closure *); -+ -+void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, -+ struct bkey_i *, unsigned); -+void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); -+ -+void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *, -+ struct open_buckets *); -+ -+void bch2_writepoint_stop(struct bch_fs *, struct bch_dev *, -+ struct write_point *); -+ -+static inline struct write_point_specifier writepoint_hashed(unsigned long v) -+{ -+ return (struct write_point_specifier) { .v = v | 1 }; -+} -+ -+static inline struct write_point_specifier writepoint_ptr(struct write_point *wp) -+{ -+ return (struct write_point_specifier) { .v = (unsigned long) wp }; -+} -+ -+void bch2_fs_allocator_foreground_init(struct bch_fs *); -+ -+#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ -diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h -new file mode 100644 -index 000000000000..20705460bb0a ---- /dev/null -+++ b/fs/bcachefs/alloc_types.h -@@ -0,0 +1,113 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_ALLOC_TYPES_H -+#define _BCACHEFS_ALLOC_TYPES_H -+ -+#include -+#include -+ -+#include "clock_types.h" -+#include "fifo.h" -+ -+struct ec_bucket_buf; -+ -+/* There's two of these clocks, one for reads and one for writes: */ -+struct bucket_clock { -+ /* -+ * "now" in (read/write) IO time - incremented whenever we do X amount -+ * of reads or writes. 
-+ * -+ * Goes with the bucket read/write prios: when we read or write to a -+ * bucket we reset the bucket's prio to the current hand; thus hand - -+ * prio = time since bucket was last read/written. -+ * -+ * The units are some amount (bytes/sectors) of data read/written, and -+ * the units can change on the fly if we need to rescale to fit -+ * everything in a u16 - your only guarantee is that the units are -+ * consistent. -+ */ -+ u16 hand; -+ u16 max_last_io; -+ -+ int rw; -+ -+ struct io_timer rescale; -+ struct mutex lock; -+}; -+ -+/* There is one reserve for each type of btree, one for prios and gens -+ * and one for moving GC */ -+enum alloc_reserve { -+ RESERVE_ALLOC = -1, -+ RESERVE_BTREE = 0, -+ RESERVE_MOVINGGC = 1, -+ RESERVE_NONE = 2, -+ RESERVE_NR = 3, -+}; -+ -+typedef FIFO(long) alloc_fifo; -+ -+#define OPEN_BUCKETS_COUNT 1024 -+ -+#define WRITE_POINT_HASH_NR 32 -+#define WRITE_POINT_MAX 32 -+ -+typedef u16 open_bucket_idx_t; -+ -+struct open_bucket { -+ spinlock_t lock; -+ atomic_t pin; -+ open_bucket_idx_t freelist; -+ -+ /* -+ * When an open bucket has an ec_stripe attached, this is the index of -+ * the block in the stripe this open_bucket corresponds to: -+ */ -+ u8 ec_idx; -+ u8 type; -+ unsigned valid:1; -+ unsigned on_partial_list:1; -+ int alloc_reserve:3; -+ unsigned sectors_free; -+ struct bch_extent_ptr ptr; -+ struct ec_stripe_new *ec; -+}; -+ -+#define OPEN_BUCKET_LIST_MAX 15 -+ -+struct open_buckets { -+ open_bucket_idx_t nr; -+ open_bucket_idx_t v[OPEN_BUCKET_LIST_MAX]; -+}; -+ -+struct dev_stripe_state { -+ u64 next_alloc[BCH_SB_MEMBERS_MAX]; -+}; -+ -+struct write_point { -+ struct hlist_node node; -+ struct mutex lock; -+ u64 last_used; -+ unsigned long write_point; -+ enum bch_data_type type; -+ bool is_ec; -+ -+ /* calculated based on how many pointers we're actually going to use: */ -+ unsigned sectors_free; -+ -+ struct open_buckets ptrs; -+ struct dev_stripe_state stripe; -+}; -+ -+struct write_point_specifier { -+ 
unsigned long v; -+}; -+ -+struct alloc_heap_entry { -+ size_t bucket; -+ size_t nr; -+ unsigned long key; -+}; -+ -+typedef HEAP(struct alloc_heap_entry) alloc_heap; -+ -+#endif /* _BCACHEFS_ALLOC_TYPES_H */ -diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h -new file mode 100644 -index 000000000000..3a5a00e53cbf ---- /dev/null -+++ b/fs/bcachefs/bcachefs.h -@@ -0,0 +1,883 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_H -+#define _BCACHEFS_H -+ -+/* -+ * SOME HIGH LEVEL CODE DOCUMENTATION: -+ * -+ * Bcache mostly works with cache sets, cache devices, and backing devices. -+ * -+ * Support for multiple cache devices hasn't quite been finished off yet, but -+ * it's about 95% plumbed through. A cache set and its cache devices is sort of -+ * like a md raid array and its component devices. Most of the code doesn't care -+ * about individual cache devices, the main abstraction is the cache set. -+ * -+ * Multiple cache devices is intended to give us the ability to mirror dirty -+ * cached data and metadata, without mirroring clean cached data. -+ * -+ * Backing devices are different, in that they have a lifetime independent of a -+ * cache set. When you register a newly formatted backing device it'll come up -+ * in passthrough mode, and then you can attach and detach a backing device from -+ * a cache set at runtime - while it's mounted and in use. Detaching implicitly -+ * invalidates any cached data for that backing device. -+ * -+ * A cache set can have multiple (many) backing devices attached to it. -+ * -+ * There's also flash only volumes - this is the reason for the distinction -+ * between struct cached_dev and struct bcache_device. A flash only volume -+ * works much like a bcache device that has a backing device, except the -+ * "cached" data is always dirty. The end result is that we get thin -+ * provisioning with very little additional code. 
-+ * -+ * Flash only volumes work but they're not production ready because the moving -+ * garbage collector needs more work. More on that later. -+ * -+ * BUCKETS/ALLOCATION: -+ * -+ * Bcache is primarily designed for caching, which means that in normal -+ * operation all of our available space will be allocated. Thus, we need an -+ * efficient way of deleting things from the cache so we can write new things to -+ * it. -+ * -+ * To do this, we first divide the cache device up into buckets. A bucket is the -+ * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+ -+ * works efficiently. -+ * -+ * Each bucket has a 16 bit priority, and an 8 bit generation associated with -+ * it. The gens and priorities for all the buckets are stored contiguously and -+ * packed on disk (in a linked list of buckets - aside from the superblock, all -+ * of bcache's metadata is stored in buckets). -+ * -+ * The priority is used to implement an LRU. We reset a bucket's priority when -+ * we allocate it or on cache it, and every so often we decrement the priority -+ * of each bucket. It could be used to implement something more sophisticated, -+ * if anyone ever gets around to it. -+ * -+ * The generation is used for invalidating buckets. Each pointer also has an 8 -+ * bit generation embedded in it; for a pointer to be considered valid, its gen -+ * must match the gen of the bucket it points into. Thus, to reuse a bucket all -+ * we have to do is increment its gen (and write its new gen to disk; we batch -+ * this up). -+ * -+ * Bcache is entirely COW - we never write twice to a bucket, even buckets that -+ * contain metadata (including btree nodes). -+ * -+ * THE BTREE: -+ * -+ * Bcache is in large part design around the btree. -+ * -+ * At a high level, the btree is just an index of key -> ptr tuples. -+ * -+ * Keys represent extents, and thus have a size field. 
Keys also have a variable -+ * number of pointers attached to them (potentially zero, which is handy for -+ * invalidating the cache). -+ * -+ * The key itself is an inode:offset pair. The inode number corresponds to a -+ * backing device or a flash only volume. The offset is the ending offset of the -+ * extent within the inode - not the starting offset; this makes lookups -+ * slightly more convenient. -+ * -+ * Pointers contain the cache device id, the offset on that device, and an 8 bit -+ * generation number. More on the gen later. -+ * -+ * Index lookups are not fully abstracted - cache lookups in particular are -+ * still somewhat mixed in with the btree code, but things are headed in that -+ * direction. -+ * -+ * Updates are fairly well abstracted, though. There are two different ways of -+ * updating the btree; insert and replace. -+ * -+ * BTREE_INSERT will just take a list of keys and insert them into the btree - -+ * overwriting (possibly only partially) any extents they overlap with. This is -+ * used to update the index after a write. -+ * -+ * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is -+ * overwriting a key that matches another given key. This is used for inserting -+ * data into the cache after a cache miss, and for background writeback, and for -+ * the moving garbage collector. -+ * -+ * There is no "delete" operation; deleting things from the index is -+ * accomplished by either by invalidating pointers (by incrementing a bucket's -+ * gen) or by inserting a key with 0 pointers - which will overwrite anything -+ * previously present at that location in the index. -+ * -+ * This means that there are always stale/invalid keys in the btree. They're -+ * filtered out by the code that iterates through a btree node, and removed when -+ * a btree node is rewritten. 
-+ * -+ * BTREE NODES: -+ * -+ * Our unit of allocation is a bucket, and we we can't arbitrarily allocate and -+ * free smaller than a bucket - so, that's how big our btree nodes are. -+ * -+ * (If buckets are really big we'll only use part of the bucket for a btree node -+ * - no less than 1/4th - but a bucket still contains no more than a single -+ * btree node. I'd actually like to change this, but for now we rely on the -+ * bucket's gen for deleting btree nodes when we rewrite/split a node.) -+ * -+ * Anyways, btree nodes are big - big enough to be inefficient with a textbook -+ * btree implementation. -+ * -+ * The way this is solved is that btree nodes are internally log structured; we -+ * can append new keys to an existing btree node without rewriting it. This -+ * means each set of keys we write is sorted, but the node is not. -+ * -+ * We maintain this log structure in memory - keeping 1Mb of keys sorted would -+ * be expensive, and we have to distinguish between the keys we have written and -+ * the keys we haven't. So to do a lookup in a btree node, we have to search -+ * each sorted set. But we do merge written sets together lazily, so the cost of -+ * these extra searches is quite low (normally most of the keys in a btree node -+ * will be in one big set, and then there'll be one or two sets that are much -+ * smaller). -+ * -+ * This log structure makes bcache's btree more of a hybrid between a -+ * conventional btree and a compacting data structure, with some of the -+ * advantages of both. -+ * -+ * GARBAGE COLLECTION: -+ * -+ * We can't just invalidate any bucket - it might contain dirty data or -+ * metadata. If it once contained dirty data, other writes might overwrite it -+ * later, leaving no valid pointers into that bucket in the index. -+ * -+ * Thus, the primary purpose of garbage collection is to find buckets to reuse. 
-+ * It also counts how much valid data it each bucket currently contains, so that -+ * allocation can reuse buckets sooner when they've been mostly overwritten. -+ * -+ * It also does some things that are really internal to the btree -+ * implementation. If a btree node contains pointers that are stale by more than -+ * some threshold, it rewrites the btree node to avoid the bucket's generation -+ * wrapping around. It also merges adjacent btree nodes if they're empty enough. -+ * -+ * THE JOURNAL: -+ * -+ * Bcache's journal is not necessary for consistency; we always strictly -+ * order metadata writes so that the btree and everything else is consistent on -+ * disk in the event of an unclean shutdown, and in fact bcache had writeback -+ * caching (with recovery from unclean shutdown) before journalling was -+ * implemented. -+ * -+ * Rather, the journal is purely a performance optimization; we can't complete a -+ * write until we've updated the index on disk, otherwise the cache would be -+ * inconsistent in the event of an unclean shutdown. This means that without the -+ * journal, on random write workloads we constantly have to update all the leaf -+ * nodes in the btree, and those writes will be mostly empty (appending at most -+ * a few keys each) - highly inefficient in terms of amount of metadata writes, -+ * and it puts more strain on the various btree resorting/compacting code. -+ * -+ * The journal is just a log of keys we've inserted; on startup we just reinsert -+ * all the keys in the open journal entries. That means that when we're updating -+ * a node in the btree, we can wait until a 4k block of keys fills up before -+ * writing them out. 
-+ * -+ * For simplicity, we only journal updates to leaf nodes; updates to parent -+ * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth -+ * the complexity to deal with journalling them (in particular, journal replay) -+ * - updates to non leaf nodes just happen synchronously (see btree_split()). -+ */ -+ -+#undef pr_fmt -+#define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include "bcachefs_format.h" -+#include "fifo.h" -+#include "opts.h" -+#include "util.h" -+ -+#define dynamic_fault(...) 0 -+#define race_fault(...) 0 -+ -+#define bch2_fs_init_fault(name) \ -+ dynamic_fault("bcachefs:bch_fs_init:" name) -+#define bch2_meta_read_fault(name) \ -+ dynamic_fault("bcachefs:meta:read:" name) -+#define bch2_meta_write_fault(name) \ -+ dynamic_fault("bcachefs:meta:write:" name) -+ -+#ifdef __KERNEL__ -+#define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name) -+#else -+#define bch2_fmt(_c, fmt) fmt "\n" -+#endif -+ -+#define bch_info(c, fmt, ...) \ -+ printk(KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) -+#define bch_notice(c, fmt, ...) \ -+ printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__) -+#define bch_warn(c, fmt, ...) \ -+ printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) -+#define bch_warn_ratelimited(c, fmt, ...) \ -+ printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) -+#define bch_err(c, fmt, ...) \ -+ printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) -+#define bch_err_ratelimited(c, fmt, ...) \ -+ printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) -+ -+#define bch_verbose(c, fmt, ...) \ -+do { \ -+ if ((c)->opts.verbose) \ -+ bch_info(c, fmt, ##__VA_ARGS__); \ -+} while (0) -+ -+#define pr_verbose_init(opts, fmt, ...) 
\ -+do { \ -+ if (opt_get(opts, verbose)) \ -+ pr_info(fmt, ##__VA_ARGS__); \ -+} while (0) -+ -+/* Parameters that are useful for debugging, but should always be compiled in: */ -+#define BCH_DEBUG_PARAMS_ALWAYS() \ -+ BCH_DEBUG_PARAM(key_merging_disabled, \ -+ "Disables merging of extents") \ -+ BCH_DEBUG_PARAM(btree_gc_always_rewrite, \ -+ "Causes mark and sweep to compact and rewrite every " \ -+ "btree node it traverses") \ -+ BCH_DEBUG_PARAM(btree_gc_rewrite_disabled, \ -+ "Disables rewriting of btree nodes during mark and sweep")\ -+ BCH_DEBUG_PARAM(btree_shrinker_disabled, \ -+ "Disables the shrinker callback for the btree node cache") -+ -+/* Parameters that should only be compiled in in debug mode: */ -+#define BCH_DEBUG_PARAMS_DEBUG() \ -+ BCH_DEBUG_PARAM(expensive_debug_checks, \ -+ "Enables various runtime debugging checks that " \ -+ "significantly affect performance") \ -+ BCH_DEBUG_PARAM(debug_check_iterators, \ -+ "Enables extra verification for btree iterators") \ -+ BCH_DEBUG_PARAM(debug_check_bkeys, \ -+ "Run bkey_debugcheck (primarily checking GC/allocation "\ -+ "information) when iterating over keys") \ -+ BCH_DEBUG_PARAM(verify_btree_ondisk, \ -+ "Reread btree nodes at various points to verify the " \ -+ "mergesort in the read path against modifications " \ -+ "done in memory") \ -+ BCH_DEBUG_PARAM(journal_seq_verify, \ -+ "Store the journal sequence number in the version " \ -+ "number of every btree key, and verify that btree " \ -+ "update ordering is preserved during recovery") \ -+ BCH_DEBUG_PARAM(inject_invalid_keys, \ -+ "Store the journal sequence number in the version " \ -+ "number of every btree key, and verify that btree " \ -+ "update ordering is preserved during recovery") \ -+ BCH_DEBUG_PARAM(test_alloc_startup, \ -+ "Force allocator startup to use the slowpath where it" \ -+ "can't find enough free buckets without invalidating" \ -+ "cached data") \ -+ BCH_DEBUG_PARAM(force_reconstruct_read, \ -+ "Force reads to use the 
reconstruct path, when reading" \ -+ "from erasure coded extents") \ -+ BCH_DEBUG_PARAM(test_restart_gc, \ -+ "Test restarting mark and sweep gc when bucket gens change") -+ -+#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG() -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALL() -+#else -+#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS() -+#endif -+ -+#define BCH_TIME_STATS() \ -+ x(btree_node_mem_alloc) \ -+ x(btree_node_split) \ -+ x(btree_node_sort) \ -+ x(btree_node_read) \ -+ x(btree_gc) \ -+ x(btree_lock_contended_read) \ -+ x(btree_lock_contended_intent) \ -+ x(btree_lock_contended_write) \ -+ x(data_write) \ -+ x(data_read) \ -+ x(data_promote) \ -+ x(journal_write) \ -+ x(journal_delay) \ -+ x(journal_flush_seq) \ -+ x(blocked_journal) \ -+ x(blocked_allocate) \ -+ x(blocked_allocate_open_bucket) -+ -+enum bch_time_stats { -+#define x(name) BCH_TIME_##name, -+ BCH_TIME_STATS() -+#undef x -+ BCH_TIME_STAT_NR -+}; -+ -+#include "alloc_types.h" -+#include "btree_types.h" -+#include "buckets_types.h" -+#include "clock_types.h" -+#include "ec_types.h" -+#include "journal_types.h" -+#include "keylist_types.h" -+#include "quota_types.h" -+#include "rebalance_types.h" -+#include "replicas_types.h" -+#include "super_types.h" -+ -+/* Number of nodes btree coalesce will try to coalesce at once */ -+#define GC_MERGE_NODES 4U -+ -+/* Maximum number of nodes we might need to allocate atomically: */ -+#define BTREE_RESERVE_MAX (BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1)) -+ -+/* Size of the freelist we allocate btree nodes from: */ -+#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 4) -+ -+#define BTREE_NODE_OPEN_BUCKET_RESERVE (BTREE_RESERVE_MAX * BCH_REPLICAS_MAX) -+ -+struct btree; -+ -+enum gc_phase { -+ GC_PHASE_NOT_RUNNING, -+ GC_PHASE_START, -+ GC_PHASE_SB, -+ -+ GC_PHASE_BTREE_EC, -+ GC_PHASE_BTREE_EXTENTS, -+ GC_PHASE_BTREE_INODES, -+ GC_PHASE_BTREE_DIRENTS, -+ GC_PHASE_BTREE_XATTRS, -+ 
GC_PHASE_BTREE_ALLOC, -+ GC_PHASE_BTREE_QUOTAS, -+ GC_PHASE_BTREE_REFLINK, -+ -+ GC_PHASE_PENDING_DELETE, -+ GC_PHASE_ALLOC, -+}; -+ -+struct gc_pos { -+ enum gc_phase phase; -+ struct bpos pos; -+ unsigned level; -+}; -+ -+struct io_count { -+ u64 sectors[2][BCH_DATA_NR]; -+}; -+ -+struct bch_dev { -+ struct kobject kobj; -+ struct percpu_ref ref; -+ struct completion ref_completion; -+ struct percpu_ref io_ref; -+ struct completion io_ref_completion; -+ -+ struct bch_fs *fs; -+ -+ u8 dev_idx; -+ /* -+ * Cached version of this device's member info from superblock -+ * Committed by bch2_write_super() -> bch_fs_mi_update() -+ */ -+ struct bch_member_cpu mi; -+ uuid_le uuid; -+ char name[BDEVNAME_SIZE]; -+ -+ struct bch_sb_handle disk_sb; -+ struct bch_sb *sb_read_scratch; -+ int sb_write_error; -+ -+ struct bch_devs_mask self; -+ -+ /* biosets used in cloned bios for writing multiple replicas */ -+ struct bio_set replica_set; -+ -+ /* -+ * Buckets: -+ * Per-bucket arrays are protected by c->mark_lock, bucket_lock and -+ * gc_lock, for device resize - holding any is sufficient for access: -+ * Or rcu_read_lock(), but only for ptr_stale(): -+ */ -+ struct bucket_array __rcu *buckets[2]; -+ unsigned long *buckets_nouse; -+ struct rw_semaphore bucket_lock; -+ -+ struct bch_dev_usage __percpu *usage[2]; -+ -+ /* Allocator: */ -+ struct task_struct __rcu *alloc_thread; -+ -+ /* -+ * free: Buckets that are ready to be used -+ * -+ * free_inc: Incoming buckets - these are buckets that currently have -+ * cached data in them, and we can't reuse them until after we write -+ * their new gen to disk. 
After prio_write() finishes writing the new -+ * gens/prios, they'll be moved to the free list (and possibly discarded -+ * in the process) -+ */ -+ alloc_fifo free[RESERVE_NR]; -+ alloc_fifo free_inc; -+ -+ open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT]; -+ open_bucket_idx_t open_buckets_partial_nr; -+ -+ size_t fifo_last_bucket; -+ -+ /* last calculated minimum prio */ -+ u16 max_last_bucket_io[2]; -+ -+ size_t inc_gen_needs_gc; -+ size_t inc_gen_really_needs_gc; -+ -+ /* -+ * XXX: this should be an enum for allocator state, so as to include -+ * error state -+ */ -+ enum { -+ ALLOCATOR_STOPPED, -+ ALLOCATOR_RUNNING, -+ ALLOCATOR_BLOCKED, -+ ALLOCATOR_BLOCKED_FULL, -+ } allocator_state; -+ -+ alloc_heap alloc_heap; -+ -+ atomic64_t rebalance_work; -+ -+ struct journal_device journal; -+ -+ struct work_struct io_error_work; -+ -+ /* The rest of this all shows up in sysfs */ -+ atomic64_t cur_latency[2]; -+ struct time_stats io_latency[2]; -+ -+#define CONGESTED_MAX 1024 -+ atomic_t congested; -+ u64 congested_last; -+ -+ struct io_count __percpu *io_done; -+}; -+ -+enum { -+ /* startup: */ -+ BCH_FS_ALLOC_READ_DONE, -+ BCH_FS_ALLOC_CLEAN, -+ BCH_FS_ALLOCATOR_RUNNING, -+ BCH_FS_ALLOCATOR_STOPPING, -+ BCH_FS_INITIAL_GC_DONE, -+ BCH_FS_BTREE_INTERIOR_REPLAY_DONE, -+ BCH_FS_FSCK_DONE, -+ BCH_FS_STARTED, -+ BCH_FS_RW, -+ -+ /* shutdown: */ -+ BCH_FS_STOPPING, -+ BCH_FS_EMERGENCY_RO, -+ BCH_FS_WRITE_DISABLE_COMPLETE, -+ -+ /* errors: */ -+ BCH_FS_ERROR, -+ BCH_FS_ERRORS_FIXED, -+ -+ /* misc: */ -+ BCH_FS_BDEV_MOUNTED, -+ BCH_FS_FIXED_GENS, -+ BCH_FS_ALLOC_WRITTEN, -+ BCH_FS_REBUILD_REPLICAS, -+ BCH_FS_HOLD_BTREE_WRITES, -+}; -+ -+struct btree_debug { -+ unsigned id; -+ struct dentry *btree; -+ struct dentry *btree_format; -+ struct dentry *failed; -+}; -+ -+struct bch_fs_pcpu { -+ u64 sectors_available; -+}; -+ -+struct journal_seq_blacklist_table { -+ size_t nr; -+ struct journal_seq_blacklist_table_entry { -+ u64 start; -+ u64 end; -+ bool dirty; -+ } 
entries[0]; -+}; -+ -+struct journal_keys { -+ struct journal_key { -+ enum btree_id btree_id:8; -+ unsigned level:8; -+ struct bkey_i *k; -+ u32 journal_seq; -+ u32 journal_offset; -+ } *d; -+ size_t nr; -+ u64 journal_seq_base; -+}; -+ -+struct bch_fs { -+ struct closure cl; -+ -+ struct list_head list; -+ struct kobject kobj; -+ struct kobject internal; -+ struct kobject opts_dir; -+ struct kobject time_stats; -+ unsigned long flags; -+ -+ int minor; -+ struct device *chardev; -+ struct super_block *vfs_sb; -+ char name[40]; -+ -+ /* ro/rw, add/remove/resize devices: */ -+ struct rw_semaphore state_lock; -+ -+ /* Counts outstanding writes, for clean transition to read-only */ -+ struct percpu_ref writes; -+ struct work_struct read_only_work; -+ -+ struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX]; -+ -+ struct bch_replicas_cpu replicas; -+ struct bch_replicas_cpu replicas_gc; -+ struct mutex replicas_gc_lock; -+ -+ struct journal_entry_res replicas_journal_res; -+ -+ struct bch_disk_groups_cpu __rcu *disk_groups; -+ -+ struct bch_opts opts; -+ -+ /* Updated by bch2_sb_update():*/ -+ struct { -+ uuid_le uuid; -+ uuid_le user_uuid; -+ -+ u16 version; -+ u16 encoded_extent_max; -+ -+ u8 nr_devices; -+ u8 clean; -+ -+ u8 encryption_type; -+ -+ u64 time_base_lo; -+ u32 time_base_hi; -+ u32 time_precision; -+ u64 features; -+ u64 compat; -+ } sb; -+ -+ struct bch_sb_handle disk_sb; -+ -+ unsigned short block_bits; /* ilog2(block_size) */ -+ -+ u16 btree_foreground_merge_threshold; -+ -+ struct closure sb_write; -+ struct mutex sb_lock; -+ -+ /* BTREE CACHE */ -+ struct bio_set btree_bio; -+ -+ struct btree_root btree_roots[BTREE_ID_NR]; -+ struct mutex btree_root_lock; -+ -+ struct btree_cache btree_cache; -+ -+ /* -+ * Cache of allocated btree nodes - if we allocate a btree node and -+ * don't use it, if we free it that space can't be reused until going -+ * _all_ the way through the allocator (which exposes us to a livelock -+ * when allocating btree reserves fail 
halfway through) - instead, we -+ * can stick them here: -+ */ -+ struct btree_alloc btree_reserve_cache[BTREE_NODE_RESERVE * 2]; -+ unsigned btree_reserve_cache_nr; -+ struct mutex btree_reserve_cache_lock; -+ -+ mempool_t btree_interior_update_pool; -+ struct list_head btree_interior_update_list; -+ struct list_head btree_interior_updates_unwritten; -+ struct mutex btree_interior_update_lock; -+ struct closure_waitlist btree_interior_update_wait; -+ -+ struct workqueue_struct *btree_interior_update_worker; -+ struct work_struct btree_interior_update_work; -+ -+ /* btree_iter.c: */ -+ struct mutex btree_trans_lock; -+ struct list_head btree_trans_list; -+ mempool_t btree_iters_pool; -+ -+ struct btree_key_cache btree_key_cache; -+ -+ struct workqueue_struct *wq; -+ /* copygc needs its own workqueue for index updates.. */ -+ struct workqueue_struct *copygc_wq; -+ struct workqueue_struct *journal_reclaim_wq; -+ -+ /* ALLOCATION */ -+ struct delayed_work pd_controllers_update; -+ unsigned pd_controllers_update_seconds; -+ -+ struct bch_devs_mask rw_devs[BCH_DATA_NR]; -+ -+ u64 capacity; /* sectors */ -+ -+ /* -+ * When capacity _decreases_ (due to a disk being removed), we -+ * increment capacity_gen - this invalidates outstanding reservations -+ * and forces them to be revalidated -+ */ -+ u32 capacity_gen; -+ unsigned bucket_size_max; -+ -+ atomic64_t sectors_available; -+ -+ struct bch_fs_pcpu __percpu *pcpu; -+ -+ struct percpu_rw_semaphore mark_lock; -+ -+ seqcount_t usage_lock; -+ struct bch_fs_usage *usage_base; -+ struct bch_fs_usage __percpu *usage[2]; -+ struct bch_fs_usage __percpu *usage_gc; -+ -+ /* single element mempool: */ -+ struct mutex usage_scratch_lock; -+ struct bch_fs_usage *usage_scratch; -+ -+ /* -+ * When we invalidate buckets, we use both the priority and the amount -+ * of good data to determine which buckets to reuse first - to weight -+ * those together consistently we keep track of the smallest nonzero -+ * priority of any bucket. 
-+ */ -+ struct bucket_clock bucket_clock[2]; -+ -+ struct io_clock io_clock[2]; -+ -+ /* JOURNAL SEQ BLACKLIST */ -+ struct journal_seq_blacklist_table * -+ journal_seq_blacklist_table; -+ struct work_struct journal_seq_blacklist_gc_work; -+ -+ /* ALLOCATOR */ -+ spinlock_t freelist_lock; -+ struct closure_waitlist freelist_wait; -+ u64 blocked_allocate; -+ u64 blocked_allocate_open_bucket; -+ open_bucket_idx_t open_buckets_freelist; -+ open_bucket_idx_t open_buckets_nr_free; -+ struct closure_waitlist open_buckets_wait; -+ struct open_bucket open_buckets[OPEN_BUCKETS_COUNT]; -+ -+ struct write_point btree_write_point; -+ struct write_point rebalance_write_point; -+ -+ struct write_point write_points[WRITE_POINT_MAX]; -+ struct hlist_head write_points_hash[WRITE_POINT_HASH_NR]; -+ struct mutex write_points_hash_lock; -+ unsigned write_points_nr; -+ -+ /* GARBAGE COLLECTION */ -+ struct task_struct *gc_thread; -+ atomic_t kick_gc; -+ unsigned long gc_count; -+ -+ /* -+ * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos] -+ * has been marked by GC. -+ * -+ * gc_cur_phase is a superset of btree_ids (BTREE_ID_EXTENTS etc.) -+ * -+ * Protected by gc_pos_lock. Only written to by GC thread, so GC thread -+ * can read without a lock. -+ */ -+ seqcount_t gc_pos_lock; -+ struct gc_pos gc_pos; -+ -+ /* -+ * The allocation code needs gc_mark in struct bucket to be correct, but -+ * it's not while a gc is in progress. 
-+ */ -+ struct rw_semaphore gc_lock; -+ -+ /* IO PATH */ -+ struct semaphore io_in_flight; -+ struct bio_set bio_read; -+ struct bio_set bio_read_split; -+ struct bio_set bio_write; -+ struct mutex bio_bounce_pages_lock; -+ mempool_t bio_bounce_pages; -+ struct rhashtable promote_table; -+ -+ mempool_t compression_bounce[2]; -+ mempool_t compress_workspace[BCH_COMPRESSION_TYPE_NR]; -+ mempool_t decompress_workspace; -+ ZSTD_parameters zstd_params; -+ -+ struct crypto_shash *sha256; -+ struct crypto_sync_skcipher *chacha20; -+ struct crypto_shash *poly1305; -+ -+ atomic64_t key_version; -+ -+ mempool_t large_bkey_pool; -+ -+ /* REBALANCE */ -+ struct bch_fs_rebalance rebalance; -+ -+ /* COPYGC */ -+ struct task_struct *copygc_thread; -+ copygc_heap copygc_heap; -+ struct bch_pd_controller copygc_pd; -+ struct write_point copygc_write_point; -+ u64 copygc_threshold; -+ -+ /* STRIPES: */ -+ GENRADIX(struct stripe) stripes[2]; -+ -+ ec_stripes_heap ec_stripes_heap; -+ spinlock_t ec_stripes_heap_lock; -+ -+ /* ERASURE CODING */ -+ struct list_head ec_stripe_head_list; -+ struct mutex ec_stripe_head_lock; -+ -+ struct list_head ec_stripe_new_list; -+ struct mutex ec_stripe_new_lock; -+ -+ struct work_struct ec_stripe_create_work; -+ u64 ec_stripe_hint; -+ -+ struct bio_set ec_bioset; -+ -+ struct work_struct ec_stripe_delete_work; -+ struct llist_head ec_stripe_delete_list; -+ -+ /* REFLINK */ -+ u64 reflink_hint; -+ -+ /* VFS IO PATH - fs-io.c */ -+ struct bio_set writepage_bioset; -+ struct bio_set dio_write_bioset; -+ struct bio_set dio_read_bioset; -+ -+ struct bio_list btree_write_error_list; -+ struct work_struct btree_write_error_work; -+ spinlock_t btree_write_error_lock; -+ -+ /* ERRORS */ -+ struct list_head fsck_errors; -+ struct mutex fsck_error_lock; -+ bool fsck_alloc_err; -+ -+ /* QUOTAS */ -+ struct bch_memquota_type quotas[QTYP_NR]; -+ -+ /* DEBUG JUNK */ -+ struct dentry *debug; -+ struct btree_debug btree_debug[BTREE_ID_NR]; -+#ifdef 
CONFIG_BCACHEFS_DEBUG -+ struct btree *verify_data; -+ struct btree_node *verify_ondisk; -+ struct mutex verify_lock; -+#endif -+ -+ u64 unused_inode_hint; -+ -+ /* -+ * A btree node on disk could have too many bsets for an iterator to fit -+ * on the stack - have to dynamically allocate them -+ */ -+ mempool_t fill_iter; -+ -+ mempool_t btree_bounce_pool; -+ -+ struct journal journal; -+ struct list_head journal_entries; -+ struct journal_keys journal_keys; -+ -+ u64 last_bucket_seq_cleanup; -+ -+ /* The rest of this all shows up in sysfs */ -+ atomic_long_t read_realloc_races; -+ atomic_long_t extent_migrate_done; -+ atomic_long_t extent_migrate_raced; -+ -+ unsigned btree_gc_periodic:1; -+ unsigned copy_gc_enabled:1; -+ bool promote_whole_extents; -+ -+#define BCH_DEBUG_PARAM(name, description) bool name; -+ BCH_DEBUG_PARAMS_ALL() -+#undef BCH_DEBUG_PARAM -+ -+ struct time_stats times[BCH_TIME_STAT_NR]; -+}; -+ -+static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages) -+{ -+#ifndef NO_BCACHEFS_FS -+ if (c->vfs_sb) -+ c->vfs_sb->s_bdi->ra_pages = ra_pages; -+#endif -+} -+ -+static inline unsigned bucket_bytes(const struct bch_dev *ca) -+{ -+ return ca->mi.bucket_size << 9; -+} -+ -+static inline unsigned block_bytes(const struct bch_fs *c) -+{ -+ return c->opts.block_size << 9; -+} -+ -+static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, u64 time) -+{ -+ return ns_to_timespec64(time * c->sb.time_precision + c->sb.time_base_lo); -+} -+ -+static inline s64 timespec_to_bch2_time(struct bch_fs *c, struct timespec64 ts) -+{ -+ s64 ns = timespec64_to_ns(&ts) - c->sb.time_base_lo; -+ -+ if (c->sb.time_precision == 1) -+ return ns; -+ -+ return div_s64(ns, c->sb.time_precision); -+} -+ -+static inline s64 bch2_current_time(struct bch_fs *c) -+{ -+ struct timespec64 now; -+ -+ ktime_get_coarse_real_ts64(&now); -+ return timespec_to_bch2_time(c, now); -+} -+ -+static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev) 
-+{ -+ return dev < c->sb.nr_devices && c->devs[dev]; -+} -+ -+#endif /* _BCACHEFS_H */ -diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h -new file mode 100644 -index 000000000000..d5a2230e403c ---- /dev/null -+++ b/fs/bcachefs/bcachefs_format.h -@@ -0,0 +1,1671 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_FORMAT_H -+#define _BCACHEFS_FORMAT_H -+ -+/* -+ * bcachefs on disk data structures -+ * -+ * OVERVIEW: -+ * -+ * There are three main types of on disk data structures in bcachefs (this is -+ * reduced from 5 in bcache) -+ * -+ * - superblock -+ * - journal -+ * - btree -+ * -+ * The btree is the primary structure; most metadata exists as keys in the -+ * various btrees. There are only a small number of btrees, they're not -+ * sharded - we have one btree for extents, another for inodes, et cetera. -+ * -+ * SUPERBLOCK: -+ * -+ * The superblock contains the location of the journal, the list of devices in -+ * the filesystem, and in general any metadata we need in order to decide -+ * whether we can start a filesystem or prior to reading the journal/btree -+ * roots. -+ * -+ * The superblock is extensible, and most of the contents of the superblock are -+ * in variable length, type tagged fields; see struct bch_sb_field. -+ * -+ * Backup superblocks do not reside in a fixed location; also, superblocks do -+ * not have a fixed size. To locate backup superblocks we have struct -+ * bch_sb_layout; we store a copy of this inside every superblock, and also -+ * before the first superblock. -+ * -+ * JOURNAL: -+ * -+ * The journal primarily records btree updates in the order they occurred; -+ * journal replay consists of just iterating over all the keys in the open -+ * journal entries and re-inserting them into the btrees. -+ * -+ * The journal also contains entry types for the btree roots, and blacklisted -+ * journal sequence numbers (see journal_seq_blacklist.c). 
-+ * -+ * BTREE: -+ * -+ * bcachefs btrees are copy on write b+ trees, where nodes are big (typically -+ * 128k-256k) and log structured. We use struct btree_node for writing the first -+ * entry in a given node (offset 0), and struct btree_node_entry for all -+ * subsequent writes. -+ * -+ * After the header, btree node entries contain a list of keys in sorted order. -+ * Values are stored inline with the keys; since values are variable length (and -+ * keys effectively are variable length too, due to packing) we can't do random -+ * access without building up additional in memory tables in the btree node read -+ * path. -+ * -+ * BTREE KEYS (struct bkey): -+ * -+ * The various btrees share a common format for the key - so as to avoid -+ * switching in fastpath lookup/comparison code - but define their own -+ * structures for the key values. -+ * -+ * The size of a key/value pair is stored as a u8 in units of u64s, so the max -+ * size is just under 2k. The common part also contains a type tag for the -+ * value, and a format field indicating whether the key is packed or not (and -+ * also meant to allow adding new key fields in the future, if desired). -+ * -+ * bkeys, when stored within a btree node, may also be packed. In that case, the -+ * bkey_format in that node is used to unpack it. Packed bkeys mean that we can -+ * be generous with field sizes in the common part of the key format (64 bit -+ * inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost. 
-+ */ -+ -+#include -+#include -+#include -+#include -+ -+#define LE_BITMASK(_bits, name, type, field, offset, end) \ -+static const unsigned name##_OFFSET = offset; \ -+static const unsigned name##_BITS = (end - offset); \ -+static const __u##_bits name##_MAX = (1ULL << (end - offset)) - 1; \ -+ \ -+static inline __u64 name(const type *k) \ -+{ \ -+ return (__le##_bits##_to_cpu(k->field) >> offset) & \ -+ ~(~0ULL << (end - offset)); \ -+} \ -+ \ -+static inline void SET_##name(type *k, __u64 v) \ -+{ \ -+ __u##_bits new = __le##_bits##_to_cpu(k->field); \ -+ \ -+ new &= ~(~(~0ULL << (end - offset)) << offset); \ -+ new |= (v & ~(~0ULL << (end - offset))) << offset; \ -+ k->field = __cpu_to_le##_bits(new); \ -+} -+ -+#define LE16_BITMASK(n, t, f, o, e) LE_BITMASK(16, n, t, f, o, e) -+#define LE32_BITMASK(n, t, f, o, e) LE_BITMASK(32, n, t, f, o, e) -+#define LE64_BITMASK(n, t, f, o, e) LE_BITMASK(64, n, t, f, o, e) -+ -+struct bkey_format { -+ __u8 key_u64s; -+ __u8 nr_fields; -+ /* One unused slot for now: */ -+ __u8 bits_per_field[6]; -+ __le64 field_offset[6]; -+}; -+ -+/* Btree keys - all units are in sectors */ -+ -+struct bpos { -+ /* -+ * Word order matches machine byte order - btree code treats a bpos as a -+ * single large integer, for search/comparison purposes -+ * -+ * Note that wherever a bpos is embedded in another on disk data -+ * structure, it has to be byte swabbed when reading in metadata that -+ * wasn't written in native endian order: -+ */ -+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -+ __u32 snapshot; -+ __u64 offset; -+ __u64 inode; -+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -+ __u64 inode; -+ __u64 offset; /* Points to end of extent - sectors */ -+ __u32 snapshot; -+#else -+#error edit for your odd byteorder. 
-+#endif -+} __attribute__((packed, aligned(4))); -+ -+#define KEY_INODE_MAX ((__u64)~0ULL) -+#define KEY_OFFSET_MAX ((__u64)~0ULL) -+#define KEY_SNAPSHOT_MAX ((__u32)~0U) -+#define KEY_SIZE_MAX ((__u32)~0U) -+ -+static inline struct bpos POS(__u64 inode, __u64 offset) -+{ -+ struct bpos ret; -+ -+ ret.inode = inode; -+ ret.offset = offset; -+ ret.snapshot = 0; -+ -+ return ret; -+} -+ -+#define POS_MIN POS(0, 0) -+#define POS_MAX POS(KEY_INODE_MAX, KEY_OFFSET_MAX) -+ -+/* Empty placeholder struct, for container_of() */ -+struct bch_val { -+ __u64 __nothing[0]; -+}; -+ -+struct bversion { -+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -+ __u64 lo; -+ __u32 hi; -+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -+ __u32 hi; -+ __u64 lo; -+#endif -+} __attribute__((packed, aligned(4))); -+ -+struct bkey { -+ /* Size of combined key and value, in u64s */ -+ __u8 u64s; -+ -+ /* Format of key (0 for format local to btree node) */ -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u8 format:7, -+ needs_whiteout:1; -+#elif defined (__BIG_ENDIAN_BITFIELD) -+ __u8 needs_whiteout:1, -+ format:7; -+#else -+#error edit for your odd byteorder. 
-+#endif -+ -+ /* Type of the value */ -+ __u8 type; -+ -+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -+ __u8 pad[1]; -+ -+ struct bversion version; -+ __u32 size; /* extent size, in sectors */ -+ struct bpos p; -+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -+ struct bpos p; -+ __u32 size; /* extent size, in sectors */ -+ struct bversion version; -+ -+ __u8 pad[1]; -+#endif -+} __attribute__((packed, aligned(8))); -+ -+struct bkey_packed { -+ __u64 _data[0]; -+ -+ /* Size of combined key and value, in u64s */ -+ __u8 u64s; -+ -+ /* Format of key (0 for format local to btree node) */ -+ -+ /* -+ * XXX: next incompat on disk format change, switch format and -+ * needs_whiteout - bkey_packed() will be cheaper if format is the high -+ * bits of the bitfield -+ */ -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u8 format:7, -+ needs_whiteout:1; -+#elif defined (__BIG_ENDIAN_BITFIELD) -+ __u8 needs_whiteout:1, -+ format:7; -+#endif -+ -+ /* Type of the value */ -+ __u8 type; -+ __u8 key_start[0]; -+ -+ /* -+ * We copy bkeys with struct assignment in various places, and while -+ * that shouldn't be done with packed bkeys we can't disallow it in C, -+ * and it's legal to cast a bkey to a bkey_packed - so padding it out -+ * to the same size as struct bkey should hopefully be safest. 
-+ */ -+ __u8 pad[sizeof(struct bkey) - 3]; -+} __attribute__((packed, aligned(8))); -+ -+#define BKEY_U64s (sizeof(struct bkey) / sizeof(__u64)) -+#define BKEY_U64s_MAX U8_MAX -+#define BKEY_VAL_U64s_MAX (BKEY_U64s_MAX - BKEY_U64s) -+ -+#define KEY_PACKED_BITS_START 24 -+ -+#define KEY_FORMAT_LOCAL_BTREE 0 -+#define KEY_FORMAT_CURRENT 1 -+ -+enum bch_bkey_fields { -+ BKEY_FIELD_INODE, -+ BKEY_FIELD_OFFSET, -+ BKEY_FIELD_SNAPSHOT, -+ BKEY_FIELD_SIZE, -+ BKEY_FIELD_VERSION_HI, -+ BKEY_FIELD_VERSION_LO, -+ BKEY_NR_FIELDS, -+}; -+ -+#define bkey_format_field(name, field) \ -+ [BKEY_FIELD_##name] = (sizeof(((struct bkey *) NULL)->field) * 8) -+ -+#define BKEY_FORMAT_CURRENT \ -+((struct bkey_format) { \ -+ .key_u64s = BKEY_U64s, \ -+ .nr_fields = BKEY_NR_FIELDS, \ -+ .bits_per_field = { \ -+ bkey_format_field(INODE, p.inode), \ -+ bkey_format_field(OFFSET, p.offset), \ -+ bkey_format_field(SNAPSHOT, p.snapshot), \ -+ bkey_format_field(SIZE, size), \ -+ bkey_format_field(VERSION_HI, version.hi), \ -+ bkey_format_field(VERSION_LO, version.lo), \ -+ }, \ -+}) -+ -+/* bkey with inline value */ -+struct bkey_i { -+ __u64 _data[0]; -+ -+ union { -+ struct { -+ /* Size of combined key and value, in u64s */ -+ __u8 u64s; -+ }; -+ struct { -+ struct bkey k; -+ struct bch_val v; -+ }; -+ }; -+}; -+ -+#define KEY(_inode, _offset, _size) \ -+((struct bkey) { \ -+ .u64s = BKEY_U64s, \ -+ .format = KEY_FORMAT_CURRENT, \ -+ .p = POS(_inode, _offset), \ -+ .size = _size, \ -+}) -+ -+static inline void bkey_init(struct bkey *k) -+{ -+ *k = KEY(0, 0, 0); -+} -+ -+#define bkey_bytes(_k) ((_k)->u64s * sizeof(__u64)) -+ -+#define __BKEY_PADDED(key, pad) \ -+ struct { struct bkey_i key; __u64 key ## _pad[pad]; } -+ -+/* -+ * - DELETED keys are used internally to mark keys that should be ignored but -+ * override keys in composition order. Their version number is ignored. -+ * -+ * - DISCARDED keys indicate that the data is all 0s because it has been -+ * discarded. 
DISCARDs may have a version; if the version is nonzero the key -+ * will be persistent, otherwise the key will be dropped whenever the btree -+ * node is rewritten (like DELETED keys). -+ * -+ * - ERROR: any read of the data returns a read error, as the data was lost due -+ * to a failing device. Like DISCARDED keys, they can be removed (overridden) -+ * by new writes or cluster-wide GC. Node repair can also overwrite them with -+ * the same or a more recent version number, but not with an older version -+ * number. -+ * -+ * - WHITEOUT: for hash table btrees -+*/ -+#define BCH_BKEY_TYPES() \ -+ x(deleted, 0) \ -+ x(discard, 1) \ -+ x(error, 2) \ -+ x(cookie, 3) \ -+ x(whiteout, 4) \ -+ x(btree_ptr, 5) \ -+ x(extent, 6) \ -+ x(reservation, 7) \ -+ x(inode, 8) \ -+ x(inode_generation, 9) \ -+ x(dirent, 10) \ -+ x(xattr, 11) \ -+ x(alloc, 12) \ -+ x(quota, 13) \ -+ x(stripe, 14) \ -+ x(reflink_p, 15) \ -+ x(reflink_v, 16) \ -+ x(inline_data, 17) \ -+ x(btree_ptr_v2, 18) -+ -+enum bch_bkey_type { -+#define x(name, nr) KEY_TYPE_##name = nr, -+ BCH_BKEY_TYPES() -+#undef x -+ KEY_TYPE_MAX, -+}; -+ -+struct bch_cookie { -+ struct bch_val v; -+ __le64 cookie; -+}; -+ -+/* Extents */ -+ -+/* -+ * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally -+ * preceded by checksum/compression information (bch_extent_crc32 or -+ * bch_extent_crc64). -+ * -+ * One major determining factor in the format of extents is how we handle and -+ * represent extents that have been partially overwritten and thus trimmed: -+ * -+ * If an extent is not checksummed or compressed, when the extent is trimmed we -+ * don't have to remember the extent we originally allocated and wrote: we can -+ * merely adjust ptr->offset to point to the start of the data that is currently -+ * live. The size field in struct bkey records the current (live) size of the -+ * extent, and is also used to mean "size of region on disk that we point to" in -+ * this case. 
-+ * -+ * Thus an extent that is not checksummed or compressed will consist only of a -+ * list of bch_extent_ptrs, with none of the fields in -+ * bch_extent_crc32/bch_extent_crc64. -+ * -+ * When an extent is checksummed or compressed, it's not possible to read only -+ * the data that is currently live: we have to read the entire extent that was -+ * originally written, and then return only the part of the extent that is -+ * currently live. -+ * -+ * Thus, in addition to the current size of the extent in struct bkey, we need -+ * to store the size of the originally allocated space - this is the -+ * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also, -+ * when the extent is trimmed, instead of modifying the offset field of the -+ * pointer, we keep a second smaller offset field - "offset into the original -+ * extent of the currently live region". -+ * -+ * The other major determining factor is replication and data migration: -+ * -+ * Each pointer may have its own bch_extent_crc32/64. When doing a replicated -+ * write, we will initially write all the replicas in the same format, with the -+ * same checksum type and compression format - however, when copygc runs later (or -+ * tiering/cache promotion, anything that moves data), it is not in general -+ * going to rewrite all the pointers at once - one of the replicas may be in a -+ * bucket on one device that has very little fragmentation while another lives -+ * in a bucket that has become heavily fragmented, and thus is being rewritten -+ * sooner than the rest. -+ * -+ * Thus it will only move a subset of the pointers (or in the case of -+ * tiering/cache promotion perhaps add a single pointer without dropping any -+ * current pointers), and if the extent has been partially overwritten it must -+ * write only the currently live portion (or copygc would not be able to reduce -+ * fragmentation!) - which necessitates a different bch_extent_crc format for -+ * the new pointer. 
-+ * -+ * But in the interests of space efficiency, we don't want to store one -+ * bch_extent_crc for each pointer if we don't have to. -+ * -+ * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and -+ * bch_extent_ptrs appended arbitrarily one after the other. We determine the -+ * type of a given entry with a scheme similar to utf8 (except we're encoding a -+ * type, not a size), encoding the type in the position of the first set bit: -+ * -+ * bch_extent_crc32 - 0b1 -+ * bch_extent_ptr - 0b10 -+ * bch_extent_crc64 - 0b100 -+ * -+ * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and -+ * bch_extent_crc64 is the least constrained). -+ * -+ * Then, each bch_extent_crc32/64 applies to the pointers that follow after it, -+ * until the next bch_extent_crc32/64. -+ * -+ * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer -+ * is neither checksummed nor compressed. -+ */ -+ -+/* 128 bits, sufficient for cryptographic MACs: */ -+struct bch_csum { -+ __le64 lo; -+ __le64 hi; -+} __attribute__((packed, aligned(8))); -+ -+#define BCH_EXTENT_ENTRY_TYPES() \ -+ x(ptr, 0) \ -+ x(crc32, 1) \ -+ x(crc64, 2) \ -+ x(crc128, 3) \ -+ x(stripe_ptr, 4) -+#define BCH_EXTENT_ENTRY_MAX 5 -+ -+enum bch_extent_entry_type { -+#define x(f, n) BCH_EXTENT_ENTRY_##f = n, -+ BCH_EXTENT_ENTRY_TYPES() -+#undef x -+}; -+ -+/* Compressed/uncompressed size are stored biased by 1: */ -+struct bch_extent_crc32 { -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u32 type:2, -+ _compressed_size:7, -+ _uncompressed_size:7, -+ offset:7, -+ _unused:1, -+ csum_type:4, -+ compression_type:4; -+ __u32 csum; -+#elif defined (__BIG_ENDIAN_BITFIELD) -+ __u32 csum; -+ __u32 compression_type:4, -+ csum_type:4, -+ _unused:1, -+ offset:7, -+ _uncompressed_size:7, -+ _compressed_size:7, -+ type:2; -+#endif -+} __attribute__((packed, aligned(8))); -+ -+#define CRC32_SIZE_MAX (1U << 7) -+#define CRC32_NONCE_MAX 0 -+ -+struct bch_extent_crc64 { -+#if 
defined(__LITTLE_ENDIAN_BITFIELD) -+ __u64 type:3, -+ _compressed_size:9, -+ _uncompressed_size:9, -+ offset:9, -+ nonce:10, -+ csum_type:4, -+ compression_type:4, -+ csum_hi:16; -+#elif defined (__BIG_ENDIAN_BITFIELD) -+ __u64 csum_hi:16, -+ compression_type:4, -+ csum_type:4, -+ nonce:10, -+ offset:9, -+ _uncompressed_size:9, -+ _compressed_size:9, -+ type:3; -+#endif -+ __u64 csum_lo; -+} __attribute__((packed, aligned(8))); -+ -+#define CRC64_SIZE_MAX (1U << 9) -+#define CRC64_NONCE_MAX ((1U << 10) - 1) -+ -+struct bch_extent_crc128 { -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u64 type:4, -+ _compressed_size:13, -+ _uncompressed_size:13, -+ offset:13, -+ nonce:13, -+ csum_type:4, -+ compression_type:4; -+#elif defined (__BIG_ENDIAN_BITFIELD) -+ __u64 compression_type:4, -+ csum_type:4, -+ nonce:13, -+ offset:13, -+ _uncompressed_size:13, -+ _compressed_size:13, -+ type:4; -+#endif -+ struct bch_csum csum; -+} __attribute__((packed, aligned(8))); -+ -+#define CRC128_SIZE_MAX (1U << 13) -+#define CRC128_NONCE_MAX ((1U << 13) - 1) -+ -+/* -+ * @reservation - pointer hasn't been written to, just reserved -+ */ -+struct bch_extent_ptr { -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u64 type:1, -+ cached:1, -+ unused:1, -+ reservation:1, -+ offset:44, /* 8 petabytes */ -+ dev:8, -+ gen:8; -+#elif defined (__BIG_ENDIAN_BITFIELD) -+ __u64 gen:8, -+ dev:8, -+ offset:44, -+ reservation:1, -+ unused:1, -+ cached:1, -+ type:1; -+#endif -+} __attribute__((packed, aligned(8))); -+ -+struct bch_extent_stripe_ptr { -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u64 type:5, -+ block:8, -+ idx:51; -+#elif defined (__BIG_ENDIAN_BITFIELD) -+ __u64 idx:51, -+ block:8, -+ type:5; -+#endif -+}; -+ -+struct bch_extent_reservation { -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u64 type:6, -+ unused:22, -+ replicas:4, -+ generation:32; -+#elif defined (__BIG_ENDIAN_BITFIELD) -+ __u64 generation:32, -+ replicas:4, -+ unused:22, -+ type:6; -+#endif -+}; -+ -+union bch_extent_entry { -+#if 
__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64 -+ unsigned long type; -+#elif __BITS_PER_LONG == 32 -+ struct { -+ unsigned long pad; -+ unsigned long type; -+ }; -+#else -+#error edit for your odd byteorder. -+#endif -+ -+#define x(f, n) struct bch_extent_##f f; -+ BCH_EXTENT_ENTRY_TYPES() -+#undef x -+}; -+ -+struct bch_btree_ptr { -+ struct bch_val v; -+ -+ struct bch_extent_ptr start[0]; -+ __u64 _data[0]; -+} __attribute__((packed, aligned(8))); -+ -+struct bch_btree_ptr_v2 { -+ struct bch_val v; -+ -+ __u64 mem_ptr; -+ __le64 seq; -+ __le16 sectors_written; -+ /* In case we ever decide to do variable size btree nodes: */ -+ __le16 sectors; -+ struct bpos min_key; -+ struct bch_extent_ptr start[0]; -+ __u64 _data[0]; -+} __attribute__((packed, aligned(8))); -+ -+struct bch_extent { -+ struct bch_val v; -+ -+ union bch_extent_entry start[0]; -+ __u64 _data[0]; -+} __attribute__((packed, aligned(8))); -+ -+struct bch_reservation { -+ struct bch_val v; -+ -+ __le32 generation; -+ __u8 nr_replicas; -+ __u8 pad[3]; -+} __attribute__((packed, aligned(8))); -+ -+/* Maximum size (in u64s) a single pointer could be: */ -+#define BKEY_EXTENT_PTR_U64s_MAX\ -+ ((sizeof(struct bch_extent_crc128) + \ -+ sizeof(struct bch_extent_ptr)) / sizeof(u64)) -+ -+/* Maximum possible size of an entire extent value: */ -+#define BKEY_EXTENT_VAL_U64s_MAX \ -+ (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) -+ -+#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX) -+ -+/* * Maximum possible size of an entire extent, key + value: */ -+#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX) -+ -+/* Btree pointers don't carry around checksums: */ -+#define BKEY_BTREE_PTR_VAL_U64s_MAX \ -+ ((sizeof(struct bch_btree_ptr_v2) + \ -+ sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(u64)) -+#define BKEY_BTREE_PTR_U64s_MAX \ -+ (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) -+ -+/* Inodes */ -+ -+#define BLOCKDEV_INODE_MAX 4096 -+ 
-+#define BCACHEFS_ROOT_INO 4096 -+ -+struct bch_inode { -+ struct bch_val v; -+ -+ __le64 bi_hash_seed; -+ __le32 bi_flags; -+ __le16 bi_mode; -+ __u8 fields[0]; -+} __attribute__((packed, aligned(8))); -+ -+struct bch_inode_generation { -+ struct bch_val v; -+ -+ __le32 bi_generation; -+ __le32 pad; -+} __attribute__((packed, aligned(8))); -+ -+#define BCH_INODE_FIELDS() \ -+ x(bi_atime, 64) \ -+ x(bi_ctime, 64) \ -+ x(bi_mtime, 64) \ -+ x(bi_otime, 64) \ -+ x(bi_size, 64) \ -+ x(bi_sectors, 64) \ -+ x(bi_uid, 32) \ -+ x(bi_gid, 32) \ -+ x(bi_nlink, 32) \ -+ x(bi_generation, 32) \ -+ x(bi_dev, 32) \ -+ x(bi_data_checksum, 8) \ -+ x(bi_compression, 8) \ -+ x(bi_project, 32) \ -+ x(bi_background_compression, 8) \ -+ x(bi_data_replicas, 8) \ -+ x(bi_promote_target, 16) \ -+ x(bi_foreground_target, 16) \ -+ x(bi_background_target, 16) \ -+ x(bi_erasure_code, 16) \ -+ x(bi_fields_set, 16) -+ -+/* subset of BCH_INODE_FIELDS */ -+#define BCH_INODE_OPTS() \ -+ x(data_checksum, 8) \ -+ x(compression, 8) \ -+ x(project, 32) \ -+ x(background_compression, 8) \ -+ x(data_replicas, 8) \ -+ x(promote_target, 16) \ -+ x(foreground_target, 16) \ -+ x(background_target, 16) \ -+ x(erasure_code, 16) -+ -+enum inode_opt_id { -+#define x(name, ...) 
\ -+ Inode_opt_##name, -+ BCH_INODE_OPTS() -+#undef x -+ Inode_opt_nr, -+}; -+ -+enum { -+ /* -+ * User flags (get/settable with FS_IOC_*FLAGS, correspond to FS_*_FL -+ * flags) -+ */ -+ __BCH_INODE_SYNC = 0, -+ __BCH_INODE_IMMUTABLE = 1, -+ __BCH_INODE_APPEND = 2, -+ __BCH_INODE_NODUMP = 3, -+ __BCH_INODE_NOATIME = 4, -+ -+ __BCH_INODE_I_SIZE_DIRTY= 5, -+ __BCH_INODE_I_SECTORS_DIRTY= 6, -+ __BCH_INODE_UNLINKED = 7, -+ -+ /* bits 20+ reserved for packed fields below: */ -+}; -+ -+#define BCH_INODE_SYNC (1 << __BCH_INODE_SYNC) -+#define BCH_INODE_IMMUTABLE (1 << __BCH_INODE_IMMUTABLE) -+#define BCH_INODE_APPEND (1 << __BCH_INODE_APPEND) -+#define BCH_INODE_NODUMP (1 << __BCH_INODE_NODUMP) -+#define BCH_INODE_NOATIME (1 << __BCH_INODE_NOATIME) -+#define BCH_INODE_I_SIZE_DIRTY (1 << __BCH_INODE_I_SIZE_DIRTY) -+#define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY) -+#define BCH_INODE_UNLINKED (1 << __BCH_INODE_UNLINKED) -+ -+LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); -+LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 32); -+ -+/* Dirents */ -+ -+/* -+ * Dirents (and xattrs) have to implement string lookups; since our b-tree -+ * doesn't support arbitrary length strings for the key, we instead index by a -+ * 64 bit hash (currently truncated sha1) of the string, stored in the offset -+ * field of the key - using linear probing to resolve hash collisions. This also -+ * provides us with the readdir cookie posix requires. 
-+ * -+ * Linear probing requires us to use whiteouts for deletions, in the event of a -+ * collision: -+ */ -+ -+struct bch_dirent { -+ struct bch_val v; -+ -+ /* Target inode number: */ -+ __le64 d_inum; -+ -+ /* -+ * Copy of mode bits 12-15 from the target inode - so userspace can get -+ * the filetype without having to do a stat() -+ */ -+ __u8 d_type; -+ -+ __u8 d_name[]; -+} __attribute__((packed, aligned(8))); -+ -+#define BCH_NAME_MAX (U8_MAX * sizeof(u64) - \ -+ sizeof(struct bkey) - \ -+ offsetof(struct bch_dirent, d_name)) -+ -+ -+/* Xattrs */ -+ -+#define KEY_TYPE_XATTR_INDEX_USER 0 -+#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1 -+#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2 -+#define KEY_TYPE_XATTR_INDEX_TRUSTED 3 -+#define KEY_TYPE_XATTR_INDEX_SECURITY 4 -+ -+struct bch_xattr { -+ struct bch_val v; -+ __u8 x_type; -+ __u8 x_name_len; -+ __le16 x_val_len; -+ __u8 x_name[]; -+} __attribute__((packed, aligned(8))); -+ -+/* Bucket/allocation information: */ -+ -+struct bch_alloc { -+ struct bch_val v; -+ __u8 fields; -+ __u8 gen; -+ __u8 data[]; -+} __attribute__((packed, aligned(8))); -+ -+#define BCH_ALLOC_FIELDS() \ -+ x(read_time, 16) \ -+ x(write_time, 16) \ -+ x(data_type, 8) \ -+ x(dirty_sectors, 16) \ -+ x(cached_sectors, 16) \ -+ x(oldest_gen, 8) -+ -+enum { -+#define x(name, bytes) BCH_ALLOC_FIELD_##name, -+ BCH_ALLOC_FIELDS() -+#undef x -+ BCH_ALLOC_FIELD_NR -+}; -+ -+static const unsigned BCH_ALLOC_FIELD_BYTES[] = { -+#define x(name, bits) [BCH_ALLOC_FIELD_##name] = bits / 8, -+ BCH_ALLOC_FIELDS() -+#undef x -+}; -+ -+#define x(name, bits) + (bits / 8) -+static const unsigned BKEY_ALLOC_VAL_U64s_MAX = -+ DIV_ROUND_UP(offsetof(struct bch_alloc, data) -+ BCH_ALLOC_FIELDS(), sizeof(u64)); -+#undef x -+ -+#define BKEY_ALLOC_U64s_MAX (BKEY_U64s + BKEY_ALLOC_VAL_U64s_MAX) -+ -+/* Quotas: */ -+ -+enum quota_types { -+ QTYP_USR = 0, -+ QTYP_GRP = 1, -+ QTYP_PRJ = 2, -+ QTYP_NR = 3, -+}; -+ -+enum quota_counters { -+ Q_SPC = 0, -+ Q_INO = 1, -+ 
Q_COUNTERS = 2, -+}; -+ -+struct bch_quota_counter { -+ __le64 hardlimit; -+ __le64 softlimit; -+}; -+ -+struct bch_quota { -+ struct bch_val v; -+ struct bch_quota_counter c[Q_COUNTERS]; -+} __attribute__((packed, aligned(8))); -+ -+/* Erasure coding */ -+ -+struct bch_stripe { -+ struct bch_val v; -+ __le16 sectors; -+ __u8 algorithm; -+ __u8 nr_blocks; -+ __u8 nr_redundant; -+ -+ __u8 csum_granularity_bits; -+ __u8 csum_type; -+ __u8 pad; -+ -+ struct bch_extent_ptr ptrs[0]; -+} __attribute__((packed, aligned(8))); -+ -+/* Reflink: */ -+ -+struct bch_reflink_p { -+ struct bch_val v; -+ __le64 idx; -+ -+ __le32 reservation_generation; -+ __u8 nr_replicas; -+ __u8 pad[3]; -+}; -+ -+struct bch_reflink_v { -+ struct bch_val v; -+ __le64 refcount; -+ union bch_extent_entry start[0]; -+ __u64 _data[0]; -+}; -+ -+/* Inline data */ -+ -+struct bch_inline_data { -+ struct bch_val v; -+ u8 data[0]; -+}; -+ -+/* Optional/variable size superblock sections: */ -+ -+struct bch_sb_field { -+ __u64 _data[0]; -+ __le32 u64s; -+ __le32 type; -+}; -+ -+#define BCH_SB_FIELDS() \ -+ x(journal, 0) \ -+ x(members, 1) \ -+ x(crypt, 2) \ -+ x(replicas_v0, 3) \ -+ x(quota, 4) \ -+ x(disk_groups, 5) \ -+ x(clean, 6) \ -+ x(replicas, 7) \ -+ x(journal_seq_blacklist, 8) -+ -+enum bch_sb_field_type { -+#define x(f, nr) BCH_SB_FIELD_##f = nr, -+ BCH_SB_FIELDS() -+#undef x -+ BCH_SB_FIELD_NR -+}; -+ -+/* BCH_SB_FIELD_journal: */ -+ -+struct bch_sb_field_journal { -+ struct bch_sb_field field; -+ __le64 buckets[0]; -+}; -+ -+/* BCH_SB_FIELD_members: */ -+ -+#define BCH_MIN_NR_NBUCKETS (1 << 6) -+ -+struct bch_member { -+ uuid_le uuid; -+ __le64 nbuckets; /* device size */ -+ __le16 first_bucket; /* index of first bucket used */ -+ __le16 bucket_size; /* sectors */ -+ __le32 pad; -+ __le64 last_mount; /* time_t */ -+ -+ __le64 flags[2]; -+}; -+ -+LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags[0], 0, 4) -+/* 4-10 unused, was TIER, HAS_(META)DATA */ -+LE64_BITMASK(BCH_MEMBER_REPLACEMENT, 
struct bch_member, flags[0], 10, 14) -+LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15) -+LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20) -+LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28) -+LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags[0], 28, 30) -+ -+#define BCH_TIER_MAX 4U -+ -+#if 0 -+LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); -+LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40); -+#endif -+ -+enum bch_member_state { -+ BCH_MEMBER_STATE_RW = 0, -+ BCH_MEMBER_STATE_RO = 1, -+ BCH_MEMBER_STATE_FAILED = 2, -+ BCH_MEMBER_STATE_SPARE = 3, -+ BCH_MEMBER_STATE_NR = 4, -+}; -+ -+enum cache_replacement { -+ CACHE_REPLACEMENT_LRU = 0, -+ CACHE_REPLACEMENT_FIFO = 1, -+ CACHE_REPLACEMENT_RANDOM = 2, -+ CACHE_REPLACEMENT_NR = 3, -+}; -+ -+struct bch_sb_field_members { -+ struct bch_sb_field field; -+ struct bch_member members[0]; -+}; -+ -+/* BCH_SB_FIELD_crypt: */ -+ -+struct nonce { -+ __le32 d[4]; -+}; -+ -+struct bch_key { -+ __le64 key[4]; -+}; -+ -+#define BCH_KEY_MAGIC \ -+ (((u64) 'b' << 0)|((u64) 'c' << 8)| \ -+ ((u64) 'h' << 16)|((u64) '*' << 24)| \ -+ ((u64) '*' << 32)|((u64) 'k' << 40)| \ -+ ((u64) 'e' << 48)|((u64) 'y' << 56)) -+ -+struct bch_encrypted_key { -+ __le64 magic; -+ struct bch_key key; -+}; -+ -+/* -+ * If this field is present in the superblock, it stores an encryption key which -+ * is used encrypt all other data/metadata. The key will normally be encrypted -+ * with the key userspace provides, but if encryption has been turned off we'll -+ * just store the master key unencrypted in the superblock so we can access the -+ * previously encrypted data. 
-+ */ -+struct bch_sb_field_crypt { -+ struct bch_sb_field field; -+ -+ __le64 flags; -+ __le64 kdf_flags; -+ struct bch_encrypted_key key; -+}; -+ -+LE64_BITMASK(BCH_CRYPT_KDF_TYPE, struct bch_sb_field_crypt, flags, 0, 4); -+ -+enum bch_kdf_types { -+ BCH_KDF_SCRYPT = 0, -+ BCH_KDF_NR = 1, -+}; -+ -+/* stored as base 2 log of scrypt params: */ -+LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16); -+LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32); -+LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48); -+ -+/* BCH_SB_FIELD_replicas: */ -+ -+#define BCH_DATA_TYPES() \ -+ x(none, 0) \ -+ x(sb, 1) \ -+ x(journal, 2) \ -+ x(btree, 3) \ -+ x(user, 4) \ -+ x(cached, 5) -+ -+enum bch_data_type { -+#define x(t, n) BCH_DATA_##t, -+ BCH_DATA_TYPES() -+#undef x -+ BCH_DATA_NR -+}; -+ -+struct bch_replicas_entry_v0 { -+ __u8 data_type; -+ __u8 nr_devs; -+ __u8 devs[0]; -+} __attribute__((packed)); -+ -+struct bch_sb_field_replicas_v0 { -+ struct bch_sb_field field; -+ struct bch_replicas_entry_v0 entries[0]; -+} __attribute__((packed, aligned(8))); -+ -+struct bch_replicas_entry { -+ __u8 data_type; -+ __u8 nr_devs; -+ __u8 nr_required; -+ __u8 devs[0]; -+} __attribute__((packed)); -+ -+#define replicas_entry_bytes(_i) \ -+ (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs) -+ -+struct bch_sb_field_replicas { -+ struct bch_sb_field field; -+ struct bch_replicas_entry entries[0]; -+} __attribute__((packed, aligned(8))); -+ -+/* BCH_SB_FIELD_quota: */ -+ -+struct bch_sb_quota_counter { -+ __le32 timelimit; -+ __le32 warnlimit; -+}; -+ -+struct bch_sb_quota_type { -+ __le64 flags; -+ struct bch_sb_quota_counter c[Q_COUNTERS]; -+}; -+ -+struct bch_sb_field_quota { -+ struct bch_sb_field field; -+ struct bch_sb_quota_type q[QTYP_NR]; -+} __attribute__((packed, aligned(8))); -+ -+/* BCH_SB_FIELD_disk_groups: */ -+ -+#define BCH_SB_LABEL_SIZE 32 -+ -+struct bch_disk_group { -+ __u8 label[BCH_SB_LABEL_SIZE]; -+ 
__le64 flags[2]; -+} __attribute__((packed, aligned(8))); -+ -+LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1) -+LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6) -+LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24) -+ -+struct bch_sb_field_disk_groups { -+ struct bch_sb_field field; -+ struct bch_disk_group entries[0]; -+} __attribute__((packed, aligned(8))); -+ -+/* -+ * On clean shutdown, store btree roots and current journal sequence number in -+ * the superblock: -+ */ -+struct jset_entry { -+ __le16 u64s; -+ __u8 btree_id; -+ __u8 level; -+ __u8 type; /* designates what this jset holds */ -+ __u8 pad[3]; -+ -+ union { -+ struct bkey_i start[0]; -+ __u64 _data[0]; -+ }; -+}; -+ -+struct bch_sb_field_clean { -+ struct bch_sb_field field; -+ -+ __le32 flags; -+ __le16 read_clock; -+ __le16 write_clock; -+ __le64 journal_seq; -+ -+ union { -+ struct jset_entry start[0]; -+ __u64 _data[0]; -+ }; -+}; -+ -+struct journal_seq_blacklist_entry { -+ __le64 start; -+ __le64 end; -+}; -+ -+struct bch_sb_field_journal_seq_blacklist { -+ struct bch_sb_field field; -+ -+ union { -+ struct journal_seq_blacklist_entry start[0]; -+ __u64 _data[0]; -+ }; -+}; -+ -+/* Superblock: */ -+ -+/* -+ * New versioning scheme: -+ * One common version number for all on disk data structures - superblock, btree -+ * nodes, journal entries -+ */ -+#define BCH_JSET_VERSION_OLD 2 -+#define BCH_BSET_VERSION_OLD 3 -+ -+enum bcachefs_metadata_version { -+ bcachefs_metadata_version_min = 9, -+ bcachefs_metadata_version_new_versioning = 10, -+ bcachefs_metadata_version_bkey_renumber = 10, -+ bcachefs_metadata_version_inode_btree_change = 11, -+ bcachefs_metadata_version_max = 12, -+}; -+ -+#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) -+ -+#define BCH_SB_SECTOR 8 -+#define BCH_SB_MEMBERS_MAX 64 /* XXX kill */ -+ -+struct bch_sb_layout { -+ uuid_le magic; /* bcachefs superblock UUID */ -+ __u8 
layout_type; -+ __u8 sb_max_size_bits; /* base 2 of 512 byte sectors */ -+ __u8 nr_superblocks; -+ __u8 pad[5]; -+ __le64 sb_offset[61]; -+} __attribute__((packed, aligned(8))); -+ -+#define BCH_SB_LAYOUT_SECTOR 7 -+ -+/* -+ * @offset - sector where this sb was written -+ * @version - on disk format version -+ * @version_min - Oldest metadata version this filesystem contains; so we can -+ * safely drop compatibility code and refuse to mount filesystems -+ * we'd need it for -+ * @magic - identifies as a bcachefs superblock (BCACHE_MAGIC) -+ * @seq - incremented each time superblock is written -+ * @uuid - used for generating various magic numbers and identifying -+ * member devices, never changes -+ * @user_uuid - user visible UUID, may be changed -+ * @label - filesystem label -+ * @seq - identifies most recent superblock, incremented each time -+ * superblock is written -+ * @features - enabled incompatible features -+ */ -+struct bch_sb { -+ struct bch_csum csum; -+ __le16 version; -+ __le16 version_min; -+ __le16 pad[2]; -+ uuid_le magic; -+ uuid_le uuid; -+ uuid_le user_uuid; -+ __u8 label[BCH_SB_LABEL_SIZE]; -+ __le64 offset; -+ __le64 seq; -+ -+ __le16 block_size; -+ __u8 dev_idx; -+ __u8 nr_devices; -+ __le32 u64s; -+ -+ __le64 time_base_lo; -+ __le32 time_base_hi; -+ __le32 time_precision; -+ -+ __le64 flags[8]; -+ __le64 features[2]; -+ __le64 compat[2]; -+ -+ struct bch_sb_layout layout; -+ -+ union { -+ struct bch_sb_field start[0]; -+ __le64 _data[0]; -+ }; -+} __attribute__((packed, aligned(8))); -+ -+/* -+ * Flags: -+ * BCH_SB_INITALIZED - set on first mount -+ * BCH_SB_CLEAN - did we shut down cleanly? Just a hint, doesn't affect -+ * behaviour of mount/recovery path: -+ * BCH_SB_INODE_32BIT - limit inode numbers to 32 bits -+ * BCH_SB_128_BIT_MACS - 128 bit macs instead of 80 -+ * BCH_SB_ENCRYPTION_TYPE - if nonzero encryption is enabled; overrides -+ * DATA/META_CSUM_TYPE. 
Also indicates encryption -+ * algorithm in use, if/when we get more than one -+ */ -+ -+LE16_BITMASK(BCH_SB_BLOCK_SIZE, struct bch_sb, block_size, 0, 16); -+ -+LE64_BITMASK(BCH_SB_INITIALIZED, struct bch_sb, flags[0], 0, 1); -+LE64_BITMASK(BCH_SB_CLEAN, struct bch_sb, flags[0], 1, 2); -+LE64_BITMASK(BCH_SB_CSUM_TYPE, struct bch_sb, flags[0], 2, 8); -+LE64_BITMASK(BCH_SB_ERROR_ACTION, struct bch_sb, flags[0], 8, 12); -+ -+LE64_BITMASK(BCH_SB_BTREE_NODE_SIZE, struct bch_sb, flags[0], 12, 28); -+ -+LE64_BITMASK(BCH_SB_GC_RESERVE, struct bch_sb, flags[0], 28, 33); -+LE64_BITMASK(BCH_SB_ROOT_RESERVE, struct bch_sb, flags[0], 33, 40); -+ -+LE64_BITMASK(BCH_SB_META_CSUM_TYPE, struct bch_sb, flags[0], 40, 44); -+LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE, struct bch_sb, flags[0], 44, 48); -+ -+LE64_BITMASK(BCH_SB_META_REPLICAS_WANT, struct bch_sb, flags[0], 48, 52); -+LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT, struct bch_sb, flags[0], 52, 56); -+ -+LE64_BITMASK(BCH_SB_POSIX_ACL, struct bch_sb, flags[0], 56, 57); -+LE64_BITMASK(BCH_SB_USRQUOTA, struct bch_sb, flags[0], 57, 58); -+LE64_BITMASK(BCH_SB_GRPQUOTA, struct bch_sb, flags[0], 58, 59); -+LE64_BITMASK(BCH_SB_PRJQUOTA, struct bch_sb, flags[0], 59, 60); -+ -+LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61); -+ -+LE64_BITMASK(BCH_SB_REFLINK, struct bch_sb, flags[0], 61, 62); -+ -+/* 61-64 unused */ -+ -+LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4); -+LE64_BITMASK(BCH_SB_COMPRESSION_TYPE, struct bch_sb, flags[1], 4, 8); -+LE64_BITMASK(BCH_SB_INODE_32BIT, struct bch_sb, flags[1], 8, 9); -+ -+LE64_BITMASK(BCH_SB_128_BIT_MACS, struct bch_sb, flags[1], 9, 10); -+LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE, struct bch_sb, flags[1], 10, 14); -+ -+/* -+ * Max size of an extent that may require bouncing to read or write -+ * (checksummed, compressed): 64k -+ */ -+LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS, -+ struct bch_sb, flags[1], 14, 20); -+ -+LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 
20, 24); -+LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28); -+ -+LE64_BITMASK(BCH_SB_PROMOTE_TARGET, struct bch_sb, flags[1], 28, 40); -+LE64_BITMASK(BCH_SB_FOREGROUND_TARGET, struct bch_sb, flags[1], 40, 52); -+LE64_BITMASK(BCH_SB_BACKGROUND_TARGET, struct bch_sb, flags[1], 52, 64); -+ -+LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE, -+ struct bch_sb, flags[2], 0, 4); -+LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64); -+ -+LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); -+ -+/* -+ * Features: -+ * -+ * journal_seq_blacklist_v3: gates BCH_SB_FIELD_journal_seq_blacklist -+ * reflink: gates KEY_TYPE_reflink -+ * inline_data: gates KEY_TYPE_inline_data -+ * new_siphash: gates BCH_STR_HASH_SIPHASH -+ * new_extent_overwrite: gates BTREE_NODE_NEW_EXTENT_OVERWRITE -+ */ -+#define BCH_SB_FEATURES() \ -+ x(lz4, 0) \ -+ x(gzip, 1) \ -+ x(zstd, 2) \ -+ x(atomic_nlink, 3) \ -+ x(ec, 4) \ -+ x(journal_seq_blacklist_v3, 5) \ -+ x(reflink, 6) \ -+ x(new_siphash, 7) \ -+ x(inline_data, 8) \ -+ x(new_extent_overwrite, 9) \ -+ x(incompressible, 10) \ -+ x(btree_ptr_v2, 11) \ -+ x(extents_above_btree_updates, 12) \ -+ x(btree_updates_journalled, 13) -+ -+#define BCH_SB_FEATURES_ALL \ -+ ((1ULL << BCH_FEATURE_new_siphash)| \ -+ (1ULL << BCH_FEATURE_new_extent_overwrite)| \ -+ (1ULL << BCH_FEATURE_btree_ptr_v2)| \ -+ (1ULL << BCH_FEATURE_extents_above_btree_updates)) -+ -+enum bch_sb_feature { -+#define x(f, n) BCH_FEATURE_##f, -+ BCH_SB_FEATURES() -+#undef x -+ BCH_FEATURE_NR, -+}; -+ -+enum bch_sb_compat { -+ BCH_COMPAT_FEAT_ALLOC_INFO = 0, -+ BCH_COMPAT_FEAT_ALLOC_METADATA = 1, -+}; -+ -+/* options: */ -+ -+#define BCH_REPLICAS_MAX 4U -+ -+enum bch_error_actions { -+ BCH_ON_ERROR_CONTINUE = 0, -+ BCH_ON_ERROR_RO = 1, -+ BCH_ON_ERROR_PANIC = 2, -+ BCH_NR_ERROR_ACTIONS = 3, -+}; -+ -+enum bch_str_hash_type { -+ BCH_STR_HASH_CRC32C = 0, -+ BCH_STR_HASH_CRC64 = 1, -+ BCH_STR_HASH_SIPHASH_OLD = 2, -+ BCH_STR_HASH_SIPHASH 
= 3, -+ BCH_STR_HASH_NR = 4, -+}; -+ -+enum bch_str_hash_opts { -+ BCH_STR_HASH_OPT_CRC32C = 0, -+ BCH_STR_HASH_OPT_CRC64 = 1, -+ BCH_STR_HASH_OPT_SIPHASH = 2, -+ BCH_STR_HASH_OPT_NR = 3, -+}; -+ -+enum bch_csum_type { -+ BCH_CSUM_NONE = 0, -+ BCH_CSUM_CRC32C_NONZERO = 1, -+ BCH_CSUM_CRC64_NONZERO = 2, -+ BCH_CSUM_CHACHA20_POLY1305_80 = 3, -+ BCH_CSUM_CHACHA20_POLY1305_128 = 4, -+ BCH_CSUM_CRC32C = 5, -+ BCH_CSUM_CRC64 = 6, -+ BCH_CSUM_NR = 7, -+}; -+ -+static const unsigned bch_crc_bytes[] = { -+ [BCH_CSUM_NONE] = 0, -+ [BCH_CSUM_CRC32C_NONZERO] = 4, -+ [BCH_CSUM_CRC32C] = 4, -+ [BCH_CSUM_CRC64_NONZERO] = 8, -+ [BCH_CSUM_CRC64] = 8, -+ [BCH_CSUM_CHACHA20_POLY1305_80] = 10, -+ [BCH_CSUM_CHACHA20_POLY1305_128] = 16, -+}; -+ -+static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) -+{ -+ switch (type) { -+ case BCH_CSUM_CHACHA20_POLY1305_80: -+ case BCH_CSUM_CHACHA20_POLY1305_128: -+ return true; -+ default: -+ return false; -+ } -+} -+ -+enum bch_csum_opts { -+ BCH_CSUM_OPT_NONE = 0, -+ BCH_CSUM_OPT_CRC32C = 1, -+ BCH_CSUM_OPT_CRC64 = 2, -+ BCH_CSUM_OPT_NR = 3, -+}; -+ -+#define BCH_COMPRESSION_TYPES() \ -+ x(none, 0) \ -+ x(lz4_old, 1) \ -+ x(gzip, 2) \ -+ x(lz4, 3) \ -+ x(zstd, 4) \ -+ x(incompressible, 5) -+ -+enum bch_compression_type { -+#define x(t, n) BCH_COMPRESSION_TYPE_##t, -+ BCH_COMPRESSION_TYPES() -+#undef x -+ BCH_COMPRESSION_TYPE_NR -+}; -+ -+#define BCH_COMPRESSION_OPTS() \ -+ x(none, 0) \ -+ x(lz4, 1) \ -+ x(gzip, 2) \ -+ x(zstd, 3) -+ -+enum bch_compression_opts { -+#define x(t, n) BCH_COMPRESSION_OPT_##t, -+ BCH_COMPRESSION_OPTS() -+#undef x -+ BCH_COMPRESSION_OPT_NR -+}; -+ -+/* -+ * Magic numbers -+ * -+ * The various other data structures have their own magic numbers, which are -+ * xored with the first part of the cache set's UUID -+ */ -+ -+#define BCACHE_MAGIC \ -+ UUID_LE(0xf67385c6, 0x1a4e, 0xca45, \ -+ 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81) -+ -+#define BCACHEFS_STATFS_MAGIC 0xca451a4e -+ -+#define JSET_MAGIC 
__cpu_to_le64(0x245235c1a3625032ULL) -+#define BSET_MAGIC __cpu_to_le64(0x90135c78b99e07f5ULL) -+ -+static inline __le64 __bch2_sb_magic(struct bch_sb *sb) -+{ -+ __le64 ret; -+ memcpy(&ret, &sb->uuid, sizeof(ret)); -+ return ret; -+} -+ -+static inline __u64 __jset_magic(struct bch_sb *sb) -+{ -+ return __le64_to_cpu(__bch2_sb_magic(sb) ^ JSET_MAGIC); -+} -+ -+static inline __u64 __bset_magic(struct bch_sb *sb) -+{ -+ return __le64_to_cpu(__bch2_sb_magic(sb) ^ BSET_MAGIC); -+} -+ -+/* Journal */ -+ -+#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64)) -+ -+#define BCH_JSET_ENTRY_TYPES() \ -+ x(btree_keys, 0) \ -+ x(btree_root, 1) \ -+ x(prio_ptrs, 2) \ -+ x(blacklist, 3) \ -+ x(blacklist_v2, 4) \ -+ x(usage, 5) \ -+ x(data_usage, 6) -+ -+enum { -+#define x(f, nr) BCH_JSET_ENTRY_##f = nr, -+ BCH_JSET_ENTRY_TYPES() -+#undef x -+ BCH_JSET_ENTRY_NR -+}; -+ -+/* -+ * Journal sequence numbers can be blacklisted: bsets record the max sequence -+ * number of all the journal entries they contain updates for, so that on -+ * recovery we can ignore those bsets that contain index updates newer that what -+ * made it into the journal. -+ * -+ * This means that we can't reuse that journal_seq - we have to skip it, and -+ * then record that we skipped it so that the next time we crash and recover we -+ * don't think there was a missing journal entry. 
-+ */ -+struct jset_entry_blacklist { -+ struct jset_entry entry; -+ __le64 seq; -+}; -+ -+struct jset_entry_blacklist_v2 { -+ struct jset_entry entry; -+ __le64 start; -+ __le64 end; -+}; -+ -+enum { -+ FS_USAGE_RESERVED = 0, -+ FS_USAGE_INODES = 1, -+ FS_USAGE_KEY_VERSION = 2, -+ FS_USAGE_NR = 3 -+}; -+ -+struct jset_entry_usage { -+ struct jset_entry entry; -+ __le64 v; -+} __attribute__((packed)); -+ -+struct jset_entry_data_usage { -+ struct jset_entry entry; -+ __le64 v; -+ struct bch_replicas_entry r; -+} __attribute__((packed)); -+ -+/* -+ * On disk format for a journal entry: -+ * seq is monotonically increasing; every journal entry has its own unique -+ * sequence number. -+ * -+ * last_seq is the oldest journal entry that still has keys the btree hasn't -+ * flushed to disk yet. -+ * -+ * version is for on disk format changes. -+ */ -+struct jset { -+ struct bch_csum csum; -+ -+ __le64 magic; -+ __le64 seq; -+ __le32 version; -+ __le32 flags; -+ -+ __le32 u64s; /* size of d[] in u64s */ -+ -+ __u8 encrypted_start[0]; -+ -+ __le16 read_clock; -+ __le16 write_clock; -+ -+ /* Sequence number of oldest dirty journal entry */ -+ __le64 last_seq; -+ -+ -+ union { -+ struct jset_entry start[0]; -+ __u64 _data[0]; -+ }; -+} __attribute__((packed, aligned(8))); -+ -+LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4); -+LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); -+ -+#define BCH_JOURNAL_BUCKETS_MIN 8 -+ -+/* Btree: */ -+ -+#define BCH_BTREE_IDS() \ -+ x(EXTENTS, 0, "extents") \ -+ x(INODES, 1, "inodes") \ -+ x(DIRENTS, 2, "dirents") \ -+ x(XATTRS, 3, "xattrs") \ -+ x(ALLOC, 4, "alloc") \ -+ x(QUOTAS, 5, "quotas") \ -+ x(EC, 6, "stripes") \ -+ x(REFLINK, 7, "reflink") -+ -+enum btree_id { -+#define x(kwd, val, name) BTREE_ID_##kwd = val, -+ BCH_BTREE_IDS() -+#undef x -+ BTREE_ID_NR -+}; -+ -+#define BTREE_MAX_DEPTH 4U -+ -+/* Btree nodes */ -+ -+/* -+ * Btree nodes -+ * -+ * On disk a btree node is a list/log of these; within each set the keys are 
-+ * sorted -+ */ -+struct bset { -+ __le64 seq; -+ -+ /* -+ * Highest journal entry this bset contains keys for. -+ * If on recovery we don't see that journal entry, this bset is ignored: -+ * this allows us to preserve the order of all index updates after a -+ * crash, since the journal records a total order of all index updates -+ * and anything that didn't make it to the journal doesn't get used. -+ */ -+ __le64 journal_seq; -+ -+ __le32 flags; -+ __le16 version; -+ __le16 u64s; /* count of d[] in u64s */ -+ -+ union { -+ struct bkey_packed start[0]; -+ __u64 _data[0]; -+ }; -+} __attribute__((packed, aligned(8))); -+ -+LE32_BITMASK(BSET_CSUM_TYPE, struct bset, flags, 0, 4); -+ -+LE32_BITMASK(BSET_BIG_ENDIAN, struct bset, flags, 4, 5); -+LE32_BITMASK(BSET_SEPARATE_WHITEOUTS, -+ struct bset, flags, 5, 6); -+ -+struct btree_node { -+ struct bch_csum csum; -+ __le64 magic; -+ -+ /* this flags field is encrypted, unlike bset->flags: */ -+ __le64 flags; -+ -+ /* Closed interval: */ -+ struct bpos min_key; -+ struct bpos max_key; -+ struct bch_extent_ptr ptr; -+ struct bkey_format format; -+ -+ union { -+ struct bset keys; -+ struct { -+ __u8 pad[22]; -+ __le16 u64s; -+ __u64 _data[0]; -+ -+ }; -+ }; -+} __attribute__((packed, aligned(8))); -+ -+LE64_BITMASK(BTREE_NODE_ID, struct btree_node, flags, 0, 4); -+LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8); -+LE64_BITMASK(BTREE_NODE_NEW_EXTENT_OVERWRITE, -+ struct btree_node, flags, 8, 9); -+/* 9-32 unused */ -+LE64_BITMASK(BTREE_NODE_SEQ, struct btree_node, flags, 32, 64); -+ -+struct btree_node_entry { -+ struct bch_csum csum; -+ -+ union { -+ struct bset keys; -+ struct { -+ __u8 pad[22]; -+ __le16 u64s; -+ __u64 _data[0]; -+ -+ }; -+ }; -+} __attribute__((packed, aligned(8))); -+ -+#endif /* _BCACHEFS_FORMAT_H */ -diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h -new file mode 100644 -index 000000000000..d71157a3e073 ---- /dev/null -+++ b/fs/bcachefs/bcachefs_ioctl.h -@@ -0,0 
+1,332 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_IOCTL_H -+#define _BCACHEFS_IOCTL_H -+ -+#include -+#include -+#include "bcachefs_format.h" -+ -+/* -+ * Flags common to multiple ioctls: -+ */ -+#define BCH_FORCE_IF_DATA_LOST (1 << 0) -+#define BCH_FORCE_IF_METADATA_LOST (1 << 1) -+#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2) -+#define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3) -+ -+#define BCH_FORCE_IF_DEGRADED \ -+ (BCH_FORCE_IF_DATA_DEGRADED| \ -+ BCH_FORCE_IF_METADATA_DEGRADED) -+ -+/* -+ * If cleared, ioctl that refer to a device pass it as a pointer to a pathname -+ * (e.g. /dev/sda1); if set, the dev field is the device's index within the -+ * filesystem: -+ */ -+#define BCH_BY_INDEX (1 << 4) -+ -+/* -+ * For BCH_IOCTL_READ_SUPER: get superblock of a specific device, not filesystem -+ * wide superblock: -+ */ -+#define BCH_READ_DEV (1 << 5) -+ -+/* global control dev: */ -+ -+/* These are currently broken, and probably unnecessary: */ -+#if 0 -+#define BCH_IOCTL_ASSEMBLE _IOW(0xbc, 1, struct bch_ioctl_assemble) -+#define BCH_IOCTL_INCREMENTAL _IOW(0xbc, 2, struct bch_ioctl_incremental) -+ -+struct bch_ioctl_assemble { -+ __u32 flags; -+ __u32 nr_devs; -+ __u64 pad; -+ __u64 devs[]; -+}; -+ -+struct bch_ioctl_incremental { -+ __u32 flags; -+ __u64 pad; -+ __u64 dev; -+}; -+#endif -+ -+/* filesystem ioctls: */ -+ -+#define BCH_IOCTL_QUERY_UUID _IOR(0xbc, 1, struct bch_ioctl_query_uuid) -+ -+/* These only make sense when we also have incremental assembly */ -+#if 0 -+#define BCH_IOCTL_START _IOW(0xbc, 2, struct bch_ioctl_start) -+#define BCH_IOCTL_STOP _IO(0xbc, 3) -+#endif -+ -+#define BCH_IOCTL_DISK_ADD _IOW(0xbc, 4, struct bch_ioctl_disk) -+#define BCH_IOCTL_DISK_REMOVE _IOW(0xbc, 5, struct bch_ioctl_disk) -+#define BCH_IOCTL_DISK_ONLINE _IOW(0xbc, 6, struct bch_ioctl_disk) -+#define BCH_IOCTL_DISK_OFFLINE _IOW(0xbc, 7, struct bch_ioctl_disk) -+#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc, 8, struct bch_ioctl_disk_set_state) -+#define 
BCH_IOCTL_DATA _IOW(0xbc, 10, struct bch_ioctl_data) -+#define BCH_IOCTL_FS_USAGE _IOWR(0xbc, 11, struct bch_ioctl_fs_usage) -+#define BCH_IOCTL_DEV_USAGE _IOWR(0xbc, 11, struct bch_ioctl_dev_usage) -+#define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super) -+#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx) -+#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize) -+ -+/* ioctl below act on a particular file, not the filesystem as a whole: */ -+ -+#define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *) -+ -+/* -+ * BCH_IOCTL_QUERY_UUID: get filesystem UUID -+ * -+ * Returns user visible UUID, not internal UUID (which may not ever be changed); -+ * the filesystem's sysfs directory may be found under /sys/fs/bcachefs with -+ * this UUID. -+ */ -+struct bch_ioctl_query_uuid { -+ uuid_le uuid; -+}; -+ -+#if 0 -+struct bch_ioctl_start { -+ __u32 flags; -+ __u32 pad; -+}; -+#endif -+ -+/* -+ * BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem -+ * -+ * The specified device must not be open or in use. On success, the new device -+ * will be an online member of the filesystem just like any other member. -+ * -+ * The device must first be prepared by userspace by formatting with a bcachefs -+ * superblock, which is only used for passing in superblock options/parameters -+ * for that device (in struct bch_member). The new device's superblock should -+ * not claim to be a member of any existing filesystem - UUIDs on it will be -+ * ignored. -+ */ -+ -+/* -+ * BCH_IOCTL_DISK_REMOVE: permanently remove a member device from a filesystem -+ * -+ * Any data present on @dev will be permanently deleted, and @dev will be -+ * removed from its slot in the filesystem's list of member devices. The device -+ * may be either offline or offline. 
-+ * -+ * Will fail removing @dev would leave us with insufficient read write devices -+ * or degraded/unavailable data, unless the approprate BCH_FORCE_IF_* flags are -+ * set. -+ */ -+ -+/* -+ * BCH_IOCTL_DISK_ONLINE: given a disk that is already a member of a filesystem -+ * but is not open (e.g. because we started in degraded mode), bring it online -+ * -+ * all existing data on @dev will be available once the device is online, -+ * exactly as if @dev was present when the filesystem was first mounted -+ */ -+ -+/* -+ * BCH_IOCTL_DISK_OFFLINE: offline a disk, causing the kernel to close that -+ * block device, without removing it from the filesystem (so it can be brought -+ * back online later) -+ * -+ * Data present on @dev will be unavailable while @dev is offline (unless -+ * replicated), but will still be intact and untouched if @dev is brought back -+ * online -+ * -+ * Will fail (similarly to BCH_IOCTL_DISK_SET_STATE) if offlining @dev would -+ * leave us with insufficient read write devices or degraded/unavailable data, -+ * unless the approprate BCH_FORCE_IF_* flags are set. -+ */ -+ -+struct bch_ioctl_disk { -+ __u32 flags; -+ __u32 pad; -+ __u64 dev; -+}; -+ -+/* -+ * BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem -+ * -+ * @new_state - one of the bch_member_state states (rw, ro, failed, -+ * spare) -+ * -+ * Will refuse to change member state if we would then have insufficient devices -+ * to write to, or if it would result in degraded data (when @new_state is -+ * failed or spare) unless the appropriate BCH_FORCE_IF_* flags are set. -+ */ -+struct bch_ioctl_disk_set_state { -+ __u32 flags; -+ __u8 new_state; -+ __u8 pad[3]; -+ __u64 dev; -+}; -+ -+enum bch_data_ops { -+ BCH_DATA_OP_SCRUB = 0, -+ BCH_DATA_OP_REREPLICATE = 1, -+ BCH_DATA_OP_MIGRATE = 2, -+ BCH_DATA_OP_NR = 3, -+}; -+ -+/* -+ * BCH_IOCTL_DATA: operations that walk and manipulate filesystem data (e.g. -+ * scrub, rereplicate, migrate). 
-+ * -+ * This ioctl kicks off a job in the background, and returns a file descriptor. -+ * Reading from the file descriptor returns a struct bch_ioctl_data_event, -+ * indicating current progress, and closing the file descriptor will stop the -+ * job. The file descriptor is O_CLOEXEC. -+ */ -+struct bch_ioctl_data { -+ __u32 op; -+ __u32 flags; -+ -+ struct bpos start; -+ struct bpos end; -+ -+ union { -+ struct { -+ __u32 dev; -+ __u32 pad; -+ } migrate; -+ struct { -+ __u64 pad[8]; -+ }; -+ }; -+} __attribute__((packed, aligned(8))); -+ -+enum bch_data_event { -+ BCH_DATA_EVENT_PROGRESS = 0, -+ /* XXX: add an event for reporting errors */ -+ BCH_DATA_EVENT_NR = 1, -+}; -+ -+struct bch_ioctl_data_progress { -+ __u8 data_type; -+ __u8 btree_id; -+ __u8 pad[2]; -+ struct bpos pos; -+ -+ __u64 sectors_done; -+ __u64 sectors_total; -+} __attribute__((packed, aligned(8))); -+ -+struct bch_ioctl_data_event { -+ __u8 type; -+ __u8 pad[7]; -+ union { -+ struct bch_ioctl_data_progress p; -+ __u64 pad2[15]; -+ }; -+} __attribute__((packed, aligned(8))); -+ -+struct bch_replicas_usage { -+ __u64 sectors; -+ struct bch_replicas_entry r; -+} __attribute__((packed)); -+ -+static inline struct bch_replicas_usage * -+replicas_usage_next(struct bch_replicas_usage *u) -+{ -+ return (void *) u + replicas_entry_bytes(&u->r) + 8; -+} -+ -+/* -+ * BCH_IOCTL_FS_USAGE: query filesystem disk space usage -+ * -+ * Returns disk space usage broken out by data type, number of replicas, and -+ * by component device -+ * -+ * @replica_entries_bytes - size, in bytes, allocated for replica usage entries -+ * -+ * On success, @replica_entries_bytes will be changed to indicate the number of -+ * bytes actually used. 
-+ * -+ * Returns -ERANGE if @replica_entries_bytes was too small -+ */ -+struct bch_ioctl_fs_usage { -+ __u64 capacity; -+ __u64 used; -+ __u64 online_reserved; -+ __u64 persistent_reserved[BCH_REPLICAS_MAX]; -+ -+ __u32 replica_entries_bytes; -+ __u32 pad; -+ -+ struct bch_replicas_usage replicas[0]; -+}; -+ -+/* -+ * BCH_IOCTL_DEV_USAGE: query device disk space usage -+ * -+ * Returns disk space usage broken out by data type - both by buckets and -+ * sectors. -+ */ -+struct bch_ioctl_dev_usage { -+ __u64 dev; -+ __u32 flags; -+ __u8 state; -+ __u8 pad[7]; -+ -+ __u32 bucket_size; -+ __u64 nr_buckets; -+ __u64 available_buckets; -+ -+ __u64 buckets[BCH_DATA_NR]; -+ __u64 sectors[BCH_DATA_NR]; -+ -+ __u64 ec_buckets; -+ __u64 ec_sectors; -+}; -+ -+/* -+ * BCH_IOCTL_READ_SUPER: read filesystem superblock -+ * -+ * Equivalent to reading the superblock directly from the block device, except -+ * avoids racing with the kernel writing the superblock or having to figure out -+ * which block device to read -+ * -+ * @sb - buffer to read into -+ * @size - size of userspace allocated buffer -+ * @dev - device to read superblock for, if BCH_READ_DEV flag is -+ * specified -+ * -+ * Returns -ERANGE if buffer provided is too small -+ */ -+struct bch_ioctl_read_super { -+ __u32 flags; -+ __u32 pad; -+ __u64 dev; -+ __u64 size; -+ __u64 sb; -+}; -+ -+/* -+ * BCH_IOCTL_DISK_GET_IDX: give a path to a block device, query filesystem to -+ * determine if disk is a (online) member - if so, returns device's index -+ * -+ * Returns -ENOENT if not found -+ */ -+struct bch_ioctl_disk_get_idx { -+ __u64 dev; -+}; -+ -+/* -+ * BCH_IOCTL_DISK_RESIZE: resize filesystem on a device -+ * -+ * @dev - member to resize -+ * @nbuckets - new number of buckets -+ */ -+struct bch_ioctl_disk_resize { -+ __u32 flags; -+ __u32 pad; -+ __u64 dev; -+ __u64 nbuckets; -+}; -+ -+#endif /* _BCACHEFS_IOCTL_H */ -diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c -new file mode 100644 -index 
000000000000..4d0c9129cd4a ---- /dev/null -+++ b/fs/bcachefs/bkey.c -@@ -0,0 +1,1154 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "bkey.h" -+#include "bkey_methods.h" -+#include "bset.h" -+#include "util.h" -+ -+#undef EBUG_ON -+ -+#ifdef DEBUG_BKEYS -+#define EBUG_ON(cond) BUG_ON(cond) -+#else -+#define EBUG_ON(cond) -+#endif -+ -+const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT; -+ -+struct bkey __bch2_bkey_unpack_key(const struct bkey_format *, -+ const struct bkey_packed *); -+ -+void bch2_to_binary(char *out, const u64 *p, unsigned nr_bits) -+{ -+ unsigned bit = high_bit_offset, done = 0; -+ -+ while (1) { -+ while (bit < 64) { -+ if (done && !(done % 8)) -+ *out++ = ' '; -+ *out++ = *p & (1ULL << (63 - bit)) ? '1' : '0'; -+ bit++; -+ done++; -+ if (done == nr_bits) { -+ *out++ = '\0'; -+ return; -+ } -+ } -+ -+ p = next_word(p); -+ bit = 0; -+ } -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ -+static void bch2_bkey_pack_verify(const struct bkey_packed *packed, -+ const struct bkey *unpacked, -+ const struct bkey_format *format) -+{ -+ struct bkey tmp; -+ -+ BUG_ON(bkeyp_val_u64s(format, packed) != -+ bkey_val_u64s(unpacked)); -+ -+ BUG_ON(packed->u64s < bkeyp_key_u64s(format, packed)); -+ -+ tmp = __bch2_bkey_unpack_key(format, packed); -+ -+ if (memcmp(&tmp, unpacked, sizeof(struct bkey))) { -+ char buf1[160], buf2[160]; -+ char buf3[160], buf4[160]; -+ -+ bch2_bkey_to_text(&PBUF(buf1), unpacked); -+ bch2_bkey_to_text(&PBUF(buf2), &tmp); -+ bch2_to_binary(buf3, (void *) unpacked, 80); -+ bch2_to_binary(buf4, high_word(format, packed), 80); -+ -+ panic("keys differ: format u64s %u fields %u %u %u %u %u\n%s\n%s\n%s\n%s\n", -+ format->key_u64s, -+ format->bits_per_field[0], -+ format->bits_per_field[1], -+ format->bits_per_field[2], -+ format->bits_per_field[3], -+ format->bits_per_field[4], -+ buf1, buf2, buf3, buf4); -+ } -+} -+ -+#else -+static inline void bch2_bkey_pack_verify(const struct bkey_packed 
*packed, -+ const struct bkey *unpacked, -+ const struct bkey_format *format) {} -+#endif -+ -+struct pack_state { -+ const struct bkey_format *format; -+ unsigned bits; /* bits remaining in current word */ -+ u64 w; /* current word */ -+ u64 *p; /* pointer to next word */ -+}; -+ -+__always_inline -+static struct pack_state pack_state_init(const struct bkey_format *format, -+ struct bkey_packed *k) -+{ -+ u64 *p = high_word(format, k); -+ -+ return (struct pack_state) { -+ .format = format, -+ .bits = 64 - high_bit_offset, -+ .w = 0, -+ .p = p, -+ }; -+} -+ -+__always_inline -+static void pack_state_finish(struct pack_state *state, -+ struct bkey_packed *k) -+{ -+ EBUG_ON(state->p < k->_data); -+ EBUG_ON(state->p >= k->_data + state->format->key_u64s); -+ -+ *state->p = state->w; -+} -+ -+struct unpack_state { -+ const struct bkey_format *format; -+ unsigned bits; /* bits remaining in current word */ -+ u64 w; /* current word */ -+ const u64 *p; /* pointer to next word */ -+}; -+ -+__always_inline -+static struct unpack_state unpack_state_init(const struct bkey_format *format, -+ const struct bkey_packed *k) -+{ -+ const u64 *p = high_word(format, k); -+ -+ return (struct unpack_state) { -+ .format = format, -+ .bits = 64 - high_bit_offset, -+ .w = *p << high_bit_offset, -+ .p = p, -+ }; -+} -+ -+__always_inline -+static u64 get_inc_field(struct unpack_state *state, unsigned field) -+{ -+ unsigned bits = state->format->bits_per_field[field]; -+ u64 v = 0, offset = le64_to_cpu(state->format->field_offset[field]); -+ -+ if (bits >= state->bits) { -+ v = state->w >> (64 - bits); -+ bits -= state->bits; -+ -+ state->p = next_word(state->p); -+ state->w = *state->p; -+ state->bits = 64; -+ } -+ -+ /* avoid shift by 64 if bits is 0 - bits is never 64 here: */ -+ v |= (state->w >> 1) >> (63 - bits); -+ state->w <<= bits; -+ state->bits -= bits; -+ -+ return v + offset; -+} -+ -+__always_inline -+static bool set_inc_field(struct pack_state *state, unsigned field, u64 v) 
-+{ -+ unsigned bits = state->format->bits_per_field[field]; -+ u64 offset = le64_to_cpu(state->format->field_offset[field]); -+ -+ if (v < offset) -+ return false; -+ -+ v -= offset; -+ -+ if (fls64(v) > bits) -+ return false; -+ -+ if (bits > state->bits) { -+ bits -= state->bits; -+ /* avoid shift by 64 if bits is 0 - bits is never 64 here: */ -+ state->w |= (v >> 1) >> (bits - 1); -+ -+ *state->p = state->w; -+ state->p = next_word(state->p); -+ state->w = 0; -+ state->bits = 64; -+ } -+ -+ state->bits -= bits; -+ state->w |= v << state->bits; -+ -+ return true; -+} -+ -+/* -+ * Note: does NOT set out->format (we don't know what it should be here!) -+ * -+ * Also: doesn't work on extents - it doesn't preserve the invariant that -+ * if k is packed bkey_start_pos(k) will successfully pack -+ */ -+static bool bch2_bkey_transform_key(const struct bkey_format *out_f, -+ struct bkey_packed *out, -+ const struct bkey_format *in_f, -+ const struct bkey_packed *in) -+{ -+ struct pack_state out_s = pack_state_init(out_f, out); -+ struct unpack_state in_s = unpack_state_init(in_f, in); -+ unsigned i; -+ -+ out->_data[0] = 0; -+ -+ for (i = 0; i < BKEY_NR_FIELDS; i++) -+ if (!set_inc_field(&out_s, i, get_inc_field(&in_s, i))) -+ return false; -+ -+ /* Can't happen because the val would be too big to unpack: */ -+ EBUG_ON(in->u64s - in_f->key_u64s + out_f->key_u64s > U8_MAX); -+ -+ pack_state_finish(&out_s, out); -+ out->u64s = out_f->key_u64s + in->u64s - in_f->key_u64s; -+ out->needs_whiteout = in->needs_whiteout; -+ out->type = in->type; -+ -+ return true; -+} -+ -+bool bch2_bkey_transform(const struct bkey_format *out_f, -+ struct bkey_packed *out, -+ const struct bkey_format *in_f, -+ const struct bkey_packed *in) -+{ -+ if (!bch2_bkey_transform_key(out_f, out, in_f, in)) -+ return false; -+ -+ memcpy_u64s((u64 *) out + out_f->key_u64s, -+ (u64 *) in + in_f->key_u64s, -+ (in->u64s - in_f->key_u64s)); -+ return true; -+} -+ -+#define bkey_fields() \ -+ 
x(BKEY_FIELD_INODE, p.inode) \ -+ x(BKEY_FIELD_OFFSET, p.offset) \ -+ x(BKEY_FIELD_SNAPSHOT, p.snapshot) \ -+ x(BKEY_FIELD_SIZE, size) \ -+ x(BKEY_FIELD_VERSION_HI, version.hi) \ -+ x(BKEY_FIELD_VERSION_LO, version.lo) -+ -+struct bkey __bch2_bkey_unpack_key(const struct bkey_format *format, -+ const struct bkey_packed *in) -+{ -+ struct unpack_state state = unpack_state_init(format, in); -+ struct bkey out; -+ -+ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); -+ EBUG_ON(in->u64s < format->key_u64s); -+ EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE); -+ EBUG_ON(in->u64s - format->key_u64s + BKEY_U64s > U8_MAX); -+ -+ out.u64s = BKEY_U64s + in->u64s - format->key_u64s; -+ out.format = KEY_FORMAT_CURRENT; -+ out.needs_whiteout = in->needs_whiteout; -+ out.type = in->type; -+ out.pad[0] = 0; -+ -+#define x(id, field) out.field = get_inc_field(&state, id); -+ bkey_fields() -+#undef x -+ -+ return out; -+} -+ -+#ifndef HAVE_BCACHEFS_COMPILED_UNPACK -+struct bpos __bkey_unpack_pos(const struct bkey_format *format, -+ const struct bkey_packed *in) -+{ -+ struct unpack_state state = unpack_state_init(format, in); -+ struct bpos out; -+ -+ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); -+ EBUG_ON(in->u64s < format->key_u64s); -+ EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE); -+ -+ out.inode = get_inc_field(&state, BKEY_FIELD_INODE); -+ out.offset = get_inc_field(&state, BKEY_FIELD_OFFSET); -+ out.snapshot = get_inc_field(&state, BKEY_FIELD_SNAPSHOT); -+ -+ return out; -+} -+#endif -+ -+/** -+ * bch2_bkey_pack_key -- pack just the key, not the value -+ */ -+bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in, -+ const struct bkey_format *format) -+{ -+ struct pack_state state = pack_state_init(format, out); -+ -+ EBUG_ON((void *) in == (void *) out); -+ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); -+ EBUG_ON(in->format != KEY_FORMAT_CURRENT); -+ -+ out->_data[0] = 0; -+ -+#define x(id, field) if (!set_inc_field(&state, id, in->field)) return false; -+ 
bkey_fields() -+#undef x -+ -+ /* -+ * Extents - we have to guarantee that if an extent is packed, a trimmed -+ * version will also pack: -+ */ -+ if (bkey_start_offset(in) < -+ le64_to_cpu(format->field_offset[BKEY_FIELD_OFFSET])) -+ return false; -+ -+ pack_state_finish(&state, out); -+ out->u64s = format->key_u64s + in->u64s - BKEY_U64s; -+ out->format = KEY_FORMAT_LOCAL_BTREE; -+ out->needs_whiteout = in->needs_whiteout; -+ out->type = in->type; -+ -+ bch2_bkey_pack_verify(out, in, format); -+ return true; -+} -+ -+/** -+ * bch2_bkey_unpack -- unpack the key and the value -+ */ -+void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst, -+ const struct bkey_packed *src) -+{ -+ __bkey_unpack_key(b, &dst->k, src); -+ -+ memcpy_u64s(&dst->v, -+ bkeyp_val(&b->format, src), -+ bkeyp_val_u64s(&b->format, src)); -+} -+ -+/** -+ * bch2_bkey_pack -- pack the key and the value -+ */ -+bool bch2_bkey_pack(struct bkey_packed *out, const struct bkey_i *in, -+ const struct bkey_format *format) -+{ -+ struct bkey_packed tmp; -+ -+ if (!bch2_bkey_pack_key(&tmp, &in->k, format)) -+ return false; -+ -+ memmove_u64s((u64 *) out + format->key_u64s, -+ &in->v, -+ bkey_val_u64s(&in->k)); -+ memcpy_u64s(out, &tmp, format->key_u64s); -+ -+ return true; -+} -+ -+__always_inline -+static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v) -+{ -+ unsigned bits = state->format->bits_per_field[field]; -+ u64 offset = le64_to_cpu(state->format->field_offset[field]); -+ bool ret = true; -+ -+ EBUG_ON(v < offset); -+ v -= offset; -+ -+ if (fls64(v) > bits) { -+ v = ~(~0ULL << bits); -+ ret = false; -+ } -+ -+ if (bits > state->bits) { -+ bits -= state->bits; -+ state->w |= (v >> 1) >> (bits - 1); -+ -+ *state->p = state->w; -+ state->p = next_word(state->p); -+ state->w = 0; -+ state->bits = 64; -+ } -+ -+ state->bits -= bits; -+ state->w |= v << state->bits; -+ -+ return ret; -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+static bool bkey_packed_successor(struct 
bkey_packed *out, -+ const struct btree *b, -+ struct bkey_packed k) -+{ -+ const struct bkey_format *f = &b->format; -+ unsigned nr_key_bits = b->nr_key_bits; -+ unsigned first_bit, offset; -+ u64 *p; -+ -+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); -+ -+ if (!nr_key_bits) -+ return false; -+ -+ *out = k; -+ -+ first_bit = high_bit_offset + nr_key_bits - 1; -+ p = nth_word(high_word(f, out), first_bit >> 6); -+ offset = 63 - (first_bit & 63); -+ -+ while (nr_key_bits) { -+ unsigned bits = min(64 - offset, nr_key_bits); -+ u64 mask = (~0ULL >> (64 - bits)) << offset; -+ -+ if ((*p & mask) != mask) { -+ *p += 1ULL << offset; -+ EBUG_ON(bkey_cmp_packed(b, out, &k) <= 0); -+ return true; -+ } -+ -+ *p &= ~mask; -+ p = prev_word(p); -+ nr_key_bits -= bits; -+ offset = 0; -+ } -+ -+ return false; -+} -+#endif -+ -+/* -+ * Returns a packed key that compares <= in -+ * -+ * This is used in bset_search_tree(), where we need a packed pos in order to be -+ * able to compare against the keys in the auxiliary search tree - and it's -+ * legal to use a packed pos that isn't equivalent to the original pos, -+ * _provided_ it compares <= to the original pos. 
-+ */ -+enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out, -+ struct bpos in, -+ const struct btree *b) -+{ -+ const struct bkey_format *f = &b->format; -+ struct pack_state state = pack_state_init(f, out); -+#ifdef CONFIG_BCACHEFS_DEBUG -+ struct bpos orig = in; -+#endif -+ bool exact = true; -+ -+ out->_data[0] = 0; -+ -+ if (unlikely(in.snapshot < -+ le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]))) { -+ if (!in.offset-- && -+ !in.inode--) -+ return BKEY_PACK_POS_FAIL; -+ in.snapshot = KEY_SNAPSHOT_MAX; -+ exact = false; -+ } -+ -+ if (unlikely(in.offset < -+ le64_to_cpu(f->field_offset[BKEY_FIELD_OFFSET]))) { -+ if (!in.inode--) -+ return BKEY_PACK_POS_FAIL; -+ in.offset = KEY_OFFSET_MAX; -+ in.snapshot = KEY_SNAPSHOT_MAX; -+ exact = false; -+ } -+ -+ if (unlikely(in.inode < -+ le64_to_cpu(f->field_offset[BKEY_FIELD_INODE]))) -+ return BKEY_PACK_POS_FAIL; -+ -+ if (!set_inc_field_lossy(&state, BKEY_FIELD_INODE, in.inode)) { -+ in.offset = KEY_OFFSET_MAX; -+ in.snapshot = KEY_SNAPSHOT_MAX; -+ exact = false; -+ } -+ -+ if (!set_inc_field_lossy(&state, BKEY_FIELD_OFFSET, in.offset)) { -+ in.snapshot = KEY_SNAPSHOT_MAX; -+ exact = false; -+ } -+ -+ if (!set_inc_field_lossy(&state, BKEY_FIELD_SNAPSHOT, in.snapshot)) -+ exact = false; -+ -+ pack_state_finish(&state, out); -+ out->u64s = f->key_u64s; -+ out->format = KEY_FORMAT_LOCAL_BTREE; -+ out->type = KEY_TYPE_deleted; -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ if (exact) { -+ BUG_ON(bkey_cmp_left_packed(b, out, &orig)); -+ } else { -+ struct bkey_packed successor; -+ -+ BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0); -+ BUG_ON(bkey_packed_successor(&successor, b, *out) && -+ bkey_cmp_left_packed(b, &successor, &orig) < 0); -+ } -+#endif -+ -+ return exact ? 
BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER; -+} -+ -+void bch2_bkey_format_init(struct bkey_format_state *s) -+{ -+ unsigned i; -+ -+ for (i = 0; i < ARRAY_SIZE(s->field_min); i++) -+ s->field_min[i] = U64_MAX; -+ -+ for (i = 0; i < ARRAY_SIZE(s->field_max); i++) -+ s->field_max[i] = 0; -+ -+ /* Make sure we can store a size of 0: */ -+ s->field_min[BKEY_FIELD_SIZE] = 0; -+} -+ -+static void __bkey_format_add(struct bkey_format_state *s, -+ unsigned field, u64 v) -+{ -+ s->field_min[field] = min(s->field_min[field], v); -+ s->field_max[field] = max(s->field_max[field], v); -+} -+ -+/* -+ * Changes @format so that @k can be successfully packed with @format -+ */ -+void bch2_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k) -+{ -+#define x(id, field) __bkey_format_add(s, id, k->field); -+ bkey_fields() -+#undef x -+ __bkey_format_add(s, BKEY_FIELD_OFFSET, bkey_start_offset(k)); -+} -+ -+void bch2_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p) -+{ -+ unsigned field = 0; -+ -+ __bkey_format_add(s, field++, p.inode); -+ __bkey_format_add(s, field++, p.offset); -+ __bkey_format_add(s, field++, p.snapshot); -+} -+ -+/* -+ * We don't want it to be possible for the packed format to represent fields -+ * bigger than a u64... that will cause confusion and issues (like with -+ * bkey_packed_successor()) -+ */ -+static void set_format_field(struct bkey_format *f, enum bch_bkey_fields i, -+ unsigned bits, u64 offset) -+{ -+ offset = bits == 64 ? 
0 : min(offset, U64_MAX - ((1ULL << bits) - 1)); -+ -+ f->bits_per_field[i] = bits; -+ f->field_offset[i] = cpu_to_le64(offset); -+} -+ -+struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s) -+{ -+ unsigned i, bits = KEY_PACKED_BITS_START; -+ struct bkey_format ret = { -+ .nr_fields = BKEY_NR_FIELDS, -+ }; -+ -+ for (i = 0; i < ARRAY_SIZE(s->field_min); i++) { -+ s->field_min[i] = min(s->field_min[i], s->field_max[i]); -+ -+ set_format_field(&ret, i, -+ fls64(s->field_max[i] - s->field_min[i]), -+ s->field_min[i]); -+ -+ bits += ret.bits_per_field[i]; -+ } -+ -+ /* allow for extent merging: */ -+ if (ret.bits_per_field[BKEY_FIELD_SIZE]) { -+ ret.bits_per_field[BKEY_FIELD_SIZE] += 4; -+ bits += 4; -+ } -+ -+ ret.key_u64s = DIV_ROUND_UP(bits, 64); -+ -+ /* if we have enough spare bits, round fields up to nearest byte */ -+ bits = ret.key_u64s * 64 - bits; -+ -+ for (i = 0; i < ARRAY_SIZE(ret.bits_per_field); i++) { -+ unsigned r = round_up(ret.bits_per_field[i], 8) - -+ ret.bits_per_field[i]; -+ -+ if (r <= bits) { -+ set_format_field(&ret, i, -+ ret.bits_per_field[i] + r, -+ le64_to_cpu(ret.field_offset[i])); -+ bits -= r; -+ } -+ } -+ -+ EBUG_ON(bch2_bkey_format_validate(&ret)); -+ return ret; -+} -+ -+const char *bch2_bkey_format_validate(struct bkey_format *f) -+{ -+ unsigned i, bits = KEY_PACKED_BITS_START; -+ -+ if (f->nr_fields != BKEY_NR_FIELDS) -+ return "incorrect number of fields"; -+ -+ for (i = 0; i < f->nr_fields; i++) { -+ u64 field_offset = le64_to_cpu(f->field_offset[i]); -+ -+ if (f->bits_per_field[i] > 64) -+ return "field too large"; -+ -+ if (field_offset && -+ (f->bits_per_field[i] == 64 || -+ (field_offset + ((1ULL << f->bits_per_field[i]) - 1) < -+ field_offset))) -+ return "offset + bits overflow"; -+ -+ bits += f->bits_per_field[i]; -+ } -+ -+ if (f->key_u64s != DIV_ROUND_UP(bits, 64)) -+ return "incorrect key_u64s"; -+ -+ return NULL; -+} -+ -+/* -+ * Most significant differing bit -+ * Bits are indexed from 0 - return is 
[0, nr_key_bits) -+ */ -+__pure -+unsigned bch2_bkey_greatest_differing_bit(const struct btree *b, -+ const struct bkey_packed *l_k, -+ const struct bkey_packed *r_k) -+{ -+ const u64 *l = high_word(&b->format, l_k); -+ const u64 *r = high_word(&b->format, r_k); -+ unsigned nr_key_bits = b->nr_key_bits; -+ unsigned word_bits = 64 - high_bit_offset; -+ u64 l_v, r_v; -+ -+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format)); -+ -+ /* for big endian, skip past header */ -+ l_v = *l & (~0ULL >> high_bit_offset); -+ r_v = *r & (~0ULL >> high_bit_offset); -+ -+ while (nr_key_bits) { -+ if (nr_key_bits < word_bits) { -+ l_v >>= word_bits - nr_key_bits; -+ r_v >>= word_bits - nr_key_bits; -+ nr_key_bits = 0; -+ } else { -+ nr_key_bits -= word_bits; -+ } -+ -+ if (l_v != r_v) -+ return fls64(l_v ^ r_v) - 1 + nr_key_bits; -+ -+ l = next_word(l); -+ r = next_word(r); -+ -+ l_v = *l; -+ r_v = *r; -+ word_bits = 64; -+ } -+ -+ return 0; -+} -+ -+/* -+ * First set bit -+ * Bits are indexed from 0 - return is [0, nr_key_bits) -+ */ -+__pure -+unsigned bch2_bkey_ffs(const struct btree *b, const struct bkey_packed *k) -+{ -+ const u64 *p = high_word(&b->format, k); -+ unsigned nr_key_bits = b->nr_key_bits; -+ unsigned ret = 0, offset; -+ -+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format)); -+ -+ offset = nr_key_bits; -+ while (offset > 64) { -+ p = next_word(p); -+ offset -= 64; -+ } -+ -+ offset = 64 - offset; -+ -+ while (nr_key_bits) { -+ unsigned bits = nr_key_bits + offset < 64 -+ ? 
nr_key_bits -+ : 64 - offset; -+ -+ u64 mask = (~0ULL >> (64 - bits)) << offset; -+ -+ if (*p & mask) -+ return ret + __ffs64(*p & mask) - offset; -+ -+ p = prev_word(p); -+ nr_key_bits -= bits; -+ ret += bits; -+ offset = 0; -+ } -+ -+ return 0; -+} -+ -+#ifdef CONFIG_X86_64 -+ -+static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, -+ unsigned nr_key_bits) -+{ -+ long d0, d1, d2, d3; -+ int cmp; -+ -+ /* we shouldn't need asm for this, but gcc is being retarded: */ -+ -+ asm(".intel_syntax noprefix;" -+ "xor eax, eax;" -+ "xor edx, edx;" -+ "1:;" -+ "mov r8, [rdi];" -+ "mov r9, [rsi];" -+ "sub ecx, 64;" -+ "jl 2f;" -+ -+ "cmp r8, r9;" -+ "jnz 3f;" -+ -+ "lea rdi, [rdi - 8];" -+ "lea rsi, [rsi - 8];" -+ "jmp 1b;" -+ -+ "2:;" -+ "not ecx;" -+ "shr r8, 1;" -+ "shr r9, 1;" -+ "shr r8, cl;" -+ "shr r9, cl;" -+ "cmp r8, r9;" -+ -+ "3:\n" -+ "seta al;" -+ "setb dl;" -+ "sub eax, edx;" -+ ".att_syntax prefix;" -+ : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp) -+ : "0" (l), "1" (r), "3" (nr_key_bits) -+ : "r8", "r9", "cc", "memory"); -+ -+ return cmp; -+} -+ -+#define I(_x) (*(out)++ = (_x)) -+#define I1(i0) I(i0) -+#define I2(i0, i1) (I1(i0), I(i1)) -+#define I3(i0, i1, i2) (I2(i0, i1), I(i2)) -+#define I4(i0, i1, i2, i3) (I3(i0, i1, i2), I(i3)) -+#define I5(i0, i1, i2, i3, i4) (I4(i0, i1, i2, i3), I(i4)) -+ -+static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out, -+ enum bch_bkey_fields field, -+ unsigned dst_offset, unsigned dst_size, -+ bool *eax_zeroed) -+{ -+ unsigned bits = format->bits_per_field[field]; -+ u64 offset = le64_to_cpu(format->field_offset[field]); -+ unsigned i, byte, bit_offset, align, shl, shr; -+ -+ if (!bits && !offset) { -+ if (!*eax_zeroed) { -+ /* xor eax, eax */ -+ I2(0x31, 0xc0); -+ } -+ -+ *eax_zeroed = true; -+ goto set_field; -+ } -+ -+ if (!bits) { -+ /* just return offset: */ -+ -+ switch (dst_size) { -+ case 8: -+ if (offset > S32_MAX) { -+ /* mov [rdi + dst_offset], offset */ -+ I3(0xc7, 0x47, 
dst_offset); -+ memcpy(out, &offset, 4); -+ out += 4; -+ -+ I3(0xc7, 0x47, dst_offset + 4); -+ memcpy(out, (void *) &offset + 4, 4); -+ out += 4; -+ } else { -+ /* mov [rdi + dst_offset], offset */ -+ /* sign extended */ -+ I4(0x48, 0xc7, 0x47, dst_offset); -+ memcpy(out, &offset, 4); -+ out += 4; -+ } -+ break; -+ case 4: -+ /* mov [rdi + dst_offset], offset */ -+ I3(0xc7, 0x47, dst_offset); -+ memcpy(out, &offset, 4); -+ out += 4; -+ break; -+ default: -+ BUG(); -+ } -+ -+ return out; -+ } -+ -+ bit_offset = format->key_u64s * 64; -+ for (i = 0; i <= field; i++) -+ bit_offset -= format->bits_per_field[i]; -+ -+ byte = bit_offset / 8; -+ bit_offset -= byte * 8; -+ -+ *eax_zeroed = false; -+ -+ if (bit_offset == 0 && bits == 8) { -+ /* movzx eax, BYTE PTR [rsi + imm8] */ -+ I4(0x0f, 0xb6, 0x46, byte); -+ } else if (bit_offset == 0 && bits == 16) { -+ /* movzx eax, WORD PTR [rsi + imm8] */ -+ I4(0x0f, 0xb7, 0x46, byte); -+ } else if (bit_offset + bits <= 32) { -+ align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3); -+ byte -= align; -+ bit_offset += align * 8; -+ -+ BUG_ON(bit_offset + bits > 32); -+ -+ /* mov eax, [rsi + imm8] */ -+ I3(0x8b, 0x46, byte); -+ -+ if (bit_offset) { -+ /* shr eax, imm8 */ -+ I3(0xc1, 0xe8, bit_offset); -+ } -+ -+ if (bit_offset + bits < 32) { -+ unsigned mask = ~0U >> (32 - bits); -+ -+ /* and eax, imm32 */ -+ I1(0x25); -+ memcpy(out, &mask, 4); -+ out += 4; -+ } -+ } else if (bit_offset + bits <= 64) { -+ align = min(8 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 7); -+ byte -= align; -+ bit_offset += align * 8; -+ -+ BUG_ON(bit_offset + bits > 64); -+ -+ /* mov rax, [rsi + imm8] */ -+ I4(0x48, 0x8b, 0x46, byte); -+ -+ shl = 64 - bit_offset - bits; -+ shr = bit_offset + shl; -+ -+ if (shl) { -+ /* shl rax, imm8 */ -+ I4(0x48, 0xc1, 0xe0, shl); -+ } -+ -+ if (shr) { -+ /* shr rax, imm8 */ -+ I4(0x48, 0xc1, 0xe8, shr); -+ } -+ } else { -+ align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3); -+ byte -= align; -+ 
bit_offset += align * 8; -+ -+ BUG_ON(bit_offset + bits > 96); -+ -+ /* mov rax, [rsi + byte] */ -+ I4(0x48, 0x8b, 0x46, byte); -+ -+ /* mov edx, [rsi + byte + 8] */ -+ I3(0x8b, 0x56, byte + 8); -+ -+ /* bits from next word: */ -+ shr = bit_offset + bits - 64; -+ BUG_ON(shr > bit_offset); -+ -+ /* shr rax, bit_offset */ -+ I4(0x48, 0xc1, 0xe8, shr); -+ -+ /* shl rdx, imm8 */ -+ I4(0x48, 0xc1, 0xe2, 64 - shr); -+ -+ /* or rax, rdx */ -+ I3(0x48, 0x09, 0xd0); -+ -+ shr = bit_offset - shr; -+ -+ if (shr) { -+ /* shr rax, imm8 */ -+ I4(0x48, 0xc1, 0xe8, shr); -+ } -+ } -+ -+ /* rax += offset: */ -+ if (offset > S32_MAX) { -+ /* mov rdx, imm64 */ -+ I2(0x48, 0xba); -+ memcpy(out, &offset, 8); -+ out += 8; -+ /* add %rdx, %rax */ -+ I3(0x48, 0x01, 0xd0); -+ } else if (offset + (~0ULL >> (64 - bits)) > U32_MAX) { -+ /* add rax, imm32 */ -+ I2(0x48, 0x05); -+ memcpy(out, &offset, 4); -+ out += 4; -+ } else if (offset) { -+ /* add eax, imm32 */ -+ I1(0x05); -+ memcpy(out, &offset, 4); -+ out += 4; -+ } -+set_field: -+ switch (dst_size) { -+ case 8: -+ /* mov [rdi + dst_offset], rax */ -+ I4(0x48, 0x89, 0x47, dst_offset); -+ break; -+ case 4: -+ /* mov [rdi + dst_offset], eax */ -+ I3(0x89, 0x47, dst_offset); -+ break; -+ default: -+ BUG(); -+ } -+ -+ return out; -+} -+ -+int bch2_compile_bkey_format(const struct bkey_format *format, void *_out) -+{ -+ bool eax_zeroed = false; -+ u8 *out = _out; -+ -+ /* -+ * rdi: dst - unpacked key -+ * rsi: src - packed key -+ */ -+ -+ /* k->u64s, k->format, k->type */ -+ -+ /* mov eax, [rsi] */ -+ I2(0x8b, 0x06); -+ -+ /* add eax, BKEY_U64s - format->key_u64s */ -+ I5(0x05, BKEY_U64s - format->key_u64s, KEY_FORMAT_CURRENT, 0, 0); -+ -+ /* and eax, imm32: mask out k->pad: */ -+ I5(0x25, 0xff, 0xff, 0xff, 0); -+ -+ /* mov [rdi], eax */ -+ I2(0x89, 0x07); -+ -+#define x(id, field) \ -+ out = compile_bkey_field(format, out, id, \ -+ offsetof(struct bkey, field), \ -+ sizeof(((struct bkey *) NULL)->field), \ -+ &eax_zeroed); -+ bkey_fields() 
-+#undef x -+ -+ /* retq */ -+ I1(0xc3); -+ -+ return (void *) out - _out; -+} -+ -+#else -+static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, -+ unsigned nr_key_bits) -+{ -+ u64 l_v, r_v; -+ -+ if (!nr_key_bits) -+ return 0; -+ -+ /* for big endian, skip past header */ -+ nr_key_bits += high_bit_offset; -+ l_v = *l & (~0ULL >> high_bit_offset); -+ r_v = *r & (~0ULL >> high_bit_offset); -+ -+ while (1) { -+ if (nr_key_bits < 64) { -+ l_v >>= 64 - nr_key_bits; -+ r_v >>= 64 - nr_key_bits; -+ nr_key_bits = 0; -+ } else { -+ nr_key_bits -= 64; -+ } -+ -+ if (!nr_key_bits || l_v != r_v) -+ break; -+ -+ l = next_word(l); -+ r = next_word(r); -+ -+ l_v = *l; -+ r_v = *r; -+ } -+ -+ return cmp_int(l_v, r_v); -+} -+#endif -+ -+__pure -+int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l, -+ const struct bkey_packed *r, -+ const struct btree *b) -+{ -+ const struct bkey_format *f = &b->format; -+ int ret; -+ -+ EBUG_ON(!bkey_packed(l) || !bkey_packed(r)); -+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); -+ -+ ret = __bkey_cmp_bits(high_word(f, l), -+ high_word(f, r), -+ b->nr_key_bits); -+ -+ EBUG_ON(ret != bkey_cmp(bkey_unpack_pos(b, l), -+ bkey_unpack_pos(b, r))); -+ return ret; -+} -+ -+__pure __flatten -+int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b, -+ const struct bkey_packed *l, -+ const struct bpos *r) -+{ -+ return bkey_cmp(bkey_unpack_pos_format_checked(b, l), *r); -+} -+ -+__pure __flatten -+int __bch2_bkey_cmp_packed(const struct bkey_packed *l, -+ const struct bkey_packed *r, -+ const struct btree *b) -+{ -+ struct bkey unpacked; -+ -+ if (likely(bkey_packed(l) && bkey_packed(r))) -+ return __bch2_bkey_cmp_packed_format_checked(l, r, b); -+ -+ if (bkey_packed(l)) { -+ __bkey_unpack_key_format_checked(b, &unpacked, l); -+ l = (void*) &unpacked; -+ } else if (bkey_packed(r)) { -+ __bkey_unpack_key_format_checked(b, &unpacked, r); -+ r = (void*) &unpacked; -+ } -+ -+ return bkey_cmp(((struct bkey *) l)->p, 
((struct bkey *) r)->p); -+} -+ -+__pure __flatten -+int __bch2_bkey_cmp_left_packed(const struct btree *b, -+ const struct bkey_packed *l, -+ const struct bpos *r) -+{ -+ const struct bkey *l_unpacked; -+ -+ return unlikely(l_unpacked = packed_to_bkey_c(l)) -+ ? bkey_cmp(l_unpacked->p, *r) -+ : __bch2_bkey_cmp_left_packed_format_checked(b, l, r); -+} -+ -+void bch2_bpos_swab(struct bpos *p) -+{ -+ u8 *l = (u8 *) p; -+ u8 *h = ((u8 *) &p[1]) - 1; -+ -+ while (l < h) { -+ swap(*l, *h); -+ l++; -+ --h; -+ } -+} -+ -+void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k) -+{ -+ const struct bkey_format *f = bkey_packed(k) ? _f : &bch2_bkey_format_current; -+ u8 *l = k->key_start; -+ u8 *h = (u8 *) (k->_data + f->key_u64s) - 1; -+ -+ while (l < h) { -+ swap(*l, *h); -+ l++; -+ --h; -+ } -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_bkey_pack_test(void) -+{ -+ struct bkey t = KEY(4134ULL, 1250629070527416633ULL, 0); -+ struct bkey_packed p; -+ -+ struct bkey_format test_format = { -+ .key_u64s = 2, -+ .nr_fields = BKEY_NR_FIELDS, -+ .bits_per_field = { -+ 13, -+ 64, -+ }, -+ }; -+ -+ struct unpack_state in_s = -+ unpack_state_init(&bch2_bkey_format_current, (void *) &t); -+ struct pack_state out_s = pack_state_init(&test_format, &p); -+ unsigned i; -+ -+ for (i = 0; i < out_s.format->nr_fields; i++) { -+ u64 a, v = get_inc_field(&in_s, i); -+ -+ switch (i) { -+#define x(id, field) case id: a = t.field; break; -+ bkey_fields() -+#undef x -+ default: -+ BUG(); -+ } -+ -+ if (a != v) -+ panic("got %llu actual %llu i %u\n", v, a, i); -+ -+ if (!set_inc_field(&out_s, i, v)) -+ panic("failed at %u\n", i); -+ } -+ -+ BUG_ON(!bch2_bkey_pack_key(&p, &t, &test_format)); -+} -+#endif -diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h -new file mode 100644 -index 000000000000..cbcfbd26bc58 ---- /dev/null -+++ b/fs/bcachefs/bkey.h -@@ -0,0 +1,605 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BKEY_H -+#define _BCACHEFS_BKEY_H -+ 
-+#include -+#include "bcachefs_format.h" -+ -+#include "util.h" -+#include "vstructs.h" -+ -+#ifdef CONFIG_X86_64 -+#define HAVE_BCACHEFS_COMPILED_UNPACK 1 -+#endif -+ -+void bch2_to_binary(char *, const u64 *, unsigned); -+ -+/* bkey with split value, const */ -+struct bkey_s_c { -+ const struct bkey *k; -+ const struct bch_val *v; -+}; -+ -+/* bkey with split value */ -+struct bkey_s { -+ union { -+ struct { -+ struct bkey *k; -+ struct bch_val *v; -+ }; -+ struct bkey_s_c s_c; -+ }; -+}; -+ -+#define bkey_next(_k) vstruct_next(_k) -+ -+static inline struct bkey_packed *bkey_next_skip_noops(struct bkey_packed *k, -+ struct bkey_packed *end) -+{ -+ k = bkey_next(k); -+ -+ while (k != end && !k->u64s) -+ k = (void *) ((u64 *) k + 1); -+ return k; -+} -+ -+#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) -+ -+static inline size_t bkey_val_bytes(const struct bkey *k) -+{ -+ return bkey_val_u64s(k) * sizeof(u64); -+} -+ -+static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s) -+{ -+ k->u64s = BKEY_U64s + val_u64s; -+} -+ -+static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) -+{ -+ k->u64s = BKEY_U64s + DIV_ROUND_UP(bytes, sizeof(u64)); -+} -+ -+#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k))) -+ -+#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted) -+ -+#define bkey_whiteout(_k) \ -+ ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_discard) -+ -+#define bkey_packed_typecheck(_k) \ -+({ \ -+ BUILD_BUG_ON(!type_is(_k, struct bkey *) && \ -+ !type_is(_k, struct bkey_packed *)); \ -+ type_is(_k, struct bkey_packed *); \ -+}) -+ -+enum bkey_lr_packed { -+ BKEY_PACKED_BOTH, -+ BKEY_PACKED_RIGHT, -+ BKEY_PACKED_LEFT, -+ BKEY_PACKED_NONE, -+}; -+ -+#define bkey_lr_packed_typecheck(_l, _r) \ -+ (!bkey_packed_typecheck(_l) + ((!bkey_packed_typecheck(_r)) << 1)) -+ -+#define bkey_lr_packed(_l, _r) \ -+ ((_l)->format + ((_r)->format << 1)) -+ -+#define bkey_copy(_dst, _src) \ -+do { \ -+ 
BUILD_BUG_ON(!type_is(_dst, struct bkey_i *) && \ -+ !type_is(_dst, struct bkey_packed *)); \ -+ BUILD_BUG_ON(!type_is(_src, struct bkey_i *) && \ -+ !type_is(_src, struct bkey_packed *)); \ -+ EBUG_ON((u64 *) (_dst) > (u64 *) (_src) && \ -+ (u64 *) (_dst) < (u64 *) (_src) + \ -+ ((struct bkey *) (_src))->u64s); \ -+ \ -+ memcpy_u64s_small((_dst), (_src), \ -+ ((struct bkey *) (_src))->u64s); \ -+} while (0) -+ -+struct btree; -+ -+struct bkey_format_state { -+ u64 field_min[BKEY_NR_FIELDS]; -+ u64 field_max[BKEY_NR_FIELDS]; -+}; -+ -+void bch2_bkey_format_init(struct bkey_format_state *); -+void bch2_bkey_format_add_key(struct bkey_format_state *, const struct bkey *); -+void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos); -+struct bkey_format bch2_bkey_format_done(struct bkey_format_state *); -+const char *bch2_bkey_format_validate(struct bkey_format *); -+ -+__pure -+unsigned bch2_bkey_greatest_differing_bit(const struct btree *, -+ const struct bkey_packed *, -+ const struct bkey_packed *); -+__pure -+unsigned bch2_bkey_ffs(const struct btree *, const struct bkey_packed *); -+ -+__pure -+int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *, -+ const struct bkey_packed *, -+ const struct btree *); -+ -+__pure -+int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *, -+ const struct bkey_packed *, -+ const struct bpos *); -+ -+__pure -+int __bch2_bkey_cmp_packed(const struct bkey_packed *, -+ const struct bkey_packed *, -+ const struct btree *); -+ -+__pure -+int __bch2_bkey_cmp_left_packed(const struct btree *, -+ const struct bkey_packed *, -+ const struct bpos *); -+ -+static inline __pure -+int bkey_cmp_left_packed(const struct btree *b, -+ const struct bkey_packed *l, const struct bpos *r) -+{ -+ return __bch2_bkey_cmp_left_packed(b, l, r); -+} -+ -+/* -+ * we prefer to pass bpos by ref, but it's often enough terribly convenient to -+ * pass it by by val... 
as much as I hate c++, const ref would be nice here: -+ */ -+__pure __flatten -+static inline int bkey_cmp_left_packed_byval(const struct btree *b, -+ const struct bkey_packed *l, -+ struct bpos r) -+{ -+ return bkey_cmp_left_packed(b, l, &r); -+} -+ -+/* -+ * If @_l or @_r are struct bkey * (not bkey_packed *), uses type information to -+ * skip dispatching on k->format: -+ */ -+#define bkey_cmp_packed(_b, _l, _r) \ -+({ \ -+ int _cmp; \ -+ \ -+ switch (bkey_lr_packed_typecheck(_l, _r)) { \ -+ case BKEY_PACKED_NONE: \ -+ _cmp = bkey_cmp(((struct bkey *) (_l))->p, \ -+ ((struct bkey *) (_r))->p); \ -+ break; \ -+ case BKEY_PACKED_LEFT: \ -+ _cmp = bkey_cmp_left_packed((_b), \ -+ (struct bkey_packed *) (_l), \ -+ &((struct bkey *) (_r))->p); \ -+ break; \ -+ case BKEY_PACKED_RIGHT: \ -+ _cmp = -bkey_cmp_left_packed((_b), \ -+ (struct bkey_packed *) (_r), \ -+ &((struct bkey *) (_l))->p); \ -+ break; \ -+ case BKEY_PACKED_BOTH: \ -+ _cmp = __bch2_bkey_cmp_packed((void *) (_l), \ -+ (void *) (_r), (_b)); \ -+ break; \ -+ } \ -+ _cmp; \ -+}) -+ -+#if 1 -+static __always_inline int bkey_cmp(struct bpos l, struct bpos r) -+{ -+ if (l.inode != r.inode) -+ return l.inode < r.inode ? -1 : 1; -+ if (l.offset != r.offset) -+ return l.offset < r.offset ? -1 : 1; -+ if (l.snapshot != r.snapshot) -+ return l.snapshot < r.snapshot ? -1 : 1; -+ return 0; -+} -+#else -+int bkey_cmp(struct bpos l, struct bpos r); -+#endif -+ -+static inline struct bpos bpos_min(struct bpos l, struct bpos r) -+{ -+ return bkey_cmp(l, r) < 0 ? 
l : r; -+} -+ -+void bch2_bpos_swab(struct bpos *); -+void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); -+ -+static __always_inline int bversion_cmp(struct bversion l, struct bversion r) -+{ -+ return cmp_int(l.hi, r.hi) ?: -+ cmp_int(l.lo, r.lo); -+} -+ -+#define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 }) -+#define MAX_VERSION ((struct bversion) { .hi = ~0, .lo = ~0ULL }) -+ -+static __always_inline int bversion_zero(struct bversion v) -+{ -+ return !bversion_cmp(v, ZERO_VERSION); -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+/* statement expressions confusing unlikely()? */ -+#define bkey_packed(_k) \ -+ ({ EBUG_ON((_k)->format > KEY_FORMAT_CURRENT); \ -+ (_k)->format != KEY_FORMAT_CURRENT; }) -+#else -+#define bkey_packed(_k) ((_k)->format != KEY_FORMAT_CURRENT) -+#endif -+ -+/* -+ * It's safe to treat an unpacked bkey as a packed one, but not the reverse -+ */ -+static inline struct bkey_packed *bkey_to_packed(struct bkey_i *k) -+{ -+ return (struct bkey_packed *) k; -+} -+ -+static inline const struct bkey_packed *bkey_to_packed_c(const struct bkey_i *k) -+{ -+ return (const struct bkey_packed *) k; -+} -+ -+static inline struct bkey_i *packed_to_bkey(struct bkey_packed *k) -+{ -+ return bkey_packed(k) ? NULL : (struct bkey_i *) k; -+} -+ -+static inline const struct bkey *packed_to_bkey_c(const struct bkey_packed *k) -+{ -+ return bkey_packed(k) ? 
NULL : (const struct bkey *) k; -+} -+ -+static inline unsigned bkey_format_key_bits(const struct bkey_format *format) -+{ -+ return format->bits_per_field[BKEY_FIELD_INODE] + -+ format->bits_per_field[BKEY_FIELD_OFFSET] + -+ format->bits_per_field[BKEY_FIELD_SNAPSHOT]; -+} -+ -+static inline struct bpos bkey_successor(struct bpos p) -+{ -+ struct bpos ret = p; -+ -+ if (!++ret.offset) -+ BUG_ON(!++ret.inode); -+ -+ return ret; -+} -+ -+static inline struct bpos bkey_predecessor(struct bpos p) -+{ -+ struct bpos ret = p; -+ -+ if (!ret.offset--) -+ BUG_ON(!ret.inode--); -+ -+ return ret; -+} -+ -+static inline u64 bkey_start_offset(const struct bkey *k) -+{ -+ return k->p.offset - k->size; -+} -+ -+static inline struct bpos bkey_start_pos(const struct bkey *k) -+{ -+ return (struct bpos) { -+ .inode = k->p.inode, -+ .offset = bkey_start_offset(k), -+ .snapshot = k->p.snapshot, -+ }; -+} -+ -+/* Packed helpers */ -+ -+static inline unsigned bkeyp_key_u64s(const struct bkey_format *format, -+ const struct bkey_packed *k) -+{ -+ unsigned ret = bkey_packed(k) ? 
format->key_u64s : BKEY_U64s; -+ -+ EBUG_ON(k->u64s < ret); -+ return ret; -+} -+ -+static inline unsigned bkeyp_key_bytes(const struct bkey_format *format, -+ const struct bkey_packed *k) -+{ -+ return bkeyp_key_u64s(format, k) * sizeof(u64); -+} -+ -+static inline unsigned bkeyp_val_u64s(const struct bkey_format *format, -+ const struct bkey_packed *k) -+{ -+ return k->u64s - bkeyp_key_u64s(format, k); -+} -+ -+static inline size_t bkeyp_val_bytes(const struct bkey_format *format, -+ const struct bkey_packed *k) -+{ -+ return bkeyp_val_u64s(format, k) * sizeof(u64); -+} -+ -+static inline void set_bkeyp_val_u64s(const struct bkey_format *format, -+ struct bkey_packed *k, unsigned val_u64s) -+{ -+ k->u64s = bkeyp_key_u64s(format, k) + val_u64s; -+} -+ -+#define bkeyp_val(_format, _k) \ -+ ((struct bch_val *) ((_k)->_data + bkeyp_key_u64s(_format, _k))) -+ -+extern const struct bkey_format bch2_bkey_format_current; -+ -+bool bch2_bkey_transform(const struct bkey_format *, -+ struct bkey_packed *, -+ const struct bkey_format *, -+ const struct bkey_packed *); -+ -+struct bkey __bch2_bkey_unpack_key(const struct bkey_format *, -+ const struct bkey_packed *); -+ -+#ifndef HAVE_BCACHEFS_COMPILED_UNPACK -+struct bpos __bkey_unpack_pos(const struct bkey_format *, -+ const struct bkey_packed *); -+#endif -+ -+bool bch2_bkey_pack_key(struct bkey_packed *, const struct bkey *, -+ const struct bkey_format *); -+ -+enum bkey_pack_pos_ret { -+ BKEY_PACK_POS_EXACT, -+ BKEY_PACK_POS_SMALLER, -+ BKEY_PACK_POS_FAIL, -+}; -+ -+enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *, struct bpos, -+ const struct btree *); -+ -+static inline bool bkey_pack_pos(struct bkey_packed *out, struct bpos in, -+ const struct btree *b) -+{ -+ return bch2_bkey_pack_pos_lossy(out, in, b) == BKEY_PACK_POS_EXACT; -+} -+ -+void bch2_bkey_unpack(const struct btree *, struct bkey_i *, -+ const struct bkey_packed *); -+bool bch2_bkey_pack(struct bkey_packed *, const struct bkey_i *, -+ 
const struct bkey_format *); -+ -+static inline u64 bkey_field_max(const struct bkey_format *f, -+ enum bch_bkey_fields nr) -+{ -+ return f->bits_per_field[nr] < 64 -+ ? (le64_to_cpu(f->field_offset[nr]) + -+ ~(~0ULL << f->bits_per_field[nr])) -+ : U64_MAX; -+} -+ -+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK -+ -+int bch2_compile_bkey_format(const struct bkey_format *, void *); -+ -+#else -+ -+static inline int bch2_compile_bkey_format(const struct bkey_format *format, -+ void *out) { return 0; } -+ -+#endif -+ -+static inline void bkey_reassemble(struct bkey_i *dst, -+ struct bkey_s_c src) -+{ -+ dst->k = *src.k; -+ memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k)); -+} -+ -+#define bkey_s_null ((struct bkey_s) { .k = NULL }) -+#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL }) -+ -+#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) }) -+#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) }) -+ -+static inline struct bkey_s bkey_to_s(struct bkey *k) -+{ -+ return (struct bkey_s) { .k = k, .v = NULL }; -+} -+ -+static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k) -+{ -+ return (struct bkey_s_c) { .k = k, .v = NULL }; -+} -+ -+static inline struct bkey_s bkey_i_to_s(struct bkey_i *k) -+{ -+ return (struct bkey_s) { .k = &k->k, .v = &k->v }; -+} -+ -+static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k) -+{ -+ return (struct bkey_s_c) { .k = &k->k, .v = &k->v }; -+} -+ -+/* -+ * For a given type of value (e.g. struct bch_extent), generates the types for -+ * bkey + bch_extent - inline, split, split const - and also all the conversion -+ * functions, which also check that the value is of the correct type. -+ * -+ * We use anonymous unions for upcasting - e.g. converting from e.g. a -+ * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion -+ * functions. 
-+ */ -+#define BKEY_VAL_ACCESSORS(name) \ -+struct bkey_i_##name { \ -+ union { \ -+ struct bkey k; \ -+ struct bkey_i k_i; \ -+ }; \ -+ struct bch_##name v; \ -+}; \ -+ \ -+struct bkey_s_c_##name { \ -+ union { \ -+ struct { \ -+ const struct bkey *k; \ -+ const struct bch_##name *v; \ -+ }; \ -+ struct bkey_s_c s_c; \ -+ }; \ -+}; \ -+ \ -+struct bkey_s_##name { \ -+ union { \ -+ struct { \ -+ struct bkey *k; \ -+ struct bch_##name *v; \ -+ }; \ -+ struct bkey_s_c_##name c; \ -+ struct bkey_s s; \ -+ struct bkey_s_c s_c; \ -+ }; \ -+}; \ -+ \ -+static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \ -+{ \ -+ EBUG_ON(k->k.type != KEY_TYPE_##name); \ -+ return container_of(&k->k, struct bkey_i_##name, k); \ -+} \ -+ \ -+static inline const struct bkey_i_##name * \ -+bkey_i_to_##name##_c(const struct bkey_i *k) \ -+{ \ -+ EBUG_ON(k->k.type != KEY_TYPE_##name); \ -+ return container_of(&k->k, struct bkey_i_##name, k); \ -+} \ -+ \ -+static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \ -+{ \ -+ EBUG_ON(k.k->type != KEY_TYPE_##name); \ -+ return (struct bkey_s_##name) { \ -+ .k = k.k, \ -+ .v = container_of(k.v, struct bch_##name, v), \ -+ }; \ -+} \ -+ \ -+static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\ -+{ \ -+ EBUG_ON(k.k->type != KEY_TYPE_##name); \ -+ return (struct bkey_s_c_##name) { \ -+ .k = k.k, \ -+ .v = container_of(k.v, struct bch_##name, v), \ -+ }; \ -+} \ -+ \ -+static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\ -+{ \ -+ return (struct bkey_s_##name) { \ -+ .k = &k->k, \ -+ .v = &k->v, \ -+ }; \ -+} \ -+ \ -+static inline struct bkey_s_c_##name \ -+name##_i_to_s_c(const struct bkey_i_##name *k) \ -+{ \ -+ return (struct bkey_s_c_##name) { \ -+ .k = &k->k, \ -+ .v = &k->v, \ -+ }; \ -+} \ -+ \ -+static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \ -+{ \ -+ EBUG_ON(k->k.type != KEY_TYPE_##name); \ -+ return (struct bkey_s_##name) { \ -+ .k = &k->k, 
\ -+ .v = container_of(&k->v, struct bch_##name, v), \ -+ }; \ -+} \ -+ \ -+static inline struct bkey_s_c_##name \ -+bkey_i_to_s_c_##name(const struct bkey_i *k) \ -+{ \ -+ EBUG_ON(k->k.type != KEY_TYPE_##name); \ -+ return (struct bkey_s_c_##name) { \ -+ .k = &k->k, \ -+ .v = container_of(&k->v, struct bch_##name, v), \ -+ }; \ -+} \ -+ \ -+static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ -+{ \ -+ struct bkey_i_##name *k = \ -+ container_of(&_k->k, struct bkey_i_##name, k); \ -+ \ -+ bkey_init(&k->k); \ -+ memset(&k->v, 0, sizeof(k->v)); \ -+ k->k.type = KEY_TYPE_##name; \ -+ set_bkey_val_bytes(&k->k, sizeof(k->v)); \ -+ \ -+ return k; \ -+} -+ -+BKEY_VAL_ACCESSORS(cookie); -+BKEY_VAL_ACCESSORS(btree_ptr); -+BKEY_VAL_ACCESSORS(extent); -+BKEY_VAL_ACCESSORS(reservation); -+BKEY_VAL_ACCESSORS(inode); -+BKEY_VAL_ACCESSORS(inode_generation); -+BKEY_VAL_ACCESSORS(dirent); -+BKEY_VAL_ACCESSORS(xattr); -+BKEY_VAL_ACCESSORS(alloc); -+BKEY_VAL_ACCESSORS(quota); -+BKEY_VAL_ACCESSORS(stripe); -+BKEY_VAL_ACCESSORS(reflink_p); -+BKEY_VAL_ACCESSORS(reflink_v); -+BKEY_VAL_ACCESSORS(inline_data); -+BKEY_VAL_ACCESSORS(btree_ptr_v2); -+ -+/* byte order helpers */ -+ -+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -+ -+static inline unsigned high_word_offset(const struct bkey_format *f) -+{ -+ return f->key_u64s - 1; -+} -+ -+#define high_bit_offset 0 -+#define nth_word(p, n) ((p) - (n)) -+ -+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -+ -+static inline unsigned high_word_offset(const struct bkey_format *f) -+{ -+ return 0; -+} -+ -+#define high_bit_offset KEY_PACKED_BITS_START -+#define nth_word(p, n) ((p) + (n)) -+ -+#else -+#error edit for your odd byteorder. 
-+#endif -+ -+#define high_word(f, k) ((k)->_data + high_word_offset(f)) -+#define next_word(p) nth_word(p, 1) -+#define prev_word(p) nth_word(p, -1) -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_bkey_pack_test(void); -+#else -+static inline void bch2_bkey_pack_test(void) {} -+#endif -+ -+#endif /* _BCACHEFS_BKEY_H */ -diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c -new file mode 100644 -index 000000000000..36e0c5152b47 ---- /dev/null -+++ b/fs/bcachefs/bkey_methods.c -@@ -0,0 +1,353 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "bkey_methods.h" -+#include "btree_types.h" -+#include "alloc_background.h" -+#include "dirent.h" -+#include "ec.h" -+#include "error.h" -+#include "extents.h" -+#include "inode.h" -+#include "quota.h" -+#include "reflink.h" -+#include "xattr.h" -+ -+const char * const bch2_bkey_types[] = { -+#define x(name, nr) #name, -+ BCH_BKEY_TYPES() -+#undef x -+ NULL -+}; -+ -+static const char *deleted_key_invalid(const struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ return NULL; -+} -+ -+#define bch2_bkey_ops_deleted (struct bkey_ops) { \ -+ .key_invalid = deleted_key_invalid, \ -+} -+ -+#define bch2_bkey_ops_discard (struct bkey_ops) { \ -+ .key_invalid = deleted_key_invalid, \ -+} -+ -+static const char *empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k) -+{ -+ if (bkey_val_bytes(k.k)) -+ return "value size should be zero"; -+ -+ return NULL; -+} -+ -+#define bch2_bkey_ops_error (struct bkey_ops) { \ -+ .key_invalid = empty_val_key_invalid, \ -+} -+ -+static const char *key_type_cookie_invalid(const struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ if (bkey_val_bytes(k.k) != sizeof(struct bch_cookie)) -+ return "incorrect value size"; -+ -+ return NULL; -+} -+ -+#define bch2_bkey_ops_cookie (struct bkey_ops) { \ -+ .key_invalid = key_type_cookie_invalid, \ -+} -+ -+#define bch2_bkey_ops_whiteout (struct bkey_ops) { \ -+ .key_invalid = empty_val_key_invalid, \ -+} -+ -+static const 
char *key_type_inline_data_invalid(const struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ return NULL; -+} -+ -+static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ pr_buf(out, "(%zu bytes)", bkey_val_bytes(k.k)); -+} -+ -+#define bch2_bkey_ops_inline_data (struct bkey_ops) { \ -+ .key_invalid = key_type_inline_data_invalid, \ -+ .val_to_text = key_type_inline_data_to_text, \ -+} -+ -+static const struct bkey_ops bch2_bkey_ops[] = { -+#define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name, -+ BCH_BKEY_TYPES() -+#undef x -+}; -+ -+const char *bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k) -+{ -+ if (k.k->type >= KEY_TYPE_MAX) -+ return "invalid type"; -+ -+ return bch2_bkey_ops[k.k->type].key_invalid(c, k); -+} -+ -+const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, -+ enum btree_node_type type) -+{ -+ if (k.k->u64s < BKEY_U64s) -+ return "u64s too small"; -+ -+ if (type == BKEY_TYPE_BTREE && -+ bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) -+ return "value too big"; -+ -+ if (btree_node_type_is_extents(type)) { -+ if ((k.k->size == 0) != bkey_deleted(k.k)) -+ return "bad size field"; -+ -+ if (k.k->size > k.k->p.offset) -+ return "size greater than offset"; -+ } else { -+ if (k.k->size) -+ return "nonzero size field"; -+ } -+ -+ if (k.k->p.snapshot) -+ return "nonzero snapshot"; -+ -+ if (type != BKEY_TYPE_BTREE && -+ !bkey_cmp(k.k->p, POS_MAX)) -+ return "POS_MAX key"; -+ -+ return NULL; -+} -+ -+const char *bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, -+ enum btree_node_type type) -+{ -+ return __bch2_bkey_invalid(c, k, type) ?: -+ bch2_bkey_val_invalid(c, k); -+} -+ -+const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k) -+{ -+ if (bkey_cmp(k.k->p, b->data->min_key) < 0) -+ return "key before start of btree node"; -+ -+ if (bkey_cmp(k.k->p, b->data->max_key) > 0) -+ return "key past end of btree node"; -+ -+ return NULL; -+} -+ -+void 
bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) -+{ -+ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; -+ const char *invalid; -+ -+ BUG_ON(!k.k->u64s); -+ -+ invalid = bch2_bkey_invalid(c, k, btree_node_type(b)) ?: -+ bch2_bkey_in_btree_node(b, k); -+ if (invalid) { -+ char buf[160]; -+ -+ bch2_bkey_val_to_text(&PBUF(buf), c, k); -+ bch2_fs_inconsistent(c, "invalid bkey %s: %s", buf, invalid); -+ return; -+ } -+ -+ if (ops->key_debugcheck) -+ ops->key_debugcheck(c, k); -+} -+ -+void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) -+{ -+ if (!bkey_cmp(pos, POS_MIN)) -+ pr_buf(out, "POS_MIN"); -+ else if (!bkey_cmp(pos, POS_MAX)) -+ pr_buf(out, "POS_MAX"); -+ else -+ pr_buf(out, "%llu:%llu", pos.inode, pos.offset); -+} -+ -+void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) -+{ -+ if (k) { -+ pr_buf(out, "u64s %u type %s ", k->u64s, -+ bch2_bkey_types[k->type]); -+ -+ bch2_bpos_to_text(out, k->p); -+ -+ pr_buf(out, " snap %u len %u ver %llu", -+ k->p.snapshot, k->size, k->version.lo); -+ } else { -+ pr_buf(out, "(null)"); -+ } -+} -+ -+void bch2_val_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; -+ -+ if (likely(ops->val_to_text)) -+ ops->val_to_text(out, c, k); -+} -+ -+void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ bch2_bkey_to_text(out, k.k); -+ -+ if (k.k) { -+ pr_buf(out, ": "); -+ bch2_val_to_text(out, c, k); -+ } -+} -+ -+void bch2_bkey_swab_val(struct bkey_s k) -+{ -+ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; -+ -+ if (ops->swab) -+ ops->swab(k); -+} -+ -+bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k) -+{ -+ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; -+ -+ return ops->key_normalize -+ ? 
ops->key_normalize(c, k) -+ : false; -+} -+ -+enum merge_result bch2_bkey_merge(struct bch_fs *c, -+ struct bkey_s l, struct bkey_s r) -+{ -+ const struct bkey_ops *ops = &bch2_bkey_ops[l.k->type]; -+ enum merge_result ret; -+ -+ if (key_merging_disabled(c) || -+ !ops->key_merge || -+ l.k->type != r.k->type || -+ bversion_cmp(l.k->version, r.k->version) || -+ bkey_cmp(l.k->p, bkey_start_pos(r.k))) -+ return BCH_MERGE_NOMERGE; -+ -+ ret = ops->key_merge(c, l, r); -+ -+ if (ret != BCH_MERGE_NOMERGE) -+ l.k->needs_whiteout |= r.k->needs_whiteout; -+ return ret; -+} -+ -+static const struct old_bkey_type { -+ u8 btree_node_type; -+ u8 old; -+ u8 new; -+} bkey_renumber_table[] = { -+ {BKEY_TYPE_BTREE, 128, KEY_TYPE_btree_ptr }, -+ {BKEY_TYPE_EXTENTS, 128, KEY_TYPE_extent }, -+ {BKEY_TYPE_EXTENTS, 129, KEY_TYPE_extent }, -+ {BKEY_TYPE_EXTENTS, 130, KEY_TYPE_reservation }, -+ {BKEY_TYPE_INODES, 128, KEY_TYPE_inode }, -+ {BKEY_TYPE_INODES, 130, KEY_TYPE_inode_generation }, -+ {BKEY_TYPE_DIRENTS, 128, KEY_TYPE_dirent }, -+ {BKEY_TYPE_DIRENTS, 129, KEY_TYPE_whiteout }, -+ {BKEY_TYPE_XATTRS, 128, KEY_TYPE_xattr }, -+ {BKEY_TYPE_XATTRS, 129, KEY_TYPE_whiteout }, -+ {BKEY_TYPE_ALLOC, 128, KEY_TYPE_alloc }, -+ {BKEY_TYPE_QUOTAS, 128, KEY_TYPE_quota }, -+}; -+ -+void bch2_bkey_renumber(enum btree_node_type btree_node_type, -+ struct bkey_packed *k, -+ int write) -+{ -+ const struct old_bkey_type *i; -+ -+ for (i = bkey_renumber_table; -+ i < bkey_renumber_table + ARRAY_SIZE(bkey_renumber_table); -+ i++) -+ if (btree_node_type == i->btree_node_type && -+ k->type == (write ? i->new : i->old)) { -+ k->type = write ? 
i->old : i->new; -+ break; -+ } -+} -+ -+void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, -+ unsigned version, unsigned big_endian, -+ int write, -+ struct bkey_format *f, -+ struct bkey_packed *k) -+{ -+ const struct bkey_ops *ops; -+ struct bkey uk; -+ struct bkey_s u; -+ int i; -+ -+ /* -+ * Do these operations in reverse order in the write path: -+ */ -+ -+ for (i = 0; i < 4; i++) -+ switch (!write ? i : 3 - i) { -+ case 0: -+ if (big_endian != CPU_BIG_ENDIAN) -+ bch2_bkey_swab_key(f, k); -+ break; -+ case 1: -+ if (version < bcachefs_metadata_version_bkey_renumber) -+ bch2_bkey_renumber(__btree_node_type(level, btree_id), k, write); -+ break; -+ case 2: -+ if (version < bcachefs_metadata_version_inode_btree_change && -+ btree_id == BTREE_ID_INODES) { -+ if (!bkey_packed(k)) { -+ struct bkey_i *u = packed_to_bkey(k); -+ swap(u->k.p.inode, u->k.p.offset); -+ } else if (f->bits_per_field[BKEY_FIELD_INODE] && -+ f->bits_per_field[BKEY_FIELD_OFFSET]) { -+ struct bkey_format tmp = *f, *in = f, *out = &tmp; -+ -+ swap(tmp.bits_per_field[BKEY_FIELD_INODE], -+ tmp.bits_per_field[BKEY_FIELD_OFFSET]); -+ swap(tmp.field_offset[BKEY_FIELD_INODE], -+ tmp.field_offset[BKEY_FIELD_OFFSET]); -+ -+ if (!write) -+ swap(in, out); -+ -+ uk = __bch2_bkey_unpack_key(in, k); -+ swap(uk.p.inode, uk.p.offset); -+ BUG_ON(!bch2_bkey_pack_key(k, &uk, out)); -+ } -+ } -+ break; -+ case 3: -+ if (!bkey_packed(k)) { -+ u = bkey_i_to_s(packed_to_bkey(k)); -+ } else { -+ uk = __bch2_bkey_unpack_key(f, k); -+ u.k = &uk; -+ u.v = bkeyp_val(f, k); -+ } -+ -+ if (big_endian != CPU_BIG_ENDIAN) -+ bch2_bkey_swab_val(u); -+ -+ ops = &bch2_bkey_ops[k->type]; -+ -+ if (ops->compat) -+ ops->compat(btree_id, version, big_endian, write, u); -+ break; -+ default: -+ BUG(); -+ } -+} -diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h -new file mode 100644 -index 000000000000..0bca725ae3b8 ---- /dev/null -+++ b/fs/bcachefs/bkey_methods.h -@@ -0,0 +1,82 @@ -+/* 
SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BKEY_METHODS_H -+#define _BCACHEFS_BKEY_METHODS_H -+ -+#include "bkey.h" -+ -+struct bch_fs; -+struct btree; -+struct bkey; -+enum btree_node_type; -+ -+extern const char * const bch2_bkey_types[]; -+ -+enum merge_result { -+ BCH_MERGE_NOMERGE, -+ -+ /* -+ * The keys were mergeable, but would have overflowed size - so instead -+ * l was changed to the maximum size, and both keys were modified: -+ */ -+ BCH_MERGE_PARTIAL, -+ BCH_MERGE_MERGE, -+}; -+ -+struct bkey_ops { -+ /* Returns reason for being invalid if invalid, else NULL: */ -+ const char * (*key_invalid)(const struct bch_fs *, -+ struct bkey_s_c); -+ void (*key_debugcheck)(struct bch_fs *, struct bkey_s_c); -+ void (*val_to_text)(struct printbuf *, struct bch_fs *, -+ struct bkey_s_c); -+ void (*swab)(struct bkey_s); -+ bool (*key_normalize)(struct bch_fs *, struct bkey_s); -+ enum merge_result (*key_merge)(struct bch_fs *, -+ struct bkey_s, struct bkey_s); -+ void (*compat)(enum btree_id id, unsigned version, -+ unsigned big_endian, int write, -+ struct bkey_s); -+}; -+ -+const char *bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c); -+const char *__bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, -+ enum btree_node_type); -+const char *bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, -+ enum btree_node_type); -+const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c); -+ -+void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c); -+ -+void bch2_bpos_to_text(struct printbuf *, struct bpos); -+void bch2_bkey_to_text(struct printbuf *, const struct bkey *); -+void bch2_val_to_text(struct printbuf *, struct bch_fs *, -+ struct bkey_s_c); -+void bch2_bkey_val_to_text(struct printbuf *, struct bch_fs *, -+ struct bkey_s_c); -+ -+void bch2_bkey_swab_val(struct bkey_s); -+ -+bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s); -+ -+enum merge_result bch2_bkey_merge(struct bch_fs *, -+ struct bkey_s, struct 
bkey_s); -+ -+void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int); -+ -+void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned, -+ int, struct bkey_format *, struct bkey_packed *); -+ -+static inline void bch2_bkey_compat(unsigned level, enum btree_id btree_id, -+ unsigned version, unsigned big_endian, -+ int write, -+ struct bkey_format *f, -+ struct bkey_packed *k) -+{ -+ if (version < bcachefs_metadata_version_current || -+ big_endian != CPU_BIG_ENDIAN) -+ __bch2_bkey_compat(level, btree_id, version, -+ big_endian, write, f, k); -+ -+} -+ -+#endif /* _BCACHEFS_BKEY_METHODS_H */ -diff --git a/fs/bcachefs/bkey_on_stack.h b/fs/bcachefs/bkey_on_stack.h -new file mode 100644 -index 000000000000..f607a0cb37ed ---- /dev/null -+++ b/fs/bcachefs/bkey_on_stack.h -@@ -0,0 +1,43 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BKEY_ON_STACK_H -+#define _BCACHEFS_BKEY_ON_STACK_H -+ -+#include "bcachefs.h" -+ -+struct bkey_on_stack { -+ struct bkey_i *k; -+ u64 onstack[12]; -+}; -+ -+static inline void bkey_on_stack_realloc(struct bkey_on_stack *s, -+ struct bch_fs *c, unsigned u64s) -+{ -+ if (s->k == (void *) s->onstack && -+ u64s > ARRAY_SIZE(s->onstack)) { -+ s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS); -+ memcpy(s->k, s->onstack, sizeof(s->onstack)); -+ } -+} -+ -+static inline void bkey_on_stack_reassemble(struct bkey_on_stack *s, -+ struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ bkey_on_stack_realloc(s, c, k.k->u64s); -+ bkey_reassemble(s->k, k); -+} -+ -+static inline void bkey_on_stack_init(struct bkey_on_stack *s) -+{ -+ s->k = (void *) s->onstack; -+} -+ -+static inline void bkey_on_stack_exit(struct bkey_on_stack *s, -+ struct bch_fs *c) -+{ -+ if (s->k != (void *) s->onstack) -+ mempool_free(s->k, &c->large_bkey_pool); -+ s->k = NULL; -+} -+ -+#endif /* _BCACHEFS_BKEY_ON_STACK_H */ -diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c -new file mode 100644 -index 000000000000..839e78d1dc35 ---- 
/dev/null -+++ b/fs/bcachefs/bkey_sort.c -@@ -0,0 +1,515 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "bkey_on_stack.h" -+#include "bkey_sort.h" -+#include "bset.h" -+#include "extents.h" -+ -+typedef int (*sort_cmp_fn)(struct btree *, -+ struct bkey_packed *, -+ struct bkey_packed *); -+ -+static inline bool sort_iter_end(struct sort_iter *iter) -+{ -+ return !iter->used; -+} -+ -+static inline void __sort_iter_sift(struct sort_iter *iter, -+ unsigned from, -+ sort_cmp_fn cmp) -+{ -+ unsigned i; -+ -+ for (i = from; -+ i + 1 < iter->used && -+ cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0; -+ i++) -+ swap(iter->data[i], iter->data[i + 1]); -+} -+ -+static inline void sort_iter_sift(struct sort_iter *iter, sort_cmp_fn cmp) -+{ -+ -+ __sort_iter_sift(iter, 0, cmp); -+} -+ -+static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp) -+{ -+ unsigned i = iter->used; -+ -+ while (i--) -+ __sort_iter_sift(iter, i, cmp); -+} -+ -+static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter) -+{ -+ return !sort_iter_end(iter) ? 
iter->data->k : NULL; -+} -+ -+static inline void __sort_iter_advance(struct sort_iter *iter, -+ unsigned idx, sort_cmp_fn cmp) -+{ -+ struct sort_iter_set *i = iter->data + idx; -+ -+ BUG_ON(idx >= iter->used); -+ -+ i->k = bkey_next_skip_noops(i->k, i->end); -+ -+ BUG_ON(i->k > i->end); -+ -+ if (i->k == i->end) -+ array_remove_item(iter->data, iter->used, idx); -+ else -+ __sort_iter_sift(iter, idx, cmp); -+} -+ -+static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) -+{ -+ __sort_iter_advance(iter, 0, cmp); -+} -+ -+static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter, -+ sort_cmp_fn cmp) -+{ -+ struct bkey_packed *ret = sort_iter_peek(iter); -+ -+ if (ret) -+ sort_iter_advance(iter, cmp); -+ -+ return ret; -+} -+ -+/* -+ * If keys compare equal, compare by pointer order: -+ */ -+static inline int key_sort_fix_overlapping_cmp(struct btree *b, -+ struct bkey_packed *l, -+ struct bkey_packed *r) -+{ -+ return bkey_cmp_packed(b, l, r) ?: -+ cmp_int((unsigned long) l, (unsigned long) r); -+} -+ -+static inline bool should_drop_next_key(struct sort_iter *iter) -+{ -+ /* -+ * key_sort_cmp() ensures that when keys compare equal the older key -+ * comes first; so if l->k compares equal to r->k then l->k is older -+ * and should be dropped. 
-+ */ -+ return iter->used >= 2 && -+ !bkey_cmp_packed(iter->b, -+ iter->data[0].k, -+ iter->data[1].k); -+} -+ -+struct btree_nr_keys -+bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, -+ struct sort_iter *iter) -+{ -+ struct bkey_packed *out = dst->start; -+ struct bkey_packed *k; -+ struct btree_nr_keys nr; -+ -+ memset(&nr, 0, sizeof(nr)); -+ -+ sort_iter_sort(iter, key_sort_fix_overlapping_cmp); -+ -+ while ((k = sort_iter_peek(iter))) { -+ if (!bkey_whiteout(k) && -+ !should_drop_next_key(iter)) { -+ bkey_copy(out, k); -+ btree_keys_account_key_add(&nr, 0, out); -+ out = bkey_next(out); -+ } -+ -+ sort_iter_advance(iter, key_sort_fix_overlapping_cmp); -+ } -+ -+ dst->u64s = cpu_to_le16((u64 *) out - dst->_data); -+ return nr; -+} -+ -+static void extent_sort_append(struct bch_fs *c, -+ struct bkey_format *f, -+ struct btree_nr_keys *nr, -+ struct bkey_packed **out, -+ struct bkey_s k) -+{ -+ if (!bkey_whiteout(k.k)) { -+ if (!bch2_bkey_pack_key(*out, k.k, f)) -+ memcpy_u64s_small(*out, k.k, BKEY_U64s); -+ -+ memcpy_u64s_small(bkeyp_val(f, *out), k.v, bkey_val_u64s(k.k)); -+ -+ btree_keys_account_key_add(nr, 0, *out); -+ *out = bkey_next(*out); -+ } -+} -+ -+/* Sort + repack in a new format: */ -+struct btree_nr_keys -+bch2_sort_repack(struct bset *dst, struct btree *src, -+ struct btree_node_iter *src_iter, -+ struct bkey_format *out_f, -+ bool filter_whiteouts) -+{ -+ struct bkey_format *in_f = &src->format; -+ struct bkey_packed *in, *out = vstruct_last(dst); -+ struct btree_nr_keys nr; -+ -+ memset(&nr, 0, sizeof(nr)); -+ -+ while ((in = bch2_btree_node_iter_next_all(src_iter, src))) { -+ if (filter_whiteouts && bkey_whiteout(in)) -+ continue; -+ -+ if (bch2_bkey_transform(out_f, out, bkey_packed(in) -+ ? 
in_f : &bch2_bkey_format_current, in)) -+ out->format = KEY_FORMAT_LOCAL_BTREE; -+ else -+ bch2_bkey_unpack(src, (void *) out, in); -+ -+ btree_keys_account_key_add(&nr, 0, out); -+ out = bkey_next(out); -+ } -+ -+ dst->u64s = cpu_to_le16((u64 *) out - dst->_data); -+ return nr; -+} -+ -+/* Sort, repack, and call bch2_bkey_normalize() to drop stale pointers: */ -+struct btree_nr_keys -+bch2_sort_repack_merge(struct bch_fs *c, -+ struct bset *dst, struct btree *src, -+ struct btree_node_iter *iter, -+ struct bkey_format *out_f, -+ bool filter_whiteouts) -+{ -+ struct bkey_packed *out = vstruct_last(dst), *k_packed; -+ struct bkey_on_stack k; -+ struct btree_nr_keys nr; -+ -+ memset(&nr, 0, sizeof(nr)); -+ bkey_on_stack_init(&k); -+ -+ while ((k_packed = bch2_btree_node_iter_next_all(iter, src))) { -+ if (filter_whiteouts && bkey_whiteout(k_packed)) -+ continue; -+ -+ /* -+ * NOTE: -+ * bch2_bkey_normalize may modify the key we pass it (dropping -+ * stale pointers) and we don't have a write lock on the src -+ * node; we have to make a copy of the entire key before calling -+ * normalize -+ */ -+ bkey_on_stack_realloc(&k, c, k_packed->u64s + BKEY_U64s); -+ bch2_bkey_unpack(src, k.k, k_packed); -+ -+ if (filter_whiteouts && -+ bch2_bkey_normalize(c, bkey_i_to_s(k.k))) -+ continue; -+ -+ extent_sort_append(c, out_f, &nr, &out, bkey_i_to_s(k.k)); -+ } -+ -+ dst->u64s = cpu_to_le16((u64 *) out - dst->_data); -+ bkey_on_stack_exit(&k, c); -+ return nr; -+} -+ -+static inline int sort_keys_cmp(struct btree *b, -+ struct bkey_packed *l, -+ struct bkey_packed *r) -+{ -+ return bkey_cmp_packed(b, l, r) ?: -+ (int) bkey_deleted(r) - (int) bkey_deleted(l) ?: -+ (int) l->needs_whiteout - (int) r->needs_whiteout; -+} -+ -+unsigned bch2_sort_keys(struct bkey_packed *dst, -+ struct sort_iter *iter, -+ bool filter_whiteouts) -+{ -+ const struct bkey_format *f = &iter->b->format; -+ struct bkey_packed *in, *next, *out = dst; -+ -+ sort_iter_sort(iter, sort_keys_cmp); -+ -+ while ((in 
= sort_iter_next(iter, sort_keys_cmp))) { -+ bool needs_whiteout = false; -+ -+ if (bkey_whiteout(in) && -+ (filter_whiteouts || !in->needs_whiteout)) -+ continue; -+ -+ while ((next = sort_iter_peek(iter)) && -+ !bkey_cmp_packed(iter->b, in, next)) { -+ BUG_ON(in->needs_whiteout && -+ next->needs_whiteout); -+ needs_whiteout |= in->needs_whiteout; -+ in = sort_iter_next(iter, sort_keys_cmp); -+ } -+ -+ if (bkey_whiteout(in)) { -+ memcpy_u64s(out, in, bkeyp_key_u64s(f, in)); -+ set_bkeyp_val_u64s(f, out, 0); -+ } else { -+ bkey_copy(out, in); -+ } -+ out->needs_whiteout |= needs_whiteout; -+ out = bkey_next(out); -+ } -+ -+ return (u64 *) out - (u64 *) dst; -+} -+ -+/* Compat code for btree_node_old_extent_overwrite: */ -+ -+/* -+ * If keys compare equal, compare by pointer order: -+ * -+ * Necessary for sort_fix_overlapping() - if there are multiple keys that -+ * compare equal in different sets, we have to process them newest to oldest. -+ */ -+static inline int extent_sort_fix_overlapping_cmp(struct btree *b, -+ struct bkey_packed *l, -+ struct bkey_packed *r) -+{ -+ struct bkey ul = bkey_unpack_key(b, l); -+ struct bkey ur = bkey_unpack_key(b, r); -+ -+ return bkey_cmp(bkey_start_pos(&ul), -+ bkey_start_pos(&ur)) ?: -+ cmp_int((unsigned long) r, (unsigned long) l); -+} -+ -+/* -+ * The algorithm in extent_sort_fix_overlapping() relies on keys in the same -+ * bset being ordered by start offset - but 0 size whiteouts (which are always -+ * KEY_TYPE_deleted) break this ordering, so we need to skip over them: -+ */ -+static void extent_iter_advance(struct sort_iter *iter, unsigned idx) -+{ -+ struct sort_iter_set *i = iter->data + idx; -+ -+ do { -+ i->k = bkey_next_skip_noops(i->k, i->end); -+ } while (i->k != i->end && bkey_deleted(i->k)); -+ -+ if (i->k == i->end) -+ array_remove_item(iter->data, iter->used, idx); -+ else -+ __sort_iter_sift(iter, idx, extent_sort_fix_overlapping_cmp); -+} -+ -+struct btree_nr_keys -+bch2_extent_sort_fix_overlapping(struct 
bch_fs *c, struct bset *dst, -+ struct sort_iter *iter) -+{ -+ struct btree *b = iter->b; -+ struct bkey_format *f = &b->format; -+ struct sort_iter_set *_l = iter->data, *_r = iter->data + 1; -+ struct bkey_packed *out = dst->start; -+ struct bkey l_unpacked, r_unpacked; -+ struct bkey_s l, r; -+ struct btree_nr_keys nr; -+ struct bkey_on_stack split; -+ unsigned i; -+ -+ memset(&nr, 0, sizeof(nr)); -+ bkey_on_stack_init(&split); -+ -+ sort_iter_sort(iter, extent_sort_fix_overlapping_cmp); -+ for (i = 0; i < iter->used;) { -+ if (bkey_deleted(iter->data[i].k)) -+ __sort_iter_advance(iter, i, -+ extent_sort_fix_overlapping_cmp); -+ else -+ i++; -+ } -+ -+ while (!sort_iter_end(iter)) { -+ l = __bkey_disassemble(b, _l->k, &l_unpacked); -+ -+ if (iter->used == 1) { -+ extent_sort_append(c, f, &nr, &out, l); -+ extent_iter_advance(iter, 0); -+ continue; -+ } -+ -+ r = __bkey_disassemble(b, _r->k, &r_unpacked); -+ -+ /* If current key and next key don't overlap, just append */ -+ if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) { -+ extent_sort_append(c, f, &nr, &out, l); -+ extent_iter_advance(iter, 0); -+ continue; -+ } -+ -+ /* Skip 0 size keys */ -+ if (!r.k->size) { -+ extent_iter_advance(iter, 1); -+ continue; -+ } -+ -+ /* -+ * overlap: keep the newer key and trim the older key so they -+ * don't overlap. comparing pointers tells us which one is -+ * newer, since the bsets are appended one after the other. 
-+ */ -+ -+ /* can't happen because of comparison func */ -+ BUG_ON(_l->k < _r->k && -+ !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k))); -+ -+ if (_l->k > _r->k) { -+ /* l wins, trim r */ -+ if (bkey_cmp(l.k->p, r.k->p) >= 0) { -+ extent_iter_advance(iter, 1); -+ } else { -+ bch2_cut_front_s(l.k->p, r); -+ extent_save(b, _r->k, r.k); -+ __sort_iter_sift(iter, 1, -+ extent_sort_fix_overlapping_cmp); -+ } -+ } else if (bkey_cmp(l.k->p, r.k->p) > 0) { -+ -+ /* -+ * r wins, but it overlaps in the middle of l - split l: -+ */ -+ bkey_on_stack_reassemble(&split, c, l.s_c); -+ bch2_cut_back(bkey_start_pos(r.k), split.k); -+ -+ bch2_cut_front_s(r.k->p, l); -+ extent_save(b, _l->k, l.k); -+ -+ __sort_iter_sift(iter, 0, -+ extent_sort_fix_overlapping_cmp); -+ -+ extent_sort_append(c, f, &nr, &out, -+ bkey_i_to_s(split.k)); -+ } else { -+ bch2_cut_back_s(bkey_start_pos(r.k), l); -+ extent_save(b, _l->k, l.k); -+ } -+ } -+ -+ dst->u64s = cpu_to_le16((u64 *) out - dst->_data); -+ -+ bkey_on_stack_exit(&split, c); -+ return nr; -+} -+ -+static inline int sort_extents_cmp(struct btree *b, -+ struct bkey_packed *l, -+ struct bkey_packed *r) -+{ -+ return bkey_cmp_packed(b, l, r) ?: -+ (int) bkey_deleted(l) - (int) bkey_deleted(r); -+} -+ -+unsigned bch2_sort_extents(struct bkey_packed *dst, -+ struct sort_iter *iter, -+ bool filter_whiteouts) -+{ -+ struct bkey_packed *in, *out = dst; -+ -+ sort_iter_sort(iter, sort_extents_cmp); -+ -+ while ((in = sort_iter_next(iter, sort_extents_cmp))) { -+ if (bkey_deleted(in)) -+ continue; -+ -+ if (bkey_whiteout(in) && -+ (filter_whiteouts || !in->needs_whiteout)) -+ continue; -+ -+ bkey_copy(out, in); -+ out = bkey_next(out); -+ } -+ -+ return (u64 *) out - (u64 *) dst; -+} -+ -+static inline int sort_extent_whiteouts_cmp(struct btree *b, -+ struct bkey_packed *l, -+ struct bkey_packed *r) -+{ -+ struct bkey ul = bkey_unpack_key(b, l); -+ struct bkey ur = bkey_unpack_key(b, r); -+ -+ return bkey_cmp(bkey_start_pos(&ul), 
bkey_start_pos(&ur)); -+} -+ -+unsigned bch2_sort_extent_whiteouts(struct bkey_packed *dst, -+ struct sort_iter *iter) -+{ -+ const struct bkey_format *f = &iter->b->format; -+ struct bkey_packed *in, *out = dst; -+ struct bkey_i l, r; -+ bool prev = false, l_packed = false; -+ u64 max_packed_size = bkey_field_max(f, BKEY_FIELD_SIZE); -+ u64 max_packed_offset = bkey_field_max(f, BKEY_FIELD_OFFSET); -+ u64 new_size; -+ -+ max_packed_size = min_t(u64, max_packed_size, KEY_SIZE_MAX); -+ -+ sort_iter_sort(iter, sort_extent_whiteouts_cmp); -+ -+ while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) { -+ if (bkey_deleted(in)) -+ continue; -+ -+ EBUG_ON(bkeyp_val_u64s(f, in)); -+ EBUG_ON(in->type != KEY_TYPE_discard); -+ -+ r.k = bkey_unpack_key(iter->b, in); -+ -+ if (prev && -+ bkey_cmp(l.k.p, bkey_start_pos(&r.k)) >= 0) { -+ if (bkey_cmp(l.k.p, r.k.p) >= 0) -+ continue; -+ -+ new_size = l_packed -+ ? min(max_packed_size, max_packed_offset - -+ bkey_start_offset(&l.k)) -+ : KEY_SIZE_MAX; -+ -+ new_size = min(new_size, r.k.p.offset - -+ bkey_start_offset(&l.k)); -+ -+ BUG_ON(new_size < l.k.size); -+ -+ bch2_key_resize(&l.k, new_size); -+ -+ if (bkey_cmp(l.k.p, r.k.p) >= 0) -+ continue; -+ -+ bch2_cut_front(l.k.p, &r); -+ } -+ -+ if (prev) { -+ if (!bch2_bkey_pack(out, &l, f)) { -+ BUG_ON(l_packed); -+ bkey_copy(out, &l); -+ } -+ out = bkey_next(out); -+ } -+ -+ l = r; -+ prev = true; -+ l_packed = bkey_packed(in); -+ } -+ -+ if (prev) { -+ if (!bch2_bkey_pack(out, &l, f)) { -+ BUG_ON(l_packed); -+ bkey_copy(out, &l); -+ } -+ out = bkey_next(out); -+ } -+ -+ return (u64 *) out - (u64 *) dst; -+} -diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h -new file mode 100644 -index 000000000000..458a051fdac5 ---- /dev/null -+++ b/fs/bcachefs/bkey_sort.h -@@ -0,0 +1,57 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BKEY_SORT_H -+#define _BCACHEFS_BKEY_SORT_H -+ -+struct sort_iter { -+ struct btree *b; -+ unsigned used; -+ unsigned size; -+ -+ 
struct sort_iter_set { -+ struct bkey_packed *k, *end; -+ } data[MAX_BSETS + 1]; -+}; -+ -+static inline void sort_iter_init(struct sort_iter *iter, struct btree *b) -+{ -+ iter->b = b; -+ iter->used = 0; -+ iter->size = ARRAY_SIZE(iter->data); -+} -+ -+static inline void sort_iter_add(struct sort_iter *iter, -+ struct bkey_packed *k, -+ struct bkey_packed *end) -+{ -+ BUG_ON(iter->used >= iter->size); -+ -+ if (k != end) -+ iter->data[iter->used++] = (struct sort_iter_set) { k, end }; -+} -+ -+struct btree_nr_keys -+bch2_key_sort_fix_overlapping(struct bch_fs *, struct bset *, -+ struct sort_iter *); -+struct btree_nr_keys -+bch2_extent_sort_fix_overlapping(struct bch_fs *, struct bset *, -+ struct sort_iter *); -+ -+struct btree_nr_keys -+bch2_sort_repack(struct bset *, struct btree *, -+ struct btree_node_iter *, -+ struct bkey_format *, bool); -+struct btree_nr_keys -+bch2_sort_repack_merge(struct bch_fs *, -+ struct bset *, struct btree *, -+ struct btree_node_iter *, -+ struct bkey_format *, bool); -+ -+unsigned bch2_sort_keys(struct bkey_packed *, -+ struct sort_iter *, bool); -+unsigned bch2_sort_extents(struct bkey_packed *, -+ struct sort_iter *, bool); -+ -+unsigned bch2_sort_extent_whiteouts(struct bkey_packed *, -+ struct sort_iter *); -+ -+#endif /* _BCACHEFS_BKEY_SORT_H */ -diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c -new file mode 100644 -index 000000000000..f7c2841ed8a7 ---- /dev/null -+++ b/fs/bcachefs/bset.c -@@ -0,0 +1,1742 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Code for working with individual keys, and sorted sets of keys with in a -+ * btree node -+ * -+ * Copyright 2012 Google, Inc. -+ */ -+ -+#include "bcachefs.h" -+#include "btree_cache.h" -+#include "bset.h" -+#include "eytzinger.h" -+#include "util.h" -+ -+#include -+#include -+#include -+#include -+ -+/* hack.. 
*/ -+#include "alloc_types.h" -+#include -+ -+static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *, -+ struct btree *); -+ -+static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter) -+{ -+ unsigned n = ARRAY_SIZE(iter->data); -+ -+ while (n && __btree_node_iter_set_end(iter, n - 1)) -+ --n; -+ -+ return n; -+} -+ -+struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k) -+{ -+ unsigned offset = __btree_node_key_to_offset(b, k); -+ struct bset_tree *t; -+ -+ for_each_bset(b, t) -+ if (offset <= t->end_offset) { -+ EBUG_ON(offset < btree_bkey_first_offset(t)); -+ return t; -+ } -+ -+ BUG(); -+} -+ -+/* -+ * There are never duplicate live keys in the btree - but including keys that -+ * have been flagged as deleted (and will be cleaned up later) we _will_ see -+ * duplicates. -+ * -+ * Thus the sort order is: usual key comparison first, but for keys that compare -+ * equal the deleted key(s) come first, and the (at most one) live version comes -+ * last. -+ * -+ * The main reason for this is insertion: to handle overwrites, we first iterate -+ * over keys that compare equal to our insert key, and then insert immediately -+ * prior to the first key greater than the key we're inserting - our insert -+ * position will be after all keys that compare equal to our insert key, which -+ * by the time we actually do the insert will all be deleted. 
-+ */ -+ -+void bch2_dump_bset(struct bch_fs *c, struct btree *b, -+ struct bset *i, unsigned set) -+{ -+ struct bkey_packed *_k, *_n; -+ struct bkey uk, n; -+ struct bkey_s_c k; -+ char buf[200]; -+ -+ if (!i->u64s) -+ return; -+ -+ for (_k = i->start; -+ _k < vstruct_last(i); -+ _k = _n) { -+ _n = bkey_next_skip_noops(_k, vstruct_last(i)); -+ -+ k = bkey_disassemble(b, _k, &uk); -+ if (c) -+ bch2_bkey_val_to_text(&PBUF(buf), c, k); -+ else -+ bch2_bkey_to_text(&PBUF(buf), k.k); -+ printk(KERN_ERR "block %u key %5zu: %s\n", set, -+ _k->_data - i->_data, buf); -+ -+ if (_n == vstruct_last(i)) -+ continue; -+ -+ n = bkey_unpack_key(b, _n); -+ -+ if (bkey_cmp(bkey_start_pos(&n), k.k->p) < 0) { -+ printk(KERN_ERR "Key skipped backwards\n"); -+ continue; -+ } -+ -+ if (!bkey_deleted(k.k) && -+ !bkey_cmp(n.p, k.k->p)) -+ printk(KERN_ERR "Duplicate keys\n"); -+ } -+} -+ -+void bch2_dump_btree_node(struct bch_fs *c, struct btree *b) -+{ -+ struct bset_tree *t; -+ -+ console_lock(); -+ for_each_bset(b, t) -+ bch2_dump_bset(c, b, bset(b, t), t - b->set); -+ console_unlock(); -+} -+ -+void bch2_dump_btree_node_iter(struct btree *b, -+ struct btree_node_iter *iter) -+{ -+ struct btree_node_iter_set *set; -+ -+ printk(KERN_ERR "btree node iter with %u/%u sets:\n", -+ __btree_node_iter_used(iter), b->nsets); -+ -+ btree_node_iter_for_each(iter, set) { -+ struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); -+ struct bset_tree *t = bch2_bkey_to_bset(b, k); -+ struct bkey uk = bkey_unpack_key(b, k); -+ char buf[100]; -+ -+ bch2_bkey_to_text(&PBUF(buf), &uk); -+ printk(KERN_ERR "set %zu key %u: %s\n", -+ t - b->set, set->k, buf); -+ } -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ -+void __bch2_verify_btree_nr_keys(struct btree *b) -+{ -+ struct bset_tree *t; -+ struct bkey_packed *k; -+ struct btree_nr_keys nr = { 0 }; -+ -+ for_each_bset(b, t) -+ bset_tree_for_each_key(b, t, k) -+ if (!bkey_whiteout(k)) -+ btree_keys_account_key_add(&nr, t - b->set, k); -+ -+ 
BUG_ON(memcmp(&nr, &b->nr, sizeof(nr))); -+} -+ -+static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, -+ struct btree *b) -+{ -+ struct btree_node_iter iter = *_iter; -+ const struct bkey_packed *k, *n; -+ -+ k = bch2_btree_node_iter_peek_all(&iter, b); -+ __bch2_btree_node_iter_advance(&iter, b); -+ n = bch2_btree_node_iter_peek_all(&iter, b); -+ -+ bkey_unpack_key(b, k); -+ -+ if (n && -+ bkey_iter_cmp(b, k, n) > 0) { -+ struct btree_node_iter_set *set; -+ struct bkey ku = bkey_unpack_key(b, k); -+ struct bkey nu = bkey_unpack_key(b, n); -+ char buf1[80], buf2[80]; -+ -+ bch2_dump_btree_node(NULL, b); -+ bch2_bkey_to_text(&PBUF(buf1), &ku); -+ bch2_bkey_to_text(&PBUF(buf2), &nu); -+ printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n", -+ buf1, buf2); -+ printk(KERN_ERR "iter was:"); -+ -+ btree_node_iter_for_each(_iter, set) { -+ struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); -+ struct bset_tree *t = bch2_bkey_to_bset(b, k); -+ printk(" [%zi %zi]", t - b->set, -+ k->_data - bset(b, t)->_data); -+ } -+ panic("\n"); -+ } -+} -+ -+void bch2_btree_node_iter_verify(struct btree_node_iter *iter, -+ struct btree *b) -+{ -+ struct btree_node_iter_set *set, *s2; -+ struct bkey_packed *k, *p; -+ struct bset_tree *t; -+ -+ if (bch2_btree_node_iter_end(iter)) -+ return; -+ -+ /* Verify no duplicates: */ -+ btree_node_iter_for_each(iter, set) -+ btree_node_iter_for_each(iter, s2) -+ BUG_ON(set != s2 && set->end == s2->end); -+ -+ /* Verify that set->end is correct: */ -+ btree_node_iter_for_each(iter, set) { -+ for_each_bset(b, t) -+ if (set->end == t->end_offset) -+ goto found; -+ BUG(); -+found: -+ BUG_ON(set->k < btree_bkey_first_offset(t) || -+ set->k >= t->end_offset); -+ } -+ -+ /* Verify iterator is sorted: */ -+ btree_node_iter_for_each(iter, set) -+ BUG_ON(set != iter->data && -+ btree_node_iter_cmp(b, set[-1], set[0]) > 0); -+ -+ k = bch2_btree_node_iter_peek_all(iter, b); -+ -+ for_each_bset(b, t) { -+ if (iter->data[0].end 
== t->end_offset) -+ continue; -+ -+ p = bch2_bkey_prev_all(b, t, -+ bch2_btree_node_iter_bset_pos(iter, b, t)); -+ -+ BUG_ON(p && bkey_iter_cmp(b, k, p) < 0); -+ } -+} -+ -+void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, -+ struct bkey_packed *insert, unsigned clobber_u64s) -+{ -+ struct bset_tree *t = bch2_bkey_to_bset(b, where); -+ struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where); -+ struct bkey_packed *next = (void *) (where->_data + clobber_u64s); -+#if 0 -+ BUG_ON(prev && -+ bkey_iter_cmp(b, prev, insert) > 0); -+#else -+ if (prev && -+ bkey_iter_cmp(b, prev, insert) > 0) { -+ struct bkey k1 = bkey_unpack_key(b, prev); -+ struct bkey k2 = bkey_unpack_key(b, insert); -+ char buf1[100]; -+ char buf2[100]; -+ -+ bch2_dump_btree_node(NULL, b); -+ bch2_bkey_to_text(&PBUF(buf1), &k1); -+ bch2_bkey_to_text(&PBUF(buf2), &k2); -+ -+ panic("prev > insert:\n" -+ "prev key %s\n" -+ "insert key %s\n", -+ buf1, buf2); -+ } -+#endif -+#if 0 -+ BUG_ON(next != btree_bkey_last(b, t) && -+ bkey_iter_cmp(b, insert, next) > 0); -+#else -+ if (next != btree_bkey_last(b, t) && -+ bkey_iter_cmp(b, insert, next) > 0) { -+ struct bkey k1 = bkey_unpack_key(b, insert); -+ struct bkey k2 = bkey_unpack_key(b, next); -+ char buf1[100]; -+ char buf2[100]; -+ -+ bch2_dump_btree_node(NULL, b); -+ bch2_bkey_to_text(&PBUF(buf1), &k1); -+ bch2_bkey_to_text(&PBUF(buf2), &k2); -+ -+ panic("insert > next:\n" -+ "insert key %s\n" -+ "next key %s\n", -+ buf1, buf2); -+ } -+#endif -+} -+ -+#else -+ -+static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, -+ struct btree *b) {} -+ -+#endif -+ -+/* Auxiliary search trees */ -+ -+#define BFLOAT_FAILED_UNPACKED U8_MAX -+#define BFLOAT_FAILED U8_MAX -+ -+struct bkey_float { -+ u8 exponent; -+ u8 key_offset; -+ u16 mantissa; -+}; -+#define BKEY_MANTISSA_BITS 16 -+ -+static unsigned bkey_float_byte_offset(unsigned idx) -+{ -+ return idx * sizeof(struct bkey_float); -+} -+ -+struct ro_aux_tree { -+ 
struct bkey_float f[0]; -+}; -+ -+struct rw_aux_tree { -+ u16 offset; -+ struct bpos k; -+}; -+ -+static unsigned bset_aux_tree_buf_end(const struct bset_tree *t) -+{ -+ BUG_ON(t->aux_data_offset == U16_MAX); -+ -+ switch (bset_aux_tree_type(t)) { -+ case BSET_NO_AUX_TREE: -+ return t->aux_data_offset; -+ case BSET_RO_AUX_TREE: -+ return t->aux_data_offset + -+ DIV_ROUND_UP(t->size * sizeof(struct bkey_float) + -+ t->size * sizeof(u8), 8); -+ case BSET_RW_AUX_TREE: -+ return t->aux_data_offset + -+ DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8); -+ default: -+ BUG(); -+ } -+} -+ -+static unsigned bset_aux_tree_buf_start(const struct btree *b, -+ const struct bset_tree *t) -+{ -+ return t == b->set -+ ? DIV_ROUND_UP(b->unpack_fn_len, 8) -+ : bset_aux_tree_buf_end(t - 1); -+} -+ -+static void *__aux_tree_base(const struct btree *b, -+ const struct bset_tree *t) -+{ -+ return b->aux_data + t->aux_data_offset * 8; -+} -+ -+static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b, -+ const struct bset_tree *t) -+{ -+ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); -+ -+ return __aux_tree_base(b, t); -+} -+ -+static u8 *ro_aux_tree_prev(const struct btree *b, -+ const struct bset_tree *t) -+{ -+ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); -+ -+ return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size); -+} -+ -+static struct bkey_float *bkey_float(const struct btree *b, -+ const struct bset_tree *t, -+ unsigned idx) -+{ -+ return ro_aux_tree_base(b, t)->f + idx; -+} -+ -+static void bset_aux_tree_verify(struct btree *b) -+{ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ struct bset_tree *t; -+ -+ for_each_bset(b, t) { -+ if (t->aux_data_offset == U16_MAX) -+ continue; -+ -+ BUG_ON(t != b->set && -+ t[-1].aux_data_offset == U16_MAX); -+ -+ BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t)); -+ BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b)); -+ BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b)); -+ } -+#endif -+} -+ -+void 
bch2_btree_keys_init(struct btree *b, bool *expensive_debug_checks) -+{ -+ unsigned i; -+ -+ b->nsets = 0; -+ memset(&b->nr, 0, sizeof(b->nr)); -+#ifdef CONFIG_BCACHEFS_DEBUG -+ b->expensive_debug_checks = expensive_debug_checks; -+#endif -+ for (i = 0; i < MAX_BSETS; i++) -+ b->set[i].data_offset = U16_MAX; -+ -+ bch2_bset_set_no_aux_tree(b, b->set); -+} -+ -+/* Binary tree stuff for auxiliary search trees */ -+ -+/* -+ * Cacheline/offset <-> bkey pointer arithmetic: -+ * -+ * t->tree is a binary search tree in an array; each node corresponds to a key -+ * in one cacheline in t->set (BSET_CACHELINE bytes). -+ * -+ * This means we don't have to store the full index of the key that a node in -+ * the binary tree points to; eytzinger1_to_inorder() gives us the cacheline, and -+ * then bkey_float->m gives us the offset within that cacheline, in units of 8 -+ * bytes. -+ * -+ * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to -+ * make this work. -+ * -+ * To construct the bfloat for an arbitrary key we need to know what the key -+ * immediately preceding it is: we have to check if the two keys differ in the -+ * bits we're going to store in bkey_float->mantissa. t->prev[j] stores the size -+ * of the previous key so we can walk backwards to it from t->tree[j]'s key. 
-+ */ -+ -+static inline void *bset_cacheline(const struct btree *b, -+ const struct bset_tree *t, -+ unsigned cacheline) -+{ -+ return (void *) round_down((unsigned long) btree_bkey_first(b, t), -+ L1_CACHE_BYTES) + -+ cacheline * BSET_CACHELINE; -+} -+ -+static struct bkey_packed *cacheline_to_bkey(const struct btree *b, -+ const struct bset_tree *t, -+ unsigned cacheline, -+ unsigned offset) -+{ -+ return bset_cacheline(b, t, cacheline) + offset * 8; -+} -+ -+static unsigned bkey_to_cacheline(const struct btree *b, -+ const struct bset_tree *t, -+ const struct bkey_packed *k) -+{ -+ return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE; -+} -+ -+static ssize_t __bkey_to_cacheline_offset(const struct btree *b, -+ const struct bset_tree *t, -+ unsigned cacheline, -+ const struct bkey_packed *k) -+{ -+ return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline); -+} -+ -+static unsigned bkey_to_cacheline_offset(const struct btree *b, -+ const struct bset_tree *t, -+ unsigned cacheline, -+ const struct bkey_packed *k) -+{ -+ size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k); -+ -+ EBUG_ON(m > U8_MAX); -+ return m; -+} -+ -+static inline struct bkey_packed *tree_to_bkey(const struct btree *b, -+ const struct bset_tree *t, -+ unsigned j) -+{ -+ return cacheline_to_bkey(b, t, -+ __eytzinger1_to_inorder(j, t->size, t->extra), -+ bkey_float(b, t, j)->key_offset); -+} -+ -+static struct bkey_packed *tree_to_prev_bkey(const struct btree *b, -+ const struct bset_tree *t, -+ unsigned j) -+{ -+ unsigned prev_u64s = ro_aux_tree_prev(b, t)[j]; -+ -+ return (void *) (tree_to_bkey(b, t, j)->_data - prev_u64s); -+} -+ -+static struct rw_aux_tree *rw_aux_tree(const struct btree *b, -+ const struct bset_tree *t) -+{ -+ EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); -+ -+ return __aux_tree_base(b, t); -+} -+ -+/* -+ * For the write set - the one we're currently inserting keys into - we don't -+ * maintain a full search tree, we just keep a simple lookup table in 
t->prev. -+ */ -+static struct bkey_packed *rw_aux_to_bkey(const struct btree *b, -+ struct bset_tree *t, -+ unsigned j) -+{ -+ return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset); -+} -+ -+static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t, -+ unsigned j, struct bkey_packed *k) -+{ -+ EBUG_ON(k >= btree_bkey_last(b, t)); -+ -+ rw_aux_tree(b, t)[j] = (struct rw_aux_tree) { -+ .offset = __btree_node_key_to_offset(b, k), -+ .k = bkey_unpack_pos(b, k), -+ }; -+} -+ -+static void bch2_bset_verify_rw_aux_tree(struct btree *b, -+ struct bset_tree *t) -+{ -+ struct bkey_packed *k = btree_bkey_first(b, t); -+ unsigned j = 0; -+ -+ if (!btree_keys_expensive_checks(b)) -+ return; -+ -+ BUG_ON(bset_has_ro_aux_tree(t)); -+ -+ if (!bset_has_rw_aux_tree(t)) -+ return; -+ -+ BUG_ON(t->size < 1); -+ BUG_ON(rw_aux_to_bkey(b, t, j) != k); -+ -+ goto start; -+ while (1) { -+ if (rw_aux_to_bkey(b, t, j) == k) { -+ BUG_ON(bkey_cmp(rw_aux_tree(b, t)[j].k, -+ bkey_unpack_pos(b, k))); -+start: -+ if (++j == t->size) -+ break; -+ -+ BUG_ON(rw_aux_tree(b, t)[j].offset <= -+ rw_aux_tree(b, t)[j - 1].offset); -+ } -+ -+ k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); -+ BUG_ON(k >= btree_bkey_last(b, t)); -+ } -+} -+ -+/* returns idx of first entry >= offset: */ -+static unsigned rw_aux_tree_bsearch(struct btree *b, -+ struct bset_tree *t, -+ unsigned offset) -+{ -+ unsigned bset_offs = offset - btree_bkey_first_offset(t); -+ unsigned bset_u64s = t->end_offset - btree_bkey_first_offset(t); -+ unsigned idx = bset_u64s ? 
bset_offs * t->size / bset_u64s : 0; -+ -+ EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); -+ EBUG_ON(!t->size); -+ EBUG_ON(idx > t->size); -+ -+ while (idx < t->size && -+ rw_aux_tree(b, t)[idx].offset < offset) -+ idx++; -+ -+ while (idx && -+ rw_aux_tree(b, t)[idx - 1].offset >= offset) -+ idx--; -+ -+ EBUG_ON(idx < t->size && -+ rw_aux_tree(b, t)[idx].offset < offset); -+ EBUG_ON(idx && rw_aux_tree(b, t)[idx - 1].offset >= offset); -+ EBUG_ON(idx + 1 < t->size && -+ rw_aux_tree(b, t)[idx].offset == -+ rw_aux_tree(b, t)[idx + 1].offset); -+ -+ return idx; -+} -+ -+static inline unsigned bkey_mantissa(const struct bkey_packed *k, -+ const struct bkey_float *f, -+ unsigned idx) -+{ -+ u64 v; -+ -+ EBUG_ON(!bkey_packed(k)); -+ -+ v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3))); -+ -+ /* -+ * In little endian, we're shifting off low bits (and then the bits we -+ * want are at the low end), in big endian we're shifting off high bits -+ * (and then the bits we want are at the high end, so we shift them -+ * back down): -+ */ -+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -+ v >>= f->exponent & 7; -+#else -+ v >>= 64 - (f->exponent & 7) - BKEY_MANTISSA_BITS; -+#endif -+ return (u16) v; -+} -+ -+static void make_bfloat(struct btree *b, struct bset_tree *t, -+ unsigned j, -+ struct bkey_packed *min_key, -+ struct bkey_packed *max_key) -+{ -+ struct bkey_float *f = bkey_float(b, t, j); -+ struct bkey_packed *m = tree_to_bkey(b, t, j); -+ struct bkey_packed *l, *r; -+ unsigned mantissa; -+ int shift, exponent, high_bit; -+ -+ if (is_power_of_2(j)) { -+ l = min_key; -+ -+ if (!l->u64s) { -+ if (!bkey_pack_pos(l, b->data->min_key, b)) { -+ struct bkey_i tmp; -+ -+ bkey_init(&tmp.k); -+ tmp.k.p = b->data->min_key; -+ bkey_copy(l, &tmp); -+ } -+ } -+ } else { -+ l = tree_to_prev_bkey(b, t, j >> ffs(j)); -+ -+ EBUG_ON(m < l); -+ } -+ -+ if (is_power_of_2(j + 1)) { -+ r = max_key; -+ -+ if (!r->u64s) { -+ if (!bkey_pack_pos(r, t->max_key, b)) { -+ struct 
bkey_i tmp; -+ -+ bkey_init(&tmp.k); -+ tmp.k.p = t->max_key; -+ bkey_copy(r, &tmp); -+ } -+ } -+ } else { -+ r = tree_to_bkey(b, t, j >> (ffz(j) + 1)); -+ -+ EBUG_ON(m > r); -+ } -+ -+ /* -+ * for failed bfloats, the lookup code falls back to comparing against -+ * the original key. -+ */ -+ -+ if (!bkey_packed(l) || !bkey_packed(r) || !bkey_packed(m) || -+ !b->nr_key_bits) { -+ f->exponent = BFLOAT_FAILED_UNPACKED; -+ return; -+ } -+ -+ /* -+ * The greatest differing bit of l and r is the first bit we must -+ * include in the bfloat mantissa we're creating in order to do -+ * comparisons - that bit always becomes the high bit of -+ * bfloat->mantissa, and thus the exponent we're calculating here is -+ * the position of what will become the low bit in bfloat->mantissa: -+ * -+ * Note that this may be negative - we may be running off the low end -+ * of the key: we handle this later: -+ */ -+ high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r), -+ min_t(unsigned, BKEY_MANTISSA_BITS, b->nr_key_bits) - 1); -+ exponent = high_bit - (BKEY_MANTISSA_BITS - 1); -+ -+ /* -+ * Then we calculate the actual shift value, from the start of the key -+ * (k->_data), to get the key bits starting at exponent: -+ */ -+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -+ shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent; -+ -+ EBUG_ON(shift + BKEY_MANTISSA_BITS > b->format.key_u64s * 64); -+#else -+ shift = high_bit_offset + -+ b->nr_key_bits - -+ exponent - -+ BKEY_MANTISSA_BITS; -+ -+ EBUG_ON(shift < KEY_PACKED_BITS_START); -+#endif -+ EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED); -+ -+ f->exponent = shift; -+ mantissa = bkey_mantissa(m, f, j); -+ -+ /* -+ * If we've got garbage bits, set them to all 1s - it's legal for the -+ * bfloat to compare larger than the original key, but not smaller: -+ */ -+ if (exponent < 0) -+ mantissa |= ~(~0U << -exponent); -+ -+ f->mantissa = mantissa; -+} -+ -+/* bytes remaining - only valid for last bset: */ -+static unsigned 
__bset_tree_capacity(struct btree *b, struct bset_tree *t) -+{ -+ bset_aux_tree_verify(b); -+ -+ return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64); -+} -+ -+static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t) -+{ -+ return __bset_tree_capacity(b, t) / -+ (sizeof(struct bkey_float) + sizeof(u8)); -+} -+ -+static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t) -+{ -+ return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree); -+} -+ -+static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) -+{ -+ struct bkey_packed *k; -+ -+ t->size = 1; -+ t->extra = BSET_RW_AUX_TREE_VAL; -+ rw_aux_tree(b, t)[0].offset = -+ __btree_node_key_to_offset(b, btree_bkey_first(b, t)); -+ -+ bset_tree_for_each_key(b, t, k) { -+ if (t->size == bset_rw_tree_capacity(b, t)) -+ break; -+ -+ if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) > -+ L1_CACHE_BYTES) -+ rw_aux_tree_set(b, t, t->size++, k); -+ } -+} -+ -+static void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) -+{ -+ struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t); -+ struct bkey_packed min_key, max_key; -+ unsigned j, cacheline = 1; -+ -+ /* signal to make_bfloat() that they're uninitialized: */ -+ min_key.u64s = max_key.u64s = 0; -+ -+ t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)), -+ bset_ro_tree_capacity(b, t)); -+retry: -+ if (t->size < 2) { -+ t->size = 0; -+ t->extra = BSET_NO_AUX_TREE_VAL; -+ return; -+ } -+ -+ t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1; -+ -+ /* First we figure out where the first key in each cacheline is */ -+ eytzinger1_for_each(j, t->size) { -+ while (bkey_to_cacheline(b, t, k) < cacheline) -+ prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); -+ -+ if (k >= btree_bkey_last(b, t)) { -+ /* XXX: this path sucks */ -+ t->size--; -+ goto retry; -+ } -+ -+ ro_aux_tree_prev(b, t)[j] = prev->u64s; -+ bkey_float(b, t, j)->key_offset = -+ 
bkey_to_cacheline_offset(b, t, cacheline++, k); -+ -+ EBUG_ON(tree_to_prev_bkey(b, t, j) != prev); -+ EBUG_ON(tree_to_bkey(b, t, j) != k); -+ } -+ -+ while (k != btree_bkey_last(b, t)) -+ prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); -+ -+ t->max_key = bkey_unpack_pos(b, prev); -+ -+ /* Then we build the tree */ -+ eytzinger1_for_each(j, t->size) -+ make_bfloat(b, t, j, &min_key, &max_key); -+} -+ -+static void bset_alloc_tree(struct btree *b, struct bset_tree *t) -+{ -+ struct bset_tree *i; -+ -+ for (i = b->set; i != t; i++) -+ BUG_ON(bset_has_rw_aux_tree(i)); -+ -+ bch2_bset_set_no_aux_tree(b, t); -+ -+ /* round up to next cacheline: */ -+ t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t), -+ SMP_CACHE_BYTES / sizeof(u64)); -+ -+ bset_aux_tree_verify(b); -+} -+ -+void bch2_bset_build_aux_tree(struct btree *b, struct bset_tree *t, -+ bool writeable) -+{ -+ if (writeable -+ ? bset_has_rw_aux_tree(t) -+ : bset_has_ro_aux_tree(t)) -+ return; -+ -+ bset_alloc_tree(b, t); -+ -+ if (!__bset_tree_capacity(b, t)) -+ return; -+ -+ if (writeable) -+ __build_rw_aux_tree(b, t); -+ else -+ __build_ro_aux_tree(b, t); -+ -+ bset_aux_tree_verify(b); -+} -+ -+void bch2_bset_init_first(struct btree *b, struct bset *i) -+{ -+ struct bset_tree *t; -+ -+ BUG_ON(b->nsets); -+ -+ memset(i, 0, sizeof(*i)); -+ get_random_bytes(&i->seq, sizeof(i->seq)); -+ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); -+ -+ t = &b->set[b->nsets++]; -+ set_btree_bset(b, t, i); -+} -+ -+void bch2_bset_init_next(struct bch_fs *c, struct btree *b, -+ struct btree_node_entry *bne) -+{ -+ struct bset *i = &bne->keys; -+ struct bset_tree *t; -+ -+ BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c)); -+ BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b))); -+ BUG_ON(b->nsets >= MAX_BSETS); -+ -+ memset(i, 0, sizeof(*i)); -+ i->seq = btree_bset_first(b)->seq; -+ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); -+ -+ t = &b->set[b->nsets++]; -+ set_btree_bset(b, t, i); -+} -+ -+/* -+ * 
find _some_ key in the same bset as @k that precedes @k - not necessarily the -+ * immediate predecessor: -+ */ -+static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t, -+ struct bkey_packed *k) -+{ -+ struct bkey_packed *p; -+ unsigned offset; -+ int j; -+ -+ EBUG_ON(k < btree_bkey_first(b, t) || -+ k > btree_bkey_last(b, t)); -+ -+ if (k == btree_bkey_first(b, t)) -+ return NULL; -+ -+ switch (bset_aux_tree_type(t)) { -+ case BSET_NO_AUX_TREE: -+ p = btree_bkey_first(b, t); -+ break; -+ case BSET_RO_AUX_TREE: -+ j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k)); -+ -+ do { -+ p = j ? tree_to_bkey(b, t, -+ __inorder_to_eytzinger1(j--, -+ t->size, t->extra)) -+ : btree_bkey_first(b, t); -+ } while (p >= k); -+ break; -+ case BSET_RW_AUX_TREE: -+ offset = __btree_node_key_to_offset(b, k); -+ j = rw_aux_tree_bsearch(b, t, offset); -+ p = j ? rw_aux_to_bkey(b, t, j - 1) -+ : btree_bkey_first(b, t); -+ break; -+ } -+ -+ return p; -+} -+ -+struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, -+ struct bset_tree *t, -+ struct bkey_packed *k, -+ unsigned min_key_type) -+{ -+ struct bkey_packed *p, *i, *ret = NULL, *orig_k = k; -+ -+ while ((p = __bkey_prev(b, t, k)) && !ret) { -+ for (i = p; i != k; i = bkey_next_skip_noops(i, k)) -+ if (i->type >= min_key_type) -+ ret = i; -+ -+ k = p; -+ } -+ -+ if (btree_keys_expensive_checks(b)) { -+ BUG_ON(ret >= orig_k); -+ -+ for (i = ret -+ ? 
bkey_next_skip_noops(ret, orig_k) -+ : btree_bkey_first(b, t); -+ i != orig_k; -+ i = bkey_next_skip_noops(i, orig_k)) -+ BUG_ON(i->type >= min_key_type); -+ } -+ -+ return ret; -+} -+ -+/* Insert */ -+ -+static void rw_aux_tree_fix_invalidated_key(struct btree *b, -+ struct bset_tree *t, -+ struct bkey_packed *k) -+{ -+ unsigned offset = __btree_node_key_to_offset(b, k); -+ unsigned j = rw_aux_tree_bsearch(b, t, offset); -+ -+ if (j < t->size && -+ rw_aux_tree(b, t)[j].offset == offset) -+ rw_aux_tree_set(b, t, j, k); -+ -+ bch2_bset_verify_rw_aux_tree(b, t); -+} -+ -+static void ro_aux_tree_fix_invalidated_key(struct btree *b, -+ struct bset_tree *t, -+ struct bkey_packed *k) -+{ -+ struct bkey_packed min_key, max_key; -+ unsigned inorder, j; -+ -+ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); -+ -+ /* signal to make_bfloat() that they're uninitialized: */ -+ min_key.u64s = max_key.u64s = 0; -+ -+ if (bkey_next_skip_noops(k, btree_bkey_last(b, t)) == btree_bkey_last(b, t)) { -+ t->max_key = bkey_unpack_pos(b, k); -+ -+ for (j = 1; j < t->size; j = j * 2 + 1) -+ make_bfloat(b, t, j, &min_key, &max_key); -+ } -+ -+ inorder = bkey_to_cacheline(b, t, k); -+ -+ if (inorder && -+ inorder < t->size) { -+ j = __inorder_to_eytzinger1(inorder, t->size, t->extra); -+ -+ if (k == tree_to_bkey(b, t, j)) { -+ /* Fix the node this key corresponds to */ -+ make_bfloat(b, t, j, &min_key, &max_key); -+ -+ /* Children for which this key is the right boundary */ -+ for (j = eytzinger1_left_child(j); -+ j < t->size; -+ j = eytzinger1_right_child(j)) -+ make_bfloat(b, t, j, &min_key, &max_key); -+ } -+ } -+ -+ if (inorder + 1 < t->size) { -+ j = __inorder_to_eytzinger1(inorder + 1, t->size, t->extra); -+ -+ if (k == tree_to_prev_bkey(b, t, j)) { -+ make_bfloat(b, t, j, &min_key, &max_key); -+ -+ /* Children for which this key is the left boundary */ -+ for (j = eytzinger1_right_child(j); -+ j < t->size; -+ j = eytzinger1_left_child(j)) -+ make_bfloat(b, t, j, &min_key, 
&max_key); -+ } -+ } -+} -+ -+/** -+ * bch2_bset_fix_invalidated_key() - given an existing key @k that has been -+ * modified, fix any auxiliary search tree by remaking all the nodes in the -+ * auxiliary search tree that @k corresponds to -+ */ -+void bch2_bset_fix_invalidated_key(struct btree *b, struct bkey_packed *k) -+{ -+ struct bset_tree *t = bch2_bkey_to_bset(b, k); -+ -+ switch (bset_aux_tree_type(t)) { -+ case BSET_NO_AUX_TREE: -+ break; -+ case BSET_RO_AUX_TREE: -+ ro_aux_tree_fix_invalidated_key(b, t, k); -+ break; -+ case BSET_RW_AUX_TREE: -+ rw_aux_tree_fix_invalidated_key(b, t, k); -+ break; -+ } -+} -+ -+static void bch2_bset_fix_lookup_table(struct btree *b, -+ struct bset_tree *t, -+ struct bkey_packed *_where, -+ unsigned clobber_u64s, -+ unsigned new_u64s) -+{ -+ int shift = new_u64s - clobber_u64s; -+ unsigned l, j, where = __btree_node_key_to_offset(b, _where); -+ -+ EBUG_ON(bset_has_ro_aux_tree(t)); -+ -+ if (!bset_has_rw_aux_tree(t)) -+ return; -+ -+ /* returns first entry >= where */ -+ l = rw_aux_tree_bsearch(b, t, where); -+ -+ if (!l) /* never delete first entry */ -+ l++; -+ else if (l < t->size && -+ where < t->end_offset && -+ rw_aux_tree(b, t)[l].offset == where) -+ rw_aux_tree_set(b, t, l++, _where); -+ -+ /* l now > where */ -+ -+ for (j = l; -+ j < t->size && -+ rw_aux_tree(b, t)[j].offset < where + clobber_u64s; -+ j++) -+ ; -+ -+ if (j < t->size && -+ rw_aux_tree(b, t)[j].offset + shift == -+ rw_aux_tree(b, t)[l - 1].offset) -+ j++; -+ -+ memmove(&rw_aux_tree(b, t)[l], -+ &rw_aux_tree(b, t)[j], -+ (void *) &rw_aux_tree(b, t)[t->size] - -+ (void *) &rw_aux_tree(b, t)[j]); -+ t->size -= j - l; -+ -+ for (j = l; j < t->size; j++) -+ rw_aux_tree(b, t)[j].offset += shift; -+ -+ EBUG_ON(l < t->size && -+ rw_aux_tree(b, t)[l].offset == -+ rw_aux_tree(b, t)[l - 1].offset); -+ -+ if (t->size < bset_rw_tree_capacity(b, t) && -+ (l < t->size -+ ? 
rw_aux_tree(b, t)[l].offset -+ : t->end_offset) - -+ rw_aux_tree(b, t)[l - 1].offset > -+ L1_CACHE_BYTES / sizeof(u64)) { -+ struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1); -+ struct bkey_packed *end = l < t->size -+ ? rw_aux_to_bkey(b, t, l) -+ : btree_bkey_last(b, t); -+ struct bkey_packed *k = start; -+ -+ while (1) { -+ k = bkey_next_skip_noops(k, end); -+ if (k == end) -+ break; -+ -+ if ((void *) k - (void *) start >= L1_CACHE_BYTES) { -+ memmove(&rw_aux_tree(b, t)[l + 1], -+ &rw_aux_tree(b, t)[l], -+ (void *) &rw_aux_tree(b, t)[t->size] - -+ (void *) &rw_aux_tree(b, t)[l]); -+ t->size++; -+ rw_aux_tree_set(b, t, l, k); -+ break; -+ } -+ } -+ } -+ -+ bch2_bset_verify_rw_aux_tree(b, t); -+ bset_aux_tree_verify(b); -+} -+ -+void bch2_bset_insert(struct btree *b, -+ struct btree_node_iter *iter, -+ struct bkey_packed *where, -+ struct bkey_i *insert, -+ unsigned clobber_u64s) -+{ -+ struct bkey_format *f = &b->format; -+ struct bset_tree *t = bset_tree_last(b); -+ struct bkey_packed packed, *src = bkey_to_packed(insert); -+ -+ bch2_bset_verify_rw_aux_tree(b, t); -+ bch2_verify_insert_pos(b, where, bkey_to_packed(insert), clobber_u64s); -+ -+ if (bch2_bkey_pack_key(&packed, &insert->k, f)) -+ src = &packed; -+ -+ if (!bkey_whiteout(&insert->k)) -+ btree_keys_account_key_add(&b->nr, t - b->set, src); -+ -+ if (src->u64s != clobber_u64s) { -+ u64 *src_p = where->_data + clobber_u64s; -+ u64 *dst_p = where->_data + src->u64s; -+ -+ EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) < -+ (int) clobber_u64s - src->u64s); -+ -+ memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); -+ le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s); -+ set_btree_bset_end(b, t); -+ } -+ -+ memcpy_u64s(where, src, -+ bkeyp_key_u64s(f, src)); -+ memcpy_u64s(bkeyp_val(f, where), &insert->v, -+ bkeyp_val_u64s(f, src)); -+ -+ if (src->u64s != clobber_u64s) -+ bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s); -+ -+ bch2_verify_btree_nr_keys(b); -+} -+ 
-+void bch2_bset_delete(struct btree *b, -+ struct bkey_packed *where, -+ unsigned clobber_u64s) -+{ -+ struct bset_tree *t = bset_tree_last(b); -+ u64 *src_p = where->_data + clobber_u64s; -+ u64 *dst_p = where->_data; -+ -+ bch2_bset_verify_rw_aux_tree(b, t); -+ -+ EBUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s); -+ -+ memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); -+ le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s); -+ set_btree_bset_end(b, t); -+ -+ bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, 0); -+} -+ -+/* Lookup */ -+ -+__flatten -+static struct bkey_packed *bset_search_write_set(const struct btree *b, -+ struct bset_tree *t, -+ struct bpos *search, -+ const struct bkey_packed *packed_search) -+{ -+ unsigned l = 0, r = t->size; -+ -+ while (l + 1 != r) { -+ unsigned m = (l + r) >> 1; -+ -+ if (bkey_cmp(rw_aux_tree(b, t)[m].k, *search) < 0) -+ l = m; -+ else -+ r = m; -+ } -+ -+ return rw_aux_to_bkey(b, t, l); -+} -+ -+static inline void prefetch_four_cachelines(void *p) -+{ -+#ifdef CONFIG_X86_64 -+ asm(".intel_syntax noprefix;" -+ "prefetcht0 [%0 - 127 + 64 * 0];" -+ "prefetcht0 [%0 - 127 + 64 * 1];" -+ "prefetcht0 [%0 - 127 + 64 * 2];" -+ "prefetcht0 [%0 - 127 + 64 * 3];" -+ ".att_syntax prefix;" -+ : -+ : "r" (p + 127)); -+#else -+ prefetch(p + L1_CACHE_BYTES * 0); -+ prefetch(p + L1_CACHE_BYTES * 1); -+ prefetch(p + L1_CACHE_BYTES * 2); -+ prefetch(p + L1_CACHE_BYTES * 3); -+#endif -+} -+ -+static inline bool bkey_mantissa_bits_dropped(const struct btree *b, -+ const struct bkey_float *f, -+ unsigned idx) -+{ -+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -+ unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits; -+ -+ return f->exponent > key_bits_start; -+#else -+ unsigned key_bits_end = high_bit_offset + b->nr_key_bits; -+ -+ return f->exponent + BKEY_MANTISSA_BITS < key_bits_end; -+#endif -+} -+ -+__flatten -+static struct bkey_packed *bset_search_tree(const struct btree *b, -+ struct bset_tree *t, 
-+ struct bpos *search, -+ const struct bkey_packed *packed_search) -+{ -+ struct ro_aux_tree *base = ro_aux_tree_base(b, t); -+ struct bkey_float *f; -+ struct bkey_packed *k; -+ unsigned inorder, n = 1, l, r; -+ int cmp; -+ -+ do { -+ if (likely(n << 4 < t->size)) -+ prefetch(&base->f[n << 4]); -+ -+ f = &base->f[n]; -+ -+ if (!unlikely(packed_search)) -+ goto slowpath; -+ if (unlikely(f->exponent >= BFLOAT_FAILED)) -+ goto slowpath; -+ -+ l = f->mantissa; -+ r = bkey_mantissa(packed_search, f, n); -+ -+ if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f, n)) -+ goto slowpath; -+ -+ n = n * 2 + (l < r); -+ continue; -+slowpath: -+ k = tree_to_bkey(b, t, n); -+ cmp = bkey_cmp_p_or_unp(b, k, packed_search, search); -+ if (!cmp) -+ return k; -+ -+ n = n * 2 + (cmp < 0); -+ } while (n < t->size); -+ -+ inorder = __eytzinger1_to_inorder(n >> 1, t->size, t->extra); -+ -+ /* -+ * n would have been the node we recursed to - the low bit tells us if -+ * we recursed left or recursed right. -+ */ -+ if (likely(!(n & 1))) { -+ --inorder; -+ if (unlikely(!inorder)) -+ return btree_bkey_first(b, t); -+ -+ f = &base->f[eytzinger1_prev(n >> 1, t->size)]; -+ } -+ -+ return cacheline_to_bkey(b, t, inorder, f->key_offset); -+} -+ -+static __always_inline __flatten -+struct bkey_packed *__bch2_bset_search(struct btree *b, -+ struct bset_tree *t, -+ struct bpos *search, -+ const struct bkey_packed *lossy_packed_search) -+{ -+ -+ /* -+ * First, we search for a cacheline, then lastly we do a linear search -+ * within that cacheline. -+ * -+ * To search for the cacheline, there's three different possibilities: -+ * * The set is too small to have a search tree, so we just do a linear -+ * search over the whole set. -+ * * The set is the one we're currently inserting into; keeping a full -+ * auxiliary search tree up to date would be too expensive, so we -+ * use a much simpler lookup table to do a binary search - -+ * bset_search_write_set(). 
-+ * * Or we use the auxiliary search tree we constructed earlier - -+ * bset_search_tree() -+ */ -+ -+ switch (bset_aux_tree_type(t)) { -+ case BSET_NO_AUX_TREE: -+ return btree_bkey_first(b, t); -+ case BSET_RW_AUX_TREE: -+ return bset_search_write_set(b, t, search, lossy_packed_search); -+ case BSET_RO_AUX_TREE: -+ /* -+ * Each node in the auxiliary search tree covers a certain range -+ * of bits, and keys above and below the set it covers might -+ * differ outside those bits - so we have to special case the -+ * start and end - handle that here: -+ */ -+ -+ if (bkey_cmp(*search, t->max_key) > 0) -+ return btree_bkey_last(b, t); -+ -+ return bset_search_tree(b, t, search, lossy_packed_search); -+ default: -+ unreachable(); -+ } -+} -+ -+static __always_inline __flatten -+struct bkey_packed *bch2_bset_search_linear(struct btree *b, -+ struct bset_tree *t, -+ struct bpos *search, -+ struct bkey_packed *packed_search, -+ const struct bkey_packed *lossy_packed_search, -+ struct bkey_packed *m) -+{ -+ if (lossy_packed_search) -+ while (m != btree_bkey_last(b, t) && -+ bkey_iter_cmp_p_or_unp(b, m, -+ lossy_packed_search, search) < 0) -+ m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); -+ -+ if (!packed_search) -+ while (m != btree_bkey_last(b, t) && -+ bkey_iter_pos_cmp(b, m, search) < 0) -+ m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); -+ -+ if (btree_keys_expensive_checks(b)) { -+ struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); -+ -+ BUG_ON(prev && -+ bkey_iter_cmp_p_or_unp(b, prev, -+ packed_search, search) >= 0); -+ } -+ -+ return m; -+} -+ -+/* -+ * Returns the first key greater than or equal to @search -+ */ -+static __always_inline __flatten -+struct bkey_packed *bch2_bset_search(struct btree *b, -+ struct bset_tree *t, -+ struct bpos *search, -+ struct bkey_packed *packed_search, -+ const struct bkey_packed *lossy_packed_search) -+{ -+ struct bkey_packed *m = __bch2_bset_search(b, t, search, -+ lossy_packed_search); -+ -+ return 
bch2_bset_search_linear(b, t, search, -+ packed_search, lossy_packed_search, m); -+} -+ -+/* Btree node iterator */ -+ -+static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter, -+ struct btree *b, -+ const struct bkey_packed *k, -+ const struct bkey_packed *end) -+{ -+ if (k != end) { -+ struct btree_node_iter_set *pos; -+ -+ btree_node_iter_for_each(iter, pos) -+ ; -+ -+ BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data)); -+ *pos = (struct btree_node_iter_set) { -+ __btree_node_key_to_offset(b, k), -+ __btree_node_key_to_offset(b, end) -+ }; -+ } -+} -+ -+void bch2_btree_node_iter_push(struct btree_node_iter *iter, -+ struct btree *b, -+ const struct bkey_packed *k, -+ const struct bkey_packed *end) -+{ -+ __bch2_btree_node_iter_push(iter, b, k, end); -+ bch2_btree_node_iter_sort(iter, b); -+} -+ -+noinline __flatten __attribute__((cold)) -+static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, -+ struct btree *b, struct bpos *search) -+{ -+ struct bset_tree *t; -+ -+ trace_bkey_pack_pos_fail(search); -+ -+ for_each_bset(b, t) -+ __bch2_btree_node_iter_push(iter, b, -+ bch2_bset_search(b, t, search, NULL, NULL), -+ btree_bkey_last(b, t)); -+ -+ bch2_btree_node_iter_sort(iter, b); -+} -+ -+/** -+ * bch_btree_node_iter_init - initialize a btree node iterator, starting from a -+ * given position -+ * -+ * Main entry point to the lookup code for individual btree nodes: -+ * -+ * NOTE: -+ * -+ * When you don't filter out deleted keys, btree nodes _do_ contain duplicate -+ * keys. This doesn't matter for most code, but it does matter for lookups. -+ * -+ * Some adjacent keys with a string of equal keys: -+ * i j k k k k l m -+ * -+ * If you search for k, the lookup code isn't guaranteed to return you any -+ * specific k. The lookup code is conceptually doing a binary search and -+ * iterating backwards is very expensive so if the pivot happens to land at the -+ * last k that's what you'll get. 
-+ * -+ * This works out ok, but it's something to be aware of: -+ * -+ * - For non extents, we guarantee that the live key comes last - see -+ * btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't -+ * see will only be deleted keys you don't care about. -+ * -+ * - For extents, deleted keys sort last (see the comment at the top of this -+ * file). But when you're searching for extents, you actually want the first -+ * key strictly greater than your search key - an extent that compares equal -+ * to the search key is going to have 0 sectors after the search key. -+ * -+ * But this does mean that we can't just search for -+ * bkey_successor(start_of_range) to get the first extent that overlaps with -+ * the range we want - if we're unlucky and there's an extent that ends -+ * exactly where we searched, then there could be a deleted key at the same -+ * position and we'd get that when we search instead of the preceding extent -+ * we needed. -+ * -+ * So we've got to search for start_of_range, then after the lookup iterate -+ * past any extents that compare equal to the position we searched for. 
-+ */ -+__flatten -+void bch2_btree_node_iter_init(struct btree_node_iter *iter, -+ struct btree *b, struct bpos *search) -+{ -+ struct bkey_packed p, *packed_search = NULL; -+ struct btree_node_iter_set *pos = iter->data; -+ struct bkey_packed *k[MAX_BSETS]; -+ unsigned i; -+ -+ EBUG_ON(bkey_cmp(*search, b->data->min_key) < 0); -+ bset_aux_tree_verify(b); -+ -+ memset(iter, 0, sizeof(*iter)); -+ -+ switch (bch2_bkey_pack_pos_lossy(&p, *search, b)) { -+ case BKEY_PACK_POS_EXACT: -+ packed_search = &p; -+ break; -+ case BKEY_PACK_POS_SMALLER: -+ packed_search = NULL; -+ break; -+ case BKEY_PACK_POS_FAIL: -+ btree_node_iter_init_pack_failed(iter, b, search); -+ return; -+ } -+ -+ for (i = 0; i < b->nsets; i++) { -+ k[i] = __bch2_bset_search(b, b->set + i, search, &p); -+ prefetch_four_cachelines(k[i]); -+ } -+ -+ for (i = 0; i < b->nsets; i++) { -+ struct bset_tree *t = b->set + i; -+ struct bkey_packed *end = btree_bkey_last(b, t); -+ -+ k[i] = bch2_bset_search_linear(b, t, search, -+ packed_search, &p, k[i]); -+ if (k[i] != end) -+ *pos++ = (struct btree_node_iter_set) { -+ __btree_node_key_to_offset(b, k[i]), -+ __btree_node_key_to_offset(b, end) -+ }; -+ } -+ -+ bch2_btree_node_iter_sort(iter, b); -+} -+ -+void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter, -+ struct btree *b) -+{ -+ struct bset_tree *t; -+ -+ memset(iter, 0, sizeof(*iter)); -+ -+ for_each_bset(b, t) -+ __bch2_btree_node_iter_push(iter, b, -+ btree_bkey_first(b, t), -+ btree_bkey_last(b, t)); -+ bch2_btree_node_iter_sort(iter, b); -+} -+ -+struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter, -+ struct btree *b, -+ struct bset_tree *t) -+{ -+ struct btree_node_iter_set *set; -+ -+ btree_node_iter_for_each(iter, set) -+ if (set->end == t->end_offset) -+ return __btree_node_offset_to_key(b, set->k); -+ -+ return btree_bkey_last(b, t); -+} -+ -+static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter, -+ struct btree *b, -+ unsigned 
first) -+{ -+ bool ret; -+ -+ if ((ret = (btree_node_iter_cmp(b, -+ iter->data[first], -+ iter->data[first + 1]) > 0))) -+ swap(iter->data[first], iter->data[first + 1]); -+ return ret; -+} -+ -+void bch2_btree_node_iter_sort(struct btree_node_iter *iter, -+ struct btree *b) -+{ -+ /* unrolled bubble sort: */ -+ -+ if (!__btree_node_iter_set_end(iter, 2)) { -+ btree_node_iter_sort_two(iter, b, 0); -+ btree_node_iter_sort_two(iter, b, 1); -+ } -+ -+ if (!__btree_node_iter_set_end(iter, 1)) -+ btree_node_iter_sort_two(iter, b, 0); -+} -+ -+void bch2_btree_node_iter_set_drop(struct btree_node_iter *iter, -+ struct btree_node_iter_set *set) -+{ -+ struct btree_node_iter_set *last = -+ iter->data + ARRAY_SIZE(iter->data) - 1; -+ -+ memmove(&set[0], &set[1], (void *) last - (void *) set); -+ *last = (struct btree_node_iter_set) { 0, 0 }; -+} -+ -+static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, -+ struct btree *b) -+{ -+ iter->data->k += __bch2_btree_node_iter_peek_all(iter, b)->u64s; -+ -+ EBUG_ON(iter->data->k > iter->data->end); -+ -+ while (!__btree_node_iter_set_end(iter, 0) && -+ !__bch2_btree_node_iter_peek_all(iter, b)->u64s) -+ iter->data->k++; -+ -+ if (unlikely(__btree_node_iter_set_end(iter, 0))) { -+ bch2_btree_node_iter_set_drop(iter, iter->data); -+ return; -+ } -+ -+ if (__btree_node_iter_set_end(iter, 1)) -+ return; -+ -+ if (!btree_node_iter_sort_two(iter, b, 0)) -+ return; -+ -+ if (__btree_node_iter_set_end(iter, 2)) -+ return; -+ -+ btree_node_iter_sort_two(iter, b, 1); -+} -+ -+void bch2_btree_node_iter_advance(struct btree_node_iter *iter, -+ struct btree *b) -+{ -+ if (btree_keys_expensive_checks(b)) { -+ bch2_btree_node_iter_verify(iter, b); -+ bch2_btree_node_iter_next_check(iter, b); -+ } -+ -+ __bch2_btree_node_iter_advance(iter, b); -+} -+ -+/* -+ * Expensive: -+ */ -+struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, -+ struct btree *b) -+{ -+ struct bkey_packed *k, *prev = NULL; -+ 
struct btree_node_iter_set *set; -+ struct bset_tree *t; -+ unsigned end = 0; -+ -+ if (btree_keys_expensive_checks(b)) -+ bch2_btree_node_iter_verify(iter, b); -+ -+ for_each_bset(b, t) { -+ k = bch2_bkey_prev_all(b, t, -+ bch2_btree_node_iter_bset_pos(iter, b, t)); -+ if (k && -+ (!prev || bkey_iter_cmp(b, k, prev) > 0)) { -+ prev = k; -+ end = t->end_offset; -+ } -+ } -+ -+ if (!prev) -+ return NULL; -+ -+ /* -+ * We're manually memmoving instead of just calling sort() to ensure the -+ * prev we picked ends up in slot 0 - sort won't necessarily put it -+ * there because of duplicate deleted keys: -+ */ -+ btree_node_iter_for_each(iter, set) -+ if (set->end == end) -+ goto found; -+ -+ BUG_ON(set != &iter->data[__btree_node_iter_used(iter)]); -+found: -+ BUG_ON(set >= iter->data + ARRAY_SIZE(iter->data)); -+ -+ memmove(&iter->data[1], -+ &iter->data[0], -+ (void *) set - (void *) &iter->data[0]); -+ -+ iter->data[0].k = __btree_node_key_to_offset(b, prev); -+ iter->data[0].end = end; -+ -+ if (btree_keys_expensive_checks(b)) -+ bch2_btree_node_iter_verify(iter, b); -+ return prev; -+} -+ -+struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *iter, -+ struct btree *b, -+ unsigned min_key_type) -+{ -+ struct bkey_packed *prev; -+ -+ do { -+ prev = bch2_btree_node_iter_prev_all(iter, b); -+ } while (prev && prev->type < min_key_type); -+ -+ return prev; -+} -+ -+struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter, -+ struct btree *b, -+ struct bkey *u) -+{ -+ struct bkey_packed *k = bch2_btree_node_iter_peek(iter, b); -+ -+ return k ? 
bkey_disassemble(b, k, u) : bkey_s_c_null; -+} -+ -+/* Mergesort */ -+ -+void bch2_btree_keys_stats(struct btree *b, struct bset_stats *stats) -+{ -+ struct bset_tree *t; -+ -+ for_each_bset(b, t) { -+ enum bset_aux_tree_type type = bset_aux_tree_type(t); -+ size_t j; -+ -+ stats->sets[type].nr++; -+ stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) * -+ sizeof(u64); -+ -+ if (bset_has_ro_aux_tree(t)) { -+ stats->floats += t->size - 1; -+ -+ for (j = 1; j < t->size; j++) -+ stats->failed += -+ bkey_float(b, t, j)->exponent == -+ BFLOAT_FAILED; -+ } -+ } -+} -+ -+void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, -+ struct bkey_packed *k) -+{ -+ struct bset_tree *t = bch2_bkey_to_bset(b, k); -+ struct bkey uk; -+ unsigned j, inorder; -+ -+ if (out->pos != out->end) -+ *out->pos = '\0'; -+ -+ if (!bset_has_ro_aux_tree(t)) -+ return; -+ -+ inorder = bkey_to_cacheline(b, t, k); -+ if (!inorder || inorder >= t->size) -+ return; -+ -+ j = __inorder_to_eytzinger1(inorder, t->size, t->extra); -+ if (k != tree_to_bkey(b, t, j)) -+ return; -+ -+ switch (bkey_float(b, t, j)->exponent) { -+ case BFLOAT_FAILED: -+ uk = bkey_unpack_key(b, k); -+ pr_buf(out, -+ " failed unpacked at depth %u\n" -+ "\t%llu:%llu\n", -+ ilog2(j), -+ uk.p.inode, uk.p.offset); -+ break; -+ } -+} -diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h -new file mode 100644 -index 000000000000..5921cf689105 ---- /dev/null -+++ b/fs/bcachefs/bset.h -@@ -0,0 +1,661 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BSET_H -+#define _BCACHEFS_BSET_H -+ -+#include -+#include -+ -+#include "bcachefs_format.h" -+#include "bkey.h" -+#include "bkey_methods.h" -+#include "btree_types.h" -+#include "util.h" /* for time_stats */ -+#include "vstructs.h" -+ -+/* -+ * BKEYS: -+ * -+ * A bkey contains a key, a size field, a variable number of pointers, and some -+ * ancillary flag bits. -+ * -+ * We use two different functions for validating bkeys, bkey_invalid and -+ * bkey_deleted(). 
-+ * -+ * The one exception to the rule that ptr_invalid() filters out invalid keys is -+ * that it also filters out keys of size 0 - these are keys that have been -+ * completely overwritten. It'd be safe to delete these in memory while leaving -+ * them on disk, just unnecessary work - so we filter them out when resorting -+ * instead. -+ * -+ * We can't filter out stale keys when we're resorting, because garbage -+ * collection needs to find them to ensure bucket gens don't wrap around - -+ * unless we're rewriting the btree node those stale keys still exist on disk. -+ * -+ * We also implement functions here for removing some number of sectors from the -+ * front or the back of a bkey - this is mainly used for fixing overlapping -+ * extents, by removing the overlapping sectors from the older key. -+ * -+ * BSETS: -+ * -+ * A bset is an array of bkeys laid out contiguously in memory in sorted order, -+ * along with a header. A btree node is made up of a number of these, written at -+ * different times. -+ * -+ * There could be many of them on disk, but we never allow there to be more than -+ * 4 in memory - we lazily resort as needed. -+ * -+ * We implement code here for creating and maintaining auxiliary search trees -+ * (described below) for searching an individial bset, and on top of that we -+ * implement a btree iterator. -+ * -+ * BTREE ITERATOR: -+ * -+ * Most of the code in bcache doesn't care about an individual bset - it needs -+ * to search entire btree nodes and iterate over them in sorted order. -+ * -+ * The btree iterator code serves both functions; it iterates through the keys -+ * in a btree node in sorted order, starting from either keys after a specific -+ * point (if you pass it a search key) or the start of the btree node. -+ * -+ * AUXILIARY SEARCH TREES: -+ * -+ * Since keys are variable length, we can't use a binary search on a bset - we -+ * wouldn't be able to find the start of the next key. 
But binary searches are -+ * slow anyways, due to terrible cache behaviour; bcache originally used binary -+ * searches and that code topped out at under 50k lookups/second. -+ * -+ * So we need to construct some sort of lookup table. Since we only insert keys -+ * into the last (unwritten) set, most of the keys within a given btree node are -+ * usually in sets that are mostly constant. We use two different types of -+ * lookup tables to take advantage of this. -+ * -+ * Both lookup tables share in common that they don't index every key in the -+ * set; they index one key every BSET_CACHELINE bytes, and then a linear search -+ * is used for the rest. -+ * -+ * For sets that have been written to disk and are no longer being inserted -+ * into, we construct a binary search tree in an array - traversing a binary -+ * search tree in an array gives excellent locality of reference and is very -+ * fast, since both children of any node are adjacent to each other in memory -+ * (and their grandchildren, and great grandchildren...) - this means -+ * prefetching can be used to great effect. -+ * -+ * It's quite useful performance wise to keep these nodes small - not just -+ * because they're more likely to be in L2, but also because we can prefetch -+ * more nodes on a single cacheline and thus prefetch more iterations in advance -+ * when traversing this tree. -+ * -+ * Nodes in the auxiliary search tree must contain both a key to compare against -+ * (we don't want to fetch the key from the set, that would defeat the purpose), -+ * and a pointer to the key. We use a few tricks to compress both of these. -+ * -+ * To compress the pointer, we take advantage of the fact that one node in the -+ * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have -+ * a function (to_inorder()) that takes the index of a node in a binary tree and -+ * returns what its index would be in an inorder traversal, so we only have to -+ * store the low bits of the offset. 
-+ * -+ * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To -+ * compress that, we take advantage of the fact that when we're traversing the -+ * search tree at every iteration we know that both our search key and the key -+ * we're looking for lie within some range - bounded by our previous -+ * comparisons. (We special case the start of a search so that this is true even -+ * at the root of the tree). -+ * -+ * So we know the key we're looking for is between a and b, and a and b don't -+ * differ higher than bit 50, we don't need to check anything higher than bit -+ * 50. -+ * -+ * We don't usually need the rest of the bits, either; we only need enough bits -+ * to partition the key range we're currently checking. Consider key n - the -+ * key our auxiliary search tree node corresponds to, and key p, the key -+ * immediately preceding n. The lowest bit we need to store in the auxiliary -+ * search tree is the highest bit that differs between n and p. -+ * -+ * Note that this could be bit 0 - we might sometimes need all 80 bits to do the -+ * comparison. But we'd really like our nodes in the auxiliary search tree to be -+ * of fixed size. -+ * -+ * The solution is to make them fixed size, and when we're constructing a node -+ * check if p and n differed in the bits we needed them to. If they don't we -+ * flag that node, and when doing lookups we fallback to comparing against the -+ * real key. As long as this doesn't happen to often (and it seems to reliably -+ * happen a bit less than 1% of the time), we win - even on failures, that key -+ * is then more likely to be in cache than if we were doing binary searches all -+ * the way, since we're touching so much less memory. -+ * -+ * The keys in the auxiliary search tree are stored in (software) floating -+ * point, with an exponent and a mantissa. 
The exponent needs to be big enough -+ * to address all the bits in the original key, but the number of bits in the -+ * mantissa is somewhat arbitrary; more bits just gets us fewer failures. -+ * -+ * We need 7 bits for the exponent and 3 bits for the key's offset (since keys -+ * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes. -+ * We need one node per 128 bytes in the btree node, which means the auxiliary -+ * search trees take up 3% as much memory as the btree itself. -+ * -+ * Constructing these auxiliary search trees is moderately expensive, and we -+ * don't want to be constantly rebuilding the search tree for the last set -+ * whenever we insert another key into it. For the unwritten set, we use a much -+ * simpler lookup table - it's just a flat array, so index i in the lookup table -+ * corresponds to the i range of BSET_CACHELINE bytes in the set. Indexing -+ * within each byte range works the same as with the auxiliary search trees. -+ * -+ * These are much easier to keep up to date when we insert a key - we do it -+ * somewhat lazily; when we shift a key up we usually just increment the pointer -+ * to it, only when it would overflow do we go to the trouble of finding the -+ * first key in that range of bytes again. 
-+ */ -+ -+extern bool bch2_expensive_debug_checks; -+ -+static inline bool btree_keys_expensive_checks(const struct btree *b) -+{ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ return bch2_expensive_debug_checks || *b->expensive_debug_checks; -+#else -+ return false; -+#endif -+} -+ -+enum bset_aux_tree_type { -+ BSET_NO_AUX_TREE, -+ BSET_RO_AUX_TREE, -+ BSET_RW_AUX_TREE, -+}; -+ -+#define BSET_TREE_NR_TYPES 3 -+ -+#define BSET_NO_AUX_TREE_VAL (U16_MAX) -+#define BSET_RW_AUX_TREE_VAL (U16_MAX - 1) -+ -+static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree *t) -+{ -+ switch (t->extra) { -+ case BSET_NO_AUX_TREE_VAL: -+ EBUG_ON(t->size); -+ return BSET_NO_AUX_TREE; -+ case BSET_RW_AUX_TREE_VAL: -+ EBUG_ON(!t->size); -+ return BSET_RW_AUX_TREE; -+ default: -+ EBUG_ON(!t->size); -+ return BSET_RO_AUX_TREE; -+ } -+} -+ -+/* -+ * BSET_CACHELINE was originally intended to match the hardware cacheline size - -+ * it used to be 64, but I realized the lookup code would touch slightly less -+ * memory if it was 128. -+ * -+ * It definites the number of bytes (in struct bset) per struct bkey_float in -+ * the auxiliar search tree - when we're done searching the bset_float tree we -+ * have this many bytes left that we do a linear search over. -+ * -+ * Since (after level 5) every level of the bset_tree is on a new cacheline, -+ * we're touching one fewer cacheline in the bset tree in exchange for one more -+ * cacheline in the linear search - but the linear search might stop before it -+ * gets to the second cacheline. 
-+ */ -+ -+#define BSET_CACHELINE 128 -+ -+static inline size_t btree_keys_cachelines(struct btree *b) -+{ -+ return (1U << b->byte_order) / BSET_CACHELINE; -+} -+ -+static inline size_t btree_aux_data_bytes(struct btree *b) -+{ -+ return btree_keys_cachelines(b) * 8; -+} -+ -+static inline size_t btree_aux_data_u64s(struct btree *b) -+{ -+ return btree_aux_data_bytes(b) / sizeof(u64); -+} -+ -+typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *); -+ -+static inline void -+__bkey_unpack_key_format_checked(const struct btree *b, -+ struct bkey *dst, -+ const struct bkey_packed *src) -+{ -+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK -+ { -+ compiled_unpack_fn unpack_fn = b->aux_data; -+ unpack_fn(dst, src); -+ -+ if (btree_keys_expensive_checks(b)) { -+ struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); -+ -+ BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); -+ } -+ } -+#else -+ *dst = __bch2_bkey_unpack_key(&b->format, src); -+#endif -+} -+ -+static inline struct bkey -+bkey_unpack_key_format_checked(const struct btree *b, -+ const struct bkey_packed *src) -+{ -+ struct bkey dst; -+ -+ __bkey_unpack_key_format_checked(b, &dst, src); -+ return dst; -+} -+ -+static inline void __bkey_unpack_key(const struct btree *b, -+ struct bkey *dst, -+ const struct bkey_packed *src) -+{ -+ if (likely(bkey_packed(src))) -+ __bkey_unpack_key_format_checked(b, dst, src); -+ else -+ *dst = *packed_to_bkey_c(src); -+} -+ -+/** -+ * bkey_unpack_key -- unpack just the key, not the value -+ */ -+static inline struct bkey bkey_unpack_key(const struct btree *b, -+ const struct bkey_packed *src) -+{ -+ return likely(bkey_packed(src)) -+ ? 
bkey_unpack_key_format_checked(b, src) -+ : *packed_to_bkey_c(src); -+} -+ -+static inline struct bpos -+bkey_unpack_pos_format_checked(const struct btree *b, -+ const struct bkey_packed *src) -+{ -+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK -+ return bkey_unpack_key_format_checked(b, src).p; -+#else -+ return __bkey_unpack_pos(&b->format, src); -+#endif -+} -+ -+static inline struct bpos bkey_unpack_pos(const struct btree *b, -+ const struct bkey_packed *src) -+{ -+ return likely(bkey_packed(src)) -+ ? bkey_unpack_pos_format_checked(b, src) -+ : packed_to_bkey_c(src)->p; -+} -+ -+/* Disassembled bkeys */ -+ -+static inline struct bkey_s_c bkey_disassemble(struct btree *b, -+ const struct bkey_packed *k, -+ struct bkey *u) -+{ -+ __bkey_unpack_key(b, u, k); -+ -+ return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), }; -+} -+ -+/* non const version: */ -+static inline struct bkey_s __bkey_disassemble(struct btree *b, -+ struct bkey_packed *k, -+ struct bkey *u) -+{ -+ __bkey_unpack_key(b, u, k); -+ -+ return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), }; -+} -+ -+#define for_each_bset(_b, _t) \ -+ for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) -+ -+#define bset_tree_for_each_key(_b, _t, _k) \ -+ for (_k = btree_bkey_first(_b, _t); \ -+ _k != btree_bkey_last(_b, _t); \ -+ _k = bkey_next_skip_noops(_k, btree_bkey_last(_b, _t))) -+ -+static inline bool bset_has_ro_aux_tree(struct bset_tree *t) -+{ -+ return bset_aux_tree_type(t) == BSET_RO_AUX_TREE; -+} -+ -+static inline bool bset_has_rw_aux_tree(struct bset_tree *t) -+{ -+ return bset_aux_tree_type(t) == BSET_RW_AUX_TREE; -+} -+ -+static inline void bch2_bset_set_no_aux_tree(struct btree *b, -+ struct bset_tree *t) -+{ -+ BUG_ON(t < b->set); -+ -+ for (; t < b->set + ARRAY_SIZE(b->set); t++) { -+ t->size = 0; -+ t->extra = BSET_NO_AUX_TREE_VAL; -+ t->aux_data_offset = U16_MAX; -+ } -+} -+ -+static inline void btree_node_set_format(struct btree *b, -+ struct bkey_format f) -+{ -+ int len; -+ -+ 
b->format = f; -+ b->nr_key_bits = bkey_format_key_bits(&f); -+ -+ len = bch2_compile_bkey_format(&b->format, b->aux_data); -+ BUG_ON(len < 0 || len > U8_MAX); -+ -+ b->unpack_fn_len = len; -+ -+ bch2_bset_set_no_aux_tree(b, b->set); -+} -+ -+static inline struct bset *bset_next_set(struct btree *b, -+ unsigned block_bytes) -+{ -+ struct bset *i = btree_bset_last(b); -+ -+ EBUG_ON(!is_power_of_2(block_bytes)); -+ -+ return ((void *) i) + round_up(vstruct_bytes(i), block_bytes); -+} -+ -+void bch2_btree_keys_init(struct btree *, bool *); -+ -+void bch2_bset_init_first(struct btree *, struct bset *); -+void bch2_bset_init_next(struct bch_fs *, struct btree *, -+ struct btree_node_entry *); -+void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool); -+void bch2_bset_fix_invalidated_key(struct btree *, struct bkey_packed *); -+ -+void bch2_bset_insert(struct btree *, struct btree_node_iter *, -+ struct bkey_packed *, struct bkey_i *, unsigned); -+void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned); -+ -+/* Bkey utility code */ -+ -+/* packed or unpacked */ -+static inline int bkey_cmp_p_or_unp(const struct btree *b, -+ const struct bkey_packed *l, -+ const struct bkey_packed *r_packed, -+ const struct bpos *r) -+{ -+ EBUG_ON(r_packed && !bkey_packed(r_packed)); -+ -+ if (unlikely(!bkey_packed(l))) -+ return bkey_cmp(packed_to_bkey_c(l)->p, *r); -+ -+ if (likely(r_packed)) -+ return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b); -+ -+ return __bch2_bkey_cmp_left_packed_format_checked(b, l, r); -+} -+ -+struct bset_tree *bch2_bkey_to_bset(struct btree *, struct bkey_packed *); -+ -+struct bkey_packed *bch2_bkey_prev_filter(struct btree *, struct bset_tree *, -+ struct bkey_packed *, unsigned); -+ -+static inline struct bkey_packed * -+bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, struct bkey_packed *k) -+{ -+ return bch2_bkey_prev_filter(b, t, k, 0); -+} -+ -+static inline struct bkey_packed * -+bch2_bkey_prev(struct 
btree *b, struct bset_tree *t, struct bkey_packed *k) -+{ -+ return bch2_bkey_prev_filter(b, t, k, KEY_TYPE_discard + 1); -+} -+ -+enum bch_extent_overlap { -+ BCH_EXTENT_OVERLAP_ALL = 0, -+ BCH_EXTENT_OVERLAP_BACK = 1, -+ BCH_EXTENT_OVERLAP_FRONT = 2, -+ BCH_EXTENT_OVERLAP_MIDDLE = 3, -+}; -+ -+/* Returns how k overlaps with m */ -+static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k, -+ const struct bkey *m) -+{ -+ int cmp1 = bkey_cmp(k->p, m->p) < 0; -+ int cmp2 = bkey_cmp(bkey_start_pos(k), -+ bkey_start_pos(m)) > 0; -+ -+ return (cmp1 << 1) + cmp2; -+} -+ -+/* Btree key iteration */ -+ -+void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *, -+ const struct bkey_packed *, -+ const struct bkey_packed *); -+void bch2_btree_node_iter_init(struct btree_node_iter *, struct btree *, -+ struct bpos *); -+void bch2_btree_node_iter_init_from_start(struct btree_node_iter *, -+ struct btree *); -+struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *, -+ struct btree *, -+ struct bset_tree *); -+ -+void bch2_btree_node_iter_sort(struct btree_node_iter *, struct btree *); -+void bch2_btree_node_iter_set_drop(struct btree_node_iter *, -+ struct btree_node_iter_set *); -+void bch2_btree_node_iter_advance(struct btree_node_iter *, struct btree *); -+ -+#define btree_node_iter_for_each(_iter, _set) \ -+ for (_set = (_iter)->data; \ -+ _set < (_iter)->data + ARRAY_SIZE((_iter)->data) && \ -+ (_set)->k != (_set)->end; \ -+ _set++) -+ -+static inline bool __btree_node_iter_set_end(struct btree_node_iter *iter, -+ unsigned i) -+{ -+ return iter->data[i].k == iter->data[i].end; -+} -+ -+static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter) -+{ -+ return __btree_node_iter_set_end(iter, 0); -+} -+ -+/* -+ * When keys compare equal, deleted keys compare first: -+ * -+ * XXX: only need to compare pointers for keys that are both within a -+ * btree_node_iterator - we need to break ties for prev() to 
work correctly -+ */ -+static inline int bkey_iter_cmp(const struct btree *b, -+ const struct bkey_packed *l, -+ const struct bkey_packed *r) -+{ -+ return bkey_cmp_packed(b, l, r) -+ ?: (int) bkey_deleted(r) - (int) bkey_deleted(l) -+ ?: cmp_int(l, r); -+} -+ -+static inline int btree_node_iter_cmp(const struct btree *b, -+ struct btree_node_iter_set l, -+ struct btree_node_iter_set r) -+{ -+ return bkey_iter_cmp(b, -+ __btree_node_offset_to_key(b, l.k), -+ __btree_node_offset_to_key(b, r.k)); -+} -+ -+/* These assume r (the search key) is not a deleted key: */ -+static inline int bkey_iter_pos_cmp(const struct btree *b, -+ const struct bkey_packed *l, -+ const struct bpos *r) -+{ -+ return bkey_cmp_left_packed(b, l, r) -+ ?: -((int) bkey_deleted(l)); -+} -+ -+static inline int bkey_iter_cmp_p_or_unp(const struct btree *b, -+ const struct bkey_packed *l, -+ const struct bkey_packed *r_packed, -+ const struct bpos *r) -+{ -+ return bkey_cmp_p_or_unp(b, l, r_packed, r) -+ ?: -((int) bkey_deleted(l)); -+} -+ -+static inline struct bkey_packed * -+__bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, -+ struct btree *b) -+{ -+ return __btree_node_offset_to_key(b, iter->data->k); -+} -+ -+static inline struct bkey_packed * -+bch2_btree_node_iter_peek_filter(struct btree_node_iter *iter, -+ struct btree *b, -+ unsigned min_key_type) -+{ -+ while (!bch2_btree_node_iter_end(iter)) { -+ struct bkey_packed *k = __bch2_btree_node_iter_peek_all(iter, b); -+ -+ if (k->type >= min_key_type) -+ return k; -+ -+ bch2_btree_node_iter_advance(iter, b); -+ } -+ -+ return NULL; -+} -+ -+static inline struct bkey_packed * -+bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, -+ struct btree *b) -+{ -+ return bch2_btree_node_iter_peek_filter(iter, b, 0); -+} -+ -+static inline struct bkey_packed * -+bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b) -+{ -+ return bch2_btree_node_iter_peek_filter(iter, b, KEY_TYPE_discard + 1); -+} -+ -+static inline 
struct bkey_packed * -+bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b) -+{ -+ struct bkey_packed *ret = bch2_btree_node_iter_peek_all(iter, b); -+ -+ if (ret) -+ bch2_btree_node_iter_advance(iter, b); -+ -+ return ret; -+} -+ -+struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *, -+ struct btree *); -+struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *, -+ struct btree *, unsigned); -+ -+static inline struct bkey_packed * -+bch2_btree_node_iter_prev(struct btree_node_iter *iter, struct btree *b) -+{ -+ return bch2_btree_node_iter_prev_filter(iter, b, KEY_TYPE_discard + 1); -+} -+ -+struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *, -+ struct btree *, -+ struct bkey *); -+ -+#define for_each_btree_node_key_unpack(b, k, iter, unpacked) \ -+ for (bch2_btree_node_iter_init_from_start((iter), (b)); \ -+ (k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\ -+ bch2_btree_node_iter_advance(iter, b)) -+ -+/* Accounting: */ -+ -+static inline void btree_keys_account_key(struct btree_nr_keys *n, -+ unsigned bset, -+ struct bkey_packed *k, -+ int sign) -+{ -+ n->live_u64s += k->u64s * sign; -+ n->bset_u64s[bset] += k->u64s * sign; -+ -+ if (bkey_packed(k)) -+ n->packed_keys += sign; -+ else -+ n->unpacked_keys += sign; -+} -+ -+static inline void btree_keys_account_val_delta(struct btree *b, -+ struct bkey_packed *k, -+ int delta) -+{ -+ struct bset_tree *t = bch2_bkey_to_bset(b, k); -+ -+ b->nr.live_u64s += delta; -+ b->nr.bset_u64s[t - b->set] += delta; -+} -+ -+#define btree_keys_account_key_add(_nr, _bset_idx, _k) \ -+ btree_keys_account_key(_nr, _bset_idx, _k, 1) -+#define btree_keys_account_key_drop(_nr, _bset_idx, _k) \ -+ btree_keys_account_key(_nr, _bset_idx, _k, -1) -+ -+#define btree_account_key_add(_b, _k) \ -+ btree_keys_account_key(&(_b)->nr, \ -+ bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, 1) -+#define btree_account_key_drop(_b, _k) \ -+ 
btree_keys_account_key(&(_b)->nr, \ -+ bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, -1) -+ -+struct bset_stats { -+ struct { -+ size_t nr, bytes; -+ } sets[BSET_TREE_NR_TYPES]; -+ -+ size_t floats; -+ size_t failed; -+}; -+ -+void bch2_btree_keys_stats(struct btree *, struct bset_stats *); -+void bch2_bfloat_to_text(struct printbuf *, struct btree *, -+ struct bkey_packed *); -+ -+/* Debug stuff */ -+ -+void bch2_dump_bset(struct bch_fs *, struct btree *, struct bset *, unsigned); -+void bch2_dump_btree_node(struct bch_fs *, struct btree *); -+void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *); -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ -+void __bch2_verify_btree_nr_keys(struct btree *); -+void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *); -+void bch2_verify_insert_pos(struct btree *, struct bkey_packed *, -+ struct bkey_packed *, unsigned); -+ -+#else -+ -+static inline void __bch2_verify_btree_nr_keys(struct btree *b) {} -+static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter, -+ struct btree *b) {} -+static inline void bch2_verify_insert_pos(struct btree *b, -+ struct bkey_packed *where, -+ struct bkey_packed *insert, -+ unsigned clobber_u64s) {} -+#endif -+ -+static inline void bch2_verify_btree_nr_keys(struct btree *b) -+{ -+ if (btree_keys_expensive_checks(b)) -+ __bch2_verify_btree_nr_keys(b); -+} -+ -+#endif /* _BCACHEFS_BSET_H */ -diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c -new file mode 100644 -index 000000000000..736671112861 ---- /dev/null -+++ b/fs/bcachefs/btree_cache.c -@@ -0,0 +1,1057 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "btree_cache.h" -+#include "btree_io.h" -+#include "btree_iter.h" -+#include "btree_locking.h" -+#include "debug.h" -+ -+#include -+#include -+#include -+ -+const char * const bch2_btree_ids[] = { -+#define x(kwd, val, name) name, -+ BCH_BTREE_IDS() -+#undef x -+ NULL -+}; -+ -+void 
bch2_recalc_btree_reserve(struct bch_fs *c) -+{ -+ unsigned i, reserve = 16; -+ -+ if (!c->btree_roots[0].b) -+ reserve += 8; -+ -+ for (i = 0; i < BTREE_ID_NR; i++) -+ if (c->btree_roots[i].b) -+ reserve += min_t(unsigned, 1, -+ c->btree_roots[i].b->c.level) * 8; -+ -+ c->btree_cache.reserve = reserve; -+} -+ -+static inline unsigned btree_cache_can_free(struct btree_cache *bc) -+{ -+ return max_t(int, 0, bc->used - bc->reserve); -+} -+ -+static void __btree_node_data_free(struct bch_fs *c, struct btree *b) -+{ -+ EBUG_ON(btree_node_write_in_flight(b)); -+ -+ kvpfree(b->data, btree_bytes(c)); -+ b->data = NULL; -+ vfree(b->aux_data); -+ b->aux_data = NULL; -+} -+ -+static void btree_node_data_free(struct bch_fs *c, struct btree *b) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ -+ __btree_node_data_free(c, b); -+ bc->used--; -+ list_move(&b->list, &bc->freed); -+} -+ -+static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, -+ const void *obj) -+{ -+ const struct btree *b = obj; -+ const u64 *v = arg->key; -+ -+ return b->hash_val == *v ? 
0 : 1; -+} -+ -+static const struct rhashtable_params bch_btree_cache_params = { -+ .head_offset = offsetof(struct btree, hash), -+ .key_offset = offsetof(struct btree, hash_val), -+ .key_len = sizeof(u64), -+ .obj_cmpfn = bch2_btree_cache_cmp_fn, -+}; -+ -+static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) -+{ -+ BUG_ON(b->data || b->aux_data); -+ -+ b->data = kvpmalloc(btree_bytes(c), gfp); -+ if (!b->data) -+ return -ENOMEM; -+ -+ b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp); -+ if (!b->aux_data) { -+ kvpfree(b->data, btree_bytes(c)); -+ b->data = NULL; -+ return -ENOMEM; -+ } -+ -+ return 0; -+} -+ -+static struct btree *__btree_node_mem_alloc(struct bch_fs *c) -+{ -+ struct btree *b = kzalloc(sizeof(struct btree), GFP_KERNEL); -+ if (!b) -+ return NULL; -+ -+ bkey_btree_ptr_init(&b->key); -+ six_lock_init(&b->c.lock); -+ INIT_LIST_HEAD(&b->list); -+ INIT_LIST_HEAD(&b->write_blocked); -+ b->byte_order = ilog2(btree_bytes(c)); -+ return b; -+} -+ -+static struct btree *btree_node_mem_alloc(struct bch_fs *c) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ struct btree *b = __btree_node_mem_alloc(c); -+ if (!b) -+ return NULL; -+ -+ if (btree_node_data_alloc(c, b, GFP_KERNEL)) { -+ kfree(b); -+ return NULL; -+ } -+ -+ bc->used++; -+ list_add(&b->list, &bc->freeable); -+ return b; -+} -+ -+/* Btree in memory cache - hash table */ -+ -+void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) -+{ -+ rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); -+ -+ /* Cause future lookups for this node to fail: */ -+ b->hash_val = 0; -+ -+ six_lock_wakeup_all(&b->c.lock); -+} -+ -+int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) -+{ -+ BUG_ON(b->hash_val); -+ b->hash_val = btree_ptr_hash_val(&b->key); -+ -+ return rhashtable_lookup_insert_fast(&bc->table, &b->hash, -+ bch_btree_cache_params); -+} -+ -+int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, 
-+ unsigned level, enum btree_id id) -+{ -+ int ret; -+ -+ b->c.level = level; -+ b->c.btree_id = id; -+ -+ mutex_lock(&bc->lock); -+ ret = __bch2_btree_node_hash_insert(bc, b); -+ if (!ret) -+ list_add(&b->list, &bc->live); -+ mutex_unlock(&bc->lock); -+ -+ return ret; -+} -+ -+__flatten -+static inline struct btree *btree_cache_find(struct btree_cache *bc, -+ const struct bkey_i *k) -+{ -+ u64 v = btree_ptr_hash_val(k); -+ -+ return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params); -+} -+ -+/* -+ * this version is for btree nodes that have already been freed (we're not -+ * reaping a real btree node) -+ */ -+static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ int ret = 0; -+ -+ lockdep_assert_held(&bc->lock); -+ -+ if (!six_trylock_intent(&b->c.lock)) -+ return -ENOMEM; -+ -+ if (!six_trylock_write(&b->c.lock)) -+ goto out_unlock_intent; -+ -+ if (btree_node_noevict(b)) -+ goto out_unlock; -+ -+ if (!btree_node_may_write(b)) -+ goto out_unlock; -+ -+ if (btree_node_dirty(b) && -+ test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) -+ goto out_unlock; -+ -+ if (btree_node_dirty(b) || -+ btree_node_write_in_flight(b) || -+ btree_node_read_in_flight(b)) { -+ if (!flush) -+ goto out_unlock; -+ -+ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, -+ TASK_UNINTERRUPTIBLE); -+ -+ /* -+ * Using the underscore version because we don't want to compact -+ * bsets after the write, since this node is about to be evicted -+ * - unless btree verify mode is enabled, since it runs out of -+ * the post write cleanup: -+ */ -+ if (verify_btree_ondisk(c)) -+ bch2_btree_node_write(c, b, SIX_LOCK_intent); -+ else -+ __bch2_btree_node_write(c, b, SIX_LOCK_read); -+ -+ /* wait for any in flight btree write */ -+ btree_node_wait_on_io(b); -+ } -+out: -+ if (b->hash_val && !ret) -+ trace_btree_node_reap(c, b); -+ return ret; -+out_unlock: -+ six_unlock_write(&b->c.lock); -+out_unlock_intent: -+ 
six_unlock_intent(&b->c.lock); -+ ret = -ENOMEM; -+ goto out; -+} -+ -+static int btree_node_reclaim(struct bch_fs *c, struct btree *b) -+{ -+ return __btree_node_reclaim(c, b, false); -+} -+ -+static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b) -+{ -+ return __btree_node_reclaim(c, b, true); -+} -+ -+static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, -+ struct shrink_control *sc) -+{ -+ struct bch_fs *c = container_of(shrink, struct bch_fs, -+ btree_cache.shrink); -+ struct btree_cache *bc = &c->btree_cache; -+ struct btree *b, *t; -+ unsigned long nr = sc->nr_to_scan; -+ unsigned long can_free; -+ unsigned long touched = 0; -+ unsigned long freed = 0; -+ unsigned i; -+ -+ if (btree_shrinker_disabled(c)) -+ return SHRINK_STOP; -+ -+ /* Return -1 if we can't do anything right now */ -+ if (sc->gfp_mask & __GFP_FS) -+ mutex_lock(&bc->lock); -+ else if (!mutex_trylock(&bc->lock)) -+ return -1; -+ -+ /* -+ * It's _really_ critical that we don't free too many btree nodes - we -+ * have to always leave ourselves a reserve. 
The reserve is how we -+ * guarantee that allocating memory for a new btree node can always -+ * succeed, so that inserting keys into the btree can always succeed and -+ * IO can always make forward progress: -+ */ -+ nr /= btree_pages(c); -+ can_free = btree_cache_can_free(bc); -+ nr = min_t(unsigned long, nr, can_free); -+ -+ i = 0; -+ list_for_each_entry_safe(b, t, &bc->freeable, list) { -+ touched++; -+ -+ if (freed >= nr) -+ break; -+ -+ if (++i > 3 && -+ !btree_node_reclaim(c, b)) { -+ btree_node_data_free(c, b); -+ six_unlock_write(&b->c.lock); -+ six_unlock_intent(&b->c.lock); -+ freed++; -+ } -+ } -+restart: -+ list_for_each_entry_safe(b, t, &bc->live, list) { -+ touched++; -+ -+ if (freed >= nr) { -+ /* Save position */ -+ if (&t->list != &bc->live) -+ list_move_tail(&bc->live, &t->list); -+ break; -+ } -+ -+ if (!btree_node_accessed(b) && -+ !btree_node_reclaim(c, b)) { -+ /* can't call bch2_btree_node_hash_remove under lock */ -+ freed++; -+ if (&t->list != &bc->live) -+ list_move_tail(&bc->live, &t->list); -+ -+ btree_node_data_free(c, b); -+ mutex_unlock(&bc->lock); -+ -+ bch2_btree_node_hash_remove(bc, b); -+ six_unlock_write(&b->c.lock); -+ six_unlock_intent(&b->c.lock); -+ -+ if (freed >= nr) -+ goto out; -+ -+ if (sc->gfp_mask & __GFP_FS) -+ mutex_lock(&bc->lock); -+ else if (!mutex_trylock(&bc->lock)) -+ goto out; -+ goto restart; -+ } else -+ clear_btree_node_accessed(b); -+ } -+ -+ mutex_unlock(&bc->lock); -+out: -+ return (unsigned long) freed * btree_pages(c); -+} -+ -+static unsigned long bch2_btree_cache_count(struct shrinker *shrink, -+ struct shrink_control *sc) -+{ -+ struct bch_fs *c = container_of(shrink, struct bch_fs, -+ btree_cache.shrink); -+ struct btree_cache *bc = &c->btree_cache; -+ -+ if (btree_shrinker_disabled(c)) -+ return 0; -+ -+ return btree_cache_can_free(bc) * btree_pages(c); -+} -+ -+void bch2_fs_btree_cache_exit(struct bch_fs *c) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ struct btree *b; -+ unsigned i; -+ 
-+ if (bc->shrink.list.next) -+ unregister_shrinker(&bc->shrink); -+ -+ mutex_lock(&bc->lock); -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ if (c->verify_data) -+ list_move(&c->verify_data->list, &bc->live); -+ -+ kvpfree(c->verify_ondisk, btree_bytes(c)); -+#endif -+ -+ for (i = 0; i < BTREE_ID_NR; i++) -+ if (c->btree_roots[i].b) -+ list_add(&c->btree_roots[i].b->list, &bc->live); -+ -+ list_splice(&bc->freeable, &bc->live); -+ -+ while (!list_empty(&bc->live)) { -+ b = list_first_entry(&bc->live, struct btree, list); -+ -+ BUG_ON(btree_node_read_in_flight(b) || -+ btree_node_write_in_flight(b)); -+ -+ if (btree_node_dirty(b)) -+ bch2_btree_complete_write(c, b, btree_current_write(b)); -+ clear_btree_node_dirty(b); -+ -+ btree_node_data_free(c, b); -+ } -+ -+ while (!list_empty(&bc->freed)) { -+ b = list_first_entry(&bc->freed, struct btree, list); -+ list_del(&b->list); -+ kfree(b); -+ } -+ -+ mutex_unlock(&bc->lock); -+ -+ if (bc->table_init_done) -+ rhashtable_destroy(&bc->table); -+} -+ -+int bch2_fs_btree_cache_init(struct bch_fs *c) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ unsigned i; -+ int ret = 0; -+ -+ pr_verbose_init(c->opts, ""); -+ -+ ret = rhashtable_init(&bc->table, &bch_btree_cache_params); -+ if (ret) -+ goto out; -+ -+ bc->table_init_done = true; -+ -+ bch2_recalc_btree_reserve(c); -+ -+ for (i = 0; i < bc->reserve; i++) -+ if (!btree_node_mem_alloc(c)) { -+ ret = -ENOMEM; -+ goto out; -+ } -+ -+ list_splice_init(&bc->live, &bc->freeable); -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ mutex_init(&c->verify_lock); -+ -+ c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL); -+ if (!c->verify_ondisk) { -+ ret = -ENOMEM; -+ goto out; -+ } -+ -+ c->verify_data = btree_node_mem_alloc(c); -+ if (!c->verify_data) { -+ ret = -ENOMEM; -+ goto out; -+ } -+ -+ list_del_init(&c->verify_data->list); -+#endif -+ -+ bc->shrink.count_objects = bch2_btree_cache_count; -+ bc->shrink.scan_objects = bch2_btree_cache_scan; -+ bc->shrink.seeks = 4; -+ bc->shrink.batch = 
btree_pages(c) * 2; -+ register_shrinker(&bc->shrink); -+out: -+ pr_verbose_init(c->opts, "ret %i", ret); -+ return ret; -+} -+ -+void bch2_fs_btree_cache_init_early(struct btree_cache *bc) -+{ -+ mutex_init(&bc->lock); -+ INIT_LIST_HEAD(&bc->live); -+ INIT_LIST_HEAD(&bc->freeable); -+ INIT_LIST_HEAD(&bc->freed); -+} -+ -+/* -+ * We can only have one thread cannibalizing other cached btree nodes at a time, -+ * or we'll deadlock. We use an open coded mutex to ensure that, which a -+ * cannibalize_bucket() will take. This means every time we unlock the root of -+ * the btree, we need to release this lock if we have it held. -+ */ -+void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ -+ if (bc->alloc_lock == current) { -+ trace_btree_node_cannibalize_unlock(c); -+ bc->alloc_lock = NULL; -+ closure_wake_up(&bc->alloc_wait); -+ } -+} -+ -+int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ struct task_struct *old; -+ -+ old = cmpxchg(&bc->alloc_lock, NULL, current); -+ if (old == NULL || old == current) -+ goto success; -+ -+ if (!cl) { -+ trace_btree_node_cannibalize_lock_fail(c); -+ return -ENOMEM; -+ } -+ -+ closure_wait(&bc->alloc_wait, cl); -+ -+ /* Try again, after adding ourselves to waitlist */ -+ old = cmpxchg(&bc->alloc_lock, NULL, current); -+ if (old == NULL || old == current) { -+ /* We raced */ -+ closure_wake_up(&bc->alloc_wait); -+ goto success; -+ } -+ -+ trace_btree_node_cannibalize_lock_fail(c); -+ return -EAGAIN; -+ -+success: -+ trace_btree_node_cannibalize_lock(c); -+ return 0; -+} -+ -+static struct btree *btree_node_cannibalize(struct bch_fs *c) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ struct btree *b; -+ -+ list_for_each_entry_reverse(b, &bc->live, list) -+ if (!btree_node_reclaim(c, b)) -+ return b; -+ -+ while (1) { -+ list_for_each_entry_reverse(b, &bc->live, list) -+ if 
(!btree_node_write_and_reclaim(c, b)) -+ return b; -+ -+ /* -+ * Rare case: all nodes were intent-locked. -+ * Just busy-wait. -+ */ -+ WARN_ONCE(1, "btree cache cannibalize failed\n"); -+ cond_resched(); -+ } -+} -+ -+struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ struct btree *b; -+ u64 start_time = local_clock(); -+ unsigned flags; -+ -+ flags = memalloc_nofs_save(); -+ mutex_lock(&bc->lock); -+ -+ /* -+ * btree_free() doesn't free memory; it sticks the node on the end of -+ * the list. Check if there's any freed nodes there: -+ */ -+ list_for_each_entry(b, &bc->freeable, list) -+ if (!btree_node_reclaim(c, b)) -+ goto got_node; -+ -+ /* -+ * We never free struct btree itself, just the memory that holds the on -+ * disk node. Check the freed list before allocating a new one: -+ */ -+ list_for_each_entry(b, &bc->freed, list) -+ if (!btree_node_reclaim(c, b)) -+ goto got_node; -+ -+ b = NULL; -+got_node: -+ if (b) -+ list_del_init(&b->list); -+ mutex_unlock(&bc->lock); -+ -+ if (!b) { -+ b = __btree_node_mem_alloc(c); -+ if (!b) -+ goto err; -+ -+ BUG_ON(!six_trylock_intent(&b->c.lock)); -+ BUG_ON(!six_trylock_write(&b->c.lock)); -+ } -+ -+ if (!b->data) { -+ if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL)) -+ goto err; -+ -+ mutex_lock(&bc->lock); -+ bc->used++; -+ mutex_unlock(&bc->lock); -+ } -+ -+ BUG_ON(btree_node_hashed(b)); -+ BUG_ON(btree_node_write_in_flight(b)); -+out: -+ b->flags = 0; -+ b->written = 0; -+ b->nsets = 0; -+ b->sib_u64s[0] = 0; -+ b->sib_u64s[1] = 0; -+ b->whiteout_u64s = 0; -+ bch2_btree_keys_init(b, &c->expensive_debug_checks); -+ -+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], -+ start_time); -+ -+ memalloc_nofs_restore(flags); -+ return b; -+err: -+ mutex_lock(&bc->lock); -+ -+ if (b) { -+ list_add(&b->list, &bc->freed); -+ six_unlock_write(&b->c.lock); -+ six_unlock_intent(&b->c.lock); -+ } -+ -+ /* Try to cannibalize another cached btree 
node: */ -+ if (bc->alloc_lock == current) { -+ b = btree_node_cannibalize(c); -+ list_del_init(&b->list); -+ mutex_unlock(&bc->lock); -+ -+ bch2_btree_node_hash_remove(bc, b); -+ -+ trace_btree_node_cannibalize(c); -+ goto out; -+ } -+ -+ mutex_unlock(&bc->lock); -+ memalloc_nofs_restore(flags); -+ return ERR_PTR(-ENOMEM); -+} -+ -+/* Slowpath, don't want it inlined into btree_iter_traverse() */ -+static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, -+ struct btree_iter *iter, -+ const struct bkey_i *k, -+ enum btree_id btree_id, -+ unsigned level, -+ enum six_lock_type lock_type, -+ bool sync) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ struct btree *b; -+ -+ BUG_ON(level + 1 >= BTREE_MAX_DEPTH); -+ /* -+ * Parent node must be locked, else we could read in a btree node that's -+ * been freed: -+ */ -+ if (iter && !bch2_btree_node_relock(iter, level + 1)) -+ return ERR_PTR(-EINTR); -+ -+ b = bch2_btree_node_mem_alloc(c); -+ if (IS_ERR(b)) -+ return b; -+ -+ bkey_copy(&b->key, k); -+ if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) { -+ /* raced with another fill: */ -+ -+ /* mark as unhashed... 
*/ -+ b->hash_val = 0; -+ -+ mutex_lock(&bc->lock); -+ list_add(&b->list, &bc->freeable); -+ mutex_unlock(&bc->lock); -+ -+ six_unlock_write(&b->c.lock); -+ six_unlock_intent(&b->c.lock); -+ return NULL; -+ } -+ -+ /* -+ * Unlock before doing IO: -+ * -+ * XXX: ideally should be dropping all btree node locks here -+ */ -+ if (iter && btree_node_read_locked(iter, level + 1)) -+ btree_node_unlock(iter, level + 1); -+ -+ bch2_btree_node_read(c, b, sync); -+ -+ six_unlock_write(&b->c.lock); -+ -+ if (!sync) { -+ six_unlock_intent(&b->c.lock); -+ return NULL; -+ } -+ -+ if (lock_type == SIX_LOCK_read) -+ six_lock_downgrade(&b->c.lock); -+ -+ return b; -+} -+ -+static int lock_node_check_fn(struct six_lock *lock, void *p) -+{ -+ struct btree *b = container_of(lock, struct btree, c.lock); -+ const struct bkey_i *k = p; -+ -+ return b->hash_val == btree_ptr_hash_val(k) ? 0 : -1; -+} -+ -+/** -+ * bch_btree_node_get - find a btree node in the cache and lock it, reading it -+ * in from disk if necessary. -+ * -+ * If IO is necessary and running under generic_make_request, returns -EAGAIN. -+ * -+ * The btree node will have either a read or a write lock held, depending on -+ * the @write parameter. 
-+ */ -+struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter, -+ const struct bkey_i *k, unsigned level, -+ enum six_lock_type lock_type) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ struct btree *b; -+ struct bset_tree *t; -+ -+ EBUG_ON(level >= BTREE_MAX_DEPTH); -+ -+ b = btree_node_mem_ptr(k); -+ if (b) -+ goto lock_node; -+retry: -+ b = btree_cache_find(bc, k); -+ if (unlikely(!b)) { -+ /* -+ * We must have the parent locked to call bch2_btree_node_fill(), -+ * else we could read in a btree node from disk that's been -+ * freed: -+ */ -+ b = bch2_btree_node_fill(c, iter, k, iter->btree_id, -+ level, lock_type, true); -+ -+ /* We raced and found the btree node in the cache */ -+ if (!b) -+ goto retry; -+ -+ if (IS_ERR(b)) -+ return b; -+ } else { -+lock_node: -+ /* -+ * There's a potential deadlock with splits and insertions into -+ * interior nodes we have to avoid: -+ * -+ * The other thread might be holding an intent lock on the node -+ * we want, and they want to update its parent node so they're -+ * going to upgrade their intent lock on the parent node to a -+ * write lock. -+ * -+ * But if we're holding a read lock on the parent, and we're -+ * trying to get the intent lock they're holding, we deadlock. -+ * -+ * So to avoid this we drop the read locks on parent nodes when -+ * we're starting to take intent locks - and handle the race. -+ * -+ * The race is that they might be about to free the node we -+ * want, and dropping our read lock on the parent node lets them -+ * update the parent marking the node we want as freed, and then -+ * free it: -+ * -+ * To guard against this, btree nodes are evicted from the cache -+ * when they're freed - and b->hash_val is zeroed out, which we -+ * check for after we lock the node. 
-+ * -+ * Then, bch2_btree_node_relock() on the parent will fail - because -+ * the parent was modified, when the pointer to the node we want -+ * was removed - and we'll bail out: -+ */ -+ if (btree_node_read_locked(iter, level + 1)) -+ btree_node_unlock(iter, level + 1); -+ -+ if (!btree_node_lock(b, k->k.p, level, iter, lock_type, -+ lock_node_check_fn, (void *) k)) { -+ if (b->hash_val != btree_ptr_hash_val(k)) -+ goto retry; -+ return ERR_PTR(-EINTR); -+ } -+ -+ if (unlikely(b->hash_val != btree_ptr_hash_val(k) || -+ b->c.level != level || -+ race_fault())) { -+ six_unlock_type(&b->c.lock, lock_type); -+ if (bch2_btree_node_relock(iter, level + 1)) -+ goto retry; -+ -+ trace_trans_restart_btree_node_reused(iter->trans->ip); -+ return ERR_PTR(-EINTR); -+ } -+ } -+ -+ /* XXX: waiting on IO with btree locks held: */ -+ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, -+ TASK_UNINTERRUPTIBLE); -+ -+ prefetch(b->aux_data); -+ -+ for_each_bset(b, t) { -+ void *p = (u64 *) b->aux_data + t->aux_data_offset; -+ -+ prefetch(p + L1_CACHE_BYTES * 0); -+ prefetch(p + L1_CACHE_BYTES * 1); -+ prefetch(p + L1_CACHE_BYTES * 2); -+ } -+ -+ /* avoid atomic set bit if it's not needed: */ -+ if (!btree_node_accessed(b)) -+ set_btree_node_accessed(b); -+ -+ if (unlikely(btree_node_read_error(b))) { -+ six_unlock_type(&b->c.lock, lock_type); -+ return ERR_PTR(-EIO); -+ } -+ -+ EBUG_ON(b->c.btree_id != iter->btree_id || -+ BTREE_NODE_LEVEL(b->data) != level || -+ bkey_cmp(b->data->max_key, k->k.p)); -+ -+ return b; -+} -+ -+struct btree *bch2_btree_node_get_noiter(struct bch_fs *c, -+ const struct bkey_i *k, -+ enum btree_id btree_id, -+ unsigned level) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ struct btree *b; -+ struct bset_tree *t; -+ int ret; -+ -+ EBUG_ON(level >= BTREE_MAX_DEPTH); -+ -+ b = btree_node_mem_ptr(k); -+ if (b) -+ goto lock_node; -+retry: -+ b = btree_cache_find(bc, k); -+ if (unlikely(!b)) { -+ b = bch2_btree_node_fill(c, NULL, k, btree_id, -+ level, 
SIX_LOCK_read, true); -+ -+ /* We raced and found the btree node in the cache */ -+ if (!b) -+ goto retry; -+ -+ if (IS_ERR(b)) -+ return b; -+ } else { -+lock_node: -+ ret = six_lock_read(&b->c.lock, lock_node_check_fn, (void *) k); -+ if (ret) -+ goto retry; -+ -+ if (unlikely(b->hash_val != btree_ptr_hash_val(k) || -+ b->c.btree_id != btree_id || -+ b->c.level != level)) { -+ six_unlock_read(&b->c.lock); -+ goto retry; -+ } -+ } -+ -+ /* XXX: waiting on IO with btree locks held: */ -+ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, -+ TASK_UNINTERRUPTIBLE); -+ -+ prefetch(b->aux_data); -+ -+ for_each_bset(b, t) { -+ void *p = (u64 *) b->aux_data + t->aux_data_offset; -+ -+ prefetch(p + L1_CACHE_BYTES * 0); -+ prefetch(p + L1_CACHE_BYTES * 1); -+ prefetch(p + L1_CACHE_BYTES * 2); -+ } -+ -+ /* avoid atomic set bit if it's not needed: */ -+ if (!btree_node_accessed(b)) -+ set_btree_node_accessed(b); -+ -+ if (unlikely(btree_node_read_error(b))) { -+ six_unlock_read(&b->c.lock); -+ return ERR_PTR(-EIO); -+ } -+ -+ EBUG_ON(b->c.btree_id != btree_id || -+ BTREE_NODE_LEVEL(b->data) != level || -+ bkey_cmp(b->data->max_key, k->k.p)); -+ -+ return b; -+} -+ -+struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, -+ struct btree_iter *iter, -+ struct btree *b, -+ enum btree_node_sibling sib) -+{ -+ struct btree_trans *trans = iter->trans; -+ struct btree *parent; -+ struct btree_node_iter node_iter; -+ struct bkey_packed *k; -+ BKEY_PADDED(k) tmp; -+ struct btree *ret = NULL; -+ unsigned level = b->c.level; -+ -+ parent = btree_iter_node(iter, level + 1); -+ if (!parent) -+ return NULL; -+ -+ /* -+ * There's a corner case where a btree_iter might have a node locked -+ * that is just outside its current pos - when -+ * bch2_btree_iter_set_pos_same_leaf() gets to the end of the node. 
-+ * -+ * But the lock ordering checks in __bch2_btree_node_lock() go off of -+ * iter->pos, not the node's key: so if the iterator is marked as -+ * needing to be traversed, we risk deadlock if we don't bail out here: -+ */ -+ if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE) -+ return ERR_PTR(-EINTR); -+ -+ if (!bch2_btree_node_relock(iter, level + 1)) { -+ ret = ERR_PTR(-EINTR); -+ goto out; -+ } -+ -+ node_iter = iter->l[parent->c.level].iter; -+ -+ k = bch2_btree_node_iter_peek_all(&node_iter, parent); -+ BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p)); -+ -+ k = sib == btree_prev_sib -+ ? bch2_btree_node_iter_prev(&node_iter, parent) -+ : (bch2_btree_node_iter_advance(&node_iter, parent), -+ bch2_btree_node_iter_peek(&node_iter, parent)); -+ if (!k) -+ goto out; -+ -+ bch2_bkey_unpack(parent, &tmp.k, k); -+ -+ ret = bch2_btree_node_get(c, iter, &tmp.k, level, -+ SIX_LOCK_intent); -+ -+ if (PTR_ERR_OR_ZERO(ret) == -EINTR && !trans->nounlock) { -+ struct btree_iter *linked; -+ -+ if (!bch2_btree_node_relock(iter, level + 1)) -+ goto out; -+ -+ /* -+ * We might have got -EINTR because trylock failed, and we're -+ * holding other locks that would cause us to deadlock: -+ */ -+ trans_for_each_iter(trans, linked) -+ if (btree_iter_cmp(iter, linked) < 0) -+ __bch2_btree_iter_unlock(linked); -+ -+ if (sib == btree_prev_sib) -+ btree_node_unlock(iter, level); -+ -+ ret = bch2_btree_node_get(c, iter, &tmp.k, level, -+ SIX_LOCK_intent); -+ -+ /* -+ * before btree_iter_relock() calls btree_iter_verify_locks(): -+ */ -+ if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED) -+ btree_node_unlock(iter, level + 1); -+ -+ if (!bch2_btree_node_relock(iter, level)) { -+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); -+ -+ if (!IS_ERR(ret)) { -+ six_unlock_intent(&ret->c.lock); -+ ret = ERR_PTR(-EINTR); -+ } -+ } -+ -+ bch2_trans_relock(trans); -+ } -+out: -+ if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED) -+ btree_node_unlock(iter, level + 1); -+ -+ 
if (PTR_ERR_OR_ZERO(ret) == -EINTR) -+ bch2_btree_iter_upgrade(iter, level + 2); -+ -+ BUG_ON(!IS_ERR(ret) && !btree_node_locked(iter, level)); -+ -+ if (!IS_ERR_OR_NULL(ret)) { -+ struct btree *n1 = ret, *n2 = b; -+ -+ if (sib != btree_prev_sib) -+ swap(n1, n2); -+ -+ BUG_ON(bkey_cmp(bkey_successor(n1->key.k.p), -+ n2->data->min_key)); -+ } -+ -+ bch2_btree_trans_verify_locks(trans); -+ -+ return ret; -+} -+ -+void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, -+ const struct bkey_i *k, unsigned level) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ struct btree *b; -+ -+ BUG_ON(!btree_node_locked(iter, level + 1)); -+ BUG_ON(level >= BTREE_MAX_DEPTH); -+ -+ b = btree_cache_find(bc, k); -+ if (b) -+ return; -+ -+ bch2_btree_node_fill(c, iter, k, iter->btree_id, -+ level, SIX_LOCK_read, false); -+} -+ -+void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, -+ struct btree *b) -+{ -+ const struct bkey_format *f = &b->format; -+ struct bset_stats stats; -+ -+ memset(&stats, 0, sizeof(stats)); -+ -+ bch2_btree_keys_stats(b, &stats); -+ -+ pr_buf(out, -+ "l %u %llu:%llu - %llu:%llu:\n" -+ " ptrs: ", -+ b->c.level, -+ b->data->min_key.inode, -+ b->data->min_key.offset, -+ b->data->max_key.inode, -+ b->data->max_key.offset); -+ bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key)); -+ pr_buf(out, "\n" -+ " format: u64s %u fields %u %u %u %u %u\n" -+ " unpack fn len: %u\n" -+ " bytes used %zu/%zu (%zu%% full)\n" -+ " sib u64s: %u, %u (merge threshold %zu)\n" -+ " nr packed keys %u\n" -+ " nr unpacked keys %u\n" -+ " floats %zu\n" -+ " failed unpacked %zu\n", -+ f->key_u64s, -+ f->bits_per_field[0], -+ f->bits_per_field[1], -+ f->bits_per_field[2], -+ f->bits_per_field[3], -+ f->bits_per_field[4], -+ b->unpack_fn_len, -+ b->nr.live_u64s * sizeof(u64), -+ btree_bytes(c) - sizeof(struct btree_node), -+ b->nr.live_u64s * 100 / btree_max_u64s(c), -+ b->sib_u64s[0], -+ b->sib_u64s[1], -+ BTREE_FOREGROUND_MERGE_THRESHOLD(c), -+ 
b->nr.packed_keys, -+ b->nr.unpacked_keys, -+ stats.floats, -+ stats.failed); -+} -diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h -new file mode 100644 -index 000000000000..d0d3a85bb8be ---- /dev/null -+++ b/fs/bcachefs/btree_cache.h -@@ -0,0 +1,104 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BTREE_CACHE_H -+#define _BCACHEFS_BTREE_CACHE_H -+ -+#include "bcachefs.h" -+#include "btree_types.h" -+ -+struct btree_iter; -+ -+extern const char * const bch2_btree_ids[]; -+ -+void bch2_recalc_btree_reserve(struct bch_fs *); -+ -+void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *); -+int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *); -+int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *, -+ unsigned, enum btree_id); -+ -+void bch2_btree_cache_cannibalize_unlock(struct bch_fs *); -+int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *); -+ -+struct btree *bch2_btree_node_mem_alloc(struct bch_fs *); -+ -+struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *, -+ const struct bkey_i *, unsigned, -+ enum six_lock_type); -+ -+struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *, -+ enum btree_id, unsigned); -+ -+struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *, -+ struct btree *, enum btree_node_sibling); -+ -+void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *, -+ const struct bkey_i *, unsigned); -+ -+void bch2_fs_btree_cache_exit(struct bch_fs *); -+int bch2_fs_btree_cache_init(struct bch_fs *); -+void bch2_fs_btree_cache_init_early(struct btree_cache *); -+ -+static inline u64 btree_ptr_hash_val(const struct bkey_i *k) -+{ -+ switch (k->k.type) { -+ case KEY_TYPE_btree_ptr: -+ return *((u64 *) bkey_i_to_btree_ptr_c(k)->v.start); -+ case KEY_TYPE_btree_ptr_v2: -+ return bkey_i_to_btree_ptr_v2_c(k)->v.seq; -+ default: -+ return 0; -+ } -+} -+ -+static inline struct btree 
*btree_node_mem_ptr(const struct bkey_i *k) -+{ -+ return k->k.type == KEY_TYPE_btree_ptr_v2 -+ ? (void *)(unsigned long)bkey_i_to_btree_ptr_v2_c(k)->v.mem_ptr -+ : NULL; -+} -+ -+/* is btree node in hash table? */ -+static inline bool btree_node_hashed(struct btree *b) -+{ -+ return b->hash_val != 0; -+} -+ -+#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \ -+ for ((_tbl) = rht_dereference_rcu((_c)->btree_cache.table.tbl, \ -+ &(_c)->btree_cache.table), \ -+ _iter = 0; _iter < (_tbl)->size; _iter++) \ -+ rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash) -+ -+static inline size_t btree_bytes(struct bch_fs *c) -+{ -+ return c->opts.btree_node_size << 9; -+} -+ -+static inline size_t btree_max_u64s(struct bch_fs *c) -+{ -+ return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64); -+} -+ -+static inline size_t btree_pages(struct bch_fs *c) -+{ -+ return btree_bytes(c) / PAGE_SIZE; -+} -+ -+static inline unsigned btree_blocks(struct bch_fs *c) -+{ -+ return c->opts.btree_node_size >> c->block_bits; -+} -+ -+#define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 2 / 3) -+ -+#define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3) -+#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \ -+ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \ -+ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) << 2)) -+ -+#define btree_node_root(_c, _b) ((_c)->btree_roots[(_b)->c.btree_id].b) -+ -+void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, -+ struct btree *); -+ -+#endif /* _BCACHEFS_BTREE_CACHE_H */ -diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c -new file mode 100644 -index 000000000000..4f581130270c ---- /dev/null -+++ b/fs/bcachefs/btree_gc.c -@@ -0,0 +1,1395 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Copyright (C) 2010 Kent Overstreet -+ * Copyright (C) 2014 Datera Inc. 
-+ */ -+ -+#include "bcachefs.h" -+#include "alloc_background.h" -+#include "alloc_foreground.h" -+#include "bkey_methods.h" -+#include "btree_locking.h" -+#include "btree_update_interior.h" -+#include "btree_io.h" -+#include "btree_gc.h" -+#include "buckets.h" -+#include "clock.h" -+#include "debug.h" -+#include "ec.h" -+#include "error.h" -+#include "extents.h" -+#include "journal.h" -+#include "keylist.h" -+#include "move.h" -+#include "recovery.h" -+#include "replicas.h" -+#include "super-io.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) -+{ -+ write_seqcount_begin(&c->gc_pos_lock); -+ c->gc_pos = new_pos; -+ write_seqcount_end(&c->gc_pos_lock); -+} -+ -+static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) -+{ -+ BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0); -+ __gc_pos_set(c, new_pos); -+} -+ -+static int bch2_gc_check_topology(struct bch_fs *c, -+ struct bkey_s_c k, -+ struct bpos *expected_start, -+ struct bpos expected_end, -+ bool is_last) -+{ -+ int ret = 0; -+ -+ if (k.k->type == KEY_TYPE_btree_ptr_v2) { -+ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); -+ -+ if (fsck_err_on(bkey_cmp(*expected_start, bp.v->min_key), c, -+ "btree node with incorrect min_key: got %llu:%llu, should be %llu:%llu", -+ bp.v->min_key.inode, -+ bp.v->min_key.offset, -+ expected_start->inode, -+ expected_start->offset)) { -+ BUG(); -+ } -+ } -+ -+ *expected_start = bkey_cmp(k.k->p, POS_MAX) -+ ? 
bkey_successor(k.k->p) -+ : k.k->p; -+ -+ if (fsck_err_on(is_last && -+ bkey_cmp(k.k->p, expected_end), c, -+ "btree node with incorrect max_key: got %llu:%llu, should be %llu:%llu", -+ k.k->p.inode, -+ k.k->p.offset, -+ expected_end.inode, -+ expected_end.offset)) { -+ BUG(); -+ } -+fsck_err: -+ return ret; -+} -+ -+/* marking of btree keys/nodes: */ -+ -+static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, -+ u8 *max_stale, bool initial) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const struct bch_extent_ptr *ptr; -+ unsigned flags = -+ BTREE_TRIGGER_GC| -+ (initial ? BTREE_TRIGGER_NOATOMIC : 0); -+ int ret = 0; -+ -+ if (initial) { -+ BUG_ON(journal_seq_verify(c) && -+ k.k->version.lo > journal_cur_seq(&c->journal)); -+ -+ /* XXX change to fsck check */ -+ if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c, -+ "key version number higher than recorded: %llu > %llu", -+ k.k->version.lo, -+ atomic64_read(&c->key_version))) -+ atomic64_set(&c->key_version, k.k->version.lo); -+ -+ if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || -+ fsck_err_on(!bch2_bkey_replicas_marked(c, k), c, -+ "superblock not marked as containing replicas (type %u)", -+ k.k->type)) { -+ ret = bch2_mark_bkey_replicas(c, k); -+ if (ret) -+ return ret; -+ } -+ -+ bkey_for_each_ptr(ptrs, ptr) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ struct bucket *g = PTR_BUCKET(ca, ptr, true); -+ struct bucket *g2 = PTR_BUCKET(ca, ptr, false); -+ -+ if (mustfix_fsck_err_on(!g->gen_valid, c, -+ "bucket %u:%zu data type %s ptr gen %u missing in alloc btree", -+ ptr->dev, PTR_BUCKET_NR(ca, ptr), -+ bch2_data_types[ptr_data_type(k.k, ptr)], -+ ptr->gen)) { -+ g2->_mark.gen = g->_mark.gen = ptr->gen; -+ g2->gen_valid = g->gen_valid = true; -+ } -+ -+ if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c, -+ "bucket %u:%zu data type %s ptr gen in the future: %u > %u", -+ ptr->dev, PTR_BUCKET_NR(ca, ptr), -+ bch2_data_types[ptr_data_type(k.k, 
ptr)], -+ ptr->gen, g->mark.gen)) { -+ g2->_mark.gen = g->_mark.gen = ptr->gen; -+ g2->gen_valid = g->gen_valid = true; -+ g2->_mark.data_type = 0; -+ g2->_mark.dirty_sectors = 0; -+ g2->_mark.cached_sectors = 0; -+ set_bit(BCH_FS_FIXED_GENS, &c->flags); -+ } -+ } -+ } -+ -+ bkey_for_each_ptr(ptrs, ptr) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ struct bucket *g = PTR_BUCKET(ca, ptr, true); -+ -+ if (gen_after(g->oldest_gen, ptr->gen)) -+ g->oldest_gen = ptr->gen; -+ -+ *max_stale = max(*max_stale, ptr_stale(ca, ptr)); -+ } -+ -+ bch2_mark_key(c, k, 0, k.k->size, NULL, 0, flags); -+fsck_err: -+ return ret; -+} -+ -+static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, -+ bool initial) -+{ -+ struct bpos next_node_start = b->data->min_key; -+ struct btree_node_iter iter; -+ struct bkey unpacked; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ *max_stale = 0; -+ -+ if (!btree_node_type_needs_gc(btree_node_type(b))) -+ return 0; -+ -+ bch2_btree_node_iter_init_from_start(&iter, b); -+ -+ while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { -+ bch2_bkey_debugcheck(c, b, k); -+ -+ ret = bch2_gc_mark_key(c, k, max_stale, initial); -+ if (ret) -+ break; -+ -+ bch2_btree_node_iter_advance(&iter, b); -+ -+ if (b->c.level) { -+ ret = bch2_gc_check_topology(c, k, -+ &next_node_start, -+ b->data->max_key, -+ bch2_btree_node_iter_end(&iter)); -+ if (ret) -+ break; -+ } -+ } -+ -+ return ret; -+} -+ -+static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, -+ bool initial, bool metadata_only) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct btree *b; -+ unsigned depth = metadata_only ? 1 -+ : expensive_debug_checks(c) ? 0 -+ : !btree_node_type_needs_gc(btree_id) ? 
1 -+ : 0; -+ u8 max_stale = 0; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); -+ -+ __for_each_btree_node(&trans, iter, btree_id, POS_MIN, -+ 0, depth, BTREE_ITER_PREFETCH, b) { -+ bch2_verify_btree_nr_keys(b); -+ -+ gc_pos_set(c, gc_pos_btree_node(b)); -+ -+ ret = btree_gc_mark_node(c, b, &max_stale, initial); -+ if (ret) -+ break; -+ -+ if (!initial) { -+ if (max_stale > 64) -+ bch2_btree_node_rewrite(c, iter, -+ b->data->keys.seq, -+ BTREE_INSERT_USE_RESERVE| -+ BTREE_INSERT_NOWAIT| -+ BTREE_INSERT_GC_LOCK_HELD); -+ else if (!btree_gc_rewrite_disabled(c) && -+ (btree_gc_always_rewrite(c) || max_stale > 16)) -+ bch2_btree_node_rewrite(c, iter, -+ b->data->keys.seq, -+ BTREE_INSERT_NOWAIT| -+ BTREE_INSERT_GC_LOCK_HELD); -+ } -+ -+ bch2_trans_cond_resched(&trans); -+ } -+ ret = bch2_trans_exit(&trans) ?: ret; -+ if (ret) -+ return ret; -+ -+ mutex_lock(&c->btree_root_lock); -+ b = c->btree_roots[btree_id].b; -+ if (!btree_node_fake(b)) -+ ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key), -+ &max_stale, initial); -+ gc_pos_set(c, gc_pos_btree_root(b->c.btree_id)); -+ mutex_unlock(&c->btree_root_lock); -+ -+ return ret; -+} -+ -+static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, -+ struct journal_keys *journal_keys, -+ unsigned target_depth) -+{ -+ struct btree_and_journal_iter iter; -+ struct bkey_s_c k; -+ struct bpos next_node_start = b->data->min_key; -+ u8 max_stale = 0; -+ int ret = 0; -+ -+ bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b); -+ -+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { -+ bch2_bkey_debugcheck(c, b, k); -+ -+ BUG_ON(bkey_cmp(k.k->p, b->data->min_key) < 0); -+ BUG_ON(bkey_cmp(k.k->p, b->data->max_key) > 0); -+ -+ ret = bch2_gc_mark_key(c, k, &max_stale, true); -+ if (ret) -+ break; -+ -+ if (b->c.level) { -+ struct btree *child; -+ BKEY_PADDED(k) tmp; -+ -+ bkey_reassemble(&tmp.k, k); -+ k = bkey_i_to_s_c(&tmp.k); -+ -+ 
bch2_btree_and_journal_iter_advance(&iter); -+ -+ ret = bch2_gc_check_topology(c, k, -+ &next_node_start, -+ b->data->max_key, -+ !bch2_btree_and_journal_iter_peek(&iter).k); -+ if (ret) -+ break; -+ -+ if (b->c.level > target_depth) { -+ child = bch2_btree_node_get_noiter(c, &tmp.k, -+ b->c.btree_id, b->c.level - 1); -+ ret = PTR_ERR_OR_ZERO(child); -+ if (ret) -+ break; -+ -+ ret = bch2_gc_btree_init_recurse(c, child, -+ journal_keys, target_depth); -+ six_unlock_read(&child->c.lock); -+ -+ if (ret) -+ break; -+ } -+ } else { -+ bch2_btree_and_journal_iter_advance(&iter); -+ } -+ } -+ -+ return ret; -+} -+ -+static int bch2_gc_btree_init(struct bch_fs *c, -+ struct journal_keys *journal_keys, -+ enum btree_id btree_id, -+ bool metadata_only) -+{ -+ struct btree *b; -+ unsigned target_depth = metadata_only ? 1 -+ : expensive_debug_checks(c) ? 0 -+ : !btree_node_type_needs_gc(btree_id) ? 1 -+ : 0; -+ u8 max_stale = 0; -+ int ret = 0; -+ -+ b = c->btree_roots[btree_id].b; -+ -+ if (btree_node_fake(b)) -+ return 0; -+ -+ six_lock_read(&b->c.lock, NULL, NULL); -+ if (fsck_err_on(bkey_cmp(b->data->min_key, POS_MIN), c, -+ "btree root with incorrect min_key: %llu:%llu", -+ b->data->min_key.inode, -+ b->data->min_key.offset)) { -+ BUG(); -+ } -+ -+ if (fsck_err_on(bkey_cmp(b->data->max_key, POS_MAX), c, -+ "btree root with incorrect min_key: %llu:%llu", -+ b->data->max_key.inode, -+ b->data->max_key.offset)) { -+ BUG(); -+ } -+ -+ if (b->c.level >= target_depth) -+ ret = bch2_gc_btree_init_recurse(c, b, -+ journal_keys, target_depth); -+ -+ if (!ret) -+ ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key), -+ &max_stale, true); -+fsck_err: -+ six_unlock_read(&b->c.lock); -+ -+ return ret; -+} -+ -+static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) -+{ -+ return (int) btree_id_to_gc_phase(l) - -+ (int) btree_id_to_gc_phase(r); -+} -+ -+static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, -+ bool initial, bool metadata_only) -+{ 
-+ enum btree_id ids[BTREE_ID_NR]; -+ unsigned i; -+ -+ for (i = 0; i < BTREE_ID_NR; i++) -+ ids[i] = i; -+ bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); -+ -+ for (i = 0; i < BTREE_ID_NR; i++) { -+ enum btree_id id = ids[i]; -+ int ret = initial -+ ? bch2_gc_btree_init(c, journal_keys, -+ id, metadata_only) -+ : bch2_gc_btree(c, id, initial, metadata_only); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca, -+ u64 start, u64 end, -+ enum bch_data_type type, -+ unsigned flags) -+{ -+ u64 b = sector_to_bucket(ca, start); -+ -+ do { -+ unsigned sectors = -+ min_t(u64, bucket_to_sector(ca, b + 1), end) - start; -+ -+ bch2_mark_metadata_bucket(c, ca, b, type, sectors, -+ gc_phase(GC_PHASE_SB), flags); -+ b++; -+ start += sectors; -+ } while (start < end); -+} -+ -+void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, -+ unsigned flags) -+{ -+ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; -+ unsigned i; -+ u64 b; -+ -+ /* -+ * This conditional is kind of gross, but we may be called from the -+ * device add path, before the new device has actually been added to the -+ * running filesystem: -+ */ -+ if (c) { -+ lockdep_assert_held(&c->sb_lock); -+ percpu_down_read(&c->mark_lock); -+ } -+ -+ for (i = 0; i < layout->nr_superblocks; i++) { -+ u64 offset = le64_to_cpu(layout->sb_offset[i]); -+ -+ if (offset == BCH_SB_SECTOR) -+ mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR, -+ BCH_DATA_sb, flags); -+ -+ mark_metadata_sectors(c, ca, offset, -+ offset + (1 << layout->sb_max_size_bits), -+ BCH_DATA_sb, flags); -+ } -+ -+ for (i = 0; i < ca->journal.nr; i++) { -+ b = ca->journal.buckets[i]; -+ bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal, -+ ca->mi.bucket_size, -+ gc_phase(GC_PHASE_SB), flags); -+ } -+ -+ if (c) -+ percpu_up_read(&c->mark_lock); -+} -+ -+static void bch2_mark_superblocks(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ unsigned i; -+ -+ 
mutex_lock(&c->sb_lock); -+ gc_pos_set(c, gc_phase(GC_PHASE_SB)); -+ -+ for_each_online_member(ca, c, i) -+ bch2_mark_dev_superblock(c, ca, BTREE_TRIGGER_GC); -+ mutex_unlock(&c->sb_lock); -+} -+ -+#if 0 -+/* Also see bch2_pending_btree_node_free_insert_done() */ -+static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) -+{ -+ struct btree_update *as; -+ struct pending_btree_node_free *d; -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE)); -+ -+ for_each_pending_btree_node_free(c, as, d) -+ if (d->index_update_done) -+ bch2_mark_key(c, bkey_i_to_s_c(&d->key), -+ 0, 0, NULL, 0, -+ BTREE_TRIGGER_GC); -+ -+ mutex_unlock(&c->btree_interior_update_lock); -+} -+#endif -+ -+static void bch2_mark_allocator_buckets(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ struct open_bucket *ob; -+ size_t i, j, iter; -+ unsigned ci; -+ -+ percpu_down_read(&c->mark_lock); -+ -+ spin_lock(&c->freelist_lock); -+ gc_pos_set(c, gc_pos_alloc(c, NULL)); -+ -+ for_each_member_device(ca, c, ci) { -+ fifo_for_each_entry(i, &ca->free_inc, iter) -+ bch2_mark_alloc_bucket(c, ca, i, true, -+ gc_pos_alloc(c, NULL), -+ BTREE_TRIGGER_GC); -+ -+ -+ -+ for (j = 0; j < RESERVE_NR; j++) -+ fifo_for_each_entry(i, &ca->free[j], iter) -+ bch2_mark_alloc_bucket(c, ca, i, true, -+ gc_pos_alloc(c, NULL), -+ BTREE_TRIGGER_GC); -+ } -+ -+ spin_unlock(&c->freelist_lock); -+ -+ for (ob = c->open_buckets; -+ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); -+ ob++) { -+ spin_lock(&ob->lock); -+ if (ob->valid) { -+ gc_pos_set(c, gc_pos_alloc(c, ob)); -+ ca = bch_dev_bkey_exists(c, ob->ptr.dev); -+ bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true, -+ gc_pos_alloc(c, ob), -+ BTREE_TRIGGER_GC); -+ } -+ spin_unlock(&ob->lock); -+ } -+ -+ percpu_up_read(&c->mark_lock); -+} -+ -+static void bch2_gc_free(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ unsigned i; -+ -+ genradix_free(&c->stripes[1]); -+ -+ for_each_member_device(ca, c, i) { -+ 
kvpfree(rcu_dereference_protected(ca->buckets[1], 1), -+ sizeof(struct bucket_array) + -+ ca->mi.nbuckets * sizeof(struct bucket)); -+ ca->buckets[1] = NULL; -+ -+ free_percpu(ca->usage[1]); -+ ca->usage[1] = NULL; -+ } -+ -+ free_percpu(c->usage_gc); -+ c->usage_gc = NULL; -+} -+ -+static int bch2_gc_done(struct bch_fs *c, -+ bool initial, bool metadata_only) -+{ -+ struct bch_dev *ca; -+ bool verify = !metadata_only && -+ (!initial || -+ (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO))); -+ unsigned i; -+ int ret = 0; -+ -+#define copy_field(_f, _msg, ...) \ -+ if (dst->_f != src->_f) { \ -+ if (verify) \ -+ fsck_err(c, _msg ": got %llu, should be %llu" \ -+ , ##__VA_ARGS__, dst->_f, src->_f); \ -+ dst->_f = src->_f; \ -+ } -+#define copy_stripe_field(_f, _msg, ...) \ -+ if (dst->_f != src->_f) { \ -+ if (verify) \ -+ fsck_err(c, "stripe %zu has wrong "_msg \ -+ ": got %u, should be %u", \ -+ dst_iter.pos, ##__VA_ARGS__, \ -+ dst->_f, src->_f); \ -+ dst->_f = src->_f; \ -+ dst->dirty = true; \ -+ } -+#define copy_bucket_field(_f) \ -+ if (dst->b[b].mark._f != src->b[b].mark._f) { \ -+ if (verify) \ -+ fsck_err(c, "bucket %u:%zu gen %u data type %s has wrong " #_f \ -+ ": got %u, should be %u", i, b, \ -+ dst->b[b].mark.gen, \ -+ bch2_data_types[dst->b[b].mark.data_type],\ -+ dst->b[b].mark._f, src->b[b].mark._f); \ -+ dst->b[b]._mark._f = src->b[b].mark._f; \ -+ } -+#define copy_dev_field(_f, _msg, ...) \ -+ copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__) -+#define copy_fs_field(_f, _msg, ...) 
\ -+ copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) -+ -+ if (!metadata_only) { -+ struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0); -+ struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0); -+ struct stripe *dst, *src; -+ unsigned i; -+ -+ c->ec_stripes_heap.used = 0; -+ -+ while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) && -+ (src = genradix_iter_peek(&src_iter, &c->stripes[1]))) { -+ BUG_ON(src_iter.pos != dst_iter.pos); -+ -+ copy_stripe_field(alive, "alive"); -+ copy_stripe_field(sectors, "sectors"); -+ copy_stripe_field(algorithm, "algorithm"); -+ copy_stripe_field(nr_blocks, "nr_blocks"); -+ copy_stripe_field(nr_redundant, "nr_redundant"); -+ copy_stripe_field(blocks_nonempty, -+ "blocks_nonempty"); -+ -+ for (i = 0; i < ARRAY_SIZE(dst->block_sectors); i++) -+ copy_stripe_field(block_sectors[i], -+ "block_sectors[%u]", i); -+ -+ if (dst->alive) { -+ spin_lock(&c->ec_stripes_heap_lock); -+ bch2_stripes_heap_insert(c, dst, dst_iter.pos); -+ spin_unlock(&c->ec_stripes_heap_lock); -+ } -+ -+ genradix_iter_advance(&dst_iter, &c->stripes[0]); -+ genradix_iter_advance(&src_iter, &c->stripes[1]); -+ } -+ } -+ -+ for_each_member_device(ca, c, i) { -+ struct bucket_array *dst = __bucket_array(ca, 0); -+ struct bucket_array *src = __bucket_array(ca, 1); -+ size_t b; -+ -+ for (b = 0; b < src->nbuckets; b++) { -+ copy_bucket_field(gen); -+ copy_bucket_field(data_type); -+ copy_bucket_field(owned_by_allocator); -+ copy_bucket_field(stripe); -+ copy_bucket_field(dirty_sectors); -+ copy_bucket_field(cached_sectors); -+ -+ dst->b[b].oldest_gen = src->b[b].oldest_gen; -+ } -+ }; -+ -+ bch2_fs_usage_acc_to_base(c, 0); -+ bch2_fs_usage_acc_to_base(c, 1); -+ -+ bch2_dev_usage_from_buckets(c); -+ -+ { -+ unsigned nr = fs_usage_u64s(c); -+ struct bch_fs_usage *dst = c->usage_base; -+ struct bch_fs_usage *src = (void *) -+ bch2_acc_percpu_u64s((void *) c->usage_gc, nr); -+ -+ copy_fs_field(hidden, "hidden"); -+ 
copy_fs_field(btree, "btree"); -+ -+ if (!metadata_only) { -+ copy_fs_field(data, "data"); -+ copy_fs_field(cached, "cached"); -+ copy_fs_field(reserved, "reserved"); -+ copy_fs_field(nr_inodes,"nr_inodes"); -+ -+ for (i = 0; i < BCH_REPLICAS_MAX; i++) -+ copy_fs_field(persistent_reserved[i], -+ "persistent_reserved[%i]", i); -+ } -+ -+ for (i = 0; i < c->replicas.nr; i++) { -+ struct bch_replicas_entry *e = -+ cpu_replicas_entry(&c->replicas, i); -+ char buf[80]; -+ -+ if (metadata_only && -+ (e->data_type == BCH_DATA_user || -+ e->data_type == BCH_DATA_cached)) -+ continue; -+ -+ bch2_replicas_entry_to_text(&PBUF(buf), e); -+ -+ copy_fs_field(replicas[i], "%s", buf); -+ } -+ } -+ -+#undef copy_fs_field -+#undef copy_dev_field -+#undef copy_bucket_field -+#undef copy_stripe_field -+#undef copy_field -+fsck_err: -+ return ret; -+} -+ -+static int bch2_gc_start(struct bch_fs *c, -+ bool metadata_only) -+{ -+ struct bch_dev *ca; -+ unsigned i; -+ int ret; -+ -+ BUG_ON(c->usage_gc); -+ -+ c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64), -+ sizeof(u64), GFP_KERNEL); -+ if (!c->usage_gc) { -+ bch_err(c, "error allocating c->usage_gc"); -+ return -ENOMEM; -+ } -+ -+ for_each_member_device(ca, c, i) { -+ BUG_ON(ca->buckets[1]); -+ BUG_ON(ca->usage[1]); -+ -+ ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) + -+ ca->mi.nbuckets * sizeof(struct bucket), -+ GFP_KERNEL|__GFP_ZERO); -+ if (!ca->buckets[1]) { -+ percpu_ref_put(&ca->ref); -+ bch_err(c, "error allocating ca->buckets[gc]"); -+ return -ENOMEM; -+ } -+ -+ ca->usage[1] = alloc_percpu(struct bch_dev_usage); -+ if (!ca->usage[1]) { -+ bch_err(c, "error allocating ca->usage[gc]"); -+ percpu_ref_put(&ca->ref); -+ return -ENOMEM; -+ } -+ } -+ -+ ret = bch2_ec_mem_alloc(c, true); -+ if (ret) { -+ bch_err(c, "error allocating ec gc mem"); -+ return ret; -+ } -+ -+ percpu_down_write(&c->mark_lock); -+ -+ /* -+ * indicate to stripe code that we need to allocate for the gc stripes -+ * radix tree, too -+ 
*/ -+ gc_pos_set(c, gc_phase(GC_PHASE_START)); -+ -+ for_each_member_device(ca, c, i) { -+ struct bucket_array *dst = __bucket_array(ca, 1); -+ struct bucket_array *src = __bucket_array(ca, 0); -+ size_t b; -+ -+ dst->first_bucket = src->first_bucket; -+ dst->nbuckets = src->nbuckets; -+ -+ for (b = 0; b < src->nbuckets; b++) { -+ struct bucket *d = &dst->b[b]; -+ struct bucket *s = &src->b[b]; -+ -+ d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen; -+ d->gen_valid = s->gen_valid; -+ -+ if (metadata_only && -+ (s->mark.data_type == BCH_DATA_user || -+ s->mark.data_type == BCH_DATA_cached)) { -+ d->_mark = s->mark; -+ d->_mark.owned_by_allocator = 0; -+ } -+ } -+ }; -+ -+ percpu_up_write(&c->mark_lock); -+ -+ return 0; -+} -+ -+/** -+ * bch2_gc - walk _all_ references to buckets, and recompute them: -+ * -+ * Order matters here: -+ * - Concurrent GC relies on the fact that we have a total ordering for -+ * everything that GC walks - see gc_will_visit_node(), -+ * gc_will_visit_root() -+ * -+ * - also, references move around in the course of index updates and -+ * various other crap: everything needs to agree on the ordering -+ * references are allowed to move around in - e.g., we're allowed to -+ * start with a reference owned by an open_bucket (the allocator) and -+ * move it to the btree, but not the reverse. 
-+ * -+ * This is necessary to ensure that gc doesn't miss references that -+ * move around - if references move backwards in the ordering GC -+ * uses, GC could skip past them -+ */ -+int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys, -+ bool initial, bool metadata_only) -+{ -+ struct bch_dev *ca; -+ u64 start_time = local_clock(); -+ unsigned i, iter = 0; -+ int ret; -+ -+ lockdep_assert_held(&c->state_lock); -+ trace_gc_start(c); -+ -+ down_write(&c->gc_lock); -+ -+ /* flush interior btree updates: */ -+ closure_wait_event(&c->btree_interior_update_wait, -+ !bch2_btree_interior_updates_nr_pending(c)); -+again: -+ ret = bch2_gc_start(c, metadata_only); -+ if (ret) -+ goto out; -+ -+ bch2_mark_superblocks(c); -+ -+ ret = bch2_gc_btrees(c, journal_keys, initial, metadata_only); -+ if (ret) -+ goto out; -+ -+#if 0 -+ bch2_mark_pending_btree_node_frees(c); -+#endif -+ bch2_mark_allocator_buckets(c); -+ -+ c->gc_count++; -+out: -+ if (!ret && -+ (test_bit(BCH_FS_FIXED_GENS, &c->flags) || -+ (!iter && test_restart_gc(c)))) { -+ /* -+ * XXX: make sure gens we fixed got saved -+ */ -+ if (iter++ <= 2) { -+ bch_info(c, "Fixed gens, restarting mark and sweep:"); -+ clear_bit(BCH_FS_FIXED_GENS, &c->flags); -+ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); -+ -+ percpu_down_write(&c->mark_lock); -+ bch2_gc_free(c); -+ percpu_up_write(&c->mark_lock); -+ /* flush fsck errors, reset counters */ -+ bch2_flush_fsck_errs(c); -+ -+ goto again; -+ } -+ -+ bch_info(c, "Unable to fix bucket gens, looping"); -+ ret = -EINVAL; -+ } -+ -+ if (!ret) { -+ bch2_journal_block(&c->journal); -+ -+ percpu_down_write(&c->mark_lock); -+ ret = bch2_gc_done(c, initial, metadata_only); -+ -+ bch2_journal_unblock(&c->journal); -+ } else { -+ percpu_down_write(&c->mark_lock); -+ } -+ -+ /* Indicates that gc is no longer in progress: */ -+ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); -+ -+ bch2_gc_free(c); -+ percpu_up_write(&c->mark_lock); -+ -+ up_write(&c->gc_lock); -+ -+ 
trace_gc_end(c); -+ bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); -+ -+ /* -+ * Wake up allocator in case it was waiting for buckets -+ * because of not being able to inc gens -+ */ -+ for_each_member_device(ca, c, i) -+ bch2_wake_allocator(ca); -+ -+ /* -+ * At startup, allocations can happen directly instead of via the -+ * allocator thread - issue wakeup in case they blocked on gc_lock: -+ */ -+ closure_wake_up(&c->freelist_wait); -+ return ret; -+} -+ -+/* -+ * For recalculating oldest gen, we only need to walk keys in leaf nodes; btree -+ * node pointers currently never have cached pointers that can become stale: -+ */ -+static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id id) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, id, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const struct bch_extent_ptr *ptr; -+ -+ percpu_down_read(&c->mark_lock); -+ bkey_for_each_ptr(ptrs, ptr) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ struct bucket *g = PTR_BUCKET(ca, ptr, false); -+ -+ if (gen_after(g->gc_gen, ptr->gen)) -+ g->gc_gen = ptr->gen; -+ -+ if (gen_after(g->mark.gen, ptr->gen) > 32) { -+ /* rewrite btree node */ -+ -+ } -+ } -+ percpu_up_read(&c->mark_lock); -+ } -+ -+ bch2_trans_exit(&trans); -+ return ret; -+} -+ -+int bch2_gc_gens(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ struct bucket_array *buckets; -+ struct bucket *g; -+ unsigned i; -+ int ret; -+ -+ /* -+ * Ideally we would be using state_lock and not gc_lock here, but that -+ * introduces a deadlock in the RO path - we currently take the state -+ * lock at the start of going RO, thus the gc thread may get stuck: -+ */ -+ down_read(&c->gc_lock); -+ -+ for_each_member_device(ca, c, i) { -+ down_read(&ca->bucket_lock); -+ buckets = bucket_array(ca); -+ -+ for_each_bucket(g, buckets) -+ 
g->gc_gen = g->mark.gen; -+ up_read(&ca->bucket_lock); -+ } -+ -+ for (i = 0; i < BTREE_ID_NR; i++) -+ if (btree_node_type_needs_gc(i)) { -+ ret = bch2_gc_btree_gens(c, i); -+ if (ret) { -+ bch_err(c, "error recalculating oldest_gen: %i", ret); -+ goto err; -+ } -+ } -+ -+ for_each_member_device(ca, c, i) { -+ down_read(&ca->bucket_lock); -+ buckets = bucket_array(ca); -+ -+ for_each_bucket(g, buckets) -+ g->oldest_gen = g->gc_gen; -+ up_read(&ca->bucket_lock); -+ } -+ -+ c->gc_count++; -+err: -+ up_read(&c->gc_lock); -+ return ret; -+} -+ -+/* Btree coalescing */ -+ -+static void recalc_packed_keys(struct btree *b) -+{ -+ struct bset *i = btree_bset_first(b); -+ struct bkey_packed *k; -+ -+ memset(&b->nr, 0, sizeof(b->nr)); -+ -+ BUG_ON(b->nsets != 1); -+ -+ vstruct_for_each(i, k) -+ btree_keys_account_key_add(&b->nr, 0, k); -+} -+ -+static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, -+ struct btree *old_nodes[GC_MERGE_NODES]) -+{ -+ struct btree *parent = btree_node_parent(iter, old_nodes[0]); -+ unsigned i, nr_old_nodes, nr_new_nodes, u64s = 0; -+ unsigned blocks = btree_blocks(c) * 2 / 3; -+ struct btree *new_nodes[GC_MERGE_NODES]; -+ struct btree_update *as; -+ struct keylist keylist; -+ struct bkey_format_state format_state; -+ struct bkey_format new_format; -+ -+ memset(new_nodes, 0, sizeof(new_nodes)); -+ bch2_keylist_init(&keylist, NULL); -+ -+ /* Count keys that are not deleted */ -+ for (i = 0; i < GC_MERGE_NODES && old_nodes[i]; i++) -+ u64s += old_nodes[i]->nr.live_u64s; -+ -+ nr_old_nodes = nr_new_nodes = i; -+ -+ /* Check if all keys in @old_nodes could fit in one fewer node */ -+ if (nr_old_nodes <= 1 || -+ __vstruct_blocks(struct btree_node, c->block_bits, -+ DIV_ROUND_UP(u64s, nr_old_nodes - 1)) > blocks) -+ return; -+ -+ /* Find a format that all keys in @old_nodes can pack into */ -+ bch2_bkey_format_init(&format_state); -+ -+ for (i = 0; i < nr_old_nodes; i++) -+ __bch2_btree_calc_format(&format_state, old_nodes[i]); -+ 
-+ new_format = bch2_bkey_format_done(&format_state); -+ -+ /* Check if repacking would make any nodes too big to fit */ -+ for (i = 0; i < nr_old_nodes; i++) -+ if (!bch2_btree_node_format_fits(c, old_nodes[i], &new_format)) { -+ trace_btree_gc_coalesce_fail(c, -+ BTREE_GC_COALESCE_FAIL_FORMAT_FITS); -+ return; -+ } -+ -+ if (bch2_keylist_realloc(&keylist, NULL, 0, -+ (BKEY_U64s + BKEY_EXTENT_U64s_MAX) * nr_old_nodes)) { -+ trace_btree_gc_coalesce_fail(c, -+ BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC); -+ return; -+ } -+ -+ as = bch2_btree_update_start(iter->trans, iter->btree_id, -+ btree_update_reserve_required(c, parent) + nr_old_nodes, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_USE_RESERVE, -+ NULL); -+ if (IS_ERR(as)) { -+ trace_btree_gc_coalesce_fail(c, -+ BTREE_GC_COALESCE_FAIL_RESERVE_GET); -+ bch2_keylist_free(&keylist, NULL); -+ return; -+ } -+ -+ trace_btree_gc_coalesce(c, old_nodes[0]); -+ -+ for (i = 0; i < nr_old_nodes; i++) -+ bch2_btree_interior_update_will_free_node(as, old_nodes[i]); -+ -+ /* Repack everything with @new_format and sort down to one bset */ -+ for (i = 0; i < nr_old_nodes; i++) -+ new_nodes[i] = -+ __bch2_btree_node_alloc_replacement(as, old_nodes[i], -+ new_format); -+ -+ /* -+ * Conceptually we concatenate the nodes together and slice them -+ * up at different boundaries. 
-+ */ -+ for (i = nr_new_nodes - 1; i > 0; --i) { -+ struct btree *n1 = new_nodes[i]; -+ struct btree *n2 = new_nodes[i - 1]; -+ -+ struct bset *s1 = btree_bset_first(n1); -+ struct bset *s2 = btree_bset_first(n2); -+ struct bkey_packed *k, *last = NULL; -+ -+ /* Calculate how many keys from @n2 we could fit inside @n1 */ -+ u64s = 0; -+ -+ for (k = s2->start; -+ k < vstruct_last(s2) && -+ vstruct_blocks_plus(n1->data, c->block_bits, -+ u64s + k->u64s) <= blocks; -+ k = bkey_next_skip_noops(k, vstruct_last(s2))) { -+ last = k; -+ u64s += k->u64s; -+ } -+ -+ if (u64s == le16_to_cpu(s2->u64s)) { -+ /* n2 fits entirely in n1 */ -+ n1->key.k.p = n1->data->max_key = n2->data->max_key; -+ -+ memcpy_u64s(vstruct_last(s1), -+ s2->start, -+ le16_to_cpu(s2->u64s)); -+ le16_add_cpu(&s1->u64s, le16_to_cpu(s2->u64s)); -+ -+ set_btree_bset_end(n1, n1->set); -+ -+ six_unlock_write(&n2->c.lock); -+ bch2_btree_node_free_never_inserted(c, n2); -+ six_unlock_intent(&n2->c.lock); -+ -+ memmove(new_nodes + i - 1, -+ new_nodes + i, -+ sizeof(new_nodes[0]) * (nr_new_nodes - i)); -+ new_nodes[--nr_new_nodes] = NULL; -+ } else if (u64s) { -+ /* move part of n2 into n1 */ -+ n1->key.k.p = n1->data->max_key = -+ bkey_unpack_pos(n1, last); -+ -+ n2->data->min_key = bkey_successor(n1->data->max_key); -+ -+ memcpy_u64s(vstruct_last(s1), -+ s2->start, u64s); -+ le16_add_cpu(&s1->u64s, u64s); -+ -+ memmove(s2->start, -+ vstruct_idx(s2, u64s), -+ (le16_to_cpu(s2->u64s) - u64s) * sizeof(u64)); -+ s2->u64s = cpu_to_le16(le16_to_cpu(s2->u64s) - u64s); -+ -+ set_btree_bset_end(n1, n1->set); -+ set_btree_bset_end(n2, n2->set); -+ } -+ } -+ -+ for (i = 0; i < nr_new_nodes; i++) { -+ struct btree *n = new_nodes[i]; -+ -+ recalc_packed_keys(n); -+ btree_node_reset_sib_u64s(n); -+ -+ bch2_btree_build_aux_trees(n); -+ -+ bch2_btree_update_add_new_node(as, n); -+ six_unlock_write(&n->c.lock); -+ -+ bch2_btree_node_write(c, n, SIX_LOCK_intent); -+ } -+ -+ /* -+ * The keys for the old nodes get deleted. 
We don't want to insert keys -+ * that compare equal to the keys for the new nodes we'll also be -+ * inserting - we can't because keys on a keylist must be strictly -+ * greater than the previous keys, and we also don't need to since the -+ * key for the new node will serve the same purpose (overwriting the key -+ * for the old node). -+ */ -+ for (i = 0; i < nr_old_nodes; i++) { -+ struct bkey_i delete; -+ unsigned j; -+ -+ for (j = 0; j < nr_new_nodes; j++) -+ if (!bkey_cmp(old_nodes[i]->key.k.p, -+ new_nodes[j]->key.k.p)) -+ goto next; -+ -+ bkey_init(&delete.k); -+ delete.k.p = old_nodes[i]->key.k.p; -+ bch2_keylist_add_in_order(&keylist, &delete); -+next: -+ i = i; -+ } -+ -+ /* -+ * Keys for the new nodes get inserted: bch2_btree_insert_keys() only -+ * does the lookup once and thus expects the keys to be in sorted order -+ * so we have to make sure the new keys are correctly ordered with -+ * respect to the deleted keys added in the previous loop -+ */ -+ for (i = 0; i < nr_new_nodes; i++) -+ bch2_keylist_add_in_order(&keylist, &new_nodes[i]->key); -+ -+ /* Insert the newly coalesced nodes */ -+ bch2_btree_insert_node(as, parent, iter, &keylist, 0); -+ -+ BUG_ON(!bch2_keylist_empty(&keylist)); -+ -+ BUG_ON(iter->l[old_nodes[0]->c.level].b != old_nodes[0]); -+ -+ bch2_btree_iter_node_replace(iter, new_nodes[0]); -+ -+ for (i = 0; i < nr_new_nodes; i++) -+ bch2_btree_update_get_open_buckets(as, new_nodes[i]); -+ -+ /* Free the old nodes and update our sliding window */ -+ for (i = 0; i < nr_old_nodes; i++) { -+ bch2_btree_node_free_inmem(c, old_nodes[i], iter); -+ -+ /* -+ * the index update might have triggered a split, in which case -+ * the nodes we coalesced - the new nodes we just created - -+ * might not be sibling nodes anymore - don't add them to the -+ * sliding window (except the first): -+ */ -+ if (!i) { -+ old_nodes[i] = new_nodes[i]; -+ } else { -+ old_nodes[i] = NULL; -+ } -+ } -+ -+ for (i = 0; i < nr_new_nodes; i++) -+ 
six_unlock_intent(&new_nodes[i]->c.lock); -+ -+ bch2_btree_update_done(as); -+ bch2_keylist_free(&keylist, NULL); -+} -+ -+static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct btree *b; -+ bool kthread = (current->flags & PF_KTHREAD) != 0; -+ unsigned i; -+ -+ /* Sliding window of adjacent btree nodes */ -+ struct btree *merge[GC_MERGE_NODES]; -+ u32 lock_seq[GC_MERGE_NODES]; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ /* -+ * XXX: We don't have a good way of positively matching on sibling nodes -+ * that have the same parent - this code works by handling the cases -+ * where they might not have the same parent, and is thus fragile. Ugh. -+ * -+ * Perhaps redo this to use multiple linked iterators? -+ */ -+ memset(merge, 0, sizeof(merge)); -+ -+ __for_each_btree_node(&trans, iter, btree_id, POS_MIN, -+ BTREE_MAX_DEPTH, 0, -+ BTREE_ITER_PREFETCH, b) { -+ memmove(merge + 1, merge, -+ sizeof(merge) - sizeof(merge[0])); -+ memmove(lock_seq + 1, lock_seq, -+ sizeof(lock_seq) - sizeof(lock_seq[0])); -+ -+ merge[0] = b; -+ -+ for (i = 1; i < GC_MERGE_NODES; i++) { -+ if (!merge[i] || -+ !six_relock_intent(&merge[i]->c.lock, lock_seq[i])) -+ break; -+ -+ if (merge[i]->c.level != merge[0]->c.level) { -+ six_unlock_intent(&merge[i]->c.lock); -+ break; -+ } -+ } -+ memset(merge + i, 0, (GC_MERGE_NODES - i) * sizeof(merge[0])); -+ -+ bch2_coalesce_nodes(c, iter, merge); -+ -+ for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) { -+ lock_seq[i] = merge[i]->c.lock.state.seq; -+ six_unlock_intent(&merge[i]->c.lock); -+ } -+ -+ lock_seq[0] = merge[0]->c.lock.state.seq; -+ -+ if (kthread && kthread_should_stop()) { -+ bch2_trans_exit(&trans); -+ return -ESHUTDOWN; -+ } -+ -+ bch2_trans_cond_resched(&trans); -+ -+ /* -+ * If the parent node wasn't relocked, it might have been split -+ * and the nodes in our sliding window might not have the same -+ * parent anymore - blow away the sliding 
window: -+ */ -+ if (btree_iter_node(iter, iter->level + 1) && -+ !btree_node_intent_locked(iter, iter->level + 1)) -+ memset(merge + 1, 0, -+ (GC_MERGE_NODES - 1) * sizeof(merge[0])); -+ } -+ return bch2_trans_exit(&trans); -+} -+ -+/** -+ * bch_coalesce - coalesce adjacent nodes with low occupancy -+ */ -+void bch2_coalesce(struct bch_fs *c) -+{ -+ enum btree_id id; -+ -+ down_read(&c->gc_lock); -+ trace_gc_coalesce_start(c); -+ -+ for (id = 0; id < BTREE_ID_NR; id++) { -+ int ret = c->btree_roots[id].b -+ ? bch2_coalesce_btree(c, id) -+ : 0; -+ -+ if (ret) { -+ if (ret != -ESHUTDOWN) -+ bch_err(c, "btree coalescing failed: %d", ret); -+ return; -+ } -+ } -+ -+ trace_gc_coalesce_end(c); -+ up_read(&c->gc_lock); -+} -+ -+static int bch2_gc_thread(void *arg) -+{ -+ struct bch_fs *c = arg; -+ struct io_clock *clock = &c->io_clock[WRITE]; -+ unsigned long last = atomic_long_read(&clock->now); -+ unsigned last_kick = atomic_read(&c->kick_gc); -+ int ret; -+ -+ set_freezable(); -+ -+ while (1) { -+ while (1) { -+ set_current_state(TASK_INTERRUPTIBLE); -+ -+ if (kthread_should_stop()) { -+ __set_current_state(TASK_RUNNING); -+ return 0; -+ } -+ -+ if (atomic_read(&c->kick_gc) != last_kick) -+ break; -+ -+ if (c->btree_gc_periodic) { -+ unsigned long next = last + c->capacity / 16; -+ -+ if (atomic_long_read(&clock->now) >= next) -+ break; -+ -+ bch2_io_clock_schedule_timeout(clock, next); -+ } else { -+ schedule(); -+ } -+ -+ try_to_freeze(); -+ } -+ __set_current_state(TASK_RUNNING); -+ -+ last = atomic_long_read(&clock->now); -+ last_kick = atomic_read(&c->kick_gc); -+ -+ /* -+ * Full gc is currently incompatible with btree key cache: -+ */ -+#if 0 -+ ret = bch2_gc(c, NULL, false, false); -+#else -+ ret = bch2_gc_gens(c); -+#endif -+ if (ret) -+ bch_err(c, "btree gc failed: %i", ret); -+ -+ debug_check_no_locks_held(); -+ } -+ -+ return 0; -+} -+ -+void bch2_gc_thread_stop(struct bch_fs *c) -+{ -+ struct task_struct *p; -+ -+ p = c->gc_thread; -+ c->gc_thread = NULL; 
-+ -+ if (p) { -+ kthread_stop(p); -+ put_task_struct(p); -+ } -+} -+ -+int bch2_gc_thread_start(struct bch_fs *c) -+{ -+ struct task_struct *p; -+ -+ BUG_ON(c->gc_thread); -+ -+ p = kthread_create(bch2_gc_thread, c, "bch_gc"); -+ if (IS_ERR(p)) -+ return PTR_ERR(p); -+ -+ get_task_struct(p); -+ c->gc_thread = p; -+ wake_up_process(p); -+ return 0; -+} -diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h -new file mode 100644 -index 000000000000..3694a3df62a8 ---- /dev/null -+++ b/fs/bcachefs/btree_gc.h -@@ -0,0 +1,121 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BTREE_GC_H -+#define _BCACHEFS_BTREE_GC_H -+ -+#include "btree_types.h" -+ -+void bch2_coalesce(struct bch_fs *); -+ -+struct journal_keys; -+int bch2_gc(struct bch_fs *, struct journal_keys *, bool, bool); -+int bch2_gc_gens(struct bch_fs *); -+void bch2_gc_thread_stop(struct bch_fs *); -+int bch2_gc_thread_start(struct bch_fs *); -+void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned); -+ -+/* -+ * For concurrent mark and sweep (with other index updates), we define a total -+ * ordering of _all_ references GC walks: -+ * -+ * Note that some references will have the same GC position as others - e.g. -+ * everything within the same btree node; in those cases we're relying on -+ * whatever locking exists for where those references live, i.e. the write lock -+ * on a btree node. -+ * -+ * That locking is also required to ensure GC doesn't pass the updater in -+ * between the updater adding/removing the reference and updating the GC marks; -+ * without that, we would at best double count sometimes. -+ * -+ * That part is important - whenever calling bch2_mark_pointers(), a lock _must_ -+ * be held that prevents GC from passing the position the updater is at. -+ * -+ * (What about the start of gc, when we're clearing all the marks? 
GC clears the -+ * mark with the gc pos seqlock held, and bch_mark_bucket checks against the gc -+ * position inside its cmpxchg loop, so crap magically works). -+ */ -+ -+/* Position of (the start of) a gc phase: */ -+static inline struct gc_pos gc_phase(enum gc_phase phase) -+{ -+ return (struct gc_pos) { -+ .phase = phase, -+ .pos = POS_MIN, -+ .level = 0, -+ }; -+} -+ -+static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) -+{ -+ if (l.phase != r.phase) -+ return l.phase < r.phase ? -1 : 1; -+ if (bkey_cmp(l.pos, r.pos)) -+ return bkey_cmp(l.pos, r.pos); -+ if (l.level != r.level) -+ return l.level < r.level ? -1 : 1; -+ return 0; -+} -+ -+static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id) -+{ -+ switch (id) { -+#define x(n, v, s) case BTREE_ID_##n: return GC_PHASE_BTREE_##n; -+ BCH_BTREE_IDS() -+#undef x -+ default: -+ BUG(); -+ } -+} -+ -+static inline struct gc_pos gc_pos_btree(enum btree_id id, -+ struct bpos pos, unsigned level) -+{ -+ return (struct gc_pos) { -+ .phase = btree_id_to_gc_phase(id), -+ .pos = pos, -+ .level = level, -+ }; -+} -+ -+/* -+ * GC position of the pointers within a btree node: note, _not_ for &b->key -+ * itself, that lives in the parent node: -+ */ -+static inline struct gc_pos gc_pos_btree_node(struct btree *b) -+{ -+ return gc_pos_btree(b->c.btree_id, b->key.k.p, b->c.level); -+} -+ -+/* -+ * GC position of the pointer to a btree root: we don't use -+ * gc_pos_pointer_to_btree_node() here to avoid a potential race with -+ * btree_split() increasing the tree depth - the new root will have level > the -+ * old root and thus have a greater gc position than the old root, but that -+ * would be incorrect since once gc has marked the root it's not coming back. 
-+ */ -+static inline struct gc_pos gc_pos_btree_root(enum btree_id id) -+{ -+ return gc_pos_btree(id, POS_MAX, BTREE_MAX_DEPTH); -+} -+ -+static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *ob) -+{ -+ return (struct gc_pos) { -+ .phase = GC_PHASE_ALLOC, -+ .pos = POS(ob ? ob - c->open_buckets : 0, 0), -+ }; -+} -+ -+static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) -+{ -+ unsigned seq; -+ bool ret; -+ -+ do { -+ seq = read_seqcount_begin(&c->gc_pos_lock); -+ ret = gc_pos_cmp(pos, c->gc_pos) <= 0; -+ } while (read_seqcount_retry(&c->gc_pos_lock, seq)); -+ -+ return ret; -+} -+ -+#endif /* _BCACHEFS_BTREE_GC_H */ -diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c -new file mode 100644 -index 000000000000..2f5097218f9c ---- /dev/null -+++ b/fs/bcachefs/btree_io.c -@@ -0,0 +1,1834 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "bkey_methods.h" -+#include "bkey_sort.h" -+#include "btree_cache.h" -+#include "btree_io.h" -+#include "btree_iter.h" -+#include "btree_locking.h" -+#include "btree_update.h" -+#include "btree_update_interior.h" -+#include "buckets.h" -+#include "checksum.h" -+#include "debug.h" -+#include "error.h" -+#include "extents.h" -+#include "io.h" -+#include "journal_reclaim.h" -+#include "journal_seq_blacklist.h" -+#include "super-io.h" -+ -+#include -+#include -+ -+static void verify_no_dups(struct btree *b, -+ struct bkey_packed *start, -+ struct bkey_packed *end, -+ bool extents) -+{ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ struct bkey_packed *k, *p; -+ -+ if (start == end) -+ return; -+ -+ for (p = start, k = bkey_next_skip_noops(start, end); -+ k != end; -+ p = k, k = bkey_next_skip_noops(k, end)) { -+ struct bkey l = bkey_unpack_key(b, p); -+ struct bkey r = bkey_unpack_key(b, k); -+ -+ BUG_ON(extents -+ ? 
bkey_cmp(l.p, bkey_start_pos(&r)) > 0 -+ : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0); -+ //BUG_ON(bkey_cmp_packed(&b->format, p, k) >= 0); -+ } -+#endif -+} -+ -+static void set_needs_whiteout(struct bset *i, int v) -+{ -+ struct bkey_packed *k; -+ -+ for (k = i->start; -+ k != vstruct_last(i); -+ k = bkey_next_skip_noops(k, vstruct_last(i))) -+ k->needs_whiteout = v; -+} -+ -+static void btree_bounce_free(struct bch_fs *c, size_t size, -+ bool used_mempool, void *p) -+{ -+ if (used_mempool) -+ mempool_free(p, &c->btree_bounce_pool); -+ else -+ vpfree(p, size); -+} -+ -+static void *btree_bounce_alloc(struct bch_fs *c, size_t size, -+ bool *used_mempool) -+{ -+ unsigned flags = memalloc_nofs_save(); -+ void *p; -+ -+ BUG_ON(size > btree_bytes(c)); -+ -+ *used_mempool = false; -+ p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT); -+ if (!p) { -+ *used_mempool = true; -+ p = mempool_alloc(&c->btree_bounce_pool, GFP_NOIO); -+ } -+ memalloc_nofs_restore(flags); -+ return p; -+} -+ -+static void sort_bkey_ptrs(const struct btree *bt, -+ struct bkey_packed **ptrs, unsigned nr) -+{ -+ unsigned n = nr, a = nr / 2, b, c, d; -+ -+ if (!a) -+ return; -+ -+ /* Heap sort: see lib/sort.c: */ -+ while (1) { -+ if (a) -+ a--; -+ else if (--n) -+ swap(ptrs[0], ptrs[n]); -+ else -+ break; -+ -+ for (b = a; c = 2 * b + 1, (d = c + 1) < n;) -+ b = bkey_cmp_packed(bt, -+ ptrs[c], -+ ptrs[d]) >= 0 ? 
c : d; -+ if (d == n) -+ b = c; -+ -+ while (b != a && -+ bkey_cmp_packed(bt, -+ ptrs[a], -+ ptrs[b]) >= 0) -+ b = (b - 1) / 2; -+ c = b; -+ while (b != a) { -+ b = (b - 1) / 2; -+ swap(ptrs[b], ptrs[c]); -+ } -+ } -+} -+ -+static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) -+{ -+ struct bkey_packed *new_whiteouts, **ptrs, **ptrs_end, *k; -+ bool used_mempool = false; -+ size_t bytes = b->whiteout_u64s * sizeof(u64); -+ -+ if (!b->whiteout_u64s) -+ return; -+ -+ new_whiteouts = btree_bounce_alloc(c, bytes, &used_mempool); -+ -+ ptrs = ptrs_end = ((void *) new_whiteouts + bytes); -+ -+ for (k = unwritten_whiteouts_start(c, b); -+ k != unwritten_whiteouts_end(c, b); -+ k = bkey_next(k)) -+ *--ptrs = k; -+ -+ sort_bkey_ptrs(b, ptrs, ptrs_end - ptrs); -+ -+ k = new_whiteouts; -+ -+ while (ptrs != ptrs_end) { -+ bkey_copy(k, *ptrs); -+ k = bkey_next(k); -+ ptrs++; -+ } -+ -+ verify_no_dups(b, new_whiteouts, -+ (void *) ((u64 *) new_whiteouts + b->whiteout_u64s), -+ btree_node_old_extent_overwrite(b)); -+ -+ memcpy_u64s(unwritten_whiteouts_start(c, b), -+ new_whiteouts, b->whiteout_u64s); -+ -+ btree_bounce_free(c, bytes, used_mempool, new_whiteouts); -+} -+ -+static bool should_compact_bset(struct btree *b, struct bset_tree *t, -+ bool compacting, enum compact_mode mode) -+{ -+ if (!bset_dead_u64s(b, t)) -+ return false; -+ -+ switch (mode) { -+ case COMPACT_LAZY: -+ return should_compact_bset_lazy(b, t) || -+ (compacting && !bset_written(b, bset(b, t))); -+ case COMPACT_ALL: -+ return true; -+ default: -+ BUG(); -+ } -+} -+ -+static bool bch2_compact_extent_whiteouts(struct bch_fs *c, -+ struct btree *b, -+ enum compact_mode mode) -+{ -+ const struct bkey_format *f = &b->format; -+ struct bset_tree *t; -+ struct bkey_packed *whiteouts = NULL; -+ struct bkey_packed *u_start, *u_pos; -+ struct sort_iter sort_iter; -+ unsigned bytes, whiteout_u64s = 0, u64s; -+ bool used_mempool, compacting = false; -+ -+ BUG_ON(!btree_node_is_extents(b)); -+ -+ 
for_each_bset(b, t) -+ if (should_compact_bset(b, t, whiteout_u64s != 0, mode)) -+ whiteout_u64s += bset_dead_u64s(b, t); -+ -+ if (!whiteout_u64s) -+ return false; -+ -+ bch2_sort_whiteouts(c, b); -+ -+ sort_iter_init(&sort_iter, b); -+ -+ whiteout_u64s += b->whiteout_u64s; -+ bytes = whiteout_u64s * sizeof(u64); -+ -+ whiteouts = btree_bounce_alloc(c, bytes, &used_mempool); -+ u_start = u_pos = whiteouts; -+ -+ memcpy_u64s(u_pos, unwritten_whiteouts_start(c, b), -+ b->whiteout_u64s); -+ u_pos = (void *) u_pos + b->whiteout_u64s * sizeof(u64); -+ -+ sort_iter_add(&sort_iter, u_start, u_pos); -+ -+ for_each_bset(b, t) { -+ struct bset *i = bset(b, t); -+ struct bkey_packed *k, *n, *out, *start, *end; -+ struct btree_node_entry *src = NULL, *dst = NULL; -+ -+ if (t != b->set && !bset_written(b, i)) { -+ src = container_of(i, struct btree_node_entry, keys); -+ dst = max(write_block(b), -+ (void *) btree_bkey_last(b, t - 1)); -+ } -+ -+ if (src != dst) -+ compacting = true; -+ -+ if (!should_compact_bset(b, t, compacting, mode)) { -+ if (src != dst) { -+ memmove(dst, src, sizeof(*src) + -+ le16_to_cpu(src->keys.u64s) * -+ sizeof(u64)); -+ i = &dst->keys; -+ set_btree_bset(b, t, i); -+ } -+ continue; -+ } -+ -+ compacting = true; -+ u_start = u_pos; -+ start = i->start; -+ end = vstruct_last(i); -+ -+ if (src != dst) { -+ memmove(dst, src, sizeof(*src)); -+ i = &dst->keys; -+ set_btree_bset(b, t, i); -+ } -+ -+ out = i->start; -+ -+ for (k = start; k != end; k = n) { -+ n = bkey_next_skip_noops(k, end); -+ -+ if (bkey_deleted(k)) -+ continue; -+ -+ BUG_ON(bkey_whiteout(k) && -+ k->needs_whiteout && -+ bkey_written(b, k)); -+ -+ if (bkey_whiteout(k) && !k->needs_whiteout) -+ continue; -+ -+ if (bkey_whiteout(k)) { -+ memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k)); -+ set_bkeyp_val_u64s(f, u_pos, 0); -+ u_pos = bkey_next(u_pos); -+ } else { -+ bkey_copy(out, k); -+ out = bkey_next(out); -+ } -+ } -+ -+ sort_iter_add(&sort_iter, u_start, u_pos); -+ -+ i->u64s = 
cpu_to_le16((u64 *) out - i->_data); -+ set_btree_bset_end(b, t); -+ bch2_bset_set_no_aux_tree(b, t); -+ } -+ -+ b->whiteout_u64s = (u64 *) u_pos - (u64 *) whiteouts; -+ -+ BUG_ON((void *) unwritten_whiteouts_start(c, b) < -+ (void *) btree_bkey_last(b, bset_tree_last(b))); -+ -+ u64s = bch2_sort_extent_whiteouts(unwritten_whiteouts_start(c, b), -+ &sort_iter); -+ -+ BUG_ON(u64s > b->whiteout_u64s); -+ BUG_ON(u_pos != whiteouts && !u64s); -+ -+ if (u64s != b->whiteout_u64s) { -+ void *src = unwritten_whiteouts_start(c, b); -+ -+ b->whiteout_u64s = u64s; -+ memmove_u64s_up(unwritten_whiteouts_start(c, b), src, u64s); -+ } -+ -+ verify_no_dups(b, -+ unwritten_whiteouts_start(c, b), -+ unwritten_whiteouts_end(c, b), -+ true); -+ -+ btree_bounce_free(c, bytes, used_mempool, whiteouts); -+ -+ bch2_btree_build_aux_trees(b); -+ -+ bch_btree_keys_u64s_remaining(c, b); -+ bch2_verify_btree_nr_keys(b); -+ -+ return true; -+} -+ -+static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) -+{ -+ struct bset_tree *t; -+ bool ret = false; -+ -+ for_each_bset(b, t) { -+ struct bset *i = bset(b, t); -+ struct bkey_packed *k, *n, *out, *start, *end; -+ struct btree_node_entry *src = NULL, *dst = NULL; -+ -+ if (t != b->set && !bset_written(b, i)) { -+ src = container_of(i, struct btree_node_entry, keys); -+ dst = max(write_block(b), -+ (void *) btree_bkey_last(b, t - 1)); -+ } -+ -+ if (src != dst) -+ ret = true; -+ -+ if (!should_compact_bset(b, t, ret, mode)) { -+ if (src != dst) { -+ memmove(dst, src, sizeof(*src) + -+ le16_to_cpu(src->keys.u64s) * -+ sizeof(u64)); -+ i = &dst->keys; -+ set_btree_bset(b, t, i); -+ } -+ continue; -+ } -+ -+ start = btree_bkey_first(b, t); -+ end = btree_bkey_last(b, t); -+ -+ if (src != dst) { -+ memmove(dst, src, sizeof(*src)); -+ i = &dst->keys; -+ set_btree_bset(b, t, i); -+ } -+ -+ out = i->start; -+ -+ for (k = start; k != end; k = n) { -+ n = bkey_next_skip_noops(k, end); -+ -+ if (!bkey_whiteout(k)) { -+ bkey_copy(out, k); 
-+ out = bkey_next(out); -+ } else { -+ BUG_ON(k->needs_whiteout); -+ } -+ } -+ -+ i->u64s = cpu_to_le16((u64 *) out - i->_data); -+ set_btree_bset_end(b, t); -+ bch2_bset_set_no_aux_tree(b, t); -+ ret = true; -+ } -+ -+ bch2_verify_btree_nr_keys(b); -+ -+ bch2_btree_build_aux_trees(b); -+ -+ return ret; -+} -+ -+bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, -+ enum compact_mode mode) -+{ -+ return !btree_node_old_extent_overwrite(b) -+ ? bch2_drop_whiteouts(b, mode) -+ : bch2_compact_extent_whiteouts(c, b, mode); -+} -+ -+static void btree_node_sort(struct bch_fs *c, struct btree *b, -+ struct btree_iter *iter, -+ unsigned start_idx, -+ unsigned end_idx, -+ bool filter_whiteouts) -+{ -+ struct btree_node *out; -+ struct sort_iter sort_iter; -+ struct bset_tree *t; -+ struct bset *start_bset = bset(b, &b->set[start_idx]); -+ bool used_mempool = false; -+ u64 start_time, seq = 0; -+ unsigned i, u64s = 0, bytes, shift = end_idx - start_idx - 1; -+ bool sorting_entire_node = start_idx == 0 && -+ end_idx == b->nsets; -+ -+ sort_iter_init(&sort_iter, b); -+ -+ for (t = b->set + start_idx; -+ t < b->set + end_idx; -+ t++) { -+ u64s += le16_to_cpu(bset(b, t)->u64s); -+ sort_iter_add(&sort_iter, -+ btree_bkey_first(b, t), -+ btree_bkey_last(b, t)); -+ } -+ -+ bytes = sorting_entire_node -+ ? btree_bytes(c) -+ : __vstruct_bytes(struct btree_node, u64s); -+ -+ out = btree_bounce_alloc(c, bytes, &used_mempool); -+ -+ start_time = local_clock(); -+ -+ if (btree_node_old_extent_overwrite(b)) -+ filter_whiteouts = bset_written(b, start_bset); -+ -+ u64s = (btree_node_old_extent_overwrite(b) -+ ? 
bch2_sort_extents -+ : bch2_sort_keys)(out->keys.start, -+ &sort_iter, -+ filter_whiteouts); -+ -+ out->keys.u64s = cpu_to_le16(u64s); -+ -+ BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes); -+ -+ if (sorting_entire_node) -+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], -+ start_time); -+ -+ /* Make sure we preserve bset journal_seq: */ -+ for (t = b->set + start_idx; t < b->set + end_idx; t++) -+ seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq)); -+ start_bset->journal_seq = cpu_to_le64(seq); -+ -+ if (sorting_entire_node) { -+ unsigned u64s = le16_to_cpu(out->keys.u64s); -+ -+ BUG_ON(bytes != btree_bytes(c)); -+ -+ /* -+ * Our temporary buffer is the same size as the btree node's -+ * buffer, we can just swap buffers instead of doing a big -+ * memcpy() -+ */ -+ *out = *b->data; -+ out->keys.u64s = cpu_to_le16(u64s); -+ swap(out, b->data); -+ set_btree_bset(b, b->set, &b->data->keys); -+ } else { -+ start_bset->u64s = out->keys.u64s; -+ memcpy_u64s(start_bset->start, -+ out->keys.start, -+ le16_to_cpu(out->keys.u64s)); -+ } -+ -+ for (i = start_idx + 1; i < end_idx; i++) -+ b->nr.bset_u64s[start_idx] += -+ b->nr.bset_u64s[i]; -+ -+ b->nsets -= shift; -+ -+ for (i = start_idx + 1; i < b->nsets; i++) { -+ b->nr.bset_u64s[i] = b->nr.bset_u64s[i + shift]; -+ b->set[i] = b->set[i + shift]; -+ } -+ -+ for (i = b->nsets; i < MAX_BSETS; i++) -+ b->nr.bset_u64s[i] = 0; -+ -+ set_btree_bset_end(b, &b->set[start_idx]); -+ bch2_bset_set_no_aux_tree(b, &b->set[start_idx]); -+ -+ btree_bounce_free(c, bytes, used_mempool, out); -+ -+ bch2_verify_btree_nr_keys(b); -+} -+ -+void bch2_btree_sort_into(struct bch_fs *c, -+ struct btree *dst, -+ struct btree *src) -+{ -+ struct btree_nr_keys nr; -+ struct btree_node_iter src_iter; -+ u64 start_time = local_clock(); -+ -+ BUG_ON(dst->nsets != 1); -+ -+ bch2_bset_set_no_aux_tree(dst, dst->set); -+ -+ bch2_btree_node_iter_init_from_start(&src_iter, src); -+ -+ if (btree_node_is_extents(src)) -+ nr = 
bch2_sort_repack_merge(c, btree_bset_first(dst), -+ src, &src_iter, -+ &dst->format, -+ true); -+ else -+ nr = bch2_sort_repack(btree_bset_first(dst), -+ src, &src_iter, -+ &dst->format, -+ true); -+ -+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], -+ start_time); -+ -+ set_btree_bset_end(dst, dst->set); -+ -+ dst->nr.live_u64s += nr.live_u64s; -+ dst->nr.bset_u64s[0] += nr.bset_u64s[0]; -+ dst->nr.packed_keys += nr.packed_keys; -+ dst->nr.unpacked_keys += nr.unpacked_keys; -+ -+ bch2_verify_btree_nr_keys(dst); -+} -+ -+#define SORT_CRIT (4096 / sizeof(u64)) -+ -+/* -+ * We're about to add another bset to the btree node, so if there's currently -+ * too many bsets - sort some of them together: -+ */ -+static bool btree_node_compact(struct bch_fs *c, struct btree *b, -+ struct btree_iter *iter) -+{ -+ unsigned unwritten_idx; -+ bool ret = false; -+ -+ for (unwritten_idx = 0; -+ unwritten_idx < b->nsets; -+ unwritten_idx++) -+ if (!bset_written(b, bset(b, &b->set[unwritten_idx]))) -+ break; -+ -+ if (b->nsets - unwritten_idx > 1) { -+ btree_node_sort(c, b, iter, unwritten_idx, -+ b->nsets, false); -+ ret = true; -+ } -+ -+ if (unwritten_idx > 1) { -+ btree_node_sort(c, b, iter, 0, unwritten_idx, false); -+ ret = true; -+ } -+ -+ return ret; -+} -+ -+void bch2_btree_build_aux_trees(struct btree *b) -+{ -+ struct bset_tree *t; -+ -+ for_each_bset(b, t) -+ bch2_bset_build_aux_tree(b, t, -+ !bset_written(b, bset(b, t)) && -+ t == bset_tree_last(b)); -+} -+ -+/* -+ * @bch_btree_init_next - initialize a new (unwritten) bset that can then be -+ * inserted into -+ * -+ * Safe to call if there already is an unwritten bset - will only add a new bset -+ * if @b doesn't already have one. -+ * -+ * Returns true if we sorted (i.e. 
invalidated iterators -+ */ -+void bch2_btree_init_next(struct bch_fs *c, struct btree *b, -+ struct btree_iter *iter) -+{ -+ struct btree_node_entry *bne; -+ bool did_sort; -+ -+ EBUG_ON(!(b->c.lock.state.seq & 1)); -+ EBUG_ON(iter && iter->l[b->c.level].b != b); -+ -+ did_sort = btree_node_compact(c, b, iter); -+ -+ bne = want_new_bset(c, b); -+ if (bne) -+ bch2_bset_init_next(c, b, bne); -+ -+ bch2_btree_build_aux_trees(b); -+ -+ if (iter && did_sort) -+ bch2_btree_iter_reinit_node(iter, b); -+} -+ -+static void btree_err_msg(struct printbuf *out, struct bch_fs *c, -+ struct btree *b, struct bset *i, -+ unsigned offset, int write) -+{ -+ pr_buf(out, "error validating btree node %sat btree %u level %u/%u\n" -+ "pos ", -+ write ? "before write " : "", -+ b->c.btree_id, b->c.level, -+ c->btree_roots[b->c.btree_id].level); -+ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); -+ -+ pr_buf(out, " node offset %u", b->written); -+ if (i) -+ pr_buf(out, " bset u64s %u", le16_to_cpu(i->u64s)); -+} -+ -+enum btree_err_type { -+ BTREE_ERR_FIXABLE, -+ BTREE_ERR_WANT_RETRY, -+ BTREE_ERR_MUST_RETRY, -+ BTREE_ERR_FATAL, -+}; -+ -+enum btree_validate_ret { -+ BTREE_RETRY_READ = 64, -+}; -+ -+#define btree_err(type, c, b, i, msg, ...) 
\ -+({ \ -+ __label__ out; \ -+ char _buf[300]; \ -+ struct printbuf out = PBUF(_buf); \ -+ \ -+ btree_err_msg(&out, c, b, i, b->written, write); \ -+ pr_buf(&out, ": " msg, ##__VA_ARGS__); \ -+ \ -+ if (type == BTREE_ERR_FIXABLE && \ -+ write == READ && \ -+ !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \ -+ mustfix_fsck_err(c, "%s", _buf); \ -+ goto out; \ -+ } \ -+ \ -+ switch (write) { \ -+ case READ: \ -+ bch_err(c, "%s", _buf); \ -+ \ -+ switch (type) { \ -+ case BTREE_ERR_FIXABLE: \ -+ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ -+ goto fsck_err; \ -+ case BTREE_ERR_WANT_RETRY: \ -+ if (have_retry) { \ -+ ret = BTREE_RETRY_READ; \ -+ goto fsck_err; \ -+ } \ -+ break; \ -+ case BTREE_ERR_MUST_RETRY: \ -+ ret = BTREE_RETRY_READ; \ -+ goto fsck_err; \ -+ case BTREE_ERR_FATAL: \ -+ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ -+ goto fsck_err; \ -+ } \ -+ break; \ -+ case WRITE: \ -+ bch_err(c, "corrupt metadata before write: %s", _buf); \ -+ \ -+ if (bch2_fs_inconsistent(c)) { \ -+ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ -+ goto fsck_err; \ -+ } \ -+ break; \ -+ } \ -+out: \ -+ true; \ -+}) -+ -+#define btree_err_on(cond, ...) ((cond) ? 
btree_err(__VA_ARGS__) : false) -+ -+static int validate_bset(struct bch_fs *c, struct btree *b, -+ struct bset *i, unsigned sectors, -+ int write, bool have_retry) -+{ -+ unsigned version = le16_to_cpu(i->version); -+ const char *err; -+ int ret = 0; -+ -+ btree_err_on((version != BCH_BSET_VERSION_OLD && -+ version < bcachefs_metadata_version_min) || -+ version >= bcachefs_metadata_version_max, -+ BTREE_ERR_FATAL, c, b, i, -+ "unsupported bset version"); -+ -+ if (btree_err_on(b->written + sectors > c->opts.btree_node_size, -+ BTREE_ERR_FIXABLE, c, b, i, -+ "bset past end of btree node")) { -+ i->u64s = 0; -+ return 0; -+ } -+ -+ btree_err_on(b->written && !i->u64s, -+ BTREE_ERR_FIXABLE, c, b, i, -+ "empty bset"); -+ -+ if (!b->written) { -+ struct btree_node *bn = -+ container_of(i, struct btree_node, keys); -+ /* These indicate that we read the wrong btree node: */ -+ -+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { -+ struct bch_btree_ptr_v2 *bp = -+ &bkey_i_to_btree_ptr_v2(&b->key)->v; -+ -+ /* XXX endianness */ -+ btree_err_on(bp->seq != bn->keys.seq, -+ BTREE_ERR_MUST_RETRY, c, b, NULL, -+ "incorrect sequence number (wrong btree node)"); -+ } -+ -+ btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id, -+ BTREE_ERR_MUST_RETRY, c, b, i, -+ "incorrect btree id"); -+ -+ btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level, -+ BTREE_ERR_MUST_RETRY, c, b, i, -+ "incorrect level"); -+ -+ if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) { -+ u64 *p = (u64 *) &bn->ptr; -+ -+ *p = swab64(*p); -+ } -+ -+ if (!write) -+ compat_btree_node(b->c.level, b->c.btree_id, version, -+ BSET_BIG_ENDIAN(i), write, bn); -+ -+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { -+ struct bch_btree_ptr_v2 *bp = -+ &bkey_i_to_btree_ptr_v2(&b->key)->v; -+ -+ btree_err_on(bkey_cmp(b->data->min_key, bp->min_key), -+ BTREE_ERR_MUST_RETRY, c, b, NULL, -+ "incorrect min_key: got %llu:%llu should be %llu:%llu", -+ b->data->min_key.inode, -+ b->data->min_key.offset, -+ bp->min_key.inode, -+ bp->min_key.offset); -+ } 
-+ -+ btree_err_on(bkey_cmp(bn->max_key, b->key.k.p), -+ BTREE_ERR_MUST_RETRY, c, b, i, -+ "incorrect max key"); -+ -+ if (write) -+ compat_btree_node(b->c.level, b->c.btree_id, version, -+ BSET_BIG_ENDIAN(i), write, bn); -+ -+ /* XXX: ideally we would be validating min_key too */ -+#if 0 -+ /* -+ * not correct anymore, due to btree node write error -+ * handling -+ * -+ * need to add bn->seq to btree keys and verify -+ * against that -+ */ -+ btree_err_on(!extent_contains_ptr(bkey_i_to_s_c_extent(&b->key), -+ bn->ptr), -+ BTREE_ERR_FATAL, c, b, i, -+ "incorrect backpointer"); -+#endif -+ err = bch2_bkey_format_validate(&bn->format); -+ btree_err_on(err, -+ BTREE_ERR_FATAL, c, b, i, -+ "invalid bkey format: %s", err); -+ -+ compat_bformat(b->c.level, b->c.btree_id, version, -+ BSET_BIG_ENDIAN(i), write, -+ &bn->format); -+ } -+fsck_err: -+ return ret; -+} -+ -+static int validate_bset_keys(struct bch_fs *c, struct btree *b, -+ struct bset *i, unsigned *whiteout_u64s, -+ int write, bool have_retry) -+{ -+ unsigned version = le16_to_cpu(i->version); -+ struct bkey_packed *k, *prev = NULL; -+ bool seen_non_whiteout = false; -+ int ret = 0; -+ -+ if (!BSET_SEPARATE_WHITEOUTS(i)) { -+ seen_non_whiteout = true; -+ *whiteout_u64s = 0; -+ } -+ -+ for (k = i->start; -+ k != vstruct_last(i);) { -+ struct bkey_s u; -+ struct bkey tmp; -+ const char *invalid; -+ -+ if (btree_err_on(bkey_next(k) > vstruct_last(i), -+ BTREE_ERR_FIXABLE, c, b, i, -+ "key extends past end of bset")) { -+ i->u64s = cpu_to_le16((u64 *) k - i->_data); -+ break; -+ } -+ -+ if (btree_err_on(k->format > KEY_FORMAT_CURRENT, -+ BTREE_ERR_FIXABLE, c, b, i, -+ "invalid bkey format %u", k->format)) { -+ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); -+ memmove_u64s_down(k, bkey_next(k), -+ (u64 *) vstruct_end(i) - (u64 *) k); -+ continue; -+ } -+ -+ /* XXX: validate k->u64s */ -+ if (!write) -+ bch2_bkey_compat(b->c.level, b->c.btree_id, version, -+ BSET_BIG_ENDIAN(i), write, -+ &b->format, k); -+ -+ 
u = __bkey_disassemble(b, k, &tmp); -+ -+ invalid = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b)) ?: -+ bch2_bkey_in_btree_node(b, u.s_c) ?: -+ (write ? bch2_bkey_val_invalid(c, u.s_c) : NULL); -+ if (invalid) { -+ char buf[160]; -+ -+ bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); -+ btree_err(BTREE_ERR_FIXABLE, c, b, i, -+ "invalid bkey:\n%s\n%s", invalid, buf); -+ -+ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); -+ memmove_u64s_down(k, bkey_next(k), -+ (u64 *) vstruct_end(i) - (u64 *) k); -+ continue; -+ } -+ -+ if (write) -+ bch2_bkey_compat(b->c.level, b->c.btree_id, version, -+ BSET_BIG_ENDIAN(i), write, -+ &b->format, k); -+ -+ /* -+ * with the separate whiteouts thing (used for extents), the -+ * second set of keys actually can have whiteouts too, so we -+ * can't solely go off bkey_whiteout()... -+ */ -+ -+ if (!seen_non_whiteout && -+ (!bkey_whiteout(k) || -+ (prev && bkey_iter_cmp(b, prev, k) > 0))) { -+ *whiteout_u64s = k->_data - i->_data; -+ seen_non_whiteout = true; -+ } else if (prev && bkey_iter_cmp(b, prev, k) > 0) { -+ char buf1[80]; -+ char buf2[80]; -+ struct bkey up = bkey_unpack_key(b, prev); -+ -+ bch2_bkey_to_text(&PBUF(buf1), &up); -+ bch2_bkey_to_text(&PBUF(buf2), u.k); -+ -+ bch2_dump_bset(c, b, i, 0); -+ btree_err(BTREE_ERR_FATAL, c, b, i, -+ "keys out of order: %s > %s", -+ buf1, buf2); -+ /* XXX: repair this */ -+ } -+ -+ prev = k; -+ k = bkey_next_skip_noops(k, vstruct_last(i)); -+ } -+fsck_err: -+ return ret; -+} -+ -+int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry) -+{ -+ struct btree_node_entry *bne; -+ struct sort_iter *iter; -+ struct btree_node *sorted; -+ struct bkey_packed *k; -+ struct bch_extent_ptr *ptr; -+ struct bset *i; -+ bool used_mempool, blacklisted; -+ unsigned u64s; -+ int ret, retry_read = 0, write = READ; -+ -+ iter = mempool_alloc(&c->fill_iter, GFP_NOIO); -+ sort_iter_init(iter, b); -+ iter->size = (btree_blocks(c) + 1) * 2; -+ -+ if (bch2_meta_read_fault("btree")) 
-+ btree_err(BTREE_ERR_MUST_RETRY, c, b, NULL, -+ "dynamic fault"); -+ -+ btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c), -+ BTREE_ERR_MUST_RETRY, c, b, NULL, -+ "bad magic"); -+ -+ btree_err_on(!b->data->keys.seq, -+ BTREE_ERR_MUST_RETRY, c, b, NULL, -+ "bad btree header"); -+ -+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { -+ struct bch_btree_ptr_v2 *bp = -+ &bkey_i_to_btree_ptr_v2(&b->key)->v; -+ -+ btree_err_on(b->data->keys.seq != bp->seq, -+ BTREE_ERR_MUST_RETRY, c, b, NULL, -+ "got wrong btree node (seq %llx want %llx)", -+ b->data->keys.seq, bp->seq); -+ } -+ -+ while (b->written < c->opts.btree_node_size) { -+ unsigned sectors, whiteout_u64s = 0; -+ struct nonce nonce; -+ struct bch_csum csum; -+ bool first = !b->written; -+ -+ if (!b->written) { -+ i = &b->data->keys; -+ -+ btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), -+ BTREE_ERR_WANT_RETRY, c, b, i, -+ "unknown checksum type"); -+ -+ nonce = btree_nonce(i, b->written << 9); -+ csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); -+ -+ btree_err_on(bch2_crc_cmp(csum, b->data->csum), -+ BTREE_ERR_WANT_RETRY, c, b, i, -+ "invalid checksum"); -+ -+ bset_encrypt(c, i, b->written << 9); -+ -+ if (btree_node_is_extents(b) && -+ !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) { -+ set_btree_node_old_extent_overwrite(b); -+ set_btree_node_need_rewrite(b); -+ } -+ -+ sectors = vstruct_sectors(b->data, c->block_bits); -+ } else { -+ bne = write_block(b); -+ i = &bne->keys; -+ -+ if (i->seq != b->data->keys.seq) -+ break; -+ -+ btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), -+ BTREE_ERR_WANT_RETRY, c, b, i, -+ "unknown checksum type"); -+ -+ nonce = btree_nonce(i, b->written << 9); -+ csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); -+ -+ btree_err_on(bch2_crc_cmp(csum, bne->csum), -+ BTREE_ERR_WANT_RETRY, c, b, i, -+ "invalid checksum"); -+ -+ bset_encrypt(c, i, b->written << 9); -+ -+ sectors = vstruct_sectors(bne, c->block_bits); -+ } -+ -+ ret = 
validate_bset(c, b, i, sectors, -+ READ, have_retry); -+ if (ret) -+ goto fsck_err; -+ -+ if (!b->written) -+ btree_node_set_format(b, b->data->format); -+ -+ ret = validate_bset_keys(c, b, i, &whiteout_u64s, -+ READ, have_retry); -+ if (ret) -+ goto fsck_err; -+ -+ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); -+ -+ b->written += sectors; -+ -+ blacklisted = bch2_journal_seq_is_blacklisted(c, -+ le64_to_cpu(i->journal_seq), -+ true); -+ -+ btree_err_on(blacklisted && first, -+ BTREE_ERR_FIXABLE, c, b, i, -+ "first btree node bset has blacklisted journal seq"); -+ if (blacklisted && !first) -+ continue; -+ -+ sort_iter_add(iter, i->start, -+ vstruct_idx(i, whiteout_u64s)); -+ -+ sort_iter_add(iter, -+ vstruct_idx(i, whiteout_u64s), -+ vstruct_last(i)); -+ } -+ -+ for (bne = write_block(b); -+ bset_byte_offset(b, bne) < btree_bytes(c); -+ bne = (void *) bne + block_bytes(c)) -+ btree_err_on(bne->keys.seq == b->data->keys.seq, -+ BTREE_ERR_WANT_RETRY, c, b, NULL, -+ "found bset signature after last bset"); -+ -+ sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool); -+ sorted->keys.u64s = 0; -+ -+ set_btree_bset(b, b->set, &b->data->keys); -+ -+ b->nr = (btree_node_old_extent_overwrite(b) -+ ? 
bch2_extent_sort_fix_overlapping -+ : bch2_key_sort_fix_overlapping)(c, &sorted->keys, iter); -+ -+ u64s = le16_to_cpu(sorted->keys.u64s); -+ *sorted = *b->data; -+ sorted->keys.u64s = cpu_to_le16(u64s); -+ swap(sorted, b->data); -+ set_btree_bset(b, b->set, &b->data->keys); -+ b->nsets = 1; -+ -+ BUG_ON(b->nr.live_u64s != u64s); -+ -+ btree_bounce_free(c, btree_bytes(c), used_mempool, sorted); -+ -+ i = &b->data->keys; -+ for (k = i->start; k != vstruct_last(i);) { -+ struct bkey tmp; -+ struct bkey_s u = __bkey_disassemble(b, k, &tmp); -+ const char *invalid = bch2_bkey_val_invalid(c, u.s_c); -+ -+ if (invalid || -+ (inject_invalid_keys(c) && -+ !bversion_cmp(u.k->version, MAX_VERSION))) { -+ char buf[160]; -+ -+ bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); -+ btree_err(BTREE_ERR_FIXABLE, c, b, i, -+ "invalid bkey %s: %s", buf, invalid); -+ -+ btree_keys_account_key_drop(&b->nr, 0, k); -+ -+ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); -+ memmove_u64s_down(k, bkey_next(k), -+ (u64 *) vstruct_end(i) - (u64 *) k); -+ set_btree_bset_end(b, b->set); -+ continue; -+ } -+ -+ if (u.k->type == KEY_TYPE_btree_ptr_v2) { -+ struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u); -+ -+ bp.v->mem_ptr = 0; -+ } -+ -+ k = bkey_next_skip_noops(k, vstruct_last(i)); -+ } -+ -+ bch2_bset_build_aux_tree(b, b->set, false); -+ -+ set_needs_whiteout(btree_bset_first(b), true); -+ -+ btree_node_reset_sib_u64s(b); -+ -+ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ -+ if (ca->mi.state != BCH_MEMBER_STATE_RW) -+ set_btree_node_need_rewrite(b); -+ } -+out: -+ mempool_free(iter, &c->fill_iter); -+ return retry_read; -+fsck_err: -+ if (ret == BTREE_RETRY_READ) { -+ retry_read = 1; -+ } else { -+ bch2_inconsistent_error(c); -+ set_btree_node_read_error(b); -+ } -+ goto out; -+} -+ -+static void btree_node_read_work(struct work_struct *work) -+{ -+ struct btree_read_bio *rb = -+ container_of(work, struct 
btree_read_bio, work); -+ struct bch_fs *c = rb->c; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); -+ struct btree *b = rb->bio.bi_private; -+ struct bio *bio = &rb->bio; -+ struct bch_io_failures failed = { .nr = 0 }; -+ bool can_retry; -+ -+ goto start; -+ while (1) { -+ bch_info(c, "retrying read"); -+ ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); -+ rb->have_ioref = bch2_dev_get_ioref(ca, READ); -+ bio_reset(bio); -+ bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META; -+ bio->bi_iter.bi_sector = rb->pick.ptr.offset; -+ bio->bi_iter.bi_size = btree_bytes(c); -+ -+ if (rb->have_ioref) { -+ bio_set_dev(bio, ca->disk_sb.bdev); -+ submit_bio_wait(bio); -+ } else { -+ bio->bi_status = BLK_STS_REMOVED; -+ } -+start: -+ bch2_dev_io_err_on(bio->bi_status, ca, "btree read: %s", -+ bch2_blk_status_to_str(bio->bi_status)); -+ if (rb->have_ioref) -+ percpu_ref_put(&ca->io_ref); -+ rb->have_ioref = false; -+ -+ bch2_mark_io_failure(&failed, &rb->pick); -+ -+ can_retry = bch2_bkey_pick_read_device(c, -+ bkey_i_to_s_c(&b->key), -+ &failed, &rb->pick) > 0; -+ -+ if (!bio->bi_status && -+ !bch2_btree_node_read_done(c, b, can_retry)) -+ break; -+ -+ if (!can_retry) { -+ set_btree_node_read_error(b); -+ break; -+ } -+ } -+ -+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], -+ rb->start_time); -+ bio_put(&rb->bio); -+ clear_btree_node_read_in_flight(b); -+ wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); -+} -+ -+static void btree_node_read_endio(struct bio *bio) -+{ -+ struct btree_read_bio *rb = -+ container_of(bio, struct btree_read_bio, bio); -+ struct bch_fs *c = rb->c; -+ -+ if (rb->have_ioref) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); -+ bch2_latency_acct(ca, rb->start_time, READ); -+ } -+ -+ queue_work(system_unbound_wq, &rb->work); -+} -+ -+void bch2_btree_node_read(struct bch_fs *c, struct btree *b, -+ bool sync) -+{ -+ struct extent_ptr_decoded pick; -+ struct btree_read_bio *rb; -+ struct bch_dev *ca; -+ struct bio 
*bio; -+ int ret; -+ -+ trace_btree_read(c, b); -+ -+ ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), -+ NULL, &pick); -+ if (bch2_fs_fatal_err_on(ret <= 0, c, -+ "btree node read error: no device to read from")) { -+ set_btree_node_read_error(b); -+ return; -+ } -+ -+ ca = bch_dev_bkey_exists(c, pick.ptr.dev); -+ -+ bio = bio_alloc_bioset(GFP_NOIO, buf_pages(b->data, -+ btree_bytes(c)), -+ &c->btree_bio); -+ rb = container_of(bio, struct btree_read_bio, bio); -+ rb->c = c; -+ rb->start_time = local_clock(); -+ rb->have_ioref = bch2_dev_get_ioref(ca, READ); -+ rb->pick = pick; -+ INIT_WORK(&rb->work, btree_node_read_work); -+ bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META; -+ bio->bi_iter.bi_sector = pick.ptr.offset; -+ bio->bi_end_io = btree_node_read_endio; -+ bio->bi_private = b; -+ bch2_bio_map(bio, b->data, btree_bytes(c)); -+ -+ set_btree_node_read_in_flight(b); -+ -+ if (rb->have_ioref) { -+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], -+ bio_sectors(bio)); -+ bio_set_dev(bio, ca->disk_sb.bdev); -+ -+ if (sync) { -+ submit_bio_wait(bio); -+ -+ bio->bi_private = b; -+ btree_node_read_work(&rb->work); -+ } else { -+ submit_bio(bio); -+ } -+ } else { -+ bio->bi_status = BLK_STS_REMOVED; -+ -+ if (sync) -+ btree_node_read_work(&rb->work); -+ else -+ queue_work(system_unbound_wq, &rb->work); -+ -+ } -+} -+ -+int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, -+ const struct bkey_i *k, unsigned level) -+{ -+ struct closure cl; -+ struct btree *b; -+ int ret; -+ -+ closure_init_stack(&cl); -+ -+ do { -+ ret = bch2_btree_cache_cannibalize_lock(c, &cl); -+ closure_sync(&cl); -+ } while (ret); -+ -+ b = bch2_btree_node_mem_alloc(c); -+ bch2_btree_cache_cannibalize_unlock(c); -+ -+ BUG_ON(IS_ERR(b)); -+ -+ bkey_copy(&b->key, k); -+ BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id)); -+ -+ bch2_btree_node_read(c, b, true); -+ -+ if (btree_node_read_error(b)) { -+ bch2_btree_node_hash_remove(&c->btree_cache, b); -+ -+ 
mutex_lock(&c->btree_cache.lock); -+ list_move(&b->list, &c->btree_cache.freeable); -+ mutex_unlock(&c->btree_cache.lock); -+ -+ ret = -EIO; -+ goto err; -+ } -+ -+ bch2_btree_set_root_for_read(c, b); -+err: -+ six_unlock_write(&b->c.lock); -+ six_unlock_intent(&b->c.lock); -+ -+ return ret; -+} -+ -+void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, -+ struct btree_write *w) -+{ -+ unsigned long old, new, v = READ_ONCE(b->will_make_reachable); -+ -+ do { -+ old = new = v; -+ if (!(old & 1)) -+ break; -+ -+ new &= ~1UL; -+ } while ((v = cmpxchg(&b->will_make_reachable, old, new)) != old); -+ -+ if (old & 1) -+ closure_put(&((struct btree_update *) new)->cl); -+ -+ bch2_journal_pin_drop(&c->journal, &w->journal); -+} -+ -+static void btree_node_write_done(struct bch_fs *c, struct btree *b) -+{ -+ struct btree_write *w = btree_prev_write(b); -+ -+ bch2_btree_complete_write(c, b, w); -+ btree_node_io_unlock(b); -+} -+ -+static void bch2_btree_node_write_error(struct bch_fs *c, -+ struct btree_write_bio *wbio) -+{ -+ struct btree *b = wbio->wbio.bio.bi_private; -+ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; -+ struct bch_extent_ptr *ptr; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_trans_get_node_iter(&trans, b->c.btree_id, b->key.k.p, -+ BTREE_MAX_DEPTH, b->c.level, 0); -+retry: -+ ret = bch2_btree_iter_traverse(iter); -+ if (ret) -+ goto err; -+ -+ /* has node been freed? 
*/ -+ if (iter->l[b->c.level].b != b) { -+ /* node has been freed: */ -+ BUG_ON(!btree_node_dying(b)); -+ goto out; -+ } -+ -+ BUG_ON(!btree_node_hashed(b)); -+ -+ bkey_copy(&tmp.k, &b->key); -+ -+ bch2_bkey_drop_ptrs(bkey_i_to_s(&tmp.k), ptr, -+ bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); -+ -+ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&tmp.k))) -+ goto err; -+ -+ ret = bch2_btree_node_update_key(c, iter, b, &tmp.k); -+ if (ret == -EINTR) -+ goto retry; -+ if (ret) -+ goto err; -+out: -+ bch2_trans_exit(&trans); -+ bio_put(&wbio->wbio.bio); -+ btree_node_write_done(c, b); -+ return; -+err: -+ set_btree_node_noevict(b); -+ bch2_fs_fatal_error(c, "fatal error writing btree node"); -+ goto out; -+} -+ -+void bch2_btree_write_error_work(struct work_struct *work) -+{ -+ struct bch_fs *c = container_of(work, struct bch_fs, -+ btree_write_error_work); -+ struct bio *bio; -+ -+ while (1) { -+ spin_lock_irq(&c->btree_write_error_lock); -+ bio = bio_list_pop(&c->btree_write_error_list); -+ spin_unlock_irq(&c->btree_write_error_lock); -+ -+ if (!bio) -+ break; -+ -+ bch2_btree_node_write_error(c, -+ container_of(bio, struct btree_write_bio, wbio.bio)); -+ } -+} -+ -+static void btree_node_write_work(struct work_struct *work) -+{ -+ struct btree_write_bio *wbio = -+ container_of(work, struct btree_write_bio, work); -+ struct bch_fs *c = wbio->wbio.c; -+ struct btree *b = wbio->wbio.bio.bi_private; -+ -+ btree_bounce_free(c, -+ wbio->bytes, -+ wbio->wbio.used_mempool, -+ wbio->data); -+ -+ if (wbio->wbio.failed.nr) { -+ unsigned long flags; -+ -+ spin_lock_irqsave(&c->btree_write_error_lock, flags); -+ bio_list_add(&c->btree_write_error_list, &wbio->wbio.bio); -+ spin_unlock_irqrestore(&c->btree_write_error_lock, flags); -+ -+ queue_work(c->wq, &c->btree_write_error_work); -+ return; -+ } -+ -+ bio_put(&wbio->wbio.bio); -+ btree_node_write_done(c, b); -+} -+ -+static void btree_node_write_endio(struct bio *bio) -+{ -+ struct bch_write_bio *wbio = to_wbio(bio); -+ struct 
bch_write_bio *parent = wbio->split ? wbio->parent : NULL; -+ struct bch_write_bio *orig = parent ?: wbio; -+ struct bch_fs *c = wbio->c; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); -+ unsigned long flags; -+ -+ if (wbio->have_ioref) -+ bch2_latency_acct(ca, wbio->submit_time, WRITE); -+ -+ if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write: %s", -+ bch2_blk_status_to_str(bio->bi_status)) || -+ bch2_meta_write_fault("btree")) { -+ spin_lock_irqsave(&c->btree_write_error_lock, flags); -+ bch2_dev_list_add_dev(&orig->failed, wbio->dev); -+ spin_unlock_irqrestore(&c->btree_write_error_lock, flags); -+ } -+ -+ if (wbio->have_ioref) -+ percpu_ref_put(&ca->io_ref); -+ -+ if (parent) { -+ bio_put(bio); -+ bio_endio(&parent->bio); -+ } else { -+ struct btree_write_bio *wb = -+ container_of(orig, struct btree_write_bio, wbio); -+ -+ INIT_WORK(&wb->work, btree_node_write_work); -+ queue_work(system_unbound_wq, &wb->work); -+ } -+} -+ -+static int validate_bset_for_write(struct bch_fs *c, struct btree *b, -+ struct bset *i, unsigned sectors) -+{ -+ unsigned whiteout_u64s = 0; -+ int ret; -+ -+ if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_BTREE)) -+ return -1; -+ -+ ret = validate_bset(c, b, i, sectors, WRITE, false) ?: -+ validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false); -+ if (ret) -+ bch2_inconsistent_error(c); -+ -+ return ret; -+} -+ -+void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, -+ enum six_lock_type lock_type_held) -+{ -+ struct btree_write_bio *wbio; -+ struct bset_tree *t; -+ struct bset *i; -+ struct btree_node *bn = NULL; -+ struct btree_node_entry *bne = NULL; -+ BKEY_PADDED(key) k; -+ struct bch_extent_ptr *ptr; -+ struct sort_iter sort_iter; -+ struct nonce nonce; -+ unsigned bytes_to_write, sectors_to_write, bytes, u64s; -+ u64 seq = 0; -+ bool used_mempool; -+ unsigned long old, new; -+ bool validate_before_checksum = false; -+ void *data; -+ -+ if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) -+ 
return; -+ -+ /* -+ * We may only have a read lock on the btree node - the dirty bit is our -+ * "lock" against racing with other threads that may be trying to start -+ * a write, we do a write iff we clear the dirty bit. Since setting the -+ * dirty bit requires a write lock, we can't race with other threads -+ * redirtying it: -+ */ -+ do { -+ old = new = READ_ONCE(b->flags); -+ -+ if (!(old & (1 << BTREE_NODE_dirty))) -+ return; -+ -+ if (!btree_node_may_write(b)) -+ return; -+ -+ if (old & (1 << BTREE_NODE_write_in_flight)) { -+ btree_node_wait_on_io(b); -+ continue; -+ } -+ -+ new &= ~(1 << BTREE_NODE_dirty); -+ new &= ~(1 << BTREE_NODE_need_write); -+ new |= (1 << BTREE_NODE_write_in_flight); -+ new |= (1 << BTREE_NODE_just_written); -+ new ^= (1 << BTREE_NODE_write_idx); -+ } while (cmpxchg_acquire(&b->flags, old, new) != old); -+ -+ BUG_ON(btree_node_fake(b)); -+ BUG_ON((b->will_make_reachable != 0) != !b->written); -+ -+ BUG_ON(b->written >= c->opts.btree_node_size); -+ BUG_ON(b->written & (c->opts.block_size - 1)); -+ BUG_ON(bset_written(b, btree_bset_last(b))); -+ BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c)); -+ BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format))); -+ -+ bch2_sort_whiteouts(c, b); -+ -+ sort_iter_init(&sort_iter, b); -+ -+ bytes = !b->written -+ ? 
sizeof(struct btree_node) -+ : sizeof(struct btree_node_entry); -+ -+ bytes += b->whiteout_u64s * sizeof(u64); -+ -+ for_each_bset(b, t) { -+ i = bset(b, t); -+ -+ if (bset_written(b, i)) -+ continue; -+ -+ bytes += le16_to_cpu(i->u64s) * sizeof(u64); -+ sort_iter_add(&sort_iter, -+ btree_bkey_first(b, t), -+ btree_bkey_last(b, t)); -+ seq = max(seq, le64_to_cpu(i->journal_seq)); -+ } -+ -+ data = btree_bounce_alloc(c, bytes, &used_mempool); -+ -+ if (!b->written) { -+ bn = data; -+ *bn = *b->data; -+ i = &bn->keys; -+ } else { -+ bne = data; -+ bne->keys = b->data->keys; -+ i = &bne->keys; -+ } -+ -+ i->journal_seq = cpu_to_le64(seq); -+ i->u64s = 0; -+ -+ if (!btree_node_old_extent_overwrite(b)) { -+ sort_iter_add(&sort_iter, -+ unwritten_whiteouts_start(c, b), -+ unwritten_whiteouts_end(c, b)); -+ SET_BSET_SEPARATE_WHITEOUTS(i, false); -+ } else { -+ memcpy_u64s(i->start, -+ unwritten_whiteouts_start(c, b), -+ b->whiteout_u64s); -+ i->u64s = cpu_to_le16(b->whiteout_u64s); -+ SET_BSET_SEPARATE_WHITEOUTS(i, true); -+ } -+ -+ b->whiteout_u64s = 0; -+ -+ u64s = btree_node_old_extent_overwrite(b) -+ ? bch2_sort_extents(vstruct_last(i), &sort_iter, false) -+ : bch2_sort_keys(i->start, &sort_iter, false); -+ le16_add_cpu(&i->u64s, u64s); -+ -+ set_needs_whiteout(i, false); -+ -+ /* do we have data to write? */ -+ if (b->written && !i->u64s) -+ goto nowrite; -+ -+ bytes_to_write = vstruct_end(i) - data; -+ sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9; -+ -+ memset(data + bytes_to_write, 0, -+ (sectors_to_write << 9) - bytes_to_write); -+ -+ BUG_ON(b->written + sectors_to_write > c->opts.btree_node_size); -+ BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN); -+ BUG_ON(i->seq != b->data->keys.seq); -+ -+ i->version = c->sb.version < bcachefs_metadata_version_new_versioning -+ ? 
cpu_to_le16(BCH_BSET_VERSION_OLD) -+ : cpu_to_le16(c->sb.version); -+ SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c)); -+ -+ if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i))) -+ validate_before_checksum = true; -+ -+ /* validate_bset will be modifying: */ -+ if (le16_to_cpu(i->version) < bcachefs_metadata_version_max) -+ validate_before_checksum = true; -+ -+ /* if we're going to be encrypting, check metadata validity first: */ -+ if (validate_before_checksum && -+ validate_bset_for_write(c, b, i, sectors_to_write)) -+ goto err; -+ -+ bset_encrypt(c, i, b->written << 9); -+ -+ nonce = btree_nonce(i, b->written << 9); -+ -+ if (bn) -+ bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn); -+ else -+ bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); -+ -+ /* if we're not encrypting, check metadata after checksumming: */ -+ if (!validate_before_checksum && -+ validate_bset_for_write(c, b, i, sectors_to_write)) -+ goto err; -+ -+ /* -+ * We handle btree write errors by immediately halting the journal - -+ * after we've done that, we can't issue any subsequent btree writes -+ * because they might have pointers to new nodes that failed to write. 
-+ * -+ * Furthermore, there's no point in doing any more btree writes because -+ * with the journal stopped, we're never going to update the journal to -+ * reflect that those writes were done and the data flushed from the -+ * journal: -+ * -+ * Also on journal error, the pending write may have updates that were -+ * never journalled (interior nodes, see btree_update_nodes_written()) - -+ * it's critical that we don't do the write in that case otherwise we -+ * will have updates visible that weren't in the journal: -+ * -+ * Make sure to update b->written so bch2_btree_init_next() doesn't -+ * break: -+ */ -+ if (bch2_journal_error(&c->journal) || -+ c->opts.nochanges) -+ goto err; -+ -+ trace_btree_write(b, bytes_to_write, sectors_to_write); -+ -+ wbio = container_of(bio_alloc_bioset(GFP_NOIO, -+ buf_pages(data, sectors_to_write << 9), -+ &c->btree_bio), -+ struct btree_write_bio, wbio.bio); -+ wbio_init(&wbio->wbio.bio); -+ wbio->data = data; -+ wbio->bytes = bytes; -+ wbio->wbio.used_mempool = used_mempool; -+ wbio->wbio.bio.bi_opf = REQ_OP_WRITE|REQ_META; -+ wbio->wbio.bio.bi_end_io = btree_node_write_endio; -+ wbio->wbio.bio.bi_private = b; -+ -+ bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9); -+ -+ /* -+ * If we're appending to a leaf node, we don't technically need FUA - -+ * this write just needs to be persisted before the next journal write, -+ * which will be marked FLUSH|FUA. -+ * -+ * Similarly if we're writing a new btree root - the pointer is going to -+ * be in the next journal entry. -+ * -+ * But if we're writing a new btree node (that isn't a root) or -+ * appending to a non leaf btree node, we need either FUA or a flush -+ * when we write the parent with the new pointer. FUA is cheaper than a -+ * flush, and writes appending to leaf nodes aren't blocking anything so -+ * just make all btree node writes FUA to keep things sane. 
-+ */ -+ -+ bkey_copy(&k.key, &b->key); -+ -+ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&k.key)), ptr) -+ ptr->offset += b->written; -+ -+ b->written += sectors_to_write; -+ -+ /* XXX: submitting IO with btree locks held: */ -+ bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_btree, &k.key); -+ return; -+err: -+ set_btree_node_noevict(b); -+ b->written += sectors_to_write; -+nowrite: -+ btree_bounce_free(c, bytes, used_mempool, data); -+ btree_node_write_done(c, b); -+} -+ -+/* -+ * Work that must be done with write lock held: -+ */ -+bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) -+{ -+ bool invalidated_iter = false; -+ struct btree_node_entry *bne; -+ struct bset_tree *t; -+ -+ if (!btree_node_just_written(b)) -+ return false; -+ -+ BUG_ON(b->whiteout_u64s); -+ -+ clear_btree_node_just_written(b); -+ -+ /* -+ * Note: immediately after write, bset_written() doesn't work - the -+ * amount of data we had to write after compaction might have been -+ * smaller than the offset of the last bset. 
-+ * -+ * However, we know that all bsets have been written here, as long as -+ * we're still holding the write lock: -+ */ -+ -+ /* -+ * XXX: decide if we really want to unconditionally sort down to a -+ * single bset: -+ */ -+ if (b->nsets > 1) { -+ btree_node_sort(c, b, NULL, 0, b->nsets, true); -+ invalidated_iter = true; -+ } else { -+ invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL); -+ } -+ -+ for_each_bset(b, t) -+ set_needs_whiteout(bset(b, t), true); -+ -+ bch2_btree_verify(c, b); -+ -+ /* -+ * If later we don't unconditionally sort down to a single bset, we have -+ * to ensure this is still true: -+ */ -+ BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b)); -+ -+ bne = want_new_bset(c, b); -+ if (bne) -+ bch2_bset_init_next(c, b, bne); -+ -+ bch2_btree_build_aux_trees(b); -+ -+ return invalidated_iter; -+} -+ -+/* -+ * Use this one if the node is intent locked: -+ */ -+void bch2_btree_node_write(struct bch_fs *c, struct btree *b, -+ enum six_lock_type lock_type_held) -+{ -+ BUG_ON(lock_type_held == SIX_LOCK_write); -+ -+ if (lock_type_held == SIX_LOCK_intent || -+ six_lock_tryupgrade(&b->c.lock)) { -+ __bch2_btree_node_write(c, b, SIX_LOCK_intent); -+ -+ /* don't cycle lock unnecessarily: */ -+ if (btree_node_just_written(b) && -+ six_trylock_write(&b->c.lock)) { -+ bch2_btree_post_write_cleanup(c, b); -+ six_unlock_write(&b->c.lock); -+ } -+ -+ if (lock_type_held == SIX_LOCK_read) -+ six_lock_downgrade(&b->c.lock); -+ } else { -+ __bch2_btree_node_write(c, b, SIX_LOCK_read); -+ } -+} -+ -+static void __bch2_btree_flush_all(struct bch_fs *c, unsigned flag) -+{ -+ struct bucket_table *tbl; -+ struct rhash_head *pos; -+ struct btree *b; -+ unsigned i; -+restart: -+ rcu_read_lock(); -+ for_each_cached_btree(b, c, tbl, i, pos) -+ if (test_bit(flag, &b->flags)) { -+ rcu_read_unlock(); -+ wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE); -+ goto restart; -+ -+ } -+ rcu_read_unlock(); -+} -+ -+void 
bch2_btree_flush_all_reads(struct bch_fs *c) -+{ -+ __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight); -+} -+ -+void bch2_btree_flush_all_writes(struct bch_fs *c) -+{ -+ __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); -+} -+ -+void bch2_btree_verify_flushed(struct bch_fs *c) -+{ -+ struct bucket_table *tbl; -+ struct rhash_head *pos; -+ struct btree *b; -+ unsigned i; -+ -+ rcu_read_lock(); -+ for_each_cached_btree(b, c, tbl, i, pos) { -+ unsigned long flags = READ_ONCE(b->flags); -+ -+ BUG_ON((flags & (1 << BTREE_NODE_dirty)) || -+ (flags & (1 << BTREE_NODE_write_in_flight))); -+ } -+ rcu_read_unlock(); -+} -+ -+void bch2_dirty_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+ struct bucket_table *tbl; -+ struct rhash_head *pos; -+ struct btree *b; -+ unsigned i; -+ -+ rcu_read_lock(); -+ for_each_cached_btree(b, c, tbl, i, pos) { -+ unsigned long flags = READ_ONCE(b->flags); -+ -+ if (!(flags & (1 << BTREE_NODE_dirty))) -+ continue; -+ -+ pr_buf(out, "%p d %u n %u l %u w %u b %u r %u:%lu\n", -+ b, -+ (flags & (1 << BTREE_NODE_dirty)) != 0, -+ (flags & (1 << BTREE_NODE_need_write)) != 0, -+ b->c.level, -+ b->written, -+ !list_empty_careful(&b->write_blocked), -+ b->will_make_reachable != 0, -+ b->will_make_reachable & 1); -+ } -+ rcu_read_unlock(); -+} -diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h -new file mode 100644 -index 000000000000..626d0f071b70 ---- /dev/null -+++ b/fs/bcachefs/btree_io.h -@@ -0,0 +1,220 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BTREE_IO_H -+#define _BCACHEFS_BTREE_IO_H -+ -+#include "bkey_methods.h" -+#include "bset.h" -+#include "btree_locking.h" -+#include "checksum.h" -+#include "extents.h" -+#include "io_types.h" -+ -+struct bch_fs; -+struct btree_write; -+struct btree; -+struct btree_iter; -+ -+struct btree_read_bio { -+ struct bch_fs *c; -+ u64 start_time; -+ unsigned have_ioref:1; -+ struct extent_ptr_decoded pick; -+ struct work_struct work; -+ struct bio bio; -+}; 
-+ -+struct btree_write_bio { -+ struct work_struct work; -+ void *data; -+ unsigned bytes; -+ struct bch_write_bio wbio; -+}; -+ -+static inline void btree_node_io_unlock(struct btree *b) -+{ -+ EBUG_ON(!btree_node_write_in_flight(b)); -+ clear_btree_node_write_in_flight(b); -+ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); -+} -+ -+static inline void btree_node_io_lock(struct btree *b) -+{ -+ wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight, -+ TASK_UNINTERRUPTIBLE); -+} -+ -+static inline void btree_node_wait_on_io(struct btree *b) -+{ -+ wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, -+ TASK_UNINTERRUPTIBLE); -+} -+ -+static inline bool btree_node_may_write(struct btree *b) -+{ -+ return list_empty_careful(&b->write_blocked) && -+ (!b->written || !b->will_make_reachable); -+} -+ -+enum compact_mode { -+ COMPACT_LAZY, -+ COMPACT_ALL, -+}; -+ -+bool bch2_compact_whiteouts(struct bch_fs *, struct btree *, -+ enum compact_mode); -+ -+static inline bool should_compact_bset_lazy(struct btree *b, -+ struct bset_tree *t) -+{ -+ unsigned total_u64s = bset_u64s(t); -+ unsigned dead_u64s = bset_dead_u64s(b, t); -+ -+ return dead_u64s > 64 && dead_u64s * 3 > total_u64s; -+} -+ -+static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b) -+{ -+ struct bset_tree *t; -+ -+ for_each_bset(b, t) -+ if (should_compact_bset_lazy(b, t)) -+ return bch2_compact_whiteouts(c, b, COMPACT_LAZY); -+ -+ return false; -+} -+ -+static inline struct nonce btree_nonce(struct bset *i, unsigned offset) -+{ -+ return (struct nonce) {{ -+ [0] = cpu_to_le32(offset), -+ [1] = ((__le32 *) &i->seq)[0], -+ [2] = ((__le32 *) &i->seq)[1], -+ [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE, -+ }}; -+} -+ -+static inline void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) -+{ -+ struct nonce nonce = btree_nonce(i, offset); -+ -+ if (!offset) { -+ struct btree_node *bn = container_of(i, struct btree_node, keys); -+ unsigned bytes = (void *) 
&bn->keys - (void *) &bn->flags; -+ -+ bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags, -+ bytes); -+ -+ nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE)); -+ } -+ -+ bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, -+ vstruct_end(i) - (void *) i->_data); -+} -+ -+void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *); -+ -+void bch2_btree_build_aux_trees(struct btree *); -+void bch2_btree_init_next(struct bch_fs *, struct btree *, -+ struct btree_iter *); -+ -+int bch2_btree_node_read_done(struct bch_fs *, struct btree *, bool); -+void bch2_btree_node_read(struct bch_fs *, struct btree *, bool); -+int bch2_btree_root_read(struct bch_fs *, enum btree_id, -+ const struct bkey_i *, unsigned); -+ -+void bch2_btree_complete_write(struct bch_fs *, struct btree *, -+ struct btree_write *); -+void bch2_btree_write_error_work(struct work_struct *); -+ -+void __bch2_btree_node_write(struct bch_fs *, struct btree *, -+ enum six_lock_type); -+bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); -+ -+void bch2_btree_node_write(struct bch_fs *, struct btree *, -+ enum six_lock_type); -+ -+static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b, -+ enum six_lock_type lock_held) -+{ -+ while (b->written && -+ btree_node_need_write(b) && -+ btree_node_may_write(b)) { -+ if (!btree_node_write_in_flight(b)) { -+ bch2_btree_node_write(c, b, lock_held); -+ break; -+ } -+ -+ six_unlock_type(&b->c.lock, lock_held); -+ btree_node_wait_on_io(b); -+ btree_node_lock_type(c, b, lock_held); -+ } -+} -+ -+#define bch2_btree_node_write_cond(_c, _b, cond) \ -+do { \ -+ unsigned long old, new, v = READ_ONCE((_b)->flags); \ -+ \ -+ do { \ -+ old = new = v; \ -+ \ -+ if (!(old & (1 << BTREE_NODE_dirty)) || !(cond)) \ -+ break; \ -+ \ -+ new |= (1 << BTREE_NODE_need_write); \ -+ } while ((v = cmpxchg(&(_b)->flags, old, new)) != old); \ -+ \ -+ btree_node_write_if_need(_c, _b, SIX_LOCK_read); \ -+} while (0) -+ -+void 
bch2_btree_flush_all_reads(struct bch_fs *); -+void bch2_btree_flush_all_writes(struct bch_fs *); -+void bch2_btree_verify_flushed(struct bch_fs *); -+void bch2_dirty_btree_nodes_to_text(struct printbuf *, struct bch_fs *); -+ -+static inline void compat_bformat(unsigned level, enum btree_id btree_id, -+ unsigned version, unsigned big_endian, -+ int write, struct bkey_format *f) -+{ -+ if (version < bcachefs_metadata_version_inode_btree_change && -+ btree_id == BTREE_ID_INODES) { -+ swap(f->bits_per_field[BKEY_FIELD_INODE], -+ f->bits_per_field[BKEY_FIELD_OFFSET]); -+ swap(f->field_offset[BKEY_FIELD_INODE], -+ f->field_offset[BKEY_FIELD_OFFSET]); -+ } -+} -+ -+static inline void compat_bpos(unsigned level, enum btree_id btree_id, -+ unsigned version, unsigned big_endian, -+ int write, struct bpos *p) -+{ -+ if (big_endian != CPU_BIG_ENDIAN) -+ bch2_bpos_swab(p); -+ -+ if (version < bcachefs_metadata_version_inode_btree_change && -+ btree_id == BTREE_ID_INODES) -+ swap(p->inode, p->offset); -+} -+ -+static inline void compat_btree_node(unsigned level, enum btree_id btree_id, -+ unsigned version, unsigned big_endian, -+ int write, -+ struct btree_node *bn) -+{ -+ if (version < bcachefs_metadata_version_inode_btree_change && -+ btree_node_type_is_extents(btree_id) && -+ bkey_cmp(bn->min_key, POS_MIN) && -+ write) -+ bn->min_key = bkey_predecessor(bn->min_key); -+ -+ compat_bpos(level, btree_id, version, big_endian, write, &bn->min_key); -+ compat_bpos(level, btree_id, version, big_endian, write, &bn->max_key); -+ -+ if (version < bcachefs_metadata_version_inode_btree_change && -+ btree_node_type_is_extents(btree_id) && -+ bkey_cmp(bn->min_key, POS_MIN) && -+ !write) -+ bn->min_key = bkey_successor(bn->min_key); -+} -+ -+#endif /* _BCACHEFS_BTREE_IO_H */ -diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c -new file mode 100644 -index 000000000000..6fab76c3220c ---- /dev/null -+++ b/fs/bcachefs/btree_iter.c -@@ -0,0 +1,2445 @@ -+// SPDX-License-Identifier: 
GPL-2.0 -+ -+#include "bcachefs.h" -+#include "bkey_methods.h" -+#include "btree_cache.h" -+#include "btree_iter.h" -+#include "btree_key_cache.h" -+#include "btree_locking.h" -+#include "btree_update.h" -+#include "debug.h" -+#include "extents.h" -+#include "journal.h" -+ -+#include -+#include -+ -+static inline bool is_btree_node(struct btree_iter *iter, unsigned l) -+{ -+ return l < BTREE_MAX_DEPTH && -+ (unsigned long) iter->l[l].b >= 128; -+} -+ -+static inline struct bpos btree_iter_search_key(struct btree_iter *iter) -+{ -+ struct bpos pos = iter->pos; -+ -+ if ((iter->flags & BTREE_ITER_IS_EXTENTS) && -+ bkey_cmp(pos, POS_MAX)) -+ pos = bkey_successor(pos); -+ return pos; -+} -+ -+static inline bool btree_iter_pos_before_node(struct btree_iter *iter, -+ struct btree *b) -+{ -+ return bkey_cmp(btree_iter_search_key(iter), b->data->min_key) < 0; -+} -+ -+static inline bool btree_iter_pos_after_node(struct btree_iter *iter, -+ struct btree *b) -+{ -+ return bkey_cmp(b->key.k.p, btree_iter_search_key(iter)) < 0; -+} -+ -+static inline bool btree_iter_pos_in_node(struct btree_iter *iter, -+ struct btree *b) -+{ -+ return iter->btree_id == b->c.btree_id && -+ !btree_iter_pos_before_node(iter, b) && -+ !btree_iter_pos_after_node(iter, b); -+} -+ -+/* Btree node locking: */ -+ -+void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter) -+{ -+ bch2_btree_node_unlock_write_inlined(b, iter); -+} -+ -+void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) -+{ -+ struct btree_iter *linked; -+ unsigned readers = 0; -+ -+ EBUG_ON(!btree_node_intent_locked(iter, b->c.level)); -+ -+ trans_for_each_iter(iter->trans, linked) -+ if (linked->l[b->c.level].b == b && -+ btree_node_read_locked(linked, b->c.level)) -+ readers++; -+ -+ /* -+ * Must drop our read locks before calling six_lock_write() - -+ * six_unlock() won't do wakeups until the reader count -+ * goes to 0, and it's safe because we have the node intent -+ * locked: -+ */ -+ 
atomic64_sub(__SIX_VAL(read_lock, readers), -+ &b->c.lock.state.counter); -+ btree_node_lock_type(iter->trans->c, b, SIX_LOCK_write); -+ atomic64_add(__SIX_VAL(read_lock, readers), -+ &b->c.lock.state.counter); -+} -+ -+bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level) -+{ -+ struct btree *b = btree_iter_node(iter, level); -+ int want = __btree_lock_want(iter, level); -+ -+ if (!is_btree_node(iter, level)) -+ return false; -+ -+ if (race_fault()) -+ return false; -+ -+ if (six_relock_type(&b->c.lock, want, iter->l[level].lock_seq) || -+ (btree_node_lock_seq_matches(iter, b, level) && -+ btree_node_lock_increment(iter->trans, b, level, want))) { -+ mark_btree_node_locked(iter, level, want); -+ return true; -+ } else { -+ return false; -+ } -+} -+ -+static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level) -+{ -+ struct btree *b = iter->l[level].b; -+ -+ EBUG_ON(btree_lock_want(iter, level) != BTREE_NODE_INTENT_LOCKED); -+ -+ if (!is_btree_node(iter, level)) -+ return false; -+ -+ if (btree_node_intent_locked(iter, level)) -+ return true; -+ -+ if (race_fault()) -+ return false; -+ -+ if (btree_node_locked(iter, level) -+ ? six_lock_tryupgrade(&b->c.lock) -+ : six_relock_type(&b->c.lock, SIX_LOCK_intent, iter->l[level].lock_seq)) -+ goto success; -+ -+ if (btree_node_lock_seq_matches(iter, b, level) && -+ btree_node_lock_increment(iter->trans, b, level, BTREE_NODE_INTENT_LOCKED)) { -+ btree_node_unlock(iter, level); -+ goto success; -+ } -+ -+ return false; -+success: -+ mark_btree_node_intent_locked(iter, level); -+ return true; -+} -+ -+static inline bool btree_iter_get_locks(struct btree_iter *iter, -+ bool upgrade, bool trace) -+{ -+ unsigned l = iter->level; -+ int fail_idx = -1; -+ -+ do { -+ if (!btree_iter_node(iter, l)) -+ break; -+ -+ if (!(upgrade -+ ? bch2_btree_node_upgrade(iter, l) -+ : bch2_btree_node_relock(iter, l))) { -+ if (trace) -+ (upgrade -+ ? 
trace_node_upgrade_fail -+ : trace_node_relock_fail)(l, iter->l[l].lock_seq, -+ is_btree_node(iter, l) -+ ? 0 -+ : (unsigned long) iter->l[l].b, -+ is_btree_node(iter, l) -+ ? iter->l[l].b->c.lock.state.seq -+ : 0); -+ -+ fail_idx = l; -+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); -+ } -+ -+ l++; -+ } while (l < iter->locks_want); -+ -+ /* -+ * When we fail to get a lock, we have to ensure that any child nodes -+ * can't be relocked so bch2_btree_iter_traverse has to walk back up to -+ * the node that we failed to relock: -+ */ -+ while (fail_idx >= 0) { -+ btree_node_unlock(iter, fail_idx); -+ iter->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS; -+ --fail_idx; -+ } -+ -+ if (iter->uptodate == BTREE_ITER_NEED_RELOCK) -+ iter->uptodate = BTREE_ITER_NEED_PEEK; -+ -+ bch2_btree_trans_verify_locks(iter->trans); -+ -+ return iter->uptodate < BTREE_ITER_NEED_RELOCK; -+} -+ -+static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b, -+ enum btree_iter_type type) -+{ -+ return type != BTREE_ITER_CACHED -+ ? 
container_of(_b, struct btree, c)->key.k.p -+ : container_of(_b, struct bkey_cached, c)->key.pos; -+} -+ -+/* Slowpath: */ -+bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, -+ unsigned level, struct btree_iter *iter, -+ enum six_lock_type type, -+ six_lock_should_sleep_fn should_sleep_fn, -+ void *p) -+{ -+ struct btree_trans *trans = iter->trans; -+ struct btree_iter *linked; -+ u64 start_time = local_clock(); -+ bool ret = true; -+ -+ /* Check if it's safe to block: */ -+ trans_for_each_iter(trans, linked) { -+ if (!linked->nodes_locked) -+ continue; -+ -+ /* -+ * Can't block taking an intent lock if we have _any_ nodes read -+ * locked: -+ * -+ * - Our read lock blocks another thread with an intent lock on -+ * the same node from getting a write lock, and thus from -+ * dropping its intent lock -+ * -+ * - And the other thread may have multiple nodes intent locked: -+ * both the node we want to intent lock, and the node we -+ * already have read locked - deadlock: -+ */ -+ if (type == SIX_LOCK_intent && -+ linked->nodes_locked != linked->nodes_intent_locked) { -+ if (!(trans->nounlock)) { -+ linked->locks_want = max_t(unsigned, -+ linked->locks_want, -+ __fls(linked->nodes_locked) + 1); -+ if (!btree_iter_get_locks(linked, true, false)) -+ ret = false; -+ } else { -+ ret = false; -+ } -+ } -+ -+ /* -+ * Interior nodes must be locked before their descendants: if -+ * another iterator has possible descendants locked of the node -+ * we're about to lock, it must have the ancestors locked too: -+ */ -+ if (linked->btree_id == iter->btree_id && -+ level > __fls(linked->nodes_locked)) { -+ if (!(trans->nounlock)) { -+ linked->locks_want = -+ max(level + 1, max_t(unsigned, -+ linked->locks_want, -+ iter->locks_want)); -+ if (!btree_iter_get_locks(linked, true, false)) -+ ret = false; -+ } else { -+ ret = false; -+ } -+ } -+ -+ /* Must lock btree nodes in key order: */ -+ if ((cmp_int(iter->btree_id, linked->btree_id) ?: -+ -cmp_int(btree_iter_type(iter), 
btree_iter_type(linked))) < 0) -+ ret = false; -+ -+ if (iter->btree_id == linked->btree_id && -+ btree_node_locked(linked, level) && -+ bkey_cmp(pos, btree_node_pos((void *) linked->l[level].b, -+ btree_iter_type(linked))) <= 0) -+ ret = false; -+ -+ /* -+ * Recheck if this is a node we already have locked - since one -+ * of the get_locks() calls might've successfully -+ * upgraded/relocked it: -+ */ -+ if (linked->l[level].b == b && -+ btree_node_locked_type(linked, level) >= type) { -+ six_lock_increment(&b->c.lock, type); -+ return true; -+ } -+ } -+ -+ if (unlikely(!ret)) { -+ trace_trans_restart_would_deadlock(iter->trans->ip); -+ return false; -+ } -+ -+ if (six_trylock_type(&b->c.lock, type)) -+ return true; -+ -+ if (six_lock_type(&b->c.lock, type, should_sleep_fn, p)) -+ return false; -+ -+ bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)], -+ start_time); -+ return true; -+} -+ -+/* Btree iterator locking: */ -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+static void bch2_btree_iter_verify_locks(struct btree_iter *iter) -+{ -+ unsigned l; -+ -+ if (!(iter->trans->iters_linked & (1ULL << iter->idx))) { -+ BUG_ON(iter->nodes_locked); -+ return; -+ } -+ -+ for (l = 0; is_btree_node(iter, l); l++) { -+ if (iter->uptodate >= BTREE_ITER_NEED_RELOCK && -+ !btree_node_locked(iter, l)) -+ continue; -+ -+ BUG_ON(btree_lock_want(iter, l) != -+ btree_node_locked_type(iter, l)); -+ } -+} -+ -+void bch2_btree_trans_verify_locks(struct btree_trans *trans) -+{ -+ struct btree_iter *iter; -+ -+ trans_for_each_iter_all(trans, iter) -+ bch2_btree_iter_verify_locks(iter); -+} -+#else -+static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {} -+#endif -+ -+__flatten -+bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace) -+{ -+ return btree_iter_get_locks(iter, false, trace); -+} -+ -+bool __bch2_btree_iter_upgrade(struct btree_iter *iter, -+ unsigned new_locks_want) -+{ -+ struct btree_iter *linked; -+ -+ EBUG_ON(iter->locks_want >= 
new_locks_want); -+ -+ iter->locks_want = new_locks_want; -+ -+ if (btree_iter_get_locks(iter, true, true)) -+ return true; -+ -+ /* -+ * Ancestor nodes must be locked before child nodes, so set locks_want -+ * on iterators that might lock ancestors before us to avoid getting -+ * -EINTR later: -+ */ -+ trans_for_each_iter(iter->trans, linked) -+ if (linked != iter && -+ linked->btree_id == iter->btree_id && -+ linked->locks_want < new_locks_want) { -+ linked->locks_want = new_locks_want; -+ btree_iter_get_locks(linked, true, false); -+ } -+ -+ return false; -+} -+ -+bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *iter, -+ unsigned new_locks_want) -+{ -+ unsigned l = iter->level; -+ -+ EBUG_ON(iter->locks_want >= new_locks_want); -+ -+ iter->locks_want = new_locks_want; -+ -+ do { -+ if (!btree_iter_node(iter, l)) -+ break; -+ -+ if (!bch2_btree_node_upgrade(iter, l)) { -+ iter->locks_want = l; -+ return false; -+ } -+ -+ l++; -+ } while (l < iter->locks_want); -+ -+ return true; -+} -+ -+void __bch2_btree_iter_downgrade(struct btree_iter *iter, -+ unsigned downgrade_to) -+{ -+ unsigned l, new_locks_want = downgrade_to ?: -+ (iter->flags & BTREE_ITER_INTENT ? 
1 : 0); -+ -+ if (iter->locks_want < downgrade_to) { -+ iter->locks_want = new_locks_want; -+ -+ while (iter->nodes_locked && -+ (l = __fls(iter->nodes_locked)) >= iter->locks_want) { -+ if (l > iter->level) { -+ btree_node_unlock(iter, l); -+ } else { -+ if (btree_node_intent_locked(iter, l)) { -+ six_lock_downgrade(&iter->l[l].b->c.lock); -+ iter->nodes_intent_locked ^= 1 << l; -+ } -+ break; -+ } -+ } -+ } -+ -+ bch2_btree_trans_verify_locks(iter->trans); -+} -+ -+void bch2_trans_downgrade(struct btree_trans *trans) -+{ -+ struct btree_iter *iter; -+ -+ trans_for_each_iter(trans, iter) -+ bch2_btree_iter_downgrade(iter); -+} -+ -+/* Btree transaction locking: */ -+ -+bool bch2_trans_relock(struct btree_trans *trans) -+{ -+ struct btree_iter *iter; -+ bool ret = true; -+ -+ trans_for_each_iter(trans, iter) -+ if (iter->uptodate == BTREE_ITER_NEED_RELOCK) -+ ret &= bch2_btree_iter_relock(iter, true); -+ -+ return ret; -+} -+ -+void bch2_trans_unlock(struct btree_trans *trans) -+{ -+ struct btree_iter *iter; -+ -+ trans_for_each_iter(trans, iter) -+ __bch2_btree_iter_unlock(iter); -+} -+ -+/* Btree iterator: */ -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ -+static void bch2_btree_iter_verify_cached(struct btree_iter *iter) -+{ -+ struct bkey_cached *ck; -+ bool locked = btree_node_locked(iter, 0); -+ -+ if (!bch2_btree_node_relock(iter, 0)) -+ return; -+ -+ ck = (void *) iter->l[0].b; -+ BUG_ON(ck->key.btree_id != iter->btree_id || -+ bkey_cmp(ck->key.pos, iter->pos)); -+ -+ if (!locked) -+ btree_node_unlock(iter, 0); -+} -+ -+static void bch2_btree_iter_verify_level(struct btree_iter *iter, -+ unsigned level) -+{ -+ struct bpos pos = btree_iter_search_key(iter); -+ struct btree_iter_level *l = &iter->l[level]; -+ struct btree_node_iter tmp = l->iter; -+ bool locked = btree_node_locked(iter, level); -+ struct bkey_packed *p, *k; -+ char buf1[100], buf2[100]; -+ const char *msg; -+ -+ if (!debug_check_iterators(iter->trans->c)) -+ return; -+ -+ if (btree_iter_type(iter) == 
BTREE_ITER_CACHED) { -+ if (!level) -+ bch2_btree_iter_verify_cached(iter); -+ return; -+ } -+ -+ BUG_ON(iter->level < iter->min_depth); -+ -+ if (!btree_iter_node(iter, level)) -+ return; -+ -+ if (!bch2_btree_node_relock(iter, level)) -+ return; -+ -+ /* -+ * Ideally this invariant would always be true, and hopefully in the -+ * future it will be, but for now set_pos_same_leaf() breaks it: -+ */ -+ BUG_ON(iter->uptodate < BTREE_ITER_NEED_TRAVERSE && -+ !btree_iter_pos_in_node(iter, l->b)); -+ -+ /* -+ * node iterators don't use leaf node iterator: -+ */ -+ if (btree_iter_type(iter) == BTREE_ITER_NODES && -+ level <= iter->min_depth) -+ goto unlock; -+ -+ bch2_btree_node_iter_verify(&l->iter, l->b); -+ -+ /* -+ * For interior nodes, the iterator will have skipped past -+ * deleted keys: -+ * -+ * For extents, the iterator may have skipped past deleted keys (but not -+ * whiteouts) -+ */ -+ p = level || btree_node_type_is_extents(iter->btree_id) -+ ? bch2_btree_node_iter_prev_filter(&tmp, l->b, KEY_TYPE_discard) -+ : bch2_btree_node_iter_prev_all(&tmp, l->b); -+ k = bch2_btree_node_iter_peek_all(&l->iter, l->b); -+ -+ if (p && bkey_iter_pos_cmp(l->b, p, &pos) >= 0) { -+ msg = "before"; -+ goto err; -+ } -+ -+ if (k && bkey_iter_pos_cmp(l->b, k, &pos) < 0) { -+ msg = "after"; -+ goto err; -+ } -+unlock: -+ if (!locked) -+ btree_node_unlock(iter, level); -+ return; -+err: -+ strcpy(buf1, "(none)"); -+ strcpy(buf2, "(none)"); -+ -+ if (p) { -+ struct bkey uk = bkey_unpack_key(l->b, p); -+ bch2_bkey_to_text(&PBUF(buf1), &uk); -+ } -+ -+ if (k) { -+ struct bkey uk = bkey_unpack_key(l->b, k); -+ bch2_bkey_to_text(&PBUF(buf2), &uk); -+ } -+ -+ panic("iterator should be %s key at level %u:\n" -+ "iter pos %s %llu:%llu\n" -+ "prev key %s\n" -+ "cur key %s\n", -+ msg, level, -+ iter->flags & BTREE_ITER_IS_EXTENTS ? 
">" : "=>", -+ iter->pos.inode, iter->pos.offset, -+ buf1, buf2); -+} -+ -+static void bch2_btree_iter_verify(struct btree_iter *iter) -+{ -+ unsigned i; -+ -+ bch2_btree_trans_verify_locks(iter->trans); -+ -+ for (i = 0; i < BTREE_MAX_DEPTH; i++) -+ bch2_btree_iter_verify_level(iter, i); -+} -+ -+void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b) -+{ -+ struct btree_iter *iter; -+ -+ if (!debug_check_iterators(trans->c)) -+ return; -+ -+ trans_for_each_iter_with_node(trans, b, iter) -+ bch2_btree_iter_verify_level(iter, b->c.level); -+} -+ -+#else -+ -+static inline void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned l) {} -+static inline void bch2_btree_iter_verify(struct btree_iter *iter) {} -+ -+#endif -+ -+static void btree_node_iter_set_set_pos(struct btree_node_iter *iter, -+ struct btree *b, -+ struct bset_tree *t, -+ struct bkey_packed *k) -+{ -+ struct btree_node_iter_set *set; -+ -+ btree_node_iter_for_each(iter, set) -+ if (set->end == t->end_offset) { -+ set->k = __btree_node_key_to_offset(b, k); -+ bch2_btree_node_iter_sort(iter, b); -+ return; -+ } -+ -+ bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t)); -+} -+ -+static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter, -+ struct btree *b, -+ struct bkey_packed *where) -+{ -+ struct btree_iter_level *l = &iter->l[b->c.level]; -+ struct bpos pos = btree_iter_search_key(iter); -+ -+ if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b)) -+ return; -+ -+ if (bkey_iter_pos_cmp(l->b, where, &pos) < 0) -+ bch2_btree_node_iter_advance(&l->iter, l->b); -+ -+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); -+} -+ -+void bch2_btree_iter_fix_key_modified(struct btree_iter *iter, -+ struct btree *b, -+ struct bkey_packed *where) -+{ -+ struct btree_iter *linked; -+ -+ trans_for_each_iter_with_node(iter->trans, b, linked) { -+ __bch2_btree_iter_fix_key_modified(linked, b, where); -+ bch2_btree_iter_verify_level(linked, b->c.level); -+ } 
-+} -+ -+static void __bch2_btree_node_iter_fix(struct btree_iter *iter, -+ struct btree *b, -+ struct btree_node_iter *node_iter, -+ struct bset_tree *t, -+ struct bkey_packed *where, -+ unsigned clobber_u64s, -+ unsigned new_u64s) -+{ -+ const struct bkey_packed *end = btree_bkey_last(b, t); -+ struct btree_node_iter_set *set; -+ unsigned offset = __btree_node_key_to_offset(b, where); -+ int shift = new_u64s - clobber_u64s; -+ unsigned old_end = t->end_offset - shift; -+ unsigned orig_iter_pos = node_iter->data[0].k; -+ bool iter_current_key_modified = -+ orig_iter_pos >= offset && -+ orig_iter_pos <= offset + clobber_u64s; -+ struct bpos iter_pos = btree_iter_search_key(iter); -+ -+ btree_node_iter_for_each(node_iter, set) -+ if (set->end == old_end) -+ goto found; -+ -+ /* didn't find the bset in the iterator - might have to readd it: */ -+ if (new_u64s && -+ bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) { -+ bch2_btree_node_iter_push(node_iter, b, where, end); -+ goto fixup_done; -+ } else { -+ /* Iterator is after key that changed */ -+ return; -+ } -+found: -+ set->end = t->end_offset; -+ -+ /* Iterator hasn't gotten to the key that changed yet: */ -+ if (set->k < offset) -+ return; -+ -+ if (new_u64s && -+ bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) { -+ set->k = offset; -+ } else if (set->k < offset + clobber_u64s) { -+ set->k = offset + new_u64s; -+ if (set->k == set->end) -+ bch2_btree_node_iter_set_drop(node_iter, set); -+ } else { -+ /* Iterator is after key that changed */ -+ set->k = (int) set->k + shift; -+ return; -+ } -+ -+ bch2_btree_node_iter_sort(node_iter, b); -+fixup_done: -+ if (node_iter->data[0].k != orig_iter_pos) -+ iter_current_key_modified = true; -+ -+ /* -+ * When a new key is added, and the node iterator now points to that -+ * key, the iterator might have skipped past deleted keys that should -+ * come after the key the iterator now points to. 
We have to rewind to -+ * before those deleted keys - otherwise -+ * bch2_btree_node_iter_prev_all() breaks: -+ */ -+ if (!bch2_btree_node_iter_end(node_iter) && -+ iter_current_key_modified && -+ (b->c.level || -+ btree_node_type_is_extents(iter->btree_id))) { -+ struct bset_tree *t; -+ struct bkey_packed *k, *k2, *p; -+ -+ k = bch2_btree_node_iter_peek_all(node_iter, b); -+ -+ for_each_bset(b, t) { -+ bool set_pos = false; -+ -+ if (node_iter->data[0].end == t->end_offset) -+ continue; -+ -+ k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t); -+ -+ while ((p = bch2_bkey_prev_all(b, t, k2)) && -+ bkey_iter_cmp(b, k, p) < 0) { -+ k2 = p; -+ set_pos = true; -+ } -+ -+ if (set_pos) -+ btree_node_iter_set_set_pos(node_iter, -+ b, t, k2); -+ } -+ } -+ -+ if (!b->c.level && -+ node_iter == &iter->l[0].iter && -+ iter_current_key_modified) -+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); -+} -+ -+void bch2_btree_node_iter_fix(struct btree_iter *iter, -+ struct btree *b, -+ struct btree_node_iter *node_iter, -+ struct bkey_packed *where, -+ unsigned clobber_u64s, -+ unsigned new_u64s) -+{ -+ struct bset_tree *t = bch2_bkey_to_bset(b, where); -+ struct btree_iter *linked; -+ -+ if (node_iter != &iter->l[b->c.level].iter) { -+ __bch2_btree_node_iter_fix(iter, b, node_iter, t, -+ where, clobber_u64s, new_u64s); -+ -+ if (debug_check_iterators(iter->trans->c)) -+ bch2_btree_node_iter_verify(node_iter, b); -+ } -+ -+ trans_for_each_iter_with_node(iter->trans, b, linked) { -+ __bch2_btree_node_iter_fix(linked, b, -+ &linked->l[b->c.level].iter, t, -+ where, clobber_u64s, new_u64s); -+ bch2_btree_iter_verify_level(linked, b->c.level); -+ } -+} -+ -+static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter, -+ struct btree_iter_level *l, -+ struct bkey *u, -+ struct bkey_packed *k) -+{ -+ struct bkey_s_c ret; -+ -+ if (unlikely(!k)) { -+ /* -+ * signal to bch2_btree_iter_peek_slot() that we're currently at -+ * a hole -+ */ -+ u->type = KEY_TYPE_deleted; -+ 
return bkey_s_c_null; -+ } -+ -+ ret = bkey_disassemble(l->b, k, u); -+ -+ if (debug_check_bkeys(iter->trans->c)) -+ bch2_bkey_debugcheck(iter->trans->c, l->b, ret); -+ -+ return ret; -+} -+ -+/* peek_all() doesn't skip deleted keys */ -+static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *iter, -+ struct btree_iter_level *l, -+ struct bkey *u) -+{ -+ return __btree_iter_unpack(iter, l, u, -+ bch2_btree_node_iter_peek_all(&l->iter, l->b)); -+} -+ -+static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, -+ struct btree_iter_level *l) -+{ -+ return __btree_iter_unpack(iter, l, &iter->k, -+ bch2_btree_node_iter_peek(&l->iter, l->b)); -+} -+ -+static inline struct bkey_s_c __btree_iter_prev(struct btree_iter *iter, -+ struct btree_iter_level *l) -+{ -+ return __btree_iter_unpack(iter, l, &iter->k, -+ bch2_btree_node_iter_prev(&l->iter, l->b)); -+} -+ -+static inline bool btree_iter_advance_to_pos(struct btree_iter *iter, -+ struct btree_iter_level *l, -+ int max_advance) -+{ -+ struct bpos pos = btree_iter_search_key(iter); -+ struct bkey_packed *k; -+ int nr_advanced = 0; -+ -+ while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) && -+ bkey_iter_pos_cmp(l->b, k, &pos) < 0) { -+ if (max_advance > 0 && nr_advanced >= max_advance) -+ return false; -+ -+ bch2_btree_node_iter_advance(&l->iter, l->b); -+ nr_advanced++; -+ } -+ -+ return true; -+} -+ -+/* -+ * Verify that iterator for parent node points to child node: -+ */ -+static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) -+{ -+ struct btree_iter_level *l; -+ unsigned plevel; -+ bool parent_locked; -+ struct bkey_packed *k; -+ -+ if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) -+ return; -+ -+ plevel = b->c.level + 1; -+ if (!btree_iter_node(iter, plevel)) -+ return; -+ -+ parent_locked = btree_node_locked(iter, plevel); -+ -+ if (!bch2_btree_node_relock(iter, plevel)) -+ return; -+ -+ l = &iter->l[plevel]; -+ k = bch2_btree_node_iter_peek_all(&l->iter, 
l->b); -+ if (!k || -+ bkey_deleted(k) || -+ bkey_cmp_left_packed(l->b, k, &b->key.k.p)) { -+ char buf[100]; -+ struct bkey uk = bkey_unpack_key(b, k); -+ -+ bch2_bkey_to_text(&PBUF(buf), &uk); -+ panic("parent iter doesn't point to new node:\n%s\n%llu:%llu\n", -+ buf, b->key.k.p.inode, b->key.k.p.offset); -+ } -+ -+ if (!parent_locked) -+ btree_node_unlock(iter, b->c.level + 1); -+} -+ -+static inline void __btree_iter_init(struct btree_iter *iter, -+ unsigned level) -+{ -+ struct bpos pos = btree_iter_search_key(iter); -+ struct btree_iter_level *l = &iter->l[level]; -+ -+ bch2_btree_node_iter_init(&l->iter, l->b, &pos); -+ -+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); -+} -+ -+static inline void btree_iter_node_set(struct btree_iter *iter, -+ struct btree *b) -+{ -+ BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED); -+ -+ btree_iter_verify_new_node(iter, b); -+ -+ EBUG_ON(!btree_iter_pos_in_node(iter, b)); -+ EBUG_ON(b->c.lock.state.seq & 1); -+ -+ iter->l[b->c.level].lock_seq = b->c.lock.state.seq; -+ iter->l[b->c.level].b = b; -+ __btree_iter_init(iter, b->c.level); -+} -+ -+/* -+ * A btree node is being replaced - update the iterator to point to the new -+ * node: -+ */ -+void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b) -+{ -+ enum btree_node_locked_type t; -+ struct btree_iter *linked; -+ -+ trans_for_each_iter(iter->trans, linked) -+ if (btree_iter_type(linked) != BTREE_ITER_CACHED && -+ btree_iter_pos_in_node(linked, b)) { -+ /* -+ * bch2_btree_iter_node_drop() has already been called - -+ * the old node we're replacing has already been -+ * unlocked and the pointer invalidated -+ */ -+ BUG_ON(btree_node_locked(linked, b->c.level)); -+ -+ t = btree_lock_want(linked, b->c.level); -+ if (t != BTREE_NODE_UNLOCKED) { -+ six_lock_increment(&b->c.lock, t); -+ mark_btree_node_locked(linked, b->c.level, t); -+ } -+ -+ btree_iter_node_set(linked, b); -+ } -+} -+ -+void bch2_btree_iter_node_drop(struct btree_iter *iter, struct 
btree *b) -+{ -+ struct btree_iter *linked; -+ unsigned level = b->c.level; -+ -+ trans_for_each_iter(iter->trans, linked) -+ if (linked->l[level].b == b) { -+ __btree_node_unlock(linked, level); -+ linked->l[level].b = BTREE_ITER_NO_NODE_DROP; -+ } -+} -+ -+/* -+ * A btree node has been modified in such a way as to invalidate iterators - fix -+ * them: -+ */ -+void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b) -+{ -+ struct btree_iter *linked; -+ -+ trans_for_each_iter_with_node(iter->trans, b, linked) -+ __btree_iter_init(linked, b->c.level); -+} -+ -+static int lock_root_check_fn(struct six_lock *lock, void *p) -+{ -+ struct btree *b = container_of(lock, struct btree, c.lock); -+ struct btree **rootp = p; -+ -+ return b == *rootp ? 0 : -1; -+} -+ -+static inline int btree_iter_lock_root(struct btree_iter *iter, -+ unsigned depth_want) -+{ -+ struct bch_fs *c = iter->trans->c; -+ struct btree *b, **rootp = &c->btree_roots[iter->btree_id].b; -+ enum six_lock_type lock_type; -+ unsigned i; -+ -+ EBUG_ON(iter->nodes_locked); -+ -+ while (1) { -+ b = READ_ONCE(*rootp); -+ iter->level = READ_ONCE(b->c.level); -+ -+ if (unlikely(iter->level < depth_want)) { -+ /* -+ * the root is at a lower depth than the depth we want: -+ * got to the end of the btree, or we're walking nodes -+ * greater than some depth and there are no nodes >= -+ * that depth -+ */ -+ iter->level = depth_want; -+ for (i = iter->level; i < BTREE_MAX_DEPTH; i++) -+ iter->l[i].b = NULL; -+ return 1; -+ } -+ -+ lock_type = __btree_lock_want(iter, iter->level); -+ if (unlikely(!btree_node_lock(b, POS_MAX, iter->level, -+ iter, lock_type, -+ lock_root_check_fn, rootp))) -+ return -EINTR; -+ -+ if (likely(b == READ_ONCE(*rootp) && -+ b->c.level == iter->level && -+ !race_fault())) { -+ for (i = 0; i < iter->level; i++) -+ iter->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT; -+ iter->l[iter->level].b = b; -+ for (i = iter->level + 1; i < BTREE_MAX_DEPTH; i++) -+ iter->l[i].b = NULL; -+ -+ 
mark_btree_node_locked(iter, iter->level, lock_type); -+ btree_iter_node_set(iter, b); -+ return 0; -+ } -+ -+ six_unlock_type(&b->c.lock, lock_type); -+ } -+} -+ -+noinline -+static void btree_iter_prefetch(struct btree_iter *iter) -+{ -+ struct bch_fs *c = iter->trans->c; -+ struct btree_iter_level *l = &iter->l[iter->level]; -+ struct btree_node_iter node_iter = l->iter; -+ struct bkey_packed *k; -+ BKEY_PADDED(k) tmp; -+ unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) -+ ? (iter->level > 1 ? 0 : 2) -+ : (iter->level > 1 ? 1 : 16); -+ bool was_locked = btree_node_locked(iter, iter->level); -+ -+ while (nr) { -+ if (!bch2_btree_node_relock(iter, iter->level)) -+ return; -+ -+ bch2_btree_node_iter_advance(&node_iter, l->b); -+ k = bch2_btree_node_iter_peek(&node_iter, l->b); -+ if (!k) -+ break; -+ -+ bch2_bkey_unpack(l->b, &tmp.k, k); -+ bch2_btree_node_prefetch(c, iter, &tmp.k, iter->level - 1); -+ } -+ -+ if (!was_locked) -+ btree_node_unlock(iter, iter->level); -+} -+ -+static noinline void btree_node_mem_ptr_set(struct btree_iter *iter, -+ unsigned plevel, struct btree *b) -+{ -+ struct btree_iter_level *l = &iter->l[plevel]; -+ bool locked = btree_node_locked(iter, plevel); -+ struct bkey_packed *k; -+ struct bch_btree_ptr_v2 *bp; -+ -+ if (!bch2_btree_node_relock(iter, plevel)) -+ return; -+ -+ k = bch2_btree_node_iter_peek_all(&l->iter, l->b); -+ BUG_ON(k->type != KEY_TYPE_btree_ptr_v2); -+ -+ bp = (void *) bkeyp_val(&l->b->format, k); -+ bp->mem_ptr = (unsigned long)b; -+ -+ if (!locked) -+ btree_node_unlock(iter, plevel); -+} -+ -+static __always_inline int btree_iter_down(struct btree_iter *iter) -+{ -+ struct bch_fs *c = iter->trans->c; -+ struct btree_iter_level *l = &iter->l[iter->level]; -+ struct btree *b; -+ unsigned level = iter->level - 1; -+ enum six_lock_type lock_type = __btree_lock_want(iter, level); -+ BKEY_PADDED(k) tmp; -+ -+ EBUG_ON(!btree_node_locked(iter, iter->level)); -+ -+ bch2_bkey_unpack(l->b, &tmp.k, -+ 
bch2_btree_node_iter_peek(&l->iter, l->b)); -+ -+ b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type); -+ if (unlikely(IS_ERR(b))) -+ return PTR_ERR(b); -+ -+ mark_btree_node_locked(iter, level, lock_type); -+ btree_iter_node_set(iter, b); -+ -+ if (tmp.k.k.type == KEY_TYPE_btree_ptr_v2 && -+ unlikely(b != btree_node_mem_ptr(&tmp.k))) -+ btree_node_mem_ptr_set(iter, level + 1, b); -+ -+ if (iter->flags & BTREE_ITER_PREFETCH) -+ btree_iter_prefetch(iter); -+ -+ iter->level = level; -+ -+ return 0; -+} -+ -+static void btree_iter_up(struct btree_iter *iter) -+{ -+ btree_node_unlock(iter, iter->level++); -+} -+ -+static int btree_iter_traverse_one(struct btree_iter *); -+ -+static int __btree_iter_traverse_all(struct btree_trans *trans, int ret) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter *iter; -+ u8 sorted[BTREE_ITER_MAX]; -+ unsigned i, nr_sorted = 0; -+ -+ if (trans->in_traverse_all) -+ return -EINTR; -+ -+ trans->in_traverse_all = true; -+retry_all: -+ nr_sorted = 0; -+ -+ trans_for_each_iter(trans, iter) -+ sorted[nr_sorted++] = iter->idx; -+ -+#define btree_iter_cmp_by_idx(_l, _r) \ -+ btree_iter_cmp(&trans->iters[_l], &trans->iters[_r]) -+ -+ bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx); -+#undef btree_iter_cmp_by_idx -+ bch2_trans_unlock(trans); -+ -+ if (unlikely(ret == -ENOMEM)) { -+ struct closure cl; -+ -+ closure_init_stack(&cl); -+ -+ do { -+ ret = bch2_btree_cache_cannibalize_lock(c, &cl); -+ closure_sync(&cl); -+ } while (ret); -+ } -+ -+ if (unlikely(ret == -EIO)) { -+ trans->error = true; -+ goto out; -+ } -+ -+ BUG_ON(ret && ret != -EINTR); -+ -+ /* Now, redo traversals in correct order: */ -+ for (i = 0; i < nr_sorted; i++) { -+ unsigned idx = sorted[i]; -+ -+ /* -+ * sucessfully traversing one iterator can cause another to be -+ * unlinked, in btree_key_cache_fill() -+ */ -+ if (!(trans->iters_linked & (1ULL << idx))) -+ continue; -+ -+ ret = btree_iter_traverse_one(&trans->iters[idx]); -+ if (ret) -+ goto retry_all; 
-+ } -+ -+ if (hweight64(trans->iters_live) > 1) -+ ret = -EINTR; -+ else -+ trans_for_each_iter(trans, iter) -+ if (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) { -+ ret = -EINTR; -+ break; -+ } -+out: -+ bch2_btree_cache_cannibalize_unlock(c); -+ -+ trans->in_traverse_all = false; -+ return ret; -+} -+ -+int bch2_btree_iter_traverse_all(struct btree_trans *trans) -+{ -+ return __btree_iter_traverse_all(trans, 0); -+} -+ -+static inline bool btree_iter_good_node(struct btree_iter *iter, -+ unsigned l, int check_pos) -+{ -+ if (!is_btree_node(iter, l) || -+ !bch2_btree_node_relock(iter, l)) -+ return false; -+ -+ if (check_pos <= 0 && btree_iter_pos_before_node(iter, iter->l[l].b)) -+ return false; -+ if (check_pos >= 0 && btree_iter_pos_after_node(iter, iter->l[l].b)) -+ return false; -+ return true; -+} -+ -+static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter, -+ int check_pos) -+{ -+ unsigned l = iter->level; -+ -+ while (btree_iter_node(iter, l) && -+ !btree_iter_good_node(iter, l, check_pos)) { -+ btree_node_unlock(iter, l); -+ iter->l[l].b = BTREE_ITER_NO_NODE_UP; -+ l++; -+ } -+ -+ return l; -+} -+ -+/* -+ * This is the main state machine for walking down the btree - walks down to a -+ * specified depth -+ * -+ * Returns 0 on success, -EIO on error (error reading in a btree node). -+ * -+ * On error, caller (peek_node()/peek_key()) must return NULL; the error is -+ * stashed in the iterator and returned from bch2_trans_exit(). 
-+ */ -+static int btree_iter_traverse_one(struct btree_iter *iter) -+{ -+ unsigned depth_want = iter->level; -+ -+ /* -+ * if we need interior nodes locked, call btree_iter_relock() to make -+ * sure we walk back up enough that we lock them: -+ */ -+ if (iter->uptodate == BTREE_ITER_NEED_RELOCK || -+ iter->locks_want > 1) -+ bch2_btree_iter_relock(iter, false); -+ -+ if (btree_iter_type(iter) == BTREE_ITER_CACHED) -+ return bch2_btree_iter_traverse_cached(iter); -+ -+ if (iter->uptodate < BTREE_ITER_NEED_RELOCK) -+ return 0; -+ -+ if (unlikely(iter->level >= BTREE_MAX_DEPTH)) -+ return 0; -+ -+ /* -+ * XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos -+ * here unnecessary -+ */ -+ iter->level = btree_iter_up_until_good_node(iter, 0); -+ -+ /* -+ * If we've got a btree node locked (i.e. we aren't about to relock the -+ * root) - advance its node iterator if necessary: -+ * -+ * XXX correctly using BTREE_ITER_UPTODATE should make this unnecessary -+ */ -+ if (is_btree_node(iter, iter->level)) { -+ BUG_ON(!btree_iter_pos_in_node(iter, iter->l[iter->level].b)); -+ -+ btree_iter_advance_to_pos(iter, &iter->l[iter->level], -1); -+ } -+ -+ /* -+ * Note: iter->nodes[iter->level] may be temporarily NULL here - that -+ * would indicate to other code that we got to the end of the btree, -+ * here it indicates that relocking the root failed - it's critical that -+ * btree_iter_lock_root() comes next and that it can't fail -+ */ -+ while (iter->level > depth_want) { -+ int ret = btree_iter_node(iter, iter->level) -+ ? 
btree_iter_down(iter) -+ : btree_iter_lock_root(iter, depth_want); -+ if (unlikely(ret)) { -+ if (ret == 1) -+ return 0; -+ -+ iter->level = depth_want; -+ -+ if (ret == -EIO) { -+ iter->flags |= BTREE_ITER_ERROR; -+ iter->l[iter->level].b = -+ BTREE_ITER_NO_NODE_ERROR; -+ } else { -+ iter->l[iter->level].b = -+ BTREE_ITER_NO_NODE_DOWN; -+ } -+ return ret; -+ } -+ } -+ -+ iter->uptodate = BTREE_ITER_NEED_PEEK; -+ -+ bch2_btree_iter_verify(iter); -+ return 0; -+} -+ -+int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) -+{ -+ struct btree_trans *trans = iter->trans; -+ int ret; -+ -+ ret = bch2_trans_cond_resched(trans) ?: -+ btree_iter_traverse_one(iter); -+ if (unlikely(ret)) -+ ret = __btree_iter_traverse_all(trans, ret); -+ -+ return ret; -+} -+ -+static inline void bch2_btree_iter_checks(struct btree_iter *iter) -+{ -+ enum btree_iter_type type = btree_iter_type(iter); -+ -+ EBUG_ON(iter->btree_id >= BTREE_ID_NR); -+ -+ BUG_ON((type == BTREE_ITER_KEYS || -+ type == BTREE_ITER_CACHED) && -+ (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 || -+ bkey_cmp(iter->pos, iter->k.p) > 0)); -+ -+ bch2_btree_iter_verify_locks(iter); -+ bch2_btree_iter_verify_level(iter, iter->level); -+} -+ -+/* Iterate across nodes (leaf and interior nodes) */ -+ -+struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) -+{ -+ struct btree *b; -+ int ret; -+ -+ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES); -+ bch2_btree_iter_checks(iter); -+ -+ if (iter->uptodate == BTREE_ITER_UPTODATE) -+ return iter->l[iter->level].b; -+ -+ ret = bch2_btree_iter_traverse(iter); -+ if (ret) -+ return NULL; -+ -+ b = btree_iter_node(iter, iter->level); -+ if (!b) -+ return NULL; -+ -+ BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0); -+ -+ iter->pos = b->key.k.p; -+ iter->uptodate = BTREE_ITER_UPTODATE; -+ -+ bch2_btree_iter_verify(iter); -+ -+ return b; -+} -+ -+struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) -+{ -+ struct btree *b; -+ int ret; -+ -+ 
EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES); -+ bch2_btree_iter_checks(iter); -+ -+ /* already got to end? */ -+ if (!btree_iter_node(iter, iter->level)) -+ return NULL; -+ -+ bch2_trans_cond_resched(iter->trans); -+ -+ btree_iter_up(iter); -+ -+ if (!bch2_btree_node_relock(iter, iter->level)) -+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); -+ -+ ret = bch2_btree_iter_traverse(iter); -+ if (ret) -+ return NULL; -+ -+ /* got to end? */ -+ b = btree_iter_node(iter, iter->level); -+ if (!b) -+ return NULL; -+ -+ if (bkey_cmp(iter->pos, b->key.k.p) < 0) { -+ /* -+ * Haven't gotten to the end of the parent node: go back down to -+ * the next child node -+ */ -+ -+ /* -+ * We don't really want to be unlocking here except we can't -+ * directly tell btree_iter_traverse() "traverse to this level" -+ * except by setting iter->level, so we have to unlock so we -+ * don't screw up our lock invariants: -+ */ -+ if (btree_node_read_locked(iter, iter->level)) -+ btree_node_unlock(iter, iter->level); -+ -+ iter->pos = bkey_successor(iter->pos); -+ iter->level = iter->min_depth; -+ -+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); -+ ret = bch2_btree_iter_traverse(iter); -+ if (ret) -+ return NULL; -+ -+ b = iter->l[iter->level].b; -+ } -+ -+ iter->pos = b->key.k.p; -+ iter->uptodate = BTREE_ITER_UPTODATE; -+ -+ bch2_btree_iter_verify(iter); -+ -+ return b; -+} -+ -+/* Iterate across keys (in leaf nodes only) */ -+ -+void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_pos) -+{ -+ struct btree_iter_level *l = &iter->l[0]; -+ -+ EBUG_ON(iter->level != 0); -+ EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0); -+ EBUG_ON(!btree_node_locked(iter, 0)); -+ EBUG_ON(bkey_cmp(new_pos, l->b->key.k.p) > 0); -+ -+ bkey_init(&iter->k); -+ iter->k.p = iter->pos = new_pos; -+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); -+ -+ btree_iter_advance_to_pos(iter, l, -1); -+ -+ /* -+ * XXX: -+ * keeping a node locked that's outside (even just outside) 
iter->pos -+ * breaks __bch2_btree_node_lock(). This seems to only affect -+ * bch2_btree_node_get_sibling so for now it's fixed there, but we -+ * should try to get rid of this corner case. -+ * -+ * (this behaviour is currently needed for BTREE_INSERT_NOUNLOCK) -+ */ -+ -+ if (bch2_btree_node_iter_end(&l->iter) && -+ btree_iter_pos_after_node(iter, l->b)) -+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); -+} -+ -+static void btree_iter_pos_changed(struct btree_iter *iter, int cmp) -+{ -+ unsigned l = iter->level; -+ -+ if (!cmp) -+ goto out; -+ -+ if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) { -+ btree_node_unlock(iter, 0); -+ iter->l[0].b = BTREE_ITER_NO_NODE_UP; -+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); -+ return; -+ } -+ -+ l = btree_iter_up_until_good_node(iter, cmp); -+ -+ if (btree_iter_node(iter, l)) { -+ /* -+ * We might have to skip over many keys, or just a few: try -+ * advancing the node iterator, and if we have to skip over too -+ * many keys just reinit it (or if we're rewinding, since that -+ * is expensive). -+ */ -+ if (cmp < 0 || -+ !btree_iter_advance_to_pos(iter, &iter->l[l], 8)) -+ __btree_iter_init(iter, l); -+ -+ /* Don't leave it locked if we're not supposed to: */ -+ if (btree_lock_want(iter, l) == BTREE_NODE_UNLOCKED) -+ btree_node_unlock(iter, l); -+ } -+out: -+ if (l != iter->level) -+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); -+ else -+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); -+} -+ -+void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos, -+ bool strictly_greater) -+{ -+ struct bpos old = btree_iter_search_key(iter); -+ int cmp; -+ -+ iter->flags &= ~BTREE_ITER_IS_EXTENTS; -+ iter->flags |= strictly_greater ? 
BTREE_ITER_IS_EXTENTS : 0; -+ -+ bkey_init(&iter->k); -+ iter->k.p = iter->pos = new_pos; -+ -+ cmp = bkey_cmp(btree_iter_search_key(iter), old); -+ -+ btree_iter_pos_changed(iter, cmp); -+} -+ -+void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) -+{ -+ int cmp = bkey_cmp(new_pos, iter->pos); -+ -+ bkey_init(&iter->k); -+ iter->k.p = iter->pos = new_pos; -+ -+ btree_iter_pos_changed(iter, cmp); -+} -+ -+static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) -+{ -+ struct btree_iter_level *l = &iter->l[0]; -+ bool ret; -+ -+ bkey_init(&iter->k); -+ iter->k.p = iter->pos = l->b->key.k.p; -+ -+ ret = bkey_cmp(iter->pos, POS_MAX) != 0; -+ if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) -+ iter->k.p = iter->pos = bkey_successor(iter->pos); -+ -+ btree_iter_pos_changed(iter, 1); -+ return ret; -+} -+ -+static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) -+{ -+ struct btree_iter_level *l = &iter->l[0]; -+ bool ret; -+ -+ bkey_init(&iter->k); -+ iter->k.p = iter->pos = l->b->data->min_key; -+ iter->uptodate = BTREE_ITER_NEED_TRAVERSE; -+ -+ ret = bkey_cmp(iter->pos, POS_MIN) != 0; -+ if (ret) { -+ iter->k.p = iter->pos = bkey_predecessor(iter->pos); -+ -+ if (iter->flags & BTREE_ITER_IS_EXTENTS) -+ iter->k.p = iter->pos = bkey_predecessor(iter->pos); -+ } -+ -+ btree_iter_pos_changed(iter, -1); -+ return ret; -+} -+ -+/** -+ * btree_iter_peek_uptodate - given an iterator that is uptodate, return the key -+ * it currently points to -+ */ -+static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter) -+{ -+ struct btree_iter_level *l = &iter->l[0]; -+ struct bkey_s_c ret = { .k = &iter->k }; -+ -+ if (!bkey_deleted(&iter->k)) { -+ struct bkey_packed *_k = -+ __bch2_btree_node_iter_peek_all(&l->iter, l->b); -+ -+ ret.v = bkeyp_val(&l->b->format, _k); -+ -+ if (debug_check_iterators(iter->trans->c)) { -+ struct bkey k = bkey_unpack_key(l->b, _k); -+ -+ BUG_ON(memcmp(&k, &iter->k, 
sizeof(k))); -+ } -+ -+ if (debug_check_bkeys(iter->trans->c)) -+ bch2_bkey_debugcheck(iter->trans->c, l->b, ret); -+ } -+ -+ return ret; -+} -+ -+/** -+ * bch2_btree_iter_peek: returns first key greater than or equal to iterator's -+ * current position -+ */ -+struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) -+{ -+ struct btree_iter_level *l = &iter->l[0]; -+ struct bkey_s_c k; -+ int ret; -+ -+ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); -+ bch2_btree_iter_checks(iter); -+ -+ if (iter->uptodate == BTREE_ITER_UPTODATE && -+ !bkey_deleted(&iter->k)) -+ return btree_iter_peek_uptodate(iter); -+ -+ while (1) { -+ ret = bch2_btree_iter_traverse(iter); -+ if (unlikely(ret)) -+ return bkey_s_c_err(ret); -+ -+ k = __btree_iter_peek(iter, l); -+ if (likely(k.k)) -+ break; -+ -+ if (!btree_iter_set_pos_to_next_leaf(iter)) -+ return bkey_s_c_null; -+ } -+ -+ /* -+ * iter->pos should always be equal to the key we just -+ * returned - except extents can straddle iter->pos: -+ */ -+ if (!(iter->flags & BTREE_ITER_IS_EXTENTS) || -+ bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) -+ iter->pos = bkey_start_pos(k.k); -+ -+ iter->uptodate = BTREE_ITER_UPTODATE; -+ -+ bch2_btree_iter_verify_level(iter, 0); -+ return k; -+} -+ -+/** -+ * bch2_btree_iter_next: returns first key greater than iterator's current -+ * position -+ */ -+struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) -+{ -+ if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) -+ return bkey_s_c_null; -+ -+ bch2_btree_iter_set_pos(iter, -+ (iter->flags & BTREE_ITER_IS_EXTENTS) -+ ? 
iter->k.p -+ : bkey_successor(iter->k.p)); -+ -+ return bch2_btree_iter_peek(iter); -+} -+ -+static struct bkey_s_c __btree_trans_updates_peek(struct btree_iter *iter) -+{ -+ struct bpos pos = btree_iter_search_key(iter); -+ struct btree_trans *trans = iter->trans; -+ struct btree_insert_entry *i; -+ -+ trans_for_each_update2(trans, i) -+ if ((cmp_int(iter->btree_id, i->iter->btree_id) ?: -+ bkey_cmp(pos, i->k->k.p)) <= 0) -+ break; -+ -+ return i < trans->updates2 + trans->nr_updates2 && -+ iter->btree_id == i->iter->btree_id -+ ? bkey_i_to_s_c(i->k) -+ : bkey_s_c_null; -+} -+ -+static struct bkey_s_c __bch2_btree_iter_peek_with_updates(struct btree_iter *iter) -+{ -+ struct btree_iter_level *l = &iter->l[0]; -+ struct bkey_s_c k = __btree_iter_peek(iter, l); -+ struct bkey_s_c u = __btree_trans_updates_peek(iter); -+ -+ if (k.k && (!u.k || bkey_cmp(k.k->p, u.k->p) < 0)) -+ return k; -+ if (u.k && bkey_cmp(u.k->p, l->b->key.k.p) <= 0) { -+ iter->k = *u.k; -+ return u; -+ } -+ return bkey_s_c_null; -+} -+ -+struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) -+{ -+ struct bkey_s_c k; -+ int ret; -+ -+ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); -+ bch2_btree_iter_checks(iter); -+ -+ while (1) { -+ ret = bch2_btree_iter_traverse(iter); -+ if (unlikely(ret)) -+ return bkey_s_c_err(ret); -+ -+ k = __bch2_btree_iter_peek_with_updates(iter); -+ -+ if (k.k && bkey_deleted(k.k)) { -+ bch2_btree_iter_set_pos(iter, -+ (iter->flags & BTREE_ITER_IS_EXTENTS) -+ ? 
iter->k.p -+ : bkey_successor(iter->k.p)); -+ continue; -+ } -+ -+ if (likely(k.k)) -+ break; -+ -+ if (!btree_iter_set_pos_to_next_leaf(iter)) -+ return bkey_s_c_null; -+ } -+ -+ /* -+ * iter->pos should always be equal to the key we just -+ * returned - except extents can straddle iter->pos: -+ */ -+ if (!(iter->flags & BTREE_ITER_IS_EXTENTS) || -+ bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) -+ iter->pos = bkey_start_pos(k.k); -+ -+ iter->uptodate = BTREE_ITER_UPTODATE; -+ return k; -+} -+ -+struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *iter) -+{ -+ if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) -+ return bkey_s_c_null; -+ -+ bch2_btree_iter_set_pos(iter, -+ (iter->flags & BTREE_ITER_IS_EXTENTS) -+ ? iter->k.p -+ : bkey_successor(iter->k.p)); -+ -+ return bch2_btree_iter_peek_with_updates(iter); -+} -+ -+/** -+ * bch2_btree_iter_peek_prev: returns first key less than or equal to -+ * iterator's current position -+ */ -+struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) -+{ -+ struct bpos pos = iter->pos; -+ struct btree_iter_level *l = &iter->l[0]; -+ struct bkey_s_c k; -+ int ret; -+ -+ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); -+ bch2_btree_iter_checks(iter); -+ -+ if (iter->uptodate == BTREE_ITER_UPTODATE && -+ !bkey_deleted(&iter->k)) -+ return btree_iter_peek_uptodate(iter); -+ -+ while (1) { -+ ret = bch2_btree_iter_traverse(iter); -+ if (unlikely(ret)) -+ return bkey_s_c_err(ret); -+ -+ k = __btree_iter_peek(iter, l); -+ if (!k.k || bkey_cmp(bkey_start_pos(k.k), pos) > 0) -+ k = __btree_iter_prev(iter, l); -+ -+ if (likely(k.k)) -+ break; -+ -+ if (!btree_iter_set_pos_to_prev_leaf(iter)) -+ return bkey_s_c_null; -+ } -+ -+ EBUG_ON(bkey_cmp(bkey_start_pos(k.k), pos) > 0); -+ iter->pos = bkey_start_pos(k.k); -+ iter->uptodate = BTREE_ITER_UPTODATE; -+ return k; -+} -+ -+/** -+ * bch2_btree_iter_prev: returns first key less than iterator's current -+ * position -+ */ -+struct bkey_s_c bch2_btree_iter_prev(struct 
btree_iter *iter) -+{ -+ struct bpos pos = bkey_start_pos(&iter->k); -+ -+ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); -+ bch2_btree_iter_checks(iter); -+ -+ if (unlikely(!bkey_cmp(pos, POS_MIN))) -+ return bkey_s_c_null; -+ -+ bch2_btree_iter_set_pos(iter, bkey_predecessor(pos)); -+ -+ return bch2_btree_iter_peek_prev(iter); -+} -+ -+static inline struct bkey_s_c -+__bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) -+{ -+ struct btree_iter_level *l = &iter->l[0]; -+ struct btree_node_iter node_iter; -+ struct bkey_s_c k; -+ struct bkey n; -+ int ret; -+ -+ /* keys & holes can't span inode numbers: */ -+ if (iter->pos.offset == KEY_OFFSET_MAX) { -+ if (iter->pos.inode == KEY_INODE_MAX) -+ return bkey_s_c_null; -+ -+ bch2_btree_iter_set_pos(iter, bkey_successor(iter->pos)); -+ -+ ret = bch2_btree_iter_traverse(iter); -+ if (unlikely(ret)) -+ return bkey_s_c_err(ret); -+ } -+ -+ /* -+ * iterator is now at the correct position for inserting at iter->pos, -+ * but we need to keep iterating until we find the first non whiteout so -+ * we know how big a hole we have, if any: -+ */ -+ -+ node_iter = l->iter; -+ k = __btree_iter_unpack(iter, l, &iter->k, -+ bch2_btree_node_iter_peek(&node_iter, l->b)); -+ -+ if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) { -+ /* -+ * We're not setting iter->uptodate because the node iterator -+ * doesn't necessarily point at the key we're returning: -+ */ -+ -+ EBUG_ON(bkey_cmp(k.k->p, iter->pos) <= 0); -+ bch2_btree_iter_verify_level(iter, 0); -+ return k; -+ } -+ -+ /* hole */ -+ -+ if (!k.k) -+ k.k = &l->b->key.k; -+ -+ bkey_init(&n); -+ n.p = iter->pos; -+ bch2_key_resize(&n, -+ min_t(u64, KEY_SIZE_MAX, -+ (k.k->p.inode == n.p.inode -+ ? 
bkey_start_offset(k.k) -+ : KEY_OFFSET_MAX) - -+ n.p.offset)); -+ -+ EBUG_ON(!n.size); -+ -+ iter->k = n; -+ iter->uptodate = BTREE_ITER_UPTODATE; -+ -+ bch2_btree_iter_verify_level(iter, 0); -+ return (struct bkey_s_c) { &iter->k, NULL }; -+} -+ -+struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) -+{ -+ struct btree_iter_level *l = &iter->l[0]; -+ struct bkey_s_c k; -+ int ret; -+ -+ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); -+ bch2_btree_iter_checks(iter); -+ -+ if (iter->uptodate == BTREE_ITER_UPTODATE) -+ return btree_iter_peek_uptodate(iter); -+ -+ ret = bch2_btree_iter_traverse(iter); -+ if (unlikely(ret)) -+ return bkey_s_c_err(ret); -+ -+ if (iter->flags & BTREE_ITER_IS_EXTENTS) -+ return __bch2_btree_iter_peek_slot_extents(iter); -+ -+ k = __btree_iter_peek_all(iter, l, &iter->k); -+ -+ EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0); -+ -+ if (!k.k || bkey_cmp(iter->pos, k.k->p)) { -+ /* hole */ -+ bkey_init(&iter->k); -+ iter->k.p = iter->pos; -+ k = (struct bkey_s_c) { &iter->k, NULL }; -+ } -+ -+ iter->uptodate = BTREE_ITER_UPTODATE; -+ bch2_btree_iter_verify_level(iter, 0); -+ return k; -+} -+ -+struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) -+{ -+ if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) -+ return bkey_s_c_null; -+ -+ bch2_btree_iter_set_pos(iter, -+ (iter->flags & BTREE_ITER_IS_EXTENTS) -+ ? 
iter->k.p -+ : bkey_successor(iter->k.p)); -+ -+ return bch2_btree_iter_peek_slot(iter); -+} -+ -+struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *iter) -+{ -+ struct bkey_cached *ck; -+ int ret; -+ -+ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_CACHED); -+ bch2_btree_iter_checks(iter); -+ -+ ret = bch2_btree_iter_traverse(iter); -+ if (unlikely(ret)) -+ return bkey_s_c_err(ret); -+ -+ ck = (void *) iter->l[0].b; -+ -+ EBUG_ON(iter->btree_id != ck->key.btree_id || -+ bkey_cmp(iter->pos, ck->key.pos)); -+ BUG_ON(!ck->valid); -+ -+ return bkey_i_to_s_c(ck->k); -+} -+ -+static inline void bch2_btree_iter_init(struct btree_trans *trans, -+ struct btree_iter *iter, enum btree_id btree_id, -+ struct bpos pos, unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ unsigned i; -+ -+ if (btree_node_type_is_extents(btree_id) && -+ !(flags & BTREE_ITER_NODES)) -+ flags |= BTREE_ITER_IS_EXTENTS; -+ -+ iter->trans = trans; -+ iter->pos = pos; -+ bkey_init(&iter->k); -+ iter->k.p = pos; -+ iter->flags = flags; -+ iter->uptodate = BTREE_ITER_NEED_TRAVERSE; -+ iter->btree_id = btree_id; -+ iter->level = 0; -+ iter->min_depth = 0; -+ iter->locks_want = flags & BTREE_ITER_INTENT ? 
1 : 0; -+ iter->nodes_locked = 0; -+ iter->nodes_intent_locked = 0; -+ for (i = 0; i < ARRAY_SIZE(iter->l); i++) -+ iter->l[i].b = BTREE_ITER_NO_NODE_INIT; -+ -+ prefetch(c->btree_roots[btree_id].b); -+} -+ -+/* new transactional stuff: */ -+ -+static inline void __bch2_trans_iter_free(struct btree_trans *trans, -+ unsigned idx) -+{ -+ __bch2_btree_iter_unlock(&trans->iters[idx]); -+ trans->iters_linked &= ~(1ULL << idx); -+ trans->iters_live &= ~(1ULL << idx); -+ trans->iters_touched &= ~(1ULL << idx); -+} -+ -+int bch2_trans_iter_put(struct btree_trans *trans, -+ struct btree_iter *iter) -+{ -+ int ret; -+ -+ if (IS_ERR_OR_NULL(iter)) -+ return 0; -+ -+ BUG_ON(trans->iters + iter->idx != iter); -+ -+ ret = btree_iter_err(iter); -+ -+ if (!(trans->iters_touched & (1ULL << iter->idx)) && -+ !(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) -+ __bch2_trans_iter_free(trans, iter->idx); -+ -+ trans->iters_live &= ~(1ULL << iter->idx); -+ return ret; -+} -+ -+int bch2_trans_iter_free(struct btree_trans *trans, -+ struct btree_iter *iter) -+{ -+ if (IS_ERR_OR_NULL(iter)) -+ return 0; -+ -+ trans->iters_touched &= ~(1ULL << iter->idx); -+ -+ return bch2_trans_iter_put(trans, iter); -+} -+ -+static int bch2_trans_realloc_iters(struct btree_trans *trans, -+ unsigned new_size) -+{ -+ void *p, *new_iters, *new_updates, *new_updates2; -+ size_t iters_bytes; -+ size_t updates_bytes; -+ -+ new_size = roundup_pow_of_two(new_size); -+ -+ BUG_ON(new_size > BTREE_ITER_MAX); -+ -+ if (new_size <= trans->size) -+ return 0; -+ -+ BUG_ON(trans->used_mempool); -+ -+ bch2_trans_unlock(trans); -+ -+ iters_bytes = sizeof(struct btree_iter) * new_size; -+ updates_bytes = sizeof(struct btree_insert_entry) * new_size; -+ -+ p = kmalloc(iters_bytes + -+ updates_bytes + -+ updates_bytes, GFP_NOFS); -+ if (p) -+ goto success; -+ -+ p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); -+ new_size = BTREE_ITER_MAX; -+ -+ trans->used_mempool = true; -+success: -+ new_iters = p; p += iters_bytes; 
-+ new_updates = p; p += updates_bytes; -+ new_updates2 = p; p += updates_bytes; -+ -+ memcpy(new_iters, trans->iters, -+ sizeof(struct btree_iter) * trans->nr_iters); -+ memcpy(new_updates, trans->updates, -+ sizeof(struct btree_insert_entry) * trans->nr_updates); -+ memcpy(new_updates2, trans->updates2, -+ sizeof(struct btree_insert_entry) * trans->nr_updates2); -+ -+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) -+ memset(trans->iters, POISON_FREE, -+ sizeof(struct btree_iter) * trans->nr_iters + -+ sizeof(struct btree_insert_entry) * trans->nr_iters); -+ -+ if (trans->iters != trans->iters_onstack) -+ kfree(trans->iters); -+ -+ trans->iters = new_iters; -+ trans->updates = new_updates; -+ trans->updates2 = new_updates2; -+ trans->size = new_size; -+ -+ if (trans->iters_live) { -+ trace_trans_restart_iters_realloced(trans->ip, trans->size); -+ return -EINTR; -+ } -+ -+ return 0; -+} -+ -+static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) -+{ -+ unsigned idx = __ffs64(~trans->iters_linked); -+ -+ if (idx < trans->nr_iters) -+ goto got_slot; -+ -+ if (trans->nr_iters == trans->size) { -+ int ret; -+ -+ if (trans->nr_iters >= BTREE_ITER_MAX) { -+ struct btree_iter *iter; -+ -+ trans_for_each_iter(trans, iter) { -+ pr_err("iter: btree %s pos %llu:%llu%s%s%s %ps", -+ bch2_btree_ids[iter->btree_id], -+ iter->pos.inode, -+ iter->pos.offset, -+ (trans->iters_live & (1ULL << iter->idx)) ? " live" : "", -+ (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", -+ iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? 
" keep" : "", -+ (void *) iter->ip_allocated); -+ } -+ -+ panic("trans iter oveflow\n"); -+ } -+ -+ ret = bch2_trans_realloc_iters(trans, trans->size * 2); -+ if (ret) -+ return ERR_PTR(ret); -+ } -+ -+ idx = trans->nr_iters++; -+ BUG_ON(trans->nr_iters > trans->size); -+ -+ trans->iters[idx].idx = idx; -+got_slot: -+ BUG_ON(trans->iters_linked & (1ULL << idx)); -+ trans->iters_linked |= 1ULL << idx; -+ trans->iters[idx].flags = 0; -+ return &trans->iters[idx]; -+} -+ -+static inline void btree_iter_copy(struct btree_iter *dst, -+ struct btree_iter *src) -+{ -+ unsigned i, idx = dst->idx; -+ -+ *dst = *src; -+ dst->idx = idx; -+ dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; -+ -+ for (i = 0; i < BTREE_MAX_DEPTH; i++) -+ if (btree_node_locked(dst, i)) -+ six_lock_increment(&dst->l[i].b->c.lock, -+ __btree_lock_want(dst, i)); -+ -+ dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; -+ dst->flags &= ~BTREE_ITER_SET_POS_AFTER_COMMIT; -+} -+ -+static inline struct bpos bpos_diff(struct bpos l, struct bpos r) -+{ -+ if (bkey_cmp(l, r) > 0) -+ swap(l, r); -+ -+ return POS(r.inode - l.inode, r.offset - l.offset); -+} -+ -+static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, -+ unsigned btree_id, struct bpos pos, -+ unsigned flags) -+{ -+ struct btree_iter *iter, *best = NULL; -+ -+ BUG_ON(trans->nr_iters > BTREE_ITER_MAX); -+ -+ trans_for_each_iter(trans, iter) { -+ if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE)) -+ continue; -+ -+ if (iter->btree_id != btree_id) -+ continue; -+ -+ if (best && -+ bkey_cmp(bpos_diff(best->pos, pos), -+ bpos_diff(iter->pos, pos)) < 0) -+ continue; -+ -+ best = iter; -+ } -+ -+ if (!best) { -+ iter = btree_trans_iter_alloc(trans); -+ if (IS_ERR(iter)) -+ return iter; -+ -+ bch2_btree_iter_init(trans, iter, btree_id, pos, flags); -+ } else if ((trans->iters_live & (1ULL << best->idx)) || -+ (best->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) { -+ iter = btree_trans_iter_alloc(trans); -+ if (IS_ERR(iter)) -+ return iter; -+ -+ 
btree_iter_copy(iter, best); -+ } else { -+ iter = best; -+ } -+ -+ iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; -+ iter->flags &= ~BTREE_ITER_USER_FLAGS; -+ iter->flags |= flags & BTREE_ITER_USER_FLAGS; -+ -+ if (iter->flags & BTREE_ITER_INTENT) -+ bch2_btree_iter_upgrade(iter, 1); -+ else -+ bch2_btree_iter_downgrade(iter); -+ -+ BUG_ON(iter->btree_id != btree_id); -+ BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE); -+ BUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); -+ BUG_ON(iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT); -+ BUG_ON(trans->iters_live & (1ULL << iter->idx)); -+ -+ trans->iters_live |= 1ULL << iter->idx; -+ trans->iters_touched |= 1ULL << iter->idx; -+ -+ return iter; -+} -+ -+struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, -+ enum btree_id btree_id, -+ struct bpos pos, unsigned flags) -+{ -+ struct btree_iter *iter = -+ __btree_trans_get_iter(trans, btree_id, pos, flags); -+ -+ if (!IS_ERR(iter)) -+ __bch2_btree_iter_set_pos(iter, pos, -+ btree_node_type_is_extents(btree_id)); -+ return iter; -+} -+ -+struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, -+ enum btree_id btree_id, -+ struct bpos pos, -+ unsigned locks_want, -+ unsigned depth, -+ unsigned flags) -+{ -+ struct btree_iter *iter = -+ __btree_trans_get_iter(trans, btree_id, pos, -+ flags|BTREE_ITER_NODES); -+ unsigned i; -+ -+ BUG_ON(IS_ERR(iter)); -+ BUG_ON(bkey_cmp(iter->pos, pos)); -+ -+ iter->locks_want = locks_want; -+ iter->level = depth; -+ iter->min_depth = depth; -+ -+ for (i = 0; i < ARRAY_SIZE(iter->l); i++) -+ iter->l[i].b = NULL; -+ iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT; -+ -+ return iter; -+} -+ -+struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans, -+ struct btree_iter *src) -+{ -+ struct btree_iter *iter; -+ -+ iter = btree_trans_iter_alloc(trans); -+ if (IS_ERR(iter)) -+ return iter; -+ -+ btree_iter_copy(iter, src); -+ -+ trans->iters_live |= 1ULL << iter->idx; -+ /* -+ * We don't need to preserve 
this iter since it's cheap to copy it -+ * again - this will cause trans_iter_put() to free it right away: -+ */ -+ trans->iters_touched &= ~(1ULL << iter->idx); -+ -+ return iter; -+} -+ -+static int bch2_trans_preload_mem(struct btree_trans *trans, size_t size) -+{ -+ if (size > trans->mem_bytes) { -+ size_t old_bytes = trans->mem_bytes; -+ size_t new_bytes = roundup_pow_of_two(size); -+ void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS); -+ -+ if (!new_mem) -+ return -ENOMEM; -+ -+ trans->mem = new_mem; -+ trans->mem_bytes = new_bytes; -+ -+ if (old_bytes) { -+ trace_trans_restart_mem_realloced(trans->ip, new_bytes); -+ return -EINTR; -+ } -+ } -+ -+ return 0; -+} -+ -+void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) -+{ -+ void *p; -+ int ret; -+ -+ ret = bch2_trans_preload_mem(trans, trans->mem_top + size); -+ if (ret) -+ return ERR_PTR(ret); -+ -+ p = trans->mem + trans->mem_top; -+ trans->mem_top += size; -+ return p; -+} -+ -+inline void bch2_trans_unlink_iters(struct btree_trans *trans) -+{ -+ u64 iters = trans->iters_linked & -+ ~trans->iters_touched & -+ ~trans->iters_live; -+ -+ while (iters) { -+ unsigned idx = __ffs64(iters); -+ -+ iters &= ~(1ULL << idx); -+ __bch2_trans_iter_free(trans, idx); -+ } -+} -+ -+void bch2_trans_reset(struct btree_trans *trans, unsigned flags) -+{ -+ struct btree_iter *iter; -+ -+ trans_for_each_iter(trans, iter) -+ iter->flags &= ~(BTREE_ITER_KEEP_UNTIL_COMMIT| -+ BTREE_ITER_SET_POS_AFTER_COMMIT); -+ -+ bch2_trans_unlink_iters(trans); -+ -+ trans->iters_touched &= trans->iters_live; -+ -+ trans->need_reset = 0; -+ trans->nr_updates = 0; -+ trans->nr_updates2 = 0; -+ trans->mem_top = 0; -+ -+ trans->extra_journal_entries = NULL; -+ trans->extra_journal_entry_u64s = 0; -+ -+ if (trans->fs_usage_deltas) { -+ trans->fs_usage_deltas->used = 0; -+ memset(&trans->fs_usage_deltas->memset_start, 0, -+ (void *) &trans->fs_usage_deltas->memset_end - -+ (void *) &trans->fs_usage_deltas->memset_start); -+ } -+ 
-+ if (!(flags & TRANS_RESET_NOTRAVERSE)) -+ bch2_btree_iter_traverse_all(trans); -+} -+ -+void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, -+ unsigned expected_nr_iters, -+ size_t expected_mem_bytes) -+{ -+ memset(trans, 0, offsetof(struct btree_trans, iters_onstack)); -+ -+ /* -+ * reallocating iterators currently completely breaks -+ * bch2_trans_iter_put(): -+ */ -+ expected_nr_iters = BTREE_ITER_MAX; -+ -+ trans->c = c; -+ trans->ip = _RET_IP_; -+ trans->size = ARRAY_SIZE(trans->iters_onstack); -+ trans->iters = trans->iters_onstack; -+ trans->updates = trans->updates_onstack; -+ trans->updates2 = trans->updates2_onstack; -+ trans->fs_usage_deltas = NULL; -+ -+ if (expected_nr_iters > trans->size) -+ bch2_trans_realloc_iters(trans, expected_nr_iters); -+ -+ if (expected_mem_bytes) -+ bch2_trans_preload_mem(trans, expected_mem_bytes); -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ trans->pid = current->pid; -+ mutex_lock(&c->btree_trans_lock); -+ list_add(&trans->list, &c->btree_trans_list); -+ mutex_unlock(&c->btree_trans_lock); -+#endif -+} -+ -+int bch2_trans_exit(struct btree_trans *trans) -+{ -+ bch2_trans_unlock(trans); -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ mutex_lock(&trans->c->btree_trans_lock); -+ list_del(&trans->list); -+ mutex_unlock(&trans->c->btree_trans_lock); -+#endif -+ -+ bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); -+ -+ kfree(trans->fs_usage_deltas); -+ kfree(trans->mem); -+ if (trans->used_mempool) -+ mempool_free(trans->iters, &trans->c->btree_iters_pool); -+ else if (trans->iters != trans->iters_onstack) -+ kfree(trans->iters); -+ trans->mem = (void *) 0x1; -+ trans->iters = (void *) 0x1; -+ -+ return trans->error ? 
-EIO : 0; -+} -+ -+static void bch2_btree_iter_node_to_text(struct printbuf *out, -+ struct btree_bkey_cached_common *_b, -+ enum btree_iter_type type) -+{ -+ pr_buf(out, " %px l=%u %s:", -+ _b, _b->level, bch2_btree_ids[_b->btree_id]); -+ bch2_bpos_to_text(out, btree_node_pos(_b, type)); -+} -+ -+void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ struct btree_trans *trans; -+ struct btree_iter *iter; -+ struct btree *b; -+ unsigned l; -+ -+ mutex_lock(&c->btree_trans_lock); -+ list_for_each_entry(trans, &c->btree_trans_list, list) { -+ pr_buf(out, "%i %px %ps\n", trans->pid, trans, (void *) trans->ip); -+ -+ trans_for_each_iter(trans, iter) { -+ if (!iter->nodes_locked) -+ continue; -+ -+ pr_buf(out, " iter %u %s:", -+ iter->idx, -+ bch2_btree_ids[iter->btree_id]); -+ bch2_bpos_to_text(out, iter->pos); -+ pr_buf(out, "\n"); -+ -+ for (l = 0; l < BTREE_MAX_DEPTH; l++) { -+ if (btree_node_locked(iter, l)) { -+ pr_buf(out, " %s l=%u ", -+ btree_node_intent_locked(iter, l) ? 
"i" : "r", l); -+ bch2_btree_iter_node_to_text(out, -+ (void *) iter->l[l].b, -+ btree_iter_type(iter)); -+ pr_buf(out, "\n"); -+ } -+ } -+ } -+ -+ b = READ_ONCE(trans->locking); -+ if (b) { -+ pr_buf(out, " locking iter %u l=%u %s:", -+ trans->locking_iter_idx, -+ trans->locking_level, -+ bch2_btree_ids[trans->locking_btree_id]); -+ bch2_bpos_to_text(out, trans->locking_pos); -+ -+ -+ pr_buf(out, " node "); -+ bch2_btree_iter_node_to_text(out, -+ (void *) b, -+ btree_iter_type(&trans->iters[trans->locking_iter_idx])); -+ pr_buf(out, "\n"); -+ } -+ } -+ mutex_unlock(&c->btree_trans_lock); -+#endif -+} -+ -+void bch2_fs_btree_iter_exit(struct bch_fs *c) -+{ -+ mempool_exit(&c->btree_iters_pool); -+} -+ -+int bch2_fs_btree_iter_init(struct bch_fs *c) -+{ -+ unsigned nr = BTREE_ITER_MAX; -+ -+ INIT_LIST_HEAD(&c->btree_trans_list); -+ mutex_init(&c->btree_trans_lock); -+ -+ return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, -+ sizeof(struct btree_iter) * nr + -+ sizeof(struct btree_insert_entry) * nr + -+ sizeof(struct btree_insert_entry) * nr); -+} -diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h -new file mode 100644 -index 000000000000..bd9ec3ec9a92 ---- /dev/null -+++ b/fs/bcachefs/btree_iter.h -@@ -0,0 +1,314 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BTREE_ITER_H -+#define _BCACHEFS_BTREE_ITER_H -+ -+#include "bset.h" -+#include "btree_types.h" -+ -+static inline void btree_iter_set_dirty(struct btree_iter *iter, -+ enum btree_iter_uptodate u) -+{ -+ iter->uptodate = max_t(unsigned, iter->uptodate, u); -+} -+ -+static inline struct btree *btree_iter_node(struct btree_iter *iter, -+ unsigned level) -+{ -+ return level < BTREE_MAX_DEPTH ? 
iter->l[level].b : NULL; -+} -+ -+static inline bool btree_node_lock_seq_matches(const struct btree_iter *iter, -+ const struct btree *b, unsigned level) -+{ -+ /* -+ * We don't compare the low bits of the lock sequence numbers because -+ * @iter might have taken a write lock on @b, and we don't want to skip -+ * the linked iterator if the sequence numbers were equal before taking -+ * that write lock. The lock sequence number is incremented by taking -+ * and releasing write locks and is even when unlocked: -+ */ -+ return iter->l[level].lock_seq >> 1 == b->c.lock.state.seq >> 1; -+} -+ -+static inline struct btree *btree_node_parent(struct btree_iter *iter, -+ struct btree *b) -+{ -+ return btree_iter_node(iter, b->c.level + 1); -+} -+ -+static inline bool btree_trans_has_multiple_iters(const struct btree_trans *trans) -+{ -+ return hweight64(trans->iters_linked) > 1; -+} -+ -+static inline int btree_iter_err(const struct btree_iter *iter) -+{ -+ return iter->flags & BTREE_ITER_ERROR ? 
-EIO : 0; -+} -+ -+/* Iterate over iters within a transaction: */ -+ -+#define trans_for_each_iter_all(_trans, _iter) \ -+ for (_iter = (_trans)->iters; \ -+ _iter < (_trans)->iters + (_trans)->nr_iters; \ -+ _iter++) -+ -+static inline struct btree_iter * -+__trans_next_iter(struct btree_trans *trans, unsigned idx) -+{ -+ EBUG_ON(idx < trans->nr_iters && trans->iters[idx].idx != idx); -+ -+ for (; idx < trans->nr_iters; idx++) -+ if (trans->iters_linked & (1ULL << idx)) -+ return &trans->iters[idx]; -+ -+ return NULL; -+} -+ -+#define trans_for_each_iter(_trans, _iter) \ -+ for (_iter = __trans_next_iter((_trans), 0); \ -+ (_iter); \ -+ _iter = __trans_next_iter((_trans), (_iter)->idx + 1)) -+ -+static inline bool __iter_has_node(const struct btree_iter *iter, -+ const struct btree *b) -+{ -+ return iter->l[b->c.level].b == b && -+ btree_node_lock_seq_matches(iter, b, b->c.level); -+} -+ -+static inline struct btree_iter * -+__trans_next_iter_with_node(struct btree_trans *trans, struct btree *b, -+ unsigned idx) -+{ -+ struct btree_iter *iter = __trans_next_iter(trans, idx); -+ -+ while (iter && !__iter_has_node(iter, b)) -+ iter = __trans_next_iter(trans, iter->idx + 1); -+ -+ return iter; -+} -+ -+#define trans_for_each_iter_with_node(_trans, _b, _iter) \ -+ for (_iter = __trans_next_iter_with_node((_trans), (_b), 0); \ -+ (_iter); \ -+ _iter = __trans_next_iter_with_node((_trans), (_b), \ -+ (_iter)->idx + 1)) -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_btree_trans_verify_iters(struct btree_trans *, struct btree *); -+void bch2_btree_trans_verify_locks(struct btree_trans *); -+#else -+static inline void bch2_btree_trans_verify_iters(struct btree_trans *trans, -+ struct btree *b) {} -+static inline void bch2_btree_trans_verify_locks(struct btree_trans *iter) {} -+#endif -+ -+void bch2_btree_iter_fix_key_modified(struct btree_iter *, struct btree *, -+ struct bkey_packed *); -+void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *, -+ struct 
btree_node_iter *, struct bkey_packed *, -+ unsigned, unsigned); -+ -+bool bch2_btree_iter_relock(struct btree_iter *, bool); -+bool bch2_trans_relock(struct btree_trans *); -+void bch2_trans_unlock(struct btree_trans *); -+ -+bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned); -+bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *, unsigned); -+ -+static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter, -+ unsigned new_locks_want) -+{ -+ new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); -+ -+ return iter->locks_want < new_locks_want -+ ? (!iter->trans->nounlock -+ ? __bch2_btree_iter_upgrade(iter, new_locks_want) -+ : __bch2_btree_iter_upgrade_nounlock(iter, new_locks_want)) -+ : iter->uptodate <= BTREE_ITER_NEED_PEEK; -+} -+ -+void __bch2_btree_iter_downgrade(struct btree_iter *, unsigned); -+ -+static inline void bch2_btree_iter_downgrade(struct btree_iter *iter) -+{ -+ if (iter->locks_want > (iter->flags & BTREE_ITER_INTENT) ? 1 : 0) -+ __bch2_btree_iter_downgrade(iter, 0); -+} -+ -+void bch2_trans_downgrade(struct btree_trans *); -+ -+void bch2_btree_iter_node_replace(struct btree_iter *, struct btree *); -+void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *); -+ -+void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *); -+ -+int __must_check __bch2_btree_iter_traverse(struct btree_iter *); -+ -+static inline int __must_check -+bch2_btree_iter_traverse(struct btree_iter *iter) -+{ -+ return iter->uptodate >= BTREE_ITER_NEED_RELOCK -+ ? 
__bch2_btree_iter_traverse(iter) -+ : 0; -+} -+ -+int bch2_btree_iter_traverse_all(struct btree_trans *); -+ -+struct btree *bch2_btree_iter_peek_node(struct btree_iter *); -+struct btree *bch2_btree_iter_next_node(struct btree_iter *); -+ -+struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *); -+struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); -+ -+struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *); -+struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *); -+ -+struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *); -+struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); -+ -+struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); -+struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *); -+ -+struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *); -+ -+void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos); -+void __bch2_btree_iter_set_pos(struct btree_iter *, struct bpos, bool); -+void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos); -+ -+static inline int btree_iter_cmp(const struct btree_iter *l, -+ const struct btree_iter *r) -+{ -+ return cmp_int(l->btree_id, r->btree_id) ?: -+ -cmp_int(btree_iter_type(l), btree_iter_type(r)) ?: -+ bkey_cmp(l->pos, r->pos); -+} -+ -+/* -+ * Unlocks before scheduling -+ * Note: does not revalidate iterator -+ */ -+static inline int bch2_trans_cond_resched(struct btree_trans *trans) -+{ -+ if (need_resched() || race_fault()) { -+ bch2_trans_unlock(trans); -+ schedule(); -+ return bch2_trans_relock(trans) ? 
0 : -EINTR; -+ } else { -+ return 0; -+ } -+} -+ -+#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \ -+ _locks_want, _depth, _flags, _b) \ -+ for (iter = bch2_trans_get_node_iter((_trans), (_btree_id), \ -+ _start, _locks_want, _depth, _flags), \ -+ _b = bch2_btree_iter_peek_node(_iter); \ -+ (_b); \ -+ (_b) = bch2_btree_iter_next_node(_iter)) -+ -+#define for_each_btree_node(_trans, _iter, _btree_id, _start, \ -+ _flags, _b) \ -+ __for_each_btree_node(_trans, _iter, _btree_id, _start, \ -+ 0, 0, _flags, _b) -+ -+static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, -+ unsigned flags) -+{ -+ if ((flags & BTREE_ITER_TYPE) == BTREE_ITER_CACHED) -+ return bch2_btree_iter_peek_cached(iter); -+ else -+ return flags & BTREE_ITER_SLOTS -+ ? bch2_btree_iter_peek_slot(iter) -+ : bch2_btree_iter_peek(iter); -+} -+ -+static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter, -+ unsigned flags) -+{ -+ return flags & BTREE_ITER_SLOTS -+ ? bch2_btree_iter_next_slot(iter) -+ : bch2_btree_iter_next(iter); -+} -+ -+static inline int bkey_err(struct bkey_s_c k) -+{ -+ return PTR_ERR_OR_ZERO(k.k); -+} -+ -+#define for_each_btree_key(_trans, _iter, _btree_id, \ -+ _start, _flags, _k, _ret) \ -+ for ((_ret) = PTR_ERR_OR_ZERO((_iter) = \ -+ bch2_trans_get_iter((_trans), (_btree_id), \ -+ (_start), (_flags))) ?: \ -+ PTR_ERR_OR_ZERO(((_k) = \ -+ __bch2_btree_iter_peek(_iter, _flags)).k); \ -+ !_ret && (_k).k; \ -+ (_ret) = PTR_ERR_OR_ZERO(((_k) = \ -+ __bch2_btree_iter_next(_iter, _flags)).k)) -+ -+#define for_each_btree_key_continue(_iter, _flags, _k, _ret) \ -+ for ((_k) = __bch2_btree_iter_peek(_iter, _flags); \ -+ !((_ret) = bkey_err(_k)) && (_k).k; \ -+ (_k) = __bch2_btree_iter_next(_iter, _flags)) -+ -+/* new multiple iterator interface: */ -+ -+int bch2_trans_iter_put(struct btree_trans *, struct btree_iter *); -+int bch2_trans_iter_free(struct btree_trans *, struct btree_iter *); -+ -+void bch2_trans_unlink_iters(struct 
btree_trans *); -+ -+struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id, -+ struct bpos, unsigned); -+ -+static inline struct btree_iter * -+bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id, -+ struct bpos pos, unsigned flags) -+{ -+ struct btree_iter *iter = -+ __bch2_trans_get_iter(trans, btree_id, pos, flags); -+ -+ if (!IS_ERR(iter)) -+ iter->ip_allocated = _THIS_IP_; -+ return iter; -+} -+ -+struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *, -+ struct btree_iter *); -+static inline struct btree_iter * -+bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src) -+{ -+ struct btree_iter *iter = -+ __bch2_trans_copy_iter(trans, src); -+ -+ if (!IS_ERR(iter)) -+ iter->ip_allocated = _THIS_IP_; -+ return iter; -+ -+} -+ -+struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *, -+ enum btree_id, struct bpos, -+ unsigned, unsigned, unsigned); -+ -+#define TRANS_RESET_NOTRAVERSE (1 << 0) -+ -+void bch2_trans_reset(struct btree_trans *, unsigned); -+ -+static inline void bch2_trans_begin(struct btree_trans *trans) -+{ -+ return bch2_trans_reset(trans, 0); -+} -+ -+void *bch2_trans_kmalloc(struct btree_trans *, size_t); -+void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t); -+int bch2_trans_exit(struct btree_trans *); -+ -+void bch2_btree_trans_to_text(struct printbuf *, struct bch_fs *); -+ -+void bch2_fs_btree_iter_exit(struct bch_fs *); -+int bch2_fs_btree_iter_init(struct bch_fs *); -+ -+#endif /* _BCACHEFS_BTREE_ITER_H */ -diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c -new file mode 100644 -index 000000000000..61662750dfc0 ---- /dev/null -+++ b/fs/bcachefs/btree_key_cache.c -@@ -0,0 +1,519 @@ -+ -+#include "bcachefs.h" -+#include "btree_cache.h" -+#include "btree_iter.h" -+#include "btree_key_cache.h" -+#include "btree_locking.h" -+#include "btree_update.h" -+#include "error.h" -+#include "journal.h" -+#include 
"journal_reclaim.h" -+ -+#include -+ -+static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, -+ const void *obj) -+{ -+ const struct bkey_cached *ck = obj; -+ const struct bkey_cached_key *key = arg->key; -+ -+ return cmp_int(ck->key.btree_id, key->btree_id) ?: -+ bkey_cmp(ck->key.pos, key->pos); -+} -+ -+static const struct rhashtable_params bch2_btree_key_cache_params = { -+ .head_offset = offsetof(struct bkey_cached, hash), -+ .key_offset = offsetof(struct bkey_cached, key), -+ .key_len = sizeof(struct bkey_cached_key), -+ .obj_cmpfn = bch2_btree_key_cache_cmp_fn, -+}; -+ -+__flatten -+static inline struct bkey_cached * -+btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos) -+{ -+ struct bkey_cached_key key = { -+ .btree_id = btree_id, -+ .pos = pos, -+ }; -+ -+ return rhashtable_lookup_fast(&c->btree_key_cache.table, &key, -+ bch2_btree_key_cache_params); -+} -+ -+static bool bkey_cached_lock_for_evict(struct bkey_cached *ck) -+{ -+ if (!six_trylock_intent(&ck->c.lock)) -+ return false; -+ -+ if (!six_trylock_write(&ck->c.lock)) { -+ six_unlock_intent(&ck->c.lock); -+ return false; -+ } -+ -+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { -+ six_unlock_write(&ck->c.lock); -+ six_unlock_intent(&ck->c.lock); -+ return false; -+ } -+ -+ return true; -+} -+ -+static void bkey_cached_evict(struct btree_key_cache *c, -+ struct bkey_cached *ck) -+{ -+ BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash, -+ bch2_btree_key_cache_params)); -+ memset(&ck->key, ~0, sizeof(ck->key)); -+} -+ -+static void bkey_cached_free(struct btree_key_cache *c, -+ struct bkey_cached *ck) -+{ -+ list_move(&ck->list, &c->freed); -+ -+ kfree(ck->k); -+ ck->k = NULL; -+ ck->u64s = 0; -+ -+ six_unlock_write(&ck->c.lock); -+ six_unlock_intent(&ck->c.lock); -+} -+ -+static struct bkey_cached * -+bkey_cached_alloc(struct btree_key_cache *c) -+{ -+ struct bkey_cached *ck; -+ -+ list_for_each_entry(ck, &c->freed, list) -+ if 
(bkey_cached_lock_for_evict(ck)) -+ return ck; -+ -+ list_for_each_entry(ck, &c->clean, list) -+ if (bkey_cached_lock_for_evict(ck)) { -+ bkey_cached_evict(c, ck); -+ return ck; -+ } -+ -+ ck = kzalloc(sizeof(*ck), GFP_NOFS); -+ if (!ck) -+ return NULL; -+ -+ INIT_LIST_HEAD(&ck->list); -+ six_lock_init(&ck->c.lock); -+ BUG_ON(!six_trylock_intent(&ck->c.lock)); -+ BUG_ON(!six_trylock_write(&ck->c.lock)); -+ -+ return ck; -+} -+ -+static struct bkey_cached * -+btree_key_cache_create(struct btree_key_cache *c, -+ enum btree_id btree_id, -+ struct bpos pos) -+{ -+ struct bkey_cached *ck; -+ -+ ck = bkey_cached_alloc(c); -+ if (!ck) -+ return ERR_PTR(-ENOMEM); -+ -+ ck->c.level = 0; -+ ck->c.btree_id = btree_id; -+ ck->key.btree_id = btree_id; -+ ck->key.pos = pos; -+ ck->valid = false; -+ -+ BUG_ON(ck->flags); -+ -+ if (rhashtable_lookup_insert_fast(&c->table, -+ &ck->hash, -+ bch2_btree_key_cache_params)) { -+ /* We raced with another fill: */ -+ bkey_cached_free(c, ck); -+ return NULL; -+ } -+ -+ list_move(&ck->list, &c->clean); -+ six_unlock_write(&ck->c.lock); -+ -+ return ck; -+} -+ -+static int btree_key_cache_fill(struct btree_trans *trans, -+ struct btree_iter *ck_iter, -+ struct bkey_cached *ck) -+{ -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ unsigned new_u64s = 0; -+ struct bkey_i *new_k = NULL; -+ int ret; -+ -+ iter = bch2_trans_get_iter(trans, ck->key.btree_id, -+ ck->key.pos, BTREE_ITER_SLOTS); -+ if (IS_ERR(iter)) -+ return PTR_ERR(iter); -+ -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) { -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+ } -+ -+ if (!bch2_btree_node_relock(ck_iter, 0)) { -+ bch2_trans_iter_put(trans, iter); -+ trace_transaction_restart_ip(trans->ip, _THIS_IP_); -+ return -EINTR; -+ } -+ -+ if (k.k->u64s > ck->u64s) { -+ new_u64s = roundup_pow_of_two(k.k->u64s); -+ new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS); -+ if (!new_k) { -+ bch2_trans_iter_put(trans, iter); -+ return -ENOMEM; -+ } -+ } 
-+ -+ bch2_btree_node_lock_write(ck_iter->l[0].b, ck_iter); -+ if (new_k) { -+ kfree(ck->k); -+ ck->u64s = new_u64s; -+ ck->k = new_k; -+ } -+ -+ bkey_reassemble(ck->k, k); -+ ck->valid = true; -+ bch2_btree_node_unlock_write(ck_iter->l[0].b, ck_iter); -+ -+ /* We're not likely to need this iterator again: */ -+ bch2_trans_iter_free(trans, iter); -+ -+ return 0; -+} -+ -+static int bkey_cached_check_fn(struct six_lock *lock, void *p) -+{ -+ struct bkey_cached *ck = container_of(lock, struct bkey_cached, c.lock); -+ const struct btree_iter *iter = p; -+ -+ return ck->key.btree_id == iter->btree_id && -+ !bkey_cmp(ck->key.pos, iter->pos) ? 0 : -1; -+} -+ -+int bch2_btree_iter_traverse_cached(struct btree_iter *iter) -+{ -+ struct btree_trans *trans = iter->trans; -+ struct bch_fs *c = trans->c; -+ struct bkey_cached *ck; -+ int ret = 0; -+ -+ BUG_ON(iter->level); -+ -+ if (btree_node_locked(iter, 0)) { -+ ck = (void *) iter->l[0].b; -+ goto fill; -+ } -+retry: -+ ck = btree_key_cache_find(c, iter->btree_id, iter->pos); -+ if (!ck) { -+ if (iter->flags & BTREE_ITER_CACHED_NOCREATE) { -+ iter->l[0].b = NULL; -+ return 0; -+ } -+ -+ mutex_lock(&c->btree_key_cache.lock); -+ ck = btree_key_cache_create(&c->btree_key_cache, -+ iter->btree_id, iter->pos); -+ mutex_unlock(&c->btree_key_cache.lock); -+ -+ ret = PTR_ERR_OR_ZERO(ck); -+ if (ret) -+ goto err; -+ if (!ck) -+ goto retry; -+ -+ mark_btree_node_locked(iter, 0, SIX_LOCK_intent); -+ iter->locks_want = 1; -+ } else { -+ enum six_lock_type lock_want = __btree_lock_want(iter, 0); -+ -+ if (!btree_node_lock((void *) ck, iter->pos, 0, iter, lock_want, -+ bkey_cached_check_fn, iter)) { -+ if (ck->key.btree_id != iter->btree_id || -+ bkey_cmp(ck->key.pos, iter->pos)) { -+ goto retry; -+ } -+ -+ trace_transaction_restart_ip(trans->ip, _THIS_IP_); -+ ret = -EINTR; -+ goto err; -+ } -+ -+ if (ck->key.btree_id != iter->btree_id || -+ bkey_cmp(ck->key.pos, iter->pos)) { -+ six_unlock_type(&ck->c.lock, lock_want); -+ goto retry; 
-+ } -+ -+ mark_btree_node_locked(iter, 0, lock_want); -+ } -+ -+ iter->l[0].lock_seq = ck->c.lock.state.seq; -+ iter->l[0].b = (void *) ck; -+fill: -+ if (!ck->valid && !(iter->flags & BTREE_ITER_CACHED_NOFILL)) { -+ if (!btree_node_intent_locked(iter, 0)) -+ bch2_btree_iter_upgrade(iter, 1); -+ if (!btree_node_intent_locked(iter, 0)) { -+ trace_transaction_restart_ip(trans->ip, _THIS_IP_); -+ ret = -EINTR; -+ goto err; -+ } -+ -+ ret = btree_key_cache_fill(trans, iter, ck); -+ if (ret) -+ goto err; -+ } -+ -+ iter->uptodate = BTREE_ITER_NEED_PEEK; -+ bch2_btree_iter_downgrade(iter); -+ return ret; -+err: -+ if (ret != -EINTR) { -+ btree_node_unlock(iter, 0); -+ iter->flags |= BTREE_ITER_ERROR; -+ iter->l[0].b = BTREE_ITER_NO_NODE_ERROR; -+ } -+ return ret; -+} -+ -+static int btree_key_cache_flush_pos(struct btree_trans *trans, -+ struct bkey_cached_key key, -+ u64 journal_seq, -+ bool evict) -+{ -+ struct bch_fs *c = trans->c; -+ struct journal *j = &c->journal; -+ struct btree_iter *c_iter = NULL, *b_iter = NULL; -+ struct bkey_cached *ck; -+ int ret; -+ -+ b_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos, -+ BTREE_ITER_SLOTS| -+ BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(b_iter); -+ if (ret) -+ goto out; -+ -+ c_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos, -+ BTREE_ITER_CACHED| -+ BTREE_ITER_CACHED_NOFILL| -+ BTREE_ITER_CACHED_NOCREATE| -+ BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(c_iter); -+ if (ret) -+ goto out; -+retry: -+ ret = bch2_btree_iter_traverse(c_iter); -+ if (ret) -+ goto err; -+ -+ ck = (void *) c_iter->l[0].b; -+ if (!ck || -+ (journal_seq && ck->journal.seq != journal_seq)) -+ goto out; -+ -+ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { -+ if (!evict) -+ goto out; -+ goto evict; -+ } -+ -+ ret = bch2_btree_iter_traverse(b_iter) ?: -+ bch2_trans_update(trans, b_iter, ck->k, BTREE_TRIGGER_NORUN) ?: -+ bch2_trans_commit(trans, NULL, NULL, -+ BTREE_INSERT_NOUNLOCK| -+ BTREE_INSERT_NOCHECK_RW| -+ BTREE_INSERT_NOFAIL| -+ 
BTREE_INSERT_USE_RESERVE| -+ BTREE_INSERT_USE_ALLOC_RESERVE| -+ BTREE_INSERT_JOURNAL_RESERVED| -+ BTREE_INSERT_JOURNAL_RECLAIM); -+err: -+ if (ret == -EINTR) -+ goto retry; -+ -+ BUG_ON(ret && !bch2_journal_error(j)); -+ -+ if (ret) -+ goto out; -+ -+ bch2_journal_pin_drop(j, &ck->journal); -+ bch2_journal_preres_put(j, &ck->res); -+ clear_bit(BKEY_CACHED_DIRTY, &ck->flags); -+ -+ if (!evict) { -+ mutex_lock(&c->btree_key_cache.lock); -+ list_move_tail(&ck->list, &c->btree_key_cache.clean); -+ mutex_unlock(&c->btree_key_cache.lock); -+ } else { -+evict: -+ BUG_ON(!btree_node_intent_locked(c_iter, 0)); -+ -+ mark_btree_node_unlocked(c_iter, 0); -+ c_iter->l[0].b = NULL; -+ -+ six_lock_write(&ck->c.lock, NULL, NULL); -+ -+ mutex_lock(&c->btree_key_cache.lock); -+ bkey_cached_evict(&c->btree_key_cache, ck); -+ bkey_cached_free(&c->btree_key_cache, ck); -+ mutex_unlock(&c->btree_key_cache.lock); -+ } -+out: -+ bch2_trans_iter_put(trans, b_iter); -+ bch2_trans_iter_put(trans, c_iter); -+ return ret; -+} -+ -+static void btree_key_cache_journal_flush(struct journal *j, -+ struct journal_entry_pin *pin, -+ u64 seq) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct bkey_cached *ck = -+ container_of(pin, struct bkey_cached, journal); -+ struct bkey_cached_key key; -+ struct btree_trans trans; -+ -+ six_lock_read(&ck->c.lock, NULL, NULL); -+ key = ck->key; -+ -+ if (ck->journal.seq != seq || -+ !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { -+ six_unlock_read(&ck->c.lock); -+ return; -+ } -+ six_unlock_read(&ck->c.lock); -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ btree_key_cache_flush_pos(&trans, key, seq, false); -+ bch2_trans_exit(&trans); -+} -+ -+/* -+ * Flush and evict a key from the key cache: -+ */ -+int bch2_btree_key_cache_flush(struct btree_trans *trans, -+ enum btree_id id, struct bpos pos) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_cached_key key = { id, pos }; -+ -+ /* Fastpath - assume it won't be found: */ -+ if 
(!btree_key_cache_find(c, id, pos)) -+ return 0; -+ -+ return btree_key_cache_flush_pos(trans, key, 0, true); -+} -+ -+bool bch2_btree_insert_key_cached(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_i *insert) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_cached *ck = (void *) iter->l[0].b; -+ -+ BUG_ON(insert->u64s > ck->u64s); -+ -+ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { -+ int difference; -+ -+ BUG_ON(jset_u64s(insert->u64s) > trans->journal_preres.u64s); -+ -+ difference = jset_u64s(insert->u64s) - ck->res.u64s; -+ if (difference > 0) { -+ trans->journal_preres.u64s -= difference; -+ ck->res.u64s += difference; -+ } -+ } -+ -+ bkey_copy(ck->k, insert); -+ ck->valid = true; -+ -+ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { -+ mutex_lock(&c->btree_key_cache.lock); -+ list_del_init(&ck->list); -+ -+ set_bit(BKEY_CACHED_DIRTY, &ck->flags); -+ mutex_unlock(&c->btree_key_cache.lock); -+ } -+ -+ bch2_journal_pin_update(&c->journal, trans->journal_res.seq, -+ &ck->journal, btree_key_cache_journal_flush); -+ return true; -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_btree_key_cache_verify_clean(struct btree_trans *trans, -+ enum btree_id id, struct bpos pos) -+{ -+ BUG_ON(btree_key_cache_find(trans->c, id, pos)); -+} -+#endif -+ -+void bch2_fs_btree_key_cache_exit(struct btree_key_cache *c) -+{ -+ struct bkey_cached *ck, *n; -+ -+ mutex_lock(&c->lock); -+ list_for_each_entry_safe(ck, n, &c->clean, list) { -+ kfree(ck->k); -+ kfree(ck); -+ } -+ list_for_each_entry_safe(ck, n, &c->freed, list) -+ kfree(ck); -+ mutex_unlock(&c->lock); -+ -+ rhashtable_destroy(&c->table); -+} -+ -+void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) -+{ -+ mutex_init(&c->lock); -+ INIT_LIST_HEAD(&c->freed); -+ INIT_LIST_HEAD(&c->clean); -+} -+ -+int bch2_fs_btree_key_cache_init(struct btree_key_cache *c) -+{ -+ return rhashtable_init(&c->table, &bch2_btree_key_cache_params); -+} -+ -+void 
bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) -+{ -+ struct bucket_table *tbl; -+ struct bkey_cached *ck; -+ struct rhash_head *pos; -+ size_t i; -+ -+ mutex_lock(&c->lock); -+ tbl = rht_dereference_rcu(c->table.tbl, &c->table); -+ -+ for (i = 0; i < tbl->size; i++) { -+ rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { -+ pr_buf(out, "%s:", -+ bch2_btree_ids[ck->key.btree_id]); -+ bch2_bpos_to_text(out, ck->key.pos); -+ -+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) -+ pr_buf(out, " journal seq %llu", ck->journal.seq); -+ pr_buf(out, "\n"); -+ } -+ } -+ mutex_unlock(&c->lock); -+} -diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h -new file mode 100644 -index 000000000000..b1756c6c622c ---- /dev/null -+++ b/fs/bcachefs/btree_key_cache.h -@@ -0,0 +1,25 @@ -+#ifndef _BCACHEFS_BTREE_KEY_CACHE_H -+#define _BCACHEFS_BTREE_KEY_CACHE_H -+ -+int bch2_btree_iter_traverse_cached(struct btree_iter *); -+ -+bool bch2_btree_insert_key_cached(struct btree_trans *, -+ struct btree_iter *, struct bkey_i *); -+int bch2_btree_key_cache_flush(struct btree_trans *, -+ enum btree_id, struct bpos); -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_btree_key_cache_verify_clean(struct btree_trans *, -+ enum btree_id, struct bpos); -+#else -+static inline void -+bch2_btree_key_cache_verify_clean(struct btree_trans *trans, -+ enum btree_id id, struct bpos pos) {} -+#endif -+ -+void bch2_fs_btree_key_cache_exit(struct btree_key_cache *); -+void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *); -+int bch2_fs_btree_key_cache_init(struct btree_key_cache *); -+ -+void bch2_btree_key_cache_to_text(struct printbuf *, struct btree_key_cache *); -+ -+#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */ -diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h -new file mode 100644 -index 000000000000..81fbf3e18647 ---- /dev/null -+++ b/fs/bcachefs/btree_locking.h -@@ -0,0 +1,257 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef 
_BCACHEFS_BTREE_LOCKING_H -+#define _BCACHEFS_BTREE_LOCKING_H -+ -+/* -+ * Only for internal btree use: -+ * -+ * The btree iterator tracks what locks it wants to take, and what locks it -+ * currently has - here we have wrappers for locking/unlocking btree nodes and -+ * updating the iterator state -+ */ -+ -+#include -+ -+#include "btree_iter.h" -+ -+/* matches six lock types */ -+enum btree_node_locked_type { -+ BTREE_NODE_UNLOCKED = -1, -+ BTREE_NODE_READ_LOCKED = SIX_LOCK_read, -+ BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent, -+}; -+ -+static inline int btree_node_locked_type(struct btree_iter *iter, -+ unsigned level) -+{ -+ /* -+ * We're relying on the fact that if nodes_intent_locked is set -+ * nodes_locked must be set as well, so that we can compute without -+ * branches: -+ */ -+ return BTREE_NODE_UNLOCKED + -+ ((iter->nodes_locked >> level) & 1) + -+ ((iter->nodes_intent_locked >> level) & 1); -+} -+ -+static inline bool btree_node_intent_locked(struct btree_iter *iter, -+ unsigned level) -+{ -+ return btree_node_locked_type(iter, level) == BTREE_NODE_INTENT_LOCKED; -+} -+ -+static inline bool btree_node_read_locked(struct btree_iter *iter, -+ unsigned level) -+{ -+ return btree_node_locked_type(iter, level) == BTREE_NODE_READ_LOCKED; -+} -+ -+static inline bool btree_node_locked(struct btree_iter *iter, unsigned level) -+{ -+ return iter->nodes_locked & (1 << level); -+} -+ -+static inline void mark_btree_node_unlocked(struct btree_iter *iter, -+ unsigned level) -+{ -+ iter->nodes_locked &= ~(1 << level); -+ iter->nodes_intent_locked &= ~(1 << level); -+} -+ -+static inline void mark_btree_node_locked(struct btree_iter *iter, -+ unsigned level, -+ enum six_lock_type type) -+{ -+ /* relying on this to avoid a branch */ -+ BUILD_BUG_ON(SIX_LOCK_read != 0); -+ BUILD_BUG_ON(SIX_LOCK_intent != 1); -+ -+ iter->nodes_locked |= 1 << level; -+ iter->nodes_intent_locked |= type << level; -+} -+ -+static inline void mark_btree_node_intent_locked(struct btree_iter 
*iter, -+ unsigned level) -+{ -+ mark_btree_node_locked(iter, level, SIX_LOCK_intent); -+} -+ -+static inline enum six_lock_type __btree_lock_want(struct btree_iter *iter, int level) -+{ -+ return level < iter->locks_want -+ ? SIX_LOCK_intent -+ : SIX_LOCK_read; -+} -+ -+static inline enum btree_node_locked_type -+btree_lock_want(struct btree_iter *iter, int level) -+{ -+ if (level < iter->level) -+ return BTREE_NODE_UNLOCKED; -+ if (level < iter->locks_want) -+ return BTREE_NODE_INTENT_LOCKED; -+ if (level == iter->level) -+ return BTREE_NODE_READ_LOCKED; -+ return BTREE_NODE_UNLOCKED; -+} -+ -+static inline void __btree_node_unlock(struct btree_iter *iter, unsigned level) -+{ -+ int lock_type = btree_node_locked_type(iter, level); -+ -+ EBUG_ON(level >= BTREE_MAX_DEPTH); -+ -+ if (lock_type != BTREE_NODE_UNLOCKED) -+ six_unlock_type(&iter->l[level].b->c.lock, lock_type); -+ mark_btree_node_unlocked(iter, level); -+} -+ -+static inline void btree_node_unlock(struct btree_iter *iter, unsigned level) -+{ -+ EBUG_ON(!level && iter->trans->nounlock); -+ -+ __btree_node_unlock(iter, level); -+} -+ -+static inline void __bch2_btree_iter_unlock(struct btree_iter *iter) -+{ -+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); -+ -+ while (iter->nodes_locked) -+ btree_node_unlock(iter, __ffs(iter->nodes_locked)); -+} -+ -+static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type) -+{ -+ switch (type) { -+ case SIX_LOCK_read: -+ return BCH_TIME_btree_lock_contended_read; -+ case SIX_LOCK_intent: -+ return BCH_TIME_btree_lock_contended_intent; -+ case SIX_LOCK_write: -+ return BCH_TIME_btree_lock_contended_write; -+ default: -+ BUG(); -+ } -+} -+ -+/* -+ * wrapper around six locks that just traces lock contended time -+ */ -+static inline void __btree_node_lock_type(struct bch_fs *c, struct btree *b, -+ enum six_lock_type type) -+{ -+ u64 start_time = local_clock(); -+ -+ six_lock_type(&b->c.lock, type, NULL, NULL); -+ 
bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time); -+} -+ -+static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b, -+ enum six_lock_type type) -+{ -+ if (!six_trylock_type(&b->c.lock, type)) -+ __btree_node_lock_type(c, b, type); -+} -+ -+/* -+ * Lock a btree node if we already have it locked on one of our linked -+ * iterators: -+ */ -+static inline bool btree_node_lock_increment(struct btree_trans *trans, -+ struct btree *b, unsigned level, -+ enum btree_node_locked_type want) -+{ -+ struct btree_iter *iter; -+ -+ trans_for_each_iter(trans, iter) -+ if (iter->l[level].b == b && -+ btree_node_locked_type(iter, level) >= want) { -+ six_lock_increment(&b->c.lock, want); -+ return true; -+ } -+ -+ return false; -+} -+ -+bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned, -+ struct btree_iter *, enum six_lock_type, -+ six_lock_should_sleep_fn, void *); -+ -+static inline bool btree_node_lock(struct btree *b, -+ struct bpos pos, unsigned level, -+ struct btree_iter *iter, -+ enum six_lock_type type, -+ six_lock_should_sleep_fn should_sleep_fn, void *p) -+{ -+ struct btree_trans *trans = iter->trans; -+ bool ret; -+ -+ EBUG_ON(level >= BTREE_MAX_DEPTH); -+ EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx))); -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ trans->locking = b; -+ trans->locking_iter_idx = iter->idx; -+ trans->locking_pos = pos; -+ trans->locking_btree_id = iter->btree_id; -+ trans->locking_level = level; -+#endif -+ ret = likely(six_trylock_type(&b->c.lock, type)) || -+ btree_node_lock_increment(trans, b, level, type) || -+ __bch2_btree_node_lock(b, pos, level, iter, type, -+ should_sleep_fn, p); -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ trans->locking = NULL; -+#endif -+ return ret; -+} -+ -+bool __bch2_btree_node_relock(struct btree_iter *, unsigned); -+ -+static inline bool bch2_btree_node_relock(struct btree_iter *iter, -+ unsigned level) -+{ -+ EBUG_ON(btree_node_locked(iter, level) && -+ 
btree_node_locked_type(iter, level) != -+ __btree_lock_want(iter, level)); -+ -+ return likely(btree_node_locked(iter, level)) || -+ __bch2_btree_node_relock(iter, level); -+} -+ -+/* -+ * Updates the saved lock sequence number, so that bch2_btree_node_relock() will -+ * succeed: -+ */ -+static inline void -+bch2_btree_node_unlock_write_inlined(struct btree *b, struct btree_iter *iter) -+{ -+ struct btree_iter *linked; -+ -+ EBUG_ON(iter->l[b->c.level].b != b); -+ EBUG_ON(iter->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq); -+ -+ trans_for_each_iter_with_node(iter->trans, b, linked) -+ linked->l[b->c.level].lock_seq += 2; -+ -+ six_unlock_write(&b->c.lock); -+} -+ -+void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *); -+ -+void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *); -+ -+static inline void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) -+{ -+ EBUG_ON(iter->l[b->c.level].b != b); -+ EBUG_ON(iter->l[b->c.level].lock_seq != b->c.lock.state.seq); -+ -+ if (unlikely(!six_trylock_write(&b->c.lock))) -+ __bch2_btree_node_lock_write(b, iter); -+} -+ -+#endif /* _BCACHEFS_BTREE_LOCKING_H */ -+ -+ -diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h -new file mode 100644 -index 000000000000..683b416ef427 ---- /dev/null -+++ b/fs/bcachefs/btree_types.h -@@ -0,0 +1,664 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BTREE_TYPES_H -+#define _BCACHEFS_BTREE_TYPES_H -+ -+#include -+#include -+#include -+ -+#include "bkey_methods.h" -+#include "buckets_types.h" -+#include "journal_types.h" -+ -+struct open_bucket; -+struct btree_update; -+struct btree_trans; -+ -+#define MAX_BSETS 3U -+ -+struct btree_nr_keys { -+ -+ /* -+ * Amount of live metadata (i.e. 
size of node after a compaction) in -+ * units of u64s -+ */ -+ u16 live_u64s; -+ u16 bset_u64s[MAX_BSETS]; -+ -+ /* live keys only: */ -+ u16 packed_keys; -+ u16 unpacked_keys; -+}; -+ -+struct bset_tree { -+ /* -+ * We construct a binary tree in an array as if the array -+ * started at 1, so that things line up on the same cachelines -+ * better: see comments in bset.c at cacheline_to_bkey() for -+ * details -+ */ -+ -+ /* size of the binary tree and prev array */ -+ u16 size; -+ -+ /* function of size - precalculated for to_inorder() */ -+ u16 extra; -+ -+ u16 data_offset; -+ u16 aux_data_offset; -+ u16 end_offset; -+ -+ struct bpos max_key; -+}; -+ -+struct btree_write { -+ struct journal_entry_pin journal; -+}; -+ -+struct btree_alloc { -+ struct open_buckets ob; -+ BKEY_PADDED(k); -+}; -+ -+struct btree_bkey_cached_common { -+ struct six_lock lock; -+ u8 level; -+ u8 btree_id; -+}; -+ -+struct btree { -+ struct btree_bkey_cached_common c; -+ -+ struct rhash_head hash; -+ u64 hash_val; -+ -+ unsigned long flags; -+ u16 written; -+ u8 nsets; -+ u8 nr_key_bits; -+ -+ struct bkey_format format; -+ -+ struct btree_node *data; -+ void *aux_data; -+ -+ /* -+ * Sets of sorted keys - the real btree node - plus a binary search tree -+ * -+ * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point -+ * to the memory we have allocated for this btree node. Additionally, -+ * set[0]->data points to the entire btree node as it exists on disk. -+ */ -+ struct bset_tree set[MAX_BSETS]; -+ -+ struct btree_nr_keys nr; -+ u16 sib_u64s[2]; -+ u16 whiteout_u64s; -+ u8 byte_order; -+ u8 unpack_fn_len; -+ -+ /* -+ * XXX: add a delete sequence number, so when bch2_btree_node_relock() -+ * fails because the lock sequence number has changed - i.e. 
the -+ * contents were modified - we can still relock the node if it's still -+ * the one we want, without redoing the traversal -+ */ -+ -+ /* -+ * For asynchronous splits/interior node updates: -+ * When we do a split, we allocate new child nodes and update the parent -+ * node to point to them: we update the parent in memory immediately, -+ * but then we must wait until the children have been written out before -+ * the update to the parent can be written - this is a list of the -+ * btree_updates that are blocking this node from being -+ * written: -+ */ -+ struct list_head write_blocked; -+ -+ /* -+ * Also for asynchronous splits/interior node updates: -+ * If a btree node isn't reachable yet, we don't want to kick off -+ * another write - because that write also won't yet be reachable and -+ * marking it as completed before it's reachable would be incorrect: -+ */ -+ unsigned long will_make_reachable; -+ -+ struct open_buckets ob; -+ -+ /* lru list */ -+ struct list_head list; -+ -+ struct btree_write writes[2]; -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ bool *expensive_debug_checks; -+#endif -+ -+ /* Key/pointer for this btree node */ -+ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); -+}; -+ -+struct btree_cache { -+ struct rhashtable table; -+ bool table_init_done; -+ /* -+ * We never free a struct btree, except on shutdown - we just put it on -+ * the btree_cache_freed list and reuse it later. This simplifies the -+ * code, and it doesn't cost us much memory as the memory usage is -+ * dominated by buffers that hold the actual btree node data and those -+ * can be freed - and the number of struct btrees allocated is -+ * effectively bounded. -+ * -+ * btree_cache_freeable effectively is a small cache - we use it because -+ * high order page allocations can be rather expensive, and it's quite -+ * common to delete and allocate btree nodes in quick succession. It -+ * should never grow past ~2-3 nodes in practice. 
-+ */ -+ struct mutex lock; -+ struct list_head live; -+ struct list_head freeable; -+ struct list_head freed; -+ -+ /* Number of elements in live + freeable lists */ -+ unsigned used; -+ unsigned reserve; -+ struct shrinker shrink; -+ -+ /* -+ * If we need to allocate memory for a new btree node and that -+ * allocation fails, we can cannibalize another node in the btree cache -+ * to satisfy the allocation - lock to guarantee only one thread does -+ * this at a time: -+ */ -+ struct task_struct *alloc_lock; -+ struct closure_waitlist alloc_wait; -+}; -+ -+struct btree_node_iter { -+ struct btree_node_iter_set { -+ u16 k, end; -+ } data[MAX_BSETS]; -+}; -+ -+enum btree_iter_type { -+ BTREE_ITER_KEYS, -+ BTREE_ITER_NODES, -+ BTREE_ITER_CACHED, -+}; -+ -+#define BTREE_ITER_TYPE ((1 << 2) - 1) -+ -+/* -+ * Iterate over all possible positions, synthesizing deleted keys for holes: -+ */ -+#define BTREE_ITER_SLOTS (1 << 2) -+/* -+ * Indicates that intent locks should be taken on leaf nodes, because we expect -+ * to be doing updates: -+ */ -+#define BTREE_ITER_INTENT (1 << 3) -+/* -+ * Causes the btree iterator code to prefetch additional btree nodes from disk: -+ */ -+#define BTREE_ITER_PREFETCH (1 << 4) -+/* -+ * Indicates that this iterator should not be reused until transaction commit, -+ * either because a pending update references it or because the update depends -+ * on that particular key being locked (e.g. 
by the str_hash code, for hash -+ * table consistency) -+ */ -+#define BTREE_ITER_KEEP_UNTIL_COMMIT (1 << 5) -+/* -+ * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for -+ * @pos or the first key strictly greater than @pos -+ */ -+#define BTREE_ITER_IS_EXTENTS (1 << 6) -+#define BTREE_ITER_ERROR (1 << 7) -+#define BTREE_ITER_SET_POS_AFTER_COMMIT (1 << 8) -+#define BTREE_ITER_CACHED_NOFILL (1 << 9) -+#define BTREE_ITER_CACHED_NOCREATE (1 << 10) -+ -+#define BTREE_ITER_USER_FLAGS \ -+ (BTREE_ITER_SLOTS \ -+ |BTREE_ITER_INTENT \ -+ |BTREE_ITER_PREFETCH \ -+ |BTREE_ITER_CACHED_NOFILL \ -+ |BTREE_ITER_CACHED_NOCREATE) -+ -+enum btree_iter_uptodate { -+ BTREE_ITER_UPTODATE = 0, -+ BTREE_ITER_NEED_PEEK = 1, -+ BTREE_ITER_NEED_RELOCK = 2, -+ BTREE_ITER_NEED_TRAVERSE = 3, -+}; -+ -+#define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1) -+#define BTREE_ITER_NO_NODE_DROP ((struct btree *) 2) -+#define BTREE_ITER_NO_NODE_LOCK_ROOT ((struct btree *) 3) -+#define BTREE_ITER_NO_NODE_UP ((struct btree *) 4) -+#define BTREE_ITER_NO_NODE_DOWN ((struct btree *) 5) -+#define BTREE_ITER_NO_NODE_INIT ((struct btree *) 6) -+#define BTREE_ITER_NO_NODE_ERROR ((struct btree *) 7) -+ -+/* -+ * @pos - iterator's current position -+ * @level - current btree depth -+ * @locks_want - btree level below which we start taking intent locks -+ * @nodes_locked - bitmask indicating which nodes in @nodes are locked -+ * @nodes_intent_locked - bitmask indicating which locks are intent locks -+ */ -+struct btree_iter { -+ struct btree_trans *trans; -+ struct bpos pos; -+ struct bpos pos_after_commit; -+ -+ u16 flags; -+ u8 idx; -+ -+ enum btree_id btree_id:4; -+ enum btree_iter_uptodate uptodate:4; -+ unsigned level:4, -+ min_depth:4, -+ locks_want:4, -+ nodes_locked:4, -+ nodes_intent_locked:4; -+ -+ struct btree_iter_level { -+ struct btree *b; -+ struct btree_node_iter iter; -+ u32 lock_seq; -+ } l[BTREE_MAX_DEPTH]; -+ -+ /* -+ * Current unpacked key - so that 
bch2_btree_iter_next()/ -+ * bch2_btree_iter_next_slot() can correctly advance pos. -+ */ -+ struct bkey k; -+ unsigned long ip_allocated; -+}; -+ -+static inline enum btree_iter_type -+btree_iter_type(const struct btree_iter *iter) -+{ -+ return iter->flags & BTREE_ITER_TYPE; -+} -+ -+static inline struct btree_iter_level *iter_l(struct btree_iter *iter) -+{ -+ return iter->l + iter->level; -+} -+ -+struct btree_key_cache { -+ struct mutex lock; -+ struct rhashtable table; -+ struct list_head freed; -+ struct list_head clean; -+}; -+ -+struct bkey_cached_key { -+ u32 btree_id; -+ struct bpos pos; -+} __attribute__((packed, aligned(4))); -+ -+#define BKEY_CACHED_DIRTY 0 -+ -+struct bkey_cached { -+ struct btree_bkey_cached_common c; -+ -+ unsigned long flags; -+ u8 u64s; -+ bool valid; -+ struct bkey_cached_key key; -+ -+ struct rhash_head hash; -+ struct list_head list; -+ -+ struct journal_preres res; -+ struct journal_entry_pin journal; -+ -+ struct bkey_i *k; -+}; -+ -+struct btree_insert_entry { -+ unsigned trigger_flags; -+ unsigned trans_triggers_run:1; -+ struct bkey_i *k; -+ struct btree_iter *iter; -+}; -+ -+#ifndef CONFIG_LOCKDEP -+#define BTREE_ITER_MAX 64 -+#else -+#define BTREE_ITER_MAX 32 -+#endif -+ -+struct btree_trans { -+ struct bch_fs *c; -+#ifdef CONFIG_BCACHEFS_DEBUG -+ struct list_head list; -+ struct btree *locking; -+ unsigned locking_iter_idx; -+ struct bpos locking_pos; -+ u8 locking_btree_id; -+ u8 locking_level; -+ pid_t pid; -+#endif -+ unsigned long ip; -+ -+ u64 iters_linked; -+ u64 iters_live; -+ u64 iters_touched; -+ -+ u8 nr_iters; -+ u8 nr_updates; -+ u8 nr_updates2; -+ u8 size; -+ unsigned used_mempool:1; -+ unsigned error:1; -+ unsigned nounlock:1; -+ unsigned need_reset:1; -+ unsigned in_traverse_all:1; -+ -+ unsigned mem_top; -+ unsigned mem_bytes; -+ void *mem; -+ -+ struct btree_iter *iters; -+ struct btree_insert_entry *updates; -+ struct btree_insert_entry *updates2; -+ -+ /* update path: */ -+ struct jset_entry 
*extra_journal_entries; -+ unsigned extra_journal_entry_u64s; -+ struct journal_entry_pin *journal_pin; -+ -+ struct journal_res journal_res; -+ struct journal_preres journal_preres; -+ u64 *journal_seq; -+ struct disk_reservation *disk_res; -+ unsigned flags; -+ unsigned journal_u64s; -+ unsigned journal_preres_u64s; -+ struct replicas_delta_list *fs_usage_deltas; -+ -+ struct btree_iter iters_onstack[2]; -+ struct btree_insert_entry updates_onstack[2]; -+ struct btree_insert_entry updates2_onstack[2]; -+}; -+ -+#define BTREE_FLAG(flag) \ -+static inline bool btree_node_ ## flag(struct btree *b) \ -+{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ -+ \ -+static inline void set_btree_node_ ## flag(struct btree *b) \ -+{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \ -+ \ -+static inline void clear_btree_node_ ## flag(struct btree *b) \ -+{ clear_bit(BTREE_NODE_ ## flag, &b->flags); } -+ -+enum btree_flags { -+ BTREE_NODE_read_in_flight, -+ BTREE_NODE_read_error, -+ BTREE_NODE_dirty, -+ BTREE_NODE_need_write, -+ BTREE_NODE_noevict, -+ BTREE_NODE_write_idx, -+ BTREE_NODE_accessed, -+ BTREE_NODE_write_in_flight, -+ BTREE_NODE_just_written, -+ BTREE_NODE_dying, -+ BTREE_NODE_fake, -+ BTREE_NODE_old_extent_overwrite, -+ BTREE_NODE_need_rewrite, -+}; -+ -+BTREE_FLAG(read_in_flight); -+BTREE_FLAG(read_error); -+BTREE_FLAG(dirty); -+BTREE_FLAG(need_write); -+BTREE_FLAG(noevict); -+BTREE_FLAG(write_idx); -+BTREE_FLAG(accessed); -+BTREE_FLAG(write_in_flight); -+BTREE_FLAG(just_written); -+BTREE_FLAG(dying); -+BTREE_FLAG(fake); -+BTREE_FLAG(old_extent_overwrite); -+BTREE_FLAG(need_rewrite); -+ -+static inline struct btree_write *btree_current_write(struct btree *b) -+{ -+ return b->writes + btree_node_write_idx(b); -+} -+ -+static inline struct btree_write *btree_prev_write(struct btree *b) -+{ -+ return b->writes + (btree_node_write_idx(b) ^ 1); -+} -+ -+static inline struct bset_tree *bset_tree_last(struct btree *b) -+{ -+ EBUG_ON(!b->nsets); -+ return b->set + b->nsets 
- 1; -+} -+ -+static inline void * -+__btree_node_offset_to_ptr(const struct btree *b, u16 offset) -+{ -+ return (void *) ((u64 *) b->data + 1 + offset); -+} -+ -+static inline u16 -+__btree_node_ptr_to_offset(const struct btree *b, const void *p) -+{ -+ u16 ret = (u64 *) p - 1 - (u64 *) b->data; -+ -+ EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p); -+ return ret; -+} -+ -+static inline struct bset *bset(const struct btree *b, -+ const struct bset_tree *t) -+{ -+ return __btree_node_offset_to_ptr(b, t->data_offset); -+} -+ -+static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t) -+{ -+ t->end_offset = -+ __btree_node_ptr_to_offset(b, vstruct_last(bset(b, t))); -+} -+ -+static inline void set_btree_bset(struct btree *b, struct bset_tree *t, -+ const struct bset *i) -+{ -+ t->data_offset = __btree_node_ptr_to_offset(b, i); -+ set_btree_bset_end(b, t); -+} -+ -+static inline struct bset *btree_bset_first(struct btree *b) -+{ -+ return bset(b, b->set); -+} -+ -+static inline struct bset *btree_bset_last(struct btree *b) -+{ -+ return bset(b, bset_tree_last(b)); -+} -+ -+static inline u16 -+__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k) -+{ -+ return __btree_node_ptr_to_offset(b, k); -+} -+ -+static inline struct bkey_packed * -+__btree_node_offset_to_key(const struct btree *b, u16 k) -+{ -+ return __btree_node_offset_to_ptr(b, k); -+} -+ -+static inline unsigned btree_bkey_first_offset(const struct bset_tree *t) -+{ -+ return t->data_offset + offsetof(struct bset, _data) / sizeof(u64); -+} -+ -+#define btree_bkey_first(_b, _t) \ -+({ \ -+ EBUG_ON(bset(_b, _t)->start != \ -+ __btree_node_offset_to_key(_b, btree_bkey_first_offset(_t)));\ -+ \ -+ bset(_b, _t)->start; \ -+}) -+ -+#define btree_bkey_last(_b, _t) \ -+({ \ -+ EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) != \ -+ vstruct_last(bset(_b, _t))); \ -+ \ -+ __btree_node_offset_to_key(_b, (_t)->end_offset); \ -+}) -+ -+static inline unsigned 
bset_u64s(struct bset_tree *t) -+{ -+ return t->end_offset - t->data_offset - -+ sizeof(struct bset) / sizeof(u64); -+} -+ -+static inline unsigned bset_dead_u64s(struct btree *b, struct bset_tree *t) -+{ -+ return bset_u64s(t) - b->nr.bset_u64s[t - b->set]; -+} -+ -+static inline unsigned bset_byte_offset(struct btree *b, void *i) -+{ -+ return i - (void *) b->data; -+} -+ -+enum btree_node_type { -+#define x(kwd, val, name) BKEY_TYPE_##kwd = val, -+ BCH_BTREE_IDS() -+#undef x -+ BKEY_TYPE_BTREE, -+}; -+ -+/* Type of a key in btree @id at level @level: */ -+static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id) -+{ -+ return level ? BKEY_TYPE_BTREE : (enum btree_node_type) id; -+} -+ -+/* Type of keys @b contains: */ -+static inline enum btree_node_type btree_node_type(struct btree *b) -+{ -+ return __btree_node_type(b->c.level, b->c.btree_id); -+} -+ -+static inline bool btree_node_type_is_extents(enum btree_node_type type) -+{ -+ switch (type) { -+ case BKEY_TYPE_EXTENTS: -+ case BKEY_TYPE_REFLINK: -+ return true; -+ default: -+ return false; -+ } -+} -+ -+static inline bool btree_node_is_extents(struct btree *b) -+{ -+ return btree_node_type_is_extents(btree_node_type(b)); -+} -+ -+static inline enum btree_node_type btree_iter_key_type(struct btree_iter *iter) -+{ -+ return __btree_node_type(iter->level, iter->btree_id); -+} -+ -+static inline bool btree_iter_is_extents(struct btree_iter *iter) -+{ -+ return btree_node_type_is_extents(btree_iter_key_type(iter)); -+} -+ -+#define BTREE_NODE_TYPE_HAS_TRIGGERS \ -+ ((1U << BKEY_TYPE_EXTENTS)| \ -+ (1U << BKEY_TYPE_ALLOC)| \ -+ (1U << BKEY_TYPE_INODES)| \ -+ (1U << BKEY_TYPE_REFLINK)| \ -+ (1U << BKEY_TYPE_EC)| \ -+ (1U << BKEY_TYPE_BTREE)) -+ -+#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ -+ ((1U << BKEY_TYPE_EXTENTS)| \ -+ (1U << BKEY_TYPE_INODES)| \ -+ (1U << BKEY_TYPE_REFLINK)) -+ -+enum btree_trigger_flags { -+ __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ -+ -+ 
__BTREE_TRIGGER_INSERT, -+ __BTREE_TRIGGER_OVERWRITE, -+ __BTREE_TRIGGER_OVERWRITE_SPLIT, -+ -+ __BTREE_TRIGGER_GC, -+ __BTREE_TRIGGER_BUCKET_INVALIDATE, -+ __BTREE_TRIGGER_ALLOC_READ, -+ __BTREE_TRIGGER_NOATOMIC, -+}; -+ -+#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) -+ -+#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT) -+#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE) -+#define BTREE_TRIGGER_OVERWRITE_SPLIT (1U << __BTREE_TRIGGER_OVERWRITE_SPLIT) -+ -+#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) -+#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) -+#define BTREE_TRIGGER_ALLOC_READ (1U << __BTREE_TRIGGER_ALLOC_READ) -+#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC) -+ -+static inline bool btree_node_type_needs_gc(enum btree_node_type type) -+{ -+ return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type); -+} -+ -+struct btree_root { -+ struct btree *b; -+ -+ /* On disk root - see async splits: */ -+ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); -+ u8 level; -+ u8 alive; -+ s8 error; -+}; -+ -+/* -+ * Optional hook that will be called just prior to a btree node update, when -+ * we're holding the write lock and we know what key is about to be overwritten: -+ */ -+ -+enum btree_insert_ret { -+ BTREE_INSERT_OK, -+ /* leaf node needs to be split */ -+ BTREE_INSERT_BTREE_NODE_FULL, -+ BTREE_INSERT_ENOSPC, -+ BTREE_INSERT_NEED_MARK_REPLICAS, -+ BTREE_INSERT_NEED_JOURNAL_RES, -+}; -+ -+enum btree_gc_coalesce_fail_reason { -+ BTREE_GC_COALESCE_FAIL_RESERVE_GET, -+ BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC, -+ BTREE_GC_COALESCE_FAIL_FORMAT_FITS, -+}; -+ -+enum btree_node_sibling { -+ btree_prev_sib, -+ btree_next_sib, -+}; -+ -+typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *, -+ struct btree *, -+ struct btree_node_iter *); -+ -+#endif /* _BCACHEFS_BTREE_TYPES_H */ -diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h -new file mode 100644 -index 
000000000000..e0b1bde37484 ---- /dev/null -+++ b/fs/bcachefs/btree_update.h -@@ -0,0 +1,144 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BTREE_UPDATE_H -+#define _BCACHEFS_BTREE_UPDATE_H -+ -+#include "btree_iter.h" -+#include "journal.h" -+ -+struct bch_fs; -+struct btree; -+ -+void bch2_btree_node_lock_for_insert(struct bch_fs *, struct btree *, -+ struct btree_iter *); -+bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *, -+ struct btree_node_iter *, struct bkey_i *); -+void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); -+ -+enum btree_insert_flags { -+ __BTREE_INSERT_NOUNLOCK, -+ __BTREE_INSERT_NOFAIL, -+ __BTREE_INSERT_NOCHECK_RW, -+ __BTREE_INSERT_LAZY_RW, -+ __BTREE_INSERT_USE_RESERVE, -+ __BTREE_INSERT_USE_ALLOC_RESERVE, -+ __BTREE_INSERT_JOURNAL_REPLAY, -+ __BTREE_INSERT_JOURNAL_RESERVED, -+ __BTREE_INSERT_JOURNAL_RECLAIM, -+ __BTREE_INSERT_NOWAIT, -+ __BTREE_INSERT_GC_LOCK_HELD, -+ __BCH_HASH_SET_MUST_CREATE, -+ __BCH_HASH_SET_MUST_REPLACE, -+}; -+ -+/* -+ * Don't drop locks _after_ successfully updating btree: -+ */ -+#define BTREE_INSERT_NOUNLOCK (1 << __BTREE_INSERT_NOUNLOCK) -+ -+/* Don't check for -ENOSPC: */ -+#define BTREE_INSERT_NOFAIL (1 << __BTREE_INSERT_NOFAIL) -+ -+#define BTREE_INSERT_NOCHECK_RW (1 << __BTREE_INSERT_NOCHECK_RW) -+#define BTREE_INSERT_LAZY_RW (1 << __BTREE_INSERT_LAZY_RW) -+ -+/* for copygc, or when merging btree nodes */ -+#define BTREE_INSERT_USE_RESERVE (1 << __BTREE_INSERT_USE_RESERVE) -+#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << __BTREE_INSERT_USE_ALLOC_RESERVE) -+ -+/* Insert is for journal replay - don't get journal reservations: */ -+#define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY) -+ -+/* Indicates that we have pre-reserved space in the journal: */ -+#define BTREE_INSERT_JOURNAL_RESERVED (1 << __BTREE_INSERT_JOURNAL_RESERVED) -+ -+/* Insert is being called from journal reclaim path: */ -+#define BTREE_INSERT_JOURNAL_RECLAIM (1 << 
__BTREE_INSERT_JOURNAL_RECLAIM) -+ -+/* Don't block on allocation failure (for new btree nodes: */ -+#define BTREE_INSERT_NOWAIT (1 << __BTREE_INSERT_NOWAIT) -+#define BTREE_INSERT_GC_LOCK_HELD (1 << __BTREE_INSERT_GC_LOCK_HELD) -+ -+#define BCH_HASH_SET_MUST_CREATE (1 << __BCH_HASH_SET_MUST_CREATE) -+#define BCH_HASH_SET_MUST_REPLACE (1 << __BCH_HASH_SET_MUST_REPLACE) -+ -+int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); -+ -+int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *); -+int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, -+ struct disk_reservation *, u64 *, int flags); -+ -+int bch2_btree_delete_at_range(struct btree_trans *, struct btree_iter *, -+ struct bpos, u64 *); -+int bch2_btree_delete_range(struct bch_fs *, enum btree_id, -+ struct bpos, struct bpos, u64 *); -+ -+int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *, -+ __le64, unsigned); -+int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *, -+ struct btree *, struct bkey_i *); -+ -+int bch2_trans_update(struct btree_trans *, struct btree_iter *, -+ struct bkey_i *, enum btree_trigger_flags); -+int __bch2_trans_commit(struct btree_trans *); -+ -+/** -+ * bch2_trans_commit - insert keys at given iterator positions -+ * -+ * This is main entry point for btree updates. -+ * -+ * Return values: -+ * -EINTR: locking changed, this function should be called again. 
-+ * -EROFS: filesystem read only -+ * -EIO: journal or btree node IO error -+ */ -+static inline int bch2_trans_commit(struct btree_trans *trans, -+ struct disk_reservation *disk_res, -+ u64 *journal_seq, -+ unsigned flags) -+{ -+ trans->disk_res = disk_res; -+ trans->journal_seq = journal_seq; -+ trans->flags = flags; -+ -+ return __bch2_trans_commit(trans); -+} -+ -+#define __bch2_trans_do(_trans, _disk_res, _journal_seq, _flags, _do) \ -+({ \ -+ int _ret; \ -+ \ -+ while (1) { \ -+ _ret = (_do) ?: bch2_trans_commit(_trans, (_disk_res), \ -+ (_journal_seq), (_flags)); \ -+ if (_ret != -EINTR) \ -+ break; \ -+ bch2_trans_reset(_trans, 0); \ -+ } \ -+ \ -+ _ret; \ -+}) -+ -+#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \ -+({ \ -+ struct btree_trans trans; \ -+ int _ret, _ret2; \ -+ \ -+ bch2_trans_init(&trans, (_c), 0, 0); \ -+ _ret = __bch2_trans_do(&trans, _disk_res, _journal_seq, _flags, \ -+ _do); \ -+ _ret2 = bch2_trans_exit(&trans); \ -+ \ -+ _ret ?: _ret2; \ -+}) -+ -+#define trans_for_each_update(_trans, _i) \ -+ for ((_i) = (_trans)->updates; \ -+ (_i) < (_trans)->updates + (_trans)->nr_updates; \ -+ (_i)++) -+ -+#define trans_for_each_update2(_trans, _i) \ -+ for ((_i) = (_trans)->updates2; \ -+ (_i) < (_trans)->updates2 + (_trans)->nr_updates2; \ -+ (_i)++) -+ -+#endif /* _BCACHEFS_BTREE_UPDATE_H */ -diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c -new file mode 100644 -index 000000000000..a2604b0ce2d8 ---- /dev/null -+++ b/fs/bcachefs/btree_update_interior.c -@@ -0,0 +1,2075 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "alloc_foreground.h" -+#include "bkey_methods.h" -+#include "btree_cache.h" -+#include "btree_gc.h" -+#include "btree_update.h" -+#include "btree_update_interior.h" -+#include "btree_io.h" -+#include "btree_iter.h" -+#include "btree_locking.h" -+#include "buckets.h" -+#include "extents.h" -+#include "journal.h" -+#include "journal_reclaim.h" 
-+#include "keylist.h" -+#include "replicas.h" -+#include "super-io.h" -+ -+#include -+#include -+ -+/* Debug code: */ -+ -+/* -+ * Verify that child nodes correctly span parent node's range: -+ */ -+static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) -+{ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ struct bpos next_node = b->data->min_key; -+ struct btree_node_iter iter; -+ struct bkey_s_c k; -+ struct bkey_s_c_btree_ptr_v2 bp; -+ struct bkey unpacked; -+ -+ BUG_ON(!b->c.level); -+ -+ if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)) -+ return; -+ -+ bch2_btree_node_iter_init_from_start(&iter, b); -+ -+ while (1) { -+ k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked); -+ if (k.k->type != KEY_TYPE_btree_ptr_v2) -+ break; -+ bp = bkey_s_c_to_btree_ptr_v2(k); -+ -+ BUG_ON(bkey_cmp(next_node, bp.v->min_key)); -+ -+ bch2_btree_node_iter_advance(&iter, b); -+ -+ if (bch2_btree_node_iter_end(&iter)) { -+ BUG_ON(bkey_cmp(k.k->p, b->key.k.p)); -+ break; -+ } -+ -+ next_node = bkey_successor(k.k->p); -+ } -+#endif -+} -+ -+/* Calculate ideal packed bkey format for new btree nodes: */ -+ -+void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b) -+{ -+ struct bkey_packed *k; -+ struct bset_tree *t; -+ struct bkey uk; -+ -+ bch2_bkey_format_add_pos(s, b->data->min_key); -+ -+ for_each_bset(b, t) -+ bset_tree_for_each_key(b, t, k) -+ if (!bkey_whiteout(k)) { -+ uk = bkey_unpack_key(b, k); -+ bch2_bkey_format_add_key(s, &uk); -+ } -+} -+ -+static struct bkey_format bch2_btree_calc_format(struct btree *b) -+{ -+ struct bkey_format_state s; -+ -+ bch2_bkey_format_init(&s); -+ __bch2_btree_calc_format(&s, b); -+ -+ return bch2_bkey_format_done(&s); -+} -+ -+static size_t btree_node_u64s_with_format(struct btree *b, -+ struct bkey_format *new_f) -+{ -+ struct bkey_format *old_f = &b->format; -+ -+ /* stupid integer promotion rules */ -+ ssize_t delta = -+ (((int) new_f->key_u64s - old_f->key_u64s) * -+ (int) b->nr.packed_keys) + -+ 
(((int) new_f->key_u64s - BKEY_U64s) * -+ (int) b->nr.unpacked_keys); -+ -+ BUG_ON(delta + b->nr.live_u64s < 0); -+ -+ return b->nr.live_u64s + delta; -+} -+ -+/** -+ * btree_node_format_fits - check if we could rewrite node with a new format -+ * -+ * This assumes all keys can pack with the new format -- it just checks if -+ * the re-packed keys would fit inside the node itself. -+ */ -+bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b, -+ struct bkey_format *new_f) -+{ -+ size_t u64s = btree_node_u64s_with_format(b, new_f); -+ -+ return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c); -+} -+ -+/* Btree node freeing/allocation: */ -+ -+static void __btree_node_free(struct bch_fs *c, struct btree *b) -+{ -+ trace_btree_node_free(c, b); -+ -+ BUG_ON(btree_node_dirty(b)); -+ BUG_ON(btree_node_need_write(b)); -+ BUG_ON(b == btree_node_root(c, b)); -+ BUG_ON(b->ob.nr); -+ BUG_ON(!list_empty(&b->write_blocked)); -+ BUG_ON(b->will_make_reachable); -+ -+ clear_btree_node_noevict(b); -+ -+ bch2_btree_node_hash_remove(&c->btree_cache, b); -+ -+ mutex_lock(&c->btree_cache.lock); -+ list_move(&b->list, &c->btree_cache.freeable); -+ mutex_unlock(&c->btree_cache.lock); -+} -+ -+void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b) -+{ -+ struct open_buckets ob = b->ob; -+ -+ b->ob.nr = 0; -+ -+ clear_btree_node_dirty(b); -+ -+ btree_node_lock_type(c, b, SIX_LOCK_write); -+ __btree_node_free(c, b); -+ six_unlock_write(&b->c.lock); -+ -+ bch2_open_buckets_put(c, &ob); -+} -+ -+void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b, -+ struct btree_iter *iter) -+{ -+ struct btree_iter *linked; -+ -+ trans_for_each_iter(iter->trans, linked) -+ BUG_ON(linked->l[b->c.level].b == b); -+ -+ six_lock_write(&b->c.lock, NULL, NULL); -+ __btree_node_free(c, b); -+ six_unlock_write(&b->c.lock); -+ six_unlock_intent(&b->c.lock); -+} -+ -+static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, -+ struct disk_reservation *res, 
-+ struct closure *cl, -+ unsigned flags) -+{ -+ struct write_point *wp; -+ struct btree *b; -+ BKEY_PADDED(k) tmp; -+ struct open_buckets ob = { .nr = 0 }; -+ struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; -+ unsigned nr_reserve; -+ enum alloc_reserve alloc_reserve; -+ -+ if (flags & BTREE_INSERT_USE_ALLOC_RESERVE) { -+ nr_reserve = 0; -+ alloc_reserve = RESERVE_ALLOC; -+ } else if (flags & BTREE_INSERT_USE_RESERVE) { -+ nr_reserve = BTREE_NODE_RESERVE / 2; -+ alloc_reserve = RESERVE_BTREE; -+ } else { -+ nr_reserve = BTREE_NODE_RESERVE; -+ alloc_reserve = RESERVE_NONE; -+ } -+ -+ mutex_lock(&c->btree_reserve_cache_lock); -+ if (c->btree_reserve_cache_nr > nr_reserve) { -+ struct btree_alloc *a = -+ &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; -+ -+ ob = a->ob; -+ bkey_copy(&tmp.k, &a->k); -+ mutex_unlock(&c->btree_reserve_cache_lock); -+ goto mem_alloc; -+ } -+ mutex_unlock(&c->btree_reserve_cache_lock); -+ -+retry: -+ wp = bch2_alloc_sectors_start(c, c->opts.foreground_target, 0, -+ writepoint_ptr(&c->btree_write_point), -+ &devs_have, -+ res->nr_replicas, -+ c->opts.metadata_replicas_required, -+ alloc_reserve, 0, cl); -+ if (IS_ERR(wp)) -+ return ERR_CAST(wp); -+ -+ if (wp->sectors_free < c->opts.btree_node_size) { -+ struct open_bucket *ob; -+ unsigned i; -+ -+ open_bucket_for_each(c, &wp->ptrs, ob, i) -+ if (ob->sectors_free < c->opts.btree_node_size) -+ ob->sectors_free = 0; -+ -+ bch2_alloc_sectors_done(c, wp); -+ goto retry; -+ } -+ -+ if (c->sb.features & (1ULL << BCH_FEATURE_btree_ptr_v2)) -+ bkey_btree_ptr_v2_init(&tmp.k); -+ else -+ bkey_btree_ptr_init(&tmp.k); -+ -+ bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, c->opts.btree_node_size); -+ -+ bch2_open_bucket_get(c, wp, &ob); -+ bch2_alloc_sectors_done(c, wp); -+mem_alloc: -+ b = bch2_btree_node_mem_alloc(c); -+ -+ /* we hold cannibalize_lock: */ -+ BUG_ON(IS_ERR(b)); -+ BUG_ON(b->ob.nr); -+ -+ bkey_copy(&b->key, &tmp.k); -+ b->ob = ob; -+ -+ return b; -+} -+ -+static struct 
btree *bch2_btree_node_alloc(struct btree_update *as, unsigned level) -+{ -+ struct bch_fs *c = as->c; -+ struct btree *b; -+ int ret; -+ -+ BUG_ON(level >= BTREE_MAX_DEPTH); -+ BUG_ON(!as->nr_prealloc_nodes); -+ -+ b = as->prealloc_nodes[--as->nr_prealloc_nodes]; -+ -+ set_btree_node_accessed(b); -+ set_btree_node_dirty(b); -+ set_btree_node_need_write(b); -+ -+ bch2_bset_init_first(b, &b->data->keys); -+ b->c.level = level; -+ b->c.btree_id = as->btree_id; -+ -+ memset(&b->nr, 0, sizeof(b->nr)); -+ b->data->magic = cpu_to_le64(bset_magic(c)); -+ b->data->flags = 0; -+ SET_BTREE_NODE_ID(b->data, as->btree_id); -+ SET_BTREE_NODE_LEVEL(b->data, level); -+ b->data->ptr = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)).start->ptr; -+ -+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { -+ struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key); -+ -+ bp->v.mem_ptr = 0; -+ bp->v.seq = b->data->keys.seq; -+ bp->v.sectors_written = 0; -+ bp->v.sectors = cpu_to_le16(c->opts.btree_node_size); -+ } -+ -+ if (c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite)) -+ SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true); -+ -+ if (btree_node_is_extents(b) && -+ !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) { -+ set_btree_node_old_extent_overwrite(b); -+ set_btree_node_need_rewrite(b); -+ } -+ -+ bch2_btree_build_aux_trees(b); -+ -+ ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id); -+ BUG_ON(ret); -+ -+ trace_btree_node_alloc(c, b); -+ return b; -+} -+ -+static void btree_set_min(struct btree *b, struct bpos pos) -+{ -+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) -+ bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos; -+ b->data->min_key = pos; -+} -+ -+static void btree_set_max(struct btree *b, struct bpos pos) -+{ -+ b->key.k.p = pos; -+ b->data->max_key = pos; -+} -+ -+struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as, -+ struct btree *b, -+ struct bkey_format format) -+{ -+ struct btree *n; -+ -+ n = bch2_btree_node_alloc(as, 
b->c.level); -+ -+ SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1); -+ -+ btree_set_min(n, b->data->min_key); -+ btree_set_max(n, b->data->max_key); -+ -+ n->data->format = format; -+ btree_node_set_format(n, format); -+ -+ bch2_btree_sort_into(as->c, n, b); -+ -+ btree_node_reset_sib_u64s(n); -+ -+ n->key.k.p = b->key.k.p; -+ return n; -+} -+ -+static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as, -+ struct btree *b) -+{ -+ struct bkey_format new_f = bch2_btree_calc_format(b); -+ -+ /* -+ * The keys might expand with the new format - if they wouldn't fit in -+ * the btree node anymore, use the old format for now: -+ */ -+ if (!bch2_btree_node_format_fits(as->c, b, &new_f)) -+ new_f = b->format; -+ -+ return __bch2_btree_node_alloc_replacement(as, b, new_f); -+} -+ -+static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level) -+{ -+ struct btree *b = bch2_btree_node_alloc(as, level); -+ -+ btree_set_min(b, POS_MIN); -+ btree_set_max(b, POS_MAX); -+ b->data->format = bch2_btree_calc_format(b); -+ -+ btree_node_set_format(b, b->data->format); -+ bch2_btree_build_aux_trees(b); -+ -+ bch2_btree_update_add_new_node(as, b); -+ six_unlock_write(&b->c.lock); -+ -+ return b; -+} -+ -+static void bch2_btree_reserve_put(struct btree_update *as) -+{ -+ struct bch_fs *c = as->c; -+ -+ mutex_lock(&c->btree_reserve_cache_lock); -+ -+ while (as->nr_prealloc_nodes) { -+ struct btree *b = as->prealloc_nodes[--as->nr_prealloc_nodes]; -+ -+ six_unlock_write(&b->c.lock); -+ -+ if (c->btree_reserve_cache_nr < -+ ARRAY_SIZE(c->btree_reserve_cache)) { -+ struct btree_alloc *a = -+ &c->btree_reserve_cache[c->btree_reserve_cache_nr++]; -+ -+ a->ob = b->ob; -+ b->ob.nr = 0; -+ bkey_copy(&a->k, &b->key); -+ } else { -+ bch2_open_buckets_put(c, &b->ob); -+ } -+ -+ btree_node_lock_type(c, b, SIX_LOCK_write); -+ __btree_node_free(c, b); -+ six_unlock_write(&b->c.lock); -+ -+ six_unlock_intent(&b->c.lock); -+ } -+ -+ 
mutex_unlock(&c->btree_reserve_cache_lock); -+} -+ -+static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes, -+ unsigned flags, struct closure *cl) -+{ -+ struct bch_fs *c = as->c; -+ struct btree *b; -+ int ret; -+ -+ BUG_ON(nr_nodes > BTREE_RESERVE_MAX); -+ -+ /* -+ * Protects reaping from the btree node cache and using the btree node -+ * open bucket reserve: -+ */ -+ ret = bch2_btree_cache_cannibalize_lock(c, cl); -+ if (ret) -+ return ret; -+ -+ while (as->nr_prealloc_nodes < nr_nodes) { -+ b = __bch2_btree_node_alloc(c, &as->disk_res, -+ flags & BTREE_INSERT_NOWAIT -+ ? NULL : cl, flags); -+ if (IS_ERR(b)) { -+ ret = PTR_ERR(b); -+ goto err_free; -+ } -+ -+ ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&b->key)); -+ if (ret) -+ goto err_free; -+ -+ as->prealloc_nodes[as->nr_prealloc_nodes++] = b; -+ } -+ -+ bch2_btree_cache_cannibalize_unlock(c); -+ return 0; -+err_free: -+ bch2_btree_cache_cannibalize_unlock(c); -+ trace_btree_reserve_get_fail(c, nr_nodes, cl); -+ return ret; -+} -+ -+/* Asynchronous interior node update machinery */ -+ -+static void bch2_btree_update_free(struct btree_update *as) -+{ -+ struct bch_fs *c = as->c; -+ -+ bch2_journal_preres_put(&c->journal, &as->journal_preres); -+ -+ bch2_journal_pin_drop(&c->journal, &as->journal); -+ bch2_journal_pin_flush(&c->journal, &as->journal); -+ bch2_disk_reservation_put(c, &as->disk_res); -+ bch2_btree_reserve_put(as); -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ list_del(&as->unwritten_list); -+ list_del(&as->list); -+ mutex_unlock(&c->btree_interior_update_lock); -+ -+ closure_debug_destroy(&as->cl); -+ mempool_free(as, &c->btree_interior_update_pool); -+ -+ closure_wake_up(&c->btree_interior_update_wait); -+} -+ -+static void btree_update_will_delete_key(struct btree_update *as, -+ struct bkey_i *k) -+{ -+ BUG_ON(bch2_keylist_u64s(&as->old_keys) + k->k.u64s > -+ ARRAY_SIZE(as->_old_keys)); -+ bch2_keylist_add(&as->old_keys, k); -+} -+ -+static void 
btree_update_will_add_key(struct btree_update *as, -+ struct bkey_i *k) -+{ -+ BUG_ON(bch2_keylist_u64s(&as->new_keys) + k->k.u64s > -+ ARRAY_SIZE(as->_new_keys)); -+ bch2_keylist_add(&as->new_keys, k); -+} -+ -+/* -+ * The transactional part of an interior btree node update, where we journal the -+ * update we did to the interior node and update alloc info: -+ */ -+static int btree_update_nodes_written_trans(struct btree_trans *trans, -+ struct btree_update *as) -+{ -+ struct bkey_i *k; -+ int ret; -+ -+ trans->extra_journal_entries = (void *) &as->journal_entries[0]; -+ trans->extra_journal_entry_u64s = as->journal_u64s; -+ trans->journal_pin = &as->journal; -+ -+ for_each_keylist_key(&as->new_keys, k) { -+ ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(k), -+ 0, 0, BTREE_TRIGGER_INSERT); -+ if (ret) -+ return ret; -+ } -+ -+ for_each_keylist_key(&as->old_keys, k) { -+ ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(k), -+ 0, 0, BTREE_TRIGGER_OVERWRITE); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+static void btree_update_nodes_written(struct btree_update *as) -+{ -+ struct bch_fs *c = as->c; -+ struct btree *b = as->b; -+ u64 journal_seq = 0; -+ unsigned i; -+ int ret; -+ -+ /* -+ * We did an update to a parent node where the pointers we added pointed -+ * to child nodes that weren't written yet: now, the child nodes have -+ * been written so we can write out the update to the interior node. -+ */ -+ -+ /* -+ * We can't call into journal reclaim here: we'd block on the journal -+ * reclaim lock, but we may need to release the open buckets we have -+ * pinned in order for other btree updates to make forward progress, and -+ * journal reclaim does btree updates when flushing bkey_cached entries, -+ * which may require allocations as well. 
-+ */ -+ ret = bch2_trans_do(c, &as->disk_res, &journal_seq, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_USE_RESERVE| -+ BTREE_INSERT_USE_ALLOC_RESERVE| -+ BTREE_INSERT_NOCHECK_RW| -+ BTREE_INSERT_JOURNAL_RECLAIM| -+ BTREE_INSERT_JOURNAL_RESERVED, -+ btree_update_nodes_written_trans(&trans, as)); -+ BUG_ON(ret && !bch2_journal_error(&c->journal)); -+ -+ if (b) { -+ /* -+ * @b is the node we did the final insert into: -+ * -+ * On failure to get a journal reservation, we still have to -+ * unblock the write and allow most of the write path to happen -+ * so that shutdown works, but the i->journal_seq mechanism -+ * won't work to prevent the btree write from being visible (we -+ * didn't get a journal sequence number) - instead -+ * __bch2_btree_node_write() doesn't do the actual write if -+ * we're in journal error state: -+ */ -+ -+ btree_node_lock_type(c, b, SIX_LOCK_intent); -+ btree_node_lock_type(c, b, SIX_LOCK_write); -+ mutex_lock(&c->btree_interior_update_lock); -+ -+ list_del(&as->write_blocked_list); -+ -+ if (!ret && as->b == b) { -+ struct bset *i = btree_bset_last(b); -+ -+ BUG_ON(!b->c.level); -+ BUG_ON(!btree_node_dirty(b)); -+ -+ i->journal_seq = cpu_to_le64( -+ max(journal_seq, -+ le64_to_cpu(i->journal_seq))); -+ -+ bch2_btree_add_journal_pin(c, b, journal_seq); -+ } -+ -+ mutex_unlock(&c->btree_interior_update_lock); -+ six_unlock_write(&b->c.lock); -+ -+ btree_node_write_if_need(c, b, SIX_LOCK_intent); -+ six_unlock_intent(&b->c.lock); -+ } -+ -+ bch2_journal_pin_drop(&c->journal, &as->journal); -+ -+ bch2_journal_preres_put(&c->journal, &as->journal_preres); -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ for (i = 0; i < as->nr_new_nodes; i++) { -+ b = as->new_nodes[i]; -+ -+ BUG_ON(b->will_make_reachable != (unsigned long) as); -+ b->will_make_reachable = 0; -+ } -+ mutex_unlock(&c->btree_interior_update_lock); -+ -+ for (i = 0; i < as->nr_new_nodes; i++) { -+ b = as->new_nodes[i]; -+ -+ btree_node_lock_type(c, b, SIX_LOCK_read); -+ 
btree_node_write_if_need(c, b, SIX_LOCK_read); -+ six_unlock_read(&b->c.lock); -+ } -+ -+ for (i = 0; i < as->nr_open_buckets; i++) -+ bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]); -+ -+ bch2_btree_update_free(as); -+} -+ -+static void btree_interior_update_work(struct work_struct *work) -+{ -+ struct bch_fs *c = -+ container_of(work, struct bch_fs, btree_interior_update_work); -+ struct btree_update *as; -+ -+ while (1) { -+ mutex_lock(&c->btree_interior_update_lock); -+ as = list_first_entry_or_null(&c->btree_interior_updates_unwritten, -+ struct btree_update, unwritten_list); -+ if (as && !as->nodes_written) -+ as = NULL; -+ mutex_unlock(&c->btree_interior_update_lock); -+ -+ if (!as) -+ break; -+ -+ btree_update_nodes_written(as); -+ } -+} -+ -+static void btree_update_set_nodes_written(struct closure *cl) -+{ -+ struct btree_update *as = container_of(cl, struct btree_update, cl); -+ struct bch_fs *c = as->c; -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ as->nodes_written = true; -+ mutex_unlock(&c->btree_interior_update_lock); -+ -+ queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work); -+} -+ -+/* -+ * We're updating @b with pointers to nodes that haven't finished writing yet: -+ * block @b from being written until @as completes -+ */ -+static void btree_update_updated_node(struct btree_update *as, struct btree *b) -+{ -+ struct bch_fs *c = as->c; -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); -+ -+ BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); -+ BUG_ON(!btree_node_dirty(b)); -+ -+ as->mode = BTREE_INTERIOR_UPDATING_NODE; -+ as->b = b; -+ list_add(&as->write_blocked_list, &b->write_blocked); -+ -+ mutex_unlock(&c->btree_interior_update_lock); -+} -+ -+static void btree_update_reparent(struct btree_update *as, -+ struct btree_update *child) -+{ -+ struct bch_fs *c = as->c; -+ -+ lockdep_assert_held(&c->btree_interior_update_lock); -+ 
-+ child->b = NULL; -+ child->mode = BTREE_INTERIOR_UPDATING_AS; -+ -+ /* -+ * When we write a new btree root, we have to drop our journal pin -+ * _before_ the new nodes are technically reachable; see -+ * btree_update_nodes_written(). -+ * -+ * This goes for journal pins that are recursively blocked on us - so, -+ * just transfer the journal pin to the new interior update so -+ * btree_update_nodes_written() can drop it. -+ */ -+ bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL); -+ bch2_journal_pin_drop(&c->journal, &child->journal); -+} -+ -+static void btree_update_updated_root(struct btree_update *as, struct btree *b) -+{ -+ struct bkey_i *insert = &b->key; -+ struct bch_fs *c = as->c; -+ -+ BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); -+ -+ BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > -+ ARRAY_SIZE(as->journal_entries)); -+ -+ as->journal_u64s += -+ journal_entry_set((void *) &as->journal_entries[as->journal_u64s], -+ BCH_JSET_ENTRY_btree_root, -+ b->c.btree_id, b->c.level, -+ insert, insert->k.u64s); -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); -+ -+ as->mode = BTREE_INTERIOR_UPDATING_ROOT; -+ mutex_unlock(&c->btree_interior_update_lock); -+} -+ -+/* -+ * bch2_btree_update_add_new_node: -+ * -+ * This causes @as to wait on @b to be written, before it gets to -+ * bch2_btree_update_nodes_written -+ * -+ * Additionally, it sets b->will_make_reachable to prevent any additional writes -+ * to @b from happening besides the first until @b is reachable on disk -+ * -+ * And it adds @b to the list of @as's new nodes, so that we can update sector -+ * counts in bch2_btree_update_nodes_written: -+ */ -+void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b) -+{ -+ struct bch_fs *c = as->c; -+ -+ closure_get(&as->cl); -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes)); -+ 
BUG_ON(b->will_make_reachable); -+ -+ as->new_nodes[as->nr_new_nodes++] = b; -+ b->will_make_reachable = 1UL|(unsigned long) as; -+ -+ mutex_unlock(&c->btree_interior_update_lock); -+ -+ btree_update_will_add_key(as, &b->key); -+} -+ -+/* -+ * returns true if @b was a new node -+ */ -+static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b) -+{ -+ struct btree_update *as; -+ unsigned long v; -+ unsigned i; -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ /* -+ * When b->will_make_reachable != 0, it owns a ref on as->cl that's -+ * dropped when it gets written by bch2_btree_complete_write - the -+ * xchg() is for synchronization with bch2_btree_complete_write: -+ */ -+ v = xchg(&b->will_make_reachable, 0); -+ as = (struct btree_update *) (v & ~1UL); -+ -+ if (!as) { -+ mutex_unlock(&c->btree_interior_update_lock); -+ return; -+ } -+ -+ for (i = 0; i < as->nr_new_nodes; i++) -+ if (as->new_nodes[i] == b) -+ goto found; -+ -+ BUG(); -+found: -+ array_remove_item(as->new_nodes, as->nr_new_nodes, i); -+ mutex_unlock(&c->btree_interior_update_lock); -+ -+ if (v & 1) -+ closure_put(&as->cl); -+} -+ -+void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b) -+{ -+ while (b->ob.nr) -+ as->open_buckets[as->nr_open_buckets++] = -+ b->ob.v[--b->ob.nr]; -+} -+ -+/* -+ * @b is being split/rewritten: it may have pointers to not-yet-written btree -+ * nodes and thus outstanding btree_updates - redirect @b's -+ * btree_updates to point to this btree_update: -+ */ -+void bch2_btree_interior_update_will_free_node(struct btree_update *as, -+ struct btree *b) -+{ -+ struct bch_fs *c = as->c; -+ struct btree_update *p, *n; -+ struct btree_write *w; -+ -+ set_btree_node_dying(b); -+ -+ if (btree_node_fake(b)) -+ return; -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ -+ /* -+ * Does this node have any btree_update operations preventing -+ * it from being written? 
-+ * -+ * If so, redirect them to point to this btree_update: we can -+ * write out our new nodes, but we won't make them visible until those -+ * operations complete -+ */ -+ list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) { -+ list_del_init(&p->write_blocked_list); -+ btree_update_reparent(as, p); -+ -+ /* -+ * for flush_held_btree_writes() waiting on updates to flush or -+ * nodes to be writeable: -+ */ -+ closure_wake_up(&c->btree_interior_update_wait); -+ } -+ -+ clear_btree_node_dirty(b); -+ clear_btree_node_need_write(b); -+ -+ /* -+ * Does this node have unwritten data that has a pin on the journal? -+ * -+ * If so, transfer that pin to the btree_update operation - -+ * note that if we're freeing multiple nodes, we only need to keep the -+ * oldest pin of any of the nodes we're freeing. We'll release the pin -+ * when the new nodes are persistent and reachable on disk: -+ */ -+ w = btree_current_write(b); -+ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL); -+ bch2_journal_pin_drop(&c->journal, &w->journal); -+ -+ w = btree_prev_write(b); -+ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL); -+ bch2_journal_pin_drop(&c->journal, &w->journal); -+ -+ mutex_unlock(&c->btree_interior_update_lock); -+ -+ /* -+ * Is this a node that isn't reachable on disk yet? 
-+ * -+ * Nodes that aren't reachable yet have writes blocked until they're -+ * reachable - now that we've cancelled any pending writes and moved -+ * things waiting on that write to wait on this update, we can drop this -+ * node from the list of nodes that the other update is making -+ * reachable, prior to freeing it: -+ */ -+ btree_update_drop_new_node(c, b); -+ -+ btree_update_will_delete_key(as, &b->key); -+} -+ -+void bch2_btree_update_done(struct btree_update *as) -+{ -+ BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE); -+ -+ bch2_btree_reserve_put(as); -+ -+ continue_at(&as->cl, btree_update_set_nodes_written, system_freezable_wq); -+} -+ -+struct btree_update * -+bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, -+ unsigned nr_nodes, unsigned flags, -+ struct closure *cl) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_update *as; -+ int disk_res_flags = (flags & BTREE_INSERT_NOFAIL) -+ ? BCH_DISK_RESERVATION_NOFAIL : 0; -+ int journal_flags = (flags & BTREE_INSERT_JOURNAL_RESERVED) -+ ? 
JOURNAL_RES_GET_RECLAIM : 0; -+ int ret = 0; -+ -+ /* -+ * This check isn't necessary for correctness - it's just to potentially -+ * prevent us from doing a lot of work that'll end up being wasted: -+ */ -+ ret = bch2_journal_error(&c->journal); -+ if (ret) -+ return ERR_PTR(ret); -+ -+ as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO); -+ memset(as, 0, sizeof(*as)); -+ closure_init(&as->cl, NULL); -+ as->c = c; -+ as->mode = BTREE_INTERIOR_NO_UPDATE; -+ as->btree_id = id; -+ INIT_LIST_HEAD(&as->list); -+ INIT_LIST_HEAD(&as->unwritten_list); -+ INIT_LIST_HEAD(&as->write_blocked_list); -+ bch2_keylist_init(&as->old_keys, as->_old_keys); -+ bch2_keylist_init(&as->new_keys, as->_new_keys); -+ bch2_keylist_init(&as->parent_keys, as->inline_keys); -+ -+ ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, -+ BTREE_UPDATE_JOURNAL_RES, -+ journal_flags|JOURNAL_RES_GET_NONBLOCK); -+ if (ret == -EAGAIN) { -+ if (flags & BTREE_INSERT_NOUNLOCK) -+ return ERR_PTR(-EINTR); -+ -+ bch2_trans_unlock(trans); -+ -+ ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, -+ BTREE_UPDATE_JOURNAL_RES, -+ journal_flags); -+ if (ret) -+ return ERR_PTR(ret); -+ -+ if (!bch2_trans_relock(trans)) { -+ ret = -EINTR; -+ goto err; -+ } -+ } -+ -+ ret = bch2_disk_reservation_get(c, &as->disk_res, -+ nr_nodes * c->opts.btree_node_size, -+ c->opts.metadata_replicas, -+ disk_res_flags); -+ if (ret) -+ goto err; -+ -+ ret = bch2_btree_reserve_get(as, nr_nodes, flags, cl); -+ if (ret) -+ goto err; -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ list_add_tail(&as->list, &c->btree_interior_update_list); -+ mutex_unlock(&c->btree_interior_update_lock); -+ -+ return as; -+err: -+ bch2_btree_update_free(as); -+ return ERR_PTR(ret); -+} -+ -+/* Btree root updates: */ -+ -+static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) -+{ -+ /* Root nodes cannot be reaped */ -+ mutex_lock(&c->btree_cache.lock); -+ list_del_init(&b->list); -+ 
mutex_unlock(&c->btree_cache.lock); -+ -+ mutex_lock(&c->btree_root_lock); -+ BUG_ON(btree_node_root(c, b) && -+ (b->c.level < btree_node_root(c, b)->c.level || -+ !btree_node_dying(btree_node_root(c, b)))); -+ -+ btree_node_root(c, b) = b; -+ mutex_unlock(&c->btree_root_lock); -+ -+ bch2_recalc_btree_reserve(c); -+} -+ -+/** -+ * bch_btree_set_root - update the root in memory and on disk -+ * -+ * To ensure forward progress, the current task must not be holding any -+ * btree node write locks. However, you must hold an intent lock on the -+ * old root. -+ * -+ * Note: This allocates a journal entry but doesn't add any keys to -+ * it. All the btree roots are part of every journal write, so there -+ * is nothing new to be done. This just guarantees that there is a -+ * journal write. -+ */ -+static void bch2_btree_set_root(struct btree_update *as, struct btree *b, -+ struct btree_iter *iter) -+{ -+ struct bch_fs *c = as->c; -+ struct btree *old; -+ -+ trace_btree_set_root(c, b); -+ BUG_ON(!b->written && -+ !test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)); -+ -+ old = btree_node_root(c, b); -+ -+ /* -+ * Ensure no one is using the old root while we switch to the -+ * new root: -+ */ -+ bch2_btree_node_lock_write(old, iter); -+ -+ bch2_btree_set_root_inmem(c, b); -+ -+ btree_update_updated_root(as, b); -+ -+ /* -+ * Unlock old root after new root is visible: -+ * -+ * The new root isn't persistent, but that's ok: we still have -+ * an intent lock on the new root, and any updates that would -+ * depend on the new root would have to update the new root. 
-+ */ -+ bch2_btree_node_unlock_write(old, iter); -+} -+ -+/* Interior node updates: */ -+ -+static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b, -+ struct btree_iter *iter, -+ struct bkey_i *insert, -+ struct btree_node_iter *node_iter) -+{ -+ struct bkey_packed *k; -+ -+ BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > -+ ARRAY_SIZE(as->journal_entries)); -+ -+ as->journal_u64s += -+ journal_entry_set((void *) &as->journal_entries[as->journal_u64s], -+ BCH_JSET_ENTRY_btree_keys, -+ b->c.btree_id, b->c.level, -+ insert, insert->k.u64s); -+ -+ while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && -+ bkey_iter_pos_cmp(b, k, &insert->k.p) < 0) -+ bch2_btree_node_iter_advance(node_iter, b); -+ -+ bch2_btree_bset_insert_key(iter, b, node_iter, insert); -+ set_btree_node_dirty(b); -+ set_btree_node_need_write(b); -+} -+ -+/* -+ * Move keys from n1 (original replacement node, now lower node) to n2 (higher -+ * node) -+ */ -+static struct btree *__btree_split_node(struct btree_update *as, -+ struct btree *n1, -+ struct btree_iter *iter) -+{ -+ size_t nr_packed = 0, nr_unpacked = 0; -+ struct btree *n2; -+ struct bset *set1, *set2; -+ struct bkey_packed *k, *prev = NULL; -+ -+ n2 = bch2_btree_node_alloc(as, n1->c.level); -+ bch2_btree_update_add_new_node(as, n2); -+ -+ n2->data->max_key = n1->data->max_key; -+ n2->data->format = n1->format; -+ SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data)); -+ n2->key.k.p = n1->key.k.p; -+ -+ btree_node_set_format(n2, n2->data->format); -+ -+ set1 = btree_bset_first(n1); -+ set2 = btree_bset_first(n2); -+ -+ /* -+ * Has to be a linear search because we don't have an auxiliary -+ * search tree yet -+ */ -+ k = set1->start; -+ while (1) { -+ struct bkey_packed *n = bkey_next_skip_noops(k, vstruct_last(set1)); -+ -+ if (n == vstruct_last(set1)) -+ break; -+ if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5) -+ break; -+ -+ if (bkey_packed(k)) -+ nr_packed++; -+ else -+ 
nr_unpacked++; -+ -+ prev = k; -+ k = n; -+ } -+ -+ BUG_ON(!prev); -+ -+ btree_set_max(n1, bkey_unpack_pos(n1, prev)); -+ btree_set_min(n2, bkey_successor(n1->key.k.p)); -+ -+ set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k); -+ set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s)); -+ -+ set_btree_bset_end(n1, n1->set); -+ set_btree_bset_end(n2, n2->set); -+ -+ n2->nr.live_u64s = le16_to_cpu(set2->u64s); -+ n2->nr.bset_u64s[0] = le16_to_cpu(set2->u64s); -+ n2->nr.packed_keys = n1->nr.packed_keys - nr_packed; -+ n2->nr.unpacked_keys = n1->nr.unpacked_keys - nr_unpacked; -+ -+ n1->nr.live_u64s = le16_to_cpu(set1->u64s); -+ n1->nr.bset_u64s[0] = le16_to_cpu(set1->u64s); -+ n1->nr.packed_keys = nr_packed; -+ n1->nr.unpacked_keys = nr_unpacked; -+ -+ BUG_ON(!set1->u64s); -+ BUG_ON(!set2->u64s); -+ -+ memcpy_u64s(set2->start, -+ vstruct_end(set1), -+ le16_to_cpu(set2->u64s)); -+ -+ btree_node_reset_sib_u64s(n1); -+ btree_node_reset_sib_u64s(n2); -+ -+ bch2_verify_btree_nr_keys(n1); -+ bch2_verify_btree_nr_keys(n2); -+ -+ if (n1->c.level) { -+ btree_node_interior_verify(as->c, n1); -+ btree_node_interior_verify(as->c, n2); -+ } -+ -+ return n2; -+} -+ -+/* -+ * For updates to interior nodes, we've got to do the insert before we split -+ * because the stuff we're inserting has to be inserted atomically. Post split, -+ * the keys might have to go in different nodes and the split would no longer be -+ * atomic. 
-+ * -+ * Worse, if the insert is from btree node coalescing, if we do the insert after -+ * we do the split (and pick the pivot) - the pivot we pick might be between -+ * nodes that were coalesced, and thus in the middle of a child node post -+ * coalescing: -+ */ -+static void btree_split_insert_keys(struct btree_update *as, struct btree *b, -+ struct btree_iter *iter, -+ struct keylist *keys) -+{ -+ struct btree_node_iter node_iter; -+ struct bkey_i *k = bch2_keylist_front(keys); -+ struct bkey_packed *src, *dst, *n; -+ struct bset *i; -+ -+ BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE); -+ -+ bch2_btree_node_iter_init(&node_iter, b, &k->k.p); -+ -+ while (!bch2_keylist_empty(keys)) { -+ k = bch2_keylist_front(keys); -+ -+ bch2_insert_fixup_btree_ptr(as, b, iter, k, &node_iter); -+ bch2_keylist_pop_front(keys); -+ } -+ -+ /* -+ * We can't tolerate whiteouts here - with whiteouts there can be -+ * duplicate keys, and it would be rather bad if we picked a duplicate -+ * for the pivot: -+ */ -+ i = btree_bset_first(b); -+ src = dst = i->start; -+ while (src != vstruct_last(i)) { -+ n = bkey_next_skip_noops(src, vstruct_last(i)); -+ if (!bkey_deleted(src)) { -+ memmove_u64s_down(dst, src, src->u64s); -+ dst = bkey_next(dst); -+ } -+ src = n; -+ } -+ -+ i->u64s = cpu_to_le16((u64 *) dst - i->_data); -+ set_btree_bset_end(b, b->set); -+ -+ BUG_ON(b->nsets != 1 || -+ b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s)); -+ -+ btree_node_interior_verify(as->c, b); -+} -+ -+static void btree_split(struct btree_update *as, struct btree *b, -+ struct btree_iter *iter, struct keylist *keys, -+ unsigned flags) -+{ -+ struct bch_fs *c = as->c; -+ struct btree *parent = btree_node_parent(iter, b); -+ struct btree *n1, *n2 = NULL, *n3 = NULL; -+ u64 start_time = local_clock(); -+ -+ BUG_ON(!parent && (b != btree_node_root(c, b))); -+ BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level)); -+ -+ bch2_btree_interior_update_will_free_node(as, b); -+ -+ n1 = 
bch2_btree_node_alloc_replacement(as, b); -+ bch2_btree_update_add_new_node(as, n1); -+ -+ if (keys) -+ btree_split_insert_keys(as, n1, iter, keys); -+ -+ if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) { -+ trace_btree_split(c, b); -+ -+ n2 = __btree_split_node(as, n1, iter); -+ -+ bch2_btree_build_aux_trees(n2); -+ bch2_btree_build_aux_trees(n1); -+ six_unlock_write(&n2->c.lock); -+ six_unlock_write(&n1->c.lock); -+ -+ bch2_btree_node_write(c, n2, SIX_LOCK_intent); -+ -+ /* -+ * Note that on recursive parent_keys == keys, so we -+ * can't start adding new keys to parent_keys before emptying it -+ * out (which we did with btree_split_insert_keys() above) -+ */ -+ bch2_keylist_add(&as->parent_keys, &n1->key); -+ bch2_keylist_add(&as->parent_keys, &n2->key); -+ -+ if (!parent) { -+ /* Depth increases, make a new root */ -+ n3 = __btree_root_alloc(as, b->c.level + 1); -+ -+ n3->sib_u64s[0] = U16_MAX; -+ n3->sib_u64s[1] = U16_MAX; -+ -+ btree_split_insert_keys(as, n3, iter, &as->parent_keys); -+ -+ bch2_btree_node_write(c, n3, SIX_LOCK_intent); -+ } -+ } else { -+ trace_btree_compact(c, b); -+ -+ bch2_btree_build_aux_trees(n1); -+ six_unlock_write(&n1->c.lock); -+ -+ if (parent) -+ bch2_keylist_add(&as->parent_keys, &n1->key); -+ } -+ -+ bch2_btree_node_write(c, n1, SIX_LOCK_intent); -+ -+ /* New nodes all written, now make them visible: */ -+ -+ if (parent) { -+ /* Split a non root node */ -+ bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); -+ } else if (n3) { -+ bch2_btree_set_root(as, n3, iter); -+ } else { -+ /* Root filled up but didn't need to be split */ -+ bch2_btree_set_root(as, n1, iter); -+ } -+ -+ bch2_btree_update_get_open_buckets(as, n1); -+ if (n2) -+ bch2_btree_update_get_open_buckets(as, n2); -+ if (n3) -+ bch2_btree_update_get_open_buckets(as, n3); -+ -+ /* Successful split, update the iterator to point to the new nodes: */ -+ -+ six_lock_increment(&b->c.lock, SIX_LOCK_intent); -+ bch2_btree_iter_node_drop(iter, b); -+ if 
(n3) -+ bch2_btree_iter_node_replace(iter, n3); -+ if (n2) -+ bch2_btree_iter_node_replace(iter, n2); -+ bch2_btree_iter_node_replace(iter, n1); -+ -+ /* -+ * The old node must be freed (in memory) _before_ unlocking the new -+ * nodes - else another thread could re-acquire a read lock on the old -+ * node after another thread has locked and updated the new node, thus -+ * seeing stale data: -+ */ -+ bch2_btree_node_free_inmem(c, b, iter); -+ -+ if (n3) -+ six_unlock_intent(&n3->c.lock); -+ if (n2) -+ six_unlock_intent(&n2->c.lock); -+ six_unlock_intent(&n1->c.lock); -+ -+ bch2_btree_trans_verify_locks(iter->trans); -+ -+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_split], -+ start_time); -+} -+ -+static void -+bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, -+ struct btree_iter *iter, struct keylist *keys) -+{ -+ struct btree_iter *linked; -+ struct btree_node_iter node_iter; -+ struct bkey_i *insert = bch2_keylist_front(keys); -+ struct bkey_packed *k; -+ -+ /* Don't screw up @iter's position: */ -+ node_iter = iter->l[b->c.level].iter; -+ -+ /* -+ * btree_split(), btree_gc_coalesce() will insert keys before -+ * the iterator's current position - they know the keys go in -+ * the node the iterator points to: -+ */ -+ while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) && -+ (bkey_cmp_packed(b, k, &insert->k) >= 0)) -+ ; -+ -+ for_each_keylist_key(keys, insert) -+ bch2_insert_fixup_btree_ptr(as, b, iter, insert, &node_iter); -+ -+ btree_update_updated_node(as, b); -+ -+ trans_for_each_iter_with_node(iter->trans, b, linked) -+ bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); -+ -+ bch2_btree_trans_verify_iters(iter->trans, b); -+} -+ -+/** -+ * bch_btree_insert_node - insert bkeys into a given btree node -+ * -+ * @iter: btree iterator -+ * @keys: list of keys to insert -+ * @hook: insert callback -+ * @persistent: if not null, @persistent will wait on journal write -+ * -+ * Inserts as many keys as it can into a 
given btree node, splitting it if full. -+ * If a split occurred, this function will return early. This can only happen -+ * for leaf nodes -- inserts into interior nodes have to be atomic. -+ */ -+void bch2_btree_insert_node(struct btree_update *as, struct btree *b, -+ struct btree_iter *iter, struct keylist *keys, -+ unsigned flags) -+{ -+ struct bch_fs *c = as->c; -+ int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); -+ int old_live_u64s = b->nr.live_u64s; -+ int live_u64s_added, u64s_added; -+ -+ BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level)); -+ BUG_ON(!b->c.level); -+ BUG_ON(!as || as->b); -+ bch2_verify_keylist_sorted(keys); -+ -+ if (as->must_rewrite) -+ goto split; -+ -+ bch2_btree_node_lock_for_insert(c, b, iter); -+ -+ if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) { -+ bch2_btree_node_unlock_write(b, iter); -+ goto split; -+ } -+ -+ bch2_btree_insert_keys_interior(as, b, iter, keys); -+ -+ live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; -+ u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s; -+ -+ if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) -+ b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); -+ if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) -+ b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); -+ -+ if (u64s_added > live_u64s_added && -+ bch2_maybe_compact_whiteouts(c, b)) -+ bch2_btree_iter_reinit_node(iter, b); -+ -+ bch2_btree_node_unlock_write(b, iter); -+ -+ btree_node_interior_verify(c, b); -+ -+ /* -+ * when called from the btree_split path the new nodes aren't added to -+ * the btree iterator yet, so the merge path's unlock/wait/relock dance -+ * won't work: -+ */ -+ bch2_foreground_maybe_merge(c, iter, b->c.level, -+ flags|BTREE_INSERT_NOUNLOCK); -+ return; -+split: -+ btree_split(as, b, iter, keys, flags); -+} -+ -+int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, -+ unsigned flags) -+{ -+ struct btree_trans 
*trans = iter->trans; -+ struct btree *b = iter_l(iter)->b; -+ struct btree_update *as; -+ struct closure cl; -+ int ret = 0; -+ struct btree_insert_entry *i; -+ -+ /* -+ * We already have a disk reservation and open buckets pinned; this -+ * allocation must not block: -+ */ -+ trans_for_each_update(trans, i) -+ if (btree_node_type_needs_gc(i->iter->btree_id)) -+ flags |= BTREE_INSERT_USE_RESERVE; -+ -+ closure_init_stack(&cl); -+ -+ /* Hack, because gc and splitting nodes doesn't mix yet: */ -+ if (!(flags & BTREE_INSERT_GC_LOCK_HELD) && -+ !down_read_trylock(&c->gc_lock)) { -+ if (flags & BTREE_INSERT_NOUNLOCK) { -+ trace_transaction_restart_ip(trans->ip, _THIS_IP_); -+ return -EINTR; -+ } -+ -+ bch2_trans_unlock(trans); -+ down_read(&c->gc_lock); -+ -+ if (!bch2_trans_relock(trans)) -+ ret = -EINTR; -+ } -+ -+ /* -+ * XXX: figure out how far we might need to split, -+ * instead of locking/reserving all the way to the root: -+ */ -+ if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { -+ trace_trans_restart_iter_upgrade(trans->ip); -+ ret = -EINTR; -+ goto out; -+ } -+ -+ as = bch2_btree_update_start(trans, iter->btree_id, -+ btree_update_reserve_required(c, b), flags, -+ !(flags & BTREE_INSERT_NOUNLOCK) ? 
&cl : NULL); -+ if (IS_ERR(as)) { -+ ret = PTR_ERR(as); -+ if (ret == -EAGAIN) { -+ BUG_ON(flags & BTREE_INSERT_NOUNLOCK); -+ bch2_trans_unlock(trans); -+ ret = -EINTR; -+ -+ trace_transaction_restart_ip(trans->ip, _THIS_IP_); -+ } -+ goto out; -+ } -+ -+ btree_split(as, b, iter, NULL, flags); -+ bch2_btree_update_done(as); -+ -+ /* -+ * We haven't successfully inserted yet, so don't downgrade all the way -+ * back to read locks; -+ */ -+ __bch2_btree_iter_downgrade(iter, 1); -+out: -+ if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) -+ up_read(&c->gc_lock); -+ closure_sync(&cl); -+ return ret; -+} -+ -+void __bch2_foreground_maybe_merge(struct bch_fs *c, -+ struct btree_iter *iter, -+ unsigned level, -+ unsigned flags, -+ enum btree_node_sibling sib) -+{ -+ struct btree_trans *trans = iter->trans; -+ struct btree_update *as; -+ struct bkey_format_state new_s; -+ struct bkey_format new_f; -+ struct bkey_i delete; -+ struct btree *b, *m, *n, *prev, *next, *parent; -+ struct closure cl; -+ size_t sib_u64s; -+ int ret = 0; -+ -+ BUG_ON(!btree_node_locked(iter, level)); -+ -+ closure_init_stack(&cl); -+retry: -+ BUG_ON(!btree_node_locked(iter, level)); -+ -+ b = iter->l[level].b; -+ -+ parent = btree_node_parent(iter, b); -+ if (!parent) -+ goto out; -+ -+ if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) -+ goto out; -+ -+ /* XXX: can't be holding read locks */ -+ m = bch2_btree_node_get_sibling(c, iter, b, sib); -+ if (IS_ERR(m)) { -+ ret = PTR_ERR(m); -+ goto err; -+ } -+ -+ /* NULL means no sibling: */ -+ if (!m) { -+ b->sib_u64s[sib] = U16_MAX; -+ goto out; -+ } -+ -+ if (sib == btree_prev_sib) { -+ prev = m; -+ next = b; -+ } else { -+ prev = b; -+ next = m; -+ } -+ -+ bch2_bkey_format_init(&new_s); -+ __bch2_btree_calc_format(&new_s, b); -+ __bch2_btree_calc_format(&new_s, m); -+ new_f = bch2_bkey_format_done(&new_s); -+ -+ sib_u64s = btree_node_u64s_with_format(b, &new_f) + -+ btree_node_u64s_with_format(m, &new_f); -+ -+ if (sib_u64s > 
BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) { -+ sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c); -+ sib_u64s /= 2; -+ sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c); -+ } -+ -+ sib_u64s = min(sib_u64s, btree_max_u64s(c)); -+ b->sib_u64s[sib] = sib_u64s; -+ -+ if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) { -+ six_unlock_intent(&m->c.lock); -+ goto out; -+ } -+ -+ /* We're changing btree topology, doesn't mix with gc: */ -+ if (!(flags & BTREE_INSERT_GC_LOCK_HELD) && -+ !down_read_trylock(&c->gc_lock)) -+ goto err_cycle_gc_lock; -+ -+ if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { -+ ret = -EINTR; -+ goto err_unlock; -+ } -+ -+ as = bch2_btree_update_start(trans, iter->btree_id, -+ btree_update_reserve_required(c, parent) + 1, -+ flags| -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_USE_RESERVE, -+ !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL); -+ if (IS_ERR(as)) { -+ ret = PTR_ERR(as); -+ goto err_unlock; -+ } -+ -+ trace_btree_merge(c, b); -+ -+ bch2_btree_interior_update_will_free_node(as, b); -+ bch2_btree_interior_update_will_free_node(as, m); -+ -+ n = bch2_btree_node_alloc(as, b->c.level); -+ bch2_btree_update_add_new_node(as, n); -+ -+ btree_set_min(n, prev->data->min_key); -+ btree_set_max(n, next->data->max_key); -+ n->data->format = new_f; -+ -+ btree_node_set_format(n, new_f); -+ -+ bch2_btree_sort_into(c, n, prev); -+ bch2_btree_sort_into(c, n, next); -+ -+ bch2_btree_build_aux_trees(n); -+ six_unlock_write(&n->c.lock); -+ -+ bkey_init(&delete.k); -+ delete.k.p = prev->key.k.p; -+ bch2_keylist_add(&as->parent_keys, &delete); -+ bch2_keylist_add(&as->parent_keys, &n->key); -+ -+ bch2_btree_node_write(c, n, SIX_LOCK_intent); -+ -+ bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); -+ -+ bch2_btree_update_get_open_buckets(as, n); -+ -+ six_lock_increment(&b->c.lock, SIX_LOCK_intent); -+ bch2_btree_iter_node_drop(iter, b); -+ bch2_btree_iter_node_drop(iter, m); -+ -+ bch2_btree_iter_node_replace(iter, n); -+ -+ 
bch2_btree_trans_verify_iters(trans, n); -+ -+ bch2_btree_node_free_inmem(c, b, iter); -+ bch2_btree_node_free_inmem(c, m, iter); -+ -+ six_unlock_intent(&n->c.lock); -+ -+ bch2_btree_update_done(as); -+ -+ if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) -+ up_read(&c->gc_lock); -+out: -+ bch2_btree_trans_verify_locks(trans); -+ -+ /* -+ * Don't downgrade locks here: we're called after successful insert, -+ * and the caller will downgrade locks after a successful insert -+ * anyways (in case e.g. a split was required first) -+ * -+ * And we're also called when inserting into interior nodes in the -+ * split path, and downgrading to read locks in there is potentially -+ * confusing: -+ */ -+ closure_sync(&cl); -+ return; -+ -+err_cycle_gc_lock: -+ six_unlock_intent(&m->c.lock); -+ -+ if (flags & BTREE_INSERT_NOUNLOCK) -+ goto out; -+ -+ bch2_trans_unlock(trans); -+ -+ down_read(&c->gc_lock); -+ up_read(&c->gc_lock); -+ ret = -EINTR; -+ goto err; -+ -+err_unlock: -+ six_unlock_intent(&m->c.lock); -+ if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) -+ up_read(&c->gc_lock); -+err: -+ BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK)); -+ -+ if ((ret == -EAGAIN || ret == -EINTR) && -+ !(flags & BTREE_INSERT_NOUNLOCK)) { -+ bch2_trans_unlock(trans); -+ closure_sync(&cl); -+ ret = bch2_btree_iter_traverse(iter); -+ if (ret) -+ goto out; -+ -+ goto retry; -+ } -+ -+ goto out; -+} -+ -+static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, -+ struct btree *b, unsigned flags, -+ struct closure *cl) -+{ -+ struct btree *n, *parent = btree_node_parent(iter, b); -+ struct btree_update *as; -+ -+ as = bch2_btree_update_start(iter->trans, iter->btree_id, -+ (parent -+ ? 
btree_update_reserve_required(c, parent) -+ : 0) + 1, -+ flags, cl); -+ if (IS_ERR(as)) { -+ trace_btree_gc_rewrite_node_fail(c, b); -+ return PTR_ERR(as); -+ } -+ -+ bch2_btree_interior_update_will_free_node(as, b); -+ -+ n = bch2_btree_node_alloc_replacement(as, b); -+ bch2_btree_update_add_new_node(as, n); -+ -+ bch2_btree_build_aux_trees(n); -+ six_unlock_write(&n->c.lock); -+ -+ trace_btree_gc_rewrite_node(c, b); -+ -+ bch2_btree_node_write(c, n, SIX_LOCK_intent); -+ -+ if (parent) { -+ bch2_keylist_add(&as->parent_keys, &n->key); -+ bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); -+ } else { -+ bch2_btree_set_root(as, n, iter); -+ } -+ -+ bch2_btree_update_get_open_buckets(as, n); -+ -+ six_lock_increment(&b->c.lock, SIX_LOCK_intent); -+ bch2_btree_iter_node_drop(iter, b); -+ bch2_btree_iter_node_replace(iter, n); -+ bch2_btree_node_free_inmem(c, b, iter); -+ six_unlock_intent(&n->c.lock); -+ -+ bch2_btree_update_done(as); -+ return 0; -+} -+ -+/** -+ * bch_btree_node_rewrite - Rewrite/move a btree node -+ * -+ * Returns 0 on success, -EINTR or -EAGAIN on failure (i.e. 
-+ * btree_check_reserve() has to wait) -+ */ -+int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, -+ __le64 seq, unsigned flags) -+{ -+ struct btree_trans *trans = iter->trans; -+ struct closure cl; -+ struct btree *b; -+ int ret; -+ -+ flags |= BTREE_INSERT_NOFAIL; -+ -+ closure_init_stack(&cl); -+ -+ bch2_btree_iter_upgrade(iter, U8_MAX); -+ -+ if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) { -+ if (!down_read_trylock(&c->gc_lock)) { -+ bch2_trans_unlock(trans); -+ down_read(&c->gc_lock); -+ } -+ } -+ -+ while (1) { -+ ret = bch2_btree_iter_traverse(iter); -+ if (ret) -+ break; -+ -+ b = bch2_btree_iter_peek_node(iter); -+ if (!b || b->data->keys.seq != seq) -+ break; -+ -+ ret = __btree_node_rewrite(c, iter, b, flags, &cl); -+ if (ret != -EAGAIN && -+ ret != -EINTR) -+ break; -+ -+ bch2_trans_unlock(trans); -+ closure_sync(&cl); -+ } -+ -+ bch2_btree_iter_downgrade(iter); -+ -+ if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) -+ up_read(&c->gc_lock); -+ -+ closure_sync(&cl); -+ return ret; -+} -+ -+static void __bch2_btree_node_update_key(struct bch_fs *c, -+ struct btree_update *as, -+ struct btree_iter *iter, -+ struct btree *b, struct btree *new_hash, -+ struct bkey_i *new_key) -+{ -+ struct btree *parent; -+ int ret; -+ -+ btree_update_will_delete_key(as, &b->key); -+ btree_update_will_add_key(as, new_key); -+ -+ parent = btree_node_parent(iter, b); -+ if (parent) { -+ if (new_hash) { -+ bkey_copy(&new_hash->key, new_key); -+ ret = bch2_btree_node_hash_insert(&c->btree_cache, -+ new_hash, b->c.level, b->c.btree_id); -+ BUG_ON(ret); -+ } -+ -+ bch2_keylist_add(&as->parent_keys, new_key); -+ bch2_btree_insert_node(as, parent, iter, &as->parent_keys, 0); -+ -+ if (new_hash) { -+ mutex_lock(&c->btree_cache.lock); -+ bch2_btree_node_hash_remove(&c->btree_cache, new_hash); -+ -+ bch2_btree_node_hash_remove(&c->btree_cache, b); -+ -+ bkey_copy(&b->key, new_key); -+ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); -+ BUG_ON(ret); -+ 
mutex_unlock(&c->btree_cache.lock); -+ } else { -+ bkey_copy(&b->key, new_key); -+ } -+ } else { -+ BUG_ON(btree_node_root(c, b) != b); -+ -+ bch2_btree_node_lock_write(b, iter); -+ bkey_copy(&b->key, new_key); -+ -+ if (btree_ptr_hash_val(&b->key) != b->hash_val) { -+ mutex_lock(&c->btree_cache.lock); -+ bch2_btree_node_hash_remove(&c->btree_cache, b); -+ -+ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); -+ BUG_ON(ret); -+ mutex_unlock(&c->btree_cache.lock); -+ } -+ -+ btree_update_updated_root(as, b); -+ bch2_btree_node_unlock_write(b, iter); -+ } -+ -+ bch2_btree_update_done(as); -+} -+ -+int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, -+ struct btree *b, -+ struct bkey_i *new_key) -+{ -+ struct btree *parent = btree_node_parent(iter, b); -+ struct btree_update *as = NULL; -+ struct btree *new_hash = NULL; -+ struct closure cl; -+ int ret; -+ -+ closure_init_stack(&cl); -+ -+ if (!bch2_btree_iter_upgrade(iter, U8_MAX)) -+ return -EINTR; -+ -+ if (!down_read_trylock(&c->gc_lock)) { -+ bch2_trans_unlock(iter->trans); -+ down_read(&c->gc_lock); -+ -+ if (!bch2_trans_relock(iter->trans)) { -+ ret = -EINTR; -+ goto err; -+ } -+ } -+ -+ /* -+ * check btree_ptr_hash_val() after @b is locked by -+ * btree_iter_traverse(): -+ */ -+ if (btree_ptr_hash_val(new_key) != b->hash_val) { -+ /* bch2_btree_reserve_get will unlock */ -+ ret = bch2_btree_cache_cannibalize_lock(c, &cl); -+ if (ret) { -+ bch2_trans_unlock(iter->trans); -+ up_read(&c->gc_lock); -+ closure_sync(&cl); -+ down_read(&c->gc_lock); -+ -+ if (!bch2_trans_relock(iter->trans)) { -+ ret = -EINTR; -+ goto err; -+ } -+ } -+ -+ new_hash = bch2_btree_node_mem_alloc(c); -+ } -+retry: -+ as = bch2_btree_update_start(iter->trans, iter->btree_id, -+ parent ? 
btree_update_reserve_required(c, parent) : 0, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_USE_RESERVE| -+ BTREE_INSERT_USE_ALLOC_RESERVE, -+ &cl); -+ -+ if (IS_ERR(as)) { -+ ret = PTR_ERR(as); -+ if (ret == -EAGAIN) -+ ret = -EINTR; -+ -+ if (ret == -EINTR) { -+ bch2_trans_unlock(iter->trans); -+ up_read(&c->gc_lock); -+ closure_sync(&cl); -+ down_read(&c->gc_lock); -+ -+ if (bch2_trans_relock(iter->trans)) -+ goto retry; -+ } -+ -+ goto err; -+ } -+ -+ ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(new_key)); -+ if (ret) -+ goto err_free_update; -+ -+ __bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key); -+ -+ bch2_btree_iter_downgrade(iter); -+err: -+ if (new_hash) { -+ mutex_lock(&c->btree_cache.lock); -+ list_move(&new_hash->list, &c->btree_cache.freeable); -+ mutex_unlock(&c->btree_cache.lock); -+ -+ six_unlock_write(&new_hash->c.lock); -+ six_unlock_intent(&new_hash->c.lock); -+ } -+ up_read(&c->gc_lock); -+ closure_sync(&cl); -+ return ret; -+err_free_update: -+ bch2_btree_update_free(as); -+ goto err; -+} -+ -+/* Init code: */ -+ -+/* -+ * Only for filesystem bringup, when first reading the btree roots or allocating -+ * btree roots when initializing a new filesystem: -+ */ -+void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b) -+{ -+ BUG_ON(btree_node_root(c, b)); -+ -+ bch2_btree_set_root_inmem(c, b); -+} -+ -+void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) -+{ -+ struct closure cl; -+ struct btree *b; -+ int ret; -+ -+ closure_init_stack(&cl); -+ -+ do { -+ ret = bch2_btree_cache_cannibalize_lock(c, &cl); -+ closure_sync(&cl); -+ } while (ret); -+ -+ b = bch2_btree_node_mem_alloc(c); -+ bch2_btree_cache_cannibalize_unlock(c); -+ -+ set_btree_node_fake(b); -+ set_btree_node_need_rewrite(b); -+ b->c.level = 0; -+ b->c.btree_id = id; -+ -+ bkey_btree_ptr_init(&b->key); -+ b->key.k.p = POS_MAX; -+ *((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id; -+ -+ bch2_bset_init_first(b, &b->data->keys); -+ 
bch2_btree_build_aux_trees(b); -+ -+ b->data->flags = 0; -+ btree_set_min(b, POS_MIN); -+ btree_set_max(b, POS_MAX); -+ b->data->format = bch2_btree_calc_format(b); -+ btree_node_set_format(b, b->data->format); -+ -+ ret = bch2_btree_node_hash_insert(&c->btree_cache, b, -+ b->c.level, b->c.btree_id); -+ BUG_ON(ret); -+ -+ bch2_btree_set_root_inmem(c, b); -+ -+ six_unlock_write(&b->c.lock); -+ six_unlock_intent(&b->c.lock); -+} -+ -+void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+ struct btree_update *as; -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ list_for_each_entry(as, &c->btree_interior_update_list, list) -+ pr_buf(out, "%p m %u w %u r %u j %llu\n", -+ as, -+ as->mode, -+ as->nodes_written, -+ atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK, -+ as->journal.seq); -+ mutex_unlock(&c->btree_interior_update_lock); -+} -+ -+size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c) -+{ -+ size_t ret = 0; -+ struct list_head *i; -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ list_for_each(i, &c->btree_interior_update_list) -+ ret++; -+ mutex_unlock(&c->btree_interior_update_lock); -+ -+ return ret; -+} -+ -+void bch2_journal_entries_to_btree_roots(struct bch_fs *c, struct jset *jset) -+{ -+ struct btree_root *r; -+ struct jset_entry *entry; -+ -+ mutex_lock(&c->btree_root_lock); -+ -+ vstruct_for_each(jset, entry) -+ if (entry->type == BCH_JSET_ENTRY_btree_root) { -+ r = &c->btree_roots[entry->btree_id]; -+ r->level = entry->level; -+ r->alive = true; -+ bkey_copy(&r->key, &entry->start[0]); -+ } -+ -+ mutex_unlock(&c->btree_root_lock); -+} -+ -+struct jset_entry * -+bch2_btree_roots_to_journal_entries(struct bch_fs *c, -+ struct jset_entry *start, -+ struct jset_entry *end) -+{ -+ struct jset_entry *entry; -+ unsigned long have = 0; -+ unsigned i; -+ -+ for (entry = start; entry < end; entry = vstruct_next(entry)) -+ if (entry->type == BCH_JSET_ENTRY_btree_root) -+ __set_bit(entry->btree_id, &have); -+ -+ 
mutex_lock(&c->btree_root_lock); -+ -+ for (i = 0; i < BTREE_ID_NR; i++) -+ if (c->btree_roots[i].alive && !test_bit(i, &have)) { -+ journal_entry_set(end, -+ BCH_JSET_ENTRY_btree_root, -+ i, c->btree_roots[i].level, -+ &c->btree_roots[i].key, -+ c->btree_roots[i].key.u64s); -+ end = vstruct_next(end); -+ } -+ -+ mutex_unlock(&c->btree_root_lock); -+ -+ return end; -+} -+ -+void bch2_fs_btree_interior_update_exit(struct bch_fs *c) -+{ -+ if (c->btree_interior_update_worker) -+ destroy_workqueue(c->btree_interior_update_worker); -+ mempool_exit(&c->btree_interior_update_pool); -+} -+ -+int bch2_fs_btree_interior_update_init(struct bch_fs *c) -+{ -+ mutex_init(&c->btree_reserve_cache_lock); -+ INIT_LIST_HEAD(&c->btree_interior_update_list); -+ INIT_LIST_HEAD(&c->btree_interior_updates_unwritten); -+ mutex_init(&c->btree_interior_update_lock); -+ INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work); -+ -+ c->btree_interior_update_worker = -+ alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1); -+ if (!c->btree_interior_update_worker) -+ return -ENOMEM; -+ -+ return mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, -+ sizeof(struct btree_update)); -+} -diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h -new file mode 100644 -index 000000000000..7668225e72c6 ---- /dev/null -+++ b/fs/bcachefs/btree_update_interior.h -@@ -0,0 +1,331 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H -+#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H -+ -+#include "btree_cache.h" -+#include "btree_locking.h" -+#include "btree_update.h" -+ -+void __bch2_btree_calc_format(struct bkey_format_state *, struct btree *); -+bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *, -+ struct bkey_format *); -+ -+#define BTREE_UPDATE_NODES_MAX ((BTREE_MAX_DEPTH - 2) * 2 + GC_MERGE_NODES) -+ -+#define BTREE_UPDATE_JOURNAL_RES (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1)) -+ -+/* 
-+ * Tracks an in progress split/rewrite of a btree node and the update to the -+ * parent node: -+ * -+ * When we split/rewrite a node, we do all the updates in memory without -+ * waiting for any writes to complete - we allocate the new node(s) and update -+ * the parent node, possibly recursively up to the root. -+ * -+ * The end result is that we have one or more new nodes being written - -+ * possibly several, if there were multiple splits - and then a write (updating -+ * an interior node) which will make all these new nodes visible. -+ * -+ * Additionally, as we split/rewrite nodes we free the old nodes - but the old -+ * nodes can't be freed (their space on disk can't be reclaimed) until the -+ * update to the interior node that makes the new node visible completes - -+ * until then, the old nodes are still reachable on disk. -+ * -+ */ -+struct btree_update { -+ struct closure cl; -+ struct bch_fs *c; -+ -+ struct list_head list; -+ struct list_head unwritten_list; -+ -+ /* What kind of update are we doing? */ -+ enum { -+ BTREE_INTERIOR_NO_UPDATE, -+ BTREE_INTERIOR_UPDATING_NODE, -+ BTREE_INTERIOR_UPDATING_ROOT, -+ BTREE_INTERIOR_UPDATING_AS, -+ } mode; -+ -+ unsigned must_rewrite:1; -+ unsigned nodes_written:1; -+ -+ enum btree_id btree_id; -+ -+ struct disk_reservation disk_res; -+ struct journal_preres journal_preres; -+ -+ /* -+ * BTREE_INTERIOR_UPDATING_NODE: -+ * The update that made the new nodes visible was a regular update to an -+ * existing interior node - @b. 
We can't write out the update to @b -+ * until the new nodes we created are finished writing, so we block @b -+ * from writing by putting this btree_interior update on the -+ * @b->write_blocked list with @write_blocked_list: -+ */ -+ struct btree *b; -+ struct list_head write_blocked_list; -+ -+ /* -+ * We may be freeing nodes that were dirty, and thus had journal entries -+ * pinned: we need to transfer the oldest of those pins to the -+ * btree_update operation, and release it when the new node(s) -+ * are all persistent and reachable: -+ */ -+ struct journal_entry_pin journal; -+ -+ /* Preallocated nodes we reserve when we start the update: */ -+ struct btree *prealloc_nodes[BTREE_UPDATE_NODES_MAX]; -+ unsigned nr_prealloc_nodes; -+ -+ /* Nodes being freed: */ -+ struct keylist old_keys; -+ u64 _old_keys[BTREE_UPDATE_NODES_MAX * -+ BKEY_BTREE_PTR_VAL_U64s_MAX]; -+ -+ /* Nodes being added: */ -+ struct keylist new_keys; -+ u64 _new_keys[BTREE_UPDATE_NODES_MAX * -+ BKEY_BTREE_PTR_VAL_U64s_MAX]; -+ -+ /* New nodes, that will be made reachable by this update: */ -+ struct btree *new_nodes[BTREE_UPDATE_NODES_MAX]; -+ unsigned nr_new_nodes; -+ -+ open_bucket_idx_t open_buckets[BTREE_UPDATE_NODES_MAX * -+ BCH_REPLICAS_MAX]; -+ open_bucket_idx_t nr_open_buckets; -+ -+ unsigned journal_u64s; -+ u64 journal_entries[BTREE_UPDATE_JOURNAL_RES]; -+ -+ /* Only here to reduce stack usage on recursive splits: */ -+ struct keylist parent_keys; -+ /* -+ * Enough room for btree_split's keys without realloc - btree node -+ * pointers never have crc/compression info, so we only need to acount -+ * for the pointers for three keys -+ */ -+ u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3]; -+}; -+ -+void bch2_btree_node_free_inmem(struct bch_fs *, struct btree *, -+ struct btree_iter *); -+void bch2_btree_node_free_never_inserted(struct bch_fs *, struct btree *); -+ -+void bch2_btree_update_get_open_buckets(struct btree_update *, struct btree *); -+ -+struct btree 
*__bch2_btree_node_alloc_replacement(struct btree_update *, -+ struct btree *, -+ struct bkey_format); -+ -+void bch2_btree_update_done(struct btree_update *); -+struct btree_update * -+bch2_btree_update_start(struct btree_trans *, enum btree_id, unsigned, -+ unsigned, struct closure *); -+ -+void bch2_btree_interior_update_will_free_node(struct btree_update *, -+ struct btree *); -+void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); -+ -+void bch2_btree_insert_node(struct btree_update *, struct btree *, -+ struct btree_iter *, struct keylist *, -+ unsigned); -+int bch2_btree_split_leaf(struct bch_fs *, struct btree_iter *, unsigned); -+ -+void __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *, -+ unsigned, unsigned, enum btree_node_sibling); -+ -+static inline void bch2_foreground_maybe_merge_sibling(struct bch_fs *c, -+ struct btree_iter *iter, -+ unsigned level, unsigned flags, -+ enum btree_node_sibling sib) -+{ -+ struct btree *b; -+ -+ if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE) -+ return; -+ -+ if (!bch2_btree_node_relock(iter, level)) -+ return; -+ -+ b = iter->l[level].b; -+ if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold) -+ return; -+ -+ __bch2_foreground_maybe_merge(c, iter, level, flags, sib); -+} -+ -+static inline void bch2_foreground_maybe_merge(struct bch_fs *c, -+ struct btree_iter *iter, -+ unsigned level, -+ unsigned flags) -+{ -+ bch2_foreground_maybe_merge_sibling(c, iter, level, flags, -+ btree_prev_sib); -+ bch2_foreground_maybe_merge_sibling(c, iter, level, flags, -+ btree_next_sib); -+} -+ -+void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *); -+void bch2_btree_root_alloc(struct bch_fs *, enum btree_id); -+ -+static inline unsigned btree_update_reserve_required(struct bch_fs *c, -+ struct btree *b) -+{ -+ unsigned depth = btree_node_root(c, b)->c.level + 1; -+ -+ /* -+ * Number of nodes we might have to allocate in a worst case btree -+ * split operation - we split all 
the way up to the root, then allocate -+ * a new root, unless we're already at max depth: -+ */ -+ if (depth < BTREE_MAX_DEPTH) -+ return (depth - b->c.level) * 2 + 1; -+ else -+ return (depth - b->c.level) * 2 - 1; -+} -+ -+static inline void btree_node_reset_sib_u64s(struct btree *b) -+{ -+ b->sib_u64s[0] = b->nr.live_u64s; -+ b->sib_u64s[1] = b->nr.live_u64s; -+} -+ -+static inline void *btree_data_end(struct bch_fs *c, struct btree *b) -+{ -+ return (void *) b->data + btree_bytes(c); -+} -+ -+static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c, -+ struct btree *b) -+{ -+ return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s); -+} -+ -+static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c, -+ struct btree *b) -+{ -+ return btree_data_end(c, b); -+} -+ -+static inline void *write_block(struct btree *b) -+{ -+ return (void *) b->data + (b->written << 9); -+} -+ -+static inline bool __btree_addr_written(struct btree *b, void *p) -+{ -+ return p < write_block(b); -+} -+ -+static inline bool bset_written(struct btree *b, struct bset *i) -+{ -+ return __btree_addr_written(b, i); -+} -+ -+static inline bool bkey_written(struct btree *b, struct bkey_packed *k) -+{ -+ return __btree_addr_written(b, k); -+} -+ -+static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c, -+ struct btree *b, -+ void *end) -+{ -+ ssize_t used = bset_byte_offset(b, end) / sizeof(u64) + -+ b->whiteout_u64s; -+ ssize_t total = c->opts.btree_node_size << 6; -+ -+ return total - used; -+} -+ -+static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c, -+ struct btree *b) -+{ -+ ssize_t remaining = __bch_btree_u64s_remaining(c, b, -+ btree_bkey_last(b, bset_tree_last(b))); -+ -+ BUG_ON(remaining < 0); -+ -+ if (bset_written(b, btree_bset_last(b))) -+ return 0; -+ -+ return remaining; -+} -+ -+static inline unsigned btree_write_set_buffer(struct btree *b) -+{ -+ /* -+ * Could buffer up larger amounts of keys for btrees with 
larger keys, -+ * pending benchmarking: -+ */ -+ return 4 << 10; -+} -+ -+static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, -+ struct btree *b) -+{ -+ struct bset_tree *t = bset_tree_last(b); -+ struct btree_node_entry *bne = max(write_block(b), -+ (void *) btree_bkey_last(b, bset_tree_last(b))); -+ ssize_t remaining_space = -+ __bch_btree_u64s_remaining(c, b, &bne->keys.start[0]); -+ -+ if (unlikely(bset_written(b, bset(b, t)))) { -+ if (remaining_space > (ssize_t) (block_bytes(c) >> 3)) -+ return bne; -+ } else { -+ if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) && -+ remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3)) -+ return bne; -+ } -+ -+ return NULL; -+} -+ -+static inline void push_whiteout(struct bch_fs *c, struct btree *b, -+ struct bpos pos) -+{ -+ struct bkey_packed k; -+ -+ BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s); -+ -+ if (!bkey_pack_pos(&k, pos, b)) { -+ struct bkey *u = (void *) &k; -+ -+ bkey_init(u); -+ u->p = pos; -+ } -+ -+ k.needs_whiteout = true; -+ -+ b->whiteout_u64s += k.u64s; -+ bkey_copy(unwritten_whiteouts_start(c, b), &k); -+} -+ -+/* -+ * write lock must be held on @b (else the dirty bset that we were going to -+ * insert into could be written out from under us) -+ */ -+static inline bool bch2_btree_node_insert_fits(struct bch_fs *c, -+ struct btree *b, unsigned u64s) -+{ -+ if (unlikely(btree_node_need_rewrite(b))) -+ return false; -+ -+ return u64s <= bch_btree_keys_u64s_remaining(c, b); -+} -+ -+void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *); -+ -+size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *); -+ -+void bch2_journal_entries_to_btree_roots(struct bch_fs *, struct jset *); -+struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *, -+ struct jset_entry *, struct jset_entry *); -+ -+void bch2_fs_btree_interior_update_exit(struct bch_fs *); -+int bch2_fs_btree_interior_update_init(struct bch_fs *); -+ -+#endif /* 
_BCACHEFS_BTREE_UPDATE_INTERIOR_H */ -diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c -new file mode 100644 -index 000000000000..cd699c257244 ---- /dev/null -+++ b/fs/bcachefs/btree_update_leaf.c -@@ -0,0 +1,1171 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "btree_update.h" -+#include "btree_update_interior.h" -+#include "btree_gc.h" -+#include "btree_io.h" -+#include "btree_iter.h" -+#include "btree_key_cache.h" -+#include "btree_locking.h" -+#include "buckets.h" -+#include "debug.h" -+#include "error.h" -+#include "extent_update.h" -+#include "journal.h" -+#include "journal_reclaim.h" -+#include "keylist.h" -+#include "replicas.h" -+ -+#include -+#include -+#include -+ -+static inline bool same_leaf_as_prev(struct btree_trans *trans, -+ struct btree_insert_entry *i) -+{ -+ return i != trans->updates2 && -+ iter_l(i[0].iter)->b == iter_l(i[-1].iter)->b; -+} -+ -+inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, -+ struct btree_iter *iter) -+{ -+ bch2_btree_node_lock_write(b, iter); -+ -+ if (btree_iter_type(iter) == BTREE_ITER_CACHED) -+ return; -+ -+ if (unlikely(btree_node_just_written(b)) && -+ bch2_btree_post_write_cleanup(c, b)) -+ bch2_btree_iter_reinit_node(iter, b); -+ -+ /* -+ * If the last bset has been written, or if it's gotten too big - start -+ * a new bset to insert into: -+ */ -+ if (want_new_bset(c, b)) -+ bch2_btree_init_next(c, b, iter); -+} -+ -+/* Inserting into a given leaf node (last stage of insert): */ -+ -+/* Handle overwrites and do insert, for non extents: */ -+bool bch2_btree_bset_insert_key(struct btree_iter *iter, -+ struct btree *b, -+ struct btree_node_iter *node_iter, -+ struct bkey_i *insert) -+{ -+ struct bkey_packed *k; -+ unsigned clobber_u64s = 0, new_u64s = 0; -+ -+ EBUG_ON(btree_node_just_written(b)); -+ EBUG_ON(bset_written(b, btree_bset_last(b))); -+ EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); -+ 
EBUG_ON(bkey_cmp(b->data->min_key, POS_MIN) && -+ bkey_cmp(bkey_start_pos(&insert->k), -+ bkey_predecessor(b->data->min_key)) < 0); -+ EBUG_ON(bkey_cmp(insert->k.p, b->data->min_key) < 0); -+ EBUG_ON(bkey_cmp(insert->k.p, b->data->max_key) > 0); -+ EBUG_ON(insert->k.u64s > -+ bch_btree_keys_u64s_remaining(iter->trans->c, b)); -+ EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); -+ -+ k = bch2_btree_node_iter_peek_all(node_iter, b); -+ if (k && bkey_cmp_packed(b, k, &insert->k)) -+ k = NULL; -+ -+ /* @k is the key being overwritten/deleted, if any: */ -+ EBUG_ON(k && bkey_whiteout(k)); -+ -+ /* Deleting, but not found? nothing to do: */ -+ if (bkey_whiteout(&insert->k) && !k) -+ return false; -+ -+ if (bkey_whiteout(&insert->k)) { -+ /* Deleting: */ -+ btree_account_key_drop(b, k); -+ k->type = KEY_TYPE_deleted; -+ -+ if (k->needs_whiteout) -+ push_whiteout(iter->trans->c, b, insert->k.p); -+ k->needs_whiteout = false; -+ -+ if (k >= btree_bset_last(b)->start) { -+ clobber_u64s = k->u64s; -+ bch2_bset_delete(b, k, clobber_u64s); -+ goto fix_iter; -+ } else { -+ bch2_btree_iter_fix_key_modified(iter, b, k); -+ } -+ -+ return true; -+ } -+ -+ if (k) { -+ /* Overwriting: */ -+ btree_account_key_drop(b, k); -+ k->type = KEY_TYPE_deleted; -+ -+ insert->k.needs_whiteout = k->needs_whiteout; -+ k->needs_whiteout = false; -+ -+ if (k >= btree_bset_last(b)->start) { -+ clobber_u64s = k->u64s; -+ goto overwrite; -+ } else { -+ bch2_btree_iter_fix_key_modified(iter, b, k); -+ } -+ } -+ -+ k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b)); -+overwrite: -+ bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); -+ new_u64s = k->u64s; -+fix_iter: -+ if (clobber_u64s != new_u64s) -+ bch2_btree_node_iter_fix(iter, b, node_iter, k, -+ clobber_u64s, new_u64s); -+ return true; -+} -+ -+static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, -+ unsigned i, u64 seq) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct 
btree_write *w = container_of(pin, struct btree_write, journal); -+ struct btree *b = container_of(w, struct btree, writes[i]); -+ -+ btree_node_lock_type(c, b, SIX_LOCK_read); -+ bch2_btree_node_write_cond(c, b, -+ (btree_current_write(b) == w && w->journal.seq == seq)); -+ six_unlock_read(&b->c.lock); -+} -+ -+static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) -+{ -+ return __btree_node_flush(j, pin, 0, seq); -+} -+ -+static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) -+{ -+ return __btree_node_flush(j, pin, 1, seq); -+} -+ -+inline void bch2_btree_add_journal_pin(struct bch_fs *c, -+ struct btree *b, u64 seq) -+{ -+ struct btree_write *w = btree_current_write(b); -+ -+ bch2_journal_pin_add(&c->journal, seq, &w->journal, -+ btree_node_write_idx(b) == 0 -+ ? btree_node_flush0 -+ : btree_node_flush1); -+} -+ -+/** -+ * btree_insert_key - insert a key one key into a leaf node -+ */ -+static bool btree_insert_key_leaf(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_i *insert) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree *b = iter_l(iter)->b; -+ struct bset_tree *t = bset_tree_last(b); -+ struct bset *i = bset(b, t); -+ int old_u64s = bset_u64s(t); -+ int old_live_u64s = b->nr.live_u64s; -+ int live_u64s_added, u64s_added; -+ -+ EBUG_ON(!iter->level && -+ !test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)); -+ -+ if (unlikely(!bch2_btree_bset_insert_key(iter, b, -+ &iter_l(iter)->iter, insert))) -+ return false; -+ -+ i->journal_seq = cpu_to_le64(max(trans->journal_res.seq, -+ le64_to_cpu(i->journal_seq))); -+ -+ bch2_btree_add_journal_pin(c, b, trans->journal_res.seq); -+ -+ if (unlikely(!btree_node_dirty(b))) -+ set_btree_node_dirty(b); -+ -+ live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; -+ u64s_added = (int) bset_u64s(t) - old_u64s; -+ -+ if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) -+ b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + 
live_u64s_added); -+ if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) -+ b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); -+ -+ if (u64s_added > live_u64s_added && -+ bch2_maybe_compact_whiteouts(c, b)) -+ bch2_btree_iter_reinit_node(iter, b); -+ -+ trace_btree_insert_key(c, b, insert); -+ return true; -+} -+ -+/* Cached btree updates: */ -+ -+/* Normal update interface: */ -+ -+static inline void btree_insert_entry_checks(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_i *insert) -+{ -+ struct bch_fs *c = trans->c; -+ -+ BUG_ON(bkey_cmp(insert->k.p, iter->pos)); -+ BUG_ON(debug_check_bkeys(c) && -+ bch2_bkey_invalid(c, bkey_i_to_s_c(insert), -+ __btree_node_type(iter->level, iter->btree_id))); -+} -+ -+static noinline int -+bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s) -+{ -+ struct bch_fs *c = trans->c; -+ int ret; -+ -+ bch2_trans_unlock(trans); -+ -+ ret = bch2_journal_preres_get(&c->journal, -+ &trans->journal_preres, u64s, 0); -+ if (ret) -+ return ret; -+ -+ if (!bch2_trans_relock(trans)) { -+ trace_trans_restart_journal_preres_get(trans->ip); -+ return -EINTR; -+ } -+ -+ return 0; -+} -+ -+static inline int bch2_trans_journal_res_get(struct btree_trans *trans, -+ unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ int ret; -+ -+ if (trans->flags & BTREE_INSERT_JOURNAL_RESERVED) -+ flags |= JOURNAL_RES_GET_RESERVED; -+ -+ ret = bch2_journal_res_get(&c->journal, &trans->journal_res, -+ trans->journal_u64s, flags); -+ -+ return ret == -EAGAIN ? 
BTREE_INSERT_NEED_JOURNAL_RES : ret; -+} -+ -+static enum btree_insert_ret -+btree_key_can_insert(struct btree_trans *trans, -+ struct btree_iter *iter, -+ unsigned u64s) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree *b = iter_l(iter)->b; -+ -+ if (!bch2_btree_node_insert_fits(c, b, u64s)) -+ return BTREE_INSERT_BTREE_NODE_FULL; -+ -+ return BTREE_INSERT_OK; -+} -+ -+static enum btree_insert_ret -+btree_key_can_insert_cached(struct btree_trans *trans, -+ struct btree_iter *iter, -+ unsigned u64s) -+{ -+ struct bkey_cached *ck = (void *) iter->l[0].b; -+ unsigned new_u64s; -+ struct bkey_i *new_k; -+ -+ BUG_ON(iter->level); -+ -+ if (u64s <= ck->u64s) -+ return BTREE_INSERT_OK; -+ -+ new_u64s = roundup_pow_of_two(u64s); -+ new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS); -+ if (!new_k) -+ return -ENOMEM; -+ -+ ck->u64s = new_u64s; -+ ck->k = new_k; -+ return BTREE_INSERT_OK; -+} -+ -+static inline void do_btree_insert_one(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_i *insert) -+{ -+ struct bch_fs *c = trans->c; -+ struct journal *j = &c->journal; -+ bool did_work; -+ -+ EBUG_ON(trans->journal_res.ref != -+ !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); -+ -+ insert->k.needs_whiteout = false; -+ -+ did_work = (btree_iter_type(iter) != BTREE_ITER_CACHED) -+ ? 
btree_insert_key_leaf(trans, iter, insert) -+ : bch2_btree_insert_key_cached(trans, iter, insert); -+ if (!did_work) -+ return; -+ -+ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { -+ bch2_journal_add_keys(j, &trans->journal_res, -+ iter->btree_id, insert); -+ -+ bch2_journal_set_has_inode(j, &trans->journal_res, -+ insert->k.p.inode); -+ -+ if (trans->journal_seq) -+ *trans->journal_seq = trans->journal_res.seq; -+ } -+} -+ -+static inline bool iter_has_trans_triggers(struct btree_iter *iter) -+{ -+ return BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << iter->btree_id); -+} -+ -+static inline bool iter_has_nontrans_triggers(struct btree_iter *iter) -+{ -+ return (BTREE_NODE_TYPE_HAS_TRIGGERS & -+ ~BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS) & -+ (1U << iter->btree_id); -+} -+ -+static noinline void bch2_btree_iter_unlock_noinline(struct btree_iter *iter) -+{ -+ __bch2_btree_iter_unlock(iter); -+} -+ -+static noinline void bch2_trans_mark_gc(struct btree_trans *trans) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_insert_entry *i; -+ -+ trans_for_each_update(trans, i) { -+ /* -+ * XXX: synchronization of cached update triggers with gc -+ */ -+ BUG_ON(btree_iter_type(i->iter) == BTREE_ITER_CACHED); -+ -+ if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) -+ bch2_mark_update(trans, i->iter, i->k, NULL, -+ i->trigger_flags|BTREE_TRIGGER_GC); -+ } -+} -+ -+static inline int -+bch2_trans_commit_write_locked(struct btree_trans *trans, -+ struct btree_insert_entry **stopped_at) -+{ -+ struct bch_fs *c = trans->c; -+ struct bch_fs_usage *fs_usage = NULL; -+ struct btree_insert_entry *i; -+ unsigned u64s = 0; -+ bool marking = false; -+ int ret; -+ -+ if (race_fault()) { -+ trace_trans_restart_fault_inject(trans->ip); -+ return -EINTR; -+ } -+ -+ /* -+ * Check if the insert will fit in the leaf node with the write lock -+ * held, otherwise another thread could write the node changing the -+ * amount of space available: -+ */ -+ -+ 
prefetch(&trans->c->journal.flags); -+ -+ trans_for_each_update2(trans, i) { -+ /* Multiple inserts might go to same leaf: */ -+ if (!same_leaf_as_prev(trans, i)) -+ u64s = 0; -+ -+ u64s += i->k->k.u64s; -+ ret = btree_iter_type(i->iter) != BTREE_ITER_CACHED -+ ? btree_key_can_insert(trans, i->iter, u64s) -+ : btree_key_can_insert_cached(trans, i->iter, u64s); -+ if (ret) { -+ *stopped_at = i; -+ return ret; -+ } -+ -+ if (btree_node_type_needs_gc(i->iter->btree_id)) -+ marking = true; -+ } -+ -+ if (marking) { -+ percpu_down_read(&c->mark_lock); -+ fs_usage = bch2_fs_usage_scratch_get(c); -+ } -+ -+ /* -+ * Don't get journal reservation until after we know insert will -+ * succeed: -+ */ -+ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { -+ ret = bch2_trans_journal_res_get(trans, -+ JOURNAL_RES_GET_NONBLOCK); -+ if (ret) -+ goto err; -+ } else { -+ trans->journal_res.seq = c->journal.replay_journal_seq; -+ } -+ -+ if (unlikely(trans->extra_journal_entry_u64s)) { -+ memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), -+ trans->extra_journal_entries, -+ trans->extra_journal_entry_u64s); -+ -+ trans->journal_res.offset += trans->extra_journal_entry_u64s; -+ trans->journal_res.u64s -= trans->extra_journal_entry_u64s; -+ } -+ -+ /* -+ * Not allowed to fail after we've gotten our journal reservation - we -+ * have to use it: -+ */ -+ -+ if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { -+ if (journal_seq_verify(c)) -+ trans_for_each_update2(trans, i) -+ i->k->k.version.lo = trans->journal_res.seq; -+ else if (inject_invalid_keys(c)) -+ trans_for_each_update2(trans, i) -+ i->k->k.version = MAX_VERSION; -+ } -+ -+ /* Must be called under mark_lock: */ -+ if (marking && trans->fs_usage_deltas && -+ bch2_replicas_delta_list_apply(c, fs_usage, -+ trans->fs_usage_deltas)) { -+ ret = BTREE_INSERT_NEED_MARK_REPLICAS; -+ goto err; -+ } -+ -+ trans_for_each_update(trans, i) -+ if (iter_has_nontrans_triggers(i->iter)) -+ bch2_mark_update(trans, 
i->iter, i->k, -+ fs_usage, i->trigger_flags); -+ -+ if (marking) -+ bch2_trans_fs_usage_apply(trans, fs_usage); -+ -+ if (unlikely(c->gc_pos.phase)) -+ bch2_trans_mark_gc(trans); -+ -+ trans_for_each_update2(trans, i) -+ do_btree_insert_one(trans, i->iter, i->k); -+err: -+ if (marking) { -+ bch2_fs_usage_scratch_put(c, fs_usage); -+ percpu_up_read(&c->mark_lock); -+ } -+ -+ return ret; -+} -+ -+/* -+ * Get journal reservation, take write locks, and attempt to do btree update(s): -+ */ -+static inline int do_bch2_trans_commit(struct btree_trans *trans, -+ struct btree_insert_entry **stopped_at) -+{ -+ struct btree_insert_entry *i; -+ struct btree_iter *iter; -+ int ret; -+ -+ trans_for_each_update2(trans, i) -+ BUG_ON(!btree_node_intent_locked(i->iter, i->iter->level)); -+ -+ ret = bch2_journal_preres_get(&trans->c->journal, -+ &trans->journal_preres, trans->journal_preres_u64s, -+ JOURNAL_RES_GET_NONBLOCK| -+ ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) -+ ? JOURNAL_RES_GET_RECLAIM : 0)); -+ if (unlikely(ret == -EAGAIN)) -+ ret = bch2_trans_journal_preres_get_cold(trans, -+ trans->journal_preres_u64s); -+ if (unlikely(ret)) -+ return ret; -+ -+ /* -+ * Can't be holding any read locks when we go to take write locks: -+ * -+ * note - this must be done after bch2_trans_journal_preres_get_cold() -+ * or anything else that might call bch2_trans_relock(), since that -+ * would just retake the read locks: -+ */ -+ trans_for_each_iter(trans, iter) { -+ if (iter->nodes_locked != iter->nodes_intent_locked) { -+ EBUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); -+ EBUG_ON(trans->iters_live & (1ULL << iter->idx)); -+ bch2_btree_iter_unlock_noinline(iter); -+ } -+ } -+ -+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) -+ trans_for_each_update2(trans, i) -+ btree_insert_entry_checks(trans, i->iter, i->k); -+ bch2_btree_trans_verify_locks(trans); -+ -+ trans_for_each_update2(trans, i) -+ if (!same_leaf_as_prev(trans, i)) -+ bch2_btree_node_lock_for_insert(trans->c, -+ 
iter_l(i->iter)->b, i->iter); -+ -+ ret = bch2_trans_commit_write_locked(trans, stopped_at); -+ -+ trans_for_each_update2(trans, i) -+ if (!same_leaf_as_prev(trans, i)) -+ bch2_btree_node_unlock_write_inlined(iter_l(i->iter)->b, -+ i->iter); -+ -+ if (!ret && trans->journal_pin) -+ bch2_journal_pin_add(&trans->c->journal, trans->journal_res.seq, -+ trans->journal_pin, NULL); -+ -+ /* -+ * Drop journal reservation after dropping write locks, since dropping -+ * the journal reservation may kick off a journal write: -+ */ -+ bch2_journal_res_put(&trans->c->journal, &trans->journal_res); -+ -+ if (unlikely(ret)) -+ return ret; -+ -+ if (trans->flags & BTREE_INSERT_NOUNLOCK) -+ trans->nounlock = true; -+ -+ trans_for_each_update2(trans, i) -+ if (btree_iter_type(i->iter) != BTREE_ITER_CACHED && -+ !same_leaf_as_prev(trans, i)) -+ bch2_foreground_maybe_merge(trans->c, i->iter, -+ 0, trans->flags); -+ -+ trans->nounlock = false; -+ -+ bch2_trans_downgrade(trans); -+ -+ return 0; -+} -+ -+static noinline -+int bch2_trans_commit_error(struct btree_trans *trans, -+ struct btree_insert_entry *i, -+ int ret) -+{ -+ struct bch_fs *c = trans->c; -+ unsigned flags = trans->flags; -+ -+ /* -+ * BTREE_INSERT_NOUNLOCK means don't unlock _after_ successful btree -+ * update; if we haven't done anything yet it doesn't apply -+ */ -+ flags &= ~BTREE_INSERT_NOUNLOCK; -+ -+ switch (ret) { -+ case BTREE_INSERT_BTREE_NODE_FULL: -+ ret = bch2_btree_split_leaf(c, i->iter, flags); -+ -+ /* -+ * if the split succeeded without dropping locks the insert will -+ * still be atomic (what the caller peeked() and is overwriting -+ * won't have changed) -+ */ -+#if 0 -+ /* -+ * XXX: -+ * split -> btree node merging (of parent node) might still drop -+ * locks when we're not passing it BTREE_INSERT_NOUNLOCK -+ * -+ * we don't want to pass BTREE_INSERT_NOUNLOCK to split as that -+ * will inhibit merging - but we don't have a reliable way yet -+ * (do we?) 
of checking if we dropped locks in this path -+ */ -+ if (!ret) -+ goto retry; -+#endif -+ -+ /* -+ * don't care if we got ENOSPC because we told split it -+ * couldn't block: -+ */ -+ if (!ret || -+ ret == -EINTR || -+ (flags & BTREE_INSERT_NOUNLOCK)) { -+ trace_trans_restart_btree_node_split(trans->ip); -+ ret = -EINTR; -+ } -+ break; -+ case BTREE_INSERT_ENOSPC: -+ ret = -ENOSPC; -+ break; -+ case BTREE_INSERT_NEED_MARK_REPLICAS: -+ bch2_trans_unlock(trans); -+ -+ trans_for_each_update(trans, i) { -+ ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(i->k)); -+ if (ret) -+ return ret; -+ } -+ -+ if (bch2_trans_relock(trans)) -+ return 0; -+ -+ trace_trans_restart_mark_replicas(trans->ip); -+ ret = -EINTR; -+ break; -+ case BTREE_INSERT_NEED_JOURNAL_RES: -+ bch2_trans_unlock(trans); -+ -+ ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_CHECK); -+ if (ret) -+ return ret; -+ -+ if (bch2_trans_relock(trans)) -+ return 0; -+ -+ trace_trans_restart_journal_res_get(trans->ip); -+ ret = -EINTR; -+ break; -+ default: -+ BUG_ON(ret >= 0); -+ break; -+ } -+ -+ if (ret == -EINTR) { -+ int ret2 = bch2_btree_iter_traverse_all(trans); -+ -+ if (ret2) { -+ trace_trans_restart_traverse(trans->ip); -+ return ret2; -+ } -+ -+ trace_trans_restart_atomic(trans->ip); -+ } -+ -+ return ret; -+} -+ -+static noinline int -+bch2_trans_commit_get_rw_cold(struct btree_trans *trans) -+{ -+ struct bch_fs *c = trans->c; -+ int ret; -+ -+ if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW))) -+ return -EROFS; -+ -+ bch2_trans_unlock(trans); -+ -+ ret = bch2_fs_read_write_early(c); -+ if (ret) -+ return ret; -+ -+ percpu_ref_get(&c->writes); -+ return 0; -+} -+ -+static void bch2_trans_update2(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_i *insert) -+{ -+ struct btree_insert_entry *i, n = (struct btree_insert_entry) { -+ .iter = iter, .k = insert -+ }; -+ -+ btree_insert_entry_checks(trans, n.iter, n.k); -+ -+ BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); -+ -+ 
EBUG_ON(trans->nr_updates2 >= trans->nr_iters); -+ -+ iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; -+ -+ trans_for_each_update2(trans, i) { -+ if (btree_iter_cmp(n.iter, i->iter) == 0) { -+ *i = n; -+ return; -+ } -+ -+ if (btree_iter_cmp(n.iter, i->iter) <= 0) -+ break; -+ } -+ -+ array_insert_item(trans->updates2, trans->nr_updates2, -+ i - trans->updates2, n); -+} -+ -+static int extent_update_to_keys(struct btree_trans *trans, -+ struct btree_iter *orig_iter, -+ struct bkey_i *insert) -+{ -+ struct btree_iter *iter; -+ int ret; -+ -+ ret = bch2_extent_can_insert(trans, orig_iter, insert); -+ if (ret) -+ return ret; -+ -+ if (bkey_deleted(&insert->k)) -+ return 0; -+ -+ iter = bch2_trans_copy_iter(trans, orig_iter); -+ if (IS_ERR(iter)) -+ return PTR_ERR(iter); -+ -+ iter->flags |= BTREE_ITER_INTENT; -+ __bch2_btree_iter_set_pos(iter, insert->k.p, false); -+ bch2_trans_update2(trans, iter, insert); -+ bch2_trans_iter_put(trans, iter); -+ return 0; -+} -+ -+static int extent_handle_overwrites(struct btree_trans *trans, -+ enum btree_id btree_id, -+ struct bpos start, struct bpos end) -+{ -+ struct btree_iter *iter = NULL, *update_iter; -+ struct bkey_i *update; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ iter = bch2_trans_get_iter(trans, btree_id, start, BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(iter); -+ if (ret) -+ return ret; -+ -+ k = bch2_btree_iter_peek_with_updates(iter); -+ -+ while (k.k && !(ret = bkey_err(k))) { -+ if (bkey_cmp(end, bkey_start_pos(k.k)) <= 0) -+ break; -+ -+ if (bkey_cmp(bkey_start_pos(k.k), start) < 0) { -+ update_iter = bch2_trans_copy_iter(trans, iter); -+ if ((ret = PTR_ERR_OR_ZERO(update_iter))) -+ goto err; -+ -+ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); -+ if ((ret = PTR_ERR_OR_ZERO(update))) -+ goto err; -+ -+ bkey_reassemble(update, k); -+ bch2_cut_back(start, update); -+ -+ __bch2_btree_iter_set_pos(update_iter, update->k.p, false); -+ bch2_trans_update2(trans, update_iter, update); -+ bch2_trans_iter_put(trans, 
update_iter); -+ } -+ -+ if (bkey_cmp(k.k->p, end) > 0) { -+ update_iter = bch2_trans_copy_iter(trans, iter); -+ if ((ret = PTR_ERR_OR_ZERO(update_iter))) -+ goto err; -+ -+ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); -+ if ((ret = PTR_ERR_OR_ZERO(update))) -+ goto err; -+ -+ bkey_reassemble(update, k); -+ bch2_cut_front(end, update); -+ -+ __bch2_btree_iter_set_pos(update_iter, update->k.p, false); -+ bch2_trans_update2(trans, update_iter, update); -+ bch2_trans_iter_put(trans, update_iter); -+ } else { -+ update_iter = bch2_trans_copy_iter(trans, iter); -+ if ((ret = PTR_ERR_OR_ZERO(update_iter))) -+ goto err; -+ -+ update = bch2_trans_kmalloc(trans, sizeof(struct bkey)); -+ if ((ret = PTR_ERR_OR_ZERO(update))) -+ goto err; -+ -+ update->k = *k.k; -+ set_bkey_val_u64s(&update->k, 0); -+ update->k.type = KEY_TYPE_deleted; -+ update->k.size = 0; -+ -+ __bch2_btree_iter_set_pos(update_iter, update->k.p, false); -+ bch2_trans_update2(trans, update_iter, update); -+ bch2_trans_iter_put(trans, update_iter); -+ } -+ -+ k = bch2_btree_iter_next_with_updates(iter); -+ } -+err: -+ if (!IS_ERR_OR_NULL(iter)) -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+int __bch2_trans_commit(struct btree_trans *trans) -+{ -+ struct btree_insert_entry *i = NULL; -+ struct btree_iter *iter; -+ bool trans_trigger_run; -+ unsigned u64s; -+ int ret = 0; -+ -+ BUG_ON(trans->need_reset); -+ -+ if (!trans->nr_updates) -+ goto out_noupdates; -+ -+ if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) -+ lockdep_assert_held(&trans->c->gc_lock); -+ -+ memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); -+ -+ trans->journal_u64s = trans->extra_journal_entry_u64s; -+ trans->journal_preres_u64s = 0; -+ -+ if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) && -+ unlikely(!percpu_ref_tryget(&trans->c->writes))) { -+ ret = bch2_trans_commit_get_rw_cold(trans); -+ if (ret) -+ return ret; -+ } -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ trans_for_each_update(trans, i) -+ if 
(btree_iter_type(i->iter) != BTREE_ITER_CACHED && -+ !(i->trigger_flags & BTREE_TRIGGER_NORUN)) -+ bch2_btree_key_cache_verify_clean(trans, -+ i->iter->btree_id, i->iter->pos); -+#endif -+ -+ /* -+ * Running triggers will append more updates to the list of updates as -+ * we're walking it: -+ */ -+ do { -+ trans_trigger_run = false; -+ -+ trans_for_each_update(trans, i) { -+ if (unlikely(i->iter->uptodate > BTREE_ITER_NEED_PEEK && -+ (ret = bch2_btree_iter_traverse(i->iter)))) { -+ trace_trans_restart_traverse(trans->ip); -+ goto out; -+ } -+ -+ /* -+ * We're not using bch2_btree_iter_upgrade here because -+ * we know trans->nounlock can't be set: -+ */ -+ if (unlikely(i->iter->locks_want < 1 && -+ !__bch2_btree_iter_upgrade(i->iter, 1))) { -+ trace_trans_restart_upgrade(trans->ip); -+ ret = -EINTR; -+ goto out; -+ } -+ -+ if (iter_has_trans_triggers(i->iter) && -+ !i->trans_triggers_run) { -+ i->trans_triggers_run = true; -+ trans_trigger_run = true; -+ -+ ret = bch2_trans_mark_update(trans, i->iter, i->k, -+ i->trigger_flags); -+ if (unlikely(ret)) { -+ if (ret == -EINTR) -+ trace_trans_restart_mark(trans->ip); -+ goto out; -+ } -+ } -+ } -+ } while (trans_trigger_run); -+ -+ /* Turn extents updates into keys: */ -+ trans_for_each_update(trans, i) -+ if (i->iter->flags & BTREE_ITER_IS_EXTENTS) { -+ struct bpos start = bkey_start_pos(&i->k->k); -+ -+ while (i + 1 < trans->updates + trans->nr_updates && -+ i[0].iter->btree_id == i[1].iter->btree_id && -+ !bkey_cmp(i[0].k->k.p, bkey_start_pos(&i[1].k->k))) -+ i++; -+ -+ ret = extent_handle_overwrites(trans, i->iter->btree_id, -+ start, i->k->k.p); -+ if (ret) -+ goto out; -+ } -+ -+ trans_for_each_update(trans, i) { -+ if (i->iter->flags & BTREE_ITER_IS_EXTENTS) { -+ ret = extent_update_to_keys(trans, i->iter, i->k); -+ if (ret) -+ goto out; -+ } else { -+ bch2_trans_update2(trans, i->iter, i->k); -+ } -+ } -+ -+ trans_for_each_update2(trans, i) { -+ BUG_ON(i->iter->uptodate > BTREE_ITER_NEED_PEEK); -+ 
BUG_ON(i->iter->locks_want < 1); -+ -+ u64s = jset_u64s(i->k->k.u64s); -+ if (btree_iter_type(i->iter) == BTREE_ITER_CACHED && -+ likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) -+ trans->journal_preres_u64s += u64s; -+ trans->journal_u64s += u64s; -+ } -+retry: -+ memset(&trans->journal_res, 0, sizeof(trans->journal_res)); -+ -+ ret = do_bch2_trans_commit(trans, &i); -+ -+ /* make sure we didn't drop or screw up locks: */ -+ bch2_btree_trans_verify_locks(trans); -+ -+ if (ret) -+ goto err; -+ -+ trans_for_each_iter(trans, iter) -+ if ((trans->iters_live & (1ULL << iter->idx)) && -+ (iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT)) { -+ if (trans->flags & BTREE_INSERT_NOUNLOCK) -+ bch2_btree_iter_set_pos_same_leaf(iter, iter->pos_after_commit); -+ else -+ bch2_btree_iter_set_pos(iter, iter->pos_after_commit); -+ } -+out: -+ bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); -+ -+ if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) -+ percpu_ref_put(&trans->c->writes); -+out_noupdates: -+ bch2_trans_reset(trans, !ret ? TRANS_RESET_NOTRAVERSE : 0); -+ -+ return ret; -+err: -+ ret = bch2_trans_commit_error(trans, i, ret); -+ if (ret) -+ goto out; -+ -+ goto retry; -+} -+ -+int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, -+ struct bkey_i *k, enum btree_trigger_flags flags) -+{ -+ struct btree_insert_entry *i, n = (struct btree_insert_entry) { -+ .trigger_flags = flags, .iter = iter, .k = k -+ }; -+ -+ EBUG_ON(bkey_cmp(iter->pos, -+ (iter->flags & BTREE_ITER_IS_EXTENTS) -+ ? 
bkey_start_pos(&k->k) -+ : k->k.p)); -+ -+ iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; -+ -+ if (btree_node_type_is_extents(iter->btree_id)) { -+ iter->pos_after_commit = k->k.p; -+ iter->flags |= BTREE_ITER_SET_POS_AFTER_COMMIT; -+ } -+ -+ /* -+ * Pending updates are kept sorted: first, find position of new update: -+ */ -+ trans_for_each_update(trans, i) -+ if (btree_iter_cmp(iter, i->iter) <= 0) -+ break; -+ -+ /* -+ * Now delete/trim any updates the new update overwrites: -+ */ -+ if (i > trans->updates && -+ i[-1].iter->btree_id == iter->btree_id && -+ bkey_cmp(iter->pos, i[-1].k->k.p) < 0) -+ bch2_cut_back(n.iter->pos, i[-1].k); -+ -+ while (i < trans->updates + trans->nr_updates && -+ iter->btree_id == i->iter->btree_id && -+ bkey_cmp(n.k->k.p, i->k->k.p) >= 0) -+ array_remove_item(trans->updates, trans->nr_updates, -+ i - trans->updates); -+ -+ if (i < trans->updates + trans->nr_updates && -+ iter->btree_id == i->iter->btree_id && -+ bkey_cmp(n.k->k.p, i->iter->pos) > 0) { -+ /* -+ * When we have an extent that overwrites the start of another -+ * update, trimming that extent will mean the iterator's -+ * position has to change since the iterator position has to -+ * match the extent's start pos - but we don't want to change -+ * the iterator pos if some other code is using it, so we may -+ * need to clone it: -+ */ -+ if (trans->iters_live & (1ULL << i->iter->idx)) { -+ i->iter = bch2_trans_copy_iter(trans, i->iter); -+ if (IS_ERR(i->iter)) { -+ trans->need_reset = true; -+ return PTR_ERR(i->iter); -+ } -+ -+ i->iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; -+ bch2_trans_iter_put(trans, i->iter); -+ } -+ -+ bch2_cut_front(n.k->k.p, i->k); -+ bch2_btree_iter_set_pos(i->iter, n.k->k.p); -+ } -+ -+ EBUG_ON(trans->nr_updates >= trans->nr_iters); -+ -+ array_insert_item(trans->updates, trans->nr_updates, -+ i - trans->updates, n); -+ return 0; -+} -+ -+int __bch2_btree_insert(struct btree_trans *trans, -+ enum btree_id id, struct bkey_i *k) -+{ -+ struct 
btree_iter *iter; -+ int ret; -+ -+ iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k), -+ BTREE_ITER_INTENT); -+ if (IS_ERR(iter)) -+ return PTR_ERR(iter); -+ -+ ret = bch2_btree_iter_traverse(iter) ?: -+ bch2_trans_update(trans, iter, k, 0); -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+/** -+ * bch2_btree_insert - insert keys into the extent btree -+ * @c: pointer to struct bch_fs -+ * @id: btree to insert into -+ * @insert_keys: list of keys to insert -+ * @hook: insert callback -+ */ -+int bch2_btree_insert(struct bch_fs *c, enum btree_id id, -+ struct bkey_i *k, -+ struct disk_reservation *disk_res, -+ u64 *journal_seq, int flags) -+{ -+ return bch2_trans_do(c, disk_res, journal_seq, flags, -+ __bch2_btree_insert(&trans, id, k)); -+} -+ -+int bch2_btree_delete_at_range(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bpos end, -+ u64 *journal_seq) -+{ -+ struct bkey_s_c k; -+ int ret = 0; -+retry: -+ while ((k = bch2_btree_iter_peek(iter)).k && -+ !(ret = bkey_err(k)) && -+ bkey_cmp(iter->pos, end) < 0) { -+ struct bkey_i delete; -+ -+ bch2_trans_begin(trans); -+ -+ bkey_init(&delete.k); -+ -+ /* -+ * For extents, iter.pos won't necessarily be the same as -+ * bkey_start_pos(k.k) (for non extents they always will be the -+ * same). It's important that we delete starting from iter.pos -+ * because the range we want to delete could start in the middle -+ * of k. -+ * -+ * (bch2_btree_iter_peek() does guarantee that iter.pos >= -+ * bkey_start_pos(k.k)). 
-+ */ -+ delete.k.p = iter->pos; -+ -+ if (btree_node_type_is_extents(iter->btree_id)) { -+ unsigned max_sectors = -+ KEY_SIZE_MAX & (~0 << trans->c->block_bits); -+ -+ /* create the biggest key we can */ -+ bch2_key_resize(&delete.k, max_sectors); -+ bch2_cut_back(end, &delete); -+ -+ ret = bch2_extent_trim_atomic(&delete, iter); -+ if (ret) -+ break; -+ } -+ -+ bch2_trans_update(trans, iter, &delete, 0); -+ ret = bch2_trans_commit(trans, NULL, journal_seq, -+ BTREE_INSERT_NOFAIL); -+ if (ret) -+ break; -+ -+ bch2_trans_cond_resched(trans); -+ } -+ -+ if (ret == -EINTR) { -+ ret = 0; -+ goto retry; -+ } -+ -+ return ret; -+ -+} -+ -+int bch2_btree_delete_at(struct btree_trans *trans, -+ struct btree_iter *iter, unsigned flags) -+{ -+ struct bkey_i k; -+ -+ bkey_init(&k.k); -+ k.k.p = iter->pos; -+ -+ bch2_trans_update(trans, iter, &k, 0); -+ return bch2_trans_commit(trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_USE_RESERVE|flags); -+} -+ -+/* -+ * bch_btree_delete_range - delete everything within a given range -+ * -+ * Range is a half open interval - [start, end) -+ */ -+int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, -+ struct bpos start, struct bpos end, -+ u64 *journal_seq) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ int ret = 0; -+ -+ /* -+ * XXX: whether we need mem/more iters depends on whether this btree id -+ * has triggers -+ */ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512); -+ -+ iter = bch2_trans_get_iter(&trans, id, start, BTREE_ITER_INTENT); -+ -+ ret = bch2_btree_delete_at_range(&trans, iter, end, journal_seq); -+ ret = bch2_trans_exit(&trans) ?: ret; -+ -+ BUG_ON(ret == -EINTR); -+ return ret; -+} -diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c -new file mode 100644 -index 000000000000..97a8af31ded1 ---- /dev/null -+++ b/fs/bcachefs/buckets.c -@@ -0,0 +1,2145 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Code for manipulating bucket marks for garbage collection. 
-+ * -+ * Copyright 2014 Datera, Inc. -+ * -+ * Bucket states: -+ * - free bucket: mark == 0 -+ * The bucket contains no data and will not be read -+ * -+ * - allocator bucket: owned_by_allocator == 1 -+ * The bucket is on a free list, or it is an open bucket -+ * -+ * - cached bucket: owned_by_allocator == 0 && -+ * dirty_sectors == 0 && -+ * cached_sectors > 0 -+ * The bucket contains data but may be safely discarded as there are -+ * enough replicas of the data on other cache devices, or it has been -+ * written back to the backing device -+ * -+ * - dirty bucket: owned_by_allocator == 0 && -+ * dirty_sectors > 0 -+ * The bucket contains data that we must not discard (either only copy, -+ * or one of the 'main copies' for data requiring multiple replicas) -+ * -+ * - metadata bucket: owned_by_allocator == 0 && is_metadata == 1 -+ * This is a btree node, journal or gen/prio bucket -+ * -+ * Lifecycle: -+ * -+ * bucket invalidated => bucket on freelist => open bucket => -+ * [dirty bucket =>] cached bucket => bucket invalidated => ... -+ * -+ * Note that cache promotion can skip the dirty bucket step, as data -+ * is copied from a deeper tier to a shallower tier, onto a cached -+ * bucket. -+ * Note also that a cached bucket can spontaneously become dirty -- -+ * see below. -+ * -+ * Only a traversal of the key space can determine whether a bucket is -+ * truly dirty or cached. 
-+ * -+ * Transitions: -+ * -+ * - free => allocator: bucket was invalidated -+ * - cached => allocator: bucket was invalidated -+ * -+ * - allocator => dirty: open bucket was filled up -+ * - allocator => cached: open bucket was filled up -+ * - allocator => metadata: metadata was allocated -+ * -+ * - dirty => cached: dirty sectors were copied to a deeper tier -+ * - dirty => free: dirty sectors were overwritten or moved (copy gc) -+ * - cached => free: cached sectors were overwritten -+ * -+ * - metadata => free: metadata was freed -+ * -+ * Oddities: -+ * - cached => dirty: a device was removed so formerly replicated data -+ * is no longer sufficiently replicated -+ * - free => cached: cannot happen -+ * - free => dirty: cannot happen -+ * - free => metadata: cannot happen -+ */ -+ -+#include "bcachefs.h" -+#include "alloc_background.h" -+#include "bset.h" -+#include "btree_gc.h" -+#include "btree_update.h" -+#include "buckets.h" -+#include "ec.h" -+#include "error.h" -+#include "movinggc.h" -+#include "replicas.h" -+ -+#include -+#include -+ -+/* -+ * Clear journal_seq_valid for buckets for which it's not needed, to prevent -+ * wraparound: -+ */ -+void bch2_bucket_seq_cleanup(struct bch_fs *c) -+{ -+ u64 journal_seq = atomic64_read(&c->journal.seq); -+ u16 last_seq_ondisk = c->journal.last_seq_ondisk; -+ struct bch_dev *ca; -+ struct bucket_array *buckets; -+ struct bucket *g; -+ struct bucket_mark m; -+ unsigned i; -+ -+ if (journal_seq - c->last_bucket_seq_cleanup < -+ (1U << (BUCKET_JOURNAL_SEQ_BITS - 2))) -+ return; -+ -+ c->last_bucket_seq_cleanup = journal_seq; -+ -+ for_each_member_device(ca, c, i) { -+ down_read(&ca->bucket_lock); -+ buckets = bucket_array(ca); -+ -+ for_each_bucket(g, buckets) { -+ bucket_cmpxchg(g, m, ({ -+ if (!m.journal_seq_valid || -+ bucket_needs_journal_commit(m, last_seq_ondisk)) -+ break; -+ -+ m.journal_seq_valid = 0; -+ })); -+ } -+ up_read(&ca->bucket_lock); -+ } -+} -+ -+void bch2_fs_usage_initialize(struct bch_fs *c) -+{ 
-+ struct bch_fs_usage *usage; -+ unsigned i; -+ -+ percpu_down_write(&c->mark_lock); -+ usage = c->usage_base; -+ -+ bch2_fs_usage_acc_to_base(c, 0); -+ bch2_fs_usage_acc_to_base(c, 1); -+ -+ for (i = 0; i < BCH_REPLICAS_MAX; i++) -+ usage->reserved += usage->persistent_reserved[i]; -+ -+ for (i = 0; i < c->replicas.nr; i++) { -+ struct bch_replicas_entry *e = -+ cpu_replicas_entry(&c->replicas, i); -+ -+ switch (e->data_type) { -+ case BCH_DATA_btree: -+ usage->btree += usage->replicas[i]; -+ break; -+ case BCH_DATA_user: -+ usage->data += usage->replicas[i]; -+ break; -+ case BCH_DATA_cached: -+ usage->cached += usage->replicas[i]; -+ break; -+ } -+ } -+ -+ percpu_up_write(&c->mark_lock); -+} -+ -+void bch2_fs_usage_scratch_put(struct bch_fs *c, struct bch_fs_usage *fs_usage) -+{ -+ if (fs_usage == c->usage_scratch) -+ mutex_unlock(&c->usage_scratch_lock); -+ else -+ kfree(fs_usage); -+} -+ -+struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *c) -+{ -+ struct bch_fs_usage *ret; -+ unsigned bytes = fs_usage_u64s(c) * sizeof(u64); -+ -+ ret = kzalloc(bytes, GFP_NOWAIT|__GFP_NOWARN); -+ if (ret) -+ return ret; -+ -+ if (mutex_trylock(&c->usage_scratch_lock)) -+ goto out_pool; -+ -+ ret = kzalloc(bytes, GFP_NOFS); -+ if (ret) -+ return ret; -+ -+ mutex_lock(&c->usage_scratch_lock); -+out_pool: -+ ret = c->usage_scratch; -+ memset(ret, 0, bytes); -+ return ret; -+} -+ -+struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) -+{ -+ struct bch_dev_usage ret; -+ -+ memset(&ret, 0, sizeof(ret)); -+ acc_u64s_percpu((u64 *) &ret, -+ (u64 __percpu *) ca->usage[0], -+ sizeof(ret) / sizeof(u64)); -+ -+ return ret; -+} -+ -+static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, -+ unsigned journal_seq, -+ bool gc) -+{ -+ return this_cpu_ptr(gc -+ ? 
c->usage_gc -+ : c->usage[journal_seq & 1]); -+} -+ -+u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) -+{ -+ ssize_t offset = v - (u64 *) c->usage_base; -+ unsigned seq; -+ u64 ret; -+ -+ BUG_ON(offset < 0 || offset >= fs_usage_u64s(c)); -+ percpu_rwsem_assert_held(&c->mark_lock); -+ -+ do { -+ seq = read_seqcount_begin(&c->usage_lock); -+ ret = *v + -+ percpu_u64_get((u64 __percpu *) c->usage[0] + offset) + -+ percpu_u64_get((u64 __percpu *) c->usage[1] + offset); -+ } while (read_seqcount_retry(&c->usage_lock, seq)); -+ -+ return ret; -+} -+ -+struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c) -+{ -+ struct bch_fs_usage *ret; -+ unsigned seq, v, u64s = fs_usage_u64s(c); -+retry: -+ ret = kmalloc(u64s * sizeof(u64), GFP_NOFS); -+ if (unlikely(!ret)) -+ return NULL; -+ -+ percpu_down_read(&c->mark_lock); -+ -+ v = fs_usage_u64s(c); -+ if (unlikely(u64s != v)) { -+ u64s = v; -+ percpu_up_read(&c->mark_lock); -+ kfree(ret); -+ goto retry; -+ } -+ -+ do { -+ seq = read_seqcount_begin(&c->usage_lock); -+ memcpy(ret, c->usage_base, u64s * sizeof(u64)); -+ acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[0], u64s); -+ acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[1], u64s); -+ } while (read_seqcount_retry(&c->usage_lock, seq)); -+ -+ return ret; -+} -+ -+void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) -+{ -+ unsigned u64s = fs_usage_u64s(c); -+ -+ BUG_ON(idx >= 2); -+ -+ write_seqcount_begin(&c->usage_lock); -+ -+ acc_u64s_percpu((u64 *) c->usage_base, -+ (u64 __percpu *) c->usage[idx], u64s); -+ percpu_memset(c->usage[idx], 0, u64s * sizeof(u64)); -+ -+ write_seqcount_end(&c->usage_lock); -+} -+ -+void bch2_fs_usage_to_text(struct printbuf *out, -+ struct bch_fs *c, -+ struct bch_fs_usage *fs_usage) -+{ -+ unsigned i; -+ -+ pr_buf(out, "capacity:\t\t\t%llu\n", c->capacity); -+ -+ pr_buf(out, "hidden:\t\t\t\t%llu\n", -+ fs_usage->hidden); -+ pr_buf(out, "data:\t\t\t\t%llu\n", -+ fs_usage->data); -+ pr_buf(out, 
"cached:\t\t\t\t%llu\n", -+ fs_usage->cached); -+ pr_buf(out, "reserved:\t\t\t%llu\n", -+ fs_usage->reserved); -+ pr_buf(out, "nr_inodes:\t\t\t%llu\n", -+ fs_usage->nr_inodes); -+ pr_buf(out, "online reserved:\t\t%llu\n", -+ fs_usage->online_reserved); -+ -+ for (i = 0; -+ i < ARRAY_SIZE(fs_usage->persistent_reserved); -+ i++) { -+ pr_buf(out, "%u replicas:\n", i + 1); -+ pr_buf(out, "\treserved:\t\t%llu\n", -+ fs_usage->persistent_reserved[i]); -+ } -+ -+ for (i = 0; i < c->replicas.nr; i++) { -+ struct bch_replicas_entry *e = -+ cpu_replicas_entry(&c->replicas, i); -+ -+ pr_buf(out, "\t"); -+ bch2_replicas_entry_to_text(out, e); -+ pr_buf(out, ":\t%llu\n", fs_usage->replicas[i]); -+ } -+} -+ -+#define RESERVE_FACTOR 6 -+ -+static u64 reserve_factor(u64 r) -+{ -+ return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR); -+} -+ -+static u64 avail_factor(u64 r) -+{ -+ return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1); -+} -+ -+u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage *fs_usage) -+{ -+ return min(fs_usage->hidden + -+ fs_usage->btree + -+ fs_usage->data + -+ reserve_factor(fs_usage->reserved + -+ fs_usage->online_reserved), -+ c->capacity); -+} -+ -+static struct bch_fs_usage_short -+__bch2_fs_usage_read_short(struct bch_fs *c) -+{ -+ struct bch_fs_usage_short ret; -+ u64 data, reserved; -+ -+ ret.capacity = c->capacity - -+ bch2_fs_usage_read_one(c, &c->usage_base->hidden); -+ -+ data = bch2_fs_usage_read_one(c, &c->usage_base->data) + -+ bch2_fs_usage_read_one(c, &c->usage_base->btree); -+ reserved = bch2_fs_usage_read_one(c, &c->usage_base->reserved) + -+ bch2_fs_usage_read_one(c, &c->usage_base->online_reserved); -+ -+ ret.used = min(ret.capacity, data + reserve_factor(reserved)); -+ ret.free = ret.capacity - ret.used; -+ -+ ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->nr_inodes); -+ -+ return ret; -+} -+ -+struct bch_fs_usage_short -+bch2_fs_usage_read_short(struct bch_fs *c) -+{ -+ struct bch_fs_usage_short 
ret; -+ -+ percpu_down_read(&c->mark_lock); -+ ret = __bch2_fs_usage_read_short(c); -+ percpu_up_read(&c->mark_lock); -+ -+ return ret; -+} -+ -+static inline int is_unavailable_bucket(struct bucket_mark m) -+{ -+ return !is_available_bucket(m); -+} -+ -+static inline int is_fragmented_bucket(struct bucket_mark m, -+ struct bch_dev *ca) -+{ -+ if (!m.owned_by_allocator && -+ m.data_type == BCH_DATA_user && -+ bucket_sectors_used(m)) -+ return max_t(int, 0, (int) ca->mi.bucket_size - -+ bucket_sectors_used(m)); -+ return 0; -+} -+ -+static inline int bucket_stripe_sectors(struct bucket_mark m) -+{ -+ return m.stripe ? m.dirty_sectors : 0; -+} -+ -+static inline enum bch_data_type bucket_type(struct bucket_mark m) -+{ -+ return m.cached_sectors && !m.dirty_sectors -+ ? BCH_DATA_cached -+ : m.data_type; -+} -+ -+static bool bucket_became_unavailable(struct bucket_mark old, -+ struct bucket_mark new) -+{ -+ return is_available_bucket(old) && -+ !is_available_bucket(new); -+} -+ -+int bch2_fs_usage_apply(struct bch_fs *c, -+ struct bch_fs_usage *fs_usage, -+ struct disk_reservation *disk_res, -+ unsigned journal_seq) -+{ -+ s64 added = fs_usage->data + fs_usage->reserved; -+ s64 should_not_have_added; -+ int ret = 0; -+ -+ percpu_rwsem_assert_held(&c->mark_lock); -+ -+ /* -+ * Not allowed to reduce sectors_available except by getting a -+ * reservation: -+ */ -+ should_not_have_added = added - (s64) (disk_res ? 
disk_res->sectors : 0); -+ if (WARN_ONCE(should_not_have_added > 0, -+ "disk usage increased by %lli without a reservation", -+ should_not_have_added)) { -+ atomic64_sub(should_not_have_added, &c->sectors_available); -+ added -= should_not_have_added; -+ ret = -1; -+ } -+ -+ if (added > 0) { -+ disk_res->sectors -= added; -+ fs_usage->online_reserved -= added; -+ } -+ -+ preempt_disable(); -+ acc_u64s((u64 *) fs_usage_ptr(c, journal_seq, false), -+ (u64 *) fs_usage, fs_usage_u64s(c)); -+ preempt_enable(); -+ -+ return ret; -+} -+ -+static inline void account_bucket(struct bch_fs_usage *fs_usage, -+ struct bch_dev_usage *dev_usage, -+ enum bch_data_type type, -+ int nr, s64 size) -+{ -+ if (type == BCH_DATA_sb || type == BCH_DATA_journal) -+ fs_usage->hidden += size; -+ -+ dev_usage->buckets[type] += nr; -+} -+ -+static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, -+ struct bch_fs_usage *fs_usage, -+ struct bucket_mark old, struct bucket_mark new, -+ bool gc) -+{ -+ struct bch_dev_usage *u; -+ -+ percpu_rwsem_assert_held(&c->mark_lock); -+ -+ preempt_disable(); -+ u = this_cpu_ptr(ca->usage[gc]); -+ -+ if (bucket_type(old)) -+ account_bucket(fs_usage, u, bucket_type(old), -+ -1, -ca->mi.bucket_size); -+ -+ if (bucket_type(new)) -+ account_bucket(fs_usage, u, bucket_type(new), -+ 1, ca->mi.bucket_size); -+ -+ u->buckets_alloc += -+ (int) new.owned_by_allocator - (int) old.owned_by_allocator; -+ u->buckets_unavailable += -+ is_unavailable_bucket(new) - is_unavailable_bucket(old); -+ -+ u->buckets_ec += (int) new.stripe - (int) old.stripe; -+ u->sectors_ec += bucket_stripe_sectors(new) - -+ bucket_stripe_sectors(old); -+ -+ u->sectors[old.data_type] -= old.dirty_sectors; -+ u->sectors[new.data_type] += new.dirty_sectors; -+ u->sectors[BCH_DATA_cached] += -+ (int) new.cached_sectors - (int) old.cached_sectors; -+ u->sectors_fragmented += -+ is_fragmented_bucket(new, ca) - is_fragmented_bucket(old, ca); -+ preempt_enable(); -+ -+ if 
(!is_available_bucket(old) && is_available_bucket(new)) -+ bch2_wake_allocator(ca); -+} -+ -+void bch2_dev_usage_from_buckets(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ struct bucket_mark old = { .v.counter = 0 }; -+ struct bucket_array *buckets; -+ struct bucket *g; -+ unsigned i; -+ int cpu; -+ -+ c->usage_base->hidden = 0; -+ -+ for_each_member_device(ca, c, i) { -+ for_each_possible_cpu(cpu) -+ memset(per_cpu_ptr(ca->usage[0], cpu), 0, -+ sizeof(*ca->usage[0])); -+ -+ buckets = bucket_array(ca); -+ -+ for_each_bucket(g, buckets) -+ bch2_dev_usage_update(c, ca, c->usage_base, -+ old, g->mark, false); -+ } -+} -+ -+static inline int update_replicas(struct bch_fs *c, -+ struct bch_fs_usage *fs_usage, -+ struct bch_replicas_entry *r, -+ s64 sectors) -+{ -+ int idx = bch2_replicas_entry_idx(c, r); -+ -+ if (idx < 0) -+ return -1; -+ -+ if (!fs_usage) -+ return 0; -+ -+ switch (r->data_type) { -+ case BCH_DATA_btree: -+ fs_usage->btree += sectors; -+ break; -+ case BCH_DATA_user: -+ fs_usage->data += sectors; -+ break; -+ case BCH_DATA_cached: -+ fs_usage->cached += sectors; -+ break; -+ } -+ fs_usage->replicas[idx] += sectors; -+ return 0; -+} -+ -+static inline void update_cached_sectors(struct bch_fs *c, -+ struct bch_fs_usage *fs_usage, -+ unsigned dev, s64 sectors) -+{ -+ struct bch_replicas_padded r; -+ -+ bch2_replicas_entry_cached(&r.e, dev); -+ -+ update_replicas(c, fs_usage, &r.e, sectors); -+} -+ -+static struct replicas_delta_list * -+replicas_deltas_realloc(struct btree_trans *trans, unsigned more) -+{ -+ struct replicas_delta_list *d = trans->fs_usage_deltas; -+ unsigned new_size = d ? 
(d->size + more) * 2 : 128; -+ -+ if (!d || d->used + more > d->size) { -+ d = krealloc(d, sizeof(*d) + new_size, GFP_NOIO|__GFP_ZERO); -+ BUG_ON(!d); -+ -+ d->size = new_size; -+ trans->fs_usage_deltas = d; -+ } -+ return d; -+} -+ -+static inline void update_replicas_list(struct btree_trans *trans, -+ struct bch_replicas_entry *r, -+ s64 sectors) -+{ -+ struct replicas_delta_list *d; -+ struct replicas_delta *n; -+ unsigned b; -+ -+ if (!sectors) -+ return; -+ -+ b = replicas_entry_bytes(r) + 8; -+ d = replicas_deltas_realloc(trans, b); -+ -+ n = (void *) d->d + d->used; -+ n->delta = sectors; -+ memcpy(&n->r, r, replicas_entry_bytes(r)); -+ d->used += b; -+} -+ -+static inline void update_cached_sectors_list(struct btree_trans *trans, -+ unsigned dev, s64 sectors) -+{ -+ struct bch_replicas_padded r; -+ -+ bch2_replicas_entry_cached(&r.e, dev); -+ -+ update_replicas_list(trans, &r.e, sectors); -+} -+ -+static inline struct replicas_delta * -+replicas_delta_next(struct replicas_delta *d) -+{ -+ return (void *) d + replicas_entry_bytes(&d->r) + 8; -+} -+ -+int bch2_replicas_delta_list_apply(struct bch_fs *c, -+ struct bch_fs_usage *fs_usage, -+ struct replicas_delta_list *r) -+{ -+ struct replicas_delta *d = r->d; -+ struct replicas_delta *top = (void *) r->d + r->used; -+ unsigned i; -+ -+ for (d = r->d; d != top; d = replicas_delta_next(d)) -+ if (update_replicas(c, fs_usage, &d->r, d->delta)) { -+ top = d; -+ goto unwind; -+ } -+ -+ if (!fs_usage) -+ return 0; -+ -+ fs_usage->nr_inodes += r->nr_inodes; -+ -+ for (i = 0; i < BCH_REPLICAS_MAX; i++) { -+ fs_usage->reserved += r->persistent_reserved[i]; -+ fs_usage->persistent_reserved[i] += r->persistent_reserved[i]; -+ } -+ -+ return 0; -+unwind: -+ for (d = r->d; d != top; d = replicas_delta_next(d)) -+ update_replicas(c, fs_usage, &d->r, -d->delta); -+ return -1; -+} -+ -+#define do_mark_fn(fn, c, pos, flags, ...) 
\ -+({ \ -+ int gc, ret = 0; \ -+ \ -+ percpu_rwsem_assert_held(&c->mark_lock); \ -+ \ -+ for (gc = 0; gc < 2 && !ret; gc++) \ -+ if (!gc == !(flags & BTREE_TRIGGER_GC) || \ -+ (gc && gc_visited(c, pos))) \ -+ ret = fn(c, __VA_ARGS__, gc); \ -+ ret; \ -+}) -+ -+static int __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, -+ size_t b, struct bucket_mark *ret, -+ bool gc) -+{ -+ struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc); -+ struct bucket *g = __bucket(ca, b, gc); -+ struct bucket_mark old, new; -+ -+ old = bucket_cmpxchg(g, new, ({ -+ BUG_ON(!is_available_bucket(new)); -+ -+ new.owned_by_allocator = true; -+ new.data_type = 0; -+ new.cached_sectors = 0; -+ new.dirty_sectors = 0; -+ new.gen++; -+ })); -+ -+ bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); -+ -+ if (old.cached_sectors) -+ update_cached_sectors(c, fs_usage, ca->dev_idx, -+ -((s64) old.cached_sectors)); -+ -+ if (!gc) -+ *ret = old; -+ return 0; -+} -+ -+void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, -+ size_t b, struct bucket_mark *old) -+{ -+ do_mark_fn(__bch2_invalidate_bucket, c, gc_phase(GC_PHASE_START), 0, -+ ca, b, old); -+ -+ if (!old->owned_by_allocator && old->cached_sectors) -+ trace_invalidate(ca, bucket_to_sector(ca, b), -+ old->cached_sectors); -+} -+ -+static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, -+ size_t b, bool owned_by_allocator, -+ bool gc) -+{ -+ struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc); -+ struct bucket *g = __bucket(ca, b, gc); -+ struct bucket_mark old, new; -+ -+ old = bucket_cmpxchg(g, new, ({ -+ new.owned_by_allocator = owned_by_allocator; -+ })); -+ -+ bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); -+ -+ BUG_ON(!gc && -+ !owned_by_allocator && !old.owned_by_allocator); -+ -+ return 0; -+} -+ -+void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, -+ size_t b, bool owned_by_allocator, -+ struct gc_pos pos, unsigned flags) -+{ -+ preempt_disable(); -+ -+ 
do_mark_fn(__bch2_mark_alloc_bucket, c, pos, flags, -+ ca, b, owned_by_allocator); -+ -+ preempt_enable(); -+} -+ -+static int bch2_mark_alloc(struct bch_fs *c, -+ struct bkey_s_c old, struct bkey_s_c new, -+ struct bch_fs_usage *fs_usage, -+ u64 journal_seq, unsigned flags) -+{ -+ bool gc = flags & BTREE_TRIGGER_GC; -+ struct bkey_alloc_unpacked u; -+ struct bch_dev *ca; -+ struct bucket *g; -+ struct bucket_mark old_m, m; -+ -+ /* We don't do anything for deletions - do we?: */ -+ if (new.k->type != KEY_TYPE_alloc) -+ return 0; -+ -+ /* -+ * alloc btree is read in by bch2_alloc_read, not gc: -+ */ -+ if ((flags & BTREE_TRIGGER_GC) && -+ !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) -+ return 0; -+ -+ ca = bch_dev_bkey_exists(c, new.k->p.inode); -+ -+ if (new.k->p.offset >= ca->mi.nbuckets) -+ return 0; -+ -+ g = __bucket(ca, new.k->p.offset, gc); -+ u = bch2_alloc_unpack(new); -+ -+ old_m = bucket_cmpxchg(g, m, ({ -+ m.gen = u.gen; -+ m.data_type = u.data_type; -+ m.dirty_sectors = u.dirty_sectors; -+ m.cached_sectors = u.cached_sectors; -+ -+ if (journal_seq) { -+ m.journal_seq_valid = 1; -+ m.journal_seq = journal_seq; -+ } -+ })); -+ -+ if (!(flags & BTREE_TRIGGER_ALLOC_READ)) -+ bch2_dev_usage_update(c, ca, fs_usage, old_m, m, gc); -+ -+ g->io_time[READ] = u.read_time; -+ g->io_time[WRITE] = u.write_time; -+ g->oldest_gen = u.oldest_gen; -+ g->gen_valid = 1; -+ -+ /* -+ * need to know if we're getting called from the invalidate path or -+ * not: -+ */ -+ -+ if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && -+ old_m.cached_sectors) { -+ update_cached_sectors(c, fs_usage, ca->dev_idx, -+ -old_m.cached_sectors); -+ trace_invalidate(ca, bucket_to_sector(ca, new.k->p.offset), -+ old_m.cached_sectors); -+ } -+ -+ return 0; -+} -+ -+#define checked_add(a, b) \ -+({ \ -+ unsigned _res = (unsigned) (a) + (b); \ -+ bool overflow = _res > U16_MAX; \ -+ if (overflow) \ -+ _res = U16_MAX; \ -+ (a) = _res; \ -+ overflow; \ -+}) -+ -+static int 
__bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, -+ size_t b, enum bch_data_type data_type, -+ unsigned sectors, bool gc) -+{ -+ struct bucket *g = __bucket(ca, b, gc); -+ struct bucket_mark old, new; -+ bool overflow; -+ -+ BUG_ON(data_type != BCH_DATA_sb && -+ data_type != BCH_DATA_journal); -+ -+ old = bucket_cmpxchg(g, new, ({ -+ new.data_type = data_type; -+ overflow = checked_add(new.dirty_sectors, sectors); -+ })); -+ -+ bch2_fs_inconsistent_on(old.data_type && -+ old.data_type != data_type, c, -+ "different types of data in same bucket: %s, %s", -+ bch2_data_types[old.data_type], -+ bch2_data_types[data_type]); -+ -+ bch2_fs_inconsistent_on(overflow, c, -+ "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > U16_MAX", -+ ca->dev_idx, b, new.gen, -+ bch2_data_types[old.data_type ?: data_type], -+ old.dirty_sectors, sectors); -+ -+ if (c) -+ bch2_dev_usage_update(c, ca, fs_usage_ptr(c, 0, gc), -+ old, new, gc); -+ -+ return 0; -+} -+ -+void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, -+ size_t b, enum bch_data_type type, -+ unsigned sectors, struct gc_pos pos, -+ unsigned flags) -+{ -+ BUG_ON(type != BCH_DATA_sb && -+ type != BCH_DATA_journal); -+ -+ preempt_disable(); -+ -+ if (likely(c)) { -+ do_mark_fn(__bch2_mark_metadata_bucket, c, pos, flags, -+ ca, b, type, sectors); -+ } else { -+ __bch2_mark_metadata_bucket(c, ca, b, type, sectors, 0); -+ } -+ -+ preempt_enable(); -+} -+ -+static s64 disk_sectors_scaled(unsigned n, unsigned d, unsigned sectors) -+{ -+ return DIV_ROUND_UP(sectors * n, d); -+} -+ -+static s64 __ptr_disk_sectors_delta(unsigned old_size, -+ unsigned offset, s64 delta, -+ unsigned flags, -+ unsigned n, unsigned d) -+{ -+ BUG_ON(!n || !d); -+ -+ if (flags & BTREE_TRIGGER_OVERWRITE_SPLIT) { -+ BUG_ON(offset + -delta > old_size); -+ -+ return -disk_sectors_scaled(n, d, old_size) + -+ disk_sectors_scaled(n, d, offset) + -+ disk_sectors_scaled(n, d, old_size - offset + delta); -+ } else if 
(flags & BTREE_TRIGGER_OVERWRITE) { -+ BUG_ON(offset + -delta > old_size); -+ -+ return -disk_sectors_scaled(n, d, old_size) + -+ disk_sectors_scaled(n, d, old_size + delta); -+ } else { -+ return disk_sectors_scaled(n, d, delta); -+ } -+} -+ -+static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p, -+ unsigned offset, s64 delta, -+ unsigned flags) -+{ -+ return __ptr_disk_sectors_delta(p.crc.live_size, -+ offset, delta, flags, -+ p.crc.compressed_size, -+ p.crc.uncompressed_size); -+} -+ -+static void bucket_set_stripe(struct bch_fs *c, -+ const struct bch_extent_ptr *ptr, -+ struct bch_fs_usage *fs_usage, -+ u64 journal_seq, -+ unsigned flags, -+ bool enabled) -+{ -+ bool gc = flags & BTREE_TRIGGER_GC; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ struct bucket *g = PTR_BUCKET(ca, ptr, gc); -+ struct bucket_mark new, old; -+ -+ old = bucket_cmpxchg(g, new, ({ -+ new.stripe = enabled; -+ if (journal_seq) { -+ new.journal_seq_valid = 1; -+ new.journal_seq = journal_seq; -+ } -+ })); -+ -+ bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); -+ -+ /* -+ * XXX write repair code for these, flag stripe as possibly bad -+ */ -+ if (old.gen != ptr->gen) -+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, -+ "stripe with stale pointer"); -+#if 0 -+ /* -+ * We'd like to check for these, but these checks don't work -+ * yet: -+ */ -+ if (old.stripe && enabled) -+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, -+ "multiple stripes using same bucket"); -+ -+ if (!old.stripe && !enabled) -+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, -+ "deleting stripe but bucket not marked as stripe bucket"); -+#endif -+} -+ -+static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k, -+ struct extent_ptr_decoded p, -+ s64 sectors, enum bch_data_type ptr_data_type, -+ u8 bucket_gen, u8 *bucket_data_type, -+ u16 *dirty_sectors, u16 *cached_sectors) -+{ -+ u16 *dst_sectors = !p.ptr.cached -+ ? 
dirty_sectors -+ : cached_sectors; -+ u16 orig_sectors = *dst_sectors; -+ char buf[200]; -+ -+ if (gen_after(p.ptr.gen, bucket_gen)) { -+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, -+ "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" -+ "while marking %s", -+ p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), -+ bucket_gen, -+ bch2_data_types[*bucket_data_type ?: ptr_data_type], -+ p.ptr.gen, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); -+ return -EIO; -+ } -+ -+ if (gen_cmp(bucket_gen, p.ptr.gen) >= 96U) { -+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, -+ "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" -+ "while marking %s", -+ p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), -+ bucket_gen, -+ bch2_data_types[*bucket_data_type ?: ptr_data_type], -+ p.ptr.gen, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); -+ return -EIO; -+ } -+ -+ if (bucket_gen != p.ptr.gen && !p.ptr.cached) { -+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, -+ "bucket %u:%zu gen %u data type %s: stale dirty ptr (gen %u)\n" -+ "while marking %s", -+ p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), -+ bucket_gen, -+ bch2_data_types[*bucket_data_type ?: ptr_data_type], -+ p.ptr.gen, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); -+ return -EIO; -+ } -+ -+ if (bucket_gen != p.ptr.gen) -+ return 1; -+ -+ if (*bucket_data_type && *bucket_data_type != ptr_data_type) { -+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, -+ "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" -+ "while marking %s", -+ p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), -+ bucket_gen, -+ bch2_data_types[*bucket_data_type], -+ bch2_data_types[ptr_data_type], -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); -+ return -EIO; -+ } -+ -+ if (checked_add(*dst_sectors, sectors)) { -+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, -+ "bucket %u:%zu gen %u data type %s 
sector count overflow: %u + %lli > U16_MAX\n" -+ "while marking %s", -+ p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), -+ bucket_gen, -+ bch2_data_types[*bucket_data_type ?: ptr_data_type], -+ orig_sectors, sectors, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); -+ return -EIO; -+ } -+ -+ *bucket_data_type = *dirty_sectors || *cached_sectors -+ ? ptr_data_type : 0; -+ return 0; -+} -+ -+static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k, -+ struct extent_ptr_decoded p, -+ s64 sectors, enum bch_data_type data_type, -+ struct bch_fs_usage *fs_usage, -+ u64 journal_seq, unsigned flags) -+{ -+ bool gc = flags & BTREE_TRIGGER_GC; -+ struct bucket_mark old, new; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); -+ struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc); -+ u8 bucket_data_type; -+ u64 v; -+ int ret; -+ -+ v = atomic64_read(&g->_mark.v); -+ do { -+ new.v.counter = old.v.counter = v; -+ bucket_data_type = new.data_type; -+ -+ ret = __mark_pointer(c, k, p, sectors, data_type, new.gen, -+ &bucket_data_type, -+ &new.dirty_sectors, -+ &new.cached_sectors); -+ if (ret) -+ return ret; -+ -+ new.data_type = bucket_data_type; -+ -+ if (journal_seq) { -+ new.journal_seq_valid = 1; -+ new.journal_seq = journal_seq; -+ } -+ -+ if (flags & BTREE_TRIGGER_NOATOMIC) { -+ g->_mark = new; -+ break; -+ } -+ } while ((v = atomic64_cmpxchg(&g->_mark.v, -+ old.v.counter, -+ new.v.counter)) != old.v.counter); -+ -+ bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); -+ -+ BUG_ON(!gc && bucket_became_unavailable(old, new)); -+ -+ return 0; -+} -+ -+static int bch2_mark_stripe_ptr(struct bch_fs *c, -+ struct bch_extent_stripe_ptr p, -+ enum bch_data_type data_type, -+ struct bch_fs_usage *fs_usage, -+ s64 sectors, unsigned flags, -+ struct bch_replicas_padded *r, -+ unsigned *nr_data, -+ unsigned *nr_parity) -+{ -+ bool gc = flags & BTREE_TRIGGER_GC; -+ struct stripe *m; -+ unsigned i, blocks_nonempty = 0; -+ -+ m = 
genradix_ptr(&c->stripes[gc], p.idx); -+ -+ spin_lock(&c->ec_stripes_heap_lock); -+ -+ if (!m || !m->alive) { -+ spin_unlock(&c->ec_stripes_heap_lock); -+ bch_err_ratelimited(c, "pointer to nonexistent stripe %llu", -+ (u64) p.idx); -+ return -EIO; -+ } -+ -+ BUG_ON(m->r.e.data_type != data_type); -+ -+ *nr_data = m->nr_blocks - m->nr_redundant; -+ *nr_parity = m->nr_redundant; -+ *r = m->r; -+ -+ m->block_sectors[p.block] += sectors; -+ -+ for (i = 0; i < m->nr_blocks; i++) -+ blocks_nonempty += m->block_sectors[i] != 0; -+ -+ if (m->blocks_nonempty != blocks_nonempty) { -+ m->blocks_nonempty = blocks_nonempty; -+ if (!gc) -+ bch2_stripes_heap_update(c, m, p.idx); -+ } -+ -+ spin_unlock(&c->ec_stripes_heap_lock); -+ -+ return 0; -+} -+ -+static int bch2_mark_extent(struct bch_fs *c, -+ struct bkey_s_c old, struct bkey_s_c new, -+ unsigned offset, s64 sectors, -+ enum bch_data_type data_type, -+ struct bch_fs_usage *fs_usage, -+ unsigned journal_seq, unsigned flags) -+{ -+ struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ struct bch_replicas_padded r; -+ s64 dirty_sectors = 0; -+ bool stale; -+ int ret; -+ -+ r.e.data_type = data_type; -+ r.e.nr_devs = 0; -+ r.e.nr_required = 1; -+ -+ BUG_ON(!sectors); -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ s64 disk_sectors = data_type == BCH_DATA_btree -+ ? 
sectors -+ : ptr_disk_sectors_delta(p, offset, sectors, flags); -+ -+ ret = bch2_mark_pointer(c, k, p, disk_sectors, data_type, -+ fs_usage, journal_seq, flags); -+ if (ret < 0) -+ return ret; -+ -+ stale = ret > 0; -+ -+ if (p.ptr.cached) { -+ if (!stale) -+ update_cached_sectors(c, fs_usage, p.ptr.dev, -+ disk_sectors); -+ } else if (!p.has_ec) { -+ dirty_sectors += disk_sectors; -+ r.e.devs[r.e.nr_devs++] = p.ptr.dev; -+ } else { -+ struct bch_replicas_padded ec_r; -+ unsigned nr_data, nr_parity; -+ s64 parity_sectors; -+ -+ ret = bch2_mark_stripe_ptr(c, p.ec, data_type, -+ fs_usage, disk_sectors, flags, -+ &ec_r, &nr_data, &nr_parity); -+ if (ret) -+ return ret; -+ -+ parity_sectors = -+ __ptr_disk_sectors_delta(p.crc.live_size, -+ offset, sectors, flags, -+ p.crc.compressed_size * nr_parity, -+ p.crc.uncompressed_size * nr_data); -+ -+ update_replicas(c, fs_usage, &ec_r.e, -+ disk_sectors + parity_sectors); -+ -+ /* -+ * There may be other dirty pointers in this extent, but -+ * if so they're not required for mounting if we have an -+ * erasure coded pointer in this extent: -+ */ -+ r.e.nr_required = 0; -+ } -+ } -+ -+ if (r.e.nr_devs) -+ update_replicas(c, fs_usage, &r.e, dirty_sectors); -+ -+ return 0; -+} -+ -+static int bch2_mark_stripe(struct bch_fs *c, -+ struct bkey_s_c old, struct bkey_s_c new, -+ struct bch_fs_usage *fs_usage, -+ u64 journal_seq, unsigned flags) -+{ -+ bool gc = flags & BTREE_TRIGGER_GC; -+ size_t idx = new.k->p.offset; -+ const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe -+ ? bkey_s_c_to_stripe(old).v : NULL; -+ const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe -+ ? 
bkey_s_c_to_stripe(new).v : NULL; -+ struct stripe *m = genradix_ptr(&c->stripes[gc], idx); -+ unsigned i; -+ -+ if (!m || (old_s && !m->alive)) { -+ bch_err_ratelimited(c, "error marking nonexistent stripe %zu", -+ idx); -+ return -1; -+ } -+ -+ if (!new_s) { -+ /* Deleting: */ -+ for (i = 0; i < old_s->nr_blocks; i++) -+ bucket_set_stripe(c, old_s->ptrs + i, fs_usage, -+ journal_seq, flags, false); -+ -+ if (!gc && m->on_heap) { -+ spin_lock(&c->ec_stripes_heap_lock); -+ bch2_stripes_heap_del(c, m, idx); -+ spin_unlock(&c->ec_stripes_heap_lock); -+ } -+ -+ memset(m, 0, sizeof(*m)); -+ } else { -+ BUG_ON(old_s && new_s->nr_blocks != old_s->nr_blocks); -+ BUG_ON(old_s && new_s->nr_redundant != old_s->nr_redundant); -+ -+ for (i = 0; i < new_s->nr_blocks; i++) { -+ if (!old_s || -+ memcmp(new_s->ptrs + i, -+ old_s->ptrs + i, -+ sizeof(struct bch_extent_ptr))) { -+ -+ if (old_s) -+ bucket_set_stripe(c, old_s->ptrs + i, fs_usage, -+ journal_seq, flags, false); -+ bucket_set_stripe(c, new_s->ptrs + i, fs_usage, -+ journal_seq, flags, true); -+ } -+ } -+ -+ m->alive = true; -+ m->sectors = le16_to_cpu(new_s->sectors); -+ m->algorithm = new_s->algorithm; -+ m->nr_blocks = new_s->nr_blocks; -+ m->nr_redundant = new_s->nr_redundant; -+ -+ bch2_bkey_to_replicas(&m->r.e, new); -+ -+ /* gc recalculates these fields: */ -+ if (!(flags & BTREE_TRIGGER_GC)) { -+ m->blocks_nonempty = 0; -+ -+ for (i = 0; i < new_s->nr_blocks; i++) { -+ m->block_sectors[i] = -+ stripe_blockcount_get(new_s, i); -+ m->blocks_nonempty += !!m->block_sectors[i]; -+ } -+ } -+ -+ if (!gc) { -+ spin_lock(&c->ec_stripes_heap_lock); -+ bch2_stripes_heap_update(c, m, idx); -+ spin_unlock(&c->ec_stripes_heap_lock); -+ } -+ } -+ -+ return 0; -+} -+ -+static int bch2_mark_key_locked(struct bch_fs *c, -+ struct bkey_s_c old, -+ struct bkey_s_c new, -+ unsigned offset, s64 sectors, -+ struct bch_fs_usage *fs_usage, -+ u64 journal_seq, unsigned flags) -+{ -+ struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? 
new : old; -+ int ret = 0; -+ -+ BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE))); -+ -+ preempt_disable(); -+ -+ if (!fs_usage || (flags & BTREE_TRIGGER_GC)) -+ fs_usage = fs_usage_ptr(c, journal_seq, -+ flags & BTREE_TRIGGER_GC); -+ -+ switch (k.k->type) { -+ case KEY_TYPE_alloc: -+ ret = bch2_mark_alloc(c, old, new, fs_usage, journal_seq, flags); -+ break; -+ case KEY_TYPE_btree_ptr: -+ case KEY_TYPE_btree_ptr_v2: -+ sectors = !(flags & BTREE_TRIGGER_OVERWRITE) -+ ? c->opts.btree_node_size -+ : -c->opts.btree_node_size; -+ -+ ret = bch2_mark_extent(c, old, new, offset, sectors, -+ BCH_DATA_btree, fs_usage, journal_seq, flags); -+ break; -+ case KEY_TYPE_extent: -+ case KEY_TYPE_reflink_v: -+ ret = bch2_mark_extent(c, old, new, offset, sectors, -+ BCH_DATA_user, fs_usage, journal_seq, flags); -+ break; -+ case KEY_TYPE_stripe: -+ ret = bch2_mark_stripe(c, old, new, fs_usage, journal_seq, flags); -+ break; -+ case KEY_TYPE_inode: -+ if (!(flags & BTREE_TRIGGER_OVERWRITE)) -+ fs_usage->nr_inodes++; -+ else -+ fs_usage->nr_inodes--; -+ break; -+ case KEY_TYPE_reservation: { -+ unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; -+ -+ sectors *= replicas; -+ replicas = clamp_t(unsigned, replicas, 1, -+ ARRAY_SIZE(fs_usage->persistent_reserved)); -+ -+ fs_usage->reserved += sectors; -+ fs_usage->persistent_reserved[replicas - 1] += sectors; -+ break; -+ } -+ } -+ -+ preempt_enable(); -+ -+ return ret; -+} -+ -+int bch2_mark_key(struct bch_fs *c, struct bkey_s_c new, -+ unsigned offset, s64 sectors, -+ struct bch_fs_usage *fs_usage, -+ u64 journal_seq, unsigned flags) -+{ -+ struct bkey deleted; -+ struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; -+ int ret; -+ -+ bkey_init(&deleted); -+ -+ percpu_down_read(&c->mark_lock); -+ ret = bch2_mark_key_locked(c, old, new, offset, sectors, -+ fs_usage, journal_seq, -+ BTREE_TRIGGER_INSERT|flags); -+ percpu_up_read(&c->mark_lock); -+ -+ return ret; -+} -+ -+int bch2_mark_update(struct 
btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_i *new, -+ struct bch_fs_usage *fs_usage, -+ unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree *b = iter_l(iter)->b; -+ struct btree_node_iter node_iter = iter_l(iter)->iter; -+ struct bkey_packed *_old; -+ struct bkey_s_c old; -+ struct bkey unpacked; -+ int ret = 0; -+ -+ if (unlikely(flags & BTREE_TRIGGER_NORUN)) -+ return 0; -+ -+ if (!btree_node_type_needs_gc(iter->btree_id)) -+ return 0; -+ -+ bkey_init(&unpacked); -+ old = (struct bkey_s_c) { &unpacked, NULL }; -+ -+ if (!btree_node_type_is_extents(iter->btree_id)) { -+ if (btree_iter_type(iter) != BTREE_ITER_CACHED) { -+ _old = bch2_btree_node_iter_peek(&node_iter, b); -+ if (_old) -+ old = bkey_disassemble(b, _old, &unpacked); -+ } else { -+ struct bkey_cached *ck = (void *) iter->l[0].b; -+ -+ if (ck->valid) -+ old = bkey_i_to_s_c(ck->k); -+ } -+ -+ if (old.k->type == new->k.type) { -+ bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0, -+ fs_usage, trans->journal_res.seq, -+ BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); -+ -+ } else { -+ bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0, -+ fs_usage, trans->journal_res.seq, -+ BTREE_TRIGGER_INSERT|flags); -+ bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0, -+ fs_usage, trans->journal_res.seq, -+ BTREE_TRIGGER_OVERWRITE|flags); -+ } -+ } else { -+ BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED); -+ bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), -+ 0, new->k.size, -+ fs_usage, trans->journal_res.seq, -+ BTREE_TRIGGER_INSERT|flags); -+ -+ while ((_old = bch2_btree_node_iter_peek(&node_iter, b))) { -+ unsigned offset = 0; -+ s64 sectors; -+ -+ old = bkey_disassemble(b, _old, &unpacked); -+ sectors = -((s64) old.k->size); -+ -+ flags |= BTREE_TRIGGER_OVERWRITE; -+ -+ if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0) -+ return 0; -+ -+ switch (bch2_extent_overlap(&new->k, old.k)) { -+ case BCH_EXTENT_OVERLAP_ALL: -+ offset = 0; -+ sectors = -((s64) 
old.k->size); -+ break; -+ case BCH_EXTENT_OVERLAP_BACK: -+ offset = bkey_start_offset(&new->k) - -+ bkey_start_offset(old.k); -+ sectors = bkey_start_offset(&new->k) - -+ old.k->p.offset; -+ break; -+ case BCH_EXTENT_OVERLAP_FRONT: -+ offset = 0; -+ sectors = bkey_start_offset(old.k) - -+ new->k.p.offset; -+ break; -+ case BCH_EXTENT_OVERLAP_MIDDLE: -+ offset = bkey_start_offset(&new->k) - -+ bkey_start_offset(old.k); -+ sectors = -((s64) new->k.size); -+ flags |= BTREE_TRIGGER_OVERWRITE_SPLIT; -+ break; -+ } -+ -+ BUG_ON(sectors >= 0); -+ -+ ret = bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), -+ offset, sectors, fs_usage, -+ trans->journal_res.seq, flags) ?: 1; -+ if (ret <= 0) -+ break; -+ -+ bch2_btree_node_iter_advance(&node_iter, b); -+ } -+ } -+ -+ return ret; -+} -+ -+void bch2_trans_fs_usage_apply(struct btree_trans *trans, -+ struct bch_fs_usage *fs_usage) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_insert_entry *i; -+ static int warned_disk_usage = 0; -+ u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; -+ char buf[200]; -+ -+ if (!bch2_fs_usage_apply(c, fs_usage, trans->disk_res, -+ trans->journal_res.seq) || -+ warned_disk_usage || -+ xchg(&warned_disk_usage, 1)) -+ return; -+ -+ bch_err(c, "disk usage increased more than %llu sectors reserved", -+ disk_res_sectors); -+ -+ trans_for_each_update(trans, i) { -+ pr_err("while inserting"); -+ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); -+ pr_err("%s", buf); -+ pr_err("overlapping with"); -+ -+ if (btree_iter_type(i->iter) != BTREE_ITER_CACHED) { -+ struct btree *b = iter_l(i->iter)->b; -+ struct btree_node_iter node_iter = iter_l(i->iter)->iter; -+ struct bkey_packed *_k; -+ -+ while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { -+ struct bkey unpacked; -+ struct bkey_s_c k; -+ -+ pr_info("_k %px format %u", _k, _k->format); -+ k = bkey_disassemble(b, _k, &unpacked); -+ -+ if (btree_node_is_extents(b) -+ ? 
bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) <= 0 -+ : bkey_cmp(i->k->k.p, k.k->p)) -+ break; -+ -+ bch2_bkey_val_to_text(&PBUF(buf), c, k); -+ pr_err("%s", buf); -+ -+ bch2_btree_node_iter_advance(&node_iter, b); -+ } -+ } else { -+ struct bkey_cached *ck = (void *) i->iter->l[0].b; -+ -+ if (ck->valid) { -+ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(ck->k)); -+ pr_err("%s", buf); -+ } -+ } -+ } -+} -+ -+/* trans_mark: */ -+ -+static struct btree_iter *trans_get_update(struct btree_trans *trans, -+ enum btree_id btree_id, struct bpos pos, -+ struct bkey_s_c *k) -+{ -+ struct btree_insert_entry *i; -+ -+ trans_for_each_update(trans, i) -+ if (i->iter->btree_id == btree_id && -+ (btree_node_type_is_extents(btree_id) -+ ? bkey_cmp(pos, bkey_start_pos(&i->k->k)) >= 0 && -+ bkey_cmp(pos, i->k->k.p) < 0 -+ : !bkey_cmp(pos, i->iter->pos))) { -+ *k = bkey_i_to_s_c(i->k); -+ return i->iter; -+ } -+ -+ return NULL; -+} -+ -+static int trans_get_key(struct btree_trans *trans, -+ enum btree_id btree_id, struct bpos pos, -+ struct btree_iter **iter, -+ struct bkey_s_c *k) -+{ -+ unsigned flags = btree_id != BTREE_ID_ALLOC -+ ? 
BTREE_ITER_SLOTS -+ : BTREE_ITER_CACHED; -+ int ret; -+ -+ *iter = trans_get_update(trans, btree_id, pos, k); -+ if (*iter) -+ return 1; -+ -+ *iter = bch2_trans_get_iter(trans, btree_id, pos, -+ flags|BTREE_ITER_INTENT); -+ if (IS_ERR(*iter)) -+ return PTR_ERR(*iter); -+ -+ *k = __bch2_btree_iter_peek(*iter, flags); -+ ret = bkey_err(*k); -+ if (ret) -+ bch2_trans_iter_put(trans, *iter); -+ return ret; -+} -+ -+static int bch2_trans_mark_pointer(struct btree_trans *trans, -+ struct bkey_s_c k, struct extent_ptr_decoded p, -+ s64 sectors, enum bch_data_type data_type) -+{ -+ struct bch_fs *c = trans->c; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); -+ struct bpos pos = POS(p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr)); -+ struct btree_iter *iter; -+ struct bkey_s_c k_a; -+ struct bkey_alloc_unpacked u; -+ struct bkey_i_alloc *a; -+ struct bucket *g; -+ int ret; -+ -+ iter = trans_get_update(trans, BTREE_ID_ALLOC, pos, &k_a); -+ if (iter) { -+ u = bch2_alloc_unpack(k_a); -+ } else { -+ iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, pos, -+ BTREE_ITER_CACHED| -+ BTREE_ITER_CACHED_NOFILL| -+ BTREE_ITER_INTENT); -+ if (IS_ERR(iter)) -+ return PTR_ERR(iter); -+ -+ ret = bch2_btree_iter_traverse(iter); -+ if (ret) -+ goto out; -+ -+ percpu_down_read(&c->mark_lock); -+ g = bucket(ca, pos.offset); -+ u = alloc_mem_to_key(g, READ_ONCE(g->mark)); -+ percpu_up_read(&c->mark_lock); -+ } -+ -+ ret = __mark_pointer(c, k, p, sectors, data_type, u.gen, &u.data_type, -+ &u.dirty_sectors, &u.cached_sectors); -+ if (ret) -+ goto out; -+ -+ a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); -+ ret = PTR_ERR_OR_ZERO(a); -+ if (ret) -+ goto out; -+ -+ bkey_alloc_init(&a->k_i); -+ a->k.p = pos; -+ bch2_alloc_pack(a, u); -+ bch2_trans_update(trans, iter, &a->k_i, 0); -+out: -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, -+ struct bch_extent_stripe_ptr p, -+ s64 sectors, enum bch_data_type 
data_type, -+ struct bch_replicas_padded *r, -+ unsigned *nr_data, -+ unsigned *nr_parity) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct bkey_i_stripe *s; -+ int ret = 0; -+ -+ ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.idx), &iter, &k); -+ if (ret < 0) -+ return ret; -+ -+ if (k.k->type != KEY_TYPE_stripe) { -+ bch2_fs_inconsistent(c, -+ "pointer to nonexistent stripe %llu", -+ (u64) p.idx); -+ ret = -EIO; -+ goto out; -+ } -+ -+ s = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); -+ ret = PTR_ERR_OR_ZERO(s); -+ if (ret) -+ goto out; -+ -+ bkey_reassemble(&s->k_i, k); -+ -+ stripe_blockcount_set(&s->v, p.block, -+ stripe_blockcount_get(&s->v, p.block) + -+ sectors); -+ -+ *nr_data = s->v.nr_blocks - s->v.nr_redundant; -+ *nr_parity = s->v.nr_redundant; -+ bch2_bkey_to_replicas(&r->e, bkey_i_to_s_c(&s->k_i)); -+ bch2_trans_update(trans, iter, &s->k_i, 0); -+out: -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+static int bch2_trans_mark_extent(struct btree_trans *trans, -+ struct bkey_s_c k, unsigned offset, -+ s64 sectors, unsigned flags, -+ enum bch_data_type data_type) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ struct bch_replicas_padded r; -+ s64 dirty_sectors = 0; -+ bool stale; -+ int ret; -+ -+ r.e.data_type = data_type; -+ r.e.nr_devs = 0; -+ r.e.nr_required = 1; -+ -+ BUG_ON(!sectors); -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ s64 disk_sectors = data_type == BCH_DATA_btree -+ ? 
sectors -+ : ptr_disk_sectors_delta(p, offset, sectors, flags); -+ -+ ret = bch2_trans_mark_pointer(trans, k, p, disk_sectors, -+ data_type); -+ if (ret < 0) -+ return ret; -+ -+ stale = ret > 0; -+ -+ if (p.ptr.cached) { -+ if (!stale) -+ update_cached_sectors_list(trans, p.ptr.dev, -+ disk_sectors); -+ } else if (!p.has_ec) { -+ dirty_sectors += disk_sectors; -+ r.e.devs[r.e.nr_devs++] = p.ptr.dev; -+ } else { -+ struct bch_replicas_padded ec_r; -+ unsigned nr_data, nr_parity; -+ s64 parity_sectors; -+ -+ ret = bch2_trans_mark_stripe_ptr(trans, p.ec, -+ disk_sectors, data_type, -+ &ec_r, &nr_data, &nr_parity); -+ if (ret) -+ return ret; -+ -+ parity_sectors = -+ __ptr_disk_sectors_delta(p.crc.live_size, -+ offset, sectors, flags, -+ p.crc.compressed_size * nr_parity, -+ p.crc.uncompressed_size * nr_data); -+ -+ update_replicas_list(trans, &ec_r.e, -+ disk_sectors + parity_sectors); -+ -+ r.e.nr_required = 0; -+ } -+ } -+ -+ if (r.e.nr_devs) -+ update_replicas_list(trans, &r.e, dirty_sectors); -+ -+ return 0; -+} -+ -+static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, -+ struct bkey_s_c_reflink_p p, -+ u64 idx, unsigned sectors, -+ unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct bkey_i_reflink_v *r_v; -+ s64 ret; -+ -+ ret = trans_get_key(trans, BTREE_ID_REFLINK, -+ POS(0, idx), &iter, &k); -+ if (ret < 0) -+ return ret; -+ -+ if (k.k->type != KEY_TYPE_reflink_v) { -+ bch2_fs_inconsistent(c, -+ "%llu:%llu len %u points to nonexistent indirect extent %llu", -+ p.k->p.inode, p.k->p.offset, p.k->size, idx); -+ ret = -EIO; -+ goto err; -+ } -+ -+ if ((flags & BTREE_TRIGGER_OVERWRITE) && -+ (bkey_start_offset(k.k) < idx || -+ k.k->p.offset > idx + sectors)) -+ goto out; -+ -+ sectors = k.k->p.offset - idx; -+ -+ r_v = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); -+ ret = PTR_ERR_OR_ZERO(r_v); -+ if (ret) -+ goto err; -+ -+ bkey_reassemble(&r_v->k_i, k); -+ -+ le64_add_cpu(&r_v->v.refcount, -+ 
!(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1); -+ -+ if (!r_v->v.refcount) { -+ r_v->k.type = KEY_TYPE_deleted; -+ set_bkey_val_u64s(&r_v->k, 0); -+ } -+ -+ bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); -+ BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); -+ -+ bch2_trans_update(trans, iter, &r_v->k_i, 0); -+out: -+ ret = sectors; -+err: -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+static int bch2_trans_mark_reflink_p(struct btree_trans *trans, -+ struct bkey_s_c_reflink_p p, unsigned offset, -+ s64 sectors, unsigned flags) -+{ -+ u64 idx = le64_to_cpu(p.v->idx) + offset; -+ s64 ret = 0; -+ -+ sectors = abs(sectors); -+ BUG_ON(offset + sectors > p.k->size); -+ -+ while (sectors) { -+ ret = __bch2_trans_mark_reflink_p(trans, p, idx, sectors, flags); -+ if (ret < 0) -+ break; -+ -+ idx += ret; -+ sectors = max_t(s64, 0LL, sectors - ret); -+ ret = 0; -+ } -+ -+ return ret; -+} -+ -+int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, -+ unsigned offset, s64 sectors, unsigned flags) -+{ -+ struct replicas_delta_list *d; -+ struct bch_fs *c = trans->c; -+ -+ switch (k.k->type) { -+ case KEY_TYPE_btree_ptr: -+ case KEY_TYPE_btree_ptr_v2: -+ sectors = !(flags & BTREE_TRIGGER_OVERWRITE) -+ ? 
c->opts.btree_node_size -+ : -c->opts.btree_node_size; -+ -+ return bch2_trans_mark_extent(trans, k, offset, sectors, -+ flags, BCH_DATA_btree); -+ case KEY_TYPE_extent: -+ case KEY_TYPE_reflink_v: -+ return bch2_trans_mark_extent(trans, k, offset, sectors, -+ flags, BCH_DATA_user); -+ case KEY_TYPE_inode: -+ d = replicas_deltas_realloc(trans, 0); -+ -+ if (!(flags & BTREE_TRIGGER_OVERWRITE)) -+ d->nr_inodes++; -+ else -+ d->nr_inodes--; -+ return 0; -+ case KEY_TYPE_reservation: { -+ unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; -+ -+ d = replicas_deltas_realloc(trans, 0); -+ -+ sectors *= replicas; -+ replicas = clamp_t(unsigned, replicas, 1, -+ ARRAY_SIZE(d->persistent_reserved)); -+ -+ d->persistent_reserved[replicas - 1] += sectors; -+ return 0; -+ } -+ case KEY_TYPE_reflink_p: -+ return bch2_trans_mark_reflink_p(trans, -+ bkey_s_c_to_reflink_p(k), -+ offset, sectors, flags); -+ default: -+ return 0; -+ } -+} -+ -+int bch2_trans_mark_update(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_i *insert, -+ unsigned flags) -+{ -+ struct btree *b = iter_l(iter)->b; -+ struct btree_node_iter node_iter = iter_l(iter)->iter; -+ struct bkey_packed *_k; -+ int ret; -+ -+ if (unlikely(flags & BTREE_TRIGGER_NORUN)) -+ return 0; -+ -+ if (!btree_node_type_needs_gc(iter->btree_id)) -+ return 0; -+ -+ ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(insert), -+ 0, insert->k.size, BTREE_TRIGGER_INSERT); -+ if (ret) -+ return ret; -+ -+ if (btree_iter_type(iter) == BTREE_ITER_CACHED) { -+ struct bkey_cached *ck = (void *) iter->l[0].b; -+ -+ return bch2_trans_mark_key(trans, bkey_i_to_s_c(ck->k), -+ 0, 0, BTREE_TRIGGER_OVERWRITE); -+ } -+ -+ while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { -+ struct bkey unpacked; -+ struct bkey_s_c k; -+ unsigned offset = 0; -+ s64 sectors = 0; -+ unsigned flags = BTREE_TRIGGER_OVERWRITE; -+ -+ k = bkey_disassemble(b, _k, &unpacked); -+ -+ if (btree_node_is_extents(b) -+ ? 
bkey_cmp(insert->k.p, bkey_start_pos(k.k)) <= 0 -+ : bkey_cmp(insert->k.p, k.k->p)) -+ break; -+ -+ if (btree_node_is_extents(b)) { -+ switch (bch2_extent_overlap(&insert->k, k.k)) { -+ case BCH_EXTENT_OVERLAP_ALL: -+ offset = 0; -+ sectors = -((s64) k.k->size); -+ break; -+ case BCH_EXTENT_OVERLAP_BACK: -+ offset = bkey_start_offset(&insert->k) - -+ bkey_start_offset(k.k); -+ sectors = bkey_start_offset(&insert->k) - -+ k.k->p.offset; -+ break; -+ case BCH_EXTENT_OVERLAP_FRONT: -+ offset = 0; -+ sectors = bkey_start_offset(k.k) - -+ insert->k.p.offset; -+ break; -+ case BCH_EXTENT_OVERLAP_MIDDLE: -+ offset = bkey_start_offset(&insert->k) - -+ bkey_start_offset(k.k); -+ sectors = -((s64) insert->k.size); -+ flags |= BTREE_TRIGGER_OVERWRITE_SPLIT; -+ break; -+ } -+ -+ BUG_ON(sectors >= 0); -+ } -+ -+ ret = bch2_trans_mark_key(trans, k, offset, sectors, flags); -+ if (ret) -+ return ret; -+ -+ bch2_btree_node_iter_advance(&node_iter, b); -+ } -+ -+ return 0; -+} -+ -+/* Disk reservations: */ -+ -+static u64 bch2_recalc_sectors_available(struct bch_fs *c) -+{ -+ percpu_u64_set(&c->pcpu->sectors_available, 0); -+ -+ return avail_factor(__bch2_fs_usage_read_short(c).free); -+} -+ -+void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) -+{ -+ percpu_down_read(&c->mark_lock); -+ this_cpu_sub(c->usage[0]->online_reserved, -+ res->sectors); -+ percpu_up_read(&c->mark_lock); -+ -+ res->sectors = 0; -+} -+ -+#define SECTORS_CACHE 1024 -+ -+int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, -+ unsigned sectors, int flags) -+{ -+ struct bch_fs_pcpu *pcpu; -+ u64 old, v, get; -+ s64 sectors_available; -+ int ret; -+ -+ percpu_down_read(&c->mark_lock); -+ preempt_disable(); -+ pcpu = this_cpu_ptr(c->pcpu); -+ -+ if (sectors <= pcpu->sectors_available) -+ goto out; -+ -+ v = atomic64_read(&c->sectors_available); -+ do { -+ old = v; -+ get = min((u64) sectors + SECTORS_CACHE, old); -+ -+ if (get < sectors) { -+ preempt_enable(); 
-+ percpu_up_read(&c->mark_lock); -+ goto recalculate; -+ } -+ } while ((v = atomic64_cmpxchg(&c->sectors_available, -+ old, old - get)) != old); -+ -+ pcpu->sectors_available += get; -+ -+out: -+ pcpu->sectors_available -= sectors; -+ this_cpu_add(c->usage[0]->online_reserved, sectors); -+ res->sectors += sectors; -+ -+ preempt_enable(); -+ percpu_up_read(&c->mark_lock); -+ return 0; -+ -+recalculate: -+ percpu_down_write(&c->mark_lock); -+ -+ sectors_available = bch2_recalc_sectors_available(c); -+ -+ if (sectors <= sectors_available || -+ (flags & BCH_DISK_RESERVATION_NOFAIL)) { -+ atomic64_set(&c->sectors_available, -+ max_t(s64, 0, sectors_available - sectors)); -+ this_cpu_add(c->usage[0]->online_reserved, sectors); -+ res->sectors += sectors; -+ ret = 0; -+ } else { -+ atomic64_set(&c->sectors_available, sectors_available); -+ ret = -ENOSPC; -+ } -+ -+ percpu_up_write(&c->mark_lock); -+ -+ return ret; -+} -+ -+/* Startup/shutdown: */ -+ -+static void buckets_free_rcu(struct rcu_head *rcu) -+{ -+ struct bucket_array *buckets = -+ container_of(rcu, struct bucket_array, rcu); -+ -+ kvpfree(buckets, -+ sizeof(struct bucket_array) + -+ buckets->nbuckets * sizeof(struct bucket)); -+} -+ -+int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) -+{ -+ struct bucket_array *buckets = NULL, *old_buckets = NULL; -+ unsigned long *buckets_nouse = NULL; -+ alloc_fifo free[RESERVE_NR]; -+ alloc_fifo free_inc; -+ alloc_heap alloc_heap; -+ -+ size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, -+ ca->mi.bucket_size / c->opts.btree_node_size); -+ /* XXX: these should be tunable */ -+ size_t reserve_none = max_t(size_t, 1, nbuckets >> 9); -+ size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 7); -+ size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12), -+ btree_reserve * 2); -+ bool resize = ca->buckets[0] != NULL; -+ int ret = -ENOMEM; -+ unsigned i; -+ -+ memset(&free, 0, sizeof(free)); -+ memset(&free_inc, 0, sizeof(free_inc)); -+ 
memset(&alloc_heap, 0, sizeof(alloc_heap)); -+ -+ if (!(buckets = kvpmalloc(sizeof(struct bucket_array) + -+ nbuckets * sizeof(struct bucket), -+ GFP_KERNEL|__GFP_ZERO)) || -+ !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * -+ sizeof(unsigned long), -+ GFP_KERNEL|__GFP_ZERO)) || -+ !init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) || -+ !init_fifo(&free[RESERVE_MOVINGGC], -+ copygc_reserve, GFP_KERNEL) || -+ !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) || -+ !init_fifo(&free_inc, free_inc_nr, GFP_KERNEL) || -+ !init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL)) -+ goto err; -+ -+ buckets->first_bucket = ca->mi.first_bucket; -+ buckets->nbuckets = nbuckets; -+ -+ bch2_copygc_stop(c); -+ -+ if (resize) { -+ down_write(&c->gc_lock); -+ down_write(&ca->bucket_lock); -+ percpu_down_write(&c->mark_lock); -+ } -+ -+ old_buckets = bucket_array(ca); -+ -+ if (resize) { -+ size_t n = min(buckets->nbuckets, old_buckets->nbuckets); -+ -+ memcpy(buckets->b, -+ old_buckets->b, -+ n * sizeof(struct bucket)); -+ memcpy(buckets_nouse, -+ ca->buckets_nouse, -+ BITS_TO_LONGS(n) * sizeof(unsigned long)); -+ } -+ -+ rcu_assign_pointer(ca->buckets[0], buckets); -+ buckets = old_buckets; -+ -+ swap(ca->buckets_nouse, buckets_nouse); -+ -+ if (resize) { -+ percpu_up_write(&c->mark_lock); -+ up_write(&c->gc_lock); -+ } -+ -+ spin_lock(&c->freelist_lock); -+ for (i = 0; i < RESERVE_NR; i++) { -+ fifo_move(&free[i], &ca->free[i]); -+ swap(ca->free[i], free[i]); -+ } -+ fifo_move(&free_inc, &ca->free_inc); -+ swap(ca->free_inc, free_inc); -+ spin_unlock(&c->freelist_lock); -+ -+ /* with gc lock held, alloc_heap can't be in use: */ -+ swap(ca->alloc_heap, alloc_heap); -+ -+ nbuckets = ca->mi.nbuckets; -+ -+ if (resize) -+ up_write(&ca->bucket_lock); -+ -+ ret = 0; -+err: -+ free_heap(&alloc_heap); -+ free_fifo(&free_inc); -+ for (i = 0; i < RESERVE_NR; i++) -+ free_fifo(&free[i]); -+ kvpfree(buckets_nouse, -+ BITS_TO_LONGS(nbuckets) * sizeof(unsigned 
long)); -+ if (buckets) -+ call_rcu(&old_buckets->rcu, buckets_free_rcu); -+ -+ return ret; -+} -+ -+void bch2_dev_buckets_free(struct bch_dev *ca) -+{ -+ unsigned i; -+ -+ free_heap(&ca->alloc_heap); -+ free_fifo(&ca->free_inc); -+ for (i = 0; i < RESERVE_NR; i++) -+ free_fifo(&ca->free[i]); -+ kvpfree(ca->buckets_nouse, -+ BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); -+ kvpfree(rcu_dereference_protected(ca->buckets[0], 1), -+ sizeof(struct bucket_array) + -+ ca->mi.nbuckets * sizeof(struct bucket)); -+ -+ free_percpu(ca->usage[0]); -+} -+ -+int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) -+{ -+ if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage))) -+ return -ENOMEM; -+ -+ return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);; -+} -diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h -new file mode 100644 -index 000000000000..653f6761862e ---- /dev/null -+++ b/fs/bcachefs/buckets.h -@@ -0,0 +1,324 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Code for manipulating bucket marks for garbage collection. -+ * -+ * Copyright 2014 Datera, Inc. 
-+ */ -+ -+#ifndef _BUCKETS_H -+#define _BUCKETS_H -+ -+#include "buckets_types.h" -+#include "super.h" -+ -+#define for_each_bucket(_b, _buckets) \ -+ for (_b = (_buckets)->b + (_buckets)->first_bucket; \ -+ _b < (_buckets)->b + (_buckets)->nbuckets; _b++) -+ -+#define bucket_cmpxchg(g, new, expr) \ -+({ \ -+ struct bucket *_g = g; \ -+ u64 _v = atomic64_read(&(g)->_mark.v); \ -+ struct bucket_mark _old; \ -+ \ -+ do { \ -+ (new).v.counter = _old.v.counter = _v; \ -+ expr; \ -+ } while ((_v = atomic64_cmpxchg(&(_g)->_mark.v, \ -+ _old.v.counter, \ -+ (new).v.counter)) != _old.v.counter);\ -+ _old; \ -+}) -+ -+static inline struct bucket_array *__bucket_array(struct bch_dev *ca, -+ bool gc) -+{ -+ return rcu_dereference_check(ca->buckets[gc], -+ !ca->fs || -+ percpu_rwsem_is_held(&ca->fs->mark_lock) || -+ lockdep_is_held(&ca->fs->gc_lock) || -+ lockdep_is_held(&ca->bucket_lock)); -+} -+ -+static inline struct bucket_array *bucket_array(struct bch_dev *ca) -+{ -+ return __bucket_array(ca, false); -+} -+ -+static inline struct bucket *__bucket(struct bch_dev *ca, size_t b, bool gc) -+{ -+ struct bucket_array *buckets = __bucket_array(ca, gc); -+ -+ BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets); -+ return buckets->b + b; -+} -+ -+static inline struct bucket *bucket(struct bch_dev *ca, size_t b) -+{ -+ return __bucket(ca, b, false); -+} -+ -+static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca, -+ size_t b, int rw) -+{ -+ bucket(ca, b)->io_time[rw] = c->bucket_clock[rw].hand; -+} -+ -+static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw) -+{ -+ return c->bucket_clock[rw].hand - g->io_time[rw]; -+} -+ -+/* -+ * bucket_gc_gen() returns the difference between the bucket's current gen and -+ * the oldest gen of any pointer into that bucket in the btree. 
-+ */ -+ -+static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b) -+{ -+ struct bucket *g = bucket(ca, b); -+ -+ return g->mark.gen - g->oldest_gen; -+} -+ -+static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, -+ const struct bch_extent_ptr *ptr) -+{ -+ return sector_to_bucket(ca, ptr->offset); -+} -+ -+static inline struct bucket *PTR_BUCKET(struct bch_dev *ca, -+ const struct bch_extent_ptr *ptr, -+ bool gc) -+{ -+ return __bucket(ca, PTR_BUCKET_NR(ca, ptr), gc); -+} -+ -+static inline enum bch_data_type ptr_data_type(const struct bkey *k, -+ const struct bch_extent_ptr *ptr) -+{ -+ if (k->type == KEY_TYPE_btree_ptr || -+ k->type == KEY_TYPE_btree_ptr_v2) -+ return BCH_DATA_btree; -+ -+ return ptr->cached ? BCH_DATA_cached : BCH_DATA_user; -+} -+ -+static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca, -+ const struct bch_extent_ptr *ptr) -+{ -+ struct bucket_mark m; -+ -+ rcu_read_lock(); -+ m = READ_ONCE(PTR_BUCKET(ca, ptr, 0)->mark); -+ rcu_read_unlock(); -+ -+ return m; -+} -+ -+static inline int gen_cmp(u8 a, u8 b) -+{ -+ return (s8) (a - b); -+} -+ -+static inline int gen_after(u8 a, u8 b) -+{ -+ int r = gen_cmp(a, b); -+ -+ return r > 0 ? r : 0; -+} -+ -+/** -+ * ptr_stale() - check if a pointer points into a bucket that has been -+ * invalidated. -+ */ -+static inline u8 ptr_stale(struct bch_dev *ca, -+ const struct bch_extent_ptr *ptr) -+{ -+ return gen_after(ptr_bucket_mark(ca, ptr).gen, ptr->gen); -+} -+ -+static inline s64 __ptr_disk_sectors(struct extent_ptr_decoded p, -+ unsigned live_size) -+{ -+ return live_size && p.crc.compression_type -+ ? 
max(1U, DIV_ROUND_UP(live_size * p.crc.compressed_size, -+ p.crc.uncompressed_size)) -+ : live_size; -+} -+ -+static inline s64 ptr_disk_sectors(struct extent_ptr_decoded p) -+{ -+ return __ptr_disk_sectors(p, p.crc.live_size); -+} -+ -+/* bucket gc marks */ -+ -+static inline unsigned bucket_sectors_used(struct bucket_mark mark) -+{ -+ return mark.dirty_sectors + mark.cached_sectors; -+} -+ -+static inline bool bucket_unused(struct bucket_mark mark) -+{ -+ return !mark.owned_by_allocator && -+ !mark.data_type && -+ !bucket_sectors_used(mark); -+} -+ -+static inline bool is_available_bucket(struct bucket_mark mark) -+{ -+ return (!mark.owned_by_allocator && -+ !mark.dirty_sectors && -+ !mark.stripe); -+} -+ -+static inline bool bucket_needs_journal_commit(struct bucket_mark m, -+ u16 last_seq_ondisk) -+{ -+ return m.journal_seq_valid && -+ ((s16) m.journal_seq - (s16) last_seq_ondisk > 0); -+} -+ -+/* Device usage: */ -+ -+struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *); -+ -+void bch2_dev_usage_from_buckets(struct bch_fs *); -+ -+static inline u64 __dev_buckets_available(struct bch_dev *ca, -+ struct bch_dev_usage stats) -+{ -+ u64 total = ca->mi.nbuckets - ca->mi.first_bucket; -+ -+ if (WARN_ONCE(stats.buckets_unavailable > total, -+ "buckets_unavailable overflow (%llu > %llu)\n", -+ stats.buckets_unavailable, total)) -+ return 0; -+ -+ return total - stats.buckets_unavailable; -+} -+ -+/* -+ * Number of reclaimable buckets - only for use by the allocator thread: -+ */ -+static inline u64 dev_buckets_available(struct bch_dev *ca) -+{ -+ return __dev_buckets_available(ca, bch2_dev_usage_read(ca)); -+} -+ -+static inline u64 __dev_buckets_free(struct bch_dev *ca, -+ struct bch_dev_usage stats) -+{ -+ return __dev_buckets_available(ca, stats) + -+ fifo_used(&ca->free[RESERVE_NONE]) + -+ fifo_used(&ca->free_inc); -+} -+ -+static inline u64 dev_buckets_free(struct bch_dev *ca) -+{ -+ return __dev_buckets_free(ca, bch2_dev_usage_read(ca)); -+} -+ -+/* 
Filesystem usage: */ -+ -+static inline unsigned fs_usage_u64s(struct bch_fs *c) -+{ -+ -+ return sizeof(struct bch_fs_usage) / sizeof(u64) + -+ READ_ONCE(c->replicas.nr); -+} -+ -+void bch2_fs_usage_scratch_put(struct bch_fs *, struct bch_fs_usage *); -+struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *); -+ -+u64 bch2_fs_usage_read_one(struct bch_fs *, u64 *); -+ -+struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *); -+ -+void bch2_fs_usage_acc_to_base(struct bch_fs *, unsigned); -+ -+void bch2_fs_usage_to_text(struct printbuf *, -+ struct bch_fs *, struct bch_fs_usage *); -+ -+u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage *); -+ -+struct bch_fs_usage_short -+bch2_fs_usage_read_short(struct bch_fs *); -+ -+/* key/bucket marking: */ -+ -+void bch2_bucket_seq_cleanup(struct bch_fs *); -+void bch2_fs_usage_initialize(struct bch_fs *); -+ -+void bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *, -+ size_t, struct bucket_mark *); -+void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, -+ size_t, bool, struct gc_pos, unsigned); -+void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, -+ size_t, enum bch_data_type, unsigned, -+ struct gc_pos, unsigned); -+ -+int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, -+ s64, struct bch_fs_usage *, u64, unsigned); -+int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, -+ struct disk_reservation *, unsigned); -+ -+int bch2_mark_update(struct btree_trans *, struct btree_iter *, -+ struct bkey_i *, struct bch_fs_usage *, unsigned); -+ -+int bch2_replicas_delta_list_apply(struct bch_fs *, -+ struct bch_fs_usage *, -+ struct replicas_delta_list *); -+int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, -+ unsigned, s64, unsigned); -+int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter, -+ struct bkey_i *insert, unsigned); -+void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage *); -+ -+/* disk 
reservations: */ -+ -+void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *); -+ -+static inline void bch2_disk_reservation_put(struct bch_fs *c, -+ struct disk_reservation *res) -+{ -+ if (res->sectors) -+ __bch2_disk_reservation_put(c, res); -+} -+ -+#define BCH_DISK_RESERVATION_NOFAIL (1 << 0) -+ -+int bch2_disk_reservation_add(struct bch_fs *, -+ struct disk_reservation *, -+ unsigned, int); -+ -+static inline struct disk_reservation -+bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas) -+{ -+ return (struct disk_reservation) { -+ .sectors = 0, -+#if 0 -+ /* not used yet: */ -+ .gen = c->capacity_gen, -+#endif -+ .nr_replicas = nr_replicas, -+ }; -+} -+ -+static inline int bch2_disk_reservation_get(struct bch_fs *c, -+ struct disk_reservation *res, -+ unsigned sectors, -+ unsigned nr_replicas, -+ int flags) -+{ -+ *res = bch2_disk_reservation_init(c, nr_replicas); -+ -+ return bch2_disk_reservation_add(c, res, sectors * nr_replicas, flags); -+} -+ -+int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64); -+void bch2_dev_buckets_free(struct bch_dev *); -+int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *); -+ -+#endif /* _BUCKETS_H */ -diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h -new file mode 100644 -index 000000000000..d5215b14d7d9 ---- /dev/null -+++ b/fs/bcachefs/buckets_types.h -@@ -0,0 +1,135 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BUCKETS_TYPES_H -+#define _BUCKETS_TYPES_H -+ -+#include "bcachefs_format.h" -+#include "util.h" -+ -+#define BUCKET_JOURNAL_SEQ_BITS 16 -+ -+struct bucket_mark { -+ union { -+ atomic64_t v; -+ -+ struct { -+ u8 gen; -+ u8 data_type:3, -+ owned_by_allocator:1, -+ journal_seq_valid:1, -+ stripe:1; -+ u16 dirty_sectors; -+ u16 cached_sectors; -+ -+ /* -+ * low bits of journal sequence number when this bucket was most -+ * recently modified: if journal_seq_valid is set, this bucket can't be -+ * reused until the journal sequence 
number written to disk is >= the -+ * bucket's journal sequence number: -+ */ -+ u16 journal_seq; -+ }; -+ }; -+}; -+ -+struct bucket { -+ union { -+ struct bucket_mark _mark; -+ const struct bucket_mark mark; -+ }; -+ -+ u16 io_time[2]; -+ u8 oldest_gen; -+ u8 gc_gen; -+ unsigned gen_valid:1; -+}; -+ -+struct bucket_array { -+ struct rcu_head rcu; -+ u16 first_bucket; -+ size_t nbuckets; -+ struct bucket b[]; -+}; -+ -+struct bch_dev_usage { -+ u64 buckets[BCH_DATA_NR]; -+ u64 buckets_alloc; -+ u64 buckets_unavailable; -+ -+ /* _compressed_ sectors: */ -+ u64 sectors[BCH_DATA_NR]; -+ u64 sectors_fragmented; -+ -+ u64 buckets_ec; -+ u64 sectors_ec; -+}; -+ -+struct bch_fs_usage { -+ /* all fields are in units of 512 byte sectors: */ -+ -+ u64 online_reserved; -+ -+ /* fields after online_reserved are cleared/recalculated by gc: */ -+ u64 gc_start[0]; -+ -+ u64 hidden; -+ u64 btree; -+ u64 data; -+ u64 cached; -+ u64 reserved; -+ u64 nr_inodes; -+ -+ /* XXX: add stats for compression ratio */ -+#if 0 -+ u64 uncompressed; -+ u64 compressed; -+#endif -+ -+ /* broken out: */ -+ -+ u64 persistent_reserved[BCH_REPLICAS_MAX]; -+ u64 replicas[]; -+}; -+ -+struct bch_fs_usage_short { -+ u64 capacity; -+ u64 used; -+ u64 free; -+ u64 nr_inodes; -+}; -+ -+struct replicas_delta { -+ s64 delta; -+ struct bch_replicas_entry r; -+} __packed; -+ -+struct replicas_delta_list { -+ unsigned size; -+ unsigned used; -+ -+ struct {} memset_start; -+ u64 nr_inodes; -+ u64 persistent_reserved[BCH_REPLICAS_MAX]; -+ struct {} memset_end; -+ struct replicas_delta d[0]; -+}; -+ -+/* -+ * A reservation for space on disk: -+ */ -+struct disk_reservation { -+ u64 sectors; -+ u32 gen; -+ unsigned nr_replicas; -+}; -+ -+struct copygc_heap_entry { -+ u8 dev; -+ u8 gen; -+ u16 fragmentation; -+ u32 sectors; -+ u64 offset; -+}; -+ -+typedef HEAP(struct copygc_heap_entry) copygc_heap; -+ -+#endif /* _BUCKETS_TYPES_H */ -diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c -new file mode 100644 
-index 000000000000..0377f9018d27 ---- /dev/null -+++ b/fs/bcachefs/chardev.c -@@ -0,0 +1,704 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#ifndef NO_BCACHEFS_CHARDEV -+ -+#include "bcachefs.h" -+#include "bcachefs_ioctl.h" -+#include "buckets.h" -+#include "chardev.h" -+#include "move.h" -+#include "replicas.h" -+#include "super.h" -+#include "super-io.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* returns with ref on ca->ref */ -+static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev, -+ unsigned flags) -+{ -+ struct bch_dev *ca; -+ -+ if (flags & BCH_BY_INDEX) { -+ if (dev >= c->sb.nr_devices) -+ return ERR_PTR(-EINVAL); -+ -+ rcu_read_lock(); -+ ca = rcu_dereference(c->devs[dev]); -+ if (ca) -+ percpu_ref_get(&ca->ref); -+ rcu_read_unlock(); -+ -+ if (!ca) -+ return ERR_PTR(-EINVAL); -+ } else { -+ char *path; -+ -+ path = strndup_user((const char __user *) -+ (unsigned long) dev, PATH_MAX); -+ if (IS_ERR(path)) -+ return ERR_CAST(path); -+ -+ ca = bch2_dev_lookup(c, path); -+ kfree(path); -+ } -+ -+ return ca; -+} -+ -+#if 0 -+static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg) -+{ -+ struct bch_ioctl_assemble arg; -+ struct bch_fs *c; -+ u64 *user_devs = NULL; -+ char **devs = NULL; -+ unsigned i; -+ int ret = -EFAULT; -+ -+ if (copy_from_user(&arg, user_arg, sizeof(arg))) -+ return -EFAULT; -+ -+ if (arg.flags || arg.pad) -+ return -EINVAL; -+ -+ user_devs = kmalloc_array(arg.nr_devs, sizeof(u64), GFP_KERNEL); -+ if (!user_devs) -+ return -ENOMEM; -+ -+ devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL); -+ -+ if (copy_from_user(user_devs, user_arg->devs, -+ sizeof(u64) * arg.nr_devs)) -+ goto err; -+ -+ for (i = 0; i < arg.nr_devs; i++) { -+ devs[i] = strndup_user((const char __user *)(unsigned long) -+ user_devs[i], -+ PATH_MAX); -+ if (!devs[i]) { -+ ret = -ENOMEM; -+ goto err; -+ } -+ } -+ -+ c = bch2_fs_open(devs, arg.nr_devs, 
bch2_opts_empty()); -+ ret = PTR_ERR_OR_ZERO(c); -+ if (!ret) -+ closure_put(&c->cl); -+err: -+ if (devs) -+ for (i = 0; i < arg.nr_devs; i++) -+ kfree(devs[i]); -+ kfree(devs); -+ return ret; -+} -+ -+static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg) -+{ -+ struct bch_ioctl_incremental arg; -+ const char *err; -+ char *path; -+ -+ if (copy_from_user(&arg, user_arg, sizeof(arg))) -+ return -EFAULT; -+ -+ if (arg.flags || arg.pad) -+ return -EINVAL; -+ -+ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); -+ if (!path) -+ return -ENOMEM; -+ -+ err = bch2_fs_open_incremental(path); -+ kfree(path); -+ -+ if (err) { -+ pr_err("Could not register bcachefs devices: %s", err); -+ return -EINVAL; -+ } -+ -+ return 0; -+} -+#endif -+ -+static long bch2_global_ioctl(unsigned cmd, void __user *arg) -+{ -+ switch (cmd) { -+#if 0 -+ case BCH_IOCTL_ASSEMBLE: -+ return bch2_ioctl_assemble(arg); -+ case BCH_IOCTL_INCREMENTAL: -+ return bch2_ioctl_incremental(arg); -+#endif -+ default: -+ return -ENOTTY; -+ } -+} -+ -+static long bch2_ioctl_query_uuid(struct bch_fs *c, -+ struct bch_ioctl_query_uuid __user *user_arg) -+{ -+ return copy_to_user(&user_arg->uuid, -+ &c->sb.user_uuid, -+ sizeof(c->sb.user_uuid)); -+} -+ -+#if 0 -+static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg) -+{ -+ if (arg.flags || arg.pad) -+ return -EINVAL; -+ -+ return bch2_fs_start(c); -+} -+ -+static long bch2_ioctl_stop(struct bch_fs *c) -+{ -+ bch2_fs_stop(c); -+ return 0; -+} -+#endif -+ -+static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg) -+{ -+ char *path; -+ int ret; -+ -+ if (arg.flags || arg.pad) -+ return -EINVAL; -+ -+ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); -+ if (!path) -+ return -ENOMEM; -+ -+ ret = bch2_dev_add(c, path); -+ kfree(path); -+ -+ return ret; -+} -+ -+static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg) -+{ -+ 
struct bch_dev *ca; -+ -+ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| -+ BCH_FORCE_IF_METADATA_LOST| -+ BCH_FORCE_IF_DEGRADED| -+ BCH_BY_INDEX)) || -+ arg.pad) -+ return -EINVAL; -+ -+ ca = bch2_device_lookup(c, arg.dev, arg.flags); -+ if (IS_ERR(ca)) -+ return PTR_ERR(ca); -+ -+ return bch2_dev_remove(c, ca, arg.flags); -+} -+ -+static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg) -+{ -+ char *path; -+ int ret; -+ -+ if (arg.flags || arg.pad) -+ return -EINVAL; -+ -+ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); -+ if (!path) -+ return -ENOMEM; -+ -+ ret = bch2_dev_online(c, path); -+ kfree(path); -+ return ret; -+} -+ -+static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg) -+{ -+ struct bch_dev *ca; -+ int ret; -+ -+ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| -+ BCH_FORCE_IF_METADATA_LOST| -+ BCH_FORCE_IF_DEGRADED| -+ BCH_BY_INDEX)) || -+ arg.pad) -+ return -EINVAL; -+ -+ ca = bch2_device_lookup(c, arg.dev, arg.flags); -+ if (IS_ERR(ca)) -+ return PTR_ERR(ca); -+ -+ ret = bch2_dev_offline(c, ca, arg.flags); -+ percpu_ref_put(&ca->ref); -+ return ret; -+} -+ -+static long bch2_ioctl_disk_set_state(struct bch_fs *c, -+ struct bch_ioctl_disk_set_state arg) -+{ -+ struct bch_dev *ca; -+ int ret; -+ -+ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| -+ BCH_FORCE_IF_METADATA_LOST| -+ BCH_FORCE_IF_DEGRADED| -+ BCH_BY_INDEX)) || -+ arg.pad[0] || arg.pad[1] || arg.pad[2]) -+ return -EINVAL; -+ -+ ca = bch2_device_lookup(c, arg.dev, arg.flags); -+ if (IS_ERR(ca)) -+ return PTR_ERR(ca); -+ -+ ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags); -+ -+ percpu_ref_put(&ca->ref); -+ return ret; -+} -+ -+struct bch_data_ctx { -+ struct bch_fs *c; -+ struct bch_ioctl_data arg; -+ struct bch_move_stats stats; -+ -+ int ret; -+ -+ struct task_struct *thread; -+}; -+ -+static int bch2_data_thread(void *arg) -+{ -+ struct bch_data_ctx *ctx = arg; -+ -+ ctx->ret = bch2_data_job(ctx->c, &ctx->stats, 
ctx->arg); -+ -+ ctx->stats.data_type = U8_MAX; -+ return 0; -+} -+ -+static int bch2_data_job_release(struct inode *inode, struct file *file) -+{ -+ struct bch_data_ctx *ctx = file->private_data; -+ -+ kthread_stop(ctx->thread); -+ put_task_struct(ctx->thread); -+ kfree(ctx); -+ return 0; -+} -+ -+static ssize_t bch2_data_job_read(struct file *file, char __user *buf, -+ size_t len, loff_t *ppos) -+{ -+ struct bch_data_ctx *ctx = file->private_data; -+ struct bch_fs *c = ctx->c; -+ struct bch_ioctl_data_event e = { -+ .type = BCH_DATA_EVENT_PROGRESS, -+ .p.data_type = ctx->stats.data_type, -+ .p.btree_id = ctx->stats.btree_id, -+ .p.pos = ctx->stats.pos, -+ .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), -+ .p.sectors_total = bch2_fs_usage_read_short(c).used, -+ }; -+ -+ if (len < sizeof(e)) -+ return -EINVAL; -+ -+ return copy_to_user(buf, &e, sizeof(e)) ?: sizeof(e); -+} -+ -+static const struct file_operations bcachefs_data_ops = { -+ .release = bch2_data_job_release, -+ .read = bch2_data_job_read, -+ .llseek = no_llseek, -+}; -+ -+static long bch2_ioctl_data(struct bch_fs *c, -+ struct bch_ioctl_data arg) -+{ -+ struct bch_data_ctx *ctx = NULL; -+ struct file *file = NULL; -+ unsigned flags = O_RDONLY|O_CLOEXEC|O_NONBLOCK; -+ int ret, fd = -1; -+ -+ if (arg.op >= BCH_DATA_OP_NR || arg.flags) -+ return -EINVAL; -+ -+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); -+ if (!ctx) -+ return -ENOMEM; -+ -+ ctx->c = c; -+ ctx->arg = arg; -+ -+ ctx->thread = kthread_create(bch2_data_thread, ctx, "[bcachefs]"); -+ if (IS_ERR(ctx->thread)) { -+ ret = PTR_ERR(ctx->thread); -+ goto err; -+ } -+ -+ ret = get_unused_fd_flags(flags); -+ if (ret < 0) -+ goto err; -+ fd = ret; -+ -+ file = anon_inode_getfile("[bcachefs]", &bcachefs_data_ops, ctx, flags); -+ if (IS_ERR(file)) { -+ ret = PTR_ERR(file); -+ goto err; -+ } -+ -+ fd_install(fd, file); -+ -+ get_task_struct(ctx->thread); -+ wake_up_process(ctx->thread); -+ -+ return fd; -+err: -+ if (fd >= 0) -+ put_unused_fd(fd); 
-+ if (!IS_ERR_OR_NULL(ctx->thread)) -+ kthread_stop(ctx->thread); -+ kfree(ctx); -+ return ret; -+} -+ -+static long bch2_ioctl_fs_usage(struct bch_fs *c, -+ struct bch_ioctl_fs_usage __user *user_arg) -+{ -+ struct bch_ioctl_fs_usage *arg = NULL; -+ struct bch_replicas_usage *dst_e, *dst_end; -+ struct bch_fs_usage *src; -+ u32 replica_entries_bytes; -+ unsigned i; -+ int ret = 0; -+ -+ if (!test_bit(BCH_FS_STARTED, &c->flags)) -+ return -EINVAL; -+ -+ if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes)) -+ return -EFAULT; -+ -+ arg = kzalloc(sizeof(*arg) + replica_entries_bytes, GFP_KERNEL); -+ if (!arg) -+ return -ENOMEM; -+ -+ src = bch2_fs_usage_read(c); -+ if (!src) { -+ ret = -ENOMEM; -+ goto err; -+ } -+ -+ arg->capacity = c->capacity; -+ arg->used = bch2_fs_sectors_used(c, src); -+ arg->online_reserved = src->online_reserved; -+ -+ for (i = 0; i < BCH_REPLICAS_MAX; i++) -+ arg->persistent_reserved[i] = src->persistent_reserved[i]; -+ -+ dst_e = arg->replicas; -+ dst_end = (void *) arg->replicas + replica_entries_bytes; -+ -+ for (i = 0; i < c->replicas.nr; i++) { -+ struct bch_replicas_entry *src_e = -+ cpu_replicas_entry(&c->replicas, i); -+ -+ if (replicas_usage_next(dst_e) > dst_end) { -+ ret = -ERANGE; -+ break; -+ } -+ -+ dst_e->sectors = src->replicas[i]; -+ dst_e->r = *src_e; -+ -+ /* recheck after setting nr_devs: */ -+ if (replicas_usage_next(dst_e) > dst_end) { -+ ret = -ERANGE; -+ break; -+ } -+ -+ memcpy(dst_e->r.devs, src_e->devs, src_e->nr_devs); -+ -+ dst_e = replicas_usage_next(dst_e); -+ } -+ -+ arg->replica_entries_bytes = (void *) dst_e - (void *) arg->replicas; -+ -+ percpu_up_read(&c->mark_lock); -+ kfree(src); -+ -+ if (!ret) -+ ret = copy_to_user(user_arg, arg, -+ sizeof(*arg) + arg->replica_entries_bytes); -+err: -+ kfree(arg); -+ return ret; -+} -+ -+static long bch2_ioctl_dev_usage(struct bch_fs *c, -+ struct bch_ioctl_dev_usage __user *user_arg) -+{ -+ struct bch_ioctl_dev_usage arg; -+ struct bch_dev_usage 
src; -+ struct bch_dev *ca; -+ unsigned i; -+ -+ if (!test_bit(BCH_FS_STARTED, &c->flags)) -+ return -EINVAL; -+ -+ if (copy_from_user(&arg, user_arg, sizeof(arg))) -+ return -EFAULT; -+ -+ if ((arg.flags & ~BCH_BY_INDEX) || -+ arg.pad[0] || -+ arg.pad[1] || -+ arg.pad[2]) -+ return -EINVAL; -+ -+ ca = bch2_device_lookup(c, arg.dev, arg.flags); -+ if (IS_ERR(ca)) -+ return PTR_ERR(ca); -+ -+ src = bch2_dev_usage_read(ca); -+ -+ arg.state = ca->mi.state; -+ arg.bucket_size = ca->mi.bucket_size; -+ arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; -+ arg.available_buckets = arg.nr_buckets - src.buckets_unavailable; -+ arg.ec_buckets = src.buckets_ec; -+ arg.ec_sectors = src.sectors_ec; -+ -+ for (i = 0; i < BCH_DATA_NR; i++) { -+ arg.buckets[i] = src.buckets[i]; -+ arg.sectors[i] = src.sectors[i]; -+ } -+ -+ percpu_ref_put(&ca->ref); -+ -+ return copy_to_user(user_arg, &arg, sizeof(arg)); -+} -+ -+static long bch2_ioctl_read_super(struct bch_fs *c, -+ struct bch_ioctl_read_super arg) -+{ -+ struct bch_dev *ca = NULL; -+ struct bch_sb *sb; -+ int ret = 0; -+ -+ if ((arg.flags & ~(BCH_BY_INDEX|BCH_READ_DEV)) || -+ arg.pad) -+ return -EINVAL; -+ -+ mutex_lock(&c->sb_lock); -+ -+ if (arg.flags & BCH_READ_DEV) { -+ ca = bch2_device_lookup(c, arg.dev, arg.flags); -+ -+ if (IS_ERR(ca)) { -+ ret = PTR_ERR(ca); -+ goto err; -+ } -+ -+ sb = ca->disk_sb.sb; -+ } else { -+ sb = c->disk_sb.sb; -+ } -+ -+ if (vstruct_bytes(sb) > arg.size) { -+ ret = -ERANGE; -+ goto err; -+ } -+ -+ ret = copy_to_user((void __user *)(unsigned long)arg.sb, -+ sb, vstruct_bytes(sb)); -+err: -+ if (ca) -+ percpu_ref_put(&ca->ref); -+ mutex_unlock(&c->sb_lock); -+ return ret; -+} -+ -+static long bch2_ioctl_disk_get_idx(struct bch_fs *c, -+ struct bch_ioctl_disk_get_idx arg) -+{ -+ dev_t dev = huge_decode_dev(arg.dev); -+ struct bch_dev *ca; -+ unsigned i; -+ -+ for_each_online_member(ca, c, i) -+ if (ca->disk_sb.bdev->bd_dev == dev) { -+ percpu_ref_put(&ca->io_ref); -+ return i; -+ } -+ -+ 
return -ENOENT; -+} -+ -+static long bch2_ioctl_disk_resize(struct bch_fs *c, -+ struct bch_ioctl_disk_resize arg) -+{ -+ struct bch_dev *ca; -+ int ret; -+ -+ if ((arg.flags & ~BCH_BY_INDEX) || -+ arg.pad) -+ return -EINVAL; -+ -+ ca = bch2_device_lookup(c, arg.dev, arg.flags); -+ if (IS_ERR(ca)) -+ return PTR_ERR(ca); -+ -+ ret = bch2_dev_resize(c, ca, arg.nbuckets); -+ -+ percpu_ref_put(&ca->ref); -+ return ret; -+} -+ -+#define BCH_IOCTL(_name, _argtype) \ -+do { \ -+ _argtype i; \ -+ \ -+ if (copy_from_user(&i, arg, sizeof(i))) \ -+ return -EFAULT; \ -+ return bch2_ioctl_##_name(c, i); \ -+} while (0) -+ -+long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) -+{ -+ /* ioctls that don't require admin cap: */ -+ switch (cmd) { -+ case BCH_IOCTL_QUERY_UUID: -+ return bch2_ioctl_query_uuid(c, arg); -+ case BCH_IOCTL_FS_USAGE: -+ return bch2_ioctl_fs_usage(c, arg); -+ case BCH_IOCTL_DEV_USAGE: -+ return bch2_ioctl_dev_usage(c, arg); -+ } -+ -+ if (!capable(CAP_SYS_ADMIN)) -+ return -EPERM; -+ -+ switch (cmd) { -+#if 0 -+ case BCH_IOCTL_START: -+ BCH_IOCTL(start, struct bch_ioctl_start); -+ case BCH_IOCTL_STOP: -+ return bch2_ioctl_stop(c); -+#endif -+ case BCH_IOCTL_READ_SUPER: -+ BCH_IOCTL(read_super, struct bch_ioctl_read_super); -+ case BCH_IOCTL_DISK_GET_IDX: -+ BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx); -+ } -+ -+ if (!test_bit(BCH_FS_STARTED, &c->flags)) -+ return -EINVAL; -+ -+ /* ioctls that do require admin cap: */ -+ switch (cmd) { -+ case BCH_IOCTL_DISK_ADD: -+ BCH_IOCTL(disk_add, struct bch_ioctl_disk); -+ case BCH_IOCTL_DISK_REMOVE: -+ BCH_IOCTL(disk_remove, struct bch_ioctl_disk); -+ case BCH_IOCTL_DISK_ONLINE: -+ BCH_IOCTL(disk_online, struct bch_ioctl_disk); -+ case BCH_IOCTL_DISK_OFFLINE: -+ BCH_IOCTL(disk_offline, struct bch_ioctl_disk); -+ case BCH_IOCTL_DISK_SET_STATE: -+ BCH_IOCTL(disk_set_state, struct bch_ioctl_disk_set_state); -+ case BCH_IOCTL_DATA: -+ BCH_IOCTL(data, struct bch_ioctl_data); -+ case 
BCH_IOCTL_DISK_RESIZE: -+ BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize); -+ -+ default: -+ return -ENOTTY; -+ } -+} -+ -+static DEFINE_IDR(bch_chardev_minor); -+ -+static long bch2_chardev_ioctl(struct file *filp, unsigned cmd, unsigned long v) -+{ -+ unsigned minor = iminor(file_inode(filp)); -+ struct bch_fs *c = minor < U8_MAX ? idr_find(&bch_chardev_minor, minor) : NULL; -+ void __user *arg = (void __user *) v; -+ -+ return c -+ ? bch2_fs_ioctl(c, cmd, arg) -+ : bch2_global_ioctl(cmd, arg); -+} -+ -+static const struct file_operations bch_chardev_fops = { -+ .owner = THIS_MODULE, -+ .unlocked_ioctl = bch2_chardev_ioctl, -+ .open = nonseekable_open, -+}; -+ -+static int bch_chardev_major; -+static struct class *bch_chardev_class; -+static struct device *bch_chardev; -+ -+void bch2_fs_chardev_exit(struct bch_fs *c) -+{ -+ if (!IS_ERR_OR_NULL(c->chardev)) -+ device_unregister(c->chardev); -+ if (c->minor >= 0) -+ idr_remove(&bch_chardev_minor, c->minor); -+} -+ -+int bch2_fs_chardev_init(struct bch_fs *c) -+{ -+ c->minor = idr_alloc(&bch_chardev_minor, c, 0, 0, GFP_KERNEL); -+ if (c->minor < 0) -+ return c->minor; -+ -+ c->chardev = device_create(bch_chardev_class, NULL, -+ MKDEV(bch_chardev_major, c->minor), c, -+ "bcachefs%u-ctl", c->minor); -+ if (IS_ERR(c->chardev)) -+ return PTR_ERR(c->chardev); -+ -+ return 0; -+} -+ -+void bch2_chardev_exit(void) -+{ -+ if (!IS_ERR_OR_NULL(bch_chardev_class)) -+ device_destroy(bch_chardev_class, -+ MKDEV(bch_chardev_major, U8_MAX)); -+ if (!IS_ERR_OR_NULL(bch_chardev_class)) -+ class_destroy(bch_chardev_class); -+ if (bch_chardev_major > 0) -+ unregister_chrdev(bch_chardev_major, "bcachefs"); -+} -+ -+int __init bch2_chardev_init(void) -+{ -+ bch_chardev_major = register_chrdev(0, "bcachefs-ctl", &bch_chardev_fops); -+ if (bch_chardev_major < 0) -+ return bch_chardev_major; -+ -+ bch_chardev_class = class_create(THIS_MODULE, "bcachefs"); -+ if (IS_ERR(bch_chardev_class)) -+ return PTR_ERR(bch_chardev_class); -+ -+ 
bch_chardev = device_create(bch_chardev_class, NULL, -+ MKDEV(bch_chardev_major, U8_MAX), -+ NULL, "bcachefs-ctl"); -+ if (IS_ERR(bch_chardev)) -+ return PTR_ERR(bch_chardev); -+ -+ return 0; -+} -+ -+#endif /* NO_BCACHEFS_CHARDEV */ -diff --git a/fs/bcachefs/chardev.h b/fs/bcachefs/chardev.h -new file mode 100644 -index 000000000000..3a4890d39ff9 ---- /dev/null -+++ b/fs/bcachefs/chardev.h -@@ -0,0 +1,31 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_CHARDEV_H -+#define _BCACHEFS_CHARDEV_H -+ -+#ifndef NO_BCACHEFS_FS -+ -+long bch2_fs_ioctl(struct bch_fs *, unsigned, void __user *); -+ -+void bch2_fs_chardev_exit(struct bch_fs *); -+int bch2_fs_chardev_init(struct bch_fs *); -+ -+void bch2_chardev_exit(void); -+int __init bch2_chardev_init(void); -+ -+#else -+ -+static inline long bch2_fs_ioctl(struct bch_fs *c, -+ unsigned cmd, void __user * arg) -+{ -+ return -ENOSYS; -+} -+ -+static inline void bch2_fs_chardev_exit(struct bch_fs *c) {} -+static inline int bch2_fs_chardev_init(struct bch_fs *c) { return 0; } -+ -+static inline void bch2_chardev_exit(void) {} -+static inline int __init bch2_chardev_init(void) { return 0; } -+ -+#endif /* NO_BCACHEFS_FS */ -+ -+#endif /* _BCACHEFS_CHARDEV_H */ -diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c -new file mode 100644 -index 000000000000..3d88719ba86c ---- /dev/null -+++ b/fs/bcachefs/checksum.c -@@ -0,0 +1,618 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "checksum.h" -+#include "super.h" -+#include "super-io.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+static u64 bch2_checksum_init(unsigned type) -+{ -+ switch (type) { -+ case BCH_CSUM_NONE: -+ return 0; -+ case BCH_CSUM_CRC32C_NONZERO: -+ return U32_MAX; -+ case BCH_CSUM_CRC64_NONZERO: -+ return U64_MAX; -+ case BCH_CSUM_CRC32C: -+ return 0; -+ case BCH_CSUM_CRC64: -+ return 0; -+ default: -+ BUG(); -+ } -+} -+ -+static 
u64 bch2_checksum_final(unsigned type, u64 crc) -+{ -+ switch (type) { -+ case BCH_CSUM_NONE: -+ return 0; -+ case BCH_CSUM_CRC32C_NONZERO: -+ return crc ^ U32_MAX; -+ case BCH_CSUM_CRC64_NONZERO: -+ return crc ^ U64_MAX; -+ case BCH_CSUM_CRC32C: -+ return crc; -+ case BCH_CSUM_CRC64: -+ return crc; -+ default: -+ BUG(); -+ } -+} -+ -+static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t len) -+{ -+ switch (type) { -+ case BCH_CSUM_NONE: -+ return 0; -+ case BCH_CSUM_CRC32C_NONZERO: -+ case BCH_CSUM_CRC32C: -+ return crc32c(crc, data, len); -+ case BCH_CSUM_CRC64_NONZERO: -+ case BCH_CSUM_CRC64: -+ return crc64_be(crc, data, len); -+ default: -+ BUG(); -+ } -+} -+ -+static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm, -+ struct nonce nonce, -+ struct scatterlist *sg, size_t len) -+{ -+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); -+ int ret; -+ -+ skcipher_request_set_sync_tfm(req, tfm); -+ skcipher_request_set_crypt(req, sg, sg, len, nonce.d); -+ -+ ret = crypto_skcipher_encrypt(req); -+ BUG_ON(ret); -+} -+ -+static inline void do_encrypt(struct crypto_sync_skcipher *tfm, -+ struct nonce nonce, -+ void *buf, size_t len) -+{ -+ struct scatterlist sg; -+ -+ sg_init_one(&sg, buf, len); -+ do_encrypt_sg(tfm, nonce, &sg, len); -+} -+ -+int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, -+ void *buf, size_t len) -+{ -+ struct crypto_sync_skcipher *chacha20 = -+ crypto_alloc_sync_skcipher("chacha20", 0, 0); -+ int ret; -+ -+ if (!chacha20) { -+ pr_err("error requesting chacha20 module: %li", PTR_ERR(chacha20)); -+ return PTR_ERR(chacha20); -+ } -+ -+ ret = crypto_skcipher_setkey(&chacha20->base, -+ (void *) key, sizeof(*key)); -+ if (ret) { -+ pr_err("crypto_skcipher_setkey() error: %i", ret); -+ goto err; -+ } -+ -+ do_encrypt(chacha20, nonce, buf, len); -+err: -+ crypto_free_sync_skcipher(chacha20); -+ return ret; -+} -+ -+static void gen_poly_key(struct bch_fs *c, struct shash_desc *desc, -+ struct nonce nonce) 
-+{ -+ u8 key[POLY1305_KEY_SIZE]; -+ -+ nonce.d[3] ^= BCH_NONCE_POLY; -+ -+ memset(key, 0, sizeof(key)); -+ do_encrypt(c->chacha20, nonce, key, sizeof(key)); -+ -+ desc->tfm = c->poly1305; -+ crypto_shash_init(desc); -+ crypto_shash_update(desc, key, sizeof(key)); -+} -+ -+struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, -+ struct nonce nonce, const void *data, size_t len) -+{ -+ switch (type) { -+ case BCH_CSUM_NONE: -+ case BCH_CSUM_CRC32C_NONZERO: -+ case BCH_CSUM_CRC64_NONZERO: -+ case BCH_CSUM_CRC32C: -+ case BCH_CSUM_CRC64: { -+ u64 crc = bch2_checksum_init(type); -+ -+ crc = bch2_checksum_update(type, crc, data, len); -+ crc = bch2_checksum_final(type, crc); -+ -+ return (struct bch_csum) { .lo = cpu_to_le64(crc) }; -+ } -+ -+ case BCH_CSUM_CHACHA20_POLY1305_80: -+ case BCH_CSUM_CHACHA20_POLY1305_128: { -+ SHASH_DESC_ON_STACK(desc, c->poly1305); -+ u8 digest[POLY1305_DIGEST_SIZE]; -+ struct bch_csum ret = { 0 }; -+ -+ gen_poly_key(c, desc, nonce); -+ -+ crypto_shash_update(desc, data, len); -+ crypto_shash_final(desc, digest); -+ -+ memcpy(&ret, digest, bch_crc_bytes[type]); -+ return ret; -+ } -+ default: -+ BUG(); -+ } -+} -+ -+void bch2_encrypt(struct bch_fs *c, unsigned type, -+ struct nonce nonce, void *data, size_t len) -+{ -+ if (!bch2_csum_type_is_encryption(type)) -+ return; -+ -+ do_encrypt(c->chacha20, nonce, data, len); -+} -+ -+static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, -+ struct nonce nonce, struct bio *bio, -+ struct bvec_iter *iter) -+{ -+ struct bio_vec bv; -+ -+ switch (type) { -+ case BCH_CSUM_NONE: -+ return (struct bch_csum) { 0 }; -+ case BCH_CSUM_CRC32C_NONZERO: -+ case BCH_CSUM_CRC64_NONZERO: -+ case BCH_CSUM_CRC32C: -+ case BCH_CSUM_CRC64: { -+ u64 crc = bch2_checksum_init(type); -+ -+#ifdef CONFIG_HIGHMEM -+ __bio_for_each_segment(bv, bio, *iter, *iter) { -+ void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; -+ crc = bch2_checksum_update(type, -+ crc, p, bv.bv_len); -+ kunmap_atomic(p); 
-+ } -+#else -+ __bio_for_each_bvec(bv, bio, *iter, *iter) -+ crc = bch2_checksum_update(type, crc, -+ page_address(bv.bv_page) + bv.bv_offset, -+ bv.bv_len); -+#endif -+ crc = bch2_checksum_final(type, crc); -+ return (struct bch_csum) { .lo = cpu_to_le64(crc) }; -+ } -+ -+ case BCH_CSUM_CHACHA20_POLY1305_80: -+ case BCH_CSUM_CHACHA20_POLY1305_128: { -+ SHASH_DESC_ON_STACK(desc, c->poly1305); -+ u8 digest[POLY1305_DIGEST_SIZE]; -+ struct bch_csum ret = { 0 }; -+ -+ gen_poly_key(c, desc, nonce); -+ -+#ifdef CONFIG_HIGHMEM -+ __bio_for_each_segment(bv, bio, *iter, *iter) { -+ void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; -+ -+ crypto_shash_update(desc, p, bv.bv_len); -+ kunmap_atomic(p); -+ } -+#else -+ __bio_for_each_bvec(bv, bio, *iter, *iter) -+ crypto_shash_update(desc, -+ page_address(bv.bv_page) + bv.bv_offset, -+ bv.bv_len); -+#endif -+ crypto_shash_final(desc, digest); -+ -+ memcpy(&ret, digest, bch_crc_bytes[type]); -+ return ret; -+ } -+ default: -+ BUG(); -+ } -+} -+ -+struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type, -+ struct nonce nonce, struct bio *bio) -+{ -+ struct bvec_iter iter = bio->bi_iter; -+ -+ return __bch2_checksum_bio(c, type, nonce, bio, &iter); -+} -+ -+void bch2_encrypt_bio(struct bch_fs *c, unsigned type, -+ struct nonce nonce, struct bio *bio) -+{ -+ struct bio_vec bv; -+ struct bvec_iter iter; -+ struct scatterlist sgl[16], *sg = sgl; -+ size_t bytes = 0; -+ -+ if (!bch2_csum_type_is_encryption(type)) -+ return; -+ -+ sg_init_table(sgl, ARRAY_SIZE(sgl)); -+ -+ bio_for_each_segment(bv, bio, iter) { -+ if (sg == sgl + ARRAY_SIZE(sgl)) { -+ sg_mark_end(sg - 1); -+ do_encrypt_sg(c->chacha20, nonce, sgl, bytes); -+ -+ nonce = nonce_add(nonce, bytes); -+ bytes = 0; -+ -+ sg_init_table(sgl, ARRAY_SIZE(sgl)); -+ sg = sgl; -+ } -+ -+ sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset); -+ bytes += bv.bv_len; -+ } -+ -+ sg_mark_end(sg - 1); -+ do_encrypt_sg(c->chacha20, nonce, sgl, bytes); -+} -+ -+struct bch_csum 
bch2_checksum_merge(unsigned type, struct bch_csum a, -+ struct bch_csum b, size_t b_len) -+{ -+ BUG_ON(!bch2_checksum_mergeable(type)); -+ -+ while (b_len) { -+ unsigned b = min_t(unsigned, b_len, PAGE_SIZE); -+ -+ a.lo = bch2_checksum_update(type, a.lo, -+ page_address(ZERO_PAGE(0)), b); -+ b_len -= b; -+ } -+ -+ a.lo ^= b.lo; -+ a.hi ^= b.hi; -+ return a; -+} -+ -+int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, -+ struct bversion version, -+ struct bch_extent_crc_unpacked crc_old, -+ struct bch_extent_crc_unpacked *crc_a, -+ struct bch_extent_crc_unpacked *crc_b, -+ unsigned len_a, unsigned len_b, -+ unsigned new_csum_type) -+{ -+ struct bvec_iter iter = bio->bi_iter; -+ struct nonce nonce = extent_nonce(version, crc_old); -+ struct bch_csum merged = { 0 }; -+ struct crc_split { -+ struct bch_extent_crc_unpacked *crc; -+ unsigned len; -+ unsigned csum_type; -+ struct bch_csum csum; -+ } splits[3] = { -+ { crc_a, len_a, new_csum_type }, -+ { crc_b, len_b, new_csum_type }, -+ { NULL, bio_sectors(bio) - len_a - len_b, new_csum_type }, -+ }, *i; -+ bool mergeable = crc_old.csum_type == new_csum_type && -+ bch2_checksum_mergeable(new_csum_type); -+ unsigned crc_nonce = crc_old.nonce; -+ -+ BUG_ON(len_a + len_b > bio_sectors(bio)); -+ BUG_ON(crc_old.uncompressed_size != bio_sectors(bio)); -+ BUG_ON(crc_is_compressed(crc_old)); -+ BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) != -+ bch2_csum_type_is_encryption(new_csum_type)); -+ -+ for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { -+ iter.bi_size = i->len << 9; -+ if (mergeable || i->crc) -+ i->csum = __bch2_checksum_bio(c, i->csum_type, -+ nonce, bio, &iter); -+ else -+ bio_advance_iter(bio, &iter, i->len << 9); -+ nonce = nonce_add(nonce, i->len << 9); -+ } -+ -+ if (mergeable) -+ for (i = splits; i < splits + ARRAY_SIZE(splits); i++) -+ merged = bch2_checksum_merge(new_csum_type, merged, -+ i->csum, i->len << 9); -+ else -+ merged = bch2_checksum_bio(c, crc_old.csum_type, -+ 
extent_nonce(version, crc_old), bio); -+ -+ if (bch2_crc_cmp(merged, crc_old.csum)) -+ return -EIO; -+ -+ for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { -+ if (i->crc) -+ *i->crc = (struct bch_extent_crc_unpacked) { -+ .csum_type = i->csum_type, -+ .compression_type = crc_old.compression_type, -+ .compressed_size = i->len, -+ .uncompressed_size = i->len, -+ .offset = 0, -+ .live_size = i->len, -+ .nonce = crc_nonce, -+ .csum = i->csum, -+ }; -+ -+ if (bch2_csum_type_is_encryption(new_csum_type)) -+ crc_nonce += i->len; -+ } -+ -+ return 0; -+} -+ -+#ifdef __KERNEL__ -+int bch2_request_key(struct bch_sb *sb, struct bch_key *key) -+{ -+ char key_description[60]; -+ struct key *keyring_key; -+ const struct user_key_payload *ukp; -+ int ret; -+ -+ snprintf(key_description, sizeof(key_description), -+ "bcachefs:%pUb", &sb->user_uuid); -+ -+ keyring_key = request_key(&key_type_logon, key_description, NULL); -+ if (IS_ERR(keyring_key)) -+ return PTR_ERR(keyring_key); -+ -+ down_read(&keyring_key->sem); -+ ukp = dereference_key_locked(keyring_key); -+ if (ukp->datalen == sizeof(*key)) { -+ memcpy(key, ukp->data, ukp->datalen); -+ ret = 0; -+ } else { -+ ret = -EINVAL; -+ } -+ up_read(&keyring_key->sem); -+ key_put(keyring_key); -+ -+ return ret; -+} -+#else -+#include -+#include -+ -+int bch2_request_key(struct bch_sb *sb, struct bch_key *key) -+{ -+ key_serial_t key_id; -+ char key_description[60]; -+ char uuid[40]; -+ -+ uuid_unparse_lower(sb->user_uuid.b, uuid); -+ sprintf(key_description, "bcachefs:%s", uuid); -+ -+ key_id = request_key("user", key_description, NULL, -+ KEY_SPEC_USER_KEYRING); -+ if (key_id < 0) -+ return -errno; -+ -+ if (keyctl_read(key_id, (void *) key, sizeof(*key)) != sizeof(*key)) -+ return -1; -+ -+ return 0; -+} -+#endif -+ -+int bch2_decrypt_sb_key(struct bch_fs *c, -+ struct bch_sb_field_crypt *crypt, -+ struct bch_key *key) -+{ -+ struct bch_encrypted_key sb_key = crypt->key; -+ struct bch_key user_key; -+ int ret = 0; -+ -+ /* is 
key encrypted? */ -+ if (!bch2_key_is_encrypted(&sb_key)) -+ goto out; -+ -+ ret = bch2_request_key(c->disk_sb.sb, &user_key); -+ if (ret) { -+ bch_err(c, "error requesting encryption key: %i", ret); -+ goto err; -+ } -+ -+ /* decrypt real key: */ -+ ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c), -+ &sb_key, sizeof(sb_key)); -+ if (ret) -+ goto err; -+ -+ if (bch2_key_is_encrypted(&sb_key)) { -+ bch_err(c, "incorrect encryption key"); -+ ret = -EINVAL; -+ goto err; -+ } -+out: -+ *key = sb_key.key; -+err: -+ memzero_explicit(&sb_key, sizeof(sb_key)); -+ memzero_explicit(&user_key, sizeof(user_key)); -+ return ret; -+} -+ -+static int bch2_alloc_ciphers(struct bch_fs *c) -+{ -+ if (!c->chacha20) -+ c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0); -+ if (IS_ERR(c->chacha20)) { -+ bch_err(c, "error requesting chacha20 module: %li", -+ PTR_ERR(c->chacha20)); -+ return PTR_ERR(c->chacha20); -+ } -+ -+ if (!c->poly1305) -+ c->poly1305 = crypto_alloc_shash("poly1305", 0, 0); -+ if (IS_ERR(c->poly1305)) { -+ bch_err(c, "error requesting poly1305 module: %li", -+ PTR_ERR(c->poly1305)); -+ return PTR_ERR(c->poly1305); -+ } -+ -+ return 0; -+} -+ -+int bch2_disable_encryption(struct bch_fs *c) -+{ -+ struct bch_sb_field_crypt *crypt; -+ struct bch_key key; -+ int ret = -EINVAL; -+ -+ mutex_lock(&c->sb_lock); -+ -+ crypt = bch2_sb_get_crypt(c->disk_sb.sb); -+ if (!crypt) -+ goto out; -+ -+ /* is key encrypted? 
*/ -+ ret = 0; -+ if (bch2_key_is_encrypted(&crypt->key)) -+ goto out; -+ -+ ret = bch2_decrypt_sb_key(c, crypt, &key); -+ if (ret) -+ goto out; -+ -+ crypt->key.magic = BCH_KEY_MAGIC; -+ crypt->key.key = key; -+ -+ SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0); -+ bch2_write_super(c); -+out: -+ mutex_unlock(&c->sb_lock); -+ -+ return ret; -+} -+ -+int bch2_enable_encryption(struct bch_fs *c, bool keyed) -+{ -+ struct bch_encrypted_key key; -+ struct bch_key user_key; -+ struct bch_sb_field_crypt *crypt; -+ int ret = -EINVAL; -+ -+ mutex_lock(&c->sb_lock); -+ -+ /* Do we already have an encryption key? */ -+ if (bch2_sb_get_crypt(c->disk_sb.sb)) -+ goto err; -+ -+ ret = bch2_alloc_ciphers(c); -+ if (ret) -+ goto err; -+ -+ key.magic = BCH_KEY_MAGIC; -+ get_random_bytes(&key.key, sizeof(key.key)); -+ -+ if (keyed) { -+ ret = bch2_request_key(c->disk_sb.sb, &user_key); -+ if (ret) { -+ bch_err(c, "error requesting encryption key: %i", ret); -+ goto err; -+ } -+ -+ ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c), -+ &key, sizeof(key)); -+ if (ret) -+ goto err; -+ } -+ -+ ret = crypto_skcipher_setkey(&c->chacha20->base, -+ (void *) &key.key, sizeof(key.key)); -+ if (ret) -+ goto err; -+ -+ crypt = bch2_sb_resize_crypt(&c->disk_sb, sizeof(*crypt) / sizeof(u64)); -+ if (!crypt) { -+ ret = -ENOMEM; /* XXX this technically could be -ENOSPC */ -+ goto err; -+ } -+ -+ crypt->key = key; -+ -+ /* write superblock */ -+ SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 1); -+ bch2_write_super(c); -+err: -+ mutex_unlock(&c->sb_lock); -+ memzero_explicit(&user_key, sizeof(user_key)); -+ memzero_explicit(&key, sizeof(key)); -+ return ret; -+} -+ -+void bch2_fs_encryption_exit(struct bch_fs *c) -+{ -+ if (!IS_ERR_OR_NULL(c->poly1305)) -+ crypto_free_shash(c->poly1305); -+ if (!IS_ERR_OR_NULL(c->chacha20)) -+ crypto_free_sync_skcipher(c->chacha20); -+ if (!IS_ERR_OR_NULL(c->sha256)) -+ crypto_free_shash(c->sha256); -+} -+ -+int bch2_fs_encryption_init(struct bch_fs *c) -+{ -+ 
struct bch_sb_field_crypt *crypt; -+ struct bch_key key; -+ int ret = 0; -+ -+ pr_verbose_init(c->opts, ""); -+ -+ c->sha256 = crypto_alloc_shash("sha256", 0, 0); -+ if (IS_ERR(c->sha256)) { -+ bch_err(c, "error requesting sha256 module"); -+ ret = PTR_ERR(c->sha256); -+ goto out; -+ } -+ -+ crypt = bch2_sb_get_crypt(c->disk_sb.sb); -+ if (!crypt) -+ goto out; -+ -+ ret = bch2_alloc_ciphers(c); -+ if (ret) -+ goto out; -+ -+ ret = bch2_decrypt_sb_key(c, crypt, &key); -+ if (ret) -+ goto out; -+ -+ ret = crypto_skcipher_setkey(&c->chacha20->base, -+ (void *) &key.key, sizeof(key.key)); -+ if (ret) -+ goto out; -+out: -+ memzero_explicit(&key, sizeof(key)); -+ pr_verbose_init(c->opts, "ret %i", ret); -+ return ret; -+} -diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h -new file mode 100644 -index 000000000000..24dee8039d57 ---- /dev/null -+++ b/fs/bcachefs/checksum.h -@@ -0,0 +1,202 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_CHECKSUM_H -+#define _BCACHEFS_CHECKSUM_H -+ -+#include "bcachefs.h" -+#include "extents_types.h" -+#include "super-io.h" -+ -+#include -+#include -+ -+static inline bool bch2_checksum_mergeable(unsigned type) -+{ -+ -+ switch (type) { -+ case BCH_CSUM_NONE: -+ case BCH_CSUM_CRC32C: -+ case BCH_CSUM_CRC64: -+ return true; -+ default: -+ return false; -+ } -+} -+ -+struct bch_csum bch2_checksum_merge(unsigned, struct bch_csum, -+ struct bch_csum, size_t); -+ -+#define BCH_NONCE_EXTENT cpu_to_le32(1 << 28) -+#define BCH_NONCE_BTREE cpu_to_le32(2 << 28) -+#define BCH_NONCE_JOURNAL cpu_to_le32(3 << 28) -+#define BCH_NONCE_PRIO cpu_to_le32(4 << 28) -+#define BCH_NONCE_POLY cpu_to_le32(1 << 31) -+ -+struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce, -+ const void *, size_t); -+ -+/* -+ * This is used for various on disk data structures - bch_sb, prio_set, bset, -+ * jset: The checksum is _always_ the first field of these structs -+ */ -+#define csum_vstruct(_c, _type, _nonce, _i) \ -+({ \ -+ const 
void *start = ((const void *) (_i)) + sizeof((_i)->csum); \ -+ const void *end = vstruct_end(_i); \ -+ \ -+ bch2_checksum(_c, _type, _nonce, start, end - start); \ -+}) -+ -+int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t); -+int bch2_request_key(struct bch_sb *, struct bch_key *); -+ -+void bch2_encrypt(struct bch_fs *, unsigned, struct nonce, -+ void *data, size_t); -+ -+struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned, -+ struct nonce, struct bio *); -+ -+int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion, -+ struct bch_extent_crc_unpacked, -+ struct bch_extent_crc_unpacked *, -+ struct bch_extent_crc_unpacked *, -+ unsigned, unsigned, unsigned); -+ -+void bch2_encrypt_bio(struct bch_fs *, unsigned, -+ struct nonce, struct bio *); -+ -+int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *, -+ struct bch_key *); -+ -+int bch2_disable_encryption(struct bch_fs *); -+int bch2_enable_encryption(struct bch_fs *, bool); -+ -+void bch2_fs_encryption_exit(struct bch_fs *); -+int bch2_fs_encryption_init(struct bch_fs *); -+ -+static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type, -+ bool data) -+{ -+ switch (type) { -+ case BCH_CSUM_OPT_NONE: -+ return BCH_CSUM_NONE; -+ case BCH_CSUM_OPT_CRC32C: -+ return data ? BCH_CSUM_CRC32C : BCH_CSUM_CRC32C_NONZERO; -+ case BCH_CSUM_OPT_CRC64: -+ return data ? BCH_CSUM_CRC64 : BCH_CSUM_CRC64_NONZERO; -+ default: -+ BUG(); -+ } -+} -+ -+static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c, -+ unsigned opt) -+{ -+ if (c->sb.encryption_type) -+ return c->opts.wide_macs -+ ? 
BCH_CSUM_CHACHA20_POLY1305_128 -+ : BCH_CSUM_CHACHA20_POLY1305_80; -+ -+ return bch2_csum_opt_to_type(opt, true); -+} -+ -+static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c) -+{ -+ if (c->sb.encryption_type) -+ return BCH_CSUM_CHACHA20_POLY1305_128; -+ -+ return bch2_csum_opt_to_type(c->opts.metadata_checksum, false); -+} -+ -+static const unsigned bch2_compression_opt_to_type[] = { -+#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t, -+ BCH_COMPRESSION_OPTS() -+#undef x -+}; -+ -+static inline bool bch2_checksum_type_valid(const struct bch_fs *c, -+ unsigned type) -+{ -+ if (type >= BCH_CSUM_NR) -+ return false; -+ -+ if (bch2_csum_type_is_encryption(type) && !c->chacha20) -+ return false; -+ -+ return true; -+} -+ -+/* returns true if not equal */ -+static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r) -+{ -+ /* -+ * XXX: need some way of preventing the compiler from optimizing this -+ * into a form that isn't constant time.. -+ */ -+ return ((l.lo ^ r.lo) | (l.hi ^ r.hi)) != 0; -+} -+ -+/* for skipping ahead and encrypting/decrypting at an offset: */ -+static inline struct nonce nonce_add(struct nonce nonce, unsigned offset) -+{ -+ EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1)); -+ -+ le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE); -+ return nonce; -+} -+ -+static inline struct nonce null_nonce(void) -+{ -+ struct nonce ret; -+ -+ memset(&ret, 0, sizeof(ret)); -+ return ret; -+} -+ -+static inline struct nonce extent_nonce(struct bversion version, -+ struct bch_extent_crc_unpacked crc) -+{ -+ unsigned compression_type = crc_is_compressed(crc) -+ ? crc.compression_type -+ : 0; -+ unsigned size = compression_type ? 
crc.uncompressed_size : 0; -+ struct nonce nonce = (struct nonce) {{ -+ [0] = cpu_to_le32(size << 22), -+ [1] = cpu_to_le32(version.lo), -+ [2] = cpu_to_le32(version.lo >> 32), -+ [3] = cpu_to_le32(version.hi| -+ (compression_type << 24))^BCH_NONCE_EXTENT, -+ }}; -+ -+ return nonce_add(nonce, crc.nonce << 9); -+} -+ -+static inline bool bch2_key_is_encrypted(struct bch_encrypted_key *key) -+{ -+ return le64_to_cpu(key->magic) != BCH_KEY_MAGIC; -+} -+ -+static inline struct nonce __bch2_sb_key_nonce(struct bch_sb *sb) -+{ -+ __le64 magic = __bch2_sb_magic(sb); -+ -+ return (struct nonce) {{ -+ [0] = 0, -+ [1] = 0, -+ [2] = ((__le32 *) &magic)[0], -+ [3] = ((__le32 *) &magic)[1], -+ }}; -+} -+ -+static inline struct nonce bch2_sb_key_nonce(struct bch_fs *c) -+{ -+ __le64 magic = bch2_sb_magic(c); -+ -+ return (struct nonce) {{ -+ [0] = 0, -+ [1] = 0, -+ [2] = ((__le32 *) &magic)[0], -+ [3] = ((__le32 *) &magic)[1], -+ }}; -+} -+ -+#endif /* _BCACHEFS_CHECKSUM_H */ -diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c -new file mode 100644 -index 000000000000..1d1590de55e8 ---- /dev/null -+++ b/fs/bcachefs/clock.c -@@ -0,0 +1,191 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "clock.h" -+ -+#include -+#include -+#include -+ -+static inline long io_timer_cmp(io_timer_heap *h, -+ struct io_timer *l, -+ struct io_timer *r) -+{ -+ return l->expire - r->expire; -+} -+ -+void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) -+{ -+ size_t i; -+ -+ spin_lock(&clock->timer_lock); -+ -+ if (time_after_eq((unsigned long) atomic_long_read(&clock->now), -+ timer->expire)) { -+ spin_unlock(&clock->timer_lock); -+ timer->fn(timer); -+ return; -+ } -+ -+ for (i = 0; i < clock->timers.used; i++) -+ if (clock->timers.data[i] == timer) -+ goto out; -+ -+ BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp, NULL)); -+out: -+ spin_unlock(&clock->timer_lock); -+} -+ -+void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer) -+{ 
-+ size_t i; -+ -+ spin_lock(&clock->timer_lock); -+ -+ for (i = 0; i < clock->timers.used; i++) -+ if (clock->timers.data[i] == timer) { -+ heap_del(&clock->timers, i, io_timer_cmp, NULL); -+ break; -+ } -+ -+ spin_unlock(&clock->timer_lock); -+} -+ -+struct io_clock_wait { -+ struct io_timer io_timer; -+ struct timer_list cpu_timer; -+ struct task_struct *task; -+ int expired; -+}; -+ -+static void io_clock_wait_fn(struct io_timer *timer) -+{ -+ struct io_clock_wait *wait = container_of(timer, -+ struct io_clock_wait, io_timer); -+ -+ wait->expired = 1; -+ wake_up_process(wait->task); -+} -+ -+static void io_clock_cpu_timeout(struct timer_list *timer) -+{ -+ struct io_clock_wait *wait = container_of(timer, -+ struct io_clock_wait, cpu_timer); -+ -+ wait->expired = 1; -+ wake_up_process(wait->task); -+} -+ -+void bch2_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until) -+{ -+ struct io_clock_wait wait; -+ -+ /* XXX: calculate sleep time rigorously */ -+ wait.io_timer.expire = until; -+ wait.io_timer.fn = io_clock_wait_fn; -+ wait.task = current; -+ wait.expired = 0; -+ bch2_io_timer_add(clock, &wait.io_timer); -+ -+ schedule(); -+ -+ bch2_io_timer_del(clock, &wait.io_timer); -+} -+ -+void bch2_kthread_io_clock_wait(struct io_clock *clock, -+ unsigned long io_until, -+ unsigned long cpu_timeout) -+{ -+ bool kthread = (current->flags & PF_KTHREAD) != 0; -+ struct io_clock_wait wait; -+ -+ wait.io_timer.expire = io_until; -+ wait.io_timer.fn = io_clock_wait_fn; -+ wait.task = current; -+ wait.expired = 0; -+ bch2_io_timer_add(clock, &wait.io_timer); -+ -+ timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0); -+ -+ if (cpu_timeout != MAX_SCHEDULE_TIMEOUT) -+ mod_timer(&wait.cpu_timer, cpu_timeout + jiffies); -+ -+ while (1) { -+ set_current_state(TASK_INTERRUPTIBLE); -+ if (kthread && kthread_should_stop()) -+ break; -+ -+ if (wait.expired) -+ break; -+ -+ schedule(); -+ try_to_freeze(); -+ } -+ -+ __set_current_state(TASK_RUNNING); -+ 
del_singleshot_timer_sync(&wait.cpu_timer); -+ destroy_timer_on_stack(&wait.cpu_timer); -+ bch2_io_timer_del(clock, &wait.io_timer); -+} -+ -+static struct io_timer *get_expired_timer(struct io_clock *clock, -+ unsigned long now) -+{ -+ struct io_timer *ret = NULL; -+ -+ spin_lock(&clock->timer_lock); -+ -+ if (clock->timers.used && -+ time_after_eq(now, clock->timers.data[0]->expire)) -+ heap_pop(&clock->timers, ret, io_timer_cmp, NULL); -+ -+ spin_unlock(&clock->timer_lock); -+ -+ return ret; -+} -+ -+void __bch2_increment_clock(struct io_clock *clock, unsigned sectors) -+{ -+ struct io_timer *timer; -+ unsigned long now = atomic_long_add_return(sectors, &clock->now); -+ -+ while ((timer = get_expired_timer(clock, now))) -+ timer->fn(timer); -+} -+ -+void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) -+{ -+ unsigned long now; -+ unsigned i; -+ -+ spin_lock(&clock->timer_lock); -+ now = atomic_long_read(&clock->now); -+ -+ for (i = 0; i < clock->timers.used; i++) -+ pr_buf(out, "%ps:\t%li\n", -+ clock->timers.data[i]->fn, -+ clock->timers.data[i]->expire - now); -+ spin_unlock(&clock->timer_lock); -+} -+ -+void bch2_io_clock_exit(struct io_clock *clock) -+{ -+ free_heap(&clock->timers); -+ free_percpu(clock->pcpu_buf); -+} -+ -+int bch2_io_clock_init(struct io_clock *clock) -+{ -+ atomic_long_set(&clock->now, 0); -+ spin_lock_init(&clock->timer_lock); -+ -+ clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus(); -+ -+ clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf); -+ if (!clock->pcpu_buf) -+ return -ENOMEM; -+ -+ if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL)) -+ return -ENOMEM; -+ -+ return 0; -+} -diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h -new file mode 100644 -index 000000000000..70a0f7436c84 ---- /dev/null -+++ b/fs/bcachefs/clock.h -@@ -0,0 +1,38 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_CLOCK_H -+#define _BCACHEFS_CLOCK_H -+ -+void bch2_io_timer_add(struct io_clock *, struct 
io_timer *); -+void bch2_io_timer_del(struct io_clock *, struct io_timer *); -+void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long, -+ unsigned long); -+ -+void __bch2_increment_clock(struct io_clock *, unsigned); -+ -+static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors, -+ int rw) -+{ -+ struct io_clock *clock = &c->io_clock[rw]; -+ -+ if (unlikely(this_cpu_add_return(*clock->pcpu_buf, sectors) >= -+ IO_CLOCK_PCPU_SECTORS)) -+ __bch2_increment_clock(clock, this_cpu_xchg(*clock->pcpu_buf, 0)); -+} -+ -+void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long); -+ -+#define bch2_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\ -+({ \ -+ long __ret = timeout; \ -+ might_sleep(); \ -+ if (!___wait_cond_timeout(condition)) \ -+ __ret = __wait_event_timeout(wq, condition, timeout); \ -+ __ret; \ -+}) -+ -+void bch2_io_timers_to_text(struct printbuf *, struct io_clock *); -+ -+void bch2_io_clock_exit(struct io_clock *); -+int bch2_io_clock_init(struct io_clock *); -+ -+#endif /* _BCACHEFS_CLOCK_H */ -diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h -new file mode 100644 -index 000000000000..92c740a47565 ---- /dev/null -+++ b/fs/bcachefs/clock_types.h -@@ -0,0 +1,37 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_CLOCK_TYPES_H -+#define _BCACHEFS_CLOCK_TYPES_H -+ -+#include "util.h" -+ -+#define NR_IO_TIMERS (BCH_SB_MEMBERS_MAX * 3) -+ -+/* -+ * Clocks/timers in units of sectors of IO: -+ * -+ * Note - they use percpu batching, so they're only approximate. 
-+ */ -+ -+struct io_timer; -+typedef void (*io_timer_fn)(struct io_timer *); -+ -+struct io_timer { -+ io_timer_fn fn; -+ unsigned long expire; -+}; -+ -+/* Amount to buffer up on a percpu counter */ -+#define IO_CLOCK_PCPU_SECTORS 128 -+ -+typedef HEAP(struct io_timer *) io_timer_heap; -+ -+struct io_clock { -+ atomic_long_t now; -+ u16 __percpu *pcpu_buf; -+ unsigned max_slop; -+ -+ spinlock_t timer_lock; -+ io_timer_heap timers; -+}; -+ -+#endif /* _BCACHEFS_CLOCK_TYPES_H */ -diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c -new file mode 100644 -index 000000000000..b50d2b0d5fd3 ---- /dev/null -+++ b/fs/bcachefs/compress.c -@@ -0,0 +1,629 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "checksum.h" -+#include "compress.h" -+#include "extents.h" -+#include "io.h" -+#include "super-io.h" -+ -+#include -+#include -+#include -+ -+/* Bounce buffer: */ -+struct bbuf { -+ void *b; -+ enum { -+ BB_NONE, -+ BB_VMAP, -+ BB_KMALLOC, -+ BB_MEMPOOL, -+ } type; -+ int rw; -+}; -+ -+static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw) -+{ -+ void *b; -+ -+ BUG_ON(size > c->sb.encoded_extent_max << 9); -+ -+ b = kmalloc(size, GFP_NOIO|__GFP_NOWARN); -+ if (b) -+ return (struct bbuf) { .b = b, .type = BB_KMALLOC, .rw = rw }; -+ -+ b = mempool_alloc(&c->compression_bounce[rw], GFP_NOIO); -+ if (b) -+ return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw }; -+ -+ BUG(); -+} -+ -+static bool bio_phys_contig(struct bio *bio, struct bvec_iter start) -+{ -+ struct bio_vec bv; -+ struct bvec_iter iter; -+ void *expected_start = NULL; -+ -+ __bio_for_each_bvec(bv, bio, iter, start) { -+ if (expected_start && -+ expected_start != page_address(bv.bv_page) + bv.bv_offset) -+ return false; -+ -+ expected_start = page_address(bv.bv_page) + -+ bv.bv_offset + bv.bv_len; -+ } -+ -+ return true; -+} -+ -+static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, -+ struct bvec_iter start, int rw) -+{ -+ struct 
bbuf ret; -+ struct bio_vec bv; -+ struct bvec_iter iter; -+ unsigned nr_pages = 0; -+ struct page *stack_pages[16]; -+ struct page **pages = NULL; -+ void *data; -+ -+ BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max); -+ -+ if (!IS_ENABLED(CONFIG_HIGHMEM) && -+ bio_phys_contig(bio, start)) -+ return (struct bbuf) { -+ .b = page_address(bio_iter_page(bio, start)) + -+ bio_iter_offset(bio, start), -+ .type = BB_NONE, .rw = rw -+ }; -+ -+ /* check if we can map the pages contiguously: */ -+ __bio_for_each_segment(bv, bio, iter, start) { -+ if (iter.bi_size != start.bi_size && -+ bv.bv_offset) -+ goto bounce; -+ -+ if (bv.bv_len < iter.bi_size && -+ bv.bv_offset + bv.bv_len < PAGE_SIZE) -+ goto bounce; -+ -+ nr_pages++; -+ } -+ -+ BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr_pages); -+ -+ pages = nr_pages > ARRAY_SIZE(stack_pages) -+ ? kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOIO) -+ : stack_pages; -+ if (!pages) -+ goto bounce; -+ -+ nr_pages = 0; -+ __bio_for_each_segment(bv, bio, iter, start) -+ pages[nr_pages++] = bv.bv_page; -+ -+ data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); -+ if (pages != stack_pages) -+ kfree(pages); -+ -+ if (data) -+ return (struct bbuf) { -+ .b = data + bio_iter_offset(bio, start), -+ .type = BB_VMAP, .rw = rw -+ }; -+bounce: -+ ret = __bounce_alloc(c, start.bi_size, rw); -+ -+ if (rw == READ) -+ memcpy_from_bio(ret.b, bio, start); -+ -+ return ret; -+} -+ -+static struct bbuf bio_map_or_bounce(struct bch_fs *c, struct bio *bio, int rw) -+{ -+ return __bio_map_or_bounce(c, bio, bio->bi_iter, rw); -+} -+ -+static void bio_unmap_or_unbounce(struct bch_fs *c, struct bbuf buf) -+{ -+ switch (buf.type) { -+ case BB_NONE: -+ break; -+ case BB_VMAP: -+ vunmap((void *) ((unsigned long) buf.b & PAGE_MASK)); -+ break; -+ case BB_KMALLOC: -+ kfree(buf.b); -+ break; -+ case BB_MEMPOOL: -+ mempool_free(buf.b, &c->compression_bounce[buf.rw]); -+ break; -+ } -+} -+ -+static inline void zlib_set_workspace(z_stream *strm, 
void *workspace) -+{ -+#ifdef __KERNEL__ -+ strm->workspace = workspace; -+#endif -+} -+ -+static int __bio_uncompress(struct bch_fs *c, struct bio *src, -+ void *dst_data, struct bch_extent_crc_unpacked crc) -+{ -+ struct bbuf src_data = { NULL }; -+ size_t src_len = src->bi_iter.bi_size; -+ size_t dst_len = crc.uncompressed_size << 9; -+ void *workspace; -+ int ret; -+ -+ src_data = bio_map_or_bounce(c, src, READ); -+ -+ switch (crc.compression_type) { -+ case BCH_COMPRESSION_TYPE_lz4_old: -+ case BCH_COMPRESSION_TYPE_lz4: -+ ret = LZ4_decompress_safe_partial(src_data.b, dst_data, -+ src_len, dst_len, dst_len); -+ if (ret != dst_len) -+ goto err; -+ break; -+ case BCH_COMPRESSION_TYPE_gzip: { -+ z_stream strm = { -+ .next_in = src_data.b, -+ .avail_in = src_len, -+ .next_out = dst_data, -+ .avail_out = dst_len, -+ }; -+ -+ workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO); -+ -+ zlib_set_workspace(&strm, workspace); -+ zlib_inflateInit2(&strm, -MAX_WBITS); -+ ret = zlib_inflate(&strm, Z_FINISH); -+ -+ mempool_free(workspace, &c->decompress_workspace); -+ -+ if (ret != Z_STREAM_END) -+ goto err; -+ break; -+ } -+ case BCH_COMPRESSION_TYPE_zstd: { -+ ZSTD_DCtx *ctx; -+ size_t real_src_len = le32_to_cpup(src_data.b); -+ -+ if (real_src_len > src_len - 4) -+ goto err; -+ -+ workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO); -+ ctx = ZSTD_initDCtx(workspace, ZSTD_DCtxWorkspaceBound()); -+ -+ ret = ZSTD_decompressDCtx(ctx, -+ dst_data, dst_len, -+ src_data.b + 4, real_src_len); -+ -+ mempool_free(workspace, &c->decompress_workspace); -+ -+ if (ret != dst_len) -+ goto err; -+ break; -+ } -+ default: -+ BUG(); -+ } -+ ret = 0; -+out: -+ bio_unmap_or_unbounce(c, src_data); -+ return ret; -+err: -+ ret = -EIO; -+ goto out; -+} -+ -+int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, -+ struct bch_extent_crc_unpacked *crc) -+{ -+ struct bbuf data = { NULL }; -+ size_t dst_len = crc->uncompressed_size << 9; -+ -+ /* bio must own its 
pages: */ -+ BUG_ON(!bio->bi_vcnt); -+ BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs); -+ -+ if (crc->uncompressed_size > c->sb.encoded_extent_max || -+ crc->compressed_size > c->sb.encoded_extent_max) { -+ bch_err(c, "error rewriting existing data: extent too big"); -+ return -EIO; -+ } -+ -+ data = __bounce_alloc(c, dst_len, WRITE); -+ -+ if (__bio_uncompress(c, bio, data.b, *crc)) { -+ bch_err(c, "error rewriting existing data: decompression error"); -+ bio_unmap_or_unbounce(c, data); -+ return -EIO; -+ } -+ -+ /* -+ * XXX: don't have a good way to assert that the bio was allocated with -+ * enough space, we depend on bch2_move_extent doing the right thing -+ */ -+ bio->bi_iter.bi_size = crc->live_size << 9; -+ -+ memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9)); -+ -+ crc->csum_type = 0; -+ crc->compression_type = 0; -+ crc->compressed_size = crc->live_size; -+ crc->uncompressed_size = crc->live_size; -+ crc->offset = 0; -+ crc->csum = (struct bch_csum) { 0, 0 }; -+ -+ bio_unmap_or_unbounce(c, data); -+ return 0; -+} -+ -+int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, -+ struct bio *dst, struct bvec_iter dst_iter, -+ struct bch_extent_crc_unpacked crc) -+{ -+ struct bbuf dst_data = { NULL }; -+ size_t dst_len = crc.uncompressed_size << 9; -+ int ret = -ENOMEM; -+ -+ if (crc.uncompressed_size > c->sb.encoded_extent_max || -+ crc.compressed_size > c->sb.encoded_extent_max) -+ return -EIO; -+ -+ dst_data = dst_len == dst_iter.bi_size -+ ? 
__bio_map_or_bounce(c, dst, dst_iter, WRITE) -+ : __bounce_alloc(c, dst_len, WRITE); -+ -+ ret = __bio_uncompress(c, src, dst_data.b, crc); -+ if (ret) -+ goto err; -+ -+ if (dst_data.type != BB_NONE && -+ dst_data.type != BB_VMAP) -+ memcpy_to_bio(dst, dst_iter, dst_data.b + (crc.offset << 9)); -+err: -+ bio_unmap_or_unbounce(c, dst_data); -+ return ret; -+} -+ -+static int attempt_compress(struct bch_fs *c, -+ void *workspace, -+ void *dst, size_t dst_len, -+ void *src, size_t src_len, -+ enum bch_compression_type compression_type) -+{ -+ switch (compression_type) { -+ case BCH_COMPRESSION_TYPE_lz4: { -+ int len = src_len; -+ int ret = LZ4_compress_destSize( -+ src, dst, -+ &len, dst_len, -+ workspace); -+ -+ if (len < src_len) -+ return -len; -+ -+ return ret; -+ } -+ case BCH_COMPRESSION_TYPE_gzip: { -+ z_stream strm = { -+ .next_in = src, -+ .avail_in = src_len, -+ .next_out = dst, -+ .avail_out = dst_len, -+ }; -+ -+ zlib_set_workspace(&strm, workspace); -+ zlib_deflateInit2(&strm, Z_DEFAULT_COMPRESSION, -+ Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL, -+ Z_DEFAULT_STRATEGY); -+ -+ if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END) -+ return 0; -+ -+ if (zlib_deflateEnd(&strm) != Z_OK) -+ return 0; -+ -+ return strm.total_out; -+ } -+ case BCH_COMPRESSION_TYPE_zstd: { -+ ZSTD_CCtx *ctx = ZSTD_initCCtx(workspace, -+ ZSTD_CCtxWorkspaceBound(c->zstd_params.cParams)); -+ -+ size_t len = ZSTD_compressCCtx(ctx, -+ dst + 4, dst_len - 4, -+ src, src_len, -+ c->zstd_params); -+ if (ZSTD_isError(len)) -+ return 0; -+ -+ *((__le32 *) dst) = cpu_to_le32(len); -+ return len + 4; -+ } -+ default: -+ BUG(); -+ } -+} -+ -+static unsigned __bio_compress(struct bch_fs *c, -+ struct bio *dst, size_t *dst_len, -+ struct bio *src, size_t *src_len, -+ enum bch_compression_type compression_type) -+{ -+ struct bbuf src_data = { NULL }, dst_data = { NULL }; -+ void *workspace; -+ unsigned pad; -+ int ret = 0; -+ -+ BUG_ON(compression_type >= BCH_COMPRESSION_TYPE_NR); -+ 
BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type])); -+ -+ /* If it's only one block, don't bother trying to compress: */ -+ if (bio_sectors(src) <= c->opts.block_size) -+ return 0; -+ -+ dst_data = bio_map_or_bounce(c, dst, WRITE); -+ src_data = bio_map_or_bounce(c, src, READ); -+ -+ workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOIO); -+ -+ *src_len = src->bi_iter.bi_size; -+ *dst_len = dst->bi_iter.bi_size; -+ -+ /* -+ * XXX: this algorithm sucks when the compression code doesn't tell us -+ * how much would fit, like LZ4 does: -+ */ -+ while (1) { -+ if (*src_len <= block_bytes(c)) { -+ ret = -1; -+ break; -+ } -+ -+ ret = attempt_compress(c, workspace, -+ dst_data.b, *dst_len, -+ src_data.b, *src_len, -+ compression_type); -+ if (ret > 0) { -+ *dst_len = ret; -+ ret = 0; -+ break; -+ } -+ -+ /* Didn't fit: should we retry with a smaller amount? */ -+ if (*src_len <= *dst_len) { -+ ret = -1; -+ break; -+ } -+ -+ /* -+ * If ret is negative, it's a hint as to how much data would fit -+ */ -+ BUG_ON(-ret >= *src_len); -+ -+ if (ret < 0) -+ *src_len = -ret; -+ else -+ *src_len -= (*src_len - *dst_len) / 2; -+ *src_len = round_down(*src_len, block_bytes(c)); -+ } -+ -+ mempool_free(workspace, &c->compress_workspace[compression_type]); -+ -+ if (ret) -+ goto err; -+ -+ /* Didn't get smaller: */ -+ if (round_up(*dst_len, block_bytes(c)) >= *src_len) -+ goto err; -+ -+ pad = round_up(*dst_len, block_bytes(c)) - *dst_len; -+ -+ memset(dst_data.b + *dst_len, 0, pad); -+ *dst_len += pad; -+ -+ if (dst_data.type != BB_NONE && -+ dst_data.type != BB_VMAP) -+ memcpy_to_bio(dst, dst->bi_iter, dst_data.b); -+ -+ BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size); -+ BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size); -+ BUG_ON(*dst_len & (block_bytes(c) - 1)); -+ BUG_ON(*src_len & (block_bytes(c) - 1)); -+out: -+ bio_unmap_or_unbounce(c, src_data); -+ bio_unmap_or_unbounce(c, dst_data); -+ return compression_type; -+err: -+ 
compression_type = BCH_COMPRESSION_TYPE_incompressible; -+ goto out; -+} -+ -+unsigned bch2_bio_compress(struct bch_fs *c, -+ struct bio *dst, size_t *dst_len, -+ struct bio *src, size_t *src_len, -+ unsigned compression_type) -+{ -+ unsigned orig_dst = dst->bi_iter.bi_size; -+ unsigned orig_src = src->bi_iter.bi_size; -+ -+ /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */ -+ src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size, -+ c->sb.encoded_extent_max << 9); -+ /* Don't generate a bigger output than input: */ -+ dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); -+ -+ if (compression_type == BCH_COMPRESSION_TYPE_lz4_old) -+ compression_type = BCH_COMPRESSION_TYPE_lz4; -+ -+ compression_type = -+ __bio_compress(c, dst, dst_len, src, src_len, compression_type); -+ -+ dst->bi_iter.bi_size = orig_dst; -+ src->bi_iter.bi_size = orig_src; -+ return compression_type; -+} -+ -+static int __bch2_fs_compress_init(struct bch_fs *, u64); -+ -+#define BCH_FEATURE_none 0 -+ -+static const unsigned bch2_compression_opt_to_feature[] = { -+#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t, -+ BCH_COMPRESSION_OPTS() -+#undef x -+}; -+ -+#undef BCH_FEATURE_none -+ -+static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f) -+{ -+ int ret = 0; -+ -+ if ((c->sb.features & f) == f) -+ return 0; -+ -+ mutex_lock(&c->sb_lock); -+ -+ if ((c->sb.features & f) == f) { -+ mutex_unlock(&c->sb_lock); -+ return 0; -+ } -+ -+ ret = __bch2_fs_compress_init(c, c->sb.features|f); -+ if (ret) { -+ mutex_unlock(&c->sb_lock); -+ return ret; -+ } -+ -+ c->disk_sb.sb->features[0] |= cpu_to_le64(f); -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ return 0; -+} -+ -+int bch2_check_set_has_compressed_data(struct bch_fs *c, -+ unsigned compression_type) -+{ -+ BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature)); -+ -+ return compression_type -+ ? 
__bch2_check_set_has_compressed_data(c, -+ 1ULL << bch2_compression_opt_to_feature[compression_type]) -+ : 0; -+} -+ -+void bch2_fs_compress_exit(struct bch_fs *c) -+{ -+ unsigned i; -+ -+ mempool_exit(&c->decompress_workspace); -+ for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++) -+ mempool_exit(&c->compress_workspace[i]); -+ mempool_exit(&c->compression_bounce[WRITE]); -+ mempool_exit(&c->compression_bounce[READ]); -+} -+ -+static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) -+{ -+ size_t max_extent = c->sb.encoded_extent_max << 9; -+ size_t decompress_workspace_size = 0; -+ bool decompress_workspace_needed; -+ ZSTD_parameters params = ZSTD_getParams(0, max_extent, 0); -+ struct { -+ unsigned feature; -+ unsigned type; -+ size_t compress_workspace; -+ size_t decompress_workspace; -+ } compression_types[] = { -+ { BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4, LZ4_MEM_COMPRESS, 0 }, -+ { BCH_FEATURE_gzip, BCH_COMPRESSION_TYPE_gzip, -+ zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), -+ zlib_inflate_workspacesize(), }, -+ { BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd, -+ ZSTD_CCtxWorkspaceBound(params.cParams), -+ ZSTD_DCtxWorkspaceBound() }, -+ }, *i; -+ int ret = 0; -+ -+ pr_verbose_init(c->opts, ""); -+ -+ c->zstd_params = params; -+ -+ for (i = compression_types; -+ i < compression_types + ARRAY_SIZE(compression_types); -+ i++) -+ if (features & (1 << i->feature)) -+ goto have_compressed; -+ -+ goto out; -+have_compressed: -+ -+ if (!mempool_initialized(&c->compression_bounce[READ])) { -+ ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[READ], -+ 1, max_extent); -+ if (ret) -+ goto out; -+ } -+ -+ if (!mempool_initialized(&c->compression_bounce[WRITE])) { -+ ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE], -+ 1, max_extent); -+ if (ret) -+ goto out; -+ } -+ -+ for (i = compression_types; -+ i < compression_types + ARRAY_SIZE(compression_types); -+ i++) { -+ decompress_workspace_size = -+ 
max(decompress_workspace_size, i->decompress_workspace); -+ -+ if (!(features & (1 << i->feature))) -+ continue; -+ -+ if (i->decompress_workspace) -+ decompress_workspace_needed = true; -+ -+ if (mempool_initialized(&c->compress_workspace[i->type])) -+ continue; -+ -+ ret = mempool_init_kvpmalloc_pool( -+ &c->compress_workspace[i->type], -+ 1, i->compress_workspace); -+ if (ret) -+ goto out; -+ } -+ -+ if (!mempool_initialized(&c->decompress_workspace)) { -+ ret = mempool_init_kvpmalloc_pool( -+ &c->decompress_workspace, -+ 1, decompress_workspace_size); -+ if (ret) -+ goto out; -+ } -+out: -+ pr_verbose_init(c->opts, "ret %i", ret); -+ return ret; -+} -+ -+int bch2_fs_compress_init(struct bch_fs *c) -+{ -+ u64 f = c->sb.features; -+ -+ if (c->opts.compression) -+ f |= 1ULL << bch2_compression_opt_to_feature[c->opts.compression]; -+ -+ if (c->opts.background_compression) -+ f |= 1ULL << bch2_compression_opt_to_feature[c->opts.background_compression]; -+ -+ return __bch2_fs_compress_init(c, f); -+ -+} -diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h -new file mode 100644 -index 000000000000..4bab1f61b3b5 ---- /dev/null -+++ b/fs/bcachefs/compress.h -@@ -0,0 +1,18 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_COMPRESS_H -+#define _BCACHEFS_COMPRESS_H -+ -+#include "extents_types.h" -+ -+int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *, -+ struct bch_extent_crc_unpacked *); -+int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, -+ struct bvec_iter, struct bch_extent_crc_unpacked); -+unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *, -+ struct bio *, size_t *, unsigned); -+ -+int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned); -+void bch2_fs_compress_exit(struct bch_fs *); -+int bch2_fs_compress_init(struct bch_fs *); -+ -+#endif /* _BCACHEFS_COMPRESS_H */ -diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c -new file mode 100644 -index 000000000000..aa10591a3b1a ---- 
/dev/null -+++ b/fs/bcachefs/debug.c -@@ -0,0 +1,432 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Assorted bcachefs debug code -+ * -+ * Copyright 2010, 2011 Kent Overstreet -+ * Copyright 2012 Google, Inc. -+ */ -+ -+#include "bcachefs.h" -+#include "bkey_methods.h" -+#include "btree_cache.h" -+#include "btree_io.h" -+#include "btree_iter.h" -+#include "btree_update.h" -+#include "buckets.h" -+#include "debug.h" -+#include "error.h" -+#include "extents.h" -+#include "fsck.h" -+#include "inode.h" -+#include "io.h" -+#include "super.h" -+ -+#include -+#include -+#include -+#include -+#include -+ -+static struct dentry *bch_debug; -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ -+void __bch2_btree_verify(struct bch_fs *c, struct btree *b) -+{ -+ struct btree *v = c->verify_data; -+ struct btree_node *n_ondisk, *n_sorted, *n_inmemory; -+ struct bset *sorted, *inmemory; -+ struct extent_ptr_decoded pick; -+ struct bch_dev *ca; -+ struct bio *bio; -+ -+ if (c->opts.nochanges) -+ return; -+ -+ btree_node_io_lock(b); -+ mutex_lock(&c->verify_lock); -+ -+ n_ondisk = c->verify_ondisk; -+ n_sorted = c->verify_data->data; -+ n_inmemory = b->data; -+ -+ bkey_copy(&v->key, &b->key); -+ v->written = 0; -+ v->c.level = b->c.level; -+ v->c.btree_id = b->c.btree_id; -+ bch2_btree_keys_init(v, &c->expensive_debug_checks); -+ -+ if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), -+ NULL, &pick) <= 0) -+ return; -+ -+ ca = bch_dev_bkey_exists(c, pick.ptr.dev); -+ if (!bch2_dev_get_ioref(ca, READ)) -+ return; -+ -+ bio = bio_alloc_bioset(GFP_NOIO, -+ buf_pages(n_sorted, btree_bytes(c)), -+ &c->btree_bio); -+ bio_set_dev(bio, ca->disk_sb.bdev); -+ bio->bi_opf = REQ_OP_READ|REQ_META; -+ bio->bi_iter.bi_sector = pick.ptr.offset; -+ bch2_bio_map(bio, n_sorted, btree_bytes(c)); -+ -+ submit_bio_wait(bio); -+ -+ bio_put(bio); -+ percpu_ref_put(&ca->io_ref); -+ -+ memcpy(n_ondisk, n_sorted, btree_bytes(c)); -+ -+ if (bch2_btree_node_read_done(c, v, false)) -+ goto out; -+ -+ n_sorted = 
c->verify_data->data; -+ sorted = &n_sorted->keys; -+ inmemory = &n_inmemory->keys; -+ -+ if (inmemory->u64s != sorted->u64s || -+ memcmp(inmemory->start, -+ sorted->start, -+ vstruct_end(inmemory) - (void *) inmemory->start)) { -+ unsigned offset = 0, sectors; -+ struct bset *i; -+ unsigned j; -+ -+ console_lock(); -+ -+ printk(KERN_ERR "*** in memory:\n"); -+ bch2_dump_bset(c, b, inmemory, 0); -+ -+ printk(KERN_ERR "*** read back in:\n"); -+ bch2_dump_bset(c, v, sorted, 0); -+ -+ while (offset < b->written) { -+ if (!offset ) { -+ i = &n_ondisk->keys; -+ sectors = vstruct_blocks(n_ondisk, c->block_bits) << -+ c->block_bits; -+ } else { -+ struct btree_node_entry *bne = -+ (void *) n_ondisk + (offset << 9); -+ i = &bne->keys; -+ -+ sectors = vstruct_blocks(bne, c->block_bits) << -+ c->block_bits; -+ } -+ -+ printk(KERN_ERR "*** on disk block %u:\n", offset); -+ bch2_dump_bset(c, b, i, offset); -+ -+ offset += sectors; -+ } -+ -+ printk(KERN_ERR "*** block %u/%u not written\n", -+ offset >> c->block_bits, btree_blocks(c)); -+ -+ for (j = 0; j < le16_to_cpu(inmemory->u64s); j++) -+ if (inmemory->_data[j] != sorted->_data[j]) -+ break; -+ -+ printk(KERN_ERR "b->written %u\n", b->written); -+ -+ console_unlock(); -+ panic("verify failed at %u\n", j); -+ } -+out: -+ mutex_unlock(&c->verify_lock); -+ btree_node_io_unlock(b); -+} -+ -+#endif -+ -+#ifdef CONFIG_DEBUG_FS -+ -+/* XXX: bch_fs refcounting */ -+ -+struct dump_iter { -+ struct bpos from; -+ struct bch_fs *c; -+ enum btree_id id; -+ -+ char buf[PAGE_SIZE]; -+ size_t bytes; /* what's currently in buf */ -+ -+ char __user *ubuf; /* destination user buffer */ -+ size_t size; /* size of requested read */ -+ ssize_t ret; /* bytes read so far */ -+}; -+ -+static int flush_buf(struct dump_iter *i) -+{ -+ if (i->bytes) { -+ size_t bytes = min(i->bytes, i->size); -+ int err = copy_to_user(i->ubuf, i->buf, bytes); -+ -+ if (err) -+ return err; -+ -+ i->ret += bytes; -+ i->ubuf += bytes; -+ i->size -= bytes; -+ i->bytes -= 
bytes; -+ memmove(i->buf, i->buf + bytes, i->bytes); -+ } -+ -+ return 0; -+} -+ -+static int bch2_dump_open(struct inode *inode, struct file *file) -+{ -+ struct btree_debug *bd = inode->i_private; -+ struct dump_iter *i; -+ -+ i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); -+ if (!i) -+ return -ENOMEM; -+ -+ file->private_data = i; -+ i->from = POS_MIN; -+ i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]); -+ i->id = bd->id; -+ -+ return 0; -+} -+ -+static int bch2_dump_release(struct inode *inode, struct file *file) -+{ -+ kfree(file->private_data); -+ return 0; -+} -+ -+static ssize_t bch2_read_btree(struct file *file, char __user *buf, -+ size_t size, loff_t *ppos) -+{ -+ struct dump_iter *i = file->private_data; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int err; -+ -+ i->ubuf = buf; -+ i->size = size; -+ i->ret = 0; -+ -+ err = flush_buf(i); -+ if (err) -+ return err; -+ -+ if (!i->size) -+ return i->ret; -+ -+ bch2_trans_init(&trans, i->c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH); -+ k = bch2_btree_iter_peek(iter); -+ -+ while (k.k && !(err = bkey_err(k))) { -+ bch2_bkey_val_to_text(&PBUF(i->buf), i->c, k); -+ i->bytes = strlen(i->buf); -+ BUG_ON(i->bytes >= PAGE_SIZE); -+ i->buf[i->bytes] = '\n'; -+ i->bytes++; -+ -+ k = bch2_btree_iter_next(iter); -+ i->from = iter->pos; -+ -+ err = flush_buf(i); -+ if (err) -+ break; -+ -+ if (!i->size) -+ break; -+ } -+ bch2_trans_exit(&trans); -+ -+ return err < 0 ? 
err : i->ret; -+} -+ -+static const struct file_operations btree_debug_ops = { -+ .owner = THIS_MODULE, -+ .open = bch2_dump_open, -+ .release = bch2_dump_release, -+ .read = bch2_read_btree, -+}; -+ -+static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, -+ size_t size, loff_t *ppos) -+{ -+ struct dump_iter *i = file->private_data; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct btree *b; -+ int err; -+ -+ i->ubuf = buf; -+ i->size = size; -+ i->ret = 0; -+ -+ err = flush_buf(i); -+ if (err) -+ return err; -+ -+ if (!i->size || !bkey_cmp(POS_MAX, i->from)) -+ return i->ret; -+ -+ bch2_trans_init(&trans, i->c, 0, 0); -+ -+ for_each_btree_node(&trans, iter, i->id, i->from, 0, b) { -+ bch2_btree_node_to_text(&PBUF(i->buf), i->c, b); -+ i->bytes = strlen(i->buf); -+ err = flush_buf(i); -+ if (err) -+ break; -+ -+ /* -+ * can't easily correctly restart a btree node traversal across -+ * all nodes, meh -+ */ -+ i->from = bkey_cmp(POS_MAX, b->key.k.p) -+ ? bkey_successor(b->key.k.p) -+ : b->key.k.p; -+ -+ if (!i->size) -+ break; -+ } -+ bch2_trans_exit(&trans); -+ -+ return err < 0 ? 
err : i->ret; -+} -+ -+static const struct file_operations btree_format_debug_ops = { -+ .owner = THIS_MODULE, -+ .open = bch2_dump_open, -+ .release = bch2_dump_release, -+ .read = bch2_read_btree_formats, -+}; -+ -+static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, -+ size_t size, loff_t *ppos) -+{ -+ struct dump_iter *i = file->private_data; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct btree *prev_node = NULL; -+ int err; -+ -+ i->ubuf = buf; -+ i->size = size; -+ i->ret = 0; -+ -+ err = flush_buf(i); -+ if (err) -+ return err; -+ -+ if (!i->size) -+ return i->ret; -+ -+ bch2_trans_init(&trans, i->c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH); -+ -+ while ((k = bch2_btree_iter_peek(iter)).k && -+ !(err = bkey_err(k))) { -+ struct btree_iter_level *l = &iter->l[0]; -+ struct bkey_packed *_k = -+ bch2_btree_node_iter_peek(&l->iter, l->b); -+ -+ if (l->b != prev_node) { -+ bch2_btree_node_to_text(&PBUF(i->buf), i->c, l->b); -+ i->bytes = strlen(i->buf); -+ err = flush_buf(i); -+ if (err) -+ break; -+ } -+ prev_node = l->b; -+ -+ bch2_bfloat_to_text(&PBUF(i->buf), l->b, _k); -+ i->bytes = strlen(i->buf); -+ err = flush_buf(i); -+ if (err) -+ break; -+ -+ bch2_btree_iter_next(iter); -+ i->from = iter->pos; -+ -+ err = flush_buf(i); -+ if (err) -+ break; -+ -+ if (!i->size) -+ break; -+ } -+ bch2_trans_exit(&trans); -+ -+ return err < 0 ? 
err : i->ret; -+} -+ -+static const struct file_operations bfloat_failed_debug_ops = { -+ .owner = THIS_MODULE, -+ .open = bch2_dump_open, -+ .release = bch2_dump_release, -+ .read = bch2_read_bfloat_failed, -+}; -+ -+void bch2_fs_debug_exit(struct bch_fs *c) -+{ -+ if (!IS_ERR_OR_NULL(c->debug)) -+ debugfs_remove_recursive(c->debug); -+} -+ -+void bch2_fs_debug_init(struct bch_fs *c) -+{ -+ struct btree_debug *bd; -+ char name[100]; -+ -+ if (IS_ERR_OR_NULL(bch_debug)) -+ return; -+ -+ snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b); -+ c->debug = debugfs_create_dir(name, bch_debug); -+ if (IS_ERR_OR_NULL(c->debug)) -+ return; -+ -+ for (bd = c->btree_debug; -+ bd < c->btree_debug + ARRAY_SIZE(c->btree_debug); -+ bd++) { -+ bd->id = bd - c->btree_debug; -+ bd->btree = debugfs_create_file(bch2_btree_ids[bd->id], -+ 0400, c->debug, bd, -+ &btree_debug_ops); -+ -+ snprintf(name, sizeof(name), "%s-formats", -+ bch2_btree_ids[bd->id]); -+ -+ bd->btree_format = debugfs_create_file(name, 0400, c->debug, bd, -+ &btree_format_debug_ops); -+ -+ snprintf(name, sizeof(name), "%s-bfloat-failed", -+ bch2_btree_ids[bd->id]); -+ -+ bd->failed = debugfs_create_file(name, 0400, c->debug, bd, -+ &bfloat_failed_debug_ops); -+ } -+} -+ -+#endif -+ -+void bch2_debug_exit(void) -+{ -+ if (!IS_ERR_OR_NULL(bch_debug)) -+ debugfs_remove_recursive(bch_debug); -+} -+ -+int __init bch2_debug_init(void) -+{ -+ int ret = 0; -+ -+ bch_debug = debugfs_create_dir("bcachefs", NULL); -+ return ret; -+} -diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h -new file mode 100644 -index 000000000000..56c2d1ab5f63 ---- /dev/null -+++ b/fs/bcachefs/debug.h -@@ -0,0 +1,63 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_DEBUG_H -+#define _BCACHEFS_DEBUG_H -+ -+#include "bcachefs.h" -+ -+struct bio; -+struct btree; -+struct bch_fs; -+ -+#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name; -+BCH_DEBUG_PARAMS() -+#undef BCH_DEBUG_PARAM -+ -+#define BCH_DEBUG_PARAM(name, 
description) \ -+ static inline bool name(struct bch_fs *c) \ -+ { return bch2_##name || c->name; } -+BCH_DEBUG_PARAMS_ALWAYS() -+#undef BCH_DEBUG_PARAM -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ -+#define BCH_DEBUG_PARAM(name, description) \ -+ static inline bool name(struct bch_fs *c) \ -+ { return bch2_##name || c->name; } -+BCH_DEBUG_PARAMS_DEBUG() -+#undef BCH_DEBUG_PARAM -+ -+void __bch2_btree_verify(struct bch_fs *, struct btree *); -+ -+#define bypass_torture_test(d) ((d)->bypass_torture_test) -+ -+#else /* DEBUG */ -+ -+#define BCH_DEBUG_PARAM(name, description) \ -+ static inline bool name(struct bch_fs *c) { return false; } -+BCH_DEBUG_PARAMS_DEBUG() -+#undef BCH_DEBUG_PARAM -+ -+static inline void __bch2_btree_verify(struct bch_fs *c, struct btree *b) {} -+ -+#define bypass_torture_test(d) 0 -+ -+#endif -+ -+static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b) -+{ -+ if (verify_btree_ondisk(c)) -+ __bch2_btree_verify(c, b); -+} -+ -+#ifdef CONFIG_DEBUG_FS -+void bch2_fs_debug_exit(struct bch_fs *); -+void bch2_fs_debug_init(struct bch_fs *); -+#else -+static inline void bch2_fs_debug_exit(struct bch_fs *c) {} -+static inline void bch2_fs_debug_init(struct bch_fs *c) {} -+#endif -+ -+void bch2_debug_exit(void); -+int bch2_debug_init(void); -+ -+#endif /* _BCACHEFS_DEBUG_H */ -diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c -new file mode 100644 -index 000000000000..f34bfda8ab0d ---- /dev/null -+++ b/fs/bcachefs/dirent.c -@@ -0,0 +1,385 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "bkey_methods.h" -+#include "btree_update.h" -+#include "extents.h" -+#include "dirent.h" -+#include "fs.h" -+#include "keylist.h" -+#include "str_hash.h" -+ -+#include -+ -+unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) -+{ -+ unsigned len = bkey_val_bytes(d.k) - -+ offsetof(struct bch_dirent, d_name); -+ -+ return strnlen(d.v->d_name, len); -+} -+ -+static u64 bch2_dirent_hash(const struct bch_hash_info *info, 
-+ const struct qstr *name) -+{ -+ struct bch_str_hash_ctx ctx; -+ -+ bch2_str_hash_init(&ctx, info); -+ bch2_str_hash_update(&ctx, info, name->name, name->len); -+ -+ /* [0,2) reserved for dots */ -+ return max_t(u64, bch2_str_hash_end(&ctx, info), 2); -+} -+ -+static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key) -+{ -+ return bch2_dirent_hash(info, key); -+} -+ -+static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) -+{ -+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); -+ struct qstr name = QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d)); -+ -+ return bch2_dirent_hash(info, &name); -+} -+ -+static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r) -+{ -+ struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); -+ int len = bch2_dirent_name_bytes(l); -+ const struct qstr *r = _r; -+ -+ return len - r->len ?: memcmp(l.v->d_name, r->name, len); -+} -+ -+static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) -+{ -+ struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); -+ struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r); -+ int l_len = bch2_dirent_name_bytes(l); -+ int r_len = bch2_dirent_name_bytes(r); -+ -+ return l_len - r_len ?: memcmp(l.v->d_name, r.v->d_name, l_len); -+} -+ -+const struct bch_hash_desc bch2_dirent_hash_desc = { -+ .btree_id = BTREE_ID_DIRENTS, -+ .key_type = KEY_TYPE_dirent, -+ .hash_key = dirent_hash_key, -+ .hash_bkey = dirent_hash_bkey, -+ .cmp_key = dirent_cmp_key, -+ .cmp_bkey = dirent_cmp_bkey, -+}; -+ -+const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); -+ unsigned len; -+ -+ if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent)) -+ return "value too small"; -+ -+ len = bch2_dirent_name_bytes(d); -+ if (!len) -+ return "empty name"; -+ -+ /* -+ * older versions of bcachefs were buggy and creating dirent -+ * keys that were bigger than necessary: -+ */ -+ if (bkey_val_u64s(k.k) > 
dirent_val_u64s(len + 7)) -+ return "value too big"; -+ -+ if (len > BCH_NAME_MAX) -+ return "dirent name too big"; -+ -+ return NULL; -+} -+ -+void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); -+ -+ bch_scnmemcpy(out, d.v->d_name, -+ bch2_dirent_name_bytes(d)); -+ pr_buf(out, " -> %llu type %u", d.v->d_inum, d.v->d_type); -+} -+ -+static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, -+ u8 type, const struct qstr *name, u64 dst) -+{ -+ struct bkey_i_dirent *dirent; -+ unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len); -+ -+ if (name->len > BCH_NAME_MAX) -+ return ERR_PTR(-ENAMETOOLONG); -+ -+ BUG_ON(u64s > U8_MAX); -+ -+ dirent = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); -+ if (IS_ERR(dirent)) -+ return dirent; -+ -+ bkey_dirent_init(&dirent->k_i); -+ dirent->k.u64s = u64s; -+ dirent->v.d_inum = cpu_to_le64(dst); -+ dirent->v.d_type = type; -+ -+ memcpy(dirent->v.d_name, name->name, name->len); -+ memset(dirent->v.d_name + name->len, 0, -+ bkey_val_bytes(&dirent->k) - -+ offsetof(struct bch_dirent, d_name) - -+ name->len); -+ -+ EBUG_ON(bch2_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len); -+ -+ return dirent; -+} -+ -+int bch2_dirent_create(struct btree_trans *trans, -+ u64 dir_inum, const struct bch_hash_info *hash_info, -+ u8 type, const struct qstr *name, u64 dst_inum, -+ int flags) -+{ -+ struct bkey_i_dirent *dirent; -+ int ret; -+ -+ dirent = dirent_create_key(trans, type, name, dst_inum); -+ ret = PTR_ERR_OR_ZERO(dirent); -+ if (ret) -+ return ret; -+ -+ return bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, -+ dir_inum, &dirent->k_i, flags); -+} -+ -+static void dirent_copy_target(struct bkey_i_dirent *dst, -+ struct bkey_s_c_dirent src) -+{ -+ dst->v.d_inum = src.v->d_inum; -+ dst->v.d_type = src.v->d_type; -+} -+ -+int bch2_dirent_rename(struct btree_trans *trans, -+ u64 src_dir, struct bch_hash_info *src_hash, -+ 
u64 dst_dir, struct bch_hash_info *dst_hash, -+ const struct qstr *src_name, u64 *src_inum, -+ const struct qstr *dst_name, u64 *dst_inum, -+ enum bch_rename_mode mode) -+{ -+ struct btree_iter *src_iter = NULL, *dst_iter = NULL; -+ struct bkey_s_c old_src, old_dst; -+ struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; -+ struct bpos dst_pos = -+ POS(dst_dir, bch2_dirent_hash(dst_hash, dst_name)); -+ int ret = 0; -+ -+ *src_inum = *dst_inum = 0; -+ -+ /* -+ * Lookup dst: -+ * -+ * Note that in BCH_RENAME mode, we're _not_ checking if -+ * the target already exists - we're relying on the VFS -+ * to do that check for us for correctness: -+ */ -+ dst_iter = mode == BCH_RENAME -+ ? bch2_hash_hole(trans, bch2_dirent_hash_desc, -+ dst_hash, dst_dir, dst_name) -+ : bch2_hash_lookup(trans, bch2_dirent_hash_desc, -+ dst_hash, dst_dir, dst_name, -+ BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(dst_iter); -+ if (ret) -+ goto out; -+ -+ old_dst = bch2_btree_iter_peek_slot(dst_iter); -+ -+ if (mode != BCH_RENAME) -+ *dst_inum = le64_to_cpu(bkey_s_c_to_dirent(old_dst).v->d_inum); -+ -+ /* Lookup src: */ -+ src_iter = bch2_hash_lookup(trans, bch2_dirent_hash_desc, -+ src_hash, src_dir, src_name, -+ BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(src_iter); -+ if (ret) -+ goto out; -+ -+ old_src = bch2_btree_iter_peek_slot(src_iter); -+ *src_inum = le64_to_cpu(bkey_s_c_to_dirent(old_src).v->d_inum); -+ -+ /* Create new dst key: */ -+ new_dst = dirent_create_key(trans, 0, dst_name, 0); -+ ret = PTR_ERR_OR_ZERO(new_dst); -+ if (ret) -+ goto out; -+ -+ dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); -+ new_dst->k.p = dst_iter->pos; -+ -+ /* Create new src key: */ -+ if (mode == BCH_RENAME_EXCHANGE) { -+ new_src = dirent_create_key(trans, 0, src_name, 0); -+ ret = PTR_ERR_OR_ZERO(new_src); -+ if (ret) -+ goto out; -+ -+ dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst)); -+ new_src->k.p = src_iter->pos; -+ } else { -+ new_src = bch2_trans_kmalloc(trans, sizeof(struct 
bkey_i)); -+ ret = PTR_ERR_OR_ZERO(new_src); -+ if (ret) -+ goto out; -+ -+ bkey_init(&new_src->k); -+ new_src->k.p = src_iter->pos; -+ -+ if (bkey_cmp(dst_pos, src_iter->pos) <= 0 && -+ bkey_cmp(src_iter->pos, dst_iter->pos) < 0) { -+ /* -+ * We have a hash collision for the new dst key, -+ * and new_src - the key we're deleting - is between -+ * new_dst's hashed slot and the slot we're going to be -+ * inserting it into - oops. This will break the hash -+ * table if we don't deal with it: -+ */ -+ if (mode == BCH_RENAME) { -+ /* -+ * If we're not overwriting, we can just insert -+ * new_dst at the src position: -+ */ -+ new_dst->k.p = src_iter->pos; -+ bch2_trans_update(trans, src_iter, -+ &new_dst->k_i, 0); -+ goto out; -+ } else { -+ /* If we're overwriting, we can't insert new_dst -+ * at a different slot because it has to -+ * overwrite old_dst - just make sure to use a -+ * whiteout when deleting src: -+ */ -+ new_src->k.type = KEY_TYPE_whiteout; -+ } -+ } else { -+ /* Check if we need a whiteout to delete src: */ -+ ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc, -+ src_hash, src_iter); -+ if (ret < 0) -+ goto out; -+ -+ if (ret) -+ new_src->k.type = KEY_TYPE_whiteout; -+ } -+ } -+ -+ bch2_trans_update(trans, src_iter, &new_src->k_i, 0); -+ bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0); -+out: -+ bch2_trans_iter_put(trans, src_iter); -+ bch2_trans_iter_put(trans, dst_iter); -+ return ret; -+} -+ -+int bch2_dirent_delete_at(struct btree_trans *trans, -+ const struct bch_hash_info *hash_info, -+ struct btree_iter *iter) -+{ -+ return bch2_hash_delete_at(trans, bch2_dirent_hash_desc, -+ hash_info, iter); -+} -+ -+struct btree_iter * -+__bch2_dirent_lookup_trans(struct btree_trans *trans, u64 dir_inum, -+ const struct bch_hash_info *hash_info, -+ const struct qstr *name, unsigned flags) -+{ -+ return bch2_hash_lookup(trans, bch2_dirent_hash_desc, -+ hash_info, dir_inum, name, flags); -+} -+ -+u64 bch2_dirent_lookup(struct bch_fs *c, u64 
dir_inum, -+ const struct bch_hash_info *hash_info, -+ const struct qstr *name) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ u64 inum = 0; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = __bch2_dirent_lookup_trans(&trans, dir_inum, -+ hash_info, name, 0); -+ if (IS_ERR(iter)) { -+ BUG_ON(PTR_ERR(iter) == -EINTR); -+ goto out; -+ } -+ -+ k = bch2_btree_iter_peek_slot(iter); -+ inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); -+out: -+ bch2_trans_exit(&trans); -+ return inum; -+} -+ -+int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) -+{ -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ for_each_btree_key(trans, iter, BTREE_ID_DIRENTS, -+ POS(dir_inum, 0), 0, k, ret) { -+ if (k.k->p.inode > dir_inum) -+ break; -+ -+ if (k.k->type == KEY_TYPE_dirent) { -+ ret = -ENOTEMPTY; -+ break; -+ } -+ } -+ bch2_trans_iter_put(trans, iter); -+ -+ return ret; -+} -+ -+int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct bkey_s_c_dirent dirent; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, -+ POS(inum, ctx->pos), 0, k, ret) { -+ if (k.k->p.inode > inum) -+ break; -+ -+ if (k.k->type != KEY_TYPE_dirent) -+ continue; -+ -+ dirent = bkey_s_c_to_dirent(k); -+ -+ /* -+ * XXX: dir_emit() can fault and block, while we're holding -+ * locks -+ */ -+ ctx->pos = dirent.k->p.offset; -+ if (!dir_emit(ctx, dirent.v->d_name, -+ bch2_dirent_name_bytes(dirent), -+ le64_to_cpu(dirent.v->d_inum), -+ dirent.v->d_type)) -+ break; -+ ctx->pos = dirent.k->p.offset + 1; -+ } -+ ret = bch2_trans_exit(&trans) ?: ret; -+ -+ return ret; -+} -diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h -new file mode 100644 -index 000000000000..34769371dd13 ---- /dev/null -+++ b/fs/bcachefs/dirent.h -@@ -0,0 +1,63 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ 
-+#ifndef _BCACHEFS_DIRENT_H -+#define _BCACHEFS_DIRENT_H -+ -+#include "str_hash.h" -+ -+extern const struct bch_hash_desc bch2_dirent_hash_desc; -+ -+const char *bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c); -+void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+ -+#define bch2_bkey_ops_dirent (struct bkey_ops) { \ -+ .key_invalid = bch2_dirent_invalid, \ -+ .val_to_text = bch2_dirent_to_text, \ -+} -+ -+struct qstr; -+struct file; -+struct dir_context; -+struct bch_fs; -+struct bch_hash_info; -+struct bch_inode_info; -+ -+unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent); -+ -+static inline unsigned dirent_val_u64s(unsigned len) -+{ -+ return DIV_ROUND_UP(offsetof(struct bch_dirent, d_name) + len, -+ sizeof(u64)); -+} -+ -+int bch2_dirent_create(struct btree_trans *, u64, -+ const struct bch_hash_info *, u8, -+ const struct qstr *, u64, int); -+ -+int bch2_dirent_delete_at(struct btree_trans *, -+ const struct bch_hash_info *, -+ struct btree_iter *); -+ -+enum bch_rename_mode { -+ BCH_RENAME, -+ BCH_RENAME_OVERWRITE, -+ BCH_RENAME_EXCHANGE, -+}; -+ -+int bch2_dirent_rename(struct btree_trans *, -+ u64, struct bch_hash_info *, -+ u64, struct bch_hash_info *, -+ const struct qstr *, u64 *, -+ const struct qstr *, u64 *, -+ enum bch_rename_mode); -+ -+struct btree_iter * -+__bch2_dirent_lookup_trans(struct btree_trans *, u64, -+ const struct bch_hash_info *, -+ const struct qstr *, unsigned); -+u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *, -+ const struct qstr *); -+ -+int bch2_empty_dir_trans(struct btree_trans *, u64); -+int bch2_readdir(struct bch_fs *, u64, struct dir_context *); -+ -+#endif /* _BCACHEFS_DIRENT_H */ -diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c -new file mode 100644 -index 000000000000..c52b6faac9b4 ---- /dev/null -+++ b/fs/bcachefs/disk_groups.c -@@ -0,0 +1,486 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include 
"disk_groups.h" -+#include "super-io.h" -+ -+#include -+ -+static int group_cmp(const void *_l, const void *_r) -+{ -+ const struct bch_disk_group *l = _l; -+ const struct bch_disk_group *r = _r; -+ -+ return ((BCH_GROUP_DELETED(l) > BCH_GROUP_DELETED(r)) - -+ (BCH_GROUP_DELETED(l) < BCH_GROUP_DELETED(r))) ?: -+ ((BCH_GROUP_PARENT(l) > BCH_GROUP_PARENT(r)) - -+ (BCH_GROUP_PARENT(l) < BCH_GROUP_PARENT(r))) ?: -+ strncmp(l->label, r->label, sizeof(l->label)); -+} -+ -+static const char *bch2_sb_disk_groups_validate(struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_disk_groups *groups = -+ field_to_type(f, disk_groups); -+ struct bch_disk_group *g, *sorted = NULL; -+ struct bch_sb_field_members *mi; -+ struct bch_member *m; -+ unsigned i, nr_groups, len; -+ const char *err = NULL; -+ -+ mi = bch2_sb_get_members(sb); -+ groups = bch2_sb_get_disk_groups(sb); -+ nr_groups = disk_groups_nr(groups); -+ -+ for (m = mi->members; -+ m < mi->members + sb->nr_devices; -+ m++) { -+ unsigned g; -+ -+ if (!BCH_MEMBER_GROUP(m)) -+ continue; -+ -+ g = BCH_MEMBER_GROUP(m) - 1; -+ -+ if (g >= nr_groups || -+ BCH_GROUP_DELETED(&groups->entries[g])) -+ return "disk has invalid group"; -+ } -+ -+ if (!nr_groups) -+ return NULL; -+ -+ for (g = groups->entries; -+ g < groups->entries + nr_groups; -+ g++) { -+ if (BCH_GROUP_DELETED(g)) -+ continue; -+ -+ len = strnlen(g->label, sizeof(g->label)); -+ if (!len) { -+ err = "group with empty label"; -+ goto err; -+ } -+ } -+ -+ sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL); -+ if (!sorted) -+ return "cannot allocate memory"; -+ -+ memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted)); -+ sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL); -+ -+ for (i = 0; i + 1 < nr_groups; i++) -+ if (!BCH_GROUP_DELETED(sorted + i) && -+ !group_cmp(sorted + i, sorted + i + 1)) { -+ err = "duplicate groups"; -+ goto err; -+ } -+ -+ err = NULL; -+err: -+ kfree(sorted); -+ return err; -+} -+ -+static void 
bch2_sb_disk_groups_to_text(struct printbuf *out, -+ struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_disk_groups *groups = -+ field_to_type(f, disk_groups); -+ struct bch_disk_group *g; -+ unsigned nr_groups = disk_groups_nr(groups); -+ -+ for (g = groups->entries; -+ g < groups->entries + nr_groups; -+ g++) { -+ if (g != groups->entries) -+ pr_buf(out, " "); -+ -+ if (BCH_GROUP_DELETED(g)) -+ pr_buf(out, "[deleted]"); -+ else -+ pr_buf(out, "[parent %llu name %s]", -+ BCH_GROUP_PARENT(g), g->label); -+ } -+} -+ -+const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = { -+ .validate = bch2_sb_disk_groups_validate, -+ .to_text = bch2_sb_disk_groups_to_text -+}; -+ -+int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) -+{ -+ struct bch_sb_field_members *mi; -+ struct bch_sb_field_disk_groups *groups; -+ struct bch_disk_groups_cpu *cpu_g, *old_g; -+ unsigned i, g, nr_groups; -+ -+ lockdep_assert_held(&c->sb_lock); -+ -+ mi = bch2_sb_get_members(c->disk_sb.sb); -+ groups = bch2_sb_get_disk_groups(c->disk_sb.sb); -+ nr_groups = disk_groups_nr(groups); -+ -+ if (!groups) -+ return 0; -+ -+ cpu_g = kzalloc(sizeof(*cpu_g) + -+ sizeof(cpu_g->entries[0]) * nr_groups, GFP_KERNEL); -+ if (!cpu_g) -+ return -ENOMEM; -+ -+ cpu_g->nr = nr_groups; -+ -+ for (i = 0; i < nr_groups; i++) { -+ struct bch_disk_group *src = &groups->entries[i]; -+ struct bch_disk_group_cpu *dst = &cpu_g->entries[i]; -+ -+ dst->deleted = BCH_GROUP_DELETED(src); -+ dst->parent = BCH_GROUP_PARENT(src); -+ } -+ -+ for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { -+ struct bch_member *m = mi->members + i; -+ struct bch_disk_group_cpu *dst = -+ &cpu_g->entries[BCH_MEMBER_GROUP(m)]; -+ -+ if (!bch2_member_exists(m)) -+ continue; -+ -+ g = BCH_MEMBER_GROUP(m); -+ while (g) { -+ dst = &cpu_g->entries[g - 1]; -+ __set_bit(i, dst->devs.d); -+ g = dst->parent; -+ } -+ } -+ -+ old_g = rcu_dereference_protected(c->disk_groups, -+ lockdep_is_held(&c->sb_lock)); -+ 
rcu_assign_pointer(c->disk_groups, cpu_g); -+ if (old_g) -+ kfree_rcu(old_g, rcu); -+ -+ return 0; -+} -+ -+const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target) -+{ -+ struct target t = target_decode(target); -+ -+ switch (t.type) { -+ case TARGET_NULL: -+ return NULL; -+ case TARGET_DEV: { -+ struct bch_dev *ca = t.dev < c->sb.nr_devices -+ ? rcu_dereference(c->devs[t.dev]) -+ : NULL; -+ return ca ? &ca->self : NULL; -+ } -+ case TARGET_GROUP: { -+ struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); -+ -+ return g && t.group < g->nr && !g->entries[t.group].deleted -+ ? &g->entries[t.group].devs -+ : NULL; -+ } -+ default: -+ BUG(); -+ } -+} -+ -+bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target) -+{ -+ struct target t = target_decode(target); -+ -+ switch (t.type) { -+ case TARGET_NULL: -+ return false; -+ case TARGET_DEV: -+ return dev == t.dev; -+ case TARGET_GROUP: { -+ struct bch_disk_groups_cpu *g; -+ const struct bch_devs_mask *m; -+ bool ret; -+ -+ rcu_read_lock(); -+ g = rcu_dereference(c->disk_groups); -+ m = g && t.group < g->nr && !g->entries[t.group].deleted -+ ? &g->entries[t.group].devs -+ : NULL; -+ -+ ret = m ? 
test_bit(dev, m->d) : false; -+ rcu_read_unlock(); -+ -+ return ret; -+ } -+ default: -+ BUG(); -+ } -+} -+ -+static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups, -+ unsigned parent, -+ const char *name, unsigned namelen) -+{ -+ unsigned i, nr_groups = disk_groups_nr(groups); -+ -+ if (!namelen || namelen > BCH_SB_LABEL_SIZE) -+ return -EINVAL; -+ -+ for (i = 0; i < nr_groups; i++) { -+ struct bch_disk_group *g = groups->entries + i; -+ -+ if (BCH_GROUP_DELETED(g)) -+ continue; -+ -+ if (!BCH_GROUP_DELETED(g) && -+ BCH_GROUP_PARENT(g) == parent && -+ strnlen(g->label, sizeof(g->label)) == namelen && -+ !memcmp(name, g->label, namelen)) -+ return i; -+ } -+ -+ return -1; -+} -+ -+static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent, -+ const char *name, unsigned namelen) -+{ -+ struct bch_sb_field_disk_groups *groups = -+ bch2_sb_get_disk_groups(sb->sb); -+ unsigned i, nr_groups = disk_groups_nr(groups); -+ struct bch_disk_group *g; -+ -+ if (!namelen || namelen > BCH_SB_LABEL_SIZE) -+ return -EINVAL; -+ -+ for (i = 0; -+ i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]); -+ i++) -+ ; -+ -+ if (i == nr_groups) { -+ unsigned u64s = -+ (sizeof(struct bch_sb_field_disk_groups) + -+ sizeof(struct bch_disk_group) * (nr_groups + 1)) / -+ sizeof(u64); -+ -+ groups = bch2_sb_resize_disk_groups(sb, u64s); -+ if (!groups) -+ return -ENOSPC; -+ -+ nr_groups = disk_groups_nr(groups); -+ } -+ -+ BUG_ON(i >= nr_groups); -+ -+ g = &groups->entries[i]; -+ -+ memcpy(g->label, name, namelen); -+ if (namelen < sizeof(g->label)) -+ g->label[namelen] = '\0'; -+ SET_BCH_GROUP_DELETED(g, 0); -+ SET_BCH_GROUP_PARENT(g, parent); -+ SET_BCH_GROUP_DATA_ALLOWED(g, ~0); -+ -+ return i; -+} -+ -+int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name) -+{ -+ struct bch_sb_field_disk_groups *groups = -+ bch2_sb_get_disk_groups(sb->sb); -+ int v = -1; -+ -+ do { -+ const char *next = strchrnul(name, '.'); -+ unsigned len = next - name; 
-+ -+ if (*next == '.') -+ next++; -+ -+ v = __bch2_disk_group_find(groups, v + 1, name, len); -+ name = next; -+ } while (*name && v >= 0); -+ -+ return v; -+} -+ -+int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name) -+{ -+ struct bch_sb_field_disk_groups *groups; -+ unsigned parent = 0; -+ int v = -1; -+ -+ do { -+ const char *next = strchrnul(name, '.'); -+ unsigned len = next - name; -+ -+ if (*next == '.') -+ next++; -+ -+ groups = bch2_sb_get_disk_groups(sb->sb); -+ -+ v = __bch2_disk_group_find(groups, parent, name, len); -+ if (v < 0) -+ v = __bch2_disk_group_add(sb, parent, name, len); -+ if (v < 0) -+ return v; -+ -+ parent = v + 1; -+ name = next; -+ } while (*name && v >= 0); -+ -+ return v; -+} -+ -+void bch2_disk_path_to_text(struct printbuf *out, -+ struct bch_sb_handle *sb, -+ unsigned v) -+{ -+ struct bch_sb_field_disk_groups *groups = -+ bch2_sb_get_disk_groups(sb->sb); -+ struct bch_disk_group *g; -+ unsigned nr = 0; -+ u16 path[32]; -+ -+ while (1) { -+ if (nr == ARRAY_SIZE(path)) -+ goto inval; -+ -+ if (v >= disk_groups_nr(groups)) -+ goto inval; -+ -+ g = groups->entries + v; -+ -+ if (BCH_GROUP_DELETED(g)) -+ goto inval; -+ -+ path[nr++] = v; -+ -+ if (!BCH_GROUP_PARENT(g)) -+ break; -+ -+ v = BCH_GROUP_PARENT(g) - 1; -+ } -+ -+ while (nr) { -+ v = path[--nr]; -+ g = groups->entries + v; -+ -+ bch_scnmemcpy(out, g->label, -+ strnlen(g->label, sizeof(g->label))); -+ -+ if (nr) -+ pr_buf(out, "."); -+ } -+ return; -+inval: -+ pr_buf(out, "invalid group %u", v); -+} -+ -+int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) -+{ -+ struct bch_member *mi; -+ int v = -1; -+ int ret = 0; -+ -+ mutex_lock(&c->sb_lock); -+ -+ if (!strlen(name) || !strcmp(name, "none")) -+ goto write_sb; -+ -+ v = bch2_disk_path_find_or_create(&c->disk_sb, name); -+ if (v < 0) { -+ mutex_unlock(&c->sb_lock); -+ return v; -+ } -+ -+ ret = bch2_sb_disk_groups_to_cpu(c); -+ if (ret) -+ goto unlock; -+write_sb: -+ mi = 
&bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; -+ SET_BCH_MEMBER_GROUP(mi, v + 1); -+ -+ bch2_write_super(c); -+unlock: -+ mutex_unlock(&c->sb_lock); -+ -+ return ret; -+} -+ -+int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v) -+{ -+ struct bch_dev *ca; -+ int g; -+ -+ if (!strlen(buf) || !strcmp(buf, "none")) { -+ *v = 0; -+ return 0; -+ } -+ -+ /* Is it a device? */ -+ ca = bch2_dev_lookup(c, buf); -+ if (!IS_ERR(ca)) { -+ *v = dev_to_target(ca->dev_idx); -+ percpu_ref_put(&ca->ref); -+ return 0; -+ } -+ -+ mutex_lock(&c->sb_lock); -+ g = bch2_disk_path_find(&c->disk_sb, buf); -+ mutex_unlock(&c->sb_lock); -+ -+ if (g >= 0) { -+ *v = group_to_target(g); -+ return 0; -+ } -+ -+ return -EINVAL; -+} -+ -+void bch2_opt_target_to_text(struct printbuf *out, struct bch_fs *c, u64 v) -+{ -+ struct target t = target_decode(v); -+ -+ switch (t.type) { -+ case TARGET_NULL: -+ pr_buf(out, "none"); -+ break; -+ case TARGET_DEV: { -+ struct bch_dev *ca; -+ -+ rcu_read_lock(); -+ ca = t.dev < c->sb.nr_devices -+ ? 
rcu_dereference(c->devs[t.dev]) -+ : NULL; -+ -+ if (ca && percpu_ref_tryget(&ca->io_ref)) { -+ char b[BDEVNAME_SIZE]; -+ -+ pr_buf(out, "/dev/%s", -+ bdevname(ca->disk_sb.bdev, b)); -+ percpu_ref_put(&ca->io_ref); -+ } else if (ca) { -+ pr_buf(out, "offline device %u", t.dev); -+ } else { -+ pr_buf(out, "invalid device %u", t.dev); -+ } -+ -+ rcu_read_unlock(); -+ break; -+ } -+ case TARGET_GROUP: -+ mutex_lock(&c->sb_lock); -+ bch2_disk_path_to_text(out, &c->disk_sb, t.group); -+ mutex_unlock(&c->sb_lock); -+ break; -+ default: -+ BUG(); -+ } -+} -diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h -new file mode 100644 -index 000000000000..3d84f23c34ed ---- /dev/null -+++ b/fs/bcachefs/disk_groups.h -@@ -0,0 +1,91 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_DISK_GROUPS_H -+#define _BCACHEFS_DISK_GROUPS_H -+ -+extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups; -+ -+static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups) -+{ -+ return groups -+ ? 
(vstruct_end(&groups->field) - -+ (void *) &groups->entries[0]) / sizeof(struct bch_disk_group) -+ : 0; -+} -+ -+struct target { -+ enum { -+ TARGET_NULL, -+ TARGET_DEV, -+ TARGET_GROUP, -+ } type; -+ union { -+ unsigned dev; -+ unsigned group; -+ }; -+}; -+ -+#define TARGET_DEV_START 1 -+#define TARGET_GROUP_START (256 + TARGET_DEV_START) -+ -+static inline u16 dev_to_target(unsigned dev) -+{ -+ return TARGET_DEV_START + dev; -+} -+ -+static inline u16 group_to_target(unsigned group) -+{ -+ return TARGET_GROUP_START + group; -+} -+ -+static inline struct target target_decode(unsigned target) -+{ -+ if (target >= TARGET_GROUP_START) -+ return (struct target) { -+ .type = TARGET_GROUP, -+ .group = target - TARGET_GROUP_START -+ }; -+ -+ if (target >= TARGET_DEV_START) -+ return (struct target) { -+ .type = TARGET_DEV, -+ .group = target - TARGET_DEV_START -+ }; -+ -+ return (struct target) { .type = TARGET_NULL }; -+} -+ -+const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned); -+ -+static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c, -+ enum bch_data_type data_type, -+ u16 target) -+{ -+ struct bch_devs_mask devs = c->rw_devs[data_type]; -+ const struct bch_devs_mask *t = bch2_target_to_mask(c, target); -+ -+ if (t) -+ bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX); -+ return devs; -+} -+ -+bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned); -+ -+int bch2_disk_path_find(struct bch_sb_handle *, const char *); -+ -+/* Exported for userspace bcachefs-tools: */ -+int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *); -+ -+void bch2_disk_path_to_text(struct printbuf *, struct bch_sb_handle *, -+ unsigned); -+ -+int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *); -+void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, u64); -+ -+int bch2_sb_disk_groups_to_cpu(struct bch_fs *); -+ -+int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *); -+ -+const char 
*bch2_sb_validate_disk_groups(struct bch_sb *, -+ struct bch_sb_field *); -+ -+#endif /* _BCACHEFS_DISK_GROUPS_H */ -diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c -new file mode 100644 -index 000000000000..5514f65378ad ---- /dev/null -+++ b/fs/bcachefs/ec.c -@@ -0,0 +1,1639 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+/* erasure coding */ -+ -+#include "bcachefs.h" -+#include "alloc_foreground.h" -+#include "bkey_on_stack.h" -+#include "bset.h" -+#include "btree_gc.h" -+#include "btree_update.h" -+#include "buckets.h" -+#include "disk_groups.h" -+#include "ec.h" -+#include "error.h" -+#include "io.h" -+#include "keylist.h" -+#include "recovery.h" -+#include "super-io.h" -+#include "util.h" -+ -+#include -+ -+#ifdef __KERNEL__ -+ -+#include -+#include -+ -+static void raid5_recov(unsigned disks, unsigned failed_idx, -+ size_t size, void **data) -+{ -+ unsigned i = 2, nr; -+ -+ BUG_ON(failed_idx >= disks); -+ -+ swap(data[0], data[failed_idx]); -+ memcpy(data[0], data[1], size); -+ -+ while (i < disks) { -+ nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS); -+ xor_blocks(nr, size, data[0], data + i); -+ i += nr; -+ } -+ -+ swap(data[0], data[failed_idx]); -+} -+ -+static void raid_gen(int nd, int np, size_t size, void **v) -+{ -+ if (np >= 1) -+ raid5_recov(nd + np, nd, size, v); -+ if (np >= 2) -+ raid6_call.gen_syndrome(nd + np, size, v); -+ BUG_ON(np > 2); -+} -+ -+static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v) -+{ -+ switch (nr) { -+ case 0: -+ break; -+ case 1: -+ if (ir[0] < nd + 1) -+ raid5_recov(nd + 1, ir[0], size, v); -+ else -+ raid6_call.gen_syndrome(nd + np, size, v); -+ break; -+ case 2: -+ if (ir[1] < nd) { -+ /* data+data failure. 
*/ -+ raid6_2data_recov(nd + np, size, ir[0], ir[1], v); -+ } else if (ir[0] < nd) { -+ /* data + p/q failure */ -+ -+ if (ir[1] == nd) /* data + p failure */ -+ raid6_datap_recov(nd + np, size, ir[0], v); -+ else { /* data + q failure */ -+ raid5_recov(nd + 1, ir[0], size, v); -+ raid6_call.gen_syndrome(nd + np, size, v); -+ } -+ } else { -+ raid_gen(nd, np, size, v); -+ } -+ break; -+ default: -+ BUG(); -+ } -+} -+ -+#else -+ -+#include -+ -+#endif -+ -+struct ec_bio { -+ struct bch_dev *ca; -+ struct ec_stripe_buf *buf; -+ size_t idx; -+ struct bio bio; -+}; -+ -+/* Stripes btree keys: */ -+ -+const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k) -+{ -+ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; -+ -+ if (k.k->p.inode) -+ return "invalid stripe key"; -+ -+ if (bkey_val_bytes(k.k) < sizeof(*s)) -+ return "incorrect value size"; -+ -+ if (bkey_val_bytes(k.k) < sizeof(*s) || -+ bkey_val_u64s(k.k) < stripe_val_u64s(s)) -+ return "incorrect value size"; -+ -+ return bch2_bkey_ptrs_invalid(c, k); -+} -+ -+void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; -+ unsigned i; -+ -+ pr_buf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u", -+ s->algorithm, -+ le16_to_cpu(s->sectors), -+ s->nr_blocks - s->nr_redundant, -+ s->nr_redundant, -+ s->csum_type, -+ 1U << s->csum_granularity_bits); -+ -+ for (i = 0; i < s->nr_blocks; i++) -+ pr_buf(out, " %u:%llu:%u", s->ptrs[i].dev, -+ (u64) s->ptrs[i].offset, -+ stripe_blockcount_get(s, i)); -+} -+ -+static int ptr_matches_stripe(struct bch_fs *c, -+ struct bch_stripe *v, -+ const struct bch_extent_ptr *ptr) -+{ -+ unsigned i; -+ -+ for (i = 0; i < v->nr_blocks - v->nr_redundant; i++) { -+ const struct bch_extent_ptr *ptr2 = v->ptrs + i; -+ -+ if (ptr->dev == ptr2->dev && -+ ptr->gen == ptr2->gen && -+ ptr->offset >= ptr2->offset && -+ ptr->offset < ptr2->offset + le16_to_cpu(v->sectors)) -+ return 
i; -+ } -+ -+ return -1; -+} -+ -+static int extent_matches_stripe(struct bch_fs *c, -+ struct bch_stripe *v, -+ struct bkey_s_c k) -+{ -+ -+ switch (k.k->type) { -+ case KEY_TYPE_extent: { -+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); -+ const struct bch_extent_ptr *ptr; -+ int idx; -+ -+ extent_for_each_ptr(e, ptr) { -+ idx = ptr_matches_stripe(c, v, ptr); -+ if (idx >= 0) -+ return idx; -+ } -+ break; -+ } -+ } -+ -+ return -1; -+} -+ -+static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) -+{ -+ switch (k.k->type) { -+ case KEY_TYPE_extent: { -+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); -+ const union bch_extent_entry *entry; -+ -+ extent_for_each_entry(e, entry) -+ if (extent_entry_type(entry) == -+ BCH_EXTENT_ENTRY_stripe_ptr && -+ entry->stripe_ptr.idx == idx) -+ return true; -+ -+ break; -+ } -+ } -+ -+ return false; -+} -+ -+/* Checksumming: */ -+ -+static void ec_generate_checksums(struct ec_stripe_buf *buf) -+{ -+ struct bch_stripe *v = &buf->key.v; -+ unsigned csum_granularity = 1 << v->csum_granularity_bits; -+ unsigned csums_per_device = stripe_csums_per_device(v); -+ unsigned csum_bytes = bch_crc_bytes[v->csum_type]; -+ unsigned i, j; -+ -+ if (!csum_bytes) -+ return; -+ -+ BUG_ON(buf->offset); -+ BUG_ON(buf->size != le16_to_cpu(v->sectors)); -+ -+ for (i = 0; i < v->nr_blocks; i++) { -+ for (j = 0; j < csums_per_device; j++) { -+ unsigned offset = j << v->csum_granularity_bits; -+ unsigned len = min(csum_granularity, buf->size - offset); -+ -+ struct bch_csum csum = -+ bch2_checksum(NULL, v->csum_type, -+ null_nonce(), -+ buf->data[i] + (offset << 9), -+ len << 9); -+ -+ memcpy(stripe_csum(v, i, j), &csum, csum_bytes); -+ } -+ } -+} -+ -+static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) -+{ -+ struct bch_stripe *v = &buf->key.v; -+ unsigned csum_granularity = 1 << v->csum_granularity_bits; -+ unsigned csum_bytes = bch_crc_bytes[v->csum_type]; -+ unsigned i; -+ -+ if (!csum_bytes) -+ return; -+ -+ 
for (i = 0; i < v->nr_blocks; i++) { -+ unsigned offset = buf->offset; -+ unsigned end = buf->offset + buf->size; -+ -+ if (!test_bit(i, buf->valid)) -+ continue; -+ -+ while (offset < end) { -+ unsigned j = offset >> v->csum_granularity_bits; -+ unsigned len = min(csum_granularity, end - offset); -+ struct bch_csum csum; -+ -+ BUG_ON(offset & (csum_granularity - 1)); -+ BUG_ON(offset + len != le16_to_cpu(v->sectors) && -+ ((offset + len) & (csum_granularity - 1))); -+ -+ csum = bch2_checksum(NULL, v->csum_type, -+ null_nonce(), -+ buf->data[i] + ((offset - buf->offset) << 9), -+ len << 9); -+ -+ if (memcmp(stripe_csum(v, i, j), &csum, csum_bytes)) { -+ __bcache_io_error(c, -+ "checksum error while doing reconstruct read (%u:%u)", -+ i, j); -+ clear_bit(i, buf->valid); -+ break; -+ } -+ -+ offset += len; -+ } -+ } -+} -+ -+/* Erasure coding: */ -+ -+static void ec_generate_ec(struct ec_stripe_buf *buf) -+{ -+ struct bch_stripe *v = &buf->key.v; -+ unsigned nr_data = v->nr_blocks - v->nr_redundant; -+ unsigned bytes = le16_to_cpu(v->sectors) << 9; -+ -+ raid_gen(nr_data, v->nr_redundant, bytes, buf->data); -+} -+ -+static unsigned __ec_nr_failed(struct ec_stripe_buf *buf, unsigned nr) -+{ -+ return nr - bitmap_weight(buf->valid, nr); -+} -+ -+static unsigned ec_nr_failed(struct ec_stripe_buf *buf) -+{ -+ return __ec_nr_failed(buf, buf->key.v.nr_blocks); -+} -+ -+static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) -+{ -+ struct bch_stripe *v = &buf->key.v; -+ unsigned i, failed[EC_STRIPE_MAX], nr_failed = 0; -+ unsigned nr_data = v->nr_blocks - v->nr_redundant; -+ unsigned bytes = buf->size << 9; -+ -+ if (ec_nr_failed(buf) > v->nr_redundant) { -+ __bcache_io_error(c, -+ "error doing reconstruct read: unable to read enough blocks"); -+ return -1; -+ } -+ -+ for (i = 0; i < nr_data; i++) -+ if (!test_bit(i, buf->valid)) -+ failed[nr_failed++] = i; -+ -+ raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data); -+ return 0; -+} -+ -+/* 
IO: */ -+ -+static void ec_block_endio(struct bio *bio) -+{ -+ struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio); -+ struct bch_dev *ca = ec_bio->ca; -+ struct closure *cl = bio->bi_private; -+ -+ if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s: %s", -+ bio_data_dir(bio) ? "write" : "read", -+ bch2_blk_status_to_str(bio->bi_status))) -+ clear_bit(ec_bio->idx, ec_bio->buf->valid); -+ -+ bio_put(&ec_bio->bio); -+ percpu_ref_put(&ca->io_ref); -+ closure_put(cl); -+} -+ -+static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, -+ unsigned rw, unsigned idx, struct closure *cl) -+{ -+ struct bch_stripe *v = &buf->key.v; -+ unsigned offset = 0, bytes = buf->size << 9; -+ struct bch_extent_ptr *ptr = &v->ptrs[idx]; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ -+ if (!bch2_dev_get_ioref(ca, rw)) { -+ clear_bit(idx, buf->valid); -+ return; -+ } -+ -+ while (offset < bytes) { -+ unsigned nr_iovecs = min_t(size_t, BIO_MAX_PAGES, -+ DIV_ROUND_UP(bytes, PAGE_SIZE)); -+ unsigned b = min_t(size_t, bytes - offset, -+ nr_iovecs << PAGE_SHIFT); -+ struct ec_bio *ec_bio; -+ -+ ec_bio = container_of(bio_alloc_bioset(GFP_KERNEL, nr_iovecs, -+ &c->ec_bioset), -+ struct ec_bio, bio); -+ -+ ec_bio->ca = ca; -+ ec_bio->buf = buf; -+ ec_bio->idx = idx; -+ -+ bio_set_dev(&ec_bio->bio, ca->disk_sb.bdev); -+ bio_set_op_attrs(&ec_bio->bio, rw, 0); -+ -+ ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9); -+ ec_bio->bio.bi_end_io = ec_block_endio; -+ ec_bio->bio.bi_private = cl; -+ -+ bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b); -+ -+ closure_get(cl); -+ percpu_ref_get(&ca->io_ref); -+ -+ submit_bio(&ec_bio->bio); -+ -+ offset += b; -+ } -+ -+ percpu_ref_put(&ca->io_ref); -+} -+ -+/* recovery read path: */ -+int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct ec_stripe_buf *buf; -+ struct closure cl; -+ struct bkey_s_c k; 
-+ struct bch_stripe *v; -+ unsigned stripe_idx; -+ unsigned offset, end; -+ unsigned i, nr_data, csum_granularity; -+ int ret = 0, idx; -+ -+ closure_init_stack(&cl); -+ -+ BUG_ON(!rbio->pick.has_ec); -+ -+ stripe_idx = rbio->pick.ec.idx; -+ -+ buf = kzalloc(sizeof(*buf), GFP_NOIO); -+ if (!buf) -+ return -ENOMEM; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, -+ POS(0, stripe_idx), -+ BTREE_ITER_SLOTS); -+ k = bch2_btree_iter_peek_slot(iter); -+ if (bkey_err(k) || k.k->type != KEY_TYPE_stripe) { -+ __bcache_io_error(c, -+ "error doing reconstruct read: stripe not found"); -+ kfree(buf); -+ return bch2_trans_exit(&trans) ?: -EIO; -+ } -+ -+ bkey_reassemble(&buf->key.k_i, k); -+ bch2_trans_exit(&trans); -+ -+ v = &buf->key.v; -+ -+ nr_data = v->nr_blocks - v->nr_redundant; -+ -+ idx = ptr_matches_stripe(c, v, &rbio->pick.ptr); -+ BUG_ON(idx < 0); -+ -+ csum_granularity = 1U << v->csum_granularity_bits; -+ -+ offset = rbio->bio.bi_iter.bi_sector - v->ptrs[idx].offset; -+ end = offset + bio_sectors(&rbio->bio); -+ -+ BUG_ON(end > le16_to_cpu(v->sectors)); -+ -+ buf->offset = round_down(offset, csum_granularity); -+ buf->size = min_t(unsigned, le16_to_cpu(v->sectors), -+ round_up(end, csum_granularity)) - buf->offset; -+ -+ for (i = 0; i < v->nr_blocks; i++) { -+ buf->data[i] = kmalloc(buf->size << 9, GFP_NOIO); -+ if (!buf->data[i]) { -+ ret = -ENOMEM; -+ goto err; -+ } -+ } -+ -+ memset(buf->valid, 0xFF, sizeof(buf->valid)); -+ -+ for (i = 0; i < v->nr_blocks; i++) { -+ struct bch_extent_ptr *ptr = v->ptrs + i; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ -+ if (ptr_stale(ca, ptr)) { -+ __bcache_io_error(c, -+ "error doing reconstruct read: stale pointer"); -+ clear_bit(i, buf->valid); -+ continue; -+ } -+ -+ ec_block_io(c, buf, REQ_OP_READ, i, &cl); -+ } -+ -+ closure_sync(&cl); -+ -+ if (ec_nr_failed(buf) > v->nr_redundant) { -+ __bcache_io_error(c, -+ "error doing reconstruct read: unable to read 
enough blocks"); -+ ret = -EIO; -+ goto err; -+ } -+ -+ ec_validate_checksums(c, buf); -+ -+ ret = ec_do_recov(c, buf); -+ if (ret) -+ goto err; -+ -+ memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter, -+ buf->data[idx] + ((offset - buf->offset) << 9)); -+err: -+ for (i = 0; i < v->nr_blocks; i++) -+ kfree(buf->data[i]); -+ kfree(buf); -+ return ret; -+} -+ -+/* stripe bucket accounting: */ -+ -+static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) -+{ -+ ec_stripes_heap n, *h = &c->ec_stripes_heap; -+ -+ if (idx >= h->size) { -+ if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp)) -+ return -ENOMEM; -+ -+ spin_lock(&c->ec_stripes_heap_lock); -+ if (n.size > h->size) { -+ memcpy(n.data, h->data, h->used * sizeof(h->data[0])); -+ n.used = h->used; -+ swap(*h, n); -+ } -+ spin_unlock(&c->ec_stripes_heap_lock); -+ -+ free_heap(&n); -+ } -+ -+ if (!genradix_ptr_alloc(&c->stripes[0], idx, gfp)) -+ return -ENOMEM; -+ -+ if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING && -+ !genradix_ptr_alloc(&c->stripes[1], idx, gfp)) -+ return -ENOMEM; -+ -+ return 0; -+} -+ -+static int ec_stripe_mem_alloc(struct bch_fs *c, -+ struct btree_iter *iter) -+{ -+ size_t idx = iter->pos.offset; -+ int ret = 0; -+ -+ if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT|__GFP_NOWARN)) -+ return ret; -+ -+ bch2_trans_unlock(iter->trans); -+ ret = -EINTR; -+ -+ if (!__ec_stripe_mem_alloc(c, idx, GFP_KERNEL)) -+ return ret; -+ -+ return -ENOMEM; -+} -+ -+static ssize_t stripe_idx_to_delete(struct bch_fs *c) -+{ -+ ec_stripes_heap *h = &c->ec_stripes_heap; -+ -+ return h->used && h->data[0].blocks_nonempty == 0 -+ ? 
h->data[0].idx : -1; -+} -+ -+static inline int ec_stripes_heap_cmp(ec_stripes_heap *h, -+ struct ec_stripe_heap_entry l, -+ struct ec_stripe_heap_entry r) -+{ -+ return ((l.blocks_nonempty > r.blocks_nonempty) - -+ (l.blocks_nonempty < r.blocks_nonempty)); -+} -+ -+static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h, -+ size_t i) -+{ -+ struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap); -+ -+ genradix_ptr(&c->stripes[0], h->data[i].idx)->heap_idx = i; -+} -+ -+static void heap_verify_backpointer(struct bch_fs *c, size_t idx) -+{ -+ ec_stripes_heap *h = &c->ec_stripes_heap; -+ struct stripe *m = genradix_ptr(&c->stripes[0], idx); -+ -+ BUG_ON(!m->alive); -+ BUG_ON(m->heap_idx >= h->used); -+ BUG_ON(h->data[m->heap_idx].idx != idx); -+} -+ -+void bch2_stripes_heap_del(struct bch_fs *c, -+ struct stripe *m, size_t idx) -+{ -+ if (!m->on_heap) -+ return; -+ -+ m->on_heap = false; -+ -+ heap_verify_backpointer(c, idx); -+ -+ heap_del(&c->ec_stripes_heap, m->heap_idx, -+ ec_stripes_heap_cmp, -+ ec_stripes_heap_set_backpointer); -+} -+ -+void bch2_stripes_heap_insert(struct bch_fs *c, -+ struct stripe *m, size_t idx) -+{ -+ if (m->on_heap) -+ return; -+ -+ BUG_ON(heap_full(&c->ec_stripes_heap)); -+ -+ m->on_heap = true; -+ -+ heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) { -+ .idx = idx, -+ .blocks_nonempty = m->blocks_nonempty, -+ }), -+ ec_stripes_heap_cmp, -+ ec_stripes_heap_set_backpointer); -+ -+ heap_verify_backpointer(c, idx); -+} -+ -+void bch2_stripes_heap_update(struct bch_fs *c, -+ struct stripe *m, size_t idx) -+{ -+ ec_stripes_heap *h = &c->ec_stripes_heap; -+ size_t i; -+ -+ if (!m->on_heap) -+ return; -+ -+ heap_verify_backpointer(c, idx); -+ -+ h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty; -+ -+ i = m->heap_idx; -+ heap_sift_up(h, i, ec_stripes_heap_cmp, -+ ec_stripes_heap_set_backpointer); -+ heap_sift_down(h, i, ec_stripes_heap_cmp, -+ ec_stripes_heap_set_backpointer); -+ -+ 
heap_verify_backpointer(c, idx); -+ -+ if (stripe_idx_to_delete(c) >= 0 && -+ !percpu_ref_is_dying(&c->writes)) -+ schedule_work(&c->ec_stripe_delete_work); -+} -+ -+/* stripe deletion */ -+ -+static int ec_stripe_delete(struct bch_fs *c, size_t idx) -+{ -+ //pr_info("deleting stripe %zu", idx); -+ return bch2_btree_delete_range(c, BTREE_ID_EC, -+ POS(0, idx), -+ POS(0, idx + 1), -+ NULL); -+} -+ -+static void ec_stripe_delete_work(struct work_struct *work) -+{ -+ struct bch_fs *c = -+ container_of(work, struct bch_fs, ec_stripe_delete_work); -+ ssize_t idx; -+ -+ while (1) { -+ spin_lock(&c->ec_stripes_heap_lock); -+ idx = stripe_idx_to_delete(c); -+ if (idx < 0) { -+ spin_unlock(&c->ec_stripes_heap_lock); -+ break; -+ } -+ -+ bch2_stripes_heap_del(c, genradix_ptr(&c->stripes[0], idx), idx); -+ spin_unlock(&c->ec_stripes_heap_lock); -+ -+ if (ec_stripe_delete(c, idx)) -+ break; -+ } -+} -+ -+/* stripe creation: */ -+ -+static int ec_stripe_bkey_insert(struct bch_fs *c, -+ struct bkey_i_stripe *stripe) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct bpos start_pos = POS(0, c->ec_stripe_hint); -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_EC, start_pos, -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { -+ if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) { -+ if (start_pos.offset) { -+ start_pos = POS_MIN; -+ bch2_btree_iter_set_pos(iter, start_pos); -+ continue; -+ } -+ -+ ret = -ENOSPC; -+ break; -+ } -+ -+ if (bkey_deleted(k.k)) -+ goto found_slot; -+ } -+ -+ goto err; -+found_slot: -+ start_pos = iter->pos; -+ -+ ret = ec_stripe_mem_alloc(c, iter); -+ if (ret) -+ goto err; -+ -+ stripe->k.p = iter->pos; -+ -+ bch2_trans_update(&trans, iter, &stripe->k_i, 0); -+ -+ ret = bch2_trans_commit(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL); -+err: -+ bch2_trans_iter_put(&trans, iter); -+ -+ if (ret == -EINTR) -+ goto retry; -+ -+ 
c->ec_stripe_hint = ret ? start_pos.offset : start_pos.offset + 1; -+ bch2_trans_exit(&trans); -+ -+ return ret; -+} -+ -+static void extent_stripe_ptr_add(struct bkey_s_extent e, -+ struct ec_stripe_buf *s, -+ struct bch_extent_ptr *ptr, -+ unsigned block) -+{ -+ struct bch_extent_stripe_ptr *dst = (void *) ptr; -+ union bch_extent_entry *end = extent_entry_last(e); -+ -+ memmove_u64s_up(dst + 1, dst, (u64 *) end - (u64 *) dst); -+ e.k->u64s += sizeof(*dst) / sizeof(u64); -+ -+ *dst = (struct bch_extent_stripe_ptr) { -+ .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr, -+ .block = block, -+ .idx = s->key.k.p.offset, -+ }; -+} -+ -+static int ec_stripe_update_ptrs(struct bch_fs *c, -+ struct ec_stripe_buf *s, -+ struct bkey *pos) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct bkey_s_extent e; -+ struct bkey_on_stack sk; -+ int ret = 0, dev, idx; -+ -+ bkey_on_stack_init(&sk); -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ /* XXX this doesn't support the reflink btree */ -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, -+ bkey_start_pos(pos), -+ BTREE_ITER_INTENT); -+ -+ while ((k = bch2_btree_iter_peek(iter)).k && -+ !(ret = bkey_err(k)) && -+ bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) { -+ struct bch_extent_ptr *ptr, *ec_ptr = NULL; -+ -+ if (extent_has_stripe_ptr(k, s->key.k.p.offset)) { -+ bch2_btree_iter_next(iter); -+ continue; -+ } -+ -+ idx = extent_matches_stripe(c, &s->key.v, k); -+ if (idx < 0) { -+ bch2_btree_iter_next(iter); -+ continue; -+ } -+ -+ dev = s->key.v.ptrs[idx].dev; -+ -+ bkey_on_stack_reassemble(&sk, c, k); -+ e = bkey_i_to_s_extent(sk.k); -+ -+ bch2_bkey_drop_ptrs(e.s, ptr, ptr->dev != dev); -+ ec_ptr = (void *) bch2_bkey_has_device(e.s_c, dev); -+ BUG_ON(!ec_ptr); -+ -+ extent_stripe_ptr_add(e, s, ec_ptr, idx); -+ -+ bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); -+ bch2_trans_update(&trans, iter, sk.k, 0); -+ -+ ret = bch2_trans_commit(&trans, NULL, NULL, -+ 
BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_USE_RESERVE); -+ if (ret == -EINTR) -+ ret = 0; -+ if (ret) -+ break; -+ } -+ -+ bch2_trans_exit(&trans); -+ bkey_on_stack_exit(&sk, c); -+ -+ return ret; -+} -+ -+/* -+ * data buckets of new stripe all written: create the stripe -+ */ -+static void ec_stripe_create(struct ec_stripe_new *s) -+{ -+ struct bch_fs *c = s->c; -+ struct open_bucket *ob; -+ struct bkey_i *k; -+ struct stripe *m; -+ struct bch_stripe *v = &s->stripe.key.v; -+ unsigned i, nr_data = v->nr_blocks - v->nr_redundant; -+ struct closure cl; -+ int ret; -+ -+ BUG_ON(s->h->s == s); -+ -+ closure_init_stack(&cl); -+ -+ if (s->err) { -+ if (s->err != -EROFS) -+ bch_err(c, "error creating stripe: error writing data buckets"); -+ goto err; -+ } -+ -+ BUG_ON(!s->allocated); -+ -+ if (!percpu_ref_tryget(&c->writes)) -+ goto err; -+ -+ BUG_ON(bitmap_weight(s->blocks_allocated, -+ s->blocks.nr) != s->blocks.nr); -+ -+ ec_generate_ec(&s->stripe); -+ -+ ec_generate_checksums(&s->stripe); -+ -+ /* write p/q: */ -+ for (i = nr_data; i < v->nr_blocks; i++) -+ ec_block_io(c, &s->stripe, REQ_OP_WRITE, i, &cl); -+ -+ closure_sync(&cl); -+ -+ for (i = nr_data; i < v->nr_blocks; i++) -+ if (!test_bit(i, s->stripe.valid)) { -+ bch_err(c, "error creating stripe: error writing redundancy buckets"); -+ goto err_put_writes; -+ } -+ -+ ret = s->existing_stripe -+ ? bch2_btree_insert(c, BTREE_ID_EC, &s->stripe.key.k_i, -+ NULL, NULL, BTREE_INSERT_NOFAIL) -+ : ec_stripe_bkey_insert(c, &s->stripe.key); -+ if (ret) { -+ bch_err(c, "error creating stripe: error creating stripe key"); -+ goto err_put_writes; -+ } -+ -+ for_each_keylist_key(&s->keys, k) { -+ ret = ec_stripe_update_ptrs(c, &s->stripe, &k->k); -+ if (ret) { -+ bch_err(c, "error creating stripe: error updating pointers"); -+ break; -+ } -+ } -+ -+ spin_lock(&c->ec_stripes_heap_lock); -+ m = genradix_ptr(&c->stripes[0], s->stripe.key.k.p.offset); -+#if 0 -+ pr_info("created a %s stripe %llu", -+ s->existing_stripe ? 
"existing" : "new", -+ s->stripe.key.k.p.offset); -+#endif -+ BUG_ON(m->on_heap); -+ bch2_stripes_heap_insert(c, m, s->stripe.key.k.p.offset); -+ spin_unlock(&c->ec_stripes_heap_lock); -+err_put_writes: -+ percpu_ref_put(&c->writes); -+err: -+ open_bucket_for_each(c, &s->blocks, ob, i) { -+ ob->ec = NULL; -+ __bch2_open_bucket_put(c, ob); -+ } -+ -+ bch2_open_buckets_put(c, &s->parity); -+ -+ bch2_keylist_free(&s->keys, s->inline_keys); -+ -+ for (i = 0; i < s->stripe.key.v.nr_blocks; i++) -+ kvpfree(s->stripe.data[i], s->stripe.size << 9); -+ kfree(s); -+} -+ -+static void ec_stripe_create_work(struct work_struct *work) -+{ -+ struct bch_fs *c = container_of(work, -+ struct bch_fs, ec_stripe_create_work); -+ struct ec_stripe_new *s, *n; -+restart: -+ mutex_lock(&c->ec_stripe_new_lock); -+ list_for_each_entry_safe(s, n, &c->ec_stripe_new_list, list) -+ if (!atomic_read(&s->pin)) { -+ list_del(&s->list); -+ mutex_unlock(&c->ec_stripe_new_lock); -+ ec_stripe_create(s); -+ goto restart; -+ } -+ mutex_unlock(&c->ec_stripe_new_lock); -+} -+ -+static void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s) -+{ -+ BUG_ON(atomic_read(&s->pin) <= 0); -+ -+ if (atomic_dec_and_test(&s->pin)) { -+ BUG_ON(!s->pending); -+ queue_work(system_long_wq, &c->ec_stripe_create_work); -+ } -+} -+ -+static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h) -+{ -+ struct ec_stripe_new *s = h->s; -+ -+ BUG_ON(!s->allocated && !s->err); -+ -+ h->s = NULL; -+ s->pending = true; -+ -+ mutex_lock(&c->ec_stripe_new_lock); -+ list_add(&s->list, &c->ec_stripe_new_list); -+ mutex_unlock(&c->ec_stripe_new_lock); -+ -+ ec_stripe_new_put(c, s); -+} -+ -+/* have a full bucket - hand it off to be erasure coded: */ -+void bch2_ec_bucket_written(struct bch_fs *c, struct open_bucket *ob) -+{ -+ struct ec_stripe_new *s = ob->ec; -+ -+ if (ob->sectors_free) -+ s->err = -1; -+ -+ ec_stripe_new_put(c, s); -+} -+ -+void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket 
*ob) -+{ -+ struct ec_stripe_new *s = ob->ec; -+ -+ s->err = -EIO; -+} -+ -+void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) -+{ -+ struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); -+ struct bch_dev *ca; -+ unsigned offset; -+ -+ if (!ob) -+ return NULL; -+ -+ ca = bch_dev_bkey_exists(c, ob->ptr.dev); -+ offset = ca->mi.bucket_size - ob->sectors_free; -+ -+ return ob->ec->stripe.data[ob->ec_idx] + (offset << 9); -+} -+ -+void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp, -+ struct bpos pos, unsigned sectors) -+{ -+ struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); -+ struct ec_stripe_new *ec; -+ -+ if (!ob) -+ return; -+ -+ //pr_info("adding backpointer at %llu:%llu", pos.inode, pos.offset); -+ -+ ec = ob->ec; -+ mutex_lock(&ec->lock); -+ -+ if (bch2_keylist_realloc(&ec->keys, ec->inline_keys, -+ ARRAY_SIZE(ec->inline_keys), -+ BKEY_U64s)) { -+ BUG(); -+ } -+ -+ bkey_init(&ec->keys.top->k); -+ ec->keys.top->k.p = pos; -+ bch2_key_resize(&ec->keys.top->k, sectors); -+ bch2_keylist_push(&ec->keys); -+ -+ mutex_unlock(&ec->lock); -+} -+ -+static int unsigned_cmp(const void *_l, const void *_r) -+{ -+ unsigned l = *((const unsigned *) _l); -+ unsigned r = *((const unsigned *) _r); -+ -+ return cmp_int(l, r); -+} -+ -+/* pick most common bucket size: */ -+static unsigned pick_blocksize(struct bch_fs *c, -+ struct bch_devs_mask *devs) -+{ -+ struct bch_dev *ca; -+ unsigned i, nr = 0, sizes[BCH_SB_MEMBERS_MAX]; -+ struct { -+ unsigned nr, size; -+ } cur = { 0, 0 }, best = { 0, 0 }; -+ -+ for_each_member_device_rcu(ca, c, i, devs) -+ sizes[nr++] = ca->mi.bucket_size; -+ -+ sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL); -+ -+ for (i = 0; i < nr; i++) { -+ if (sizes[i] != cur.size) { -+ if (cur.nr > best.nr) -+ best = cur; -+ -+ cur.nr = 0; -+ cur.size = sizes[i]; -+ } -+ -+ cur.nr++; -+ } -+ -+ if (cur.nr > best.nr) -+ best = cur; -+ -+ return best.size; -+} -+ -+static bool may_create_new_stripe(struct bch_fs *c) 
-+{ -+ return false; -+} -+ -+static void ec_stripe_key_init(struct bch_fs *c, -+ struct bkey_i_stripe *s, -+ unsigned nr_data, -+ unsigned nr_parity, -+ unsigned stripe_size) -+{ -+ unsigned u64s; -+ -+ bkey_stripe_init(&s->k_i); -+ s->v.sectors = cpu_to_le16(stripe_size); -+ s->v.algorithm = 0; -+ s->v.nr_blocks = nr_data + nr_parity; -+ s->v.nr_redundant = nr_parity; -+ s->v.csum_granularity_bits = ilog2(c->sb.encoded_extent_max); -+ s->v.csum_type = BCH_CSUM_CRC32C; -+ s->v.pad = 0; -+ -+ while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) { -+ BUG_ON(1 << s->v.csum_granularity_bits >= -+ le16_to_cpu(s->v.sectors) || -+ s->v.csum_granularity_bits == U8_MAX); -+ s->v.csum_granularity_bits++; -+ } -+ -+ set_bkey_val_u64s(&s->k, u64s); -+} -+ -+static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) -+{ -+ struct ec_stripe_new *s; -+ unsigned i; -+ -+ lockdep_assert_held(&h->lock); -+ -+ s = kzalloc(sizeof(*s), GFP_KERNEL); -+ if (!s) -+ return -ENOMEM; -+ -+ mutex_init(&s->lock); -+ atomic_set(&s->pin, 1); -+ s->c = c; -+ s->h = h; -+ s->nr_data = min_t(unsigned, h->nr_active_devs, -+ EC_STRIPE_MAX) - h->redundancy; -+ s->nr_parity = h->redundancy; -+ -+ bch2_keylist_init(&s->keys, s->inline_keys); -+ -+ s->stripe.offset = 0; -+ s->stripe.size = h->blocksize; -+ memset(s->stripe.valid, 0xFF, sizeof(s->stripe.valid)); -+ -+ ec_stripe_key_init(c, &s->stripe.key, s->nr_data, -+ s->nr_parity, h->blocksize); -+ -+ for (i = 0; i < s->stripe.key.v.nr_blocks; i++) { -+ s->stripe.data[i] = kvpmalloc(s->stripe.size << 9, GFP_KERNEL); -+ if (!s->stripe.data[i]) -+ goto err; -+ } -+ -+ h->s = s; -+ -+ return 0; -+err: -+ for (i = 0; i < s->stripe.key.v.nr_blocks; i++) -+ kvpfree(s->stripe.data[i], s->stripe.size << 9); -+ kfree(s); -+ return -ENOMEM; -+} -+ -+static struct ec_stripe_head * -+ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, -+ unsigned algo, unsigned redundancy) -+{ -+ struct ec_stripe_head *h; -+ struct bch_dev *ca; -+ 
unsigned i; -+ -+ h = kzalloc(sizeof(*h), GFP_KERNEL); -+ if (!h) -+ return NULL; -+ -+ mutex_init(&h->lock); -+ mutex_lock(&h->lock); -+ -+ h->target = target; -+ h->algo = algo; -+ h->redundancy = redundancy; -+ -+ rcu_read_lock(); -+ h->devs = target_rw_devs(c, BCH_DATA_user, target); -+ -+ for_each_member_device_rcu(ca, c, i, &h->devs) -+ if (!ca->mi.durability) -+ __clear_bit(i, h->devs.d); -+ -+ h->blocksize = pick_blocksize(c, &h->devs); -+ -+ for_each_member_device_rcu(ca, c, i, &h->devs) -+ if (ca->mi.bucket_size == h->blocksize) -+ h->nr_active_devs++; -+ -+ rcu_read_unlock(); -+ list_add(&h->list, &c->ec_stripe_head_list); -+ return h; -+} -+ -+void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h) -+{ -+ if (h->s && -+ h->s->allocated && -+ bitmap_weight(h->s->blocks_allocated, -+ h->s->blocks.nr) == h->s->blocks.nr) -+ ec_stripe_set_pending(c, h); -+ -+ mutex_unlock(&h->lock); -+} -+ -+struct ec_stripe_head *__bch2_ec_stripe_head_get(struct bch_fs *c, -+ unsigned target, -+ unsigned algo, -+ unsigned redundancy) -+{ -+ struct ec_stripe_head *h; -+ -+ if (!redundancy) -+ return NULL; -+ -+ mutex_lock(&c->ec_stripe_head_lock); -+ list_for_each_entry(h, &c->ec_stripe_head_list, list) -+ if (h->target == target && -+ h->algo == algo && -+ h->redundancy == redundancy) { -+ mutex_lock(&h->lock); -+ goto found; -+ } -+ -+ h = ec_new_stripe_head_alloc(c, target, algo, redundancy); -+found: -+ mutex_unlock(&c->ec_stripe_head_lock); -+ return h; -+} -+ -+/* -+ * XXX: use a higher watermark for allocating open buckets here: -+ */ -+static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h) -+{ -+ struct bch_devs_mask devs; -+ struct open_bucket *ob; -+ unsigned i, nr_have, nr_data = -+ min_t(unsigned, h->nr_active_devs, -+ EC_STRIPE_MAX) - h->redundancy; -+ bool have_cache = true; -+ int ret = 0; -+ -+ devs = h->devs; -+ -+ for_each_set_bit(i, h->s->blocks_allocated, EC_STRIPE_MAX) { -+ 
__clear_bit(h->s->stripe.key.v.ptrs[i].dev, devs.d); -+ --nr_data; -+ } -+ -+ BUG_ON(h->s->blocks.nr > nr_data); -+ BUG_ON(h->s->parity.nr > h->redundancy); -+ -+ open_bucket_for_each(c, &h->s->parity, ob, i) -+ __clear_bit(ob->ptr.dev, devs.d); -+ open_bucket_for_each(c, &h->s->blocks, ob, i) -+ __clear_bit(ob->ptr.dev, devs.d); -+ -+ percpu_down_read(&c->mark_lock); -+ rcu_read_lock(); -+ -+ if (h->s->parity.nr < h->redundancy) { -+ nr_have = h->s->parity.nr; -+ -+ ret = bch2_bucket_alloc_set(c, &h->s->parity, -+ &h->parity_stripe, -+ &devs, -+ h->redundancy, -+ &nr_have, -+ &have_cache, -+ RESERVE_NONE, -+ 0, -+ NULL); -+ if (ret) -+ goto err; -+ } -+ -+ if (h->s->blocks.nr < nr_data) { -+ nr_have = h->s->blocks.nr; -+ -+ ret = bch2_bucket_alloc_set(c, &h->s->blocks, -+ &h->block_stripe, -+ &devs, -+ nr_data, -+ &nr_have, -+ &have_cache, -+ RESERVE_NONE, -+ 0, -+ NULL); -+ if (ret) -+ goto err; -+ } -+err: -+ rcu_read_unlock(); -+ percpu_up_read(&c->mark_lock); -+ return ret; -+} -+ -+/* XXX: doesn't obey target: */ -+static s64 get_existing_stripe(struct bch_fs *c, -+ unsigned target, -+ unsigned algo, -+ unsigned redundancy) -+{ -+ ec_stripes_heap *h = &c->ec_stripes_heap; -+ struct stripe *m; -+ size_t heap_idx; -+ u64 stripe_idx; -+ -+ if (may_create_new_stripe(c)) -+ return -1; -+ -+ spin_lock(&c->ec_stripes_heap_lock); -+ for (heap_idx = 0; heap_idx < h->used; heap_idx++) { -+ if (!h->data[heap_idx].blocks_nonempty) -+ continue; -+ -+ stripe_idx = h->data[heap_idx].idx; -+ m = genradix_ptr(&c->stripes[0], stripe_idx); -+ -+ if (m->algorithm == algo && -+ m->nr_redundant == redundancy && -+ m->blocks_nonempty < m->nr_blocks - m->nr_redundant) { -+ bch2_stripes_heap_del(c, m, stripe_idx); -+ spin_unlock(&c->ec_stripes_heap_lock); -+ return stripe_idx; -+ } -+ } -+ -+ spin_unlock(&c->ec_stripes_heap_lock); -+ return -1; -+} -+ -+static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe) -+{ -+ struct btree_trans trans; -+ struct 
btree_iter *iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, idx), BTREE_ITER_SLOTS); -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (!ret) -+ bkey_reassemble(&stripe->key.k_i, k); -+ bch2_trans_exit(&trans); -+ -+ return ret; -+} -+ -+struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, -+ unsigned target, -+ unsigned algo, -+ unsigned redundancy) -+{ -+ struct closure cl; -+ struct ec_stripe_head *h; -+ struct open_bucket *ob; -+ unsigned i, data_idx = 0; -+ s64 idx; -+ -+ closure_init_stack(&cl); -+ -+ h = __bch2_ec_stripe_head_get(c, target, algo, redundancy); -+ if (!h) -+ return NULL; -+ -+ if (!h->s && ec_new_stripe_alloc(c, h)) { -+ bch2_ec_stripe_head_put(c, h); -+ return NULL; -+ } -+ -+ if (!h->s->allocated) { -+ if (!h->s->existing_stripe && -+ (idx = get_existing_stripe(c, target, algo, redundancy)) >= 0) { -+ //pr_info("got existing stripe %llu", idx); -+ -+ h->s->existing_stripe = true; -+ h->s->existing_stripe_idx = idx; -+ if (get_stripe_key(c, idx, &h->s->stripe)) { -+ /* btree error */ -+ BUG(); -+ } -+ -+ for (i = 0; i < h->s->stripe.key.v.nr_blocks; i++) -+ if (stripe_blockcount_get(&h->s->stripe.key.v, i)) { -+ __set_bit(i, h->s->blocks_allocated); -+ ec_block_io(c, &h->s->stripe, READ, i, &cl); -+ } -+ } -+ -+ if (new_stripe_alloc_buckets(c, h)) { -+ bch2_ec_stripe_head_put(c, h); -+ h = NULL; -+ goto out; -+ } -+ -+ open_bucket_for_each(c, &h->s->blocks, ob, i) { -+ data_idx = find_next_zero_bit(h->s->blocks_allocated, -+ h->s->nr_data, data_idx); -+ BUG_ON(data_idx >= h->s->nr_data); -+ -+ h->s->stripe.key.v.ptrs[data_idx] = ob->ptr; -+ h->s->data_block_idx[i] = data_idx; -+ data_idx++; -+ } -+ -+ open_bucket_for_each(c, &h->s->parity, ob, i) -+ h->s->stripe.key.v.ptrs[h->s->nr_data + i] = ob->ptr; -+ -+ //pr_info("new stripe, blocks_allocated %lx", h->s->blocks_allocated[0]); -+ h->s->allocated = true; -+ } -+out: -+ 
closure_sync(&cl); -+ return h; -+} -+ -+void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct ec_stripe_head *h; -+ struct open_bucket *ob; -+ unsigned i; -+ -+ mutex_lock(&c->ec_stripe_head_lock); -+ list_for_each_entry(h, &c->ec_stripe_head_list, list) { -+ -+ mutex_lock(&h->lock); -+ if (!h->s) -+ goto unlock; -+ -+ open_bucket_for_each(c, &h->s->blocks, ob, i) -+ if (ob->ptr.dev == ca->dev_idx) -+ goto found; -+ open_bucket_for_each(c, &h->s->parity, ob, i) -+ if (ob->ptr.dev == ca->dev_idx) -+ goto found; -+ goto unlock; -+found: -+ h->s->err = -EROFS; -+ ec_stripe_set_pending(c, h); -+unlock: -+ mutex_unlock(&h->lock); -+ } -+ mutex_unlock(&c->ec_stripe_head_lock); -+} -+ -+static int __bch2_stripe_write_key(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct stripe *m, -+ size_t idx, -+ struct bkey_i_stripe *new_key) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_s_c k; -+ unsigned i; -+ int ret; -+ -+ bch2_btree_iter_set_pos(iter, POS(0, idx)); -+ -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) -+ return ret; -+ -+ if (k.k->type != KEY_TYPE_stripe) -+ return -EIO; -+ -+ bkey_reassemble(&new_key->k_i, k); -+ -+ spin_lock(&c->ec_stripes_heap_lock); -+ -+ for (i = 0; i < new_key->v.nr_blocks; i++) -+ stripe_blockcount_set(&new_key->v, i, -+ m->block_sectors[i]); -+ m->dirty = false; -+ -+ spin_unlock(&c->ec_stripes_heap_lock); -+ -+ bch2_trans_update(trans, iter, &new_key->k_i, 0); -+ return 0; -+} -+ -+int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct genradix_iter giter; -+ struct bkey_i_stripe *new_key; -+ struct stripe *m; -+ int ret = 0; -+ -+ new_key = kmalloc(255 * sizeof(u64), GFP_KERNEL); -+ BUG_ON(!new_key); -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN, -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); -+ -+ genradix_for_each(&c->stripes[0], giter, m) 
{ -+ if (!m->dirty) -+ continue; -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL|flags, -+ __bch2_stripe_write_key(&trans, iter, m, -+ giter.pos, new_key)); -+ -+ if (ret) -+ break; -+ -+ *wrote = true; -+ } -+ -+ bch2_trans_exit(&trans); -+ -+ kfree(new_key); -+ -+ return ret; -+} -+ -+static int bch2_stripes_read_fn(struct bch_fs *c, enum btree_id id, -+ unsigned level, struct bkey_s_c k) -+{ -+ int ret = 0; -+ -+ if (k.k->type == KEY_TYPE_stripe) { -+ struct stripe *m; -+ -+ ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?: -+ bch2_mark_key(c, k, 0, 0, NULL, 0, -+ BTREE_TRIGGER_ALLOC_READ| -+ BTREE_TRIGGER_NOATOMIC); -+ if (ret) -+ return ret; -+ -+ spin_lock(&c->ec_stripes_heap_lock); -+ m = genradix_ptr(&c->stripes[0], k.k->p.offset); -+ bch2_stripes_heap_insert(c, m, k.k->p.offset); -+ spin_unlock(&c->ec_stripes_heap_lock); -+ } -+ -+ return ret; -+} -+ -+int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) -+{ -+ int ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_EC, -+ NULL, bch2_stripes_read_fn); -+ if (ret) -+ bch_err(c, "error reading stripes: %i", ret); -+ -+ return ret; -+} -+ -+int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ size_t i, idx = 0; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, U64_MAX), 0); -+ -+ k = bch2_btree_iter_prev(iter); -+ if (!IS_ERR_OR_NULL(k.k)) -+ idx = k.k->p.offset + 1; -+ ret = bch2_trans_exit(&trans); -+ if (ret) -+ return ret; -+ -+ if (!idx) -+ return 0; -+ -+ if (!gc && -+ !init_heap(&c->ec_stripes_heap, roundup_pow_of_two(idx), -+ GFP_KERNEL)) -+ return -ENOMEM; -+#if 0 -+ ret = genradix_prealloc(&c->stripes[gc], idx, GFP_KERNEL); -+#else -+ for (i = 0; i < idx; i++) -+ if (!genradix_ptr_alloc(&c->stripes[gc], i, GFP_KERNEL)) -+ return -ENOMEM; -+#endif -+ return 0; -+} -+ -+void 
bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+ ec_stripes_heap *h = &c->ec_stripes_heap; -+ struct stripe *m; -+ size_t i; -+ -+ spin_lock(&c->ec_stripes_heap_lock); -+ for (i = 0; i < min(h->used, 20UL); i++) { -+ m = genradix_ptr(&c->stripes[0], h->data[i].idx); -+ -+ pr_buf(out, "%zu %u/%u+%u\n", h->data[i].idx, -+ h->data[i].blocks_nonempty, -+ m->nr_blocks - m->nr_redundant, -+ m->nr_redundant); -+ } -+ spin_unlock(&c->ec_stripes_heap_lock); -+} -+ -+void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+ struct ec_stripe_head *h; -+ struct ec_stripe_new *s; -+ -+ mutex_lock(&c->ec_stripe_head_lock); -+ list_for_each_entry(h, &c->ec_stripe_head_list, list) { -+ pr_buf(out, "target %u algo %u redundancy %u:\n", -+ h->target, h->algo, h->redundancy); -+ -+ if (h->s) -+ pr_buf(out, "\tpending: blocks %u allocated %u\n", -+ h->s->blocks.nr, -+ bitmap_weight(h->s->blocks_allocated, -+ h->s->blocks.nr)); -+ } -+ mutex_unlock(&c->ec_stripe_head_lock); -+ -+ mutex_lock(&c->ec_stripe_new_lock); -+ list_for_each_entry(s, &c->ec_stripe_new_list, list) { -+ pr_buf(out, "\tin flight: blocks %u allocated %u pin %u\n", -+ s->blocks.nr, -+ bitmap_weight(s->blocks_allocated, -+ s->blocks.nr), -+ atomic_read(&s->pin)); -+ } -+ mutex_unlock(&c->ec_stripe_new_lock); -+} -+ -+void bch2_fs_ec_exit(struct bch_fs *c) -+{ -+ struct ec_stripe_head *h; -+ -+ while (1) { -+ mutex_lock(&c->ec_stripe_head_lock); -+ h = list_first_entry_or_null(&c->ec_stripe_head_list, -+ struct ec_stripe_head, list); -+ if (h) -+ list_del(&h->list); -+ mutex_unlock(&c->ec_stripe_head_lock); -+ if (!h) -+ break; -+ -+ BUG_ON(h->s); -+ kfree(h); -+ } -+ -+ BUG_ON(!list_empty(&c->ec_stripe_new_list)); -+ -+ free_heap(&c->ec_stripes_heap); -+ genradix_free(&c->stripes[0]); -+ bioset_exit(&c->ec_bioset); -+} -+ -+int bch2_fs_ec_init(struct bch_fs *c) -+{ -+ INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work); -+ INIT_WORK(&c->ec_stripe_delete_work, 
ec_stripe_delete_work); -+ -+ return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio), -+ BIOSET_NEED_BVECS); -+} -diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h -new file mode 100644 -index 000000000000..f8fc3d616cd7 ---- /dev/null -+++ b/fs/bcachefs/ec.h -@@ -0,0 +1,169 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_EC_H -+#define _BCACHEFS_EC_H -+ -+#include "ec_types.h" -+#include "keylist_types.h" -+ -+const char *bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c); -+void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, -+ struct bkey_s_c); -+ -+#define bch2_bkey_ops_stripe (struct bkey_ops) { \ -+ .key_invalid = bch2_stripe_invalid, \ -+ .val_to_text = bch2_stripe_to_text, \ -+ .swab = bch2_ptr_swab, \ -+} -+ -+static inline unsigned stripe_csums_per_device(const struct bch_stripe *s) -+{ -+ return DIV_ROUND_UP(le16_to_cpu(s->sectors), -+ 1 << s->csum_granularity_bits); -+} -+ -+static inline unsigned stripe_csum_offset(const struct bch_stripe *s, -+ unsigned dev, unsigned csum_idx) -+{ -+ unsigned csum_bytes = bch_crc_bytes[s->csum_type]; -+ -+ return sizeof(struct bch_stripe) + -+ sizeof(struct bch_extent_ptr) * s->nr_blocks + -+ (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes; -+} -+ -+static inline unsigned stripe_blockcount_offset(const struct bch_stripe *s, -+ unsigned idx) -+{ -+ return stripe_csum_offset(s, s->nr_blocks, 0) + -+ sizeof(u16) * idx; -+} -+ -+static inline unsigned stripe_blockcount_get(const struct bch_stripe *s, -+ unsigned idx) -+{ -+ return le16_to_cpup((void *) s + stripe_blockcount_offset(s, idx)); -+} -+ -+static inline void stripe_blockcount_set(struct bch_stripe *s, -+ unsigned idx, unsigned v) -+{ -+ __le16 *p = (void *) s + stripe_blockcount_offset(s, idx); -+ -+ *p = cpu_to_le16(v); -+} -+ -+static inline unsigned stripe_val_u64s(const struct bch_stripe *s) -+{ -+ return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks), -+ sizeof(u64)); -+} -+ -+static inline void 
*stripe_csum(struct bch_stripe *s, -+ unsigned dev, unsigned csum_idx) -+{ -+ return (void *) s + stripe_csum_offset(s, dev, csum_idx); -+} -+ -+struct bch_read_bio; -+ -+struct ec_stripe_buf { -+ /* might not be buffering the entire stripe: */ -+ unsigned offset; -+ unsigned size; -+ unsigned long valid[BITS_TO_LONGS(EC_STRIPE_MAX)]; -+ -+ void *data[EC_STRIPE_MAX]; -+ -+ union { -+ struct bkey_i_stripe key; -+ u64 pad[255]; -+ }; -+}; -+ -+struct ec_stripe_head; -+ -+struct ec_stripe_new { -+ struct bch_fs *c; -+ struct ec_stripe_head *h; -+ struct mutex lock; -+ struct list_head list; -+ -+ /* counts in flight writes, stripe is created when pin == 0 */ -+ atomic_t pin; -+ -+ int err; -+ -+ u8 nr_data; -+ u8 nr_parity; -+ bool allocated; -+ bool pending; -+ bool existing_stripe; -+ u64 existing_stripe_idx; -+ -+ unsigned long blocks_allocated[BITS_TO_LONGS(EC_STRIPE_MAX)]; -+ -+ struct open_buckets blocks; -+ u8 data_block_idx[EC_STRIPE_MAX]; -+ struct open_buckets parity; -+ -+ struct keylist keys; -+ u64 inline_keys[BKEY_U64s * 8]; -+ -+ struct ec_stripe_buf stripe; -+}; -+ -+struct ec_stripe_head { -+ struct list_head list; -+ struct mutex lock; -+ -+ unsigned target; -+ unsigned algo; -+ unsigned redundancy; -+ -+ struct bch_devs_mask devs; -+ unsigned nr_active_devs; -+ -+ unsigned blocksize; -+ -+ struct dev_stripe_state block_stripe; -+ struct dev_stripe_state parity_stripe; -+ -+ struct ec_stripe_new *s; -+}; -+ -+int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *); -+ -+void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); -+void bch2_ec_add_backpointer(struct bch_fs *, struct write_point *, -+ struct bpos, unsigned); -+ -+void bch2_ec_bucket_written(struct bch_fs *, struct open_bucket *); -+void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); -+ -+int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); -+ -+void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *); -+struct 
ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, unsigned, -+ unsigned, unsigned); -+ -+void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t); -+void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t); -+void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t); -+ -+void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); -+ -+void bch2_ec_flush_new_stripes(struct bch_fs *); -+ -+struct journal_keys; -+int bch2_stripes_read(struct bch_fs *, struct journal_keys *); -+int bch2_stripes_write(struct bch_fs *, unsigned, bool *); -+ -+int bch2_ec_mem_alloc(struct bch_fs *, bool); -+ -+void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *); -+void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *); -+ -+void bch2_fs_ec_exit(struct bch_fs *); -+int bch2_fs_ec_init(struct bch_fs *); -+ -+#endif /* _BCACHEFS_EC_H */ -diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h -new file mode 100644 -index 000000000000..e4d633fca5bf ---- /dev/null -+++ b/fs/bcachefs/ec_types.h -@@ -0,0 +1,39 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_EC_TYPES_H -+#define _BCACHEFS_EC_TYPES_H -+ -+#include -+ -+#define EC_STRIPE_MAX 16 -+ -+struct bch_replicas_padded { -+ struct bch_replicas_entry e; -+ u8 pad[EC_STRIPE_MAX]; -+}; -+ -+struct stripe { -+ size_t heap_idx; -+ -+ u16 sectors; -+ u8 algorithm; -+ -+ u8 nr_blocks; -+ u8 nr_redundant; -+ -+ unsigned alive:1; -+ unsigned dirty:1; -+ unsigned on_heap:1; -+ u8 blocks_nonempty; -+ u16 block_sectors[EC_STRIPE_MAX]; -+ -+ struct bch_replicas_padded r; -+}; -+ -+struct ec_stripe_heap_entry { -+ size_t idx; -+ unsigned blocks_nonempty; -+}; -+ -+typedef HEAP(struct ec_stripe_heap_entry) ec_stripes_heap; -+ -+#endif /* _BCACHEFS_EC_TYPES_H */ -diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c -new file mode 100644 -index 000000000000..cd46706fb6f5 ---- /dev/null -+++ b/fs/bcachefs/error.c -@@ -0,0 +1,172 @@ -+// SPDX-License-Identifier: 
GPL-2.0 -+#include "bcachefs.h" -+#include "error.h" -+#include "io.h" -+#include "super.h" -+ -+#define FSCK_ERR_RATELIMIT_NR 10 -+ -+bool bch2_inconsistent_error(struct bch_fs *c) -+{ -+ set_bit(BCH_FS_ERROR, &c->flags); -+ -+ switch (c->opts.errors) { -+ case BCH_ON_ERROR_CONTINUE: -+ return false; -+ case BCH_ON_ERROR_RO: -+ if (bch2_fs_emergency_read_only(c)) -+ bch_err(c, "emergency read only"); -+ return true; -+ case BCH_ON_ERROR_PANIC: -+ panic(bch2_fmt(c, "panic after error")); -+ return true; -+ default: -+ BUG(); -+ } -+} -+ -+void bch2_fatal_error(struct bch_fs *c) -+{ -+ if (bch2_fs_emergency_read_only(c)) -+ bch_err(c, "emergency read only"); -+} -+ -+void bch2_io_error_work(struct work_struct *work) -+{ -+ struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work); -+ struct bch_fs *c = ca->fs; -+ bool dev; -+ -+ down_write(&c->state_lock); -+ dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_RO, -+ BCH_FORCE_IF_DEGRADED); -+ if (dev -+ ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_RO, -+ BCH_FORCE_IF_DEGRADED) -+ : bch2_fs_emergency_read_only(c)) -+ bch_err(ca, -+ "too many IO errors, setting %s RO", -+ dev ? "device" : "filesystem"); -+ up_write(&c->state_lock); -+} -+ -+void bch2_io_error(struct bch_dev *ca) -+{ -+ //queue_work(system_long_wq, &ca->io_error_work); -+} -+ -+#ifdef __KERNEL__ -+#define ask_yn() false -+#else -+#include "tools-util.h" -+#endif -+ -+enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags, -+ const char *fmt, ...) -+{ -+ struct fsck_err_state *s = NULL; -+ va_list args; -+ bool fix = false, print = true, suppressing = false; -+ char _buf[sizeof(s->buf)], *buf = _buf; -+ -+ if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) { -+ va_start(args, fmt); -+ vprintk(fmt, args); -+ va_end(args); -+ -+ return bch2_inconsistent_error(c) -+ ? 
FSCK_ERR_EXIT -+ : FSCK_ERR_FIX; -+ } -+ -+ mutex_lock(&c->fsck_error_lock); -+ -+ list_for_each_entry(s, &c->fsck_errors, list) -+ if (s->fmt == fmt) -+ goto found; -+ -+ s = kzalloc(sizeof(*s), GFP_NOFS); -+ if (!s) { -+ if (!c->fsck_alloc_err) -+ bch_err(c, "kmalloc err, cannot ratelimit fsck errs"); -+ c->fsck_alloc_err = true; -+ buf = _buf; -+ goto print; -+ } -+ -+ INIT_LIST_HEAD(&s->list); -+ s->fmt = fmt; -+found: -+ list_move(&s->list, &c->fsck_errors); -+ s->nr++; -+ if (c->opts.ratelimit_errors && -+ s->nr >= FSCK_ERR_RATELIMIT_NR) { -+ if (s->nr == FSCK_ERR_RATELIMIT_NR) -+ suppressing = true; -+ else -+ print = false; -+ } -+ buf = s->buf; -+print: -+ va_start(args, fmt); -+ vscnprintf(buf, sizeof(_buf), fmt, args); -+ va_end(args); -+ -+ if (c->opts.fix_errors == FSCK_OPT_EXIT) { -+ bch_err(c, "%s, exiting", buf); -+ } else if (flags & FSCK_CAN_FIX) { -+ if (c->opts.fix_errors == FSCK_OPT_ASK) { -+ printk(KERN_ERR "%s: fix?", buf); -+ fix = ask_yn(); -+ } else if (c->opts.fix_errors == FSCK_OPT_YES || -+ (c->opts.nochanges && -+ !(flags & FSCK_CAN_IGNORE))) { -+ if (print) -+ bch_err(c, "%s, fixing", buf); -+ fix = true; -+ } else { -+ if (print) -+ bch_err(c, "%s, not fixing", buf); -+ fix = false; -+ } -+ } else if (flags & FSCK_NEED_FSCK) { -+ if (print) -+ bch_err(c, "%s (run fsck to correct)", buf); -+ } else { -+ if (print) -+ bch_err(c, "%s (repair unimplemented)", buf); -+ } -+ -+ if (suppressing) -+ bch_err(c, "Ratelimiting new instances of previous error"); -+ -+ mutex_unlock(&c->fsck_error_lock); -+ -+ if (fix) { -+ set_bit(BCH_FS_ERRORS_FIXED, &c->flags); -+ return FSCK_ERR_FIX; -+ } else { -+ set_bit(BCH_FS_ERROR, &c->flags); -+ return c->opts.fix_errors == FSCK_OPT_EXIT || -+ !(flags & FSCK_CAN_IGNORE) -+ ? 
FSCK_ERR_EXIT -+ : FSCK_ERR_IGNORE; -+ } -+} -+ -+void bch2_flush_fsck_errs(struct bch_fs *c) -+{ -+ struct fsck_err_state *s, *n; -+ -+ mutex_lock(&c->fsck_error_lock); -+ -+ list_for_each_entry_safe(s, n, &c->fsck_errors, list) { -+ if (s->ratelimited) -+ bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->buf); -+ -+ list_del(&s->list); -+ kfree(s); -+ } -+ -+ mutex_unlock(&c->fsck_error_lock); -+} -diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h -new file mode 100644 -index 000000000000..94b53312fbbd ---- /dev/null -+++ b/fs/bcachefs/error.h -@@ -0,0 +1,211 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_ERROR_H -+#define _BCACHEFS_ERROR_H -+ -+#include -+#include -+ -+struct bch_dev; -+struct bch_fs; -+struct work_struct; -+ -+/* -+ * XXX: separate out errors that indicate on disk data is inconsistent, and flag -+ * superblock as such -+ */ -+ -+/* Error messages: */ -+ -+/* -+ * Inconsistency errors: The on disk data is inconsistent. If these occur during -+ * initial recovery, they don't indicate a bug in the running code - we walk all -+ * the metadata before modifying anything. If they occur at runtime, they -+ * indicate either a bug in the running code or (less likely) data is being -+ * silently corrupted under us. -+ * -+ * XXX: audit all inconsistent errors and make sure they're all recoverable, in -+ * BCH_ON_ERROR_CONTINUE mode -+ */ -+ -+bool bch2_inconsistent_error(struct bch_fs *); -+ -+#define bch2_fs_inconsistent(c, ...) \ -+({ \ -+ bch_err(c, __VA_ARGS__); \ -+ bch2_inconsistent_error(c); \ -+}) -+ -+#define bch2_fs_inconsistent_on(cond, c, ...) \ -+({ \ -+ int _ret = !!(cond); \ -+ \ -+ if (_ret) \ -+ bch2_fs_inconsistent(c, __VA_ARGS__); \ -+ _ret; \ -+}) -+ -+/* -+ * Later we might want to mark only the particular device inconsistent, not the -+ * entire filesystem: -+ */ -+ -+#define bch2_dev_inconsistent(ca, ...) 
\ -+do { \ -+ bch_err(ca, __VA_ARGS__); \ -+ bch2_inconsistent_error((ca)->fs); \ -+} while (0) -+ -+#define bch2_dev_inconsistent_on(cond, ca, ...) \ -+({ \ -+ int _ret = !!(cond); \ -+ \ -+ if (_ret) \ -+ bch2_dev_inconsistent(ca, __VA_ARGS__); \ -+ _ret; \ -+}) -+ -+/* -+ * Fsck errors: inconsistency errors we detect at mount time, and should ideally -+ * be able to repair: -+ */ -+ -+enum { -+ BCH_FSCK_OK = 0, -+ BCH_FSCK_ERRORS_NOT_FIXED = 1, -+ BCH_FSCK_REPAIR_UNIMPLEMENTED = 2, -+ BCH_FSCK_REPAIR_IMPOSSIBLE = 3, -+ BCH_FSCK_UNKNOWN_VERSION = 4, -+}; -+ -+enum fsck_err_opts { -+ FSCK_OPT_EXIT, -+ FSCK_OPT_YES, -+ FSCK_OPT_NO, -+ FSCK_OPT_ASK, -+}; -+ -+enum fsck_err_ret { -+ FSCK_ERR_IGNORE = 0, -+ FSCK_ERR_FIX = 1, -+ FSCK_ERR_EXIT = 2, -+}; -+ -+struct fsck_err_state { -+ struct list_head list; -+ const char *fmt; -+ u64 nr; -+ bool ratelimited; -+ char buf[512]; -+}; -+ -+#define FSCK_CAN_FIX (1 << 0) -+#define FSCK_CAN_IGNORE (1 << 1) -+#define FSCK_NEED_FSCK (1 << 2) -+ -+__printf(3, 4) __cold -+enum fsck_err_ret bch2_fsck_err(struct bch_fs *, -+ unsigned, const char *, ...); -+void bch2_flush_fsck_errs(struct bch_fs *); -+ -+#define __fsck_err(c, _flags, msg, ...) \ -+({ \ -+ int _fix = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__);\ -+ \ -+ if (_fix == FSCK_ERR_EXIT) { \ -+ bch_err(c, "Unable to continue, halting"); \ -+ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ -+ goto fsck_err; \ -+ } \ -+ \ -+ _fix; \ -+}) -+ -+/* These macros return true if error should be fixed: */ -+ -+/* XXX: mark in superblock that filesystem contains errors, if we ignore: */ -+ -+#define __fsck_err_on(cond, c, _flags, ...) \ -+ ((cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false) -+ -+#define need_fsck_err_on(cond, c, ...) \ -+ __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) -+ -+#define need_fsck_err(c, ...) \ -+ __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) -+ -+#define mustfix_fsck_err(c, ...) 
\ -+ __fsck_err(c, FSCK_CAN_FIX, ##__VA_ARGS__) -+ -+#define mustfix_fsck_err_on(cond, c, ...) \ -+ __fsck_err_on(cond, c, FSCK_CAN_FIX, ##__VA_ARGS__) -+ -+#define fsck_err(c, ...) \ -+ __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__) -+ -+#define fsck_err_on(cond, c, ...) \ -+ __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__) -+ -+/* -+ * Fatal errors: these don't indicate a bug, but we can't continue running in RW -+ * mode - pretty much just due to metadata IO errors: -+ */ -+ -+void bch2_fatal_error(struct bch_fs *); -+ -+#define bch2_fs_fatal_error(c, ...) \ -+do { \ -+ bch_err(c, __VA_ARGS__); \ -+ bch2_fatal_error(c); \ -+} while (0) -+ -+#define bch2_fs_fatal_err_on(cond, c, ...) \ -+({ \ -+ int _ret = !!(cond); \ -+ \ -+ if (_ret) \ -+ bch2_fs_fatal_error(c, __VA_ARGS__); \ -+ _ret; \ -+}) -+ -+/* -+ * IO errors: either recoverable metadata IO (because we have replicas), or data -+ * IO - we need to log it and print out a message, but we don't (necessarily) -+ * want to shut down the fs: -+ */ -+ -+void bch2_io_error_work(struct work_struct *); -+ -+/* Does the error handling without logging a message */ -+void bch2_io_error(struct bch_dev *); -+ -+/* Logs message and handles the error: */ -+#define bch2_dev_io_error(ca, fmt, ...) \ -+do { \ -+ printk_ratelimited(KERN_ERR bch2_fmt((ca)->fs, \ -+ "IO error on %s for " fmt), \ -+ (ca)->name, ##__VA_ARGS__); \ -+ bch2_io_error(ca); \ -+} while (0) -+ -+#define bch2_dev_io_err_on(cond, ca, ...) \ -+({ \ -+ bool _ret = (cond); \ -+ \ -+ if (_ret) \ -+ bch2_dev_io_error(ca, __VA_ARGS__); \ -+ _ret; \ -+}) -+ -+/* kill? */ -+ -+#define __bcache_io_error(c, fmt, ...) \ -+ printk_ratelimited(KERN_ERR bch2_fmt(c, \ -+ "IO error: " fmt), ##__VA_ARGS__) -+ -+#define bcache_io_error(c, bio, fmt, ...) 
\ -+do { \ -+ __bcache_io_error(c, fmt, ##__VA_ARGS__); \ -+ (bio)->bi_status = BLK_STS_IOERR; \ -+} while (0) -+ -+#endif /* _BCACHEFS_ERROR_H */ -diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c -new file mode 100644 -index 000000000000..fd011df3cb99 ---- /dev/null -+++ b/fs/bcachefs/extent_update.c -@@ -0,0 +1,229 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "bkey_on_stack.h" -+#include "btree_update.h" -+#include "btree_update_interior.h" -+#include "buckets.h" -+#include "debug.h" -+#include "extents.h" -+#include "extent_update.h" -+ -+/* -+ * This counts the number of iterators to the alloc & ec btrees we'll need -+ * inserting/removing this extent: -+ */ -+static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ unsigned ret = 0; -+ -+ bkey_extent_entry_for_each(ptrs, entry) { -+ switch (__extent_entry_type(entry)) { -+ case BCH_EXTENT_ENTRY_ptr: -+ case BCH_EXTENT_ENTRY_stripe_ptr: -+ ret++; -+ } -+ } -+ -+ return ret; -+} -+ -+static int count_iters_for_insert(struct btree_trans *trans, -+ struct bkey_s_c k, -+ unsigned offset, -+ struct bpos *end, -+ unsigned *nr_iters, -+ unsigned max_iters) -+{ -+ int ret = 0, ret2 = 0; -+ -+ if (*nr_iters >= max_iters) { -+ *end = bpos_min(*end, k.k->p); -+ ret = 1; -+ } -+ -+ switch (k.k->type) { -+ case KEY_TYPE_extent: -+ case KEY_TYPE_reflink_v: -+ *nr_iters += bch2_bkey_nr_alloc_ptrs(k); -+ -+ if (*nr_iters >= max_iters) { -+ *end = bpos_min(*end, k.k->p); -+ ret = 1; -+ } -+ -+ break; -+ case KEY_TYPE_reflink_p: { -+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); -+ u64 idx = le64_to_cpu(p.v->idx); -+ unsigned sectors = bpos_min(*end, p.k->p).offset - -+ bkey_start_offset(p.k); -+ struct btree_iter *iter; -+ struct bkey_s_c r_k; -+ -+ for_each_btree_key(trans, iter, -+ BTREE_ID_REFLINK, POS(0, idx + offset), -+ BTREE_ITER_SLOTS, r_k, ret2) { -+ if 
(bkey_cmp(bkey_start_pos(r_k.k), -+ POS(0, idx + sectors)) >= 0) -+ break; -+ -+ /* extent_update_to_keys(), for the reflink_v update */ -+ *nr_iters += 1; -+ -+ *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k); -+ -+ if (*nr_iters >= max_iters) { -+ struct bpos pos = bkey_start_pos(k.k); -+ pos.offset += min_t(u64, k.k->size, -+ r_k.k->p.offset - idx); -+ -+ *end = bpos_min(*end, pos); -+ ret = 1; -+ break; -+ } -+ } -+ -+ bch2_trans_iter_put(trans, iter); -+ break; -+ } -+ } -+ -+ return ret2 ?: ret; -+} -+ -+#define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3) -+ -+int bch2_extent_atomic_end(struct btree_iter *iter, -+ struct bkey_i *insert, -+ struct bpos *end) -+{ -+ struct btree_trans *trans = iter->trans; -+ struct btree *b; -+ struct btree_node_iter node_iter; -+ struct bkey_packed *_k; -+ unsigned nr_iters = 0; -+ int ret; -+ -+ ret = bch2_btree_iter_traverse(iter); -+ if (ret) -+ return ret; -+ -+ b = iter->l[0].b; -+ node_iter = iter->l[0].iter; -+ -+ BUG_ON(bkey_cmp(b->data->min_key, POS_MIN) && -+ bkey_cmp(bkey_start_pos(&insert->k), -+ bkey_predecessor(b->data->min_key)) < 0); -+ -+ *end = bpos_min(insert->k.p, b->key.k.p); -+ -+ /* extent_update_to_keys(): */ -+ nr_iters += 1; -+ -+ ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end, -+ &nr_iters, EXTENT_ITERS_MAX / 2); -+ if (ret < 0) -+ return ret; -+ -+ while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { -+ struct bkey unpacked; -+ struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); -+ unsigned offset = 0; -+ -+ if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0) -+ break; -+ -+ if (bkey_cmp(bkey_start_pos(&insert->k), -+ bkey_start_pos(k.k)) > 0) -+ offset = bkey_start_offset(&insert->k) - -+ bkey_start_offset(k.k); -+ -+ /* extent_handle_overwrites(): */ -+ switch (bch2_extent_overlap(&insert->k, k.k)) { -+ case BCH_EXTENT_OVERLAP_ALL: -+ case BCH_EXTENT_OVERLAP_FRONT: -+ nr_iters += 1; -+ break; -+ case BCH_EXTENT_OVERLAP_BACK: -+ case BCH_EXTENT_OVERLAP_MIDDLE: -+ nr_iters += 2; -+ 
break; -+ } -+ -+ ret = count_iters_for_insert(trans, k, offset, end, -+ &nr_iters, EXTENT_ITERS_MAX); -+ if (ret) -+ break; -+ -+ bch2_btree_node_iter_advance(&node_iter, b); -+ } -+ -+ return ret < 0 ? ret : 0; -+} -+ -+int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) -+{ -+ struct bpos end; -+ int ret; -+ -+ ret = bch2_extent_atomic_end(iter, k, &end); -+ if (ret) -+ return ret; -+ -+ bch2_cut_back(end, k); -+ return 0; -+} -+ -+int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) -+{ -+ struct bpos end; -+ int ret; -+ -+ ret = bch2_extent_atomic_end(iter, k, &end); -+ if (ret) -+ return ret; -+ -+ return !bkey_cmp(end, k->k.p); -+} -+ -+enum btree_insert_ret -+bch2_extent_can_insert(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_i *insert) -+{ -+ struct btree_iter_level *l = &iter->l[0]; -+ struct btree_node_iter node_iter = l->iter; -+ struct bkey_packed *_k; -+ struct bkey_s_c k; -+ struct bkey unpacked; -+ int sectors; -+ -+ _k = bch2_btree_node_iter_peek(&node_iter, l->b); -+ if (!_k) -+ return BTREE_INSERT_OK; -+ -+ k = bkey_disassemble(l->b, _k, &unpacked); -+ -+ /* Check if we're splitting a compressed extent: */ -+ -+ if (bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k)) > 0 && -+ bkey_cmp(insert->k.p, k.k->p) < 0 && -+ (sectors = bch2_bkey_sectors_compressed(k))) { -+ int flags = trans->flags & BTREE_INSERT_NOFAIL -+ ? 
BCH_DISK_RESERVATION_NOFAIL : 0; -+ -+ switch (bch2_disk_reservation_add(trans->c, trans->disk_res, -+ sectors, flags)) { -+ case 0: -+ break; -+ case -ENOSPC: -+ return BTREE_INSERT_ENOSPC; -+ default: -+ BUG(); -+ } -+ } -+ -+ return BTREE_INSERT_OK; -+} -diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h -new file mode 100644 -index 000000000000..38dc084627d2 ---- /dev/null -+++ b/fs/bcachefs/extent_update.h -@@ -0,0 +1,16 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_EXTENT_UPDATE_H -+#define _BCACHEFS_EXTENT_UPDATE_H -+ -+#include "bcachefs.h" -+ -+int bch2_extent_atomic_end(struct btree_iter *, struct bkey_i *, -+ struct bpos *); -+int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); -+int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *); -+ -+enum btree_insert_ret -+bch2_extent_can_insert(struct btree_trans *, struct btree_iter *, -+ struct bkey_i *); -+ -+#endif /* _BCACHEFS_EXTENT_UPDATE_H */ -diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c -new file mode 100644 -index 000000000000..568f039edcff ---- /dev/null -+++ b/fs/bcachefs/extents.c -@@ -0,0 +1,1258 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Copyright (C) 2010 Kent Overstreet -+ * -+ * Code for managing the extent btree and dynamically updating the writeback -+ * dirty sector count. 
-+ */ -+ -+#include "bcachefs.h" -+#include "bkey_methods.h" -+#include "btree_gc.h" -+#include "btree_io.h" -+#include "btree_iter.h" -+#include "buckets.h" -+#include "checksum.h" -+#include "debug.h" -+#include "disk_groups.h" -+#include "error.h" -+#include "extents.h" -+#include "inode.h" -+#include "journal.h" -+#include "replicas.h" -+#include "super.h" -+#include "super-io.h" -+#include "util.h" -+ -+#include -+ -+static unsigned bch2_crc_field_size_max[] = { -+ [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, -+ [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, -+ [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX, -+}; -+ -+static void bch2_extent_crc_pack(union bch_extent_crc *, -+ struct bch_extent_crc_unpacked, -+ enum bch_extent_entry_type); -+ -+static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f, -+ unsigned dev) -+{ -+ struct bch_dev_io_failures *i; -+ -+ for (i = f->devs; i < f->devs + f->nr; i++) -+ if (i->dev == dev) -+ return i; -+ -+ return NULL; -+} -+ -+void bch2_mark_io_failure(struct bch_io_failures *failed, -+ struct extent_ptr_decoded *p) -+{ -+ struct bch_dev_io_failures *f = dev_io_failures(failed, p->ptr.dev); -+ -+ if (!f) { -+ BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); -+ -+ f = &failed->devs[failed->nr++]; -+ f->dev = p->ptr.dev; -+ f->idx = p->idx; -+ f->nr_failed = 1; -+ f->nr_retries = 0; -+ } else if (p->idx != f->idx) { -+ f->idx = p->idx; -+ f->nr_failed = 1; -+ f->nr_retries = 0; -+ } else { -+ f->nr_failed++; -+ } -+} -+ -+/* -+ * returns true if p1 is better than p2: -+ */ -+static inline bool ptr_better(struct bch_fs *c, -+ const struct extent_ptr_decoded p1, -+ const struct extent_ptr_decoded p2) -+{ -+ if (likely(!p1.idx && !p2.idx)) { -+ struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev); -+ struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev); -+ -+ u64 l1 = atomic64_read(&dev1->cur_latency[READ]); -+ u64 l2 = atomic64_read(&dev2->cur_latency[READ]); -+ -+ /* Pick at random, biased in favor of 
the faster device: */ -+ -+ return bch2_rand_range(l1 + l2) > l1; -+ } -+ -+ if (force_reconstruct_read(c)) -+ return p1.idx > p2.idx; -+ -+ return p1.idx < p2.idx; -+} -+ -+/* -+ * This picks a non-stale pointer, preferably from a device other than @avoid. -+ * Avoid can be NULL, meaning pick any. If there are no non-stale pointers to -+ * other devices, it will still pick a pointer from avoid. -+ */ -+int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, -+ struct bch_io_failures *failed, -+ struct extent_ptr_decoded *pick) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ struct bch_dev_io_failures *f; -+ struct bch_dev *ca; -+ int ret = 0; -+ -+ if (k.k->type == KEY_TYPE_error) -+ return -EIO; -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ ca = bch_dev_bkey_exists(c, p.ptr.dev); -+ -+ /* -+ * If there are any dirty pointers it's an error if we can't -+ * read: -+ */ -+ if (!ret && !p.ptr.cached) -+ ret = -EIO; -+ -+ if (p.ptr.cached && ptr_stale(ca, &p.ptr)) -+ continue; -+ -+ f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL; -+ if (f) -+ p.idx = f->nr_failed < f->nr_retries -+ ? 
f->idx -+ : f->idx + 1; -+ -+ if (!p.idx && -+ !bch2_dev_is_readable(ca)) -+ p.idx++; -+ -+ if (force_reconstruct_read(c) && -+ !p.idx && p.has_ec) -+ p.idx++; -+ -+ if (p.idx >= (unsigned) p.has_ec + 1) -+ continue; -+ -+ if (ret > 0 && !ptr_better(c, p, *pick)) -+ continue; -+ -+ *pick = p; -+ ret = 1; -+ } -+ -+ return ret; -+} -+ -+/* KEY_TYPE_btree_ptr: */ -+ -+const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) -+{ -+ if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) -+ return "value too big"; -+ -+ return bch2_bkey_ptrs_invalid(c, k); -+} -+ -+void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const struct bch_extent_ptr *ptr; -+ const char *err; -+ char buf[160]; -+ struct bucket_mark mark; -+ struct bch_dev *ca; -+ -+ if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) -+ return; -+ -+ if (!percpu_down_read_trylock(&c->mark_lock)) -+ return; -+ -+ bkey_for_each_ptr(ptrs, ptr) { -+ ca = bch_dev_bkey_exists(c, ptr->dev); -+ -+ mark = ptr_bucket_mark(ca, ptr); -+ -+ err = "stale"; -+ if (gen_after(mark.gen, ptr->gen)) -+ goto err; -+ -+ err = "inconsistent"; -+ if (mark.data_type != BCH_DATA_btree || -+ mark.dirty_sectors < c->opts.btree_node_size) -+ goto err; -+ } -+out: -+ percpu_up_read(&c->mark_lock); -+ return; -+err: -+ bch2_fs_inconsistent(c, "%s btree pointer %s: bucket %zi gen %i mark %08x", -+ err, (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), -+ PTR_BUCKET_NR(ca, ptr), -+ mark.gen, (unsigned) mark.v.counter); -+ goto out; -+} -+ -+void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ bch2_bkey_ptrs_to_text(out, c, k); -+} -+ -+void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); -+ -+ pr_buf(out, "seq %llx sectors %u written %u min_key ", -+ le64_to_cpu(bp.v->seq), -+ le16_to_cpu(bp.v->sectors), 
-+ le16_to_cpu(bp.v->sectors_written)); -+ -+ bch2_bpos_to_text(out, bp.v->min_key); -+ pr_buf(out, " "); -+ bch2_bkey_ptrs_to_text(out, c, k); -+} -+ -+void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version, -+ unsigned big_endian, int write, -+ struct bkey_s k) -+{ -+ struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(k); -+ -+ compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key); -+ -+ if (version < bcachefs_metadata_version_inode_btree_change && -+ btree_node_type_is_extents(btree_id) && -+ bkey_cmp(bp.v->min_key, POS_MIN)) -+ bp.v->min_key = write -+ ? bkey_predecessor(bp.v->min_key) -+ : bkey_successor(bp.v->min_key); -+} -+ -+/* KEY_TYPE_extent: */ -+ -+const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) -+{ -+ return bch2_bkey_ptrs_invalid(c, k); -+} -+ -+void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ char buf[160]; -+ -+ if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) || -+ !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) -+ return; -+ -+ if (!percpu_down_read_trylock(&c->mark_lock)) -+ return; -+ -+ extent_for_each_ptr_decode(e, p, entry) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); -+ struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr); -+ unsigned stale = gen_after(mark.gen, p.ptr.gen); -+ unsigned disk_sectors = ptr_disk_sectors(p); -+ unsigned mark_sectors = p.ptr.cached -+ ? 
mark.cached_sectors -+ : mark.dirty_sectors; -+ -+ bch2_fs_inconsistent_on(stale && !p.ptr.cached, c, -+ "stale dirty pointer (ptr gen %u bucket %u", -+ p.ptr.gen, mark.gen); -+ -+ bch2_fs_inconsistent_on(stale > 96, c, -+ "key too stale: %i", stale); -+ -+ bch2_fs_inconsistent_on(!stale && -+ (mark.data_type != BCH_DATA_user || -+ mark_sectors < disk_sectors), c, -+ "extent pointer not marked: %s:\n" -+ "type %u sectors %u < %u", -+ (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf), -+ mark.data_type, -+ mark_sectors, disk_sectors); -+ } -+ -+ percpu_up_read(&c->mark_lock); -+} -+ -+void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ bch2_bkey_ptrs_to_text(out, c, k); -+} -+ -+enum merge_result bch2_extent_merge(struct bch_fs *c, -+ struct bkey_s _l, struct bkey_s _r) -+{ -+ struct bkey_s_extent l = bkey_s_to_extent(_l); -+ struct bkey_s_extent r = bkey_s_to_extent(_r); -+ union bch_extent_entry *en_l = l.v->start; -+ union bch_extent_entry *en_r = r.v->start; -+ struct bch_extent_crc_unpacked crc_l, crc_r; -+ -+ if (bkey_val_u64s(l.k) != bkey_val_u64s(r.k)) -+ return BCH_MERGE_NOMERGE; -+ -+ crc_l = bch2_extent_crc_unpack(l.k, NULL); -+ -+ extent_for_each_entry(l, en_l) { -+ en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); -+ -+ if (extent_entry_type(en_l) != extent_entry_type(en_r)) -+ return BCH_MERGE_NOMERGE; -+ -+ switch (extent_entry_type(en_l)) { -+ case BCH_EXTENT_ENTRY_ptr: { -+ const struct bch_extent_ptr *lp = &en_l->ptr; -+ const struct bch_extent_ptr *rp = &en_r->ptr; -+ struct bch_dev *ca; -+ -+ if (lp->offset + crc_l.compressed_size != rp->offset || -+ lp->dev != rp->dev || -+ lp->gen != rp->gen) -+ return BCH_MERGE_NOMERGE; -+ -+ /* We don't allow extents to straddle buckets: */ -+ ca = bch_dev_bkey_exists(c, lp->dev); -+ -+ if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp)) -+ return BCH_MERGE_NOMERGE; -+ -+ break; -+ } -+ case BCH_EXTENT_ENTRY_stripe_ptr: -+ if (en_l->stripe_ptr.block != 
en_r->stripe_ptr.block || -+ en_l->stripe_ptr.idx != en_r->stripe_ptr.idx) -+ return BCH_MERGE_NOMERGE; -+ break; -+ case BCH_EXTENT_ENTRY_crc32: -+ case BCH_EXTENT_ENTRY_crc64: -+ case BCH_EXTENT_ENTRY_crc128: -+ crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); -+ crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); -+ -+ if (crc_l.csum_type != crc_r.csum_type || -+ crc_l.compression_type != crc_r.compression_type || -+ crc_l.nonce != crc_r.nonce) -+ return BCH_MERGE_NOMERGE; -+ -+ if (crc_l.offset + crc_l.live_size != crc_l.compressed_size || -+ crc_r.offset) -+ return BCH_MERGE_NOMERGE; -+ -+ if (!bch2_checksum_mergeable(crc_l.csum_type)) -+ return BCH_MERGE_NOMERGE; -+ -+ if (crc_is_compressed(crc_l)) -+ return BCH_MERGE_NOMERGE; -+ -+ if (crc_l.csum_type && -+ crc_l.uncompressed_size + -+ crc_r.uncompressed_size > c->sb.encoded_extent_max) -+ return BCH_MERGE_NOMERGE; -+ -+ if (crc_l.uncompressed_size + crc_r.uncompressed_size > -+ bch2_crc_field_size_max[extent_entry_type(en_l)]) -+ return BCH_MERGE_NOMERGE; -+ -+ break; -+ default: -+ return BCH_MERGE_NOMERGE; -+ } -+ } -+ -+ extent_for_each_entry(l, en_l) { -+ struct bch_extent_crc_unpacked crc_l, crc_r; -+ -+ en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); -+ -+ if (!extent_entry_is_crc(en_l)) -+ continue; -+ -+ crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); -+ crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); -+ -+ crc_l.csum = bch2_checksum_merge(crc_l.csum_type, -+ crc_l.csum, -+ crc_r.csum, -+ crc_r.uncompressed_size << 9); -+ -+ crc_l.uncompressed_size += crc_r.uncompressed_size; -+ crc_l.compressed_size += crc_r.compressed_size; -+ -+ bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, -+ extent_entry_type(en_l)); -+ } -+ -+ bch2_key_resize(l.k, l.k->size + r.k->size); -+ -+ return BCH_MERGE_MERGE; -+} -+ -+/* KEY_TYPE_reservation: */ -+ -+const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bkey_s_c_reservation r = 
bkey_s_c_to_reservation(k); -+ -+ if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) -+ return "incorrect value size"; -+ -+ if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) -+ return "invalid nr_replicas"; -+ -+ return NULL; -+} -+ -+void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); -+ -+ pr_buf(out, "generation %u replicas %u", -+ le32_to_cpu(r.v->generation), -+ r.v->nr_replicas); -+} -+ -+enum merge_result bch2_reservation_merge(struct bch_fs *c, -+ struct bkey_s _l, struct bkey_s _r) -+{ -+ struct bkey_s_reservation l = bkey_s_to_reservation(_l); -+ struct bkey_s_reservation r = bkey_s_to_reservation(_r); -+ -+ if (l.v->generation != r.v->generation || -+ l.v->nr_replicas != r.v->nr_replicas) -+ return BCH_MERGE_NOMERGE; -+ -+ if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { -+ bch2_key_resize(l.k, KEY_SIZE_MAX); -+ bch2_cut_front_s(l.k->p, r.s); -+ return BCH_MERGE_PARTIAL; -+ } -+ -+ bch2_key_resize(l.k, l.k->size + r.k->size); -+ -+ return BCH_MERGE_MERGE; -+} -+ -+/* Extent checksum entries: */ -+ -+/* returns true if not equal */ -+static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, -+ struct bch_extent_crc_unpacked r) -+{ -+ return (l.csum_type != r.csum_type || -+ l.compression_type != r.compression_type || -+ l.compressed_size != r.compressed_size || -+ l.uncompressed_size != r.uncompressed_size || -+ l.offset != r.offset || -+ l.live_size != r.live_size || -+ l.nonce != r.nonce || -+ bch2_crc_cmp(l.csum, r.csum)); -+} -+ -+static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u, -+ struct bch_extent_crc_unpacked n) -+{ -+ return !crc_is_compressed(u) && -+ u.csum_type && -+ u.uncompressed_size > u.live_size && -+ bch2_csum_type_is_encryption(u.csum_type) == -+ bch2_csum_type_is_encryption(n.csum_type); -+} -+ -+bool bch2_can_narrow_extent_crcs(struct bkey_s_c k, -+ struct 
bch_extent_crc_unpacked n) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ struct bch_extent_crc_unpacked crc; -+ const union bch_extent_entry *i; -+ -+ if (!n.csum_type) -+ return false; -+ -+ bkey_for_each_crc(k.k, ptrs, crc, i) -+ if (can_narrow_crc(crc, n)) -+ return true; -+ -+ return false; -+} -+ -+/* -+ * We're writing another replica for this extent, so while we've got the data in -+ * memory we'll be computing a new checksum for the currently live data. -+ * -+ * If there are other replicas we aren't moving, and they are checksummed but -+ * not compressed, we can modify them to point to only the data that is -+ * currently live (so that readers won't have to bounce) while we've got the -+ * checksum we need: -+ */ -+bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n) -+{ -+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); -+ struct bch_extent_crc_unpacked u; -+ struct extent_ptr_decoded p; -+ union bch_extent_entry *i; -+ bool ret = false; -+ -+ /* Find a checksum entry that covers only live data: */ -+ if (!n.csum_type) { -+ bkey_for_each_crc(&k->k, ptrs, u, i) -+ if (!crc_is_compressed(u) && -+ u.csum_type && -+ u.live_size == u.uncompressed_size) { -+ n = u; -+ goto found; -+ } -+ return false; -+ } -+found: -+ BUG_ON(crc_is_compressed(n)); -+ BUG_ON(n.offset); -+ BUG_ON(n.live_size != k->k.size); -+ -+restart_narrow_pointers: -+ ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); -+ -+ bkey_for_each_ptr_decode(&k->k, ptrs, p, i) -+ if (can_narrow_crc(p.crc, n)) { -+ bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr); -+ p.ptr.offset += p.crc.offset; -+ p.crc = n; -+ bch2_extent_ptr_decoded_append(k, &p); -+ ret = true; -+ goto restart_narrow_pointers; -+ } -+ -+ return ret; -+} -+ -+static void bch2_extent_crc_pack(union bch_extent_crc *dst, -+ struct bch_extent_crc_unpacked src, -+ enum bch_extent_entry_type type) -+{ -+#define set_common_fields(_dst, _src) \ -+ _dst.type = 1 << type; \ -+ _dst.csum_type = _src.csum_type, \ -+ 
_dst.compression_type = _src.compression_type, \ -+ _dst._compressed_size = _src.compressed_size - 1, \ -+ _dst._uncompressed_size = _src.uncompressed_size - 1, \ -+ _dst.offset = _src.offset -+ -+ switch (type) { -+ case BCH_EXTENT_ENTRY_crc32: -+ set_common_fields(dst->crc32, src); -+ dst->crc32.csum = *((__le32 *) &src.csum.lo); -+ break; -+ case BCH_EXTENT_ENTRY_crc64: -+ set_common_fields(dst->crc64, src); -+ dst->crc64.nonce = src.nonce; -+ dst->crc64.csum_lo = src.csum.lo; -+ dst->crc64.csum_hi = *((__le16 *) &src.csum.hi); -+ break; -+ case BCH_EXTENT_ENTRY_crc128: -+ set_common_fields(dst->crc128, src); -+ dst->crc128.nonce = src.nonce; -+ dst->crc128.csum = src.csum; -+ break; -+ default: -+ BUG(); -+ } -+#undef set_common_fields -+} -+ -+void bch2_extent_crc_append(struct bkey_i *k, -+ struct bch_extent_crc_unpacked new) -+{ -+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); -+ union bch_extent_crc *crc = (void *) ptrs.end; -+ enum bch_extent_entry_type type; -+ -+ if (bch_crc_bytes[new.csum_type] <= 4 && -+ new.uncompressed_size <= CRC32_SIZE_MAX && -+ new.nonce <= CRC32_NONCE_MAX) -+ type = BCH_EXTENT_ENTRY_crc32; -+ else if (bch_crc_bytes[new.csum_type] <= 10 && -+ new.uncompressed_size <= CRC64_SIZE_MAX && -+ new.nonce <= CRC64_NONCE_MAX) -+ type = BCH_EXTENT_ENTRY_crc64; -+ else if (bch_crc_bytes[new.csum_type] <= 16 && -+ new.uncompressed_size <= CRC128_SIZE_MAX && -+ new.nonce <= CRC128_NONCE_MAX) -+ type = BCH_EXTENT_ENTRY_crc128; -+ else -+ BUG(); -+ -+ bch2_extent_crc_pack(crc, new, type); -+ -+ k->k.u64s += extent_entry_u64s(ptrs.end); -+ -+ EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX); -+} -+ -+/* Generic code for keys with pointers: */ -+ -+unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k) -+{ -+ return bch2_bkey_devs(k).nr; -+} -+ -+unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k) -+{ -+ return k.k->type == KEY_TYPE_reservation -+ ? 
bkey_s_c_to_reservation(k).v->nr_replicas -+ : bch2_bkey_dirty_devs(k).nr; -+} -+ -+unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k) -+{ -+ unsigned ret = 0; -+ -+ if (k.k->type == KEY_TYPE_reservation) { -+ ret = bkey_s_c_to_reservation(k).v->nr_replicas; -+ } else { -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) -+ ret += !p.ptr.cached && !crc_is_compressed(p.crc); -+ } -+ -+ return ret; -+} -+ -+unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ unsigned ret = 0; -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) -+ if (!p.ptr.cached && crc_is_compressed(p.crc)) -+ ret += p.crc.compressed_size; -+ -+ return ret; -+} -+ -+bool bch2_bkey_is_incompressible(struct bkey_s_c k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct bch_extent_crc_unpacked crc; -+ -+ bkey_for_each_crc(k.k, ptrs, crc, entry) -+ if (crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) -+ return true; -+ return false; -+} -+ -+bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, -+ unsigned nr_replicas) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bpos end = pos; -+ struct bkey_s_c k; -+ bool ret = true; -+ int err; -+ -+ end.offset += size; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, pos, -+ BTREE_ITER_SLOTS, k, err) { -+ if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) -+ break; -+ -+ if (nr_replicas > bch2_bkey_nr_ptrs_fully_allocated(k)) { -+ ret = false; -+ break; -+ } -+ } -+ bch2_trans_exit(&trans); -+ -+ return ret; -+} -+ -+static unsigned bch2_extent_ptr_durability(struct bch_fs *c, -+ struct extent_ptr_decoded p) -+{ -+ unsigned durability = 0; -+ struct 
bch_dev *ca; -+ -+ if (p.ptr.cached) -+ return 0; -+ -+ ca = bch_dev_bkey_exists(c, p.ptr.dev); -+ -+ if (ca->mi.state != BCH_MEMBER_STATE_FAILED) -+ durability = max_t(unsigned, durability, ca->mi.durability); -+ -+ if (p.has_ec) { -+ struct stripe *s = -+ genradix_ptr(&c->stripes[0], p.ec.idx); -+ -+ if (WARN_ON(!s)) -+ goto out; -+ -+ durability += s->nr_redundant; -+ } -+out: -+ return durability; -+} -+ -+unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ unsigned durability = 0; -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) -+ durability += bch2_extent_ptr_durability(c, p); -+ -+ return durability; -+} -+ -+void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k, -+ unsigned target, -+ unsigned nr_desired_replicas) -+{ -+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); -+ union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas; -+ -+ if (target && extra > 0) -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ int n = bch2_extent_ptr_durability(c, p); -+ -+ if (n && n <= extra && -+ !bch2_dev_in_target(c, p.ptr.dev, target)) { -+ entry->ptr.cached = true; -+ extra -= n; -+ } -+ } -+ -+ if (extra > 0) -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ int n = bch2_extent_ptr_durability(c, p); -+ -+ if (n && n <= extra) { -+ entry->ptr.cached = true; -+ extra -= n; -+ } -+ } -+} -+ -+void bch2_bkey_append_ptr(struct bkey_i *k, -+ struct bch_extent_ptr ptr) -+{ -+ EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev)); -+ -+ switch (k->k.type) { -+ case KEY_TYPE_btree_ptr: -+ case KEY_TYPE_btree_ptr_v2: -+ case KEY_TYPE_extent: -+ EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); -+ -+ ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; -+ -+ memcpy((void *) &k->v + bkey_val_bytes(&k->k), -+ &ptr, -+ sizeof(ptr)); -+ k->u64s++; -+ 
break; -+ default: -+ BUG(); -+ } -+} -+ -+static inline void __extent_entry_insert(struct bkey_i *k, -+ union bch_extent_entry *dst, -+ union bch_extent_entry *new) -+{ -+ union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); -+ -+ memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new), -+ dst, (u64 *) end - (u64 *) dst); -+ k->k.u64s += extent_entry_u64s(new); -+ memcpy(dst, new, extent_entry_bytes(new)); -+} -+ -+void bch2_extent_ptr_decoded_append(struct bkey_i *k, -+ struct extent_ptr_decoded *p) -+{ -+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); -+ struct bch_extent_crc_unpacked crc = -+ bch2_extent_crc_unpack(&k->k, NULL); -+ union bch_extent_entry *pos; -+ -+ if (!bch2_crc_unpacked_cmp(crc, p->crc)) { -+ pos = ptrs.start; -+ goto found; -+ } -+ -+ bkey_for_each_crc(&k->k, ptrs, crc, pos) -+ if (!bch2_crc_unpacked_cmp(crc, p->crc)) { -+ pos = extent_entry_next(pos); -+ goto found; -+ } -+ -+ bch2_extent_crc_append(k, p->crc); -+ pos = bkey_val_end(bkey_i_to_s(k)); -+found: -+ p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; -+ __extent_entry_insert(k, pos, to_entry(&p->ptr)); -+ -+ if (p->has_ec) { -+ p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr; -+ __extent_entry_insert(k, pos, to_entry(&p->ec)); -+ } -+} -+ -+static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, -+ union bch_extent_entry *entry) -+{ -+ union bch_extent_entry *i = ptrs.start; -+ -+ if (i == entry) -+ return NULL; -+ -+ while (extent_entry_next(i) != entry) -+ i = extent_entry_next(i); -+ return i; -+} -+ -+union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, -+ struct bch_extent_ptr *ptr) -+{ -+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); -+ union bch_extent_entry *dst, *src, *prev; -+ bool drop_crc = true; -+ -+ EBUG_ON(ptr < &ptrs.start->ptr || -+ ptr >= &ptrs.end->ptr); -+ EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); -+ -+ src = extent_entry_next(to_entry(ptr)); -+ if (src != ptrs.end && -+ !extent_entry_is_crc(src)) -+ drop_crc = false; -+ -+ 
dst = to_entry(ptr); -+ while ((prev = extent_entry_prev(ptrs, dst))) { -+ if (extent_entry_is_ptr(prev)) -+ break; -+ -+ if (extent_entry_is_crc(prev)) { -+ if (drop_crc) -+ dst = prev; -+ break; -+ } -+ -+ dst = prev; -+ } -+ -+ memmove_u64s_down(dst, src, -+ (u64 *) ptrs.end - (u64 *) src); -+ k.k->u64s -= (u64 *) src - (u64 *) dst; -+ -+ return dst; -+} -+ -+void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) -+{ -+ struct bch_extent_ptr *ptr; -+ -+ bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); -+} -+ -+const struct bch_extent_ptr * -+bch2_bkey_has_device(struct bkey_s_c k, unsigned dev) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const struct bch_extent_ptr *ptr; -+ -+ bkey_for_each_ptr(ptrs, ptr) -+ if (ptr->dev == dev) -+ return ptr; -+ -+ return NULL; -+} -+ -+bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const struct bch_extent_ptr *ptr; -+ -+ bkey_for_each_ptr(ptrs, ptr) -+ if (bch2_dev_in_target(c, ptr->dev, target) && -+ (!ptr->cached || -+ !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) -+ return true; -+ -+ return false; -+} -+ -+bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, -+ struct bch_extent_ptr m, u64 offset) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) -+ if (p.ptr.dev == m.dev && -+ p.ptr.gen == m.gen && -+ (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) == -+ (s64) m.offset - offset) -+ return true; -+ -+ return false; -+} -+ -+/* -+ * bch_extent_normalize - clean up an extent, dropping stale pointers etc. -+ * -+ * Returns true if @k should be dropped entirely -+ * -+ * For existing keys, only called when btree nodes are being rewritten, not when -+ * they're merely being compacted/resorted in memory. 
-+ */ -+bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) -+{ -+ struct bch_extent_ptr *ptr; -+ -+ bch2_bkey_drop_ptrs(k, ptr, -+ ptr->cached && -+ ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)); -+ -+ /* will only happen if all pointers were cached: */ -+ if (!bch2_bkey_nr_ptrs(k.s_c)) -+ k.k->type = KEY_TYPE_discard; -+ -+ return bkey_whiteout(k.k); -+} -+ -+void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct bch_extent_crc_unpacked crc; -+ const struct bch_extent_ptr *ptr; -+ const struct bch_extent_stripe_ptr *ec; -+ struct bch_dev *ca; -+ bool first = true; -+ -+ bkey_extent_entry_for_each(ptrs, entry) { -+ if (!first) -+ pr_buf(out, " "); -+ -+ switch (__extent_entry_type(entry)) { -+ case BCH_EXTENT_ENTRY_ptr: -+ ptr = entry_to_ptr(entry); -+ ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] -+ ? bch_dev_bkey_exists(c, ptr->dev) -+ : NULL; -+ -+ pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev, -+ (u64) ptr->offset, ptr->gen, -+ ptr->cached ? " cached" : "", -+ ca && ptr_stale(ca, ptr) -+ ? 
" stale" : ""); -+ break; -+ case BCH_EXTENT_ENTRY_crc32: -+ case BCH_EXTENT_ENTRY_crc64: -+ case BCH_EXTENT_ENTRY_crc128: -+ crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); -+ -+ pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %u compress %u", -+ crc.compressed_size, -+ crc.uncompressed_size, -+ crc.offset, crc.nonce, -+ crc.csum_type, -+ crc.compression_type); -+ break; -+ case BCH_EXTENT_ENTRY_stripe_ptr: -+ ec = &entry->stripe_ptr; -+ -+ pr_buf(out, "ec: idx %llu block %u", -+ (u64) ec->idx, ec->block); -+ break; -+ default: -+ pr_buf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); -+ return; -+ } -+ -+ first = false; -+ } -+} -+ -+static const char *extent_ptr_invalid(const struct bch_fs *c, -+ struct bkey_s_c k, -+ const struct bch_extent_ptr *ptr, -+ unsigned size_ondisk, -+ bool metadata) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const struct bch_extent_ptr *ptr2; -+ struct bch_dev *ca; -+ -+ if (!bch2_dev_exists2(c, ptr->dev)) -+ return "pointer to invalid device"; -+ -+ ca = bch_dev_bkey_exists(c, ptr->dev); -+ if (!ca) -+ return "pointer to invalid device"; -+ -+ bkey_for_each_ptr(ptrs, ptr2) -+ if (ptr != ptr2 && ptr->dev == ptr2->dev) -+ return "multiple pointers to same device"; -+ -+ if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets)) -+ return "offset past end of device"; -+ -+ if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) -+ return "offset before first bucket"; -+ -+ if (bucket_remainder(ca, ptr->offset) + -+ size_ondisk > ca->mi.bucket_size) -+ return "spans multiple buckets"; -+ -+ return NULL; -+} -+ -+const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct bch_extent_crc_unpacked crc; -+ unsigned size_ondisk = k.k->size; -+ const char *reason; -+ unsigned nonce = UINT_MAX; -+ -+ if (k.k->type == KEY_TYPE_btree_ptr) -+ size_ondisk = 
c->opts.btree_node_size; -+ if (k.k->type == KEY_TYPE_btree_ptr_v2) -+ size_ondisk = le16_to_cpu(bkey_s_c_to_btree_ptr_v2(k).v->sectors); -+ -+ bkey_extent_entry_for_each(ptrs, entry) { -+ if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) -+ return "invalid extent entry type"; -+ -+ if (k.k->type == KEY_TYPE_btree_ptr && -+ !extent_entry_is_ptr(entry)) -+ return "has non ptr field"; -+ -+ switch (extent_entry_type(entry)) { -+ case BCH_EXTENT_ENTRY_ptr: -+ reason = extent_ptr_invalid(c, k, &entry->ptr, -+ size_ondisk, false); -+ if (reason) -+ return reason; -+ break; -+ case BCH_EXTENT_ENTRY_crc32: -+ case BCH_EXTENT_ENTRY_crc64: -+ case BCH_EXTENT_ENTRY_crc128: -+ crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); -+ -+ if (crc.offset + crc.live_size > -+ crc.uncompressed_size) -+ return "checksum offset + key size > uncompressed size"; -+ -+ size_ondisk = crc.compressed_size; -+ -+ if (!bch2_checksum_type_valid(c, crc.csum_type)) -+ return "invalid checksum type"; -+ -+ if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR) -+ return "invalid compression type"; -+ -+ if (bch2_csum_type_is_encryption(crc.csum_type)) { -+ if (nonce == UINT_MAX) -+ nonce = crc.offset + crc.nonce; -+ else if (nonce != crc.offset + crc.nonce) -+ return "incorrect nonce"; -+ } -+ break; -+ case BCH_EXTENT_ENTRY_stripe_ptr: -+ break; -+ } -+ } -+ -+ return NULL; -+} -+ -+void bch2_ptr_swab(struct bkey_s k) -+{ -+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); -+ union bch_extent_entry *entry; -+ u64 *d; -+ -+ for (d = (u64 *) ptrs.start; -+ d != (u64 *) ptrs.end; -+ d++) -+ *d = swab64(*d); -+ -+ for (entry = ptrs.start; -+ entry < ptrs.end; -+ entry = extent_entry_next(entry)) { -+ switch (extent_entry_type(entry)) { -+ case BCH_EXTENT_ENTRY_ptr: -+ break; -+ case BCH_EXTENT_ENTRY_crc32: -+ entry->crc32.csum = swab32(entry->crc32.csum); -+ break; -+ case BCH_EXTENT_ENTRY_crc64: -+ entry->crc64.csum_hi = swab16(entry->crc64.csum_hi); -+ entry->crc64.csum_lo = 
swab64(entry->crc64.csum_lo); -+ break; -+ case BCH_EXTENT_ENTRY_crc128: -+ entry->crc128.csum.hi = (__force __le64) -+ swab64((__force u64) entry->crc128.csum.hi); -+ entry->crc128.csum.lo = (__force __le64) -+ swab64((__force u64) entry->crc128.csum.lo); -+ break; -+ case BCH_EXTENT_ENTRY_stripe_ptr: -+ break; -+ } -+ } -+} -+ -+/* Generic extent code: */ -+ -+int bch2_cut_front_s(struct bpos where, struct bkey_s k) -+{ -+ unsigned new_val_u64s = bkey_val_u64s(k.k); -+ int val_u64s_delta; -+ u64 sub; -+ -+ if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0) -+ return 0; -+ -+ EBUG_ON(bkey_cmp(where, k.k->p) > 0); -+ -+ sub = where.offset - bkey_start_offset(k.k); -+ -+ k.k->size -= sub; -+ -+ if (!k.k->size) { -+ k.k->type = KEY_TYPE_deleted; -+ new_val_u64s = 0; -+ } -+ -+ switch (k.k->type) { -+ case KEY_TYPE_extent: -+ case KEY_TYPE_reflink_v: { -+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); -+ union bch_extent_entry *entry; -+ bool seen_crc = false; -+ -+ bkey_extent_entry_for_each(ptrs, entry) { -+ switch (extent_entry_type(entry)) { -+ case BCH_EXTENT_ENTRY_ptr: -+ if (!seen_crc) -+ entry->ptr.offset += sub; -+ break; -+ case BCH_EXTENT_ENTRY_crc32: -+ entry->crc32.offset += sub; -+ break; -+ case BCH_EXTENT_ENTRY_crc64: -+ entry->crc64.offset += sub; -+ break; -+ case BCH_EXTENT_ENTRY_crc128: -+ entry->crc128.offset += sub; -+ break; -+ case BCH_EXTENT_ENTRY_stripe_ptr: -+ break; -+ } -+ -+ if (extent_entry_is_crc(entry)) -+ seen_crc = true; -+ } -+ -+ break; -+ } -+ case KEY_TYPE_reflink_p: { -+ struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k); -+ -+ le64_add_cpu(&p.v->idx, sub); -+ break; -+ } -+ case KEY_TYPE_inline_data: { -+ struct bkey_s_inline_data d = bkey_s_to_inline_data(k); -+ -+ sub = min_t(u64, sub << 9, bkey_val_bytes(d.k)); -+ -+ memmove(d.v->data, -+ d.v->data + sub, -+ bkey_val_bytes(d.k) - sub); -+ -+ new_val_u64s -= sub >> 3; -+ break; -+ } -+ } -+ -+ val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; -+ BUG_ON(val_u64s_delta < 0); -+ -+ 
set_bkey_val_u64s(k.k, new_val_u64s); -+ memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); -+ return -val_u64s_delta; -+} -+ -+int bch2_cut_back_s(struct bpos where, struct bkey_s k) -+{ -+ unsigned new_val_u64s = bkey_val_u64s(k.k); -+ int val_u64s_delta; -+ u64 len = 0; -+ -+ if (bkey_cmp(where, k.k->p) >= 0) -+ return 0; -+ -+ EBUG_ON(bkey_cmp(where, bkey_start_pos(k.k)) < 0); -+ -+ len = where.offset - bkey_start_offset(k.k); -+ -+ k.k->p = where; -+ k.k->size = len; -+ -+ if (!len) { -+ k.k->type = KEY_TYPE_deleted; -+ new_val_u64s = 0; -+ } -+ -+ switch (k.k->type) { -+ case KEY_TYPE_inline_data: -+ new_val_u64s = min(new_val_u64s, k.k->size << 6); -+ break; -+ } -+ -+ val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; -+ BUG_ON(val_u64s_delta < 0); -+ -+ set_bkey_val_u64s(k.k, new_val_u64s); -+ memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); -+ return -val_u64s_delta; -+} -diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h -new file mode 100644 -index 000000000000..29b15365d19c ---- /dev/null -+++ b/fs/bcachefs/extents.h -@@ -0,0 +1,603 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_EXTENTS_H -+#define _BCACHEFS_EXTENTS_H -+ -+#include "bcachefs.h" -+#include "bkey.h" -+#include "extents_types.h" -+ -+struct bch_fs; -+struct btree_trans; -+ -+/* extent entries: */ -+ -+#define extent_entry_last(_e) \ -+ ((typeof(&(_e).v->start[0])) bkey_val_end(_e)) -+ -+#define entry_to_ptr(_entry) \ -+({ \ -+ EBUG_ON((_entry) && !extent_entry_is_ptr(_entry)); \ -+ \ -+ __builtin_choose_expr( \ -+ type_is_exact(_entry, const union bch_extent_entry *), \ -+ (const struct bch_extent_ptr *) (_entry), \ -+ (struct bch_extent_ptr *) (_entry)); \ -+}) -+ -+/* downcast, preserves const */ -+#define to_entry(_entry) \ -+({ \ -+ BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) && \ -+ !type_is(_entry, struct bch_extent_ptr *) && \ -+ !type_is(_entry, struct bch_extent_stripe_ptr *)); \ -+ \ -+ __builtin_choose_expr( \ -+ 
(type_is_exact(_entry, const union bch_extent_crc *) || \ -+ type_is_exact(_entry, const struct bch_extent_ptr *) ||\ -+ type_is_exact(_entry, const struct bch_extent_stripe_ptr *)),\ -+ (const union bch_extent_entry *) (_entry), \ -+ (union bch_extent_entry *) (_entry)); \ -+}) -+ -+#define extent_entry_next(_entry) \ -+ ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry))) -+ -+static inline unsigned -+__extent_entry_type(const union bch_extent_entry *e) -+{ -+ return e->type ? __ffs(e->type) : BCH_EXTENT_ENTRY_MAX; -+} -+ -+static inline enum bch_extent_entry_type -+extent_entry_type(const union bch_extent_entry *e) -+{ -+ int ret = __ffs(e->type); -+ -+ EBUG_ON(ret < 0 || ret >= BCH_EXTENT_ENTRY_MAX); -+ -+ return ret; -+} -+ -+static inline size_t extent_entry_bytes(const union bch_extent_entry *entry) -+{ -+ switch (extent_entry_type(entry)) { -+#define x(f, n) \ -+ case BCH_EXTENT_ENTRY_##f: \ -+ return sizeof(struct bch_extent_##f); -+ BCH_EXTENT_ENTRY_TYPES() -+#undef x -+ default: -+ BUG(); -+ } -+} -+ -+static inline size_t extent_entry_u64s(const union bch_extent_entry *entry) -+{ -+ return extent_entry_bytes(entry) / sizeof(u64); -+} -+ -+static inline bool extent_entry_is_ptr(const union bch_extent_entry *e) -+{ -+ switch (extent_entry_type(e)) { -+ case BCH_EXTENT_ENTRY_ptr: -+ return true; -+ default: -+ return false; -+ } -+} -+ -+static inline bool extent_entry_is_crc(const union bch_extent_entry *e) -+{ -+ switch (extent_entry_type(e)) { -+ case BCH_EXTENT_ENTRY_crc32: -+ case BCH_EXTENT_ENTRY_crc64: -+ case BCH_EXTENT_ENTRY_crc128: -+ return true; -+ default: -+ return false; -+ } -+} -+ -+union bch_extent_crc { -+ u8 type; -+ struct bch_extent_crc32 crc32; -+ struct bch_extent_crc64 crc64; -+ struct bch_extent_crc128 crc128; -+}; -+ -+#define __entry_to_crc(_entry) \ -+ __builtin_choose_expr( \ -+ type_is_exact(_entry, const union bch_extent_entry *), \ -+ (const union bch_extent_crc *) (_entry), \ -+ (union bch_extent_crc *) 
(_entry)) -+ -+#define entry_to_crc(_entry) \ -+({ \ -+ EBUG_ON((_entry) && !extent_entry_is_crc(_entry)); \ -+ \ -+ __entry_to_crc(_entry); \ -+}) -+ -+static inline struct bch_extent_crc_unpacked -+bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) -+{ -+#define common_fields(_crc) \ -+ .csum_type = _crc.csum_type, \ -+ .compression_type = _crc.compression_type, \ -+ .compressed_size = _crc._compressed_size + 1, \ -+ .uncompressed_size = _crc._uncompressed_size + 1, \ -+ .offset = _crc.offset, \ -+ .live_size = k->size -+ -+ if (!crc) -+ return (struct bch_extent_crc_unpacked) { -+ .compressed_size = k->size, -+ .uncompressed_size = k->size, -+ .live_size = k->size, -+ }; -+ -+ switch (extent_entry_type(to_entry(crc))) { -+ case BCH_EXTENT_ENTRY_crc32: { -+ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { -+ common_fields(crc->crc32), -+ }; -+ -+ *((__le32 *) &ret.csum.lo) = crc->crc32.csum; -+ -+ memcpy(&ret.csum.lo, &crc->crc32.csum, -+ sizeof(crc->crc32.csum)); -+ -+ return ret; -+ } -+ case BCH_EXTENT_ENTRY_crc64: { -+ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { -+ common_fields(crc->crc64), -+ .nonce = crc->crc64.nonce, -+ .csum.lo = (__force __le64) crc->crc64.csum_lo, -+ }; -+ -+ *((__le16 *) &ret.csum.hi) = crc->crc64.csum_hi; -+ -+ return ret; -+ } -+ case BCH_EXTENT_ENTRY_crc128: { -+ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { -+ common_fields(crc->crc128), -+ .nonce = crc->crc128.nonce, -+ .csum = crc->crc128.csum, -+ }; -+ -+ return ret; -+ } -+ default: -+ BUG(); -+ } -+#undef common_fields -+} -+ -+static inline bool crc_is_compressed(struct bch_extent_crc_unpacked crc) -+{ -+ return (crc.compression_type != BCH_COMPRESSION_TYPE_none && -+ crc.compression_type != BCH_COMPRESSION_TYPE_incompressible); -+} -+ -+/* bkey_ptrs: generically over any key type that has ptrs */ -+ -+struct bkey_ptrs_c { -+ const union bch_extent_entry *start; -+ const 
union bch_extent_entry *end; -+}; -+ -+struct bkey_ptrs { -+ union bch_extent_entry *start; -+ union bch_extent_entry *end; -+}; -+ -+static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) -+{ -+ switch (k.k->type) { -+ case KEY_TYPE_btree_ptr: { -+ struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k); -+ return (struct bkey_ptrs_c) { -+ to_entry(&e.v->start[0]), -+ to_entry(extent_entry_last(e)) -+ }; -+ } -+ case KEY_TYPE_extent: { -+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); -+ return (struct bkey_ptrs_c) { -+ e.v->start, -+ extent_entry_last(e) -+ }; -+ } -+ case KEY_TYPE_stripe: { -+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); -+ return (struct bkey_ptrs_c) { -+ to_entry(&s.v->ptrs[0]), -+ to_entry(&s.v->ptrs[s.v->nr_blocks]), -+ }; -+ } -+ case KEY_TYPE_reflink_v: { -+ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); -+ -+ return (struct bkey_ptrs_c) { -+ r.v->start, -+ bkey_val_end(r), -+ }; -+ } -+ case KEY_TYPE_btree_ptr_v2: { -+ struct bkey_s_c_btree_ptr_v2 e = bkey_s_c_to_btree_ptr_v2(k); -+ return (struct bkey_ptrs_c) { -+ to_entry(&e.v->start[0]), -+ to_entry(extent_entry_last(e)) -+ }; -+ } -+ default: -+ return (struct bkey_ptrs_c) { NULL, NULL }; -+ } -+} -+ -+static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) -+{ -+ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c); -+ -+ return (struct bkey_ptrs) { -+ (void *) p.start, -+ (void *) p.end -+ }; -+} -+ -+#define __bkey_extent_entry_for_each_from(_start, _end, _entry) \ -+ for ((_entry) = (_start); \ -+ (_entry) < (_end); \ -+ (_entry) = extent_entry_next(_entry)) -+ -+#define __bkey_ptr_next(_ptr, _end) \ -+({ \ -+ typeof(_end) _entry; \ -+ \ -+ __bkey_extent_entry_for_each_from(to_entry(_ptr), _end, _entry) \ -+ if (extent_entry_is_ptr(_entry)) \ -+ break; \ -+ \ -+ _entry < (_end) ? 
entry_to_ptr(_entry) : NULL; \ -+}) -+ -+#define bkey_extent_entry_for_each_from(_p, _entry, _start) \ -+ __bkey_extent_entry_for_each_from(_start, (_p).end, _entry) -+ -+#define bkey_extent_entry_for_each(_p, _entry) \ -+ bkey_extent_entry_for_each_from(_p, _entry, _p.start) -+ -+#define __bkey_for_each_ptr(_start, _end, _ptr) \ -+ for ((_ptr) = (_start); \ -+ ((_ptr) = __bkey_ptr_next(_ptr, _end)); \ -+ (_ptr)++) -+ -+#define bkey_ptr_next(_p, _ptr) \ -+ __bkey_ptr_next(_ptr, (_p).end) -+ -+#define bkey_for_each_ptr(_p, _ptr) \ -+ __bkey_for_each_ptr(&(_p).start->ptr, (_p).end, _ptr) -+ -+#define __bkey_ptr_next_decode(_k, _end, _ptr, _entry) \ -+({ \ -+ __label__ out; \ -+ \ -+ (_ptr).idx = 0; \ -+ (_ptr).has_ec = false; \ -+ \ -+ __bkey_extent_entry_for_each_from(_entry, _end, _entry) \ -+ switch (extent_entry_type(_entry)) { \ -+ case BCH_EXTENT_ENTRY_ptr: \ -+ (_ptr).ptr = _entry->ptr; \ -+ goto out; \ -+ case BCH_EXTENT_ENTRY_crc32: \ -+ case BCH_EXTENT_ENTRY_crc64: \ -+ case BCH_EXTENT_ENTRY_crc128: \ -+ (_ptr).crc = bch2_extent_crc_unpack(_k, \ -+ entry_to_crc(_entry)); \ -+ break; \ -+ case BCH_EXTENT_ENTRY_stripe_ptr: \ -+ (_ptr).ec = _entry->stripe_ptr; \ -+ (_ptr).has_ec = true; \ -+ break; \ -+ } \ -+out: \ -+ _entry < (_end); \ -+}) -+ -+#define __bkey_for_each_ptr_decode(_k, _start, _end, _ptr, _entry) \ -+ for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL), \ -+ (_entry) = _start; \ -+ __bkey_ptr_next_decode(_k, _end, _ptr, _entry); \ -+ (_entry) = extent_entry_next(_entry)) -+ -+#define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry) \ -+ __bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \ -+ _ptr, _entry) -+ -+#define bkey_crc_next(_k, _start, _end, _crc, _iter) \ -+({ \ -+ __bkey_extent_entry_for_each_from(_iter, _end, _iter) \ -+ if (extent_entry_is_crc(_iter)) { \ -+ (_crc) = bch2_extent_crc_unpack(_k, \ -+ entry_to_crc(_iter)); \ -+ break; \ -+ } \ -+ \ -+ (_iter) < (_end); \ -+}) -+ -+#define __bkey_for_each_crc(_k, _start, _end, _crc, 
_iter) \ -+ for ((_crc) = bch2_extent_crc_unpack(_k, NULL), \ -+ (_iter) = (_start); \ -+ bkey_crc_next(_k, _start, _end, _crc, _iter); \ -+ (_iter) = extent_entry_next(_iter)) -+ -+#define bkey_for_each_crc(_k, _p, _crc, _iter) \ -+ __bkey_for_each_crc(_k, (_p).start, (_p).end, _crc, _iter) -+ -+/* Iterate over pointers in KEY_TYPE_extent: */ -+ -+#define extent_for_each_entry_from(_e, _entry, _start) \ -+ __bkey_extent_entry_for_each_from(_start, \ -+ extent_entry_last(_e),_entry) -+ -+#define extent_for_each_entry(_e, _entry) \ -+ extent_for_each_entry_from(_e, _entry, (_e).v->start) -+ -+#define extent_ptr_next(_e, _ptr) \ -+ __bkey_ptr_next(_ptr, extent_entry_last(_e)) -+ -+#define extent_for_each_ptr(_e, _ptr) \ -+ __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr) -+ -+#define extent_for_each_ptr_decode(_e, _ptr, _entry) \ -+ __bkey_for_each_ptr_decode((_e).k, (_e).v->start, \ -+ extent_entry_last(_e), _ptr, _entry) -+ -+/* utility code common to all keys with pointers: */ -+ -+void bch2_mark_io_failure(struct bch_io_failures *, -+ struct extent_ptr_decoded *); -+int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, -+ struct bch_io_failures *, -+ struct extent_ptr_decoded *); -+ -+/* KEY_TYPE_btree_ptr: */ -+ -+const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c); -+void bch2_btree_ptr_debugcheck(struct bch_fs *, struct bkey_s_c); -+void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, -+ struct bkey_s_c); -+ -+void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, -+ struct bkey_s_c); -+void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, -+ int, struct bkey_s); -+ -+#define bch2_bkey_ops_btree_ptr (struct bkey_ops) { \ -+ .key_invalid = bch2_btree_ptr_invalid, \ -+ .key_debugcheck = bch2_btree_ptr_debugcheck, \ -+ .val_to_text = bch2_btree_ptr_to_text, \ -+ .swab = bch2_ptr_swab, \ -+} -+ -+#define bch2_bkey_ops_btree_ptr_v2 (struct bkey_ops) { \ -+ .key_invalid = 
bch2_btree_ptr_invalid, \ -+ .key_debugcheck = bch2_btree_ptr_debugcheck, \ -+ .val_to_text = bch2_btree_ptr_v2_to_text, \ -+ .swab = bch2_ptr_swab, \ -+ .compat = bch2_btree_ptr_v2_compat, \ -+} -+ -+/* KEY_TYPE_extent: */ -+ -+const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c); -+void bch2_extent_debugcheck(struct bch_fs *, struct bkey_s_c); -+void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+enum merge_result bch2_extent_merge(struct bch_fs *, -+ struct bkey_s, struct bkey_s); -+ -+#define bch2_bkey_ops_extent (struct bkey_ops) { \ -+ .key_invalid = bch2_extent_invalid, \ -+ .key_debugcheck = bch2_extent_debugcheck, \ -+ .val_to_text = bch2_extent_to_text, \ -+ .swab = bch2_ptr_swab, \ -+ .key_normalize = bch2_extent_normalize, \ -+ .key_merge = bch2_extent_merge, \ -+} -+ -+/* KEY_TYPE_reservation: */ -+ -+const char *bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c); -+void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+enum merge_result bch2_reservation_merge(struct bch_fs *, -+ struct bkey_s, struct bkey_s); -+ -+#define bch2_bkey_ops_reservation (struct bkey_ops) { \ -+ .key_invalid = bch2_reservation_invalid, \ -+ .val_to_text = bch2_reservation_to_text, \ -+ .key_merge = bch2_reservation_merge, \ -+} -+ -+/* Extent checksum entries: */ -+ -+bool bch2_can_narrow_extent_crcs(struct bkey_s_c, -+ struct bch_extent_crc_unpacked); -+bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked); -+void bch2_extent_crc_append(struct bkey_i *, -+ struct bch_extent_crc_unpacked); -+ -+/* Generic code for keys with pointers: */ -+ -+static inline bool bkey_extent_is_direct_data(const struct bkey *k) -+{ -+ switch (k->type) { -+ case KEY_TYPE_btree_ptr: -+ case KEY_TYPE_btree_ptr_v2: -+ case KEY_TYPE_extent: -+ case KEY_TYPE_reflink_v: -+ return true; -+ default: -+ return false; -+ } -+} -+ -+static inline bool bkey_extent_is_data(const struct bkey *k) 
-+{ -+ return bkey_extent_is_direct_data(k) || -+ k->type == KEY_TYPE_inline_data || -+ k->type == KEY_TYPE_reflink_p; -+} -+ -+/* -+ * Should extent be counted under inode->i_sectors? -+ */ -+static inline bool bkey_extent_is_allocation(const struct bkey *k) -+{ -+ switch (k->type) { -+ case KEY_TYPE_extent: -+ case KEY_TYPE_reservation: -+ case KEY_TYPE_reflink_p: -+ case KEY_TYPE_reflink_v: -+ case KEY_TYPE_inline_data: -+ return true; -+ default: -+ return false; -+ } -+} -+ -+static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k) -+{ -+ struct bch_devs_list ret = (struct bch_devs_list) { 0 }; -+ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); -+ const struct bch_extent_ptr *ptr; -+ -+ bkey_for_each_ptr(p, ptr) -+ ret.devs[ret.nr++] = ptr->dev; -+ -+ return ret; -+} -+ -+static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k) -+{ -+ struct bch_devs_list ret = (struct bch_devs_list) { 0 }; -+ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); -+ const struct bch_extent_ptr *ptr; -+ -+ bkey_for_each_ptr(p, ptr) -+ if (!ptr->cached) -+ ret.devs[ret.nr++] = ptr->dev; -+ -+ return ret; -+} -+ -+static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) -+{ -+ struct bch_devs_list ret = (struct bch_devs_list) { 0 }; -+ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); -+ const struct bch_extent_ptr *ptr; -+ -+ bkey_for_each_ptr(p, ptr) -+ if (ptr->cached) -+ ret.devs[ret.nr++] = ptr->dev; -+ -+ return ret; -+} -+ -+unsigned bch2_bkey_nr_ptrs(struct bkey_s_c); -+unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); -+unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c); -+bool bch2_bkey_is_incompressible(struct bkey_s_c); -+unsigned bch2_bkey_sectors_compressed(struct bkey_s_c); -+bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned); -+unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); -+ -+void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s, -+ unsigned, unsigned); -+ -+void 
bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr); -+void bch2_extent_ptr_decoded_append(struct bkey_i *, -+ struct extent_ptr_decoded *); -+union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s, -+ struct bch_extent_ptr *); -+ -+#define bch2_bkey_drop_ptrs(_k, _ptr, _cond) \ -+do { \ -+ struct bkey_ptrs _ptrs = bch2_bkey_ptrs(_k); \ -+ \ -+ _ptr = &_ptrs.start->ptr; \ -+ \ -+ while ((_ptr = bkey_ptr_next(_ptrs, _ptr))) { \ -+ if (_cond) { \ -+ _ptr = (void *) bch2_bkey_drop_ptr(_k, _ptr); \ -+ _ptrs = bch2_bkey_ptrs(_k); \ -+ continue; \ -+ } \ -+ \ -+ (_ptr)++; \ -+ } \ -+} while (0) -+ -+void bch2_bkey_drop_device(struct bkey_s, unsigned); -+const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned); -+bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned); -+ -+bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, -+ struct bch_extent_ptr, u64); -+ -+bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); -+void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, -+ struct bkey_s_c); -+const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c); -+ -+void bch2_ptr_swab(struct bkey_s); -+ -+/* Generic extent code: */ -+ -+int bch2_cut_front_s(struct bpos, struct bkey_s); -+int bch2_cut_back_s(struct bpos, struct bkey_s); -+ -+static inline void bch2_cut_front(struct bpos where, struct bkey_i *k) -+{ -+ bch2_cut_front_s(where, bkey_i_to_s(k)); -+} -+ -+static inline void bch2_cut_back(struct bpos where, struct bkey_i *k) -+{ -+ bch2_cut_back_s(where, bkey_i_to_s(k)); -+} -+ -+/** -+ * bch_key_resize - adjust size of @k -+ * -+ * bkey_start_offset(k) will be preserved, modifies where the extent ends -+ */ -+static inline void bch2_key_resize(struct bkey *k, unsigned new_size) -+{ -+ k->p.offset -= k->size; -+ k->p.offset += new_size; -+ k->size = new_size; -+} -+ -+/* -+ * In extent_sort_fix_overlapping(), insert_fixup_extent(), -+ * extent_merge_inline() - we're modifying keys in 
place that are packed. To do -+ * that we have to unpack the key, modify the unpacked key - then this -+ * copies/repacks the unpacked to the original as necessary. -+ */ -+static inline void extent_save(struct btree *b, struct bkey_packed *dst, -+ struct bkey *src) -+{ -+ struct bkey_format *f = &b->format; -+ struct bkey_i *dst_unpacked; -+ -+ if ((dst_unpacked = packed_to_bkey(dst))) -+ dst_unpacked->k = *src; -+ else -+ BUG_ON(!bch2_bkey_pack_key(dst, src, f)); -+} -+ -+#endif /* _BCACHEFS_EXTENTS_H */ -diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h -new file mode 100644 -index 000000000000..43d6c341ecca ---- /dev/null -+++ b/fs/bcachefs/extents_types.h -@@ -0,0 +1,40 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_EXTENTS_TYPES_H -+#define _BCACHEFS_EXTENTS_TYPES_H -+ -+#include "bcachefs_format.h" -+ -+struct bch_extent_crc_unpacked { -+ u32 compressed_size; -+ u32 uncompressed_size; -+ u32 live_size; -+ -+ u8 csum_type; -+ u8 compression_type; -+ -+ u16 offset; -+ -+ u16 nonce; -+ -+ struct bch_csum csum; -+}; -+ -+struct extent_ptr_decoded { -+ unsigned idx; -+ bool has_ec; -+ struct bch_extent_crc_unpacked crc; -+ struct bch_extent_ptr ptr; -+ struct bch_extent_stripe_ptr ec; -+}; -+ -+struct bch_io_failures { -+ u8 nr; -+ struct bch_dev_io_failures { -+ u8 dev; -+ u8 idx; -+ u8 nr_failed; -+ u8 nr_retries; -+ } devs[BCH_REPLICAS_MAX]; -+}; -+ -+#endif /* _BCACHEFS_EXTENTS_TYPES_H */ -diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h -new file mode 100644 -index 000000000000..26d5cad7e6a5 ---- /dev/null -+++ b/fs/bcachefs/eytzinger.h -@@ -0,0 +1,285 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _EYTZINGER_H -+#define _EYTZINGER_H -+ -+#include -+#include -+ -+#include "util.h" -+ -+/* -+ * Traversal for trees in eytzinger layout - a full binary tree layed out in an -+ * array -+ */ -+ -+/* -+ * One based indexing version: -+ * -+ * With one based indexing each level of the tree starts at a power 
of two - -+ * good for cacheline alignment: -+ * -+ * Size parameter is treated as if we were using 0 based indexing, however: -+ * valid nodes, and inorder indices, are in the range [1..size) - that is, there -+ * are actually size - 1 elements -+ */ -+ -+static inline unsigned eytzinger1_child(unsigned i, unsigned child) -+{ -+ EBUG_ON(child > 1); -+ -+ return (i << 1) + child; -+} -+ -+static inline unsigned eytzinger1_left_child(unsigned i) -+{ -+ return eytzinger1_child(i, 0); -+} -+ -+static inline unsigned eytzinger1_right_child(unsigned i) -+{ -+ return eytzinger1_child(i, 1); -+} -+ -+static inline unsigned eytzinger1_first(unsigned size) -+{ -+ return rounddown_pow_of_two(size - 1); -+} -+ -+static inline unsigned eytzinger1_last(unsigned size) -+{ -+ return rounddown_pow_of_two(size) - 1; -+} -+ -+/* -+ * eytzinger1_next() and eytzinger1_prev() have the nice properties that -+ * -+ * eytzinger1_next(0) == eytzinger1_first()) -+ * eytzinger1_prev(0) == eytzinger1_last()) -+ * -+ * eytzinger1_prev(eytzinger1_first()) == 0 -+ * eytzinger1_next(eytzinger1_last()) == 0 -+ */ -+ -+static inline unsigned eytzinger1_next(unsigned i, unsigned size) -+{ -+ EBUG_ON(i >= size); -+ -+ if (eytzinger1_right_child(i) < size) { -+ i = eytzinger1_right_child(i); -+ -+ i <<= __fls(size) - __fls(i); -+ i >>= i >= size; -+ } else { -+ i >>= ffz(i) + 1; -+ } -+ -+ return i; -+} -+ -+static inline unsigned eytzinger1_prev(unsigned i, unsigned size) -+{ -+ EBUG_ON(i >= size); -+ -+ if (eytzinger1_left_child(i) < size) { -+ i = eytzinger1_left_child(i) + 1; -+ -+ i <<= __fls(size) - __fls(i); -+ i -= 1; -+ i >>= i >= size; -+ } else { -+ i >>= __ffs(i) + 1; -+ } -+ -+ return i; -+} -+ -+static inline unsigned eytzinger1_extra(unsigned size) -+{ -+ return (size - rounddown_pow_of_two(size - 1)) << 1; -+} -+ -+static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size, -+ unsigned extra) -+{ -+ unsigned b = __fls(i); -+ unsigned shift = __fls(size - 1) - b; -+ int s; 
-+ -+ EBUG_ON(!i || i >= size); -+ -+ i ^= 1U << b; -+ i <<= 1; -+ i |= 1; -+ i <<= shift; -+ -+ /* -+ * sign bit trick: -+ * -+ * if (i > extra) -+ * i -= (i - extra) >> 1; -+ */ -+ s = extra - i; -+ i += (s >> 1) & (s >> 31); -+ -+ return i; -+} -+ -+static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size, -+ unsigned extra) -+{ -+ unsigned shift; -+ int s; -+ -+ EBUG_ON(!i || i >= size); -+ -+ /* -+ * sign bit trick: -+ * -+ * if (i > extra) -+ * i += i - extra; -+ */ -+ s = extra - i; -+ i -= s & (s >> 31); -+ -+ shift = __ffs(i); -+ -+ i >>= shift + 1; -+ i |= 1U << (__fls(size - 1) - shift); -+ -+ return i; -+} -+ -+static inline unsigned eytzinger1_to_inorder(unsigned i, unsigned size) -+{ -+ return __eytzinger1_to_inorder(i, size, eytzinger1_extra(size)); -+} -+ -+static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size) -+{ -+ return __inorder_to_eytzinger1(i, size, eytzinger1_extra(size)); -+} -+ -+#define eytzinger1_for_each(_i, _size) \ -+ for ((_i) = eytzinger1_first((_size)); \ -+ (_i) != 0; \ -+ (_i) = eytzinger1_next((_i), (_size))) -+ -+/* Zero based indexing version: */ -+ -+static inline unsigned eytzinger0_child(unsigned i, unsigned child) -+{ -+ EBUG_ON(child > 1); -+ -+ return (i << 1) + 1 + child; -+} -+ -+static inline unsigned eytzinger0_left_child(unsigned i) -+{ -+ return eytzinger0_child(i, 0); -+} -+ -+static inline unsigned eytzinger0_right_child(unsigned i) -+{ -+ return eytzinger0_child(i, 1); -+} -+ -+static inline unsigned eytzinger0_first(unsigned size) -+{ -+ return eytzinger1_first(size + 1) - 1; -+} -+ -+static inline unsigned eytzinger0_last(unsigned size) -+{ -+ return eytzinger1_last(size + 1) - 1; -+} -+ -+static inline unsigned eytzinger0_next(unsigned i, unsigned size) -+{ -+ return eytzinger1_next(i + 1, size + 1) - 1; -+} -+ -+static inline unsigned eytzinger0_prev(unsigned i, unsigned size) -+{ -+ return eytzinger1_prev(i + 1, size + 1) - 1; -+} -+ -+static inline unsigned 
eytzinger0_extra(unsigned size) -+{ -+ return eytzinger1_extra(size + 1); -+} -+ -+static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size, -+ unsigned extra) -+{ -+ return __eytzinger1_to_inorder(i + 1, size + 1, extra) - 1; -+} -+ -+static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size, -+ unsigned extra) -+{ -+ return __inorder_to_eytzinger1(i + 1, size + 1, extra) - 1; -+} -+ -+static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size) -+{ -+ return __eytzinger0_to_inorder(i, size, eytzinger0_extra(size)); -+} -+ -+static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) -+{ -+ return __inorder_to_eytzinger0(i, size, eytzinger0_extra(size)); -+} -+ -+#define eytzinger0_for_each(_i, _size) \ -+ for ((_i) = eytzinger0_first((_size)); \ -+ (_i) != -1; \ -+ (_i) = eytzinger0_next((_i), (_size))) -+ -+typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size); -+ -+/* return greatest node <= @search, or -1 if not found */ -+static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, -+ eytzinger_cmp_fn cmp, const void *search) -+{ -+ unsigned i, n = 0; -+ -+ if (!nr) -+ return -1; -+ -+ do { -+ i = n; -+ n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0); -+ } while (n < nr); -+ -+ if (n & 1) { -+ /* @i was greater than @search, return previous node: */ -+ -+ if (i == eytzinger0_first(nr)) -+ return -1; -+ -+ return eytzinger0_prev(i, nr); -+ } else { -+ return i; -+ } -+} -+ -+#define eytzinger0_find(base, nr, size, _cmp, search) \ -+({ \ -+ void *_base = (base); \ -+ void *_search = (search); \ -+ size_t _nr = (nr); \ -+ size_t _size = (size); \ -+ size_t _i = 0; \ -+ int _res; \ -+ \ -+ while (_i < _nr && \ -+ (_res = _cmp(_search, _base + _i * _size, _size))) \ -+ _i = eytzinger0_child(_i, _res > 0); \ -+ _i; \ -+}) -+ -+void eytzinger0_sort(void *, size_t, size_t, -+ int (*cmp_func)(const void *, const void *, size_t), -+ void (*swap_func)(void *, void 
*, size_t)); -+ -+#endif /* _EYTZINGER_H */ -diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h -new file mode 100644 -index 000000000000..cdb272708a4b ---- /dev/null -+++ b/fs/bcachefs/fifo.h -@@ -0,0 +1,127 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_FIFO_H -+#define _BCACHEFS_FIFO_H -+ -+#include "util.h" -+ -+#define FIFO(type) \ -+struct { \ -+ size_t front, back, size, mask; \ -+ type *data; \ -+} -+ -+#define DECLARE_FIFO(type, name) FIFO(type) name -+ -+#define fifo_buf_size(fifo) \ -+ ((fifo)->size \ -+ ? roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0]) \ -+ : 0) -+ -+#define init_fifo(fifo, _size, _gfp) \ -+({ \ -+ (fifo)->front = (fifo)->back = 0; \ -+ (fifo)->size = (_size); \ -+ (fifo)->mask = (fifo)->size \ -+ ? roundup_pow_of_two((fifo)->size) - 1 \ -+ : 0; \ -+ (fifo)->data = kvpmalloc(fifo_buf_size(fifo), (_gfp)); \ -+}) -+ -+#define free_fifo(fifo) \ -+do { \ -+ kvpfree((fifo)->data, fifo_buf_size(fifo)); \ -+ (fifo)->data = NULL; \ -+} while (0) -+ -+#define fifo_swap(l, r) \ -+do { \ -+ swap((l)->front, (r)->front); \ -+ swap((l)->back, (r)->back); \ -+ swap((l)->size, (r)->size); \ -+ swap((l)->mask, (r)->mask); \ -+ swap((l)->data, (r)->data); \ -+} while (0) -+ -+#define fifo_move(dest, src) \ -+do { \ -+ typeof(*((dest)->data)) _t; \ -+ while (!fifo_full(dest) && \ -+ fifo_pop(src, _t)) \ -+ fifo_push(dest, _t); \ -+} while (0) -+ -+#define fifo_used(fifo) (((fifo)->back - (fifo)->front)) -+#define fifo_free(fifo) ((fifo)->size - fifo_used(fifo)) -+ -+#define fifo_empty(fifo) ((fifo)->front == (fifo)->back) -+#define fifo_full(fifo) (fifo_used(fifo) == (fifo)->size) -+ -+#define fifo_peek_front(fifo) ((fifo)->data[(fifo)->front & (fifo)->mask]) -+#define fifo_peek_back(fifo) ((fifo)->data[((fifo)->back - 1) & (fifo)->mask]) -+ -+#define fifo_entry_idx_abs(fifo, p) \ -+ ((((p) >= &fifo_peek_front(fifo) \ -+ ? 
(fifo)->front : (fifo)->back) & ~(fifo)->mask) + \ -+ (((p) - (fifo)->data))) -+ -+#define fifo_entry_idx(fifo, p) (((p) - &fifo_peek_front(fifo)) & (fifo)->mask) -+#define fifo_idx_entry(fifo, i) (fifo)->data[((fifo)->front + (i)) & (fifo)->mask] -+ -+#define fifo_push_back_ref(f) \ -+ (fifo_full((f)) ? NULL : &(f)->data[(f)->back++ & (f)->mask]) -+ -+#define fifo_push_front_ref(f) \ -+ (fifo_full((f)) ? NULL : &(f)->data[--(f)->front & (f)->mask]) -+ -+#define fifo_push_back(fifo, new) \ -+({ \ -+ typeof((fifo)->data) _r = fifo_push_back_ref(fifo); \ -+ if (_r) \ -+ *_r = (new); \ -+ _r != NULL; \ -+}) -+ -+#define fifo_push_front(fifo, new) \ -+({ \ -+ typeof((fifo)->data) _r = fifo_push_front_ref(fifo); \ -+ if (_r) \ -+ *_r = (new); \ -+ _r != NULL; \ -+}) -+ -+#define fifo_pop_front(fifo, i) \ -+({ \ -+ bool _r = !fifo_empty((fifo)); \ -+ if (_r) \ -+ (i) = (fifo)->data[(fifo)->front++ & (fifo)->mask]; \ -+ _r; \ -+}) -+ -+#define fifo_pop_back(fifo, i) \ -+({ \ -+ bool _r = !fifo_empty((fifo)); \ -+ if (_r) \ -+ (i) = (fifo)->data[--(fifo)->back & (fifo)->mask]; \ -+ _r; \ -+}) -+ -+#define fifo_push_ref(fifo) fifo_push_back_ref(fifo) -+#define fifo_push(fifo, i) fifo_push_back(fifo, (i)) -+#define fifo_pop(fifo, i) fifo_pop_front(fifo, (i)) -+#define fifo_peek(fifo) fifo_peek_front(fifo) -+ -+#define fifo_for_each_entry(_entry, _fifo, _iter) \ -+ for (typecheck(typeof((_fifo)->front), _iter), \ -+ (_iter) = (_fifo)->front; \ -+ ((_iter != (_fifo)->back) && \ -+ (_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true)); \ -+ (_iter)++) -+ -+#define fifo_for_each_entry_ptr(_ptr, _fifo, _iter) \ -+ for (typecheck(typeof((_fifo)->front), _iter), \ -+ (_iter) = (_fifo)->front; \ -+ ((_iter != (_fifo)->back) && \ -+ (_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true)); \ -+ (_iter)++) -+ -+#endif /* _BCACHEFS_FIFO_H */ -diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c -new file mode 100644 -index 000000000000..878419d40992 ---- /dev/null -+++ 
b/fs/bcachefs/fs-common.c -@@ -0,0 +1,317 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "acl.h" -+#include "btree_update.h" -+#include "dirent.h" -+#include "fs-common.h" -+#include "inode.h" -+#include "xattr.h" -+ -+#include -+ -+int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, -+ struct bch_inode_unpacked *dir_u, -+ struct bch_inode_unpacked *new_inode, -+ const struct qstr *name, -+ uid_t uid, gid_t gid, umode_t mode, dev_t rdev, -+ struct posix_acl *default_acl, -+ struct posix_acl *acl) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter *dir_iter = NULL; -+ struct bch_hash_info hash = bch2_hash_info_init(c, new_inode); -+ u64 now = bch2_current_time(trans->c); -+ int ret; -+ -+ dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(dir_iter); -+ if (ret) -+ goto err; -+ -+ bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); -+ -+ if (!name) -+ new_inode->bi_flags |= BCH_INODE_UNLINKED; -+ -+ ret = bch2_inode_create(trans, new_inode, -+ BLOCKDEV_INODE_MAX, 0, -+ &c->unused_inode_hint); -+ if (ret) -+ goto err; -+ -+ if (default_acl) { -+ ret = bch2_set_acl_trans(trans, new_inode, &hash, -+ default_acl, ACL_TYPE_DEFAULT); -+ if (ret) -+ goto err; -+ } -+ -+ if (acl) { -+ ret = bch2_set_acl_trans(trans, new_inode, &hash, -+ acl, ACL_TYPE_ACCESS); -+ if (ret) -+ goto err; -+ } -+ -+ if (name) { -+ struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u); -+ dir_u->bi_mtime = dir_u->bi_ctime = now; -+ -+ if (S_ISDIR(new_inode->bi_mode)) -+ dir_u->bi_nlink++; -+ -+ ret = bch2_inode_write(trans, dir_iter, dir_u); -+ if (ret) -+ goto err; -+ -+ ret = bch2_dirent_create(trans, dir_inum, &dir_hash, -+ mode_to_type(new_inode->bi_mode), -+ name, new_inode->bi_inum, -+ BCH_HASH_SET_MUST_CREATE); -+ if (ret) -+ goto err; -+ } -+err: -+ bch2_trans_iter_put(trans, dir_iter); -+ return ret; -+} -+ -+int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, -+ 
u64 inum, struct bch_inode_unpacked *dir_u, -+ struct bch_inode_unpacked *inode_u, const struct qstr *name) -+{ -+ struct btree_iter *dir_iter = NULL, *inode_iter = NULL; -+ struct bch_hash_info dir_hash; -+ u64 now = bch2_current_time(trans->c); -+ int ret; -+ -+ inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(inode_iter); -+ if (ret) -+ goto err; -+ -+ inode_u->bi_ctime = now; -+ bch2_inode_nlink_inc(inode_u); -+ -+ dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, 0); -+ ret = PTR_ERR_OR_ZERO(dir_iter); -+ if (ret) -+ goto err; -+ -+ dir_u->bi_mtime = dir_u->bi_ctime = now; -+ -+ dir_hash = bch2_hash_info_init(trans->c, dir_u); -+ -+ ret = bch2_dirent_create(trans, dir_inum, &dir_hash, -+ mode_to_type(inode_u->bi_mode), -+ name, inum, BCH_HASH_SET_MUST_CREATE) ?: -+ bch2_inode_write(trans, dir_iter, dir_u) ?: -+ bch2_inode_write(trans, inode_iter, inode_u); -+err: -+ bch2_trans_iter_put(trans, dir_iter); -+ bch2_trans_iter_put(trans, inode_iter); -+ return ret; -+} -+ -+int bch2_unlink_trans(struct btree_trans *trans, -+ u64 dir_inum, struct bch_inode_unpacked *dir_u, -+ struct bch_inode_unpacked *inode_u, -+ const struct qstr *name) -+{ -+ struct btree_iter *dir_iter = NULL, *dirent_iter = NULL, -+ *inode_iter = NULL; -+ struct bch_hash_info dir_hash; -+ u64 inum, now = bch2_current_time(trans->c); -+ struct bkey_s_c k; -+ int ret; -+ -+ dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(dir_iter); -+ if (ret) -+ goto err; -+ -+ dir_hash = bch2_hash_info_init(trans->c, dir_u); -+ -+ dirent_iter = __bch2_dirent_lookup_trans(trans, dir_inum, &dir_hash, -+ name, BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(dirent_iter); -+ if (ret) -+ goto err; -+ -+ k = bch2_btree_iter_peek_slot(dirent_iter); -+ inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); -+ -+ inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(inode_iter); -+ if 
(ret) -+ goto err; -+ -+ dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; -+ dir_u->bi_nlink -= S_ISDIR(inode_u->bi_mode); -+ bch2_inode_nlink_dec(inode_u); -+ -+ ret = (S_ISDIR(inode_u->bi_mode) -+ ? bch2_empty_dir_trans(trans, inum) -+ : 0) ?: -+ bch2_dirent_delete_at(trans, &dir_hash, dirent_iter) ?: -+ bch2_inode_write(trans, dir_iter, dir_u) ?: -+ bch2_inode_write(trans, inode_iter, inode_u); -+err: -+ bch2_trans_iter_put(trans, inode_iter); -+ bch2_trans_iter_put(trans, dirent_iter); -+ bch2_trans_iter_put(trans, dir_iter); -+ return ret; -+} -+ -+bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u, -+ struct bch_inode_unpacked *src_u) -+{ -+ u64 src, dst; -+ unsigned id; -+ bool ret = false; -+ -+ for (id = 0; id < Inode_opt_nr; id++) { -+ if (dst_u->bi_fields_set & (1 << id)) -+ continue; -+ -+ src = bch2_inode_opt_get(src_u, id); -+ dst = bch2_inode_opt_get(dst_u, id); -+ -+ if (src == dst) -+ continue; -+ -+ bch2_inode_opt_set(dst_u, id, src); -+ ret = true; -+ } -+ -+ return ret; -+} -+ -+int bch2_rename_trans(struct btree_trans *trans, -+ u64 src_dir, struct bch_inode_unpacked *src_dir_u, -+ u64 dst_dir, struct bch_inode_unpacked *dst_dir_u, -+ struct bch_inode_unpacked *src_inode_u, -+ struct bch_inode_unpacked *dst_inode_u, -+ const struct qstr *src_name, -+ const struct qstr *dst_name, -+ enum bch_rename_mode mode) -+{ -+ struct btree_iter *src_dir_iter = NULL, *dst_dir_iter = NULL; -+ struct btree_iter *src_inode_iter = NULL, *dst_inode_iter = NULL; -+ struct bch_hash_info src_hash, dst_hash; -+ u64 src_inode, dst_inode, now = bch2_current_time(trans->c); -+ int ret; -+ -+ src_dir_iter = bch2_inode_peek(trans, src_dir_u, src_dir, -+ BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(src_dir_iter); -+ if (ret) -+ goto err; -+ -+ src_hash = bch2_hash_info_init(trans->c, src_dir_u); -+ -+ if (dst_dir != src_dir) { -+ dst_dir_iter = bch2_inode_peek(trans, dst_dir_u, dst_dir, -+ BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(dst_dir_iter); -+ 
if (ret) -+ goto err; -+ -+ dst_hash = bch2_hash_info_init(trans->c, dst_dir_u); -+ } else { -+ dst_dir_u = src_dir_u; -+ dst_hash = src_hash; -+ } -+ -+ ret = bch2_dirent_rename(trans, -+ src_dir, &src_hash, -+ dst_dir, &dst_hash, -+ src_name, &src_inode, -+ dst_name, &dst_inode, -+ mode); -+ if (ret) -+ goto err; -+ -+ src_inode_iter = bch2_inode_peek(trans, src_inode_u, src_inode, -+ BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(src_inode_iter); -+ if (ret) -+ goto err; -+ -+ if (dst_inode) { -+ dst_inode_iter = bch2_inode_peek(trans, dst_inode_u, dst_inode, -+ BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(dst_inode_iter); -+ if (ret) -+ goto err; -+ } -+ -+ if (mode == BCH_RENAME_OVERWRITE) { -+ if (S_ISDIR(src_inode_u->bi_mode) != -+ S_ISDIR(dst_inode_u->bi_mode)) { -+ ret = -ENOTDIR; -+ goto err; -+ } -+ -+ if (S_ISDIR(dst_inode_u->bi_mode) && -+ bch2_empty_dir_trans(trans, dst_inode)) { -+ ret = -ENOTEMPTY; -+ goto err; -+ } -+ } -+ -+ if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) && -+ S_ISDIR(src_inode_u->bi_mode)) { -+ ret = -EXDEV; -+ goto err; -+ } -+ -+ if (mode == BCH_RENAME_EXCHANGE && -+ bch2_reinherit_attrs(dst_inode_u, src_dir_u) && -+ S_ISDIR(dst_inode_u->bi_mode)) { -+ ret = -EXDEV; -+ goto err; -+ } -+ -+ if (S_ISDIR(src_inode_u->bi_mode)) { -+ src_dir_u->bi_nlink--; -+ dst_dir_u->bi_nlink++; -+ } -+ -+ if (dst_inode && S_ISDIR(dst_inode_u->bi_mode)) { -+ dst_dir_u->bi_nlink--; -+ src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE; -+ } -+ -+ if (mode == BCH_RENAME_OVERWRITE) -+ bch2_inode_nlink_dec(dst_inode_u); -+ -+ src_dir_u->bi_mtime = now; -+ src_dir_u->bi_ctime = now; -+ -+ if (src_dir != dst_dir) { -+ dst_dir_u->bi_mtime = now; -+ dst_dir_u->bi_ctime = now; -+ } -+ -+ src_inode_u->bi_ctime = now; -+ -+ if (dst_inode) -+ dst_inode_u->bi_ctime = now; -+ -+ ret = bch2_inode_write(trans, src_dir_iter, src_dir_u) ?: -+ (src_dir != dst_dir -+ ? 
bch2_inode_write(trans, dst_dir_iter, dst_dir_u) -+ : 0 ) ?: -+ bch2_inode_write(trans, src_inode_iter, src_inode_u) ?: -+ (dst_inode -+ ? bch2_inode_write(trans, dst_inode_iter, dst_inode_u) -+ : 0 ); -+err: -+ bch2_trans_iter_put(trans, dst_inode_iter); -+ bch2_trans_iter_put(trans, src_inode_iter); -+ bch2_trans_iter_put(trans, dst_dir_iter); -+ bch2_trans_iter_put(trans, src_dir_iter); -+ return ret; -+} -diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h -new file mode 100644 -index 000000000000..2273b7961c9b ---- /dev/null -+++ b/fs/bcachefs/fs-common.h -@@ -0,0 +1,37 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_FS_COMMON_H -+#define _BCACHEFS_FS_COMMON_H -+ -+struct posix_acl; -+ -+int bch2_create_trans(struct btree_trans *, u64, -+ struct bch_inode_unpacked *, -+ struct bch_inode_unpacked *, -+ const struct qstr *, -+ uid_t, gid_t, umode_t, dev_t, -+ struct posix_acl *, -+ struct posix_acl *); -+ -+int bch2_link_trans(struct btree_trans *, u64, -+ u64, struct bch_inode_unpacked *, -+ struct bch_inode_unpacked *, -+ const struct qstr *); -+ -+int bch2_unlink_trans(struct btree_trans *, -+ u64, struct bch_inode_unpacked *, -+ struct bch_inode_unpacked *, -+ const struct qstr *); -+ -+int bch2_rename_trans(struct btree_trans *, -+ u64, struct bch_inode_unpacked *, -+ u64, struct bch_inode_unpacked *, -+ struct bch_inode_unpacked *, -+ struct bch_inode_unpacked *, -+ const struct qstr *, -+ const struct qstr *, -+ enum bch_rename_mode); -+ -+bool bch2_reinherit_attrs(struct bch_inode_unpacked *, -+ struct bch_inode_unpacked *); -+ -+#endif /* _BCACHEFS_FS_COMMON_H */ -diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c -new file mode 100644 -index 000000000000..55004998536d ---- /dev/null -+++ b/fs/bcachefs/fs-io.c -@@ -0,0 +1,3133 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#ifndef NO_BCACHEFS_FS -+ -+#include "bcachefs.h" -+#include "alloc_foreground.h" -+#include "bkey_on_stack.h" -+#include "btree_update.h" -+#include 
"buckets.h" -+#include "clock.h" -+#include "error.h" -+#include "extents.h" -+#include "extent_update.h" -+#include "fs.h" -+#include "fs-io.h" -+#include "fsck.h" -+#include "inode.h" -+#include "journal.h" -+#include "io.h" -+#include "keylist.h" -+#include "quota.h" -+#include "reflink.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+ -+struct quota_res { -+ u64 sectors; -+}; -+ -+struct bch_writepage_io { -+ struct closure cl; -+ struct bch_inode_info *inode; -+ -+ /* must be last: */ -+ struct bch_write_op op; -+}; -+ -+struct dio_write { -+ struct completion done; -+ struct kiocb *req; -+ struct mm_struct *mm; -+ unsigned loop:1, -+ sync:1, -+ free_iov:1; -+ struct quota_res quota_res; -+ u64 written; -+ -+ struct iov_iter iter; -+ struct iovec inline_vecs[2]; -+ -+ /* must be last: */ -+ struct bch_write_op op; -+}; -+ -+struct dio_read { -+ struct closure cl; -+ struct kiocb *req; -+ long ret; -+ struct bch_read_bio rbio; -+}; -+ -+/* pagecache_block must be held */ -+static int write_invalidate_inode_pages_range(struct address_space *mapping, -+ loff_t start, loff_t end) -+{ -+ int ret; -+ -+ /* -+ * XXX: the way this is currently implemented, we can spin if a process -+ * is continually redirtying a specific page -+ */ -+ do { -+ if (!mapping->nrpages && -+ !mapping->nrexceptional) -+ return 0; -+ -+ ret = filemap_write_and_wait_range(mapping, start, end); -+ if (ret) -+ break; -+ -+ if (!mapping->nrpages) -+ return 0; -+ -+ ret = invalidate_inode_pages2_range(mapping, -+ start >> PAGE_SHIFT, -+ end >> PAGE_SHIFT); -+ } while (ret == -EBUSY); -+ -+ return ret; -+} -+ -+/* quotas */ -+ -+#ifdef CONFIG_BCACHEFS_QUOTA -+ -+static void bch2_quota_reservation_put(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct quota_res *res) -+{ -+ if (!res->sectors) -+ return; -+ -+ mutex_lock(&inode->ei_quota_lock); -+ BUG_ON(res->sectors > inode->ei_quota_reserved); 
-+ -+ bch2_quota_acct(c, inode->ei_qid, Q_SPC, -+ -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC); -+ inode->ei_quota_reserved -= res->sectors; -+ mutex_unlock(&inode->ei_quota_lock); -+ -+ res->sectors = 0; -+} -+ -+static int bch2_quota_reservation_add(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct quota_res *res, -+ unsigned sectors, -+ bool check_enospc) -+{ -+ int ret; -+ -+ mutex_lock(&inode->ei_quota_lock); -+ ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, -+ check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK); -+ if (likely(!ret)) { -+ inode->ei_quota_reserved += sectors; -+ res->sectors += sectors; -+ } -+ mutex_unlock(&inode->ei_quota_lock); -+ -+ return ret; -+} -+ -+#else -+ -+static void bch2_quota_reservation_put(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct quota_res *res) -+{ -+} -+ -+static int bch2_quota_reservation_add(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct quota_res *res, -+ unsigned sectors, -+ bool check_enospc) -+{ -+ return 0; -+} -+ -+#endif -+ -+/* i_size updates: */ -+ -+struct inode_new_size { -+ loff_t new_size; -+ u64 now; -+ unsigned fields; -+}; -+ -+static int inode_set_size(struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, -+ void *p) -+{ -+ struct inode_new_size *s = p; -+ -+ bi->bi_size = s->new_size; -+ if (s->fields & ATTR_ATIME) -+ bi->bi_atime = s->now; -+ if (s->fields & ATTR_MTIME) -+ bi->bi_mtime = s->now; -+ if (s->fields & ATTR_CTIME) -+ bi->bi_ctime = s->now; -+ -+ return 0; -+} -+ -+int __must_check bch2_write_inode_size(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ loff_t new_size, unsigned fields) -+{ -+ struct inode_new_size s = { -+ .new_size = new_size, -+ .now = bch2_current_time(c), -+ .fields = fields, -+ }; -+ -+ return bch2_write_inode(c, inode, inode_set_size, &s, fields); -+} -+ -+static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, -+ struct quota_res *quota_res, s64 sectors) -+{ -+ if 
(!sectors) -+ return; -+ -+ mutex_lock(&inode->ei_quota_lock); -+#ifdef CONFIG_BCACHEFS_QUOTA -+ if (quota_res && sectors > 0) { -+ BUG_ON(sectors > quota_res->sectors); -+ BUG_ON(sectors > inode->ei_quota_reserved); -+ -+ quota_res->sectors -= sectors; -+ inode->ei_quota_reserved -= sectors; -+ } else { -+ bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN); -+ } -+#endif -+ inode->v.i_blocks += sectors; -+ mutex_unlock(&inode->ei_quota_lock); -+} -+ -+/* page state: */ -+ -+/* stored in page->private: */ -+ -+struct bch_page_sector { -+ /* Uncompressed, fully allocated replicas: */ -+ unsigned nr_replicas:3; -+ -+ /* Owns PAGE_SECTORS * replicas_reserved sized reservation: */ -+ unsigned replicas_reserved:3; -+ -+ /* i_sectors: */ -+ enum { -+ SECTOR_UNALLOCATED, -+ SECTOR_RESERVED, -+ SECTOR_DIRTY, -+ SECTOR_ALLOCATED, -+ } state:2; -+}; -+ -+struct bch_page_state { -+ spinlock_t lock; -+ atomic_t write_count; -+ struct bch_page_sector s[PAGE_SECTORS]; -+}; -+ -+static inline struct bch_page_state *__bch2_page_state(struct page *page) -+{ -+ return page_has_private(page) -+ ? 
(struct bch_page_state *) page_private(page) -+ : NULL; -+} -+ -+static inline struct bch_page_state *bch2_page_state(struct page *page) -+{ -+ EBUG_ON(!PageLocked(page)); -+ -+ return __bch2_page_state(page); -+} -+ -+/* for newly allocated pages: */ -+static void __bch2_page_state_release(struct page *page) -+{ -+ struct bch_page_state *s = __bch2_page_state(page); -+ -+ if (!s) -+ return; -+ -+ ClearPagePrivate(page); -+ set_page_private(page, 0); -+ put_page(page); -+ kfree(s); -+} -+ -+static void bch2_page_state_release(struct page *page) -+{ -+ struct bch_page_state *s = bch2_page_state(page); -+ -+ if (!s) -+ return; -+ -+ ClearPagePrivate(page); -+ set_page_private(page, 0); -+ put_page(page); -+ kfree(s); -+} -+ -+/* for newly allocated pages: */ -+static struct bch_page_state *__bch2_page_state_create(struct page *page, -+ gfp_t gfp) -+{ -+ struct bch_page_state *s; -+ -+ s = kzalloc(sizeof(*s), GFP_NOFS|gfp); -+ if (!s) -+ return NULL; -+ -+ spin_lock_init(&s->lock); -+ /* -+ * migrate_page_move_mapping() assumes that pages with private data -+ * have their count elevated by 1. -+ */ -+ get_page(page); -+ set_page_private(page, (unsigned long) s); -+ SetPagePrivate(page); -+ return s; -+} -+ -+static struct bch_page_state *bch2_page_state_create(struct page *page, -+ gfp_t gfp) -+{ -+ return bch2_page_state(page) ?: __bch2_page_state_create(page, gfp); -+} -+ -+static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) -+{ -+ /* XXX: this should not be open coded */ -+ return inode->ei_inode.bi_data_replicas -+ ? 
inode->ei_inode.bi_data_replicas - 1 -+ : c->opts.data_replicas; -+} -+ -+static inline unsigned sectors_to_reserve(struct bch_page_sector *s, -+ unsigned nr_replicas) -+{ -+ return max(0, (int) nr_replicas - -+ s->nr_replicas - -+ s->replicas_reserved); -+} -+ -+static int bch2_get_page_disk_reservation(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct page *page, bool check_enospc) -+{ -+ struct bch_page_state *s = bch2_page_state_create(page, 0); -+ unsigned nr_replicas = inode_nr_replicas(c, inode); -+ struct disk_reservation disk_res = { 0 }; -+ unsigned i, disk_res_sectors = 0; -+ int ret; -+ -+ if (!s) -+ return -ENOMEM; -+ -+ for (i = 0; i < ARRAY_SIZE(s->s); i++) -+ disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas); -+ -+ if (!disk_res_sectors) -+ return 0; -+ -+ ret = bch2_disk_reservation_get(c, &disk_res, -+ disk_res_sectors, 1, -+ !check_enospc -+ ? BCH_DISK_RESERVATION_NOFAIL -+ : 0); -+ if (unlikely(ret)) -+ return ret; -+ -+ for (i = 0; i < ARRAY_SIZE(s->s); i++) -+ s->s[i].replicas_reserved += -+ sectors_to_reserve(&s->s[i], nr_replicas); -+ -+ return 0; -+} -+ -+struct bch2_page_reservation { -+ struct disk_reservation disk; -+ struct quota_res quota; -+}; -+ -+static void bch2_page_reservation_init(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct bch2_page_reservation *res) -+{ -+ memset(res, 0, sizeof(*res)); -+ -+ res->disk.nr_replicas = inode_nr_replicas(c, inode); -+} -+ -+static void bch2_page_reservation_put(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct bch2_page_reservation *res) -+{ -+ bch2_disk_reservation_put(c, &res->disk); -+ bch2_quota_reservation_put(c, inode, &res->quota); -+} -+ -+static int bch2_page_reservation_get(struct bch_fs *c, -+ struct bch_inode_info *inode, struct page *page, -+ struct bch2_page_reservation *res, -+ unsigned offset, unsigned len, bool check_enospc) -+{ -+ struct bch_page_state *s = bch2_page_state_create(page, 0); -+ unsigned i, disk_sectors = 0, 
quota_sectors = 0; -+ int ret; -+ -+ if (!s) -+ return -ENOMEM; -+ -+ for (i = round_down(offset, block_bytes(c)) >> 9; -+ i < round_up(offset + len, block_bytes(c)) >> 9; -+ i++) { -+ disk_sectors += sectors_to_reserve(&s->s[i], -+ res->disk.nr_replicas); -+ quota_sectors += s->s[i].state == SECTOR_UNALLOCATED; -+ } -+ -+ if (disk_sectors) { -+ ret = bch2_disk_reservation_add(c, &res->disk, -+ disk_sectors, -+ !check_enospc -+ ? BCH_DISK_RESERVATION_NOFAIL -+ : 0); -+ if (unlikely(ret)) -+ return ret; -+ } -+ -+ if (quota_sectors) { -+ ret = bch2_quota_reservation_add(c, inode, &res->quota, -+ quota_sectors, -+ check_enospc); -+ if (unlikely(ret)) { -+ struct disk_reservation tmp = { -+ .sectors = disk_sectors -+ }; -+ -+ bch2_disk_reservation_put(c, &tmp); -+ res->disk.sectors -= disk_sectors; -+ return ret; -+ } -+ } -+ -+ return 0; -+} -+ -+static void bch2_clear_page_bits(struct page *page) -+{ -+ struct bch_inode_info *inode = to_bch_ei(page->mapping->host); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch_page_state *s = bch2_page_state(page); -+ struct disk_reservation disk_res = { 0 }; -+ int i, dirty_sectors = 0; -+ -+ if (!s) -+ return; -+ -+ EBUG_ON(!PageLocked(page)); -+ EBUG_ON(PageWriteback(page)); -+ -+ for (i = 0; i < ARRAY_SIZE(s->s); i++) { -+ disk_res.sectors += s->s[i].replicas_reserved; -+ s->s[i].replicas_reserved = 0; -+ -+ if (s->s[i].state == SECTOR_DIRTY) { -+ dirty_sectors++; -+ s->s[i].state = SECTOR_UNALLOCATED; -+ } -+ } -+ -+ bch2_disk_reservation_put(c, &disk_res); -+ -+ if (dirty_sectors) -+ i_sectors_acct(c, inode, NULL, -dirty_sectors); -+ -+ bch2_page_state_release(page); -+} -+ -+static void bch2_set_page_dirty(struct bch_fs *c, -+ struct bch_inode_info *inode, struct page *page, -+ struct bch2_page_reservation *res, -+ unsigned offset, unsigned len) -+{ -+ struct bch_page_state *s = bch2_page_state(page); -+ unsigned i, dirty_sectors = 0; -+ -+ WARN_ON((u64) page_offset(page) + offset + len > -+ round_up((u64) 
i_size_read(&inode->v), block_bytes(c))); -+ -+ spin_lock(&s->lock); -+ -+ for (i = round_down(offset, block_bytes(c)) >> 9; -+ i < round_up(offset + len, block_bytes(c)) >> 9; -+ i++) { -+ unsigned sectors = sectors_to_reserve(&s->s[i], -+ res->disk.nr_replicas); -+ -+ /* -+ * This can happen if we race with the error path in -+ * bch2_writepage_io_done(): -+ */ -+ sectors = min_t(unsigned, sectors, res->disk.sectors); -+ -+ s->s[i].replicas_reserved += sectors; -+ res->disk.sectors -= sectors; -+ -+ if (s->s[i].state == SECTOR_UNALLOCATED) -+ dirty_sectors++; -+ -+ s->s[i].state = max_t(unsigned, s->s[i].state, SECTOR_DIRTY); -+ } -+ -+ spin_unlock(&s->lock); -+ -+ if (dirty_sectors) -+ i_sectors_acct(c, inode, &res->quota, dirty_sectors); -+ -+ if (!PageDirty(page)) -+ __set_page_dirty_nobuffers(page); -+} -+ -+vm_fault_t bch2_page_fault(struct vm_fault *vmf) -+{ -+ struct file *file = vmf->vma->vm_file; -+ struct bch_inode_info *inode = file_bch_inode(file); -+ int ret; -+ -+ bch2_pagecache_add_get(&inode->ei_pagecache_lock); -+ ret = filemap_fault(vmf); -+ bch2_pagecache_add_put(&inode->ei_pagecache_lock); -+ -+ return ret; -+} -+ -+vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) -+{ -+ struct page *page = vmf->page; -+ struct file *file = vmf->vma->vm_file; -+ struct bch_inode_info *inode = file_bch_inode(file); -+ struct address_space *mapping = file->f_mapping; -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch2_page_reservation res; -+ unsigned len; -+ loff_t isize; -+ int ret = VM_FAULT_LOCKED; -+ -+ bch2_page_reservation_init(c, inode, &res); -+ -+ sb_start_pagefault(inode->v.i_sb); -+ file_update_time(file); -+ -+ /* -+ * Not strictly necessary, but helps avoid dio writes livelocking in -+ * write_invalidate_inode_pages_range() - can drop this if/when we get -+ * a write_invalidate_inode_pages_range() that works without dropping -+ * page lock before invalidating page -+ */ -+ bch2_pagecache_add_get(&inode->ei_pagecache_lock); -+ -+ 
lock_page(page); -+ isize = i_size_read(&inode->v); -+ -+ if (page->mapping != mapping || page_offset(page) >= isize) { -+ unlock_page(page); -+ ret = VM_FAULT_NOPAGE; -+ goto out; -+ } -+ -+ len = min_t(loff_t, PAGE_SIZE, isize - page_offset(page)); -+ -+ if (bch2_page_reservation_get(c, inode, page, &res, 0, len, true)) { -+ unlock_page(page); -+ ret = VM_FAULT_SIGBUS; -+ goto out; -+ } -+ -+ bch2_set_page_dirty(c, inode, page, &res, 0, len); -+ bch2_page_reservation_put(c, inode, &res); -+ -+ wait_for_stable_page(page); -+out: -+ bch2_pagecache_add_put(&inode->ei_pagecache_lock); -+ sb_end_pagefault(inode->v.i_sb); -+ -+ return ret; -+} -+ -+void bch2_invalidatepage(struct page *page, unsigned int offset, -+ unsigned int length) -+{ -+ if (offset || length < PAGE_SIZE) -+ return; -+ -+ bch2_clear_page_bits(page); -+} -+ -+int bch2_releasepage(struct page *page, gfp_t gfp_mask) -+{ -+ if (PageDirty(page)) -+ return 0; -+ -+ bch2_clear_page_bits(page); -+ return 1; -+} -+ -+#ifdef CONFIG_MIGRATION -+int bch2_migrate_page(struct address_space *mapping, struct page *newpage, -+ struct page *page, enum migrate_mode mode) -+{ -+ int ret; -+ -+ EBUG_ON(!PageLocked(page)); -+ EBUG_ON(!PageLocked(newpage)); -+ -+ ret = migrate_page_move_mapping(mapping, newpage, page, 0); -+ if (ret != MIGRATEPAGE_SUCCESS) -+ return ret; -+ -+ if (PagePrivate(page)) { -+ ClearPagePrivate(page); -+ get_page(newpage); -+ set_page_private(newpage, page_private(page)); -+ set_page_private(page, 0); -+ put_page(page); -+ SetPagePrivate(newpage); -+ } -+ -+ if (mode != MIGRATE_SYNC_NO_COPY) -+ migrate_page_copy(newpage, page); -+ else -+ migrate_page_states(newpage, page); -+ return MIGRATEPAGE_SUCCESS; -+} -+#endif -+ -+/* readpage(s): */ -+ -+static void bch2_readpages_end_io(struct bio *bio) -+{ -+ struct bvec_iter_all iter; -+ struct bio_vec *bv; -+ -+ bio_for_each_segment_all(bv, bio, iter) { -+ struct page *page = bv->bv_page; -+ -+ if (!bio->bi_status) { -+ SetPageUptodate(page); -+ } 
else { -+ ClearPageUptodate(page); -+ SetPageError(page); -+ } -+ unlock_page(page); -+ } -+ -+ bio_put(bio); -+} -+ -+static inline void page_state_init_for_read(struct page *page) -+{ -+ SetPagePrivate(page); -+ page->private = 0; -+} -+ -+struct readpages_iter { -+ struct address_space *mapping; -+ struct page **pages; -+ unsigned nr_pages; -+ unsigned nr_added; -+ unsigned idx; -+ pgoff_t offset; -+}; -+ -+static int readpages_iter_init(struct readpages_iter *iter, -+ struct address_space *mapping, -+ struct list_head *pages, unsigned nr_pages) -+{ -+ memset(iter, 0, sizeof(*iter)); -+ -+ iter->mapping = mapping; -+ iter->offset = list_last_entry(pages, struct page, lru)->index; -+ -+ iter->pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS); -+ if (!iter->pages) -+ return -ENOMEM; -+ -+ while (!list_empty(pages)) { -+ struct page *page = list_last_entry(pages, struct page, lru); -+ -+ __bch2_page_state_create(page, __GFP_NOFAIL); -+ -+ iter->pages[iter->nr_pages++] = page; -+ list_del(&page->lru); -+ } -+ -+ return 0; -+} -+ -+static inline struct page *readpage_iter_next(struct readpages_iter *iter) -+{ -+ struct page *page; -+ unsigned i; -+ int ret; -+ -+ BUG_ON(iter->idx > iter->nr_added); -+ BUG_ON(iter->nr_added > iter->nr_pages); -+ -+ if (iter->idx < iter->nr_added) -+ goto out; -+ -+ while (1) { -+ if (iter->idx == iter->nr_pages) -+ return NULL; -+ -+ ret = add_to_page_cache_lru_vec(iter->mapping, -+ iter->pages + iter->nr_added, -+ iter->nr_pages - iter->nr_added, -+ iter->offset + iter->nr_added, -+ GFP_NOFS); -+ if (ret > 0) -+ break; -+ -+ page = iter->pages[iter->nr_added]; -+ iter->idx++; -+ iter->nr_added++; -+ -+ __bch2_page_state_release(page); -+ put_page(page); -+ } -+ -+ iter->nr_added += ret; -+ -+ for (i = iter->idx; i < iter->nr_added; i++) -+ put_page(iter->pages[i]); -+out: -+ EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx); -+ -+ return iter->pages[iter->idx]; -+} -+ -+static void 
bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) -+{ -+ struct bvec_iter iter; -+ struct bio_vec bv; -+ unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v -+ ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k); -+ unsigned state = k.k->type == KEY_TYPE_reservation -+ ? SECTOR_RESERVED -+ : SECTOR_ALLOCATED; -+ -+ bio_for_each_segment(bv, bio, iter) { -+ struct bch_page_state *s = bch2_page_state(bv.bv_page); -+ unsigned i; -+ -+ for (i = bv.bv_offset >> 9; -+ i < (bv.bv_offset + bv.bv_len) >> 9; -+ i++) { -+ s->s[i].nr_replicas = nr_ptrs; -+ s->s[i].state = state; -+ } -+ } -+} -+ -+static bool extent_partial_reads_expensive(struct bkey_s_c k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ struct bch_extent_crc_unpacked crc; -+ const union bch_extent_entry *i; -+ -+ bkey_for_each_crc(k.k, ptrs, crc, i) -+ if (crc.csum_type || crc.compression_type) -+ return true; -+ return false; -+} -+ -+static void readpage_bio_extend(struct readpages_iter *iter, -+ struct bio *bio, -+ unsigned sectors_this_extent, -+ bool get_more) -+{ -+ while (bio_sectors(bio) < sectors_this_extent && -+ bio->bi_vcnt < bio->bi_max_vecs) { -+ pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTOR_SHIFT; -+ struct page *page = readpage_iter_next(iter); -+ int ret; -+ -+ if (page) { -+ if (iter->offset + iter->idx != page_offset) -+ break; -+ -+ iter->idx++; -+ } else { -+ if (!get_more) -+ break; -+ -+ page = xa_load(&iter->mapping->i_pages, page_offset); -+ if (page && !xa_is_value(page)) -+ break; -+ -+ page = __page_cache_alloc(readahead_gfp_mask(iter->mapping)); -+ if (!page) -+ break; -+ -+ if (!__bch2_page_state_create(page, 0)) { -+ put_page(page); -+ break; -+ } -+ -+ ret = add_to_page_cache_lru(page, iter->mapping, -+ page_offset, GFP_NOFS); -+ if (ret) { -+ __bch2_page_state_release(page); -+ put_page(page); -+ break; -+ } -+ -+ put_page(page); -+ } -+ -+ BUG_ON(!bio_add_page(bio, page, PAGE_SIZE, 0)); -+ } -+} -+ -+static void bchfs_read(struct btree_trans *trans, struct 
btree_iter *iter, -+ struct bch_read_bio *rbio, u64 inum, -+ struct readpages_iter *readpages_iter) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_on_stack sk; -+ int flags = BCH_READ_RETRY_IF_STALE| -+ BCH_READ_MAY_PROMOTE; -+ int ret = 0; -+ -+ rbio->c = c; -+ rbio->start_time = local_clock(); -+ -+ bkey_on_stack_init(&sk); -+retry: -+ while (1) { -+ struct bkey_s_c k; -+ unsigned bytes, sectors, offset_into_extent; -+ -+ bch2_btree_iter_set_pos(iter, -+ POS(inum, rbio->bio.bi_iter.bi_sector)); -+ -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) -+ break; -+ -+ bkey_on_stack_reassemble(&sk, c, k); -+ k = bkey_i_to_s_c(sk.k); -+ -+ offset_into_extent = iter->pos.offset - -+ bkey_start_offset(k.k); -+ sectors = k.k->size - offset_into_extent; -+ -+ ret = bch2_read_indirect_extent(trans, -+ &offset_into_extent, &sk); -+ if (ret) -+ break; -+ -+ sectors = min(sectors, k.k->size - offset_into_extent); -+ -+ bch2_trans_unlock(trans); -+ -+ if (readpages_iter) -+ readpage_bio_extend(readpages_iter, &rbio->bio, sectors, -+ extent_partial_reads_expensive(k)); -+ -+ bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; -+ swap(rbio->bio.bi_iter.bi_size, bytes); -+ -+ if (rbio->bio.bi_iter.bi_size == bytes) -+ flags |= BCH_READ_LAST_FRAGMENT; -+ -+ if (bkey_extent_is_allocation(k.k)) -+ bch2_add_page_sectors(&rbio->bio, k); -+ -+ bch2_read_extent(c, rbio, k, offset_into_extent, flags); -+ -+ if (flags & BCH_READ_LAST_FRAGMENT) -+ break; -+ -+ swap(rbio->bio.bi_iter.bi_size, bytes); -+ bio_advance(&rbio->bio, bytes); -+ } -+ -+ if (ret == -EINTR) -+ goto retry; -+ -+ if (ret) { -+ bcache_io_error(c, &rbio->bio, "btree IO error %i", ret); -+ bio_endio(&rbio->bio); -+ } -+ -+ bkey_on_stack_exit(&sk, c); -+} -+ -+int bch2_readpages(struct file *file, struct address_space *mapping, -+ struct list_head *pages, unsigned nr_pages) -+{ -+ struct bch_inode_info *inode = to_bch_ei(mapping->host); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct 
bch_io_opts opts = io_opts(c, &inode->ei_inode); -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct page *page; -+ struct readpages_iter readpages_iter; -+ int ret; -+ -+ ret = readpages_iter_init(&readpages_iter, mapping, pages, nr_pages); -+ BUG_ON(ret); -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, -+ BTREE_ITER_SLOTS); -+ -+ bch2_pagecache_add_get(&inode->ei_pagecache_lock); -+ -+ while ((page = readpage_iter_next(&readpages_iter))) { -+ pgoff_t index = readpages_iter.offset + readpages_iter.idx; -+ unsigned n = min_t(unsigned, -+ readpages_iter.nr_pages - -+ readpages_iter.idx, -+ BIO_MAX_PAGES); -+ struct bch_read_bio *rbio = -+ rbio_init(bio_alloc_bioset(GFP_NOFS, n, &c->bio_read), -+ opts); -+ -+ readpages_iter.idx++; -+ -+ bio_set_op_attrs(&rbio->bio, REQ_OP_READ, 0); -+ rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTOR_SHIFT; -+ rbio->bio.bi_end_io = bch2_readpages_end_io; -+ BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); -+ -+ bchfs_read(&trans, iter, rbio, inode->v.i_ino, -+ &readpages_iter); -+ } -+ -+ bch2_pagecache_add_put(&inode->ei_pagecache_lock); -+ -+ bch2_trans_exit(&trans); -+ kfree(readpages_iter.pages); -+ -+ return 0; -+} -+ -+static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, -+ u64 inum, struct page *page) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ -+ bch2_page_state_create(page, __GFP_NOFAIL); -+ -+ bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC); -+ rbio->bio.bi_iter.bi_sector = -+ (sector_t) page->index << PAGE_SECTOR_SHIFT; -+ BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, -+ BTREE_ITER_SLOTS); -+ -+ bchfs_read(&trans, iter, rbio, inum, NULL); -+ -+ bch2_trans_exit(&trans); -+} -+ -+int bch2_readpage(struct file *file, struct page *page) -+{ -+ struct bch_inode_info *inode = 
to_bch_ei(page->mapping->host); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch_io_opts opts = io_opts(c, &inode->ei_inode); -+ struct bch_read_bio *rbio; -+ -+ rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), opts); -+ rbio->bio.bi_end_io = bch2_readpages_end_io; -+ -+ __bchfs_readpage(c, rbio, inode->v.i_ino, page); -+ return 0; -+} -+ -+static void bch2_read_single_page_end_io(struct bio *bio) -+{ -+ complete(bio->bi_private); -+} -+ -+static int bch2_read_single_page(struct page *page, -+ struct address_space *mapping) -+{ -+ struct bch_inode_info *inode = to_bch_ei(mapping->host); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch_read_bio *rbio; -+ int ret; -+ DECLARE_COMPLETION_ONSTACK(done); -+ -+ rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), -+ io_opts(c, &inode->ei_inode)); -+ rbio->bio.bi_private = &done; -+ rbio->bio.bi_end_io = bch2_read_single_page_end_io; -+ -+ __bchfs_readpage(c, rbio, inode->v.i_ino, page); -+ wait_for_completion(&done); -+ -+ ret = blk_status_to_errno(rbio->bio.bi_status); -+ bio_put(&rbio->bio); -+ -+ if (ret < 0) -+ return ret; -+ -+ SetPageUptodate(page); -+ return 0; -+} -+ -+/* writepages: */ -+ -+struct bch_writepage_state { -+ struct bch_writepage_io *io; -+ struct bch_io_opts opts; -+}; -+ -+static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c, -+ struct bch_inode_info *inode) -+{ -+ return (struct bch_writepage_state) { -+ .opts = io_opts(c, &inode->ei_inode) -+ }; -+} -+ -+static void bch2_writepage_io_free(struct closure *cl) -+{ -+ struct bch_writepage_io *io = container_of(cl, -+ struct bch_writepage_io, cl); -+ -+ bio_put(&io->op.wbio.bio); -+} -+ -+static void bch2_writepage_io_done(struct closure *cl) -+{ -+ struct bch_writepage_io *io = container_of(cl, -+ struct bch_writepage_io, cl); -+ struct bch_fs *c = io->op.c; -+ struct bio *bio = &io->op.wbio.bio; -+ struct bvec_iter_all iter; -+ struct bio_vec *bvec; -+ unsigned i; -+ -+ 
if (io->op.error) { -+ bio_for_each_segment_all(bvec, bio, iter) { -+ struct bch_page_state *s; -+ -+ SetPageError(bvec->bv_page); -+ mapping_set_error(bvec->bv_page->mapping, -EIO); -+ -+ s = __bch2_page_state(bvec->bv_page); -+ spin_lock(&s->lock); -+ for (i = 0; i < PAGE_SECTORS; i++) -+ s->s[i].nr_replicas = 0; -+ spin_unlock(&s->lock); -+ } -+ } -+ -+ if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { -+ bio_for_each_segment_all(bvec, bio, iter) { -+ struct bch_page_state *s; -+ -+ s = __bch2_page_state(bvec->bv_page); -+ spin_lock(&s->lock); -+ for (i = 0; i < PAGE_SECTORS; i++) -+ s->s[i].nr_replicas = 0; -+ spin_unlock(&s->lock); -+ } -+ } -+ -+ /* -+ * racing with fallocate can cause us to add fewer sectors than -+ * expected - but we shouldn't add more sectors than expected: -+ */ -+ BUG_ON(io->op.i_sectors_delta > 0); -+ -+ /* -+ * (error (due to going RO) halfway through a page can screw that up -+ * slightly) -+ * XXX wtf? -+ BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS); -+ */ -+ -+ /* -+ * PageWriteback is effectively our ref on the inode - fixup i_blocks -+ * before calling end_page_writeback: -+ */ -+ i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); -+ -+ bio_for_each_segment_all(bvec, bio, iter) { -+ struct bch_page_state *s = __bch2_page_state(bvec->bv_page); -+ -+ if (atomic_dec_and_test(&s->write_count)) -+ end_page_writeback(bvec->bv_page); -+ } -+ -+ closure_return_with_destructor(&io->cl, bch2_writepage_io_free); -+} -+ -+static void bch2_writepage_do_io(struct bch_writepage_state *w) -+{ -+ struct bch_writepage_io *io = w->io; -+ -+ w->io = NULL; -+ closure_call(&io->op.cl, bch2_write, NULL, &io->cl); -+ continue_at(&io->cl, bch2_writepage_io_done, NULL); -+} -+ -+/* -+ * Get a bch_writepage_io and add @page to it - appending to an existing one if -+ * possible, else allocating a new one: -+ */ -+static void bch2_writepage_io_alloc(struct bch_fs *c, -+ struct writeback_control *wbc, -+ struct bch_writepage_state *w, -+ struct 
bch_inode_info *inode, -+ u64 sector, -+ unsigned nr_replicas) -+{ -+ struct bch_write_op *op; -+ -+ w->io = container_of(bio_alloc_bioset(GFP_NOFS, -+ BIO_MAX_PAGES, -+ &c->writepage_bioset), -+ struct bch_writepage_io, op.wbio.bio); -+ -+ closure_init(&w->io->cl, NULL); -+ w->io->inode = inode; -+ -+ op = &w->io->op; -+ bch2_write_op_init(op, c, w->opts); -+ op->target = w->opts.foreground_target; -+ op_journal_seq_set(op, &inode->ei_journal_seq); -+ op->nr_replicas = nr_replicas; -+ op->res.nr_replicas = nr_replicas; -+ op->write_point = writepoint_hashed(inode->ei_last_dirtied); -+ op->pos = POS(inode->v.i_ino, sector); -+ op->wbio.bio.bi_iter.bi_sector = sector; -+ op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); -+} -+ -+static int __bch2_writepage(struct page *page, -+ struct writeback_control *wbc, -+ void *data) -+{ -+ struct bch_inode_info *inode = to_bch_ei(page->mapping->host); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch_writepage_state *w = data; -+ struct bch_page_state *s, orig; -+ unsigned i, offset, nr_replicas_this_write = U32_MAX; -+ loff_t i_size = i_size_read(&inode->v); -+ pgoff_t end_index = i_size >> PAGE_SHIFT; -+ int ret; -+ -+ EBUG_ON(!PageUptodate(page)); -+ -+ /* Is the page fully inside i_size? */ -+ if (page->index < end_index) -+ goto do_io; -+ -+ /* Is the page fully outside i_size? (truncate in progress) */ -+ offset = i_size & (PAGE_SIZE - 1); -+ if (page->index > end_index || !offset) { -+ unlock_page(page); -+ return 0; -+ } -+ -+ /* -+ * The page straddles i_size. It must be zeroed out on each and every -+ * writepage invocation because it may be mmapped. "A file is mapped -+ * in multiples of the page size. For a file that is not a multiple of -+ * the page size, the remaining memory is zeroed when mapped, and -+ * writes to that region are not written out to the file." 
-+ */ -+ zero_user_segment(page, offset, PAGE_SIZE); -+do_io: -+ s = bch2_page_state_create(page, __GFP_NOFAIL); -+ -+ ret = bch2_get_page_disk_reservation(c, inode, page, true); -+ if (ret) { -+ SetPageError(page); -+ mapping_set_error(page->mapping, ret); -+ unlock_page(page); -+ return 0; -+ } -+ -+ /* Before unlocking the page, get copy of reservations: */ -+ orig = *s; -+ -+ for (i = 0; i < PAGE_SECTORS; i++) { -+ if (s->s[i].state < SECTOR_DIRTY) -+ continue; -+ -+ nr_replicas_this_write = -+ min_t(unsigned, nr_replicas_this_write, -+ s->s[i].nr_replicas + -+ s->s[i].replicas_reserved); -+ } -+ -+ for (i = 0; i < PAGE_SECTORS; i++) { -+ if (s->s[i].state < SECTOR_DIRTY) -+ continue; -+ -+ s->s[i].nr_replicas = w->opts.compression -+ ? 0 : nr_replicas_this_write; -+ -+ s->s[i].replicas_reserved = 0; -+ s->s[i].state = SECTOR_ALLOCATED; -+ } -+ -+ BUG_ON(atomic_read(&s->write_count)); -+ atomic_set(&s->write_count, 1); -+ -+ BUG_ON(PageWriteback(page)); -+ set_page_writeback(page); -+ -+ unlock_page(page); -+ -+ offset = 0; -+ while (1) { -+ unsigned sectors = 1, dirty_sectors = 0, reserved_sectors = 0; -+ u64 sector; -+ -+ while (offset < PAGE_SECTORS && -+ orig.s[offset].state < SECTOR_DIRTY) -+ offset++; -+ -+ if (offset == PAGE_SECTORS) -+ break; -+ -+ sector = ((u64) page->index << PAGE_SECTOR_SHIFT) + offset; -+ -+ while (offset + sectors < PAGE_SECTORS && -+ orig.s[offset + sectors].state >= SECTOR_DIRTY) -+ sectors++; -+ -+ for (i = offset; i < offset + sectors; i++) { -+ reserved_sectors += orig.s[i].replicas_reserved; -+ dirty_sectors += orig.s[i].state == SECTOR_DIRTY; -+ } -+ -+ if (w->io && -+ (w->io->op.res.nr_replicas != nr_replicas_this_write || -+ bio_full(&w->io->op.wbio.bio, PAGE_SIZE) || -+ w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >= -+ (BIO_MAX_PAGES * PAGE_SIZE) || -+ bio_end_sector(&w->io->op.wbio.bio) != sector)) -+ bch2_writepage_do_io(w); -+ -+ if (!w->io) -+ bch2_writepage_io_alloc(c, wbc, w, inode, sector, -+ 
nr_replicas_this_write); -+ -+ atomic_inc(&s->write_count); -+ -+ BUG_ON(inode != w->io->inode); -+ BUG_ON(!bio_add_page(&w->io->op.wbio.bio, page, -+ sectors << 9, offset << 9)); -+ -+ /* Check for writing past i_size: */ -+ WARN_ON((bio_end_sector(&w->io->op.wbio.bio) << 9) > -+ round_up(i_size, block_bytes(c))); -+ -+ w->io->op.res.sectors += reserved_sectors; -+ w->io->op.i_sectors_delta -= dirty_sectors; -+ w->io->op.new_i_size = i_size; -+ -+ offset += sectors; -+ } -+ -+ if (atomic_dec_and_test(&s->write_count)) -+ end_page_writeback(page); -+ -+ return 0; -+} -+ -+int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc) -+{ -+ struct bch_fs *c = mapping->host->i_sb->s_fs_info; -+ struct bch_writepage_state w = -+ bch_writepage_state_init(c, to_bch_ei(mapping->host)); -+ struct blk_plug plug; -+ int ret; -+ -+ blk_start_plug(&plug); -+ ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w); -+ if (w.io) -+ bch2_writepage_do_io(&w); -+ blk_finish_plug(&plug); -+ return ret; -+} -+ -+int bch2_writepage(struct page *page, struct writeback_control *wbc) -+{ -+ struct bch_fs *c = page->mapping->host->i_sb->s_fs_info; -+ struct bch_writepage_state w = -+ bch_writepage_state_init(c, to_bch_ei(page->mapping->host)); -+ int ret; -+ -+ ret = __bch2_writepage(page, wbc, &w); -+ if (w.io) -+ bch2_writepage_do_io(&w); -+ -+ return ret; -+} -+ -+/* buffered writes: */ -+ -+int bch2_write_begin(struct file *file, struct address_space *mapping, -+ loff_t pos, unsigned len, unsigned flags, -+ struct page **pagep, void **fsdata) -+{ -+ struct bch_inode_info *inode = to_bch_ei(mapping->host); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch2_page_reservation *res; -+ pgoff_t index = pos >> PAGE_SHIFT; -+ unsigned offset = pos & (PAGE_SIZE - 1); -+ struct page *page; -+ int ret = -ENOMEM; -+ -+ res = kmalloc(sizeof(*res), GFP_KERNEL); -+ if (!res) -+ return -ENOMEM; -+ -+ bch2_page_reservation_init(c, inode, res); -+ *fsdata = res; -+ 
-+ bch2_pagecache_add_get(&inode->ei_pagecache_lock); -+ -+ page = grab_cache_page_write_begin(mapping, index, flags); -+ if (!page) -+ goto err_unlock; -+ -+ if (PageUptodate(page)) -+ goto out; -+ -+ /* If we're writing entire page, don't need to read it in first: */ -+ if (len == PAGE_SIZE) -+ goto out; -+ -+ if (!offset && pos + len >= inode->v.i_size) { -+ zero_user_segment(page, len, PAGE_SIZE); -+ flush_dcache_page(page); -+ goto out; -+ } -+ -+ if (index > inode->v.i_size >> PAGE_SHIFT) { -+ zero_user_segments(page, 0, offset, offset + len, PAGE_SIZE); -+ flush_dcache_page(page); -+ goto out; -+ } -+readpage: -+ ret = bch2_read_single_page(page, mapping); -+ if (ret) -+ goto err; -+out: -+ ret = bch2_page_reservation_get(c, inode, page, res, -+ offset, len, true); -+ if (ret) { -+ if (!PageUptodate(page)) { -+ /* -+ * If the page hasn't been read in, we won't know if we -+ * actually need a reservation - we don't actually need -+ * to read here, we just need to check if the page is -+ * fully backed by uncompressed data: -+ */ -+ goto readpage; -+ } -+ -+ goto err; -+ } -+ -+ *pagep = page; -+ return 0; -+err: -+ unlock_page(page); -+ put_page(page); -+ *pagep = NULL; -+err_unlock: -+ bch2_pagecache_add_put(&inode->ei_pagecache_lock); -+ kfree(res); -+ *fsdata = NULL; -+ return ret; -+} -+ -+int bch2_write_end(struct file *file, struct address_space *mapping, -+ loff_t pos, unsigned len, unsigned copied, -+ struct page *page, void *fsdata) -+{ -+ struct bch_inode_info *inode = to_bch_ei(mapping->host); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch2_page_reservation *res = fsdata; -+ unsigned offset = pos & (PAGE_SIZE - 1); -+ -+ lockdep_assert_held(&inode->v.i_rwsem); -+ -+ if (unlikely(copied < len && !PageUptodate(page))) { -+ /* -+ * The page needs to be read in, but that would destroy -+ * our partial write - simplest thing is to just force -+ * userspace to redo the write: -+ */ -+ zero_user(page, 0, PAGE_SIZE); -+ 
flush_dcache_page(page); -+ copied = 0; -+ } -+ -+ spin_lock(&inode->v.i_lock); -+ if (pos + copied > inode->v.i_size) -+ i_size_write(&inode->v, pos + copied); -+ spin_unlock(&inode->v.i_lock); -+ -+ if (copied) { -+ if (!PageUptodate(page)) -+ SetPageUptodate(page); -+ -+ bch2_set_page_dirty(c, inode, page, res, offset, copied); -+ -+ inode->ei_last_dirtied = (unsigned long) current; -+ } -+ -+ unlock_page(page); -+ put_page(page); -+ bch2_pagecache_add_put(&inode->ei_pagecache_lock); -+ -+ bch2_page_reservation_put(c, inode, res); -+ kfree(res); -+ -+ return copied; -+} -+ -+#define WRITE_BATCH_PAGES 32 -+ -+static int __bch2_buffered_write(struct bch_inode_info *inode, -+ struct address_space *mapping, -+ struct iov_iter *iter, -+ loff_t pos, unsigned len) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct page *pages[WRITE_BATCH_PAGES]; -+ struct bch2_page_reservation res; -+ unsigned long index = pos >> PAGE_SHIFT; -+ unsigned offset = pos & (PAGE_SIZE - 1); -+ unsigned nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); -+ unsigned i, reserved = 0, set_dirty = 0; -+ unsigned copied = 0, nr_pages_copied = 0; -+ int ret = 0; -+ -+ BUG_ON(!len); -+ BUG_ON(nr_pages > ARRAY_SIZE(pages)); -+ -+ bch2_page_reservation_init(c, inode, &res); -+ -+ for (i = 0; i < nr_pages; i++) { -+ pages[i] = grab_cache_page_write_begin(mapping, index + i, 0); -+ if (!pages[i]) { -+ nr_pages = i; -+ if (!i) { -+ ret = -ENOMEM; -+ goto out; -+ } -+ len = min_t(unsigned, len, -+ nr_pages * PAGE_SIZE - offset); -+ break; -+ } -+ } -+ -+ if (offset && !PageUptodate(pages[0])) { -+ ret = bch2_read_single_page(pages[0], mapping); -+ if (ret) -+ goto out; -+ } -+ -+ if ((pos + len) & (PAGE_SIZE - 1) && -+ !PageUptodate(pages[nr_pages - 1])) { -+ if ((index + nr_pages - 1) << PAGE_SHIFT >= inode->v.i_size) { -+ zero_user(pages[nr_pages - 1], 0, PAGE_SIZE); -+ } else { -+ ret = bch2_read_single_page(pages[nr_pages - 1], mapping); -+ if (ret) -+ goto out; -+ } -+ } -+ -+ while 
(reserved < len) { -+ struct page *page = pages[(offset + reserved) >> PAGE_SHIFT]; -+ unsigned pg_offset = (offset + reserved) & (PAGE_SIZE - 1); -+ unsigned pg_len = min_t(unsigned, len - reserved, -+ PAGE_SIZE - pg_offset); -+retry_reservation: -+ ret = bch2_page_reservation_get(c, inode, page, &res, -+ pg_offset, pg_len, true); -+ -+ if (ret && !PageUptodate(page)) { -+ ret = bch2_read_single_page(page, mapping); -+ if (!ret) -+ goto retry_reservation; -+ } -+ -+ if (ret) -+ goto out; -+ -+ reserved += pg_len; -+ } -+ -+ if (mapping_writably_mapped(mapping)) -+ for (i = 0; i < nr_pages; i++) -+ flush_dcache_page(pages[i]); -+ -+ while (copied < len) { -+ struct page *page = pages[(offset + copied) >> PAGE_SHIFT]; -+ unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1); -+ unsigned pg_len = min_t(unsigned, len - copied, -+ PAGE_SIZE - pg_offset); -+ unsigned pg_copied = iov_iter_copy_from_user_atomic(page, -+ iter, pg_offset, pg_len); -+ -+ if (!pg_copied) -+ break; -+ -+ if (!PageUptodate(page) && -+ pg_copied != PAGE_SIZE && -+ pos + copied + pg_copied < inode->v.i_size) { -+ zero_user(page, 0, PAGE_SIZE); -+ break; -+ } -+ -+ flush_dcache_page(page); -+ iov_iter_advance(iter, pg_copied); -+ copied += pg_copied; -+ -+ if (pg_copied != pg_len) -+ break; -+ } -+ -+ if (!copied) -+ goto out; -+ -+ spin_lock(&inode->v.i_lock); -+ if (pos + copied > inode->v.i_size) -+ i_size_write(&inode->v, pos + copied); -+ spin_unlock(&inode->v.i_lock); -+ -+ while (set_dirty < copied) { -+ struct page *page = pages[(offset + set_dirty) >> PAGE_SHIFT]; -+ unsigned pg_offset = (offset + set_dirty) & (PAGE_SIZE - 1); -+ unsigned pg_len = min_t(unsigned, copied - set_dirty, -+ PAGE_SIZE - pg_offset); -+ -+ if (!PageUptodate(page)) -+ SetPageUptodate(page); -+ -+ bch2_set_page_dirty(c, inode, page, &res, pg_offset, pg_len); -+ unlock_page(page); -+ put_page(page); -+ -+ set_dirty += pg_len; -+ } -+ -+ nr_pages_copied = DIV_ROUND_UP(offset + copied, PAGE_SIZE); -+ 
inode->ei_last_dirtied = (unsigned long) current; -+out: -+ for (i = nr_pages_copied; i < nr_pages; i++) { -+ unlock_page(pages[i]); -+ put_page(pages[i]); -+ } -+ -+ bch2_page_reservation_put(c, inode, &res); -+ -+ return copied ?: ret; -+} -+ -+static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) -+{ -+ struct file *file = iocb->ki_filp; -+ struct address_space *mapping = file->f_mapping; -+ struct bch_inode_info *inode = file_bch_inode(file); -+ loff_t pos = iocb->ki_pos; -+ ssize_t written = 0; -+ int ret = 0; -+ -+ bch2_pagecache_add_get(&inode->ei_pagecache_lock); -+ -+ do { -+ unsigned offset = pos & (PAGE_SIZE - 1); -+ unsigned bytes = min_t(unsigned long, iov_iter_count(iter), -+ PAGE_SIZE * WRITE_BATCH_PAGES - offset); -+again: -+ /* -+ * Bring in the user page that we will copy from _first_. -+ * Otherwise there's a nasty deadlock on copying from the -+ * same page as we're writing to, without it being marked -+ * up-to-date. -+ * -+ * Not only is this an optimisation, but it is also required -+ * to check that the address is actually valid, when atomic -+ * usercopies are used, below. -+ */ -+ if (unlikely(iov_iter_fault_in_readable(iter, bytes))) { -+ bytes = min_t(unsigned long, iov_iter_count(iter), -+ PAGE_SIZE - offset); -+ -+ if (unlikely(iov_iter_fault_in_readable(iter, bytes))) { -+ ret = -EFAULT; -+ break; -+ } -+ } -+ -+ if (unlikely(fatal_signal_pending(current))) { -+ ret = -EINTR; -+ break; -+ } -+ -+ ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); -+ if (unlikely(ret < 0)) -+ break; -+ -+ cond_resched(); -+ -+ if (unlikely(ret == 0)) { -+ /* -+ * If we were unable to copy any data at all, we must -+ * fall back to a single segment length write. -+ * -+ * If we didn't fallback here, we could livelock -+ * because not all segments in the iov can be copied at -+ * once without a pagefault. 
-+ */ -+ bytes = min_t(unsigned long, PAGE_SIZE - offset, -+ iov_iter_single_seg_count(iter)); -+ goto again; -+ } -+ pos += ret; -+ written += ret; -+ ret = 0; -+ -+ balance_dirty_pages_ratelimited(mapping); -+ } while (iov_iter_count(iter)); -+ -+ bch2_pagecache_add_put(&inode->ei_pagecache_lock); -+ -+ return written ? written : ret; -+} -+ -+/* O_DIRECT reads */ -+ -+static void bch2_dio_read_complete(struct closure *cl) -+{ -+ struct dio_read *dio = container_of(cl, struct dio_read, cl); -+ -+ dio->req->ki_complete(dio->req, dio->ret, 0); -+ bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */ -+} -+ -+static void bch2_direct_IO_read_endio(struct bio *bio) -+{ -+ struct dio_read *dio = bio->bi_private; -+ -+ if (bio->bi_status) -+ dio->ret = blk_status_to_errno(bio->bi_status); -+ -+ closure_put(&dio->cl); -+} -+ -+static void bch2_direct_IO_read_split_endio(struct bio *bio) -+{ -+ bch2_direct_IO_read_endio(bio); -+ bio_check_pages_dirty(bio); /* transfers ownership */ -+} -+ -+static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) -+{ -+ struct file *file = req->ki_filp; -+ struct bch_inode_info *inode = file_bch_inode(file); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch_io_opts opts = io_opts(c, &inode->ei_inode); -+ struct dio_read *dio; -+ struct bio *bio; -+ loff_t offset = req->ki_pos; -+ bool sync = is_sync_kiocb(req); -+ size_t shorten; -+ ssize_t ret; -+ -+ if ((offset|iter->count) & (block_bytes(c) - 1)) -+ return -EINVAL; -+ -+ ret = min_t(loff_t, iter->count, -+ max_t(loff_t, 0, i_size_read(&inode->v) - offset)); -+ -+ if (!ret) -+ return ret; -+ -+ shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c)); -+ iter->count -= shorten; -+ -+ bio = bio_alloc_bioset(GFP_KERNEL, -+ iov_iter_npages(iter, BIO_MAX_PAGES), -+ &c->dio_read_bioset); -+ -+ bio->bi_end_io = bch2_direct_IO_read_endio; -+ -+ dio = container_of(bio, struct dio_read, rbio.bio); -+ closure_init(&dio->cl, NULL); -+ -+ /* -+ * this is 
a _really_ horrible hack just to avoid an atomic sub at the -+ * end: -+ */ -+ if (!sync) { -+ set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL); -+ atomic_set(&dio->cl.remaining, -+ CLOSURE_REMAINING_INITIALIZER - -+ CLOSURE_RUNNING + -+ CLOSURE_DESTRUCTOR); -+ } else { -+ atomic_set(&dio->cl.remaining, -+ CLOSURE_REMAINING_INITIALIZER + 1); -+ } -+ -+ dio->req = req; -+ dio->ret = ret; -+ -+ goto start; -+ while (iter->count) { -+ bio = bio_alloc_bioset(GFP_KERNEL, -+ iov_iter_npages(iter, BIO_MAX_PAGES), -+ &c->bio_read); -+ bio->bi_end_io = bch2_direct_IO_read_split_endio; -+start: -+ bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC); -+ bio->bi_iter.bi_sector = offset >> 9; -+ bio->bi_private = dio; -+ -+ ret = bio_iov_iter_get_pages(bio, iter); -+ if (ret < 0) { -+ /* XXX: fault inject this path */ -+ bio->bi_status = BLK_STS_RESOURCE; -+ bio_endio(bio); -+ break; -+ } -+ -+ offset += bio->bi_iter.bi_size; -+ bio_set_pages_dirty(bio); -+ -+ if (iter->count) -+ closure_get(&dio->cl); -+ -+ bch2_read(c, rbio_init(bio, opts), inode->v.i_ino); -+ } -+ -+ iter->count += shorten; -+ -+ if (sync) { -+ closure_sync(&dio->cl); -+ closure_debug_destroy(&dio->cl); -+ ret = dio->ret; -+ bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */ -+ return ret; -+ } else { -+ return -EIOCBQUEUED; -+ } -+} -+ -+ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) -+{ -+ struct file *file = iocb->ki_filp; -+ struct bch_inode_info *inode = file_bch_inode(file); -+ struct address_space *mapping = file->f_mapping; -+ size_t count = iov_iter_count(iter); -+ ssize_t ret; -+ -+ if (!count) -+ return 0; /* skip atime */ -+ -+ if (iocb->ki_flags & IOCB_DIRECT) { -+ struct blk_plug plug; -+ -+ ret = filemap_write_and_wait_range(mapping, -+ iocb->ki_pos, -+ iocb->ki_pos + count - 1); -+ if (ret < 0) -+ return ret; -+ -+ file_accessed(file); -+ -+ blk_start_plug(&plug); -+ ret = bch2_direct_IO_read(iocb, iter); -+ blk_finish_plug(&plug); -+ -+ if (ret >= 0) -+ 
iocb->ki_pos += ret; -+ } else { -+ bch2_pagecache_add_get(&inode->ei_pagecache_lock); -+ ret = generic_file_read_iter(iocb, iter); -+ bch2_pagecache_add_put(&inode->ei_pagecache_lock); -+ } -+ -+ return ret; -+} -+ -+/* O_DIRECT writes */ -+ -+static void bch2_dio_write_loop_async(struct bch_write_op *); -+ -+static long bch2_dio_write_loop(struct dio_write *dio) -+{ -+ bool kthread = (current->flags & PF_KTHREAD) != 0; -+ struct kiocb *req = dio->req; -+ struct address_space *mapping = req->ki_filp->f_mapping; -+ struct bch_inode_info *inode = file_bch_inode(req->ki_filp); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bio *bio = &dio->op.wbio.bio; -+ struct bvec_iter_all iter; -+ struct bio_vec *bv; -+ unsigned unaligned; -+ bool sync = dio->sync; -+ long ret; -+ -+ if (dio->loop) -+ goto loop; -+ -+ while (1) { -+ if (kthread) -+ kthread_use_mm(dio->mm); -+ BUG_ON(current->faults_disabled_mapping); -+ current->faults_disabled_mapping = mapping; -+ -+ ret = bio_iov_iter_get_pages(bio, &dio->iter); -+ -+ current->faults_disabled_mapping = NULL; -+ if (kthread) -+ kthread_unuse_mm(dio->mm); -+ -+ if (unlikely(ret < 0)) -+ goto err; -+ -+ unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1); -+ bio->bi_iter.bi_size -= unaligned; -+ iov_iter_revert(&dio->iter, unaligned); -+ -+ if (!bio->bi_iter.bi_size) { -+ /* -+ * bio_iov_iter_get_pages was only able to get < -+ * blocksize worth of pages: -+ */ -+ bio_for_each_segment_all(bv, bio, iter) -+ put_page(bv->bv_page); -+ ret = -EFAULT; -+ goto err; -+ } -+ -+ bch2_write_op_init(&dio->op, c, io_opts(c, &inode->ei_inode)); -+ dio->op.end_io = bch2_dio_write_loop_async; -+ dio->op.target = dio->op.opts.foreground_target; -+ op_journal_seq_set(&dio->op, &inode->ei_journal_seq); -+ dio->op.write_point = writepoint_hashed((unsigned long) current); -+ dio->op.nr_replicas = dio->op.opts.data_replicas; -+ dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); -+ -+ if ((req->ki_flags & IOCB_DSYNC) && -+ 
!c->opts.journal_flush_disabled) -+ dio->op.flags |= BCH_WRITE_FLUSH; -+ -+ ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), -+ dio->op.opts.data_replicas, 0); -+ if (unlikely(ret) && -+ !bch2_check_range_allocated(c, dio->op.pos, -+ bio_sectors(bio), dio->op.opts.data_replicas)) -+ goto err; -+ -+ task_io_account_write(bio->bi_iter.bi_size); -+ -+ if (!dio->sync && !dio->loop && dio->iter.count) { -+ struct iovec *iov = dio->inline_vecs; -+ -+ if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { -+ iov = kmalloc(dio->iter.nr_segs * sizeof(*iov), -+ GFP_KERNEL); -+ if (unlikely(!iov)) { -+ dio->sync = sync = true; -+ goto do_io; -+ } -+ -+ dio->free_iov = true; -+ } -+ -+ memcpy(iov, dio->iter.iov, dio->iter.nr_segs * sizeof(*iov)); -+ dio->iter.iov = iov; -+ } -+do_io: -+ dio->loop = true; -+ closure_call(&dio->op.cl, bch2_write, NULL, NULL); -+ -+ if (sync) -+ wait_for_completion(&dio->done); -+ else -+ return -EIOCBQUEUED; -+loop: -+ i_sectors_acct(c, inode, &dio->quota_res, -+ dio->op.i_sectors_delta); -+ req->ki_pos += (u64) dio->op.written << 9; -+ dio->written += dio->op.written; -+ -+ spin_lock(&inode->v.i_lock); -+ if (req->ki_pos > inode->v.i_size) -+ i_size_write(&inode->v, req->ki_pos); -+ spin_unlock(&inode->v.i_lock); -+ -+ bio_for_each_segment_all(bv, bio, iter) -+ put_page(bv->bv_page); -+ if (!dio->iter.count || dio->op.error) -+ break; -+ -+ bio_reset(bio); -+ reinit_completion(&dio->done); -+ } -+ -+ ret = dio->op.error ?: ((long) dio->written << 9); -+err: -+ bch2_pagecache_block_put(&inode->ei_pagecache_lock); -+ bch2_quota_reservation_put(c, inode, &dio->quota_res); -+ -+ if (dio->free_iov) -+ kfree(dio->iter.iov); -+ -+ bio_put(bio); -+ -+ /* inode->i_dio_count is our ref on inode and thus bch_fs */ -+ inode_dio_end(&inode->v); -+ -+ if (!sync) { -+ req->ki_complete(req, ret, 0); -+ ret = -EIOCBQUEUED; -+ } -+ return ret; -+} -+ -+static void bch2_dio_write_loop_async(struct bch_write_op *op) -+{ -+ struct dio_write *dio 
= container_of(op, struct dio_write, op); -+ -+ if (dio->sync) -+ complete(&dio->done); -+ else -+ bch2_dio_write_loop(dio); -+} -+ -+static noinline -+ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) -+{ -+ struct file *file = req->ki_filp; -+ struct address_space *mapping = file->f_mapping; -+ struct bch_inode_info *inode = file_bch_inode(file); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct dio_write *dio; -+ struct bio *bio; -+ bool locked = true, extending; -+ ssize_t ret; -+ -+ prefetch(&c->opts); -+ prefetch((void *) &c->opts + 64); -+ prefetch(&inode->ei_inode); -+ prefetch((void *) &inode->ei_inode + 64); -+ -+ inode_lock(&inode->v); -+ -+ ret = generic_write_checks(req, iter); -+ if (unlikely(ret <= 0)) -+ goto err; -+ -+ ret = file_remove_privs(file); -+ if (unlikely(ret)) -+ goto err; -+ -+ ret = file_update_time(file); -+ if (unlikely(ret)) -+ goto err; -+ -+ if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) -+ goto err; -+ -+ inode_dio_begin(&inode->v); -+ bch2_pagecache_block_get(&inode->ei_pagecache_lock); -+ -+ extending = req->ki_pos + iter->count > inode->v.i_size; -+ if (!extending) { -+ inode_unlock(&inode->v); -+ locked = false; -+ } -+ -+ bio = bio_alloc_bioset(GFP_KERNEL, -+ iov_iter_npages(iter, BIO_MAX_PAGES), -+ &c->dio_write_bioset); -+ dio = container_of(bio, struct dio_write, op.wbio.bio); -+ init_completion(&dio->done); -+ dio->req = req; -+ dio->mm = current->mm; -+ dio->loop = false; -+ dio->sync = is_sync_kiocb(req) || extending; -+ dio->free_iov = false; -+ dio->quota_res.sectors = 0; -+ dio->written = 0; -+ dio->iter = *iter; -+ -+ ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, -+ iter->count >> 9, true); -+ if (unlikely(ret)) -+ goto err_put_bio; -+ -+ ret = write_invalidate_inode_pages_range(mapping, -+ req->ki_pos, -+ req->ki_pos + iter->count - 1); -+ if (unlikely(ret)) -+ goto err_put_bio; -+ -+ ret = bch2_dio_write_loop(dio); -+err: -+ if (locked) -+ 
inode_unlock(&inode->v); -+ return ret; -+err_put_bio: -+ bch2_pagecache_block_put(&inode->ei_pagecache_lock); -+ bch2_quota_reservation_put(c, inode, &dio->quota_res); -+ bio_put(bio); -+ inode_dio_end(&inode->v); -+ goto err; -+} -+ -+ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) -+{ -+ struct file *file = iocb->ki_filp; -+ struct bch_inode_info *inode = file_bch_inode(file); -+ ssize_t ret; -+ -+ if (iocb->ki_flags & IOCB_DIRECT) -+ return bch2_direct_write(iocb, from); -+ -+ /* We can write back this queue in page reclaim */ -+ current->backing_dev_info = inode_to_bdi(&inode->v); -+ inode_lock(&inode->v); -+ -+ ret = generic_write_checks(iocb, from); -+ if (ret <= 0) -+ goto unlock; -+ -+ ret = file_remove_privs(file); -+ if (ret) -+ goto unlock; -+ -+ ret = file_update_time(file); -+ if (ret) -+ goto unlock; -+ -+ ret = bch2_buffered_write(iocb, from); -+ if (likely(ret > 0)) -+ iocb->ki_pos += ret; -+unlock: -+ inode_unlock(&inode->v); -+ current->backing_dev_info = NULL; -+ -+ if (ret > 0) -+ ret = generic_write_sync(iocb, ret); -+ -+ return ret; -+} -+ -+/* fsync: */ -+ -+int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) -+{ -+ struct bch_inode_info *inode = file_bch_inode(file); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ int ret, ret2; -+ -+ ret = file_write_and_wait_range(file, start, end); -+ if (ret) -+ return ret; -+ -+ if (datasync && !(inode->v.i_state & I_DIRTY_DATASYNC)) -+ goto out; -+ -+ ret = sync_inode_metadata(&inode->v, 1); -+ if (ret) -+ return ret; -+out: -+ if (!c->opts.journal_flush_disabled) -+ ret = bch2_journal_flush_seq(&c->journal, -+ inode->ei_journal_seq); -+ ret2 = file_check_and_advance_wb_err(file); -+ -+ return ret ?: ret2; -+} -+ -+/* truncate: */ -+ -+static inline int range_has_data(struct bch_fs *c, -+ struct bpos start, -+ struct bpos end) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, 
0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, start, 0, k, ret) { -+ if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) -+ break; -+ -+ if (bkey_extent_is_data(k.k)) { -+ ret = 1; -+ break; -+ } -+ } -+ -+ return bch2_trans_exit(&trans) ?: ret; -+} -+ -+static int __bch2_truncate_page(struct bch_inode_info *inode, -+ pgoff_t index, loff_t start, loff_t end) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct address_space *mapping = inode->v.i_mapping; -+ struct bch_page_state *s; -+ unsigned start_offset = start & (PAGE_SIZE - 1); -+ unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1; -+ unsigned i; -+ struct page *page; -+ int ret = 0; -+ -+ /* Page boundary? Nothing to do */ -+ if (!((index == start >> PAGE_SHIFT && start_offset) || -+ (index == end >> PAGE_SHIFT && end_offset != PAGE_SIZE))) -+ return 0; -+ -+ /* Above i_size? */ -+ if (index << PAGE_SHIFT >= inode->v.i_size) -+ return 0; -+ -+ page = find_lock_page(mapping, index); -+ if (!page) { -+ /* -+ * XXX: we're doing two index lookups when we end up reading the -+ * page -+ */ -+ ret = range_has_data(c, -+ POS(inode->v.i_ino, index << PAGE_SECTOR_SHIFT), -+ POS(inode->v.i_ino, (index + 1) << PAGE_SECTOR_SHIFT)); -+ if (ret <= 0) -+ return ret; -+ -+ page = find_or_create_page(mapping, index, GFP_KERNEL); -+ if (unlikely(!page)) { -+ ret = -ENOMEM; -+ goto out; -+ } -+ } -+ -+ s = bch2_page_state_create(page, 0); -+ if (!s) { -+ ret = -ENOMEM; -+ goto unlock; -+ } -+ -+ if (!PageUptodate(page)) { -+ ret = bch2_read_single_page(page, mapping); -+ if (ret) -+ goto unlock; -+ } -+ -+ if (index != start >> PAGE_SHIFT) -+ start_offset = 0; -+ if (index != end >> PAGE_SHIFT) -+ end_offset = PAGE_SIZE; -+ -+ for (i = round_up(start_offset, block_bytes(c)) >> 9; -+ i < round_down(end_offset, block_bytes(c)) >> 9; -+ i++) { -+ s->s[i].nr_replicas = 0; -+ s->s[i].state = SECTOR_UNALLOCATED; -+ } -+ -+ zero_user_segment(page, start_offset, end_offset); -+ -+ /* -+ * Bit of a hack - we 
don't want truncate to fail due to -ENOSPC. -+ * -+ * XXX: because we aren't currently tracking whether the page has actual -+ * data in it (vs. just 0s, or only partially written) this wrong. ick. -+ */ -+ ret = bch2_get_page_disk_reservation(c, inode, page, false); -+ BUG_ON(ret); -+ -+ __set_page_dirty_nobuffers(page); -+unlock: -+ unlock_page(page); -+ put_page(page); -+out: -+ return ret; -+} -+ -+static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from) -+{ -+ return __bch2_truncate_page(inode, from >> PAGE_SHIFT, -+ from, round_up(from, PAGE_SIZE)); -+} -+ -+static int bch2_extend(struct bch_inode_info *inode, -+ struct bch_inode_unpacked *inode_u, -+ struct iattr *iattr) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct address_space *mapping = inode->v.i_mapping; -+ int ret; -+ -+ /* -+ * sync appends: -+ * -+ * this has to be done _before_ extending i_size: -+ */ -+ ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX); -+ if (ret) -+ return ret; -+ -+ truncate_setsize(&inode->v, iattr->ia_size); -+ setattr_copy(&inode->v, iattr); -+ -+ mutex_lock(&inode->ei_update_lock); -+ ret = bch2_write_inode_size(c, inode, inode->v.i_size, -+ ATTR_MTIME|ATTR_CTIME); -+ mutex_unlock(&inode->ei_update_lock); -+ -+ return ret; -+} -+ -+static int bch2_truncate_finish_fn(struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, -+ void *p) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ -+ bi->bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; -+ bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); -+ return 0; -+} -+ -+static int bch2_truncate_start_fn(struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, void *p) -+{ -+ u64 *new_i_size = p; -+ -+ bi->bi_flags |= BCH_INODE_I_SIZE_DIRTY; -+ bi->bi_size = *new_i_size; -+ return 0; -+} -+ -+int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct address_space *mapping = inode->v.i_mapping; -+ 
struct bch_inode_unpacked inode_u; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ u64 new_i_size = iattr->ia_size; -+ s64 i_sectors_delta = 0; -+ int ret = 0; -+ -+ inode_dio_wait(&inode->v); -+ bch2_pagecache_block_get(&inode->ei_pagecache_lock); -+ -+ /* -+ * fetch current on disk i_size: inode is locked, i_size can only -+ * increase underneath us: -+ */ -+ bch2_trans_init(&trans, c, 0, 0); -+ iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, 0); -+ ret = PTR_ERR_OR_ZERO(iter); -+ bch2_trans_exit(&trans); -+ -+ if (ret) -+ goto err; -+ -+ /* -+ * check this before next assertion; on filesystem error our normal -+ * invariants are a bit broken (truncate has to truncate the page cache -+ * before the inode). -+ */ -+ ret = bch2_journal_error(&c->journal); -+ if (ret) -+ goto err; -+ -+ BUG_ON(inode->v.i_size < inode_u.bi_size); -+ -+ if (iattr->ia_size > inode->v.i_size) { -+ ret = bch2_extend(inode, &inode_u, iattr); -+ goto err; -+ } -+ -+ ret = bch2_truncate_page(inode, iattr->ia_size); -+ if (unlikely(ret)) -+ goto err; -+ -+ /* -+ * When extending, we're going to write the new i_size to disk -+ * immediately so we need to flush anything above the current on disk -+ * i_size first: -+ * -+ * Also, when extending we need to flush the page that i_size currently -+ * straddles - if it's mapped to userspace, we need to ensure that -+ * userspace has to redirty it and call .mkwrite -> set_page_dirty -+ * again to allocate the part of the page that was extended. 
-+ */ -+ if (iattr->ia_size > inode_u.bi_size) -+ ret = filemap_write_and_wait_range(mapping, -+ inode_u.bi_size, -+ iattr->ia_size - 1); -+ else if (iattr->ia_size & (PAGE_SIZE - 1)) -+ ret = filemap_write_and_wait_range(mapping, -+ round_down(iattr->ia_size, PAGE_SIZE), -+ iattr->ia_size - 1); -+ if (ret) -+ goto err; -+ -+ mutex_lock(&inode->ei_update_lock); -+ ret = bch2_write_inode(c, inode, bch2_truncate_start_fn, -+ &new_i_size, 0); -+ mutex_unlock(&inode->ei_update_lock); -+ -+ if (unlikely(ret)) -+ goto err; -+ -+ truncate_setsize(&inode->v, iattr->ia_size); -+ -+ ret = bch2_fpunch(c, inode->v.i_ino, -+ round_up(iattr->ia_size, block_bytes(c)) >> 9, -+ U64_MAX, &inode->ei_journal_seq, &i_sectors_delta); -+ i_sectors_acct(c, inode, NULL, i_sectors_delta); -+ -+ if (unlikely(ret)) -+ goto err; -+ -+ setattr_copy(&inode->v, iattr); -+ -+ mutex_lock(&inode->ei_update_lock); -+ ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL, -+ ATTR_MTIME|ATTR_CTIME); -+ mutex_unlock(&inode->ei_update_lock); -+err: -+ bch2_pagecache_block_put(&inode->ei_pagecache_lock); -+ return ret; -+} -+ -+/* fallocate: */ -+ -+static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ u64 discard_start = round_up(offset, block_bytes(c)) >> 9; -+ u64 discard_end = round_down(offset + len, block_bytes(c)) >> 9; -+ int ret = 0; -+ -+ inode_lock(&inode->v); -+ inode_dio_wait(&inode->v); -+ bch2_pagecache_block_get(&inode->ei_pagecache_lock); -+ -+ ret = __bch2_truncate_page(inode, -+ offset >> PAGE_SHIFT, -+ offset, offset + len); -+ if (unlikely(ret)) -+ goto err; -+ -+ if (offset >> PAGE_SHIFT != -+ (offset + len) >> PAGE_SHIFT) { -+ ret = __bch2_truncate_page(inode, -+ (offset + len) >> PAGE_SHIFT, -+ offset, offset + len); -+ if (unlikely(ret)) -+ goto err; -+ } -+ -+ truncate_pagecache_range(&inode->v, offset, offset + len - 1); -+ -+ if (discard_start < discard_end) { -+ s64 i_sectors_delta = 0; 
-+ -+ ret = bch2_fpunch(c, inode->v.i_ino, -+ discard_start, discard_end, -+ &inode->ei_journal_seq, -+ &i_sectors_delta); -+ i_sectors_acct(c, inode, NULL, i_sectors_delta); -+ } -+err: -+ bch2_pagecache_block_put(&inode->ei_pagecache_lock); -+ inode_unlock(&inode->v); -+ -+ return ret; -+} -+ -+static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, -+ loff_t offset, loff_t len, -+ bool insert) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct address_space *mapping = inode->v.i_mapping; -+ struct bkey_on_stack copy; -+ struct btree_trans trans; -+ struct btree_iter *src, *dst; -+ loff_t shift, new_size; -+ u64 src_start; -+ int ret; -+ -+ if ((offset | len) & (block_bytes(c) - 1)) -+ return -EINVAL; -+ -+ bkey_on_stack_init(©); -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256); -+ -+ /* -+ * We need i_mutex to keep the page cache consistent with the extents -+ * btree, and the btree consistent with i_size - we don't need outside -+ * locking for the extents btree itself, because we're using linked -+ * iterators -+ */ -+ inode_lock(&inode->v); -+ inode_dio_wait(&inode->v); -+ bch2_pagecache_block_get(&inode->ei_pagecache_lock); -+ -+ if (insert) { -+ ret = -EFBIG; -+ if (inode->v.i_sb->s_maxbytes - inode->v.i_size < len) -+ goto err; -+ -+ ret = -EINVAL; -+ if (offset >= inode->v.i_size) -+ goto err; -+ -+ src_start = U64_MAX; -+ shift = len; -+ } else { -+ ret = -EINVAL; -+ if (offset + len >= inode->v.i_size) -+ goto err; -+ -+ src_start = offset + len; -+ shift = -len; -+ } -+ -+ new_size = inode->v.i_size + shift; -+ -+ ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); -+ if (ret) -+ goto err; -+ -+ if (insert) { -+ i_size_write(&inode->v, new_size); -+ mutex_lock(&inode->ei_update_lock); -+ ret = bch2_write_inode_size(c, inode, new_size, -+ ATTR_MTIME|ATTR_CTIME); -+ mutex_unlock(&inode->ei_update_lock); -+ } else { -+ s64 i_sectors_delta = 0; -+ -+ ret = bch2_fpunch(c, inode->v.i_ino, -+ offset >> 9, (offset + 
len) >> 9, -+ &inode->ei_journal_seq, -+ &i_sectors_delta); -+ i_sectors_acct(c, inode, NULL, i_sectors_delta); -+ -+ if (ret) -+ goto err; -+ } -+ -+ src = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, -+ POS(inode->v.i_ino, src_start >> 9), -+ BTREE_ITER_INTENT); -+ BUG_ON(IS_ERR_OR_NULL(src)); -+ -+ dst = bch2_trans_copy_iter(&trans, src); -+ BUG_ON(IS_ERR_OR_NULL(dst)); -+ -+ while (1) { -+ struct disk_reservation disk_res = -+ bch2_disk_reservation_init(c, 0); -+ struct bkey_i delete; -+ struct bkey_s_c k; -+ struct bpos next_pos; -+ struct bpos move_pos = POS(inode->v.i_ino, offset >> 9); -+ struct bpos atomic_end; -+ unsigned trigger_flags = 0; -+ -+ k = insert -+ ? bch2_btree_iter_peek_prev(src) -+ : bch2_btree_iter_peek(src); -+ if ((ret = bkey_err(k))) -+ goto bkey_err; -+ -+ if (!k.k || k.k->p.inode != inode->v.i_ino) -+ break; -+ -+ BUG_ON(bkey_cmp(src->pos, bkey_start_pos(k.k))); -+ -+ if (insert && -+ bkey_cmp(k.k->p, POS(inode->v.i_ino, offset >> 9)) <= 0) -+ break; -+reassemble: -+ bkey_on_stack_reassemble(©, c, k); -+ -+ if (insert && -+ bkey_cmp(bkey_start_pos(k.k), move_pos) < 0) -+ bch2_cut_front(move_pos, copy.k); -+ -+ copy.k->k.p.offset += shift >> 9; -+ bch2_btree_iter_set_pos(dst, bkey_start_pos(©.k->k)); -+ -+ ret = bch2_extent_atomic_end(dst, copy.k, &atomic_end); -+ if (ret) -+ goto bkey_err; -+ -+ if (bkey_cmp(atomic_end, copy.k->k.p)) { -+ if (insert) { -+ move_pos = atomic_end; -+ move_pos.offset -= shift >> 9; -+ goto reassemble; -+ } else { -+ bch2_cut_back(atomic_end, copy.k); -+ } -+ } -+ -+ bkey_init(&delete.k); -+ delete.k.p = copy.k->k.p; -+ delete.k.size = copy.k->k.size; -+ delete.k.p.offset -= shift >> 9; -+ -+ next_pos = insert ? 
bkey_start_pos(&delete.k) : delete.k.p; -+ -+ if (copy.k->k.size == k.k->size) { -+ /* -+ * If we're moving the entire extent, we can skip -+ * running triggers: -+ */ -+ trigger_flags |= BTREE_TRIGGER_NORUN; -+ } else { -+ /* We might end up splitting compressed extents: */ -+ unsigned nr_ptrs = -+ bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy.k)); -+ -+ ret = bch2_disk_reservation_get(c, &disk_res, -+ copy.k->k.size, nr_ptrs, -+ BCH_DISK_RESERVATION_NOFAIL); -+ BUG_ON(ret); -+ } -+ -+ bch2_btree_iter_set_pos(src, bkey_start_pos(&delete.k)); -+ -+ ret = bch2_trans_update(&trans, src, &delete, trigger_flags) ?: -+ bch2_trans_update(&trans, dst, copy.k, trigger_flags) ?: -+ bch2_trans_commit(&trans, &disk_res, -+ &inode->ei_journal_seq, -+ BTREE_INSERT_NOFAIL); -+ bch2_disk_reservation_put(c, &disk_res); -+bkey_err: -+ if (!ret) -+ bch2_btree_iter_set_pos(src, next_pos); -+ -+ if (ret == -EINTR) -+ ret = 0; -+ if (ret) -+ goto err; -+ -+ bch2_trans_cond_resched(&trans); -+ } -+ bch2_trans_unlock(&trans); -+ -+ if (!insert) { -+ i_size_write(&inode->v, new_size); -+ mutex_lock(&inode->ei_update_lock); -+ ret = bch2_write_inode_size(c, inode, new_size, -+ ATTR_MTIME|ATTR_CTIME); -+ mutex_unlock(&inode->ei_update_lock); -+ } -+err: -+ bch2_trans_exit(&trans); -+ bkey_on_stack_exit(©, c); -+ bch2_pagecache_block_put(&inode->ei_pagecache_lock); -+ inode_unlock(&inode->v); -+ return ret; -+} -+ -+static long bchfs_fallocate(struct bch_inode_info *inode, int mode, -+ loff_t offset, loff_t len) -+{ -+ struct address_space *mapping = inode->v.i_mapping; -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bpos end_pos; -+ loff_t end = offset + len; -+ loff_t block_start = round_down(offset, block_bytes(c)); -+ loff_t block_end = round_up(end, block_bytes(c)); -+ unsigned sectors; -+ unsigned replicas = io_opts(c, &inode->ei_inode).data_replicas; -+ int ret; -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ 
-+ inode_lock(&inode->v); -+ inode_dio_wait(&inode->v); -+ bch2_pagecache_block_get(&inode->ei_pagecache_lock); -+ -+ if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) { -+ ret = inode_newsize_ok(&inode->v, end); -+ if (ret) -+ goto err; -+ } -+ -+ if (mode & FALLOC_FL_ZERO_RANGE) { -+ ret = __bch2_truncate_page(inode, -+ offset >> PAGE_SHIFT, -+ offset, end); -+ -+ if (!ret && -+ offset >> PAGE_SHIFT != end >> PAGE_SHIFT) -+ ret = __bch2_truncate_page(inode, -+ end >> PAGE_SHIFT, -+ offset, end); -+ -+ if (unlikely(ret)) -+ goto err; -+ -+ truncate_pagecache_range(&inode->v, offset, end - 1); -+ } -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, -+ POS(inode->v.i_ino, block_start >> 9), -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); -+ end_pos = POS(inode->v.i_ino, block_end >> 9); -+ -+ while (bkey_cmp(iter->pos, end_pos) < 0) { -+ s64 i_sectors_delta = 0; -+ struct disk_reservation disk_res = { 0 }; -+ struct quota_res quota_res = { 0 }; -+ struct bkey_i_reservation reservation; -+ struct bkey_s_c k; -+ -+ bch2_trans_begin(&trans); -+ -+ k = bch2_btree_iter_peek_slot(iter); -+ if ((ret = bkey_err(k))) -+ goto bkey_err; -+ -+ /* already reserved */ -+ if (k.k->type == KEY_TYPE_reservation && -+ bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) { -+ bch2_btree_iter_next_slot(iter); -+ continue; -+ } -+ -+ if (bkey_extent_is_data(k.k) && -+ !(mode & FALLOC_FL_ZERO_RANGE)) { -+ bch2_btree_iter_next_slot(iter); -+ continue; -+ } -+ -+ bkey_reservation_init(&reservation.k_i); -+ reservation.k.type = KEY_TYPE_reservation; -+ reservation.k.p = k.k->p; -+ reservation.k.size = k.k->size; -+ -+ bch2_cut_front(iter->pos, &reservation.k_i); -+ bch2_cut_back(end_pos, &reservation.k_i); -+ -+ sectors = reservation.k.size; -+ reservation.v.nr_replicas = bch2_bkey_nr_ptrs_allocated(k); -+ -+ if (!bkey_extent_is_allocation(k.k)) { -+ ret = bch2_quota_reservation_add(c, inode, -+ "a_res, -+ sectors, true); -+ if (unlikely(ret)) -+ goto bkey_err; -+ } -+ -+ if 
(reservation.v.nr_replicas < replicas || -+ bch2_bkey_sectors_compressed(k)) { -+ ret = bch2_disk_reservation_get(c, &disk_res, sectors, -+ replicas, 0); -+ if (unlikely(ret)) -+ goto bkey_err; -+ -+ reservation.v.nr_replicas = disk_res.nr_replicas; -+ } -+ -+ ret = bch2_extent_update(&trans, iter, &reservation.k_i, -+ &disk_res, &inode->ei_journal_seq, -+ 0, &i_sectors_delta); -+ i_sectors_acct(c, inode, "a_res, i_sectors_delta); -+bkey_err: -+ bch2_quota_reservation_put(c, inode, "a_res); -+ bch2_disk_reservation_put(c, &disk_res); -+ if (ret == -EINTR) -+ ret = 0; -+ if (ret) -+ goto err; -+ } -+ -+ /* -+ * Do we need to extend the file? -+ * -+ * If we zeroed up to the end of the file, we dropped whatever writes -+ * were going to write out the current i_size, so we have to extend -+ * manually even if FL_KEEP_SIZE was set: -+ */ -+ if (end >= inode->v.i_size && -+ (!(mode & FALLOC_FL_KEEP_SIZE) || -+ (mode & FALLOC_FL_ZERO_RANGE))) { -+ struct btree_iter *inode_iter; -+ struct bch_inode_unpacked inode_u; -+ -+ do { -+ bch2_trans_begin(&trans); -+ inode_iter = bch2_inode_peek(&trans, &inode_u, -+ inode->v.i_ino, 0); -+ ret = PTR_ERR_OR_ZERO(inode_iter); -+ } while (ret == -EINTR); -+ -+ bch2_trans_unlock(&trans); -+ -+ if (ret) -+ goto err; -+ -+ /* -+ * Sync existing appends before extending i_size, -+ * as in bch2_extend(): -+ */ -+ ret = filemap_write_and_wait_range(mapping, -+ inode_u.bi_size, S64_MAX); -+ if (ret) -+ goto err; -+ -+ if (mode & FALLOC_FL_KEEP_SIZE) -+ end = inode->v.i_size; -+ else -+ i_size_write(&inode->v, end); -+ -+ mutex_lock(&inode->ei_update_lock); -+ ret = bch2_write_inode_size(c, inode, end, 0); -+ mutex_unlock(&inode->ei_update_lock); -+ } -+err: -+ bch2_trans_exit(&trans); -+ bch2_pagecache_block_put(&inode->ei_pagecache_lock); -+ inode_unlock(&inode->v); -+ return ret; -+} -+ -+long bch2_fallocate_dispatch(struct file *file, int mode, -+ loff_t offset, loff_t len) -+{ -+ struct bch_inode_info *inode = file_bch_inode(file); -+ 
struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ long ret; -+ -+ if (!percpu_ref_tryget(&c->writes)) -+ return -EROFS; -+ -+ if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) -+ ret = bchfs_fallocate(inode, mode, offset, len); -+ else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) -+ ret = bchfs_fpunch(inode, offset, len); -+ else if (mode == FALLOC_FL_INSERT_RANGE) -+ ret = bchfs_fcollapse_finsert(inode, offset, len, true); -+ else if (mode == FALLOC_FL_COLLAPSE_RANGE) -+ ret = bchfs_fcollapse_finsert(inode, offset, len, false); -+ else -+ ret = -EOPNOTSUPP; -+ -+ percpu_ref_put(&c->writes); -+ -+ return ret; -+} -+ -+static void mark_range_unallocated(struct bch_inode_info *inode, -+ loff_t start, loff_t end) -+{ -+ pgoff_t index = start >> PAGE_SHIFT; -+ pgoff_t end_index = (end - 1) >> PAGE_SHIFT; -+ struct pagevec pvec; -+ -+ pagevec_init(&pvec); -+ -+ do { -+ unsigned nr_pages, i, j; -+ -+ nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping, -+ &index, end_index); -+ if (nr_pages == 0) -+ break; -+ -+ for (i = 0; i < nr_pages; i++) { -+ struct page *page = pvec.pages[i]; -+ struct bch_page_state *s; -+ -+ lock_page(page); -+ s = bch2_page_state(page); -+ -+ if (s) { -+ spin_lock(&s->lock); -+ for (j = 0; j < PAGE_SECTORS; j++) -+ s->s[j].nr_replicas = 0; -+ spin_unlock(&s->lock); -+ } -+ -+ unlock_page(page); -+ } -+ pagevec_release(&pvec); -+ } while (index <= end_index); -+} -+ -+loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, -+ struct file *file_dst, loff_t pos_dst, -+ loff_t len, unsigned remap_flags) -+{ -+ struct bch_inode_info *src = file_bch_inode(file_src); -+ struct bch_inode_info *dst = file_bch_inode(file_dst); -+ struct bch_fs *c = src->v.i_sb->s_fs_info; -+ s64 i_sectors_delta = 0; -+ u64 aligned_len; -+ loff_t ret = 0; -+ -+ if (!c->opts.reflink) -+ return -EOPNOTSUPP; -+ -+ if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY)) -+ return -EINVAL; -+ -+ if (remap_flags & REMAP_FILE_DEDUP) -+ 
return -EOPNOTSUPP; -+ -+ if ((pos_src & (block_bytes(c) - 1)) || -+ (pos_dst & (block_bytes(c) - 1))) -+ return -EINVAL; -+ -+ if (src == dst && -+ abs(pos_src - pos_dst) < len) -+ return -EINVAL; -+ -+ bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); -+ -+ file_update_time(file_dst); -+ -+ inode_dio_wait(&src->v); -+ inode_dio_wait(&dst->v); -+ -+ ret = generic_remap_file_range_prep(file_src, pos_src, -+ file_dst, pos_dst, -+ &len, remap_flags); -+ if (ret < 0 || len == 0) -+ goto err; -+ -+ aligned_len = round_up((u64) len, block_bytes(c)); -+ -+ ret = write_invalidate_inode_pages_range(dst->v.i_mapping, -+ pos_dst, pos_dst + len - 1); -+ if (ret) -+ goto err; -+ -+ mark_range_unallocated(src, pos_src, pos_src + aligned_len); -+ -+ ret = bch2_remap_range(c, -+ POS(dst->v.i_ino, pos_dst >> 9), -+ POS(src->v.i_ino, pos_src >> 9), -+ aligned_len >> 9, -+ &dst->ei_journal_seq, -+ pos_dst + len, &i_sectors_delta); -+ if (ret < 0) -+ goto err; -+ -+ /* -+ * due to alignment, we might have remapped slightly more than requsted -+ */ -+ ret = min((u64) ret << 9, (u64) len); -+ -+ /* XXX get a quota reservation */ -+ i_sectors_acct(c, dst, NULL, i_sectors_delta); -+ -+ spin_lock(&dst->v.i_lock); -+ if (pos_dst + ret > dst->v.i_size) -+ i_size_write(&dst->v, pos_dst + ret); -+ spin_unlock(&dst->v.i_lock); -+err: -+ bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); -+ -+ return ret; -+} -+ -+/* fseek: */ -+ -+static int page_data_offset(struct page *page, unsigned offset) -+{ -+ struct bch_page_state *s = bch2_page_state(page); -+ unsigned i; -+ -+ if (s) -+ for (i = offset >> 9; i < PAGE_SECTORS; i++) -+ if (s->s[i].state >= SECTOR_DIRTY) -+ return i << 9; -+ -+ return -1; -+} -+ -+static loff_t bch2_seek_pagecache_data(struct inode *vinode, -+ loff_t start_offset, -+ loff_t end_offset) -+{ -+ struct address_space *mapping = vinode->i_mapping; -+ struct page *page; -+ pgoff_t start_index = start_offset >> PAGE_SHIFT; -+ pgoff_t end_index = 
end_offset >> PAGE_SHIFT; -+ pgoff_t index = start_index; -+ loff_t ret; -+ int offset; -+ -+ while (index <= end_index) { -+ if (find_get_pages_range(mapping, &index, end_index, 1, &page)) { -+ lock_page(page); -+ -+ offset = page_data_offset(page, -+ page->index == start_index -+ ? start_offset & (PAGE_SIZE - 1) -+ : 0); -+ if (offset >= 0) { -+ ret = clamp(((loff_t) page->index << PAGE_SHIFT) + -+ offset, -+ start_offset, end_offset); -+ unlock_page(page); -+ put_page(page); -+ return ret; -+ } -+ -+ unlock_page(page); -+ put_page(page); -+ } else { -+ break; -+ } -+ } -+ -+ return end_offset; -+} -+ -+static loff_t bch2_seek_data(struct file *file, u64 offset) -+{ -+ struct bch_inode_info *inode = file_bch_inode(file); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ u64 isize, next_data = MAX_LFS_FILESIZE; -+ int ret; -+ -+ isize = i_size_read(&inode->v); -+ if (offset >= isize) -+ return -ENXIO; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, -+ POS(inode->v.i_ino, offset >> 9), 0, k, ret) { -+ if (k.k->p.inode != inode->v.i_ino) { -+ break; -+ } else if (bkey_extent_is_data(k.k)) { -+ next_data = max(offset, bkey_start_offset(k.k) << 9); -+ break; -+ } else if (k.k->p.offset >> 9 > isize) -+ break; -+ } -+ -+ ret = bch2_trans_exit(&trans) ?: ret; -+ if (ret) -+ return ret; -+ -+ if (next_data > offset) -+ next_data = bch2_seek_pagecache_data(&inode->v, -+ offset, next_data); -+ -+ if (next_data >= isize) -+ return -ENXIO; -+ -+ return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); -+} -+ -+static int __page_hole_offset(struct page *page, unsigned offset) -+{ -+ struct bch_page_state *s = bch2_page_state(page); -+ unsigned i; -+ -+ if (!s) -+ return 0; -+ -+ for (i = offset >> 9; i < PAGE_SECTORS; i++) -+ if (s->s[i].state < SECTOR_DIRTY) -+ return i << 9; -+ -+ return -1; -+} -+ -+static loff_t page_hole_offset(struct 
address_space *mapping, loff_t offset) -+{ -+ pgoff_t index = offset >> PAGE_SHIFT; -+ struct page *page; -+ int pg_offset; -+ loff_t ret = -1; -+ -+ page = find_lock_entry(mapping, index); -+ if (!page || xa_is_value(page)) -+ return offset; -+ -+ pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1)); -+ if (pg_offset >= 0) -+ ret = ((loff_t) index << PAGE_SHIFT) + pg_offset; -+ -+ unlock_page(page); -+ -+ return ret; -+} -+ -+static loff_t bch2_seek_pagecache_hole(struct inode *vinode, -+ loff_t start_offset, -+ loff_t end_offset) -+{ -+ struct address_space *mapping = vinode->i_mapping; -+ loff_t offset = start_offset, hole; -+ -+ while (offset < end_offset) { -+ hole = page_hole_offset(mapping, offset); -+ if (hole >= 0 && hole <= end_offset) -+ return max(start_offset, hole); -+ -+ offset += PAGE_SIZE; -+ offset &= PAGE_MASK; -+ } -+ -+ return end_offset; -+} -+ -+static loff_t bch2_seek_hole(struct file *file, u64 offset) -+{ -+ struct bch_inode_info *inode = file_bch_inode(file); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ u64 isize, next_hole = MAX_LFS_FILESIZE; -+ int ret; -+ -+ isize = i_size_read(&inode->v); -+ if (offset >= isize) -+ return -ENXIO; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, -+ POS(inode->v.i_ino, offset >> 9), -+ BTREE_ITER_SLOTS, k, ret) { -+ if (k.k->p.inode != inode->v.i_ino) { -+ next_hole = bch2_seek_pagecache_hole(&inode->v, -+ offset, MAX_LFS_FILESIZE); -+ break; -+ } else if (!bkey_extent_is_data(k.k)) { -+ next_hole = bch2_seek_pagecache_hole(&inode->v, -+ max(offset, bkey_start_offset(k.k) << 9), -+ k.k->p.offset << 9); -+ -+ if (next_hole < k.k->p.offset << 9) -+ break; -+ } else { -+ offset = max(offset, bkey_start_offset(k.k) << 9); -+ } -+ } -+ -+ ret = bch2_trans_exit(&trans) ?: ret; -+ if (ret) -+ return ret; -+ -+ if (next_hole > isize) -+ next_hole = isize; -+ -+ return 
vfs_setpos(file, next_hole, MAX_LFS_FILESIZE); -+} -+ -+loff_t bch2_llseek(struct file *file, loff_t offset, int whence) -+{ -+ switch (whence) { -+ case SEEK_SET: -+ case SEEK_CUR: -+ case SEEK_END: -+ return generic_file_llseek(file, offset, whence); -+ case SEEK_DATA: -+ return bch2_seek_data(file, offset); -+ case SEEK_HOLE: -+ return bch2_seek_hole(file, offset); -+ } -+ -+ return -EINVAL; -+} -+ -+void bch2_fs_fsio_exit(struct bch_fs *c) -+{ -+ bioset_exit(&c->dio_write_bioset); -+ bioset_exit(&c->dio_read_bioset); -+ bioset_exit(&c->writepage_bioset); -+} -+ -+int bch2_fs_fsio_init(struct bch_fs *c) -+{ -+ int ret = 0; -+ -+ pr_verbose_init(c->opts, ""); -+ -+ if (bioset_init(&c->writepage_bioset, -+ 4, offsetof(struct bch_writepage_io, op.wbio.bio), -+ BIOSET_NEED_BVECS) || -+ bioset_init(&c->dio_read_bioset, -+ 4, offsetof(struct dio_read, rbio.bio), -+ BIOSET_NEED_BVECS) || -+ bioset_init(&c->dio_write_bioset, -+ 4, offsetof(struct dio_write, op.wbio.bio), -+ BIOSET_NEED_BVECS)) -+ ret = -ENOMEM; -+ -+ pr_verbose_init(c->opts, "ret %i", ret); -+ return ret; -+} -+ -+#endif /* NO_BCACHEFS_FS */ -diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h -new file mode 100644 -index 000000000000..7063556d289b ---- /dev/null -+++ b/fs/bcachefs/fs-io.h -@@ -0,0 +1,57 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_FS_IO_H -+#define _BCACHEFS_FS_IO_H -+ -+#ifndef NO_BCACHEFS_FS -+ -+#include "buckets.h" -+#include "io_types.h" -+ -+#include -+ -+struct quota_res; -+ -+int __must_check bch2_write_inode_size(struct bch_fs *, -+ struct bch_inode_info *, -+ loff_t, unsigned); -+ -+int bch2_writepage(struct page *, struct writeback_control *); -+int bch2_readpage(struct file *, struct page *); -+ -+int bch2_writepages(struct address_space *, struct writeback_control *); -+int bch2_readpages(struct file *, struct address_space *, -+ struct list_head *, unsigned); -+ -+int bch2_write_begin(struct file *, struct address_space *, loff_t, -+ unsigned, 
unsigned, struct page **, void **); -+int bch2_write_end(struct file *, struct address_space *, loff_t, -+ unsigned, unsigned, struct page *, void *); -+ -+ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *); -+ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); -+ -+int bch2_fsync(struct file *, loff_t, loff_t, int); -+ -+int bch2_truncate(struct bch_inode_info *, struct iattr *); -+long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t); -+ -+loff_t bch2_remap_file_range(struct file *, loff_t, struct file *, -+ loff_t, loff_t, unsigned); -+ -+loff_t bch2_llseek(struct file *, loff_t, int); -+ -+vm_fault_t bch2_page_fault(struct vm_fault *); -+vm_fault_t bch2_page_mkwrite(struct vm_fault *); -+void bch2_invalidatepage(struct page *, unsigned int, unsigned int); -+int bch2_releasepage(struct page *, gfp_t); -+int bch2_migrate_page(struct address_space *, struct page *, -+ struct page *, enum migrate_mode); -+ -+void bch2_fs_fsio_exit(struct bch_fs *); -+int bch2_fs_fsio_init(struct bch_fs *); -+#else -+static inline void bch2_fs_fsio_exit(struct bch_fs *c) {} -+static inline int bch2_fs_fsio_init(struct bch_fs *c) { return 0; } -+#endif -+ -+#endif /* _BCACHEFS_FS_IO_H */ -diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c -new file mode 100644 -index 000000000000..0873d2f0928c ---- /dev/null -+++ b/fs/bcachefs/fs-ioctl.c -@@ -0,0 +1,312 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#ifndef NO_BCACHEFS_FS -+ -+#include "bcachefs.h" -+#include "chardev.h" -+#include "dirent.h" -+#include "fs.h" -+#include "fs-common.h" -+#include "fs-ioctl.h" -+#include "quota.h" -+ -+#include -+#include -+ -+#define FS_IOC_GOINGDOWN _IOR('X', 125, __u32) -+ -+struct flags_set { -+ unsigned mask; -+ unsigned flags; -+ -+ unsigned projid; -+}; -+ -+static int bch2_inode_flags_set(struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, -+ void *p) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ /* -+ * We're relying on btree locking here 
for exclusion with other ioctl -+ * calls - use the flags in the btree (@bi), not inode->i_flags: -+ */ -+ struct flags_set *s = p; -+ unsigned newflags = s->flags; -+ unsigned oldflags = bi->bi_flags & s->mask; -+ -+ if (((newflags ^ oldflags) & (BCH_INODE_APPEND|BCH_INODE_IMMUTABLE)) && -+ !capable(CAP_LINUX_IMMUTABLE)) -+ return -EPERM; -+ -+ if (!S_ISREG(bi->bi_mode) && -+ !S_ISDIR(bi->bi_mode) && -+ (newflags & (BCH_INODE_NODUMP|BCH_INODE_NOATIME)) != newflags) -+ return -EINVAL; -+ -+ bi->bi_flags &= ~s->mask; -+ bi->bi_flags |= newflags; -+ -+ bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v)); -+ return 0; -+} -+ -+static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg) -+{ -+ unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags); -+ -+ return put_user(flags, arg); -+} -+ -+static int bch2_ioc_setflags(struct bch_fs *c, -+ struct file *file, -+ struct bch_inode_info *inode, -+ void __user *arg) -+{ -+ struct flags_set s = { .mask = map_defined(bch_flags_to_uflags) }; -+ unsigned uflags; -+ int ret; -+ -+ if (get_user(uflags, (int __user *) arg)) -+ return -EFAULT; -+ -+ s.flags = map_flags_rev(bch_flags_to_uflags, uflags); -+ if (uflags) -+ return -EOPNOTSUPP; -+ -+ ret = mnt_want_write_file(file); -+ if (ret) -+ return ret; -+ -+ inode_lock(&inode->v); -+ if (!inode_owner_or_capable(&inode->v)) { -+ ret = -EACCES; -+ goto setflags_out; -+ } -+ -+ mutex_lock(&inode->ei_update_lock); -+ ret = bch2_write_inode(c, inode, bch2_inode_flags_set, &s, -+ ATTR_CTIME); -+ mutex_unlock(&inode->ei_update_lock); -+ -+setflags_out: -+ inode_unlock(&inode->v); -+ mnt_drop_write_file(file); -+ return ret; -+} -+ -+static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode, -+ struct fsxattr __user *arg) -+{ -+ struct fsxattr fa = { 0 }; -+ -+ fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags); -+ fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ]; -+ -+ return copy_to_user(arg, &fa, sizeof(fa)); -+} -+ 
-+static int fssetxattr_inode_update_fn(struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, -+ void *p) -+{ -+ struct flags_set *s = p; -+ -+ if (s->projid != bi->bi_project) { -+ bi->bi_fields_set |= 1U << Inode_opt_project; -+ bi->bi_project = s->projid; -+ } -+ -+ return bch2_inode_flags_set(inode, bi, p); -+} -+ -+static int bch2_ioc_fssetxattr(struct bch_fs *c, -+ struct file *file, -+ struct bch_inode_info *inode, -+ struct fsxattr __user *arg) -+{ -+ struct flags_set s = { .mask = map_defined(bch_flags_to_xflags) }; -+ struct fsxattr fa; -+ int ret; -+ -+ if (copy_from_user(&fa, arg, sizeof(fa))) -+ return -EFAULT; -+ -+ s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags); -+ if (fa.fsx_xflags) -+ return -EOPNOTSUPP; -+ -+ if (fa.fsx_projid >= U32_MAX) -+ return -EINVAL; -+ -+ /* -+ * inode fields accessible via the xattr interface are stored with a +1 -+ * bias, so that 0 means unset: -+ */ -+ s.projid = fa.fsx_projid + 1; -+ -+ ret = mnt_want_write_file(file); -+ if (ret) -+ return ret; -+ -+ inode_lock(&inode->v); -+ if (!inode_owner_or_capable(&inode->v)) { -+ ret = -EACCES; -+ goto err; -+ } -+ -+ mutex_lock(&inode->ei_update_lock); -+ ret = bch2_set_projid(c, inode, fa.fsx_projid); -+ if (ret) -+ goto err_unlock; -+ -+ ret = bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, -+ ATTR_CTIME); -+err_unlock: -+ mutex_unlock(&inode->ei_update_lock); -+err: -+ inode_unlock(&inode->v); -+ mnt_drop_write_file(file); -+ return ret; -+} -+ -+static int bch2_reinherit_attrs_fn(struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, -+ void *p) -+{ -+ struct bch_inode_info *dir = p; -+ -+ return !bch2_reinherit_attrs(bi, &dir->ei_inode); -+} -+ -+static int bch2_ioc_reinherit_attrs(struct bch_fs *c, -+ struct file *file, -+ struct bch_inode_info *src, -+ const char __user *name) -+{ -+ struct bch_inode_info *dst; -+ struct inode *vinode = NULL; -+ char *kname = NULL; -+ struct qstr qstr; -+ int ret = 0; -+ u64 inum; -+ -+ kname 
= kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL); -+ if (!kname) -+ return -ENOMEM; -+ -+ ret = strncpy_from_user(kname, name, BCH_NAME_MAX); -+ if (unlikely(ret < 0)) -+ goto err1; -+ -+ qstr.len = ret; -+ qstr.name = kname; -+ -+ ret = -ENOENT; -+ inum = bch2_dirent_lookup(c, src->v.i_ino, -+ &src->ei_str_hash, -+ &qstr); -+ if (!inum) -+ goto err1; -+ -+ vinode = bch2_vfs_inode_get(c, inum); -+ ret = PTR_ERR_OR_ZERO(vinode); -+ if (ret) -+ goto err1; -+ -+ dst = to_bch_ei(vinode); -+ -+ ret = mnt_want_write_file(file); -+ if (ret) -+ goto err2; -+ -+ bch2_lock_inodes(INODE_UPDATE_LOCK, src, dst); -+ -+ if (inode_attr_changing(src, dst, Inode_opt_project)) { -+ ret = bch2_fs_quota_transfer(c, dst, -+ src->ei_qid, -+ 1 << QTYP_PRJ, -+ KEY_TYPE_QUOTA_PREALLOC); -+ if (ret) -+ goto err3; -+ } -+ -+ ret = bch2_write_inode(c, dst, bch2_reinherit_attrs_fn, src, 0); -+err3: -+ bch2_unlock_inodes(INODE_UPDATE_LOCK, src, dst); -+ -+ /* return true if we did work */ -+ if (ret >= 0) -+ ret = !ret; -+ -+ mnt_drop_write_file(file); -+err2: -+ iput(vinode); -+err1: -+ kfree(kname); -+ -+ return ret; -+} -+ -+long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) -+{ -+ struct bch_inode_info *inode = file_bch_inode(file); -+ struct super_block *sb = inode->v.i_sb; -+ struct bch_fs *c = sb->s_fs_info; -+ -+ switch (cmd) { -+ case FS_IOC_GETFLAGS: -+ return bch2_ioc_getflags(inode, (int __user *) arg); -+ -+ case FS_IOC_SETFLAGS: -+ return bch2_ioc_setflags(c, file, inode, (int __user *) arg); -+ -+ case FS_IOC_FSGETXATTR: -+ return bch2_ioc_fsgetxattr(inode, (void __user *) arg); -+ case FS_IOC_FSSETXATTR: -+ return bch2_ioc_fssetxattr(c, file, inode, -+ (void __user *) arg); -+ -+ case BCHFS_IOC_REINHERIT_ATTRS: -+ return bch2_ioc_reinherit_attrs(c, file, inode, -+ (void __user *) arg); -+ -+ case FS_IOC_GETVERSION: -+ return -ENOTTY; -+ case FS_IOC_SETVERSION: -+ return -ENOTTY; -+ -+ case FS_IOC_GOINGDOWN: -+ if (!capable(CAP_SYS_ADMIN)) -+ return -EPERM; -+ -+ 
down_write(&sb->s_umount); -+ sb->s_flags |= SB_RDONLY; -+ if (bch2_fs_emergency_read_only(c)) -+ bch_err(c, "emergency read only due to ioctl"); -+ up_write(&sb->s_umount); -+ return 0; -+ -+ default: -+ return bch2_fs_ioctl(c, cmd, (void __user *) arg); -+ } -+} -+ -+#ifdef CONFIG_COMPAT -+long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg) -+{ -+ /* These are just misnamed, they actually get/put from/to user an int */ -+ switch (cmd) { -+ case FS_IOC_GETFLAGS: -+ cmd = FS_IOC_GETFLAGS; -+ break; -+ case FS_IOC32_SETFLAGS: -+ cmd = FS_IOC_SETFLAGS; -+ break; -+ default: -+ return -ENOIOCTLCMD; -+ } -+ return bch2_fs_file_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); -+} -+#endif -+ -+#endif /* NO_BCACHEFS_FS */ -diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h -new file mode 100644 -index 000000000000..f201980ef2c3 ---- /dev/null -+++ b/fs/bcachefs/fs-ioctl.h -@@ -0,0 +1,81 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_FS_IOCTL_H -+#define _BCACHEFS_FS_IOCTL_H -+ -+/* Inode flags: */ -+ -+/* bcachefs inode flags -> vfs inode flags: */ -+static const unsigned bch_flags_to_vfs[] = { -+ [__BCH_INODE_SYNC] = S_SYNC, -+ [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE, -+ [__BCH_INODE_APPEND] = S_APPEND, -+ [__BCH_INODE_NOATIME] = S_NOATIME, -+}; -+ -+/* bcachefs inode flags -> FS_IOC_GETFLAGS: */ -+static const unsigned bch_flags_to_uflags[] = { -+ [__BCH_INODE_SYNC] = FS_SYNC_FL, -+ [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL, -+ [__BCH_INODE_APPEND] = FS_APPEND_FL, -+ [__BCH_INODE_NODUMP] = FS_NODUMP_FL, -+ [__BCH_INODE_NOATIME] = FS_NOATIME_FL, -+}; -+ -+/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ -+static const unsigned bch_flags_to_xflags[] = { -+ [__BCH_INODE_SYNC] = FS_XFLAG_SYNC, -+ [__BCH_INODE_IMMUTABLE] = FS_XFLAG_IMMUTABLE, -+ [__BCH_INODE_APPEND] = FS_XFLAG_APPEND, -+ [__BCH_INODE_NODUMP] = FS_XFLAG_NODUMP, -+ [__BCH_INODE_NOATIME] = FS_XFLAG_NOATIME, -+ //[__BCH_INODE_PROJINHERIT] = 
FS_XFLAG_PROJINHERIT; -+}; -+ -+#define set_flags(_map, _in, _out) \ -+do { \ -+ unsigned _i; \ -+ \ -+ for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ -+ if ((_in) & (1 << _i)) \ -+ (_out) |= _map[_i]; \ -+ else \ -+ (_out) &= ~_map[_i]; \ -+} while (0) -+ -+#define map_flags(_map, _in) \ -+({ \ -+ unsigned _out = 0; \ -+ \ -+ set_flags(_map, _in, _out); \ -+ _out; \ -+}) -+ -+#define map_flags_rev(_map, _in) \ -+({ \ -+ unsigned _i, _out = 0; \ -+ \ -+ for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ -+ if ((_in) & _map[_i]) { \ -+ (_out) |= 1 << _i; \ -+ (_in) &= ~_map[_i]; \ -+ } \ -+ (_out); \ -+}) -+ -+#define map_defined(_map) \ -+({ \ -+ unsigned _in = ~0; \ -+ \ -+ map_flags_rev(_map, _in); \ -+}) -+ -+/* Set VFS inode flags from bcachefs inode: */ -+static inline void bch2_inode_flags_to_vfs(struct bch_inode_info *inode) -+{ -+ set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags); -+} -+ -+long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long); -+long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long); -+ -+#endif /* _BCACHEFS_FS_IOCTL_H */ -diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c -new file mode 100644 -index 000000000000..e504e6b19abe ---- /dev/null -+++ b/fs/bcachefs/fs.c -@@ -0,0 +1,1628 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#ifndef NO_BCACHEFS_FS -+ -+#include "bcachefs.h" -+#include "acl.h" -+#include "bkey_on_stack.h" -+#include "btree_update.h" -+#include "buckets.h" -+#include "chardev.h" -+#include "dirent.h" -+#include "extents.h" -+#include "fs.h" -+#include "fs-common.h" -+#include "fs-io.h" -+#include "fs-ioctl.h" -+#include "fsck.h" -+#include "inode.h" -+#include "io.h" -+#include "journal.h" -+#include "keylist.h" -+#include "quota.h" -+#include "super.h" -+#include "xattr.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+static struct kmem_cache *bch2_inode_cache; -+ -+static void bch2_vfs_inode_init(struct bch_fs *, -+ struct 
bch_inode_info *, -+ struct bch_inode_unpacked *); -+ -+static void journal_seq_copy(struct bch_inode_info *dst, -+ u64 journal_seq) -+{ -+ u64 old, v = READ_ONCE(dst->ei_journal_seq); -+ -+ do { -+ old = v; -+ -+ if (old >= journal_seq) -+ break; -+ } while ((v = cmpxchg(&dst->ei_journal_seq, old, journal_seq)) != old); -+} -+ -+static void __pagecache_lock_put(struct pagecache_lock *lock, long i) -+{ -+ BUG_ON(atomic_long_read(&lock->v) == 0); -+ -+ if (atomic_long_sub_return_release(i, &lock->v) == 0) -+ wake_up_all(&lock->wait); -+} -+ -+static bool __pagecache_lock_tryget(struct pagecache_lock *lock, long i) -+{ -+ long v = atomic_long_read(&lock->v), old; -+ -+ do { -+ old = v; -+ -+ if (i > 0 ? v < 0 : v > 0) -+ return false; -+ } while ((v = atomic_long_cmpxchg_acquire(&lock->v, -+ old, old + i)) != old); -+ return true; -+} -+ -+static void __pagecache_lock_get(struct pagecache_lock *lock, long i) -+{ -+ wait_event(lock->wait, __pagecache_lock_tryget(lock, i)); -+} -+ -+void bch2_pagecache_add_put(struct pagecache_lock *lock) -+{ -+ __pagecache_lock_put(lock, 1); -+} -+ -+void bch2_pagecache_add_get(struct pagecache_lock *lock) -+{ -+ __pagecache_lock_get(lock, 1); -+} -+ -+void bch2_pagecache_block_put(struct pagecache_lock *lock) -+{ -+ __pagecache_lock_put(lock, -1); -+} -+ -+void bch2_pagecache_block_get(struct pagecache_lock *lock) -+{ -+ __pagecache_lock_get(lock, -1); -+} -+ -+void bch2_inode_update_after_write(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, -+ unsigned fields) -+{ -+ set_nlink(&inode->v, bch2_inode_nlink_get(bi)); -+ i_uid_write(&inode->v, bi->bi_uid); -+ i_gid_write(&inode->v, bi->bi_gid); -+ inode->v.i_mode = bi->bi_mode; -+ -+ if (fields & ATTR_ATIME) -+ inode->v.i_atime = bch2_time_to_timespec(c, bi->bi_atime); -+ if (fields & ATTR_MTIME) -+ inode->v.i_mtime = bch2_time_to_timespec(c, bi->bi_mtime); -+ if (fields & ATTR_CTIME) -+ inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime); -+ 
-+ inode->ei_inode = *bi; -+ -+ bch2_inode_flags_to_vfs(inode); -+} -+ -+int __must_check bch2_write_inode(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ inode_set_fn set, -+ void *p, unsigned fields) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bch_inode_unpacked inode_u; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ -+ iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, -+ BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(iter) ?: -+ (set ? set(inode, &inode_u, p) : 0) ?: -+ bch2_inode_write(&trans, iter, &inode_u) ?: -+ bch2_trans_commit(&trans, NULL, -+ &inode->ei_journal_seq, -+ BTREE_INSERT_NOUNLOCK| -+ BTREE_INSERT_NOFAIL); -+ -+ /* -+ * the btree node lock protects inode->ei_inode, not ei_update_lock; -+ * this is important for inode updates via bchfs_write_index_update -+ */ -+ if (!ret) -+ bch2_inode_update_after_write(c, inode, &inode_u, fields); -+ -+ bch2_trans_iter_put(&trans, iter); -+ -+ if (ret == -EINTR) -+ goto retry; -+ -+ bch2_trans_exit(&trans); -+ return ret < 0 ? 
ret : 0; -+} -+ -+int bch2_fs_quota_transfer(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct bch_qid new_qid, -+ unsigned qtypes, -+ enum quota_acct_mode mode) -+{ -+ unsigned i; -+ int ret; -+ -+ qtypes &= enabled_qtypes(c); -+ -+ for (i = 0; i < QTYP_NR; i++) -+ if (new_qid.q[i] == inode->ei_qid.q[i]) -+ qtypes &= ~(1U << i); -+ -+ if (!qtypes) -+ return 0; -+ -+ mutex_lock(&inode->ei_quota_lock); -+ -+ ret = bch2_quota_transfer(c, qtypes, new_qid, -+ inode->ei_qid, -+ inode->v.i_blocks + -+ inode->ei_quota_reserved, -+ mode); -+ if (!ret) -+ for (i = 0; i < QTYP_NR; i++) -+ if (qtypes & (1 << i)) -+ inode->ei_qid.q[i] = new_qid.q[i]; -+ -+ mutex_unlock(&inode->ei_quota_lock); -+ -+ return ret; -+} -+ -+struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum) -+{ -+ struct bch_inode_unpacked inode_u; -+ struct bch_inode_info *inode; -+ int ret; -+ -+ inode = to_bch_ei(iget_locked(c->vfs_sb, inum)); -+ if (unlikely(!inode)) -+ return ERR_PTR(-ENOMEM); -+ if (!(inode->v.i_state & I_NEW)) -+ return &inode->v; -+ -+ ret = bch2_inode_find_by_inum(c, inum, &inode_u); -+ if (ret) { -+ iget_failed(&inode->v); -+ return ERR_PTR(ret); -+ } -+ -+ bch2_vfs_inode_init(c, inode, &inode_u); -+ -+ inode->ei_journal_seq = bch2_inode_journal_seq(&c->journal, inum); -+ -+ unlock_new_inode(&inode->v); -+ -+ return &inode->v; -+} -+ -+static struct bch_inode_info * -+__bch2_create(struct bch_inode_info *dir, struct dentry *dentry, -+ umode_t mode, dev_t rdev, bool tmpfile) -+{ -+ struct bch_fs *c = dir->v.i_sb->s_fs_info; -+ struct user_namespace *ns = dir->v.i_sb->s_user_ns; -+ struct btree_trans trans; -+ struct bch_inode_unpacked dir_u; -+ struct bch_inode_info *inode, *old; -+ struct bch_inode_unpacked inode_u; -+ struct posix_acl *default_acl = NULL, *acl = NULL; -+ u64 journal_seq = 0; -+ int ret; -+ -+ /* -+ * preallocate acls + vfs inode before btree transaction, so that -+ * nothing can fail after the transaction succeeds: -+ */ -+#ifdef 
CONFIG_BCACHEFS_POSIX_ACL -+ ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl); -+ if (ret) -+ return ERR_PTR(ret); -+#endif -+ inode = to_bch_ei(new_inode(c->vfs_sb)); -+ if (unlikely(!inode)) { -+ inode = ERR_PTR(-ENOMEM); -+ goto err; -+ } -+ -+ bch2_inode_init_early(c, &inode_u); -+ -+ if (!tmpfile) -+ mutex_lock(&dir->ei_update_lock); -+ -+ bch2_trans_init(&trans, c, 8, 1024); -+retry: -+ bch2_trans_begin(&trans); -+ -+ ret = bch2_create_trans(&trans, dir->v.i_ino, &dir_u, &inode_u, -+ !tmpfile ? &dentry->d_name : NULL, -+ from_kuid(ns, current_fsuid()), -+ from_kgid(ns, current_fsgid()), -+ mode, rdev, -+ default_acl, acl) ?: -+ bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, -+ KEY_TYPE_QUOTA_PREALLOC); -+ if (unlikely(ret)) -+ goto err_before_quota; -+ -+ ret = bch2_trans_commit(&trans, NULL, &journal_seq, -+ BTREE_INSERT_NOUNLOCK); -+ if (unlikely(ret)) { -+ bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, -+ KEY_TYPE_QUOTA_WARN); -+err_before_quota: -+ if (ret == -EINTR) -+ goto retry; -+ goto err_trans; -+ } -+ -+ if (!tmpfile) { -+ bch2_inode_update_after_write(c, dir, &dir_u, -+ ATTR_MTIME|ATTR_CTIME); -+ journal_seq_copy(dir, journal_seq); -+ mutex_unlock(&dir->ei_update_lock); -+ } -+ -+ bch2_vfs_inode_init(c, inode, &inode_u); -+ journal_seq_copy(inode, journal_seq); -+ -+ set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); -+ set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); -+ -+ /* -+ * we must insert the new inode into the inode cache before calling -+ * bch2_trans_exit() and dropping locks, else we could race with another -+ * thread pulling the inode in and modifying it: -+ */ -+ -+ old = to_bch_ei(insert_inode_locked2(&inode->v)); -+ if (unlikely(old)) { -+ /* -+ * We raced, another process pulled the new inode into cache -+ * before us: -+ */ -+ journal_seq_copy(old, journal_seq); -+ make_bad_inode(&inode->v); -+ iput(&inode->v); -+ -+ inode = old; -+ } else { -+ /* -+ * we really don't want insert_inode_locked2() to be setting 
-+ * I_NEW... -+ */ -+ unlock_new_inode(&inode->v); -+ } -+ -+ bch2_trans_exit(&trans); -+err: -+ posix_acl_release(default_acl); -+ posix_acl_release(acl); -+ return inode; -+err_trans: -+ if (!tmpfile) -+ mutex_unlock(&dir->ei_update_lock); -+ -+ bch2_trans_exit(&trans); -+ make_bad_inode(&inode->v); -+ iput(&inode->v); -+ inode = ERR_PTR(ret); -+ goto err; -+} -+ -+/* methods */ -+ -+static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, -+ unsigned int flags) -+{ -+ struct bch_fs *c = vdir->i_sb->s_fs_info; -+ struct bch_inode_info *dir = to_bch_ei(vdir); -+ struct inode *vinode = NULL; -+ u64 inum; -+ -+ inum = bch2_dirent_lookup(c, dir->v.i_ino, -+ &dir->ei_str_hash, -+ &dentry->d_name); -+ -+ if (inum) -+ vinode = bch2_vfs_inode_get(c, inum); -+ -+ return d_splice_alias(vinode, dentry); -+} -+ -+static int bch2_mknod(struct inode *vdir, struct dentry *dentry, -+ umode_t mode, dev_t rdev) -+{ -+ struct bch_inode_info *inode = -+ __bch2_create(to_bch_ei(vdir), dentry, mode, rdev, false); -+ -+ if (IS_ERR(inode)) -+ return PTR_ERR(inode); -+ -+ d_instantiate(dentry, &inode->v); -+ return 0; -+} -+ -+static int bch2_create(struct inode *vdir, struct dentry *dentry, -+ umode_t mode, bool excl) -+{ -+ return bch2_mknod(vdir, dentry, mode|S_IFREG, 0); -+} -+ -+static int __bch2_link(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct bch_inode_info *dir, -+ struct dentry *dentry) -+{ -+ struct btree_trans trans; -+ struct bch_inode_unpacked dir_u, inode_u; -+ int ret; -+ -+ mutex_lock(&inode->ei_update_lock); -+ bch2_trans_init(&trans, c, 4, 1024); -+ -+ do { -+ bch2_trans_begin(&trans); -+ ret = bch2_link_trans(&trans, -+ dir->v.i_ino, -+ inode->v.i_ino, &dir_u, &inode_u, -+ &dentry->d_name) ?: -+ bch2_trans_commit(&trans, NULL, -+ &inode->ei_journal_seq, -+ BTREE_INSERT_NOUNLOCK); -+ } while (ret == -EINTR); -+ -+ if (likely(!ret)) { -+ BUG_ON(inode_u.bi_inum != inode->v.i_ino); -+ -+ journal_seq_copy(inode, dir->ei_journal_seq); -+ 
bch2_inode_update_after_write(c, dir, &dir_u, -+ ATTR_MTIME|ATTR_CTIME); -+ bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME); -+ } -+ -+ bch2_trans_exit(&trans); -+ mutex_unlock(&inode->ei_update_lock); -+ return ret; -+} -+ -+static int bch2_link(struct dentry *old_dentry, struct inode *vdir, -+ struct dentry *dentry) -+{ -+ struct bch_fs *c = vdir->i_sb->s_fs_info; -+ struct bch_inode_info *dir = to_bch_ei(vdir); -+ struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode); -+ int ret; -+ -+ lockdep_assert_held(&inode->v.i_rwsem); -+ -+ ret = __bch2_link(c, inode, dir, dentry); -+ if (unlikely(ret)) -+ return ret; -+ -+ ihold(&inode->v); -+ d_instantiate(dentry, &inode->v); -+ return 0; -+} -+ -+static int bch2_unlink(struct inode *vdir, struct dentry *dentry) -+{ -+ struct bch_fs *c = vdir->i_sb->s_fs_info; -+ struct bch_inode_info *dir = to_bch_ei(vdir); -+ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); -+ struct bch_inode_unpacked dir_u, inode_u; -+ struct btree_trans trans; -+ int ret; -+ -+ bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); -+ bch2_trans_init(&trans, c, 4, 1024); -+ -+ do { -+ bch2_trans_begin(&trans); -+ -+ ret = bch2_unlink_trans(&trans, -+ dir->v.i_ino, &dir_u, -+ &inode_u, &dentry->d_name) ?: -+ bch2_trans_commit(&trans, NULL, -+ &dir->ei_journal_seq, -+ BTREE_INSERT_NOUNLOCK| -+ BTREE_INSERT_NOFAIL); -+ } while (ret == -EINTR); -+ -+ if (likely(!ret)) { -+ BUG_ON(inode_u.bi_inum != inode->v.i_ino); -+ -+ journal_seq_copy(inode, dir->ei_journal_seq); -+ bch2_inode_update_after_write(c, dir, &dir_u, -+ ATTR_MTIME|ATTR_CTIME); -+ bch2_inode_update_after_write(c, inode, &inode_u, -+ ATTR_MTIME); -+ } -+ -+ bch2_trans_exit(&trans); -+ bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); -+ -+ return ret; -+} -+ -+static int bch2_symlink(struct inode *vdir, struct dentry *dentry, -+ const char *symname) -+{ -+ struct bch_fs *c = vdir->i_sb->s_fs_info; -+ struct bch_inode_info *dir = to_bch_ei(vdir), *inode; -+ int 
ret; -+ -+ inode = __bch2_create(dir, dentry, S_IFLNK|S_IRWXUGO, 0, true); -+ if (unlikely(IS_ERR(inode))) -+ return PTR_ERR(inode); -+ -+ inode_lock(&inode->v); -+ ret = page_symlink(&inode->v, symname, strlen(symname) + 1); -+ inode_unlock(&inode->v); -+ -+ if (unlikely(ret)) -+ goto err; -+ -+ ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX); -+ if (unlikely(ret)) -+ goto err; -+ -+ journal_seq_copy(dir, inode->ei_journal_seq); -+ -+ ret = __bch2_link(c, inode, dir, dentry); -+ if (unlikely(ret)) -+ goto err; -+ -+ d_instantiate(dentry, &inode->v); -+ return 0; -+err: -+ iput(&inode->v); -+ return ret; -+} -+ -+static int bch2_mkdir(struct inode *vdir, struct dentry *dentry, umode_t mode) -+{ -+ return bch2_mknod(vdir, dentry, mode|S_IFDIR, 0); -+} -+ -+static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry, -+ struct inode *dst_vdir, struct dentry *dst_dentry, -+ unsigned flags) -+{ -+ struct bch_fs *c = src_vdir->i_sb->s_fs_info; -+ struct bch_inode_info *src_dir = to_bch_ei(src_vdir); -+ struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir); -+ struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode); -+ struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode); -+ struct bch_inode_unpacked dst_dir_u, src_dir_u; -+ struct bch_inode_unpacked src_inode_u, dst_inode_u; -+ struct btree_trans trans; -+ enum bch_rename_mode mode = flags & RENAME_EXCHANGE -+ ? BCH_RENAME_EXCHANGE -+ : dst_dentry->d_inode -+ ? 
BCH_RENAME_OVERWRITE : BCH_RENAME; -+ u64 journal_seq = 0; -+ int ret; -+ -+ if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE)) -+ return -EINVAL; -+ -+ if (mode == BCH_RENAME_OVERWRITE) { -+ ret = filemap_write_and_wait_range(src_inode->v.i_mapping, -+ 0, LLONG_MAX); -+ if (ret) -+ return ret; -+ } -+ -+ bch2_trans_init(&trans, c, 8, 2048); -+ -+ bch2_lock_inodes(INODE_UPDATE_LOCK, -+ src_dir, -+ dst_dir, -+ src_inode, -+ dst_inode); -+ -+ if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) { -+ ret = bch2_fs_quota_transfer(c, src_inode, -+ dst_dir->ei_qid, -+ 1 << QTYP_PRJ, -+ KEY_TYPE_QUOTA_PREALLOC); -+ if (ret) -+ goto err; -+ } -+ -+ if (mode == BCH_RENAME_EXCHANGE && -+ inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) { -+ ret = bch2_fs_quota_transfer(c, dst_inode, -+ src_dir->ei_qid, -+ 1 << QTYP_PRJ, -+ KEY_TYPE_QUOTA_PREALLOC); -+ if (ret) -+ goto err; -+ } -+ -+retry: -+ bch2_trans_begin(&trans); -+ ret = bch2_rename_trans(&trans, -+ src_dir->v.i_ino, &src_dir_u, -+ dst_dir->v.i_ino, &dst_dir_u, -+ &src_inode_u, -+ &dst_inode_u, -+ &src_dentry->d_name, -+ &dst_dentry->d_name, -+ mode) ?: -+ bch2_trans_commit(&trans, NULL, -+ &journal_seq, -+ BTREE_INSERT_NOUNLOCK); -+ if (ret == -EINTR) -+ goto retry; -+ if (unlikely(ret)) -+ goto err; -+ -+ BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum); -+ BUG_ON(dst_inode && -+ dst_inode->v.i_ino != dst_inode_u.bi_inum); -+ -+ bch2_inode_update_after_write(c, src_dir, &src_dir_u, -+ ATTR_MTIME|ATTR_CTIME); -+ journal_seq_copy(src_dir, journal_seq); -+ -+ if (src_dir != dst_dir) { -+ bch2_inode_update_after_write(c, dst_dir, &dst_dir_u, -+ ATTR_MTIME|ATTR_CTIME); -+ journal_seq_copy(dst_dir, journal_seq); -+ } -+ -+ bch2_inode_update_after_write(c, src_inode, &src_inode_u, -+ ATTR_CTIME); -+ journal_seq_copy(src_inode, journal_seq); -+ -+ if (dst_inode) { -+ bch2_inode_update_after_write(c, dst_inode, &dst_inode_u, -+ ATTR_CTIME); -+ journal_seq_copy(dst_inode, journal_seq); -+ } -+err: -+ 
bch2_trans_exit(&trans); -+ -+ bch2_fs_quota_transfer(c, src_inode, -+ bch_qid(&src_inode->ei_inode), -+ 1 << QTYP_PRJ, -+ KEY_TYPE_QUOTA_NOCHECK); -+ if (dst_inode) -+ bch2_fs_quota_transfer(c, dst_inode, -+ bch_qid(&dst_inode->ei_inode), -+ 1 << QTYP_PRJ, -+ KEY_TYPE_QUOTA_NOCHECK); -+ -+ bch2_unlock_inodes(INODE_UPDATE_LOCK, -+ src_dir, -+ dst_dir, -+ src_inode, -+ dst_inode); -+ -+ return ret; -+} -+ -+void bch2_setattr_copy(struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, -+ struct iattr *attr) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ unsigned int ia_valid = attr->ia_valid; -+ -+ if (ia_valid & ATTR_UID) -+ bi->bi_uid = from_kuid(c->vfs_sb->s_user_ns, attr->ia_uid); -+ if (ia_valid & ATTR_GID) -+ bi->bi_gid = from_kgid(c->vfs_sb->s_user_ns, attr->ia_gid); -+ -+ if (ia_valid & ATTR_ATIME) -+ bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime); -+ if (ia_valid & ATTR_MTIME) -+ bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime); -+ if (ia_valid & ATTR_CTIME) -+ bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime); -+ -+ if (ia_valid & ATTR_MODE) { -+ umode_t mode = attr->ia_mode; -+ kgid_t gid = ia_valid & ATTR_GID -+ ? 
attr->ia_gid -+ : inode->v.i_gid; -+ -+ if (!in_group_p(gid) && -+ !capable_wrt_inode_uidgid(&inode->v, CAP_FSETID)) -+ mode &= ~S_ISGID; -+ bi->bi_mode = mode; -+ } -+} -+ -+static int bch2_setattr_nonsize(struct bch_inode_info *inode, -+ struct iattr *attr) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch_qid qid; -+ struct btree_trans trans; -+ struct btree_iter *inode_iter; -+ struct bch_inode_unpacked inode_u; -+ struct posix_acl *acl = NULL; -+ int ret; -+ -+ mutex_lock(&inode->ei_update_lock); -+ -+ qid = inode->ei_qid; -+ -+ if (attr->ia_valid & ATTR_UID) -+ qid.q[QTYP_USR] = from_kuid(&init_user_ns, attr->ia_uid); -+ -+ if (attr->ia_valid & ATTR_GID) -+ qid.q[QTYP_GRP] = from_kgid(&init_user_ns, attr->ia_gid); -+ -+ ret = bch2_fs_quota_transfer(c, inode, qid, ~0, -+ KEY_TYPE_QUOTA_PREALLOC); -+ if (ret) -+ goto err; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ kfree(acl); -+ acl = NULL; -+ -+ inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, -+ BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(inode_iter); -+ if (ret) -+ goto btree_err; -+ -+ bch2_setattr_copy(inode, &inode_u, attr); -+ -+ if (attr->ia_valid & ATTR_MODE) { -+ ret = bch2_acl_chmod(&trans, inode, inode_u.bi_mode, &acl); -+ if (ret) -+ goto btree_err; -+ } -+ -+ ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?: -+ bch2_trans_commit(&trans, NULL, -+ &inode->ei_journal_seq, -+ BTREE_INSERT_NOUNLOCK| -+ BTREE_INSERT_NOFAIL); -+btree_err: -+ if (ret == -EINTR) -+ goto retry; -+ if (unlikely(ret)) -+ goto err_trans; -+ -+ bch2_inode_update_after_write(c, inode, &inode_u, attr->ia_valid); -+ -+ if (acl) -+ set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); -+err_trans: -+ bch2_trans_exit(&trans); -+err: -+ mutex_unlock(&inode->ei_update_lock); -+ -+ return ret; -+} -+ -+static int bch2_getattr(const struct path *path, struct kstat *stat, -+ u32 request_mask, unsigned query_flags) -+{ -+ struct bch_inode_info *inode = 
to_bch_ei(d_inode(path->dentry)); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ -+ stat->dev = inode->v.i_sb->s_dev; -+ stat->ino = inode->v.i_ino; -+ stat->mode = inode->v.i_mode; -+ stat->nlink = inode->v.i_nlink; -+ stat->uid = inode->v.i_uid; -+ stat->gid = inode->v.i_gid; -+ stat->rdev = inode->v.i_rdev; -+ stat->size = i_size_read(&inode->v); -+ stat->atime = inode->v.i_atime; -+ stat->mtime = inode->v.i_mtime; -+ stat->ctime = inode->v.i_ctime; -+ stat->blksize = block_bytes(c); -+ stat->blocks = inode->v.i_blocks; -+ -+ if (request_mask & STATX_BTIME) { -+ stat->result_mask |= STATX_BTIME; -+ stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime); -+ } -+ -+ if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE) -+ stat->attributes |= STATX_ATTR_IMMUTABLE; -+ stat->attributes_mask |= STATX_ATTR_IMMUTABLE; -+ -+ if (inode->ei_inode.bi_flags & BCH_INODE_APPEND) -+ stat->attributes |= STATX_ATTR_APPEND; -+ stat->attributes_mask |= STATX_ATTR_APPEND; -+ -+ if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP) -+ stat->attributes |= STATX_ATTR_NODUMP; -+ stat->attributes_mask |= STATX_ATTR_NODUMP; -+ -+ return 0; -+} -+ -+static int bch2_setattr(struct dentry *dentry, struct iattr *iattr) -+{ -+ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); -+ int ret; -+ -+ lockdep_assert_held(&inode->v.i_rwsem); -+ -+ ret = setattr_prepare(dentry, iattr); -+ if (ret) -+ return ret; -+ -+ return iattr->ia_valid & ATTR_SIZE -+ ? 
bch2_truncate(inode, iattr) -+ : bch2_setattr_nonsize(inode, iattr); -+} -+ -+static int bch2_tmpfile(struct inode *vdir, struct dentry *dentry, umode_t mode) -+{ -+ struct bch_inode_info *inode = -+ __bch2_create(to_bch_ei(vdir), dentry, mode, 0, true); -+ -+ if (IS_ERR(inode)) -+ return PTR_ERR(inode); -+ -+ d_mark_tmpfile(dentry, &inode->v); -+ d_instantiate(dentry, &inode->v); -+ return 0; -+} -+ -+static int bch2_fill_extent(struct bch_fs *c, -+ struct fiemap_extent_info *info, -+ struct bkey_s_c k, unsigned flags) -+{ -+ if (bkey_extent_is_data(k.k)) { -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ int ret; -+ -+ if (k.k->type == KEY_TYPE_reflink_v) -+ flags |= FIEMAP_EXTENT_SHARED; -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ int flags2 = 0; -+ u64 offset = p.ptr.offset; -+ -+ if (p.crc.compression_type) -+ flags2 |= FIEMAP_EXTENT_ENCODED; -+ else -+ offset += p.crc.offset; -+ -+ if ((offset & (c->opts.block_size - 1)) || -+ (k.k->size & (c->opts.block_size - 1))) -+ flags2 |= FIEMAP_EXTENT_NOT_ALIGNED; -+ -+ ret = fiemap_fill_next_extent(info, -+ bkey_start_offset(k.k) << 9, -+ offset << 9, -+ k.k->size << 9, flags|flags2); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+ } else if (k.k->type == KEY_TYPE_reservation) { -+ return fiemap_fill_next_extent(info, -+ bkey_start_offset(k.k) << 9, -+ 0, k.k->size << 9, -+ flags| -+ FIEMAP_EXTENT_DELALLOC| -+ FIEMAP_EXTENT_UNWRITTEN); -+ } else { -+ BUG(); -+ } -+} -+ -+static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, -+ u64 start, u64 len) -+{ -+ struct bch_fs *c = vinode->i_sb->s_fs_info; -+ struct bch_inode_info *ei = to_bch_ei(vinode); -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct bkey_on_stack cur, prev; -+ struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); -+ unsigned offset_into_extent, sectors; -+ bool have_extent = false; -+ int ret = 0; -+ 
-+ ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC); -+ if (ret) -+ return ret; -+ -+ if (start + len < start) -+ return -EINVAL; -+ -+ bkey_on_stack_init(&cur); -+ bkey_on_stack_init(&prev); -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, -+ POS(ei->v.i_ino, start >> 9), 0); -+retry: -+ while ((k = bch2_btree_iter_peek(iter)).k && -+ !(ret = bkey_err(k)) && -+ bkey_cmp(iter->pos, end) < 0) { -+ if (!bkey_extent_is_data(k.k) && -+ k.k->type != KEY_TYPE_reservation) { -+ bch2_btree_iter_next(iter); -+ continue; -+ } -+ -+ bkey_on_stack_realloc(&cur, c, k.k->u64s); -+ bkey_on_stack_realloc(&prev, c, k.k->u64s); -+ bkey_reassemble(cur.k, k); -+ k = bkey_i_to_s_c(cur.k); -+ -+ offset_into_extent = iter->pos.offset - -+ bkey_start_offset(k.k); -+ sectors = k.k->size - offset_into_extent; -+ -+ ret = bch2_read_indirect_extent(&trans, -+ &offset_into_extent, &cur); -+ if (ret) -+ break; -+ -+ sectors = min(sectors, k.k->size - offset_into_extent); -+ -+ if (offset_into_extent) -+ bch2_cut_front(POS(k.k->p.inode, -+ bkey_start_offset(k.k) + -+ offset_into_extent), -+ cur.k); -+ bch2_key_resize(&cur.k->k, sectors); -+ cur.k->k.p = iter->pos; -+ cur.k->k.p.offset += cur.k->k.size; -+ -+ if (have_extent) { -+ ret = bch2_fill_extent(c, info, -+ bkey_i_to_s_c(prev.k), 0); -+ if (ret) -+ break; -+ } -+ -+ bkey_copy(prev.k, cur.k); -+ have_extent = true; -+ -+ if (k.k->type == KEY_TYPE_reflink_v) -+ bch2_btree_iter_set_pos(iter, k.k->p); -+ else -+ bch2_btree_iter_next(iter); -+ } -+ -+ if (ret == -EINTR) -+ goto retry; -+ -+ if (!ret && have_extent) -+ ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), -+ FIEMAP_EXTENT_LAST); -+ -+ ret = bch2_trans_exit(&trans) ?: ret; -+ bkey_on_stack_exit(&cur, c); -+ bkey_on_stack_exit(&prev, c); -+ return ret < 0 ? 
ret : 0; -+} -+ -+static const struct vm_operations_struct bch_vm_ops = { -+ .fault = bch2_page_fault, -+ .map_pages = filemap_map_pages, -+ .page_mkwrite = bch2_page_mkwrite, -+}; -+ -+static int bch2_mmap(struct file *file, struct vm_area_struct *vma) -+{ -+ file_accessed(file); -+ -+ vma->vm_ops = &bch_vm_ops; -+ return 0; -+} -+ -+/* Directories: */ -+ -+static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence) -+{ -+ return generic_file_llseek_size(file, offset, whence, -+ S64_MAX, S64_MAX); -+} -+ -+static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) -+{ -+ struct bch_inode_info *inode = file_bch_inode(file); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ -+ if (!dir_emit_dots(file, ctx)) -+ return 0; -+ -+ return bch2_readdir(c, inode->v.i_ino, ctx); -+} -+ -+static const struct file_operations bch_file_operations = { -+ .llseek = bch2_llseek, -+ .read_iter = bch2_read_iter, -+ .write_iter = bch2_write_iter, -+ .mmap = bch2_mmap, -+ .open = generic_file_open, -+ .fsync = bch2_fsync, -+ .splice_read = generic_file_splice_read, -+ /* -+ * Broken, on v5.3: -+ .splice_write = iter_file_splice_write, -+ */ -+ .fallocate = bch2_fallocate_dispatch, -+ .unlocked_ioctl = bch2_fs_file_ioctl, -+#ifdef CONFIG_COMPAT -+ .compat_ioctl = bch2_compat_fs_ioctl, -+#endif -+ .remap_file_range = bch2_remap_file_range, -+}; -+ -+static const struct inode_operations bch_file_inode_operations = { -+ .getattr = bch2_getattr, -+ .setattr = bch2_setattr, -+ .fiemap = bch2_fiemap, -+ .listxattr = bch2_xattr_list, -+#ifdef CONFIG_BCACHEFS_POSIX_ACL -+ .get_acl = bch2_get_acl, -+ .set_acl = bch2_set_acl, -+#endif -+}; -+ -+static const struct inode_operations bch_dir_inode_operations = { -+ .lookup = bch2_lookup, -+ .create = bch2_create, -+ .link = bch2_link, -+ .unlink = bch2_unlink, -+ .symlink = bch2_symlink, -+ .mkdir = bch2_mkdir, -+ .rmdir = bch2_unlink, -+ .mknod = bch2_mknod, -+ .rename = bch2_rename2, -+ .getattr = bch2_getattr, -+ 
.setattr = bch2_setattr, -+ .tmpfile = bch2_tmpfile, -+ .listxattr = bch2_xattr_list, -+#ifdef CONFIG_BCACHEFS_POSIX_ACL -+ .get_acl = bch2_get_acl, -+ .set_acl = bch2_set_acl, -+#endif -+}; -+ -+static const struct file_operations bch_dir_file_operations = { -+ .llseek = bch2_dir_llseek, -+ .read = generic_read_dir, -+ .iterate_shared = bch2_vfs_readdir, -+ .fsync = bch2_fsync, -+ .unlocked_ioctl = bch2_fs_file_ioctl, -+#ifdef CONFIG_COMPAT -+ .compat_ioctl = bch2_compat_fs_ioctl, -+#endif -+}; -+ -+static const struct inode_operations bch_symlink_inode_operations = { -+ .get_link = page_get_link, -+ .getattr = bch2_getattr, -+ .setattr = bch2_setattr, -+ .listxattr = bch2_xattr_list, -+#ifdef CONFIG_BCACHEFS_POSIX_ACL -+ .get_acl = bch2_get_acl, -+ .set_acl = bch2_set_acl, -+#endif -+}; -+ -+static const struct inode_operations bch_special_inode_operations = { -+ .getattr = bch2_getattr, -+ .setattr = bch2_setattr, -+ .listxattr = bch2_xattr_list, -+#ifdef CONFIG_BCACHEFS_POSIX_ACL -+ .get_acl = bch2_get_acl, -+ .set_acl = bch2_set_acl, -+#endif -+}; -+ -+static const struct address_space_operations bch_address_space_operations = { -+ .writepage = bch2_writepage, -+ .readpage = bch2_readpage, -+ .writepages = bch2_writepages, -+ .readpages = bch2_readpages, -+ .set_page_dirty = __set_page_dirty_nobuffers, -+ .write_begin = bch2_write_begin, -+ .write_end = bch2_write_end, -+ .invalidatepage = bch2_invalidatepage, -+ .releasepage = bch2_releasepage, -+ .direct_IO = noop_direct_IO, -+#ifdef CONFIG_MIGRATION -+ .migratepage = bch2_migrate_page, -+#endif -+ .error_remove_page = generic_error_remove_page, -+}; -+ -+static struct inode *bch2_nfs_get_inode(struct super_block *sb, -+ u64 ino, u32 generation) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ struct inode *vinode; -+ -+ if (ino < BCACHEFS_ROOT_INO) -+ return ERR_PTR(-ESTALE); -+ -+ vinode = bch2_vfs_inode_get(c, ino); -+ if (IS_ERR(vinode)) -+ return ERR_CAST(vinode); -+ if (generation && vinode->i_generation != 
generation) { -+ /* we didn't find the right inode.. */ -+ iput(vinode); -+ return ERR_PTR(-ESTALE); -+ } -+ return vinode; -+} -+ -+static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *fid, -+ int fh_len, int fh_type) -+{ -+ return generic_fh_to_dentry(sb, fid, fh_len, fh_type, -+ bch2_nfs_get_inode); -+} -+ -+static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *fid, -+ int fh_len, int fh_type) -+{ -+ return generic_fh_to_parent(sb, fid, fh_len, fh_type, -+ bch2_nfs_get_inode); -+} -+ -+static const struct export_operations bch_export_ops = { -+ .fh_to_dentry = bch2_fh_to_dentry, -+ .fh_to_parent = bch2_fh_to_parent, -+ //.get_parent = bch2_get_parent, -+}; -+ -+static void bch2_vfs_inode_init(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi) -+{ -+ bch2_inode_update_after_write(c, inode, bi, ~0); -+ -+ inode->v.i_blocks = bi->bi_sectors; -+ inode->v.i_ino = bi->bi_inum; -+ inode->v.i_rdev = bi->bi_dev; -+ inode->v.i_generation = bi->bi_generation; -+ inode->v.i_size = bi->bi_size; -+ -+ inode->ei_journal_seq = 0; -+ inode->ei_quota_reserved = 0; -+ inode->ei_str_hash = bch2_hash_info_init(c, bi); -+ inode->ei_qid = bch_qid(bi); -+ -+ inode->v.i_mapping->a_ops = &bch_address_space_operations; -+ -+ switch (inode->v.i_mode & S_IFMT) { -+ case S_IFREG: -+ inode->v.i_op = &bch_file_inode_operations; -+ inode->v.i_fop = &bch_file_operations; -+ break; -+ case S_IFDIR: -+ inode->v.i_op = &bch_dir_inode_operations; -+ inode->v.i_fop = &bch_dir_file_operations; -+ break; -+ case S_IFLNK: -+ inode_nohighmem(&inode->v); -+ inode->v.i_op = &bch_symlink_inode_operations; -+ break; -+ default: -+ init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev); -+ inode->v.i_op = &bch_special_inode_operations; -+ break; -+ } -+} -+ -+static struct inode *bch2_alloc_inode(struct super_block *sb) -+{ -+ struct bch_inode_info *inode; -+ -+ inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS); -+ if 
(!inode) -+ return NULL; -+ -+ inode_init_once(&inode->v); -+ mutex_init(&inode->ei_update_lock); -+ pagecache_lock_init(&inode->ei_pagecache_lock); -+ mutex_init(&inode->ei_quota_lock); -+ inode->ei_journal_seq = 0; -+ -+ return &inode->v; -+} -+ -+static void bch2_i_callback(struct rcu_head *head) -+{ -+ struct inode *vinode = container_of(head, struct inode, i_rcu); -+ struct bch_inode_info *inode = to_bch_ei(vinode); -+ -+ kmem_cache_free(bch2_inode_cache, inode); -+} -+ -+static void bch2_destroy_inode(struct inode *vinode) -+{ -+ call_rcu(&vinode->i_rcu, bch2_i_callback); -+} -+ -+static int inode_update_times_fn(struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, -+ void *p) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ -+ bi->bi_atime = timespec_to_bch2_time(c, inode->v.i_atime); -+ bi->bi_mtime = timespec_to_bch2_time(c, inode->v.i_mtime); -+ bi->bi_ctime = timespec_to_bch2_time(c, inode->v.i_ctime); -+ -+ return 0; -+} -+ -+static int bch2_vfs_write_inode(struct inode *vinode, -+ struct writeback_control *wbc) -+{ -+ struct bch_fs *c = vinode->i_sb->s_fs_info; -+ struct bch_inode_info *inode = to_bch_ei(vinode); -+ int ret; -+ -+ mutex_lock(&inode->ei_update_lock); -+ ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, -+ ATTR_ATIME|ATTR_MTIME|ATTR_CTIME); -+ mutex_unlock(&inode->ei_update_lock); -+ -+ return ret; -+} -+ -+static void bch2_evict_inode(struct inode *vinode) -+{ -+ struct bch_fs *c = vinode->i_sb->s_fs_info; -+ struct bch_inode_info *inode = to_bch_ei(vinode); -+ -+ truncate_inode_pages_final(&inode->v.i_data); -+ -+ clear_inode(&inode->v); -+ -+ BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved); -+ -+ if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) { -+ bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks), -+ KEY_TYPE_QUOTA_WARN); -+ bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, -+ KEY_TYPE_QUOTA_WARN); -+ bch2_inode_rm(c, inode->v.i_ino); -+ } -+} -+ -+static int 
bch2_statfs(struct dentry *dentry, struct kstatfs *buf) -+{ -+ struct super_block *sb = dentry->d_sb; -+ struct bch_fs *c = sb->s_fs_info; -+ struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c); -+ unsigned shift = sb->s_blocksize_bits - 9; -+ u64 fsid; -+ -+ buf->f_type = BCACHEFS_STATFS_MAGIC; -+ buf->f_bsize = sb->s_blocksize; -+ buf->f_blocks = usage.capacity >> shift; -+ buf->f_bfree = (usage.capacity - usage.used) >> shift; -+ buf->f_bavail = buf->f_bfree; -+ buf->f_files = 0; -+ buf->f_ffree = 0; -+ -+ fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^ -+ le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64)); -+ buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; -+ buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; -+ buf->f_namelen = BCH_NAME_MAX; -+ -+ return 0; -+} -+ -+static int bch2_sync_fs(struct super_block *sb, int wait) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ -+ if (c->opts.journal_flush_disabled) -+ return 0; -+ -+ if (!wait) { -+ bch2_journal_flush_async(&c->journal, NULL); -+ return 0; -+ } -+ -+ return bch2_journal_flush(&c->journal); -+} -+ -+static struct bch_fs *bch2_path_to_fs(const char *dev) -+{ -+ struct bch_fs *c; -+ struct block_device *bdev = lookup_bdev(dev); -+ -+ if (IS_ERR(bdev)) -+ return ERR_CAST(bdev); -+ -+ c = bch2_bdev_to_fs(bdev); -+ bdput(bdev); -+ return c ?: ERR_PTR(-ENOENT); -+} -+ -+static struct bch_fs *__bch2_open_as_blockdevs(const char *dev_name, char * const *devs, -+ unsigned nr_devs, struct bch_opts opts) -+{ -+ struct bch_fs *c, *c1, *c2; -+ size_t i; -+ -+ if (!nr_devs) -+ return ERR_PTR(-EINVAL); -+ -+ c = bch2_fs_open(devs, nr_devs, opts); -+ -+ if (IS_ERR(c) && PTR_ERR(c) == -EBUSY) { -+ /* -+ * Already open? 
-+ * Look up each block device, make sure they all belong to a -+ * filesystem and they all belong to the _same_ filesystem -+ */ -+ -+ c1 = bch2_path_to_fs(devs[0]); -+ if (IS_ERR(c1)) -+ return c; -+ -+ for (i = 1; i < nr_devs; i++) { -+ c2 = bch2_path_to_fs(devs[i]); -+ if (!IS_ERR(c2)) -+ closure_put(&c2->cl); -+ -+ if (c1 != c2) { -+ closure_put(&c1->cl); -+ return c; -+ } -+ } -+ -+ c = c1; -+ } -+ -+ if (IS_ERR(c)) -+ return c; -+ -+ down_write(&c->state_lock); -+ -+ if (!test_bit(BCH_FS_STARTED, &c->flags)) { -+ up_write(&c->state_lock); -+ closure_put(&c->cl); -+ pr_err("err mounting %s: incomplete filesystem", dev_name); -+ return ERR_PTR(-EINVAL); -+ } -+ -+ up_write(&c->state_lock); -+ -+ set_bit(BCH_FS_BDEV_MOUNTED, &c->flags); -+ return c; -+} -+ -+static struct bch_fs *bch2_open_as_blockdevs(const char *_dev_name, -+ struct bch_opts opts) -+{ -+ char *dev_name = NULL, **devs = NULL, *s; -+ struct bch_fs *c = ERR_PTR(-ENOMEM); -+ size_t i, nr_devs = 0; -+ -+ dev_name = kstrdup(_dev_name, GFP_KERNEL); -+ if (!dev_name) -+ goto err; -+ -+ for (s = dev_name; s; s = strchr(s + 1, ':')) -+ nr_devs++; -+ -+ devs = kcalloc(nr_devs, sizeof(const char *), GFP_KERNEL); -+ if (!devs) -+ goto err; -+ -+ for (i = 0, s = dev_name; -+ s; -+ (s = strchr(s, ':')) && (*s++ = '\0')) -+ devs[i++] = s; -+ -+ c = __bch2_open_as_blockdevs(_dev_name, devs, nr_devs, opts); -+err: -+ kfree(devs); -+ kfree(dev_name); -+ return c; -+} -+ -+static int bch2_remount(struct super_block *sb, int *flags, char *data) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ struct bch_opts opts = bch2_opts_empty(); -+ int ret; -+ -+ opt_set(opts, read_only, (*flags & SB_RDONLY) != 0); -+ -+ ret = bch2_parse_mount_opts(&opts, data); -+ if (ret) -+ return ret; -+ -+ if (opts.read_only != c->opts.read_only) { -+ down_write(&c->state_lock); -+ -+ if (opts.read_only) { -+ bch2_fs_read_only(c); -+ -+ sb->s_flags |= SB_RDONLY; -+ } else { -+ ret = bch2_fs_read_write(c); -+ if (ret) { -+ bch_err(c, "error 
going rw: %i", ret); -+ up_write(&c->state_lock); -+ return -EINVAL; -+ } -+ -+ sb->s_flags &= ~SB_RDONLY; -+ } -+ -+ c->opts.read_only = opts.read_only; -+ -+ up_write(&c->state_lock); -+ } -+ -+ if (opts.errors >= 0) -+ c->opts.errors = opts.errors; -+ -+ return ret; -+} -+ -+static int bch2_show_devname(struct seq_file *seq, struct dentry *root) -+{ -+ struct bch_fs *c = root->d_sb->s_fs_info; -+ struct bch_dev *ca; -+ unsigned i; -+ bool first = true; -+ -+ for_each_online_member(ca, c, i) { -+ if (!first) -+ seq_putc(seq, ':'); -+ first = false; -+ seq_puts(seq, "/dev/"); -+ seq_puts(seq, ca->name); -+ } -+ -+ return 0; -+} -+ -+static int bch2_show_options(struct seq_file *seq, struct dentry *root) -+{ -+ struct bch_fs *c = root->d_sb->s_fs_info; -+ enum bch_opt_id i; -+ char buf[512]; -+ -+ for (i = 0; i < bch2_opts_nr; i++) { -+ const struct bch_option *opt = &bch2_opt_table[i]; -+ u64 v = bch2_opt_get_by_id(&c->opts, i); -+ -+ if (!(opt->mode & OPT_MOUNT)) -+ continue; -+ -+ if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) -+ continue; -+ -+ bch2_opt_to_text(&PBUF(buf), c, opt, v, -+ OPT_SHOW_MOUNT_STYLE); -+ seq_putc(seq, ','); -+ seq_puts(seq, buf); -+ } -+ -+ return 0; -+} -+ -+static const struct super_operations bch_super_operations = { -+ .alloc_inode = bch2_alloc_inode, -+ .destroy_inode = bch2_destroy_inode, -+ .write_inode = bch2_vfs_write_inode, -+ .evict_inode = bch2_evict_inode, -+ .sync_fs = bch2_sync_fs, -+ .statfs = bch2_statfs, -+ .show_devname = bch2_show_devname, -+ .show_options = bch2_show_options, -+ .remount_fs = bch2_remount, -+#if 0 -+ .put_super = bch2_put_super, -+ .freeze_fs = bch2_freeze, -+ .unfreeze_fs = bch2_unfreeze, -+#endif -+}; -+ -+static int bch2_test_super(struct super_block *s, void *data) -+{ -+ return s->s_fs_info == data; -+} -+ -+static int bch2_set_super(struct super_block *s, void *data) -+{ -+ s->s_fs_info = data; -+ return 0; -+} -+ -+static struct dentry *bch2_mount(struct file_system_type *fs_type, -+ int 
flags, const char *dev_name, void *data) -+{ -+ struct bch_fs *c; -+ struct bch_dev *ca; -+ struct super_block *sb; -+ struct inode *vinode; -+ struct bch_opts opts = bch2_opts_empty(); -+ unsigned i; -+ int ret; -+ -+ opt_set(opts, read_only, (flags & SB_RDONLY) != 0); -+ -+ ret = bch2_parse_mount_opts(&opts, data); -+ if (ret) -+ return ERR_PTR(ret); -+ -+ c = bch2_open_as_blockdevs(dev_name, opts); -+ if (IS_ERR(c)) -+ return ERR_CAST(c); -+ -+ sb = sget(fs_type, bch2_test_super, bch2_set_super, flags|SB_NOSEC, c); -+ if (IS_ERR(sb)) { -+ closure_put(&c->cl); -+ return ERR_CAST(sb); -+ } -+ -+ BUG_ON(sb->s_fs_info != c); -+ -+ if (sb->s_root) { -+ closure_put(&c->cl); -+ -+ if ((flags ^ sb->s_flags) & SB_RDONLY) { -+ ret = -EBUSY; -+ goto err_put_super; -+ } -+ goto out; -+ } -+ -+ sb->s_blocksize = block_bytes(c); -+ sb->s_blocksize_bits = ilog2(block_bytes(c)); -+ sb->s_maxbytes = MAX_LFS_FILESIZE; -+ sb->s_op = &bch_super_operations; -+ sb->s_export_op = &bch_export_ops; -+#ifdef CONFIG_BCACHEFS_QUOTA -+ sb->s_qcop = &bch2_quotactl_operations; -+ sb->s_quota_types = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ; -+#endif -+ sb->s_xattr = bch2_xattr_handlers; -+ sb->s_magic = BCACHEFS_STATFS_MAGIC; -+ sb->s_time_gran = c->sb.time_precision; -+ c->vfs_sb = sb; -+ strlcpy(sb->s_id, c->name, sizeof(sb->s_id)); -+ -+ ret = super_setup_bdi(sb); -+ if (ret) -+ goto err_put_super; -+ -+ sb->s_bdi->congested_fn = bch2_congested; -+ sb->s_bdi->congested_data = c; -+ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; -+ -+ for_each_online_member(ca, c, i) { -+ struct block_device *bdev = ca->disk_sb.bdev; -+ -+ /* XXX: create an anonymous device for multi device filesystems */ -+ sb->s_bdev = bdev; -+ sb->s_dev = bdev->bd_dev; -+ percpu_ref_put(&ca->io_ref); -+ break; -+ } -+ -+#ifdef CONFIG_BCACHEFS_POSIX_ACL -+ if (c->opts.acl) -+ sb->s_flags |= SB_POSIXACL; -+#endif -+ -+ vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_INO); -+ if (IS_ERR(vinode)) { -+ bch_err(c, "error mounting: 
error getting root inode %i", -+ (int) PTR_ERR(vinode)); -+ ret = PTR_ERR(vinode); -+ goto err_put_super; -+ } -+ -+ sb->s_root = d_make_root(vinode); -+ if (!sb->s_root) { -+ bch_err(c, "error mounting: error allocating root dentry"); -+ ret = -ENOMEM; -+ goto err_put_super; -+ } -+ -+ sb->s_flags |= SB_ACTIVE; -+out: -+ return dget(sb->s_root); -+ -+err_put_super: -+ deactivate_locked_super(sb); -+ return ERR_PTR(ret); -+} -+ -+static void bch2_kill_sb(struct super_block *sb) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ -+ generic_shutdown_super(sb); -+ -+ if (test_bit(BCH_FS_BDEV_MOUNTED, &c->flags)) -+ bch2_fs_stop(c); -+ else -+ closure_put(&c->cl); -+} -+ -+static struct file_system_type bcache_fs_type = { -+ .owner = THIS_MODULE, -+ .name = "bcachefs", -+ .mount = bch2_mount, -+ .kill_sb = bch2_kill_sb, -+ .fs_flags = FS_REQUIRES_DEV, -+}; -+ -+MODULE_ALIAS_FS("bcachefs"); -+ -+void bch2_vfs_exit(void) -+{ -+ unregister_filesystem(&bcache_fs_type); -+ if (bch2_inode_cache) -+ kmem_cache_destroy(bch2_inode_cache); -+} -+ -+int __init bch2_vfs_init(void) -+{ -+ int ret = -ENOMEM; -+ -+ bch2_inode_cache = KMEM_CACHE(bch_inode_info, 0); -+ if (!bch2_inode_cache) -+ goto err; -+ -+ ret = register_filesystem(&bcache_fs_type); -+ if (ret) -+ goto err; -+ -+ return 0; -+err: -+ bch2_vfs_exit(); -+ return ret; -+} -+ -+#endif /* NO_BCACHEFS_FS */ -diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h -new file mode 100644 -index 000000000000..eda903a45325 ---- /dev/null -+++ b/fs/bcachefs/fs.h -@@ -0,0 +1,174 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_FS_H -+#define _BCACHEFS_FS_H -+ -+#include "inode.h" -+#include "opts.h" -+#include "str_hash.h" -+#include "quota_types.h" -+ -+#include -+#include -+ -+/* -+ * Two-state lock - can be taken for add or block - both states are shared, -+ * like read side of rwsem, but conflict with other state: -+ */ -+struct pagecache_lock { -+ atomic_long_t v; -+ wait_queue_head_t wait; -+}; -+ -+static inline void 
pagecache_lock_init(struct pagecache_lock *lock) -+{ -+ atomic_long_set(&lock->v, 0); -+ init_waitqueue_head(&lock->wait); -+} -+ -+void bch2_pagecache_add_put(struct pagecache_lock *); -+void bch2_pagecache_add_get(struct pagecache_lock *); -+void bch2_pagecache_block_put(struct pagecache_lock *); -+void bch2_pagecache_block_get(struct pagecache_lock *); -+ -+struct bch_inode_info { -+ struct inode v; -+ -+ struct mutex ei_update_lock; -+ u64 ei_journal_seq; -+ u64 ei_quota_reserved; -+ unsigned long ei_last_dirtied; -+ -+ struct pagecache_lock ei_pagecache_lock; -+ -+ struct mutex ei_quota_lock; -+ struct bch_qid ei_qid; -+ -+ struct bch_hash_info ei_str_hash; -+ -+ /* copy of inode in btree: */ -+ struct bch_inode_unpacked ei_inode; -+}; -+ -+#define to_bch_ei(_inode) \ -+ container_of_or_null(_inode, struct bch_inode_info, v) -+ -+static inline int ptrcmp(void *l, void *r) -+{ -+ return cmp_int(l, r); -+} -+ -+enum bch_inode_lock_op { -+ INODE_LOCK = (1U << 0), -+ INODE_PAGECACHE_BLOCK = (1U << 1), -+ INODE_UPDATE_LOCK = (1U << 2), -+}; -+ -+#define bch2_lock_inodes(_locks, ...) \ -+do { \ -+ struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \ -+ unsigned i; \ -+ \ -+ bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \ -+ \ -+ for (i = 1; i < ARRAY_SIZE(a); i++) \ -+ if (a[i] != a[i - 1]) { \ -+ if ((_locks) & INODE_LOCK) \ -+ down_write_nested(&a[i]->v.i_rwsem, i); \ -+ if ((_locks) & INODE_PAGECACHE_BLOCK) \ -+ bch2_pagecache_block_get(&a[i]->ei_pagecache_lock);\ -+ if ((_locks) & INODE_UPDATE_LOCK) \ -+ mutex_lock_nested(&a[i]->ei_update_lock, i);\ -+ } \ -+} while (0) -+ -+#define bch2_unlock_inodes(_locks, ...) 
\ -+do { \ -+ struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \ -+ unsigned i; \ -+ \ -+ bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \ -+ \ -+ for (i = 1; i < ARRAY_SIZE(a); i++) \ -+ if (a[i] != a[i - 1]) { \ -+ if ((_locks) & INODE_LOCK) \ -+ up_write(&a[i]->v.i_rwsem); \ -+ if ((_locks) & INODE_PAGECACHE_BLOCK) \ -+ bch2_pagecache_block_put(&a[i]->ei_pagecache_lock);\ -+ if ((_locks) & INODE_UPDATE_LOCK) \ -+ mutex_unlock(&a[i]->ei_update_lock); \ -+ } \ -+} while (0) -+ -+static inline struct bch_inode_info *file_bch_inode(struct file *file) -+{ -+ return to_bch_ei(file_inode(file)); -+} -+ -+static inline bool inode_attr_changing(struct bch_inode_info *dir, -+ struct bch_inode_info *inode, -+ enum inode_opt_id id) -+{ -+ return !(inode->ei_inode.bi_fields_set & (1 << id)) && -+ bch2_inode_opt_get(&dir->ei_inode, id) != -+ bch2_inode_opt_get(&inode->ei_inode, id); -+} -+ -+static inline bool inode_attrs_changing(struct bch_inode_info *dir, -+ struct bch_inode_info *inode) -+{ -+ unsigned id; -+ -+ for (id = 0; id < Inode_opt_nr; id++) -+ if (inode_attr_changing(dir, inode, id)) -+ return true; -+ -+ return false; -+} -+ -+struct bch_inode_unpacked; -+ -+#ifndef NO_BCACHEFS_FS -+ -+int bch2_fs_quota_transfer(struct bch_fs *, -+ struct bch_inode_info *, -+ struct bch_qid, -+ unsigned, -+ enum quota_acct_mode); -+ -+static inline int bch2_set_projid(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ u32 projid) -+{ -+ struct bch_qid qid = inode->ei_qid; -+ -+ qid.q[QTYP_PRJ] = projid; -+ -+ return bch2_fs_quota_transfer(c, inode, qid, -+ 1 << QTYP_PRJ, -+ KEY_TYPE_QUOTA_PREALLOC); -+} -+ -+struct inode *bch2_vfs_inode_get(struct bch_fs *, u64); -+ -+/* returns 0 if we want to do the update, or error is passed up */ -+typedef int (*inode_set_fn)(struct bch_inode_info *, -+ struct bch_inode_unpacked *, void *); -+ -+void bch2_inode_update_after_write(struct bch_fs *, -+ struct bch_inode_info *, -+ struct bch_inode_unpacked *, -+ unsigned); -+int 
__must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *, -+ inode_set_fn, void *, unsigned); -+ -+void bch2_vfs_exit(void); -+int bch2_vfs_init(void); -+ -+#else -+ -+static inline void bch2_vfs_exit(void) {} -+static inline int bch2_vfs_init(void) { return 0; } -+ -+#endif /* NO_BCACHEFS_FS */ -+ -+#endif /* _BCACHEFS_FS_H */ -diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c -new file mode 100644 -index 000000000000..5a6df3d1973a ---- /dev/null -+++ b/fs/bcachefs/fsck.c -@@ -0,0 +1,1502 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "bkey_on_stack.h" -+#include "btree_update.h" -+#include "dirent.h" -+#include "error.h" -+#include "fs-common.h" -+#include "fsck.h" -+#include "inode.h" -+#include "keylist.h" -+#include "super.h" -+#include "xattr.h" -+ -+#include /* struct qstr */ -+#include -+ -+#define QSTR(n) { { { .len = strlen(n) } }, .name = n } -+ -+static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) -+{ -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ u64 sectors = 0; -+ int ret; -+ -+ for_each_btree_key(trans, iter, BTREE_ID_EXTENTS, -+ POS(inum, 0), 0, k, ret) { -+ if (k.k->p.inode != inum) -+ break; -+ -+ if (bkey_extent_is_allocation(k.k)) -+ sectors += k.k->size; -+ } -+ -+ bch2_trans_iter_free(trans, iter); -+ -+ return ret ?: sectors; -+} -+ -+static int __remove_dirent(struct btree_trans *trans, -+ struct bkey_s_c_dirent dirent) -+{ -+ struct bch_fs *c = trans->c; -+ struct qstr name; -+ struct bch_inode_unpacked dir_inode; -+ struct bch_hash_info dir_hash_info; -+ u64 dir_inum = dirent.k->p.inode; -+ int ret; -+ char *buf; -+ -+ name.len = bch2_dirent_name_bytes(dirent); -+ buf = bch2_trans_kmalloc(trans, name.len + 1); -+ if (IS_ERR(buf)) -+ return PTR_ERR(buf); -+ -+ memcpy(buf, dirent.v->d_name, name.len); -+ buf[name.len] = '\0'; -+ name.name = buf; -+ -+ ret = bch2_inode_find_by_inum_trans(trans, dir_inum, &dir_inode); -+ if (ret && ret != -EINTR) -+ bch_err(c, 
"remove_dirent: err %i looking up directory inode", ret); -+ if (ret) -+ return ret; -+ -+ dir_hash_info = bch2_hash_info_init(c, &dir_inode); -+ -+ ret = bch2_hash_delete(trans, bch2_dirent_hash_desc, -+ &dir_hash_info, dir_inum, &name); -+ if (ret && ret != -EINTR) -+ bch_err(c, "remove_dirent: err %i deleting dirent", ret); -+ if (ret) -+ return ret; -+ -+ return 0; -+} -+ -+static int remove_dirent(struct btree_trans *trans, -+ struct bkey_s_c_dirent dirent) -+{ -+ return __bch2_trans_do(trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW, -+ __remove_dirent(trans, dirent)); -+} -+ -+static int reattach_inode(struct bch_fs *c, -+ struct bch_inode_unpacked *lostfound_inode, -+ u64 inum) -+{ -+ struct bch_inode_unpacked dir_u, inode_u; -+ char name_buf[20]; -+ struct qstr name; -+ int ret; -+ -+ snprintf(name_buf, sizeof(name_buf), "%llu", inum); -+ name = (struct qstr) QSTR(name_buf); -+ -+ ret = bch2_trans_do(c, NULL, NULL, -+ BTREE_INSERT_LAZY_RW, -+ bch2_link_trans(&trans, lostfound_inode->bi_inum, -+ inum, &dir_u, &inode_u, &name)); -+ if (ret) -+ bch_err(c, "error %i reattaching inode %llu", ret, inum); -+ -+ return ret; -+} -+ -+struct inode_walker { -+ bool first_this_inode; -+ bool have_inode; -+ u64 cur_inum; -+ struct bch_inode_unpacked inode; -+}; -+ -+static struct inode_walker inode_walker_init(void) -+{ -+ return (struct inode_walker) { -+ .cur_inum = -1, -+ .have_inode = false, -+ }; -+} -+ -+static int walk_inode(struct btree_trans *trans, -+ struct inode_walker *w, u64 inum) -+{ -+ if (inum != w->cur_inum) { -+ int ret = bch2_inode_find_by_inum_trans(trans, inum, -+ &w->inode); -+ -+ if (ret && ret != -ENOENT) -+ return ret; -+ -+ w->have_inode = !ret; -+ w->cur_inum = inum; -+ w->first_this_inode = true; -+ } else { -+ w->first_this_inode = false; -+ } -+ -+ return 0; -+} -+ -+struct hash_check { -+ struct bch_hash_info info; -+ -+ /* start of current chain of hash collisions: */ -+ struct btree_iter *chain; -+ -+ /* next offset 
in current chain of hash collisions: */ -+ u64 chain_end; -+}; -+ -+static void hash_check_init(struct hash_check *h) -+{ -+ h->chain = NULL; -+ h->chain_end = 0; -+} -+ -+static void hash_stop_chain(struct btree_trans *trans, -+ struct hash_check *h) -+{ -+ if (h->chain) -+ bch2_trans_iter_free(trans, h->chain); -+ h->chain = NULL; -+} -+ -+static void hash_check_set_inode(struct btree_trans *trans, -+ struct hash_check *h, -+ const struct bch_inode_unpacked *bi) -+{ -+ h->info = bch2_hash_info_init(trans->c, bi); -+ hash_stop_chain(trans, h); -+} -+ -+static int hash_redo_key(const struct bch_hash_desc desc, -+ struct btree_trans *trans, struct hash_check *h, -+ struct btree_iter *k_iter, struct bkey_s_c k, -+ u64 hashed) -+{ -+ struct bkey_i delete; -+ struct bkey_i *tmp; -+ -+ tmp = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); -+ if (IS_ERR(tmp)) -+ return PTR_ERR(tmp); -+ -+ bkey_reassemble(tmp, k); -+ -+ bkey_init(&delete.k); -+ delete.k.p = k_iter->pos; -+ bch2_trans_update(trans, k_iter, &delete, 0); -+ -+ return bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode, -+ tmp, BCH_HASH_SET_MUST_CREATE); -+} -+ -+static int fsck_hash_delete_at(struct btree_trans *trans, -+ const struct bch_hash_desc desc, -+ struct bch_hash_info *info, -+ struct btree_iter *iter) -+{ -+ int ret; -+retry: -+ ret = bch2_hash_delete_at(trans, desc, info, iter) ?: -+ bch2_trans_commit(trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW); -+ if (ret == -EINTR) { -+ ret = bch2_btree_iter_traverse(iter); -+ if (!ret) -+ goto retry; -+ } -+ -+ return ret; -+} -+ -+static int hash_check_duplicates(struct btree_trans *trans, -+ const struct bch_hash_desc desc, struct hash_check *h, -+ struct btree_iter *k_iter, struct bkey_s_c k) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter *iter; -+ struct bkey_s_c k2; -+ char buf[200]; -+ int ret = 0; -+ -+ if (!bkey_cmp(h->chain->pos, k_iter->pos)) -+ return 0; -+ -+ iter = bch2_trans_copy_iter(trans, h->chain); -+ 
BUG_ON(IS_ERR(iter)); -+ -+ for_each_btree_key_continue(iter, 0, k2, ret) { -+ if (bkey_cmp(k2.k->p, k.k->p) >= 0) -+ break; -+ -+ if (fsck_err_on(k2.k->type == desc.key_type && -+ !desc.cmp_bkey(k, k2), c, -+ "duplicate hash table keys:\n%s", -+ (bch2_bkey_val_to_text(&PBUF(buf), c, -+ k), buf))) { -+ ret = fsck_hash_delete_at(trans, desc, &h->info, k_iter); -+ if (ret) -+ return ret; -+ ret = 1; -+ break; -+ } -+ } -+fsck_err: -+ bch2_trans_iter_free(trans, iter); -+ return ret; -+} -+ -+static void hash_set_chain_start(struct btree_trans *trans, -+ const struct bch_hash_desc desc, -+ struct hash_check *h, -+ struct btree_iter *k_iter, struct bkey_s_c k) -+{ -+ bool hole = (k.k->type != KEY_TYPE_whiteout && -+ k.k->type != desc.key_type); -+ -+ if (hole || k.k->p.offset > h->chain_end + 1) -+ hash_stop_chain(trans, h); -+ -+ if (!hole) { -+ if (!h->chain) { -+ h->chain = bch2_trans_copy_iter(trans, k_iter); -+ BUG_ON(IS_ERR(h->chain)); -+ } -+ -+ h->chain_end = k.k->p.offset; -+ } -+} -+ -+static bool key_has_correct_hash(struct btree_trans *trans, -+ const struct bch_hash_desc desc, -+ struct hash_check *h, -+ struct btree_iter *k_iter, struct bkey_s_c k) -+{ -+ u64 hash; -+ -+ hash_set_chain_start(trans, desc, h, k_iter, k); -+ -+ if (k.k->type != desc.key_type) -+ return true; -+ -+ hash = desc.hash_bkey(&h->info, k); -+ -+ return hash >= h->chain->pos.offset && -+ hash <= k.k->p.offset; -+} -+ -+static int hash_check_key(struct btree_trans *trans, -+ const struct bch_hash_desc desc, struct hash_check *h, -+ struct btree_iter *k_iter, struct bkey_s_c k) -+{ -+ struct bch_fs *c = trans->c; -+ char buf[200]; -+ u64 hashed; -+ int ret = 0; -+ -+ hash_set_chain_start(trans, desc, h, k_iter, k); -+ -+ if (k.k->type != desc.key_type) -+ return 0; -+ -+ hashed = desc.hash_bkey(&h->info, k); -+ -+ if (fsck_err_on(hashed < h->chain->pos.offset || -+ hashed > k.k->p.offset, c, -+ "hash table key at wrong offset: btree %u, %llu, " -+ "hashed to %llu chain starts at 
%llu\n%s", -+ desc.btree_id, k.k->p.offset, -+ hashed, h->chain->pos.offset, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) { -+ ret = __bch2_trans_do(trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, -+ hash_redo_key(desc, trans, h, k_iter, k, hashed)); -+ if (ret) { -+ bch_err(c, "hash_redo_key err %i", ret); -+ return ret; -+ } -+ return 1; -+ } -+ -+ ret = hash_check_duplicates(trans, desc, h, k_iter, k); -+fsck_err: -+ return ret; -+} -+ -+static int check_dirent_hash(struct btree_trans *trans, struct hash_check *h, -+ struct btree_iter *iter, struct bkey_s_c *k) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_i_dirent *d = NULL; -+ int ret = -EINVAL; -+ char buf[200]; -+ unsigned len; -+ u64 hash; -+ -+ if (key_has_correct_hash(trans, bch2_dirent_hash_desc, h, iter, *k)) -+ return 0; -+ -+ len = bch2_dirent_name_bytes(bkey_s_c_to_dirent(*k)); -+ BUG_ON(!len); -+ -+ memcpy(buf, bkey_s_c_to_dirent(*k).v->d_name, len); -+ buf[len] = '\0'; -+ -+ d = kmalloc(bkey_bytes(k->k), GFP_KERNEL); -+ if (!d) { -+ bch_err(c, "memory allocation failure"); -+ return -ENOMEM; -+ } -+ -+ bkey_reassemble(&d->k_i, *k); -+ -+ do { -+ --len; -+ if (!len) -+ goto err_redo; -+ -+ d->k.u64s = BKEY_U64s + dirent_val_u64s(len); -+ -+ BUG_ON(bkey_val_bytes(&d->k) < -+ offsetof(struct bch_dirent, d_name) + len); -+ -+ memset(d->v.d_name + len, 0, -+ bkey_val_bytes(&d->k) - -+ offsetof(struct bch_dirent, d_name) - len); -+ -+ hash = bch2_dirent_hash_desc.hash_bkey(&h->info, -+ bkey_i_to_s_c(&d->k_i)); -+ } while (hash < h->chain->pos.offset || -+ hash > k->k->p.offset); -+ -+ if (fsck_err(c, "dirent with junk at end, was %s (%zu) now %s (%u)", -+ buf, strlen(buf), d->v.d_name, len)) { -+ ret = __bch2_trans_do(trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW, -+ (bch2_trans_update(trans, iter, &d->k_i, 0), 0)); -+ if (ret) -+ goto err; -+ -+ *k = bch2_btree_iter_peek(iter); -+ -+ BUG_ON(k->k->type != KEY_TYPE_dirent); -+ } -+err: -+fsck_err: -+ 
kfree(d); -+ return ret; -+err_redo: -+ hash = bch2_dirent_hash_desc.hash_bkey(&h->info, *k); -+ -+ if (fsck_err(c, "cannot fix dirent by removing trailing garbage %s (%zu)\n" -+ "hash table key at wrong offset: btree %u, offset %llu, " -+ "hashed to %llu chain starts at %llu\n%s", -+ buf, strlen(buf), BTREE_ID_DIRENTS, -+ k->k->p.offset, hash, h->chain->pos.offset, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, -+ *k), buf))) { -+ ret = __bch2_trans_do(trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, -+ hash_redo_key(bch2_dirent_hash_desc, trans, -+ h, iter, *k, hash)); -+ if (ret) -+ bch_err(c, "hash_redo_key err %i", ret); -+ else -+ ret = 1; -+ } -+ -+ goto err; -+} -+ -+static int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size) -+{ -+ return bch2_btree_delete_range(c, BTREE_ID_EXTENTS, -+ POS(inode_nr, round_up(new_size, block_bytes(c)) >> 9), -+ POS(inode_nr + 1, 0), NULL); -+} -+ -+static int bch2_fix_overlapping_extent(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c k, struct bpos cut_at) -+{ -+ struct btree_iter *u_iter; -+ struct bkey_i *u; -+ int ret; -+ -+ u = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); -+ ret = PTR_ERR_OR_ZERO(u); -+ if (ret) -+ return ret; -+ -+ bkey_reassemble(u, k); -+ bch2_cut_front(cut_at, u); -+ -+ u_iter = bch2_trans_copy_iter(trans, iter); -+ ret = PTR_ERR_OR_ZERO(u_iter); -+ if (ret) -+ return ret; -+ -+ /* -+ * We don't want to go through the -+ * extent_handle_overwrites path: -+ */ -+ __bch2_btree_iter_set_pos(u_iter, u->k.p, false); -+ -+ /* -+ * XXX: this is going to leave disk space -+ * accounting slightly wrong -+ */ -+ ret = bch2_trans_update(trans, u_iter, u, 0); -+ bch2_trans_iter_put(trans, u_iter); -+ return ret; -+} -+ -+/* -+ * Walk extents: verify that extents have a corresponding S_ISREG inode, and -+ * that i_size an i_sectors are consistent -+ */ -+noinline_for_stack -+static int check_extents(struct bch_fs *c) -+{ -+ struct inode_walker w = 
inode_walker_init(); -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct bkey_on_stack prev; -+ u64 i_sectors; -+ int ret = 0; -+ -+ bkey_on_stack_init(&prev); -+ prev.k->k = KEY(0, 0, 0); -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ bch_verbose(c, "checking extents"); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, -+ POS(BCACHEFS_ROOT_INO, 0), -+ BTREE_ITER_INTENT); -+retry: -+ for_each_btree_key_continue(iter, 0, k, ret) { -+ if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { -+ char buf1[200]; -+ char buf2[200]; -+ -+ bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k)); -+ bch2_bkey_val_to_text(&PBUF(buf2), c, k); -+ -+ if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) { -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW, -+ bch2_fix_overlapping_extent(&trans, -+ iter, k, prev.k->k.p)); -+ if (ret) -+ goto err; -+ } -+ } -+ bkey_on_stack_reassemble(&prev, c, k); -+ -+ ret = walk_inode(&trans, &w, k.k->p.inode); -+ if (ret) -+ break; -+ -+ if (fsck_err_on(!w.have_inode, c, -+ "extent type %u for missing inode %llu", -+ k.k->type, k.k->p.inode) || -+ fsck_err_on(w.have_inode && -+ !S_ISREG(w.inode.bi_mode) && !S_ISLNK(w.inode.bi_mode), c, -+ "extent type %u for non regular file, inode %llu mode %o", -+ k.k->type, k.k->p.inode, w.inode.bi_mode)) { -+ bch2_trans_unlock(&trans); -+ -+ ret = bch2_inode_truncate(c, k.k->p.inode, 0); -+ if (ret) -+ goto err; -+ continue; -+ } -+ -+ if (fsck_err_on(w.first_this_inode && -+ w.have_inode && -+ !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) && -+ w.inode.bi_sectors != -+ (i_sectors = bch2_count_inode_sectors(&trans, w.cur_inum)), -+ c, "inode %llu has incorrect i_sectors: got %llu, should be %llu", -+ w.inode.bi_inum, -+ w.inode.bi_sectors, i_sectors)) { -+ struct bkey_inode_buf p; -+ -+ w.inode.bi_sectors = i_sectors; -+ -+ bch2_trans_unlock(&trans); -+ -+ bch2_inode_pack(&p, &w.inode); -+ -+ ret = 
bch2_btree_insert(c, BTREE_ID_INODES, -+ &p.inode.k_i, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW); -+ if (ret) { -+ bch_err(c, "error in fsck: error %i updating inode", ret); -+ goto err; -+ } -+ -+ /* revalidate iterator: */ -+ k = bch2_btree_iter_peek(iter); -+ } -+ -+ if (fsck_err_on(w.have_inode && -+ !(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && -+ k.k->type != KEY_TYPE_reservation && -+ k.k->p.offset > round_up(w.inode.bi_size, block_bytes(c)) >> 9, c, -+ "extent type %u offset %llu past end of inode %llu, i_size %llu", -+ k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) { -+ bch2_trans_unlock(&trans); -+ -+ ret = bch2_inode_truncate(c, k.k->p.inode, -+ w.inode.bi_size); -+ if (ret) -+ goto err; -+ continue; -+ } -+ } -+err: -+fsck_err: -+ if (ret == -EINTR) -+ goto retry; -+ bkey_on_stack_exit(&prev, c); -+ return bch2_trans_exit(&trans) ?: ret; -+} -+ -+/* -+ * Walk dirents: verify that they all have a corresponding S_ISDIR inode, -+ * validate d_type -+ */ -+noinline_for_stack -+static int check_dirents(struct bch_fs *c) -+{ -+ struct inode_walker w = inode_walker_init(); -+ struct hash_check h; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ unsigned name_len; -+ char buf[200]; -+ int ret = 0; -+ -+ bch_verbose(c, "checking dirents"); -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ hash_check_init(&h); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, -+ POS(BCACHEFS_ROOT_INO, 0), 0); -+retry: -+ for_each_btree_key_continue(iter, 0, k, ret) { -+ struct bkey_s_c_dirent d; -+ struct bch_inode_unpacked target; -+ bool have_target; -+ u64 d_inum; -+ -+ ret = walk_inode(&trans, &w, k.k->p.inode); -+ if (ret) -+ break; -+ -+ if (fsck_err_on(!w.have_inode, c, -+ "dirent in nonexisting directory:\n%s", -+ (bch2_bkey_val_to_text(&PBUF(buf), c, -+ k), buf)) || -+ fsck_err_on(!S_ISDIR(w.inode.bi_mode), c, -+ "dirent in non directory inode type %u:\n%s", -+ 
mode_to_type(w.inode.bi_mode), -+ (bch2_bkey_val_to_text(&PBUF(buf), c, -+ k), buf))) { -+ ret = bch2_btree_delete_at(&trans, iter, 0); -+ if (ret) -+ goto err; -+ continue; -+ } -+ -+ if (w.first_this_inode && w.have_inode) -+ hash_check_set_inode(&trans, &h, &w.inode); -+ -+ ret = check_dirent_hash(&trans, &h, iter, &k); -+ if (ret > 0) { -+ ret = 0; -+ continue; -+ } -+ if (ret) -+ goto fsck_err; -+ -+ if (ret) -+ goto fsck_err; -+ -+ if (k.k->type != KEY_TYPE_dirent) -+ continue; -+ -+ d = bkey_s_c_to_dirent(k); -+ d_inum = le64_to_cpu(d.v->d_inum); -+ -+ name_len = bch2_dirent_name_bytes(d); -+ -+ if (fsck_err_on(!name_len, c, "empty dirent") || -+ fsck_err_on(name_len == 1 && -+ !memcmp(d.v->d_name, ".", 1), c, -+ ". dirent") || -+ fsck_err_on(name_len == 2 && -+ !memcmp(d.v->d_name, "..", 2), c, -+ ".. dirent") || -+ fsck_err_on(name_len == 2 && -+ !memcmp(d.v->d_name, "..", 2), c, -+ ".. dirent") || -+ fsck_err_on(memchr(d.v->d_name, '/', name_len), c, -+ "dirent name has invalid chars")) { -+ ret = remove_dirent(&trans, d); -+ if (ret) -+ goto err; -+ continue; -+ } -+ -+ if (fsck_err_on(d_inum == d.k->p.inode, c, -+ "dirent points to own directory:\n%s", -+ (bch2_bkey_val_to_text(&PBUF(buf), c, -+ k), buf))) { -+ ret = remove_dirent(&trans, d); -+ if (ret) -+ goto err; -+ continue; -+ } -+ -+ ret = bch2_inode_find_by_inum_trans(&trans, d_inum, &target); -+ if (ret && ret != -ENOENT) -+ break; -+ -+ have_target = !ret; -+ ret = 0; -+ -+ if (fsck_err_on(!have_target, c, -+ "dirent points to missing inode:\n%s", -+ (bch2_bkey_val_to_text(&PBUF(buf), c, -+ k), buf))) { -+ ret = remove_dirent(&trans, d); -+ if (ret) -+ goto err; -+ continue; -+ } -+ -+ if (fsck_err_on(have_target && -+ d.v->d_type != -+ mode_to_type(target.bi_mode), c, -+ "incorrect d_type: should be %u:\n%s", -+ mode_to_type(target.bi_mode), -+ (bch2_bkey_val_to_text(&PBUF(buf), c, -+ k), buf))) { -+ struct bkey_i_dirent *n; -+ -+ n = kmalloc(bkey_bytes(d.k), GFP_KERNEL); -+ if (!n) { -+ ret 
= -ENOMEM; -+ goto err; -+ } -+ -+ bkey_reassemble(&n->k_i, d.s_c); -+ n->v.d_type = mode_to_type(target.bi_mode); -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW, -+ (bch2_trans_update(&trans, iter, &n->k_i, 0), 0)); -+ kfree(n); -+ if (ret) -+ goto err; -+ -+ } -+ } -+ -+ hash_stop_chain(&trans, &h); -+err: -+fsck_err: -+ if (ret == -EINTR) -+ goto retry; -+ -+ return bch2_trans_exit(&trans) ?: ret; -+} -+ -+/* -+ * Walk xattrs: verify that they all have a corresponding inode -+ */ -+noinline_for_stack -+static int check_xattrs(struct bch_fs *c) -+{ -+ struct inode_walker w = inode_walker_init(); -+ struct hash_check h; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ bch_verbose(c, "checking xattrs"); -+ -+ hash_check_init(&h); -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, -+ POS(BCACHEFS_ROOT_INO, 0), 0); -+retry: -+ for_each_btree_key_continue(iter, 0, k, ret) { -+ ret = walk_inode(&trans, &w, k.k->p.inode); -+ if (ret) -+ break; -+ -+ if (fsck_err_on(!w.have_inode, c, -+ "xattr for missing inode %llu", -+ k.k->p.inode)) { -+ ret = bch2_btree_delete_at(&trans, iter, 0); -+ if (ret) -+ goto err; -+ continue; -+ } -+ -+ if (w.first_this_inode && w.have_inode) -+ hash_check_set_inode(&trans, &h, &w.inode); -+ -+ ret = hash_check_key(&trans, bch2_xattr_hash_desc, -+ &h, iter, k); -+ if (ret) -+ goto fsck_err; -+ } -+err: -+fsck_err: -+ if (ret == -EINTR) -+ goto retry; -+ return bch2_trans_exit(&trans) ?: ret; -+} -+ -+/* Get root directory, create if it doesn't exist: */ -+static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode) -+{ -+ struct bkey_inode_buf packed; -+ int ret; -+ -+ bch_verbose(c, "checking root directory"); -+ -+ ret = bch2_inode_find_by_inum(c, BCACHEFS_ROOT_INO, root_inode); -+ if (ret && ret != -ENOENT) -+ return ret; -+ -+ if (fsck_err_on(ret, c, "root 
directory missing")) -+ goto create_root; -+ -+ if (fsck_err_on(!S_ISDIR(root_inode->bi_mode), c, -+ "root inode not a directory")) -+ goto create_root; -+ -+ return 0; -+fsck_err: -+ return ret; -+create_root: -+ bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|0755, -+ 0, NULL); -+ root_inode->bi_inum = BCACHEFS_ROOT_INO; -+ -+ bch2_inode_pack(&packed, root_inode); -+ -+ return bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, -+ NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW); -+} -+ -+/* Get lost+found, create if it doesn't exist: */ -+static int check_lostfound(struct bch_fs *c, -+ struct bch_inode_unpacked *root_inode, -+ struct bch_inode_unpacked *lostfound_inode) -+{ -+ struct qstr lostfound = QSTR("lost+found"); -+ struct bch_hash_info root_hash_info = -+ bch2_hash_info_init(c, root_inode); -+ u64 inum; -+ int ret; -+ -+ bch_verbose(c, "checking lost+found"); -+ -+ inum = bch2_dirent_lookup(c, BCACHEFS_ROOT_INO, &root_hash_info, -+ &lostfound); -+ if (!inum) { -+ bch_notice(c, "creating lost+found"); -+ goto create_lostfound; -+ } -+ -+ ret = bch2_inode_find_by_inum(c, inum, lostfound_inode); -+ if (ret && ret != -ENOENT) -+ return ret; -+ -+ if (fsck_err_on(ret, c, "lost+found missing")) -+ goto create_lostfound; -+ -+ if (fsck_err_on(!S_ISDIR(lostfound_inode->bi_mode), c, -+ "lost+found inode not a directory")) -+ goto create_lostfound; -+ -+ return 0; -+fsck_err: -+ return ret; -+create_lostfound: -+ bch2_inode_init_early(c, lostfound_inode); -+ -+ ret = bch2_trans_do(c, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW, -+ bch2_create_trans(&trans, -+ BCACHEFS_ROOT_INO, root_inode, -+ lostfound_inode, &lostfound, -+ 0, 0, S_IFDIR|0700, 0, NULL, NULL)); -+ if (ret) -+ bch_err(c, "error creating lost+found: %i", ret); -+ -+ return ret; -+} -+ -+struct inode_bitmap { -+ unsigned long *bits; -+ size_t size; -+}; -+ -+static inline bool inode_bitmap_test(struct inode_bitmap *b, size_t nr) -+{ -+ return nr < b->size ? 
test_bit(nr, b->bits) : false; -+} -+ -+static inline int inode_bitmap_set(struct inode_bitmap *b, size_t nr) -+{ -+ if (nr >= b->size) { -+ size_t new_size = max_t(size_t, max_t(size_t, -+ PAGE_SIZE * 8, -+ b->size * 2), -+ nr + 1); -+ void *n; -+ -+ new_size = roundup_pow_of_two(new_size); -+ n = krealloc(b->bits, new_size / 8, GFP_KERNEL|__GFP_ZERO); -+ if (!n) { -+ return -ENOMEM; -+ } -+ -+ b->bits = n; -+ b->size = new_size; -+ } -+ -+ __set_bit(nr, b->bits); -+ return 0; -+} -+ -+struct pathbuf { -+ size_t nr; -+ size_t size; -+ -+ struct pathbuf_entry { -+ u64 inum; -+ u64 offset; -+ } *entries; -+}; -+ -+static int path_down(struct pathbuf *p, u64 inum) -+{ -+ if (p->nr == p->size) { -+ size_t new_size = max_t(size_t, 256UL, p->size * 2); -+ void *n = krealloc(p->entries, -+ new_size * sizeof(p->entries[0]), -+ GFP_KERNEL); -+ if (!n) -+ return -ENOMEM; -+ -+ p->entries = n; -+ p->size = new_size; -+ }; -+ -+ p->entries[p->nr++] = (struct pathbuf_entry) { -+ .inum = inum, -+ .offset = 0, -+ }; -+ return 0; -+} -+ -+noinline_for_stack -+static int check_directory_structure(struct bch_fs *c, -+ struct bch_inode_unpacked *lostfound_inode) -+{ -+ struct inode_bitmap dirs_done = { NULL, 0 }; -+ struct pathbuf path = { 0, 0, NULL }; -+ struct pathbuf_entry *e; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct bkey_s_c_dirent dirent; -+ bool had_unreachable; -+ u64 d_inum; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ bch_verbose(c, "checking directory structure"); -+ -+ /* DFS: */ -+restart_dfs: -+ had_unreachable = false; -+ -+ ret = inode_bitmap_set(&dirs_done, BCACHEFS_ROOT_INO); -+ if (ret) { -+ bch_err(c, "memory allocation failure in inode_bitmap_set()"); -+ goto err; -+ } -+ -+ ret = path_down(&path, BCACHEFS_ROOT_INO); -+ if (ret) -+ goto err; -+ -+ while (path.nr) { -+next: -+ e = &path.entries[path.nr - 1]; -+ -+ if (e->offset == U64_MAX) -+ goto up; -+ -+ for_each_btree_key(&trans, 
iter, BTREE_ID_DIRENTS, -+ POS(e->inum, e->offset + 1), 0, k, ret) { -+ if (k.k->p.inode != e->inum) -+ break; -+ -+ e->offset = k.k->p.offset; -+ -+ if (k.k->type != KEY_TYPE_dirent) -+ continue; -+ -+ dirent = bkey_s_c_to_dirent(k); -+ -+ if (dirent.v->d_type != DT_DIR) -+ continue; -+ -+ d_inum = le64_to_cpu(dirent.v->d_inum); -+ -+ if (fsck_err_on(inode_bitmap_test(&dirs_done, d_inum), c, -+ "directory %llu has multiple hardlinks", -+ d_inum)) { -+ ret = remove_dirent(&trans, dirent); -+ if (ret) -+ goto err; -+ continue; -+ } -+ -+ ret = inode_bitmap_set(&dirs_done, d_inum); -+ if (ret) { -+ bch_err(c, "memory allocation failure in inode_bitmap_set()"); -+ goto err; -+ } -+ -+ ret = path_down(&path, d_inum); -+ if (ret) { -+ goto err; -+ } -+ -+ ret = bch2_trans_iter_free(&trans, iter); -+ if (ret) { -+ bch_err(c, "btree error %i in fsck", ret); -+ goto err; -+ } -+ goto next; -+ } -+ ret = bch2_trans_iter_free(&trans, iter) ?: ret; -+ if (ret) { -+ bch_err(c, "btree error %i in fsck", ret); -+ goto err; -+ } -+up: -+ path.nr--; -+ } -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS_MIN, 0); -+retry: -+ for_each_btree_key_continue(iter, 0, k, ret) { -+ if (k.k->type != KEY_TYPE_inode) -+ continue; -+ -+ if (!S_ISDIR(le16_to_cpu(bkey_s_c_to_inode(k).v->bi_mode))) -+ continue; -+ -+ ret = bch2_empty_dir_trans(&trans, k.k->p.inode); -+ if (ret == -EINTR) -+ goto retry; -+ if (!ret) -+ continue; -+ -+ if (fsck_err_on(!inode_bitmap_test(&dirs_done, k.k->p.offset), c, -+ "unreachable directory found (inum %llu)", -+ k.k->p.offset)) { -+ bch2_trans_unlock(&trans); -+ -+ ret = reattach_inode(c, lostfound_inode, k.k->p.offset); -+ if (ret) { -+ goto err; -+ } -+ -+ had_unreachable = true; -+ } -+ } -+ bch2_trans_iter_free(&trans, iter); -+ if (ret) -+ goto err; -+ -+ if (had_unreachable) { -+ bch_info(c, "reattached unreachable directories, restarting pass to check for loops"); -+ kfree(dirs_done.bits); -+ kfree(path.entries); -+ memset(&dirs_done, 0, 
sizeof(dirs_done)); -+ memset(&path, 0, sizeof(path)); -+ goto restart_dfs; -+ } -+err: -+fsck_err: -+ ret = bch2_trans_exit(&trans) ?: ret; -+ kfree(dirs_done.bits); -+ kfree(path.entries); -+ return ret; -+} -+ -+struct nlink { -+ u32 count; -+ u32 dir_count; -+}; -+ -+typedef GENRADIX(struct nlink) nlink_table; -+ -+static void inc_link(struct bch_fs *c, nlink_table *links, -+ u64 range_start, u64 *range_end, -+ u64 inum, bool dir) -+{ -+ struct nlink *link; -+ -+ if (inum < range_start || inum >= *range_end) -+ return; -+ -+ link = genradix_ptr_alloc(links, inum - range_start, GFP_KERNEL); -+ if (!link) { -+ bch_verbose(c, "allocation failed during fsck - will need another pass"); -+ *range_end = inum; -+ return; -+ } -+ -+ if (dir) -+ link->dir_count++; -+ else -+ link->count++; -+} -+ -+noinline_for_stack -+static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, -+ u64 range_start, u64 *range_end) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct bkey_s_c_dirent d; -+ u64 d_inum; -+ int ret; -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ inc_link(c, links, range_start, range_end, BCACHEFS_ROOT_INO, false); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, 0, k, ret) { -+ switch (k.k->type) { -+ case KEY_TYPE_dirent: -+ d = bkey_s_c_to_dirent(k); -+ d_inum = le64_to_cpu(d.v->d_inum); -+ -+ if (d.v->d_type == DT_DIR) -+ inc_link(c, links, range_start, range_end, -+ d.k->p.inode, true); -+ -+ inc_link(c, links, range_start, range_end, -+ d_inum, false); -+ -+ break; -+ } -+ -+ bch2_trans_cond_resched(&trans); -+ } -+ ret = bch2_trans_exit(&trans) ?: ret; -+ if (ret) -+ bch_err(c, "error in fsck: btree error %i while walking dirents", ret); -+ -+ return ret; -+} -+ -+static int check_inode_nlink(struct bch_fs *c, -+ struct bch_inode_unpacked *lostfound_inode, -+ struct bch_inode_unpacked *u, -+ struct nlink *link, -+ bool *do_update) -+{ -+ u32 i_nlink = bch2_inode_nlink_get(u); 
-+ u32 real_i_nlink = -+ link->count * nlink_bias(u->bi_mode) + -+ link->dir_count; -+ int ret = 0; -+ -+ /* -+ * These should have been caught/fixed by earlier passes, we don't -+ * repair them here: -+ */ -+ if (S_ISDIR(u->bi_mode) && link->count > 1) { -+ need_fsck_err(c, "directory %llu with multiple hardlinks: %u", -+ u->bi_inum, link->count); -+ return 0; -+ } -+ -+ if (S_ISDIR(u->bi_mode) && !link->count) { -+ need_fsck_err(c, "unreachable directory found (inum %llu)", -+ u->bi_inum); -+ return 0; -+ } -+ -+ if (!S_ISDIR(u->bi_mode) && link->dir_count) { -+ need_fsck_err(c, "non directory with subdirectories (inum %llu)", -+ u->bi_inum); -+ return 0; -+ } -+ -+ if (!link->count && -+ !(u->bi_flags & BCH_INODE_UNLINKED) && -+ (c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { -+ if (fsck_err(c, "unreachable inode %llu not marked as unlinked (type %u)", -+ u->bi_inum, mode_to_type(u->bi_mode)) == -+ FSCK_ERR_IGNORE) -+ return 0; -+ -+ ret = reattach_inode(c, lostfound_inode, u->bi_inum); -+ if (ret) -+ return ret; -+ -+ link->count = 1; -+ real_i_nlink = nlink_bias(u->bi_mode) + link->dir_count; -+ goto set_i_nlink; -+ } -+ -+ if (i_nlink < link->count) { -+ if (fsck_err(c, "inode %llu i_link too small (%u < %u, type %i)", -+ u->bi_inum, i_nlink, link->count, -+ mode_to_type(u->bi_mode)) == FSCK_ERR_IGNORE) -+ return 0; -+ goto set_i_nlink; -+ } -+ -+ if (i_nlink != real_i_nlink && -+ c->sb.clean) { -+ if (fsck_err(c, "filesystem marked clean, " -+ "but inode %llu has wrong i_nlink " -+ "(type %u i_nlink %u, should be %u)", -+ u->bi_inum, mode_to_type(u->bi_mode), -+ i_nlink, real_i_nlink) == FSCK_ERR_IGNORE) -+ return 0; -+ goto set_i_nlink; -+ } -+ -+ if (i_nlink != real_i_nlink && -+ (c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { -+ if (fsck_err(c, "inode %llu has wrong i_nlink " -+ "(type %u i_nlink %u, should be %u)", -+ u->bi_inum, mode_to_type(u->bi_mode), -+ i_nlink, real_i_nlink) == FSCK_ERR_IGNORE) -+ return 0; -+ goto set_i_nlink; -+ } -+ 
-+ if (real_i_nlink && i_nlink != real_i_nlink) -+ bch_verbose(c, "setting inode %llu nlink from %u to %u", -+ u->bi_inum, i_nlink, real_i_nlink); -+set_i_nlink: -+ if (i_nlink != real_i_nlink) { -+ bch2_inode_nlink_set(u, real_i_nlink); -+ *do_update = true; -+ } -+fsck_err: -+ return ret; -+} -+ -+static int check_inode(struct btree_trans *trans, -+ struct bch_inode_unpacked *lostfound_inode, -+ struct btree_iter *iter, -+ struct bkey_s_c_inode inode, -+ struct nlink *link) -+{ -+ struct bch_fs *c = trans->c; -+ struct bch_inode_unpacked u; -+ bool do_update = false; -+ int ret = 0; -+ -+ ret = bch2_inode_unpack(inode, &u); -+ -+ bch2_trans_unlock(trans); -+ -+ if (bch2_fs_inconsistent_on(ret, c, -+ "error unpacking inode %llu in fsck", -+ inode.k->p.inode)) -+ return ret; -+ -+ if (link) { -+ ret = check_inode_nlink(c, lostfound_inode, &u, link, -+ &do_update); -+ if (ret) -+ return ret; -+ } -+ -+ if (u.bi_flags & BCH_INODE_UNLINKED && -+ (!c->sb.clean || -+ fsck_err(c, "filesystem marked clean, but inode %llu unlinked", -+ u.bi_inum))) { -+ bch_verbose(c, "deleting inode %llu", u.bi_inum); -+ -+ bch2_fs_lazy_rw(c); -+ -+ ret = bch2_inode_rm(c, u.bi_inum); -+ if (ret) -+ bch_err(c, "error in fsck: error %i while deleting inode", ret); -+ return ret; -+ } -+ -+ if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY && -+ (!c->sb.clean || -+ fsck_err(c, "filesystem marked clean, but inode %llu has i_size dirty", -+ u.bi_inum))) { -+ bch_verbose(c, "truncating inode %llu", u.bi_inum); -+ -+ bch2_fs_lazy_rw(c); -+ -+ /* -+ * XXX: need to truncate partial blocks too here - or ideally -+ * just switch units to bytes and that issue goes away -+ */ -+ -+ ret = bch2_inode_truncate(c, u.bi_inum, u.bi_size); -+ if (ret) { -+ bch_err(c, "error in fsck: error %i truncating inode", ret); -+ return ret; -+ } -+ -+ /* -+ * We truncated without our normal sector accounting hook, just -+ * make sure we recalculate it: -+ */ -+ u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY; -+ -+ u.bi_flags &= 
~BCH_INODE_I_SIZE_DIRTY; -+ do_update = true; -+ } -+ -+ if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY && -+ (!c->sb.clean || -+ fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty", -+ u.bi_inum))) { -+ s64 sectors; -+ -+ bch_verbose(c, "recounting sectors for inode %llu", -+ u.bi_inum); -+ -+ sectors = bch2_count_inode_sectors(trans, u.bi_inum); -+ if (sectors < 0) { -+ bch_err(c, "error in fsck: error %i recounting inode sectors", -+ (int) sectors); -+ return sectors; -+ } -+ -+ u.bi_sectors = sectors; -+ u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY; -+ do_update = true; -+ } -+ -+ if (do_update) { -+ struct bkey_inode_buf p; -+ -+ bch2_inode_pack(&p, &u); -+ -+ ret = __bch2_trans_do(trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW, -+ (bch2_trans_update(trans, iter, &p.inode.k_i, 0), 0)); -+ if (ret) -+ bch_err(c, "error in fsck: error %i " -+ "updating inode", ret); -+ } -+fsck_err: -+ return ret; -+} -+ -+noinline_for_stack -+static int bch2_gc_walk_inodes(struct bch_fs *c, -+ struct bch_inode_unpacked *lostfound_inode, -+ nlink_table *links, -+ u64 range_start, u64 range_end) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct nlink *link, zero_links = { 0, 0 }; -+ struct genradix_iter nlinks_iter; -+ int ret = 0, ret2 = 0; -+ u64 nlinks_pos; -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, -+ POS(0, range_start), 0); -+ nlinks_iter = genradix_iter_init(links, 0); -+ -+ while ((k = bch2_btree_iter_peek(iter)).k && -+ !(ret2 = bkey_err(k))) { -+peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); -+ -+ if (!link && (!k.k || iter->pos.offset >= range_end)) -+ break; -+ -+ nlinks_pos = range_start + nlinks_iter.pos; -+ if (iter->pos.offset > nlinks_pos) { -+ /* Should have been caught by dirents pass: */ -+ need_fsck_err_on(link && link->count, c, -+ "missing inode %llu (nlink %u)", -+ nlinks_pos, link->count); -+ 
genradix_iter_advance(&nlinks_iter, links); -+ goto peek_nlinks; -+ } -+ -+ if (iter->pos.offset < nlinks_pos || !link) -+ link = &zero_links; -+ -+ if (k.k && k.k->type == KEY_TYPE_inode) { -+ ret = check_inode(&trans, lostfound_inode, iter, -+ bkey_s_c_to_inode(k), link); -+ BUG_ON(ret == -EINTR); -+ if (ret) -+ break; -+ } else { -+ /* Should have been caught by dirents pass: */ -+ need_fsck_err_on(link->count, c, -+ "missing inode %llu (nlink %u)", -+ nlinks_pos, link->count); -+ } -+ -+ if (nlinks_pos == iter->pos.offset) -+ genradix_iter_advance(&nlinks_iter, links); -+ -+ bch2_btree_iter_next(iter); -+ bch2_trans_cond_resched(&trans); -+ } -+fsck_err: -+ bch2_trans_exit(&trans); -+ -+ if (ret2) -+ bch_err(c, "error in fsck: btree error %i while walking inodes", ret2); -+ -+ return ret ?: ret2; -+} -+ -+noinline_for_stack -+static int check_inode_nlinks(struct bch_fs *c, -+ struct bch_inode_unpacked *lostfound_inode) -+{ -+ nlink_table links; -+ u64 this_iter_range_start, next_iter_range_start = 0; -+ int ret = 0; -+ -+ bch_verbose(c, "checking inode nlinks"); -+ -+ genradix_init(&links); -+ -+ do { -+ this_iter_range_start = next_iter_range_start; -+ next_iter_range_start = U64_MAX; -+ -+ ret = bch2_gc_walk_dirents(c, &links, -+ this_iter_range_start, -+ &next_iter_range_start); -+ if (ret) -+ break; -+ -+ ret = bch2_gc_walk_inodes(c, lostfound_inode, &links, -+ this_iter_range_start, -+ next_iter_range_start); -+ if (ret) -+ break; -+ -+ genradix_free(&links); -+ } while (next_iter_range_start != U64_MAX); -+ -+ genradix_free(&links); -+ -+ return ret; -+} -+ -+/* -+ * Checks for inconsistencies that shouldn't happen, unless we have a bug. 
-+ * Doesn't fix them yet, mainly because they haven't yet been observed: -+ */ -+int bch2_fsck_full(struct bch_fs *c) -+{ -+ struct bch_inode_unpacked root_inode, lostfound_inode; -+ -+ return check_extents(c) ?: -+ check_dirents(c) ?: -+ check_xattrs(c) ?: -+ check_root(c, &root_inode) ?: -+ check_lostfound(c, &root_inode, &lostfound_inode) ?: -+ check_directory_structure(c, &lostfound_inode) ?: -+ check_inode_nlinks(c, &lostfound_inode); -+} -+ -+int bch2_fsck_inode_nlink(struct bch_fs *c) -+{ -+ struct bch_inode_unpacked root_inode, lostfound_inode; -+ -+ return check_root(c, &root_inode) ?: -+ check_lostfound(c, &root_inode, &lostfound_inode) ?: -+ check_inode_nlinks(c, &lostfound_inode); -+} -+ -+int bch2_fsck_walk_inodes_only(struct bch_fs *c) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct bkey_s_c_inode inode; -+ int ret; -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN, 0, k, ret) { -+ if (k.k->type != KEY_TYPE_inode) -+ continue; -+ -+ inode = bkey_s_c_to_inode(k); -+ -+ if (inode.v->bi_flags & -+ (BCH_INODE_I_SIZE_DIRTY| -+ BCH_INODE_I_SECTORS_DIRTY| -+ BCH_INODE_UNLINKED)) { -+ ret = check_inode(&trans, NULL, iter, inode, NULL); -+ BUG_ON(ret == -EINTR); -+ if (ret) -+ break; -+ } -+ } -+ BUG_ON(ret == -EINTR); -+ -+ return bch2_trans_exit(&trans) ?: ret; -+} -diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h -new file mode 100644 -index 000000000000..9e4af02bde1e ---- /dev/null -+++ b/fs/bcachefs/fsck.h -@@ -0,0 +1,9 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_FSCK_H -+#define _BCACHEFS_FSCK_H -+ -+int bch2_fsck_full(struct bch_fs *); -+int bch2_fsck_inode_nlink(struct bch_fs *); -+int bch2_fsck_walk_inodes_only(struct bch_fs *); -+ -+#endif /* _BCACHEFS_FSCK_H */ -diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c -new file mode 100644 -index 000000000000..7d20f082ad45 ---- /dev/null -+++ b/fs/bcachefs/inode.c 
-@@ -0,0 +1,554 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "bkey_methods.h" -+#include "btree_update.h" -+#include "error.h" -+#include "extents.h" -+#include "inode.h" -+#include "str_hash.h" -+ -+#include -+ -+#include -+ -+const char * const bch2_inode_opts[] = { -+#define x(name, ...) #name, -+ BCH_INODE_OPTS() -+#undef x -+ NULL, -+}; -+ -+static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; -+static const u8 bits_table[8] = { -+ 1 * 8 - 1, -+ 2 * 8 - 2, -+ 3 * 8 - 3, -+ 4 * 8 - 4, -+ 6 * 8 - 5, -+ 8 * 8 - 6, -+ 10 * 8 - 7, -+ 13 * 8 - 8, -+}; -+ -+static int inode_encode_field(u8 *out, u8 *end, u64 hi, u64 lo) -+{ -+ __be64 in[2] = { cpu_to_be64(hi), cpu_to_be64(lo), }; -+ unsigned shift, bytes, bits = likely(!hi) -+ ? fls64(lo) -+ : fls64(hi) + 64; -+ -+ for (shift = 1; shift <= 8; shift++) -+ if (bits < bits_table[shift - 1]) -+ goto got_shift; -+ -+ BUG(); -+got_shift: -+ bytes = byte_table[shift - 1]; -+ -+ BUG_ON(out + bytes > end); -+ -+ memcpy(out, (u8 *) in + 16 - bytes, bytes); -+ *out |= (1 << 8) >> shift; -+ -+ return bytes; -+} -+ -+static int inode_decode_field(const u8 *in, const u8 *end, -+ u64 out[2], unsigned *out_bits) -+{ -+ __be64 be[2] = { 0, 0 }; -+ unsigned bytes, shift; -+ u8 *p; -+ -+ if (in >= end) -+ return -1; -+ -+ if (!*in) -+ return -1; -+ -+ /* -+ * position of highest set bit indicates number of bytes: -+ * shift = number of bits to remove in high byte: -+ */ -+ shift = 8 - __fls(*in); /* 1 <= shift <= 8 */ -+ bytes = byte_table[shift - 1]; -+ -+ if (in + bytes > end) -+ return -1; -+ -+ p = (u8 *) be + 16 - bytes; -+ memcpy(p, in, bytes); -+ *p ^= (1 << 8) >> shift; -+ -+ out[0] = be64_to_cpu(be[0]); -+ out[1] = be64_to_cpu(be[1]); -+ *out_bits = out[0] ? 
64 + fls64(out[0]) : fls64(out[1]); -+ -+ return bytes; -+} -+ -+void bch2_inode_pack(struct bkey_inode_buf *packed, -+ const struct bch_inode_unpacked *inode) -+{ -+ u8 *out = packed->inode.v.fields; -+ u8 *end = (void *) &packed[1]; -+ u8 *last_nonzero_field = out; -+ unsigned nr_fields = 0, last_nonzero_fieldnr = 0; -+ unsigned bytes; -+ -+ bkey_inode_init(&packed->inode.k_i); -+ packed->inode.k.p.offset = inode->bi_inum; -+ packed->inode.v.bi_hash_seed = inode->bi_hash_seed; -+ packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags); -+ packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode); -+ -+#define x(_name, _bits) \ -+ out += inode_encode_field(out, end, 0, inode->_name); \ -+ nr_fields++; \ -+ \ -+ if (inode->_name) { \ -+ last_nonzero_field = out; \ -+ last_nonzero_fieldnr = nr_fields; \ -+ } -+ -+ BCH_INODE_FIELDS() -+#undef x -+ -+ out = last_nonzero_field; -+ nr_fields = last_nonzero_fieldnr; -+ -+ bytes = out - (u8 *) &packed->inode.v; -+ set_bkey_val_bytes(&packed->inode.k, bytes); -+ memset_u64s_tail(&packed->inode.v, 0, bytes); -+ -+ SET_INODE_NR_FIELDS(&packed->inode.v, nr_fields); -+ -+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { -+ struct bch_inode_unpacked unpacked; -+ -+ int ret = bch2_inode_unpack(inode_i_to_s_c(&packed->inode), -+ &unpacked); -+ BUG_ON(ret); -+ BUG_ON(unpacked.bi_inum != inode->bi_inum); -+ BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed); -+ BUG_ON(unpacked.bi_mode != inode->bi_mode); -+ -+#define x(_name, _bits) BUG_ON(unpacked._name != inode->_name); -+ BCH_INODE_FIELDS() -+#undef x -+ } -+} -+ -+int bch2_inode_unpack(struct bkey_s_c_inode inode, -+ struct bch_inode_unpacked *unpacked) -+{ -+ const u8 *in = inode.v->fields; -+ const u8 *end = (void *) inode.v + bkey_val_bytes(inode.k); -+ u64 field[2]; -+ unsigned fieldnr = 0, field_bits; -+ int ret; -+ -+ unpacked->bi_inum = inode.k->p.offset; -+ unpacked->bi_hash_seed = inode.v->bi_hash_seed; -+ unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); -+ unpacked->bi_mode 
= le16_to_cpu(inode.v->bi_mode); -+ -+#define x(_name, _bits) \ -+ if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \ -+ memset(&unpacked->_name, 0, \ -+ sizeof(*unpacked) - \ -+ offsetof(struct bch_inode_unpacked, _name)); \ -+ return 0; \ -+ } \ -+ \ -+ ret = inode_decode_field(in, end, field, &field_bits); \ -+ if (ret < 0) \ -+ return ret; \ -+ \ -+ if (field_bits > sizeof(unpacked->_name) * 8) \ -+ return -1; \ -+ \ -+ unpacked->_name = field[1]; \ -+ in += ret; -+ -+ BCH_INODE_FIELDS() -+#undef x -+ -+ /* XXX: signal if there were more fields than expected? */ -+ -+ return 0; -+} -+ -+struct btree_iter *bch2_inode_peek(struct btree_trans *trans, -+ struct bch_inode_unpacked *inode, -+ u64 inum, unsigned flags) -+{ -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(0, inum), -+ BTREE_ITER_SLOTS|flags); -+ if (IS_ERR(iter)) -+ return iter; -+ -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ ret = k.k->type == KEY_TYPE_inode ? 
0 : -EIO; -+ if (ret) -+ goto err; -+ -+ ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode); -+ if (ret) -+ goto err; -+ -+ return iter; -+err: -+ bch2_trans_iter_put(trans, iter); -+ return ERR_PTR(ret); -+} -+ -+int bch2_inode_write(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bch_inode_unpacked *inode) -+{ -+ struct bkey_inode_buf *inode_p; -+ -+ inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); -+ if (IS_ERR(inode_p)) -+ return PTR_ERR(inode_p); -+ -+ bch2_inode_pack(inode_p, inode); -+ bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); -+ return 0; -+} -+ -+const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); -+ struct bch_inode_unpacked unpacked; -+ -+ if (k.k->p.inode) -+ return "nonzero k.p.inode"; -+ -+ if (bkey_val_bytes(k.k) < sizeof(struct bch_inode)) -+ return "incorrect value size"; -+ -+ if (k.k->p.offset < BLOCKDEV_INODE_MAX) -+ return "fs inode in blockdev range"; -+ -+ if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) -+ return "invalid str hash type"; -+ -+ if (bch2_inode_unpack(inode, &unpacked)) -+ return "invalid variable length fields"; -+ -+ if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) -+ return "invalid data checksum type"; -+ -+ if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) -+ return "invalid data checksum type"; -+ -+ if ((unpacked.bi_flags & BCH_INODE_UNLINKED) && -+ unpacked.bi_nlink != 0) -+ return "flagged as unlinked but bi_nlink != 0"; -+ -+ return NULL; -+} -+ -+void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); -+ struct bch_inode_unpacked unpacked; -+ -+ if (bch2_inode_unpack(inode, &unpacked)) { -+ pr_buf(out, "(unpack error)"); -+ return; -+ } -+ -+#define x(_name, _bits) \ -+ pr_buf(out, #_name ": %llu ", (u64) unpacked._name); -+ BCH_INODE_FIELDS() -+#undef x -+} -+ -+const char 
*bch2_inode_generation_invalid(const struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ if (k.k->p.inode) -+ return "nonzero k.p.inode"; -+ -+ if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation)) -+ return "incorrect value size"; -+ -+ return NULL; -+} -+ -+void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_inode_generation gen = bkey_s_c_to_inode_generation(k); -+ -+ pr_buf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation)); -+} -+ -+void bch2_inode_init_early(struct bch_fs *c, -+ struct bch_inode_unpacked *inode_u) -+{ -+ enum bch_str_hash_type str_hash = -+ bch2_str_hash_opt_to_type(c, c->opts.str_hash); -+ -+ memset(inode_u, 0, sizeof(*inode_u)); -+ -+ /* ick */ -+ inode_u->bi_flags |= str_hash << INODE_STR_HASH_OFFSET; -+ get_random_bytes(&inode_u->bi_hash_seed, -+ sizeof(inode_u->bi_hash_seed)); -+} -+ -+void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now, -+ uid_t uid, gid_t gid, umode_t mode, dev_t rdev, -+ struct bch_inode_unpacked *parent) -+{ -+ inode_u->bi_mode = mode; -+ inode_u->bi_uid = uid; -+ inode_u->bi_gid = gid; -+ inode_u->bi_dev = rdev; -+ inode_u->bi_atime = now; -+ inode_u->bi_mtime = now; -+ inode_u->bi_ctime = now; -+ inode_u->bi_otime = now; -+ -+ if (parent && parent->bi_mode & S_ISGID) { -+ inode_u->bi_gid = parent->bi_gid; -+ if (S_ISDIR(mode)) -+ inode_u->bi_mode |= S_ISGID; -+ } -+ -+ if (parent) { -+#define x(_name, ...) 
inode_u->bi_##_name = parent->bi_##_name; -+ BCH_INODE_OPTS() -+#undef x -+ } -+} -+ -+void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, -+ uid_t uid, gid_t gid, umode_t mode, dev_t rdev, -+ struct bch_inode_unpacked *parent) -+{ -+ bch2_inode_init_early(c, inode_u); -+ bch2_inode_init_late(inode_u, bch2_current_time(c), -+ uid, gid, mode, rdev, parent); -+} -+ -+static inline u32 bkey_generation(struct bkey_s_c k) -+{ -+ switch (k.k->type) { -+ case KEY_TYPE_inode: -+ BUG(); -+ case KEY_TYPE_inode_generation: -+ return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation); -+ default: -+ return 0; -+ } -+} -+ -+int bch2_inode_create(struct btree_trans *trans, -+ struct bch_inode_unpacked *inode_u, -+ u64 min, u64 max, u64 *hint) -+{ -+ struct bkey_inode_buf *inode_p; -+ struct btree_iter *iter = NULL; -+ struct bkey_s_c k; -+ u64 start; -+ int ret; -+ -+ if (!max) -+ max = ULLONG_MAX; -+ -+ if (trans->c->opts.inodes_32bit) -+ max = min_t(u64, max, U32_MAX); -+ -+ start = READ_ONCE(*hint); -+ -+ if (start >= max || start < min) -+ start = min; -+ -+ inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); -+ if (IS_ERR(inode_p)) -+ return PTR_ERR(inode_p); -+again: -+ for_each_btree_key(trans, iter, BTREE_ID_INODES, POS(0, start), -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { -+ if (bkey_cmp(iter->pos, POS(0, max)) > 0) -+ break; -+ -+ if (k.k->type != KEY_TYPE_inode) -+ goto found_slot; -+ } -+ -+ bch2_trans_iter_put(trans, iter); -+ -+ if (ret) -+ return ret; -+ -+ if (start != min) { -+ /* Retry from start */ -+ start = min; -+ goto again; -+ } -+ -+ return -ENOSPC; -+found_slot: -+ *hint = k.k->p.offset; -+ inode_u->bi_inum = k.k->p.offset; -+ inode_u->bi_generation = bkey_generation(k); -+ -+ bch2_inode_pack(inode_p, inode_u); -+ bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); -+ bch2_trans_iter_put(trans, iter); -+ return 0; -+} -+ -+int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) -+{ -+ struct btree_trans trans; -+ 
struct btree_iter *iter; -+ struct bkey_i_inode_generation delete; -+ struct bpos start = POS(inode_nr, 0); -+ struct bpos end = POS(inode_nr + 1, 0); -+ int ret; -+ -+ /* -+ * If this was a directory, there shouldn't be any real dirents left - -+ * but there could be whiteouts (from hash collisions) that we should -+ * delete: -+ * -+ * XXX: the dirent could ideally would delete whiteouts when they're no -+ * longer needed -+ */ -+ ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS, -+ start, end, NULL) ?: -+ bch2_btree_delete_range(c, BTREE_ID_XATTRS, -+ start, end, NULL) ?: -+ bch2_btree_delete_range(c, BTREE_ID_DIRENTS, -+ start, end, NULL); -+ if (ret) -+ return ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr), -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); -+ do { -+ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); -+ u32 bi_generation = 0; -+ -+ ret = bkey_err(k); -+ if (ret) -+ break; -+ -+ bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, c, -+ "inode %llu not found when deleting", -+ inode_nr); -+ -+ switch (k.k->type) { -+ case KEY_TYPE_inode: { -+ struct bch_inode_unpacked inode_u; -+ -+ if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u)) -+ bi_generation = inode_u.bi_generation + 1; -+ break; -+ } -+ case KEY_TYPE_inode_generation: { -+ struct bkey_s_c_inode_generation g = -+ bkey_s_c_to_inode_generation(k); -+ bi_generation = le32_to_cpu(g.v->bi_generation); -+ break; -+ } -+ } -+ -+ if (!bi_generation) { -+ bkey_init(&delete.k); -+ delete.k.p.offset = inode_nr; -+ } else { -+ bkey_inode_generation_init(&delete.k_i); -+ delete.k.p.offset = inode_nr; -+ delete.v.bi_generation = cpu_to_le32(bi_generation); -+ } -+ -+ bch2_trans_update(&trans, iter, &delete.k_i, 0); -+ -+ ret = bch2_trans_commit(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL); -+ } while (ret == -EINTR); -+ -+ bch2_trans_exit(&trans); -+ return ret; -+} -+ -+int bch2_inode_find_by_inum_trans(struct btree_trans *trans, 
u64 inode_nr, -+ struct bch_inode_unpacked *inode) -+{ -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, -+ POS(0, inode_nr), BTREE_ITER_SLOTS); -+ if (IS_ERR(iter)) -+ return PTR_ERR(iter); -+ -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ ret = k.k->type == KEY_TYPE_inode -+ ? bch2_inode_unpack(bkey_s_c_to_inode(k), inode) -+ : -ENOENT; -+err: -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, -+ struct bch_inode_unpacked *inode) -+{ -+ return bch2_trans_do(c, NULL, NULL, 0, -+ bch2_inode_find_by_inum_trans(&trans, inode_nr, inode)); -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_inode_pack_test(void) -+{ -+ struct bch_inode_unpacked *u, test_inodes[] = { -+ { -+ .bi_atime = U64_MAX, -+ .bi_ctime = U64_MAX, -+ .bi_mtime = U64_MAX, -+ .bi_otime = U64_MAX, -+ .bi_size = U64_MAX, -+ .bi_sectors = U64_MAX, -+ .bi_uid = U32_MAX, -+ .bi_gid = U32_MAX, -+ .bi_nlink = U32_MAX, -+ .bi_generation = U32_MAX, -+ .bi_dev = U32_MAX, -+ }, -+ }; -+ -+ for (u = test_inodes; -+ u < test_inodes + ARRAY_SIZE(test_inodes); -+ u++) { -+ struct bkey_inode_buf p; -+ -+ bch2_inode_pack(&p, u); -+ } -+} -+#endif -diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h -new file mode 100644 -index 000000000000..bb759a46dc41 ---- /dev/null -+++ b/fs/bcachefs/inode.h -@@ -0,0 +1,177 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_INODE_H -+#define _BCACHEFS_INODE_H -+ -+#include "opts.h" -+ -+extern const char * const bch2_inode_opts[]; -+ -+const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c); -+void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+ -+#define bch2_bkey_ops_inode (struct bkey_ops) { \ -+ .key_invalid = bch2_inode_invalid, \ -+ .val_to_text = bch2_inode_to_text, \ -+} -+ -+const char *bch2_inode_generation_invalid(const struct 
bch_fs *, -+ struct bkey_s_c); -+void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, -+ struct bkey_s_c); -+ -+#define bch2_bkey_ops_inode_generation (struct bkey_ops) { \ -+ .key_invalid = bch2_inode_generation_invalid, \ -+ .val_to_text = bch2_inode_generation_to_text, \ -+} -+ -+struct bch_inode_unpacked { -+ u64 bi_inum; -+ __le64 bi_hash_seed; -+ u32 bi_flags; -+ u16 bi_mode; -+ -+#define x(_name, _bits) u##_bits _name; -+ BCH_INODE_FIELDS() -+#undef x -+}; -+ -+struct bkey_inode_buf { -+ struct bkey_i_inode inode; -+ -+#define x(_name, _bits) + 8 + _bits / 8 -+ u8 _pad[0 + BCH_INODE_FIELDS()]; -+#undef x -+} __attribute__((packed, aligned(8))); -+ -+void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *); -+int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *); -+ -+struct btree_iter *bch2_inode_peek(struct btree_trans *, -+ struct bch_inode_unpacked *, u64, unsigned); -+int bch2_inode_write(struct btree_trans *, struct btree_iter *, -+ struct bch_inode_unpacked *); -+ -+void bch2_inode_init_early(struct bch_fs *, -+ struct bch_inode_unpacked *); -+void bch2_inode_init_late(struct bch_inode_unpacked *, u64, -+ uid_t, gid_t, umode_t, dev_t, -+ struct bch_inode_unpacked *); -+void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, -+ uid_t, gid_t, umode_t, dev_t, -+ struct bch_inode_unpacked *); -+ -+int bch2_inode_create(struct btree_trans *, -+ struct bch_inode_unpacked *, -+ u64, u64, u64 *); -+ -+int bch2_inode_rm(struct bch_fs *, u64); -+ -+int bch2_inode_find_by_inum_trans(struct btree_trans *, u64, -+ struct bch_inode_unpacked *); -+int bch2_inode_find_by_inum(struct bch_fs *, u64, struct bch_inode_unpacked *); -+ -+static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode) -+{ -+ struct bch_io_opts ret = { 0 }; -+ -+#define x(_name, _bits) \ -+ if (inode->bi_##_name) \ -+ opt_set(ret, _name, inode->bi_##_name - 1); -+ BCH_INODE_OPTS() -+#undef x -+ 
return ret; -+} -+ -+static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode, -+ enum inode_opt_id id, u64 v) -+{ -+ switch (id) { -+#define x(_name, ...) \ -+ case Inode_opt_##_name: \ -+ inode->bi_##_name = v; \ -+ break; -+ BCH_INODE_OPTS() -+#undef x -+ default: -+ BUG(); -+ } -+} -+ -+static inline u64 bch2_inode_opt_get(struct bch_inode_unpacked *inode, -+ enum inode_opt_id id) -+{ -+ switch (id) { -+#define x(_name, ...) \ -+ case Inode_opt_##_name: \ -+ return inode->bi_##_name; -+ BCH_INODE_OPTS() -+#undef x -+ default: -+ BUG(); -+ } -+} -+ -+static inline struct bch_io_opts -+io_opts(struct bch_fs *c, struct bch_inode_unpacked *inode) -+{ -+ struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts); -+ -+ bch2_io_opts_apply(&opts, bch2_inode_opts_get(inode)); -+ return opts; -+} -+ -+static inline u8 mode_to_type(umode_t mode) -+{ -+ return (mode >> 12) & 15; -+} -+ -+/* i_nlink: */ -+ -+static inline unsigned nlink_bias(umode_t mode) -+{ -+ return S_ISDIR(mode) ? 2 : 1; -+} -+ -+static inline void bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) -+{ -+ if (bi->bi_flags & BCH_INODE_UNLINKED) -+ bi->bi_flags &= ~BCH_INODE_UNLINKED; -+ else -+ bi->bi_nlink++; -+} -+ -+static inline void bch2_inode_nlink_dec(struct bch_inode_unpacked *bi) -+{ -+ BUG_ON(bi->bi_flags & BCH_INODE_UNLINKED); -+ if (bi->bi_nlink) -+ bi->bi_nlink--; -+ else -+ bi->bi_flags |= BCH_INODE_UNLINKED; -+} -+ -+static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi) -+{ -+ return bi->bi_flags & BCH_INODE_UNLINKED -+ ? 
0 -+ : bi->bi_nlink + nlink_bias(bi->bi_mode); -+} -+ -+static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, -+ unsigned nlink) -+{ -+ if (nlink) { -+ bi->bi_nlink = nlink - nlink_bias(bi->bi_mode); -+ bi->bi_flags &= ~BCH_INODE_UNLINKED; -+ } else { -+ bi->bi_nlink = 0; -+ bi->bi_flags |= BCH_INODE_UNLINKED; -+ } -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_inode_pack_test(void); -+#else -+static inline void bch2_inode_pack_test(void) {} -+#endif -+ -+#endif /* _BCACHEFS_INODE_H */ -diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c -new file mode 100644 -index 000000000000..5c9c3cf54edd ---- /dev/null -+++ b/fs/bcachefs/io.c -@@ -0,0 +1,2387 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Some low level IO code, and hacks for various block layer limitations -+ * -+ * Copyright 2010, 2011 Kent Overstreet -+ * Copyright 2012 Google, Inc. -+ */ -+ -+#include "bcachefs.h" -+#include "alloc_foreground.h" -+#include "bkey_on_stack.h" -+#include "bset.h" -+#include "btree_update.h" -+#include "buckets.h" -+#include "checksum.h" -+#include "compress.h" -+#include "clock.h" -+#include "debug.h" -+#include "disk_groups.h" -+#include "ec.h" -+#include "error.h" -+#include "extent_update.h" -+#include "inode.h" -+#include "io.h" -+#include "journal.h" -+#include "keylist.h" -+#include "move.h" -+#include "rebalance.h" -+#include "super.h" -+#include "super-io.h" -+ -+#include -+#include -+#include -+ -+#include -+ -+const char *bch2_blk_status_to_str(blk_status_t status) -+{ -+ if (status == BLK_STS_REMOVED) -+ return "device removed"; -+ return blk_status_to_str(status); -+} -+ -+static bool bch2_target_congested(struct bch_fs *c, u16 target) -+{ -+ const struct bch_devs_mask *devs; -+ unsigned d, nr = 0, total = 0; -+ u64 now = local_clock(), last; -+ s64 congested; -+ struct bch_dev *ca; -+ -+ if (!target) -+ return false; -+ -+ rcu_read_lock(); -+ devs = bch2_target_to_mask(c, target) ?: -+ &c->rw_devs[BCH_DATA_user]; -+ -+ for_each_set_bit(d, 
devs->d, BCH_SB_MEMBERS_MAX) { -+ ca = rcu_dereference(c->devs[d]); -+ if (!ca) -+ continue; -+ -+ congested = atomic_read(&ca->congested); -+ last = READ_ONCE(ca->congested_last); -+ if (time_after64(now, last)) -+ congested -= (now - last) >> 12; -+ -+ total += max(congested, 0LL); -+ nr++; -+ } -+ rcu_read_unlock(); -+ -+ return bch2_rand_range(nr * CONGESTED_MAX) < total; -+} -+ -+static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency, -+ u64 now, int rw) -+{ -+ u64 latency_capable = -+ ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m; -+ /* ideally we'd be taking into account the device's variance here: */ -+ u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3); -+ s64 latency_over = io_latency - latency_threshold; -+ -+ if (latency_threshold && latency_over > 0) { -+ /* -+ * bump up congested by approximately latency_over * 4 / -+ * latency_threshold - we don't need much accuracy here so don't -+ * bother with the divide: -+ */ -+ if (atomic_read(&ca->congested) < CONGESTED_MAX) -+ atomic_add(latency_over >> -+ max_t(int, ilog2(latency_threshold) - 2, 0), -+ &ca->congested); -+ -+ ca->congested_last = now; -+ } else if (atomic_read(&ca->congested) > 0) { -+ atomic_dec(&ca->congested); -+ } -+} -+ -+void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) -+{ -+ atomic64_t *latency = &ca->cur_latency[rw]; -+ u64 now = local_clock(); -+ u64 io_latency = time_after64(now, submit_time) -+ ? 
now - submit_time -+ : 0; -+ u64 old, new, v = atomic64_read(latency); -+ -+ do { -+ old = v; -+ -+ /* -+ * If the io latency was reasonably close to the current -+ * latency, skip doing the update and atomic operation - most of -+ * the time: -+ */ -+ if (abs((int) (old - io_latency)) < (old >> 1) && -+ now & ~(~0 << 5)) -+ break; -+ -+ new = ewma_add(old, io_latency, 5); -+ } while ((v = atomic64_cmpxchg(latency, old, new)) != old); -+ -+ bch2_congested_acct(ca, io_latency, now, rw); -+ -+ __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now); -+} -+ -+/* Allocate, free from mempool: */ -+ -+void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) -+{ -+ struct bvec_iter_all iter; -+ struct bio_vec *bv; -+ -+ bio_for_each_segment_all(bv, bio, iter) -+ if (bv->bv_page != ZERO_PAGE(0)) -+ mempool_free(bv->bv_page, &c->bio_bounce_pages); -+ bio->bi_vcnt = 0; -+} -+ -+static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool) -+{ -+ struct page *page; -+ -+ if (likely(!*using_mempool)) { -+ page = alloc_page(GFP_NOIO); -+ if (unlikely(!page)) { -+ mutex_lock(&c->bio_bounce_pages_lock); -+ *using_mempool = true; -+ goto pool_alloc; -+ -+ } -+ } else { -+pool_alloc: -+ page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO); -+ } -+ -+ return page; -+} -+ -+void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, -+ size_t size) -+{ -+ bool using_mempool = false; -+ -+ while (size) { -+ struct page *page = __bio_alloc_page_pool(c, &using_mempool); -+ unsigned len = min(PAGE_SIZE, size); -+ -+ BUG_ON(!bio_add_page(bio, page, len, 0)); -+ size -= len; -+ } -+ -+ if (using_mempool) -+ mutex_unlock(&c->bio_bounce_pages_lock); -+} -+ -+/* Extent update path: */ -+ -+static int sum_sector_overwrites(struct btree_trans *trans, -+ struct btree_iter *extent_iter, -+ struct bkey_i *new, -+ bool may_allocate, -+ bool *maybe_extending, -+ s64 *delta) -+{ -+ struct btree_iter *iter; -+ struct bkey_s_c old; -+ int ret = 0; -+ -+ 
*maybe_extending = true; -+ *delta = 0; -+ -+ iter = bch2_trans_copy_iter(trans, extent_iter); -+ if (IS_ERR(iter)) -+ return PTR_ERR(iter); -+ -+ for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) { -+ if (!may_allocate && -+ bch2_bkey_nr_ptrs_fully_allocated(old) < -+ bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new))) { -+ ret = -ENOSPC; -+ break; -+ } -+ -+ *delta += (min(new->k.p.offset, -+ old.k->p.offset) - -+ max(bkey_start_offset(&new->k), -+ bkey_start_offset(old.k))) * -+ (bkey_extent_is_allocation(&new->k) - -+ bkey_extent_is_allocation(old.k)); -+ -+ if (bkey_cmp(old.k->p, new->k.p) >= 0) { -+ /* -+ * Check if there's already data above where we're -+ * going to be writing to - this means we're definitely -+ * not extending the file: -+ * -+ * Note that it's not sufficient to check if there's -+ * data up to the sector offset we're going to be -+ * writing to, because i_size could be up to one block -+ * less: -+ */ -+ if (!bkey_cmp(old.k->p, new->k.p)) -+ old = bch2_btree_iter_next(iter); -+ -+ if (old.k && !bkey_err(old) && -+ old.k->p.inode == extent_iter->pos.inode && -+ bkey_extent_is_data(old.k)) -+ *maybe_extending = false; -+ -+ break; -+ } -+ } -+ -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+int bch2_extent_update(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_i *k, -+ struct disk_reservation *disk_res, -+ u64 *journal_seq, -+ u64 new_i_size, -+ s64 *i_sectors_delta) -+{ -+ /* this must live until after bch2_trans_commit(): */ -+ struct bkey_inode_buf inode_p; -+ bool extending = false; -+ s64 delta = 0; -+ int ret; -+ -+ ret = bch2_extent_trim_atomic(k, iter); -+ if (ret) -+ return ret; -+ -+ ret = sum_sector_overwrites(trans, iter, k, -+ disk_res && disk_res->sectors != 0, -+ &extending, &delta); -+ if (ret) -+ return ret; -+ -+ new_i_size = extending -+ ? 
min(k->k.p.offset << 9, new_i_size) -+ : 0; -+ -+ if (delta || new_i_size) { -+ struct btree_iter *inode_iter; -+ struct bch_inode_unpacked inode_u; -+ -+ inode_iter = bch2_inode_peek(trans, &inode_u, -+ k->k.p.inode, BTREE_ITER_INTENT); -+ if (IS_ERR(inode_iter)) -+ return PTR_ERR(inode_iter); -+ -+ /* -+ * XXX: -+ * writeback can race a bit with truncate, because truncate -+ * first updates the inode then truncates the pagecache. This is -+ * ugly, but lets us preserve the invariant that the in memory -+ * i_size is always >= the on disk i_size. -+ * -+ BUG_ON(new_i_size > inode_u.bi_size && -+ (inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY)); -+ */ -+ BUG_ON(new_i_size > inode_u.bi_size && !extending); -+ -+ if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && -+ new_i_size > inode_u.bi_size) -+ inode_u.bi_size = new_i_size; -+ else -+ new_i_size = 0; -+ -+ inode_u.bi_sectors += delta; -+ -+ if (delta || new_i_size) { -+ bch2_inode_pack(&inode_p, &inode_u); -+ bch2_trans_update(trans, inode_iter, -+ &inode_p.inode.k_i, 0); -+ } -+ -+ bch2_trans_iter_put(trans, inode_iter); -+ } -+ -+ bch2_trans_update(trans, iter, k, 0); -+ -+ ret = bch2_trans_commit(trans, disk_res, journal_seq, -+ BTREE_INSERT_NOCHECK_RW| -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_USE_RESERVE); -+ if (!ret && i_sectors_delta) -+ *i_sectors_delta += delta; -+ -+ return ret; -+} -+ -+int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, -+ struct bpos end, u64 *journal_seq, -+ s64 *i_sectors_delta) -+{ -+ struct bch_fs *c = trans->c; -+ unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); -+ struct bkey_s_c k; -+ int ret = 0, ret2 = 0; -+ -+ while ((k = bch2_btree_iter_peek(iter)).k && -+ bkey_cmp(iter->pos, end) < 0) { -+ struct disk_reservation disk_res = -+ bch2_disk_reservation_init(c, 0); -+ struct bkey_i delete; -+ -+ bch2_trans_begin(trans); -+ -+ ret = bkey_err(k); -+ if (ret) -+ goto btree_err; -+ -+ bkey_init(&delete.k); -+ delete.k.p = iter->pos; -+ -+ /* create the 
biggest key we can */ -+ bch2_key_resize(&delete.k, max_sectors); -+ bch2_cut_back(end, &delete); -+ -+ ret = bch2_extent_update(trans, iter, &delete, -+ &disk_res, journal_seq, -+ 0, i_sectors_delta); -+ bch2_disk_reservation_put(c, &disk_res); -+btree_err: -+ if (ret == -EINTR) { -+ ret2 = ret; -+ ret = 0; -+ } -+ if (ret) -+ break; -+ } -+ -+ if (bkey_cmp(iter->pos, end) > 0) { -+ bch2_btree_iter_set_pos(iter, end); -+ ret = bch2_btree_iter_traverse(iter); -+ } -+ -+ return ret ?: ret2; -+} -+ -+int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end, -+ u64 *journal_seq, s64 *i_sectors_delta) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, -+ POS(inum, start), -+ BTREE_ITER_INTENT); -+ -+ ret = bch2_fpunch_at(&trans, iter, POS(inum, end), -+ journal_seq, i_sectors_delta); -+ bch2_trans_exit(&trans); -+ -+ if (ret == -EINTR) -+ ret = 0; -+ -+ return ret; -+} -+ -+int bch2_write_index_default(struct bch_write_op *op) -+{ -+ struct bch_fs *c = op->c; -+ struct bkey_on_stack sk; -+ struct keylist *keys = &op->insert_keys; -+ struct bkey_i *k = bch2_keylist_front(keys); -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ int ret; -+ -+ bkey_on_stack_init(&sk); -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, -+ bkey_start_pos(&k->k), -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); -+ -+ do { -+ bch2_trans_begin(&trans); -+ -+ k = bch2_keylist_front(keys); -+ -+ bkey_on_stack_realloc(&sk, c, k->k.u64s); -+ bkey_copy(sk.k, k); -+ bch2_cut_front(iter->pos, sk.k); -+ -+ ret = bch2_extent_update(&trans, iter, sk.k, -+ &op->res, op_journal_seq(op), -+ op->new_i_size, &op->i_sectors_delta); -+ if (ret == -EINTR) -+ continue; -+ if (ret) -+ break; -+ -+ if (bkey_cmp(iter->pos, k->k.p) >= 0) -+ bch2_keylist_pop_front(keys); -+ } while (!bch2_keylist_empty(keys)); -+ 
-+ bch2_trans_exit(&trans); -+ bkey_on_stack_exit(&sk, c); -+ -+ return ret; -+} -+ -+/* Writes */ -+ -+void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, -+ enum bch_data_type type, -+ const struct bkey_i *k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); -+ const struct bch_extent_ptr *ptr; -+ struct bch_write_bio *n; -+ struct bch_dev *ca; -+ -+ BUG_ON(c->opts.nochanges); -+ -+ bkey_for_each_ptr(ptrs, ptr) { -+ BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX || -+ !c->devs[ptr->dev]); -+ -+ ca = bch_dev_bkey_exists(c, ptr->dev); -+ -+ if (to_entry(ptr + 1) < ptrs.end) { -+ n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO, -+ &ca->replica_set)); -+ -+ n->bio.bi_end_io = wbio->bio.bi_end_io; -+ n->bio.bi_private = wbio->bio.bi_private; -+ n->parent = wbio; -+ n->split = true; -+ n->bounce = false; -+ n->put_bio = true; -+ n->bio.bi_opf = wbio->bio.bi_opf; -+ bio_inc_remaining(&wbio->bio); -+ } else { -+ n = wbio; -+ n->split = false; -+ } -+ -+ n->c = c; -+ n->dev = ptr->dev; -+ n->have_ioref = bch2_dev_get_ioref(ca, -+ type == BCH_DATA_btree ? 
READ : WRITE); -+ n->submit_time = local_clock(); -+ n->bio.bi_iter.bi_sector = ptr->offset; -+ -+ if (!journal_flushes_device(ca)) -+ n->bio.bi_opf |= REQ_FUA; -+ -+ if (likely(n->have_ioref)) { -+ this_cpu_add(ca->io_done->sectors[WRITE][type], -+ bio_sectors(&n->bio)); -+ -+ bio_set_dev(&n->bio, ca->disk_sb.bdev); -+ submit_bio(&n->bio); -+ } else { -+ n->bio.bi_status = BLK_STS_REMOVED; -+ bio_endio(&n->bio); -+ } -+ } -+} -+ -+static void __bch2_write(struct closure *); -+ -+static void bch2_write_done(struct closure *cl) -+{ -+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); -+ struct bch_fs *c = op->c; -+ -+ if (!op->error && (op->flags & BCH_WRITE_FLUSH)) -+ op->error = bch2_journal_error(&c->journal); -+ -+ bch2_disk_reservation_put(c, &op->res); -+ percpu_ref_put(&c->writes); -+ bch2_keylist_free(&op->insert_keys, op->inline_keys); -+ -+ bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); -+ -+ if (!(op->flags & BCH_WRITE_FROM_INTERNAL)) -+ up(&c->io_in_flight); -+ -+ if (op->end_io) { -+ EBUG_ON(cl->parent); -+ closure_debug_destroy(cl); -+ op->end_io(op); -+ } else { -+ closure_return(cl); -+ } -+} -+ -+/** -+ * bch_write_index - after a write, update index to point to new data -+ */ -+static void __bch2_write_index(struct bch_write_op *op) -+{ -+ struct bch_fs *c = op->c; -+ struct keylist *keys = &op->insert_keys; -+ struct bch_extent_ptr *ptr; -+ struct bkey_i *src, *dst = keys->keys, *n, *k; -+ unsigned dev; -+ int ret; -+ -+ for (src = keys->keys; src != keys->top; src = n) { -+ n = bkey_next(src); -+ -+ if (bkey_extent_is_direct_data(&src->k)) { -+ bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr, -+ test_bit(ptr->dev, op->failed.d)); -+ -+ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) { -+ ret = -EIO; -+ goto err; -+ } -+ } -+ -+ if (dst != src) -+ memmove_u64s_down(dst, src, src->u64s); -+ dst = bkey_next(dst); -+ } -+ -+ keys->top = dst; -+ -+ /* -+ * probably not the ideal place to hook this in, but I don't -+ * 
particularly want to plumb io_opts all the way through the btree -+ * update stack right now -+ */ -+ for_each_keylist_key(keys, k) { -+ bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts); -+ -+ if (bch2_bkey_is_incompressible(bkey_i_to_s_c(k))) -+ bch2_check_set_feature(op->c, BCH_FEATURE_incompressible); -+ -+ } -+ -+ if (!bch2_keylist_empty(keys)) { -+ u64 sectors_start = keylist_sectors(keys); -+ int ret = op->index_update_fn(op); -+ -+ BUG_ON(ret == -EINTR); -+ BUG_ON(keylist_sectors(keys) && !ret); -+ -+ op->written += sectors_start - keylist_sectors(keys); -+ -+ if (ret) { -+ __bcache_io_error(c, "btree IO error %i", ret); -+ op->error = ret; -+ } -+ } -+out: -+ /* If some a bucket wasn't written, we can't erasure code it: */ -+ for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX) -+ bch2_open_bucket_write_error(c, &op->open_buckets, dev); -+ -+ bch2_open_buckets_put(c, &op->open_buckets); -+ return; -+err: -+ keys->top = keys->keys; -+ op->error = ret; -+ goto out; -+} -+ -+static void bch2_write_index(struct closure *cl) -+{ -+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); -+ struct bch_fs *c = op->c; -+ -+ __bch2_write_index(op); -+ -+ if (!(op->flags & BCH_WRITE_DONE)) { -+ continue_at(cl, __bch2_write, index_update_wq(op)); -+ } else if (!op->error && (op->flags & BCH_WRITE_FLUSH)) { -+ bch2_journal_flush_seq_async(&c->journal, -+ *op_journal_seq(op), -+ cl); -+ continue_at(cl, bch2_write_done, index_update_wq(op)); -+ } else { -+ continue_at_nobarrier(cl, bch2_write_done, NULL); -+ } -+} -+ -+static void bch2_write_endio(struct bio *bio) -+{ -+ struct closure *cl = bio->bi_private; -+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); -+ struct bch_write_bio *wbio = to_wbio(bio); -+ struct bch_write_bio *parent = wbio->split ? 
wbio->parent : NULL; -+ struct bch_fs *c = wbio->c; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); -+ -+ if (bch2_dev_io_err_on(bio->bi_status, ca, "data write: %s", -+ bch2_blk_status_to_str(bio->bi_status))) -+ set_bit(wbio->dev, op->failed.d); -+ -+ if (wbio->have_ioref) { -+ bch2_latency_acct(ca, wbio->submit_time, WRITE); -+ percpu_ref_put(&ca->io_ref); -+ } -+ -+ if (wbio->bounce) -+ bch2_bio_free_pages_pool(c, bio); -+ -+ if (wbio->put_bio) -+ bio_put(bio); -+ -+ if (parent) -+ bio_endio(&parent->bio); -+ else if (!(op->flags & BCH_WRITE_SKIP_CLOSURE_PUT)) -+ closure_put(cl); -+ else -+ continue_at_nobarrier(cl, bch2_write_index, index_update_wq(op)); -+} -+ -+static void init_append_extent(struct bch_write_op *op, -+ struct write_point *wp, -+ struct bversion version, -+ struct bch_extent_crc_unpacked crc) -+{ -+ struct bch_fs *c = op->c; -+ struct bkey_i_extent *e; -+ struct open_bucket *ob; -+ unsigned i; -+ -+ BUG_ON(crc.compressed_size > wp->sectors_free); -+ wp->sectors_free -= crc.compressed_size; -+ op->pos.offset += crc.uncompressed_size; -+ -+ e = bkey_extent_init(op->insert_keys.top); -+ e->k.p = op->pos; -+ e->k.size = crc.uncompressed_size; -+ e->k.version = version; -+ -+ if (crc.csum_type || -+ crc.compression_type || -+ crc.nonce) -+ bch2_extent_crc_append(&e->k_i, crc); -+ -+ open_bucket_for_each(c, &wp->ptrs, ob, i) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); -+ union bch_extent_entry *end = -+ bkey_val_end(bkey_i_to_s(&e->k_i)); -+ -+ end->ptr = ob->ptr; -+ end->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; -+ end->ptr.cached = !ca->mi.durability || -+ (op->flags & BCH_WRITE_CACHED) != 0; -+ end->ptr.offset += ca->mi.bucket_size - ob->sectors_free; -+ -+ e->k.u64s++; -+ -+ BUG_ON(crc.compressed_size > ob->sectors_free); -+ ob->sectors_free -= crc.compressed_size; -+ } -+ -+ bch2_keylist_push(&op->insert_keys); -+} -+ -+static struct bio *bch2_write_bio_alloc(struct bch_fs *c, -+ struct write_point *wp, -+ struct bio 
*src, -+ bool *page_alloc_failed, -+ void *buf) -+{ -+ struct bch_write_bio *wbio; -+ struct bio *bio; -+ unsigned output_available = -+ min(wp->sectors_free << 9, src->bi_iter.bi_size); -+ unsigned pages = DIV_ROUND_UP(output_available + -+ (buf -+ ? ((unsigned long) buf & (PAGE_SIZE - 1)) -+ : 0), PAGE_SIZE); -+ -+ bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write); -+ wbio = wbio_init(bio); -+ wbio->put_bio = true; -+ /* copy WRITE_SYNC flag */ -+ wbio->bio.bi_opf = src->bi_opf; -+ -+ if (buf) { -+ bch2_bio_map(bio, buf, output_available); -+ return bio; -+ } -+ -+ wbio->bounce = true; -+ -+ /* -+ * We can't use mempool for more than c->sb.encoded_extent_max -+ * worth of pages, but we'd like to allocate more if we can: -+ */ -+ bch2_bio_alloc_pages_pool(c, bio, -+ min_t(unsigned, output_available, -+ c->sb.encoded_extent_max << 9)); -+ -+ if (bio->bi_iter.bi_size < output_available) -+ *page_alloc_failed = -+ bch2_bio_alloc_pages(bio, -+ output_available - -+ bio->bi_iter.bi_size, -+ GFP_NOFS) != 0; -+ -+ return bio; -+} -+ -+static int bch2_write_rechecksum(struct bch_fs *c, -+ struct bch_write_op *op, -+ unsigned new_csum_type) -+{ -+ struct bio *bio = &op->wbio.bio; -+ struct bch_extent_crc_unpacked new_crc; -+ int ret; -+ -+ /* bch2_rechecksum_bio() can't encrypt or decrypt data: */ -+ -+ if (bch2_csum_type_is_encryption(op->crc.csum_type) != -+ bch2_csum_type_is_encryption(new_csum_type)) -+ new_csum_type = op->crc.csum_type; -+ -+ ret = bch2_rechecksum_bio(c, bio, op->version, op->crc, -+ NULL, &new_crc, -+ op->crc.offset, op->crc.live_size, -+ new_csum_type); -+ if (ret) -+ return ret; -+ -+ bio_advance(bio, op->crc.offset << 9); -+ bio->bi_iter.bi_size = op->crc.live_size << 9; -+ op->crc = new_crc; -+ return 0; -+} -+ -+static int bch2_write_decrypt(struct bch_write_op *op) -+{ -+ struct bch_fs *c = op->c; -+ struct nonce nonce = extent_nonce(op->version, op->crc); -+ struct bch_csum csum; -+ -+ if (!bch2_csum_type_is_encryption(op->crc.csum_type)) 
-+ return 0; -+ -+ /* -+ * If we need to decrypt data in the write path, we'll no longer be able -+ * to verify the existing checksum (poly1305 mac, in this case) after -+ * it's decrypted - this is the last point we'll be able to reverify the -+ * checksum: -+ */ -+ csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); -+ if (bch2_crc_cmp(op->crc.csum, csum)) -+ return -EIO; -+ -+ bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); -+ op->crc.csum_type = 0; -+ op->crc.csum = (struct bch_csum) { 0, 0 }; -+ return 0; -+} -+ -+static enum prep_encoded_ret { -+ PREP_ENCODED_OK, -+ PREP_ENCODED_ERR, -+ PREP_ENCODED_CHECKSUM_ERR, -+ PREP_ENCODED_DO_WRITE, -+} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp) -+{ -+ struct bch_fs *c = op->c; -+ struct bio *bio = &op->wbio.bio; -+ -+ if (!(op->flags & BCH_WRITE_DATA_ENCODED)) -+ return PREP_ENCODED_OK; -+ -+ BUG_ON(bio_sectors(bio) != op->crc.compressed_size); -+ -+ /* Can we just write the entire extent as is? 
*/ -+ if (op->crc.uncompressed_size == op->crc.live_size && -+ op->crc.compressed_size <= wp->sectors_free && -+ (op->crc.compression_type == op->compression_type || -+ op->incompressible)) { -+ if (!crc_is_compressed(op->crc) && -+ op->csum_type != op->crc.csum_type && -+ bch2_write_rechecksum(c, op, op->csum_type)) -+ return PREP_ENCODED_CHECKSUM_ERR; -+ -+ return PREP_ENCODED_DO_WRITE; -+ } -+ -+ /* -+ * If the data is compressed and we couldn't write the entire extent as -+ * is, we have to decompress it: -+ */ -+ if (crc_is_compressed(op->crc)) { -+ struct bch_csum csum; -+ -+ if (bch2_write_decrypt(op)) -+ return PREP_ENCODED_CHECKSUM_ERR; -+ -+ /* Last point we can still verify checksum: */ -+ csum = bch2_checksum_bio(c, op->crc.csum_type, -+ extent_nonce(op->version, op->crc), -+ bio); -+ if (bch2_crc_cmp(op->crc.csum, csum)) -+ return PREP_ENCODED_CHECKSUM_ERR; -+ -+ if (bch2_bio_uncompress_inplace(c, bio, &op->crc)) -+ return PREP_ENCODED_ERR; -+ } -+ -+ /* -+ * No longer have compressed data after this point - data might be -+ * encrypted: -+ */ -+ -+ /* -+ * If the data is checksummed and we're only writing a subset, -+ * rechecksum and adjust bio to point to currently live data: -+ */ -+ if ((op->crc.live_size != op->crc.uncompressed_size || -+ op->crc.csum_type != op->csum_type) && -+ bch2_write_rechecksum(c, op, op->csum_type)) -+ return PREP_ENCODED_CHECKSUM_ERR; -+ -+ /* -+ * If we want to compress the data, it has to be decrypted: -+ */ -+ if ((op->compression_type || -+ bch2_csum_type_is_encryption(op->crc.csum_type) != -+ bch2_csum_type_is_encryption(op->csum_type)) && -+ bch2_write_decrypt(op)) -+ return PREP_ENCODED_CHECKSUM_ERR; -+ -+ return PREP_ENCODED_OK; -+} -+ -+static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, -+ struct bio **_dst) -+{ -+ struct bch_fs *c = op->c; -+ struct bio *src = &op->wbio.bio, *dst = src; -+ struct bvec_iter saved_iter; -+ void *ec_buf; -+ struct bpos ec_pos = op->pos; -+ unsigned 
total_output = 0, total_input = 0; -+ bool bounce = false; -+ bool page_alloc_failed = false; -+ int ret, more = 0; -+ -+ BUG_ON(!bio_sectors(src)); -+ -+ ec_buf = bch2_writepoint_ec_buf(c, wp); -+ -+ switch (bch2_write_prep_encoded_data(op, wp)) { -+ case PREP_ENCODED_OK: -+ break; -+ case PREP_ENCODED_ERR: -+ ret = -EIO; -+ goto err; -+ case PREP_ENCODED_CHECKSUM_ERR: -+ BUG(); -+ goto csum_err; -+ case PREP_ENCODED_DO_WRITE: -+ /* XXX look for bug here */ -+ if (ec_buf) { -+ dst = bch2_write_bio_alloc(c, wp, src, -+ &page_alloc_failed, -+ ec_buf); -+ bio_copy_data(dst, src); -+ bounce = true; -+ } -+ init_append_extent(op, wp, op->version, op->crc); -+ goto do_write; -+ } -+ -+ if (ec_buf || -+ op->compression_type || -+ (op->csum_type && -+ !(op->flags & BCH_WRITE_PAGES_STABLE)) || -+ (bch2_csum_type_is_encryption(op->csum_type) && -+ !(op->flags & BCH_WRITE_PAGES_OWNED))) { -+ dst = bch2_write_bio_alloc(c, wp, src, -+ &page_alloc_failed, -+ ec_buf); -+ bounce = true; -+ } -+ -+ saved_iter = dst->bi_iter; -+ -+ do { -+ struct bch_extent_crc_unpacked crc = -+ (struct bch_extent_crc_unpacked) { 0 }; -+ struct bversion version = op->version; -+ size_t dst_len, src_len; -+ -+ if (page_alloc_failed && -+ bio_sectors(dst) < wp->sectors_free && -+ bio_sectors(dst) < c->sb.encoded_extent_max) -+ break; -+ -+ BUG_ON(op->compression_type && -+ (op->flags & BCH_WRITE_DATA_ENCODED) && -+ bch2_csum_type_is_encryption(op->crc.csum_type)); -+ BUG_ON(op->compression_type && !bounce); -+ -+ crc.compression_type = op->incompressible -+ ? BCH_COMPRESSION_TYPE_incompressible -+ : op->compression_type -+ ? 
bch2_bio_compress(c, dst, &dst_len, src, &src_len, -+ op->compression_type) -+ : 0; -+ if (!crc_is_compressed(crc)) { -+ dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); -+ dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9); -+ -+ if (op->csum_type) -+ dst_len = min_t(unsigned, dst_len, -+ c->sb.encoded_extent_max << 9); -+ -+ if (bounce) { -+ swap(dst->bi_iter.bi_size, dst_len); -+ bio_copy_data(dst, src); -+ swap(dst->bi_iter.bi_size, dst_len); -+ } -+ -+ src_len = dst_len; -+ } -+ -+ BUG_ON(!src_len || !dst_len); -+ -+ if (bch2_csum_type_is_encryption(op->csum_type)) { -+ if (bversion_zero(version)) { -+ version.lo = atomic64_inc_return(&c->key_version); -+ } else { -+ crc.nonce = op->nonce; -+ op->nonce += src_len >> 9; -+ } -+ } -+ -+ if ((op->flags & BCH_WRITE_DATA_ENCODED) && -+ !crc_is_compressed(crc) && -+ bch2_csum_type_is_encryption(op->crc.csum_type) == -+ bch2_csum_type_is_encryption(op->csum_type)) { -+ /* -+ * Note: when we're using rechecksum(), we need to be -+ * checksumming @src because it has all the data our -+ * existing checksum covers - if we bounced (because we -+ * were trying to compress), @dst will only have the -+ * part of the data the new checksum will cover. -+ * -+ * But normally we want to be checksumming post bounce, -+ * because part of the reason for bouncing is so the -+ * data can't be modified (by userspace) while it's in -+ * flight. 
-+ */ -+ if (bch2_rechecksum_bio(c, src, version, op->crc, -+ &crc, &op->crc, -+ src_len >> 9, -+ bio_sectors(src) - (src_len >> 9), -+ op->csum_type)) -+ goto csum_err; -+ } else { -+ if ((op->flags & BCH_WRITE_DATA_ENCODED) && -+ bch2_rechecksum_bio(c, src, version, op->crc, -+ NULL, &op->crc, -+ src_len >> 9, -+ bio_sectors(src) - (src_len >> 9), -+ op->crc.csum_type)) -+ goto csum_err; -+ -+ crc.compressed_size = dst_len >> 9; -+ crc.uncompressed_size = src_len >> 9; -+ crc.live_size = src_len >> 9; -+ -+ swap(dst->bi_iter.bi_size, dst_len); -+ bch2_encrypt_bio(c, op->csum_type, -+ extent_nonce(version, crc), dst); -+ crc.csum = bch2_checksum_bio(c, op->csum_type, -+ extent_nonce(version, crc), dst); -+ crc.csum_type = op->csum_type; -+ swap(dst->bi_iter.bi_size, dst_len); -+ } -+ -+ init_append_extent(op, wp, version, crc); -+ -+ if (dst != src) -+ bio_advance(dst, dst_len); -+ bio_advance(src, src_len); -+ total_output += dst_len; -+ total_input += src_len; -+ } while (dst->bi_iter.bi_size && -+ src->bi_iter.bi_size && -+ wp->sectors_free && -+ !bch2_keylist_realloc(&op->insert_keys, -+ op->inline_keys, -+ ARRAY_SIZE(op->inline_keys), -+ BKEY_EXTENT_U64s_MAX)); -+ -+ more = src->bi_iter.bi_size != 0; -+ -+ dst->bi_iter = saved_iter; -+ -+ if (dst == src && more) { -+ BUG_ON(total_output != total_input); -+ -+ dst = bio_split(src, total_input >> 9, -+ GFP_NOIO, &c->bio_write); -+ wbio_init(dst)->put_bio = true; -+ /* copy WRITE_SYNC flag */ -+ dst->bi_opf = src->bi_opf; -+ } -+ -+ dst->bi_iter.bi_size = total_output; -+do_write: -+ /* might have done a realloc... 
*/ -+ bch2_ec_add_backpointer(c, wp, ec_pos, total_input >> 9); -+ -+ *_dst = dst; -+ return more; -+csum_err: -+ bch_err(c, "error verifying existing checksum while " -+ "rewriting existing data (memory corruption?)"); -+ ret = -EIO; -+err: -+ if (to_wbio(dst)->bounce) -+ bch2_bio_free_pages_pool(c, dst); -+ if (to_wbio(dst)->put_bio) -+ bio_put(dst); -+ -+ return ret; -+} -+ -+static void __bch2_write(struct closure *cl) -+{ -+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); -+ struct bch_fs *c = op->c; -+ struct write_point *wp; -+ struct bio *bio; -+ bool skip_put = true; -+ unsigned nofs_flags; -+ int ret; -+ -+ nofs_flags = memalloc_nofs_save(); -+again: -+ memset(&op->failed, 0, sizeof(op->failed)); -+ -+ do { -+ struct bkey_i *key_to_write; -+ unsigned key_to_write_offset = op->insert_keys.top_p - -+ op->insert_keys.keys_p; -+ -+ /* +1 for possible cache device: */ -+ if (op->open_buckets.nr + op->nr_replicas + 1 > -+ ARRAY_SIZE(op->open_buckets.v)) -+ goto flush_io; -+ -+ if (bch2_keylist_realloc(&op->insert_keys, -+ op->inline_keys, -+ ARRAY_SIZE(op->inline_keys), -+ BKEY_EXTENT_U64s_MAX)) -+ goto flush_io; -+ -+ if ((op->flags & BCH_WRITE_FROM_INTERNAL) && -+ percpu_ref_is_dying(&c->writes)) { -+ ret = -EROFS; -+ goto err; -+ } -+ -+ /* -+ * The copygc thread is now global, which means it's no longer -+ * freeing up space on specific disks, which means that -+ * allocations for specific disks may hang arbitrarily long: -+ */ -+ wp = bch2_alloc_sectors_start(c, -+ op->target, -+ op->opts.erasure_code, -+ op->write_point, -+ &op->devs_have, -+ op->nr_replicas, -+ op->nr_replicas_required, -+ op->alloc_reserve, -+ op->flags, -+ (op->flags & (BCH_WRITE_ALLOC_NOWAIT| -+ BCH_WRITE_ONLY_SPECIFIED_DEVS)) ? 
NULL : cl); -+ EBUG_ON(!wp); -+ -+ if (unlikely(IS_ERR(wp))) { -+ if (unlikely(PTR_ERR(wp) != -EAGAIN)) { -+ ret = PTR_ERR(wp); -+ goto err; -+ } -+ -+ goto flush_io; -+ } -+ -+ /* -+ * It's possible for the allocator to fail, put us on the -+ * freelist waitlist, and then succeed in one of various retry -+ * paths: if that happens, we need to disable the skip_put -+ * optimization because otherwise there won't necessarily be a -+ * barrier before we free the bch_write_op: -+ */ -+ if (atomic_read(&cl->remaining) & CLOSURE_WAITING) -+ skip_put = false; -+ -+ bch2_open_bucket_get(c, wp, &op->open_buckets); -+ ret = bch2_write_extent(op, wp, &bio); -+ bch2_alloc_sectors_done(c, wp); -+ -+ if (ret < 0) -+ goto err; -+ -+ if (ret) { -+ skip_put = false; -+ } else { -+ /* -+ * for the skip_put optimization this has to be set -+ * before we submit the bio: -+ */ -+ op->flags |= BCH_WRITE_DONE; -+ } -+ -+ bio->bi_end_io = bch2_write_endio; -+ bio->bi_private = &op->cl; -+ bio->bi_opf |= REQ_OP_WRITE; -+ -+ if (!skip_put) -+ closure_get(bio->bi_private); -+ else -+ op->flags |= BCH_WRITE_SKIP_CLOSURE_PUT; -+ -+ key_to_write = (void *) (op->insert_keys.keys_p + -+ key_to_write_offset); -+ -+ bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, -+ key_to_write); -+ } while (ret); -+ -+ if (!skip_put) -+ continue_at(cl, bch2_write_index, index_update_wq(op)); -+out: -+ memalloc_nofs_restore(nofs_flags); -+ return; -+err: -+ op->error = ret; -+ op->flags |= BCH_WRITE_DONE; -+ -+ continue_at(cl, bch2_write_index, index_update_wq(op)); -+ goto out; -+flush_io: -+ /* -+ * If the write can't all be submitted at once, we generally want to -+ * block synchronously as that signals backpressure to the caller. 
-+ * -+ * However, if we're running out of a workqueue, we can't block here -+ * because we'll be blocking other work items from completing: -+ */ -+ if (current->flags & PF_WQ_WORKER) { -+ continue_at(cl, bch2_write_index, index_update_wq(op)); -+ goto out; -+ } -+ -+ closure_sync(cl); -+ -+ if (!bch2_keylist_empty(&op->insert_keys)) { -+ __bch2_write_index(op); -+ -+ if (op->error) { -+ op->flags |= BCH_WRITE_DONE; -+ continue_at_nobarrier(cl, bch2_write_done, NULL); -+ goto out; -+ } -+ } -+ -+ goto again; -+} -+ -+static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) -+{ -+ struct closure *cl = &op->cl; -+ struct bio *bio = &op->wbio.bio; -+ struct bvec_iter iter; -+ struct bkey_i_inline_data *id; -+ unsigned sectors; -+ int ret; -+ -+ bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); -+ -+ ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys, -+ ARRAY_SIZE(op->inline_keys), -+ BKEY_U64s + DIV_ROUND_UP(data_len, 8)); -+ if (ret) { -+ op->error = ret; -+ goto err; -+ } -+ -+ sectors = bio_sectors(bio); -+ op->pos.offset += sectors; -+ -+ id = bkey_inline_data_init(op->insert_keys.top); -+ id->k.p = op->pos; -+ id->k.version = op->version; -+ id->k.size = sectors; -+ -+ iter = bio->bi_iter; -+ iter.bi_size = data_len; -+ memcpy_from_bio(id->v.data, bio, iter); -+ -+ while (data_len & 7) -+ id->v.data[data_len++] = '\0'; -+ set_bkey_val_bytes(&id->k, data_len); -+ bch2_keylist_push(&op->insert_keys); -+ -+ op->flags |= BCH_WRITE_WROTE_DATA_INLINE; -+ op->flags |= BCH_WRITE_DONE; -+ -+ continue_at_nobarrier(cl, bch2_write_index, NULL); -+ return; -+err: -+ bch2_write_done(&op->cl); -+} -+ -+/** -+ * bch_write - handle a write to a cache device or flash only volume -+ * -+ * This is the starting point for any data to end up in a cache device; it could -+ * be from a normal write, or a writeback write, or a write to a flash only -+ * volume - it's also used by the moving garbage collector to compact data in -+ * mostly empty 
buckets. -+ * -+ * It first writes the data to the cache, creating a list of keys to be inserted -+ * (if the data won't fit in a single open bucket, there will be multiple keys); -+ * after the data is written it calls bch_journal, and after the keys have been -+ * added to the next journal write they're inserted into the btree. -+ * -+ * If op->discard is true, instead of inserting the data it invalidates the -+ * region of the cache represented by op->bio and op->inode. -+ */ -+void bch2_write(struct closure *cl) -+{ -+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); -+ struct bio *bio = &op->wbio.bio; -+ struct bch_fs *c = op->c; -+ unsigned data_len; -+ -+ BUG_ON(!op->nr_replicas); -+ BUG_ON(!op->write_point.v); -+ BUG_ON(!bkey_cmp(op->pos, POS_MAX)); -+ -+ op->start_time = local_clock(); -+ bch2_keylist_init(&op->insert_keys, op->inline_keys); -+ wbio_init(bio)->put_bio = false; -+ -+ if (bio_sectors(bio) & (c->opts.block_size - 1)) { -+ __bcache_io_error(c, "misaligned write"); -+ op->error = -EIO; -+ goto err; -+ } -+ -+ if (c->opts.nochanges || -+ !percpu_ref_tryget(&c->writes)) { -+ if (!(op->flags & BCH_WRITE_FROM_INTERNAL)) -+ __bcache_io_error(c, "read only"); -+ op->error = -EROFS; -+ goto err; -+ } -+ -+ /* -+ * Can't ratelimit copygc - we'd deadlock: -+ */ -+ if (!(op->flags & BCH_WRITE_FROM_INTERNAL)) -+ down(&c->io_in_flight); -+ -+ bch2_increment_clock(c, bio_sectors(bio), WRITE); -+ -+ data_len = min_t(u64, bio->bi_iter.bi_size, -+ op->new_i_size - (op->pos.offset << 9)); -+ -+ if (c->opts.inline_data && -+ data_len <= min(block_bytes(c) / 2, 1024U)) { -+ bch2_write_data_inline(op, data_len); -+ return; -+ } -+ -+ continue_at_nobarrier(cl, __bch2_write, NULL); -+ return; -+err: -+ bch2_disk_reservation_put(c, &op->res); -+ -+ if (op->end_io) { -+ EBUG_ON(cl->parent); -+ closure_debug_destroy(cl); -+ op->end_io(op); -+ } else { -+ closure_return(cl); -+ } -+} -+ -+/* Cache promotion on read */ -+ -+struct promote_op { -+ 
struct closure cl; -+ struct rcu_head rcu; -+ u64 start_time; -+ -+ struct rhash_head hash; -+ struct bpos pos; -+ -+ struct migrate_write write; -+ struct bio_vec bi_inline_vecs[0]; /* must be last */ -+}; -+ -+static const struct rhashtable_params bch_promote_params = { -+ .head_offset = offsetof(struct promote_op, hash), -+ .key_offset = offsetof(struct promote_op, pos), -+ .key_len = sizeof(struct bpos), -+}; -+ -+static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k, -+ struct bpos pos, -+ struct bch_io_opts opts, -+ unsigned flags) -+{ -+ if (!(flags & BCH_READ_MAY_PROMOTE)) -+ return false; -+ -+ if (!opts.promote_target) -+ return false; -+ -+ if (bch2_bkey_has_target(c, k, opts.promote_target)) -+ return false; -+ -+ if (bch2_target_congested(c, opts.promote_target)) { -+ /* XXX trace this */ -+ return false; -+ } -+ -+ if (rhashtable_lookup_fast(&c->promote_table, &pos, -+ bch_promote_params)) -+ return false; -+ -+ return true; -+} -+ -+static void promote_free(struct bch_fs *c, struct promote_op *op) -+{ -+ int ret; -+ -+ ret = rhashtable_remove_fast(&c->promote_table, &op->hash, -+ bch_promote_params); -+ BUG_ON(ret); -+ percpu_ref_put(&c->writes); -+ kfree_rcu(op, rcu); -+} -+ -+static void promote_done(struct closure *cl) -+{ -+ struct promote_op *op = -+ container_of(cl, struct promote_op, cl); -+ struct bch_fs *c = op->write.op.c; -+ -+ bch2_time_stats_update(&c->times[BCH_TIME_data_promote], -+ op->start_time); -+ -+ bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio); -+ promote_free(c, op); -+} -+ -+static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) -+{ -+ struct bch_fs *c = rbio->c; -+ struct closure *cl = &op->cl; -+ struct bio *bio = &op->write.op.wbio.bio; -+ -+ trace_promote(&rbio->bio); -+ -+ /* we now own pages: */ -+ BUG_ON(!rbio->bounce); -+ BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs); -+ -+ memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, -+ sizeof(struct bio_vec) * rbio->bio.bi_vcnt); -+ 
swap(bio->bi_vcnt, rbio->bio.bi_vcnt); -+ -+ bch2_migrate_read_done(&op->write, rbio); -+ -+ closure_init(cl, NULL); -+ closure_call(&op->write.op.cl, bch2_write, c->wq, cl); -+ closure_return_with_destructor(cl, promote_done); -+} -+ -+static struct promote_op *__promote_alloc(struct bch_fs *c, -+ enum btree_id btree_id, -+ struct bkey_s_c k, -+ struct bpos pos, -+ struct extent_ptr_decoded *pick, -+ struct bch_io_opts opts, -+ unsigned sectors, -+ struct bch_read_bio **rbio) -+{ -+ struct promote_op *op = NULL; -+ struct bio *bio; -+ unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); -+ int ret; -+ -+ if (!percpu_ref_tryget(&c->writes)) -+ return NULL; -+ -+ op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO); -+ if (!op) -+ goto err; -+ -+ op->start_time = local_clock(); -+ op->pos = pos; -+ -+ /* -+ * We don't use the mempool here because extents that aren't -+ * checksummed or compressed can be too big for the mempool: -+ */ -+ *rbio = kzalloc(sizeof(struct bch_read_bio) + -+ sizeof(struct bio_vec) * pages, -+ GFP_NOIO); -+ if (!*rbio) -+ goto err; -+ -+ rbio_init(&(*rbio)->bio, opts); -+ bio_init(&(*rbio)->bio, (*rbio)->bio.bi_inline_vecs, pages); -+ -+ if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, -+ GFP_NOIO)) -+ goto err; -+ -+ (*rbio)->bounce = true; -+ (*rbio)->split = true; -+ (*rbio)->kmalloc = true; -+ -+ if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, -+ bch_promote_params)) -+ goto err; -+ -+ bio = &op->write.op.wbio.bio; -+ bio_init(bio, bio->bi_inline_vecs, pages); -+ -+ ret = bch2_migrate_write_init(c, &op->write, -+ writepoint_hashed((unsigned long) current), -+ opts, -+ DATA_PROMOTE, -+ (struct data_opts) { -+ .target = opts.promote_target -+ }, -+ btree_id, k); -+ BUG_ON(ret); -+ -+ return op; -+err: -+ if (*rbio) -+ bio_free_pages(&(*rbio)->bio); -+ kfree(*rbio); -+ *rbio = NULL; -+ kfree(op); -+ percpu_ref_put(&c->writes); -+ return NULL; -+} -+ -+noinline -+static struct promote_op 
*promote_alloc(struct bch_fs *c, -+ struct bvec_iter iter, -+ struct bkey_s_c k, -+ struct extent_ptr_decoded *pick, -+ struct bch_io_opts opts, -+ unsigned flags, -+ struct bch_read_bio **rbio, -+ bool *bounce, -+ bool *read_full) -+{ -+ bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents); -+ /* data might have to be decompressed in the write path: */ -+ unsigned sectors = promote_full -+ ? max(pick->crc.compressed_size, pick->crc.live_size) -+ : bvec_iter_sectors(iter); -+ struct bpos pos = promote_full -+ ? bkey_start_pos(k.k) -+ : POS(k.k->p.inode, iter.bi_sector); -+ struct promote_op *promote; -+ -+ if (!should_promote(c, k, pos, opts, flags)) -+ return NULL; -+ -+ promote = __promote_alloc(c, -+ k.k->type == KEY_TYPE_reflink_v -+ ? BTREE_ID_REFLINK -+ : BTREE_ID_EXTENTS, -+ k, pos, pick, opts, sectors, rbio); -+ if (!promote) -+ return NULL; -+ -+ *bounce = true; -+ *read_full = promote_full; -+ return promote; -+} -+ -+/* Read */ -+ -+#define READ_RETRY_AVOID 1 -+#define READ_RETRY 2 -+#define READ_ERR 3 -+ -+enum rbio_context { -+ RBIO_CONTEXT_NULL, -+ RBIO_CONTEXT_HIGHPRI, -+ RBIO_CONTEXT_UNBOUND, -+}; -+ -+static inline struct bch_read_bio * -+bch2_rbio_parent(struct bch_read_bio *rbio) -+{ -+ return rbio->split ? 
rbio->parent : rbio; -+} -+ -+__always_inline -+static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn, -+ enum rbio_context context, -+ struct workqueue_struct *wq) -+{ -+ if (context <= rbio->context) { -+ fn(&rbio->work); -+ } else { -+ rbio->work.func = fn; -+ rbio->context = context; -+ queue_work(wq, &rbio->work); -+ } -+} -+ -+static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) -+{ -+ BUG_ON(rbio->bounce && !rbio->split); -+ -+ if (rbio->promote) -+ promote_free(rbio->c, rbio->promote); -+ rbio->promote = NULL; -+ -+ if (rbio->bounce) -+ bch2_bio_free_pages_pool(rbio->c, &rbio->bio); -+ -+ if (rbio->split) { -+ struct bch_read_bio *parent = rbio->parent; -+ -+ if (rbio->kmalloc) -+ kfree(rbio); -+ else -+ bio_put(&rbio->bio); -+ -+ rbio = parent; -+ } -+ -+ return rbio; -+} -+ -+/* -+ * Only called on a top level bch_read_bio to complete an entire read request, -+ * not a split: -+ */ -+static void bch2_rbio_done(struct bch_read_bio *rbio) -+{ -+ if (rbio->start_time) -+ bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read], -+ rbio->start_time); -+ bio_endio(&rbio->bio); -+} -+ -+static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, -+ struct bvec_iter bvec_iter, u64 inode, -+ struct bch_io_failures *failed, -+ unsigned flags) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_on_stack sk; -+ struct bkey_s_c k; -+ int ret; -+ -+ flags &= ~BCH_READ_LAST_FRAGMENT; -+ flags |= BCH_READ_MUST_CLONE; -+ -+ bkey_on_stack_init(&sk); -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, -+ rbio->pos, BTREE_ITER_SLOTS); -+retry: -+ rbio->bio.bi_status = 0; -+ -+ k = bch2_btree_iter_peek_slot(iter); -+ if (bkey_err(k)) -+ goto err; -+ -+ bkey_on_stack_reassemble(&sk, c, k); -+ k = bkey_i_to_s_c(sk.k); -+ bch2_trans_unlock(&trans); -+ -+ if (!bch2_bkey_matches_ptr(c, k, -+ rbio->pick.ptr, -+ rbio->pos.offset - -+ 
rbio->pick.crc.offset)) { -+ /* extent we wanted to read no longer exists: */ -+ rbio->hole = true; -+ goto out; -+ } -+ -+ ret = __bch2_read_extent(c, rbio, bvec_iter, k, 0, failed, flags); -+ if (ret == READ_RETRY) -+ goto retry; -+ if (ret) -+ goto err; -+out: -+ bch2_rbio_done(rbio); -+ bch2_trans_exit(&trans); -+ bkey_on_stack_exit(&sk, c); -+ return; -+err: -+ rbio->bio.bi_status = BLK_STS_IOERR; -+ goto out; -+} -+ -+static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio, -+ struct bvec_iter bvec_iter, u64 inode, -+ struct bch_io_failures *failed, unsigned flags) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_on_stack sk; -+ struct bkey_s_c k; -+ int ret; -+ -+ flags &= ~BCH_READ_LAST_FRAGMENT; -+ flags |= BCH_READ_MUST_CLONE; -+ -+ bkey_on_stack_init(&sk); -+ bch2_trans_init(&trans, c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, -+ POS(inode, bvec_iter.bi_sector), -+ BTREE_ITER_SLOTS, k, ret) { -+ unsigned bytes, sectors, offset_into_extent; -+ -+ bkey_on_stack_reassemble(&sk, c, k); -+ k = bkey_i_to_s_c(sk.k); -+ -+ offset_into_extent = iter->pos.offset - -+ bkey_start_offset(k.k); -+ sectors = k.k->size - offset_into_extent; -+ -+ ret = bch2_read_indirect_extent(&trans, -+ &offset_into_extent, &sk); -+ if (ret) -+ break; -+ -+ sectors = min(sectors, k.k->size - offset_into_extent); -+ -+ bch2_trans_unlock(&trans); -+ -+ bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; -+ swap(bvec_iter.bi_size, bytes); -+ -+ ret = __bch2_read_extent(c, rbio, bvec_iter, k, -+ offset_into_extent, failed, flags); -+ switch (ret) { -+ case READ_RETRY: -+ goto retry; -+ case READ_ERR: -+ goto err; -+ }; -+ -+ if (bytes == bvec_iter.bi_size) -+ goto out; -+ -+ swap(bvec_iter.bi_size, bytes); -+ bio_advance_iter(&rbio->bio, &bvec_iter, bytes); -+ } -+ -+ if (ret == -EINTR) -+ goto retry; -+ /* -+ * If we get here, it better have been because there was an error -+ * 
reading a btree node -+ */ -+ BUG_ON(!ret); -+ __bcache_io_error(c, "btree IO error: %i", ret); -+err: -+ rbio->bio.bi_status = BLK_STS_IOERR; -+out: -+ bch2_trans_exit(&trans); -+ bkey_on_stack_exit(&sk, c); -+ bch2_rbio_done(rbio); -+} -+ -+static void bch2_rbio_retry(struct work_struct *work) -+{ -+ struct bch_read_bio *rbio = -+ container_of(work, struct bch_read_bio, work); -+ struct bch_fs *c = rbio->c; -+ struct bvec_iter iter = rbio->bvec_iter; -+ unsigned flags = rbio->flags; -+ u64 inode = rbio->pos.inode; -+ struct bch_io_failures failed = { .nr = 0 }; -+ -+ trace_read_retry(&rbio->bio); -+ -+ if (rbio->retry == READ_RETRY_AVOID) -+ bch2_mark_io_failure(&failed, &rbio->pick); -+ -+ rbio->bio.bi_status = 0; -+ -+ rbio = bch2_rbio_free(rbio); -+ -+ flags |= BCH_READ_IN_RETRY; -+ flags &= ~BCH_READ_MAY_PROMOTE; -+ -+ if (flags & BCH_READ_NODECODE) -+ bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags); -+ else -+ bch2_read_retry(c, rbio, iter, inode, &failed, flags); -+} -+ -+static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, -+ blk_status_t error) -+{ -+ rbio->retry = retry; -+ -+ if (rbio->flags & BCH_READ_IN_RETRY) -+ return; -+ -+ if (retry == READ_ERR) { -+ rbio = bch2_rbio_free(rbio); -+ -+ rbio->bio.bi_status = error; -+ bch2_rbio_done(rbio); -+ } else { -+ bch2_rbio_punt(rbio, bch2_rbio_retry, -+ RBIO_CONTEXT_UNBOUND, system_unbound_wq); -+ } -+} -+ -+static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, -+ struct bch_read_bio *rbio) -+{ -+ struct bch_fs *c = rbio->c; -+ u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset; -+ struct bch_extent_crc_unpacked new_crc; -+ struct btree_iter *iter = NULL; -+ struct bkey_i *new; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ if (crc_is_compressed(rbio->pick.crc)) -+ return 0; -+ -+ iter = bch2_trans_get_iter(trans, BTREE_ID_EXTENTS, rbio->pos, -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); -+ if ((ret = PTR_ERR_OR_ZERO(iter))) -+ goto out; -+ -+ k = 
bch2_btree_iter_peek_slot(iter); -+ if ((ret = bkey_err(k))) -+ goto out; -+ -+ /* -+ * going to be temporarily appending another checksum entry: -+ */ -+ new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + -+ BKEY_EXTENT_U64s_MAX * 8); -+ if ((ret = PTR_ERR_OR_ZERO(new))) -+ goto out; -+ -+ bkey_reassemble(new, k); -+ k = bkey_i_to_s_c(new); -+ -+ if (bversion_cmp(k.k->version, rbio->version) || -+ !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) -+ goto out; -+ -+ /* Extent was merged? */ -+ if (bkey_start_offset(k.k) < data_offset || -+ k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size) -+ goto out; -+ -+ if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version, -+ rbio->pick.crc, NULL, &new_crc, -+ bkey_start_offset(k.k) - data_offset, k.k->size, -+ rbio->pick.crc.csum_type)) { -+ bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); -+ ret = 0; -+ goto out; -+ } -+ -+ if (!bch2_bkey_narrow_crcs(new, new_crc)) -+ goto out; -+ -+ bch2_trans_update(trans, iter, new, 0); -+out: -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) -+{ -+ bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL, -+ __bch2_rbio_narrow_crcs(&trans, rbio)); -+} -+ -+/* Inner part that may run in process context */ -+static void __bch2_read_endio(struct work_struct *work) -+{ -+ struct bch_read_bio *rbio = -+ container_of(work, struct bch_read_bio, work); -+ struct bch_fs *c = rbio->c; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); -+ struct bio *src = &rbio->bio; -+ struct bio *dst = &bch2_rbio_parent(rbio)->bio; -+ struct bvec_iter dst_iter = rbio->bvec_iter; -+ struct bch_extent_crc_unpacked crc = rbio->pick.crc; -+ struct nonce nonce = extent_nonce(rbio->version, crc); -+ struct bch_csum csum; -+ -+ /* Reset iterator for checksumming and copying bounced data: */ -+ if (rbio->bounce) { -+ src->bi_iter.bi_size = crc.compressed_size 
<< 9; -+ src->bi_iter.bi_idx = 0; -+ src->bi_iter.bi_bvec_done = 0; -+ } else { -+ src->bi_iter = rbio->bvec_iter; -+ } -+ -+ csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); -+ if (bch2_crc_cmp(csum, rbio->pick.crc.csum)) -+ goto csum_err; -+ -+ if (unlikely(rbio->narrow_crcs)) -+ bch2_rbio_narrow_crcs(rbio); -+ -+ if (rbio->flags & BCH_READ_NODECODE) -+ goto nodecode; -+ -+ /* Adjust crc to point to subset of data we want: */ -+ crc.offset += rbio->offset_into_extent; -+ crc.live_size = bvec_iter_sectors(rbio->bvec_iter); -+ -+ if (crc_is_compressed(crc)) { -+ bch2_encrypt_bio(c, crc.csum_type, nonce, src); -+ if (bch2_bio_uncompress(c, src, dst, dst_iter, crc)) -+ goto decompression_err; -+ } else { -+ /* don't need to decrypt the entire bio: */ -+ nonce = nonce_add(nonce, crc.offset << 9); -+ bio_advance(src, crc.offset << 9); -+ -+ BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); -+ src->bi_iter.bi_size = dst_iter.bi_size; -+ -+ bch2_encrypt_bio(c, crc.csum_type, nonce, src); -+ -+ if (rbio->bounce) { -+ struct bvec_iter src_iter = src->bi_iter; -+ bio_copy_data_iter(dst, &dst_iter, src, &src_iter); -+ } -+ } -+ -+ if (rbio->promote) { -+ /* -+ * Re encrypt data we decrypted, so it's consistent with -+ * rbio->crc: -+ */ -+ bch2_encrypt_bio(c, crc.csum_type, nonce, src); -+ promote_start(rbio->promote, rbio); -+ rbio->promote = NULL; -+ } -+nodecode: -+ if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) { -+ rbio = bch2_rbio_free(rbio); -+ bch2_rbio_done(rbio); -+ } -+ return; -+csum_err: -+ /* -+ * Checksum error: if the bio wasn't bounced, we may have been -+ * reading into buffers owned by userspace (that userspace can -+ * scribble over) - retry the read, bouncing it this time: -+ */ -+ if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { -+ rbio->flags |= BCH_READ_MUST_BOUNCE; -+ bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); -+ return; -+ } -+ -+ bch2_dev_io_error(ca, -+ "data checksum error, inode %llu offset %llu: expected %0llx:%0llx 
got %0llx:%0llx (type %u)", -+ rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector, -+ rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, -+ csum.hi, csum.lo, crc.csum_type); -+ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); -+ return; -+decompression_err: -+ __bcache_io_error(c, "decompression error, inode %llu offset %llu", -+ rbio->pos.inode, -+ (u64) rbio->bvec_iter.bi_sector); -+ bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); -+ return; -+} -+ -+static void bch2_read_endio(struct bio *bio) -+{ -+ struct bch_read_bio *rbio = -+ container_of(bio, struct bch_read_bio, bio); -+ struct bch_fs *c = rbio->c; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); -+ struct workqueue_struct *wq = NULL; -+ enum rbio_context context = RBIO_CONTEXT_NULL; -+ -+ if (rbio->have_ioref) { -+ bch2_latency_acct(ca, rbio->submit_time, READ); -+ percpu_ref_put(&ca->io_ref); -+ } -+ -+ if (!rbio->split) -+ rbio->bio.bi_end_io = rbio->end_io; -+ -+ if (bch2_dev_io_err_on(bio->bi_status, ca, "data read; %s", -+ bch2_blk_status_to_str(bio->bi_status))) { -+ bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); -+ return; -+ } -+ -+ if (rbio->pick.ptr.cached && -+ (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || -+ ptr_stale(ca, &rbio->pick.ptr))) { -+ atomic_long_inc(&c->read_realloc_races); -+ -+ if (rbio->flags & BCH_READ_RETRY_IF_STALE) -+ bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); -+ else -+ bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN); -+ return; -+ } -+ -+ if (rbio->narrow_crcs || -+ crc_is_compressed(rbio->pick.crc) || -+ bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) -+ context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq; -+ else if (rbio->pick.crc.csum_type) -+ context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq; -+ -+ bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); -+} -+ -+int __bch2_read_indirect_extent(struct btree_trans *trans, -+ unsigned *offset_into_extent, -+ struct bkey_on_stack *orig_k) -+{ -+ struct 
btree_iter *iter; -+ struct bkey_s_c k; -+ u64 reflink_offset; -+ int ret; -+ -+ reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) + -+ *offset_into_extent; -+ -+ iter = bch2_trans_get_iter(trans, BTREE_ID_REFLINK, -+ POS(0, reflink_offset), -+ BTREE_ITER_SLOTS); -+ ret = PTR_ERR_OR_ZERO(iter); -+ if (ret) -+ return ret; -+ -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ if (k.k->type != KEY_TYPE_reflink_v) { -+ __bcache_io_error(trans->c, -+ "pointer to nonexistent indirect extent"); -+ ret = -EIO; -+ goto err; -+ } -+ -+ *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); -+ bkey_on_stack_reassemble(orig_k, trans->c, k); -+err: -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, -+ struct bvec_iter iter, struct bkey_s_c k, -+ unsigned offset_into_extent, -+ struct bch_io_failures *failed, unsigned flags) -+{ -+ struct extent_ptr_decoded pick; -+ struct bch_read_bio *rbio = NULL; -+ struct bch_dev *ca; -+ struct promote_op *promote = NULL; -+ bool bounce = false, read_full = false, narrow_crcs = false; -+ struct bpos pos = bkey_start_pos(k.k); -+ int pick_ret; -+ -+ if (k.k->type == KEY_TYPE_inline_data) { -+ struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k); -+ unsigned bytes = min_t(unsigned, iter.bi_size, -+ bkey_val_bytes(d.k)); -+ -+ swap(iter.bi_size, bytes); -+ memcpy_to_bio(&orig->bio, iter, d.v->data); -+ swap(iter.bi_size, bytes); -+ bio_advance_iter(&orig->bio, &iter, bytes); -+ zero_fill_bio_iter(&orig->bio, iter); -+ goto out_read_done; -+ } -+ -+ pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); -+ -+ /* hole or reservation - just zero fill: */ -+ if (!pick_ret) -+ goto hole; -+ -+ if (pick_ret < 0) { -+ __bcache_io_error(c, "no device to read from"); -+ goto err; -+ } -+ -+ if (pick_ret > 0) -+ ca = bch_dev_bkey_exists(c, pick.ptr.dev); -+ -+ if (flags & BCH_READ_NODECODE) { -+ /* 
-+ * can happen if we retry, and the extent we were going to read -+ * has been merged in the meantime: -+ */ -+ if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) -+ goto hole; -+ -+ iter.bi_size = pick.crc.compressed_size << 9; -+ goto get_bio; -+ } -+ -+ if (!(flags & BCH_READ_LAST_FRAGMENT) || -+ bio_flagged(&orig->bio, BIO_CHAIN)) -+ flags |= BCH_READ_MUST_CLONE; -+ -+ narrow_crcs = !(flags & BCH_READ_IN_RETRY) && -+ bch2_can_narrow_extent_crcs(k, pick.crc); -+ -+ if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) -+ flags |= BCH_READ_MUST_BOUNCE; -+ -+ EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); -+ -+ if (crc_is_compressed(pick.crc) || -+ (pick.crc.csum_type != BCH_CSUM_NONE && -+ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || -+ (bch2_csum_type_is_encryption(pick.crc.csum_type) && -+ (flags & BCH_READ_USER_MAPPED)) || -+ (flags & BCH_READ_MUST_BOUNCE)))) { -+ read_full = true; -+ bounce = true; -+ } -+ -+ if (orig->opts.promote_target) -+ promote = promote_alloc(c, iter, k, &pick, orig->opts, flags, -+ &rbio, &bounce, &read_full); -+ -+ if (!read_full) { -+ EBUG_ON(crc_is_compressed(pick.crc)); -+ EBUG_ON(pick.crc.csum_type && -+ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || -+ bvec_iter_sectors(iter) != pick.crc.live_size || -+ pick.crc.offset || -+ offset_into_extent)); -+ -+ pos.offset += offset_into_extent; -+ pick.ptr.offset += pick.crc.offset + -+ offset_into_extent; -+ offset_into_extent = 0; -+ pick.crc.compressed_size = bvec_iter_sectors(iter); -+ pick.crc.uncompressed_size = bvec_iter_sectors(iter); -+ pick.crc.offset = 0; -+ pick.crc.live_size = bvec_iter_sectors(iter); -+ offset_into_extent = 0; -+ } -+get_bio: -+ if (rbio) { -+ /* -+ * promote already allocated bounce rbio: -+ * promote needs to allocate a bio big enough for uncompressing -+ * data in the write path, but we're not going to use it all -+ * here: -+ */ -+ EBUG_ON(rbio->bio.bi_iter.bi_size < -+ pick.crc.compressed_size << 
9); -+ rbio->bio.bi_iter.bi_size = -+ pick.crc.compressed_size << 9; -+ } else if (bounce) { -+ unsigned sectors = pick.crc.compressed_size; -+ -+ rbio = rbio_init(bio_alloc_bioset(GFP_NOIO, -+ DIV_ROUND_UP(sectors, PAGE_SECTORS), -+ &c->bio_read_split), -+ orig->opts); -+ -+ bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); -+ rbio->bounce = true; -+ rbio->split = true; -+ } else if (flags & BCH_READ_MUST_CLONE) { -+ /* -+ * Have to clone if there were any splits, due to error -+ * reporting issues (if a split errored, and retrying didn't -+ * work, when it reports the error to its parent (us) we don't -+ * know if the error was from our bio, and we should retry, or -+ * from the whole bio, in which case we don't want to retry and -+ * lose the error) -+ */ -+ rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO, -+ &c->bio_read_split), -+ orig->opts); -+ rbio->bio.bi_iter = iter; -+ rbio->split = true; -+ } else { -+ rbio = orig; -+ rbio->bio.bi_iter = iter; -+ EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); -+ } -+ -+ EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); -+ -+ rbio->c = c; -+ rbio->submit_time = local_clock(); -+ if (rbio->split) -+ rbio->parent = orig; -+ else -+ rbio->end_io = orig->bio.bi_end_io; -+ rbio->bvec_iter = iter; -+ rbio->offset_into_extent= offset_into_extent; -+ rbio->flags = flags; -+ rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ); -+ rbio->narrow_crcs = narrow_crcs; -+ rbio->hole = 0; -+ rbio->retry = 0; -+ rbio->context = 0; -+ /* XXX: only initialize this if needed */ -+ rbio->devs_have = bch2_bkey_devs(k); -+ rbio->pick = pick; -+ rbio->pos = pos; -+ rbio->version = k.k->version; -+ rbio->promote = promote; -+ INIT_WORK(&rbio->work, NULL); -+ -+ rbio->bio.bi_opf = orig->bio.bi_opf; -+ rbio->bio.bi_iter.bi_sector = pick.ptr.offset; -+ rbio->bio.bi_end_io = bch2_read_endio; -+ -+ if (rbio->bounce) -+ trace_read_bounce(&rbio->bio); -+ -+ bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); -+ -+ 
rcu_read_lock(); -+ bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ); -+ rcu_read_unlock(); -+ -+ if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { -+ bio_inc_remaining(&orig->bio); -+ trace_read_split(&orig->bio); -+ } -+ -+ if (!rbio->pick.idx) { -+ if (!rbio->have_ioref) { -+ __bcache_io_error(c, "no device to read from"); -+ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); -+ goto out; -+ } -+ -+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user], -+ bio_sectors(&rbio->bio)); -+ bio_set_dev(&rbio->bio, ca->disk_sb.bdev); -+ -+ if (likely(!(flags & BCH_READ_IN_RETRY))) -+ submit_bio(&rbio->bio); -+ else -+ submit_bio_wait(&rbio->bio); -+ } else { -+ /* Attempting reconstruct read: */ -+ if (bch2_ec_read_extent(c, rbio)) { -+ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); -+ goto out; -+ } -+ -+ if (likely(!(flags & BCH_READ_IN_RETRY))) -+ bio_endio(&rbio->bio); -+ } -+out: -+ if (likely(!(flags & BCH_READ_IN_RETRY))) { -+ return 0; -+ } else { -+ int ret; -+ -+ rbio->context = RBIO_CONTEXT_UNBOUND; -+ bch2_read_endio(&rbio->bio); -+ -+ ret = rbio->retry; -+ rbio = bch2_rbio_free(rbio); -+ -+ if (ret == READ_RETRY_AVOID) { -+ bch2_mark_io_failure(failed, &pick); -+ ret = READ_RETRY; -+ } -+ -+ return ret; -+ } -+ -+err: -+ if (flags & BCH_READ_IN_RETRY) -+ return READ_ERR; -+ -+ orig->bio.bi_status = BLK_STS_IOERR; -+ goto out_read_done; -+ -+hole: -+ /* -+ * won't normally happen in the BCH_READ_NODECODE -+ * (bch2_move_extent()) path, but if we retry and the extent we wanted -+ * to read no longer exists we have to signal that: -+ */ -+ if (flags & BCH_READ_NODECODE) -+ orig->hole = true; -+ -+ zero_fill_bio_iter(&orig->bio, iter); -+out_read_done: -+ if (flags & BCH_READ_LAST_FRAGMENT) -+ bch2_rbio_done(orig); -+ return 0; -+} -+ -+void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_on_stack sk; -+ struct bkey_s_c k; -+ 
unsigned flags = BCH_READ_RETRY_IF_STALE| -+ BCH_READ_MAY_PROMOTE| -+ BCH_READ_USER_MAPPED; -+ int ret; -+ -+ BUG_ON(rbio->_state); -+ BUG_ON(flags & BCH_READ_NODECODE); -+ BUG_ON(flags & BCH_READ_IN_RETRY); -+ -+ rbio->c = c; -+ rbio->start_time = local_clock(); -+ -+ bkey_on_stack_init(&sk); -+ bch2_trans_init(&trans, c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, -+ POS(inode, rbio->bio.bi_iter.bi_sector), -+ BTREE_ITER_SLOTS); -+ while (1) { -+ unsigned bytes, sectors, offset_into_extent; -+ -+ bch2_btree_iter_set_pos(iter, -+ POS(inode, rbio->bio.bi_iter.bi_sector)); -+ -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ offset_into_extent = iter->pos.offset - -+ bkey_start_offset(k.k); -+ sectors = k.k->size - offset_into_extent; -+ -+ bkey_on_stack_reassemble(&sk, c, k); -+ k = bkey_i_to_s_c(sk.k); -+ -+ ret = bch2_read_indirect_extent(&trans, -+ &offset_into_extent, &sk); -+ if (ret) -+ goto err; -+ -+ /* -+ * With indirect extents, the amount of data to read is the min -+ * of the original extent and the indirect extent: -+ */ -+ sectors = min(sectors, k.k->size - offset_into_extent); -+ -+ /* -+ * Unlock the iterator while the btree node's lock is still in -+ * cache, before doing the IO: -+ */ -+ bch2_trans_unlock(&trans); -+ -+ bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; -+ swap(rbio->bio.bi_iter.bi_size, bytes); -+ -+ if (rbio->bio.bi_iter.bi_size == bytes) -+ flags |= BCH_READ_LAST_FRAGMENT; -+ -+ bch2_read_extent(c, rbio, k, offset_into_extent, flags); -+ -+ if (flags & BCH_READ_LAST_FRAGMENT) -+ break; -+ -+ swap(rbio->bio.bi_iter.bi_size, bytes); -+ bio_advance(&rbio->bio, bytes); -+ } -+out: -+ bch2_trans_exit(&trans); -+ bkey_on_stack_exit(&sk, c); -+ return; -+err: -+ if (ret == -EINTR) -+ goto retry; -+ -+ bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret); -+ bch2_rbio_done(rbio); -+ goto out; -+} -+ -+void bch2_fs_io_exit(struct 
bch_fs *c) -+{ -+ if (c->promote_table.tbl) -+ rhashtable_destroy(&c->promote_table); -+ mempool_exit(&c->bio_bounce_pages); -+ bioset_exit(&c->bio_write); -+ bioset_exit(&c->bio_read_split); -+ bioset_exit(&c->bio_read); -+} -+ -+int bch2_fs_io_init(struct bch_fs *c) -+{ -+ if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), -+ BIOSET_NEED_BVECS) || -+ bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), -+ BIOSET_NEED_BVECS) || -+ bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), -+ BIOSET_NEED_BVECS) || -+ mempool_init_page_pool(&c->bio_bounce_pages, -+ max_t(unsigned, -+ c->opts.btree_node_size, -+ c->sb.encoded_extent_max) / -+ PAGE_SECTORS, 0) || -+ rhashtable_init(&c->promote_table, &bch_promote_params)) -+ return -ENOMEM; -+ -+ return 0; -+} -diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h -new file mode 100644 -index 000000000000..ded468d70f09 ---- /dev/null -+++ b/fs/bcachefs/io.h -@@ -0,0 +1,169 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_IO_H -+#define _BCACHEFS_IO_H -+ -+#include "checksum.h" -+#include "bkey_on_stack.h" -+#include "io_types.h" -+ -+#define to_wbio(_bio) \ -+ container_of((_bio), struct bch_write_bio, bio) -+ -+#define to_rbio(_bio) \ -+ container_of((_bio), struct bch_read_bio, bio) -+ -+void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); -+void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); -+ -+void bch2_latency_acct(struct bch_dev *, u64, int); -+ -+void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, -+ enum bch_data_type, const struct bkey_i *); -+ -+#define BLK_STS_REMOVED ((__force blk_status_t)128) -+ -+const char *bch2_blk_status_to_str(blk_status_t); -+ -+enum bch_write_flags { -+ BCH_WRITE_ALLOC_NOWAIT = (1 << 0), -+ BCH_WRITE_CACHED = (1 << 1), -+ BCH_WRITE_FLUSH = (1 << 2), -+ BCH_WRITE_DATA_ENCODED = (1 << 3), -+ BCH_WRITE_PAGES_STABLE = (1 << 4), -+ BCH_WRITE_PAGES_OWNED = (1 << 5), -+ 
BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6), -+ BCH_WRITE_WROTE_DATA_INLINE = (1 << 7), -+ BCH_WRITE_FROM_INTERNAL = (1 << 8), -+ -+ /* Internal: */ -+ BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 9), -+ BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 10), -+ BCH_WRITE_DONE = (1 << 11), -+}; -+ -+static inline u64 *op_journal_seq(struct bch_write_op *op) -+{ -+ return (op->flags & BCH_WRITE_JOURNAL_SEQ_PTR) -+ ? op->journal_seq_p : &op->journal_seq; -+} -+ -+static inline void op_journal_seq_set(struct bch_write_op *op, u64 *journal_seq) -+{ -+ op->journal_seq_p = journal_seq; -+ op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR; -+} -+ -+static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) -+{ -+ return op->alloc_reserve == RESERVE_MOVINGGC -+ ? op->c->copygc_wq -+ : op->c->wq; -+} -+ -+int bch2_extent_update(struct btree_trans *, struct btree_iter *, -+ struct bkey_i *, struct disk_reservation *, -+ u64 *, u64, s64 *); -+int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, -+ struct bpos, u64 *, s64 *); -+int bch2_fpunch(struct bch_fs *c, u64, u64, u64, u64 *, s64 *); -+ -+int bch2_write_index_default(struct bch_write_op *); -+ -+static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, -+ struct bch_io_opts opts) -+{ -+ op->c = c; -+ op->end_io = NULL; -+ op->flags = 0; -+ op->written = 0; -+ op->error = 0; -+ op->csum_type = bch2_data_checksum_type(c, opts.data_checksum); -+ op->compression_type = bch2_compression_opt_to_type[opts.compression]; -+ op->nr_replicas = 0; -+ op->nr_replicas_required = c->opts.data_replicas_required; -+ op->alloc_reserve = RESERVE_NONE; -+ op->incompressible = 0; -+ op->open_buckets.nr = 0; -+ op->devs_have.nr = 0; -+ op->target = 0; -+ op->opts = opts; -+ op->pos = POS_MAX; -+ op->version = ZERO_VERSION; -+ op->write_point = (struct write_point_specifier) { 0 }; -+ op->res = (struct disk_reservation) { 0 }; -+ op->journal_seq = 0; -+ op->new_i_size = U64_MAX; -+ op->i_sectors_delta = 0; -+ op->index_update_fn = 
bch2_write_index_default; -+} -+ -+void bch2_write(struct closure *); -+ -+static inline struct bch_write_bio *wbio_init(struct bio *bio) -+{ -+ struct bch_write_bio *wbio = to_wbio(bio); -+ -+ memset(wbio, 0, offsetof(struct bch_write_bio, bio)); -+ return wbio; -+} -+ -+struct bch_devs_mask; -+struct cache_promote_op; -+struct extent_ptr_decoded; -+ -+int __bch2_read_indirect_extent(struct btree_trans *, unsigned *, -+ struct bkey_on_stack *); -+ -+static inline int bch2_read_indirect_extent(struct btree_trans *trans, -+ unsigned *offset_into_extent, -+ struct bkey_on_stack *k) -+{ -+ return k->k->k.type == KEY_TYPE_reflink_p -+ ? __bch2_read_indirect_extent(trans, offset_into_extent, k) -+ : 0; -+} -+ -+enum bch_read_flags { -+ BCH_READ_RETRY_IF_STALE = 1 << 0, -+ BCH_READ_MAY_PROMOTE = 1 << 1, -+ BCH_READ_USER_MAPPED = 1 << 2, -+ BCH_READ_NODECODE = 1 << 3, -+ BCH_READ_LAST_FRAGMENT = 1 << 4, -+ -+ /* internal: */ -+ BCH_READ_MUST_BOUNCE = 1 << 5, -+ BCH_READ_MUST_CLONE = 1 << 6, -+ BCH_READ_IN_RETRY = 1 << 7, -+}; -+ -+int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, -+ struct bvec_iter, struct bkey_s_c, unsigned, -+ struct bch_io_failures *, unsigned); -+ -+static inline void bch2_read_extent(struct bch_fs *c, -+ struct bch_read_bio *rbio, -+ struct bkey_s_c k, -+ unsigned offset_into_extent, -+ unsigned flags) -+{ -+ __bch2_read_extent(c, rbio, rbio->bio.bi_iter, k, -+ offset_into_extent, NULL, flags); -+} -+ -+void bch2_read(struct bch_fs *, struct bch_read_bio *, u64); -+ -+static inline struct bch_read_bio *rbio_init(struct bio *bio, -+ struct bch_io_opts opts) -+{ -+ struct bch_read_bio *rbio = to_rbio(bio); -+ -+ rbio->_state = 0; -+ rbio->promote = NULL; -+ rbio->opts = opts; -+ return rbio; -+} -+ -+void bch2_fs_io_exit(struct bch_fs *); -+int bch2_fs_io_init(struct bch_fs *); -+ -+#endif /* _BCACHEFS_IO_H */ -diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h -new file mode 100644 -index 000000000000..b23727d212b9 ---- 
/dev/null -+++ b/fs/bcachefs/io_types.h -@@ -0,0 +1,148 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_IO_TYPES_H -+#define _BCACHEFS_IO_TYPES_H -+ -+#include "alloc_types.h" -+#include "btree_types.h" -+#include "buckets_types.h" -+#include "extents_types.h" -+#include "keylist_types.h" -+#include "opts.h" -+#include "super_types.h" -+ -+#include -+#include -+ -+struct bch_read_bio { -+ struct bch_fs *c; -+ u64 start_time; -+ u64 submit_time; -+ -+ /* -+ * Reads will often have to be split, and if the extent being read from -+ * was checksummed or compressed we'll also have to allocate bounce -+ * buffers and copy the data back into the original bio. -+ * -+ * If we didn't have to split, we have to save and restore the original -+ * bi_end_io - @split below indicates which: -+ */ -+ union { -+ struct bch_read_bio *parent; -+ bio_end_io_t *end_io; -+ }; -+ -+ /* -+ * Saved copy of bio->bi_iter, from submission time - allows us to -+ * resubmit on IO error, and also to copy data back to the original bio -+ * when we're bouncing: -+ */ -+ struct bvec_iter bvec_iter; -+ -+ unsigned offset_into_extent; -+ -+ u16 flags; -+ union { -+ struct { -+ u16 bounce:1, -+ split:1, -+ kmalloc:1, -+ have_ioref:1, -+ narrow_crcs:1, -+ hole:1, -+ retry:2, -+ context:2; -+ }; -+ u16 _state; -+ }; -+ -+ struct bch_devs_list devs_have; -+ -+ struct extent_ptr_decoded pick; -+ /* start pos of data we read (may not be pos of data we want) */ -+ struct bpos pos; -+ struct bversion version; -+ -+ struct promote_op *promote; -+ -+ struct bch_io_opts opts; -+ -+ struct work_struct work; -+ -+ struct bio bio; -+}; -+ -+struct bch_write_bio { -+ struct bch_fs *c; -+ struct bch_write_bio *parent; -+ -+ u64 submit_time; -+ -+ struct bch_devs_list failed; -+ u8 dev; -+ -+ unsigned split:1, -+ bounce:1, -+ put_bio:1, -+ have_ioref:1, -+ used_mempool:1; -+ -+ struct bio bio; -+}; -+ -+struct bch_write_op { -+ struct closure cl; -+ struct bch_fs *c; -+ void (*end_io)(struct 
bch_write_op *); -+ u64 start_time; -+ -+ unsigned written; /* sectors */ -+ u16 flags; -+ s16 error; /* dio write path expects it to hold -ERESTARTSYS... */ -+ -+ unsigned csum_type:4; -+ unsigned compression_type:4; -+ unsigned nr_replicas:4; -+ unsigned nr_replicas_required:4; -+ unsigned alloc_reserve:3; -+ unsigned incompressible:1; -+ -+ struct bch_devs_list devs_have; -+ u16 target; -+ u16 nonce; -+ struct bch_io_opts opts; -+ -+ struct bpos pos; -+ struct bversion version; -+ -+ /* For BCH_WRITE_DATA_ENCODED: */ -+ struct bch_extent_crc_unpacked crc; -+ -+ struct write_point_specifier write_point; -+ -+ struct disk_reservation res; -+ -+ struct open_buckets open_buckets; -+ -+ /* -+ * If caller wants to flush but hasn't passed us a journal_seq ptr, we -+ * still need to stash the journal_seq somewhere: -+ */ -+ union { -+ u64 *journal_seq_p; -+ u64 journal_seq; -+ }; -+ u64 new_i_size; -+ s64 i_sectors_delta; -+ -+ int (*index_update_fn)(struct bch_write_op *); -+ -+ struct bch_devs_mask failed; -+ -+ struct keylist insert_keys; -+ u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2]; -+ -+ /* Must be last: */ -+ struct bch_write_bio wbio; -+}; -+ -+#endif /* _BCACHEFS_IO_TYPES_H */ -diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c -new file mode 100644 -index 000000000000..210ad1b0c469 ---- /dev/null -+++ b/fs/bcachefs/journal.c -@@ -0,0 +1,1248 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * bcachefs journalling code, for btree insertions -+ * -+ * Copyright 2012 Google, Inc. 
-+ */ -+ -+#include "bcachefs.h" -+#include "alloc_foreground.h" -+#include "bkey_methods.h" -+#include "btree_gc.h" -+#include "buckets.h" -+#include "journal.h" -+#include "journal_io.h" -+#include "journal_reclaim.h" -+#include "journal_seq_blacklist.h" -+#include "super-io.h" -+ -+#include -+ -+static bool __journal_entry_is_open(union journal_res_state state) -+{ -+ return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; -+} -+ -+static bool journal_entry_is_open(struct journal *j) -+{ -+ return __journal_entry_is_open(j->reservations); -+} -+ -+static void journal_pin_new_entry(struct journal *j, int count) -+{ -+ struct journal_entry_pin_list *p; -+ -+ /* -+ * The fifo_push() needs to happen at the same time as j->seq is -+ * incremented for journal_last_seq() to be calculated correctly -+ */ -+ atomic64_inc(&j->seq); -+ p = fifo_push_ref(&j->pin); -+ -+ INIT_LIST_HEAD(&p->list); -+ INIT_LIST_HEAD(&p->flushed); -+ atomic_set(&p->count, count); -+ p->devs.nr = 0; -+} -+ -+static void bch2_journal_buf_init(struct journal *j) -+{ -+ struct journal_buf *buf = journal_cur_buf(j); -+ -+ memset(buf->has_inode, 0, sizeof(buf->has_inode)); -+ -+ memset(buf->data, 0, sizeof(*buf->data)); -+ buf->data->seq = cpu_to_le64(journal_cur_seq(j)); -+ buf->data->u64s = 0; -+} -+ -+void bch2_journal_halt(struct journal *j) -+{ -+ union journal_res_state old, new; -+ u64 v = atomic64_read(&j->reservations.counter); -+ -+ do { -+ old.v = new.v = v; -+ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) -+ return; -+ -+ new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL; -+ } while ((v = atomic64_cmpxchg(&j->reservations.counter, -+ old.v, new.v)) != old.v); -+ -+ journal_wake(j); -+ closure_wake_up(&journal_cur_buf(j)->wait); -+} -+ -+/* journal entry close/open: */ -+ -+void __bch2_journal_buf_put(struct journal *j, bool need_write_just_set) -+{ -+ if (!need_write_just_set && -+ test_bit(JOURNAL_NEED_WRITE, &j->flags)) -+ bch2_time_stats_update(j->delay_time, -+ 
j->need_write_time); -+ -+ clear_bit(JOURNAL_NEED_WRITE, &j->flags); -+ -+ closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL); -+} -+ -+/* -+ * Returns true if journal entry is now closed: -+ */ -+static bool __journal_entry_close(struct journal *j) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct journal_buf *buf = journal_cur_buf(j); -+ union journal_res_state old, new; -+ u64 v = atomic64_read(&j->reservations.counter); -+ bool set_need_write = false; -+ unsigned sectors; -+ -+ lockdep_assert_held(&j->lock); -+ -+ do { -+ old.v = new.v = v; -+ if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL) -+ return true; -+ -+ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) { -+ /* this entry will never be written: */ -+ closure_wake_up(&buf->wait); -+ return true; -+ } -+ -+ if (!test_bit(JOURNAL_NEED_WRITE, &j->flags)) { -+ set_bit(JOURNAL_NEED_WRITE, &j->flags); -+ j->need_write_time = local_clock(); -+ set_need_write = true; -+ } -+ -+ if (new.prev_buf_unwritten) -+ return false; -+ -+ new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL; -+ new.idx++; -+ new.prev_buf_unwritten = 1; -+ -+ BUG_ON(journal_state_count(new, new.idx)); -+ } while ((v = atomic64_cmpxchg(&j->reservations.counter, -+ old.v, new.v)) != old.v); -+ -+ buf->data->u64s = cpu_to_le32(old.cur_entry_offset); -+ -+ sectors = vstruct_blocks_plus(buf->data, c->block_bits, -+ buf->u64s_reserved) << c->block_bits; -+ BUG_ON(sectors > buf->sectors); -+ buf->sectors = sectors; -+ -+ bkey_extent_init(&buf->key); -+ -+ /* -+ * We have to set last_seq here, _before_ opening a new journal entry: -+ * -+ * A threads may replace an old pin with a new pin on their current -+ * journal reservation - the expectation being that the journal will -+ * contain either what the old pin protected or what the new pin -+ * protects. 
-+ * -+ * After the old pin is dropped journal_last_seq() won't include the old -+ * pin, so we can only write the updated last_seq on the entry that -+ * contains whatever the new pin protects. -+ * -+ * Restated, we can _not_ update last_seq for a given entry if there -+ * could be a newer entry open with reservations/pins that have been -+ * taken against it. -+ * -+ * Hence, we want update/set last_seq on the current journal entry right -+ * before we open a new one: -+ */ -+ buf->data->last_seq = cpu_to_le64(journal_last_seq(j)); -+ -+ if (journal_entry_empty(buf->data)) -+ clear_bit(JOURNAL_NOT_EMPTY, &j->flags); -+ else -+ set_bit(JOURNAL_NOT_EMPTY, &j->flags); -+ -+ journal_pin_new_entry(j, 1); -+ -+ bch2_journal_buf_init(j); -+ -+ cancel_delayed_work(&j->write_work); -+ -+ bch2_journal_space_available(j); -+ -+ bch2_journal_buf_put(j, old.idx, set_need_write); -+ return true; -+} -+ -+static bool journal_entry_close(struct journal *j) -+{ -+ bool ret; -+ -+ spin_lock(&j->lock); -+ ret = __journal_entry_close(j); -+ spin_unlock(&j->lock); -+ -+ return ret; -+} -+ -+/* -+ * should _only_ called from journal_res_get() - when we actually want a -+ * journal reservation - journal entry is open means journal is dirty: -+ * -+ * returns: -+ * 0: success -+ * -ENOSPC: journal currently full, must invoke reclaim -+ * -EAGAIN: journal blocked, must wait -+ * -EROFS: insufficient rw devices or journal error -+ */ -+static int journal_entry_open(struct journal *j) -+{ -+ struct journal_buf *buf = journal_cur_buf(j); -+ union journal_res_state old, new; -+ int u64s; -+ u64 v; -+ -+ lockdep_assert_held(&j->lock); -+ BUG_ON(journal_entry_is_open(j)); -+ -+ if (j->blocked) -+ return -EAGAIN; -+ -+ if (j->cur_entry_error) -+ return j->cur_entry_error; -+ -+ BUG_ON(!j->cur_entry_sectors); -+ -+ buf->u64s_reserved = j->entry_u64s_reserved; -+ buf->disk_sectors = j->cur_entry_sectors; -+ buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9); -+ -+ u64s = (int) 
(buf->sectors << 9) / sizeof(u64) - -+ journal_entry_overhead(j); -+ u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); -+ -+ if (u64s <= le32_to_cpu(buf->data->u64s)) -+ return -ENOSPC; -+ -+ /* -+ * Must be set before marking the journal entry as open: -+ */ -+ j->cur_entry_u64s = u64s; -+ -+ v = atomic64_read(&j->reservations.counter); -+ do { -+ old.v = new.v = v; -+ -+ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) -+ return -EROFS; -+ -+ /* Handle any already added entries */ -+ new.cur_entry_offset = le32_to_cpu(buf->data->u64s); -+ -+ EBUG_ON(journal_state_count(new, new.idx)); -+ journal_state_inc(&new); -+ } while ((v = atomic64_cmpxchg(&j->reservations.counter, -+ old.v, new.v)) != old.v); -+ -+ if (j->res_get_blocked_start) -+ bch2_time_stats_update(j->blocked_time, -+ j->res_get_blocked_start); -+ j->res_get_blocked_start = 0; -+ -+ mod_delayed_work(system_freezable_wq, -+ &j->write_work, -+ msecs_to_jiffies(j->write_delay_ms)); -+ journal_wake(j); -+ return 0; -+} -+ -+static bool journal_quiesced(struct journal *j) -+{ -+ union journal_res_state state = READ_ONCE(j->reservations); -+ bool ret = !state.prev_buf_unwritten && !__journal_entry_is_open(state); -+ -+ if (!ret) -+ journal_entry_close(j); -+ return ret; -+} -+ -+static void journal_quiesce(struct journal *j) -+{ -+ wait_event(j->wait, journal_quiesced(j)); -+} -+ -+static void journal_write_work(struct work_struct *work) -+{ -+ struct journal *j = container_of(work, struct journal, write_work.work); -+ -+ journal_entry_close(j); -+} -+ -+/* -+ * Given an inode number, if that inode number has data in the journal that -+ * hasn't yet been flushed, return the journal sequence number that needs to be -+ * flushed: -+ */ -+u64 bch2_inode_journal_seq(struct journal *j, u64 inode) -+{ -+ size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8)); -+ u64 seq = 0; -+ -+ if (!test_bit(h, j->buf[0].has_inode) && -+ !test_bit(h, j->buf[1].has_inode)) -+ return 0; -+ -+ 
spin_lock(&j->lock); -+ if (test_bit(h, journal_cur_buf(j)->has_inode)) -+ seq = journal_cur_seq(j); -+ else if (test_bit(h, journal_prev_buf(j)->has_inode)) -+ seq = journal_cur_seq(j) - 1; -+ spin_unlock(&j->lock); -+ -+ return seq; -+} -+ -+static int __journal_res_get(struct journal *j, struct journal_res *res, -+ unsigned flags) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct journal_buf *buf; -+ bool can_discard; -+ int ret; -+retry: -+ if (journal_res_get_fast(j, res, flags)) -+ return 0; -+ -+ if (bch2_journal_error(j)) -+ return -EROFS; -+ -+ spin_lock(&j->lock); -+ -+ /* -+ * Recheck after taking the lock, so we don't race with another thread -+ * that just did journal_entry_open() and call journal_entry_close() -+ * unnecessarily -+ */ -+ if (journal_res_get_fast(j, res, flags)) { -+ spin_unlock(&j->lock); -+ return 0; -+ } -+ -+ if (!(flags & JOURNAL_RES_GET_RESERVED) && -+ !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { -+ /* -+ * Don't want to close current journal entry, just need to -+ * invoke reclaim: -+ */ -+ ret = -ENOSPC; -+ goto unlock; -+ } -+ -+ /* -+ * If we couldn't get a reservation because the current buf filled up, -+ * and we had room for a bigger entry on disk, signal that we want to -+ * realloc the journal bufs: -+ */ -+ buf = journal_cur_buf(j); -+ if (journal_entry_is_open(j) && -+ buf->buf_size >> 9 < buf->disk_sectors && -+ buf->buf_size < JOURNAL_ENTRY_SIZE_MAX) -+ j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); -+ -+ if (journal_entry_is_open(j) && -+ !__journal_entry_close(j)) { -+ /* -+ * We failed to get a reservation on the current open journal -+ * entry because it's full, and we can't close it because -+ * there's still a previous one in flight: -+ */ -+ trace_journal_entry_full(c); -+ ret = -EAGAIN; -+ } else { -+ ret = journal_entry_open(j); -+ } -+unlock: -+ if ((ret == -EAGAIN || ret == -ENOSPC) && -+ !j->res_get_blocked_start) -+ j->res_get_blocked_start = local_clock() 
?: 1; -+ -+ can_discard = j->can_discard; -+ spin_unlock(&j->lock); -+ -+ if (!ret) -+ goto retry; -+ -+ if (ret == -ENOSPC) { -+ WARN_ONCE(!can_discard && (flags & JOURNAL_RES_GET_RESERVED), -+ "JOURNAL_RES_GET_RESERVED set but journal full"); -+ -+ /* -+ * Journal is full - can't rely on reclaim from work item due to -+ * freezing: -+ */ -+ trace_journal_full(c); -+ -+ if (!(flags & JOURNAL_RES_GET_NONBLOCK)) { -+ if (can_discard) { -+ bch2_journal_do_discards(j); -+ goto retry; -+ } -+ -+ if (mutex_trylock(&j->reclaim_lock)) { -+ bch2_journal_reclaim(j); -+ mutex_unlock(&j->reclaim_lock); -+ } -+ } -+ -+ ret = -EAGAIN; -+ } -+ -+ return ret; -+} -+ -+/* -+ * Essentially the entry function to the journaling code. When bcachefs is doing -+ * a btree insert, it calls this function to get the current journal write. -+ * Journal write is the structure used set up journal writes. The calling -+ * function will then add its keys to the structure, queuing them for the next -+ * write. -+ * -+ * To ensure forward progress, the current task must not be holding any -+ * btree node write locks. 
-+ */ -+int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, -+ unsigned flags) -+{ -+ int ret; -+ -+ closure_wait_event(&j->async_wait, -+ (ret = __journal_res_get(j, res, flags)) != -EAGAIN || -+ (flags & JOURNAL_RES_GET_NONBLOCK)); -+ return ret; -+} -+ -+/* journal_preres: */ -+ -+static bool journal_preres_available(struct journal *j, -+ struct journal_preres *res, -+ unsigned new_u64s, -+ unsigned flags) -+{ -+ bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags); -+ -+ if (!ret) -+ bch2_journal_reclaim_work(&j->reclaim_work.work); -+ -+ return ret; -+} -+ -+int __bch2_journal_preres_get(struct journal *j, -+ struct journal_preres *res, -+ unsigned new_u64s, -+ unsigned flags) -+{ -+ int ret; -+ -+ closure_wait_event(&j->preres_wait, -+ (ret = bch2_journal_error(j)) || -+ journal_preres_available(j, res, new_u64s, flags)); -+ return ret; -+} -+ -+/* journal_entry_res: */ -+ -+void bch2_journal_entry_res_resize(struct journal *j, -+ struct journal_entry_res *res, -+ unsigned new_u64s) -+{ -+ union journal_res_state state; -+ int d = new_u64s - res->u64s; -+ -+ spin_lock(&j->lock); -+ -+ j->entry_u64s_reserved += d; -+ if (d <= 0) -+ goto out; -+ -+ j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d); -+ smp_mb(); -+ state = READ_ONCE(j->reservations); -+ -+ if (state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL && -+ state.cur_entry_offset > j->cur_entry_u64s) { -+ j->cur_entry_u64s += d; -+ /* -+ * Not enough room in current journal entry, have to flush it: -+ */ -+ __journal_entry_close(j); -+ } else { -+ journal_cur_buf(j)->u64s_reserved += d; -+ } -+out: -+ spin_unlock(&j->lock); -+ res->u64s += d; -+} -+ -+/* journal flushing: */ -+ -+u64 bch2_journal_last_unwritten_seq(struct journal *j) -+{ -+ u64 seq; -+ -+ spin_lock(&j->lock); -+ seq = journal_cur_seq(j); -+ if (j->reservations.prev_buf_unwritten) -+ seq--; -+ spin_unlock(&j->lock); -+ -+ return seq; -+} -+ -+/** -+ * bch2_journal_open_seq_async - try to open a 
new journal entry if @seq isn't -+ * open yet, or wait if we cannot -+ * -+ * used by the btree interior update machinery, when it needs to write a new -+ * btree root - every journal entry contains the roots of all the btrees, so it -+ * doesn't need to bother with getting a journal reservation -+ */ -+int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *cl) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ int ret; -+ -+ spin_lock(&j->lock); -+ -+ /* -+ * Can't try to open more than one sequence number ahead: -+ */ -+ BUG_ON(journal_cur_seq(j) < seq && !journal_entry_is_open(j)); -+ -+ if (journal_cur_seq(j) > seq || -+ journal_entry_is_open(j)) { -+ spin_unlock(&j->lock); -+ return 0; -+ } -+ -+ if (journal_cur_seq(j) < seq && -+ !__journal_entry_close(j)) { -+ /* haven't finished writing out the previous one: */ -+ trace_journal_entry_full(c); -+ ret = -EAGAIN; -+ } else { -+ BUG_ON(journal_cur_seq(j) != seq); -+ -+ ret = journal_entry_open(j); -+ } -+ -+ if ((ret == -EAGAIN || ret == -ENOSPC) && -+ !j->res_get_blocked_start) -+ j->res_get_blocked_start = local_clock() ?: 1; -+ -+ if (ret == -EAGAIN || ret == -ENOSPC) -+ closure_wait(&j->async_wait, cl); -+ -+ spin_unlock(&j->lock); -+ -+ if (ret == -ENOSPC) { -+ trace_journal_full(c); -+ bch2_journal_reclaim_work(&j->reclaim_work.work); -+ ret = -EAGAIN; -+ } -+ -+ return ret; -+} -+ -+static int journal_seq_error(struct journal *j, u64 seq) -+{ -+ union journal_res_state state = READ_ONCE(j->reservations); -+ -+ if (seq == journal_cur_seq(j)) -+ return bch2_journal_error(j); -+ -+ if (seq + 1 == journal_cur_seq(j) && -+ !state.prev_buf_unwritten && -+ seq > j->seq_ondisk) -+ return -EIO; -+ -+ return 0; -+} -+ -+static inline struct journal_buf * -+journal_seq_to_buf(struct journal *j, u64 seq) -+{ -+ /* seq should be for a journal entry that has been opened: */ -+ BUG_ON(seq > journal_cur_seq(j)); -+ BUG_ON(seq == journal_cur_seq(j) && -+ 
j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL); -+ -+ if (seq == journal_cur_seq(j)) -+ return journal_cur_buf(j); -+ if (seq + 1 == journal_cur_seq(j) && -+ j->reservations.prev_buf_unwritten) -+ return journal_prev_buf(j); -+ return NULL; -+} -+ -+/** -+ * bch2_journal_wait_on_seq - wait for a journal entry to be written -+ * -+ * does _not_ cause @seq to be written immediately - if there is no other -+ * activity to cause the relevant journal entry to be filled up or flushed it -+ * can wait for an arbitrary amount of time (up to @j->write_delay_ms, which is -+ * configurable). -+ */ -+void bch2_journal_wait_on_seq(struct journal *j, u64 seq, -+ struct closure *parent) -+{ -+ struct journal_buf *buf; -+ -+ spin_lock(&j->lock); -+ -+ if ((buf = journal_seq_to_buf(j, seq))) { -+ if (!closure_wait(&buf->wait, parent)) -+ BUG(); -+ -+ if (seq == journal_cur_seq(j)) { -+ smp_mb(); -+ if (bch2_journal_error(j)) -+ closure_wake_up(&buf->wait); -+ } -+ } -+ -+ spin_unlock(&j->lock); -+} -+ -+/** -+ * bch2_journal_flush_seq_async - wait for a journal entry to be written -+ * -+ * like bch2_journal_wait_on_seq, except that it triggers a write immediately if -+ * necessary -+ */ -+void bch2_journal_flush_seq_async(struct journal *j, u64 seq, -+ struct closure *parent) -+{ -+ struct journal_buf *buf; -+ -+ spin_lock(&j->lock); -+ -+ if (parent && -+ (buf = journal_seq_to_buf(j, seq))) -+ if (!closure_wait(&buf->wait, parent)) -+ BUG(); -+ -+ if (seq == journal_cur_seq(j)) -+ __journal_entry_close(j); -+ spin_unlock(&j->lock); -+} -+ -+static int journal_seq_flushed(struct journal *j, u64 seq) -+{ -+ int ret; -+ -+ spin_lock(&j->lock); -+ ret = seq <= j->seq_ondisk ? 
1 : journal_seq_error(j, seq); -+ -+ if (seq == journal_cur_seq(j)) -+ __journal_entry_close(j); -+ spin_unlock(&j->lock); -+ -+ return ret; -+} -+ -+int bch2_journal_flush_seq(struct journal *j, u64 seq) -+{ -+ u64 start_time = local_clock(); -+ int ret, ret2; -+ -+ ret = wait_event_killable(j->wait, (ret2 = journal_seq_flushed(j, seq))); -+ -+ bch2_time_stats_update(j->flush_seq_time, start_time); -+ -+ return ret ?: ret2 < 0 ? ret2 : 0; -+} -+ -+/** -+ * bch2_journal_meta_async - force a journal entry to be written -+ */ -+void bch2_journal_meta_async(struct journal *j, struct closure *parent) -+{ -+ struct journal_res res; -+ -+ memset(&res, 0, sizeof(res)); -+ -+ bch2_journal_res_get(j, &res, jset_u64s(0), 0); -+ bch2_journal_res_put(j, &res); -+ -+ bch2_journal_flush_seq_async(j, res.seq, parent); -+} -+ -+int bch2_journal_meta(struct journal *j) -+{ -+ struct journal_res res; -+ int ret; -+ -+ memset(&res, 0, sizeof(res)); -+ -+ ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); -+ if (ret) -+ return ret; -+ -+ bch2_journal_res_put(j, &res); -+ -+ return bch2_journal_flush_seq(j, res.seq); -+} -+ -+/* -+ * bch2_journal_flush_async - if there is an open journal entry, or a journal -+ * still being written, write it and wait for the write to complete -+ */ -+void bch2_journal_flush_async(struct journal *j, struct closure *parent) -+{ -+ u64 seq, journal_seq; -+ -+ spin_lock(&j->lock); -+ journal_seq = journal_cur_seq(j); -+ -+ if (journal_entry_is_open(j)) { -+ seq = journal_seq; -+ } else if (journal_seq) { -+ seq = journal_seq - 1; -+ } else { -+ spin_unlock(&j->lock); -+ return; -+ } -+ spin_unlock(&j->lock); -+ -+ bch2_journal_flush_seq_async(j, seq, parent); -+} -+ -+int bch2_journal_flush(struct journal *j) -+{ -+ u64 seq, journal_seq; -+ -+ spin_lock(&j->lock); -+ journal_seq = journal_cur_seq(j); -+ -+ if (journal_entry_is_open(j)) { -+ seq = journal_seq; -+ } else if (journal_seq) { -+ seq = journal_seq - 1; -+ } else { -+ spin_unlock(&j->lock); -+ 
return 0; -+ } -+ spin_unlock(&j->lock); -+ -+ return bch2_journal_flush_seq(j, seq); -+} -+ -+/* block/unlock the journal: */ -+ -+void bch2_journal_unblock(struct journal *j) -+{ -+ spin_lock(&j->lock); -+ j->blocked--; -+ spin_unlock(&j->lock); -+ -+ journal_wake(j); -+} -+ -+void bch2_journal_block(struct journal *j) -+{ -+ spin_lock(&j->lock); -+ j->blocked++; -+ spin_unlock(&j->lock); -+ -+ journal_quiesce(j); -+} -+ -+/* allocate journal on a device: */ -+ -+static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, -+ bool new_fs, struct closure *cl) -+{ -+ struct bch_fs *c = ca->fs; -+ struct journal_device *ja = &ca->journal; -+ struct bch_sb_field_journal *journal_buckets; -+ u64 *new_bucket_seq = NULL, *new_buckets = NULL; -+ int ret = 0; -+ -+ /* don't handle reducing nr of buckets yet: */ -+ if (nr <= ja->nr) -+ return 0; -+ -+ ret = -ENOMEM; -+ new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL); -+ new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL); -+ if (!new_buckets || !new_bucket_seq) -+ goto err; -+ -+ journal_buckets = bch2_sb_resize_journal(&ca->disk_sb, -+ nr + sizeof(*journal_buckets) / sizeof(u64)); -+ if (!journal_buckets) -+ goto err; -+ -+ /* -+ * We may be called from the device add path, before the new device has -+ * actually been added to the running filesystem: -+ */ -+ if (c) -+ spin_lock(&c->journal.lock); -+ -+ memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); -+ memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64)); -+ swap(new_buckets, ja->buckets); -+ swap(new_bucket_seq, ja->bucket_seq); -+ -+ if (c) -+ spin_unlock(&c->journal.lock); -+ -+ while (ja->nr < nr) { -+ struct open_bucket *ob = NULL; -+ unsigned pos; -+ long bucket; -+ -+ if (new_fs) { -+ bucket = bch2_bucket_alloc_new_fs(ca); -+ if (bucket < 0) { -+ ret = -ENOSPC; -+ goto err; -+ } -+ } else { -+ ob = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, -+ false, cl); -+ if (IS_ERR(ob)) { -+ ret = cl ? 
-EAGAIN : -ENOSPC; -+ goto err; -+ } -+ -+ bucket = sector_to_bucket(ca, ob->ptr.offset); -+ } -+ -+ if (c) { -+ percpu_down_read(&c->mark_lock); -+ spin_lock(&c->journal.lock); -+ } -+ -+ pos = ja->nr ? (ja->cur_idx + 1) % ja->nr : 0; -+ __array_insert_item(ja->buckets, ja->nr, pos); -+ __array_insert_item(ja->bucket_seq, ja->nr, pos); -+ __array_insert_item(journal_buckets->buckets, ja->nr, pos); -+ ja->nr++; -+ -+ ja->buckets[pos] = bucket; -+ ja->bucket_seq[pos] = 0; -+ journal_buckets->buckets[pos] = cpu_to_le64(bucket); -+ -+ if (pos <= ja->discard_idx) -+ ja->discard_idx = (ja->discard_idx + 1) % ja->nr; -+ if (pos <= ja->dirty_idx_ondisk) -+ ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; -+ if (pos <= ja->dirty_idx) -+ ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; -+ if (pos <= ja->cur_idx) -+ ja->cur_idx = (ja->cur_idx + 1) % ja->nr; -+ -+ bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_journal, -+ ca->mi.bucket_size, -+ gc_phase(GC_PHASE_SB), -+ 0); -+ -+ if (c) { -+ spin_unlock(&c->journal.lock); -+ percpu_up_read(&c->mark_lock); -+ } -+ -+ if (!new_fs) -+ bch2_open_bucket_put(c, ob); -+ } -+ -+ ret = 0; -+err: -+ kfree(new_bucket_seq); -+ kfree(new_buckets); -+ -+ return ret; -+} -+ -+/* -+ * Allocate more journal space at runtime - not currently making use if it, but -+ * the code works: -+ */ -+int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, -+ unsigned nr) -+{ -+ struct journal_device *ja = &ca->journal; -+ struct closure cl; -+ unsigned current_nr; -+ int ret; -+ -+ closure_init_stack(&cl); -+ -+ do { -+ struct disk_reservation disk_res = { 0, 0 }; -+ -+ closure_sync(&cl); -+ -+ mutex_lock(&c->sb_lock); -+ current_nr = ja->nr; -+ -+ /* -+ * note: journal buckets aren't really counted as _sectors_ used yet, so -+ * we don't need the disk reservation to avoid the BUG_ON() in buckets.c -+ * when space used goes up without a reservation - but we do need the -+ * reservation to ensure we'll actually be able to 
allocate: -+ */ -+ -+ if (bch2_disk_reservation_get(c, &disk_res, -+ bucket_to_sector(ca, nr - ja->nr), 1, 0)) { -+ mutex_unlock(&c->sb_lock); -+ return -ENOSPC; -+ } -+ -+ ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl); -+ -+ bch2_disk_reservation_put(c, &disk_res); -+ -+ if (ja->nr != current_nr) -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ } while (ret == -EAGAIN); -+ -+ return ret; -+} -+ -+int bch2_dev_journal_alloc(struct bch_dev *ca) -+{ -+ unsigned nr; -+ -+ if (dynamic_fault("bcachefs:add:journal_alloc")) -+ return -ENOMEM; -+ -+ /* -+ * clamp journal size to 1024 buckets or 512MB (in sectors), whichever -+ * is smaller: -+ */ -+ nr = clamp_t(unsigned, ca->mi.nbuckets >> 8, -+ BCH_JOURNAL_BUCKETS_MIN, -+ min(1 << 10, -+ (1 << 20) / ca->mi.bucket_size)); -+ -+ return __bch2_set_nr_journal_buckets(ca, nr, true, NULL); -+} -+ -+/* startup/shutdown: */ -+ -+static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) -+{ -+ union journal_res_state state; -+ struct journal_buf *w; -+ bool ret; -+ -+ spin_lock(&j->lock); -+ state = READ_ONCE(j->reservations); -+ w = j->buf + !state.idx; -+ -+ ret = state.prev_buf_unwritten && -+ bch2_bkey_has_device(bkey_i_to_s_c(&w->key), dev_idx); -+ spin_unlock(&j->lock); -+ -+ return ret; -+} -+ -+void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca) -+{ -+ wait_event(j->wait, !bch2_journal_writing_to_device(j, ca->dev_idx)); -+} -+ -+void bch2_fs_journal_stop(struct journal *j) -+{ -+ bch2_journal_flush_all_pins(j); -+ -+ wait_event(j->wait, journal_entry_close(j)); -+ -+ /* do we need to write another journal entry? 
*/ -+ if (test_bit(JOURNAL_NOT_EMPTY, &j->flags)) -+ bch2_journal_meta(j); -+ -+ journal_quiesce(j); -+ -+ BUG_ON(!bch2_journal_error(j) && -+ test_bit(JOURNAL_NOT_EMPTY, &j->flags)); -+ -+ cancel_delayed_work_sync(&j->write_work); -+ cancel_delayed_work_sync(&j->reclaim_work); -+} -+ -+int bch2_fs_journal_start(struct journal *j, u64 cur_seq, -+ struct list_head *journal_entries) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct journal_entry_pin_list *p; -+ struct journal_replay *i; -+ u64 last_seq = cur_seq, nr, seq; -+ -+ if (!list_empty(journal_entries)) -+ last_seq = le64_to_cpu(list_last_entry(journal_entries, -+ struct journal_replay, list)->j.last_seq); -+ -+ nr = cur_seq - last_seq; -+ -+ if (nr + 1 > j->pin.size) { -+ free_fifo(&j->pin); -+ init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL); -+ if (!j->pin.data) { -+ bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); -+ return -ENOMEM; -+ } -+ } -+ -+ j->replay_journal_seq = last_seq; -+ j->replay_journal_seq_end = cur_seq; -+ j->last_seq_ondisk = last_seq; -+ j->pin.front = last_seq; -+ j->pin.back = cur_seq; -+ atomic64_set(&j->seq, cur_seq - 1); -+ -+ fifo_for_each_entry_ptr(p, &j->pin, seq) { -+ INIT_LIST_HEAD(&p->list); -+ INIT_LIST_HEAD(&p->flushed); -+ atomic_set(&p->count, 1); -+ p->devs.nr = 0; -+ } -+ -+ list_for_each_entry(i, journal_entries, list) { -+ seq = le64_to_cpu(i->j.seq); -+ BUG_ON(seq >= cur_seq); -+ -+ if (seq < last_seq) -+ continue; -+ -+ journal_seq_pin(j, seq)->devs = i->devs; -+ } -+ -+ spin_lock(&j->lock); -+ -+ set_bit(JOURNAL_STARTED, &j->flags); -+ -+ journal_pin_new_entry(j, 1); -+ bch2_journal_buf_init(j); -+ -+ c->last_bucket_seq_cleanup = journal_cur_seq(j); -+ -+ bch2_journal_space_available(j); -+ spin_unlock(&j->lock); -+ -+ return 0; -+} -+ -+/* init/exit: */ -+ -+void bch2_dev_journal_exit(struct bch_dev *ca) -+{ -+ kfree(ca->journal.bio); -+ kfree(ca->journal.buckets); -+ kfree(ca->journal.bucket_seq); -+ -+ 
ca->journal.bio = NULL; -+ ca->journal.buckets = NULL; -+ ca->journal.bucket_seq = NULL; -+} -+ -+int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) -+{ -+ struct journal_device *ja = &ca->journal; -+ struct bch_sb_field_journal *journal_buckets = -+ bch2_sb_get_journal(sb); -+ unsigned i; -+ -+ ja->nr = bch2_nr_journal_buckets(journal_buckets); -+ -+ ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); -+ if (!ja->bucket_seq) -+ return -ENOMEM; -+ -+ ca->journal.bio = bio_kmalloc(GFP_KERNEL, -+ DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE)); -+ if (!ca->journal.bio) -+ return -ENOMEM; -+ -+ ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); -+ if (!ja->buckets) -+ return -ENOMEM; -+ -+ for (i = 0; i < ja->nr; i++) -+ ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]); -+ -+ return 0; -+} -+ -+void bch2_fs_journal_exit(struct journal *j) -+{ -+ kvpfree(j->buf[1].data, j->buf[1].buf_size); -+ kvpfree(j->buf[0].data, j->buf[0].buf_size); -+ free_fifo(&j->pin); -+} -+ -+int bch2_fs_journal_init(struct journal *j) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ static struct lock_class_key res_key; -+ int ret = 0; -+ -+ pr_verbose_init(c->opts, ""); -+ -+ spin_lock_init(&j->lock); -+ spin_lock_init(&j->err_lock); -+ init_waitqueue_head(&j->wait); -+ INIT_DELAYED_WORK(&j->write_work, journal_write_work); -+ INIT_DELAYED_WORK(&j->reclaim_work, bch2_journal_reclaim_work); -+ init_waitqueue_head(&j->pin_flush_wait); -+ mutex_init(&j->reclaim_lock); -+ mutex_init(&j->discard_lock); -+ -+ lockdep_init_map(&j->res_map, "journal res", &res_key, 0); -+ -+ j->buf[0].buf_size = JOURNAL_ENTRY_SIZE_MIN; -+ j->buf[1].buf_size = JOURNAL_ENTRY_SIZE_MIN; -+ j->write_delay_ms = 1000; -+ j->reclaim_delay_ms = 100; -+ -+ /* Btree roots: */ -+ j->entry_u64s_reserved += -+ BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX); -+ -+ atomic64_set(&j->reservations.counter, -+ ((union journal_res_state) -+ { .cur_entry_offset = 
JOURNAL_ENTRY_CLOSED_VAL }).v); -+ -+ if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || -+ !(j->buf[0].data = kvpmalloc(j->buf[0].buf_size, GFP_KERNEL)) || -+ !(j->buf[1].data = kvpmalloc(j->buf[1].buf_size, GFP_KERNEL))) { -+ ret = -ENOMEM; -+ goto out; -+ } -+ -+ j->pin.front = j->pin.back = 1; -+out: -+ pr_verbose_init(c->opts, "ret %i", ret); -+ return ret; -+} -+ -+/* debug: */ -+ -+void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ union journal_res_state s; -+ struct bch_dev *ca; -+ unsigned iter; -+ -+ rcu_read_lock(); -+ spin_lock(&j->lock); -+ s = READ_ONCE(j->reservations); -+ -+ pr_buf(out, -+ "active journal entries:\t%llu\n" -+ "seq:\t\t\t%llu\n" -+ "last_seq:\t\t%llu\n" -+ "last_seq_ondisk:\t%llu\n" -+ "prereserved:\t\t%u/%u\n" -+ "current entry sectors:\t%u\n" -+ "current entry:\t\t", -+ fifo_used(&j->pin), -+ journal_cur_seq(j), -+ journal_last_seq(j), -+ j->last_seq_ondisk, -+ j->prereserved.reserved, -+ j->prereserved.remaining, -+ j->cur_entry_sectors); -+ -+ switch (s.cur_entry_offset) { -+ case JOURNAL_ENTRY_ERROR_VAL: -+ pr_buf(out, "error\n"); -+ break; -+ case JOURNAL_ENTRY_CLOSED_VAL: -+ pr_buf(out, "closed\n"); -+ break; -+ default: -+ pr_buf(out, "%u/%u\n", -+ s.cur_entry_offset, -+ j->cur_entry_u64s); -+ break; -+ } -+ -+ pr_buf(out, -+ "current entry refs:\t%u\n" -+ "prev entry unwritten:\t", -+ journal_state_count(s, s.idx)); -+ -+ if (s.prev_buf_unwritten) -+ pr_buf(out, "yes, ref %u sectors %u\n", -+ journal_state_count(s, !s.idx), -+ journal_prev_buf(j)->sectors); -+ else -+ pr_buf(out, "no\n"); -+ -+ pr_buf(out, -+ "need write:\t\t%i\n" -+ "replay done:\t\t%i\n", -+ test_bit(JOURNAL_NEED_WRITE, &j->flags), -+ test_bit(JOURNAL_REPLAY_DONE, &j->flags)); -+ -+ for_each_member_device_rcu(ca, c, iter, -+ &c->rw_devs[BCH_DATA_journal]) { -+ struct journal_device *ja = &ca->journal; -+ -+ if (!ja->nr) -+ continue; -+ -+ pr_buf(out, -+ "dev 
%u:\n" -+ "\tnr\t\t%u\n" -+ "\tavailable\t%u:%u\n" -+ "\tdiscard_idx\t\t%u\n" -+ "\tdirty_idx_ondisk\t%u (seq %llu)\n" -+ "\tdirty_idx\t\t%u (seq %llu)\n" -+ "\tcur_idx\t\t%u (seq %llu)\n", -+ iter, ja->nr, -+ bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), -+ ja->sectors_free, -+ ja->discard_idx, -+ ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk], -+ ja->dirty_idx, ja->bucket_seq[ja->dirty_idx], -+ ja->cur_idx, ja->bucket_seq[ja->cur_idx]); -+ } -+ -+ spin_unlock(&j->lock); -+ rcu_read_unlock(); -+} -+ -+void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) -+{ -+ struct journal_entry_pin_list *pin_list; -+ struct journal_entry_pin *pin; -+ u64 i; -+ -+ spin_lock(&j->lock); -+ fifo_for_each_entry_ptr(pin_list, &j->pin, i) { -+ pr_buf(out, "%llu: count %u\n", -+ i, atomic_read(&pin_list->count)); -+ -+ list_for_each_entry(pin, &pin_list->list, list) -+ pr_buf(out, "\t%px %ps\n", -+ pin, pin->flush); -+ -+ if (!list_empty(&pin_list->flushed)) -+ pr_buf(out, "flushed:\n"); -+ -+ list_for_each_entry(pin, &pin_list->flushed, list) -+ pr_buf(out, "\t%px %ps\n", -+ pin, pin->flush); -+ } -+ spin_unlock(&j->lock); -+} -diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h -new file mode 100644 -index 000000000000..56438840efd7 ---- /dev/null -+++ b/fs/bcachefs/journal.h -@@ -0,0 +1,519 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_JOURNAL_H -+#define _BCACHEFS_JOURNAL_H -+ -+/* -+ * THE JOURNAL: -+ * -+ * The primary purpose of the journal is to log updates (insertions) to the -+ * b-tree, to avoid having to do synchronous updates to the b-tree on disk. -+ * -+ * Without the journal, the b-tree is always internally consistent on -+ * disk - and in fact, in the earliest incarnations bcache didn't have a journal -+ * but did handle unclean shutdowns by doing all index updates synchronously -+ * (with coalescing). 
-+ * -+ * Updates to interior nodes still happen synchronously and without the journal -+ * (for simplicity) - this may change eventually but updates to interior nodes -+ * are rare enough it's not a huge priority. -+ * -+ * This means the journal is relatively separate from the b-tree; it consists of -+ * just a list of keys and journal replay consists of just redoing those -+ * insertions in same order that they appear in the journal. -+ * -+ * PERSISTENCE: -+ * -+ * For synchronous updates (where we're waiting on the index update to hit -+ * disk), the journal entry will be written out immediately (or as soon as -+ * possible, if the write for the previous journal entry was still in flight). -+ * -+ * Synchronous updates are specified by passing a closure (@flush_cl) to -+ * bch2_btree_insert() or bch_btree_insert_node(), which then pass that parameter -+ * down to the journalling code. That closure will will wait on the journal -+ * write to complete (via closure_wait()). -+ * -+ * If the index update wasn't synchronous, the journal entry will be -+ * written out after 10 ms have elapsed, by default (the delay_ms field -+ * in struct journal). -+ * -+ * JOURNAL ENTRIES: -+ * -+ * A journal entry is variable size (struct jset), it's got a fixed length -+ * header and then a variable number of struct jset_entry entries. -+ * -+ * Journal entries are identified by monotonically increasing 64 bit sequence -+ * numbers - jset->seq; other places in the code refer to this sequence number. -+ * -+ * A jset_entry entry contains one or more bkeys (which is what gets inserted -+ * into the b-tree). We need a container to indicate which b-tree the key is -+ * for; also, the roots of the various b-trees are stored in jset_entry entries -+ * (one for each b-tree) - this lets us add new b-tree types without changing -+ * the on disk format. 
-+ * -+ * We also keep some things in the journal header that are logically part of the -+ * superblock - all the things that are frequently updated. This is for future -+ * bcache on raw flash support; the superblock (which will become another -+ * journal) can't be moved or wear leveled, so it contains just enough -+ * information to find the main journal, and the superblock only has to be -+ * rewritten when we want to move/wear level the main journal. -+ * -+ * JOURNAL LAYOUT ON DISK: -+ * -+ * The journal is written to a ringbuffer of buckets (which is kept in the -+ * superblock); the individual buckets are not necessarily contiguous on disk -+ * which means that journal entries are not allowed to span buckets, but also -+ * that we can resize the journal at runtime if desired (unimplemented). -+ * -+ * The journal buckets exist in the same pool as all the other buckets that are -+ * managed by the allocator and garbage collection - garbage collection marks -+ * the journal buckets as metadata buckets. -+ * -+ * OPEN/DIRTY JOURNAL ENTRIES: -+ * -+ * Open/dirty journal entries are journal entries that contain b-tree updates -+ * that have not yet been written out to the b-tree on disk. We have to track -+ * which journal entries are dirty, and we also have to avoid wrapping around -+ * the journal and overwriting old but still dirty journal entries with new -+ * journal entries. -+ * -+ * On disk, this is represented with the "last_seq" field of struct jset; -+ * last_seq is the first sequence number that journal replay has to replay. -+ * -+ * To avoid overwriting dirty journal entries on disk, we keep a mapping (in -+ * journal_device->seq) of for each journal bucket, the highest sequence number -+ * any journal entry it contains. Then, by comparing that against last_seq we -+ * can determine whether that journal bucket contains dirty journal entries or -+ * not. 
-+ * -+ * To track which journal entries are dirty, we maintain a fifo of refcounts -+ * (where each entry corresponds to a specific sequence number) - when a ref -+ * goes to 0, that journal entry is no longer dirty. -+ * -+ * Journalling of index updates is done at the same time as the b-tree itself is -+ * being modified (see btree_insert_key()); when we add the key to the journal -+ * the pending b-tree write takes a ref on the journal entry the key was added -+ * to. If a pending b-tree write would need to take refs on multiple dirty -+ * journal entries, it only keeps the ref on the oldest one (since a newer -+ * journal entry will still be replayed if an older entry was dirty). -+ * -+ * JOURNAL FILLING UP: -+ * -+ * There are two ways the journal could fill up; either we could run out of -+ * space to write to, or we could have too many open journal entries and run out -+ * of room in the fifo of refcounts. Since those refcounts are decremented -+ * without any locking we can't safely resize that fifo, so we handle it the -+ * same way. -+ * -+ * If the journal fills up, we start flushing dirty btree nodes until we can -+ * allocate space for a journal write again - preferentially flushing btree -+ * nodes that are pinning the oldest journal entries first. 
-+ */ -+ -+#include -+ -+#include "journal_types.h" -+ -+struct bch_fs; -+ -+static inline void journal_wake(struct journal *j) -+{ -+ wake_up(&j->wait); -+ closure_wake_up(&j->async_wait); -+ closure_wake_up(&j->preres_wait); -+} -+ -+static inline struct journal_buf *journal_cur_buf(struct journal *j) -+{ -+ return j->buf + j->reservations.idx; -+} -+ -+static inline struct journal_buf *journal_prev_buf(struct journal *j) -+{ -+ return j->buf + !j->reservations.idx; -+} -+ -+/* Sequence number of oldest dirty journal entry */ -+ -+static inline u64 journal_last_seq(struct journal *j) -+{ -+ return j->pin.front; -+} -+ -+static inline u64 journal_cur_seq(struct journal *j) -+{ -+ BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); -+ -+ return j->pin.back - 1; -+} -+ -+u64 bch2_inode_journal_seq(struct journal *, u64); -+ -+static inline int journal_state_count(union journal_res_state s, int idx) -+{ -+ return idx == 0 ? s.buf0_count : s.buf1_count; -+} -+ -+static inline void journal_state_inc(union journal_res_state *s) -+{ -+ s->buf0_count += s->idx == 0; -+ s->buf1_count += s->idx == 1; -+} -+ -+static inline void bch2_journal_set_has_inode(struct journal *j, -+ struct journal_res *res, -+ u64 inum) -+{ -+ struct journal_buf *buf = &j->buf[res->idx]; -+ unsigned long bit = hash_64(inum, ilog2(sizeof(buf->has_inode) * 8)); -+ -+ /* avoid atomic op if possible */ -+ if (unlikely(!test_bit(bit, buf->has_inode))) -+ set_bit(bit, buf->has_inode); -+} -+ -+/* -+ * Amount of space that will be taken up by some keys in the journal (i.e. 
-+ * including the jset header) -+ */ -+static inline unsigned jset_u64s(unsigned u64s) -+{ -+ return u64s + sizeof(struct jset_entry) / sizeof(u64); -+} -+ -+static inline int journal_entry_overhead(struct journal *j) -+{ -+ return sizeof(struct jset) / sizeof(u64) + j->entry_u64s_reserved; -+} -+ -+static inline struct jset_entry * -+bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s) -+{ -+ struct jset *jset = buf->data; -+ struct jset_entry *entry = vstruct_idx(jset, le32_to_cpu(jset->u64s)); -+ -+ memset(entry, 0, sizeof(*entry)); -+ entry->u64s = cpu_to_le16(u64s); -+ -+ le32_add_cpu(&jset->u64s, jset_u64s(u64s)); -+ -+ return entry; -+} -+ -+static inline struct jset_entry * -+journal_res_entry(struct journal *j, struct journal_res *res) -+{ -+ return vstruct_idx(j->buf[res->idx].data, res->offset); -+} -+ -+static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type, -+ enum btree_id id, unsigned level, -+ const void *data, unsigned u64s) -+{ -+ memset(entry, 0, sizeof(*entry)); -+ entry->u64s = cpu_to_le16(u64s); -+ entry->type = type; -+ entry->btree_id = id; -+ entry->level = level; -+ memcpy_u64s_small(entry->_data, data, u64s); -+ -+ return jset_u64s(u64s); -+} -+ -+static inline void bch2_journal_add_entry(struct journal *j, struct journal_res *res, -+ unsigned type, enum btree_id id, -+ unsigned level, -+ const void *data, unsigned u64s) -+{ -+ unsigned actual = journal_entry_set(journal_res_entry(j, res), -+ type, id, level, data, u64s); -+ -+ EBUG_ON(!res->ref); -+ EBUG_ON(actual > res->u64s); -+ -+ res->offset += actual; -+ res->u64s -= actual; -+} -+ -+static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res, -+ enum btree_id id, const struct bkey_i *k) -+{ -+ bch2_journal_add_entry(j, res, BCH_JSET_ENTRY_btree_keys, -+ id, 0, k, k->k.u64s); -+} -+ -+static inline bool journal_entry_empty(struct jset *j) -+{ -+ struct jset_entry *i; -+ -+ if (j->seq != j->last_seq) -+ return 
false; -+ -+ vstruct_for_each(j, i) -+ if (i->type == BCH_JSET_ENTRY_btree_keys && i->u64s) -+ return false; -+ return true; -+} -+ -+void __bch2_journal_buf_put(struct journal *, bool); -+ -+static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, -+ bool need_write_just_set) -+{ -+ union journal_res_state s; -+ -+ s.v = atomic64_sub_return(((union journal_res_state) { -+ .buf0_count = idx == 0, -+ .buf1_count = idx == 1, -+ }).v, &j->reservations.counter); -+ if (!journal_state_count(s, idx)) { -+ EBUG_ON(s.idx == idx || !s.prev_buf_unwritten); -+ __bch2_journal_buf_put(j, need_write_just_set); -+ } -+} -+ -+/* -+ * This function releases the journal write structure so other threads can -+ * then proceed to add their keys as well. -+ */ -+static inline void bch2_journal_res_put(struct journal *j, -+ struct journal_res *res) -+{ -+ if (!res->ref) -+ return; -+ -+ lock_release(&j->res_map, _THIS_IP_); -+ -+ while (res->u64s) -+ bch2_journal_add_entry(j, res, -+ BCH_JSET_ENTRY_btree_keys, -+ 0, 0, NULL, 0); -+ -+ bch2_journal_buf_put(j, res->idx, false); -+ -+ res->ref = 0; -+} -+ -+int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, -+ unsigned); -+ -+#define JOURNAL_RES_GET_NONBLOCK (1 << 0) -+#define JOURNAL_RES_GET_CHECK (1 << 1) -+#define JOURNAL_RES_GET_RESERVED (1 << 2) -+#define JOURNAL_RES_GET_RECLAIM (1 << 3) -+ -+static inline int journal_res_get_fast(struct journal *j, -+ struct journal_res *res, -+ unsigned flags) -+{ -+ union journal_res_state old, new; -+ u64 v = atomic64_read(&j->reservations.counter); -+ -+ do { -+ old.v = new.v = v; -+ -+ /* -+ * Check if there is still room in the current journal -+ * entry: -+ */ -+ if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s) -+ return 0; -+ -+ EBUG_ON(!journal_state_count(new, new.idx)); -+ -+ if (!(flags & JOURNAL_RES_GET_RESERVED) && -+ !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) -+ return 0; -+ -+ if (flags & JOURNAL_RES_GET_CHECK) -+ return 1; -+ -+ 
new.cur_entry_offset += res->u64s; -+ journal_state_inc(&new); -+ } while ((v = atomic64_cmpxchg(&j->reservations.counter, -+ old.v, new.v)) != old.v); -+ -+ res->ref = true; -+ res->idx = old.idx; -+ res->offset = old.cur_entry_offset; -+ res->seq = le64_to_cpu(j->buf[old.idx].data->seq); -+ return 1; -+} -+ -+static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res, -+ unsigned u64s, unsigned flags) -+{ -+ int ret; -+ -+ EBUG_ON(res->ref); -+ EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); -+ -+ res->u64s = u64s; -+ -+ if (journal_res_get_fast(j, res, flags)) -+ goto out; -+ -+ ret = bch2_journal_res_get_slowpath(j, res, flags); -+ if (ret) -+ return ret; -+out: -+ if (!(flags & JOURNAL_RES_GET_CHECK)) { -+ lock_acquire_shared(&j->res_map, 0, -+ (flags & JOURNAL_RES_GET_NONBLOCK) != 0, -+ NULL, _THIS_IP_); -+ EBUG_ON(!res->ref); -+ } -+ return 0; -+} -+ -+/* journal_preres: */ -+ -+static inline bool journal_check_may_get_unreserved(struct journal *j) -+{ -+ union journal_preres_state s = READ_ONCE(j->prereserved); -+ bool ret = s.reserved <= s.remaining && -+ fifo_free(&j->pin) > 8; -+ -+ lockdep_assert_held(&j->lock); -+ -+ if (ret != test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { -+ if (ret) { -+ set_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags); -+ journal_wake(j); -+ } else { -+ clear_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags); -+ } -+ } -+ return ret; -+} -+ -+static inline void bch2_journal_preres_put(struct journal *j, -+ struct journal_preres *res) -+{ -+ union journal_preres_state s = { .reserved = res->u64s }; -+ -+ if (!res->u64s) -+ return; -+ -+ s.v = atomic64_sub_return(s.v, &j->prereserved.counter); -+ res->u64s = 0; -+ closure_wake_up(&j->preres_wait); -+ -+ if (s.reserved <= s.remaining && -+ !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { -+ spin_lock(&j->lock); -+ journal_check_may_get_unreserved(j); -+ spin_unlock(&j->lock); -+ } -+} -+ -+int __bch2_journal_preres_get(struct journal *, -+ struct journal_preres *, 
unsigned, unsigned); -+ -+static inline int bch2_journal_preres_get_fast(struct journal *j, -+ struct journal_preres *res, -+ unsigned new_u64s, -+ unsigned flags) -+{ -+ int d = new_u64s - res->u64s; -+ union journal_preres_state old, new; -+ u64 v = atomic64_read(&j->prereserved.counter); -+ -+ do { -+ old.v = new.v = v; -+ -+ new.reserved += d; -+ -+ /* -+ * If we're being called from the journal reclaim path, we have -+ * to unconditionally give out the pre-reservation, there's -+ * nothing else sensible we can do - otherwise we'd recurse back -+ * into the reclaim path and deadlock: -+ */ -+ -+ if (!(flags & JOURNAL_RES_GET_RECLAIM) && -+ new.reserved > new.remaining) -+ return 0; -+ } while ((v = atomic64_cmpxchg(&j->prereserved.counter, -+ old.v, new.v)) != old.v); -+ -+ res->u64s += d; -+ return 1; -+} -+ -+static inline int bch2_journal_preres_get(struct journal *j, -+ struct journal_preres *res, -+ unsigned new_u64s, -+ unsigned flags) -+{ -+ if (new_u64s <= res->u64s) -+ return 0; -+ -+ if (bch2_journal_preres_get_fast(j, res, new_u64s, flags)) -+ return 0; -+ -+ if (flags & JOURNAL_RES_GET_NONBLOCK) -+ return -EAGAIN; -+ -+ return __bch2_journal_preres_get(j, res, new_u64s, flags); -+} -+ -+/* journal_entry_res: */ -+ -+void bch2_journal_entry_res_resize(struct journal *, -+ struct journal_entry_res *, -+ unsigned); -+ -+u64 bch2_journal_last_unwritten_seq(struct journal *); -+int bch2_journal_open_seq_async(struct journal *, u64, struct closure *); -+ -+void bch2_journal_wait_on_seq(struct journal *, u64, struct closure *); -+void bch2_journal_flush_seq_async(struct journal *, u64, struct closure *); -+void bch2_journal_flush_async(struct journal *, struct closure *); -+void bch2_journal_meta_async(struct journal *, struct closure *); -+ -+int bch2_journal_flush_seq(struct journal *, u64); -+int bch2_journal_flush(struct journal *); -+int bch2_journal_meta(struct journal *); -+ -+void bch2_journal_halt(struct journal *); -+ -+static inline int 
bch2_journal_error(struct journal *j) -+{ -+ return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL -+ ? -EIO : 0; -+} -+ -+struct bch_dev; -+ -+static inline bool journal_flushes_device(struct bch_dev *ca) -+{ -+ return true; -+} -+ -+static inline void bch2_journal_set_replay_done(struct journal *j) -+{ -+ BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); -+ set_bit(JOURNAL_REPLAY_DONE, &j->flags); -+} -+ -+void bch2_journal_unblock(struct journal *); -+void bch2_journal_block(struct journal *); -+ -+void bch2_journal_debug_to_text(struct printbuf *, struct journal *); -+void bch2_journal_pins_to_text(struct printbuf *, struct journal *); -+ -+int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, -+ unsigned nr); -+int bch2_dev_journal_alloc(struct bch_dev *); -+ -+void bch2_dev_journal_stop(struct journal *, struct bch_dev *); -+ -+void bch2_fs_journal_stop(struct journal *); -+int bch2_fs_journal_start(struct journal *, u64, struct list_head *); -+ -+void bch2_dev_journal_exit(struct bch_dev *); -+int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *); -+void bch2_fs_journal_exit(struct journal *); -+int bch2_fs_journal_init(struct journal *); -+ -+#endif /* _BCACHEFS_JOURNAL_H */ -diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c -new file mode 100644 -index 000000000000..bd0e6b371701 ---- /dev/null -+++ b/fs/bcachefs/journal_io.c -@@ -0,0 +1,1183 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "alloc_foreground.h" -+#include "btree_io.h" -+#include "btree_update_interior.h" -+#include "buckets.h" -+#include "checksum.h" -+#include "error.h" -+#include "io.h" -+#include "journal.h" -+#include "journal_io.h" -+#include "journal_reclaim.h" -+#include "replicas.h" -+ -+#include -+ -+struct journal_list { -+ struct closure cl; -+ struct mutex lock; -+ struct list_head *head; -+ int ret; -+}; -+ -+#define JOURNAL_ENTRY_ADD_OK 0 -+#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5 -+ -+/* -+ * Given a 
journal entry we just read, add it to the list of journal entries to -+ * be replayed: -+ */ -+static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, -+ struct journal_list *jlist, struct jset *j, -+ bool bad) -+{ -+ struct journal_replay *i, *pos; -+ struct bch_devs_list devs = { .nr = 0 }; -+ struct list_head *where; -+ size_t bytes = vstruct_bytes(j); -+ __le64 last_seq; -+ int ret; -+ -+ last_seq = !list_empty(jlist->head) -+ ? list_last_entry(jlist->head, struct journal_replay, -+ list)->j.last_seq -+ : 0; -+ -+ if (!c->opts.read_entire_journal) { -+ /* Is this entry older than the range we need? */ -+ if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) { -+ ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE; -+ goto out; -+ } -+ -+ /* Drop entries we don't need anymore */ -+ list_for_each_entry_safe(i, pos, jlist->head, list) { -+ if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq)) -+ break; -+ list_del(&i->list); -+ kvpfree(i, offsetof(struct journal_replay, j) + -+ vstruct_bytes(&i->j)); -+ } -+ } -+ -+ list_for_each_entry_reverse(i, jlist->head, list) { -+ if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) { -+ where = &i->list; -+ goto add; -+ } -+ } -+ -+ where = jlist->head; -+add: -+ i = where->next != jlist->head -+ ? container_of(where->next, struct journal_replay, list) -+ : NULL; -+ -+ /* -+ * Duplicate journal entries? 
If so we want the one that didn't have a -+ * checksum error: -+ */ -+ if (i && le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) { -+ if (i->bad) { -+ devs = i->devs; -+ list_del(&i->list); -+ kvpfree(i, offsetof(struct journal_replay, j) + -+ vstruct_bytes(&i->j)); -+ } else if (bad) { -+ goto found; -+ } else { -+ fsck_err_on(bytes != vstruct_bytes(&i->j) || -+ memcmp(j, &i->j, bytes), c, -+ "found duplicate but non identical journal entries (seq %llu)", -+ le64_to_cpu(j->seq)); -+ goto found; -+ } -+ -+ } -+ -+ i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); -+ if (!i) { -+ ret = -ENOMEM; -+ goto out; -+ } -+ -+ list_add(&i->list, where); -+ i->devs = devs; -+ i->bad = bad; -+ memcpy(&i->j, j, bytes); -+found: -+ if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx)) -+ bch2_dev_list_add_dev(&i->devs, ca->dev_idx); -+ else -+ fsck_err_on(1, c, "duplicate journal entries on same device"); -+ ret = JOURNAL_ENTRY_ADD_OK; -+out: -+fsck_err: -+ return ret; -+} -+ -+static struct nonce journal_nonce(const struct jset *jset) -+{ -+ return (struct nonce) {{ -+ [0] = 0, -+ [1] = ((__le32 *) &jset->seq)[0], -+ [2] = ((__le32 *) &jset->seq)[1], -+ [3] = BCH_NONCE_JOURNAL, -+ }}; -+} -+ -+/* this fills in a range with empty jset_entries: */ -+static void journal_entry_null_range(void *start, void *end) -+{ -+ struct jset_entry *entry; -+ -+ for (entry = start; entry != end; entry = vstruct_next(entry)) -+ memset(entry, 0, sizeof(*entry)); -+} -+ -+#define JOURNAL_ENTRY_REREAD 5 -+#define JOURNAL_ENTRY_NONE 6 -+#define JOURNAL_ENTRY_BAD 7 -+ -+#define journal_entry_err(c, msg, ...) 
\ -+({ \ -+ switch (write) { \ -+ case READ: \ -+ mustfix_fsck_err(c, msg, ##__VA_ARGS__); \ -+ break; \ -+ case WRITE: \ -+ bch_err(c, "corrupt metadata before write:\n" \ -+ msg, ##__VA_ARGS__); \ -+ if (bch2_fs_inconsistent(c)) { \ -+ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ -+ goto fsck_err; \ -+ } \ -+ break; \ -+ } \ -+ true; \ -+}) -+ -+#define journal_entry_err_on(cond, c, msg, ...) \ -+ ((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false) -+ -+static int journal_validate_key(struct bch_fs *c, struct jset *jset, -+ struct jset_entry *entry, -+ unsigned level, enum btree_id btree_id, -+ struct bkey_i *k, -+ const char *type, int write) -+{ -+ void *next = vstruct_next(entry); -+ const char *invalid; -+ unsigned version = le32_to_cpu(jset->version); -+ int ret = 0; -+ -+ if (journal_entry_err_on(!k->k.u64s, c, -+ "invalid %s in journal: k->u64s 0", type)) { -+ entry->u64s = cpu_to_le16((u64 *) k - entry->_data); -+ journal_entry_null_range(vstruct_next(entry), next); -+ return 0; -+ } -+ -+ if (journal_entry_err_on((void *) bkey_next(k) > -+ (void *) vstruct_next(entry), c, -+ "invalid %s in journal: extends past end of journal entry", -+ type)) { -+ entry->u64s = cpu_to_le16((u64 *) k - entry->_data); -+ journal_entry_null_range(vstruct_next(entry), next); -+ return 0; -+ } -+ -+ if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c, -+ "invalid %s in journal: bad format %u", -+ type, k->k.format)) { -+ le16_add_cpu(&entry->u64s, -k->k.u64s); -+ memmove(k, bkey_next(k), next - (void *) bkey_next(k)); -+ journal_entry_null_range(vstruct_next(entry), next); -+ return 0; -+ } -+ -+ if (!write) -+ bch2_bkey_compat(level, btree_id, version, -+ JSET_BIG_ENDIAN(jset), write, -+ NULL, bkey_to_packed(k)); -+ -+ invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k), -+ __btree_node_type(level, btree_id)); -+ if (invalid) { -+ char buf[160]; -+ -+ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k)); -+ mustfix_fsck_err(c, "invalid %s in journal: %s\n%s", -+ type, 
invalid, buf); -+ -+ le16_add_cpu(&entry->u64s, -k->k.u64s); -+ memmove(k, bkey_next(k), next - (void *) bkey_next(k)); -+ journal_entry_null_range(vstruct_next(entry), next); -+ return 0; -+ } -+ -+ if (write) -+ bch2_bkey_compat(level, btree_id, version, -+ JSET_BIG_ENDIAN(jset), write, -+ NULL, bkey_to_packed(k)); -+fsck_err: -+ return ret; -+} -+ -+static int journal_entry_validate_btree_keys(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ int write) -+{ -+ struct bkey_i *k; -+ -+ vstruct_for_each(entry, k) { -+ int ret = journal_validate_key(c, jset, entry, -+ entry->level, -+ entry->btree_id, -+ k, "key", write); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+static int journal_entry_validate_btree_root(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ int write) -+{ -+ struct bkey_i *k = entry->start; -+ int ret = 0; -+ -+ if (journal_entry_err_on(!entry->u64s || -+ le16_to_cpu(entry->u64s) != k->k.u64s, c, -+ "invalid btree root journal entry: wrong number of keys")) { -+ void *next = vstruct_next(entry); -+ /* -+ * we don't want to null out this jset_entry, -+ * just the contents, so that later we can tell -+ * we were _supposed_ to have a btree root -+ */ -+ entry->u64s = 0; -+ journal_entry_null_range(vstruct_next(entry), next); -+ return 0; -+ } -+ -+ return journal_validate_key(c, jset, entry, 1, entry->btree_id, k, -+ "btree root", write); -+fsck_err: -+ return ret; -+} -+ -+static int journal_entry_validate_prio_ptrs(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ int write) -+{ -+ /* obsolete, don't care: */ -+ return 0; -+} -+ -+static int journal_entry_validate_blacklist(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ int write) -+{ -+ int ret = 0; -+ -+ if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, c, -+ "invalid journal seq blacklist entry: bad size")) { -+ journal_entry_null_range(entry, vstruct_next(entry)); -+ } -+fsck_err: -+ 
return ret; -+} -+ -+static int journal_entry_validate_blacklist_v2(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ int write) -+{ -+ struct jset_entry_blacklist_v2 *bl_entry; -+ int ret = 0; -+ -+ if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, c, -+ "invalid journal seq blacklist entry: bad size")) { -+ journal_entry_null_range(entry, vstruct_next(entry)); -+ goto out; -+ } -+ -+ bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); -+ -+ if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > -+ le64_to_cpu(bl_entry->end), c, -+ "invalid journal seq blacklist entry: start > end")) { -+ journal_entry_null_range(entry, vstruct_next(entry)); -+ } -+out: -+fsck_err: -+ return ret; -+} -+ -+static int journal_entry_validate_usage(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ int write) -+{ -+ struct jset_entry_usage *u = -+ container_of(entry, struct jset_entry_usage, entry); -+ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); -+ int ret = 0; -+ -+ if (journal_entry_err_on(bytes < sizeof(*u), -+ c, -+ "invalid journal entry usage: bad size")) { -+ journal_entry_null_range(entry, vstruct_next(entry)); -+ return ret; -+ } -+ -+fsck_err: -+ return ret; -+} -+ -+static int journal_entry_validate_data_usage(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ int write) -+{ -+ struct jset_entry_data_usage *u = -+ container_of(entry, struct jset_entry_data_usage, entry); -+ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); -+ int ret = 0; -+ -+ if (journal_entry_err_on(bytes < sizeof(*u) || -+ bytes < sizeof(*u) + u->r.nr_devs, -+ c, -+ "invalid journal entry usage: bad size")) { -+ journal_entry_null_range(entry, vstruct_next(entry)); -+ return ret; -+ } -+ -+fsck_err: -+ return ret; -+} -+ -+struct jset_entry_ops { -+ int (*validate)(struct bch_fs *, struct jset *, -+ struct jset_entry *, int); -+}; -+ -+static const struct jset_entry_ops 
bch2_jset_entry_ops[] = { -+#define x(f, nr) \ -+ [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \ -+ .validate = journal_entry_validate_##f, \ -+ }, -+ BCH_JSET_ENTRY_TYPES() -+#undef x -+}; -+ -+static int journal_entry_validate(struct bch_fs *c, struct jset *jset, -+ struct jset_entry *entry, int write) -+{ -+ return entry->type < BCH_JSET_ENTRY_NR -+ ? bch2_jset_entry_ops[entry->type].validate(c, jset, -+ entry, write) -+ : 0; -+} -+ -+static int jset_validate_entries(struct bch_fs *c, struct jset *jset, -+ int write) -+{ -+ struct jset_entry *entry; -+ int ret = 0; -+ -+ vstruct_for_each(jset, entry) { -+ if (journal_entry_err_on(vstruct_next(entry) > -+ vstruct_last(jset), c, -+ "journal entry extends past end of jset")) { -+ jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); -+ break; -+ } -+ -+ ret = journal_entry_validate(c, jset, entry, write); -+ if (ret) -+ break; -+ } -+fsck_err: -+ return ret; -+} -+ -+static int jset_validate(struct bch_fs *c, -+ struct bch_dev *ca, -+ struct jset *jset, u64 sector, -+ unsigned bucket_sectors_left, -+ unsigned sectors_read, -+ int write) -+{ -+ size_t bytes = vstruct_bytes(jset); -+ struct bch_csum csum; -+ unsigned version; -+ int ret = 0; -+ -+ if (le64_to_cpu(jset->magic) != jset_magic(c)) -+ return JOURNAL_ENTRY_NONE; -+ -+ version = le32_to_cpu(jset->version); -+ if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD && -+ version < bcachefs_metadata_version_min) || -+ version >= bcachefs_metadata_version_max, c, -+ "%s sector %llu seq %llu: unknown journal entry version %u", -+ ca->name, sector, le64_to_cpu(jset->seq), -+ version)) { -+ /* XXX: note we might have missing journal entries */ -+ return JOURNAL_ENTRY_BAD; -+ } -+ -+ if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c, -+ "%s sector %llu seq %llu: journal entry too big (%zu bytes)", -+ ca->name, sector, le64_to_cpu(jset->seq), bytes)) { -+ /* XXX: note we might have missing journal entries */ -+ return JOURNAL_ENTRY_BAD; -+ } -+ -+ 
if (bytes > sectors_read << 9) -+ return JOURNAL_ENTRY_REREAD; -+ -+ if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c, -+ "%s sector %llu seq %llu: journal entry with unknown csum type %llu", -+ ca->name, sector, le64_to_cpu(jset->seq), -+ JSET_CSUM_TYPE(jset))) -+ return JOURNAL_ENTRY_BAD; -+ -+ csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); -+ if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c, -+ "%s sector %llu seq %llu: journal checksum bad", -+ ca->name, sector, le64_to_cpu(jset->seq))) { -+ /* XXX: retry IO, when we start retrying checksum errors */ -+ /* XXX: note we might have missing journal entries */ -+ return JOURNAL_ENTRY_BAD; -+ } -+ -+ bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), -+ jset->encrypted_start, -+ vstruct_end(jset) - (void *) jset->encrypted_start); -+ -+ if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c, -+ "invalid journal entry: last_seq > seq")) { -+ jset->last_seq = jset->seq; -+ return JOURNAL_ENTRY_BAD; -+ } -+ -+ return 0; -+fsck_err: -+ return ret; -+} -+ -+struct journal_read_buf { -+ void *data; -+ size_t size; -+}; -+ -+static int journal_read_buf_realloc(struct journal_read_buf *b, -+ size_t new_size) -+{ -+ void *n; -+ -+ /* the bios are sized for this many pages, max: */ -+ if (new_size > JOURNAL_ENTRY_SIZE_MAX) -+ return -ENOMEM; -+ -+ new_size = roundup_pow_of_two(new_size); -+ n = kvpmalloc(new_size, GFP_KERNEL); -+ if (!n) -+ return -ENOMEM; -+ -+ kvpfree(b->data, b->size); -+ b->data = n; -+ b->size = new_size; -+ return 0; -+} -+ -+static int journal_read_bucket(struct bch_dev *ca, -+ struct journal_read_buf *buf, -+ struct journal_list *jlist, -+ unsigned bucket) -+{ -+ struct bch_fs *c = ca->fs; -+ struct journal_device *ja = &ca->journal; -+ struct jset *j = NULL; -+ unsigned sectors, sectors_read = 0; -+ u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), -+ end = offset + ca->mi.bucket_size; -+ bool saw_bad = 
false; -+ int ret = 0; -+ -+ pr_debug("reading %u", bucket); -+ -+ while (offset < end) { -+ if (!sectors_read) { -+ struct bio *bio; -+reread: -+ sectors_read = min_t(unsigned, -+ end - offset, buf->size >> 9); -+ -+ bio = bio_kmalloc(GFP_KERNEL, -+ buf_pages(buf->data, -+ sectors_read << 9)); -+ bio_set_dev(bio, ca->disk_sb.bdev); -+ bio->bi_iter.bi_sector = offset; -+ bio_set_op_attrs(bio, REQ_OP_READ, 0); -+ bch2_bio_map(bio, buf->data, sectors_read << 9); -+ -+ ret = submit_bio_wait(bio); -+ bio_put(bio); -+ -+ if (bch2_dev_io_err_on(ret, ca, -+ "journal read from sector %llu", -+ offset) || -+ bch2_meta_read_fault("journal")) -+ return -EIO; -+ -+ j = buf->data; -+ } -+ -+ ret = jset_validate(c, ca, j, offset, -+ end - offset, sectors_read, -+ READ); -+ switch (ret) { -+ case BCH_FSCK_OK: -+ sectors = vstruct_sectors(j, c->block_bits); -+ break; -+ case JOURNAL_ENTRY_REREAD: -+ if (vstruct_bytes(j) > buf->size) { -+ ret = journal_read_buf_realloc(buf, -+ vstruct_bytes(j)); -+ if (ret) -+ return ret; -+ } -+ goto reread; -+ case JOURNAL_ENTRY_NONE: -+ if (!saw_bad) -+ return 0; -+ sectors = c->opts.block_size; -+ goto next_block; -+ case JOURNAL_ENTRY_BAD: -+ saw_bad = true; -+ /* -+ * On checksum error we don't really trust the size -+ * field of the journal entry we read, so try reading -+ * again at next block boundary: -+ */ -+ sectors = c->opts.block_size; -+ break; -+ default: -+ return ret; -+ } -+ -+ /* -+ * This happens sometimes if we don't have discards on - -+ * when we've partially overwritten a bucket with new -+ * journal entries. 
We don't need the rest of the -+ * bucket: -+ */ -+ if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) -+ return 0; -+ -+ ja->bucket_seq[bucket] = le64_to_cpu(j->seq); -+ -+ mutex_lock(&jlist->lock); -+ ret = journal_entry_add(c, ca, jlist, j, ret != 0); -+ mutex_unlock(&jlist->lock); -+ -+ switch (ret) { -+ case JOURNAL_ENTRY_ADD_OK: -+ break; -+ case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: -+ break; -+ default: -+ return ret; -+ } -+next_block: -+ pr_debug("next"); -+ offset += sectors; -+ sectors_read -= sectors; -+ j = ((void *) j) + (sectors << 9); -+ } -+ -+ return 0; -+} -+ -+static void bch2_journal_read_device(struct closure *cl) -+{ -+ struct journal_device *ja = -+ container_of(cl, struct journal_device, read); -+ struct bch_dev *ca = container_of(ja, struct bch_dev, journal); -+ struct journal_list *jlist = -+ container_of(cl->parent, struct journal_list, cl); -+ struct journal_read_buf buf = { NULL, 0 }; -+ u64 min_seq = U64_MAX; -+ unsigned i; -+ int ret; -+ -+ if (!ja->nr) -+ goto out; -+ -+ ret = journal_read_buf_realloc(&buf, PAGE_SIZE); -+ if (ret) -+ goto err; -+ -+ pr_debug("%u journal buckets", ja->nr); -+ -+ for (i = 0; i < ja->nr; i++) { -+ ret = journal_read_bucket(ca, &buf, jlist, i); -+ if (ret) -+ goto err; -+ } -+ -+ /* Find the journal bucket with the highest sequence number: */ -+ for (i = 0; i < ja->nr; i++) { -+ if (ja->bucket_seq[i] > ja->bucket_seq[ja->cur_idx]) -+ ja->cur_idx = i; -+ -+ min_seq = min(ja->bucket_seq[i], min_seq); -+ } -+ -+ /* -+ * If there's duplicate journal entries in multiple buckets (which -+ * definitely isn't supposed to happen, but...) 
- make sure to start -+ * cur_idx at the last of those buckets, so we don't deadlock trying to -+ * allocate -+ */ -+ while (ja->bucket_seq[ja->cur_idx] > min_seq && -+ ja->bucket_seq[ja->cur_idx] > -+ ja->bucket_seq[(ja->cur_idx + 1) % ja->nr]) -+ ja->cur_idx = (ja->cur_idx + 1) % ja->nr; -+ -+ ja->sectors_free = 0; -+ -+ /* -+ * Set dirty_idx to indicate the entire journal is full and needs to be -+ * reclaimed - journal reclaim will immediately reclaim whatever isn't -+ * pinned when it first runs: -+ */ -+ ja->discard_idx = ja->dirty_idx_ondisk = -+ ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; -+out: -+ kvpfree(buf.data, buf.size); -+ percpu_ref_put(&ca->io_ref); -+ closure_return(cl); -+ return; -+err: -+ mutex_lock(&jlist->lock); -+ jlist->ret = ret; -+ mutex_unlock(&jlist->lock); -+ goto out; -+} -+ -+int bch2_journal_read(struct bch_fs *c, struct list_head *list) -+{ -+ struct journal_list jlist; -+ struct journal_replay *i; -+ struct bch_dev *ca; -+ unsigned iter; -+ size_t keys = 0, entries = 0; -+ bool degraded = false; -+ int ret = 0; -+ -+ closure_init_stack(&jlist.cl); -+ mutex_init(&jlist.lock); -+ jlist.head = list; -+ jlist.ret = 0; -+ -+ for_each_member_device(ca, c, iter) { -+ if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && -+ !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) -+ continue; -+ -+ if ((ca->mi.state == BCH_MEMBER_STATE_RW || -+ ca->mi.state == BCH_MEMBER_STATE_RO) && -+ percpu_ref_tryget(&ca->io_ref)) -+ closure_call(&ca->journal.read, -+ bch2_journal_read_device, -+ system_unbound_wq, -+ &jlist.cl); -+ else -+ degraded = true; -+ } -+ -+ closure_sync(&jlist.cl); -+ -+ if (jlist.ret) -+ return jlist.ret; -+ -+ list_for_each_entry(i, list, list) { -+ struct jset_entry *entry; -+ struct bkey_i *k, *_n; -+ struct bch_replicas_padded replicas; -+ char buf[80]; -+ -+ ret = jset_validate_entries(c, &i->j, READ); -+ if (ret) -+ goto fsck_err; -+ -+ /* -+ * If we're mounting in degraded mode - if we didn't read all -+ * the devices 
- this is wrong: -+ */ -+ -+ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, i->devs); -+ -+ if (!degraded && -+ (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || -+ fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c, -+ "superblock not marked as containing replicas %s", -+ (bch2_replicas_entry_to_text(&PBUF(buf), -+ &replicas.e), buf)))) { -+ ret = bch2_mark_replicas(c, &replicas.e); -+ if (ret) -+ return ret; -+ } -+ -+ for_each_jset_key(k, _n, entry, &i->j) -+ keys++; -+ entries++; -+ } -+ -+ if (!list_empty(list)) { -+ i = list_last_entry(list, struct journal_replay, list); -+ -+ bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu", -+ keys, entries, le64_to_cpu(i->j.seq)); -+ } -+fsck_err: -+ return ret; -+} -+ -+/* journal write: */ -+ -+static void __journal_write_alloc(struct journal *j, -+ struct journal_buf *w, -+ struct dev_alloc_list *devs_sorted, -+ unsigned sectors, -+ unsigned *replicas, -+ unsigned replicas_want) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct journal_device *ja; -+ struct bch_dev *ca; -+ unsigned i; -+ -+ if (*replicas >= replicas_want) -+ return; -+ -+ for (i = 0; i < devs_sorted->nr; i++) { -+ ca = rcu_dereference(c->devs[devs_sorted->devs[i]]); -+ if (!ca) -+ continue; -+ -+ ja = &ca->journal; -+ -+ /* -+ * Check that we can use this device, and aren't already using -+ * it: -+ */ -+ if (!ca->mi.durability || -+ ca->mi.state != BCH_MEMBER_STATE_RW || -+ !ja->nr || -+ bch2_bkey_has_device(bkey_i_to_s_c(&w->key), -+ ca->dev_idx) || -+ sectors > ja->sectors_free) -+ continue; -+ -+ bch2_dev_stripe_increment(ca, &j->wp.stripe); -+ -+ bch2_bkey_append_ptr(&w->key, -+ (struct bch_extent_ptr) { -+ .offset = bucket_to_sector(ca, -+ ja->buckets[ja->cur_idx]) + -+ ca->mi.bucket_size - -+ ja->sectors_free, -+ .dev = ca->dev_idx, -+ }); -+ -+ ja->sectors_free -= sectors; -+ ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); -+ -+ *replicas += ca->mi.durability; -+ -+ if (*replicas >= 
replicas_want) -+ break; -+ } -+} -+ -+/** -+ * journal_next_bucket - move on to the next journal bucket if possible -+ */ -+static int journal_write_alloc(struct journal *j, struct journal_buf *w, -+ unsigned sectors) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct journal_device *ja; -+ struct bch_dev *ca; -+ struct dev_alloc_list devs_sorted; -+ unsigned i, replicas = 0, replicas_want = -+ READ_ONCE(c->opts.metadata_replicas); -+ -+ rcu_read_lock(); -+ -+ devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, -+ &c->rw_devs[BCH_DATA_journal]); -+ -+ __journal_write_alloc(j, w, &devs_sorted, -+ sectors, &replicas, replicas_want); -+ -+ if (replicas >= replicas_want) -+ goto done; -+ -+ for (i = 0; i < devs_sorted.nr; i++) { -+ ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); -+ if (!ca) -+ continue; -+ -+ ja = &ca->journal; -+ -+ if (sectors > ja->sectors_free && -+ sectors <= ca->mi.bucket_size && -+ bch2_journal_dev_buckets_available(j, ja, -+ journal_space_discarded)) { -+ ja->cur_idx = (ja->cur_idx + 1) % ja->nr; -+ ja->sectors_free = ca->mi.bucket_size; -+ -+ /* -+ * ja->bucket_seq[ja->cur_idx] must always have -+ * something sensible: -+ */ -+ ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); -+ } -+ } -+ -+ __journal_write_alloc(j, w, &devs_sorted, -+ sectors, &replicas, replicas_want); -+done: -+ rcu_read_unlock(); -+ -+ return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS; -+} -+ -+static void journal_write_compact(struct jset *jset) -+{ -+ struct jset_entry *i, *next, *prev = NULL; -+ -+ /* -+ * Simple compaction, dropping empty jset_entries (from journal -+ * reservations that weren't fully used) and merging jset_entries that -+ * can be. 
-+ * -+ * If we wanted to be really fancy here, we could sort all the keys in -+ * the jset and drop keys that were overwritten - probably not worth it: -+ */ -+ vstruct_for_each_safe(jset, i, next) { -+ unsigned u64s = le16_to_cpu(i->u64s); -+ -+ /* Empty entry: */ -+ if (!u64s) -+ continue; -+ -+ /* Can we merge with previous entry? */ -+ if (prev && -+ i->btree_id == prev->btree_id && -+ i->level == prev->level && -+ i->type == prev->type && -+ i->type == BCH_JSET_ENTRY_btree_keys && -+ le16_to_cpu(prev->u64s) + u64s <= U16_MAX) { -+ memmove_u64s_down(vstruct_next(prev), -+ i->_data, -+ u64s); -+ le16_add_cpu(&prev->u64s, u64s); -+ continue; -+ } -+ -+ /* Couldn't merge, move i into new position (after prev): */ -+ prev = prev ? vstruct_next(prev) : jset->start; -+ if (i != prev) -+ memmove_u64s_down(prev, i, jset_u64s(u64s)); -+ } -+ -+ prev = prev ? vstruct_next(prev) : jset->start; -+ jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); -+} -+ -+static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) -+{ -+ /* we aren't holding j->lock: */ -+ unsigned new_size = READ_ONCE(j->buf_size_want); -+ void *new_buf; -+ -+ if (buf->buf_size >= new_size) -+ return; -+ -+ new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN); -+ if (!new_buf) -+ return; -+ -+ memcpy(new_buf, buf->data, buf->buf_size); -+ kvpfree(buf->data, buf->buf_size); -+ buf->data = new_buf; -+ buf->buf_size = new_size; -+} -+ -+static void journal_write_done(struct closure *cl) -+{ -+ struct journal *j = container_of(cl, struct journal, io); -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct journal_buf *w = journal_prev_buf(j); -+ struct bch_devs_list devs = -+ bch2_bkey_devs(bkey_i_to_s_c(&w->key)); -+ struct bch_replicas_padded replicas; -+ u64 seq = le64_to_cpu(w->data->seq); -+ u64 last_seq = le64_to_cpu(w->data->last_seq); -+ -+ bch2_time_stats_update(j->write_time, j->write_start_time); -+ -+ if (!devs.nr) { -+ bch_err(c, "unable to write journal to 
sufficient devices"); -+ goto err; -+ } -+ -+ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs); -+ -+ if (bch2_mark_replicas(c, &replicas.e)) -+ goto err; -+ -+ spin_lock(&j->lock); -+ if (seq >= j->pin.front) -+ journal_seq_pin(j, seq)->devs = devs; -+ -+ j->seq_ondisk = seq; -+ j->last_seq_ondisk = last_seq; -+ bch2_journal_space_available(j); -+ -+ /* -+ * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard -+ * more buckets: -+ * -+ * Must come before signaling write completion, for -+ * bch2_fs_journal_stop(): -+ */ -+ mod_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0); -+out: -+ /* also must come before signalling write completion: */ -+ closure_debug_destroy(cl); -+ -+ BUG_ON(!j->reservations.prev_buf_unwritten); -+ atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v, -+ &j->reservations.counter); -+ -+ closure_wake_up(&w->wait); -+ journal_wake(j); -+ -+ if (test_bit(JOURNAL_NEED_WRITE, &j->flags)) -+ mod_delayed_work(system_freezable_wq, &j->write_work, 0); -+ spin_unlock(&j->lock); -+ return; -+err: -+ bch2_fatal_error(c); -+ spin_lock(&j->lock); -+ goto out; -+} -+ -+static void journal_write_endio(struct bio *bio) -+{ -+ struct bch_dev *ca = bio->bi_private; -+ struct journal *j = &ca->fs->journal; -+ -+ if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write: %s", -+ bch2_blk_status_to_str(bio->bi_status)) || -+ bch2_meta_write_fault("journal")) { -+ struct journal_buf *w = journal_prev_buf(j); -+ unsigned long flags; -+ -+ spin_lock_irqsave(&j->err_lock, flags); -+ bch2_bkey_drop_device(bkey_i_to_s(&w->key), ca->dev_idx); -+ spin_unlock_irqrestore(&j->err_lock, flags); -+ } -+ -+ closure_put(&j->io); -+ percpu_ref_put(&ca->io_ref); -+} -+ -+void bch2_journal_write(struct closure *cl) -+{ -+ struct journal *j = container_of(cl, struct journal, io); -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct bch_dev *ca; -+ struct journal_buf *w = journal_prev_buf(j); -+ struct 
jset_entry *start, *end; -+ struct jset *jset; -+ struct bio *bio; -+ struct bch_extent_ptr *ptr; -+ bool validate_before_checksum = false; -+ unsigned i, sectors, bytes, u64s; -+ int ret; -+ -+ bch2_journal_pin_put(j, le64_to_cpu(w->data->seq)); -+ -+ journal_buf_realloc(j, w); -+ jset = w->data; -+ -+ j->write_start_time = local_clock(); -+ -+ /* -+ * New btree roots are set by journalling them; when the journal entry -+ * gets written we have to propagate them to c->btree_roots -+ * -+ * But, every journal entry we write has to contain all the btree roots -+ * (at least for now); so after we copy btree roots to c->btree_roots we -+ * have to get any missing btree roots and add them to this journal -+ * entry: -+ */ -+ -+ bch2_journal_entries_to_btree_roots(c, jset); -+ -+ start = end = vstruct_last(jset); -+ -+ end = bch2_btree_roots_to_journal_entries(c, jset->start, end); -+ -+ end = bch2_journal_super_entries_add_common(c, end, -+ le64_to_cpu(jset->seq)); -+ u64s = (u64 *) end - (u64 *) start; -+ BUG_ON(u64s > j->entry_u64s_reserved); -+ -+ le32_add_cpu(&jset->u64s, u64s); -+ BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors); -+ -+ journal_write_compact(jset); -+ -+ jset->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); -+ jset->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); -+ jset->magic = cpu_to_le64(jset_magic(c)); -+ -+ jset->version = c->sb.version < bcachefs_metadata_version_new_versioning -+ ? 
cpu_to_le32(BCH_JSET_VERSION_OLD) -+ : cpu_to_le32(c->sb.version); -+ -+ SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); -+ SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); -+ -+ if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) -+ validate_before_checksum = true; -+ -+ if (le32_to_cpu(jset->version) < bcachefs_metadata_version_max) -+ validate_before_checksum = true; -+ -+ if (validate_before_checksum && -+ jset_validate_entries(c, jset, WRITE)) -+ goto err; -+ -+ bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), -+ jset->encrypted_start, -+ vstruct_end(jset) - (void *) jset->encrypted_start); -+ -+ jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), -+ journal_nonce(jset), jset); -+ -+ if (!validate_before_checksum && -+ jset_validate_entries(c, jset, WRITE)) -+ goto err; -+ -+ sectors = vstruct_sectors(jset, c->block_bits); -+ BUG_ON(sectors > w->sectors); -+ -+ bytes = vstruct_bytes(jset); -+ memset((void *) jset + bytes, 0, (sectors << 9) - bytes); -+ -+retry_alloc: -+ spin_lock(&j->lock); -+ ret = journal_write_alloc(j, w, sectors); -+ -+ if (ret && j->can_discard) { -+ spin_unlock(&j->lock); -+ bch2_journal_do_discards(j); -+ goto retry_alloc; -+ } -+ -+ /* -+ * write is allocated, no longer need to account for it in -+ * bch2_journal_space_available(): -+ */ -+ w->sectors = 0; -+ -+ /* -+ * journal entry has been compacted and allocated, recalculate space -+ * available: -+ */ -+ bch2_journal_space_available(j); -+ spin_unlock(&j->lock); -+ -+ if (ret) { -+ bch_err(c, "Unable to allocate journal write"); -+ bch2_fatal_error(c); -+ continue_at(cl, journal_write_done, system_highpri_wq); -+ return; -+ } -+ -+ /* -+ * XXX: we really should just disable the entire journal in nochanges -+ * mode -+ */ -+ if (c->opts.nochanges) -+ goto no_io; -+ -+ extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { -+ ca = bch_dev_bkey_exists(c, ptr->dev); -+ if (!percpu_ref_tryget(&ca->io_ref)) { -+ /* XXX: fix this */ -+ bch_err(c, "missing device for journal 
write\n"); -+ continue; -+ } -+ -+ this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], -+ sectors); -+ -+ bio = ca->journal.bio; -+ bio_reset(bio); -+ bio_set_dev(bio, ca->disk_sb.bdev); -+ bio->bi_iter.bi_sector = ptr->offset; -+ bio->bi_end_io = journal_write_endio; -+ bio->bi_private = ca; -+ bio_set_op_attrs(bio, REQ_OP_WRITE, -+ REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA); -+ bch2_bio_map(bio, jset, sectors << 9); -+ -+ trace_journal_write(bio); -+ closure_bio_submit(bio, cl); -+ -+ ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq); -+ } -+ -+ for_each_rw_member(ca, c, i) -+ if (journal_flushes_device(ca) && -+ !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) { -+ percpu_ref_get(&ca->io_ref); -+ -+ bio = ca->journal.bio; -+ bio_reset(bio); -+ bio_set_dev(bio, ca->disk_sb.bdev); -+ bio->bi_opf = REQ_OP_FLUSH; -+ bio->bi_end_io = journal_write_endio; -+ bio->bi_private = ca; -+ closure_bio_submit(bio, cl); -+ } -+ -+no_io: -+ bch2_bucket_seq_cleanup(c); -+ -+ continue_at(cl, journal_write_done, system_highpri_wq); -+ return; -+err: -+ bch2_inconsistent_error(c); -+ continue_at(cl, journal_write_done, system_highpri_wq); -+} -diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h -new file mode 100644 -index 000000000000..6958ee0f8cf2 ---- /dev/null -+++ b/fs/bcachefs/journal_io.h -@@ -0,0 +1,44 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_JOURNAL_IO_H -+#define _BCACHEFS_JOURNAL_IO_H -+ -+/* -+ * Only used for holding the journal entries we read in btree_journal_read() -+ * during cache_registration -+ */ -+struct journal_replay { -+ struct list_head list; -+ struct bch_devs_list devs; -+ /* checksum error, but we may want to try using it anyways: */ -+ bool bad; -+ /* must be last: */ -+ struct jset j; -+}; -+ -+static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, -+ struct jset_entry *entry, unsigned type) -+{ -+ while (entry < vstruct_last(jset)) { -+ if (entry->type == type) -+ return 
entry; -+ -+ entry = vstruct_next(entry); -+ } -+ -+ return NULL; -+} -+ -+#define for_each_jset_entry_type(entry, jset, type) \ -+ for (entry = (jset)->start; \ -+ (entry = __jset_entry_type_next(jset, entry, type)); \ -+ entry = vstruct_next(entry)) -+ -+#define for_each_jset_key(k, _n, entry, jset) \ -+ for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \ -+ vstruct_for_each_safe(entry, k, _n) -+ -+int bch2_journal_read(struct bch_fs *, struct list_head *); -+ -+void bch2_journal_write(struct closure *); -+ -+#endif /* _BCACHEFS_JOURNAL_IO_H */ -diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c -new file mode 100644 -index 000000000000..57591983eebd ---- /dev/null -+++ b/fs/bcachefs/journal_reclaim.c -@@ -0,0 +1,644 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "journal.h" -+#include "journal_io.h" -+#include "journal_reclaim.h" -+#include "replicas.h" -+#include "super.h" -+ -+/* Free space calculations: */ -+ -+static unsigned journal_space_from(struct journal_device *ja, -+ enum journal_space_from from) -+{ -+ switch (from) { -+ case journal_space_discarded: -+ return ja->discard_idx; -+ case journal_space_clean_ondisk: -+ return ja->dirty_idx_ondisk; -+ case journal_space_clean: -+ return ja->dirty_idx; -+ default: -+ BUG(); -+ } -+} -+ -+unsigned bch2_journal_dev_buckets_available(struct journal *j, -+ struct journal_device *ja, -+ enum journal_space_from from) -+{ -+ unsigned available = (journal_space_from(ja, from) - -+ ja->cur_idx - 1 + ja->nr) % ja->nr; -+ -+ /* -+ * Don't use the last bucket unless writing the new last_seq -+ * will make another bucket available: -+ */ -+ if (available && ja->dirty_idx_ondisk == ja->dirty_idx) -+ --available; -+ -+ return available; -+} -+ -+static void journal_set_remaining(struct journal *j, unsigned u64s_remaining) -+{ -+ union journal_preres_state old, new; -+ u64 v = atomic64_read(&j->prereserved.counter); -+ -+ do { -+ old.v = new.v = v; -+ 
new.remaining = u64s_remaining; -+ } while ((v = atomic64_cmpxchg(&j->prereserved.counter, -+ old.v, new.v)) != old.v); -+} -+ -+static struct journal_space { -+ unsigned next_entry; -+ unsigned remaining; -+} __journal_space_available(struct journal *j, unsigned nr_devs_want, -+ enum journal_space_from from) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct bch_dev *ca; -+ unsigned sectors_next_entry = UINT_MAX; -+ unsigned sectors_total = UINT_MAX; -+ unsigned i, nr_devs = 0; -+ unsigned unwritten_sectors = j->reservations.prev_buf_unwritten -+ ? journal_prev_buf(j)->sectors -+ : 0; -+ -+ rcu_read_lock(); -+ for_each_member_device_rcu(ca, c, i, -+ &c->rw_devs[BCH_DATA_journal]) { -+ struct journal_device *ja = &ca->journal; -+ unsigned buckets_this_device, sectors_this_device; -+ -+ if (!ja->nr) -+ continue; -+ -+ buckets_this_device = bch2_journal_dev_buckets_available(j, ja, from); -+ sectors_this_device = ja->sectors_free; -+ -+ /* -+ * We that we don't allocate the space for a journal entry -+ * until we write it out - thus, account for it here: -+ */ -+ if (unwritten_sectors >= sectors_this_device) { -+ if (!buckets_this_device) -+ continue; -+ -+ buckets_this_device--; -+ sectors_this_device = ca->mi.bucket_size; -+ } -+ -+ sectors_this_device -= unwritten_sectors; -+ -+ if (sectors_this_device < ca->mi.bucket_size && -+ buckets_this_device) { -+ buckets_this_device--; -+ sectors_this_device = ca->mi.bucket_size; -+ } -+ -+ if (!sectors_this_device) -+ continue; -+ -+ sectors_next_entry = min(sectors_next_entry, -+ sectors_this_device); -+ -+ sectors_total = min(sectors_total, -+ buckets_this_device * ca->mi.bucket_size + -+ sectors_this_device); -+ -+ nr_devs++; -+ } -+ rcu_read_unlock(); -+ -+ if (nr_devs < nr_devs_want) -+ return (struct journal_space) { 0, 0 }; -+ -+ return (struct journal_space) { -+ .next_entry = sectors_next_entry, -+ .remaining = max_t(int, 0, sectors_total - sectors_next_entry), -+ }; -+} -+ -+void 
bch2_journal_space_available(struct journal *j) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct bch_dev *ca; -+ struct journal_space discarded, clean_ondisk, clean; -+ unsigned overhead, u64s_remaining = 0; -+ unsigned max_entry_size = min(j->buf[0].buf_size >> 9, -+ j->buf[1].buf_size >> 9); -+ unsigned i, nr_online = 0, nr_devs_want; -+ bool can_discard = false; -+ int ret = 0; -+ -+ lockdep_assert_held(&j->lock); -+ -+ rcu_read_lock(); -+ for_each_member_device_rcu(ca, c, i, -+ &c->rw_devs[BCH_DATA_journal]) { -+ struct journal_device *ja = &ca->journal; -+ -+ if (!ja->nr) -+ continue; -+ -+ while (ja->dirty_idx != ja->cur_idx && -+ ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j)) -+ ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; -+ -+ while (ja->dirty_idx_ondisk != ja->dirty_idx && -+ ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk) -+ ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; -+ -+ if (ja->discard_idx != ja->dirty_idx_ondisk) -+ can_discard = true; -+ -+ max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size); -+ nr_online++; -+ } -+ rcu_read_unlock(); -+ -+ j->can_discard = can_discard; -+ -+ if (nr_online < c->opts.metadata_replicas_required) { -+ ret = -EROFS; -+ goto out; -+ } -+ -+ if (!fifo_free(&j->pin)) { -+ ret = -ENOSPC; -+ goto out; -+ } -+ -+ nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas); -+ -+ discarded = __journal_space_available(j, nr_devs_want, journal_space_discarded); -+ clean_ondisk = __journal_space_available(j, nr_devs_want, journal_space_clean_ondisk); -+ clean = __journal_space_available(j, nr_devs_want, journal_space_clean); -+ -+ if (!discarded.next_entry) -+ ret = -ENOSPC; -+ -+ overhead = DIV_ROUND_UP(clean.remaining, max_entry_size) * -+ journal_entry_overhead(j); -+ u64s_remaining = clean.remaining << 6; -+ u64s_remaining = max_t(int, 0, u64s_remaining - overhead); -+ u64s_remaining /= 4; -+out: -+ j->cur_entry_sectors = !ret ? 
discarded.next_entry : 0; -+ j->cur_entry_error = ret; -+ journal_set_remaining(j, u64s_remaining); -+ journal_check_may_get_unreserved(j); -+ -+ if (!ret) -+ journal_wake(j); -+} -+ -+/* Discards - last part of journal reclaim: */ -+ -+static bool should_discard_bucket(struct journal *j, struct journal_device *ja) -+{ -+ bool ret; -+ -+ spin_lock(&j->lock); -+ ret = ja->discard_idx != ja->dirty_idx_ondisk; -+ spin_unlock(&j->lock); -+ -+ return ret; -+} -+ -+/* -+ * Advance ja->discard_idx as long as it points to buckets that are no longer -+ * dirty, issuing discards if necessary: -+ */ -+void bch2_journal_do_discards(struct journal *j) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct bch_dev *ca; -+ unsigned iter; -+ -+ mutex_lock(&j->discard_lock); -+ -+ for_each_rw_member(ca, c, iter) { -+ struct journal_device *ja = &ca->journal; -+ -+ while (should_discard_bucket(j, ja)) { -+ if (ca->mi.discard && -+ blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) -+ blkdev_issue_discard(ca->disk_sb.bdev, -+ bucket_to_sector(ca, -+ ja->buckets[ja->discard_idx]), -+ ca->mi.bucket_size, GFP_NOIO, 0); -+ -+ spin_lock(&j->lock); -+ ja->discard_idx = (ja->discard_idx + 1) % ja->nr; -+ -+ bch2_journal_space_available(j); -+ spin_unlock(&j->lock); -+ } -+ } -+ -+ mutex_unlock(&j->discard_lock); -+} -+ -+/* -+ * Journal entry pinning - machinery for holding a reference on a given journal -+ * entry, holding it open to ensure it gets replayed during recovery: -+ */ -+ -+static void bch2_journal_reclaim_fast(struct journal *j) -+{ -+ struct journal_entry_pin_list temp; -+ bool popped = false; -+ -+ lockdep_assert_held(&j->lock); -+ -+ /* -+ * Unpin journal entries whose reference counts reached zero, meaning -+ * all btree nodes got written out -+ */ -+ while (!fifo_empty(&j->pin) && -+ !atomic_read(&fifo_peek_front(&j->pin).count)) { -+ BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list)); -+ BUG_ON(!fifo_pop(&j->pin, temp)); -+ popped = true; -+ } -+ 
-+ if (popped) -+ bch2_journal_space_available(j); -+} -+ -+void bch2_journal_pin_put(struct journal *j, u64 seq) -+{ -+ struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); -+ -+ if (atomic_dec_and_test(&pin_list->count)) { -+ spin_lock(&j->lock); -+ bch2_journal_reclaim_fast(j); -+ spin_unlock(&j->lock); -+ } -+} -+ -+static inline void __journal_pin_drop(struct journal *j, -+ struct journal_entry_pin *pin) -+{ -+ struct journal_entry_pin_list *pin_list; -+ -+ if (!journal_pin_active(pin)) -+ return; -+ -+ pin_list = journal_seq_pin(j, pin->seq); -+ pin->seq = 0; -+ list_del_init(&pin->list); -+ -+ /* -+ * Unpinning a journal entry make make journal_next_bucket() succeed, if -+ * writing a new last_seq will now make another bucket available: -+ */ -+ if (atomic_dec_and_test(&pin_list->count) && -+ pin_list == &fifo_peek_front(&j->pin)) -+ bch2_journal_reclaim_fast(j); -+ else if (fifo_used(&j->pin) == 1 && -+ atomic_read(&pin_list->count) == 1) -+ journal_wake(j); -+} -+ -+void bch2_journal_pin_drop(struct journal *j, -+ struct journal_entry_pin *pin) -+{ -+ spin_lock(&j->lock); -+ __journal_pin_drop(j, pin); -+ spin_unlock(&j->lock); -+} -+ -+static void bch2_journal_pin_add_locked(struct journal *j, u64 seq, -+ struct journal_entry_pin *pin, -+ journal_pin_flush_fn flush_fn) -+{ -+ struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); -+ -+ __journal_pin_drop(j, pin); -+ -+ BUG_ON(!atomic_read(&pin_list->count) && seq == journal_last_seq(j)); -+ -+ atomic_inc(&pin_list->count); -+ pin->seq = seq; -+ pin->flush = flush_fn; -+ -+ list_add(&pin->list, flush_fn ? 
&pin_list->list : &pin_list->flushed); -+} -+ -+void __bch2_journal_pin_add(struct journal *j, u64 seq, -+ struct journal_entry_pin *pin, -+ journal_pin_flush_fn flush_fn) -+{ -+ spin_lock(&j->lock); -+ bch2_journal_pin_add_locked(j, seq, pin, flush_fn); -+ spin_unlock(&j->lock); -+ -+ /* -+ * If the journal is currently full, we might want to call flush_fn -+ * immediately: -+ */ -+ journal_wake(j); -+} -+ -+void bch2_journal_pin_update(struct journal *j, u64 seq, -+ struct journal_entry_pin *pin, -+ journal_pin_flush_fn flush_fn) -+{ -+ if (journal_pin_active(pin) && pin->seq < seq) -+ return; -+ -+ spin_lock(&j->lock); -+ -+ if (pin->seq != seq) { -+ bch2_journal_pin_add_locked(j, seq, pin, flush_fn); -+ } else { -+ struct journal_entry_pin_list *pin_list = -+ journal_seq_pin(j, seq); -+ -+ /* -+ * If the pin is already pinning the right sequence number, it -+ * still might've already been flushed: -+ */ -+ list_move(&pin->list, &pin_list->list); -+ } -+ -+ spin_unlock(&j->lock); -+ -+ /* -+ * If the journal is currently full, we might want to call flush_fn -+ * immediately: -+ */ -+ journal_wake(j); -+} -+ -+void bch2_journal_pin_copy(struct journal *j, -+ struct journal_entry_pin *dst, -+ struct journal_entry_pin *src, -+ journal_pin_flush_fn flush_fn) -+{ -+ spin_lock(&j->lock); -+ -+ if (journal_pin_active(src) && -+ (!journal_pin_active(dst) || src->seq < dst->seq)) -+ bch2_journal_pin_add_locked(j, src->seq, dst, flush_fn); -+ -+ spin_unlock(&j->lock); -+} -+ -+/** -+ * bch2_journal_pin_flush: ensure journal pin callback is no longer running -+ */ -+void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin) -+{ -+ BUG_ON(journal_pin_active(pin)); -+ -+ wait_event(j->pin_flush_wait, j->flush_in_progress != pin); -+} -+ -+/* -+ * Journal reclaim: flush references to open journal entries to reclaim space in -+ * the journal -+ * -+ * May be done by the journal code in the background as needed to free up space -+ * for more journal entries, 
or as part of doing a clean shutdown, or to migrate -+ * data off of a specific device: -+ */ -+ -+static struct journal_entry_pin * -+journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq) -+{ -+ struct journal_entry_pin_list *pin_list; -+ struct journal_entry_pin *ret = NULL; -+ -+ if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)) -+ return NULL; -+ -+ spin_lock(&j->lock); -+ -+ fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) -+ if (*seq > max_seq || -+ (ret = list_first_entry_or_null(&pin_list->list, -+ struct journal_entry_pin, list))) -+ break; -+ -+ if (ret) { -+ list_move(&ret->list, &pin_list->flushed); -+ BUG_ON(j->flush_in_progress); -+ j->flush_in_progress = ret; -+ j->last_flushed = jiffies; -+ } -+ -+ spin_unlock(&j->lock); -+ -+ return ret; -+} -+ -+/* returns true if we did work */ -+static bool journal_flush_pins(struct journal *j, u64 seq_to_flush, -+ unsigned min_nr) -+{ -+ struct journal_entry_pin *pin; -+ bool ret = false; -+ u64 seq; -+ -+ lockdep_assert_held(&j->reclaim_lock); -+ -+ while ((pin = journal_get_next_pin(j, min_nr -+ ? U64_MAX : seq_to_flush, &seq))) { -+ if (min_nr) -+ min_nr--; -+ -+ pin->flush(j, pin, seq); -+ -+ BUG_ON(j->flush_in_progress != pin); -+ j->flush_in_progress = NULL; -+ wake_up(&j->pin_flush_wait); -+ ret = true; -+ } -+ -+ return ret; -+} -+ -+/** -+ * bch2_journal_reclaim - free up journal buckets -+ * -+ * Background journal reclaim writes out btree nodes. It should be run -+ * early enough so that we never completely run out of journal buckets. 
-+ * -+ * High watermarks for triggering background reclaim: -+ * - FIFO has fewer than 512 entries left -+ * - fewer than 25% journal buckets free -+ * -+ * Background reclaim runs until low watermarks are reached: -+ * - FIFO has more than 1024 entries left -+ * - more than 50% journal buckets free -+ * -+ * As long as a reclaim can complete in the time it takes to fill up -+ * 512 journal entries or 25% of all journal buckets, then -+ * journal_next_bucket() should not stall. -+ */ -+void bch2_journal_reclaim(struct journal *j) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct bch_dev *ca; -+ unsigned iter, min_nr = 0; -+ u64 seq_to_flush = 0; -+ -+ lockdep_assert_held(&j->reclaim_lock); -+ -+ bch2_journal_do_discards(j); -+ -+ spin_lock(&j->lock); -+ -+ for_each_rw_member(ca, c, iter) { -+ struct journal_device *ja = &ca->journal; -+ unsigned nr_buckets, bucket_to_flush; -+ -+ if (!ja->nr) -+ continue; -+ -+ /* Try to keep the journal at most half full: */ -+ nr_buckets = ja->nr / 2; -+ -+ /* And include pre-reservations: */ -+ nr_buckets += DIV_ROUND_UP(j->prereserved.reserved, -+ (ca->mi.bucket_size << 6) - -+ journal_entry_overhead(j)); -+ -+ nr_buckets = min(nr_buckets, ja->nr); -+ -+ bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr; -+ seq_to_flush = max(seq_to_flush, -+ ja->bucket_seq[bucket_to_flush]); -+ } -+ -+ /* Also flush if the pin fifo is more than half full */ -+ seq_to_flush = max_t(s64, seq_to_flush, -+ (s64) journal_cur_seq(j) - -+ (j->pin.size >> 1)); -+ spin_unlock(&j->lock); -+ -+ /* -+ * If it's been longer than j->reclaim_delay_ms since we last flushed, -+ * make sure to flush at least one journal pin: -+ */ -+ if (time_after(jiffies, j->last_flushed + -+ msecs_to_jiffies(j->reclaim_delay_ms))) -+ min_nr = 1; -+ -+ if (j->prereserved.reserved * 2 > j->prereserved.remaining) { -+ seq_to_flush = max(seq_to_flush, journal_last_seq(j)); -+ min_nr = 1; -+ } -+ -+ journal_flush_pins(j, seq_to_flush, min_nr); -+ 
-+ if (!bch2_journal_error(j)) -+ queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, -+ msecs_to_jiffies(j->reclaim_delay_ms)); -+} -+ -+void bch2_journal_reclaim_work(struct work_struct *work) -+{ -+ struct journal *j = container_of(to_delayed_work(work), -+ struct journal, reclaim_work); -+ -+ mutex_lock(&j->reclaim_lock); -+ bch2_journal_reclaim(j); -+ mutex_unlock(&j->reclaim_lock); -+} -+ -+static int journal_flush_done(struct journal *j, u64 seq_to_flush, -+ bool *did_work) -+{ -+ int ret; -+ -+ ret = bch2_journal_error(j); -+ if (ret) -+ return ret; -+ -+ mutex_lock(&j->reclaim_lock); -+ -+ *did_work = journal_flush_pins(j, seq_to_flush, 0); -+ -+ spin_lock(&j->lock); -+ /* -+ * If journal replay hasn't completed, the unreplayed journal entries -+ * hold refs on their corresponding sequence numbers -+ */ -+ ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) || -+ journal_last_seq(j) > seq_to_flush || -+ (fifo_used(&j->pin) == 1 && -+ atomic_read(&fifo_peek_front(&j->pin).count) == 1); -+ -+ spin_unlock(&j->lock); -+ mutex_unlock(&j->reclaim_lock); -+ -+ return ret; -+} -+ -+bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) -+{ -+ bool did_work = false; -+ -+ if (!test_bit(JOURNAL_STARTED, &j->flags)) -+ return false; -+ -+ closure_wait_event(&j->async_wait, -+ journal_flush_done(j, seq_to_flush, &did_work)); -+ -+ return did_work; -+} -+ -+int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct journal_entry_pin_list *p; -+ u64 iter, seq = 0; -+ int ret = 0; -+ -+ spin_lock(&j->lock); -+ fifo_for_each_entry_ptr(p, &j->pin, iter) -+ if (dev_idx >= 0 -+ ? 
bch2_dev_list_has_dev(p->devs, dev_idx) -+ : p->devs.nr < c->opts.metadata_replicas) -+ seq = iter; -+ spin_unlock(&j->lock); -+ -+ bch2_journal_flush_pins(j, seq); -+ -+ ret = bch2_journal_error(j); -+ if (ret) -+ return ret; -+ -+ mutex_lock(&c->replicas_gc_lock); -+ bch2_replicas_gc_start(c, 1 << BCH_DATA_journal); -+ -+ seq = 0; -+ -+ spin_lock(&j->lock); -+ while (!ret && seq < j->pin.back) { -+ struct bch_replicas_padded replicas; -+ -+ seq = max(seq, journal_last_seq(j)); -+ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, -+ journal_seq_pin(j, seq)->devs); -+ seq++; -+ -+ spin_unlock(&j->lock); -+ ret = bch2_mark_replicas(c, &replicas.e); -+ spin_lock(&j->lock); -+ } -+ spin_unlock(&j->lock); -+ -+ ret = bch2_replicas_gc_end(c, ret); -+ mutex_unlock(&c->replicas_gc_lock); -+ -+ return ret; -+} -diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h -new file mode 100644 -index 000000000000..8128907a7623 ---- /dev/null -+++ b/fs/bcachefs/journal_reclaim.h -@@ -0,0 +1,69 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_JOURNAL_RECLAIM_H -+#define _BCACHEFS_JOURNAL_RECLAIM_H -+ -+#define JOURNAL_PIN (32 * 1024) -+ -+enum journal_space_from { -+ journal_space_discarded, -+ journal_space_clean_ondisk, -+ journal_space_clean, -+}; -+ -+unsigned bch2_journal_dev_buckets_available(struct journal *, -+ struct journal_device *, -+ enum journal_space_from); -+void bch2_journal_space_available(struct journal *); -+ -+static inline bool journal_pin_active(struct journal_entry_pin *pin) -+{ -+ return pin->seq != 0; -+} -+ -+static inline struct journal_entry_pin_list * -+journal_seq_pin(struct journal *j, u64 seq) -+{ -+ EBUG_ON(seq < j->pin.front || seq >= j->pin.back); -+ -+ return &j->pin.data[seq & j->pin.mask]; -+} -+ -+void bch2_journal_pin_put(struct journal *, u64); -+void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); -+ -+void __bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin 
*, -+ journal_pin_flush_fn); -+ -+static inline void bch2_journal_pin_add(struct journal *j, u64 seq, -+ struct journal_entry_pin *pin, -+ journal_pin_flush_fn flush_fn) -+{ -+ if (unlikely(!journal_pin_active(pin) || pin->seq > seq)) -+ __bch2_journal_pin_add(j, seq, pin, flush_fn); -+} -+ -+void bch2_journal_pin_update(struct journal *, u64, -+ struct journal_entry_pin *, -+ journal_pin_flush_fn); -+ -+void bch2_journal_pin_copy(struct journal *, -+ struct journal_entry_pin *, -+ struct journal_entry_pin *, -+ journal_pin_flush_fn); -+ -+void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *); -+ -+void bch2_journal_do_discards(struct journal *); -+void bch2_journal_reclaim(struct journal *); -+void bch2_journal_reclaim_work(struct work_struct *); -+ -+bool bch2_journal_flush_pins(struct journal *, u64); -+ -+static inline bool bch2_journal_flush_all_pins(struct journal *j) -+{ -+ return bch2_journal_flush_pins(j, U64_MAX); -+} -+ -+int bch2_journal_flush_device_pins(struct journal *, int); -+ -+#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */ -diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c -new file mode 100644 -index 000000000000..d0f1bbf8f6a7 ---- /dev/null -+++ b/fs/bcachefs/journal_seq_blacklist.c -@@ -0,0 +1,309 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "btree_iter.h" -+#include "eytzinger.h" -+#include "journal_seq_blacklist.h" -+#include "super-io.h" -+ -+/* -+ * journal_seq_blacklist machinery: -+ * -+ * To guarantee order of btree updates after a crash, we need to detect when a -+ * btree node entry (bset) is newer than the newest journal entry that was -+ * successfully written, and ignore it - effectively ignoring any btree updates -+ * that didn't make it into the journal. 
-+ * -+ * If we didn't do this, we might have two btree nodes, a and b, both with -+ * updates that weren't written to the journal yet: if b was updated after a, -+ * but b was flushed and not a - oops; on recovery we'll find that the updates -+ * to b happened, but not the updates to a that happened before it. -+ * -+ * Ignoring bsets that are newer than the newest journal entry is always safe, -+ * because everything they contain will also have been journalled - and must -+ * still be present in the journal on disk until a journal entry has been -+ * written _after_ that bset was written. -+ * -+ * To accomplish this, bsets record the newest journal sequence number they -+ * contain updates for; then, on startup, the btree code queries the journal -+ * code to ask "Is this sequence number newer than the newest journal entry? If -+ * so, ignore it." -+ * -+ * When this happens, we must blacklist that journal sequence number: the -+ * journal must not write any entries with that sequence number, and it must -+ * record that it was blacklisted so that a) on recovery we don't think we have -+ * missing journal entries and b) so that the btree code continues to ignore -+ * that bset, until that btree node is rewritten. 
-+ */ -+ -+static unsigned sb_blacklist_u64s(unsigned nr) -+{ -+ struct bch_sb_field_journal_seq_blacklist *bl; -+ -+ return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64); -+} -+ -+static struct bch_sb_field_journal_seq_blacklist * -+blacklist_entry_try_merge(struct bch_fs *c, -+ struct bch_sb_field_journal_seq_blacklist *bl, -+ unsigned i) -+{ -+ unsigned nr = blacklist_nr_entries(bl); -+ -+ if (le64_to_cpu(bl->start[i].end) >= -+ le64_to_cpu(bl->start[i + 1].start)) { -+ bl->start[i].end = bl->start[i + 1].end; -+ --nr; -+ memmove(&bl->start[i], -+ &bl->start[i + 1], -+ sizeof(bl->start[0]) * (nr - i)); -+ -+ bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, -+ sb_blacklist_u64s(nr)); -+ BUG_ON(!bl); -+ } -+ -+ return bl; -+} -+ -+int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) -+{ -+ struct bch_sb_field_journal_seq_blacklist *bl; -+ unsigned i, nr; -+ int ret = 0; -+ -+ mutex_lock(&c->sb_lock); -+ bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); -+ nr = blacklist_nr_entries(bl); -+ -+ if (bl) { -+ for (i = 0; i < nr; i++) { -+ struct journal_seq_blacklist_entry *e = -+ bl->start + i; -+ -+ if (start == le64_to_cpu(e->start) && -+ end == le64_to_cpu(e->end)) -+ goto out; -+ -+ if (start <= le64_to_cpu(e->start) && -+ end >= le64_to_cpu(e->end)) { -+ e->start = cpu_to_le64(start); -+ e->end = cpu_to_le64(end); -+ -+ if (i + 1 < nr) -+ bl = blacklist_entry_try_merge(c, -+ bl, i); -+ if (i) -+ bl = blacklist_entry_try_merge(c, -+ bl, i - 1); -+ goto out_write_sb; -+ } -+ } -+ } -+ -+ bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, -+ sb_blacklist_u64s(nr + 1)); -+ if (!bl) { -+ ret = -ENOMEM; -+ goto out; -+ } -+ -+ bl->start[nr].start = cpu_to_le64(start); -+ bl->start[nr].end = cpu_to_le64(end); -+out_write_sb: -+ c->disk_sb.sb->features[0] |= -+ 1ULL << BCH_FEATURE_journal_seq_blacklist_v3; -+ -+ ret = bch2_write_super(c); -+out: -+ mutex_unlock(&c->sb_lock); -+ -+ return ret; -+} -+ -+static int 
journal_seq_blacklist_table_cmp(const void *_l, -+ const void *_r, size_t size) -+{ -+ const struct journal_seq_blacklist_table_entry *l = _l; -+ const struct journal_seq_blacklist_table_entry *r = _r; -+ -+ return cmp_int(l->start, r->start); -+} -+ -+bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq, -+ bool dirty) -+{ -+ struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; -+ struct journal_seq_blacklist_table_entry search = { .start = seq }; -+ int idx; -+ -+ if (!t) -+ return false; -+ -+ idx = eytzinger0_find_le(t->entries, t->nr, -+ sizeof(t->entries[0]), -+ journal_seq_blacklist_table_cmp, -+ &search); -+ if (idx < 0) -+ return false; -+ -+ BUG_ON(t->entries[idx].start > seq); -+ -+ if (seq >= t->entries[idx].end) -+ return false; -+ -+ if (dirty) -+ t->entries[idx].dirty = true; -+ return true; -+} -+ -+int bch2_blacklist_table_initialize(struct bch_fs *c) -+{ -+ struct bch_sb_field_journal_seq_blacklist *bl = -+ bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); -+ struct journal_seq_blacklist_table *t; -+ unsigned i, nr = blacklist_nr_entries(bl); -+ -+ BUG_ON(c->journal_seq_blacklist_table); -+ -+ if (!bl) -+ return 0; -+ -+ t = kzalloc(sizeof(*t) + sizeof(t->entries[0]) * nr, -+ GFP_KERNEL); -+ if (!t) -+ return -ENOMEM; -+ -+ t->nr = nr; -+ -+ for (i = 0; i < nr; i++) { -+ t->entries[i].start = le64_to_cpu(bl->start[i].start); -+ t->entries[i].end = le64_to_cpu(bl->start[i].end); -+ } -+ -+ eytzinger0_sort(t->entries, -+ t->nr, -+ sizeof(t->entries[0]), -+ journal_seq_blacklist_table_cmp, -+ NULL); -+ -+ c->journal_seq_blacklist_table = t; -+ return 0; -+} -+ -+static const char * -+bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_journal_seq_blacklist *bl = -+ field_to_type(f, journal_seq_blacklist); -+ struct journal_seq_blacklist_entry *i; -+ unsigned nr = blacklist_nr_entries(bl); -+ -+ for (i = bl->start; i < bl->start + nr; i++) { -+ if 
(le64_to_cpu(i->start) >= -+ le64_to_cpu(i->end)) -+ return "entry start >= end"; -+ -+ if (i + 1 < bl->start + nr && -+ le64_to_cpu(i[0].end) > -+ le64_to_cpu(i[1].start)) -+ return "entries out of order"; -+ } -+ -+ return NULL; -+} -+ -+static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out, -+ struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_journal_seq_blacklist *bl = -+ field_to_type(f, journal_seq_blacklist); -+ struct journal_seq_blacklist_entry *i; -+ unsigned nr = blacklist_nr_entries(bl); -+ -+ for (i = bl->start; i < bl->start + nr; i++) { -+ if (i != bl->start) -+ pr_buf(out, " "); -+ -+ pr_buf(out, "%llu-%llu", -+ le64_to_cpu(i->start), -+ le64_to_cpu(i->end)); -+ } -+} -+ -+const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = { -+ .validate = bch2_sb_journal_seq_blacklist_validate, -+ .to_text = bch2_sb_journal_seq_blacklist_to_text -+}; -+ -+void bch2_blacklist_entries_gc(struct work_struct *work) -+{ -+ struct bch_fs *c = container_of(work, struct bch_fs, -+ journal_seq_blacklist_gc_work); -+ struct journal_seq_blacklist_table *t; -+ struct bch_sb_field_journal_seq_blacklist *bl; -+ struct journal_seq_blacklist_entry *src, *dst; -+ struct btree_trans trans; -+ unsigned i, nr, new_nr; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for (i = 0; i < BTREE_ID_NR; i++) { -+ struct btree_iter *iter; -+ struct btree *b; -+ -+ for_each_btree_node(&trans, iter, i, POS_MIN, -+ BTREE_ITER_PREFETCH, b) -+ if (test_bit(BCH_FS_STOPPING, &c->flags)) { -+ bch2_trans_exit(&trans); -+ return; -+ } -+ bch2_trans_iter_free(&trans, iter); -+ } -+ -+ ret = bch2_trans_exit(&trans); -+ if (ret) -+ return; -+ -+ mutex_lock(&c->sb_lock); -+ bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); -+ if (!bl) -+ goto out; -+ -+ nr = blacklist_nr_entries(bl); -+ dst = bl->start; -+ -+ t = c->journal_seq_blacklist_table; -+ BUG_ON(nr != t->nr); -+ -+ for (src = bl->start, i = eytzinger0_first(t->nr); -+ src < 
bl->start + nr; -+ src++, i = eytzinger0_next(i, nr)) { -+ BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); -+ BUG_ON(t->entries[i].end != le64_to_cpu(src->end)); -+ -+ if (t->entries[i].dirty) -+ *dst++ = *src; -+ } -+ -+ new_nr = dst - bl->start; -+ -+ bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr); -+ -+ if (new_nr != nr) { -+ bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, -+ new_nr ? sb_blacklist_u64s(new_nr) : 0); -+ BUG_ON(new_nr && !bl); -+ -+ if (!new_nr) -+ c->disk_sb.sb->features[0] &= -+ ~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3); -+ -+ bch2_write_super(c); -+ } -+out: -+ mutex_unlock(&c->sb_lock); -+} -diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h -new file mode 100644 -index 000000000000..afb886ec8e25 ---- /dev/null -+++ b/fs/bcachefs/journal_seq_blacklist.h -@@ -0,0 +1,22 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H -+#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H -+ -+static inline unsigned -+blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl) -+{ -+ return bl -+ ? 
((vstruct_end(&bl->field) - (void *) &bl->start[0]) / -+ sizeof(struct journal_seq_blacklist_entry)) -+ : 0; -+} -+ -+bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool); -+int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64); -+int bch2_blacklist_table_initialize(struct bch_fs *); -+ -+extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist; -+ -+void bch2_blacklist_entries_gc(struct work_struct *); -+ -+#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */ -diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h -new file mode 100644 -index 000000000000..154b51b891d3 ---- /dev/null -+++ b/fs/bcachefs/journal_types.h -@@ -0,0 +1,277 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_JOURNAL_TYPES_H -+#define _BCACHEFS_JOURNAL_TYPES_H -+ -+#include -+#include -+ -+#include "alloc_types.h" -+#include "super_types.h" -+#include "fifo.h" -+ -+struct journal_res; -+ -+/* -+ * We put two of these in struct journal; we used them for writes to the -+ * journal that are being staged or in flight. -+ */ -+struct journal_buf { -+ struct jset *data; -+ -+ BKEY_PADDED(key); -+ -+ struct closure_waitlist wait; -+ -+ unsigned buf_size; /* size in bytes of @data */ -+ unsigned sectors; /* maximum size for current entry */ -+ unsigned disk_sectors; /* maximum size entry could have been, if -+ buf_size was bigger */ -+ unsigned u64s_reserved; -+ /* bloom filter: */ -+ unsigned long has_inode[1024 / sizeof(unsigned long)]; -+}; -+ -+/* -+ * Something that makes a journal entry dirty - i.e. 
a btree node that has to be -+ * flushed: -+ */ -+ -+struct journal_entry_pin_list { -+ struct list_head list; -+ struct list_head flushed; -+ atomic_t count; -+ struct bch_devs_list devs; -+}; -+ -+struct journal; -+struct journal_entry_pin; -+typedef void (*journal_pin_flush_fn)(struct journal *j, -+ struct journal_entry_pin *, u64); -+ -+struct journal_entry_pin { -+ struct list_head list; -+ journal_pin_flush_fn flush; -+ u64 seq; -+}; -+ -+struct journal_res { -+ bool ref; -+ u8 idx; -+ u16 u64s; -+ u32 offset; -+ u64 seq; -+}; -+ -+/* -+ * For reserving space in the journal prior to getting a reservation on a -+ * particular journal entry: -+ */ -+struct journal_preres { -+ unsigned u64s; -+}; -+ -+union journal_res_state { -+ struct { -+ atomic64_t counter; -+ }; -+ -+ struct { -+ u64 v; -+ }; -+ -+ struct { -+ u64 cur_entry_offset:20, -+ idx:1, -+ prev_buf_unwritten:1, -+ buf0_count:21, -+ buf1_count:21; -+ }; -+}; -+ -+union journal_preres_state { -+ struct { -+ atomic64_t counter; -+ }; -+ -+ struct { -+ u64 v; -+ }; -+ -+ struct { -+ u32 reserved; -+ u32 remaining; -+ }; -+}; -+ -+/* bytes: */ -+#define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */ -+#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */ -+ -+/* -+ * We stash some journal state as sentinal values in cur_entry_offset: -+ * note - cur_entry_offset is in units of u64s -+ */ -+#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1) -+ -+#define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1) -+#define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX) -+ -+/* -+ * JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP, -+ * either because something's waiting on the write to complete or because it's -+ * been dirty too long and the timer's expired. 
-+ */ -+ -+enum { -+ JOURNAL_REPLAY_DONE, -+ JOURNAL_STARTED, -+ JOURNAL_RECLAIM_STARTED, -+ JOURNAL_NEED_WRITE, -+ JOURNAL_NOT_EMPTY, -+ JOURNAL_MAY_GET_UNRESERVED, -+}; -+ -+/* Embedded in struct bch_fs */ -+struct journal { -+ /* Fastpath stuff up front: */ -+ -+ unsigned long flags; -+ -+ union journal_res_state reservations; -+ -+ /* Max size of current journal entry */ -+ unsigned cur_entry_u64s; -+ unsigned cur_entry_sectors; -+ -+ /* -+ * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if -+ * insufficient devices: -+ */ -+ int cur_entry_error; -+ -+ union journal_preres_state prereserved; -+ -+ /* Reserved space in journal entry to be used just prior to write */ -+ unsigned entry_u64s_reserved; -+ -+ unsigned buf_size_want; -+ -+ /* -+ * Two journal entries -- one is currently open for new entries, the -+ * other is possibly being written out. -+ */ -+ struct journal_buf buf[2]; -+ -+ spinlock_t lock; -+ -+ /* if nonzero, we may not open a new journal entry: */ -+ unsigned blocked; -+ -+ /* Used when waiting because the journal was full */ -+ wait_queue_head_t wait; -+ struct closure_waitlist async_wait; -+ struct closure_waitlist preres_wait; -+ -+ struct closure io; -+ struct delayed_work write_work; -+ -+ /* Sequence number of most recent journal entry (last entry in @pin) */ -+ atomic64_t seq; -+ -+ /* seq, last_seq from the most recent journal entry successfully written */ -+ u64 seq_ondisk; -+ u64 last_seq_ondisk; -+ -+ /* -+ * FIFO of journal entries whose btree updates have not yet been -+ * written out. -+ * -+ * Each entry is a reference count. The position in the FIFO is the -+ * entry's sequence number relative to @seq. -+ * -+ * The journal entry itself holds a reference count, put when the -+ * journal entry is written out. Each btree node modified by the journal -+ * entry also holds a reference count, put when the btree node is -+ * written. -+ * -+ * When a reference count reaches zero, the journal entry is no longer -+ * needed. 
When all journal entries in the oldest journal bucket are no -+ * longer needed, the bucket can be discarded and reused. -+ */ -+ struct { -+ u64 front, back, size, mask; -+ struct journal_entry_pin_list *data; -+ } pin; -+ -+ u64 replay_journal_seq; -+ u64 replay_journal_seq_end; -+ -+ struct write_point wp; -+ spinlock_t err_lock; -+ -+ struct delayed_work reclaim_work; -+ struct mutex reclaim_lock; -+ unsigned long last_flushed; -+ struct journal_entry_pin *flush_in_progress; -+ wait_queue_head_t pin_flush_wait; -+ -+ /* protects advancing ja->discard_idx: */ -+ struct mutex discard_lock; -+ bool can_discard; -+ -+ unsigned write_delay_ms; -+ unsigned reclaim_delay_ms; -+ -+ u64 res_get_blocked_start; -+ u64 need_write_time; -+ u64 write_start_time; -+ -+ struct time_stats *write_time; -+ struct time_stats *delay_time; -+ struct time_stats *blocked_time; -+ struct time_stats *flush_seq_time; -+ -+#ifdef CONFIG_DEBUG_LOCK_ALLOC -+ struct lockdep_map res_map; -+#endif -+}; -+ -+/* -+ * Embedded in struct bch_dev. First three fields refer to the array of journal -+ * buckets, in bch_sb. -+ */ -+struct journal_device { -+ /* -+ * For each journal bucket, contains the max sequence number of the -+ * journal writes it contains - so we know when a bucket can be reused. 
-+ */ -+ u64 *bucket_seq; -+ -+ unsigned sectors_free; -+ -+ /* -+ * discard_idx <= dirty_idx_ondisk <= dirty_idx <= cur_idx: -+ */ -+ unsigned discard_idx; /* Next bucket to discard */ -+ unsigned dirty_idx_ondisk; -+ unsigned dirty_idx; -+ unsigned cur_idx; /* Journal bucket we're currently writing to */ -+ unsigned nr; -+ -+ u64 *buckets; -+ -+ /* Bio for journal reads/writes to this device */ -+ struct bio *bio; -+ -+ /* for bch_journal_read_device */ -+ struct closure read; -+}; -+ -+/* -+ * journal_entry_res - reserve space in every journal entry: -+ */ -+struct journal_entry_res { -+ unsigned u64s; -+}; -+ -+#endif /* _BCACHEFS_JOURNAL_TYPES_H */ -diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c -new file mode 100644 -index 000000000000..864dfaa67b7a ---- /dev/null -+++ b/fs/bcachefs/keylist.c -@@ -0,0 +1,67 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "keylist.h" -+ -+int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s, -+ size_t nr_inline_u64s, size_t new_u64s) -+{ -+ size_t oldsize = bch2_keylist_u64s(l); -+ size_t newsize = oldsize + new_u64s; -+ u64 *old_buf = l->keys_p == inline_u64s ? 
NULL : l->keys_p; -+ u64 *new_keys; -+ -+ newsize = roundup_pow_of_two(newsize); -+ -+ if (newsize <= nr_inline_u64s || -+ (old_buf && roundup_pow_of_two(oldsize) == newsize)) -+ return 0; -+ -+ new_keys = krealloc(old_buf, sizeof(u64) * newsize, GFP_NOIO); -+ if (!new_keys) -+ return -ENOMEM; -+ -+ if (!old_buf) -+ memcpy_u64s(new_keys, inline_u64s, oldsize); -+ -+ l->keys_p = new_keys; -+ l->top_p = new_keys + oldsize; -+ -+ return 0; -+} -+ -+void bch2_keylist_add_in_order(struct keylist *l, struct bkey_i *insert) -+{ -+ struct bkey_i *where; -+ -+ for_each_keylist_key(l, where) -+ if (bkey_cmp(insert->k.p, where->k.p) < 0) -+ break; -+ -+ memmove_u64s_up((u64 *) where + insert->k.u64s, -+ where, -+ ((u64 *) l->top) - ((u64 *) where)); -+ -+ l->top_p += insert->k.u64s; -+ bkey_copy(where, insert); -+} -+ -+void bch2_keylist_pop_front(struct keylist *l) -+{ -+ l->top_p -= bch2_keylist_front(l)->k.u64s; -+ -+ memmove_u64s_down(l->keys, -+ bkey_next(l->keys), -+ bch2_keylist_u64s(l)); -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_verify_keylist_sorted(struct keylist *l) -+{ -+ struct bkey_i *k; -+ -+ for_each_keylist_key(l, k) -+ BUG_ON(bkey_next(k) != l->top && -+ bkey_cmp(k->k.p, bkey_next(k)->k.p) >= 0); -+} -+#endif -diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h -new file mode 100644 -index 000000000000..195799bb20bc ---- /dev/null -+++ b/fs/bcachefs/keylist.h -@@ -0,0 +1,76 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_KEYLIST_H -+#define _BCACHEFS_KEYLIST_H -+ -+#include "keylist_types.h" -+ -+int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t); -+void bch2_keylist_add_in_order(struct keylist *, struct bkey_i *); -+void bch2_keylist_pop_front(struct keylist *); -+ -+static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys) -+{ -+ l->top_p = l->keys_p = inline_keys; -+} -+ -+static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys) -+{ -+ if (l->keys_p != inline_keys) -+ 
kfree(l->keys_p); -+ bch2_keylist_init(l, inline_keys); -+} -+ -+static inline void bch2_keylist_push(struct keylist *l) -+{ -+ l->top = bkey_next(l->top); -+} -+ -+static inline void bch2_keylist_add(struct keylist *l, const struct bkey_i *k) -+{ -+ bkey_copy(l->top, k); -+ bch2_keylist_push(l); -+} -+ -+static inline bool bch2_keylist_empty(struct keylist *l) -+{ -+ return l->top == l->keys; -+} -+ -+static inline size_t bch2_keylist_u64s(struct keylist *l) -+{ -+ return l->top_p - l->keys_p; -+} -+ -+static inline size_t bch2_keylist_bytes(struct keylist *l) -+{ -+ return bch2_keylist_u64s(l) * sizeof(u64); -+} -+ -+static inline struct bkey_i *bch2_keylist_front(struct keylist *l) -+{ -+ return l->keys; -+} -+ -+#define for_each_keylist_key(_keylist, _k) \ -+ for (_k = (_keylist)->keys; \ -+ _k != (_keylist)->top; \ -+ _k = bkey_next(_k)) -+ -+static inline u64 keylist_sectors(struct keylist *keys) -+{ -+ struct bkey_i *k; -+ u64 ret = 0; -+ -+ for_each_keylist_key(keys, k) -+ ret += k->k.size; -+ -+ return ret; -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_verify_keylist_sorted(struct keylist *); -+#else -+static inline void bch2_verify_keylist_sorted(struct keylist *l) {} -+#endif -+ -+#endif /* _BCACHEFS_KEYLIST_H */ -diff --git a/fs/bcachefs/keylist_types.h b/fs/bcachefs/keylist_types.h -new file mode 100644 -index 000000000000..4b3ff7d8a875 ---- /dev/null -+++ b/fs/bcachefs/keylist_types.h -@@ -0,0 +1,16 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_KEYLIST_TYPES_H -+#define _BCACHEFS_KEYLIST_TYPES_H -+ -+struct keylist { -+ union { -+ struct bkey_i *keys; -+ u64 *keys_p; -+ }; -+ union { -+ struct bkey_i *top; -+ u64 *top_p; -+ }; -+}; -+ -+#endif /* _BCACHEFS_KEYLIST_TYPES_H */ -diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c -new file mode 100644 -index 000000000000..96c8690adc5b ---- /dev/null -+++ b/fs/bcachefs/migrate.c -@@ -0,0 +1,170 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Code for moving data off a 
device. -+ */ -+ -+#include "bcachefs.h" -+#include "bkey_on_stack.h" -+#include "btree_update.h" -+#include "btree_update_interior.h" -+#include "buckets.h" -+#include "extents.h" -+#include "io.h" -+#include "journal.h" -+#include "keylist.h" -+#include "migrate.h" -+#include "move.h" -+#include "replicas.h" -+#include "super-io.h" -+ -+static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, -+ unsigned dev_idx, int flags, bool metadata) -+{ -+ unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas; -+ unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST; -+ unsigned degraded = metadata ? BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED; -+ unsigned nr_good; -+ -+ bch2_bkey_drop_device(k, dev_idx); -+ -+ nr_good = bch2_bkey_durability(c, k.s_c); -+ if ((!nr_good && !(flags & lost)) || -+ (nr_good < replicas && !(flags & degraded))) -+ return -EINVAL; -+ -+ return 0; -+} -+ -+static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags, -+ enum btree_id btree_id) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct bkey_on_stack sk; -+ int ret = 0; -+ -+ bkey_on_stack_init(&sk); -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN, -+ BTREE_ITER_PREFETCH); -+ -+ while ((k = bch2_btree_iter_peek(iter)).k && -+ !(ret = bkey_err(k))) { -+ if (!bch2_bkey_has_device(k, dev_idx)) { -+ bch2_btree_iter_next(iter); -+ continue; -+ } -+ -+ bkey_on_stack_reassemble(&sk, c, k); -+ -+ ret = drop_dev_ptrs(c, bkey_i_to_s(sk.k), -+ dev_idx, flags, false); -+ if (ret) -+ break; -+ -+ /* -+ * If the new extent no longer has any pointers, bch2_extent_normalize() -+ * will do the appropriate thing with it (turning it into a -+ * KEY_TYPE_error key, or just a discard if it was a cached extent) -+ */ -+ bch2_extent_normalize(c, bkey_i_to_s(sk.k)); -+ -+ bch2_btree_iter_set_pos(iter, 
bkey_start_pos(&sk.k->k)); -+ -+ bch2_trans_update(&trans, iter, sk.k, 0); -+ -+ ret = bch2_trans_commit(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL); -+ -+ /* -+ * don't want to leave ret == -EINTR, since if we raced and -+ * something else overwrote the key we could spuriously return -+ * -EINTR below: -+ */ -+ if (ret == -EINTR) -+ ret = 0; -+ if (ret) -+ break; -+ } -+ -+ ret = bch2_trans_exit(&trans) ?: ret; -+ bkey_on_stack_exit(&sk, c); -+ -+ BUG_ON(ret == -EINTR); -+ -+ return ret; -+} -+ -+static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) -+{ -+ return __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_EXTENTS) ?: -+ __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_REFLINK); -+} -+ -+static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct closure cl; -+ struct btree *b; -+ unsigned id; -+ int ret; -+ -+ /* don't handle this yet: */ -+ if (flags & BCH_FORCE_IF_METADATA_LOST) -+ return -EINVAL; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ closure_init_stack(&cl); -+ -+ for (id = 0; id < BTREE_ID_NR; id++) { -+ for_each_btree_node(&trans, iter, id, POS_MIN, -+ BTREE_ITER_PREFETCH, b) { -+ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; -+retry: -+ if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key), -+ dev_idx)) -+ continue; -+ -+ bkey_copy(&tmp.k, &b->key); -+ -+ ret = drop_dev_ptrs(c, bkey_i_to_s(&tmp.k), -+ dev_idx, flags, true); -+ if (ret) { -+ bch_err(c, "Cannot drop device without losing data"); -+ goto err; -+ } -+ -+ ret = bch2_btree_node_update_key(c, iter, b, &tmp.k); -+ if (ret == -EINTR) { -+ b = bch2_btree_iter_peek_node(iter); -+ goto retry; -+ } -+ if (ret) { -+ bch_err(c, "Error updating btree node key: %i", ret); -+ goto err; -+ } -+ } -+ bch2_trans_iter_free(&trans, iter); -+ } -+ -+ /* flush relevant btree updates */ -+ closure_wait_event(&c->btree_interior_update_wait, -+ 
!bch2_btree_interior_updates_nr_pending(c)); -+ -+ ret = 0; -+err: -+ ret = bch2_trans_exit(&trans) ?: ret; -+ -+ BUG_ON(ret == -EINTR); -+ -+ return ret; -+} -+ -+int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags) -+{ -+ return bch2_dev_usrdata_drop(c, dev_idx, flags) ?: -+ bch2_dev_metadata_drop(c, dev_idx, flags); -+} -diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h -new file mode 100644 -index 000000000000..027efaa0d575 ---- /dev/null -+++ b/fs/bcachefs/migrate.h -@@ -0,0 +1,7 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_MIGRATE_H -+#define _BCACHEFS_MIGRATE_H -+ -+int bch2_dev_data_drop(struct bch_fs *, unsigned, int); -+ -+#endif /* _BCACHEFS_MIGRATE_H */ -diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c -new file mode 100644 -index 000000000000..2f3be487ef65 ---- /dev/null -+++ b/fs/bcachefs/move.c -@@ -0,0 +1,819 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "alloc_foreground.h" -+#include "bkey_on_stack.h" -+#include "btree_gc.h" -+#include "btree_update.h" -+#include "btree_update_interior.h" -+#include "buckets.h" -+#include "disk_groups.h" -+#include "inode.h" -+#include "io.h" -+#include "journal_reclaim.h" -+#include "move.h" -+#include "replicas.h" -+#include "super-io.h" -+#include "keylist.h" -+ -+#include -+#include -+ -+#include -+ -+#define SECTORS_IN_FLIGHT_PER_DEVICE 2048 -+ -+struct moving_io { -+ struct list_head list; -+ struct closure cl; -+ bool read_completed; -+ -+ unsigned read_sectors; -+ unsigned write_sectors; -+ -+ struct bch_read_bio rbio; -+ -+ struct migrate_write write; -+ /* Must be last since it is variable size */ -+ struct bio_vec bi_inline_vecs[0]; -+}; -+ -+struct moving_context { -+ /* Closure for waiting on all reads and writes to complete */ -+ struct closure cl; -+ -+ struct bch_move_stats *stats; -+ -+ struct list_head reads; -+ -+ /* in flight sectors: */ -+ atomic_t read_sectors; -+ atomic_t write_sectors; -+ -+ wait_queue_head_t 
wait; -+}; -+ -+static int bch2_migrate_index_update(struct bch_write_op *op) -+{ -+ struct bch_fs *c = op->c; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct migrate_write *m = -+ container_of(op, struct migrate_write, op); -+ struct keylist *keys = &op->insert_keys; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ iter = bch2_trans_get_iter(&trans, m->btree_id, -+ bkey_start_pos(&bch2_keylist_front(keys)->k), -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); -+ -+ while (1) { -+ struct bkey_s_c k; -+ struct bkey_i *insert; -+ struct bkey_i_extent *new; -+ BKEY_PADDED(k) _new, _insert; -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ bool did_work = false; -+ int nr; -+ -+ bch2_trans_reset(&trans, 0); -+ -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) { -+ if (ret == -EINTR) -+ continue; -+ break; -+ } -+ -+ new = bkey_i_to_extent(bch2_keylist_front(keys)); -+ -+ if (bversion_cmp(k.k->version, new->k.version) || -+ !bch2_bkey_matches_ptr(c, k, m->ptr, m->offset)) -+ goto nomatch; -+ -+ if (m->data_cmd == DATA_REWRITE && -+ !bch2_bkey_has_device(k, m->data_opts.rewrite_dev)) -+ goto nomatch; -+ -+ bkey_reassemble(&_insert.k, k); -+ insert = &_insert.k; -+ -+ bkey_copy(&_new.k, bch2_keylist_front(keys)); -+ new = bkey_i_to_extent(&_new.k); -+ bch2_cut_front(iter->pos, &new->k_i); -+ -+ bch2_cut_front(iter->pos, insert); -+ bch2_cut_back(new->k.p, insert); -+ bch2_cut_back(insert->k.p, &new->k_i); -+ -+ if (m->data_cmd == DATA_REWRITE) -+ bch2_bkey_drop_device(bkey_i_to_s(insert), -+ m->data_opts.rewrite_dev); -+ -+ extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) { -+ if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) { -+ /* -+ * raced with another move op? 
extent already -+ * has a pointer to the device we just wrote -+ * data to -+ */ -+ continue; -+ } -+ -+ bch2_extent_ptr_decoded_append(insert, &p); -+ did_work = true; -+ } -+ -+ if (!did_work) -+ goto nomatch; -+ -+ bch2_bkey_narrow_crcs(insert, -+ (struct bch_extent_crc_unpacked) { 0 }); -+ bch2_extent_normalize(c, bkey_i_to_s(insert)); -+ bch2_bkey_mark_replicas_cached(c, bkey_i_to_s(insert), -+ op->opts.background_target, -+ op->opts.data_replicas); -+ -+ /* -+ * If we're not fully overwriting @k, and it's compressed, we -+ * need a reservation for all the pointers in @insert -+ */ -+ nr = bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(insert)) - -+ m->nr_ptrs_reserved; -+ -+ if (insert->k.size < k.k->size && -+ bch2_bkey_sectors_compressed(k) && -+ nr > 0) { -+ ret = bch2_disk_reservation_add(c, &op->res, -+ keylist_sectors(keys) * nr, 0); -+ if (ret) -+ goto out; -+ -+ m->nr_ptrs_reserved += nr; -+ goto next; -+ } -+ -+ bch2_trans_update(&trans, iter, insert, 0); -+ -+ ret = bch2_trans_commit(&trans, &op->res, -+ op_journal_seq(op), -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_USE_RESERVE| -+ m->data_opts.btree_insert_flags); -+ if (!ret) -+ atomic_long_inc(&c->extent_migrate_done); -+ if (ret == -EINTR) -+ ret = 0; -+ if (ret) -+ break; -+next: -+ while (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) >= 0) { -+ bch2_keylist_pop_front(keys); -+ if (bch2_keylist_empty(keys)) -+ goto out; -+ } -+ continue; -+nomatch: -+ if (m->ctxt) { -+ BUG_ON(k.k->p.offset <= iter->pos.offset); -+ atomic64_inc(&m->ctxt->stats->keys_raced); -+ atomic64_add(k.k->p.offset - iter->pos.offset, -+ &m->ctxt->stats->sectors_raced); -+ } -+ atomic_long_inc(&c->extent_migrate_raced); -+ trace_move_race(&new->k); -+ bch2_btree_iter_next_slot(iter); -+ goto next; -+ } -+out: -+ bch2_trans_exit(&trans); -+ BUG_ON(ret == -EINTR); -+ return ret; -+} -+ -+void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio) -+{ -+ /* write bio must own pages: */ -+ 
BUG_ON(!m->op.wbio.bio.bi_vcnt); -+ -+ m->ptr = rbio->pick.ptr; -+ m->offset = rbio->pos.offset - rbio->pick.crc.offset; -+ m->op.devs_have = rbio->devs_have; -+ m->op.pos = rbio->pos; -+ m->op.version = rbio->version; -+ m->op.crc = rbio->pick.crc; -+ m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9; -+ -+ if (bch2_csum_type_is_encryption(m->op.crc.csum_type)) { -+ m->op.nonce = m->op.crc.nonce + m->op.crc.offset; -+ m->op.csum_type = m->op.crc.csum_type; -+ } -+ -+ if (m->data_cmd == DATA_REWRITE) -+ bch2_dev_list_drop_dev(&m->op.devs_have, m->data_opts.rewrite_dev); -+} -+ -+int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, -+ struct write_point_specifier wp, -+ struct bch_io_opts io_opts, -+ enum data_cmd data_cmd, -+ struct data_opts data_opts, -+ enum btree_id btree_id, -+ struct bkey_s_c k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ int ret; -+ -+ m->btree_id = btree_id; -+ m->data_cmd = data_cmd; -+ m->data_opts = data_opts; -+ m->nr_ptrs_reserved = 0; -+ -+ bch2_write_op_init(&m->op, c, io_opts); -+ -+ if (!bch2_bkey_is_incompressible(k)) -+ m->op.compression_type = -+ bch2_compression_opt_to_type[io_opts.background_compression ?: -+ io_opts.compression]; -+ else -+ m->op.incompressible = true; -+ -+ m->op.target = data_opts.target, -+ m->op.write_point = wp; -+ -+ if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) { -+ m->op.alloc_reserve = RESERVE_MOVINGGC; -+ m->op.flags |= BCH_WRITE_ALLOC_NOWAIT; -+ } else { -+ /* XXX: this should probably be passed in */ -+ m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; -+ } -+ -+ m->op.flags |= BCH_WRITE_PAGES_STABLE| -+ BCH_WRITE_PAGES_OWNED| -+ BCH_WRITE_DATA_ENCODED| -+ BCH_WRITE_FROM_INTERNAL; -+ -+ m->op.nr_replicas = 1; -+ m->op.nr_replicas_required = 1; -+ m->op.index_update_fn = bch2_migrate_index_update; -+ -+ switch (data_cmd) { -+ case DATA_ADD_REPLICAS: { -+ /* -+ * 
DATA_ADD_REPLICAS is used for moving data to a different -+ * device in the background, and due to compression the new copy -+ * might take up more space than the old copy: -+ */ -+#if 0 -+ int nr = (int) io_opts.data_replicas - -+ bch2_bkey_nr_ptrs_allocated(k); -+#endif -+ int nr = (int) io_opts.data_replicas; -+ -+ if (nr > 0) { -+ m->op.nr_replicas = m->nr_ptrs_reserved = nr; -+ -+ ret = bch2_disk_reservation_get(c, &m->op.res, -+ k.k->size, m->op.nr_replicas, 0); -+ if (ret) -+ return ret; -+ } -+ break; -+ } -+ case DATA_REWRITE: { -+ unsigned compressed_sectors = 0; -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) -+ if (!p.ptr.cached && -+ crc_is_compressed(p.crc) && -+ bch2_dev_in_target(c, p.ptr.dev, data_opts.target)) -+ compressed_sectors += p.crc.compressed_size; -+ -+ if (compressed_sectors) { -+ ret = bch2_disk_reservation_add(c, &m->op.res, -+ compressed_sectors, -+ BCH_DISK_RESERVATION_NOFAIL); -+ if (ret) -+ return ret; -+ } -+ break; -+ } -+ case DATA_PROMOTE: -+ m->op.flags |= BCH_WRITE_ALLOC_NOWAIT; -+ m->op.flags |= BCH_WRITE_CACHED; -+ break; -+ default: -+ BUG(); -+ } -+ -+ return 0; -+} -+ -+static void move_free(struct closure *cl) -+{ -+ struct moving_io *io = container_of(cl, struct moving_io, cl); -+ struct moving_context *ctxt = io->write.ctxt; -+ struct bvec_iter_all iter; -+ struct bio_vec *bv; -+ -+ bch2_disk_reservation_put(io->write.op.c, &io->write.op.res); -+ -+ bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter) -+ if (bv->bv_page) -+ __free_page(bv->bv_page); -+ -+ wake_up(&ctxt->wait); -+ -+ kfree(io); -+} -+ -+static void move_write_done(struct closure *cl) -+{ -+ struct moving_io *io = container_of(cl, struct moving_io, cl); -+ -+ atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors); -+ closure_return_with_destructor(cl, move_free); -+} -+ -+static void move_write(struct closure *cl) -+{ -+ struct moving_io *io = container_of(cl, struct moving_io, cl); -+ -+ if (unlikely(io->rbio.bio.bi_status || 
io->rbio.hole)) { -+ closure_return_with_destructor(cl, move_free); -+ return; -+ } -+ -+ bch2_migrate_read_done(&io->write, &io->rbio); -+ -+ atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); -+ closure_call(&io->write.op.cl, bch2_write, NULL, cl); -+ continue_at(cl, move_write_done, NULL); -+} -+ -+static inline struct moving_io *next_pending_write(struct moving_context *ctxt) -+{ -+ struct moving_io *io = -+ list_first_entry_or_null(&ctxt->reads, struct moving_io, list); -+ -+ return io && io->read_completed ? io : NULL; -+} -+ -+static void move_read_endio(struct bio *bio) -+{ -+ struct moving_io *io = container_of(bio, struct moving_io, rbio.bio); -+ struct moving_context *ctxt = io->write.ctxt; -+ -+ atomic_sub(io->read_sectors, &ctxt->read_sectors); -+ io->read_completed = true; -+ -+ if (next_pending_write(ctxt)) -+ wake_up(&ctxt->wait); -+ -+ closure_put(&ctxt->cl); -+} -+ -+static void do_pending_writes(struct moving_context *ctxt) -+{ -+ struct moving_io *io; -+ -+ while ((io = next_pending_write(ctxt))) { -+ list_del(&io->list); -+ closure_call(&io->cl, move_write, NULL, &ctxt->cl); -+ } -+} -+ -+#define move_ctxt_wait_event(_ctxt, _cond) \ -+do { \ -+ do_pending_writes(_ctxt); \ -+ \ -+ if (_cond) \ -+ break; \ -+ __wait_event((_ctxt)->wait, \ -+ next_pending_write(_ctxt) || (_cond)); \ -+} while (1) -+ -+static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) -+{ -+ unsigned sectors_pending = atomic_read(&ctxt->write_sectors); -+ -+ move_ctxt_wait_event(ctxt, -+ !atomic_read(&ctxt->write_sectors) || -+ atomic_read(&ctxt->write_sectors) != sectors_pending); -+} -+ -+static int bch2_move_extent(struct bch_fs *c, -+ struct moving_context *ctxt, -+ struct write_point_specifier wp, -+ struct bch_io_opts io_opts, -+ enum btree_id btree_id, -+ struct bkey_s_c k, -+ enum data_cmd data_cmd, -+ struct data_opts data_opts) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ struct moving_io *io; -+ const union bch_extent_entry 
*entry; -+ struct extent_ptr_decoded p; -+ unsigned sectors = k.k->size, pages; -+ int ret = -ENOMEM; -+ -+ move_ctxt_wait_event(ctxt, -+ atomic_read(&ctxt->write_sectors) < -+ SECTORS_IN_FLIGHT_PER_DEVICE); -+ -+ move_ctxt_wait_event(ctxt, -+ atomic_read(&ctxt->read_sectors) < -+ SECTORS_IN_FLIGHT_PER_DEVICE); -+ -+ /* write path might have to decompress data: */ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) -+ sectors = max_t(unsigned, sectors, p.crc.uncompressed_size); -+ -+ pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); -+ io = kzalloc(sizeof(struct moving_io) + -+ sizeof(struct bio_vec) * pages, GFP_KERNEL); -+ if (!io) -+ goto err; -+ -+ io->write.ctxt = ctxt; -+ io->read_sectors = k.k->size; -+ io->write_sectors = k.k->size; -+ -+ bio_init(&io->write.op.wbio.bio, io->bi_inline_vecs, pages); -+ bio_set_prio(&io->write.op.wbio.bio, -+ IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); -+ -+ if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9, -+ GFP_KERNEL)) -+ goto err_free; -+ -+ io->rbio.c = c; -+ io->rbio.opts = io_opts; -+ bio_init(&io->rbio.bio, io->bi_inline_vecs, pages); -+ io->rbio.bio.bi_vcnt = pages; -+ bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); -+ io->rbio.bio.bi_iter.bi_size = sectors << 9; -+ -+ bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0); -+ io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); -+ io->rbio.bio.bi_end_io = move_read_endio; -+ -+ ret = bch2_migrate_write_init(c, &io->write, wp, io_opts, -+ data_cmd, data_opts, btree_id, k); -+ if (ret) -+ goto err_free_pages; -+ -+ atomic64_inc(&ctxt->stats->keys_moved); -+ atomic64_add(k.k->size, &ctxt->stats->sectors_moved); -+ -+ trace_move_extent(k.k); -+ -+ atomic_add(io->read_sectors, &ctxt->read_sectors); -+ list_add_tail(&io->list, &ctxt->reads); -+ -+ /* -+ * dropped by move_read_endio() - guards against use after free of -+ * ctxt when doing wakeup -+ */ -+ closure_get(&ctxt->cl); -+ bch2_read_extent(c, &io->rbio, k, 0, -+ BCH_READ_NODECODE| -+ 
BCH_READ_LAST_FRAGMENT); -+ return 0; -+err_free_pages: -+ bio_free_pages(&io->write.op.wbio.bio); -+err_free: -+ kfree(io); -+err: -+ trace_move_alloc_fail(k.k); -+ return ret; -+} -+ -+static int __bch2_move_data(struct bch_fs *c, -+ struct moving_context *ctxt, -+ struct bch_ratelimit *rate, -+ struct write_point_specifier wp, -+ struct bpos start, -+ struct bpos end, -+ move_pred_fn pred, void *arg, -+ struct bch_move_stats *stats, -+ enum btree_id btree_id) -+{ -+ bool kthread = (current->flags & PF_KTHREAD) != 0; -+ struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); -+ struct bkey_on_stack sk; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct data_opts data_opts; -+ enum data_cmd data_cmd; -+ u64 delay, cur_inum = U64_MAX; -+ int ret = 0, ret2; -+ -+ bkey_on_stack_init(&sk); -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ stats->data_type = BCH_DATA_user; -+ stats->btree_id = btree_id; -+ stats->pos = POS_MIN; -+ -+ iter = bch2_trans_get_iter(&trans, btree_id, start, -+ BTREE_ITER_PREFETCH); -+ -+ if (rate) -+ bch2_ratelimit_reset(rate); -+ -+ while (1) { -+ do { -+ delay = rate ? 
bch2_ratelimit_delay(rate) : 0; -+ -+ if (delay) { -+ bch2_trans_unlock(&trans); -+ set_current_state(TASK_INTERRUPTIBLE); -+ } -+ -+ if (kthread && (ret = kthread_should_stop())) { -+ __set_current_state(TASK_RUNNING); -+ goto out; -+ } -+ -+ if (delay) -+ schedule_timeout(delay); -+ -+ if (unlikely(freezing(current))) { -+ bch2_trans_unlock(&trans); -+ move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads)); -+ try_to_freeze(); -+ } -+ } while (delay); -+peek: -+ k = bch2_btree_iter_peek(iter); -+ -+ stats->pos = iter->pos; -+ -+ if (!k.k) -+ break; -+ ret = bkey_err(k); -+ if (ret) -+ break; -+ if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) -+ break; -+ -+ if (!bkey_extent_is_direct_data(k.k)) -+ goto next_nondata; -+ -+ if (btree_id == BTREE_ID_EXTENTS && -+ cur_inum != k.k->p.inode) { -+ struct bch_inode_unpacked inode; -+ -+ /* don't hold btree locks while looking up inode: */ -+ bch2_trans_unlock(&trans); -+ -+ io_opts = bch2_opts_to_inode_opts(c->opts); -+ if (!bch2_inode_find_by_inum(c, k.k->p.inode, &inode)) -+ bch2_io_opts_apply(&io_opts, bch2_inode_opts_get(&inode)); -+ cur_inum = k.k->p.inode; -+ goto peek; -+ } -+ -+ switch ((data_cmd = pred(c, arg, k, &io_opts, &data_opts))) { -+ case DATA_SKIP: -+ goto next; -+ case DATA_SCRUB: -+ BUG(); -+ case DATA_ADD_REPLICAS: -+ case DATA_REWRITE: -+ case DATA_PROMOTE: -+ break; -+ default: -+ BUG(); -+ } -+ -+ /* unlock before doing IO: */ -+ bkey_on_stack_reassemble(&sk, c, k); -+ k = bkey_i_to_s_c(sk.k); -+ bch2_trans_unlock(&trans); -+ -+ ret2 = bch2_move_extent(c, ctxt, wp, io_opts, btree_id, k, -+ data_cmd, data_opts); -+ if (ret2) { -+ if (ret2 == -ENOMEM) { -+ /* memory allocation failure, wait for some IO to finish */ -+ bch2_move_ctxt_wait_for_io(ctxt); -+ continue; -+ } -+ -+ /* XXX signal failure */ -+ goto next; -+ } -+ -+ if (rate) -+ bch2_ratelimit_increment(rate, k.k->size); -+next: -+ atomic64_add(k.k->size * bch2_bkey_nr_ptrs_allocated(k), -+ &stats->sectors_seen); -+next_nondata: -+ 
bch2_btree_iter_next(iter); -+ bch2_trans_cond_resched(&trans); -+ } -+out: -+ ret = bch2_trans_exit(&trans) ?: ret; -+ bkey_on_stack_exit(&sk, c); -+ -+ return ret; -+} -+ -+int bch2_move_data(struct bch_fs *c, -+ struct bch_ratelimit *rate, -+ struct write_point_specifier wp, -+ struct bpos start, -+ struct bpos end, -+ move_pred_fn pred, void *arg, -+ struct bch_move_stats *stats) -+{ -+ struct moving_context ctxt = { .stats = stats }; -+ int ret; -+ -+ closure_init_stack(&ctxt.cl); -+ INIT_LIST_HEAD(&ctxt.reads); -+ init_waitqueue_head(&ctxt.wait); -+ -+ stats->data_type = BCH_DATA_user; -+ -+ ret = __bch2_move_data(c, &ctxt, rate, wp, start, end, -+ pred, arg, stats, BTREE_ID_EXTENTS) ?: -+ __bch2_move_data(c, &ctxt, rate, wp, start, end, -+ pred, arg, stats, BTREE_ID_REFLINK); -+ -+ move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads)); -+ closure_sync(&ctxt.cl); -+ -+ EBUG_ON(atomic_read(&ctxt.write_sectors)); -+ -+ trace_move_data(c, -+ atomic64_read(&stats->sectors_moved), -+ atomic64_read(&stats->keys_moved)); -+ -+ return ret; -+} -+ -+static int bch2_move_btree(struct bch_fs *c, -+ move_pred_fn pred, -+ void *arg, -+ struct bch_move_stats *stats) -+{ -+ struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct btree *b; -+ unsigned id; -+ struct data_opts data_opts; -+ enum data_cmd cmd; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ stats->data_type = BCH_DATA_btree; -+ -+ for (id = 0; id < BTREE_ID_NR; id++) { -+ stats->btree_id = id; -+ -+ for_each_btree_node(&trans, iter, id, POS_MIN, -+ BTREE_ITER_PREFETCH, b) { -+ stats->pos = iter->pos; -+ -+ switch ((cmd = pred(c, arg, -+ bkey_i_to_s_c(&b->key), -+ &io_opts, &data_opts))) { -+ case DATA_SKIP: -+ goto next; -+ case DATA_SCRUB: -+ BUG(); -+ case DATA_ADD_REPLICAS: -+ case DATA_REWRITE: -+ break; -+ default: -+ BUG(); -+ } -+ -+ ret = bch2_btree_node_rewrite(c, iter, -+ b->data->keys.seq, 0) ?: ret; -+next: -+ 
bch2_trans_cond_resched(&trans); -+ } -+ -+ ret = bch2_trans_iter_free(&trans, iter) ?: ret; -+ } -+ -+ bch2_trans_exit(&trans); -+ -+ return ret; -+} -+ -+#if 0 -+static enum data_cmd scrub_pred(struct bch_fs *c, void *arg, -+ struct bkey_s_c k, -+ struct bch_io_opts *io_opts, -+ struct data_opts *data_opts) -+{ -+ return DATA_SCRUB; -+} -+#endif -+ -+static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg, -+ struct bkey_s_c k, -+ struct bch_io_opts *io_opts, -+ struct data_opts *data_opts) -+{ -+ unsigned nr_good = bch2_bkey_durability(c, k); -+ unsigned replicas = 0; -+ -+ switch (k.k->type) { -+ case KEY_TYPE_btree_ptr: -+ replicas = c->opts.metadata_replicas; -+ break; -+ case KEY_TYPE_extent: -+ replicas = io_opts->data_replicas; -+ break; -+ } -+ -+ if (!nr_good || nr_good >= replicas) -+ return DATA_SKIP; -+ -+ data_opts->target = 0; -+ data_opts->btree_insert_flags = 0; -+ return DATA_ADD_REPLICAS; -+} -+ -+static enum data_cmd migrate_pred(struct bch_fs *c, void *arg, -+ struct bkey_s_c k, -+ struct bch_io_opts *io_opts, -+ struct data_opts *data_opts) -+{ -+ struct bch_ioctl_data *op = arg; -+ -+ if (!bch2_bkey_has_device(k, op->migrate.dev)) -+ return DATA_SKIP; -+ -+ data_opts->target = 0; -+ data_opts->btree_insert_flags = 0; -+ data_opts->rewrite_dev = op->migrate.dev; -+ return DATA_REWRITE; -+} -+ -+int bch2_data_job(struct bch_fs *c, -+ struct bch_move_stats *stats, -+ struct bch_ioctl_data op) -+{ -+ int ret = 0; -+ -+ switch (op.op) { -+ case BCH_DATA_OP_REREPLICATE: -+ stats->data_type = BCH_DATA_journal; -+ ret = bch2_journal_flush_device_pins(&c->journal, -1); -+ -+ ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret; -+ -+ closure_wait_event(&c->btree_interior_update_wait, -+ !bch2_btree_interior_updates_nr_pending(c)); -+ -+ ret = bch2_replicas_gc2(c) ?: ret; -+ -+ ret = bch2_move_data(c, NULL, -+ writepoint_hashed((unsigned long) current), -+ op.start, -+ op.end, -+ rereplicate_pred, c, stats) ?: ret; -+ ret = 
bch2_replicas_gc2(c) ?: ret; -+ break; -+ case BCH_DATA_OP_MIGRATE: -+ if (op.migrate.dev >= c->sb.nr_devices) -+ return -EINVAL; -+ -+ stats->data_type = BCH_DATA_journal; -+ ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); -+ -+ ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret; -+ ret = bch2_replicas_gc2(c) ?: ret; -+ -+ ret = bch2_move_data(c, NULL, -+ writepoint_hashed((unsigned long) current), -+ op.start, -+ op.end, -+ migrate_pred, &op, stats) ?: ret; -+ ret = bch2_replicas_gc2(c) ?: ret; -+ break; -+ default: -+ ret = -EINVAL; -+ } -+ -+ return ret; -+} -diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h -new file mode 100644 -index 000000000000..0acd1720d4f8 ---- /dev/null -+++ b/fs/bcachefs/move.h -@@ -0,0 +1,64 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_MOVE_H -+#define _BCACHEFS_MOVE_H -+ -+#include "btree_iter.h" -+#include "buckets.h" -+#include "io_types.h" -+#include "move_types.h" -+ -+struct bch_read_bio; -+struct moving_context; -+ -+enum data_cmd { -+ DATA_SKIP, -+ DATA_SCRUB, -+ DATA_ADD_REPLICAS, -+ DATA_REWRITE, -+ DATA_PROMOTE, -+}; -+ -+struct data_opts { -+ u16 target; -+ unsigned rewrite_dev; -+ int btree_insert_flags; -+}; -+ -+struct migrate_write { -+ enum btree_id btree_id; -+ enum data_cmd data_cmd; -+ struct data_opts data_opts; -+ -+ unsigned nr_ptrs_reserved; -+ -+ struct moving_context *ctxt; -+ -+ /* what we read: */ -+ struct bch_extent_ptr ptr; -+ u64 offset; -+ -+ struct bch_write_op op; -+}; -+ -+void bch2_migrate_read_done(struct migrate_write *, struct bch_read_bio *); -+int bch2_migrate_write_init(struct bch_fs *, struct migrate_write *, -+ struct write_point_specifier, -+ struct bch_io_opts, -+ enum data_cmd, struct data_opts, -+ enum btree_id, struct bkey_s_c); -+ -+typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *, -+ struct bkey_s_c, -+ struct bch_io_opts *, struct data_opts *); -+ -+int bch2_move_data(struct bch_fs *, struct bch_ratelimit *, -+ struct 
write_point_specifier, -+ struct bpos, struct bpos, -+ move_pred_fn, void *, -+ struct bch_move_stats *); -+ -+int bch2_data_job(struct bch_fs *, -+ struct bch_move_stats *, -+ struct bch_ioctl_data); -+ -+#endif /* _BCACHEFS_MOVE_H */ -diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h -new file mode 100644 -index 000000000000..fc0de165af9f ---- /dev/null -+++ b/fs/bcachefs/move_types.h -@@ -0,0 +1,17 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_MOVE_TYPES_H -+#define _BCACHEFS_MOVE_TYPES_H -+ -+struct bch_move_stats { -+ enum bch_data_type data_type; -+ enum btree_id btree_id; -+ struct bpos pos; -+ -+ atomic64_t keys_moved; -+ atomic64_t keys_raced; -+ atomic64_t sectors_moved; -+ atomic64_t sectors_seen; -+ atomic64_t sectors_raced; -+}; -+ -+#endif /* _BCACHEFS_MOVE_TYPES_H */ -diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c -new file mode 100644 -index 000000000000..de0a7974ec9f ---- /dev/null -+++ b/fs/bcachefs/movinggc.c -@@ -0,0 +1,359 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Moving/copying garbage collector -+ * -+ * Copyright 2012 Google, Inc. 
-+ */ -+ -+#include "bcachefs.h" -+#include "alloc_foreground.h" -+#include "btree_iter.h" -+#include "btree_update.h" -+#include "buckets.h" -+#include "clock.h" -+#include "disk_groups.h" -+#include "error.h" -+#include "extents.h" -+#include "eytzinger.h" -+#include "io.h" -+#include "keylist.h" -+#include "move.h" -+#include "movinggc.h" -+#include "super-io.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* -+ * We can't use the entire copygc reserve in one iteration of copygc: we may -+ * need the buckets we're freeing up to go back into the copygc reserve to make -+ * forward progress, but if the copygc reserve is full they'll be available for -+ * any allocation - and it's possible that in a given iteration, we free up most -+ * of the buckets we're going to free before we allocate most of the buckets -+ * we're going to allocate. -+ * -+ * If we only use half of the reserve per iteration, then in steady state we'll -+ * always have room in the reserve for the buckets we're going to need in the -+ * next iteration: -+ */ -+#define COPYGC_BUCKETS_PER_ITER(ca) \ -+ ((ca)->free[RESERVE_MOVINGGC].size / 2) -+ -+static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) -+{ -+ const struct copygc_heap_entry *l = _l; -+ const struct copygc_heap_entry *r = _r; -+ -+ return cmp_int(l->dev, r->dev) ?: -+ cmp_int(l->offset, r->offset); -+} -+ -+static int __copygc_pred(struct bch_fs *c, struct bkey_s_c k) -+{ -+ copygc_heap *h = &c->copygc_heap; -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const struct bch_extent_ptr *ptr; -+ -+ bkey_for_each_ptr(ptrs, ptr) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ struct copygc_heap_entry search = { -+ .dev = ptr->dev, -+ .offset = ptr->offset -+ }; -+ -+ ssize_t i = eytzinger0_find_le(h->data, h->used, -+ sizeof(h->data[0]), -+ bucket_offset_cmp, &search); -+#if 0 -+ /* eytzinger search verify code: */ -+ ssize_t j = -1, k; -+ -+ for (k = 0; k < 
h->used; k++) -+ if (h->data[k].offset <= ptr->offset && -+ (j < 0 || h->data[k].offset > h->data[j].offset)) -+ j = k; -+ -+ BUG_ON(i != j); -+#endif -+ if (i >= 0 && -+ ptr->offset < h->data[i].offset + ca->mi.bucket_size && -+ ptr->gen == h->data[i].gen) -+ return ptr->dev; -+ } -+ -+ return -1; -+} -+ -+static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, -+ struct bkey_s_c k, -+ struct bch_io_opts *io_opts, -+ struct data_opts *data_opts) -+{ -+ int dev_idx = __copygc_pred(c, k); -+ if (dev_idx < 0) -+ return DATA_SKIP; -+ -+ data_opts->target = io_opts->background_target; -+ data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE; -+ data_opts->rewrite_dev = dev_idx; -+ return DATA_REWRITE; -+} -+ -+static bool have_copygc_reserve(struct bch_dev *ca) -+{ -+ bool ret; -+ -+ spin_lock(&ca->fs->freelist_lock); -+ ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) || -+ ca->allocator_state != ALLOCATOR_RUNNING; -+ spin_unlock(&ca->fs->freelist_lock); -+ -+ return ret; -+} -+ -+static inline int fragmentation_cmp(copygc_heap *heap, -+ struct copygc_heap_entry l, -+ struct copygc_heap_entry r) -+{ -+ return cmp_int(l.fragmentation, r.fragmentation); -+} -+ -+static int bch2_copygc(struct bch_fs *c) -+{ -+ copygc_heap *h = &c->copygc_heap; -+ struct copygc_heap_entry e, *i; -+ struct bucket_array *buckets; -+ struct bch_move_stats move_stats; -+ u64 sectors_to_move = 0, sectors_not_moved = 0; -+ u64 sectors_reserved = 0; -+ u64 buckets_to_move, buckets_not_moved = 0; -+ struct bch_dev *ca; -+ unsigned dev_idx; -+ size_t b, heap_size = 0; -+ int ret; -+ -+ memset(&move_stats, 0, sizeof(move_stats)); -+ /* -+ * Find buckets with lowest sector counts, skipping completely -+ * empty buckets, by building a maxheap sorted by sector count, -+ * and repeatedly replacing the maximum element until all -+ * buckets have been visited. 
-+ */ -+ h->used = 0; -+ -+ for_each_rw_member(ca, c, dev_idx) -+ heap_size += ca->mi.nbuckets >> 7; -+ -+ if (h->size < heap_size) { -+ free_heap(&c->copygc_heap); -+ if (!init_heap(&c->copygc_heap, heap_size, GFP_KERNEL)) { -+ bch_err(c, "error allocating copygc heap"); -+ return 0; -+ } -+ } -+ -+ for_each_rw_member(ca, c, dev_idx) { -+ closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca)); -+ -+ spin_lock(&ca->fs->freelist_lock); -+ sectors_reserved += fifo_used(&ca->free[RESERVE_MOVINGGC]) * ca->mi.bucket_size; -+ spin_unlock(&ca->fs->freelist_lock); -+ -+ down_read(&ca->bucket_lock); -+ buckets = bucket_array(ca); -+ -+ for (b = buckets->first_bucket; b < buckets->nbuckets; b++) { -+ struct bucket_mark m = READ_ONCE(buckets->b[b].mark); -+ struct copygc_heap_entry e; -+ -+ if (m.owned_by_allocator || -+ m.data_type != BCH_DATA_user || -+ !bucket_sectors_used(m) || -+ bucket_sectors_used(m) >= ca->mi.bucket_size) -+ continue; -+ -+ e = (struct copygc_heap_entry) { -+ .dev = dev_idx, -+ .gen = m.gen, -+ .fragmentation = bucket_sectors_used(m) * (1U << 15) -+ / ca->mi.bucket_size, -+ .sectors = bucket_sectors_used(m), -+ .offset = bucket_to_sector(ca, b), -+ }; -+ heap_add_or_replace(h, e, -fragmentation_cmp, NULL); -+ } -+ up_read(&ca->bucket_lock); -+ } -+ -+ if (!sectors_reserved) { -+ bch2_fs_fatal_error(c, "stuck, ran out of copygc reserve!"); -+ return -1; -+ } -+ -+ for (i = h->data; i < h->data + h->used; i++) -+ sectors_to_move += i->sectors; -+ -+ while (sectors_to_move > sectors_reserved) { -+ BUG_ON(!heap_pop(h, e, -fragmentation_cmp, NULL)); -+ sectors_to_move -= e.sectors; -+ } -+ -+ buckets_to_move = h->used; -+ -+ if (!buckets_to_move) -+ return 0; -+ -+ eytzinger0_sort(h->data, h->used, -+ sizeof(h->data[0]), -+ bucket_offset_cmp, NULL); -+ -+ ret = bch2_move_data(c, &c->copygc_pd.rate, -+ writepoint_ptr(&c->copygc_write_point), -+ POS_MIN, POS_MAX, -+ copygc_pred, NULL, -+ &move_stats); -+ -+ for_each_rw_member(ca, c, dev_idx) { -+ 
down_read(&ca->bucket_lock); -+ buckets = bucket_array(ca); -+ for (i = h->data; i < h->data + h->used; i++) { -+ struct bucket_mark m; -+ size_t b; -+ -+ if (i->dev != dev_idx) -+ continue; -+ -+ b = sector_to_bucket(ca, i->offset); -+ m = READ_ONCE(buckets->b[b].mark); -+ -+ if (i->gen == m.gen && -+ bucket_sectors_used(m)) { -+ sectors_not_moved += bucket_sectors_used(m); -+ buckets_not_moved++; -+ } -+ } -+ up_read(&ca->bucket_lock); -+ } -+ -+ if (sectors_not_moved && !ret) -+ bch_warn_ratelimited(c, -+ "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved (move stats: moved %llu sectors, raced %llu keys, %llu sectors)", -+ sectors_not_moved, sectors_to_move, -+ buckets_not_moved, buckets_to_move, -+ atomic64_read(&move_stats.sectors_moved), -+ atomic64_read(&move_stats.keys_raced), -+ atomic64_read(&move_stats.sectors_raced)); -+ -+ trace_copygc(c, -+ atomic64_read(&move_stats.sectors_moved), sectors_not_moved, -+ buckets_to_move, buckets_not_moved); -+ return 0; -+} -+ -+/* -+ * Copygc runs when the amount of fragmented data is above some arbitrary -+ * threshold: -+ * -+ * The threshold at the limit - when the device is full - is the amount of space -+ * we reserved in bch2_recalc_capacity; we can't have more than that amount of -+ * disk space stranded due to fragmentation and store everything we have -+ * promised to store. -+ * -+ * But we don't want to be running copygc unnecessarily when the device still -+ * has plenty of free space - rather, we want copygc to smoothly run every so -+ * often and continually reduce the amount of fragmented space as the device -+ * fills up. So, we increase the threshold by half the current free space. 
-+ */ -+unsigned long bch2_copygc_wait_amount(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ unsigned dev_idx; -+ u64 fragmented_allowed = c->copygc_threshold; -+ u64 fragmented = 0; -+ -+ for_each_rw_member(ca, c, dev_idx) { -+ struct bch_dev_usage usage = bch2_dev_usage_read(ca); -+ -+ fragmented_allowed += ((__dev_buckets_available(ca, usage) * -+ ca->mi.bucket_size) >> 1); -+ fragmented += usage.sectors_fragmented; -+ } -+ -+ return max_t(s64, 0, fragmented_allowed - fragmented); -+} -+ -+static int bch2_copygc_thread(void *arg) -+{ -+ struct bch_fs *c = arg; -+ struct io_clock *clock = &c->io_clock[WRITE]; -+ unsigned long last, wait; -+ -+ set_freezable(); -+ -+ while (!kthread_should_stop()) { -+ if (kthread_wait_freezable(c->copy_gc_enabled)) -+ break; -+ -+ last = atomic_long_read(&clock->now); -+ wait = bch2_copygc_wait_amount(c); -+ -+ if (wait > clock->max_slop) { -+ bch2_kthread_io_clock_wait(clock, last + wait, -+ MAX_SCHEDULE_TIMEOUT); -+ continue; -+ } -+ -+ if (bch2_copygc(c)) -+ break; -+ } -+ -+ return 0; -+} -+ -+void bch2_copygc_stop(struct bch_fs *c) -+{ -+ c->copygc_pd.rate.rate = UINT_MAX; -+ bch2_ratelimit_reset(&c->copygc_pd.rate); -+ -+ if (c->copygc_thread) { -+ kthread_stop(c->copygc_thread); -+ put_task_struct(c->copygc_thread); -+ } -+ c->copygc_thread = NULL; -+} -+ -+int bch2_copygc_start(struct bch_fs *c) -+{ -+ struct task_struct *t; -+ -+ if (c->copygc_thread) -+ return 0; -+ -+ if (c->opts.nochanges) -+ return 0; -+ -+ if (bch2_fs_init_fault("copygc_start")) -+ return -ENOMEM; -+ -+ t = kthread_create(bch2_copygc_thread, c, "bch_copygc"); -+ if (IS_ERR(t)) -+ return PTR_ERR(t); -+ -+ get_task_struct(t); -+ -+ c->copygc_thread = t; -+ wake_up_process(c->copygc_thread); -+ -+ return 0; -+} -+ -+void bch2_fs_copygc_init(struct bch_fs *c) -+{ -+ bch2_pd_controller_init(&c->copygc_pd); -+ c->copygc_pd.d_term = 0; -+} -diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h -new file mode 100644 -index 000000000000..922738247d03 
---- /dev/null -+++ b/fs/bcachefs/movinggc.h -@@ -0,0 +1,9 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_MOVINGGC_H -+#define _BCACHEFS_MOVINGGC_H -+ -+void bch2_copygc_stop(struct bch_fs *); -+int bch2_copygc_start(struct bch_fs *); -+void bch2_fs_copygc_init(struct bch_fs *); -+ -+#endif /* _BCACHEFS_MOVINGGC_H */ -diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c -new file mode 100644 -index 000000000000..afe25cd26c06 ---- /dev/null -+++ b/fs/bcachefs/opts.c -@@ -0,0 +1,437 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include -+ -+#include "bcachefs.h" -+#include "compress.h" -+#include "disk_groups.h" -+#include "opts.h" -+#include "super-io.h" -+#include "util.h" -+ -+const char * const bch2_error_actions[] = { -+ "continue", -+ "remount-ro", -+ "panic", -+ NULL -+}; -+ -+const char * const bch2_sb_features[] = { -+#define x(f, n) #f, -+ BCH_SB_FEATURES() -+#undef x -+ NULL -+}; -+ -+const char * const bch2_csum_opts[] = { -+ "none", -+ "crc32c", -+ "crc64", -+ NULL -+}; -+ -+const char * const bch2_compression_opts[] = { -+#define x(t, n) #t, -+ BCH_COMPRESSION_OPTS() -+#undef x -+ NULL -+}; -+ -+const char * const bch2_str_hash_types[] = { -+ "crc32c", -+ "crc64", -+ "siphash", -+ NULL -+}; -+ -+const char * const bch2_data_types[] = { -+#define x(t, n) #t, -+ BCH_DATA_TYPES() -+#undef x -+ NULL -+}; -+ -+const char * const bch2_cache_replacement_policies[] = { -+ "lru", -+ "fifo", -+ "random", -+ NULL -+}; -+ -+/* Default is -1; we skip past it for struct cached_dev's cache mode */ -+const char * const bch2_cache_modes[] = { -+ "default", -+ "writethrough", -+ "writeback", -+ "writearound", -+ "none", -+ NULL -+}; -+ -+const char * const bch2_dev_state[] = { -+ "readwrite", -+ "readonly", -+ "failed", -+ "spare", -+ NULL -+}; -+ -+void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) -+{ -+#define x(_name, ...) 
\ -+ if (opt_defined(src, _name)) \ -+ opt_set(*dst, _name, src._name); -+ -+ BCH_OPTS() -+#undef x -+} -+ -+bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id) -+{ -+ switch (id) { -+#define x(_name, ...) \ -+ case Opt_##_name: \ -+ return opt_defined(*opts, _name); -+ BCH_OPTS() -+#undef x -+ default: -+ BUG(); -+ } -+} -+ -+u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id) -+{ -+ switch (id) { -+#define x(_name, ...) \ -+ case Opt_##_name: \ -+ return opts->_name; -+ BCH_OPTS() -+#undef x -+ default: -+ BUG(); -+ } -+} -+ -+void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v) -+{ -+ switch (id) { -+#define x(_name, ...) \ -+ case Opt_##_name: \ -+ opt_set(*opts, _name, v); \ -+ break; -+ BCH_OPTS() -+#undef x -+ default: -+ BUG(); -+ } -+} -+ -+/* -+ * Initial options from superblock - here we don't want any options undefined, -+ * any options the superblock doesn't specify are set to 0: -+ */ -+struct bch_opts bch2_opts_from_sb(struct bch_sb *sb) -+{ -+ struct bch_opts opts = bch2_opts_empty(); -+ -+#define x(_name, _bits, _mode, _type, _sb_opt, ...) \ -+ if (_sb_opt != NO_SB_OPT) \ -+ opt_set(opts, _name, _sb_opt(sb)); -+ BCH_OPTS() -+#undef x -+ -+ return opts; -+} -+ -+const struct bch_option bch2_opt_table[] = { -+#define OPT_BOOL() .type = BCH_OPT_BOOL -+#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, .min = _min, .max = _max -+#define OPT_SECTORS(_min, _max) .type = BCH_OPT_SECTORS, .min = _min, .max = _max -+#define OPT_STR(_choices) .type = BCH_OPT_STR, .choices = _choices -+#define OPT_FN(_fn) .type = BCH_OPT_FN, \ -+ .parse = _fn##_parse, \ -+ .to_text = _fn##_to_text -+ -+#define x(_name, _bits, _mode, _type, _sb_opt, _default, _hint, _help) \ -+ [Opt_##_name] = { \ -+ .attr = { \ -+ .name = #_name, \ -+ .mode = (_mode) & OPT_RUNTIME ? 
0644 : 0444, \ -+ }, \ -+ .mode = _mode, \ -+ .hint = _hint, \ -+ .help = _help, \ -+ .set_sb = SET_##_sb_opt, \ -+ _type \ -+ }, -+ -+ BCH_OPTS() -+#undef x -+}; -+ -+int bch2_opt_lookup(const char *name) -+{ -+ const struct bch_option *i; -+ -+ for (i = bch2_opt_table; -+ i < bch2_opt_table + ARRAY_SIZE(bch2_opt_table); -+ i++) -+ if (!strcmp(name, i->attr.name)) -+ return i - bch2_opt_table; -+ -+ return -1; -+} -+ -+struct synonym { -+ const char *s1, *s2; -+}; -+ -+static const struct synonym bch_opt_synonyms[] = { -+ { "quota", "usrquota" }, -+}; -+ -+static int bch2_mount_opt_lookup(const char *name) -+{ -+ const struct synonym *i; -+ -+ for (i = bch_opt_synonyms; -+ i < bch_opt_synonyms + ARRAY_SIZE(bch_opt_synonyms); -+ i++) -+ if (!strcmp(name, i->s1)) -+ name = i->s2; -+ -+ return bch2_opt_lookup(name); -+} -+ -+int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt, -+ const char *val, u64 *res) -+{ -+ ssize_t ret; -+ -+ switch (opt->type) { -+ case BCH_OPT_BOOL: -+ ret = kstrtou64(val, 10, res); -+ if (ret < 0) -+ return ret; -+ -+ if (*res > 1) -+ return -ERANGE; -+ break; -+ case BCH_OPT_UINT: -+ ret = kstrtou64(val, 10, res); -+ if (ret < 0) -+ return ret; -+ -+ if (*res < opt->min || *res >= opt->max) -+ return -ERANGE; -+ break; -+ case BCH_OPT_SECTORS: -+ ret = bch2_strtou64_h(val, res); -+ if (ret < 0) -+ return ret; -+ -+ if (*res & 511) -+ return -EINVAL; -+ -+ *res >>= 9; -+ -+ if (*res < opt->min || *res >= opt->max) -+ return -ERANGE; -+ break; -+ case BCH_OPT_STR: -+ ret = match_string(opt->choices, -1, val); -+ if (ret < 0) -+ return ret; -+ -+ *res = ret; -+ break; -+ case BCH_OPT_FN: -+ if (!c) -+ return -EINVAL; -+ -+ return opt->parse(c, val, res); -+ } -+ -+ return 0; -+} -+ -+void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c, -+ const struct bch_option *opt, u64 v, -+ unsigned flags) -+{ -+ if (flags & OPT_SHOW_MOUNT_STYLE) { -+ if (opt->type == BCH_OPT_BOOL) { -+ pr_buf(out, "%s%s", -+ v ? 
"" : "no", -+ opt->attr.name); -+ return; -+ } -+ -+ pr_buf(out, "%s=", opt->attr.name); -+ } -+ -+ switch (opt->type) { -+ case BCH_OPT_BOOL: -+ case BCH_OPT_UINT: -+ pr_buf(out, "%lli", v); -+ break; -+ case BCH_OPT_SECTORS: -+ bch2_hprint(out, v); -+ break; -+ case BCH_OPT_STR: -+ if (flags & OPT_SHOW_FULL_LIST) -+ bch2_string_opt_to_text(out, opt->choices, v); -+ else -+ pr_buf(out, opt->choices[v]); -+ break; -+ case BCH_OPT_FN: -+ opt->to_text(out, c, v); -+ break; -+ default: -+ BUG(); -+ } -+} -+ -+int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v) -+{ -+ int ret = 0; -+ -+ switch (id) { -+ case Opt_compression: -+ case Opt_background_compression: -+ ret = bch2_check_set_has_compressed_data(c, v); -+ break; -+ case Opt_erasure_code: -+ if (v) -+ bch2_check_set_feature(c, BCH_FEATURE_ec); -+ break; -+ } -+ -+ return ret; -+} -+ -+int bch2_opts_check_may_set(struct bch_fs *c) -+{ -+ unsigned i; -+ int ret; -+ -+ for (i = 0; i < bch2_opts_nr; i++) { -+ ret = bch2_opt_check_may_set(c, i, -+ bch2_opt_get_by_id(&c->opts, i)); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+int bch2_parse_mount_opts(struct bch_opts *opts, char *options) -+{ -+ char *opt, *name, *val; -+ int ret, id; -+ u64 v; -+ -+ while ((opt = strsep(&options, ",")) != NULL) { -+ name = strsep(&opt, "="); -+ val = opt; -+ -+ if (val) { -+ id = bch2_mount_opt_lookup(name); -+ if (id < 0) -+ goto bad_opt; -+ -+ ret = bch2_opt_parse(NULL, &bch2_opt_table[id], val, &v); -+ if (ret < 0) -+ goto bad_val; -+ } else { -+ id = bch2_mount_opt_lookup(name); -+ v = 1; -+ -+ if (id < 0 && -+ !strncmp("no", name, 2)) { -+ id = bch2_mount_opt_lookup(name + 2); -+ v = 0; -+ } -+ -+ if (id < 0) -+ goto bad_opt; -+ -+ if (bch2_opt_table[id].type != BCH_OPT_BOOL) -+ goto no_val; -+ } -+ -+ if (!(bch2_opt_table[id].mode & OPT_MOUNT)) -+ goto bad_opt; -+ -+ if (id == Opt_acl && -+ !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL)) -+ goto bad_opt; -+ -+ if ((id == Opt_usrquota || -+ id == Opt_grpquota) && -+ 
!IS_ENABLED(CONFIG_BCACHEFS_QUOTA)) -+ goto bad_opt; -+ -+ bch2_opt_set_by_id(opts, id, v); -+ } -+ -+ return 0; -+bad_opt: -+ pr_err("Bad mount option %s", name); -+ return -1; -+bad_val: -+ pr_err("Invalid value %s for mount option %s", val, name); -+ return -1; -+no_val: -+ pr_err("Mount option %s requires a value", name); -+ return -1; -+} -+ -+/* io opts: */ -+ -+struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src) -+{ -+ struct bch_io_opts ret = { 0 }; -+#define x(_name, _bits) \ -+ if (opt_defined(src, _name)) \ -+ opt_set(ret, _name, src._name); -+ BCH_INODE_OPTS() -+#undef x -+ return ret; -+} -+ -+struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts src) -+{ -+ struct bch_opts ret = { 0 }; -+#define x(_name, _bits) \ -+ if (opt_defined(src, _name)) \ -+ opt_set(ret, _name, src._name); -+ BCH_INODE_OPTS() -+#undef x -+ return ret; -+} -+ -+void bch2_io_opts_apply(struct bch_io_opts *dst, struct bch_io_opts src) -+{ -+#define x(_name, _bits) \ -+ if (opt_defined(src, _name)) \ -+ opt_set(*dst, _name, src._name); -+ BCH_INODE_OPTS() -+#undef x -+} -+ -+bool bch2_opt_is_inode_opt(enum bch_opt_id id) -+{ -+ static const enum bch_opt_id inode_opt_list[] = { -+#define x(_name, _bits) Opt_##_name, -+ BCH_INODE_OPTS() -+#undef x -+ }; -+ unsigned i; -+ -+ for (i = 0; i < ARRAY_SIZE(inode_opt_list); i++) -+ if (inode_opt_list[i] == id) -+ return true; -+ -+ return false; -+} -diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h -new file mode 100644 -index 000000000000..014c608ca0c6 ---- /dev/null -+++ b/fs/bcachefs/opts.h -@@ -0,0 +1,440 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_OPTS_H -+#define _BCACHEFS_OPTS_H -+ -+#include -+#include -+#include -+#include -+#include "bcachefs_format.h" -+ -+extern const char * const bch2_error_actions[]; -+extern const char * const bch2_sb_features[]; -+extern const char * const bch2_csum_opts[]; -+extern const char * const bch2_compression_opts[]; -+extern const char * const 
bch2_str_hash_types[]; -+extern const char * const bch2_data_types[]; -+extern const char * const bch2_cache_replacement_policies[]; -+extern const char * const bch2_cache_modes[]; -+extern const char * const bch2_dev_state[]; -+ -+/* -+ * Mount options; we also store defaults in the superblock. -+ * -+ * Also exposed via sysfs: if an option is writeable, and it's also stored in -+ * the superblock, changing it via sysfs (currently? might change this) also -+ * updates the superblock. -+ * -+ * We store options as signed integers, where -1 means undefined. This means we -+ * can pass the mount options to bch2_fs_alloc() as a whole struct, and then only -+ * apply the options from that struct that are defined. -+ */ -+ -+/* dummy option, for options that aren't stored in the superblock */ -+LE64_BITMASK(NO_SB_OPT, struct bch_sb, flags[0], 0, 0); -+ -+/* When can be set: */ -+enum opt_mode { -+ OPT_FORMAT = (1 << 0), -+ OPT_MOUNT = (1 << 1), -+ OPT_RUNTIME = (1 << 2), -+ OPT_INODE = (1 << 3), -+ OPT_DEVICE = (1 << 4), -+}; -+ -+enum opt_type { -+ BCH_OPT_BOOL, -+ BCH_OPT_UINT, -+ BCH_OPT_SECTORS, -+ BCH_OPT_STR, -+ BCH_OPT_FN, -+}; -+ -+/** -+ * x(name, shortopt, type, in mem type, mode, sb_opt) -+ * -+ * @name - name of mount option, sysfs attribute, and struct bch_opts -+ * member -+ * -+ * @mode - when opt may be set -+ * -+ * @sb_option - name of corresponding superblock option -+ * -+ * @type - one of OPT_BOOL, OPT_UINT, OPT_STR -+ */ -+ -+/* -+ * XXX: add fields for -+ * - default value -+ * - helptext -+ */ -+ -+#ifdef __KERNEL__ -+#define RATELIMIT_ERRORS true -+#else -+#define RATELIMIT_ERRORS false -+#endif -+ -+#define BCH_OPTS() \ -+ x(block_size, u16, \ -+ OPT_FORMAT, \ -+ OPT_SECTORS(1, 128), \ -+ BCH_SB_BLOCK_SIZE, 8, \ -+ "size", NULL) \ -+ x(btree_node_size, u16, \ -+ OPT_FORMAT, \ -+ OPT_SECTORS(1, 512), \ -+ BCH_SB_BTREE_NODE_SIZE, 512, \ -+ "size", "Btree node size, default 256k") \ -+ x(errors, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ 
OPT_STR(bch2_error_actions), \ -+ BCH_SB_ERROR_ACTION, BCH_ON_ERROR_RO, \ -+ NULL, "Action to take on filesystem error") \ -+ x(metadata_replicas, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_UINT(1, BCH_REPLICAS_MAX), \ -+ BCH_SB_META_REPLICAS_WANT, 1, \ -+ "#", "Number of metadata replicas") \ -+ x(data_replicas, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ -+ OPT_UINT(1, BCH_REPLICAS_MAX), \ -+ BCH_SB_DATA_REPLICAS_WANT, 1, \ -+ "#", "Number of data replicas") \ -+ x(metadata_replicas_required, u8, \ -+ OPT_FORMAT|OPT_MOUNT, \ -+ OPT_UINT(1, BCH_REPLICAS_MAX), \ -+ BCH_SB_META_REPLICAS_REQ, 1, \ -+ "#", NULL) \ -+ x(data_replicas_required, u8, \ -+ OPT_FORMAT|OPT_MOUNT, \ -+ OPT_UINT(1, BCH_REPLICAS_MAX), \ -+ BCH_SB_DATA_REPLICAS_REQ, 1, \ -+ "#", NULL) \ -+ x(metadata_checksum, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_STR(bch2_csum_opts), \ -+ BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_CRC32C, \ -+ NULL, NULL) \ -+ x(data_checksum, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ -+ OPT_STR(bch2_csum_opts), \ -+ BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_CRC32C, \ -+ NULL, NULL) \ -+ x(compression, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ -+ OPT_STR(bch2_compression_opts), \ -+ BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_none, \ -+ NULL, NULL) \ -+ x(background_compression, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ -+ OPT_STR(bch2_compression_opts), \ -+ BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none, \ -+ NULL, NULL) \ -+ x(str_hash, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_STR(bch2_str_hash_types), \ -+ BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_SIPHASH, \ -+ NULL, "Hash function for directory entries and xattrs")\ -+ x(foreground_target, u16, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ -+ OPT_FN(bch2_opt_target), \ -+ BCH_SB_FOREGROUND_TARGET, 0, \ -+ "(target)", "Device or disk group for foreground writes") \ -+ x(background_target, u16, \ -+ 
OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ -+ OPT_FN(bch2_opt_target), \ -+ BCH_SB_BACKGROUND_TARGET, 0, \ -+ "(target)", "Device or disk group to move data to in the background")\ -+ x(promote_target, u16, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ -+ OPT_FN(bch2_opt_target), \ -+ BCH_SB_PROMOTE_TARGET, 0, \ -+ "(target)", "Device or disk group to promote data to on read")\ -+ x(erasure_code, u16, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ -+ OPT_BOOL(), \ -+ BCH_SB_ERASURE_CODE, false, \ -+ NULL, "Enable erasure coding (DO NOT USE YET)") \ -+ x(inodes_32bit, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_BOOL(), \ -+ BCH_SB_INODE_32BIT, false, \ -+ NULL, "Constrain inode numbers to 32 bits") \ -+ x(gc_reserve_percent, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_UINT(5, 21), \ -+ BCH_SB_GC_RESERVE, 8, \ -+ "%", "Percentage of disk space to reserve for copygc")\ -+ x(gc_reserve_bytes, u64, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_SECTORS(0, U64_MAX), \ -+ BCH_SB_GC_RESERVE_BYTES, 0, \ -+ "%", "Amount of disk space to reserve for copygc\n" \ -+ "Takes precedence over gc_reserve_percent if set")\ -+ x(root_reserve_percent, u8, \ -+ OPT_FORMAT|OPT_MOUNT, \ -+ OPT_UINT(0, 100), \ -+ BCH_SB_ROOT_RESERVE, 0, \ -+ "%", "Percentage of disk space to reserve for superuser")\ -+ x(wide_macs, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_BOOL(), \ -+ BCH_SB_128_BIT_MACS, false, \ -+ NULL, "Store full 128 bits of cryptographic MACs, instead of 80")\ -+ x(inline_data, u8, \ -+ OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Enable inline data extents") \ -+ x(acl, u8, \ -+ OPT_FORMAT|OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ BCH_SB_POSIX_ACL, true, \ -+ NULL, "Enable POSIX acls") \ -+ x(usrquota, u8, \ -+ OPT_FORMAT|OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ BCH_SB_USRQUOTA, false, \ -+ NULL, "Enable user quotas") \ -+ x(grpquota, u8, \ -+ OPT_FORMAT|OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ BCH_SB_GRPQUOTA, false, \ -+ NULL, "Enable 
group quotas") \ -+ x(prjquota, u8, \ -+ OPT_FORMAT|OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ BCH_SB_PRJQUOTA, false, \ -+ NULL, "Enable project quotas") \ -+ x(reflink, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_BOOL(), \ -+ BCH_SB_REFLINK, true, \ -+ NULL, "Enable reflink support") \ -+ x(degraded, u8, \ -+ OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Allow mounting in degraded mode") \ -+ x(discard, u8, \ -+ OPT_MOUNT|OPT_DEVICE, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Enable discard/TRIM support") \ -+ x(verbose, u8, \ -+ OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Extra debugging information during mount/recovery")\ -+ x(journal_flush_disabled, u8, \ -+ OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Disable journal flush on sync/fsync\n" \ -+ "If enabled, writes can be lost, but only since the\n"\ -+ "last journal write (default 1 second)") \ -+ x(fsck, u8, \ -+ OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Run fsck on mount") \ -+ x(fix_errors, u8, \ -+ OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Fix errors during fsck without asking") \ -+ x(ratelimit_errors, u8, \ -+ OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, RATELIMIT_ERRORS, \ -+ NULL, "Ratelimit error messages during fsck") \ -+ x(nochanges, u8, \ -+ OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Super read only mode - no writes at all will be issued,\n"\ -+ "even if we have to replay the journal") \ -+ x(norecovery, u8, \ -+ OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Don't replay the journal") \ -+ x(rebuild_replicas, u8, \ -+ OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Rebuild the superblock replicas section") \ -+ x(keep_journal, u8, \ -+ OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Don't free journal entries/keys after startup")\ -+ x(read_entire_journal, u8, \ -+ 0, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, 
"Read all journal entries, not just dirty ones")\ -+ x(noexcl, u8, \ -+ OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Don't open device in exclusive mode") \ -+ x(sb, u64, \ -+ OPT_MOUNT, \ -+ OPT_UINT(0, S64_MAX), \ -+ NO_SB_OPT, BCH_SB_SECTOR, \ -+ "offset", "Sector offset of superblock") \ -+ x(read_only, u8, \ -+ 0, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, NULL) \ -+ x(nostart, u8, \ -+ 0, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Don\'t start filesystem, only open devices") \ -+ x(reconstruct_alloc, u8, \ -+ OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Reconstruct alloc btree") \ -+ x(version_upgrade, u8, \ -+ OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Set superblock to latest version,\n" \ -+ "allowing any new features to be used") \ -+ x(project, u8, \ -+ OPT_INODE, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, NULL) \ -+ x(fs_size, u64, \ -+ OPT_DEVICE, \ -+ OPT_SECTORS(0, S64_MAX), \ -+ NO_SB_OPT, 0, \ -+ "size", "Size of filesystem on device") \ -+ x(bucket, u32, \ -+ OPT_DEVICE, \ -+ OPT_SECTORS(0, S64_MAX), \ -+ NO_SB_OPT, 0, \ -+ "size", "Size of filesystem on device") \ -+ x(durability, u8, \ -+ OPT_DEVICE, \ -+ OPT_UINT(0, BCH_REPLICAS_MAX), \ -+ NO_SB_OPT, 1, \ -+ "n", "Data written to this device will be considered\n"\ -+ "to have already been replicated n times") -+ -+struct bch_opts { -+#define x(_name, _bits, ...) unsigned _name##_defined:1; -+ BCH_OPTS() -+#undef x -+ -+#define x(_name, _bits, ...) _bits _name; -+ BCH_OPTS() -+#undef x -+}; -+ -+static const struct bch_opts bch2_opts_default = { -+#define x(_name, _bits, _mode, _type, _sb_opt, _default, ...) \ -+ ._name##_defined = true, \ -+ ._name = _default, \ -+ -+ BCH_OPTS() -+#undef x -+}; -+ -+#define opt_defined(_opts, _name) ((_opts)._name##_defined) -+ -+#define opt_get(_opts, _name) \ -+ (opt_defined(_opts, _name) ? 
(_opts)._name : bch2_opts_default._name) -+ -+#define opt_set(_opts, _name, _v) \ -+do { \ -+ (_opts)._name##_defined = true; \ -+ (_opts)._name = _v; \ -+} while (0) -+ -+static inline struct bch_opts bch2_opts_empty(void) -+{ -+ return (struct bch_opts) { 0 }; -+} -+ -+void bch2_opts_apply(struct bch_opts *, struct bch_opts); -+ -+enum bch_opt_id { -+#define x(_name, ...) Opt_##_name, -+ BCH_OPTS() -+#undef x -+ bch2_opts_nr -+}; -+ -+struct bch_fs; -+struct printbuf; -+ -+struct bch_option { -+ struct attribute attr; -+ void (*set_sb)(struct bch_sb *, u64); -+ enum opt_mode mode; -+ enum opt_type type; -+ -+ union { -+ struct { -+ u64 min, max; -+ }; -+ struct { -+ const char * const *choices; -+ }; -+ struct { -+ int (*parse)(struct bch_fs *, const char *, u64 *); -+ void (*to_text)(struct printbuf *, struct bch_fs *, u64); -+ }; -+ }; -+ -+ const char *hint; -+ const char *help; -+ -+}; -+ -+extern const struct bch_option bch2_opt_table[]; -+ -+bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id); -+u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id); -+void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64); -+ -+struct bch_opts bch2_opts_from_sb(struct bch_sb *); -+ -+int bch2_opt_lookup(const char *); -+int bch2_opt_parse(struct bch_fs *, const struct bch_option *, const char *, u64 *); -+ -+#define OPT_SHOW_FULL_LIST (1 << 0) -+#define OPT_SHOW_MOUNT_STYLE (1 << 1) -+ -+void bch2_opt_to_text(struct printbuf *, struct bch_fs *, -+ const struct bch_option *, u64, unsigned); -+ -+int bch2_opt_check_may_set(struct bch_fs *, int, u64); -+int bch2_opts_check_may_set(struct bch_fs *); -+int bch2_parse_mount_opts(struct bch_opts *, char *); -+ -+/* inode opts: */ -+ -+struct bch_io_opts { -+#define x(_name, _bits) unsigned _name##_defined:1; -+ BCH_INODE_OPTS() -+#undef x -+ -+#define x(_name, _bits) u##_bits _name; -+ BCH_INODE_OPTS() -+#undef x -+}; -+ -+struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); -+struct 
bch_opts bch2_inode_opts_to_opts(struct bch_io_opts); -+void bch2_io_opts_apply(struct bch_io_opts *, struct bch_io_opts); -+bool bch2_opt_is_inode_opt(enum bch_opt_id); -+ -+#endif /* _BCACHEFS_OPTS_H */ -diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c -new file mode 100644 -index 000000000000..d3032a46e7f3 ---- /dev/null -+++ b/fs/bcachefs/quota.c -@@ -0,0 +1,783 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "btree_update.h" -+#include "inode.h" -+#include "quota.h" -+#include "super-io.h" -+ -+static const char *bch2_sb_validate_quota(struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_quota *q = field_to_type(f, quota); -+ -+ if (vstruct_bytes(&q->field) != sizeof(*q)) -+ return "invalid field quota: wrong size"; -+ -+ return NULL; -+} -+ -+const struct bch_sb_field_ops bch_sb_field_ops_quota = { -+ .validate = bch2_sb_validate_quota, -+}; -+ -+const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k) -+{ -+ if (k.k->p.inode >= QTYP_NR) -+ return "invalid quota type"; -+ -+ if (bkey_val_bytes(k.k) != sizeof(struct bch_quota)) -+ return "incorrect value size"; -+ -+ return NULL; -+} -+ -+static const char * const bch2_quota_counters[] = { -+ "space", -+ "inodes", -+}; -+ -+void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_quota dq = bkey_s_c_to_quota(k); -+ unsigned i; -+ -+ for (i = 0; i < Q_COUNTERS; i++) -+ pr_buf(out, "%s hardlimit %llu softlimit %llu", -+ bch2_quota_counters[i], -+ le64_to_cpu(dq.v->c[i].hardlimit), -+ le64_to_cpu(dq.v->c[i].softlimit)); -+} -+ -+#ifdef CONFIG_BCACHEFS_QUOTA -+ -+#include -+#include -+#include -+ -+static inline unsigned __next_qtype(unsigned i, unsigned qtypes) -+{ -+ qtypes >>= i; -+ return qtypes ? 
i + __ffs(qtypes) : QTYP_NR; -+} -+ -+#define for_each_set_qtype(_c, _i, _q, _qtypes) \ -+ for (_i = 0; \ -+ (_i = __next_qtype(_i, _qtypes), \ -+ _q = &(_c)->quotas[_i], \ -+ _i < QTYP_NR); \ -+ _i++) -+ -+static bool ignore_hardlimit(struct bch_memquota_type *q) -+{ -+ if (capable(CAP_SYS_RESOURCE)) -+ return true; -+#if 0 -+ struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type]; -+ -+ return capable(CAP_SYS_RESOURCE) && -+ (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD || -+ !(info->dqi_flags & DQF_ROOT_SQUASH)); -+#endif -+ return false; -+} -+ -+enum quota_msg { -+ SOFTWARN, /* Softlimit reached */ -+ SOFTLONGWARN, /* Grace time expired */ -+ HARDWARN, /* Hardlimit reached */ -+ -+ HARDBELOW, /* Usage got below inode hardlimit */ -+ SOFTBELOW, /* Usage got below inode softlimit */ -+}; -+ -+static int quota_nl[][Q_COUNTERS] = { -+ [HARDWARN][Q_SPC] = QUOTA_NL_BHARDWARN, -+ [SOFTLONGWARN][Q_SPC] = QUOTA_NL_BSOFTLONGWARN, -+ [SOFTWARN][Q_SPC] = QUOTA_NL_BSOFTWARN, -+ [HARDBELOW][Q_SPC] = QUOTA_NL_BHARDBELOW, -+ [SOFTBELOW][Q_SPC] = QUOTA_NL_BSOFTBELOW, -+ -+ [HARDWARN][Q_INO] = QUOTA_NL_IHARDWARN, -+ [SOFTLONGWARN][Q_INO] = QUOTA_NL_ISOFTLONGWARN, -+ [SOFTWARN][Q_INO] = QUOTA_NL_ISOFTWARN, -+ [HARDBELOW][Q_INO] = QUOTA_NL_IHARDBELOW, -+ [SOFTBELOW][Q_INO] = QUOTA_NL_ISOFTBELOW, -+}; -+ -+struct quota_msgs { -+ u8 nr; -+ struct { -+ u8 qtype; -+ u8 msg; -+ } m[QTYP_NR * Q_COUNTERS]; -+}; -+ -+static void prepare_msg(unsigned qtype, -+ enum quota_counters counter, -+ struct quota_msgs *msgs, -+ enum quota_msg msg_type) -+{ -+ BUG_ON(msgs->nr >= ARRAY_SIZE(msgs->m)); -+ -+ msgs->m[msgs->nr].qtype = qtype; -+ msgs->m[msgs->nr].msg = quota_nl[msg_type][counter]; -+ msgs->nr++; -+} -+ -+static void prepare_warning(struct memquota_counter *qc, -+ unsigned qtype, -+ enum quota_counters counter, -+ struct quota_msgs *msgs, -+ enum quota_msg msg_type) -+{ -+ if (qc->warning_issued & (1 << msg_type)) -+ return; -+ -+ prepare_msg(qtype, counter, msgs, 
msg_type); -+} -+ -+static void flush_warnings(struct bch_qid qid, -+ struct super_block *sb, -+ struct quota_msgs *msgs) -+{ -+ unsigned i; -+ -+ for (i = 0; i < msgs->nr; i++) -+ quota_send_warning(make_kqid(&init_user_ns, msgs->m[i].qtype, qid.q[i]), -+ sb->s_dev, msgs->m[i].msg); -+} -+ -+static int bch2_quota_check_limit(struct bch_fs *c, -+ unsigned qtype, -+ struct bch_memquota *mq, -+ struct quota_msgs *msgs, -+ enum quota_counters counter, -+ s64 v, -+ enum quota_acct_mode mode) -+{ -+ struct bch_memquota_type *q = &c->quotas[qtype]; -+ struct memquota_counter *qc = &mq->c[counter]; -+ u64 n = qc->v + v; -+ -+ BUG_ON((s64) n < 0); -+ -+ if (mode == KEY_TYPE_QUOTA_NOCHECK) -+ return 0; -+ -+ if (v <= 0) { -+ if (n < qc->hardlimit && -+ (qc->warning_issued & (1 << HARDWARN))) { -+ qc->warning_issued &= ~(1 << HARDWARN); -+ prepare_msg(qtype, counter, msgs, HARDBELOW); -+ } -+ -+ if (n < qc->softlimit && -+ (qc->warning_issued & (1 << SOFTWARN))) { -+ qc->warning_issued &= ~(1 << SOFTWARN); -+ prepare_msg(qtype, counter, msgs, SOFTBELOW); -+ } -+ -+ qc->warning_issued = 0; -+ return 0; -+ } -+ -+ if (qc->hardlimit && -+ qc->hardlimit < n && -+ !ignore_hardlimit(q)) { -+ if (mode == KEY_TYPE_QUOTA_PREALLOC) -+ return -EDQUOT; -+ -+ prepare_warning(qc, qtype, counter, msgs, HARDWARN); -+ } -+ -+ if (qc->softlimit && -+ qc->softlimit < n && -+ qc->timer && -+ ktime_get_real_seconds() >= qc->timer && -+ !ignore_hardlimit(q)) { -+ if (mode == KEY_TYPE_QUOTA_PREALLOC) -+ return -EDQUOT; -+ -+ prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN); -+ } -+ -+ if (qc->softlimit && -+ qc->softlimit < n && -+ qc->timer == 0) { -+ if (mode == KEY_TYPE_QUOTA_PREALLOC) -+ return -EDQUOT; -+ -+ prepare_warning(qc, qtype, counter, msgs, SOFTWARN); -+ -+ /* XXX is this the right one? 
*/ -+ qc->timer = ktime_get_real_seconds() + -+ q->limits[counter].warnlimit; -+ } -+ -+ return 0; -+} -+ -+int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid, -+ enum quota_counters counter, s64 v, -+ enum quota_acct_mode mode) -+{ -+ unsigned qtypes = enabled_qtypes(c); -+ struct bch_memquota_type *q; -+ struct bch_memquota *mq[QTYP_NR]; -+ struct quota_msgs msgs; -+ unsigned i; -+ int ret = 0; -+ -+ memset(&msgs, 0, sizeof(msgs)); -+ -+ for_each_set_qtype(c, i, q, qtypes) -+ mutex_lock_nested(&q->lock, i); -+ -+ for_each_set_qtype(c, i, q, qtypes) { -+ mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_NOFS); -+ if (!mq[i]) { -+ ret = -ENOMEM; -+ goto err; -+ } -+ -+ ret = bch2_quota_check_limit(c, i, mq[i], &msgs, counter, v, mode); -+ if (ret) -+ goto err; -+ } -+ -+ for_each_set_qtype(c, i, q, qtypes) -+ mq[i]->c[counter].v += v; -+err: -+ for_each_set_qtype(c, i, q, qtypes) -+ mutex_unlock(&q->lock); -+ -+ flush_warnings(qid, c->vfs_sb, &msgs); -+ -+ return ret; -+} -+ -+static void __bch2_quota_transfer(struct bch_memquota *src_q, -+ struct bch_memquota *dst_q, -+ enum quota_counters counter, s64 v) -+{ -+ BUG_ON(v > src_q->c[counter].v); -+ BUG_ON(v + dst_q->c[counter].v < v); -+ -+ src_q->c[counter].v -= v; -+ dst_q->c[counter].v += v; -+} -+ -+int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, -+ struct bch_qid dst, -+ struct bch_qid src, u64 space, -+ enum quota_acct_mode mode) -+{ -+ struct bch_memquota_type *q; -+ struct bch_memquota *src_q[3], *dst_q[3]; -+ struct quota_msgs msgs; -+ unsigned i; -+ int ret = 0; -+ -+ qtypes &= enabled_qtypes(c); -+ -+ memset(&msgs, 0, sizeof(msgs)); -+ -+ for_each_set_qtype(c, i, q, qtypes) -+ mutex_lock_nested(&q->lock, i); -+ -+ for_each_set_qtype(c, i, q, qtypes) { -+ src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_NOFS); -+ dst_q[i] = genradix_ptr_alloc(&q->table, dst.q[i], GFP_NOFS); -+ -+ if (!src_q[i] || !dst_q[i]) { -+ ret = -ENOMEM; -+ goto err; -+ } -+ -+ ret = 
bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_SPC, -+ dst_q[i]->c[Q_SPC].v + space, -+ mode); -+ if (ret) -+ goto err; -+ -+ ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_INO, -+ dst_q[i]->c[Q_INO].v + 1, -+ mode); -+ if (ret) -+ goto err; -+ } -+ -+ for_each_set_qtype(c, i, q, qtypes) { -+ __bch2_quota_transfer(src_q[i], dst_q[i], Q_SPC, space); -+ __bch2_quota_transfer(src_q[i], dst_q[i], Q_INO, 1); -+ } -+ -+err: -+ for_each_set_qtype(c, i, q, qtypes) -+ mutex_unlock(&q->lock); -+ -+ flush_warnings(dst, c->vfs_sb, &msgs); -+ -+ return ret; -+} -+ -+static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bkey_s_c_quota dq; -+ struct bch_memquota_type *q; -+ struct bch_memquota *mq; -+ unsigned i; -+ -+ BUG_ON(k.k->p.inode >= QTYP_NR); -+ -+ switch (k.k->type) { -+ case KEY_TYPE_quota: -+ dq = bkey_s_c_to_quota(k); -+ q = &c->quotas[k.k->p.inode]; -+ -+ mutex_lock(&q->lock); -+ mq = genradix_ptr_alloc(&q->table, k.k->p.offset, GFP_KERNEL); -+ if (!mq) { -+ mutex_unlock(&q->lock); -+ return -ENOMEM; -+ } -+ -+ for (i = 0; i < Q_COUNTERS; i++) { -+ mq->c[i].hardlimit = le64_to_cpu(dq.v->c[i].hardlimit); -+ mq->c[i].softlimit = le64_to_cpu(dq.v->c[i].softlimit); -+ } -+ -+ mutex_unlock(&q->lock); -+ } -+ -+ return 0; -+} -+ -+static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_QUOTAS, POS(type, 0), -+ BTREE_ITER_PREFETCH, k, ret) { -+ if (k.k->p.inode != type) -+ break; -+ -+ ret = __bch2_quota_set(c, k); -+ if (ret) -+ break; -+ } -+ -+ return bch2_trans_exit(&trans) ?: ret; -+} -+ -+void bch2_fs_quota_exit(struct bch_fs *c) -+{ -+ unsigned i; -+ -+ for (i = 0; i < ARRAY_SIZE(c->quotas); i++) -+ genradix_free(&c->quotas[i].table); -+} -+ -+void bch2_fs_quota_init(struct bch_fs *c) -+{ -+ unsigned i; -+ -+ for (i = 0; i < 
ARRAY_SIZE(c->quotas); i++) -+ mutex_init(&c->quotas[i].lock); -+} -+ -+static void bch2_sb_quota_read(struct bch_fs *c) -+{ -+ struct bch_sb_field_quota *sb_quota; -+ unsigned i, j; -+ -+ sb_quota = bch2_sb_get_quota(c->disk_sb.sb); -+ if (!sb_quota) -+ return; -+ -+ for (i = 0; i < QTYP_NR; i++) { -+ struct bch_memquota_type *q = &c->quotas[i]; -+ -+ for (j = 0; j < Q_COUNTERS; j++) { -+ q->limits[j].timelimit = -+ le32_to_cpu(sb_quota->q[i].c[j].timelimit); -+ q->limits[j].warnlimit = -+ le32_to_cpu(sb_quota->q[i].c[j].warnlimit); -+ } -+ } -+} -+ -+int bch2_fs_quota_read(struct bch_fs *c) -+{ -+ unsigned i, qtypes = enabled_qtypes(c); -+ struct bch_memquota_type *q; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bch_inode_unpacked u; -+ struct bkey_s_c k; -+ int ret; -+ -+ mutex_lock(&c->sb_lock); -+ bch2_sb_quota_read(c); -+ mutex_unlock(&c->sb_lock); -+ -+ for_each_set_qtype(c, i, q, qtypes) { -+ ret = bch2_quota_init_type(c, i); -+ if (ret) -+ return ret; -+ } -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN, -+ BTREE_ITER_PREFETCH, k, ret) { -+ switch (k.k->type) { -+ case KEY_TYPE_inode: -+ ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u); -+ if (ret) -+ return ret; -+ -+ bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors, -+ KEY_TYPE_QUOTA_NOCHECK); -+ bch2_quota_acct(c, bch_qid(&u), Q_INO, 1, -+ KEY_TYPE_QUOTA_NOCHECK); -+ } -+ } -+ return bch2_trans_exit(&trans) ?: ret; -+} -+ -+/* Enable/disable/delete quotas for an entire filesystem: */ -+ -+static int bch2_quota_enable(struct super_block *sb, unsigned uflags) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ -+ if (sb->s_flags & SB_RDONLY) -+ return -EROFS; -+ -+ /* Accounting must be enabled at mount time: */ -+ if (uflags & (FS_QUOTA_UDQ_ACCT|FS_QUOTA_GDQ_ACCT|FS_QUOTA_PDQ_ACCT)) -+ return -EINVAL; -+ -+ /* Can't enable enforcement without accounting: */ -+ if ((uflags & FS_QUOTA_UDQ_ENFD) && !c->opts.usrquota) -+ return 
-EINVAL; -+ -+ if ((uflags & FS_QUOTA_GDQ_ENFD) && !c->opts.grpquota) -+ return -EINVAL; -+ -+ if (uflags & FS_QUOTA_PDQ_ENFD && !c->opts.prjquota) -+ return -EINVAL; -+ -+ mutex_lock(&c->sb_lock); -+ if (uflags & FS_QUOTA_UDQ_ENFD) -+ SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true); -+ -+ if (uflags & FS_QUOTA_GDQ_ENFD) -+ SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, true); -+ -+ if (uflags & FS_QUOTA_PDQ_ENFD) -+ SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true); -+ -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ return 0; -+} -+ -+static int bch2_quota_disable(struct super_block *sb, unsigned uflags) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ -+ if (sb->s_flags & SB_RDONLY) -+ return -EROFS; -+ -+ mutex_lock(&c->sb_lock); -+ if (uflags & FS_QUOTA_UDQ_ENFD) -+ SET_BCH_SB_USRQUOTA(c->disk_sb.sb, false); -+ -+ if (uflags & FS_QUOTA_GDQ_ENFD) -+ SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, false); -+ -+ if (uflags & FS_QUOTA_PDQ_ENFD) -+ SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, false); -+ -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ return 0; -+} -+ -+static int bch2_quota_remove(struct super_block *sb, unsigned uflags) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ int ret; -+ -+ if (sb->s_flags & SB_RDONLY) -+ return -EROFS; -+ -+ if (uflags & FS_USER_QUOTA) { -+ if (c->opts.usrquota) -+ return -EINVAL; -+ -+ ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, -+ POS(QTYP_USR, 0), -+ POS(QTYP_USR + 1, 0), -+ NULL); -+ if (ret) -+ return ret; -+ } -+ -+ if (uflags & FS_GROUP_QUOTA) { -+ if (c->opts.grpquota) -+ return -EINVAL; -+ -+ ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, -+ POS(QTYP_GRP, 0), -+ POS(QTYP_GRP + 1, 0), -+ NULL); -+ if (ret) -+ return ret; -+ } -+ -+ if (uflags & FS_PROJ_QUOTA) { -+ if (c->opts.prjquota) -+ return -EINVAL; -+ -+ ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, -+ POS(QTYP_PRJ, 0), -+ POS(QTYP_PRJ + 1, 0), -+ NULL); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+/* -+ * Return quota status information, such as enforcements, quota 
file inode -+ * numbers etc. -+ */ -+static int bch2_quota_get_state(struct super_block *sb, struct qc_state *state) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ unsigned qtypes = enabled_qtypes(c); -+ unsigned i; -+ -+ memset(state, 0, sizeof(*state)); -+ -+ for (i = 0; i < QTYP_NR; i++) { -+ state->s_state[i].flags |= QCI_SYSFILE; -+ -+ if (!(qtypes & (1 << i))) -+ continue; -+ -+ state->s_state[i].flags |= QCI_ACCT_ENABLED; -+ -+ state->s_state[i].spc_timelimit = c->quotas[i].limits[Q_SPC].timelimit; -+ state->s_state[i].spc_warnlimit = c->quotas[i].limits[Q_SPC].warnlimit; -+ -+ state->s_state[i].ino_timelimit = c->quotas[i].limits[Q_INO].timelimit; -+ state->s_state[i].ino_warnlimit = c->quotas[i].limits[Q_INO].warnlimit; -+ } -+ -+ return 0; -+} -+ -+/* -+ * Adjust quota timers & warnings -+ */ -+static int bch2_quota_set_info(struct super_block *sb, int type, -+ struct qc_info *info) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ struct bch_sb_field_quota *sb_quota; -+ struct bch_memquota_type *q; -+ -+ if (sb->s_flags & SB_RDONLY) -+ return -EROFS; -+ -+ if (type >= QTYP_NR) -+ return -EINVAL; -+ -+ if (!((1 << type) & enabled_qtypes(c))) -+ return -ESRCH; -+ -+ if (info->i_fieldmask & -+ ~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS)) -+ return -EINVAL; -+ -+ q = &c->quotas[type]; -+ -+ mutex_lock(&c->sb_lock); -+ sb_quota = bch2_sb_get_quota(c->disk_sb.sb); -+ if (!sb_quota) { -+ sb_quota = bch2_sb_resize_quota(&c->disk_sb, -+ sizeof(*sb_quota) / sizeof(u64)); -+ if (!sb_quota) -+ return -ENOSPC; -+ } -+ -+ if (info->i_fieldmask & QC_SPC_TIMER) -+ sb_quota->q[type].c[Q_SPC].timelimit = -+ cpu_to_le32(info->i_spc_timelimit); -+ -+ if (info->i_fieldmask & QC_SPC_WARNS) -+ sb_quota->q[type].c[Q_SPC].warnlimit = -+ cpu_to_le32(info->i_spc_warnlimit); -+ -+ if (info->i_fieldmask & QC_INO_TIMER) -+ sb_quota->q[type].c[Q_INO].timelimit = -+ cpu_to_le32(info->i_ino_timelimit); -+ -+ if (info->i_fieldmask & QC_INO_WARNS) -+ sb_quota->q[type].c[Q_INO].warnlimit 
= -+ cpu_to_le32(info->i_ino_warnlimit); -+ -+ bch2_sb_quota_read(c); -+ -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ return 0; -+} -+ -+/* Get/set individual quotas: */ -+ -+static void __bch2_quota_get(struct qc_dqblk *dst, struct bch_memquota *src) -+{ -+ dst->d_space = src->c[Q_SPC].v << 9; -+ dst->d_spc_hardlimit = src->c[Q_SPC].hardlimit << 9; -+ dst->d_spc_softlimit = src->c[Q_SPC].softlimit << 9; -+ dst->d_spc_timer = src->c[Q_SPC].timer; -+ dst->d_spc_warns = src->c[Q_SPC].warns; -+ -+ dst->d_ino_count = src->c[Q_INO].v; -+ dst->d_ino_hardlimit = src->c[Q_INO].hardlimit; -+ dst->d_ino_softlimit = src->c[Q_INO].softlimit; -+ dst->d_ino_timer = src->c[Q_INO].timer; -+ dst->d_ino_warns = src->c[Q_INO].warns; -+} -+ -+static int bch2_get_quota(struct super_block *sb, struct kqid kqid, -+ struct qc_dqblk *qdq) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ struct bch_memquota_type *q = &c->quotas[kqid.type]; -+ qid_t qid = from_kqid(&init_user_ns, kqid); -+ struct bch_memquota *mq; -+ -+ memset(qdq, 0, sizeof(*qdq)); -+ -+ mutex_lock(&q->lock); -+ mq = genradix_ptr(&q->table, qid); -+ if (mq) -+ __bch2_quota_get(qdq, mq); -+ mutex_unlock(&q->lock); -+ -+ return 0; -+} -+ -+static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid, -+ struct qc_dqblk *qdq) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ struct bch_memquota_type *q = &c->quotas[kqid->type]; -+ qid_t qid = from_kqid(&init_user_ns, *kqid); -+ struct genradix_iter iter; -+ struct bch_memquota *mq; -+ int ret = 0; -+ -+ mutex_lock(&q->lock); -+ -+ genradix_for_each_from(&q->table, iter, mq, qid) -+ if (memcmp(mq, page_address(ZERO_PAGE(0)), sizeof(*mq))) { -+ __bch2_quota_get(qdq, mq); -+ *kqid = make_kqid(current_user_ns(), kqid->type, iter.pos); -+ goto found; -+ } -+ -+ ret = -ENOENT; -+found: -+ mutex_unlock(&q->lock); -+ return ret; -+} -+ -+static int bch2_set_quota_trans(struct btree_trans *trans, -+ struct bkey_i_quota *new_quota, -+ struct qc_dqblk *qdq) -+{ -+ struct 
btree_iter *iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ iter = bch2_trans_get_iter(trans, BTREE_ID_QUOTAS, new_quota->k.p, -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); -+ k = bch2_btree_iter_peek_slot(iter); -+ -+ ret = bkey_err(k); -+ if (unlikely(ret)) -+ return ret; -+ -+ if (k.k->type == KEY_TYPE_quota) -+ new_quota->v = *bkey_s_c_to_quota(k).v; -+ -+ if (qdq->d_fieldmask & QC_SPC_SOFT) -+ new_quota->v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit >> 9); -+ if (qdq->d_fieldmask & QC_SPC_HARD) -+ new_quota->v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit >> 9); -+ -+ if (qdq->d_fieldmask & QC_INO_SOFT) -+ new_quota->v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit); -+ if (qdq->d_fieldmask & QC_INO_HARD) -+ new_quota->v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit); -+ -+ return bch2_trans_update(trans, iter, &new_quota->k_i, 0); -+} -+ -+static int bch2_set_quota(struct super_block *sb, struct kqid qid, -+ struct qc_dqblk *qdq) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ struct btree_trans trans; -+ struct bkey_i_quota new_quota; -+ int ret; -+ -+ if (sb->s_flags & SB_RDONLY) -+ return -EROFS; -+ -+ bkey_quota_init(&new_quota.k_i); -+ new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid)); -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOUNLOCK, -+ bch2_set_quota_trans(&trans, &new_quota, qdq)) ?: -+ __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i)); -+ -+ bch2_trans_exit(&trans); -+ -+ return ret; -+} -+ -+const struct quotactl_ops bch2_quotactl_operations = { -+ .quota_enable = bch2_quota_enable, -+ .quota_disable = bch2_quota_disable, -+ .rm_xquota = bch2_quota_remove, -+ -+ .get_state = bch2_quota_get_state, -+ .set_info = bch2_quota_set_info, -+ -+ .get_dqblk = bch2_get_quota, -+ .get_nextdqblk = bch2_get_next_quota, -+ .set_dqblk = bch2_set_quota, -+}; -+ -+#endif /* CONFIG_BCACHEFS_QUOTA */ -diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h -new file mode 100644 
-index 000000000000..51e4f9713ef0 ---- /dev/null -+++ b/fs/bcachefs/quota.h -@@ -0,0 +1,71 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_QUOTA_H -+#define _BCACHEFS_QUOTA_H -+ -+#include "inode.h" -+#include "quota_types.h" -+ -+extern const struct bch_sb_field_ops bch_sb_field_ops_quota; -+ -+const char *bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c); -+void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+ -+#define bch2_bkey_ops_quota (struct bkey_ops) { \ -+ .key_invalid = bch2_quota_invalid, \ -+ .val_to_text = bch2_quota_to_text, \ -+} -+ -+static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u) -+{ -+ return (struct bch_qid) { -+ .q[QTYP_USR] = u->bi_uid, -+ .q[QTYP_GRP] = u->bi_gid, -+ .q[QTYP_PRJ] = u->bi_project ? u->bi_project - 1 : 0, -+ }; -+} -+ -+static inline unsigned enabled_qtypes(struct bch_fs *c) -+{ -+ return ((c->opts.usrquota << QTYP_USR)| -+ (c->opts.grpquota << QTYP_GRP)| -+ (c->opts.prjquota << QTYP_PRJ)); -+} -+ -+#ifdef CONFIG_BCACHEFS_QUOTA -+ -+int bch2_quota_acct(struct bch_fs *, struct bch_qid, enum quota_counters, -+ s64, enum quota_acct_mode); -+ -+int bch2_quota_transfer(struct bch_fs *, unsigned, struct bch_qid, -+ struct bch_qid, u64, enum quota_acct_mode); -+ -+void bch2_fs_quota_exit(struct bch_fs *); -+void bch2_fs_quota_init(struct bch_fs *); -+int bch2_fs_quota_read(struct bch_fs *); -+ -+extern const struct quotactl_ops bch2_quotactl_operations; -+ -+#else -+ -+static inline int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid, -+ enum quota_counters counter, s64 v, -+ enum quota_acct_mode mode) -+{ -+ return 0; -+} -+ -+static inline int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, -+ struct bch_qid dst, -+ struct bch_qid src, u64 space, -+ enum quota_acct_mode mode) -+{ -+ return 0; -+} -+ -+static inline void bch2_fs_quota_exit(struct bch_fs *c) {} -+static inline void bch2_fs_quota_init(struct bch_fs *c) {} -+static inline int 
bch2_fs_quota_read(struct bch_fs *c) { return 0; } -+ -+#endif -+ -+#endif /* _BCACHEFS_QUOTA_H */ -diff --git a/fs/bcachefs/quota_types.h b/fs/bcachefs/quota_types.h -new file mode 100644 -index 000000000000..6a136083d389 ---- /dev/null -+++ b/fs/bcachefs/quota_types.h -@@ -0,0 +1,43 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_QUOTA_TYPES_H -+#define _BCACHEFS_QUOTA_TYPES_H -+ -+#include -+ -+struct bch_qid { -+ u32 q[QTYP_NR]; -+}; -+ -+enum quota_acct_mode { -+ KEY_TYPE_QUOTA_PREALLOC, -+ KEY_TYPE_QUOTA_WARN, -+ KEY_TYPE_QUOTA_NOCHECK, -+}; -+ -+struct memquota_counter { -+ u64 v; -+ u64 hardlimit; -+ u64 softlimit; -+ s64 timer; -+ int warns; -+ int warning_issued; -+}; -+ -+struct bch_memquota { -+ struct memquota_counter c[Q_COUNTERS]; -+}; -+ -+typedef GENRADIX(struct bch_memquota) bch_memquota_table; -+ -+struct quota_limit { -+ u32 timelimit; -+ u32 warnlimit; -+}; -+ -+struct bch_memquota_type { -+ struct quota_limit limits[Q_COUNTERS]; -+ bch_memquota_table table; -+ struct mutex lock; -+}; -+ -+#endif /* _BCACHEFS_QUOTA_TYPES_H */ -diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c -new file mode 100644 -index 000000000000..56a1f761271f ---- /dev/null -+++ b/fs/bcachefs/rebalance.c -@@ -0,0 +1,331 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "alloc_foreground.h" -+#include "btree_iter.h" -+#include "buckets.h" -+#include "clock.h" -+#include "disk_groups.h" -+#include "extents.h" -+#include "io.h" -+#include "move.h" -+#include "rebalance.h" -+#include "super-io.h" -+ -+#include -+#include -+#include -+#include -+ -+/* -+ * Check if an extent should be moved: -+ * returns -1 if it should not be moved, or -+ * device of pointer that should be moved, if known, or INT_MAX if unknown -+ */ -+static int __bch2_rebalance_pred(struct bch_fs *c, -+ struct bkey_s_c k, -+ struct bch_io_opts *io_opts) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ 
struct extent_ptr_decoded p; -+ -+ if (io_opts->background_compression && -+ !bch2_bkey_is_incompressible(k)) -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) -+ if (!p.ptr.cached && -+ p.crc.compression_type != -+ bch2_compression_opt_to_type[io_opts->background_compression]) -+ return p.ptr.dev; -+ -+ if (io_opts->background_target) -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) -+ if (!p.ptr.cached && -+ !bch2_dev_in_target(c, p.ptr.dev, io_opts->background_target)) -+ return p.ptr.dev; -+ -+ return -1; -+} -+ -+void bch2_rebalance_add_key(struct bch_fs *c, -+ struct bkey_s_c k, -+ struct bch_io_opts *io_opts) -+{ -+ atomic64_t *counter; -+ int dev; -+ -+ dev = __bch2_rebalance_pred(c, k, io_opts); -+ if (dev < 0) -+ return; -+ -+ counter = dev < INT_MAX -+ ? &bch_dev_bkey_exists(c, dev)->rebalance_work -+ : &c->rebalance.work_unknown_dev; -+ -+ if (atomic64_add_return(k.k->size, counter) == k.k->size) -+ rebalance_wakeup(c); -+} -+ -+static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg, -+ struct bkey_s_c k, -+ struct bch_io_opts *io_opts, -+ struct data_opts *data_opts) -+{ -+ if (__bch2_rebalance_pred(c, k, io_opts) >= 0) { -+ data_opts->target = io_opts->background_target; -+ data_opts->btree_insert_flags = 0; -+ return DATA_ADD_REPLICAS; -+ } else { -+ return DATA_SKIP; -+ } -+} -+ -+void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors) -+{ -+ if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) == -+ sectors) -+ rebalance_wakeup(c); -+} -+ -+struct rebalance_work { -+ int dev_most_full_idx; -+ unsigned dev_most_full_percent; -+ u64 dev_most_full_work; -+ u64 dev_most_full_capacity; -+ u64 total_work; -+}; -+ -+static void rebalance_work_accumulate(struct rebalance_work *w, -+ u64 dev_work, u64 unknown_dev, u64 capacity, int idx) -+{ -+ unsigned percent_full; -+ u64 work = dev_work + unknown_dev; -+ -+ if (work < dev_work || work < unknown_dev) -+ work = U64_MAX; -+ work = min(work, capacity); -+ -+ percent_full = 
div64_u64(work * 100, capacity); -+ -+ if (percent_full >= w->dev_most_full_percent) { -+ w->dev_most_full_idx = idx; -+ w->dev_most_full_percent = percent_full; -+ w->dev_most_full_work = work; -+ w->dev_most_full_capacity = capacity; -+ } -+ -+ if (w->total_work + dev_work >= w->total_work && -+ w->total_work + dev_work >= dev_work) -+ w->total_work += dev_work; -+} -+ -+static struct rebalance_work rebalance_work(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ struct rebalance_work ret = { .dev_most_full_idx = -1 }; -+ u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev); -+ unsigned i; -+ -+ for_each_online_member(ca, c, i) -+ rebalance_work_accumulate(&ret, -+ atomic64_read(&ca->rebalance_work), -+ unknown_dev, -+ bucket_to_sector(ca, ca->mi.nbuckets - -+ ca->mi.first_bucket), -+ i); -+ -+ rebalance_work_accumulate(&ret, -+ unknown_dev, 0, c->capacity, -1); -+ -+ return ret; -+} -+ -+static void rebalance_work_reset(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ unsigned i; -+ -+ for_each_online_member(ca, c, i) -+ atomic64_set(&ca->rebalance_work, 0); -+ -+ atomic64_set(&c->rebalance.work_unknown_dev, 0); -+} -+ -+static unsigned long curr_cputime(void) -+{ -+ u64 utime, stime; -+ -+ task_cputime_adjusted(current, &utime, &stime); -+ return nsecs_to_jiffies(utime + stime); -+} -+ -+static int bch2_rebalance_thread(void *arg) -+{ -+ struct bch_fs *c = arg; -+ struct bch_fs_rebalance *r = &c->rebalance; -+ struct io_clock *clock = &c->io_clock[WRITE]; -+ struct rebalance_work w, p; -+ unsigned long start, prev_start; -+ unsigned long prev_run_time, prev_run_cputime; -+ unsigned long cputime, prev_cputime; -+ unsigned long io_start; -+ long throttle; -+ -+ set_freezable(); -+ -+ io_start = atomic_long_read(&clock->now); -+ p = rebalance_work(c); -+ prev_start = jiffies; -+ prev_cputime = curr_cputime(); -+ -+ while (!kthread_wait_freezable(r->enabled)) { -+ cond_resched(); -+ -+ start = jiffies; -+ cputime = curr_cputime(); -+ -+ prev_run_time = 
start - prev_start; -+ prev_run_cputime = cputime - prev_cputime; -+ -+ w = rebalance_work(c); -+ BUG_ON(!w.dev_most_full_capacity); -+ -+ if (!w.total_work) { -+ r->state = REBALANCE_WAITING; -+ kthread_wait_freezable(rebalance_work(c).total_work); -+ continue; -+ } -+ -+ /* -+ * If there isn't much work to do, throttle cpu usage: -+ */ -+ throttle = prev_run_cputime * 100 / -+ max(1U, w.dev_most_full_percent) - -+ prev_run_time; -+ -+ if (w.dev_most_full_percent < 20 && throttle > 0) { -+ r->throttled_until_iotime = io_start + -+ div_u64(w.dev_most_full_capacity * -+ (20 - w.dev_most_full_percent), -+ 50); -+ -+ if (atomic_long_read(&clock->now) + clock->max_slop < -+ r->throttled_until_iotime) { -+ r->throttled_until_cputime = start + throttle; -+ r->state = REBALANCE_THROTTLED; -+ -+ bch2_kthread_io_clock_wait(clock, -+ r->throttled_until_iotime, -+ throttle); -+ continue; -+ } -+ } -+ -+ /* minimum 1 mb/sec: */ -+ r->pd.rate.rate = -+ max_t(u64, 1 << 11, -+ r->pd.rate.rate * -+ max(p.dev_most_full_percent, 1U) / -+ max(w.dev_most_full_percent, 1U)); -+ -+ io_start = atomic_long_read(&clock->now); -+ p = w; -+ prev_start = start; -+ prev_cputime = cputime; -+ -+ r->state = REBALANCE_RUNNING; -+ memset(&r->move_stats, 0, sizeof(r->move_stats)); -+ rebalance_work_reset(c); -+ -+ bch2_move_data(c, -+ /* ratelimiting disabled for now */ -+ NULL, /* &r->pd.rate, */ -+ writepoint_ptr(&c->rebalance_write_point), -+ POS_MIN, POS_MAX, -+ rebalance_pred, NULL, -+ &r->move_stats); -+ } -+ -+ return 0; -+} -+ -+void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+ struct bch_fs_rebalance *r = &c->rebalance; -+ struct rebalance_work w = rebalance_work(c); -+ char h1[21], h2[21]; -+ -+ bch2_hprint(&PBUF(h1), w.dev_most_full_work << 9); -+ bch2_hprint(&PBUF(h2), w.dev_most_full_capacity << 9); -+ pr_buf(out, "fullest_dev (%i):\t%s/%s\n", -+ w.dev_most_full_idx, h1, h2); -+ -+ bch2_hprint(&PBUF(h1), w.total_work << 9); -+ bch2_hprint(&PBUF(h2), 
c->capacity << 9); -+ pr_buf(out, "total work:\t\t%s/%s\n", h1, h2); -+ -+ pr_buf(out, "rate:\t\t\t%u\n", r->pd.rate.rate); -+ -+ switch (r->state) { -+ case REBALANCE_WAITING: -+ pr_buf(out, "waiting\n"); -+ break; -+ case REBALANCE_THROTTLED: -+ bch2_hprint(&PBUF(h1), -+ (r->throttled_until_iotime - -+ atomic_long_read(&c->io_clock[WRITE].now)) << 9); -+ pr_buf(out, "throttled for %lu sec or %s io\n", -+ (r->throttled_until_cputime - jiffies) / HZ, -+ h1); -+ break; -+ case REBALANCE_RUNNING: -+ pr_buf(out, "running\n"); -+ pr_buf(out, "pos %llu:%llu\n", -+ r->move_stats.pos.inode, -+ r->move_stats.pos.offset); -+ break; -+ } -+} -+ -+void bch2_rebalance_stop(struct bch_fs *c) -+{ -+ struct task_struct *p; -+ -+ c->rebalance.pd.rate.rate = UINT_MAX; -+ bch2_ratelimit_reset(&c->rebalance.pd.rate); -+ -+ p = rcu_dereference_protected(c->rebalance.thread, 1); -+ c->rebalance.thread = NULL; -+ -+ if (p) { -+ /* for sychronizing with rebalance_wakeup() */ -+ synchronize_rcu(); -+ -+ kthread_stop(p); -+ put_task_struct(p); -+ } -+} -+ -+int bch2_rebalance_start(struct bch_fs *c) -+{ -+ struct task_struct *p; -+ -+ if (c->opts.nochanges) -+ return 0; -+ -+ p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance"); -+ if (IS_ERR(p)) -+ return PTR_ERR(p); -+ -+ get_task_struct(p); -+ rcu_assign_pointer(c->rebalance.thread, p); -+ wake_up_process(p); -+ return 0; -+} -+ -+void bch2_fs_rebalance_init(struct bch_fs *c) -+{ -+ bch2_pd_controller_init(&c->rebalance.pd); -+ -+ atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX); -+} -diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h -new file mode 100644 -index 000000000000..7ade0bb81cce ---- /dev/null -+++ b/fs/bcachefs/rebalance.h -@@ -0,0 +1,28 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_REBALANCE_H -+#define _BCACHEFS_REBALANCE_H -+ -+#include "rebalance_types.h" -+ -+static inline void rebalance_wakeup(struct bch_fs *c) -+{ -+ struct task_struct *p; -+ -+ rcu_read_lock(); -+ p = 
rcu_dereference(c->rebalance.thread); -+ if (p) -+ wake_up_process(p); -+ rcu_read_unlock(); -+} -+ -+void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c, -+ struct bch_io_opts *); -+void bch2_rebalance_add_work(struct bch_fs *, u64); -+ -+void bch2_rebalance_work_to_text(struct printbuf *, struct bch_fs *); -+ -+void bch2_rebalance_stop(struct bch_fs *); -+int bch2_rebalance_start(struct bch_fs *); -+void bch2_fs_rebalance_init(struct bch_fs *); -+ -+#endif /* _BCACHEFS_REBALANCE_H */ -diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h -new file mode 100644 -index 000000000000..192c6be20ced ---- /dev/null -+++ b/fs/bcachefs/rebalance_types.h -@@ -0,0 +1,27 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_REBALANCE_TYPES_H -+#define _BCACHEFS_REBALANCE_TYPES_H -+ -+#include "move_types.h" -+ -+enum rebalance_state { -+ REBALANCE_WAITING, -+ REBALANCE_THROTTLED, -+ REBALANCE_RUNNING, -+}; -+ -+struct bch_fs_rebalance { -+ struct task_struct __rcu *thread; -+ struct bch_pd_controller pd; -+ -+ atomic64_t work_unknown_dev; -+ -+ enum rebalance_state state; -+ unsigned long throttled_until_iotime; -+ unsigned long throttled_until_cputime; -+ struct bch_move_stats move_stats; -+ -+ unsigned enabled:1; -+}; -+ -+#endif /* _BCACHEFS_REBALANCE_TYPES_H */ -diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c -new file mode 100644 -index 000000000000..6e829bf0a31f ---- /dev/null -+++ b/fs/bcachefs/recovery.c -@@ -0,0 +1,1330 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "alloc_background.h" -+#include "btree_gc.h" -+#include "btree_update.h" -+#include "btree_update_interior.h" -+#include "btree_io.h" -+#include "buckets.h" -+#include "dirent.h" -+#include "ec.h" -+#include "error.h" -+#include "fs-common.h" -+#include "fsck.h" -+#include "journal_io.h" -+#include "journal_reclaim.h" -+#include "journal_seq_blacklist.h" -+#include "quota.h" -+#include "recovery.h" -+#include "replicas.h" 
-+#include "super-io.h" -+ -+#include -+#include -+ -+#define QSTR(n) { { { .len = strlen(n) } }, .name = n } -+ -+/* iterate over keys read from the journal: */ -+ -+static struct journal_key *journal_key_search(struct journal_keys *journal_keys, -+ enum btree_id id, unsigned level, -+ struct bpos pos) -+{ -+ size_t l = 0, r = journal_keys->nr, m; -+ -+ while (l < r) { -+ m = l + ((r - l) >> 1); -+ if ((cmp_int(id, journal_keys->d[m].btree_id) ?: -+ cmp_int(level, journal_keys->d[m].level) ?: -+ bkey_cmp(pos, journal_keys->d[m].k->k.p)) > 0) -+ l = m + 1; -+ else -+ r = m; -+ } -+ -+ BUG_ON(l < journal_keys->nr && -+ (cmp_int(id, journal_keys->d[l].btree_id) ?: -+ cmp_int(level, journal_keys->d[l].level) ?: -+ bkey_cmp(pos, journal_keys->d[l].k->k.p)) > 0); -+ -+ BUG_ON(l && -+ (cmp_int(id, journal_keys->d[l - 1].btree_id) ?: -+ cmp_int(level, journal_keys->d[l - 1].level) ?: -+ bkey_cmp(pos, journal_keys->d[l - 1].k->k.p)) <= 0); -+ -+ return l < journal_keys->nr ? journal_keys->d + l : NULL; -+} -+ -+static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter) -+{ -+ if (iter->k && -+ iter->k < iter->keys->d + iter->keys->nr && -+ iter->k->btree_id == iter->btree_id && -+ iter->k->level == iter->level) -+ return iter->k->k; -+ -+ iter->k = NULL; -+ return NULL; -+} -+ -+static void bch2_journal_iter_advance(struct journal_iter *iter) -+{ -+ if (iter->k) -+ iter->k++; -+} -+ -+static void bch2_journal_iter_init(struct journal_iter *iter, -+ struct journal_keys *journal_keys, -+ enum btree_id id, unsigned level, -+ struct bpos pos) -+{ -+ iter->btree_id = id; -+ iter->level = level; -+ iter->keys = journal_keys; -+ iter->k = journal_key_search(journal_keys, id, level, pos); -+} -+ -+static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) -+{ -+ return iter->btree -+ ? 
bch2_btree_iter_peek(iter->btree) -+ : bch2_btree_node_iter_peek_unpack(&iter->node_iter, -+ iter->b, &iter->unpacked); -+} -+ -+static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter) -+{ -+ if (iter->btree) -+ bch2_btree_iter_next(iter->btree); -+ else -+ bch2_btree_node_iter_advance(&iter->node_iter, iter->b); -+} -+ -+void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) -+{ -+ switch (iter->last) { -+ case none: -+ break; -+ case btree: -+ bch2_journal_iter_advance_btree(iter); -+ break; -+ case journal: -+ bch2_journal_iter_advance(&iter->journal); -+ break; -+ } -+ -+ iter->last = none; -+} -+ -+struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) -+{ -+ struct bkey_s_c ret; -+ -+ while (1) { -+ struct bkey_s_c btree_k = -+ bch2_journal_iter_peek_btree(iter); -+ struct bkey_s_c journal_k = -+ bkey_i_to_s_c(bch2_journal_iter_peek(&iter->journal)); -+ -+ if (btree_k.k && journal_k.k) { -+ int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p); -+ -+ if (!cmp) -+ bch2_journal_iter_advance_btree(iter); -+ -+ iter->last = cmp < 0 ? btree : journal; -+ } else if (btree_k.k) { -+ iter->last = btree; -+ } else if (journal_k.k) { -+ iter->last = journal; -+ } else { -+ iter->last = none; -+ return bkey_s_c_null; -+ } -+ -+ ret = iter->last == journal ? 
journal_k : btree_k; -+ -+ if (iter->b && -+ bkey_cmp(ret.k->p, iter->b->data->max_key) > 0) { -+ iter->journal.k = NULL; -+ iter->last = none; -+ return bkey_s_c_null; -+ } -+ -+ if (!bkey_deleted(ret.k)) -+ break; -+ -+ bch2_btree_and_journal_iter_advance(iter); -+ } -+ -+ return ret; -+} -+ -+struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *iter) -+{ -+ bch2_btree_and_journal_iter_advance(iter); -+ -+ return bch2_btree_and_journal_iter_peek(iter); -+} -+ -+void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *iter, -+ struct btree_trans *trans, -+ struct journal_keys *journal_keys, -+ enum btree_id id, struct bpos pos) -+{ -+ memset(iter, 0, sizeof(*iter)); -+ -+ iter->btree = bch2_trans_get_iter(trans, id, pos, 0); -+ bch2_journal_iter_init(&iter->journal, journal_keys, id, 0, pos); -+} -+ -+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, -+ struct journal_keys *journal_keys, -+ struct btree *b) -+{ -+ memset(iter, 0, sizeof(*iter)); -+ -+ iter->b = b; -+ bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b); -+ bch2_journal_iter_init(&iter->journal, journal_keys, -+ b->c.btree_id, b->c.level, b->data->min_key); -+} -+ -+/* Walk btree, overlaying keys from the journal: */ -+ -+static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b, -+ struct journal_keys *journal_keys, -+ enum btree_id btree_id, -+ btree_walk_node_fn node_fn, -+ btree_walk_key_fn key_fn) -+{ -+ struct btree_and_journal_iter iter; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b); -+ -+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { -+ ret = key_fn(c, btree_id, b->c.level, k); -+ if (ret) -+ break; -+ -+ if (b->c.level) { -+ struct btree *child; -+ BKEY_PADDED(k) tmp; -+ -+ bkey_reassemble(&tmp.k, k); -+ k = bkey_i_to_s_c(&tmp.k); -+ -+ bch2_btree_and_journal_iter_advance(&iter); -+ -+ if (b->c.level > 0) { 
-+ child = bch2_btree_node_get_noiter(c, &tmp.k, -+ b->c.btree_id, b->c.level - 1); -+ ret = PTR_ERR_OR_ZERO(child); -+ if (ret) -+ break; -+ -+ ret = (node_fn ? node_fn(c, b) : 0) ?: -+ bch2_btree_and_journal_walk_recurse(c, child, -+ journal_keys, btree_id, node_fn, key_fn); -+ six_unlock_read(&child->c.lock); -+ -+ if (ret) -+ break; -+ } -+ } else { -+ bch2_btree_and_journal_iter_advance(&iter); -+ } -+ } -+ -+ return ret; -+} -+ -+int bch2_btree_and_journal_walk(struct bch_fs *c, struct journal_keys *journal_keys, -+ enum btree_id btree_id, -+ btree_walk_node_fn node_fn, -+ btree_walk_key_fn key_fn) -+{ -+ struct btree *b = c->btree_roots[btree_id].b; -+ int ret = 0; -+ -+ if (btree_node_fake(b)) -+ return 0; -+ -+ six_lock_read(&b->c.lock, NULL, NULL); -+ ret = (node_fn ? node_fn(c, b) : 0) ?: -+ bch2_btree_and_journal_walk_recurse(c, b, journal_keys, btree_id, -+ node_fn, key_fn) ?: -+ key_fn(c, btree_id, b->c.level + 1, bkey_i_to_s_c(&b->key)); -+ six_unlock_read(&b->c.lock); -+ -+ return ret; -+} -+ -+/* sort and dedup all keys in the journal: */ -+ -+void bch2_journal_entries_free(struct list_head *list) -+{ -+ -+ while (!list_empty(list)) { -+ struct journal_replay *i = -+ list_first_entry(list, struct journal_replay, list); -+ list_del(&i->list); -+ kvpfree(i, offsetof(struct journal_replay, j) + -+ vstruct_bytes(&i->j)); -+ } -+} -+ -+/* -+ * When keys compare equal, oldest compares first: -+ */ -+static int journal_sort_key_cmp(const void *_l, const void *_r) -+{ -+ const struct journal_key *l = _l; -+ const struct journal_key *r = _r; -+ -+ return cmp_int(l->btree_id, r->btree_id) ?: -+ cmp_int(l->level, r->level) ?: -+ bkey_cmp(l->k->k.p, r->k->k.p) ?: -+ cmp_int(l->journal_seq, r->journal_seq) ?: -+ cmp_int(l->journal_offset, r->journal_offset); -+} -+ -+void bch2_journal_keys_free(struct journal_keys *keys) -+{ -+ kvfree(keys->d); -+ keys->d = NULL; -+ keys->nr = 0; -+} -+ -+static struct journal_keys journal_keys_sort(struct list_head 
*journal_entries) -+{ -+ struct journal_replay *p; -+ struct jset_entry *entry; -+ struct bkey_i *k, *_n; -+ struct journal_keys keys = { NULL }; -+ struct journal_key *src, *dst; -+ size_t nr_keys = 0; -+ -+ if (list_empty(journal_entries)) -+ return keys; -+ -+ keys.journal_seq_base = -+ le64_to_cpu(list_last_entry(journal_entries, -+ struct journal_replay, list)->j.last_seq); -+ -+ list_for_each_entry(p, journal_entries, list) { -+ if (le64_to_cpu(p->j.seq) < keys.journal_seq_base) -+ continue; -+ -+ for_each_jset_key(k, _n, entry, &p->j) -+ nr_keys++; -+ } -+ -+ -+ keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL); -+ if (!keys.d) -+ goto err; -+ -+ list_for_each_entry(p, journal_entries, list) { -+ if (le64_to_cpu(p->j.seq) < keys.journal_seq_base) -+ continue; -+ -+ for_each_jset_key(k, _n, entry, &p->j) -+ keys.d[keys.nr++] = (struct journal_key) { -+ .btree_id = entry->btree_id, -+ .level = entry->level, -+ .k = k, -+ .journal_seq = le64_to_cpu(p->j.seq) - -+ keys.journal_seq_base, -+ .journal_offset = k->_data - p->j._data, -+ }; -+ } -+ -+ sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_key_cmp, NULL); -+ -+ src = dst = keys.d; -+ while (src < keys.d + keys.nr) { -+ while (src + 1 < keys.d + keys.nr && -+ src[0].btree_id == src[1].btree_id && -+ src[0].level == src[1].level && -+ !bkey_cmp(src[0].k->k.p, src[1].k->k.p)) -+ src++; -+ -+ *dst++ = *src++; -+ } -+ -+ keys.nr = dst - keys.d; -+err: -+ return keys; -+} -+ -+/* journal replay: */ -+ -+static void replay_now_at(struct journal *j, u64 seq) -+{ -+ BUG_ON(seq < j->replay_journal_seq); -+ BUG_ON(seq > j->replay_journal_seq_end); -+ -+ while (j->replay_journal_seq < seq) -+ bch2_journal_pin_put(j, j->replay_journal_seq++); -+} -+ -+static int bch2_extent_replay_key(struct bch_fs *c, enum btree_id btree_id, -+ struct bkey_i *k) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter, *split_iter; -+ /* -+ * We might cause compressed extents to be split, so we need to pass in -+ * a 
disk_reservation: -+ */ -+ struct disk_reservation disk_res = -+ bch2_disk_reservation_init(c, 0); -+ struct bkey_i *split; -+ struct bpos atomic_end; -+ /* -+ * Some extents aren't equivalent - w.r.t. what the triggers do -+ * - if they're split: -+ */ -+ bool remark_if_split = bch2_bkey_sectors_compressed(bkey_i_to_s_c(k)) || -+ k->k.type == KEY_TYPE_reflink_p; -+ bool remark = false; -+ int ret; -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+retry: -+ bch2_trans_begin(&trans); -+ -+ iter = bch2_trans_get_iter(&trans, btree_id, -+ bkey_start_pos(&k->k), -+ BTREE_ITER_INTENT); -+ -+ do { -+ ret = bch2_btree_iter_traverse(iter); -+ if (ret) -+ goto err; -+ -+ atomic_end = bpos_min(k->k.p, iter->l[0].b->key.k.p); -+ -+ split = bch2_trans_kmalloc(&trans, bkey_bytes(&k->k)); -+ ret = PTR_ERR_OR_ZERO(split); -+ if (ret) -+ goto err; -+ -+ if (!remark && -+ remark_if_split && -+ bkey_cmp(atomic_end, k->k.p) < 0) { -+ ret = bch2_disk_reservation_add(c, &disk_res, -+ k->k.size * -+ bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(k)), -+ BCH_DISK_RESERVATION_NOFAIL); -+ BUG_ON(ret); -+ -+ remark = true; -+ } -+ -+ bkey_copy(split, k); -+ bch2_cut_front(iter->pos, split); -+ bch2_cut_back(atomic_end, split); -+ -+ split_iter = bch2_trans_copy_iter(&trans, iter); -+ ret = PTR_ERR_OR_ZERO(split_iter); -+ if (ret) -+ goto err; -+ -+ /* -+ * It's important that we don't go through the -+ * extent_handle_overwrites() and extent_update_to_keys() path -+ * here: journal replay is supposed to treat extents like -+ * regular keys -+ */ -+ __bch2_btree_iter_set_pos(split_iter, split->k.p, false); -+ bch2_trans_update(&trans, split_iter, split, -+ BTREE_TRIGGER_NORUN); -+ -+ bch2_btree_iter_set_pos(iter, split->k.p); -+ -+ if (remark) { -+ ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(split), -+ 0, split->k.size, -+ BTREE_TRIGGER_INSERT); -+ if (ret) -+ goto err; -+ } -+ } while (bkey_cmp(iter->pos, k->k.p) < 0); -+ -+ if (remark) { -+ ret = bch2_trans_mark_key(&trans, 
bkey_i_to_s_c(k), -+ 0, -((s64) k->k.size), -+ BTREE_TRIGGER_OVERWRITE); -+ if (ret) -+ goto err; -+ } -+ -+ ret = bch2_trans_commit(&trans, &disk_res, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW| -+ BTREE_INSERT_JOURNAL_REPLAY); -+err: -+ if (ret == -EINTR) -+ goto retry; -+ -+ bch2_disk_reservation_put(c, &disk_res); -+ -+ return bch2_trans_exit(&trans) ?: ret; -+} -+ -+static int __bch2_journal_replay_key(struct btree_trans *trans, -+ enum btree_id id, unsigned level, -+ struct bkey_i *k) -+{ -+ struct btree_iter *iter; -+ int ret; -+ -+ iter = bch2_trans_get_node_iter(trans, id, k->k.p, -+ BTREE_MAX_DEPTH, level, -+ BTREE_ITER_INTENT); -+ if (IS_ERR(iter)) -+ return PTR_ERR(iter); -+ -+ /* -+ * iter->flags & BTREE_ITER_IS_EXTENTS triggers the update path to run -+ * extent_handle_overwrites() and extent_update_to_keys() - but we don't -+ * want that here, journal replay is supposed to treat extents like -+ * regular keys: -+ */ -+ __bch2_btree_iter_set_pos(iter, k->k.p, false); -+ -+ ret = bch2_btree_iter_traverse(iter) ?: -+ bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id id, -+ unsigned level, struct bkey_i *k) -+{ -+ return bch2_trans_do(c, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW| -+ BTREE_INSERT_JOURNAL_REPLAY, -+ __bch2_journal_replay_key(&trans, id, level, k)); -+} -+ -+static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k) -+{ -+ struct btree_iter *iter; -+ int ret; -+ -+ iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, k->k.p, -+ BTREE_ITER_CACHED| -+ BTREE_ITER_CACHED_NOFILL| -+ BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(iter) ?: -+ bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+static int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k) -+{ -+ return bch2_trans_do(c, NULL, NULL, -+ 
BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_USE_RESERVE| -+ BTREE_INSERT_LAZY_RW| -+ BTREE_INSERT_JOURNAL_REPLAY, -+ __bch2_alloc_replay_key(&trans, k)); -+} -+ -+static int journal_sort_seq_cmp(const void *_l, const void *_r) -+{ -+ const struct journal_key *l = _l; -+ const struct journal_key *r = _r; -+ -+ return cmp_int(r->level, l->level) ?: -+ cmp_int(l->journal_seq, r->journal_seq) ?: -+ cmp_int(l->btree_id, r->btree_id) ?: -+ bkey_cmp(l->k->k.p, r->k->k.p); -+} -+ -+static int bch2_journal_replay(struct bch_fs *c, -+ struct journal_keys keys) -+{ -+ struct journal *j = &c->journal; -+ struct journal_key *i; -+ u64 seq; -+ int ret; -+ -+ sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL); -+ -+ if (keys.nr) -+ replay_now_at(j, keys.journal_seq_base); -+ -+ seq = j->replay_journal_seq; -+ -+ /* -+ * First replay updates to the alloc btree - these will only update the -+ * btree key cache: -+ */ -+ for_each_journal_key(keys, i) { -+ cond_resched(); -+ -+ if (!i->level && i->btree_id == BTREE_ID_ALLOC) { -+ j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; -+ ret = bch2_alloc_replay_key(c, i->k); -+ if (ret) -+ goto err; -+ } -+ } -+ -+ /* -+ * Next replay updates to interior btree nodes: -+ */ -+ for_each_journal_key(keys, i) { -+ cond_resched(); -+ -+ if (i->level) { -+ j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; -+ ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k); -+ if (ret) -+ goto err; -+ } -+ } -+ -+ /* -+ * Now that the btree is in a consistent state, we can start journal -+ * reclaim (which will be flushing entries from the btree key cache back -+ * to the btree: -+ */ -+ set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); -+ set_bit(JOURNAL_RECLAIM_STARTED, &j->flags); -+ -+ j->replay_journal_seq = seq; -+ -+ /* -+ * Now replay leaf node updates: -+ */ -+ for_each_journal_key(keys, i) { -+ cond_resched(); -+ -+ if (i->level || i->btree_id == BTREE_ID_ALLOC) -+ continue; -+ -+ 
replay_now_at(j, keys.journal_seq_base + i->journal_seq); -+ -+ ret = i->k->k.size -+ ? bch2_extent_replay_key(c, i->btree_id, i->k) -+ : bch2_journal_replay_key(c, i->btree_id, i->level, i->k); -+ if (ret) -+ goto err; -+ } -+ -+ replay_now_at(j, j->replay_journal_seq_end); -+ j->replay_journal_seq = 0; -+ -+ bch2_journal_set_replay_done(j); -+ bch2_journal_flush_all_pins(j); -+ return bch2_journal_error(j); -+err: -+ bch_err(c, "journal replay: error %d while replaying key", ret); -+ return ret; -+} -+ -+static bool journal_empty(struct list_head *journal) -+{ -+ return list_empty(journal) || -+ journal_entry_empty(&list_last_entry(journal, -+ struct journal_replay, list)->j); -+} -+ -+static int -+verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c, -+ struct list_head *journal) -+{ -+ struct journal_replay *i = -+ list_last_entry(journal, struct journal_replay, list); -+ u64 start_seq = le64_to_cpu(i->j.last_seq); -+ u64 end_seq = le64_to_cpu(i->j.seq); -+ u64 seq = start_seq; -+ int ret = 0; -+ -+ list_for_each_entry(i, journal, list) { -+ if (le64_to_cpu(i->j.seq) < start_seq) -+ continue; -+ -+ fsck_err_on(seq != le64_to_cpu(i->j.seq), c, -+ "journal entries %llu-%llu missing! 
(replaying %llu-%llu)", -+ seq, le64_to_cpu(i->j.seq) - 1, -+ start_seq, end_seq); -+ -+ seq = le64_to_cpu(i->j.seq); -+ -+ fsck_err_on(bch2_journal_seq_is_blacklisted(c, seq, false), c, -+ "found blacklisted journal entry %llu", seq); -+ -+ do { -+ seq++; -+ } while (bch2_journal_seq_is_blacklisted(c, seq, false)); -+ } -+fsck_err: -+ return ret; -+} -+ -+/* journal replay early: */ -+ -+static int journal_replay_entry_early(struct bch_fs *c, -+ struct jset_entry *entry) -+{ -+ int ret = 0; -+ -+ switch (entry->type) { -+ case BCH_JSET_ENTRY_btree_root: { -+ struct btree_root *r; -+ -+ if (entry->btree_id >= BTREE_ID_NR) { -+ bch_err(c, "filesystem has unknown btree type %u", -+ entry->btree_id); -+ return -EINVAL; -+ } -+ -+ r = &c->btree_roots[entry->btree_id]; -+ -+ if (entry->u64s) { -+ r->level = entry->level; -+ bkey_copy(&r->key, &entry->start[0]); -+ r->error = 0; -+ } else { -+ r->error = -EIO; -+ } -+ r->alive = true; -+ break; -+ } -+ case BCH_JSET_ENTRY_usage: { -+ struct jset_entry_usage *u = -+ container_of(entry, struct jset_entry_usage, entry); -+ -+ switch (entry->btree_id) { -+ case FS_USAGE_RESERVED: -+ if (entry->level < BCH_REPLICAS_MAX) -+ c->usage_base->persistent_reserved[entry->level] = -+ le64_to_cpu(u->v); -+ break; -+ case FS_USAGE_INODES: -+ c->usage_base->nr_inodes = le64_to_cpu(u->v); -+ break; -+ case FS_USAGE_KEY_VERSION: -+ atomic64_set(&c->key_version, -+ le64_to_cpu(u->v)); -+ break; -+ } -+ -+ break; -+ } -+ case BCH_JSET_ENTRY_data_usage: { -+ struct jset_entry_data_usage *u = -+ container_of(entry, struct jset_entry_data_usage, entry); -+ ret = bch2_replicas_set_usage(c, &u->r, -+ le64_to_cpu(u->v)); -+ break; -+ } -+ case BCH_JSET_ENTRY_blacklist: { -+ struct jset_entry_blacklist *bl_entry = -+ container_of(entry, struct jset_entry_blacklist, entry); -+ -+ ret = bch2_journal_seq_blacklist_add(c, -+ le64_to_cpu(bl_entry->seq), -+ le64_to_cpu(bl_entry->seq) + 1); -+ break; -+ } -+ case BCH_JSET_ENTRY_blacklist_v2: { -+ struct 
jset_entry_blacklist_v2 *bl_entry = -+ container_of(entry, struct jset_entry_blacklist_v2, entry); -+ -+ ret = bch2_journal_seq_blacklist_add(c, -+ le64_to_cpu(bl_entry->start), -+ le64_to_cpu(bl_entry->end) + 1); -+ break; -+ } -+ } -+ -+ return ret; -+} -+ -+static int journal_replay_early(struct bch_fs *c, -+ struct bch_sb_field_clean *clean, -+ struct list_head *journal) -+{ -+ struct jset_entry *entry; -+ int ret; -+ -+ if (clean) { -+ c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock); -+ c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock); -+ -+ for (entry = clean->start; -+ entry != vstruct_end(&clean->field); -+ entry = vstruct_next(entry)) { -+ ret = journal_replay_entry_early(c, entry); -+ if (ret) -+ return ret; -+ } -+ } else { -+ struct journal_replay *i = -+ list_last_entry(journal, struct journal_replay, list); -+ -+ c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock); -+ c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock); -+ -+ list_for_each_entry(i, journal, list) -+ vstruct_for_each(&i->j, entry) { -+ ret = journal_replay_entry_early(c, entry); -+ if (ret) -+ return ret; -+ } -+ } -+ -+ bch2_fs_usage_initialize(c); -+ -+ return 0; -+} -+ -+/* sb clean section: */ -+ -+static struct bkey_i *btree_root_find(struct bch_fs *c, -+ struct bch_sb_field_clean *clean, -+ struct jset *j, -+ enum btree_id id, unsigned *level) -+{ -+ struct bkey_i *k; -+ struct jset_entry *entry, *start, *end; -+ -+ if (clean) { -+ start = clean->start; -+ end = vstruct_end(&clean->field); -+ } else { -+ start = j->start; -+ end = vstruct_last(j); -+ } -+ -+ for (entry = start; entry < end; entry = vstruct_next(entry)) -+ if (entry->type == BCH_JSET_ENTRY_btree_root && -+ entry->btree_id == id) -+ goto found; -+ -+ return NULL; -+found: -+ if (!entry->u64s) -+ return ERR_PTR(-EINVAL); -+ -+ k = entry->start; -+ *level = entry->level; -+ return k; -+} -+ -+static int verify_superblock_clean(struct bch_fs *c, -+ struct bch_sb_field_clean 
**cleanp, -+ struct jset *j) -+{ -+ unsigned i; -+ struct bch_sb_field_clean *clean = *cleanp; -+ int ret = 0; -+ -+ if (!c->sb.clean || !j) -+ return 0; -+ -+ if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, -+ "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", -+ le64_to_cpu(clean->journal_seq), -+ le64_to_cpu(j->seq))) { -+ kfree(clean); -+ *cleanp = NULL; -+ return 0; -+ } -+ -+ mustfix_fsck_err_on(j->read_clock != clean->read_clock, c, -+ "superblock read clock doesn't match journal after clean shutdown"); -+ mustfix_fsck_err_on(j->write_clock != clean->write_clock, c, -+ "superblock read clock doesn't match journal after clean shutdown"); -+ -+ for (i = 0; i < BTREE_ID_NR; i++) { -+ char buf1[200], buf2[200]; -+ struct bkey_i *k1, *k2; -+ unsigned l1 = 0, l2 = 0; -+ -+ k1 = btree_root_find(c, clean, NULL, i, &l1); -+ k2 = btree_root_find(c, NULL, j, i, &l2); -+ -+ if (!k1 && !k2) -+ continue; -+ -+ mustfix_fsck_err_on(!k1 || !k2 || -+ IS_ERR(k1) || -+ IS_ERR(k2) || -+ k1->k.u64s != k2->k.u64s || -+ memcmp(k1, k2, bkey_bytes(k1)) || -+ l1 != l2, c, -+ "superblock btree root %u doesn't match journal after clean shutdown\n" -+ "sb: l=%u %s\n" -+ "journal: l=%u %s\n", i, -+ l1, (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(k1)), buf1), -+ l2, (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(k2)), buf2)); -+ } -+fsck_err: -+ return ret; -+} -+ -+static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c) -+{ -+ struct bch_sb_field_clean *clean, *sb_clean; -+ int ret; -+ -+ mutex_lock(&c->sb_lock); -+ sb_clean = bch2_sb_get_clean(c->disk_sb.sb); -+ -+ if (fsck_err_on(!sb_clean, c, -+ "superblock marked clean but clean section not present")) { -+ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); -+ c->sb.clean = false; -+ mutex_unlock(&c->sb_lock); -+ return NULL; -+ } -+ -+ clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), -+ GFP_KERNEL); -+ if (!clean) { -+ mutex_unlock(&c->sb_lock); -+ return 
ERR_PTR(-ENOMEM); -+ } -+ -+ if (le16_to_cpu(c->disk_sb.sb->version) < -+ bcachefs_metadata_version_bkey_renumber) -+ bch2_sb_clean_renumber(clean, READ); -+ -+ mutex_unlock(&c->sb_lock); -+ -+ return clean; -+fsck_err: -+ mutex_unlock(&c->sb_lock); -+ return ERR_PTR(ret); -+} -+ -+static int read_btree_roots(struct bch_fs *c) -+{ -+ unsigned i; -+ int ret = 0; -+ -+ for (i = 0; i < BTREE_ID_NR; i++) { -+ struct btree_root *r = &c->btree_roots[i]; -+ -+ if (!r->alive) -+ continue; -+ -+ if (i == BTREE_ID_ALLOC && -+ c->opts.reconstruct_alloc) { -+ c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); -+ continue; -+ } -+ -+ -+ if (r->error) { -+ __fsck_err(c, i == BTREE_ID_ALLOC -+ ? FSCK_CAN_IGNORE : 0, -+ "invalid btree root %s", -+ bch2_btree_ids[i]); -+ if (i == BTREE_ID_ALLOC) -+ c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); -+ } -+ -+ ret = bch2_btree_root_read(c, i, &r->key, r->level); -+ if (ret) { -+ __fsck_err(c, i == BTREE_ID_ALLOC -+ ? FSCK_CAN_IGNORE : 0, -+ "error reading btree root %s", -+ bch2_btree_ids[i]); -+ if (i == BTREE_ID_ALLOC) -+ c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); -+ } -+ } -+ -+ for (i = 0; i < BTREE_ID_NR; i++) -+ if (!c->btree_roots[i].b) -+ bch2_btree_root_alloc(c, i); -+fsck_err: -+ return ret; -+} -+ -+int bch2_fs_recovery(struct bch_fs *c) -+{ -+ const char *err = "cannot allocate memory"; -+ struct bch_sb_field_clean *clean = NULL; -+ u64 journal_seq; -+ bool wrote = false, write_sb = false; -+ int ret; -+ -+ if (c->sb.clean) -+ clean = read_superblock_clean(c); -+ ret = PTR_ERR_OR_ZERO(clean); -+ if (ret) -+ goto err; -+ -+ if (c->sb.clean) -+ bch_info(c, "recovering from clean shutdown, journal seq %llu", -+ le64_to_cpu(clean->journal_seq)); -+ -+ if (!c->replicas.entries || -+ c->opts.rebuild_replicas) { -+ bch_info(c, "building replicas info"); -+ set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); -+ } -+ -+ if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) { -+ struct jset *j; -+ -+ ret = 
bch2_journal_read(c, &c->journal_entries); -+ if (ret) -+ goto err; -+ -+ if (mustfix_fsck_err_on(c->sb.clean && !journal_empty(&c->journal_entries), c, -+ "filesystem marked clean but journal not empty")) { -+ c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); -+ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); -+ c->sb.clean = false; -+ } -+ -+ if (!c->sb.clean && list_empty(&c->journal_entries)) { -+ bch_err(c, "no journal entries found"); -+ ret = BCH_FSCK_REPAIR_IMPOSSIBLE; -+ goto err; -+ } -+ -+ c->journal_keys = journal_keys_sort(&c->journal_entries); -+ if (!c->journal_keys.d) { -+ ret = -ENOMEM; -+ goto err; -+ } -+ -+ j = &list_last_entry(&c->journal_entries, -+ struct journal_replay, list)->j; -+ -+ ret = verify_superblock_clean(c, &clean, j); -+ if (ret) -+ goto err; -+ -+ journal_seq = le64_to_cpu(j->seq) + 1; -+ } else { -+ journal_seq = le64_to_cpu(clean->journal_seq) + 1; -+ } -+ -+ if (!c->sb.clean && -+ !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) { -+ bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix"); -+ ret = -EINVAL; -+ goto err; -+ } -+ -+ ret = journal_replay_early(c, clean, &c->journal_entries); -+ if (ret) -+ goto err; -+ -+ if (!c->sb.clean) { -+ ret = bch2_journal_seq_blacklist_add(c, -+ journal_seq, -+ journal_seq + 4); -+ if (ret) { -+ bch_err(c, "error creating new journal seq blacklist entry"); -+ goto err; -+ } -+ -+ journal_seq += 4; -+ -+ /* -+ * The superblock needs to be written before we do any btree -+ * node writes: it will be in the read_write() path -+ */ -+ } -+ -+ ret = bch2_blacklist_table_initialize(c); -+ -+ if (!list_empty(&c->journal_entries)) { -+ ret = verify_journal_entries_not_blacklisted_or_missing(c, -+ &c->journal_entries); -+ if (ret) -+ goto err; -+ } -+ -+ ret = bch2_fs_journal_start(&c->journal, journal_seq, -+ &c->journal_entries); -+ if (ret) -+ goto err; -+ -+ ret = read_btree_roots(c); -+ if (ret) -+ goto err; -+ -+ bch_verbose(c, 
"starting alloc read"); -+ err = "error reading allocation information"; -+ ret = bch2_alloc_read(c, &c->journal_keys); -+ if (ret) -+ goto err; -+ bch_verbose(c, "alloc read done"); -+ -+ bch_verbose(c, "starting stripes_read"); -+ err = "error reading stripes"; -+ ret = bch2_stripes_read(c, &c->journal_keys); -+ if (ret) -+ goto err; -+ bch_verbose(c, "stripes_read done"); -+ -+ set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); -+ -+ if ((c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) && -+ !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA))) { -+ /* -+ * interior btree node updates aren't consistent with the -+ * journal; after an unclean shutdown we have to walk all -+ * pointers to metadata: -+ */ -+ bch_info(c, "starting metadata mark and sweep"); -+ err = "error in mark and sweep"; -+ ret = bch2_gc(c, &c->journal_keys, true, true); -+ if (ret) -+ goto err; -+ bch_verbose(c, "mark and sweep done"); -+ } -+ -+ if (c->opts.fsck || -+ !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) || -+ test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { -+ bch_info(c, "starting mark and sweep"); -+ err = "error in mark and sweep"; -+ ret = bch2_gc(c, &c->journal_keys, true, false); -+ if (ret) -+ goto err; -+ bch_verbose(c, "mark and sweep done"); -+ } -+ -+ clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); -+ set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); -+ -+ /* -+ * Skip past versions that might have possibly been used (as nonces), -+ * but hadn't had their pointers written: -+ */ -+ if (c->sb.encryption_type && !c->sb.clean) -+ atomic64_add(1 << 16, &c->key_version); -+ -+ if (c->opts.norecovery) -+ goto out; -+ -+ bch_verbose(c, "starting journal replay"); -+ err = "journal replay failed"; -+ ret = bch2_journal_replay(c, c->journal_keys); -+ if (ret) -+ goto err; -+ bch_verbose(c, "journal replay done"); -+ -+ if (!c->opts.nochanges) { -+ /* -+ * note that even when filesystem was clean there might be work -+ * to do here, if we ran gc (because of fsck) which 
recalculated -+ * oldest_gen: -+ */ -+ bch_verbose(c, "writing allocation info"); -+ err = "error writing out alloc info"; -+ ret = bch2_stripes_write(c, BTREE_INSERT_LAZY_RW, &wrote) ?: -+ bch2_alloc_write(c, BTREE_INSERT_LAZY_RW, &wrote); -+ if (ret) { -+ bch_err(c, "error writing alloc info"); -+ goto err; -+ } -+ bch_verbose(c, "alloc write done"); -+ -+ set_bit(BCH_FS_ALLOC_WRITTEN, &c->flags); -+ } -+ -+ if (!c->sb.clean) { -+ if (!(c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { -+ bch_info(c, "checking inode link counts"); -+ err = "error in recovery"; -+ ret = bch2_fsck_inode_nlink(c); -+ if (ret) -+ goto err; -+ bch_verbose(c, "check inodes done"); -+ -+ } else { -+ bch_verbose(c, "checking for deleted inodes"); -+ err = "error in recovery"; -+ ret = bch2_fsck_walk_inodes_only(c); -+ if (ret) -+ goto err; -+ bch_verbose(c, "check inodes done"); -+ } -+ } -+ -+ if (c->opts.fsck) { -+ bch_info(c, "starting fsck"); -+ err = "error in fsck"; -+ ret = bch2_fsck_full(c); -+ if (ret) -+ goto err; -+ bch_verbose(c, "fsck done"); -+ } -+ -+ if (enabled_qtypes(c)) { -+ bch_verbose(c, "reading quotas"); -+ ret = bch2_fs_quota_read(c); -+ if (ret) -+ goto err; -+ bch_verbose(c, "quotas done"); -+ } -+ -+ mutex_lock(&c->sb_lock); -+ if (c->opts.version_upgrade) { -+ if (c->sb.version < bcachefs_metadata_version_new_versioning) -+ c->disk_sb.sb->version_min = -+ le16_to_cpu(bcachefs_metadata_version_min); -+ c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current); -+ c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; -+ write_sb = true; -+ } -+ -+ if (!test_bit(BCH_FS_ERROR, &c->flags)) { -+ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; -+ write_sb = true; -+ } -+ -+ if (c->opts.fsck && -+ !test_bit(BCH_FS_ERROR, &c->flags)) { -+ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; -+ SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0); -+ write_sb = true; -+ } -+ -+ if (write_sb) -+ bch2_write_super(c); -+ 
mutex_unlock(&c->sb_lock); -+ -+ if (c->journal_seq_blacklist_table && -+ c->journal_seq_blacklist_table->nr > 128) -+ queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work); -+out: -+ ret = 0; -+err: -+fsck_err: -+ set_bit(BCH_FS_FSCK_DONE, &c->flags); -+ bch2_flush_fsck_errs(c); -+ -+ if (!c->opts.keep_journal) { -+ bch2_journal_keys_free(&c->journal_keys); -+ bch2_journal_entries_free(&c->journal_entries); -+ } -+ kfree(clean); -+ if (ret) -+ bch_err(c, "Error in recovery: %s (%i)", err, ret); -+ else -+ bch_verbose(c, "ret %i", ret); -+ return ret; -+} -+ -+int bch2_fs_initialize(struct bch_fs *c) -+{ -+ struct bch_inode_unpacked root_inode, lostfound_inode; -+ struct bkey_inode_buf packed_inode; -+ struct qstr lostfound = QSTR("lost+found"); -+ const char *err = "cannot allocate memory"; -+ struct bch_dev *ca; -+ LIST_HEAD(journal); -+ unsigned i; -+ int ret; -+ -+ bch_notice(c, "initializing new filesystem"); -+ -+ mutex_lock(&c->sb_lock); -+ for_each_online_member(ca, c, i) -+ bch2_mark_dev_superblock(c, ca, 0); -+ mutex_unlock(&c->sb_lock); -+ -+ mutex_lock(&c->sb_lock); -+ c->disk_sb.sb->version = c->disk_sb.sb->version_min = -+ le16_to_cpu(bcachefs_metadata_version_current); -+ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; -+ c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; -+ -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); -+ set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); -+ -+ for (i = 0; i < BTREE_ID_NR; i++) -+ bch2_btree_root_alloc(c, i); -+ -+ set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); -+ set_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags); -+ -+ err = "unable to allocate journal buckets"; -+ for_each_online_member(ca, c, i) { -+ ret = bch2_dev_journal_alloc(ca); -+ if (ret) { -+ percpu_ref_put(&ca->io_ref); -+ goto err; -+ } -+ } -+ -+ /* -+ * journal_res_get() will crash if called before this has -+ * set up the journal.pin FIFO and journal.cur pointer: -+ 
*/ -+ bch2_fs_journal_start(&c->journal, 1, &journal); -+ bch2_journal_set_replay_done(&c->journal); -+ -+ bch2_inode_init(c, &root_inode, 0, 0, -+ S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); -+ root_inode.bi_inum = BCACHEFS_ROOT_INO; -+ bch2_inode_pack(&packed_inode, &root_inode); -+ -+ err = "error creating root directory"; -+ ret = bch2_btree_insert(c, BTREE_ID_INODES, -+ &packed_inode.inode.k_i, -+ NULL, NULL, BTREE_INSERT_LAZY_RW); -+ if (ret) -+ goto err; -+ -+ bch2_inode_init_early(c, &lostfound_inode); -+ -+ err = "error creating lost+found"; -+ ret = bch2_trans_do(c, NULL, NULL, 0, -+ bch2_create_trans(&trans, BCACHEFS_ROOT_INO, -+ &root_inode, &lostfound_inode, -+ &lostfound, -+ 0, 0, S_IFDIR|0700, 0, -+ NULL, NULL)); -+ if (ret) -+ goto err; -+ -+ if (enabled_qtypes(c)) { -+ ret = bch2_fs_quota_read(c); -+ if (ret) -+ goto err; -+ } -+ -+ err = "error writing first journal entry"; -+ ret = bch2_journal_meta(&c->journal); -+ if (ret) -+ goto err; -+ -+ mutex_lock(&c->sb_lock); -+ SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); -+ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); -+ -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ return 0; -+err: -+ pr_err("Error initializing new filesystem: %s (%i)", err, ret); -+ return ret; -+} -diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h -new file mode 100644 -index 000000000000..a66827c9addf ---- /dev/null -+++ b/fs/bcachefs/recovery.h -@@ -0,0 +1,60 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_RECOVERY_H -+#define _BCACHEFS_RECOVERY_H -+ -+#define for_each_journal_key(keys, i) \ -+ for (i = (keys).d; i < (keys).d + (keys).nr; (i)++) -+ -+struct journal_iter { -+ enum btree_id btree_id; -+ unsigned level; -+ struct journal_keys *keys; -+ struct journal_key *k; -+}; -+ -+/* -+ * Iterate over keys in the btree, with keys from the journal overlaid on top: -+ */ -+ -+struct btree_and_journal_iter { -+ struct btree_iter *btree; -+ -+ struct btree *b; -+ struct btree_node_iter node_iter; -+ 
struct bkey unpacked; -+ -+ struct journal_iter journal; -+ -+ enum last_key_returned { -+ none, -+ btree, -+ journal, -+ } last; -+}; -+ -+void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); -+struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); -+struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *); -+ -+void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *, -+ struct btree_trans *, -+ struct journal_keys *, -+ enum btree_id, struct bpos); -+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, -+ struct journal_keys *, -+ struct btree *); -+ -+typedef int (*btree_walk_node_fn)(struct bch_fs *c, struct btree *b); -+typedef int (*btree_walk_key_fn)(struct bch_fs *c, enum btree_id id, -+ unsigned level, struct bkey_s_c k); -+ -+int bch2_btree_and_journal_walk(struct bch_fs *, struct journal_keys *, enum btree_id, -+ btree_walk_node_fn, btree_walk_key_fn); -+ -+void bch2_journal_keys_free(struct journal_keys *); -+void bch2_journal_entries_free(struct list_head *); -+ -+int bch2_fs_recovery(struct bch_fs *); -+int bch2_fs_initialize(struct bch_fs *); -+ -+#endif /* _BCACHEFS_RECOVERY_H */ -diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c -new file mode 100644 -index 000000000000..3c473f1380a6 ---- /dev/null -+++ b/fs/bcachefs/reflink.c -@@ -0,0 +1,303 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "bkey_on_stack.h" -+#include "btree_update.h" -+#include "extents.h" -+#include "inode.h" -+#include "io.h" -+#include "reflink.h" -+ -+#include -+ -+/* reflink pointers */ -+ -+const char *bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); -+ -+ if (bkey_val_bytes(p.k) != sizeof(*p.v)) -+ return "incorrect value size"; -+ -+ return NULL; -+} -+ -+void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) 
-+{ -+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); -+ -+ pr_buf(out, "idx %llu", le64_to_cpu(p.v->idx)); -+} -+ -+enum merge_result bch2_reflink_p_merge(struct bch_fs *c, -+ struct bkey_s _l, struct bkey_s _r) -+{ -+ struct bkey_s_reflink_p l = bkey_s_to_reflink_p(_l); -+ struct bkey_s_reflink_p r = bkey_s_to_reflink_p(_r); -+ -+ if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx)) -+ return BCH_MERGE_NOMERGE; -+ -+ if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { -+ bch2_key_resize(l.k, KEY_SIZE_MAX); -+ bch2_cut_front_s(l.k->p, _r); -+ return BCH_MERGE_PARTIAL; -+ } -+ -+ bch2_key_resize(l.k, l.k->size + r.k->size); -+ -+ return BCH_MERGE_MERGE; -+} -+ -+/* indirect extents */ -+ -+const char *bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); -+ -+ if (bkey_val_bytes(r.k) < sizeof(*r.v)) -+ return "incorrect value size"; -+ -+ return bch2_bkey_ptrs_invalid(c, k); -+} -+ -+void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); -+ -+ pr_buf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount)); -+ -+ bch2_bkey_ptrs_to_text(out, c, k); -+} -+ -+static int bch2_make_extent_indirect(struct btree_trans *trans, -+ struct btree_iter *extent_iter, -+ struct bkey_i_extent *e) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter *reflink_iter; -+ struct bkey_s_c k; -+ struct bkey_i_reflink_v *r_v; -+ struct bkey_i_reflink_p *r_p; -+ int ret; -+ -+ for_each_btree_key(trans, reflink_iter, BTREE_ID_REFLINK, -+ POS(0, c->reflink_hint), -+ BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) { -+ if (reflink_iter->pos.inode) { -+ bch2_btree_iter_set_pos(reflink_iter, POS_MIN); -+ continue; -+ } -+ -+ if (bkey_deleted(k.k) && e->k.size <= k.k->size) -+ break; -+ } -+ -+ if (ret) -+ goto err; -+ -+ /* rewind iter to start of hole, if necessary: */ -+ bch2_btree_iter_set_pos(reflink_iter, 
bkey_start_pos(k.k)); -+ -+ r_v = bch2_trans_kmalloc(trans, sizeof(*r_v) + bkey_val_bytes(&e->k)); -+ ret = PTR_ERR_OR_ZERO(r_v); -+ if (ret) -+ goto err; -+ -+ bkey_reflink_v_init(&r_v->k_i); -+ r_v->k.p = reflink_iter->pos; -+ bch2_key_resize(&r_v->k, e->k.size); -+ r_v->k.version = e->k.version; -+ -+ set_bkey_val_u64s(&r_v->k, bkey_val_u64s(&r_v->k) + -+ bkey_val_u64s(&e->k)); -+ r_v->v.refcount = 0; -+ memcpy(r_v->v.start, e->v.start, bkey_val_bytes(&e->k)); -+ -+ bch2_trans_update(trans, reflink_iter, &r_v->k_i, 0); -+ -+ r_p = bch2_trans_kmalloc(trans, sizeof(*r_p)); -+ if (IS_ERR(r_p)) -+ return PTR_ERR(r_p); -+ -+ e->k.type = KEY_TYPE_reflink_p; -+ r_p = bkey_i_to_reflink_p(&e->k_i); -+ set_bkey_val_bytes(&r_p->k, sizeof(r_p->v)); -+ r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); -+ -+ bch2_trans_update(trans, extent_iter, &r_p->k_i, 0); -+err: -+ if (!IS_ERR(reflink_iter)) -+ c->reflink_hint = reflink_iter->pos.offset; -+ bch2_trans_iter_put(trans, reflink_iter); -+ -+ return ret; -+} -+ -+static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) -+{ -+ struct bkey_s_c k = bch2_btree_iter_peek(iter); -+ int ret; -+ -+ for_each_btree_key_continue(iter, 0, k, ret) { -+ if (bkey_cmp(iter->pos, end) >= 0) -+ return bkey_s_c_null; -+ -+ if (k.k->type == KEY_TYPE_extent || -+ k.k->type == KEY_TYPE_reflink_p) -+ break; -+ } -+ -+ return k; -+} -+ -+s64 bch2_remap_range(struct bch_fs *c, -+ struct bpos dst_start, struct bpos src_start, -+ u64 remap_sectors, u64 *journal_seq, -+ u64 new_i_size, s64 *i_sectors_delta) -+{ -+ struct btree_trans trans; -+ struct btree_iter *dst_iter, *src_iter; -+ struct bkey_s_c src_k; -+ BKEY_PADDED(k) new_dst; -+ struct bkey_on_stack new_src; -+ struct bpos dst_end = dst_start, src_end = src_start; -+ struct bpos dst_want, src_want; -+ u64 src_done, dst_done; -+ int ret = 0, ret2 = 0; -+ -+ if (!c->opts.reflink) -+ return -EOPNOTSUPP; -+ -+ if (!percpu_ref_tryget(&c->writes)) -+ return -EROFS; -+ -+ 
bch2_check_set_feature(c, BCH_FEATURE_reflink); -+ -+ dst_end.offset += remap_sectors; -+ src_end.offset += remap_sectors; -+ -+ bkey_on_stack_init(&new_src); -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); -+ -+ src_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start, -+ BTREE_ITER_INTENT); -+ dst_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, dst_start, -+ BTREE_ITER_INTENT); -+ -+ while (1) { -+ bch2_trans_begin(&trans); -+ -+ trans.mem_top = 0; -+ -+ if (fatal_signal_pending(current)) { -+ ret = -EINTR; -+ goto err; -+ } -+ -+ src_k = get_next_src(src_iter, src_end); -+ ret = bkey_err(src_k); -+ if (ret) -+ goto btree_err; -+ -+ src_done = bpos_min(src_iter->pos, src_end).offset - -+ src_start.offset; -+ dst_want = POS(dst_start.inode, dst_start.offset + src_done); -+ -+ if (bkey_cmp(dst_iter->pos, dst_want) < 0) { -+ ret = bch2_fpunch_at(&trans, dst_iter, dst_want, -+ journal_seq, i_sectors_delta); -+ if (ret) -+ goto btree_err; -+ continue; -+ } -+ -+ BUG_ON(bkey_cmp(dst_iter->pos, dst_want)); -+ -+ if (!bkey_cmp(dst_iter->pos, dst_end)) -+ break; -+ -+ if (src_k.k->type == KEY_TYPE_extent) { -+ bkey_on_stack_reassemble(&new_src, c, src_k); -+ src_k = bkey_i_to_s_c(new_src.k); -+ -+ bch2_cut_front(src_iter->pos, new_src.k); -+ bch2_cut_back(src_end, new_src.k); -+ -+ ret = bch2_make_extent_indirect(&trans, src_iter, -+ bkey_i_to_extent(new_src.k)); -+ if (ret) -+ goto btree_err; -+ -+ BUG_ON(src_k.k->type != KEY_TYPE_reflink_p); -+ } -+ -+ if (src_k.k->type == KEY_TYPE_reflink_p) { -+ struct bkey_s_c_reflink_p src_p = -+ bkey_s_c_to_reflink_p(src_k); -+ struct bkey_i_reflink_p *dst_p = -+ bkey_reflink_p_init(&new_dst.k); -+ -+ u64 offset = le64_to_cpu(src_p.v->idx) + -+ (src_iter->pos.offset - -+ bkey_start_offset(src_k.k)); -+ -+ dst_p->v.idx = cpu_to_le64(offset); -+ } else { -+ BUG(); -+ } -+ -+ new_dst.k.k.p = dst_iter->pos; -+ bch2_key_resize(&new_dst.k.k, -+ min(src_k.k->p.offset - src_iter->pos.offset, -+ dst_end.offset - 
dst_iter->pos.offset)); -+ -+ ret = bch2_extent_update(&trans, dst_iter, &new_dst.k, -+ NULL, journal_seq, -+ new_i_size, i_sectors_delta); -+ if (ret) -+ goto btree_err; -+ -+ dst_done = dst_iter->pos.offset - dst_start.offset; -+ src_want = POS(src_start.inode, src_start.offset + dst_done); -+ bch2_btree_iter_set_pos(src_iter, src_want); -+btree_err: -+ if (ret == -EINTR) -+ ret = 0; -+ if (ret) -+ goto err; -+ } -+ -+ BUG_ON(bkey_cmp(dst_iter->pos, dst_end)); -+err: -+ BUG_ON(bkey_cmp(dst_iter->pos, dst_end) > 0); -+ -+ dst_done = dst_iter->pos.offset - dst_start.offset; -+ new_i_size = min(dst_iter->pos.offset << 9, new_i_size); -+ -+ bch2_trans_begin(&trans); -+ -+ do { -+ struct bch_inode_unpacked inode_u; -+ struct btree_iter *inode_iter; -+ -+ inode_iter = bch2_inode_peek(&trans, &inode_u, -+ dst_start.inode, BTREE_ITER_INTENT); -+ ret2 = PTR_ERR_OR_ZERO(inode_iter); -+ -+ if (!ret2 && -+ inode_u.bi_size < new_i_size) { -+ inode_u.bi_size = new_i_size; -+ ret2 = bch2_inode_write(&trans, inode_iter, &inode_u) ?: -+ bch2_trans_commit(&trans, NULL, journal_seq, 0); -+ } -+ } while (ret2 == -EINTR); -+ -+ ret = bch2_trans_exit(&trans) ?: ret; -+ bkey_on_stack_exit(&new_src, c); -+ -+ percpu_ref_put(&c->writes); -+ -+ return dst_done ?: ret ?: ret2; -+} -diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h -new file mode 100644 -index 000000000000..5445c1cf0797 ---- /dev/null -+++ b/fs/bcachefs/reflink.h -@@ -0,0 +1,31 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_REFLINK_H -+#define _BCACHEFS_REFLINK_H -+ -+const char *bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c); -+void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, -+ struct bkey_s_c); -+enum merge_result bch2_reflink_p_merge(struct bch_fs *, -+ struct bkey_s, struct bkey_s); -+ -+#define bch2_bkey_ops_reflink_p (struct bkey_ops) { \ -+ .key_invalid = bch2_reflink_p_invalid, \ -+ .val_to_text = bch2_reflink_p_to_text, \ -+ .key_merge = 
bch2_reflink_p_merge, \ -+} -+ -+const char *bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c); -+void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, -+ struct bkey_s_c); -+ -+ -+#define bch2_bkey_ops_reflink_v (struct bkey_ops) { \ -+ .key_invalid = bch2_reflink_v_invalid, \ -+ .val_to_text = bch2_reflink_v_to_text, \ -+ .swab = bch2_ptr_swab, \ -+} -+ -+s64 bch2_remap_range(struct bch_fs *, struct bpos, struct bpos, -+ u64, u64 *, u64, s64 *); -+ -+#endif /* _BCACHEFS_REFLINK_H */ -diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c -new file mode 100644 -index 000000000000..6b6506c68609 ---- /dev/null -+++ b/fs/bcachefs/replicas.c -@@ -0,0 +1,1059 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "buckets.h" -+#include "journal.h" -+#include "replicas.h" -+#include "super-io.h" -+ -+static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, -+ struct bch_replicas_cpu *); -+ -+/* Replicas tracking - in memory: */ -+ -+static inline int u8_cmp(u8 l, u8 r) -+{ -+ return cmp_int(l, r); -+} -+ -+static void verify_replicas_entry(struct bch_replicas_entry *e) -+{ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ unsigned i; -+ -+ BUG_ON(e->data_type >= BCH_DATA_NR); -+ BUG_ON(!e->nr_devs); -+ BUG_ON(e->nr_required > 1 && -+ e->nr_required >= e->nr_devs); -+ -+ for (i = 0; i + 1 < e->nr_devs; i++) -+ BUG_ON(e->devs[i] >= e->devs[i + 1]); -+#endif -+} -+ -+static void replicas_entry_sort(struct bch_replicas_entry *e) -+{ -+ bubble_sort(e->devs, e->nr_devs, u8_cmp); -+} -+ -+static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) -+{ -+ eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL); -+} -+ -+void bch2_replicas_entry_to_text(struct printbuf *out, -+ struct bch_replicas_entry *e) -+{ -+ unsigned i; -+ -+ pr_buf(out, "%s: %u/%u [", -+ bch2_data_types[e->data_type], -+ e->nr_required, -+ e->nr_devs); -+ -+ for (i = 0; i < e->nr_devs; i++) -+ pr_buf(out, i ? 
" %u" : "%u", e->devs[i]); -+ pr_buf(out, "]"); -+} -+ -+void bch2_cpu_replicas_to_text(struct printbuf *out, -+ struct bch_replicas_cpu *r) -+{ -+ struct bch_replicas_entry *e; -+ bool first = true; -+ -+ for_each_cpu_replicas_entry(r, e) { -+ if (!first) -+ pr_buf(out, " "); -+ first = false; -+ -+ bch2_replicas_entry_to_text(out, e); -+ } -+} -+ -+static void extent_to_replicas(struct bkey_s_c k, -+ struct bch_replicas_entry *r) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ -+ r->nr_required = 1; -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ if (p.ptr.cached) -+ continue; -+ -+ if (!p.has_ec) -+ r->devs[r->nr_devs++] = p.ptr.dev; -+ else -+ r->nr_required = 0; -+ } -+} -+ -+static void stripe_to_replicas(struct bkey_s_c k, -+ struct bch_replicas_entry *r) -+{ -+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); -+ const struct bch_extent_ptr *ptr; -+ -+ r->nr_required = s.v->nr_blocks - s.v->nr_redundant; -+ -+ for (ptr = s.v->ptrs; -+ ptr < s.v->ptrs + s.v->nr_blocks; -+ ptr++) -+ r->devs[r->nr_devs++] = ptr->dev; -+} -+ -+void bch2_bkey_to_replicas(struct bch_replicas_entry *e, -+ struct bkey_s_c k) -+{ -+ e->nr_devs = 0; -+ -+ switch (k.k->type) { -+ case KEY_TYPE_btree_ptr: -+ case KEY_TYPE_btree_ptr_v2: -+ e->data_type = BCH_DATA_btree; -+ extent_to_replicas(k, e); -+ break; -+ case KEY_TYPE_extent: -+ case KEY_TYPE_reflink_v: -+ e->data_type = BCH_DATA_user; -+ extent_to_replicas(k, e); -+ break; -+ case KEY_TYPE_stripe: -+ e->data_type = BCH_DATA_user; -+ stripe_to_replicas(k, e); -+ break; -+ } -+ -+ replicas_entry_sort(e); -+} -+ -+void bch2_devlist_to_replicas(struct bch_replicas_entry *e, -+ enum bch_data_type data_type, -+ struct bch_devs_list devs) -+{ -+ unsigned i; -+ -+ BUG_ON(!data_type || -+ data_type == BCH_DATA_sb || -+ data_type >= BCH_DATA_NR); -+ -+ e->data_type = data_type; -+ e->nr_devs = 0; -+ e->nr_required = 1; -+ -+ for (i = 0; i < devs.nr; 
i++) -+ e->devs[e->nr_devs++] = devs.devs[i]; -+ -+ replicas_entry_sort(e); -+} -+ -+static struct bch_replicas_cpu -+cpu_replicas_add_entry(struct bch_replicas_cpu *old, -+ struct bch_replicas_entry *new_entry) -+{ -+ unsigned i; -+ struct bch_replicas_cpu new = { -+ .nr = old->nr + 1, -+ .entry_size = max_t(unsigned, old->entry_size, -+ replicas_entry_bytes(new_entry)), -+ }; -+ -+ BUG_ON(!new_entry->data_type); -+ verify_replicas_entry(new_entry); -+ -+ new.entries = kcalloc(new.nr, new.entry_size, GFP_NOIO); -+ if (!new.entries) -+ return new; -+ -+ for (i = 0; i < old->nr; i++) -+ memcpy(cpu_replicas_entry(&new, i), -+ cpu_replicas_entry(old, i), -+ old->entry_size); -+ -+ memcpy(cpu_replicas_entry(&new, old->nr), -+ new_entry, -+ replicas_entry_bytes(new_entry)); -+ -+ bch2_cpu_replicas_sort(&new); -+ return new; -+} -+ -+static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, -+ struct bch_replicas_entry *search) -+{ -+ int idx, entry_size = replicas_entry_bytes(search); -+ -+ if (unlikely(entry_size > r->entry_size)) -+ return -1; -+ -+ verify_replicas_entry(search); -+ -+#define entry_cmp(_l, _r, size) memcmp(_l, _r, entry_size) -+ idx = eytzinger0_find(r->entries, r->nr, r->entry_size, -+ entry_cmp, search); -+#undef entry_cmp -+ -+ return idx < r->nr ? 
idx : -1; -+} -+ -+int bch2_replicas_entry_idx(struct bch_fs *c, -+ struct bch_replicas_entry *search) -+{ -+ replicas_entry_sort(search); -+ -+ return __replicas_entry_idx(&c->replicas, search); -+} -+ -+static bool __replicas_has_entry(struct bch_replicas_cpu *r, -+ struct bch_replicas_entry *search) -+{ -+ return __replicas_entry_idx(r, search) >= 0; -+} -+ -+bool bch2_replicas_marked(struct bch_fs *c, -+ struct bch_replicas_entry *search) -+{ -+ bool marked; -+ -+ if (!search->nr_devs) -+ return true; -+ -+ verify_replicas_entry(search); -+ -+ percpu_down_read(&c->mark_lock); -+ marked = __replicas_has_entry(&c->replicas, search) && -+ (likely((!c->replicas_gc.entries)) || -+ __replicas_has_entry(&c->replicas_gc, search)); -+ percpu_up_read(&c->mark_lock); -+ -+ return marked; -+} -+ -+static void __replicas_table_update(struct bch_fs_usage *dst, -+ struct bch_replicas_cpu *dst_r, -+ struct bch_fs_usage *src, -+ struct bch_replicas_cpu *src_r) -+{ -+ int src_idx, dst_idx; -+ -+ *dst = *src; -+ -+ for (src_idx = 0; src_idx < src_r->nr; src_idx++) { -+ if (!src->replicas[src_idx]) -+ continue; -+ -+ dst_idx = __replicas_entry_idx(dst_r, -+ cpu_replicas_entry(src_r, src_idx)); -+ BUG_ON(dst_idx < 0); -+ -+ dst->replicas[dst_idx] = src->replicas[src_idx]; -+ } -+} -+ -+static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p, -+ struct bch_replicas_cpu *dst_r, -+ struct bch_fs_usage __percpu *src_p, -+ struct bch_replicas_cpu *src_r) -+{ -+ unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr; -+ struct bch_fs_usage *dst, *src = (void *) -+ bch2_acc_percpu_u64s((void *) src_p, src_nr); -+ -+ preempt_disable(); -+ dst = this_cpu_ptr(dst_p); -+ preempt_enable(); -+ -+ __replicas_table_update(dst, dst_r, src, src_r); -+} -+ -+/* -+ * Resize filesystem accounting: -+ */ -+static int replicas_table_update(struct bch_fs *c, -+ struct bch_replicas_cpu *new_r) -+{ -+ struct bch_fs_usage __percpu *new_usage[2] = { NULL, NULL }; -+ 
struct bch_fs_usage *new_scratch = NULL; -+ struct bch_fs_usage __percpu *new_gc = NULL; -+ struct bch_fs_usage *new_base = NULL; -+ unsigned bytes = sizeof(struct bch_fs_usage) + -+ sizeof(u64) * new_r->nr; -+ int ret = -ENOMEM; -+ -+ if (!(new_base = kzalloc(bytes, GFP_NOIO)) || -+ !(new_usage[0] = __alloc_percpu_gfp(bytes, sizeof(u64), -+ GFP_NOIO)) || -+ !(new_usage[1] = __alloc_percpu_gfp(bytes, sizeof(u64), -+ GFP_NOIO)) || -+ !(new_scratch = kmalloc(bytes, GFP_NOIO)) || -+ (c->usage_gc && -+ !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO)))) { -+ bch_err(c, "error updating replicas table: memory allocation failure"); -+ goto err; -+ } -+ -+ if (c->usage_base) -+ __replicas_table_update(new_base, new_r, -+ c->usage_base, &c->replicas); -+ if (c->usage[0]) -+ __replicas_table_update_pcpu(new_usage[0], new_r, -+ c->usage[0], &c->replicas); -+ if (c->usage[1]) -+ __replicas_table_update_pcpu(new_usage[1], new_r, -+ c->usage[1], &c->replicas); -+ if (c->usage_gc) -+ __replicas_table_update_pcpu(new_gc, new_r, -+ c->usage_gc, &c->replicas); -+ -+ swap(c->usage_base, new_base); -+ swap(c->usage[0], new_usage[0]); -+ swap(c->usage[1], new_usage[1]); -+ swap(c->usage_scratch, new_scratch); -+ swap(c->usage_gc, new_gc); -+ swap(c->replicas, *new_r); -+ ret = 0; -+err: -+ free_percpu(new_gc); -+ kfree(new_scratch); -+ free_percpu(new_usage[1]); -+ free_percpu(new_usage[0]); -+ kfree(new_base); -+ return ret; -+} -+ -+static unsigned reserve_journal_replicas(struct bch_fs *c, -+ struct bch_replicas_cpu *r) -+{ -+ struct bch_replicas_entry *e; -+ unsigned journal_res_u64s = 0; -+ -+ /* nr_inodes: */ -+ journal_res_u64s += -+ DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)); -+ -+ /* key_version: */ -+ journal_res_u64s += -+ DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)); -+ -+ /* persistent_reserved: */ -+ journal_res_u64s += -+ DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)) * -+ BCH_REPLICAS_MAX; -+ -+ 
for_each_cpu_replicas_entry(r, e) -+ journal_res_u64s += -+ DIV_ROUND_UP(sizeof(struct jset_entry_data_usage) + -+ e->nr_devs, sizeof(u64)); -+ return journal_res_u64s; -+} -+ -+noinline -+static int bch2_mark_replicas_slowpath(struct bch_fs *c, -+ struct bch_replicas_entry *new_entry) -+{ -+ struct bch_replicas_cpu new_r, new_gc; -+ int ret = 0; -+ -+ verify_replicas_entry(new_entry); -+ -+ memset(&new_r, 0, sizeof(new_r)); -+ memset(&new_gc, 0, sizeof(new_gc)); -+ -+ mutex_lock(&c->sb_lock); -+ -+ if (c->replicas_gc.entries && -+ !__replicas_has_entry(&c->replicas_gc, new_entry)) { -+ new_gc = cpu_replicas_add_entry(&c->replicas_gc, new_entry); -+ if (!new_gc.entries) -+ goto err; -+ } -+ -+ if (!__replicas_has_entry(&c->replicas, new_entry)) { -+ new_r = cpu_replicas_add_entry(&c->replicas, new_entry); -+ if (!new_r.entries) -+ goto err; -+ -+ ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r); -+ if (ret) -+ goto err; -+ -+ bch2_journal_entry_res_resize(&c->journal, -+ &c->replicas_journal_res, -+ reserve_journal_replicas(c, &new_r)); -+ } -+ -+ if (!new_r.entries && -+ !new_gc.entries) -+ goto out; -+ -+ /* allocations done, now commit: */ -+ -+ if (new_r.entries) -+ bch2_write_super(c); -+ -+ /* don't update in memory replicas until changes are persistent */ -+ percpu_down_write(&c->mark_lock); -+ if (new_r.entries) -+ ret = replicas_table_update(c, &new_r); -+ if (new_gc.entries) -+ swap(new_gc, c->replicas_gc); -+ percpu_up_write(&c->mark_lock); -+out: -+ mutex_unlock(&c->sb_lock); -+ -+ kfree(new_r.entries); -+ kfree(new_gc.entries); -+ -+ return ret; -+err: -+ bch_err(c, "error adding replicas entry: memory allocation failure"); -+ ret = -ENOMEM; -+ goto out; -+} -+ -+static int __bch2_mark_replicas(struct bch_fs *c, -+ struct bch_replicas_entry *r, -+ bool check) -+{ -+ return likely(bch2_replicas_marked(c, r)) ? 0 -+ : check ? 
-1 -+ : bch2_mark_replicas_slowpath(c, r); -+} -+ -+int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r) -+{ -+ return __bch2_mark_replicas(c, r, false); -+} -+ -+static int __bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k, -+ bool check) -+{ -+ struct bch_replicas_padded search; -+ struct bch_devs_list cached = bch2_bkey_cached_devs(k); -+ unsigned i; -+ int ret; -+ -+ for (i = 0; i < cached.nr; i++) { -+ bch2_replicas_entry_cached(&search.e, cached.devs[i]); -+ -+ ret = __bch2_mark_replicas(c, &search.e, check); -+ if (ret) -+ return ret; -+ } -+ -+ bch2_bkey_to_replicas(&search.e, k); -+ -+ return __bch2_mark_replicas(c, &search.e, check); -+} -+ -+bool bch2_bkey_replicas_marked(struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ return __bch2_mark_bkey_replicas(c, k, true) == 0; -+} -+ -+int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) -+{ -+ return __bch2_mark_bkey_replicas(c, k, false); -+} -+ -+int bch2_replicas_gc_end(struct bch_fs *c, int ret) -+{ -+ unsigned i; -+ -+ lockdep_assert_held(&c->replicas_gc_lock); -+ -+ mutex_lock(&c->sb_lock); -+ percpu_down_write(&c->mark_lock); -+ -+ /* -+ * this is kind of crappy; the replicas gc mechanism needs to be ripped -+ * out -+ */ -+ -+ for (i = 0; i < c->replicas.nr; i++) { -+ struct bch_replicas_entry *e = -+ cpu_replicas_entry(&c->replicas, i); -+ struct bch_replicas_cpu n; -+ -+ if (!__replicas_has_entry(&c->replicas_gc, e) && -+ (c->usage_base->replicas[i] || -+ percpu_u64_get(&c->usage[0]->replicas[i]) || -+ percpu_u64_get(&c->usage[1]->replicas[i]))) { -+ n = cpu_replicas_add_entry(&c->replicas_gc, e); -+ if (!n.entries) { -+ ret = -ENOSPC; -+ goto err; -+ } -+ -+ swap(n, c->replicas_gc); -+ kfree(n.entries); -+ } -+ } -+ -+ if (bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc)) { -+ ret = -ENOSPC; -+ goto err; -+ } -+ -+ ret = replicas_table_update(c, &c->replicas_gc); -+err: -+ kfree(c->replicas_gc.entries); -+ c->replicas_gc.entries = NULL; -+ -+ 
percpu_up_write(&c->mark_lock); -+ -+ if (!ret) -+ bch2_write_super(c); -+ -+ mutex_unlock(&c->sb_lock); -+ -+ return ret; -+} -+ -+int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) -+{ -+ struct bch_replicas_entry *e; -+ unsigned i = 0; -+ -+ lockdep_assert_held(&c->replicas_gc_lock); -+ -+ mutex_lock(&c->sb_lock); -+ BUG_ON(c->replicas_gc.entries); -+ -+ c->replicas_gc.nr = 0; -+ c->replicas_gc.entry_size = 0; -+ -+ for_each_cpu_replicas_entry(&c->replicas, e) -+ if (!((1 << e->data_type) & typemask)) { -+ c->replicas_gc.nr++; -+ c->replicas_gc.entry_size = -+ max_t(unsigned, c->replicas_gc.entry_size, -+ replicas_entry_bytes(e)); -+ } -+ -+ c->replicas_gc.entries = kcalloc(c->replicas_gc.nr, -+ c->replicas_gc.entry_size, -+ GFP_NOIO); -+ if (!c->replicas_gc.entries) { -+ mutex_unlock(&c->sb_lock); -+ bch_err(c, "error allocating c->replicas_gc"); -+ return -ENOMEM; -+ } -+ -+ for_each_cpu_replicas_entry(&c->replicas, e) -+ if (!((1 << e->data_type) & typemask)) -+ memcpy(cpu_replicas_entry(&c->replicas_gc, i++), -+ e, c->replicas_gc.entry_size); -+ -+ bch2_cpu_replicas_sort(&c->replicas_gc); -+ mutex_unlock(&c->sb_lock); -+ -+ return 0; -+} -+ -+int bch2_replicas_gc2(struct bch_fs *c) -+{ -+ struct bch_replicas_cpu new = { 0 }; -+ unsigned i, nr; -+ int ret = 0; -+ -+ bch2_journal_meta(&c->journal); -+retry: -+ nr = READ_ONCE(c->replicas.nr); -+ new.entry_size = READ_ONCE(c->replicas.entry_size); -+ new.entries = kcalloc(nr, new.entry_size, GFP_KERNEL); -+ if (!new.entries) { -+ bch_err(c, "error allocating c->replicas_gc"); -+ return -ENOMEM; -+ } -+ -+ mutex_lock(&c->sb_lock); -+ percpu_down_write(&c->mark_lock); -+ -+ if (nr != c->replicas.nr || -+ new.entry_size != c->replicas.entry_size) { -+ percpu_up_write(&c->mark_lock); -+ mutex_unlock(&c->sb_lock); -+ kfree(new.entries); -+ goto retry; -+ } -+ -+ for (i = 0; i < c->replicas.nr; i++) { -+ struct bch_replicas_entry *e = -+ cpu_replicas_entry(&c->replicas, i); -+ -+ if (e->data_type == 
BCH_DATA_journal || -+ c->usage_base->replicas[i] || -+ percpu_u64_get(&c->usage[0]->replicas[i]) || -+ percpu_u64_get(&c->usage[1]->replicas[i])) -+ memcpy(cpu_replicas_entry(&new, new.nr++), -+ e, new.entry_size); -+ } -+ -+ bch2_cpu_replicas_sort(&new); -+ -+ if (bch2_cpu_replicas_to_sb_replicas(c, &new)) { -+ ret = -ENOSPC; -+ goto err; -+ } -+ -+ ret = replicas_table_update(c, &new); -+err: -+ kfree(new.entries); -+ -+ percpu_up_write(&c->mark_lock); -+ -+ if (!ret) -+ bch2_write_super(c); -+ -+ mutex_unlock(&c->sb_lock); -+ -+ return ret; -+} -+ -+int bch2_replicas_set_usage(struct bch_fs *c, -+ struct bch_replicas_entry *r, -+ u64 sectors) -+{ -+ int ret, idx = bch2_replicas_entry_idx(c, r); -+ -+ if (idx < 0) { -+ struct bch_replicas_cpu n; -+ -+ n = cpu_replicas_add_entry(&c->replicas, r); -+ if (!n.entries) -+ return -ENOMEM; -+ -+ ret = replicas_table_update(c, &n); -+ if (ret) -+ return ret; -+ -+ kfree(n.entries); -+ -+ idx = bch2_replicas_entry_idx(c, r); -+ BUG_ON(ret < 0); -+ } -+ -+ c->usage_base->replicas[idx] = sectors; -+ -+ return 0; -+} -+ -+/* Replicas tracking - superblock: */ -+ -+static int -+__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, -+ struct bch_replicas_cpu *cpu_r) -+{ -+ struct bch_replicas_entry *e, *dst; -+ unsigned nr = 0, entry_size = 0, idx = 0; -+ -+ for_each_replicas_entry(sb_r, e) { -+ entry_size = max_t(unsigned, entry_size, -+ replicas_entry_bytes(e)); -+ nr++; -+ } -+ -+ cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO); -+ if (!cpu_r->entries) -+ return -ENOMEM; -+ -+ cpu_r->nr = nr; -+ cpu_r->entry_size = entry_size; -+ -+ for_each_replicas_entry(sb_r, e) { -+ dst = cpu_replicas_entry(cpu_r, idx++); -+ memcpy(dst, e, replicas_entry_bytes(e)); -+ replicas_entry_sort(dst); -+ } -+ -+ return 0; -+} -+ -+static int -+__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, -+ struct bch_replicas_cpu *cpu_r) -+{ -+ struct bch_replicas_entry_v0 *e; -+ unsigned nr = 0, entry_size 
= 0, idx = 0; -+ -+ for_each_replicas_entry(sb_r, e) { -+ entry_size = max_t(unsigned, entry_size, -+ replicas_entry_bytes(e)); -+ nr++; -+ } -+ -+ entry_size += sizeof(struct bch_replicas_entry) - -+ sizeof(struct bch_replicas_entry_v0); -+ -+ cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO); -+ if (!cpu_r->entries) -+ return -ENOMEM; -+ -+ cpu_r->nr = nr; -+ cpu_r->entry_size = entry_size; -+ -+ for_each_replicas_entry(sb_r, e) { -+ struct bch_replicas_entry *dst = -+ cpu_replicas_entry(cpu_r, idx++); -+ -+ dst->data_type = e->data_type; -+ dst->nr_devs = e->nr_devs; -+ dst->nr_required = 1; -+ memcpy(dst->devs, e->devs, e->nr_devs); -+ replicas_entry_sort(dst); -+ } -+ -+ return 0; -+} -+ -+int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) -+{ -+ struct bch_sb_field_replicas *sb_v1; -+ struct bch_sb_field_replicas_v0 *sb_v0; -+ struct bch_replicas_cpu new_r = { 0, 0, NULL }; -+ int ret = 0; -+ -+ if ((sb_v1 = bch2_sb_get_replicas(c->disk_sb.sb))) -+ ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r); -+ else if ((sb_v0 = bch2_sb_get_replicas_v0(c->disk_sb.sb))) -+ ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r); -+ -+ if (ret) -+ return -ENOMEM; -+ -+ bch2_cpu_replicas_sort(&new_r); -+ -+ percpu_down_write(&c->mark_lock); -+ -+ ret = replicas_table_update(c, &new_r); -+ percpu_up_write(&c->mark_lock); -+ -+ kfree(new_r.entries); -+ -+ return 0; -+} -+ -+static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c, -+ struct bch_replicas_cpu *r) -+{ -+ struct bch_sb_field_replicas_v0 *sb_r; -+ struct bch_replicas_entry_v0 *dst; -+ struct bch_replicas_entry *src; -+ size_t bytes; -+ -+ bytes = sizeof(struct bch_sb_field_replicas); -+ -+ for_each_cpu_replicas_entry(r, src) -+ bytes += replicas_entry_bytes(src) - 1; -+ -+ sb_r = bch2_sb_resize_replicas_v0(&c->disk_sb, -+ DIV_ROUND_UP(bytes, sizeof(u64))); -+ if (!sb_r) -+ return -ENOSPC; -+ -+ bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas); -+ sb_r = 
bch2_sb_get_replicas_v0(c->disk_sb.sb); -+ -+ memset(&sb_r->entries, 0, -+ vstruct_end(&sb_r->field) - -+ (void *) &sb_r->entries); -+ -+ dst = sb_r->entries; -+ for_each_cpu_replicas_entry(r, src) { -+ dst->data_type = src->data_type; -+ dst->nr_devs = src->nr_devs; -+ memcpy(dst->devs, src->devs, src->nr_devs); -+ -+ dst = replicas_entry_next(dst); -+ -+ BUG_ON((void *) dst > vstruct_end(&sb_r->field)); -+ } -+ -+ return 0; -+} -+ -+static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, -+ struct bch_replicas_cpu *r) -+{ -+ struct bch_sb_field_replicas *sb_r; -+ struct bch_replicas_entry *dst, *src; -+ bool need_v1 = false; -+ size_t bytes; -+ -+ bytes = sizeof(struct bch_sb_field_replicas); -+ -+ for_each_cpu_replicas_entry(r, src) { -+ bytes += replicas_entry_bytes(src); -+ if (src->nr_required != 1) -+ need_v1 = true; -+ } -+ -+ if (!need_v1) -+ return bch2_cpu_replicas_to_sb_replicas_v0(c, r); -+ -+ sb_r = bch2_sb_resize_replicas(&c->disk_sb, -+ DIV_ROUND_UP(bytes, sizeof(u64))); -+ if (!sb_r) -+ return -ENOSPC; -+ -+ bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0); -+ sb_r = bch2_sb_get_replicas(c->disk_sb.sb); -+ -+ memset(&sb_r->entries, 0, -+ vstruct_end(&sb_r->field) - -+ (void *) &sb_r->entries); -+ -+ dst = sb_r->entries; -+ for_each_cpu_replicas_entry(r, src) { -+ memcpy(dst, src, replicas_entry_bytes(src)); -+ -+ dst = replicas_entry_next(dst); -+ -+ BUG_ON((void *) dst > vstruct_end(&sb_r->field)); -+ } -+ -+ return 0; -+} -+ -+static const char *check_dup_replicas_entries(struct bch_replicas_cpu *cpu_r) -+{ -+ unsigned i; -+ -+ sort_cmp_size(cpu_r->entries, -+ cpu_r->nr, -+ cpu_r->entry_size, -+ memcmp, NULL); -+ -+ for (i = 0; i + 1 < cpu_r->nr; i++) { -+ struct bch_replicas_entry *l = -+ cpu_replicas_entry(cpu_r, i); -+ struct bch_replicas_entry *r = -+ cpu_replicas_entry(cpu_r, i + 1); -+ -+ BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0); -+ -+ if (!memcmp(l, r, cpu_r->entry_size)) -+ return "duplicate replicas entry"; -+ } -+ -+ 
return NULL; -+} -+ -+static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f) -+{ -+ struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); -+ struct bch_sb_field_members *mi = bch2_sb_get_members(sb); -+ struct bch_replicas_cpu cpu_r = { .entries = NULL }; -+ struct bch_replicas_entry *e; -+ const char *err; -+ unsigned i; -+ -+ for_each_replicas_entry(sb_r, e) { -+ err = "invalid replicas entry: invalid data type"; -+ if (e->data_type >= BCH_DATA_NR) -+ goto err; -+ -+ err = "invalid replicas entry: no devices"; -+ if (!e->nr_devs) -+ goto err; -+ -+ err = "invalid replicas entry: bad nr_required"; -+ if (e->nr_required > 1 && -+ e->nr_required >= e->nr_devs) -+ goto err; -+ -+ err = "invalid replicas entry: invalid device"; -+ for (i = 0; i < e->nr_devs; i++) -+ if (!bch2_dev_exists(sb, mi, e->devs[i])) -+ goto err; -+ } -+ -+ err = "cannot allocate memory"; -+ if (__bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r)) -+ goto err; -+ -+ err = check_dup_replicas_entries(&cpu_r); -+err: -+ kfree(cpu_r.entries); -+ return err; -+} -+ -+static void bch2_sb_replicas_to_text(struct printbuf *out, -+ struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_replicas *r = field_to_type(f, replicas); -+ struct bch_replicas_entry *e; -+ bool first = true; -+ -+ for_each_replicas_entry(r, e) { -+ if (!first) -+ pr_buf(out, " "); -+ first = false; -+ -+ bch2_replicas_entry_to_text(out, e); -+ } -+} -+ -+const struct bch_sb_field_ops bch_sb_field_ops_replicas = { -+ .validate = bch2_sb_validate_replicas, -+ .to_text = bch2_sb_replicas_to_text, -+}; -+ -+static const char *bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb_field *f) -+{ -+ struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); -+ struct bch_sb_field_members *mi = bch2_sb_get_members(sb); -+ struct bch_replicas_cpu cpu_r = { .entries = NULL }; -+ struct bch_replicas_entry_v0 *e; -+ const char *err; -+ unsigned i; -+ -+ 
for_each_replicas_entry_v0(sb_r, e) { -+ err = "invalid replicas entry: invalid data type"; -+ if (e->data_type >= BCH_DATA_NR) -+ goto err; -+ -+ err = "invalid replicas entry: no devices"; -+ if (!e->nr_devs) -+ goto err; -+ -+ err = "invalid replicas entry: invalid device"; -+ for (i = 0; i < e->nr_devs; i++) -+ if (!bch2_dev_exists(sb, mi, e->devs[i])) -+ goto err; -+ } -+ -+ err = "cannot allocate memory"; -+ if (__bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r)) -+ goto err; -+ -+ err = check_dup_replicas_entries(&cpu_r); -+err: -+ kfree(cpu_r.entries); -+ return err; -+} -+ -+const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { -+ .validate = bch2_sb_validate_replicas_v0, -+}; -+ -+/* Query replicas: */ -+ -+struct replicas_status __bch2_replicas_status(struct bch_fs *c, -+ struct bch_devs_mask online_devs) -+{ -+ struct bch_sb_field_members *mi; -+ struct bch_replicas_entry *e; -+ unsigned i, nr_online, nr_offline; -+ struct replicas_status ret; -+ -+ memset(&ret, 0, sizeof(ret)); -+ -+ for (i = 0; i < ARRAY_SIZE(ret.replicas); i++) -+ ret.replicas[i].redundancy = INT_MAX; -+ -+ mi = bch2_sb_get_members(c->disk_sb.sb); -+ -+ percpu_down_read(&c->mark_lock); -+ -+ for_each_cpu_replicas_entry(&c->replicas, e) { -+ if (e->data_type >= ARRAY_SIZE(ret.replicas)) -+ panic("e %p data_type %u\n", e, e->data_type); -+ -+ nr_online = nr_offline = 0; -+ -+ for (i = 0; i < e->nr_devs; i++) { -+ BUG_ON(!bch2_dev_exists(c->disk_sb.sb, mi, -+ e->devs[i])); -+ -+ if (test_bit(e->devs[i], online_devs.d)) -+ nr_online++; -+ else -+ nr_offline++; -+ } -+ -+ ret.replicas[e->data_type].redundancy = -+ min(ret.replicas[e->data_type].redundancy, -+ (int) nr_online - (int) e->nr_required); -+ -+ ret.replicas[e->data_type].nr_offline = -+ max(ret.replicas[e->data_type].nr_offline, -+ nr_offline); -+ } -+ -+ percpu_up_read(&c->mark_lock); -+ -+ for (i = 0; i < ARRAY_SIZE(ret.replicas); i++) -+ if (ret.replicas[i].redundancy == INT_MAX) -+ ret.replicas[i].redundancy = 0; 
-+ -+ return ret; -+} -+ -+struct replicas_status bch2_replicas_status(struct bch_fs *c) -+{ -+ return __bch2_replicas_status(c, bch2_online_devs(c)); -+} -+ -+static bool have_enough_devs(struct replicas_status s, -+ enum bch_data_type type, -+ bool force_if_degraded, -+ bool force_if_lost) -+{ -+ return (!s.replicas[type].nr_offline || force_if_degraded) && -+ (s.replicas[type].redundancy >= 0 || force_if_lost); -+} -+ -+bool bch2_have_enough_devs(struct replicas_status s, unsigned flags) -+{ -+ return (have_enough_devs(s, BCH_DATA_journal, -+ flags & BCH_FORCE_IF_METADATA_DEGRADED, -+ flags & BCH_FORCE_IF_METADATA_LOST) && -+ have_enough_devs(s, BCH_DATA_btree, -+ flags & BCH_FORCE_IF_METADATA_DEGRADED, -+ flags & BCH_FORCE_IF_METADATA_LOST) && -+ have_enough_devs(s, BCH_DATA_user, -+ flags & BCH_FORCE_IF_DATA_DEGRADED, -+ flags & BCH_FORCE_IF_DATA_LOST)); -+} -+ -+int bch2_replicas_online(struct bch_fs *c, bool meta) -+{ -+ struct replicas_status s = bch2_replicas_status(c); -+ -+ return (meta -+ ? 
min(s.replicas[BCH_DATA_journal].redundancy, -+ s.replicas[BCH_DATA_btree].redundancy) -+ : s.replicas[BCH_DATA_user].redundancy) + 1; -+} -+ -+unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct bch_replicas_entry *e; -+ unsigned i, ret = 0; -+ -+ percpu_down_read(&c->mark_lock); -+ -+ for_each_cpu_replicas_entry(&c->replicas, e) -+ for (i = 0; i < e->nr_devs; i++) -+ if (e->devs[i] == ca->dev_idx) -+ ret |= 1 << e->data_type; -+ -+ percpu_up_read(&c->mark_lock); -+ -+ return ret; -+} -+ -+int bch2_fs_replicas_init(struct bch_fs *c) -+{ -+ c->journal.entry_u64s_reserved += -+ reserve_journal_replicas(c, &c->replicas); -+ -+ return replicas_table_update(c, &c->replicas); -+} -diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h -new file mode 100644 -index 000000000000..8b95164fbb56 ---- /dev/null -+++ b/fs/bcachefs/replicas.h -@@ -0,0 +1,91 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_REPLICAS_H -+#define _BCACHEFS_REPLICAS_H -+ -+#include "eytzinger.h" -+#include "replicas_types.h" -+ -+void bch2_replicas_entry_to_text(struct printbuf *, -+ struct bch_replicas_entry *); -+void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *); -+ -+static inline struct bch_replicas_entry * -+cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i) -+{ -+ return (void *) r->entries + r->entry_size * i; -+} -+ -+int bch2_replicas_entry_idx(struct bch_fs *, -+ struct bch_replicas_entry *); -+ -+void bch2_devlist_to_replicas(struct bch_replicas_entry *, -+ enum bch_data_type, -+ struct bch_devs_list); -+bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry *); -+int bch2_mark_replicas(struct bch_fs *, -+ struct bch_replicas_entry *); -+ -+void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c); -+bool bch2_bkey_replicas_marked(struct bch_fs *, struct bkey_s_c); -+int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c); -+ -+static inline void 
bch2_replicas_entry_cached(struct bch_replicas_entry *e, -+ unsigned dev) -+{ -+ e->data_type = BCH_DATA_cached; -+ e->nr_devs = 1; -+ e->nr_required = 1; -+ e->devs[0] = dev; -+} -+ -+struct replicas_status { -+ struct { -+ int redundancy; -+ unsigned nr_offline; -+ } replicas[BCH_DATA_NR]; -+}; -+ -+struct replicas_status __bch2_replicas_status(struct bch_fs *, -+ struct bch_devs_mask); -+struct replicas_status bch2_replicas_status(struct bch_fs *); -+bool bch2_have_enough_devs(struct replicas_status, unsigned); -+ -+int bch2_replicas_online(struct bch_fs *, bool); -+unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); -+ -+int bch2_replicas_gc_end(struct bch_fs *, int); -+int bch2_replicas_gc_start(struct bch_fs *, unsigned); -+int bch2_replicas_gc2(struct bch_fs *); -+ -+int bch2_replicas_set_usage(struct bch_fs *, -+ struct bch_replicas_entry *, -+ u64); -+ -+#define for_each_cpu_replicas_entry(_r, _i) \ -+ for (_i = (_r)->entries; \ -+ (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\ -+ _i = (void *) (_i) + (_r)->entry_size) -+ -+/* iterate over superblock replicas - used by userspace tools: */ -+ -+#define replicas_entry_next(_i) \ -+ ((typeof(_i)) ((void *) (_i) + replicas_entry_bytes(_i))) -+ -+#define for_each_replicas_entry(_r, _i) \ -+ for (_i = (_r)->entries; \ -+ (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ -+ (_i) = replicas_entry_next(_i)) -+ -+#define for_each_replicas_entry_v0(_r, _i) \ -+ for (_i = (_r)->entries; \ -+ (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ -+ (_i) = replicas_entry_next(_i)) -+ -+int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *); -+ -+extern const struct bch_sb_field_ops bch_sb_field_ops_replicas; -+extern const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0; -+ -+int bch2_fs_replicas_init(struct bch_fs *); -+ -+#endif /* _BCACHEFS_REPLICAS_H */ -diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h -new file mode 100644 -index 
000000000000..0535b1d3760e ---- /dev/null -+++ b/fs/bcachefs/replicas_types.h -@@ -0,0 +1,10 @@ -+#ifndef _BCACHEFS_REPLICAS_TYPES_H -+#define _BCACHEFS_REPLICAS_TYPES_H -+ -+struct bch_replicas_cpu { -+ unsigned nr; -+ unsigned entry_size; -+ struct bch_replicas_entry *entries; -+}; -+ -+#endif /* _BCACHEFS_REPLICAS_TYPES_H */ -diff --git a/fs/bcachefs/siphash.c b/fs/bcachefs/siphash.c -new file mode 100644 -index 000000000000..c062edb3fbc2 ---- /dev/null -+++ b/fs/bcachefs/siphash.c -@@ -0,0 +1,173 @@ -+// SPDX-License-Identifier: BSD-3-Clause -+/* $OpenBSD: siphash.c,v 1.3 2015/02/20 11:51:03 tedu Exp $ */ -+ -+/*- -+ * Copyright (c) 2013 Andre Oppermann -+ * All rights reserved. -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * 1. Redistributions of source code must retain the above copyright -+ * notice, this list of conditions and the following disclaimer. -+ * 2. Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in the -+ * documentation and/or other materials provided with the distribution. -+ * 3. The name of the author may not be used to endorse or promote -+ * products derived from this software without specific prior written -+ * permission. -+ * -+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND -+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE -+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -+ * SUCH DAMAGE. -+ */ -+ -+/* -+ * SipHash is a family of PRFs SipHash-c-d where the integer parameters c and d -+ * are the number of compression rounds and the number of finalization rounds. -+ * A compression round is identical to a finalization round and this round -+ * function is called SipRound. Given a 128-bit key k and a (possibly empty) -+ * byte string m, SipHash-c-d returns a 64-bit value SipHash-c-d(k; m). -+ * -+ * Implemented from the paper "SipHash: a fast short-input PRF", 2012.09.18, -+ * by Jean-Philippe Aumasson and Daniel J. 
Bernstein, -+ * Permanent Document ID b9a943a805fbfc6fde808af9fc0ecdfa -+ * https://131002.net/siphash/siphash.pdf -+ * https://131002.net/siphash/ -+ */ -+ -+#include -+#include -+#include -+#include -+ -+#include "siphash.h" -+ -+static void SipHash_Rounds(SIPHASH_CTX *ctx, int rounds) -+{ -+ while (rounds--) { -+ ctx->v[0] += ctx->v[1]; -+ ctx->v[2] += ctx->v[3]; -+ ctx->v[1] = rol64(ctx->v[1], 13); -+ ctx->v[3] = rol64(ctx->v[3], 16); -+ -+ ctx->v[1] ^= ctx->v[0]; -+ ctx->v[3] ^= ctx->v[2]; -+ ctx->v[0] = rol64(ctx->v[0], 32); -+ -+ ctx->v[2] += ctx->v[1]; -+ ctx->v[0] += ctx->v[3]; -+ ctx->v[1] = rol64(ctx->v[1], 17); -+ ctx->v[3] = rol64(ctx->v[3], 21); -+ -+ ctx->v[1] ^= ctx->v[2]; -+ ctx->v[3] ^= ctx->v[0]; -+ ctx->v[2] = rol64(ctx->v[2], 32); -+ } -+} -+ -+static void SipHash_CRounds(SIPHASH_CTX *ctx, const void *ptr, int rounds) -+{ -+ u64 m = get_unaligned_le64(ptr); -+ -+ ctx->v[3] ^= m; -+ SipHash_Rounds(ctx, rounds); -+ ctx->v[0] ^= m; -+} -+ -+void SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key) -+{ -+ u64 k0, k1; -+ -+ k0 = le64_to_cpu(key->k0); -+ k1 = le64_to_cpu(key->k1); -+ -+ ctx->v[0] = 0x736f6d6570736575ULL ^ k0; -+ ctx->v[1] = 0x646f72616e646f6dULL ^ k1; -+ ctx->v[2] = 0x6c7967656e657261ULL ^ k0; -+ ctx->v[3] = 0x7465646279746573ULL ^ k1; -+ -+ memset(ctx->buf, 0, sizeof(ctx->buf)); -+ ctx->bytes = 0; -+} -+ -+void SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf, -+ const void *src, size_t len) -+{ -+ const u8 *ptr = src; -+ size_t left, used; -+ -+ if (len == 0) -+ return; -+ -+ used = ctx->bytes % sizeof(ctx->buf); -+ ctx->bytes += len; -+ -+ if (used > 0) { -+ left = sizeof(ctx->buf) - used; -+ -+ if (len >= left) { -+ memcpy(&ctx->buf[used], ptr, left); -+ SipHash_CRounds(ctx, ctx->buf, rc); -+ len -= left; -+ ptr += left; -+ } else { -+ memcpy(&ctx->buf[used], ptr, len); -+ return; -+ } -+ } -+ -+ while (len >= sizeof(ctx->buf)) { -+ SipHash_CRounds(ctx, ptr, rc); -+ len -= sizeof(ctx->buf); -+ ptr += sizeof(ctx->buf); -+ } -+ -+ 
if (len > 0) -+ memcpy(&ctx->buf[used], ptr, len); -+} -+ -+void SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf) -+{ -+ u64 r; -+ -+ r = SipHash_End(ctx, rc, rf); -+ -+ *((__le64 *) dst) = cpu_to_le64(r); -+} -+ -+u64 SipHash_End(SIPHASH_CTX *ctx, int rc, int rf) -+{ -+ u64 r; -+ size_t left, used; -+ -+ used = ctx->bytes % sizeof(ctx->buf); -+ left = sizeof(ctx->buf) - used; -+ memset(&ctx->buf[used], 0, left - 1); -+ ctx->buf[7] = ctx->bytes; -+ -+ SipHash_CRounds(ctx, ctx->buf, rc); -+ ctx->v[2] ^= 0xff; -+ SipHash_Rounds(ctx, rf); -+ -+ r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]); -+ memset(ctx, 0, sizeof(*ctx)); -+ return (r); -+} -+ -+u64 SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len) -+{ -+ SIPHASH_CTX ctx; -+ -+ SipHash_Init(&ctx, key); -+ SipHash_Update(&ctx, rc, rf, src, len); -+ return SipHash_End(&ctx, rc, rf); -+} -diff --git a/fs/bcachefs/siphash.h b/fs/bcachefs/siphash.h -new file mode 100644 -index 000000000000..3dfaf34a43b2 ---- /dev/null -+++ b/fs/bcachefs/siphash.h -@@ -0,0 +1,87 @@ -+/* SPDX-License-Identifier: BSD-3-Clause */ -+/* $OpenBSD: siphash.h,v 1.5 2015/02/20 11:51:03 tedu Exp $ */ -+/*- -+ * Copyright (c) 2013 Andre Oppermann -+ * All rights reserved. -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * 1. Redistributions of source code must retain the above copyright -+ * notice, this list of conditions and the following disclaimer. -+ * 2. Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in the -+ * documentation and/or other materials provided with the distribution. -+ * 3. The name of the author may not be used to endorse or promote -+ * products derived from this software without specific prior written -+ * permission. 
-+ * -+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND -+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE -+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -+ * SUCH DAMAGE. -+ * -+ * $FreeBSD$ -+ */ -+ -+/* -+ * SipHash is a family of pseudorandom functions (a.k.a. keyed hash functions) -+ * optimized for speed on short messages returning a 64bit hash/digest value. -+ * -+ * The number of rounds is defined during the initialization: -+ * SipHash24_Init() for the fast and resonable strong version -+ * SipHash48_Init() for the strong version (half as fast) -+ * -+ * struct SIPHASH_CTX ctx; -+ * SipHash24_Init(&ctx); -+ * SipHash_SetKey(&ctx, "16bytes long key"); -+ * SipHash_Update(&ctx, pointer_to_string, length_of_string); -+ * SipHash_Final(output, &ctx); -+ */ -+ -+#ifndef _SIPHASH_H_ -+#define _SIPHASH_H_ -+ -+#include -+ -+#define SIPHASH_BLOCK_LENGTH 8 -+#define SIPHASH_KEY_LENGTH 16 -+#define SIPHASH_DIGEST_LENGTH 8 -+ -+typedef struct _SIPHASH_CTX { -+ u64 v[4]; -+ u8 buf[SIPHASH_BLOCK_LENGTH]; -+ u32 bytes; -+} SIPHASH_CTX; -+ -+typedef struct { -+ __le64 k0; -+ __le64 k1; -+} SIPHASH_KEY; -+ -+void SipHash_Init(SIPHASH_CTX *, const SIPHASH_KEY *); -+void SipHash_Update(SIPHASH_CTX *, int, int, const void *, size_t); -+u64 SipHash_End(SIPHASH_CTX *, int, int); -+void SipHash_Final(void *, SIPHASH_CTX *, int, int); -+u64 SipHash(const SIPHASH_KEY 
*, int, int, const void *, size_t); -+ -+#define SipHash24_Init(_c, _k) SipHash_Init((_c), (_k)) -+#define SipHash24_Update(_c, _p, _l) SipHash_Update((_c), 2, 4, (_p), (_l)) -+#define SipHash24_End(_d) SipHash_End((_d), 2, 4) -+#define SipHash24_Final(_d, _c) SipHash_Final((_d), (_c), 2, 4) -+#define SipHash24(_k, _p, _l) SipHash((_k), 2, 4, (_p), (_l)) -+ -+#define SipHash48_Init(_c, _k) SipHash_Init((_c), (_k)) -+#define SipHash48_Update(_c, _p, _l) SipHash_Update((_c), 4, 8, (_p), (_l)) -+#define SipHash48_End(_d) SipHash_End((_d), 4, 8) -+#define SipHash48_Final(_d, _c) SipHash_Final((_d), (_c), 4, 8) -+#define SipHash48(_k, _p, _l) SipHash((_k), 4, 8, (_p), (_l)) -+ -+#endif /* _SIPHASH_H_ */ -diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h -new file mode 100644 -index 000000000000..dea9b7252b88 ---- /dev/null -+++ b/fs/bcachefs/str_hash.h -@@ -0,0 +1,336 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_STR_HASH_H -+#define _BCACHEFS_STR_HASH_H -+ -+#include "btree_iter.h" -+#include "btree_update.h" -+#include "checksum.h" -+#include "error.h" -+#include "inode.h" -+#include "siphash.h" -+#include "super.h" -+ -+#include -+#include -+#include -+ -+static inline enum bch_str_hash_type -+bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) -+{ -+ switch (opt) { -+ case BCH_STR_HASH_OPT_CRC32C: -+ return BCH_STR_HASH_CRC32C; -+ case BCH_STR_HASH_OPT_CRC64: -+ return BCH_STR_HASH_CRC64; -+ case BCH_STR_HASH_OPT_SIPHASH: -+ return c->sb.features & (1ULL << BCH_FEATURE_new_siphash) -+ ? 
BCH_STR_HASH_SIPHASH -+ : BCH_STR_HASH_SIPHASH_OLD; -+ default: -+ BUG(); -+ } -+} -+ -+struct bch_hash_info { -+ u8 type; -+ union { -+ __le64 crc_key; -+ SIPHASH_KEY siphash_key; -+ }; -+}; -+ -+static inline struct bch_hash_info -+bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) -+{ -+ /* XXX ick */ -+ struct bch_hash_info info = { -+ .type = (bi->bi_flags >> INODE_STR_HASH_OFFSET) & -+ ~(~0U << INODE_STR_HASH_BITS), -+ .crc_key = bi->bi_hash_seed, -+ }; -+ -+ if (unlikely(info.type == BCH_STR_HASH_SIPHASH_OLD)) { -+ SHASH_DESC_ON_STACK(desc, c->sha256); -+ u8 digest[SHA256_DIGEST_SIZE]; -+ -+ desc->tfm = c->sha256; -+ -+ crypto_shash_digest(desc, (void *) &bi->bi_hash_seed, -+ sizeof(bi->bi_hash_seed), digest); -+ memcpy(&info.siphash_key, digest, sizeof(info.siphash_key)); -+ } -+ -+ return info; -+} -+ -+struct bch_str_hash_ctx { -+ union { -+ u32 crc32c; -+ u64 crc64; -+ SIPHASH_CTX siphash; -+ }; -+}; -+ -+static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx, -+ const struct bch_hash_info *info) -+{ -+ switch (info->type) { -+ case BCH_STR_HASH_CRC32C: -+ ctx->crc32c = crc32c(~0, &info->crc_key, sizeof(info->crc_key)); -+ break; -+ case BCH_STR_HASH_CRC64: -+ ctx->crc64 = crc64_be(~0, &info->crc_key, sizeof(info->crc_key)); -+ break; -+ case BCH_STR_HASH_SIPHASH_OLD: -+ case BCH_STR_HASH_SIPHASH: -+ SipHash24_Init(&ctx->siphash, &info->siphash_key); -+ break; -+ default: -+ BUG(); -+ } -+} -+ -+static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx, -+ const struct bch_hash_info *info, -+ const void *data, size_t len) -+{ -+ switch (info->type) { -+ case BCH_STR_HASH_CRC32C: -+ ctx->crc32c = crc32c(ctx->crc32c, data, len); -+ break; -+ case BCH_STR_HASH_CRC64: -+ ctx->crc64 = crc64_be(ctx->crc64, data, len); -+ break; -+ case BCH_STR_HASH_SIPHASH_OLD: -+ case BCH_STR_HASH_SIPHASH: -+ SipHash24_Update(&ctx->siphash, data, len); -+ break; -+ default: -+ BUG(); -+ } -+} -+ -+static inline u64 
bch2_str_hash_end(struct bch_str_hash_ctx *ctx, -+ const struct bch_hash_info *info) -+{ -+ switch (info->type) { -+ case BCH_STR_HASH_CRC32C: -+ return ctx->crc32c; -+ case BCH_STR_HASH_CRC64: -+ return ctx->crc64 >> 1; -+ case BCH_STR_HASH_SIPHASH_OLD: -+ case BCH_STR_HASH_SIPHASH: -+ return SipHash24_End(&ctx->siphash) >> 1; -+ default: -+ BUG(); -+ } -+} -+ -+struct bch_hash_desc { -+ enum btree_id btree_id; -+ u8 key_type; -+ -+ u64 (*hash_key)(const struct bch_hash_info *, const void *); -+ u64 (*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c); -+ bool (*cmp_key)(struct bkey_s_c, const void *); -+ bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c); -+}; -+ -+static __always_inline struct btree_iter * -+bch2_hash_lookup(struct btree_trans *trans, -+ const struct bch_hash_desc desc, -+ const struct bch_hash_info *info, -+ u64 inode, const void *key, -+ unsigned flags) -+{ -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ for_each_btree_key(trans, iter, desc.btree_id, -+ POS(inode, desc.hash_key(info, key)), -+ BTREE_ITER_SLOTS|flags, k, ret) { -+ if (iter->pos.inode != inode) -+ break; -+ -+ if (k.k->type == desc.key_type) { -+ if (!desc.cmp_key(k, key)) -+ return iter; -+ } else if (k.k->type == KEY_TYPE_whiteout) { -+ ; -+ } else { -+ /* hole, not found */ -+ break; -+ } -+ } -+ bch2_trans_iter_put(trans, iter); -+ -+ return ERR_PTR(ret ?: -ENOENT); -+} -+ -+static __always_inline struct btree_iter * -+bch2_hash_hole(struct btree_trans *trans, -+ const struct bch_hash_desc desc, -+ const struct bch_hash_info *info, -+ u64 inode, const void *key) -+{ -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ for_each_btree_key(trans, iter, desc.btree_id, -+ POS(inode, desc.hash_key(info, key)), -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { -+ if (iter->pos.inode != inode) -+ break; -+ -+ if (k.k->type != desc.key_type) -+ return iter; -+ } -+ -+ iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; -+ bch2_trans_iter_put(trans, 
iter); -+ -+ return ERR_PTR(ret ?: -ENOSPC); -+} -+ -+static __always_inline -+int bch2_hash_needs_whiteout(struct btree_trans *trans, -+ const struct bch_hash_desc desc, -+ const struct bch_hash_info *info, -+ struct btree_iter *start) -+{ -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ iter = bch2_trans_copy_iter(trans, start); -+ if (IS_ERR(iter)) -+ return PTR_ERR(iter); -+ -+ bch2_btree_iter_next_slot(iter); -+ -+ for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k, ret) { -+ if (k.k->type != desc.key_type && -+ k.k->type != KEY_TYPE_whiteout) -+ break; -+ -+ if (k.k->type == desc.key_type && -+ desc.hash_bkey(info, k) <= start->pos.offset) { -+ iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; -+ ret = 1; -+ break; -+ } -+ } -+ -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+static __always_inline -+int bch2_hash_set(struct btree_trans *trans, -+ const struct bch_hash_desc desc, -+ const struct bch_hash_info *info, -+ u64 inode, struct bkey_i *insert, int flags) -+{ -+ struct btree_iter *iter, *slot = NULL; -+ struct bkey_s_c k; -+ bool found = false; -+ int ret; -+ -+ for_each_btree_key(trans, iter, desc.btree_id, -+ POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))), -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { -+ if (iter->pos.inode != inode) -+ break; -+ -+ if (k.k->type == desc.key_type) { -+ if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert))) -+ goto found; -+ -+ /* hash collision: */ -+ continue; -+ } -+ -+ if (!slot && -+ !(flags & BCH_HASH_SET_MUST_REPLACE)) { -+ slot = bch2_trans_copy_iter(trans, iter); -+ if (IS_ERR(slot)) -+ return PTR_ERR(slot); -+ } -+ -+ if (k.k->type != KEY_TYPE_whiteout) -+ goto not_found; -+ } -+ -+ if (!ret) -+ ret = -ENOSPC; -+out: -+ bch2_trans_iter_put(trans, slot); -+ bch2_trans_iter_put(trans, iter); -+ -+ return ret; -+found: -+ found = true; -+not_found: -+ -+ if (!found && (flags & BCH_HASH_SET_MUST_REPLACE)) { -+ ret = -ENOENT; -+ } else if (found && (flags & 
BCH_HASH_SET_MUST_CREATE)) { -+ ret = -EEXIST; -+ } else { -+ if (!found && slot) -+ swap(iter, slot); -+ -+ insert->k.p = iter->pos; -+ bch2_trans_update(trans, iter, insert, 0); -+ } -+ -+ goto out; -+} -+ -+static __always_inline -+int bch2_hash_delete_at(struct btree_trans *trans, -+ const struct bch_hash_desc desc, -+ const struct bch_hash_info *info, -+ struct btree_iter *iter) -+{ -+ struct bkey_i *delete; -+ int ret; -+ -+ ret = bch2_hash_needs_whiteout(trans, desc, info, iter); -+ if (ret < 0) -+ return ret; -+ -+ delete = bch2_trans_kmalloc(trans, sizeof(*delete)); -+ if (IS_ERR(delete)) -+ return PTR_ERR(delete); -+ -+ bkey_init(&delete->k); -+ delete->k.p = iter->pos; -+ delete->k.type = ret ? KEY_TYPE_whiteout : KEY_TYPE_deleted; -+ -+ bch2_trans_update(trans, iter, delete, 0); -+ return 0; -+} -+ -+static __always_inline -+int bch2_hash_delete(struct btree_trans *trans, -+ const struct bch_hash_desc desc, -+ const struct bch_hash_info *info, -+ u64 inode, const void *key) -+{ -+ struct btree_iter *iter; -+ int ret; -+ -+ iter = bch2_hash_lookup(trans, desc, info, inode, key, -+ BTREE_ITER_INTENT); -+ if (IS_ERR(iter)) -+ return PTR_ERR(iter); -+ -+ ret = bch2_hash_delete_at(trans, desc, info, iter); -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+#endif /* _BCACHEFS_STR_HASH_H */ -diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c -new file mode 100644 -index 000000000000..cee6cc938734 ---- /dev/null -+++ b/fs/bcachefs/super-io.c -@@ -0,0 +1,1158 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "btree_update_interior.h" -+#include "buckets.h" -+#include "checksum.h" -+#include "disk_groups.h" -+#include "ec.h" -+#include "error.h" -+#include "io.h" -+#include "journal.h" -+#include "journal_seq_blacklist.h" -+#include "replicas.h" -+#include "quota.h" -+#include "super-io.h" -+#include "super.h" -+#include "vstructs.h" -+ -+#include -+#include -+ -+const char * const bch2_sb_fields[] = { -+#define 
x(name, nr) #name, -+ BCH_SB_FIELDS() -+#undef x -+ NULL -+}; -+ -+static const char *bch2_sb_field_validate(struct bch_sb *, -+ struct bch_sb_field *); -+ -+struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb, -+ enum bch_sb_field_type type) -+{ -+ struct bch_sb_field *f; -+ -+ /* XXX: need locking around superblock to access optional fields */ -+ -+ vstruct_for_each(sb, f) -+ if (le32_to_cpu(f->type) == type) -+ return f; -+ return NULL; -+} -+ -+static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb, -+ struct bch_sb_field *f, -+ unsigned u64s) -+{ -+ unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0; -+ unsigned sb_u64s = le32_to_cpu(sb->sb->u64s) + u64s - old_u64s; -+ -+ BUG_ON(get_order(__vstruct_bytes(struct bch_sb, sb_u64s)) > -+ sb->page_order); -+ -+ if (!f && !u64s) { -+ /* nothing to do: */ -+ } else if (!f) { -+ f = vstruct_last(sb->sb); -+ memset(f, 0, sizeof(u64) * u64s); -+ f->u64s = cpu_to_le32(u64s); -+ f->type = 0; -+ } else { -+ void *src, *dst; -+ -+ src = vstruct_end(f); -+ -+ if (u64s) { -+ f->u64s = cpu_to_le32(u64s); -+ dst = vstruct_end(f); -+ } else { -+ dst = f; -+ } -+ -+ memmove(dst, src, vstruct_end(sb->sb) - src); -+ -+ if (dst > src) -+ memset(src, 0, dst - src); -+ } -+ -+ sb->sb->u64s = cpu_to_le32(sb_u64s); -+ -+ return u64s ? 
f : NULL; -+} -+ -+void bch2_sb_field_delete(struct bch_sb_handle *sb, -+ enum bch_sb_field_type type) -+{ -+ struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type); -+ -+ if (f) -+ __bch2_sb_field_resize(sb, f, 0); -+} -+ -+/* Superblock realloc/free: */ -+ -+void bch2_free_super(struct bch_sb_handle *sb) -+{ -+ if (sb->bio) -+ bio_put(sb->bio); -+ if (!IS_ERR_OR_NULL(sb->bdev)) -+ blkdev_put(sb->bdev, sb->mode); -+ -+ free_pages((unsigned long) sb->sb, sb->page_order); -+ memset(sb, 0, sizeof(*sb)); -+} -+ -+int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) -+{ -+ size_t new_bytes = __vstruct_bytes(struct bch_sb, u64s); -+ unsigned order = get_order(new_bytes); -+ struct bch_sb *new_sb; -+ struct bio *bio; -+ -+ if (sb->sb && sb->page_order >= order) -+ return 0; -+ -+ if (sb->have_layout) { -+ u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits; -+ -+ if (new_bytes > max_bytes) { -+ char buf[BDEVNAME_SIZE]; -+ -+ pr_err("%s: superblock too big: want %zu but have %llu", -+ bdevname(sb->bdev, buf), new_bytes, max_bytes); -+ return -ENOSPC; -+ } -+ } -+ -+ if (sb->page_order >= order && sb->sb) -+ return 0; -+ -+ if (dynamic_fault("bcachefs:add:super_realloc")) -+ return -ENOMEM; -+ -+ if (sb->have_bio) { -+ bio = bio_kmalloc(GFP_KERNEL, 1 << order); -+ if (!bio) -+ return -ENOMEM; -+ -+ if (sb->bio) -+ bio_put(sb->bio); -+ sb->bio = bio; -+ } -+ -+ new_sb = (void *) __get_free_pages(GFP_NOFS|__GFP_ZERO, order); -+ if (!new_sb) -+ return -ENOMEM; -+ -+ if (sb->sb) -+ memcpy(new_sb, sb->sb, PAGE_SIZE << sb->page_order); -+ -+ free_pages((unsigned long) sb->sb, sb->page_order); -+ sb->sb = new_sb; -+ -+ sb->page_order = order; -+ -+ return 0; -+} -+ -+struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb, -+ enum bch_sb_field_type type, -+ unsigned u64s) -+{ -+ struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type); -+ ssize_t old_u64s = f ? 
le32_to_cpu(f->u64s) : 0; -+ ssize_t d = -old_u64s + u64s; -+ -+ if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) -+ return NULL; -+ -+ if (sb->fs_sb) { -+ struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb); -+ struct bch_dev *ca; -+ unsigned i; -+ -+ lockdep_assert_held(&c->sb_lock); -+ -+ /* XXX: we're not checking that offline device have enough space */ -+ -+ for_each_online_member(ca, c, i) { -+ struct bch_sb_handle *sb = &ca->disk_sb; -+ -+ if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) { -+ percpu_ref_put(&ca->ref); -+ return NULL; -+ } -+ } -+ } -+ -+ f = bch2_sb_field_get(sb->sb, type); -+ f = __bch2_sb_field_resize(sb, f, u64s); -+ if (f) -+ f->type = cpu_to_le32(type); -+ return f; -+} -+ -+/* Superblock validate: */ -+ -+static inline void __bch2_sb_layout_size_assert(void) -+{ -+ BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512); -+} -+ -+static const char *validate_sb_layout(struct bch_sb_layout *layout) -+{ -+ u64 offset, prev_offset, max_sectors; -+ unsigned i; -+ -+ if (uuid_le_cmp(layout->magic, BCACHE_MAGIC)) -+ return "Not a bcachefs superblock layout"; -+ -+ if (layout->layout_type != 0) -+ return "Invalid superblock layout type"; -+ -+ if (!layout->nr_superblocks) -+ return "Invalid superblock layout: no superblocks"; -+ -+ if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) -+ return "Invalid superblock layout: too many superblocks"; -+ -+ max_sectors = 1 << layout->sb_max_size_bits; -+ -+ prev_offset = le64_to_cpu(layout->sb_offset[0]); -+ -+ for (i = 1; i < layout->nr_superblocks; i++) { -+ offset = le64_to_cpu(layout->sb_offset[i]); -+ -+ if (offset < prev_offset + max_sectors) -+ return "Invalid superblock layout: superblocks overlap"; -+ prev_offset = offset; -+ } -+ -+ return NULL; -+} -+ -+const char *bch2_sb_validate(struct bch_sb_handle *disk_sb) -+{ -+ struct bch_sb *sb = disk_sb->sb; -+ struct bch_sb_field *f; -+ struct bch_sb_field_members *mi; -+ const char *err; -+ u32 version, version_min; -+ 
u16 block_size; -+ -+ version = le16_to_cpu(sb->version); -+ version_min = version >= bcachefs_metadata_version_new_versioning -+ ? le16_to_cpu(sb->version_min) -+ : version; -+ -+ if (version >= bcachefs_metadata_version_max || -+ version_min < bcachefs_metadata_version_min) -+ return "Unsupported superblock version"; -+ -+ if (version_min > version) -+ return "Bad minimum version"; -+ -+ if (sb->features[1] || -+ (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) -+ return "Filesystem has incompatible features"; -+ -+ block_size = le16_to_cpu(sb->block_size); -+ -+ if (!is_power_of_2(block_size) || -+ block_size > PAGE_SECTORS) -+ return "Bad block size"; -+ -+ if (bch2_is_zero(sb->user_uuid.b, sizeof(uuid_le))) -+ return "Bad user UUID"; -+ -+ if (bch2_is_zero(sb->uuid.b, sizeof(uuid_le))) -+ return "Bad internal UUID"; -+ -+ if (!sb->nr_devices || -+ sb->nr_devices <= sb->dev_idx || -+ sb->nr_devices > BCH_SB_MEMBERS_MAX) -+ return "Bad number of member devices"; -+ -+ if (!BCH_SB_META_REPLICAS_WANT(sb) || -+ BCH_SB_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX) -+ return "Invalid number of metadata replicas"; -+ -+ if (!BCH_SB_META_REPLICAS_REQ(sb) || -+ BCH_SB_META_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX) -+ return "Invalid number of metadata replicas"; -+ -+ if (!BCH_SB_DATA_REPLICAS_WANT(sb) || -+ BCH_SB_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX) -+ return "Invalid number of data replicas"; -+ -+ if (!BCH_SB_DATA_REPLICAS_REQ(sb) || -+ BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX) -+ return "Invalid number of data replicas"; -+ -+ if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR) -+ return "Invalid metadata checksum type"; -+ -+ if (BCH_SB_DATA_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR) -+ return "Invalid metadata checksum type"; -+ -+ if (BCH_SB_COMPRESSION_TYPE(sb) >= BCH_COMPRESSION_OPT_NR) -+ return "Invalid compression type"; -+ -+ if (!BCH_SB_BTREE_NODE_SIZE(sb)) -+ return "Btree node size not set"; -+ -+ if (!is_power_of_2(BCH_SB_BTREE_NODE_SIZE(sb))) 
-+ return "Btree node size not a power of two"; -+ -+ if (BCH_SB_GC_RESERVE(sb) < 5) -+ return "gc reserve percentage too small"; -+ -+ if (!sb->time_precision || -+ le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) -+ return "invalid time precision"; -+ -+ /* validate layout */ -+ err = validate_sb_layout(&sb->layout); -+ if (err) -+ return err; -+ -+ vstruct_for_each(sb, f) { -+ if (!f->u64s) -+ return "Invalid superblock: invalid optional field"; -+ -+ if (vstruct_next(f) > vstruct_last(sb)) -+ return "Invalid superblock: invalid optional field"; -+ } -+ -+ /* members must be validated first: */ -+ mi = bch2_sb_get_members(sb); -+ if (!mi) -+ return "Invalid superblock: member info area missing"; -+ -+ err = bch2_sb_field_validate(sb, &mi->field); -+ if (err) -+ return err; -+ -+ vstruct_for_each(sb, f) { -+ if (le32_to_cpu(f->type) == BCH_SB_FIELD_members) -+ continue; -+ -+ err = bch2_sb_field_validate(sb, f); -+ if (err) -+ return err; -+ } -+ -+ return NULL; -+} -+ -+/* device open: */ -+ -+static void bch2_sb_update(struct bch_fs *c) -+{ -+ struct bch_sb *src = c->disk_sb.sb; -+ struct bch_sb_field_members *mi = bch2_sb_get_members(src); -+ struct bch_dev *ca; -+ unsigned i; -+ -+ lockdep_assert_held(&c->sb_lock); -+ -+ c->sb.uuid = src->uuid; -+ c->sb.user_uuid = src->user_uuid; -+ c->sb.version = le16_to_cpu(src->version); -+ c->sb.nr_devices = src->nr_devices; -+ c->sb.clean = BCH_SB_CLEAN(src); -+ c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); -+ c->sb.encoded_extent_max= 1 << BCH_SB_ENCODED_EXTENT_MAX_BITS(src); -+ c->sb.time_base_lo = le64_to_cpu(src->time_base_lo); -+ c->sb.time_base_hi = le32_to_cpu(src->time_base_hi); -+ c->sb.time_precision = le32_to_cpu(src->time_precision); -+ c->sb.features = le64_to_cpu(src->features[0]); -+ c->sb.compat = le64_to_cpu(src->compat[0]); -+ -+ for_each_member_device(ca, c, i) -+ ca->mi = bch2_mi_to_cpu(mi->members + i); -+} -+ -+/* doesn't copy member info */ -+static void __copy_super(struct bch_sb_handle 
*dst_handle, struct bch_sb *src) -+{ -+ struct bch_sb_field *src_f, *dst_f; -+ struct bch_sb *dst = dst_handle->sb; -+ unsigned i; -+ -+ dst->version = src->version; -+ dst->version_min = src->version_min; -+ dst->seq = src->seq; -+ dst->uuid = src->uuid; -+ dst->user_uuid = src->user_uuid; -+ memcpy(dst->label, src->label, sizeof(dst->label)); -+ -+ dst->block_size = src->block_size; -+ dst->nr_devices = src->nr_devices; -+ -+ dst->time_base_lo = src->time_base_lo; -+ dst->time_base_hi = src->time_base_hi; -+ dst->time_precision = src->time_precision; -+ -+ memcpy(dst->flags, src->flags, sizeof(dst->flags)); -+ memcpy(dst->features, src->features, sizeof(dst->features)); -+ memcpy(dst->compat, src->compat, sizeof(dst->compat)); -+ -+ for (i = 0; i < BCH_SB_FIELD_NR; i++) { -+ if (i == BCH_SB_FIELD_journal) -+ continue; -+ -+ src_f = bch2_sb_field_get(src, i); -+ dst_f = bch2_sb_field_get(dst, i); -+ dst_f = __bch2_sb_field_resize(dst_handle, dst_f, -+ src_f ? le32_to_cpu(src_f->u64s) : 0); -+ -+ if (src_f) -+ memcpy(dst_f, src_f, vstruct_bytes(src_f)); -+ } -+} -+ -+int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) -+{ -+ struct bch_sb_field_journal *journal_buckets = -+ bch2_sb_get_journal(src); -+ unsigned journal_u64s = journal_buckets -+ ? le32_to_cpu(journal_buckets->field.u64s) -+ : 0; -+ int ret; -+ -+ lockdep_assert_held(&c->sb_lock); -+ -+ ret = bch2_sb_realloc(&c->disk_sb, -+ le32_to_cpu(src->u64s) - journal_u64s); -+ if (ret) -+ return ret; -+ -+ __copy_super(&c->disk_sb, src); -+ -+ ret = bch2_sb_replicas_to_cpu_replicas(c); -+ if (ret) -+ return ret; -+ -+ ret = bch2_sb_disk_groups_to_cpu(c); -+ if (ret) -+ return ret; -+ -+ bch2_sb_update(c); -+ return 0; -+} -+ -+int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct bch_sb *src = c->disk_sb.sb, *dst = ca->disk_sb.sb; -+ struct bch_sb_field_journal *journal_buckets = -+ bch2_sb_get_journal(dst); -+ unsigned journal_u64s = journal_buckets -+ ? 
le32_to_cpu(journal_buckets->field.u64s) -+ : 0; -+ unsigned u64s = le32_to_cpu(src->u64s) + journal_u64s; -+ int ret; -+ -+ ret = bch2_sb_realloc(&ca->disk_sb, u64s); -+ if (ret) -+ return ret; -+ -+ __copy_super(&ca->disk_sb, src); -+ return 0; -+} -+ -+/* read superblock: */ -+ -+static const char *read_one_super(struct bch_sb_handle *sb, u64 offset) -+{ -+ struct bch_csum csum; -+ size_t bytes; -+reread: -+ bio_reset(sb->bio); -+ bio_set_dev(sb->bio, sb->bdev); -+ sb->bio->bi_iter.bi_sector = offset; -+ bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META); -+ bch2_bio_map(sb->bio, sb->sb, PAGE_SIZE << sb->page_order); -+ -+ if (submit_bio_wait(sb->bio)) -+ return "IO error"; -+ -+ if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC)) -+ return "Not a bcachefs superblock"; -+ -+ if (le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_min || -+ le16_to_cpu(sb->sb->version) >= bcachefs_metadata_version_max) -+ return "Unsupported superblock version"; -+ -+ bytes = vstruct_bytes(sb->sb); -+ -+ if (bytes > 512 << sb->sb->layout.sb_max_size_bits) -+ return "Bad superblock: too big"; -+ -+ if (get_order(bytes) > sb->page_order) { -+ if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s))) -+ return "cannot allocate memory"; -+ goto reread; -+ } -+ -+ if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) -+ return "unknown csum type"; -+ -+ /* XXX: verify MACs */ -+ csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb), -+ null_nonce(), sb->sb); -+ -+ if (bch2_crc_cmp(csum, sb->sb->csum)) -+ return "bad checksum reading superblock"; -+ -+ sb->seq = le64_to_cpu(sb->sb->seq); -+ -+ return NULL; -+} -+ -+int bch2_read_super(const char *path, struct bch_opts *opts, -+ struct bch_sb_handle *sb) -+{ -+ u64 offset = opt_get(*opts, sb); -+ struct bch_sb_layout layout; -+ const char *err; -+ __le64 *i; -+ int ret; -+ -+ pr_verbose_init(*opts, ""); -+ -+ memset(sb, 0, sizeof(*sb)); -+ sb->mode = FMODE_READ; -+ sb->have_bio = true; -+ -+ if (!opt_get(*opts, noexcl)) -+ sb->mode |= FMODE_EXCL; 
-+ -+ if (!opt_get(*opts, nochanges)) -+ sb->mode |= FMODE_WRITE; -+ -+ sb->bdev = blkdev_get_by_path(path, sb->mode, sb); -+ if (IS_ERR(sb->bdev) && -+ PTR_ERR(sb->bdev) == -EACCES && -+ opt_get(*opts, read_only)) { -+ sb->mode &= ~FMODE_WRITE; -+ -+ sb->bdev = blkdev_get_by_path(path, sb->mode, sb); -+ if (!IS_ERR(sb->bdev)) -+ opt_set(*opts, nochanges, true); -+ } -+ -+ if (IS_ERR(sb->bdev)) { -+ ret = PTR_ERR(sb->bdev); -+ goto out; -+ } -+ -+ err = "cannot allocate memory"; -+ ret = bch2_sb_realloc(sb, 0); -+ if (ret) -+ goto err; -+ -+ ret = -EFAULT; -+ err = "dynamic fault"; -+ if (bch2_fs_init_fault("read_super")) -+ goto err; -+ -+ ret = -EINVAL; -+ err = read_one_super(sb, offset); -+ if (!err) -+ goto got_super; -+ -+ if (opt_defined(*opts, sb)) -+ goto err; -+ -+ pr_err("error reading default superblock: %s", err); -+ -+ /* -+ * Error reading primary superblock - read location of backup -+ * superblocks: -+ */ -+ bio_reset(sb->bio); -+ bio_set_dev(sb->bio, sb->bdev); -+ sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR; -+ bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META); -+ /* -+ * use sb buffer to read layout, since sb buffer is page aligned but -+ * layout won't be: -+ */ -+ bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout)); -+ -+ err = "IO error"; -+ if (submit_bio_wait(sb->bio)) -+ goto err; -+ -+ memcpy(&layout, sb->sb, sizeof(layout)); -+ err = validate_sb_layout(&layout); -+ if (err) -+ goto err; -+ -+ for (i = layout.sb_offset; -+ i < layout.sb_offset + layout.nr_superblocks; i++) { -+ offset = le64_to_cpu(*i); -+ -+ if (offset == opt_get(*opts, sb)) -+ continue; -+ -+ err = read_one_super(sb, offset); -+ if (!err) -+ goto got_super; -+ } -+ -+ ret = -EINVAL; -+ goto err; -+ -+got_super: -+ err = "Superblock block size smaller than device block size"; -+ ret = -EINVAL; -+ if (le16_to_cpu(sb->sb->block_size) << 9 < -+ bdev_logical_block_size(sb->bdev)) -+ goto err; -+ -+ if (sb->mode & FMODE_WRITE) -+ 
bdev_get_queue(sb->bdev)->backing_dev_info->capabilities -+ |= BDI_CAP_STABLE_WRITES; -+ ret = 0; -+ sb->have_layout = true; -+out: -+ pr_verbose_init(*opts, "ret %i", ret); -+ return ret; -+err: -+ bch2_free_super(sb); -+ pr_err("error reading superblock: %s", err); -+ goto out; -+} -+ -+/* write superblock: */ -+ -+static void write_super_endio(struct bio *bio) -+{ -+ struct bch_dev *ca = bio->bi_private; -+ -+ /* XXX: return errors directly */ -+ -+ if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write: %s", -+ bch2_blk_status_to_str(bio->bi_status))) -+ ca->sb_write_error = 1; -+ -+ closure_put(&ca->fs->sb_write); -+ percpu_ref_put(&ca->io_ref); -+} -+ -+static void read_back_super(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct bch_sb *sb = ca->disk_sb.sb; -+ struct bio *bio = ca->disk_sb.bio; -+ -+ bio_reset(bio); -+ bio_set_dev(bio, ca->disk_sb.bdev); -+ bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]); -+ bio->bi_end_io = write_super_endio; -+ bio->bi_private = ca; -+ bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC|REQ_META); -+ bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE); -+ -+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], -+ bio_sectors(bio)); -+ -+ percpu_ref_get(&ca->io_ref); -+ closure_bio_submit(bio, &c->sb_write); -+} -+ -+static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) -+{ -+ struct bch_sb *sb = ca->disk_sb.sb; -+ struct bio *bio = ca->disk_sb.bio; -+ -+ sb->offset = sb->layout.sb_offset[idx]; -+ -+ SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum); -+ sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb), -+ null_nonce(), sb); -+ -+ bio_reset(bio); -+ bio_set_dev(bio, ca->disk_sb.bdev); -+ bio->bi_iter.bi_sector = le64_to_cpu(sb->offset); -+ bio->bi_end_io = write_super_endio; -+ bio->bi_private = ca; -+ bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META); -+ bch2_bio_map(bio, sb, -+ roundup((size_t) vstruct_bytes(sb), -+ bdev_logical_block_size(ca->disk_sb.bdev))); -+ -+ 
this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb], -+ bio_sectors(bio)); -+ -+ percpu_ref_get(&ca->io_ref); -+ closure_bio_submit(bio, &c->sb_write); -+} -+ -+int bch2_write_super(struct bch_fs *c) -+{ -+ struct closure *cl = &c->sb_write; -+ struct bch_dev *ca; -+ unsigned i, sb = 0, nr_wrote; -+ const char *err; -+ struct bch_devs_mask sb_written; -+ bool wrote, can_mount_without_written, can_mount_with_written; -+ int ret = 0; -+ -+ lockdep_assert_held(&c->sb_lock); -+ -+ closure_init_stack(cl); -+ memset(&sb_written, 0, sizeof(sb_written)); -+ -+ le64_add_cpu(&c->disk_sb.sb->seq, 1); -+ -+ if (test_bit(BCH_FS_ERROR, &c->flags)) -+ SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1); -+ -+ for_each_online_member(ca, c, i) -+ bch2_sb_from_fs(c, ca); -+ -+ for_each_online_member(ca, c, i) { -+ err = bch2_sb_validate(&ca->disk_sb); -+ if (err) { -+ bch2_fs_inconsistent(c, "sb invalid before write: %s", err); -+ ret = -1; -+ goto out; -+ } -+ } -+ -+ if (c->opts.nochanges) -+ goto out; -+ -+ for_each_online_member(ca, c, i) { -+ __set_bit(ca->dev_idx, sb_written.d); -+ ca->sb_write_error = 0; -+ } -+ -+ for_each_online_member(ca, c, i) -+ read_back_super(c, ca); -+ closure_sync(cl); -+ -+ for_each_online_member(ca, c, i) { -+ if (!ca->sb_write_error && -+ ca->disk_sb.seq != -+ le64_to_cpu(ca->sb_read_scratch->seq)) { -+ bch2_fs_fatal_error(c, -+ "Superblock modified by another process"); -+ percpu_ref_put(&ca->io_ref); -+ ret = -EROFS; -+ goto out; -+ } -+ } -+ -+ do { -+ wrote = false; -+ for_each_online_member(ca, c, i) -+ if (!ca->sb_write_error && -+ sb < ca->disk_sb.sb->layout.nr_superblocks) { -+ write_one_super(c, ca, sb); -+ wrote = true; -+ } -+ closure_sync(cl); -+ sb++; -+ } while (wrote); -+ -+ for_each_online_member(ca, c, i) { -+ if (ca->sb_write_error) -+ __clear_bit(ca->dev_idx, sb_written.d); -+ else -+ ca->disk_sb.seq = le64_to_cpu(ca->disk_sb.sb->seq); -+ } -+ -+ nr_wrote = dev_mask_nr(&sb_written); -+ -+ can_mount_with_written = -+ 
bch2_have_enough_devs(__bch2_replicas_status(c, sb_written), -+ BCH_FORCE_IF_DEGRADED); -+ -+ for (i = 0; i < ARRAY_SIZE(sb_written.d); i++) -+ sb_written.d[i] = ~sb_written.d[i]; -+ -+ can_mount_without_written = -+ bch2_have_enough_devs(__bch2_replicas_status(c, sb_written), -+ BCH_FORCE_IF_DEGRADED); -+ -+ /* -+ * If we would be able to mount _without_ the devices we successfully -+ * wrote superblocks to, we weren't able to write to enough devices: -+ * -+ * Exception: if we can mount without the successes because we haven't -+ * written anything (new filesystem), we continue if we'd be able to -+ * mount with the devices we did successfully write to: -+ */ -+ if (bch2_fs_fatal_err_on(!nr_wrote || -+ (can_mount_without_written && -+ !can_mount_with_written), c, -+ "Unable to write superblock to sufficient devices")) -+ ret = -1; -+out: -+ /* Make new options visible after they're persistent: */ -+ bch2_sb_update(c); -+ return ret; -+} -+ -+void __bch2_check_set_feature(struct bch_fs *c, unsigned feat) -+{ -+ mutex_lock(&c->sb_lock); -+ if (!(c->sb.features & (1ULL << feat))) { -+ c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << feat); -+ -+ bch2_write_super(c); -+ } -+ mutex_unlock(&c->sb_lock); -+} -+ -+/* BCH_SB_FIELD_journal: */ -+ -+static int u64_cmp(const void *_l, const void *_r) -+{ -+ u64 l = *((const u64 *) _l), r = *((const u64 *) _r); -+ -+ return l < r ? -1 : l > r ? 
1 : 0; -+} -+ -+static const char *bch2_sb_validate_journal(struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_journal *journal = field_to_type(f, journal); -+ struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; -+ const char *err; -+ unsigned nr; -+ unsigned i; -+ u64 *b; -+ -+ journal = bch2_sb_get_journal(sb); -+ if (!journal) -+ return NULL; -+ -+ nr = bch2_nr_journal_buckets(journal); -+ if (!nr) -+ return NULL; -+ -+ b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL); -+ if (!b) -+ return "cannot allocate memory"; -+ -+ for (i = 0; i < nr; i++) -+ b[i] = le64_to_cpu(journal->buckets[i]); -+ -+ sort(b, nr, sizeof(u64), u64_cmp, NULL); -+ -+ err = "journal bucket at sector 0"; -+ if (!b[0]) -+ goto err; -+ -+ err = "journal bucket before first bucket"; -+ if (m && b[0] < le16_to_cpu(m->first_bucket)) -+ goto err; -+ -+ err = "journal bucket past end of device"; -+ if (m && b[nr - 1] >= le64_to_cpu(m->nbuckets)) -+ goto err; -+ -+ err = "duplicate journal buckets"; -+ for (i = 0; i + 1 < nr; i++) -+ if (b[i] == b[i + 1]) -+ goto err; -+ -+ err = NULL; -+err: -+ kfree(b); -+ return err; -+} -+ -+static const struct bch_sb_field_ops bch_sb_field_ops_journal = { -+ .validate = bch2_sb_validate_journal, -+}; -+ -+/* BCH_SB_FIELD_members: */ -+ -+static const char *bch2_sb_validate_members(struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_members *mi = field_to_type(f, members); -+ struct bch_member *m; -+ -+ if ((void *) (mi->members + sb->nr_devices) > -+ vstruct_end(&mi->field)) -+ return "Invalid superblock: bad member info"; -+ -+ for (m = mi->members; -+ m < mi->members + sb->nr_devices; -+ m++) { -+ if (!bch2_member_exists(m)) -+ continue; -+ -+ if (le64_to_cpu(m->nbuckets) > LONG_MAX) -+ return "Too many buckets"; -+ -+ if (le64_to_cpu(m->nbuckets) - -+ le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) -+ return "Not enough buckets"; -+ -+ if (le16_to_cpu(m->bucket_size) < -+ 
le16_to_cpu(sb->block_size)) -+ return "bucket size smaller than block size"; -+ -+ if (le16_to_cpu(m->bucket_size) < -+ BCH_SB_BTREE_NODE_SIZE(sb)) -+ return "bucket size smaller than btree node size"; -+ } -+ -+ return NULL; -+} -+ -+static const struct bch_sb_field_ops bch_sb_field_ops_members = { -+ .validate = bch2_sb_validate_members, -+}; -+ -+/* BCH_SB_FIELD_crypt: */ -+ -+static const char *bch2_sb_validate_crypt(struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); -+ -+ if (vstruct_bytes(&crypt->field) != sizeof(*crypt)) -+ return "invalid field crypt: wrong size"; -+ -+ if (BCH_CRYPT_KDF_TYPE(crypt)) -+ return "invalid field crypt: bad kdf type"; -+ -+ return NULL; -+} -+ -+static const struct bch_sb_field_ops bch_sb_field_ops_crypt = { -+ .validate = bch2_sb_validate_crypt, -+}; -+ -+/* BCH_SB_FIELD_clean: */ -+ -+void bch2_sb_clean_renumber(struct bch_sb_field_clean *clean, int write) -+{ -+ struct jset_entry *entry; -+ -+ for (entry = clean->start; -+ entry < (struct jset_entry *) vstruct_end(&clean->field); -+ entry = vstruct_next(entry)) -+ bch2_bkey_renumber(BKEY_TYPE_BTREE, bkey_to_packed(entry->start), write); -+} -+ -+int bch2_fs_mark_dirty(struct bch_fs *c) -+{ -+ int ret; -+ -+ /* -+ * Unconditionally write superblock, to verify it hasn't changed before -+ * we go rw: -+ */ -+ -+ mutex_lock(&c->sb_lock); -+ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); -+ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite; -+ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates; -+ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_btree_updates_journalled; -+ ret = bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ return ret; -+} -+ -+static void -+entry_init_u64s(struct jset_entry *entry, unsigned u64s) -+{ -+ memset(entry, 0, u64s * sizeof(u64)); -+ -+ /* -+ * The u64s field counts from the start of data, ignoring the shared -+ * fields. 
-+ */ -+ entry->u64s = u64s - 1; -+} -+ -+static void -+entry_init_size(struct jset_entry *entry, size_t size) -+{ -+ unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); -+ entry_init_u64s(entry, u64s); -+} -+ -+struct jset_entry * -+bch2_journal_super_entries_add_common(struct bch_fs *c, -+ struct jset_entry *entry, -+ u64 journal_seq) -+{ -+ unsigned i; -+ -+ percpu_down_write(&c->mark_lock); -+ -+ if (!journal_seq) { -+ bch2_fs_usage_acc_to_base(c, 0); -+ bch2_fs_usage_acc_to_base(c, 1); -+ } else { -+ bch2_fs_usage_acc_to_base(c, journal_seq & 1); -+ } -+ -+ { -+ struct jset_entry_usage *u = -+ container_of(entry, struct jset_entry_usage, entry); -+ -+ entry_init_size(entry, sizeof(*u)); -+ u->entry.type = BCH_JSET_ENTRY_usage; -+ u->entry.btree_id = FS_USAGE_INODES; -+ u->v = cpu_to_le64(c->usage_base->nr_inodes); -+ -+ entry = vstruct_next(entry); -+ } -+ -+ { -+ struct jset_entry_usage *u = -+ container_of(entry, struct jset_entry_usage, entry); -+ -+ entry_init_size(entry, sizeof(*u)); -+ u->entry.type = BCH_JSET_ENTRY_usage; -+ u->entry.btree_id = FS_USAGE_KEY_VERSION; -+ u->v = cpu_to_le64(atomic64_read(&c->key_version)); -+ -+ entry = vstruct_next(entry); -+ } -+ -+ for (i = 0; i < BCH_REPLICAS_MAX; i++) { -+ struct jset_entry_usage *u = -+ container_of(entry, struct jset_entry_usage, entry); -+ -+ entry_init_size(entry, sizeof(*u)); -+ u->entry.type = BCH_JSET_ENTRY_usage; -+ u->entry.btree_id = FS_USAGE_RESERVED; -+ u->entry.level = i; -+ u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); -+ -+ entry = vstruct_next(entry); -+ } -+ -+ for (i = 0; i < c->replicas.nr; i++) { -+ struct bch_replicas_entry *e = -+ cpu_replicas_entry(&c->replicas, i); -+ struct jset_entry_data_usage *u = -+ container_of(entry, struct jset_entry_data_usage, entry); -+ -+ entry_init_size(entry, sizeof(*u) + e->nr_devs); -+ u->entry.type = BCH_JSET_ENTRY_data_usage; -+ u->v = cpu_to_le64(c->usage_base->replicas[i]); -+ memcpy(&u->r, e, replicas_entry_bytes(e)); -+ -+ entry 
= vstruct_next(entry); -+ } -+ -+ percpu_up_write(&c->mark_lock); -+ -+ return entry; -+} -+ -+void bch2_fs_mark_clean(struct bch_fs *c) -+{ -+ struct bch_sb_field_clean *sb_clean; -+ struct jset_entry *entry; -+ unsigned u64s; -+ -+ mutex_lock(&c->sb_lock); -+ if (BCH_SB_CLEAN(c->disk_sb.sb)) -+ goto out; -+ -+ SET_BCH_SB_CLEAN(c->disk_sb.sb, true); -+ -+ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; -+ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA; -+ c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_extents_above_btree_updates); -+ c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_btree_updates_journalled); -+ -+ u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved; -+ -+ sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s); -+ if (!sb_clean) { -+ bch_err(c, "error resizing superblock while setting filesystem clean"); -+ goto out; -+ } -+ -+ sb_clean->flags = 0; -+ sb_clean->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); -+ sb_clean->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); -+ sb_clean->journal_seq = cpu_to_le64(journal_cur_seq(&c->journal) - 1); -+ -+ /* Trying to catch outstanding bug: */ -+ BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); -+ -+ entry = sb_clean->start; -+ entry = bch2_journal_super_entries_add_common(c, entry, 0); -+ entry = bch2_btree_roots_to_journal_entries(c, entry, entry); -+ BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); -+ -+ memset(entry, 0, -+ vstruct_end(&sb_clean->field) - (void *) entry); -+ -+ if (le16_to_cpu(c->disk_sb.sb->version) < -+ bcachefs_metadata_version_bkey_renumber) -+ bch2_sb_clean_renumber(sb_clean, WRITE); -+ -+ bch2_write_super(c); -+out: -+ mutex_unlock(&c->sb_lock); -+} -+ -+static const char *bch2_sb_validate_clean(struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_clean *clean = field_to_type(f, clean); -+ -+ if (vstruct_bytes(&clean->field) < sizeof(*clean)) -+ return "invalid field crypt: 
wrong size"; -+ -+ return NULL; -+} -+ -+static const struct bch_sb_field_ops bch_sb_field_ops_clean = { -+ .validate = bch2_sb_validate_clean, -+}; -+ -+static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { -+#define x(f, nr) \ -+ [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f, -+ BCH_SB_FIELDS() -+#undef x -+}; -+ -+static const char *bch2_sb_field_validate(struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ unsigned type = le32_to_cpu(f->type); -+ -+ return type < BCH_SB_FIELD_NR -+ ? bch2_sb_field_ops[type]->validate(sb, f) -+ : NULL; -+} -+ -+void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ unsigned type = le32_to_cpu(f->type); -+ const struct bch_sb_field_ops *ops = type < BCH_SB_FIELD_NR -+ ? bch2_sb_field_ops[type] : NULL; -+ -+ if (ops) -+ pr_buf(out, "%s", bch2_sb_fields[type]); -+ else -+ pr_buf(out, "(unknown field %u)", type); -+ -+ pr_buf(out, " (size %llu):", vstruct_bytes(f)); -+ -+ if (ops && ops->to_text) -+ bch2_sb_field_ops[type]->to_text(out, sb, f); -+} -diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h -new file mode 100644 -index 000000000000..7a068158efca ---- /dev/null -+++ b/fs/bcachefs/super-io.h -@@ -0,0 +1,137 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_SUPER_IO_H -+#define _BCACHEFS_SUPER_IO_H -+ -+#include "extents.h" -+#include "eytzinger.h" -+#include "super_types.h" -+#include "super.h" -+ -+#include -+ -+struct bch_sb_field *bch2_sb_field_get(struct bch_sb *, enum bch_sb_field_type); -+struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *, -+ enum bch_sb_field_type, unsigned); -+void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type); -+ -+#define field_to_type(_f, _name) \ -+ container_of_or_null(_f, struct bch_sb_field_##_name, field) -+ -+#define x(_name, _nr) \ -+static inline struct bch_sb_field_##_name * \ -+bch2_sb_get_##_name(struct bch_sb *sb) \ -+{ \ -+ return field_to_type(bch2_sb_field_get(sb, \ -+ 
BCH_SB_FIELD_##_name), _name); \ -+} \ -+ \ -+static inline struct bch_sb_field_##_name * \ -+bch2_sb_resize_##_name(struct bch_sb_handle *sb, unsigned u64s) \ -+{ \ -+ return field_to_type(bch2_sb_field_resize(sb, \ -+ BCH_SB_FIELD_##_name, u64s), _name); \ -+} -+ -+BCH_SB_FIELDS() -+#undef x -+ -+extern const char * const bch2_sb_fields[]; -+ -+struct bch_sb_field_ops { -+ const char * (*validate)(struct bch_sb *, struct bch_sb_field *); -+ void (*to_text)(struct printbuf *, struct bch_sb *, -+ struct bch_sb_field *); -+}; -+ -+static inline __le64 bch2_sb_magic(struct bch_fs *c) -+{ -+ __le64 ret; -+ memcpy(&ret, &c->sb.uuid, sizeof(ret)); -+ return ret; -+} -+ -+static inline __u64 jset_magic(struct bch_fs *c) -+{ -+ return __le64_to_cpu(bch2_sb_magic(c) ^ JSET_MAGIC); -+} -+ -+static inline __u64 bset_magic(struct bch_fs *c) -+{ -+ return __le64_to_cpu(bch2_sb_magic(c) ^ BSET_MAGIC); -+} -+ -+int bch2_sb_to_fs(struct bch_fs *, struct bch_sb *); -+int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *); -+ -+void bch2_free_super(struct bch_sb_handle *); -+int bch2_sb_realloc(struct bch_sb_handle *, unsigned); -+ -+const char *bch2_sb_validate(struct bch_sb_handle *); -+ -+int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); -+int bch2_write_super(struct bch_fs *); -+void __bch2_check_set_feature(struct bch_fs *, unsigned); -+ -+static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat) -+{ -+ if (!(c->sb.features & (1ULL << feat))) -+ __bch2_check_set_feature(c, feat); -+} -+ -+/* BCH_SB_FIELD_journal: */ -+ -+static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j) -+{ -+ return j -+ ? 
(__le64 *) vstruct_end(&j->field) - j->buckets -+ : 0; -+} -+ -+/* BCH_SB_FIELD_members: */ -+ -+static inline bool bch2_member_exists(struct bch_member *m) -+{ -+ return !bch2_is_zero(m->uuid.b, sizeof(uuid_le)); -+} -+ -+static inline bool bch2_dev_exists(struct bch_sb *sb, -+ struct bch_sb_field_members *mi, -+ unsigned dev) -+{ -+ return dev < sb->nr_devices && -+ bch2_member_exists(&mi->members[dev]); -+} -+ -+static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) -+{ -+ return (struct bch_member_cpu) { -+ .nbuckets = le64_to_cpu(mi->nbuckets), -+ .first_bucket = le16_to_cpu(mi->first_bucket), -+ .bucket_size = le16_to_cpu(mi->bucket_size), -+ .group = BCH_MEMBER_GROUP(mi), -+ .state = BCH_MEMBER_STATE(mi), -+ .replacement = BCH_MEMBER_REPLACEMENT(mi), -+ .discard = BCH_MEMBER_DISCARD(mi), -+ .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi), -+ .durability = BCH_MEMBER_DURABILITY(mi) -+ ? BCH_MEMBER_DURABILITY(mi) - 1 -+ : 1, -+ .valid = !bch2_is_zero(mi->uuid.b, sizeof(uuid_le)), -+ }; -+} -+ -+/* BCH_SB_FIELD_clean: */ -+ -+struct jset_entry * -+bch2_journal_super_entries_add_common(struct bch_fs *, -+ struct jset_entry *, u64); -+ -+void bch2_sb_clean_renumber(struct bch_sb_field_clean *, int); -+ -+int bch2_fs_mark_dirty(struct bch_fs *); -+void bch2_fs_mark_clean(struct bch_fs *); -+ -+void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, -+ struct bch_sb_field *); -+ -+#endif /* _BCACHEFS_SUPER_IO_H */ -diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c -new file mode 100644 -index 000000000000..30be083b09bf ---- /dev/null -+++ b/fs/bcachefs/super.c -@@ -0,0 +1,2062 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * bcachefs setup/teardown code, and some metadata io - read a superblock and -+ * figure out what to do with it. -+ * -+ * Copyright 2010, 2011 Kent Overstreet -+ * Copyright 2012 Google, Inc. 
-+ */ -+ -+#include "bcachefs.h" -+#include "alloc_background.h" -+#include "alloc_foreground.h" -+#include "bkey_sort.h" -+#include "btree_cache.h" -+#include "btree_gc.h" -+#include "btree_key_cache.h" -+#include "btree_update_interior.h" -+#include "btree_io.h" -+#include "chardev.h" -+#include "checksum.h" -+#include "clock.h" -+#include "compress.h" -+#include "debug.h" -+#include "disk_groups.h" -+#include "ec.h" -+#include "error.h" -+#include "fs.h" -+#include "fs-io.h" -+#include "fsck.h" -+#include "inode.h" -+#include "io.h" -+#include "journal.h" -+#include "journal_reclaim.h" -+#include "journal_seq_blacklist.h" -+#include "move.h" -+#include "migrate.h" -+#include "movinggc.h" -+#include "quota.h" -+#include "rebalance.h" -+#include "recovery.h" -+#include "replicas.h" -+#include "super.h" -+#include "super-io.h" -+#include "sysfs.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+MODULE_LICENSE("GPL"); -+MODULE_AUTHOR("Kent Overstreet "); -+ -+#define KTYPE(type) \ -+struct kobj_type type ## _ktype = { \ -+ .release = type ## _release, \ -+ .sysfs_ops = &type ## _sysfs_ops, \ -+ .default_attrs = type ## _files \ -+} -+ -+static void bch2_fs_release(struct kobject *); -+static void bch2_dev_release(struct kobject *); -+ -+static void bch2_fs_internal_release(struct kobject *k) -+{ -+} -+ -+static void bch2_fs_opts_dir_release(struct kobject *k) -+{ -+} -+ -+static void bch2_fs_time_stats_release(struct kobject *k) -+{ -+} -+ -+static KTYPE(bch2_fs); -+static KTYPE(bch2_fs_internal); -+static KTYPE(bch2_fs_opts_dir); -+static KTYPE(bch2_fs_time_stats); -+static KTYPE(bch2_dev); -+ -+static struct kset *bcachefs_kset; -+static LIST_HEAD(bch_fs_list); -+static DEFINE_MUTEX(bch_fs_list_lock); -+ -+static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait); -+ -+static void bch2_dev_free(struct bch_dev *); -+static int bch2_dev_alloc(struct bch_fs *, 
unsigned); -+static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *); -+static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *); -+ -+struct bch_fs *bch2_bdev_to_fs(struct block_device *bdev) -+{ -+ struct bch_fs *c; -+ struct bch_dev *ca; -+ unsigned i; -+ -+ mutex_lock(&bch_fs_list_lock); -+ rcu_read_lock(); -+ -+ list_for_each_entry(c, &bch_fs_list, list) -+ for_each_member_device_rcu(ca, c, i, NULL) -+ if (ca->disk_sb.bdev == bdev) { -+ closure_get(&c->cl); -+ goto found; -+ } -+ c = NULL; -+found: -+ rcu_read_unlock(); -+ mutex_unlock(&bch_fs_list_lock); -+ -+ return c; -+} -+ -+static struct bch_fs *__bch2_uuid_to_fs(uuid_le uuid) -+{ -+ struct bch_fs *c; -+ -+ lockdep_assert_held(&bch_fs_list_lock); -+ -+ list_for_each_entry(c, &bch_fs_list, list) -+ if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid_le))) -+ return c; -+ -+ return NULL; -+} -+ -+struct bch_fs *bch2_uuid_to_fs(uuid_le uuid) -+{ -+ struct bch_fs *c; -+ -+ mutex_lock(&bch_fs_list_lock); -+ c = __bch2_uuid_to_fs(uuid); -+ if (c) -+ closure_get(&c->cl); -+ mutex_unlock(&bch_fs_list_lock); -+ -+ return c; -+} -+ -+int bch2_congested(void *data, int bdi_bits) -+{ -+ struct bch_fs *c = data; -+ struct backing_dev_info *bdi; -+ struct bch_dev *ca; -+ unsigned i; -+ int ret = 0; -+ -+ rcu_read_lock(); -+ if (bdi_bits & (1 << WB_sync_congested)) { -+ /* Reads - check all devices: */ -+ for_each_readable_member(ca, c, i) { -+ bdi = ca->disk_sb.bdev->bd_bdi; -+ -+ if (bdi_congested(bdi, bdi_bits)) { -+ ret = 1; -+ break; -+ } -+ } -+ } else { -+ const struct bch_devs_mask *devs = -+ bch2_target_to_mask(c, c->opts.foreground_target) ?: -+ &c->rw_devs[BCH_DATA_user]; -+ -+ for_each_member_device_rcu(ca, c, i, devs) { -+ bdi = ca->disk_sb.bdev->bd_bdi; -+ -+ if (bdi_congested(bdi, bdi_bits)) { -+ ret = 1; -+ break; -+ } -+ } -+ } -+ rcu_read_unlock(); -+ -+ return ret; -+} -+ -+/* Filesystem RO/RW: */ -+ -+/* -+ * For startup/shutdown of RW stuff, the dependencies are: -+ * -+ * - 
foreground writes depend on copygc and rebalance (to free up space) -+ * -+ * - copygc and rebalance depend on mark and sweep gc (they actually probably -+ * don't because they either reserve ahead of time or don't block if -+ * allocations fail, but allocations can require mark and sweep gc to run -+ * because of generation number wraparound) -+ * -+ * - all of the above depends on the allocator threads -+ * -+ * - allocator depends on the journal (when it rewrites prios and gens) -+ */ -+ -+static void __bch2_fs_read_only(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ bool wrote = false; -+ unsigned i, clean_passes = 0; -+ int ret; -+ -+ bch2_rebalance_stop(c); -+ bch2_copygc_stop(c); -+ bch2_gc_thread_stop(c); -+ -+ /* -+ * Flush journal before stopping allocators, because flushing journal -+ * blacklist entries involves allocating new btree nodes: -+ */ -+ bch2_journal_flush_all_pins(&c->journal); -+ -+ /* -+ * If the allocator threads didn't all start up, the btree updates to -+ * write out alloc info aren't going to work: -+ */ -+ if (!test_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags)) -+ goto nowrote_alloc; -+ -+ bch_verbose(c, "writing alloc info"); -+ /* -+ * This should normally just be writing the bucket read/write clocks: -+ */ -+ ret = bch2_stripes_write(c, BTREE_INSERT_NOCHECK_RW, &wrote) ?: -+ bch2_alloc_write(c, BTREE_INSERT_NOCHECK_RW, &wrote); -+ bch_verbose(c, "writing alloc info complete"); -+ -+ if (ret && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) -+ bch2_fs_inconsistent(c, "error writing out alloc info %i", ret); -+ -+ if (ret) -+ goto nowrote_alloc; -+ -+ bch_verbose(c, "flushing journal and stopping allocators"); -+ -+ bch2_journal_flush_all_pins(&c->journal); -+ set_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags); -+ -+ do { -+ clean_passes++; -+ -+ if (bch2_journal_flush_all_pins(&c->journal)) -+ clean_passes = 0; -+ -+ /* -+ * In flight interior btree updates will generate more journal -+ * updates and btree updates (alloc btree): -+ */ -+ if 
(bch2_btree_interior_updates_nr_pending(c)) { -+ closure_wait_event(&c->btree_interior_update_wait, -+ !bch2_btree_interior_updates_nr_pending(c)); -+ clean_passes = 0; -+ } -+ flush_work(&c->btree_interior_update_work); -+ -+ if (bch2_journal_flush_all_pins(&c->journal)) -+ clean_passes = 0; -+ } while (clean_passes < 2); -+ bch_verbose(c, "flushing journal and stopping allocators complete"); -+ -+ set_bit(BCH_FS_ALLOC_CLEAN, &c->flags); -+nowrote_alloc: -+ closure_wait_event(&c->btree_interior_update_wait, -+ !bch2_btree_interior_updates_nr_pending(c)); -+ flush_work(&c->btree_interior_update_work); -+ -+ for_each_member_device(ca, c, i) -+ bch2_dev_allocator_stop(ca); -+ -+ clear_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); -+ clear_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags); -+ -+ bch2_fs_journal_stop(&c->journal); -+ -+ /* -+ * the journal kicks off btree writes via reclaim - wait for in flight -+ * writes after stopping journal: -+ */ -+ if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) -+ bch2_btree_flush_all_writes(c); -+ else -+ bch2_btree_verify_flushed(c); -+ -+ /* -+ * After stopping journal: -+ */ -+ for_each_member_device(ca, c, i) -+ bch2_dev_allocator_remove(c, ca); -+} -+ -+static void bch2_writes_disabled(struct percpu_ref *writes) -+{ -+ struct bch_fs *c = container_of(writes, struct bch_fs, writes); -+ -+ set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); -+ wake_up(&bch_read_only_wait); -+} -+ -+void bch2_fs_read_only(struct bch_fs *c) -+{ -+ if (!test_bit(BCH_FS_RW, &c->flags)) { -+ cancel_delayed_work_sync(&c->journal.reclaim_work); -+ return; -+ } -+ -+ BUG_ON(test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); -+ -+ /* -+ * Block new foreground-end write operations from starting - any new -+ * writes will return -EROFS: -+ * -+ * (This is really blocking new _allocations_, writes to previously -+ * allocated space can still happen until stopping the allocator in -+ * bch2_dev_allocator_stop()). 
-+ */ -+ percpu_ref_kill(&c->writes); -+ -+ cancel_work_sync(&c->ec_stripe_delete_work); -+ cancel_delayed_work(&c->pd_controllers_update); -+ -+ /* -+ * If we're not doing an emergency shutdown, we want to wait on -+ * outstanding writes to complete so they don't see spurious errors due -+ * to shutting down the allocator: -+ * -+ * If we are doing an emergency shutdown outstanding writes may -+ * hang until we shutdown the allocator so we don't want to wait -+ * on outstanding writes before shutting everything down - but -+ * we do need to wait on them before returning and signalling -+ * that going RO is complete: -+ */ -+ wait_event(bch_read_only_wait, -+ test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) || -+ test_bit(BCH_FS_EMERGENCY_RO, &c->flags)); -+ -+ __bch2_fs_read_only(c); -+ -+ wait_event(bch_read_only_wait, -+ test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); -+ -+ clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); -+ -+ if (!bch2_journal_error(&c->journal) && -+ !test_bit(BCH_FS_ERROR, &c->flags) && -+ !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) && -+ test_bit(BCH_FS_STARTED, &c->flags) && -+ test_bit(BCH_FS_ALLOC_CLEAN, &c->flags) && -+ !c->opts.norecovery) { -+ bch_verbose(c, "marking filesystem clean"); -+ bch2_fs_mark_clean(c); -+ } -+ -+ clear_bit(BCH_FS_RW, &c->flags); -+} -+ -+static void bch2_fs_read_only_work(struct work_struct *work) -+{ -+ struct bch_fs *c = -+ container_of(work, struct bch_fs, read_only_work); -+ -+ down_write(&c->state_lock); -+ bch2_fs_read_only(c); -+ up_write(&c->state_lock); -+} -+ -+static void bch2_fs_read_only_async(struct bch_fs *c) -+{ -+ queue_work(system_long_wq, &c->read_only_work); -+} -+ -+bool bch2_fs_emergency_read_only(struct bch_fs *c) -+{ -+ bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags); -+ -+ bch2_journal_halt(&c->journal); -+ bch2_fs_read_only_async(c); -+ -+ wake_up(&bch_read_only_wait); -+ return ret; -+} -+ -+static int bch2_fs_read_write_late(struct bch_fs *c) -+{ -+ int ret; 
-+ -+ ret = bch2_gc_thread_start(c); -+ if (ret) { -+ bch_err(c, "error starting gc thread"); -+ return ret; -+ } -+ -+ ret = bch2_copygc_start(c); -+ if (ret) { -+ bch_err(c, "error starting copygc thread"); -+ return ret; -+ } -+ -+ ret = bch2_rebalance_start(c); -+ if (ret) { -+ bch_err(c, "error starting rebalance thread"); -+ return ret; -+ } -+ -+ schedule_delayed_work(&c->pd_controllers_update, 5 * HZ); -+ -+ schedule_work(&c->ec_stripe_delete_work); -+ -+ return 0; -+} -+ -+static int __bch2_fs_read_write(struct bch_fs *c, bool early) -+{ -+ struct bch_dev *ca; -+ unsigned i; -+ int ret; -+ -+ if (test_bit(BCH_FS_RW, &c->flags)) -+ return 0; -+ -+ /* -+ * nochanges is used for fsck -n mode - we have to allow going rw -+ * during recovery for that to work: -+ */ -+ if (c->opts.norecovery || -+ (c->opts.nochanges && -+ (!early || c->opts.read_only))) -+ return -EROFS; -+ -+ ret = bch2_fs_mark_dirty(c); -+ if (ret) -+ goto err; -+ -+ /* -+ * We need to write out a journal entry before we start doing btree -+ * updates, to ensure that on unclean shutdown new journal blacklist -+ * entries are created: -+ */ -+ bch2_journal_meta(&c->journal); -+ -+ clear_bit(BCH_FS_ALLOC_CLEAN, &c->flags); -+ -+ for_each_rw_member(ca, c, i) -+ bch2_dev_allocator_add(c, ca); -+ bch2_recalc_capacity(c); -+ -+ for_each_rw_member(ca, c, i) { -+ ret = bch2_dev_allocator_start(ca); -+ if (ret) { -+ bch_err(c, "error starting allocator threads"); -+ percpu_ref_put(&ca->io_ref); -+ goto err; -+ } -+ } -+ -+ set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); -+ -+ if (!early) { -+ ret = bch2_fs_read_write_late(c); -+ if (ret) -+ goto err; -+ } -+ -+ percpu_ref_reinit(&c->writes); -+ set_bit(BCH_FS_RW, &c->flags); -+ -+ queue_delayed_work(c->journal_reclaim_wq, -+ &c->journal.reclaim_work, 0); -+ return 0; -+err: -+ __bch2_fs_read_only(c); -+ return ret; -+} -+ -+int bch2_fs_read_write(struct bch_fs *c) -+{ -+ return __bch2_fs_read_write(c, false); -+} -+ -+int bch2_fs_read_write_early(struct 
bch_fs *c) -+{ -+ lockdep_assert_held(&c->state_lock); -+ -+ return __bch2_fs_read_write(c, true); -+} -+ -+/* Filesystem startup/shutdown: */ -+ -+static void bch2_fs_free(struct bch_fs *c) -+{ -+ unsigned i; -+ -+ for (i = 0; i < BCH_TIME_STAT_NR; i++) -+ bch2_time_stats_exit(&c->times[i]); -+ -+ bch2_fs_quota_exit(c); -+ bch2_fs_fsio_exit(c); -+ bch2_fs_ec_exit(c); -+ bch2_fs_encryption_exit(c); -+ bch2_fs_io_exit(c); -+ bch2_fs_btree_interior_update_exit(c); -+ bch2_fs_btree_iter_exit(c); -+ bch2_fs_btree_key_cache_exit(&c->btree_key_cache); -+ bch2_fs_btree_cache_exit(c); -+ bch2_fs_journal_exit(&c->journal); -+ bch2_io_clock_exit(&c->io_clock[WRITE]); -+ bch2_io_clock_exit(&c->io_clock[READ]); -+ bch2_fs_compress_exit(c); -+ bch2_journal_keys_free(&c->journal_keys); -+ bch2_journal_entries_free(&c->journal_entries); -+ percpu_free_rwsem(&c->mark_lock); -+ kfree(c->usage_scratch); -+ free_percpu(c->usage[1]); -+ free_percpu(c->usage[0]); -+ kfree(c->usage_base); -+ free_percpu(c->pcpu); -+ mempool_exit(&c->large_bkey_pool); -+ mempool_exit(&c->btree_bounce_pool); -+ bioset_exit(&c->btree_bio); -+ mempool_exit(&c->fill_iter); -+ percpu_ref_exit(&c->writes); -+ kfree(c->replicas.entries); -+ kfree(c->replicas_gc.entries); -+ kfree(rcu_dereference_protected(c->disk_groups, 1)); -+ kfree(c->journal_seq_blacklist_table); -+ free_heap(&c->copygc_heap); -+ -+ if (c->journal_reclaim_wq) -+ destroy_workqueue(c->journal_reclaim_wq); -+ if (c->copygc_wq) -+ destroy_workqueue(c->copygc_wq); -+ if (c->wq) -+ destroy_workqueue(c->wq); -+ -+ free_pages((unsigned long) c->disk_sb.sb, -+ c->disk_sb.page_order); -+ kvpfree(c, sizeof(*c)); -+ module_put(THIS_MODULE); -+} -+ -+static void bch2_fs_release(struct kobject *kobj) -+{ -+ struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); -+ -+ bch2_fs_free(c); -+} -+ -+void bch2_fs_stop(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ unsigned i; -+ -+ bch_verbose(c, "shutting down"); -+ -+ set_bit(BCH_FS_STOPPING, &c->flags); 
-+ -+ cancel_work_sync(&c->journal_seq_blacklist_gc_work); -+ -+ down_write(&c->state_lock); -+ bch2_fs_read_only(c); -+ up_write(&c->state_lock); -+ -+ for_each_member_device(ca, c, i) -+ if (ca->kobj.state_in_sysfs && -+ ca->disk_sb.bdev) -+ sysfs_remove_link(&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj, -+ "bcachefs"); -+ -+ if (c->kobj.state_in_sysfs) -+ kobject_del(&c->kobj); -+ -+ bch2_fs_debug_exit(c); -+ bch2_fs_chardev_exit(c); -+ -+ kobject_put(&c->time_stats); -+ kobject_put(&c->opts_dir); -+ kobject_put(&c->internal); -+ -+ mutex_lock(&bch_fs_list_lock); -+ list_del(&c->list); -+ mutex_unlock(&bch_fs_list_lock); -+ -+ closure_sync(&c->cl); -+ closure_debug_destroy(&c->cl); -+ -+ /* btree prefetch might have kicked off reads in the background: */ -+ bch2_btree_flush_all_reads(c); -+ -+ for_each_member_device(ca, c, i) -+ cancel_work_sync(&ca->io_error_work); -+ -+ cancel_work_sync(&c->btree_write_error_work); -+ cancel_delayed_work_sync(&c->pd_controllers_update); -+ cancel_work_sync(&c->read_only_work); -+ -+ for (i = 0; i < c->sb.nr_devices; i++) -+ if (c->devs[i]) -+ bch2_dev_free(rcu_dereference_protected(c->devs[i], 1)); -+ -+ bch_verbose(c, "shutdown complete"); -+ -+ kobject_put(&c->kobj); -+} -+ -+static const char *bch2_fs_online(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ const char *err = NULL; -+ unsigned i; -+ int ret; -+ -+ lockdep_assert_held(&bch_fs_list_lock); -+ -+ if (!list_empty(&c->list)) -+ return NULL; -+ -+ if (__bch2_uuid_to_fs(c->sb.uuid)) -+ return "filesystem UUID already open"; -+ -+ ret = bch2_fs_chardev_init(c); -+ if (ret) -+ return "error creating character device"; -+ -+ bch2_fs_debug_init(c); -+ -+ if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) || -+ kobject_add(&c->internal, &c->kobj, "internal") || -+ kobject_add(&c->opts_dir, &c->kobj, "options") || -+ kobject_add(&c->time_stats, &c->kobj, "time_stats") || -+ bch2_opts_create_sysfs_files(&c->opts_dir)) -+ return "error creating sysfs objects"; -+ -+ 
down_write(&c->state_lock); -+ -+ err = "error creating sysfs objects"; -+ __for_each_member_device(ca, c, i, NULL) -+ if (bch2_dev_sysfs_online(c, ca)) -+ goto err; -+ -+ list_add(&c->list, &bch_fs_list); -+ err = NULL; -+err: -+ up_write(&c->state_lock); -+ return err; -+} -+ -+static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) -+{ -+ struct bch_sb_field_members *mi; -+ struct bch_fs *c; -+ unsigned i, iter_size; -+ const char *err; -+ -+ pr_verbose_init(opts, ""); -+ -+ c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); -+ if (!c) -+ goto out; -+ -+ __module_get(THIS_MODULE); -+ -+ c->minor = -1; -+ c->disk_sb.fs_sb = true; -+ -+ init_rwsem(&c->state_lock); -+ mutex_init(&c->sb_lock); -+ mutex_init(&c->replicas_gc_lock); -+ mutex_init(&c->btree_root_lock); -+ INIT_WORK(&c->read_only_work, bch2_fs_read_only_work); -+ -+ init_rwsem(&c->gc_lock); -+ -+ for (i = 0; i < BCH_TIME_STAT_NR; i++) -+ bch2_time_stats_init(&c->times[i]); -+ -+ bch2_fs_copygc_init(c); -+ bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); -+ bch2_fs_allocator_background_init(c); -+ bch2_fs_allocator_foreground_init(c); -+ bch2_fs_rebalance_init(c); -+ bch2_fs_quota_init(c); -+ -+ INIT_LIST_HEAD(&c->list); -+ -+ mutex_init(&c->usage_scratch_lock); -+ -+ mutex_init(&c->bio_bounce_pages_lock); -+ -+ bio_list_init(&c->btree_write_error_list); -+ spin_lock_init(&c->btree_write_error_lock); -+ INIT_WORK(&c->btree_write_error_work, bch2_btree_write_error_work); -+ -+ INIT_WORK(&c->journal_seq_blacklist_gc_work, -+ bch2_blacklist_entries_gc); -+ -+ INIT_LIST_HEAD(&c->journal_entries); -+ -+ INIT_LIST_HEAD(&c->fsck_errors); -+ mutex_init(&c->fsck_error_lock); -+ -+ INIT_LIST_HEAD(&c->ec_stripe_head_list); -+ mutex_init(&c->ec_stripe_head_lock); -+ -+ INIT_LIST_HEAD(&c->ec_stripe_new_list); -+ mutex_init(&c->ec_stripe_new_lock); -+ -+ spin_lock_init(&c->ec_stripes_heap_lock); -+ -+ seqcount_init(&c->gc_pos_lock); -+ -+ seqcount_init(&c->usage_lock); -+ -+ 
sema_init(&c->io_in_flight, 64); -+ -+ c->copy_gc_enabled = 1; -+ c->rebalance.enabled = 1; -+ c->promote_whole_extents = true; -+ -+ c->journal.write_time = &c->times[BCH_TIME_journal_write]; -+ c->journal.delay_time = &c->times[BCH_TIME_journal_delay]; -+ c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal]; -+ c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; -+ -+ bch2_fs_btree_cache_init_early(&c->btree_cache); -+ -+ if (percpu_init_rwsem(&c->mark_lock)) -+ goto err; -+ -+ mutex_lock(&c->sb_lock); -+ -+ if (bch2_sb_to_fs(c, sb)) { -+ mutex_unlock(&c->sb_lock); -+ goto err; -+ } -+ -+ mutex_unlock(&c->sb_lock); -+ -+ scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid); -+ -+ c->opts = bch2_opts_default; -+ bch2_opts_apply(&c->opts, bch2_opts_from_sb(sb)); -+ bch2_opts_apply(&c->opts, opts); -+ -+ c->block_bits = ilog2(c->opts.block_size); -+ c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c); -+ -+ if (bch2_fs_init_fault("fs_alloc")) -+ goto err; -+ -+ iter_size = sizeof(struct sort_iter) + -+ (btree_blocks(c) + 1) * 2 * -+ sizeof(struct sort_iter_set); -+ -+ if (!(c->wq = alloc_workqueue("bcachefs", -+ WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || -+ !(c->copygc_wq = alloc_workqueue("bcache_copygc", -+ WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || -+ !(c->journal_reclaim_wq = alloc_workqueue("bcache_journal", -+ WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || -+ percpu_ref_init(&c->writes, bch2_writes_disabled, -+ PERCPU_REF_INIT_DEAD, GFP_KERNEL) || -+ mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || -+ bioset_init(&c->btree_bio, 1, -+ max(offsetof(struct btree_read_bio, bio), -+ offsetof(struct btree_write_bio, wbio.bio)), -+ BIOSET_NEED_BVECS) || -+ !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || -+ mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, -+ btree_bytes(c)) || -+ mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || -+ bch2_io_clock_init(&c->io_clock[READ]) 
|| -+ bch2_io_clock_init(&c->io_clock[WRITE]) || -+ bch2_fs_journal_init(&c->journal) || -+ bch2_fs_replicas_init(c) || -+ bch2_fs_btree_cache_init(c) || -+ bch2_fs_btree_key_cache_init(&c->btree_key_cache) || -+ bch2_fs_btree_iter_init(c) || -+ bch2_fs_btree_interior_update_init(c) || -+ bch2_fs_io_init(c) || -+ bch2_fs_encryption_init(c) || -+ bch2_fs_compress_init(c) || -+ bch2_fs_ec_init(c) || -+ bch2_fs_fsio_init(c)) -+ goto err; -+ -+ mi = bch2_sb_get_members(c->disk_sb.sb); -+ for (i = 0; i < c->sb.nr_devices; i++) -+ if (bch2_dev_exists(c->disk_sb.sb, mi, i) && -+ bch2_dev_alloc(c, i)) -+ goto err; -+ -+ /* -+ * Now that all allocations have succeeded, init various refcounty -+ * things that let us shutdown: -+ */ -+ closure_init(&c->cl, NULL); -+ -+ c->kobj.kset = bcachefs_kset; -+ kobject_init(&c->kobj, &bch2_fs_ktype); -+ kobject_init(&c->internal, &bch2_fs_internal_ktype); -+ kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype); -+ kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype); -+ -+ mutex_lock(&bch_fs_list_lock); -+ err = bch2_fs_online(c); -+ mutex_unlock(&bch_fs_list_lock); -+ if (err) { -+ bch_err(c, "bch2_fs_online() error: %s", err); -+ goto err; -+ } -+out: -+ pr_verbose_init(opts, "ret %i", c ? 
0 : -ENOMEM); -+ return c; -+err: -+ bch2_fs_free(c); -+ c = NULL; -+ goto out; -+} -+ -+noinline_for_stack -+static void print_mount_opts(struct bch_fs *c) -+{ -+ enum bch_opt_id i; -+ char buf[512]; -+ struct printbuf p = PBUF(buf); -+ bool first = true; -+ -+ strcpy(buf, "(null)"); -+ -+ if (c->opts.read_only) { -+ pr_buf(&p, "ro"); -+ first = false; -+ } -+ -+ for (i = 0; i < bch2_opts_nr; i++) { -+ const struct bch_option *opt = &bch2_opt_table[i]; -+ u64 v = bch2_opt_get_by_id(&c->opts, i); -+ -+ if (!(opt->mode & OPT_MOUNT)) -+ continue; -+ -+ if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) -+ continue; -+ -+ if (!first) -+ pr_buf(&p, ","); -+ first = false; -+ bch2_opt_to_text(&p, c, opt, v, OPT_SHOW_MOUNT_STYLE); -+ } -+ -+ bch_info(c, "mounted with opts: %s", buf); -+} -+ -+int bch2_fs_start(struct bch_fs *c) -+{ -+ const char *err = "cannot allocate memory"; -+ struct bch_sb_field_members *mi; -+ struct bch_dev *ca; -+ time64_t now = ktime_get_real_seconds(); -+ unsigned i; -+ int ret = -EINVAL; -+ -+ down_write(&c->state_lock); -+ -+ BUG_ON(test_bit(BCH_FS_STARTED, &c->flags)); -+ -+ mutex_lock(&c->sb_lock); -+ -+ for_each_online_member(ca, c, i) -+ bch2_sb_from_fs(c, ca); -+ -+ mi = bch2_sb_get_members(c->disk_sb.sb); -+ for_each_online_member(ca, c, i) -+ mi->members[ca->dev_idx].last_mount = cpu_to_le64(now); -+ -+ mutex_unlock(&c->sb_lock); -+ -+ for_each_rw_member(ca, c, i) -+ bch2_dev_allocator_add(c, ca); -+ bch2_recalc_capacity(c); -+ -+ ret = BCH_SB_INITIALIZED(c->disk_sb.sb) -+ ? bch2_fs_recovery(c) -+ : bch2_fs_initialize(c); -+ if (ret) -+ goto err; -+ -+ ret = bch2_opts_check_may_set(c); -+ if (ret) -+ goto err; -+ -+ err = "dynamic fault"; -+ ret = -EINVAL; -+ if (bch2_fs_init_fault("fs_start")) -+ goto err; -+ -+ set_bit(BCH_FS_STARTED, &c->flags); -+ -+ if (c->opts.read_only || c->opts.nochanges) { -+ bch2_fs_read_only(c); -+ } else { -+ err = "error going read write"; -+ ret = !test_bit(BCH_FS_RW, &c->flags) -+ ? 
bch2_fs_read_write(c) -+ : bch2_fs_read_write_late(c); -+ if (ret) -+ goto err; -+ } -+ -+ print_mount_opts(c); -+ ret = 0; -+out: -+ up_write(&c->state_lock); -+ return ret; -+err: -+ switch (ret) { -+ case BCH_FSCK_ERRORS_NOT_FIXED: -+ bch_err(c, "filesystem contains errors: please report this to the developers"); -+ pr_cont("mount with -o fix_errors to repair\n"); -+ err = "fsck error"; -+ break; -+ case BCH_FSCK_REPAIR_UNIMPLEMENTED: -+ bch_err(c, "filesystem contains errors: please report this to the developers"); -+ pr_cont("repair unimplemented: inform the developers so that it can be added\n"); -+ err = "fsck error"; -+ break; -+ case BCH_FSCK_REPAIR_IMPOSSIBLE: -+ bch_err(c, "filesystem contains errors, but repair impossible"); -+ err = "fsck error"; -+ break; -+ case BCH_FSCK_UNKNOWN_VERSION: -+ err = "unknown metadata version";; -+ break; -+ case -ENOMEM: -+ err = "cannot allocate memory"; -+ break; -+ case -EIO: -+ err = "IO error"; -+ break; -+ } -+ -+ if (ret >= 0) -+ ret = -EIO; -+ goto out; -+} -+ -+static const char *bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) -+{ -+ struct bch_sb_field_members *sb_mi; -+ -+ sb_mi = bch2_sb_get_members(sb); -+ if (!sb_mi) -+ return "Invalid superblock: member info area missing"; -+ -+ if (le16_to_cpu(sb->block_size) != c->opts.block_size) -+ return "mismatched block size"; -+ -+ if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) < -+ BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb)) -+ return "new cache bucket size is too small"; -+ -+ return NULL; -+} -+ -+static const char *bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb) -+{ -+ struct bch_sb *newest = -+ le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? 
fs : sb; -+ struct bch_sb_field_members *mi = bch2_sb_get_members(newest); -+ -+ if (uuid_le_cmp(fs->uuid, sb->uuid)) -+ return "device not a member of filesystem"; -+ -+ if (!bch2_dev_exists(newest, mi, sb->dev_idx)) -+ return "device has been removed"; -+ -+ if (fs->block_size != sb->block_size) -+ return "mismatched block size"; -+ -+ return NULL; -+} -+ -+/* Device startup/shutdown: */ -+ -+static void bch2_dev_release(struct kobject *kobj) -+{ -+ struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); -+ -+ kfree(ca); -+} -+ -+static void bch2_dev_free(struct bch_dev *ca) -+{ -+ cancel_work_sync(&ca->io_error_work); -+ -+ if (ca->kobj.state_in_sysfs && -+ ca->disk_sb.bdev) -+ sysfs_remove_link(&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj, -+ "bcachefs"); -+ -+ if (ca->kobj.state_in_sysfs) -+ kobject_del(&ca->kobj); -+ -+ bch2_free_super(&ca->disk_sb); -+ bch2_dev_journal_exit(ca); -+ -+ free_percpu(ca->io_done); -+ bioset_exit(&ca->replica_set); -+ bch2_dev_buckets_free(ca); -+ free_page((unsigned long) ca->sb_read_scratch); -+ -+ bch2_time_stats_exit(&ca->io_latency[WRITE]); -+ bch2_time_stats_exit(&ca->io_latency[READ]); -+ -+ percpu_ref_exit(&ca->io_ref); -+ percpu_ref_exit(&ca->ref); -+ kobject_put(&ca->kobj); -+} -+ -+static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca) -+{ -+ -+ lockdep_assert_held(&c->state_lock); -+ -+ if (percpu_ref_is_zero(&ca->io_ref)) -+ return; -+ -+ __bch2_dev_read_only(c, ca); -+ -+ reinit_completion(&ca->io_ref_completion); -+ percpu_ref_kill(&ca->io_ref); -+ wait_for_completion(&ca->io_ref_completion); -+ -+ if (ca->kobj.state_in_sysfs) { -+ struct kobject *block = -+ &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj; -+ -+ sysfs_remove_link(block, "bcachefs"); -+ sysfs_remove_link(&ca->kobj, "block"); -+ } -+ -+ bch2_free_super(&ca->disk_sb); -+ bch2_dev_journal_exit(ca); -+} -+ -+static void bch2_dev_ref_complete(struct percpu_ref *ref) -+{ -+ struct bch_dev *ca = container_of(ref, struct bch_dev, ref); 
-+ -+ complete(&ca->ref_completion); -+} -+ -+static void bch2_dev_io_ref_complete(struct percpu_ref *ref) -+{ -+ struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref); -+ -+ complete(&ca->io_ref_completion); -+} -+ -+static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca) -+{ -+ int ret; -+ -+ if (!c->kobj.state_in_sysfs) -+ return 0; -+ -+ if (!ca->kobj.state_in_sysfs) { -+ ret = kobject_add(&ca->kobj, &c->kobj, -+ "dev-%u", ca->dev_idx); -+ if (ret) -+ return ret; -+ } -+ -+ if (ca->disk_sb.bdev) { -+ struct kobject *block = -+ &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj; -+ -+ ret = sysfs_create_link(block, &ca->kobj, "bcachefs"); -+ if (ret) -+ return ret; -+ ret = sysfs_create_link(&ca->kobj, block, "block"); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, -+ struct bch_member *member) -+{ -+ struct bch_dev *ca; -+ -+ ca = kzalloc(sizeof(*ca), GFP_KERNEL); -+ if (!ca) -+ return NULL; -+ -+ kobject_init(&ca->kobj, &bch2_dev_ktype); -+ init_completion(&ca->ref_completion); -+ init_completion(&ca->io_ref_completion); -+ -+ init_rwsem(&ca->bucket_lock); -+ -+ INIT_WORK(&ca->io_error_work, bch2_io_error_work); -+ -+ bch2_time_stats_init(&ca->io_latency[READ]); -+ bch2_time_stats_init(&ca->io_latency[WRITE]); -+ -+ ca->mi = bch2_mi_to_cpu(member); -+ ca->uuid = member->uuid; -+ -+ if (opt_defined(c->opts, discard)) -+ ca->mi.discard = opt_get(c->opts, discard); -+ -+ if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, -+ 0, GFP_KERNEL) || -+ percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, -+ PERCPU_REF_INIT_DEAD, GFP_KERNEL) || -+ !(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) || -+ bch2_dev_buckets_alloc(c, ca) || -+ bioset_init(&ca->replica_set, 4, -+ offsetof(struct bch_write_bio, bio), 0) || -+ !(ca->io_done = alloc_percpu(*ca->io_done))) -+ goto err; -+ -+ return ca; -+err: -+ bch2_dev_free(ca); -+ return NULL; -+} -+ -+static void 
bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca, -+ unsigned dev_idx) -+{ -+ ca->dev_idx = dev_idx; -+ __set_bit(ca->dev_idx, ca->self.d); -+ scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx); -+ -+ ca->fs = c; -+ rcu_assign_pointer(c->devs[ca->dev_idx], ca); -+ -+ if (bch2_dev_sysfs_online(c, ca)) -+ pr_warn("error creating sysfs objects"); -+} -+ -+static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) -+{ -+ struct bch_member *member = -+ bch2_sb_get_members(c->disk_sb.sb)->members + dev_idx; -+ struct bch_dev *ca = NULL; -+ int ret = 0; -+ -+ pr_verbose_init(c->opts, ""); -+ -+ if (bch2_fs_init_fault("dev_alloc")) -+ goto err; -+ -+ ca = __bch2_dev_alloc(c, member); -+ if (!ca) -+ goto err; -+ -+ bch2_dev_attach(c, ca, dev_idx); -+out: -+ pr_verbose_init(c->opts, "ret %i", ret); -+ return ret; -+err: -+ if (ca) -+ bch2_dev_free(ca); -+ ret = -ENOMEM; -+ goto out; -+} -+ -+static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) -+{ -+ unsigned ret; -+ -+ if (bch2_dev_is_online(ca)) { -+ bch_err(ca, "already have device online in slot %u", -+ sb->sb->dev_idx); -+ return -EINVAL; -+ } -+ -+ if (get_capacity(sb->bdev->bd_disk) < -+ ca->mi.bucket_size * ca->mi.nbuckets) { -+ bch_err(ca, "cannot online: device too small"); -+ return -EINVAL; -+ } -+ -+ BUG_ON(!percpu_ref_is_zero(&ca->io_ref)); -+ -+ if (get_capacity(sb->bdev->bd_disk) < -+ ca->mi.bucket_size * ca->mi.nbuckets) { -+ bch_err(ca, "device too small"); -+ return -EINVAL; -+ } -+ -+ ret = bch2_dev_journal_init(ca, sb->sb); -+ if (ret) -+ return ret; -+ -+ /* Commit: */ -+ ca->disk_sb = *sb; -+ if (sb->mode & FMODE_EXCL) -+ ca->disk_sb.bdev->bd_holder = ca; -+ memset(sb, 0, sizeof(*sb)); -+ -+ percpu_ref_reinit(&ca->io_ref); -+ -+ return 0; -+} -+ -+static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) -+{ -+ struct bch_dev *ca; -+ int ret; -+ -+ lockdep_assert_held(&c->state_lock); -+ -+ if (le64_to_cpu(sb->sb->seq) > -+ 
le64_to_cpu(c->disk_sb.sb->seq)) -+ bch2_sb_to_fs(c, sb->sb); -+ -+ BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices || -+ !c->devs[sb->sb->dev_idx]); -+ -+ ca = bch_dev_locked(c, sb->sb->dev_idx); -+ -+ ret = __bch2_dev_attach_bdev(ca, sb); -+ if (ret) -+ return ret; -+ -+ if (test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags) && -+ !percpu_u64_get(&ca->usage[0]->buckets[BCH_DATA_sb])) { -+ mutex_lock(&c->sb_lock); -+ bch2_mark_dev_superblock(ca->fs, ca, 0); -+ mutex_unlock(&c->sb_lock); -+ } -+ -+ bch2_dev_sysfs_online(c, ca); -+ -+ if (c->sb.nr_devices == 1) -+ bdevname(ca->disk_sb.bdev, c->name); -+ bdevname(ca->disk_sb.bdev, ca->name); -+ -+ rebalance_wakeup(c); -+ return 0; -+} -+ -+/* Device management: */ -+ -+/* -+ * Note: this function is also used by the error paths - when a particular -+ * device sees an error, we call it to determine whether we can just set the -+ * device RO, or - if this function returns false - we'll set the whole -+ * filesystem RO: -+ * -+ * XXX: maybe we should be more explicit about whether we're changing state -+ * because we got an error or what have you? -+ */ -+bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, -+ enum bch_member_state new_state, int flags) -+{ -+ struct bch_devs_mask new_online_devs; -+ struct replicas_status s; -+ struct bch_dev *ca2; -+ int i, nr_rw = 0, required; -+ -+ lockdep_assert_held(&c->state_lock); -+ -+ switch (new_state) { -+ case BCH_MEMBER_STATE_RW: -+ return true; -+ case BCH_MEMBER_STATE_RO: -+ if (ca->mi.state != BCH_MEMBER_STATE_RW) -+ return true; -+ -+ /* do we have enough devices to write to? */ -+ for_each_member_device(ca2, c, i) -+ if (ca2 != ca) -+ nr_rw += ca2->mi.state == BCH_MEMBER_STATE_RW; -+ -+ required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED) -+ ? c->opts.metadata_replicas -+ : c->opts.metadata_replicas_required, -+ !(flags & BCH_FORCE_IF_DATA_DEGRADED) -+ ? 
c->opts.data_replicas -+ : c->opts.data_replicas_required); -+ -+ return nr_rw >= required; -+ case BCH_MEMBER_STATE_FAILED: -+ case BCH_MEMBER_STATE_SPARE: -+ if (ca->mi.state != BCH_MEMBER_STATE_RW && -+ ca->mi.state != BCH_MEMBER_STATE_RO) -+ return true; -+ -+ /* do we have enough devices to read from? */ -+ new_online_devs = bch2_online_devs(c); -+ __clear_bit(ca->dev_idx, new_online_devs.d); -+ -+ s = __bch2_replicas_status(c, new_online_devs); -+ -+ return bch2_have_enough_devs(s, flags); -+ default: -+ BUG(); -+ } -+} -+ -+static bool bch2_fs_may_start(struct bch_fs *c) -+{ -+ struct replicas_status s; -+ struct bch_sb_field_members *mi; -+ struct bch_dev *ca; -+ unsigned i, flags = c->opts.degraded -+ ? BCH_FORCE_IF_DEGRADED -+ : 0; -+ -+ if (!c->opts.degraded) { -+ mutex_lock(&c->sb_lock); -+ mi = bch2_sb_get_members(c->disk_sb.sb); -+ -+ for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { -+ if (!bch2_dev_exists(c->disk_sb.sb, mi, i)) -+ continue; -+ -+ ca = bch_dev_locked(c, i); -+ -+ if (!bch2_dev_is_online(ca) && -+ (ca->mi.state == BCH_MEMBER_STATE_RW || -+ ca->mi.state == BCH_MEMBER_STATE_RO)) { -+ mutex_unlock(&c->sb_lock); -+ return false; -+ } -+ } -+ mutex_unlock(&c->sb_lock); -+ } -+ -+ s = bch2_replicas_status(c); -+ -+ return bch2_have_enough_devs(s, flags); -+} -+ -+static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) -+{ -+ /* -+ * Device going read only means the copygc reserve get smaller, so we -+ * don't want that happening while copygc is in progress: -+ */ -+ bch2_copygc_stop(c); -+ -+ /* -+ * The allocator thread itself allocates btree nodes, so stop it first: -+ */ -+ bch2_dev_allocator_stop(ca); -+ bch2_dev_allocator_remove(c, ca); -+ bch2_dev_journal_stop(&c->journal, ca); -+ -+ bch2_copygc_start(c); -+} -+ -+static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) -+{ -+ lockdep_assert_held(&c->state_lock); -+ -+ BUG_ON(ca->mi.state != BCH_MEMBER_STATE_RW); -+ -+ bch2_dev_allocator_add(c, 
ca); -+ bch2_recalc_capacity(c); -+ -+ if (bch2_dev_allocator_start(ca)) -+ return "error starting allocator thread"; -+ -+ return NULL; -+} -+ -+int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, -+ enum bch_member_state new_state, int flags) -+{ -+ struct bch_sb_field_members *mi; -+ int ret = 0; -+ -+ if (ca->mi.state == new_state) -+ return 0; -+ -+ if (!bch2_dev_state_allowed(c, ca, new_state, flags)) -+ return -EINVAL; -+ -+ if (new_state != BCH_MEMBER_STATE_RW) -+ __bch2_dev_read_only(c, ca); -+ -+ bch_notice(ca, "%s", bch2_dev_state[new_state]); -+ -+ mutex_lock(&c->sb_lock); -+ mi = bch2_sb_get_members(c->disk_sb.sb); -+ SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx], new_state); -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ if (new_state == BCH_MEMBER_STATE_RW && -+ __bch2_dev_read_write(c, ca)) -+ ret = -ENOMEM; -+ -+ rebalance_wakeup(c); -+ -+ return ret; -+} -+ -+int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, -+ enum bch_member_state new_state, int flags) -+{ -+ int ret; -+ -+ down_write(&c->state_lock); -+ ret = __bch2_dev_set_state(c, ca, new_state, flags); -+ up_write(&c->state_lock); -+ -+ return ret; -+} -+ -+/* Device add/removal: */ -+ -+int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct btree_trans trans; -+ size_t i; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for (i = 0; i < ca->mi.nbuckets; i++) { -+ ret = bch2_btree_key_cache_flush(&trans, -+ BTREE_ID_ALLOC, POS(ca->dev_idx, i)); -+ if (ret) -+ break; -+ } -+ bch2_trans_exit(&trans); -+ -+ if (ret) -+ return ret; -+ -+ return bch2_btree_delete_range(c, BTREE_ID_ALLOC, -+ POS(ca->dev_idx, 0), -+ POS(ca->dev_idx + 1, 0), -+ NULL); -+} -+ -+int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) -+{ -+ struct bch_sb_field_members *mi; -+ unsigned dev_idx = ca->dev_idx, data; -+ int ret = -EINVAL; -+ -+ down_write(&c->state_lock); -+ -+ /* -+ * We consume a reference to ca->ref, regardless of whether 
we succeed -+ * or fail: -+ */ -+ percpu_ref_put(&ca->ref); -+ -+ if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) { -+ bch_err(ca, "Cannot remove without losing data"); -+ goto err; -+ } -+ -+ __bch2_dev_read_only(c, ca); -+ -+ ret = bch2_dev_data_drop(c, ca->dev_idx, flags); -+ if (ret) { -+ bch_err(ca, "Remove failed: error %i dropping data", ret); -+ goto err; -+ } -+ -+ ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); -+ if (ret) { -+ bch_err(ca, "Remove failed: error %i flushing journal", ret); -+ goto err; -+ } -+ -+ ret = bch2_dev_remove_alloc(c, ca); -+ if (ret) { -+ bch_err(ca, "Remove failed, error deleting alloc info"); -+ goto err; -+ } -+ -+ /* -+ * must flush all existing journal entries, they might have -+ * (overwritten) keys that point to the device we're removing: -+ */ -+ bch2_journal_flush_all_pins(&c->journal); -+ /* -+ * hack to ensure bch2_replicas_gc2() clears out entries to this device -+ */ -+ bch2_journal_meta(&c->journal); -+ ret = bch2_journal_error(&c->journal); -+ if (ret) { -+ bch_err(ca, "Remove failed, journal error"); -+ goto err; -+ } -+ -+ ret = bch2_replicas_gc2(c); -+ if (ret) { -+ bch_err(ca, "Remove failed: error %i from replicas gc", ret); -+ goto err; -+ } -+ -+ data = bch2_dev_has_data(c, ca); -+ if (data) { -+ char data_has_str[100]; -+ -+ bch2_flags_to_text(&PBUF(data_has_str), -+ bch2_data_types, data); -+ bch_err(ca, "Remove failed, still has data (%s)", data_has_str); -+ ret = -EBUSY; -+ goto err; -+ } -+ -+ __bch2_dev_offline(c, ca); -+ -+ mutex_lock(&c->sb_lock); -+ rcu_assign_pointer(c->devs[ca->dev_idx], NULL); -+ mutex_unlock(&c->sb_lock); -+ -+ percpu_ref_kill(&ca->ref); -+ wait_for_completion(&ca->ref_completion); -+ -+ bch2_dev_free(ca); -+ -+ /* -+ * Free this device's slot in the bch_member array - all pointers to -+ * this device must be gone: -+ */ -+ mutex_lock(&c->sb_lock); -+ mi = bch2_sb_get_members(c->disk_sb.sb); -+ memset(&mi->members[dev_idx].uuid, 0, 
sizeof(mi->members[dev_idx].uuid)); -+ -+ bch2_write_super(c); -+ -+ mutex_unlock(&c->sb_lock); -+ up_write(&c->state_lock); -+ return 0; -+err: -+ if (ca->mi.state == BCH_MEMBER_STATE_RW && -+ !percpu_ref_is_zero(&ca->io_ref)) -+ __bch2_dev_read_write(c, ca); -+ up_write(&c->state_lock); -+ return ret; -+} -+ -+static void dev_usage_clear(struct bch_dev *ca) -+{ -+ struct bucket_array *buckets; -+ -+ percpu_memset(ca->usage[0], 0, sizeof(*ca->usage[0])); -+ -+ down_read(&ca->bucket_lock); -+ buckets = bucket_array(ca); -+ -+ memset(buckets->b, 0, sizeof(buckets->b[0]) * buckets->nbuckets); -+ up_read(&ca->bucket_lock); -+} -+ -+/* Add new device to running filesystem: */ -+int bch2_dev_add(struct bch_fs *c, const char *path) -+{ -+ struct bch_opts opts = bch2_opts_empty(); -+ struct bch_sb_handle sb; -+ const char *err; -+ struct bch_dev *ca = NULL; -+ struct bch_sb_field_members *mi; -+ struct bch_member dev_mi; -+ unsigned dev_idx, nr_devices, u64s; -+ int ret; -+ -+ ret = bch2_read_super(path, &opts, &sb); -+ if (ret) -+ return ret; -+ -+ err = bch2_sb_validate(&sb); -+ if (err) -+ return -EINVAL; -+ -+ dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx]; -+ -+ err = bch2_dev_may_add(sb.sb, c); -+ if (err) -+ return -EINVAL; -+ -+ ca = __bch2_dev_alloc(c, &dev_mi); -+ if (!ca) { -+ bch2_free_super(&sb); -+ return -ENOMEM; -+ } -+ -+ ret = __bch2_dev_attach_bdev(ca, &sb); -+ if (ret) { -+ bch2_dev_free(ca); -+ return ret; -+ } -+ -+ /* -+ * We want to allocate journal on the new device before adding the new -+ * device to the filesystem because allocating after we attach requires -+ * spinning up the allocator thread, and the allocator thread requires -+ * doing btree writes, which if the existing devices are RO isn't going -+ * to work -+ * -+ * So we have to mark where the superblocks are, but marking allocated -+ * data normally updates the filesystem usage too, so we have to mark, -+ * allocate the journal, reset all the marks, then remark after we 
-+ * attach... -+ */ -+ bch2_mark_dev_superblock(ca->fs, ca, 0); -+ -+ err = "journal alloc failed"; -+ ret = bch2_dev_journal_alloc(ca); -+ if (ret) -+ goto err; -+ -+ dev_usage_clear(ca); -+ -+ down_write(&c->state_lock); -+ mutex_lock(&c->sb_lock); -+ -+ err = "insufficient space in new superblock"; -+ ret = bch2_sb_from_fs(c, ca); -+ if (ret) -+ goto err_unlock; -+ -+ mi = bch2_sb_get_members(ca->disk_sb.sb); -+ -+ if (!bch2_sb_resize_members(&ca->disk_sb, -+ le32_to_cpu(mi->field.u64s) + -+ sizeof(dev_mi) / sizeof(u64))) { -+ ret = -ENOSPC; -+ goto err_unlock; -+ } -+ -+ if (dynamic_fault("bcachefs:add:no_slot")) -+ goto no_slot; -+ -+ mi = bch2_sb_get_members(c->disk_sb.sb); -+ for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) -+ if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx)) -+ goto have_slot; -+no_slot: -+ err = "no slots available in superblock"; -+ ret = -ENOSPC; -+ goto err_unlock; -+ -+have_slot: -+ nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices); -+ u64s = (sizeof(struct bch_sb_field_members) + -+ sizeof(struct bch_member) * nr_devices) / sizeof(u64); -+ -+ err = "no space in superblock for member info"; -+ ret = -ENOSPC; -+ -+ mi = bch2_sb_resize_members(&c->disk_sb, u64s); -+ if (!mi) -+ goto err_unlock; -+ -+ /* success: */ -+ -+ mi->members[dev_idx] = dev_mi; -+ mi->members[dev_idx].last_mount = cpu_to_le64(ktime_get_real_seconds()); -+ c->disk_sb.sb->nr_devices = nr_devices; -+ -+ ca->disk_sb.sb->dev_idx = dev_idx; -+ bch2_dev_attach(c, ca, dev_idx); -+ -+ bch2_mark_dev_superblock(c, ca, 0); -+ -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ if (ca->mi.state == BCH_MEMBER_STATE_RW) { -+ err = __bch2_dev_read_write(c, ca); -+ if (err) -+ goto err_late; -+ } -+ -+ up_write(&c->state_lock); -+ return 0; -+ -+err_unlock: -+ mutex_unlock(&c->sb_lock); -+ up_write(&c->state_lock); -+err: -+ if (ca) -+ bch2_dev_free(ca); -+ bch2_free_super(&sb); -+ bch_err(c, "Unable to add device: %s", err); -+ return ret; -+err_late: 
-+ bch_err(c, "Error going rw after adding device: %s", err); -+ return -EINVAL; -+} -+ -+/* Hot add existing device to running filesystem: */ -+int bch2_dev_online(struct bch_fs *c, const char *path) -+{ -+ struct bch_opts opts = bch2_opts_empty(); -+ struct bch_sb_handle sb = { NULL }; -+ struct bch_sb_field_members *mi; -+ struct bch_dev *ca; -+ unsigned dev_idx; -+ const char *err; -+ int ret; -+ -+ down_write(&c->state_lock); -+ -+ ret = bch2_read_super(path, &opts, &sb); -+ if (ret) { -+ up_write(&c->state_lock); -+ return ret; -+ } -+ -+ dev_idx = sb.sb->dev_idx; -+ -+ err = bch2_dev_in_fs(c->disk_sb.sb, sb.sb); -+ if (err) -+ goto err; -+ -+ if (bch2_dev_attach_bdev(c, &sb)) { -+ err = "bch2_dev_attach_bdev() error"; -+ goto err; -+ } -+ -+ ca = bch_dev_locked(c, dev_idx); -+ if (ca->mi.state == BCH_MEMBER_STATE_RW) { -+ err = __bch2_dev_read_write(c, ca); -+ if (err) -+ goto err; -+ } -+ -+ mutex_lock(&c->sb_lock); -+ mi = bch2_sb_get_members(c->disk_sb.sb); -+ -+ mi->members[ca->dev_idx].last_mount = -+ cpu_to_le64(ktime_get_real_seconds()); -+ -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ up_write(&c->state_lock); -+ return 0; -+err: -+ up_write(&c->state_lock); -+ bch2_free_super(&sb); -+ bch_err(c, "error bringing %s online: %s", path, err); -+ return -EINVAL; -+} -+ -+int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) -+{ -+ down_write(&c->state_lock); -+ -+ if (!bch2_dev_is_online(ca)) { -+ bch_err(ca, "Already offline"); -+ up_write(&c->state_lock); -+ return 0; -+ } -+ -+ if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) { -+ bch_err(ca, "Cannot offline required disk"); -+ up_write(&c->state_lock); -+ return -EINVAL; -+ } -+ -+ __bch2_dev_offline(c, ca); -+ -+ up_write(&c->state_lock); -+ return 0; -+} -+ -+int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) -+{ -+ struct bch_member *mi; -+ int ret = 0; -+ -+ down_write(&c->state_lock); -+ -+ if (nbuckets < ca->mi.nbuckets) { -+ 
bch_err(ca, "Cannot shrink yet"); -+ ret = -EINVAL; -+ goto err; -+ } -+ -+ if (bch2_dev_is_online(ca) && -+ get_capacity(ca->disk_sb.bdev->bd_disk) < -+ ca->mi.bucket_size * nbuckets) { -+ bch_err(ca, "New size larger than device"); -+ ret = -EINVAL; -+ goto err; -+ } -+ -+ ret = bch2_dev_buckets_resize(c, ca, nbuckets); -+ if (ret) { -+ bch_err(ca, "Resize error: %i", ret); -+ goto err; -+ } -+ -+ mutex_lock(&c->sb_lock); -+ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; -+ mi->nbuckets = cpu_to_le64(nbuckets); -+ -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ bch2_recalc_capacity(c); -+err: -+ up_write(&c->state_lock); -+ return ret; -+} -+ -+/* return with ref on ca->ref: */ -+struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *path) -+{ -+ struct block_device *bdev = lookup_bdev(path); -+ struct bch_dev *ca; -+ unsigned i; -+ -+ if (IS_ERR(bdev)) -+ return ERR_CAST(bdev); -+ -+ for_each_member_device(ca, c, i) -+ if (ca->disk_sb.bdev == bdev) -+ goto found; -+ -+ ca = ERR_PTR(-ENOENT); -+found: -+ bdput(bdev); -+ return ca; -+} -+ -+/* Filesystem open: */ -+ -+struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, -+ struct bch_opts opts) -+{ -+ struct bch_sb_handle *sb = NULL; -+ struct bch_fs *c = NULL; -+ struct bch_sb_field_members *mi; -+ unsigned i, best_sb = 0; -+ const char *err; -+ int ret = -ENOMEM; -+ -+ pr_verbose_init(opts, ""); -+ -+ if (!nr_devices) { -+ c = ERR_PTR(-EINVAL); -+ goto out2; -+ } -+ -+ if (!try_module_get(THIS_MODULE)) { -+ c = ERR_PTR(-ENODEV); -+ goto out2; -+ } -+ -+ sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL); -+ if (!sb) -+ goto err; -+ -+ for (i = 0; i < nr_devices; i++) { -+ ret = bch2_read_super(devices[i], &opts, &sb[i]); -+ if (ret) -+ goto err; -+ -+ err = bch2_sb_validate(&sb[i]); -+ if (err) -+ goto err_print; -+ } -+ -+ for (i = 1; i < nr_devices; i++) -+ if (le64_to_cpu(sb[i].sb->seq) > -+ le64_to_cpu(sb[best_sb].sb->seq)) -+ best_sb = i; -+ -+ mi = 
bch2_sb_get_members(sb[best_sb].sb); -+ -+ i = 0; -+ while (i < nr_devices) { -+ if (i != best_sb && -+ !bch2_dev_exists(sb[best_sb].sb, mi, sb[i].sb->dev_idx)) { -+ char buf[BDEVNAME_SIZE]; -+ pr_info("%s has been removed, skipping", -+ bdevname(sb[i].bdev, buf)); -+ bch2_free_super(&sb[i]); -+ array_remove_item(sb, nr_devices, i); -+ continue; -+ } -+ -+ err = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb); -+ if (err) -+ goto err_print; -+ i++; -+ } -+ -+ ret = -ENOMEM; -+ c = bch2_fs_alloc(sb[best_sb].sb, opts); -+ if (!c) -+ goto err; -+ -+ err = "bch2_dev_online() error"; -+ down_write(&c->state_lock); -+ for (i = 0; i < nr_devices; i++) -+ if (bch2_dev_attach_bdev(c, &sb[i])) { -+ up_write(&c->state_lock); -+ goto err_print; -+ } -+ up_write(&c->state_lock); -+ -+ err = "insufficient devices"; -+ if (!bch2_fs_may_start(c)) -+ goto err_print; -+ -+ if (!c->opts.nostart) { -+ ret = bch2_fs_start(c); -+ if (ret) -+ goto err; -+ } -+out: -+ kfree(sb); -+ module_put(THIS_MODULE); -+out2: -+ pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c)); -+ return c; -+err_print: -+ pr_err("bch_fs_open err opening %s: %s", -+ devices[0], err); -+ ret = -EINVAL; -+err: -+ if (c) -+ bch2_fs_stop(c); -+ for (i = 0; i < nr_devices; i++) -+ bch2_free_super(&sb[i]); -+ c = ERR_PTR(ret); -+ goto out; -+} -+ -+static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb, -+ struct bch_opts opts) -+{ -+ const char *err; -+ struct bch_fs *c; -+ bool allocated_fs = false; -+ int ret; -+ -+ err = bch2_sb_validate(sb); -+ if (err) -+ return err; -+ -+ mutex_lock(&bch_fs_list_lock); -+ c = __bch2_uuid_to_fs(sb->sb->uuid); -+ if (c) { -+ closure_get(&c->cl); -+ -+ err = bch2_dev_in_fs(c->disk_sb.sb, sb->sb); -+ if (err) -+ goto err; -+ } else { -+ c = bch2_fs_alloc(sb->sb, opts); -+ err = "cannot allocate memory"; -+ if (!c) -+ goto err; -+ -+ allocated_fs = true; -+ } -+ -+ err = "bch2_dev_online() error"; -+ -+ mutex_lock(&c->sb_lock); -+ if (bch2_dev_attach_bdev(c, sb)) { -+ 
mutex_unlock(&c->sb_lock); -+ goto err; -+ } -+ mutex_unlock(&c->sb_lock); -+ -+ if (!c->opts.nostart && bch2_fs_may_start(c)) { -+ err = "error starting filesystem"; -+ ret = bch2_fs_start(c); -+ if (ret) -+ goto err; -+ } -+ -+ closure_put(&c->cl); -+ mutex_unlock(&bch_fs_list_lock); -+ -+ return NULL; -+err: -+ mutex_unlock(&bch_fs_list_lock); -+ -+ if (allocated_fs) -+ bch2_fs_stop(c); -+ else if (c) -+ closure_put(&c->cl); -+ -+ return err; -+} -+ -+const char *bch2_fs_open_incremental(const char *path) -+{ -+ struct bch_sb_handle sb; -+ struct bch_opts opts = bch2_opts_empty(); -+ const char *err; -+ -+ if (bch2_read_super(path, &opts, &sb)) -+ return "error reading superblock"; -+ -+ err = __bch2_fs_open_incremental(&sb, opts); -+ bch2_free_super(&sb); -+ -+ return err; -+} -+ -+/* Global interfaces/init */ -+ -+static void bcachefs_exit(void) -+{ -+ bch2_debug_exit(); -+ bch2_vfs_exit(); -+ bch2_chardev_exit(); -+ if (bcachefs_kset) -+ kset_unregister(bcachefs_kset); -+} -+ -+static int __init bcachefs_init(void) -+{ -+ bch2_bkey_pack_test(); -+ bch2_inode_pack_test(); -+ -+ if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) || -+ bch2_chardev_init() || -+ bch2_vfs_init() || -+ bch2_debug_init()) -+ goto err; -+ -+ return 0; -+err: -+ bcachefs_exit(); -+ return -ENOMEM; -+} -+ -+#define BCH_DEBUG_PARAM(name, description) \ -+ bool bch2_##name; \ -+ module_param_named(name, bch2_##name, bool, 0644); \ -+ MODULE_PARM_DESC(name, description); -+BCH_DEBUG_PARAMS() -+#undef BCH_DEBUG_PARAM -+ -+module_exit(bcachefs_exit); -+module_init(bcachefs_init); -diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h -new file mode 100644 -index 000000000000..fffee96726ce ---- /dev/null -+++ b/fs/bcachefs/super.h -@@ -0,0 +1,240 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_SUPER_H -+#define _BCACHEFS_SUPER_H -+ -+#include "extents.h" -+ -+#include "bcachefs_ioctl.h" -+ -+#include -+ -+static inline size_t sector_to_bucket(const struct 
bch_dev *ca, sector_t s) -+{ -+ return div_u64(s, ca->mi.bucket_size); -+} -+ -+static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b) -+{ -+ return ((sector_t) b) * ca->mi.bucket_size; -+} -+ -+static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s) -+{ -+ u32 remainder; -+ -+ div_u64_rem(s, ca->mi.bucket_size, &remainder); -+ return remainder; -+} -+ -+static inline bool bch2_dev_is_online(struct bch_dev *ca) -+{ -+ return !percpu_ref_is_zero(&ca->io_ref); -+} -+ -+static inline bool bch2_dev_is_readable(struct bch_dev *ca) -+{ -+ return bch2_dev_is_online(ca) && -+ ca->mi.state != BCH_MEMBER_STATE_FAILED; -+} -+ -+static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw) -+{ -+ if (!percpu_ref_tryget(&ca->io_ref)) -+ return false; -+ -+ if (ca->mi.state == BCH_MEMBER_STATE_RW || -+ (ca->mi.state == BCH_MEMBER_STATE_RO && rw == READ)) -+ return true; -+ -+ percpu_ref_put(&ca->io_ref); -+ return false; -+} -+ -+static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs) -+{ -+ return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX); -+} -+ -+static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs, -+ unsigned dev) -+{ -+ unsigned i; -+ -+ for (i = 0; i < devs.nr; i++) -+ if (devs.devs[i] == dev) -+ return true; -+ -+ return false; -+} -+ -+static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs, -+ unsigned dev) -+{ -+ unsigned i; -+ -+ for (i = 0; i < devs->nr; i++) -+ if (devs->devs[i] == dev) { -+ array_remove_item(devs->devs, devs->nr, i); -+ return; -+ } -+} -+ -+static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs, -+ unsigned dev) -+{ -+ BUG_ON(bch2_dev_list_has_dev(*devs, dev)); -+ BUG_ON(devs->nr >= BCH_REPLICAS_MAX); -+ devs->devs[devs->nr++] = dev; -+} -+ -+static inline struct bch_devs_list bch2_dev_list_single(unsigned dev) -+{ -+ return (struct bch_devs_list) { .nr = 1, .devs[0] = dev }; -+} -+ -+static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, 
unsigned *iter, -+ const struct bch_devs_mask *mask) -+{ -+ struct bch_dev *ca = NULL; -+ -+ while ((*iter = mask -+ ? find_next_bit(mask->d, c->sb.nr_devices, *iter) -+ : *iter) < c->sb.nr_devices && -+ !(ca = rcu_dereference_check(c->devs[*iter], -+ lockdep_is_held(&c->state_lock)))) -+ (*iter)++; -+ -+ return ca; -+} -+ -+#define __for_each_member_device(ca, c, iter, mask) \ -+ for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++) -+ -+#define for_each_member_device_rcu(ca, c, iter, mask) \ -+ __for_each_member_device(ca, c, iter, mask) -+ -+static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter) -+{ -+ struct bch_dev *ca; -+ -+ rcu_read_lock(); -+ if ((ca = __bch2_next_dev(c, iter, NULL))) -+ percpu_ref_get(&ca->ref); -+ rcu_read_unlock(); -+ -+ return ca; -+} -+ -+/* -+ * If you break early, you must drop your ref on the current device -+ */ -+#define for_each_member_device(ca, c, iter) \ -+ for ((iter) = 0; \ -+ (ca = bch2_get_next_dev(c, &(iter))); \ -+ percpu_ref_put(&ca->ref), (iter)++) -+ -+static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, -+ unsigned *iter, -+ int state_mask) -+{ -+ struct bch_dev *ca; -+ -+ rcu_read_lock(); -+ while ((ca = __bch2_next_dev(c, iter, NULL)) && -+ (!((1 << ca->mi.state) & state_mask) || -+ !percpu_ref_tryget(&ca->io_ref))) -+ (*iter)++; -+ rcu_read_unlock(); -+ -+ return ca; -+} -+ -+#define __for_each_online_member(ca, c, iter, state_mask) \ -+ for ((iter) = 0; \ -+ (ca = bch2_get_next_online_dev(c, &(iter), state_mask)); \ -+ percpu_ref_put(&ca->io_ref), (iter)++) -+ -+#define for_each_online_member(ca, c, iter) \ -+ __for_each_online_member(ca, c, iter, ~0) -+ -+#define for_each_rw_member(ca, c, iter) \ -+ __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_RW) -+ -+#define for_each_readable_member(ca, c, iter) \ -+ __for_each_online_member(ca, c, iter, \ -+ (1 << BCH_MEMBER_STATE_RW)|(1 << BCH_MEMBER_STATE_RO)) -+ -+/* -+ * If a key exists that 
references a device, the device won't be going away and -+ * we can omit rcu_read_lock(): -+ */ -+static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx) -+{ -+ EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); -+ -+ return rcu_dereference_check(c->devs[idx], 1); -+} -+ -+static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx) -+{ -+ EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); -+ -+ return rcu_dereference_protected(c->devs[idx], -+ lockdep_is_held(&c->sb_lock) || -+ lockdep_is_held(&c->state_lock)); -+} -+ -+/* XXX kill, move to struct bch_fs */ -+static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) -+{ -+ struct bch_devs_mask devs; -+ struct bch_dev *ca; -+ unsigned i; -+ -+ memset(&devs, 0, sizeof(devs)); -+ for_each_online_member(ca, c, i) -+ __set_bit(ca->dev_idx, devs.d); -+ return devs; -+} -+ -+struct bch_fs *bch2_bdev_to_fs(struct block_device *); -+struct bch_fs *bch2_uuid_to_fs(uuid_le); -+int bch2_congested(void *, int); -+ -+bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *, -+ enum bch_member_state, int); -+int __bch2_dev_set_state(struct bch_fs *, struct bch_dev *, -+ enum bch_member_state, int); -+int bch2_dev_set_state(struct bch_fs *, struct bch_dev *, -+ enum bch_member_state, int); -+ -+int bch2_dev_fail(struct bch_dev *, int); -+int bch2_dev_remove(struct bch_fs *, struct bch_dev *, int); -+int bch2_dev_add(struct bch_fs *, const char *); -+int bch2_dev_online(struct bch_fs *, const char *); -+int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int); -+int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64); -+struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *); -+ -+bool bch2_fs_emergency_read_only(struct bch_fs *); -+void bch2_fs_read_only(struct bch_fs *); -+ -+int bch2_fs_read_write(struct bch_fs *); -+int bch2_fs_read_write_early(struct bch_fs *); -+ -+/* -+ * Only for use in the recovery/fsck path: -+ */ -+static inline void 
bch2_fs_lazy_rw(struct bch_fs *c) -+{ -+ if (percpu_ref_is_zero(&c->writes)) -+ bch2_fs_read_write_early(c); -+} -+ -+void bch2_fs_stop(struct bch_fs *); -+ -+int bch2_fs_start(struct bch_fs *); -+struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts); -+const char *bch2_fs_open_incremental(const char *path); -+ -+#endif /* _BCACHEFS_SUPER_H */ -diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h -new file mode 100644 -index 000000000000..20406ebd6f5b ---- /dev/null -+++ b/fs/bcachefs/super_types.h -@@ -0,0 +1,51 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_SUPER_TYPES_H -+#define _BCACHEFS_SUPER_TYPES_H -+ -+struct bch_sb_handle { -+ struct bch_sb *sb; -+ struct block_device *bdev; -+ struct bio *bio; -+ unsigned page_order; -+ fmode_t mode; -+ unsigned have_layout:1; -+ unsigned have_bio:1; -+ unsigned fs_sb:1; -+ u64 seq; -+}; -+ -+struct bch_devs_mask { -+ unsigned long d[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)]; -+}; -+ -+struct bch_devs_list { -+ u8 nr; -+ u8 devs[BCH_REPLICAS_MAX + 1]; -+}; -+ -+struct bch_member_cpu { -+ u64 nbuckets; /* device size */ -+ u16 first_bucket; /* index of first bucket used */ -+ u16 bucket_size; /* sectors */ -+ u16 group; -+ u8 state; -+ u8 replacement; -+ u8 discard; -+ u8 data_allowed; -+ u8 durability; -+ u8 valid; -+}; -+ -+struct bch_disk_group_cpu { -+ bool deleted; -+ u16 parent; -+ struct bch_devs_mask devs; -+}; -+ -+struct bch_disk_groups_cpu { -+ struct rcu_head rcu; -+ unsigned nr; -+ struct bch_disk_group_cpu entries[]; -+}; -+ -+#endif /* _BCACHEFS_SUPER_TYPES_H */ -diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c -new file mode 100644 -index 000000000000..0cb29f43d99d ---- /dev/null -+++ b/fs/bcachefs/sysfs.c -@@ -0,0 +1,1074 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * bcache sysfs interfaces -+ * -+ * Copyright 2010, 2011 Kent Overstreet -+ * Copyright 2012 Google, Inc. 
-+ */ -+ -+#ifndef NO_BCACHEFS_SYSFS -+ -+#include "bcachefs.h" -+#include "alloc_background.h" -+#include "sysfs.h" -+#include "btree_cache.h" -+#include "btree_io.h" -+#include "btree_iter.h" -+#include "btree_key_cache.h" -+#include "btree_update.h" -+#include "btree_update_interior.h" -+#include "btree_gc.h" -+#include "buckets.h" -+#include "clock.h" -+#include "disk_groups.h" -+#include "ec.h" -+#include "inode.h" -+#include "journal.h" -+#include "keylist.h" -+#include "move.h" -+#include "opts.h" -+#include "rebalance.h" -+#include "replicas.h" -+#include "super-io.h" -+#include "tests.h" -+ -+#include -+#include -+#include -+ -+#include "util.h" -+ -+#define SYSFS_OPS(type) \ -+struct sysfs_ops type ## _sysfs_ops = { \ -+ .show = type ## _show, \ -+ .store = type ## _store \ -+} -+ -+#define SHOW(fn) \ -+static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\ -+ char *buf) \ -+ -+#define STORE(fn) \ -+static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\ -+ const char *buf, size_t size) \ -+ -+#define __sysfs_attribute(_name, _mode) \ -+ static struct attribute sysfs_##_name = \ -+ { .name = #_name, .mode = _mode } -+ -+#define write_attribute(n) __sysfs_attribute(n, S_IWUSR) -+#define read_attribute(n) __sysfs_attribute(n, S_IRUGO) -+#define rw_attribute(n) __sysfs_attribute(n, S_IRUGO|S_IWUSR) -+ -+#define sysfs_printf(file, fmt, ...) 
\ -+do { \ -+ if (attr == &sysfs_ ## file) \ -+ return scnprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__);\ -+} while (0) -+ -+#define sysfs_print(file, var) \ -+do { \ -+ if (attr == &sysfs_ ## file) \ -+ return snprint(buf, PAGE_SIZE, var); \ -+} while (0) -+ -+#define sysfs_hprint(file, val) \ -+do { \ -+ if (attr == &sysfs_ ## file) { \ -+ bch2_hprint(&out, val); \ -+ pr_buf(&out, "\n"); \ -+ return out.pos - buf; \ -+ } \ -+} while (0) -+ -+#define var_printf(_var, fmt) sysfs_printf(_var, fmt, var(_var)) -+#define var_print(_var) sysfs_print(_var, var(_var)) -+#define var_hprint(_var) sysfs_hprint(_var, var(_var)) -+ -+#define sysfs_strtoul(file, var) \ -+do { \ -+ if (attr == &sysfs_ ## file) \ -+ return strtoul_safe(buf, var) ?: (ssize_t) size; \ -+} while (0) -+ -+#define sysfs_strtoul_clamp(file, var, min, max) \ -+do { \ -+ if (attr == &sysfs_ ## file) \ -+ return strtoul_safe_clamp(buf, var, min, max) \ -+ ?: (ssize_t) size; \ -+} while (0) -+ -+#define strtoul_or_return(cp) \ -+({ \ -+ unsigned long _v; \ -+ int _r = kstrtoul(cp, 10, &_v); \ -+ if (_r) \ -+ return _r; \ -+ _v; \ -+}) -+ -+#define strtoul_restrict_or_return(cp, min, max) \ -+({ \ -+ unsigned long __v = 0; \ -+ int _r = strtoul_safe_restrict(cp, __v, min, max); \ -+ if (_r) \ -+ return _r; \ -+ __v; \ -+}) -+ -+#define strtoi_h_or_return(cp) \ -+({ \ -+ u64 _v; \ -+ int _r = strtoi_h(cp, &_v); \ -+ if (_r) \ -+ return _r; \ -+ _v; \ -+}) -+ -+#define sysfs_hatoi(file, var) \ -+do { \ -+ if (attr == &sysfs_ ## file) \ -+ return strtoi_h(buf, &var) ?: (ssize_t) size; \ -+} while (0) -+ -+write_attribute(trigger_journal_flush); -+write_attribute(trigger_btree_coalesce); -+write_attribute(trigger_gc); -+write_attribute(prune_cache); -+rw_attribute(btree_gc_periodic); -+ -+read_attribute(uuid); -+read_attribute(minor); -+read_attribute(bucket_size); -+read_attribute(block_size); -+read_attribute(btree_node_size); -+read_attribute(first_bucket); -+read_attribute(nbuckets); 
-+read_attribute(durability); -+read_attribute(iodone); -+ -+read_attribute(io_latency_read); -+read_attribute(io_latency_write); -+read_attribute(io_latency_stats_read); -+read_attribute(io_latency_stats_write); -+read_attribute(congested); -+ -+read_attribute(bucket_quantiles_last_read); -+read_attribute(bucket_quantiles_last_write); -+read_attribute(bucket_quantiles_fragmentation); -+read_attribute(bucket_quantiles_oldest_gen); -+ -+read_attribute(reserve_stats); -+read_attribute(btree_cache_size); -+read_attribute(compression_stats); -+read_attribute(journal_debug); -+read_attribute(journal_pins); -+read_attribute(btree_updates); -+read_attribute(dirty_btree_nodes); -+read_attribute(btree_key_cache); -+read_attribute(btree_transactions); -+read_attribute(stripes_heap); -+ -+read_attribute(internal_uuid); -+ -+read_attribute(has_data); -+read_attribute(alloc_debug); -+write_attribute(wake_allocator); -+ -+read_attribute(read_realloc_races); -+read_attribute(extent_migrate_done); -+read_attribute(extent_migrate_raced); -+ -+rw_attribute(journal_write_delay_ms); -+rw_attribute(journal_reclaim_delay_ms); -+ -+rw_attribute(discard); -+rw_attribute(cache_replacement_policy); -+rw_attribute(label); -+ -+rw_attribute(copy_gc_enabled); -+sysfs_pd_controller_attribute(copy_gc); -+ -+rw_attribute(rebalance_enabled); -+sysfs_pd_controller_attribute(rebalance); -+read_attribute(rebalance_work); -+rw_attribute(promote_whole_extents); -+ -+read_attribute(new_stripes); -+ -+rw_attribute(pd_controllers_update_seconds); -+ -+read_attribute(meta_replicas_have); -+read_attribute(data_replicas_have); -+ -+read_attribute(io_timers_read); -+read_attribute(io_timers_write); -+ -+#ifdef CONFIG_BCACHEFS_TESTS -+write_attribute(perf_test); -+#endif /* CONFIG_BCACHEFS_TESTS */ -+ -+#define BCH_DEBUG_PARAM(name, description) \ -+ rw_attribute(name); -+ -+ BCH_DEBUG_PARAMS() -+#undef BCH_DEBUG_PARAM -+ -+#define x(_name) \ -+ static struct attribute sysfs_time_stat_##_name = \ -+ { .name = 
#_name, .mode = S_IRUGO }; -+ BCH_TIME_STATS() -+#undef x -+ -+static struct attribute sysfs_state_rw = { -+ .name = "state", -+ .mode = S_IRUGO -+}; -+ -+static size_t bch2_btree_cache_size(struct bch_fs *c) -+{ -+ size_t ret = 0; -+ struct btree *b; -+ -+ mutex_lock(&c->btree_cache.lock); -+ list_for_each_entry(b, &c->btree_cache.live, list) -+ ret += btree_bytes(c); -+ -+ mutex_unlock(&c->btree_cache.lock); -+ return ret; -+} -+ -+static int fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+ struct bch_fs_usage *fs_usage = bch2_fs_usage_read(c); -+ -+ if (!fs_usage) -+ return -ENOMEM; -+ -+ bch2_fs_usage_to_text(out, c, fs_usage); -+ -+ percpu_up_read(&c->mark_lock); -+ -+ kfree(fs_usage); -+ return 0; -+} -+ -+static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ u64 nr_uncompressed_extents = 0, uncompressed_sectors = 0, -+ nr_compressed_extents = 0, -+ compressed_sectors_compressed = 0, -+ compressed_sectors_uncompressed = 0; -+ int ret; -+ -+ if (!test_bit(BCH_FS_STARTED, &c->flags)) -+ return -EPERM; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, 0, k, ret) -+ if (k.k->type == KEY_TYPE_extent) { -+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ -+ extent_for_each_ptr_decode(e, p, entry) { -+ if (!crc_is_compressed(p.crc)) { -+ nr_uncompressed_extents++; -+ uncompressed_sectors += e.k->size; -+ } else { -+ nr_compressed_extents++; -+ compressed_sectors_compressed += -+ p.crc.compressed_size; -+ compressed_sectors_uncompressed += -+ p.crc.uncompressed_size; -+ } -+ -+ /* only looking at the first ptr */ -+ break; -+ } -+ } -+ -+ ret = bch2_trans_exit(&trans) ?: ret; -+ if (ret) -+ return ret; -+ -+ pr_buf(out, -+ "uncompressed data:\n" -+ " nr extents: %llu\n" -+ " size (bytes): %llu\n" -+ 
"compressed data:\n" -+ " nr extents: %llu\n" -+ " compressed size (bytes): %llu\n" -+ " uncompressed size (bytes): %llu\n", -+ nr_uncompressed_extents, -+ uncompressed_sectors << 9, -+ nr_compressed_extents, -+ compressed_sectors_compressed << 9, -+ compressed_sectors_uncompressed << 9); -+ return 0; -+} -+ -+SHOW(bch2_fs) -+{ -+ struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); -+ struct printbuf out = _PBUF(buf, PAGE_SIZE); -+ -+ sysfs_print(minor, c->minor); -+ sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); -+ -+ sysfs_print(journal_write_delay_ms, c->journal.write_delay_ms); -+ sysfs_print(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms); -+ -+ sysfs_print(block_size, block_bytes(c)); -+ sysfs_print(btree_node_size, btree_bytes(c)); -+ sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c)); -+ -+ sysfs_print(read_realloc_races, -+ atomic_long_read(&c->read_realloc_races)); -+ sysfs_print(extent_migrate_done, -+ atomic_long_read(&c->extent_migrate_done)); -+ sysfs_print(extent_migrate_raced, -+ atomic_long_read(&c->extent_migrate_raced)); -+ -+ sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic); -+ -+ sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); -+ -+ sysfs_print(pd_controllers_update_seconds, -+ c->pd_controllers_update_seconds); -+ -+ sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled); -+ sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */ -+ sysfs_pd_controller_show(copy_gc, &c->copygc_pd); -+ -+ if (attr == &sysfs_rebalance_work) { -+ bch2_rebalance_work_to_text(&out, c); -+ return out.pos - buf; -+ } -+ -+ sysfs_print(promote_whole_extents, c->promote_whole_extents); -+ -+ sysfs_printf(meta_replicas_have, "%i", bch2_replicas_online(c, true)); -+ sysfs_printf(data_replicas_have, "%i", bch2_replicas_online(c, false)); -+ -+ /* Debugging: */ -+ -+ if (attr == &sysfs_alloc_debug) -+ return fs_alloc_debug_to_text(&out, c) ?: out.pos - buf; -+ -+ if (attr == &sysfs_journal_debug) { -+ 
bch2_journal_debug_to_text(&out, &c->journal); -+ return out.pos - buf; -+ } -+ -+ if (attr == &sysfs_journal_pins) { -+ bch2_journal_pins_to_text(&out, &c->journal); -+ return out.pos - buf; -+ } -+ -+ if (attr == &sysfs_btree_updates) { -+ bch2_btree_updates_to_text(&out, c); -+ return out.pos - buf; -+ } -+ -+ if (attr == &sysfs_dirty_btree_nodes) { -+ bch2_dirty_btree_nodes_to_text(&out, c); -+ return out.pos - buf; -+ } -+ -+ if (attr == &sysfs_btree_key_cache) { -+ bch2_btree_key_cache_to_text(&out, &c->btree_key_cache); -+ return out.pos - buf; -+ } -+ -+ if (attr == &sysfs_btree_transactions) { -+ bch2_btree_trans_to_text(&out, c); -+ return out.pos - buf; -+ } -+ -+ if (attr == &sysfs_stripes_heap) { -+ bch2_stripes_heap_to_text(&out, c); -+ return out.pos - buf; -+ } -+ -+ if (attr == &sysfs_compression_stats) { -+ bch2_compression_stats_to_text(&out, c); -+ return out.pos - buf; -+ } -+ -+ if (attr == &sysfs_new_stripes) { -+ bch2_new_stripes_to_text(&out, c); -+ return out.pos - buf; -+ } -+ -+ if (attr == &sysfs_io_timers_read) { -+ bch2_io_timers_to_text(&out, &c->io_clock[READ]); -+ return out.pos - buf; -+ } -+ if (attr == &sysfs_io_timers_write) { -+ bch2_io_timers_to_text(&out, &c->io_clock[WRITE]); -+ return out.pos - buf; -+ } -+ -+#define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name); -+ BCH_DEBUG_PARAMS() -+#undef BCH_DEBUG_PARAM -+ -+ return 0; -+} -+ -+STORE(bch2_fs) -+{ -+ struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); -+ -+ sysfs_strtoul(journal_write_delay_ms, c->journal.write_delay_ms); -+ sysfs_strtoul(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms); -+ -+ if (attr == &sysfs_btree_gc_periodic) { -+ ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic) -+ ?: (ssize_t) size; -+ -+ wake_up_process(c->gc_thread); -+ return ret; -+ } -+ -+ if (attr == &sysfs_copy_gc_enabled) { -+ ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled) -+ ?: (ssize_t) size; -+ -+ if (c->copygc_thread) -+ 
wake_up_process(c->copygc_thread); -+ return ret; -+ } -+ -+ if (attr == &sysfs_rebalance_enabled) { -+ ssize_t ret = strtoul_safe(buf, c->rebalance.enabled) -+ ?: (ssize_t) size; -+ -+ rebalance_wakeup(c); -+ return ret; -+ } -+ -+ sysfs_strtoul(pd_controllers_update_seconds, -+ c->pd_controllers_update_seconds); -+ sysfs_pd_controller_store(rebalance, &c->rebalance.pd); -+ sysfs_pd_controller_store(copy_gc, &c->copygc_pd); -+ -+ sysfs_strtoul(promote_whole_extents, c->promote_whole_extents); -+ -+ /* Debugging: */ -+ -+#define BCH_DEBUG_PARAM(name, description) sysfs_strtoul(name, c->name); -+ BCH_DEBUG_PARAMS() -+#undef BCH_DEBUG_PARAM -+ -+ if (!test_bit(BCH_FS_STARTED, &c->flags)) -+ return -EPERM; -+ -+ /* Debugging: */ -+ -+ if (attr == &sysfs_trigger_journal_flush) -+ bch2_journal_meta_async(&c->journal, NULL); -+ -+ if (attr == &sysfs_trigger_btree_coalesce) -+ bch2_coalesce(c); -+ -+ if (attr == &sysfs_trigger_gc) { -+ /* -+ * Full gc is currently incompatible with btree key cache: -+ */ -+#if 0 -+ down_read(&c->state_lock); -+ bch2_gc(c, NULL, false, false); -+ up_read(&c->state_lock); -+#else -+ bch2_gc_gens(c); -+#endif -+ } -+ -+ if (attr == &sysfs_prune_cache) { -+ struct shrink_control sc; -+ -+ sc.gfp_mask = GFP_KERNEL; -+ sc.nr_to_scan = strtoul_or_return(buf); -+ c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc); -+ } -+ -+#ifdef CONFIG_BCACHEFS_TESTS -+ if (attr == &sysfs_perf_test) { -+ char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; -+ char *test = strsep(&p, " \t\n"); -+ char *nr_str = strsep(&p, " \t\n"); -+ char *threads_str = strsep(&p, " \t\n"); -+ unsigned threads; -+ u64 nr; -+ int ret = -EINVAL; -+ -+ if (threads_str && -+ !(ret = kstrtouint(threads_str, 10, &threads)) && -+ !(ret = bch2_strtoull_h(nr_str, &nr))) -+ bch2_btree_perf_test(c, test, nr, threads); -+ else -+ size = ret; -+ kfree(tmp); -+ } -+#endif -+ return size; -+} -+SYSFS_OPS(bch2_fs); -+ -+struct attribute *bch2_fs_files[] = { -+ &sysfs_minor, -+ 
&sysfs_block_size, -+ &sysfs_btree_node_size, -+ &sysfs_btree_cache_size, -+ -+ &sysfs_meta_replicas_have, -+ &sysfs_data_replicas_have, -+ -+ &sysfs_journal_write_delay_ms, -+ &sysfs_journal_reclaim_delay_ms, -+ -+ &sysfs_promote_whole_extents, -+ -+ &sysfs_compression_stats, -+ -+#ifdef CONFIG_BCACHEFS_TESTS -+ &sysfs_perf_test, -+#endif -+ NULL -+}; -+ -+/* internal dir - just a wrapper */ -+ -+SHOW(bch2_fs_internal) -+{ -+ struct bch_fs *c = container_of(kobj, struct bch_fs, internal); -+ return bch2_fs_show(&c->kobj, attr, buf); -+} -+ -+STORE(bch2_fs_internal) -+{ -+ struct bch_fs *c = container_of(kobj, struct bch_fs, internal); -+ return bch2_fs_store(&c->kobj, attr, buf, size); -+} -+SYSFS_OPS(bch2_fs_internal); -+ -+struct attribute *bch2_fs_internal_files[] = { -+ &sysfs_alloc_debug, -+ &sysfs_journal_debug, -+ &sysfs_journal_pins, -+ &sysfs_btree_updates, -+ &sysfs_dirty_btree_nodes, -+ &sysfs_btree_key_cache, -+ &sysfs_btree_transactions, -+ &sysfs_stripes_heap, -+ -+ &sysfs_read_realloc_races, -+ &sysfs_extent_migrate_done, -+ &sysfs_extent_migrate_raced, -+ -+ &sysfs_trigger_journal_flush, -+ &sysfs_trigger_btree_coalesce, -+ &sysfs_trigger_gc, -+ &sysfs_prune_cache, -+ -+ &sysfs_copy_gc_enabled, -+ -+ &sysfs_rebalance_enabled, -+ &sysfs_rebalance_work, -+ sysfs_pd_controller_files(rebalance), -+ sysfs_pd_controller_files(copy_gc), -+ -+ &sysfs_new_stripes, -+ -+ &sysfs_io_timers_read, -+ &sysfs_io_timers_write, -+ -+ &sysfs_internal_uuid, -+ -+#define BCH_DEBUG_PARAM(name, description) &sysfs_##name, -+ BCH_DEBUG_PARAMS() -+#undef BCH_DEBUG_PARAM -+ -+ NULL -+}; -+ -+/* options */ -+ -+SHOW(bch2_fs_opts_dir) -+{ -+ struct printbuf out = _PBUF(buf, PAGE_SIZE); -+ struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); -+ const struct bch_option *opt = container_of(attr, struct bch_option, attr); -+ int id = opt - bch2_opt_table; -+ u64 v = bch2_opt_get_by_id(&c->opts, id); -+ -+ bch2_opt_to_text(&out, c, opt, v, OPT_SHOW_FULL_LIST); -+ 
pr_buf(&out, "\n"); -+ -+ return out.pos - buf; -+} -+ -+STORE(bch2_fs_opts_dir) -+{ -+ struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); -+ const struct bch_option *opt = container_of(attr, struct bch_option, attr); -+ int ret, id = opt - bch2_opt_table; -+ char *tmp; -+ u64 v; -+ -+ tmp = kstrdup(buf, GFP_KERNEL); -+ if (!tmp) -+ return -ENOMEM; -+ -+ ret = bch2_opt_parse(c, opt, strim(tmp), &v); -+ kfree(tmp); -+ -+ if (ret < 0) -+ return ret; -+ -+ ret = bch2_opt_check_may_set(c, id, v); -+ if (ret < 0) -+ return ret; -+ -+ if (opt->set_sb != SET_NO_SB_OPT) { -+ mutex_lock(&c->sb_lock); -+ opt->set_sb(c->disk_sb.sb, v); -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ } -+ -+ bch2_opt_set_by_id(&c->opts, id, v); -+ -+ if ((id == Opt_background_target || -+ id == Opt_background_compression) && v) { -+ bch2_rebalance_add_work(c, S64_MAX); -+ rebalance_wakeup(c); -+ } -+ -+ return size; -+} -+SYSFS_OPS(bch2_fs_opts_dir); -+ -+struct attribute *bch2_fs_opts_dir_files[] = { NULL }; -+ -+int bch2_opts_create_sysfs_files(struct kobject *kobj) -+{ -+ const struct bch_option *i; -+ int ret; -+ -+ for (i = bch2_opt_table; -+ i < bch2_opt_table + bch2_opts_nr; -+ i++) { -+ if (!(i->mode & (OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME))) -+ continue; -+ -+ ret = sysfs_create_file(kobj, &i->attr); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+/* time stats */ -+ -+SHOW(bch2_fs_time_stats) -+{ -+ struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats); -+ struct printbuf out = _PBUF(buf, PAGE_SIZE); -+ -+#define x(name) \ -+ if (attr == &sysfs_time_stat_##name) { \ -+ bch2_time_stats_to_text(&out, &c->times[BCH_TIME_##name]);\ -+ return out.pos - buf; \ -+ } -+ BCH_TIME_STATS() -+#undef x -+ -+ return 0; -+} -+ -+STORE(bch2_fs_time_stats) -+{ -+ return size; -+} -+SYSFS_OPS(bch2_fs_time_stats); -+ -+struct attribute *bch2_fs_time_stats_files[] = { -+#define x(name) \ -+ &sysfs_time_stat_##name, -+ BCH_TIME_STATS() -+#undef x -+ NULL -+}; -+ 
-+typedef unsigned (bucket_map_fn)(struct bch_fs *, struct bch_dev *, -+ size_t, void *); -+ -+static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca, -+ size_t b, void *private) -+{ -+ int rw = (private ? 1 : 0); -+ -+ return bucket_last_io(c, bucket(ca, b), rw); -+} -+ -+static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca, -+ size_t b, void *private) -+{ -+ struct bucket *g = bucket(ca, b); -+ return bucket_sectors_used(g->mark); -+} -+ -+static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca, -+ size_t b, void *private) -+{ -+ return bucket_gc_gen(ca, b); -+} -+ -+static int unsigned_cmp(const void *_l, const void *_r) -+{ -+ const unsigned *l = _l; -+ const unsigned *r = _r; -+ -+ return cmp_int(*l, *r); -+} -+ -+static int quantiles_to_text(struct printbuf *out, -+ struct bch_fs *c, struct bch_dev *ca, -+ bucket_map_fn *fn, void *private) -+{ -+ size_t i, n; -+ /* Compute 31 quantiles */ -+ unsigned q[31], *p; -+ -+ down_read(&ca->bucket_lock); -+ n = ca->mi.nbuckets; -+ -+ p = vzalloc(n * sizeof(unsigned)); -+ if (!p) { -+ up_read(&ca->bucket_lock); -+ return -ENOMEM; -+ } -+ -+ for (i = ca->mi.first_bucket; i < n; i++) -+ p[i] = fn(c, ca, i, private); -+ -+ sort(p, n, sizeof(unsigned), unsigned_cmp, NULL); -+ up_read(&ca->bucket_lock); -+ -+ while (n && -+ !p[n - 1]) -+ --n; -+ -+ for (i = 0; i < ARRAY_SIZE(q); i++) -+ q[i] = p[n * (i + 1) / (ARRAY_SIZE(q) + 1)]; -+ -+ vfree(p); -+ -+ for (i = 0; i < ARRAY_SIZE(q); i++) -+ pr_buf(out, "%u ", q[i]); -+ pr_buf(out, "\n"); -+ return 0; -+} -+ -+static void reserve_stats_to_text(struct printbuf *out, struct bch_dev *ca) -+{ -+ enum alloc_reserve i; -+ -+ spin_lock(&ca->fs->freelist_lock); -+ -+ pr_buf(out, "free_inc:\t%zu\t%zu\n", -+ fifo_used(&ca->free_inc), -+ ca->free_inc.size); -+ -+ for (i = 0; i < RESERVE_NR; i++) -+ pr_buf(out, "free[%u]:\t%zu\t%zu\n", i, -+ fifo_used(&ca->free[i]), -+ ca->free[i].size); -+ -+ spin_unlock(&ca->fs->freelist_lock); 
-+} -+ -+static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) -+{ -+ struct bch_fs *c = ca->fs; -+ struct bch_dev_usage stats = bch2_dev_usage_read(ca); -+ unsigned i, nr[BCH_DATA_NR]; -+ -+ memset(nr, 0, sizeof(nr)); -+ -+ for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++) -+ nr[c->open_buckets[i].type]++; -+ -+ pr_buf(out, -+ "free_inc: %zu/%zu\n" -+ "free[RESERVE_BTREE]: %zu/%zu\n" -+ "free[RESERVE_MOVINGGC]: %zu/%zu\n" -+ "free[RESERVE_NONE]: %zu/%zu\n" -+ "buckets:\n" -+ " capacity: %llu\n" -+ " alloc: %llu\n" -+ " sb: %llu\n" -+ " journal: %llu\n" -+ " meta: %llu\n" -+ " user: %llu\n" -+ " cached: %llu\n" -+ " erasure coded: %llu\n" -+ " available: %lli\n" -+ "sectors:\n" -+ " sb: %llu\n" -+ " journal: %llu\n" -+ " meta: %llu\n" -+ " user: %llu\n" -+ " cached: %llu\n" -+ " erasure coded: %llu\n" -+ " fragmented: %llu\n" -+ " copygc threshold: %llu\n" -+ "freelist_wait: %s\n" -+ "open buckets: %u/%u (reserved %u)\n" -+ "open_buckets_wait: %s\n" -+ "open_buckets_btree: %u\n" -+ "open_buckets_user: %u\n" -+ "btree reserve cache: %u\n", -+ fifo_used(&ca->free_inc), ca->free_inc.size, -+ fifo_used(&ca->free[RESERVE_BTREE]), ca->free[RESERVE_BTREE].size, -+ fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, -+ fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size, -+ ca->mi.nbuckets - ca->mi.first_bucket, -+ stats.buckets_alloc, -+ stats.buckets[BCH_DATA_sb], -+ stats.buckets[BCH_DATA_journal], -+ stats.buckets[BCH_DATA_btree], -+ stats.buckets[BCH_DATA_user], -+ stats.buckets[BCH_DATA_cached], -+ stats.buckets_ec, -+ __dev_buckets_available(ca, stats), -+ stats.sectors[BCH_DATA_sb], -+ stats.sectors[BCH_DATA_journal], -+ stats.sectors[BCH_DATA_btree], -+ stats.sectors[BCH_DATA_user], -+ stats.sectors[BCH_DATA_cached], -+ stats.sectors_ec, -+ stats.sectors_fragmented, -+ c->copygc_threshold, -+ c->freelist_wait.list.first ? 
"waiting" : "empty", -+ c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, -+ BTREE_NODE_OPEN_BUCKET_RESERVE, -+ c->open_buckets_wait.list.first ? "waiting" : "empty", -+ nr[BCH_DATA_btree], -+ nr[BCH_DATA_user], -+ c->btree_reserve_cache_nr); -+} -+ -+static const char * const bch2_rw[] = { -+ "read", -+ "write", -+ NULL -+}; -+ -+static void dev_iodone_to_text(struct printbuf *out, struct bch_dev *ca) -+{ -+ int rw, i; -+ -+ for (rw = 0; rw < 2; rw++) { -+ pr_buf(out, "%s:\n", bch2_rw[rw]); -+ -+ for (i = 1; i < BCH_DATA_NR; i++) -+ pr_buf(out, "%-12s:%12llu\n", -+ bch2_data_types[i], -+ percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9); -+ } -+} -+ -+SHOW(bch2_dev) -+{ -+ struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); -+ struct bch_fs *c = ca->fs; -+ struct printbuf out = _PBUF(buf, PAGE_SIZE); -+ -+ sysfs_printf(uuid, "%pU\n", ca->uuid.b); -+ -+ sysfs_print(bucket_size, bucket_bytes(ca)); -+ sysfs_print(block_size, block_bytes(c)); -+ sysfs_print(first_bucket, ca->mi.first_bucket); -+ sysfs_print(nbuckets, ca->mi.nbuckets); -+ sysfs_print(durability, ca->mi.durability); -+ sysfs_print(discard, ca->mi.discard); -+ -+ if (attr == &sysfs_label) { -+ if (ca->mi.group) { -+ mutex_lock(&c->sb_lock); -+ bch2_disk_path_to_text(&out, &c->disk_sb, -+ ca->mi.group - 1); -+ mutex_unlock(&c->sb_lock); -+ } -+ -+ pr_buf(&out, "\n"); -+ return out.pos - buf; -+ } -+ -+ if (attr == &sysfs_has_data) { -+ bch2_flags_to_text(&out, bch2_data_types, -+ bch2_dev_has_data(c, ca)); -+ pr_buf(&out, "\n"); -+ return out.pos - buf; -+ } -+ -+ if (attr == &sysfs_cache_replacement_policy) { -+ bch2_string_opt_to_text(&out, -+ bch2_cache_replacement_policies, -+ ca->mi.replacement); -+ pr_buf(&out, "\n"); -+ return out.pos - buf; -+ } -+ -+ if (attr == &sysfs_state_rw) { -+ bch2_string_opt_to_text(&out, bch2_dev_state, -+ ca->mi.state); -+ pr_buf(&out, "\n"); -+ return out.pos - buf; -+ } -+ -+ if (attr == &sysfs_iodone) { -+ dev_iodone_to_text(&out, ca); -+ return out.pos - buf; 
-+ } -+ -+ sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ])); -+ sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE])); -+ -+ if (attr == &sysfs_io_latency_stats_read) { -+ bch2_time_stats_to_text(&out, &ca->io_latency[READ]); -+ return out.pos - buf; -+ } -+ if (attr == &sysfs_io_latency_stats_write) { -+ bch2_time_stats_to_text(&out, &ca->io_latency[WRITE]); -+ return out.pos - buf; -+ } -+ -+ sysfs_printf(congested, "%u%%", -+ clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) -+ * 100 / CONGESTED_MAX); -+ -+ if (attr == &sysfs_bucket_quantiles_last_read) -+ return quantiles_to_text(&out, c, ca, bucket_last_io_fn, (void *) 0) ?: out.pos - buf; -+ if (attr == &sysfs_bucket_quantiles_last_write) -+ return quantiles_to_text(&out, c, ca, bucket_last_io_fn, (void *) 1) ?: out.pos - buf; -+ if (attr == &sysfs_bucket_quantiles_fragmentation) -+ return quantiles_to_text(&out, c, ca, bucket_sectors_used_fn, NULL) ?: out.pos - buf; -+ if (attr == &sysfs_bucket_quantiles_oldest_gen) -+ return quantiles_to_text(&out, c, ca, bucket_oldest_gen_fn, NULL) ?: out.pos - buf; -+ -+ if (attr == &sysfs_reserve_stats) { -+ reserve_stats_to_text(&out, ca); -+ return out.pos - buf; -+ } -+ if (attr == &sysfs_alloc_debug) { -+ dev_alloc_debug_to_text(&out, ca); -+ return out.pos - buf; -+ } -+ -+ return 0; -+} -+ -+STORE(bch2_dev) -+{ -+ struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); -+ struct bch_fs *c = ca->fs; -+ struct bch_member *mi; -+ -+ if (attr == &sysfs_discard) { -+ bool v = strtoul_or_return(buf); -+ -+ mutex_lock(&c->sb_lock); -+ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; -+ -+ if (v != BCH_MEMBER_DISCARD(mi)) { -+ SET_BCH_MEMBER_DISCARD(mi, v); -+ bch2_write_super(c); -+ } -+ mutex_unlock(&c->sb_lock); -+ } -+ -+ if (attr == &sysfs_cache_replacement_policy) { -+ ssize_t v = __sysfs_match_string(bch2_cache_replacement_policies, -1, buf); -+ -+ if (v < 0) -+ return v; -+ -+ mutex_lock(&c->sb_lock); -+ 
mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; -+ -+ if ((unsigned) v != BCH_MEMBER_REPLACEMENT(mi)) { -+ SET_BCH_MEMBER_REPLACEMENT(mi, v); -+ bch2_write_super(c); -+ } -+ mutex_unlock(&c->sb_lock); -+ } -+ -+ if (attr == &sysfs_label) { -+ char *tmp; -+ int ret; -+ -+ tmp = kstrdup(buf, GFP_KERNEL); -+ if (!tmp) -+ return -ENOMEM; -+ -+ ret = bch2_dev_group_set(c, ca, strim(tmp)); -+ kfree(tmp); -+ if (ret) -+ return ret; -+ } -+ -+ if (attr == &sysfs_wake_allocator) -+ bch2_wake_allocator(ca); -+ -+ return size; -+} -+SYSFS_OPS(bch2_dev); -+ -+struct attribute *bch2_dev_files[] = { -+ &sysfs_uuid, -+ &sysfs_bucket_size, -+ &sysfs_block_size, -+ &sysfs_first_bucket, -+ &sysfs_nbuckets, -+ &sysfs_durability, -+ -+ /* settings: */ -+ &sysfs_discard, -+ &sysfs_cache_replacement_policy, -+ &sysfs_state_rw, -+ &sysfs_label, -+ -+ &sysfs_has_data, -+ &sysfs_iodone, -+ -+ &sysfs_io_latency_read, -+ &sysfs_io_latency_write, -+ &sysfs_io_latency_stats_read, -+ &sysfs_io_latency_stats_write, -+ &sysfs_congested, -+ -+ /* alloc info - other stats: */ -+ &sysfs_bucket_quantiles_last_read, -+ &sysfs_bucket_quantiles_last_write, -+ &sysfs_bucket_quantiles_fragmentation, -+ &sysfs_bucket_quantiles_oldest_gen, -+ -+ &sysfs_reserve_stats, -+ -+ /* debug: */ -+ &sysfs_alloc_debug, -+ &sysfs_wake_allocator, -+ NULL -+}; -+ -+#endif /* _BCACHEFS_SYSFS_H_ */ -diff --git a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h -new file mode 100644 -index 000000000000..525fd05d91f7 ---- /dev/null -+++ b/fs/bcachefs/sysfs.h -@@ -0,0 +1,44 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_SYSFS_H_ -+#define _BCACHEFS_SYSFS_H_ -+ -+#include -+ -+#ifndef NO_BCACHEFS_SYSFS -+ -+struct attribute; -+struct sysfs_ops; -+ -+extern struct attribute *bch2_fs_files[]; -+extern struct attribute *bch2_fs_internal_files[]; -+extern struct attribute *bch2_fs_opts_dir_files[]; -+extern struct attribute *bch2_fs_time_stats_files[]; -+extern struct attribute *bch2_dev_files[]; -+ 
-+extern struct sysfs_ops bch2_fs_sysfs_ops; -+extern struct sysfs_ops bch2_fs_internal_sysfs_ops; -+extern struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; -+extern struct sysfs_ops bch2_fs_time_stats_sysfs_ops; -+extern struct sysfs_ops bch2_dev_sysfs_ops; -+ -+int bch2_opts_create_sysfs_files(struct kobject *); -+ -+#else -+ -+static struct attribute *bch2_fs_files[] = {}; -+static struct attribute *bch2_fs_internal_files[] = {}; -+static struct attribute *bch2_fs_opts_dir_files[] = {}; -+static struct attribute *bch2_fs_time_stats_files[] = {}; -+static struct attribute *bch2_dev_files[] = {}; -+ -+static const struct sysfs_ops bch2_fs_sysfs_ops; -+static const struct sysfs_ops bch2_fs_internal_sysfs_ops; -+static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; -+static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops; -+static const struct sysfs_ops bch2_dev_sysfs_ops; -+ -+static inline int bch2_opts_create_sysfs_files(struct kobject *kobj) { return 0; } -+ -+#endif /* NO_BCACHEFS_SYSFS */ -+ -+#endif /* _BCACHEFS_SYSFS_H_ */ -diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c -new file mode 100644 -index 000000000000..4dcace650416 ---- /dev/null -+++ b/fs/bcachefs/tests.c -@@ -0,0 +1,725 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#ifdef CONFIG_BCACHEFS_TESTS -+ -+#include "bcachefs.h" -+#include "btree_update.h" -+#include "journal_reclaim.h" -+#include "tests.h" -+ -+#include "linux/kthread.h" -+#include "linux/random.h" -+ -+static void delete_test_keys(struct bch_fs *c) -+{ -+ int ret; -+ -+ ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS, -+ POS(0, 0), POS(0, U64_MAX), -+ NULL); -+ BUG_ON(ret); -+ -+ ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS, -+ POS(0, 0), POS(0, U64_MAX), -+ NULL); -+ BUG_ON(ret); -+} -+ -+/* unit tests */ -+ -+static void test_delete(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_i_cookie k; -+ int ret; -+ -+ bkey_cookie_init(&k.k_i); -+ -+ bch2_trans_init(&trans, c, 
0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, k.k.p, -+ BTREE_ITER_INTENT); -+ -+ ret = bch2_btree_iter_traverse(iter); -+ BUG_ON(ret); -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, -+ bch2_trans_update(&trans, iter, &k.k_i, 0)); -+ BUG_ON(ret); -+ -+ pr_info("deleting once"); -+ ret = bch2_btree_delete_at(&trans, iter, 0); -+ BUG_ON(ret); -+ -+ pr_info("deleting twice"); -+ ret = bch2_btree_delete_at(&trans, iter, 0); -+ BUG_ON(ret); -+ -+ bch2_trans_exit(&trans); -+} -+ -+static void test_delete_written(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_i_cookie k; -+ int ret; -+ -+ bkey_cookie_init(&k.k_i); -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, k.k.p, -+ BTREE_ITER_INTENT); -+ -+ ret = bch2_btree_iter_traverse(iter); -+ BUG_ON(ret); -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, -+ bch2_trans_update(&trans, iter, &k.k_i, 0)); -+ BUG_ON(ret); -+ -+ bch2_journal_flush_all_pins(&c->journal); -+ -+ ret = bch2_btree_delete_at(&trans, iter, 0); -+ BUG_ON(ret); -+ -+ bch2_trans_exit(&trans); -+} -+ -+static void test_iterate(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ u64 i; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ delete_test_keys(c); -+ -+ pr_info("inserting test keys"); -+ -+ for (i = 0; i < nr; i++) { -+ struct bkey_i_cookie k; -+ -+ bkey_cookie_init(&k.k_i); -+ k.k.p.offset = i; -+ -+ ret = bch2_btree_insert(c, BTREE_ID_XATTRS, &k.k_i, -+ NULL, NULL, 0); -+ BUG_ON(ret); -+ } -+ -+ pr_info("iterating forwards"); -+ -+ i = 0; -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, -+ POS_MIN, 0, k, ret) { -+ if (k.k->p.inode) -+ break; -+ -+ BUG_ON(k.k->p.offset != i++); -+ } -+ -+ BUG_ON(i != nr); -+ -+ pr_info("iterating backwards"); -+ -+ while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(iter)).k)) -+ BUG_ON(k.k->p.offset != --i); -+ -+ 
BUG_ON(i); -+ -+ bch2_trans_exit(&trans); -+} -+ -+static void test_iterate_extents(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ u64 i; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ delete_test_keys(c); -+ -+ pr_info("inserting test extents"); -+ -+ for (i = 0; i < nr; i += 8) { -+ struct bkey_i_cookie k; -+ -+ bkey_cookie_init(&k.k_i); -+ k.k.p.offset = i + 8; -+ k.k.size = 8; -+ -+ ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, -+ NULL, NULL, 0); -+ BUG_ON(ret); -+ } -+ -+ pr_info("iterating forwards"); -+ -+ i = 0; -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, -+ POS_MIN, 0, k, ret) { -+ BUG_ON(bkey_start_offset(k.k) != i); -+ i = k.k->p.offset; -+ } -+ -+ BUG_ON(i != nr); -+ -+ pr_info("iterating backwards"); -+ -+ while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(iter)).k)) { -+ BUG_ON(k.k->p.offset != i); -+ i = bkey_start_offset(k.k); -+ } -+ -+ BUG_ON(i); -+ -+ bch2_trans_exit(&trans); -+} -+ -+static void test_iterate_slots(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ u64 i; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ delete_test_keys(c); -+ -+ pr_info("inserting test keys"); -+ -+ for (i = 0; i < nr; i++) { -+ struct bkey_i_cookie k; -+ -+ bkey_cookie_init(&k.k_i); -+ k.k.p.offset = i * 2; -+ -+ ret = bch2_btree_insert(c, BTREE_ID_XATTRS, &k.k_i, -+ NULL, NULL, 0); -+ BUG_ON(ret); -+ } -+ -+ pr_info("iterating forwards"); -+ -+ i = 0; -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, -+ 0, k, ret) { -+ if (k.k->p.inode) -+ break; -+ -+ BUG_ON(k.k->p.offset != i); -+ i += 2; -+ } -+ bch2_trans_iter_free(&trans, iter); -+ -+ BUG_ON(i != nr * 2); -+ -+ pr_info("iterating forwards by slots"); -+ -+ i = 0; -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, -+ BTREE_ITER_SLOTS, k, ret) { -+ BUG_ON(k.k->p.offset != i); -+ BUG_ON(bkey_deleted(k.k) != (i & 1)); 
-+ -+ i++; -+ if (i == nr * 2) -+ break; -+ } -+ -+ bch2_trans_exit(&trans); -+} -+ -+static void test_iterate_slots_extents(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ u64 i; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ delete_test_keys(c); -+ -+ pr_info("inserting test keys"); -+ -+ for (i = 0; i < nr; i += 16) { -+ struct bkey_i_cookie k; -+ -+ bkey_cookie_init(&k.k_i); -+ k.k.p.offset = i + 16; -+ k.k.size = 8; -+ -+ ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, -+ NULL, NULL, 0); -+ BUG_ON(ret); -+ } -+ -+ pr_info("iterating forwards"); -+ -+ i = 0; -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, -+ 0, k, ret) { -+ BUG_ON(bkey_start_offset(k.k) != i + 8); -+ BUG_ON(k.k->size != 8); -+ i += 16; -+ } -+ bch2_trans_iter_free(&trans, iter); -+ -+ BUG_ON(i != nr); -+ -+ pr_info("iterating forwards by slots"); -+ -+ i = 0; -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, -+ BTREE_ITER_SLOTS, k, ret) { -+ BUG_ON(bkey_deleted(k.k) != !(i % 16)); -+ -+ BUG_ON(bkey_start_offset(k.k) != i); -+ BUG_ON(k.k->size != 8); -+ i = k.k->p.offset; -+ -+ if (i == nr) -+ break; -+ } -+ -+ bch2_trans_exit(&trans); -+} -+ -+/* -+ * XXX: we really want to make sure we've got a btree with depth > 0 for these -+ * tests -+ */ -+static void test_peek_end(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, POS_MIN, 0); -+ -+ k = bch2_btree_iter_peek(iter); -+ BUG_ON(k.k); -+ -+ k = bch2_btree_iter_peek(iter); -+ BUG_ON(k.k); -+ -+ bch2_trans_exit(&trans); -+} -+ -+static void test_peek_end_extents(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, 0); 
-+ -+ k = bch2_btree_iter_peek(iter); -+ BUG_ON(k.k); -+ -+ k = bch2_btree_iter_peek(iter); -+ BUG_ON(k.k); -+ -+ bch2_trans_exit(&trans); -+} -+ -+/* extent unit tests */ -+ -+u64 test_version; -+ -+static void insert_test_extent(struct bch_fs *c, -+ u64 start, u64 end) -+{ -+ struct bkey_i_cookie k; -+ int ret; -+ -+ //pr_info("inserting %llu-%llu v %llu", start, end, test_version); -+ -+ bkey_cookie_init(&k.k_i); -+ k.k_i.k.p.offset = end; -+ k.k_i.k.size = end - start; -+ k.k_i.k.version.lo = test_version++; -+ -+ ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, -+ NULL, NULL, 0); -+ BUG_ON(ret); -+} -+ -+static void __test_extent_overwrite(struct bch_fs *c, -+ u64 e1_start, u64 e1_end, -+ u64 e2_start, u64 e2_end) -+{ -+ insert_test_extent(c, e1_start, e1_end); -+ insert_test_extent(c, e2_start, e2_end); -+ -+ delete_test_keys(c); -+} -+ -+static void test_extent_overwrite_front(struct bch_fs *c, u64 nr) -+{ -+ __test_extent_overwrite(c, 0, 64, 0, 32); -+ __test_extent_overwrite(c, 8, 64, 0, 32); -+} -+ -+static void test_extent_overwrite_back(struct bch_fs *c, u64 nr) -+{ -+ __test_extent_overwrite(c, 0, 64, 32, 64); -+ __test_extent_overwrite(c, 0, 64, 32, 72); -+} -+ -+static void test_extent_overwrite_middle(struct bch_fs *c, u64 nr) -+{ -+ __test_extent_overwrite(c, 0, 64, 32, 40); -+} -+ -+static void test_extent_overwrite_all(struct bch_fs *c, u64 nr) -+{ -+ __test_extent_overwrite(c, 32, 64, 0, 64); -+ __test_extent_overwrite(c, 32, 64, 0, 128); -+ __test_extent_overwrite(c, 32, 64, 32, 64); -+ __test_extent_overwrite(c, 32, 64, 32, 128); -+} -+ -+/* perf tests */ -+ -+static u64 test_rand(void) -+{ -+ u64 v; -+#if 0 -+ v = prandom_u32(); -+#else -+ prandom_bytes(&v, sizeof(v)); -+#endif -+ return v; -+} -+ -+static void rand_insert(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct bkey_i_cookie k; -+ int ret; -+ u64 i; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for (i = 0; i < nr; i++) { -+ bkey_cookie_init(&k.k_i); -+ 
k.k.p.offset = test_rand(); -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, -+ __bch2_btree_insert(&trans, BTREE_ID_XATTRS, &k.k_i)); -+ -+ BUG_ON(ret); -+ } -+ -+ bch2_trans_exit(&trans); -+} -+ -+static void rand_lookup(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ u64 i; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for (i = 0; i < nr; i++) { -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, -+ POS(0, test_rand()), 0); -+ -+ k = bch2_btree_iter_peek(iter); -+ bch2_trans_iter_free(&trans, iter); -+ } -+ -+ bch2_trans_exit(&trans); -+} -+ -+static void rand_mixed(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int ret; -+ u64 i; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for (i = 0; i < nr; i++) { -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, -+ POS(0, test_rand()), 0); -+ -+ k = bch2_btree_iter_peek(iter); -+ -+ if (!(i & 3) && k.k) { -+ struct bkey_i_cookie k; -+ -+ bkey_cookie_init(&k.k_i); -+ k.k.p = iter->pos; -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, -+ bch2_trans_update(&trans, iter, &k.k_i, 0)); -+ -+ BUG_ON(ret); -+ } -+ -+ bch2_trans_iter_free(&trans, iter); -+ } -+ -+ bch2_trans_exit(&trans); -+} -+ -+static int __do_delete(struct btree_trans *trans, struct bpos pos) -+{ -+ struct btree_iter *iter; -+ struct bkey_i delete; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ iter = bch2_trans_get_iter(trans, BTREE_ID_XATTRS, pos, -+ BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(iter); -+ if (ret) -+ goto err; -+ -+ k = bch2_btree_iter_peek(iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ bkey_init(&delete.k); -+ delete.k.p = k.k->p; -+ -+ bch2_trans_update(trans, iter, &delete, 0); -+err: -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+static void rand_delete(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ int ret; -+ u64 i; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ 
for (i = 0; i < nr; i++) { -+ struct bpos pos = POS(0, test_rand()); -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, -+ __do_delete(&trans, pos)); -+ BUG_ON(ret); -+ } -+ -+ bch2_trans_exit(&trans); -+} -+ -+static void seq_insert(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct bkey_i_cookie insert; -+ int ret; -+ u64 i = 0; -+ -+ bkey_cookie_init(&insert.k_i); -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { -+ insert.k.p = iter->pos; -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, -+ bch2_trans_update(&trans, iter, &insert.k_i, 0)); -+ -+ BUG_ON(ret); -+ -+ if (++i == nr) -+ break; -+ } -+ bch2_trans_exit(&trans); -+} -+ -+static void seq_lookup(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, 0, k, ret) -+ ; -+ bch2_trans_exit(&trans); -+} -+ -+static void seq_overwrite(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, -+ BTREE_ITER_INTENT, k, ret) { -+ struct bkey_i_cookie u; -+ -+ bkey_reassemble(&u.k_i, k); -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, -+ bch2_trans_update(&trans, iter, &u.k_i, 0)); -+ -+ BUG_ON(ret); -+ } -+ bch2_trans_exit(&trans); -+} -+ -+static void seq_delete(struct bch_fs *c, u64 nr) -+{ -+ int ret; -+ -+ ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS, -+ POS(0, 0), POS(0, U64_MAX), -+ NULL); -+ BUG_ON(ret); -+} -+ -+typedef void (*perf_test_fn)(struct bch_fs *, u64); -+ -+struct test_job { -+ struct bch_fs *c; -+ u64 nr; -+ unsigned nr_threads; -+ perf_test_fn fn; -+ -+ atomic_t ready; -+ 
wait_queue_head_t ready_wait; -+ -+ atomic_t done; -+ struct completion done_completion; -+ -+ u64 start; -+ u64 finish; -+}; -+ -+static int btree_perf_test_thread(void *data) -+{ -+ struct test_job *j = data; -+ -+ if (atomic_dec_and_test(&j->ready)) { -+ wake_up(&j->ready_wait); -+ j->start = sched_clock(); -+ } else { -+ wait_event(j->ready_wait, !atomic_read(&j->ready)); -+ } -+ -+ j->fn(j->c, j->nr / j->nr_threads); -+ -+ if (atomic_dec_and_test(&j->done)) { -+ j->finish = sched_clock(); -+ complete(&j->done_completion); -+ } -+ -+ return 0; -+} -+ -+void bch2_btree_perf_test(struct bch_fs *c, const char *testname, -+ u64 nr, unsigned nr_threads) -+{ -+ struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads }; -+ char name_buf[20], nr_buf[20], per_sec_buf[20]; -+ unsigned i; -+ u64 time; -+ -+ atomic_set(&j.ready, nr_threads); -+ init_waitqueue_head(&j.ready_wait); -+ -+ atomic_set(&j.done, nr_threads); -+ init_completion(&j.done_completion); -+ -+#define perf_test(_test) \ -+ if (!strcmp(testname, #_test)) j.fn = _test -+ -+ perf_test(rand_insert); -+ perf_test(rand_lookup); -+ perf_test(rand_mixed); -+ perf_test(rand_delete); -+ -+ perf_test(seq_insert); -+ perf_test(seq_lookup); -+ perf_test(seq_overwrite); -+ perf_test(seq_delete); -+ -+ /* a unit test, not a perf test: */ -+ perf_test(test_delete); -+ perf_test(test_delete_written); -+ perf_test(test_iterate); -+ perf_test(test_iterate_extents); -+ perf_test(test_iterate_slots); -+ perf_test(test_iterate_slots_extents); -+ perf_test(test_peek_end); -+ perf_test(test_peek_end_extents); -+ -+ perf_test(test_extent_overwrite_front); -+ perf_test(test_extent_overwrite_back); -+ perf_test(test_extent_overwrite_middle); -+ perf_test(test_extent_overwrite_all); -+ -+ if (!j.fn) { -+ pr_err("unknown test %s", testname); -+ return; -+ } -+ -+ //pr_info("running test %s:", testname); -+ -+ if (nr_threads == 1) -+ btree_perf_test_thread(&j); -+ else -+ for (i = 0; i < nr_threads; i++) -+ 
kthread_run(btree_perf_test_thread, &j, -+ "bcachefs perf test[%u]", i); -+ -+ while (wait_for_completion_interruptible(&j.done_completion)) -+ ; -+ -+ time = j.finish - j.start; -+ -+ scnprintf(name_buf, sizeof(name_buf), "%s:", testname); -+ bch2_hprint(&PBUF(nr_buf), nr); -+ bch2_hprint(&PBUF(per_sec_buf), nr * NSEC_PER_SEC / time); -+ printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n", -+ name_buf, nr_buf, nr_threads, -+ time / NSEC_PER_SEC, -+ time * nr_threads / nr, -+ per_sec_buf); -+} -+ -+#endif /* CONFIG_BCACHEFS_TESTS */ -diff --git a/fs/bcachefs/tests.h b/fs/bcachefs/tests.h -new file mode 100644 -index 000000000000..551d0764225e ---- /dev/null -+++ b/fs/bcachefs/tests.h -@@ -0,0 +1,15 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_TEST_H -+#define _BCACHEFS_TEST_H -+ -+struct bch_fs; -+ -+#ifdef CONFIG_BCACHEFS_TESTS -+ -+void bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned); -+ -+#else -+ -+#endif /* CONFIG_BCACHEFS_TESTS */ -+ -+#endif /* _BCACHEFS_TEST_H */ -diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c -new file mode 100644 -index 000000000000..59e8dfa3d245 ---- /dev/null -+++ b/fs/bcachefs/trace.c -@@ -0,0 +1,12 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "alloc_types.h" -+#include "buckets.h" -+#include "btree_types.h" -+#include "keylist.h" -+ -+#include -+#include "keylist.h" -+ -+#define CREATE_TRACE_POINTS -+#include -diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c -new file mode 100644 -index 000000000000..fd4044a6a08f ---- /dev/null -+++ b/fs/bcachefs/util.c -@@ -0,0 +1,907 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * random utiility code, for bcache but in theory not specific to bcache -+ * -+ * Copyright 2010, 2011 Kent Overstreet -+ * Copyright 2012 Google, Inc. 
-+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include "eytzinger.h" -+#include "util.h" -+ -+static const char si_units[] = "?kMGTPEZY"; -+ -+static int __bch2_strtoh(const char *cp, u64 *res, -+ u64 t_max, bool t_signed) -+{ -+ bool positive = *cp != '-'; -+ unsigned u; -+ u64 v = 0; -+ -+ if (*cp == '+' || *cp == '-') -+ cp++; -+ -+ if (!isdigit(*cp)) -+ return -EINVAL; -+ -+ do { -+ if (v > U64_MAX / 10) -+ return -ERANGE; -+ v *= 10; -+ if (v > U64_MAX - (*cp - '0')) -+ return -ERANGE; -+ v += *cp - '0'; -+ cp++; -+ } while (isdigit(*cp)); -+ -+ for (u = 1; u < strlen(si_units); u++) -+ if (*cp == si_units[u]) { -+ cp++; -+ goto got_unit; -+ } -+ u = 0; -+got_unit: -+ if (*cp == '\n') -+ cp++; -+ if (*cp) -+ return -EINVAL; -+ -+ if (fls64(v) + u * 10 > 64) -+ return -ERANGE; -+ -+ v <<= u * 10; -+ -+ if (positive) { -+ if (v > t_max) -+ return -ERANGE; -+ } else { -+ if (v && !t_signed) -+ return -ERANGE; -+ -+ if (v > t_max + 1) -+ return -ERANGE; -+ v = -v; -+ } -+ -+ *res = v; -+ return 0; -+} -+ -+#define STRTO_H(name, type) \ -+int bch2_ ## name ## _h(const char *cp, type *res) \ -+{ \ -+ u64 v; \ -+ int ret = __bch2_strtoh(cp, &v, ANYSINT_MAX(type), \ -+ ANYSINT_MAX(type) != ((type) ~0ULL)); \ -+ *res = v; \ -+ return ret; \ -+} -+ -+STRTO_H(strtoint, int) -+STRTO_H(strtouint, unsigned int) -+STRTO_H(strtoll, long long) -+STRTO_H(strtoull, unsigned long long) -+STRTO_H(strtou64, u64) -+ -+void bch2_hprint(struct printbuf *buf, s64 v) -+{ -+ int u, t = 0; -+ -+ for (u = 0; v >= 1024 || v <= -1024; u++) { -+ t = v & ~(~0U << 10); -+ v >>= 10; -+ } -+ -+ pr_buf(buf, "%lli", v); -+ -+ /* -+ * 103 is magic: t is in the range [-1023, 1023] and we want -+ * to turn it into [-9, 9] -+ */ -+ if (u && v < 100 && v > -100) -+ pr_buf(buf, ".%i", t / 103); -+ if (u) -+ pr_buf(buf, "%c", si_units[u]); -+} -+ -+void 
bch2_string_opt_to_text(struct printbuf *out, -+ const char * const list[], -+ size_t selected) -+{ -+ size_t i; -+ -+ for (i = 0; list[i]; i++) -+ pr_buf(out, i == selected ? "[%s] " : "%s ", list[i]); -+} -+ -+void bch2_flags_to_text(struct printbuf *out, -+ const char * const list[], u64 flags) -+{ -+ unsigned bit, nr = 0; -+ bool first = true; -+ -+ if (out->pos != out->end) -+ *out->pos = '\0'; -+ -+ while (list[nr]) -+ nr++; -+ -+ while (flags && (bit = __ffs(flags)) < nr) { -+ if (!first) -+ pr_buf(out, ","); -+ first = false; -+ pr_buf(out, "%s", list[bit]); -+ flags ^= 1 << bit; -+ } -+} -+ -+u64 bch2_read_flag_list(char *opt, const char * const list[]) -+{ -+ u64 ret = 0; -+ char *p, *s, *d = kstrndup(opt, PAGE_SIZE - 1, GFP_KERNEL); -+ -+ if (!d) -+ return -ENOMEM; -+ -+ s = strim(d); -+ -+ while ((p = strsep(&s, ","))) { -+ int flag = match_string(list, -1, p); -+ if (flag < 0) { -+ ret = -1; -+ break; -+ } -+ -+ ret |= 1 << flag; -+ } -+ -+ kfree(d); -+ -+ return ret; -+} -+ -+bool bch2_is_zero(const void *_p, size_t n) -+{ -+ const char *p = _p; -+ size_t i; -+ -+ for (i = 0; i < n; i++) -+ if (p[i]) -+ return false; -+ return true; -+} -+ -+static void bch2_quantiles_update(struct quantiles *q, u64 v) -+{ -+ unsigned i = 0; -+ -+ while (i < ARRAY_SIZE(q->entries)) { -+ struct quantile_entry *e = q->entries + i; -+ -+ if (unlikely(!e->step)) { -+ e->m = v; -+ e->step = max_t(unsigned, v / 2, 1024); -+ } else if (e->m > v) { -+ e->m = e->m >= e->step -+ ? e->m - e->step -+ : 0; -+ } else if (e->m < v) { -+ e->m = e->m + e->step > e->m -+ ? e->m + e->step -+ : U32_MAX; -+ } -+ -+ if ((e->m > v ? e->m - v : v - e->m) < e->step) -+ e->step = max_t(unsigned, e->step / 2, 1); -+ -+ if (v >= e->m) -+ break; -+ -+ i = eytzinger0_child(i, v > e->m); -+ } -+} -+ -+/* time stats: */ -+ -+static void bch2_time_stats_update_one(struct time_stats *stats, -+ u64 start, u64 end) -+{ -+ u64 duration, freq; -+ -+ duration = time_after64(end, start) -+ ? 
end - start : 0; -+ freq = time_after64(end, stats->last_event) -+ ? end - stats->last_event : 0; -+ -+ stats->count++; -+ -+ stats->average_duration = stats->average_duration -+ ? ewma_add(stats->average_duration, duration, 6) -+ : duration; -+ -+ stats->average_frequency = stats->average_frequency -+ ? ewma_add(stats->average_frequency, freq, 6) -+ : freq; -+ -+ stats->max_duration = max(stats->max_duration, duration); -+ -+ stats->last_event = end; -+ -+ bch2_quantiles_update(&stats->quantiles, duration); -+} -+ -+void __bch2_time_stats_update(struct time_stats *stats, u64 start, u64 end) -+{ -+ unsigned long flags; -+ -+ if (!stats->buffer) { -+ spin_lock_irqsave(&stats->lock, flags); -+ bch2_time_stats_update_one(stats, start, end); -+ -+ if (stats->average_frequency < 32 && -+ stats->count > 1024) -+ stats->buffer = -+ alloc_percpu_gfp(struct time_stat_buffer, -+ GFP_ATOMIC); -+ spin_unlock_irqrestore(&stats->lock, flags); -+ } else { -+ struct time_stat_buffer_entry *i; -+ struct time_stat_buffer *b; -+ -+ preempt_disable(); -+ b = this_cpu_ptr(stats->buffer); -+ -+ BUG_ON(b->nr >= ARRAY_SIZE(b->entries)); -+ b->entries[b->nr++] = (struct time_stat_buffer_entry) { -+ .start = start, -+ .end = end -+ }; -+ -+ if (b->nr == ARRAY_SIZE(b->entries)) { -+ spin_lock_irqsave(&stats->lock, flags); -+ for (i = b->entries; -+ i < b->entries + ARRAY_SIZE(b->entries); -+ i++) -+ bch2_time_stats_update_one(stats, i->start, i->end); -+ spin_unlock_irqrestore(&stats->lock, flags); -+ -+ b->nr = 0; -+ } -+ -+ preempt_enable(); -+ } -+} -+ -+static const struct time_unit { -+ const char *name; -+ u32 nsecs; -+} time_units[] = { -+ { "ns", 1 }, -+ { "us", NSEC_PER_USEC }, -+ { "ms", NSEC_PER_MSEC }, -+ { "sec", NSEC_PER_SEC }, -+}; -+ -+static const struct time_unit *pick_time_units(u64 ns) -+{ -+ const struct time_unit *u; -+ -+ for (u = time_units; -+ u + 1 < time_units + ARRAY_SIZE(time_units) && -+ ns >= u[1].nsecs << 1; -+ u++) -+ ; -+ -+ return u; -+} -+ -+static void 
pr_time_units(struct printbuf *out, u64 ns) -+{ -+ const struct time_unit *u = pick_time_units(ns); -+ -+ pr_buf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); -+} -+ -+void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats) -+{ -+ const struct time_unit *u; -+ u64 freq = READ_ONCE(stats->average_frequency); -+ u64 q, last_q = 0; -+ int i; -+ -+ pr_buf(out, "count:\t\t%llu\n", -+ stats->count); -+ pr_buf(out, "rate:\t\t%llu/sec\n", -+ freq ? div64_u64(NSEC_PER_SEC, freq) : 0); -+ -+ pr_buf(out, "frequency:\t"); -+ pr_time_units(out, freq); -+ -+ pr_buf(out, "\navg duration:\t"); -+ pr_time_units(out, stats->average_duration); -+ -+ pr_buf(out, "\nmax duration:\t"); -+ pr_time_units(out, stats->max_duration); -+ -+ i = eytzinger0_first(NR_QUANTILES); -+ u = pick_time_units(stats->quantiles.entries[i].m); -+ -+ pr_buf(out, "\nquantiles (%s):\t", u->name); -+ eytzinger0_for_each(i, NR_QUANTILES) { -+ bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; -+ -+ q = max(stats->quantiles.entries[i].m, last_q); -+ pr_buf(out, "%llu%s", -+ div_u64(q, u->nsecs), -+ is_last ? "\n" : " "); -+ last_q = q; -+ } -+} -+ -+void bch2_time_stats_exit(struct time_stats *stats) -+{ -+ free_percpu(stats->buffer); -+} -+ -+void bch2_time_stats_init(struct time_stats *stats) -+{ -+ memset(stats, 0, sizeof(*stats)); -+ spin_lock_init(&stats->lock); -+} -+ -+/* ratelimit: */ -+ -+/** -+ * bch2_ratelimit_delay() - return how long to delay until the next time to do -+ * some work -+ * -+ * @d - the struct bch_ratelimit to update -+ * -+ * Returns the amount of time to delay by, in jiffies -+ */ -+u64 bch2_ratelimit_delay(struct bch_ratelimit *d) -+{ -+ u64 now = local_clock(); -+ -+ return time_after64(d->next, now) -+ ? 
nsecs_to_jiffies(d->next - now) -+ : 0; -+} -+ -+/** -+ * bch2_ratelimit_increment() - increment @d by the amount of work done -+ * -+ * @d - the struct bch_ratelimit to update -+ * @done - the amount of work done, in arbitrary units -+ */ -+void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done) -+{ -+ u64 now = local_clock(); -+ -+ d->next += div_u64(done * NSEC_PER_SEC, d->rate); -+ -+ if (time_before64(now + NSEC_PER_SEC, d->next)) -+ d->next = now + NSEC_PER_SEC; -+ -+ if (time_after64(now - NSEC_PER_SEC * 2, d->next)) -+ d->next = now - NSEC_PER_SEC * 2; -+} -+ -+/* pd controller: */ -+ -+/* -+ * Updates pd_controller. Attempts to scale inputed values to units per second. -+ * @target: desired value -+ * @actual: current value -+ * -+ * @sign: 1 or -1; 1 if increasing the rate makes actual go up, -1 if increasing -+ * it makes actual go down. -+ */ -+void bch2_pd_controller_update(struct bch_pd_controller *pd, -+ s64 target, s64 actual, int sign) -+{ -+ s64 proportional, derivative, change; -+ -+ unsigned long seconds_since_update = (jiffies - pd->last_update) / HZ; -+ -+ if (seconds_since_update == 0) -+ return; -+ -+ pd->last_update = jiffies; -+ -+ proportional = actual - target; -+ proportional *= seconds_since_update; -+ proportional = div_s64(proportional, pd->p_term_inverse); -+ -+ derivative = actual - pd->last_actual; -+ derivative = div_s64(derivative, seconds_since_update); -+ derivative = ewma_add(pd->smoothed_derivative, derivative, -+ (pd->d_term / seconds_since_update) ?: 1); -+ derivative = derivative * pd->d_term; -+ derivative = div_s64(derivative, pd->p_term_inverse); -+ -+ change = proportional + derivative; -+ -+ /* Don't increase rate if not keeping up */ -+ if (change > 0 && -+ pd->backpressure && -+ time_after64(local_clock(), -+ pd->rate.next + NSEC_PER_MSEC)) -+ change = 0; -+ -+ change *= (sign * -1); -+ -+ pd->rate.rate = clamp_t(s64, (s64) pd->rate.rate + change, -+ 1, UINT_MAX); -+ -+ pd->last_actual = actual; -+ 
pd->last_derivative = derivative; -+ pd->last_proportional = proportional; -+ pd->last_change = change; -+ pd->last_target = target; -+} -+ -+void bch2_pd_controller_init(struct bch_pd_controller *pd) -+{ -+ pd->rate.rate = 1024; -+ pd->last_update = jiffies; -+ pd->p_term_inverse = 6000; -+ pd->d_term = 30; -+ pd->d_smooth = pd->d_term; -+ pd->backpressure = 1; -+} -+ -+size_t bch2_pd_controller_print_debug(struct bch_pd_controller *pd, char *buf) -+{ -+ /* 2^64 - 1 is 20 digits, plus null byte */ -+ char rate[21]; -+ char actual[21]; -+ char target[21]; -+ char proportional[21]; -+ char derivative[21]; -+ char change[21]; -+ s64 next_io; -+ -+ bch2_hprint(&PBUF(rate), pd->rate.rate); -+ bch2_hprint(&PBUF(actual), pd->last_actual); -+ bch2_hprint(&PBUF(target), pd->last_target); -+ bch2_hprint(&PBUF(proportional), pd->last_proportional); -+ bch2_hprint(&PBUF(derivative), pd->last_derivative); -+ bch2_hprint(&PBUF(change), pd->last_change); -+ -+ next_io = div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC); -+ -+ return sprintf(buf, -+ "rate:\t\t%s/sec\n" -+ "target:\t\t%s\n" -+ "actual:\t\t%s\n" -+ "proportional:\t%s\n" -+ "derivative:\t%s\n" -+ "change:\t\t%s/sec\n" -+ "next io:\t%llims\n", -+ rate, target, actual, proportional, -+ derivative, change, next_io); -+} -+ -+/* misc: */ -+ -+void bch2_bio_map(struct bio *bio, void *base, size_t size) -+{ -+ while (size) { -+ struct page *page = is_vmalloc_addr(base) -+ ? 
vmalloc_to_page(base) -+ : virt_to_page(base); -+ unsigned offset = offset_in_page(base); -+ unsigned len = min_t(size_t, PAGE_SIZE - offset, size); -+ -+ BUG_ON(!bio_add_page(bio, page, len, offset)); -+ size -= len; -+ base += len; -+ } -+} -+ -+int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask) -+{ -+ while (size) { -+ struct page *page = alloc_page(gfp_mask); -+ unsigned len = min(PAGE_SIZE, size); -+ -+ if (!page) -+ return -ENOMEM; -+ -+ BUG_ON(!bio_add_page(bio, page, len, 0)); -+ size -= len; -+ } -+ -+ return 0; -+} -+ -+size_t bch2_rand_range(size_t max) -+{ -+ size_t rand; -+ -+ if (!max) -+ return 0; -+ -+ do { -+ rand = get_random_long(); -+ rand &= roundup_pow_of_two(max) - 1; -+ } while (rand >= max); -+ -+ return rand; -+} -+ -+void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src) -+{ -+ struct bio_vec bv; -+ struct bvec_iter iter; -+ -+ __bio_for_each_segment(bv, dst, iter, dst_iter) { -+ void *dstp = kmap_atomic(bv.bv_page); -+ memcpy(dstp + bv.bv_offset, src, bv.bv_len); -+ kunmap_atomic(dstp); -+ -+ src += bv.bv_len; -+ } -+} -+ -+void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) -+{ -+ struct bio_vec bv; -+ struct bvec_iter iter; -+ -+ __bio_for_each_segment(bv, src, iter, src_iter) { -+ void *srcp = kmap_atomic(bv.bv_page); -+ memcpy(dst, srcp + bv.bv_offset, bv.bv_len); -+ kunmap_atomic(srcp); -+ -+ dst += bv.bv_len; -+ } -+} -+ -+void bch_scnmemcpy(struct printbuf *out, -+ const char *src, size_t len) -+{ -+ size_t n = printbuf_remaining(out); -+ -+ if (n) { -+ n = min(n - 1, len); -+ memcpy(out->pos, src, n); -+ out->pos += n; -+ *out->pos = '\0'; -+ } -+} -+ -+#include "eytzinger.h" -+ -+static int alignment_ok(const void *base, size_t align) -+{ -+ return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || -+ ((unsigned long)base & (align - 1)) == 0; -+} -+ -+static void u32_swap(void *a, void *b, size_t size) -+{ -+ u32 t = *(u32 *)a; -+ *(u32 *)a = *(u32 *)b; -+ 
*(u32 *)b = t; -+} -+ -+static void u64_swap(void *a, void *b, size_t size) -+{ -+ u64 t = *(u64 *)a; -+ *(u64 *)a = *(u64 *)b; -+ *(u64 *)b = t; -+} -+ -+static void generic_swap(void *a, void *b, size_t size) -+{ -+ char t; -+ -+ do { -+ t = *(char *)a; -+ *(char *)a++ = *(char *)b; -+ *(char *)b++ = t; -+ } while (--size > 0); -+} -+ -+static inline int do_cmp(void *base, size_t n, size_t size, -+ int (*cmp_func)(const void *, const void *, size_t), -+ size_t l, size_t r) -+{ -+ return cmp_func(base + inorder_to_eytzinger0(l, n) * size, -+ base + inorder_to_eytzinger0(r, n) * size, -+ size); -+} -+ -+static inline void do_swap(void *base, size_t n, size_t size, -+ void (*swap_func)(void *, void *, size_t), -+ size_t l, size_t r) -+{ -+ swap_func(base + inorder_to_eytzinger0(l, n) * size, -+ base + inorder_to_eytzinger0(r, n) * size, -+ size); -+} -+ -+void eytzinger0_sort(void *base, size_t n, size_t size, -+ int (*cmp_func)(const void *, const void *, size_t), -+ void (*swap_func)(void *, void *, size_t)) -+{ -+ int i, c, r; -+ -+ if (!swap_func) { -+ if (size == 4 && alignment_ok(base, 4)) -+ swap_func = u32_swap; -+ else if (size == 8 && alignment_ok(base, 8)) -+ swap_func = u64_swap; -+ else -+ swap_func = generic_swap; -+ } -+ -+ /* heapify */ -+ for (i = n / 2 - 1; i >= 0; --i) { -+ for (r = i; r * 2 + 1 < n; r = c) { -+ c = r * 2 + 1; -+ -+ if (c + 1 < n && -+ do_cmp(base, n, size, cmp_func, c, c + 1) < 0) -+ c++; -+ -+ if (do_cmp(base, n, size, cmp_func, r, c) >= 0) -+ break; -+ -+ do_swap(base, n, size, swap_func, r, c); -+ } -+ } -+ -+ /* sort */ -+ for (i = n - 1; i > 0; --i) { -+ do_swap(base, n, size, swap_func, 0, i); -+ -+ for (r = 0; r * 2 + 1 < i; r = c) { -+ c = r * 2 + 1; -+ -+ if (c + 1 < i && -+ do_cmp(base, n, size, cmp_func, c, c + 1) < 0) -+ c++; -+ -+ if (do_cmp(base, n, size, cmp_func, r, c) >= 0) -+ break; -+ -+ do_swap(base, n, size, swap_func, r, c); -+ } -+ } -+} -+ -+void sort_cmp_size(void *base, size_t num, size_t size, -+ int 
(*cmp_func)(const void *, const void *, size_t), -+ void (*swap_func)(void *, void *, size_t size)) -+{ -+ /* pre-scale counters for performance */ -+ int i = (num/2 - 1) * size, n = num * size, c, r; -+ -+ if (!swap_func) { -+ if (size == 4 && alignment_ok(base, 4)) -+ swap_func = u32_swap; -+ else if (size == 8 && alignment_ok(base, 8)) -+ swap_func = u64_swap; -+ else -+ swap_func = generic_swap; -+ } -+ -+ /* heapify */ -+ for ( ; i >= 0; i -= size) { -+ for (r = i; r * 2 + size < n; r = c) { -+ c = r * 2 + size; -+ if (c < n - size && -+ cmp_func(base + c, base + c + size, size) < 0) -+ c += size; -+ if (cmp_func(base + r, base + c, size) >= 0) -+ break; -+ swap_func(base + r, base + c, size); -+ } -+ } -+ -+ /* sort */ -+ for (i = n - size; i > 0; i -= size) { -+ swap_func(base, base + i, size); -+ for (r = 0; r * 2 + size < i; r = c) { -+ c = r * 2 + size; -+ if (c < i - size && -+ cmp_func(base + c, base + c + size, size) < 0) -+ c += size; -+ if (cmp_func(base + r, base + c, size) >= 0) -+ break; -+ swap_func(base + r, base + c, size); -+ } -+ } -+} -+ -+static void mempool_free_vp(void *element, void *pool_data) -+{ -+ size_t size = (size_t) pool_data; -+ -+ vpfree(element, size); -+} -+ -+static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data) -+{ -+ size_t size = (size_t) pool_data; -+ -+ return vpmalloc(size, gfp_mask); -+} -+ -+int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size) -+{ -+ return size < PAGE_SIZE -+ ? 
mempool_init_kmalloc_pool(pool, min_nr, size) -+ : mempool_init(pool, min_nr, mempool_alloc_vp, -+ mempool_free_vp, (void *) size); -+} -+ -+#if 0 -+void eytzinger1_test(void) -+{ -+ unsigned inorder, eytz, size; -+ -+ pr_info("1 based eytzinger test:"); -+ -+ for (size = 2; -+ size < 65536; -+ size++) { -+ unsigned extra = eytzinger1_extra(size); -+ -+ if (!(size % 4096)) -+ pr_info("tree size %u", size); -+ -+ BUG_ON(eytzinger1_prev(0, size) != eytzinger1_last(size)); -+ BUG_ON(eytzinger1_next(0, size) != eytzinger1_first(size)); -+ -+ BUG_ON(eytzinger1_prev(eytzinger1_first(size), size) != 0); -+ BUG_ON(eytzinger1_next(eytzinger1_last(size), size) != 0); -+ -+ inorder = 1; -+ eytzinger1_for_each(eytz, size) { -+ BUG_ON(__inorder_to_eytzinger1(inorder, size, extra) != eytz); -+ BUG_ON(__eytzinger1_to_inorder(eytz, size, extra) != inorder); -+ BUG_ON(eytz != eytzinger1_last(size) && -+ eytzinger1_prev(eytzinger1_next(eytz, size), size) != eytz); -+ -+ inorder++; -+ } -+ } -+} -+ -+void eytzinger0_test(void) -+{ -+ -+ unsigned inorder, eytz, size; -+ -+ pr_info("0 based eytzinger test:"); -+ -+ for (size = 1; -+ size < 65536; -+ size++) { -+ unsigned extra = eytzinger0_extra(size); -+ -+ if (!(size % 4096)) -+ pr_info("tree size %u", size); -+ -+ BUG_ON(eytzinger0_prev(-1, size) != eytzinger0_last(size)); -+ BUG_ON(eytzinger0_next(-1, size) != eytzinger0_first(size)); -+ -+ BUG_ON(eytzinger0_prev(eytzinger0_first(size), size) != -1); -+ BUG_ON(eytzinger0_next(eytzinger0_last(size), size) != -1); -+ -+ inorder = 0; -+ eytzinger0_for_each(eytz, size) { -+ BUG_ON(__inorder_to_eytzinger0(inorder, size, extra) != eytz); -+ BUG_ON(__eytzinger0_to_inorder(eytz, size, extra) != inorder); -+ BUG_ON(eytz != eytzinger0_last(size) && -+ eytzinger0_prev(eytzinger0_next(eytz, size), size) != eytz); -+ -+ inorder++; -+ } -+ } -+} -+ -+static inline int cmp_u16(const void *_l, const void *_r, size_t size) -+{ -+ const u16 *l = _l, *r = _r; -+ -+ return (*l > *r) - (*r - *l); -+} 
-+ -+static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search) -+{ -+ int i, c1 = -1, c2 = -1; -+ ssize_t r; -+ -+ r = eytzinger0_find_le(test_array, nr, -+ sizeof(test_array[0]), -+ cmp_u16, &search); -+ if (r >= 0) -+ c1 = test_array[r]; -+ -+ for (i = 0; i < nr; i++) -+ if (test_array[i] <= search && test_array[i] > c2) -+ c2 = test_array[i]; -+ -+ if (c1 != c2) { -+ eytzinger0_for_each(i, nr) -+ pr_info("[%3u] = %12u", i, test_array[i]); -+ pr_info("find_le(%2u) -> [%2zi] = %2i should be %2i", -+ i, r, c1, c2); -+ } -+} -+ -+void eytzinger0_find_test(void) -+{ -+ unsigned i, nr, allocated = 1 << 12; -+ u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL); -+ -+ for (nr = 1; nr < allocated; nr++) { -+ pr_info("testing %u elems", nr); -+ -+ get_random_bytes(test_array, nr * sizeof(test_array[0])); -+ eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL); -+ -+ /* verify array is sorted correctly: */ -+ eytzinger0_for_each(i, nr) -+ BUG_ON(i != eytzinger0_last(nr) && -+ test_array[i] > test_array[eytzinger0_next(i, nr)]); -+ -+ for (i = 0; i < U16_MAX; i += 1 << 12) -+ eytzinger0_find_test_val(test_array, nr, i); -+ -+ for (i = 0; i < nr; i++) { -+ eytzinger0_find_test_val(test_array, nr, test_array[i] - 1); -+ eytzinger0_find_test_val(test_array, nr, test_array[i]); -+ eytzinger0_find_test_val(test_array, nr, test_array[i] + 1); -+ } -+ } -+ -+ kfree(test_array); -+} -+#endif -+ -+/* -+ * Accumulate percpu counters onto one cpu's copy - only valid when access -+ * against any percpu counter is guarded against -+ */ -+u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr) -+{ -+ u64 *ret; -+ int cpu; -+ -+ preempt_disable(); -+ ret = this_cpu_ptr(p); -+ preempt_enable(); -+ -+ for_each_possible_cpu(cpu) { -+ u64 *i = per_cpu_ptr(p, cpu); -+ -+ if (i != ret) { -+ acc_u64s(ret, i, nr); -+ memset(i, 0, nr * sizeof(u64)); -+ } -+ } -+ -+ return ret; -+} -diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h 
-new file mode 100644 -index 000000000000..f48c6380684f ---- /dev/null -+++ b/fs/bcachefs/util.h -@@ -0,0 +1,761 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_UTIL_H -+#define _BCACHEFS_UTIL_H -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#define PAGE_SECTOR_SHIFT (PAGE_SHIFT - 9) -+#define PAGE_SECTORS (1UL << PAGE_SECTOR_SHIFT) -+ -+struct closure; -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ -+#define EBUG_ON(cond) BUG_ON(cond) -+#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) -+#define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) -+#define atomic_sub_bug(i, v) BUG_ON(atomic_sub_return(i, v) < 0) -+#define atomic_add_bug(i, v) BUG_ON(atomic_add_return(i, v) < 0) -+#define atomic_long_dec_bug(v) BUG_ON(atomic_long_dec_return(v) < 0) -+#define atomic_long_sub_bug(i, v) BUG_ON(atomic_long_sub_return(i, v) < 0) -+#define atomic64_dec_bug(v) BUG_ON(atomic64_dec_return(v) < 0) -+#define atomic64_inc_bug(v, i) BUG_ON(atomic64_inc_return(v) <= i) -+#define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0) -+#define atomic64_add_bug(i, v) BUG_ON(atomic64_add_return(i, v) < 0) -+ -+#define memcpy(dst, src, len) \ -+({ \ -+ void *_dst = (dst); \ -+ const void *_src = (src); \ -+ size_t _len = (len); \ -+ \ -+ BUG_ON(!((void *) (_dst) >= (void *) (_src) + (_len) || \ -+ (void *) (_dst) + (_len) <= (void *) (_src))); \ -+ memcpy(_dst, _src, _len); \ -+}) -+ -+#else /* DEBUG */ -+ -+#define EBUG_ON(cond) -+#define atomic_dec_bug(v) atomic_dec(v) -+#define atomic_inc_bug(v, i) atomic_inc(v) -+#define atomic_sub_bug(i, v) atomic_sub(i, v) -+#define atomic_add_bug(i, v) atomic_add(i, v) -+#define atomic_long_dec_bug(v) atomic_long_dec(v) -+#define atomic_long_sub_bug(i, v) atomic_long_sub(i, v) -+#define atomic64_dec_bug(v) atomic64_dec(v) -+#define atomic64_inc_bug(v, i) atomic64_inc(v) -+#define 
atomic64_sub_bug(i, v) atomic64_sub(i, v) -+#define atomic64_add_bug(i, v) atomic64_add(i, v) -+ -+#endif -+ -+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -+#define CPU_BIG_ENDIAN 0 -+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -+#define CPU_BIG_ENDIAN 1 -+#endif -+ -+/* type hackery */ -+ -+#define type_is_exact(_val, _type) \ -+ __builtin_types_compatible_p(typeof(_val), _type) -+ -+#define type_is(_val, _type) \ -+ (__builtin_types_compatible_p(typeof(_val), _type) || \ -+ __builtin_types_compatible_p(typeof(_val), const _type)) -+ -+/* Userspace doesn't align allocations as nicely as the kernel allocators: */ -+static inline size_t buf_pages(void *p, size_t len) -+{ -+ return DIV_ROUND_UP(len + -+ ((unsigned long) p & (PAGE_SIZE - 1)), -+ PAGE_SIZE); -+} -+ -+static inline void vpfree(void *p, size_t size) -+{ -+ if (is_vmalloc_addr(p)) -+ vfree(p); -+ else -+ free_pages((unsigned long) p, get_order(size)); -+} -+ -+static inline void *vpmalloc(size_t size, gfp_t gfp_mask) -+{ -+ return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN, -+ get_order(size)) ?: -+ __vmalloc(size, gfp_mask); -+} -+ -+static inline void kvpfree(void *p, size_t size) -+{ -+ if (size < PAGE_SIZE) -+ kfree(p); -+ else -+ vpfree(p, size); -+} -+ -+static inline void *kvpmalloc(size_t size, gfp_t gfp_mask) -+{ -+ return size < PAGE_SIZE -+ ? 
kmalloc(size, gfp_mask) -+ : vpmalloc(size, gfp_mask); -+} -+ -+int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t); -+ -+#define HEAP(type) \ -+struct { \ -+ size_t size, used; \ -+ type *data; \ -+} -+ -+#define DECLARE_HEAP(type, name) HEAP(type) name -+ -+#define init_heap(heap, _size, gfp) \ -+({ \ -+ (heap)->used = 0; \ -+ (heap)->size = (_size); \ -+ (heap)->data = kvpmalloc((heap)->size * sizeof((heap)->data[0]),\ -+ (gfp)); \ -+}) -+ -+#define free_heap(heap) \ -+do { \ -+ kvpfree((heap)->data, (heap)->size * sizeof((heap)->data[0])); \ -+ (heap)->data = NULL; \ -+} while (0) -+ -+#define heap_set_backpointer(h, i, _fn) \ -+do { \ -+ void (*fn)(typeof(h), size_t) = _fn; \ -+ if (fn) \ -+ fn(h, i); \ -+} while (0) -+ -+#define heap_swap(h, i, j, set_backpointer) \ -+do { \ -+ swap((h)->data[i], (h)->data[j]); \ -+ heap_set_backpointer(h, i, set_backpointer); \ -+ heap_set_backpointer(h, j, set_backpointer); \ -+} while (0) -+ -+#define heap_peek(h) \ -+({ \ -+ EBUG_ON(!(h)->used); \ -+ (h)->data[0]; \ -+}) -+ -+#define heap_full(h) ((h)->used == (h)->size) -+ -+#define heap_sift_down(h, i, cmp, set_backpointer) \ -+do { \ -+ size_t _c, _j = i; \ -+ \ -+ for (; _j * 2 + 1 < (h)->used; _j = _c) { \ -+ _c = _j * 2 + 1; \ -+ if (_c + 1 < (h)->used && \ -+ cmp(h, (h)->data[_c], (h)->data[_c + 1]) >= 0) \ -+ _c++; \ -+ \ -+ if (cmp(h, (h)->data[_c], (h)->data[_j]) >= 0) \ -+ break; \ -+ heap_swap(h, _c, _j, set_backpointer); \ -+ } \ -+} while (0) -+ -+#define heap_sift_up(h, i, cmp, set_backpointer) \ -+do { \ -+ while (i) { \ -+ size_t p = (i - 1) / 2; \ -+ if (cmp(h, (h)->data[i], (h)->data[p]) >= 0) \ -+ break; \ -+ heap_swap(h, i, p, set_backpointer); \ -+ i = p; \ -+ } \ -+} while (0) -+ -+#define __heap_add(h, d, cmp, set_backpointer) \ -+({ \ -+ size_t _i = (h)->used++; \ -+ (h)->data[_i] = d; \ -+ heap_set_backpointer(h, _i, set_backpointer); \ -+ \ -+ heap_sift_up(h, _i, cmp, set_backpointer); \ -+ _i; \ -+}) -+ -+#define heap_add(h, d, cmp, 
set_backpointer) \ -+({ \ -+ bool _r = !heap_full(h); \ -+ if (_r) \ -+ __heap_add(h, d, cmp, set_backpointer); \ -+ _r; \ -+}) -+ -+#define heap_add_or_replace(h, new, cmp, set_backpointer) \ -+do { \ -+ if (!heap_add(h, new, cmp, set_backpointer) && \ -+ cmp(h, new, heap_peek(h)) >= 0) { \ -+ (h)->data[0] = new; \ -+ heap_set_backpointer(h, 0, set_backpointer); \ -+ heap_sift_down(h, 0, cmp, set_backpointer); \ -+ } \ -+} while (0) -+ -+#define heap_del(h, i, cmp, set_backpointer) \ -+do { \ -+ size_t _i = (i); \ -+ \ -+ BUG_ON(_i >= (h)->used); \ -+ (h)->used--; \ -+ heap_swap(h, _i, (h)->used, set_backpointer); \ -+ heap_sift_up(h, _i, cmp, set_backpointer); \ -+ heap_sift_down(h, _i, cmp, set_backpointer); \ -+} while (0) -+ -+#define heap_pop(h, d, cmp, set_backpointer) \ -+({ \ -+ bool _r = (h)->used; \ -+ if (_r) { \ -+ (d) = (h)->data[0]; \ -+ heap_del(h, 0, cmp, set_backpointer); \ -+ } \ -+ _r; \ -+}) -+ -+#define heap_resort(heap, cmp, set_backpointer) \ -+do { \ -+ ssize_t _i; \ -+ for (_i = (ssize_t) (heap)->used / 2 - 1; _i >= 0; --_i) \ -+ heap_sift_down(heap, _i, cmp, set_backpointer); \ -+} while (0) -+ -+#define ANYSINT_MAX(t) \ -+ ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) -+ -+struct printbuf { -+ char *pos; -+ char *end; -+}; -+ -+static inline size_t printbuf_remaining(struct printbuf *buf) -+{ -+ return buf->end - buf->pos; -+} -+ -+#define _PBUF(_buf, _len) \ -+ ((struct printbuf) { \ -+ .pos = _buf, \ -+ .end = _buf + _len, \ -+ }) -+ -+#define PBUF(_buf) _PBUF(_buf, sizeof(_buf)) -+ -+#define pr_buf(_out, ...) 
\ -+do { \ -+ (_out)->pos += scnprintf((_out)->pos, printbuf_remaining(_out), \ -+ __VA_ARGS__); \ -+} while (0) -+ -+void bch_scnmemcpy(struct printbuf *, const char *, size_t); -+ -+int bch2_strtoint_h(const char *, int *); -+int bch2_strtouint_h(const char *, unsigned int *); -+int bch2_strtoll_h(const char *, long long *); -+int bch2_strtoull_h(const char *, unsigned long long *); -+int bch2_strtou64_h(const char *, u64 *); -+ -+static inline int bch2_strtol_h(const char *cp, long *res) -+{ -+#if BITS_PER_LONG == 32 -+ return bch2_strtoint_h(cp, (int *) res); -+#else -+ return bch2_strtoll_h(cp, (long long *) res); -+#endif -+} -+ -+static inline int bch2_strtoul_h(const char *cp, long *res) -+{ -+#if BITS_PER_LONG == 32 -+ return bch2_strtouint_h(cp, (unsigned int *) res); -+#else -+ return bch2_strtoull_h(cp, (unsigned long long *) res); -+#endif -+} -+ -+#define strtoi_h(cp, res) \ -+ ( type_is(*res, int) ? bch2_strtoint_h(cp, (void *) res)\ -+ : type_is(*res, long) ? bch2_strtol_h(cp, (void *) res)\ -+ : type_is(*res, long long) ? bch2_strtoll_h(cp, (void *) res)\ -+ : type_is(*res, unsigned) ? bch2_strtouint_h(cp, (void *) res)\ -+ : type_is(*res, unsigned long) ? bch2_strtoul_h(cp, (void *) res)\ -+ : type_is(*res, unsigned long long) ? bch2_strtoull_h(cp, (void *) res)\ -+ : -EINVAL) -+ -+#define strtoul_safe(cp, var) \ -+({ \ -+ unsigned long _v; \ -+ int _r = kstrtoul(cp, 10, &_v); \ -+ if (!_r) \ -+ var = _v; \ -+ _r; \ -+}) -+ -+#define strtoul_safe_clamp(cp, var, min, max) \ -+({ \ -+ unsigned long _v; \ -+ int _r = kstrtoul(cp, 10, &_v); \ -+ if (!_r) \ -+ var = clamp_t(typeof(var), _v, min, max); \ -+ _r; \ -+}) -+ -+#define strtoul_safe_restrict(cp, var, min, max) \ -+({ \ -+ unsigned long _v; \ -+ int _r = kstrtoul(cp, 10, &_v); \ -+ if (!_r && _v >= min && _v <= max) \ -+ var = _v; \ -+ else \ -+ _r = -EINVAL; \ -+ _r; \ -+}) -+ -+#define snprint(buf, size, var) \ -+ snprintf(buf, size, \ -+ type_is(var, int) ? 
"%i\n" \ -+ : type_is(var, unsigned) ? "%u\n" \ -+ : type_is(var, long) ? "%li\n" \ -+ : type_is(var, unsigned long) ? "%lu\n" \ -+ : type_is(var, s64) ? "%lli\n" \ -+ : type_is(var, u64) ? "%llu\n" \ -+ : type_is(var, char *) ? "%s\n" \ -+ : "%i\n", var) -+ -+void bch2_hprint(struct printbuf *, s64); -+ -+bool bch2_is_zero(const void *, size_t); -+ -+void bch2_string_opt_to_text(struct printbuf *, -+ const char * const [], size_t); -+ -+void bch2_flags_to_text(struct printbuf *, const char * const[], u64); -+u64 bch2_read_flag_list(char *, const char * const[]); -+ -+#define NR_QUANTILES 15 -+#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) -+#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES) -+#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES) -+ -+struct quantiles { -+ struct quantile_entry { -+ u64 m; -+ u64 step; -+ } entries[NR_QUANTILES]; -+}; -+ -+struct time_stat_buffer { -+ unsigned nr; -+ struct time_stat_buffer_entry { -+ u64 start; -+ u64 end; -+ } entries[32]; -+}; -+ -+struct time_stats { -+ spinlock_t lock; -+ u64 count; -+ /* all fields are in nanoseconds */ -+ u64 average_duration; -+ u64 average_frequency; -+ u64 max_duration; -+ u64 last_event; -+ struct quantiles quantiles; -+ -+ struct time_stat_buffer __percpu *buffer; -+}; -+ -+void __bch2_time_stats_update(struct time_stats *stats, u64, u64); -+ -+static inline void bch2_time_stats_update(struct time_stats *stats, u64 start) -+{ -+ __bch2_time_stats_update(stats, start, local_clock()); -+} -+ -+void bch2_time_stats_to_text(struct printbuf *, struct time_stats *); -+ -+void bch2_time_stats_exit(struct time_stats *); -+void bch2_time_stats_init(struct time_stats *); -+ -+#define ewma_add(ewma, val, weight) \ -+({ \ -+ typeof(ewma) _ewma = (ewma); \ -+ typeof(weight) _weight = (weight); \ -+ \ -+ (((_ewma << _weight) - _ewma) + (val)) >> _weight; \ -+}) -+ -+struct bch_ratelimit { -+ /* Next time we want to do some work, in nanoseconds */ -+ u64 next; -+ -+ /* -+ * Rate at 
which we want to do work, in units per nanosecond -+ * The units here correspond to the units passed to -+ * bch2_ratelimit_increment() -+ */ -+ unsigned rate; -+}; -+ -+static inline void bch2_ratelimit_reset(struct bch_ratelimit *d) -+{ -+ d->next = local_clock(); -+} -+ -+u64 bch2_ratelimit_delay(struct bch_ratelimit *); -+void bch2_ratelimit_increment(struct bch_ratelimit *, u64); -+ -+struct bch_pd_controller { -+ struct bch_ratelimit rate; -+ unsigned long last_update; -+ -+ s64 last_actual; -+ s64 smoothed_derivative; -+ -+ unsigned p_term_inverse; -+ unsigned d_smooth; -+ unsigned d_term; -+ -+ /* for exporting to sysfs (no effect on behavior) */ -+ s64 last_derivative; -+ s64 last_proportional; -+ s64 last_change; -+ s64 last_target; -+ -+ /* If true, the rate will not increase if bch2_ratelimit_delay() -+ * is not being called often enough. */ -+ bool backpressure; -+}; -+ -+void bch2_pd_controller_update(struct bch_pd_controller *, s64, s64, int); -+void bch2_pd_controller_init(struct bch_pd_controller *); -+size_t bch2_pd_controller_print_debug(struct bch_pd_controller *, char *); -+ -+#define sysfs_pd_controller_attribute(name) \ -+ rw_attribute(name##_rate); \ -+ rw_attribute(name##_rate_bytes); \ -+ rw_attribute(name##_rate_d_term); \ -+ rw_attribute(name##_rate_p_term_inverse); \ -+ read_attribute(name##_rate_debug) -+ -+#define sysfs_pd_controller_files(name) \ -+ &sysfs_##name##_rate, \ -+ &sysfs_##name##_rate_bytes, \ -+ &sysfs_##name##_rate_d_term, \ -+ &sysfs_##name##_rate_p_term_inverse, \ -+ &sysfs_##name##_rate_debug -+ -+#define sysfs_pd_controller_show(name, var) \ -+do { \ -+ sysfs_hprint(name##_rate, (var)->rate.rate); \ -+ sysfs_print(name##_rate_bytes, (var)->rate.rate); \ -+ sysfs_print(name##_rate_d_term, (var)->d_term); \ -+ sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \ -+ \ -+ if (attr == &sysfs_##name##_rate_debug) \ -+ return bch2_pd_controller_print_debug(var, buf); \ -+} while (0) -+ -+#define 
sysfs_pd_controller_store(name, var) \ -+do { \ -+ sysfs_strtoul_clamp(name##_rate, \ -+ (var)->rate.rate, 1, UINT_MAX); \ -+ sysfs_strtoul_clamp(name##_rate_bytes, \ -+ (var)->rate.rate, 1, UINT_MAX); \ -+ sysfs_strtoul(name##_rate_d_term, (var)->d_term); \ -+ sysfs_strtoul_clamp(name##_rate_p_term_inverse, \ -+ (var)->p_term_inverse, 1, INT_MAX); \ -+} while (0) -+ -+#define container_of_or_null(ptr, type, member) \ -+({ \ -+ typeof(ptr) _ptr = ptr; \ -+ _ptr ? container_of(_ptr, type, member) : NULL; \ -+}) -+ -+/* Does linear interpolation between powers of two */ -+static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) -+{ -+ unsigned fract = x & ~(~0 << fract_bits); -+ -+ x >>= fract_bits; -+ x = 1 << x; -+ x += (x * fract) >> fract_bits; -+ -+ return x; -+} -+ -+void bch2_bio_map(struct bio *bio, void *base, size_t); -+int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t); -+ -+static inline sector_t bdev_sectors(struct block_device *bdev) -+{ -+ return bdev->bd_inode->i_size >> 9; -+} -+ -+#define closure_bio_submit(bio, cl) \ -+do { \ -+ closure_get(cl); \ -+ submit_bio(bio); \ -+} while (0) -+ -+#define kthread_wait_freezable(cond) \ -+({ \ -+ int _ret = 0; \ -+ while (1) { \ -+ set_current_state(TASK_INTERRUPTIBLE); \ -+ if (kthread_should_stop()) { \ -+ _ret = -1; \ -+ break; \ -+ } \ -+ \ -+ if (cond) \ -+ break; \ -+ \ -+ schedule(); \ -+ try_to_freeze(); \ -+ } \ -+ set_current_state(TASK_RUNNING); \ -+ _ret; \ -+}) -+ -+size_t bch2_rand_range(size_t); -+ -+void memcpy_to_bio(struct bio *, struct bvec_iter, const void *); -+void memcpy_from_bio(void *, struct bio *, struct bvec_iter); -+ -+static inline void memcpy_u64s_small(void *dst, const void *src, -+ unsigned u64s) -+{ -+ u64 *d = dst; -+ const u64 *s = src; -+ -+ while (u64s--) -+ *d++ = *s++; -+} -+ -+static inline void __memcpy_u64s(void *dst, const void *src, -+ unsigned u64s) -+{ -+#ifdef CONFIG_X86_64 -+ long d0, d1, d2; -+ asm volatile("rep ; movsq" -+ : "=&c" (d0), "=&D" 
(d1), "=&S" (d2) -+ : "0" (u64s), "1" (dst), "2" (src) -+ : "memory"); -+#else -+ u64 *d = dst; -+ const u64 *s = src; -+ -+ while (u64s--) -+ *d++ = *s++; -+#endif -+} -+ -+static inline void memcpy_u64s(void *dst, const void *src, -+ unsigned u64s) -+{ -+ EBUG_ON(!(dst >= src + u64s * sizeof(u64) || -+ dst + u64s * sizeof(u64) <= src)); -+ -+ __memcpy_u64s(dst, src, u64s); -+} -+ -+static inline void __memmove_u64s_down(void *dst, const void *src, -+ unsigned u64s) -+{ -+ __memcpy_u64s(dst, src, u64s); -+} -+ -+static inline void memmove_u64s_down(void *dst, const void *src, -+ unsigned u64s) -+{ -+ EBUG_ON(dst > src); -+ -+ __memmove_u64s_down(dst, src, u64s); -+} -+ -+static inline void __memmove_u64s_up_small(void *_dst, const void *_src, -+ unsigned u64s) -+{ -+ u64 *dst = (u64 *) _dst + u64s; -+ u64 *src = (u64 *) _src + u64s; -+ -+ while (u64s--) -+ *--dst = *--src; -+} -+ -+static inline void memmove_u64s_up_small(void *dst, const void *src, -+ unsigned u64s) -+{ -+ EBUG_ON(dst < src); -+ -+ __memmove_u64s_up_small(dst, src, u64s); -+} -+ -+static inline void __memmove_u64s_up(void *_dst, const void *_src, -+ unsigned u64s) -+{ -+ u64 *dst = (u64 *) _dst + u64s - 1; -+ u64 *src = (u64 *) _src + u64s - 1; -+ -+#ifdef CONFIG_X86_64 -+ long d0, d1, d2; -+ asm volatile("std ;\n" -+ "rep ; movsq\n" -+ "cld ;\n" -+ : "=&c" (d0), "=&D" (d1), "=&S" (d2) -+ : "0" (u64s), "1" (dst), "2" (src) -+ : "memory"); -+#else -+ while (u64s--) -+ *dst-- = *src--; -+#endif -+} -+ -+static inline void memmove_u64s_up(void *dst, const void *src, -+ unsigned u64s) -+{ -+ EBUG_ON(dst < src); -+ -+ __memmove_u64s_up(dst, src, u64s); -+} -+ -+static inline void memmove_u64s(void *dst, const void *src, -+ unsigned u64s) -+{ -+ if (dst < src) -+ __memmove_u64s_down(dst, src, u64s); -+ else -+ __memmove_u64s_up(dst, src, u64s); -+} -+ -+/* Set the last few bytes up to a u64 boundary given an offset into a buffer. 
*/ -+static inline void memset_u64s_tail(void *s, int c, unsigned bytes) -+{ -+ unsigned rem = round_up(bytes, sizeof(u64)) - bytes; -+ -+ memset(s + bytes, c, rem); -+} -+ -+void sort_cmp_size(void *base, size_t num, size_t size, -+ int (*cmp_func)(const void *, const void *, size_t), -+ void (*swap_func)(void *, void *, size_t)); -+ -+/* just the memmove, doesn't update @_nr */ -+#define __array_insert_item(_array, _nr, _pos) \ -+ memmove(&(_array)[(_pos) + 1], \ -+ &(_array)[(_pos)], \ -+ sizeof((_array)[0]) * ((_nr) - (_pos))) -+ -+#define array_insert_item(_array, _nr, _pos, _new_item) \ -+do { \ -+ __array_insert_item(_array, _nr, _pos); \ -+ (_nr)++; \ -+ (_array)[(_pos)] = (_new_item); \ -+} while (0) -+ -+#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \ -+do { \ -+ (_nr) -= (_nr_to_remove); \ -+ memmove(&(_array)[(_pos)], \ -+ &(_array)[(_pos) + (_nr_to_remove)], \ -+ sizeof((_array)[0]) * ((_nr) - (_pos))); \ -+} while (0) -+ -+#define array_remove_item(_array, _nr, _pos) \ -+ array_remove_items(_array, _nr, _pos, 1) -+ -+#define bubble_sort(_base, _nr, _cmp) \ -+do { \ -+ ssize_t _i, _end; \ -+ bool _swapped = true; \ -+ \ -+ for (_end = (ssize_t) (_nr) - 1; _end > 0 && _swapped; --_end) {\ -+ _swapped = false; \ -+ for (_i = 0; _i < _end; _i++) \ -+ if (_cmp((_base)[_i], (_base)[_i + 1]) > 0) { \ -+ swap((_base)[_i], (_base)[_i + 1]); \ -+ _swapped = true; \ -+ } \ -+ } \ -+} while (0) -+ -+static inline u64 percpu_u64_get(u64 __percpu *src) -+{ -+ u64 ret = 0; -+ int cpu; -+ -+ for_each_possible_cpu(cpu) -+ ret += *per_cpu_ptr(src, cpu); -+ return ret; -+} -+ -+static inline void percpu_u64_set(u64 __percpu *dst, u64 src) -+{ -+ int cpu; -+ -+ for_each_possible_cpu(cpu) -+ *per_cpu_ptr(dst, cpu) = 0; -+ -+ preempt_disable(); -+ *this_cpu_ptr(dst) = src; -+ preempt_enable(); -+} -+ -+static inline void acc_u64s(u64 *acc, const u64 *src, unsigned nr) -+{ -+ unsigned i; -+ -+ for (i = 0; i < nr; i++) -+ acc[i] += src[i]; -+} -+ -+static 
inline void acc_u64s_percpu(u64 *acc, const u64 __percpu *src, -+ unsigned nr) -+{ -+ int cpu; -+ -+ for_each_possible_cpu(cpu) -+ acc_u64s(acc, per_cpu_ptr(src, cpu), nr); -+} -+ -+static inline void percpu_memset(void __percpu *p, int c, size_t bytes) -+{ -+ int cpu; -+ -+ for_each_possible_cpu(cpu) -+ memset(per_cpu_ptr(p, cpu), c, bytes); -+} -+ -+u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned); -+ -+#define cmp_int(l, r) ((l > r) - (l < r)) -+ -+#endif /* _BCACHEFS_UTIL_H */ -diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h -new file mode 100644 -index 000000000000..c099cdc0605f ---- /dev/null -+++ b/fs/bcachefs/vstructs.h -@@ -0,0 +1,63 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _VSTRUCTS_H -+#define _VSTRUCTS_H -+ -+#include "util.h" -+ -+/* -+ * NOTE: we can't differentiate between __le64 and u64 with type_is - this -+ * assumes u64 is little endian: -+ */ -+#define __vstruct_u64s(_s) \ -+({ \ -+ ( type_is((_s)->u64s, u64) ? le64_to_cpu((__force __le64) (_s)->u64s) \ -+ : type_is((_s)->u64s, u32) ? le32_to_cpu((__force __le32) (_s)->u64s) \ -+ : type_is((_s)->u64s, u16) ? 
le16_to_cpu((__force __le16) (_s)->u64s) \ -+ : ((__force u8) ((_s)->u64s))); \ -+}) -+ -+#define __vstruct_bytes(_type, _u64s) \ -+({ \ -+ BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64)); \ -+ \ -+ (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \ -+}) -+ -+#define vstruct_bytes(_s) \ -+ __vstruct_bytes(typeof(*(_s)), __vstruct_u64s(_s)) -+ -+#define __vstruct_blocks(_type, _sector_block_bits, _u64s) \ -+ (round_up(__vstruct_bytes(_type, _u64s), \ -+ 512 << (_sector_block_bits)) >> (9 + (_sector_block_bits))) -+ -+#define vstruct_blocks(_s, _sector_block_bits) \ -+ __vstruct_blocks(typeof(*(_s)), _sector_block_bits, __vstruct_u64s(_s)) -+ -+#define vstruct_blocks_plus(_s, _sector_block_bits, _u64s) \ -+ __vstruct_blocks(typeof(*(_s)), _sector_block_bits, \ -+ __vstruct_u64s(_s) + (_u64s)) -+ -+#define vstruct_sectors(_s, _sector_block_bits) \ -+ (round_up(vstruct_bytes(_s), 512 << (_sector_block_bits)) >> 9) -+ -+#define vstruct_next(_s) \ -+ ((typeof(_s)) ((_s)->_data + __vstruct_u64s(_s))) -+#define vstruct_last(_s) \ -+ ((typeof(&(_s)->start[0])) ((_s)->_data + __vstruct_u64s(_s))) -+#define vstruct_end(_s) \ -+ ((void *) ((_s)->_data + __vstruct_u64s(_s))) -+ -+#define vstruct_for_each(_s, _i) \ -+ for (_i = (_s)->start; \ -+ _i < vstruct_last(_s); \ -+ _i = vstruct_next(_i)) -+ -+#define vstruct_for_each_safe(_s, _i, _t) \ -+ for (_i = (_s)->start; \ -+ _i < vstruct_last(_s) && (_t = vstruct_next(_i), true); \ -+ _i = _t) -+ -+#define vstruct_idx(_s, _idx) \ -+ ((typeof(&(_s)->start[0])) ((_s)->_data + (_idx))) -+ -+#endif /* _VSTRUCTS_H */ -diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c -new file mode 100644 -index 000000000000..21f64cb7e402 ---- /dev/null -+++ b/fs/bcachefs/xattr.c -@@ -0,0 +1,586 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "bkey_methods.h" -+#include "btree_update.h" -+#include "extents.h" -+#include "fs.h" -+#include "rebalance.h" -+#include "str_hash.h" -+#include "xattr.h" -+ -+#include 
-+#include -+#include -+ -+static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned); -+ -+static u64 bch2_xattr_hash(const struct bch_hash_info *info, -+ const struct xattr_search_key *key) -+{ -+ struct bch_str_hash_ctx ctx; -+ -+ bch2_str_hash_init(&ctx, info); -+ bch2_str_hash_update(&ctx, info, &key->type, sizeof(key->type)); -+ bch2_str_hash_update(&ctx, info, key->name.name, key->name.len); -+ -+ return bch2_str_hash_end(&ctx, info); -+} -+ -+static u64 xattr_hash_key(const struct bch_hash_info *info, const void *key) -+{ -+ return bch2_xattr_hash(info, key); -+} -+ -+static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) -+{ -+ struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k); -+ -+ return bch2_xattr_hash(info, -+ &X_SEARCH(x.v->x_type, x.v->x_name, x.v->x_name_len)); -+} -+ -+static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r) -+{ -+ struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l); -+ const struct xattr_search_key *r = _r; -+ -+ return l.v->x_type != r->type || -+ l.v->x_name_len != r->name.len || -+ memcmp(l.v->x_name, r->name.name, r->name.len); -+} -+ -+static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) -+{ -+ struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l); -+ struct bkey_s_c_xattr r = bkey_s_c_to_xattr(_r); -+ -+ return l.v->x_type != r.v->x_type || -+ l.v->x_name_len != r.v->x_name_len || -+ memcmp(l.v->x_name, r.v->x_name, r.v->x_name_len); -+} -+ -+const struct bch_hash_desc bch2_xattr_hash_desc = { -+ .btree_id = BTREE_ID_XATTRS, -+ .key_type = KEY_TYPE_xattr, -+ .hash_key = xattr_hash_key, -+ .hash_bkey = xattr_hash_bkey, -+ .cmp_key = xattr_cmp_key, -+ .cmp_bkey = xattr_cmp_bkey, -+}; -+ -+const char *bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k) -+{ -+ const struct xattr_handler *handler; -+ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); -+ -+ if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr)) -+ return "value too small"; -+ -+ if (bkey_val_u64s(k.k) < -+ 
xattr_val_u64s(xattr.v->x_name_len, -+ le16_to_cpu(xattr.v->x_val_len))) -+ return "value too small"; -+ -+ if (bkey_val_u64s(k.k) > -+ xattr_val_u64s(xattr.v->x_name_len, -+ le16_to_cpu(xattr.v->x_val_len) + 4)) -+ return "value too big"; -+ -+ handler = bch2_xattr_type_to_handler(xattr.v->x_type); -+ if (!handler) -+ return "invalid type"; -+ -+ if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len)) -+ return "xattr name has invalid characters"; -+ -+ return NULL; -+} -+ -+void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ const struct xattr_handler *handler; -+ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); -+ -+ handler = bch2_xattr_type_to_handler(xattr.v->x_type); -+ if (handler && handler->prefix) -+ pr_buf(out, "%s", handler->prefix); -+ else if (handler) -+ pr_buf(out, "(type %u)", xattr.v->x_type); -+ else -+ pr_buf(out, "(unknown type %u)", xattr.v->x_type); -+ -+ bch_scnmemcpy(out, xattr.v->x_name, -+ xattr.v->x_name_len); -+ pr_buf(out, ":"); -+ bch_scnmemcpy(out, xattr_val(xattr.v), -+ le16_to_cpu(xattr.v->x_val_len)); -+} -+ -+int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, -+ const char *name, void *buffer, size_t size, int type) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c_xattr xattr; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, -+ &inode->ei_str_hash, inode->v.i_ino, -+ &X_SEARCH(type, name, strlen(name)), -+ 0); -+ if (IS_ERR(iter)) { -+ bch2_trans_exit(&trans); -+ BUG_ON(PTR_ERR(iter) == -EINTR); -+ -+ return PTR_ERR(iter) == -ENOENT ? 
-ENODATA : PTR_ERR(iter); -+ } -+ -+ xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); -+ ret = le16_to_cpu(xattr.v->x_val_len); -+ if (buffer) { -+ if (ret > size) -+ ret = -ERANGE; -+ else -+ memcpy(buffer, xattr_val(xattr.v), ret); -+ } -+ -+ bch2_trans_exit(&trans); -+ return ret; -+} -+ -+int bch2_xattr_set(struct btree_trans *trans, u64 inum, -+ const struct bch_hash_info *hash_info, -+ const char *name, const void *value, size_t size, -+ int type, int flags) -+{ -+ int ret; -+ -+ if (value) { -+ struct bkey_i_xattr *xattr; -+ unsigned namelen = strlen(name); -+ unsigned u64s = BKEY_U64s + -+ xattr_val_u64s(namelen, size); -+ -+ if (u64s > U8_MAX) -+ return -ERANGE; -+ -+ xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); -+ if (IS_ERR(xattr)) -+ return PTR_ERR(xattr); -+ -+ bkey_xattr_init(&xattr->k_i); -+ xattr->k.u64s = u64s; -+ xattr->v.x_type = type; -+ xattr->v.x_name_len = namelen; -+ xattr->v.x_val_len = cpu_to_le16(size); -+ memcpy(xattr->v.x_name, name, namelen); -+ memcpy(xattr_val(&xattr->v), value, size); -+ -+ ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, -+ inum, &xattr->k_i, -+ (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)| -+ (flags & XATTR_REPLACE ? BCH_HASH_SET_MUST_REPLACE : 0)); -+ } else { -+ struct xattr_search_key search = -+ X_SEARCH(type, name, strlen(name)); -+ -+ ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, -+ hash_info, inum, &search); -+ } -+ -+ if (ret == -ENOENT) -+ ret = flags & XATTR_REPLACE ? 
-ENODATA : 0; -+ -+ return ret; -+} -+ -+struct xattr_buf { -+ char *buf; -+ size_t len; -+ size_t used; -+}; -+ -+static int __bch2_xattr_emit(const char *prefix, -+ const char *name, size_t name_len, -+ struct xattr_buf *buf) -+{ -+ const size_t prefix_len = strlen(prefix); -+ const size_t total_len = prefix_len + name_len + 1; -+ -+ if (buf->buf) { -+ if (buf->used + total_len > buf->len) -+ return -ERANGE; -+ -+ memcpy(buf->buf + buf->used, prefix, prefix_len); -+ memcpy(buf->buf + buf->used + prefix_len, -+ name, name_len); -+ buf->buf[buf->used + prefix_len + name_len] = '\0'; -+ } -+ -+ buf->used += total_len; -+ return 0; -+} -+ -+static int bch2_xattr_emit(struct dentry *dentry, -+ const struct bch_xattr *xattr, -+ struct xattr_buf *buf) -+{ -+ const struct xattr_handler *handler = -+ bch2_xattr_type_to_handler(xattr->x_type); -+ -+ return handler && (!handler->list || handler->list(dentry)) -+ ? __bch2_xattr_emit(handler->prefix ?: handler->name, -+ xattr->x_name, xattr->x_name_len, buf) -+ : 0; -+} -+ -+static int bch2_xattr_list_bcachefs(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct xattr_buf *buf, -+ bool all) -+{ -+ const char *prefix = all ? "bcachefs_effective." 
: "bcachefs."; -+ unsigned id; -+ int ret = 0; -+ u64 v; -+ -+ for (id = 0; id < Inode_opt_nr; id++) { -+ v = bch2_inode_opt_get(&inode->ei_inode, id); -+ if (!v) -+ continue; -+ -+ if (!all && -+ !(inode->ei_inode.bi_fields_set & (1 << id))) -+ continue; -+ -+ ret = __bch2_xattr_emit(prefix, bch2_inode_opts[id], -+ strlen(bch2_inode_opts[id]), buf); -+ if (ret) -+ break; -+ } -+ -+ return ret; -+} -+ -+ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) -+{ -+ struct bch_fs *c = dentry->d_sb->s_fs_info; -+ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct xattr_buf buf = { .buf = buffer, .len = buffer_size }; -+ u64 inum = dentry->d_inode->i_ino; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, -+ POS(inum, 0), 0, k, ret) { -+ BUG_ON(k.k->p.inode < inum); -+ -+ if (k.k->p.inode > inum) -+ break; -+ -+ if (k.k->type != KEY_TYPE_xattr) -+ continue; -+ -+ ret = bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf); -+ if (ret) -+ break; -+ } -+ ret = bch2_trans_exit(&trans) ?: ret; -+ -+ if (ret) -+ return ret; -+ -+ ret = bch2_xattr_list_bcachefs(c, inode, &buf, false); -+ if (ret) -+ return ret; -+ -+ ret = bch2_xattr_list_bcachefs(c, inode, &buf, true); -+ if (ret) -+ return ret; -+ -+ return buf.used; -+} -+ -+static int bch2_xattr_get_handler(const struct xattr_handler *handler, -+ struct dentry *dentry, struct inode *vinode, -+ const char *name, void *buffer, size_t size) -+{ -+ struct bch_inode_info *inode = to_bch_ei(vinode); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ -+ return bch2_xattr_get(c, inode, name, buffer, size, handler->flags); -+} -+ -+static int bch2_xattr_set_handler(const struct xattr_handler *handler, -+ struct dentry *dentry, struct inode *vinode, -+ const char *name, const void *value, -+ size_t size, int flags) -+{ -+ struct bch_inode_info *inode = 
to_bch_ei(vinode); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ -+ return bch2_trans_do(c, NULL, &inode->ei_journal_seq, 0, -+ bch2_xattr_set(&trans, inode->v.i_ino, -+ &inode->ei_str_hash, -+ name, value, size, -+ handler->flags, flags)); -+} -+ -+static const struct xattr_handler bch_xattr_user_handler = { -+ .prefix = XATTR_USER_PREFIX, -+ .get = bch2_xattr_get_handler, -+ .set = bch2_xattr_set_handler, -+ .flags = KEY_TYPE_XATTR_INDEX_USER, -+}; -+ -+static bool bch2_xattr_trusted_list(struct dentry *dentry) -+{ -+ return capable(CAP_SYS_ADMIN); -+} -+ -+static const struct xattr_handler bch_xattr_trusted_handler = { -+ .prefix = XATTR_TRUSTED_PREFIX, -+ .list = bch2_xattr_trusted_list, -+ .get = bch2_xattr_get_handler, -+ .set = bch2_xattr_set_handler, -+ .flags = KEY_TYPE_XATTR_INDEX_TRUSTED, -+}; -+ -+static const struct xattr_handler bch_xattr_security_handler = { -+ .prefix = XATTR_SECURITY_PREFIX, -+ .get = bch2_xattr_get_handler, -+ .set = bch2_xattr_set_handler, -+ .flags = KEY_TYPE_XATTR_INDEX_SECURITY, -+}; -+ -+#ifndef NO_BCACHEFS_FS -+ -+static int opt_to_inode_opt(int id) -+{ -+ switch (id) { -+#define x(name, ...) 
\ -+ case Opt_##name: return Inode_opt_##name; -+ BCH_INODE_OPTS() -+#undef x -+ default: -+ return -1; -+ } -+} -+ -+static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler, -+ struct dentry *dentry, struct inode *vinode, -+ const char *name, void *buffer, size_t size, -+ bool all) -+{ -+ struct bch_inode_info *inode = to_bch_ei(vinode); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch_opts opts = -+ bch2_inode_opts_to_opts(bch2_inode_opts_get(&inode->ei_inode)); -+ const struct bch_option *opt; -+ int id, inode_opt_id; -+ char buf[512]; -+ struct printbuf out = PBUF(buf); -+ unsigned val_len; -+ u64 v; -+ -+ id = bch2_opt_lookup(name); -+ if (id < 0 || !bch2_opt_is_inode_opt(id)) -+ return -EINVAL; -+ -+ inode_opt_id = opt_to_inode_opt(id); -+ if (inode_opt_id < 0) -+ return -EINVAL; -+ -+ opt = bch2_opt_table + id; -+ -+ if (!bch2_opt_defined_by_id(&opts, id)) -+ return -ENODATA; -+ -+ if (!all && -+ !(inode->ei_inode.bi_fields_set & (1 << inode_opt_id))) -+ return -ENODATA; -+ -+ v = bch2_opt_get_by_id(&opts, id); -+ bch2_opt_to_text(&out, c, opt, v, 0); -+ -+ val_len = out.pos - buf; -+ -+ if (buffer && val_len > size) -+ return -ERANGE; -+ -+ if (buffer) -+ memcpy(buffer, buf, val_len); -+ return val_len; -+} -+ -+static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler, -+ struct dentry *dentry, struct inode *vinode, -+ const char *name, void *buffer, size_t size) -+{ -+ return __bch2_xattr_bcachefs_get(handler, dentry, vinode, -+ name, buffer, size, false); -+} -+ -+struct inode_opt_set { -+ int id; -+ u64 v; -+ bool defined; -+}; -+ -+static int inode_opt_set_fn(struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, -+ void *p) -+{ -+ struct inode_opt_set *s = p; -+ -+ if (s->defined) -+ bi->bi_fields_set |= 1U << s->id; -+ else -+ bi->bi_fields_set &= ~(1U << s->id); -+ -+ bch2_inode_opt_set(bi, s->id, s->v); -+ -+ return 0; -+} -+ -+static int bch2_xattr_bcachefs_set(const struct xattr_handler 
*handler, -+ struct dentry *dentry, struct inode *vinode, -+ const char *name, const void *value, -+ size_t size, int flags) -+{ -+ struct bch_inode_info *inode = to_bch_ei(vinode); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ const struct bch_option *opt; -+ char *buf; -+ struct inode_opt_set s; -+ int opt_id, inode_opt_id, ret; -+ -+ opt_id = bch2_opt_lookup(name); -+ if (opt_id < 0) -+ return -EINVAL; -+ -+ opt = bch2_opt_table + opt_id; -+ -+ inode_opt_id = opt_to_inode_opt(opt_id); -+ if (inode_opt_id < 0) -+ return -EINVAL; -+ -+ s.id = inode_opt_id; -+ -+ if (value) { -+ u64 v = 0; -+ -+ buf = kmalloc(size + 1, GFP_KERNEL); -+ if (!buf) -+ return -ENOMEM; -+ memcpy(buf, value, size); -+ buf[size] = '\0'; -+ -+ ret = bch2_opt_parse(c, opt, buf, &v); -+ kfree(buf); -+ -+ if (ret < 0) -+ return ret; -+ -+ ret = bch2_opt_check_may_set(c, opt_id, v); -+ if (ret < 0) -+ return ret; -+ -+ s.v = v + 1; -+ s.defined = true; -+ } else { -+ if (!IS_ROOT(dentry)) { -+ struct bch_inode_info *dir = -+ to_bch_ei(d_inode(dentry->d_parent)); -+ -+ s.v = bch2_inode_opt_get(&dir->ei_inode, inode_opt_id); -+ } else { -+ s.v = 0; -+ } -+ -+ s.defined = false; -+ } -+ -+ mutex_lock(&inode->ei_update_lock); -+ if (inode_opt_id == Inode_opt_project) { -+ /* -+ * inode fields accessible via the xattr interface are stored -+ * with a +1 bias, so that 0 means unset: -+ */ -+ ret = bch2_set_projid(c, inode, s.v ? 
s.v - 1 : 0); -+ if (ret) -+ goto err; -+ } -+ -+ ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0); -+err: -+ mutex_unlock(&inode->ei_update_lock); -+ -+ if (value && -+ (opt_id == Opt_background_compression || -+ opt_id == Opt_background_target)) -+ bch2_rebalance_add_work(c, inode->v.i_blocks); -+ -+ return ret; -+} -+ -+static const struct xattr_handler bch_xattr_bcachefs_handler = { -+ .prefix = "bcachefs.", -+ .get = bch2_xattr_bcachefs_get, -+ .set = bch2_xattr_bcachefs_set, -+}; -+ -+static int bch2_xattr_bcachefs_get_effective( -+ const struct xattr_handler *handler, -+ struct dentry *dentry, struct inode *vinode, -+ const char *name, void *buffer, size_t size) -+{ -+ return __bch2_xattr_bcachefs_get(handler, dentry, vinode, -+ name, buffer, size, true); -+} -+ -+static const struct xattr_handler bch_xattr_bcachefs_effective_handler = { -+ .prefix = "bcachefs_effective.", -+ .get = bch2_xattr_bcachefs_get_effective, -+ .set = bch2_xattr_bcachefs_set, -+}; -+ -+#endif /* NO_BCACHEFS_FS */ -+ -+const struct xattr_handler *bch2_xattr_handlers[] = { -+ &bch_xattr_user_handler, -+ &posix_acl_access_xattr_handler, -+ &posix_acl_default_xattr_handler, -+ &bch_xattr_trusted_handler, -+ &bch_xattr_security_handler, -+#ifndef NO_BCACHEFS_FS -+ &bch_xattr_bcachefs_handler, -+ &bch_xattr_bcachefs_effective_handler, -+#endif -+ NULL -+}; -+ -+static const struct xattr_handler *bch_xattr_handler_map[] = { -+ [KEY_TYPE_XATTR_INDEX_USER] = &bch_xattr_user_handler, -+ [KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS] = -+ &posix_acl_access_xattr_handler, -+ [KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT] = -+ &posix_acl_default_xattr_handler, -+ [KEY_TYPE_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler, -+ [KEY_TYPE_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler, -+}; -+ -+static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned type) -+{ -+ return type < ARRAY_SIZE(bch_xattr_handler_map) -+ ? 
bch_xattr_handler_map[type] -+ : NULL; -+} -diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h -new file mode 100644 -index 000000000000..4151065ab853 ---- /dev/null -+++ b/fs/bcachefs/xattr.h -@@ -0,0 +1,49 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_XATTR_H -+#define _BCACHEFS_XATTR_H -+ -+#include "str_hash.h" -+ -+extern const struct bch_hash_desc bch2_xattr_hash_desc; -+ -+const char *bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c); -+void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+ -+#define bch2_bkey_ops_xattr (struct bkey_ops) { \ -+ .key_invalid = bch2_xattr_invalid, \ -+ .val_to_text = bch2_xattr_to_text, \ -+} -+ -+static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len) -+{ -+ return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name) + -+ name_len + val_len, sizeof(u64)); -+} -+ -+#define xattr_val(_xattr) \ -+ ((void *) (_xattr)->x_name + (_xattr)->x_name_len) -+ -+struct xattr_search_key { -+ u8 type; -+ struct qstr name; -+}; -+ -+#define X_SEARCH(_type, _name, _len) ((struct xattr_search_key) \ -+ { .type = _type, .name = QSTR_INIT(_name, _len) }) -+ -+struct dentry; -+struct xattr_handler; -+struct bch_hash_info; -+struct bch_inode_info; -+ -+int bch2_xattr_get(struct bch_fs *, struct bch_inode_info *, -+ const char *, void *, size_t, int); -+ -+int bch2_xattr_set(struct btree_trans *, u64, const struct bch_hash_info *, -+ const char *, const void *, size_t, int, int); -+ -+ssize_t bch2_xattr_list(struct dentry *, char *, size_t); -+ -+extern const struct xattr_handler *bch2_xattr_handlers[]; -+ -+#endif /* _BCACHEFS_XATTR_H */ -diff --git a/fs/cifs/file.c b/fs/cifs/file.c -index be46fab4c96d..a17a21181e18 100644 ---- a/fs/cifs/file.c -+++ b/fs/cifs/file.c -@@ -4296,20 +4296,12 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list, - - page = lru_to_page(page_list); - -- /* -- * Lock the page and put it in the cache. 
Since no one else -- * should have access to this page, we're safe to simply set -- * PG_locked without checking it first. -- */ -- __SetPageLocked(page); -- rc = add_to_page_cache_locked(page, mapping, -- page->index, gfp); -+ rc = add_to_page_cache(page, mapping, -+ page->index, gfp); - - /* give up if we can't stick it in the cache */ -- if (rc) { -- __ClearPageLocked(page); -+ if (rc) - return rc; -- } - - /* move first page to the tmplist */ - *offset = (loff_t)page->index << PAGE_SHIFT; -@@ -4328,12 +4320,9 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list, - if (*bytes + PAGE_SIZE > rsize) - break; - -- __SetPageLocked(page); -- rc = add_to_page_cache_locked(page, mapping, page->index, gfp); -- if (rc) { -- __ClearPageLocked(page); -+ rc = add_to_page_cache(page, mapping, page->index, gfp); -+ if (rc) - break; -- } - list_move_tail(&page->lru, tmplist); - (*bytes) += PAGE_SIZE; - expected_index++; -diff --git a/fs/dcache.c b/fs/dcache.c -index 361ea7ab30ea..6fbf68e60326 100644 ---- a/fs/dcache.c -+++ b/fs/dcache.c -@@ -3132,9 +3132,8 @@ void d_genocide(struct dentry *parent) - - EXPORT_SYMBOL(d_genocide); - --void d_tmpfile(struct dentry *dentry, struct inode *inode) -+void d_mark_tmpfile(struct dentry *dentry, struct inode *inode) - { -- inode_dec_link_count(inode); - BUG_ON(dentry->d_name.name != dentry->d_iname || - !hlist_unhashed(&dentry->d_u.d_alias) || - !d_unlinked(dentry)); -@@ -3144,6 +3143,13 @@ void d_tmpfile(struct dentry *dentry, struct inode *inode) - (unsigned long long)inode->i_ino); - spin_unlock(&dentry->d_lock); - spin_unlock(&dentry->d_parent->d_lock); -+} -+EXPORT_SYMBOL(d_mark_tmpfile); -+ -+void d_tmpfile(struct dentry *dentry, struct inode *inode) -+{ -+ inode_dec_link_count(inode); -+ d_mark_tmpfile(dentry, inode); - d_instantiate(dentry, inode); - } - EXPORT_SYMBOL(d_tmpfile); -diff --git a/fs/inode.c b/fs/inode.c -index 72c4c347afb7..e70ad3d2d01c 100644 ---- a/fs/inode.c -+++ b/fs/inode.c -@@ 
-1578,6 +1578,46 @@ int insert_inode_locked(struct inode *inode) - } - EXPORT_SYMBOL(insert_inode_locked); - -+struct inode *insert_inode_locked2(struct inode *inode) -+{ -+ struct super_block *sb = inode->i_sb; -+ ino_t ino = inode->i_ino; -+ struct hlist_head *head = inode_hashtable + hash(sb, ino); -+ -+ while (1) { -+ struct inode *old = NULL; -+ spin_lock(&inode_hash_lock); -+ hlist_for_each_entry(old, head, i_hash) { -+ if (old->i_ino != ino) -+ continue; -+ if (old->i_sb != sb) -+ continue; -+ spin_lock(&old->i_lock); -+ if (old->i_state & (I_FREEING|I_WILL_FREE)) { -+ spin_unlock(&old->i_lock); -+ continue; -+ } -+ break; -+ } -+ if (likely(!old)) { -+ spin_lock(&inode->i_lock); -+ inode->i_state |= I_NEW | I_CREATING; -+ hlist_add_head(&inode->i_hash, head); -+ spin_unlock(&inode->i_lock); -+ spin_unlock(&inode_hash_lock); -+ return NULL; -+ } -+ __iget(old); -+ spin_unlock(&old->i_lock); -+ spin_unlock(&inode_hash_lock); -+ wait_on_inode(old); -+ if (unlikely(!inode_unhashed(old))) -+ return old; -+ iput(old); -+ } -+} -+EXPORT_SYMBOL(insert_inode_locked2); -+ - int insert_inode_locked4(struct inode *inode, unsigned long hashval, - int (*test)(struct inode *, void *), void *data) - { -diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h -index 57241417ff2f..e080ccb4fdf1 100644 ---- a/include/linux/blkdev.h -+++ b/include/linux/blkdev.h -@@ -908,6 +908,7 @@ extern const char *blk_op_str(unsigned int op); - - int blk_status_to_errno(blk_status_t status); - blk_status_t errno_to_blk_status(int errno); -+const char *blk_status_to_str(blk_status_t status); - - int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin); - -diff --git a/include/linux/closure.h b/include/linux/closure.h -new file mode 100644 -index 000000000000..36b4a83f9b77 ---- /dev/null -+++ b/include/linux/closure.h -@@ -0,0 +1,399 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _LINUX_CLOSURE_H -+#define _LINUX_CLOSURE_H -+ -+#include -+#include -+#include -+#include -+ 
-+/* -+ * Closure is perhaps the most overused and abused term in computer science, but -+ * since I've been unable to come up with anything better you're stuck with it -+ * again. -+ * -+ * What are closures? -+ * -+ * They embed a refcount. The basic idea is they count "things that are in -+ * progress" - in flight bios, some other thread that's doing something else - -+ * anything you might want to wait on. -+ * -+ * The refcount may be manipulated with closure_get() and closure_put(). -+ * closure_put() is where many of the interesting things happen, when it causes -+ * the refcount to go to 0. -+ * -+ * Closures can be used to wait on things both synchronously and asynchronously, -+ * and synchronous and asynchronous use can be mixed without restriction. To -+ * wait synchronously, use closure_sync() - you will sleep until your closure's -+ * refcount hits 1. -+ * -+ * To wait asynchronously, use -+ * continue_at(cl, next_function, workqueue); -+ * -+ * passing it, as you might expect, the function to run when nothing is pending -+ * and the workqueue to run that function out of. -+ * -+ * continue_at() also, critically, requires a 'return' immediately following the -+ * location where this macro is referenced, to return to the calling function. -+ * There's good reason for this. -+ * -+ * To use safely closures asynchronously, they must always have a refcount while -+ * they are running owned by the thread that is running them. 
Otherwise, suppose -+ * you submit some bios and wish to have a function run when they all complete: -+ * -+ * foo_endio(struct bio *bio) -+ * { -+ * closure_put(cl); -+ * } -+ * -+ * closure_init(cl); -+ * -+ * do_stuff(); -+ * closure_get(cl); -+ * bio1->bi_endio = foo_endio; -+ * bio_submit(bio1); -+ * -+ * do_more_stuff(); -+ * closure_get(cl); -+ * bio2->bi_endio = foo_endio; -+ * bio_submit(bio2); -+ * -+ * continue_at(cl, complete_some_read, system_wq); -+ * -+ * If closure's refcount started at 0, complete_some_read() could run before the -+ * second bio was submitted - which is almost always not what you want! More -+ * importantly, it wouldn't be possible to say whether the original thread or -+ * complete_some_read()'s thread owned the closure - and whatever state it was -+ * associated with! -+ * -+ * So, closure_init() initializes a closure's refcount to 1 - and when a -+ * closure_fn is run, the refcount will be reset to 1 first. -+ * -+ * Then, the rule is - if you got the refcount with closure_get(), release it -+ * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount -+ * on a closure because you called closure_init() or you were run out of a -+ * closure - _always_ use continue_at(). Doing so consistently will help -+ * eliminate an entire class of particularly pernicious races. -+ * -+ * Lastly, you might have a wait list dedicated to a specific event, and have no -+ * need for specifying the condition - you just want to wait until someone runs -+ * closure_wake_up() on the appropriate wait list. In that case, just use -+ * closure_wait(). It will return either true or false, depending on whether the -+ * closure was already on a wait list or not - a closure can only be on one wait -+ * list at a time. -+ * -+ * Parents: -+ * -+ * closure_init() takes two arguments - it takes the closure to initialize, and -+ * a (possibly null) parent. 
-+ * -+ * If parent is non null, the new closure will have a refcount for its lifetime; -+ * a closure is considered to be "finished" when its refcount hits 0 and the -+ * function to run is null. Hence -+ * -+ * continue_at(cl, NULL, NULL); -+ * -+ * returns up the (spaghetti) stack of closures, precisely like normal return -+ * returns up the C stack. continue_at() with non null fn is better thought of -+ * as doing a tail call. -+ * -+ * All this implies that a closure should typically be embedded in a particular -+ * struct (which its refcount will normally control the lifetime of), and that -+ * struct can very much be thought of as a stack frame. -+ */ -+ -+struct closure; -+struct closure_syncer; -+typedef void (closure_fn) (struct closure *); -+extern struct dentry *bcache_debug; -+ -+struct closure_waitlist { -+ struct llist_head list; -+}; -+ -+enum closure_state { -+ /* -+ * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by -+ * the thread that owns the closure, and cleared by the thread that's -+ * waking up the closure. -+ * -+ * The rest are for debugging and don't affect behaviour: -+ * -+ * CLOSURE_RUNNING: Set when a closure is running (i.e. by -+ * closure_init() and when closure_put() runs then next function), and -+ * must be cleared before remaining hits 0. Primarily to help guard -+ * against incorrect usage and accidentally transferring references. -+ * continue_at() and closure_return() clear it for you, if you're doing -+ * something unusual you can use closure_set_dead() which also helps -+ * annotate where references are being transferred. 
-+ */ -+ -+ CLOSURE_BITS_START = (1U << 26), -+ CLOSURE_DESTRUCTOR = (1U << 26), -+ CLOSURE_WAITING = (1U << 28), -+ CLOSURE_RUNNING = (1U << 30), -+}; -+ -+#define CLOSURE_GUARD_MASK \ -+ ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1) -+ -+#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) -+#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) -+ -+struct closure { -+ union { -+ struct { -+ struct workqueue_struct *wq; -+ struct closure_syncer *s; -+ struct llist_node list; -+ closure_fn *fn; -+ }; -+ struct work_struct work; -+ }; -+ -+ struct closure *parent; -+ -+ atomic_t remaining; -+ -+#ifdef CONFIG_DEBUG_CLOSURES -+#define CLOSURE_MAGIC_DEAD 0xc054dead -+#define CLOSURE_MAGIC_ALIVE 0xc054a11e -+ -+ unsigned int magic; -+ struct list_head all; -+ unsigned long ip; -+ unsigned long waiting_on; -+#endif -+}; -+ -+void closure_sub(struct closure *cl, int v); -+void closure_put(struct closure *cl); -+void __closure_wake_up(struct closure_waitlist *list); -+bool closure_wait(struct closure_waitlist *list, struct closure *cl); -+void __closure_sync(struct closure *cl); -+ -+/** -+ * closure_sync - sleep until a closure a closure has nothing left to wait on -+ * -+ * Sleeps until the refcount hits 1 - the thread that's running the closure owns -+ * the last refcount. 
-+ */ -+static inline void closure_sync(struct closure *cl) -+{ -+ if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1) -+ __closure_sync(cl); -+} -+ -+#ifdef CONFIG_DEBUG_CLOSURES -+ -+void closure_debug_create(struct closure *cl); -+void closure_debug_destroy(struct closure *cl); -+ -+#else -+ -+static inline void closure_debug_create(struct closure *cl) {} -+static inline void closure_debug_destroy(struct closure *cl) {} -+ -+#endif -+ -+static inline void closure_set_ip(struct closure *cl) -+{ -+#ifdef CONFIG_DEBUG_CLOSURES -+ cl->ip = _THIS_IP_; -+#endif -+} -+ -+static inline void closure_set_ret_ip(struct closure *cl) -+{ -+#ifdef CONFIG_DEBUG_CLOSURES -+ cl->ip = _RET_IP_; -+#endif -+} -+ -+static inline void closure_set_waiting(struct closure *cl, unsigned long f) -+{ -+#ifdef CONFIG_DEBUG_CLOSURES -+ cl->waiting_on = f; -+#endif -+} -+ -+static inline void closure_set_stopped(struct closure *cl) -+{ -+ atomic_sub(CLOSURE_RUNNING, &cl->remaining); -+} -+ -+static inline void set_closure_fn(struct closure *cl, closure_fn *fn, -+ struct workqueue_struct *wq) -+{ -+ closure_set_ip(cl); -+ cl->fn = fn; -+ cl->wq = wq; -+ /* between atomic_dec() in closure_put() */ -+ smp_mb__before_atomic(); -+} -+ -+static inline void closure_queue(struct closure *cl) -+{ -+ struct workqueue_struct *wq = cl->wq; -+ /** -+ * Changes made to closure, work_struct, or a couple of other structs -+ * may cause work.func not pointing to the right location. 
-+ */ -+ BUILD_BUG_ON(offsetof(struct closure, fn) -+ != offsetof(struct work_struct, func)); -+ -+ if (wq) { -+ INIT_WORK(&cl->work, cl->work.func); -+ BUG_ON(!queue_work(wq, &cl->work)); -+ } else -+ cl->fn(cl); -+} -+ -+/** -+ * closure_get - increment a closure's refcount -+ */ -+static inline void closure_get(struct closure *cl) -+{ -+#ifdef CONFIG_DEBUG_CLOSURES -+ BUG_ON((atomic_inc_return(&cl->remaining) & -+ CLOSURE_REMAINING_MASK) <= 1); -+#else -+ atomic_inc(&cl->remaining); -+#endif -+} -+ -+/** -+ * closure_init - Initialize a closure, setting the refcount to 1 -+ * @cl: closure to initialize -+ * @parent: parent of the new closure. cl will take a refcount on it for its -+ * lifetime; may be NULL. -+ */ -+static inline void closure_init(struct closure *cl, struct closure *parent) -+{ -+ cl->fn = NULL; -+ cl->parent = parent; -+ if (parent) -+ closure_get(parent); -+ -+ atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); -+ -+ closure_debug_create(cl); -+ closure_set_ip(cl); -+} -+ -+static inline void closure_init_stack(struct closure *cl) -+{ -+ memset(cl, 0, sizeof(struct closure)); -+ atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); -+} -+ -+/** -+ * closure_wake_up - wake up all closures on a wait list, -+ * with memory barrier -+ */ -+static inline void closure_wake_up(struct closure_waitlist *list) -+{ -+ /* Memory barrier for the wait list */ -+ smp_mb(); -+ __closure_wake_up(list); -+} -+ -+/** -+ * continue_at - jump to another function with barrier -+ * -+ * After @cl is no longer waiting on anything (i.e. all outstanding refs have -+ * been dropped with closure_put()), it will resume execution at @fn running out -+ * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly). -+ * -+ * This is because after calling continue_at() you no longer have a ref on @cl, -+ * and whatever @cl owns may be freed out from under you - a running closure fn -+ * has a ref on its own closure which continue_at() drops. 
-+ * -+ * Note you are expected to immediately return after using this macro. -+ */ -+#define continue_at(_cl, _fn, _wq) \ -+do { \ -+ set_closure_fn(_cl, _fn, _wq); \ -+ closure_sub(_cl, CLOSURE_RUNNING + 1); \ -+} while (0) -+ -+/** -+ * closure_return - finish execution of a closure -+ * -+ * This is used to indicate that @cl is finished: when all outstanding refs on -+ * @cl have been dropped @cl's ref on its parent closure (as passed to -+ * closure_init()) will be dropped, if one was specified - thus this can be -+ * thought of as returning to the parent closure. -+ */ -+#define closure_return(_cl) continue_at((_cl), NULL, NULL) -+ -+/** -+ * continue_at_nobarrier - jump to another function without barrier -+ * -+ * Causes @fn to be executed out of @cl, in @wq context (or called directly if -+ * @wq is NULL). -+ * -+ * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn, -+ * thus it's not safe to touch anything protected by @cl after a -+ * continue_at_nobarrier(). -+ */ -+#define continue_at_nobarrier(_cl, _fn, _wq) \ -+do { \ -+ set_closure_fn(_cl, _fn, _wq); \ -+ closure_queue(_cl); \ -+} while (0) -+ -+/** -+ * closure_return_with_destructor - finish execution of a closure, -+ * with destructor -+ * -+ * Works like closure_return(), except @destructor will be called when all -+ * outstanding refs on @cl have been dropped; @destructor may be used to safely -+ * free the memory occupied by @cl, and it is called with the ref on the parent -+ * closure still held - so @destructor could safely return an item to a -+ * freelist protected by @cl's parent. 
-+ */ -+#define closure_return_with_destructor(_cl, _destructor) \ -+do { \ -+ set_closure_fn(_cl, _destructor, NULL); \ -+ closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \ -+} while (0) -+ -+/** -+ * closure_call - execute @fn out of a new, uninitialized closure -+ * -+ * Typically used when running out of one closure, and we want to run @fn -+ * asynchronously out of a new closure - @parent will then wait for @cl to -+ * finish. -+ */ -+static inline void closure_call(struct closure *cl, closure_fn fn, -+ struct workqueue_struct *wq, -+ struct closure *parent) -+{ -+ closure_init(cl, parent); -+ continue_at_nobarrier(cl, fn, wq); -+} -+ -+#define __closure_wait_event(waitlist, _cond) \ -+do { \ -+ struct closure cl; \ -+ \ -+ closure_init_stack(&cl); \ -+ \ -+ while (1) { \ -+ closure_wait(waitlist, &cl); \ -+ if (_cond) \ -+ break; \ -+ closure_sync(&cl); \ -+ } \ -+ closure_wake_up(waitlist); \ -+ closure_sync(&cl); \ -+} while (0) -+ -+#define closure_wait_event(waitlist, _cond) \ -+do { \ -+ if (!(_cond)) \ -+ __closure_wait_event(waitlist, _cond); \ -+} while (0) -+ -+#endif /* _LINUX_CLOSURE_H */ -diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h -index c8f03d2969df..6165f4f769b6 100644 ---- a/include/linux/compiler_attributes.h -+++ b/include/linux/compiler_attributes.h -@@ -271,4 +271,9 @@ - */ - #define __weak __attribute__((__weak__)) - -+/* -+ * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-flatten-function-attribute -+ */ -+#define __flatten __attribute__((flatten)) -+ - #endif /* __LINUX_COMPILER_ATTRIBUTES_H */ -diff --git a/include/linux/dcache.h b/include/linux/dcache.h -index a81f0c3cf352..053e33f5afd9 100644 ---- a/include/linux/dcache.h -+++ b/include/linux/dcache.h -@@ -256,6 +256,7 @@ extern struct dentry * d_make_root(struct inode *); - /* - the ramfs-type tree */ - extern void d_genocide(struct dentry *); - -+extern void d_mark_tmpfile(struct dentry *, struct 
inode *); - extern void d_tmpfile(struct dentry *, struct inode *); - - extern struct dentry *d_find_alias(struct inode *); -diff --git a/include/linux/fs.h b/include/linux/fs.h -index f5abba86107d..a0793e83b266 100644 ---- a/include/linux/fs.h -+++ b/include/linux/fs.h -@@ -3088,6 +3088,7 @@ extern struct inode *find_inode_rcu(struct super_block *, unsigned long, - extern struct inode *find_inode_by_ino_rcu(struct super_block *, unsigned long); - extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *); - extern int insert_inode_locked(struct inode *); -+extern struct inode *insert_inode_locked2(struct inode *); - #ifdef CONFIG_DEBUG_LOCK_ALLOC - extern void lockdep_annotate_inode_mutex_key(struct inode *inode); - #else -diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h -index cf2468da68e9..25cadac5e90d 100644 ---- a/include/linux/pagemap.h -+++ b/include/linux/pagemap.h -@@ -645,10 +645,15 @@ static inline int fault_in_pages_readable(const char __user *uaddr, int size) - return 0; - } - --int add_to_page_cache_locked(struct page *page, struct address_space *mapping, -- pgoff_t index, gfp_t gfp_mask); -+int add_to_page_cache(struct page *page, struct address_space *mapping, -+ pgoff_t index, gfp_t gfp_mask); - int add_to_page_cache_lru(struct page *page, struct address_space *mapping, - pgoff_t index, gfp_t gfp_mask); -+int add_to_page_cache_lru_vec(struct address_space *mapping, -+ struct page **pages, -+ unsigned nr_pages, -+ pgoff_t offset, gfp_t gfp_mask); -+ - extern void delete_from_page_cache(struct page *page); - extern void __delete_from_page_cache(struct page *page, void *shadow); - int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); -@@ -666,22 +671,6 @@ void page_cache_readahead_unbounded(struct address_space *, struct file *, - pgoff_t index, unsigned long nr_to_read, - unsigned long lookahead_count); - --/* -- * Like add_to_page_cache_locked, but used to add 
newly allocated pages: -- * the page is new, so we can just run __SetPageLocked() against it. -- */ --static inline int add_to_page_cache(struct page *page, -- struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) --{ -- int error; -- -- __SetPageLocked(page); -- error = add_to_page_cache_locked(page, mapping, offset, gfp_mask); -- if (unlikely(error)) -- __ClearPageLocked(page); -- return error; --} -- - /** - * struct readahead_control - Describes a readahead request. - * -diff --git a/include/linux/sched.h b/include/linux/sched.h -index 683372943093..6340de2990ff 100644 ---- a/include/linux/sched.h -+++ b/include/linux/sched.h -@@ -739,6 +739,7 @@ struct task_struct { - - struct mm_struct *mm; - struct mm_struct *active_mm; -+ struct address_space *faults_disabled_mapping; - - /* Per-thread vma caching: */ - struct vmacache vmacache; -diff --git a/include/linux/six.h b/include/linux/six.h -new file mode 100644 -index 000000000000..a16e94f482e9 ---- /dev/null -+++ b/include/linux/six.h -@@ -0,0 +1,197 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+ -+#ifndef _LINUX_SIX_H -+#define _LINUX_SIX_H -+ -+/* -+ * Shared/intent/exclusive locks: sleepable read/write locks, much like rw -+ * semaphores, except with a third intermediate state, intent. 
Basic operations -+ * are: -+ * -+ * six_lock_read(&foo->lock); -+ * six_unlock_read(&foo->lock); -+ * -+ * six_lock_intent(&foo->lock); -+ * six_unlock_intent(&foo->lock); -+ * -+ * six_lock_write(&foo->lock); -+ * six_unlock_write(&foo->lock); -+ * -+ * Intent locks block other intent locks, but do not block read locks, and you -+ * must have an intent lock held before taking a write lock, like so: -+ * -+ * six_lock_intent(&foo->lock); -+ * six_lock_write(&foo->lock); -+ * six_unlock_write(&foo->lock); -+ * six_unlock_intent(&foo->lock); -+ * -+ * Other operations: -+ * -+ * six_trylock_read() -+ * six_trylock_intent() -+ * six_trylock_write() -+ * -+ * six_lock_downgrade(): convert from intent to read -+ * six_lock_tryupgrade(): attempt to convert from read to intent -+ * -+ * Locks also embed a sequence number, which is incremented when the lock is -+ * locked or unlocked for write. The current sequence number can be grabbed -+ * while a lock is held from lock->state.seq; then, if you drop the lock you can -+ * use six_relock_(read|intent_write)(lock, seq) to attempt to retake the lock -+ * iff it hasn't been locked for write in the meantime. -+ * -+ * There are also operations that take the lock type as a parameter, where the -+ * type is one of SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write: -+ * -+ * six_lock_type(lock, type) -+ * six_unlock_type(lock, type) -+ * six_relock(lock, type, seq) -+ * six_trylock_type(lock, type) -+ * six_trylock_convert(lock, from, to) -+ * -+ * A lock may be held multiple types by the same thread (for read or intent, -+ * not write). However, the six locks code does _not_ implement the actual -+ * recursive checks itself though - rather, if your code (e.g. 
btree iterator -+ * code) knows that the current thread already has a lock held, and for the -+ * correct type, six_lock_increment() may be used to bump up the counter for -+ * that type - the only effect is that one more call to unlock will be required -+ * before the lock is unlocked. -+ */ -+ -+#include -+#include -+#include -+#include -+ -+#define SIX_LOCK_SEPARATE_LOCKFNS -+ -+union six_lock_state { -+ struct { -+ atomic64_t counter; -+ }; -+ -+ struct { -+ u64 v; -+ }; -+ -+ struct { -+ /* for waitlist_bitnr() */ -+ unsigned long l; -+ }; -+ -+ struct { -+ unsigned read_lock:28; -+ unsigned intent_lock:1; -+ unsigned waiters:3; -+ /* -+ * seq works much like in seqlocks: it's incremented every time -+ * we lock and unlock for write. -+ * -+ * If it's odd write lock is held, even unlocked. -+ * -+ * Thus readers can unlock, and then lock again later iff it -+ * hasn't been modified in the meantime. -+ */ -+ u32 seq; -+ }; -+}; -+ -+enum six_lock_type { -+ SIX_LOCK_read, -+ SIX_LOCK_intent, -+ SIX_LOCK_write, -+}; -+ -+struct six_lock { -+ union six_lock_state state; -+ unsigned intent_lock_recurse; -+ struct task_struct *owner; -+ struct optimistic_spin_queue osq; -+ -+ raw_spinlock_t wait_lock; -+ struct list_head wait_list[2]; -+#ifdef CONFIG_DEBUG_LOCK_ALLOC -+ struct lockdep_map dep_map; -+#endif -+}; -+ -+typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *); -+ -+static __always_inline void __six_lock_init(struct six_lock *lock, -+ const char *name, -+ struct lock_class_key *key) -+{ -+ atomic64_set(&lock->state.counter, 0); -+ raw_spin_lock_init(&lock->wait_lock); -+ INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_read]); -+ INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_intent]); -+#ifdef CONFIG_DEBUG_LOCK_ALLOC -+ debug_check_no_locks_freed((void *) lock, sizeof(*lock)); -+ lockdep_init_map(&lock->dep_map, name, key, 0); -+#endif -+} -+ -+#define six_lock_init(lock) \ -+do { \ -+ static struct lock_class_key __key; \ -+ \ -+ 
__six_lock_init((lock), #lock, &__key); \ -+} while (0) -+ -+#define __SIX_VAL(field, _v) (((union six_lock_state) { .field = _v }).v) -+ -+#define __SIX_LOCK(type) \ -+bool six_trylock_##type(struct six_lock *); \ -+bool six_relock_##type(struct six_lock *, u32); \ -+int six_lock_##type(struct six_lock *, six_lock_should_sleep_fn, void *);\ -+void six_unlock_##type(struct six_lock *); -+ -+__SIX_LOCK(read) -+__SIX_LOCK(intent) -+__SIX_LOCK(write) -+#undef __SIX_LOCK -+ -+#define SIX_LOCK_DISPATCH(type, fn, ...) \ -+ switch (type) { \ -+ case SIX_LOCK_read: \ -+ return fn##_read(__VA_ARGS__); \ -+ case SIX_LOCK_intent: \ -+ return fn##_intent(__VA_ARGS__); \ -+ case SIX_LOCK_write: \ -+ return fn##_write(__VA_ARGS__); \ -+ default: \ -+ BUG(); \ -+ } -+ -+static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type) -+{ -+ SIX_LOCK_DISPATCH(type, six_trylock, lock); -+} -+ -+static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type, -+ unsigned seq) -+{ -+ SIX_LOCK_DISPATCH(type, six_relock, lock, seq); -+} -+ -+static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type, -+ six_lock_should_sleep_fn should_sleep_fn, void *p) -+{ -+ SIX_LOCK_DISPATCH(type, six_lock, lock, should_sleep_fn, p); -+} -+ -+static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type) -+{ -+ SIX_LOCK_DISPATCH(type, six_unlock, lock); -+} -+ -+void six_lock_downgrade(struct six_lock *); -+bool six_lock_tryupgrade(struct six_lock *); -+bool six_trylock_convert(struct six_lock *, enum six_lock_type, -+ enum six_lock_type); -+ -+void six_lock_increment(struct six_lock *, enum six_lock_type); -+ -+void six_lock_wakeup_all(struct six_lock *); -+ -+#endif /* _LINUX_SIX_H */ -diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h -index 0221f852a7e1..f81f60d891ac 100644 ---- a/include/linux/vmalloc.h -+++ b/include/linux/vmalloc.h -@@ -106,6 +106,7 @@ extern void *vzalloc(unsigned long size); - extern 
void *vmalloc_user(unsigned long size); - extern void *vmalloc_node(unsigned long size, int node); - extern void *vzalloc_node(unsigned long size, int node); -+extern void *vmalloc_exec(unsigned long size, gfp_t gfp_mask); - extern void *vmalloc_32(unsigned long size); - extern void *vmalloc_32_user(unsigned long size); - extern void *__vmalloc(unsigned long size, gfp_t gfp_mask); -diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h -new file mode 100644 -index 000000000000..9b4e8295ed75 ---- /dev/null -+++ b/include/trace/events/bcachefs.h -@@ -0,0 +1,664 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#undef TRACE_SYSTEM -+#define TRACE_SYSTEM bcachefs -+ -+#if !defined(_TRACE_BCACHE_H) || defined(TRACE_HEADER_MULTI_READ) -+#define _TRACE_BCACHE_H -+ -+#include -+ -+DECLARE_EVENT_CLASS(bpos, -+ TP_PROTO(struct bpos *p), -+ TP_ARGS(p), -+ -+ TP_STRUCT__entry( -+ __field(u64, inode ) -+ __field(u64, offset ) -+ ), -+ -+ TP_fast_assign( -+ __entry->inode = p->inode; -+ __entry->offset = p->offset; -+ ), -+ -+ TP_printk("%llu:%llu", __entry->inode, __entry->offset) -+); -+ -+DECLARE_EVENT_CLASS(bkey, -+ TP_PROTO(const struct bkey *k), -+ TP_ARGS(k), -+ -+ TP_STRUCT__entry( -+ __field(u64, inode ) -+ __field(u64, offset ) -+ __field(u32, size ) -+ ), -+ -+ TP_fast_assign( -+ __entry->inode = k->p.inode; -+ __entry->offset = k->p.offset; -+ __entry->size = k->size; -+ ), -+ -+ TP_printk("%llu:%llu len %u", __entry->inode, -+ __entry->offset, __entry->size) -+); -+ -+DECLARE_EVENT_CLASS(bch_fs, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c), -+ -+ TP_STRUCT__entry( -+ __array(char, uuid, 16 ) -+ ), -+ -+ TP_fast_assign( -+ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); -+ ), -+ -+ TP_printk("%pU", __entry->uuid) -+); -+ -+DECLARE_EVENT_CLASS(bio, -+ TP_PROTO(struct bio *bio), -+ TP_ARGS(bio), -+ -+ TP_STRUCT__entry( -+ __field(dev_t, dev ) -+ __field(sector_t, sector ) -+ __field(unsigned int, nr_sector ) -+ __array(char, rwbs, 6 ) -+ ), -+ -+ 
TP_fast_assign( -+ __entry->dev = bio->bi_disk ? bio_dev(bio) : 0; -+ __entry->sector = bio->bi_iter.bi_sector; -+ __entry->nr_sector = bio->bi_iter.bi_size >> 9; -+ blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); -+ ), -+ -+ TP_printk("%d,%d %s %llu + %u", -+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, -+ (unsigned long long)__entry->sector, __entry->nr_sector) -+); -+ -+/* io.c: */ -+ -+DEFINE_EVENT(bio, read_split, -+ TP_PROTO(struct bio *bio), -+ TP_ARGS(bio) -+); -+ -+DEFINE_EVENT(bio, read_bounce, -+ TP_PROTO(struct bio *bio), -+ TP_ARGS(bio) -+); -+ -+DEFINE_EVENT(bio, read_retry, -+ TP_PROTO(struct bio *bio), -+ TP_ARGS(bio) -+); -+ -+DEFINE_EVENT(bio, promote, -+ TP_PROTO(struct bio *bio), -+ TP_ARGS(bio) -+); -+ -+/* Journal */ -+ -+DEFINE_EVENT(bch_fs, journal_full, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+DEFINE_EVENT(bch_fs, journal_entry_full, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+DEFINE_EVENT(bio, journal_write, -+ TP_PROTO(struct bio *bio), -+ TP_ARGS(bio) -+); -+ -+/* bset.c: */ -+ -+DEFINE_EVENT(bpos, bkey_pack_pos_fail, -+ TP_PROTO(struct bpos *p), -+ TP_ARGS(p) -+); -+ -+/* Btree */ -+ -+DECLARE_EVENT_CLASS(btree_node, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b), -+ -+ TP_STRUCT__entry( -+ __array(char, uuid, 16 ) -+ __field(u8, level ) -+ __field(u8, id ) -+ __field(u64, inode ) -+ __field(u64, offset ) -+ ), -+ -+ TP_fast_assign( -+ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); -+ __entry->level = b->c.level; -+ __entry->id = b->c.btree_id; -+ __entry->inode = b->key.k.p.inode; -+ __entry->offset = b->key.k.p.offset; -+ ), -+ -+ TP_printk("%pU %u id %u %llu:%llu", -+ __entry->uuid, __entry->level, __entry->id, -+ __entry->inode, __entry->offset) -+); -+ -+DEFINE_EVENT(btree_node, btree_read, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+TRACE_EVENT(btree_write, -+ TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors), -+ 
TP_ARGS(b, bytes, sectors), -+ -+ TP_STRUCT__entry( -+ __field(enum btree_node_type, type) -+ __field(unsigned, bytes ) -+ __field(unsigned, sectors ) -+ ), -+ -+ TP_fast_assign( -+ __entry->type = btree_node_type(b); -+ __entry->bytes = bytes; -+ __entry->sectors = sectors; -+ ), -+ -+ TP_printk("bkey type %u bytes %u sectors %u", -+ __entry->type , __entry->bytes, __entry->sectors) -+); -+ -+DEFINE_EVENT(btree_node, btree_node_alloc, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+DEFINE_EVENT(btree_node, btree_node_free, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+DEFINE_EVENT(btree_node, btree_node_reap, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+DECLARE_EVENT_CLASS(btree_node_cannibalize_lock, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c), -+ -+ TP_STRUCT__entry( -+ __array(char, uuid, 16 ) -+ ), -+ -+ TP_fast_assign( -+ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); -+ ), -+ -+ TP_printk("%pU", __entry->uuid) -+); -+ -+DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize_lock_fail, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize_lock, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+DEFINE_EVENT(bch_fs, btree_node_cannibalize_unlock, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+TRACE_EVENT(btree_reserve_get_fail, -+ TP_PROTO(struct bch_fs *c, size_t required, struct closure *cl), -+ TP_ARGS(c, required, cl), -+ -+ TP_STRUCT__entry( -+ __array(char, uuid, 16 ) -+ __field(size_t, required ) -+ __field(struct closure *, cl ) -+ ), -+ -+ TP_fast_assign( -+ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); -+ __entry->required = required; -+ __entry->cl = cl; -+ ), -+ -+ TP_printk("%pU required %zu by %p", __entry->uuid, -+ __entry->required, __entry->cl) -+); -+ 
-+TRACE_EVENT(btree_insert_key, -+ TP_PROTO(struct bch_fs *c, struct btree *b, struct bkey_i *k), -+ TP_ARGS(c, b, k), -+ -+ TP_STRUCT__entry( -+ __field(u8, id ) -+ __field(u64, inode ) -+ __field(u64, offset ) -+ __field(u32, size ) -+ ), -+ -+ TP_fast_assign( -+ __entry->id = b->c.btree_id; -+ __entry->inode = k->k.p.inode; -+ __entry->offset = k->k.p.offset; -+ __entry->size = k->k.size; -+ ), -+ -+ TP_printk("btree %u: %llu:%llu len %u", __entry->id, -+ __entry->inode, __entry->offset, __entry->size) -+); -+ -+DEFINE_EVENT(btree_node, btree_split, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+DEFINE_EVENT(btree_node, btree_compact, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+DEFINE_EVENT(btree_node, btree_merge, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+DEFINE_EVENT(btree_node, btree_set_root, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+/* Garbage collection */ -+ -+DEFINE_EVENT(btree_node, btree_gc_coalesce, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+TRACE_EVENT(btree_gc_coalesce_fail, -+ TP_PROTO(struct bch_fs *c, int reason), -+ TP_ARGS(c, reason), -+ -+ TP_STRUCT__entry( -+ __field(u8, reason ) -+ __array(char, uuid, 16 ) -+ ), -+ -+ TP_fast_assign( -+ __entry->reason = reason; -+ memcpy(__entry->uuid, c->disk_sb.sb->user_uuid.b, 16); -+ ), -+ -+ TP_printk("%pU: %u", __entry->uuid, __entry->reason) -+); -+ -+DEFINE_EVENT(btree_node, btree_gc_rewrite_node, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+DEFINE_EVENT(btree_node, btree_gc_rewrite_node_fail, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+DEFINE_EVENT(bch_fs, gc_start, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+DEFINE_EVENT(bch_fs, gc_end, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+DEFINE_EVENT(bch_fs, gc_coalesce_start, -+ TP_PROTO(struct bch_fs *c), -+ 
TP_ARGS(c) -+); -+ -+DEFINE_EVENT(bch_fs, gc_coalesce_end, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+DEFINE_EVENT(bch_fs, gc_cannot_inc_gens, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+/* Allocator */ -+ -+TRACE_EVENT(alloc_batch, -+ TP_PROTO(struct bch_dev *ca, size_t free, size_t total), -+ TP_ARGS(ca, free, total), -+ -+ TP_STRUCT__entry( -+ __array(char, uuid, 16 ) -+ __field(size_t, free ) -+ __field(size_t, total ) -+ ), -+ -+ TP_fast_assign( -+ memcpy(__entry->uuid, ca->uuid.b, 16); -+ __entry->free = free; -+ __entry->total = total; -+ ), -+ -+ TP_printk("%pU free %zu total %zu", -+ __entry->uuid, __entry->free, __entry->total) -+); -+ -+TRACE_EVENT(invalidate, -+ TP_PROTO(struct bch_dev *ca, u64 offset, unsigned sectors), -+ TP_ARGS(ca, offset, sectors), -+ -+ TP_STRUCT__entry( -+ __field(unsigned, sectors ) -+ __field(dev_t, dev ) -+ __field(__u64, offset ) -+ ), -+ -+ TP_fast_assign( -+ __entry->dev = ca->disk_sb.bdev->bd_dev; -+ __entry->offset = offset, -+ __entry->sectors = sectors; -+ ), -+ -+ TP_printk("invalidated %u sectors at %d,%d sector=%llu", -+ __entry->sectors, MAJOR(__entry->dev), -+ MINOR(__entry->dev), __entry->offset) -+); -+ -+DEFINE_EVENT(bch_fs, rescale_prios, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+DECLARE_EVENT_CLASS(bucket_alloc, -+ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), -+ TP_ARGS(ca, reserve), -+ -+ TP_STRUCT__entry( -+ __array(char, uuid, 16) -+ __field(enum alloc_reserve, reserve ) -+ ), -+ -+ TP_fast_assign( -+ memcpy(__entry->uuid, ca->uuid.b, 16); -+ __entry->reserve = reserve; -+ ), -+ -+ TP_printk("%pU reserve %d", __entry->uuid, __entry->reserve) -+); -+ -+DEFINE_EVENT(bucket_alloc, bucket_alloc, -+ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), -+ TP_ARGS(ca, reserve) -+); -+ -+DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, -+ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), -+ TP_ARGS(ca, reserve) -+); -+ -+DEFINE_EVENT(bucket_alloc, 
open_bucket_alloc_fail, -+ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), -+ TP_ARGS(ca, reserve) -+); -+ -+/* Moving IO */ -+ -+DEFINE_EVENT(bkey, move_extent, -+ TP_PROTO(const struct bkey *k), -+ TP_ARGS(k) -+); -+ -+DEFINE_EVENT(bkey, move_alloc_fail, -+ TP_PROTO(const struct bkey *k), -+ TP_ARGS(k) -+); -+ -+DEFINE_EVENT(bkey, move_race, -+ TP_PROTO(const struct bkey *k), -+ TP_ARGS(k) -+); -+ -+TRACE_EVENT(move_data, -+ TP_PROTO(struct bch_fs *c, u64 sectors_moved, -+ u64 keys_moved), -+ TP_ARGS(c, sectors_moved, keys_moved), -+ -+ TP_STRUCT__entry( -+ __array(char, uuid, 16 ) -+ __field(u64, sectors_moved ) -+ __field(u64, keys_moved ) -+ ), -+ -+ TP_fast_assign( -+ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); -+ __entry->sectors_moved = sectors_moved; -+ __entry->keys_moved = keys_moved; -+ ), -+ -+ TP_printk("%pU sectors_moved %llu keys_moved %llu", -+ __entry->uuid, __entry->sectors_moved, __entry->keys_moved) -+); -+ -+TRACE_EVENT(copygc, -+ TP_PROTO(struct bch_fs *c, -+ u64 sectors_moved, u64 sectors_not_moved, -+ u64 buckets_moved, u64 buckets_not_moved), -+ TP_ARGS(c, -+ sectors_moved, sectors_not_moved, -+ buckets_moved, buckets_not_moved), -+ -+ TP_STRUCT__entry( -+ __array(char, uuid, 16 ) -+ __field(u64, sectors_moved ) -+ __field(u64, sectors_not_moved ) -+ __field(u64, buckets_moved ) -+ __field(u64, buckets_not_moved ) -+ ), -+ -+ TP_fast_assign( -+ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); -+ __entry->sectors_moved = sectors_moved; -+ __entry->sectors_not_moved = sectors_not_moved; -+ __entry->buckets_moved = buckets_moved; -+ __entry->buckets_not_moved = buckets_moved; -+ ), -+ -+ TP_printk("%pU sectors moved %llu remain %llu buckets moved %llu remain %llu", -+ __entry->uuid, -+ __entry->sectors_moved, __entry->sectors_not_moved, -+ __entry->buckets_moved, __entry->buckets_not_moved) -+); -+ -+TRACE_EVENT(transaction_restart_ip, -+ TP_PROTO(unsigned long caller, unsigned long ip), -+ TP_ARGS(caller, ip), -+ -+ 
TP_STRUCT__entry( -+ __field(unsigned long, caller ) -+ __field(unsigned long, ip ) -+ ), -+ -+ TP_fast_assign( -+ __entry->caller = caller; -+ __entry->ip = ip; -+ ), -+ -+ TP_printk("%pF %pF", (void *) __entry->caller, (void *) __entry->ip) -+); -+ -+DECLARE_EVENT_CLASS(transaction_restart, -+ TP_PROTO(unsigned long ip), -+ TP_ARGS(ip), -+ -+ TP_STRUCT__entry( -+ __field(unsigned long, ip ) -+ ), -+ -+ TP_fast_assign( -+ __entry->ip = ip; -+ ), -+ -+ TP_printk("%pf", (void *) __entry->ip) -+); -+ -+DEFINE_EVENT(transaction_restart, trans_restart_btree_node_reused, -+ TP_PROTO(unsigned long ip), -+ TP_ARGS(ip) -+); -+ -+DEFINE_EVENT(transaction_restart, trans_restart_would_deadlock, -+ TP_PROTO(unsigned long ip), -+ TP_ARGS(ip) -+); -+ -+TRACE_EVENT(trans_restart_iters_realloced, -+ TP_PROTO(unsigned long ip, unsigned nr), -+ TP_ARGS(ip, nr), -+ -+ TP_STRUCT__entry( -+ __field(unsigned long, ip ) -+ __field(unsigned, nr ) -+ ), -+ -+ TP_fast_assign( -+ __entry->ip = ip; -+ __entry->nr = nr; -+ ), -+ -+ TP_printk("%pf nr %u", (void *) __entry->ip, __entry->nr) -+); -+ -+TRACE_EVENT(trans_restart_mem_realloced, -+ TP_PROTO(unsigned long ip, unsigned long bytes), -+ TP_ARGS(ip, bytes), -+ -+ TP_STRUCT__entry( -+ __field(unsigned long, ip ) -+ __field(unsigned long, bytes ) -+ ), -+ -+ TP_fast_assign( -+ __entry->ip = ip; -+ __entry->bytes = bytes; -+ ), -+ -+ TP_printk("%pf bytes %lu", (void *) __entry->ip, __entry->bytes) -+); -+ -+DEFINE_EVENT(transaction_restart, trans_restart_journal_res_get, -+ TP_PROTO(unsigned long ip), -+ TP_ARGS(ip) -+); -+ -+DEFINE_EVENT(transaction_restart, trans_restart_journal_preres_get, -+ TP_PROTO(unsigned long ip), -+ TP_ARGS(ip) -+); -+ -+DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas, -+ TP_PROTO(unsigned long ip), -+ TP_ARGS(ip) -+); -+ -+DEFINE_EVENT(transaction_restart, trans_restart_fault_inject, -+ TP_PROTO(unsigned long ip), -+ TP_ARGS(ip) -+); -+ -+DEFINE_EVENT(transaction_restart, 
trans_restart_btree_node_split, -+ TP_PROTO(unsigned long ip), -+ TP_ARGS(ip) -+); -+ -+DEFINE_EVENT(transaction_restart, trans_restart_mark, -+ TP_PROTO(unsigned long ip), -+ TP_ARGS(ip) -+); -+ -+DEFINE_EVENT(transaction_restart, trans_restart_upgrade, -+ TP_PROTO(unsigned long ip), -+ TP_ARGS(ip) -+); -+ -+DEFINE_EVENT(transaction_restart, trans_restart_iter_upgrade, -+ TP_PROTO(unsigned long ip), -+ TP_ARGS(ip) -+); -+ -+DEFINE_EVENT(transaction_restart, trans_restart_traverse, -+ TP_PROTO(unsigned long ip), -+ TP_ARGS(ip) -+); -+ -+DEFINE_EVENT(transaction_restart, trans_restart_atomic, -+ TP_PROTO(unsigned long ip), -+ TP_ARGS(ip) -+); -+ -+DECLARE_EVENT_CLASS(node_lock_fail, -+ TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), -+ TP_ARGS(level, iter_seq, node, node_seq), -+ -+ TP_STRUCT__entry( -+ __field(u32, level) -+ __field(u32, iter_seq) -+ __field(u32, node) -+ __field(u32, node_seq) -+ ), -+ -+ TP_fast_assign( -+ __entry->level = level; -+ __entry->iter_seq = iter_seq; -+ __entry->node = node; -+ __entry->node_seq = node_seq; -+ ), -+ -+ TP_printk("level %u iter seq %u node %u node seq %u", -+ __entry->level, __entry->iter_seq, -+ __entry->node, __entry->node_seq) -+); -+ -+DEFINE_EVENT(node_lock_fail, node_upgrade_fail, -+ TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), -+ TP_ARGS(level, iter_seq, node, node_seq) -+); -+ -+DEFINE_EVENT(node_lock_fail, node_relock_fail, -+ TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), -+ TP_ARGS(level, iter_seq, node, node_seq) -+); -+ -+#endif /* _TRACE_BCACHE_H */ -+ -+/* This part must be outside protection */ -+#include -diff --git a/init/init_task.c b/init/init_task.c -index 15089d15010a..61d969e94569 100644 ---- a/init/init_task.c -+++ b/init/init_task.c -@@ -83,6 +83,7 @@ struct task_struct init_task - .nr_cpus_allowed= NR_CPUS, - .mm = NULL, - .active_mm = &init_mm, -+ .faults_disabled_mapping = NULL, - .restart_block = { - .fn = 
do_no_restart_syscall, - }, -diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks -index 3de8fd11873b..ab8aa082ce56 100644 ---- a/kernel/Kconfig.locks -+++ b/kernel/Kconfig.locks -@@ -259,3 +259,6 @@ config ARCH_HAS_MMIOWB - config MMIOWB - def_bool y if ARCH_HAS_MMIOWB - depends on SMP -+ -+config SIXLOCKS -+ bool -diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile -index 6d11cfb9b41f..4c13937e8f37 100644 ---- a/kernel/locking/Makefile -+++ b/kernel/locking/Makefile -@@ -32,3 +32,4 @@ obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o - obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o - obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o - obj-$(CONFIG_LOCK_EVENT_COUNTS) += lock_events.o -+obj-$(CONFIG_SIXLOCKS) += six.o -diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h -index baca699b94e9..4abb462d914d 100644 ---- a/kernel/locking/lockdep_internals.h -+++ b/kernel/locking/lockdep_internals.h -@@ -96,7 +96,7 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = - #else - #define MAX_LOCKDEP_ENTRIES 32768UL - --#define MAX_LOCKDEP_CHAINS_BITS 16 -+#define MAX_LOCKDEP_CHAINS_BITS 18 - - /* - * Stack-trace: tightly packed array of stack backtrace -@@ -114,7 +114,7 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = - - #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) - --#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) -+#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*10) - - extern struct list_head all_lock_classes; - extern struct lock_chain lock_chains[]; -diff --git a/kernel/locking/six.c b/kernel/locking/six.c -new file mode 100644 -index 000000000000..49d46ed2e18e ---- /dev/null -+++ b/kernel/locking/six.c -@@ -0,0 +1,553 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#ifdef DEBUG -+#define EBUG_ON(cond) BUG_ON(cond) -+#else -+#define EBUG_ON(cond) do {} while (0) -+#endif -+ -+#define six_acquire(l, t) 
lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_) -+#define six_release(l) lock_release(l, _RET_IP_) -+ -+struct six_lock_vals { -+ /* Value we add to the lock in order to take the lock: */ -+ u64 lock_val; -+ -+ /* If the lock has this value (used as a mask), taking the lock fails: */ -+ u64 lock_fail; -+ -+ /* Value we add to the lock in order to release the lock: */ -+ u64 unlock_val; -+ -+ /* Mask that indicates lock is held for this type: */ -+ u64 held_mask; -+ -+ /* Waitlist we wakeup when releasing the lock: */ -+ enum six_lock_type unlock_wakeup; -+}; -+ -+#define __SIX_LOCK_HELD_read __SIX_VAL(read_lock, ~0) -+#define __SIX_LOCK_HELD_intent __SIX_VAL(intent_lock, ~0) -+#define __SIX_LOCK_HELD_write __SIX_VAL(seq, 1) -+ -+#define LOCK_VALS { \ -+ [SIX_LOCK_read] = { \ -+ .lock_val = __SIX_VAL(read_lock, 1), \ -+ .lock_fail = __SIX_LOCK_HELD_write, \ -+ .unlock_val = -__SIX_VAL(read_lock, 1), \ -+ .held_mask = __SIX_LOCK_HELD_read, \ -+ .unlock_wakeup = SIX_LOCK_write, \ -+ }, \ -+ [SIX_LOCK_intent] = { \ -+ .lock_val = __SIX_VAL(intent_lock, 1), \ -+ .lock_fail = __SIX_LOCK_HELD_intent, \ -+ .unlock_val = -__SIX_VAL(intent_lock, 1), \ -+ .held_mask = __SIX_LOCK_HELD_intent, \ -+ .unlock_wakeup = SIX_LOCK_intent, \ -+ }, \ -+ [SIX_LOCK_write] = { \ -+ .lock_val = __SIX_VAL(seq, 1), \ -+ .lock_fail = __SIX_LOCK_HELD_read, \ -+ .unlock_val = __SIX_VAL(seq, 1), \ -+ .held_mask = __SIX_LOCK_HELD_write, \ -+ .unlock_wakeup = SIX_LOCK_read, \ -+ }, \ -+} -+ -+static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type, -+ union six_lock_state old) -+{ -+ if (type != SIX_LOCK_intent) -+ return; -+ -+ if (!old.intent_lock) { -+ EBUG_ON(lock->owner); -+ lock->owner = current; -+ } else { -+ EBUG_ON(lock->owner != current); -+ } -+} -+ -+static __always_inline bool do_six_trylock_type(struct six_lock *lock, -+ enum six_lock_type type) -+{ -+ const struct six_lock_vals l[] = LOCK_VALS; -+ union six_lock_state old; -+ u64 v = READ_ONCE(lock->state.v); -+ 
-+ EBUG_ON(type == SIX_LOCK_write && lock->owner != current); -+ -+ do { -+ old.v = v; -+ -+ EBUG_ON(type == SIX_LOCK_write && -+ ((old.v & __SIX_LOCK_HELD_write) || -+ !(old.v & __SIX_LOCK_HELD_intent))); -+ -+ if (old.v & l[type].lock_fail) -+ return false; -+ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, -+ old.v, -+ old.v + l[type].lock_val)) != old.v); -+ -+ six_set_owner(lock, type, old); -+ return true; -+} -+ -+__always_inline __flatten -+static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type) -+{ -+ if (!do_six_trylock_type(lock, type)) -+ return false; -+ -+ if (type != SIX_LOCK_write) -+ six_acquire(&lock->dep_map, 1); -+ return true; -+} -+ -+__always_inline __flatten -+static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type, -+ unsigned seq) -+{ -+ const struct six_lock_vals l[] = LOCK_VALS; -+ union six_lock_state old; -+ u64 v = READ_ONCE(lock->state.v); -+ -+ do { -+ old.v = v; -+ -+ if (old.seq != seq || old.v & l[type].lock_fail) -+ return false; -+ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, -+ old.v, -+ old.v + l[type].lock_val)) != old.v); -+ -+ six_set_owner(lock, type, old); -+ if (type != SIX_LOCK_write) -+ six_acquire(&lock->dep_map, 1); -+ return true; -+} -+ -+struct six_lock_waiter { -+ struct list_head list; -+ struct task_struct *task; -+}; -+ -+/* This is probably up there with the more evil things I've done */ -+#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l)) -+ -+#ifdef CONFIG_LOCK_SPIN_ON_OWNER -+ -+static inline int six_can_spin_on_owner(struct six_lock *lock) -+{ -+ struct task_struct *owner; -+ int retval = 1; -+ -+ if (need_resched()) -+ return 0; -+ -+ rcu_read_lock(); -+ owner = READ_ONCE(lock->owner); -+ if (owner) -+ retval = owner->on_cpu; -+ rcu_read_unlock(); -+ /* -+ * if lock->owner is not set, the mutex owner may have just acquired -+ * it and not set the owner yet or the mutex has been released. 
-+ */ -+ return retval; -+} -+ -+static inline bool six_spin_on_owner(struct six_lock *lock, -+ struct task_struct *owner) -+{ -+ bool ret = true; -+ -+ rcu_read_lock(); -+ while (lock->owner == owner) { -+ /* -+ * Ensure we emit the owner->on_cpu, dereference _after_ -+ * checking lock->owner still matches owner. If that fails, -+ * owner might point to freed memory. If it still matches, -+ * the rcu_read_lock() ensures the memory stays valid. -+ */ -+ barrier(); -+ -+ if (!owner->on_cpu || need_resched()) { -+ ret = false; -+ break; -+ } -+ -+ cpu_relax(); -+ } -+ rcu_read_unlock(); -+ -+ return ret; -+} -+ -+static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) -+{ -+ struct task_struct *task = current; -+ -+ if (type == SIX_LOCK_write) -+ return false; -+ -+ preempt_disable(); -+ if (!six_can_spin_on_owner(lock)) -+ goto fail; -+ -+ if (!osq_lock(&lock->osq)) -+ goto fail; -+ -+ while (1) { -+ struct task_struct *owner; -+ -+ /* -+ * If there's an owner, wait for it to either -+ * release the lock or go to sleep. -+ */ -+ owner = READ_ONCE(lock->owner); -+ if (owner && !six_spin_on_owner(lock, owner)) -+ break; -+ -+ if (do_six_trylock_type(lock, type)) { -+ osq_unlock(&lock->osq); -+ preempt_enable(); -+ return true; -+ } -+ -+ /* -+ * When there's no owner, we might have preempted between the -+ * owner acquiring the lock and setting the owner field. If -+ * we're an RT task that will live-lock because we won't let -+ * the owner complete. -+ */ -+ if (!owner && (need_resched() || rt_task(task))) -+ break; -+ -+ /* -+ * The cpu_relax() call is a compiler barrier which forces -+ * everything in this loop to be re-loaded. We don't need -+ * memory barriers as we'll eventually observe the right -+ * values at the cost of a few extra spins. 
-+ */ -+ cpu_relax(); -+ } -+ -+ osq_unlock(&lock->osq); -+fail: -+ preempt_enable(); -+ -+ /* -+ * If we fell out of the spin path because of need_resched(), -+ * reschedule now, before we try-lock again. This avoids getting -+ * scheduled out right after we obtained the lock. -+ */ -+ if (need_resched()) -+ schedule(); -+ -+ return false; -+} -+ -+#else /* CONFIG_LOCK_SPIN_ON_OWNER */ -+ -+static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) -+{ -+ return false; -+} -+ -+#endif -+ -+noinline -+static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type, -+ six_lock_should_sleep_fn should_sleep_fn, void *p) -+{ -+ const struct six_lock_vals l[] = LOCK_VALS; -+ union six_lock_state old, new; -+ struct six_lock_waiter wait; -+ int ret = 0; -+ u64 v; -+ -+ ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0; -+ if (ret) -+ return ret; -+ -+ if (six_optimistic_spin(lock, type)) -+ return 0; -+ -+ lock_contended(&lock->dep_map, _RET_IP_); -+ -+ INIT_LIST_HEAD(&wait.list); -+ wait.task = current; -+ -+ while (1) { -+ set_current_state(TASK_UNINTERRUPTIBLE); -+ if (type == SIX_LOCK_write) -+ EBUG_ON(lock->owner != current); -+ else if (list_empty_careful(&wait.list)) { -+ raw_spin_lock(&lock->wait_lock); -+ list_add_tail(&wait.list, &lock->wait_list[type]); -+ raw_spin_unlock(&lock->wait_lock); -+ } -+ -+ ret = should_sleep_fn ? 
should_sleep_fn(lock, p) : 0; -+ if (ret) -+ break; -+ -+ v = READ_ONCE(lock->state.v); -+ do { -+ new.v = old.v = v; -+ -+ if (!(old.v & l[type].lock_fail)) -+ new.v += l[type].lock_val; -+ else if (!(new.waiters & (1 << type))) -+ new.waiters |= 1 << type; -+ else -+ break; /* waiting bit already set */ -+ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, -+ old.v, new.v)) != old.v); -+ -+ if (!(old.v & l[type].lock_fail)) -+ break; -+ -+ schedule(); -+ } -+ -+ if (!ret) -+ six_set_owner(lock, type, old); -+ -+ __set_current_state(TASK_RUNNING); -+ -+ if (!list_empty_careful(&wait.list)) { -+ raw_spin_lock(&lock->wait_lock); -+ list_del_init(&wait.list); -+ raw_spin_unlock(&lock->wait_lock); -+ } -+ -+ return ret; -+} -+ -+__always_inline -+static int __six_lock_type(struct six_lock *lock, enum six_lock_type type, -+ six_lock_should_sleep_fn should_sleep_fn, void *p) -+{ -+ int ret; -+ -+ if (type != SIX_LOCK_write) -+ six_acquire(&lock->dep_map, 0); -+ -+ ret = do_six_trylock_type(lock, type) ? 
0 -+ : __six_lock_type_slowpath(lock, type, should_sleep_fn, p); -+ -+ if (ret && type != SIX_LOCK_write) -+ six_release(&lock->dep_map); -+ if (!ret) -+ lock_acquired(&lock->dep_map, _RET_IP_); -+ -+ return ret; -+} -+ -+static inline void six_lock_wakeup(struct six_lock *lock, -+ union six_lock_state state, -+ unsigned waitlist_id) -+{ -+ struct list_head *wait_list = &lock->wait_list[waitlist_id]; -+ struct six_lock_waiter *w, *next; -+ -+ if (waitlist_id == SIX_LOCK_write && state.read_lock) -+ return; -+ -+ if (!(state.waiters & (1 << waitlist_id))) -+ return; -+ -+ clear_bit(waitlist_bitnr(waitlist_id), -+ (unsigned long *) &lock->state.v); -+ -+ if (waitlist_id == SIX_LOCK_write) { -+ struct task_struct *p = READ_ONCE(lock->owner); -+ -+ if (p) -+ wake_up_process(p); -+ return; -+ } -+ -+ raw_spin_lock(&lock->wait_lock); -+ -+ list_for_each_entry_safe(w, next, wait_list, list) { -+ list_del_init(&w->list); -+ -+ if (wake_up_process(w->task) && -+ waitlist_id != SIX_LOCK_read) { -+ if (!list_empty(wait_list)) -+ set_bit(waitlist_bitnr(waitlist_id), -+ (unsigned long *) &lock->state.v); -+ break; -+ } -+ } -+ -+ raw_spin_unlock(&lock->wait_lock); -+} -+ -+__always_inline __flatten -+static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type) -+{ -+ const struct six_lock_vals l[] = LOCK_VALS; -+ union six_lock_state state; -+ -+ EBUG_ON(!(lock->state.v & l[type].held_mask)); -+ EBUG_ON(type == SIX_LOCK_write && -+ !(lock->state.v & __SIX_LOCK_HELD_intent)); -+ -+ if (type != SIX_LOCK_write) -+ six_release(&lock->dep_map); -+ -+ if (type == SIX_LOCK_intent) { -+ EBUG_ON(lock->owner != current); -+ -+ if (lock->intent_lock_recurse) { -+ --lock->intent_lock_recurse; -+ return; -+ } -+ -+ lock->owner = NULL; -+ } -+ -+ state.v = atomic64_add_return_release(l[type].unlock_val, -+ &lock->state.counter); -+ six_lock_wakeup(lock, state, l[type].unlock_wakeup); -+} -+ -+#define __SIX_LOCK(type) \ -+bool six_trylock_##type(struct six_lock *lock) \ -+{ \ 
-+ return __six_trylock_type(lock, SIX_LOCK_##type); \ -+} \ -+EXPORT_SYMBOL_GPL(six_trylock_##type); \ -+ \ -+bool six_relock_##type(struct six_lock *lock, u32 seq) \ -+{ \ -+ return __six_relock_type(lock, SIX_LOCK_##type, seq); \ -+} \ -+EXPORT_SYMBOL_GPL(six_relock_##type); \ -+ \ -+int six_lock_##type(struct six_lock *lock, \ -+ six_lock_should_sleep_fn should_sleep_fn, void *p) \ -+{ \ -+ return __six_lock_type(lock, SIX_LOCK_##type, should_sleep_fn, p);\ -+} \ -+EXPORT_SYMBOL_GPL(six_lock_##type); \ -+ \ -+void six_unlock_##type(struct six_lock *lock) \ -+{ \ -+ __six_unlock_type(lock, SIX_LOCK_##type); \ -+} \ -+EXPORT_SYMBOL_GPL(six_unlock_##type); -+ -+__SIX_LOCK(read) -+__SIX_LOCK(intent) -+__SIX_LOCK(write) -+ -+#undef __SIX_LOCK -+ -+/* Convert from intent to read: */ -+void six_lock_downgrade(struct six_lock *lock) -+{ -+ six_lock_increment(lock, SIX_LOCK_read); -+ six_unlock_intent(lock); -+} -+EXPORT_SYMBOL_GPL(six_lock_downgrade); -+ -+bool six_lock_tryupgrade(struct six_lock *lock) -+{ -+ const struct six_lock_vals l[] = LOCK_VALS; -+ union six_lock_state old, new; -+ u64 v = READ_ONCE(lock->state.v); -+ -+ do { -+ new.v = old.v = v; -+ -+ EBUG_ON(!(old.v & l[SIX_LOCK_read].held_mask)); -+ -+ new.v += l[SIX_LOCK_read].unlock_val; -+ -+ if (new.v & l[SIX_LOCK_intent].lock_fail) -+ return false; -+ -+ new.v += l[SIX_LOCK_intent].lock_val; -+ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, -+ old.v, new.v)) != old.v); -+ -+ six_set_owner(lock, SIX_LOCK_intent, old); -+ six_lock_wakeup(lock, new, l[SIX_LOCK_read].unlock_wakeup); -+ -+ return true; -+} -+EXPORT_SYMBOL_GPL(six_lock_tryupgrade); -+ -+bool six_trylock_convert(struct six_lock *lock, -+ enum six_lock_type from, -+ enum six_lock_type to) -+{ -+ EBUG_ON(to == SIX_LOCK_write || from == SIX_LOCK_write); -+ -+ if (to == from) -+ return true; -+ -+ if (to == SIX_LOCK_read) { -+ six_lock_downgrade(lock); -+ return true; -+ } else { -+ return six_lock_tryupgrade(lock); -+ } -+} 
-+EXPORT_SYMBOL_GPL(six_trylock_convert); -+ -+/* -+ * Increment read/intent lock count, assuming we already have it read or intent -+ * locked: -+ */ -+void six_lock_increment(struct six_lock *lock, enum six_lock_type type) -+{ -+ const struct six_lock_vals l[] = LOCK_VALS; -+ -+ EBUG_ON(type == SIX_LOCK_write); -+ six_acquire(&lock->dep_map, 0); -+ -+ /* XXX: assert already locked, and that we don't overflow: */ -+ -+ switch (type) { -+ case SIX_LOCK_read: -+ atomic64_add(l[type].lock_val, &lock->state.counter); -+ break; -+ case SIX_LOCK_intent: -+ lock->intent_lock_recurse++; -+ break; -+ case SIX_LOCK_write: -+ BUG(); -+ break; -+ } -+} -+EXPORT_SYMBOL_GPL(six_lock_increment); -+ -+void six_lock_wakeup_all(struct six_lock *lock) -+{ -+ struct six_lock_waiter *w; -+ -+ raw_spin_lock(&lock->wait_lock); -+ -+ list_for_each_entry(w, &lock->wait_list[0], list) -+ wake_up_process(w->task); -+ list_for_each_entry(w, &lock->wait_list[1], list) -+ wake_up_process(w->task); -+ -+ raw_spin_unlock(&lock->wait_lock); -+} -+EXPORT_SYMBOL_GPL(six_lock_wakeup_all); -diff --git a/kernel/module.c b/kernel/module.c -index aa183c9ac0a2..fdfe519a0393 100644 ---- a/kernel/module.c -+++ b/kernel/module.c -@@ -2786,9 +2786,7 @@ static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug) - - void * __weak module_alloc(unsigned long size) - { -- return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, -- GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, -- NUMA_NO_NODE, __builtin_return_address(0)); -+ return vmalloc_exec(size, GFP_KERNEL); - } - - bool __weak module_init_section(const char *name) -diff --git a/lib/Kconfig b/lib/Kconfig -index df3f3da95990..086d332ab5c8 100644 ---- a/lib/Kconfig -+++ b/lib/Kconfig -@@ -457,6 +457,9 @@ config ASSOCIATIVE_ARRAY - - for more information. 
- -+config CLOSURES -+ bool -+ - config HAS_IOMEM - bool - depends on !NO_IOMEM -diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug -index 9ad9210d70a1..51558639ee91 100644 ---- a/lib/Kconfig.debug -+++ b/lib/Kconfig.debug -@@ -1466,6 +1466,15 @@ config DEBUG_CREDENTIALS - - source "kernel/rcu/Kconfig.debug" - -+config DEBUG_CLOSURES -+ bool "Debug closures (bcache async widgits)" -+ depends on CLOSURES -+ select DEBUG_FS -+ help -+ Keeps all active closures in a linked list and provides a debugfs -+ interface to list them, which makes it possible to see asynchronous -+ operations that get stuck. -+ - config DEBUG_WQ_FORCE_RR_CPU - bool "Force round-robin CPU selection for unbound work items" - depends on DEBUG_KERNEL -diff --git a/lib/Makefile b/lib/Makefile -index b1c42c10073b..7d6921a5c823 100644 ---- a/lib/Makefile -+++ b/lib/Makefile -@@ -208,6 +208,8 @@ obj-$(CONFIG_ATOMIC64_SELFTEST) += atomic64_test.o - - obj-$(CONFIG_CPU_RMAP) += cpu_rmap.o - -+obj-$(CONFIG_CLOSURES) += closure.o -+ - obj-$(CONFIG_DQL) += dynamic_queue_limits.o - - obj-$(CONFIG_GLOB) += glob.o -diff --git a/lib/closure.c b/lib/closure.c -new file mode 100644 -index 000000000000..3e6366c26209 ---- /dev/null -+++ b/lib/closure.c -@@ -0,0 +1,214 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Asynchronous refcounty things -+ * -+ * Copyright 2010, 2011 Kent Overstreet -+ * Copyright 2012 Google, Inc. 
-+ */ -+ -+#include -+#include -+#include -+#include -+#include -+ -+static inline void closure_put_after_sub(struct closure *cl, int flags) -+{ -+ int r = flags & CLOSURE_REMAINING_MASK; -+ -+ BUG_ON(flags & CLOSURE_GUARD_MASK); -+ BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); -+ -+ if (!r) { -+ if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { -+ atomic_set(&cl->remaining, -+ CLOSURE_REMAINING_INITIALIZER); -+ closure_queue(cl); -+ } else { -+ struct closure *parent = cl->parent; -+ closure_fn *destructor = cl->fn; -+ -+ closure_debug_destroy(cl); -+ -+ if (destructor) -+ destructor(cl); -+ -+ if (parent) -+ closure_put(parent); -+ } -+ } -+} -+ -+/* For clearing flags with the same atomic op as a put */ -+void closure_sub(struct closure *cl, int v) -+{ -+ closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); -+} -+EXPORT_SYMBOL(closure_sub); -+ -+/* -+ * closure_put - decrement a closure's refcount -+ */ -+void closure_put(struct closure *cl) -+{ -+ closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); -+} -+EXPORT_SYMBOL(closure_put); -+ -+/* -+ * closure_wake_up - wake up all closures on a wait list, without memory barrier -+ */ -+void __closure_wake_up(struct closure_waitlist *wait_list) -+{ -+ struct llist_node *list; -+ struct closure *cl, *t; -+ struct llist_node *reverse = NULL; -+ -+ list = llist_del_all(&wait_list->list); -+ -+ /* We first reverse the list to preserve FIFO ordering and fairness */ -+ reverse = llist_reverse_order(list); -+ -+ /* Then do the wakeups */ -+ llist_for_each_entry_safe(cl, t, reverse, list) { -+ closure_set_waiting(cl, 0); -+ closure_sub(cl, CLOSURE_WAITING + 1); -+ } -+} -+EXPORT_SYMBOL(__closure_wake_up); -+ -+/** -+ * closure_wait - add a closure to a waitlist -+ * @waitlist: will own a ref on @cl, which will be released when -+ * closure_wake_up() is called on @waitlist. -+ * @cl: closure pointer. 
-+ * -+ */ -+bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) -+{ -+ if (atomic_read(&cl->remaining) & CLOSURE_WAITING) -+ return false; -+ -+ closure_set_waiting(cl, _RET_IP_); -+ atomic_add(CLOSURE_WAITING + 1, &cl->remaining); -+ llist_add(&cl->list, &waitlist->list); -+ -+ return true; -+} -+EXPORT_SYMBOL(closure_wait); -+ -+struct closure_syncer { -+ struct task_struct *task; -+ int done; -+}; -+ -+static void closure_sync_fn(struct closure *cl) -+{ -+ struct closure_syncer *s = cl->s; -+ struct task_struct *p; -+ -+ rcu_read_lock(); -+ p = READ_ONCE(s->task); -+ s->done = 1; -+ wake_up_process(p); -+ rcu_read_unlock(); -+} -+ -+void __sched __closure_sync(struct closure *cl) -+{ -+ struct closure_syncer s = { .task = current }; -+ -+ cl->s = &s; -+ continue_at(cl, closure_sync_fn, NULL); -+ -+ while (1) { -+ set_current_state(TASK_UNINTERRUPTIBLE); -+ if (s.done) -+ break; -+ schedule(); -+ } -+ -+ __set_current_state(TASK_RUNNING); -+} -+EXPORT_SYMBOL(__closure_sync); -+ -+#ifdef CONFIG_DEBUG_CLOSURES -+ -+static LIST_HEAD(closure_list); -+static DEFINE_SPINLOCK(closure_list_lock); -+ -+void closure_debug_create(struct closure *cl) -+{ -+ unsigned long flags; -+ -+ BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE); -+ cl->magic = CLOSURE_MAGIC_ALIVE; -+ -+ spin_lock_irqsave(&closure_list_lock, flags); -+ list_add(&cl->all, &closure_list); -+ spin_unlock_irqrestore(&closure_list_lock, flags); -+} -+EXPORT_SYMBOL(closure_debug_create); -+ -+void closure_debug_destroy(struct closure *cl) -+{ -+ unsigned long flags; -+ -+ BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE); -+ cl->magic = CLOSURE_MAGIC_DEAD; -+ -+ spin_lock_irqsave(&closure_list_lock, flags); -+ list_del(&cl->all); -+ spin_unlock_irqrestore(&closure_list_lock, flags); -+} -+EXPORT_SYMBOL(closure_debug_destroy); -+ -+static int debug_seq_show(struct seq_file *f, void *data) -+{ -+ struct closure *cl; -+ -+ spin_lock_irq(&closure_list_lock); -+ -+ list_for_each_entry(cl, &closure_list, all) { 
-+ int r = atomic_read(&cl->remaining); -+ -+ seq_printf(f, "%p: %pS -> %pS p %p r %i ", -+ cl, (void *) cl->ip, cl->fn, cl->parent, -+ r & CLOSURE_REMAINING_MASK); -+ -+ seq_printf(f, "%s%s\n", -+ test_bit(WORK_STRUCT_PENDING_BIT, -+ work_data_bits(&cl->work)) ? "Q" : "", -+ r & CLOSURE_RUNNING ? "R" : ""); -+ -+ if (r & CLOSURE_WAITING) -+ seq_printf(f, " W %pS\n", -+ (void *) cl->waiting_on); -+ -+ seq_puts(f, "\n"); -+ } -+ -+ spin_unlock_irq(&closure_list_lock); -+ return 0; -+} -+ -+static int debug_seq_open(struct inode *inode, struct file *file) -+{ -+ return single_open(file, debug_seq_show, NULL); -+} -+ -+static const struct file_operations debug_ops = { -+ .owner = THIS_MODULE, -+ .open = debug_seq_open, -+ .read = seq_read, -+ .release = single_release -+}; -+ -+static int __init closure_debug_init(void) -+{ -+ debugfs_create_file("closures", 0400, NULL, NULL, &debug_ops); -+ return 0; -+} -+late_initcall(closure_debug_init) -+ -+#endif -diff --git a/mm/filemap.c b/mm/filemap.c -index 385759c4ce4b..5ca0ff7b9357 100644 ---- a/mm/filemap.c -+++ b/mm/filemap.c -@@ -116,6 +116,69 @@ - * ->tasklist_lock (memory_failure, collect_procs_ao) - */ - -+static int page_cache_tree_insert_vec(struct page *pages[], -+ unsigned nr_pages, -+ struct address_space *mapping, -+ pgoff_t index, -+ gfp_t gfp_mask, -+ void *shadow[]) -+{ -+ XA_STATE(xas, &mapping->i_pages, index); -+ void *old; -+ int i = 0, error = 0; -+ -+ mapping_set_update(&xas, mapping); -+ -+ if (!nr_pages) -+ return 0; -+ -+ xa_lock_irq(&mapping->i_pages); -+ -+ while (1) { -+ old = xas_load(&xas); -+ if (old && !xa_is_value(old)) { -+ error = -EEXIST; -+ break; -+ } -+ -+ xas_store(&xas, pages[i]); -+ error = xas_error(&xas); -+ -+ if (error == -ENOMEM) { -+ xa_unlock_irq(&mapping->i_pages); -+ if (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK)) -+ error = 0; -+ xa_lock_irq(&mapping->i_pages); -+ -+ if (!error) -+ continue; -+ break; -+ } -+ -+ if (error) -+ break; -+ -+ if (shadow) -+ shadow[i] = old; 
-+ if (xa_is_value(old)) -+ mapping->nrexceptional--; -+ mapping->nrpages++; -+ -+ /* hugetlb pages do not participate in page cache accounting. */ -+ if (!PageHuge(pages[i])) -+ __inc_lruvec_page_state(pages[i], NR_FILE_PAGES); -+ -+ if (++i == nr_pages) -+ break; -+ -+ xas_next(&xas); -+ } -+ -+ xa_unlock_irq(&mapping->i_pages); -+ -+ return i ?: error; -+} -+ - static void page_cache_delete(struct address_space *mapping, - struct page *page, void *shadow) - { -@@ -826,114 +889,147 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) - } - EXPORT_SYMBOL_GPL(replace_page_cache_page); - --static int __add_to_page_cache_locked(struct page *page, -- struct address_space *mapping, -- pgoff_t offset, gfp_t gfp_mask, -- void **shadowp) -+static int add_to_page_cache_vec(struct page **pages, unsigned nr_pages, -+ struct address_space *mapping, -+ pgoff_t index, gfp_t gfp_mask, -+ void *shadow[]) - { -- XA_STATE(xas, &mapping->i_pages, offset); -- int huge = PageHuge(page); -- int error; -- void *old; -+ int i, nr_added = 0, error = 0; - -- VM_BUG_ON_PAGE(!PageLocked(page), page); -- VM_BUG_ON_PAGE(PageSwapBacked(page), page); -- mapping_set_update(&xas, mapping); -+ for (i = 0; i < nr_pages; i++) { -+ struct page *page = pages[i]; - -- get_page(page); -- page->mapping = mapping; -- page->index = offset; -+ VM_BUG_ON_PAGE(PageSwapBacked(page), page); -+ VM_BUG_ON_PAGE(PageSwapCache(page), page); - -- if (!huge) { -- error = mem_cgroup_charge(page, current->mm, gfp_mask); -- if (error) -- goto error; -+ __SetPageLocked(page); -+ get_page(page); -+ page->mapping = mapping; -+ page->index = index + i; -+ -+ if (!PageHuge(page)) { -+ error = mem_cgroup_charge(page, current->mm, gfp_mask); -+ if (error) { -+ page->mapping = NULL; -+ /* Leave page->index set: truncation relies upon it */ -+ put_page(page); -+ __ClearPageLocked(page); -+ if (!i) -+ return error; -+ nr_pages = i; -+ break; -+ } -+ } - } - -- do { -- xas_lock_irq(&xas); -- old = 
xas_load(&xas); -- if (old && !xa_is_value(old)) -- xas_set_err(&xas, -EEXIST); -- xas_store(&xas, page); -- if (xas_error(&xas)) -- goto unlock; -+ error = page_cache_tree_insert_vec(pages, nr_pages, mapping, -+ index, gfp_mask, shadow); -+ if (error > 0) { -+ nr_added = error; -+ error = 0; -+ } - -- if (xa_is_value(old)) { -- mapping->nrexceptional--; -- if (shadowp) -- *shadowp = old; -- } -- mapping->nrpages++; -+ for (i = 0; i < nr_added; i++) -+ trace_mm_filemap_add_to_page_cache(pages[i]); - -- /* hugetlb pages do not participate in page cache accounting */ -- if (!huge) -- __inc_lruvec_page_state(page, NR_FILE_PAGES); --unlock: -- xas_unlock_irq(&xas); -- } while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK)); -+ for (i = nr_added; i < nr_pages; i++) { -+ struct page *page = pages[i]; - -- if (xas_error(&xas)) { -- error = xas_error(&xas); -- goto error; -+ /* Leave page->index set: truncation relies upon it */ -+ page->mapping = NULL; -+ put_page(page); -+ __ClearPageLocked(page); - } - -- trace_mm_filemap_add_to_page_cache(page); -- return 0; --error: -- page->mapping = NULL; -- /* Leave page->index set: truncation relies upon it */ -- put_page(page); -- return error; -+ return nr_added ?: error; - } --ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO); - - /** -- * add_to_page_cache_locked - add a locked page to the pagecache -+ * add_to_page_cache - add a newly allocated page to the pagecache - * @page: page to add - * @mapping: the page's address_space - * @offset: page index - * @gfp_mask: page allocation mode - * -- * This function is used to add a page to the pagecache. It must be locked. -- * This function does not add the page to the LRU. The caller must do that. -+ * This function is used to add a page to the pagecache. It must be newly -+ * allocated. This function does not add the page to the LRU. The caller must -+ * do that. - * - * Return: %0 on success, negative error code otherwise. 
- */ --int add_to_page_cache_locked(struct page *page, struct address_space *mapping, -- pgoff_t offset, gfp_t gfp_mask) -+int add_to_page_cache(struct page *page, struct address_space *mapping, -+ pgoff_t offset, gfp_t gfp_mask) - { -- return __add_to_page_cache_locked(page, mapping, offset, -- gfp_mask, NULL); -+ int ret = add_to_page_cache_vec(&page, 1, mapping, offset, -+ gfp_mask, NULL); -+ if (ret < 0) -+ return ret; -+ return 0; - } --EXPORT_SYMBOL(add_to_page_cache_locked); -+EXPORT_SYMBOL(add_to_page_cache); -+ALLOW_ERROR_INJECTION(add_to_page_cache, ERRNO); - --int add_to_page_cache_lru(struct page *page, struct address_space *mapping, -- pgoff_t offset, gfp_t gfp_mask) -+int add_to_page_cache_lru_vec(struct address_space *mapping, -+ struct page **pages, -+ unsigned nr_pages, -+ pgoff_t offset, gfp_t gfp_mask) - { -- void *shadow = NULL; -- int ret; -+ void *shadow_stack[8], **shadow = shadow_stack; -+ int i, ret = 0, err = 0, nr_added; -+ -+ if (nr_pages > ARRAY_SIZE(shadow_stack)) { -+ shadow = kmalloc_array(nr_pages, sizeof(void *), gfp_mask); -+ if (!shadow) -+ goto slowpath; -+ } -+ -+ for (i = 0; i < nr_pages; i++) -+ VM_BUG_ON_PAGE(PageActive(pages[i]), pages[i]); -+ -+ ret = add_to_page_cache_vec(pages, nr_pages, mapping, -+ offset, gfp_mask, shadow); -+ nr_added = ret > 0 ? ret : 0; -+ -+ /* -+ * The page might have been evicted from cache only recently, in which -+ * case it should be activated like any other repeatedly accessed page. -+ * The exception is pages getting rewritten; evicting other data from -+ * the working set, only to cache data that will get overwritten with -+ * something else, is a waste of memory. 
-+ */ -+ for (i = 0; i < nr_added; i++) { -+ struct page *page = pages[i]; -+ void *s = shadow[i]; - -- __SetPageLocked(page); -- ret = __add_to_page_cache_locked(page, mapping, offset, -- gfp_mask, &shadow); -- if (unlikely(ret)) -- __ClearPageLocked(page); -- else { -- /* -- * The page might have been evicted from cache only -- * recently, in which case it should be activated like -- * any other repeatedly accessed page. -- * The exception is pages getting rewritten; evicting other -- * data from the working set, only to cache data that will -- * get overwritten with something else, is a waste of memory. -- */ - WARN_ON_ONCE(PageActive(page)); -- if (!(gfp_mask & __GFP_WRITE) && shadow) -- workingset_refault(page, shadow); -+ if (!(gfp_mask & __GFP_WRITE) && s) -+ workingset_refault(page, s); - lru_cache_add(page); - } -+ -+ if (shadow != shadow_stack) -+ kfree(shadow); -+ - return ret; -+slowpath: -+ for (i = 0; i < nr_pages; i++) { -+ err = add_to_page_cache_lru(pages[i], mapping, -+ offset + i, gfp_mask); -+ if (err) -+ break; -+ } -+ -+ return i ?: err; -+} -+EXPORT_SYMBOL_GPL(add_to_page_cache_lru_vec); -+ -+int add_to_page_cache_lru(struct page *page, struct address_space *mapping, -+ pgoff_t offset, gfp_t gfp_mask) -+{ -+ int ret = add_to_page_cache_lru_vec(mapping, &page, 1, offset, gfp_mask); -+ if (ret < 0) -+ return ret; -+ return 0; - } - EXPORT_SYMBOL_GPL(add_to_page_cache_lru); - -@@ -1824,6 +1920,7 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, - - return ret; - } -+EXPORT_SYMBOL(find_get_pages_range); - - /** - * find_get_pages_contig - gang contiguous pagecache lookup -@@ -1972,6 +2069,244 @@ static void shrink_readahead_size_eio(struct file_ra_state *ra) - ra->ra_pages /= 4; - } - -+static struct page * -+generic_file_buffered_read_readpage(struct file *filp, -+ struct address_space *mapping, -+ struct page *page) -+{ -+ struct file_ra_state *ra = &filp->f_ra; -+ int error; -+ -+ /* -+ * A previous I/O error may 
have been due to temporary -+ * failures, eg. multipath errors. -+ * PG_error will be set again if readpage fails. -+ */ -+ ClearPageError(page); -+ /* Start the actual read. The read will unlock the page. */ -+ error = mapping->a_ops->readpage(filp, page); -+ -+ if (unlikely(error)) { -+ put_page(page); -+ return error != AOP_TRUNCATED_PAGE ? ERR_PTR(error) : NULL; -+ } -+ -+ if (!PageUptodate(page)) { -+ error = lock_page_killable(page); -+ if (unlikely(error)) { -+ put_page(page); -+ return ERR_PTR(error); -+ } -+ if (!PageUptodate(page)) { -+ if (page->mapping == NULL) { -+ /* -+ * invalidate_mapping_pages got it -+ */ -+ unlock_page(page); -+ put_page(page); -+ return NULL; -+ } -+ unlock_page(page); -+ shrink_readahead_size_eio(ra); -+ put_page(page); -+ return ERR_PTR(-EIO); -+ } -+ unlock_page(page); -+ } -+ -+ return page; -+} -+ -+static struct page * -+generic_file_buffered_read_pagenotuptodate(struct kiocb *iocb, -+ struct file *filp, -+ struct iov_iter *iter, -+ struct page *page, -+ loff_t pos, loff_t count) -+{ -+ struct address_space *mapping = filp->f_mapping; -+ struct inode *inode = mapping->host; -+ int error; -+ -+ /* -+ * See comment in do_read_cache_page on why -+ * wait_on_page_locked is used to avoid unnecessarily -+ * serialisations and why it's safe. -+ */ -+ error = wait_on_page_locked_killable(page); -+ if (unlikely(error)) { -+ put_page(page); -+ return ERR_PTR(error); -+ } -+ -+ if (PageUptodate(page)) -+ return page; -+ -+ if (inode->i_blkbits == PAGE_SHIFT || -+ !mapping->a_ops->is_partially_uptodate) -+ goto page_not_up_to_date; -+ /* pipes can't handle partially uptodate pages */ -+ if (unlikely(iov_iter_is_pipe(iter))) -+ goto page_not_up_to_date; -+ if (!trylock_page(page)) -+ goto page_not_up_to_date; -+ /* Did it get truncated before we got the lock? 
*/ -+ if (!page->mapping) -+ goto page_not_up_to_date_locked; -+ -+ if (!mapping->a_ops->is_partially_uptodate(page, -+ pos & ~PAGE_MASK, count)) -+ goto page_not_up_to_date_locked; -+ unlock_page(page); -+ return page; -+ -+page_not_up_to_date: -+ /* Get exclusive access to the page ... */ -+ error = lock_page_killable(page); -+ if (unlikely(error)) { -+ put_page(page); -+ return ERR_PTR(error); -+ } -+ -+page_not_up_to_date_locked: -+ /* Did it get truncated before we got the lock? */ -+ if (!page->mapping) { -+ unlock_page(page); -+ put_page(page); -+ return NULL; -+ } -+ -+ /* Did somebody else fill it already? */ -+ if (PageUptodate(page)) { -+ unlock_page(page); -+ return page; -+ } -+ -+ if (iocb->ki_flags & IOCB_NOIO) { -+ unlock_page(page); -+ put_page(page); -+ return ERR_PTR(-EAGAIN); -+ } -+ -+ return generic_file_buffered_read_readpage(filp, mapping, page); -+} -+ -+static struct page * -+generic_file_buffered_read_no_cached_page(struct kiocb *iocb, -+ struct iov_iter *iter) -+{ -+ struct file *filp = iocb->ki_filp; -+ struct address_space *mapping = filp->f_mapping; -+ pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; -+ struct page *page; -+ int error; -+ -+ if (iocb->ki_flags & IOCB_NOIO) -+ return ERR_PTR(-EAGAIN); -+ -+ /* -+ * Ok, it wasn't cached, so we need to create a new -+ * page.. -+ */ -+ page = page_cache_alloc(mapping); -+ if (!page) -+ return ERR_PTR(-ENOMEM); -+ -+ error = add_to_page_cache_lru(page, mapping, index, -+ mapping_gfp_constraint(mapping, GFP_KERNEL)); -+ if (error) { -+ put_page(page); -+ return error != -EEXIST ? 
ERR_PTR(error) : NULL; -+ } -+ -+ return generic_file_buffered_read_readpage(filp, mapping, page); -+} -+ -+static int generic_file_buffered_read_get_pages(struct kiocb *iocb, -+ struct iov_iter *iter, -+ struct page **pages, -+ unsigned int nr) -+{ -+ struct file *filp = iocb->ki_filp; -+ struct address_space *mapping = filp->f_mapping; -+ struct file_ra_state *ra = &filp->f_ra; -+ pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; -+ pgoff_t last_index = (iocb->ki_pos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT; -+ int i, j, nr_got, err = 0; -+ -+ nr = min_t(unsigned long, last_index - index, nr); -+find_page: -+ if (fatal_signal_pending(current)) -+ return -EINTR; -+ -+ nr_got = find_get_pages_contig(mapping, index, nr, pages); -+ if (nr_got) -+ goto got_pages; -+ -+ if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO)) -+ return -EAGAIN; -+ -+ page_cache_sync_readahead(mapping, ra, filp, index, last_index - index); -+ -+ nr_got = find_get_pages_contig(mapping, index, nr, pages); -+ if (nr_got) -+ goto got_pages; -+ -+ pages[0] = generic_file_buffered_read_no_cached_page(iocb, iter); -+ err = PTR_ERR_OR_ZERO(pages[0]); -+ if (!IS_ERR_OR_NULL(pages[0])) -+ nr_got = 1; -+got_pages: -+ for (i = 0; i < nr_got; i++) { -+ struct page *page = pages[i]; -+ pgoff_t pg_index = index + i; -+ loff_t pg_pos = max(iocb->ki_pos, -+ (loff_t) pg_index << PAGE_SHIFT); -+ loff_t pg_count = iocb->ki_pos + iter->count - pg_pos; -+ -+ if (PageReadahead(page)) { -+ if (iocb->ki_flags & IOCB_NOIO) { -+ for (j = i; j < nr_got; j++) -+ put_page(pages[j]); -+ nr_got = i; -+ err = -EAGAIN; -+ break; -+ } -+ page_cache_async_readahead(mapping, ra, filp, page, -+ pg_index, last_index - pg_index); -+ } -+ -+ if (!PageUptodate(page)) { -+ if (iocb->ki_flags & IOCB_NOWAIT) { -+ for (j = i; j < nr_got; j++) -+ put_page(pages[j]); -+ nr_got = i; -+ err = -EAGAIN; -+ break; -+ } -+ -+ page = generic_file_buffered_read_pagenotuptodate(iocb, -+ filp, iter, page, pg_pos, pg_count); -+ if (IS_ERR_OR_NULL(page)) { 
-+ for (j = i + 1; j < nr_got; j++) -+ put_page(pages[j]); -+ nr_got = i; -+ err = PTR_ERR_OR_ZERO(page); -+ break; -+ } -+ } -+ } -+ -+ if (likely(nr_got)) -+ return nr_got; -+ if (err) -+ return err; -+ /* -+ * No pages and no error means we raced and should retry: -+ */ -+ goto find_page; -+} -+ - /** - * generic_file_buffered_read - generic file read routine - * @iocb: the iocb to read -@@ -1992,261 +2327,110 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, - struct iov_iter *iter, ssize_t written) - { - struct file *filp = iocb->ki_filp; -+ struct file_ra_state *ra = &filp->f_ra; - struct address_space *mapping = filp->f_mapping; - struct inode *inode = mapping->host; -- struct file_ra_state *ra = &filp->f_ra; -- loff_t *ppos = &iocb->ki_pos; -- pgoff_t index; -- pgoff_t last_index; -- pgoff_t prev_index; -- unsigned long offset; /* offset into pagecache page */ -- unsigned int prev_offset; -- int error = 0; -- -- if (unlikely(*ppos >= inode->i_sb->s_maxbytes)) -+ size_t orig_count = iov_iter_count(iter); -+ struct page *pages_onstack[8], **pages = NULL; -+ unsigned int nr_pages = min_t(unsigned int, 512, -+ ((iocb->ki_pos + iter->count + PAGE_SIZE - 1) >> PAGE_SHIFT) - -+ (iocb->ki_pos >> PAGE_SHIFT)); -+ int i, pg_nr, error = 0; -+ bool writably_mapped; -+ loff_t isize, end_offset; -+ -+ if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes)) - return 0; - iov_iter_truncate(iter, inode->i_sb->s_maxbytes); - -- index = *ppos >> PAGE_SHIFT; -- prev_index = ra->prev_pos >> PAGE_SHIFT; -- prev_offset = ra->prev_pos & (PAGE_SIZE-1); -- last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT; -- offset = *ppos & ~PAGE_MASK; -+ if (nr_pages > ARRAY_SIZE(pages_onstack)) -+ pages = kmalloc_array(nr_pages, sizeof(void *), GFP_KERNEL); - -- for (;;) { -- struct page *page; -- pgoff_t end_index; -- loff_t isize; -- unsigned long nr, ret; -+ if (!pages) { -+ pages = pages_onstack; -+ nr_pages = min_t(unsigned int, nr_pages, ARRAY_SIZE(pages_onstack)); -+ } 
- -+ do { - cond_resched(); --find_page: -- if (fatal_signal_pending(current)) { -- error = -EINTR; -- goto out; -- } - -- page = find_get_page(mapping, index); -- if (!page) { -- if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO)) -- goto would_block; -- page_cache_sync_readahead(mapping, -- ra, filp, -- index, last_index - index); -- page = find_get_page(mapping, index); -- if (unlikely(page == NULL)) -- goto no_cached_page; -- } -- if (PageReadahead(page)) { -- if (iocb->ki_flags & IOCB_NOIO) { -- put_page(page); -- goto out; -- } -- page_cache_async_readahead(mapping, -- ra, filp, page, -- index, last_index - index); -+ i = 0; -+ pg_nr = generic_file_buffered_read_get_pages(iocb, iter, -+ pages, nr_pages); -+ if (pg_nr < 0) { -+ error = pg_nr; -+ break; - } -- if (!PageUptodate(page)) { -- if (iocb->ki_flags & IOCB_NOWAIT) { -- put_page(page); -- goto would_block; -- } - -- /* -- * See comment in do_read_cache_page on why -- * wait_on_page_locked is used to avoid unnecessarily -- * serialisations and why it's safe. -- */ -- error = wait_on_page_locked_killable(page); -- if (unlikely(error)) -- goto readpage_error; -- if (PageUptodate(page)) -- goto page_ok; -- -- if (inode->i_blkbits == PAGE_SHIFT || -- !mapping->a_ops->is_partially_uptodate) -- goto page_not_up_to_date; -- /* pipes can't handle partially uptodate pages */ -- if (unlikely(iov_iter_is_pipe(iter))) -- goto page_not_up_to_date; -- if (!trylock_page(page)) -- goto page_not_up_to_date; -- /* Did it get truncated before we got the lock? */ -- if (!page->mapping) -- goto page_not_up_to_date_locked; -- if (!mapping->a_ops->is_partially_uptodate(page, -- offset, iter->count)) -- goto page_not_up_to_date_locked; -- unlock_page(page); -- } --page_ok: - /* -- * i_size must be checked after we know the page is Uptodate. -+ * i_size must be checked after we know the pages are Uptodate. 
- * - * Checking i_size after the check allows us to calculate - * the correct value for "nr", which means the zero-filled - * part of the page is not copied back to userspace (unless - * another truncate extends the file - this is desired though). - */ -- - isize = i_size_read(inode); -- end_index = (isize - 1) >> PAGE_SHIFT; -- if (unlikely(!isize || index > end_index)) { -- put_page(page); -- goto out; -- } -+ if (unlikely(iocb->ki_pos >= isize)) -+ goto put_pages; - -- /* nr is the maximum number of bytes to copy from this page */ -- nr = PAGE_SIZE; -- if (index == end_index) { -- nr = ((isize - 1) & ~PAGE_MASK) + 1; -- if (nr <= offset) { -- put_page(page); -- goto out; -- } -- } -- nr = nr - offset; -+ end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count); - -- /* If users can be writing to this page using arbitrary -- * virtual addresses, take care about potential aliasing -- * before reading the page on the kernel side. -- */ -- if (mapping_writably_mapped(mapping)) -- flush_dcache_page(page); -+ while ((iocb->ki_pos >> PAGE_SHIFT) + pg_nr > -+ (end_offset + PAGE_SIZE - 1) >> PAGE_SHIFT) -+ put_page(pages[--pg_nr]); - - /* -- * When a sequential read accesses a page several times, -- * only mark it as accessed the first time. -+ * Once we start copying data, we don't want to be touching any -+ * cachelines that might be contended: - */ -- if (prev_index != index || offset != prev_offset) -- mark_page_accessed(page); -- prev_index = index; -+ writably_mapped = mapping_writably_mapped(mapping); - - /* -- * Ok, we have the page, and it's up-to-date, so -- * now we can copy it to user space... -+ * When a sequential read accesses a page several times, only -+ * mark it as accessed the first time. 
- */ -+ if (iocb->ki_pos >> PAGE_SHIFT != -+ ra->prev_pos >> PAGE_SHIFT) -+ mark_page_accessed(pages[0]); -+ for (i = 1; i < pg_nr; i++) -+ mark_page_accessed(pages[i]); -+ -+ for (i = 0; i < pg_nr; i++) { -+ unsigned int offset = iocb->ki_pos & ~PAGE_MASK; -+ unsigned int bytes = min_t(loff_t, end_offset - iocb->ki_pos, -+ PAGE_SIZE - offset); -+ unsigned int copied; - -- ret = copy_page_to_iter(page, offset, nr, iter); -- offset += ret; -- index += offset >> PAGE_SHIFT; -- offset &= ~PAGE_MASK; -- prev_offset = offset; -- -- put_page(page); -- written += ret; -- if (!iov_iter_count(iter)) -- goto out; -- if (ret < nr) { -- error = -EFAULT; -- goto out; -- } -- continue; -- --page_not_up_to_date: -- /* Get exclusive access to the page ... */ -- error = lock_page_killable(page); -- if (unlikely(error)) -- goto readpage_error; -- --page_not_up_to_date_locked: -- /* Did it get truncated before we got the lock? */ -- if (!page->mapping) { -- unlock_page(page); -- put_page(page); -- continue; -- } -- -- /* Did somebody else fill it already? */ -- if (PageUptodate(page)) { -- unlock_page(page); -- goto page_ok; -- } -+ /* -+ * If users can be writing to this page using arbitrary -+ * virtual addresses, take care about potential aliasing -+ * before reading the page on the kernel side. -+ */ -+ if (writably_mapped) -+ flush_dcache_page(pages[i]); - --readpage: -- if (iocb->ki_flags & IOCB_NOIO) { -- unlock_page(page); -- put_page(page); -- goto would_block; -- } -- /* -- * A previous I/O error may have been due to temporary -- * failures, eg. multipath errors. -- * PG_error will be set again if readpage fails. -- */ -- ClearPageError(page); -- /* Start the actual read. The read will unlock the page. 
*/ -- error = mapping->a_ops->readpage(filp, page); -+ copied = copy_page_to_iter(pages[i], offset, bytes, iter); - -- if (unlikely(error)) { -- if (error == AOP_TRUNCATED_PAGE) { -- put_page(page); -- error = 0; -- goto find_page; -- } -- goto readpage_error; -- } -+ iocb->ki_pos += copied; -+ ra->prev_pos = iocb->ki_pos; - -- if (!PageUptodate(page)) { -- error = lock_page_killable(page); -- if (unlikely(error)) -- goto readpage_error; -- if (!PageUptodate(page)) { -- if (page->mapping == NULL) { -- /* -- * invalidate_mapping_pages got it -- */ -- unlock_page(page); -- put_page(page); -- goto find_page; -- } -- unlock_page(page); -- shrink_readahead_size_eio(ra); -- error = -EIO; -- goto readpage_error; -+ if (copied < bytes) { -+ error = -EFAULT; -+ break; - } -- unlock_page(page); - } -+put_pages: -+ for (i = 0; i < pg_nr; i++) -+ put_page(pages[i]); -+ } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); - -- goto page_ok; -- --readpage_error: -- /* UHHUH! A synchronous read error occurred. Report it */ -- put_page(page); -- goto out; -- --no_cached_page: -- /* -- * Ok, it wasn't cached, so we need to create a new -- * page.. -- */ -- page = page_cache_alloc(mapping); -- if (!page) { -- error = -ENOMEM; -- goto out; -- } -- error = add_to_page_cache_lru(page, mapping, index, -- mapping_gfp_constraint(mapping, GFP_KERNEL)); -- if (error) { -- put_page(page); -- if (error == -EEXIST) { -- error = 0; -- goto find_page; -- } -- goto out; -- } -- goto readpage; -- } -+ file_accessed(filp); -+ written += orig_count - iov_iter_count(iter); - --would_block: -- error = -EAGAIN; --out: -- ra->prev_pos = prev_index; -- ra->prev_pos <<= PAGE_SHIFT; -- ra->prev_pos |= prev_offset; -+ if (pages != pages_onstack) -+ kfree(pages); - -- *ppos = ((loff_t)index << PAGE_SHIFT) + offset; -- file_accessed(filp); - return written ? 
written : error; - } - EXPORT_SYMBOL_GPL(generic_file_buffered_read); -diff --git a/mm/gup.c b/mm/gup.c -index 6f47697f8fb0..ccceb6d3e367 100644 ---- a/mm/gup.c -+++ b/mm/gup.c -@@ -1108,6 +1108,13 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - } - cond_resched(); - -+ if (current->faults_disabled_mapping && -+ vma->vm_file && -+ vma->vm_file->f_mapping == current->faults_disabled_mapping) { -+ ret = -EFAULT; -+ goto out; -+ } -+ - page = follow_page_mask(vma, start, foll_flags, &ctx); - if (!page) { - ret = faultin_page(tsk, vma, start, &foll_flags, -diff --git a/mm/nommu.c b/mm/nommu.c -index f32a69095d50..f714f339e19b 100644 ---- a/mm/nommu.c -+++ b/mm/nommu.c -@@ -290,6 +290,24 @@ void *vzalloc_node(unsigned long size, int node) - } - EXPORT_SYMBOL(vzalloc_node); - -+/** -+ * vmalloc_exec - allocate virtually contiguous, executable memory -+ * @size: allocation size -+ * -+ * Kernel-internal function to allocate enough pages to cover @size -+ * the page level allocator and map them into contiguous and -+ * executable kernel virtual space. -+ * -+ * For tight control over page level allocator and protection flags -+ * use __vmalloc() instead. 
-+ */ -+ -+void *vmalloc_exec(unsigned long size, gfp_t gfp_mask) -+{ -+ return __vmalloc(size, gfp_mask); -+} -+EXPORT_SYMBOL_GPL(vmalloc_exec); -+ - /** - * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) - * @size: allocation size -diff --git a/mm/page-writeback.c b/mm/page-writeback.c -index 28b3e7a67565..2aa1e1e4c20b 100644 ---- a/mm/page-writeback.c -+++ b/mm/page-writeback.c -@@ -2477,20 +2477,19 @@ int __set_page_dirty_nobuffers(struct page *page) - lock_page_memcg(page); - if (!TestSetPageDirty(page)) { - struct address_space *mapping = page_mapping(page); -- unsigned long flags; - - if (!mapping) { - unlock_page_memcg(page); - return 1; - } - -- xa_lock_irqsave(&mapping->i_pages, flags); -+ xa_lock_irq(&mapping->i_pages); - BUG_ON(page_mapping(page) != mapping); - WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); - account_page_dirtied(page, mapping); - __xa_set_mark(&mapping->i_pages, page_index(page), - PAGECACHE_TAG_DIRTY); -- xa_unlock_irqrestore(&mapping->i_pages, flags); -+ xa_unlock_irq(&mapping->i_pages); - unlock_page_memcg(page); - - if (mapping->host) { -diff --git a/mm/vmalloc.c b/mm/vmalloc.c -index 5a2b55c8dd9a..f296b41e67f0 100644 ---- a/mm/vmalloc.c -+++ b/mm/vmalloc.c -@@ -2695,6 +2695,27 @@ void *vzalloc_node(unsigned long size, int node) - } - EXPORT_SYMBOL(vzalloc_node); - -+/** -+ * vmalloc_exec - allocate virtually contiguous, executable memory -+ * @size: allocation size -+ * -+ * Kernel-internal function to allocate enough pages to cover @size -+ * the page level allocator and map them into contiguous and -+ * executable kernel virtual space. -+ * -+ * For tight control over page level allocator and protection flags -+ * use __vmalloc() instead. 
-+ * -+ * Return: pointer to the allocated memory or %NULL on error -+ */ -+void *vmalloc_exec(unsigned long size, gfp_t gfp_mask) -+{ -+ return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, -+ gfp_mask, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, -+ NUMA_NO_NODE, __builtin_return_address(0)); -+} -+EXPORT_SYMBOL_GPL(vmalloc_exec); -+ - #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) - #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) - #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) diff --git a/linux58-tkg/linux58-tkg-patches/0009-glitched-bmq.patch b/linux58-tkg/linux58-tkg-patches/0009-glitched-bmq.patch deleted file mode 100644 index 38666e4..0000000 --- a/linux58-tkg/linux58-tkg-patches/0009-glitched-bmq.patch +++ /dev/null @@ -1,90 +0,0 @@ -From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 -From: Tk-Glitch -Date: Wed, 4 Jul 2018 04:30:08 +0200 -Subject: glitched - BMQ - -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 2a202a846757..1d9c7ed79b11 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -4,7 +4,7 @@ - - choice - prompt "Timer frequency" -- default HZ_250 -+ default HZ_500 - help - Allows the configuration of the timer frequency. It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -39,6 +39,13 @@ choice - on SMP and NUMA systems and exactly dividing by both PAL and - NTSC frame rates for video and multimedia work. - -+ config HZ_500 -+ bool "500 HZ" -+ help -+ 500 Hz is a balanced timer frequency. Provides fast interactivity -+ on desktops with great smoothness without increasing CPU power -+ consumption and sacrificing the battery life on laptops. 
-+ - config HZ_1000 - bool "1000 HZ" - help -@@ -52,6 +59,7 @@ config HZ - default 100 if HZ_100 - default 250 if HZ_250 - default 300 if HZ_300 -+ default 500 if HZ_500 - default 1000 if HZ_1000 - - config SCHED_HRTICK - -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 2a202a846757..1d9c7ed79b11 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -4,7 +4,7 @@ - - choice - prompt "Timer frequency" -- default HZ_500 -+ default HZ_750 - help - Allows the configuration of the timer frequency. It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -46,6 +46,13 @@ choice - on desktops with great smoothness without increasing CPU power - consumption and sacrificing the battery life on laptops. - -+ config HZ_750 -+ bool "750 HZ" -+ help -+ 750 Hz is a good timer frequency for desktops. Provides fast -+ interactivity with great smoothness without sacrificing too -+ much throughput. -+ - config HZ_1000 - bool "1000 HZ" - help -@@ -60,6 +67,7 @@ config HZ - default 250 if HZ_250 - default 300 if HZ_300 - default 500 if HZ_500 -+ default 750 if HZ_750 - default 1000 if HZ_1000 - - config SCHED_HRTICK - -diff --git a/mm/vmscan.c b/mm/vmscan.c -index 9270a4370d54..30d01e647417 100644 ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -159,7 +159,7 @@ struct scan_control { - /* - * From 0 .. 100. Higher means more swappy. - */ --int vm_swappiness = 60; -+int vm_swappiness = 20; - /* - * The total number of pages which are beyond the high watermark within all - * zones. 
diff --git a/linux58-tkg/linux58-tkg-patches/0009-glitched-ondemand-bmq.patch b/linux58-tkg/linux58-tkg-patches/0009-glitched-ondemand-bmq.patch deleted file mode 100644 index a926040..0000000 --- a/linux58-tkg/linux58-tkg-patches/0009-glitched-ondemand-bmq.patch +++ /dev/null @@ -1,18 +0,0 @@ -diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c -index 6b423eebfd5d..61e3271675d6 100644 ---- a/drivers/cpufreq/cpufreq_ondemand.c -+++ b/drivers/cpufreq/cpufreq_ondemand.c -@@ -21,10 +21,10 @@ - #include "cpufreq_ondemand.h" - - /* On-demand governor macros */ --#define DEF_FREQUENCY_UP_THRESHOLD (80) --#define DEF_SAMPLING_DOWN_FACTOR (1) -+#define DEF_FREQUENCY_UP_THRESHOLD (55) -+#define DEF_SAMPLING_DOWN_FACTOR (5) - #define MAX_SAMPLING_DOWN_FACTOR (100000) --#define MICRO_FREQUENCY_UP_THRESHOLD (95) -+#define MICRO_FREQUENCY_UP_THRESHOLD (63) - #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) - #define MIN_FREQUENCY_UP_THRESHOLD (1) - #define MAX_FREQUENCY_UP_THRESHOLD (100) diff --git a/linux58-tkg/linux58-tkg-patches/0009-prjc_v5.8-r3.patch b/linux58-tkg/linux58-tkg-patches/0009-prjc_v5.8-r3.patch deleted file mode 100644 index 01bffcd..0000000 --- a/linux58-tkg/linux58-tkg-patches/0009-prjc_v5.8-r3.patch +++ /dev/null @@ -1,8582 +0,0 @@ -diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index fb95fad81c79..6e3f8233600e 100644 ---- a/Documentation/admin-guide/kernel-parameters.txt -+++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -4525,6 +4525,12 @@ - - sbni= [NET] Granch SBNI12 leased line adapter - -+ sched_timeslice= -+ [KNL] Time slice in us for BMQ/PDS scheduler. -+ Format: (must be >= 1000) -+ Default: 4000 -+ See Documentation/scheduler/sched-BMQ.txt -+ - sched_debug [KNL] Enables verbose scheduler debug messages. - - schedstats= [KNL,X86] Enable or disable scheduled statistics. 
-diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst -index 83acf5025488..313d2124e709 100644 ---- a/Documentation/admin-guide/sysctl/kernel.rst -+++ b/Documentation/admin-guide/sysctl/kernel.rst -@@ -1428,3 +1428,13 @@ is 10 seconds. - - The softlockup threshold is (``2 * watchdog_thresh``). Setting this - tunable to zero will disable lockup detection altogether. -+ -+yield_type: -+=========== -+ -+BMQ/PDS CPU scheduler only. This determines what type of yield calls -+to sched_yield will perform. -+ -+ 0 - No yield. -+ 1 - Deboost and requeue task. (default) -+ 2 - Set run queue skip task. -diff --git a/Documentation/scheduler/sched-BMQ.txt b/Documentation/scheduler/sched-BMQ.txt -new file mode 100644 -index 000000000000..05c84eec0f31 ---- /dev/null -+++ b/Documentation/scheduler/sched-BMQ.txt -@@ -0,0 +1,110 @@ -+ BitMap queue CPU Scheduler -+ -------------------------- -+ -+CONTENT -+======== -+ -+ Background -+ Design -+ Overview -+ Task policy -+ Priority management -+ BitMap Queue -+ CPU Assignment and Migration -+ -+ -+Background -+========== -+ -+BitMap Queue CPU scheduler, referred to as BMQ from here on, is an evolution -+of previous Priority and Deadline based Skiplist multiple queue scheduler(PDS), -+and inspired by Zircon scheduler. The goal of it is to keep the scheduler code -+simple, while efficiency and scalable for interactive tasks, such as desktop, -+movie playback and gaming etc. -+ -+Design -+====== -+ -+Overview -+-------- -+ -+BMQ use per CPU run queue design, each CPU(logical) has it's own run queue, -+each CPU is responsible for scheduling the tasks that are putting into it's -+run queue. -+ -+The run queue is a set of priority queues. Note that these queues are fifo -+queue for non-rt tasks or priority queue for rt tasks in data structure. See -+BitMap Queue below for details. BMQ is optimized for non-rt tasks in the fact -+that most applications are non-rt tasks. 
No matter the queue is fifo or -+priority, In each queue is an ordered list of runnable tasks awaiting execution -+and the data structures are the same. When it is time for a new task to run, -+the scheduler simply looks the lowest numbered queueue that contains a task, -+and runs the first task from the head of that queue. And per CPU idle task is -+also in the run queue, so the scheduler can always find a task to run on from -+its run queue. -+ -+Each task will assigned the same timeslice(default 4ms) when it is picked to -+start running. Task will be reinserted at the end of the appropriate priority -+queue when it uses its whole timeslice. When the scheduler selects a new task -+from the priority queue it sets the CPU's preemption timer for the remainder of -+the previous timeslice. When that timer fires the scheduler will stop execution -+on that task, select another task and start over again. -+ -+If a task blocks waiting for a shared resource then it's taken out of its -+priority queue and is placed in a wait queue for the shared resource. When it -+is unblocked it will be reinserted in the appropriate priority queue of an -+eligible CPU. -+ -+Task policy -+----------- -+ -+BMQ supports DEADLINE, FIFO, RR, NORMAL, BATCH and IDLE task policy like the -+mainline CFS scheduler. But BMQ is heavy optimized for non-rt task, that's -+NORMAL/BATCH/IDLE policy tasks. Below is the implementation detail of each -+policy. -+ -+DEADLINE -+ It is squashed as priority 0 FIFO task. -+ -+FIFO/RR -+ All RT tasks share one single priority queue in BMQ run queue designed. The -+complexity of insert operation is O(n). BMQ is not designed for system runs -+with major rt policy tasks. -+ -+NORMAL/BATCH/IDLE -+ BATCH and IDLE tasks are treated as the same policy. They compete CPU with -+NORMAL policy tasks, but they just don't boost. To control the priority of -+NORMAL/BATCH/IDLE tasks, simply use nice level. -+ -+ISO -+ ISO policy is not supported in BMQ. 
Please use nice level -20 NORMAL policy -+task instead. -+ -+Priority management -+------------------- -+ -+RT tasks have priority from 0-99. For non-rt tasks, there are three different -+factors used to determine the effective priority of a task. The effective -+priority being what is used to determine which queue it will be in. -+ -+The first factor is simply the task’s static priority. Which is assigned from -+task's nice level, within [-20, 19] in userland's point of view and [0, 39] -+internally. -+ -+The second factor is the priority boost. This is a value bounded between -+[-MAX_PRIORITY_ADJ, MAX_PRIORITY_ADJ] used to offset the base priority, it is -+modified by the following cases: -+ -+*When a thread has used up its entire timeslice, always deboost its boost by -+increasing by one. -+*When a thread gives up cpu control(voluntary or non-voluntary) to reschedule, -+and its switch-in time(time after last switch and run) below the thredhold -+based on its priority boost, will boost its boost by decreasing by one buti is -+capped at 0 (won’t go negative). -+ -+The intent in this system is to ensure that interactive threads are serviced -+quickly. These are usually the threads that interact directly with the user -+and cause user-perceivable latency. These threads usually do little work and -+spend most of their time blocked awaiting another user event. So they get the -+priority boost from unblocking while background threads that do most of the -+processing receive the priority penalty for using their entire timeslice. 
-diff --git a/fs/proc/base.c b/fs/proc/base.c -index d86c0afc8a85..7f394a6fb9b6 100644 ---- a/fs/proc/base.c -+++ b/fs/proc/base.c -@@ -479,7 +479,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, - seq_puts(m, "0 0 0\n"); - else - seq_printf(m, "%llu %llu %lu\n", -- (unsigned long long)task->se.sum_exec_runtime, -+ (unsigned long long)tsk_seruntime(task), - (unsigned long long)task->sched_info.run_delay, - task->sched_info.pcount); - -diff --git a/include/asm-generic/resource.h b/include/asm-generic/resource.h -index 8874f681b056..59eb72bf7d5f 100644 ---- a/include/asm-generic/resource.h -+++ b/include/asm-generic/resource.h -@@ -23,7 +23,7 @@ - [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY }, \ - [RLIMIT_SIGPENDING] = { 0, 0 }, \ - [RLIMIT_MSGQUEUE] = { MQ_BYTES_MAX, MQ_BYTES_MAX }, \ -- [RLIMIT_NICE] = { 0, 0 }, \ -+ [RLIMIT_NICE] = { 30, 30 }, \ - [RLIMIT_RTPRIO] = { 0, 0 }, \ - [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY }, \ - } -diff --git a/include/linux/sched.h b/include/linux/sched.h -index 683372943093..d25f2501daf3 100644 ---- a/include/linux/sched.h -+++ b/include/linux/sched.h -@@ -32,6 +32,7 @@ - #include - #include - #include -+#include - - /* task_struct member predeclarations (sorted alphabetically): */ - struct audit_context; -@@ -650,12 +651,18 @@ struct task_struct { - unsigned int ptrace; - - #ifdef CONFIG_SMP -- int on_cpu; - struct __call_single_node wake_entry; -+#endif -+#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_ALT) -+ int on_cpu; -+#endif -+ -+#ifdef CONFIG_SMP - #ifdef CONFIG_THREAD_INFO_IN_TASK - /* Current CPU: */ - unsigned int cpu; - #endif -+#ifndef CONFIG_SCHED_ALT - unsigned int wakee_flips; - unsigned long wakee_flip_decay_ts; - struct task_struct *last_wakee; -@@ -669,6 +676,7 @@ struct task_struct { - */ - int recent_used_cpu; - int wake_cpu; -+#endif /* !CONFIG_SCHED_ALT */ - #endif - int on_rq; - -@@ -677,13 +685,33 @@ struct task_struct { - int normal_prio; - unsigned int 
rt_priority; - -+#ifdef CONFIG_SCHED_ALT -+ u64 last_ran; -+ s64 time_slice; -+#ifdef CONFIG_SCHED_BMQ -+ int boost_prio; -+ int bmq_idx; -+ struct list_head bmq_node; -+#endif /* CONFIG_SCHED_BMQ */ -+#ifdef CONFIG_SCHED_PDS -+ u64 deadline; -+ u64 priodl; -+ /* skip list level */ -+ int sl_level; -+ /* skip list node */ -+ struct skiplist_node sl_node; -+#endif /* CONFIG_SCHED_PDS */ -+ /* sched_clock time spent running */ -+ u64 sched_time; -+#else /* !CONFIG_SCHED_ALT */ - const struct sched_class *sched_class; - struct sched_entity se; - struct sched_rt_entity rt; -+ struct sched_dl_entity dl; -+#endif - #ifdef CONFIG_CGROUP_SCHED - struct task_group *sched_task_group; - #endif -- struct sched_dl_entity dl; - - #ifdef CONFIG_UCLAMP_TASK - /* Clamp values requested for a scheduling entity */ -@@ -1326,6 +1354,15 @@ struct task_struct { - */ - }; - -+#ifdef CONFIG_SCHED_ALT -+#define tsk_seruntime(t) ((t)->sched_time) -+/* replace the uncertian rt_timeout with 0UL */ -+#define tsk_rttimeout(t) (0UL) -+#else /* CFS */ -+#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) -+#define tsk_rttimeout(t) ((t)->rt.timeout) -+#endif /* !CONFIG_SCHED_ALT */ -+ - static inline struct pid *task_pid(struct task_struct *task) - { - return task->thread_pid; -diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h -index 1aff00b65f3c..179d77c8360e 100644 ---- a/include/linux/sched/deadline.h -+++ b/include/linux/sched/deadline.h -@@ -1,5 +1,24 @@ - /* SPDX-License-Identifier: GPL-2.0 */ - -+#ifdef CONFIG_SCHED_ALT -+ -+static inline int dl_task(struct task_struct *p) -+{ -+ return 0; -+} -+ -+#ifdef CONFIG_SCHED_BMQ -+#define __tsk_deadline(p) (0UL) -+#endif -+ -+#ifdef CONFIG_SCHED_PDS -+#define __tsk_deadline(p) ((p)->priodl) -+#endif -+ -+#else -+ -+#define __tsk_deadline(p) ((p)->dl.deadline) -+ - /* - * SCHED_DEADLINE tasks has negative priorities, reflecting - * the fact that any of them has higher prio than RT and -@@ -19,6 +38,7 @@ static inline int 
dl_task(struct task_struct *p) - { - return dl_prio(p->prio); - } -+#endif /* CONFIG_SCHED_ALT */ - - static inline bool dl_time_before(u64 a, u64 b) - { -diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h -index 7d64feafc408..42730d27ceb5 100644 ---- a/include/linux/sched/prio.h -+++ b/include/linux/sched/prio.h -@@ -20,11 +20,20 @@ - */ - - #define MAX_USER_RT_PRIO 100 -+ - #define MAX_RT_PRIO MAX_USER_RT_PRIO - - #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) - #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) - -+/* +/- priority levels from the base priority */ -+#ifdef CONFIG_SCHED_BMQ -+#define MAX_PRIORITY_ADJ 7 -+#endif -+#ifdef CONFIG_SCHED_PDS -+#define MAX_PRIORITY_ADJ 0 -+#endif -+ - /* - * Convert user-nice values [ -20 ... 0 ... 19 ] - * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], -diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h -index e5af028c08b4..0a7565d0d3cf 100644 ---- a/include/linux/sched/rt.h -+++ b/include/linux/sched/rt.h -@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) - - if (policy == SCHED_FIFO || policy == SCHED_RR) - return true; -+#ifndef CONFIG_SCHED_ALT - if (policy == SCHED_DEADLINE) - return true; -+#endif - return false; - } - -diff --git a/include/linux/skip_list.h b/include/linux/skip_list.h -new file mode 100644 -index 000000000000..47ca955a451d ---- /dev/null -+++ b/include/linux/skip_list.h -@@ -0,0 +1,177 @@ -+/* -+ * Copyright (C) 2016 Alfred Chen. -+ * -+ * Code based on Con Kolivas's skip list implementation for BFS, and -+ * which is based on example originally by William Pugh. -+ * -+ * Skip Lists are a probabilistic alternative to balanced trees, as -+ * described in the June 1990 issue of CACM and were invented by -+ * William Pugh in 1987. -+ * -+ * A couple of comments about this implementation: -+ * -+ * This file only provides a infrastructure of skip list. 
-+ * -+ * skiplist_node is embedded into container data structure, to get rid -+ * the dependency of kmalloc/kfree operation in scheduler code. -+ * -+ * A customized search function should be defined using DEFINE_SKIPLIST_INSERT -+ * macro and be used for skip list insert operation. -+ * -+ * Random Level is also not defined in this file, instead, it should be -+ * customized implemented and set to node->level then pass to the customized -+ * skiplist_insert function. -+ * -+ * Levels start at zero and go up to (NUM_SKIPLIST_LEVEL -1) -+ * -+ * NUM_SKIPLIST_LEVEL in this implementation is 8 instead of origin 16, -+ * considering that there will be 256 entries to enable the top level when using -+ * random level p=0.5, and that number is more than enough for a run queue usage -+ * in a scheduler usage. And it also help to reduce the memory usage of the -+ * embedded skip list node in task_struct to about 50%. -+ * -+ * The insertion routine has been implemented so as to use the -+ * dirty hack described in the CACM paper: if a random level is -+ * generated that is more than the current maximum level, the -+ * current maximum level plus one is used instead. -+ * -+ * BFS Notes: In this implementation of skiplists, there are bidirectional -+ * next/prev pointers and the insert function returns a pointer to the actual -+ * node the value is stored. The key here is chosen by the scheduler so as to -+ * sort tasks according to the priority list requirements and is no longer used -+ * by the scheduler after insertion. The scheduler lookup, however, occurs in -+ * O(1) time because it is always the first item in the level 0 linked list. -+ * Since the task struct stores a copy of the node pointer upon skiplist_insert, -+ * it can also remove it much faster than the original implementation with the -+ * aid of prev<->next pointer manipulation and no searching. 
-+ */ -+#ifndef _LINUX_SKIP_LIST_H -+#define _LINUX_SKIP_LIST_H -+ -+#include -+ -+#define NUM_SKIPLIST_LEVEL (8) -+ -+struct skiplist_node { -+ int level; /* Levels in this node */ -+ struct skiplist_node *next[NUM_SKIPLIST_LEVEL]; -+ struct skiplist_node *prev[NUM_SKIPLIST_LEVEL]; -+}; -+ -+#define SKIPLIST_NODE_INIT(name) { 0,\ -+ {&name, &name, &name, &name,\ -+ &name, &name, &name, &name},\ -+ {&name, &name, &name, &name,\ -+ &name, &name, &name, &name},\ -+ } -+ -+static inline void INIT_SKIPLIST_NODE(struct skiplist_node *node) -+{ -+ /* only level 0 ->next matters in skiplist_empty() */ -+ WRITE_ONCE(node->next[0], node); -+} -+ -+/** -+ * FULL_INIT_SKIPLIST_NODE -- fully init a skiplist_node, expecially for header -+ * @node: the skip list node to be inited. -+ */ -+static inline void FULL_INIT_SKIPLIST_NODE(struct skiplist_node *node) -+{ -+ int i; -+ -+ node->level = 0; -+ for (i = 0; i < NUM_SKIPLIST_LEVEL; i++) { -+ WRITE_ONCE(node->next[i], node); -+ node->prev[i] = node; -+ } -+} -+ -+/** -+ * skiplist_empty - test whether a skip list is empty -+ * @head: the skip list to test. -+ */ -+static inline int skiplist_empty(const struct skiplist_node *head) -+{ -+ return READ_ONCE(head->next[0]) == head; -+} -+ -+/** -+ * skiplist_entry - get the struct for this entry -+ * @ptr: the &struct skiplist_node pointer. -+ * @type: the type of the struct this is embedded in. -+ * @member: the name of the skiplist_node within the struct. 
-+ */ -+#define skiplist_entry(ptr, type, member) \ -+ container_of(ptr, type, member) -+ -+/** -+ * DEFINE_SKIPLIST_INSERT_FUNC -- macro to define a customized skip list insert -+ * function, which takes two parameters, first one is the header node of the -+ * skip list, second one is the skip list node to be inserted -+ * @func_name: the customized skip list insert function name -+ * @search_func: the search function to be used, which takes two parameters, -+ * 1st one is the itrator of skiplist_node in the list, the 2nd is the skip list -+ * node to be inserted, the function should return true if search should be -+ * continued, otherwise return false. -+ * Returns 1 if @node is inserted as the first item of skip list at level zero, -+ * otherwise 0 -+ */ -+#define DEFINE_SKIPLIST_INSERT_FUNC(func_name, search_func)\ -+static inline int func_name(struct skiplist_node *head, struct skiplist_node *node)\ -+{\ -+ struct skiplist_node *update[NUM_SKIPLIST_LEVEL];\ -+ struct skiplist_node *p, *q;\ -+ int k = head->level;\ -+\ -+ p = head;\ -+ do {\ -+ while (q = p->next[k], q != head && search_func(q, node))\ -+ p = q;\ -+ update[k] = p;\ -+ } while (--k >= 0);\ -+\ -+ k = node->level;\ -+ if (unlikely(k > head->level)) {\ -+ node->level = k = ++head->level;\ -+ update[k] = head;\ -+ }\ -+\ -+ do {\ -+ p = update[k];\ -+ q = p->next[k];\ -+ node->next[k] = q;\ -+ p->next[k] = node;\ -+ node->prev[k] = p;\ -+ q->prev[k] = node;\ -+ } while (--k >= 0);\ -+\ -+ return (p == head);\ -+} -+ -+/** -+ * skiplist_del_init -- delete skip list node from a skip list and reset it's -+ * init state -+ * @head: the header node of the skip list to be deleted from. -+ * @node: the skip list node to be deleted, the caller need to ensure @node is -+ * in skip list which @head represent. 
-+ * Returns 1 if @node is the first item of skip level at level zero, otherwise 0 -+ */ -+static inline int -+skiplist_del_init(struct skiplist_node *head, struct skiplist_node *node) -+{ -+ int l, m = node->level; -+ -+ for (l = 0; l <= m; l++) { -+ node->prev[l]->next[l] = node->next[l]; -+ node->next[l]->prev[l] = node->prev[l]; -+ } -+ if (m == head->level && m > 0) { -+ while (head->next[m] == head && m > 0) -+ m--; -+ head->level = m; -+ } -+ INIT_SKIPLIST_NODE(node); -+ -+ return (node->prev[0] == head); -+} -+#endif /* _LINUX_SKIP_LIST_H */ -diff --git a/init/Kconfig b/init/Kconfig -index 0498af567f70..aaa7c434eedf 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -742,9 +742,39 @@ config GENERIC_SCHED_CLOCK - - menu "Scheduler features" - -+menuconfig SCHED_ALT -+ bool "Alternative CPU Schedulers" -+ default y -+ help -+ This feature enable alternative CPU scheduler" -+ -+if SCHED_ALT -+ -+choice -+ prompt "Alternative CPU Scheduler" -+ default SCHED_BMQ -+ -+config SCHED_BMQ -+ bool "BMQ CPU scheduler" -+ help -+ The BitMap Queue CPU scheduler for excellent interactivity and -+ responsiveness on the desktop and solid scalability on normal -+ hardware and commodity servers. -+ -+config SCHED_PDS -+ bool "PDS CPU scheduler" -+ help -+ The Priority and Deadline based Skip list multiple queue CPU -+ Scheduler. -+ -+endchoice -+ -+endif -+ - config UCLAMP_TASK - bool "Enable utilization clamping for RT/FAIR tasks" - depends on CPU_FREQ_GOV_SCHEDUTIL -+ depends on !SCHED_ALT - help - This feature enables the scheduler to track the clamped utilization - of each CPU based on RUNNABLE tasks scheduled on that CPU. -@@ -830,6 +860,7 @@ config NUMA_BALANCING - depends on ARCH_SUPPORTS_NUMA_BALANCING - depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY - depends on SMP && NUMA && MIGRATION -+ depends on !SCHED_ALT - help - This option adds support for automatic NUMA aware memory/task placement. 
- The mechanism is quite primitive and is based on migrating memory when -@@ -916,7 +947,7 @@ menuconfig CGROUP_SCHED - bandwidth allocation to such task groups. It uses cgroups to group - tasks. - --if CGROUP_SCHED -+if CGROUP_SCHED && !SCHED_ALT - config FAIR_GROUP_SCHED - bool "Group scheduling for SCHED_OTHER" - depends on CGROUP_SCHED -@@ -1172,6 +1203,7 @@ config CHECKPOINT_RESTORE - - config SCHED_AUTOGROUP - bool "Automatic process group scheduling" -+ depends on !SCHED_ALT - select CGROUPS - select CGROUP_SCHED - select FAIR_GROUP_SCHED -diff --git a/init/init_task.c b/init/init_task.c -index 15089d15010a..6bc94553d79a 100644 ---- a/init/init_task.c -+++ b/init/init_task.c -@@ -74,9 +74,15 @@ struct task_struct init_task - .stack = init_stack, - .usage = REFCOUNT_INIT(2), - .flags = PF_KTHREAD, -+#ifdef CONFIG_SCHED_ALT -+ .prio = DEFAULT_PRIO + MAX_PRIORITY_ADJ, -+ .static_prio = DEFAULT_PRIO, -+ .normal_prio = DEFAULT_PRIO + MAX_PRIORITY_ADJ, -+#else - .prio = MAX_PRIO - 20, - .static_prio = MAX_PRIO - 20, - .normal_prio = MAX_PRIO - 20, -+#endif - .policy = SCHED_NORMAL, - .cpus_ptr = &init_task.cpus_mask, - .cpus_mask = CPU_MASK_ALL, -@@ -86,6 +92,19 @@ struct task_struct init_task - .restart_block = { - .fn = do_no_restart_syscall, - }, -+#ifdef CONFIG_SCHED_ALT -+#ifdef CONFIG_SCHED_BMQ -+ .boost_prio = 0, -+ .bmq_idx = 15, -+ .bmq_node = LIST_HEAD_INIT(init_task.bmq_node), -+#endif -+#ifdef CONFIG_SCHED_PDS -+ .deadline = 0, -+ .sl_level = 0, -+ .sl_node = SKIPLIST_NODE_INIT(init_task.sl_node), -+#endif -+ .time_slice = HZ, -+#else - .se = { - .group_node = LIST_HEAD_INIT(init_task.se.group_node), - }, -@@ -93,6 +112,7 @@ struct task_struct init_task - .run_list = LIST_HEAD_INIT(init_task.rt.run_list), - .time_slice = RR_TIMESLICE, - }, -+#endif - .tasks = LIST_HEAD_INIT(init_task.tasks), - #ifdef CONFIG_SMP - .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), -diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c -index 
642415b8c3c9..7e0e1fe18035 100644 ---- a/kernel/cgroup/cpuset.c -+++ b/kernel/cgroup/cpuset.c -@@ -636,7 +636,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) - return ret; - } - --#ifdef CONFIG_SMP -+#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_ALT) - /* - * Helper routine for generate_sched_domains(). - * Do cpusets a, b have overlapping effective cpus_allowed masks? -@@ -1009,7 +1009,7 @@ static void rebuild_sched_domains_locked(void) - /* Have scheduler rebuild the domains */ - partition_and_rebuild_sched_domains(ndoms, doms, attr); - } --#else /* !CONFIG_SMP */ -+#else /* !CONFIG_SMP || CONFIG_SCHED_ALT */ - static void rebuild_sched_domains_locked(void) - { - } -diff --git a/kernel/delayacct.c b/kernel/delayacct.c -index 27725754ac99..769d773c7182 100644 ---- a/kernel/delayacct.c -+++ b/kernel/delayacct.c -@@ -106,7 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) - */ - t1 = tsk->sched_info.pcount; - t2 = tsk->sched_info.run_delay; -- t3 = tsk->se.sum_exec_runtime; -+ t3 = tsk_seruntime(tsk); - - d->cpu_count += t1; - -diff --git a/kernel/exit.c b/kernel/exit.c -index 727150f28103..23ddd91a3d29 100644 ---- a/kernel/exit.c -+++ b/kernel/exit.c -@@ -121,7 +121,7 @@ static void __exit_signal(struct task_struct *tsk) - sig->curr_target = next_thread(tsk); - } - -- add_device_randomness((const void*) &tsk->se.sum_exec_runtime, -+ add_device_randomness((const void*) &tsk_seruntime(tsk), - sizeof(unsigned long long)); - - /* -@@ -142,7 +142,7 @@ static void __exit_signal(struct task_struct *tsk) - sig->inblock += task_io_get_inblock(tsk); - sig->oublock += task_io_get_oublock(tsk); - task_io_accounting_add(&sig->ioac, &tsk->ioac); -- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; -+ sig->sum_sched_runtime += tsk_seruntime(tsk); - sig->nr_threads--; - __unhash_process(tsk, group_dead); - write_sequnlock(&sig->stats_lock); -diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c 
-index f6310f848f34..4176ad070bc9 100644 ---- a/kernel/livepatch/transition.c -+++ b/kernel/livepatch/transition.c -@@ -306,7 +306,11 @@ static bool klp_try_switch_task(struct task_struct *task) - */ - rq = task_rq_lock(task, &flags); - -+#ifdef CONFIG_SCHED_ALT -+ if (task_running(task) && task != current) { -+#else - if (task_running(rq, task) && task != current) { -+#endif - snprintf(err_buf, STACK_ERR_BUF_SIZE, - "%s: %s:%d is running\n", __func__, task->comm, - task->pid); -diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c -index cfdd5b93264d..84c284eb544a 100644 ---- a/kernel/locking/rtmutex.c -+++ b/kernel/locking/rtmutex.c -@@ -227,15 +227,19 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, - * Only use with rt_mutex_waiter_{less,equal}() - */ - #define task_to_waiter(p) \ -- &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline } -+ &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = __tsk_deadline(p) } - - static inline int - rt_mutex_waiter_less(struct rt_mutex_waiter *left, - struct rt_mutex_waiter *right) - { -+#ifdef CONFIG_SCHED_PDS -+ return (left->deadline < right->deadline); -+#else - if (left->prio < right->prio) - return 1; - -+#ifndef CONFIG_SCHED_BMQ - /* - * If both waiters have dl_prio(), we check the deadlines of the - * associated tasks. -@@ -244,17 +248,23 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left, - */ - if (dl_prio(left->prio)) - return dl_time_before(left->deadline, right->deadline); -+#endif - - return 0; -+#endif - } - - static inline int - rt_mutex_waiter_equal(struct rt_mutex_waiter *left, - struct rt_mutex_waiter *right) - { -+#ifdef CONFIG_SCHED_PDS -+ return (left->deadline == right->deadline); -+#else - if (left->prio != right->prio) - return 0; - -+#ifndef CONFIG_SCHED_BMQ - /* - * If both waiters have dl_prio(), we check the deadlines of the - * associated tasks. 
-@@ -263,8 +273,10 @@ rt_mutex_waiter_equal(struct rt_mutex_waiter *left, - */ - if (dl_prio(left->prio)) - return left->deadline == right->deadline; -+#endif - - return 1; -+#endif - } - - static void -@@ -678,7 +690,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, - * the values of the node being removed. - */ - waiter->prio = task->prio; -- waiter->deadline = task->dl.deadline; -+ waiter->deadline = __tsk_deadline(task); - - rt_mutex_enqueue(lock, waiter); - -@@ -951,7 +963,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, - waiter->task = task; - waiter->lock = lock; - waiter->prio = task->prio; -- waiter->deadline = task->dl.deadline; -+ waiter->deadline = __tsk_deadline(task); - - /* Get the top priority waiter on the lock */ - if (rt_mutex_has_waiters(lock)) -diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile -index 5fc9c9b70862..eb6d7d87779f 100644 ---- a/kernel/sched/Makefile -+++ b/kernel/sched/Makefile -@@ -22,14 +22,20 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) - CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer - endif - --obj-y += core.o loadavg.o clock.o cputime.o --obj-y += idle.o fair.o rt.o deadline.o --obj-y += wait.o wait_bit.o swait.o completion.o -- --obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o -+ifdef CONFIG_SCHED_ALT -+obj-y += alt_core.o alt_debug.o -+else -+obj-y += core.o -+obj-y += fair.o rt.o deadline.o -+obj-$(CONFIG_SMP) += cpudeadline.o stop_task.o - obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o --obj-$(CONFIG_SCHEDSTATS) += stats.o - obj-$(CONFIG_SCHED_DEBUG) += debug.o -+endif -+obj-y += loadavg.o clock.o cputime.o -+obj-y += idle.o -+obj-y += wait.o wait_bit.o swait.o completion.o -+obj-$(CONFIG_SMP) += cpupri.o pelt.o topology.o -+obj-$(CONFIG_SCHEDSTATS) += stats.o - obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o - obj-$(CONFIG_CPU_FREQ) += cpufreq.o - obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o -diff --git a/kernel/sched/alt_core.c 
b/kernel/sched/alt_core.c -new file mode 100644 -index 000000000000..b469c9488d18 ---- /dev/null -+++ b/kernel/sched/alt_core.c -@@ -0,0 +1,6184 @@ -+/* -+ * kernel/sched/alt_core.c -+ * -+ * Core alternative kernel scheduler code and related syscalls -+ * -+ * Copyright (C) 1991-2002 Linus Torvalds -+ * -+ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes -+ * a whole lot of those previous things. -+ * 2017-09-06 Priority and Deadline based Skip list multiple queue kernel -+ * scheduler by Alfred Chen. -+ * 2019-02-20 BMQ(BitMap Queue) kernel scheduler by Alfred Chen. -+ */ -+#include "sched.h" -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+ -+#include -+ -+#include "../workqueue_internal.h" -+#include "../../fs/io-wq.h" -+#include "../smpboot.h" -+ -+#include "pelt.h" -+#include "smp.h" -+ -+#define CREATE_TRACE_POINTS -+#include -+ -+#define ALT_SCHED_VERSION "v5.8-r3" -+ -+/* rt_prio(prio) defined in include/linux/sched/rt.h */ -+#define rt_task(p) rt_prio((p)->prio) -+#define rt_policy(policy) ((policy) == SCHED_FIFO || (policy) == SCHED_RR) -+#define task_has_rt_policy(p) (rt_policy((p)->policy)) -+ -+#define STOP_PRIO (MAX_RT_PRIO - 1) -+ -+/* Default time slice is 4 in ms, can be set via kernel parameter "sched_timeslice" */ -+u64 sched_timeslice_ns __read_mostly = (4 * 1000 * 1000); -+ -+static int __init sched_timeslice(char *str) -+{ -+ int timeslice_us; -+ -+ get_option(&str, &timeslice_us); -+ if (timeslice_us >= 1000) -+ sched_timeslice_ns = timeslice_us * 1000; -+ -+ return 0; -+} -+early_param("sched_timeslice", sched_timeslice); -+ -+/* Reschedule if less than this many μs left */ -+#define RESCHED_NS (100 * 1000) -+ -+/** -+ * sched_yield_type - Choose what sort of yield sched_yield will perform. -+ * 0: No yield. -+ * 1: Deboost and requeue task.
(default) -+ * 2: Set rq skip task. -+ */ -+int sched_yield_type __read_mostly = 1; -+ -+#ifdef CONFIG_SMP -+static cpumask_t sched_rq_pending_mask ____cacheline_aligned_in_smp; -+ -+DEFINE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_CHK_LEVEL], sched_cpu_affinity_masks); -+DEFINE_PER_CPU(cpumask_t *, sched_cpu_affinity_end_mask); -+DEFINE_PER_CPU(cpumask_t *, sched_cpu_llc_mask); -+ -+#ifdef CONFIG_SCHED_SMT -+DEFINE_STATIC_KEY_FALSE(sched_smt_present); -+EXPORT_SYMBOL_GPL(sched_smt_present); -+#endif -+ -+/* -+ * Keep a unique ID per domain (we use the first CPUs number in the cpumask of -+ * the domain), this allows us to quickly tell if two cpus are in the same cache -+ * domain, see cpus_share_cache(). -+ */ -+DEFINE_PER_CPU(int, sd_llc_id); -+#endif /* CONFIG_SMP */ -+ -+static DEFINE_MUTEX(sched_hotcpu_mutex); -+ -+DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -+ -+#ifndef prepare_arch_switch -+# define prepare_arch_switch(next) do { } while (0) -+#endif -+#ifndef finish_arch_post_lock_switch -+# define finish_arch_post_lock_switch() do { } while (0) -+#endif -+ -+#define IDLE_WM (IDLE_TASK_SCHED_PRIO) -+ -+#ifdef CONFIG_SCHED_SMT -+static cpumask_t sched_sg_idle_mask ____cacheline_aligned_in_smp; -+#endif -+static cpumask_t sched_rq_watermark[SCHED_BITS] ____cacheline_aligned_in_smp; -+ -+#ifdef CONFIG_SCHED_BMQ -+#include "bmq_imp.h" -+#endif -+#ifdef CONFIG_SCHED_PDS -+#include "pds_imp.h" -+#endif -+ -+static inline void update_sched_rq_watermark(struct rq *rq) -+{ -+ unsigned long watermark = sched_queue_watermark(rq); -+ unsigned long last_wm = rq->watermark; -+ unsigned long i; -+ int cpu; -+ -+ /*printk(KERN_INFO "sched: watermark(%d) %d, last %d\n", -+ cpu_of(rq), watermark, last_wm);*/ -+ if (watermark == last_wm) -+ return; -+ -+ rq->watermark = watermark; -+ cpu = cpu_of(rq); -+ if (watermark < last_wm) { -+ for (i = watermark + 1; i <= last_wm; i++) -+ cpumask_andnot(&sched_rq_watermark[i], -+ &sched_rq_watermark[i], cpumask_of(cpu)); -+#ifdef 
CONFIG_SCHED_SMT -+ if (!static_branch_likely(&sched_smt_present)) -+ return; -+ if (IDLE_WM == last_wm) -+ cpumask_andnot(&sched_sg_idle_mask, -+ &sched_sg_idle_mask, cpu_smt_mask(cpu)); -+#endif -+ return; -+ } -+ /* last_wm < watermark */ -+ for (i = last_wm + 1; i <= watermark; i++) -+ cpumask_set_cpu(cpu, &sched_rq_watermark[i]); -+#ifdef CONFIG_SCHED_SMT -+ if (!static_branch_likely(&sched_smt_present)) -+ return; -+ if (IDLE_WM == watermark) { -+ cpumask_t tmp; -+ cpumask_and(&tmp, cpu_smt_mask(cpu), &sched_rq_watermark[IDLE_WM]); -+ if (cpumask_equal(&tmp, cpu_smt_mask(cpu))) -+ cpumask_or(&sched_sg_idle_mask, cpu_smt_mask(cpu), -+ &sched_sg_idle_mask); -+ } -+#endif -+} -+ -+static inline struct task_struct *rq_runnable_task(struct rq *rq) -+{ -+ struct task_struct *next = sched_rq_first_task(rq); -+ -+ if (unlikely(next == rq->skip)) -+ next = sched_rq_next_task(next, rq); -+ -+ return next; -+} -+ -+/* -+ * Context: p->pi_lock -+ */ -+static inline struct rq -+*__task_access_lock(struct task_struct *p, raw_spinlock_t **plock) -+{ -+ struct rq *rq; -+ for (;;) { -+ rq = task_rq(p); -+ if (p->on_cpu || task_on_rq_queued(p)) { -+ raw_spin_lock(&rq->lock); -+ if (likely((p->on_cpu || task_on_rq_queued(p)) -+ && rq == task_rq(p))) { -+ *plock = &rq->lock; -+ return rq; -+ } -+ raw_spin_unlock(&rq->lock); -+ } else if (task_on_rq_migrating(p)) { -+ do { -+ cpu_relax(); -+ } while (unlikely(task_on_rq_migrating(p))); -+ } else { -+ *plock = NULL; -+ return rq; -+ } -+ } -+} -+ -+static inline void -+__task_access_unlock(struct task_struct *p, raw_spinlock_t *lock) -+{ -+ if (NULL != lock) -+ raw_spin_unlock(lock); -+} -+ -+static inline struct rq -+*task_access_lock_irqsave(struct task_struct *p, raw_spinlock_t **plock, -+ unsigned long *flags) -+{ -+ struct rq *rq; -+ for (;;) { -+ rq = task_rq(p); -+ if (p->on_cpu || task_on_rq_queued(p)) { -+ raw_spin_lock_irqsave(&rq->lock, *flags); -+ if (likely((p->on_cpu || task_on_rq_queued(p)) -+ && rq == task_rq(p))) 
{ -+ *plock = &rq->lock; -+ return rq; -+ } -+ raw_spin_unlock_irqrestore(&rq->lock, *flags); -+ } else if (task_on_rq_migrating(p)) { -+ do { -+ cpu_relax(); -+ } while (unlikely(task_on_rq_migrating(p))); -+ } else { -+ raw_spin_lock_irqsave(&p->pi_lock, *flags); -+ if (likely(!p->on_cpu && !p->on_rq && -+ rq == task_rq(p))) { -+ *plock = &p->pi_lock; -+ return rq; -+ } -+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags); -+ } -+ } -+} -+ -+static inline void -+task_access_unlock_irqrestore(struct task_struct *p, raw_spinlock_t *lock, -+ unsigned long *flags) -+{ -+ raw_spin_unlock_irqrestore(lock, *flags); -+} -+ -+/* -+ * __task_rq_lock - lock the rq @p resides on. -+ */ -+struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ lockdep_assert_held(&p->pi_lock); -+ -+ for (;;) { -+ rq = task_rq(p); -+ raw_spin_lock(&rq->lock); -+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) -+ return rq; -+ raw_spin_unlock(&rq->lock); -+ -+ while (unlikely(task_on_rq_migrating(p))) -+ cpu_relax(); -+ } -+} -+ -+/* -+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. -+ */ -+struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(p->pi_lock) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ for (;;) { -+ raw_spin_lock_irqsave(&p->pi_lock, rf->flags); -+ rq = task_rq(p); -+ raw_spin_lock(&rq->lock); -+ /* -+ * move_queued_task() task_rq_lock() -+ * -+ * ACQUIRE (rq->lock) -+ * [S] ->on_rq = MIGRATING [L] rq = task_rq() -+ * WMB (__set_task_cpu()) ACQUIRE (rq->lock); -+ * [S] ->cpu = new_cpu [L] task_rq() -+ * [L] ->on_rq -+ * RELEASE (rq->lock) -+ * -+ * If we observe the old CPU in task_rq_lock(), the acquire of -+ * the old rq->lock will fully serialize against the stores. 
-+ * -+ * If we observe the new CPU in task_rq_lock(), the address -+ * dependency headed by '[L] rq = task_rq()' and the acquire -+ * will pair with the WMB to ensure we then also see migrating. -+ */ -+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { -+ return rq; -+ } -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); -+ -+ while (unlikely(task_on_rq_migrating(p))) -+ cpu_relax(); -+ } -+} -+ -+static inline void -+rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) -+ __acquires(rq->lock) -+{ -+ raw_spin_lock_irqsave(&rq->lock, rf->flags); -+} -+ -+static inline void -+rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock_irqrestore(&rq->lock, rf->flags); -+} -+ -+/* -+ * RQ-clock updating methods: -+ */ -+ -+static void update_rq_clock_task(struct rq *rq, s64 delta) -+{ -+/* -+ * In theory, the compile should just see 0 here, and optimize out the call -+ * to sched_rt_avg_update. But I don't trust it... -+ */ -+ s64 __maybe_unused steal = 0, irq_delta = 0; -+ -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; -+ -+ /* -+ * Since irq_time is only updated on {soft,}irq_exit, we might run into -+ * this case when a previous update_rq_clock() happened inside a -+ * {soft,}irq region. -+ * -+ * When this happens, we stop ->clock_task and only update the -+ * prev_irq_time stamp to account for the part that fit, so that a next -+ * update will consume the rest. This ensures ->clock_task is -+ * monotonic. -+ * -+ * It does however cause some slight miss-attribution of {soft,}irq -+ * time, a more accurate solution would be to update the irq_time using -+ * the current rq->clock timestamp, except that would require using -+ * atomic ops. 
-+ */ -+ if (irq_delta > delta) -+ irq_delta = delta; -+ -+ rq->prev_irq_time += irq_delta; -+ delta -= irq_delta; -+#endif -+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING -+ if (static_key_false((&paravirt_steal_rq_enabled))) { -+ steal = paravirt_steal_clock(cpu_of(rq)); -+ steal -= rq->prev_steal_time_rq; -+ -+ if (unlikely(steal > delta)) -+ steal = delta; -+ -+ rq->prev_steal_time_rq += steal; -+ delta -= steal; -+ } -+#endif -+ -+ rq->clock_task += delta; -+ -+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ -+ if ((irq_delta + steal)) -+ update_irq_load_avg(rq, irq_delta + steal); -+#endif -+} -+ -+static inline void update_rq_clock(struct rq *rq) -+{ -+ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; -+ -+ if (unlikely(delta <= 0)) -+ return; -+ rq->clock += delta; -+ update_rq_clock_task(rq, delta); -+} -+ -+#ifdef CONFIG_NO_HZ_FULL -+/* -+ * Tick may be needed by tasks in the runqueue depending on their policy and -+ * requirements. If tick is needed, lets send the target an IPI to kick it out -+ * of nohz mode if necessary.
-+ */ -+static inline void sched_update_tick_dependency(struct rq *rq) -+{ -+ int cpu; -+ -+ if (!tick_nohz_full_enabled()) -+ return; -+ -+ cpu = cpu_of(rq); -+ -+ if (!tick_nohz_full_cpu(cpu)) -+ return; -+ -+ if (rq->nr_running < 2) -+ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); -+ else -+ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); -+} -+#else /* !CONFIG_NO_HZ_FULL */ -+static inline void sched_update_tick_dependency(struct rq *rq) { } -+#endif -+ -+/* -+ * Add/Remove/Requeue task to/from the runqueue routines -+ * Context: rq->lock -+ */ -+static inline void dequeue_task(struct task_struct *p, struct rq *rq, int flags) -+{ -+ lockdep_assert_held(&rq->lock); -+ -+ /*printk(KERN_INFO "sched: dequeue(%d) %px %016llx\n", cpu_of(rq), p, p->priodl);*/ -+ WARN_ONCE(task_rq(p) != rq, "sched: dequeue task reside on cpu%d from cpu%d\n", -+ task_cpu(p), cpu_of(rq)); -+ -+ __SCHED_DEQUEUE_TASK(p, rq, flags, update_sched_rq_watermark(rq)); -+ --rq->nr_running; -+#ifdef CONFIG_SMP -+ if (1 == rq->nr_running) -+ cpumask_clear_cpu(cpu_of(rq), &sched_rq_pending_mask); -+#endif -+ -+ sched_update_tick_dependency(rq); -+} -+ -+static inline void enqueue_task(struct task_struct *p, struct rq *rq, int flags) -+{ -+ lockdep_assert_held(&rq->lock); -+ -+ /*printk(KERN_INFO "sched: enqueue(%d) %px %016llx\n", cpu_of(rq), p, p->priodl);*/ -+ WARN_ONCE(task_rq(p) != rq, "sched: enqueue task reside on cpu%d to cpu%d\n", -+ task_cpu(p), cpu_of(rq)); -+ -+ __SCHED_ENQUEUE_TASK(p, rq, flags); -+ update_sched_rq_watermark(rq); -+ ++rq->nr_running; -+#ifdef CONFIG_SMP -+ if (2 == rq->nr_running) -+ cpumask_set_cpu(cpu_of(rq), &sched_rq_pending_mask); -+#endif -+ -+ sched_update_tick_dependency(rq); -+ -+ /* -+ * If in_iowait is set, the code below may not trigger any cpufreq -+ * utilization updates, so do it here explicitly with the IOWAIT flag -+ * passed. 
-+ */ -+ if (p->in_iowait) -+ cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); -+} -+ -+static inline void requeue_task(struct task_struct *p, struct rq *rq) -+{ -+ lockdep_assert_held(&rq->lock); -+ /*printk(KERN_INFO "sched: requeue(%d) %px %016llx\n", cpu_of(rq), p, p->priodl);*/ -+ WARN_ONCE(task_rq(p) != rq, "sched: cpu[%d] requeue task reside on cpu%d\n", -+ cpu_of(rq), task_cpu(p)); -+ -+ __SCHED_REQUEUE_TASK(p, rq, update_sched_rq_watermark(rq)); -+} -+ -+/* -+ * cmpxchg based fetch_or, macro so it works for different integer types -+ */ -+#define fetch_or(ptr, mask) \ -+ ({ \ -+ typeof(ptr) _ptr = (ptr); \ -+ typeof(mask) _mask = (mask); \ -+ typeof(*_ptr) _old, _val = *_ptr; \ -+ \ -+ for (;;) { \ -+ _old = cmpxchg(_ptr, _val, _val | _mask); \ -+ if (_old == _val) \ -+ break; \ -+ _val = _old; \ -+ } \ -+ _old; \ -+}) -+ -+#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) -+/* -+ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, -+ * this avoids any races wrt polling state changes and thereby avoids -+ * spurious IPIs. -+ */ -+static bool set_nr_and_not_polling(struct task_struct *p) -+{ -+ struct thread_info *ti = task_thread_info(p); -+ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); -+} -+ -+/* -+ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. -+ * -+ * If this returns true, then the idle task promises to call -+ * sched_ttwu_pending() and reschedule soon. 
-+ */ -+static bool set_nr_if_polling(struct task_struct *p) -+{ -+ struct thread_info *ti = task_thread_info(p); -+ typeof(ti->flags) old, val = READ_ONCE(ti->flags); -+ -+ for (;;) { -+ if (!(val & _TIF_POLLING_NRFLAG)) -+ return false; -+ if (val & _TIF_NEED_RESCHED) -+ return true; -+ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); -+ if (old == val) -+ break; -+ val = old; -+ } -+ return true; -+} -+ -+#else -+static bool set_nr_and_not_polling(struct task_struct *p) -+{ -+ set_tsk_need_resched(p); -+ return true; -+} -+ -+#ifdef CONFIG_SMP -+static bool set_nr_if_polling(struct task_struct *p) -+{ -+ return false; -+} -+#endif -+#endif -+ -+static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) -+{ -+ struct wake_q_node *node = &task->wake_q; -+ -+ /* -+ * Atomically grab the task, if ->wake_q is !nil already it means -+ * its already queued (either by us or someone else) and will get the -+ * wakeup due to that. -+ * -+ * In order to ensure that a pending wakeup will observe our pending -+ * state, even in the failed case, an explicit smp_mb() must be used. -+ */ -+ smp_mb__before_atomic(); -+ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) -+ return false; -+ -+ /* -+ * The head is context local, there can be no concurrency. -+ */ -+ *head->lastp = node; -+ head->lastp = &node->next; -+ return true; -+} -+ -+/** -+ * wake_q_add() - queue a wakeup for 'later' waking. -+ * @head: the wake_q_head to add @task to -+ * @task: the task to queue for 'later' wakeup -+ * -+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the -+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come -+ * instantly. -+ * -+ * This function must be used as-if it were wake_up_process(); IOW the task -+ * must be ready to be woken at this location. 
-+ */ -+void wake_q_add(struct wake_q_head *head, struct task_struct *task) -+{ -+ if (__wake_q_add(head, task)) -+ get_task_struct(task); -+} -+ -+/** -+ * wake_q_add_safe() - safely queue a wakeup for 'later' waking. -+ * @head: the wake_q_head to add @task to -+ * @task: the task to queue for 'later' wakeup -+ * -+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the -+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come -+ * instantly. -+ * -+ * This function must be used as-if it were wake_up_process(); IOW the task -+ * must be ready to be woken at this location. -+ * -+ * This function is essentially a task-safe equivalent to wake_q_add(). Callers -+ * that already hold reference to @task can call the 'safe' version and trust -+ * wake_q to do the right thing depending whether or not the @task is already -+ * queued for wakeup. -+ */ -+void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) -+{ -+ if (!__wake_q_add(head, task)) -+ put_task_struct(task); -+} -+ -+void wake_up_q(struct wake_q_head *head) -+{ -+ struct wake_q_node *node = head->first; -+ -+ while (node != WAKE_Q_TAIL) { -+ struct task_struct *task; -+ -+ task = container_of(node, struct task_struct, wake_q); -+ BUG_ON(!task); -+ /* task can safely be re-inserted now: */ -+ node = node->next; -+ task->wake_q.next = NULL; -+ -+ /* -+ * wake_up_process() executes a full barrier, which pairs with -+ * the queueing in wake_q_add() so as not to miss wakeups. -+ */ -+ wake_up_process(task); -+ put_task_struct(task); -+ } -+} -+ -+/* -+ * resched_curr - mark rq's current task 'to be rescheduled now'. -+ * -+ * On UP this means the setting of the need_resched flag, on SMP it -+ * might also involve a cross-CPU call to trigger the scheduler on -+ * the target CPU. 
-+ */ -+void resched_curr(struct rq *rq) -+{ -+ struct task_struct *curr = rq->curr; -+ int cpu; -+ -+ lockdep_assert_held(&rq->lock); -+ -+ if (test_tsk_need_resched(curr)) -+ return; -+ -+ cpu = cpu_of(rq); -+ if (cpu == smp_processor_id()) { -+ set_tsk_need_resched(curr); -+ set_preempt_need_resched(); -+ return; -+ } -+ -+ if (set_nr_and_not_polling(curr)) -+ smp_send_reschedule(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); -+} -+ -+void resched_cpu(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ if (cpu_online(cpu) || cpu == smp_processor_id()) -+ resched_curr(cpu_rq(cpu)); -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+} -+ -+#ifdef CONFIG_SMP -+#ifdef CONFIG_NO_HZ_COMMON -+void nohz_balance_enter_idle(int cpu) {} -+ -+void select_nohz_load_balancer(int stop_tick) {} -+ -+void set_cpu_sd_state_idle(void) {} -+ -+/* -+ * In the semi idle case, use the nearest busy CPU for migrating timers -+ * from an idle CPU. This is good for power-savings. -+ * -+ * We don't do similar optimization for completely idle system, as -+ * selecting an idle CPU will add more delays to the timers than intended -+ * (as that CPU's timer base may not be uptodate wrt jiffies etc). 
-+ */ -+int get_nohz_timer_target(void) -+{ -+ int i, cpu = smp_processor_id(), default_cpu = -1; -+ struct cpumask *mask; -+ -+ if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) { -+ if (!idle_cpu(cpu)) -+ return cpu; -+ default_cpu = cpu; -+ } -+ -+ for (mask = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); -+ mask < per_cpu(sched_cpu_affinity_end_mask, cpu); mask++) -+ for_each_cpu_and(i, mask, housekeeping_cpumask(HK_FLAG_TIMER)) -+ if (!idle_cpu(i)) -+ return i; -+ -+ if (default_cpu == -1) -+ default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER); -+ cpu = default_cpu; -+ -+ return cpu; -+} -+ -+/* -+ * When add_timer_on() enqueues a timer into the timer wheel of an -+ * idle CPU then this timer might expire before the next timer event -+ * which is scheduled to wake up that CPU. In case of a completely -+ * idle system the next event might even be infinite time into the -+ * future. wake_up_idle_cpu() ensures that the CPU is woken up and -+ * leaves the inner idle loop so the newly added timer is taken into -+ * account when the CPU goes back to idle and evaluates the timer -+ * wheel for the next timer event. -+ */ -+static inline void wake_up_idle_cpu(int cpu) -+{ -+ if (cpu == smp_processor_id()) -+ return; -+ -+ set_tsk_need_resched(cpu_rq(cpu)->idle); -+ smp_send_reschedule(cpu); -+} -+ -+static inline bool wake_up_full_nohz_cpu(int cpu) -+{ -+ /* -+ * We just need the target to call irq_exit() and re-evaluate -+ * the next tick. The nohz full kick at least implies that. -+ * If needed we can still optimize that later with an -+ * empty IRQ. 
-+ */ -+ if (tick_nohz_full_cpu(cpu)) { -+ if (cpu != smp_processor_id() || -+ tick_nohz_tick_stopped()) -+ tick_nohz_full_kick_cpu(cpu); -+ return true; -+ } -+ -+ return false; -+} -+ -+void wake_up_nohz_cpu(int cpu) -+{ -+ if (cpu_online(cpu) && !wake_up_full_nohz_cpu(cpu)) -+ wake_up_idle_cpu(cpu); -+} -+ -+static void nohz_csd_func(void *info) -+{ -+ struct rq *rq = info; -+ int cpu = cpu_of(rq); -+ unsigned int flags; -+ -+ /* -+ * Release the rq::nohz_csd. -+ */ -+ flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu)); -+ WARN_ON(!(flags & NOHZ_KICK_MASK)); -+ -+ rq->idle_balance = idle_cpu(cpu); -+ if (rq->idle_balance && !need_resched()) { -+ rq->nohz_idle_balance = flags; -+ raise_softirq_irqoff(SCHED_SOFTIRQ); -+ } -+} -+ -+#endif /* CONFIG_NO_HZ_COMMON */ -+#endif /* CONFIG_SMP */ -+ -+static inline void check_preempt_curr(struct rq *rq) -+{ -+ if (sched_rq_first_task(rq) != rq->curr) -+ resched_curr(rq); -+} -+ -+static inline void -+rq_csd_init(struct rq *rq, call_single_data_t *csd, smp_call_func_t func) -+{ -+ csd->flags = 0; -+ csd->func = func; -+ csd->info = rq; -+} -+ -+#ifdef CONFIG_SCHED_HRTICK -+/* -+ * Use HR-timers to deliver accurate preemption points. -+ */ -+ -+static void hrtick_clear(struct rq *rq) -+{ -+ if (hrtimer_active(&rq->hrtick_timer)) -+ hrtimer_cancel(&rq->hrtick_timer); -+} -+ -+/* -+ * High-resolution timer tick. -+ * Runs from hardirq context with interrupts disabled. 
-+ */ -+static enum hrtimer_restart hrtick(struct hrtimer *timer) -+{ -+ struct rq *rq = container_of(timer, struct rq, hrtick_timer); -+ struct task_struct *p; -+ -+ WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); -+ -+ raw_spin_lock(&rq->lock); -+ p = rq->curr; -+ p->time_slice = 0; -+ resched_curr(rq); -+ raw_spin_unlock(&rq->lock); -+ -+ return HRTIMER_NORESTART; -+} -+ -+/* -+ * Use hrtick when: -+ * - enabled by features -+ * - hrtimer is actually high res -+ */ -+static inline int hrtick_enabled(struct rq *rq) -+{ -+ /** -+ * Alt schedule FW doesn't support sched_feat yet -+ if (!sched_feat(HRTICK)) -+ return 0; -+ */ -+ if (!cpu_active(cpu_of(rq))) -+ return 0; -+ return hrtimer_is_hres_active(&rq->hrtick_timer); -+} -+ -+#ifdef CONFIG_SMP -+ -+static void __hrtick_restart(struct rq *rq) -+{ -+ struct hrtimer *timer = &rq->hrtick_timer; -+ -+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); -+} -+ -+/* -+ * called from hardirq (IPI) context -+ */ -+static void __hrtick_start(void *arg) -+{ -+ struct rq *rq = arg; -+ -+ raw_spin_lock(&rq->lock); -+ __hrtick_restart(rq); -+ raw_spin_unlock(&rq->lock); -+} -+ -+/* -+ * Called to set the hrtick timer state. -+ * -+ * called with rq->lock held and irqs disabled -+ */ -+void hrtick_start(struct rq *rq, u64 delay) -+{ -+ struct hrtimer *timer = &rq->hrtick_timer; -+ ktime_t time; -+ s64 delta; -+ -+ /* -+ * Don't schedule slices shorter than 10000ns, that just -+ * doesn't make sense and can cause timer DoS. -+ */ -+ delta = max_t(s64, delay, 10000LL); -+ time = ktime_add_ns(timer->base->get_time(), delta); -+ -+ hrtimer_set_expires(timer, time); -+ -+ if (rq == this_rq()) -+ __hrtick_restart(rq); -+ else -+ smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); -+} -+ -+#else -+/* -+ * Called to set the hrtick timer state. 
-+ * -+ * called with rq->lock held and irqs disabled -+ */ -+void hrtick_start(struct rq *rq, u64 delay) -+{ -+ /* -+ * Don't schedule slices shorter than 10000ns, that just -+ * doesn't make sense. Rely on vruntime for fairness. -+ */ -+ delay = max_t(u64, delay, 10000LL); -+ hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), -+ HRTIMER_MODE_REL_PINNED_HARD); -+} -+#endif /* CONFIG_SMP */ -+ -+static void hrtick_rq_init(struct rq *rq) -+{ -+#ifdef CONFIG_SMP -+ rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start); -+#endif -+ -+ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); -+ rq->hrtick_timer.function = hrtick; -+} -+#else /* CONFIG_SCHED_HRTICK */ -+static inline int hrtick_enabled(struct rq *rq) -+{ -+ return 0; -+} -+ -+static inline void hrtick_clear(struct rq *rq) -+{ -+} -+ -+static inline void hrtick_rq_init(struct rq *rq) -+{ -+} -+#endif /* CONFIG_SCHED_HRTICK */ -+ -+static inline int normal_prio(struct task_struct *p) -+{ -+ if (task_has_rt_policy(p)) -+ return MAX_RT_PRIO - 1 - p->rt_priority; -+ -+ return p->static_prio + MAX_PRIORITY_ADJ; -+} -+ -+/* -+ * Calculate the current priority, i.e. the priority -+ * taken into account by the scheduler. This value might -+ * be boosted by RT tasks as it will be RT if the task got -+ * RT-boosted. If not then it returns p->normal_prio. -+ */ -+static int effective_prio(struct task_struct *p) -+{ -+ p->normal_prio = normal_prio(p); -+ /* -+ * If we are RT tasks or we were boosted to RT priority, -+ * keep the priority unchanged. Otherwise, update priority -+ * to the normal priority: -+ */ -+ if (!rt_prio(p->prio)) -+ return p->normal_prio; -+ return p->prio; -+} -+ -+/* -+ * activate_task - move a task to the runqueue. 
-+ * -+ * Context: rq->lock -+ */ -+static void activate_task(struct task_struct *p, struct rq *rq) -+{ -+ enqueue_task(p, rq, ENQUEUE_WAKEUP); -+ p->on_rq = TASK_ON_RQ_QUEUED; -+ cpufreq_update_util(rq, 0); -+} -+ -+/* -+ * deactivate_task - remove a task from the runqueue. -+ * -+ * Context: rq->lock -+ */ -+static inline void deactivate_task(struct task_struct *p, struct rq *rq) -+{ -+ dequeue_task(p, rq, DEQUEUE_SLEEP); -+ p->on_rq = 0; -+ cpufreq_update_util(rq, 0); -+} -+ -+static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) -+{ -+#ifdef CONFIG_SMP -+ /* -+ * After ->cpu is set up to a new value, task_access_lock(p, ...) can be -+ * successfully executed on another CPU. We must ensure that updates of -+ * per-task data have been completed by this moment. -+ */ -+ smp_wmb(); -+ -+#ifdef CONFIG_THREAD_INFO_IN_TASK -+ WRITE_ONCE(p->cpu, cpu); -+#else -+ WRITE_ONCE(task_thread_info(p)->cpu, cpu); -+#endif -+#endif -+} -+ -+#ifdef CONFIG_SMP -+void set_task_cpu(struct task_struct *p, unsigned int new_cpu) -+{ -+#ifdef CONFIG_SCHED_DEBUG -+ /* -+ * We should never call set_task_cpu() on a blocked task, -+ * ttwu() will sort out the placement. -+ */ -+ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && -+ !p->on_rq); -+#ifdef CONFIG_LOCKDEP -+ /* -+ * The caller should hold either p->pi_lock or rq->lock, when changing -+ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. -+ * -+ * sched_move_task() holds both and thus holding either pins the cgroup, -+ * see task_group(). -+ */ -+ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || -+ lockdep_is_held(&task_rq(p)->lock))); -+#endif -+ /* -+ * Clearly, migrating tasks to offline CPUs is a fairly daft thing. 
-+ */ -+ WARN_ON_ONCE(!cpu_online(new_cpu)); -+#endif -+ if (task_cpu(p) == new_cpu) -+ return; -+ trace_sched_migrate_task(p, new_cpu); -+ rseq_migrate(p); -+ perf_event_task_migrate(p); -+ -+ __set_task_cpu(p, new_cpu); -+} -+ -+static inline bool is_per_cpu_kthread(struct task_struct *p) -+{ -+ return ((p->flags & PF_KTHREAD) && (1 == p->nr_cpus_allowed)); -+} -+ -+/* -+ * Per-CPU kthreads are allowed to run on !active && online CPUs, see -+ * __set_cpus_allowed_ptr() and select_fallback_rq(). -+ */ -+static inline bool is_cpu_allowed(struct task_struct *p, int cpu) -+{ -+ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) -+ return false; -+ -+ if (is_per_cpu_kthread(p)) -+ return cpu_online(cpu); -+ -+ return cpu_active(cpu); -+} -+ -+/* -+ * This is how migration works: -+ * -+ * 1) we invoke migration_cpu_stop() on the target CPU using -+ * stop_one_cpu(). -+ * 2) stopper starts to run (implicitly forcing the migrated thread -+ * off the CPU) -+ * 3) it checks whether the migrated task is still in the wrong runqueue. -+ * 4) if it's in the wrong runqueue then the migration thread removes -+ * it and puts it into the right queue. -+ * 5) stopper completes and stop_one_cpu() returns and the migration -+ * is done. -+ */ -+ -+/* -+ * move_queued_task - move a queued task to new rq. -+ * -+ * Returns (locked) new rq. Old rq's lock is released. -+ */ -+static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int -+ new_cpu) -+{ -+ lockdep_assert_held(&rq->lock); -+ -+ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); -+ dequeue_task(p, rq, 0); -+ set_task_cpu(p, new_cpu); -+ raw_spin_unlock(&rq->lock); -+ -+ rq = cpu_rq(new_cpu); -+ -+ raw_spin_lock(&rq->lock); -+ BUG_ON(task_cpu(p) != new_cpu); -+ enqueue_task(p, rq, 0); -+ p->on_rq = TASK_ON_RQ_QUEUED; -+ check_preempt_curr(rq); -+ -+ return rq; -+} -+ -+struct migration_arg { -+ struct task_struct *task; -+ int dest_cpu; -+}; -+ -+/* -+ * Move (not current) task off this CPU, onto the destination CPU. 
We're doing -+ * this because either it can't run here any more (set_cpus_allowed() -+ * away from this CPU, or CPU going down), or because we're -+ * attempting to rebalance this task on exec (sched_exec). -+ * -+ * So we race with normal scheduler movements, but that's OK, as long -+ * as the task is no longer on this CPU. -+ */ -+static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int -+ dest_cpu) -+{ -+ /* Affinity changed (again). */ -+ if (!is_cpu_allowed(p, dest_cpu)) -+ return rq; -+ -+ update_rq_clock(rq); -+ return move_queued_task(rq, p, dest_cpu); -+} -+ -+/* -+ * migration_cpu_stop - this will be executed by a highprio stopper thread -+ * and performs thread migration by bumping thread off CPU then -+ * 'pushing' onto another runqueue. -+ */ -+static int migration_cpu_stop(void *data) -+{ -+ struct migration_arg *arg = data; -+ struct task_struct *p = arg->task; -+ struct rq *rq = this_rq(); -+ -+ /* -+ * The original target CPU might have gone down and we might -+ * be on another CPU but it doesn't matter. -+ */ -+ local_irq_disable(); -+ /* -+ * We need to explicitly wake pending tasks before running -+ * __migrate_task() such that we will not miss enforcing cpus_ptr -+ * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. -+ */ -+ flush_smp_call_function_from_idle(); -+ -+ raw_spin_lock(&p->pi_lock); -+ raw_spin_lock(&rq->lock); -+ /* -+ * If task_rq(p) != rq, it cannot be migrated here, because we're -+ * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because -+ * we're holding p->pi_lock. 
-+ */ -+ if (task_rq(p) == rq && task_on_rq_queued(p)) -+ rq = __migrate_task(rq, p, arg->dest_cpu); -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock(&p->pi_lock); -+ -+ local_irq_enable(); -+ return 0; -+} -+ -+static inline void -+set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ cpumask_copy(&p->cpus_mask, new_mask); -+ p->nr_cpus_allowed = cpumask_weight(new_mask); -+} -+ -+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ set_cpus_allowed_common(p, new_mask); -+} -+#endif -+ -+/** -+ * task_curr - is this task currently executing on a CPU? -+ * @p: the task in question. -+ * -+ * Return: 1 if the task is currently executing. 0 otherwise. -+ */ -+inline int task_curr(const struct task_struct *p) -+{ -+ return cpu_curr(task_cpu(p)) == p; -+} -+ -+#ifdef CONFIG_SMP -+/* -+ * wait_task_inactive - wait for a thread to unschedule. -+ * -+ * If @match_state is nonzero, it's the @p->state value just checked and -+ * not expected to change. If it changes, i.e. @p might have woken up, -+ * then return zero. When we succeed in waiting for @p to be off its CPU, -+ * we return a positive number (its total switch count). If a second call -+ * a short while later returns the same number, the caller can be sure that -+ * @p has remained unscheduled the whole time. -+ * -+ * The caller must ensure that the task *will* unschedule sometime soon, -+ * else this function might spin for a *long* time. This function can't -+ * be called with interrupts off, or it may introduce deadlock with -+ * smp_call_function() if an IPI is sent by the same process we are -+ * waiting to become inactive. 
-+ */ -+unsigned long wait_task_inactive(struct task_struct *p, long match_state) -+{ -+ unsigned long flags; -+ bool running, on_rq; -+ unsigned long ncsw; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ -+ for (;;) { -+ rq = task_rq(p); -+ -+ /* -+ * If the task is actively running on another CPU -+ * still, just relax and busy-wait without holding -+ * any locks. -+ * -+ * NOTE! Since we don't hold any locks, it's not -+ * even sure that "rq" stays as the right runqueue! -+ * But we don't care, since this will return false -+ * if the runqueue has changed and p is actually now -+ * running somewhere else! -+ */ -+ while (task_running(p) && p == rq->curr) { -+ if (match_state && unlikely(p->state != match_state)) -+ return 0; -+ cpu_relax(); -+ } -+ -+ /* -+ * Ok, time to look more closely! We need the rq -+ * lock now, to be *sure*. If we're wrong, we'll -+ * just go back and repeat. -+ */ -+ task_access_lock_irqsave(p, &lock, &flags); -+ trace_sched_wait_task(p); -+ running = task_running(p); -+ on_rq = p->on_rq; -+ ncsw = 0; -+ if (!match_state || p->state == match_state) -+ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ -+ task_access_unlock_irqrestore(p, lock, &flags); -+ -+ /* -+ * If it changed from the expected state, bail out now. -+ */ -+ if (unlikely(!ncsw)) -+ break; -+ -+ /* -+ * Was it really running after all now that we -+ * checked with the proper locks actually held? -+ * -+ * Oops. Go back and try again.. -+ */ -+ if (unlikely(running)) { -+ cpu_relax(); -+ continue; -+ } -+ -+ /* -+ * It's not enough that it's not actively running, -+ * it must be off the runqueue _entirely_, and not -+ * preempted! -+ * -+ * So if it was still runnable (but just not actively -+ * running right now), it's preempted, and we should -+ * yield - it could be a while. -+ */ -+ if (unlikely(on_rq)) { -+ ktime_t to = NSEC_PER_SEC / HZ; -+ -+ set_current_state(TASK_UNINTERRUPTIBLE); -+ schedule_hrtimeout(&to, HRTIMER_MODE_REL); -+ continue; -+ } -+ -+ /* -+ * Ahh, all good. 
It wasn't running, and it wasn't -+ * runnable, which means that it will never become -+ * running in the future either. We're all done! -+ */ -+ break; -+ } -+ -+ return ncsw; -+} -+ -+/*** -+ * kick_process - kick a running thread to enter/exit the kernel -+ * @p: the to-be-kicked thread -+ * -+ * Cause a process which is running on another CPU to enter -+ * kernel-mode, without any delay. (to get signals handled.) -+ * -+ * NOTE: this function doesn't have to take the runqueue lock, -+ * because all it wants to ensure is that the remote task enters -+ * the kernel. If the IPI races and the task has been migrated -+ * to another CPU then no harm is done and the purpose has been -+ * achieved as well. -+ */ -+void kick_process(struct task_struct *p) -+{ -+ int cpu; -+ -+ preempt_disable(); -+ cpu = task_cpu(p); -+ if ((cpu != smp_processor_id()) && task_curr(p)) -+ smp_send_reschedule(cpu); -+ preempt_enable(); -+} -+EXPORT_SYMBOL_GPL(kick_process); -+ -+/* -+ * ->cpus_ptr is protected by both rq->lock and p->pi_lock -+ * -+ * A few notes on cpu_active vs cpu_online: -+ * -+ * - cpu_active must be a subset of cpu_online -+ * -+ * - on CPU-up we allow per-CPU kthreads on the online && !active CPU, -+ * see __set_cpus_allowed_ptr(). At this point the newly online -+ * CPU isn't yet part of the sched domains, and balancing will not -+ * see it. -+ * -+ * - on cpu-down we clear cpu_active() to mask the sched domains and -+ * avoid the load balancer to place new tasks on the to be removed -+ * CPU. Existing tasks will remain running there and will be taken -+ * off. -+ * -+ * This means that fallback selection must not select !active CPUs. -+ * And can assume that any active CPU must be online. Conversely -+ * select_task_rq() below may allow selection of !active CPUs in order -+ * to satisfy the above rules. 
-+ */ -+static int select_fallback_rq(int cpu, struct task_struct *p) -+{ -+ int nid = cpu_to_node(cpu); -+ const struct cpumask *nodemask = NULL; -+ enum { cpuset, possible, fail } state = cpuset; -+ int dest_cpu; -+ -+ /* -+ * If the node that the CPU is on has been offlined, cpu_to_node() -+ * will return -1. There is no CPU on the node, and we should -+ * select the CPU on the other node. -+ */ -+ if (nid != -1) { -+ nodemask = cpumask_of_node(nid); -+ -+ /* Look for allowed, online CPU in same node. */ -+ for_each_cpu(dest_cpu, nodemask) { -+ if (!cpu_active(dest_cpu)) -+ continue; -+ if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) -+ return dest_cpu; -+ } -+ } -+ -+ for (;;) { -+ /* Any allowed, online CPU? */ -+ for_each_cpu(dest_cpu, p->cpus_ptr) { -+ if (!is_cpu_allowed(p, dest_cpu)) -+ continue; -+ goto out; -+ } -+ -+ /* No more Mr. Nice Guy. */ -+ switch (state) { -+ case cpuset: -+ if (IS_ENABLED(CONFIG_CPUSETS)) { -+ cpuset_cpus_allowed_fallback(p); -+ state = possible; -+ break; -+ } -+ /* Fall-through */ -+ case possible: -+ do_set_cpus_allowed(p, cpu_possible_mask); -+ state = fail; -+ break; -+ -+ case fail: -+ BUG(); -+ break; -+ } -+ } -+ -+out: -+ if (state != cpuset) { -+ /* -+ * Don't tell them about moving exiting tasks or -+ * kernel threads (both mm NULL), since they never -+ * leave kernel. 
-+ */ -+ if (p->mm && printk_ratelimit()) { -+ printk_deferred("process %d (%s) no longer affine to cpu%d\n", -+ task_pid_nr(p), p->comm, cpu); -+ } -+ } -+ -+ return dest_cpu; -+} -+ -+static inline int select_task_rq(struct task_struct *p, struct rq *rq) -+{ -+ cpumask_t chk_mask, tmp; -+ -+ if (unlikely(!cpumask_and(&chk_mask, p->cpus_ptr, cpu_online_mask))) -+ return select_fallback_rq(task_cpu(p), p); -+ -+ if ( -+#ifdef CONFIG_SCHED_SMT -+ cpumask_and(&tmp, &chk_mask, &sched_sg_idle_mask) || -+#endif -+ cpumask_and(&tmp, &chk_mask, &sched_rq_watermark[IDLE_WM]) || -+ cpumask_and(&tmp, &chk_mask, -+ &sched_rq_watermark[task_sched_prio(p, rq) + 1])) -+ return best_mask_cpu(task_cpu(p), &tmp); -+ -+ return best_mask_cpu(task_cpu(p), &chk_mask); -+} -+ -+void sched_set_stop_task(int cpu, struct task_struct *stop) -+{ -+ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; -+ struct sched_param start_param = { .sched_priority = 0 }; -+ struct task_struct *old_stop = cpu_rq(cpu)->stop; -+ -+ if (stop) { -+ /* -+ * Make it appear like a SCHED_FIFO task, its something -+ * userspace knows about and won't get confused about. -+ * -+ * Also, it will make PI more or less work without too -+ * much confusion -- but then, stop work should not -+ * rely on PI working anyway. -+ */ -+ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); -+ } -+ -+ cpu_rq(cpu)->stop = stop; -+ -+ if (old_stop) { -+ /* -+ * Reset it back to a normal scheduling policy so that -+ * it can die in pieces. -+ */ -+ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); -+ } -+} -+ -+/* -+ * Change a given task's CPU affinity. Migrate the thread to a -+ * proper CPU and schedule it away if the CPU it's executing on -+ * is removed from the allowed bitmask. -+ * -+ * NOTE: the caller must have a valid reference to the task, the -+ * task must not exit() & deallocate itself prematurely. The -+ * call is not atomic; no spinlocks may be held. 
-+ */ -+static int __set_cpus_allowed_ptr(struct task_struct *p, -+ const struct cpumask *new_mask, bool check) -+{ -+ const struct cpumask *cpu_valid_mask = cpu_active_mask; -+ int dest_cpu; -+ unsigned long flags; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ int ret = 0; -+ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ rq = __task_access_lock(p, &lock); -+ -+ if (p->flags & PF_KTHREAD) { -+ /* -+ * Kernel threads are allowed on online && !active CPUs -+ */ -+ cpu_valid_mask = cpu_online_mask; -+ } -+ -+ /* -+ * Must re-check here, to close a race against __kthread_bind(), -+ * sched_setaffinity() is not guaranteed to observe the flag. -+ */ -+ if (check && (p->flags & PF_NO_SETAFFINITY)) { -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ if (cpumask_equal(&p->cpus_mask, new_mask)) -+ goto out; -+ -+ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); -+ if (dest_cpu >= nr_cpu_ids) { -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ do_set_cpus_allowed(p, new_mask); -+ -+ if (p->flags & PF_KTHREAD) { -+ /* -+ * For kernel threads that do indeed end up on online && -+ * !active we want to ensure they are strict per-CPU threads. -+ */ -+ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && -+ !cpumask_intersects(new_mask, cpu_active_mask) && -+ p->nr_cpus_allowed != 1); -+ } -+ -+ /* Can the task run on the task's current CPU? If so, we're done */ -+ if (cpumask_test_cpu(task_cpu(p), new_mask)) -+ goto out; -+ -+ if (task_running(p) || p->state == TASK_WAKING) { -+ struct migration_arg arg = { p, dest_cpu }; -+ -+ /* Need help from migration thread: drop lock and wait. */ -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); -+ return 0; -+ } -+ if (task_on_rq_queued(p)) { -+ /* -+ * OK, since we're going to drop the lock immediately -+ * afterwards anyway. 
-+ */ -+ update_rq_clock(rq); -+ rq = move_queued_task(rq, p, dest_cpu); -+ lock = &rq->lock; -+ } -+ -+out: -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+ return ret; -+} -+ -+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ return __set_cpus_allowed_ptr(p, new_mask, false); -+} -+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); -+ -+#else /* CONFIG_SMP */ -+ -+static inline int select_task_rq(struct task_struct *p, struct rq *rq) -+{ -+ return 0; -+} -+ -+static inline int -+__set_cpus_allowed_ptr(struct task_struct *p, -+ const struct cpumask *new_mask, bool check) -+{ -+ return set_cpus_allowed_ptr(p, new_mask); -+} -+ -+#endif /* CONFIG_SMP */ -+ -+static void -+ttwu_stat(struct task_struct *p, int cpu, int wake_flags) -+{ -+ struct rq *rq; -+ -+ if (!schedstat_enabled()) -+ return; -+ -+ rq= this_rq(); -+ -+#ifdef CONFIG_SMP -+ if (cpu == rq->cpu) -+ __schedstat_inc(rq->ttwu_local); -+ else { -+ /** Alt schedule FW ToDo: -+ * How to do ttwu_wake_remote -+ */ -+ } -+#endif /* CONFIG_SMP */ -+ -+ __schedstat_inc(rq->ttwu_count); -+} -+ -+/* -+ * Mark the task runnable and perform wakeup-preemption. 
-+ */ -+static inline void -+ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) -+{ -+ check_preempt_curr(rq); -+ p->state = TASK_RUNNING; -+ trace_sched_wakeup(p); -+} -+ -+static inline void -+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) -+{ -+ if (p->sched_contributes_to_load) -+ rq->nr_uninterruptible--; -+ -+ activate_task(p, rq); -+ ttwu_do_wakeup(rq, p, 0); -+} -+ -+static int ttwu_remote(struct task_struct *p, int wake_flags) -+{ -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ int ret = 0; -+ -+ rq = __task_access_lock(p, &lock); -+ if (task_on_rq_queued(p)) { -+ /* check_preempt_curr() may use rq clock */ -+ update_rq_clock(rq); -+ ttwu_do_wakeup(rq, p, wake_flags); -+ ret = 1; -+ } -+ __task_access_unlock(p, lock); -+ -+ return ret; -+} -+ -+#ifdef CONFIG_SMP -+void sched_ttwu_pending(void *arg) -+{ -+ struct llist_node *llist = arg; -+ struct rq *rq = this_rq(); -+ struct task_struct *p, *t; -+ struct rq_flags rf; -+ -+ if (!llist) -+ return; -+ -+ /* -+ * rq::ttwu_pending racy indication of out-standing wakeups. -+ * Races such that false-negatives are possible, since they -+ * are shorter lived that false-positives would be. -+ */ -+ WRITE_ONCE(rq->ttwu_pending, 0); -+ -+ rq_lock_irqsave(rq, &rf); -+ update_rq_clock(rq); -+ -+ llist_for_each_entry_safe(p, t, llist, wake_entry.llist) { -+ if (WARN_ON_ONCE(p->on_cpu)) -+ smp_cond_load_acquire(&p->on_cpu, !VAL); -+ -+ if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq))) -+ set_task_cpu(p, cpu_of(rq)); -+ -+ ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0); -+ } -+ -+ rq_unlock_irqrestore(rq, &rf); -+} -+ -+void send_call_function_single_ipi(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ if (!set_nr_if_polling(rq->idle)) -+ arch_send_call_function_single_ipi(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); -+} -+ -+/* -+ * Queue a task on the target CPUs wake_list and wake the CPU via IPI if -+ * necessary. 
The wakee CPU on receipt of the IPI will queue the task -+ * via sched_ttwu_wakeup() for activation so the wakee incurs the cost -+ * of the wakeup instead of the waker. -+ */ -+static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); -+ -+ WRITE_ONCE(rq->ttwu_pending, 1); -+ __smp_call_single_queue(cpu, &p->wake_entry.llist); -+} -+ -+static inline bool ttwu_queue_cond(int cpu, int wake_flags) -+{ -+ /* -+ * If the CPU does not share cache, then queue the task on the -+ * remote rqs wakelist to avoid accessing remote data. -+ */ -+ if (!cpus_share_cache(smp_processor_id(), cpu)) -+ return true; -+ -+ /* -+ * If the task is descheduling and the only running task on the -+ * CPU then use the wakelist to offload the task activation to -+ * the soon-to-be-idle CPU as the current CPU is likely busy. -+ * nr_running is checked to avoid unnecessary task stacking. -+ */ -+ if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1) -+ return true; -+ -+ return false; -+} -+ -+static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) -+{ -+ if (ttwu_queue_cond(cpu, wake_flags)) { -+ if (WARN_ON_ONCE(cpu == smp_processor_id())) -+ return false; -+ -+ sched_clock_cpu(cpu); /* Sync clocks across CPUs */ -+ __ttwu_queue_wakelist(p, cpu, wake_flags); -+ return true; -+ } -+ -+ return false; -+} -+ -+void wake_up_if_idle(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ rcu_read_lock(); -+ -+ if (!is_idle_task(rcu_dereference(rq->curr))) -+ goto out; -+ -+ if (set_nr_if_polling(rq->idle)) { -+ trace_sched_wake_idle_without_ipi(cpu); -+ } else { -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ if (is_idle_task(rq->curr)) -+ smp_send_reschedule(cpu); -+ /* Else CPU is not idle, do nothing here */ -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ } -+ -+out: -+ rcu_read_unlock(); -+} -+ -+bool cpus_share_cache(int 
this_cpu, int that_cpu) -+{ -+ return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); -+} -+#endif /* CONFIG_SMP */ -+ -+static inline void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+#if defined(CONFIG_SMP) -+ if (ttwu_queue_wakelist(p, cpu, wake_flags)) -+ return; -+#endif -+ -+ raw_spin_lock(&rq->lock); -+ update_rq_clock(rq); -+ ttwu_do_activate(rq, p, wake_flags); -+ raw_spin_unlock(&rq->lock); -+} -+ -+/* -+ * Notes on Program-Order guarantees on SMP systems. -+ * -+ * MIGRATION -+ * -+ * The basic program-order guarantee on SMP systems is that when a task [t] -+ * migrates, all its activity on its old CPU [c0] happens-before any subsequent -+ * execution on its new CPU [c1]. -+ * -+ * For migration (of runnable tasks) this is provided by the following means: -+ * -+ * A) UNLOCK of the rq(c0)->lock scheduling out task t -+ * B) migration for t is required to synchronize *both* rq(c0)->lock and -+ * rq(c1)->lock (if not at the same time, then in that order). -+ * C) LOCK of the rq(c1)->lock scheduling in task -+ * -+ * Transitivity guarantees that B happens after A and C after B. -+ * Note: we only require RCpc transitivity. -+ * Note: the CPU doing B need not be c0 or c1 -+ * -+ * Example: -+ * -+ * CPU0 CPU1 CPU2 -+ * -+ * LOCK rq(0)->lock -+ * sched-out X -+ * sched-in Y -+ * UNLOCK rq(0)->lock -+ * -+ * LOCK rq(0)->lock // orders against CPU0 -+ * dequeue X -+ * UNLOCK rq(0)->lock -+ * -+ * LOCK rq(1)->lock -+ * enqueue X -+ * UNLOCK rq(1)->lock -+ * -+ * LOCK rq(1)->lock // orders against CPU2 -+ * sched-out Z -+ * sched-in X -+ * UNLOCK rq(1)->lock -+ * -+ * -+ * BLOCKING -- aka. SLEEP + WAKEUP -+ * -+ * For blocking we (obviously) need to provide the same guarantee as for -+ * migration. However the means are completely different as there is no lock -+ * chain to provide order. 
Instead we do: -+ * -+ * 1) smp_store_release(X->on_cpu, 0) -+ * 2) smp_cond_load_acquire(!X->on_cpu) -+ * -+ * Example: -+ * -+ * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule) -+ * -+ * LOCK rq(0)->lock LOCK X->pi_lock -+ * dequeue X -+ * sched-out X -+ * smp_store_release(X->on_cpu, 0); -+ * -+ * smp_cond_load_acquire(&X->on_cpu, !VAL); -+ * X->state = WAKING -+ * set_task_cpu(X,2) -+ * -+ * LOCK rq(2)->lock -+ * enqueue X -+ * X->state = RUNNING -+ * UNLOCK rq(2)->lock -+ * -+ * LOCK rq(2)->lock // orders against CPU1 -+ * sched-out Z -+ * sched-in X -+ * UNLOCK rq(2)->lock -+ * -+ * UNLOCK X->pi_lock -+ * UNLOCK rq(0)->lock -+ * -+ * -+ * However; for wakeups there is a second guarantee we must provide, namely we -+ * must observe the state that lead to our wakeup. That is, not only must our -+ * task observe its own prior state, it must also observe the stores prior to -+ * its wakeup. -+ * -+ * This means that any means of doing remote wakeups must order the CPU doing -+ * the wakeup against the CPU the task is going to end up running on. This, -+ * however, is already required for the regular Program-Order guarantee above, -+ * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire). -+ * -+ */ -+ -+/*** -+ * try_to_wake_up - wake up a thread -+ * @p: the thread to be awakened -+ * @state: the mask of task states that can be woken -+ * @wake_flags: wake modifier flags (WF_*) -+ * -+ * Put it on the run-queue if it's not already there. The "current" -+ * thread is always on the run-queue (except when the actual -+ * re-schedule is in progress), and as such you're allowed to do -+ * the simpler "current->state = TASK_RUNNING" to mark yourself -+ * runnable without the overhead of this. -+ * -+ * Return: %true if @p was woken up, %false if it was already running. -+ * or @state didn't match @p's state. 
-+ */ -+static int try_to_wake_up(struct task_struct *p, unsigned int state, -+ int wake_flags) -+{ -+ unsigned long flags; -+ int cpu, success = 0; -+ -+ preempt_disable(); -+ if (p == current) { -+ /* -+ * We're waking current, this means 'p->on_rq' and 'task_cpu(p) -+ * == smp_processor_id()'. Together this means we can special -+ * case the whole 'p->on_rq && ttwu_remote()' case below -+ * without taking any locks. -+ * -+ * In particular: -+ * - we rely on Program-Order guarantees for all the ordering, -+ * - we're serialized against set_special_state() by virtue of -+ * it disabling IRQs (this allows not taking ->pi_lock). -+ */ -+ if (!(p->state & state)) -+ goto out; -+ -+ success = 1; -+ trace_sched_waking(p); -+ p->state = TASK_RUNNING; -+ trace_sched_wakeup(p); -+ goto out; -+ } -+ -+ /* -+ * If we are going to wake up a thread waiting for CONDITION we -+ * need to ensure that CONDITION=1 done by the caller can not be -+ * reordered with p->state check below. This pairs with mb() in -+ * set_current_state() the waiting thread does. -+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ smp_mb__after_spinlock(); -+ if (!(p->state & state)) -+ goto unlock; -+ -+ trace_sched_waking(p); -+ -+ /* We're going to change ->state: */ -+ success = 1; -+ -+ /* -+ * Ensure we load p->on_rq _after_ p->state, otherwise it would -+ * be possible to, falsely, observe p->on_rq == 0 and get stuck -+ * in smp_cond_load_acquire() below. -+ * -+ * sched_ttwu_pending() try_to_wake_up() -+ * STORE p->on_rq = 1 LOAD p->state -+ * UNLOCK rq->lock -+ * -+ * __schedule() (switch to task 'p') -+ * LOCK rq->lock smp_rmb(); -+ * smp_mb__after_spinlock(); -+ * UNLOCK rq->lock -+ * -+ * [task p] -+ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq -+ * -+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in -+ * __schedule(). See the comment for smp_mb__after_spinlock(). -+ * -+ * A similar smb_rmb() lives in try_invoke_on_locked_down_task(). 
-+ */ -+ smp_rmb(); -+ if (READ_ONCE(p->on_rq) && ttwu_remote(p, wake_flags)) -+ goto unlock; -+ -+ if (p->in_iowait) { -+ delayacct_blkio_end(p); -+ atomic_dec(&task_rq(p)->nr_iowait); -+ } -+ -+#ifdef CONFIG_SMP -+ /* -+ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be -+ * possible to, falsely, observe p->on_cpu == 0. -+ * -+ * One must be running (->on_cpu == 1) in order to remove oneself -+ * from the runqueue. -+ * -+ * __schedule() (switch to task 'p') try_to_wake_up() -+ * STORE p->on_cpu = 1 LOAD p->on_rq -+ * UNLOCK rq->lock -+ * -+ * __schedule() (put 'p' to sleep) -+ * LOCK rq->lock smp_rmb(); -+ * smp_mb__after_spinlock(); -+ * STORE p->on_rq = 0 LOAD p->on_cpu -+ * -+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in -+ * __schedule(). See the comment for smp_mb__after_spinlock(). -+ * -+ * Form a control-dep-acquire with p->on_rq == 0 above, to ensure -+ * schedule()'s deactivate_task() has 'happened' and p will no longer -+ * care about it's own p->state. See the comment in __schedule(). -+ */ -+ smp_acquire__after_ctrl_dep(); -+ -+ /* -+ * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq -+ * == 0), which means we need to do an enqueue, change p->state to -+ * TASK_WAKING such that we can unlock p->pi_lock before doing the -+ * enqueue, such as ttwu_queue_wakelist(). -+ */ -+ p->state = TASK_WAKING; -+ -+ /* -+ * If the owning (remote) CPU is still in the middle of schedule() with -+ * this task as prev, considering queueing p on the remote CPUs wake_list -+ * which potentially sends an IPI instead of spinning on p->on_cpu to -+ * let the waker make forward progress. This is safe because IRQs are -+ * disabled and the IPI will deliver after on_cpu is cleared. 
-+ * -+ * Ensure we load task_cpu(p) after p->on_cpu: -+ * -+ * set_task_cpu(p, cpu); -+ * STORE p->cpu = @cpu -+ * __schedule() (switch to task 'p') -+ * LOCK rq->lock -+ * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu) -+ * STORE p->on_cpu = 1 LOAD p->cpu -+ * -+ * to ensure we observe the correct CPU on which the task is currently -+ * scheduling. -+ */ -+ if (smp_load_acquire(&p->on_cpu) && -+ ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU)) -+ goto unlock; -+ -+ /* -+ * If the owning (remote) CPU is still in the middle of schedule() with -+ * this task as prev, wait until its done referencing the task. -+ * -+ * Pairs with the smp_store_release() in finish_task(). -+ * -+ * This ensures that tasks getting woken will be fully ordered against -+ * their previous state and preserve Program Order. -+ */ -+ smp_cond_load_acquire(&p->on_cpu, !VAL); -+ -+ sched_task_ttwu(p); -+ -+ cpu = select_task_rq(p, this_rq()); -+ -+ if (cpu != task_cpu(p)) { -+ wake_flags |= WF_MIGRATED; -+ psi_ttwu_dequeue(p); -+ set_task_cpu(p, cpu); -+ } -+#else -+ cpu = task_cpu(p); -+#endif /* CONFIG_SMP */ -+ -+ ttwu_queue(p, cpu, wake_flags); -+unlock: -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+out: -+ if (success) -+ ttwu_stat(p, task_cpu(p), wake_flags); -+ preempt_enable(); -+ -+ return success; -+} -+ -+/** -+ * try_invoke_on_locked_down_task - Invoke a function on task in fixed state -+ * @p: Process for which the function is to be invoked. -+ * @func: Function to invoke. -+ * @arg: Argument to function. -+ * -+ * If the specified task can be quickly locked into a definite state -+ * (either sleeping or on a given runqueue), arrange to keep it in that -+ * state while invoking @func(@arg). This function can use ->on_rq and -+ * task_curr() to work out what the state is, if required. Given that -+ * @func can be invoked with a runqueue lock held, it had better be quite -+ * lightweight. 
-+ * -+ * Returns: -+ * @false if the task slipped out from under the locks. -+ * @true if the task was locked onto a runqueue or is sleeping. -+ * However, @func can override this by returning @false. -+ */ -+bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg) -+{ -+ bool ret = false; -+ struct rq_flags rf; -+ struct rq *rq; -+ -+ lockdep_assert_irqs_enabled(); -+ raw_spin_lock_irq(&p->pi_lock); -+ if (p->on_rq) { -+ rq = __task_rq_lock(p, &rf); -+ if (task_rq(p) == rq) -+ ret = func(p, arg); -+ __task_rq_unlock(rq, &rf); -+ } else { -+ switch (p->state) { -+ case TASK_RUNNING: -+ case TASK_WAKING: -+ break; -+ default: -+ smp_rmb(); // See smp_rmb() comment in try_to_wake_up(). -+ if (!p->on_rq) -+ ret = func(p, arg); -+ } -+ } -+ raw_spin_unlock_irq(&p->pi_lock); -+ return ret; -+} -+ -+/** -+ * wake_up_process - Wake up a specific process -+ * @p: The process to be woken up. -+ * -+ * Attempt to wake up the nominated process and move it to the set of runnable -+ * processes. -+ * -+ * Return: 1 if the process was woken up, 0 if it was already running. -+ * -+ * This function executes a full memory barrier before accessing the task state. -+ */ -+int wake_up_process(struct task_struct *p) -+{ -+ return try_to_wake_up(p, TASK_NORMAL, 0); -+} -+EXPORT_SYMBOL(wake_up_process); -+ -+int wake_up_state(struct task_struct *p, unsigned int state) -+{ -+ return try_to_wake_up(p, state, 0); -+} -+ -+/* -+ * Perform scheduler related setup for a newly forked process p. -+ * p is forked by current. 
-+ * -+ * __sched_fork() is basic setup used by init_idle() too: -+ */ -+static inline void __sched_fork(unsigned long clone_flags, struct task_struct *p) -+{ -+ p->on_rq = 0; -+ p->on_cpu = 0; -+ p->utime = 0; -+ p->stime = 0; -+ p->sched_time = 0; -+ -+#ifdef CONFIG_PREEMPT_NOTIFIERS -+ INIT_HLIST_HEAD(&p->preempt_notifiers); -+#endif -+ -+#ifdef CONFIG_COMPACTION -+ p->capture_control = NULL; -+#endif -+#ifdef CONFIG_SMP -+ p->wake_entry.u_flags = CSD_TYPE_TTWU; -+#endif -+} -+ -+/* -+ * fork()/clone()-time setup: -+ */ -+int sched_fork(unsigned long clone_flags, struct task_struct *p) -+{ -+ unsigned long flags; -+ struct rq *rq; -+ -+ __sched_fork(clone_flags, p); -+ /* -+ * We mark the process as NEW here. This guarantees that -+ * nobody will actually run it, and a signal or other external -+ * event cannot wake it up and insert it on the runqueue either. -+ */ -+ p->state = TASK_NEW; -+ -+ /* -+ * Make sure we do not leak PI boosting priority to the child. -+ */ -+ p->prio = current->normal_prio; -+ -+ /* -+ * Revert to default priority/policy on fork if requested. -+ */ -+ if (unlikely(p->sched_reset_on_fork)) { -+ if (task_has_rt_policy(p)) { -+ p->policy = SCHED_NORMAL; -+ p->static_prio = NICE_TO_PRIO(0); -+ p->rt_priority = 0; -+ } else if (PRIO_TO_NICE(p->static_prio) < 0) -+ p->static_prio = NICE_TO_PRIO(0); -+ -+ p->prio = p->normal_prio = normal_prio(p); -+ -+ /* -+ * We don't need the reset flag anymore after the fork. It has -+ * fulfilled its duty: -+ */ -+ p->sched_reset_on_fork = 0; -+ } -+ -+ /* -+ * The child is not yet in the pid-hash so no cgroup attach races, -+ * and the cgroup is pinned to this child due to cgroup_fork() -+ * is ran before sched_fork(). -+ * -+ * Silence PROVE_RCU. -+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ /* -+ * Share the timeslice between parent and child, thus the -+ * total amount of pending timeslices in the system doesn't change, -+ * resulting in more scheduling fairness. 
-+ */ -+ rq = this_rq(); -+ raw_spin_lock(&rq->lock); -+ -+ rq->curr->time_slice /= 2; -+ p->time_slice = rq->curr->time_slice; -+#ifdef CONFIG_SCHED_HRTICK -+ hrtick_start(rq, rq->curr->time_slice); -+#endif -+ -+ if (p->time_slice < RESCHED_NS) { -+ p->time_slice = sched_timeslice_ns; -+ resched_curr(rq); -+ } -+ sched_task_fork(p, rq); -+ raw_spin_unlock(&rq->lock); -+ -+ rseq_migrate(p); -+ /* -+ * We're setting the CPU for the first time, we don't migrate, -+ * so use __set_task_cpu(). -+ */ -+ __set_task_cpu(p, cpu_of(rq)); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+#ifdef CONFIG_SCHED_INFO -+ if (unlikely(sched_info_on())) -+ memset(&p->sched_info, 0, sizeof(p->sched_info)); -+#endif -+ init_task_preempt_count(p); -+ -+ return 0; -+} -+ -+#ifdef CONFIG_SCHEDSTATS -+ -+DEFINE_STATIC_KEY_FALSE(sched_schedstats); -+static bool __initdata __sched_schedstats = false; -+ -+static void set_schedstats(bool enabled) -+{ -+ if (enabled) -+ static_branch_enable(&sched_schedstats); -+ else -+ static_branch_disable(&sched_schedstats); -+} -+ -+void force_schedstat_enabled(void) -+{ -+ if (!schedstat_enabled()) { -+ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); -+ static_branch_enable(&sched_schedstats); -+ } -+} -+ -+static int __init setup_schedstats(char *str) -+{ -+ int ret = 0; -+ if (!str) -+ goto out; -+ -+ /* -+ * This code is called before jump labels have been set up, so we can't -+ * change the static branch directly just yet. Instead set a temporary -+ * variable so init_schedstats() can do it later. 
-+ */ -+ if (!strcmp(str, "enable")) { -+ __sched_schedstats = true; -+ ret = 1; -+ } else if (!strcmp(str, "disable")) { -+ __sched_schedstats = false; -+ ret = 1; -+ } -+out: -+ if (!ret) -+ pr_warn("Unable to parse schedstats=\n"); -+ -+ return ret; -+} -+__setup("schedstats=", setup_schedstats); -+ -+static void __init init_schedstats(void) -+{ -+ set_schedstats(__sched_schedstats); -+} -+ -+#ifdef CONFIG_PROC_SYSCTL -+int sysctl_schedstats(struct ctl_table *table, int write, -+ void __user *buffer, size_t *lenp, loff_t *ppos) -+{ -+ struct ctl_table t; -+ int err; -+ int state = static_branch_likely(&sched_schedstats); -+ -+ if (write && !capable(CAP_SYS_ADMIN)) -+ return -EPERM; -+ -+ t = *table; -+ t.data = &state; -+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); -+ if (err < 0) -+ return err; -+ if (write) -+ set_schedstats(state); -+ return err; -+} -+#endif /* CONFIG_PROC_SYSCTL */ -+#else /* !CONFIG_SCHEDSTATS */ -+static inline void init_schedstats(void) {} -+#endif /* CONFIG_SCHEDSTATS */ -+ -+/* -+ * wake_up_new_task - wake up a newly created task for the first time. -+ * -+ * This function will do some initial scheduler statistics housekeeping -+ * that must be done for every newly created context, then puts the task -+ * on the runqueue and wakes it. -+ */ -+void wake_up_new_task(struct task_struct *p) -+{ -+ unsigned long flags; -+ struct rq *rq; -+ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ -+ p->state = TASK_RUNNING; -+ -+ rq = cpu_rq(select_task_rq(p, this_rq())); -+#ifdef CONFIG_SMP -+ rseq_migrate(p); -+ /* -+ * Fork balancing, do it here and not earlier because: -+ * - cpus_ptr can change in the fork path -+ * - any previously selected CPU might disappear through hotplug -+ * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, -+ * as we're not fully set-up yet. 
-+ */ -+ __set_task_cpu(p, cpu_of(rq)); -+#endif -+ -+ raw_spin_lock(&rq->lock); -+ -+ update_rq_clock(rq); -+ activate_task(p, rq); -+ trace_sched_wakeup_new(p); -+ check_preempt_curr(rq); -+ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+} -+ -+#ifdef CONFIG_PREEMPT_NOTIFIERS -+ -+static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); -+ -+void preempt_notifier_inc(void) -+{ -+ static_branch_inc(&preempt_notifier_key); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_inc); -+ -+void preempt_notifier_dec(void) -+{ -+ static_branch_dec(&preempt_notifier_key); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_dec); -+ -+/** -+ * preempt_notifier_register - tell me when current is being preempted & rescheduled -+ * @notifier: notifier struct to register -+ */ -+void preempt_notifier_register(struct preempt_notifier *notifier) -+{ -+ if (!static_branch_unlikely(&preempt_notifier_key)) -+ WARN(1, "registering preempt_notifier while notifiers disabled\n"); -+ -+ hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_register); -+ -+/** -+ * preempt_notifier_unregister - no longer interested in preemption notifications -+ * @notifier: notifier struct to unregister -+ * -+ * This is *not* safe to call from within a preemption notifier. 
-+ */ -+void preempt_notifier_unregister(struct preempt_notifier *notifier) -+{ -+ hlist_del(¬ifier->link); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_unregister); -+ -+static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+ struct preempt_notifier *notifier; -+ -+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) -+ notifier->ops->sched_in(notifier, raw_smp_processor_id()); -+} -+ -+static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+ if (static_branch_unlikely(&preempt_notifier_key)) -+ __fire_sched_in_preempt_notifiers(curr); -+} -+ -+static void -+__fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+ struct preempt_notifier *notifier; -+ -+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) -+ notifier->ops->sched_out(notifier, next); -+} -+ -+static __always_inline void -+fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+ if (static_branch_unlikely(&preempt_notifier_key)) -+ __fire_sched_out_preempt_notifiers(curr, next); -+} -+ -+#else /* !CONFIG_PREEMPT_NOTIFIERS */ -+ -+static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+} -+ -+static inline void -+fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+} -+ -+#endif /* CONFIG_PREEMPT_NOTIFIERS */ -+ -+static inline void prepare_task(struct task_struct *next) -+{ -+ /* -+ * Claim the task as running, we do this before switching to it -+ * such that any running task will have this set. -+ */ -+ next->on_cpu = 1; -+} -+ -+static inline void finish_task(struct task_struct *prev) -+{ -+#ifdef CONFIG_SMP -+ /* -+ * After ->on_cpu is cleared, the task can be moved to a different CPU. -+ * We must ensure this doesn't happen until the switch is completely -+ * finished. -+ * -+ * In particular, the load of prev->state in finish_task_switch() must -+ * happen before this. 
-+ * -+ * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). -+ */ -+ smp_store_release(&prev->on_cpu, 0); -+#else -+ prev->on_cpu = 0; -+#endif -+} -+ -+static inline void -+prepare_lock_switch(struct rq *rq, struct task_struct *next) -+{ -+ /* -+ * Since the runqueue lock will be released by the next -+ * task (which is an invalid locking op but in the case -+ * of the scheduler it's an obvious special-case), so we -+ * do an early lockdep release here: -+ */ -+ spin_release(&rq->lock.dep_map, _THIS_IP_); -+#ifdef CONFIG_DEBUG_SPINLOCK -+ /* this is a valid case when another task releases the spinlock */ -+ rq->lock.owner = next; -+#endif -+} -+ -+static inline void finish_lock_switch(struct rq *rq) -+{ -+ /* -+ * If we are tracking spinlock dependencies then we have to -+ * fix up the runqueue lock - which gets 'carried over' from -+ * prev into current: -+ */ -+ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); -+ raw_spin_unlock_irq(&rq->lock); -+} -+ -+/** -+ * prepare_task_switch - prepare to switch tasks -+ * @rq: the runqueue preparing to switch -+ * @next: the task we are going to switch to. -+ * -+ * This is called with the rq lock held and interrupts off. It must -+ * be paired with a subsequent finish_task_switch after the context -+ * switch. -+ * -+ * prepare_task_switch sets up locking and calls architecture specific -+ * hooks. -+ */ -+static inline void -+prepare_task_switch(struct rq *rq, struct task_struct *prev, -+ struct task_struct *next) -+{ -+ kcov_prepare_switch(prev); -+ sched_info_switch(rq, prev, next); -+ perf_event_task_sched_out(prev, next); -+ rseq_preempt(prev); -+ fire_sched_out_preempt_notifiers(prev, next); -+ prepare_task(next); -+ prepare_arch_switch(next); -+} -+ -+/** -+ * finish_task_switch - clean up after a task-switch -+ * @rq: runqueue associated with task-switch -+ * @prev: the thread we just switched away from. 
-+ * -+ * finish_task_switch must be called after the context switch, paired -+ * with a prepare_task_switch call before the context switch. -+ * finish_task_switch will reconcile locking set up by prepare_task_switch, -+ * and do any other architecture-specific cleanup actions. -+ * -+ * Note that we may have delayed dropping an mm in context_switch(). If -+ * so, we finish that here outside of the runqueue lock. (Doing it -+ * with the lock held can cause deadlocks; see schedule() for -+ * details.) -+ * -+ * The context switch have flipped the stack from under us and restored the -+ * local variables which were saved when this task called schedule() in the -+ * past. prev == current is still correct but we need to recalculate this_rq -+ * because prev may have moved to another CPU. -+ */ -+static struct rq *finish_task_switch(struct task_struct *prev) -+ __releases(rq->lock) -+{ -+ struct rq *rq = this_rq(); -+ struct mm_struct *mm = rq->prev_mm; -+ long prev_state; -+ -+ /* -+ * The previous task will have left us with a preempt_count of 2 -+ * because it left us after: -+ * -+ * schedule() -+ * preempt_disable(); // 1 -+ * __schedule() -+ * raw_spin_lock_irq(&rq->lock) // 2 -+ * -+ * Also, see FORK_PREEMPT_COUNT. -+ */ -+ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, -+ "corrupted preempt_count: %s/%d/0x%x\n", -+ current->comm, current->pid, preempt_count())) -+ preempt_count_set(FORK_PREEMPT_COUNT); -+ -+ rq->prev_mm = NULL; -+ -+ /* -+ * A task struct has one reference for the use as "current". -+ * If a task dies, then it sets TASK_DEAD in tsk->state and calls -+ * schedule one last time. The schedule call will never return, and -+ * the scheduled task must drop that reference. -+ * -+ * We must observe prev->state before clearing prev->on_cpu (in -+ * finish_task), otherwise a concurrent wakeup can get prev -+ * running on another CPU and we could rave with its RUNNING -> DEAD -+ * transition, resulting in a double drop. 
-+ */ -+ prev_state = prev->state; -+ vtime_task_switch(prev); -+ perf_event_task_sched_in(prev, current); -+ finish_task(prev); -+ finish_lock_switch(rq); -+ finish_arch_post_lock_switch(); -+ kcov_finish_switch(current); -+ -+ fire_sched_in_preempt_notifiers(current); -+ /* -+ * When switching through a kernel thread, the loop in -+ * membarrier_{private,global}_expedited() may have observed that -+ * kernel thread and not issued an IPI. It is therefore possible to -+ * schedule between user->kernel->user threads without passing though -+ * switch_mm(). Membarrier requires a barrier after storing to -+ * rq->curr, before returning to userspace, so provide them here: -+ * -+ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly -+ * provided by mmdrop(), -+ * - a sync_core for SYNC_CORE. -+ */ -+ if (mm) { -+ membarrier_mm_sync_core_before_usermode(mm); -+ mmdrop(mm); -+ } -+ if (unlikely(prev_state == TASK_DEAD)) { -+ /* -+ * Remove function-return probe instances associated with this -+ * task and put them back on the free list. -+ */ -+ kprobe_flush_task(prev); -+ -+ /* Task is done with its stack. */ -+ put_task_stack(prev); -+ -+ put_task_struct_rcu_user(prev); -+ } -+ -+ tick_nohz_task_switch(); -+ return rq; -+} -+ -+/** -+ * schedule_tail - first thing a freshly forked thread must call. -+ * @prev: the thread we just switched away from. -+ */ -+asmlinkage __visible void schedule_tail(struct task_struct *prev) -+ __releases(rq->lock) -+{ -+ struct rq *rq; -+ -+ /* -+ * New tasks start with FORK_PREEMPT_COUNT, see there and -+ * finish_task_switch() for details. -+ * -+ * finish_task_switch() will drop rq->lock() and lower preempt_count -+ * and the preempt_enable() will end up enabling preemption (on -+ * PREEMPT_COUNT kernels). 
-+ */ -+ -+ rq = finish_task_switch(prev); -+ preempt_enable(); -+ -+ if (current->set_child_tid) -+ put_user(task_pid_vnr(current), current->set_child_tid); -+ -+ calculate_sigpending(); -+} -+ -+/* -+ * context_switch - switch to the new MM and the new thread's register state. -+ */ -+static __always_inline struct rq * -+context_switch(struct rq *rq, struct task_struct *prev, -+ struct task_struct *next) -+{ -+ prepare_task_switch(rq, prev, next); -+ -+ /* -+ * For paravirt, this is coupled with an exit in switch_to to -+ * combine the page table reload and the switch backend into -+ * one hypercall. -+ */ -+ arch_start_context_switch(prev); -+ -+ /* -+ * kernel -> kernel lazy + transfer active -+ * user -> kernel lazy + mmgrab() active -+ * -+ * kernel -> user switch + mmdrop() active -+ * user -> user switch -+ */ -+ if (!next->mm) { // to kernel -+ enter_lazy_tlb(prev->active_mm, next); -+ -+ next->active_mm = prev->active_mm; -+ if (prev->mm) // from user -+ mmgrab(prev->active_mm); -+ else -+ prev->active_mm = NULL; -+ } else { // to user -+ membarrier_switch_mm(rq, prev->active_mm, next->mm); -+ /* -+ * sys_membarrier() requires an smp_mb() between setting -+ * rq->curr / membarrier_switch_mm() and returning to userspace. -+ * -+ * The below provides this either through switch_mm(), or in -+ * case 'prev->active_mm == next->mm' through -+ * finish_task_switch()'s mmdrop(). -+ */ -+ switch_mm_irqs_off(prev->active_mm, next->mm, next); -+ -+ if (!prev->mm) { // from kernel -+ /* will mmdrop() in finish_task_switch(). */ -+ rq->prev_mm = prev->active_mm; -+ prev->active_mm = NULL; -+ } -+ } -+ -+ prepare_lock_switch(rq, next); -+ -+ /* Here we just switch the register state and the stack. 
*/ -+ switch_to(prev, next, prev); -+ barrier(); -+ -+ return finish_task_switch(prev); -+} -+ -+/* -+ * nr_running, nr_uninterruptible and nr_context_switches: -+ * -+ * externally visible scheduler statistics: current number of runnable -+ * threads, total number of context switches performed since bootup. -+ */ -+unsigned long nr_running(void) -+{ -+ unsigned long i, sum = 0; -+ -+ for_each_online_cpu(i) -+ sum += cpu_rq(i)->nr_running; -+ -+ return sum; -+} -+ -+/* -+ * Check if only the current task is running on the CPU. -+ * -+ * Caution: this function does not check that the caller has disabled -+ * preemption, thus the result might have a time-of-check-to-time-of-use -+ * race. The caller is responsible to use it correctly, for example: -+ * -+ * - from a non-preemptible section (of course) -+ * -+ * - from a thread that is bound to a single CPU -+ * -+ * - in a loop with very short iterations (e.g. a polling loop) -+ */ -+bool single_task_running(void) -+{ -+ return raw_rq()->nr_running == 1; -+} -+EXPORT_SYMBOL(single_task_running); -+ -+unsigned long long nr_context_switches(void) -+{ -+ int i; -+ unsigned long long sum = 0; -+ -+ for_each_possible_cpu(i) -+ sum += cpu_rq(i)->nr_switches; -+ -+ return sum; -+} -+ -+/* -+ * Consumers of these two interfaces, like for example the cpuidle menu -+ * governor, are using nonsensical data. Preferring shallow idle state selection -+ * for a CPU that has IO-wait which might not even end up running the task when -+ * it does become runnable. -+ */ -+ -+unsigned long nr_iowait_cpu(int cpu) -+{ -+ return atomic_read(&cpu_rq(cpu)->nr_iowait); -+} -+ -+/* -+ * IO-wait accounting, and how its mostly bollocks (on SMP). -+ * -+ * The idea behind IO-wait account is to account the idle time that we could -+ * have spend running if it were not for IO. That is, if we were to improve the -+ * storage performance, we'd have a proportional reduction in IO-wait time. 
-+ * -+ * This all works nicely on UP, where, when a task blocks on IO, we account -+ * idle time as IO-wait, because if the storage were faster, it could've been -+ * running and we'd not be idle. -+ * -+ * This has been extended to SMP, by doing the same for each CPU. This however -+ * is broken. -+ * -+ * Imagine for instance the case where two tasks block on one CPU, only the one -+ * CPU will have IO-wait accounted, while the other has regular idle. Even -+ * though, if the storage were faster, both could've ran at the same time, -+ * utilising both CPUs. -+ * -+ * This means, that when looking globally, the current IO-wait accounting on -+ * SMP is a lower bound, by reason of under accounting. -+ * -+ * Worse, since the numbers are provided per CPU, they are sometimes -+ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly -+ * associated with any one particular CPU, it can wake to another CPU than it -+ * blocked on. This means the per CPU IO-wait number is meaningless. -+ * -+ * Task CPU affinities can make all that even more 'interesting'. -+ */ -+ -+unsigned long nr_iowait(void) -+{ -+ unsigned long i, sum = 0; -+ -+ for_each_possible_cpu(i) -+ sum += nr_iowait_cpu(i); -+ -+ return sum; -+} -+ -+#ifdef CONFIG_SMP -+ -+/* -+ * sched_exec - execve() is a valuable balancing opportunity, because at -+ * this point the task has the smallest effective memory and cache -+ * footprint. 
-+ */ -+void sched_exec(void) -+{ -+ struct task_struct *p = current; -+ unsigned long flags; -+ int dest_cpu; -+ struct rq *rq; -+ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ rq = this_rq(); -+ -+ if (rq != task_rq(p) || rq->nr_running < 2) -+ goto unlock; -+ -+ dest_cpu = select_task_rq(p, task_rq(p)); -+ if (dest_cpu == smp_processor_id()) -+ goto unlock; -+ -+ if (likely(cpu_active(dest_cpu))) { -+ struct migration_arg arg = { p, dest_cpu }; -+ -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); -+ return; -+ } -+unlock: -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+} -+ -+#endif -+ -+DEFINE_PER_CPU(struct kernel_stat, kstat); -+DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); -+ -+EXPORT_PER_CPU_SYMBOL(kstat); -+EXPORT_PER_CPU_SYMBOL(kernel_cpustat); -+ -+static inline void update_curr(struct rq *rq, struct task_struct *p) -+{ -+ s64 ns = rq->clock_task - p->last_ran; -+ -+ p->sched_time += ns; -+ account_group_exec_runtime(p, ns); -+ -+ p->time_slice -= ns; -+ p->last_ran = rq->clock_task; -+} -+ -+/* -+ * Return accounted runtime for the task. -+ * Return separately the current's pending runtime that have not been -+ * accounted yet. -+ */ -+unsigned long long task_sched_runtime(struct task_struct *p) -+{ -+ unsigned long flags; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ u64 ns; -+ -+#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) -+ /* -+ * 64-bit doesn't need locks to atomically read a 64-bit value. -+ * So we have a optimization chance when the task's delta_exec is 0. -+ * Reading ->on_cpu is racy, but this is ok. -+ * -+ * If we race with it leaving CPU, we'll take a lock. So we're correct. -+ * If we race with it entering CPU, unaccounted time is 0. This is -+ * indistinguishable from the read occurring a few cycles earlier. -+ * If we see ->on_cpu without ->on_rq, the task is leaving, and has -+ * been accounted, so we're correct here as well. 
-+ */ -+ if (!p->on_cpu || !task_on_rq_queued(p)) -+ return tsk_seruntime(p); -+#endif -+ -+ rq = task_access_lock_irqsave(p, &lock, &flags); -+ /* -+ * Must be ->curr _and_ ->on_rq. If dequeued, we would -+ * project cycles that may never be accounted to this -+ * thread, breaking clock_gettime(). -+ */ -+ if (p == rq->curr && task_on_rq_queued(p)) { -+ update_rq_clock(rq); -+ update_curr(rq, p); -+ } -+ ns = tsk_seruntime(p); -+ task_access_unlock_irqrestore(p, lock, &flags); -+ -+ return ns; -+} -+ -+DEFINE_PER_CPU(unsigned long, thermal_pressure); -+ -+void arch_set_thermal_pressure(struct cpumask *cpus, -+ unsigned long th_pressure) -+{ -+ int cpu; -+ -+ for_each_cpu(cpu, cpus) -+ WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure); -+} -+ -+/* This manages tasks that have run out of timeslice during a scheduler_tick */ -+static inline void scheduler_task_tick(struct rq *rq) -+{ -+ struct task_struct *p = rq->curr; -+ -+ if (is_idle_task(p)) -+ return; -+ -+ update_curr(rq, p); -+ cpufreq_update_util(rq, 0); -+ -+ /* -+ * Tasks have less than RESCHED_NS of time slice left they will be -+ * rescheduled. -+ */ -+ if (p->time_slice >= RESCHED_NS) -+ return; -+ set_tsk_need_resched(p); -+ set_preempt_need_resched(); -+} -+ -+/* -+ * This function gets called by the timer code, with HZ frequency. -+ * We call it with interrupts disabled. 
-+ */ -+void scheduler_tick(void) -+{ -+ int cpu __maybe_unused = smp_processor_id(); -+ struct rq *rq = cpu_rq(cpu); -+ -+ arch_scale_freq_tick(); -+ sched_clock_tick(); -+ -+ raw_spin_lock(&rq->lock); -+ update_rq_clock(rq); -+ -+ scheduler_task_tick(rq); -+ calc_global_load_tick(rq); -+ psi_task_tick(rq); -+ -+ rq->last_tick = rq->clock; -+ raw_spin_unlock(&rq->lock); -+ -+ perf_event_task_tick(); -+} -+ -+#ifdef CONFIG_SCHED_SMT -+static inline int active_load_balance_cpu_stop(void *data) -+{ -+ struct rq *rq = this_rq(); -+ struct task_struct *p = data; -+ cpumask_t tmp; -+ unsigned long flags; -+ -+ local_irq_save(flags); -+ -+ raw_spin_lock(&p->pi_lock); -+ raw_spin_lock(&rq->lock); -+ -+ rq->active_balance = 0; -+ /* _something_ may have changed the task, double check again */ -+ if (task_on_rq_queued(p) && task_rq(p) == rq && -+ cpumask_and(&tmp, p->cpus_ptr, &sched_sg_idle_mask)) { -+ int cpu = cpu_of(rq); -+ int dcpu = __best_mask_cpu(cpu, &tmp, -+ per_cpu(sched_cpu_llc_mask, cpu)); -+ rq = move_queued_task(rq, p, dcpu); -+ } -+ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock(&p->pi_lock); -+ -+ local_irq_restore(flags); -+ -+ return 0; -+} -+ -+/* sg_balance_trigger - trigger slibing group balance for @cpu */ -+static inline int sg_balance_trigger(const int cpu) -+{ -+ struct rq *rq= cpu_rq(cpu); -+ unsigned long flags; -+ struct task_struct *curr; -+ int res; -+ -+ if (!raw_spin_trylock_irqsave(&rq->lock, flags)) -+ return 0; -+ curr = rq->curr; -+ res = (!is_idle_task(curr)) && (1 == rq->nr_running) &&\ -+ cpumask_intersects(curr->cpus_ptr, &sched_sg_idle_mask) &&\ -+ (!rq->active_balance); -+ -+ if (res) -+ rq->active_balance = 1; -+ -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+ if (res) -+ stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, -+ curr, &rq->active_balance_work); -+ return res; -+} -+ -+/* -+ * sg_balance_check - slibing group balance check for run queue @rq -+ */ -+static inline void sg_balance_check(struct rq *rq) -+{ -+ 
cpumask_t chk; -+ int cpu; -+ -+ /* exit when no sg in idle */ -+ if (cpumask_empty(&sched_sg_idle_mask)) -+ return; -+ -+ cpu = cpu_of(rq); -+ /* -+ * Only cpu in slibing idle group will do the checking and then -+ * find potential cpus which can migrate the current running task -+ */ -+ if (cpumask_test_cpu(cpu, &sched_sg_idle_mask) && -+ cpumask_andnot(&chk, cpu_online_mask, &sched_rq_pending_mask) && -+ cpumask_andnot(&chk, &chk, &sched_rq_watermark[IDLE_WM])) { -+ int i, tried = 0; -+ -+ for_each_cpu_wrap(i, &chk, cpu) { -+ if (cpumask_subset(cpu_smt_mask(i), &chk)) { -+ if (sg_balance_trigger(i)) -+ return; -+ if (tried) -+ return; -+ tried++; -+ } -+ } -+ } -+} -+#endif /* CONFIG_SCHED_SMT */ -+ -+#ifdef CONFIG_NO_HZ_FULL -+ -+struct tick_work { -+ int cpu; -+ atomic_t state; -+ struct delayed_work work; -+}; -+/* Values for ->state, see diagram below. */ -+#define TICK_SCHED_REMOTE_OFFLINE 0 -+#define TICK_SCHED_REMOTE_OFFLINING 1 -+#define TICK_SCHED_REMOTE_RUNNING 2 -+ -+/* -+ * State diagram for ->state: -+ * -+ * -+ * TICK_SCHED_REMOTE_OFFLINE -+ * | ^ -+ * | | -+ * | | sched_tick_remote() -+ * | | -+ * | | -+ * +--TICK_SCHED_REMOTE_OFFLINING -+ * | ^ -+ * | | -+ * sched_tick_start() | | sched_tick_stop() -+ * | | -+ * V | -+ * TICK_SCHED_REMOTE_RUNNING -+ * -+ * -+ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() -+ * and sched_tick_start() are happy to leave the state in RUNNING. -+ */ -+ -+static struct tick_work __percpu *tick_work_cpu; -+ -+static void sched_tick_remote(struct work_struct *work) -+{ -+ struct delayed_work *dwork = to_delayed_work(work); -+ struct tick_work *twork = container_of(dwork, struct tick_work, work); -+ int cpu = twork->cpu; -+ struct rq *rq = cpu_rq(cpu); -+ struct task_struct *curr; -+ unsigned long flags; -+ u64 delta; -+ int os; -+ -+ /* -+ * Handle the tick only if it appears the remote CPU is running in full -+ * dynticks mode. 
The check is racy by nature, but missing a tick or -+ * having one too much is no big deal because the scheduler tick updates -+ * statistics and checks timeslices in a time-independent way, regardless -+ * of when exactly it is running. -+ */ -+ if (!tick_nohz_tick_stopped_cpu(cpu)) -+ goto out_requeue; -+ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ curr = rq->curr; -+ if (cpu_is_offline(cpu)) -+ goto out_unlock; -+ -+ update_rq_clock(rq); -+ if (!is_idle_task(curr)) { -+ /* -+ * Make sure the next tick runs within a reasonable -+ * amount of time. -+ */ -+ delta = rq_clock_task(rq) - curr->last_ran; -+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); -+ } -+ scheduler_task_tick(rq); -+ -+ calc_load_nohz_remote(rq); -+out_unlock: -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+out_requeue: -+ /* -+ * Run the remote tick once per second (1Hz). This arbitrary -+ * frequency is large enough to avoid overload but short enough -+ * to keep scheduler internal stats reasonably up to date. But -+ * first update state to reflect hotplug activity if required. 
-+ */ -+ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); -+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); -+ if (os == TICK_SCHED_REMOTE_RUNNING) -+ queue_delayed_work(system_unbound_wq, dwork, HZ); -+} -+ -+static void sched_tick_start(int cpu) -+{ -+ int os; -+ struct tick_work *twork; -+ -+ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) -+ return; -+ -+ WARN_ON_ONCE(!tick_work_cpu); -+ -+ twork = per_cpu_ptr(tick_work_cpu, cpu); -+ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); -+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); -+ if (os == TICK_SCHED_REMOTE_OFFLINE) { -+ twork->cpu = cpu; -+ INIT_DELAYED_WORK(&twork->work, sched_tick_remote); -+ queue_delayed_work(system_unbound_wq, &twork->work, HZ); -+ } -+} -+ -+#ifdef CONFIG_HOTPLUG_CPU -+static void sched_tick_stop(int cpu) -+{ -+ struct tick_work *twork; -+ -+ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) -+ return; -+ -+ WARN_ON_ONCE(!tick_work_cpu); -+ -+ twork = per_cpu_ptr(tick_work_cpu, cpu); -+ cancel_delayed_work_sync(&twork->work); -+} -+#endif /* CONFIG_HOTPLUG_CPU */ -+ -+int __init sched_tick_offload_init(void) -+{ -+ tick_work_cpu = alloc_percpu(struct tick_work); -+ BUG_ON(!tick_work_cpu); -+ return 0; -+} -+ -+#else /* !CONFIG_NO_HZ_FULL */ -+static inline void sched_tick_start(int cpu) { } -+static inline void sched_tick_stop(int cpu) { } -+#endif -+ -+#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ -+ defined(CONFIG_PREEMPT_TRACER)) -+/* -+ * If the value passed in is equal to the current preempt count -+ * then we just disabled preemption. Start timing the latency. -+ */ -+static inline void preempt_latency_start(int val) -+{ -+ if (preempt_count() == val) { -+ unsigned long ip = get_lock_parent_ip(); -+#ifdef CONFIG_DEBUG_PREEMPT -+ current->preempt_disable_ip = ip; -+#endif -+ trace_preempt_off(CALLER_ADDR0, ip); -+ } -+} -+ -+void preempt_count_add(int val) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Underflow? 
-+ */ -+ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) -+ return; -+#endif -+ __preempt_count_add(val); -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Spinlock count overflowing soon? -+ */ -+ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= -+ PREEMPT_MASK - 10); -+#endif -+ preempt_latency_start(val); -+} -+EXPORT_SYMBOL(preempt_count_add); -+NOKPROBE_SYMBOL(preempt_count_add); -+ -+/* -+ * If the value passed in equals to the current preempt count -+ * then we just enabled preemption. Stop timing the latency. -+ */ -+static inline void preempt_latency_stop(int val) -+{ -+ if (preempt_count() == val) -+ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); -+} -+ -+void preempt_count_sub(int val) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Underflow? -+ */ -+ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) -+ return; -+ /* -+ * Is the spinlock portion underflowing? -+ */ -+ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && -+ !(preempt_count() & PREEMPT_MASK))) -+ return; -+#endif -+ -+ preempt_latency_stop(val); -+ __preempt_count_sub(val); -+} -+EXPORT_SYMBOL(preempt_count_sub); -+NOKPROBE_SYMBOL(preempt_count_sub); -+ -+#else -+static inline void preempt_latency_start(int val) { } -+static inline void preempt_latency_stop(int val) { } -+#endif -+ -+static inline unsigned long get_preempt_disable_ip(struct task_struct *p) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ return p->preempt_disable_ip; -+#else -+ return 0; -+#endif -+} -+ -+/* -+ * Print scheduling while atomic bug: -+ */ -+static noinline void __schedule_bug(struct task_struct *prev) -+{ -+ /* Save this before calling printk(), since that will clobber it */ -+ unsigned long preempt_disable_ip = get_preempt_disable_ip(current); -+ -+ if (oops_in_progress) -+ return; -+ -+ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", -+ prev->comm, prev->pid, preempt_count()); -+ -+ debug_show_held_locks(prev); -+ print_modules(); -+ if (irqs_disabled()) -+ print_irqtrace_events(prev); -+ if 
(IS_ENABLED(CONFIG_DEBUG_PREEMPT) -+ && in_atomic_preempt_off()) { -+ pr_err("Preemption disabled at:"); -+ print_ip_sym(KERN_ERR, preempt_disable_ip); -+ } -+ if (panic_on_warn) -+ panic("scheduling while atomic\n"); -+ -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+ -+/* -+ * Various schedule()-time debugging checks and statistics: -+ */ -+static inline void schedule_debug(struct task_struct *prev, bool preempt) -+{ -+#ifdef CONFIG_SCHED_STACK_END_CHECK -+ if (task_stack_end_corrupted(prev)) -+ panic("corrupted stack end detected inside scheduler\n"); -+ -+ if (task_scs_end_corrupted(prev)) -+ panic("corrupted shadow stack detected inside scheduler\n"); -+#endif -+ -+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP -+ if (!preempt && prev->state && prev->non_block_count) { -+ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", -+ prev->comm, prev->pid, prev->non_block_count); -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+ } -+#endif -+ -+ if (unlikely(in_atomic_preempt_off())) { -+ __schedule_bug(prev); -+ preempt_count_set(PREEMPT_DISABLED); -+ } -+ rcu_sleep_check(); -+ -+ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); -+ -+ schedstat_inc(this_rq()->sched_count); -+} -+ -+/* -+ * Compile time debug macro -+ * #define ALT_SCHED_DEBUG -+ */ -+ -+#ifdef ALT_SCHED_DEBUG -+void alt_sched_debug(void) -+{ -+ printk(KERN_INFO "sched: pending: 0x%04lx, idle: 0x%04lx, sg_idle: 0x%04lx\n", -+ sched_rq_pending_mask.bits[0], -+ sched_rq_watermark[IDLE_WM].bits[0], -+ sched_sg_idle_mask.bits[0]); -+} -+#else -+inline void alt_sched_debug(void) {} -+#endif -+ -+#ifdef CONFIG_SMP -+ -+#define SCHED_RQ_NR_MIGRATION (32UL) -+/* -+ * Migrate pending tasks in @rq to @dest_cpu -+ * Will try to migrate mininal of half of @rq nr_running tasks and -+ * SCHED_RQ_NR_MIGRATION to @dest_cpu -+ */ -+static inline int -+migrate_pending_tasks(struct rq *rq, struct rq *dest_rq, const int dest_cpu) -+{ -+ struct task_struct *p, *skip = 
rq->curr; -+ int nr_migrated = 0; -+ int nr_tries = min(rq->nr_running / 2, SCHED_RQ_NR_MIGRATION); -+ -+ while (skip != rq->idle && nr_tries && -+ (p = sched_rq_next_task(skip, rq)) != rq->idle) { -+ skip = sched_rq_next_task(p, rq); -+ if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) { -+ __SCHED_DEQUEUE_TASK(p, rq, 0, ); -+ set_task_cpu(p, dest_cpu); -+ __SCHED_ENQUEUE_TASK(p, dest_rq, 0); -+ nr_migrated++; -+ } -+ nr_tries--; -+ } -+ -+ return nr_migrated; -+} -+ -+static inline int take_other_rq_tasks(struct rq *rq, int cpu) -+{ -+ struct cpumask *affinity_mask, *end_mask; -+ -+ if (unlikely(!rq->online)) -+ return 0; -+ -+ if (cpumask_empty(&sched_rq_pending_mask)) -+ return 0; -+ -+ affinity_mask = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); -+ end_mask = per_cpu(sched_cpu_affinity_end_mask, cpu); -+ do { -+ int i; -+ for_each_cpu_and(i, &sched_rq_pending_mask, affinity_mask) { -+ int nr_migrated; -+ struct rq *src_rq; -+ -+ src_rq = cpu_rq(i); -+ if (!do_raw_spin_trylock(&src_rq->lock)) -+ continue; -+ spin_acquire(&src_rq->lock.dep_map, -+ SINGLE_DEPTH_NESTING, 1, _RET_IP_); -+ -+ if ((nr_migrated = migrate_pending_tasks(src_rq, rq, cpu))) { -+ src_rq->nr_running -= nr_migrated; -+#ifdef CONFIG_SMP -+ if (src_rq->nr_running < 2) -+ cpumask_clear_cpu(i, &sched_rq_pending_mask); -+#endif -+ rq->nr_running += nr_migrated; -+#ifdef CONFIG_SMP -+ if (rq->nr_running > 1) -+ cpumask_set_cpu(cpu, &sched_rq_pending_mask); -+#endif -+ update_sched_rq_watermark(rq); -+ cpufreq_update_util(rq, 0); -+ -+ spin_release(&src_rq->lock.dep_map, _RET_IP_); -+ do_raw_spin_unlock(&src_rq->lock); -+ -+ return 1; -+ } -+ -+ spin_release(&src_rq->lock.dep_map, _RET_IP_); -+ do_raw_spin_unlock(&src_rq->lock); -+ } -+ } while (++affinity_mask < end_mask); -+ -+ return 0; -+} -+#endif -+ -+/* -+ * Timeslices below RESCHED_NS are considered as good as expired as there's no -+ * point rescheduling when there's so little time left. 
-+ */ -+static inline void check_curr(struct task_struct *p, struct rq *rq) -+{ -+ if (unlikely(rq->idle == p)) -+ return; -+ -+ update_curr(rq, p); -+ -+ if (p->time_slice < RESCHED_NS) -+ time_slice_expired(p, rq); -+} -+ -+static inline struct task_struct * -+choose_next_task(struct rq *rq, int cpu, struct task_struct *prev) -+{ -+ struct task_struct *next; -+ -+ if (unlikely(rq->skip)) { -+ next = rq_runnable_task(rq); -+ if (next == rq->idle) { -+#ifdef CONFIG_SMP -+ if (!take_other_rq_tasks(rq, cpu)) { -+#endif -+ rq->skip = NULL; -+ schedstat_inc(rq->sched_goidle); -+ return next; -+#ifdef CONFIG_SMP -+ } -+ next = rq_runnable_task(rq); -+#endif -+ } -+ rq->skip = NULL; -+#ifdef CONFIG_HIGH_RES_TIMERS -+ hrtick_start(rq, next->time_slice); -+#endif -+ return next; -+ } -+ -+ next = sched_rq_first_task(rq); -+ if (next == rq->idle) { -+#ifdef CONFIG_SMP -+ if (!take_other_rq_tasks(rq, cpu)) { -+#endif -+ schedstat_inc(rq->sched_goidle); -+ /*printk(KERN_INFO "sched: choose_next_task(%d) idle %px\n", cpu, next);*/ -+ return next; -+#ifdef CONFIG_SMP -+ } -+ next = sched_rq_first_task(rq); -+#endif -+ } -+#ifdef CONFIG_HIGH_RES_TIMERS -+ hrtick_start(rq, next->time_slice); -+#endif -+ /*printk(KERN_INFO "sched: choose_next_task(%d) next %px\n", cpu, -+ * next);*/ -+ return next; -+} -+ -+/* -+ * schedule() is the main scheduler function. -+ * -+ * The main means of driving the scheduler and thus entering this function are: -+ * -+ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. -+ * -+ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return -+ * paths. For example, see arch/x86/entry_64.S. -+ * -+ * To drive preemption between tasks, the scheduler sets the flag in timer -+ * interrupt handler scheduler_tick(). -+ * -+ * 3. Wakeups don't really cause entry into schedule(). They add a -+ * task to the run-queue and that's it. 
-+ * -+ * Now, if the new task added to the run-queue preempts the current -+ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets -+ * called on the nearest possible occasion: -+ * -+ * - If the kernel is preemptible (CONFIG_PREEMPTION=y): -+ * -+ * - in syscall or exception context, at the next outmost -+ * preempt_enable(). (this might be as soon as the wake_up()'s -+ * spin_unlock()!) -+ * -+ * - in IRQ context, return from interrupt-handler to -+ * preemptible context -+ * -+ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set) -+ * then at the next: -+ * -+ * - cond_resched() call -+ * - explicit schedule() call -+ * - return from syscall or exception to user-space -+ * - return from interrupt-handler to user-space -+ * -+ * WARNING: must be called with preemption disabled! -+ */ -+static void __sched notrace __schedule(bool preempt) -+{ -+ struct task_struct *prev, *next; -+ unsigned long *switch_count; -+ unsigned long prev_state; -+ struct rq *rq; -+ int cpu; -+ -+ cpu = smp_processor_id(); -+ rq = cpu_rq(cpu); -+ prev = rq->curr; -+ -+ schedule_debug(prev, preempt); -+ -+ /* by passing sched_feat(HRTICK) checking which Alt schedule FW doesn't support */ -+ hrtick_clear(rq); -+ -+ local_irq_disable(); -+ rcu_note_context_switch(preempt); -+ -+ /* -+ * Make sure that signal_pending_state()->signal_pending() below -+ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) -+ * done by the caller to avoid the race with signal_wake_up(): -+ * -+ * __set_current_state(@state) signal_wake_up() -+ * schedule() set_tsk_thread_flag(p, TIF_SIGPENDING) -+ * wake_up_state(p, state) -+ * LOCK rq->lock LOCK p->pi_state -+ * smp_mb__after_spinlock() smp_mb__after_spinlock() -+ * if (signal_pending_state()) if (p->state & @state) -+ * -+ * Also, the membarrier system call requires a full memory barrier -+ * after coming from user-space, before storing to rq->curr. 
-+ */ -+ raw_spin_lock(&rq->lock); -+ smp_mb__after_spinlock(); -+ -+ update_rq_clock(rq); -+ -+ switch_count = &prev->nivcsw; -+ /* -+ * We must load prev->state once (task_struct::state is volatile), such -+ * that: -+ * -+ * - we form a control dependency vs deactivate_task() below. -+ * - ptrace_{,un}freeze_traced() can change ->state underneath us. -+ */ -+ prev_state = prev->state; -+ if (!preempt && prev_state && prev_state == prev->state) { -+ if (signal_pending_state(prev_state, prev)) { -+ prev->state = TASK_RUNNING; -+ } else { -+ prev->sched_contributes_to_load = -+ (prev_state & TASK_UNINTERRUPTIBLE) && -+ !(prev_state & TASK_NOLOAD) && -+ !(prev->flags & PF_FROZEN); -+ -+ if (prev->sched_contributes_to_load) -+ rq->nr_uninterruptible++; -+ -+ /* -+ * __schedule() ttwu() -+ * prev_state = prev->state; if (p->on_rq && ...) -+ * if (prev_state) goto out; -+ * p->on_rq = 0; smp_acquire__after_ctrl_dep(); -+ * p->state = TASK_WAKING -+ * -+ * Where __schedule() and ttwu() have matching control dependencies. -+ * -+ * After this, schedule() must not care about p->state any more. -+ */ -+ sched_task_deactivate(prev, rq); -+ deactivate_task(prev, rq); -+ -+ if (prev->in_iowait) { -+ atomic_inc(&rq->nr_iowait); -+ delayacct_blkio_start(); -+ } -+ } -+ switch_count = &prev->nvcsw; -+ } -+ -+ clear_tsk_need_resched(prev); -+ clear_preempt_need_resched(); -+ -+ check_curr(prev, rq); -+ -+ next = choose_next_task(rq, cpu, prev); -+ -+ if (likely(prev != next)) { -+ next->last_ran = rq->clock_task; -+ rq->last_ts_switch = rq->clock; -+ -+ rq->nr_switches++; -+ /* -+ * RCU users of rcu_dereference(rq->curr) may not see -+ * changes to task_struct made by pick_next_task(). -+ */ -+ RCU_INIT_POINTER(rq->curr, next); -+ /* -+ * The membarrier system call requires each architecture -+ * to have a full memory barrier after updating -+ * rq->curr, before returning to user-space. 
-+ * -+ * Here are the schemes providing that barrier on the -+ * various architectures: -+ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. -+ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. -+ * - finish_lock_switch() for weakly-ordered -+ * architectures where spin_unlock is a full barrier, -+ * - switch_to() for arm64 (weakly-ordered, spin_unlock -+ * is a RELEASE barrier), -+ */ -+ ++*switch_count; -+ -+ psi_sched_switch(prev, next, !task_on_rq_queued(prev)); -+ -+ trace_sched_switch(preempt, prev, next); -+ -+ /* Also unlocks the rq: */ -+ rq = context_switch(rq, prev, next); -+ } else -+ raw_spin_unlock_irq(&rq->lock); -+ -+#ifdef CONFIG_SCHED_SMT -+ sg_balance_check(rq); -+#endif -+} -+ -+void __noreturn do_task_dead(void) -+{ -+ /* Causes final put_task_struct in finish_task_switch(): */ -+ set_special_state(TASK_DEAD); -+ -+ /* Tell freezer to ignore us: */ -+ current->flags |= PF_NOFREEZE; -+ -+ __schedule(false); -+ BUG(); -+ -+ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ -+ for (;;) -+ cpu_relax(); -+} -+ -+static inline void sched_submit_work(struct task_struct *tsk) -+{ -+ if (!tsk->state) -+ return; -+ -+ /* -+ * If a worker went to sleep, notify and ask workqueue whether -+ * it wants to wake up a task to maintain concurrency. -+ * As this function is called inside the schedule() context, -+ * we disable preemption to avoid it calling schedule() again -+ * in the possible wakeup of a kworker and because wq_worker_sleeping() -+ * requires it. -+ */ -+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { -+ preempt_disable(); -+ if (tsk->flags & PF_WQ_WORKER) -+ wq_worker_sleeping(tsk); -+ else -+ io_wq_worker_sleeping(tsk); -+ preempt_enable_no_resched(); -+ } -+ -+ if (tsk_is_pi_blocked(tsk)) -+ return; -+ -+ /* -+ * If we are going to sleep and we have plugged IO queued, -+ * make sure to submit it to avoid deadlocks. 
-+ */ -+ if (blk_needs_flush_plug(tsk)) -+ blk_schedule_flush_plug(tsk); -+} -+ -+static void sched_update_worker(struct task_struct *tsk) -+{ -+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { -+ if (tsk->flags & PF_WQ_WORKER) -+ wq_worker_running(tsk); -+ else -+ io_wq_worker_running(tsk); -+ } -+} -+ -+asmlinkage __visible void __sched schedule(void) -+{ -+ struct task_struct *tsk = current; -+ -+ sched_submit_work(tsk); -+ do { -+ preempt_disable(); -+ __schedule(false); -+ sched_preempt_enable_no_resched(); -+ } while (need_resched()); -+ sched_update_worker(tsk); -+} -+EXPORT_SYMBOL(schedule); -+ -+/* -+ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted -+ * state (have scheduled out non-voluntarily) by making sure that all -+ * tasks have either left the run queue or have gone into user space. -+ * As idle tasks do not do either, they must not ever be preempted -+ * (schedule out non-voluntarily). -+ * -+ * schedule_idle() is similar to schedule_preempt_disable() except that it -+ * never enables preemption because it does not call sched_submit_work(). -+ */ -+void __sched schedule_idle(void) -+{ -+ /* -+ * As this skips calling sched_submit_work(), which the idle task does -+ * regardless because that function is a nop when the task is in a -+ * TASK_RUNNING state, make sure this isn't used someplace that the -+ * current task can be in any other state. Note, idle is always in the -+ * TASK_RUNNING state. -+ */ -+ WARN_ON_ONCE(current->state); -+ do { -+ __schedule(false); -+ } while (need_resched()); -+} -+ -+#ifdef CONFIG_CONTEXT_TRACKING -+asmlinkage __visible void __sched schedule_user(void) -+{ -+ /* -+ * If we come here after a random call to set_need_resched(), -+ * or we have been woken up remotely but the IPI has not yet arrived, -+ * we haven't yet exited the RCU idle mode. Do it here manually until -+ * we find a better solution. -+ * -+ * NB: There are buggy callers of this function. 
Ideally we -+ * should warn if prev_state != CONTEXT_USER, but that will trigger -+ * too frequently to make sense yet. -+ */ -+ enum ctx_state prev_state = exception_enter(); -+ schedule(); -+ exception_exit(prev_state); -+} -+#endif -+ -+/** -+ * schedule_preempt_disabled - called with preemption disabled -+ * -+ * Returns with preemption disabled. Note: preempt_count must be 1 -+ */ -+void __sched schedule_preempt_disabled(void) -+{ -+ sched_preempt_enable_no_resched(); -+ schedule(); -+ preempt_disable(); -+} -+ -+static void __sched notrace preempt_schedule_common(void) -+{ -+ do { -+ /* -+ * Because the function tracer can trace preempt_count_sub() -+ * and it also uses preempt_enable/disable_notrace(), if -+ * NEED_RESCHED is set, the preempt_enable_notrace() called -+ * by the function tracer will call this function again and -+ * cause infinite recursion. -+ * -+ * Preemption must be disabled here before the function -+ * tracer can trace. Break up preempt_disable() into two -+ * calls. One to disable preemption without fear of being -+ * traced. The other to still record the preemption latency, -+ * which can also be traced by the function tracer. -+ */ -+ preempt_disable_notrace(); -+ preempt_latency_start(1); -+ __schedule(true); -+ preempt_latency_stop(1); -+ preempt_enable_no_resched_notrace(); -+ -+ /* -+ * Check again in case we missed a preemption opportunity -+ * between schedule and now. -+ */ -+ } while (need_resched()); -+} -+ -+#ifdef CONFIG_PREEMPTION -+/* -+ * This is the entry point to schedule() from in-kernel preemption -+ * off of preempt_enable. -+ */ -+asmlinkage __visible void __sched notrace preempt_schedule(void) -+{ -+ /* -+ * If there is a non-zero preempt_count or interrupts are disabled, -+ * we do not want to preempt the current task. Just return.. 
-+ */ -+ if (likely(!preemptible())) -+ return; -+ -+ preempt_schedule_common(); -+} -+NOKPROBE_SYMBOL(preempt_schedule); -+EXPORT_SYMBOL(preempt_schedule); -+ -+/** -+ * preempt_schedule_notrace - preempt_schedule called by tracing -+ * -+ * The tracing infrastructure uses preempt_enable_notrace to prevent -+ * recursion and tracing preempt enabling caused by the tracing -+ * infrastructure itself. But as tracing can happen in areas coming -+ * from userspace or just about to enter userspace, a preempt enable -+ * can occur before user_exit() is called. This will cause the scheduler -+ * to be called when the system is still in usermode. -+ * -+ * To prevent this, the preempt_enable_notrace will use this function -+ * instead of preempt_schedule() to exit user context if needed before -+ * calling the scheduler. -+ */ -+asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) -+{ -+ enum ctx_state prev_ctx; -+ -+ if (likely(!preemptible())) -+ return; -+ -+ do { -+ /* -+ * Because the function tracer can trace preempt_count_sub() -+ * and it also uses preempt_enable/disable_notrace(), if -+ * NEED_RESCHED is set, the preempt_enable_notrace() called -+ * by the function tracer will call this function again and -+ * cause infinite recursion. -+ * -+ * Preemption must be disabled here before the function -+ * tracer can trace. Break up preempt_disable() into two -+ * calls. One to disable preemption without fear of being -+ * traced. The other to still record the preemption latency, -+ * which can also be traced by the function tracer. -+ */ -+ preempt_disable_notrace(); -+ preempt_latency_start(1); -+ /* -+ * Needs preempt disabled in case user_exit() is traced -+ * and the tracer calls preempt_enable_notrace() causing -+ * an infinite recursion. 
-+ */ -+ prev_ctx = exception_enter(); -+ __schedule(true); -+ exception_exit(prev_ctx); -+ -+ preempt_latency_stop(1); -+ preempt_enable_no_resched_notrace(); -+ } while (need_resched()); -+} -+EXPORT_SYMBOL_GPL(preempt_schedule_notrace); -+ -+#endif /* CONFIG_PREEMPTION */ -+ -+/* -+ * This is the entry point to schedule() from kernel preemption -+ * off of irq context. -+ * Note, that this is called and return with irqs disabled. This will -+ * protect us against recursive calling from irq. -+ */ -+asmlinkage __visible void __sched preempt_schedule_irq(void) -+{ -+ enum ctx_state prev_state; -+ -+ /* Catch callers which need to be fixed */ -+ BUG_ON(preempt_count() || !irqs_disabled()); -+ -+ prev_state = exception_enter(); -+ -+ do { -+ preempt_disable(); -+ local_irq_enable(); -+ __schedule(true); -+ local_irq_disable(); -+ sched_preempt_enable_no_resched(); -+ } while (need_resched()); -+ -+ exception_exit(prev_state); -+} -+ -+int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, -+ void *key) -+{ -+ WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC); -+ return try_to_wake_up(curr->private, mode, wake_flags); -+} -+EXPORT_SYMBOL(default_wake_function); -+ -+static inline void check_task_changed(struct rq *rq, struct task_struct *p) -+{ -+ /* Trigger resched if task sched_prio has been modified. 
*/ -+ if (task_on_rq_queued(p) && sched_task_need_requeue(p, rq)) { -+ requeue_task(p, rq); -+ check_preempt_curr(rq); -+ } -+} -+ -+#ifdef CONFIG_RT_MUTEXES -+ -+static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) -+{ -+ if (pi_task) -+ prio = min(prio, pi_task->prio); -+ -+ return prio; -+} -+ -+static inline int rt_effective_prio(struct task_struct *p, int prio) -+{ -+ struct task_struct *pi_task = rt_mutex_get_top_task(p); -+ -+ return __rt_effective_prio(pi_task, prio); -+} -+ -+/* -+ * rt_mutex_setprio - set the current priority of a task -+ * @p: task to boost -+ * @pi_task: donor task -+ * -+ * This function changes the 'effective' priority of a task. It does -+ * not touch ->normal_prio like __setscheduler(). -+ * -+ * Used by the rt_mutex code to implement priority inheritance -+ * logic. Call site only calls if the priority of the task changed. -+ */ -+void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) -+{ -+ int prio; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ -+ /* XXX used to be waiter->prio, not waiter->task->prio */ -+ prio = __rt_effective_prio(pi_task, p->normal_prio); -+ -+ /* -+ * If nothing changed; bail early. -+ */ -+ if (p->pi_top_task == pi_task && prio == p->prio) -+ return; -+ -+ rq = __task_access_lock(p, &lock); -+ /* -+ * Set under pi_lock && rq->lock, such that the value can be used under -+ * either lock. -+ * -+ * Note that there is loads of tricky to make this pointer cache work -+ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to -+ * ensure a task is de-boosted (pi_task is set to NULL) before the -+ * task is allowed to run again (and can exit). This ensures the pointer -+ * points to a blocked task -- which guaratees the task is present. -+ */ -+ p->pi_top_task = pi_task; -+ -+ /* -+ * For FIFO/RR we only need to set prio, if that matches we're done. -+ */ -+ if (prio == p->prio) -+ goto out_unlock; -+ -+ /* -+ * Idle task boosting is a nono in general. 
There is one -+ * exception, when PREEMPT_RT and NOHZ is active: -+ * -+ * The idle task calls get_next_timer_interrupt() and holds -+ * the timer wheel base->lock on the CPU and another CPU wants -+ * to access the timer (probably to cancel it). We can safely -+ * ignore the boosting request, as the idle CPU runs this code -+ * with interrupts disabled and will complete the lock -+ * protected section without being interrupted. So there is no -+ * real need to boost. -+ */ -+ if (unlikely(p == rq->idle)) { -+ WARN_ON(p != rq->curr); -+ WARN_ON(p->pi_blocked_on); -+ goto out_unlock; -+ } -+ -+ trace_sched_pi_setprio(p, pi_task); -+ p->prio = prio; -+ update_task_priodl(p); -+ -+ check_task_changed(rq, p); -+out_unlock: -+ __task_access_unlock(p, lock); -+} -+#else -+static inline int rt_effective_prio(struct task_struct *p, int prio) -+{ -+ return prio; -+} -+#endif -+ -+void set_user_nice(struct task_struct *p, long nice) -+{ -+ unsigned long flags; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ -+ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) -+ return; -+ /* -+ * We have to be careful, if called from sys_setpriority(), -+ * the task might be in the middle of scheduling on another CPU. 
-+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ rq = __task_access_lock(p, &lock); -+ -+ p->static_prio = NICE_TO_PRIO(nice); -+ /* -+ * The RT priorities are set via sched_setscheduler(), but we still -+ * allow the 'normal' nice value to be set - but as expected -+ * it wont have any effect on scheduling until the task is -+ * not SCHED_NORMAL/SCHED_BATCH: -+ */ -+ if (task_has_rt_policy(p)) -+ goto out_unlock; -+ -+ p->prio = effective_prio(p); -+ update_task_priodl(p); -+ -+ check_task_changed(rq, p); -+out_unlock: -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+} -+EXPORT_SYMBOL(set_user_nice); -+ -+/* -+ * can_nice - check if a task can reduce its nice value -+ * @p: task -+ * @nice: nice value -+ */ -+int can_nice(const struct task_struct *p, const int nice) -+{ -+ /* Convert nice value [19,-20] to rlimit style value [1,40] */ -+ int nice_rlim = nice_to_rlimit(nice); -+ -+ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || -+ capable(CAP_SYS_NICE)); -+} -+ -+#ifdef __ARCH_WANT_SYS_NICE -+ -+/* -+ * sys_nice - change the priority of the current process. -+ * @increment: priority increment -+ * -+ * sys_setpriority is a more generic, but much slower function that -+ * does similar things. -+ */ -+SYSCALL_DEFINE1(nice, int, increment) -+{ -+ long nice, retval; -+ -+ /* -+ * Setpriority might change our priority at the same moment. -+ * We don't have to worry. Conceptually one call occurs first -+ * and we have a single winner. -+ */ -+ -+ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); -+ nice = task_nice(current) + increment; -+ -+ nice = clamp_val(nice, MIN_NICE, MAX_NICE); -+ if (increment < 0 && !can_nice(current, nice)) -+ return -EPERM; -+ -+ retval = security_task_setnice(current, nice); -+ if (retval) -+ return retval; -+ -+ set_user_nice(current, nice); -+ return 0; -+} -+ -+#endif -+ -+/** -+ * idle_cpu - is a given CPU idle currently? -+ * @cpu: the processor in question. 
-+ * -+ * Return: 1 if the CPU is currently idle. 0 otherwise. -+ */ -+int idle_cpu(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ if (rq->curr != rq->idle) -+ return 0; -+ -+ if (rq->nr_running) -+ return 0; -+ -+#ifdef CONFIG_SMP -+ if (rq->ttwu_pending) -+ return 0; -+#endif -+ -+ return 1; -+} -+ -+/** -+ * idle_task - return the idle task for a given CPU. -+ * @cpu: the processor in question. -+ * -+ * Return: The idle task for the cpu @cpu. -+ */ -+struct task_struct *idle_task(int cpu) -+{ -+ return cpu_rq(cpu)->idle; -+} -+ -+/** -+ * find_process_by_pid - find a process with a matching PID value. -+ * @pid: the pid in question. -+ * -+ * The task of @pid, if found. %NULL otherwise. -+ */ -+static inline struct task_struct *find_process_by_pid(pid_t pid) -+{ -+ return pid ? find_task_by_vpid(pid) : current; -+} -+ -+/* -+ * sched_setparam() passes in -1 for its policy, to let the functions -+ * it calls know not to change it. -+ */ -+#define SETPARAM_POLICY -1 -+ -+static void __setscheduler_params(struct task_struct *p, -+ const struct sched_attr *attr) -+{ -+ int policy = attr->sched_policy; -+ -+ if (policy == SETPARAM_POLICY) -+ policy = p->policy; -+ -+ p->policy = policy; -+ -+ /* -+ * allow normal nice value to be set, but will not have any -+ * effect on scheduling until the task not SCHED_NORMAL/ -+ * SCHED_BATCH -+ */ -+ p->static_prio = NICE_TO_PRIO(attr->sched_nice); -+ -+ /* -+ * __sched_setscheduler() ensures attr->sched_priority == 0 when -+ * !rt_policy. Always setting this ensures that things like -+ * getparam()/getattr() don't report silly values for !rt tasks. -+ */ -+ p->rt_priority = attr->sched_priority; -+ p->normal_prio = normal_prio(p); -+} -+ -+/* Actually do priority change: must hold rq lock. 
*/ -+static void __setscheduler(struct rq *rq, struct task_struct *p, -+ const struct sched_attr *attr, bool keep_boost) -+{ -+ __setscheduler_params(p, attr); -+ -+ /* -+ * Keep a potential priority boosting if called from -+ * sched_setscheduler(). -+ */ -+ p->prio = normal_prio(p); -+ if (keep_boost) -+ p->prio = rt_effective_prio(p, p->prio); -+ update_task_priodl(p); -+} -+ -+/* -+ * check the target process has a UID that matches the current process's -+ */ -+static bool check_same_owner(struct task_struct *p) -+{ -+ const struct cred *cred = current_cred(), *pcred; -+ bool match; -+ -+ rcu_read_lock(); -+ pcred = __task_cred(p); -+ match = (uid_eq(cred->euid, pcred->euid) || -+ uid_eq(cred->euid, pcred->uid)); -+ rcu_read_unlock(); -+ return match; -+} -+ -+static int __sched_setscheduler(struct task_struct *p, -+ const struct sched_attr *attr, -+ bool user, bool pi) -+{ -+ const struct sched_attr dl_squash_attr = { -+ .size = sizeof(struct sched_attr), -+ .sched_policy = SCHED_FIFO, -+ .sched_nice = 0, -+ .sched_priority = 99, -+ }; -+ int newprio = MAX_RT_PRIO - 1 - attr->sched_priority; -+ int retval, oldpolicy = -1; -+ int policy = attr->sched_policy; -+ unsigned long flags; -+ struct rq *rq; -+ int reset_on_fork; -+ raw_spinlock_t *lock; -+ -+ /* The pi code expects interrupts enabled */ -+ BUG_ON(pi && in_interrupt()); -+ -+ /* -+ * Alt schedule FW supports SCHED_DEADLINE by squash it as prio 0 SCHED_FIFO -+ */ -+ if (unlikely(SCHED_DEADLINE == policy)) { -+ attr = &dl_squash_attr; -+ policy = attr->sched_policy; -+ newprio = MAX_RT_PRIO - 1 - attr->sched_priority; -+ } -+recheck: -+ /* Double check policy once rq lock held */ -+ if (policy < 0) { -+ reset_on_fork = p->sched_reset_on_fork; -+ policy = oldpolicy = p->policy; -+ } else { -+ reset_on_fork = !!(attr->sched_flags & SCHED_RESET_ON_FORK); -+ -+ if (policy > SCHED_IDLE) -+ return -EINVAL; -+ } -+ -+ if (attr->sched_flags & ~(SCHED_FLAG_ALL)) -+ return -EINVAL; -+ -+ /* -+ * Valid priorities 
for SCHED_FIFO and SCHED_RR are -+ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and -+ * SCHED_BATCH and SCHED_IDLE is 0. -+ */ -+ if (attr->sched_priority < 0 || -+ (p->mm && attr->sched_priority > MAX_USER_RT_PRIO - 1) || -+ (!p->mm && attr->sched_priority > MAX_RT_PRIO - 1)) -+ return -EINVAL; -+ if ((SCHED_RR == policy || SCHED_FIFO == policy) != -+ (attr->sched_priority != 0)) -+ return -EINVAL; -+ -+ /* -+ * Allow unprivileged RT tasks to decrease priority: -+ */ -+ if (user && !capable(CAP_SYS_NICE)) { -+ if (SCHED_FIFO == policy || SCHED_RR == policy) { -+ unsigned long rlim_rtprio = -+ task_rlimit(p, RLIMIT_RTPRIO); -+ -+ /* Can't set/change the rt policy */ -+ if (policy != p->policy && !rlim_rtprio) -+ return -EPERM; -+ -+ /* Can't increase priority */ -+ if (attr->sched_priority > p->rt_priority && -+ attr->sched_priority > rlim_rtprio) -+ return -EPERM; -+ } -+ -+ /* Can't change other user's priorities */ -+ if (!check_same_owner(p)) -+ return -EPERM; -+ -+ /* Normal users shall not reset the sched_reset_on_fork flag */ -+ if (p->sched_reset_on_fork && !reset_on_fork) -+ return -EPERM; -+ } -+ -+ if (user) { -+ retval = security_task_setscheduler(p); -+ if (retval) -+ return retval; -+ } -+ -+ if (pi) -+ cpuset_read_lock(); -+ -+ /* -+ * Make sure no PI-waiters arrive (or leave) while we are -+ * changing the priority of the task: -+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ -+ /* -+ * To be able to change p->policy safely, task_access_lock() -+ * must be called. -+ * IF use task_access_lock() here: -+ * For the task p which is not running, reading rq->stop is -+ * racy but acceptable as ->stop doesn't change much. -+ * An enhancemnet can be made to read rq->stop saftly. 
-+ */ -+ rq = __task_access_lock(p, &lock); -+ -+ /* -+ * Changing the policy of the stop threads its a very bad idea -+ */ -+ if (p == rq->stop) { -+ retval = -EINVAL; -+ goto unlock; -+ } -+ -+ /* -+ * If not changing anything there's no need to proceed further: -+ */ -+ if (unlikely(policy == p->policy)) { -+ if (rt_policy(policy) && attr->sched_priority != p->rt_priority) -+ goto change; -+ if (!rt_policy(policy) && -+ NICE_TO_PRIO(attr->sched_nice) != p->static_prio) -+ goto change; -+ -+ p->sched_reset_on_fork = reset_on_fork; -+ retval = 0; -+ goto unlock; -+ } -+change: -+ -+ /* Re-check policy now with rq lock held */ -+ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { -+ policy = oldpolicy = -1; -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ if (pi) -+ cpuset_read_unlock(); -+ goto recheck; -+ } -+ -+ p->sched_reset_on_fork = reset_on_fork; -+ -+ if (pi) { -+ /* -+ * Take priority boosted tasks into account. If the new -+ * effective priority is unchanged, we just store the new -+ * normal parameters and do not touch the scheduler class and -+ * the runqueue. This will be done when the task deboost -+ * itself. 
-+ */ -+ if (rt_effective_prio(p, newprio) == p->prio) { -+ __setscheduler_params(p, attr); -+ retval = 0; -+ goto unlock; -+ } -+ } -+ -+ __setscheduler(rq, p, attr, pi); -+ -+ check_task_changed(rq, p); -+ -+ /* Avoid rq from going away on us: */ -+ preempt_disable(); -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+ if (pi) { -+ cpuset_read_unlock(); -+ rt_mutex_adjust_pi(p); -+ } -+ -+ preempt_enable(); -+ -+ return 0; -+ -+unlock: -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ if (pi) -+ cpuset_read_unlock(); -+ return retval; -+} -+ -+static int _sched_setscheduler(struct task_struct *p, int policy, -+ const struct sched_param *param, bool check) -+{ -+ struct sched_attr attr = { -+ .sched_policy = policy, -+ .sched_priority = param->sched_priority, -+ .sched_nice = PRIO_TO_NICE(p->static_prio), -+ }; -+ -+ /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ -+ if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { -+ attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; -+ policy &= ~SCHED_RESET_ON_FORK; -+ attr.sched_policy = policy; -+ } -+ -+ return __sched_setscheduler(p, &attr, check, true); -+} -+ -+/** -+ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. -+ * @p: the task in question. -+ * @policy: new policy. -+ * @param: structure containing the new RT priority. -+ * -+ * Return: 0 on success. An error code otherwise. -+ * -+ * NOTE that the task may be already dead. 
-+ */ -+int sched_setscheduler(struct task_struct *p, int policy, -+ const struct sched_param *param) -+{ -+ return _sched_setscheduler(p, policy, param, true); -+} -+ -+EXPORT_SYMBOL_GPL(sched_setscheduler); -+ -+int sched_setattr(struct task_struct *p, const struct sched_attr *attr) -+{ -+ return __sched_setscheduler(p, attr, true, true); -+} -+EXPORT_SYMBOL_GPL(sched_setattr); -+ -+int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) -+{ -+ return __sched_setscheduler(p, attr, false, true); -+} -+ -+/** -+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. -+ * @p: the task in question. -+ * @policy: new policy. -+ * @param: structure containing the new RT priority. -+ * -+ * Just like sched_setscheduler, only don't bother checking if the -+ * current context has permission. For example, this is needed in -+ * stop_machine(): we create temporary high priority worker threads, -+ * but our caller might not have that capability. -+ * -+ * Return: 0 on success. An error code otherwise. -+ */ -+int sched_setscheduler_nocheck(struct task_struct *p, int policy, -+ const struct sched_param *param) -+{ -+ return _sched_setscheduler(p, policy, param, false); -+} -+EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); -+ -+static int -+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) -+{ -+ struct sched_param lparam; -+ struct task_struct *p; -+ int retval; -+ -+ if (!param || pid < 0) -+ return -EINVAL; -+ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) -+ return -EFAULT; -+ -+ rcu_read_lock(); -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (likely(p)) -+ get_task_struct(p); -+ rcu_read_unlock(); -+ -+ if (likely(p)) { -+ retval = sched_setscheduler(p, policy, &lparam); -+ put_task_struct(p); -+ } -+ -+ return retval; -+} -+ -+/* -+ * Mimics kernel/events/core.c perf_copy_attr(). 
-+ */ -+static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr) -+{ -+ u32 size; -+ int ret; -+ -+ /* Zero the full structure, so that a short copy will be nice: */ -+ memset(attr, 0, sizeof(*attr)); -+ -+ ret = get_user(size, &uattr->size); -+ if (ret) -+ return ret; -+ -+ /* ABI compatibility quirk: */ -+ if (!size) -+ size = SCHED_ATTR_SIZE_VER0; -+ -+ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) -+ goto err_size; -+ -+ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); -+ if (ret) { -+ if (ret == -E2BIG) -+ goto err_size; -+ return ret; -+ } -+ -+ /* -+ * XXX: Do we want to be lenient like existing syscalls; or do we want -+ * to be strict and return an error on out-of-bounds values? -+ */ -+ attr->sched_nice = clamp(attr->sched_nice, -20, 19); -+ -+ /* sched/core.c uses zero here but we already know ret is zero */ -+ return 0; -+ -+err_size: -+ put_user(sizeof(*attr), &uattr->size); -+ return -E2BIG; -+} -+ -+/** -+ * sys_sched_setscheduler - set/change the scheduler policy and RT priority -+ * @pid: the pid in question. -+ * @policy: new policy. -+ * -+ * Return: 0 on success. An error code otherwise. -+ * @param: structure containing the new RT priority. -+ */ -+SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) -+{ -+ if (policy < 0) -+ return -EINVAL; -+ -+ return do_sched_setscheduler(pid, policy, param); -+} -+ -+/** -+ * sys_sched_setparam - set/change the RT priority of a thread -+ * @pid: the pid in question. -+ * @param: structure containing the new RT priority. -+ * -+ * Return: 0 on success. An error code otherwise. -+ */ -+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) -+{ -+ return do_sched_setscheduler(pid, SETPARAM_POLICY, param); -+} -+ -+/** -+ * sys_sched_setattr - same as above, but with extended sched_attr -+ * @pid: the pid in question. -+ * @uattr: structure containing the extended parameters. 
-+ */ -+SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, -+ unsigned int, flags) -+{ -+ struct sched_attr attr; -+ struct task_struct *p; -+ int retval; -+ -+ if (!uattr || pid < 0 || flags) -+ return -EINVAL; -+ -+ retval = sched_copy_attr(uattr, &attr); -+ if (retval) -+ return retval; -+ -+ if ((int)attr.sched_policy < 0) -+ return -EINVAL; -+ -+ rcu_read_lock(); -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (p != NULL) -+ retval = sched_setattr(p, &attr); -+ rcu_read_unlock(); -+ -+ return retval; -+} -+ -+/** -+ * sys_sched_getscheduler - get the policy (scheduling class) of a thread -+ * @pid: the pid in question. -+ * -+ * Return: On success, the policy of the thread. Otherwise, a negative error -+ * code. -+ */ -+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) -+{ -+ struct task_struct *p; -+ int retval = -EINVAL; -+ -+ if (pid < 0) -+ goto out_nounlock; -+ -+ retval = -ESRCH; -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ if (p) { -+ retval = security_task_getscheduler(p); -+ if (!retval) -+ retval = p->policy; -+ } -+ rcu_read_unlock(); -+ -+out_nounlock: -+ return retval; -+} -+ -+/** -+ * sys_sched_getscheduler - get the RT priority of a thread -+ * @pid: the pid in question. -+ * @param: structure containing the RT priority. -+ * -+ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error -+ * code. 
-+ */ -+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) -+{ -+ struct sched_param lp = { .sched_priority = 0 }; -+ struct task_struct *p; -+ int retval = -EINVAL; -+ -+ if (!param || pid < 0) -+ goto out_nounlock; -+ -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ retval = -ESRCH; -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ if (task_has_rt_policy(p)) -+ lp.sched_priority = p->rt_priority; -+ rcu_read_unlock(); -+ -+ /* -+ * This one might sleep, we cannot do it with a spinlock held ... -+ */ -+ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; -+ -+out_nounlock: -+ return retval; -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+/* -+ * Copy the kernel size attribute structure (which might be larger -+ * than what user-space knows about) to user-space. -+ * -+ * Note that all cases are valid: user-space buffer can be larger or -+ * smaller than the kernel-space buffer. The usual case is that both -+ * have the same size. -+ */ -+static int -+sched_attr_copy_to_user(struct sched_attr __user *uattr, -+ struct sched_attr *kattr, -+ unsigned int usize) -+{ -+ unsigned int ksize = sizeof(*kattr); -+ -+ if (!access_ok(uattr, usize)) -+ return -EFAULT; -+ -+ /* -+ * sched_getattr() ABI forwards and backwards compatibility: -+ * -+ * If usize == ksize then we just copy everything to user-space and all is good. -+ * -+ * If usize < ksize then we only copy as much as user-space has space for, -+ * this keeps ABI compatibility as well. We skip the rest. -+ * -+ * If usize > ksize then user-space is using a newer version of the ABI, -+ * which part the kernel doesn't know about. Just ignore it - tooling can -+ * detect the kernel's knowledge of attributes from the attr->size value -+ * which is set to ksize in this case. 
-+ */ -+ kattr->size = min(usize, ksize); -+ -+ if (copy_to_user(uattr, kattr, kattr->size)) -+ return -EFAULT; -+ -+ return 0; -+} -+ -+/** -+ * sys_sched_getattr - similar to sched_getparam, but with sched_attr -+ * @pid: the pid in question. -+ * @uattr: structure containing the extended parameters. -+ * @usize: sizeof(attr) for fwd/bwd comp. -+ * @flags: for future extension. -+ */ -+SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, -+ unsigned int, usize, unsigned int, flags) -+{ -+ struct sched_attr kattr = { }; -+ struct task_struct *p; -+ int retval; -+ -+ if (!uattr || pid < 0 || usize > PAGE_SIZE || -+ usize < SCHED_ATTR_SIZE_VER0 || flags) -+ return -EINVAL; -+ -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ retval = -ESRCH; -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ kattr.sched_policy = p->policy; -+ if (p->sched_reset_on_fork) -+ kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; -+ if (task_has_rt_policy(p)) -+ kattr.sched_priority = p->rt_priority; -+ else -+ kattr.sched_nice = task_nice(p); -+ -+#ifdef CONFIG_UCLAMP_TASK -+ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; -+ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; -+#endif -+ -+ rcu_read_unlock(); -+ -+ return sched_attr_copy_to_user(uattr, &kattr, usize); -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) -+{ -+ cpumask_var_t cpus_allowed, new_mask; -+ struct task_struct *p; -+ int retval; -+ -+ get_online_cpus(); -+ rcu_read_lock(); -+ -+ p = find_process_by_pid(pid); -+ if (!p) { -+ rcu_read_unlock(); -+ put_online_cpus(); -+ return -ESRCH; -+ } -+ -+ /* Prevent p going away */ -+ get_task_struct(p); -+ rcu_read_unlock(); -+ -+ if (p->flags & PF_NO_SETAFFINITY) { -+ retval = -EINVAL; -+ goto out_put_task; -+ } -+ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { -+ retval = -ENOMEM; -+ 
goto out_put_task; -+ } -+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { -+ retval = -ENOMEM; -+ goto out_free_cpus_allowed; -+ } -+ retval = -EPERM; -+ if (!check_same_owner(p)) { -+ rcu_read_lock(); -+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { -+ rcu_read_unlock(); -+ goto out_unlock; -+ } -+ rcu_read_unlock(); -+ } -+ -+ retval = security_task_setscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ cpuset_cpus_allowed(p, cpus_allowed); -+ cpumask_and(new_mask, in_mask, cpus_allowed); -+again: -+ retval = __set_cpus_allowed_ptr(p, new_mask, true); -+ -+ if (!retval) { -+ cpuset_cpus_allowed(p, cpus_allowed); -+ if (!cpumask_subset(new_mask, cpus_allowed)) { -+ /* -+ * We must have raced with a concurrent cpuset -+ * update. Just reset the cpus_allowed to the -+ * cpuset's cpus_allowed -+ */ -+ cpumask_copy(new_mask, cpus_allowed); -+ goto again; -+ } -+ } -+out_unlock: -+ free_cpumask_var(new_mask); -+out_free_cpus_allowed: -+ free_cpumask_var(cpus_allowed); -+out_put_task: -+ put_task_struct(p); -+ put_online_cpus(); -+ return retval; -+} -+ -+static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, -+ struct cpumask *new_mask) -+{ -+ if (len < cpumask_size()) -+ cpumask_clear(new_mask); -+ else if (len > cpumask_size()) -+ len = cpumask_size(); -+ -+ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; -+} -+ -+/** -+ * sys_sched_setaffinity - set the CPU affinity of a process -+ * @pid: pid of the process -+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr -+ * @user_mask_ptr: user-space pointer to the new CPU mask -+ * -+ * Return: 0 on success. An error code otherwise. 
-+ */ -+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, -+ unsigned long __user *, user_mask_ptr) -+{ -+ cpumask_var_t new_mask; -+ int retval; -+ -+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) -+ return -ENOMEM; -+ -+ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); -+ if (retval == 0) -+ retval = sched_setaffinity(pid, new_mask); -+ free_cpumask_var(new_mask); -+ return retval; -+} -+ -+long sched_getaffinity(pid_t pid, cpumask_t *mask) -+{ -+ struct task_struct *p; -+ raw_spinlock_t *lock; -+ unsigned long flags; -+ int retval; -+ -+ rcu_read_lock(); -+ -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ task_access_lock_irqsave(p, &lock, &flags); -+ cpumask_and(mask, &p->cpus_mask, cpu_active_mask); -+ task_access_unlock_irqrestore(p, lock, &flags); -+ -+out_unlock: -+ rcu_read_unlock(); -+ -+ return retval; -+} -+ -+/** -+ * sys_sched_getaffinity - get the CPU affinity of a process -+ * @pid: pid of the process -+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr -+ * @user_mask_ptr: user-space pointer to hold the current CPU mask -+ * -+ * Return: size of CPU mask copied to user_mask_ptr on success. An -+ * error code otherwise. 
-+ */ -+SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, -+ unsigned long __user *, user_mask_ptr) -+{ -+ int ret; -+ cpumask_var_t mask; -+ -+ if ((len * BITS_PER_BYTE) < nr_cpu_ids) -+ return -EINVAL; -+ if (len & (sizeof(unsigned long)-1)) -+ return -EINVAL; -+ -+ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) -+ return -ENOMEM; -+ -+ ret = sched_getaffinity(pid, mask); -+ if (ret == 0) { -+ unsigned int retlen = min_t(size_t, len, cpumask_size()); -+ -+ if (copy_to_user(user_mask_ptr, mask, retlen)) -+ ret = -EFAULT; -+ else -+ ret = retlen; -+ } -+ free_cpumask_var(mask); -+ -+ return ret; -+} -+ -+/** -+ * sys_sched_yield - yield the current processor to other threads. -+ * -+ * This function yields the current CPU to other tasks. It does this by -+ * scheduling away the current task. If it still has the earliest deadline -+ * it will be scheduled again as the next task. -+ * -+ * Return: 0. -+ */ -+static void do_sched_yield(void) -+{ -+ struct rq *rq; -+ struct rq_flags rf; -+ -+ if (!sched_yield_type) -+ return; -+ -+ rq = this_rq_lock_irq(&rf); -+ -+ schedstat_inc(rq->yld_count); -+ -+ if (1 == sched_yield_type) { -+ if (!rt_task(current)) -+ do_sched_yield_type_1(current, rq); -+ } else if (2 == sched_yield_type) { -+ if (rq->nr_running > 1) -+ rq->skip = current; -+ } -+ -+ /* -+ * Since we are going to call schedule() anyway, there's -+ * no need to preempt or enable interrupts: -+ */ -+ preempt_disable(); -+ raw_spin_unlock(&rq->lock); -+ sched_preempt_enable_no_resched(); -+ -+ schedule(); -+} -+ -+SYSCALL_DEFINE0(sched_yield) -+{ -+ do_sched_yield(); -+ return 0; -+} -+ -+#ifndef CONFIG_PREEMPTION -+int __sched _cond_resched(void) -+{ -+ if (should_resched(0)) { -+ preempt_schedule_common(); -+ return 1; -+ } -+ rcu_all_qs(); -+ return 0; -+} -+EXPORT_SYMBOL(_cond_resched); -+#endif -+ -+/* -+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, -+ * call schedule, and on return reacquire the lock. 
-+ * -+ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level -+ * operations here to prevent schedule() from being called twice (once via -+ * spin_unlock(), once by hand). -+ */ -+int __cond_resched_lock(spinlock_t *lock) -+{ -+ int resched = should_resched(PREEMPT_LOCK_OFFSET); -+ int ret = 0; -+ -+ lockdep_assert_held(lock); -+ -+ if (spin_needbreak(lock) || resched) { -+ spin_unlock(lock); -+ if (resched) -+ preempt_schedule_common(); -+ else -+ cpu_relax(); -+ ret = 1; -+ spin_lock(lock); -+ } -+ return ret; -+} -+EXPORT_SYMBOL(__cond_resched_lock); -+ -+/** -+ * yield - yield the current processor to other threads. -+ * -+ * Do not ever use this function, there's a 99% chance you're doing it wrong. -+ * -+ * The scheduler is at all times free to pick the calling task as the most -+ * eligible task to run, if removing the yield() call from your code breaks -+ * it, its already broken. -+ * -+ * Typical broken usage is: -+ * -+ * while (!event) -+ * yield(); -+ * -+ * where one assumes that yield() will let 'the other' process run that will -+ * make event true. If the current task is a SCHED_FIFO task that will never -+ * happen. Never use yield() as a progress guarantee!! -+ * -+ * If you want to use yield() to wait for something, use wait_event(). -+ * If you want to use yield() to be 'nice' for others, use cond_resched(). -+ * If you still want to use yield(), do not! -+ */ -+void __sched yield(void) -+{ -+ set_current_state(TASK_RUNNING); -+ do_sched_yield(); -+} -+EXPORT_SYMBOL(yield); -+ -+/** -+ * yield_to - yield the current processor to another thread in -+ * your thread group, or accelerate that thread toward the -+ * processor it's on. -+ * @p: target task -+ * @preempt: whether task preemption is allowed or not -+ * -+ * It's the caller's job to ensure that the target task struct -+ * can't go away on us before we can do any checks. -+ * -+ * In Alt schedule FW, yield_to is not supported. 
-+ * -+ * Return: -+ * true (>0) if we indeed boosted the target task. -+ * false (0) if we failed to boost the target. -+ * -ESRCH if there's no task to yield to. -+ */ -+int __sched yield_to(struct task_struct *p, bool preempt) -+{ -+ return 0; -+} -+EXPORT_SYMBOL_GPL(yield_to); -+ -+int io_schedule_prepare(void) -+{ -+ int old_iowait = current->in_iowait; -+ -+ current->in_iowait = 1; -+ blk_schedule_flush_plug(current); -+ -+ return old_iowait; -+} -+ -+void io_schedule_finish(int token) -+{ -+ current->in_iowait = token; -+} -+ -+/* -+ * This task is about to go to sleep on IO. Increment rq->nr_iowait so -+ * that process accounting knows that this is a task in IO wait state. -+ * -+ * But don't do that if it is a deliberate, throttling IO wait (this task -+ * has set its backing_dev_info: the queue against which it should throttle) -+ */ -+ -+long __sched io_schedule_timeout(long timeout) -+{ -+ int token; -+ long ret; -+ -+ token = io_schedule_prepare(); -+ ret = schedule_timeout(timeout); -+ io_schedule_finish(token); -+ -+ return ret; -+} -+EXPORT_SYMBOL(io_schedule_timeout); -+ -+void __sched io_schedule(void) -+{ -+ int token; -+ -+ token = io_schedule_prepare(); -+ schedule(); -+ io_schedule_finish(token); -+} -+EXPORT_SYMBOL(io_schedule); -+ -+/** -+ * sys_sched_get_priority_max - return maximum RT priority. -+ * @policy: scheduling class. -+ * -+ * Return: On success, this syscall returns the maximum -+ * rt_priority that can be used by a given scheduling class. -+ * On failure, a negative error code is returned. -+ */ -+SYSCALL_DEFINE1(sched_get_priority_max, int, policy) -+{ -+ int ret = -EINVAL; -+ -+ switch (policy) { -+ case SCHED_FIFO: -+ case SCHED_RR: -+ ret = MAX_USER_RT_PRIO-1; -+ break; -+ case SCHED_NORMAL: -+ case SCHED_BATCH: -+ case SCHED_IDLE: -+ ret = 0; -+ break; -+ } -+ return ret; -+} -+ -+/** -+ * sys_sched_get_priority_min - return minimum RT priority. -+ * @policy: scheduling class. 
-+ * -+ * Return: On success, this syscall returns the minimum -+ * rt_priority that can be used by a given scheduling class. -+ * On failure, a negative error code is returned. -+ */ -+SYSCALL_DEFINE1(sched_get_priority_min, int, policy) -+{ -+ int ret = -EINVAL; -+ -+ switch (policy) { -+ case SCHED_FIFO: -+ case SCHED_RR: -+ ret = 1; -+ break; -+ case SCHED_NORMAL: -+ case SCHED_BATCH: -+ case SCHED_IDLE: -+ ret = 0; -+ break; -+ } -+ return ret; -+} -+ -+static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) -+{ -+ struct task_struct *p; -+ int retval; -+ -+ alt_sched_debug(); -+ -+ if (pid < 0) -+ return -EINVAL; -+ -+ retval = -ESRCH; -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ rcu_read_unlock(); -+ -+ *t = ns_to_timespec64(sched_timeslice_ns); -+ return 0; -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+/** -+ * sys_sched_rr_get_interval - return the default timeslice of a process. -+ * @pid: pid of the process. -+ * @interval: userspace pointer to the timeslice value. -+ * -+ * -+ * Return: On success, 0 and the timeslice is in @interval. Otherwise, -+ * an error code. 
-+ */ -+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, -+ struct __kernel_timespec __user *, interval) -+{ -+ struct timespec64 t; -+ int retval = sched_rr_get_interval(pid, &t); -+ -+ if (retval == 0) -+ retval = put_timespec64(&t, interval); -+ -+ return retval; -+} -+ -+#ifdef CONFIG_COMPAT_32BIT_TIME -+SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, -+ struct old_timespec32 __user *, interval) -+{ -+ struct timespec64 t; -+ int retval = sched_rr_get_interval(pid, &t); -+ -+ if (retval == 0) -+ retval = put_old_timespec32(&t, interval); -+ return retval; -+} -+#endif -+ -+void sched_show_task(struct task_struct *p) -+{ -+ unsigned long free = 0; -+ int ppid; -+ -+ if (!try_get_task_stack(p)) -+ return; -+ -+ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); -+ -+ if (p->state == TASK_RUNNING) -+ printk(KERN_CONT " running task "); -+#ifdef CONFIG_DEBUG_STACK_USAGE -+ free = stack_not_used(p); -+#endif -+ ppid = 0; -+ rcu_read_lock(); -+ if (pid_alive(p)) -+ ppid = task_pid_nr(rcu_dereference(p->real_parent)); -+ rcu_read_unlock(); -+ printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, -+ task_pid_nr(p), ppid, -+ (unsigned long)task_thread_info(p)->flags); -+ -+ print_worker_info(KERN_INFO, p); -+ show_stack(p, NULL, KERN_INFO); -+ put_task_stack(p); -+} -+EXPORT_SYMBOL_GPL(sched_show_task); -+ -+static inline bool -+state_filter_match(unsigned long state_filter, struct task_struct *p) -+{ -+ /* no filter, everything matches */ -+ if (!state_filter) -+ return true; -+ -+ /* filter, but doesn't match */ -+ if (!(p->state & state_filter)) -+ return false; -+ -+ /* -+ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows -+ * TASK_KILLABLE). 
-+ */ -+ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) -+ return false; -+ -+ return true; -+} -+ -+ -+void show_state_filter(unsigned long state_filter) -+{ -+ struct task_struct *g, *p; -+ -+#if BITS_PER_LONG == 32 -+ printk(KERN_INFO -+ " task PC stack pid father\n"); -+#else -+ printk(KERN_INFO -+ " task PC stack pid father\n"); -+#endif -+ rcu_read_lock(); -+ for_each_process_thread(g, p) { -+ /* -+ * reset the NMI-timeout, listing all files on a slow -+ * console might take a lot of time: -+ * Also, reset softlockup watchdogs on all CPUs, because -+ * another CPU might be blocked waiting for us to process -+ * an IPI. -+ */ -+ touch_nmi_watchdog(); -+ touch_all_softlockup_watchdogs(); -+ if (state_filter_match(state_filter, p)) -+ sched_show_task(p); -+ } -+ -+#ifdef CONFIG_SCHED_DEBUG -+ /* TODO: Alt schedule FW should support this -+ if (!state_filter) -+ sysrq_sched_debug_show(); -+ */ -+#endif -+ rcu_read_unlock(); -+ /* -+ * Only show locks if all tasks are dumped: -+ */ -+ if (!state_filter) -+ debug_show_all_locks(); -+} -+ -+void dump_cpu_task(int cpu) -+{ -+ pr_info("Task dump for CPU %d:\n", cpu); -+ sched_show_task(cpu_curr(cpu)); -+} -+ -+/** -+ * init_idle - set up an idle thread for a given CPU -+ * @idle: task in question -+ * @cpu: CPU the idle task belongs to -+ * -+ * NOTE: this function does not set the idle thread's NEED_RESCHED -+ * flag, to make booting more robust. 
-+ */ -+void init_idle(struct task_struct *idle, int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ __sched_fork(0, idle); -+ -+ raw_spin_lock_irqsave(&idle->pi_lock, flags); -+ raw_spin_lock(&rq->lock); -+ update_rq_clock(rq); -+ -+ idle->last_ran = rq->clock_task; -+ idle->state = TASK_RUNNING; -+ idle->flags |= PF_IDLE; -+ sched_queue_init_idle(rq, idle); -+ -+ scs_task_reset(idle); -+ kasan_unpoison_task_stack(idle); -+ -+#ifdef CONFIG_SMP -+ /* -+ * It's possible that init_idle() gets called multiple times on a task, -+ * in that case do_set_cpus_allowed() will not do the right thing. -+ * -+ * And since this is boot we can forgo the serialisation. -+ */ -+ set_cpus_allowed_common(idle, cpumask_of(cpu)); -+#endif -+ -+ /* Silence PROVE_RCU */ -+ rcu_read_lock(); -+ __set_task_cpu(idle, cpu); -+ rcu_read_unlock(); -+ -+ rq->idle = idle; -+ rcu_assign_pointer(rq->curr, idle); -+ idle->on_cpu = 1; -+ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&idle->pi_lock, flags); -+ -+ /* Set the preempt count _outside_ the spinlocks! */ -+ init_idle_preempt_count(idle, cpu); -+ -+ ftrace_graph_init_idle_task(idle, cpu); -+ vtime_init_idle(idle, cpu); -+#ifdef CONFIG_SMP -+ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); -+#endif -+} -+ -+#ifdef CONFIG_SMP -+ -+int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, -+ const struct cpumask __maybe_unused *trial) -+{ -+ return 1; -+} -+ -+int task_can_attach(struct task_struct *p, -+ const struct cpumask *cs_cpus_allowed) -+{ -+ int ret = 0; -+ -+ /* -+ * Kthreads which disallow setaffinity shouldn't be moved -+ * to a new cpuset; we don't want to change their CPU -+ * affinity and isolating such threads by their set of -+ * allowed nodes is unnecessary. Thus, cpusets are not -+ * applicable for such threads. This prevents checking for -+ * success of set_cpus_allowed_ptr() on all attached tasks -+ * before cpus_mask may be changed. 
-+ */ -+ if (p->flags & PF_NO_SETAFFINITY) -+ ret = -EINVAL; -+ -+ return ret; -+} -+ -+bool sched_smp_initialized __read_mostly; -+ -+#ifdef CONFIG_HOTPLUG_CPU -+/* -+ * Ensures that the idle task is using init_mm right before its CPU goes -+ * offline. -+ */ -+void idle_task_exit(void) -+{ -+ struct mm_struct *mm = current->active_mm; -+ -+ BUG_ON(current != this_rq()->idle); -+ -+ if (mm != &init_mm) { -+ switch_mm(mm, &init_mm, current); -+ finish_arch_post_lock_switch(); -+ } -+ -+ /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ -+} -+ -+/* -+ * Migrate all tasks from the rq, sleeping tasks will be migrated by -+ * try_to_wake_up()->select_task_rq(). -+ * -+ * Called with rq->lock held even though we'er in stop_machine() and -+ * there's no concurrency possible, we hold the required locks anyway -+ * because of lock validation efforts. -+ */ -+static void migrate_tasks(struct rq *dead_rq) -+{ -+ struct rq *rq = dead_rq; -+ struct task_struct *p, *stop = rq->stop; -+ int count = 0; -+ -+ /* -+ * Fudge the rq selection such that the below task selection loop -+ * doesn't get stuck on the currently eligible stop task. -+ * -+ * We're currently inside stop_machine() and the rq is either stuck -+ * in the stop_machine_cpu_stop() loop, or we're executing this code, -+ * either way we should never end up calling schedule() until we're -+ * done here. -+ */ -+ rq->stop = NULL; -+ -+ p = sched_rq_first_task(rq); -+ while (p != rq->idle) { -+ int dest_cpu; -+ -+ /* skip the running task */ -+ if (task_running(p) || 1 == p->nr_cpus_allowed) { -+ p = sched_rq_next_task(p, rq); -+ continue; -+ } -+ -+ /* -+ * Rules for changing task_struct::cpus_allowed are holding -+ * both pi_lock and rq->lock, such that holding either -+ * stabilizes the mask. -+ * -+ * Drop rq->lock is not quite as disastrous as it usually is -+ * because !cpu_active at this point, which means load-balance -+ * will not interfere. Also, stop-machine. 
-+ */ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_lock(&p->pi_lock); -+ raw_spin_lock(&rq->lock); -+ -+ /* -+ * Since we're inside stop-machine, _nothing_ should have -+ * changed the task, WARN if weird stuff happened, because in -+ * that case the above rq->lock drop is a fail too. -+ */ -+ if (WARN_ON(task_rq(p) != rq || !task_on_rq_queued(p))) { -+ raw_spin_unlock(&p->pi_lock); -+ p = sched_rq_next_task(p, rq); -+ continue; -+ } -+ -+ count++; -+ /* Find suitable destination for @next, with force if needed. */ -+ dest_cpu = select_fallback_rq(dead_rq->cpu, p); -+ rq = __migrate_task(rq, p, dest_cpu); -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock(&p->pi_lock); -+ -+ rq = dead_rq; -+ raw_spin_lock(&rq->lock); -+ /* Check queued task all over from the header again */ -+ p = sched_rq_first_task(rq); -+ } -+ -+ rq->stop = stop; -+} -+ -+static void set_rq_offline(struct rq *rq) -+{ -+ if (rq->online) -+ rq->online = false; -+} -+#endif /* CONFIG_HOTPLUG_CPU */ -+ -+static void set_rq_online(struct rq *rq) -+{ -+ if (!rq->online) -+ rq->online = true; -+} -+ -+/* -+ * used to mark begin/end of suspend/resume: -+ */ -+static int num_cpus_frozen; -+ -+/* -+ * Update cpusets according to cpu_active mask. If cpusets are -+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper -+ * around partition_sched_domains(). -+ * -+ * If we come here as part of a suspend/resume, don't touch cpusets because we -+ * want to restore it back to its original state upon resume anyway. -+ */ -+static void cpuset_cpu_active(void) -+{ -+ if (cpuhp_tasks_frozen) { -+ /* -+ * num_cpus_frozen tracks how many CPUs are involved in suspend -+ * resume sequence. As long as this is not the last online -+ * operation in the resume sequence, just build a single sched -+ * domain, ignoring cpusets. -+ */ -+ partition_sched_domains(1, NULL, NULL); -+ if (--num_cpus_frozen) -+ return; -+ /* -+ * This is the last CPU online operation. 
So fall through and -+ * restore the original sched domains by considering the -+ * cpuset configurations. -+ */ -+ cpuset_force_rebuild(); -+ } -+ -+ cpuset_update_active_cpus(); -+} -+ -+static int cpuset_cpu_inactive(unsigned int cpu) -+{ -+ if (!cpuhp_tasks_frozen) { -+ cpuset_update_active_cpus(); -+ } else { -+ num_cpus_frozen++; -+ partition_sched_domains(1, NULL, NULL); -+ } -+ return 0; -+} -+ -+int sched_cpu_activate(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+#ifdef CONFIG_SCHED_SMT -+ /* -+ * When going up, increment the number of cores with SMT present. -+ */ -+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) -+ static_branch_inc_cpuslocked(&sched_smt_present); -+#endif -+ set_cpu_active(cpu, true); -+ -+ if (sched_smp_initialized) -+ cpuset_cpu_active(); -+ -+ /* -+ * Put the rq online, if not already. This happens: -+ * -+ * 1) In the early boot process, because we build the real domains -+ * after all cpus have been brought up. -+ * -+ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the -+ * domains. -+ */ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ set_rq_online(rq); -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+ return 0; -+} -+ -+int sched_cpu_deactivate(unsigned int cpu) -+{ -+ int ret; -+ -+ set_cpu_active(cpu, false); -+ /* -+ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU -+ * users of this state to go away such that all new such users will -+ * observe it. -+ * -+ * Do sync before park smpboot threads to take care the rcu boost case. -+ */ -+ synchronize_rcu(); -+ -+#ifdef CONFIG_SCHED_SMT -+ /* -+ * When going down, decrement the number of cores with SMT present. 
-+ */ -+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) { -+ static_branch_dec_cpuslocked(&sched_smt_present); -+ if (!static_branch_likely(&sched_smt_present)) -+ cpumask_clear(&sched_sg_idle_mask); -+ } -+#endif -+ -+ if (!sched_smp_initialized) -+ return 0; -+ -+ ret = cpuset_cpu_inactive(cpu); -+ if (ret) { -+ set_cpu_active(cpu, true); -+ return ret; -+ } -+ return 0; -+} -+ -+static void sched_rq_cpu_starting(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ rq->calc_load_update = calc_load_update; -+} -+ -+int sched_cpu_starting(unsigned int cpu) -+{ -+ sched_rq_cpu_starting(cpu); -+ sched_tick_start(cpu); -+ return 0; -+} -+ -+#ifdef CONFIG_HOTPLUG_CPU -+int sched_cpu_dying(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ /* Handle pending wakeups and then migrate everything off */ -+ sched_tick_stop(cpu); -+ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ set_rq_offline(rq); -+ migrate_tasks(rq); -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+ hrtick_clear(rq); -+ return 0; -+} -+#endif -+ -+#ifdef CONFIG_SMP -+static void sched_init_topology_cpumask_early(void) -+{ -+ int cpu, level; -+ cpumask_t *tmp; -+ -+ for_each_possible_cpu(cpu) { -+ for (level = 0; level < NR_CPU_AFFINITY_CHK_LEVEL; level++) { -+ tmp = &(per_cpu(sched_cpu_affinity_masks, cpu)[level]); -+ cpumask_copy(tmp, cpu_possible_mask); -+ cpumask_clear_cpu(cpu, tmp); -+ } -+ per_cpu(sched_cpu_llc_mask, cpu) = -+ &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); -+ per_cpu(sched_cpu_affinity_end_mask, cpu) = -+ &(per_cpu(sched_cpu_affinity_masks, cpu)[1]); -+ /*per_cpu(sd_llc_id, cpu) = cpu;*/ -+ } -+} -+ -+#define TOPOLOGY_CPUMASK(name, mask, last) \ -+ if (cpumask_and(chk, chk, mask)) \ -+ printk(KERN_INFO "sched: cpu#%02d affinity mask: 0x%08lx - "#name,\ -+ cpu, (chk++)->bits[0]); \ -+ if (!last) \ -+ cpumask_complement(chk, mask) -+ -+static void sched_init_topology_cpumask(void) -+{ -+ int cpu; -+ cpumask_t *chk; -+ -+ for_each_online_cpu(cpu) 
{ -+ /* take chance to reset time slice for idle tasks */ -+ cpu_rq(cpu)->idle->time_slice = sched_timeslice_ns; -+ -+ chk = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); -+ -+ cpumask_complement(chk, cpumask_of(cpu)); -+#ifdef CONFIG_SCHED_SMT -+ TOPOLOGY_CPUMASK(smt, topology_sibling_cpumask(cpu), false); -+#endif -+ per_cpu(sd_llc_id, cpu) = cpumask_first(cpu_coregroup_mask(cpu)); -+ per_cpu(sched_cpu_llc_mask, cpu) = chk; -+ TOPOLOGY_CPUMASK(coregroup, cpu_coregroup_mask(cpu), false); -+ -+ TOPOLOGY_CPUMASK(core, topology_core_cpumask(cpu), false); -+ -+ TOPOLOGY_CPUMASK(others, cpu_online_mask, true); -+ -+ per_cpu(sched_cpu_affinity_end_mask, cpu) = chk; -+ printk(KERN_INFO "sched: cpu#%02d llc_id = %d, llc_mask idx = %d\n", -+ cpu, per_cpu(sd_llc_id, cpu), -+ (int) (per_cpu(sched_cpu_llc_mask, cpu) - -+ &(per_cpu(sched_cpu_affinity_masks, cpu)[0]))); -+ } -+} -+#endif -+ -+void __init sched_init_smp(void) -+{ -+ /* Move init over to a non-isolated CPU */ -+ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) -+ BUG(); -+ -+ sched_init_topology_cpumask(); -+ -+ sched_smp_initialized = true; -+} -+#else -+void __init sched_init_smp(void) -+{ -+ cpu_rq(0)->idle->time_slice = sched_timeslice_ns; -+} -+#endif /* CONFIG_SMP */ -+ -+int in_sched_functions(unsigned long addr) -+{ -+ return in_lock_functions(addr) || -+ (addr >= (unsigned long)__sched_text_start -+ && addr < (unsigned long)__sched_text_end); -+} -+ -+#ifdef CONFIG_CGROUP_SCHED -+/* task group related information */ -+struct task_group { -+ struct cgroup_subsys_state css; -+ -+ struct rcu_head rcu; -+ struct list_head list; -+ -+ struct task_group *parent; -+ struct list_head siblings; -+ struct list_head children; -+}; -+ -+/* -+ * Default task group. -+ * Every task in system belongs to this group at bootup. 
-+ */ -+struct task_group root_task_group; -+LIST_HEAD(task_groups); -+ -+/* Cacheline aligned slab cache for task_group */ -+static struct kmem_cache *task_group_cache __read_mostly; -+#endif /* CONFIG_CGROUP_SCHED */ -+ -+void __init sched_init(void) -+{ -+ int i; -+ struct rq *rq; -+ -+ printk(KERN_INFO ALT_SCHED_VERSION_MSG); -+ -+ wait_bit_init(); -+ -+#ifdef CONFIG_SMP -+ for (i = 0; i < SCHED_BITS; i++) -+ cpumask_copy(&sched_rq_watermark[i], cpu_present_mask); -+#endif -+ -+#ifdef CONFIG_CGROUP_SCHED -+ task_group_cache = KMEM_CACHE(task_group, 0); -+ -+ list_add(&root_task_group.list, &task_groups); -+ INIT_LIST_HEAD(&root_task_group.children); -+ INIT_LIST_HEAD(&root_task_group.siblings); -+#endif /* CONFIG_CGROUP_SCHED */ -+ for_each_possible_cpu(i) { -+ rq = cpu_rq(i); -+ -+ sched_queue_init(rq); -+ rq->watermark = IDLE_WM; -+ rq->skip = NULL; -+ -+ raw_spin_lock_init(&rq->lock); -+ rq->nr_running = rq->nr_uninterruptible = 0; -+ rq->calc_load_active = 0; -+ rq->calc_load_update = jiffies + LOAD_FREQ; -+#ifdef CONFIG_SMP -+ rq->online = false; -+ rq->cpu = i; -+ -+#ifdef CONFIG_SCHED_SMT -+ rq->active_balance = 0; -+#endif -+ -+#ifdef CONFIG_NO_HZ_COMMON -+ rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func); -+#endif -+#endif /* CONFIG_SMP */ -+ rq->nr_switches = 0; -+ -+ hrtick_rq_init(rq); -+ atomic_set(&rq->nr_iowait, 0); -+ } -+#ifdef CONFIG_SMP -+ /* Set rq->online for cpu 0 */ -+ cpu_rq(0)->online = true; -+#endif -+ /* -+ * The boot idle thread does lazy MMU switching as well: -+ */ -+ mmgrab(&init_mm); -+ enter_lazy_tlb(&init_mm, current); -+ -+ /* -+ * Make us the idle thread. Technically, schedule() should not be -+ * called from this thread, however somewhere below it might be, -+ * but because we are the idle thread, we just pick up running again -+ * when this runqueue becomes "idle". 
-+ */ -+ init_idle(current, smp_processor_id()); -+ -+ calc_load_update = jiffies + LOAD_FREQ; -+ -+#ifdef CONFIG_SMP -+ idle_thread_set_boot_cpu(); -+ -+ sched_init_topology_cpumask_early(); -+#endif /* SMP */ -+ -+ init_schedstats(); -+ -+ psi_init(); -+} -+ -+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP -+static inline int preempt_count_equals(int preempt_offset) -+{ -+ int nested = preempt_count() + rcu_preempt_depth(); -+ -+ return (nested == preempt_offset); -+} -+ -+void __might_sleep(const char *file, int line, int preempt_offset) -+{ -+ /* -+ * Blocking primitives will set (and therefore destroy) current->state, -+ * since we will exit with TASK_RUNNING make sure we enter with it, -+ * otherwise we will destroy state. -+ */ -+ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, -+ "do not call blocking ops when !TASK_RUNNING; " -+ "state=%lx set at [<%p>] %pS\n", -+ current->state, -+ (void *)current->task_state_change, -+ (void *)current->task_state_change); -+ -+ ___might_sleep(file, line, preempt_offset); -+} -+EXPORT_SYMBOL(__might_sleep); -+ -+void ___might_sleep(const char *file, int line, int preempt_offset) -+{ -+ /* Ratelimiting timestamp: */ -+ static unsigned long prev_jiffy; -+ -+ unsigned long preempt_disable_ip; -+ -+ /* WARN_ON_ONCE() by default, no rate limit required: */ -+ rcu_sleep_check(); -+ -+ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && -+ !is_idle_task(current) && !current->non_block_count) || -+ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || -+ oops_in_progress) -+ return; -+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) -+ return; -+ prev_jiffy = jiffies; -+ -+ /* Save this before calling printk(), since that will clobber it: */ -+ preempt_disable_ip = get_preempt_disable_ip(current); -+ -+ printk(KERN_ERR -+ "BUG: sleeping function called from invalid context at %s:%d\n", -+ file, line); -+ printk(KERN_ERR -+ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, 
name: %s\n", -+ in_atomic(), irqs_disabled(), current->non_block_count, -+ current->pid, current->comm); -+ -+ if (task_stack_end_corrupted(current)) -+ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); -+ -+ debug_show_held_locks(current); -+ if (irqs_disabled()) -+ print_irqtrace_events(current); -+#ifdef CONFIG_DEBUG_PREEMPT -+ if (!preempt_count_equals(preempt_offset)) { -+ pr_err("Preemption disabled at:"); -+ print_ip_sym(KERN_ERR, preempt_disable_ip); -+ } -+#endif -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+EXPORT_SYMBOL(___might_sleep); -+ -+void __cant_sleep(const char *file, int line, int preempt_offset) -+{ -+ static unsigned long prev_jiffy; -+ -+ if (irqs_disabled()) -+ return; -+ -+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) -+ return; -+ -+ if (preempt_count() > preempt_offset) -+ return; -+ -+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) -+ return; -+ prev_jiffy = jiffies; -+ -+ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); -+ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", -+ in_atomic(), irqs_disabled(), -+ current->pid, current->comm); -+ -+ debug_show_held_locks(current); -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+EXPORT_SYMBOL_GPL(__cant_sleep); -+#endif -+ -+#ifdef CONFIG_MAGIC_SYSRQ -+void normalize_rt_tasks(void) -+{ -+ struct task_struct *g, *p; -+ struct sched_attr attr = { -+ .sched_policy = SCHED_NORMAL, -+ }; -+ -+ read_lock(&tasklist_lock); -+ for_each_process_thread(g, p) { -+ /* -+ * Only normalize user tasks: -+ */ -+ if (p->flags & PF_KTHREAD) -+ continue; -+ -+ if (!rt_task(p)) { -+ /* -+ * Renice negative nice level userspace -+ * tasks back to 0: -+ */ -+ if (task_nice(p) < 0) -+ set_user_nice(p, 0); -+ continue; -+ } -+ -+ __sched_setscheduler(p, &attr, false, false); -+ } -+ read_unlock(&tasklist_lock); -+} -+#endif /* CONFIG_MAGIC_SYSRQ */ -+ -+#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) -+/* 
-+ * These functions are only useful for the IA64 MCA handling, or kdb. -+ * -+ * They can only be called when the whole system has been -+ * stopped - every CPU needs to be quiescent, and no scheduling -+ * activity can take place. Using them for anything else would -+ * be a serious bug, and as a result, they aren't even visible -+ * under any other configuration. -+ */ -+ -+/** -+ * curr_task - return the current task for a given CPU. -+ * @cpu: the processor in question. -+ * -+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! -+ * -+ * Return: The current task for @cpu. -+ */ -+struct task_struct *curr_task(int cpu) -+{ -+ return cpu_curr(cpu); -+} -+ -+#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ -+ -+#ifdef CONFIG_IA64 -+/** -+ * ia64_set_curr_task - set the current task for a given CPU. -+ * @cpu: the processor in question. -+ * @p: the task pointer to set. -+ * -+ * Description: This function must only be used when non-maskable interrupts -+ * are serviced on a separate stack. It allows the architecture to switch the -+ * notion of the current task on a CPU in a non-blocking manner. This function -+ * must be called with all CPU's synchronised, and interrupts disabled, the -+ * and caller must save the original value of the current task (see -+ * curr_task() above) and restore that value before reenabling interrupts and -+ * re-starting the system. -+ * -+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 
-+ */ -+void ia64_set_curr_task(int cpu, struct task_struct *p) -+{ -+ cpu_curr(cpu) = p; -+} -+ -+#endif -+ -+#ifdef CONFIG_CGROUP_SCHED -+static void sched_free_group(struct task_group *tg) -+{ -+ kmem_cache_free(task_group_cache, tg); -+} -+ -+/* allocate runqueue etc for a new task group */ -+struct task_group *sched_create_group(struct task_group *parent) -+{ -+ struct task_group *tg; -+ -+ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); -+ if (!tg) -+ return ERR_PTR(-ENOMEM); -+ -+ return tg; -+} -+ -+void sched_online_group(struct task_group *tg, struct task_group *parent) -+{ -+} -+ -+/* rcu callback to free various structures associated with a task group */ -+static void sched_free_group_rcu(struct rcu_head *rhp) -+{ -+ /* Now it should be safe to free those cfs_rqs */ -+ sched_free_group(container_of(rhp, struct task_group, rcu)); -+} -+ -+void sched_destroy_group(struct task_group *tg) -+{ -+ /* Wait for possible concurrent references to cfs_rqs complete */ -+ call_rcu(&tg->rcu, sched_free_group_rcu); -+} -+ -+void sched_offline_group(struct task_group *tg) -+{ -+} -+ -+static inline struct task_group *css_tg(struct cgroup_subsys_state *css) -+{ -+ return css ? 
container_of(css, struct task_group, css) : NULL; -+} -+ -+static struct cgroup_subsys_state * -+cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) -+{ -+ struct task_group *parent = css_tg(parent_css); -+ struct task_group *tg; -+ -+ if (!parent) { -+ /* This is early initialization for the top cgroup */ -+ return &root_task_group.css; -+ } -+ -+ tg = sched_create_group(parent); -+ if (IS_ERR(tg)) -+ return ERR_PTR(-ENOMEM); -+ return &tg->css; -+} -+ -+/* Expose task group only after completing cgroup initialization */ -+static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ struct task_group *parent = css_tg(css->parent); -+ -+ if (parent) -+ sched_online_group(tg, parent); -+ return 0; -+} -+ -+static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ -+ sched_offline_group(tg); -+} -+ -+static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ -+ /* -+ * Relies on the RCU grace period between css_released() and this. 
-+ */ -+ sched_free_group(tg); -+} -+ -+static void cpu_cgroup_fork(struct task_struct *task) -+{ -+} -+ -+static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) -+{ -+ return 0; -+} -+ -+static void cpu_cgroup_attach(struct cgroup_taskset *tset) -+{ -+} -+ -+static struct cftype cpu_legacy_files[] = { -+ { } /* Terminate */ -+}; -+ -+ -+static struct cftype cpu_files[] = { -+ { } /* terminate */ -+}; -+ -+static int cpu_extra_stat_show(struct seq_file *sf, -+ struct cgroup_subsys_state *css) -+{ -+ return 0; -+} -+ -+struct cgroup_subsys cpu_cgrp_subsys = { -+ .css_alloc = cpu_cgroup_css_alloc, -+ .css_online = cpu_cgroup_css_online, -+ .css_released = cpu_cgroup_css_released, -+ .css_free = cpu_cgroup_css_free, -+ .css_extra_stat_show = cpu_extra_stat_show, -+ .fork = cpu_cgroup_fork, -+ .can_attach = cpu_cgroup_can_attach, -+ .attach = cpu_cgroup_attach, -+ .legacy_cftypes = cpu_files, -+ .legacy_cftypes = cpu_legacy_files, -+ .dfl_cftypes = cpu_files, -+ .early_init = true, -+ .threaded = true, -+}; -+#endif /* CONFIG_CGROUP_SCHED */ -+ -+#undef CREATE_TRACE_POINTS -diff --git a/kernel/sched/alt_debug.c b/kernel/sched/alt_debug.c -new file mode 100644 -index 000000000000..1212a031700e ---- /dev/null -+++ b/kernel/sched/alt_debug.c -@@ -0,0 +1,31 @@ -+/* -+ * kernel/sched/alt_debug.c -+ * -+ * Print the alt scheduler debugging details -+ * -+ * Author: Alfred Chen -+ * Date : 2020 -+ */ -+#include "sched.h" -+ -+/* -+ * This allows printing both to /proc/sched_debug and -+ * to the console -+ */ -+#define SEQ_printf(m, x...) 
\ -+ do { \ -+ if (m) \ -+ seq_printf(m, x); \ -+ else \ -+ pr_cont(x); \ -+ } while (0) -+ -+void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, -+ struct seq_file *m) -+{ -+ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), -+ get_nr_threads(p)); -+} -+ -+void proc_sched_set_task(struct task_struct *p) -+{} -diff --git a/kernel/sched/alt_sched.h b/kernel/sched/alt_sched.h -new file mode 100644 -index 000000000000..99be2c51c88d ---- /dev/null -+++ b/kernel/sched/alt_sched.h -@@ -0,0 +1,555 @@ -+#ifndef ALT_SCHED_H -+#define ALT_SCHED_H -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#ifdef CONFIG_PARAVIRT -+# include -+#endif -+ -+#include "cpupri.h" -+ -+#ifdef CONFIG_SCHED_BMQ -+#include "bmq.h" -+#endif -+#ifdef CONFIG_SCHED_PDS -+#include "pds.h" -+#endif -+ -+/* task_struct::on_rq states: */ -+#define TASK_ON_RQ_QUEUED 1 -+#define TASK_ON_RQ_MIGRATING 2 -+ -+static inline int task_on_rq_queued(struct task_struct *p) -+{ -+ return p->on_rq == TASK_ON_RQ_QUEUED; -+} -+ -+static inline int task_on_rq_migrating(struct task_struct *p) -+{ -+ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; -+} -+ -+/* -+ * wake flags -+ */ -+#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ -+#define WF_FORK 0x02 /* child wakeup after fork */ -+#define WF_MIGRATED 0x04 /* internal use, task got migrated */ -+#define WF_ON_CPU 0x08 /* Wakee is on_rq */ -+ -+/* -+ * This is the main, per-CPU runqueue data structure. -+ * This data should only be modified by the local cpu. 
-+ */ -+struct rq { -+ /* runqueue lock: */ -+ raw_spinlock_t lock; -+ -+ struct task_struct __rcu *curr; -+ struct task_struct *idle, *stop, *skip; -+ struct mm_struct *prev_mm; -+ -+#ifdef CONFIG_SCHED_BMQ -+ struct bmq queue; -+#endif -+#ifdef CONFIG_SCHED_PDS -+ struct skiplist_node sl_header; -+#endif -+ unsigned long watermark; -+ -+ /* switch count */ -+ u64 nr_switches; -+ -+ atomic_t nr_iowait; -+ -+#ifdef CONFIG_MEMBARRIER -+ int membarrier_state; -+#endif -+ -+#ifdef CONFIG_SMP -+ int cpu; /* cpu of this runqueue */ -+ bool online; -+ -+ unsigned int ttwu_pending; -+ unsigned char nohz_idle_balance; -+ unsigned char idle_balance; -+ -+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ -+ struct sched_avg avg_irq; -+#endif -+ -+#ifdef CONFIG_SCHED_SMT -+ int active_balance; -+ struct cpu_stop_work active_balance_work; -+#endif -+#endif /* CONFIG_SMP */ -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+ u64 prev_irq_time; -+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ -+#ifdef CONFIG_PARAVIRT -+ u64 prev_steal_time; -+#endif /* CONFIG_PARAVIRT */ -+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING -+ u64 prev_steal_time_rq; -+#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ -+ -+ /* calc_load related fields */ -+ unsigned long calc_load_update; -+ long calc_load_active; -+ -+ u64 clock, last_tick; -+ u64 last_ts_switch; -+ u64 clock_task; -+ -+ unsigned long nr_running; -+ unsigned long nr_uninterruptible; -+ -+#ifdef CONFIG_SCHED_HRTICK -+#ifdef CONFIG_SMP -+ call_single_data_t hrtick_csd; -+#endif -+ struct hrtimer hrtick_timer; -+#endif -+ -+#ifdef CONFIG_SCHEDSTATS -+ -+ /* latency stats */ -+ struct sched_info rq_sched_info; -+ unsigned long long rq_cpu_time; -+ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? 
*/ -+ -+ /* sys_sched_yield() stats */ -+ unsigned int yld_count; -+ -+ /* schedule() stats */ -+ unsigned int sched_switch; -+ unsigned int sched_count; -+ unsigned int sched_goidle; -+ -+ /* try_to_wake_up() stats */ -+ unsigned int ttwu_count; -+ unsigned int ttwu_local; -+#endif /* CONFIG_SCHEDSTATS */ -+ -+#ifdef CONFIG_CPU_IDLE -+ /* Must be inspected within a rcu lock section */ -+ struct cpuidle_state *idle_state; -+#endif -+ -+#ifdef CONFIG_NO_HZ_COMMON -+#ifdef CONFIG_SMP -+ call_single_data_t nohz_csd; -+#endif -+ atomic_t nohz_flags; -+#endif /* CONFIG_NO_HZ_COMMON */ -+}; -+ -+extern unsigned long calc_load_update; -+extern atomic_long_t calc_load_tasks; -+ -+extern void calc_global_load_tick(struct rq *this_rq); -+extern long calc_load_fold_active(struct rq *this_rq, long adjust); -+ -+DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -+#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) -+#define this_rq() this_cpu_ptr(&runqueues) -+#define task_rq(p) cpu_rq(task_cpu(p)) -+#define cpu_curr(cpu) (cpu_rq(cpu)->curr) -+#define raw_rq() raw_cpu_ptr(&runqueues) -+ -+#ifdef CONFIG_SMP -+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) -+void register_sched_domain_sysctl(void); -+void unregister_sched_domain_sysctl(void); -+#else -+static inline void register_sched_domain_sysctl(void) -+{ -+} -+static inline void unregister_sched_domain_sysctl(void) -+{ -+} -+#endif -+ -+extern bool sched_smp_initialized; -+ -+enum { -+ BASE_CPU_AFFINITY_CHK_LEVEL = 1, -+#ifdef CONFIG_SCHED_SMT -+ SMT_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, -+#endif -+#ifdef CONFIG_SCHED_MC -+ MC_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, -+#endif -+ NR_CPU_AFFINITY_CHK_LEVEL -+}; -+ -+DECLARE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_CHK_LEVEL], sched_cpu_affinity_masks); -+ -+static inline int __best_mask_cpu(int cpu, const cpumask_t *cpumask, -+ const cpumask_t *mask) -+{ -+ while ((cpu = cpumask_any_and(cpumask, mask)) >= nr_cpu_ids) -+ mask++; -+ return cpu; -+} -+ -+static inline int 
best_mask_cpu(int cpu, const cpumask_t *cpumask) -+{ -+ return cpumask_test_cpu(cpu, cpumask)? cpu : -+ __best_mask_cpu(cpu, cpumask, &(per_cpu(sched_cpu_affinity_masks, cpu)[0])); -+} -+ -+extern void flush_smp_call_function_from_idle(void); -+ -+#else /* !CONFIG_SMP */ -+static inline void flush_smp_call_function_from_idle(void) { } -+#endif -+ -+#ifndef arch_scale_freq_tick -+static __always_inline -+void arch_scale_freq_tick(void) -+{ -+} -+#endif -+ -+#ifndef arch_scale_freq_capacity -+static __always_inline -+unsigned long arch_scale_freq_capacity(int cpu) -+{ -+ return SCHED_CAPACITY_SCALE; -+} -+#endif -+ -+static inline u64 __rq_clock_broken(struct rq *rq) -+{ -+ return READ_ONCE(rq->clock); -+} -+ -+static inline u64 rq_clock(struct rq *rq) -+{ -+ /* -+ * Relax lockdep_assert_held() checking as in VRQ, call to -+ * sched_info_xxxx() may not held rq->lock -+ * lockdep_assert_held(&rq->lock); -+ */ -+ return rq->clock; -+} -+ -+static inline u64 rq_clock_task(struct rq *rq) -+{ -+ /* -+ * Relax lockdep_assert_held() checking as in VRQ, call to -+ * sched_info_xxxx() may not held rq->lock -+ * lockdep_assert_held(&rq->lock); -+ */ -+ return rq->clock_task; -+} -+ -+/* -+ * {de,en}queue flags: -+ * -+ * DEQUEUE_SLEEP - task is no longer runnable -+ * ENQUEUE_WAKEUP - task just became runnable -+ * -+ */ -+ -+#define DEQUEUE_SLEEP 0x01 -+ -+#define ENQUEUE_WAKEUP 0x01 -+ -+ -+/* -+ * Below are scheduler API which using in other kernel code -+ * It use the dummy rq_flags -+ * ToDo : BMQ need to support these APIs for compatibility with mainline -+ * scheduler code. 
-+ */ -+struct rq_flags { -+ unsigned long flags; -+}; -+ -+struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(rq->lock); -+ -+struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(p->pi_lock) -+ __acquires(rq->lock); -+ -+static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock(&rq->lock); -+} -+ -+static inline void -+task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) -+ __releases(rq->lock) -+ __releases(p->pi_lock) -+{ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); -+} -+ -+static inline void -+rq_unlock_irq(struct rq *rq, struct rq_flags *rf) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock_irq(&rq->lock); -+} -+ -+static inline struct rq * -+this_rq_lock_irq(struct rq_flags *rf) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ local_irq_disable(); -+ rq = this_rq(); -+ raw_spin_lock(&rq->lock); -+ -+ return rq; -+} -+ -+static inline int task_current(struct rq *rq, struct task_struct *p) -+{ -+ return rq->curr == p; -+} -+ -+static inline bool task_running(struct task_struct *p) -+{ -+ return p->on_cpu; -+} -+ -+extern struct static_key_false sched_schedstats; -+ -+#ifdef CONFIG_CPU_IDLE -+static inline void idle_set_state(struct rq *rq, -+ struct cpuidle_state *idle_state) -+{ -+ rq->idle_state = idle_state; -+} -+ -+static inline struct cpuidle_state *idle_get_state(struct rq *rq) -+{ -+ WARN_ON(!rcu_read_lock_held()); -+ return rq->idle_state; -+} -+#else -+static inline void idle_set_state(struct rq *rq, -+ struct cpuidle_state *idle_state) -+{ -+} -+ -+static inline struct cpuidle_state *idle_get_state(struct rq *rq) -+{ -+ return NULL; -+} -+#endif -+ -+static inline int cpu_of(const struct rq *rq) -+{ -+#ifdef CONFIG_SMP -+ return rq->cpu; -+#else -+ return 0; -+#endif -+} -+ -+#include "stats.h" -+ -+#ifdef CONFIG_NO_HZ_COMMON -+#define NOHZ_BALANCE_KICK_BIT 0 
-+#define NOHZ_STATS_KICK_BIT 1 -+ -+#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT) -+#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT) -+ -+#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK) -+ -+#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) -+ -+/* TODO: needed? -+extern void nohz_balance_exit_idle(struct rq *rq); -+#else -+static inline void nohz_balance_exit_idle(struct rq *rq) { } -+*/ -+#endif -+ -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+struct irqtime { -+ u64 total; -+ u64 tick_delta; -+ u64 irq_start_time; -+ struct u64_stats_sync sync; -+}; -+ -+DECLARE_PER_CPU(struct irqtime, cpu_irqtime); -+ -+/* -+ * Returns the irqtime minus the softirq time computed by ksoftirqd. -+ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime -+ * and never move forward. -+ */ -+static inline u64 irq_time_read(int cpu) -+{ -+ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); -+ unsigned int seq; -+ u64 total; -+ -+ do { -+ seq = __u64_stats_fetch_begin(&irqtime->sync); -+ total = irqtime->total; -+ } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); -+ -+ return total; -+} -+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ -+ -+#ifdef CONFIG_CPU_FREQ -+DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); -+ -+/** -+ * cpufreq_update_util - Take a note about CPU utilization changes. -+ * @rq: Runqueue to carry out the update for. -+ * @flags: Update reason flags. -+ * -+ * This function is called by the scheduler on the CPU whose utilization is -+ * being updated. -+ * -+ * It can only be called from RCU-sched read-side critical sections. -+ * -+ * The way cpufreq is currently arranged requires it to evaluate the CPU -+ * performance state (frequency/voltage) on a regular basis to prevent it from -+ * being stuck in a completely inadequate performance level for too long. 
-+ * That is not guaranteed to happen if the updates are only triggered from CFS -+ * and DL, though, because they may not be coming in if only RT tasks are -+ * active all the time (or there are RT tasks only). -+ * -+ * As a workaround for that issue, this function is called periodically by the -+ * RT sched class to trigger extra cpufreq updates to prevent it from stalling, -+ * but that really is a band-aid. Going forward it should be replaced with -+ * solutions targeted more specifically at RT tasks. -+ */ -+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) -+{ -+ struct update_util_data *data; -+ -+ data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); -+ if (data) -+ data->func(data, rq_clock(rq), flags); -+} -+#else -+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} -+#endif /* CONFIG_CPU_FREQ */ -+ -+#ifdef CONFIG_NO_HZ_FULL -+extern int __init sched_tick_offload_init(void); -+#else -+static inline int sched_tick_offload_init(void) { return 0; } -+#endif -+ -+#ifdef arch_scale_freq_capacity -+#ifndef arch_scale_freq_invariant -+#define arch_scale_freq_invariant() (true) -+#endif -+#else /* arch_scale_freq_capacity */ -+#define arch_scale_freq_invariant() (false) -+#endif -+ -+extern void schedule_idle(void); -+ -+/* -+ * !! For sched_setattr_nocheck() (kernel) only !! -+ * -+ * This is actually gross. :( -+ * -+ * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE -+ * tasks, but still be able to sleep. We need this on platforms that cannot -+ * atomically change clock frequency. Remove once fast switching will be -+ * available on such platforms. -+ * -+ * SUGOV stands for SchedUtil GOVernor. 
-+ */ -+#define SCHED_FLAG_SUGOV 0x10000000 -+ -+#ifdef CONFIG_MEMBARRIER -+/* -+ * The scheduler provides memory barriers required by membarrier between: -+ * - prior user-space memory accesses and store to rq->membarrier_state, -+ * - store to rq->membarrier_state and following user-space memory accesses. -+ * In the same way it provides those guarantees around store to rq->curr. -+ */ -+static inline void membarrier_switch_mm(struct rq *rq, -+ struct mm_struct *prev_mm, -+ struct mm_struct *next_mm) -+{ -+ int membarrier_state; -+ -+ if (prev_mm == next_mm) -+ return; -+ -+ membarrier_state = atomic_read(&next_mm->membarrier_state); -+ if (READ_ONCE(rq->membarrier_state) == membarrier_state) -+ return; -+ -+ WRITE_ONCE(rq->membarrier_state, membarrier_state); -+} -+#else -+static inline void membarrier_switch_mm(struct rq *rq, -+ struct mm_struct *prev_mm, -+ struct mm_struct *next_mm) -+{ -+} -+#endif -+ -+#ifdef CONFIG_NUMA -+extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); -+#else -+static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) -+{ -+ return nr_cpu_ids; -+} -+#endif -+ -+void swake_up_all_locked(struct swait_queue_head *q); -+void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); -+ -+#endif /* ALT_SCHED_H */ -diff --git a/kernel/sched/bmq.h b/kernel/sched/bmq.h -new file mode 100644 -index 000000000000..aff0bb30a884 ---- /dev/null -+++ b/kernel/sched/bmq.h -@@ -0,0 +1,20 @@ -+#ifndef BMQ_H -+#define BMQ_H -+ -+/* bits: -+ * RT(0-99), (Low prio adj range, nice width, high prio adj range) / 2, cpu idle task */ -+#define SCHED_BITS (MAX_RT_PRIO + NICE_WIDTH / 2 + MAX_PRIORITY_ADJ + 1) -+#define IDLE_TASK_SCHED_PRIO (SCHED_BITS - 1) -+ -+struct bmq { -+ DECLARE_BITMAP(bitmap, SCHED_BITS); -+ struct list_head heads[SCHED_BITS]; -+}; -+ -+ -+static inline int task_running_nice(struct task_struct *p) -+{ -+ return (p->prio + p->boost_prio > DEFAULT_PRIO + MAX_PRIORITY_ADJ); -+} -+ 
-+#endif -diff --git a/kernel/sched/bmq_imp.h b/kernel/sched/bmq_imp.h -new file mode 100644 -index 000000000000..ad9a7c448da7 ---- /dev/null -+++ b/kernel/sched/bmq_imp.h -@@ -0,0 +1,185 @@ -+#define ALT_SCHED_VERSION_MSG "sched/bmq: BMQ CPU Scheduler "ALT_SCHED_VERSION" by Alfred Chen.\n" -+ -+/* -+ * BMQ only routines -+ */ -+#define rq_switch_time(rq) ((rq)->clock - (rq)->last_ts_switch) -+#define boost_threshold(p) (sched_timeslice_ns >>\ -+ (15 - MAX_PRIORITY_ADJ - (p)->boost_prio)) -+ -+static inline void boost_task(struct task_struct *p) -+{ -+ int limit; -+ -+ switch (p->policy) { -+ case SCHED_NORMAL: -+ limit = -MAX_PRIORITY_ADJ; -+ break; -+ case SCHED_BATCH: -+ case SCHED_IDLE: -+ limit = 0; -+ break; -+ default: -+ return; -+ } -+ -+ if (p->boost_prio > limit) -+ p->boost_prio--; -+} -+ -+static inline void deboost_task(struct task_struct *p) -+{ -+ if (p->boost_prio < MAX_PRIORITY_ADJ) -+ p->boost_prio++; -+} -+ -+/* -+ * Common interfaces -+ */ -+static inline int task_sched_prio(struct task_struct *p, struct rq *rq) -+{ -+ return (p->prio < MAX_RT_PRIO)? 
p->prio : MAX_RT_PRIO / 2 + (p->prio + p->boost_prio) / 2; -+} -+ -+static inline void requeue_task(struct task_struct *p, struct rq *rq); -+ -+static inline void time_slice_expired(struct task_struct *p, struct rq *rq) -+{ -+ p->time_slice = sched_timeslice_ns; -+ -+ if (SCHED_FIFO != p->policy && task_on_rq_queued(p)) { -+ if (SCHED_RR != p->policy) -+ deboost_task(p); -+ requeue_task(p, rq); -+ } -+} -+ -+static inline void update_task_priodl(struct task_struct *p) {} -+ -+static inline unsigned long sched_queue_watermark(struct rq *rq) -+{ -+ return find_first_bit(rq->queue.bitmap, SCHED_BITS); -+} -+ -+static inline void sched_queue_init(struct rq *rq) -+{ -+ struct bmq *q = &rq->queue; -+ int i; -+ -+ bitmap_zero(q->bitmap, SCHED_BITS); -+ for(i = 0; i < SCHED_BITS; i++) -+ INIT_LIST_HEAD(&q->heads[i]); -+} -+ -+static inline void sched_queue_init_idle(struct rq *rq, struct task_struct *idle) -+{ -+ struct bmq *q = &rq->queue; -+ -+ idle->bmq_idx = IDLE_TASK_SCHED_PRIO; -+ INIT_LIST_HEAD(&q->heads[idle->bmq_idx]); -+ list_add(&idle->bmq_node, &q->heads[idle->bmq_idx]); -+ set_bit(idle->bmq_idx, q->bitmap); -+} -+ -+/* -+ * This routine used in bmq scheduler only which assume the idle task in the bmq -+ */ -+static inline struct task_struct *sched_rq_first_task(struct rq *rq) -+{ -+ unsigned long idx = find_first_bit(rq->queue.bitmap, SCHED_BITS); -+ const struct list_head *head = &rq->queue.heads[idx]; -+ -+ return list_first_entry(head, struct task_struct, bmq_node); -+} -+ -+static inline struct task_struct * -+sched_rq_next_task(struct task_struct *p, struct rq *rq) -+{ -+ unsigned long idx = p->bmq_idx; -+ struct list_head *head = &rq->queue.heads[idx]; -+ -+ if (list_is_last(&p->bmq_node, head)) { -+ idx = find_next_bit(rq->queue.bitmap, SCHED_BITS, idx + 1); -+ head = &rq->queue.heads[idx]; -+ -+ return list_first_entry(head, struct task_struct, bmq_node); -+ } -+ -+ return list_next_entry(p, bmq_node); -+} -+ -+#define __SCHED_DEQUEUE_TASK(p, rq, 
flags, func) \ -+ psi_dequeue(p, flags & DEQUEUE_SLEEP); \ -+ sched_info_dequeued(rq, p); \ -+ \ -+ list_del(&p->bmq_node); \ -+ if (list_empty(&rq->queue.heads[p->bmq_idx])) { \ -+ clear_bit(p->bmq_idx, rq->queue.bitmap);\ -+ func; \ -+ } -+ -+#define __SCHED_ENQUEUE_TASK(p, rq, flags) \ -+ sched_info_queued(rq, p); \ -+ psi_enqueue(p, flags); \ -+ \ -+ p->bmq_idx = task_sched_prio(p, rq); \ -+ list_add_tail(&p->bmq_node, &rq->queue.heads[p->bmq_idx]); \ -+ set_bit(p->bmq_idx, rq->queue.bitmap) -+ -+#define __SCHED_REQUEUE_TASK(p, rq, func) \ -+{ \ -+ int idx = task_sched_prio(p, rq); \ -+\ -+ list_del(&p->bmq_node); \ -+ list_add_tail(&p->bmq_node, &rq->queue.heads[idx]); \ -+ if (idx != p->bmq_idx) { \ -+ if (list_empty(&rq->queue.heads[p->bmq_idx])) \ -+ clear_bit(p->bmq_idx, rq->queue.bitmap); \ -+ p->bmq_idx = idx; \ -+ set_bit(p->bmq_idx, rq->queue.bitmap); \ -+ func; \ -+ } \ -+} -+ -+static inline bool sched_task_need_requeue(struct task_struct *p, struct rq *rq) -+{ -+ return (task_sched_prio(p, rq) != p->bmq_idx); -+} -+ -+static void sched_task_fork(struct task_struct *p, struct rq *rq) -+{ -+ p->boost_prio = (p->boost_prio < 0) ? -+ p->boost_prio + MAX_PRIORITY_ADJ : MAX_PRIORITY_ADJ; -+} -+ -+/** -+ * task_prio - return the priority value of a given task. -+ * @p: the task in question. -+ * -+ * Return: The priority value as seen by users in /proc. -+ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes -+ * from 0(SCHED_ISO) up to 82 (nice +19 SCHED_IDLE). 
-+ */ -+int task_prio(const struct task_struct *p) -+{ -+ if (p->prio < MAX_RT_PRIO) -+ return (p->prio - MAX_RT_PRIO); -+ return (p->prio - MAX_RT_PRIO + p->boost_prio); -+} -+ -+static void do_sched_yield_type_1(struct task_struct *p, struct rq *rq) -+{ -+ p->boost_prio = MAX_PRIORITY_ADJ; -+} -+ -+static void sched_task_ttwu(struct task_struct *p) -+{ -+ if(this_rq()->clock_task - p->last_ran > sched_timeslice_ns) -+ boost_task(p); -+} -+ -+static void sched_task_deactivate(struct task_struct *p, struct rq *rq) -+{ -+ if (rq_switch_time(rq) < boost_threshold(p)) -+ boost_task(p); -+} -diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c -index 7fbaee24c824..0d7ad05b84fe 100644 ---- a/kernel/sched/cpufreq_schedutil.c -+++ b/kernel/sched/cpufreq_schedutil.c -@@ -183,6 +183,7 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, - return cpufreq_driver_resolve_freq(policy, freq); - } - -+#ifndef CONFIG_SCHED_ALT - /* - * This function computes an effective utilization for the given CPU, to be - * used for frequency selection given the linear relation: f = u * f_max. -@@ -300,6 +301,13 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) - - return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL); - } -+#else /* CONFIG_SCHED_ALT */ -+static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) -+{ -+ sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu); -+ return sg_cpu->max; -+} -+#endif - - /** - * sugov_iowait_reset() - Reset the IO boost status of a CPU. 
-@@ -443,7 +451,9 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } - */ - static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) - { -+#ifndef CONFIG_SCHED_ALT - if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) -+#endif - sg_policy->limits_changed = true; - } - -@@ -686,6 +696,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) - } - - ret = sched_setattr_nocheck(thread, &attr); -+ - if (ret) { - kthread_stop(thread); - pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__); -@@ -916,6 +927,7 @@ static int __init sugov_register(void) - core_initcall(sugov_register); - - #ifdef CONFIG_ENERGY_MODEL -+#ifndef CONFIG_SCHED_ALT - extern bool sched_energy_update; - extern struct mutex sched_energy_mutex; - -@@ -946,4 +958,10 @@ void sched_cpufreq_governor_change(struct cpufreq_policy *policy, - } - - } -+#else /* CONFIG_SCHED_ALT */ -+void sched_cpufreq_governor_change(struct cpufreq_policy *policy, -+ struct cpufreq_governor *old_gov) -+{ -+} -+#endif - #endif -diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c -index ff9435dee1df..0ee9967d2d74 100644 ---- a/kernel/sched/cputime.c -+++ b/kernel/sched/cputime.c -@@ -122,7 +122,7 @@ void account_user_time(struct task_struct *p, u64 cputime) - p->utime += cputime; - account_group_user_time(p, cputime); - -- index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; -+ index = task_running_nice(p) ? CPUTIME_NICE : CPUTIME_USER; - - /* Add user time to cpustat. */ - task_group_account_field(p, index, cputime); -@@ -146,7 +146,7 @@ void account_guest_time(struct task_struct *p, u64 cputime) - p->gtime += cputime; - - /* Add guest time to cpustat. 
*/ -- if (task_nice(p) > 0) { -+ if (task_running_nice(p)) { - cpustat[CPUTIME_NICE] += cputime; - cpustat[CPUTIME_GUEST_NICE] += cputime; - } else { -@@ -269,7 +269,7 @@ static inline u64 account_other_time(u64 max) - #ifdef CONFIG_64BIT - static inline u64 read_sum_exec_runtime(struct task_struct *t) - { -- return t->se.sum_exec_runtime; -+ return tsk_seruntime(t); - } - #else - static u64 read_sum_exec_runtime(struct task_struct *t) -@@ -279,7 +279,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t) - struct rq *rq; - - rq = task_rq_lock(t, &rf); -- ns = t->se.sum_exec_runtime; -+ ns = tsk_seruntime(t); - task_rq_unlock(rq, t, &rf); - - return ns; -@@ -658,7 +658,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, - void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) - { - struct task_cputime cputime = { -- .sum_exec_runtime = p->se.sum_exec_runtime, -+ .sum_exec_runtime = tsk_seruntime(p), - }; - - task_cputime(p, &cputime.utime, &cputime.stime); -diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c -index 1ae95b9150d3..f5c3aa20d172 100644 ---- a/kernel/sched/idle.c -+++ b/kernel/sched/idle.c -@@ -372,6 +372,7 @@ void cpu_startup_entry(enum cpuhp_state state) - do_idle(); - } - -+#ifndef CONFIG_SCHED_ALT - /* - * idle-task scheduling class. 
- */ -@@ -492,3 +493,4 @@ const struct sched_class idle_sched_class = { - .switched_to = switched_to_idle, - .update_curr = update_curr_idle, - }; -+#endif -diff --git a/kernel/sched/pds.h b/kernel/sched/pds.h -new file mode 100644 -index 000000000000..7fdeace7e8a5 ---- /dev/null -+++ b/kernel/sched/pds.h -@@ -0,0 +1,14 @@ -+#ifndef PDS_H -+#define PDS_H -+ -+/* bits: -+ * RT(0-99), (Low prio adj range, nice width, high prio adj range) / 2, cpu idle task */ -+#define SCHED_BITS (MAX_RT_PRIO + 20 + 1) -+#define IDLE_TASK_SCHED_PRIO (SCHED_BITS - 1) -+ -+static inline int task_running_nice(struct task_struct *p) -+{ -+ return (p->prio > DEFAULT_PRIO); -+} -+ -+#endif -diff --git a/kernel/sched/pds_imp.h b/kernel/sched/pds_imp.h -new file mode 100644 -index 000000000000..6baee5e961b9 ---- /dev/null -+++ b/kernel/sched/pds_imp.h -@@ -0,0 +1,257 @@ -+#define ALT_SCHED_VERSION_MSG "sched/pds: PDS CPU Scheduler "ALT_SCHED_VERSION" by Alfred Chen.\n" -+ -+static const u64 user_prio2deadline[NICE_WIDTH] = { -+/* -20 */ 4194304, 4613734, 5075107, 5582617, 6140878, -+/* -15 */ 6754965, 7430461, 8173507, 8990857, 9889942, -+/* -10 */ 10878936, 11966829, 13163511, 14479862, 15927848, -+/* -5 */ 17520632, 19272695, 21199964, 23319960, 25651956, -+/* 0 */ 28217151, 31038866, 34142752, 37557027, 41312729, -+/* 5 */ 45444001, 49988401, 54987241, 60485965, 66534561, -+/* 10 */ 73188017, 80506818, 88557499, 97413248, 107154572, -+/* 15 */ 117870029, 129657031, 142622734, 156885007, 172573507 -+}; -+ -+static const unsigned char dl_level_map[] = { -+/* 0 4 8 12 */ -+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, -+/* 16 20 24 28 */ -+ 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, -+/* 32 36 40 44 */ -+ 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, -+/* 48 52 56 60 */ -+ 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, -+/* 64 68 72 76 */ -+ 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 7, 6, 5, 4, 3, 2, -+/* 80 84 88 92 */ -+ 
1, 0 -+}; -+ -+static inline int -+task_sched_prio(const struct task_struct *p, const struct rq *rq) -+{ -+ size_t delta; -+ -+ if (p == rq->idle) -+ return IDLE_TASK_SCHED_PRIO; -+ -+ if (p->prio < MAX_RT_PRIO) -+ return p->prio; -+ -+ delta = (rq->clock + user_prio2deadline[39] - p->deadline) >> 21; -+ delta = min((size_t)delta, ARRAY_SIZE(dl_level_map) - 1); -+ -+ return MAX_RT_PRIO + dl_level_map[delta]; -+} -+ -+static inline void update_task_priodl(struct task_struct *p) -+{ -+ p->priodl = (((u64) (p->prio))<<56) | ((p->deadline)>>8); -+} -+ -+static inline void requeue_task(struct task_struct *p, struct rq *rq); -+ -+static inline void time_slice_expired(struct task_struct *p, struct rq *rq) -+{ -+ /*printk(KERN_INFO "sched: time_slice_expired(%d) - %px\n", cpu_of(rq), p);*/ -+ p->time_slice = sched_timeslice_ns; -+ -+ if (p->prio >= MAX_RT_PRIO) -+ p->deadline = rq->clock + user_prio2deadline[TASK_USER_PRIO(p)]; -+ update_task_priodl(p); -+ -+ if (SCHED_FIFO != p->policy && task_on_rq_queued(p)) -+ requeue_task(p, rq); -+} -+ -+/* -+ * pds_skiplist_task_search -- search function used in PDS run queue skip list -+ * node insert operation. -+ * @it: iterator pointer to the node in the skip list -+ * @node: pointer to the skiplist_node to be inserted -+ * -+ * Returns true if key of @it is less or equal to key value of @node, otherwise -+ * false. 
-+ */ -+static inline bool -+pds_skiplist_task_search(struct skiplist_node *it, struct skiplist_node *node) -+{ -+ return (skiplist_entry(it, struct task_struct, sl_node)->priodl <= -+ skiplist_entry(node, struct task_struct, sl_node)->priodl); -+} -+ -+/* -+ * Define the skip list insert function for PDS -+ */ -+DEFINE_SKIPLIST_INSERT_FUNC(pds_skiplist_insert, pds_skiplist_task_search); -+ -+/* -+ * Init the queue structure in rq -+ */ -+static inline void sched_queue_init(struct rq *rq) -+{ -+ FULL_INIT_SKIPLIST_NODE(&rq->sl_header); -+} -+ -+/* -+ * Init idle task and put into queue structure of rq -+ * IMPORTANT: may be called multiple times for a single cpu -+ */ -+static inline void sched_queue_init_idle(struct rq *rq, struct task_struct *idle) -+{ -+ /*printk(KERN_INFO "sched: init(%d) - %px\n", cpu_of(rq), idle);*/ -+ int default_prio = idle->prio; -+ -+ idle->prio = MAX_PRIO; -+ idle->deadline = 0ULL; -+ update_task_priodl(idle); -+ -+ FULL_INIT_SKIPLIST_NODE(&rq->sl_header); -+ -+ idle->sl_node.level = idle->sl_level; -+ pds_skiplist_insert(&rq->sl_header, &idle->sl_node); -+ -+ idle->prio = default_prio; -+} -+ -+/* -+ * This routine assume that the idle task always in queue -+ */ -+static inline struct task_struct *sched_rq_first_task(struct rq *rq) -+{ -+ struct skiplist_node *node = rq->sl_header.next[0]; -+ -+ BUG_ON(node == &rq->sl_header); -+ return skiplist_entry(node, struct task_struct, sl_node); -+} -+ -+static inline struct task_struct * -+sched_rq_next_task(struct task_struct *p, struct rq *rq) -+{ -+ struct skiplist_node *next = p->sl_node.next[0]; -+ -+ BUG_ON(next == &rq->sl_header); -+ return skiplist_entry(next, struct task_struct, sl_node); -+} -+ -+static inline unsigned long sched_queue_watermark(struct rq *rq) -+{ -+ return task_sched_prio(sched_rq_first_task(rq), rq); -+} -+ -+#define __SCHED_DEQUEUE_TASK(p, rq, flags, func) \ -+ psi_dequeue(p, flags & DEQUEUE_SLEEP); \ -+ sched_info_dequeued(rq, p); \ -+ \ -+ if 
(skiplist_del_init(&rq->sl_header, &p->sl_node)) { \ -+ func; \ -+ } -+ -+#define __SCHED_ENQUEUE_TASK(p, rq, flags) \ -+ sched_info_queued(rq, p); \ -+ psi_enqueue(p, flags); \ -+ \ -+ p->sl_node.level = p->sl_level; \ -+ pds_skiplist_insert(&rq->sl_header, &p->sl_node) -+ -+/* -+ * Requeue a task @p to @rq -+ */ -+#define __SCHED_REQUEUE_TASK(p, rq, func) \ -+{\ -+ bool b_first = skiplist_del_init(&rq->sl_header, &p->sl_node); \ -+\ -+ p->sl_node.level = p->sl_level; \ -+ if (pds_skiplist_insert(&rq->sl_header, &p->sl_node) || b_first) { \ -+ func; \ -+ } \ -+} -+ -+static inline bool sched_task_need_requeue(struct task_struct *p, struct rq *rq) -+{ -+ struct skiplist_node *node = p->sl_node.prev[0]; -+ -+ if (node != &rq->sl_header) { -+ struct task_struct *t = skiplist_entry(node, struct task_struct, sl_node); -+ -+ if (t->priodl > p->priodl) -+ return true; -+ } -+ -+ node = p->sl_node.next[0]; -+ if (node != &rq->sl_header) { -+ struct task_struct *t = skiplist_entry(node, struct task_struct, sl_node); -+ -+ if (t->priodl < p->priodl) -+ return true; -+ } -+ -+ return false; -+} -+ -+/* -+ * pds_skiplist_random_level -- Returns a pseudo-random level number for skip -+ * list node which is used in PDS run queue. -+ * -+ * In current implementation, based on testing, the first 8 bits in microseconds -+ * of niffies are suitable for random level population. -+ * find_first_bit() is used to satisfy p = 0.5 between each levels, and there -+ * should be platform hardware supported instruction(known as ctz/clz) to speed -+ * up this function. -+ * The skiplist level for a task is populated when task is created and doesn't -+ * change in task's life time. When task is being inserted into run queue, this -+ * skiplist level is set to task's sl_node->level, the skiplist insert function -+ * may change it based on current level of the skip lsit. 
-+ */ -+static inline int pds_skiplist_random_level(const struct task_struct *p) -+{ -+ long unsigned int randseed; -+ -+ /* -+ * 1. Some architectures don't have better than microsecond resolution -+ * so mask out ~microseconds as a factor of the random seed for skiplist -+ * insertion. -+ * 2. Use address of task structure pointer as another factor of the -+ * random seed for task burst forking scenario. -+ */ -+ randseed = (task_rq(p)->clock ^ (long unsigned int)p) >> 10; -+ -+ return find_first_bit(&randseed, NUM_SKIPLIST_LEVEL - 1); -+} -+ -+static void sched_task_fork(struct task_struct *p, struct rq *rq) -+{ -+ p->sl_level = pds_skiplist_random_level(p); -+ if (p->prio >= MAX_RT_PRIO) -+ p->deadline = rq->clock + user_prio2deadline[TASK_USER_PRIO(p)]; -+ update_task_priodl(p); -+} -+ -+/** -+ * task_prio - return the priority value of a given task. -+ * @p: the task in question. -+ * -+ * Return: The priority value as seen by users in /proc. -+ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes -+ * from 0(SCHED_ISO) up to 82 (nice +19 SCHED_IDLE). 
-+ */ -+int task_prio(const struct task_struct *p) -+{ -+ int ret; -+ -+ if (p->prio < MAX_RT_PRIO) -+ return (p->prio - MAX_RT_PRIO); -+ -+ preempt_disable(); -+ ret = task_sched_prio(p, this_rq()) - MAX_RT_PRIO; -+ preempt_enable(); -+ -+ return ret; -+} -+ -+static void do_sched_yield_type_1(struct task_struct *p, struct rq *rq) -+{ -+ time_slice_expired(p, rq); -+} -+ -+static void sched_task_ttwu(struct task_struct *p) {} -+static void sched_task_deactivate(struct task_struct *p, struct rq *rq) {} -diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c -index b4b1ff96642f..0ead9625081f 100644 ---- a/kernel/sched/pelt.c -+++ b/kernel/sched/pelt.c -@@ -274,6 +274,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load) - WRITE_ONCE(sa->util_avg, sa->util_sum / divider); - } - -+#ifndef CONFIG_SCHED_ALT - /* - * sched_entity: - * -@@ -391,8 +392,9 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) - - return 0; - } -+#endif - --#ifdef CONFIG_SCHED_THERMAL_PRESSURE -+#if defined(CONFIG_SCHED_THERMAL_PRESSURE) && !defined(CONFIG_SCHED_ALT) - /* - * thermal: - * -diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h -index eb034d9f024d..7fefc89b377a 100644 ---- a/kernel/sched/pelt.h -+++ b/kernel/sched/pelt.h -@@ -1,13 +1,15 @@ - #ifdef CONFIG_SMP - #include "sched-pelt.h" - -+#ifndef CONFIG_SCHED_ALT - int __update_load_avg_blocked_se(u64 now, struct sched_entity *se); - int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se); - int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); - int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); - int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); -+#endif - --#ifdef CONFIG_SCHED_THERMAL_PRESSURE -+#if defined(CONFIG_SCHED_THERMAL_PRESSURE) && !defined(CONFIG_SCHED_ALT) - int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity); - - static inline u64 thermal_load_avg(struct rq *rq) -@@ -37,6 +39,7 @@ update_irq_load_avg(struct rq *rq, 
u64 running) - } - #endif - -+#ifndef CONFIG_SCHED_ALT - /* - * When a task is dequeued, its estimated utilization should not be update if - * its util_avg has not been updated at least once. -@@ -157,9 +160,11 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) - return rq_clock_pelt(rq_of(cfs_rq)); - } - #endif -+#endif /* CONFIG_SCHED_ALT */ - - #else - -+#ifndef CONFIG_SCHED_ALT - static inline int - update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) - { -@@ -177,6 +182,7 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running) - { - return 0; - } -+#endif - - static inline int - update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index 877fb08eb1b0..da6a01b591a0 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -2,6 +2,10 @@ - /* - * Scheduler internal types and methods: - */ -+#ifdef CONFIG_SCHED_ALT -+#include "alt_sched.h" -+#else -+ - #include - - #include -@@ -2542,3 +2546,9 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) - - void swake_up_all_locked(struct swait_queue_head *q); - void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); -+ -+static inline int task_running_nice(struct task_struct *p) -+{ -+ return (task_nice(p) > 0); -+} -+#endif /* !CONFIG_SCHED_ALT */ -diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c -index 750fb3c67eed..108422ebc7bf 100644 ---- a/kernel/sched/stats.c -+++ b/kernel/sched/stats.c -@@ -22,8 +22,10 @@ static int show_schedstat(struct seq_file *seq, void *v) - } else { - struct rq *rq; - #ifdef CONFIG_SMP -+#ifndef CONFIG_SCHED_ALT - struct sched_domain *sd; - int dcount = 0; -+#endif - #endif - cpu = (unsigned long)(v - 2); - rq = cpu_rq(cpu); -@@ -40,6 +42,7 @@ static int show_schedstat(struct seq_file *seq, void *v) - seq_printf(seq, "\n"); - - #ifdef CONFIG_SMP -+#ifndef CONFIG_SCHED_ALT - /* domain-specific stats */ - rcu_read_lock(); - for_each_domain(cpu, sd) { -@@ 
-68,6 +71,7 @@ static int show_schedstat(struct seq_file *seq, void *v) - sd->ttwu_move_balance); - } - rcu_read_unlock(); -+#endif - #endif - } - return 0; -diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c -index ba81187bb7af..996b5786b058 100644 ---- a/kernel/sched/topology.c -+++ b/kernel/sched/topology.c -@@ -4,6 +4,7 @@ - */ - #include "sched.h" - -+#ifndef CONFIG_SCHED_ALT - DEFINE_MUTEX(sched_domains_mutex); - - /* Protected by sched_domains_mutex: */ -@@ -1180,8 +1181,10 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) - */ - - static int default_relax_domain_level = -1; -+#endif /* CONFIG_SCHED_ALT */ - int sched_domain_level_max; - -+#ifndef CONFIG_SCHED_ALT - static int __init setup_relax_domain_level(char *str) - { - if (kstrtoint(str, 0, &default_relax_domain_level)) -@@ -1413,6 +1416,7 @@ sd_init(struct sched_domain_topology_level *tl, - - return sd; - } -+#endif /* CONFIG_SCHED_ALT */ - - /* - * Topology list, bottom-up. -@@ -1442,6 +1446,7 @@ void set_sched_topology(struct sched_domain_topology_level *tl) - sched_domain_topology = tl; - } - -+#ifndef CONFIG_SCHED_ALT - #ifdef CONFIG_NUMA - - static const struct cpumask *sd_numa_mask(int cpu) -@@ -2316,3 +2321,17 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], - partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); - mutex_unlock(&sched_domains_mutex); - } -+#else /* CONFIG_SCHED_ALT */ -+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], -+ struct sched_domain_attr *dattr_new) -+{} -+ -+#ifdef CONFIG_NUMA -+int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; -+ -+int sched_numa_find_closest(const struct cpumask *cpus, int cpu) -+{ -+ return best_mask_cpu(cpu, cpus); -+} -+#endif /* CONFIG_NUMA */ -+#endif -diff --git a/kernel/sysctl.c b/kernel/sysctl.c -index db1ce7af2563..4437a207d061 100644 ---- a/kernel/sysctl.c -+++ b/kernel/sysctl.c -@@ -120,6 +120,10 @@ static unsigned long long_max = 
LONG_MAX; - static int one_hundred = 100; - static int two_hundred = 200; - static int one_thousand = 1000; -+#ifdef CONFIG_SCHED_ALT -+static int __maybe_unused zero = 0; -+extern int sched_yield_type; -+#endif - #ifdef CONFIG_PRINTK - static int ten_thousand = 10000; - #endif -@@ -184,7 +188,7 @@ static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT; - int sysctl_legacy_va_layout; - #endif - --#ifdef CONFIG_SCHED_DEBUG -+#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_ALT) - static int min_sched_granularity_ns = 100000; /* 100 usecs */ - static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ - static int min_wakeup_granularity_ns; /* 0 usecs */ -@@ -1653,6 +1657,7 @@ int proc_do_static_key(struct ctl_table *table, int write, - } - - static struct ctl_table kern_table[] = { -+#ifndef CONFIG_SCHED_ALT - { - .procname = "sched_child_runs_first", - .data = &sysctl_sched_child_runs_first, -@@ -1834,6 +1839,7 @@ static struct ctl_table kern_table[] = { - .extra2 = SYSCTL_ONE, - }, - #endif -+#endif /* !CONFIG_SCHED_ALT */ - #ifdef CONFIG_PROVE_LOCKING - { - .procname = "prove_locking", -@@ -2410,6 +2416,17 @@ static struct ctl_table kern_table[] = { - .proc_handler = proc_dointvec, - }, - #endif -+#ifdef CONFIG_SCHED_ALT -+ { -+ .procname = "yield_type", -+ .data = &sched_yield_type, -+ .maxlen = sizeof (int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = &zero, -+ .extra2 = &two, -+ }, -+#endif - #if defined(CONFIG_S390) && defined(CONFIG_SMP) - { - .procname = "spin_retry", -diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c -index d89da1c7e005..a73adff9f309 100644 ---- a/kernel/time/hrtimer.c -+++ b/kernel/time/hrtimer.c -@@ -1923,8 +1923,10 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, - int ret = 0; - u64 slack; - -+#ifndef CONFIG_SCHED_ALT - slack = current->timer_slack_ns; - if (dl_task(current) || rt_task(current)) -+#endif - slack = 0; - - 
hrtimer_init_sleeper_on_stack(&t, clockid, mode); -diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c -index 165117996ea0..bd8718a51499 100644 ---- a/kernel/time/posix-cpu-timers.c -+++ b/kernel/time/posix-cpu-timers.c -@@ -216,7 +216,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples) - u64 stime, utime; - - task_cputime(p, &utime, &stime); -- store_samples(samples, stime, utime, p->se.sum_exec_runtime); -+ store_samples(samples, stime, utime, tsk_seruntime(p)); - } - - static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, -@@ -789,6 +789,7 @@ static void collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples, - } - } - -+#ifndef CONFIG_SCHED_ALT - static inline void check_dl_overrun(struct task_struct *tsk) - { - if (tsk->dl.dl_overrun) { -@@ -796,6 +797,7 @@ static inline void check_dl_overrun(struct task_struct *tsk) - __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); - } - } -+#endif - - static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard) - { -@@ -823,8 +825,10 @@ static void check_thread_timers(struct task_struct *tsk, - u64 samples[CPUCLOCK_MAX]; - unsigned long soft; - -+#ifndef CONFIG_SCHED_ALT - if (dl_task(tsk)) - check_dl_overrun(tsk); -+#endif - - if (expiry_cache_is_inactive(pct)) - return; -@@ -838,7 +842,7 @@ static void check_thread_timers(struct task_struct *tsk, - soft = task_rlimit(tsk, RLIMIT_RTTIME); - if (soft != RLIM_INFINITY) { - /* Task RT timeout is accounted in jiffies. RTTIME is usec */ -- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); -+ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ); - unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); - - /* At the hard limit, send SIGKILL. No further action. 
*/ -@@ -1074,8 +1078,10 @@ static inline bool fastpath_timer_check(struct task_struct *tsk) - return true; - } - -+#ifndef CONFIG_SCHED_ALT - if (dl_task(tsk) && tsk->dl.dl_overrun) - return true; -+#endif - - return false; - } -diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c -index b5e3496cf803..65f60c77bc50 100644 ---- a/kernel/trace/trace_selftest.c -+++ b/kernel/trace/trace_selftest.c -@@ -1048,10 +1048,15 @@ static int trace_wakeup_test_thread(void *data) - { - /* Make this a -deadline thread */ - static const struct sched_attr attr = { -+#ifdef CONFIG_SCHED_ALT -+ /* No deadline on BMQ/PDS, use RR */ -+ .sched_policy = SCHED_RR, -+#else - .sched_policy = SCHED_DEADLINE, - .sched_runtime = 100000ULL, - .sched_deadline = 10000000ULL, - .sched_period = 10000000ULL -+#endif - }; - struct wakeup_test_data *x = data; - diff --git a/linux58-tkg/linux58-tkg-patches/0011-ZFS-fix.patch b/linux58-tkg/linux58-tkg-patches/0011-ZFS-fix.patch deleted file mode 100644 index af71d04..0000000 --- a/linux58-tkg/linux58-tkg-patches/0011-ZFS-fix.patch +++ /dev/null @@ -1,43 +0,0 @@ -From 1e010beda2896bdf3082fb37a3e49f8ce20e04d8 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= -Date: Thu, 2 May 2019 05:28:08 +0100 -Subject: [PATCH] x86/fpu: Export kernel_fpu_{begin,end}() with - EXPORT_SYMBOL_GPL -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -We need these symbols in zfs as the fpu implementation breaks userspace: - -https://github.com/zfsonlinux/zfs/issues/9346 -Signed-off-by: Jörg Thalheim ---- - arch/x86/kernel/fpu/core.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c -index 12c70840980e..352538b3bb5d 100644 ---- a/arch/x86/kernel/fpu/core.c -+++ b/arch/x86/kernel/fpu/core.c -@@ -102,7 +102,7 @@ void kernel_fpu_begin(void) - } - __cpu_invalidate_fpregs_state(); - } --EXPORT_SYMBOL_GPL(kernel_fpu_begin); 
-+EXPORT_SYMBOL(kernel_fpu_begin); - - void kernel_fpu_end(void) - { -@@ -111,7 +111,7 @@ void kernel_fpu_end(void) - this_cpu_write(in_kernel_fpu, false); - preempt_enable(); - } --EXPORT_SYMBOL_GPL(kernel_fpu_end); -+EXPORT_SYMBOL(kernel_fpu_end); - - /* - * Save the FPU state (mark it for reload if necessary): --- -2.23.0 - - diff --git a/linux58-tkg/linux58-tkg-patches/0012-misc-additions.patch b/linux58-tkg/linux58-tkg-patches/0012-misc-additions.patch deleted file mode 100644 index ae06419..0000000 --- a/linux58-tkg/linux58-tkg-patches/0012-misc-additions.patch +++ /dev/null @@ -1,54 +0,0 @@ -diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig -index 0840d27381ea..73aba9a31064 100644 ---- a/drivers/tty/Kconfig -+++ b/drivers/tty/Kconfig -@@ -75,6 +75,19 @@ config VT_CONSOLE_SLEEP - def_bool y - depends on VT_CONSOLE && PM_SLEEP - -+config NR_TTY_DEVICES -+ int "Maximum tty device number" -+ depends on VT -+ range 12 63 -+ default 63 -+ ---help--- -+ This option is used to change the number of tty devices in /dev. -+ The default value is 63. The lowest number you can set is 12, -+ 63 is also the upper limit so we don't overrun the serial -+ consoles. -+ -+ If unsure, say 63. -+ - config HW_CONSOLE - bool - depends on VT && !UML -diff --git a/include/uapi/linux/vt.h b/include/uapi/linux/vt.h -index e9d39c48520a..3bceead8da40 100644 ---- a/include/uapi/linux/vt.h -+++ b/include/uapi/linux/vt.h -@@ -3,12 +3,25 @@ - #define _UAPI_LINUX_VT_H - - -+/* -+ * We will make this definition solely for the purpose of making packages -+ * such as splashutils build, because they can not understand that -+ * NR_TTY_DEVICES is defined in the kernel configuration. -+ */ -+#ifndef CONFIG_NR_TTY_DEVICES -+#define CONFIG_NR_TTY_DEVICES 63 -+#endif -+ - /* - * These constants are also useful for user-level apps (e.g., VC - * resizing). 
- */ - #define MIN_NR_CONSOLES 1 /* must be at least 1 */ --#define MAX_NR_CONSOLES 63 /* serial lines start at 64 */ -+/* -+ * NR_TTY_DEVICES: -+ * Value MUST be at least 12 and must never be higher then 63 -+ */ -+#define MAX_NR_CONSOLES CONFIG_NR_TTY_DEVICES /* serial lines start above this */ - /* Note: the ioctl VT_GETSTATE does not work for - consoles 16 and higher (since it returns a short) */ \ No newline at end of file diff --git a/linux59-tkg/PKGBUILD b/linux59-tkg/PKGBUILD deleted file mode 100644 index d1939b1..0000000 --- a/linux59-tkg/PKGBUILD +++ /dev/null @@ -1,285 +0,0 @@ -# Based on the file created for Arch Linux by: -# Tobias Powalowski -# Thomas Baechler - -# Contributor: Tk-Glitch - -plain ' .---.` `.---.' -plain ' `/syhhhyso- -osyhhhys/`' -plain ' .syNMdhNNhss/``.---.``/sshNNhdMNys.' -plain ' +sdMh.`+MNsssssssssssssssNM+`.hMds+' -plain ' :syNNdhNNhssssssssssssssshNNhdNNys:' -plain ' /ssyhhhysssssssssssssssssyhhhyss/' -plain ' .ossssssssssssssssssssssssssssso.' -plain ' :sssssssssssssssssssssssssssssssss:' -plain ' /sssssssssssssssssssssssssssssssssss/' -plain ' :sssssssssssssoosssssssoosssssssssssss:' -plain ' osssssssssssssoosssssssoossssssssssssso' -plain ' osssssssssssyyyyhhhhhhhyyyyssssssssssso' -plain ' /yyyyyyhhdmmmmNNNNNNNNNNNmmmmdhhyyyyyy/' -plain ' smmmNNNNNNNNNNNNNNNNNNNNNNNNNNNNNmmms' -plain ' /dNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNd/' -plain ' `:sdNNNNNNNNNNNNNNNNNNNNNNNNNds:`' -plain ' `-+shdNNNNNNNNNNNNNNNdhs+-`' -plain ' `.-:///////:-.`' - -_where="$PWD" # track basedir as different Arch based distros are moving srcdir around - -source "$_where"/customization.cfg # load default configuration from file -source "$_where"/linux*-tkg-config/prepare - -if [[ "$_sub" = rc* ]]; then - _srcpath="linux-${_basekernel}-${_sub}" -else - _srcpath="linux-${_basekernel}" -fi - -_tkg_initscript - -_distro="Arch" - -if [ -n "$_custom_pkgbase" ]; then - pkgbase="${_custom_pkgbase}" -else - pkgbase=linux"${_basever}"-tkg-"${_cpusched}" -fi 
-pkgname=("${pkgbase}" "${pkgbase}-headers") -pkgver="${_basekernel}"."${_sub}" -pkgrel=8 -pkgdesc='Linux-tkg' -arch=('x86_64') # no i686 in here -url="http://www.kernel.org/" -license=('GPL2') -makedepends=('xmlto' 'docbook-xsl' 'kmod' 'inetutils' 'bc' 'libelf' 'pahole' 'patchutils' 'flex' 'python-sphinx' 'python-sphinx_rtd_theme' 'graphviz' 'imagemagick' 'git') -optdepends=('schedtool') -options=('!strip' 'docs') -source=("https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-${_basekernel}.tar.xz" - "https://cdn.kernel.org/pub/linux/kernel/v5.x/patch-${pkgver}.xz" - "https://raw.githubusercontent.com/graysky2/kernel_gcc_patch/master/enable_additional_cpu_optimizations_for_gcc_v10.1%2B_kernel_v5.8%2B.patch" - 'config.x86_64' # stock Arch config - #'config_hardened.x86_64' # hardened Arch config - 90-cleanup.hook - cleanup - # ARCH Patches - 0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch - # TkG - 0002-clear-patches.patch - 0003-glitched-base.patch - 0003-glitched-cfs.patch - 0004-glitched-ondemand-muqss.patch - 0004-glitched-muqss.patch - 0004-5.9-ck1.patch - #0005-undead-glitched-ondemand-pds.patch - #0005-undead-glitched-pds.patch - #0005-v5.8_undead-pds099o.patch - 0005-glitched-pds.patch - 0006-add-acs-overrides_iommu.patch - 0007-v5.9-fsync.patch - 0008-5.9-bcachefs.patch - 0009-glitched-ondemand-bmq.patch - 0009-glitched-bmq.patch - 0009-prjc_v5.9-r0.patch - 0011-ZFS-fix.patch - #0012-linux-hardened.patch - 0012-misc-additions.patch -) -sha256sums=('3239a4ee1250bf2048be988cc8cb46c487b2c8a0de5b1b032d38394d5c6b1a06' - '7edb7b9d06b02f9b88d868c74ab618baf899c94edb19a73291f640dbea55c312' - '5ab29eb64e57df83b395a29a6a4f89030d142feffbfbf73b3afc6d97a2a7fd12' - 'ca84d1966bf13570768a65015ddcbde198d866d2a5a44df21a581ed57860b887' - '1e15fc2ef3fa770217ecc63a220e5df2ddbcf3295eb4a021171e7edd4c6cc898' - '66a03c246037451a77b4d448565b1d7e9368270c7d02872fbd0b5d024ed0a997' - 'f6383abef027fd9a430fd33415355e0df492cdc3c90e9938bf2d98f4f63b32e6' - 
'35a7cde86fb94939c0f25a62b8c47f3de0dbd3c65f876f460b263181b3e92fc0' - 'b9ebe0ae69bc2b2091d6bfcf6c7875a87ea7969fcfa4e306c48d47a60f9ef4d6' - '7058e57fd68367b029adc77f2a82928f1433daaf02c8c279cb2d13556c8804d7' - 'c605f638d74c61861ebdc36ebd4cb8b6475eae2f6273e1ccb2bbb3e10a2ec3fe' - '2bbbac963b6ca44ef3f8a71ec7c5cad7d66df860869a73059087ee236775970a' - '45a9ab99215ab3313be6e66e073d29154aac55bc58975a4df2dad116c918d27c' - 'fca63d15ca4502aebd73e76d7499b243d2c03db71ff5ab0bf5cf268b2e576320' - '19661ec0d39f9663452b34433214c755179894528bf73a42f6ba52ccf572832a' - 'b302ba6c5bbe8ed19b20207505d513208fae1e678cf4d8e7ac0b154e5fe3f456' - '3956c324798f25bcf8e6c5f6d160551245304c5cfa3a2cba73e5b1e350c364ce' - '9fad4a40449e09522899955762c8928ae17f4cdaa16e01239fd12592e9d58177' - 'a557b342111849a5f920bbe1c129f3ff1fc1eff62c6bd6685e0972fc88e39911' - '88c7e308e474c845e0cc09e09bd223fc39876eca757abf6d6c3b8321f49ce1f1' - '49262ce4a8089fa70275aad742fc914baa28d9c384f710c9a62f64796d13e104' - '433b919e6a0be26784fb4304c43b1811a28f12ad3de9e26c0af827f64c0c316e') - -export KBUILD_BUILD_HOST=archlinux -export KBUILD_BUILD_USER=$pkgbase -export KBUILD_BUILD_TIMESTAMP="$(date -Ru${SOURCE_DATE_EPOCH:+d @$SOURCE_DATE_EPOCH})" - -prepare() { - rm -rf $pkgdir # Nuke the entire pkg folder so it'll get regenerated clean on next build - - ln -s "${_where}/customization.cfg" "${srcdir}" # workaround - - cd "${srcdir}/${_srcpath}" - - _tkg_srcprep -} - -build() { - cd "${srcdir}/${_srcpath}" - - # Use custom compiler paths if defined - if [ -n "${CUSTOM_GCC_PATH}" ]; then - PATH=${CUSTOM_GCC_PATH}/bin:${CUSTOM_GCC_PATH}/lib:${CUSTOM_GCC_PATH}/include:${PATH} - fi - - if [ "$_force_all_threads" = "true" ]; then - _force_all_threads="-j$((`nproc`*2))" - else - _force_all_threads="${MAKEFLAGS}" - fi - - # ccache - if [ "$_noccache" != "true" ] && pacman -Qq ccache &> /dev/null; then - export PATH="/usr/lib/ccache/bin/:$PATH" - export CCACHE_SLOPPINESS="file_macro,locale,time_macros" - export CCACHE_NOHASHDIR="true" - msg2 
'ccache was found and will be used' - fi - - # document the TkG variables, excluding "_", "_EXT_CONFIG_PATH", and "_where". - declare -p | cut -d ' ' -f 3 | grep -P '^_(?!=|EXT_CONFIG_PATH|where)' > "${srcdir}/customization-full.cfg" - - # build! - _runtime=$( time ( schedtool -B -n 1 -e ionice -n 1 make ${_force_all_threads} LOCALVERSION= bzImage modules 2>&1 ) 3>&1 1>&2 2>&3 ) || _runtime=$( time ( make ${_force_all_threads} LOCALVERSION= bzImage modules 2>&1 ) 3>&1 1>&2 2>&3 ) -} - -hackbase() { - pkgdesc="The $pkgdesc kernel and modules" - depends=('coreutils' 'kmod' 'initramfs') - optdepends=('linux-docs: Kernel hackers manual - HTML documentation that comes with the Linux kernel.' - 'crda: to set the correct wireless channels of your country.' - 'linux-firmware: Firmware files for Linux' - 'modprobed-db: Keeps track of EVERY kernel module that has ever been probed. Useful for make localmodconfig.' - 'nvidia-tkg: NVIDIA drivers for all installed kernels - non-dkms version.' - 'nvidia-dkms-tkg: NVIDIA drivers for all installed kernels - dkms version.' 
- 'update-grub: Simple wrapper around grub-mkconfig.') - provides=("linux=${pkgver}" "${pkgbase}" VIRTUALBOX-GUEST-MODULES WIREGUARD-MODULE) - replaces=(virtualbox-guest-modules-arch wireguard-arch) - - cd "${srcdir}/${_srcpath}" - - # get kernel version - local _kernver="$(\033[1;0m \033[1;1m$1\033[1;0m" >&2 -} - -error() { - echo -e " \033[1;31m==> ERROR: $1\033[1;0m" >&2 -} - -warning() { - echo -e " \033[1;33m==> WARNING: $1\033[1;0m" >&2 -} - -plain() { - echo "$1" >&2 -} - -# Stop the script at any ecountered error -set -e - -_where=`pwd` -srcdir="$_where" - -source linux*-tkg-config/prepare - -_cpu_opt_patch_link="https://raw.githubusercontent.com/graysky2/kernel_gcc_patch/master/enable_additional_cpu_optimizations_for_gcc_v10.1%2B_kernel_v5.8%2B.patch" - -source customization.cfg - -if [ "$1" != "install" ] && [ "$1" != "config" ] && [ "$1" != "uninstall-help" ]; then - msg2 "Argument not recognised, options are: - - config : shallow clones the linux ${_basekernel}.x git tree into the folder linux-${_basekernel}, then applies on it the extra patches and prepares the .config file - by copying the one from the current linux system in /boot/config-`uname -r` and updates it. - - install : [RPM and DEB based distros only], does the config step, proceeds to compile, then prompts to install - - uninstall-help : [RPM and DEB based distros only], lists the installed kernels in this system, then gives a hint on how to uninstall them manually." - exit 0 -fi - -# Load external configuration file if present. Available variable values will overwrite customization.cfg ones. -if [ -e "$_EXT_CONFIG_PATH" ]; then - msg2 "External configuration file $_EXT_CONFIG_PATH will be used and will override customization.cfg values." 
- source "$_EXT_CONFIG_PATH" -fi - -_misc_adds="false" # We currently don't want this enabled on non-Arch - -if [ "$1" = "install" ] || [ "$1" = "config" ]; then - - if [ -z $_distro ] && [ "$1" = "install" ]; then - while true; do - echo "Which linux distribution are you running ?" - echo "if it's not on the list, chose the closest one to it: Fedora/Suse for RPM, Ubuntu/Debian for DEB" - echo " 1) Debian" - echo " 2) Fedora" - echo " 3) Suse" - echo " 4) Ubuntu" - read -p "[1-4]: " _distro_index - - if [ "$_distro_index" = "1" ]; then - _distro="Debian" - break - elif [ "$_distro_index" = "2" ]; then - _distro="Fedora" - break - elif [ "$_distro_index" = "3" ]; then - _distro="Suse" - break - elif [ "$_distro_index" = "4" ]; then - _distro="Ubuntu" - break - else - echo "Wrong index." - fi - done - fi - - if [[ $1 = "install" && "$_distro" != "Ubuntu" && "$_distro" != "Debian" && "$_distro" != "Fedora" && "$_distro" != "Suse" ]]; then - msg2 "Variable \"_distro\" in \"customization.cfg\" hasn't been set to \"Ubuntu\", \"Debian\", \"Fedora\" or \"Suse\"" - msg2 "This script can only install custom kernels for RPM and DEB based distros, though only those keywords are permitted. Exiting..." 
- exit 0 - fi - - if [ "$_distro" = "Ubuntu" ] || [ "$_distro" = "Debian" ]; then - msg2 "Installing dependencies" - sudo apt install git build-essential kernel-package fakeroot libncurses5-dev libssl-dev ccache bison flex qtbase5-dev -y - elif [ "$_distro" = "Fedora" ]; then - msg2 "Installing dependencies" - sudo dnf install fedpkg fedora-packager rpmdevtools ncurses-devel pesign grubby qt5-devel libXi-devel gcc-c++ git ccache flex bison elfutils-libelf-devel openssl-devel dwarves rpm-build -y - elif [ "$_distro" = "Suse" ]; then - msg2 "Installing dependencies" - sudo zypper install -y rpmdevtools ncurses-devel pesign libXi-devel gcc-c++ git ccache flex bison elfutils libelf-devel openssl-devel dwarves make patch bc rpm-build libqt5-qtbase-common-devel libqt5-qtbase-devel lz4 - fi - - # Force prepare script to avoid Arch specific commands if the user is using `config` - if [ "$1" = "config" ]; then - _distro="" - fi - - if [ -d linux-${_basekernel}.orig ]; then - rm -rf linux-${_basekernel}.orig - fi - - if [ -d linux-${_basekernel} ]; then - msg2 "Reseting files in linux-$_basekernel to their original state and getting latest updates" - cd "$_where"/linux-${_basekernel} - git checkout --force linux-$_basekernel.y - git clean -f -d -x - git pull - msg2 "Done" - cd "$_where" - else - msg2 "Shallow git cloning linux $_basekernel" - git clone --branch linux-$_basekernel.y --single-branch --depth=1 https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git linux-${_basekernel} - msg2 "Done" - fi - - # Define current kernel subversion - if [ -z $_kernel_subver ]; then - cd "$_where"/linux-${_basekernel} - _kernelverstr=`git describe` - _kernel_subver=${_kernelverstr:5} - cd "$_where" - fi - - - # Run init script that is also run in PKGBUILD, it will define some env vars that we will use - _tkg_initscript - - cd "$_where" - msg2 "Downloading Graysky2's CPU optimisations patch" - wget "$_cpu_opt_patch_link" - - # Follow Ubuntu install isntructions in 
https://wiki.ubuntu.com/KernelTeam/GitKernelBuild - - # cd in linux folder, copy Ubuntu's current config file, update with new params - cd "$_where"/linux-${_basekernel} - - msg2 "Copying current kernel's config and running make oldconfig..." - cp /boot/config-`uname -r` .config - if [ "$_distro" = "Debian" ]; then #Help Debian cert problem. - sed -i -e 's#CONFIG_SYSTEM_TRUSTED_KEYS="debian/certs/test-signing-certs.pem"#CONFIG_SYSTEM_TRUSTED_KEYS=""#g' .config - sed -i -e 's#CONFIG_SYSTEM_TRUSTED_KEYS="debian/certs/debian-uefi-certs.pem"#CONFIG_SYSTEM_TRUSTED_KEYS=""#g' .config - fi - yes '' | make oldconfig - msg2 "Done" - - # apply linux-tkg patching script - _tkg_srcprep - - msg2 "Configuration done." -fi - -if [ "$1" = "install" ]; then - - # Use custom compiler paths if defined - if [ -n "${CUSTOM_GCC_PATH}" ]; then - PATH=${CUSTOM_GCC_PATH}/bin:${CUSTOM_GCC_PATH}/lib:${CUSTOM_GCC_PATH}/include:${PATH} - fi - - if [ "$_force_all_threads" = "true" ]; then - _thread_num=`nproc` - else - _thread_num=`expr \`nproc\` / 4` - if [ "$_thread_num" = "0" ]; then - _thread_num=1 - fi - fi - - # ccache - if [ "$_noccache" != "true" ]; then - - if [ "$_distro" = "Ubuntu" ] || [ "$_distro" = "Debian" ]; then - export PATH="/usr/lib/ccache/bin/:$PATH" - elif [ "$_distro" = "Fedora" ] || [ "$_distro" = "Suse" ]; then - export PATH="/usr/lib64/ccache/:$PATH" - fi - - export CCACHE_SLOPPINESS="file_macro,locale,time_macros" - export CCACHE_NOHASHDIR="true" - msg2 'ccache was found and will be used' - - fi - - if [ -z $_kernel_localversion ]; then - _kernel_flavor="tkg-${_cpusched}" - else - _kernel_flavor="tkg-${_kernel_localversion}" - fi - - if [ "$_distro" = "Ubuntu" ] || [ "$_distro" = "Debian" ]; then - - if make -j ${_thread_num} deb-pkg LOCALVERSION=-${_kernel_flavor}; then - msg2 "Building successfully finished!" 
- - cd "$_where" - - # Create DEBS folder if it doesn't exist - mkdir -p DEBS - - # Move rpm files to RPMS folder inside the linux-tkg folder - mv "$_where"/*.deb "$_where"/DEBS/ - - read -p "Do you want to install the new Kernel ? y/[n]: " _install - if [[ $_install =~ [yY] ]] || [ $_install = "yes" ] || [ $_install = "Yes" ]; then - cd "$_where" - _kernelname=$_basekernel.$_kernel_subver-$_kernel_flavor - _headers_deb="linux-headers-${_kernelname}*.deb" - _image_deb="linux-image-${_kernelname}_*.deb" - _kernel_devel_deb="linux-libc-dev_${_kernelname}*.deb" - - cd DEBS - sudo dpkg -i $_headers_deb $_image_deb $_kernel_devel_deb - fi - fi - - elif [[ "$_distro" = "Fedora" || "$_distro" = "Suse" ]]; then - - # Replace dashes with underscores, it seems that it's being done by binrpm-pkg - # Se we can actually refer properly to the rpm files. - _kernel_flavor=${_kernel_flavor//-/_} - - if make -j ${_thread_num} rpm-pkg EXTRAVERSION="_${_kernel_flavor}"; then - msg2 "Building successfully finished!" - - cd "$_where" - - # Create RPMS folder if it doesn't exist - mkdir -p RPMS - - # Move rpm files to RPMS folder inside the linux-tkg folder - mv ~/rpmbuild/RPMS/x86_64/* "$_where"/RPMS/ - - #Clean up the original folder, unneeded and takes a lot of space - rm -rf ~/rpmbuild/ - - read -p "Do you want to install the new Kernel ? 
y/[n]: " _install - if [ "$_install" = "y" ] || [ "$_install" = "Y" ] || [ "$_install" = "yes" ] || [ "$_install" = "Yes" ]; then - - _kernelname=$_basekernel.${_kernel_subver}_$_kernel_flavor - _headers_rpm="kernel-headers-${_kernelname}*.rpm" - _kernel_rpm="kernel-${_kernelname}*.rpm" - _kernel_devel_rpm="kernel-devel-${_kernelname}*.rpm" - - cd RPMS - if [ "$_distro" = "Fedora" ]; then - sudo dnf install $_headers_rpm $_kernel_rpm $_kernel_devel_rpm - elif [ "$_distro" = "Suse" ]; then - msg2 "Some files from 'linux-glibc-devel' will be replaced by files from the custom kernel-hearders package" - msg2 "To revert back to the original kernel headers do 'sudo zypper install -f linux-glibc-devel'" - sudo zypper install --replacefiles --allow-unsigned-rpm $_headers_rpm $_kernel_rpm $_kernel_devel_rpm - fi - - msg2 "Install successful" - fi - fi - fi -fi - -if [ "$1" = "uninstall-help" ]; then - - cd "$_where" - msg2 "List of installed custom tkg kernels: " - - if [ "$_distro" = "Ubuntu" ]; then - dpkg -l "*tkg*" | grep "linux.*tkg" - dpkg -l "*linux-libc-dev*" | grep "linux.*tkg" - msg2 "To uninstall a version, you should remove the linux-image, linux-headers and linux-libc-dev associated to it (if installed), with: " - msg2 " sudo apt remove linux-image-VERSION linux-headers-VERSION linux-libc-dev-VERSION" - msg2 " where VERSION is displayed in the lists above, uninstall only versions that have \"tkg\" in its name" - elif [ "$_distro" = "Fedora" ]; then - dnf list --installed kernel* - msg2 "To uninstall a version, you should remove the kernel, kernel-headers and kernel-devel associated to it (if installed), with: " - msg2 " sudo dnf remove --noautoremove kernel-VERSION kernel-devel-VERSION kernel-headers-VERSION" - msg2 " where VERSION is displayed in the second column" - elif [ "$_distro" = "Suse" ]; then - zypper packages --installed-only | grep "kernel.*tkg" - msg2 "To uninstall a version, you should remove the kernel, kernel-headers and kernel-devel associated 
to it (if installed), with: " - msg2 " sudo zypper remove --no-clean-deps kernel-VERSION kernel-devel-VERSION kernel-headers-VERSION" - msg2 " where VERSION is displayed in the second to last column" - fi - -fi diff --git a/linux59-tkg/linux59-tkg-config/90-cleanup.hook b/linux59-tkg/linux59-tkg-config/90-cleanup.hook deleted file mode 100644 index 99f5221..0000000 --- a/linux59-tkg/linux59-tkg-config/90-cleanup.hook +++ /dev/null @@ -1,14 +0,0 @@ -[Trigger] -Type = File -Operation = Install -Operation = Upgrade -Operation = Remove -Target = usr/lib/modules/*/ -Target = !usr/lib/modules/*/?* - -[Action] -Description = Cleaning up... -When = PostTransaction -Exec = /usr/share/libalpm/scripts/cleanup -NeedsTargets - diff --git a/linux59-tkg/linux59-tkg-config/cleanup b/linux59-tkg/linux59-tkg-config/cleanup deleted file mode 100755 index c00c08d..0000000 --- a/linux59-tkg/linux59-tkg-config/cleanup +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -for _f in /usr/lib/modules/*tkg*; do - if [[ ! -e ${_f}/vmlinuz ]]; then - rm -rf "$_f" - fi -done - -# vim:set ft=sh sw=2 et: - diff --git a/linux59-tkg/linux59-tkg-config/config.x86_64 b/linux59-tkg/linux59-tkg-config/config.x86_64 deleted file mode 100644 index 9524eeb..0000000 --- a/linux59-tkg/linux59-tkg-config/config.x86_64 +++ /dev/null @@ -1,11076 +0,0 @@ -# -# Automatically generated file; DO NOT EDIT. 
-# Linux/x86 5.9.0 Kernel Configuration -# -CONFIG_CC_VERSION_TEXT="gcc (GCC) 10.2.0" -CONFIG_CC_IS_GCC=y -CONFIG_GCC_VERSION=100200 -CONFIG_LD_VERSION=235000000 -CONFIG_CLANG_VERSION=0 -CONFIG_CC_CAN_LINK=y -CONFIG_CC_CAN_LINK_STATIC=y -CONFIG_CC_HAS_ASM_GOTO=y -CONFIG_CC_HAS_ASM_INLINE=y -CONFIG_IRQ_WORK=y -CONFIG_BUILDTIME_TABLE_SORT=y -CONFIG_THREAD_INFO_IN_TASK=y - -# -# General setup -# -CONFIG_INIT_ENV_ARG_LIMIT=32 -# CONFIG_COMPILE_TEST is not set -CONFIG_LOCALVERSION="" -CONFIG_LOCALVERSION_AUTO=y -CONFIG_BUILD_SALT="" -CONFIG_HAVE_KERNEL_GZIP=y -CONFIG_HAVE_KERNEL_BZIP2=y -CONFIG_HAVE_KERNEL_LZMA=y -CONFIG_HAVE_KERNEL_XZ=y -CONFIG_HAVE_KERNEL_LZO=y -CONFIG_HAVE_KERNEL_LZ4=y -CONFIG_HAVE_KERNEL_ZSTD=y -# CONFIG_KERNEL_GZIP is not set -# CONFIG_KERNEL_BZIP2 is not set -# CONFIG_KERNEL_LZMA is not set -CONFIG_KERNEL_XZ=y -# CONFIG_KERNEL_LZO is not set -# CONFIG_KERNEL_LZ4 is not set -# CONFIG_KERNEL_ZSTD is not set -CONFIG_DEFAULT_INIT="" -CONFIG_DEFAULT_HOSTNAME="archlinux" -CONFIG_SWAP=y -CONFIG_SYSVIPC=y -CONFIG_SYSVIPC_SYSCTL=y -CONFIG_POSIX_MQUEUE=y -CONFIG_POSIX_MQUEUE_SYSCTL=y -CONFIG_WATCH_QUEUE=y -CONFIG_CROSS_MEMORY_ATTACH=y -# CONFIG_USELIB is not set -CONFIG_AUDIT=y -CONFIG_HAVE_ARCH_AUDITSYSCALL=y -CONFIG_AUDITSYSCALL=y - -# -# IRQ subsystem -# -CONFIG_GENERIC_IRQ_PROBE=y -CONFIG_GENERIC_IRQ_SHOW=y -CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK=y -CONFIG_GENERIC_PENDING_IRQ=y -CONFIG_GENERIC_IRQ_MIGRATION=y -CONFIG_HARDIRQS_SW_RESEND=y -CONFIG_GENERIC_IRQ_CHIP=y -CONFIG_IRQ_DOMAIN=y -CONFIG_IRQ_SIM=y -CONFIG_IRQ_DOMAIN_HIERARCHY=y -CONFIG_GENERIC_MSI_IRQ=y -CONFIG_GENERIC_MSI_IRQ_DOMAIN=y -CONFIG_IRQ_MSI_IOMMU=y -CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR=y -CONFIG_GENERIC_IRQ_RESERVATION_MODE=y -CONFIG_IRQ_FORCED_THREADING=y -CONFIG_SPARSE_IRQ=y -# CONFIG_GENERIC_IRQ_DEBUGFS is not set -# end of IRQ subsystem - -CONFIG_CLOCKSOURCE_WATCHDOG=y -CONFIG_ARCH_CLOCKSOURCE_INIT=y -CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE=y -CONFIG_GENERIC_TIME_VSYSCALL=y 
-CONFIG_GENERIC_CLOCKEVENTS=y -CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y -CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST=y -CONFIG_GENERIC_CMOS_UPDATE=y -CONFIG_HAVE_POSIX_CPU_TIMERS_TASK_WORK=y -CONFIG_POSIX_CPU_TIMERS_TASK_WORK=y - -# -# Timers subsystem -# -CONFIG_TICK_ONESHOT=y -CONFIG_NO_HZ_COMMON=y -# CONFIG_HZ_PERIODIC is not set -CONFIG_NO_HZ_IDLE=y -# CONFIG_NO_HZ_FULL is not set -CONFIG_NO_HZ=y -CONFIG_HIGH_RES_TIMERS=y -# end of Timers subsystem - -# CONFIG_PREEMPT_NONE is not set -# CONFIG_PREEMPT_VOLUNTARY is not set -CONFIG_PREEMPT=y -CONFIG_PREEMPT_COUNT=y -CONFIG_PREEMPTION=y - -# -# CPU/Task time and stats accounting -# -CONFIG_TICK_CPU_ACCOUNTING=y -# CONFIG_VIRT_CPU_ACCOUNTING_GEN is not set -CONFIG_IRQ_TIME_ACCOUNTING=y -CONFIG_HAVE_SCHED_AVG_IRQ=y -# CONFIG_SCHED_THERMAL_PRESSURE is not set -CONFIG_BSD_PROCESS_ACCT=y -CONFIG_BSD_PROCESS_ACCT_V3=y -CONFIG_TASKSTATS=y -CONFIG_TASK_DELAY_ACCT=y -CONFIG_TASK_XACCT=y -CONFIG_TASK_IO_ACCOUNTING=y -CONFIG_PSI=y -# CONFIG_PSI_DEFAULT_DISABLED is not set -# end of CPU/Task time and stats accounting - -CONFIG_CPU_ISOLATION=y - -# -# RCU Subsystem -# -CONFIG_TREE_RCU=y -CONFIG_PREEMPT_RCU=y -CONFIG_RCU_EXPERT=y -CONFIG_SRCU=y -CONFIG_TREE_SRCU=y -CONFIG_TASKS_RCU_GENERIC=y -CONFIG_TASKS_RCU=y -CONFIG_TASKS_RUDE_RCU=y -CONFIG_RCU_STALL_COMMON=y -CONFIG_RCU_NEED_SEGCBLIST=y -CONFIG_RCU_FANOUT=64 -CONFIG_RCU_FANOUT_LEAF=16 -CONFIG_RCU_FAST_NO_HZ=y -CONFIG_RCU_BOOST=y -CONFIG_RCU_BOOST_DELAY=500 -# CONFIG_RCU_NOCB_CPU is not set -# CONFIG_TASKS_TRACE_RCU_READ_MB is not set -# end of RCU Subsystem - -CONFIG_BUILD_BIN2C=y -CONFIG_IKCONFIG=y -CONFIG_IKCONFIG_PROC=y -# CONFIG_IKHEADERS is not set -CONFIG_LOG_BUF_SHIFT=17 -CONFIG_LOG_CPU_MAX_BUF_SHIFT=12 -CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT=13 -CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y - -# -# Scheduler features -# -CONFIG_UCLAMP_TASK=y -CONFIG_UCLAMP_BUCKETS_COUNT=5 -# end of Scheduler features - -CONFIG_ARCH_SUPPORTS_NUMA_BALANCING=y -CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH=y 
-CONFIG_CC_HAS_INT128=y -CONFIG_ARCH_SUPPORTS_INT128=y -CONFIG_NUMA_BALANCING=y -CONFIG_NUMA_BALANCING_DEFAULT_ENABLED=y -CONFIG_CGROUPS=y -CONFIG_PAGE_COUNTER=y -CONFIG_MEMCG=y -CONFIG_MEMCG_SWAP=y -CONFIG_MEMCG_KMEM=y -CONFIG_BLK_CGROUP=y -CONFIG_CGROUP_WRITEBACK=y -CONFIG_CGROUP_SCHED=y -CONFIG_FAIR_GROUP_SCHED=y -CONFIG_CFS_BANDWIDTH=y -# CONFIG_RT_GROUP_SCHED is not set -CONFIG_UCLAMP_TASK_GROUP=y -CONFIG_CGROUP_PIDS=y -CONFIG_CGROUP_RDMA=y -CONFIG_CGROUP_FREEZER=y -CONFIG_CGROUP_HUGETLB=y -CONFIG_CPUSETS=y -CONFIG_PROC_PID_CPUSET=y -CONFIG_CGROUP_DEVICE=y -CONFIG_CGROUP_CPUACCT=y -CONFIG_CGROUP_PERF=y -CONFIG_CGROUP_BPF=y -# CONFIG_CGROUP_DEBUG is not set -CONFIG_SOCK_CGROUP_DATA=y -CONFIG_NAMESPACES=y -CONFIG_UTS_NS=y -CONFIG_TIME_NS=y -CONFIG_IPC_NS=y -CONFIG_USER_NS=y -CONFIG_USER_NS_UNPRIVILEGED=y -CONFIG_PID_NS=y -CONFIG_NET_NS=y -CONFIG_CHECKPOINT_RESTORE=y -CONFIG_SCHED_AUTOGROUP=y -# CONFIG_SYSFS_DEPRECATED is not set -CONFIG_RELAY=y -CONFIG_BLK_DEV_INITRD=y -CONFIG_INITRAMFS_SOURCE="" -CONFIG_RD_GZIP=y -CONFIG_RD_BZIP2=y -CONFIG_RD_LZMA=y -CONFIG_RD_XZ=y -CONFIG_RD_LZO=y -CONFIG_RD_LZ4=y -CONFIG_RD_ZSTD=y -CONFIG_BOOT_CONFIG=y -CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y -# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set -CONFIG_SYSCTL=y -CONFIG_HAVE_UID16=y -CONFIG_SYSCTL_EXCEPTION_TRACE=y -CONFIG_HAVE_PCSPKR_PLATFORM=y -CONFIG_BPF=y -CONFIG_EXPERT=y -# CONFIG_UID16 is not set -CONFIG_MULTIUSER=y -CONFIG_SGETMASK_SYSCALL=y -# CONFIG_SYSFS_SYSCALL is not set -CONFIG_FHANDLE=y -CONFIG_POSIX_TIMERS=y -CONFIG_PRINTK=y -CONFIG_PRINTK_NMI=y -CONFIG_BUG=y -CONFIG_ELF_CORE=y -CONFIG_PCSPKR_PLATFORM=y -CONFIG_BASE_FULL=y -CONFIG_FUTEX=y -CONFIG_FUTEX_PI=y -CONFIG_EPOLL=y -CONFIG_SIGNALFD=y -CONFIG_TIMERFD=y -CONFIG_EVENTFD=y -CONFIG_SHMEM=y -CONFIG_AIO=y -CONFIG_IO_URING=y -CONFIG_ADVISE_SYSCALLS=y -CONFIG_HAVE_ARCH_USERFAULTFD_WP=y -CONFIG_MEMBARRIER=y -CONFIG_KALLSYMS=y -CONFIG_KALLSYMS_ALL=y -CONFIG_KALLSYMS_ABSOLUTE_PERCPU=y -CONFIG_KALLSYMS_BASE_RELATIVE=y 
-CONFIG_BPF_LSM=y -CONFIG_BPF_SYSCALL=y -CONFIG_ARCH_WANT_DEFAULT_BPF_JIT=y -CONFIG_BPF_JIT_ALWAYS_ON=y -CONFIG_BPF_JIT_DEFAULT_ON=y -CONFIG_USERFAULTFD=y -CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE=y -CONFIG_RSEQ=y -# CONFIG_DEBUG_RSEQ is not set -# CONFIG_EMBEDDED is not set -CONFIG_HAVE_PERF_EVENTS=y -# CONFIG_PC104 is not set - -# -# Kernel Performance Events And Counters -# -CONFIG_PERF_EVENTS=y -# CONFIG_DEBUG_PERF_USE_VMALLOC is not set -# end of Kernel Performance Events And Counters - -CONFIG_VM_EVENT_COUNTERS=y -CONFIG_SLUB_DEBUG=y -# CONFIG_SLUB_MEMCG_SYSFS_ON is not set -# CONFIG_COMPAT_BRK is not set -# CONFIG_SLAB is not set -CONFIG_SLUB=y -# CONFIG_SLOB is not set -CONFIG_SLAB_MERGE_DEFAULT=y -CONFIG_SLAB_FREELIST_RANDOM=y -CONFIG_SLAB_FREELIST_HARDENED=y -CONFIG_SHUFFLE_PAGE_ALLOCATOR=y -CONFIG_SLUB_CPU_PARTIAL=y -CONFIG_SYSTEM_DATA_VERIFICATION=y -CONFIG_PROFILING=y -CONFIG_TRACEPOINTS=y -# end of General setup - -CONFIG_64BIT=y -CONFIG_X86_64=y -CONFIG_X86=y -CONFIG_INSTRUCTION_DECODER=y -CONFIG_OUTPUT_FORMAT="elf64-x86-64" -CONFIG_LOCKDEP_SUPPORT=y -CONFIG_STACKTRACE_SUPPORT=y -CONFIG_MMU=y -CONFIG_ARCH_MMAP_RND_BITS_MIN=28 -CONFIG_ARCH_MMAP_RND_BITS_MAX=32 -CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN=8 -CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX=16 -CONFIG_GENERIC_ISA_DMA=y -CONFIG_GENERIC_BUG=y -CONFIG_GENERIC_BUG_RELATIVE_POINTERS=y -CONFIG_ARCH_MAY_HAVE_PC_FDC=y -CONFIG_GENERIC_CALIBRATE_DELAY=y -CONFIG_ARCH_HAS_CPU_RELAX=y -CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y -CONFIG_ARCH_HAS_FILTER_PGPROT=y -CONFIG_HAVE_SETUP_PER_CPU_AREA=y -CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y -CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y -CONFIG_ARCH_HIBERNATION_POSSIBLE=y -CONFIG_ARCH_SUSPEND_POSSIBLE=y -CONFIG_ARCH_WANT_GENERAL_HUGETLB=y -CONFIG_ZONE_DMA32=y -CONFIG_AUDIT_ARCH=y -CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y -CONFIG_HAVE_INTEL_TXT=y -CONFIG_X86_64_SMP=y -CONFIG_ARCH_SUPPORTS_UPROBES=y -CONFIG_FIX_EARLYCON_MEM=y -CONFIG_DYNAMIC_PHYSICAL_MASK=y -CONFIG_PGTABLE_LEVELS=5 
-CONFIG_CC_HAS_SANE_STACKPROTECTOR=y - -# -# Processor type and features -# -CONFIG_ZONE_DMA=y -CONFIG_SMP=y -CONFIG_X86_FEATURE_NAMES=y -CONFIG_X86_X2APIC=y -CONFIG_X86_MPPARSE=y -# CONFIG_GOLDFISH is not set -CONFIG_RETPOLINE=y -CONFIG_X86_CPU_RESCTRL=y -# CONFIG_X86_EXTENDED_PLATFORM is not set -CONFIG_X86_INTEL_LPSS=y -CONFIG_X86_AMD_PLATFORM_DEVICE=y -CONFIG_IOSF_MBI=y -# CONFIG_IOSF_MBI_DEBUG is not set -CONFIG_X86_SUPPORTS_MEMORY_FAILURE=y -CONFIG_SCHED_OMIT_FRAME_POINTER=y -CONFIG_HYPERVISOR_GUEST=y -CONFIG_PARAVIRT=y -CONFIG_PARAVIRT_XXL=y -# CONFIG_PARAVIRT_DEBUG is not set -CONFIG_PARAVIRT_SPINLOCKS=y -CONFIG_X86_HV_CALLBACK_VECTOR=y -CONFIG_XEN=y -CONFIG_XEN_PV=y -CONFIG_XEN_PV_SMP=y -CONFIG_XEN_DOM0=y -CONFIG_XEN_PVHVM=y -CONFIG_XEN_PVHVM_SMP=y -CONFIG_XEN_512GB=y -CONFIG_XEN_SAVE_RESTORE=y -# CONFIG_XEN_DEBUG_FS is not set -CONFIG_XEN_PVH=y -CONFIG_KVM_GUEST=y -CONFIG_ARCH_CPUIDLE_HALTPOLL=y -CONFIG_PVH=y -CONFIG_PARAVIRT_TIME_ACCOUNTING=y -CONFIG_PARAVIRT_CLOCK=y -CONFIG_JAILHOUSE_GUEST=y -CONFIG_ACRN_GUEST=y -# CONFIG_MK8 is not set -# CONFIG_MPSC is not set -# CONFIG_MCORE2 is not set -# CONFIG_MATOM is not set -CONFIG_GENERIC_CPU=y -CONFIG_X86_INTERNODE_CACHE_SHIFT=6 -CONFIG_X86_L1_CACHE_SHIFT=6 -CONFIG_X86_TSC=y -CONFIG_X86_CMPXCHG64=y -CONFIG_X86_CMOV=y -CONFIG_X86_MINIMUM_CPU_FAMILY=64 -CONFIG_X86_DEBUGCTLMSR=y -CONFIG_IA32_FEAT_CTL=y -CONFIG_X86_VMX_FEATURE_NAMES=y -CONFIG_PROCESSOR_SELECT=y -CONFIG_CPU_SUP_INTEL=y -CONFIG_CPU_SUP_AMD=y -CONFIG_CPU_SUP_HYGON=y -CONFIG_CPU_SUP_CENTAUR=y -CONFIG_CPU_SUP_ZHAOXIN=y -CONFIG_HPET_TIMER=y -CONFIG_HPET_EMULATE_RTC=y -CONFIG_DMI=y -CONFIG_GART_IOMMU=y -# CONFIG_MAXSMP is not set -CONFIG_NR_CPUS_RANGE_BEGIN=2 -CONFIG_NR_CPUS_RANGE_END=512 -CONFIG_NR_CPUS_DEFAULT=64 -CONFIG_NR_CPUS=320 -CONFIG_SCHED_SMT=y -CONFIG_SCHED_MC=y -CONFIG_SCHED_MC_PRIO=y -CONFIG_X86_LOCAL_APIC=y -CONFIG_X86_IO_APIC=y -CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y -CONFIG_X86_MCE=y -# CONFIG_X86_MCELOG_LEGACY is not set 
-CONFIG_X86_MCE_INTEL=y -CONFIG_X86_MCE_AMD=y -CONFIG_X86_MCE_THRESHOLD=y -CONFIG_X86_MCE_INJECT=m -CONFIG_X86_THERMAL_VECTOR=y - -# -# Performance monitoring -# -CONFIG_PERF_EVENTS_INTEL_UNCORE=m -CONFIG_PERF_EVENTS_INTEL_RAPL=m -CONFIG_PERF_EVENTS_INTEL_CSTATE=m -CONFIG_PERF_EVENTS_AMD_POWER=m -# end of Performance monitoring - -CONFIG_X86_16BIT=y -CONFIG_X86_ESPFIX64=y -CONFIG_X86_VSYSCALL_EMULATION=y -CONFIG_X86_IOPL_IOPERM=y -CONFIG_I8K=m -CONFIG_MICROCODE=y -CONFIG_MICROCODE_INTEL=y -CONFIG_MICROCODE_AMD=y -CONFIG_MICROCODE_OLD_INTERFACE=y -CONFIG_X86_MSR=m -CONFIG_X86_CPUID=m -CONFIG_X86_5LEVEL=y -CONFIG_X86_DIRECT_GBPAGES=y -# CONFIG_X86_CPA_STATISTICS is not set -CONFIG_AMD_MEM_ENCRYPT=y -# CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT is not set -CONFIG_NUMA=y -CONFIG_AMD_NUMA=y -CONFIG_X86_64_ACPI_NUMA=y -# CONFIG_NUMA_EMU is not set -CONFIG_NODES_SHIFT=5 -CONFIG_ARCH_SPARSEMEM_ENABLE=y -CONFIG_ARCH_SPARSEMEM_DEFAULT=y -CONFIG_ARCH_SELECT_MEMORY_MODEL=y -CONFIG_ARCH_MEMORY_PROBE=y -CONFIG_ARCH_PROC_KCORE_TEXT=y -CONFIG_ILLEGAL_POINTER_VALUE=0xdead000000000000 -CONFIG_X86_PMEM_LEGACY_DEVICE=y -CONFIG_X86_PMEM_LEGACY=m -CONFIG_X86_CHECK_BIOS_CORRUPTION=y -CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y -CONFIG_X86_RESERVE_LOW=64 -CONFIG_MTRR=y -CONFIG_MTRR_SANITIZER=y -CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT=1 -CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT=0 -CONFIG_X86_PAT=y -CONFIG_ARCH_USES_PG_UNCACHED=y -CONFIG_ARCH_RANDOM=y -CONFIG_X86_SMAP=y -CONFIG_X86_UMIP=y -CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS=y -# CONFIG_X86_INTEL_TSX_MODE_OFF is not set -# CONFIG_X86_INTEL_TSX_MODE_ON is not set -CONFIG_X86_INTEL_TSX_MODE_AUTO=y -CONFIG_EFI=y -CONFIG_EFI_STUB=y -CONFIG_EFI_MIXED=y -CONFIG_SECCOMP=y -# CONFIG_HZ_100 is not set -# CONFIG_HZ_250 is not set -CONFIG_HZ_300=y -# CONFIG_HZ_1000 is not set -CONFIG_HZ=300 -CONFIG_SCHED_HRTICK=y -CONFIG_KEXEC=y -CONFIG_KEXEC_FILE=y -CONFIG_ARCH_HAS_KEXEC_PURGATORY=y -# CONFIG_KEXEC_SIG is not set -CONFIG_CRASH_DUMP=y 
-CONFIG_KEXEC_JUMP=y -CONFIG_PHYSICAL_START=0x1000000 -CONFIG_RELOCATABLE=y -CONFIG_RANDOMIZE_BASE=y -CONFIG_X86_NEED_RELOCS=y -CONFIG_PHYSICAL_ALIGN=0x200000 -CONFIG_DYNAMIC_MEMORY_LAYOUT=y -CONFIG_RANDOMIZE_MEMORY=y -CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING=0x1 -CONFIG_HOTPLUG_CPU=y -# CONFIG_BOOTPARAM_HOTPLUG_CPU0 is not set -# CONFIG_DEBUG_HOTPLUG_CPU0 is not set -# CONFIG_COMPAT_VDSO is not set -# CONFIG_LEGACY_VSYSCALL_EMULATE is not set -CONFIG_LEGACY_VSYSCALL_XONLY=y -# CONFIG_LEGACY_VSYSCALL_NONE is not set -# CONFIG_CMDLINE_BOOL is not set -CONFIG_MODIFY_LDT_SYSCALL=y -CONFIG_HAVE_LIVEPATCH=y -# CONFIG_LIVEPATCH is not set -# end of Processor type and features - -CONFIG_ARCH_HAS_ADD_PAGES=y -CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y -CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE=y -CONFIG_USE_PERCPU_NUMA_NODE_ID=y -CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK=y -CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION=y -CONFIG_ARCH_ENABLE_THP_MIGRATION=y - -# -# Power management and ACPI options -# -CONFIG_ARCH_HIBERNATION_HEADER=y -CONFIG_SUSPEND=y -CONFIG_SUSPEND_FREEZER=y -# CONFIG_SUSPEND_SKIP_SYNC is not set -CONFIG_HIBERNATE_CALLBACKS=y -CONFIG_HIBERNATION=y -CONFIG_HIBERNATION_SNAPSHOT_DEV=y -CONFIG_PM_STD_PARTITION="" -CONFIG_PM_SLEEP=y -CONFIG_PM_SLEEP_SMP=y -CONFIG_PM_AUTOSLEEP=y -CONFIG_PM_WAKELOCKS=y -CONFIG_PM_WAKELOCKS_LIMIT=100 -CONFIG_PM_WAKELOCKS_GC=y -CONFIG_PM=y -CONFIG_PM_DEBUG=y -CONFIG_PM_ADVANCED_DEBUG=y -# CONFIG_PM_TEST_SUSPEND is not set -CONFIG_PM_SLEEP_DEBUG=y -# CONFIG_DPM_WATCHDOG is not set -CONFIG_PM_TRACE=y -CONFIG_PM_TRACE_RTC=y -CONFIG_PM_CLK=y -CONFIG_PM_GENERIC_DOMAINS=y -CONFIG_WQ_POWER_EFFICIENT_DEFAULT=y -CONFIG_PM_GENERIC_DOMAINS_SLEEP=y -CONFIG_PM_GENERIC_DOMAINS_OF=y -CONFIG_ENERGY_MODEL=y -CONFIG_ARCH_SUPPORTS_ACPI=y -CONFIG_ACPI=y -CONFIG_ACPI_LEGACY_TABLES_LOOKUP=y -CONFIG_ARCH_MIGHT_HAVE_ACPI_PDC=y -CONFIG_ACPI_SYSTEM_POWER_STATES_SUPPORT=y -# CONFIG_ACPI_DEBUGGER is not set -CONFIG_ACPI_SPCR_TABLE=y -CONFIG_ACPI_LPIT=y -CONFIG_ACPI_SLEEP=y 
-CONFIG_ACPI_REV_OVERRIDE_POSSIBLE=y -CONFIG_ACPI_EC_DEBUGFS=y -CONFIG_ACPI_AC=m -CONFIG_ACPI_BATTERY=m -CONFIG_ACPI_BUTTON=y -CONFIG_ACPI_VIDEO=y -CONFIG_ACPI_FAN=y -CONFIG_ACPI_TAD=m -CONFIG_ACPI_DOCK=y -CONFIG_ACPI_CPU_FREQ_PSS=y -CONFIG_ACPI_PROCESSOR_CSTATE=y -CONFIG_ACPI_PROCESSOR_IDLE=y -CONFIG_ACPI_CPPC_LIB=y -CONFIG_ACPI_PROCESSOR=y -CONFIG_ACPI_IPMI=m -CONFIG_ACPI_HOTPLUG_CPU=y -CONFIG_ACPI_PROCESSOR_AGGREGATOR=y -CONFIG_ACPI_THERMAL=y -CONFIG_ARCH_HAS_ACPI_TABLE_UPGRADE=y -CONFIG_ACPI_TABLE_UPGRADE=y -CONFIG_ACPI_DEBUG=y -CONFIG_ACPI_PCI_SLOT=y -CONFIG_ACPI_CONTAINER=y -CONFIG_ACPI_HOTPLUG_MEMORY=y -CONFIG_ACPI_HOTPLUG_IOAPIC=y -CONFIG_ACPI_SBS=m -CONFIG_ACPI_HED=y -CONFIG_ACPI_CUSTOM_METHOD=m -CONFIG_ACPI_BGRT=y -# CONFIG_ACPI_REDUCED_HARDWARE_ONLY is not set -CONFIG_ACPI_NFIT=m -# CONFIG_NFIT_SECURITY_DEBUG is not set -CONFIG_ACPI_NUMA=y -CONFIG_ACPI_HMAT=y -CONFIG_HAVE_ACPI_APEI=y -CONFIG_HAVE_ACPI_APEI_NMI=y -CONFIG_ACPI_APEI=y -CONFIG_ACPI_APEI_GHES=y -CONFIG_ACPI_APEI_PCIEAER=y -CONFIG_ACPI_APEI_MEMORY_FAILURE=y -CONFIG_ACPI_APEI_EINJ=m -CONFIG_ACPI_APEI_ERST_DEBUG=m -CONFIG_DPTF_POWER=m -CONFIG_ACPI_WATCHDOG=y -CONFIG_ACPI_EXTLOG=m -CONFIG_ACPI_ADXL=y -CONFIG_PMIC_OPREGION=y -CONFIG_BYTCRC_PMIC_OPREGION=y -CONFIG_CHTCRC_PMIC_OPREGION=y -CONFIG_XPOWER_PMIC_OPREGION=y -CONFIG_BXT_WC_PMIC_OPREGION=y -CONFIG_CHT_WC_PMIC_OPREGION=y -CONFIG_CHT_DC_TI_PMIC_OPREGION=y -CONFIG_ACPI_CONFIGFS=m -CONFIG_TPS68470_PMIC_OPREGION=y -CONFIG_X86_PM_TIMER=y -CONFIG_SFI=y - -# -# CPU Frequency scaling -# -CONFIG_CPU_FREQ=y -CONFIG_CPU_FREQ_GOV_ATTR_SET=y -CONFIG_CPU_FREQ_GOV_COMMON=y -CONFIG_CPU_FREQ_STAT=y -# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set -# CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set -# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set -# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set -# CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set -CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL=y -CONFIG_CPU_FREQ_GOV_PERFORMANCE=y 
-CONFIG_CPU_FREQ_GOV_POWERSAVE=m -CONFIG_CPU_FREQ_GOV_USERSPACE=m -CONFIG_CPU_FREQ_GOV_ONDEMAND=m -CONFIG_CPU_FREQ_GOV_CONSERVATIVE=m -CONFIG_CPU_FREQ_GOV_SCHEDUTIL=y - -# -# CPU frequency scaling drivers -# -CONFIG_CPUFREQ_DT=m -CONFIG_CPUFREQ_DT_PLATDEV=y -CONFIG_X86_INTEL_PSTATE=y -CONFIG_X86_PCC_CPUFREQ=m -CONFIG_X86_ACPI_CPUFREQ=m -CONFIG_X86_ACPI_CPUFREQ_CPB=y -CONFIG_X86_POWERNOW_K8=m -CONFIG_X86_AMD_FREQ_SENSITIVITY=m -# CONFIG_X86_SPEEDSTEP_CENTRINO is not set -CONFIG_X86_P4_CLOCKMOD=m - -# -# shared options -# -CONFIG_X86_SPEEDSTEP_LIB=m -# end of CPU Frequency scaling - -# -# CPU Idle -# -CONFIG_CPU_IDLE=y -CONFIG_CPU_IDLE_GOV_LADDER=y -CONFIG_CPU_IDLE_GOV_MENU=y -CONFIG_CPU_IDLE_GOV_TEO=y -CONFIG_CPU_IDLE_GOV_HALTPOLL=y -CONFIG_HALTPOLL_CPUIDLE=m -# end of CPU Idle - -CONFIG_INTEL_IDLE=y -# end of Power management and ACPI options - -# -# Bus options (PCI etc.) -# -CONFIG_PCI_DIRECT=y -CONFIG_PCI_MMCONFIG=y -CONFIG_PCI_XEN=y -CONFIG_MMCONF_FAM10H=y -# CONFIG_PCI_CNB20LE_QUIRK is not set -# CONFIG_ISA_BUS is not set -CONFIG_ISA_DMA_API=y -CONFIG_AMD_NB=y -# CONFIG_X86_SYSFB is not set -# end of Bus options (PCI etc.) 
- -# -# Binary Emulations -# -CONFIG_IA32_EMULATION=y -# CONFIG_X86_X32 is not set -CONFIG_COMPAT_32=y -CONFIG_COMPAT=y -CONFIG_COMPAT_FOR_U64_ALIGNMENT=y -CONFIG_SYSVIPC_COMPAT=y -# end of Binary Emulations - -# -# Firmware Drivers -# -CONFIG_EDD=m -# CONFIG_EDD_OFF is not set -CONFIG_FIRMWARE_MEMMAP=y -CONFIG_DMIID=y -CONFIG_DMI_SYSFS=m -CONFIG_DMI_SCAN_MACHINE_NON_EFI_FALLBACK=y -CONFIG_ISCSI_IBFT_FIND=y -CONFIG_ISCSI_IBFT=m -CONFIG_FW_CFG_SYSFS=m -# CONFIG_FW_CFG_SYSFS_CMDLINE is not set -CONFIG_GOOGLE_FIRMWARE=y -# CONFIG_GOOGLE_SMI is not set -CONFIG_GOOGLE_COREBOOT_TABLE=m -CONFIG_GOOGLE_MEMCONSOLE=m -# CONFIG_GOOGLE_MEMCONSOLE_X86_LEGACY is not set -CONFIG_GOOGLE_FRAMEBUFFER_COREBOOT=m -CONFIG_GOOGLE_MEMCONSOLE_COREBOOT=m -CONFIG_GOOGLE_VPD=m - -# -# EFI (Extensible Firmware Interface) Support -# -# CONFIG_EFI_VARS is not set -CONFIG_EFI_ESRT=y -CONFIG_EFI_RUNTIME_MAP=y -# CONFIG_EFI_FAKE_MEMMAP is not set -CONFIG_EFI_SOFT_RESERVE=y -CONFIG_EFI_RUNTIME_WRAPPERS=y -CONFIG_EFI_GENERIC_STUB_INITRD_CMDLINE_LOADER=y -CONFIG_EFI_CAPSULE_LOADER=m -# CONFIG_EFI_TEST is not set -CONFIG_APPLE_PROPERTIES=y -# CONFIG_RESET_ATTACK_MITIGATION is not set -CONFIG_EFI_RCI2_TABLE=y -# CONFIG_EFI_DISABLE_PCI_DMA is not set -# end of EFI (Extensible Firmware Interface) Support - -CONFIG_EFI_EMBEDDED_FIRMWARE=y -CONFIG_UEFI_CPER=y -CONFIG_UEFI_CPER_X86=y -CONFIG_EFI_DEV_PATH_PARSER=y -CONFIG_EFI_EARLYCON=y - -# -# Tegra firmware driver -# -# end of Tegra firmware driver -# end of Firmware Drivers - -CONFIG_HAVE_KVM=y -CONFIG_HAVE_KVM_IRQCHIP=y -CONFIG_HAVE_KVM_IRQFD=y -CONFIG_HAVE_KVM_IRQ_ROUTING=y -CONFIG_HAVE_KVM_EVENTFD=y -CONFIG_KVM_MMIO=y -CONFIG_KVM_ASYNC_PF=y -CONFIG_HAVE_KVM_MSI=y -CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT=y -CONFIG_KVM_VFIO=y -CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT=y -CONFIG_KVM_COMPAT=y -CONFIG_HAVE_KVM_IRQ_BYPASS=y -CONFIG_HAVE_KVM_NO_POLL=y -CONFIG_KVM_XFER_TO_GUEST_WORK=y -CONFIG_VIRTUALIZATION=y -CONFIG_KVM=m -CONFIG_KVM_WERROR=y -CONFIG_KVM_INTEL=m 
-CONFIG_KVM_AMD=m -CONFIG_KVM_AMD_SEV=y -CONFIG_KVM_MMU_AUDIT=y -CONFIG_AS_AVX512=y -CONFIG_AS_SHA1_NI=y -CONFIG_AS_SHA256_NI=y -CONFIG_AS_TPAUSE=y - -# -# General architecture-dependent options -# -CONFIG_CRASH_CORE=y -CONFIG_KEXEC_CORE=y -CONFIG_HOTPLUG_SMT=y -CONFIG_GENERIC_ENTRY=y -CONFIG_OPROFILE=m -# CONFIG_OPROFILE_EVENT_MULTIPLEX is not set -CONFIG_HAVE_OPROFILE=y -CONFIG_OPROFILE_NMI_TIMER=y -CONFIG_KPROBES=y -CONFIG_JUMP_LABEL=y -# CONFIG_STATIC_KEYS_SELFTEST is not set -CONFIG_OPTPROBES=y -CONFIG_KPROBES_ON_FTRACE=y -CONFIG_UPROBES=y -CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y -CONFIG_ARCH_USE_BUILTIN_BSWAP=y -CONFIG_KRETPROBES=y -CONFIG_USER_RETURN_NOTIFIER=y -CONFIG_HAVE_IOREMAP_PROT=y -CONFIG_HAVE_KPROBES=y -CONFIG_HAVE_KRETPROBES=y -CONFIG_HAVE_OPTPROBES=y -CONFIG_HAVE_KPROBES_ON_FTRACE=y -CONFIG_HAVE_FUNCTION_ERROR_INJECTION=y -CONFIG_HAVE_NMI=y -CONFIG_HAVE_ARCH_TRACEHOOK=y -CONFIG_HAVE_DMA_CONTIGUOUS=y -CONFIG_GENERIC_SMP_IDLE_THREAD=y -CONFIG_ARCH_HAS_FORTIFY_SOURCE=y -CONFIG_ARCH_HAS_SET_MEMORY=y -CONFIG_ARCH_HAS_SET_DIRECT_MAP=y -CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST=y -CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT=y -CONFIG_HAVE_ASM_MODVERSIONS=y -CONFIG_HAVE_REGS_AND_STACK_ACCESS_API=y -CONFIG_HAVE_RSEQ=y -CONFIG_HAVE_FUNCTION_ARG_ACCESS_API=y -CONFIG_HAVE_HW_BREAKPOINT=y -CONFIG_HAVE_MIXED_BREAKPOINTS_REGS=y -CONFIG_HAVE_USER_RETURN_NOTIFIER=y -CONFIG_HAVE_PERF_EVENTS_NMI=y -CONFIG_HAVE_HARDLOCKUP_DETECTOR_PERF=y -CONFIG_HAVE_PERF_REGS=y -CONFIG_HAVE_PERF_USER_STACK_DUMP=y -CONFIG_HAVE_ARCH_JUMP_LABEL=y -CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE=y -CONFIG_MMU_GATHER_TABLE_FREE=y -CONFIG_MMU_GATHER_RCU_TABLE_FREE=y -CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG=y -CONFIG_HAVE_ALIGNED_STRUCT_PAGE=y -CONFIG_HAVE_CMPXCHG_LOCAL=y -CONFIG_HAVE_CMPXCHG_DOUBLE=y -CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION=y -CONFIG_ARCH_WANT_OLD_COMPAT_IPC=y -CONFIG_HAVE_ARCH_SECCOMP_FILTER=y -CONFIG_SECCOMP_FILTER=y -CONFIG_HAVE_ARCH_STACKLEAK=y -CONFIG_HAVE_STACKPROTECTOR=y 
-CONFIG_STACKPROTECTOR=y -CONFIG_STACKPROTECTOR_STRONG=y -CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES=y -CONFIG_HAVE_CONTEXT_TRACKING=y -CONFIG_HAVE_VIRT_CPU_ACCOUNTING_GEN=y -CONFIG_HAVE_IRQ_TIME_ACCOUNTING=y -CONFIG_HAVE_MOVE_PMD=y -CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE=y -CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD=y -CONFIG_HAVE_ARCH_HUGE_VMAP=y -CONFIG_ARCH_WANT_HUGE_PMD_SHARE=y -CONFIG_HAVE_ARCH_SOFT_DIRTY=y -CONFIG_HAVE_MOD_ARCH_SPECIFIC=y -CONFIG_MODULES_USE_ELF_RELA=y -CONFIG_ARCH_HAS_ELF_RANDOMIZE=y -CONFIG_HAVE_ARCH_MMAP_RND_BITS=y -CONFIG_HAVE_EXIT_THREAD=y -CONFIG_ARCH_MMAP_RND_BITS=28 -CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS=y -CONFIG_ARCH_MMAP_RND_COMPAT_BITS=8 -CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES=y -CONFIG_HAVE_STACK_VALIDATION=y -CONFIG_HAVE_RELIABLE_STACKTRACE=y -CONFIG_ISA_BUS_API=y -CONFIG_OLD_SIGSUSPEND3=y -CONFIG_COMPAT_OLD_SIGACTION=y -CONFIG_COMPAT_32BIT_TIME=y -CONFIG_HAVE_ARCH_VMAP_STACK=y -CONFIG_VMAP_STACK=y -CONFIG_ARCH_HAS_STRICT_KERNEL_RWX=y -CONFIG_STRICT_KERNEL_RWX=y -CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y -CONFIG_STRICT_MODULE_RWX=y -CONFIG_HAVE_ARCH_PREL32_RELOCATIONS=y -CONFIG_ARCH_USE_MEMREMAP_PROT=y -CONFIG_LOCK_EVENT_COUNTS=y -CONFIG_ARCH_HAS_MEM_ENCRYPT=y - -# -# GCOV-based kernel profiling -# -# CONFIG_GCOV_KERNEL is not set -CONFIG_ARCH_HAS_GCOV_PROFILE_ALL=y -# end of GCOV-based kernel profiling - -CONFIG_HAVE_GCC_PLUGINS=y -CONFIG_GCC_PLUGINS=y -# CONFIG_GCC_PLUGIN_CYC_COMPLEXITY is not set -# CONFIG_GCC_PLUGIN_LATENT_ENTROPY is not set -# CONFIG_GCC_PLUGIN_RANDSTRUCT is not set -# end of General architecture-dependent options - -CONFIG_RT_MUTEXES=y -CONFIG_BASE_SMALL=0 -CONFIG_MODULE_SIG_FORMAT=y -CONFIG_MODULES=y -CONFIG_MODULE_FORCE_LOAD=y -CONFIG_MODULE_UNLOAD=y -CONFIG_MODULE_FORCE_UNLOAD=y -# CONFIG_MODVERSIONS is not set -CONFIG_MODULE_SRCVERSION_ALL=y -CONFIG_MODULE_SIG=y -# CONFIG_MODULE_SIG_FORCE is not set -CONFIG_MODULE_SIG_ALL=y -# CONFIG_MODULE_SIG_SHA1 is not set -# CONFIG_MODULE_SIG_SHA224 is not set -# 
CONFIG_MODULE_SIG_SHA256 is not set -# CONFIG_MODULE_SIG_SHA384 is not set -CONFIG_MODULE_SIG_SHA512=y -CONFIG_MODULE_SIG_HASH="sha512" -CONFIG_MODULE_COMPRESS=y -# CONFIG_MODULE_COMPRESS_GZIP is not set -CONFIG_MODULE_COMPRESS_XZ=y -CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS=y -CONFIG_UNUSED_SYMBOLS=y -CONFIG_MODULES_TREE_LOOKUP=y -CONFIG_BLOCK=y -CONFIG_BLK_RQ_ALLOC_TIME=y -CONFIG_BLK_SCSI_REQUEST=y -CONFIG_BLK_CGROUP_RWSTAT=y -CONFIG_BLK_DEV_BSG=y -CONFIG_BLK_DEV_BSGLIB=y -CONFIG_BLK_DEV_INTEGRITY=y -CONFIG_BLK_DEV_INTEGRITY_T10=y -CONFIG_BLK_DEV_ZONED=y -CONFIG_BLK_DEV_THROTTLING=y -CONFIG_BLK_DEV_THROTTLING_LOW=y -# CONFIG_BLK_CMDLINE_PARSER is not set -CONFIG_BLK_WBT=y -CONFIG_BLK_CGROUP_IOLATENCY=y -CONFIG_BLK_CGROUP_IOCOST=y -CONFIG_BLK_WBT_MQ=y -CONFIG_BLK_DEBUG_FS=y -CONFIG_BLK_DEBUG_FS_ZONED=y -CONFIG_BLK_SED_OPAL=y -CONFIG_BLK_INLINE_ENCRYPTION=y -CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK=y - -# -# Partition Types -# -CONFIG_PARTITION_ADVANCED=y -# CONFIG_ACORN_PARTITION is not set -CONFIG_AIX_PARTITION=y -# CONFIG_OSF_PARTITION is not set -# CONFIG_AMIGA_PARTITION is not set -# CONFIG_ATARI_PARTITION is not set -CONFIG_MAC_PARTITION=y -CONFIG_MSDOS_PARTITION=y -CONFIG_BSD_DISKLABEL=y -CONFIG_MINIX_SUBPARTITION=y -CONFIG_SOLARIS_X86_PARTITION=y -# CONFIG_UNIXWARE_DISKLABEL is not set -CONFIG_LDM_PARTITION=y -# CONFIG_LDM_DEBUG is not set -# CONFIG_SGI_PARTITION is not set -# CONFIG_ULTRIX_PARTITION is not set -# CONFIG_SUN_PARTITION is not set -CONFIG_KARMA_PARTITION=y -CONFIG_EFI_PARTITION=y -# CONFIG_SYSV68_PARTITION is not set -# CONFIG_CMDLINE_PARTITION is not set -# end of Partition Types - -CONFIG_BLOCK_COMPAT=y -CONFIG_BLK_MQ_PCI=y -CONFIG_BLK_MQ_VIRTIO=y -CONFIG_BLK_MQ_RDMA=y -CONFIG_BLK_PM=y - -# -# IO Schedulers -# -CONFIG_MQ_IOSCHED_DEADLINE=y -CONFIG_MQ_IOSCHED_KYBER=y -CONFIG_IOSCHED_BFQ=y -CONFIG_BFQ_GROUP_IOSCHED=y -# CONFIG_BFQ_CGROUP_DEBUG is not set -# end of IO Schedulers - -CONFIG_PREEMPT_NOTIFIERS=y -CONFIG_PADATA=y -CONFIG_ASN1=y 
-CONFIG_UNINLINE_SPIN_UNLOCK=y -CONFIG_ARCH_SUPPORTS_ATOMIC_RMW=y -CONFIG_MUTEX_SPIN_ON_OWNER=y -CONFIG_RWSEM_SPIN_ON_OWNER=y -CONFIG_LOCK_SPIN_ON_OWNER=y -CONFIG_ARCH_USE_QUEUED_SPINLOCKS=y -CONFIG_QUEUED_SPINLOCKS=y -CONFIG_ARCH_USE_QUEUED_RWLOCKS=y -CONFIG_QUEUED_RWLOCKS=y -CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE=y -CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE=y -CONFIG_ARCH_HAS_SYSCALL_WRAPPER=y -CONFIG_FREEZER=y - -# -# Executable file formats -# -CONFIG_BINFMT_ELF=y -CONFIG_COMPAT_BINFMT_ELF=y -CONFIG_ELFCORE=y -CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y -CONFIG_BINFMT_SCRIPT=y -CONFIG_BINFMT_MISC=y -CONFIG_COREDUMP=y -# end of Executable file formats - -# -# Memory Management options -# -CONFIG_SELECT_MEMORY_MODEL=y -CONFIG_SPARSEMEM_MANUAL=y -CONFIG_SPARSEMEM=y -CONFIG_NEED_MULTIPLE_NODES=y -CONFIG_SPARSEMEM_EXTREME=y -CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y -CONFIG_SPARSEMEM_VMEMMAP=y -CONFIG_HAVE_FAST_GUP=y -CONFIG_NUMA_KEEP_MEMINFO=y -CONFIG_MEMORY_ISOLATION=y -CONFIG_HAVE_BOOTMEM_INFO_NODE=y -CONFIG_MEMORY_HOTPLUG=y -CONFIG_MEMORY_HOTPLUG_SPARSE=y -CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y -CONFIG_MEMORY_HOTREMOVE=y -CONFIG_SPLIT_PTLOCK_CPUS=4 -CONFIG_MEMORY_BALLOON=y -CONFIG_BALLOON_COMPACTION=y -CONFIG_COMPACTION=y -CONFIG_PAGE_REPORTING=y -CONFIG_MIGRATION=y -CONFIG_CONTIG_ALLOC=y -CONFIG_PHYS_ADDR_T_64BIT=y -CONFIG_BOUNCE=y -CONFIG_VIRT_TO_BUS=y -CONFIG_MMU_NOTIFIER=y -CONFIG_KSM=y -CONFIG_DEFAULT_MMAP_MIN_ADDR=65536 -CONFIG_ARCH_SUPPORTS_MEMORY_FAILURE=y -CONFIG_MEMORY_FAILURE=y -CONFIG_HWPOISON_INJECT=m -CONFIG_TRANSPARENT_HUGEPAGE=y -# CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS is not set -CONFIG_TRANSPARENT_HUGEPAGE_MADVISE=y -CONFIG_ARCH_WANTS_THP_SWAP=y -CONFIG_THP_SWAP=y -CONFIG_CLEANCACHE=y -CONFIG_FRONTSWAP=y -# CONFIG_CMA is not set -CONFIG_MEM_SOFT_DIRTY=y -CONFIG_ZSWAP=y -# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_DEFLATE is not set -# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZO is not set -# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_842 is not set 
-CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZ4=y -# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZ4HC is not set -# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_ZSTD is not set -CONFIG_ZSWAP_COMPRESSOR_DEFAULT="lz4" -# CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD is not set -CONFIG_ZSWAP_ZPOOL_DEFAULT_Z3FOLD=y -# CONFIG_ZSWAP_ZPOOL_DEFAULT_ZSMALLOC is not set -CONFIG_ZSWAP_ZPOOL_DEFAULT="z3fold" -CONFIG_ZSWAP_DEFAULT_ON=y -CONFIG_ZPOOL=y -CONFIG_ZBUD=y -CONFIG_Z3FOLD=y -CONFIG_ZSMALLOC=y -# CONFIG_ZSMALLOC_PGTABLE_MAPPING is not set -# CONFIG_ZSMALLOC_STAT is not set -CONFIG_GENERIC_EARLY_IOREMAP=y -# CONFIG_DEFERRED_STRUCT_PAGE_INIT is not set -# CONFIG_IDLE_PAGE_TRACKING is not set -CONFIG_ARCH_HAS_PTE_DEVMAP=y -CONFIG_ZONE_DEVICE=y -CONFIG_DEV_PAGEMAP_OPS=y -CONFIG_HMM_MIRROR=y -CONFIG_DEVICE_PRIVATE=y -CONFIG_FRAME_VECTOR=y -CONFIG_ARCH_USES_HIGH_VMA_FLAGS=y -CONFIG_ARCH_HAS_PKEYS=y -# CONFIG_PERCPU_STATS is not set -# CONFIG_GUP_BENCHMARK is not set -CONFIG_READ_ONLY_THP_FOR_FS=y -CONFIG_ARCH_HAS_PTE_SPECIAL=y -CONFIG_MAPPING_DIRTY_HELPERS=y -# end of Memory Management options - -CONFIG_NET=y -CONFIG_COMPAT_NETLINK_MESSAGES=y -CONFIG_NET_INGRESS=y -CONFIG_NET_EGRESS=y -CONFIG_NET_REDIRECT=y -CONFIG_SKB_EXTENSIONS=y - -# -# Networking options -# -CONFIG_PACKET=y -CONFIG_PACKET_DIAG=y -CONFIG_UNIX=y -CONFIG_UNIX_SCM=y -CONFIG_UNIX_DIAG=y -CONFIG_TLS=m -CONFIG_TLS_DEVICE=y -# CONFIG_TLS_TOE is not set -CONFIG_XFRM=y -CONFIG_XFRM_OFFLOAD=y -CONFIG_XFRM_ALGO=m -CONFIG_XFRM_USER=m -CONFIG_XFRM_INTERFACE=m -CONFIG_XFRM_SUB_POLICY=y -CONFIG_XFRM_MIGRATE=y -CONFIG_XFRM_STATISTICS=y -CONFIG_XFRM_AH=m -CONFIG_XFRM_ESP=m -CONFIG_XFRM_IPCOMP=m -CONFIG_NET_KEY=m -CONFIG_NET_KEY_MIGRATE=y -CONFIG_XFRM_ESPINTCP=y -CONFIG_SMC=m -CONFIG_SMC_DIAG=m -CONFIG_XDP_SOCKETS=y -CONFIG_XDP_SOCKETS_DIAG=y -CONFIG_INET=y -CONFIG_IP_MULTICAST=y -CONFIG_IP_ADVANCED_ROUTER=y -# CONFIG_IP_FIB_TRIE_STATS is not set -CONFIG_IP_MULTIPLE_TABLES=y -CONFIG_IP_ROUTE_MULTIPATH=y -CONFIG_IP_ROUTE_VERBOSE=y -CONFIG_IP_ROUTE_CLASSID=y -# 
CONFIG_IP_PNP is not set -CONFIG_NET_IPIP=m -CONFIG_NET_IPGRE_DEMUX=m -CONFIG_NET_IP_TUNNEL=m -CONFIG_NET_IPGRE=m -# CONFIG_NET_IPGRE_BROADCAST is not set -CONFIG_IP_MROUTE_COMMON=y -CONFIG_IP_MROUTE=y -CONFIG_IP_MROUTE_MULTIPLE_TABLES=y -CONFIG_IP_PIMSM_V1=y -CONFIG_IP_PIMSM_V2=y -CONFIG_SYN_COOKIES=y -CONFIG_NET_IPVTI=m -CONFIG_NET_UDP_TUNNEL=m -CONFIG_NET_FOU=m -CONFIG_NET_FOU_IP_TUNNELS=y -CONFIG_INET_AH=m -CONFIG_INET_ESP=m -CONFIG_INET_ESP_OFFLOAD=m -CONFIG_INET_ESPINTCP=y -CONFIG_INET_IPCOMP=m -CONFIG_INET_XFRM_TUNNEL=m -CONFIG_INET_TUNNEL=m -CONFIG_INET_DIAG=m -CONFIG_INET_TCP_DIAG=m -CONFIG_INET_UDP_DIAG=m -CONFIG_INET_RAW_DIAG=m -CONFIG_INET_DIAG_DESTROY=y -CONFIG_TCP_CONG_ADVANCED=y -CONFIG_TCP_CONG_BIC=m -CONFIG_TCP_CONG_CUBIC=y -CONFIG_TCP_CONG_WESTWOOD=m -CONFIG_TCP_CONG_HTCP=m -CONFIG_TCP_CONG_HSTCP=m -CONFIG_TCP_CONG_HYBLA=m -CONFIG_TCP_CONG_VEGAS=m -CONFIG_TCP_CONG_NV=m -CONFIG_TCP_CONG_SCALABLE=m -CONFIG_TCP_CONG_LP=m -CONFIG_TCP_CONG_VENO=m -CONFIG_TCP_CONG_YEAH=m -CONFIG_TCP_CONG_ILLINOIS=m -CONFIG_TCP_CONG_DCTCP=m -CONFIG_TCP_CONG_CDG=m -CONFIG_TCP_CONG_BBR=m -CONFIG_DEFAULT_CUBIC=y -# CONFIG_DEFAULT_RENO is not set -CONFIG_DEFAULT_TCP_CONG="cubic" -CONFIG_TCP_MD5SIG=y -CONFIG_IPV6=y -CONFIG_IPV6_ROUTER_PREF=y -CONFIG_IPV6_ROUTE_INFO=y -CONFIG_IPV6_OPTIMISTIC_DAD=y -CONFIG_INET6_AH=m -CONFIG_INET6_ESP=m -CONFIG_INET6_ESP_OFFLOAD=m -CONFIG_INET6_ESPINTCP=y -CONFIG_INET6_IPCOMP=m -CONFIG_IPV6_MIP6=m -CONFIG_IPV6_ILA=m -CONFIG_INET6_XFRM_TUNNEL=m -CONFIG_INET6_TUNNEL=m -CONFIG_IPV6_VTI=m -CONFIG_IPV6_SIT=m -CONFIG_IPV6_SIT_6RD=y -CONFIG_IPV6_NDISC_NODETYPE=y -CONFIG_IPV6_TUNNEL=m -CONFIG_IPV6_GRE=m -CONFIG_IPV6_FOU=m -CONFIG_IPV6_FOU_TUNNEL=m -CONFIG_IPV6_MULTIPLE_TABLES=y -CONFIG_IPV6_SUBTREES=y -CONFIG_IPV6_MROUTE=y -CONFIG_IPV6_MROUTE_MULTIPLE_TABLES=y -CONFIG_IPV6_PIMSM_V2=y -CONFIG_IPV6_SEG6_LWTUNNEL=y -CONFIG_IPV6_SEG6_HMAC=y -CONFIG_IPV6_SEG6_BPF=y -CONFIG_IPV6_RPL_LWTUNNEL=y -CONFIG_NETLABEL=y -CONFIG_MPTCP=y -CONFIG_INET_MPTCP_DIAG=m 
-CONFIG_MPTCP_IPV6=y -CONFIG_NETWORK_SECMARK=y -CONFIG_NET_PTP_CLASSIFY=y -CONFIG_NETWORK_PHY_TIMESTAMPING=y -CONFIG_NETFILTER=y -CONFIG_NETFILTER_ADVANCED=y -CONFIG_BRIDGE_NETFILTER=m - -# -# Core Netfilter Configuration -# -CONFIG_NETFILTER_INGRESS=y -CONFIG_NETFILTER_NETLINK=m -CONFIG_NETFILTER_FAMILY_BRIDGE=y -CONFIG_NETFILTER_FAMILY_ARP=y -CONFIG_NETFILTER_NETLINK_ACCT=m -CONFIG_NETFILTER_NETLINK_QUEUE=m -CONFIG_NETFILTER_NETLINK_LOG=m -CONFIG_NETFILTER_NETLINK_OSF=m -CONFIG_NF_CONNTRACK=m -CONFIG_NF_LOG_COMMON=m -CONFIG_NF_LOG_NETDEV=m -CONFIG_NETFILTER_CONNCOUNT=m -CONFIG_NF_CONNTRACK_MARK=y -CONFIG_NF_CONNTRACK_SECMARK=y -CONFIG_NF_CONNTRACK_ZONES=y -CONFIG_NF_CONNTRACK_PROCFS=y -CONFIG_NF_CONNTRACK_EVENTS=y -CONFIG_NF_CONNTRACK_TIMEOUT=y -CONFIG_NF_CONNTRACK_TIMESTAMP=y -CONFIG_NF_CONNTRACK_LABELS=y -CONFIG_NF_CT_PROTO_DCCP=y -CONFIG_NF_CT_PROTO_GRE=y -CONFIG_NF_CT_PROTO_SCTP=y -CONFIG_NF_CT_PROTO_UDPLITE=y -CONFIG_NF_CONNTRACK_AMANDA=m -CONFIG_NF_CONNTRACK_FTP=m -CONFIG_NF_CONNTRACK_H323=m -CONFIG_NF_CONNTRACK_IRC=m -CONFIG_NF_CONNTRACK_BROADCAST=m -CONFIG_NF_CONNTRACK_NETBIOS_NS=m -CONFIG_NF_CONNTRACK_SNMP=m -CONFIG_NF_CONNTRACK_PPTP=m -CONFIG_NF_CONNTRACK_SANE=m -CONFIG_NF_CONNTRACK_SIP=m -CONFIG_NF_CONNTRACK_TFTP=m -CONFIG_NF_CT_NETLINK=m -CONFIG_NF_CT_NETLINK_TIMEOUT=m -CONFIG_NF_CT_NETLINK_HELPER=m -CONFIG_NETFILTER_NETLINK_GLUE_CT=y -CONFIG_NF_NAT=m -CONFIG_NF_NAT_AMANDA=m -CONFIG_NF_NAT_FTP=m -CONFIG_NF_NAT_IRC=m -CONFIG_NF_NAT_SIP=m -CONFIG_NF_NAT_TFTP=m -CONFIG_NF_NAT_REDIRECT=y -CONFIG_NF_NAT_MASQUERADE=y -CONFIG_NETFILTER_SYNPROXY=m -CONFIG_NF_TABLES=m -CONFIG_NF_TABLES_INET=y -CONFIG_NF_TABLES_NETDEV=y -CONFIG_NFT_NUMGEN=m -CONFIG_NFT_CT=m -CONFIG_NFT_FLOW_OFFLOAD=m -CONFIG_NFT_COUNTER=m -CONFIG_NFT_CONNLIMIT=m -CONFIG_NFT_LOG=m -CONFIG_NFT_LIMIT=m -CONFIG_NFT_MASQ=m -CONFIG_NFT_REDIR=m -CONFIG_NFT_NAT=m -CONFIG_NFT_TUNNEL=m -CONFIG_NFT_OBJREF=m -CONFIG_NFT_QUEUE=m -CONFIG_NFT_QUOTA=m -CONFIG_NFT_REJECT=m -CONFIG_NFT_REJECT_INET=m 
-CONFIG_NFT_COMPAT=m -CONFIG_NFT_HASH=m -CONFIG_NFT_FIB=m -CONFIG_NFT_FIB_INET=m -CONFIG_NFT_XFRM=m -CONFIG_NFT_SOCKET=m -CONFIG_NFT_OSF=m -CONFIG_NFT_TPROXY=m -CONFIG_NFT_SYNPROXY=m -CONFIG_NF_DUP_NETDEV=m -CONFIG_NFT_DUP_NETDEV=m -CONFIG_NFT_FWD_NETDEV=m -CONFIG_NFT_FIB_NETDEV=m -CONFIG_NF_FLOW_TABLE_INET=m -CONFIG_NF_FLOW_TABLE=m -CONFIG_NETFILTER_XTABLES=m - -# -# Xtables combined modules -# -CONFIG_NETFILTER_XT_MARK=m -CONFIG_NETFILTER_XT_CONNMARK=m -CONFIG_NETFILTER_XT_SET=m - -# -# Xtables targets -# -CONFIG_NETFILTER_XT_TARGET_AUDIT=m -CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m -CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m -CONFIG_NETFILTER_XT_TARGET_CONNMARK=m -CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m -CONFIG_NETFILTER_XT_TARGET_CT=m -CONFIG_NETFILTER_XT_TARGET_DSCP=m -CONFIG_NETFILTER_XT_TARGET_HL=m -CONFIG_NETFILTER_XT_TARGET_HMARK=m -CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m -CONFIG_NETFILTER_XT_TARGET_LED=m -CONFIG_NETFILTER_XT_TARGET_LOG=m -CONFIG_NETFILTER_XT_TARGET_MARK=m -CONFIG_NETFILTER_XT_NAT=m -CONFIG_NETFILTER_XT_TARGET_NETMAP=m -CONFIG_NETFILTER_XT_TARGET_NFLOG=m -CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m -CONFIG_NETFILTER_XT_TARGET_NOTRACK=m -CONFIG_NETFILTER_XT_TARGET_RATEEST=m -CONFIG_NETFILTER_XT_TARGET_REDIRECT=m -CONFIG_NETFILTER_XT_TARGET_MASQUERADE=m -CONFIG_NETFILTER_XT_TARGET_TEE=m -CONFIG_NETFILTER_XT_TARGET_TPROXY=m -CONFIG_NETFILTER_XT_TARGET_TRACE=m -CONFIG_NETFILTER_XT_TARGET_SECMARK=m -CONFIG_NETFILTER_XT_TARGET_TCPMSS=m -CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m - -# -# Xtables matches -# -CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m -CONFIG_NETFILTER_XT_MATCH_BPF=m -CONFIG_NETFILTER_XT_MATCH_CGROUP=m -CONFIG_NETFILTER_XT_MATCH_CLUSTER=m -CONFIG_NETFILTER_XT_MATCH_COMMENT=m -CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m -CONFIG_NETFILTER_XT_MATCH_CONNLABEL=m -CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m -CONFIG_NETFILTER_XT_MATCH_CONNMARK=m -CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m -CONFIG_NETFILTER_XT_MATCH_CPU=m -CONFIG_NETFILTER_XT_MATCH_DCCP=m 
-CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m -CONFIG_NETFILTER_XT_MATCH_DSCP=m -CONFIG_NETFILTER_XT_MATCH_ECN=m -CONFIG_NETFILTER_XT_MATCH_ESP=m -CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m -CONFIG_NETFILTER_XT_MATCH_HELPER=m -CONFIG_NETFILTER_XT_MATCH_HL=m -CONFIG_NETFILTER_XT_MATCH_IPCOMP=m -CONFIG_NETFILTER_XT_MATCH_IPRANGE=m -CONFIG_NETFILTER_XT_MATCH_IPVS=m -CONFIG_NETFILTER_XT_MATCH_L2TP=m -CONFIG_NETFILTER_XT_MATCH_LENGTH=m -CONFIG_NETFILTER_XT_MATCH_LIMIT=m -CONFIG_NETFILTER_XT_MATCH_MAC=m -CONFIG_NETFILTER_XT_MATCH_MARK=m -CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m -CONFIG_NETFILTER_XT_MATCH_NFACCT=m -CONFIG_NETFILTER_XT_MATCH_OSF=m -CONFIG_NETFILTER_XT_MATCH_OWNER=m -CONFIG_NETFILTER_XT_MATCH_POLICY=m -CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m -CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m -CONFIG_NETFILTER_XT_MATCH_QUOTA=m -CONFIG_NETFILTER_XT_MATCH_RATEEST=m -CONFIG_NETFILTER_XT_MATCH_REALM=m -CONFIG_NETFILTER_XT_MATCH_RECENT=m -CONFIG_NETFILTER_XT_MATCH_SCTP=m -CONFIG_NETFILTER_XT_MATCH_SOCKET=m -CONFIG_NETFILTER_XT_MATCH_STATE=m -CONFIG_NETFILTER_XT_MATCH_STATISTIC=m -CONFIG_NETFILTER_XT_MATCH_STRING=m -CONFIG_NETFILTER_XT_MATCH_TCPMSS=m -CONFIG_NETFILTER_XT_MATCH_TIME=m -CONFIG_NETFILTER_XT_MATCH_U32=m -# end of Core Netfilter Configuration - -CONFIG_IP_SET=m -CONFIG_IP_SET_MAX=256 -CONFIG_IP_SET_BITMAP_IP=m -CONFIG_IP_SET_BITMAP_IPMAC=m -CONFIG_IP_SET_BITMAP_PORT=m -CONFIG_IP_SET_HASH_IP=m -CONFIG_IP_SET_HASH_IPMARK=m -CONFIG_IP_SET_HASH_IPPORT=m -CONFIG_IP_SET_HASH_IPPORTIP=m -CONFIG_IP_SET_HASH_IPPORTNET=m -CONFIG_IP_SET_HASH_IPMAC=m -CONFIG_IP_SET_HASH_MAC=m -CONFIG_IP_SET_HASH_NETPORTNET=m -CONFIG_IP_SET_HASH_NET=m -CONFIG_IP_SET_HASH_NETNET=m -CONFIG_IP_SET_HASH_NETPORT=m -CONFIG_IP_SET_HASH_NETIFACE=m -CONFIG_IP_SET_LIST_SET=m -CONFIG_IP_VS=m -CONFIG_IP_VS_IPV6=y -# CONFIG_IP_VS_DEBUG is not set -CONFIG_IP_VS_TAB_BITS=15 - -# -# IPVS transport protocol load balancing support -# -CONFIG_IP_VS_PROTO_TCP=y -CONFIG_IP_VS_PROTO_UDP=y -CONFIG_IP_VS_PROTO_AH_ESP=y 
-CONFIG_IP_VS_PROTO_ESP=y -CONFIG_IP_VS_PROTO_AH=y -CONFIG_IP_VS_PROTO_SCTP=y - -# -# IPVS scheduler -# -CONFIG_IP_VS_RR=m -CONFIG_IP_VS_WRR=m -CONFIG_IP_VS_LC=m -CONFIG_IP_VS_WLC=m -CONFIG_IP_VS_FO=m -CONFIG_IP_VS_OVF=m -CONFIG_IP_VS_LBLC=m -CONFIG_IP_VS_LBLCR=m -CONFIG_IP_VS_DH=m -CONFIG_IP_VS_SH=m -CONFIG_IP_VS_MH=m -CONFIG_IP_VS_SED=m -CONFIG_IP_VS_NQ=m - -# -# IPVS SH scheduler -# -CONFIG_IP_VS_SH_TAB_BITS=8 - -# -# IPVS MH scheduler -# -CONFIG_IP_VS_MH_TAB_INDEX=12 - -# -# IPVS application helper -# -CONFIG_IP_VS_FTP=m -CONFIG_IP_VS_NFCT=y -CONFIG_IP_VS_PE_SIP=m - -# -# IP: Netfilter Configuration -# -CONFIG_NF_DEFRAG_IPV4=m -CONFIG_NF_SOCKET_IPV4=m -CONFIG_NF_TPROXY_IPV4=m -CONFIG_NF_TABLES_IPV4=y -CONFIG_NFT_REJECT_IPV4=m -CONFIG_NFT_DUP_IPV4=m -CONFIG_NFT_FIB_IPV4=m -CONFIG_NF_TABLES_ARP=y -CONFIG_NF_FLOW_TABLE_IPV4=m -CONFIG_NF_DUP_IPV4=m -CONFIG_NF_LOG_ARP=m -CONFIG_NF_LOG_IPV4=m -CONFIG_NF_REJECT_IPV4=m -CONFIG_NF_NAT_SNMP_BASIC=m -CONFIG_NF_NAT_PPTP=m -CONFIG_NF_NAT_H323=m -CONFIG_IP_NF_IPTABLES=m -CONFIG_IP_NF_MATCH_AH=m -CONFIG_IP_NF_MATCH_ECN=m -CONFIG_IP_NF_MATCH_RPFILTER=m -CONFIG_IP_NF_MATCH_TTL=m -CONFIG_IP_NF_FILTER=m -CONFIG_IP_NF_TARGET_REJECT=m -CONFIG_IP_NF_TARGET_SYNPROXY=m -CONFIG_IP_NF_NAT=m -CONFIG_IP_NF_TARGET_MASQUERADE=m -CONFIG_IP_NF_TARGET_NETMAP=m -CONFIG_IP_NF_TARGET_REDIRECT=m -CONFIG_IP_NF_MANGLE=m -CONFIG_IP_NF_TARGET_CLUSTERIP=m -CONFIG_IP_NF_TARGET_ECN=m -CONFIG_IP_NF_TARGET_TTL=m -CONFIG_IP_NF_RAW=m -CONFIG_IP_NF_SECURITY=m -CONFIG_IP_NF_ARPTABLES=m -CONFIG_IP_NF_ARPFILTER=m -CONFIG_IP_NF_ARP_MANGLE=m -# end of IP: Netfilter Configuration - -# -# IPv6: Netfilter Configuration -# -CONFIG_NF_SOCKET_IPV6=m -CONFIG_NF_TPROXY_IPV6=m -CONFIG_NF_TABLES_IPV6=y -CONFIG_NFT_REJECT_IPV6=m -CONFIG_NFT_DUP_IPV6=m -CONFIG_NFT_FIB_IPV6=m -CONFIG_NF_FLOW_TABLE_IPV6=m -CONFIG_NF_DUP_IPV6=m -CONFIG_NF_REJECT_IPV6=m -CONFIG_NF_LOG_IPV6=m -CONFIG_IP6_NF_IPTABLES=m -CONFIG_IP6_NF_MATCH_AH=m -CONFIG_IP6_NF_MATCH_EUI64=m 
-CONFIG_IP6_NF_MATCH_FRAG=m -CONFIG_IP6_NF_MATCH_OPTS=m -CONFIG_IP6_NF_MATCH_HL=m -CONFIG_IP6_NF_MATCH_IPV6HEADER=m -CONFIG_IP6_NF_MATCH_MH=m -CONFIG_IP6_NF_MATCH_RPFILTER=m -CONFIG_IP6_NF_MATCH_RT=m -CONFIG_IP6_NF_MATCH_SRH=m -CONFIG_IP6_NF_TARGET_HL=m -CONFIG_IP6_NF_FILTER=m -CONFIG_IP6_NF_TARGET_REJECT=m -CONFIG_IP6_NF_TARGET_SYNPROXY=m -CONFIG_IP6_NF_MANGLE=m -CONFIG_IP6_NF_RAW=m -CONFIG_IP6_NF_SECURITY=m -CONFIG_IP6_NF_NAT=m -CONFIG_IP6_NF_TARGET_MASQUERADE=m -CONFIG_IP6_NF_TARGET_NPT=m -# end of IPv6: Netfilter Configuration - -CONFIG_NF_DEFRAG_IPV6=m -CONFIG_NF_TABLES_BRIDGE=m -CONFIG_NFT_BRIDGE_META=m -CONFIG_NFT_BRIDGE_REJECT=m -CONFIG_NF_LOG_BRIDGE=m -CONFIG_NF_CONNTRACK_BRIDGE=m -CONFIG_BRIDGE_NF_EBTABLES=m -CONFIG_BRIDGE_EBT_BROUTE=m -CONFIG_BRIDGE_EBT_T_FILTER=m -CONFIG_BRIDGE_EBT_T_NAT=m -CONFIG_BRIDGE_EBT_802_3=m -CONFIG_BRIDGE_EBT_AMONG=m -CONFIG_BRIDGE_EBT_ARP=m -CONFIG_BRIDGE_EBT_IP=m -CONFIG_BRIDGE_EBT_IP6=m -CONFIG_BRIDGE_EBT_LIMIT=m -CONFIG_BRIDGE_EBT_MARK=m -CONFIG_BRIDGE_EBT_PKTTYPE=m -CONFIG_BRIDGE_EBT_STP=m -CONFIG_BRIDGE_EBT_VLAN=m -CONFIG_BRIDGE_EBT_ARPREPLY=m -CONFIG_BRIDGE_EBT_DNAT=m -CONFIG_BRIDGE_EBT_MARK_T=m -CONFIG_BRIDGE_EBT_REDIRECT=m -CONFIG_BRIDGE_EBT_SNAT=m -CONFIG_BRIDGE_EBT_LOG=m -CONFIG_BRIDGE_EBT_NFLOG=m -# CONFIG_BPFILTER is not set -CONFIG_IP_DCCP=m -CONFIG_INET_DCCP_DIAG=m - -# -# DCCP CCIDs Configuration -# -# CONFIG_IP_DCCP_CCID2_DEBUG is not set -CONFIG_IP_DCCP_CCID3=y -# CONFIG_IP_DCCP_CCID3_DEBUG is not set -CONFIG_IP_DCCP_TFRC_LIB=y -# end of DCCP CCIDs Configuration - -# -# DCCP Kernel Hacking -# -# CONFIG_IP_DCCP_DEBUG is not set -# end of DCCP Kernel Hacking - -CONFIG_IP_SCTP=m -# CONFIG_SCTP_DBG_OBJCNT is not set -# CONFIG_SCTP_DEFAULT_COOKIE_HMAC_MD5 is not set -CONFIG_SCTP_DEFAULT_COOKIE_HMAC_SHA1=y -# CONFIG_SCTP_DEFAULT_COOKIE_HMAC_NONE is not set -CONFIG_SCTP_COOKIE_HMAC_MD5=y -CONFIG_SCTP_COOKIE_HMAC_SHA1=y -CONFIG_INET_SCTP_DIAG=m -CONFIG_RDS=m -CONFIG_RDS_RDMA=m -CONFIG_RDS_TCP=m -# CONFIG_RDS_DEBUG is 
not set -CONFIG_TIPC=m -CONFIG_TIPC_MEDIA_IB=y -CONFIG_TIPC_MEDIA_UDP=y -CONFIG_TIPC_CRYPTO=y -CONFIG_TIPC_DIAG=m -CONFIG_ATM=m -CONFIG_ATM_CLIP=m -# CONFIG_ATM_CLIP_NO_ICMP is not set -CONFIG_ATM_LANE=m -CONFIG_ATM_MPOA=m -CONFIG_ATM_BR2684=m -# CONFIG_ATM_BR2684_IPFILTER is not set -CONFIG_L2TP=m -# CONFIG_L2TP_DEBUGFS is not set -CONFIG_L2TP_V3=y -CONFIG_L2TP_IP=m -CONFIG_L2TP_ETH=m -CONFIG_STP=m -CONFIG_GARP=m -CONFIG_MRP=m -CONFIG_BRIDGE=m -CONFIG_BRIDGE_IGMP_SNOOPING=y -CONFIG_BRIDGE_VLAN_FILTERING=y -CONFIG_BRIDGE_MRP=y -CONFIG_HAVE_NET_DSA=y -CONFIG_NET_DSA=m -CONFIG_NET_DSA_TAG_8021Q=m -CONFIG_NET_DSA_TAG_AR9331=m -CONFIG_NET_DSA_TAG_BRCM_COMMON=m -CONFIG_NET_DSA_TAG_BRCM=m -CONFIG_NET_DSA_TAG_BRCM_PREPEND=m -CONFIG_NET_DSA_TAG_GSWIP=m -CONFIG_NET_DSA_TAG_DSA=m -CONFIG_NET_DSA_TAG_EDSA=m -CONFIG_NET_DSA_TAG_MTK=m -CONFIG_NET_DSA_TAG_KSZ=m -CONFIG_NET_DSA_TAG_RTL4_A=m -CONFIG_NET_DSA_TAG_OCELOT=m -CONFIG_NET_DSA_TAG_QCA=m -CONFIG_NET_DSA_TAG_LAN9303=m -CONFIG_NET_DSA_TAG_SJA1105=m -CONFIG_NET_DSA_TAG_TRAILER=m -CONFIG_VLAN_8021Q=m -CONFIG_VLAN_8021Q_GVRP=y -CONFIG_VLAN_8021Q_MVRP=y -# CONFIG_DECNET is not set -CONFIG_LLC=m -CONFIG_LLC2=m -CONFIG_ATALK=m -CONFIG_DEV_APPLETALK=m -CONFIG_IPDDP=m -CONFIG_IPDDP_ENCAP=y -# CONFIG_X25 is not set -# CONFIG_LAPB is not set -CONFIG_PHONET=m -CONFIG_6LOWPAN=m -# CONFIG_6LOWPAN_DEBUGFS is not set -CONFIG_6LOWPAN_NHC=m -CONFIG_6LOWPAN_NHC_DEST=m -CONFIG_6LOWPAN_NHC_FRAGMENT=m -CONFIG_6LOWPAN_NHC_HOP=m -CONFIG_6LOWPAN_NHC_IPV6=m -CONFIG_6LOWPAN_NHC_MOBILITY=m -CONFIG_6LOWPAN_NHC_ROUTING=m -CONFIG_6LOWPAN_NHC_UDP=m -CONFIG_6LOWPAN_GHC_EXT_HDR_HOP=m -CONFIG_6LOWPAN_GHC_UDP=m -CONFIG_6LOWPAN_GHC_ICMPV6=m -CONFIG_6LOWPAN_GHC_EXT_HDR_DEST=m -CONFIG_6LOWPAN_GHC_EXT_HDR_FRAG=m -CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE=m -CONFIG_IEEE802154=m -CONFIG_IEEE802154_NL802154_EXPERIMENTAL=y -CONFIG_IEEE802154_SOCKET=m -CONFIG_IEEE802154_6LOWPAN=m -CONFIG_MAC802154=m -CONFIG_NET_SCHED=y - -# -# Queueing/Scheduling -# -CONFIG_NET_SCH_CBQ=m 
-CONFIG_NET_SCH_HTB=m -CONFIG_NET_SCH_HFSC=m -CONFIG_NET_SCH_ATM=m -CONFIG_NET_SCH_PRIO=m -CONFIG_NET_SCH_MULTIQ=m -CONFIG_NET_SCH_RED=m -CONFIG_NET_SCH_SFB=m -CONFIG_NET_SCH_SFQ=m -CONFIG_NET_SCH_TEQL=m -CONFIG_NET_SCH_TBF=m -CONFIG_NET_SCH_CBS=m -CONFIG_NET_SCH_ETF=m -CONFIG_NET_SCH_TAPRIO=m -CONFIG_NET_SCH_GRED=m -CONFIG_NET_SCH_DSMARK=m -CONFIG_NET_SCH_NETEM=m -CONFIG_NET_SCH_DRR=m -CONFIG_NET_SCH_MQPRIO=m -CONFIG_NET_SCH_SKBPRIO=m -CONFIG_NET_SCH_CHOKE=m -CONFIG_NET_SCH_QFQ=m -CONFIG_NET_SCH_CODEL=m -CONFIG_NET_SCH_FQ_CODEL=y -CONFIG_NET_SCH_CAKE=m -CONFIG_NET_SCH_FQ=m -CONFIG_NET_SCH_HHF=m -CONFIG_NET_SCH_PIE=m -CONFIG_NET_SCH_FQ_PIE=m -CONFIG_NET_SCH_INGRESS=m -CONFIG_NET_SCH_PLUG=m -CONFIG_NET_SCH_ETS=m -CONFIG_NET_SCH_DEFAULT=y -# CONFIG_DEFAULT_FQ is not set -# CONFIG_DEFAULT_CODEL is not set -CONFIG_DEFAULT_FQ_CODEL=y -# CONFIG_DEFAULT_FQ_PIE is not set -# CONFIG_DEFAULT_SFQ is not set -# CONFIG_DEFAULT_PFIFO_FAST is not set -CONFIG_DEFAULT_NET_SCH="fq_codel" - -# -# Classification -# -CONFIG_NET_CLS=y -CONFIG_NET_CLS_BASIC=m -CONFIG_NET_CLS_TCINDEX=m -CONFIG_NET_CLS_ROUTE4=m -CONFIG_NET_CLS_FW=m -CONFIG_NET_CLS_U32=m -CONFIG_CLS_U32_PERF=y -CONFIG_CLS_U32_MARK=y -CONFIG_NET_CLS_RSVP=m -CONFIG_NET_CLS_RSVP6=m -CONFIG_NET_CLS_FLOW=m -CONFIG_NET_CLS_CGROUP=m -CONFIG_NET_CLS_BPF=m -CONFIG_NET_CLS_FLOWER=m -CONFIG_NET_CLS_MATCHALL=m -CONFIG_NET_EMATCH=y -CONFIG_NET_EMATCH_STACK=32 -CONFIG_NET_EMATCH_CMP=m -CONFIG_NET_EMATCH_NBYTE=m -CONFIG_NET_EMATCH_U32=m -CONFIG_NET_EMATCH_META=m -CONFIG_NET_EMATCH_TEXT=m -CONFIG_NET_EMATCH_CANID=m -CONFIG_NET_EMATCH_IPSET=m -CONFIG_NET_EMATCH_IPT=m -CONFIG_NET_CLS_ACT=y -CONFIG_NET_ACT_POLICE=m -CONFIG_NET_ACT_GACT=m -CONFIG_GACT_PROB=y -CONFIG_NET_ACT_MIRRED=m -CONFIG_NET_ACT_SAMPLE=m -CONFIG_NET_ACT_IPT=m -CONFIG_NET_ACT_NAT=m -CONFIG_NET_ACT_PEDIT=m -CONFIG_NET_ACT_SIMP=m -CONFIG_NET_ACT_SKBEDIT=m -CONFIG_NET_ACT_CSUM=m -CONFIG_NET_ACT_MPLS=m -CONFIG_NET_ACT_VLAN=m -CONFIG_NET_ACT_BPF=m -CONFIG_NET_ACT_CONNMARK=m 
-CONFIG_NET_ACT_CTINFO=m -CONFIG_NET_ACT_SKBMOD=m -CONFIG_NET_ACT_IFE=m -CONFIG_NET_ACT_TUNNEL_KEY=m -CONFIG_NET_ACT_CT=m -CONFIG_NET_ACT_GATE=m -CONFIG_NET_IFE_SKBMARK=m -CONFIG_NET_IFE_SKBPRIO=m -CONFIG_NET_IFE_SKBTCINDEX=m -CONFIG_NET_TC_SKB_EXT=y -CONFIG_NET_SCH_FIFO=y -CONFIG_DCB=y -CONFIG_DNS_RESOLVER=m -CONFIG_BATMAN_ADV=m -CONFIG_BATMAN_ADV_BATMAN_V=y -CONFIG_BATMAN_ADV_BLA=y -CONFIG_BATMAN_ADV_DAT=y -CONFIG_BATMAN_ADV_NC=y -CONFIG_BATMAN_ADV_MCAST=y -CONFIG_BATMAN_ADV_DEBUGFS=y -# CONFIG_BATMAN_ADV_DEBUG is not set -CONFIG_BATMAN_ADV_SYSFS=y -# CONFIG_BATMAN_ADV_TRACING is not set -CONFIG_OPENVSWITCH=m -CONFIG_OPENVSWITCH_GRE=m -CONFIG_OPENVSWITCH_VXLAN=m -CONFIG_OPENVSWITCH_GENEVE=m -CONFIG_VSOCKETS=m -CONFIG_VSOCKETS_DIAG=m -CONFIG_VSOCKETS_LOOPBACK=m -CONFIG_VMWARE_VMCI_VSOCKETS=m -CONFIG_VIRTIO_VSOCKETS=m -CONFIG_VIRTIO_VSOCKETS_COMMON=m -CONFIG_HYPERV_VSOCKETS=m -CONFIG_NETLINK_DIAG=m -CONFIG_MPLS=y -CONFIG_NET_MPLS_GSO=m -CONFIG_MPLS_ROUTING=m -CONFIG_MPLS_IPTUNNEL=m -CONFIG_NET_NSH=m -CONFIG_HSR=m -CONFIG_NET_SWITCHDEV=y -CONFIG_NET_L3_MASTER_DEV=y -CONFIG_QRTR=m -CONFIG_QRTR_SMD=m -CONFIG_QRTR_TUN=m -CONFIG_QRTR_MHI=m -CONFIG_NET_NCSI=y -CONFIG_NCSI_OEM_CMD_GET_MAC=y -CONFIG_RPS=y -CONFIG_RFS_ACCEL=y -CONFIG_XPS=y -CONFIG_CGROUP_NET_PRIO=y -CONFIG_CGROUP_NET_CLASSID=y -CONFIG_NET_RX_BUSY_POLL=y -CONFIG_BQL=y -CONFIG_BPF_JIT=y -CONFIG_BPF_STREAM_PARSER=y -CONFIG_NET_FLOW_LIMIT=y - -# -# Network testing -# -CONFIG_NET_PKTGEN=m -CONFIG_NET_DROP_MONITOR=y -# end of Network testing -# end of Networking options - -CONFIG_HAMRADIO=y - -# -# Packet Radio protocols -# -CONFIG_AX25=m -CONFIG_AX25_DAMA_SLAVE=y -CONFIG_NETROM=m -CONFIG_ROSE=m - -# -# AX.25 network device drivers -# -CONFIG_MKISS=m -CONFIG_6PACK=m -CONFIG_BPQETHER=m -CONFIG_BAYCOM_SER_FDX=m -CONFIG_BAYCOM_SER_HDX=m -CONFIG_BAYCOM_PAR=m -CONFIG_YAM=m -# end of AX.25 network device drivers - -CONFIG_CAN=m -CONFIG_CAN_RAW=m -CONFIG_CAN_BCM=m -CONFIG_CAN_GW=m -CONFIG_CAN_J1939=m - -# -# CAN Device 
Drivers -# -CONFIG_CAN_VCAN=m -CONFIG_CAN_VXCAN=m -CONFIG_CAN_SLCAN=m -CONFIG_CAN_DEV=m -CONFIG_CAN_CALC_BITTIMING=y -CONFIG_CAN_FLEXCAN=m -CONFIG_CAN_GRCAN=m -CONFIG_CAN_JANZ_ICAN3=m -CONFIG_CAN_KVASER_PCIEFD=m -CONFIG_CAN_C_CAN=m -CONFIG_CAN_C_CAN_PLATFORM=m -CONFIG_CAN_C_CAN_PCI=m -CONFIG_CAN_CC770=m -# CONFIG_CAN_CC770_ISA is not set -CONFIG_CAN_CC770_PLATFORM=m -CONFIG_CAN_IFI_CANFD=m -CONFIG_CAN_M_CAN=m -CONFIG_CAN_M_CAN_PLATFORM=m -CONFIG_CAN_M_CAN_TCAN4X5X=m -CONFIG_CAN_PEAK_PCIEFD=m -CONFIG_CAN_SJA1000=m -CONFIG_CAN_EMS_PCI=m -# CONFIG_CAN_EMS_PCMCIA is not set -CONFIG_CAN_F81601=m -CONFIG_CAN_KVASER_PCI=m -CONFIG_CAN_PEAK_PCI=m -CONFIG_CAN_PEAK_PCIEC=y -CONFIG_CAN_PEAK_PCMCIA=m -CONFIG_CAN_PLX_PCI=m -# CONFIG_CAN_SJA1000_ISA is not set -CONFIG_CAN_SJA1000_PLATFORM=m -CONFIG_CAN_SOFTING=m -CONFIG_CAN_SOFTING_CS=m - -# -# CAN SPI interfaces -# -CONFIG_CAN_HI311X=m -CONFIG_CAN_MCP251X=m -# end of CAN SPI interfaces - -# -# CAN USB interfaces -# -CONFIG_CAN_8DEV_USB=m -CONFIG_CAN_EMS_USB=m -CONFIG_CAN_ESD_USB2=m -CONFIG_CAN_GS_USB=m -CONFIG_CAN_KVASER_USB=m -CONFIG_CAN_MCBA_USB=m -CONFIG_CAN_PEAK_USB=m -CONFIG_CAN_UCAN=m -# end of CAN USB interfaces - -# CONFIG_CAN_DEBUG_DEVICES is not set -# end of CAN Device Drivers - -CONFIG_BT=m -CONFIG_BT_BREDR=y -CONFIG_BT_RFCOMM=m -CONFIG_BT_RFCOMM_TTY=y -CONFIG_BT_BNEP=m -CONFIG_BT_BNEP_MC_FILTER=y -CONFIG_BT_BNEP_PROTO_FILTER=y -CONFIG_BT_CMTP=m -CONFIG_BT_HIDP=m -CONFIG_BT_HS=y -CONFIG_BT_LE=y -CONFIG_BT_6LOWPAN=m -CONFIG_BT_LEDS=y -CONFIG_BT_MSFTEXT=y -CONFIG_BT_DEBUGFS=y -# CONFIG_BT_SELFTEST is not set - -# -# Bluetooth device drivers -# -CONFIG_BT_INTEL=m -CONFIG_BT_BCM=m -CONFIG_BT_RTL=m -CONFIG_BT_QCA=m -CONFIG_BT_HCIBTUSB=m -CONFIG_BT_HCIBTUSB_AUTOSUSPEND=y -CONFIG_BT_HCIBTUSB_BCM=y -CONFIG_BT_HCIBTUSB_MTK=y -CONFIG_BT_HCIBTUSB_RTL=y -CONFIG_BT_HCIBTSDIO=m -CONFIG_BT_HCIUART=m -CONFIG_BT_HCIUART_SERDEV=y -CONFIG_BT_HCIUART_H4=y -CONFIG_BT_HCIUART_NOKIA=m -CONFIG_BT_HCIUART_BCSP=y -CONFIG_BT_HCIUART_ATH3K=y 
-CONFIG_BT_HCIUART_LL=y -CONFIG_BT_HCIUART_3WIRE=y -CONFIG_BT_HCIUART_INTEL=y -CONFIG_BT_HCIUART_BCM=y -CONFIG_BT_HCIUART_RTL=y -CONFIG_BT_HCIUART_QCA=y -CONFIG_BT_HCIUART_AG6XX=y -CONFIG_BT_HCIUART_MRVL=y -CONFIG_BT_HCIBCM203X=m -CONFIG_BT_HCIBPA10X=m -CONFIG_BT_HCIBFUSB=m -CONFIG_BT_HCIDTL1=m -CONFIG_BT_HCIBT3C=m -CONFIG_BT_HCIBLUECARD=m -CONFIG_BT_HCIVHCI=m -CONFIG_BT_MRVL=m -CONFIG_BT_MRVL_SDIO=m -CONFIG_BT_ATH3K=m -CONFIG_BT_MTKSDIO=m -CONFIG_BT_MTKUART=m -CONFIG_BT_HCIRSI=m -# end of Bluetooth device drivers - -CONFIG_AF_RXRPC=m -CONFIG_AF_RXRPC_IPV6=y -# CONFIG_AF_RXRPC_INJECT_LOSS is not set -CONFIG_AF_RXRPC_DEBUG=y -CONFIG_RXKAD=y -CONFIG_AF_KCM=m -CONFIG_STREAM_PARSER=y -CONFIG_FIB_RULES=y -CONFIG_WIRELESS=y -CONFIG_WIRELESS_EXT=y -CONFIG_WEXT_CORE=y -CONFIG_WEXT_PROC=y -CONFIG_WEXT_SPY=y -CONFIG_WEXT_PRIV=y -CONFIG_CFG80211=m -# CONFIG_NL80211_TESTMODE is not set -# CONFIG_CFG80211_DEVELOPER_WARNINGS is not set -# CONFIG_CFG80211_CERTIFICATION_ONUS is not set -CONFIG_CFG80211_REQUIRE_SIGNED_REGDB=y -CONFIG_CFG80211_USE_KERNEL_REGDB_KEYS=y -CONFIG_CFG80211_DEFAULT_PS=y -CONFIG_CFG80211_DEBUGFS=y -CONFIG_CFG80211_CRDA_SUPPORT=y -CONFIG_CFG80211_WEXT=y -CONFIG_CFG80211_WEXT_EXPORT=y -CONFIG_LIB80211=m -CONFIG_LIB80211_CRYPT_WEP=m -CONFIG_LIB80211_CRYPT_CCMP=m -CONFIG_LIB80211_CRYPT_TKIP=m -# CONFIG_LIB80211_DEBUG is not set -CONFIG_MAC80211=m -CONFIG_MAC80211_HAS_RC=y -CONFIG_MAC80211_RC_MINSTREL=y -CONFIG_MAC80211_RC_DEFAULT_MINSTREL=y -CONFIG_MAC80211_RC_DEFAULT="minstrel_ht" -CONFIG_MAC80211_MESH=y -CONFIG_MAC80211_LEDS=y -CONFIG_MAC80211_DEBUGFS=y -# CONFIG_MAC80211_MESSAGE_TRACING is not set -# CONFIG_MAC80211_DEBUG_MENU is not set -CONFIG_MAC80211_STA_HASH_MAX_SIZE=0 -CONFIG_WIMAX=m -CONFIG_WIMAX_DEBUG_LEVEL=8 -CONFIG_RFKILL=m -CONFIG_RFKILL_LEDS=y -CONFIG_RFKILL_INPUT=y -CONFIG_RFKILL_GPIO=m -CONFIG_NET_9P=m -CONFIG_NET_9P_VIRTIO=m -CONFIG_NET_9P_XEN=m -CONFIG_NET_9P_RDMA=m -# CONFIG_NET_9P_DEBUG is not set -CONFIG_CAIF=m -# CONFIG_CAIF_DEBUG is not 
set -CONFIG_CAIF_NETDEV=m -CONFIG_CAIF_USB=m -CONFIG_CEPH_LIB=m -CONFIG_CEPH_LIB_PRETTYDEBUG=y -CONFIG_CEPH_LIB_USE_DNS_RESOLVER=y -CONFIG_NFC=m -CONFIG_NFC_DIGITAL=m -CONFIG_NFC_NCI=m -CONFIG_NFC_NCI_SPI=m -CONFIG_NFC_NCI_UART=m -CONFIG_NFC_HCI=m -CONFIG_NFC_SHDLC=y - -# -# Near Field Communication (NFC) devices -# -CONFIG_NFC_TRF7970A=m -CONFIG_NFC_MEI_PHY=m -CONFIG_NFC_SIM=m -CONFIG_NFC_PORT100=m -CONFIG_NFC_FDP=m -CONFIG_NFC_FDP_I2C=m -CONFIG_NFC_PN544=m -CONFIG_NFC_PN544_I2C=m -CONFIG_NFC_PN544_MEI=m -CONFIG_NFC_PN533=m -CONFIG_NFC_PN533_USB=m -CONFIG_NFC_PN533_I2C=m -CONFIG_NFC_PN532_UART=m -CONFIG_NFC_MICROREAD=m -CONFIG_NFC_MICROREAD_I2C=m -CONFIG_NFC_MICROREAD_MEI=m -CONFIG_NFC_MRVL=m -CONFIG_NFC_MRVL_USB=m -CONFIG_NFC_MRVL_UART=m -CONFIG_NFC_MRVL_I2C=m -CONFIG_NFC_MRVL_SPI=m -CONFIG_NFC_ST21NFCA=m -CONFIG_NFC_ST21NFCA_I2C=m -CONFIG_NFC_ST_NCI=m -CONFIG_NFC_ST_NCI_I2C=m -CONFIG_NFC_ST_NCI_SPI=m -CONFIG_NFC_NXP_NCI=m -CONFIG_NFC_NXP_NCI_I2C=m -CONFIG_NFC_S3FWRN5=m -CONFIG_NFC_S3FWRN5_I2C=m -CONFIG_NFC_ST95HF=m -# end of Near Field Communication (NFC) devices - -CONFIG_PSAMPLE=m -CONFIG_NET_IFE=m -CONFIG_LWTUNNEL=y -CONFIG_LWTUNNEL_BPF=y -CONFIG_DST_CACHE=y -CONFIG_GRO_CELLS=y -CONFIG_SOCK_VALIDATE_XMIT=y -CONFIG_NET_SOCK_MSG=y -CONFIG_NET_DEVLINK=y -CONFIG_PAGE_POOL=y -CONFIG_FAILOVER=m -CONFIG_ETHTOOL_NETLINK=y -CONFIG_HAVE_EBPF_JIT=y - -# -# Device Drivers -# -CONFIG_HAVE_EISA=y -# CONFIG_EISA is not set -CONFIG_HAVE_PCI=y -CONFIG_PCI=y -CONFIG_PCI_DOMAINS=y -CONFIG_PCIEPORTBUS=y -CONFIG_HOTPLUG_PCI_PCIE=y -CONFIG_PCIEAER=y -# CONFIG_PCIEAER_INJECT is not set -CONFIG_PCIE_ECRC=y -CONFIG_PCIEASPM=y -CONFIG_PCIEASPM_DEFAULT=y -# CONFIG_PCIEASPM_POWERSAVE is not set -# CONFIG_PCIEASPM_POWER_SUPERSAVE is not set -# CONFIG_PCIEASPM_PERFORMANCE is not set -CONFIG_PCIE_PME=y -CONFIG_PCIE_DPC=y -CONFIG_PCIE_PTM=y -# CONFIG_PCIE_BW is not set -CONFIG_PCIE_EDR=y -CONFIG_PCI_MSI=y -CONFIG_PCI_MSI_IRQ_DOMAIN=y -CONFIG_PCI_QUIRKS=y -# CONFIG_PCI_DEBUG is not set 
-CONFIG_PCI_REALLOC_ENABLE_AUTO=y -CONFIG_PCI_STUB=y -CONFIG_PCI_PF_STUB=m -CONFIG_XEN_PCIDEV_FRONTEND=m -CONFIG_PCI_ATS=y -CONFIG_PCI_ECAM=y -CONFIG_PCI_LOCKLESS_CONFIG=y -CONFIG_PCI_IOV=y -CONFIG_PCI_PRI=y -CONFIG_PCI_PASID=y -CONFIG_PCI_P2PDMA=y -CONFIG_PCI_LABEL=y -CONFIG_PCI_HYPERV=m -CONFIG_HOTPLUG_PCI=y -CONFIG_HOTPLUG_PCI_ACPI=y -CONFIG_HOTPLUG_PCI_ACPI_IBM=m -CONFIG_HOTPLUG_PCI_CPCI=y -CONFIG_HOTPLUG_PCI_CPCI_ZT5550=m -CONFIG_HOTPLUG_PCI_CPCI_GENERIC=m -CONFIG_HOTPLUG_PCI_SHPC=y - -# -# PCI controller drivers -# -CONFIG_PCI_FTPCI100=y -CONFIG_PCI_HOST_COMMON=y -CONFIG_PCI_HOST_GENERIC=y -CONFIG_PCIE_XILINX=y -CONFIG_VMD=m -CONFIG_PCI_HYPERV_INTERFACE=m - -# -# DesignWare PCI Core Support -# -CONFIG_PCIE_DW=y -CONFIG_PCIE_DW_HOST=y -CONFIG_PCIE_DW_EP=y -CONFIG_PCIE_DW_PLAT=y -CONFIG_PCIE_DW_PLAT_HOST=y -CONFIG_PCIE_DW_PLAT_EP=y -CONFIG_PCIE_INTEL_GW=y -CONFIG_PCI_MESON=y -# end of DesignWare PCI Core Support - -# -# Mobiveil PCIe Core Support -# -# end of Mobiveil PCIe Core Support - -# -# Cadence PCIe controllers support -# -CONFIG_PCIE_CADENCE=y -CONFIG_PCIE_CADENCE_HOST=y -CONFIG_PCIE_CADENCE_EP=y -CONFIG_PCIE_CADENCE_PLAT=y -CONFIG_PCIE_CADENCE_PLAT_HOST=y -CONFIG_PCIE_CADENCE_PLAT_EP=y -# CONFIG_PCI_J721E_HOST is not set -# CONFIG_PCI_J721E_EP is not set -# end of Cadence PCIe controllers support -# end of PCI controller drivers - -# -# PCI Endpoint -# -CONFIG_PCI_ENDPOINT=y -CONFIG_PCI_ENDPOINT_CONFIGFS=y -# CONFIG_PCI_EPF_TEST is not set -# end of PCI Endpoint - -# -# PCI switch controller drivers -# -CONFIG_PCI_SW_SWITCHTEC=m -# end of PCI switch controller drivers - -CONFIG_PCCARD=m -CONFIG_PCMCIA=m -CONFIG_PCMCIA_LOAD_CIS=y -CONFIG_CARDBUS=y - -# -# PC-card bridges -# -CONFIG_YENTA=m -CONFIG_YENTA_O2=y -CONFIG_YENTA_RICOH=y -CONFIG_YENTA_TI=y -CONFIG_YENTA_ENE_TUNE=y -CONFIG_YENTA_TOSHIBA=y -CONFIG_PD6729=m -CONFIG_I82092=m -CONFIG_PCCARD_NONSTATIC=y -CONFIG_RAPIDIO=m -CONFIG_RAPIDIO_TSI721=m -CONFIG_RAPIDIO_DISC_TIMEOUT=30 
-CONFIG_RAPIDIO_ENABLE_RX_TX_PORTS=y -CONFIG_RAPIDIO_DMA_ENGINE=y -# CONFIG_RAPIDIO_DEBUG is not set -CONFIG_RAPIDIO_ENUM_BASIC=m -CONFIG_RAPIDIO_CHMAN=m -CONFIG_RAPIDIO_MPORT_CDEV=m - -# -# RapidIO Switch drivers -# -CONFIG_RAPIDIO_TSI57X=m -CONFIG_RAPIDIO_CPS_XX=m -CONFIG_RAPIDIO_TSI568=m -CONFIG_RAPIDIO_CPS_GEN2=m -CONFIG_RAPIDIO_RXS_GEN3=m -# end of RapidIO Switch drivers - -# -# Generic Driver Options -# -# CONFIG_UEVENT_HELPER is not set -CONFIG_DEVTMPFS=y -CONFIG_DEVTMPFS_MOUNT=y -CONFIG_STANDALONE=y -CONFIG_PREVENT_FIRMWARE_BUILD=y - -# -# Firmware loader -# -CONFIG_FW_LOADER=y -CONFIG_FW_LOADER_PAGED_BUF=y -CONFIG_EXTRA_FIRMWARE="" -# CONFIG_FW_LOADER_USER_HELPER is not set -CONFIG_FW_LOADER_COMPRESS=y -CONFIG_FW_CACHE=y -# end of Firmware loader - -CONFIG_WANT_DEV_COREDUMP=y -CONFIG_ALLOW_DEV_COREDUMP=y -CONFIG_DEV_COREDUMP=y -# CONFIG_DEBUG_DRIVER is not set -# CONFIG_DEBUG_DEVRES is not set -# CONFIG_DEBUG_TEST_DRIVER_REMOVE is not set -CONFIG_HMEM_REPORTING=y -# CONFIG_TEST_ASYNC_DRIVER_PROBE is not set -CONFIG_SYS_HYPERVISOR=y -CONFIG_GENERIC_CPU_AUTOPROBE=y -CONFIG_GENERIC_CPU_VULNERABILITIES=y -CONFIG_REGMAP=y -CONFIG_REGMAP_I2C=y -CONFIG_REGMAP_SLIMBUS=m -CONFIG_REGMAP_SPI=y -CONFIG_REGMAP_SPMI=m -CONFIG_REGMAP_W1=m -CONFIG_REGMAP_MMIO=y -CONFIG_REGMAP_IRQ=y -CONFIG_REGMAP_SOUNDWIRE=m -CONFIG_REGMAP_SCCB=m -CONFIG_REGMAP_I3C=m -CONFIG_DMA_SHARED_BUFFER=y -# CONFIG_DMA_FENCE_TRACE is not set -# end of Generic Driver Options - -# -# Bus devices -# -CONFIG_MOXTET=m -CONFIG_SIMPLE_PM_BUS=y -CONFIG_MHI_BUS=m -# end of Bus devices - -CONFIG_CONNECTOR=y -CONFIG_PROC_EVENTS=y -CONFIG_GNSS=m -CONFIG_GNSS_SERIAL=m -CONFIG_GNSS_MTK_SERIAL=m -CONFIG_GNSS_SIRF_SERIAL=m -CONFIG_GNSS_UBX_SERIAL=m -CONFIG_MTD=m -CONFIG_MTD_TESTS=m - -# -# Partition parsers -# -CONFIG_MTD_AR7_PARTS=m -CONFIG_MTD_CMDLINE_PARTS=m -CONFIG_MTD_OF_PARTS=m -CONFIG_MTD_REDBOOT_PARTS=m -CONFIG_MTD_REDBOOT_DIRECTORY_BLOCK=-1 -# CONFIG_MTD_REDBOOT_PARTS_UNALLOCATED is not set -# 
CONFIG_MTD_REDBOOT_PARTS_READONLY is not set -# end of Partition parsers - -# -# User Modules And Translation Layers -# -CONFIG_MTD_BLKDEVS=m -CONFIG_MTD_BLOCK=m -CONFIG_MTD_BLOCK_RO=m -CONFIG_FTL=m -CONFIG_NFTL=m -CONFIG_NFTL_RW=y -CONFIG_INFTL=m -CONFIG_RFD_FTL=m -CONFIG_SSFDC=m -CONFIG_SM_FTL=m -CONFIG_MTD_OOPS=m -CONFIG_MTD_PSTORE=m -CONFIG_MTD_SWAP=m -CONFIG_MTD_PARTITIONED_MASTER=y - -# -# RAM/ROM/Flash chip drivers -# -CONFIG_MTD_CFI=m -CONFIG_MTD_JEDECPROBE=m -CONFIG_MTD_GEN_PROBE=m -# CONFIG_MTD_CFI_ADV_OPTIONS is not set -CONFIG_MTD_MAP_BANK_WIDTH_1=y -CONFIG_MTD_MAP_BANK_WIDTH_2=y -CONFIG_MTD_MAP_BANK_WIDTH_4=y -CONFIG_MTD_CFI_I1=y -CONFIG_MTD_CFI_I2=y -CONFIG_MTD_CFI_INTELEXT=m -CONFIG_MTD_CFI_AMDSTD=m -CONFIG_MTD_CFI_STAA=m -CONFIG_MTD_CFI_UTIL=m -CONFIG_MTD_RAM=m -CONFIG_MTD_ROM=m -CONFIG_MTD_ABSENT=m -# end of RAM/ROM/Flash chip drivers - -# -# Mapping drivers for chip access -# -CONFIG_MTD_COMPLEX_MAPPINGS=y -CONFIG_MTD_PHYSMAP=m -# CONFIG_MTD_PHYSMAP_COMPAT is not set -CONFIG_MTD_PHYSMAP_OF=y -CONFIG_MTD_PHYSMAP_VERSATILE=y -CONFIG_MTD_PHYSMAP_GEMINI=y -CONFIG_MTD_PHYSMAP_GPIO_ADDR=y -CONFIG_MTD_SBC_GXX=m -CONFIG_MTD_AMD76XROM=m -CONFIG_MTD_ICHXROM=m -CONFIG_MTD_ESB2ROM=m -CONFIG_MTD_CK804XROM=m -CONFIG_MTD_SCB2_FLASH=m -CONFIG_MTD_NETtel=m -CONFIG_MTD_L440GX=m -CONFIG_MTD_PCI=m -CONFIG_MTD_PCMCIA=m -# CONFIG_MTD_PCMCIA_ANONYMOUS is not set -CONFIG_MTD_INTEL_VR_NOR=m -CONFIG_MTD_PLATRAM=m -# end of Mapping drivers for chip access - -# -# Self-contained MTD device drivers -# -CONFIG_MTD_PMC551=m -# CONFIG_MTD_PMC551_BUGFIX is not set -# CONFIG_MTD_PMC551_DEBUG is not set -CONFIG_MTD_DATAFLASH=m -# CONFIG_MTD_DATAFLASH_WRITE_VERIFY is not set -CONFIG_MTD_DATAFLASH_OTP=y -CONFIG_MTD_MCHP23K256=m -CONFIG_MTD_SST25L=m -CONFIG_MTD_SLRAM=m -CONFIG_MTD_PHRAM=m -CONFIG_MTD_MTDRAM=m -CONFIG_MTDRAM_TOTAL_SIZE=4096 -CONFIG_MTDRAM_ERASE_SIZE=128 -CONFIG_MTD_BLOCK2MTD=m - -# -# Disk-On-Chip Device Drivers -# -CONFIG_MTD_DOCG3=m -CONFIG_BCH_CONST_M=14 
-CONFIG_BCH_CONST_T=4 -# end of Self-contained MTD device drivers - -# -# NAND -# -CONFIG_MTD_NAND_CORE=m -CONFIG_MTD_ONENAND=m -# CONFIG_MTD_ONENAND_VERIFY_WRITE is not set -CONFIG_MTD_ONENAND_GENERIC=m -CONFIG_MTD_ONENAND_OTP=y -CONFIG_MTD_ONENAND_2X_PROGRAM=y -CONFIG_MTD_NAND_ECC_SW_HAMMING=m -CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC=y -CONFIG_MTD_RAW_NAND=m -CONFIG_MTD_NAND_ECC_SW_BCH=y - -# -# Raw/parallel NAND flash controllers -# -CONFIG_MTD_NAND_DENALI=m -CONFIG_MTD_NAND_DENALI_PCI=m -CONFIG_MTD_NAND_DENALI_DT=m -CONFIG_MTD_NAND_CAFE=m -CONFIG_MTD_NAND_MXIC=m -CONFIG_MTD_NAND_GPIO=m -CONFIG_MTD_NAND_PLATFORM=m -CONFIG_MTD_NAND_CADENCE=m -CONFIG_MTD_NAND_ARASAN=m - -# -# Misc -# -CONFIG_MTD_SM_COMMON=m -CONFIG_MTD_NAND_NANDSIM=m -CONFIG_MTD_NAND_RICOH=m -CONFIG_MTD_NAND_DISKONCHIP=m -# CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADVANCED is not set -CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADDRESS=0 -CONFIG_MTD_NAND_DISKONCHIP_BBTWRITE=y -CONFIG_MTD_SPI_NAND=m -# end of NAND - -# -# LPDDR & LPDDR2 PCM memory drivers -# -CONFIG_MTD_LPDDR=m -CONFIG_MTD_QINFO_PROBE=m -# end of LPDDR & LPDDR2 PCM memory drivers - -CONFIG_MTD_SPI_NOR=m -CONFIG_MTD_SPI_NOR_USE_4K_SECTORS=y -CONFIG_SPI_INTEL_SPI=m -CONFIG_SPI_INTEL_SPI_PCI=m -CONFIG_SPI_INTEL_SPI_PLATFORM=m -CONFIG_MTD_UBI=m -CONFIG_MTD_UBI_WL_THRESHOLD=4096 -CONFIG_MTD_UBI_BEB_LIMIT=20 -CONFIG_MTD_UBI_FASTMAP=y -CONFIG_MTD_UBI_GLUEBI=m -CONFIG_MTD_UBI_BLOCK=y -CONFIG_MTD_HYPERBUS=m -CONFIG_DTC=y -CONFIG_OF=y -# CONFIG_OF_UNITTEST is not set -CONFIG_OF_FLATTREE=y -CONFIG_OF_EARLY_FLATTREE=y -CONFIG_OF_KOBJ=y -CONFIG_OF_DYNAMIC=y -CONFIG_OF_ADDRESS=y -CONFIG_OF_IRQ=y -CONFIG_OF_NET=y -CONFIG_OF_MDIO=m -CONFIG_OF_RESERVED_MEM=y -CONFIG_OF_RESOLVE=y -CONFIG_OF_OVERLAY=y -CONFIG_ARCH_MIGHT_HAVE_PC_PARPORT=y -CONFIG_PARPORT=m -CONFIG_PARPORT_PC=m -CONFIG_PARPORT_SERIAL=m -CONFIG_PARPORT_PC_FIFO=y -CONFIG_PARPORT_PC_SUPERIO=y -CONFIG_PARPORT_PC_PCMCIA=m -CONFIG_PARPORT_AX88796=m -CONFIG_PARPORT_1284=y -CONFIG_PARPORT_NOT_PC=y -CONFIG_PNP=y 
-CONFIG_PNP_DEBUG_MESSAGES=y - -# -# Protocols -# -CONFIG_PNPACPI=y -CONFIG_BLK_DEV=y -# CONFIG_BLK_DEV_NULL_BLK is not set -CONFIG_BLK_DEV_FD=m -CONFIG_CDROM=m -# CONFIG_PARIDE is not set -CONFIG_BLK_DEV_PCIESSD_MTIP32XX=m -CONFIG_ZRAM=m -CONFIG_ZRAM_WRITEBACK=y -# CONFIG_ZRAM_MEMORY_TRACKING is not set -CONFIG_BLK_DEV_UMEM=m -CONFIG_BLK_DEV_LOOP=m -CONFIG_BLK_DEV_LOOP_MIN_COUNT=8 -CONFIG_BLK_DEV_CRYPTOLOOP=m -CONFIG_BLK_DEV_DRBD=m -# CONFIG_DRBD_FAULT_INJECTION is not set -CONFIG_BLK_DEV_NBD=m -CONFIG_BLK_DEV_SKD=m -CONFIG_BLK_DEV_SX8=m -CONFIG_BLK_DEV_RAM=m -CONFIG_BLK_DEV_RAM_COUNT=16 -CONFIG_BLK_DEV_RAM_SIZE=16384 -CONFIG_CDROM_PKTCDVD=m -CONFIG_CDROM_PKTCDVD_BUFFERS=8 -# CONFIG_CDROM_PKTCDVD_WCACHE is not set -CONFIG_ATA_OVER_ETH=m -CONFIG_XEN_BLKDEV_FRONTEND=m -CONFIG_XEN_BLKDEV_BACKEND=m -CONFIG_VIRTIO_BLK=m -CONFIG_BLK_DEV_RBD=m -CONFIG_BLK_DEV_RSXX=m -CONFIG_BLK_DEV_RNBD=y -CONFIG_BLK_DEV_RNBD_CLIENT=m -CONFIG_BLK_DEV_RNBD_SERVER=m - -# -# NVME Support -# -CONFIG_NVME_CORE=y -CONFIG_BLK_DEV_NVME=y -CONFIG_NVME_MULTIPATH=y -CONFIG_NVME_HWMON=y -CONFIG_NVME_FABRICS=m -CONFIG_NVME_RDMA=m -CONFIG_NVME_FC=m -CONFIG_NVME_TCP=m -CONFIG_NVME_TARGET=m -# CONFIG_NVME_TARGET_PASSTHRU is not set -CONFIG_NVME_TARGET_LOOP=m -CONFIG_NVME_TARGET_RDMA=m -CONFIG_NVME_TARGET_FC=m -CONFIG_NVME_TARGET_FCLOOP=m -CONFIG_NVME_TARGET_TCP=m -# end of NVME Support - -# -# Misc devices -# -CONFIG_SENSORS_LIS3LV02D=m -CONFIG_AD525X_DPOT=m -CONFIG_AD525X_DPOT_I2C=m -CONFIG_AD525X_DPOT_SPI=m -# CONFIG_DUMMY_IRQ is not set -CONFIG_IBM_ASM=m -CONFIG_PHANTOM=m -CONFIG_TIFM_CORE=m -CONFIG_TIFM_7XX1=m -CONFIG_ICS932S401=m -CONFIG_ENCLOSURE_SERVICES=m -CONFIG_HP_ILO=m -CONFIG_APDS9802ALS=m -CONFIG_ISL29003=m -CONFIG_ISL29020=m -CONFIG_SENSORS_TSL2550=m -CONFIG_SENSORS_BH1770=m -CONFIG_SENSORS_APDS990X=m -CONFIG_HMC6352=m -CONFIG_DS1682=m -CONFIG_VMWARE_BALLOON=m -CONFIG_LATTICE_ECP3_CONFIG=m -# CONFIG_SRAM is not set -CONFIG_PCI_ENDPOINT_TEST=m -CONFIG_XILINX_SDFEC=m -CONFIG_MISC_RTSX=m 
-CONFIG_PVPANIC=m -CONFIG_C2PORT=m -CONFIG_C2PORT_DURAMAR_2150=m - -# -# EEPROM support -# -CONFIG_EEPROM_AT24=m -# CONFIG_EEPROM_AT25 is not set -CONFIG_EEPROM_LEGACY=m -CONFIG_EEPROM_MAX6875=m -CONFIG_EEPROM_93CX6=m -# CONFIG_EEPROM_93XX46 is not set -CONFIG_EEPROM_IDT_89HPESX=m -CONFIG_EEPROM_EE1004=m -# end of EEPROM support - -CONFIG_CB710_CORE=m -# CONFIG_CB710_DEBUG is not set -CONFIG_CB710_DEBUG_ASSUMPTIONS=y - -# -# Texas Instruments shared transport line discipline -# -CONFIG_TI_ST=m -# end of Texas Instruments shared transport line discipline - -CONFIG_SENSORS_LIS3_I2C=m -CONFIG_ALTERA_STAPL=m -CONFIG_INTEL_MEI=m -CONFIG_INTEL_MEI_ME=m -CONFIG_INTEL_MEI_TXE=m -CONFIG_INTEL_MEI_HDCP=m -CONFIG_VMWARE_VMCI=m - -# -# Intel MIC & related support -# -CONFIG_INTEL_MIC_BUS=m -CONFIG_SCIF_BUS=m -CONFIG_VOP_BUS=m -CONFIG_INTEL_MIC_HOST=m -CONFIG_INTEL_MIC_CARD=m -CONFIG_SCIF=m -CONFIG_MIC_COSM=m -CONFIG_VOP=m -# end of Intel MIC & related support - -CONFIG_GENWQE=m -CONFIG_GENWQE_PLATFORM_ERROR_RECOVERY=0 -CONFIG_ECHO=m -CONFIG_MISC_ALCOR_PCI=m -CONFIG_MISC_RTSX_PCI=m -CONFIG_MISC_RTSX_USB=m -CONFIG_HABANA_AI=m -CONFIG_UACCE=m -# end of Misc devices - -CONFIG_HAVE_IDE=y -# CONFIG_IDE is not set - -# -# SCSI device support -# -CONFIG_SCSI_MOD=y -CONFIG_RAID_ATTRS=m -CONFIG_SCSI=y -CONFIG_SCSI_DMA=y -CONFIG_SCSI_NETLINK=y -CONFIG_SCSI_PROC_FS=y - -# -# SCSI support type (disk, tape, CD-ROM) -# -CONFIG_BLK_DEV_SD=y -CONFIG_CHR_DEV_ST=m -CONFIG_BLK_DEV_SR=m -CONFIG_CHR_DEV_SG=m -CONFIG_CHR_DEV_SCH=m -CONFIG_SCSI_ENCLOSURE=m -CONFIG_SCSI_CONSTANTS=y -CONFIG_SCSI_LOGGING=y -CONFIG_SCSI_SCAN_ASYNC=y - -# -# SCSI Transports -# -CONFIG_SCSI_SPI_ATTRS=m -CONFIG_SCSI_FC_ATTRS=m -CONFIG_SCSI_ISCSI_ATTRS=m -CONFIG_SCSI_SAS_ATTRS=m -CONFIG_SCSI_SAS_LIBSAS=m -CONFIG_SCSI_SAS_ATA=y -CONFIG_SCSI_SAS_HOST_SMP=y -CONFIG_SCSI_SRP_ATTRS=m -# end of SCSI Transports - -CONFIG_SCSI_LOWLEVEL=y -CONFIG_ISCSI_TCP=m -CONFIG_ISCSI_BOOT_SYSFS=m -CONFIG_SCSI_CXGB3_ISCSI=m 
-CONFIG_SCSI_CXGB4_ISCSI=m -CONFIG_SCSI_BNX2_ISCSI=m -CONFIG_SCSI_BNX2X_FCOE=m -CONFIG_BE2ISCSI=m -CONFIG_BLK_DEV_3W_XXXX_RAID=m -CONFIG_SCSI_HPSA=m -CONFIG_SCSI_3W_9XXX=m -CONFIG_SCSI_3W_SAS=m -CONFIG_SCSI_ACARD=m -CONFIG_SCSI_AACRAID=m -CONFIG_SCSI_AIC7XXX=m -CONFIG_AIC7XXX_CMDS_PER_DEVICE=32 -CONFIG_AIC7XXX_RESET_DELAY_MS=15000 -CONFIG_AIC7XXX_DEBUG_ENABLE=y -CONFIG_AIC7XXX_DEBUG_MASK=0 -CONFIG_AIC7XXX_REG_PRETTY_PRINT=y -CONFIG_SCSI_AIC79XX=m -CONFIG_AIC79XX_CMDS_PER_DEVICE=32 -CONFIG_AIC79XX_RESET_DELAY_MS=15000 -CONFIG_AIC79XX_DEBUG_ENABLE=y -CONFIG_AIC79XX_DEBUG_MASK=0 -CONFIG_AIC79XX_REG_PRETTY_PRINT=y -CONFIG_SCSI_AIC94XX=m -CONFIG_AIC94XX_DEBUG=y -CONFIG_SCSI_MVSAS=m -CONFIG_SCSI_MVSAS_DEBUG=y -CONFIG_SCSI_MVSAS_TASKLET=y -CONFIG_SCSI_MVUMI=m -CONFIG_SCSI_DPT_I2O=m -CONFIG_SCSI_ADVANSYS=m -CONFIG_SCSI_ARCMSR=m -CONFIG_SCSI_ESAS2R=m -CONFIG_MEGARAID_NEWGEN=y -CONFIG_MEGARAID_MM=m -CONFIG_MEGARAID_MAILBOX=m -CONFIG_MEGARAID_LEGACY=m -CONFIG_MEGARAID_SAS=m -CONFIG_SCSI_MPT3SAS=m -CONFIG_SCSI_MPT2SAS_MAX_SGE=128 -CONFIG_SCSI_MPT3SAS_MAX_SGE=128 -CONFIG_SCSI_MPT2SAS=m -CONFIG_SCSI_SMARTPQI=m -CONFIG_SCSI_UFSHCD=m -CONFIG_SCSI_UFSHCD_PCI=m -# CONFIG_SCSI_UFS_DWC_TC_PCI is not set -CONFIG_SCSI_UFSHCD_PLATFORM=m -CONFIG_SCSI_UFS_CDNS_PLATFORM=m -# CONFIG_SCSI_UFS_DWC_TC_PLATFORM is not set -CONFIG_SCSI_UFS_BSG=y -# CONFIG_SCSI_UFS_CRYPTO is not set -CONFIG_SCSI_HPTIOP=m -CONFIG_SCSI_BUSLOGIC=m -CONFIG_SCSI_FLASHPOINT=y -CONFIG_SCSI_MYRB=m -CONFIG_SCSI_MYRS=m -CONFIG_VMWARE_PVSCSI=m -CONFIG_XEN_SCSI_FRONTEND=m -CONFIG_HYPERV_STORAGE=m -CONFIG_LIBFC=m -CONFIG_LIBFCOE=m -CONFIG_FCOE=m -CONFIG_FCOE_FNIC=m -CONFIG_SCSI_SNIC=m -# CONFIG_SCSI_SNIC_DEBUG_FS is not set -CONFIG_SCSI_DMX3191D=m -CONFIG_SCSI_FDOMAIN=m -CONFIG_SCSI_FDOMAIN_PCI=m -CONFIG_SCSI_GDTH=m -CONFIG_SCSI_ISCI=m -CONFIG_SCSI_IPS=m -CONFIG_SCSI_INITIO=m -CONFIG_SCSI_INIA100=m -CONFIG_SCSI_PPA=m -CONFIG_SCSI_IMM=m -# CONFIG_SCSI_IZIP_EPP16 is not set -# CONFIG_SCSI_IZIP_SLOW_CTR is not set 
-CONFIG_SCSI_STEX=m -CONFIG_SCSI_SYM53C8XX_2=m -CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=1 -CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16 -CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64 -CONFIG_SCSI_SYM53C8XX_MMIO=y -CONFIG_SCSI_IPR=m -CONFIG_SCSI_IPR_TRACE=y -CONFIG_SCSI_IPR_DUMP=y -CONFIG_SCSI_QLOGIC_1280=m -CONFIG_SCSI_QLA_FC=m -CONFIG_TCM_QLA2XXX=m -# CONFIG_TCM_QLA2XXX_DEBUG is not set -CONFIG_SCSI_QLA_ISCSI=m -CONFIG_QEDI=m -CONFIG_QEDF=m -CONFIG_SCSI_LPFC=m -# CONFIG_SCSI_LPFC_DEBUG_FS is not set -CONFIG_SCSI_DC395x=m -CONFIG_SCSI_AM53C974=m -CONFIG_SCSI_WD719X=m -CONFIG_SCSI_DEBUG=m -CONFIG_SCSI_PMCRAID=m -CONFIG_SCSI_PM8001=m -CONFIG_SCSI_BFA_FC=m -CONFIG_SCSI_VIRTIO=m -CONFIG_SCSI_CHELSIO_FCOE=m -CONFIG_SCSI_LOWLEVEL_PCMCIA=y -CONFIG_PCMCIA_AHA152X=m -CONFIG_PCMCIA_FDOMAIN=m -CONFIG_PCMCIA_QLOGIC=m -CONFIG_PCMCIA_SYM53C500=m -CONFIG_SCSI_DH=y -CONFIG_SCSI_DH_RDAC=m -CONFIG_SCSI_DH_HP_SW=m -CONFIG_SCSI_DH_EMC=m -CONFIG_SCSI_DH_ALUA=m -# end of SCSI device support - -CONFIG_ATA=y -CONFIG_SATA_HOST=y -CONFIG_PATA_TIMINGS=y -CONFIG_ATA_VERBOSE_ERROR=y -CONFIG_ATA_FORCE=y -CONFIG_ATA_ACPI=y -CONFIG_SATA_ZPODD=y -CONFIG_SATA_PMP=y - -# -# Controllers with non-SFF native interface -# -CONFIG_SATA_AHCI=y -CONFIG_SATA_MOBILE_LPM_POLICY=3 -CONFIG_SATA_AHCI_PLATFORM=m -CONFIG_AHCI_CEVA=m -CONFIG_AHCI_QORIQ=m -CONFIG_SATA_INIC162X=m -CONFIG_SATA_ACARD_AHCI=m -CONFIG_SATA_SIL24=m -CONFIG_ATA_SFF=y - -# -# SFF controllers with custom DMA interface -# -CONFIG_PDC_ADMA=m -CONFIG_SATA_QSTOR=m -CONFIG_SATA_SX4=m -CONFIG_ATA_BMDMA=y - -# -# SATA SFF controllers with BMDMA -# -CONFIG_ATA_PIIX=m -CONFIG_SATA_DWC=m -# CONFIG_SATA_DWC_OLD_DMA is not set -# CONFIG_SATA_DWC_DEBUG is not set -CONFIG_SATA_MV=m -CONFIG_SATA_NV=m -CONFIG_SATA_PROMISE=m -CONFIG_SATA_SIL=m -CONFIG_SATA_SIS=m -CONFIG_SATA_SVW=m -CONFIG_SATA_ULI=m -CONFIG_SATA_VIA=m -CONFIG_SATA_VITESSE=m - -# -# PATA SFF controllers with BMDMA -# -CONFIG_PATA_ALI=m -CONFIG_PATA_AMD=m -CONFIG_PATA_ARTOP=m -CONFIG_PATA_ATIIXP=m 
-CONFIG_PATA_ATP867X=m -CONFIG_PATA_CMD64X=m -CONFIG_PATA_CYPRESS=m -CONFIG_PATA_EFAR=m -CONFIG_PATA_HPT366=m -CONFIG_PATA_HPT37X=m -CONFIG_PATA_HPT3X2N=m -CONFIG_PATA_HPT3X3=m -CONFIG_PATA_HPT3X3_DMA=y -CONFIG_PATA_IT8213=m -CONFIG_PATA_IT821X=m -CONFIG_PATA_JMICRON=m -CONFIG_PATA_MARVELL=m -CONFIG_PATA_NETCELL=m -CONFIG_PATA_NINJA32=m -CONFIG_PATA_NS87415=m -CONFIG_PATA_OLDPIIX=m -CONFIG_PATA_OPTIDMA=m -CONFIG_PATA_PDC2027X=m -CONFIG_PATA_PDC_OLD=m -CONFIG_PATA_RADISYS=m -CONFIG_PATA_RDC=m -CONFIG_PATA_SCH=m -CONFIG_PATA_SERVERWORKS=m -CONFIG_PATA_SIL680=m -CONFIG_PATA_SIS=m -CONFIG_PATA_TOSHIBA=m -CONFIG_PATA_TRIFLEX=m -CONFIG_PATA_VIA=m -CONFIG_PATA_WINBOND=m - -# -# PIO-only SFF controllers -# -CONFIG_PATA_CMD640_PCI=m -CONFIG_PATA_MPIIX=m -CONFIG_PATA_NS87410=m -CONFIG_PATA_OPTI=m -CONFIG_PATA_PCMCIA=m -# CONFIG_PATA_PLATFORM is not set -CONFIG_PATA_RZ1000=m - -# -# Generic fallback / legacy drivers -# -CONFIG_PATA_ACPI=m -CONFIG_ATA_GENERIC=m -CONFIG_PATA_LEGACY=m -CONFIG_MD=y -CONFIG_BLK_DEV_MD=m -CONFIG_MD_LINEAR=m -CONFIG_MD_RAID0=m -CONFIG_MD_RAID1=m -CONFIG_MD_RAID10=m -CONFIG_MD_RAID456=m -CONFIG_MD_MULTIPATH=m -CONFIG_MD_FAULTY=m -CONFIG_MD_CLUSTER=m -CONFIG_BCACHE=m -# CONFIG_BCACHE_DEBUG is not set -# CONFIG_BCACHE_CLOSURES_DEBUG is not set -CONFIG_BCACHE_ASYNC_REGISTRATION=y -CONFIG_BLK_DEV_DM_BUILTIN=y -CONFIG_BLK_DEV_DM=m -CONFIG_DM_DEBUG=y -CONFIG_DM_BUFIO=m -# CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING is not set -CONFIG_DM_BIO_PRISON=m -CONFIG_DM_PERSISTENT_DATA=m -CONFIG_DM_UNSTRIPED=m -CONFIG_DM_CRYPT=m -CONFIG_DM_SNAPSHOT=m -CONFIG_DM_THIN_PROVISIONING=m -CONFIG_DM_CACHE=m -CONFIG_DM_CACHE_SMQ=m -CONFIG_DM_WRITECACHE=m -CONFIG_DM_EBS=m -CONFIG_DM_ERA=m -CONFIG_DM_CLONE=m -CONFIG_DM_MIRROR=m -CONFIG_DM_LOG_USERSPACE=m -CONFIG_DM_RAID=m -CONFIG_DM_ZERO=m -CONFIG_DM_MULTIPATH=m -CONFIG_DM_MULTIPATH_QL=m -CONFIG_DM_MULTIPATH_ST=m -CONFIG_DM_MULTIPATH_HST=m -CONFIG_DM_DELAY=m -CONFIG_DM_DUST=m -CONFIG_DM_UEVENT=y -CONFIG_DM_FLAKEY=m 
-CONFIG_DM_VERITY=m -CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG=y -CONFIG_DM_VERITY_FEC=y -CONFIG_DM_SWITCH=m -CONFIG_DM_LOG_WRITES=m -CONFIG_DM_INTEGRITY=m -CONFIG_DM_ZONED=m -CONFIG_TARGET_CORE=m -CONFIG_TCM_IBLOCK=m -CONFIG_TCM_FILEIO=m -CONFIG_TCM_PSCSI=m -CONFIG_TCM_USER2=m -CONFIG_LOOPBACK_TARGET=m -CONFIG_TCM_FC=m -CONFIG_ISCSI_TARGET=m -CONFIG_ISCSI_TARGET_CXGB4=m -CONFIG_SBP_TARGET=m -CONFIG_FUSION=y -CONFIG_FUSION_SPI=m -CONFIG_FUSION_FC=m -CONFIG_FUSION_SAS=m -CONFIG_FUSION_MAX_SGE=128 -CONFIG_FUSION_CTL=m -CONFIG_FUSION_LAN=m -# CONFIG_FUSION_LOGGING is not set - -# -# IEEE 1394 (FireWire) support -# -CONFIG_FIREWIRE=m -CONFIG_FIREWIRE_OHCI=m -CONFIG_FIREWIRE_SBP2=m -CONFIG_FIREWIRE_NET=m -CONFIG_FIREWIRE_NOSY=m -# end of IEEE 1394 (FireWire) support - -CONFIG_MACINTOSH_DRIVERS=y -CONFIG_MAC_EMUMOUSEBTN=m -CONFIG_NETDEVICES=y -CONFIG_MII=m -CONFIG_NET_CORE=y -CONFIG_BONDING=m -CONFIG_DUMMY=m -CONFIG_WIREGUARD=m -# CONFIG_WIREGUARD_DEBUG is not set -CONFIG_EQUALIZER=m -CONFIG_NET_FC=y -CONFIG_IFB=m -CONFIG_NET_TEAM=m -CONFIG_NET_TEAM_MODE_BROADCAST=m -CONFIG_NET_TEAM_MODE_ROUNDROBIN=m -CONFIG_NET_TEAM_MODE_RANDOM=m -CONFIG_NET_TEAM_MODE_ACTIVEBACKUP=m -CONFIG_NET_TEAM_MODE_LOADBALANCE=m -CONFIG_MACVLAN=m -CONFIG_MACVTAP=m -CONFIG_IPVLAN_L3S=y -CONFIG_IPVLAN=m -CONFIG_IPVTAP=m -CONFIG_VXLAN=m -CONFIG_GENEVE=m -CONFIG_BAREUDP=m -CONFIG_GTP=m -CONFIG_MACSEC=m -CONFIG_NETCONSOLE=m -CONFIG_NETCONSOLE_DYNAMIC=y -CONFIG_NETPOLL=y -CONFIG_NET_POLL_CONTROLLER=y -CONFIG_NTB_NETDEV=m -CONFIG_RIONET=m -CONFIG_RIONET_TX_SIZE=128 -CONFIG_RIONET_RX_SIZE=128 -CONFIG_TUN=m -CONFIG_TAP=m -# CONFIG_TUN_VNET_CROSS_LE is not set -CONFIG_VETH=m -CONFIG_VIRTIO_NET=m -CONFIG_NLMON=m -CONFIG_NET_VRF=m -CONFIG_VSOCKMON=m -CONFIG_SUNGEM_PHY=m -# CONFIG_ARCNET is not set -CONFIG_ATM_DRIVERS=y -# CONFIG_ATM_DUMMY is not set -CONFIG_ATM_TCP=m -CONFIG_ATM_LANAI=m -CONFIG_ATM_ENI=m -# CONFIG_ATM_ENI_DEBUG is not set -# CONFIG_ATM_ENI_TUNE_BURST is not set -CONFIG_ATM_FIRESTREAM=m 
-CONFIG_ATM_ZATM=m -# CONFIG_ATM_ZATM_DEBUG is not set -CONFIG_ATM_NICSTAR=m -# CONFIG_ATM_NICSTAR_USE_SUNI is not set -# CONFIG_ATM_NICSTAR_USE_IDT77105 is not set -CONFIG_ATM_IDT77252=m -# CONFIG_ATM_IDT77252_DEBUG is not set -# CONFIG_ATM_IDT77252_RCV_ALL is not set -CONFIG_ATM_IDT77252_USE_SUNI=y -CONFIG_ATM_AMBASSADOR=m -# CONFIG_ATM_AMBASSADOR_DEBUG is not set -CONFIG_ATM_HORIZON=m -# CONFIG_ATM_HORIZON_DEBUG is not set -CONFIG_ATM_IA=m -# CONFIG_ATM_IA_DEBUG is not set -CONFIG_ATM_FORE200E=m -CONFIG_ATM_FORE200E_USE_TASKLET=y -CONFIG_ATM_FORE200E_TX_RETRY=16 -CONFIG_ATM_FORE200E_DEBUG=0 -CONFIG_ATM_HE=m -CONFIG_ATM_HE_USE_SUNI=y -CONFIG_ATM_SOLOS=m -CONFIG_CAIF_DRIVERS=y -CONFIG_CAIF_TTY=m -CONFIG_CAIF_SPI_SLAVE=m -CONFIG_CAIF_SPI_SYNC=y -CONFIG_CAIF_HSI=m -CONFIG_CAIF_VIRTIO=m - -# -# Distributed Switch Architecture drivers -# -CONFIG_B53=m -# CONFIG_B53_SPI_DRIVER is not set -CONFIG_B53_MDIO_DRIVER=m -CONFIG_B53_MMAP_DRIVER=m -CONFIG_B53_SRAB_DRIVER=m -CONFIG_B53_SERDES=m -CONFIG_NET_DSA_BCM_SF2=m -CONFIG_NET_DSA_LOOP=m -CONFIG_NET_DSA_LANTIQ_GSWIP=m -CONFIG_NET_DSA_MT7530=m -CONFIG_NET_DSA_MV88E6060=m -CONFIG_NET_DSA_MICROCHIP_KSZ_COMMON=m -CONFIG_NET_DSA_MICROCHIP_KSZ9477=m -CONFIG_NET_DSA_MICROCHIP_KSZ9477_I2C=m -CONFIG_NET_DSA_MICROCHIP_KSZ9477_SPI=m -CONFIG_NET_DSA_MICROCHIP_KSZ8795=m -CONFIG_NET_DSA_MICROCHIP_KSZ8795_SPI=m -CONFIG_NET_DSA_MV88E6XXX=m -CONFIG_NET_DSA_MV88E6XXX_GLOBAL2=y -CONFIG_NET_DSA_MV88E6XXX_PTP=y -CONFIG_NET_DSA_AR9331=m -CONFIG_NET_DSA_SJA1105=m -CONFIG_NET_DSA_SJA1105_PTP=y -CONFIG_NET_DSA_SJA1105_TAS=y -CONFIG_NET_DSA_SJA1105_VL=y -CONFIG_NET_DSA_QCA8K=m -CONFIG_NET_DSA_REALTEK_SMI=m -CONFIG_NET_DSA_SMSC_LAN9303=m -CONFIG_NET_DSA_SMSC_LAN9303_I2C=m -CONFIG_NET_DSA_SMSC_LAN9303_MDIO=m -CONFIG_NET_DSA_VITESSE_VSC73XX=m -CONFIG_NET_DSA_VITESSE_VSC73XX_SPI=m -CONFIG_NET_DSA_VITESSE_VSC73XX_PLATFORM=m -# end of Distributed Switch Architecture drivers - -CONFIG_ETHERNET=y -CONFIG_MDIO=m -CONFIG_NET_VENDOR_3COM=y 
-CONFIG_PCMCIA_3C574=m -CONFIG_PCMCIA_3C589=m -CONFIG_VORTEX=m -CONFIG_TYPHOON=m -CONFIG_NET_VENDOR_ADAPTEC=y -CONFIG_ADAPTEC_STARFIRE=m -CONFIG_NET_VENDOR_AGERE=y -CONFIG_ET131X=m -CONFIG_NET_VENDOR_ALACRITECH=y -CONFIG_SLICOSS=m -CONFIG_NET_VENDOR_ALTEON=y -CONFIG_ACENIC=m -# CONFIG_ACENIC_OMIT_TIGON_I is not set -CONFIG_ALTERA_TSE=m -CONFIG_NET_VENDOR_AMAZON=y -CONFIG_ENA_ETHERNET=m -CONFIG_NET_VENDOR_AMD=y -CONFIG_AMD8111_ETH=m -CONFIG_PCNET32=m -CONFIG_PCMCIA_NMCLAN=m -CONFIG_AMD_XGBE=m -CONFIG_AMD_XGBE_DCB=y -CONFIG_AMD_XGBE_HAVE_ECC=y -CONFIG_NET_VENDOR_AQUANTIA=y -CONFIG_AQTION=m -CONFIG_NET_VENDOR_ARC=y -CONFIG_NET_VENDOR_ATHEROS=y -CONFIG_ATL2=m -CONFIG_ATL1=m -CONFIG_ATL1E=m -CONFIG_ATL1C=m -CONFIG_ALX=m -CONFIG_NET_VENDOR_AURORA=y -CONFIG_AURORA_NB8800=m -CONFIG_NET_VENDOR_BROADCOM=y -CONFIG_B44=m -CONFIG_B44_PCI_AUTOSELECT=y -CONFIG_B44_PCICORE_AUTOSELECT=y -CONFIG_B44_PCI=y -CONFIG_BCMGENET=m -CONFIG_BNX2=m -CONFIG_CNIC=m -CONFIG_TIGON3=m -CONFIG_TIGON3_HWMON=y -CONFIG_BNX2X=m -CONFIG_BNX2X_SRIOV=y -CONFIG_SYSTEMPORT=m -CONFIG_BNXT=m -CONFIG_BNXT_SRIOV=y -CONFIG_BNXT_FLOWER_OFFLOAD=y -CONFIG_BNXT_DCB=y -CONFIG_BNXT_HWMON=y -CONFIG_NET_VENDOR_BROCADE=y -CONFIG_BNA=m -CONFIG_NET_VENDOR_CADENCE=y -CONFIG_MACB=m -CONFIG_MACB_USE_HWSTAMP=y -CONFIG_MACB_PCI=m -CONFIG_NET_VENDOR_CAVIUM=y -CONFIG_THUNDER_NIC_PF=m -CONFIG_THUNDER_NIC_VF=m -CONFIG_THUNDER_NIC_BGX=m -CONFIG_THUNDER_NIC_RGX=m -CONFIG_CAVIUM_PTP=m -CONFIG_LIQUIDIO=m -CONFIG_LIQUIDIO_VF=m -CONFIG_NET_VENDOR_CHELSIO=y -CONFIG_CHELSIO_T1=m -CONFIG_CHELSIO_T1_1G=y -CONFIG_CHELSIO_T3=m -CONFIG_CHELSIO_T4=m -CONFIG_CHELSIO_T4_DCB=y -CONFIG_CHELSIO_T4_FCOE=y -CONFIG_CHELSIO_T4VF=m -CONFIG_CHELSIO_LIB=m -CONFIG_NET_VENDOR_CISCO=y -CONFIG_ENIC=m -CONFIG_NET_VENDOR_CORTINA=y -CONFIG_GEMINI_ETHERNET=m -CONFIG_CX_ECAT=m -CONFIG_DNET=m -CONFIG_NET_VENDOR_DEC=y -CONFIG_NET_TULIP=y -CONFIG_DE2104X=m -CONFIG_DE2104X_DSL=0 -CONFIG_TULIP=m -CONFIG_TULIP_MWI=y -CONFIG_TULIP_MMIO=y -CONFIG_TULIP_NAPI=y 
-CONFIG_TULIP_NAPI_HW_MITIGATION=y -CONFIG_DE4X5=m -CONFIG_WINBOND_840=m -CONFIG_DM9102=m -CONFIG_ULI526X=m -CONFIG_PCMCIA_XIRCOM=m -CONFIG_NET_VENDOR_DLINK=y -CONFIG_DL2K=m -CONFIG_SUNDANCE=m -# CONFIG_SUNDANCE_MMIO is not set -CONFIG_NET_VENDOR_EMULEX=y -CONFIG_BE2NET=m -CONFIG_BE2NET_HWMON=y -CONFIG_BE2NET_BE2=y -CONFIG_BE2NET_BE3=y -CONFIG_BE2NET_LANCER=y -CONFIG_BE2NET_SKYHAWK=y -CONFIG_NET_VENDOR_EZCHIP=y -CONFIG_EZCHIP_NPS_MANAGEMENT_ENET=m -CONFIG_NET_VENDOR_FUJITSU=y -CONFIG_PCMCIA_FMVJ18X=m -CONFIG_NET_VENDOR_GOOGLE=y -CONFIG_GVE=m -CONFIG_NET_VENDOR_HUAWEI=y -CONFIG_HINIC=m -CONFIG_NET_VENDOR_I825XX=y -CONFIG_NET_VENDOR_INTEL=y -CONFIG_E100=m -CONFIG_E1000=m -CONFIG_E1000E=m -CONFIG_E1000E_HWTS=y -CONFIG_IGB=m -CONFIG_IGB_HWMON=y -CONFIG_IGB_DCA=y -CONFIG_IGBVF=m -CONFIG_IXGB=m -CONFIG_IXGBE=m -CONFIG_IXGBE_HWMON=y -CONFIG_IXGBE_DCA=y -CONFIG_IXGBE_DCB=y -# CONFIG_IXGBE_IPSEC is not set -CONFIG_IXGBEVF=m -CONFIG_IXGBEVF_IPSEC=y -CONFIG_I40E=m -CONFIG_I40E_DCB=y -CONFIG_IAVF=m -CONFIG_I40EVF=m -CONFIG_ICE=m -CONFIG_FM10K=m -CONFIG_IGC=m -CONFIG_JME=m -CONFIG_NET_VENDOR_MARVELL=y -CONFIG_MVMDIO=m -CONFIG_SKGE=m -# CONFIG_SKGE_DEBUG is not set -CONFIG_SKGE_GENESIS=y -CONFIG_SKY2=m -# CONFIG_SKY2_DEBUG is not set -CONFIG_NET_VENDOR_MELLANOX=y -CONFIG_MLX4_EN=m -CONFIG_MLX4_EN_DCB=y -CONFIG_MLX4_CORE=m -CONFIG_MLX4_DEBUG=y -CONFIG_MLX4_CORE_GEN2=y -CONFIG_MLX5_CORE=m -CONFIG_MLX5_ACCEL=y -CONFIG_MLX5_FPGA=y -CONFIG_MLX5_CORE_EN=y -CONFIG_MLX5_EN_ARFS=y -CONFIG_MLX5_EN_RXNFC=y -CONFIG_MLX5_MPFS=y -CONFIG_MLX5_ESWITCH=y -CONFIG_MLX5_CLS_ACT=y -CONFIG_MLX5_TC_CT=y -CONFIG_MLX5_CORE_EN_DCB=y -CONFIG_MLX5_CORE_IPOIB=y -CONFIG_MLX5_FPGA_IPSEC=y -# CONFIG_MLX5_IPSEC is not set -CONFIG_MLX5_EN_IPSEC=y -CONFIG_MLX5_FPGA_TLS=y -CONFIG_MLX5_TLS=y -CONFIG_MLX5_EN_TLS=y -CONFIG_MLX5_SW_STEERING=y -CONFIG_MLXSW_CORE=m -CONFIG_MLXSW_CORE_HWMON=y -CONFIG_MLXSW_CORE_THERMAL=y -CONFIG_MLXSW_PCI=m -CONFIG_MLXSW_I2C=m -CONFIG_MLXSW_SWITCHIB=m -CONFIG_MLXSW_SWITCHX2=m 
-CONFIG_MLXSW_SPECTRUM=m -CONFIG_MLXSW_SPECTRUM_DCB=y -CONFIG_MLXSW_MINIMAL=m -CONFIG_MLXFW=m -CONFIG_NET_VENDOR_MICREL=y -CONFIG_KS8842=m -CONFIG_KS8851=m -CONFIG_KS8851_MLL=m -CONFIG_KSZ884X_PCI=m -CONFIG_NET_VENDOR_MICROCHIP=y -CONFIG_ENC28J60=m -# CONFIG_ENC28J60_WRITEVERIFY is not set -CONFIG_ENCX24J600=m -CONFIG_LAN743X=m -CONFIG_NET_VENDOR_MICROSEMI=y -CONFIG_MSCC_OCELOT_SWITCH_LIB=m -CONFIG_MSCC_OCELOT_SWITCH=m -CONFIG_NET_VENDOR_MYRI=y -CONFIG_MYRI10GE=m -CONFIG_MYRI10GE_DCA=y -CONFIG_FEALNX=m -CONFIG_NET_VENDOR_NATSEMI=y -CONFIG_NATSEMI=m -CONFIG_NS83820=m -CONFIG_NET_VENDOR_NETERION=y -CONFIG_S2IO=m -CONFIG_VXGE=m -# CONFIG_VXGE_DEBUG_TRACE_ALL is not set -CONFIG_NET_VENDOR_NETRONOME=y -CONFIG_NFP=m -CONFIG_NFP_APP_FLOWER=y -CONFIG_NFP_APP_ABM_NIC=y -# CONFIG_NFP_DEBUG is not set -CONFIG_NET_VENDOR_NI=y -CONFIG_NI_XGE_MANAGEMENT_ENET=m -CONFIG_NET_VENDOR_8390=y -CONFIG_PCMCIA_AXNET=m -CONFIG_NE2K_PCI=m -CONFIG_PCMCIA_PCNET=m -CONFIG_NET_VENDOR_NVIDIA=y -CONFIG_FORCEDETH=m -CONFIG_NET_VENDOR_OKI=y -CONFIG_ETHOC=m -CONFIG_NET_VENDOR_PACKET_ENGINES=y -CONFIG_HAMACHI=m -CONFIG_YELLOWFIN=m -CONFIG_NET_VENDOR_PENSANDO=y -CONFIG_IONIC=m -CONFIG_NET_VENDOR_QLOGIC=y -CONFIG_QLA3XXX=m -CONFIG_QLCNIC=m -CONFIG_QLCNIC_SRIOV=y -CONFIG_QLCNIC_DCB=y -CONFIG_QLCNIC_HWMON=y -CONFIG_NETXEN_NIC=m -CONFIG_QED=m -CONFIG_QED_LL2=y -CONFIG_QED_SRIOV=y -CONFIG_QEDE=m -CONFIG_QED_RDMA=y -CONFIG_QED_ISCSI=y -CONFIG_QED_FCOE=y -CONFIG_QED_OOO=y -CONFIG_NET_VENDOR_QUALCOMM=y -CONFIG_QCA7000=m -CONFIG_QCA7000_SPI=m -CONFIG_QCA7000_UART=m -CONFIG_QCOM_EMAC=m -CONFIG_RMNET=m -CONFIG_NET_VENDOR_RDC=y -CONFIG_R6040=m -CONFIG_NET_VENDOR_REALTEK=y -CONFIG_ATP=m -CONFIG_8139CP=m -CONFIG_8139TOO=m -# CONFIG_8139TOO_PIO is not set -CONFIG_8139TOO_TUNE_TWISTER=y -CONFIG_8139TOO_8129=y -# CONFIG_8139_OLD_RX_RESET is not set -CONFIG_R8169=m -CONFIG_NET_VENDOR_RENESAS=y -CONFIG_NET_VENDOR_ROCKER=y -CONFIG_ROCKER=m -CONFIG_NET_VENDOR_SAMSUNG=y -CONFIG_SXGBE_ETH=m -CONFIG_NET_VENDOR_SEEQ=y 
-CONFIG_NET_VENDOR_SOLARFLARE=y -CONFIG_SFC=m -CONFIG_SFC_MTD=y -CONFIG_SFC_MCDI_MON=y -CONFIG_SFC_SRIOV=y -CONFIG_SFC_MCDI_LOGGING=y -CONFIG_SFC_FALCON=m -CONFIG_SFC_FALCON_MTD=y -CONFIG_NET_VENDOR_SILAN=y -CONFIG_SC92031=m -CONFIG_NET_VENDOR_SIS=y -CONFIG_SIS900=m -CONFIG_SIS190=m -CONFIG_NET_VENDOR_SMSC=y -CONFIG_PCMCIA_SMC91C92=m -CONFIG_EPIC100=m -CONFIG_SMSC911X=m -CONFIG_SMSC9420=m -CONFIG_NET_VENDOR_SOCIONEXT=y -CONFIG_NET_VENDOR_STMICRO=y -CONFIG_STMMAC_ETH=m -# CONFIG_STMMAC_SELFTESTS is not set -CONFIG_STMMAC_PLATFORM=m -CONFIG_DWMAC_DWC_QOS_ETH=m -CONFIG_DWMAC_GENERIC=m -CONFIG_DWMAC_INTEL=m -CONFIG_STMMAC_PCI=m -CONFIG_NET_VENDOR_SUN=y -CONFIG_HAPPYMEAL=m -CONFIG_SUNGEM=m -CONFIG_CASSINI=m -CONFIG_NIU=m -CONFIG_NET_VENDOR_SYNOPSYS=y -CONFIG_DWC_XLGMAC=m -CONFIG_DWC_XLGMAC_PCI=m -CONFIG_NET_VENDOR_TEHUTI=y -CONFIG_TEHUTI=m -CONFIG_NET_VENDOR_TI=y -# CONFIG_TI_CPSW_PHY_SEL is not set -CONFIG_TLAN=m -CONFIG_NET_VENDOR_VIA=y -CONFIG_VIA_RHINE=m -CONFIG_VIA_RHINE_MMIO=y -CONFIG_VIA_VELOCITY=m -CONFIG_NET_VENDOR_WIZNET=y -CONFIG_WIZNET_W5100=m -CONFIG_WIZNET_W5300=m -# CONFIG_WIZNET_BUS_DIRECT is not set -# CONFIG_WIZNET_BUS_INDIRECT is not set -CONFIG_WIZNET_BUS_ANY=y -CONFIG_WIZNET_W5100_SPI=m -CONFIG_NET_VENDOR_XILINX=y -CONFIG_XILINX_AXI_EMAC=m -CONFIG_XILINX_LL_TEMAC=m -CONFIG_NET_VENDOR_XIRCOM=y -CONFIG_PCMCIA_XIRC2PS=m -CONFIG_FDDI=m -CONFIG_DEFXX=m -CONFIG_DEFXX_MMIO=y -CONFIG_SKFP=m -# CONFIG_HIPPI is not set -CONFIG_NET_SB1000=m -CONFIG_MDIO_DEVICE=m -CONFIG_MDIO_BUS=m -CONFIG_MDIO_DEVRES=m -CONFIG_MDIO_BCM_UNIMAC=m -CONFIG_MDIO_BITBANG=m -CONFIG_MDIO_BUS_MUX=m -CONFIG_MDIO_BUS_MUX_GPIO=m -CONFIG_MDIO_BUS_MUX_MMIOREG=m -CONFIG_MDIO_BUS_MUX_MULTIPLEXER=m -CONFIG_MDIO_CAVIUM=m -CONFIG_MDIO_GPIO=m -CONFIG_MDIO_HISI_FEMAC=m -CONFIG_MDIO_I2C=m -CONFIG_MDIO_IPQ4019=m -CONFIG_MDIO_IPQ8064=m -CONFIG_MDIO_MSCC_MIIM=m -CONFIG_MDIO_MVUSB=m -CONFIG_MDIO_OCTEON=m -CONFIG_MDIO_THUNDER=m -CONFIG_MDIO_XPCS=m -CONFIG_PHYLINK=m -CONFIG_PHYLIB=m -CONFIG_SWPHY=y 
-CONFIG_LED_TRIGGER_PHY=y - -# -# MII PHY device drivers -# -CONFIG_SFP=m -CONFIG_ADIN_PHY=m -CONFIG_AMD_PHY=m -CONFIG_AQUANTIA_PHY=m -CONFIG_AX88796B_PHY=m -CONFIG_BCM7XXX_PHY=m -CONFIG_BCM87XX_PHY=m -CONFIG_BCM_NET_PHYLIB=m -CONFIG_BROADCOM_PHY=m -CONFIG_BCM54140_PHY=m -CONFIG_BCM84881_PHY=m -CONFIG_CICADA_PHY=m -CONFIG_CORTINA_PHY=m -CONFIG_DAVICOM_PHY=m -CONFIG_DP83822_PHY=m -CONFIG_DP83TC811_PHY=m -CONFIG_DP83848_PHY=m -CONFIG_DP83867_PHY=m -CONFIG_DP83869_PHY=m -CONFIG_FIXED_PHY=m -CONFIG_ICPLUS_PHY=m -CONFIG_INTEL_XWAY_PHY=m -CONFIG_LSI_ET1011C_PHY=m -CONFIG_LXT_PHY=m -CONFIG_MARVELL_PHY=m -CONFIG_MARVELL_10G_PHY=m -CONFIG_MICREL_PHY=m -CONFIG_MICROCHIP_PHY=m -CONFIG_MICROCHIP_T1_PHY=m -CONFIG_MICROSEMI_PHY=m -CONFIG_NATIONAL_PHY=m -CONFIG_NXP_TJA11XX_PHY=m -CONFIG_AT803X_PHY=m -CONFIG_QSEMI_PHY=m -CONFIG_REALTEK_PHY=m -CONFIG_RENESAS_PHY=m -CONFIG_ROCKCHIP_PHY=m -CONFIG_SMSC_PHY=m -CONFIG_STE10XP=m -CONFIG_TERANETICS_PHY=m -CONFIG_VITESSE_PHY=m -CONFIG_XILINX_GMII2RGMII=m -CONFIG_MICREL_KS8995MA=m -CONFIG_PLIP=m -CONFIG_PPP=m -CONFIG_PPP_BSDCOMP=m -CONFIG_PPP_DEFLATE=m -CONFIG_PPP_FILTER=y -CONFIG_PPP_MPPE=m -CONFIG_PPP_MULTILINK=y -CONFIG_PPPOATM=m -CONFIG_PPPOE=m -CONFIG_PPTP=m -CONFIG_PPPOL2TP=m -CONFIG_PPP_ASYNC=m -CONFIG_PPP_SYNC_TTY=m -CONFIG_SLIP=m -CONFIG_SLHC=m -CONFIG_SLIP_COMPRESSED=y -CONFIG_SLIP_SMART=y -CONFIG_SLIP_MODE_SLIP6=y -CONFIG_USB_NET_DRIVERS=m -CONFIG_USB_CATC=m -CONFIG_USB_KAWETH=m -CONFIG_USB_PEGASUS=m -CONFIG_USB_RTL8150=m -CONFIG_USB_RTL8152=m -CONFIG_USB_LAN78XX=m -CONFIG_USB_USBNET=m -CONFIG_USB_NET_AX8817X=m -CONFIG_USB_NET_AX88179_178A=m -CONFIG_USB_NET_CDCETHER=m -CONFIG_USB_NET_CDC_EEM=m -CONFIG_USB_NET_CDC_NCM=m -CONFIG_USB_NET_HUAWEI_CDC_NCM=m -CONFIG_USB_NET_CDC_MBIM=m -CONFIG_USB_NET_DM9601=m -CONFIG_USB_NET_SR9700=m -CONFIG_USB_NET_SR9800=m -CONFIG_USB_NET_SMSC75XX=m -CONFIG_USB_NET_SMSC95XX=m -CONFIG_USB_NET_GL620A=m -CONFIG_USB_NET_NET1080=m -CONFIG_USB_NET_PLUSB=m -CONFIG_USB_NET_MCS7830=m 
-CONFIG_USB_NET_RNDIS_HOST=m -CONFIG_USB_NET_CDC_SUBSET_ENABLE=m -CONFIG_USB_NET_CDC_SUBSET=m -CONFIG_USB_ALI_M5632=y -CONFIG_USB_AN2720=y -CONFIG_USB_BELKIN=y -CONFIG_USB_ARMLINUX=y -CONFIG_USB_EPSON2888=y -CONFIG_USB_KC2190=y -CONFIG_USB_NET_ZAURUS=m -CONFIG_USB_NET_CX82310_ETH=m -CONFIG_USB_NET_KALMIA=m -CONFIG_USB_NET_QMI_WWAN=m -CONFIG_USB_HSO=m -CONFIG_USB_NET_INT51X1=m -CONFIG_USB_CDC_PHONET=m -CONFIG_USB_IPHETH=m -CONFIG_USB_SIERRA_NET=m -CONFIG_USB_VL600=m -CONFIG_USB_NET_CH9200=m -CONFIG_USB_NET_AQC111=m -CONFIG_WLAN=y -# CONFIG_WIRELESS_WDS is not set -CONFIG_WLAN_VENDOR_ADMTEK=y -CONFIG_ADM8211=m -CONFIG_ATH_COMMON=m -CONFIG_WLAN_VENDOR_ATH=y -# CONFIG_ATH_DEBUG is not set -CONFIG_ATH5K=m -CONFIG_ATH5K_DEBUG=y -CONFIG_ATH5K_TRACER=y -CONFIG_ATH5K_PCI=y -CONFIG_ATH9K_HW=m -CONFIG_ATH9K_COMMON=m -CONFIG_ATH9K_COMMON_DEBUG=y -CONFIG_ATH9K_BTCOEX_SUPPORT=y -CONFIG_ATH9K=m -CONFIG_ATH9K_PCI=y -CONFIG_ATH9K_AHB=y -CONFIG_ATH9K_DEBUGFS=y -CONFIG_ATH9K_STATION_STATISTICS=y -CONFIG_ATH9K_DYNACK=y -CONFIG_ATH9K_WOW=y -CONFIG_ATH9K_RFKILL=y -CONFIG_ATH9K_CHANNEL_CONTEXT=y -CONFIG_ATH9K_PCOEM=y -CONFIG_ATH9K_PCI_NO_EEPROM=m -CONFIG_ATH9K_HTC=m -CONFIG_ATH9K_HTC_DEBUGFS=y -CONFIG_ATH9K_HWRNG=y -CONFIG_ATH9K_COMMON_SPECTRAL=y -CONFIG_CARL9170=m -CONFIG_CARL9170_LEDS=y -CONFIG_CARL9170_DEBUGFS=y -CONFIG_CARL9170_WPC=y -# CONFIG_CARL9170_HWRNG is not set -CONFIG_ATH6KL=m -CONFIG_ATH6KL_SDIO=m -CONFIG_ATH6KL_USB=m -CONFIG_ATH6KL_DEBUG=y -CONFIG_ATH6KL_TRACING=y -CONFIG_AR5523=m -CONFIG_WIL6210=m -CONFIG_WIL6210_ISR_COR=y -CONFIG_WIL6210_TRACING=y -CONFIG_WIL6210_DEBUGFS=y -CONFIG_ATH10K=m -CONFIG_ATH10K_CE=y -CONFIG_ATH10K_PCI=m -CONFIG_ATH10K_AHB=y -CONFIG_ATH10K_SDIO=m -CONFIG_ATH10K_USB=m -CONFIG_ATH10K_DEBUG=y -CONFIG_ATH10K_DEBUGFS=y -CONFIG_ATH10K_SPECTRAL=y -CONFIG_ATH10K_TRACING=y -CONFIG_WCN36XX=m -CONFIG_WCN36XX_DEBUGFS=y -CONFIG_WLAN_VENDOR_ATMEL=y -CONFIG_ATMEL=m -CONFIG_PCI_ATMEL=m -CONFIG_PCMCIA_ATMEL=m -CONFIG_AT76C50X_USB=m -CONFIG_WLAN_VENDOR_BROADCOM=y 
-CONFIG_B43=m -CONFIG_B43_BCMA=y -CONFIG_B43_SSB=y -CONFIG_B43_BUSES_BCMA_AND_SSB=y -# CONFIG_B43_BUSES_BCMA is not set -# CONFIG_B43_BUSES_SSB is not set -CONFIG_B43_PCI_AUTOSELECT=y -CONFIG_B43_PCICORE_AUTOSELECT=y -CONFIG_B43_SDIO=y -CONFIG_B43_BCMA_PIO=y -CONFIG_B43_PIO=y -CONFIG_B43_PHY_G=y -CONFIG_B43_PHY_N=y -CONFIG_B43_PHY_LP=y -CONFIG_B43_PHY_HT=y -CONFIG_B43_LEDS=y -CONFIG_B43_HWRNG=y -# CONFIG_B43_DEBUG is not set -CONFIG_B43LEGACY=m -CONFIG_B43LEGACY_PCI_AUTOSELECT=y -CONFIG_B43LEGACY_PCICORE_AUTOSELECT=y -CONFIG_B43LEGACY_LEDS=y -CONFIG_B43LEGACY_HWRNG=y -CONFIG_B43LEGACY_DEBUG=y -CONFIG_B43LEGACY_DMA=y -CONFIG_B43LEGACY_PIO=y -CONFIG_B43LEGACY_DMA_AND_PIO_MODE=y -# CONFIG_B43LEGACY_DMA_MODE is not set -# CONFIG_B43LEGACY_PIO_MODE is not set -CONFIG_BRCMUTIL=m -CONFIG_BRCMSMAC=m -CONFIG_BRCMFMAC=m -CONFIG_BRCMFMAC_PROTO_BCDC=y -CONFIG_BRCMFMAC_PROTO_MSGBUF=y -CONFIG_BRCMFMAC_SDIO=y -CONFIG_BRCMFMAC_USB=y -CONFIG_BRCMFMAC_PCIE=y -CONFIG_BRCM_TRACING=y -CONFIG_BRCMDBG=y -CONFIG_WLAN_VENDOR_CISCO=y -CONFIG_AIRO=m -CONFIG_AIRO_CS=m -CONFIG_WLAN_VENDOR_INTEL=y -CONFIG_IPW2100=m -CONFIG_IPW2100_MONITOR=y -# CONFIG_IPW2100_DEBUG is not set -CONFIG_IPW2200=m -CONFIG_IPW2200_MONITOR=y -CONFIG_IPW2200_RADIOTAP=y -CONFIG_IPW2200_PROMISCUOUS=y -CONFIG_IPW2200_QOS=y -# CONFIG_IPW2200_DEBUG is not set -CONFIG_LIBIPW=m -# CONFIG_LIBIPW_DEBUG is not set -CONFIG_IWLEGACY=m -CONFIG_IWL4965=m -CONFIG_IWL3945=m - -# -# iwl3945 / iwl4965 Debugging Options -# -CONFIG_IWLEGACY_DEBUG=y -CONFIG_IWLEGACY_DEBUGFS=y -# end of iwl3945 / iwl4965 Debugging Options - -CONFIG_IWLWIFI=m -CONFIG_IWLWIFI_LEDS=y -CONFIG_IWLDVM=m -CONFIG_IWLMVM=m -CONFIG_IWLWIFI_OPMODE_MODULAR=y -# CONFIG_IWLWIFI_BCAST_FILTERING is not set - -# -# Debugging Options -# -CONFIG_IWLWIFI_DEBUG=y -CONFIG_IWLWIFI_DEBUGFS=y -CONFIG_IWLWIFI_DEVICE_TRACING=y -# end of Debugging Options - -CONFIG_WLAN_VENDOR_INTERSIL=y -CONFIG_HOSTAP=m -CONFIG_HOSTAP_FIRMWARE=y -CONFIG_HOSTAP_FIRMWARE_NVRAM=y -CONFIG_HOSTAP_PLX=m 
-CONFIG_HOSTAP_PCI=m -CONFIG_HOSTAP_CS=m -CONFIG_HERMES=m -CONFIG_HERMES_PRISM=y -CONFIG_HERMES_CACHE_FW_ON_INIT=y -CONFIG_PLX_HERMES=m -CONFIG_TMD_HERMES=m -CONFIG_NORTEL_HERMES=m -CONFIG_PCI_HERMES=m -CONFIG_PCMCIA_HERMES=m -CONFIG_PCMCIA_SPECTRUM=m -CONFIG_ORINOCO_USB=m -CONFIG_P54_COMMON=m -CONFIG_P54_USB=m -CONFIG_P54_PCI=m -CONFIG_P54_SPI=m -# CONFIG_P54_SPI_DEFAULT_EEPROM is not set -CONFIG_P54_LEDS=y -CONFIG_PRISM54=m -CONFIG_WLAN_VENDOR_MARVELL=y -CONFIG_LIBERTAS=m -CONFIG_LIBERTAS_USB=m -CONFIG_LIBERTAS_CS=m -CONFIG_LIBERTAS_SDIO=m -CONFIG_LIBERTAS_SPI=m -# CONFIG_LIBERTAS_DEBUG is not set -CONFIG_LIBERTAS_MESH=y -CONFIG_LIBERTAS_THINFIRM=m -# CONFIG_LIBERTAS_THINFIRM_DEBUG is not set -CONFIG_LIBERTAS_THINFIRM_USB=m -CONFIG_MWIFIEX=m -CONFIG_MWIFIEX_SDIO=m -CONFIG_MWIFIEX_PCIE=m -CONFIG_MWIFIEX_USB=m -CONFIG_MWL8K=m -CONFIG_WLAN_VENDOR_MEDIATEK=y -CONFIG_MT7601U=m -CONFIG_MT76_CORE=m -CONFIG_MT76_LEDS=y -CONFIG_MT76_USB=m -CONFIG_MT76x02_LIB=m -CONFIG_MT76x02_USB=m -CONFIG_MT76x0_COMMON=m -CONFIG_MT76x0U=m -CONFIG_MT76x0E=m -CONFIG_MT76x2_COMMON=m -CONFIG_MT76x2E=m -CONFIG_MT76x2U=m -CONFIG_MT7603E=m -CONFIG_MT7615_COMMON=m -CONFIG_MT7615E=m -CONFIG_MT7663_USB_SDIO_COMMON=m -CONFIG_MT7663U=m -# CONFIG_MT7663S is not set -CONFIG_MT7915E=m -CONFIG_WLAN_VENDOR_MICROCHIP=y -CONFIG_WILC1000=m -CONFIG_WILC1000_SDIO=m -CONFIG_WILC1000_SPI=m -# CONFIG_WILC1000_HW_OOB_INTR is not set -CONFIG_WLAN_VENDOR_RALINK=y -CONFIG_RT2X00=m -CONFIG_RT2400PCI=m -CONFIG_RT2500PCI=m -CONFIG_RT61PCI=m -CONFIG_RT2800PCI=m -CONFIG_RT2800PCI_RT33XX=y -CONFIG_RT2800PCI_RT35XX=y -CONFIG_RT2800PCI_RT53XX=y -CONFIG_RT2800PCI_RT3290=y -CONFIG_RT2500USB=m -CONFIG_RT73USB=m -CONFIG_RT2800USB=m -CONFIG_RT2800USB_RT33XX=y -CONFIG_RT2800USB_RT35XX=y -CONFIG_RT2800USB_RT3573=y -CONFIG_RT2800USB_RT53XX=y -CONFIG_RT2800USB_RT55XX=y -CONFIG_RT2800USB_UNKNOWN=y -CONFIG_RT2800_LIB=m -CONFIG_RT2800_LIB_MMIO=m -CONFIG_RT2X00_LIB_MMIO=m -CONFIG_RT2X00_LIB_PCI=m -CONFIG_RT2X00_LIB_USB=m 
-CONFIG_RT2X00_LIB=m -CONFIG_RT2X00_LIB_FIRMWARE=y -CONFIG_RT2X00_LIB_CRYPTO=y -CONFIG_RT2X00_LIB_LEDS=y -CONFIG_RT2X00_LIB_DEBUGFS=y -# CONFIG_RT2X00_DEBUG is not set -CONFIG_WLAN_VENDOR_REALTEK=y -CONFIG_RTL8180=m -CONFIG_RTL8187=m -CONFIG_RTL8187_LEDS=y -CONFIG_RTL_CARDS=m -CONFIG_RTL8192CE=m -CONFIG_RTL8192SE=m -CONFIG_RTL8192DE=m -CONFIG_RTL8723AE=m -CONFIG_RTL8723BE=m -CONFIG_RTL8188EE=m -CONFIG_RTL8192EE=m -CONFIG_RTL8821AE=m -CONFIG_RTL8192CU=m -CONFIG_RTLWIFI=m -CONFIG_RTLWIFI_PCI=m -CONFIG_RTLWIFI_USB=m -CONFIG_RTLWIFI_DEBUG=y -CONFIG_RTL8192C_COMMON=m -CONFIG_RTL8723_COMMON=m -CONFIG_RTLBTCOEXIST=m -CONFIG_RTL8XXXU=m -CONFIG_RTL8XXXU_UNTESTED=y -CONFIG_RTW88=m -CONFIG_RTW88_CORE=m -CONFIG_RTW88_PCI=m -CONFIG_RTW88_8822B=m -CONFIG_RTW88_8822C=m -CONFIG_RTW88_8723D=m -CONFIG_RTW88_8821C=m -CONFIG_RTW88_8822BE=m -CONFIG_RTW88_8822CE=m -CONFIG_RTW88_8723DE=m -CONFIG_RTW88_8821CE=m -CONFIG_RTW88_DEBUG=y -CONFIG_RTW88_DEBUGFS=y -CONFIG_WLAN_VENDOR_RSI=y -CONFIG_RSI_91X=m -CONFIG_RSI_DEBUGFS=y -CONFIG_RSI_SDIO=m -CONFIG_RSI_USB=m -CONFIG_RSI_COEX=y -CONFIG_WLAN_VENDOR_ST=y -CONFIG_CW1200=m -CONFIG_CW1200_WLAN_SDIO=m -CONFIG_CW1200_WLAN_SPI=m -CONFIG_WLAN_VENDOR_TI=y -CONFIG_WL1251=m -CONFIG_WL1251_SPI=m -CONFIG_WL1251_SDIO=m -CONFIG_WL12XX=m -CONFIG_WL18XX=m -CONFIG_WLCORE=m -CONFIG_WLCORE_SPI=m -CONFIG_WLCORE_SDIO=m -CONFIG_WILINK_PLATFORM_DATA=y -CONFIG_WLAN_VENDOR_ZYDAS=y -CONFIG_USB_ZD1201=m -CONFIG_ZD1211RW=m -# CONFIG_ZD1211RW_DEBUG is not set -CONFIG_WLAN_VENDOR_QUANTENNA=y -CONFIG_QTNFMAC=m -CONFIG_QTNFMAC_PCIE=m -CONFIG_PCMCIA_RAYCS=m -CONFIG_PCMCIA_WL3501=m -CONFIG_MAC80211_HWSIM=m -CONFIG_USB_NET_RNDIS_WLAN=m -CONFIG_VIRT_WIFI=m - -# -# WiMAX Wireless Broadband devices -# -CONFIG_WIMAX_I2400M=m -CONFIG_WIMAX_I2400M_USB=m -CONFIG_WIMAX_I2400M_DEBUG_LEVEL=8 -# end of WiMAX Wireless Broadband devices - -# CONFIG_WAN is not set -CONFIG_IEEE802154_DRIVERS=m -CONFIG_IEEE802154_FAKELB=m -CONFIG_IEEE802154_AT86RF230=m -# CONFIG_IEEE802154_AT86RF230_DEBUGFS 
is not set -CONFIG_IEEE802154_MRF24J40=m -CONFIG_IEEE802154_CC2520=m -CONFIG_IEEE802154_ATUSB=m -CONFIG_IEEE802154_ADF7242=m -CONFIG_IEEE802154_CA8210=m -# CONFIG_IEEE802154_CA8210_DEBUGFS is not set -CONFIG_IEEE802154_MCR20A=m -CONFIG_IEEE802154_HWSIM=m -CONFIG_XEN_NETDEV_FRONTEND=m -CONFIG_XEN_NETDEV_BACKEND=m -CONFIG_VMXNET3=m -CONFIG_FUJITSU_ES=m -CONFIG_USB4_NET=m -CONFIG_HYPERV_NET=m -CONFIG_NETDEVSIM=m -CONFIG_NET_FAILOVER=m -CONFIG_ISDN=y -CONFIG_ISDN_CAPI=y -CONFIG_CAPI_TRACE=y -CONFIG_ISDN_CAPI_MIDDLEWARE=y -CONFIG_MISDN=m -CONFIG_MISDN_DSP=m -CONFIG_MISDN_L1OIP=m - -# -# mISDN hardware drivers -# -CONFIG_MISDN_HFCPCI=m -CONFIG_MISDN_HFCMULTI=m -CONFIG_MISDN_HFCUSB=m -CONFIG_MISDN_AVMFRITZ=m -CONFIG_MISDN_SPEEDFAX=m -CONFIG_MISDN_INFINEON=m -CONFIG_MISDN_W6692=m -CONFIG_MISDN_NETJET=m -CONFIG_MISDN_HDLC=m -CONFIG_MISDN_IPAC=m -CONFIG_MISDN_ISAR=m -CONFIG_NVM=y -CONFIG_NVM_PBLK=m -# CONFIG_NVM_PBLK_DEBUG is not set - -# -# Input device support -# -CONFIG_INPUT=y -CONFIG_INPUT_LEDS=m -CONFIG_INPUT_FF_MEMLESS=m -CONFIG_INPUT_POLLDEV=m -CONFIG_INPUT_SPARSEKMAP=m -CONFIG_INPUT_MATRIXKMAP=m - -# -# Userland interfaces -# -CONFIG_INPUT_MOUSEDEV=m -CONFIG_INPUT_MOUSEDEV_PSAUX=y -CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024 -CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768 -CONFIG_INPUT_JOYDEV=m -CONFIG_INPUT_EVDEV=m -# CONFIG_INPUT_EVBUG is not set - -# -# Input Device Drivers -# -CONFIG_INPUT_KEYBOARD=y -CONFIG_KEYBOARD_ADC=m -CONFIG_KEYBOARD_ADP5520=m -CONFIG_KEYBOARD_ADP5588=m -CONFIG_KEYBOARD_ADP5589=m -CONFIG_KEYBOARD_APPLESPI=m -CONFIG_KEYBOARD_ATKBD=m -CONFIG_KEYBOARD_QT1050=m -CONFIG_KEYBOARD_QT1070=m -CONFIG_KEYBOARD_QT2160=m -CONFIG_KEYBOARD_DLINK_DIR685=m -CONFIG_KEYBOARD_LKKBD=m -CONFIG_KEYBOARD_GPIO=m -CONFIG_KEYBOARD_GPIO_POLLED=m -CONFIG_KEYBOARD_TCA6416=m -CONFIG_KEYBOARD_TCA8418=m -CONFIG_KEYBOARD_MATRIX=m -CONFIG_KEYBOARD_LM8323=m -CONFIG_KEYBOARD_LM8333=m -CONFIG_KEYBOARD_MAX7359=m -CONFIG_KEYBOARD_MCS=m -CONFIG_KEYBOARD_MPR121=m -CONFIG_KEYBOARD_NEWTON=m 
-CONFIG_KEYBOARD_OPENCORES=m -CONFIG_KEYBOARD_SAMSUNG=m -CONFIG_KEYBOARD_STOWAWAY=m -CONFIG_KEYBOARD_SUNKBD=m -CONFIG_KEYBOARD_STMPE=m -CONFIG_KEYBOARD_IQS62X=m -CONFIG_KEYBOARD_OMAP4=m -CONFIG_KEYBOARD_TC3589X=m -CONFIG_KEYBOARD_TM2_TOUCHKEY=m -CONFIG_KEYBOARD_TWL4030=m -CONFIG_KEYBOARD_XTKBD=m -CONFIG_KEYBOARD_CROS_EC=m -CONFIG_KEYBOARD_CAP11XX=m -CONFIG_KEYBOARD_BCM=m -CONFIG_KEYBOARD_MTK_PMIC=m -CONFIG_INPUT_MOUSE=y -CONFIG_MOUSE_PS2=m -CONFIG_MOUSE_PS2_ALPS=y -CONFIG_MOUSE_PS2_BYD=y -CONFIG_MOUSE_PS2_LOGIPS2PP=y -CONFIG_MOUSE_PS2_SYNAPTICS=y -CONFIG_MOUSE_PS2_SYNAPTICS_SMBUS=y -CONFIG_MOUSE_PS2_CYPRESS=y -CONFIG_MOUSE_PS2_LIFEBOOK=y -CONFIG_MOUSE_PS2_TRACKPOINT=y -CONFIG_MOUSE_PS2_ELANTECH=y -CONFIG_MOUSE_PS2_ELANTECH_SMBUS=y -CONFIG_MOUSE_PS2_SENTELIC=y -CONFIG_MOUSE_PS2_TOUCHKIT=y -CONFIG_MOUSE_PS2_FOCALTECH=y -CONFIG_MOUSE_PS2_VMMOUSE=y -CONFIG_MOUSE_PS2_SMBUS=y -CONFIG_MOUSE_SERIAL=m -CONFIG_MOUSE_APPLETOUCH=m -CONFIG_MOUSE_BCM5974=m -CONFIG_MOUSE_CYAPA=m -CONFIG_MOUSE_ELAN_I2C=m -CONFIG_MOUSE_ELAN_I2C_I2C=y -CONFIG_MOUSE_ELAN_I2C_SMBUS=y -CONFIG_MOUSE_VSXXXAA=m -CONFIG_MOUSE_GPIO=m -CONFIG_MOUSE_SYNAPTICS_I2C=m -CONFIG_MOUSE_SYNAPTICS_USB=m -CONFIG_INPUT_JOYSTICK=y -CONFIG_JOYSTICK_ANALOG=m -CONFIG_JOYSTICK_A3D=m -CONFIG_JOYSTICK_ADI=m -CONFIG_JOYSTICK_COBRA=m -CONFIG_JOYSTICK_GF2K=m -CONFIG_JOYSTICK_GRIP=m -CONFIG_JOYSTICK_GRIP_MP=m -CONFIG_JOYSTICK_GUILLEMOT=m -CONFIG_JOYSTICK_INTERACT=m -CONFIG_JOYSTICK_SIDEWINDER=m -CONFIG_JOYSTICK_TMDC=m -CONFIG_JOYSTICK_IFORCE=m -CONFIG_JOYSTICK_IFORCE_USB=m -CONFIG_JOYSTICK_IFORCE_232=m -CONFIG_JOYSTICK_WARRIOR=m -CONFIG_JOYSTICK_MAGELLAN=m -CONFIG_JOYSTICK_SPACEORB=m -CONFIG_JOYSTICK_SPACEBALL=m -CONFIG_JOYSTICK_STINGER=m -CONFIG_JOYSTICK_TWIDJOY=m -CONFIG_JOYSTICK_ZHENHUA=m -CONFIG_JOYSTICK_DB9=m -CONFIG_JOYSTICK_GAMECON=m -CONFIG_JOYSTICK_TURBOGRAFX=m -CONFIG_JOYSTICK_AS5011=m -CONFIG_JOYSTICK_JOYDUMP=m -CONFIG_JOYSTICK_XPAD=m -CONFIG_JOYSTICK_XPAD_FF=y -CONFIG_JOYSTICK_XPAD_LEDS=y -CONFIG_JOYSTICK_WALKERA0701=m 
-CONFIG_JOYSTICK_PSXPAD_SPI=m -CONFIG_JOYSTICK_PSXPAD_SPI_FF=y -CONFIG_JOYSTICK_PXRC=m -CONFIG_JOYSTICK_FSIA6B=m -CONFIG_INPUT_TABLET=y -CONFIG_TABLET_USB_ACECAD=m -CONFIG_TABLET_USB_AIPTEK=m -CONFIG_TABLET_USB_GTCO=m -CONFIG_TABLET_USB_HANWANG=m -CONFIG_TABLET_USB_KBTAB=m -CONFIG_TABLET_USB_PEGASUS=m -CONFIG_TABLET_SERIAL_WACOM4=m -CONFIG_INPUT_TOUCHSCREEN=y -CONFIG_TOUCHSCREEN_PROPERTIES=y -CONFIG_TOUCHSCREEN_88PM860X=m -CONFIG_TOUCHSCREEN_ADS7846=m -CONFIG_TOUCHSCREEN_AD7877=m -CONFIG_TOUCHSCREEN_AD7879=m -CONFIG_TOUCHSCREEN_AD7879_I2C=m -CONFIG_TOUCHSCREEN_AD7879_SPI=m -CONFIG_TOUCHSCREEN_ADC=m -CONFIG_TOUCHSCREEN_AR1021_I2C=m -CONFIG_TOUCHSCREEN_ATMEL_MXT=m -CONFIG_TOUCHSCREEN_ATMEL_MXT_T37=y -CONFIG_TOUCHSCREEN_AUO_PIXCIR=m -CONFIG_TOUCHSCREEN_BU21013=m -CONFIG_TOUCHSCREEN_BU21029=m -CONFIG_TOUCHSCREEN_CHIPONE_ICN8318=m -CONFIG_TOUCHSCREEN_CHIPONE_ICN8505=m -CONFIG_TOUCHSCREEN_CY8CTMA140=m -CONFIG_TOUCHSCREEN_CY8CTMG110=m -CONFIG_TOUCHSCREEN_CYTTSP_CORE=m -CONFIG_TOUCHSCREEN_CYTTSP_I2C=m -CONFIG_TOUCHSCREEN_CYTTSP_SPI=m -CONFIG_TOUCHSCREEN_CYTTSP4_CORE=m -CONFIG_TOUCHSCREEN_CYTTSP4_I2C=m -CONFIG_TOUCHSCREEN_CYTTSP4_SPI=m -CONFIG_TOUCHSCREEN_DA9034=m -CONFIG_TOUCHSCREEN_DA9052=m -CONFIG_TOUCHSCREEN_DYNAPRO=m -CONFIG_TOUCHSCREEN_HAMPSHIRE=m -CONFIG_TOUCHSCREEN_EETI=m -CONFIG_TOUCHSCREEN_EGALAX=m -CONFIG_TOUCHSCREEN_EGALAX_SERIAL=m -CONFIG_TOUCHSCREEN_EXC3000=m -CONFIG_TOUCHSCREEN_FUJITSU=m -CONFIG_TOUCHSCREEN_GOODIX=m -CONFIG_TOUCHSCREEN_HIDEEP=m -CONFIG_TOUCHSCREEN_ILI210X=m -CONFIG_TOUCHSCREEN_S6SY761=m -CONFIG_TOUCHSCREEN_GUNZE=m -CONFIG_TOUCHSCREEN_EKTF2127=m -CONFIG_TOUCHSCREEN_ELAN=m -CONFIG_TOUCHSCREEN_ELO=m -CONFIG_TOUCHSCREEN_WACOM_W8001=m -CONFIG_TOUCHSCREEN_WACOM_I2C=m -CONFIG_TOUCHSCREEN_MAX11801=m -CONFIG_TOUCHSCREEN_MCS5000=m -CONFIG_TOUCHSCREEN_MMS114=m -CONFIG_TOUCHSCREEN_MELFAS_MIP4=m -CONFIG_TOUCHSCREEN_MTOUCH=m -CONFIG_TOUCHSCREEN_IMX6UL_TSC=m -CONFIG_TOUCHSCREEN_INEXIO=m -CONFIG_TOUCHSCREEN_MK712=m -CONFIG_TOUCHSCREEN_PENMOUNT=m 
-CONFIG_TOUCHSCREEN_EDT_FT5X06=m -CONFIG_TOUCHSCREEN_TOUCHRIGHT=m -CONFIG_TOUCHSCREEN_TOUCHWIN=m -CONFIG_TOUCHSCREEN_TI_AM335X_TSC=m -CONFIG_TOUCHSCREEN_UCB1400=m -CONFIG_TOUCHSCREEN_PIXCIR=m -CONFIG_TOUCHSCREEN_WDT87XX_I2C=m -CONFIG_TOUCHSCREEN_WM831X=m -CONFIG_TOUCHSCREEN_WM97XX=m -CONFIG_TOUCHSCREEN_WM9705=y -CONFIG_TOUCHSCREEN_WM9712=y -CONFIG_TOUCHSCREEN_WM9713=y -CONFIG_TOUCHSCREEN_USB_COMPOSITE=m -CONFIG_TOUCHSCREEN_MC13783=m -CONFIG_TOUCHSCREEN_USB_EGALAX=y -CONFIG_TOUCHSCREEN_USB_PANJIT=y -CONFIG_TOUCHSCREEN_USB_3M=y -CONFIG_TOUCHSCREEN_USB_ITM=y -CONFIG_TOUCHSCREEN_USB_ETURBO=y -CONFIG_TOUCHSCREEN_USB_GUNZE=y -CONFIG_TOUCHSCREEN_USB_DMC_TSC10=y -CONFIG_TOUCHSCREEN_USB_IRTOUCH=y -CONFIG_TOUCHSCREEN_USB_IDEALTEK=y -CONFIG_TOUCHSCREEN_USB_GENERAL_TOUCH=y -CONFIG_TOUCHSCREEN_USB_GOTOP=y -CONFIG_TOUCHSCREEN_USB_JASTEC=y -CONFIG_TOUCHSCREEN_USB_ELO=y -CONFIG_TOUCHSCREEN_USB_E2I=y -CONFIG_TOUCHSCREEN_USB_ZYTRONIC=y -CONFIG_TOUCHSCREEN_USB_ETT_TC45USB=y -CONFIG_TOUCHSCREEN_USB_NEXIO=y -CONFIG_TOUCHSCREEN_USB_EASYTOUCH=y -CONFIG_TOUCHSCREEN_TOUCHIT213=m -CONFIG_TOUCHSCREEN_TSC_SERIO=m -CONFIG_TOUCHSCREEN_TSC200X_CORE=m -CONFIG_TOUCHSCREEN_TSC2004=m -CONFIG_TOUCHSCREEN_TSC2005=m -CONFIG_TOUCHSCREEN_TSC2007=m -CONFIG_TOUCHSCREEN_TSC2007_IIO=y -CONFIG_TOUCHSCREEN_PCAP=m -CONFIG_TOUCHSCREEN_RM_TS=m -CONFIG_TOUCHSCREEN_SILEAD=m -CONFIG_TOUCHSCREEN_SIS_I2C=m -CONFIG_TOUCHSCREEN_ST1232=m -CONFIG_TOUCHSCREEN_STMFTS=m -CONFIG_TOUCHSCREEN_STMPE=m -CONFIG_TOUCHSCREEN_SUR40=m -CONFIG_TOUCHSCREEN_SURFACE3_SPI=m -CONFIG_TOUCHSCREEN_SX8654=m -CONFIG_TOUCHSCREEN_TPS6507X=m -CONFIG_TOUCHSCREEN_ZET6223=m -CONFIG_TOUCHSCREEN_ZFORCE=m -CONFIG_TOUCHSCREEN_COLIBRI_VF50=m -CONFIG_TOUCHSCREEN_ROHM_BU21023=m -CONFIG_TOUCHSCREEN_IQS5XX=m -CONFIG_INPUT_MISC=y -CONFIG_INPUT_88PM860X_ONKEY=m -CONFIG_INPUT_88PM80X_ONKEY=m -CONFIG_INPUT_AD714X=m -CONFIG_INPUT_AD714X_I2C=m -CONFIG_INPUT_AD714X_SPI=m -CONFIG_INPUT_ARIZONA_HAPTICS=m -CONFIG_INPUT_ATMEL_CAPTOUCH=m -CONFIG_INPUT_BMA150=m 
-CONFIG_INPUT_E3X0_BUTTON=m -CONFIG_INPUT_PCSPKR=m -CONFIG_INPUT_MAX77650_ONKEY=m -CONFIG_INPUT_MAX77693_HAPTIC=m -CONFIG_INPUT_MAX8925_ONKEY=m -CONFIG_INPUT_MAX8997_HAPTIC=m -CONFIG_INPUT_MC13783_PWRBUTTON=m -CONFIG_INPUT_MMA8450=m -CONFIG_INPUT_APANEL=m -CONFIG_INPUT_GPIO_BEEPER=m -CONFIG_INPUT_GPIO_DECODER=m -CONFIG_INPUT_GPIO_VIBRA=m -CONFIG_INPUT_CPCAP_PWRBUTTON=m -CONFIG_INPUT_ATLAS_BTNS=m -CONFIG_INPUT_ATI_REMOTE2=m -CONFIG_INPUT_KEYSPAN_REMOTE=m -CONFIG_INPUT_KXTJ9=m -CONFIG_INPUT_POWERMATE=m -CONFIG_INPUT_YEALINK=m -CONFIG_INPUT_CM109=m -CONFIG_INPUT_REGULATOR_HAPTIC=m -CONFIG_INPUT_RETU_PWRBUTTON=m -CONFIG_INPUT_TPS65218_PWRBUTTON=m -CONFIG_INPUT_AXP20X_PEK=m -CONFIG_INPUT_TWL4030_PWRBUTTON=m -CONFIG_INPUT_TWL4030_VIBRA=m -CONFIG_INPUT_TWL6040_VIBRA=m -CONFIG_INPUT_UINPUT=m -CONFIG_INPUT_PALMAS_PWRBUTTON=m -CONFIG_INPUT_PCF50633_PMU=m -CONFIG_INPUT_PCF8574=m -CONFIG_INPUT_PWM_BEEPER=m -CONFIG_INPUT_PWM_VIBRA=m -CONFIG_INPUT_RK805_PWRKEY=m -CONFIG_INPUT_GPIO_ROTARY_ENCODER=m -CONFIG_INPUT_DA9052_ONKEY=m -CONFIG_INPUT_DA9055_ONKEY=m -CONFIG_INPUT_DA9063_ONKEY=m -CONFIG_INPUT_WM831X_ON=m -CONFIG_INPUT_PCAP=m -CONFIG_INPUT_ADXL34X=m -CONFIG_INPUT_ADXL34X_I2C=m -CONFIG_INPUT_ADXL34X_SPI=m -CONFIG_INPUT_IMS_PCU=m -CONFIG_INPUT_IQS269A=m -CONFIG_INPUT_CMA3000=m -CONFIG_INPUT_CMA3000_I2C=m -CONFIG_INPUT_XEN_KBDDEV_FRONTEND=m -CONFIG_INPUT_IDEAPAD_SLIDEBAR=m -CONFIG_INPUT_SOC_BUTTON_ARRAY=m -CONFIG_INPUT_DRV260X_HAPTICS=m -CONFIG_INPUT_DRV2665_HAPTICS=m -CONFIG_INPUT_DRV2667_HAPTICS=m -CONFIG_INPUT_RAVE_SP_PWRBUTTON=m -CONFIG_INPUT_STPMIC1_ONKEY=m -CONFIG_RMI4_CORE=m -CONFIG_RMI4_I2C=m -CONFIG_RMI4_SPI=m -CONFIG_RMI4_SMB=m -CONFIG_RMI4_F03=y -CONFIG_RMI4_F03_SERIO=m -CONFIG_RMI4_2D_SENSOR=y -CONFIG_RMI4_F11=y -CONFIG_RMI4_F12=y -CONFIG_RMI4_F30=y -CONFIG_RMI4_F34=y -# CONFIG_RMI4_F54 is not set -CONFIG_RMI4_F55=y - -# -# Hardware I/O ports -# -CONFIG_SERIO=m -CONFIG_ARCH_MIGHT_HAVE_PC_SERIO=y -CONFIG_SERIO_I8042=m -CONFIG_SERIO_SERPORT=m -CONFIG_SERIO_CT82C710=m 
-CONFIG_SERIO_PARKBD=m -CONFIG_SERIO_PCIPS2=m -CONFIG_SERIO_LIBPS2=m -CONFIG_SERIO_RAW=m -CONFIG_SERIO_ALTERA_PS2=m -CONFIG_SERIO_PS2MULT=m -CONFIG_SERIO_ARC_PS2=m -# CONFIG_SERIO_APBPS2 is not set -CONFIG_HYPERV_KEYBOARD=m -CONFIG_SERIO_GPIO_PS2=m -CONFIG_USERIO=m -CONFIG_GAMEPORT=m -CONFIG_GAMEPORT_NS558=m -CONFIG_GAMEPORT_L4=m -CONFIG_GAMEPORT_EMU10K1=m -CONFIG_GAMEPORT_FM801=m -# end of Hardware I/O ports -# end of Input device support - -# -# Character devices -# -CONFIG_TTY=y -CONFIG_VT=y -CONFIG_CONSOLE_TRANSLATIONS=y -CONFIG_VT_CONSOLE=y -CONFIG_VT_CONSOLE_SLEEP=y -CONFIG_HW_CONSOLE=y -CONFIG_VT_HW_CONSOLE_BINDING=y -CONFIG_UNIX98_PTYS=y -# CONFIG_LEGACY_PTYS is not set -CONFIG_LDISC_AUTOLOAD=y - -# -# Serial drivers -# -CONFIG_SERIAL_EARLYCON=y -CONFIG_SERIAL_8250=y -# CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set -CONFIG_SERIAL_8250_PNP=y -# CONFIG_SERIAL_8250_16550A_VARIANTS is not set -CONFIG_SERIAL_8250_FINTEK=y -CONFIG_SERIAL_8250_CONSOLE=y -CONFIG_SERIAL_8250_DMA=y -CONFIG_SERIAL_8250_PCI=y -CONFIG_SERIAL_8250_EXAR=m -CONFIG_SERIAL_8250_CS=m -CONFIG_SERIAL_8250_MEN_MCB=m -CONFIG_SERIAL_8250_NR_UARTS=32 -CONFIG_SERIAL_8250_RUNTIME_UARTS=4 -CONFIG_SERIAL_8250_EXTENDED=y -CONFIG_SERIAL_8250_MANY_PORTS=y -CONFIG_SERIAL_8250_ASPEED_VUART=m -CONFIG_SERIAL_8250_SHARE_IRQ=y -# CONFIG_SERIAL_8250_DETECT_IRQ is not set -CONFIG_SERIAL_8250_RSA=y -CONFIG_SERIAL_8250_DWLIB=y -CONFIG_SERIAL_8250_DW=m -CONFIG_SERIAL_8250_RT288X=y -CONFIG_SERIAL_8250_LPSS=y -CONFIG_SERIAL_8250_MID=y -CONFIG_SERIAL_OF_PLATFORM=m - -# -# Non-8250 serial port support -# -CONFIG_SERIAL_MAX3100=m -CONFIG_SERIAL_MAX310X=m -CONFIG_SERIAL_UARTLITE=m -CONFIG_SERIAL_UARTLITE_NR_UARTS=1 -CONFIG_SERIAL_CORE=y -CONFIG_SERIAL_CORE_CONSOLE=y -CONFIG_SERIAL_JSM=m -CONFIG_SERIAL_SIFIVE=m -CONFIG_SERIAL_LANTIQ=m -CONFIG_SERIAL_SCCNXP=m -CONFIG_SERIAL_SC16IS7XX_CORE=m -CONFIG_SERIAL_SC16IS7XX=m -CONFIG_SERIAL_SC16IS7XX_I2C=y -CONFIG_SERIAL_SC16IS7XX_SPI=y -CONFIG_SERIAL_ALTERA_JTAGUART=m 
-CONFIG_SERIAL_ALTERA_UART=m -CONFIG_SERIAL_ALTERA_UART_MAXPORTS=4 -CONFIG_SERIAL_ALTERA_UART_BAUDRATE=115200 -CONFIG_SERIAL_IFX6X60=m -CONFIG_SERIAL_XILINX_PS_UART=m -CONFIG_SERIAL_ARC=m -CONFIG_SERIAL_ARC_NR_PORTS=1 -CONFIG_SERIAL_RP2=m -CONFIG_SERIAL_RP2_NR_UARTS=32 -CONFIG_SERIAL_FSL_LPUART=m -CONFIG_SERIAL_FSL_LINFLEXUART=m -CONFIG_SERIAL_CONEXANT_DIGICOLOR=m -CONFIG_SERIAL_MEN_Z135=m -CONFIG_SERIAL_SPRD=m -# end of Serial drivers - -CONFIG_SERIAL_MCTRL_GPIO=y -CONFIG_SERIAL_NONSTANDARD=y -CONFIG_ROCKETPORT=m -CONFIG_CYCLADES=m -CONFIG_CYZ_INTR=y -CONFIG_MOXA_INTELLIO=m -CONFIG_MOXA_SMARTIO=m -CONFIG_SYNCLINK=m -CONFIG_SYNCLINKMP=m -CONFIG_SYNCLINK_GT=m -CONFIG_ISI=m -CONFIG_N_HDLC=m -CONFIG_N_GSM=m -CONFIG_NOZOMI=m -CONFIG_NULL_TTY=m -CONFIG_TRACE_ROUTER=m -CONFIG_TRACE_SINK=m -CONFIG_HVC_DRIVER=y -CONFIG_HVC_IRQ=y -CONFIG_HVC_XEN=y -CONFIG_HVC_XEN_FRONTEND=y -CONFIG_SERIAL_DEV_BUS=y -CONFIG_SERIAL_DEV_CTRL_TTYPORT=y -# CONFIG_TTY_PRINTK is not set -CONFIG_PRINTER=m -# CONFIG_LP_CONSOLE is not set -CONFIG_PPDEV=m -CONFIG_VIRTIO_CONSOLE=m -CONFIG_IPMI_HANDLER=m -CONFIG_IPMI_DMI_DECODE=y -CONFIG_IPMI_PLAT_DATA=y -# CONFIG_IPMI_PANIC_EVENT is not set -CONFIG_IPMI_DEVICE_INTERFACE=m -CONFIG_IPMI_SI=m -CONFIG_IPMI_SSIF=m -CONFIG_IPMI_WATCHDOG=m -CONFIG_IPMI_POWEROFF=m -CONFIG_IPMB_DEVICE_INTERFACE=m -CONFIG_HW_RANDOM=m -CONFIG_HW_RANDOM_TIMERIOMEM=m -CONFIG_HW_RANDOM_INTEL=m -CONFIG_HW_RANDOM_AMD=m -# CONFIG_HW_RANDOM_BA431 is not set -CONFIG_HW_RANDOM_VIA=m -CONFIG_HW_RANDOM_VIRTIO=m -CONFIG_HW_RANDOM_CCTRNG=m -CONFIG_APPLICOM=m - -# -# PCMCIA character devices -# -CONFIG_SYNCLINK_CS=m -CONFIG_CARDMAN_4000=m -CONFIG_CARDMAN_4040=m -CONFIG_SCR24X=m -CONFIG_IPWIRELESS=m -# end of PCMCIA character devices - -CONFIG_MWAVE=m -CONFIG_DEVMEM=y -# CONFIG_DEVKMEM is not set -CONFIG_NVRAM=m -CONFIG_RAW_DRIVER=m -CONFIG_MAX_RAW_DEVS=256 -CONFIG_DEVPORT=y -CONFIG_HPET=y -CONFIG_HPET_MMAP=y -CONFIG_HPET_MMAP_DEFAULT=y -CONFIG_HANGCHECK_TIMER=m -CONFIG_TCG_TPM=m 
-CONFIG_HW_RANDOM_TPM=y -CONFIG_TCG_TIS_CORE=m -CONFIG_TCG_TIS=m -CONFIG_TCG_TIS_SPI=m -CONFIG_TCG_TIS_SPI_CR50=y -CONFIG_TCG_TIS_I2C_ATMEL=m -CONFIG_TCG_TIS_I2C_INFINEON=m -CONFIG_TCG_TIS_I2C_NUVOTON=m -CONFIG_TCG_NSC=m -CONFIG_TCG_ATMEL=m -CONFIG_TCG_INFINEON=m -CONFIG_TCG_XEN=m -CONFIG_TCG_CRB=m -CONFIG_TCG_VTPM_PROXY=m -CONFIG_TCG_TIS_ST33ZP24=m -CONFIG_TCG_TIS_ST33ZP24_I2C=m -CONFIG_TCG_TIS_ST33ZP24_SPI=m -CONFIG_TELCLOCK=m -CONFIG_XILLYBUS=m -CONFIG_XILLYBUS_PCIE=m -CONFIG_XILLYBUS_OF=m -# end of Character devices - -# CONFIG_RANDOM_TRUST_CPU is not set -# CONFIG_RANDOM_TRUST_BOOTLOADER is not set - -# -# I2C support -# -CONFIG_I2C=y -CONFIG_ACPI_I2C_OPREGION=y -CONFIG_I2C_BOARDINFO=y -CONFIG_I2C_COMPAT=y -CONFIG_I2C_CHARDEV=m -CONFIG_I2C_MUX=m - -# -# Multiplexer I2C Chip support -# -CONFIG_I2C_ARB_GPIO_CHALLENGE=m -CONFIG_I2C_MUX_GPIO=m -CONFIG_I2C_MUX_GPMUX=m -CONFIG_I2C_MUX_LTC4306=m -CONFIG_I2C_MUX_PCA9541=m -CONFIG_I2C_MUX_PCA954x=m -CONFIG_I2C_MUX_PINCTRL=m -CONFIG_I2C_MUX_REG=m -CONFIG_I2C_DEMUX_PINCTRL=m -CONFIG_I2C_MUX_MLXCPLD=m -# end of Multiplexer I2C Chip support - -CONFIG_I2C_HELPER_AUTO=y -CONFIG_I2C_SMBUS=m -CONFIG_I2C_ALGOBIT=m -CONFIG_I2C_ALGOPCA=m - -# -# I2C Hardware Bus support -# - -# -# PC SMBus host controller drivers -# -CONFIG_I2C_ALI1535=m -CONFIG_I2C_ALI1563=m -CONFIG_I2C_ALI15X3=m -CONFIG_I2C_AMD756=m -CONFIG_I2C_AMD756_S4882=m -CONFIG_I2C_AMD8111=m -CONFIG_I2C_AMD_MP2=m -CONFIG_I2C_I801=m -CONFIG_I2C_ISCH=m -CONFIG_I2C_ISMT=m -CONFIG_I2C_PIIX4=m -CONFIG_I2C_CHT_WC=m -CONFIG_I2C_NFORCE2=m -CONFIG_I2C_NFORCE2_S4985=m -CONFIG_I2C_NVIDIA_GPU=m -CONFIG_I2C_SIS5595=m -CONFIG_I2C_SIS630=m -CONFIG_I2C_SIS96X=m -CONFIG_I2C_VIA=m -CONFIG_I2C_VIAPRO=m - -# -# ACPI drivers -# -CONFIG_I2C_SCMI=m - -# -# I2C system bus drivers (mostly embedded / system-on-chip) -# -CONFIG_I2C_CBUS_GPIO=m -CONFIG_I2C_DESIGNWARE_CORE=y -CONFIG_I2C_DESIGNWARE_SLAVE=y -CONFIG_I2C_DESIGNWARE_PLATFORM=y -CONFIG_I2C_DESIGNWARE_BAYTRAIL=y -CONFIG_I2C_DESIGNWARE_PCI=m 
-CONFIG_I2C_EMEV2=m -CONFIG_I2C_GPIO=m -# CONFIG_I2C_GPIO_FAULT_INJECTOR is not set -CONFIG_I2C_KEMPLD=m -CONFIG_I2C_OCORES=m -CONFIG_I2C_PCA_PLATFORM=m -CONFIG_I2C_RK3X=m -CONFIG_I2C_SIMTEC=m -CONFIG_I2C_XILINX=m - -# -# External I2C/SMBus adapter drivers -# -CONFIG_I2C_DIOLAN_U2C=m -CONFIG_I2C_DLN2=m -CONFIG_I2C_PARPORT=m -CONFIG_I2C_ROBOTFUZZ_OSIF=m -CONFIG_I2C_TAOS_EVM=m -CONFIG_I2C_TINY_USB=m -CONFIG_I2C_VIPERBOARD=m - -# -# Other I2C/SMBus bus drivers -# -CONFIG_I2C_MLXCPLD=m -CONFIG_I2C_CROS_EC_TUNNEL=m -CONFIG_I2C_FSI=m -# end of I2C Hardware Bus support - -CONFIG_I2C_STUB=m -CONFIG_I2C_SLAVE=y -CONFIG_I2C_SLAVE_EEPROM=m -# CONFIG_I2C_DEBUG_CORE is not set -# CONFIG_I2C_DEBUG_ALGO is not set -# CONFIG_I2C_DEBUG_BUS is not set -# end of I2C support - -CONFIG_I3C=m -CONFIG_CDNS_I3C_MASTER=m -CONFIG_DW_I3C_MASTER=m -CONFIG_SPI=y -# CONFIG_SPI_DEBUG is not set -CONFIG_SPI_MASTER=y -CONFIG_SPI_MEM=y - -# -# SPI Master Controller Drivers -# -CONFIG_SPI_ALTERA=m -CONFIG_SPI_AXI_SPI_ENGINE=m -CONFIG_SPI_BITBANG=m -CONFIG_SPI_BUTTERFLY=m -CONFIG_SPI_CADENCE=m -CONFIG_SPI_DESIGNWARE=m -CONFIG_SPI_DW_DMA=y -CONFIG_SPI_DW_PCI=m -CONFIG_SPI_DW_MMIO=m -CONFIG_SPI_DLN2=m -CONFIG_SPI_FSI=m -CONFIG_SPI_NXP_FLEXSPI=m -CONFIG_SPI_GPIO=m -CONFIG_SPI_LM70_LLP=m -CONFIG_SPI_FSL_LIB=m -CONFIG_SPI_FSL_SPI=m -# CONFIG_SPI_LANTIQ_SSC is not set -CONFIG_SPI_OC_TINY=m -CONFIG_SPI_PXA2XX=m -CONFIG_SPI_PXA2XX_PCI=m -CONFIG_SPI_ROCKCHIP=m -CONFIG_SPI_SC18IS602=m -CONFIG_SPI_SIFIVE=m -CONFIG_SPI_MXIC=m -CONFIG_SPI_XCOMM=m -CONFIG_SPI_XILINX=m -CONFIG_SPI_ZYNQMP_GQSPI=m -CONFIG_SPI_AMD=m - -# -# SPI Multiplexer support -# -CONFIG_SPI_MUX=m - -# -# SPI Protocol Masters -# -CONFIG_SPI_SPIDEV=m -CONFIG_SPI_LOOPBACK_TEST=m -CONFIG_SPI_TLE62X0=m -CONFIG_SPI_SLAVE=y -CONFIG_SPI_SLAVE_TIME=m -CONFIG_SPI_SLAVE_SYSTEM_CONTROL=m -CONFIG_SPI_DYNAMIC=y -CONFIG_SPMI=m -CONFIG_HSI=m -CONFIG_HSI_BOARDINFO=y - -# -# HSI controllers -# - -# -# HSI clients -# -CONFIG_HSI_CHAR=m -CONFIG_PPS=y -# 
CONFIG_PPS_DEBUG is not set - -# -# PPS clients support -# -CONFIG_PPS_CLIENT_KTIMER=m -CONFIG_PPS_CLIENT_LDISC=m -CONFIG_PPS_CLIENT_PARPORT=m -CONFIG_PPS_CLIENT_GPIO=m - -# -# PPS generators support -# - -# -# PTP clock support -# -CONFIG_PTP_1588_CLOCK=y -CONFIG_DP83640_PHY=m -CONFIG_PTP_1588_CLOCK_INES=m -CONFIG_PTP_1588_CLOCK_KVM=m -CONFIG_PTP_1588_CLOCK_IDT82P33=m -CONFIG_PTP_1588_CLOCK_IDTCM=m -CONFIG_PTP_1588_CLOCK_VMW=m -# end of PTP clock support - -CONFIG_PINCTRL=y -CONFIG_GENERIC_PINCTRL_GROUPS=y -CONFIG_PINMUX=y -CONFIG_GENERIC_PINMUX_FUNCTIONS=y -CONFIG_PINCONF=y -CONFIG_GENERIC_PINCONF=y -# CONFIG_DEBUG_PINCTRL is not set -CONFIG_PINCTRL_AS3722=m -CONFIG_PINCTRL_AXP209=m -CONFIG_PINCTRL_AMD=m -CONFIG_PINCTRL_DA9062=m -CONFIG_PINCTRL_MCP23S08_I2C=m -CONFIG_PINCTRL_MCP23S08_SPI=m -CONFIG_PINCTRL_MCP23S08=m -CONFIG_PINCTRL_SINGLE=m -CONFIG_PINCTRL_SX150X=y -CONFIG_PINCTRL_STMFX=m -CONFIG_PINCTRL_MAX77620=m -CONFIG_PINCTRL_PALMAS=m -CONFIG_PINCTRL_RK805=m -CONFIG_PINCTRL_OCELOT=y -CONFIG_PINCTRL_BAYTRAIL=y -CONFIG_PINCTRL_CHERRYVIEW=y -CONFIG_PINCTRL_LYNXPOINT=y -CONFIG_PINCTRL_INTEL=y -CONFIG_PINCTRL_BROXTON=y -CONFIG_PINCTRL_CANNONLAKE=y -CONFIG_PINCTRL_CEDARFORK=y -CONFIG_PINCTRL_DENVERTON=y -# CONFIG_PINCTRL_EMMITSBURG is not set -CONFIG_PINCTRL_GEMINILAKE=y -CONFIG_PINCTRL_ICELAKE=y -CONFIG_PINCTRL_JASPERLAKE=y -CONFIG_PINCTRL_LEWISBURG=y -CONFIG_PINCTRL_SUNRISEPOINT=y -CONFIG_PINCTRL_TIGERLAKE=y -CONFIG_PINCTRL_LOCHNAGAR=m -CONFIG_PINCTRL_MADERA=m -CONFIG_PINCTRL_CS47L15=y -CONFIG_PINCTRL_CS47L35=y -CONFIG_PINCTRL_CS47L85=y -CONFIG_PINCTRL_CS47L90=y -CONFIG_PINCTRL_CS47L92=y -CONFIG_PINCTRL_EQUILIBRIUM=m -CONFIG_GPIOLIB=y -CONFIG_GPIOLIB_FASTPATH_LIMIT=512 -CONFIG_OF_GPIO=y -CONFIG_GPIO_ACPI=y -CONFIG_GPIOLIB_IRQCHIP=y -# CONFIG_DEBUG_GPIO is not set -CONFIG_GPIO_SYSFS=y -CONFIG_GPIO_GENERIC=y -CONFIG_GPIO_MAX730X=m - -# -# Memory mapped GPIO drivers -# -CONFIG_GPIO_74XX_MMIO=m -CONFIG_GPIO_ALTERA=m -CONFIG_GPIO_AMDPT=m -CONFIG_GPIO_CADENCE=m 
-CONFIG_GPIO_DWAPB=m -CONFIG_GPIO_EXAR=m -CONFIG_GPIO_FTGPIO010=y -CONFIG_GPIO_GENERIC_PLATFORM=m -CONFIG_GPIO_GRGPIO=m -CONFIG_GPIO_HLWD=m -CONFIG_GPIO_ICH=m -CONFIG_GPIO_LOGICVC=m -CONFIG_GPIO_MB86S7X=m -CONFIG_GPIO_MENZ127=m -CONFIG_GPIO_SAMA5D2_PIOBU=m -CONFIG_GPIO_SIFIVE=y -CONFIG_GPIO_SIOX=m -CONFIG_GPIO_SYSCON=m -CONFIG_GPIO_VX855=m -CONFIG_GPIO_WCD934X=m -CONFIG_GPIO_XILINX=m -CONFIG_GPIO_AMD_FCH=m -# end of Memory mapped GPIO drivers - -# -# Port-mapped I/O GPIO drivers -# -CONFIG_GPIO_F7188X=m -CONFIG_GPIO_IT87=m -CONFIG_GPIO_SCH=m -CONFIG_GPIO_SCH311X=m -CONFIG_GPIO_WINBOND=m -CONFIG_GPIO_WS16C48=m -# end of Port-mapped I/O GPIO drivers - -# -# I2C GPIO expanders -# -CONFIG_GPIO_ADP5588=m -CONFIG_GPIO_ADNP=m -CONFIG_GPIO_GW_PLD=m -CONFIG_GPIO_MAX7300=m -CONFIG_GPIO_MAX732X=m -CONFIG_GPIO_PCA953X=m -CONFIG_GPIO_PCA953X_IRQ=y -CONFIG_GPIO_PCA9570=m -CONFIG_GPIO_PCF857X=m -CONFIG_GPIO_TPIC2810=m -# end of I2C GPIO expanders - -# -# MFD GPIO expanders -# -CONFIG_GPIO_ADP5520=m -CONFIG_GPIO_ARIZONA=m -CONFIG_GPIO_BD70528=m -CONFIG_GPIO_BD71828=m -CONFIG_GPIO_BD9571MWV=m -CONFIG_GPIO_CRYSTAL_COVE=m -CONFIG_GPIO_DA9052=m -CONFIG_GPIO_DA9055=m -CONFIG_GPIO_DLN2=m -CONFIG_GPIO_JANZ_TTL=m -CONFIG_GPIO_KEMPLD=m -CONFIG_GPIO_LP3943=m -CONFIG_GPIO_LP873X=m -CONFIG_GPIO_LP87565=m -CONFIG_GPIO_MADERA=m -CONFIG_GPIO_MAX77620=m -CONFIG_GPIO_MAX77650=m -CONFIG_GPIO_MSIC=y -CONFIG_GPIO_PALMAS=y -CONFIG_GPIO_RC5T583=y -CONFIG_GPIO_STMPE=y -CONFIG_GPIO_TC3589X=y -CONFIG_GPIO_TPS65086=m -CONFIG_GPIO_TPS65218=m -CONFIG_GPIO_TPS6586X=y -CONFIG_GPIO_TPS65910=y -CONFIG_GPIO_TPS65912=m -CONFIG_GPIO_TPS68470=y -CONFIG_GPIO_TQMX86=m -CONFIG_GPIO_TWL4030=m -CONFIG_GPIO_TWL6040=m -CONFIG_GPIO_UCB1400=m -CONFIG_GPIO_WHISKEY_COVE=m -CONFIG_GPIO_WM831X=m -CONFIG_GPIO_WM8350=m -CONFIG_GPIO_WM8994=m -# end of MFD GPIO expanders - -# -# PCI GPIO expanders -# -CONFIG_GPIO_AMD8111=m -CONFIG_GPIO_ML_IOH=m -CONFIG_GPIO_PCI_IDIO_16=m -CONFIG_GPIO_PCIE_IDIO_24=m -CONFIG_GPIO_RDC321X=m 
-CONFIG_GPIO_SODAVILLE=y -# end of PCI GPIO expanders - -# -# SPI GPIO expanders -# -CONFIG_GPIO_74X164=m -CONFIG_GPIO_MAX3191X=m -CONFIG_GPIO_MAX7301=m -CONFIG_GPIO_MC33880=m -CONFIG_GPIO_PISOSR=m -CONFIG_GPIO_XRA1403=m -CONFIG_GPIO_MOXTET=m -# end of SPI GPIO expanders - -# -# USB GPIO expanders -# -CONFIG_GPIO_VIPERBOARD=m -# end of USB GPIO expanders - -CONFIG_GPIO_AGGREGATOR=m -CONFIG_GPIO_MOCKUP=m -CONFIG_W1=m -CONFIG_W1_CON=y - -# -# 1-wire Bus Masters -# -CONFIG_W1_MASTER_MATROX=m -CONFIG_W1_MASTER_DS2490=m -CONFIG_W1_MASTER_DS2482=m -CONFIG_W1_MASTER_DS1WM=m -CONFIG_W1_MASTER_GPIO=m -CONFIG_W1_MASTER_SGI=m -# end of 1-wire Bus Masters - -# -# 1-wire Slaves -# -CONFIG_W1_SLAVE_THERM=m -CONFIG_W1_SLAVE_SMEM=m -CONFIG_W1_SLAVE_DS2405=m -CONFIG_W1_SLAVE_DS2408=m -# CONFIG_W1_SLAVE_DS2408_READBACK is not set -CONFIG_W1_SLAVE_DS2413=m -CONFIG_W1_SLAVE_DS2406=m -CONFIG_W1_SLAVE_DS2423=m -CONFIG_W1_SLAVE_DS2805=m -CONFIG_W1_SLAVE_DS2430=m -CONFIG_W1_SLAVE_DS2431=m -CONFIG_W1_SLAVE_DS2433=m -# CONFIG_W1_SLAVE_DS2433_CRC is not set -CONFIG_W1_SLAVE_DS2438=m -CONFIG_W1_SLAVE_DS250X=m -CONFIG_W1_SLAVE_DS2780=m -CONFIG_W1_SLAVE_DS2781=m -CONFIG_W1_SLAVE_DS28E04=m -CONFIG_W1_SLAVE_DS28E17=m -# end of 1-wire Slaves - -CONFIG_POWER_AVS=y -CONFIG_QCOM_CPR=m -CONFIG_POWER_RESET=y -CONFIG_POWER_RESET_AS3722=y -CONFIG_POWER_RESET_GPIO=y -CONFIG_POWER_RESET_GPIO_RESTART=y -CONFIG_POWER_RESET_LTC2952=y -CONFIG_POWER_RESET_MT6323=y -CONFIG_POWER_RESET_RESTART=y -CONFIG_POWER_RESET_SYSCON=y -CONFIG_POWER_RESET_SYSCON_POWEROFF=y -CONFIG_REBOOT_MODE=m -CONFIG_SYSCON_REBOOT_MODE=m -CONFIG_NVMEM_REBOOT_MODE=m -CONFIG_POWER_SUPPLY=y -# CONFIG_POWER_SUPPLY_DEBUG is not set -CONFIG_POWER_SUPPLY_HWMON=y -CONFIG_PDA_POWER=m -CONFIG_GENERIC_ADC_BATTERY=m -CONFIG_MAX8925_POWER=m -CONFIG_WM831X_BACKUP=m -CONFIG_WM831X_POWER=m -CONFIG_WM8350_POWER=m -CONFIG_TEST_POWER=m -CONFIG_BATTERY_88PM860X=m -CONFIG_CHARGER_ADP5061=m -CONFIG_BATTERY_ACT8945A=m -CONFIG_BATTERY_CPCAP=m 
-CONFIG_BATTERY_CW2015=m -CONFIG_BATTERY_DS2760=m -CONFIG_BATTERY_DS2780=m -CONFIG_BATTERY_DS2781=m -CONFIG_BATTERY_DS2782=m -CONFIG_BATTERY_LEGO_EV3=m -CONFIG_BATTERY_SBS=m -CONFIG_CHARGER_SBS=m -CONFIG_MANAGER_SBS=m -CONFIG_BATTERY_BQ27XXX=m -CONFIG_BATTERY_BQ27XXX_I2C=m -CONFIG_BATTERY_BQ27XXX_HDQ=m -# CONFIG_BATTERY_BQ27XXX_DT_UPDATES_NVM is not set -CONFIG_BATTERY_DA9030=m -CONFIG_BATTERY_DA9052=m -CONFIG_CHARGER_DA9150=m -CONFIG_BATTERY_DA9150=m -CONFIG_CHARGER_AXP20X=m -CONFIG_BATTERY_AXP20X=m -CONFIG_AXP20X_POWER=m -CONFIG_AXP288_CHARGER=m -CONFIG_AXP288_FUEL_GAUGE=m -CONFIG_BATTERY_MAX17040=m -CONFIG_BATTERY_MAX17042=m -CONFIG_BATTERY_MAX1721X=m -CONFIG_BATTERY_TWL4030_MADC=m -CONFIG_CHARGER_88PM860X=m -CONFIG_CHARGER_PCF50633=m -CONFIG_BATTERY_RX51=m -CONFIG_CHARGER_ISP1704=m -CONFIG_CHARGER_MAX8903=m -CONFIG_CHARGER_TWL4030=m -CONFIG_CHARGER_LP8727=m -CONFIG_CHARGER_LP8788=m -CONFIG_CHARGER_GPIO=m -CONFIG_CHARGER_MANAGER=y -CONFIG_CHARGER_LT3651=m -CONFIG_CHARGER_MAX14577=m -CONFIG_CHARGER_DETECTOR_MAX14656=m -CONFIG_CHARGER_MAX77650=m -CONFIG_CHARGER_MAX77693=m -CONFIG_CHARGER_MAX8997=m -CONFIG_CHARGER_MAX8998=m -CONFIG_CHARGER_MP2629=m -CONFIG_CHARGER_BQ2415X=m -CONFIG_CHARGER_BQ24190=m -CONFIG_CHARGER_BQ24257=m -CONFIG_CHARGER_BQ24735=m -# CONFIG_CHARGER_BQ2515X is not set -CONFIG_CHARGER_BQ25890=m -CONFIG_CHARGER_SMB347=m -CONFIG_CHARGER_TPS65090=m -CONFIG_CHARGER_TPS65217=m -CONFIG_BATTERY_GAUGE_LTC2941=m -CONFIG_BATTERY_RT5033=m -CONFIG_CHARGER_RT9455=m -CONFIG_CHARGER_CROS_USBPD=m -CONFIG_CHARGER_UCS1002=m -CONFIG_CHARGER_BD70528=m -CONFIG_CHARGER_BD99954=m -CONFIG_CHARGER_WILCO=m -CONFIG_HWMON=y -CONFIG_HWMON_VID=m -# CONFIG_HWMON_DEBUG_CHIP is not set - -# -# Native drivers -# -CONFIG_SENSORS_ABITUGURU=m -CONFIG_SENSORS_ABITUGURU3=m -CONFIG_SENSORS_AD7314=m -CONFIG_SENSORS_AD7414=m -CONFIG_SENSORS_AD7418=m -CONFIG_SENSORS_ADM1021=m -CONFIG_SENSORS_ADM1025=m -CONFIG_SENSORS_ADM1026=m -CONFIG_SENSORS_ADM1029=m -CONFIG_SENSORS_ADM1031=m 
-CONFIG_SENSORS_ADM1177=m -CONFIG_SENSORS_ADM9240=m -CONFIG_SENSORS_ADT7X10=m -CONFIG_SENSORS_ADT7310=m -CONFIG_SENSORS_ADT7410=m -CONFIG_SENSORS_ADT7411=m -CONFIG_SENSORS_ADT7462=m -CONFIG_SENSORS_ADT7470=m -CONFIG_SENSORS_ADT7475=m -CONFIG_SENSORS_AS370=m -CONFIG_SENSORS_ASC7621=m -CONFIG_SENSORS_AXI_FAN_CONTROL=m -CONFIG_SENSORS_K8TEMP=m -CONFIG_SENSORS_K10TEMP=m -CONFIG_SENSORS_FAM15H_POWER=m -CONFIG_SENSORS_AMD_ENERGY=m -CONFIG_SENSORS_APPLESMC=m -CONFIG_SENSORS_ASB100=m -CONFIG_SENSORS_ASPEED=m -CONFIG_SENSORS_ATXP1=m -CONFIG_SENSORS_CORSAIR_CPRO=m -CONFIG_SENSORS_DRIVETEMP=m -CONFIG_SENSORS_DS620=m -CONFIG_SENSORS_DS1621=m -CONFIG_SENSORS_DELL_SMM=m -CONFIG_SENSORS_DA9052_ADC=m -CONFIG_SENSORS_DA9055=m -CONFIG_SENSORS_I5K_AMB=m -CONFIG_SENSORS_F71805F=m -CONFIG_SENSORS_F71882FG=m -CONFIG_SENSORS_F75375S=m -CONFIG_SENSORS_GSC=m -CONFIG_SENSORS_MC13783_ADC=m -CONFIG_SENSORS_FSCHMD=m -CONFIG_SENSORS_FTSTEUTATES=m -CONFIG_SENSORS_GL518SM=m -CONFIG_SENSORS_GL520SM=m -CONFIG_SENSORS_G760A=m -CONFIG_SENSORS_G762=m -CONFIG_SENSORS_GPIO_FAN=m -CONFIG_SENSORS_HIH6130=m -CONFIG_SENSORS_IBMAEM=m -CONFIG_SENSORS_IBMPEX=m -CONFIG_SENSORS_IIO_HWMON=m -CONFIG_SENSORS_I5500=m -CONFIG_SENSORS_CORETEMP=m -CONFIG_SENSORS_IT87=m -CONFIG_SENSORS_JC42=m -CONFIG_SENSORS_POWR1220=m -CONFIG_SENSORS_LINEAGE=m -CONFIG_SENSORS_LOCHNAGAR=m -CONFIG_SENSORS_LTC2945=m -CONFIG_SENSORS_LTC2947=m -CONFIG_SENSORS_LTC2947_I2C=m -CONFIG_SENSORS_LTC2947_SPI=m -CONFIG_SENSORS_LTC2990=m -CONFIG_SENSORS_LTC4151=m -CONFIG_SENSORS_LTC4215=m -CONFIG_SENSORS_LTC4222=m -CONFIG_SENSORS_LTC4245=m -CONFIG_SENSORS_LTC4260=m -CONFIG_SENSORS_LTC4261=m -CONFIG_SENSORS_MAX1111=m -CONFIG_SENSORS_MAX16065=m -CONFIG_SENSORS_MAX1619=m -CONFIG_SENSORS_MAX1668=m -CONFIG_SENSORS_MAX197=m -CONFIG_SENSORS_MAX31722=m -CONFIG_SENSORS_MAX31730=m -CONFIG_SENSORS_MAX6621=m -CONFIG_SENSORS_MAX6639=m -CONFIG_SENSORS_MAX6642=m -CONFIG_SENSORS_MAX6650=m -CONFIG_SENSORS_MAX6697=m -CONFIG_SENSORS_MAX31790=m -CONFIG_SENSORS_MCP3021=m 
-CONFIG_SENSORS_MLXREG_FAN=m -CONFIG_SENSORS_TC654=m -CONFIG_SENSORS_MENF21BMC_HWMON=m -CONFIG_SENSORS_ADCXX=m -CONFIG_SENSORS_LM63=m -CONFIG_SENSORS_LM70=m -CONFIG_SENSORS_LM73=m -CONFIG_SENSORS_LM75=m -CONFIG_SENSORS_LM77=m -CONFIG_SENSORS_LM78=m -CONFIG_SENSORS_LM80=m -CONFIG_SENSORS_LM83=m -CONFIG_SENSORS_LM85=m -CONFIG_SENSORS_LM87=m -CONFIG_SENSORS_LM90=m -CONFIG_SENSORS_LM92=m -CONFIG_SENSORS_LM93=m -CONFIG_SENSORS_LM95234=m -CONFIG_SENSORS_LM95241=m -CONFIG_SENSORS_LM95245=m -CONFIG_SENSORS_PC87360=m -CONFIG_SENSORS_PC87427=m -CONFIG_SENSORS_NTC_THERMISTOR=m -CONFIG_SENSORS_NCT6683=m -CONFIG_SENSORS_NCT6775=m -CONFIG_SENSORS_NCT7802=m -CONFIG_SENSORS_NCT7904=m -CONFIG_SENSORS_NPCM7XX=m -CONFIG_SENSORS_PCF8591=m -CONFIG_PMBUS=m -CONFIG_SENSORS_PMBUS=m -CONFIG_SENSORS_ADM1275=m -CONFIG_SENSORS_BEL_PFE=m -CONFIG_SENSORS_IBM_CFFPS=m -CONFIG_SENSORS_INSPUR_IPSPS=m -CONFIG_SENSORS_IR35221=m -CONFIG_SENSORS_IR38064=m -CONFIG_SENSORS_IRPS5401=m -CONFIG_SENSORS_ISL68137=m -CONFIG_SENSORS_LM25066=m -CONFIG_SENSORS_LTC2978=m -# CONFIG_SENSORS_LTC2978_REGULATOR is not set -CONFIG_SENSORS_LTC3815=m -CONFIG_SENSORS_MAX16064=m -CONFIG_SENSORS_MAX16601=m -CONFIG_SENSORS_MAX20730=m -CONFIG_SENSORS_MAX20751=m -CONFIG_SENSORS_MAX31785=m -CONFIG_SENSORS_MAX34440=m -CONFIG_SENSORS_MAX8688=m -CONFIG_SENSORS_PXE1610=m -CONFIG_SENSORS_TPS40422=m -CONFIG_SENSORS_TPS53679=m -CONFIG_SENSORS_UCD9000=m -CONFIG_SENSORS_UCD9200=m -CONFIG_SENSORS_XDPE122=m -CONFIG_SENSORS_ZL6100=m -CONFIG_SENSORS_PWM_FAN=m -CONFIG_SENSORS_SHT15=m -CONFIG_SENSORS_SHT21=m -CONFIG_SENSORS_SHT3x=m -CONFIG_SENSORS_SHTC1=m -CONFIG_SENSORS_SIS5595=m -CONFIG_SENSORS_DME1737=m -CONFIG_SENSORS_EMC1403=m -CONFIG_SENSORS_EMC2103=m -CONFIG_SENSORS_EMC6W201=m -CONFIG_SENSORS_SMSC47M1=m -CONFIG_SENSORS_SMSC47M192=m -CONFIG_SENSORS_SMSC47B397=m -CONFIG_SENSORS_SCH56XX_COMMON=m -CONFIG_SENSORS_SCH5627=m -CONFIG_SENSORS_SCH5636=m -CONFIG_SENSORS_STTS751=m -CONFIG_SENSORS_SMM665=m -CONFIG_SENSORS_ADC128D818=m 
-CONFIG_SENSORS_ADS7828=m -CONFIG_SENSORS_ADS7871=m -CONFIG_SENSORS_AMC6821=m -CONFIG_SENSORS_INA209=m -CONFIG_SENSORS_INA2XX=m -CONFIG_SENSORS_INA3221=m -CONFIG_SENSORS_TC74=m -CONFIG_SENSORS_THMC50=m -CONFIG_SENSORS_TMP102=m -CONFIG_SENSORS_TMP103=m -CONFIG_SENSORS_TMP108=m -CONFIG_SENSORS_TMP401=m -CONFIG_SENSORS_TMP421=m -CONFIG_SENSORS_TMP513=m -CONFIG_SENSORS_VIA_CPUTEMP=m -CONFIG_SENSORS_VIA686A=m -CONFIG_SENSORS_VT1211=m -CONFIG_SENSORS_VT8231=m -CONFIG_SENSORS_W83773G=m -CONFIG_SENSORS_W83781D=m -CONFIG_SENSORS_W83791D=m -CONFIG_SENSORS_W83792D=m -CONFIG_SENSORS_W83793=m -CONFIG_SENSORS_W83795=m -# CONFIG_SENSORS_W83795_FANCTRL is not set -CONFIG_SENSORS_W83L785TS=m -CONFIG_SENSORS_W83L786NG=m -CONFIG_SENSORS_W83627HF=m -CONFIG_SENSORS_W83627EHF=m -CONFIG_SENSORS_WM831X=m -CONFIG_SENSORS_WM8350=m -CONFIG_SENSORS_XGENE=m - -# -# ACPI drivers -# -CONFIG_SENSORS_ACPI_POWER=m -CONFIG_SENSORS_ATK0110=m -CONFIG_THERMAL=y -# CONFIG_THERMAL_NETLINK is not set -# CONFIG_THERMAL_STATISTICS is not set -CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS=100 -CONFIG_THERMAL_HWMON=y -CONFIG_THERMAL_OF=y -CONFIG_THERMAL_WRITABLE_TRIPS=y -CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE=y -# CONFIG_THERMAL_DEFAULT_GOV_FAIR_SHARE is not set -# CONFIG_THERMAL_DEFAULT_GOV_USER_SPACE is not set -# CONFIG_THERMAL_DEFAULT_GOV_POWER_ALLOCATOR is not set -CONFIG_THERMAL_GOV_FAIR_SHARE=y -CONFIG_THERMAL_GOV_STEP_WISE=y -CONFIG_THERMAL_GOV_BANG_BANG=y -CONFIG_THERMAL_GOV_USER_SPACE=y -CONFIG_THERMAL_GOV_POWER_ALLOCATOR=y -CONFIG_CPU_THERMAL=y -CONFIG_CPU_FREQ_THERMAL=y -CONFIG_CPU_IDLE_THERMAL=y -CONFIG_DEVFREQ_THERMAL=y -# CONFIG_THERMAL_EMULATION is not set -CONFIG_THERMAL_MMIO=m -CONFIG_MAX77620_THERMAL=m -CONFIG_DA9062_THERMAL=m - -# -# Intel thermal drivers -# -CONFIG_INTEL_POWERCLAMP=m -CONFIG_X86_PKG_TEMP_THERMAL=m -CONFIG_INTEL_SOC_DTS_IOSF_CORE=m -CONFIG_INTEL_SOC_DTS_THERMAL=m - -# -# ACPI INT340X thermal drivers -# -CONFIG_INT340X_THERMAL=m -CONFIG_ACPI_THERMAL_REL=m 
-CONFIG_INT3406_THERMAL=m -CONFIG_PROC_THERMAL_MMIO_RAPL=y -# end of ACPI INT340X thermal drivers - -CONFIG_INTEL_BXT_PMIC_THERMAL=m -CONFIG_INTEL_PCH_THERMAL=m -# end of Intel thermal drivers - -# CONFIG_TI_SOC_THERMAL is not set -CONFIG_GENERIC_ADC_THERMAL=m -CONFIG_WATCHDOG=y -CONFIG_WATCHDOG_CORE=y -# CONFIG_WATCHDOG_NOWAYOUT is not set -CONFIG_WATCHDOG_HANDLE_BOOT_ENABLED=y -CONFIG_WATCHDOG_OPEN_TIMEOUT=0 -CONFIG_WATCHDOG_SYSFS=y - -# -# Watchdog Pretimeout Governors -# -CONFIG_WATCHDOG_PRETIMEOUT_GOV=y -CONFIG_WATCHDOG_PRETIMEOUT_GOV_SEL=m -CONFIG_WATCHDOG_PRETIMEOUT_GOV_NOOP=m -CONFIG_WATCHDOG_PRETIMEOUT_GOV_PANIC=y -# CONFIG_WATCHDOG_PRETIMEOUT_DEFAULT_GOV_NOOP is not set -CONFIG_WATCHDOG_PRETIMEOUT_DEFAULT_GOV_PANIC=y - -# -# Watchdog Device Drivers -# -CONFIG_SOFT_WATCHDOG=m -# CONFIG_SOFT_WATCHDOG_PRETIMEOUT is not set -CONFIG_BD70528_WATCHDOG=m -CONFIG_DA9052_WATCHDOG=m -CONFIG_DA9055_WATCHDOG=m -CONFIG_DA9063_WATCHDOG=m -CONFIG_DA9062_WATCHDOG=m -CONFIG_GPIO_WATCHDOG=m -CONFIG_MENF21BMC_WATCHDOG=m -CONFIG_MENZ069_WATCHDOG=m -CONFIG_WDAT_WDT=m -CONFIG_WM831X_WATCHDOG=m -CONFIG_WM8350_WATCHDOG=m -CONFIG_XILINX_WATCHDOG=m -CONFIG_ZIIRAVE_WATCHDOG=m -CONFIG_RAVE_SP_WATCHDOG=m -CONFIG_MLX_WDT=m -CONFIG_CADENCE_WATCHDOG=m -CONFIG_DW_WATCHDOG=m -CONFIG_RN5T618_WATCHDOG=m -CONFIG_TWL4030_WATCHDOG=m -CONFIG_MAX63XX_WATCHDOG=m -CONFIG_MAX77620_WATCHDOG=m -CONFIG_RETU_WATCHDOG=m -CONFIG_STPMIC1_WATCHDOG=m -CONFIG_ACQUIRE_WDT=m -CONFIG_ADVANTECH_WDT=m -CONFIG_ALIM1535_WDT=m -CONFIG_ALIM7101_WDT=m -CONFIG_EBC_C384_WDT=m -CONFIG_F71808E_WDT=m -CONFIG_SP5100_TCO=m -CONFIG_SBC_FITPC2_WATCHDOG=m -CONFIG_EUROTECH_WDT=m -CONFIG_IB700_WDT=m -CONFIG_IBMASR=m -CONFIG_WAFER_WDT=m -CONFIG_I6300ESB_WDT=m -CONFIG_IE6XX_WDT=m -CONFIG_ITCO_WDT=m -CONFIG_ITCO_VENDOR_SUPPORT=y -CONFIG_IT8712F_WDT=m -CONFIG_IT87_WDT=m -CONFIG_HP_WATCHDOG=m -CONFIG_HPWDT_NMI_DECODING=y -CONFIG_KEMPLD_WDT=m -CONFIG_SC1200_WDT=m -CONFIG_PC87413_WDT=m -CONFIG_NV_TCO=m -CONFIG_60XX_WDT=m 
-CONFIG_CPU5_WDT=m -CONFIG_SMSC_SCH311X_WDT=m -CONFIG_SMSC37B787_WDT=m -CONFIG_TQMX86_WDT=m -CONFIG_VIA_WDT=m -CONFIG_W83627HF_WDT=m -CONFIG_W83877F_WDT=m -CONFIG_W83977F_WDT=m -CONFIG_MACHZ_WDT=m -CONFIG_SBC_EPX_C3_WATCHDOG=m -CONFIG_INTEL_MEI_WDT=m -CONFIG_NI903X_WDT=m -CONFIG_NIC7018_WDT=m -CONFIG_MEN_A21_WDT=m -CONFIG_XEN_WDT=m - -# -# PCI-based Watchdog Cards -# -CONFIG_PCIPCWATCHDOG=m -CONFIG_WDTPCI=m - -# -# USB-based Watchdog Cards -# -CONFIG_USBPCWATCHDOG=m -CONFIG_SSB_POSSIBLE=y -CONFIG_SSB=m -CONFIG_SSB_SPROM=y -CONFIG_SSB_BLOCKIO=y -CONFIG_SSB_PCIHOST_POSSIBLE=y -CONFIG_SSB_PCIHOST=y -CONFIG_SSB_B43_PCI_BRIDGE=y -CONFIG_SSB_PCMCIAHOST_POSSIBLE=y -CONFIG_SSB_PCMCIAHOST=y -CONFIG_SSB_SDIOHOST_POSSIBLE=y -CONFIG_SSB_SDIOHOST=y -CONFIG_SSB_DRIVER_PCICORE_POSSIBLE=y -CONFIG_SSB_DRIVER_PCICORE=y -CONFIG_SSB_DRIVER_GPIO=y -CONFIG_BCMA_POSSIBLE=y -CONFIG_BCMA=m -CONFIG_BCMA_BLOCKIO=y -CONFIG_BCMA_HOST_PCI_POSSIBLE=y -CONFIG_BCMA_HOST_PCI=y -# CONFIG_BCMA_HOST_SOC is not set -CONFIG_BCMA_DRIVER_PCI=y -CONFIG_BCMA_DRIVER_GMAC_CMN=y -CONFIG_BCMA_DRIVER_GPIO=y -# CONFIG_BCMA_DEBUG is not set - -# -# Multifunction device drivers -# -CONFIG_MFD_CORE=y -CONFIG_MFD_ACT8945A=m -CONFIG_MFD_AS3711=y -CONFIG_MFD_AS3722=m -CONFIG_PMIC_ADP5520=y -CONFIG_MFD_AAT2870_CORE=y -CONFIG_MFD_ATMEL_FLEXCOM=m -CONFIG_MFD_ATMEL_HLCDC=m -CONFIG_MFD_BCM590XX=m -CONFIG_MFD_BD9571MWV=m -CONFIG_MFD_AXP20X=m -CONFIG_MFD_AXP20X_I2C=m -CONFIG_MFD_CROS_EC_DEV=m -CONFIG_MFD_MADERA=m -CONFIG_MFD_MADERA_I2C=m -CONFIG_MFD_MADERA_SPI=m -CONFIG_MFD_CS47L15=y -CONFIG_MFD_CS47L35=y -CONFIG_MFD_CS47L85=y -CONFIG_MFD_CS47L90=y -CONFIG_MFD_CS47L92=y -CONFIG_PMIC_DA903X=y -CONFIG_PMIC_DA9052=y -CONFIG_MFD_DA9052_SPI=y -CONFIG_MFD_DA9052_I2C=y -CONFIG_MFD_DA9055=y -CONFIG_MFD_DA9062=m -CONFIG_MFD_DA9063=m -CONFIG_MFD_DA9150=m -CONFIG_MFD_DLN2=m -CONFIG_MFD_GATEWORKS_GSC=m -CONFIG_MFD_MC13XXX=m -CONFIG_MFD_MC13XXX_SPI=m -CONFIG_MFD_MC13XXX_I2C=m -CONFIG_MFD_MP2629=m -CONFIG_MFD_HI6421_PMIC=m 
-CONFIG_HTC_PASIC3=m -CONFIG_HTC_I2CPLD=y -CONFIG_MFD_INTEL_QUARK_I2C_GPIO=m -CONFIG_LPC_ICH=m -CONFIG_LPC_SCH=m -CONFIG_INTEL_SOC_PMIC=y -CONFIG_INTEL_SOC_PMIC_BXTWC=m -CONFIG_INTEL_SOC_PMIC_CHTWC=y -CONFIG_INTEL_SOC_PMIC_CHTDC_TI=m -CONFIG_INTEL_SOC_PMIC_MRFLD=m -CONFIG_MFD_INTEL_LPSS=m -CONFIG_MFD_INTEL_LPSS_ACPI=m -CONFIG_MFD_INTEL_LPSS_PCI=m -CONFIG_MFD_INTEL_MSIC=y -CONFIG_MFD_INTEL_PMC_BXT=m -CONFIG_MFD_IQS62X=m -CONFIG_MFD_JANZ_CMODIO=m -CONFIG_MFD_KEMPLD=m -CONFIG_MFD_88PM800=m -CONFIG_MFD_88PM805=m -CONFIG_MFD_88PM860X=y -CONFIG_MFD_MAX14577=m -CONFIG_MFD_MAX77620=y -CONFIG_MFD_MAX77650=m -CONFIG_MFD_MAX77686=m -CONFIG_MFD_MAX77693=m -CONFIG_MFD_MAX77843=y -CONFIG_MFD_MAX8907=m -CONFIG_MFD_MAX8925=y -CONFIG_MFD_MAX8997=y -CONFIG_MFD_MAX8998=y -CONFIG_MFD_MT6360=m -CONFIG_MFD_MT6397=m -CONFIG_MFD_MENF21BMC=m -CONFIG_EZX_PCAP=y -CONFIG_MFD_CPCAP=m -CONFIG_MFD_VIPERBOARD=m -CONFIG_MFD_RETU=m -CONFIG_MFD_PCF50633=m -CONFIG_PCF50633_ADC=m -CONFIG_PCF50633_GPIO=m -CONFIG_UCB1400_CORE=m -CONFIG_MFD_RDC321X=m -CONFIG_MFD_RT5033=m -CONFIG_MFD_RC5T583=y -CONFIG_MFD_RK808=m -CONFIG_MFD_RN5T618=m -CONFIG_MFD_SEC_CORE=y -CONFIG_MFD_SI476X_CORE=m -CONFIG_MFD_SM501=m -CONFIG_MFD_SM501_GPIO=y -CONFIG_MFD_SKY81452=m -CONFIG_ABX500_CORE=y -CONFIG_AB3100_CORE=y -CONFIG_AB3100_OTP=y -CONFIG_MFD_STMPE=y - -# -# STMicroelectronics STMPE Interface Drivers -# -CONFIG_STMPE_I2C=y -CONFIG_STMPE_SPI=y -# end of STMicroelectronics STMPE Interface Drivers - -CONFIG_MFD_SYSCON=y -CONFIG_MFD_TI_AM335X_TSCADC=m -CONFIG_MFD_LP3943=m -CONFIG_MFD_LP8788=y -CONFIG_MFD_TI_LMU=m -CONFIG_MFD_PALMAS=y -CONFIG_TPS6105X=m -CONFIG_TPS65010=m -CONFIG_TPS6507X=m -CONFIG_MFD_TPS65086=m -CONFIG_MFD_TPS65090=y -CONFIG_MFD_TPS65217=m -CONFIG_MFD_TPS68470=y -CONFIG_MFD_TI_LP873X=m -CONFIG_MFD_TI_LP87565=m -CONFIG_MFD_TPS65218=m -CONFIG_MFD_TPS6586X=y -CONFIG_MFD_TPS65910=y -CONFIG_MFD_TPS65912=m -CONFIG_MFD_TPS65912_I2C=m -CONFIG_MFD_TPS65912_SPI=m -CONFIG_MFD_TPS80031=y -CONFIG_TWL4030_CORE=y 
-CONFIG_MFD_TWL4030_AUDIO=y -CONFIG_TWL6040_CORE=y -CONFIG_MFD_WL1273_CORE=m -CONFIG_MFD_LM3533=m -CONFIG_MFD_TC3589X=y -CONFIG_MFD_TQMX86=m -CONFIG_MFD_VX855=m -CONFIG_MFD_LOCHNAGAR=y -CONFIG_MFD_ARIZONA=y -CONFIG_MFD_ARIZONA_I2C=m -CONFIG_MFD_ARIZONA_SPI=m -CONFIG_MFD_CS47L24=y -CONFIG_MFD_WM5102=y -CONFIG_MFD_WM5110=y -CONFIG_MFD_WM8997=y -CONFIG_MFD_WM8998=y -CONFIG_MFD_WM8400=y -CONFIG_MFD_WM831X=y -CONFIG_MFD_WM831X_I2C=y -CONFIG_MFD_WM831X_SPI=y -CONFIG_MFD_WM8350=y -CONFIG_MFD_WM8350_I2C=y -CONFIG_MFD_WM8994=m -CONFIG_MFD_ROHM_BD718XX=m -CONFIG_MFD_ROHM_BD70528=m -CONFIG_MFD_ROHM_BD71828=m -CONFIG_MFD_STPMIC1=m -CONFIG_MFD_STMFX=m -CONFIG_MFD_WCD934X=m -CONFIG_RAVE_SP_CORE=m -# end of Multifunction device drivers - -CONFIG_REGULATOR=y -# CONFIG_REGULATOR_DEBUG is not set -CONFIG_REGULATOR_FIXED_VOLTAGE=m -CONFIG_REGULATOR_VIRTUAL_CONSUMER=m -CONFIG_REGULATOR_USERSPACE_CONSUMER=m -CONFIG_REGULATOR_88PG86X=m -CONFIG_REGULATOR_88PM800=m -CONFIG_REGULATOR_88PM8607=m -CONFIG_REGULATOR_ACT8865=m -CONFIG_REGULATOR_ACT8945A=m -CONFIG_REGULATOR_AD5398=m -CONFIG_REGULATOR_AAT2870=m -CONFIG_REGULATOR_AB3100=m -CONFIG_REGULATOR_ARIZONA_LDO1=m -CONFIG_REGULATOR_ARIZONA_MICSUPP=m -CONFIG_REGULATOR_AS3711=m -CONFIG_REGULATOR_AS3722=m -CONFIG_REGULATOR_AXP20X=m -CONFIG_REGULATOR_BCM590XX=m -CONFIG_REGULATOR_BD70528=m -CONFIG_REGULATOR_BD71828=m -CONFIG_REGULATOR_BD718XX=m -CONFIG_REGULATOR_BD9571MWV=m -CONFIG_REGULATOR_CPCAP=m -CONFIG_REGULATOR_CROS_EC=m -CONFIG_REGULATOR_DA903X=m -CONFIG_REGULATOR_DA9052=m -CONFIG_REGULATOR_DA9055=m -CONFIG_REGULATOR_DA9062=m -CONFIG_REGULATOR_DA9063=m -CONFIG_REGULATOR_DA9210=m -CONFIG_REGULATOR_DA9211=m -CONFIG_REGULATOR_FAN53555=m -CONFIG_REGULATOR_FAN53880=m -CONFIG_REGULATOR_GPIO=m -CONFIG_REGULATOR_HI6421=m -CONFIG_REGULATOR_HI6421V530=m -CONFIG_REGULATOR_ISL9305=m -CONFIG_REGULATOR_ISL6271A=m -CONFIG_REGULATOR_LM363X=m -CONFIG_REGULATOR_LOCHNAGAR=m -CONFIG_REGULATOR_LP3971=m -CONFIG_REGULATOR_LP3972=m -CONFIG_REGULATOR_LP872X=m 
-CONFIG_REGULATOR_LP873X=m -CONFIG_REGULATOR_LP8755=m -CONFIG_REGULATOR_LP87565=m -CONFIG_REGULATOR_LP8788=m -CONFIG_REGULATOR_LTC3589=m -CONFIG_REGULATOR_LTC3676=m -CONFIG_REGULATOR_MAX14577=m -CONFIG_REGULATOR_MAX1586=m -CONFIG_REGULATOR_MAX77620=m -CONFIG_REGULATOR_MAX77650=m -CONFIG_REGULATOR_MAX8649=m -CONFIG_REGULATOR_MAX8660=m -CONFIG_REGULATOR_MAX8907=m -CONFIG_REGULATOR_MAX8925=m -CONFIG_REGULATOR_MAX8952=m -CONFIG_REGULATOR_MAX8973=m -CONFIG_REGULATOR_MAX8997=m -CONFIG_REGULATOR_MAX8998=m -CONFIG_REGULATOR_MAX77686=m -CONFIG_REGULATOR_MAX77693=m -CONFIG_REGULATOR_MAX77802=m -CONFIG_REGULATOR_MAX77826=m -CONFIG_REGULATOR_MC13XXX_CORE=m -CONFIG_REGULATOR_MC13783=m -CONFIG_REGULATOR_MC13892=m -CONFIG_REGULATOR_MCP16502=m -CONFIG_REGULATOR_MP5416=m -CONFIG_REGULATOR_MP8859=m -CONFIG_REGULATOR_MP886X=m -CONFIG_REGULATOR_MPQ7920=m -CONFIG_REGULATOR_MT6311=m -CONFIG_REGULATOR_MT6323=m -CONFIG_REGULATOR_MT6358=m -CONFIG_REGULATOR_MT6397=m -CONFIG_REGULATOR_PALMAS=m -CONFIG_REGULATOR_PCA9450=m -CONFIG_REGULATOR_PCAP=m -CONFIG_REGULATOR_PCF50633=m -CONFIG_REGULATOR_PFUZE100=m -CONFIG_REGULATOR_PV88060=m -CONFIG_REGULATOR_PV88080=m -CONFIG_REGULATOR_PV88090=m -CONFIG_REGULATOR_PWM=m -CONFIG_REGULATOR_QCOM_SPMI=m -CONFIG_REGULATOR_QCOM_USB_VBUS=m -CONFIG_REGULATOR_RC5T583=m -CONFIG_REGULATOR_RK808=m -CONFIG_REGULATOR_RN5T618=m -CONFIG_REGULATOR_ROHM=m -CONFIG_REGULATOR_RT5033=m -CONFIG_REGULATOR_S2MPA01=m -CONFIG_REGULATOR_S2MPS11=m -CONFIG_REGULATOR_S5M8767=m -CONFIG_REGULATOR_SKY81452=m -CONFIG_REGULATOR_SLG51000=m -CONFIG_REGULATOR_STPMIC1=m -CONFIG_REGULATOR_SY8106A=m -CONFIG_REGULATOR_SY8824X=m -CONFIG_REGULATOR_SY8827N=m -CONFIG_REGULATOR_TPS51632=m -CONFIG_REGULATOR_TPS6105X=m -CONFIG_REGULATOR_TPS62360=m -CONFIG_REGULATOR_TPS65023=m -CONFIG_REGULATOR_TPS6507X=m -CONFIG_REGULATOR_TPS65086=m -CONFIG_REGULATOR_TPS65090=m -CONFIG_REGULATOR_TPS65132=m -CONFIG_REGULATOR_TPS65217=m -CONFIG_REGULATOR_TPS65218=m -CONFIG_REGULATOR_TPS6524X=m 
-CONFIG_REGULATOR_TPS6586X=m -CONFIG_REGULATOR_TPS65910=m -CONFIG_REGULATOR_TPS65912=m -CONFIG_REGULATOR_TPS80031=m -CONFIG_REGULATOR_TWL4030=m -CONFIG_REGULATOR_VCTRL=m -CONFIG_REGULATOR_WM831X=m -CONFIG_REGULATOR_WM8350=m -CONFIG_REGULATOR_WM8400=m -CONFIG_REGULATOR_WM8994=m -CONFIG_REGULATOR_QCOM_LABIBB=m -CONFIG_RC_CORE=m -CONFIG_RC_MAP=m -CONFIG_LIRC=y -CONFIG_RC_DECODERS=y -CONFIG_IR_NEC_DECODER=m -CONFIG_IR_RC5_DECODER=m -CONFIG_IR_RC6_DECODER=m -CONFIG_IR_JVC_DECODER=m -CONFIG_IR_SONY_DECODER=m -CONFIG_IR_SANYO_DECODER=m -CONFIG_IR_SHARP_DECODER=m -CONFIG_IR_MCE_KBD_DECODER=m -CONFIG_IR_XMP_DECODER=m -CONFIG_IR_IMON_DECODER=m -CONFIG_IR_RCMM_DECODER=m -CONFIG_RC_DEVICES=y -CONFIG_RC_ATI_REMOTE=m -CONFIG_IR_ENE=m -CONFIG_IR_HIX5HD2=m -CONFIG_IR_IMON=m -CONFIG_IR_IMON_RAW=m -CONFIG_IR_MCEUSB=m -CONFIG_IR_ITE_CIR=m -CONFIG_IR_FINTEK=m -CONFIG_IR_NUVOTON=m -CONFIG_IR_REDRAT3=m -CONFIG_IR_SPI=m -CONFIG_IR_STREAMZAP=m -CONFIG_IR_WINBOND_CIR=m -CONFIG_IR_IGORPLUGUSB=m -CONFIG_IR_IGUANA=m -CONFIG_IR_TTUSBIR=m -CONFIG_RC_LOOPBACK=m -CONFIG_IR_GPIO_CIR=m -CONFIG_IR_GPIO_TX=m -CONFIG_IR_PWM_TX=m -CONFIG_IR_SERIAL=m -CONFIG_IR_SERIAL_TRANSMITTER=y -CONFIG_IR_SIR=m -CONFIG_RC_XBOX_DVD=m -CONFIG_IR_TOY=m -CONFIG_CEC_CORE=m -CONFIG_CEC_NOTIFIER=y -CONFIG_CEC_PIN=y -CONFIG_MEDIA_CEC_RC=y -# CONFIG_CEC_PIN_ERROR_INJ is not set -CONFIG_MEDIA_CEC_SUPPORT=y -CONFIG_CEC_CH7322=m -CONFIG_CEC_CROS_EC=m -CONFIG_CEC_GPIO=m -CONFIG_CEC_SECO=m -CONFIG_CEC_SECO_RC=y -CONFIG_USB_PULSE8_CEC=m -CONFIG_USB_RAINSHADOW_CEC=m -CONFIG_MEDIA_SUPPORT=m -# CONFIG_MEDIA_SUPPORT_FILTER is not set -CONFIG_MEDIA_SUBDRV_AUTOSELECT=y - -# -# Media device types -# -CONFIG_MEDIA_CAMERA_SUPPORT=y -CONFIG_MEDIA_ANALOG_TV_SUPPORT=y -CONFIG_MEDIA_DIGITAL_TV_SUPPORT=y -CONFIG_MEDIA_RADIO_SUPPORT=y -CONFIG_MEDIA_SDR_SUPPORT=y -CONFIG_MEDIA_PLATFORM_SUPPORT=y -CONFIG_MEDIA_TEST_SUPPORT=y -# end of Media device types - -# -# Media core support -# -CONFIG_VIDEO_DEV=m -CONFIG_MEDIA_CONTROLLER=y -CONFIG_DVB_CORE=m 
-# end of Media core support - -# -# Video4Linux options -# -CONFIG_VIDEO_V4L2=m -CONFIG_VIDEO_V4L2_I2C=y -CONFIG_VIDEO_V4L2_SUBDEV_API=y -# CONFIG_VIDEO_ADV_DEBUG is not set -# CONFIG_VIDEO_FIXED_MINOR_RANGES is not set -CONFIG_VIDEO_TUNER=m -CONFIG_V4L2_MEM2MEM_DEV=m -CONFIG_V4L2_FLASH_LED_CLASS=m -CONFIG_V4L2_FWNODE=m -CONFIG_VIDEOBUF_GEN=m -CONFIG_VIDEOBUF_DMA_SG=m -CONFIG_VIDEOBUF_VMALLOC=m -# end of Video4Linux options - -# -# Media controller options -# -CONFIG_MEDIA_CONTROLLER_DVB=y -CONFIG_MEDIA_CONTROLLER_REQUEST_API=y - -# -# Please notice that the enabled Media controller Request API is EXPERIMENTAL -# -# end of Media controller options - -# -# Digital TV options -# -CONFIG_DVB_MMAP=y -CONFIG_DVB_NET=y -CONFIG_DVB_MAX_ADAPTERS=16 -# CONFIG_DVB_DYNAMIC_MINORS is not set -# CONFIG_DVB_DEMUX_SECTION_LOSS_LOG is not set -# CONFIG_DVB_ULE_DEBUG is not set -# end of Digital TV options - -# -# Media drivers -# -CONFIG_TTPCI_EEPROM=m -CONFIG_MEDIA_USB_SUPPORT=y - -# -# Webcam devices -# -CONFIG_USB_VIDEO_CLASS=m -CONFIG_USB_VIDEO_CLASS_INPUT_EVDEV=y -CONFIG_USB_GSPCA=m -CONFIG_USB_M5602=m -CONFIG_USB_STV06XX=m -CONFIG_USB_GL860=m -CONFIG_USB_GSPCA_BENQ=m -CONFIG_USB_GSPCA_CONEX=m -CONFIG_USB_GSPCA_CPIA1=m -CONFIG_USB_GSPCA_DTCS033=m -CONFIG_USB_GSPCA_ETOMS=m -CONFIG_USB_GSPCA_FINEPIX=m -CONFIG_USB_GSPCA_JEILINJ=m -CONFIG_USB_GSPCA_JL2005BCD=m -CONFIG_USB_GSPCA_KINECT=m -CONFIG_USB_GSPCA_KONICA=m -CONFIG_USB_GSPCA_MARS=m -CONFIG_USB_GSPCA_MR97310A=m -CONFIG_USB_GSPCA_NW80X=m -CONFIG_USB_GSPCA_OV519=m -CONFIG_USB_GSPCA_OV534=m -CONFIG_USB_GSPCA_OV534_9=m -CONFIG_USB_GSPCA_PAC207=m -CONFIG_USB_GSPCA_PAC7302=m -CONFIG_USB_GSPCA_PAC7311=m -CONFIG_USB_GSPCA_SE401=m -CONFIG_USB_GSPCA_SN9C2028=m -CONFIG_USB_GSPCA_SN9C20X=m -CONFIG_USB_GSPCA_SONIXB=m -CONFIG_USB_GSPCA_SONIXJ=m -CONFIG_USB_GSPCA_SPCA500=m -CONFIG_USB_GSPCA_SPCA501=m -CONFIG_USB_GSPCA_SPCA505=m -CONFIG_USB_GSPCA_SPCA506=m -CONFIG_USB_GSPCA_SPCA508=m -CONFIG_USB_GSPCA_SPCA561=m -CONFIG_USB_GSPCA_SPCA1528=m 
-CONFIG_USB_GSPCA_SQ905=m -CONFIG_USB_GSPCA_SQ905C=m -CONFIG_USB_GSPCA_SQ930X=m -CONFIG_USB_GSPCA_STK014=m -CONFIG_USB_GSPCA_STK1135=m -CONFIG_USB_GSPCA_STV0680=m -CONFIG_USB_GSPCA_SUNPLUS=m -CONFIG_USB_GSPCA_T613=m -CONFIG_USB_GSPCA_TOPRO=m -CONFIG_USB_GSPCA_TOUPTEK=m -CONFIG_USB_GSPCA_TV8532=m -CONFIG_USB_GSPCA_VC032X=m -CONFIG_USB_GSPCA_VICAM=m -CONFIG_USB_GSPCA_XIRLINK_CIT=m -CONFIG_USB_GSPCA_ZC3XX=m -CONFIG_USB_PWC=m -# CONFIG_USB_PWC_DEBUG is not set -CONFIG_USB_PWC_INPUT_EVDEV=y -CONFIG_VIDEO_CPIA2=m -CONFIG_USB_ZR364XX=m -CONFIG_USB_STKWEBCAM=m -CONFIG_USB_S2255=m -CONFIG_VIDEO_USBTV=m - -# -# Analog TV USB devices -# -CONFIG_VIDEO_PVRUSB2=m -CONFIG_VIDEO_PVRUSB2_SYSFS=y -CONFIG_VIDEO_PVRUSB2_DVB=y -# CONFIG_VIDEO_PVRUSB2_DEBUGIFC is not set -CONFIG_VIDEO_HDPVR=m -CONFIG_VIDEO_STK1160_COMMON=m -CONFIG_VIDEO_STK1160=m -CONFIG_VIDEO_GO7007=m -CONFIG_VIDEO_GO7007_USB=m -CONFIG_VIDEO_GO7007_LOADER=m -CONFIG_VIDEO_GO7007_USB_S2250_BOARD=m - -# -# Analog/digital TV USB devices -# -CONFIG_VIDEO_AU0828=m -CONFIG_VIDEO_AU0828_V4L2=y -CONFIG_VIDEO_AU0828_RC=y -CONFIG_VIDEO_CX231XX=m -CONFIG_VIDEO_CX231XX_RC=y -CONFIG_VIDEO_CX231XX_ALSA=m -CONFIG_VIDEO_CX231XX_DVB=m -CONFIG_VIDEO_TM6000=m -CONFIG_VIDEO_TM6000_ALSA=m -CONFIG_VIDEO_TM6000_DVB=m - -# -# Digital TV USB devices -# -CONFIG_DVB_USB=m -# CONFIG_DVB_USB_DEBUG is not set -CONFIG_DVB_USB_DIB3000MC=m -CONFIG_DVB_USB_A800=m -CONFIG_DVB_USB_DIBUSB_MB=m -CONFIG_DVB_USB_DIBUSB_MB_FAULTY=y -CONFIG_DVB_USB_DIBUSB_MC=m -CONFIG_DVB_USB_DIB0700=m -CONFIG_DVB_USB_UMT_010=m -CONFIG_DVB_USB_CXUSB=m -CONFIG_DVB_USB_CXUSB_ANALOG=y -CONFIG_DVB_USB_M920X=m -CONFIG_DVB_USB_DIGITV=m -CONFIG_DVB_USB_VP7045=m -CONFIG_DVB_USB_VP702X=m -CONFIG_DVB_USB_GP8PSK=m -CONFIG_DVB_USB_NOVA_T_USB2=m -CONFIG_DVB_USB_TTUSB2=m -CONFIG_DVB_USB_DTT200U=m -CONFIG_DVB_USB_OPERA1=m -CONFIG_DVB_USB_AF9005=m -CONFIG_DVB_USB_AF9005_REMOTE=m -CONFIG_DVB_USB_PCTV452E=m -CONFIG_DVB_USB_DW2102=m -CONFIG_DVB_USB_CINERGY_T2=m -CONFIG_DVB_USB_DTV5100=m 
-CONFIG_DVB_USB_AZ6027=m -CONFIG_DVB_USB_TECHNISAT_USB2=m -CONFIG_DVB_USB_V2=m -CONFIG_DVB_USB_AF9015=m -CONFIG_DVB_USB_AF9035=m -CONFIG_DVB_USB_ANYSEE=m -CONFIG_DVB_USB_AU6610=m -CONFIG_DVB_USB_AZ6007=m -CONFIG_DVB_USB_CE6230=m -CONFIG_DVB_USB_EC168=m -CONFIG_DVB_USB_GL861=m -CONFIG_DVB_USB_LME2510=m -CONFIG_DVB_USB_MXL111SF=m -CONFIG_DVB_USB_RTL28XXU=m -CONFIG_DVB_USB_DVBSKY=m -CONFIG_DVB_USB_ZD1301=m -CONFIG_DVB_TTUSB_BUDGET=m -CONFIG_DVB_TTUSB_DEC=m -CONFIG_SMS_USB_DRV=m -CONFIG_DVB_B2C2_FLEXCOP_USB=m -# CONFIG_DVB_B2C2_FLEXCOP_USB_DEBUG is not set -CONFIG_DVB_AS102=m - -# -# Webcam, TV (analog/digital) USB devices -# -CONFIG_VIDEO_EM28XX=m -CONFIG_VIDEO_EM28XX_V4L2=m -CONFIG_VIDEO_EM28XX_ALSA=m -CONFIG_VIDEO_EM28XX_DVB=m -CONFIG_VIDEO_EM28XX_RC=m - -# -# Software defined radio USB devices -# -CONFIG_USB_AIRSPY=m -CONFIG_USB_HACKRF=m -CONFIG_USB_MSI2500=m -CONFIG_MEDIA_PCI_SUPPORT=y - -# -# Media capture support -# -CONFIG_VIDEO_MEYE=m -CONFIG_VIDEO_SOLO6X10=m -CONFIG_VIDEO_TW5864=m -CONFIG_VIDEO_TW68=m -CONFIG_VIDEO_TW686X=m - -# -# Media capture/analog TV support -# -CONFIG_VIDEO_IVTV=m -# CONFIG_VIDEO_IVTV_DEPRECATED_IOCTLS is not set -CONFIG_VIDEO_IVTV_ALSA=m -CONFIG_VIDEO_FB_IVTV=m -# CONFIG_VIDEO_FB_IVTV_FORCE_PAT is not set -CONFIG_VIDEO_HEXIUM_GEMINI=m -CONFIG_VIDEO_HEXIUM_ORION=m -CONFIG_VIDEO_MXB=m -CONFIG_VIDEO_DT3155=m - -# -# Media capture/analog/hybrid TV support -# -CONFIG_VIDEO_CX18=m -CONFIG_VIDEO_CX18_ALSA=m -CONFIG_VIDEO_CX23885=m -CONFIG_MEDIA_ALTERA_CI=m -CONFIG_VIDEO_CX25821=m -CONFIG_VIDEO_CX25821_ALSA=m -CONFIG_VIDEO_CX88=m -CONFIG_VIDEO_CX88_ALSA=m -CONFIG_VIDEO_CX88_BLACKBIRD=m -CONFIG_VIDEO_CX88_DVB=m -CONFIG_VIDEO_CX88_ENABLE_VP3054=y -CONFIG_VIDEO_CX88_VP3054=m -CONFIG_VIDEO_CX88_MPEG=m -CONFIG_VIDEO_BT848=m -CONFIG_DVB_BT8XX=m -CONFIG_VIDEO_SAA7134=m -CONFIG_VIDEO_SAA7134_ALSA=m -CONFIG_VIDEO_SAA7134_RC=y -CONFIG_VIDEO_SAA7134_DVB=m -CONFIG_VIDEO_SAA7134_GO7007=m -CONFIG_VIDEO_SAA7164=m - -# -# Media digital TV PCI Adapters -# 
-CONFIG_DVB_AV7110_IR=y -CONFIG_DVB_AV7110=m -CONFIG_DVB_AV7110_OSD=y -CONFIG_DVB_BUDGET_CORE=m -CONFIG_DVB_BUDGET=m -CONFIG_DVB_BUDGET_CI=m -CONFIG_DVB_BUDGET_AV=m -CONFIG_DVB_BUDGET_PATCH=m -CONFIG_DVB_B2C2_FLEXCOP_PCI=m -# CONFIG_DVB_B2C2_FLEXCOP_PCI_DEBUG is not set -CONFIG_DVB_PLUTO2=m -CONFIG_DVB_DM1105=m -CONFIG_DVB_PT1=m -CONFIG_DVB_PT3=m -CONFIG_MANTIS_CORE=m -CONFIG_DVB_MANTIS=m -CONFIG_DVB_HOPPER=m -CONFIG_DVB_NGENE=m -CONFIG_DVB_DDBRIDGE=m -# CONFIG_DVB_DDBRIDGE_MSIENABLE is not set -CONFIG_DVB_SMIPCIE=m -CONFIG_DVB_NETUP_UNIDVB=m -CONFIG_VIDEO_IPU3_CIO2=m -CONFIG_RADIO_ADAPTERS=y -CONFIG_RADIO_TEA575X=m -CONFIG_RADIO_SI470X=m -CONFIG_USB_SI470X=m -CONFIG_I2C_SI470X=m -CONFIG_RADIO_SI4713=m -CONFIG_USB_SI4713=m -CONFIG_PLATFORM_SI4713=m -CONFIG_I2C_SI4713=m -CONFIG_RADIO_SI476X=m -CONFIG_USB_MR800=m -CONFIG_USB_DSBR=m -CONFIG_RADIO_MAXIRADIO=m -CONFIG_RADIO_SHARK=m -CONFIG_RADIO_SHARK2=m -CONFIG_USB_KEENE=m -CONFIG_USB_RAREMONO=m -CONFIG_USB_MA901=m -CONFIG_RADIO_TEA5764=m -CONFIG_RADIO_SAA7706H=m -CONFIG_RADIO_TEF6862=m -CONFIG_RADIO_WL1273=m -CONFIG_RADIO_WL128X=m -CONFIG_MEDIA_COMMON_OPTIONS=y - -# -# common driver options -# -CONFIG_VIDEO_CX2341X=m -CONFIG_VIDEO_TVEEPROM=m -CONFIG_CYPRESS_FIRMWARE=m -CONFIG_VIDEOBUF2_CORE=m -CONFIG_VIDEOBUF2_V4L2=m -CONFIG_VIDEOBUF2_MEMOPS=m -CONFIG_VIDEOBUF2_DMA_CONTIG=m -CONFIG_VIDEOBUF2_VMALLOC=m -CONFIG_VIDEOBUF2_DMA_SG=m -CONFIG_VIDEOBUF2_DVB=m -CONFIG_DVB_B2C2_FLEXCOP=m -CONFIG_VIDEO_SAA7146=m -CONFIG_VIDEO_SAA7146_VV=m -CONFIG_SMS_SIANO_MDTV=m -CONFIG_SMS_SIANO_RC=y -# CONFIG_SMS_SIANO_DEBUGFS is not set -CONFIG_VIDEO_V4L2_TPG=m -CONFIG_V4L_PLATFORM_DRIVERS=y -CONFIG_VIDEO_CAFE_CCIC=m -CONFIG_VIDEO_CADENCE=y -CONFIG_VIDEO_CADENCE_CSI2RX=m -CONFIG_VIDEO_CADENCE_CSI2TX=m -CONFIG_VIDEO_ASPEED=m -CONFIG_VIDEO_MUX=m -CONFIG_VIDEO_XILINX=m -# CONFIG_VIDEO_XILINX_CSI2RXSS is not set -CONFIG_VIDEO_XILINX_TPG=m -CONFIG_VIDEO_XILINX_VTC=m -CONFIG_V4L_MEM2MEM_DRIVERS=y -CONFIG_VIDEO_MEM2MEM_DEINTERLACE=m 
-CONFIG_DVB_PLATFORM_DRIVERS=y -CONFIG_SDR_PLATFORM_DRIVERS=y - -# -# MMC/SDIO DVB adapters -# -CONFIG_SMS_SDIO_DRV=m -CONFIG_V4L_TEST_DRIVERS=y -CONFIG_VIDEO_VIMC=m -CONFIG_VIDEO_VIVID=m -CONFIG_VIDEO_VIVID_CEC=y -CONFIG_VIDEO_VIVID_MAX_DEVS=64 -CONFIG_VIDEO_VIM2M=m -CONFIG_VIDEO_VICODEC=m - -# -# FireWire (IEEE 1394) Adapters -# -CONFIG_DVB_FIREDTV=m -CONFIG_DVB_FIREDTV_INPUT=y -# end of Media drivers - -# -# Media ancillary drivers -# -CONFIG_MEDIA_ATTACH=y - -# -# IR I2C driver auto-selected by 'Autoselect ancillary drivers' -# -CONFIG_VIDEO_IR_I2C=m - -# -# Audio decoders, processors and mixers -# -CONFIG_VIDEO_TVAUDIO=m -CONFIG_VIDEO_TDA7432=m -CONFIG_VIDEO_TDA9840=m -CONFIG_VIDEO_TDA1997X=m -CONFIG_VIDEO_TEA6415C=m -CONFIG_VIDEO_TEA6420=m -CONFIG_VIDEO_MSP3400=m -CONFIG_VIDEO_CS3308=m -CONFIG_VIDEO_CS5345=m -CONFIG_VIDEO_CS53L32A=m -CONFIG_VIDEO_TLV320AIC23B=m -CONFIG_VIDEO_UDA1342=m -CONFIG_VIDEO_WM8775=m -CONFIG_VIDEO_WM8739=m -CONFIG_VIDEO_VP27SMPX=m -CONFIG_VIDEO_SONY_BTF_MPX=m -# end of Audio decoders, processors and mixers - -# -# RDS decoders -# -CONFIG_VIDEO_SAA6588=m -# end of RDS decoders - -# -# Video decoders -# -CONFIG_VIDEO_ADV7180=m -CONFIG_VIDEO_ADV7183=m -CONFIG_VIDEO_ADV748X=m -CONFIG_VIDEO_ADV7604=m -CONFIG_VIDEO_ADV7604_CEC=y -CONFIG_VIDEO_ADV7842=m -CONFIG_VIDEO_ADV7842_CEC=y -CONFIG_VIDEO_BT819=m -CONFIG_VIDEO_BT856=m -CONFIG_VIDEO_BT866=m -CONFIG_VIDEO_KS0127=m -CONFIG_VIDEO_ML86V7667=m -CONFIG_VIDEO_SAA7110=m -CONFIG_VIDEO_SAA711X=m -CONFIG_VIDEO_TC358743=m -CONFIG_VIDEO_TC358743_CEC=y -CONFIG_VIDEO_TVP514X=m -CONFIG_VIDEO_TVP5150=m -CONFIG_VIDEO_TVP7002=m -CONFIG_VIDEO_TW2804=m -CONFIG_VIDEO_TW9903=m -CONFIG_VIDEO_TW9906=m -CONFIG_VIDEO_TW9910=m -CONFIG_VIDEO_VPX3220=m -# CONFIG_VIDEO_MAX9286 is not set - -# -# Video and audio decoders -# -CONFIG_VIDEO_SAA717X=m -CONFIG_VIDEO_CX25840=m -# end of Video decoders - -# -# Video encoders -# -CONFIG_VIDEO_SAA7127=m -CONFIG_VIDEO_SAA7185=m -CONFIG_VIDEO_ADV7170=m -CONFIG_VIDEO_ADV7175=m 
-CONFIG_VIDEO_ADV7343=m -CONFIG_VIDEO_ADV7393=m -CONFIG_VIDEO_AD9389B=m -CONFIG_VIDEO_AK881X=m -CONFIG_VIDEO_THS8200=m -# end of Video encoders - -# -# Video improvement chips -# -CONFIG_VIDEO_UPD64031A=m -CONFIG_VIDEO_UPD64083=m -# end of Video improvement chips - -# -# Audio/Video compression chips -# -CONFIG_VIDEO_SAA6752HS=m -# end of Audio/Video compression chips - -# -# SDR tuner chips -# -CONFIG_SDR_MAX2175=m -# end of SDR tuner chips - -# -# Miscellaneous helper chips -# -CONFIG_VIDEO_THS7303=m -CONFIG_VIDEO_M52790=m -CONFIG_VIDEO_I2C=m -CONFIG_VIDEO_ST_MIPID02=m -# end of Miscellaneous helper chips - -# -# Camera sensor devices -# -CONFIG_VIDEO_APTINA_PLL=m -CONFIG_VIDEO_SMIAPP_PLL=m -CONFIG_VIDEO_HI556=m -CONFIG_VIDEO_IMX214=m -CONFIG_VIDEO_IMX219=m -CONFIG_VIDEO_IMX258=m -CONFIG_VIDEO_IMX274=m -CONFIG_VIDEO_IMX290=m -CONFIG_VIDEO_IMX319=m -CONFIG_VIDEO_IMX355=m -CONFIG_VIDEO_OV2640=m -CONFIG_VIDEO_OV2659=m -CONFIG_VIDEO_OV2680=m -CONFIG_VIDEO_OV2685=m -CONFIG_VIDEO_OV2740=m -CONFIG_VIDEO_OV5640=m -CONFIG_VIDEO_OV5645=m -CONFIG_VIDEO_OV5647=m -CONFIG_VIDEO_OV6650=m -CONFIG_VIDEO_OV5670=m -CONFIG_VIDEO_OV5675=m -CONFIG_VIDEO_OV5695=m -CONFIG_VIDEO_OV7251=m -CONFIG_VIDEO_OV772X=m -CONFIG_VIDEO_OV7640=m -CONFIG_VIDEO_OV7670=m -CONFIG_VIDEO_OV7740=m -CONFIG_VIDEO_OV8856=m -CONFIG_VIDEO_OV9640=m -CONFIG_VIDEO_OV9650=m -CONFIG_VIDEO_OV13858=m -CONFIG_VIDEO_VS6624=m -CONFIG_VIDEO_MT9M001=m -CONFIG_VIDEO_MT9M032=m -CONFIG_VIDEO_MT9M111=m -CONFIG_VIDEO_MT9P031=m -CONFIG_VIDEO_MT9T001=m -CONFIG_VIDEO_MT9T112=m -CONFIG_VIDEO_MT9V011=m -CONFIG_VIDEO_MT9V032=m -CONFIG_VIDEO_MT9V111=m -CONFIG_VIDEO_SR030PC30=m -CONFIG_VIDEO_NOON010PC30=m -CONFIG_VIDEO_M5MOLS=m -# CONFIG_VIDEO_RDACM20 is not set -CONFIG_VIDEO_RJ54N1=m -CONFIG_VIDEO_S5K6AA=m -CONFIG_VIDEO_S5K6A3=m -CONFIG_VIDEO_S5K4ECGX=m -CONFIG_VIDEO_S5K5BAF=m -CONFIG_VIDEO_SMIAPP=m -CONFIG_VIDEO_ET8EK8=m -CONFIG_VIDEO_S5C73M3=m -# end of Camera sensor devices - -# -# Lens drivers -# -CONFIG_VIDEO_AD5820=m 
-CONFIG_VIDEO_AK7375=m -CONFIG_VIDEO_DW9714=m -CONFIG_VIDEO_DW9768=m -CONFIG_VIDEO_DW9807_VCM=m -# end of Lens drivers - -# -# Flash devices -# -CONFIG_VIDEO_ADP1653=m -CONFIG_VIDEO_LM3560=m -CONFIG_VIDEO_LM3646=m -# end of Flash devices - -# -# SPI helper chips -# -CONFIG_VIDEO_GS1662=m -# end of SPI helper chips - -# -# Media SPI Adapters -# -CONFIG_CXD2880_SPI_DRV=m -# end of Media SPI Adapters - -CONFIG_MEDIA_TUNER=m - -# -# Customize TV tuners -# -CONFIG_MEDIA_TUNER_SIMPLE=m -CONFIG_MEDIA_TUNER_TDA18250=m -CONFIG_MEDIA_TUNER_TDA8290=m -CONFIG_MEDIA_TUNER_TDA827X=m -CONFIG_MEDIA_TUNER_TDA18271=m -CONFIG_MEDIA_TUNER_TDA9887=m -CONFIG_MEDIA_TUNER_TEA5761=m -CONFIG_MEDIA_TUNER_TEA5767=m -CONFIG_MEDIA_TUNER_MSI001=m -CONFIG_MEDIA_TUNER_MT20XX=m -CONFIG_MEDIA_TUNER_MT2060=m -CONFIG_MEDIA_TUNER_MT2063=m -CONFIG_MEDIA_TUNER_MT2266=m -CONFIG_MEDIA_TUNER_MT2131=m -CONFIG_MEDIA_TUNER_QT1010=m -CONFIG_MEDIA_TUNER_XC2028=m -CONFIG_MEDIA_TUNER_XC5000=m -CONFIG_MEDIA_TUNER_XC4000=m -CONFIG_MEDIA_TUNER_MXL5005S=m -CONFIG_MEDIA_TUNER_MXL5007T=m -CONFIG_MEDIA_TUNER_MC44S803=m -CONFIG_MEDIA_TUNER_MAX2165=m -CONFIG_MEDIA_TUNER_TDA18218=m -CONFIG_MEDIA_TUNER_FC0011=m -CONFIG_MEDIA_TUNER_FC0012=m -CONFIG_MEDIA_TUNER_FC0013=m -CONFIG_MEDIA_TUNER_TDA18212=m -CONFIG_MEDIA_TUNER_E4000=m -CONFIG_MEDIA_TUNER_FC2580=m -CONFIG_MEDIA_TUNER_M88RS6000T=m -CONFIG_MEDIA_TUNER_TUA9001=m -CONFIG_MEDIA_TUNER_SI2157=m -CONFIG_MEDIA_TUNER_IT913X=m -CONFIG_MEDIA_TUNER_R820T=m -CONFIG_MEDIA_TUNER_MXL301RF=m -CONFIG_MEDIA_TUNER_QM1D1C0042=m -CONFIG_MEDIA_TUNER_QM1D1B0004=m -# end of Customize TV tuners - -# -# Customise DVB Frontends -# - -# -# Multistandard (satellite) frontends -# -CONFIG_DVB_STB0899=m -CONFIG_DVB_STB6100=m -CONFIG_DVB_STV090x=m -CONFIG_DVB_STV0910=m -CONFIG_DVB_STV6110x=m -CONFIG_DVB_STV6111=m -CONFIG_DVB_MXL5XX=m -CONFIG_DVB_M88DS3103=m - -# -# Multistandard (cable + terrestrial) frontends -# -CONFIG_DVB_DRXK=m -CONFIG_DVB_TDA18271C2DD=m -CONFIG_DVB_SI2165=m -CONFIG_DVB_MN88472=m 
-CONFIG_DVB_MN88473=m - -# -# DVB-S (satellite) frontends -# -CONFIG_DVB_CX24110=m -CONFIG_DVB_CX24123=m -CONFIG_DVB_MT312=m -CONFIG_DVB_ZL10036=m -CONFIG_DVB_ZL10039=m -CONFIG_DVB_S5H1420=m -CONFIG_DVB_STV0288=m -CONFIG_DVB_STB6000=m -CONFIG_DVB_STV0299=m -CONFIG_DVB_STV6110=m -CONFIG_DVB_STV0900=m -CONFIG_DVB_TDA8083=m -CONFIG_DVB_TDA10086=m -CONFIG_DVB_TDA8261=m -CONFIG_DVB_VES1X93=m -CONFIG_DVB_TUNER_ITD1000=m -CONFIG_DVB_TUNER_CX24113=m -CONFIG_DVB_TDA826X=m -CONFIG_DVB_TUA6100=m -CONFIG_DVB_CX24116=m -CONFIG_DVB_CX24117=m -CONFIG_DVB_CX24120=m -CONFIG_DVB_SI21XX=m -CONFIG_DVB_TS2020=m -CONFIG_DVB_DS3000=m -CONFIG_DVB_MB86A16=m -CONFIG_DVB_TDA10071=m - -# -# DVB-T (terrestrial) frontends -# -CONFIG_DVB_SP8870=m -CONFIG_DVB_SP887X=m -CONFIG_DVB_CX22700=m -CONFIG_DVB_CX22702=m -CONFIG_DVB_S5H1432=m -CONFIG_DVB_DRXD=m -CONFIG_DVB_L64781=m -CONFIG_DVB_TDA1004X=m -CONFIG_DVB_NXT6000=m -CONFIG_DVB_MT352=m -CONFIG_DVB_ZL10353=m -CONFIG_DVB_DIB3000MB=m -CONFIG_DVB_DIB3000MC=m -CONFIG_DVB_DIB7000M=m -CONFIG_DVB_DIB7000P=m -CONFIG_DVB_DIB9000=m -CONFIG_DVB_TDA10048=m -CONFIG_DVB_AF9013=m -CONFIG_DVB_EC100=m -CONFIG_DVB_STV0367=m -CONFIG_DVB_CXD2820R=m -CONFIG_DVB_CXD2841ER=m -CONFIG_DVB_RTL2830=m -CONFIG_DVB_RTL2832=m -CONFIG_DVB_RTL2832_SDR=m -CONFIG_DVB_SI2168=m -CONFIG_DVB_AS102_FE=m -CONFIG_DVB_ZD1301_DEMOD=m -CONFIG_DVB_GP8PSK_FE=m -CONFIG_DVB_CXD2880=m - -# -# DVB-C (cable) frontends -# -CONFIG_DVB_VES1820=m -CONFIG_DVB_TDA10021=m -CONFIG_DVB_TDA10023=m -CONFIG_DVB_STV0297=m - -# -# ATSC (North American/Korean Terrestrial/Cable DTV) frontends -# -CONFIG_DVB_NXT200X=m -CONFIG_DVB_OR51211=m -CONFIG_DVB_OR51132=m -CONFIG_DVB_BCM3510=m -CONFIG_DVB_LGDT330X=m -CONFIG_DVB_LGDT3305=m -CONFIG_DVB_LGDT3306A=m -CONFIG_DVB_LG2160=m -CONFIG_DVB_S5H1409=m -CONFIG_DVB_AU8522=m -CONFIG_DVB_AU8522_DTV=m -CONFIG_DVB_AU8522_V4L=m -CONFIG_DVB_S5H1411=m - -# -# ISDB-T (terrestrial) frontends -# -CONFIG_DVB_S921=m -CONFIG_DVB_DIB8000=m -CONFIG_DVB_MB86A20S=m - -# -# ISDB-S (satellite) 
& ISDB-T (terrestrial) frontends -# -CONFIG_DVB_TC90522=m -CONFIG_DVB_MN88443X=m - -# -# Digital terrestrial only tuners/PLL -# -CONFIG_DVB_PLL=m -CONFIG_DVB_TUNER_DIB0070=m -CONFIG_DVB_TUNER_DIB0090=m - -# -# SEC control devices for DVB-S -# -CONFIG_DVB_DRX39XYJ=m -CONFIG_DVB_LNBH25=m -CONFIG_DVB_LNBH29=m -CONFIG_DVB_LNBP21=m -CONFIG_DVB_LNBP22=m -CONFIG_DVB_ISL6405=m -CONFIG_DVB_ISL6421=m -CONFIG_DVB_ISL6423=m -CONFIG_DVB_A8293=m -CONFIG_DVB_LGS8GL5=m -CONFIG_DVB_LGS8GXX=m -CONFIG_DVB_ATBM8830=m -CONFIG_DVB_TDA665x=m -CONFIG_DVB_IX2505V=m -CONFIG_DVB_M88RS2000=m -CONFIG_DVB_AF9033=m -CONFIG_DVB_HORUS3A=m -CONFIG_DVB_ASCOT2E=m -CONFIG_DVB_HELENE=m - -# -# Common Interface (EN50221) controller drivers -# -CONFIG_DVB_CXD2099=m -CONFIG_DVB_SP2=m -# end of Customise DVB Frontends - -# -# Tools to develop new frontends -# -CONFIG_DVB_DUMMY_FE=m -# end of Media ancillary drivers - -# -# Graphics support -# -CONFIG_AGP=m -CONFIG_AGP_AMD64=m -CONFIG_AGP_INTEL=m -CONFIG_AGP_SIS=m -CONFIG_AGP_VIA=m -CONFIG_INTEL_GTT=m -CONFIG_VGA_ARB=y -CONFIG_VGA_ARB_MAX_GPUS=10 -CONFIG_VGA_SWITCHEROO=y -CONFIG_DRM=m -CONFIG_DRM_MIPI_DBI=m -CONFIG_DRM_MIPI_DSI=y -CONFIG_DRM_DP_AUX_CHARDEV=y -# CONFIG_DRM_DEBUG_SELFTEST is not set -CONFIG_DRM_KMS_HELPER=m -CONFIG_DRM_KMS_FB_HELPER=y -# CONFIG_DRM_DEBUG_DP_MST_TOPOLOGY_REFS is not set -CONFIG_DRM_FBDEV_EMULATION=y -CONFIG_DRM_FBDEV_OVERALLOC=100 -# CONFIG_DRM_FBDEV_LEAK_PHYS_SMEM is not set -CONFIG_DRM_LOAD_EDID_FIRMWARE=y -CONFIG_DRM_DP_CEC=y -CONFIG_DRM_TTM=m -CONFIG_DRM_TTM_DMA_PAGE_POOL=y -CONFIG_DRM_VRAM_HELPER=m -CONFIG_DRM_TTM_HELPER=m -CONFIG_DRM_GEM_CMA_HELPER=y -CONFIG_DRM_KMS_CMA_HELPER=y -CONFIG_DRM_GEM_SHMEM_HELPER=y -CONFIG_DRM_SCHED=m - -# -# I2C encoder or helper chips -# -CONFIG_DRM_I2C_CH7006=m -CONFIG_DRM_I2C_SIL164=m -CONFIG_DRM_I2C_NXP_TDA998X=m -CONFIG_DRM_I2C_NXP_TDA9950=m -# end of I2C encoder or helper chips - -# -# ARM devices -# -CONFIG_DRM_KOMEDA=m -# end of ARM devices - -CONFIG_DRM_RADEON=m 
-CONFIG_DRM_RADEON_USERPTR=y -CONFIG_DRM_AMDGPU=m -CONFIG_DRM_AMDGPU_SI=y -CONFIG_DRM_AMDGPU_CIK=y -CONFIG_DRM_AMDGPU_USERPTR=y -# CONFIG_DRM_AMDGPU_GART_DEBUGFS is not set - -# -# ACP (Audio CoProcessor) Configuration -# -CONFIG_DRM_AMD_ACP=y -# end of ACP (Audio CoProcessor) Configuration - -# -# Display Engine Configuration -# -CONFIG_DRM_AMD_DC=y -CONFIG_DRM_AMD_DC_DCN=y -CONFIG_DRM_AMD_DC_DCN3_0=y -CONFIG_DRM_AMD_DC_HDCP=y -# CONFIG_DEBUG_KERNEL_DC is not set -# end of Display Engine Configuration - -CONFIG_HSA_AMD=y -CONFIG_DRM_NOUVEAU=m -# CONFIG_NOUVEAU_LEGACY_CTX_SUPPORT is not set -CONFIG_NOUVEAU_DEBUG=5 -CONFIG_NOUVEAU_DEBUG_DEFAULT=3 -# CONFIG_NOUVEAU_DEBUG_MMU is not set -# CONFIG_NOUVEAU_DEBUG_PUSH is not set -CONFIG_DRM_NOUVEAU_BACKLIGHT=y -CONFIG_DRM_NOUVEAU_SVM=y -CONFIG_DRM_I915=m -CONFIG_DRM_I915_FORCE_PROBE="*" -CONFIG_DRM_I915_CAPTURE_ERROR=y -CONFIG_DRM_I915_COMPRESS_ERROR=y -CONFIG_DRM_I915_USERPTR=y -CONFIG_DRM_I915_GVT=y -CONFIG_DRM_I915_GVT_KVMGT=m - -# -# drm/i915 Debugging -# -# CONFIG_DRM_I915_WERROR is not set -# CONFIG_DRM_I915_DEBUG is not set -# CONFIG_DRM_I915_DEBUG_MMIO is not set -# CONFIG_DRM_I915_SW_FENCE_DEBUG_OBJECTS is not set -# CONFIG_DRM_I915_SW_FENCE_CHECK_DAG is not set -# CONFIG_DRM_I915_DEBUG_GUC is not set -# CONFIG_DRM_I915_SELFTEST is not set -# CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS is not set -# CONFIG_DRM_I915_DEBUG_VBLANK_EVADE is not set -# CONFIG_DRM_I915_DEBUG_RUNTIME_PM is not set -# end of drm/i915 Debugging - -# -# drm/i915 Profile Guided Optimisation -# -CONFIG_DRM_I915_FENCE_TIMEOUT=10000 -CONFIG_DRM_I915_USERFAULT_AUTOSUSPEND=250 -CONFIG_DRM_I915_HEARTBEAT_INTERVAL=2500 -CONFIG_DRM_I915_PREEMPT_TIMEOUT=640 -CONFIG_DRM_I915_MAX_REQUEST_BUSYWAIT=8000 -CONFIG_DRM_I915_STOP_TIMEOUT=100 -CONFIG_DRM_I915_TIMESLICE_DURATION=1 -# end of drm/i915 Profile Guided Optimisation - -CONFIG_DRM_VGEM=m -CONFIG_DRM_VKMS=m -CONFIG_DRM_VMWGFX=m -CONFIG_DRM_VMWGFX_FBCON=y -CONFIG_DRM_GMA500=m -CONFIG_DRM_GMA600=y 
-CONFIG_DRM_GMA3600=y -CONFIG_DRM_UDL=m -CONFIG_DRM_AST=m -CONFIG_DRM_MGAG200=m -CONFIG_DRM_RCAR_DW_HDMI=m -CONFIG_DRM_RCAR_LVDS=m -CONFIG_DRM_QXL=m -CONFIG_DRM_BOCHS=m -CONFIG_DRM_VIRTIO_GPU=m -CONFIG_DRM_PANEL=y - -# -# Display Panels -# -CONFIG_DRM_PANEL_ARM_VERSATILE=m -CONFIG_DRM_PANEL_ASUS_Z00T_TM5P5_NT35596=m -CONFIG_DRM_PANEL_BOE_HIMAX8279D=m -CONFIG_DRM_PANEL_BOE_TV101WUM_NL6=m -CONFIG_DRM_PANEL_LVDS=m -CONFIG_DRM_PANEL_SIMPLE=m -CONFIG_DRM_PANEL_ELIDA_KD35T133=m -CONFIG_DRM_PANEL_FEIXIN_K101_IM2BA02=m -CONFIG_DRM_PANEL_FEIYANG_FY07024DI26A30D=m -CONFIG_DRM_PANEL_ILITEK_IL9322=m -CONFIG_DRM_PANEL_ILITEK_ILI9881C=m -CONFIG_DRM_PANEL_INNOLUX_P079ZCA=m -CONFIG_DRM_PANEL_JDI_LT070ME05000=m -CONFIG_DRM_PANEL_KINGDISPLAY_KD097D04=m -CONFIG_DRM_PANEL_LEADTEK_LTK050H3146W=m -CONFIG_DRM_PANEL_LEADTEK_LTK500HD1829=m -CONFIG_DRM_PANEL_SAMSUNG_LD9040=m -CONFIG_DRM_PANEL_LG_LB035Q02=m -CONFIG_DRM_PANEL_LG_LG4573=m -CONFIG_DRM_PANEL_NEC_NL8048HL11=m -CONFIG_DRM_PANEL_NOVATEK_NT35510=m -CONFIG_DRM_PANEL_NOVATEK_NT39016=m -CONFIG_DRM_PANEL_OLIMEX_LCD_OLINUXINO=m -CONFIG_DRM_PANEL_ORISETECH_OTM8009A=m -CONFIG_DRM_PANEL_OSD_OSD101T2587_53TS=m -CONFIG_DRM_PANEL_PANASONIC_VVX10F034N00=m -CONFIG_DRM_PANEL_RASPBERRYPI_TOUCHSCREEN=m -CONFIG_DRM_PANEL_RAYDIUM_RM67191=m -CONFIG_DRM_PANEL_RAYDIUM_RM68200=m -CONFIG_DRM_PANEL_RONBO_RB070D30=m -CONFIG_DRM_PANEL_SAMSUNG_S6D16D0=m -CONFIG_DRM_PANEL_SAMSUNG_S6E3HA2=m -CONFIG_DRM_PANEL_SAMSUNG_S6E63J0X03=m -CONFIG_DRM_PANEL_SAMSUNG_S6E63M0=m -CONFIG_DRM_PANEL_SAMSUNG_S6E88A0_AMS452EF01=m -CONFIG_DRM_PANEL_SAMSUNG_S6E8AA0=m -CONFIG_DRM_PANEL_SEIKO_43WVF1G=m -CONFIG_DRM_PANEL_SHARP_LQ101R1SX01=m -CONFIG_DRM_PANEL_SHARP_LS037V7DW01=m -CONFIG_DRM_PANEL_SHARP_LS043T1LE01=m -CONFIG_DRM_PANEL_SITRONIX_ST7701=m -# CONFIG_DRM_PANEL_SITRONIX_ST7703 is not set -CONFIG_DRM_PANEL_SITRONIX_ST7789V=m -CONFIG_DRM_PANEL_SONY_ACX424AKP=m -CONFIG_DRM_PANEL_SONY_ACX565AKM=m -CONFIG_DRM_PANEL_TPO_TD028TTEC1=m -CONFIG_DRM_PANEL_TPO_TD043MTEA1=m 
-CONFIG_DRM_PANEL_TPO_TPG110=m -CONFIG_DRM_PANEL_TRULY_NT35597_WQXGA=m -CONFIG_DRM_PANEL_VISIONOX_RM69299=m -CONFIG_DRM_PANEL_XINPENG_XPP055C272=m -# end of Display Panels - -CONFIG_DRM_BRIDGE=y -CONFIG_DRM_PANEL_BRIDGE=y - -# -# Display Interface Bridges -# -CONFIG_DRM_CDNS_DSI=m -CONFIG_DRM_CHRONTEL_CH7033=m -CONFIG_DRM_DISPLAY_CONNECTOR=m -CONFIG_DRM_LVDS_CODEC=m -CONFIG_DRM_MEGACHIPS_STDPXXXX_GE_B850V3_FW=m -CONFIG_DRM_NWL_MIPI_DSI=m -CONFIG_DRM_NXP_PTN3460=m -CONFIG_DRM_PARADE_PS8622=m -CONFIG_DRM_PARADE_PS8640=m -CONFIG_DRM_SIL_SII8620=m -CONFIG_DRM_SII902X=m -CONFIG_DRM_SII9234=m -CONFIG_DRM_SIMPLE_BRIDGE=m -CONFIG_DRM_THINE_THC63LVD1024=m -CONFIG_DRM_TOSHIBA_TC358764=m -CONFIG_DRM_TOSHIBA_TC358767=m -CONFIG_DRM_TOSHIBA_TC358768=m -CONFIG_DRM_TI_TFP410=m -CONFIG_DRM_TI_SN65DSI86=m -CONFIG_DRM_TI_TPD12S015=m -CONFIG_DRM_ANALOGIX_ANX6345=m -CONFIG_DRM_ANALOGIX_ANX78XX=m -CONFIG_DRM_ANALOGIX_DP=m -CONFIG_DRM_I2C_ADV7511=m -CONFIG_DRM_I2C_ADV7511_AUDIO=y -CONFIG_DRM_I2C_ADV7511_CEC=y -CONFIG_DRM_DW_HDMI=m -CONFIG_DRM_DW_HDMI_AHB_AUDIO=m -CONFIG_DRM_DW_HDMI_I2S_AUDIO=m -CONFIG_DRM_DW_HDMI_CEC=m -# end of Display Interface Bridges - -# CONFIG_DRM_ETNAVIV is not set -CONFIG_DRM_ARCPGU=m -CONFIG_DRM_MXS=y -CONFIG_DRM_MXSFB=m -CONFIG_DRM_CIRRUS_QEMU=m -CONFIG_DRM_GM12U320=m -CONFIG_TINYDRM_HX8357D=m -CONFIG_TINYDRM_ILI9225=m -CONFIG_TINYDRM_ILI9341=m -CONFIG_TINYDRM_ILI9486=m -CONFIG_TINYDRM_MI0283QT=m -CONFIG_TINYDRM_REPAPER=m -CONFIG_TINYDRM_ST7586=m -CONFIG_TINYDRM_ST7735R=m -CONFIG_DRM_XEN=y -CONFIG_DRM_XEN_FRONTEND=m -CONFIG_DRM_VBOXVIDEO=m -# CONFIG_DRM_LEGACY is not set -CONFIG_DRM_PANEL_ORIENTATION_QUIRKS=y - -# -# Frame buffer Devices -# -CONFIG_FB_CMDLINE=y -CONFIG_FB_NOTIFY=y -CONFIG_FB=y -CONFIG_FIRMWARE_EDID=y -CONFIG_FB_BOOT_VESA_SUPPORT=y -CONFIG_FB_CFB_FILLRECT=y -CONFIG_FB_CFB_COPYAREA=y -CONFIG_FB_CFB_IMAGEBLIT=y -CONFIG_FB_SYS_FILLRECT=m -CONFIG_FB_SYS_COPYAREA=m -CONFIG_FB_SYS_IMAGEBLIT=m -# CONFIG_FB_FOREIGN_ENDIAN is not set 
-CONFIG_FB_SYS_FOPS=m -CONFIG_FB_DEFERRED_IO=y -CONFIG_FB_BACKLIGHT=m -CONFIG_FB_MODE_HELPERS=y -CONFIG_FB_TILEBLITTING=y - -# -# Frame buffer hardware drivers -# -# CONFIG_FB_CIRRUS is not set -# CONFIG_FB_PM2 is not set -# CONFIG_FB_CYBER2000 is not set -# CONFIG_FB_ARC is not set -# CONFIG_FB_ASILIANT is not set -# CONFIG_FB_IMSTT is not set -# CONFIG_FB_VGA16 is not set -# CONFIG_FB_UVESA is not set -CONFIG_FB_VESA=y -CONFIG_FB_EFI=y -# CONFIG_FB_N411 is not set -# CONFIG_FB_HGA is not set -# CONFIG_FB_OPENCORES is not set -# CONFIG_FB_S1D13XXX is not set -# CONFIG_FB_NVIDIA is not set -# CONFIG_FB_RIVA is not set -# CONFIG_FB_I740 is not set -# CONFIG_FB_LE80578 is not set -# CONFIG_FB_INTEL is not set -# CONFIG_FB_MATROX is not set -# CONFIG_FB_RADEON is not set -# CONFIG_FB_ATY128 is not set -# CONFIG_FB_ATY is not set -# CONFIG_FB_S3 is not set -# CONFIG_FB_SAVAGE is not set -# CONFIG_FB_SIS is not set -# CONFIG_FB_VIA is not set -# CONFIG_FB_NEOMAGIC is not set -# CONFIG_FB_KYRO is not set -# CONFIG_FB_3DFX is not set -# CONFIG_FB_VOODOO1 is not set -# CONFIG_FB_VT8623 is not set -# CONFIG_FB_TRIDENT is not set -# CONFIG_FB_ARK is not set -# CONFIG_FB_PM3 is not set -# CONFIG_FB_CARMINE is not set -# CONFIG_FB_SM501 is not set -# CONFIG_FB_SMSCUFX is not set -# CONFIG_FB_UDL is not set -# CONFIG_FB_IBM_GXT4500 is not set -# CONFIG_FB_VIRTUAL is not set -CONFIG_XEN_FBDEV_FRONTEND=m -# CONFIG_FB_METRONOME is not set -# CONFIG_FB_MB862XX is not set -CONFIG_FB_HYPERV=m -CONFIG_FB_SIMPLE=y -# CONFIG_FB_SSD1307 is not set -# CONFIG_FB_SM712 is not set -# end of Frame buffer Devices - -# -# Backlight & LCD device support -# -CONFIG_LCD_CLASS_DEVICE=m -CONFIG_LCD_L4F00242T03=m -CONFIG_LCD_LMS283GF05=m -CONFIG_LCD_LTV350QV=m -CONFIG_LCD_ILI922X=m -CONFIG_LCD_ILI9320=m -CONFIG_LCD_TDO24M=m -CONFIG_LCD_VGG2432A4=m -CONFIG_LCD_PLATFORM=m -CONFIG_LCD_AMS369FG06=m -CONFIG_LCD_LMS501KF03=m -CONFIG_LCD_HX8357=m -CONFIG_LCD_OTM3225A=m -CONFIG_BACKLIGHT_CLASS_DEVICE=y 
-CONFIG_BACKLIGHT_LM3533=m -CONFIG_BACKLIGHT_PWM=m -CONFIG_BACKLIGHT_DA903X=m -CONFIG_BACKLIGHT_DA9052=m -CONFIG_BACKLIGHT_MAX8925=m -CONFIG_BACKLIGHT_APPLE=m -CONFIG_BACKLIGHT_QCOM_WLED=m -CONFIG_BACKLIGHT_SAHARA=m -CONFIG_BACKLIGHT_WM831X=m -CONFIG_BACKLIGHT_ADP5520=m -CONFIG_BACKLIGHT_ADP8860=m -CONFIG_BACKLIGHT_ADP8870=m -CONFIG_BACKLIGHT_88PM860X=m -CONFIG_BACKLIGHT_PCF50633=m -CONFIG_BACKLIGHT_AAT2870=m -CONFIG_BACKLIGHT_LM3630A=m -CONFIG_BACKLIGHT_LM3639=m -CONFIG_BACKLIGHT_LP855X=m -CONFIG_BACKLIGHT_LP8788=m -CONFIG_BACKLIGHT_PANDORA=m -CONFIG_BACKLIGHT_SKY81452=m -CONFIG_BACKLIGHT_TPS65217=m -CONFIG_BACKLIGHT_AS3711=m -CONFIG_BACKLIGHT_GPIO=m -CONFIG_BACKLIGHT_LV5207LP=m -CONFIG_BACKLIGHT_BD6107=m -CONFIG_BACKLIGHT_ARCXCNN=m -CONFIG_BACKLIGHT_RAVE_SP=m -CONFIG_BACKLIGHT_LED=m -# end of Backlight & LCD device support - -CONFIG_VIDEOMODE_HELPERS=y -CONFIG_HDMI=y - -# -# Console display driver support -# -CONFIG_VGA_CONSOLE=y -CONFIG_DUMMY_CONSOLE=y -CONFIG_DUMMY_CONSOLE_COLUMNS=80 -CONFIG_DUMMY_CONSOLE_ROWS=25 -CONFIG_FRAMEBUFFER_CONSOLE=y -CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y -CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y -CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER=y -# end of Console display driver support - -# CONFIG_LOGO is not set -# end of Graphics support - -CONFIG_SOUND=m -CONFIG_SOUND_OSS_CORE=y -# CONFIG_SOUND_OSS_CORE_PRECLAIM is not set -CONFIG_SND=m -CONFIG_SND_TIMER=m -CONFIG_SND_PCM=m -CONFIG_SND_PCM_ELD=y -CONFIG_SND_PCM_IEC958=y -CONFIG_SND_DMAENGINE_PCM=m -CONFIG_SND_HWDEP=m -CONFIG_SND_SEQ_DEVICE=m -CONFIG_SND_RAWMIDI=m -CONFIG_SND_COMPRESS_OFFLOAD=m -CONFIG_SND_JACK=y -CONFIG_SND_JACK_INPUT_DEV=y -CONFIG_SND_OSSEMUL=y -CONFIG_SND_MIXER_OSS=m -CONFIG_SND_PCM_OSS=m -CONFIG_SND_PCM_OSS_PLUGINS=y -CONFIG_SND_PCM_TIMER=y -CONFIG_SND_HRTIMER=m -CONFIG_SND_DYNAMIC_MINORS=y -CONFIG_SND_MAX_CARDS=32 -# CONFIG_SND_SUPPORT_OLD_API is not set -CONFIG_SND_PROC_FS=y -CONFIG_SND_VERBOSE_PROCFS=y -CONFIG_SND_VERBOSE_PRINTK=y -CONFIG_SND_DEBUG=y -# 
CONFIG_SND_DEBUG_VERBOSE is not set -# CONFIG_SND_PCM_XRUN_DEBUG is not set -# CONFIG_SND_CTL_VALIDATION is not set -CONFIG_SND_VMASTER=y -CONFIG_SND_DMA_SGBUF=y -CONFIG_SND_SEQUENCER=m -CONFIG_SND_SEQ_DUMMY=m -CONFIG_SND_SEQUENCER_OSS=m -CONFIG_SND_SEQ_HRTIMER_DEFAULT=y -CONFIG_SND_SEQ_MIDI_EVENT=m -CONFIG_SND_SEQ_MIDI=m -CONFIG_SND_SEQ_MIDI_EMUL=m -CONFIG_SND_SEQ_VIRMIDI=m -CONFIG_SND_MPU401_UART=m -CONFIG_SND_OPL3_LIB=m -CONFIG_SND_OPL3_LIB_SEQ=m -CONFIG_SND_VX_LIB=m -CONFIG_SND_AC97_CODEC=m -CONFIG_SND_DRIVERS=y -# CONFIG_SND_PCSP is not set -CONFIG_SND_DUMMY=m -CONFIG_SND_ALOOP=m -CONFIG_SND_VIRMIDI=m -CONFIG_SND_MTPAV=m -CONFIG_SND_MTS64=m -CONFIG_SND_SERIAL_U16550=m -CONFIG_SND_MPU401=m -CONFIG_SND_PORTMAN2X4=m -CONFIG_SND_AC97_POWER_SAVE=y -CONFIG_SND_AC97_POWER_SAVE_DEFAULT=0 -CONFIG_SND_SB_COMMON=m -CONFIG_SND_PCI=y -CONFIG_SND_AD1889=m -CONFIG_SND_ALS300=m -CONFIG_SND_ALS4000=m -CONFIG_SND_ALI5451=m -CONFIG_SND_ASIHPI=m -CONFIG_SND_ATIIXP=m -CONFIG_SND_ATIIXP_MODEM=m -CONFIG_SND_AU8810=m -CONFIG_SND_AU8820=m -CONFIG_SND_AU8830=m -CONFIG_SND_AW2=m -CONFIG_SND_AZT3328=m -CONFIG_SND_BT87X=m -# CONFIG_SND_BT87X_OVERCLOCK is not set -CONFIG_SND_CA0106=m -CONFIG_SND_CMIPCI=m -CONFIG_SND_OXYGEN_LIB=m -CONFIG_SND_OXYGEN=m -CONFIG_SND_CS4281=m -CONFIG_SND_CS46XX=m -CONFIG_SND_CS46XX_NEW_DSP=y -CONFIG_SND_CTXFI=m -CONFIG_SND_DARLA20=m -CONFIG_SND_GINA20=m -CONFIG_SND_LAYLA20=m -CONFIG_SND_DARLA24=m -CONFIG_SND_GINA24=m -CONFIG_SND_LAYLA24=m -CONFIG_SND_MONA=m -CONFIG_SND_MIA=m -CONFIG_SND_ECHO3G=m -CONFIG_SND_INDIGO=m -CONFIG_SND_INDIGOIO=m -CONFIG_SND_INDIGODJ=m -CONFIG_SND_INDIGOIOX=m -CONFIG_SND_INDIGODJX=m -CONFIG_SND_EMU10K1=m -CONFIG_SND_EMU10K1_SEQ=m -CONFIG_SND_EMU10K1X=m -CONFIG_SND_ENS1370=m -CONFIG_SND_ENS1371=m -CONFIG_SND_ES1938=m -CONFIG_SND_ES1968=m -CONFIG_SND_ES1968_INPUT=y -CONFIG_SND_ES1968_RADIO=y -CONFIG_SND_FM801=m -CONFIG_SND_FM801_TEA575X_BOOL=y -CONFIG_SND_HDSP=m -CONFIG_SND_HDSPM=m -CONFIG_SND_ICE1712=m -CONFIG_SND_ICE1724=m 
-CONFIG_SND_INTEL8X0=m -CONFIG_SND_INTEL8X0M=m -CONFIG_SND_KORG1212=m -CONFIG_SND_LOLA=m -CONFIG_SND_LX6464ES=m -CONFIG_SND_MAESTRO3=m -CONFIG_SND_MAESTRO3_INPUT=y -CONFIG_SND_MIXART=m -CONFIG_SND_NM256=m -CONFIG_SND_PCXHR=m -CONFIG_SND_RIPTIDE=m -CONFIG_SND_RME32=m -CONFIG_SND_RME96=m -CONFIG_SND_RME9652=m -CONFIG_SND_SONICVIBES=m -CONFIG_SND_TRIDENT=m -CONFIG_SND_VIA82XX=m -CONFIG_SND_VIA82XX_MODEM=m -CONFIG_SND_VIRTUOSO=m -CONFIG_SND_VX222=m -CONFIG_SND_YMFPCI=m - -# -# HD-Audio -# -CONFIG_SND_HDA=m -CONFIG_SND_HDA_GENERIC_LEDS=y -CONFIG_SND_HDA_INTEL=m -CONFIG_SND_HDA_HWDEP=y -CONFIG_SND_HDA_RECONFIG=y -CONFIG_SND_HDA_INPUT_BEEP=y -CONFIG_SND_HDA_INPUT_BEEP_MODE=1 -CONFIG_SND_HDA_PATCH_LOADER=y -CONFIG_SND_HDA_CODEC_REALTEK=m -CONFIG_SND_HDA_CODEC_ANALOG=m -CONFIG_SND_HDA_CODEC_SIGMATEL=m -CONFIG_SND_HDA_CODEC_VIA=m -CONFIG_SND_HDA_CODEC_HDMI=m -CONFIG_SND_HDA_CODEC_CIRRUS=m -CONFIG_SND_HDA_CODEC_CONEXANT=m -CONFIG_SND_HDA_CODEC_CA0110=m -CONFIG_SND_HDA_CODEC_CA0132=m -CONFIG_SND_HDA_CODEC_CA0132_DSP=y -CONFIG_SND_HDA_CODEC_CMEDIA=m -CONFIG_SND_HDA_CODEC_SI3054=m -CONFIG_SND_HDA_GENERIC=m -CONFIG_SND_HDA_POWER_SAVE_DEFAULT=0 -# CONFIG_SND_HDA_INTEL_HDMI_SILENT_STREAM is not set -# end of HD-Audio - -CONFIG_SND_HDA_CORE=m -CONFIG_SND_HDA_DSP_LOADER=y -CONFIG_SND_HDA_COMPONENT=y -CONFIG_SND_HDA_I915=y -CONFIG_SND_HDA_EXT_CORE=m -CONFIG_SND_HDA_PREALLOC_SIZE=0 -CONFIG_SND_INTEL_NHLT=y -CONFIG_SND_INTEL_DSP_CONFIG=m -CONFIG_SND_SPI=y -CONFIG_SND_USB=y -CONFIG_SND_USB_AUDIO=m -CONFIG_SND_USB_AUDIO_USE_MEDIA_CONTROLLER=y -CONFIG_SND_USB_UA101=m -CONFIG_SND_USB_USX2Y=m -CONFIG_SND_USB_CAIAQ=m -CONFIG_SND_USB_CAIAQ_INPUT=y -CONFIG_SND_USB_US122L=m -CONFIG_SND_USB_6FIRE=m -CONFIG_SND_USB_HIFACE=m -CONFIG_SND_BCD2000=m -CONFIG_SND_USB_LINE6=m -CONFIG_SND_USB_POD=m -CONFIG_SND_USB_PODHD=m -CONFIG_SND_USB_TONEPORT=m -CONFIG_SND_USB_VARIAX=m -CONFIG_SND_FIREWIRE=y -CONFIG_SND_FIREWIRE_LIB=m -CONFIG_SND_DICE=m -CONFIG_SND_OXFW=m -CONFIG_SND_ISIGHT=m -CONFIG_SND_FIREWORKS=m 
-CONFIG_SND_BEBOB=m -CONFIG_SND_FIREWIRE_DIGI00X=m -CONFIG_SND_FIREWIRE_TASCAM=m -CONFIG_SND_FIREWIRE_MOTU=m -CONFIG_SND_FIREFACE=m -CONFIG_SND_PCMCIA=y -CONFIG_SND_VXPOCKET=m -CONFIG_SND_PDAUDIOCF=m -CONFIG_SND_SOC=m -CONFIG_SND_SOC_AC97_BUS=y -CONFIG_SND_SOC_GENERIC_DMAENGINE_PCM=y -CONFIG_SND_SOC_COMPRESS=y -CONFIG_SND_SOC_TOPOLOGY=y -CONFIG_SND_SOC_ACPI=m -CONFIG_SND_SOC_AMD_ACP=m -CONFIG_SND_SOC_AMD_CZ_DA7219MX98357_MACH=m -CONFIG_SND_SOC_AMD_CZ_RT5645_MACH=m -CONFIG_SND_SOC_AMD_ACP3x=m -CONFIG_SND_SOC_AMD_RV_RT5682_MACH=m -CONFIG_SND_SOC_AMD_RENOIR=m -CONFIG_SND_SOC_AMD_RENOIR_MACH=m -CONFIG_SND_ATMEL_SOC=m -CONFIG_SND_SOC_MIKROE_PROTO=m -CONFIG_SND_BCM63XX_I2S_WHISTLER=m -CONFIG_SND_DESIGNWARE_I2S=m -CONFIG_SND_DESIGNWARE_PCM=y - -# -# SoC Audio for Freescale CPUs -# - -# -# Common SoC Audio options for Freescale CPUs: -# -# CONFIG_SND_SOC_FSL_ASRC is not set -# CONFIG_SND_SOC_FSL_SAI is not set -# CONFIG_SND_SOC_FSL_AUDMIX is not set -# CONFIG_SND_SOC_FSL_SSI is not set -# CONFIG_SND_SOC_FSL_SPDIF is not set -# CONFIG_SND_SOC_FSL_ESAI is not set -# CONFIG_SND_SOC_FSL_MICFIL is not set -# CONFIG_SND_SOC_IMX_AUDMUX is not set -# end of SoC Audio for Freescale CPUs - -CONFIG_SND_I2S_HI6210_I2S=m -CONFIG_SND_SOC_IMG=y -CONFIG_SND_SOC_IMG_I2S_IN=m -CONFIG_SND_SOC_IMG_I2S_OUT=m -CONFIG_SND_SOC_IMG_PARALLEL_OUT=m -CONFIG_SND_SOC_IMG_SPDIF_IN=m -CONFIG_SND_SOC_IMG_SPDIF_OUT=m -CONFIG_SND_SOC_IMG_PISTACHIO_INTERNAL_DAC=m -CONFIG_SND_SOC_INTEL_SST_TOPLEVEL=y -CONFIG_SND_SST_IPC=m -CONFIG_SND_SST_IPC_PCI=m -CONFIG_SND_SST_IPC_ACPI=m -CONFIG_SND_SOC_INTEL_SST_ACPI=m -CONFIG_SND_SOC_INTEL_SST=m -CONFIG_SND_SOC_INTEL_SST_FIRMWARE=m -CONFIG_SND_SOC_INTEL_HASWELL=m -CONFIG_SND_SST_ATOM_HIFI2_PLATFORM=m -CONFIG_SND_SST_ATOM_HIFI2_PLATFORM_PCI=m -CONFIG_SND_SST_ATOM_HIFI2_PLATFORM_ACPI=m -CONFIG_SND_SOC_INTEL_SKYLAKE=m -CONFIG_SND_SOC_INTEL_SKL=m -CONFIG_SND_SOC_INTEL_APL=m -CONFIG_SND_SOC_INTEL_KBL=m -CONFIG_SND_SOC_INTEL_GLK=m -CONFIG_SND_SOC_INTEL_CNL=m 
-CONFIG_SND_SOC_INTEL_CFL=m -CONFIG_SND_SOC_INTEL_CML_H=m -CONFIG_SND_SOC_INTEL_CML_LP=m -CONFIG_SND_SOC_INTEL_SKYLAKE_FAMILY=m -CONFIG_SND_SOC_INTEL_SKYLAKE_SSP_CLK=m -# CONFIG_SND_SOC_INTEL_SKYLAKE_HDAUDIO_CODEC is not set -CONFIG_SND_SOC_INTEL_SKYLAKE_COMMON=m -CONFIG_SND_SOC_ACPI_INTEL_MATCH=m -CONFIG_SND_SOC_INTEL_MACH=y -# CONFIG_SND_SOC_INTEL_USER_FRIENDLY_LONG_NAMES is not set -CONFIG_SND_SOC_INTEL_HASWELL_MACH=m -CONFIG_SND_SOC_INTEL_BDW_RT5650_MACH=m -CONFIG_SND_SOC_INTEL_BDW_RT5677_MACH=m -CONFIG_SND_SOC_INTEL_BROADWELL_MACH=m -CONFIG_SND_SOC_INTEL_BYTCR_RT5640_MACH=m -CONFIG_SND_SOC_INTEL_BYTCR_RT5651_MACH=m -CONFIG_SND_SOC_INTEL_CHT_BSW_RT5672_MACH=m -CONFIG_SND_SOC_INTEL_CHT_BSW_RT5645_MACH=m -CONFIG_SND_SOC_INTEL_CHT_BSW_MAX98090_TI_MACH=m -CONFIG_SND_SOC_INTEL_CHT_BSW_NAU8824_MACH=m -CONFIG_SND_SOC_INTEL_BYT_CHT_CX2072X_MACH=m -CONFIG_SND_SOC_INTEL_BYT_CHT_DA7213_MACH=m -CONFIG_SND_SOC_INTEL_BYT_CHT_ES8316_MACH=m -# CONFIG_SND_SOC_INTEL_BYT_CHT_NOCODEC_MACH is not set -CONFIG_SND_SOC_INTEL_SKL_RT286_MACH=m -CONFIG_SND_SOC_INTEL_SKL_NAU88L25_SSM4567_MACH=m -CONFIG_SND_SOC_INTEL_SKL_NAU88L25_MAX98357A_MACH=m -CONFIG_SND_SOC_INTEL_DA7219_MAX98357A_GENERIC=m -CONFIG_SND_SOC_INTEL_BXT_DA7219_MAX98357A_COMMON=m -CONFIG_SND_SOC_INTEL_BXT_DA7219_MAX98357A_MACH=m -CONFIG_SND_SOC_INTEL_BXT_RT298_MACH=m -CONFIG_SND_SOC_INTEL_SOF_WM8804_MACH=m -CONFIG_SND_SOC_INTEL_KBL_RT5663_MAX98927_MACH=m -CONFIG_SND_SOC_INTEL_KBL_RT5663_RT5514_MAX98927_MACH=m -CONFIG_SND_SOC_INTEL_KBL_DA7219_MAX98357A_MACH=m -CONFIG_SND_SOC_INTEL_KBL_DA7219_MAX98927_MACH=m -CONFIG_SND_SOC_INTEL_KBL_RT5660_MACH=m -CONFIG_SND_SOC_INTEL_GLK_DA7219_MAX98357A_MACH=m -CONFIG_SND_SOC_INTEL_GLK_RT5682_MAX98357A_MACH=m -CONFIG_SND_SOC_INTEL_SKL_HDA_DSP_GENERIC_MACH=m -CONFIG_SND_SOC_INTEL_SOF_RT5682_MACH=m -CONFIG_SND_SOC_INTEL_SOF_PCM512x_MACH=m -CONFIG_SND_SOC_INTEL_CML_LP_DA7219_MAX98357A_MACH=m -CONFIG_SND_SOC_INTEL_SOF_CML_RT1011_RT5682_MACH=m -CONFIG_SND_SOC_INTEL_SOF_DA7219_MAX98373_MACH=m 
-CONFIG_SND_SOC_INTEL_EHL_RT5660_MACH=m -CONFIG_SND_SOC_MTK_BTCVSD=m -CONFIG_SND_SOC_SOF_TOPLEVEL=y -CONFIG_SND_SOC_SOF_PCI=m -CONFIG_SND_SOC_SOF_ACPI=m -CONFIG_SND_SOC_SOF_OF=m -# CONFIG_SND_SOC_SOF_DEBUG_PROBES is not set -# CONFIG_SND_SOC_SOF_DEVELOPER_SUPPORT is not set -CONFIG_SND_SOC_SOF=m -CONFIG_SND_SOC_SOF_PROBE_WORK_QUEUE=y -CONFIG_SND_SOC_SOF_INTEL_TOPLEVEL=y -CONFIG_SND_SOC_SOF_INTEL_ACPI=m -CONFIG_SND_SOC_SOF_INTEL_PCI=m -CONFIG_SND_SOC_SOF_INTEL_HIFI_EP_IPC=m -CONFIG_SND_SOC_SOF_INTEL_ATOM_HIFI_EP=m -CONFIG_SND_SOC_SOF_INTEL_COMMON=m -CONFIG_SND_SOC_SOF_MERRIFIELD_SUPPORT=y -CONFIG_SND_SOC_SOF_MERRIFIELD=m -CONFIG_SND_SOC_SOF_APOLLOLAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_APOLLOLAKE=m -CONFIG_SND_SOC_SOF_GEMINILAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_GEMINILAKE=m -CONFIG_SND_SOC_SOF_CANNONLAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_CANNONLAKE=m -CONFIG_SND_SOC_SOF_COFFEELAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_COFFEELAKE=m -CONFIG_SND_SOC_SOF_ICELAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_ICELAKE=m -CONFIG_SND_SOC_SOF_COMETLAKE=m -CONFIG_SND_SOC_SOF_COMETLAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_COMETLAKE_LP_SUPPORT=y -CONFIG_SND_SOC_SOF_TIGERLAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_TIGERLAKE=m -CONFIG_SND_SOC_SOF_ELKHARTLAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_ELKHARTLAKE=m -CONFIG_SND_SOC_SOF_JASPERLAKE_SUPPORT=y -CONFIG_SND_SOC_SOF_JASPERLAKE=m -CONFIG_SND_SOC_SOF_HDA_COMMON=m -CONFIG_SND_SOC_SOF_HDA_LINK=y -CONFIG_SND_SOC_SOF_HDA_AUDIO_CODEC=y -# CONFIG_SND_SOC_SOF_HDA_ALWAYS_ENABLE_DMI_L1 is not set -CONFIG_SND_SOC_SOF_HDA_LINK_BASELINE=m -CONFIG_SND_SOC_SOF_HDA=m -CONFIG_SND_SOC_SOF_XTENSA=m - -# -# STMicroelectronics STM32 SOC audio support -# -# end of STMicroelectronics STM32 SOC audio support - -CONFIG_SND_SOC_XILINX_I2S=m -CONFIG_SND_SOC_XILINX_AUDIO_FORMATTER=m -CONFIG_SND_SOC_XILINX_SPDIF=m -CONFIG_SND_SOC_XTFPGA_I2S=m -CONFIG_ZX_TDM=m -CONFIG_SND_SOC_I2C_AND_SPI=m - -# -# CODEC drivers -# -CONFIG_SND_SOC_AC97_CODEC=m -CONFIG_SND_SOC_ADAU_UTILS=m -CONFIG_SND_SOC_ADAU1701=m 
-CONFIG_SND_SOC_ADAU17X1=m -CONFIG_SND_SOC_ADAU1761=m -CONFIG_SND_SOC_ADAU1761_I2C=m -CONFIG_SND_SOC_ADAU1761_SPI=m -CONFIG_SND_SOC_ADAU7002=m -CONFIG_SND_SOC_ADAU7118=m -CONFIG_SND_SOC_ADAU7118_HW=m -CONFIG_SND_SOC_ADAU7118_I2C=m -CONFIG_SND_SOC_AK4104=m -CONFIG_SND_SOC_AK4118=m -CONFIG_SND_SOC_AK4458=m -CONFIG_SND_SOC_AK4554=m -CONFIG_SND_SOC_AK4613=m -CONFIG_SND_SOC_AK4642=m -CONFIG_SND_SOC_AK5386=m -CONFIG_SND_SOC_AK5558=m -CONFIG_SND_SOC_ALC5623=m -CONFIG_SND_SOC_BD28623=m -# CONFIG_SND_SOC_BT_SCO is not set -CONFIG_SND_SOC_CPCAP=m -CONFIG_SND_SOC_CROS_EC_CODEC=m -CONFIG_SND_SOC_CS35L32=m -CONFIG_SND_SOC_CS35L33=m -CONFIG_SND_SOC_CS35L34=m -CONFIG_SND_SOC_CS35L35=m -CONFIG_SND_SOC_CS35L36=m -CONFIG_SND_SOC_CS42L42=m -CONFIG_SND_SOC_CS42L51=m -CONFIG_SND_SOC_CS42L51_I2C=m -CONFIG_SND_SOC_CS42L52=m -CONFIG_SND_SOC_CS42L56=m -CONFIG_SND_SOC_CS42L73=m -CONFIG_SND_SOC_CS4265=m -CONFIG_SND_SOC_CS4270=m -CONFIG_SND_SOC_CS4271=m -CONFIG_SND_SOC_CS4271_I2C=m -CONFIG_SND_SOC_CS4271_SPI=m -CONFIG_SND_SOC_CS42XX8=m -CONFIG_SND_SOC_CS42XX8_I2C=m -CONFIG_SND_SOC_CS43130=m -CONFIG_SND_SOC_CS4341=m -CONFIG_SND_SOC_CS4349=m -CONFIG_SND_SOC_CS53L30=m -CONFIG_SND_SOC_CX2072X=m -CONFIG_SND_SOC_DA7213=m -CONFIG_SND_SOC_DA7219=m -CONFIG_SND_SOC_DMIC=m -CONFIG_SND_SOC_HDMI_CODEC=m -CONFIG_SND_SOC_ES7134=m -CONFIG_SND_SOC_ES7241=m -CONFIG_SND_SOC_ES8316=m -CONFIG_SND_SOC_ES8328=m -CONFIG_SND_SOC_ES8328_I2C=m -CONFIG_SND_SOC_ES8328_SPI=m -CONFIG_SND_SOC_GTM601=m -CONFIG_SND_SOC_HDAC_HDMI=m -CONFIG_SND_SOC_HDAC_HDA=m -CONFIG_SND_SOC_INNO_RK3036=m -CONFIG_SND_SOC_LOCHNAGAR_SC=m -CONFIG_SND_SOC_MAX98088=m -CONFIG_SND_SOC_MAX98090=m -CONFIG_SND_SOC_MAX98357A=m -CONFIG_SND_SOC_MAX98504=m -CONFIG_SND_SOC_MAX9867=m -CONFIG_SND_SOC_MAX98927=m -CONFIG_SND_SOC_MAX98373=m -CONFIG_SND_SOC_MAX98373_I2C=m -# CONFIG_SND_SOC_MAX98373_SDW is not set -CONFIG_SND_SOC_MAX98390=m -CONFIG_SND_SOC_MAX9860=m -CONFIG_SND_SOC_MSM8916_WCD_ANALOG=m -CONFIG_SND_SOC_MSM8916_WCD_DIGITAL=m -CONFIG_SND_SOC_PCM1681=m 
-CONFIG_SND_SOC_PCM1789=m -CONFIG_SND_SOC_PCM1789_I2C=m -CONFIG_SND_SOC_PCM179X=m -CONFIG_SND_SOC_PCM179X_I2C=m -CONFIG_SND_SOC_PCM179X_SPI=m -CONFIG_SND_SOC_PCM186X=m -CONFIG_SND_SOC_PCM186X_I2C=m -CONFIG_SND_SOC_PCM186X_SPI=m -CONFIG_SND_SOC_PCM3060=m -CONFIG_SND_SOC_PCM3060_I2C=m -CONFIG_SND_SOC_PCM3060_SPI=m -CONFIG_SND_SOC_PCM3168A=m -CONFIG_SND_SOC_PCM3168A_I2C=m -CONFIG_SND_SOC_PCM3168A_SPI=m -CONFIG_SND_SOC_PCM512x=m -CONFIG_SND_SOC_PCM512x_I2C=m -CONFIG_SND_SOC_PCM512x_SPI=m -CONFIG_SND_SOC_RK3328=m -CONFIG_SND_SOC_RL6231=m -CONFIG_SND_SOC_RL6347A=m -CONFIG_SND_SOC_RT286=m -CONFIG_SND_SOC_RT298=m -CONFIG_SND_SOC_RT1011=m -CONFIG_SND_SOC_RT1015=m -CONFIG_SND_SOC_RT1308_SDW=m -CONFIG_SND_SOC_RT5514=m -CONFIG_SND_SOC_RT5514_SPI=m -CONFIG_SND_SOC_RT5616=m -CONFIG_SND_SOC_RT5631=m -CONFIG_SND_SOC_RT5640=m -CONFIG_SND_SOC_RT5645=m -CONFIG_SND_SOC_RT5651=m -CONFIG_SND_SOC_RT5660=m -CONFIG_SND_SOC_RT5663=m -CONFIG_SND_SOC_RT5670=m -CONFIG_SND_SOC_RT5677=m -CONFIG_SND_SOC_RT5677_SPI=m -CONFIG_SND_SOC_RT5682=m -CONFIG_SND_SOC_RT5682_I2C=m -CONFIG_SND_SOC_RT5682_SDW=m -CONFIG_SND_SOC_RT700=m -CONFIG_SND_SOC_RT700_SDW=m -CONFIG_SND_SOC_RT711=m -CONFIG_SND_SOC_RT711_SDW=m -CONFIG_SND_SOC_RT715=m -CONFIG_SND_SOC_RT715_SDW=m -CONFIG_SND_SOC_SGTL5000=m -CONFIG_SND_SOC_SI476X=m -CONFIG_SND_SOC_SIGMADSP=m -CONFIG_SND_SOC_SIGMADSP_I2C=m -CONFIG_SND_SOC_SIGMADSP_REGMAP=m -CONFIG_SND_SOC_SIMPLE_AMPLIFIER=m -CONFIG_SND_SOC_SIRF_AUDIO_CODEC=m -CONFIG_SND_SOC_SPDIF=m -CONFIG_SND_SOC_SSM2305=m -CONFIG_SND_SOC_SSM2602=m -CONFIG_SND_SOC_SSM2602_SPI=m -CONFIG_SND_SOC_SSM2602_I2C=m -CONFIG_SND_SOC_SSM4567=m -CONFIG_SND_SOC_STA32X=m -CONFIG_SND_SOC_STA350=m -CONFIG_SND_SOC_STI_SAS=m -CONFIG_SND_SOC_TAS2552=m -CONFIG_SND_SOC_TAS2562=m -CONFIG_SND_SOC_TAS2770=m -CONFIG_SND_SOC_TAS5086=m -CONFIG_SND_SOC_TAS571X=m -CONFIG_SND_SOC_TAS5720=m -CONFIG_SND_SOC_TAS6424=m -CONFIG_SND_SOC_TDA7419=m -CONFIG_SND_SOC_TFA9879=m -CONFIG_SND_SOC_TLV320AIC23=m -CONFIG_SND_SOC_TLV320AIC23_I2C=m 
-CONFIG_SND_SOC_TLV320AIC23_SPI=m -CONFIG_SND_SOC_TLV320AIC31XX=m -CONFIG_SND_SOC_TLV320AIC32X4=m -CONFIG_SND_SOC_TLV320AIC32X4_I2C=m -CONFIG_SND_SOC_TLV320AIC32X4_SPI=m -CONFIG_SND_SOC_TLV320AIC3X=m -CONFIG_SND_SOC_TLV320ADCX140=m -CONFIG_SND_SOC_TS3A227E=m -CONFIG_SND_SOC_TSCS42XX=m -CONFIG_SND_SOC_TSCS454=m -CONFIG_SND_SOC_UDA1334=m -CONFIG_SND_SOC_WCD9335=m -CONFIG_SND_SOC_WCD934X=m -CONFIG_SND_SOC_WM8510=m -CONFIG_SND_SOC_WM8523=m -CONFIG_SND_SOC_WM8524=m -CONFIG_SND_SOC_WM8580=m -CONFIG_SND_SOC_WM8711=m -CONFIG_SND_SOC_WM8728=m -CONFIG_SND_SOC_WM8731=m -CONFIG_SND_SOC_WM8737=m -CONFIG_SND_SOC_WM8741=m -CONFIG_SND_SOC_WM8750=m -CONFIG_SND_SOC_WM8753=m -CONFIG_SND_SOC_WM8770=m -CONFIG_SND_SOC_WM8776=m -CONFIG_SND_SOC_WM8782=m -CONFIG_SND_SOC_WM8804=m -CONFIG_SND_SOC_WM8804_I2C=m -CONFIG_SND_SOC_WM8804_SPI=m -CONFIG_SND_SOC_WM8903=m -CONFIG_SND_SOC_WM8904=m -CONFIG_SND_SOC_WM8960=m -CONFIG_SND_SOC_WM8962=m -CONFIG_SND_SOC_WM8974=m -CONFIG_SND_SOC_WM8978=m -CONFIG_SND_SOC_WM8985=m -CONFIG_SND_SOC_WSA881X=m -CONFIG_SND_SOC_ZL38060=m -CONFIG_SND_SOC_ZX_AUD96P22=m -CONFIG_SND_SOC_MAX9759=m -CONFIG_SND_SOC_MT6351=m -CONFIG_SND_SOC_MT6358=m -CONFIG_SND_SOC_MT6660=m -CONFIG_SND_SOC_NAU8540=m -CONFIG_SND_SOC_NAU8810=m -CONFIG_SND_SOC_NAU8822=m -CONFIG_SND_SOC_NAU8824=m -CONFIG_SND_SOC_NAU8825=m -CONFIG_SND_SOC_TPA6130A2=m -# end of CODEC drivers - -CONFIG_SND_SIMPLE_CARD_UTILS=m -CONFIG_SND_SIMPLE_CARD=m -CONFIG_SND_AUDIO_GRAPH_CARD=m -CONFIG_SND_X86=y -CONFIG_HDMI_LPE_AUDIO=m -CONFIG_SND_SYNTH_EMUX=m -CONFIG_SND_XEN_FRONTEND=m -CONFIG_AC97_BUS=m - -# -# HID support -# -CONFIG_HID=m -CONFIG_HID_BATTERY_STRENGTH=y -CONFIG_HIDRAW=y -CONFIG_UHID=m -CONFIG_HID_GENERIC=m - -# -# Special HID drivers -# -CONFIG_HID_A4TECH=m -CONFIG_HID_ACCUTOUCH=m -CONFIG_HID_ACRUX=m -CONFIG_HID_ACRUX_FF=y -CONFIG_HID_APPLE=m -CONFIG_HID_APPLEIR=m -CONFIG_HID_ASUS=m -CONFIG_HID_AUREAL=m -CONFIG_HID_BELKIN=m -CONFIG_HID_BETOP_FF=m -CONFIG_HID_BIGBEN_FF=m -CONFIG_HID_CHERRY=m 
-CONFIG_HID_CHICONY=m -CONFIG_HID_CORSAIR=m -CONFIG_HID_COUGAR=m -CONFIG_HID_MACALLY=m -CONFIG_HID_PRODIKEYS=m -CONFIG_HID_CMEDIA=m -CONFIG_HID_CP2112=m -CONFIG_HID_CREATIVE_SB0540=m -CONFIG_HID_CYPRESS=m -CONFIG_HID_DRAGONRISE=m -CONFIG_DRAGONRISE_FF=y -CONFIG_HID_EMS_FF=m -CONFIG_HID_ELAN=m -CONFIG_HID_ELECOM=m -CONFIG_HID_ELO=m -CONFIG_HID_EZKEY=m -CONFIG_HID_GEMBIRD=m -CONFIG_HID_GFRM=m -CONFIG_HID_GLORIOUS=m -CONFIG_HID_HOLTEK=m -CONFIG_HOLTEK_FF=y -CONFIG_HID_GOOGLE_HAMMER=m -CONFIG_HID_GT683R=m -CONFIG_HID_KEYTOUCH=m -CONFIG_HID_KYE=m -CONFIG_HID_UCLOGIC=m -CONFIG_HID_WALTOP=m -CONFIG_HID_VIEWSONIC=m -CONFIG_HID_GYRATION=m -CONFIG_HID_ICADE=m -CONFIG_HID_ITE=m -CONFIG_HID_JABRA=m -CONFIG_HID_TWINHAN=m -CONFIG_HID_KENSINGTON=m -CONFIG_HID_LCPOWER=m -CONFIG_HID_LED=m -CONFIG_HID_LENOVO=m -CONFIG_HID_LOGITECH=m -CONFIG_HID_LOGITECH_DJ=m -CONFIG_HID_LOGITECH_HIDPP=m -CONFIG_LOGITECH_FF=y -CONFIG_LOGIRUMBLEPAD2_FF=y -CONFIG_LOGIG940_FF=y -CONFIG_LOGIWHEELS_FF=y -CONFIG_HID_MAGICMOUSE=m -CONFIG_HID_MALTRON=m -CONFIG_HID_MAYFLASH=m -CONFIG_HID_REDRAGON=m -CONFIG_HID_MICROSOFT=m -CONFIG_HID_MONTEREY=m -CONFIG_HID_MULTITOUCH=m -CONFIG_HID_NTI=m -CONFIG_HID_NTRIG=m -CONFIG_HID_ORTEK=m -CONFIG_HID_PANTHERLORD=m -CONFIG_PANTHERLORD_FF=y -CONFIG_HID_PENMOUNT=m -CONFIG_HID_PETALYNX=m -CONFIG_HID_PICOLCD=m -CONFIG_HID_PICOLCD_FB=y -CONFIG_HID_PICOLCD_BACKLIGHT=y -CONFIG_HID_PICOLCD_LCD=y -CONFIG_HID_PICOLCD_LEDS=y -CONFIG_HID_PICOLCD_CIR=y -CONFIG_HID_PLANTRONICS=m -CONFIG_HID_PRIMAX=m -CONFIG_HID_RETRODE=m -CONFIG_HID_ROCCAT=m -CONFIG_HID_SAITEK=m -CONFIG_HID_SAMSUNG=m -CONFIG_HID_SONY=m -CONFIG_SONY_FF=y -CONFIG_HID_SPEEDLINK=m -CONFIG_HID_STEAM=m -CONFIG_HID_STEELSERIES=m -CONFIG_HID_SUNPLUS=m -CONFIG_HID_RMI=m -CONFIG_HID_GREENASIA=m -CONFIG_GREENASIA_FF=y -CONFIG_HID_HYPERV_MOUSE=m -CONFIG_HID_SMARTJOYPLUS=m -CONFIG_SMARTJOYPLUS_FF=y -CONFIG_HID_TIVO=m -CONFIG_HID_TOPSEED=m -CONFIG_HID_THINGM=m -CONFIG_HID_THRUSTMASTER=m -CONFIG_THRUSTMASTER_FF=y 
-CONFIG_HID_UDRAW_PS3=m -CONFIG_HID_U2FZERO=m -CONFIG_HID_WACOM=m -CONFIG_HID_WIIMOTE=m -CONFIG_HID_XINMO=m -CONFIG_HID_ZEROPLUS=m -CONFIG_ZEROPLUS_FF=y -CONFIG_HID_ZYDACRON=m -CONFIG_HID_SENSOR_HUB=m -# CONFIG_HID_SENSOR_CUSTOM_SENSOR is not set -CONFIG_HID_ALPS=m -CONFIG_HID_MCP2221=m -# end of Special HID drivers - -# -# USB HID support -# -CONFIG_USB_HID=m -CONFIG_HID_PID=y -CONFIG_USB_HIDDEV=y - -# -# USB HID Boot Protocol drivers -# -# CONFIG_USB_KBD is not set -# CONFIG_USB_MOUSE is not set -# end of USB HID Boot Protocol drivers -# end of USB HID support - -# -# I2C HID support -# -CONFIG_I2C_HID=m -# end of I2C HID support - -# -# Intel ISH HID support -# -CONFIG_INTEL_ISH_HID=m -CONFIG_INTEL_ISH_FIRMWARE_DOWNLOADER=m -# end of Intel ISH HID support -# end of HID support - -CONFIG_USB_OHCI_LITTLE_ENDIAN=y -CONFIG_USB_SUPPORT=y -CONFIG_USB_COMMON=y -CONFIG_USB_LED_TRIG=y -CONFIG_USB_ULPI_BUS=m -CONFIG_USB_CONN_GPIO=m -CONFIG_USB_ARCH_HAS_HCD=y -CONFIG_USB=y -CONFIG_USB_PCI=y -CONFIG_USB_ANNOUNCE_NEW_DEVICES=y - -# -# Miscellaneous USB options -# -CONFIG_USB_DEFAULT_PERSIST=y -CONFIG_USB_DYNAMIC_MINORS=y -# CONFIG_USB_OTG is not set -# CONFIG_USB_OTG_PRODUCTLIST is not set -# CONFIG_USB_OTG_DISABLE_EXTERNAL_HUB is not set -CONFIG_USB_LEDS_TRIGGER_USBPORT=m -CONFIG_USB_AUTOSUSPEND_DELAY=2 -CONFIG_USB_MON=m - -# -# USB Host Controller Drivers -# -CONFIG_USB_C67X00_HCD=m -CONFIG_USB_XHCI_HCD=m -# CONFIG_USB_XHCI_DBGCAP is not set -CONFIG_USB_XHCI_PCI=m -CONFIG_USB_XHCI_PCI_RENESAS=m -CONFIG_USB_XHCI_PLATFORM=m -CONFIG_USB_EHCI_HCD=m -CONFIG_USB_EHCI_ROOT_HUB_TT=y -CONFIG_USB_EHCI_TT_NEWSCHED=y -CONFIG_USB_EHCI_PCI=m -CONFIG_USB_EHCI_FSL=m -CONFIG_USB_EHCI_HCD_PLATFORM=m -CONFIG_USB_OXU210HP_HCD=m -CONFIG_USB_ISP116X_HCD=m -CONFIG_USB_FOTG210_HCD=m -CONFIG_USB_MAX3421_HCD=m -CONFIG_USB_OHCI_HCD=m -CONFIG_USB_OHCI_HCD_PCI=m -# CONFIG_USB_OHCI_HCD_SSB is not set -CONFIG_USB_OHCI_HCD_PLATFORM=m -CONFIG_USB_UHCI_HCD=m -CONFIG_USB_U132_HCD=m -CONFIG_USB_SL811_HCD=m 
-# CONFIG_USB_SL811_HCD_ISO is not set -CONFIG_USB_SL811_CS=m -CONFIG_USB_R8A66597_HCD=m -CONFIG_USB_HCD_BCMA=m -CONFIG_USB_HCD_SSB=m -# CONFIG_USB_HCD_TEST_MODE is not set - -# -# USB Device Class drivers -# -CONFIG_USB_ACM=m -CONFIG_USB_PRINTER=m -CONFIG_USB_WDM=m -CONFIG_USB_TMC=m - -# -# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may -# - -# -# also be needed; see USB_STORAGE Help for more info -# -CONFIG_USB_STORAGE=m -# CONFIG_USB_STORAGE_DEBUG is not set -CONFIG_USB_STORAGE_REALTEK=m -CONFIG_REALTEK_AUTOPM=y -CONFIG_USB_STORAGE_DATAFAB=m -CONFIG_USB_STORAGE_FREECOM=m -CONFIG_USB_STORAGE_ISD200=m -CONFIG_USB_STORAGE_USBAT=m -CONFIG_USB_STORAGE_SDDR09=m -CONFIG_USB_STORAGE_SDDR55=m -CONFIG_USB_STORAGE_JUMPSHOT=m -CONFIG_USB_STORAGE_ALAUDA=m -CONFIG_USB_STORAGE_ONETOUCH=m -CONFIG_USB_STORAGE_KARMA=m -CONFIG_USB_STORAGE_CYPRESS_ATACB=m -CONFIG_USB_STORAGE_ENE_UB6250=m -CONFIG_USB_UAS=m - -# -# USB Imaging devices -# -CONFIG_USB_MDC800=m -CONFIG_USB_MICROTEK=m -CONFIG_USBIP_CORE=m -CONFIG_USBIP_VHCI_HCD=m -CONFIG_USBIP_VHCI_HC_PORTS=8 -CONFIG_USBIP_VHCI_NR_HCS=1 -CONFIG_USBIP_HOST=m -CONFIG_USBIP_VUDC=m -# CONFIG_USBIP_DEBUG is not set -CONFIG_USB_CDNS3=m -CONFIG_USB_CDNS3_GADGET=y -CONFIG_USB_CDNS3_HOST=y -CONFIG_USB_CDNS3_PCI_WRAP=m -CONFIG_USB_MUSB_HDRC=m -# CONFIG_USB_MUSB_HOST is not set -# CONFIG_USB_MUSB_GADGET is not set -CONFIG_USB_MUSB_DUAL_ROLE=y - -# -# Platform Glue Layer -# - -# -# MUSB DMA mode -# -# CONFIG_MUSB_PIO_ONLY is not set -CONFIG_USB_DWC3=m -CONFIG_USB_DWC3_ULPI=y -# CONFIG_USB_DWC3_HOST is not set -# CONFIG_USB_DWC3_GADGET is not set -CONFIG_USB_DWC3_DUAL_ROLE=y - -# -# Platform Glue Driver Support -# -CONFIG_USB_DWC3_PCI=m -CONFIG_USB_DWC3_HAPS=m -CONFIG_USB_DWC3_OF_SIMPLE=m -CONFIG_USB_DWC2=m -# CONFIG_USB_DWC2_HOST is not set - -# -# Gadget/Dual-role mode requires USB Gadget support to be enabled -# -# CONFIG_USB_DWC2_PERIPHERAL is not set -CONFIG_USB_DWC2_DUAL_ROLE=y -CONFIG_USB_DWC2_PCI=m -# CONFIG_USB_DWC2_DEBUG is not set -# 
CONFIG_USB_DWC2_TRACK_MISSED_SOFS is not set -CONFIG_USB_CHIPIDEA=m -CONFIG_USB_CHIPIDEA_UDC=y -CONFIG_USB_CHIPIDEA_HOST=y -CONFIG_USB_CHIPIDEA_PCI=m -CONFIG_USB_CHIPIDEA_MSM=m -CONFIG_USB_CHIPIDEA_IMX=m -CONFIG_USB_CHIPIDEA_GENERIC=m -CONFIG_USB_CHIPIDEA_TEGRA=m -CONFIG_USB_ISP1760=m -CONFIG_USB_ISP1760_HCD=y -CONFIG_USB_ISP1761_UDC=y -# CONFIG_USB_ISP1760_HOST_ROLE is not set -# CONFIG_USB_ISP1760_GADGET_ROLE is not set -CONFIG_USB_ISP1760_DUAL_ROLE=y - -# -# USB port drivers -# -CONFIG_USB_USS720=m -CONFIG_USB_SERIAL=y -CONFIG_USB_SERIAL_CONSOLE=y -CONFIG_USB_SERIAL_GENERIC=y -CONFIG_USB_SERIAL_SIMPLE=m -CONFIG_USB_SERIAL_AIRCABLE=m -CONFIG_USB_SERIAL_ARK3116=m -CONFIG_USB_SERIAL_BELKIN=m -CONFIG_USB_SERIAL_CH341=m -CONFIG_USB_SERIAL_WHITEHEAT=m -CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m -CONFIG_USB_SERIAL_CP210X=m -CONFIG_USB_SERIAL_CYPRESS_M8=m -CONFIG_USB_SERIAL_EMPEG=m -CONFIG_USB_SERIAL_FTDI_SIO=m -CONFIG_USB_SERIAL_VISOR=m -CONFIG_USB_SERIAL_IPAQ=m -CONFIG_USB_SERIAL_IR=m -CONFIG_USB_SERIAL_EDGEPORT=m -CONFIG_USB_SERIAL_EDGEPORT_TI=m -CONFIG_USB_SERIAL_F81232=m -CONFIG_USB_SERIAL_F8153X=m -CONFIG_USB_SERIAL_GARMIN=m -CONFIG_USB_SERIAL_IPW=m -CONFIG_USB_SERIAL_IUU=m -CONFIG_USB_SERIAL_KEYSPAN_PDA=m -CONFIG_USB_SERIAL_KEYSPAN=m -CONFIG_USB_SERIAL_KLSI=m -CONFIG_USB_SERIAL_KOBIL_SCT=m -CONFIG_USB_SERIAL_MCT_U232=m -CONFIG_USB_SERIAL_METRO=m -CONFIG_USB_SERIAL_MOS7720=m -CONFIG_USB_SERIAL_MOS7715_PARPORT=y -CONFIG_USB_SERIAL_MOS7840=m -CONFIG_USB_SERIAL_MXUPORT=m -CONFIG_USB_SERIAL_NAVMAN=m -CONFIG_USB_SERIAL_PL2303=m -CONFIG_USB_SERIAL_OTI6858=m -CONFIG_USB_SERIAL_QCAUX=m -CONFIG_USB_SERIAL_QUALCOMM=m -CONFIG_USB_SERIAL_SPCP8X5=m -CONFIG_USB_SERIAL_SAFE=m -# CONFIG_USB_SERIAL_SAFE_PADDED is not set -CONFIG_USB_SERIAL_SIERRAWIRELESS=m -CONFIG_USB_SERIAL_SYMBOL=m -CONFIG_USB_SERIAL_TI=m -CONFIG_USB_SERIAL_CYBERJACK=m -CONFIG_USB_SERIAL_XIRCOM=m -CONFIG_USB_SERIAL_WWAN=m -CONFIG_USB_SERIAL_OPTION=m -CONFIG_USB_SERIAL_OMNINET=m -CONFIG_USB_SERIAL_OPTICON=m 
-CONFIG_USB_SERIAL_XSENS_MT=m -CONFIG_USB_SERIAL_WISHBONE=m -CONFIG_USB_SERIAL_SSU100=m -CONFIG_USB_SERIAL_QT2=m -CONFIG_USB_SERIAL_UPD78F0730=m -CONFIG_USB_SERIAL_DEBUG=m - -# -# USB Miscellaneous drivers -# -CONFIG_USB_EMI62=m -CONFIG_USB_EMI26=m -CONFIG_USB_ADUTUX=m -CONFIG_USB_SEVSEG=m -CONFIG_USB_LEGOTOWER=m -CONFIG_USB_LCD=m -CONFIG_USB_CYPRESS_CY7C63=m -CONFIG_USB_CYTHERM=m -CONFIG_USB_IDMOUSE=m -CONFIG_USB_FTDI_ELAN=m -CONFIG_USB_APPLEDISPLAY=m -CONFIG_APPLE_MFI_FASTCHARGE=m -CONFIG_USB_SISUSBVGA=m -CONFIG_USB_SISUSBVGA_CON=y -CONFIG_USB_LD=m -CONFIG_USB_TRANCEVIBRATOR=m -CONFIG_USB_IOWARRIOR=m -CONFIG_USB_TEST=m -CONFIG_USB_EHSET_TEST_FIXTURE=m -CONFIG_USB_ISIGHTFW=m -CONFIG_USB_YUREX=m -CONFIG_USB_EZUSB_FX2=m -CONFIG_USB_HUB_USB251XB=m -CONFIG_USB_HSIC_USB3503=m -CONFIG_USB_HSIC_USB4604=m -CONFIG_USB_LINK_LAYER_TEST=m -CONFIG_USB_CHAOSKEY=m -CONFIG_USB_ATM=m -CONFIG_USB_SPEEDTOUCH=m -CONFIG_USB_CXACRU=m -CONFIG_USB_UEAGLEATM=m -CONFIG_USB_XUSBATM=m - -# -# USB Physical Layer drivers -# -CONFIG_USB_PHY=y -CONFIG_NOP_USB_XCEIV=m -CONFIG_USB_GPIO_VBUS=m -CONFIG_TAHVO_USB=m -# CONFIG_TAHVO_USB_HOST_BY_DEFAULT is not set -CONFIG_USB_ISP1301=m -# end of USB Physical Layer drivers - -CONFIG_USB_GADGET=m -# CONFIG_USB_GADGET_DEBUG is not set -# CONFIG_USB_GADGET_DEBUG_FILES is not set -# CONFIG_USB_GADGET_DEBUG_FS is not set -CONFIG_USB_GADGET_VBUS_DRAW=2 -CONFIG_USB_GADGET_STORAGE_NUM_BUFFERS=2 -CONFIG_U_SERIAL_CONSOLE=y - -# -# USB Peripheral Controller -# -CONFIG_USB_FOTG210_UDC=m -CONFIG_USB_GR_UDC=m -CONFIG_USB_R8A66597=m -CONFIG_USB_PXA27X=m -CONFIG_USB_MV_UDC=m -CONFIG_USB_MV_U3D=m -CONFIG_USB_SNP_CORE=m -CONFIG_USB_SNP_UDC_PLAT=m -CONFIG_USB_M66592=m -CONFIG_USB_BDC_UDC=m - -# -# Platform Support -# -CONFIG_USB_BDC_PCI=m -CONFIG_USB_AMD5536UDC=m -CONFIG_USB_NET2272=m -CONFIG_USB_NET2272_DMA=y -CONFIG_USB_NET2280=m -CONFIG_USB_GOKU=m -CONFIG_USB_EG20T=m -CONFIG_USB_GADGET_XILINX=m -CONFIG_USB_MAX3420_UDC=m -CONFIG_USB_DUMMY_HCD=m -# end of USB Peripheral 
Controller - -CONFIG_USB_LIBCOMPOSITE=m -CONFIG_USB_F_ACM=m -CONFIG_USB_F_SS_LB=m -CONFIG_USB_U_SERIAL=m -CONFIG_USB_U_ETHER=m -CONFIG_USB_U_AUDIO=m -CONFIG_USB_F_SERIAL=m -CONFIG_USB_F_OBEX=m -CONFIG_USB_F_NCM=m -CONFIG_USB_F_ECM=m -CONFIG_USB_F_PHONET=m -CONFIG_USB_F_EEM=m -CONFIG_USB_F_SUBSET=m -CONFIG_USB_F_RNDIS=m -CONFIG_USB_F_MASS_STORAGE=m -CONFIG_USB_F_FS=m -CONFIG_USB_F_UAC1=m -CONFIG_USB_F_UAC1_LEGACY=m -CONFIG_USB_F_UAC2=m -CONFIG_USB_F_UVC=m -CONFIG_USB_F_MIDI=m -CONFIG_USB_F_HID=m -CONFIG_USB_F_PRINTER=m -CONFIG_USB_F_TCM=m -CONFIG_USB_CONFIGFS=m -CONFIG_USB_CONFIGFS_SERIAL=y -CONFIG_USB_CONFIGFS_ACM=y -CONFIG_USB_CONFIGFS_OBEX=y -CONFIG_USB_CONFIGFS_NCM=y -CONFIG_USB_CONFIGFS_ECM=y -CONFIG_USB_CONFIGFS_ECM_SUBSET=y -CONFIG_USB_CONFIGFS_RNDIS=y -CONFIG_USB_CONFIGFS_EEM=y -CONFIG_USB_CONFIGFS_PHONET=y -CONFIG_USB_CONFIGFS_MASS_STORAGE=y -CONFIG_USB_CONFIGFS_F_LB_SS=y -CONFIG_USB_CONFIGFS_F_FS=y -CONFIG_USB_CONFIGFS_F_UAC1=y -CONFIG_USB_CONFIGFS_F_UAC1_LEGACY=y -CONFIG_USB_CONFIGFS_F_UAC2=y -CONFIG_USB_CONFIGFS_F_MIDI=y -CONFIG_USB_CONFIGFS_F_HID=y -CONFIG_USB_CONFIGFS_F_UVC=y -CONFIG_USB_CONFIGFS_F_PRINTER=y -CONFIG_USB_CONFIGFS_F_TCM=y - -# -# USB Gadget precomposed configurations -# -CONFIG_USB_ZERO=m -CONFIG_USB_AUDIO=m -# CONFIG_GADGET_UAC1 is not set -CONFIG_USB_ETH=m -CONFIG_USB_ETH_RNDIS=y -CONFIG_USB_ETH_EEM=y -CONFIG_USB_G_NCM=m -CONFIG_USB_GADGETFS=m -CONFIG_USB_FUNCTIONFS=m -CONFIG_USB_FUNCTIONFS_ETH=y -CONFIG_USB_FUNCTIONFS_RNDIS=y -CONFIG_USB_FUNCTIONFS_GENERIC=y -CONFIG_USB_MASS_STORAGE=m -CONFIG_USB_GADGET_TARGET=m -CONFIG_USB_G_SERIAL=m -CONFIG_USB_MIDI_GADGET=m -CONFIG_USB_G_PRINTER=m -CONFIG_USB_CDC_COMPOSITE=m -CONFIG_USB_G_NOKIA=m -CONFIG_USB_G_ACM_MS=m -CONFIG_USB_G_MULTI=m -CONFIG_USB_G_MULTI_RNDIS=y -CONFIG_USB_G_MULTI_CDC=y -CONFIG_USB_G_HID=m -CONFIG_USB_G_DBGP=m -# CONFIG_USB_G_DBGP_PRINTK is not set -CONFIG_USB_G_DBGP_SERIAL=y -CONFIG_USB_G_WEBCAM=m -CONFIG_USB_RAW_GADGET=m -# end of USB Gadget precomposed configurations - 
-CONFIG_TYPEC=m -CONFIG_TYPEC_TCPM=m -CONFIG_TYPEC_TCPCI=m -CONFIG_TYPEC_RT1711H=m -CONFIG_TYPEC_FUSB302=m -CONFIG_TYPEC_WCOVE=m -CONFIG_TYPEC_UCSI=m -CONFIG_UCSI_CCG=m -CONFIG_UCSI_ACPI=m -CONFIG_TYPEC_HD3SS3220=m -CONFIG_TYPEC_TPS6598X=m - -# -# USB Type-C Multiplexer/DeMultiplexer Switch support -# -CONFIG_TYPEC_MUX_PI3USB30532=m -CONFIG_TYPEC_MUX_INTEL_PMC=m -# end of USB Type-C Multiplexer/DeMultiplexer Switch support - -# -# USB Type-C Alternate Mode drivers -# -CONFIG_TYPEC_DP_ALTMODE=m -CONFIG_TYPEC_NVIDIA_ALTMODE=m -# end of USB Type-C Alternate Mode drivers - -CONFIG_USB_ROLE_SWITCH=m -CONFIG_USB_ROLES_INTEL_XHCI=m -CONFIG_MMC=m -CONFIG_PWRSEQ_EMMC=m -CONFIG_PWRSEQ_SD8787=m -CONFIG_PWRSEQ_SIMPLE=m -CONFIG_MMC_BLOCK=m -CONFIG_MMC_BLOCK_MINORS=8 -CONFIG_SDIO_UART=m -CONFIG_MMC_TEST=m - -# -# MMC/SD/SDIO Host Controller Drivers -# -# CONFIG_MMC_DEBUG is not set -CONFIG_MMC_SDHCI=m -CONFIG_MMC_SDHCI_IO_ACCESSORS=y -CONFIG_MMC_SDHCI_PCI=m -CONFIG_MMC_RICOH_MMC=y -CONFIG_MMC_SDHCI_ACPI=m -CONFIG_MMC_SDHCI_PLTFM=m -CONFIG_MMC_SDHCI_OF_ARASAN=m -CONFIG_MMC_SDHCI_OF_ASPEED=m -CONFIG_MMC_SDHCI_OF_AT91=m -CONFIG_MMC_SDHCI_OF_DWCMSHC=m -CONFIG_MMC_SDHCI_CADENCE=m -CONFIG_MMC_SDHCI_F_SDH30=m -CONFIG_MMC_SDHCI_MILBEAUT=m -CONFIG_MMC_WBSD=m -CONFIG_MMC_ALCOR=m -CONFIG_MMC_TIFM_SD=m -CONFIG_MMC_SPI=m -CONFIG_MMC_SDRICOH_CS=m -CONFIG_MMC_CB710=m -CONFIG_MMC_VIA_SDMMC=m -CONFIG_MMC_VUB300=m -CONFIG_MMC_USHC=m -CONFIG_MMC_USDHI6ROL0=m -CONFIG_MMC_REALTEK_PCI=m -CONFIG_MMC_REALTEK_USB=m -CONFIG_MMC_CQHCI=m -CONFIG_MMC_HSQ=m -CONFIG_MMC_TOSHIBA_PCI=m -CONFIG_MMC_MTK=m -CONFIG_MMC_SDHCI_XENON=m -CONFIG_MMC_SDHCI_OMAP=m -CONFIG_MMC_SDHCI_AM654=m -CONFIG_MMC_SDHCI_EXTERNAL_DMA=y -CONFIG_MEMSTICK=m -# CONFIG_MEMSTICK_DEBUG is not set - -# -# MemoryStick drivers -# -# CONFIG_MEMSTICK_UNSAFE_RESUME is not set -CONFIG_MSPRO_BLOCK=m -CONFIG_MS_BLOCK=m - -# -# MemoryStick Host Controller Drivers -# -CONFIG_MEMSTICK_TIFM_MS=m -CONFIG_MEMSTICK_JMICRON_38X=m -CONFIG_MEMSTICK_R592=m 
-CONFIG_MEMSTICK_REALTEK_PCI=m -CONFIG_MEMSTICK_REALTEK_USB=m -CONFIG_NEW_LEDS=y -CONFIG_LEDS_CLASS=y -CONFIG_LEDS_CLASS_FLASH=m -CONFIG_LEDS_CLASS_MULTICOLOR=m -CONFIG_LEDS_BRIGHTNESS_HW_CHANGED=y - -# -# LED drivers -# -CONFIG_LEDS_88PM860X=m -CONFIG_LEDS_AAT1290=m -CONFIG_LEDS_AN30259A=m -CONFIG_LEDS_APU=m -CONFIG_LEDS_AS3645A=m -CONFIG_LEDS_AW2013=m -CONFIG_LEDS_BCM6328=m -CONFIG_LEDS_BCM6358=m -CONFIG_LEDS_CPCAP=m -CONFIG_LEDS_CR0014114=m -CONFIG_LEDS_EL15203000=m -CONFIG_LEDS_LM3530=m -CONFIG_LEDS_LM3532=m -CONFIG_LEDS_LM3533=m -CONFIG_LEDS_LM3642=m -CONFIG_LEDS_LM3692X=m -CONFIG_LEDS_LM3601X=m -CONFIG_LEDS_MT6323=m -CONFIG_LEDS_PCA9532=m -CONFIG_LEDS_PCA9532_GPIO=y -CONFIG_LEDS_GPIO=m -CONFIG_LEDS_LP3944=m -CONFIG_LEDS_LP3952=m -# CONFIG_LEDS_LP55XX_COMMON is not set -CONFIG_LEDS_LP8788=m -CONFIG_LEDS_LP8860=m -CONFIG_LEDS_CLEVO_MAIL=m -CONFIG_LEDS_PCA955X=m -CONFIG_LEDS_PCA955X_GPIO=y -CONFIG_LEDS_PCA963X=m -CONFIG_LEDS_WM831X_STATUS=m -CONFIG_LEDS_WM8350=m -CONFIG_LEDS_DA903X=m -CONFIG_LEDS_DA9052=m -CONFIG_LEDS_DAC124S085=m -CONFIG_LEDS_PWM=m -CONFIG_LEDS_REGULATOR=m -CONFIG_LEDS_BD2802=m -CONFIG_LEDS_INTEL_SS4200=m -CONFIG_LEDS_LT3593=m -CONFIG_LEDS_ADP5520=m -CONFIG_LEDS_MC13783=m -CONFIG_LEDS_TCA6507=m -CONFIG_LEDS_TLC591XX=m -CONFIG_LEDS_MAX77650=m -CONFIG_LEDS_MAX77693=m -CONFIG_LEDS_MAX8997=m -CONFIG_LEDS_LM355x=m -CONFIG_LEDS_MENF21BMC=m -CONFIG_LEDS_KTD2692=m -CONFIG_LEDS_IS31FL319X=m -CONFIG_LEDS_IS31FL32XX=m - -# -# LED driver for blink(1) USB RGB LED is under Special HID drivers (HID_THINGM) -# -CONFIG_LEDS_BLINKM=m -CONFIG_LEDS_SYSCON=y -CONFIG_LEDS_MLXCPLD=m -CONFIG_LEDS_MLXREG=m -CONFIG_LEDS_USER=m -CONFIG_LEDS_NIC78BX=m -CONFIG_LEDS_SPI_BYTE=m -CONFIG_LEDS_TI_LMU_COMMON=m -CONFIG_LEDS_LM3697=m -CONFIG_LEDS_LM36274=m -CONFIG_LEDS_TPS6105X=m -CONFIG_LEDS_SGM3140=m - -# -# LED Triggers -# -CONFIG_LEDS_TRIGGERS=y -CONFIG_LEDS_TRIGGER_TIMER=m -CONFIG_LEDS_TRIGGER_ONESHOT=m -CONFIG_LEDS_TRIGGER_DISK=y -CONFIG_LEDS_TRIGGER_MTD=y 
-CONFIG_LEDS_TRIGGER_HEARTBEAT=m -CONFIG_LEDS_TRIGGER_BACKLIGHT=m -CONFIG_LEDS_TRIGGER_CPU=y -CONFIG_LEDS_TRIGGER_ACTIVITY=m -CONFIG_LEDS_TRIGGER_GPIO=m -CONFIG_LEDS_TRIGGER_DEFAULT_ON=m - -# -# iptables trigger is under Netfilter config (LED target) -# -CONFIG_LEDS_TRIGGER_TRANSIENT=m -CONFIG_LEDS_TRIGGER_CAMERA=m -CONFIG_LEDS_TRIGGER_PANIC=y -CONFIG_LEDS_TRIGGER_NETDEV=m -CONFIG_LEDS_TRIGGER_PATTERN=m -CONFIG_LEDS_TRIGGER_AUDIO=m -CONFIG_ACCESSIBILITY=y -CONFIG_A11Y_BRAILLE_CONSOLE=y - -# -# Speakup console speech -# -CONFIG_SPEAKUP=m -CONFIG_SPEAKUP_SYNTH_ACNTSA=m -CONFIG_SPEAKUP_SYNTH_APOLLO=m -CONFIG_SPEAKUP_SYNTH_AUDPTR=m -CONFIG_SPEAKUP_SYNTH_BNS=m -CONFIG_SPEAKUP_SYNTH_DECTLK=m -CONFIG_SPEAKUP_SYNTH_DECEXT=m -CONFIG_SPEAKUP_SYNTH_LTLK=m -CONFIG_SPEAKUP_SYNTH_SOFT=m -CONFIG_SPEAKUP_SYNTH_SPKOUT=m -CONFIG_SPEAKUP_SYNTH_TXPRT=m -CONFIG_SPEAKUP_SYNTH_DUMMY=m -# end of Speakup console speech - -CONFIG_INFINIBAND=m -CONFIG_INFINIBAND_USER_MAD=m -CONFIG_INFINIBAND_USER_ACCESS=m -CONFIG_INFINIBAND_USER_MEM=y -CONFIG_INFINIBAND_ON_DEMAND_PAGING=y -CONFIG_INFINIBAND_ADDR_TRANS=y -CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS=y -CONFIG_INFINIBAND_MTHCA=m -CONFIG_INFINIBAND_MTHCA_DEBUG=y -CONFIG_INFINIBAND_QIB=m -CONFIG_INFINIBAND_QIB_DCA=y -CONFIG_INFINIBAND_CXGB4=m -CONFIG_INFINIBAND_EFA=m -CONFIG_INFINIBAND_I40IW=m -CONFIG_MLX4_INFINIBAND=m -CONFIG_MLX5_INFINIBAND=m -CONFIG_INFINIBAND_OCRDMA=m -CONFIG_INFINIBAND_VMWARE_PVRDMA=m -CONFIG_INFINIBAND_USNIC=m -CONFIG_INFINIBAND_BNXT_RE=m -CONFIG_INFINIBAND_HFI1=m -# CONFIG_HFI1_DEBUG_SDMA_ORDER is not set -# CONFIG_SDMA_VERBOSITY is not set -CONFIG_INFINIBAND_QEDR=m -CONFIG_INFINIBAND_RDMAVT=m -CONFIG_RDMA_RXE=m -CONFIG_RDMA_SIW=m -CONFIG_INFINIBAND_IPOIB=m -CONFIG_INFINIBAND_IPOIB_CM=y -CONFIG_INFINIBAND_IPOIB_DEBUG=y -# CONFIG_INFINIBAND_IPOIB_DEBUG_DATA is not set -CONFIG_INFINIBAND_SRP=m -CONFIG_INFINIBAND_SRPT=m -CONFIG_INFINIBAND_ISER=m -CONFIG_INFINIBAND_ISERT=m -CONFIG_INFINIBAND_RTRS=m -CONFIG_INFINIBAND_RTRS_CLIENT=m 
-CONFIG_INFINIBAND_RTRS_SERVER=m -CONFIG_INFINIBAND_OPA_VNIC=m -CONFIG_EDAC_ATOMIC_SCRUB=y -CONFIG_EDAC_SUPPORT=y -CONFIG_EDAC=y -CONFIG_EDAC_LEGACY_SYSFS=y -# CONFIG_EDAC_DEBUG is not set -CONFIG_EDAC_DECODE_MCE=m -CONFIG_EDAC_GHES=y -CONFIG_EDAC_AMD64=m -# CONFIG_EDAC_AMD64_ERROR_INJECTION is not set -CONFIG_EDAC_E752X=m -CONFIG_EDAC_I82975X=m -CONFIG_EDAC_I3000=m -CONFIG_EDAC_I3200=m -CONFIG_EDAC_IE31200=m -CONFIG_EDAC_X38=m -CONFIG_EDAC_I5400=m -CONFIG_EDAC_I7CORE=m -CONFIG_EDAC_I5000=m -CONFIG_EDAC_I5100=m -CONFIG_EDAC_I7300=m -CONFIG_EDAC_SBRIDGE=m -CONFIG_EDAC_SKX=m -CONFIG_EDAC_I10NM=m -CONFIG_EDAC_PND2=m -CONFIG_RTC_LIB=y -CONFIG_RTC_MC146818_LIB=y -CONFIG_RTC_CLASS=y -CONFIG_RTC_HCTOSYS=y -CONFIG_RTC_HCTOSYS_DEVICE="rtc0" -CONFIG_RTC_SYSTOHC=y -CONFIG_RTC_SYSTOHC_DEVICE="rtc0" -# CONFIG_RTC_DEBUG is not set -CONFIG_RTC_NVMEM=y - -# -# RTC interfaces -# -CONFIG_RTC_INTF_SYSFS=y -CONFIG_RTC_INTF_PROC=y -CONFIG_RTC_INTF_DEV=y -CONFIG_RTC_INTF_DEV_UIE_EMUL=y -# CONFIG_RTC_DRV_TEST is not set - -# -# I2C RTC drivers -# -CONFIG_RTC_DRV_88PM860X=m -CONFIG_RTC_DRV_88PM80X=m -CONFIG_RTC_DRV_ABB5ZES3=m -CONFIG_RTC_DRV_ABEOZ9=m -CONFIG_RTC_DRV_ABX80X=m -CONFIG_RTC_DRV_AS3722=m -CONFIG_RTC_DRV_DS1307=m -CONFIG_RTC_DRV_DS1307_CENTURY=y -CONFIG_RTC_DRV_DS1374=m -CONFIG_RTC_DRV_DS1374_WDT=y -CONFIG_RTC_DRV_DS1672=m -CONFIG_RTC_DRV_HYM8563=m -CONFIG_RTC_DRV_LP8788=m -CONFIG_RTC_DRV_MAX6900=m -CONFIG_RTC_DRV_MAX8907=m -CONFIG_RTC_DRV_MAX8925=m -CONFIG_RTC_DRV_MAX8998=m -CONFIG_RTC_DRV_MAX8997=m -CONFIG_RTC_DRV_MAX77686=m -CONFIG_RTC_DRV_RK808=m -CONFIG_RTC_DRV_RS5C372=m -CONFIG_RTC_DRV_ISL1208=m -CONFIG_RTC_DRV_ISL12022=m -CONFIG_RTC_DRV_ISL12026=m -CONFIG_RTC_DRV_X1205=m -CONFIG_RTC_DRV_PCF8523=m -CONFIG_RTC_DRV_PCF85063=m -CONFIG_RTC_DRV_PCF85363=m -CONFIG_RTC_DRV_PCF8563=m -CONFIG_RTC_DRV_PCF8583=m -CONFIG_RTC_DRV_M41T80=m -CONFIG_RTC_DRV_M41T80_WDT=y -CONFIG_RTC_DRV_BD70528=m -CONFIG_RTC_DRV_BQ32K=m -CONFIG_RTC_DRV_TWL4030=m -CONFIG_RTC_DRV_PALMAS=m 
-CONFIG_RTC_DRV_TPS6586X=m -CONFIG_RTC_DRV_TPS65910=m -CONFIG_RTC_DRV_TPS80031=m -CONFIG_RTC_DRV_RC5T583=m -CONFIG_RTC_DRV_RC5T619=m -CONFIG_RTC_DRV_S35390A=m -CONFIG_RTC_DRV_FM3130=m -CONFIG_RTC_DRV_RX8010=m -CONFIG_RTC_DRV_RX8581=m -CONFIG_RTC_DRV_RX8025=m -CONFIG_RTC_DRV_EM3027=m -CONFIG_RTC_DRV_RV3028=m -CONFIG_RTC_DRV_RV8803=m -CONFIG_RTC_DRV_S5M=m -CONFIG_RTC_DRV_SD3078=m - -# -# SPI RTC drivers -# -CONFIG_RTC_DRV_M41T93=m -CONFIG_RTC_DRV_M41T94=m -CONFIG_RTC_DRV_DS1302=m -CONFIG_RTC_DRV_DS1305=m -CONFIG_RTC_DRV_DS1343=m -CONFIG_RTC_DRV_DS1347=m -CONFIG_RTC_DRV_DS1390=m -CONFIG_RTC_DRV_MAX6916=m -CONFIG_RTC_DRV_R9701=m -CONFIG_RTC_DRV_RX4581=m -CONFIG_RTC_DRV_RX6110=m -CONFIG_RTC_DRV_RS5C348=m -CONFIG_RTC_DRV_MAX6902=m -CONFIG_RTC_DRV_PCF2123=m -CONFIG_RTC_DRV_MCP795=m -CONFIG_RTC_I2C_AND_SPI=y - -# -# SPI and I2C RTC drivers -# -CONFIG_RTC_DRV_DS3232=m -CONFIG_RTC_DRV_DS3232_HWMON=y -CONFIG_RTC_DRV_PCF2127=m -CONFIG_RTC_DRV_RV3029C2=m -CONFIG_RTC_DRV_RV3029_HWMON=y - -# -# Platform RTC drivers -# -CONFIG_RTC_DRV_CMOS=y -CONFIG_RTC_DRV_DS1286=m -CONFIG_RTC_DRV_DS1511=m -CONFIG_RTC_DRV_DS1553=m -CONFIG_RTC_DRV_DS1685_FAMILY=m -CONFIG_RTC_DRV_DS1685=y -# CONFIG_RTC_DRV_DS1689 is not set -# CONFIG_RTC_DRV_DS17285 is not set -# CONFIG_RTC_DRV_DS17485 is not set -# CONFIG_RTC_DRV_DS17885 is not set -CONFIG_RTC_DRV_DS1742=m -CONFIG_RTC_DRV_DS2404=m -CONFIG_RTC_DRV_DA9052=m -CONFIG_RTC_DRV_DA9055=m -CONFIG_RTC_DRV_DA9063=m -CONFIG_RTC_DRV_STK17TA8=m -CONFIG_RTC_DRV_M48T86=m -CONFIG_RTC_DRV_M48T35=m -CONFIG_RTC_DRV_M48T59=m -CONFIG_RTC_DRV_MSM6242=m -CONFIG_RTC_DRV_BQ4802=m -CONFIG_RTC_DRV_RP5C01=m -CONFIG_RTC_DRV_V3020=m -CONFIG_RTC_DRV_WM831X=m -CONFIG_RTC_DRV_WM8350=m -CONFIG_RTC_DRV_PCF50633=m -CONFIG_RTC_DRV_AB3100=m -CONFIG_RTC_DRV_ZYNQMP=m -CONFIG_RTC_DRV_CROS_EC=m - -# -# on-CPU RTC drivers -# -CONFIG_RTC_DRV_CADENCE=m -CONFIG_RTC_DRV_FTRTC010=m -CONFIG_RTC_DRV_PCAP=m -CONFIG_RTC_DRV_MC13XXX=m -CONFIG_RTC_DRV_MT6397=m -CONFIG_RTC_DRV_R7301=m 
-CONFIG_RTC_DRV_CPCAP=m - -# -# HID Sensor RTC drivers -# -CONFIG_RTC_DRV_HID_SENSOR_TIME=m -CONFIG_RTC_DRV_WILCO_EC=m -CONFIG_DMADEVICES=y -# CONFIG_DMADEVICES_DEBUG is not set - -# -# DMA Devices -# -CONFIG_DMA_ENGINE=y -CONFIG_DMA_VIRTUAL_CHANNELS=y -CONFIG_DMA_ACPI=y -CONFIG_DMA_OF=y -CONFIG_ALTERA_MSGDMA=m -CONFIG_DW_AXI_DMAC=m -CONFIG_FSL_EDMA=m -CONFIG_INTEL_IDMA64=m -CONFIG_INTEL_IDXD=m -CONFIG_INTEL_IOATDMA=m -CONFIG_INTEL_MIC_X100_DMA=m -CONFIG_PLX_DMA=m -# CONFIG_XILINX_ZYNQMP_DPDMA is not set -CONFIG_QCOM_HIDMA_MGMT=m -CONFIG_QCOM_HIDMA=m -CONFIG_DW_DMAC_CORE=y -CONFIG_DW_DMAC=y -CONFIG_DW_DMAC_PCI=y -CONFIG_DW_EDMA=m -CONFIG_DW_EDMA_PCIE=m -CONFIG_HSU_DMA=y -CONFIG_SF_PDMA=m - -# -# DMA Clients -# -CONFIG_ASYNC_TX_DMA=y -# CONFIG_DMATEST is not set -CONFIG_DMA_ENGINE_RAID=y - -# -# DMABUF options -# -CONFIG_SYNC_FILE=y -# CONFIG_SW_SYNC is not set -CONFIG_UDMABUF=y -# CONFIG_DMABUF_MOVE_NOTIFY is not set -# CONFIG_DMABUF_SELFTESTS is not set -CONFIG_DMABUF_HEAPS=y -CONFIG_DMABUF_HEAPS_SYSTEM=y -# end of DMABUF options - -CONFIG_DCA=m -CONFIG_AUXDISPLAY=y -CONFIG_HD44780=m -CONFIG_KS0108=m -CONFIG_KS0108_PORT=0x378 -CONFIG_KS0108_DELAY=2 -CONFIG_CFAG12864B=m -CONFIG_CFAG12864B_RATE=20 -CONFIG_IMG_ASCII_LCD=m -CONFIG_HT16K33=m -CONFIG_PARPORT_PANEL=m -CONFIG_PANEL_PARPORT=0 -CONFIG_PANEL_PROFILE=5 -# CONFIG_PANEL_CHANGE_MESSAGE is not set -# CONFIG_CHARLCD_BL_OFF is not set -# CONFIG_CHARLCD_BL_ON is not set -CONFIG_CHARLCD_BL_FLASH=y -CONFIG_PANEL=m -CONFIG_CHARLCD=m -CONFIG_UIO=m -CONFIG_UIO_CIF=m -CONFIG_UIO_PDRV_GENIRQ=m -CONFIG_UIO_DMEM_GENIRQ=m -CONFIG_UIO_AEC=m -CONFIG_UIO_SERCOS3=m -CONFIG_UIO_PCI_GENERIC=m -CONFIG_UIO_NETX=m -CONFIG_UIO_PRUSS=m -CONFIG_UIO_MF624=m -CONFIG_UIO_HV_GENERIC=m -CONFIG_VFIO_IOMMU_TYPE1=m -CONFIG_VFIO_VIRQFD=m -CONFIG_VFIO=m -# CONFIG_VFIO_NOIOMMU is not set -CONFIG_VFIO_PCI=m -CONFIG_VFIO_PCI_VGA=y -CONFIG_VFIO_PCI_MMAP=y -CONFIG_VFIO_PCI_INTX=y -CONFIG_VFIO_PCI_IGD=y -CONFIG_VFIO_MDEV=m -CONFIG_VFIO_MDEV_DEVICE=m 
-CONFIG_IRQ_BYPASS_MANAGER=m -CONFIG_VIRT_DRIVERS=y -CONFIG_VBOXGUEST=m -CONFIG_VIRTIO=y -CONFIG_VIRTIO_MENU=y -CONFIG_VIRTIO_PCI=m -CONFIG_VIRTIO_PCI_LEGACY=y -CONFIG_VIRTIO_VDPA=m -CONFIG_VIRTIO_PMEM=m -CONFIG_VIRTIO_BALLOON=m -CONFIG_VIRTIO_MEM=m -CONFIG_VIRTIO_INPUT=m -CONFIG_VIRTIO_MMIO=m -CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES=y -CONFIG_VDPA=m -CONFIG_VDPA_SIM=m -CONFIG_IFCVF=m -CONFIG_MLX5_VDPA=y -CONFIG_MLX5_VDPA_NET=m -CONFIG_VHOST_IOTLB=m -CONFIG_VHOST_RING=m -CONFIG_VHOST=m -CONFIG_VHOST_MENU=y -CONFIG_VHOST_NET=m -CONFIG_VHOST_SCSI=m -CONFIG_VHOST_VSOCK=m -CONFIG_VHOST_VDPA=m -# CONFIG_VHOST_CROSS_ENDIAN_LEGACY is not set - -# -# Microsoft Hyper-V guest support -# -CONFIG_HYPERV=m -CONFIG_HYPERV_TIMER=y -CONFIG_HYPERV_UTILS=m -CONFIG_HYPERV_BALLOON=m -# end of Microsoft Hyper-V guest support - -# -# Xen driver support -# -CONFIG_XEN_BALLOON=y -CONFIG_XEN_BALLOON_MEMORY_HOTPLUG=y -CONFIG_XEN_BALLOON_MEMORY_HOTPLUG_LIMIT=512 -CONFIG_XEN_SCRUB_PAGES_DEFAULT=y -CONFIG_XEN_DEV_EVTCHN=m -CONFIG_XEN_BACKEND=y -CONFIG_XENFS=m -CONFIG_XEN_COMPAT_XENFS=y -CONFIG_XEN_SYS_HYPERVISOR=y -CONFIG_XEN_XENBUS_FRONTEND=y -CONFIG_XEN_GNTDEV=m -CONFIG_XEN_GNTDEV_DMABUF=y -CONFIG_XEN_GRANT_DEV_ALLOC=m -CONFIG_XEN_GRANT_DMA_ALLOC=y -CONFIG_SWIOTLB_XEN=y -CONFIG_XEN_PCIDEV_BACKEND=m -CONFIG_XEN_PVCALLS_FRONTEND=m -CONFIG_XEN_PVCALLS_BACKEND=y -CONFIG_XEN_SCSI_BACKEND=m -CONFIG_XEN_PRIVCMD=m -CONFIG_XEN_ACPI_PROCESSOR=m -CONFIG_XEN_MCE_LOG=y -CONFIG_XEN_HAVE_PVMMU=y -CONFIG_XEN_EFI=y -CONFIG_XEN_AUTO_XLATE=y -CONFIG_XEN_ACPI=y -CONFIG_XEN_SYMS=y -CONFIG_XEN_HAVE_VPMU=y -CONFIG_XEN_FRONT_PGDIR_SHBUF=m -CONFIG_XEN_UNPOPULATED_ALLOC=y -# end of Xen driver support - -# CONFIG_GREYBUS is not set -CONFIG_STAGING=y -CONFIG_PRISM2_USB=m -CONFIG_COMEDI=m -# CONFIG_COMEDI_DEBUG is not set -CONFIG_COMEDI_DEFAULT_BUF_SIZE_KB=2048 -CONFIG_COMEDI_DEFAULT_BUF_MAXSIZE_KB=20480 -CONFIG_COMEDI_MISC_DRIVERS=y -CONFIG_COMEDI_BOND=m -CONFIG_COMEDI_TEST=m -CONFIG_COMEDI_PARPORT=m -# 
CONFIG_COMEDI_ISA_DRIVERS is not set -CONFIG_COMEDI_PCI_DRIVERS=m -CONFIG_COMEDI_8255_PCI=m -CONFIG_COMEDI_ADDI_WATCHDOG=m -CONFIG_COMEDI_ADDI_APCI_1032=m -CONFIG_COMEDI_ADDI_APCI_1500=m -CONFIG_COMEDI_ADDI_APCI_1516=m -CONFIG_COMEDI_ADDI_APCI_1564=m -CONFIG_COMEDI_ADDI_APCI_16XX=m -CONFIG_COMEDI_ADDI_APCI_2032=m -CONFIG_COMEDI_ADDI_APCI_2200=m -CONFIG_COMEDI_ADDI_APCI_3120=m -CONFIG_COMEDI_ADDI_APCI_3501=m -CONFIG_COMEDI_ADDI_APCI_3XXX=m -CONFIG_COMEDI_ADL_PCI6208=m -CONFIG_COMEDI_ADL_PCI7X3X=m -CONFIG_COMEDI_ADL_PCI8164=m -CONFIG_COMEDI_ADL_PCI9111=m -CONFIG_COMEDI_ADL_PCI9118=m -CONFIG_COMEDI_ADV_PCI1710=m -CONFIG_COMEDI_ADV_PCI1720=m -CONFIG_COMEDI_ADV_PCI1723=m -CONFIG_COMEDI_ADV_PCI1724=m -CONFIG_COMEDI_ADV_PCI1760=m -CONFIG_COMEDI_ADV_PCI_DIO=m -CONFIG_COMEDI_AMPLC_DIO200_PCI=m -CONFIG_COMEDI_AMPLC_PC236_PCI=m -CONFIG_COMEDI_AMPLC_PC263_PCI=m -CONFIG_COMEDI_AMPLC_PCI224=m -CONFIG_COMEDI_AMPLC_PCI230=m -CONFIG_COMEDI_CONTEC_PCI_DIO=m -CONFIG_COMEDI_DAS08_PCI=m -CONFIG_COMEDI_DT3000=m -CONFIG_COMEDI_DYNA_PCI10XX=m -CONFIG_COMEDI_GSC_HPDI=m -CONFIG_COMEDI_MF6X4=m -CONFIG_COMEDI_ICP_MULTI=m -CONFIG_COMEDI_DAQBOARD2000=m -CONFIG_COMEDI_JR3_PCI=m -CONFIG_COMEDI_KE_COUNTER=m -CONFIG_COMEDI_CB_PCIDAS64=m -CONFIG_COMEDI_CB_PCIDAS=m -CONFIG_COMEDI_CB_PCIDDA=m -CONFIG_COMEDI_CB_PCIMDAS=m -CONFIG_COMEDI_CB_PCIMDDA=m -CONFIG_COMEDI_ME4000=m -CONFIG_COMEDI_ME_DAQ=m -CONFIG_COMEDI_NI_6527=m -CONFIG_COMEDI_NI_65XX=m -CONFIG_COMEDI_NI_660X=m -CONFIG_COMEDI_NI_670X=m -CONFIG_COMEDI_NI_LABPC_PCI=m -CONFIG_COMEDI_NI_PCIDIO=m -CONFIG_COMEDI_NI_PCIMIO=m -CONFIG_COMEDI_RTD520=m -CONFIG_COMEDI_S626=m -CONFIG_COMEDI_MITE=m -CONFIG_COMEDI_NI_TIOCMD=m -CONFIG_COMEDI_PCMCIA_DRIVERS=m -CONFIG_COMEDI_CB_DAS16_CS=m -CONFIG_COMEDI_DAS08_CS=m -CONFIG_COMEDI_NI_DAQ_700_CS=m -CONFIG_COMEDI_NI_DAQ_DIO24_CS=m -CONFIG_COMEDI_NI_LABPC_CS=m -CONFIG_COMEDI_NI_MIO_CS=m -CONFIG_COMEDI_QUATECH_DAQP_CS=m -CONFIG_COMEDI_USB_DRIVERS=m -CONFIG_COMEDI_DT9812=m -CONFIG_COMEDI_NI_USB6501=m 
-CONFIG_COMEDI_USBDUX=m -CONFIG_COMEDI_USBDUXFAST=m -CONFIG_COMEDI_USBDUXSIGMA=m -CONFIG_COMEDI_VMK80XX=m -CONFIG_COMEDI_8254=m -CONFIG_COMEDI_8255=m -CONFIG_COMEDI_8255_SA=m -CONFIG_COMEDI_KCOMEDILIB=m -CONFIG_COMEDI_AMPLC_DIO200=m -CONFIG_COMEDI_AMPLC_PC236=m -CONFIG_COMEDI_DAS08=m -CONFIG_COMEDI_NI_LABPC=m -CONFIG_COMEDI_NI_TIO=m -CONFIG_COMEDI_NI_ROUTING=m -CONFIG_RTL8192U=m -CONFIG_RTLLIB=m -CONFIG_RTLLIB_CRYPTO_CCMP=m -CONFIG_RTLLIB_CRYPTO_TKIP=m -CONFIG_RTLLIB_CRYPTO_WEP=m -CONFIG_RTL8192E=m -CONFIG_RTL8723BS=m -CONFIG_R8712U=m -CONFIG_R8188EU=m -CONFIG_88EU_AP_MODE=y -CONFIG_RTS5208=m -CONFIG_VT6655=m -CONFIG_VT6656=m - -# -# IIO staging drivers -# - -# -# Accelerometers -# -CONFIG_ADIS16203=m -CONFIG_ADIS16240=m -# end of Accelerometers - -# -# Analog to digital converters -# -CONFIG_AD7816=m -CONFIG_AD7280=m -# end of Analog to digital converters - -# -# Analog digital bi-direction converters -# -CONFIG_ADT7316=m -CONFIG_ADT7316_SPI=m -CONFIG_ADT7316_I2C=m -# end of Analog digital bi-direction converters - -# -# Capacitance to digital converters -# -CONFIG_AD7150=m -CONFIG_AD7746=m -# end of Capacitance to digital converters - -# -# Direct Digital Synthesis -# -CONFIG_AD9832=m -CONFIG_AD9834=m -# end of Direct Digital Synthesis - -# -# Network Analyzer, Impedance Converters -# -CONFIG_AD5933=m -# end of Network Analyzer, Impedance Converters - -# -# Active energy metering IC -# -CONFIG_ADE7854=m -CONFIG_ADE7854_I2C=m -CONFIG_ADE7854_SPI=m -# end of Active energy metering IC - -# -# Resolver to digital converters -# -CONFIG_AD2S1210=m -# end of Resolver to digital converters -# end of IIO staging drivers - -# CONFIG_FB_SM750 is not set -CONFIG_STAGING_MEDIA=y -CONFIG_INTEL_ATOMISP=y -CONFIG_VIDEO_ATOMISP=m -CONFIG_VIDEO_ATOMISP_ISP2401=y -CONFIG_VIDEO_ATOMISP_OV5693=m -CONFIG_VIDEO_ATOMISP_OV2722=m -CONFIG_VIDEO_ATOMISP_GC2235=m -CONFIG_VIDEO_ATOMISP_MSRLIST_HELPER=m -CONFIG_VIDEO_ATOMISP_MT9M114=m -CONFIG_VIDEO_ATOMISP_GC0310=m 
-CONFIG_VIDEO_ATOMISP_OV2680=m -CONFIG_VIDEO_ATOMISP_LM3554=m -CONFIG_VIDEO_IPU3_IMGU=m -CONFIG_VIDEO_USBVISION=m - -# -# Android -# -# end of Android - -CONFIG_STAGING_BOARD=y -CONFIG_LTE_GDM724X=m -CONFIG_FIREWIRE_SERIAL=m -CONFIG_FWTTY_MAX_TOTAL_PORTS=64 -CONFIG_FWTTY_MAX_CARD_PORTS=32 -CONFIG_GS_FPGABOOT=m -CONFIG_UNISYSSPAR=y -CONFIG_UNISYS_VISORNIC=m -CONFIG_UNISYS_VISORINPUT=m -CONFIG_UNISYS_VISORHBA=m -# CONFIG_FB_TFT is not set -CONFIG_MOST_COMPONENTS=m -CONFIG_MOST_CDEV=m -CONFIG_MOST_NET=m -CONFIG_MOST_SOUND=m -CONFIG_MOST_VIDEO=m -CONFIG_MOST_DIM2=m -CONFIG_MOST_I2C=m -CONFIG_KS7010=m -CONFIG_PI433=m - -# -# Gasket devices -# -CONFIG_STAGING_GASKET_FRAMEWORK=m -CONFIG_STAGING_APEX_DRIVER=m -# end of Gasket devices - -CONFIG_XIL_AXIS_FIFO=m -CONFIG_FIELDBUS_DEV=m -CONFIG_HMS_ANYBUSS_BUS=m -CONFIG_ARCX_ANYBUS_CONTROLLER=m -CONFIG_HMS_PROFINET=m -CONFIG_KPC2000=y -CONFIG_KPC2000_CORE=m -CONFIG_KPC2000_SPI=m -CONFIG_KPC2000_I2C=m -CONFIG_KPC2000_DMA=m -CONFIG_QLGE=m -CONFIG_WFX=m -CONFIG_X86_PLATFORM_DEVICES=y -CONFIG_ACPI_WMI=m -CONFIG_WMI_BMOF=m -CONFIG_ALIENWARE_WMI=m -CONFIG_HUAWEI_WMI=m -CONFIG_INTEL_WMI_SBL_FW_UPDATE=m -CONFIG_INTEL_WMI_THUNDERBOLT=m -CONFIG_MXM_WMI=m -CONFIG_PEAQ_WMI=m -CONFIG_XIAOMI_WMI=m -CONFIG_ACERHDF=m -CONFIG_ACER_WIRELESS=m -CONFIG_ACER_WMI=m -CONFIG_APPLE_GMUX=m -CONFIG_ASUS_LAPTOP=m -CONFIG_ASUS_WIRELESS=m -CONFIG_ASUS_WMI=m -CONFIG_ASUS_NB_WMI=m -CONFIG_EEEPC_LAPTOP=m -CONFIG_EEEPC_WMI=m -CONFIG_DCDBAS=m -CONFIG_DELL_SMBIOS=m -CONFIG_DELL_SMBIOS_WMI=y -CONFIG_DELL_SMBIOS_SMM=y -CONFIG_DELL_LAPTOP=m -CONFIG_DELL_RBTN=m -# CONFIG_DELL_RBU is not set -CONFIG_DELL_SMO8800=m -CONFIG_DELL_WMI=m -CONFIG_DELL_WMI_DESCRIPTOR=m -CONFIG_DELL_WMI_AIO=m -CONFIG_DELL_WMI_LED=m -CONFIG_AMILO_RFKILL=m -CONFIG_FUJITSU_LAPTOP=m -CONFIG_FUJITSU_TABLET=m -CONFIG_GPD_POCKET_FAN=m -CONFIG_HP_ACCEL=m -CONFIG_HP_WIRELESS=m -CONFIG_HP_WMI=m -CONFIG_IBM_RTL=m -CONFIG_IDEAPAD_LAPTOP=m -CONFIG_SENSORS_HDAPS=m -CONFIG_THINKPAD_ACPI=m 
-CONFIG_THINKPAD_ACPI_ALSA_SUPPORT=y -# CONFIG_THINKPAD_ACPI_DEBUGFACILITIES is not set -# CONFIG_THINKPAD_ACPI_DEBUG is not set -# CONFIG_THINKPAD_ACPI_UNSAFE_LEDS is not set -CONFIG_THINKPAD_ACPI_VIDEO=y -CONFIG_THINKPAD_ACPI_HOTKEY_POLL=y -CONFIG_INTEL_ATOMISP2_LED=m -CONFIG_INTEL_CHT_INT33FE=m -CONFIG_INTEL_HID_EVENT=m -CONFIG_INTEL_INT0002_VGPIO=m -CONFIG_INTEL_MENLOW=m -CONFIG_INTEL_OAKTRAIL=m -CONFIG_INTEL_VBTN=m -CONFIG_SURFACE3_WMI=m -CONFIG_SURFACE_3_BUTTON=m -CONFIG_SURFACE_3_POWER_OPREGION=m -CONFIG_SURFACE_PRO3_BUTTON=m -CONFIG_MSI_LAPTOP=m -CONFIG_MSI_WMI=m -CONFIG_PCENGINES_APU2=m -CONFIG_SAMSUNG_LAPTOP=m -CONFIG_SAMSUNG_Q10=m -CONFIG_ACPI_TOSHIBA=m -CONFIG_TOSHIBA_BT_RFKILL=m -CONFIG_TOSHIBA_HAPS=m -CONFIG_TOSHIBA_WMI=m -CONFIG_ACPI_CMPC=m -CONFIG_COMPAL_LAPTOP=m -CONFIG_LG_LAPTOP=m -CONFIG_PANASONIC_LAPTOP=m -CONFIG_SONY_LAPTOP=m -CONFIG_SONYPI_COMPAT=y -CONFIG_SYSTEM76_ACPI=m -CONFIG_TOPSTAR_LAPTOP=m -CONFIG_I2C_MULTI_INSTANTIATE=m -CONFIG_MLX_PLATFORM=m -CONFIG_TOUCHSCREEN_DMI=y -CONFIG_INTEL_IPS=m -CONFIG_INTEL_RST=m -CONFIG_INTEL_SMARTCONNECT=m - -# -# Intel Speed Select Technology interface support -# -CONFIG_INTEL_SPEED_SELECT_INTERFACE=m -# end of Intel Speed Select Technology interface support - -CONFIG_INTEL_TURBO_MAX_3=y -CONFIG_INTEL_UNCORE_FREQ_CONTROL=m -CONFIG_INTEL_BXTWC_PMIC_TMU=m -CONFIG_INTEL_CHTDC_TI_PWRBTN=m -CONFIG_INTEL_MFLD_THERMAL=m -CONFIG_INTEL_MID_POWER_BUTTON=m -CONFIG_INTEL_MRFLD_PWRBTN=m -CONFIG_INTEL_PMC_CORE=y -CONFIG_INTEL_PUNIT_IPC=m -CONFIG_INTEL_SCU_IPC=y -CONFIG_INTEL_SCU=y -CONFIG_INTEL_SCU_PCI=y -CONFIG_INTEL_SCU_PLATFORM=m -CONFIG_INTEL_SCU_IPC_UTIL=m -CONFIG_INTEL_TELEMETRY=m -CONFIG_PMC_ATOM=y -CONFIG_MFD_CROS_EC=m -CONFIG_CHROME_PLATFORMS=y -CONFIG_CHROMEOS_LAPTOP=m -CONFIG_CHROMEOS_PSTORE=m -CONFIG_CHROMEOS_TBMC=m -CONFIG_CROS_EC=m -CONFIG_CROS_EC_I2C=m -CONFIG_CROS_EC_RPMSG=m -CONFIG_CROS_EC_ISHTP=m -CONFIG_CROS_EC_SPI=m -CONFIG_CROS_EC_LPC=m -CONFIG_CROS_EC_PROTO=y -CONFIG_CROS_KBD_LED_BACKLIGHT=m 
-CONFIG_CROS_EC_CHARDEV=m -CONFIG_CROS_EC_LIGHTBAR=m -CONFIG_CROS_EC_VBC=m -CONFIG_CROS_EC_DEBUGFS=m -CONFIG_CROS_EC_SENSORHUB=m -CONFIG_CROS_EC_SYSFS=m -CONFIG_CROS_EC_TYPEC=m -CONFIG_CROS_USBPD_LOGGER=m -CONFIG_CROS_USBPD_NOTIFY=m -CONFIG_WILCO_EC=m -CONFIG_WILCO_EC_DEBUGFS=m -CONFIG_WILCO_EC_EVENTS=m -CONFIG_WILCO_EC_TELEMETRY=m -CONFIG_MELLANOX_PLATFORM=y -CONFIG_MLXREG_HOTPLUG=m -CONFIG_MLXREG_IO=m -CONFIG_HAVE_CLK=y -CONFIG_CLKDEV_LOOKUP=y -CONFIG_HAVE_CLK_PREPARE=y -CONFIG_COMMON_CLK=y -CONFIG_COMMON_CLK_WM831X=m -CONFIG_CLK_HSDK=y -CONFIG_COMMON_CLK_MAX77686=m -CONFIG_COMMON_CLK_MAX9485=m -CONFIG_COMMON_CLK_RK808=m -CONFIG_COMMON_CLK_SI5341=m -CONFIG_COMMON_CLK_SI5351=m -CONFIG_COMMON_CLK_SI514=m -CONFIG_COMMON_CLK_SI544=m -CONFIG_COMMON_CLK_SI570=m -CONFIG_COMMON_CLK_CDCE706=m -CONFIG_COMMON_CLK_CDCE925=m -CONFIG_COMMON_CLK_CS2000_CP=m -CONFIG_COMMON_CLK_S2MPS11=m -CONFIG_CLK_TWL6040=m -CONFIG_COMMON_CLK_LOCHNAGAR=m -CONFIG_COMMON_CLK_PALMAS=m -CONFIG_COMMON_CLK_PWM=m -CONFIG_COMMON_CLK_VC5=m -CONFIG_COMMON_CLK_BD718XX=m -CONFIG_COMMON_CLK_FIXED_MMIO=y -CONFIG_CLK_LGM_CGU=y -CONFIG_HWSPINLOCK=y - -# -# Clock Source drivers -# -CONFIG_TIMER_OF=y -CONFIG_TIMER_PROBE=y -CONFIG_CLKEVT_I8253=y -CONFIG_I8253_LOCK=y -CONFIG_CLKBLD_I8253=y -CONFIG_CLKSRC_MMIO=y -CONFIG_MICROCHIP_PIT64B=y -# end of Clock Source drivers - -CONFIG_MAILBOX=y -CONFIG_PLATFORM_MHU=m -CONFIG_PCC=y -CONFIG_ALTERA_MBOX=m -CONFIG_MAILBOX_TEST=m -CONFIG_IOMMU_IOVA=y -CONFIG_IOASID=y -CONFIG_IOMMU_API=y -CONFIG_IOMMU_SUPPORT=y - -# -# Generic IOMMU Pagetable Support -# -# end of Generic IOMMU Pagetable Support - -# CONFIG_IOMMU_DEBUGFS is not set -# CONFIG_IOMMU_DEFAULT_PASSTHROUGH is not set -CONFIG_OF_IOMMU=y -CONFIG_IOMMU_DMA=y -CONFIG_AMD_IOMMU=y -CONFIG_AMD_IOMMU_V2=y -CONFIG_DMAR_TABLE=y -CONFIG_INTEL_IOMMU=y -CONFIG_INTEL_IOMMU_SVM=y -# CONFIG_INTEL_IOMMU_DEFAULT_ON is not set -CONFIG_INTEL_IOMMU_FLOPPY_WA=y -# CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON is not set -CONFIG_IRQ_REMAP=y 
-CONFIG_HYPERV_IOMMU=y - -# -# Remoteproc drivers -# -CONFIG_REMOTEPROC=y -# CONFIG_REMOTEPROC_CDEV is not set -# end of Remoteproc drivers - -# -# Rpmsg drivers -# -CONFIG_RPMSG=m -CONFIG_RPMSG_CHAR=m -CONFIG_RPMSG_QCOM_GLINK=m -CONFIG_RPMSG_QCOM_GLINK_RPM=m -CONFIG_RPMSG_VIRTIO=m -# end of Rpmsg drivers - -CONFIG_SOUNDWIRE=m - -# -# SoundWire Devices -# -CONFIG_SOUNDWIRE_CADENCE=m -CONFIG_SOUNDWIRE_INTEL=m -CONFIG_SOUNDWIRE_QCOM=m - -# -# SOC (System On Chip) specific Drivers -# - -# -# Amlogic SoC drivers -# -# end of Amlogic SoC drivers - -# -# Aspeed SoC drivers -# -# end of Aspeed SoC drivers - -# -# Broadcom SoC drivers -# -# end of Broadcom SoC drivers - -# -# NXP/Freescale QorIQ SoC drivers -# -# end of NXP/Freescale QorIQ SoC drivers - -# -# i.MX SoC drivers -# -# end of i.MX SoC drivers - -# -# Qualcomm SoC drivers -# -# end of Qualcomm SoC drivers - -CONFIG_SOC_TI=y - -# -# Xilinx SoC drivers -# -CONFIG_XILINX_VCU=m -# end of Xilinx SoC drivers -# end of SOC (System On Chip) specific Drivers - -CONFIG_PM_DEVFREQ=y - -# -# DEVFREQ Governors -# -CONFIG_DEVFREQ_GOV_SIMPLE_ONDEMAND=m -CONFIG_DEVFREQ_GOV_PERFORMANCE=m -CONFIG_DEVFREQ_GOV_POWERSAVE=m -CONFIG_DEVFREQ_GOV_USERSPACE=m -CONFIG_DEVFREQ_GOV_PASSIVE=m - -# -# DEVFREQ Drivers -# -CONFIG_PM_DEVFREQ_EVENT=y -CONFIG_EXTCON=y - -# -# Extcon Device Drivers -# -CONFIG_EXTCON_ADC_JACK=m -CONFIG_EXTCON_ARIZONA=m -CONFIG_EXTCON_AXP288=m -CONFIG_EXTCON_FSA9480=m -CONFIG_EXTCON_GPIO=m -CONFIG_EXTCON_INTEL_INT3496=m -CONFIG_EXTCON_INTEL_CHT_WC=m -CONFIG_EXTCON_INTEL_MRFLD=m -CONFIG_EXTCON_MAX14577=m -CONFIG_EXTCON_MAX3355=m -CONFIG_EXTCON_MAX77693=m -CONFIG_EXTCON_MAX77843=m -CONFIG_EXTCON_MAX8997=m -CONFIG_EXTCON_PALMAS=m -CONFIG_EXTCON_PTN5150=m -CONFIG_EXTCON_RT8973A=m -CONFIG_EXTCON_SM5502=m -CONFIG_EXTCON_USB_GPIO=m -CONFIG_EXTCON_USBC_CROS_EC=m -CONFIG_MEMORY=y -CONFIG_IIO=m -CONFIG_IIO_BUFFER=y -CONFIG_IIO_BUFFER_CB=m -CONFIG_IIO_BUFFER_DMA=m -CONFIG_IIO_BUFFER_DMAENGINE=m -CONFIG_IIO_BUFFER_HW_CONSUMER=m 
-CONFIG_IIO_KFIFO_BUF=m -CONFIG_IIO_TRIGGERED_BUFFER=m -CONFIG_IIO_CONFIGFS=m -CONFIG_IIO_TRIGGER=y -CONFIG_IIO_CONSUMERS_PER_TRIGGER=2 -CONFIG_IIO_SW_DEVICE=m -CONFIG_IIO_SW_TRIGGER=m -CONFIG_IIO_TRIGGERED_EVENT=m - -# -# Accelerometers -# -CONFIG_ADIS16201=m -CONFIG_ADIS16209=m -CONFIG_ADXL372=m -CONFIG_ADXL372_SPI=m -CONFIG_ADXL372_I2C=m -CONFIG_BMA220=m -CONFIG_BMA400=m -CONFIG_BMA400_I2C=m -CONFIG_BMA400_SPI=m -CONFIG_BMC150_ACCEL=m -CONFIG_BMC150_ACCEL_I2C=m -CONFIG_BMC150_ACCEL_SPI=m -CONFIG_DA280=m -CONFIG_DA311=m -CONFIG_DMARD06=m -CONFIG_DMARD09=m -CONFIG_DMARD10=m -CONFIG_HID_SENSOR_ACCEL_3D=m -CONFIG_IIO_CROS_EC_ACCEL_LEGACY=m -CONFIG_IIO_ST_ACCEL_3AXIS=m -CONFIG_IIO_ST_ACCEL_I2C_3AXIS=m -CONFIG_IIO_ST_ACCEL_SPI_3AXIS=m -CONFIG_KXSD9=m -CONFIG_KXSD9_SPI=m -CONFIG_KXSD9_I2C=m -CONFIG_KXCJK1013=m -CONFIG_MC3230=m -CONFIG_MMA7455=m -CONFIG_MMA7455_I2C=m -CONFIG_MMA7455_SPI=m -CONFIG_MMA7660=m -CONFIG_MMA8452=m -CONFIG_MMA9551_CORE=m -CONFIG_MMA9551=m -CONFIG_MMA9553=m -CONFIG_MXC4005=m -CONFIG_MXC6255=m -CONFIG_SCA3000=m -CONFIG_STK8312=m -CONFIG_STK8BA50=m -# end of Accelerometers - -# -# Analog to digital converters -# -CONFIG_AD_SIGMA_DELTA=m -CONFIG_AD7091R5=m -CONFIG_AD7124=m -CONFIG_AD7192=m -CONFIG_AD7266=m -CONFIG_AD7291=m -CONFIG_AD7292=m -CONFIG_AD7298=m -CONFIG_AD7476=m -CONFIG_AD7606=m -CONFIG_AD7606_IFACE_PARALLEL=m -CONFIG_AD7606_IFACE_SPI=m -CONFIG_AD7766=m -CONFIG_AD7768_1=m -CONFIG_AD7780=m -CONFIG_AD7791=m -CONFIG_AD7793=m -CONFIG_AD7887=m -CONFIG_AD7923=m -CONFIG_AD7949=m -CONFIG_AD799X=m -CONFIG_AD9467=m -CONFIG_ADI_AXI_ADC=m -CONFIG_AXP20X_ADC=m -CONFIG_AXP288_ADC=m -CONFIG_CC10001_ADC=m -CONFIG_CPCAP_ADC=m -CONFIG_DA9150_GPADC=m -CONFIG_DLN2_ADC=m -CONFIG_ENVELOPE_DETECTOR=m -CONFIG_HI8435=m -CONFIG_HX711=m -CONFIG_INA2XX_ADC=m -CONFIG_INTEL_MRFLD_ADC=m -CONFIG_LP8788_ADC=m -CONFIG_LTC2471=m -CONFIG_LTC2485=m -CONFIG_LTC2496=m -CONFIG_LTC2497=m -CONFIG_MAX1027=m -CONFIG_MAX11100=m -CONFIG_MAX1118=m -CONFIG_MAX1241=m -CONFIG_MAX1363=m 
-CONFIG_MAX9611=m -CONFIG_MCP320X=m -CONFIG_MCP3422=m -CONFIG_MCP3911=m -CONFIG_MEN_Z188_ADC=m -CONFIG_MP2629_ADC=m -CONFIG_NAU7802=m -CONFIG_PALMAS_GPADC=m -CONFIG_QCOM_VADC_COMMON=m -CONFIG_QCOM_SPMI_IADC=m -CONFIG_QCOM_SPMI_VADC=m -CONFIG_QCOM_SPMI_ADC5=m -CONFIG_RN5T618_ADC=m -CONFIG_SD_ADC_MODULATOR=m -CONFIG_STMPE_ADC=m -CONFIG_TI_ADC081C=m -CONFIG_TI_ADC0832=m -CONFIG_TI_ADC084S021=m -CONFIG_TI_ADC12138=m -CONFIG_TI_ADC108S102=m -CONFIG_TI_ADC128S052=m -CONFIG_TI_ADC161S626=m -CONFIG_TI_ADS1015=m -CONFIG_TI_ADS7950=m -CONFIG_TI_ADS8344=m -CONFIG_TI_ADS8688=m -CONFIG_TI_ADS124S08=m -CONFIG_TI_AM335X_ADC=m -CONFIG_TI_TLC4541=m -CONFIG_TWL4030_MADC=m -CONFIG_TWL6030_GPADC=m -CONFIG_VF610_ADC=m -CONFIG_VIPERBOARD_ADC=m -CONFIG_XILINX_XADC=m -# end of Analog to digital converters - -# -# Analog Front Ends -# -CONFIG_IIO_RESCALE=m -# end of Analog Front Ends - -# -# Amplifiers -# -CONFIG_AD8366=m -CONFIG_HMC425=m -# end of Amplifiers - -# -# Chemical Sensors -# -CONFIG_ATLAS_PH_SENSOR=m -CONFIG_ATLAS_EZO_SENSOR=m -CONFIG_BME680=m -CONFIG_BME680_I2C=m -CONFIG_BME680_SPI=m -CONFIG_CCS811=m -CONFIG_IAQCORE=m -CONFIG_PMS7003=m -# CONFIG_SCD30_CORE is not set -CONFIG_SENSIRION_SGP30=m -CONFIG_SPS30=m -CONFIG_VZ89X=m -# end of Chemical Sensors - -CONFIG_IIO_CROS_EC_SENSORS_CORE=m -CONFIG_IIO_CROS_EC_SENSORS=m -CONFIG_IIO_CROS_EC_SENSORS_LID_ANGLE=m - -# -# Hid Sensor IIO Common -# -CONFIG_HID_SENSOR_IIO_COMMON=m -CONFIG_HID_SENSOR_IIO_TRIGGER=m -# end of Hid Sensor IIO Common - -CONFIG_IIO_MS_SENSORS_I2C=m - -# -# SSP Sensor Common -# -CONFIG_IIO_SSP_SENSORS_COMMONS=m -CONFIG_IIO_SSP_SENSORHUB=m -# end of SSP Sensor Common - -CONFIG_IIO_ST_SENSORS_I2C=m -CONFIG_IIO_ST_SENSORS_SPI=m -CONFIG_IIO_ST_SENSORS_CORE=m - -# -# Digital to analog converters -# -CONFIG_AD5064=m -CONFIG_AD5360=m -CONFIG_AD5380=m -CONFIG_AD5421=m -CONFIG_AD5446=m -CONFIG_AD5449=m -CONFIG_AD5592R_BASE=m -CONFIG_AD5592R=m -CONFIG_AD5593R=m -CONFIG_AD5504=m -CONFIG_AD5624R_SPI=m -CONFIG_AD5686=m 
-CONFIG_AD5686_SPI=m -CONFIG_AD5696_I2C=m -CONFIG_AD5755=m -CONFIG_AD5758=m -CONFIG_AD5761=m -CONFIG_AD5764=m -CONFIG_AD5770R=m -CONFIG_AD5791=m -CONFIG_AD7303=m -CONFIG_AD8801=m -CONFIG_DPOT_DAC=m -CONFIG_DS4424=m -CONFIG_LTC1660=m -CONFIG_LTC2632=m -CONFIG_M62332=m -CONFIG_MAX517=m -CONFIG_MAX5821=m -CONFIG_MCP4725=m -CONFIG_MCP4922=m -CONFIG_TI_DAC082S085=m -CONFIG_TI_DAC5571=m -CONFIG_TI_DAC7311=m -CONFIG_TI_DAC7612=m -CONFIG_VF610_DAC=m -# end of Digital to analog converters - -# -# IIO dummy driver -# -# CONFIG_IIO_SIMPLE_DUMMY is not set -# end of IIO dummy driver - -# -# Frequency Synthesizers DDS/PLL -# - -# -# Clock Generator/Distribution -# -CONFIG_AD9523=m -# end of Clock Generator/Distribution - -# -# Phase-Locked Loop (PLL) frequency synthesizers -# -CONFIG_ADF4350=m -CONFIG_ADF4371=m -# end of Phase-Locked Loop (PLL) frequency synthesizers -# end of Frequency Synthesizers DDS/PLL - -# -# Digital gyroscope sensors -# -CONFIG_ADIS16080=m -CONFIG_ADIS16130=m -CONFIG_ADIS16136=m -CONFIG_ADIS16260=m -CONFIG_ADXRS450=m -CONFIG_BMG160=m -CONFIG_BMG160_I2C=m -CONFIG_BMG160_SPI=m -CONFIG_FXAS21002C=m -CONFIG_FXAS21002C_I2C=m -CONFIG_FXAS21002C_SPI=m -CONFIG_HID_SENSOR_GYRO_3D=m -CONFIG_MPU3050=m -CONFIG_MPU3050_I2C=m -CONFIG_IIO_ST_GYRO_3AXIS=m -CONFIG_IIO_ST_GYRO_I2C_3AXIS=m -CONFIG_IIO_ST_GYRO_SPI_3AXIS=m -CONFIG_ITG3200=m -# end of Digital gyroscope sensors - -# -# Health Sensors -# - -# -# Heart Rate Monitors -# -CONFIG_AFE4403=m -CONFIG_AFE4404=m -CONFIG_MAX30100=m -CONFIG_MAX30102=m -# end of Heart Rate Monitors -# end of Health Sensors - -# -# Humidity sensors -# -CONFIG_AM2315=m -CONFIG_DHT11=m -CONFIG_HDC100X=m -CONFIG_HID_SENSOR_HUMIDITY=m -CONFIG_HTS221=m -CONFIG_HTS221_I2C=m -CONFIG_HTS221_SPI=m -CONFIG_HTU21=m -CONFIG_SI7005=m -CONFIG_SI7020=m -# end of Humidity sensors - -# -# Inertial measurement units -# -CONFIG_ADIS16400=m -CONFIG_ADIS16460=m -CONFIG_ADIS16475=m -CONFIG_ADIS16480=m -CONFIG_BMI160=m -CONFIG_BMI160_I2C=m -CONFIG_BMI160_SPI=m 
-CONFIG_FXOS8700=m -CONFIG_FXOS8700_I2C=m -CONFIG_FXOS8700_SPI=m -CONFIG_KMX61=m -CONFIG_INV_ICM42600=m -CONFIG_INV_ICM42600_I2C=m -CONFIG_INV_ICM42600_SPI=m -CONFIG_INV_MPU6050_IIO=m -CONFIG_INV_MPU6050_I2C=m -CONFIG_INV_MPU6050_SPI=m -CONFIG_IIO_ST_LSM6DSX=m -CONFIG_IIO_ST_LSM6DSX_I2C=m -CONFIG_IIO_ST_LSM6DSX_SPI=m -CONFIG_IIO_ST_LSM6DSX_I3C=m -# end of Inertial measurement units - -CONFIG_IIO_ADIS_LIB=m -CONFIG_IIO_ADIS_LIB_BUFFER=y - -# -# Light sensors -# -CONFIG_ACPI_ALS=m -CONFIG_ADJD_S311=m -CONFIG_ADUX1020=m -CONFIG_AL3010=m -CONFIG_AL3320A=m -CONFIG_APDS9300=m -CONFIG_APDS9960=m -CONFIG_BH1750=m -CONFIG_BH1780=m -CONFIG_CM32181=m -CONFIG_CM3232=m -CONFIG_CM3323=m -CONFIG_CM3605=m -CONFIG_CM36651=m -CONFIG_IIO_CROS_EC_LIGHT_PROX=m -CONFIG_GP2AP002=m -CONFIG_GP2AP020A00F=m -CONFIG_IQS621_ALS=m -CONFIG_SENSORS_ISL29018=m -CONFIG_SENSORS_ISL29028=m -CONFIG_ISL29125=m -CONFIG_HID_SENSOR_ALS=m -CONFIG_HID_SENSOR_PROX=m -CONFIG_JSA1212=m -CONFIG_RPR0521=m -CONFIG_SENSORS_LM3533=m -CONFIG_LTR501=m -CONFIG_LV0104CS=m -CONFIG_MAX44000=m -CONFIG_MAX44009=m -CONFIG_NOA1305=m -CONFIG_OPT3001=m -CONFIG_PA12203001=m -CONFIG_SI1133=m -CONFIG_SI1145=m -CONFIG_STK3310=m -CONFIG_ST_UVIS25=m -CONFIG_ST_UVIS25_I2C=m -CONFIG_ST_UVIS25_SPI=m -CONFIG_TCS3414=m -CONFIG_TCS3472=m -CONFIG_SENSORS_TSL2563=m -CONFIG_TSL2583=m -CONFIG_TSL2772=m -CONFIG_TSL4531=m -CONFIG_US5182D=m -CONFIG_VCNL4000=m -CONFIG_VCNL4035=m -CONFIG_VEML6030=m -CONFIG_VEML6070=m -CONFIG_VL6180=m -CONFIG_ZOPT2201=m -# end of Light sensors - -# -# Magnetometer sensors -# -CONFIG_AK8974=m -CONFIG_AK8975=m -CONFIG_AK09911=m -CONFIG_BMC150_MAGN=m -CONFIG_BMC150_MAGN_I2C=m -CONFIG_BMC150_MAGN_SPI=m -CONFIG_MAG3110=m -CONFIG_HID_SENSOR_MAGNETOMETER_3D=m -CONFIG_MMC35240=m -CONFIG_IIO_ST_MAGN_3AXIS=m -CONFIG_IIO_ST_MAGN_I2C_3AXIS=m -CONFIG_IIO_ST_MAGN_SPI_3AXIS=m -CONFIG_SENSORS_HMC5843=m -CONFIG_SENSORS_HMC5843_I2C=m -CONFIG_SENSORS_HMC5843_SPI=m -CONFIG_SENSORS_RM3100=m -CONFIG_SENSORS_RM3100_I2C=m 
-CONFIG_SENSORS_RM3100_SPI=m -# end of Magnetometer sensors - -# -# Multiplexers -# -CONFIG_IIO_MUX=m -# end of Multiplexers - -# -# Inclinometer sensors -# -CONFIG_HID_SENSOR_INCLINOMETER_3D=m -CONFIG_HID_SENSOR_DEVICE_ROTATION=m -# end of Inclinometer sensors - -# -# Triggers - standalone -# -CONFIG_IIO_HRTIMER_TRIGGER=m -CONFIG_IIO_INTERRUPT_TRIGGER=m -CONFIG_IIO_TIGHTLOOP_TRIGGER=m -CONFIG_IIO_SYSFS_TRIGGER=m -# end of Triggers - standalone - -# -# Linear and angular position sensors -# -CONFIG_IQS624_POS=m -# end of Linear and angular position sensors - -# -# Digital potentiometers -# -CONFIG_AD5272=m -CONFIG_DS1803=m -CONFIG_MAX5432=m -CONFIG_MAX5481=m -CONFIG_MAX5487=m -CONFIG_MCP4018=m -CONFIG_MCP4131=m -CONFIG_MCP4531=m -CONFIG_MCP41010=m -CONFIG_TPL0102=m -# end of Digital potentiometers - -# -# Digital potentiostats -# -CONFIG_LMP91000=m -# end of Digital potentiostats - -# -# Pressure sensors -# -CONFIG_ABP060MG=m -CONFIG_BMP280=m -CONFIG_BMP280_I2C=m -CONFIG_BMP280_SPI=m -CONFIG_IIO_CROS_EC_BARO=m -CONFIG_DLHL60D=m -CONFIG_DPS310=m -CONFIG_HID_SENSOR_PRESS=m -CONFIG_HP03=m -CONFIG_ICP10100=m -CONFIG_MPL115=m -CONFIG_MPL115_I2C=m -CONFIG_MPL115_SPI=m -CONFIG_MPL3115=m -CONFIG_MS5611=m -CONFIG_MS5611_I2C=m -CONFIG_MS5611_SPI=m -CONFIG_MS5637=m -CONFIG_IIO_ST_PRESS=m -CONFIG_IIO_ST_PRESS_I2C=m -CONFIG_IIO_ST_PRESS_SPI=m -CONFIG_T5403=m -CONFIG_HP206C=m -CONFIG_ZPA2326=m -CONFIG_ZPA2326_I2C=m -CONFIG_ZPA2326_SPI=m -# end of Pressure sensors - -# -# Lightning sensors -# -CONFIG_AS3935=m -# end of Lightning sensors - -# -# Proximity and distance sensors -# -CONFIG_ISL29501=m -CONFIG_LIDAR_LITE_V2=m -CONFIG_MB1232=m -CONFIG_PING=m -CONFIG_RFD77402=m -CONFIG_SRF04=m -CONFIG_SX9310=m -CONFIG_SX9500=m -CONFIG_SRF08=m -CONFIG_VCNL3020=m -CONFIG_VL53L0X_I2C=m -# end of Proximity and distance sensors - -# -# Resolver to digital converters -# -CONFIG_AD2S90=m -CONFIG_AD2S1200=m -# end of Resolver to digital converters - -# -# Temperature sensors -# 
-CONFIG_IQS620AT_TEMP=m -CONFIG_LTC2983=m -CONFIG_MAXIM_THERMOCOUPLE=m -CONFIG_HID_SENSOR_TEMP=m -CONFIG_MLX90614=m -CONFIG_MLX90632=m -CONFIG_TMP006=m -CONFIG_TMP007=m -CONFIG_TSYS01=m -CONFIG_TSYS02D=m -CONFIG_MAX31856=m -# end of Temperature sensors - -CONFIG_NTB=m -CONFIG_NTB_MSI=y -CONFIG_NTB_AMD=m -CONFIG_NTB_IDT=m -CONFIG_NTB_INTEL=m -CONFIG_NTB_SWITCHTEC=m -# CONFIG_NTB_PINGPONG is not set -# CONFIG_NTB_TOOL is not set -# CONFIG_NTB_PERF is not set -# CONFIG_NTB_MSI_TEST is not set -CONFIG_NTB_TRANSPORT=m -CONFIG_VME_BUS=y - -# -# VME Bridge Drivers -# -CONFIG_VME_CA91CX42=m -CONFIG_VME_TSI148=m -# CONFIG_VME_FAKE is not set - -# -# VME Board Drivers -# -CONFIG_VMIVME_7805=m - -# -# VME Device Drivers -# -CONFIG_VME_USER=m -CONFIG_PWM=y -CONFIG_PWM_SYSFS=y -# CONFIG_PWM_DEBUG is not set -CONFIG_PWM_ATMEL_HLCDC_PWM=m -CONFIG_PWM_CRC=y -CONFIG_PWM_CROS_EC=m -CONFIG_PWM_FSL_FTM=m -CONFIG_PWM_IQS620A=m -CONFIG_PWM_LP3943=m -CONFIG_PWM_LPSS=m -CONFIG_PWM_LPSS_PCI=m -CONFIG_PWM_LPSS_PLATFORM=m -CONFIG_PWM_PCA9685=m -CONFIG_PWM_STMPE=y -CONFIG_PWM_TWL=m -CONFIG_PWM_TWL_LED=m - -# -# IRQ chip support -# -CONFIG_IRQCHIP=y -CONFIG_AL_FIC=y -CONFIG_MADERA_IRQ=m -# end of IRQ chip support - -CONFIG_IPACK_BUS=m -CONFIG_BOARD_TPCI200=m -CONFIG_SERIAL_IPOCTAL=m -CONFIG_RESET_CONTROLLER=y -CONFIG_RESET_BRCMSTB_RESCAL=y -CONFIG_RESET_INTEL_GW=y -CONFIG_RESET_TI_SYSCON=m - -# -# PHY Subsystem -# -CONFIG_GENERIC_PHY=y -CONFIG_GENERIC_PHY_MIPI_DPHY=y -CONFIG_BCM_KONA_USB2_PHY=m -CONFIG_PHY_CADENCE_TORRENT=m -CONFIG_PHY_CADENCE_DPHY=m -CONFIG_PHY_CADENCE_SIERRA=m -CONFIG_PHY_CADENCE_SALVO=m -CONFIG_PHY_FSL_IMX8MQ_USB=m -CONFIG_PHY_MIXEL_MIPI_DPHY=m -CONFIG_PHY_PXA_28NM_HSIC=m -CONFIG_PHY_PXA_28NM_USB2=m -CONFIG_PHY_CPCAP_USB=m -CONFIG_PHY_MAPPHONE_MDM6600=m -CONFIG_PHY_OCELOT_SERDES=m -CONFIG_PHY_QCOM_USB_HS=m -CONFIG_PHY_QCOM_USB_HSIC=m -CONFIG_PHY_SAMSUNG_USB2=m -CONFIG_PHY_TUSB1210=m -CONFIG_PHY_INTEL_COMBO=y -CONFIG_PHY_INTEL_EMMC=m -# end of PHY Subsystem - 
-CONFIG_POWERCAP=y -CONFIG_INTEL_RAPL_CORE=m -CONFIG_INTEL_RAPL=m -CONFIG_IDLE_INJECT=y -CONFIG_MCB=m -CONFIG_MCB_PCI=m -CONFIG_MCB_LPC=m - -# -# Performance monitor support -# -# end of Performance monitor support - -CONFIG_RAS=y -CONFIG_RAS_CEC=y -# CONFIG_RAS_CEC_DEBUG is not set -CONFIG_USB4=m - -# -# Android -# -# CONFIG_ANDROID is not set -# end of Android - -CONFIG_LIBNVDIMM=y -CONFIG_BLK_DEV_PMEM=m -CONFIG_ND_BLK=m -CONFIG_ND_CLAIM=y -CONFIG_ND_BTT=m -CONFIG_BTT=y -CONFIG_ND_PFN=m -CONFIG_NVDIMM_PFN=y -CONFIG_NVDIMM_DAX=y -CONFIG_OF_PMEM=m -CONFIG_DAX_DRIVER=y -CONFIG_DAX=y -CONFIG_DEV_DAX=m -CONFIG_DEV_DAX_PMEM=m -CONFIG_DEV_DAX_HMEM=m -CONFIG_DEV_DAX_KMEM=m -CONFIG_DEV_DAX_PMEM_COMPAT=m -CONFIG_NVMEM=y -CONFIG_NVMEM_SYSFS=y -CONFIG_NVMEM_SPMI_SDAM=m -CONFIG_RAVE_SP_EEPROM=m - -# -# HW tracing support -# -CONFIG_STM=m -CONFIG_STM_PROTO_BASIC=m -CONFIG_STM_PROTO_SYS_T=m -# CONFIG_STM_DUMMY is not set -CONFIG_STM_SOURCE_CONSOLE=m -CONFIG_STM_SOURCE_HEARTBEAT=m -CONFIG_STM_SOURCE_FTRACE=m -CONFIG_INTEL_TH=m -CONFIG_INTEL_TH_PCI=m -CONFIG_INTEL_TH_ACPI=m -CONFIG_INTEL_TH_GTH=m -CONFIG_INTEL_TH_STH=m -CONFIG_INTEL_TH_MSU=m -CONFIG_INTEL_TH_PTI=m -# CONFIG_INTEL_TH_DEBUG is not set -# end of HW tracing support - -CONFIG_FPGA=m -CONFIG_ALTERA_PR_IP_CORE=m -CONFIG_ALTERA_PR_IP_CORE_PLAT=m -CONFIG_FPGA_MGR_ALTERA_PS_SPI=m -CONFIG_FPGA_MGR_ALTERA_CVP=m -CONFIG_FPGA_MGR_XILINX_SPI=m -CONFIG_FPGA_MGR_ICE40_SPI=m -CONFIG_FPGA_MGR_MACHXO2_SPI=m -CONFIG_FPGA_BRIDGE=m -CONFIG_ALTERA_FREEZE_BRIDGE=m -CONFIG_XILINX_PR_DECOUPLER=m -CONFIG_FPGA_REGION=m -CONFIG_OF_FPGA_REGION=m -CONFIG_FPGA_DFL=m -CONFIG_FPGA_DFL_FME=m -CONFIG_FPGA_DFL_FME_MGR=m -CONFIG_FPGA_DFL_FME_BRIDGE=m -CONFIG_FPGA_DFL_FME_REGION=m -CONFIG_FPGA_DFL_AFU=m -CONFIG_FPGA_DFL_PCI=m -CONFIG_FSI=m -CONFIG_FSI_NEW_DEV_NODE=y -CONFIG_FSI_MASTER_GPIO=m -CONFIG_FSI_MASTER_HUB=m -CONFIG_FSI_MASTER_ASPEED=m -CONFIG_FSI_SCOM=m -CONFIG_FSI_SBEFIFO=m -CONFIG_FSI_OCC=m -CONFIG_TEE=m - -# -# TEE drivers -# 
-CONFIG_AMDTEE=m -# end of TEE drivers - -CONFIG_MULTIPLEXER=m - -# -# Multiplexer drivers -# -CONFIG_MUX_ADG792A=m -CONFIG_MUX_ADGS1408=m -CONFIG_MUX_GPIO=m -CONFIG_MUX_MMIO=m -# end of Multiplexer drivers - -CONFIG_PM_OPP=y -CONFIG_UNISYS_VISORBUS=m -CONFIG_SIOX=m -CONFIG_SIOX_BUS_GPIO=m -CONFIG_SLIMBUS=m -CONFIG_SLIM_QCOM_CTRL=m -CONFIG_INTERCONNECT=y -CONFIG_COUNTER=m -CONFIG_FTM_QUADDEC=m -CONFIG_MICROCHIP_TCB_CAPTURE=m -CONFIG_MOST=m -# CONFIG_MOST_USB_HDM is not set -# end of Device Drivers - -# -# File systems -# -CONFIG_DCACHE_WORD_ACCESS=y -CONFIG_VALIDATE_FS_PARSER=y -CONFIG_FS_IOMAP=y -# CONFIG_EXT2_FS is not set -# CONFIG_EXT3_FS is not set -CONFIG_EXT4_FS=m -CONFIG_EXT4_USE_FOR_EXT2=y -CONFIG_EXT4_FS_POSIX_ACL=y -CONFIG_EXT4_FS_SECURITY=y -# CONFIG_EXT4_DEBUG is not set -CONFIG_JBD2=m -# CONFIG_JBD2_DEBUG is not set -CONFIG_FS_MBCACHE=m -CONFIG_REISERFS_FS=m -# CONFIG_REISERFS_CHECK is not set -CONFIG_REISERFS_PROC_INFO=y -CONFIG_REISERFS_FS_XATTR=y -CONFIG_REISERFS_FS_POSIX_ACL=y -CONFIG_REISERFS_FS_SECURITY=y -CONFIG_JFS_FS=m -CONFIG_JFS_POSIX_ACL=y -CONFIG_JFS_SECURITY=y -# CONFIG_JFS_DEBUG is not set -CONFIG_JFS_STATISTICS=y -CONFIG_XFS_FS=m -CONFIG_XFS_QUOTA=y -CONFIG_XFS_POSIX_ACL=y -CONFIG_XFS_RT=y -CONFIG_XFS_ONLINE_SCRUB=y -CONFIG_XFS_ONLINE_REPAIR=y -# CONFIG_XFS_WARN is not set -# CONFIG_XFS_DEBUG is not set -CONFIG_GFS2_FS=m -CONFIG_GFS2_FS_LOCKING_DLM=y -CONFIG_OCFS2_FS=m -CONFIG_OCFS2_FS_O2CB=m -CONFIG_OCFS2_FS_USERSPACE_CLUSTER=m -CONFIG_OCFS2_FS_STATS=y -CONFIG_OCFS2_DEBUG_MASKLOG=y -# CONFIG_OCFS2_DEBUG_FS is not set -CONFIG_BTRFS_FS=m -CONFIG_BTRFS_FS_POSIX_ACL=y -# CONFIG_BTRFS_FS_CHECK_INTEGRITY is not set -# CONFIG_BTRFS_FS_RUN_SANITY_TESTS is not set -# CONFIG_BTRFS_DEBUG is not set -# CONFIG_BTRFS_ASSERT is not set -# CONFIG_BTRFS_FS_REF_VERIFY is not set -CONFIG_NILFS2_FS=m -CONFIG_F2FS_FS=m -CONFIG_F2FS_STAT_FS=y -CONFIG_F2FS_FS_XATTR=y -CONFIG_F2FS_FS_POSIX_ACL=y -CONFIG_F2FS_FS_SECURITY=y -CONFIG_F2FS_CHECK_FS=y -# 
CONFIG_F2FS_IO_TRACE is not set -# CONFIG_F2FS_FAULT_INJECTION is not set -CONFIG_F2FS_FS_COMPRESSION=y -CONFIG_F2FS_FS_LZO=y -CONFIG_F2FS_FS_LZ4=y -CONFIG_F2FS_FS_ZSTD=y -CONFIG_F2FS_FS_LZORLE=y -CONFIG_ZONEFS_FS=m -CONFIG_FS_DAX=y -CONFIG_FS_DAX_PMD=y -CONFIG_FS_POSIX_ACL=y -CONFIG_EXPORTFS=y -CONFIG_EXPORTFS_BLOCK_OPS=y -CONFIG_FILE_LOCKING=y -# CONFIG_MANDATORY_FILE_LOCKING is not set -CONFIG_FS_ENCRYPTION=y -CONFIG_FS_ENCRYPTION_ALGS=m -# CONFIG_FS_ENCRYPTION_INLINE_CRYPT is not set -CONFIG_FS_VERITY=y -# CONFIG_FS_VERITY_DEBUG is not set -CONFIG_FS_VERITY_BUILTIN_SIGNATURES=y -CONFIG_FSNOTIFY=y -CONFIG_DNOTIFY=y -CONFIG_INOTIFY_USER=y -CONFIG_FANOTIFY=y -CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y -CONFIG_QUOTA=y -CONFIG_QUOTA_NETLINK_INTERFACE=y -# CONFIG_PRINT_QUOTA_WARNING is not set -# CONFIG_QUOTA_DEBUG is not set -CONFIG_QUOTA_TREE=m -CONFIG_QFMT_V1=m -CONFIG_QFMT_V2=m -CONFIG_QUOTACTL=y -CONFIG_QUOTACTL_COMPAT=y -CONFIG_AUTOFS4_FS=y -CONFIG_AUTOFS_FS=y -CONFIG_FUSE_FS=m -CONFIG_CUSE=m -CONFIG_VIRTIO_FS=m -CONFIG_OVERLAY_FS=m -CONFIG_OVERLAY_FS_REDIRECT_DIR=y -# CONFIG_OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW is not set -CONFIG_OVERLAY_FS_INDEX=y -CONFIG_OVERLAY_FS_XINO_AUTO=y -CONFIG_OVERLAY_FS_METACOPY=y - -# -# Caches -# -CONFIG_FSCACHE=m -CONFIG_FSCACHE_STATS=y -CONFIG_FSCACHE_HISTOGRAM=y -# CONFIG_FSCACHE_DEBUG is not set -# CONFIG_FSCACHE_OBJECT_LIST is not set -CONFIG_CACHEFILES=m -# CONFIG_CACHEFILES_DEBUG is not set -# CONFIG_CACHEFILES_HISTOGRAM is not set -# end of Caches - -# -# CD-ROM/DVD Filesystems -# -CONFIG_ISO9660_FS=m -CONFIG_JOLIET=y -CONFIG_ZISOFS=y -CONFIG_UDF_FS=m -# end of CD-ROM/DVD Filesystems - -# -# DOS/FAT/EXFAT/NT Filesystems -# -CONFIG_FAT_FS=m -CONFIG_MSDOS_FS=m -CONFIG_VFAT_FS=m -CONFIG_FAT_DEFAULT_CODEPAGE=437 -CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1" -CONFIG_FAT_DEFAULT_UTF8=y -CONFIG_EXFAT_FS=m -CONFIG_EXFAT_DEFAULT_IOCHARSET="utf8" -CONFIG_NTFS_FS=m -# CONFIG_NTFS_DEBUG is not set -CONFIG_NTFS_RW=y -# end of DOS/FAT/EXFAT/NT 
Filesystems - -# -# Pseudo filesystems -# -CONFIG_PROC_FS=y -CONFIG_PROC_KCORE=y -CONFIG_PROC_VMCORE=y -CONFIG_PROC_VMCORE_DEVICE_DUMP=y -CONFIG_PROC_SYSCTL=y -CONFIG_PROC_PAGE_MONITOR=y -CONFIG_PROC_CHILDREN=y -CONFIG_PROC_PID_ARCH_STATUS=y -CONFIG_PROC_CPU_RESCTRL=y -CONFIG_KERNFS=y -CONFIG_SYSFS=y -CONFIG_TMPFS=y -CONFIG_TMPFS_POSIX_ACL=y -CONFIG_TMPFS_XATTR=y -# CONFIG_TMPFS_INODE64 is not set -CONFIG_HUGETLBFS=y -CONFIG_HUGETLB_PAGE=y -CONFIG_MEMFD_CREATE=y -CONFIG_ARCH_HAS_GIGANTIC_PAGE=y -CONFIG_CONFIGFS_FS=y -CONFIG_EFIVAR_FS=y -# end of Pseudo filesystems - -CONFIG_MISC_FILESYSTEMS=y -CONFIG_ORANGEFS_FS=m -# CONFIG_ADFS_FS is not set -CONFIG_AFFS_FS=m -CONFIG_ECRYPT_FS=m -# CONFIG_ECRYPT_FS_MESSAGING is not set -CONFIG_HFS_FS=m -CONFIG_HFSPLUS_FS=m -CONFIG_BEFS_FS=m -# CONFIG_BEFS_DEBUG is not set -# CONFIG_BFS_FS is not set -# CONFIG_EFS_FS is not set -CONFIG_JFFS2_FS=m -CONFIG_JFFS2_FS_DEBUG=0 -CONFIG_JFFS2_FS_WRITEBUFFER=y -# CONFIG_JFFS2_FS_WBUF_VERIFY is not set -CONFIG_JFFS2_SUMMARY=y -CONFIG_JFFS2_FS_XATTR=y -CONFIG_JFFS2_FS_POSIX_ACL=y -CONFIG_JFFS2_FS_SECURITY=y -# CONFIG_JFFS2_COMPRESSION_OPTIONS is not set -CONFIG_JFFS2_ZLIB=y -CONFIG_JFFS2_RTIME=y -CONFIG_UBIFS_FS=m -# CONFIG_UBIFS_FS_ADVANCED_COMPR is not set -CONFIG_UBIFS_FS_LZO=y -CONFIG_UBIFS_FS_ZLIB=y -CONFIG_UBIFS_FS_ZSTD=y -CONFIG_UBIFS_ATIME_SUPPORT=y -CONFIG_UBIFS_FS_XATTR=y -CONFIG_UBIFS_FS_SECURITY=y -CONFIG_UBIFS_FS_AUTHENTICATION=y -CONFIG_CRAMFS=m -CONFIG_CRAMFS_BLOCKDEV=y -CONFIG_CRAMFS_MTD=y -CONFIG_SQUASHFS=m -# CONFIG_SQUASHFS_FILE_CACHE is not set -CONFIG_SQUASHFS_FILE_DIRECT=y -# CONFIG_SQUASHFS_DECOMP_SINGLE is not set -CONFIG_SQUASHFS_DECOMP_MULTI=y -# CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU is not set -CONFIG_SQUASHFS_XATTR=y -CONFIG_SQUASHFS_ZLIB=y -CONFIG_SQUASHFS_LZ4=y -CONFIG_SQUASHFS_LZO=y -CONFIG_SQUASHFS_XZ=y -CONFIG_SQUASHFS_ZSTD=y -# CONFIG_SQUASHFS_4K_DEVBLK_SIZE is not set -# CONFIG_SQUASHFS_EMBEDDED is not set -CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE=3 -# 
CONFIG_VXFS_FS is not set -CONFIG_MINIX_FS=m -CONFIG_OMFS_FS=m -# CONFIG_HPFS_FS is not set -# CONFIG_QNX4FS_FS is not set -# CONFIG_QNX6FS_FS is not set -CONFIG_ROMFS_FS=m -CONFIG_ROMFS_BACKED_BY_BLOCK=y -# CONFIG_ROMFS_BACKED_BY_MTD is not set -# CONFIG_ROMFS_BACKED_BY_BOTH is not set -CONFIG_ROMFS_ON_BLOCK=y -CONFIG_PSTORE=y -CONFIG_PSTORE_DEFLATE_COMPRESS=m -CONFIG_PSTORE_LZO_COMPRESS=m -CONFIG_PSTORE_LZ4_COMPRESS=m -CONFIG_PSTORE_LZ4HC_COMPRESS=m -# CONFIG_PSTORE_842_COMPRESS is not set -CONFIG_PSTORE_ZSTD_COMPRESS=y -CONFIG_PSTORE_COMPRESS=y -# CONFIG_PSTORE_DEFLATE_COMPRESS_DEFAULT is not set -# CONFIG_PSTORE_LZO_COMPRESS_DEFAULT is not set -# CONFIG_PSTORE_LZ4_COMPRESS_DEFAULT is not set -# CONFIG_PSTORE_LZ4HC_COMPRESS_DEFAULT is not set -CONFIG_PSTORE_ZSTD_COMPRESS_DEFAULT=y -CONFIG_PSTORE_COMPRESS_DEFAULT="zstd" -# CONFIG_PSTORE_CONSOLE is not set -# CONFIG_PSTORE_PMSG is not set -# CONFIG_PSTORE_FTRACE is not set -CONFIG_PSTORE_RAM=y -CONFIG_PSTORE_ZONE=m -CONFIG_PSTORE_BLK=m -CONFIG_PSTORE_BLK_BLKDEV="" -CONFIG_PSTORE_BLK_KMSG_SIZE=64 -CONFIG_PSTORE_BLK_MAX_REASON=2 -# CONFIG_SYSV_FS is not set -CONFIG_UFS_FS=m -# CONFIG_UFS_FS_WRITE is not set -# CONFIG_UFS_DEBUG is not set -CONFIG_EROFS_FS=m -# CONFIG_EROFS_FS_DEBUG is not set -CONFIG_EROFS_FS_XATTR=y -CONFIG_EROFS_FS_POSIX_ACL=y -CONFIG_EROFS_FS_SECURITY=y -CONFIG_EROFS_FS_ZIP=y -CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT=2 -CONFIG_VBOXSF_FS=m -CONFIG_NETWORK_FILESYSTEMS=y -CONFIG_NFS_FS=m -CONFIG_NFS_V2=m -CONFIG_NFS_V3=m -CONFIG_NFS_V3_ACL=y -CONFIG_NFS_V4=m -CONFIG_NFS_SWAP=y -CONFIG_NFS_V4_1=y -CONFIG_NFS_V4_2=y -CONFIG_PNFS_FILE_LAYOUT=m -CONFIG_PNFS_BLOCK=m -CONFIG_PNFS_FLEXFILE_LAYOUT=m -CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN="kernel.org" -CONFIG_NFS_V4_1_MIGRATION=y -CONFIG_NFS_V4_SECURITY_LABEL=y -CONFIG_NFS_FSCACHE=y -# CONFIG_NFS_USE_LEGACY_DNS is not set -CONFIG_NFS_USE_KERNEL_DNS=y -CONFIG_NFS_DEBUG=y -# CONFIG_NFS_DISABLE_UDP_SUPPORT is not set -CONFIG_NFSD=m -CONFIG_NFSD_V2_ACL=y 
-CONFIG_NFSD_V3=y -CONFIG_NFSD_V3_ACL=y -CONFIG_NFSD_V4=y -CONFIG_NFSD_PNFS=y -CONFIG_NFSD_BLOCKLAYOUT=y -CONFIG_NFSD_SCSILAYOUT=y -# CONFIG_NFSD_FLEXFILELAYOUT is not set -CONFIG_NFSD_V4_SECURITY_LABEL=y -CONFIG_GRACE_PERIOD=m -CONFIG_LOCKD=m -CONFIG_LOCKD_V4=y -CONFIG_NFS_ACL_SUPPORT=m -CONFIG_NFS_COMMON=y -CONFIG_SUNRPC=m -CONFIG_SUNRPC_GSS=m -CONFIG_SUNRPC_BACKCHANNEL=y -CONFIG_SUNRPC_SWAP=y -CONFIG_RPCSEC_GSS_KRB5=m -CONFIG_SUNRPC_DISABLE_INSECURE_ENCTYPES=y -CONFIG_SUNRPC_DEBUG=y -CONFIG_SUNRPC_XPRT_RDMA=m -CONFIG_CEPH_FS=m -CONFIG_CEPH_FSCACHE=y -CONFIG_CEPH_FS_POSIX_ACL=y -CONFIG_CEPH_FS_SECURITY_LABEL=y -CONFIG_CIFS=m -# CONFIG_CIFS_STATS2 is not set -CONFIG_CIFS_ALLOW_INSECURE_LEGACY=y -# CONFIG_CIFS_WEAK_PW_HASH is not set -CONFIG_CIFS_UPCALL=y -CONFIG_CIFS_XATTR=y -CONFIG_CIFS_POSIX=y -CONFIG_CIFS_DEBUG=y -# CONFIG_CIFS_DEBUG2 is not set -# CONFIG_CIFS_DEBUG_DUMP_KEYS is not set -CONFIG_CIFS_DFS_UPCALL=y -# CONFIG_CIFS_SMB_DIRECT is not set -CONFIG_CIFS_FSCACHE=y -CONFIG_CODA_FS=m -CONFIG_AFS_FS=m -# CONFIG_AFS_DEBUG is not set -CONFIG_AFS_FSCACHE=y -# CONFIG_AFS_DEBUG_CURSOR is not set -CONFIG_9P_FS=m -CONFIG_9P_FSCACHE=y -CONFIG_9P_FS_POSIX_ACL=y -CONFIG_9P_FS_SECURITY=y -CONFIG_NLS=y -CONFIG_NLS_DEFAULT="utf8" -CONFIG_NLS_CODEPAGE_437=m -CONFIG_NLS_CODEPAGE_737=m -CONFIG_NLS_CODEPAGE_775=m -CONFIG_NLS_CODEPAGE_850=m -CONFIG_NLS_CODEPAGE_852=m -CONFIG_NLS_CODEPAGE_855=m -CONFIG_NLS_CODEPAGE_857=m -CONFIG_NLS_CODEPAGE_860=m -CONFIG_NLS_CODEPAGE_861=m -CONFIG_NLS_CODEPAGE_862=m -CONFIG_NLS_CODEPAGE_863=m -CONFIG_NLS_CODEPAGE_864=m -CONFIG_NLS_CODEPAGE_865=m -CONFIG_NLS_CODEPAGE_866=m -CONFIG_NLS_CODEPAGE_869=m -CONFIG_NLS_CODEPAGE_936=m -CONFIG_NLS_CODEPAGE_950=m -CONFIG_NLS_CODEPAGE_932=m -CONFIG_NLS_CODEPAGE_949=m -CONFIG_NLS_CODEPAGE_874=m -CONFIG_NLS_ISO8859_8=m -CONFIG_NLS_CODEPAGE_1250=m -CONFIG_NLS_CODEPAGE_1251=m -CONFIG_NLS_ASCII=m -CONFIG_NLS_ISO8859_1=m -CONFIG_NLS_ISO8859_2=m -CONFIG_NLS_ISO8859_3=m -CONFIG_NLS_ISO8859_4=m 
-CONFIG_NLS_ISO8859_5=m -CONFIG_NLS_ISO8859_6=m -CONFIG_NLS_ISO8859_7=m -CONFIG_NLS_ISO8859_9=m -CONFIG_NLS_ISO8859_13=m -CONFIG_NLS_ISO8859_14=m -CONFIG_NLS_ISO8859_15=m -CONFIG_NLS_KOI8_R=m -CONFIG_NLS_KOI8_U=m -CONFIG_NLS_MAC_ROMAN=m -CONFIG_NLS_MAC_CELTIC=m -CONFIG_NLS_MAC_CENTEURO=m -CONFIG_NLS_MAC_CROATIAN=m -CONFIG_NLS_MAC_CYRILLIC=m -CONFIG_NLS_MAC_GAELIC=m -CONFIG_NLS_MAC_GREEK=m -CONFIG_NLS_MAC_ICELAND=m -CONFIG_NLS_MAC_INUIT=m -CONFIG_NLS_MAC_ROMANIAN=m -CONFIG_NLS_MAC_TURKISH=m -CONFIG_NLS_UTF8=m -CONFIG_DLM=m -# CONFIG_DLM_DEBUG is not set -CONFIG_UNICODE=y -# CONFIG_UNICODE_NORMALIZATION_SELFTEST is not set -CONFIG_IO_WQ=y -# end of File systems - -# -# Security options -# -CONFIG_KEYS=y -CONFIG_KEYS_REQUEST_CACHE=y -CONFIG_PERSISTENT_KEYRINGS=y -CONFIG_TRUSTED_KEYS=m -CONFIG_ENCRYPTED_KEYS=m -CONFIG_KEY_DH_OPERATIONS=y -CONFIG_KEY_NOTIFICATIONS=y -# CONFIG_SECURITY_DMESG_RESTRICT is not set -CONFIG_SECURITY=y -CONFIG_SECURITYFS=y -CONFIG_SECURITY_NETWORK=y -CONFIG_PAGE_TABLE_ISOLATION=y -CONFIG_SECURITY_INFINIBAND=y -CONFIG_SECURITY_NETWORK_XFRM=y -CONFIG_SECURITY_PATH=y -# CONFIG_INTEL_TXT is not set -CONFIG_LSM_MMAP_MIN_ADDR=65536 -CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR=y -CONFIG_HARDENED_USERCOPY=y -CONFIG_HARDENED_USERCOPY_FALLBACK=y -# CONFIG_HARDENED_USERCOPY_PAGESPAN is not set -CONFIG_FORTIFY_SOURCE=y -# CONFIG_STATIC_USERMODEHELPER is not set -CONFIG_SECURITY_SELINUX=y -CONFIG_SECURITY_SELINUX_BOOTPARAM=y -# CONFIG_SECURITY_SELINUX_DISABLE is not set -CONFIG_SECURITY_SELINUX_DEVELOP=y -CONFIG_SECURITY_SELINUX_AVC_STATS=y -CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=0 -CONFIG_SECURITY_SELINUX_SIDTAB_HASH_BITS=9 -CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE=256 -CONFIG_SECURITY_SMACK=y -CONFIG_SECURITY_SMACK_BRINGUP=y -CONFIG_SECURITY_SMACK_NETFILTER=y -CONFIG_SECURITY_SMACK_APPEND_SIGNALS=y -CONFIG_SECURITY_TOMOYO=y -CONFIG_SECURITY_TOMOYO_MAX_ACCEPT_ENTRY=2048 -CONFIG_SECURITY_TOMOYO_MAX_AUDIT_LOG=1024 -# 
CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER is not set -CONFIG_SECURITY_TOMOYO_POLICY_LOADER="/sbin/tomoyo-init" -CONFIG_SECURITY_TOMOYO_ACTIVATION_TRIGGER="/sbin/init" -# CONFIG_SECURITY_TOMOYO_INSECURE_BUILTIN_SETTING is not set -CONFIG_SECURITY_APPARMOR=y -CONFIG_SECURITY_APPARMOR_HASH=y -CONFIG_SECURITY_APPARMOR_HASH_DEFAULT=y -# CONFIG_SECURITY_APPARMOR_DEBUG is not set -# CONFIG_SECURITY_LOADPIN is not set -CONFIG_SECURITY_YAMA=y -CONFIG_SECURITY_SAFESETID=y -CONFIG_SECURITY_LOCKDOWN_LSM=y -# CONFIG_SECURITY_LOCKDOWN_LSM_EARLY is not set -CONFIG_LOCK_DOWN_KERNEL_FORCE_NONE=y -# CONFIG_LOCK_DOWN_KERNEL_FORCE_INTEGRITY is not set -# CONFIG_LOCK_DOWN_KERNEL_FORCE_CONFIDENTIALITY is not set -# CONFIG_INTEGRITY is not set -# CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT is not set -# CONFIG_DEFAULT_SECURITY_SELINUX is not set -# CONFIG_DEFAULT_SECURITY_SMACK is not set -# CONFIG_DEFAULT_SECURITY_TOMOYO is not set -# CONFIG_DEFAULT_SECURITY_APPARMOR is not set -CONFIG_DEFAULT_SECURITY_DAC=y -CONFIG_LSM="lockdown,yama" - -# -# Kernel hardening options -# -CONFIG_GCC_PLUGIN_STRUCTLEAK=y - -# -# Memory initialization -# -# CONFIG_INIT_STACK_NONE is not set -# CONFIG_GCC_PLUGIN_STRUCTLEAK_USER is not set -# CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF is not set -CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL=y -# CONFIG_GCC_PLUGIN_STRUCTLEAK_VERBOSE is not set -# CONFIG_GCC_PLUGIN_STACKLEAK is not set -CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y -# CONFIG_INIT_ON_FREE_DEFAULT_ON is not set -# end of Memory initialization -# end of Kernel hardening options -# end of Security options - -CONFIG_XOR_BLOCKS=m -CONFIG_ASYNC_CORE=m -CONFIG_ASYNC_MEMCPY=m -CONFIG_ASYNC_XOR=m -CONFIG_ASYNC_PQ=m -CONFIG_ASYNC_RAID6_RECOV=m -CONFIG_CRYPTO=y - -# -# Crypto core or helper -# -CONFIG_CRYPTO_ALGAPI=y -CONFIG_CRYPTO_ALGAPI2=y -CONFIG_CRYPTO_AEAD=y -CONFIG_CRYPTO_AEAD2=y -CONFIG_CRYPTO_SKCIPHER=y -CONFIG_CRYPTO_SKCIPHER2=y -CONFIG_CRYPTO_HASH=y -CONFIG_CRYPTO_HASH2=y -CONFIG_CRYPTO_RNG=y -CONFIG_CRYPTO_RNG2=y 
-CONFIG_CRYPTO_RNG_DEFAULT=y -CONFIG_CRYPTO_AKCIPHER2=y -CONFIG_CRYPTO_AKCIPHER=y -CONFIG_CRYPTO_KPP2=y -CONFIG_CRYPTO_KPP=y -CONFIG_CRYPTO_ACOMP2=y -CONFIG_CRYPTO_MANAGER=y -CONFIG_CRYPTO_MANAGER2=y -CONFIG_CRYPTO_USER=m -CONFIG_CRYPTO_MANAGER_DISABLE_TESTS=y -CONFIG_CRYPTO_GF128MUL=y -CONFIG_CRYPTO_NULL=y -CONFIG_CRYPTO_NULL2=y -CONFIG_CRYPTO_PCRYPT=m -CONFIG_CRYPTO_CRYPTD=m -CONFIG_CRYPTO_AUTHENC=m -CONFIG_CRYPTO_TEST=m -CONFIG_CRYPTO_SIMD=m -CONFIG_CRYPTO_GLUE_HELPER_X86=m -CONFIG_CRYPTO_ENGINE=m - -# -# Public-key cryptography -# -CONFIG_CRYPTO_RSA=y -CONFIG_CRYPTO_DH=y -CONFIG_CRYPTO_ECC=m -CONFIG_CRYPTO_ECDH=m -CONFIG_CRYPTO_ECRDSA=m -CONFIG_CRYPTO_CURVE25519=m -CONFIG_CRYPTO_CURVE25519_X86=m - -# -# Authenticated Encryption with Associated Data -# -CONFIG_CRYPTO_CCM=m -CONFIG_CRYPTO_GCM=y -CONFIG_CRYPTO_CHACHA20POLY1305=m -CONFIG_CRYPTO_AEGIS128=m -CONFIG_CRYPTO_AEGIS128_AESNI_SSE2=m -CONFIG_CRYPTO_SEQIV=y -CONFIG_CRYPTO_ECHAINIV=m - -# -# Block modes -# -CONFIG_CRYPTO_CBC=m -CONFIG_CRYPTO_CFB=m -CONFIG_CRYPTO_CTR=y -CONFIG_CRYPTO_CTS=m -CONFIG_CRYPTO_ECB=m -CONFIG_CRYPTO_LRW=m -CONFIG_CRYPTO_OFB=m -CONFIG_CRYPTO_PCBC=m -CONFIG_CRYPTO_XTS=m -CONFIG_CRYPTO_KEYWRAP=m -CONFIG_CRYPTO_NHPOLY1305=m -CONFIG_CRYPTO_NHPOLY1305_SSE2=m -CONFIG_CRYPTO_NHPOLY1305_AVX2=m -CONFIG_CRYPTO_ADIANTUM=m -CONFIG_CRYPTO_ESSIV=m - -# -# Hash modes -# -CONFIG_CRYPTO_CMAC=m -CONFIG_CRYPTO_HMAC=y -CONFIG_CRYPTO_XCBC=m -CONFIG_CRYPTO_VMAC=m - -# -# Digest -# -CONFIG_CRYPTO_CRC32C=m -CONFIG_CRYPTO_CRC32C_INTEL=m -CONFIG_CRYPTO_CRC32=m -CONFIG_CRYPTO_CRC32_PCLMUL=m -CONFIG_CRYPTO_XXHASH=m -CONFIG_CRYPTO_BLAKE2B=m -CONFIG_CRYPTO_BLAKE2S=m -CONFIG_CRYPTO_BLAKE2S_X86=m -CONFIG_CRYPTO_CRCT10DIF=y -CONFIG_CRYPTO_CRCT10DIF_PCLMUL=m -CONFIG_CRYPTO_GHASH=y -CONFIG_CRYPTO_POLY1305=m -CONFIG_CRYPTO_POLY1305_X86_64=m -CONFIG_CRYPTO_MD4=m -CONFIG_CRYPTO_MD5=y -CONFIG_CRYPTO_MICHAEL_MIC=m -CONFIG_CRYPTO_RMD128=m -CONFIG_CRYPTO_RMD160=m -CONFIG_CRYPTO_RMD256=m -CONFIG_CRYPTO_RMD320=m 
-CONFIG_CRYPTO_SHA1=y -CONFIG_CRYPTO_SHA1_SSSE3=m -CONFIG_CRYPTO_SHA256_SSSE3=m -CONFIG_CRYPTO_SHA512_SSSE3=m -CONFIG_CRYPTO_SHA256=y -CONFIG_CRYPTO_SHA512=y -CONFIG_CRYPTO_SHA3=m -CONFIG_CRYPTO_SM3=m -CONFIG_CRYPTO_STREEBOG=m -CONFIG_CRYPTO_TGR192=m -CONFIG_CRYPTO_WP512=m -CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL=m - -# -# Ciphers -# -CONFIG_CRYPTO_AES=y -CONFIG_CRYPTO_AES_TI=m -CONFIG_CRYPTO_AES_NI_INTEL=m -CONFIG_CRYPTO_ANUBIS=m -CONFIG_CRYPTO_ARC4=m -CONFIG_CRYPTO_BLOWFISH=m -CONFIG_CRYPTO_BLOWFISH_COMMON=m -CONFIG_CRYPTO_BLOWFISH_X86_64=m -CONFIG_CRYPTO_CAMELLIA=m -CONFIG_CRYPTO_CAMELLIA_X86_64=m -CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64=m -CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64=m -CONFIG_CRYPTO_CAST_COMMON=m -CONFIG_CRYPTO_CAST5=m -CONFIG_CRYPTO_CAST5_AVX_X86_64=m -CONFIG_CRYPTO_CAST6=m -CONFIG_CRYPTO_CAST6_AVX_X86_64=m -CONFIG_CRYPTO_DES=m -CONFIG_CRYPTO_DES3_EDE_X86_64=m -CONFIG_CRYPTO_FCRYPT=m -CONFIG_CRYPTO_KHAZAD=m -CONFIG_CRYPTO_SALSA20=m -CONFIG_CRYPTO_CHACHA20=m -CONFIG_CRYPTO_CHACHA20_X86_64=m -CONFIG_CRYPTO_SEED=m -CONFIG_CRYPTO_SERPENT=m -CONFIG_CRYPTO_SERPENT_SSE2_X86_64=m -CONFIG_CRYPTO_SERPENT_AVX_X86_64=m -CONFIG_CRYPTO_SERPENT_AVX2_X86_64=m -CONFIG_CRYPTO_SM4=m -CONFIG_CRYPTO_TEA=m -CONFIG_CRYPTO_TWOFISH=m -CONFIG_CRYPTO_TWOFISH_COMMON=m -CONFIG_CRYPTO_TWOFISH_X86_64=m -CONFIG_CRYPTO_TWOFISH_X86_64_3WAY=m -CONFIG_CRYPTO_TWOFISH_AVX_X86_64=m - -# -# Compression -# -CONFIG_CRYPTO_DEFLATE=m -CONFIG_CRYPTO_LZO=m -CONFIG_CRYPTO_842=m -CONFIG_CRYPTO_LZ4=y -CONFIG_CRYPTO_LZ4HC=m -CONFIG_CRYPTO_ZSTD=y - -# -# Random Number Generation -# -CONFIG_CRYPTO_ANSI_CPRNG=m -CONFIG_CRYPTO_DRBG_MENU=y -CONFIG_CRYPTO_DRBG_HMAC=y -CONFIG_CRYPTO_DRBG_HASH=y -CONFIG_CRYPTO_DRBG_CTR=y -CONFIG_CRYPTO_DRBG=y -CONFIG_CRYPTO_JITTERENTROPY=y -CONFIG_CRYPTO_USER_API=m -CONFIG_CRYPTO_USER_API_HASH=m -CONFIG_CRYPTO_USER_API_SKCIPHER=m -CONFIG_CRYPTO_USER_API_RNG=m -CONFIG_CRYPTO_USER_API_AEAD=m -# CONFIG_CRYPTO_STATS is not set -CONFIG_CRYPTO_HASH_INFO=y - -# -# Crypto library 
routines -# -CONFIG_CRYPTO_LIB_AES=y -CONFIG_CRYPTO_LIB_ARC4=m -CONFIG_CRYPTO_ARCH_HAVE_LIB_BLAKE2S=m -CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC=m -CONFIG_CRYPTO_LIB_BLAKE2S=m -CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA=m -CONFIG_CRYPTO_LIB_CHACHA_GENERIC=m -CONFIG_CRYPTO_LIB_CHACHA=m -CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519=m -CONFIG_CRYPTO_LIB_CURVE25519_GENERIC=m -CONFIG_CRYPTO_LIB_CURVE25519=m -CONFIG_CRYPTO_LIB_DES=m -CONFIG_CRYPTO_LIB_POLY1305_RSIZE=11 -CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305=m -CONFIG_CRYPTO_LIB_POLY1305_GENERIC=m -CONFIG_CRYPTO_LIB_POLY1305=m -CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m -CONFIG_CRYPTO_LIB_SHA256=y -CONFIG_CRYPTO_HW=y -CONFIG_CRYPTO_DEV_PADLOCK=m -CONFIG_CRYPTO_DEV_PADLOCK_AES=m -CONFIG_CRYPTO_DEV_PADLOCK_SHA=m -CONFIG_CRYPTO_DEV_ATMEL_I2C=m -CONFIG_CRYPTO_DEV_ATMEL_ECC=m -CONFIG_CRYPTO_DEV_ATMEL_SHA204A=m -CONFIG_CRYPTO_DEV_CCP=y -CONFIG_CRYPTO_DEV_CCP_DD=m -CONFIG_CRYPTO_DEV_SP_CCP=y -CONFIG_CRYPTO_DEV_CCP_CRYPTO=m -CONFIG_CRYPTO_DEV_SP_PSP=y -CONFIG_CRYPTO_DEV_CCP_DEBUGFS=y -CONFIG_CRYPTO_DEV_QAT=m -CONFIG_CRYPTO_DEV_QAT_DH895xCC=m -CONFIG_CRYPTO_DEV_QAT_C3XXX=m -CONFIG_CRYPTO_DEV_QAT_C62X=m -CONFIG_CRYPTO_DEV_QAT_DH895xCCVF=m -CONFIG_CRYPTO_DEV_QAT_C3XXXVF=m -CONFIG_CRYPTO_DEV_QAT_C62XVF=m -CONFIG_CRYPTO_DEV_NITROX=m -CONFIG_CRYPTO_DEV_NITROX_CNN55XX=m -CONFIG_CRYPTO_DEV_CHELSIO=m -CONFIG_CHELSIO_IPSEC_INLINE=y -CONFIG_CHELSIO_TLS_DEVICE=y -CONFIG_CRYPTO_DEV_VIRTIO=m -CONFIG_CRYPTO_DEV_SAFEXCEL=m -CONFIG_CRYPTO_DEV_CCREE=m -CONFIG_CRYPTO_DEV_AMLOGIC_GXL=m -CONFIG_CRYPTO_DEV_AMLOGIC_GXL_DEBUG=y -CONFIG_ASYMMETRIC_KEY_TYPE=y -CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y -CONFIG_ASYMMETRIC_TPM_KEY_SUBTYPE=m -CONFIG_X509_CERTIFICATE_PARSER=y -CONFIG_PKCS8_PRIVATE_KEY_PARSER=m -CONFIG_TPM_KEY_PARSER=m -CONFIG_PKCS7_MESSAGE_PARSER=y -# CONFIG_PKCS7_TEST_KEY is not set -CONFIG_SIGNED_PE_FILE_VERIFICATION=y - -# -# Certificates for signature checking -# -CONFIG_MODULE_SIG_KEY="certs/signing_key.pem" -CONFIG_SYSTEM_TRUSTED_KEYRING=y 
-CONFIG_SYSTEM_TRUSTED_KEYS="" -# CONFIG_SYSTEM_EXTRA_CERTIFICATE is not set -CONFIG_SECONDARY_TRUSTED_KEYRING=y -CONFIG_SYSTEM_BLACKLIST_KEYRING=y -CONFIG_SYSTEM_BLACKLIST_HASH_LIST="" -# end of Certificates for signature checking - -CONFIG_BINARY_PRINTF=y - -# -# Library routines -# -CONFIG_RAID6_PQ=m -CONFIG_RAID6_PQ_BENCHMARK=y -CONFIG_LINEAR_RANGES=y -CONFIG_PACKING=y -CONFIG_BITREVERSE=y -CONFIG_GENERIC_STRNCPY_FROM_USER=y -CONFIG_GENERIC_STRNLEN_USER=y -CONFIG_GENERIC_NET_UTILS=y -CONFIG_GENERIC_FIND_FIRST_BIT=y -CONFIG_CORDIC=m -# CONFIG_PRIME_NUMBERS is not set -CONFIG_RATIONAL=y -CONFIG_GENERIC_PCI_IOMAP=y -CONFIG_GENERIC_IOMAP=y -CONFIG_ARCH_USE_CMPXCHG_LOCKREF=y -CONFIG_ARCH_HAS_FAST_MULTIPLIER=y -CONFIG_ARCH_USE_SYM_ANNOTATIONS=y -CONFIG_CRC_CCITT=y -CONFIG_CRC16=m -CONFIG_CRC_T10DIF=y -CONFIG_CRC_ITU_T=m -CONFIG_CRC32=y -# CONFIG_CRC32_SELFTEST is not set -CONFIG_CRC32_SLICEBY8=y -# CONFIG_CRC32_SLICEBY4 is not set -# CONFIG_CRC32_SARWATE is not set -# CONFIG_CRC32_BIT is not set -CONFIG_CRC64=m -CONFIG_CRC4=m -CONFIG_CRC7=m -CONFIG_LIBCRC32C=m -CONFIG_CRC8=m -CONFIG_XXHASH=y -# CONFIG_RANDOM32_SELFTEST is not set -CONFIG_842_COMPRESS=m -CONFIG_842_DECOMPRESS=m -CONFIG_ZLIB_INFLATE=y -CONFIG_ZLIB_DEFLATE=y -CONFIG_LZO_COMPRESS=y -CONFIG_LZO_DECOMPRESS=y -CONFIG_LZ4_COMPRESS=y -CONFIG_LZ4HC_COMPRESS=m -CONFIG_LZ4_DECOMPRESS=y -CONFIG_ZSTD_COMPRESS=y -CONFIG_ZSTD_DECOMPRESS=y -CONFIG_XZ_DEC=y -CONFIG_XZ_DEC_X86=y -CONFIG_XZ_DEC_POWERPC=y -CONFIG_XZ_DEC_IA64=y -CONFIG_XZ_DEC_ARM=y -CONFIG_XZ_DEC_ARMTHUMB=y -CONFIG_XZ_DEC_SPARC=y -CONFIG_XZ_DEC_BCJ=y -# CONFIG_XZ_DEC_TEST is not set -CONFIG_DECOMPRESS_GZIP=y -CONFIG_DECOMPRESS_BZIP2=y -CONFIG_DECOMPRESS_LZMA=y -CONFIG_DECOMPRESS_XZ=y -CONFIG_DECOMPRESS_LZO=y -CONFIG_DECOMPRESS_LZ4=y -CONFIG_DECOMPRESS_ZSTD=y -CONFIG_GENERIC_ALLOCATOR=y -CONFIG_REED_SOLOMON=y -CONFIG_REED_SOLOMON_ENC8=y -CONFIG_REED_SOLOMON_DEC8=y -CONFIG_REED_SOLOMON_DEC16=y -CONFIG_BCH=m -CONFIG_TEXTSEARCH=y -CONFIG_TEXTSEARCH_KMP=m 
-CONFIG_TEXTSEARCH_BM=m -CONFIG_TEXTSEARCH_FSM=m -CONFIG_BTREE=y -CONFIG_INTERVAL_TREE=y -CONFIG_XARRAY_MULTI=y -CONFIG_ASSOCIATIVE_ARRAY=y -CONFIG_HAS_IOMEM=y -CONFIG_HAS_IOPORT_MAP=y -CONFIG_HAS_DMA=y -CONFIG_DMA_OPS=y -CONFIG_NEED_SG_DMA_LENGTH=y -CONFIG_NEED_DMA_MAP_STATE=y -CONFIG_ARCH_DMA_ADDR_T_64BIT=y -CONFIG_DMA_DECLARE_COHERENT=y -CONFIG_ARCH_HAS_FORCE_DMA_UNENCRYPTED=y -CONFIG_DMA_VIRT_OPS=y -CONFIG_SWIOTLB=y -CONFIG_DMA_COHERENT_POOL=y -# CONFIG_DMA_API_DEBUG is not set -CONFIG_SGL_ALLOC=y -CONFIG_IOMMU_HELPER=y -CONFIG_CHECK_SIGNATURE=y -CONFIG_CPU_RMAP=y -CONFIG_DQL=y -CONFIG_GLOB=y -# CONFIG_GLOB_SELFTEST is not set -CONFIG_NLATTR=y -CONFIG_LRU_CACHE=m -CONFIG_CLZ_TAB=y -CONFIG_IRQ_POLL=y -CONFIG_MPILIB=y -CONFIG_DIMLIB=y -CONFIG_LIBFDT=y -CONFIG_OID_REGISTRY=y -CONFIG_UCS2_STRING=y -CONFIG_HAVE_GENERIC_VDSO=y -CONFIG_GENERIC_GETTIMEOFDAY=y -CONFIG_GENERIC_VDSO_TIME_NS=y -CONFIG_FONT_SUPPORT=y -CONFIG_FONTS=y -# CONFIG_FONT_8x8 is not set -CONFIG_FONT_8x16=y -# CONFIG_FONT_6x11 is not set -# CONFIG_FONT_7x14 is not set -# CONFIG_FONT_PEARL_8x8 is not set -# CONFIG_FONT_ACORN_8x8 is not set -# CONFIG_FONT_MINI_4x6 is not set -# CONFIG_FONT_6x10 is not set -# CONFIG_FONT_10x18 is not set -# CONFIG_FONT_SUN8x16 is not set -# CONFIG_FONT_SUN12x22 is not set -CONFIG_FONT_TER16x32=y -CONFIG_SG_POOL=y -CONFIG_ARCH_HAS_PMEM_API=y -CONFIG_MEMREGION=y -CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE=y -CONFIG_ARCH_HAS_UACCESS_MCSAFE=y -CONFIG_ARCH_STACKWALK=y -CONFIG_SBITMAP=y -CONFIG_PARMAN=m -CONFIG_OBJAGG=m -# CONFIG_STRING_SELFTEST is not set -# end of Library routines - -CONFIG_PLDMFW=y - -# -# Kernel hacking -# - -# -# printk and dmesg options -# -CONFIG_PRINTK_TIME=y -# CONFIG_PRINTK_CALLER is not set -CONFIG_CONSOLE_LOGLEVEL_DEFAULT=4 -CONFIG_CONSOLE_LOGLEVEL_QUIET=1 -CONFIG_MESSAGE_LOGLEVEL_DEFAULT=4 -# CONFIG_BOOT_PRINTK_DELAY is not set -CONFIG_DYNAMIC_DEBUG=y -CONFIG_DYNAMIC_DEBUG_CORE=y -CONFIG_SYMBOLIC_ERRNAME=y -CONFIG_DEBUG_BUGVERBOSE=y -# end of printk and 
dmesg options - -# -# Compile-time checks and compiler options -# -CONFIG_DEBUG_INFO=y -# CONFIG_DEBUG_INFO_REDUCED is not set -# CONFIG_DEBUG_INFO_COMPRESSED is not set -# CONFIG_DEBUG_INFO_SPLIT is not set -CONFIG_DEBUG_INFO_DWARF4=y -CONFIG_DEBUG_INFO_BTF=y -# CONFIG_GDB_SCRIPTS is not set -# CONFIG_ENABLE_MUST_CHECK is not set -CONFIG_FRAME_WARN=2048 -CONFIG_STRIP_ASM_SYMS=y -# CONFIG_READABLE_ASM is not set -# CONFIG_HEADERS_INSTALL is not set -# CONFIG_DEBUG_SECTION_MISMATCH is not set -CONFIG_SECTION_MISMATCH_WARN_ONLY=y -# CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_32B is not set -CONFIG_STACK_VALIDATION=y -# CONFIG_DEBUG_FORCE_WEAK_PER_CPU is not set -# end of Compile-time checks and compiler options - -# -# Generic Kernel Debugging Instruments -# -CONFIG_MAGIC_SYSRQ=y -CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE=0x0 -CONFIG_MAGIC_SYSRQ_SERIAL=y -CONFIG_MAGIC_SYSRQ_SERIAL_SEQUENCE="" -CONFIG_DEBUG_FS=y -CONFIG_DEBUG_FS_ALLOW_ALL=y -# CONFIG_DEBUG_FS_DISALLOW_MOUNT is not set -# CONFIG_DEBUG_FS_ALLOW_NONE is not set -CONFIG_HAVE_ARCH_KGDB=y -# CONFIG_KGDB is not set -CONFIG_ARCH_HAS_UBSAN_SANITIZE_ALL=y -# CONFIG_UBSAN is not set -CONFIG_HAVE_ARCH_KCSAN=y -# end of Generic Kernel Debugging Instruments - -CONFIG_DEBUG_KERNEL=y -CONFIG_DEBUG_MISC=y - -# -# Memory Debugging -# -# CONFIG_PAGE_EXTENSION is not set -# CONFIG_DEBUG_PAGEALLOC is not set -# CONFIG_PAGE_OWNER is not set -CONFIG_PAGE_POISONING=y -CONFIG_PAGE_POISONING_NO_SANITY=y -CONFIG_PAGE_POISONING_ZERO=y -# CONFIG_DEBUG_PAGE_REF is not set -# CONFIG_DEBUG_RODATA_TEST is not set -CONFIG_ARCH_HAS_DEBUG_WX=y -CONFIG_DEBUG_WX=y -CONFIG_GENERIC_PTDUMP=y -CONFIG_PTDUMP_CORE=y -# CONFIG_PTDUMP_DEBUGFS is not set -# CONFIG_DEBUG_OBJECTS is not set -# CONFIG_SLUB_DEBUG_ON is not set -# CONFIG_SLUB_STATS is not set -CONFIG_HAVE_DEBUG_KMEMLEAK=y -# CONFIG_DEBUG_KMEMLEAK is not set -# CONFIG_DEBUG_STACK_USAGE is not set -CONFIG_SCHED_STACK_END_CHECK=y -CONFIG_ARCH_HAS_DEBUG_VM_PGTABLE=y -# CONFIG_DEBUG_VM is not set -# 
CONFIG_DEBUG_VM_PGTABLE is not set -CONFIG_ARCH_HAS_DEBUG_VIRTUAL=y -# CONFIG_DEBUG_VIRTUAL is not set -CONFIG_DEBUG_MEMORY_INIT=y -# CONFIG_DEBUG_PER_CPU_MAPS is not set -CONFIG_HAVE_ARCH_KASAN=y -CONFIG_HAVE_ARCH_KASAN_VMALLOC=y -CONFIG_CC_HAS_KASAN_GENERIC=y -CONFIG_CC_HAS_WORKING_NOSANITIZE_ADDRESS=y -# CONFIG_KASAN is not set -# end of Memory Debugging - -# CONFIG_DEBUG_SHIRQ is not set - -# -# Debug Oops, Lockups and Hangs -# -# CONFIG_PANIC_ON_OOPS is not set -CONFIG_PANIC_ON_OOPS_VALUE=0 -CONFIG_PANIC_TIMEOUT=0 -CONFIG_LOCKUP_DETECTOR=y -CONFIG_SOFTLOCKUP_DETECTOR=y -# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set -CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0 -CONFIG_HARDLOCKUP_DETECTOR_PERF=y -CONFIG_HARDLOCKUP_CHECK_TIMESTAMP=y -CONFIG_HARDLOCKUP_DETECTOR=y -# CONFIG_BOOTPARAM_HARDLOCKUP_PANIC is not set -CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE=0 -CONFIG_DETECT_HUNG_TASK=y -CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=120 -# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set -CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0 -# CONFIG_WQ_WATCHDOG is not set -# CONFIG_TEST_LOCKUP is not set -# end of Debug Oops, Lockups and Hangs - -# -# Scheduler Debugging -# -CONFIG_SCHED_DEBUG=y -CONFIG_SCHED_INFO=y -CONFIG_SCHEDSTATS=y -# end of Scheduler Debugging - -# CONFIG_DEBUG_TIMEKEEPING is not set -CONFIG_DEBUG_PREEMPT=y - -# -# Lock Debugging (spinlocks, mutexes, etc...) -# -CONFIG_LOCK_DEBUGGING_SUPPORT=y -# CONFIG_PROVE_LOCKING is not set -# CONFIG_LOCK_STAT is not set -# CONFIG_DEBUG_RT_MUTEXES is not set -# CONFIG_DEBUG_SPINLOCK is not set -# CONFIG_DEBUG_MUTEXES is not set -# CONFIG_DEBUG_WW_MUTEX_SLOWPATH is not set -# CONFIG_DEBUG_RWSEMS is not set -# CONFIG_DEBUG_LOCK_ALLOC is not set -# CONFIG_DEBUG_ATOMIC_SLEEP is not set -# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set -# CONFIG_LOCK_TORTURE_TEST is not set -# CONFIG_WW_MUTEX_SELFTEST is not set -# end of Lock Debugging (spinlocks, mutexes, etc...) 
- -CONFIG_STACKTRACE=y -# CONFIG_WARN_ALL_UNSEEDED_RANDOM is not set -# CONFIG_DEBUG_KOBJECT is not set - -# -# Debug kernel data structures -# -# CONFIG_DEBUG_LIST is not set -# CONFIG_DEBUG_PLIST is not set -# CONFIG_DEBUG_SG is not set -# CONFIG_DEBUG_NOTIFIERS is not set -# CONFIG_BUG_ON_DATA_CORRUPTION is not set -# end of Debug kernel data structures - -# CONFIG_DEBUG_CREDENTIALS is not set - -# -# RCU Debugging -# -# CONFIG_RCU_PERF_TEST is not set -# CONFIG_RCU_TORTURE_TEST is not set -# CONFIG_RCU_REF_SCALE_TEST is not set -CONFIG_RCU_CPU_STALL_TIMEOUT=60 -# CONFIG_RCU_TRACE is not set -# CONFIG_RCU_EQS_DEBUG is not set -# end of RCU Debugging - -# CONFIG_DEBUG_WQ_FORCE_RR_CPU is not set -# CONFIG_DEBUG_BLOCK_EXT_DEVT is not set -# CONFIG_CPU_HOTPLUG_STATE_CONTROL is not set -CONFIG_LATENCYTOP=y -CONFIG_USER_STACKTRACE_SUPPORT=y -CONFIG_NOP_TRACER=y -CONFIG_HAVE_FUNCTION_TRACER=y -CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y -CONFIG_HAVE_DYNAMIC_FTRACE=y -CONFIG_HAVE_DYNAMIC_FTRACE_WITH_REGS=y -CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS=y -CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y -CONFIG_HAVE_SYSCALL_TRACEPOINTS=y -CONFIG_HAVE_FENTRY=y -CONFIG_HAVE_C_RECORDMCOUNT=y -CONFIG_TRACER_MAX_TRACE=y -CONFIG_TRACE_CLOCK=y -CONFIG_RING_BUFFER=y -CONFIG_EVENT_TRACING=y -CONFIG_CONTEXT_SWITCH_TRACER=y -CONFIG_RING_BUFFER_ALLOW_SWAP=y -CONFIG_TRACING=y -CONFIG_GENERIC_TRACER=y -CONFIG_TRACING_SUPPORT=y -CONFIG_FTRACE=y -# CONFIG_BOOTTIME_TRACING is not set -CONFIG_FUNCTION_TRACER=y -CONFIG_FUNCTION_GRAPH_TRACER=y -CONFIG_DYNAMIC_FTRACE=y -CONFIG_DYNAMIC_FTRACE_WITH_REGS=y -CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS=y -CONFIG_FUNCTION_PROFILER=y -CONFIG_STACK_TRACER=y -# CONFIG_IRQSOFF_TRACER is not set -# CONFIG_PREEMPT_TRACER is not set -CONFIG_SCHED_TRACER=y -CONFIG_HWLAT_TRACER=y -CONFIG_MMIOTRACE=y -CONFIG_FTRACE_SYSCALLS=y -CONFIG_TRACER_SNAPSHOT=y -# CONFIG_TRACER_SNAPSHOT_PER_CPU_SWAP is not set -CONFIG_BRANCH_PROFILE_NONE=y -# CONFIG_PROFILE_ANNOTATED_BRANCHES is not set 
-CONFIG_BLK_DEV_IO_TRACE=y -CONFIG_KPROBE_EVENTS=y -# CONFIG_KPROBE_EVENTS_ON_NOTRACE is not set -CONFIG_UPROBE_EVENTS=y -CONFIG_BPF_EVENTS=y -CONFIG_DYNAMIC_EVENTS=y -CONFIG_PROBE_EVENTS=y -CONFIG_BPF_KPROBE_OVERRIDE=y -CONFIG_FTRACE_MCOUNT_RECORD=y -CONFIG_TRACING_MAP=y -CONFIG_SYNTH_EVENTS=y -CONFIG_HIST_TRIGGERS=y -# CONFIG_TRACE_EVENT_INJECT is not set -# CONFIG_TRACEPOINT_BENCHMARK is not set -# CONFIG_RING_BUFFER_BENCHMARK is not set -# CONFIG_TRACE_EVAL_MAP_FILE is not set -# CONFIG_FTRACE_STARTUP_TEST is not set -# CONFIG_RING_BUFFER_STARTUP_TEST is not set -# CONFIG_MMIOTRACE_TEST is not set -# CONFIG_PREEMPTIRQ_DELAY_TEST is not set -# CONFIG_SYNTH_EVENT_GEN_TEST is not set -# CONFIG_KPROBE_EVENT_GEN_TEST is not set -# CONFIG_HIST_TRIGGERS_DEBUG is not set -# CONFIG_PROVIDE_OHCI1394_DMA_INIT is not set -# CONFIG_SAMPLES is not set -CONFIG_ARCH_HAS_DEVMEM_IS_ALLOWED=y -CONFIG_STRICT_DEVMEM=y -CONFIG_IO_STRICT_DEVMEM=y - -# -# x86 Debugging -# -CONFIG_TRACE_IRQFLAGS_SUPPORT=y -CONFIG_TRACE_IRQFLAGS_NMI_SUPPORT=y -# CONFIG_X86_VERBOSE_BOOTUP is not set -CONFIG_EARLY_PRINTK=y -# CONFIG_EARLY_PRINTK_DBGP is not set -# CONFIG_EARLY_PRINTK_USB_XDBC is not set -# CONFIG_EFI_PGT_DUMP is not set -# CONFIG_DEBUG_TLBFLUSH is not set -# CONFIG_IOMMU_DEBUG is not set -CONFIG_HAVE_MMIOTRACE_SUPPORT=y -# CONFIG_X86_DECODER_SELFTEST is not set -CONFIG_IO_DELAY_0X80=y -# CONFIG_IO_DELAY_0XED is not set -# CONFIG_IO_DELAY_UDELAY is not set -# CONFIG_IO_DELAY_NONE is not set -CONFIG_DEBUG_BOOT_PARAMS=y -# CONFIG_CPA_DEBUG is not set -# CONFIG_DEBUG_ENTRY is not set -# CONFIG_DEBUG_NMI_SELFTEST is not set -# CONFIG_X86_DEBUG_FPU is not set -# CONFIG_PUNIT_ATOM_DEBUG is not set -CONFIG_UNWINDER_ORC=y -# CONFIG_UNWINDER_FRAME_POINTER is not set -# CONFIG_UNWINDER_GUESS is not set -# end of x86 Debugging - -# -# Kernel Testing and Coverage -# -# CONFIG_KUNIT is not set -# CONFIG_NOTIFIER_ERROR_INJECTION is not set -CONFIG_FUNCTION_ERROR_INJECTION=y -# CONFIG_FAULT_INJECTION is 
not set -CONFIG_ARCH_HAS_KCOV=y -CONFIG_CC_HAS_SANCOV_TRACE_PC=y -# CONFIG_KCOV is not set -CONFIG_RUNTIME_TESTING_MENU=y -CONFIG_LKDTM=m -# CONFIG_TEST_LIST_SORT is not set -# CONFIG_TEST_MIN_HEAP is not set -# CONFIG_TEST_SORT is not set -# CONFIG_KPROBES_SANITY_TEST is not set -# CONFIG_BACKTRACE_SELF_TEST is not set -# CONFIG_RBTREE_TEST is not set -# CONFIG_REED_SOLOMON_TEST is not set -# CONFIG_INTERVAL_TREE_TEST is not set -# CONFIG_PERCPU_TEST is not set -# CONFIG_ATOMIC64_SELFTEST is not set -# CONFIG_ASYNC_RAID6_TEST is not set -# CONFIG_TEST_HEXDUMP is not set -# CONFIG_TEST_STRING_HELPERS is not set -# CONFIG_TEST_STRSCPY is not set -# CONFIG_TEST_KSTRTOX is not set -# CONFIG_TEST_PRINTF is not set -# CONFIG_TEST_BITMAP is not set -# CONFIG_TEST_BITFIELD is not set -# CONFIG_TEST_UUID is not set -# CONFIG_TEST_XARRAY is not set -# CONFIG_TEST_OVERFLOW is not set -# CONFIG_TEST_RHASHTABLE is not set -# CONFIG_TEST_HASH is not set -# CONFIG_TEST_IDA is not set -# CONFIG_TEST_PARMAN is not set -# CONFIG_TEST_LKM is not set -# CONFIG_TEST_BITOPS is not set -# CONFIG_TEST_VMALLOC is not set -# CONFIG_TEST_USER_COPY is not set -# CONFIG_TEST_BPF is not set -# CONFIG_TEST_BLACKHOLE_DEV is not set -# CONFIG_FIND_BIT_BENCHMARK is not set -# CONFIG_TEST_FIRMWARE is not set -# CONFIG_TEST_SYSCTL is not set -# CONFIG_TEST_UDELAY is not set -# CONFIG_TEST_STATIC_KEYS is not set -# CONFIG_TEST_KMOD is not set -# CONFIG_TEST_MEMCAT_P is not set -# CONFIG_TEST_OBJAGG is not set -# CONFIG_TEST_STACKINIT is not set -# CONFIG_TEST_MEMINIT is not set -# CONFIG_TEST_HMM is not set -# CONFIG_TEST_FPU is not set -# CONFIG_MEMTEST is not set -# CONFIG_HYPERV_TESTING is not set -# end of Kernel Testing and Coverage -# end of Kernel hacking diff --git a/linux59-tkg/linux59-tkg-config/generic-desktop-profile.cfg b/linux59-tkg/linux59-tkg-config/generic-desktop-profile.cfg deleted file mode 100644 index 9f33a13..0000000 --- 
a/linux59-tkg/linux59-tkg-config/generic-desktop-profile.cfg +++ /dev/null @@ -1,35 +0,0 @@ -# linux59-TkG config file -# Generic Desktop - - -#### KERNEL OPTIONS #### - -# Disable some non-module debugging - See PKGBUILD for the list -_debugdisable="false" - -# LEAVE AN EMPTY VALUE TO BE PROMPTED ABOUT FOLLOWING OPTIONS AT BUILD TIME - -# Set to "true" to disable FUNCTION_TRACER/GRAPH_TRACER, lowering overhead but limiting debugging and analyzing of kernel functions - Kernel default is "false" -_ftracedisable="false" - -# Set to "true" to disable NUMA, lowering overhead, but breaking CUDA/NvEnc on Nvidia equipped systems - Kernel default is "false" -_numadisable="false" - -# Set to "true" to use explicit preemption points to lower latency at the cost of a small throughput loss - Can give a nice perf boost in VMs - Kernel default is "false" -_voluntary_preempt="false" - -# A selection of patches from Zen/Liquorix kernel and additional tweaks for a better gaming experience (ZENIFY) - Default is "true" -_zenify="true" - -# compiler optimization level - 1. Optimize for performance (-O2); 2. Optimize harder (-O3); 3. 
Optimize for size (-Os) - Kernel default is "2" -_compileroptlevel="1" - -# Trust the CPU manufacturer to initialize Linux's CRNG (RANDOM_TRUST_CPU) - Kernel default is "false" -_random_trust_cpu="false" - -# CPU scheduler runqueue sharing - No sharing (RQ_NONE), SMT (hyperthread) siblings (RQ_SMT), Multicore siblings (RQ_MC), Symmetric Multi-Processing (RQ_SMP), NUMA (RQ_ALL) -# Valid values are "none", "smt", "mc", "mc-llc"(for zen), "smp", "all" - Kernel default is "mc" -_runqueue_sharing="mc" - -# Timer frequency - "500", "750" or "1000" - More options available in kernel config prompt when left empty depending on selected cpusched - Kernel default is "750" -_timer_freq="500" diff --git a/linux59-tkg/linux59-tkg-config/prepare b/linux59-tkg/linux59-tkg-config/prepare deleted file mode 100644 index dc2eaba..0000000 --- a/linux59-tkg/linux59-tkg-config/prepare +++ /dev/null @@ -1,991 +0,0 @@ -#!/bin/bash - -_basever=59 -_basekernel=5.9 -_sub=1 - -_tkg_initscript() { - - cp "$_where"/linux"$_basever"-tkg-patches/* "$_where" # copy patches inside the PKGBUILD's dir to preserve makepkg sourcing and md5sum checking - cp "$_where"/linux"$_basever"-tkg-config/* "$_where" # copy config files and hooks inside the PKGBUILD's dir to preserve makepkg sourcing and md5sum checking - - # Load external configuration file if present. Available variable values will overwrite customization.cfg ones. - if [ -e "$_EXT_CONFIG_PATH" ]; then - source "$_EXT_CONFIG_PATH" && msg2 "External configuration file $_EXT_CONFIG_PATH will be used to override customization.cfg values." && msg2 "" - fi - - if [ -z "$_OPTIPROFILE" ] && [ ! -e "$_where"/cpuschedset ]; then - # Prompt about optimized configurations. Available variable values will overwrite customization.cfg/external config ones. - plain "Do you want to use a predefined optimized profile?" 
- read -rp "`echo $' > 1.Custom\n 2.Ryzen Desktop (Performance)\n 3.Other Desktop (Performance)\nchoice[1-3?]: '`" _OPTIPROFILE; - fi - if [ "$_OPTIPROFILE" = "2" ]; then - source "$_where"/ryzen-desktop-profile.cfg && msg2 "Ryzen Desktop (Performance) profile will be used." && msg2 "" - elif [ "$_OPTIPROFILE" = "3" ]; then - source "$_where"/generic-desktop-profile.cfg && msg2 "Generic Desktop (Performance) profile will be used." && msg2 "" - fi - - # source cpuschedset early if present - if [ -e "$_where"/cpuschedset ]; then - source "$_where"/cpuschedset - fi - - # CPU SCHED selector - if [ -z "$_cpusched" ] && [ ! -e "$_where"/cpuschedset ]; then - plain "What CPU sched variant do you want to build/install?" - read -rp "`echo $' > 1.Project C / PDS\n 2.Project C / BMQ\n 3.MuQSS\n 4.CFS\nchoice[1-4?]: '`" CONDITION; - if [ "$CONDITION" = "2" ]; then - echo "_cpusched=\"bmq\"" > "$_where"/cpuschedset - elif [ "$CONDITION" = "3" ]; then - echo "_cpusched=\"MuQSS\"" > "$_where"/cpuschedset - elif [ "$CONDITION" = "4" ]; then - echo "_cpusched=\"cfs\"" > "$_where"/cpuschedset - else - echo "_cpusched=\"pds\"" > "$_where"/cpuschedset - fi - if [ -n "$_custom_pkgbase" ]; then - echo "_custom_pkgbase=\"${_custom_pkgbase}\"" >> "$_where"/cpuschedset - fi - elif [ "$_cpusched" = "muqss" ] || [ "$_cpusched" = "MuQSS" ]; then - echo "_cpusched=\"MuQSS\"" > "$_where"/cpuschedset - elif [ "$_cpusched" = "pds" ]; then - echo "_cpusched=\"pds\"" > "$_where"/cpuschedset - elif [ "$_cpusched" = "cfs" ]; then - echo "_cpusched=\"cfs\"" > "$_where"/cpuschedset - elif [ "$_cpusched" = "bmq" ]; then - echo "_cpusched=\"bmq\"" > "$_where"/cpuschedset - else - if [ "$_nofallback" != "true" ]; then - warning "Something is wrong with your cpusched selection. Do you want to fallback to CFS (default)?" 
- read -rp "`echo $' > N/y : '`" _fallback; - fi - if [[ "$_fallback" =~ [yY] ]] || [ "$_nofallback" = "true" ]; then - echo "_cpusched=\"cfs\"" > "$_where"/cpuschedset - else - error "Exiting..." - exit 1 - fi - fi - - source "$_where"/cpuschedset -} - -user_patcher() { - # To patch the user because all your base are belong to us - local _patches=("$_where"/*."${_userpatch_ext}revert") - if [ ${#_patches[@]} -ge 2 ] || [ -e "${_patches}" ]; then - if [ "$_user_patches_no_confirm" != "true" ]; then - msg2 "Found ${#_patches[@]} 'to revert' userpatches for ${_userpatch_target}:" - printf '%s\n' "${_patches[@]}" - read -rp "Do you want to install it/them? - Be careful with that ;)"$'\n> N/y : ' _CONDITION; - fi - if [[ "$_CONDITION" =~ [yY] ]] || [ "$_user_patches_no_confirm" = "true" ]; then - for _f in "${_patches[@]}"; do - if [ -e "${_f}" ]; then - msg2 "######################################################" - msg2 "" - msg2 "Reverting your own ${_userpatch_target} patch ${_f}" - msg2 "" - msg2 "######################################################" - patch -Np1 -R < "${_f}" - echo "Reverted your own patch ${_f}" >> "$_where"/last_build_config.log - fi - done - fi - fi - - _patches=("$_where"/*."${_userpatch_ext}patch") - if [ ${#_patches[@]} -ge 2 ] || [ -e "${_patches}" ]; then - if [ "$_user_patches_no_confirm" != "true" ]; then - msg2 "Found ${#_patches[@]} userpatches for ${_userpatch_target}:" - printf '%s\n' "${_patches[@]}" - read -rp "Do you want to install it/them? 
- Be careful with that ;)"$'\n> N/y : ' _CONDITION; - fi - if [[ "$_CONDITION" =~ [yY] ]] || [ "$_user_patches_no_confirm" = "true" ]; then - for _f in "${_patches[@]}"; do - if [ -e "${_f}" ]; then - msg2 "######################################################" - msg2 "" - msg2 "Applying your own ${_userpatch_target} patch ${_f}" - msg2 "" - msg2 "######################################################" - patch -Np1 < "${_f}" - echo "Applied your own patch ${_f}" >> "$_where"/last_build_config.log - fi - done - fi - fi -} - -_tkg_srcprep() { - - if [ "${_distro}" = "Arch" ]; then - msg2 "Setting version..." - scripts/setlocalversion --save-scmversion - echo "-$pkgrel-tkg-${_cpusched}" > localversion.10-pkgrel - echo "" > localversion.20-pkgname - - # add upstream patch - msg2 "Patching from $_basekernel to $pkgver" - patch -p1 -i "$srcdir"/patch-"${pkgver}" - - # ARCH Patches - if [ "${_configfile}" = "config_hardened.x86_64" ] && [ "${_cpusched}" = "cfs" ]; then - msg2 "Using linux hardened patchset" - patch -Np1 -i "$srcdir"/0012-linux-hardened.patch - else - patch -Np1 -i "$srcdir"/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch - fi - fi - - # graysky's cpu opts - https://github.com/graysky2/kernel_gcc_patch - msg2 "Applying graysky's cpu opts patch" - if [ "${_distro}" = "Arch" ]; then - patch -Np1 -i "$srcdir"/enable_additional_cpu_optimizations_for_gcc_v10.1%2B_kernel_v5.8%2B.patch - else - patch -Np1 -i "$srcdir"/enable_additional_cpu_optimizations_for_gcc_v10.1+_kernel_v5.8+.patch - fi - - # TkG - msg2 "Applying clear linux patches" - patch -Np1 -i "$srcdir"/0002-clear-patches.patch - - msg2 "Applying glitched base patch" - patch -Np1 -i "$srcdir"/0003-glitched-base.patch - - if [ -z $_misc_adds ]; then - plain "Enable misc additions ? May contain temporary fixes pending upstream or changes that can break on non-Arch. 
" - read -rp "`echo $' > [Y]/n : '`" _interactive_misc_adds; - if [ "$_interactive_misc_adds" != "n" ] && [ "$_interactive_misc_adds" != "N" ]; then - _misc_adds="true" - fi - fi - - if [ "$_misc_adds" = "true" ]; then - msg2 "Applying misc additions patch" - patch -Np1 -i "$srcdir"/0012-misc-additions.patch - fi - - if [ "${_cpusched}" = "MuQSS" ]; then - # MuQSS - msg2 "Applying MuQSS base patch" - patch -Np1 -i "$srcdir"/0004-5.9-ck1.patch - - if [ "${_aggressive_ondemand}" = "true" ]; then - msg2 "Applying MuQSS agressive ondemand governor patch" - patch -Np1 -i "$srcdir"/0004-glitched-ondemand-muqss.patch - fi - - msg2 "Applying Glitched MuQSS patch" - patch -Np1 -i "$srcdir"/0004-glitched-muqss.patch - - elif [ "${_cpusched}" = "pds" ]; then - # PDS-mq - msg2 "Applying PDS base patch" - patch -Np1 -i "$srcdir"/0009-prjc_v5.9-r0.patch - - if [ "${_aggressive_ondemand}" = "true" ]; then - msg2 "Applying PDS agressive ondemand governor patch" - patch -Np1 -i "$srcdir"/0009-glitched-ondemand-bmq.patch - fi - - msg2 "Applying Glitched PDS patch" - patch -Np1 -i "$srcdir"/0005-glitched-pds.patch - - elif [ "${_cpusched}" = "bmq" ]; then - # Project C / BMQ - msg2 "Applying Project C / BMQ base patch" - - patch -Np1 -i "$srcdir"/0009-prjc_v5.9-r0.patch - - if [ "${_aggressive_ondemand}" = "true" ]; then - msg2 "Applying BMQ agressive ondemand governor patch" - patch -Np1 -i "$srcdir"/0009-glitched-ondemand-bmq.patch - fi - - msg2 "Applying Glitched BMQ patch" - patch -Np1 -i "$srcdir"/0009-glitched-bmq.patch - - elif [ "${_cpusched}" = "cfs" ]; then - msg2 "Applying Glitched CFS patch" - patch -Np1 -i "$srcdir"/0003-glitched-cfs.patch - fi - - if [ "${_distro}" = "Arch" ]; then - if [ -z "${_configfile}" ]; then - _configfile="config.x86_64" - fi - - cat "${srcdir}/${_configfile}" > ./.config - fi - - - # Set some -tkg defaults - echo "# CONFIG_DYNAMIC_FAULT is not set" >> ./.config - sed -i -e 's/CONFIG_DEFAULT_FQ_CODEL=y/# CONFIG_DEFAULT_FQ_CODEL is not set/' 
./.config - echo "CONFIG_DEFAULT_CAKE=y" >> ./.config - echo "CONFIG_NR_TTY_DEVICES=63" >> ./.config - echo "# CONFIG_NTP_PPS is not set" >> ./.config - sed -i -e 's/CONFIG_CRYPTO_LZ4=m/CONFIG_CRYPTO_LZ4=y/' ./.config - sed -i -e 's/CONFIG_CRYPTO_LZ4HC=m/CONFIG_CRYPTO_LZ4HC=y/' ./.config - sed -i -e 's/CONFIG_LZ4_COMPRESS=m/CONFIG_LZ4_COMPRESS=y/' ./.config - sed -i -e 's/CONFIG_LZ4HC_COMPRESS=m/CONFIG_LZ4HC_COMPRESS=y/' ./.config - sed -i -e 's/CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZO=y/# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZO is not set/' ./.config - sed -i -e 's/# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZ4 is not set/CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZ4=y/' ./.config - sed -i -e 's/CONFIG_ZSWAP_COMPRESSOR_DEFAULT="lzo"/CONFIG_ZSWAP_COMPRESSOR_DEFAULT="lz4"/' ./.config - sed -i -e 's/CONFIG_RCU_BOOST_DELAY=500/CONFIG_RCU_BOOST_DELAY=0/' ./.config - sed -i -e 's/# CONFIG_CMDLINE_BOOL is not set/CONFIG_CMDLINE_BOOL=y/' ./.config - echo "CONFIG_CMDLINE=\"${_custom_commandline}\"" >> ./.config - echo "# CONFIG_CMDLINE_OVERRIDE is not set" >> ./.config - echo "# CONFIG_X86_P6_NOP is not set" >> ./.config - - # openrgb - echo "CONFIG_I2C_NCT6775=m" >> ./.config - - # ccache fix - if [ "$_noccache" != "true" ]; then - if { [ "$_distro" = "Arch" ] && pacman -Qq ccache &> /dev/null; } || { [ "$_distro" = "Ubuntu" ] && dpkg -l ccache > /dev/null; }; then - sed -i -e 's/CONFIG_GCC_PLUGINS=y/# CONFIG_GCC_PLUGINS is not set/' ./.config - fi - fi - # Skip dbg package creation on non-Arch - if [ "$_distro" != "Arch" ]; then - sed -i -e 's/CONFIG_DEBUG_INFO.*/CONFIG_DEBUG_INFO=n/' ./.config - fi - - if [ "$_font_autoselect" != "false" ]; then - sed -i -e 's/CONFIG_FONT_TER16x32=y/# CONFIG_FONT_TER16x32 is not set\nCONFIG_FONT_AUTOSELECT=y/' ./.config - fi - - # Inject cpuopts options - echo "# CONFIG_MK8SSE3 is not set" >> ./.config - echo "# CONFIG_MK10 is not set" >> ./.config - echo "# CONFIG_MBARCELONA is not set" >> ./.config - echo "# CONFIG_MBOBCAT is not set" >> ./.config - echo "# 
CONFIG_MJAGUAR is not set" >> ./.config - echo "# CONFIG_MBULLDOZER is not set" >> ./.config - echo "# CONFIG_MPILEDRIVER is not set" >> ./.config - echo "# CONFIG_MSTEAMROLLER is not set" >> ./.config - echo "# CONFIG_MEXCAVATOR is not set" >> ./.config - echo "# CONFIG_MZEN is not set" >> ./.config - echo "# CONFIG_MZEN2 is not set" >> ./.config - echo "# CONFIG_MATOM is not set" >> ./.config - echo "# CONFIG_MNEHALEM is not set" >> ./.config - echo "# CONFIG_MWESTMERE is not set" >> ./.config - echo "# CONFIG_MSILVERMONT is not set" >> ./.config - echo "# CONFIG_MSANDYBRIDGE is not set" >> ./.config - echo "# CONFIG_MIVYBRIDGE is not set" >> ./.config - echo "# CONFIG_MHASWELL is not set" >> ./.config - echo "# CONFIG_MBROADWELL is not set" >> ./.config - echo "# CONFIG_MSKYLAKE is not set" >> ./.config - echo "# CONFIG_MSKYLAKEX is not set" >> ./.config - echo "# CONFIG_MCANNONLAKE is not set" >> ./.config - echo "# CONFIG_MICELAKE is not set" >> ./.config - echo "# CONFIG_MGOLDMONT is not set" >> ./.config - echo "# CONFIG_MGOLDMONTPLUS is not set" >> ./.config - echo "# CONFIG_MCASCADELAKE is not set" >> ./.config - echo "# CONFIG_MCOOPERLAKE is not set" >> ./.config - echo "# CONFIG_MTIGERLAKE is not set" >> ./.config - - # Disable some debugging - if [ "${_debugdisable}" = "true" ]; then - sed -i -e 's/CONFIG_SLUB_DEBUG=y/# CONFIG_SLUB_DEBUG is not set/' ./.config - sed -i -e 's/CONFIG_PM_DEBUG=y/# CONFIG_PM_DEBUG is not set/' ./.config - sed -i -e 's/CONFIG_PM_ADVANCED_DEBUG=y/# CONFIG_PM_ADVANCED_DEBUG is not set/' ./.config - sed -i -e 's/CONFIG_PM_SLEEP_DEBUG=y/# CONFIG_PM_SLEEP_DEBUG is not set/' ./.config - sed -i -e 's/CONFIG_ACPI_DEBUG=y/# CONFIG_ACPI_DEBUG is not set/' ./.config - sed -i -e 's/CONFIG_SCHED_DEBUG=y/# CONFIG_SCHED_DEBUG is not set/' ./.config - sed -i -e 's/CONFIG_LATENCYTOP=y/# CONFIG_LATENCYTOP is not set/' ./.config - sed -i -e 's/CONFIG_DEBUG_PREEMPT=y/# CONFIG_DEBUG_PREEMPT is not set/' ./.config - fi - - if [ "${_cpusched}" = 
"MuQSS" ]; then - # MuQSS default config - echo "CONFIG_SCHED_MUQSS=y" >> ./.config - elif [ "${_cpusched}" = "pds" ]; then - # PDS default config - echo "CONFIG_SCHED_ALT=y" >> ./.config - echo "CONFIG_SCHED_PDS=y" >> ./.config - echo "# CONFIG_SCHED_BMQ is not set" >> ./.config - elif [ "${_cpusched}" = "bmq" ]; then - # BMQ default config - echo "CONFIG_SCHED_ALT=y" >> ./.config - echo "CONFIG_SCHED_BMQ=y" >> ./.config - echo "# CONFIG_SCHED_PDS is not set" >> ./.config - fi - - if [ "${_cpusched}" = "MuQSS" ] || [ "${_cpusched}" = "pds" ] || [ "${_cpusched}" = "bmq" ]; then - # Disable CFS - sed -i -e 's/CONFIG_FAIR_GROUP_SCHED=y/# CONFIG_FAIR_GROUP_SCHED is not set/' ./.config - sed -i -e 's/CONFIG_CFS_BANDWIDTH=y/# CONFIG_CFS_BANDWIDTH is not set/' ./.config - # sched yield type - if [ -n "$_sched_yield_type" ]; then - CONDITION0="$_sched_yield_type" - else - plain "" - plain "CPU sched_yield_type - Choose what sort of yield sched_yield will perform." - plain "" - plain "For PDS and MuQSS:" - plain "0: No yield." - plain "1: Yield only to better priority/deadline tasks." - plain "2: Expire timeslice and recalculate deadline." - plain "" - plain "For BMQ (experimental) - No recommended value yet, so try for yourself x) :" - plain "0: No yield." - plain "1: Deboost and requeue task. (default)" - plain "2: Set rq skip task." - if [ "${_cpusched}" = "MuQSS" ]; then - read -rp "`echo $'\n 0. Supposedly best option for gaming performance - could lead to stability issues on some (AMD) platforms when combined with MuQSS\n > 1. Default and recommended option for MuQSS - could lead to stability issues on some (Intel) platforms\n 2. Can be a good option with low rr_interval on MuQSS\n [0-2?]: '`" CONDITION0; - else - read -rp "`echo $'\n > 0. Recommended option for gaming on PDS - "tkg" default\n 1. Default, but can lead to stability issues on some platforms\n 2. 
Can be a good option with low rr_interval on MuQSS\n [0-2?]: '`" CONDITION0; - fi - fi - if [ "$CONDITION0" = "0" ]; then - if [ "${_cpusched}" = "bmq" ] || [ "${_cpusched}" = "pds" ]; then - sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 0;/' ./kernel/sched/alt_core.c - else - sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 0;/' ./kernel/sched/"${_cpusched}".c - fi - elif [ "$CONDITION0" = "1" ]; then - msg2 "Using default CPU sched yield type (1)" - elif [ "$CONDITION0" = "2" ]; then - if [ "${_cpusched}" = "bmq" ] || [ "${_cpusched}" = "pds" ]; then - sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 2;/' ./kernel/sched/alt_core.c - else - sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 2;/' ./kernel/sched/"${_cpusched}".c - fi - else - if [ "${_cpusched}" = "MuQSS" ]; then - msg2 "Using default CPU sched yield type (1)" - elif [ "${_cpusched}" = "bmq" ] || [ "${_cpusched}" = "pds" ]; then - sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 0;/' ./kernel/sched/alt_core.c - else - sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 0;/' ./kernel/sched/"${_cpusched}".c - fi - fi - fi - - # Round Robin interval - if [ "${_cpusched}" = "MuQSS" ] || [ "${_cpusched}" = "pds" ] || [ "${_cpusched}" = "bmq" ]; then - if [ -n "$_rr_interval" ]; then - CONDITION1="$_rr_interval" - else - plain "" - plain "Round Robin interval is the longest duration two tasks with the same nice level will" - plain "be delayed for. When CPU time is requested by a task, it receives a time slice equal" - plain "to the rr_interval in addition to a virtual deadline. When using yield_type 2, a low" - plain "value can help offset the disadvantages of rescheduling a process that has yielded." 
- plain "" - plain "MuQSS default: 6ms" - plain "PDS default: 4ms" - plain "BMQ default: 2ms" - read -rp "`echo $'\n > 0.Keep defaults\n 1.2ms\n 2.4ms\n 3.6ms\n 4.8ms\n [0-4?]: '`" CONDITION1; - fi - if [ "$CONDITION1" = "1" ]; then - msg2 "Using 2ms rr_interval" - _rrvalue="2" - elif [ "$CONDITION1" = "2" ]; then - msg2 "Using 4ms rr_interval" - _rrvalue="4" - elif [ "$CONDITION1" = "3" ]; then - msg2 "Using 6ms rr_interval" - _rrvalue="6" - elif [ "$CONDITION1" = "4" ]; then - msg2 "Using 8ms rr_interval" - _rrvalue="8" - else - msg2 "Using default rr_interval" - _rrvalue="default" - fi - if [ "$_rrvalue" != "default" ]; then - if [ "${_cpusched}" = "MuQSS" ]; then - sed -i -e "s/int rr_interval __read_mostly = 6;/int rr_interval __read_mostly = ${_rrvalue};/" ./kernel/sched/"${_cpusched}".c - elif [ "${_cpusched}" = "bmq" ] || [ "${_cpusched}" = "pds" ]; then - sed -i -e "s/u64 sched_timeslice_ns __read_mostly = (4 * 1000 * 1000);/u64 sched_timeslice_ns __read_mostly = (${_rrvalue} * 1000 * 1000);/" ./kernel/sched/alt_core.c - fi - else - if [ "${_cpusched}" = "bmq" ] || [ "${_cpusched}" = "pds" ]; then - sed -i -e "s/u64 sched_timeslice_ns __read_mostly = (4 * 1000 * 1000);/u64 sched_timeslice_ns __read_mostly = (2 * 1000 * 1000);/" ./kernel/sched/alt_core.c - fi - fi - fi - - # zenify - if [ "$_zenify" = "true" ]; then - echo "CONFIG_ZENIFY=y" >> ./.config - elif [ "$_zenify" = "false" ]; then - echo "# CONFIG_ZENIFY is not set" >> ./.config - fi - - # compiler optimization level - if [ "$_compileroptlevel" = "1" ]; then - echo "# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 is not set" >> ./.config - elif [ "$_compileroptlevel" = "2" ]; then - sed -i -e 's/CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y/# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE is not set/' ./.config - echo "CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y" >> ./.config - elif [ "$_compileroptlevel" = "3" ]; then - sed -i -e 's/CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y/# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE is not set/' ./.config 
- sed -i -e 's/# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set/CONFIG_CC_OPTIMIZE_FOR_SIZE=y/' ./.config - echo "# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 is not set" >> ./.config - fi - - # cpu opt - if [ -n "$_processor_opt" ] && [ "$_processor_opt" != "native" ]; then - echo "# CONFIG_MNATIVE is not set" >> ./.config - fi - - if [ -n "$_processor_opt" ] && [ "$_processor_opt" != "generic" ]; then - sed -i -e 's/CONFIG_GENERIC_CPU=y/# CONFIG_GENERIC_CPU is not set/' ./.config - fi - - if [ "$_processor_opt" = "native" ]; then - echo "CONFIG_MNATIVE=y" >> ./.config - elif [ "$_processor_opt" = "k8" ]; then - sed -i -e 's/# CONFIG_MK8 is not set/CONFIG_MK8=y/' ./.config - elif [ "$_processor_opt" = "k8sse3" ]; then - sed -i -e 's/# CONFIG_MK8SSE3 is not set/CONFIG_MK8SSE3=y/' ./.config - elif [ "$_processor_opt" = "k10" ]; then - sed -i -e 's/# CONFIG_MK10 is not set/CONFIG_MK10=y/' ./.config - elif [ "$_processor_opt" = "barcelona" ]; then - sed -i -e 's/# CONFIG_MBARCELONA is not set/CONFIG_MBARCELONA=y/' ./.config - elif [ "$_processor_opt" = "bobcat" ]; then - sed -i -e 's/# CONFIG_MBOBCAT is not set/CONFIG_MBOBCAT=y/' ./.config - elif [ "$_processor_opt" = "jaguar" ]; then - sed -i -e 's/# CONFIG_MJAGUAR is not set/CONFIG_MJAGUAR=y/' ./.config - elif [ "$_processor_opt" = "bulldozer" ]; then - sed -i -e 's/# CONFIG_MBULLDOZER is not set/CONFIG_MBULLDOZER=y/' ./.config - elif [ "$_processor_opt" = "piledriver" ]; then - sed -i -e 's/# CONFIG_MPILEDRIVER is not set/CONFIG_MPILEDRIVER=y/' ./.config - elif [ "$_processor_opt" = "steamroller" ]; then - sed -i -e 's/# CONFIG_MSTEAMROLLER is not set/CONFIG_MSTEAMROLLER=y/' ./.config - elif [ "$_processor_opt" = "excavator" ]; then - sed -i -e 's/# CONFIG_MEXCAVATOR is not set/CONFIG_MEXCAVATOR=y/' ./.config - elif [ "$_processor_opt" = "zen" ]; then - sed -i -e 's/# CONFIG_MZEN is not set/CONFIG_MZEN=y/' ./.config - elif [ "$_processor_opt" = "zen2" ]; then - sed -i -e 's/# CONFIG_MZEN2 is not set/CONFIG_MZEN2=y/' ./.config - 
elif [ "$_processor_opt" = "mpsc" ]; then - sed -i -e 's/# CONFIG_MPSC is not set/CONFIG_MPSC=y/' ./.config - elif [ "$_processor_opt" = "atom" ]; then - sed -i -e 's/# CONFIG_MATOM is not set/CONFIG_MATOM=y/' ./.config - elif [ "$_processor_opt" = "core2" ]; then - sed -i -e 's/# CONFIG_MCORE2 is not set/CONFIG_MCORE2=y/' ./.config - elif [ "$_processor_opt" = "nehalem" ]; then - sed -i -e 's/# CONFIG_MNEHALEM is not set/CONFIG_MNEHALEM=y/' ./.config - elif [ "$_processor_opt" = "westmere" ]; then - sed -i -e 's/# CONFIG_MWESTMERE is not set/CONFIG_MWESTMERE=y/' ./.config - elif [ "$_processor_opt" = "silvermont" ]; then - sed -i -e 's/# CONFIG_MSILVERMONT is not set/CONFIG_MSILVERMONT=y/' ./.config - elif [ "$_processor_opt" = "sandybridge" ]; then - sed -i -e 's/# CONFIG_MSANDYBRIDGE is not set/CONFIG_MSANDYBRIDGE=y/' ./.config - elif [ "$_processor_opt" = "ivybridge" ]; then - sed -i -e 's/# CONFIG_MIVYBRIDGE is not set/CONFIG_MIVYBRIDGE=y/' ./.config - elif [ "$_processor_opt" = "haswell" ]; then - sed -i -e 's/# CONFIG_MHASWELL is not set/CONFIG_MHASWELL=y/' ./.config - elif [ "$_processor_opt" = "broadwell" ]; then - sed -i -e 's/# CONFIG_MBROADWELL is not set/CONFIG_MBROADWELL=y/' ./.config - elif [ "$_processor_opt" = "skylake" ]; then - sed -i -e 's/# CONFIG_MSKYLAKE is not set/CONFIG_MSKYLAKE=y/' ./.config - elif [ "$_processor_opt" = "skylakex" ]; then - sed -i -e 's/# CONFIG_MSKYLAKEX is not set/CONFIG_MSKYLAKEX=y/' ./.config - elif [ "$_processor_opt" = "cannonlake" ]; then - sed -i -e 's/# CONFIG_MCANNONLAKE is not set/CONFIG_MCANNONLAKE=y/' ./.config - elif [ "$_processor_opt" = "icelake" ]; then - sed -i -e 's/# CONFIG_MICELAKE is not set/CONFIG_MICELAKE=y/' ./.config - elif [ "$_processor_opt" = "goldmont" ]; then - sed -i -e 's/# CONFIG_MGOLDMONT is not set/CONFIG_MGOLDMONT=y/' ./.config - elif [ "$_processor_opt" = "goldmontplus" ]; then - sed -i -e 's/# CONFIG_MGOLDMONTPLUS is not set/CONFIG_MGOLDMONTPLUS=y/' ./.config - elif [ 
"$_processor_opt" = "cascadelake" ]; then - sed -i -e 's/# CONFIG_MCASCADELAKE is not set/CONFIG_MCASCADELAKE=y/' ./.config - elif [ "$_processor_opt" = "cooperlake" ]; then - sed -i -e 's/# CONFIG_MCOOPERLAKE is not set/CONFIG_MCOOPERLAKE=y/' ./.config - elif [ "$_processor_opt" = "tigerlake" ]; then - sed -i -e 's/# CONFIG_MTIGERLAKE is not set/CONFIG_MTIGERLAKE=y/' ./.config - fi - - # irq threading - if [ "$_irq_threading" = "true" ]; then - echo "CONFIG_FORCE_IRQ_THREADING=y" >> ./.config - elif [ "$_irq_threading" = "false" ]; then - echo "# CONFIG_FORCE_IRQ_THREADING is not set" >> ./.config - fi - - # smt nice - if [ "$_smt_nice" = "true" ]; then - echo "CONFIG_SMT_NICE=y" >> ./.config - elif [ "$_smt_nice" = "false" ]; then - echo "# CONFIG_SMT_NICE is not set" >> ./.config - fi - - # random trust cpu - if [ "$_random_trust_cpu" = "true" ]; then - sed -i -e 's/# CONFIG_RANDOM_TRUST_CPU is not set/CONFIG_RANDOM_TRUST_CPU=y/' ./.config - fi - - # rq sharing - if [ "$_runqueue_sharing" = "none" ]; then - echo -e "CONFIG_RQ_NONE=y\n# CONFIG_RQ_SMT is not set\n# CONFIG_RQ_MC is not set\n# CONFIG_RQ_MC_LLC is not set\n# CONFIG_RQ_SMP is not set\n# CONFIG_RQ_ALL is not set" >> ./.config - elif [ -z "$_runqueue_sharing" ] || [ "$_runqueue_sharing" = "smt" ]; then - echo -e "# CONFIG_RQ_NONE is not set\nCONFIG_RQ_SMT=y\n# CONFIG_RQ_MC is not set\n# CONFIG_RQ_MC_LLC is not set\n# CONFIG_RQ_SMP is not set\n# CONFIG_RQ_ALL is not set" >> ./.config - elif [ "$_runqueue_sharing" = "mc" ]; then - echo -e "# CONFIG_RQ_NONE is not set\n# CONFIG_RQ_SMT is not set\nCONFIG_RQ_MC=y\n# CONFIG_RQ_MC_LLC is not set\n# CONFIG_RQ_SMP is not set\n# CONFIG_RQ_ALL is not set" >> ./.config - elif [ "$_runqueue_sharing" = "smp" ]; then - echo -e "# CONFIG_RQ_NONE is not set\n# CONFIG_RQ_SMT is not set\n# CONFIG_RQ_MC is not set\n# CONFIG_RQ_MC_LLC is not set\nCONFIG_RQ_SMP=y\n# CONFIG_RQ_ALL is not set" >> ./.config - elif [ "$_runqueue_sharing" = "all" ]; then - echo -e "# 
CONFIG_RQ_NONE is not set\n# CONFIG_RQ_SMT is not set\n# CONFIG_RQ_MC is not set\n# CONFIG_RQ_MC_LLC is not set\n# CONFIG_RQ_SMP is not set\nCONFIG_RQ_ALL=y" >> ./.config - elif [ "$_runqueue_sharing" = "mc-llc" ]; then - echo -e "# CONFIG_RQ_NONE is not set\n# CONFIG_RQ_SMT is not set\n# CONFIG_RQ_MC is not set\nCONFIG_RQ_MC_LLC=y\n# CONFIG_RQ_SMP is not set\n# CONFIG_RQ_ALL is not set" >> ./.config - fi - - # timer freq - if [ -n "$_timer_freq" ] && [ "$_timer_freq" != "300" ]; then - sed -i -e 's/CONFIG_HZ_300=y/# CONFIG_HZ_300 is not set/' ./.config - sed -i -e 's/CONFIG_HZ_300_NODEF=y/# CONFIG_HZ_300_NODEF is not set/' ./.config - if [ "$_timer_freq" = "1000" ]; then - sed -i -e 's/# CONFIG_HZ_1000 is not set/CONFIG_HZ_1000=y/' ./.config - sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=1000/' ./.config - echo "# CONFIG_HZ_500 is not set" >> ./.config - echo "# CONFIG_HZ_500_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_750 is not set" >> ./.config - echo "# CONFIG_HZ_750_NODEF is not set" >> ./.config - echo "CONFIG_HZ_1000_NODEF=y" >> ./.config - echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config - elif [ "$_timer_freq" = "750" ]; then - sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=750/' ./.config - echo "# CONFIG_HZ_500 is not set" >> ./.config - echo "# CONFIG_HZ_500_NODEF is not set" >> ./.config - echo "CONFIG_HZ_750=y" >> ./.config - echo "CONFIG_HZ_750_NODEF=y" >> ./.config - echo "# CONFIG_HZ_1000_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config - elif [ "$_timer_freq" = "500" ]; then - sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=500/' ./.config - echo "CONFIG_HZ_500=y" >> ./.config - echo "CONFIG_HZ_500_NODEF=y" >> ./.config - echo "# CONFIG_HZ_750 is not set" >> ./.config - echo "# CONFIG_HZ_750_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_1000_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_250_NODEF is not set" 
>> ./.config - echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config - elif [ "$_timer_freq" = "100" ]; then - sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=100/' ./.config - echo "# CONFIG_HZ_500 is not set" >> ./.config - echo "# CONFIG_HZ_750 is not set" >> ./.config - echo "# CONFIG_HZ_1000_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_750_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_500_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config - echo "CONFIG_HZ_100=y" >> ./.config - echo "CONFIG_HZ_100_NODEF=y" >> ./.config - fi - elif [ "${_cpusched}" = "MuQSS" ] && [ -z "$_timer_freq" ]; then - sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=100/' ./.config - echo "# CONFIG_HZ_500 is not set" >> ./.config - echo "# CONFIG_HZ_750 is not set" >> ./.config - echo "# CONFIG_HZ_1000_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_750_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_500_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config - echo "CONFIG_HZ_100=y" >> ./.config - echo "CONFIG_HZ_100_NODEF=y" >> ./.config - else - sed -i -e 's/CONFIG_HZ_300=y/# CONFIG_HZ_300 is not set/' ./.config - sed -i -e 's/CONFIG_HZ_300_NODEF=y/# CONFIG_HZ_300_NODEF is not set/' ./.config - sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=500/' ./.config - echo "CONFIG_HZ_500=y" >> ./.config - echo "CONFIG_HZ_500_NODEF=y" >> ./.config - echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config - echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config - fi - - # default cpu gov - if [ "$_default_cpu_gov" = "performance" ]; then - sed -i -e 's/CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL=y/# CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL is not set/' ./.config - sed -i -e 's/# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set/CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y/' ./.config - elif [ "$_default_cpu_gov" = "ondemand" ]; then - sed -i -e 
's/CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL=y/# CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL is not set/' ./.config - sed -i -e 's/# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set/CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y/' ./.config - fi - - # ACPI_CPUFREQ disablement - if [ "$_disable_acpi_cpufreq" = "true" ]; then - sed -i -e 's/CONFIG_X86_ACPI_CPUFREQ=m/# CONFIG_X86_ACPI_CPUFREQ is not set/' ./.config - fi - - # ftrace - if [ -z "$_ftracedisable" ]; then - plain "" - plain "Disable FUNCTION_TRACER/GRAPH_TRACER? Lowers overhead but limits debugging" - plain "and analyzing of kernel functions." - read -rp "`echo $' > N/y : '`" CONDITION2; - fi - if [[ "$CONDITION2" =~ [yY] ]] || [ "$_ftracedisable" = "true" ]; then - sed -i -e 's/CONFIG_FUNCTION_TRACER=y/# CONFIG_FUNCTION_TRACER is not set/' ./.config - sed -i -e 's/CONFIG_FUNCTION_GRAPH_TRACER=y/# CONFIG_FUNCTION_GRAPH_TRACER is not set/' ./.config - fi - - # disable numa - if [ -z "$_numadisable" ]; then - plain "" - plain "Disable NUMA? Lowers overhead, but breaks CUDA/NvEnc on Nvidia if disabled." - plain "https://bbs.archlinux.org/viewtopic.php?id=239174" - read -rp "`echo $' > N/y : '`" CONDITION3; - fi - if [[ "$CONDITION3" =~ [yY] ]] || [ "$_numadisable" = "true" ]; then - # disable NUMA since 99.9% of users do not have multiple CPUs but do have multiple cores in one CPU - sed -i -e 's/CONFIG_NUMA=y/# CONFIG_NUMA is not set/' \ - -i -e '/CONFIG_AMD_NUMA=y/d' \ - -i -e '/CONFIG_X86_64_ACPI_NUMA=y/d' \ - -i -e '/CONFIG_NODES_SPAN_OTHER_NODES=y/d' \ - -i -e '/# CONFIG_NUMA_EMU is not set/d' \ - -i -e '/CONFIG_NODES_SHIFT=6/d' \ - -i -e '/CONFIG_NEED_MULTIPLE_NODES=y/d' \ - -i -e '/CONFIG_USE_PERCPU_NUMA_NODE_ID=y/d' \ - -i -e '/CONFIG_ACPI_NUMA=y/d' ./.config - fi - - # tickless - if [ -z "$_tickless" ]; then - plain "" - plain "Use CattaRappa mode (Tickless/Dynticks) ?" - plain "Can give higher performances in many cases but lower consistency on some hardware." 
- plain "Just tickless idle can perform better with some platforms (mostly AMD) or CPU schedulers (mostly MuQSS)." - if [ "${_cpusched}" = "MuQSS" ]; then - read -rp "`echo $'\n 0.No, use periodic ticks\n 1.Yes, full tickless baby!\n > 2.Just tickless idle plz\n [0-2?]: '`" CONDITION4; - else - read -rp "`echo $'\n 0.No, use periodic ticks\n > 1.Yes, full tickless baby!\n 2.Just tickless idle plz\n [0-2?]: '`" CONDITION4; - fi - fi - if [ "$CONDITION4" = "0" ] || [ "$_tickless" = "0" ]; then - echo "# CONFIG_NO_HZ_FULL_NODEF is not set" >> ./.config - sed -i -e 's/# CONFIG_HZ_PERIODIC is not set/CONFIG_HZ_PERIODIC=y/' ./.config - sed -i -e 's/CONFIG_NO_HZ_IDLE=y/# CONFIG_NO_HZ_IDLE is not set/' ./.config - sed -i -e 's/CONFIG_NO_HZ_FULL=y/# CONFIG_NO_HZ_FULL is not set/' ./.config - sed -i -e 's/CONFIG_NO_HZ=y/# CONFIG_NO_HZ is not set/' ./.config - sed -i -e 's/CONFIG_NO_HZ_COMMON=y/# CONFIG_NO_HZ_COMMON is not set/' ./.config - elif [ "$CONDITION4" = "2" ] || [ "$_tickless" = "2" ]; then - echo "# CONFIG_NO_HZ_FULL_NODEF is not set" >> ./.config - sed -i -e 's/CONFIG_HZ_PERIODIC=y/# CONFIG_HZ_PERIODIC is not set/' ./.config - sed -i -e 's/# CONFIG_NO_HZ_IDLE is not set/CONFIG_NO_HZ_IDLE=y/' ./.config - sed -i -e 's/CONFIG_NO_HZ_FULL=y/# CONFIG_NO_HZ_FULL is not set/' ./.config - sed -i -e 's/# CONFIG_NO_HZ is not set/CONFIG_NO_HZ=y/' ./.config - sed -i -e 's/# CONFIG_NO_HZ_COMMON is not set/CONFIG_NO_HZ_COMMON=y/' ./.config - else - if [ "${_cpusched}" = "MuQSS" ]; then - echo "# CONFIG_NO_HZ_FULL_NODEF is not set" >> ./.config - sed -i -e 's/CONFIG_HZ_PERIODIC=y/# CONFIG_HZ_PERIODIC is not set/' ./.config - sed -i -e 's/# CONFIG_NO_HZ_IDLE is not set/CONFIG_NO_HZ_IDLE=y/' ./.config - sed -i -e 's/CONFIG_NO_HZ_FULL=y/# CONFIG_NO_HZ_FULL is not set/' ./.config - sed -i -e 's/# CONFIG_NO_HZ is not set/CONFIG_NO_HZ=y/' ./.config - sed -i -e 's/# CONFIG_NO_HZ_COMMON is not set/CONFIG_NO_HZ_COMMON=y/' ./.config - else - echo "CONFIG_NO_HZ_FULL_NODEF=y" >> ./.config - 
sed -i -e 's/CONFIG_HZ_PERIODIC=y/# CONFIG_HZ_PERIODIC is not set/' ./.config - sed -i -e 's/CONFIG_NO_HZ_IDLE=y/# CONFIG_NO_HZ_IDLE is not set/' ./.config - sed -i -e 's/# CONFIG_NO_HZ_FULL is not set/CONFIG_NO_HZ_FULL=y/' ./.config - sed -i -e 's/# CONFIG_NO_HZ is not set/CONFIG_NO_HZ=y/' ./.config - sed -i -e 's/# CONFIG_NO_HZ_COMMON is not set/CONFIG_NO_HZ_COMMON=y/' ./.config - echo "CONFIG_CONTEXT_TRACKING=y" >> ./.config - echo "# CONFIG_CONTEXT_TRACKING_FORCE is not set" >> ./.config - fi - fi - - # voluntary preempt - if [ -z "$_voluntary_preempt" ]; then - plain "" - plain "Use explicit preemption points?" - plain "It can improve latency on PDS (at the cost of throughput)" - plain "and improve throughput on other schedulers (at the cost of latency)" - read -rp "`echo $' > N/y : '`" CONDITION5; - fi - if [[ "$CONDITION5" =~ [yY] ]] || [ "$_voluntary_preempt" = "true" ]; then - sed -i -e 's/CONFIG_PREEMPT=y/# CONFIG_PREEMPT is not set/' ./.config - sed -i -e 's/CONFIG_PREEMPT_LL=y/# CONFIG_PREEMPT_LL is not set/' ./.config - sed -i -e 's/# CONFIG_PREEMPT_VOLUNTARY is not set/CONFIG_PREEMPT_VOLUNTARY=y/' ./.config - fi - - # Open Firmware support - if [ -z "$_OFenable" ]; then - plain "" - plain "Enable Device Tree and Open Firmware support?" - read -rp "`echo $' > N/y : '`" CONDITION6; - fi - if [[ "$CONDITION6" =~ [yY] ]] || [ "$_OFenable" = "true" ]; then - sed -i -e 's/# CONFIG_OF is not set/CONFIG_OF=y/' ./.config - fi - - # acs override - if [ -z "$_acs_override" ]; then - plain "" - plain "Use ACS override patch?" - plain "https://wiki.archlinux.org/index.php/PCI_passthrough_via_OVMF#Bypassing_the_IOMMU_groups_.28ACS_override_patch.29" - read -rp "`echo $' > N/y : '`" CONDITION7; - fi - if [[ "$CONDITION7" =~ [yY] ]] || [ "$_acs_override" = "true" ]; then - msg2 "Patching ACS override" - patch -Np1 -i "$srcdir"/0006-add-acs-overrides_iommu.patch - fi - - # bcachefs - if [ -z "$_bcachefs" ]; then - plain "" - plain "Add Bcache filesystem support? 
You'll have to install bcachefs-tools-git from AUR for utilities." - plain "https://bcachefs.org/" - read -rp "`echo $' > N/y : '`" CONDITION8; - fi - if [[ "$CONDITION8" =~ [yY] ]] || [ "$_bcachefs" = "true" ]; then - msg2 "Patching Bcache filesystem support override" - patch -Np1 -i "$srcdir"/0008-5.9-bcachefs.patch - echo "CONFIG_BCACHEFS_FS=m" >> ./.config - echo "CONFIG_BCACHEFS_QUOTA=y" >> ./.config - echo "CONFIG_BCACHEFS_POSIX_ACL=y" >> ./.config - echo "# CONFIG_BCACHEFS_DEBUG is not set" >> ./.config - echo "# CONFIG_BCACHEFS_TESTS is not set" >> ./.config - echo "# CONFIG_DEBUG_CLOSURES is not set" >> ./.config - fi - - # fsync support - if [ -z "$_fsync" ]; then - plain "" - plain "Enable support for fsync, an experimental replacement for esync in Valve Proton 4.11+" - plain "https://steamcommunity.com/games/221410/announcements/detail/2957094910196249305" - read -rp "`echo $' > N/y : '`" CONDITION9; - fi - if [[ "$CONDITION9" =~ [yY] ]] || [ "$_fsync" = "true" ]; then - msg2 "Patching Fsync support" - patch -Np1 -i "$srcdir"/0007-v5.9-fsync.patch - fi - - # ZFS fix - if [ -z "$_zfsfix" ]; then - plain "" - plain "Add back missing symbol for AES-NI/AVX support on ZFS" - plain "https://github.com/NixOS/nixpkgs/blob/master/pkgs/os-specific/linux/kernel/export_kernel_fpu_functions_5_3.patch" - read -rp "`echo $' > N/y : '`" CONDITION11; - fi - if [[ "$CONDITION11" =~ [yY] ]] || [ "$_zfsfix" = "true" ]; then - msg2 "Patching missing symbol for AES-NI/AVX support on ZFS" - patch -Np1 -i "$srcdir"/0011-ZFS-fix.patch - fi - - # Community patches - if [ -n "$_community_patches" ]; then - if [ ! -d "$_where/../../community-patches" ]; then - cd "$_where/../.." 
&& git clone https://github.com/Frogging-Family/community-patches.git && cd "${srcdir}/${_srcpath}" - fi - _community_patches=($_community_patches) - for _p in ${_community_patches[@]}; do - ln -s "$_where"/../../community-patches/linux"$_basever"-tkg/$_p "$_where"/ - done - fi - - # userpatches - if [ "$_user_patches" = "true" ]; then - _userpatch_target="linux-${_basekernel}" - _userpatch_ext="my" - user_patcher - fi - - # Community patches removal - for _p in ${_community_patches[@]}; do - rm -f "$_where"/$_p - done - - if [ "$_distro" = "Arch" ]; then - # don't run depmod on 'make install'. We'll do this ourselves in packaging - sed -i '2iexit 0' scripts/depmod.sh - - # get kernel version - make prepare - fi - - # modprobed-db - if [ -z "$_modprobeddb" ]; then - plain "" - plain "Use modprobed db to clean config from unneeded modules?" - plain "Speeds up compilation considerably. Requires root." - plain "https://wiki.archlinux.org/index.php/Modprobed-db" - plain "!!!! Make sure to have a well populated db !!!!" - read -rp "`echo $' > N/y : '`" CONDITIONMPDB; - fi - if [[ "$CONDITIONMPDB" =~ [yY] ]] || [ "$_modprobeddb" = "true" ]; then - sudo modprobed-db recall - yes "" | make localmodconfig - fi - - if [ true = "$_config_fragments" ]; then - local fragments=() - mapfile -d '' -t fragments < <(find "$_where"/ -type f -name "*.myfrag" -print0) - - if [ true = "$_config_fragments_no_confirm" ]; then - printf 'Using config fragment %s\n' "${fragments[@]#$_where/}" - else - for i in "${!fragments[@]}"; do - while true; do - read -r -p 'Found config fragment '"${fragments[$i]#$_where/}"', apply it? 
[y/N] ' CONDITIONMPDB - CONDITIONMPDB="$(printf '%s' "$CONDITIONMPDB" | tr '[:upper:]' '[:lower:]')" - case "$CONDITIONMPDB" in - y|yes) - break;; - n|no|'') - unset fragments[$i] - break;; - *) - echo 'Please answer with yes or no' - esac - done - done - fi - - if [ 0 -lt "${#fragments[@]}" ]; then - scripts/kconfig/merge_config.sh -m .config "${fragments[@]}" - fi - fi - - # menuconfig / nconfig - if [ -z "$_menunconfig" ]; then - plain "" - plain "*Optional* For advanced users - Do you want to use make menuconfig or nconfig" - plain "to configure the kernel before building it?" - plain "If you do, make sure your terminal is currently" - plain "at least 19 lines by 80 columns large or you'll get an error :D" - read -rp "`echo $' > 0. nope\n 1. menuconfig\n 2. nconfig\n 3. xconfig\n choice[0-3?]: '`" CONDITIONMNC; - _menunconfig="$CONDITIONMNC" - fi - if [ 1 = "$_menunconfig" ]; then - cp .config .config.orig - make menuconfig - elif [ 2 = "$_menunconfig" ]; then - cp .config .config.orig - make nconfig - elif [ 3 = "$_menunconfig" ]; then - cp .config .config.orig - make xconfig - else - # rewrite configuration - yes "" | make config >/dev/null - fi - if [ 1 = "$_menunconfig" ] || [ 2 = "$_menunconfig" ] || [ 3 = "$_menunconfig" ]; then - if [ -z "${_diffconfig}" ]; then - while true; do - read -r -p 'Generate a config fragment from your changes? [y/N] ' CONDITIONF - CONDITIONF="$(printf '%s' "$CONDITIONF" | tr '[:upper:]' '[:lower:]')" - case "$CONDITIONF" in - y|yes) - _diffconfig=true - break;; - n|no|'') - _diffconfig=false - break;; - *) - echo 'Please answer with yes or no' - esac - done - fi - if [ true = "$_diffconfig" ]; then - if [ -z "$_diffconfig_name" ]; then - IFS= read -r -p 'Filename for the config fragment [leave empty to not generate fragment]: ' _diffconfig_name - fi - if [ -z "$_diffconfig_name" ]; then - echo 'No file name given, not generating config fragment.' 
- else ( - prev_pwd="${PWD:-$(pwd)}" - cd "$_where" - "${prev_pwd}/scripts/diffconfig" -m "${prev_pwd}/.config.orig" "${prev_pwd}/.config" > "$_diffconfig_name" - ) fi - fi - rm .config.orig - fi - - if [ "$_distro" = "Arch" ]; then - make -s kernelrelease > version - msg2 "Prepared %s version %s" "$pkgbase" "$( -From: Serge Hallyn -Date: Fri, 31 May 2013 19:12:12 +0100 -Subject: [PATCH] add sysctl to disallow unprivileged CLONE_NEWUSER by default - -Signed-off-by: Serge Hallyn -[bwh: Remove unneeded binary sysctl bits] -Signed-off-by: Daniel Micay ---- - kernel/fork.c | 15 +++++++++++++++ - kernel/sysctl.c | 12 ++++++++++++ - kernel/user_namespace.c | 3 +++ - 3 files changed, 30 insertions(+) - -diff --git a/kernel/fork.c b/kernel/fork.c -index 07cc743698d3668e..4011d68a8ff9305c 100644 ---- a/kernel/fork.c -+++ b/kernel/fork.c -@@ -102,6 +102,11 @@ - - #define CREATE_TRACE_POINTS - #include -+#ifdef CONFIG_USER_NS -+extern int unprivileged_userns_clone; -+#else -+#define unprivileged_userns_clone 0 -+#endif - - /* - * Minimum number of threads to boot the kernel -@@ -1555,6 +1560,10 @@ static __latent_entropy struct task_struct *copy_process( - if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) - return ERR_PTR(-EINVAL); - -+ if ((clone_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) -+ if (!capable(CAP_SYS_ADMIN)) -+ return ERR_PTR(-EPERM); -+ - /* - * Thread groups must share signals as well, and detached threads - * can only be started up within the thread group. 
-@@ -2348,6 +2357,12 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) - if (unshare_flags & CLONE_NEWNS) - unshare_flags |= CLONE_FS; - -+ if ((unshare_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) { -+ err = -EPERM; -+ if (!capable(CAP_SYS_ADMIN)) -+ goto bad_unshare_out; -+ } -+ - err = check_unshare_flags(unshare_flags); - if (err) - goto bad_unshare_out; -diff --git a/kernel/sysctl.c b/kernel/sysctl.c -index b86520ed3fb60fbf..f7dab3760839f1a1 100644 ---- a/kernel/sysctl.c -+++ b/kernel/sysctl.c -@@ -105,6 +105,9 @@ extern int core_uses_pid; - - #if defined(CONFIG_SYSCTL) - -+#ifdef CONFIG_USER_NS -+extern int unprivileged_userns_clone; -+#endif - /* Constants used for minimum and maximum */ - #ifdef CONFIG_LOCKUP_DETECTOR - static int sixty = 60; -@@ -513,6 +516,15 @@ static struct ctl_table kern_table[] = { - .proc_handler = proc_dointvec, - }, - #endif -+#ifdef CONFIG_USER_NS -+ { -+ .procname = "unprivileged_userns_clone", -+ .data = &unprivileged_userns_clone, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = proc_dointvec, -+ }, -+#endif - #ifdef CONFIG_PROC_SYSCTL - { - .procname = "tainted", -diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c -index c490f1e4313b998a..dd03bd39d7bf194d 100644 ---- a/kernel/user_namespace.c -+++ b/kernel/user_namespace.c -@@ -24,6 +24,9 @@ - #include - #include - -+/* sysctl */ -+int unprivileged_userns_clone; -+ - static struct kmem_cache *user_ns_cachep __read_mostly; - static DEFINE_MUTEX(userns_state_mutex); - --- -2.15.1 - -From b5202296055dd333db4425120d3f93ef4e6a0573 Mon Sep 17 00:00:00 2001 -From: "Jan Alexander Steffens (heftig)" -Date: Thu, 7 Dec 2017 13:50:48 +0100 -Subject: ZEN: Add CONFIG for unprivileged_userns_clone - -This way our default behavior continues to match the vanilla kernel. 
---- - init/Kconfig | 16 ++++++++++++++++ - kernel/user_namespace.c | 4 ++++ - 2 files changed, 20 insertions(+) - -diff --git a/init/Kconfig b/init/Kconfig -index 4592bf7997c0..f3df02990aff 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -1004,6 +1004,22 @@ config USER_NS - - If unsure, say N. - -+config USER_NS_UNPRIVILEGED -+ bool "Allow unprivileged users to create namespaces" -+ default y -+ depends on USER_NS -+ help -+ When disabled, unprivileged users will not be able to create -+ new namespaces. Allowing users to create their own namespaces -+ has been part of several recent local privilege escalation -+ exploits, so if you need user namespaces but are -+ paranoid^Wsecurity-conscious you want to disable this. -+ -+ This setting can be overridden at runtime via the -+ kernel.unprivileged_userns_clone sysctl. -+ -+ If unsure, say Y. -+ - config PID_NS - bool "PID Namespaces" - default y -diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c -index 6b9dbc257e34..107b17f0d528 100644 ---- a/kernel/user_namespace.c -+++ b/kernel/user_namespace.c -@@ -27,7 +27,11 @@ - #include - - /* sysctl */ -+#ifdef CONFIG_USER_NS_UNPRIVILEGED -+int unprivileged_userns_clone = 1; -+#else - int unprivileged_userns_clone; -+#endif - - static struct kmem_cache *user_ns_cachep __read_mostly; - static DEFINE_MUTEX(userns_state_mutex); diff --git a/linux59-tkg/linux59-tkg-patches/0002-clear-patches.patch b/linux59-tkg/linux59-tkg-patches/0002-clear-patches.patch deleted file mode 100644 index 22a32f5..0000000 --- a/linux59-tkg/linux59-tkg-patches/0002-clear-patches.patch +++ /dev/null @@ -1,360 +0,0 @@ -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Arjan van de Ven -Date: Mon, 14 Mar 2016 11:10:58 -0600 -Subject: [PATCH] pci pme wakeups - -Reduce wakeups for PME checks, which are a workaround for miswired -boards (sadly, too many of them) in laptops. 
---- - drivers/pci/pci.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c -index c9338f9..6974fbf 100644 ---- a/drivers/pci/pci.c -+++ b/drivers/pci/pci.c -@@ -62,7 +62,7 @@ struct pci_pme_device { - struct pci_dev *dev; - }; - --#define PME_TIMEOUT 1000 /* How long between PME checks */ -+#define PME_TIMEOUT 4000 /* How long between PME checks */ - - static void pci_dev_d3_sleep(struct pci_dev *dev) - { --- -https://clearlinux.org - -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Arjan van de Ven -Date: Sat, 19 Mar 2016 21:32:19 -0400 -Subject: [PATCH] intel_idle: tweak cpuidle cstates - -Increase target_residency in cpuidle cstate - -Tune intel_idle to be a bit less agressive; -Clear linux is cleaner in hygiene (wakupes) than the average linux, -so we can afford changing these in a way that increases -performance while keeping power efficiency ---- - drivers/idle/intel_idle.c | 44 +++++++++++++++++++-------------------- - 1 file changed, 22 insertions(+), 22 deletions(-) - -diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c -index f449584..c994d24 100644 ---- a/drivers/idle/intel_idle.c -+++ b/drivers/idle/intel_idle.c -@@ -531,7 +531,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { - .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, - .exit_latency = 10, -- .target_residency = 20, -+ .target_residency = 120, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -539,7 +539,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { - .desc = "MWAIT 0x10", - .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 33, -- .target_residency = 100, -+ .target_residency = 900, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -547,7 +547,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { - .desc = "MWAIT 0x20", - .flags = MWAIT2flg(0x20) | 
CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 133, -- .target_residency = 400, -+ .target_residency = 1000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -555,7 +555,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { - .desc = "MWAIT 0x32", - .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 166, -- .target_residency = 500, -+ .target_residency = 1500, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -563,7 +563,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { - .desc = "MWAIT 0x40", - .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 300, -- .target_residency = 900, -+ .target_residency = 2000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -571,7 +571,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { - .desc = "MWAIT 0x50", - .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 600, -- .target_residency = 1800, -+ .target_residency = 5000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -579,7 +579,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { - .desc = "MWAIT 0x60", - .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 2600, -- .target_residency = 7700, -+ .target_residency = 9000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -599,7 +599,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { - .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, - .exit_latency = 10, -- .target_residency = 20, -+ .target_residency = 120, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -607,7 +607,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { - .desc = "MWAIT 0x10", - .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 40, -- .target_residency = 100, -+ .target_residency = 1000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, 
}, - { -@@ -615,7 +615,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { - .desc = "MWAIT 0x20", - .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 133, -- .target_residency = 400, -+ .target_residency = 1000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -623,7 +623,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { - .desc = "MWAIT 0x32", - .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 166, -- .target_residency = 500, -+ .target_residency = 2000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -631,7 +631,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { - .desc = "MWAIT 0x40", - .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 300, -- .target_residency = 900, -+ .target_residency = 4000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -639,7 +639,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { - .desc = "MWAIT 0x50", - .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 600, -- .target_residency = 1800, -+ .target_residency = 7000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -647,7 +647,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { - .desc = "MWAIT 0x60", - .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 2600, -- .target_residency = 7700, -+ .target_residency = 9000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -668,7 +668,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { - .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, - .exit_latency = 10, -- .target_residency = 20, -+ .target_residency = 120, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -676,7 +676,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { - .desc = "MWAIT 0x10", - .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, - 
.exit_latency = 70, -- .target_residency = 100, -+ .target_residency = 1000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -684,7 +684,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { - .desc = "MWAIT 0x20", - .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 85, -- .target_residency = 200, -+ .target_residency = 600, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -692,7 +692,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { - .desc = "MWAIT 0x33", - .flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 124, -- .target_residency = 800, -+ .target_residency = 3000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -700,7 +700,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { - .desc = "MWAIT 0x40", - .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 200, -- .target_residency = 800, -+ .target_residency = 3200, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -708,7 +708,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { - .desc = "MWAIT 0x50", - .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 480, -- .target_residency = 5000, -+ .target_residency = 9000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -716,7 +716,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { - .desc = "MWAIT 0x60", - .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 890, -- .target_residency = 5000, -+ .target_residency = 9000, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { -@@ -737,7 +737,7 @@ static struct cpuidle_state skx_cstates[] __initdata = { - .desc = "MWAIT 0x01", - .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, - .exit_latency = 10, -- .target_residency = 20, -+ .target_residency = 300, - .enter = &intel_idle, - .enter_s2idle = intel_idle_s2idle, }, - { --- 
-https://clearlinux.org - -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Arjan van de Ven -Date: Fri, 6 Jan 2017 15:34:09 +0000 -Subject: [PATCH] ipv4/tcp: allow the memory tuning for tcp to go a little - bigger than default - ---- - net/ipv4/tcp.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c -index 30c1142..4345075 100644 ---- a/net/ipv4/tcp.c -+++ b/net/ipv4/tcp.c -@@ -4201,8 +4201,8 @@ void __init tcp_init(void) - tcp_init_mem(); - /* Set per-socket limits to no more than 1/128 the pressure threshold */ - limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); -- max_wshare = min(4UL*1024*1024, limit); -- max_rshare = min(6UL*1024*1024, limit); -+ max_wshare = min(16UL*1024*1024, limit); -+ max_rshare = min(16UL*1024*1024, limit); - - init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; - init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; --- -https://clearlinux.org - -From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 -From: Arjan van de Ven -Date: Sun, 18 Feb 2018 23:35:41 +0000 -Subject: [PATCH] locking: rwsem: spin faster - -tweak rwsem owner spinning a bit ---- - kernel/locking/rwsem.c | 4 +++- - 1 file changed, 3 insertions(+), 1 deletion(-) - -diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c -index f11b9bd..1bbfcc1 100644 ---- a/kernel/locking/rwsem.c -+++ b/kernel/locking/rwsem.c -@@ -717,6 +717,7 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) - struct task_struct *new, *owner; - unsigned long flags, new_flags; - enum owner_state state; -+ int i = 0; - - owner = rwsem_owner_flags(sem, &flags); - state = rwsem_owner_state(owner, flags, nonspinnable); -@@ -750,7 +751,8 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) - break; - } - -- cpu_relax(); -+ if (i++ > 1000) -+ cpu_relax(); - } - rcu_read_unlock(); - --- -https://clearlinux.org - -From 0000000000000000000000000000000000000000 Mon 
Sep 17 00:00:00 2001 -From: Arjan van de Ven -Date: Thu, 2 Jun 2016 23:36:32 -0500 -Subject: [PATCH] initialize ata before graphics - -ATA init is the long pole in the boot process, and its asynchronous. -move the graphics init after it so that ata and graphics initialize -in parallel ---- - drivers/Makefile | 15 ++++++++------- - 1 file changed, 8 insertions(+), 7 deletions(-) - -diff --git a/drivers/Makefile b/drivers/Makefile -index c0cd1b9..af1e2fb 100644 ---- a/drivers/Makefile -+++ b/drivers/Makefile -@@ -59,15 +59,8 @@ obj-y += char/ - # iommu/ comes before gpu as gpu are using iommu controllers - obj-y += iommu/ - --# gpu/ comes after char for AGP vs DRM startup and after iommu --obj-y += gpu/ -- - obj-$(CONFIG_CONNECTOR) += connector/ - --# i810fb and intelfb depend on char/agp/ --obj-$(CONFIG_FB_I810) += video/fbdev/i810/ --obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ -- - obj-$(CONFIG_PARPORT) += parport/ - obj-$(CONFIG_NVM) += lightnvm/ - obj-y += base/ block/ misc/ mfd/ nfc/ -@@ -80,6 +73,14 @@ obj-$(CONFIG_IDE) += ide/ - obj-y += scsi/ - obj-y += nvme/ - obj-$(CONFIG_ATA) += ata/ -+ -+# gpu/ comes after char for AGP vs DRM startup and after iommu -+obj-y += gpu/ -+ -+# i810fb and intelfb depend on char/agp/ -+obj-$(CONFIG_FB_I810) += video/fbdev/i810/ -+obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ -+ - obj-$(CONFIG_TARGET_CORE) += target/ - obj-$(CONFIG_MTD) += mtd/ - obj-$(CONFIG_SPI) += spi/ --- -https://clearlinux.org - diff --git a/linux59-tkg/linux59-tkg-patches/0003-glitched-base.patch b/linux59-tkg/linux59-tkg-patches/0003-glitched-base.patch deleted file mode 100644 index fb09b35..0000000 --- a/linux59-tkg/linux59-tkg-patches/0003-glitched-base.patch +++ /dev/null @@ -1,708 +0,0 @@ -From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 -From: Tk-Glitch -Date: Wed, 4 Jul 2018 04:30:08 +0200 -Subject: [PATCH 01/17] glitched - ---- - scripts/mkcompile_h | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git 
a/scripts/mkcompile_h b/scripts/mkcompile_h -index baf3ab8d9d49..854e32e6aec7 100755 ---- a/scripts/mkcompile_h -+++ b/scripts/mkcompile_h -@@ -41,8 +41,8 @@ else - fi - - UTS_VERSION="#$VERSION" --CONFIG_FLAGS="" --if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi -+CONFIG_FLAGS="TKG" -+if [ -n "$SMP" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS SMP"; fi - if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi - if [ -n "$PREEMPT_RT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT_RT"; fi - --- -2.28.0 - - -From c304f43d14e98d4bf1215fc10bc5012f554bdd8a Mon Sep 17 00:00:00 2001 -From: Alexandre Frade -Date: Mon, 29 Jan 2018 16:59:22 +0000 -Subject: [PATCH 02/17] dcache: cache_pressure = 50 decreases the rate at which - VFS caches are reclaimed - -Signed-off-by: Alexandre Frade ---- - fs/dcache.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/fs/dcache.c b/fs/dcache.c -index 361ea7ab30ea..0c5cf69b241a 100644 ---- a/fs/dcache.c -+++ b/fs/dcache.c -@@ -71,7 +71,7 @@ - * If no ancestor relationship: - * arbitrary, since it's serialized on rename_lock - */ --int sysctl_vfs_cache_pressure __read_mostly = 100; -+int sysctl_vfs_cache_pressure __read_mostly = 50; - EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); - - __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); --- -2.28.0 - - -From 28f32f59d9d55ac7ec3a20b79bdd02d2a0a5f7e1 Mon Sep 17 00:00:00 2001 -From: Alexandre Frade -Date: Mon, 29 Jan 2018 18:29:13 +0000 -Subject: [PATCH 03/17] sched/core: nr_migrate = 128 increases number of tasks - to iterate in a single balance run. - -Signed-off-by: Alexandre Frade ---- - kernel/sched/core.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index f788cd61df21..2bfbb4213707 100644 ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c -@@ -59,7 +59,7 @@ const_debug unsigned int sysctl_sched_features = - * Number of tasks to iterate in a single balance run. 
- * Limited because this is done with IRQs disabled. - */ --const_debug unsigned int sysctl_sched_nr_migrate = 32; -+const_debug unsigned int sysctl_sched_nr_migrate = 128; - - /* - * period over which we measure -rt task CPU usage in us. -@@ -71,9 +71,9 @@ __read_mostly int scheduler_running; - - /* - * part of the period that we allow rt tasks to run in us. -- * default: 0.95s -+ * XanMod default: 0.98s - */ --int sysctl_sched_rt_runtime = 950000; -+int sysctl_sched_rt_runtime = 980000; - - /* - * __task_rq_lock - lock the rq @p resides on. --- -2.28.0 - - -From acc49f33a10f61dc66c423888cbb883ba46710e4 Mon Sep 17 00:00:00 2001 -From: Alexandre Frade -Date: Mon, 29 Jan 2018 17:41:29 +0000 -Subject: [PATCH 04/17] scripts: disable the localversion "+" tag of a git repo - -Signed-off-by: Alexandre Frade ---- - scripts/setlocalversion | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/scripts/setlocalversion b/scripts/setlocalversion -index 20f2efd57b11..0552d8b9f582 100755 ---- a/scripts/setlocalversion -+++ b/scripts/setlocalversion -@@ -54,7 +54,7 @@ scm_version() - # If only the short version is requested, don't bother - # running further git commands - if $short; then -- echo "+" -+ # echo "+" - return - fi - # If we are past a tagged commit (like --- -2.28.0 - - -From 61fcb33fb0de8bc0f060e0a1ada38ed149217f4d Mon Sep 17 00:00:00 2001 -From: Oleksandr Natalenko -Date: Wed, 11 Dec 2019 11:46:19 +0100 -Subject: [PATCH 05/17] init/Kconfig: enable -O3 for all arches - -Building a kernel with -O3 may help in hunting bugs like [1] and thus -using this switch should not be restricted to one specific arch only. - -With that, lets expose it for everyone. 
- -[1] https://lore.kernel.org/lkml/673b885183fb64f1cbb3ed2387524077@natalenko.name/ - -Signed-off-by: Oleksandr Natalenko ---- - init/Kconfig | 1 - - 1 file changed, 1 deletion(-) - -diff --git a/init/Kconfig b/init/Kconfig -index 0498af567f70..3ae8678e1145 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -1278,7 +1278,6 @@ config CC_OPTIMIZE_FOR_PERFORMANCE - - config CC_OPTIMIZE_FOR_PERFORMANCE_O3 - bool "Optimize more for performance (-O3)" -- depends on ARC - help - Choosing this option will pass "-O3" to your compiler to optimize - the kernel yet more for performance. --- -2.28.0 - - -From 360c6833e07cc9fdef5746f6bc45bdbc7212288d Mon Sep 17 00:00:00 2001 -From: "Jan Alexander Steffens (heftig)" -Date: Fri, 26 Oct 2018 11:22:33 +0100 -Subject: [PATCH 06/17] infiniband: Fix __read_overflow2 error with -O3 - inlining - ---- - drivers/infiniband/core/addr.c | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c -index 3a98439bba83..6efc4f907f58 100644 ---- a/drivers/infiniband/core/addr.c -+++ b/drivers/infiniband/core/addr.c -@@ -820,6 +820,7 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, - union { - struct sockaddr_in _sockaddr_in; - struct sockaddr_in6 _sockaddr_in6; -+ struct sockaddr_ib _sockaddr_ib; - } sgid_addr, dgid_addr; - int ret; - --- -2.28.0 - - -From f85ed068b4d0e6c31edce8574a95757a60e58b87 Mon Sep 17 00:00:00 2001 -From: Etienne Juvigny -Date: Mon, 3 Sep 2018 17:36:25 +0200 -Subject: [PATCH 07/17] Zenify & stuff - ---- - init/Kconfig | 32 ++++++++++++++++++++++++++++++++ - kernel/sched/fair.c | 25 +++++++++++++++++++++++++ - mm/page-writeback.c | 8 ++++++++ - 3 files changed, 65 insertions(+) - -diff --git a/init/Kconfig b/init/Kconfig -index 3ae8678e1145..da708eed0f1e 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -92,6 +92,38 @@ config THREAD_INFO_IN_TASK - - menu "General setup" - -+config ZENIFY -+ bool "A selection of patches from Zen/Liquorix kernel and 
additional tweaks for a better gaming experience" -+ default y -+ help -+ Tunes the kernel for responsiveness at the cost of throughput and power usage. -+ -+ --- Virtual Memory Subsystem --------------------------- -+ -+ Mem dirty before bg writeback..: 10 % -> 20 % -+ Mem dirty before sync writeback: 20 % -> 50 % -+ -+ --- Block Layer ---------------------------------------- -+ -+ Queue depth...............: 128 -> 512 -+ Default MQ scheduler......: mq-deadline -> bfq -+ -+ --- CFS CPU Scheduler ---------------------------------- -+ -+ Scheduling latency.............: 6 -> 3 ms -+ Minimal granularity............: 0.75 -> 0.3 ms -+ Wakeup granularity.............: 1 -> 0.5 ms -+ CPU migration cost.............: 0.5 -> 0.25 ms -+ Bandwidth slice size...........: 5 -> 3 ms -+ Ondemand fine upscaling limit..: 95 % -> 85 % -+ -+ --- MuQSS CPU Scheduler -------------------------------- -+ -+ Scheduling interval............: 6 -> 3 ms -+ ISO task max realtime use......: 70 % -> 25 % -+ Ondemand coarse upscaling limit: 80 % -> 45 % -+ Ondemand fine upscaling limit..: 95 % -> 45 % -+ - config BROKEN - bool - -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 6b3b59cc51d6..2a0072192c3d 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -37,8 +37,13 @@ - * - * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) - */ -+#ifdef CONFIG_ZENIFY -+unsigned int sysctl_sched_latency = 3000000ULL; -+static unsigned int normalized_sysctl_sched_latency = 3000000ULL; -+#else - unsigned int sysctl_sched_latency = 6000000ULL; - static unsigned int normalized_sysctl_sched_latency = 6000000ULL; -+#endif - - /* - * The initial- and re-scaling of tunables is configurable -@@ -58,13 +63,22 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_L - * - * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) - */ -+#ifdef CONFIG_ZENIFY -+unsigned int sysctl_sched_min_granularity = 300000ULL; -+static unsigned int 
normalized_sysctl_sched_min_granularity = 300000ULL; -+#else - unsigned int sysctl_sched_min_granularity = 750000ULL; - static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; -+#endif - - /* - * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity - */ -+#ifdef CONFIG_ZENIFY -+static unsigned int sched_nr_latency = 10; -+#else - static unsigned int sched_nr_latency = 8; -+#endif - - /* - * After fork, child runs first. If set to 0 (default) then -@@ -81,10 +95,17 @@ unsigned int sysctl_sched_child_runs_first __read_mostly; - * - * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) - */ -+#ifdef CONFIG_ZENIFY -+unsigned int sysctl_sched_wakeup_granularity = 500000UL; -+static unsigned int normalized_sysctl_sched_wakeup_granularity = 500000UL; -+ -+const_debug unsigned int sysctl_sched_migration_cost = 50000UL; -+#else - unsigned int sysctl_sched_wakeup_granularity = 1000000UL; - static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; - - const_debug unsigned int sysctl_sched_migration_cost = 500000UL; -+#endif - - int sched_thermal_decay_shift; - static int __init setup_sched_thermal_decay_shift(char *str) -@@ -128,8 +149,12 @@ int __weak arch_asym_cpu_priority(int cpu) - * - * (default: 5 msec, units: microseconds) - */ -+#ifdef CONFIG_ZENIFY -+unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL; -+#else - unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; - #endif -+#endif - - static inline void update_load_add(struct load_weight *lw, unsigned long inc) - { -diff --git a/mm/page-writeback.c b/mm/page-writeback.c -index 28b3e7a67565..01a1aef2b9b1 100644 ---- a/mm/page-writeback.c -+++ b/mm/page-writeback.c -@@ -71,7 +71,11 @@ static long ratelimit_pages = 32; - /* - * Start background writeback (via writeback threads) at this percentage - */ -+#ifdef CONFIG_ZENIFY -+int dirty_background_ratio = 20; -+#else - int dirty_background_ratio = 10; -+#endif - - /* - * dirty_background_bytes starts at 0 
(disabled) so that it is a function of -@@ -88,7 +92,11 @@ int vm_highmem_is_dirtyable; - /* - * The generator of dirty data starts writeback at this percentage - */ -+#ifdef CONFIG_ZENIFY -+int vm_dirty_ratio = 50; -+#else - int vm_dirty_ratio = 20; -+#endif - - /* - * vm_dirty_bytes starts at 0 (disabled) so that it is a function of --- -2.28.0 - - -From e92e67143385cf285851e12aa8b7f083dd38dd24 Mon Sep 17 00:00:00 2001 -From: Steven Barrett -Date: Sun, 16 Jan 2011 18:57:32 -0600 -Subject: [PATCH 08/17] ZEN: Allow TCP YeAH as default congestion control - -4.4: In my tests YeAH dramatically slowed down transfers over a WLAN, - reducing throughput from ~65Mbps (CUBIC) to ~7MBps (YeAH) over 10 - seconds (netperf TCP_STREAM) including long stalls. - - Be careful when choosing this. ~heftig ---- - net/ipv4/Kconfig | 4 ++++ - 1 file changed, 4 insertions(+) - -diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig -index e64e59b536d3..bfb55ef7ebbe 100644 ---- a/net/ipv4/Kconfig -+++ b/net/ipv4/Kconfig -@@ -691,6 +691,9 @@ choice - config DEFAULT_VEGAS - bool "Vegas" if TCP_CONG_VEGAS=y - -+ config DEFAULT_YEAH -+ bool "YeAH" if TCP_CONG_YEAH=y -+ - config DEFAULT_VENO - bool "Veno" if TCP_CONG_VENO=y - -@@ -724,6 +727,7 @@ config DEFAULT_TCP_CONG - default "htcp" if DEFAULT_HTCP - default "hybla" if DEFAULT_HYBLA - default "vegas" if DEFAULT_VEGAS -+ default "yeah" if DEFAULT_YEAH - default "westwood" if DEFAULT_WESTWOOD - default "veno" if DEFAULT_VENO - default "reno" if DEFAULT_RENO --- -2.28.0 - - -From 76dbe7477bfde1b5e8bf29a71b5af7ab2be9b98e Mon Sep 17 00:00:00 2001 -From: Steven Barrett -Date: Wed, 28 Nov 2018 19:01:27 -0600 -Subject: [PATCH 09/17] zen: Use [defer+madvise] as default khugepaged defrag - strategy - -For some reason, the default strategy to respond to THP fault fallbacks -is still just madvise, meaning stall if the program wants transparent -hugepages, but don't trigger a background reclaim / compaction if THP -begins to fail allocations. 
This creates a snowball affect where we -still use the THP code paths, but we almost always fail once a system -has been active and busy for a while. - -The option "defer" was created for interactive systems where THP can -still improve performance. If we have to fallback to a regular page due -to an allocation failure or anything else, we will trigger a background -reclaim and compaction so future THP attempts succeed and previous -attempts eventually have their smaller pages combined without stalling -running applications. - -We still want madvise to stall applications that explicitely want THP, -so defer+madvise _does_ make a ton of sense. Make it the default for -interactive systems, especially if the kernel maintainer left -transparent hugepages on "always". - -Reasoning and details in the original patch: https://lwn.net/Articles/711248/ ---- - mm/huge_memory.c | 4 ++++ - 1 file changed, 4 insertions(+) - -diff --git a/mm/huge_memory.c b/mm/huge_memory.c -index 74300e337c3c..9277f22c10a7 100644 ---- a/mm/huge_memory.c -+++ b/mm/huge_memory.c -@@ -53,7 +53,11 @@ unsigned long transparent_hugepage_flags __read_mostly = - #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE - (1< -Date: Wed, 24 Oct 2018 16:58:52 -0300 -Subject: [PATCH 10/17] net/sched: allow configuring cake qdisc as default - -Signed-off-by: Alexandre Frade ---- - net/sched/Kconfig | 4 ++++ - 1 file changed, 4 insertions(+) - -diff --git a/net/sched/Kconfig b/net/sched/Kconfig -index 84badf00647e..6a922bca9f39 100644 ---- a/net/sched/Kconfig -+++ b/net/sched/Kconfig -@@ -471,6 +471,9 @@ choice - config DEFAULT_SFQ - bool "Stochastic Fair Queue" if NET_SCH_SFQ - -+ config DEFAULT_CAKE -+ bool "Common Applications Kept Enhanced" if NET_SCH_CAKE -+ - config DEFAULT_PFIFO_FAST - bool "Priority FIFO Fast" - endchoice -@@ -481,6 +484,7 @@ config DEFAULT_NET_SCH - default "fq" if DEFAULT_FQ - default "fq_codel" if DEFAULT_FQ_CODEL - default "sfq" if DEFAULT_SFQ -+ default "cake" if DEFAULT_CAKE - default 
"pfifo_fast" - endif - --- -2.28.0 - - -From 816ee502759e954304693813bd03d94986b28dba Mon Sep 17 00:00:00 2001 -From: Tk-Glitch -Date: Mon, 18 Feb 2019 17:40:57 +0100 -Subject: [PATCH 11/17] mm: Set watermark_scale_factor to 200 (from 10) - -Multiple users have reported it's helping reducing/eliminating stuttering -with DXVK. ---- - mm/page_alloc.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/mm/page_alloc.c b/mm/page_alloc.c -index 898ff44f2c7b..e72074034793 100644 ---- a/mm/page_alloc.c -+++ b/mm/page_alloc.c -@@ -330,7 +330,7 @@ int watermark_boost_factor __read_mostly; - #else - int watermark_boost_factor __read_mostly = 15000; - #endif --int watermark_scale_factor = 10; -+int watermark_scale_factor = 200; - - static unsigned long nr_kernel_pages __initdata; - static unsigned long nr_all_pages __initdata; --- -2.28.0 - - -From 90240bcd90a568878738e66c0d45bed3e38e347b Mon Sep 17 00:00:00 2001 -From: Tk-Glitch -Date: Fri, 19 Apr 2019 12:33:38 +0200 -Subject: [PATCH 12/17] Set vm.max_map_count to 262144 by default - -The value is still pretty low, and AMD64-ABI and ELF extended numbering -supports that, so we should be fine on modern x86 systems. - -This fixes crashes in some applications using more than 65535 vmas (also -affects some windows games running in wine, such as Star Citizen). ---- - include/linux/mm.h | 3 +-- - 1 file changed, 1 insertion(+), 2 deletions(-) - -diff --git a/include/linux/mm.h b/include/linux/mm.h -index bc05c3588aa3..b0cefe94920d 100644 ---- a/include/linux/mm.h -+++ b/include/linux/mm.h -@@ -190,8 +190,7 @@ static inline void __mm_zero_struct_page(struct page *page) - * not a hard limit any more. Although some userspace tools can be surprised by - * that. 
- */ --#define MAPCOUNT_ELF_CORE_MARGIN (5) --#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) -+#define DEFAULT_MAX_MAP_COUNT (262144) - - extern int sysctl_max_map_count; - --- -2.28.0 - - -From 3a34034dba5efe91bcec491efe8c66e8087f509b Mon Sep 17 00:00:00 2001 -From: Tk-Glitch -Date: Mon, 27 Jul 2020 00:19:18 +0200 -Subject: [PATCH 13/17] mm: bump DEFAULT_MAX_MAP_COUNT - -Some games such as Detroit: Become Human tend to be very crash prone with -lower values. ---- - include/linux/mm.h | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/include/linux/mm.h b/include/linux/mm.h -index b0cefe94920d..890165099b07 100644 ---- a/include/linux/mm.h -+++ b/include/linux/mm.h -@@ -190,7 +190,7 @@ static inline void __mm_zero_struct_page(struct page *page) - * not a hard limit any more. Although some userspace tools can be surprised by - * that. - */ --#define DEFAULT_MAX_MAP_COUNT (262144) -+#define DEFAULT_MAX_MAP_COUNT (524288) - - extern int sysctl_max_map_count; - --- -2.28.0 - - -From 977812938da7c7226415778c340832141d9278b7 Mon Sep 17 00:00:00 2001 -From: Alexandre Frade -Date: Mon, 25 Nov 2019 15:13:06 -0300 -Subject: [PATCH 14/17] elevator: set default scheduler to bfq for blk-mq - -Signed-off-by: Alexandre Frade ---- - block/elevator.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/block/elevator.c b/block/elevator.c -index 4eab3d70e880..79669aa39d79 100644 ---- a/block/elevator.c -+++ b/block/elevator.c -@@ -623,15 +623,15 @@ static inline bool elv_support_iosched(struct request_queue *q) - } - - /* -- * For single queue devices, default to using mq-deadline. If we have multiple -- * queues or mq-deadline is not available, default to "none". -+ * For single queue devices, default to using bfq. If we have multiple -+ * queues or bfq is not available, default to "none". 
- */ - static struct elevator_type *elevator_get_default(struct request_queue *q) - { - if (q->nr_hw_queues != 1) - return NULL; - -- return elevator_get(q, "mq-deadline", false); -+ return elevator_get(q, "bfq", false); - } - - /* --- -2.28.0 - - -From e2111bc5989131c675659d40e0cc4f214df2f990 Mon Sep 17 00:00:00 2001 -From: Alexandre Frade -Date: Fri, 10 May 2019 16:45:59 -0300 -Subject: [PATCH 15/17] block: set rq_affinity = 2 for full multithreading I/O - requests - -Signed-off-by: Alexandre Frade ---- - include/linux/blkdev.h | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - -diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h -index 28efe374a2e1..d4e5d35d2ece 100644 ---- a/include/linux/blkdev.h -+++ b/include/linux/blkdev.h -@@ -624,7 +624,8 @@ struct request_queue { - #define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */ - - #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ -- (1 << QUEUE_FLAG_SAME_COMP)) -+ (1 << QUEUE_FLAG_SAME_COMP) | \ -+ (1 << QUEUE_FLAG_SAME_FORCE)) - - void blk_queue_flag_set(unsigned int flag, struct request_queue *q); - void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); --- -2.28.0 - - -From 3c229f434aca65c4ca61772bc03c3e0370817b92 Mon Sep 17 00:00:00 2001 -From: Alexandre Frade -Date: Mon, 3 Aug 2020 17:05:04 +0000 -Subject: [PATCH 16/17] mm: set 2 megabytes for address_space-level file - read-ahead pages size - -Signed-off-by: Alexandre Frade ---- - include/linux/pagemap.h | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h -index cf2468da68e9..007dea784451 100644 ---- a/include/linux/pagemap.h -+++ b/include/linux/pagemap.h -@@ -655,7 +655,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); - void delete_from_page_cache_batch(struct address_space *mapping, - struct pagevec *pvec); - --#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) -+#define VM_READAHEAD_PAGES (SZ_2M 
/ PAGE_SIZE) - - void page_cache_sync_readahead(struct address_space *, struct file_ra_state *, - struct file *, pgoff_t index, unsigned long req_count); --- -2.28.0 - - -From 716f41cf6631f3a85834dcb67b4ce99185b6387f Mon Sep 17 00:00:00 2001 -From: Steven Barrett -Date: Wed, 15 Jan 2020 20:43:56 -0600 -Subject: [PATCH 17/17] ZEN: intel-pstate: Implement "enable" parameter - -If intel-pstate is compiled into the kernel, it will preempt the loading -of acpi-cpufreq so you can take advantage of hardware p-states without -any friction. - -However, intel-pstate is not completely superior to cpufreq's ondemand -for one reason. There's no concept of an up_threshold property. - -In ondemand, up_threshold essentially reduces the maximum utilization to -compare against, allowing you to hit max frequencies and turbo boost -from a much lower core utilization. - -With intel-pstate, you have the concept of minimum and maximum -performance, but no tunable that lets you define, maximum frequency -means 50% core utilization. For just this oversight, there's reasons -you may want ondemand. - -Lets support setting "enable" in kernel boot parameters. This lets -kernel maintainers include "intel_pstate=disable" statically in the -static boot parameters, but let users of the kernel override this -selection. 
---- - Documentation/admin-guide/kernel-parameters.txt | 3 +++ - drivers/cpufreq/intel_pstate.c | 2 ++ - 2 files changed, 5 insertions(+) - -diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index fb95fad81c79..3e92fee81e33 100644 ---- a/Documentation/admin-guide/kernel-parameters.txt -+++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -1857,6 +1857,9 @@ - disable - Do not enable intel_pstate as the default - scaling driver for the supported processors -+ enable -+ Enable intel_pstate in-case "disable" was passed -+ previously in the kernel boot parameters - passive - Use intel_pstate as a scaling driver, but configure it - to work with generic cpufreq governors (instead of -diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c -index 36a469150ff9..aee891c9b78a 100644 ---- a/drivers/cpufreq/intel_pstate.c -+++ b/drivers/cpufreq/intel_pstate.c -@@ -2845,6 +2845,8 @@ static int __init intel_pstate_setup(char *str) - pr_info("HWP disabled\n"); - no_hwp = 1; - } -+ if (!strcmp(str, "enable")) -+ no_load = 0; - if (!strcmp(str, "force")) - force_load = 1; - if (!strcmp(str, "hwp_only")) --- -2.28.0 - diff --git a/linux59-tkg/linux59-tkg-patches/0003-glitched-cfs.patch b/linux59-tkg/linux59-tkg-patches/0003-glitched-cfs.patch deleted file mode 100644 index 06b7f02..0000000 --- a/linux59-tkg/linux59-tkg-patches/0003-glitched-cfs.patch +++ /dev/null @@ -1,72 +0,0 @@ -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 2a202a846757..1d9c7ed79b11 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -4,7 +4,7 @@ - - choice - prompt "Timer frequency" -- default HZ_250 -+ default HZ_500 - help - Allows the configuration of the timer frequency. It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -39,6 +39,13 @@ choice - on SMP and NUMA systems and exactly dividing by both PAL and - NTSC frame rates for video and multimedia work. 
- -+ config HZ_500 -+ bool "500 HZ" -+ help -+ 500 Hz is a balanced timer frequency. Provides fast interactivity -+ on desktops with great smoothness without increasing CPU power -+ consumption and sacrificing the battery life on laptops. -+ - config HZ_1000 - bool "1000 HZ" - help -@@ -52,6 +59,7 @@ config HZ - default 100 if HZ_100 - default 250 if HZ_250 - default 300 if HZ_300 -+ default 500 if HZ_500 - default 1000 if HZ_1000 - - config SCHED_HRTICK - -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 2a202a846757..1d9c7ed79b11 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -4,7 +4,7 @@ - - choice - prompt "Timer frequency" -- default HZ_500 -+ default HZ_750 - help - Allows the configuration of the timer frequency. It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -46,6 +46,13 @@ choice - on desktops with great smoothness without increasing CPU power - consumption and sacrificing the battery life on laptops. - -+ config HZ_750 -+ bool "750 HZ" -+ help -+ 750 Hz is a good timer frequency for desktops. Provides fast -+ interactivity with great smoothness without sacrificing too -+ much throughput. 
-+ - config HZ_1000 - bool "1000 HZ" - help -@@ -60,6 +67,7 @@ config HZ - default 250 if HZ_250 - default 300 if HZ_300 - default 500 if HZ_500 -+ default 750 if HZ_750 - default 1000 if HZ_1000 - - config SCHED_HRTICK - diff --git a/linux59-tkg/linux59-tkg-patches/0004-5.9-ck1.patch b/linux59-tkg/linux59-tkg-patches/0004-5.9-ck1.patch deleted file mode 100644 index 33e9da3..0000000 --- a/linux59-tkg/linux59-tkg-patches/0004-5.9-ck1.patch +++ /dev/null @@ -1,13384 +0,0 @@ -diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index a1068742a6df..d2a8f1c637d2 100644 ---- a/Documentation/admin-guide/kernel-parameters.txt -+++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -4595,6 +4595,14 @@ - Memory area to be used by remote processor image, - managed by CMA. - -+ rqshare= [X86] Select the MuQSS scheduler runqueue sharing type. -+ Format: -+ smt -- Share SMT (hyperthread) sibling runqueues -+ mc -- Share MC (multicore) sibling runqueues -+ smp -- Share SMP runqueues -+ none -- So not share any runqueues -+ Default value is mc -+ - rw [KNL] Mount root device read-write on boot - - S [KNL] Run init in single mode -diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst -index d4b32cc32bb7..9e1e71fc66d0 100644 ---- a/Documentation/admin-guide/sysctl/kernel.rst -+++ b/Documentation/admin-guide/sysctl/kernel.rst -@@ -436,6 +436,16 @@ this allows system administrators to override the - ``IA64_THREAD_UAC_NOPRINT`` ``prctl`` and avoid logs being flooded. - - -+iso_cpu: (MuQSS CPU scheduler only) -+=================================== -+ -+This sets the percentage cpu that the unprivileged SCHED_ISO tasks can -+run effectively at realtime priority, averaged over a rolling five -+seconds over the -whole- system, meaning all cpus. -+ -+Set to 70 (percent) by default. -+ -+ - kexec_load_disabled - =================== - -@@ -1077,6 +1087,20 @@ ROM/Flash boot loader. 
Maybe to tell it what to do after - rebooting. ??? - - -+rr_interval: (MuQSS CPU scheduler only) -+======================================= -+ -+This is the smallest duration that any cpu process scheduling unit -+will run for. Increasing this value can increase throughput of cpu -+bound tasks substantially but at the expense of increased latencies -+overall. Conversely decreasing it will decrease average and maximum -+latencies but at the expense of throughput. This value is in -+milliseconds and the default value chosen depends on the number of -+cpus available at scheduler initialisation with a minimum of 6. -+ -+Valid values are from 1-1000. -+ -+ - sched_energy_aware - ================== - -@@ -1515,3 +1539,13 @@ is 10 seconds. - - The softlockup threshold is (``2 * watchdog_thresh``). Setting this - tunable to zero will disable lockup detection altogether. -+ -+ -+yield_type: (MuQSS CPU scheduler only) -+====================================== -+ -+This determines what type of yield calls to sched_yield will perform. -+ -+ 0: No yield. -+ 1: Yield only to better priority/deadline tasks. (default) -+ 2: Expire timeslice and recalculate deadline. -diff --git a/Documentation/scheduler/sched-BFS.txt b/Documentation/scheduler/sched-BFS.txt -new file mode 100644 -index 000000000000..c0282002a079 ---- /dev/null -+++ b/Documentation/scheduler/sched-BFS.txt -@@ -0,0 +1,351 @@ -+BFS - The Brain Fuck Scheduler by Con Kolivas. -+ -+Goals. -+ -+The goal of the Brain Fuck Scheduler, referred to as BFS from here on, is to -+completely do away with the complex designs of the past for the cpu process -+scheduler and instead implement one that is very simple in basic design. -+The main focus of BFS is to achieve excellent desktop interactivity and -+responsiveness without heuristics and tuning knobs that are difficult to -+understand, impossible to model and predict the effect of, and when tuned to -+one workload cause massive detriment to another. -+ -+ -+Design summary. 
-+ -+BFS is best described as a single runqueue, O(n) lookup, earliest effective -+virtual deadline first design, loosely based on EEVDF (earliest eligible virtual -+deadline first) and my previous Staircase Deadline scheduler. Each component -+shall be described in order to understand the significance of, and reasoning for -+it. The codebase when the first stable version was released was approximately -+9000 lines less code than the existing mainline linux kernel scheduler (in -+2.6.31). This does not even take into account the removal of documentation and -+the cgroups code that is not used. -+ -+Design reasoning. -+ -+The single runqueue refers to the queued but not running processes for the -+entire system, regardless of the number of CPUs. The reason for going back to -+a single runqueue design is that once multiple runqueues are introduced, -+per-CPU or otherwise, there will be complex interactions as each runqueue will -+be responsible for the scheduling latency and fairness of the tasks only on its -+own runqueue, and to achieve fairness and low latency across multiple CPUs, any -+advantage in throughput of having CPU local tasks causes other disadvantages. -+This is due to requiring a very complex balancing system to at best achieve some -+semblance of fairness across CPUs and can only maintain relatively low latency -+for tasks bound to the same CPUs, not across them. To increase said fairness -+and latency across CPUs, the advantage of local runqueue locking, which makes -+for better scalability, is lost due to having to grab multiple locks. -+ -+A significant feature of BFS is that all accounting is done purely based on CPU -+used and nowhere is sleep time used in any way to determine entitlement or -+interactivity. Interactivity "estimators" that use some kind of sleep/run -+algorithm are doomed to fail to detect all interactive tasks, and to falsely tag -+tasks that aren't interactive as being so. 
The reason for this is that it is -+close to impossible to determine that when a task is sleeping, whether it is -+doing it voluntarily, as in a userspace application waiting for input in the -+form of a mouse click or otherwise, or involuntarily, because it is waiting for -+another thread, process, I/O, kernel activity or whatever. Thus, such an -+estimator will introduce corner cases, and more heuristics will be required to -+cope with those corner cases, introducing more corner cases and failed -+interactivity detection and so on. Interactivity in BFS is built into the design -+by virtue of the fact that tasks that are waking up have not used up their quota -+of CPU time, and have earlier effective deadlines, thereby making it very likely -+they will preempt any CPU bound task of equivalent nice level. See below for -+more information on the virtual deadline mechanism. Even if they do not preempt -+a running task, because the rr interval is guaranteed to have a bound upper -+limit on how long a task will wait for, it will be scheduled within a timeframe -+that will not cause visible interface jitter. -+ -+ -+Design details. -+ -+Task insertion. -+ -+BFS inserts tasks into each relevant queue as an O(1) insertion into a double -+linked list. On insertion, *every* running queue is checked to see if the newly -+queued task can run on any idle queue, or preempt the lowest running task on the -+system. This is how the cross-CPU scheduling of BFS achieves significantly lower -+latency per extra CPU the system has. In this case the lookup is, in the worst -+case scenario, O(n) where n is the number of CPUs on the system. -+ -+Data protection. -+ -+BFS has one single lock protecting the process local data of every task in the -+global queue. Thus every insertion, removal and modification of task data in the -+global runqueue needs to grab the global lock. 
However, once a task is taken by -+a CPU, the CPU has its own local data copy of the running process' accounting -+information which only that CPU accesses and modifies (such as during a -+timer tick) thus allowing the accounting data to be updated lockless. Once a -+CPU has taken a task to run, it removes it from the global queue. Thus the -+global queue only ever has, at most, -+ -+ (number of tasks requesting cpu time) - (number of logical CPUs) + 1 -+ -+tasks in the global queue. This value is relevant for the time taken to look up -+tasks during scheduling. This will increase if many tasks with CPU affinity set -+in their policy to limit which CPUs they're allowed to run on if they outnumber -+the number of CPUs. The +1 is because when rescheduling a task, the CPU's -+currently running task is put back on the queue. Lookup will be described after -+the virtual deadline mechanism is explained. -+ -+Virtual deadline. -+ -+The key to achieving low latency, scheduling fairness, and "nice level" -+distribution in BFS is entirely in the virtual deadline mechanism. The one -+tunable in BFS is the rr_interval, or "round robin interval". This is the -+maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy) -+tasks of the same nice level will be running for, or looking at it the other -+way around, the longest duration two tasks of the same nice level will be -+delayed for. When a task requests cpu time, it is given a quota (time_slice) -+equal to the rr_interval and a virtual deadline. The virtual deadline is -+offset from the current time in jiffies by this equation: -+ -+ jiffies + (prio_ratio * rr_interval) -+ -+The prio_ratio is determined as a ratio compared to the baseline of nice -20 -+and increases by 10% per nice level. The deadline is a virtual one only in that -+no guarantee is placed that a task will actually be scheduled by this time, but -+it is used to compare which task should go next. 
There are three components to -+how a task is next chosen. First is time_slice expiration. If a task runs out -+of its time_slice, it is descheduled, the time_slice is refilled, and the -+deadline reset to that formula above. Second is sleep, where a task no longer -+is requesting CPU for whatever reason. The time_slice and deadline are _not_ -+adjusted in this case and are just carried over for when the task is next -+scheduled. Third is preemption, and that is when a newly waking task is deemed -+higher priority than a currently running task on any cpu by virtue of the fact -+that it has an earlier virtual deadline than the currently running task. The -+earlier deadline is the key to which task is next chosen for the first and -+second cases. Once a task is descheduled, it is put back on the queue, and an -+O(n) lookup of all queued-but-not-running tasks is done to determine which has -+the earliest deadline and that task is chosen to receive CPU next. -+ -+The CPU proportion of different nice tasks works out to be approximately the -+ -+ (prio_ratio difference)^2 -+ -+The reason it is squared is that a task's deadline does not change while it is -+running unless it runs out of time_slice. Thus, even if the time actually -+passes the deadline of another task that is queued, it will not get CPU time -+unless the current running task deschedules, and the time "base" (jiffies) is -+constantly moving. -+ -+Task lookup. -+ -+BFS has 103 priority queues. 100 of these are dedicated to the static priority -+of realtime tasks, and the remaining 3 are, in order of best to worst priority, -+SCHED_ISO (isochronous), SCHED_NORMAL, and SCHED_IDLEPRIO (idle priority -+scheduling). When a task of these priorities is queued, a bitmap of running -+priorities is set showing which of these priorities has tasks waiting for CPU -+time. 
When a CPU is made to reschedule, the lookup for the next task to get -+CPU time is performed in the following way: -+ -+First the bitmap is checked to see what static priority tasks are queued. If -+any realtime priorities are found, the corresponding queue is checked and the -+first task listed there is taken (provided CPU affinity is suitable) and lookup -+is complete. If the priority corresponds to a SCHED_ISO task, they are also -+taken in FIFO order (as they behave like SCHED_RR). If the priority corresponds -+to either SCHED_NORMAL or SCHED_IDLEPRIO, then the lookup becomes O(n). At this -+stage, every task in the runlist that corresponds to that priority is checked -+to see which has the earliest set deadline, and (provided it has suitable CPU -+affinity) it is taken off the runqueue and given the CPU. If a task has an -+expired deadline, it is taken and the rest of the lookup aborted (as they are -+chosen in FIFO order). -+ -+Thus, the lookup is O(n) in the worst case only, where n is as described -+earlier, as tasks may be chosen before the whole task list is looked over. -+ -+ -+Scalability. -+ -+The major limitations of BFS will be that of scalability, as the separate -+runqueue designs will have less lock contention as the number of CPUs rises. -+However they do not scale linearly even with separate runqueues as multiple -+runqueues will need to be locked concurrently on such designs to be able to -+achieve fair CPU balancing, to try and achieve some sort of nice-level fairness -+across CPUs, and to achieve low enough latency for tasks on a busy CPU when -+other CPUs would be more suited. BFS has the advantage that it requires no -+balancing algorithm whatsoever, as balancing occurs by proxy simply because -+all CPUs draw off the global runqueue, in priority and deadline order. 
Despite -+the fact that scalability is _not_ the prime concern of BFS, it both shows very -+good scalability to smaller numbers of CPUs and is likely a more scalable design -+at these numbers of CPUs. -+ -+It also has some very low overhead scalability features built into the design -+when it has been deemed their overhead is so marginal that they're worth adding. -+The first is the local copy of the running process' data to the CPU it's running -+on to allow that data to be updated lockless where possible. Then there is -+deference paid to the last CPU a task was running on, by trying that CPU first -+when looking for an idle CPU to use the next time it's scheduled. Finally there -+is the notion of cache locality beyond the last running CPU. The sched_domains -+information is used to determine the relative virtual "cache distance" that -+other CPUs have from the last CPU a task was running on. CPUs with shared -+caches, such as SMT siblings, or multicore CPUs with shared caches, are treated -+as cache local. CPUs without shared caches are treated as not cache local, and -+CPUs on different NUMA nodes are treated as very distant. This "relative cache -+distance" is used by modifying the virtual deadline value when doing lookups. -+Effectively, the deadline is unaltered between "cache local" CPUs, doubled for -+"cache distant" CPUs, and quadrupled for "very distant" CPUs. The reasoning -+behind the doubling of deadlines is as follows. The real cost of migrating a -+task from one CPU to another is entirely dependant on the cache footprint of -+the task, how cache intensive the task is, how long it's been running on that -+CPU to take up the bulk of its cache, how big the CPU cache is, how fast and -+how layered the CPU cache is, how fast a context switch is... and so on. In -+other words, it's close to random in the real world where we do more than just -+one sole workload. The only thing we can be sure of is that it's not free. 
So -+BFS uses the principle that an idle CPU is a wasted CPU and utilising idle CPUs -+is more important than cache locality, and cache locality only plays a part -+after that. Doubling the effective deadline is based on the premise that the -+"cache local" CPUs will tend to work on the same tasks up to double the number -+of cache local CPUs, and once the workload is beyond that amount, it is likely -+that none of the tasks are cache warm anywhere anyway. The quadrupling for NUMA -+is a value I pulled out of my arse. -+ -+When choosing an idle CPU for a waking task, the cache locality is determined -+according to where the task last ran and then idle CPUs are ranked from best -+to worst to choose the most suitable idle CPU based on cache locality, NUMA -+node locality and hyperthread sibling business. They are chosen in the -+following preference (if idle): -+ -+* Same core, idle or busy cache, idle threads -+* Other core, same cache, idle or busy cache, idle threads. -+* Same node, other CPU, idle cache, idle threads. -+* Same node, other CPU, busy cache, idle threads. -+* Same core, busy threads. -+* Other core, same cache, busy threads. -+* Same node, other CPU, busy threads. -+* Other node, other CPU, idle cache, idle threads. -+* Other node, other CPU, busy cache, idle threads. -+* Other node, other CPU, busy threads. -+ -+This shows the SMT or "hyperthread" awareness in the design as well which will -+choose a real idle core first before a logical SMT sibling which already has -+tasks on the physical CPU. -+ -+Early benchmarking of BFS suggested scalability dropped off at the 16 CPU mark. -+However this benchmarking was performed on an earlier design that was far less -+scalable than the current one so it's hard to know how scalable it is in terms -+of both CPUs (due to the global runqueue) and heavily loaded machines (due to -+O(n) lookup) at this stage. 
Note that in terms of scalability, the number of -+_logical_ CPUs matters, not the number of _physical_ CPUs. Thus, a dual (2x) -+quad core (4X) hyperthreaded (2X) machine is effectively a 16X. Newer benchmark -+results are very promising indeed, without needing to tweak any knobs, features -+or options. Benchmark contributions are most welcome. -+ -+ -+Features -+ -+As the initial prime target audience for BFS was the average desktop user, it -+was designed to not need tweaking, tuning or have features set to obtain benefit -+from it. Thus the number of knobs and features has been kept to an absolute -+minimum and should not require extra user input for the vast majority of cases. -+There are precisely 2 tunables, and 2 extra scheduling policies. The rr_interval -+and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO policies. In addition -+to this, BFS also uses sub-tick accounting. What BFS does _not_ now feature is -+support for CGROUPS. The average user should neither need to know what these -+are, nor should they need to be using them to have good desktop behaviour. -+ -+rr_interval -+ -+There is only one "scheduler" tunable, the round robin interval. This can be -+accessed in -+ -+ /proc/sys/kernel/rr_interval -+ -+The value is in milliseconds, and the default value is set to 6 on a -+uniprocessor machine, and automatically set to a progressively higher value on -+multiprocessor machines. The reasoning behind increasing the value on more CPUs -+is that the effective latency is decreased by virtue of there being more CPUs on -+BFS (for reasons explained above), and increasing the value allows for less -+cache contention and more throughput. Valid values are from 1 to 1000 -+Decreasing the value will decrease latencies at the cost of decreasing -+throughput, while increasing it will improve throughput, but at the cost of -+worsening latencies. The accuracy of the rr interval is limited by HZ resolution -+of the kernel configuration. 
Thus, the worst case latencies are usually slightly -+higher than this actual value. The default value of 6 is not an arbitrary one. -+It is based on the fact that humans can detect jitter at approximately 7ms, so -+aiming for much lower latencies is pointless under most circumstances. It is -+worth noting this fact when comparing the latency performance of BFS to other -+schedulers. Worst case latencies being higher than 7ms are far worse than -+average latencies not being in the microsecond range. -+ -+Isochronous scheduling. -+ -+Isochronous scheduling is a unique scheduling policy designed to provide -+near-real-time performance to unprivileged (ie non-root) users without the -+ability to starve the machine indefinitely. Isochronous tasks (which means -+"same time") are set using, for example, the schedtool application like so: -+ -+ schedtool -I -e amarok -+ -+This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works -+is that it has a priority level between true realtime tasks and SCHED_NORMAL -+which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie, -+if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval -+rate). However if ISO tasks run for more than a tunable finite amount of time, -+they are then demoted back to SCHED_NORMAL scheduling. This finite amount of -+time is the percentage of _total CPU_ available across the machine, configurable -+as a percentage in the following "resource handling" tunable (as opposed to a -+scheduler tunable): -+ -+ /proc/sys/kernel/iso_cpu -+ -+and is set to 70% by default. It is calculated over a rolling 5 second average -+Because it is the total CPU available, it means that on a multi CPU machine, it -+is possible to have an ISO task running as realtime scheduling indefinitely on -+just one CPU, as the other CPUs will be available. 
Setting this to 100 is the -+equivalent of giving all users SCHED_RR access and setting it to 0 removes the -+ability to run any pseudo-realtime tasks. -+ -+A feature of BFS is that it detects when an application tries to obtain a -+realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the -+appropriate privileges to use those policies. When it detects this, it will -+give the task SCHED_ISO policy instead. Thus it is transparent to the user. -+Because some applications constantly set their policy as well as their nice -+level, there is potential for them to undo the override specified by the user -+on the command line of setting the policy to SCHED_ISO. To counter this, once -+a task has been set to SCHED_ISO policy, it needs superuser privileges to set -+it back to SCHED_NORMAL. This will ensure the task remains ISO and all child -+processes and threads will also inherit the ISO policy. -+ -+Idleprio scheduling. -+ -+Idleprio scheduling is a scheduling policy designed to give out CPU to a task -+_only_ when the CPU would be otherwise idle. The idea behind this is to allow -+ultra low priority tasks to be run in the background that have virtually no -+effect on the foreground tasks. This is ideally suited to distributed computing -+clients (like setiathome, folding, mprime etc) but can also be used to start -+a video encode or so on without any slowdown of other tasks. To avoid this -+policy from grabbing shared resources and holding them indefinitely, if it -+detects a state where the task is waiting on I/O, the machine is about to -+suspend to ram and so on, it will transiently schedule them as SCHED_NORMAL. As -+per the Isochronous task management, once a task has been scheduled as IDLEPRIO, -+it cannot be put back to SCHED_NORMAL without superuser privileges. Tasks can -+be set to start as SCHED_IDLEPRIO with the schedtool command like so: -+ -+ schedtool -D -e ./mprime -+ -+Subtick accounting. 
-+ -+It is surprisingly difficult to get accurate CPU accounting, and in many cases, -+the accounting is done by simply determining what is happening at the precise -+moment a timer tick fires off. This becomes increasingly inaccurate as the -+timer tick frequency (HZ) is lowered. It is possible to create an application -+which uses almost 100% CPU, yet by being descheduled at the right time, records -+zero CPU usage. While the main problem with this is that there are possible -+security implications, it is also difficult to determine how much CPU a task -+really does use. BFS tries to use the sub-tick accounting from the TSC clock, -+where possible, to determine real CPU usage. This is not entirely reliable, but -+is far more likely to produce accurate CPU usage data than the existing designs -+and will not show tasks as consuming no CPU usage when they actually are. Thus, -+the amount of CPU reported as being used by BFS will more accurately represent -+how much CPU the task itself is using (as is shown for example by the 'time' -+application), so the reported values may be quite different to other schedulers. -+Values reported as the 'load' are more prone to problems with this design, but -+per process values are closer to real usage. When comparing throughput of BFS -+to other designs, it is important to compare the actual completed work in terms -+of total wall clock time taken and total work done, rather than the reported -+"cpu usage". -+ -+ -+Con Kolivas Fri Aug 27 2010 -diff --git a/Documentation/scheduler/sched-MuQSS.txt b/Documentation/scheduler/sched-MuQSS.txt -new file mode 100644 -index 000000000000..ae28b85c9995 ---- /dev/null -+++ b/Documentation/scheduler/sched-MuQSS.txt -@@ -0,0 +1,373 @@ -+MuQSS - The Multiple Queue Skiplist Scheduler by Con Kolivas. -+ -+MuQSS is a per-cpu runqueue variant of the original BFS scheduler with -+one 8 level skiplist per runqueue, and fine grained locking for much more -+scalability. -+ -+ -+Goals. 
-+ -+The goal of the Multiple Queue Skiplist Scheduler, referred to as MuQSS from -+here on (pronounced mux) is to completely do away with the complex designs of -+the past for the cpu process scheduler and instead implement one that is very -+simple in basic design. The main focus of MuQSS is to achieve excellent desktop -+interactivity and responsiveness without heuristics and tuning knobs that are -+difficult to understand, impossible to model and predict the effect of, and when -+tuned to one workload cause massive detriment to another, while still being -+scalable to many CPUs and processes. -+ -+ -+Design summary. -+ -+MuQSS is best described as per-cpu multiple runqueue, O(log n) insertion, O(1) -+lookup, earliest effective virtual deadline first tickless design, loosely based -+on EEVDF (earliest eligible virtual deadline first) and my previous Staircase -+Deadline scheduler, and evolved from the single runqueue O(n) BFS scheduler. -+Each component shall be described in order to understand the significance of, -+and reasoning for it. -+ -+ -+Design reasoning. -+ -+In BFS, the use of a single runqueue across all CPUs meant that each CPU would -+need to scan the entire runqueue looking for the process with the earliest -+deadline and schedule that next, regardless of which CPU it originally came -+from. This made BFS deterministic with respect to latency and provided -+guaranteed latencies dependent on number of processes and CPUs. The single -+runqueue, however, meant that all CPUs would compete for the single lock -+protecting it, which would lead to increasing lock contention as the number of -+CPUs rose and appeared to limit scalability of common workloads beyond 16 -+logical CPUs. Additionally, the O(n) lookup of the runqueue list obviously -+increased overhead proportionate to the number of queued proecesses and led to -+cache thrashing while iterating over the linked list. 
-+ -+MuQSS is an evolution of BFS, designed to maintain the same scheduling -+decision mechanism and be virtually deterministic without relying on the -+constrained design of the single runqueue by splitting out the single runqueue -+to be per-CPU and use skiplists instead of linked lists. -+ -+The original reason for going back to a single runqueue design for BFS was that -+once multiple runqueues are introduced, per-CPU or otherwise, there will be -+complex interactions as each runqueue will be responsible for the scheduling -+latency and fairness of the tasks only on its own runqueue, and to achieve -+fairness and low latency across multiple CPUs, any advantage in throughput of -+having CPU local tasks causes other disadvantages. This is due to requiring a -+very complex balancing system to at best achieve some semblance of fairness -+across CPUs and can only maintain relatively low latency for tasks bound to the -+same CPUs, not across them. To increase said fairness and latency across CPUs, -+the advantage of local runqueue locking, which makes for better scalability, is -+lost due to having to grab multiple locks. -+ -+MuQSS works around the problems inherent in multiple runqueue designs by -+making its skip lists priority ordered and through novel use of lockless -+examination of each other runqueue it can decide if it should take the earliest -+deadline task from another runqueue for latency reasons, or for CPU balancing -+reasons. It still does not have a balancing system, choosing to allow the -+next task scheduling decision and task wakeup CPU choice to allow balancing to -+happen by virtue of its choices. -+ -+As a further evolution of the design, MuQSS normally configures sharing of -+runqueues in a logical fashion for when CPU resources are shared for improved -+latency and throughput. By default it shares runqueues and locks between -+multicore siblings. 
Optionally it can be configured to run with sharing of -+SMT siblings only, all SMP packages or no sharing at all. Additionally it can -+be selected at boot time. -+ -+ -+Design details. -+ -+Custom skip list implementation: -+ -+To avoid the overhead of building up and tearing down skip list structures, -+the variant used by MuQSS has a number of optimisations making it specific for -+its use case in the scheduler. It uses static arrays of 8 'levels' instead of -+building up and tearing down structures dynamically. This makes each runqueue -+only scale O(log N) up to 64k tasks. However as there is one runqueue per CPU -+it means that it scales O(log N) up to 64k x number of logical CPUs which is -+far beyond the realistic task limits each CPU could handle. By being 8 levels -+it also makes the array exactly one cacheline in size. Additionally, each -+skip list node is bidirectional making insertion and removal amortised O(1), -+being O(k) where k is 1-8. Uniquely, we are only ever interested in the very -+first entry in each list at all times with MuQSS, so there is never a need to -+do a search and thus look up is always O(1). In interactive mode, the queues -+will be searched beyond their first entry if the first task is not suitable -+for affinity or SMT nice reasons. -+ -+Task insertion: -+ -+MuQSS inserts tasks into a per CPU runqueue as an O(log N) insertion into -+a custom skip list as described above (based on the original design by William -+Pugh). Insertion is ordered in such a way that there is never a need to do a -+search by ordering tasks according to static priority primarily, and then -+virtual deadline at the time of insertion. -+ -+Niffies: -+ -+Niffies are a monotonic forward moving timer not unlike the "jiffies" but are -+of nanosecond resolution. Niffies are calculated per-runqueue from the high -+resolution TSC timers, and in order to maintain fairness are synchronised -+between CPUs whenever both runqueues are locked concurrently. 
-+ -+Virtual deadline: -+ -+The key to achieving low latency, scheduling fairness, and "nice level" -+distribution in MuQSS is entirely in the virtual deadline mechanism. The one -+tunable in MuQSS is the rr_interval, or "round robin interval". This is the -+maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy) -+tasks of the same nice level will be running for, or looking at it the other -+way around, the longest duration two tasks of the same nice level will be -+delayed for. When a task requests cpu time, it is given a quota (time_slice) -+equal to the rr_interval and a virtual deadline. The virtual deadline is -+offset from the current time in niffies by this equation: -+ -+ niffies + (prio_ratio * rr_interval) -+ -+The prio_ratio is determined as a ratio compared to the baseline of nice -20 -+and increases by 10% per nice level. The deadline is a virtual one only in that -+no guarantee is placed that a task will actually be scheduled by this time, but -+it is used to compare which task should go next. There are three components to -+how a task is next chosen. First is time_slice expiration. If a task runs out -+of its time_slice, it is descheduled, the time_slice is refilled, and the -+deadline reset to that formula above. Second is sleep, where a task no longer -+is requesting CPU for whatever reason. The time_slice and deadline are _not_ -+adjusted in this case and are just carried over for when the task is next -+scheduled. Third is preemption, and that is when a newly waking task is deemed -+higher priority than a currently running task on any cpu by virtue of the fact -+that it has an earlier virtual deadline than the currently running task. The -+earlier deadline is the key to which task is next chosen for the first and -+second cases. 
-+ -+The CPU proportion of different nice tasks works out to be approximately the -+ -+ (prio_ratio difference)^2 -+ -+The reason it is squared is that a task's deadline does not change while it is -+running unless it runs out of time_slice. Thus, even if the time actually -+passes the deadline of another task that is queued, it will not get CPU time -+unless the current running task deschedules, and the time "base" (niffies) is -+constantly moving. -+ -+Task lookup: -+ -+As tasks are already pre-ordered according to anticipated scheduling order in -+the skip lists, lookup for the next suitable task per-runqueue is always a -+matter of simply selecting the first task in the 0th level skip list entry. -+In order to maintain optimal latency and fairness across CPUs, MuQSS does a -+novel examination of every other runqueue in cache locality order, choosing the -+best task across all runqueues. This provides near-determinism of how long any -+task across the entire system may wait before receiving CPU time. The other -+runqueues are first examine lockless and then trylocked to minimise the -+potential lock contention if they are likely to have a suitable better task. -+Each other runqueue lock is only held for as long as it takes to examine the -+entry for suitability. In "interactive" mode, the default setting, MuQSS will -+look for the best deadline task across all CPUs, while in !interactive mode, -+it will only select a better deadline task from another CPU if it is more -+heavily laden than the current one. -+ -+Lookup is therefore O(k) where k is number of CPUs. -+ -+ -+Latency. -+ -+Through the use of virtual deadlines to govern the scheduling order of normal -+tasks, queue-to-activation latency per runqueue is guaranteed to be bound by -+the rr_interval tunable which is set to 6ms by default. 
This means that the -+longest a CPU bound task will wait for more CPU is proportional to the number -+of running tasks and in the common case of 0-2 running tasks per CPU, will be -+under the 7ms threshold for human perception of jitter. Additionally, as newly -+woken tasks will have an early deadline from their previous runtime, the very -+tasks that are usually latency sensitive will have the shortest interval for -+activation, usually preempting any existing CPU bound tasks. -+ -+Tickless expiry: -+ -+A feature of MuQSS is that it is not tied to the resolution of the chosen tick -+rate in Hz, instead depending entirely on the high resolution timers where -+possible for sub-millisecond accuracy on timeouts regarless of the underlying -+tick rate. This allows MuQSS to be run with the low overhead of low Hz rates -+such as 100 by default, benefiting from the improved throughput and lower -+power usage it provides. Another advantage of this approach is that in -+combination with the Full No HZ option, which disables ticks on running task -+CPUs instead of just idle CPUs, the tick can be disabled at all times -+regardless of how many tasks are running instead of being limited to just one -+running task. Note that this option is NOT recommended for regular desktop -+users. -+ -+ -+Scalability and balancing. -+ -+Unlike traditional approaches where balancing is a combination of CPU selection -+at task wakeup and intermittent balancing based on a vast array of rules set -+according to architecture, busyness calculations and special case management, -+MuQSS indirectly balances on the fly at task wakeup and next task selection. -+During initialisation, MuQSS creates a cache coherency ordered list of CPUs for -+each logical CPU and uses this to aid task/CPU selection when CPUs are busy. 
-+Additionally it selects any idle CPUs, if they are available, at any time over -+busy CPUs according to the following preference: -+ -+ * Same thread, idle or busy cache, idle or busy threads -+ * Other core, same cache, idle or busy cache, idle threads. -+ * Same node, other CPU, idle cache, idle threads. -+ * Same node, other CPU, busy cache, idle threads. -+ * Other core, same cache, busy threads. -+ * Same node, other CPU, busy threads. -+ * Other node, other CPU, idle cache, idle threads. -+ * Other node, other CPU, busy cache, idle threads. -+ * Other node, other CPU, busy threads. -+ -+Mux is therefore SMT, MC and Numa aware without the need for extra -+intermittent balancing to maintain CPUs busy and make the most of cache -+coherency. -+ -+ -+Features -+ -+As the initial prime target audience for MuQSS was the average desktop user, it -+was designed to not need tweaking, tuning or have features set to obtain benefit -+from it. Thus the number of knobs and features has been kept to an absolute -+minimum and should not require extra user input for the vast majority of cases. -+There are 3 optional tunables, and 2 extra scheduling policies. The rr_interval, -+interactive, and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO -+policies. In addition to this, MuQSS also uses sub-tick accounting. What MuQSS -+does _not_ now feature is support for CGROUPS. The average user should neither -+need to know what these are, nor should they need to be using them to have good -+desktop behaviour. However since some applications refuse to work without -+cgroups, one can enable them with MuQSS as a stub and the filesystem will be -+created which will allow the applications to work. -+ -+rr_interval: -+ -+ /proc/sys/kernel/rr_interval -+ -+The value is in milliseconds, and the default value is set to 6. 
Valid values -+are from 1 to 1000 Decreasing the value will decrease latencies at the cost of -+decreasing throughput, while increasing it will improve throughput, but at the -+cost of worsening latencies. It is based on the fact that humans can detect -+jitter at approximately 7ms, so aiming for much lower latencies is pointless -+under most circumstances. It is worth noting this fact when comparing the -+latency performance of MuQSS to other schedulers. Worst case latencies being -+higher than 7ms are far worse than average latencies not being in the -+microsecond range. -+ -+interactive: -+ -+ /proc/sys/kernel/interactive -+ -+The value is a simple boolean of 1 for on and 0 for off and is set to on by -+default. Disabling this will disable the near-determinism of MuQSS when -+selecting the next task by not examining all CPUs for the earliest deadline -+task, or which CPU to wake to, instead prioritising CPU balancing for improved -+throughput. Latency will still be bound by rr_interval, but on a per-CPU basis -+instead of across the whole system. -+ -+Runqueue sharing. -+ -+By default MuQSS chooses to share runqueue resources (specifically the skip -+list and locking) between multicore siblings. It is configurable at build time -+to select between None, SMT, MC and SMP, corresponding to no sharing, sharing -+only between simultaneous mulithreading siblings, multicore siblings, or -+symmetric multiprocessing physical packages. Additionally it can be se at -+bootime with the use of the rqshare parameter. The reason for configurability -+is that some architectures have CPUs with many multicore siblings (>= 16) -+where it may be detrimental to throughput to share runqueues and another -+sharing option may be desirable. Additionally, more sharing than usual can -+improve latency on a system-wide level at the expense of throughput if desired. 
-+ -+The options are: -+none, smt, mc, smp -+ -+eg: -+ rqshare=mc -+ -+Isochronous scheduling: -+ -+Isochronous scheduling is a unique scheduling policy designed to provide -+near-real-time performance to unprivileged (ie non-root) users without the -+ability to starve the machine indefinitely. Isochronous tasks (which means -+"same time") are set using, for example, the schedtool application like so: -+ -+ schedtool -I -e amarok -+ -+This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works -+is that it has a priority level between true realtime tasks and SCHED_NORMAL -+which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie, -+if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval -+rate). However if ISO tasks run for more than a tunable finite amount of time, -+they are then demoted back to SCHED_NORMAL scheduling. This finite amount of -+time is the percentage of CPU available per CPU, configurable as a percentage in -+the following "resource handling" tunable (as opposed to a scheduler tunable): -+ -+iso_cpu: -+ -+ /proc/sys/kernel/iso_cpu -+ -+and is set to 70% by default. It is calculated over a rolling 5 second average -+Because it is the total CPU available, it means that on a multi CPU machine, it -+is possible to have an ISO task running as realtime scheduling indefinitely on -+just one CPU, as the other CPUs will be available. Setting this to 100 is the -+equivalent of giving all users SCHED_RR access and setting it to 0 removes the -+ability to run any pseudo-realtime tasks. -+ -+A feature of MuQSS is that it detects when an application tries to obtain a -+realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the -+appropriate privileges to use those policies. When it detects this, it will -+give the task SCHED_ISO policy instead. Thus it is transparent to the user. 
-+ -+ -+Idleprio scheduling: -+ -+Idleprio scheduling is a scheduling policy designed to give out CPU to a task -+_only_ when the CPU would be otherwise idle. The idea behind this is to allow -+ultra low priority tasks to be run in the background that have virtually no -+effect on the foreground tasks. This is ideally suited to distributed computing -+clients (like setiathome, folding, mprime etc) but can also be used to start a -+video encode or so on without any slowdown of other tasks. To avoid this policy -+from grabbing shared resources and holding them indefinitely, if it detects a -+state where the task is waiting on I/O, the machine is about to suspend to ram -+and so on, it will transiently schedule them as SCHED_NORMAL. Once a task has -+been scheduled as IDLEPRIO, it cannot be put back to SCHED_NORMAL without -+superuser privileges since it is effectively a lower scheduling policy. Tasks -+can be set to start as SCHED_IDLEPRIO with the schedtool command like so: -+ -+schedtool -D -e ./mprime -+ -+Subtick accounting: -+ -+It is surprisingly difficult to get accurate CPU accounting, and in many cases, -+the accounting is done by simply determining what is happening at the precise -+moment a timer tick fires off. This becomes increasingly inaccurate as the timer -+tick frequency (HZ) is lowered. It is possible to create an application which -+uses almost 100% CPU, yet by being descheduled at the right time, records zero -+CPU usage. While the main problem with this is that there are possible security -+implications, it is also difficult to determine how much CPU a task really does -+use. Mux uses sub-tick accounting from the TSC clock to determine real CPU -+usage. Thus, the amount of CPU reported as being used by MuQSS will more -+accurately represent how much CPU the task itself is using (as is shown for -+example by the 'time' application), so the reported values may be quite -+different to other schedulers. 
When comparing throughput of MuQSS to other -+designs, it is important to compare the actual completed work in terms of total -+wall clock time taken and total work done, rather than the reported "cpu usage". -+ -+Symmetric MultiThreading (SMT) aware nice: -+ -+SMT, a.k.a. hyperthreading, is a very common feature on modern CPUs. While the -+logical CPU count rises by adding thread units to each CPU core, allowing more -+than one task to be run simultaneously on the same core, the disadvantage of it -+is that the CPU power is shared between the tasks, not summating to the power -+of two CPUs. The practical upshot of this is that two tasks running on -+separate threads of the same core run significantly slower than if they had one -+core each to run on. While smart CPU selection allows each task to have a core -+to itself whenever available (as is done on MuQSS), it cannot offset the -+slowdown that occurs when the cores are all loaded and only a thread is left. -+Most of the time this is harmless as the CPU is effectively overloaded at this -+point and the extra thread is of benefit. However when running a niced task in -+the presence of an un-niced task (say nice 19 v nice 0), the nice task gets -+precisely the same amount of CPU power as the unniced one. MuQSS has an -+optional configuration feature known as SMT-NICE which selectively idles the -+secondary niced thread for a period proportional to the nice difference, -+allowing CPU distribution according to nice level to be maintained, at the -+expense of a small amount of extra overhead. If this is configured in on a -+machine without SMT threads, the overhead is minimal. 
-+ -+ -+Con Kolivas Sat, 29th October 2016 -diff --git a/Makefile b/Makefile -index 51540b291738..ab8c480660a6 100644 ---- a/Makefile -+++ b/Makefile -@@ -18,6 +18,10 @@ $(if $(filter __%, $(MAKECMDGOALS)), \ - PHONY := __all - __all: - -+CKVERSION = -ck1 -+CKNAME = MuQSS Powered -+EXTRAVERSION := $(EXTRAVERSION)$(CKVERSION) -+ - # We are using a recursive build, so we need to do a little thinking - # to get the ordering right. - # -diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig -index 9c5f06e8eb9b..0d1069eee09c 100644 ---- a/arch/alpha/Kconfig -+++ b/arch/alpha/Kconfig -@@ -666,6 +666,8 @@ config HZ - default 1200 if HZ_1200 - default 1024 - -+source "kernel/Kconfig.MuQSS" -+ - config SRM_ENV - tristate "SRM environment through procfs" - depends on PROC_FS -diff --git a/arch/arc/configs/tb10x_defconfig b/arch/arc/configs/tb10x_defconfig -index a12656ec0072..b46b6ddc7636 100644 ---- a/arch/arc/configs/tb10x_defconfig -+++ b/arch/arc/configs/tb10x_defconfig -@@ -29,7 +29,7 @@ CONFIG_ARC_PLAT_TB10X=y - CONFIG_ARC_CACHE_LINE_SHIFT=5 - CONFIG_HZ=250 - CONFIG_ARC_BUILTIN_DTB_NAME="abilis_tb100_dvk" --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - # CONFIG_COMPACTION is not set - CONFIG_NET=y - CONFIG_PACKET=y -diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig -index e00d94b16658..efabbd09475a 100644 ---- a/arch/arm/Kconfig -+++ b/arch/arm/Kconfig -@@ -1236,6 +1236,8 @@ config SCHED_SMT - MultiThreading at a cost of slightly increased overhead in some - places. If unsure say N here. 
- -+source "kernel/Kconfig.MuQSS" -+ - config HAVE_ARM_SCU - bool - help -diff --git a/arch/arm/configs/bcm2835_defconfig b/arch/arm/configs/bcm2835_defconfig -index 44ff9cd88d81..9c639c998015 100644 ---- a/arch/arm/configs/bcm2835_defconfig -+++ b/arch/arm/configs/bcm2835_defconfig -@@ -29,7 +29,7 @@ CONFIG_MODULE_UNLOAD=y - CONFIG_ARCH_MULTI_V6=y - CONFIG_ARCH_BCM=y - CONFIG_ARCH_BCM2835=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_AEABI=y - CONFIG_KSM=y - CONFIG_CLEANCACHE=y -diff --git a/arch/arm/configs/imx_v6_v7_defconfig b/arch/arm/configs/imx_v6_v7_defconfig -index 82d3ffb18e70..bb05667427a6 100644 ---- a/arch/arm/configs/imx_v6_v7_defconfig -+++ b/arch/arm/configs/imx_v6_v7_defconfig -@@ -45,6 +45,7 @@ CONFIG_PCI_MSI=y - CONFIG_PCI_IMX6=y - CONFIG_SMP=y - CONFIG_ARM_PSCI=y -+CONFIG_PREEMPT=y - CONFIG_HIGHMEM=y - CONFIG_FORCE_MAX_ZONEORDER=14 - CONFIG_CMDLINE="noinitrd console=ttymxc0,115200" -diff --git a/arch/arm/configs/mps2_defconfig b/arch/arm/configs/mps2_defconfig -index 1d923dbb9928..9c1931f1fafd 100644 ---- a/arch/arm/configs/mps2_defconfig -+++ b/arch/arm/configs/mps2_defconfig -@@ -18,7 +18,7 @@ CONFIG_ARCH_MPS2=y - CONFIG_SET_MEM_PARAM=y - CONFIG_DRAM_BASE=0x21000000 - CONFIG_DRAM_SIZE=0x1000000 --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - # CONFIG_ATAGS is not set - CONFIG_ZBOOT_ROM_TEXT=0x0 - CONFIG_ZBOOT_ROM_BSS=0x0 -diff --git a/arch/arm/configs/mxs_defconfig b/arch/arm/configs/mxs_defconfig -index a9c6f32a9b1c..870866aaa39d 100644 ---- a/arch/arm/configs/mxs_defconfig -+++ b/arch/arm/configs/mxs_defconfig -@@ -1,7 +1,7 @@ - CONFIG_SYSVIPC=y - CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT_VOLUNTARY=n - CONFIG_TASKSTATS=y - CONFIG_TASK_DELAY_ACCT=y - CONFIG_TASK_XACCT=y -@@ -25,6 +25,13 @@ CONFIG_MODULE_UNLOAD=y - CONFIG_MODULE_FORCE_UNLOAD=y - CONFIG_MODVERSIONS=y - CONFIG_BLK_DEV_INTEGRITY=y -+# CONFIG_IOSCHED_DEADLINE is not set -+# CONFIG_IOSCHED_CFQ is not set -+# 
CONFIG_ARCH_MULTI_V7 is not set -+CONFIG_ARCH_MXS=y -+# CONFIG_ARM_THUMB is not set -+CONFIG_PREEMPT=y -+CONFIG_AEABI=y - CONFIG_NET=y - CONFIG_PACKET=y - CONFIG_UNIX=y -diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig -index 6d232837cbee..052cae73d674 100644 ---- a/arch/arm64/Kconfig -+++ b/arch/arm64/Kconfig -@@ -945,6 +945,8 @@ config SCHED_SMT - MultiThreading at a cost of slightly increased overhead in some - places. If unsure say N here. - -+source "kernel/Kconfig.MuQSS" -+ - config NR_CPUS - int "Maximum number of CPUs (2-4096)" - range 2 4096 -diff --git a/arch/mips/configs/fuloong2e_defconfig b/arch/mips/configs/fuloong2e_defconfig -index 023b4e644b1c..013e630b96a6 100644 ---- a/arch/mips/configs/fuloong2e_defconfig -+++ b/arch/mips/configs/fuloong2e_defconfig -@@ -4,7 +4,7 @@ CONFIG_SYSVIPC=y - CONFIG_POSIX_MQUEUE=y - CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_BSD_PROCESS_ACCT=y - CONFIG_IKCONFIG=y - CONFIG_IKCONFIG_PROC=y -diff --git a/arch/mips/configs/gpr_defconfig b/arch/mips/configs/gpr_defconfig -index 9085f4d6c698..fb23111d45f6 100644 ---- a/arch/mips/configs/gpr_defconfig -+++ b/arch/mips/configs/gpr_defconfig -@@ -1,8 +1,8 @@ -+CONFIG_PREEMPT=y - # CONFIG_LOCALVERSION_AUTO is not set - CONFIG_SYSVIPC=y - CONFIG_POSIX_MQUEUE=y - CONFIG_HIGH_RES_TIMERS=y --CONFIG_PREEMPT_VOLUNTARY=y - CONFIG_BSD_PROCESS_ACCT=y - CONFIG_BSD_PROCESS_ACCT_V3=y - CONFIG_RELAY=y -diff --git a/arch/mips/configs/ip22_defconfig b/arch/mips/configs/ip22_defconfig -index 21a1168ae301..529a1b1007cf 100644 ---- a/arch/mips/configs/ip22_defconfig -+++ b/arch/mips/configs/ip22_defconfig -@@ -1,7 +1,7 @@ - CONFIG_SYSVIPC=y - CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_IKCONFIG=y - CONFIG_IKCONFIG_PROC=y - CONFIG_LOG_BUF_SHIFT=14 -diff --git a/arch/mips/configs/ip28_defconfig b/arch/mips/configs/ip28_defconfig -index 0921ef38e9fb..6da05cef46f8 100644 ---- 
a/arch/mips/configs/ip28_defconfig -+++ b/arch/mips/configs/ip28_defconfig -@@ -1,5 +1,5 @@ - CONFIG_SYSVIPC=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_IKCONFIG=y - CONFIG_IKCONFIG_PROC=y - CONFIG_LOG_BUF_SHIFT=14 -diff --git a/arch/mips/configs/jazz_defconfig b/arch/mips/configs/jazz_defconfig -index 8c223035921f..a3bf87450343 100644 ---- a/arch/mips/configs/jazz_defconfig -+++ b/arch/mips/configs/jazz_defconfig -@@ -1,8 +1,8 @@ -+CONFIG_PREEMPT=y - CONFIG_SYSVIPC=y - CONFIG_POSIX_MQUEUE=y - CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y --CONFIG_PREEMPT_VOLUNTARY=y - CONFIG_BSD_PROCESS_ACCT=y - CONFIG_LOG_BUF_SHIFT=14 - CONFIG_RELAY=y -diff --git a/arch/mips/configs/mtx1_defconfig b/arch/mips/configs/mtx1_defconfig -index 914af125a7fa..76a64290373f 100644 ---- a/arch/mips/configs/mtx1_defconfig -+++ b/arch/mips/configs/mtx1_defconfig -@@ -1,8 +1,8 @@ -+CONFIG_PREEMPT=y - # CONFIG_LOCALVERSION_AUTO is not set - CONFIG_SYSVIPC=y - CONFIG_POSIX_MQUEUE=y - CONFIG_AUDIT=y --CONFIG_PREEMPT_VOLUNTARY=y - CONFIG_BSD_PROCESS_ACCT=y - CONFIG_BSD_PROCESS_ACCT_V3=y - CONFIG_RELAY=y -diff --git a/arch/mips/configs/nlm_xlr_defconfig b/arch/mips/configs/nlm_xlr_defconfig -index 4ecb157e56d4..ea7309283b01 100644 ---- a/arch/mips/configs/nlm_xlr_defconfig -+++ b/arch/mips/configs/nlm_xlr_defconfig -@@ -1,10 +1,10 @@ -+CONFIG_PREEMPT=y - # CONFIG_LOCALVERSION_AUTO is not set - CONFIG_SYSVIPC=y - CONFIG_POSIX_MQUEUE=y - CONFIG_AUDIT=y - CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y --CONFIG_PREEMPT_VOLUNTARY=y - CONFIG_BSD_PROCESS_ACCT=y - CONFIG_BSD_PROCESS_ACCT_V3=y - CONFIG_TASKSTATS=y -diff --git a/arch/mips/configs/pic32mzda_defconfig b/arch/mips/configs/pic32mzda_defconfig -index 63fe2da1b37f..7f08ee237345 100644 ---- a/arch/mips/configs/pic32mzda_defconfig -+++ b/arch/mips/configs/pic32mzda_defconfig -@@ -1,7 +1,7 @@ -+CONFIG_PREEMPT=y - CONFIG_SYSVIPC=y - CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y --CONFIG_PREEMPT_VOLUNTARY=y - CONFIG_IKCONFIG=y - 
CONFIG_IKCONFIG_PROC=y - CONFIG_LOG_BUF_SHIFT=14 -diff --git a/arch/mips/configs/pistachio_defconfig b/arch/mips/configs/pistachio_defconfig -index b9adf15ebbec..0025b56dc300 100644 ---- a/arch/mips/configs/pistachio_defconfig -+++ b/arch/mips/configs/pistachio_defconfig -@@ -1,9 +1,9 @@ -+CONFIG_PREEMPT=y - # CONFIG_LOCALVERSION_AUTO is not set - CONFIG_DEFAULT_HOSTNAME="localhost" - CONFIG_SYSVIPC=y - CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y --CONFIG_PREEMPT_VOLUNTARY=y - CONFIG_IKCONFIG=m - CONFIG_IKCONFIG_PROC=y - CONFIG_LOG_BUF_SHIFT=18 -diff --git a/arch/mips/configs/pnx8335_stb225_defconfig b/arch/mips/configs/pnx8335_stb225_defconfig -index d06db6b87959..fb2cd3234d95 100644 ---- a/arch/mips/configs/pnx8335_stb225_defconfig -+++ b/arch/mips/configs/pnx8335_stb225_defconfig -@@ -1,9 +1,9 @@ -+CONFIG_PREEMPT=y - # CONFIG_LOCALVERSION_AUTO is not set - # CONFIG_SWAP is not set - CONFIG_SYSVIPC=y - CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y --CONFIG_PREEMPT_VOLUNTARY=y - CONFIG_LOG_BUF_SHIFT=14 - CONFIG_EXPERT=y - CONFIG_SLAB=y -diff --git a/arch/mips/configs/rm200_defconfig b/arch/mips/configs/rm200_defconfig -index 30d7c3db884e..9e68acfa0d0e 100644 ---- a/arch/mips/configs/rm200_defconfig -+++ b/arch/mips/configs/rm200_defconfig -@@ -1,6 +1,6 @@ -+CONFIG_PREEMPT=y - CONFIG_SYSVIPC=y - CONFIG_POSIX_MQUEUE=y --CONFIG_PREEMPT_VOLUNTARY=y - CONFIG_BSD_PROCESS_ACCT=y - CONFIG_IKCONFIG=y - CONFIG_IKCONFIG_PROC=y -diff --git a/arch/parisc/configs/712_defconfig b/arch/parisc/configs/712_defconfig -new file mode 100644 -index 000000000000..578524f80cc4 ---- /dev/null -+++ b/arch/parisc/configs/712_defconfig -@@ -0,0 +1,181 @@ -+# CONFIG_LOCALVERSION_AUTO is not set -+CONFIG_SYSVIPC=y -+CONFIG_POSIX_MQUEUE=y -+CONFIG_IKCONFIG=y -+CONFIG_IKCONFIG_PROC=y -+CONFIG_LOG_BUF_SHIFT=16 -+CONFIG_BLK_DEV_INITRD=y -+CONFIG_KALLSYMS_ALL=y -+CONFIG_SLAB=y -+CONFIG_PROFILING=y -+CONFIG_OPROFILE=m -+CONFIG_MODULES=y -+CONFIG_MODULE_UNLOAD=y -+CONFIG_MODULE_FORCE_UNLOAD=y 
-+CONFIG_PA7100LC=y -+CONFIG_PREEMPT=y -+CONFIG_GSC_LASI=y -+# CONFIG_PDC_CHASSIS is not set -+CONFIG_BINFMT_MISC=m -+CONFIG_NET=y -+CONFIG_PACKET=y -+CONFIG_UNIX=y -+CONFIG_XFRM_USER=m -+CONFIG_NET_KEY=m -+CONFIG_INET=y -+CONFIG_IP_MULTICAST=y -+CONFIG_IP_PNP=y -+CONFIG_IP_PNP_DHCP=y -+CONFIG_IP_PNP_BOOTP=y -+CONFIG_INET_AH=m -+CONFIG_INET_ESP=m -+CONFIG_INET_DIAG=m -+# CONFIG_IPV6 is not set -+CONFIG_NETFILTER=y -+CONFIG_LLC2=m -+CONFIG_NET_PKTGEN=m -+CONFIG_DEVTMPFS=y -+CONFIG_DEVTMPFS_MOUNT=y -+# CONFIG_STANDALONE is not set -+# CONFIG_PREVENT_FIRMWARE_BUILD is not set -+CONFIG_PARPORT=y -+CONFIG_PARPORT_PC=m -+CONFIG_BLK_DEV_LOOP=y -+CONFIG_BLK_DEV_CRYPTOLOOP=y -+CONFIG_BLK_DEV_RAM=y -+CONFIG_BLK_DEV_RAM_SIZE=6144 -+CONFIG_ATA_OVER_ETH=m -+CONFIG_SCSI=y -+CONFIG_BLK_DEV_SD=y -+CONFIG_CHR_DEV_ST=y -+CONFIG_BLK_DEV_SR=y -+CONFIG_CHR_DEV_SG=y -+CONFIG_SCSI_ISCSI_ATTRS=m -+CONFIG_SCSI_LASI700=y -+CONFIG_SCSI_DEBUG=m -+CONFIG_MD=y -+CONFIG_BLK_DEV_MD=m -+CONFIG_MD_LINEAR=m -+CONFIG_MD_RAID0=m -+CONFIG_MD_RAID1=m -+CONFIG_NETDEVICES=y -+CONFIG_BONDING=m -+CONFIG_DUMMY=m -+CONFIG_TUN=m -+CONFIG_LASI_82596=y -+CONFIG_PPP=m -+CONFIG_PPP_BSDCOMP=m -+CONFIG_PPP_DEFLATE=m -+CONFIG_PPP_MPPE=m -+CONFIG_PPPOE=m -+CONFIG_PPP_ASYNC=m -+CONFIG_PPP_SYNC_TTY=m -+# CONFIG_KEYBOARD_HIL_OLD is not set -+CONFIG_MOUSE_SERIAL=m -+CONFIG_LEGACY_PTY_COUNT=64 -+CONFIG_SERIAL_8250=y -+CONFIG_SERIAL_8250_CONSOLE=y -+CONFIG_SERIAL_8250_NR_UARTS=17 -+CONFIG_SERIAL_8250_EXTENDED=y -+CONFIG_SERIAL_8250_MANY_PORTS=y -+CONFIG_SERIAL_8250_SHARE_IRQ=y -+# CONFIG_SERIAL_MUX is not set -+CONFIG_PDC_CONSOLE=y -+CONFIG_PRINTER=m -+CONFIG_PPDEV=m -+# CONFIG_HW_RANDOM is not set -+CONFIG_RAW_DRIVER=y -+# CONFIG_HWMON is not set -+CONFIG_FB=y -+CONFIG_FB_MODE_HELPERS=y -+CONFIG_FB_TILEBLITTING=y -+CONFIG_DUMMY_CONSOLE_COLUMNS=128 -+CONFIG_DUMMY_CONSOLE_ROWS=48 -+CONFIG_FRAMEBUFFER_CONSOLE=y -+CONFIG_LOGO=y -+# CONFIG_LOGO_LINUX_MONO is not set -+# CONFIG_LOGO_LINUX_VGA16 is not set -+# 
CONFIG_LOGO_LINUX_CLUT224 is not set -+CONFIG_SOUND=y -+CONFIG_SND=y -+CONFIG_SND_SEQUENCER=y -+CONFIG_SND_HARMONY=y -+CONFIG_EXT2_FS=y -+CONFIG_EXT3_FS=y -+CONFIG_JFS_FS=m -+CONFIG_XFS_FS=m -+CONFIG_AUTOFS4_FS=y -+CONFIG_ISO9660_FS=y -+CONFIG_JOLIET=y -+CONFIG_UDF_FS=m -+CONFIG_MSDOS_FS=m -+CONFIG_VFAT_FS=m -+CONFIG_PROC_KCORE=y -+CONFIG_TMPFS=y -+CONFIG_UFS_FS=m -+CONFIG_NFS_FS=y -+CONFIG_NFS_V4=y -+CONFIG_ROOT_NFS=y -+CONFIG_NFSD=m -+CONFIG_NFSD_V4=y -+CONFIG_CIFS=m -+CONFIG_NLS_CODEPAGE_437=m -+CONFIG_NLS_CODEPAGE_737=m -+CONFIG_NLS_CODEPAGE_775=m -+CONFIG_NLS_CODEPAGE_850=m -+CONFIG_NLS_CODEPAGE_852=m -+CONFIG_NLS_CODEPAGE_855=m -+CONFIG_NLS_CODEPAGE_857=m -+CONFIG_NLS_CODEPAGE_860=m -+CONFIG_NLS_CODEPAGE_861=m -+CONFIG_NLS_CODEPAGE_862=m -+CONFIG_NLS_CODEPAGE_863=m -+CONFIG_NLS_CODEPAGE_864=m -+CONFIG_NLS_CODEPAGE_865=m -+CONFIG_NLS_CODEPAGE_866=m -+CONFIG_NLS_CODEPAGE_869=m -+CONFIG_NLS_CODEPAGE_936=m -+CONFIG_NLS_CODEPAGE_950=m -+CONFIG_NLS_CODEPAGE_932=m -+CONFIG_NLS_CODEPAGE_949=m -+CONFIG_NLS_CODEPAGE_874=m -+CONFIG_NLS_ISO8859_8=m -+CONFIG_NLS_CODEPAGE_1250=m -+CONFIG_NLS_CODEPAGE_1251=m -+CONFIG_NLS_ASCII=m -+CONFIG_NLS_ISO8859_1=m -+CONFIG_NLS_ISO8859_2=m -+CONFIG_NLS_ISO8859_3=m -+CONFIG_NLS_ISO8859_4=m -+CONFIG_NLS_ISO8859_5=m -+CONFIG_NLS_ISO8859_6=m -+CONFIG_NLS_ISO8859_7=m -+CONFIG_NLS_ISO8859_9=m -+CONFIG_NLS_ISO8859_13=m -+CONFIG_NLS_ISO8859_14=m -+CONFIG_NLS_ISO8859_15=m -+CONFIG_NLS_KOI8_R=m -+CONFIG_NLS_KOI8_U=m -+CONFIG_NLS_UTF8=m -+CONFIG_DEBUG_FS=y -+CONFIG_MAGIC_SYSRQ=y -+CONFIG_DEBUG_KERNEL=y -+CONFIG_DEBUG_MUTEXES=y -+CONFIG_CRYPTO_TEST=m -+CONFIG_CRYPTO_HMAC=y -+CONFIG_CRYPTO_MICHAEL_MIC=m -+CONFIG_CRYPTO_SHA512=m -+CONFIG_CRYPTO_TGR192=m -+CONFIG_CRYPTO_WP512=m -+CONFIG_CRYPTO_ANUBIS=m -+CONFIG_CRYPTO_BLOWFISH=m -+CONFIG_CRYPTO_CAST6=m -+CONFIG_CRYPTO_KHAZAD=m -+CONFIG_CRYPTO_SERPENT=m -+CONFIG_CRYPTO_TEA=m -+CONFIG_CRYPTO_TWOFISH=m -+CONFIG_CRYPTO_DEFLATE=m -+# CONFIG_CRYPTO_HW is not set -+CONFIG_FONTS=y -+CONFIG_FONT_8x8=y 
-+CONFIG_FONT_8x16=y -diff --git a/arch/parisc/configs/c3000_defconfig b/arch/parisc/configs/c3000_defconfig -new file mode 100644 -index 000000000000..d1bdfad94048 ---- /dev/null -+++ b/arch/parisc/configs/c3000_defconfig -@@ -0,0 +1,151 @@ -+# CONFIG_LOCALVERSION_AUTO is not set -+CONFIG_SYSVIPC=y -+CONFIG_IKCONFIG=y -+CONFIG_IKCONFIG_PROC=y -+CONFIG_LOG_BUF_SHIFT=16 -+CONFIG_BLK_DEV_INITRD=y -+CONFIG_EXPERT=y -+CONFIG_KALLSYMS_ALL=y -+CONFIG_SLAB=y -+CONFIG_PROFILING=y -+CONFIG_OPROFILE=m -+CONFIG_MODULES=y -+CONFIG_MODULE_UNLOAD=y -+CONFIG_MODULE_FORCE_UNLOAD=y -+CONFIG_PA8X00=y -+CONFIG_PREEMPT=y -+# CONFIG_GSC is not set -+CONFIG_PCI=y -+CONFIG_PCI_LBA=y -+# CONFIG_PDC_CHASSIS is not set -+CONFIG_NET=y -+CONFIG_PACKET=y -+CONFIG_UNIX=y -+CONFIG_XFRM_USER=m -+CONFIG_NET_KEY=m -+CONFIG_INET=y -+CONFIG_IP_MULTICAST=y -+CONFIG_IP_PNP=y -+CONFIG_IP_PNP_BOOTP=y -+# CONFIG_INET_DIAG is not set -+CONFIG_INET6_IPCOMP=m -+CONFIG_IPV6_TUNNEL=m -+CONFIG_NETFILTER=y -+CONFIG_NET_PKTGEN=m -+CONFIG_DEVTMPFS=y -+CONFIG_DEVTMPFS_MOUNT=y -+# CONFIG_STANDALONE is not set -+# CONFIG_PREVENT_FIRMWARE_BUILD is not set -+CONFIG_BLK_DEV_UMEM=m -+CONFIG_BLK_DEV_LOOP=y -+CONFIG_BLK_DEV_CRYPTOLOOP=m -+CONFIG_IDE=y -+CONFIG_BLK_DEV_IDECD=y -+CONFIG_BLK_DEV_NS87415=y -+CONFIG_SCSI=y -+CONFIG_BLK_DEV_SD=y -+CONFIG_CHR_DEV_ST=y -+CONFIG_BLK_DEV_SR=y -+CONFIG_CHR_DEV_SG=y -+CONFIG_SCSI_ISCSI_ATTRS=m -+CONFIG_SCSI_SYM53C8XX_2=y -+CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=0 -+CONFIG_SCSI_DEBUG=m -+CONFIG_MD=y -+CONFIG_BLK_DEV_MD=y -+CONFIG_MD_LINEAR=y -+CONFIG_MD_RAID0=y -+CONFIG_MD_RAID1=y -+CONFIG_BLK_DEV_DM=m -+CONFIG_DM_CRYPT=m -+CONFIG_DM_SNAPSHOT=m -+CONFIG_DM_MIRROR=m -+CONFIG_DM_ZERO=m -+CONFIG_DM_MULTIPATH=m -+CONFIG_FUSION=y -+CONFIG_FUSION_SPI=m -+CONFIG_FUSION_CTL=m -+CONFIG_NETDEVICES=y -+CONFIG_BONDING=m -+CONFIG_DUMMY=m -+CONFIG_TUN=m -+CONFIG_ACENIC=m -+CONFIG_TIGON3=m -+CONFIG_NET_TULIP=y -+CONFIG_DE2104X=m -+CONFIG_TULIP=y -+CONFIG_TULIP_MMIO=y -+CONFIG_E100=m 
-+CONFIG_E1000=m -+CONFIG_PPP=m -+CONFIG_PPP_BSDCOMP=m -+CONFIG_PPP_DEFLATE=m -+CONFIG_PPPOE=m -+CONFIG_PPP_ASYNC=m -+CONFIG_PPP_SYNC_TTY=m -+# CONFIG_KEYBOARD_ATKBD is not set -+# CONFIG_MOUSE_PS2 is not set -+CONFIG_SERIO=m -+CONFIG_SERIO_LIBPS2=m -+CONFIG_SERIAL_8250=y -+CONFIG_SERIAL_8250_CONSOLE=y -+CONFIG_SERIAL_8250_NR_UARTS=13 -+CONFIG_SERIAL_8250_EXTENDED=y -+CONFIG_SERIAL_8250_MANY_PORTS=y -+CONFIG_SERIAL_8250_SHARE_IRQ=y -+# CONFIG_HW_RANDOM is not set -+CONFIG_RAW_DRIVER=y -+# CONFIG_HWMON is not set -+CONFIG_FB=y -+CONFIG_FRAMEBUFFER_CONSOLE=y -+CONFIG_LOGO=y -+# CONFIG_LOGO_LINUX_MONO is not set -+# CONFIG_LOGO_LINUX_VGA16 is not set -+# CONFIG_LOGO_LINUX_CLUT224 is not set -+CONFIG_SOUND=y -+CONFIG_SND=y -+CONFIG_SND_SEQUENCER=y -+CONFIG_SND_AD1889=y -+CONFIG_USB_HIDDEV=y -+CONFIG_USB=y -+CONFIG_USB_OHCI_HCD=y -+CONFIG_USB_PRINTER=m -+CONFIG_USB_STORAGE=m -+CONFIG_USB_STORAGE_USBAT=m -+CONFIG_USB_STORAGE_SDDR09=m -+CONFIG_USB_STORAGE_SDDR55=m -+CONFIG_USB_STORAGE_JUMPSHOT=m -+CONFIG_USB_MDC800=m -+CONFIG_USB_MICROTEK=m -+CONFIG_USB_LEGOTOWER=m -+CONFIG_EXT2_FS=y -+CONFIG_EXT3_FS=y -+CONFIG_XFS_FS=m -+CONFIG_AUTOFS4_FS=y -+CONFIG_ISO9660_FS=y -+CONFIG_JOLIET=y -+CONFIG_MSDOS_FS=m -+CONFIG_VFAT_FS=m -+CONFIG_PROC_KCORE=y -+CONFIG_TMPFS=y -+CONFIG_NFS_FS=y -+CONFIG_ROOT_NFS=y -+CONFIG_NFSD=y -+CONFIG_NFSD_V3=y -+CONFIG_NLS_CODEPAGE_437=m -+CONFIG_NLS_CODEPAGE_850=m -+CONFIG_NLS_ASCII=m -+CONFIG_NLS_ISO8859_1=m -+CONFIG_NLS_ISO8859_15=m -+CONFIG_NLS_UTF8=m -+CONFIG_DEBUG_FS=y -+CONFIG_HEADERS_INSTALL=y -+CONFIG_HEADERS_CHECK=y -+CONFIG_MAGIC_SYSRQ=y -+CONFIG_DEBUG_MUTEXES=y -+# CONFIG_DEBUG_BUGVERBOSE is not set -+CONFIG_CRYPTO_TEST=m -+CONFIG_CRYPTO_MD5=m -+CONFIG_CRYPTO_BLOWFISH=m -+CONFIG_CRYPTO_DES=m -+# CONFIG_CRYPTO_HW is not set -diff --git a/arch/parisc/configs/defconfig b/arch/parisc/configs/defconfig -new file mode 100644 -index 000000000000..0d976614934c ---- /dev/null -+++ b/arch/parisc/configs/defconfig -@@ -0,0 +1,206 @@ -+# 
CONFIG_LOCALVERSION_AUTO is not set -+CONFIG_SYSVIPC=y -+CONFIG_POSIX_MQUEUE=y -+CONFIG_IKCONFIG=y -+CONFIG_IKCONFIG_PROC=y -+CONFIG_LOG_BUF_SHIFT=16 -+CONFIG_BLK_DEV_INITRD=y -+CONFIG_KALLSYMS_ALL=y -+CONFIG_SLAB=y -+CONFIG_PROFILING=y -+CONFIG_OPROFILE=m -+CONFIG_MODULES=y -+CONFIG_MODULE_UNLOAD=y -+CONFIG_MODULE_FORCE_UNLOAD=y -+# CONFIG_BLK_DEV_BSG is not set -+CONFIG_PA7100LC=y -+CONFIG_PREEMPT=y -+CONFIG_IOMMU_CCIO=y -+CONFIG_GSC_LASI=y -+CONFIG_GSC_WAX=y -+CONFIG_EISA=y -+CONFIG_PCI=y -+CONFIG_GSC_DINO=y -+CONFIG_PCI_LBA=y -+CONFIG_PCCARD=y -+CONFIG_YENTA=y -+CONFIG_PD6729=y -+CONFIG_I82092=y -+CONFIG_BINFMT_MISC=m -+CONFIG_NET=y -+CONFIG_PACKET=y -+CONFIG_UNIX=y -+CONFIG_XFRM_USER=m -+CONFIG_NET_KEY=m -+CONFIG_INET=y -+CONFIG_IP_MULTICAST=y -+CONFIG_IP_PNP=y -+CONFIG_IP_PNP_DHCP=y -+CONFIG_IP_PNP_BOOTP=y -+CONFIG_INET_AH=m -+CONFIG_INET_ESP=m -+CONFIG_INET_DIAG=m -+CONFIG_INET6_AH=y -+CONFIG_INET6_ESP=y -+CONFIG_INET6_IPCOMP=y -+CONFIG_LLC2=m -+CONFIG_DEVTMPFS=y -+CONFIG_DEVTMPFS_MOUNT=y -+# CONFIG_STANDALONE is not set -+# CONFIG_PREVENT_FIRMWARE_BUILD is not set -+CONFIG_PARPORT=y -+CONFIG_PARPORT_PC=m -+CONFIG_PARPORT_PC_PCMCIA=m -+CONFIG_PARPORT_1284=y -+CONFIG_BLK_DEV_LOOP=y -+CONFIG_BLK_DEV_CRYPTOLOOP=y -+CONFIG_BLK_DEV_RAM=y -+CONFIG_BLK_DEV_RAM_SIZE=6144 -+CONFIG_IDE=y -+CONFIG_BLK_DEV_IDECS=y -+CONFIG_BLK_DEV_IDECD=y -+CONFIG_BLK_DEV_GENERIC=y -+CONFIG_BLK_DEV_NS87415=y -+CONFIG_SCSI=y -+CONFIG_BLK_DEV_SD=y -+CONFIG_CHR_DEV_ST=y -+CONFIG_BLK_DEV_SR=y -+CONFIG_CHR_DEV_SG=y -+CONFIG_SCSI_LASI700=y -+CONFIG_SCSI_SYM53C8XX_2=y -+CONFIG_SCSI_ZALON=y -+CONFIG_MD=y -+CONFIG_BLK_DEV_MD=y -+CONFIG_MD_LINEAR=y -+CONFIG_MD_RAID0=y -+CONFIG_MD_RAID1=y -+CONFIG_MD_RAID10=y -+CONFIG_BLK_DEV_DM=y -+CONFIG_NETDEVICES=y -+CONFIG_BONDING=m -+CONFIG_DUMMY=m -+CONFIG_TUN=m -+CONFIG_ACENIC=y -+CONFIG_TIGON3=y -+CONFIG_NET_TULIP=y -+CONFIG_TULIP=y -+CONFIG_LASI_82596=y -+CONFIG_PPP=m -+CONFIG_PPP_BSDCOMP=m -+CONFIG_PPP_DEFLATE=m -+CONFIG_PPPOE=m -+CONFIG_PPP_ASYNC=m 
-+CONFIG_PPP_SYNC_TTY=m -+# CONFIG_KEYBOARD_HIL_OLD is not set -+CONFIG_MOUSE_SERIAL=y -+CONFIG_LEGACY_PTY_COUNT=64 -+CONFIG_SERIAL_8250=y -+CONFIG_SERIAL_8250_CONSOLE=y -+CONFIG_SERIAL_8250_CS=y -+CONFIG_SERIAL_8250_NR_UARTS=17 -+CONFIG_SERIAL_8250_EXTENDED=y -+CONFIG_SERIAL_8250_MANY_PORTS=y -+CONFIG_SERIAL_8250_SHARE_IRQ=y -+CONFIG_PRINTER=m -+CONFIG_PPDEV=m -+# CONFIG_HW_RANDOM is not set -+# CONFIG_HWMON is not set -+CONFIG_FB=y -+CONFIG_FB_MODE_HELPERS=y -+CONFIG_FB_TILEBLITTING=y -+CONFIG_DUMMY_CONSOLE_COLUMNS=128 -+CONFIG_DUMMY_CONSOLE_ROWS=48 -+CONFIG_FRAMEBUFFER_CONSOLE=y -+CONFIG_LOGO=y -+# CONFIG_LOGO_LINUX_MONO is not set -+# CONFIG_LOGO_LINUX_VGA16 is not set -+# CONFIG_LOGO_LINUX_CLUT224 is not set -+CONFIG_SOUND=y -+CONFIG_SND=y -+CONFIG_SND_DYNAMIC_MINORS=y -+CONFIG_SND_SEQUENCER=y -+CONFIG_SND_AD1889=y -+CONFIG_SND_HARMONY=y -+CONFIG_HID_GYRATION=y -+CONFIG_HID_NTRIG=y -+CONFIG_HID_PANTHERLORD=y -+CONFIG_HID_PETALYNX=y -+CONFIG_HID_SAMSUNG=y -+CONFIG_HID_SUNPLUS=y -+CONFIG_HID_TOPSEED=y -+CONFIG_USB=y -+CONFIG_USB_MON=y -+CONFIG_USB_OHCI_HCD=y -+CONFIG_USB_UHCI_HCD=y -+CONFIG_EXT2_FS=y -+CONFIG_EXT3_FS=y -+CONFIG_ISO9660_FS=y -+CONFIG_JOLIET=y -+CONFIG_VFAT_FS=y -+CONFIG_PROC_KCORE=y -+CONFIG_TMPFS=y -+CONFIG_NFS_FS=y -+CONFIG_ROOT_NFS=y -+CONFIG_NFSD=y -+CONFIG_NFSD_V4=y -+CONFIG_CIFS=m -+CONFIG_NLS_CODEPAGE_437=y -+CONFIG_NLS_CODEPAGE_737=m -+CONFIG_NLS_CODEPAGE_775=m -+CONFIG_NLS_CODEPAGE_850=m -+CONFIG_NLS_CODEPAGE_852=m -+CONFIG_NLS_CODEPAGE_855=m -+CONFIG_NLS_CODEPAGE_857=m -+CONFIG_NLS_CODEPAGE_860=m -+CONFIG_NLS_CODEPAGE_861=m -+CONFIG_NLS_CODEPAGE_862=m -+CONFIG_NLS_CODEPAGE_863=m -+CONFIG_NLS_CODEPAGE_864=m -+CONFIG_NLS_CODEPAGE_865=m -+CONFIG_NLS_CODEPAGE_866=m -+CONFIG_NLS_CODEPAGE_869=m -+CONFIG_NLS_CODEPAGE_936=m -+CONFIG_NLS_CODEPAGE_950=m -+CONFIG_NLS_CODEPAGE_932=m -+CONFIG_NLS_CODEPAGE_949=m -+CONFIG_NLS_CODEPAGE_874=m -+CONFIG_NLS_ISO8859_8=m -+CONFIG_NLS_CODEPAGE_1250=y -+CONFIG_NLS_CODEPAGE_1251=m -+CONFIG_NLS_ASCII=m 
-+CONFIG_NLS_ISO8859_1=y -+CONFIG_NLS_ISO8859_2=m -+CONFIG_NLS_ISO8859_3=m -+CONFIG_NLS_ISO8859_4=m -+CONFIG_NLS_ISO8859_5=m -+CONFIG_NLS_ISO8859_6=m -+CONFIG_NLS_ISO8859_7=m -+CONFIG_NLS_ISO8859_9=m -+CONFIG_NLS_ISO8859_13=m -+CONFIG_NLS_ISO8859_14=m -+CONFIG_NLS_ISO8859_15=m -+CONFIG_NLS_KOI8_R=m -+CONFIG_NLS_KOI8_U=m -+CONFIG_NLS_UTF8=y -+CONFIG_DEBUG_FS=y -+CONFIG_HEADERS_INSTALL=y -+CONFIG_HEADERS_CHECK=y -+CONFIG_MAGIC_SYSRQ=y -+CONFIG_DEBUG_KERNEL=y -+CONFIG_DEBUG_MUTEXES=y -+CONFIG_KEYS=y -+CONFIG_CRYPTO_TEST=m -+CONFIG_CRYPTO_MICHAEL_MIC=m -+CONFIG_CRYPTO_SHA512=m -+CONFIG_CRYPTO_TGR192=m -+CONFIG_CRYPTO_WP512=m -+CONFIG_CRYPTO_ANUBIS=m -+CONFIG_CRYPTO_BLOWFISH=m -+CONFIG_CRYPTO_CAST6=m -+CONFIG_CRYPTO_KHAZAD=m -+CONFIG_CRYPTO_SERPENT=m -+CONFIG_CRYPTO_TEA=m -+CONFIG_CRYPTO_TWOFISH=m -+# CONFIG_CRYPTO_HW is not set -+CONFIG_LIBCRC32C=m -+CONFIG_FONTS=y -diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig -index 787e829b6f25..22914bbb4caa 100644 ---- a/arch/powerpc/Kconfig -+++ b/arch/powerpc/Kconfig -@@ -882,6 +882,8 @@ config SCHED_SMT - when dealing with POWER5 cpus at a cost of slightly increased - overhead in some places. If unsure say N here. 
- -+source "kernel/Kconfig.MuQSS" -+ - config PPC_DENORMALISATION - bool "PowerPC denormalisation exception handling" - depends on PPC_BOOK3S_64 -diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig -index 66e9a0fd64ff..c8531232efb7 100644 ---- a/arch/powerpc/configs/ppc6xx_defconfig -+++ b/arch/powerpc/configs/ppc6xx_defconfig -@@ -73,7 +73,7 @@ CONFIG_QE_GPIO=y - CONFIG_MCU_MPC8349EMITX=y - CONFIG_HIGHMEM=y - CONFIG_HZ_1000=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_BINFMT_MISC=y - CONFIG_HIBERNATION=y - CONFIG_PM_DEBUG=y -diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c -index f18d5067cd0f..fe489fc01c73 100644 ---- a/arch/powerpc/platforms/cell/spufs/sched.c -+++ b/arch/powerpc/platforms/cell/spufs/sched.c -@@ -51,11 +51,6 @@ static struct task_struct *spusched_task; - static struct timer_list spusched_timer; - static struct timer_list spuloadavg_timer; - --/* -- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0). -- */ --#define NORMAL_PRIO 120 -- - /* - * Frequency of the spu scheduler tick. By default we do one SPU scheduler - * tick for every 10 CPU scheduler ticks. 
-diff --git a/arch/sh/configs/se7712_defconfig b/arch/sh/configs/se7712_defconfig -index ee6d28ae08de..827e4693c5b2 100644 ---- a/arch/sh/configs/se7712_defconfig -+++ b/arch/sh/configs/se7712_defconfig -@@ -21,7 +21,7 @@ CONFIG_FLATMEM_MANUAL=y - CONFIG_SH_SOLUTION_ENGINE=y - CONFIG_SH_PCLK_FREQ=66666666 - CONFIG_HEARTBEAT=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_CMDLINE_OVERWRITE=y - CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda1" - CONFIG_NET=y -diff --git a/arch/sh/configs/se7721_defconfig b/arch/sh/configs/se7721_defconfig -index bad921bc10f8..e8f42bc0d370 100644 ---- a/arch/sh/configs/se7721_defconfig -+++ b/arch/sh/configs/se7721_defconfig -@@ -21,7 +21,7 @@ CONFIG_FLATMEM_MANUAL=y - CONFIG_SH_7721_SOLUTION_ENGINE=y - CONFIG_SH_PCLK_FREQ=33333333 - CONFIG_HEARTBEAT=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_CMDLINE_OVERWRITE=y - CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda2" - CONFIG_NET=y -diff --git a/arch/sh/configs/titan_defconfig b/arch/sh/configs/titan_defconfig -index ba887f1351be..4434e93b70bc 100644 ---- a/arch/sh/configs/titan_defconfig -+++ b/arch/sh/configs/titan_defconfig -@@ -19,7 +19,7 @@ CONFIG_SH_TITAN=y - CONFIG_SH_PCLK_FREQ=30000000 - CONFIG_SH_DMA=y - CONFIG_SH_DMA_API=y --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_CMDLINE_OVERWRITE=y - CONFIG_CMDLINE="console=ttySC1,38400N81 root=/dev/nfs ip=:::::eth1:autoconf rw" - CONFIG_PCI=y -diff --git a/arch/sparc/configs/sparc64_defconfig b/arch/sparc/configs/sparc64_defconfig -index bde4d21a8ac8..c054ec82d91b 100644 ---- a/arch/sparc/configs/sparc64_defconfig -+++ b/arch/sparc/configs/sparc64_defconfig -@@ -22,7 +22,7 @@ CONFIG_NO_HZ=y - CONFIG_HIGH_RES_TIMERS=y - CONFIG_NUMA=y - CONFIG_DEFAULT_MMAP_MIN_ADDR=8192 --CONFIG_PREEMPT_VOLUNTARY=y -+CONFIG_PREEMPT=y - CONFIG_SUN_LDOMS=y - CONFIG_PCI=y - CONFIG_PCI_MSI=y -diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig -index 7101ac64bb20..6f56ad1894d1 100644 ---- a/arch/x86/Kconfig -+++ 
b/arch/x86/Kconfig -@@ -1005,6 +1005,22 @@ config NR_CPUS - config SCHED_SMT - def_bool y if SMP - -+config SMT_NICE -+ bool "SMT (Hyperthreading) aware nice priority and policy support" -+ depends on SCHED_MUQSS && SCHED_SMT -+ default y -+ help -+ Enabling Hyperthreading on Intel CPUs decreases the effectiveness -+ of the use of 'nice' levels and different scheduling policies -+ (e.g. realtime) due to sharing of CPU power between hyperthreads. -+ SMT nice support makes each logical CPU aware of what is running on -+ its hyperthread siblings, maintaining appropriate distribution of -+ CPU according to nice levels and scheduling policies at the expense -+ of slightly increased overhead. -+ -+ If unsure say Y here. -+ -+ - config SCHED_MC - def_bool y - prompt "Multi-core scheduler support" -@@ -1035,6 +1051,8 @@ config SCHED_MC_PRIO - - If unsure say Y here. - -+source "kernel/Kconfig.MuQSS" -+ - config UP_LATE_INIT - def_bool y - depends on !SMP && X86_LOCAL_APIC -@@ -1419,7 +1437,7 @@ config HIGHMEM64G - endchoice - - choice -- prompt "Memory split" if EXPERT -+ prompt "Memory split" - default VMSPLIT_3G - depends on X86_32 - help -@@ -1439,17 +1457,17 @@ choice - option alone! 
- - config VMSPLIT_3G -- bool "3G/1G user/kernel split" -+ bool "Default 896MB lowmem (3G/1G user/kernel split)" - config VMSPLIT_3G_OPT - depends on !X86_PAE -- bool "3G/1G user/kernel split (for full 1G low memory)" -+ bool "1GB lowmem (3G/1G user/kernel split)" - config VMSPLIT_2G -- bool "2G/2G user/kernel split" -+ bool "2GB lowmem (2G/2G user/kernel split)" - config VMSPLIT_2G_OPT - depends on !X86_PAE -- bool "2G/2G user/kernel split (for full 2G low memory)" -+ bool "2GB lowmem (2G/2G user/kernel split)" - config VMSPLIT_1G -- bool "1G/3G user/kernel split" -+ bool "3GB lowmem (1G/3G user/kernel split)" - endchoice - - config PAGE_OFFSET -diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig -index 78210793d357..0c4415b23002 100644 ---- a/arch/x86/configs/i386_defconfig -+++ b/arch/x86/configs/i386_defconfig -@@ -23,6 +23,8 @@ CONFIG_PROFILING=y - CONFIG_SMP=y - CONFIG_X86_GENERIC=y - CONFIG_HPET_TIMER=y -+CONFIG_SCHED_SMT=y -+CONFIG_PREEMPT=y - CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y - CONFIG_X86_REBOOTFIXUPS=y - CONFIG_MICROCODE_AMD=y -diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig -index 9936528e1939..328c7d0a38a1 100644 ---- a/arch/x86/configs/x86_64_defconfig -+++ b/arch/x86/configs/x86_64_defconfig -@@ -20,6 +20,9 @@ CONFIG_BLK_DEV_INITRD=y - # CONFIG_COMPAT_BRK is not set - CONFIG_PROFILING=y - CONFIG_SMP=y -+CONFIG_NR_CPUS=64 -+CONFIG_SCHED_SMT=y -+CONFIG_PREEMPT=y - CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y - CONFIG_MICROCODE_AMD=y - CONFIG_X86_MSR=y -diff --git a/drivers/accessibility/speakup/speakup_acntpc.c b/drivers/accessibility/speakup/speakup_acntpc.c -index c94328a5bd4a..6e7d4671aa69 100644 ---- a/drivers/accessibility/speakup/speakup_acntpc.c -+++ b/drivers/accessibility/speakup/speakup_acntpc.c -@@ -198,7 +198,7 @@ static void do_catch_up(struct spk_synth *synth) - full_time_val = full_time->u.n.value; - spin_unlock_irqrestore(&speakup_info.spinlock, flags); - if 
(synth_full()) { -- schedule_timeout(msecs_to_jiffies(full_time_val)); -+ schedule_msec_hrtimeout((full_time_val)); - continue; - } - set_current_state(TASK_RUNNING); -@@ -226,7 +226,7 @@ static void do_catch_up(struct spk_synth *synth) - jiffy_delta_val = jiffy_delta->u.n.value; - delay_time_val = delay_time->u.n.value; - spin_unlock_irqrestore(&speakup_info.spinlock, flags); -- schedule_timeout(msecs_to_jiffies(delay_time_val)); -+ schedule_msec_hrtimeout(delay_time_val); - jiff_max = jiffies + jiffy_delta_val; - } - } -diff --git a/drivers/accessibility/speakup/speakup_apollo.c b/drivers/accessibility/speakup/speakup_apollo.c -index 0877b4044c28..627102d048c1 100644 ---- a/drivers/accessibility/speakup/speakup_apollo.c -+++ b/drivers/accessibility/speakup/speakup_apollo.c -@@ -165,7 +165,7 @@ static void do_catch_up(struct spk_synth *synth) - if (!synth->io_ops->synth_out(synth, ch)) { - synth->io_ops->tiocmset(0, UART_MCR_RTS); - synth->io_ops->tiocmset(UART_MCR_RTS, 0); -- schedule_timeout(msecs_to_jiffies(full_time_val)); -+ schedule_msec_hrtimeout(full_time_val); - continue; - } - if (time_after_eq(jiffies, jiff_max) && (ch == SPACE)) { -diff --git a/drivers/accessibility/speakup/speakup_decext.c b/drivers/accessibility/speakup/speakup_decext.c -index 7408eb29cf38..938a0c35968f 100644 ---- a/drivers/accessibility/speakup/speakup_decext.c -+++ b/drivers/accessibility/speakup/speakup_decext.c -@@ -180,7 +180,7 @@ static void do_catch_up(struct spk_synth *synth) - if (ch == '\n') - ch = 0x0D; - if (synth_full() || !synth->io_ops->synth_out(synth, ch)) { -- schedule_timeout(msecs_to_jiffies(delay_time_val)); -+ schedule_msec_hrtimeout(delay_time_val); - continue; - } - set_current_state(TASK_RUNNING); -diff --git a/drivers/accessibility/speakup/speakup_decpc.c b/drivers/accessibility/speakup/speakup_decpc.c -index 96f24c848cc5..1130dfe4da6c 100644 ---- a/drivers/accessibility/speakup/speakup_decpc.c -+++ b/drivers/accessibility/speakup/speakup_decpc.c -@@ -398,7 
+398,7 @@ static void do_catch_up(struct spk_synth *synth) - if (ch == '\n') - ch = 0x0D; - if (dt_sendchar(ch)) { -- schedule_timeout(msecs_to_jiffies(delay_time_val)); -+ schedule_msec_hrtimeout((delay_time_val)); - continue; - } - set_current_state(TASK_RUNNING); -diff --git a/drivers/accessibility/speakup/speakup_dectlk.c b/drivers/accessibility/speakup/speakup_dectlk.c -index 780214b5ca16..7b91594c57aa 100644 ---- a/drivers/accessibility/speakup/speakup_dectlk.c -+++ b/drivers/accessibility/speakup/speakup_dectlk.c -@@ -247,7 +247,7 @@ static void do_catch_up(struct spk_synth *synth) - if (ch == '\n') - ch = 0x0D; - if (synth_full_val || !synth->io_ops->synth_out(synth, ch)) { -- schedule_timeout(msecs_to_jiffies(delay_time_val)); -+ schedule_msec_hrtimeout(delay_time_val); - continue; - } - set_current_state(TASK_RUNNING); -diff --git a/drivers/accessibility/speakup/speakup_dtlk.c b/drivers/accessibility/speakup/speakup_dtlk.c -index dbebed0eeeec..6d83c13ca4a6 100644 ---- a/drivers/accessibility/speakup/speakup_dtlk.c -+++ b/drivers/accessibility/speakup/speakup_dtlk.c -@@ -211,7 +211,7 @@ static void do_catch_up(struct spk_synth *synth) - delay_time_val = delay_time->u.n.value; - spin_unlock_irqrestore(&speakup_info.spinlock, flags); - if (synth_full()) { -- schedule_timeout(msecs_to_jiffies(delay_time_val)); -+ schedule_msec_hrtimeout((delay_time_val)); - continue; - } - set_current_state(TASK_RUNNING); -@@ -227,7 +227,7 @@ static void do_catch_up(struct spk_synth *synth) - delay_time_val = delay_time->u.n.value; - jiffy_delta_val = jiffy_delta->u.n.value; - spin_unlock_irqrestore(&speakup_info.spinlock, flags); -- schedule_timeout(msecs_to_jiffies(delay_time_val)); -+ schedule_msec_hrtimeout((delay_time_val)); - jiff_max = jiffies + jiffy_delta_val; - } - } -diff --git a/drivers/accessibility/speakup/speakup_keypc.c b/drivers/accessibility/speakup/speakup_keypc.c -index 414827e888fc..cb31c9176daa 100644 ---- a/drivers/accessibility/speakup/speakup_keypc.c 
-+++ b/drivers/accessibility/speakup/speakup_keypc.c -@@ -199,7 +199,7 @@ static void do_catch_up(struct spk_synth *synth) - full_time_val = full_time->u.n.value; - spin_unlock_irqrestore(&speakup_info.spinlock, flags); - if (synth_full()) { -- schedule_timeout(msecs_to_jiffies(full_time_val)); -+ schedule_msec_hrtimeout((full_time_val)); - continue; - } - set_current_state(TASK_RUNNING); -@@ -232,7 +232,7 @@ static void do_catch_up(struct spk_synth *synth) - jiffy_delta_val = jiffy_delta->u.n.value; - delay_time_val = delay_time->u.n.value; - spin_unlock_irqrestore(&speakup_info.spinlock, flags); -- schedule_timeout(msecs_to_jiffies(delay_time_val)); -+ schedule_msec_hrtimeout(delay_time_val); - jiff_max = jiffies + jiffy_delta_val; - } - } -diff --git a/drivers/accessibility/speakup/synth.c b/drivers/accessibility/speakup/synth.c -index ac47dbac7207..09f6ba829dfd 100644 ---- a/drivers/accessibility/speakup/synth.c -+++ b/drivers/accessibility/speakup/synth.c -@@ -93,12 +93,8 @@ static void _spk_do_catch_up(struct spk_synth *synth, int unicode) - spin_unlock_irqrestore(&speakup_info.spinlock, flags); - if (ch == '\n') - ch = synth->procspeech; -- if (unicode) -- ret = synth->io_ops->synth_out_unicode(synth, ch); -- else -- ret = synth->io_ops->synth_out(synth, ch); -- if (!ret) { -- schedule_timeout(msecs_to_jiffies(full_time_val)); -+ if (!synth->io_ops->synth_out(synth, ch)) { -+ schedule_msec_hrtimeout(full_time_val); - continue; - } - if (time_after_eq(jiffies, jiff_max) && (ch == SPACE)) { -@@ -108,11 +104,9 @@ static void _spk_do_catch_up(struct spk_synth *synth, int unicode) - full_time_val = full_time->u.n.value; - spin_unlock_irqrestore(&speakup_info.spinlock, flags); - if (synth->io_ops->synth_out(synth, synth->procspeech)) -- schedule_timeout( -- msecs_to_jiffies(delay_time_val)); -+ schedule_msec_hrtimeout(delay_time_val); - else -- schedule_timeout( -- msecs_to_jiffies(full_time_val)); -+ schedule_msec_hrtimeout(full_time_val); - jiff_max = jiffies + 
jiffy_delta_val; - } - set_current_state(TASK_RUNNING); -diff --git a/drivers/block/swim.c b/drivers/block/swim.c -index dd34504382e5..0caa1c7e9223 100644 ---- a/drivers/block/swim.c -+++ b/drivers/block/swim.c -@@ -328,7 +328,7 @@ static inline void swim_motor(struct swim __iomem *base, - if (swim_readbit(base, MOTOR_ON)) - break; - set_current_state(TASK_INTERRUPTIBLE); -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - } - } else if (action == OFF) { - swim_action(base, MOTOR_OFF); -@@ -347,7 +347,7 @@ static inline void swim_eject(struct swim __iomem *base) - if (!swim_readbit(base, DISK_IN)) - break; - set_current_state(TASK_INTERRUPTIBLE); -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - } - swim_select(base, RELAX); - } -@@ -372,6 +372,7 @@ static inline int swim_step(struct swim __iomem *base) - - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(1); -+ schedule_min_hrtimeout(); - - swim_select(base, RELAX); - if (!swim_readbit(base, STEP)) -diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c -index 737c0b6b24ea..a3db1f42bb3b 100644 ---- a/drivers/char/ipmi/ipmi_msghandler.c -+++ b/drivers/char/ipmi/ipmi_msghandler.c -@@ -3542,7 +3542,7 @@ static void cleanup_smi_msgs(struct ipmi_smi *intf) - /* Current message first, to preserve order */ - while (intf->curr_msg && !list_empty(&intf->waiting_rcv_msgs)) { - /* Wait for the message to clear out. */ -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - } - - /* No need for locks, the interface is down. */ -diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c -index 0416b9c9d410..9ce5fae0f1cf 100644 ---- a/drivers/char/ipmi/ipmi_ssif.c -+++ b/drivers/char/ipmi/ipmi_ssif.c -@@ -1288,7 +1288,7 @@ static void shutdown_ssif(void *send_info) - - /* make sure the driver is not looking for flags any more. 
*/ - while (ssif_info->ssif_state != SSIF_NORMAL) -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - - ssif_info->stopping = true; - del_timer_sync(&ssif_info->watch_timer); -diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c -index a95156fc5db7..8f07c8900184 100644 ---- a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c -+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c -@@ -235,7 +235,7 @@ static int vmw_fifo_wait_noirq(struct vmw_private *dev_priv, - DRM_ERROR("SVGA device lockup.\n"); - break; - } -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - if (interruptible && signal_pending(current)) { - ret = -ERESTARTSYS; - break; -diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c b/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c -index 75f3efee21a4..09b1932ce85b 100644 ---- a/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c -+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c -@@ -203,7 +203,7 @@ int vmw_fallback_wait(struct vmw_private *dev_priv, - break; - } - if (lazy) -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - else if ((++count & 0x0F) == 0) { - /** - * FIXME: Use schedule_hr_timeout here for -diff --git a/drivers/hwmon/fam15h_power.c b/drivers/hwmon/fam15h_power.c -index 29f5fed28c2a..974cb08c7aa7 100644 ---- a/drivers/hwmon/fam15h_power.c -+++ b/drivers/hwmon/fam15h_power.c -@@ -221,7 +221,7 @@ static ssize_t power1_average_show(struct device *dev, - prev_ptsc[cu] = data->cpu_sw_pwr_ptsc[cu]; - } - -- leftover = schedule_timeout_interruptible(msecs_to_jiffies(data->power_period)); -+ leftover = schedule_msec_hrtimeout_interruptible((data->power_period)); - if (leftover) - return 0; - -diff --git a/drivers/iio/light/tsl2563.c b/drivers/iio/light/tsl2563.c -index abc8d7db8dc1..baa9d6338a52 100644 ---- a/drivers/iio/light/tsl2563.c -+++ b/drivers/iio/light/tsl2563.c -@@ -269,11 +269,7 @@ static void tsl2563_wait_adc(struct tsl2563_chip *chip) - default: - delay = 402; - } -- /* -- * TODO: Make sure that we wait at least required delay but why we -- * have to 
extend it one tick more? -- */ -- schedule_timeout_interruptible(msecs_to_jiffies(delay) + 2); -+ schedule_msec_hrtimeout_interruptible(delay + 1); - } - - static int tsl2563_adjust_gainlevel(struct tsl2563_chip *chip, u16 adc) -diff --git a/drivers/media/i2c/msp3400-driver.c b/drivers/media/i2c/msp3400-driver.c -index 39530d43590e..a7caf2eb5771 100644 ---- a/drivers/media/i2c/msp3400-driver.c -+++ b/drivers/media/i2c/msp3400-driver.c -@@ -170,7 +170,7 @@ static int msp_read(struct i2c_client *client, int dev, int addr) - break; - dev_warn(&client->dev, "I/O error #%d (read 0x%02x/0x%02x)\n", err, - dev, addr); -- schedule_timeout_interruptible(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout_interruptible((10)); - } - if (err == 3) { - dev_warn(&client->dev, "resetting chip, sound will go off.\n"); -@@ -211,7 +211,7 @@ static int msp_write(struct i2c_client *client, int dev, int addr, int val) - break; - dev_warn(&client->dev, "I/O error #%d (write 0x%02x/0x%02x)\n", err, - dev, addr); -- schedule_timeout_interruptible(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout_interruptible((10)); - } - if (err == 3) { - dev_warn(&client->dev, "resetting chip, sound will go off.\n"); -diff --git a/drivers/media/pci/cx18/cx18-gpio.c b/drivers/media/pci/cx18/cx18-gpio.c -index cf7cfda94107..f63e17489547 100644 ---- a/drivers/media/pci/cx18/cx18-gpio.c -+++ b/drivers/media/pci/cx18/cx18-gpio.c -@@ -81,11 +81,11 @@ static void gpio_reset_seq(struct cx18 *cx, u32 active_lo, u32 active_hi, - - /* Assert */ - gpio_update(cx, mask, ~active_lo); -- schedule_timeout_uninterruptible(msecs_to_jiffies(assert_msecs)); -+ schedule_msec_hrtimeout_uninterruptible((assert_msecs)); - - /* Deassert */ - gpio_update(cx, mask, ~active_hi); -- schedule_timeout_uninterruptible(msecs_to_jiffies(recovery_msecs)); -+ schedule_msec_hrtimeout_uninterruptible((recovery_msecs)); - } - - /* -diff --git a/drivers/media/pci/ivtv/ivtv-gpio.c b/drivers/media/pci/ivtv/ivtv-gpio.c -index 
856e7ab7f33e..766a26251337 100644 ---- a/drivers/media/pci/ivtv/ivtv-gpio.c -+++ b/drivers/media/pci/ivtv/ivtv-gpio.c -@@ -105,7 +105,7 @@ void ivtv_reset_ir_gpio(struct ivtv *itv) - curout = (curout & ~0xF) | 1; - write_reg(curout, IVTV_REG_GPIO_OUT); - /* We could use something else for smaller time */ -- schedule_timeout_interruptible(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout_interruptible((1)); - curout |= 2; - write_reg(curout, IVTV_REG_GPIO_OUT); - curdir &= ~0x80; -@@ -125,11 +125,11 @@ int ivtv_reset_tuner_gpio(void *dev, int component, int cmd, int value) - curout = read_reg(IVTV_REG_GPIO_OUT); - curout &= ~(1 << itv->card->xceive_pin); - write_reg(curout, IVTV_REG_GPIO_OUT); -- schedule_timeout_interruptible(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout_interruptible((1)); - - curout |= 1 << itv->card->xceive_pin; - write_reg(curout, IVTV_REG_GPIO_OUT); -- schedule_timeout_interruptible(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout_interruptible((1)); - return 0; - } - -diff --git a/drivers/media/pci/ivtv/ivtv-ioctl.c b/drivers/media/pci/ivtv/ivtv-ioctl.c -index 35dccb31174c..8181cd65e876 100644 ---- a/drivers/media/pci/ivtv/ivtv-ioctl.c -+++ b/drivers/media/pci/ivtv/ivtv-ioctl.c -@@ -1139,7 +1139,7 @@ void ivtv_s_std_dec(struct ivtv *itv, v4l2_std_id std) - TASK_UNINTERRUPTIBLE); - if ((read_reg(IVTV_REG_DEC_LINE_FIELD) >> 16) < 100) - break; -- schedule_timeout(msecs_to_jiffies(25)); -+ schedule_msec_hrtimeout((25)); - } - finish_wait(&itv->vsync_waitq, &wait); - mutex_lock(&itv->serialize_lock); -diff --git a/drivers/media/pci/ivtv/ivtv-streams.c b/drivers/media/pci/ivtv/ivtv-streams.c -index f04ee84bab5f..c4469b4b8f99 100644 ---- a/drivers/media/pci/ivtv/ivtv-streams.c -+++ b/drivers/media/pci/ivtv/ivtv-streams.c -@@ -849,7 +849,7 @@ int ivtv_stop_v4l2_encode_stream(struct ivtv_stream *s, int gop_end) - while (!test_bit(IVTV_F_I_EOS, &itv->i_flags) && - time_before(jiffies, - then + msecs_to_jiffies(2000))) { -- 
schedule_timeout(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout((10)); - } - - /* To convert jiffies to ms, we must multiply by 1000 -diff --git a/drivers/media/radio/radio-mr800.c b/drivers/media/radio/radio-mr800.c -index cb0437b4c331..163fffc0e1d4 100644 ---- a/drivers/media/radio/radio-mr800.c -+++ b/drivers/media/radio/radio-mr800.c -@@ -366,7 +366,7 @@ static int vidioc_s_hw_freq_seek(struct file *file, void *priv, - retval = -ENODATA; - break; - } -- if (schedule_timeout_interruptible(msecs_to_jiffies(10))) { -+ if (schedule_msec_hrtimeout_interruptible((10))) { - retval = -ERESTARTSYS; - break; - } -diff --git a/drivers/media/radio/radio-tea5777.c b/drivers/media/radio/radio-tea5777.c -index fb9de7bbcd19..e53cf45e7f3f 100644 ---- a/drivers/media/radio/radio-tea5777.c -+++ b/drivers/media/radio/radio-tea5777.c -@@ -235,7 +235,7 @@ static int radio_tea5777_update_read_reg(struct radio_tea5777 *tea, int wait) - } - - if (wait) { -- if (schedule_timeout_interruptible(msecs_to_jiffies(wait))) -+ if (schedule_msec_hrtimeout_interruptible((wait))) - return -ERESTARTSYS; - } - -diff --git a/drivers/media/radio/tea575x.c b/drivers/media/radio/tea575x.c -index c37315226c42..e73e6393403c 100644 ---- a/drivers/media/radio/tea575x.c -+++ b/drivers/media/radio/tea575x.c -@@ -401,7 +401,7 @@ int snd_tea575x_s_hw_freq_seek(struct file *file, struct snd_tea575x *tea, - for (;;) { - if (time_after(jiffies, timeout)) - break; -- if (schedule_timeout_interruptible(msecs_to_jiffies(10))) { -+ if (schedule_msec_hrtimeout_interruptible((10))) { - /* some signal arrived, stop search */ - tea->val &= ~TEA575X_BIT_SEARCH; - snd_tea575x_set_freq(tea); -diff --git a/drivers/mfd/ucb1x00-core.c b/drivers/mfd/ucb1x00-core.c -index b690796d24d4..448b13da62b4 100644 ---- a/drivers/mfd/ucb1x00-core.c -+++ b/drivers/mfd/ucb1x00-core.c -@@ -250,7 +250,7 @@ unsigned int ucb1x00_adc_read(struct ucb1x00 *ucb, int adc_channel, int sync) - break; - /* yield to other processes */ - 
set_current_state(TASK_INTERRUPTIBLE); -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - } - - return UCB_ADC_DAT(val); -diff --git a/drivers/misc/sgi-xp/xpc_channel.c b/drivers/misc/sgi-xp/xpc_channel.c -index 8e6607fc8a67..b9ab770bbdb5 100644 ---- a/drivers/misc/sgi-xp/xpc_channel.c -+++ b/drivers/misc/sgi-xp/xpc_channel.c -@@ -834,7 +834,7 @@ xpc_allocate_msg_wait(struct xpc_channel *ch) - - atomic_inc(&ch->n_on_msg_allocate_wq); - prepare_to_wait(&ch->msg_allocate_wq, &wait, TASK_INTERRUPTIBLE); -- ret = schedule_timeout(1); -+ ret = schedule_min_hrtimeout(); - finish_wait(&ch->msg_allocate_wq, &wait); - atomic_dec(&ch->n_on_msg_allocate_wq); - -diff --git a/drivers/net/caif/caif_hsi.c b/drivers/net/caif/caif_hsi.c -index 4a33ec4fc089..da85f847ebb4 100644 ---- a/drivers/net/caif/caif_hsi.c -+++ b/drivers/net/caif/caif_hsi.c -@@ -939,7 +939,7 @@ static void cfhsi_wake_down(struct work_struct *work) - break; - - set_current_state(TASK_INTERRUPTIBLE); -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - retry--; - } - -diff --git a/drivers/net/can/usb/peak_usb/pcan_usb.c b/drivers/net/can/usb/peak_usb/pcan_usb.c -index 66d0198e7834..ce1c7bf9be87 100644 ---- a/drivers/net/can/usb/peak_usb/pcan_usb.c -+++ b/drivers/net/can/usb/peak_usb/pcan_usb.c -@@ -242,7 +242,7 @@ static int pcan_usb_write_mode(struct peak_usb_device *dev, u8 onoff) - } else { - /* the PCAN-USB needs time to init */ - set_current_state(TASK_INTERRUPTIBLE); -- schedule_timeout(msecs_to_jiffies(PCAN_USB_STARTUP_TIMEOUT)); -+ schedule_msec_hrtimeout((PCAN_USB_STARTUP_TIMEOUT)); - } - - return err; -diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c -index 65b315bc60ab..2b3f71086f5f 100644 ---- a/drivers/net/usb/lan78xx.c -+++ b/drivers/net/usb/lan78xx.c -@@ -2666,7 +2666,7 @@ static void lan78xx_terminate_urbs(struct lan78xx_net *dev) - while (!skb_queue_empty(&dev->rxq) && - !skb_queue_empty(&dev->txq) && - !skb_queue_empty(&dev->done)) { -- 
schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS)); -+ schedule_msec_hrtimeout((UNLINK_TIMEOUT_MS)); - set_current_state(TASK_UNINTERRUPTIBLE); - netif_dbg(dev, ifdown, dev->net, - "waited for %d urb completions\n", temp); -diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c -index 2b2a841cd938..1a4d27179db1 100644 ---- a/drivers/net/usb/usbnet.c -+++ b/drivers/net/usb/usbnet.c -@@ -767,7 +767,7 @@ static void wait_skb_queue_empty(struct sk_buff_head *q) - spin_lock_irqsave(&q->lock, flags); - while (!skb_queue_empty(q)) { - spin_unlock_irqrestore(&q->lock, flags); -- schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS)); -+ schedule_msec_hrtimeout((UNLINK_TIMEOUT_MS)); - set_current_state(TASK_UNINTERRUPTIBLE); - spin_lock_irqsave(&q->lock, flags); - } -diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2100.c b/drivers/net/wireless/intel/ipw2x00/ipw2100.c -index 461e955aa259..5ab8e7396ea4 100644 ---- a/drivers/net/wireless/intel/ipw2x00/ipw2100.c -+++ b/drivers/net/wireless/intel/ipw2x00/ipw2100.c -@@ -816,7 +816,7 @@ static int ipw2100_hw_send_command(struct ipw2100_priv *priv, - * doesn't seem to have as many firmware restart cycles... - * - * As a test, we're sticking in a 1/100s delay here */ -- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout_uninterruptible((10)); - - return 0; - -@@ -1267,7 +1267,7 @@ static int ipw2100_start_adapter(struct ipw2100_priv *priv) - IPW_DEBUG_FW("Waiting for f/w initialization to complete...\n"); - i = 5000; - do { -- schedule_timeout_uninterruptible(msecs_to_jiffies(40)); -+ schedule_msec_hrtimeout_uninterruptible((40)); - /* Todo... wait for sync command ... 
*/ - - read_register(priv->net_dev, IPW_REG_INTA, &inta); -diff --git a/drivers/parport/ieee1284.c b/drivers/parport/ieee1284.c -index 4547ac44c8d4..8fa1a7fdf12c 100644 ---- a/drivers/parport/ieee1284.c -+++ b/drivers/parport/ieee1284.c -@@ -202,7 +202,7 @@ int parport_wait_peripheral(struct parport *port, - /* parport_wait_event didn't time out, but the - * peripheral wasn't actually ready either. - * Wait for another 10ms. */ -- schedule_timeout_interruptible(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout_interruptible((10)); - } - } - -diff --git a/drivers/parport/ieee1284_ops.c b/drivers/parport/ieee1284_ops.c -index 2c11bd3fe1fd..8cb6b61c0880 100644 ---- a/drivers/parport/ieee1284_ops.c -+++ b/drivers/parport/ieee1284_ops.c -@@ -520,7 +520,7 @@ size_t parport_ieee1284_ecp_read_data (struct parport *port, - /* Yield the port for a while. */ - if (count && dev->port->irq != PARPORT_IRQ_NONE) { - parport_release (dev); -- schedule_timeout_interruptible(msecs_to_jiffies(40)); -+ schedule_msec_hrtimeout_interruptible((40)); - parport_claim_or_block (dev); - } - else -diff --git a/drivers/platform/x86/intel_ips.c b/drivers/platform/x86/intel_ips.c -index bffe548187ee..c2918ee3e100 100644 ---- a/drivers/platform/x86/intel_ips.c -+++ b/drivers/platform/x86/intel_ips.c -@@ -798,7 +798,7 @@ static int ips_adjust(void *data) - ips_gpu_lower(ips); - - sleep: -- schedule_timeout_interruptible(msecs_to_jiffies(IPS_ADJUST_PERIOD)); -+ schedule_msec_hrtimeout_interruptible((IPS_ADJUST_PERIOD)); - } while (!kthread_should_stop()); - - dev_dbg(ips->dev, "ips-adjust thread stopped\n"); -@@ -974,7 +974,7 @@ static int ips_monitor(void *data) - seqno_timestamp = get_jiffies_64(); - - old_cpu_power = thm_readl(THM_CEC); -- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); -+ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); - - /* Collect an initial average */ - for (i = 0; i < IPS_SAMPLE_COUNT; i++) { -@@ -1001,7 +1001,7 @@ static int 
ips_monitor(void *data) - mchp_samples[i] = mchp; - } - -- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); -+ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); - if (kthread_should_stop()) - break; - } -@@ -1028,7 +1028,7 @@ static int ips_monitor(void *data) - * us to reduce the sample frequency if the CPU and GPU are idle. - */ - old_cpu_power = thm_readl(THM_CEC); -- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); -+ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); - last_sample_period = IPS_SAMPLE_PERIOD; - - timer_setup(&ips->timer, monitor_timeout, TIMER_DEFERRABLE); -diff --git a/drivers/rtc/rtc-wm8350.c b/drivers/rtc/rtc-wm8350.c -index 2018614f258f..fc19b312c345 100644 ---- a/drivers/rtc/rtc-wm8350.c -+++ b/drivers/rtc/rtc-wm8350.c -@@ -114,7 +114,7 @@ static int wm8350_rtc_settime(struct device *dev, struct rtc_time *tm) - /* Wait until confirmation of stopping */ - do { - rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); -- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout_uninterruptible((1)); - } while (--retries && !(rtc_ctrl & WM8350_RTC_STS)); - - if (!retries) { -@@ -197,7 +197,7 @@ static int wm8350_rtc_stop_alarm(struct wm8350 *wm8350) - /* Wait until confirmation of stopping */ - do { - rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); -- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout_uninterruptible((1)); - } while (retries-- && !(rtc_ctrl & WM8350_RTC_ALMSTS)); - - if (!(rtc_ctrl & WM8350_RTC_ALMSTS)) -@@ -220,7 +220,7 @@ static int wm8350_rtc_start_alarm(struct wm8350 *wm8350) - /* Wait until confirmation */ - do { - rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); -- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout_uninterruptible((1)); - } while (retries-- && rtc_ctrl & WM8350_RTC_ALMSTS); - - if (rtc_ctrl & WM8350_RTC_ALMSTS) -diff --git 
a/drivers/scsi/fnic/fnic_scsi.c b/drivers/scsi/fnic/fnic_scsi.c -index 03b1805b106c..41ee54ff304a 100644 ---- a/drivers/scsi/fnic/fnic_scsi.c -+++ b/drivers/scsi/fnic/fnic_scsi.c -@@ -217,7 +217,7 @@ int fnic_fw_reset_handler(struct fnic *fnic) - - /* wait for io cmpl */ - while (atomic_read(&fnic->in_flight)) -- schedule_timeout(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout((1)); - - spin_lock_irqsave(&fnic->wq_copy_lock[0], flags); - -@@ -2278,7 +2278,7 @@ static int fnic_clean_pending_aborts(struct fnic *fnic, - } - } - -- schedule_timeout(msecs_to_jiffies(2 * fnic->config.ed_tov)); -+ schedule_msec_hrtimeout((2 * fnic->config.ed_tov)); - - /* walk again to check, if IOs are still pending in fw */ - if (fnic_is_abts_pending(fnic, lr_sc)) -diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c -index 983eeb0e3d07..007966930f94 100644 ---- a/drivers/scsi/lpfc/lpfc_scsi.c -+++ b/drivers/scsi/lpfc/lpfc_scsi.c -@@ -5194,7 +5194,7 @@ lpfc_reset_flush_io_context(struct lpfc_vport *vport, uint16_t tgt_id, - tgt_id, lun_id, context); - later = msecs_to_jiffies(2 * vport->cfg_devloss_tmo * 1000) + jiffies; - while (time_after(later, jiffies) && cnt) { -- schedule_timeout_uninterruptible(msecs_to_jiffies(20)); -+ schedule_msec_hrtimeout_uninterruptible((20)); - cnt = lpfc_sli_sum_iocb(vport, tgt_id, lun_id, context); - } - if (cnt) { -diff --git a/drivers/scsi/snic/snic_scsi.c b/drivers/scsi/snic/snic_scsi.c -index b3650c989ed4..7ed1fb285754 100644 ---- a/drivers/scsi/snic/snic_scsi.c -+++ b/drivers/scsi/snic/snic_scsi.c -@@ -2353,7 +2353,7 @@ snic_reset(struct Scsi_Host *shost, struct scsi_cmnd *sc) - - /* Wait for all the IOs that are entered in Qcmd */ - while (atomic_read(&snic->ios_inflight)) -- schedule_timeout(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout((1)); - - ret = snic_issue_hba_reset(snic, sc); - if (ret) { -diff --git a/drivers/staging/comedi/drivers/ni_mio_common.c b/drivers/staging/comedi/drivers/ni_mio_common.c -index 
9266e13f6271..df5c53216d78 100644 ---- a/drivers/staging/comedi/drivers/ni_mio_common.c -+++ b/drivers/staging/comedi/drivers/ni_mio_common.c -@@ -4748,7 +4748,7 @@ static int cs5529_wait_for_idle(struct comedi_device *dev) - if ((status & NI67XX_CAL_STATUS_BUSY) == 0) - break; - set_current_state(TASK_INTERRUPTIBLE); -- if (schedule_timeout(1)) -+ if (schedule_min_hrtimeout()) - return -EIO; - } - if (i == timeout) { -diff --git a/drivers/staging/rts5208/rtsx.c b/drivers/staging/rts5208/rtsx.c -index 898add4d1fc8..0aa9dd467349 100644 ---- a/drivers/staging/rts5208/rtsx.c -+++ b/drivers/staging/rts5208/rtsx.c -@@ -477,7 +477,7 @@ static int rtsx_polling_thread(void *__dev) - - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); -- schedule_timeout(msecs_to_jiffies(POLLING_INTERVAL)); -+ schedule_msec_hrtimeout((POLLING_INTERVAL)); - - /* lock the device pointers */ - mutex_lock(&dev->dev_mutex); -diff --git a/drivers/staging/unisys/visornic/visornic_main.c b/drivers/staging/unisys/visornic/visornic_main.c -index 0433536930a9..d8726f28843f 100644 ---- a/drivers/staging/unisys/visornic/visornic_main.c -+++ b/drivers/staging/unisys/visornic/visornic_main.c -@@ -549,7 +549,7 @@ static int visornic_disable_with_timeout(struct net_device *netdev, - } - set_current_state(TASK_INTERRUPTIBLE); - spin_unlock_irqrestore(&devdata->priv_lock, flags); -- wait += schedule_timeout(msecs_to_jiffies(10)); -+ wait += schedule_msec_hrtimeout((10)); - spin_lock_irqsave(&devdata->priv_lock, flags); - } - -@@ -560,7 +560,7 @@ static int visornic_disable_with_timeout(struct net_device *netdev, - while (1) { - set_current_state(TASK_INTERRUPTIBLE); - spin_unlock_irqrestore(&devdata->priv_lock, flags); -- schedule_timeout(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout((10)); - spin_lock_irqsave(&devdata->priv_lock, flags); - if (atomic_read(&devdata->usage)) - break; -@@ -714,7 +714,7 @@ static int visornic_enable_with_timeout(struct net_device *netdev, - } - 
set_current_state(TASK_INTERRUPTIBLE); - spin_unlock_irqrestore(&devdata->priv_lock, flags); -- wait += schedule_timeout(msecs_to_jiffies(10)); -+ wait += schedule_msec_hrtimeout((10)); - spin_lock_irqsave(&devdata->priv_lock, flags); - } - -diff --git a/drivers/video/fbdev/omap/hwa742.c b/drivers/video/fbdev/omap/hwa742.c -index cfe63932f825..71c00ef772a3 100644 ---- a/drivers/video/fbdev/omap/hwa742.c -+++ b/drivers/video/fbdev/omap/hwa742.c -@@ -913,7 +913,7 @@ static void hwa742_resume(void) - if (hwa742_read_reg(HWA742_PLL_DIV_REG) & (1 << 7)) - break; - set_current_state(TASK_UNINTERRUPTIBLE); -- schedule_timeout(msecs_to_jiffies(5)); -+ schedule_msec_hrtimeout((5)); - } - hwa742_set_update_mode(hwa742.update_mode_before_suspend); - } -diff --git a/drivers/video/fbdev/pxafb.c b/drivers/video/fbdev/pxafb.c -index f1551e00eb12..f0f651e92504 100644 ---- a/drivers/video/fbdev/pxafb.c -+++ b/drivers/video/fbdev/pxafb.c -@@ -1287,7 +1287,7 @@ static int pxafb_smart_thread(void *arg) - mutex_unlock(&fbi->ctrlr_lock); - - set_current_state(TASK_INTERRUPTIBLE); -- schedule_timeout(msecs_to_jiffies(30)); -+ schedule_msec_hrtimeout((30)); - } - - pr_debug("%s(): task ending\n", __func__); -diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c -index 76d2e43817ea..6ba0604e2162 100644 ---- a/fs/btrfs/inode-map.c -+++ b/fs/btrfs/inode-map.c -@@ -91,7 +91,7 @@ static int caching_kthread(void *data) - btrfs_release_path(path); - root->ino_cache_progress = last; - up_read(&fs_info->commit_root_sem); -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - goto again; - } else - continue; -diff --git a/fs/proc/base.c b/fs/proc/base.c -index 617db4e0faa0..f85926764f9a 100644 ---- a/fs/proc/base.c -+++ b/fs/proc/base.c -@@ -479,7 +479,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, - seq_puts(m, "0 0 0\n"); - else - seq_printf(m, "%llu %llu %lu\n", -- (unsigned long long)task->se.sum_exec_runtime, -+ (unsigned long long)tsk_seruntime(task), - 
(unsigned long long)task->sched_info.run_delay, - task->sched_info.pcount); - -diff --git a/include/linux/freezer.h b/include/linux/freezer.h -index 27828145ca09..504cc97bf475 100644 ---- a/include/linux/freezer.h -+++ b/include/linux/freezer.h -@@ -311,6 +311,7 @@ static inline void set_freezable(void) {} - #define wait_event_freezekillable_unsafe(wq, condition) \ - wait_event_killable(wq, condition) - -+#define pm_freezing (false) - #endif /* !CONFIG_FREEZER */ - - #endif /* FREEZER_H_INCLUDED */ -diff --git a/include/linux/init_task.h b/include/linux/init_task.h -index 2c620d7ac432..73417df5daa2 100644 ---- a/include/linux/init_task.h -+++ b/include/linux/init_task.h -@@ -36,7 +36,11 @@ extern struct cred init_cred; - #define INIT_PREV_CPUTIME(x) - #endif - -+#ifdef CONFIG_SCHED_MUQSS -+#define INIT_TASK_COMM "MuQSS" -+#else - #define INIT_TASK_COMM "swapper" -+#endif - - /* Attach to the init_task data structure for proper alignment */ - #ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK -diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h -index e9bfe6972aed..16ba1c7e5bde 100644 ---- a/include/linux/ioprio.h -+++ b/include/linux/ioprio.h -@@ -53,6 +53,8 @@ enum { - */ - static inline int task_nice_ioprio(struct task_struct *task) - { -+ if (iso_task(task)) -+ return 0; - return (task_nice(task) + 20) / 5; - } - -diff --git a/include/linux/sched.h b/include/linux/sched.h -index afe01e232935..139e4535fcc6 100644 ---- a/include/linux/sched.h -+++ b/include/linux/sched.h -@@ -35,6 +35,10 @@ - #include - #include - -+#ifdef CONFIG_SCHED_MUQSS -+#include -+#endif -+ - /* task_struct member predeclarations (sorted alphabetically): */ - struct audit_context; - struct backing_dev_info; -@@ -213,13 +217,40 @@ struct task_group; - - extern void scheduler_tick(void); - --#define MAX_SCHEDULE_TIMEOUT LONG_MAX -- -+#define MAX_SCHEDULE_TIMEOUT LONG_MAX - extern long schedule_timeout(long timeout); - extern long schedule_timeout_interruptible(long timeout); - extern long 
schedule_timeout_killable(long timeout); - extern long schedule_timeout_uninterruptible(long timeout); - extern long schedule_timeout_idle(long timeout); -+ -+#ifdef CONFIG_HIGH_RES_TIMERS -+extern long schedule_msec_hrtimeout(long timeout); -+extern long schedule_min_hrtimeout(void); -+extern long schedule_msec_hrtimeout_interruptible(long timeout); -+extern long schedule_msec_hrtimeout_uninterruptible(long timeout); -+#else -+static inline long schedule_msec_hrtimeout(long timeout) -+{ -+ return schedule_timeout(msecs_to_jiffies(timeout)); -+} -+ -+static inline long schedule_min_hrtimeout(void) -+{ -+ return schedule_timeout(1); -+} -+ -+static inline long schedule_msec_hrtimeout_interruptible(long timeout) -+{ -+ return schedule_timeout_interruptible(msecs_to_jiffies(timeout)); -+} -+ -+static inline long schedule_msec_hrtimeout_uninterruptible(long timeout) -+{ -+ return schedule_timeout_uninterruptible(msecs_to_jiffies(timeout)); -+} -+#endif -+ - asmlinkage void schedule(void); - extern void schedule_preempt_disabled(void); - asmlinkage void preempt_schedule_irq(void); -@@ -651,8 +682,10 @@ struct task_struct { - unsigned int flags; - unsigned int ptrace; - --#ifdef CONFIG_SMP -+#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_MUQSS) - int on_cpu; -+#endif -+#ifdef CONFIG_SMP - struct __call_single_node wake_entry; - #ifdef CONFIG_THREAD_INFO_IN_TASK - /* Current CPU: */ -@@ -678,10 +711,25 @@ struct task_struct { - int static_prio; - int normal_prio; - unsigned int rt_priority; -+#ifdef CONFIG_SCHED_MUQSS -+ int time_slice; -+ u64 deadline; -+ skiplist_node node; /* Skip list node */ -+ u64 last_ran; -+ u64 sched_time; /* sched_clock time spent running */ -+#ifdef CONFIG_SMT_NICE -+ int smt_bias; /* Policy/nice level bias across smt siblings */ -+#endif -+#ifdef CONFIG_HOTPLUG_CPU -+ bool zerobound; /* Bound to CPU0 for hotplug */ -+#endif -+ unsigned long rt_timeout; -+#else /* CONFIG_SCHED_MUQSS */ - - const struct sched_class *sched_class; - struct 
sched_entity se; - struct sched_rt_entity rt; -+#endif - #ifdef CONFIG_CGROUP_SCHED - struct task_group *sched_task_group; - #endif -@@ -863,6 +911,10 @@ struct task_struct { - #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME - u64 utimescaled; - u64 stimescaled; -+#endif -+#ifdef CONFIG_SCHED_MUQSS -+ /* Unbanked cpu time */ -+ unsigned long utime_ns, stime_ns; - #endif - u64 gtime; - struct prev_cputime prev_cputime; -@@ -1332,6 +1384,40 @@ struct task_struct { - */ - }; - -+#ifdef CONFIG_SCHED_MUQSS -+#define tsk_seruntime(t) ((t)->sched_time) -+#define tsk_rttimeout(t) ((t)->rt_timeout) -+ -+static inline void tsk_cpus_current(struct task_struct *p) -+{ -+} -+ -+void print_scheduler_version(void); -+ -+static inline bool iso_task(struct task_struct *p) -+{ -+ return (p->policy == SCHED_ISO); -+} -+#else /* CFS */ -+#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) -+#define tsk_rttimeout(t) ((t)->rt.timeout) -+ -+static inline void tsk_cpus_current(struct task_struct *p) -+{ -+ p->nr_cpus_allowed = current->nr_cpus_allowed; -+} -+ -+static inline void print_scheduler_version(void) -+{ -+ printk(KERN_INFO "CFS CPU scheduler.\n"); -+} -+ -+static inline bool iso_task(struct task_struct *p) -+{ -+ return false; -+} -+#endif /* CONFIG_SCHED_MUQSS */ -+ - static inline struct pid *task_pid(struct task_struct *task) - { - return task->thread_pid; -diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h -index 1aff00b65f3c..73d6319a856a 100644 ---- a/include/linux/sched/deadline.h -+++ b/include/linux/sched/deadline.h -@@ -28,7 +28,16 @@ static inline bool dl_time_before(u64 a, u64 b) - #ifdef CONFIG_SMP - - struct root_domain; -+#ifdef CONFIG_SCHED_MUQSS -+static inline void dl_clear_root_domain(struct root_domain *rd) -+{ -+} -+static inline void dl_add_task_root_domain(struct task_struct *p) -+{ -+} -+#else /* CONFIG_SCHED_MUQSS */ - extern void dl_add_task_root_domain(struct task_struct *p); - extern void dl_clear_root_domain(struct root_domain *rd); 
-+#endif /* CONFIG_SCHED_MUQSS */ - - #endif /* CONFIG_SMP */ -diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h -index 6d67e9a5af6b..101fe470aa8f 100644 ---- a/include/linux/sched/nohz.h -+++ b/include/linux/sched/nohz.h -@@ -13,7 +13,7 @@ extern int get_nohz_timer_target(void); - static inline void nohz_balance_enter_idle(int cpu) { } - #endif - --#ifdef CONFIG_NO_HZ_COMMON -+#if defined(CONFIG_NO_HZ_COMMON) && !defined(CONFIG_SCHED_MUQSS) - void calc_load_nohz_start(void); - void calc_load_nohz_remote(struct rq *rq); - void calc_load_nohz_stop(void); -diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h -index 7d64feafc408..43c9d9e50c09 100644 ---- a/include/linux/sched/prio.h -+++ b/include/linux/sched/prio.h -@@ -20,8 +20,20 @@ - */ - - #define MAX_USER_RT_PRIO 100 -+ -+#ifdef CONFIG_SCHED_MUQSS -+/* Note different MAX_RT_PRIO */ -+#define MAX_RT_PRIO (MAX_USER_RT_PRIO + 1) -+ -+#define ISO_PRIO (MAX_RT_PRIO) -+#define NORMAL_PRIO (MAX_RT_PRIO + 1) -+#define IDLE_PRIO (MAX_RT_PRIO + 2) -+#define PRIO_LIMIT ((IDLE_PRIO) + 1) -+#else /* CONFIG_SCHED_MUQSS */ - #define MAX_RT_PRIO MAX_USER_RT_PRIO - -+#endif /* CONFIG_SCHED_MUQSS */ -+ - #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) - #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) - -diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h -index e5af028c08b4..010b2244e0b6 100644 ---- a/include/linux/sched/rt.h -+++ b/include/linux/sched/rt.h -@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) - - if (policy == SCHED_FIFO || policy == SCHED_RR) - return true; -+#ifndef CONFIG_SCHED_MUQSS - if (policy == SCHED_DEADLINE) - return true; -+#endif - return false; - } - -diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h -index a98965007eef..743f67fd012e 100644 ---- a/include/linux/sched/task.h -+++ b/include/linux/sched/task.h -@@ -93,7 +93,7 @@ int kernel_wait(pid_t pid, int *stat); - extern void free_task(struct task_struct 
*tsk); - - /* sched_exec is called by processes performing an exec */ --#ifdef CONFIG_SMP -+#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_MUQSS) - extern void sched_exec(void); - #else - #define sched_exec() {} -diff --git a/include/linux/skip_list.h b/include/linux/skip_list.h -new file mode 100644 -index 000000000000..d4be84ba273b ---- /dev/null -+++ b/include/linux/skip_list.h -@@ -0,0 +1,33 @@ -+#ifndef _LINUX_SKIP_LISTS_H -+#define _LINUX_SKIP_LISTS_H -+typedef u64 keyType; -+typedef void *valueType; -+ -+typedef struct nodeStructure skiplist_node; -+ -+struct nodeStructure { -+ int level; /* Levels in this structure */ -+ keyType key; -+ valueType value; -+ skiplist_node *next[8]; -+ skiplist_node *prev[8]; -+}; -+ -+typedef struct listStructure { -+ int entries; -+ int level; /* Maximum level of the list -+ (1 more than the number of levels in the list) */ -+ skiplist_node *header; /* pointer to header */ -+} skiplist; -+ -+void skiplist_init(skiplist_node *slnode); -+skiplist *new_skiplist(skiplist_node *slnode); -+void free_skiplist(skiplist *l); -+void skiplist_node_init(skiplist_node *node); -+void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed); -+void skiplist_delete(skiplist *l, skiplist_node *node); -+ -+static inline bool skiplist_node_empty(skiplist_node *node) { -+ return (!node->next[0]); -+} -+#endif /* _LINUX_SKIP_LISTS_H */ -diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h -index 3bac0a8ceab2..f48c5c5da651 100644 ---- a/include/uapi/linux/sched.h -+++ b/include/uapi/linux/sched.h -@@ -115,9 +115,16 @@ struct clone_args { - #define SCHED_FIFO 1 - #define SCHED_RR 2 - #define SCHED_BATCH 3 --/* SCHED_ISO: reserved but not implemented yet */ -+/* SCHED_ISO: Implemented on MuQSS only */ - #define SCHED_IDLE 5 -+#ifdef CONFIG_SCHED_MUQSS -+#define SCHED_ISO 4 -+#define SCHED_IDLEPRIO SCHED_IDLE -+#define SCHED_MAX (SCHED_IDLEPRIO) -+#define SCHED_RANGE(policy) ((policy) 
<= SCHED_MAX) -+#else /* CONFIG_SCHED_MUQSS */ - #define SCHED_DEADLINE 6 -+#endif /* CONFIG_SCHED_MUQSS */ - - /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ - #define SCHED_RESET_ON_FORK 0x40000000 -diff --git a/init/Kconfig b/init/Kconfig -index d6a0b31b13dc..7e0eb99bd607 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -92,6 +92,18 @@ config THREAD_INFO_IN_TASK - - menu "General setup" - -+config SCHED_MUQSS -+ bool "MuQSS cpu scheduler" -+ select HIGH_RES_TIMERS -+ help -+ The Multiple Queue Skiplist Scheduler for excellent interactivity and -+ responsiveness on the desktop and highly scalable deterministic -+ low latency on any hardware. -+ -+ Say Y here. -+ default y -+ -+ - config BROKEN - bool - -@@ -510,6 +522,7 @@ config SCHED_THERMAL_PRESSURE - default y if ARM64 - depends on SMP - depends on CPU_FREQ_THERMAL -+ depends on !SCHED_MUQSS - help - Select this option to enable thermal pressure accounting in the - scheduler. Thermal pressure is the value conveyed to the scheduler -@@ -858,6 +871,7 @@ config NUMA_BALANCING - depends on ARCH_SUPPORTS_NUMA_BALANCING - depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY - depends on SMP && NUMA && MIGRATION -+ depends on !SCHED_MUQSS - help - This option adds support for automatic NUMA aware memory/task placement. - The mechanism is quite primitive and is based on migrating memory when -@@ -942,9 +956,13 @@ menuconfig CGROUP_SCHED - help - This feature lets CPU scheduler recognize task groups and control CPU - bandwidth allocation to such task groups. It uses cgroups to group -- tasks. -+ tasks. In combination with MuQSS this is purely a STUB to create the -+ files associated with the CPU controller cgroup but most of the -+ controls do nothing. This is useful for working in environments and -+ with applications that will only work if this control group is -+ present. 
- --if CGROUP_SCHED -+if CGROUP_SCHED && !SCHED_MUQSS - config FAIR_GROUP_SCHED - bool "Group scheduling for SCHED_OTHER" - depends on CGROUP_SCHED -@@ -1073,6 +1091,7 @@ config CGROUP_DEVICE - - config CGROUP_CPUACCT - bool "Simple CPU accounting controller" -+ depends on !SCHED_MUQSS - help - Provides a simple controller for monitoring the - total CPU consumed by the tasks in a cgroup. -@@ -1200,6 +1219,7 @@ config CHECKPOINT_RESTORE - - config SCHED_AUTOGROUP - bool "Automatic process group scheduling" -+ depends on !SCHED_MUQSS - select CGROUPS - select CGROUP_SCHED - select FAIR_GROUP_SCHED -diff --git a/init/init_task.c b/init/init_task.c -index f6889fce64af..2557beb609c0 100644 ---- a/init/init_task.c -+++ b/init/init_task.c -@@ -75,9 +75,17 @@ struct task_struct init_task - .stack = init_stack, - .usage = REFCOUNT_INIT(2), - .flags = PF_KTHREAD, -+#ifdef CONFIG_SCHED_MUQSS -+ .prio = NORMAL_PRIO, -+ .static_prio = MAX_PRIO - 20, -+ .normal_prio = NORMAL_PRIO, -+ .deadline = 0, -+ .time_slice = 1000000, -+#else - .prio = MAX_PRIO - 20, - .static_prio = MAX_PRIO - 20, - .normal_prio = MAX_PRIO - 20, -+#endif - .policy = SCHED_NORMAL, - .cpus_ptr = &init_task.cpus_mask, - .cpus_mask = CPU_MASK_ALL, -@@ -87,6 +95,7 @@ struct task_struct init_task - .restart_block = { - .fn = do_no_restart_syscall, - }, -+#ifndef CONFIG_SCHED_MUQSS - .se = { - .group_node = LIST_HEAD_INIT(init_task.se.group_node), - }, -@@ -94,6 +103,7 @@ struct task_struct init_task - .run_list = LIST_HEAD_INIT(init_task.rt.run_list), - .time_slice = RR_TIMESLICE, - }, -+#endif - .tasks = LIST_HEAD_INIT(init_task.tasks), - #ifdef CONFIG_SMP - .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), -diff --git a/init/main.c b/init/main.c -index e880b4ecb314..fe0a705e83f2 100644 ---- a/init/main.c -+++ b/init/main.c -@@ -1421,6 +1421,8 @@ static int __ref kernel_init(void *unused) - - do_sysctl_args(); - -+ print_scheduler_version(); -+ - if (ramdisk_execute_command) { - ret = 
run_init_process(ramdisk_execute_command); - if (!ret) -diff --git a/kernel/Kconfig.MuQSS b/kernel/Kconfig.MuQSS -new file mode 100644 -index 000000000000..a6a58781ef91 ---- /dev/null -+++ b/kernel/Kconfig.MuQSS -@@ -0,0 +1,105 @@ -+choice -+ prompt "CPU scheduler runqueue sharing" -+ default RQ_MC if SCHED_MUQSS -+ default RQ_NONE -+ -+config RQ_NONE -+ bool "No sharing" -+ help -+ This is the default behaviour where the CPU scheduler has one runqueue -+ per CPU, whether it is a physical or logical CPU (hyperthread). -+ -+ This can still be enabled runtime with the boot parameter -+ rqshare=none -+ -+ If unsure, say N. -+ -+config RQ_SMT -+ bool "SMT (hyperthread) siblings" -+ depends on SCHED_SMT && SCHED_MUQSS -+ -+ help -+ With this option enabled, the CPU scheduler will have one runqueue -+ shared by SMT (hyperthread) siblings. As these logical cores share -+ one physical core, sharing the runqueue resource can lead to decreased -+ overhead, lower latency and higher throughput. -+ -+ This can still be enabled runtime with the boot parameter -+ rqshare=smt -+ -+ If unsure, say N. -+ -+config RQ_MC -+ bool "Multicore siblings" -+ depends on SCHED_MC && SCHED_MUQSS -+ help -+ With this option enabled, the CPU scheduler will have one runqueue -+ shared by multicore siblings in addition to any SMT siblings. -+ As these physical cores share caches, sharing the runqueue resource -+ will lead to lower latency, but its effects on overhead and throughput -+ are less predictable. As a general rule, 6 or fewer cores will likely -+ benefit from this, while larger CPUs will only derive a latency -+ benefit. If your workloads are primarily single threaded, this will -+ possibly worsen throughput. If you are only concerned about latency -+ then enable this regardless of how many cores you have. -+ -+ This can still be enabled runtime with the boot parameter -+ rqshare=mc -+ -+ If unsure, say Y. 
-+ -+config RQ_MC_LLC -+ bool "Multicore siblings (LLC)" -+ depends on SCHED_MC && SCHED_MUQSS -+ help -+ With this option enabled, the CPU scheduler will behave similarly as -+ with "Multicore siblings". -+ This option takes LLC cache into account when scheduling tasks. -+ Option may benefit CPUs with multiple LLC caches, such as Ryzen -+ and Xeon CPUs. -+ -+ This can still be enabled runtime with the boot parameter -+ rqshare=llc -+ -+ If unsure, say N. -+ -+config RQ_SMP -+ bool "Symmetric Multi-Processing" -+ depends on SMP && SCHED_MUQSS -+ help -+ With this option enabled, the CPU scheduler will have one runqueue -+ shared by all physical CPUs unless they are on separate NUMA nodes. -+ As physical CPUs usually do not share resources, sharing the runqueue -+ will normally worsen throughput but improve latency. If you only -+ care about latency enable this. -+ -+ This can still be enabled runtime with the boot parameter -+ rqshare=smp -+ -+ If unsure, say N. -+ -+config RQ_ALL -+ bool "NUMA" -+ depends on SMP && SCHED_MUQSS -+ help -+ With this option enabled, the CPU scheduler will have one runqueue -+ regardless of the architecture configuration, including across NUMA -+ nodes. This can substantially decrease throughput in NUMA -+ configurations, but light NUMA designs will not be dramatically -+ affected. This option should only be chosen if latency is the prime -+ concern. -+ -+ This can still be enabled runtime with the boot parameter -+ rqshare=all -+ -+ If unsure, say N. 
-+endchoice -+ -+config SHARERQ -+ int -+ default 0 if RQ_NONE -+ default 1 if RQ_SMT -+ default 2 if RQ_MC -+ default 3 if RQ_MC_LLC -+ default 4 if RQ_SMP -+ default 5 if RQ_ALL -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 38ef6d06888e..89ed751ac4e4 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -5,7 +5,8 @@ - - choice - prompt "Timer frequency" -- default HZ_250 -+ default HZ_100 if SCHED_MUQSS -+ default HZ_250_NODEF if !SCHED_MUQSS - help - Allows the configuration of the timer frequency. It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -20,11 +21,18 @@ choice - config HZ_100 - bool "100 HZ" - help -+ 100 Hz is a suitable choice in combination with MuQSS which does -+ not rely on ticks for rescheduling interrupts, and is not Hz limited -+ for timeouts and sleeps from both the kernel and userspace. -+ This allows us to benefit from the lower overhead and higher -+ throughput of fewer timer ticks. -+ -+ Non-MuQSS kernels: - 100 Hz is a typical choice for servers, SMP and NUMA systems - with lots of processors that may show reduced performance if - too many timer interrupts are occurring. - -- config HZ_250 -+ config HZ_250_NODEF - bool "250 HZ" - help - 250 Hz is a good compromise choice allowing server performance -@@ -32,7 +40,10 @@ choice - on SMP and NUMA systems. If you are going to be using NTSC video - or multimedia, selected 300Hz instead. - -- config HZ_300 -+ 250 Hz is the default choice for the mainline scheduler but not -+ advantageous in combination with MuQSS. -+ -+ config HZ_300_NODEF - bool "300 HZ" - help - 300 Hz is a good compromise choice allowing server performance -@@ -40,7 +51,7 @@ choice - on SMP and NUMA systems and exactly dividing by both PAL and - NTSC frame rates for video and multimedia work. 
- -- config HZ_1000 -+ config HZ_1000_NODEF - bool "1000 HZ" - help - 1000 Hz is the preferred choice for desktop systems and other -@@ -51,9 +62,9 @@ endchoice - config HZ - int - default 100 if HZ_100 -- default 250 if HZ_250 -- default 300 if HZ_300 -- default 1000 if HZ_1000 -+ default 250 if HZ_250_NODEF -+ default 300 if HZ_300_NODEF -+ default 1000 if HZ_1000_NODEF - - config SCHED_HRTICK - def_bool HIGH_RES_TIMERS -diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt -index bf82259cff96..d9438eb6f91c 100644 ---- a/kernel/Kconfig.preempt -+++ b/kernel/Kconfig.preempt -@@ -2,7 +2,7 @@ - - choice - prompt "Preemption Model" -- default PREEMPT_NONE -+ default PREEMPT - - config PREEMPT_NONE - bool "No Forced Preemption (Server)" -@@ -18,7 +18,7 @@ config PREEMPT_NONE - latencies. - - config PREEMPT_VOLUNTARY -- bool "Voluntary Kernel Preemption (Desktop)" -+ bool "Voluntary Kernel Preemption (Nothing)" - depends on !ARCH_NO_PREEMPT - help - This option reduces the latency of the kernel by adding more -@@ -33,7 +33,8 @@ config PREEMPT_VOLUNTARY - applications to run more 'smoothly' even when the system is - under load. - -- Select this if you are building a kernel for a desktop system. -+ Select this for no system in particular (choose Preemptible -+ instead on a desktop if you know what's good for you). 
- - config PREEMPT - bool "Preemptible Kernel (Low-Latency Desktop)" -diff --git a/kernel/Makefile b/kernel/Makefile -index 9a20016d4900..a2640d78eadb 100644 ---- a/kernel/Makefile -+++ b/kernel/Makefile -@@ -10,7 +10,8 @@ obj-y = fork.o exec_domain.o panic.o \ - extable.o params.o \ - kthread.o sys_ni.o nsproxy.o \ - notifier.o ksysfs.o cred.o reboot.o \ -- async.o range.o smpboot.o ucount.o regset.o -+ async.o range.o smpboot.o ucount.o regset.o \ -+ skip_list.o - - obj-$(CONFIG_BPFILTER) += usermode_driver.o - obj-$(CONFIG_MODULES) += kmod.o -diff --git a/kernel/delayacct.c b/kernel/delayacct.c -index 27725754ac99..769d773c7182 100644 ---- a/kernel/delayacct.c -+++ b/kernel/delayacct.c -@@ -106,7 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) - */ - t1 = tsk->sched_info.pcount; - t2 = tsk->sched_info.run_delay; -- t3 = tsk->se.sum_exec_runtime; -+ t3 = tsk_seruntime(tsk); - - d->cpu_count += t1; - -diff --git a/kernel/exit.c b/kernel/exit.c -index 733e80f334e7..3f3506c851fd 100644 ---- a/kernel/exit.c -+++ b/kernel/exit.c -@@ -121,7 +121,7 @@ static void __exit_signal(struct task_struct *tsk) - sig->curr_target = next_thread(tsk); - } - -- add_device_randomness((const void*) &tsk->se.sum_exec_runtime, -+ add_device_randomness((const void*) &tsk_seruntime(tsk), - sizeof(unsigned long long)); - - /* -@@ -142,7 +142,7 @@ static void __exit_signal(struct task_struct *tsk) - sig->inblock += task_io_get_inblock(tsk); - sig->oublock += task_io_get_oublock(tsk); - task_io_accounting_add(&sig->ioac, &tsk->ioac); -- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; -+ sig->sum_sched_runtime += tsk_seruntime(tsk); - sig->nr_threads--; - __unhash_process(tsk, group_dead); - write_sequnlock(&sig->stats_lock); -diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig -index 10a5aff4eecc..ce3bcc66b48d 100644 ---- a/kernel/irq/Kconfig -+++ b/kernel/irq/Kconfig -@@ -112,6 +112,23 @@ config GENERIC_IRQ_RESERVATION_MODE - config IRQ_FORCED_THREADING 
- bool - -+config FORCE_IRQ_THREADING -+ bool "Make IRQ threading compulsory" -+ depends on IRQ_FORCED_THREADING -+ default n -+ help -+ -+ Make IRQ threading mandatory for any IRQ handlers that support it -+ instead of being optional and requiring the threadirqs kernel -+ parameter. Instead they can be optionally disabled with the -+ nothreadirqs kernel parameter. -+ -+ Enabling this may make some architectures not boot with runqueue -+ sharing and MuQSS. -+ -+ Enable if you are building for a desktop or low latency system, -+ otherwise say N. -+ - config SPARSE_IRQ - bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ - help -diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c -index 5df903fccb60..17a0dd194582 100644 ---- a/kernel/irq/manage.c -+++ b/kernel/irq/manage.c -@@ -25,9 +25,20 @@ - #include "internals.h" - - #if defined(CONFIG_IRQ_FORCED_THREADING) && !defined(CONFIG_PREEMPT_RT) -+#ifdef CONFIG_FORCE_IRQ_THREADING -+__read_mostly bool force_irqthreads = true; -+#else - __read_mostly bool force_irqthreads; -+#endif - EXPORT_SYMBOL_GPL(force_irqthreads); - -+static int __init setup_noforced_irqthreads(char *arg) -+{ -+ force_irqthreads = false; -+ return 0; -+} -+early_param("nothreadirqs", setup_noforced_irqthreads); -+ - static int __init setup_forced_irqthreads(char *arg) - { - force_irqthreads = true; -diff --git a/kernel/kthread.c b/kernel/kthread.c -index 3edaa380dc7b..a1712699726b 100644 ---- a/kernel/kthread.c -+++ b/kernel/kthread.c -@@ -471,6 +471,34 @@ void kthread_bind(struct task_struct *p, unsigned int cpu) - } - EXPORT_SYMBOL(kthread_bind); - -+#if defined(CONFIG_SCHED_MUQSS) && defined(CONFIG_SMP) -+extern void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); -+ -+/* -+ * new_kthread_bind is a special variant of __kthread_bind_mask. 
-+ * For new threads to work on muqss we want to call do_set_cpus_allowed -+ * without the task_cpu being set and the task rescheduled until they're -+ * rescheduled on their own so we call __do_set_cpus_allowed directly which -+ * only changes the cpumask. This is particularly important for smpboot threads -+ * to work. -+ */ -+static void new_kthread_bind(struct task_struct *p, unsigned int cpu) -+{ -+ unsigned long flags; -+ -+ if (WARN_ON(!wait_task_inactive(p, TASK_UNINTERRUPTIBLE))) -+ return; -+ -+ /* It's safe because the task is inactive. */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ __do_set_cpus_allowed(p, cpumask_of(cpu)); -+ p->flags |= PF_NO_SETAFFINITY; -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+} -+#else -+#define new_kthread_bind(p, cpu) kthread_bind(p, cpu) -+#endif -+ - /** - * kthread_create_on_cpu - Create a cpu bound kthread - * @threadfn: the function to run until signal_pending(current). -@@ -491,7 +519,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), - cpu); - if (IS_ERR(p)) - return p; -- kthread_bind(p, cpu); -+ new_kthread_bind(p, cpu); - /* CPU hotplug need to bind once again when unparking the thread. */ - set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags); - to_kthread(p)->cpu = cpu; -diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c -index f6310f848f34..825f9b8e228f 100644 ---- a/kernel/livepatch/transition.c -+++ b/kernel/livepatch/transition.c -@@ -282,7 +282,7 @@ static bool klp_try_switch_task(struct task_struct *task) - { - static char err_buf[STACK_ERR_BUF_SIZE]; - struct rq *rq; -- struct rq_flags flags; -+ struct rq_flags rf; - int ret; - bool success = false; - -@@ -304,7 +304,7 @@ static bool klp_try_switch_task(struct task_struct *task) - * functions. If all goes well, switch the task to the target patch - * state. 
- */ -- rq = task_rq_lock(task, &flags); -+ rq = task_rq_lock(task, &rf); - - if (task_running(rq, task) && task != current) { - snprintf(err_buf, STACK_ERR_BUF_SIZE, -@@ -323,7 +323,7 @@ static bool klp_try_switch_task(struct task_struct *task) - task->patch_state = klp_target_state; - - done: -- task_rq_unlock(rq, task, &flags); -+ task_rq_unlock(rq, task, &rf); - - /* - * Due to console deadlock issues, pr_debug() can't be used while -diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile -index 5fc9c9b70862..1ff14a21193d 100644 ---- a/kernel/sched/Makefile -+++ b/kernel/sched/Makefile -@@ -22,15 +22,23 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) - CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer - endif - -+ifdef CONFIG_SCHED_MUQSS -+obj-y += MuQSS.o clock.o cputime.o -+obj-y += idle.o -+obj-y += wait.o wait_bit.o swait.o completion.o -+ -+obj-$(CONFIG_SMP) += topology.o -+else - obj-y += core.o loadavg.o clock.o cputime.o - obj-y += idle.o fair.o rt.o deadline.o - obj-y += wait.o wait_bit.o swait.o completion.o - - obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o - obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o --obj-$(CONFIG_SCHEDSTATS) += stats.o - obj-$(CONFIG_SCHED_DEBUG) += debug.o - obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o -+endif -+obj-$(CONFIG_SCHEDSTATS) += stats.o - obj-$(CONFIG_CPU_FREQ) += cpufreq.o - obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o - obj-$(CONFIG_MEMBARRIER) += membarrier.o -diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c -new file mode 100644 -index 000000000000..8da537d5226c ---- /dev/null -+++ b/kernel/sched/MuQSS.c -@@ -0,0 +1,7855 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * kernel/sched/MuQSS.c, was kernel/sched.c -+ * -+ * Kernel scheduler and related syscalls -+ * -+ * Copyright (C) 1991-2002 Linus Torvalds -+ * -+ * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and -+ * make semaphores SMP safe -+ * 1998-11-19 Implemented schedule_timeout() and 
related stuff -+ * by Andrea Arcangeli -+ * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: -+ * hybrid priority-list and round-robin design with -+ * an array-switch method of distributing timeslices -+ * and per-CPU runqueues. Cleanups and useful suggestions -+ * by Davide Libenzi, preemptible kernel bits by Robert Love. -+ * 2003-09-03 Interactivity tuning by Con Kolivas. -+ * 2004-04-02 Scheduler domains code by Nick Piggin -+ * 2007-04-15 Work begun on replacing all interactivity tuning with a -+ * fair scheduling design by Con Kolivas. -+ * 2007-05-05 Load balancing (smp-nice) and other improvements -+ * by Peter Williams -+ * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith -+ * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri -+ * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, -+ * Thomas Gleixner, Mike Kravetz -+ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes -+ * a whole lot of those previous things. -+ * 2016-10-01 Multiple Queue Skiplist Scheduler scalable evolution of BFS -+ * scheduler by Con Kolivas. 
-+ * 2019-08-31 LLC bits by Eduards Bezverhijs -+ */ -+#define CREATE_TRACE_POINTS -+#include -+#undef CREATE_TRACE_POINTS -+ -+#include -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+#include -+ -+#include "../workqueue_internal.h" -+#include "../../fs/io-wq.h" -+#include "../smpboot.h" -+ -+#include "MuQSS.h" -+#include "smp.h" -+ -+#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) -+#define rt_task(p) rt_prio((p)->prio) -+#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) -+#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \ -+ (policy) == SCHED_RR) -+#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) -+ -+#define is_idle_policy(policy) ((policy) == SCHED_IDLEPRIO) -+#define idleprio_task(p) unlikely(is_idle_policy((p)->policy)) -+#define task_running_idle(p) unlikely((p)->prio == IDLE_PRIO) -+ -+#define is_iso_policy(policy) ((policy) == SCHED_ISO) -+#define iso_task(p) unlikely(is_iso_policy((p)->policy)) -+#define task_running_iso(p) unlikely((p)->prio == ISO_PRIO) -+ -+#define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT) -+ -+#define ISO_PERIOD (5 * HZ) -+ -+#define STOP_PRIO (MAX_RT_PRIO - 1) -+ -+/* -+ * Some helpers for converting to/from various scales. Use shifts to get -+ * approximate multiples of ten for less overhead. 
-+ */ -+#define APPROX_NS_PS (1073741824) /* Approximate ns per second */ -+#define JIFFIES_TO_NS(TIME) ((TIME) * (APPROX_NS_PS / HZ)) -+#define JIFFY_NS (APPROX_NS_PS / HZ) -+#define JIFFY_US (1048576 / HZ) -+#define NS_TO_JIFFIES(TIME) ((TIME) / JIFFY_NS) -+#define HALF_JIFFY_NS (APPROX_NS_PS / HZ / 2) -+#define HALF_JIFFY_US (1048576 / HZ / 2) -+#define MS_TO_NS(TIME) ((TIME) << 20) -+#define MS_TO_US(TIME) ((TIME) << 10) -+#define NS_TO_MS(TIME) ((TIME) >> 20) -+#define NS_TO_US(TIME) ((TIME) >> 10) -+#define US_TO_NS(TIME) ((TIME) << 10) -+#define TICK_APPROX_NS ((APPROX_NS_PS+HZ/2)/HZ) -+ -+#define RESCHED_US (100) /* Reschedule if less than this many μs left */ -+ -+void print_scheduler_version(void) -+{ -+ printk(KERN_INFO "MuQSS CPU scheduler v0.204 by Con Kolivas.\n"); -+} -+ -+/* Define RQ share levels */ -+#define RQSHARE_NONE 0 -+#define RQSHARE_SMT 1 -+#define RQSHARE_MC 2 -+#define RQSHARE_MC_LLC 3 -+#define RQSHARE_SMP 4 -+#define RQSHARE_ALL 5 -+ -+/* Define locality levels */ -+#define LOCALITY_SAME 0 -+#define LOCALITY_SMT 1 -+#define LOCALITY_MC_LLC 2 -+#define LOCALITY_MC 3 -+#define LOCALITY_SMP 4 -+#define LOCALITY_DISTANT 5 -+ -+/* -+ * This determines what level of runqueue sharing will be done and is -+ * configurable at boot time with the bootparam rqshare = -+ */ -+static int rqshare __read_mostly = CONFIG_SHARERQ; /* Default RQSHARE_MC */ -+ -+static int __init set_rqshare(char *str) -+{ -+ if (!strncmp(str, "none", 4)) { -+ rqshare = RQSHARE_NONE; -+ return 0; -+ } -+ if (!strncmp(str, "smt", 3)) { -+ rqshare = RQSHARE_SMT; -+ return 0; -+ } -+ if (!strncmp(str, "mc", 2)) { -+ rqshare = RQSHARE_MC; -+ return 0; -+ } -+ if (!strncmp(str, "llc", 3)) { -+ rqshare = RQSHARE_MC_LLC; -+ return 0; -+ } -+ if (!strncmp(str, "smp", 3)) { -+ rqshare = RQSHARE_SMP; -+ return 0; -+ } -+ if (!strncmp(str, "all", 3)) { -+ rqshare = RQSHARE_ALL; -+ return 0; -+ } -+ return 1; -+} -+__setup("rqshare=", set_rqshare); -+ -+/* -+ * This is the time all 
tasks within the same priority round robin. -+ * Value is in ms and set to a minimum of 6ms. -+ * Tunable via /proc interface. -+ */ -+int rr_interval __read_mostly = 6; -+ -+/* -+ * Tunable to choose whether to prioritise latency or throughput, simple -+ * binary yes or no -+ */ -+int sched_interactive __read_mostly = 1; -+ -+/* -+ * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks -+ * are allowed to run five seconds as real time tasks. This is the total over -+ * all online cpus. -+ */ -+int sched_iso_cpu __read_mostly = 70; -+ -+/* -+ * sched_yield_type - Choose what sort of yield sched_yield will perform. -+ * 0: No yield. -+ * 1: Yield only to better priority/deadline tasks. (default) -+ * 2: Expire timeslice and recalculate deadline. -+ */ -+int sched_yield_type __read_mostly = 1; -+ -+/* -+ * The relative length of deadline for each priority(nice) level. -+ */ -+static int prio_ratios[NICE_WIDTH] __read_mostly; -+ -+ -+/* -+ * The quota handed out to tasks of all priority levels when refilling their -+ * time_slice. -+ */ -+static inline int timeslice(void) -+{ -+ return MS_TO_US(rr_interval); -+} -+ -+DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -+ -+#ifdef CONFIG_SMP -+/* -+ * Total number of runqueues. Equals number of CPUs when there is no runqueue -+ * sharing but is usually less with SMT/MC sharing of runqueues. -+ */ -+static int total_runqueues __read_mostly = 1; -+ -+static cpumask_t cpu_idle_map ____cacheline_aligned_in_smp; -+ -+struct rq *cpu_rq(int cpu) -+{ -+ return &per_cpu(runqueues, (cpu)); -+} -+#define cpu_curr(cpu) (cpu_rq(cpu)->curr) -+ -+/* -+ * For asym packing, by default the lower numbered cpu has higher priority. 
-+ */ -+int __weak arch_asym_cpu_priority(int cpu) -+{ -+ return -cpu; -+} -+ -+int __weak arch_sd_sibling_asym_packing(void) -+{ -+ return 0*SD_ASYM_PACKING; -+} -+ -+#ifdef CONFIG_SCHED_SMT -+DEFINE_STATIC_KEY_FALSE(sched_smt_present); -+EXPORT_SYMBOL_GPL(sched_smt_present); -+#endif -+ -+#else -+struct rq *uprq; -+#endif /* CONFIG_SMP */ -+ -+#include "stats.h" -+ -+/* -+ * All common locking functions performed on rq->lock. rq->clock is local to -+ * the CPU accessing it so it can be modified just with interrupts disabled -+ * when we're not updating niffies. -+ * Looking up task_rq must be done under rq->lock to be safe. -+ */ -+ -+/* -+ * RQ-clock updating methods: -+ */ -+ -+#ifdef HAVE_SCHED_AVG_IRQ -+static void update_irq_load_avg(struct rq *rq, long delta); -+#else -+static inline void update_irq_load_avg(struct rq *rq, long delta) {} -+#endif -+ -+static void update_rq_clock_task(struct rq *rq, s64 delta) -+{ -+/* -+ * In theory, the compile should just see 0 here, and optimize out the call -+ * to sched_rt_avg_update. But I don't trust it... -+ */ -+ s64 __maybe_unused steal = 0, irq_delta = 0; -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; -+ -+ /* -+ * Since irq_time is only updated on {soft,}irq_exit, we might run into -+ * this case when a previous update_rq_clock() happened inside a -+ * {soft,}irq region. -+ * -+ * When this happens, we stop ->clock_task and only update the -+ * prev_irq_time stamp to account for the part that fit, so that a next -+ * update will consume the rest. This ensures ->clock_task is -+ * monotonic. -+ * -+ * It does however cause some slight miss-attribution of {soft,}irq -+ * time, a more accurate solution would be to update the irq_time using -+ * the current rq->clock timestamp, except that would require using -+ * atomic ops. 
-+ */ -+ if (irq_delta > delta) -+ irq_delta = delta; -+ -+ rq->prev_irq_time += irq_delta; -+ delta -= irq_delta; -+#endif -+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING -+ if (static_key_false((¶virt_steal_rq_enabled))) { -+ steal = paravirt_steal_clock(cpu_of(rq)); -+ steal -= rq->prev_steal_time_rq; -+ -+ if (unlikely(steal > delta)) -+ steal = delta; -+ -+ rq->prev_steal_time_rq += steal; -+ delta -= steal; -+ } -+#endif -+ rq->clock_task += delta; -+ -+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ -+ if (irq_delta + steal) -+ update_irq_load_avg(rq, irq_delta + steal); -+#endif -+} -+ -+static inline void update_rq_clock(struct rq *rq) -+{ -+ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; -+ -+ if (unlikely(delta < 0)) -+ return; -+ rq->clock += delta; -+ update_rq_clock_task(rq, delta); -+} -+ -+/* -+ * Niffies are a globally increasing nanosecond counter. They're only used by -+ * update_load_avg and time_slice_expired, however deadlines are based on them -+ * across CPUs. Update them whenever we will call one of those functions, and -+ * synchronise them across CPUs whenever we hold both runqueue locks. -+ */ -+static inline void update_clocks(struct rq *rq) -+{ -+ s64 ndiff, minndiff; -+ long jdiff; -+ -+ update_rq_clock(rq); -+ ndiff = rq->clock - rq->old_clock; -+ rq->old_clock = rq->clock; -+ jdiff = jiffies - rq->last_jiffy; -+ -+ /* Subtract any niffies added by balancing with other rqs */ -+ ndiff -= rq->niffies - rq->last_niffy; -+ minndiff = JIFFIES_TO_NS(jdiff) - rq->niffies + rq->last_jiffy_niffies; -+ if (minndiff < 0) -+ minndiff = 0; -+ ndiff = max(ndiff, minndiff); -+ rq->niffies += ndiff; -+ rq->last_niffy = rq->niffies; -+ if (jdiff) { -+ rq->last_jiffy += jdiff; -+ rq->last_jiffy_niffies = rq->niffies; -+ } -+} -+ -+/* -+ * Any time we have two runqueues locked we use that as an opportunity to -+ * synchronise niffies to the highest value as idle ticks may have artificially -+ * kept niffies low on one CPU and the truth can only be later. 
-+ */ -+static inline void synchronise_niffies(struct rq *rq1, struct rq *rq2) -+{ -+ if (rq1->niffies > rq2->niffies) -+ rq2->niffies = rq1->niffies; -+ else -+ rq1->niffies = rq2->niffies; -+} -+ -+/* -+ * double_rq_lock - safely lock two runqueues -+ * -+ * Note this does not disable interrupts like task_rq_lock, -+ * you need to do so manually before calling. -+ */ -+ -+/* For when we know rq1 != rq2 */ -+static inline void __double_rq_lock(struct rq *rq1, struct rq *rq2) -+ __acquires(rq1->lock) -+ __acquires(rq2->lock) -+{ -+ if (rq1 < rq2) { -+ raw_spin_lock(rq1->lock); -+ raw_spin_lock_nested(rq2->lock, SINGLE_DEPTH_NESTING); -+ } else { -+ raw_spin_lock(rq2->lock); -+ raw_spin_lock_nested(rq1->lock, SINGLE_DEPTH_NESTING); -+ } -+} -+ -+static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) -+ __acquires(rq1->lock) -+ __acquires(rq2->lock) -+{ -+ BUG_ON(!irqs_disabled()); -+ if (rq1->lock == rq2->lock) { -+ raw_spin_lock(rq1->lock); -+ __acquire(rq2->lock); /* Fake it out ;) */ -+ } else -+ __double_rq_lock(rq1, rq2); -+ synchronise_niffies(rq1, rq2); -+} -+ -+/* -+ * double_rq_unlock - safely unlock two runqueues -+ * -+ * Note this does not restore interrupts like task_rq_unlock, -+ * you need to do so manually after calling. 
-+ */ -+static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) -+ __releases(rq1->lock) -+ __releases(rq2->lock) -+{ -+ raw_spin_unlock(rq1->lock); -+ if (rq1->lock != rq2->lock) -+ raw_spin_unlock(rq2->lock); -+ else -+ __release(rq2->lock); -+} -+ -+static inline void lock_all_rqs(void) -+{ -+ int cpu; -+ -+ preempt_disable(); -+ for_each_possible_cpu(cpu) { -+ struct rq *rq = cpu_rq(cpu); -+ -+ do_raw_spin_lock(rq->lock); -+ } -+} -+ -+static inline void unlock_all_rqs(void) -+{ -+ int cpu; -+ -+ for_each_possible_cpu(cpu) { -+ struct rq *rq = cpu_rq(cpu); -+ -+ do_raw_spin_unlock(rq->lock); -+ } -+ preempt_enable(); -+} -+ -+/* Specially nest trylock an rq */ -+static inline bool trylock_rq(struct rq *this_rq, struct rq *rq) -+{ -+ if (unlikely(!do_raw_spin_trylock(rq->lock))) -+ return false; -+ spin_acquire(&rq->lock->dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); -+ synchronise_niffies(this_rq, rq); -+ return true; -+} -+ -+/* Unlock a specially nested trylocked rq */ -+static inline void unlock_rq(struct rq *rq) -+{ -+ spin_release(&rq->lock->dep_map, _RET_IP_); -+ do_raw_spin_unlock(rq->lock); -+} -+ -+/* -+ * cmpxchg based fetch_or, macro so it works for different integer types -+ */ -+#define fetch_or(ptr, mask) \ -+ ({ \ -+ typeof(ptr) _ptr = (ptr); \ -+ typeof(mask) _mask = (mask); \ -+ typeof(*_ptr) _old, _val = *_ptr; \ -+ \ -+ for (;;) { \ -+ _old = cmpxchg(_ptr, _val, _val | _mask); \ -+ if (_old == _val) \ -+ break; \ -+ _val = _old; \ -+ } \ -+ _old; \ -+}) -+ -+#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) -+/* -+ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, -+ * this avoids any races wrt polling state changes and thereby avoids -+ * spurious IPIs. 
-+ */ -+static bool set_nr_and_not_polling(struct task_struct *p) -+{ -+ struct thread_info *ti = task_thread_info(p); -+ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); -+} -+ -+/* -+ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. -+ * -+ * If this returns true, then the idle task promises to call -+ * sched_ttwu_pending() and reschedule soon. -+ */ -+static bool set_nr_if_polling(struct task_struct *p) -+{ -+ struct thread_info *ti = task_thread_info(p); -+ typeof(ti->flags) old, val = READ_ONCE(ti->flags); -+ -+ for (;;) { -+ if (!(val & _TIF_POLLING_NRFLAG)) -+ return false; -+ if (val & _TIF_NEED_RESCHED) -+ return true; -+ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); -+ if (old == val) -+ break; -+ val = old; -+ } -+ return true; -+} -+ -+#else -+static bool set_nr_and_not_polling(struct task_struct *p) -+{ -+ set_tsk_need_resched(p); -+ return true; -+} -+ -+#ifdef CONFIG_SMP -+static bool set_nr_if_polling(struct task_struct *p) -+{ -+ return false; -+} -+#endif -+#endif -+ -+static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) -+{ -+ struct wake_q_node *node = &task->wake_q; -+ -+ /* -+ * Atomically grab the task, if ->wake_q is !nil already it means -+ * its already queued (either by us or someone else) and will get the -+ * wakeup due to that. -+ * -+ * In order to ensure that a pending wakeup will observe our pending -+ * state, even in the failed case, an explicit smp_mb() must be used. -+ */ -+ smp_mb__before_atomic(); -+ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) -+ return false; -+ -+ /* -+ * The head is context local, there can be no concurrency. -+ */ -+ *head->lastp = node; -+ head->lastp = &node->next; -+ return true; -+} -+ -+/** -+ * wake_q_add() - queue a wakeup for 'later' waking. 
-+ * @head: the wake_q_head to add @task to -+ * @task: the task to queue for 'later' wakeup -+ * -+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the -+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come -+ * instantly. -+ * -+ * This function must be used as-if it were wake_up_process(); IOW the task -+ * must be ready to be woken at this location. -+ */ -+void wake_q_add(struct wake_q_head *head, struct task_struct *task) -+{ -+ if (__wake_q_add(head, task)) -+ get_task_struct(task); -+} -+ -+/** -+ * wake_q_add_safe() - safely queue a wakeup for 'later' waking. -+ * @head: the wake_q_head to add @task to -+ * @task: the task to queue for 'later' wakeup -+ * -+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the -+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come -+ * instantly. -+ * -+ * This function must be used as-if it were wake_up_process(); IOW the task -+ * must be ready to be woken at this location. -+ * -+ * This function is essentially a task-safe equivalent to wake_q_add(). Callers -+ * that already hold reference to @task can call the 'safe' version and trust -+ * wake_q to do the right thing depending whether or not the @task is already -+ * queued for wakeup. -+ */ -+void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) -+{ -+ if (!__wake_q_add(head, task)) -+ put_task_struct(task); -+} -+ -+void wake_up_q(struct wake_q_head *head) -+{ -+ struct wake_q_node *node = head->first; -+ -+ while (node != WAKE_Q_TAIL) { -+ struct task_struct *task; -+ -+ task = container_of(node, struct task_struct, wake_q); -+ BUG_ON(!task); -+ /* Task can safely be re-inserted now */ -+ node = node->next; -+ task->wake_q.next = NULL; -+ -+ /* -+ * wake_up_process() executes a full barrier, which pairs with -+ * the queueing in wake_q_add() so as not to miss wakeups. 
-+ */ -+ wake_up_process(task); -+ put_task_struct(task); -+ } -+} -+ -+static inline void smp_sched_reschedule(int cpu) -+{ -+ if (likely(cpu_online(cpu))) -+ smp_send_reschedule(cpu); -+} -+ -+/* -+ * resched_task - mark a task 'to be rescheduled now'. -+ * -+ * On UP this means the setting of the need_resched flag, on SMP it -+ * might also involve a cross-CPU call to trigger the scheduler on -+ * the target CPU. -+ */ -+void resched_task(struct task_struct *p) -+{ -+ int cpu; -+#ifdef CONFIG_LOCKDEP -+ /* Kernel threads call this when creating workqueues while still -+ * inactive from __kthread_bind_mask, holding only the pi_lock */ -+ if (!(p->flags & PF_KTHREAD)) { -+ struct rq *rq = task_rq(p); -+ -+ lockdep_assert_held(rq->lock); -+ } -+#endif -+ if (test_tsk_need_resched(p)) -+ return; -+ -+ cpu = task_cpu(p); -+ if (cpu == smp_processor_id()) { -+ set_tsk_need_resched(p); -+ set_preempt_need_resched(); -+ return; -+ } -+ -+ if (set_nr_and_not_polling(p)) -+ smp_sched_reschedule(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); -+} -+ -+/* -+ * A task that is not running or queued will not have a node set. -+ * A task that is queued but not running will have a node set. -+ * A task that is currently running will have ->on_cpu set but no node set. -+ */ -+static inline bool task_queued(struct task_struct *p) -+{ -+ return !skiplist_node_empty(&p->node); -+} -+ -+static void enqueue_task(struct rq *rq, struct task_struct *p, int flags); -+static inline void resched_if_idle(struct rq *rq); -+ -+static inline bool deadline_before(u64 deadline, u64 time) -+{ -+ return (deadline < time); -+} -+ -+/* -+ * Deadline is "now" in niffies + (offset by priority). Setting the deadline -+ * is the key to everything. 
It distributes cpu fairly amongst tasks of the -+ * same nice value, it proportions cpu according to nice level, it means the -+ * task that last woke up the longest ago has the earliest deadline, thus -+ * ensuring that interactive tasks get low latency on wake up. The CPU -+ * proportion works out to the square of the virtual deadline difference, so -+ * this equation will give nice 19 3% CPU compared to nice 0. -+ */ -+static inline u64 prio_deadline_diff(int user_prio) -+{ -+ return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128)); -+} -+ -+static inline u64 task_deadline_diff(struct task_struct *p) -+{ -+ return prio_deadline_diff(TASK_USER_PRIO(p)); -+} -+ -+static inline u64 static_deadline_diff(int static_prio) -+{ -+ return prio_deadline_diff(USER_PRIO(static_prio)); -+} -+ -+static inline int longest_deadline_diff(void) -+{ -+ return prio_deadline_diff(39); -+} -+ -+static inline int ms_longest_deadline_diff(void) -+{ -+ return NS_TO_MS(longest_deadline_diff()); -+} -+ -+static inline bool rq_local(struct rq *rq); -+ -+#ifndef SCHED_CAPACITY_SCALE -+#define SCHED_CAPACITY_SCALE 1024 -+#endif -+ -+static inline int rq_load(struct rq *rq) -+{ -+ return rq->nr_running; -+} -+ -+/* -+ * Update the load average for feeding into cpu frequency governors. Use a -+ * rough estimate of a rolling average with ~ time constant of 32ms. -+ * 80/128 ~ 0.63. * 80 / 32768 / 128 == * 5 / 262144 -+ * Make sure a call to update_clocks has been made before calling this to get -+ * an updated rq->niffies. 
-+ */ -+static void update_load_avg(struct rq *rq, unsigned int flags) -+{ -+ long us_interval, load; -+ -+ us_interval = NS_TO_US(rq->niffies - rq->load_update); -+ if (unlikely(us_interval <= 0)) -+ return; -+ -+ load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144); -+ if (unlikely(load < 0)) -+ load = 0; -+ load += rq_load(rq) * SCHED_CAPACITY_SCALE * us_interval * 5 / 262144; -+ rq->load_avg = load; -+ -+ rq->load_update = rq->niffies; -+ update_irq_load_avg(rq, 0); -+ if (likely(rq_local(rq))) -+ cpufreq_trigger(rq, flags); -+} -+ -+#ifdef HAVE_SCHED_AVG_IRQ -+/* -+ * IRQ variant of update_load_avg below. delta is actually time in nanoseconds -+ * here so we scale curload to how long it's been since the last update. -+ */ -+static void update_irq_load_avg(struct rq *rq, long delta) -+{ -+ long us_interval, load; -+ -+ us_interval = NS_TO_US(rq->niffies - rq->irq_load_update); -+ if (unlikely(us_interval <= 0)) -+ return; -+ -+ load = rq->irq_load_avg - (rq->irq_load_avg * us_interval * 5 / 262144); -+ if (unlikely(load < 0)) -+ load = 0; -+ load += NS_TO_US(delta) * SCHED_CAPACITY_SCALE * 5 / 262144; -+ rq->irq_load_avg = load; -+ -+ rq->irq_load_update = rq->niffies; -+} -+#endif -+ -+/* -+ * Removing from the runqueue. Enter with rq locked. Deleting a task -+ * from the skip list is done via the stored node reference in the task struct -+ * and does not require a full look up. Thus it occurs in O(k) time where k -+ * is the "level" of the list the task was stored at - usually < 4, max 8. 
-+ */ -+static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) -+{ -+ skiplist_delete(rq->sl, &p->node); -+ rq->best_key = rq->node->next[0]->key; -+ update_clocks(rq); -+ -+ if (!(flags & DEQUEUE_SAVE)) { -+ sched_info_dequeued(rq, p); -+ psi_dequeue(p, flags & DEQUEUE_SLEEP); -+ } -+ rq->nr_running--; -+ if (rt_task(p)) -+ rq->rt_nr_running--; -+ update_load_avg(rq, flags); -+} -+ -+#ifdef CONFIG_PREEMPT_RCU -+static bool rcu_read_critical(struct task_struct *p) -+{ -+ return p->rcu_read_unlock_special.b.blocked; -+} -+#else /* CONFIG_PREEMPT_RCU */ -+#define rcu_read_critical(p) (false) -+#endif /* CONFIG_PREEMPT_RCU */ -+ -+/* -+ * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as -+ * an idle task, we ensure none of the following conditions are met. -+ */ -+static bool idleprio_suitable(struct task_struct *p) -+{ -+ return (!(p->sched_contributes_to_load) && !(p->flags & (PF_EXITING)) && -+ !signal_pending(p) && !rcu_read_critical(p) && !freezing(p)); -+} -+ -+/* -+ * To determine if a task of SCHED_ISO can run in pseudo-realtime, we check -+ * that the iso_refractory flag is not set. -+ */ -+static inline bool isoprio_suitable(struct rq *rq) -+{ -+ return !rq->iso_refractory; -+} -+ -+static inline void inc_nr_running(struct rq *rq) -+{ -+ rq->nr_running++; -+ if (trace_sched_update_nr_running_tp_enabled()) { -+ call_trace_sched_update_nr_running(rq, 1); -+ } -+} -+ -+static inline void dec_nr_running(struct rq *rq) -+{ -+ rq->nr_running--; -+ if (trace_sched_update_nr_running_tp_enabled()) { -+ call_trace_sched_update_nr_running(rq, -1); -+ } -+} -+ -+/* -+ * Adding to the runqueue. Enter with rq locked. 
-+ */ -+static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) -+{ -+ unsigned int randseed, cflags = 0; -+ u64 sl_id; -+ -+ if (!rt_task(p)) { -+ /* Check it hasn't gotten rt from PI */ -+ if ((idleprio_task(p) && idleprio_suitable(p)) || -+ (iso_task(p) && isoprio_suitable(rq))) -+ p->prio = p->normal_prio; -+ else -+ p->prio = NORMAL_PRIO; -+ } else -+ rq->rt_nr_running++; -+ /* -+ * The sl_id key passed to the skiplist generates a sorted list. -+ * Realtime and sched iso tasks run FIFO so they only need be sorted -+ * according to priority. The skiplist will put tasks of the same -+ * key inserted later in FIFO order. Tasks of sched normal, batch -+ * and idleprio are sorted according to their deadlines. Idleprio -+ * tasks are offset by an impossibly large deadline value ensuring -+ * they get sorted into last positions, but still according to their -+ * own deadlines. This creates a "landscape" of skiplists running -+ * from priority 0 realtime in first place to the lowest priority -+ * idleprio tasks last. Skiplist insertion is an O(log n) process. -+ */ -+ if (p->prio <= ISO_PRIO) { -+ sl_id = p->prio; -+ } else { -+ sl_id = p->deadline; -+ if (idleprio_task(p)) { -+ if (p->prio == IDLE_PRIO) -+ sl_id |= 0xF000000000000000; -+ else -+ sl_id += longest_deadline_diff(); -+ } -+ } -+ /* -+ * Some architectures don't have better than microsecond resolution -+ * so mask out ~microseconds as the random seed for skiplist insertion. 
-+ */ -+ update_clocks(rq); -+ if (!(flags & ENQUEUE_RESTORE)) { -+ sched_info_queued(rq, p); -+ psi_enqueue(p, flags & ENQUEUE_WAKEUP); -+ } -+ -+ randseed = (rq->niffies >> 10) & 0xFFFFFFFF; -+ skiplist_insert(rq->sl, &p->node, sl_id, p, randseed); -+ rq->best_key = rq->node->next[0]->key; -+ if (p->in_iowait) -+ cflags |= SCHED_CPUFREQ_IOWAIT; -+ inc_nr_running(rq); -+ update_load_avg(rq, cflags); -+} -+ -+/* -+ * Returns the relative length of deadline all compared to the shortest -+ * deadline which is that of nice -20. -+ */ -+static inline int task_prio_ratio(struct task_struct *p) -+{ -+ return prio_ratios[TASK_USER_PRIO(p)]; -+} -+ -+/* -+ * task_timeslice - all tasks of all priorities get the exact same timeslice -+ * length. CPU distribution is handled by giving different deadlines to -+ * tasks of different priorities. Use 128 as the base value for fast shifts. -+ */ -+static inline int task_timeslice(struct task_struct *p) -+{ -+ return (rr_interval * task_prio_ratio(p) / 128); -+} -+ -+#ifdef CONFIG_SMP -+/* Entered with rq locked */ -+static inline void resched_if_idle(struct rq *rq) -+{ -+ if (rq_idle(rq)) -+ resched_task(rq->curr); -+} -+ -+static inline bool rq_local(struct rq *rq) -+{ -+ return (rq->cpu == smp_processor_id()); -+} -+#ifdef CONFIG_SMT_NICE -+static const cpumask_t *thread_cpumask(int cpu); -+ -+/* Find the best real time priority running on any SMT siblings of cpu and if -+ * none are running, the static priority of the best deadline task running. -+ * The lookups to the other runqueues is done lockless as the occasional wrong -+ * value would be harmless. 
*/ -+static int best_smt_bias(struct rq *this_rq) -+{ -+ int other_cpu, best_bias = 0; -+ -+ for_each_cpu(other_cpu, &this_rq->thread_mask) { -+ struct rq *rq = cpu_rq(other_cpu); -+ -+ if (rq_idle(rq)) -+ continue; -+ if (unlikely(!rq->online)) -+ continue; -+ if (!rq->rq_mm) -+ continue; -+ if (likely(rq->rq_smt_bias > best_bias)) -+ best_bias = rq->rq_smt_bias; -+ } -+ return best_bias; -+} -+ -+static int task_prio_bias(struct task_struct *p) -+{ -+ if (rt_task(p)) -+ return 1 << 30; -+ else if (task_running_iso(p)) -+ return 1 << 29; -+ else if (task_running_idle(p)) -+ return 0; -+ return MAX_PRIO - p->static_prio; -+} -+ -+static bool smt_always_schedule(struct task_struct __maybe_unused *p, struct rq __maybe_unused *this_rq) -+{ -+ return true; -+} -+ -+static bool (*smt_schedule)(struct task_struct *p, struct rq *this_rq) = &smt_always_schedule; -+ -+/* We've already decided p can run on CPU, now test if it shouldn't for SMT -+ * nice reasons. */ -+static bool smt_should_schedule(struct task_struct *p, struct rq *this_rq) -+{ -+ int best_bias, task_bias; -+ -+ /* Kernel threads always run */ -+ if (unlikely(!p->mm)) -+ return true; -+ if (rt_task(p)) -+ return true; -+ if (!idleprio_suitable(p)) -+ return true; -+ best_bias = best_smt_bias(this_rq); -+ /* The smt siblings are all idle or running IDLEPRIO */ -+ if (best_bias < 1) -+ return true; -+ task_bias = task_prio_bias(p); -+ if (task_bias < 1) -+ return false; -+ if (task_bias >= best_bias) -+ return true; -+ /* Dither 25% cpu of normal tasks regardless of nice difference */ -+ if (best_bias % 4 == 1) -+ return true; -+ /* Sorry, you lose */ -+ return false; -+} -+#else /* CONFIG_SMT_NICE */ -+#define smt_schedule(p, this_rq) (true) -+#endif /* CONFIG_SMT_NICE */ -+ -+static inline void atomic_set_cpu(int cpu, cpumask_t *cpumask) -+{ -+ set_bit(cpu, (volatile unsigned long *)cpumask); -+} -+ -+/* -+ * The cpu_idle_map stores a bitmap of all the CPUs currently idle to -+ * allow easy lookup of whether 
any suitable idle CPUs are available. -+ * It's cheaper to maintain a binary yes/no if there are any idle CPUs on the -+ * idle_cpus variable than to do a full bitmask check when we are busy. The -+ * bits are set atomically but read locklessly as occasional false positive / -+ * negative is harmless. -+ */ -+static inline void set_cpuidle_map(int cpu) -+{ -+ if (likely(cpu_online(cpu))) -+ atomic_set_cpu(cpu, &cpu_idle_map); -+} -+ -+static inline void atomic_clear_cpu(int cpu, cpumask_t *cpumask) -+{ -+ clear_bit(cpu, (volatile unsigned long *)cpumask); -+} -+ -+static inline void clear_cpuidle_map(int cpu) -+{ -+ atomic_clear_cpu(cpu, &cpu_idle_map); -+} -+ -+static bool suitable_idle_cpus(struct task_struct *p) -+{ -+ return (cpumask_intersects(p->cpus_ptr, &cpu_idle_map)); -+} -+ -+/* -+ * Resched current on rq. We don't know if rq is local to this CPU nor if it -+ * is locked so we do not use an intermediate variable for the task to avoid -+ * having it dereferenced. -+ */ -+static void resched_curr(struct rq *rq) -+{ -+ int cpu; -+ -+ if (test_tsk_need_resched(rq->curr)) -+ return; -+ -+ rq->preempt = rq->curr; -+ cpu = rq->cpu; -+ -+ /* We're doing this without holding the rq lock if it's not task_rq */ -+ -+ if (cpu == smp_processor_id()) { -+ set_tsk_need_resched(rq->curr); -+ set_preempt_need_resched(); -+ return; -+ } -+ -+ if (set_nr_and_not_polling(rq->curr)) -+ smp_sched_reschedule(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); -+} -+ -+#define CPUIDLE_DIFF_THREAD (1) -+#define CPUIDLE_DIFF_CORE_LLC (2) -+#define CPUIDLE_DIFF_CORE (4) -+#define CPUIDLE_CACHE_BUSY (8) -+#define CPUIDLE_DIFF_CPU (16) -+#define CPUIDLE_THREAD_BUSY (32) -+#define CPUIDLE_DIFF_NODE (64) -+ -+/* -+ * The best idle CPU is chosen according to the CPUIDLE ranking above where the -+ * lowest value would give the most suitable CPU to schedule p onto next. 
The -+ * order works out to be the following: -+ * -+ * Same thread, idle or busy cache, idle or busy threads -+ * Other core, same cache, idle or busy cache, idle threads. -+ * Same node, other CPU, idle cache, idle threads. -+ * Same node, other CPU, busy cache, idle threads. -+ * Other core, same cache, busy threads. -+ * Same node, other CPU, busy threads. -+ * Other node, other CPU, idle cache, idle threads. -+ * Other node, other CPU, busy cache, idle threads. -+ * Other node, other CPU, busy threads. -+ */ -+static int best_mask_cpu(int best_cpu, struct rq *rq, cpumask_t *tmpmask) -+{ -+ int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THREAD_BUSY | -+ CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY | CPUIDLE_DIFF_CORE | -+ CPUIDLE_DIFF_CORE_LLC | CPUIDLE_DIFF_THREAD; -+ int cpu_tmp; -+ -+ if (cpumask_test_cpu(best_cpu, tmpmask)) -+ goto out; -+ -+ for_each_cpu(cpu_tmp, tmpmask) { -+ int ranking, locality; -+ struct rq *tmp_rq; -+ -+ ranking = 0; -+ tmp_rq = cpu_rq(cpu_tmp); -+ -+ locality = rq->cpu_locality[cpu_tmp]; -+#ifdef CONFIG_NUMA -+ if (locality > LOCALITY_SMP) -+ ranking |= CPUIDLE_DIFF_NODE; -+ else -+#endif -+ if (locality > LOCALITY_MC) -+ ranking |= CPUIDLE_DIFF_CPU; -+#ifdef CONFIG_SCHED_MC -+ else if (locality == LOCALITY_MC_LLC) -+ ranking |= CPUIDLE_DIFF_CORE_LLC; -+ else if (locality == LOCALITY_MC) -+ ranking |= CPUIDLE_DIFF_CORE; -+ if (!(tmp_rq->cache_idle(tmp_rq))) -+ ranking |= CPUIDLE_CACHE_BUSY; -+#endif -+#ifdef CONFIG_SCHED_SMT -+ if (locality == LOCALITY_SMT) -+ ranking |= CPUIDLE_DIFF_THREAD; -+#endif -+ if (ranking < best_ranking -+#ifdef CONFIG_SCHED_SMT -+ || (ranking == best_ranking && (tmp_rq->siblings_idle(tmp_rq))) -+#endif -+ ) { -+ best_cpu = cpu_tmp; -+ best_ranking = ranking; -+ } -+ } -+out: -+ return best_cpu; -+} -+ -+bool cpus_share_cache(int this_cpu, int that_cpu) -+{ -+ struct rq *this_rq = cpu_rq(this_cpu); -+ -+ return (this_rq->cpu_locality[that_cpu] < LOCALITY_SMP); -+} -+ -+/* As per resched_curr but only will resched 
idle task */ -+static inline void resched_idle(struct rq *rq) -+{ -+ if (test_tsk_need_resched(rq->idle)) -+ return; -+ -+ rq->preempt = rq->idle; -+ -+ set_tsk_need_resched(rq->idle); -+ -+ if (rq_local(rq)) { -+ set_preempt_need_resched(); -+ return; -+ } -+ -+ smp_sched_reschedule(rq->cpu); -+} -+ -+DEFINE_PER_CPU(cpumask_t, idlemask); -+ -+static struct rq *resched_best_idle(struct task_struct *p, int cpu) -+{ -+ cpumask_t *tmpmask = &(per_cpu(idlemask, cpu)); -+ struct rq *rq; -+ int best_cpu; -+ -+ cpumask_and(tmpmask, p->cpus_ptr, &cpu_idle_map); -+ best_cpu = best_mask_cpu(cpu, task_rq(p), tmpmask); -+ rq = cpu_rq(best_cpu); -+ if (!smt_schedule(p, rq)) -+ return NULL; -+ rq->preempt = p; -+ resched_idle(rq); -+ return rq; -+} -+ -+static inline void resched_suitable_idle(struct task_struct *p) -+{ -+ if (suitable_idle_cpus(p)) -+ resched_best_idle(p, task_cpu(p)); -+} -+ -+static inline struct rq *rq_order(struct rq *rq, int cpu) -+{ -+ return rq->rq_order[cpu]; -+} -+#else /* CONFIG_SMP */ -+static inline void set_cpuidle_map(int cpu) -+{ -+} -+ -+static inline void clear_cpuidle_map(int cpu) -+{ -+} -+ -+static inline bool suitable_idle_cpus(struct task_struct *p) -+{ -+ return uprq->curr == uprq->idle; -+} -+ -+static inline void resched_suitable_idle(struct task_struct *p) -+{ -+} -+ -+static inline void resched_curr(struct rq *rq) -+{ -+ resched_task(rq->curr); -+} -+ -+static inline void resched_if_idle(struct rq *rq) -+{ -+} -+ -+static inline bool rq_local(struct rq *rq) -+{ -+ return true; -+} -+ -+static inline struct rq *rq_order(struct rq *rq, int cpu) -+{ -+ return rq; -+} -+ -+static inline bool smt_schedule(struct task_struct *p, struct rq *rq) -+{ -+ return true; -+} -+#endif /* CONFIG_SMP */ -+ -+static inline int normal_prio(struct task_struct *p) -+{ -+ if (has_rt_policy(p)) -+ return MAX_RT_PRIO - 1 - p->rt_priority; -+ if (idleprio_task(p)) -+ return IDLE_PRIO; -+ if (iso_task(p)) -+ return ISO_PRIO; -+ return NORMAL_PRIO; -+} -+ -+/* 
-+ * Calculate the current priority, i.e. the priority -+ * taken into account by the scheduler. This value might -+ * be boosted by RT tasks as it will be RT if the task got -+ * RT-boosted. If not then it returns p->normal_prio. -+ */ -+static int effective_prio(struct task_struct *p) -+{ -+ p->normal_prio = normal_prio(p); -+ /* -+ * If we are RT tasks or we were boosted to RT priority, -+ * keep the priority unchanged. Otherwise, update priority -+ * to the normal priority: -+ */ -+ if (!rt_prio(p->prio)) -+ return p->normal_prio; -+ return p->prio; -+} -+ -+/* -+ * activate_task - move a task to the runqueue. Enter with rq locked. -+ */ -+static void activate_task(struct rq *rq, struct task_struct *p, int flags) -+{ -+ resched_if_idle(rq); -+ -+ /* -+ * Sleep time is in units of nanosecs, so shift by 20 to get a -+ * milliseconds-range estimation of the amount of time that the task -+ * spent sleeping: -+ */ -+ if (unlikely(prof_on == SLEEP_PROFILING)) { -+ if (p->state == TASK_UNINTERRUPTIBLE) -+ profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), -+ (rq->niffies - p->last_ran) >> 20); -+ } -+ -+ p->prio = effective_prio(p); -+ enqueue_task(rq, p, flags); -+ p->on_rq = TASK_ON_RQ_QUEUED; -+} -+ -+/* -+ * deactivate_task - If it's running, it's not on the runqueue and we can just -+ * decrement the nr_running. Enter with rq locked. -+ */ -+static inline void deactivate_task(struct task_struct *p, struct rq *rq) -+{ -+ p->on_rq = 0; -+ sched_info_dequeued(rq, p); -+ /* deactivate_task is always DEQUEUE_SLEEP in muqss */ -+ psi_dequeue(p, DEQUEUE_SLEEP); -+} -+ -+#ifdef CONFIG_SMP -+void set_task_cpu(struct task_struct *p, unsigned int new_cpu) -+{ -+ struct rq *rq; -+ -+ if (task_cpu(p) == new_cpu) -+ return; -+ -+ /* Do NOT call set_task_cpu on a currently queued task as we will not -+ * be reliably holding the rq lock after changing CPU. 
*/ -+ BUG_ON(task_queued(p)); -+ rq = task_rq(p); -+ -+#ifdef CONFIG_LOCKDEP -+ /* -+ * The caller should hold either p->pi_lock or rq->lock, when changing -+ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. -+ * -+ * Furthermore, all task_rq users should acquire both locks, see -+ * task_rq_lock(). -+ */ -+ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || -+ lockdep_is_held(rq->lock))); -+#endif -+ -+ trace_sched_migrate_task(p, new_cpu); -+ rseq_migrate(p); -+ perf_event_task_migrate(p); -+ -+ /* -+ * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be -+ * successfully executed on another CPU. We must ensure that updates of -+ * per-task data have been completed by this moment. -+ */ -+ smp_wmb(); -+ -+ p->wake_cpu = new_cpu; -+ -+ if (task_running(rq, p)) { -+ /* -+ * We should only be calling this on a running task if we're -+ * holding rq lock. -+ */ -+ lockdep_assert_held(rq->lock); -+ -+ /* -+ * We can't change the task_thread_info CPU on a running task -+ * as p will still be protected by the rq lock of the CPU it -+ * is still running on so we only set the wake_cpu for it to be -+ * lazily updated once off the CPU. -+ */ -+ return; -+ } -+ -+#ifdef CONFIG_THREAD_INFO_IN_TASK -+ WRITE_ONCE(p->cpu, new_cpu); -+#else -+ WRITE_ONCE(task_thread_info(p)->cpu, new_cpu); -+#endif -+ /* We're no longer protecting p after this point since we're holding -+ * the wrong runqueue lock. */ -+} -+#endif /* CONFIG_SMP */ -+ -+/* -+ * Move a task off the runqueue and take it to a cpu for it will -+ * become the running task. -+ */ -+static inline void take_task(struct rq *rq, int cpu, struct task_struct *p) -+{ -+ struct rq *p_rq = task_rq(p); -+ -+ dequeue_task(p_rq, p, DEQUEUE_SAVE); -+ if (p_rq != rq) { -+ sched_info_dequeued(p_rq, p); -+ sched_info_queued(rq, p); -+ } -+ set_task_cpu(p, cpu); -+} -+ -+/* -+ * Returns a descheduling task to the runqueue unless it is being -+ * deactivated. 
-+ */ -+static inline void return_task(struct task_struct *p, struct rq *rq, -+ int cpu, bool deactivate) -+{ -+ if (deactivate) -+ deactivate_task(p, rq); -+ else { -+#ifdef CONFIG_SMP -+ /* -+ * set_task_cpu was called on the running task that doesn't -+ * want to deactivate so it has to be enqueued to a different -+ * CPU and we need its lock. Tag it to be moved with as the -+ * lock is dropped in finish_lock_switch. -+ */ -+ if (unlikely(p->wake_cpu != cpu)) -+ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); -+ else -+#endif -+ enqueue_task(rq, p, ENQUEUE_RESTORE); -+ } -+} -+ -+/* Enter with rq lock held. We know p is on the local cpu */ -+static inline void __set_tsk_resched(struct task_struct *p) -+{ -+ set_tsk_need_resched(p); -+ set_preempt_need_resched(); -+} -+ -+/** -+ * task_curr - is this task currently executing on a CPU? -+ * @p: the task in question. -+ * -+ * Return: 1 if the task is currently executing. 0 otherwise. -+ */ -+inline int task_curr(const struct task_struct *p) -+{ -+ return cpu_curr(task_cpu(p)) == p; -+} -+ -+#ifdef CONFIG_SMP -+/* -+ * wait_task_inactive - wait for a thread to unschedule. -+ * -+ * If @match_state is nonzero, it's the @p->state value just checked and -+ * not expected to change. If it changes, i.e. @p might have woken up, -+ * then return zero. When we succeed in waiting for @p to be off its CPU, -+ * we return a positive number (its total switch count). If a second call -+ * a short while later returns the same number, the caller can be sure that -+ * @p has remained unscheduled the whole time. -+ * -+ * The caller must ensure that the task *will* unschedule sometime soon, -+ * else this function might spin for a *long* time. This function can't -+ * be called with interrupts off, or it may introduce deadlock with -+ * smp_call_function() if an IPI is sent by the same process we are -+ * waiting to become inactive. 
-+ */ -+unsigned long wait_task_inactive(struct task_struct *p, long match_state) -+{ -+ int running, queued; -+ struct rq_flags rf; -+ unsigned long ncsw; -+ struct rq *rq; -+ -+ for (;;) { -+ rq = task_rq(p); -+ -+ /* -+ * If the task is actively running on another CPU -+ * still, just relax and busy-wait without holding -+ * any locks. -+ * -+ * NOTE! Since we don't hold any locks, it's not -+ * even sure that "rq" stays as the right runqueue! -+ * But we don't care, since this will return false -+ * if the runqueue has changed and p is actually now -+ * running somewhere else! -+ */ -+ while (task_running(rq, p)) { -+ if (match_state && unlikely(p->state != match_state)) -+ return 0; -+ cpu_relax(); -+ } -+ -+ /* -+ * Ok, time to look more closely! We need the rq -+ * lock now, to be *sure*. If we're wrong, we'll -+ * just go back and repeat. -+ */ -+ rq = task_rq_lock(p, &rf); -+ trace_sched_wait_task(p); -+ running = task_running(rq, p); -+ queued = task_on_rq_queued(p); -+ ncsw = 0; -+ if (!match_state || p->state == match_state) -+ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ -+ task_rq_unlock(rq, p, &rf); -+ -+ /* -+ * If it changed from the expected state, bail out now. -+ */ -+ if (unlikely(!ncsw)) -+ break; -+ -+ /* -+ * Was it really running after all now that we -+ * checked with the proper locks actually held? -+ * -+ * Oops. Go back and try again.. -+ */ -+ if (unlikely(running)) { -+ cpu_relax(); -+ continue; -+ } -+ -+ /* -+ * It's not enough that it's not actively running, -+ * it must be off the runqueue _entirely_, and not -+ * preempted! -+ * -+ * So if it was still runnable (but just not actively -+ * running right now), it's preempted, and we should -+ * yield - it could be a while. -+ */ -+ if (unlikely(queued)) { -+ ktime_t to = NSEC_PER_SEC / HZ; -+ -+ set_current_state(TASK_UNINTERRUPTIBLE); -+ schedule_hrtimeout(&to, HRTIMER_MODE_REL); -+ continue; -+ } -+ -+ /* -+ * Ahh, all good. 
It wasn't running, and it wasn't -+ * runnable, which means that it will never become -+ * running in the future either. We're all done! -+ */ -+ break; -+ } -+ -+ return ncsw; -+} -+ -+/*** -+ * kick_process - kick a running thread to enter/exit the kernel -+ * @p: the to-be-kicked thread -+ * -+ * Cause a process which is running on another CPU to enter -+ * kernel-mode, without any delay. (to get signals handled.) -+ * -+ * NOTE: this function doesn't have to take the runqueue lock, -+ * because all it wants to ensure is that the remote task enters -+ * the kernel. If the IPI races and the task has been migrated -+ * to another CPU then no harm is done and the purpose has been -+ * achieved as well. -+ */ -+void kick_process(struct task_struct *p) -+{ -+ int cpu; -+ -+ preempt_disable(); -+ cpu = task_cpu(p); -+ if ((cpu != smp_processor_id()) && task_curr(p)) -+ smp_sched_reschedule(cpu); -+ preempt_enable(); -+} -+EXPORT_SYMBOL_GPL(kick_process); -+#endif -+ -+/* -+ * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the -+ * basis of earlier deadlines. SCHED_IDLEPRIO don't preempt anything else or -+ * between themselves, they cooperatively multitask. An idle rq scores as -+ * prio PRIO_LIMIT so it is always preempted. -+ */ -+static inline bool -+can_preempt(struct task_struct *p, int prio, u64 deadline) -+{ -+ /* Better static priority RT task or better policy preemption */ -+ if (p->prio < prio) -+ return true; -+ if (p->prio > prio) -+ return false; -+ if (p->policy == SCHED_BATCH) -+ return false; -+ /* SCHED_NORMAL and ISO will preempt based on deadline */ -+ if (!deadline_before(p->deadline, deadline)) -+ return false; -+ return true; -+} -+ -+#ifdef CONFIG_SMP -+ -+/* -+ * Per-CPU kthreads are allowed to run on !active && online CPUs, see -+ * __set_cpus_allowed_ptr(). 
-+ */ -+static inline bool is_cpu_allowed(struct task_struct *p, int cpu) -+{ -+ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) -+ return false; -+ -+ if (is_per_cpu_kthread(p)) -+ return cpu_online(cpu); -+ -+ return cpu_active(cpu); -+} -+ -+/* -+ * Check to see if p can run on cpu, and if not, whether there are any online -+ * CPUs it can run on instead. This only happens with the hotplug threads that -+ * bring up the CPUs. -+ */ -+static inline bool sched_other_cpu(struct task_struct *p, int cpu) -+{ -+ if (likely(cpumask_test_cpu(cpu, p->cpus_ptr))) -+ return false; -+ if (p->nr_cpus_allowed == 1) { -+ cpumask_t valid_mask; -+ -+ cpumask_and(&valid_mask, p->cpus_ptr, cpu_online_mask); -+ if (unlikely(cpumask_empty(&valid_mask))) -+ return false; -+ } -+ return true; -+} -+ -+static inline bool needs_other_cpu(struct task_struct *p, int cpu) -+{ -+ if (cpumask_test_cpu(cpu, p->cpus_ptr)) -+ return false; -+ return true; -+} -+ -+#define cpu_online_map (*(cpumask_t *)cpu_online_mask) -+ -+static void try_preempt(struct task_struct *p, struct rq *this_rq) -+{ -+ int i, this_entries = rq_load(this_rq); -+ cpumask_t tmp; -+ -+ if (suitable_idle_cpus(p) && resched_best_idle(p, task_cpu(p))) -+ return; -+ -+ /* IDLEPRIO tasks never preempt anything but idle */ -+ if (p->policy == SCHED_IDLEPRIO) -+ return; -+ -+ cpumask_and(&tmp, &cpu_online_map, p->cpus_ptr); -+ -+ for (i = 0; i < num_online_cpus(); i++) { -+ struct rq *rq = this_rq->cpu_order[i]; -+ -+ if (!cpumask_test_cpu(rq->cpu, &tmp)) -+ continue; -+ -+ if (!sched_interactive && rq != this_rq && rq_load(rq) <= this_entries) -+ continue; -+ if (smt_schedule(p, rq) && can_preempt(p, rq->rq_prio, rq->rq_deadline)) { -+ /* We set rq->preempting lockless, it's a hint only */ -+ rq->preempting = p; -+ resched_curr(rq); -+ return; -+ } -+ } -+} -+ -+static int __set_cpus_allowed_ptr(struct task_struct *p, -+ const struct cpumask *new_mask, bool check); -+#else /* CONFIG_SMP */ -+static inline bool needs_other_cpu(struct 
task_struct *p, int cpu) -+{ -+ return false; -+} -+ -+static void try_preempt(struct task_struct *p, struct rq *this_rq) -+{ -+ if (p->policy == SCHED_IDLEPRIO) -+ return; -+ if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline)) -+ resched_curr(uprq); -+} -+ -+static inline int __set_cpus_allowed_ptr(struct task_struct *p, -+ const struct cpumask *new_mask, bool check) -+{ -+ return set_cpus_allowed_ptr(p, new_mask); -+} -+#endif /* CONFIG_SMP */ -+ -+static void -+ttwu_stat(struct task_struct *p, int cpu, int wake_flags) -+{ -+ struct rq *rq; -+ -+ if (!schedstat_enabled()) -+ return; -+ -+ rq = this_rq(); -+ -+#ifdef CONFIG_SMP -+ if (cpu == rq->cpu) { -+ __schedstat_inc(rq->ttwu_local); -+ } else { -+ struct sched_domain *sd; -+ -+ rcu_read_lock(); -+ for_each_domain(rq->cpu, sd) { -+ if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { -+ __schedstat_inc(sd->ttwu_wake_remote); -+ break; -+ } -+ } -+ rcu_read_unlock(); -+ } -+ -+#endif /* CONFIG_SMP */ -+ -+ __schedstat_inc(rq->ttwu_count); -+} -+ -+/* -+ * Mark the task runnable and perform wakeup-preemption. -+ */ -+static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) -+{ -+ /* -+ * Sync wakeups (i.e. those types of wakeups where the waker -+ * has indicated that it will leave the CPU in short order) -+ * don't trigger a preemption if there are no idle cpus, -+ * instead waiting for current to deschedule. 
-+ */ -+ if (wake_flags & WF_SYNC) -+ resched_suitable_idle(p); -+ else -+ try_preempt(p, rq); -+ p->state = TASK_RUNNING; -+ trace_sched_wakeup(p); -+} -+ -+static void -+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) -+{ -+ int en_flags = ENQUEUE_WAKEUP; -+ -+ lockdep_assert_held(rq->lock); -+ -+ if (p->sched_contributes_to_load) -+ rq->nr_uninterruptible--; -+ -+#ifdef CONFIG_SMP -+ if (wake_flags & WF_MIGRATED) -+ en_flags |= ENQUEUE_MIGRATED; -+#endif -+ -+ activate_task(rq, p, en_flags); -+ ttwu_do_wakeup(rq, p, wake_flags); -+} -+ -+/* -+ * Consider @p being inside a wait loop: -+ * -+ * for (;;) { -+ * set_current_state(TASK_UNINTERRUPTIBLE); -+ * -+ * if (CONDITION) -+ * break; -+ * -+ * schedule(); -+ * } -+ * __set_current_state(TASK_RUNNING); -+ * -+ * between set_current_state() and schedule(). In this case @p is still -+ * runnable, so all that needs doing is change p->state back to TASK_RUNNING in -+ * an atomic manner. -+ * -+ * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq -+ * then schedule() must still happen and p->state can be changed to -+ * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we -+ * need to do a full wakeup with enqueue. -+ * -+ * Returns: %true when the wakeup is done, -+ * %false otherwise. -+ */ -+static int ttwu_runnable(struct task_struct *p, int wake_flags) -+{ -+ struct rq *rq; -+ int ret = 0; -+ -+ rq = __task_rq_lock(p, NULL); -+ if (likely(task_on_rq_queued(p))) { -+ ttwu_do_wakeup(rq, p, wake_flags); -+ ret = 1; -+ } -+ __task_rq_unlock(rq, NULL); -+ -+ return ret; -+} -+ -+#ifdef CONFIG_SMP -+void sched_ttwu_pending(void *arg) -+{ -+ struct llist_node *llist = arg; -+ struct rq *rq = this_rq(); -+ struct task_struct *p, *t; -+ struct rq_flags rf; -+ -+ if (!llist) -+ return; -+ -+ /* -+ * rq::ttwu_pending racy indication of out-standing wakeups. 
-+ * Races such that false-negatives are possible, since they -+ * are shorter lived that false-positives would be. -+ */ -+ WRITE_ONCE(rq->ttwu_pending, 0); -+ -+ rq_lock_irqsave(rq, &rf); -+ -+ llist_for_each_entry_safe(p, t, llist, wake_entry.llist) { -+ if (WARN_ON_ONCE(p->on_cpu)) -+ smp_cond_load_acquire(&p->on_cpu, !VAL); -+ -+ if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq))) -+ set_task_cpu(p, cpu_of(rq)); -+ -+ ttwu_do_activate(rq, p, 0); -+ } -+ -+ rq_unlock_irqrestore(rq, &rf); -+} -+ -+void send_call_function_single_ipi(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ if (!set_nr_if_polling(rq->idle)) -+ arch_send_call_function_single_ipi(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); -+} -+ -+/* -+ * Queue a task on the target CPUs wake_list and wake the CPU via IPI if -+ * necessary. The wakee CPU on receipt of the IPI will queue the task -+ * via sched_ttwu_wakeup() for activation so the wakee incurs the cost -+ * of the wakeup instead of the waker. -+ */ -+static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ WRITE_ONCE(rq->ttwu_pending, 1); -+ __smp_call_single_queue(cpu, &p->wake_entry.llist); -+} -+ -+void wake_up_if_idle(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ struct rq_flags rf; -+ -+ rcu_read_lock(); -+ -+ if (!is_idle_task(rcu_dereference(rq->curr))) -+ goto out; -+ -+ if (set_nr_if_polling(rq->idle)) { -+ trace_sched_wake_idle_without_ipi(cpu); -+ } else { -+ rq_lock_irqsave(rq, &rf); -+ if (likely(is_idle_task(rq->curr))) -+ smp_sched_reschedule(cpu); -+ /* Else cpu is not in idle, do nothing here */ -+ rq_unlock_irqrestore(rq, &rf); -+ } -+ -+out: -+ rcu_read_unlock(); -+} -+ -+static inline bool ttwu_queue_cond(int cpu, int wake_flags) -+{ -+ /* -+ * If the CPU does not share cache, then queue the task on the -+ * remote rqs wakelist to avoid accessing remote data. 
-+ */ -+ if (!cpus_share_cache(smp_processor_id(), cpu)) -+ return true; -+ -+ /* -+ * If the task is descheduling and the only running task on the -+ * CPU then use the wakelist to offload the task activation to -+ * the soon-to-be-idle CPU as the current CPU is likely busy. -+ * nr_running is checked to avoid unnecessary task stacking. -+ */ -+ if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1) -+ return true; -+ -+ return false; -+} -+ -+static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) -+{ -+ if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) { -+ if (WARN_ON_ONCE(cpu == smp_processor_id())) -+ return false; -+ -+ sched_clock_cpu(cpu); /* Sync clocks across CPUs */ -+ __ttwu_queue_wakelist(p, cpu, wake_flags); -+ return true; -+ } -+ -+ return false; -+} -+ -+static int valid_task_cpu(struct task_struct *p) -+{ -+ cpumask_t valid_mask; -+ -+ if (p->flags & PF_KTHREAD) -+ cpumask_and(&valid_mask, p->cpus_ptr, cpu_all_mask); -+ else -+ cpumask_and(&valid_mask, p->cpus_ptr, cpu_active_mask); -+ -+ if (unlikely(!cpumask_weight(&valid_mask))) { -+ /* We shouldn't be hitting this any more */ -+ printk(KERN_WARNING "SCHED: No cpumask for %s/%d weight %d\n", p->comm, -+ p->pid, cpumask_weight(p->cpus_ptr)); -+ return cpumask_any(p->cpus_ptr); -+ } -+ return cpumask_any(&valid_mask); -+} -+ -+/* -+ * For a task that's just being woken up we have a valuable balancing -+ * opportunity so choose the nearest cache most lightly loaded runqueue. -+ * Entered with rq locked and returns with the chosen runqueue locked. 
-+ */ -+static inline int select_best_cpu(struct task_struct *p) -+{ -+ unsigned int idlest = ~0U; -+ struct rq *rq = NULL; -+ int i; -+ -+ if (suitable_idle_cpus(p)) { -+ int cpu = task_cpu(p); -+ -+ if (unlikely(needs_other_cpu(p, cpu))) -+ cpu = valid_task_cpu(p); -+ rq = resched_best_idle(p, cpu); -+ if (likely(rq)) -+ return rq->cpu; -+ } -+ -+ for (i = 0; i < num_online_cpus(); i++) { -+ struct rq *other_rq = task_rq(p)->cpu_order[i]; -+ int entries; -+ -+ if (!other_rq->online) -+ continue; -+ if (needs_other_cpu(p, other_rq->cpu)) -+ continue; -+ entries = rq_load(other_rq); -+ if (entries >= idlest) -+ continue; -+ idlest = entries; -+ rq = other_rq; -+ } -+ if (unlikely(!rq)) -+ return task_cpu(p); -+ return rq->cpu; -+} -+#else /* CONFIG_SMP */ -+ -+static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) -+{ -+ return false; -+} -+ -+static int valid_task_cpu(struct task_struct *p) -+{ -+ return 0; -+} -+ -+static inline int select_best_cpu(struct task_struct *p) -+{ -+ return 0; -+} -+ -+static struct rq *resched_best_idle(struct task_struct *p, int cpu) -+{ -+ return NULL; -+} -+#endif /* CONFIG_SMP */ -+ -+static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ if (ttwu_queue_wakelist(p, cpu, wake_flags)) -+ return; -+ -+ rq_lock(rq); -+ update_rq_clock(rq); -+ ttwu_do_activate(rq, p, wake_flags); -+ rq_unlock(rq); -+} -+ -+/*** -+ * try_to_wake_up - wake up a thread -+ * @p: the thread to be awakened -+ * @state: the mask of task states that can be woken -+ * @wake_flags: wake modifier flags (WF_*) -+ * -+ * Put it on the run-queue if it's not already there. The "current" -+ * thread is always on the run-queue (except when the actual -+ * re-schedule is in progress), and as such you're allowed to do -+ * the simpler "current->state = TASK_RUNNING" to mark yourself -+ * runnable without the overhead of this. 
-+ * -+ * Return: %true if @p was woken up, %false if it was already running. -+ * or @state didn't match @p's state. -+ */ -+static int -+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) -+{ -+ unsigned long flags; -+ int cpu, success = 0; -+ -+ preempt_disable(); -+ if (p == current) { -+ /* -+ * We're waking current, this means 'p->on_rq' and 'task_cpu(p) -+ * == smp_processor_id()'. Together this means we can special -+ * case the whole 'p->on_rq && ttwu_runnable()' case below -+ * without taking any locks. -+ * -+ * In particular: -+ * - we rely on Program-Order guarantees for all the ordering, -+ * - we're serialized against set_special_state() by virtue of -+ * it disabling IRQs (this allows not taking ->pi_lock). -+ */ -+ if (!(p->state & state)) -+ goto out; -+ -+ success = 1; -+ trace_sched_waking(p); -+ p->state = TASK_RUNNING; -+ trace_sched_wakeup(p); -+ goto out; -+ } -+ -+ /* -+ * If we are going to wake up a thread waiting for CONDITION we -+ * need to ensure that CONDITION=1 done by the caller can not be -+ * reordered with p->state check below. This pairs with smp_store_mb() -+ * in set_current_state() that the waiting thread does. -+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ smp_mb__after_spinlock(); -+ if (!(p->state & state)) -+ goto unlock; -+ -+ trace_sched_waking(p); -+ -+ /* We're going to change ->state: */ -+ success = 1; -+ -+ /* -+ * Ensure we load p->on_rq _after_ p->state, otherwise it would -+ * be possible to, falsely, observe p->on_rq == 0 and get stuck -+ * in smp_cond_load_acquire() below. -+ * -+ * sched_ttwu_pending() try_to_wake_up() -+ * STORE p->on_rq = 1 LOAD p->state -+ * UNLOCK rq->lock -+ * -+ * __schedule() (switch to task 'p') -+ * LOCK rq->lock smp_rmb(); -+ * smp_mb__after_spinlock(); -+ * UNLOCK rq->lock -+ * -+ * [task p] -+ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq -+ * -+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in -+ * __schedule(). 
See the comment for smp_mb__after_spinlock(). -+ */ -+ smp_rmb(); -+ if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) -+ goto unlock; -+ -+ if (p->in_iowait) { -+ delayacct_blkio_end(p); -+ atomic_dec(&task_rq(p)->nr_iowait); -+ } -+ -+#ifdef CONFIG_SMP -+ /* -+ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be -+ * possible to, falsely, observe p->on_cpu == 0. -+ * -+ * One must be running (->on_cpu == 1) in order to remove oneself -+ * from the runqueue. -+ * -+ * __schedule() (switch to task 'p') try_to_wake_up() -+ * STORE p->on_cpu = 1 LOAD p->on_rq -+ * UNLOCK rq->lock -+ * -+ * __schedule() (put 'p' to sleep) -+ * LOCK rq->lock smp_rmb(); -+ * smp_mb__after_spinlock(); -+ * STORE p->on_rq = 0 LOAD p->on_cpu -+ * -+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in -+ * __schedule(). See the comment for smp_mb__after_spinlock(). -+ * -+ * Form a control-dep-acquire with p->on_rq == 0 above, to ensure -+ * schedule()'s deactivate_task() has 'happened' and p will no longer -+ * care about it's own p->state. See the comment in __schedule(). -+ */ -+ smp_acquire__after_ctrl_dep(); -+ -+ /* -+ * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq -+ * == 0), which means we need to do an enqueue, change p->state to -+ * TASK_WAKING such that we can unlock p->pi_lock before doing the -+ * enqueue, such as ttwu_queue_wakelist(). -+ */ -+ p->state = TASK_WAKING; -+ -+ /* -+ * If the owning (remote) CPU is still in the middle of schedule() with -+ * this task as prev, considering queueing p on the remote CPUs wake_list -+ * which potentially sends an IPI instead of spinning on p->on_cpu to -+ * let the waker make forward progress. This is safe because IRQs are -+ * disabled and the IPI will deliver after on_cpu is cleared. 
-+ * -+ * Ensure we load task_cpu(p) after p->on_cpu: -+ * -+ * set_task_cpu(p, cpu); -+ * STORE p->cpu = @cpu -+ * __schedule() (switch to task 'p') -+ * LOCK rq->lock -+ * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu) -+ * STORE p->on_cpu = 1 LOAD p->cpu -+ * -+ * to ensure we observe the correct CPU on which the task is currently -+ * scheduling. -+ */ -+ if (smp_load_acquire(&p->on_cpu) && -+ ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU)) -+ goto unlock; -+ -+ /* -+ * If the owning (remote) CPU is still in the middle of schedule() with -+ * this task as prev, wait until its done referencing the task. -+ * -+ * Pairs with the smp_store_release() in finish_task(). -+ * -+ * This ensures that tasks getting woken will be fully ordered against -+ * their previous state and preserve Program Order. -+ */ -+ smp_cond_load_acquire(&p->on_cpu, !VAL); -+ -+ cpu = select_best_cpu(p); -+ if (task_cpu(p) != cpu) { -+ wake_flags |= WF_MIGRATED; -+ psi_ttwu_dequeue(p); -+ set_task_cpu(p, cpu); -+ } -+ -+#else -+ cpu = task_cpu(p); -+#endif /* CONFIG_SMP */ -+ -+ ttwu_queue(p, cpu, wake_flags); -+unlock: -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+out: -+ if (success) -+ ttwu_stat(p, task_cpu(p), wake_flags); -+ preempt_enable(); -+ -+ return success; -+} -+ -+/** -+ * try_invoke_on_locked_down_task - Invoke a function on task in fixed state -+ * @p: Process for which the function is to be invoked. -+ * @func: Function to invoke. -+ * @arg: Argument to function. -+ * -+ * If the specified task can be quickly locked into a definite state -+ * (either sleeping or on a given runqueue), arrange to keep it in that -+ * state while invoking @func(@arg). This function can use ->on_rq and -+ * task_curr() to work out what the state is, if required. Given that -+ * @func can be invoked with a runqueue lock held, it had better be quite -+ * lightweight. -+ * -+ * Returns: -+ * @false if the task slipped out from under the locks. 
-+ * @true if the task was locked onto a runqueue or is sleeping. -+ * However, @func can override this by returning @false. -+ */ -+bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg) -+{ -+ bool ret = false; -+ struct rq *rq; -+ -+ lockdep_assert_irqs_enabled(); -+ raw_spin_lock_irq(&p->pi_lock); -+ if (p->on_rq) { -+ rq = __task_rq_lock(p, NULL); -+ if (task_rq(p) == rq) -+ ret = func(p, arg); -+ rq_unlock(rq); -+ } else { -+ switch (p->state) { -+ case TASK_RUNNING: -+ case TASK_WAKING: -+ break; -+ default: -+ smp_rmb(); // See smp_rmb() comment in try_to_wake_up(). -+ if (!p->on_rq) -+ ret = func(p, arg); -+ } -+ } -+ raw_spin_unlock_irq(&p->pi_lock); -+ return ret; -+} -+ -+/** -+ * wake_up_process - Wake up a specific process -+ * @p: The process to be woken up. -+ * -+ * Attempt to wake up the nominated process and move it to the set of runnable -+ * processes. -+ * -+ * Return: 1 if the process was woken up, 0 if it was already running. -+ * -+ * This function executes a full memory barrier before accessing the task state. -+ */ -+int wake_up_process(struct task_struct *p) -+{ -+ return try_to_wake_up(p, TASK_NORMAL, 0); -+} -+EXPORT_SYMBOL(wake_up_process); -+ -+int wake_up_state(struct task_struct *p, unsigned int state) -+{ -+ return try_to_wake_up(p, state, 0); -+} -+ -+static void time_slice_expired(struct task_struct *p, struct rq *rq); -+ -+/* -+ * Perform scheduler related setup for a newly forked process p. -+ * p is forked by current. -+ */ -+int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) -+{ -+ unsigned long flags; -+ -+#ifdef CONFIG_PREEMPT_NOTIFIERS -+ INIT_HLIST_HEAD(&p->preempt_notifiers); -+#endif -+ -+#ifdef CONFIG_COMPACTION -+ p->capture_control = NULL; -+#endif -+ -+#ifdef CONFIG_SMP -+ p->wake_entry.u_flags = CSD_TYPE_TTWU; -+#endif -+ /* -+ * We mark the process as NEW here. 
This guarantees that -+ * nobody will actually run it, and a signal or other external -+ * event cannot wake it up and insert it on the runqueue either. -+ */ -+ p->state = TASK_NEW; -+ -+ /* -+ * The process state is set to the same value of the process executing -+ * do_fork() code. That is running. This guarantees that nobody will -+ * actually run it, and a signal or other external event cannot wake -+ * it up and insert it on the runqueue either. -+ */ -+ -+ /* Should be reset in fork.c but done here for ease of MuQSS patching */ -+ p->on_cpu = -+ p->on_rq = -+ p->utime = -+ p->stime = -+ p->sched_time = -+ p->stime_ns = -+ p->utime_ns = 0; -+ skiplist_node_init(&p->node); -+ -+ /* -+ * Revert to default priority/policy on fork if requested. -+ */ -+ if (unlikely(p->sched_reset_on_fork)) { -+ if (p->policy == SCHED_FIFO || p->policy == SCHED_RR || p-> policy == SCHED_ISO) { -+ p->policy = SCHED_NORMAL; -+ p->normal_prio = normal_prio(p); -+ } -+ -+ if (PRIO_TO_NICE(p->static_prio) < 0) { -+ p->static_prio = NICE_TO_PRIO(0); -+ p->normal_prio = p->static_prio; -+ } -+ -+ /* -+ * We don't need the reset flag anymore after the fork. It has -+ * fulfilled its duty: -+ */ -+ p->sched_reset_on_fork = 0; -+ } -+ -+ /* -+ * Silence PROVE_RCU. 
-+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ rseq_migrate(p); -+ set_task_cpu(p, smp_processor_id()); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+#ifdef CONFIG_SCHED_INFO -+ if (unlikely(sched_info_on())) -+ memset(&p->sched_info, 0, sizeof(p->sched_info)); -+#endif -+ init_task_preempt_count(p); -+ -+ return 0; -+} -+ -+void sched_post_fork(struct task_struct *p) -+{ -+} -+ -+#ifdef CONFIG_SCHEDSTATS -+ -+DEFINE_STATIC_KEY_FALSE(sched_schedstats); -+static bool __initdata __sched_schedstats = false; -+ -+static void set_schedstats(bool enabled) -+{ -+ if (enabled) -+ static_branch_enable(&sched_schedstats); -+ else -+ static_branch_disable(&sched_schedstats); -+} -+ -+void force_schedstat_enabled(void) -+{ -+ if (!schedstat_enabled()) { -+ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); -+ static_branch_enable(&sched_schedstats); -+ } -+} -+ -+static int __init setup_schedstats(char *str) -+{ -+ int ret = 0; -+ if (!str) -+ goto out; -+ -+ /* -+ * This code is called before jump labels have been set up, so we can't -+ * change the static branch directly just yet. Instead set a temporary -+ * variable so init_schedstats() can do it later. 
-+ */ -+ if (!strcmp(str, "enable")) { -+ __sched_schedstats = true; -+ ret = 1; -+ } else if (!strcmp(str, "disable")) { -+ __sched_schedstats = false; -+ ret = 1; -+ } -+out: -+ if (!ret) -+ pr_warn("Unable to parse schedstats=\n"); -+ -+ return ret; -+} -+__setup("schedstats=", setup_schedstats); -+ -+static void __init init_schedstats(void) -+{ -+ set_schedstats(__sched_schedstats); -+} -+ -+#ifdef CONFIG_PROC_SYSCTL -+int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, -+ size_t *lenp, loff_t *ppos) -+{ -+ struct ctl_table t; -+ int err; -+ int state = static_branch_likely(&sched_schedstats); -+ -+ if (write && !capable(CAP_SYS_ADMIN)) -+ return -EPERM; -+ -+ t = *table; -+ t.data = &state; -+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); -+ if (err < 0) -+ return err; -+ if (write) -+ set_schedstats(state); -+ return err; -+} -+#endif /* CONFIG_PROC_SYSCTL */ -+#else /* !CONFIG_SCHEDSTATS */ -+static inline void init_schedstats(void) {} -+#endif /* CONFIG_SCHEDSTATS */ -+ -+static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p); -+ -+static void account_task_cpu(struct rq *rq, struct task_struct *p) -+{ -+ update_clocks(rq); -+ /* This isn't really a context switch but accounting is the same */ -+ update_cpu_clock_switch(rq, p); -+ p->last_ran = rq->niffies; -+} -+ -+bool sched_smp_initialized __read_mostly; -+ -+static inline int hrexpiry_enabled(struct rq *rq) -+{ -+ if (unlikely(!cpu_active(cpu_of(rq)) || !sched_smp_initialized)) -+ return 0; -+ return hrtimer_is_hres_active(&rq->hrexpiry_timer); -+} -+ -+/* -+ * Use HR-timers to deliver accurate preemption points. -+ */ -+static inline void hrexpiry_clear(struct rq *rq) -+{ -+ if (!hrexpiry_enabled(rq)) -+ return; -+ if (hrtimer_active(&rq->hrexpiry_timer)) -+ hrtimer_cancel(&rq->hrexpiry_timer); -+} -+ -+/* -+ * High-resolution time_slice expiry. -+ * Runs from hardirq context with interrupts disabled. 
-+ */ -+static enum hrtimer_restart hrexpiry(struct hrtimer *timer) -+{ -+ struct rq *rq = container_of(timer, struct rq, hrexpiry_timer); -+ struct task_struct *p; -+ -+ /* This can happen during CPU hotplug / resume */ -+ if (unlikely(cpu_of(rq) != smp_processor_id())) -+ goto out; -+ -+ /* -+ * We're doing this without the runqueue lock but this should always -+ * be run on the local CPU. Time slice should run out in __schedule -+ * but we set it to zero here in case niffies is slightly less. -+ */ -+ p = rq->curr; -+ p->time_slice = 0; -+ __set_tsk_resched(p); -+out: -+ return HRTIMER_NORESTART; -+} -+ -+/* -+ * Called to set the hrexpiry timer state. -+ * -+ * called with irqs disabled from the local CPU only -+ */ -+static void hrexpiry_start(struct rq *rq, u64 delay) -+{ -+ if (!hrexpiry_enabled(rq)) -+ return; -+ -+ hrtimer_start(&rq->hrexpiry_timer, ns_to_ktime(delay), -+ HRTIMER_MODE_REL_PINNED); -+} -+ -+static void init_rq_hrexpiry(struct rq *rq) -+{ -+ hrtimer_init(&rq->hrexpiry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); -+ rq->hrexpiry_timer.function = hrexpiry; -+} -+ -+static inline int rq_dither(struct rq *rq) -+{ -+ if (!hrexpiry_enabled(rq)) -+ return HALF_JIFFY_US; -+ return 0; -+} -+ -+/* -+ * wake_up_new_task - wake up a newly created task for the first time. -+ * -+ * This function will do some initial scheduler statistics housekeeping -+ * that must be done for every newly created context, then puts the task -+ * on the runqueue and wakes it. 
-+ */ -+void wake_up_new_task(struct task_struct *p) -+{ -+ struct task_struct *parent, *rq_curr; -+ struct rq *rq, *new_rq; -+ unsigned long flags; -+ -+ parent = p->parent; -+ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ p->state = TASK_RUNNING; -+ /* Task_rq can't change yet on a new task */ -+ new_rq = rq = task_rq(p); -+ if (unlikely(needs_other_cpu(p, task_cpu(p)))) { -+ set_task_cpu(p, valid_task_cpu(p)); -+ new_rq = task_rq(p); -+ } -+ -+ double_rq_lock(rq, new_rq); -+ rq_curr = rq->curr; -+ -+ /* -+ * Make sure we do not leak PI boosting priority to the child. -+ */ -+ p->prio = rq_curr->normal_prio; -+ -+ trace_sched_wakeup_new(p); -+ -+ /* -+ * Share the timeslice between parent and child, thus the -+ * total amount of pending timeslices in the system doesn't change, -+ * resulting in more scheduling fairness. If it's negative, it won't -+ * matter since that's the same as being 0. rq->rq_deadline is only -+ * modified within schedule() so it is always equal to -+ * current->deadline. -+ */ -+ account_task_cpu(rq, rq_curr); -+ p->last_ran = rq_curr->last_ran; -+ if (likely(rq_curr->policy != SCHED_FIFO)) { -+ rq_curr->time_slice /= 2; -+ if (rq_curr->time_slice < RESCHED_US) { -+ /* -+ * Forking task has run out of timeslice. Reschedule it and -+ * start its child with a new time slice and deadline. The -+ * child will end up running first because its deadline will -+ * be slightly earlier. -+ */ -+ __set_tsk_resched(rq_curr); -+ time_slice_expired(p, new_rq); -+ if (suitable_idle_cpus(p)) -+ resched_best_idle(p, task_cpu(p)); -+ else if (unlikely(rq != new_rq)) -+ try_preempt(p, new_rq); -+ } else { -+ p->time_slice = rq_curr->time_slice; -+ if (rq_curr == parent && rq == new_rq && !suitable_idle_cpus(p)) { -+ /* -+ * The VM isn't cloned, so we're in a good position to -+ * do child-runs-first in anticipation of an exec. This -+ * usually avoids a lot of COW overhead. 
-+ */ -+ __set_tsk_resched(rq_curr); -+ } else { -+ /* -+ * Adjust the hrexpiry since rq_curr will keep -+ * running and its timeslice has been shortened. -+ */ -+ hrexpiry_start(rq, US_TO_NS(rq_curr->time_slice)); -+ try_preempt(p, new_rq); -+ } -+ } -+ } else { -+ time_slice_expired(p, new_rq); -+ try_preempt(p, new_rq); -+ } -+ activate_task(new_rq, p, 0); -+ double_rq_unlock(rq, new_rq); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+} -+ -+#ifdef CONFIG_PREEMPT_NOTIFIERS -+ -+static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); -+ -+void preempt_notifier_inc(void) -+{ -+ static_branch_inc(&preempt_notifier_key); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_inc); -+ -+void preempt_notifier_dec(void) -+{ -+ static_branch_dec(&preempt_notifier_key); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_dec); -+ -+/** -+ * preempt_notifier_register - tell me when current is being preempted & rescheduled -+ * @notifier: notifier struct to register -+ */ -+void preempt_notifier_register(struct preempt_notifier *notifier) -+{ -+ if (!static_branch_unlikely(&preempt_notifier_key)) -+ WARN(1, "registering preempt_notifier while notifiers disabled\n"); -+ -+ hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_register); -+ -+/** -+ * preempt_notifier_unregister - no longer interested in preemption notifications -+ * @notifier: notifier struct to unregister -+ * -+ * This is *not* safe to call from within a preemption notifier. 
-+ */ -+void preempt_notifier_unregister(struct preempt_notifier *notifier) -+{ -+ hlist_del(¬ifier->link); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_unregister); -+ -+static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+ struct preempt_notifier *notifier; -+ -+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) -+ notifier->ops->sched_in(notifier, raw_smp_processor_id()); -+} -+ -+static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+ if (static_branch_unlikely(&preempt_notifier_key)) -+ __fire_sched_in_preempt_notifiers(curr); -+} -+ -+static void -+__fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+ struct preempt_notifier *notifier; -+ -+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) -+ notifier->ops->sched_out(notifier, next); -+} -+ -+static __always_inline void -+fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+ if (static_branch_unlikely(&preempt_notifier_key)) -+ __fire_sched_out_preempt_notifiers(curr, next); -+} -+ -+#else /* !CONFIG_PREEMPT_NOTIFIERS */ -+ -+static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+} -+ -+static inline void -+fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+} -+ -+#endif /* CONFIG_PREEMPT_NOTIFIERS */ -+ -+static inline void prepare_task(struct task_struct *next) -+{ -+ /* -+ * Claim the task as running, we do this before switching to it -+ * such that any running task will have this set. -+ */ -+ next->on_cpu = 1; -+} -+ -+static inline void finish_task(struct task_struct *prev) -+{ -+#ifdef CONFIG_SMP -+ /* -+ * This must be the very last reference to @prev from this CPU. After -+ * p->on_cpu is cleared, the task can be moved to a different CPU. We -+ * must ensure this doesn't happen until the switch is completely -+ * finished. 
-+ * -+ * In particular, the load of prev->state in finish_task_switch() must -+ * happen before this. -+ * -+ * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). -+ */ -+ smp_store_release(&prev->on_cpu, 0); -+#endif -+} -+ -+static inline void -+prepare_lock_switch(struct rq *rq, struct task_struct *next) -+{ -+ /* -+ * Since the runqueue lock will be released by the next -+ * task (which is an invalid locking op but in the case -+ * of the scheduler it's an obvious special-case), so we -+ * do an early lockdep release here: -+ */ -+ spin_release(&rq->lock->dep_map, _THIS_IP_); -+#ifdef CONFIG_DEBUG_SPINLOCK -+ /* this is a valid case when another task releases the spinlock */ -+ rq->lock->owner = next; -+#endif -+} -+ -+static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) -+{ -+ /* -+ * If we are tracking spinlock dependencies then we have to -+ * fix up the runqueue lock - which gets 'carried over' from -+ * prev into current: -+ */ -+ spin_acquire(&rq->lock->dep_map, 0, 0, _THIS_IP_); -+ -+#ifdef CONFIG_SMP -+ /* -+ * If prev was marked as migrating to another CPU in return_task, drop -+ * the local runqueue lock but leave interrupts disabled and grab the -+ * remote lock we're migrating it to before enabling them. -+ */ -+ if (unlikely(task_on_rq_migrating(prev))) { -+ sched_info_dequeued(rq, prev); -+ /* -+ * We move the ownership of prev to the new cpu now. ttwu can't -+ * activate prev to the wrong cpu since it has to grab this -+ * runqueue in ttwu_remote. 
-+ */ -+#ifdef CONFIG_THREAD_INFO_IN_TASK -+ prev->cpu = prev->wake_cpu; -+#else -+ task_thread_info(prev)->cpu = prev->wake_cpu; -+#endif -+ raw_spin_unlock(rq->lock); -+ -+ raw_spin_lock(&prev->pi_lock); -+ rq = __task_rq_lock(prev, NULL); -+ /* Check that someone else hasn't already queued prev */ -+ if (likely(!task_queued(prev))) { -+ enqueue_task(rq, prev, 0); -+ prev->on_rq = TASK_ON_RQ_QUEUED; -+ /* Wake up the CPU if it's not already running */ -+ resched_if_idle(rq); -+ } -+ raw_spin_unlock(&prev->pi_lock); -+ } -+#endif -+ rq_unlock(rq); -+ local_irq_enable(); -+} -+ -+#ifndef prepare_arch_switch -+# define prepare_arch_switch(next) do { } while (0) -+#endif -+#ifndef finish_arch_switch -+# define finish_arch_switch(prev) do { } while (0) -+#endif -+#ifndef finish_arch_post_lock_switch -+# define finish_arch_post_lock_switch() do { } while (0) -+#endif -+ -+/** -+ * prepare_task_switch - prepare to switch tasks -+ * @rq: the runqueue preparing to switch -+ * @next: the task we are going to switch to. -+ * -+ * This is called with the rq lock held and interrupts off. It must -+ * be paired with a subsequent finish_task_switch after the context -+ * switch. -+ * -+ * prepare_task_switch sets up locking and calls architecture specific -+ * hooks. -+ */ -+static inline void -+prepare_task_switch(struct rq *rq, struct task_struct *prev, -+ struct task_struct *next) -+{ -+ kcov_prepare_switch(prev); -+ sched_info_switch(rq, prev, next); -+ perf_event_task_sched_out(prev, next); -+ rseq_preempt(prev); -+ fire_sched_out_preempt_notifiers(prev, next); -+ prepare_task(next); -+ prepare_arch_switch(next); -+} -+ -+/** -+ * finish_task_switch - clean up after a task-switch -+ * @rq: runqueue associated with task-switch -+ * @prev: the thread we just switched away from. -+ * -+ * finish_task_switch must be called after the context switch, paired -+ * with a prepare_task_switch call before the context switch. 
-+ * finish_task_switch will reconcile locking set up by prepare_task_switch, -+ * and do any other architecture-specific cleanup actions. -+ * -+ * Note that we may have delayed dropping an mm in context_switch(). If -+ * so, we finish that here outside of the runqueue lock. (Doing it -+ * with the lock held can cause deadlocks; see schedule() for -+ * details.) -+ * -+ * The context switch have flipped the stack from under us and restored the -+ * local variables which were saved when this task called schedule() in the -+ * past. prev == current is still correct but we need to recalculate this_rq -+ * because prev may have moved to another CPU. -+ */ -+static void finish_task_switch(struct task_struct *prev) -+ __releases(rq->lock) -+{ -+ struct rq *rq = this_rq(); -+ struct mm_struct *mm = rq->prev_mm; -+ long prev_state; -+ -+ /* -+ * The previous task will have left us with a preempt_count of 2 -+ * because it left us after: -+ * -+ * schedule() -+ * preempt_disable(); // 1 -+ * __schedule() -+ * raw_spin_lock_irq(rq->lock) // 2 -+ * -+ * Also, see FORK_PREEMPT_COUNT. -+ */ -+ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, -+ "corrupted preempt_count: %s/%d/0x%x\n", -+ current->comm, current->pid, preempt_count())) -+ preempt_count_set(FORK_PREEMPT_COUNT); -+ -+ rq->prev_mm = NULL; -+ -+ /* -+ * A task struct has one reference for the use as "current". -+ * If a task dies, then it sets TASK_DEAD in tsk->state and calls -+ * schedule one last time. The schedule call will never return, and -+ * the scheduled task must drop that reference. -+ * -+ * We must observe prev->state before clearing prev->on_cpu (in -+ * finish_task), otherwise a concurrent wakeup can get prev -+ * running on another CPU and we could rave with its RUNNING -> DEAD -+ * transition, resulting in a double drop. 
-+ */ -+ prev_state = prev->state; -+ vtime_task_switch(prev); -+ perf_event_task_sched_in(prev, current); -+ finish_task(prev); -+ finish_lock_switch(rq, prev); -+ finish_arch_post_lock_switch(); -+ kcov_finish_switch(current); -+ -+ fire_sched_in_preempt_notifiers(current); -+ /* -+ * When switching through a kernel thread, the loop in -+ * membarrier_{private,global}_expedited() may have observed that -+ * kernel thread and not issued an IPI. It is therefore possible to -+ * schedule between user->kernel->user threads without passing though -+ * switch_mm(). Membarrier requires a barrier after storing to -+ * rq->curr, before returning to userspace, so provide them here: -+ * -+ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly -+ * provided by mmdrop(), -+ * - a sync_core for SYNC_CORE. -+ */ -+ if (mm) { -+ membarrier_mm_sync_core_before_usermode(mm); -+ mmdrop(mm); -+ } -+ if (unlikely(prev_state == TASK_DEAD)) { -+ /* -+ * Remove function-return probe instances associated with this -+ * task and put them back on the free list. -+ */ -+ kprobe_flush_task(prev); -+ -+ /* Task is done with its stack. */ -+ put_task_stack(prev); -+ -+ put_task_struct_rcu_user(prev); -+ } -+} -+ -+/** -+ * schedule_tail - first thing a freshly forked thread must call. -+ * @prev: the thread we just switched away from. -+ */ -+asmlinkage __visible void schedule_tail(struct task_struct *prev) -+{ -+ /* -+ * New tasks start with FORK_PREEMPT_COUNT, see there and -+ * finish_task_switch() for details. -+ * -+ * finish_task_switch() will drop rq->lock() and lower preempt_count -+ * and the preempt_enable() will end up enabling preemption (on -+ * PREEMPT_COUNT kernels). -+ */ -+ -+ finish_task_switch(prev); -+ preempt_enable(); -+ -+ if (current->set_child_tid) -+ put_user(task_pid_vnr(current), current->set_child_tid); -+ -+ calculate_sigpending(); -+} -+ -+/* -+ * context_switch - switch to the new MM and the new thread's register state. 
-+ */ -+static __always_inline void -+context_switch(struct rq *rq, struct task_struct *prev, -+ struct task_struct *next) -+{ -+ prepare_task_switch(rq, prev, next); -+ -+ /* -+ * For paravirt, this is coupled with an exit in switch_to to -+ * combine the page table reload and the switch backend into -+ * one hypercall. -+ */ -+ arch_start_context_switch(prev); -+ -+ /* -+ * kernel -> kernel lazy + transfer active -+ * user -> kernel lazy + mmgrab() active -+ * -+ * kernel -> user switch + mmdrop() active -+ * user -> user switch -+ */ -+ if (!next->mm) { // to kernel -+ enter_lazy_tlb(prev->active_mm, next); -+ -+ next->active_mm = prev->active_mm; -+ if (prev->mm) // from user -+ mmgrab(prev->active_mm); -+ else -+ prev->active_mm = NULL; -+ } else { // to user -+ membarrier_switch_mm(rq, prev->active_mm, next->mm); -+ /* -+ * sys_membarrier() requires an smp_mb() between setting -+ * rq->curr / membarrier_switch_mm() and returning to userspace. -+ * -+ * The below provides this either through switch_mm(), or in -+ * case 'prev->active_mm == next->mm' through -+ * finish_task_switch()'s mmdrop(). -+ */ -+ switch_mm_irqs_off(prev->active_mm, next->mm, next); -+ -+ if (!prev->mm) { // from kernel -+ /* will mmdrop() in finish_task_switch(). */ -+ rq->prev_mm = prev->active_mm; -+ prev->active_mm = NULL; -+ } -+ } -+ prepare_lock_switch(rq, next); -+ -+ /* Here we just switch the register state and the stack. */ -+ switch_to(prev, next, prev); -+ barrier(); -+ -+ finish_task_switch(prev); -+} -+ -+/* -+ * nr_running, nr_uninterruptible and nr_context_switches: -+ * -+ * externally visible scheduler statistics: current number of runnable -+ * threads, total number of context switches performed since bootup. 
-+ */ -+unsigned long nr_running(void) -+{ -+ unsigned long i, sum = 0; -+ -+ for_each_online_cpu(i) -+ sum += cpu_rq(i)->nr_running; -+ -+ return sum; -+} -+ -+static unsigned long nr_uninterruptible(void) -+{ -+ unsigned long i, sum = 0; -+ -+ for_each_online_cpu(i) -+ sum += cpu_rq(i)->nr_uninterruptible; -+ -+ return sum; -+} -+ -+/* -+ * Check if only the current task is running on the CPU. -+ * -+ * Caution: this function does not check that the caller has disabled -+ * preemption, thus the result might have a time-of-check-to-time-of-use -+ * race. The caller is responsible to use it correctly, for example: -+ * -+ * - from a non-preemptible section (of course) -+ * -+ * - from a thread that is bound to a single CPU -+ * -+ * - in a loop with very short iterations (e.g. a polling loop) -+ */ -+bool single_task_running(void) -+{ -+ if (rq_load(raw_rq()) == 1) -+ return true; -+ else -+ return false; -+} -+EXPORT_SYMBOL(single_task_running); -+ -+unsigned long long nr_context_switches(void) -+{ -+ int cpu; -+ unsigned long long sum = 0; -+ -+ for_each_possible_cpu(cpu) -+ sum += cpu_rq(cpu)->nr_switches; -+ -+ return sum; -+} -+ -+/* -+ * Consumers of these two interfaces, like for example the cpufreq menu -+ * governor are using nonsensical data. Boosting frequency for a CPU that has -+ * IO-wait which might not even end up running the task when it does become -+ * runnable. -+ */ -+ -+unsigned long nr_iowait_cpu(int cpu) -+{ -+ return atomic_read(&cpu_rq(cpu)->nr_iowait); -+} -+ -+/* -+ * IO-wait accounting, and how its mostly bollocks (on SMP). -+ * -+ * The idea behind IO-wait account is to account the idle time that we could -+ * have spend running if it were not for IO. That is, if we were to improve the -+ * storage performance, we'd have a proportional reduction in IO-wait time. 
-+ * -+ * This all works nicely on UP, where, when a task blocks on IO, we account -+ * idle time as IO-wait, because if the storage were faster, it could've been -+ * running and we'd not be idle. -+ * -+ * This has been extended to SMP, by doing the same for each CPU. This however -+ * is broken. -+ * -+ * Imagine for instance the case where two tasks block on one CPU, only the one -+ * CPU will have IO-wait accounted, while the other has regular idle. Even -+ * though, if the storage were faster, both could've ran at the same time, -+ * utilising both CPUs. -+ * -+ * This means, that when looking globally, the current IO-wait accounting on -+ * SMP is a lower bound, by reason of under accounting. -+ * -+ * Worse, since the numbers are provided per CPU, they are sometimes -+ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly -+ * associated with any one particular CPU, it can wake to another CPU than it -+ * blocked on. This means the per CPU IO-wait number is meaningless. -+ * -+ * Task CPU affinities can make all that even more 'interesting'. -+ */ -+ -+unsigned long nr_iowait(void) -+{ -+ unsigned long cpu, sum = 0; -+ -+ for_each_possible_cpu(cpu) -+ sum += nr_iowait_cpu(cpu); -+ -+ return sum; -+} -+ -+unsigned long nr_active(void) -+{ -+ return nr_running() + nr_uninterruptible(); -+} -+ -+/* Variables and functions for calc_load */ -+static unsigned long calc_load_update; -+unsigned long avenrun[3]; -+EXPORT_SYMBOL(avenrun); -+ -+/** -+ * get_avenrun - get the load average array -+ * @loads: pointer to dest load array -+ * @offset: offset to add -+ * @shift: shift count to shift the result left -+ * -+ * These values are estimates at best, so no need for locking. 
-+ */ -+void get_avenrun(unsigned long *loads, unsigned long offset, int shift) -+{ -+ loads[0] = (avenrun[0] + offset) << shift; -+ loads[1] = (avenrun[1] + offset) << shift; -+ loads[2] = (avenrun[2] + offset) << shift; -+} -+ -+/* -+ * calc_load - update the avenrun load estimates every LOAD_FREQ seconds. -+ */ -+void calc_global_load(void) -+{ -+ long active; -+ -+ if (time_before(jiffies, READ_ONCE(calc_load_update))) -+ return; -+ active = nr_active() * FIXED_1; -+ -+ avenrun[0] = calc_load(avenrun[0], EXP_1, active); -+ avenrun[1] = calc_load(avenrun[1], EXP_5, active); -+ avenrun[2] = calc_load(avenrun[2], EXP_15, active); -+ -+ calc_load_update = jiffies + LOAD_FREQ; -+} -+ -+/** -+ * fixed_power_int - compute: x^n, in O(log n) time -+ * -+ * @x: base of the power -+ * @frac_bits: fractional bits of @x -+ * @n: power to raise @x to. -+ * -+ * By exploiting the relation between the definition of the natural power -+ * function: x^n := x*x*...*x (x multiplied by itself for n times), and -+ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, -+ * (where: n_i \elem {0, 1}, the binary vector representing n), -+ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is -+ * of course trivially computable in O(log_2 n), the length of our binary -+ * vector. 
-+ */ -+static unsigned long -+fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) -+{ -+ unsigned long result = 1UL << frac_bits; -+ -+ if (n) { -+ for (;;) { -+ if (n & 1) { -+ result *= x; -+ result += 1UL << (frac_bits - 1); -+ result >>= frac_bits; -+ } -+ n >>= 1; -+ if (!n) -+ break; -+ x *= x; -+ x += 1UL << (frac_bits - 1); -+ x >>= frac_bits; -+ } -+ } -+ -+ return result; -+} -+ -+/* -+ * a1 = a0 * e + a * (1 - e) -+ * -+ * a2 = a1 * e + a * (1 - e) -+ * = (a0 * e + a * (1 - e)) * e + a * (1 - e) -+ * = a0 * e^2 + a * (1 - e) * (1 + e) -+ * -+ * a3 = a2 * e + a * (1 - e) -+ * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) -+ * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) -+ * -+ * ... -+ * -+ * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] -+ * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) -+ * = a0 * e^n + a * (1 - e^n) -+ * -+ * [1] application of the geometric series: -+ * -+ * n 1 - x^(n+1) -+ * S_n := \Sum x^i = ------------- -+ * i=0 1 - x -+ */ -+unsigned long -+calc_load_n(unsigned long load, unsigned long exp, -+ unsigned long active, unsigned int n) -+{ -+ return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); -+} -+ -+DEFINE_PER_CPU(struct kernel_stat, kstat); -+DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); -+ -+EXPORT_PER_CPU_SYMBOL(kstat); -+EXPORT_PER_CPU_SYMBOL(kernel_cpustat); -+ -+#ifdef CONFIG_PARAVIRT -+static inline u64 steal_ticks(u64 steal) -+{ -+ if (unlikely(steal > NSEC_PER_SEC)) -+ return div_u64(steal, TICK_NSEC); -+ -+ return __iter_div_u64_rem(steal, TICK_NSEC, &steal); -+} -+#endif -+ -+#ifndef nsecs_to_cputime -+# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) -+#endif -+ -+/* -+ * On each tick, add the number of nanoseconds to the unbanked variables and -+ * once one tick's worth has accumulated, account it allowing for accurate -+ * sub-tick accounting and totals. Use the TICK_APPROX_NS to match the way we -+ * deduct nanoseconds. 
-+ */ -+static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long ns) -+{ -+ u64 *cpustat = kcpustat_this_cpu->cpustat; -+ unsigned long ticks; -+ -+ if (atomic_read(&rq->nr_iowait) > 0) { -+ rq->iowait_ns += ns; -+ if (rq->iowait_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(rq->iowait_ns); -+ cpustat[CPUTIME_IOWAIT] += (__force u64)TICK_APPROX_NS * ticks; -+ rq->iowait_ns %= JIFFY_NS; -+ } -+ } else { -+ rq->idle_ns += ns; -+ if (rq->idle_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(rq->idle_ns); -+ cpustat[CPUTIME_IDLE] += (__force u64)TICK_APPROX_NS * ticks; -+ rq->idle_ns %= JIFFY_NS; -+ } -+ } -+ acct_update_integrals(idle); -+} -+ -+static void pc_system_time(struct rq *rq, struct task_struct *p, -+ int hardirq_offset, unsigned long ns) -+{ -+ u64 *cpustat = kcpustat_this_cpu->cpustat; -+ unsigned long ticks; -+ -+ p->stime_ns += ns; -+ if (p->stime_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(p->stime_ns); -+ p->stime_ns %= JIFFY_NS; -+ p->stime += (__force u64)TICK_APPROX_NS * ticks; -+ account_group_system_time(p, TICK_APPROX_NS * ticks); -+ } -+ p->sched_time += ns; -+ account_group_exec_runtime(p, ns); -+ -+ if (hardirq_count() - hardirq_offset) { -+ rq->irq_ns += ns; -+ if (rq->irq_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(rq->irq_ns); -+ cpustat[CPUTIME_IRQ] += (__force u64)TICK_APPROX_NS * ticks; -+ rq->irq_ns %= JIFFY_NS; -+ } -+ } else if (in_serving_softirq()) { -+ rq->softirq_ns += ns; -+ if (rq->softirq_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(rq->softirq_ns); -+ cpustat[CPUTIME_SOFTIRQ] += (__force u64)TICK_APPROX_NS * ticks; -+ rq->softirq_ns %= JIFFY_NS; -+ } -+ } else { -+ rq->system_ns += ns; -+ if (rq->system_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(rq->system_ns); -+ cpustat[CPUTIME_SYSTEM] += (__force u64)TICK_APPROX_NS * ticks; -+ rq->system_ns %= JIFFY_NS; -+ } -+ } -+ acct_update_integrals(p); -+} -+ -+static void pc_user_time(struct rq *rq, struct task_struct *p, unsigned long ns) -+{ -+ u64 *cpustat = 
kcpustat_this_cpu->cpustat; -+ unsigned long ticks; -+ -+ p->utime_ns += ns; -+ if (p->utime_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(p->utime_ns); -+ p->utime_ns %= JIFFY_NS; -+ p->utime += (__force u64)TICK_APPROX_NS * ticks; -+ account_group_user_time(p, TICK_APPROX_NS * ticks); -+ } -+ p->sched_time += ns; -+ account_group_exec_runtime(p, ns); -+ -+ if (this_cpu_ksoftirqd() == p) { -+ /* -+ * ksoftirqd time do not get accounted in cpu_softirq_time. -+ * So, we have to handle it separately here. -+ */ -+ rq->softirq_ns += ns; -+ if (rq->softirq_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(rq->softirq_ns); -+ cpustat[CPUTIME_SOFTIRQ] += (__force u64)TICK_APPROX_NS * ticks; -+ rq->softirq_ns %= JIFFY_NS; -+ } -+ } -+ -+ if (task_nice(p) > 0 || idleprio_task(p)) { -+ rq->nice_ns += ns; -+ if (rq->nice_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(rq->nice_ns); -+ cpustat[CPUTIME_NICE] += (__force u64)TICK_APPROX_NS * ticks; -+ rq->nice_ns %= JIFFY_NS; -+ } -+ } else { -+ rq->user_ns += ns; -+ if (rq->user_ns >= JIFFY_NS) { -+ ticks = NS_TO_JIFFIES(rq->user_ns); -+ cpustat[CPUTIME_USER] += (__force u64)TICK_APPROX_NS * ticks; -+ rq->user_ns %= JIFFY_NS; -+ } -+ } -+ acct_update_integrals(p); -+} -+ -+/* -+ * This is called on clock ticks. -+ * Bank in p->sched_time the ns elapsed since the last tick or switch. -+ * CPU scheduler quota accounting is also performed here in microseconds. 
-+ */ -+static void update_cpu_clock_tick(struct rq *rq, struct task_struct *p) -+{ -+ s64 account_ns = rq->niffies - p->last_ran; -+ struct task_struct *idle = rq->idle; -+ -+ /* Accurate tick timekeeping */ -+ if (user_mode(get_irq_regs())) -+ pc_user_time(rq, p, account_ns); -+ else if (p != idle || (irq_count() != HARDIRQ_OFFSET)) { -+ pc_system_time(rq, p, HARDIRQ_OFFSET, account_ns); -+ } else -+ pc_idle_time(rq, idle, account_ns); -+ -+ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ -+ if (p->policy != SCHED_FIFO && p != idle) -+ p->time_slice -= NS_TO_US(account_ns); -+ -+ p->last_ran = rq->niffies; -+} -+ -+/* -+ * This is called on context switches. -+ * Bank in p->sched_time the ns elapsed since the last tick or switch. -+ * CPU scheduler quota accounting is also performed here in microseconds. -+ */ -+static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p) -+{ -+ s64 account_ns = rq->niffies - p->last_ran; -+ struct task_struct *idle = rq->idle; -+ -+ /* Accurate subtick timekeeping */ -+ if (p != idle) -+ pc_user_time(rq, p, account_ns); -+ else -+ pc_idle_time(rq, idle, account_ns); -+ -+ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ -+ if (p->policy != SCHED_FIFO && p != idle) -+ p->time_slice -= NS_TO_US(account_ns); -+} -+ -+/* -+ * Return any ns on the sched_clock that have not yet been accounted in -+ * @p in case that task is currently running. -+ * -+ * Called with task_rq_lock(p) held. -+ */ -+static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) -+{ -+ u64 ns = 0; -+ -+ /* -+ * Must be ->curr _and_ ->on_rq. If dequeued, we would -+ * project cycles that may never be accounted to this -+ * thread, breaking clock_gettime(). -+ */ -+ if (p == rq->curr && task_on_rq_queued(p)) { -+ update_clocks(rq); -+ ns = rq->niffies - p->last_ran; -+ } -+ -+ return ns; -+} -+ -+/* -+ * Return accounted runtime for the task. 
-+ * Return separately the current's pending runtime that have not been -+ * accounted yet. -+ */ -+unsigned long long task_sched_runtime(struct task_struct *p) -+{ -+ struct rq_flags rf; -+ struct rq *rq; -+ u64 ns; -+ -+#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) -+ /* -+ * 64-bit doesn't need locks to atomically read a 64-bit value. -+ * So we have a optimisation chance when the task's delta_exec is 0. -+ * Reading ->on_cpu is racy, but this is ok. -+ * -+ * If we race with it leaving CPU, we'll take a lock. So we're correct. -+ * If we race with it entering CPU, unaccounted time is 0. This is -+ * indistinguishable from the read occurring a few cycles earlier. -+ * If we see ->on_cpu without ->on_rq, the task is leaving, and has -+ * been accounted, so we're correct here as well. -+ */ -+ if (!p->on_cpu || !task_on_rq_queued(p)) -+ return tsk_seruntime(p); -+#endif -+ -+ rq = task_rq_lock(p, &rf); -+ ns = p->sched_time + do_task_delta_exec(p, rq); -+ task_rq_unlock(rq, p, &rf); -+ -+ return ns; -+} -+ -+/* -+ * Functions to test for when SCHED_ISO tasks have used their allocated -+ * quota as real time scheduling and convert them back to SCHED_NORMAL. All -+ * data is modified only by the local runqueue during scheduler_tick with -+ * interrupts disabled. -+ */ -+ -+/* -+ * Test if SCHED_ISO tasks have run longer than their alloted period as RT -+ * tasks and set the refractory flag if necessary. There is 10% hysteresis -+ * for unsetting the flag. 115/128 is ~90/100 as a fast shift instead of a -+ * slow division. 
-+ */ -+static inline void iso_tick(struct rq *rq) -+{ -+ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD; -+ rq->iso_ticks += 100; -+ if (rq->iso_ticks > ISO_PERIOD * sched_iso_cpu) { -+ rq->iso_refractory = true; -+ if (unlikely(rq->iso_ticks > ISO_PERIOD * 100)) -+ rq->iso_ticks = ISO_PERIOD * 100; -+ } -+} -+ -+/* No SCHED_ISO task was running so decrease rq->iso_ticks */ -+static inline void no_iso_tick(struct rq *rq, int ticks) -+{ -+ if (rq->iso_ticks > 0 || rq->iso_refractory) { -+ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - ticks) / ISO_PERIOD; -+ if (rq->iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128)) { -+ rq->iso_refractory = false; -+ if (unlikely(rq->iso_ticks < 0)) -+ rq->iso_ticks = 0; -+ } -+ } -+} -+ -+/* This manages tasks that have run out of timeslice during a scheduler_tick */ -+static void task_running_tick(struct rq *rq) -+{ -+ struct task_struct *p = rq->curr; -+ -+ /* -+ * If a SCHED_ISO task is running we increment the iso_ticks. In -+ * order to prevent SCHED_ISO tasks from causing starvation in the -+ * presence of true RT tasks we account those as iso_ticks as well. -+ */ -+ if (rt_task(p) || task_running_iso(p)) -+ iso_tick(rq); -+ else -+ no_iso_tick(rq, 1); -+ -+ /* SCHED_FIFO tasks never run out of timeslice. */ -+ if (p->policy == SCHED_FIFO) -+ return; -+ -+ if (iso_task(p)) { -+ if (task_running_iso(p)) { -+ if (rq->iso_refractory) { -+ /* -+ * SCHED_ISO task is running as RT and limit -+ * has been hit. Force it to reschedule as -+ * SCHED_NORMAL by zeroing its time_slice -+ */ -+ p->time_slice = 0; -+ } -+ } else if (!rq->iso_refractory) { -+ /* Can now run again ISO. Reschedule to pick up prio */ -+ goto out_resched; -+ } -+ } -+ -+ /* -+ * Tasks that were scheduled in the first half of a tick are not -+ * allowed to run into the 2nd half of the next tick if they will -+ * run out of time slice in the interim. 
Otherwise, if they have -+ * less than RESCHED_US μs of time slice left they will be rescheduled. -+ * Dither is used as a backup for when hrexpiry is disabled or high res -+ * timers not configured in. -+ */ -+ if (p->time_slice - rq->dither >= RESCHED_US) -+ return; -+out_resched: -+ rq_lock(rq); -+ __set_tsk_resched(p); -+ rq_unlock(rq); -+} -+ -+static inline void task_tick(struct rq *rq) -+{ -+ if (!rq_idle(rq)) -+ task_running_tick(rq); -+ else if (rq->last_jiffy > rq->last_scheduler_tick) -+ no_iso_tick(rq, rq->last_jiffy - rq->last_scheduler_tick); -+} -+ -+#ifdef CONFIG_NO_HZ_FULL -+/* -+ * We can stop the timer tick any time highres timers are active since -+ * we rely entirely on highres timeouts for task expiry rescheduling. -+ */ -+static void sched_stop_tick(struct rq *rq, int cpu) -+{ -+ if (!hrexpiry_enabled(rq)) -+ return; -+ if (!tick_nohz_full_enabled()) -+ return; -+ if (!tick_nohz_full_cpu(cpu)) -+ return; -+ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); -+} -+ -+static inline void sched_start_tick(struct rq *rq, int cpu) -+{ -+ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); -+} -+ -+struct tick_work { -+ int cpu; -+ atomic_t state; -+ struct delayed_work work; -+}; -+/* Values for ->state, see diagram below. */ -+#define TICK_SCHED_REMOTE_OFFLINE 0 -+#define TICK_SCHED_REMOTE_OFFLINING 1 -+#define TICK_SCHED_REMOTE_RUNNING 2 -+ -+/* -+ * State diagram for ->state: -+ * -+ * -+ * TICK_SCHED_REMOTE_OFFLINE -+ * | ^ -+ * | | -+ * | | sched_tick_remote() -+ * | | -+ * | | -+ * +--TICK_SCHED_REMOTE_OFFLINING -+ * | ^ -+ * | | -+ * sched_tick_start() | | sched_tick_stop() -+ * | | -+ * V | -+ * TICK_SCHED_REMOTE_RUNNING -+ * -+ * -+ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() -+ * and sched_tick_start() are happy to leave the state in RUNNING. 
-+ */ -+ -+static struct tick_work __percpu *tick_work_cpu; -+ -+static void sched_tick_remote(struct work_struct *work) -+{ -+ struct delayed_work *dwork = to_delayed_work(work); -+ struct tick_work *twork = container_of(dwork, struct tick_work, work); -+ int cpu = twork->cpu; -+ struct rq *rq = cpu_rq(cpu); -+ struct task_struct *curr; -+ u64 delta; -+ int os; -+ -+ /* -+ * Handle the tick only if it appears the remote CPU is running in full -+ * dynticks mode. The check is racy by nature, but missing a tick or -+ * having one too much is no big deal because the scheduler tick updates -+ * statistics and checks timeslices in a time-independent way, regardless -+ * of when exactly it is running. -+ */ -+ if (!tick_nohz_tick_stopped_cpu(cpu)) -+ goto out_requeue; -+ -+ rq_lock_irq(rq); -+ if (cpu_is_offline(cpu)) -+ goto out_unlock; -+ -+ curr = rq->curr; -+ update_rq_clock(rq); -+ -+ if (!is_idle_task(curr)) { -+ /* -+ * Make sure the next tick runs within a reasonable -+ * amount of time. -+ */ -+ delta = rq_clock_task(rq) - curr->last_ran; -+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); -+ } -+ task_tick(rq); -+ -+out_unlock: -+ rq_unlock_irq(rq, NULL); -+ -+out_requeue: -+ -+ /* -+ * Run the remote tick once per second (1Hz). This arbitrary -+ * frequency is large enough to avoid overload but short enough -+ * to keep scheduler internal stats reasonably up to date. But -+ * first update state to reflect hotplug activity if required. 
-+ */ -+ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); -+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); -+ if (os == TICK_SCHED_REMOTE_RUNNING) -+ queue_delayed_work(system_unbound_wq, dwork, HZ); -+} -+ -+static void sched_tick_start(int cpu) -+{ -+ struct tick_work *twork; -+ int os; -+ -+ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) -+ return; -+ -+ WARN_ON_ONCE(!tick_work_cpu); -+ -+ twork = per_cpu_ptr(tick_work_cpu, cpu); -+ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); -+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); -+ if (os == TICK_SCHED_REMOTE_OFFLINE) { -+ twork->cpu = cpu; -+ INIT_DELAYED_WORK(&twork->work, sched_tick_remote); -+ queue_delayed_work(system_unbound_wq, &twork->work, HZ); -+ } -+} -+ -+#ifdef CONFIG_HOTPLUG_CPU -+static void sched_tick_stop(int cpu) -+{ -+ struct tick_work *twork; -+ int os; -+ -+ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) -+ return; -+ -+ WARN_ON_ONCE(!tick_work_cpu); -+ -+ twork = per_cpu_ptr(tick_work_cpu, cpu); -+ /* There cannot be competing actions, but don't rely on stop-machine. */ -+ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING); -+ WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING); -+ /* Don't cancel, as this would mess up the state machine. */ -+} -+#endif /* CONFIG_HOTPLUG_CPU */ -+ -+int __init sched_tick_offload_init(void) -+{ -+ tick_work_cpu = alloc_percpu(struct tick_work); -+ BUG_ON(!tick_work_cpu); -+ return 0; -+} -+ -+#else /* !CONFIG_NO_HZ_FULL */ -+static inline void sched_stop_tick(struct rq *rq, int cpu) {} -+static inline void sched_start_tick(struct rq *rq, int cpu) {} -+static inline void sched_tick_start(int cpu) { } -+static inline void sched_tick_stop(int cpu) { } -+#endif -+ -+/* -+ * This function gets called by the timer code, with HZ frequency. -+ * We call it with interrupts disabled. 
-+ */ -+void scheduler_tick(void) -+{ -+ int cpu __maybe_unused = smp_processor_id(); -+ struct rq *rq = cpu_rq(cpu); -+ -+ arch_scale_freq_tick(); -+ sched_clock_tick(); -+ update_clocks(rq); -+ update_load_avg(rq, 0); -+ update_cpu_clock_tick(rq, rq->curr); -+ task_tick(rq); -+ rq->last_scheduler_tick = rq->last_jiffy; -+ rq->last_tick = rq->clock; -+ psi_task_tick(rq); -+ perf_event_task_tick(); -+ sched_stop_tick(rq, cpu); -+} -+ -+#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ -+ defined(CONFIG_TRACE_PREEMPT_TOGGLE)) -+/* -+ * If the value passed in is equal to the current preempt count -+ * then we just disabled preemption. Start timing the latency. -+ */ -+static inline void preempt_latency_start(int val) -+{ -+ if (preempt_count() == val) { -+ unsigned long ip = get_lock_parent_ip(); -+#ifdef CONFIG_DEBUG_PREEMPT -+ current->preempt_disable_ip = ip; -+#endif -+ trace_preempt_off(CALLER_ADDR0, ip); -+ } -+} -+ -+void preempt_count_add(int val) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Underflow? -+ */ -+ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) -+ return; -+#endif -+ __preempt_count_add(val); -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Spinlock count overflowing soon? -+ */ -+ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= -+ PREEMPT_MASK - 10); -+#endif -+ preempt_latency_start(val); -+} -+EXPORT_SYMBOL(preempt_count_add); -+NOKPROBE_SYMBOL(preempt_count_add); -+ -+/* -+ * If the value passed in equals to the current preempt count -+ * then we just enabled preemption. Stop timing the latency. -+ */ -+static inline void preempt_latency_stop(int val) -+{ -+ if (preempt_count() == val) -+ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); -+} -+ -+void preempt_count_sub(int val) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Underflow? -+ */ -+ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) -+ return; -+ /* -+ * Is the spinlock portion underflowing? 
-+ */ -+ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && -+ !(preempt_count() & PREEMPT_MASK))) -+ return; -+#endif -+ -+ preempt_latency_stop(val); -+ __preempt_count_sub(val); -+} -+EXPORT_SYMBOL(preempt_count_sub); -+NOKPROBE_SYMBOL(preempt_count_sub); -+ -+#else -+static inline void preempt_latency_start(int val) { } -+static inline void preempt_latency_stop(int val) { } -+#endif -+ -+static inline unsigned long get_preempt_disable_ip(struct task_struct *p) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ return p->preempt_disable_ip; -+#else -+ return 0; -+#endif -+} -+ -+/* -+ * The time_slice is only refilled when it is empty and that is when we set a -+ * new deadline. Make sure update_clocks has been called recently to update -+ * rq->niffies. -+ */ -+static void time_slice_expired(struct task_struct *p, struct rq *rq) -+{ -+ p->time_slice = timeslice(); -+ p->deadline = rq->niffies + task_deadline_diff(p); -+#ifdef CONFIG_SMT_NICE -+ if (!p->mm) -+ p->smt_bias = 0; -+ else if (rt_task(p)) -+ p->smt_bias = 1 << 30; -+ else if (task_running_iso(p)) -+ p->smt_bias = 1 << 29; -+ else if (idleprio_task(p)) { -+ if (task_running_idle(p)) -+ p->smt_bias = 0; -+ else -+ p->smt_bias = 1; -+ } else if (--p->smt_bias < 1) -+ p->smt_bias = MAX_PRIO - p->static_prio; -+#endif -+} -+ -+/* -+ * Timeslices below RESCHED_US are considered as good as expired as there's no -+ * point rescheduling when there's so little time left. SCHED_BATCH tasks -+ * have been flagged be not latency sensitive and likely to be fully CPU -+ * bound so every time they're rescheduled they have their time_slice -+ * refilled, but get a new later deadline to have little effect on -+ * SCHED_NORMAL tasks. 
-+ -+ */ -+static inline void check_deadline(struct task_struct *p, struct rq *rq) -+{ -+ if (p->time_slice < RESCHED_US || batch_task(p)) -+ time_slice_expired(p, rq); -+} -+ -+/* -+ * Task selection with skiplists is a simple matter of picking off the first -+ * task in the sorted list, an O(1) operation. The lookup is amortised O(1) -+ * being bound to the number of processors. -+ * -+ * Runqueues are selectively locked based on their unlocked data and then -+ * unlocked if not needed. At most 3 locks will be held at any time and are -+ * released as soon as they're no longer needed. All balancing between CPUs -+ * is thus done here in an extremely simple first come best fit manner. -+ * -+ * This iterates over runqueues in cache locality order. In interactive mode -+ * it iterates over all CPUs and finds the task with the best key/deadline. -+ * In non-interactive mode it will only take a task if it's from the current -+ * runqueue or a runqueue with more tasks than the current one with a better -+ * key/deadline. -+ */ -+#ifdef CONFIG_SMP -+static inline struct task_struct -+*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) -+{ -+ struct rq *locked = NULL, *chosen = NULL; -+ struct task_struct *edt = idle; -+ int i, best_entries = 0; -+ u64 best_key = ~0ULL; -+ -+ for (i = 0; i < total_runqueues; i++) { -+ struct rq *other_rq = rq_order(rq, i); -+ skiplist_node *next; -+ int entries; -+ -+ entries = other_rq->sl->entries; -+ /* -+ * Check for queued entres lockless first. The local runqueue -+ * is locked so entries will always be accurate. -+ */ -+ if (!sched_interactive) { -+ /* -+ * Don't reschedule balance across nodes unless the CPU -+ * is idle. 
-+ */ -+ if (edt != idle && rq->cpu_locality[other_rq->cpu] > LOCALITY_SMP) -+ break; -+ if (entries <= best_entries) -+ continue; -+ } else if (!entries) -+ continue; -+ -+ /* if (i) implies other_rq != rq */ -+ if (i) { -+ /* Check for best id queued lockless first */ -+ if (other_rq->best_key >= best_key) -+ continue; -+ -+ if (unlikely(!trylock_rq(rq, other_rq))) -+ continue; -+ -+ /* Need to reevaluate entries after locking */ -+ entries = other_rq->sl->entries; -+ if (unlikely(!entries)) { -+ unlock_rq(other_rq); -+ continue; -+ } -+ } -+ -+ next = other_rq->node; -+ /* -+ * In interactive mode we check beyond the best entry on other -+ * runqueues if we can't get the best for smt or affinity -+ * reasons. -+ */ -+ while ((next = next->next[0]) != other_rq->node) { -+ struct task_struct *p; -+ u64 key = next->key; -+ -+ /* Reevaluate key after locking */ -+ if (key >= best_key) -+ break; -+ -+ p = next->value; -+ if (!smt_schedule(p, rq)) { -+ if (i && !sched_interactive) -+ break; -+ continue; -+ } -+ -+ if (sched_other_cpu(p, cpu)) { -+ if (sched_interactive || !i) -+ continue; -+ break; -+ } -+ /* Make sure affinity is ok */ -+ if (i) { -+ /* From this point on p is the best so far */ -+ if (locked) -+ unlock_rq(locked); -+ chosen = locked = other_rq; -+ } -+ best_entries = entries; -+ best_key = key; -+ edt = p; -+ break; -+ } -+ /* rq->preempting is a hint only as the state may have changed -+ * since it was set with the resched call but if we have met -+ * the condition we can break out here. 
*/ -+ if (edt == rq->preempting) -+ break; -+ if (i && other_rq != chosen) -+ unlock_rq(other_rq); -+ } -+ -+ if (likely(edt != idle)) -+ take_task(rq, cpu, edt); -+ -+ if (locked) -+ unlock_rq(locked); -+ -+ rq->preempting = NULL; -+ -+ return edt; -+} -+#else /* CONFIG_SMP */ -+static inline struct task_struct -+*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) -+{ -+ struct task_struct *edt; -+ -+ if (unlikely(!rq->sl->entries)) -+ return idle; -+ edt = rq->node->next[0]->value; -+ take_task(rq, cpu, edt); -+ return edt; -+} -+#endif /* CONFIG_SMP */ -+ -+/* -+ * Print scheduling while atomic bug: -+ */ -+static noinline void __schedule_bug(struct task_struct *prev) -+{ -+ /* Save this before calling printk(), since that will clobber it */ -+ unsigned long preempt_disable_ip = get_preempt_disable_ip(current); -+ -+ if (oops_in_progress) -+ return; -+ -+ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", -+ prev->comm, prev->pid, preempt_count()); -+ -+ debug_show_held_locks(prev); -+ print_modules(); -+ if (irqs_disabled()) -+ print_irqtrace_events(prev); -+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) -+ && in_atomic_preempt_off()) { -+ pr_err("Preemption disabled at:"); -+ print_ip_sym(KERN_ERR, preempt_disable_ip); -+ } -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+ -+/* -+ * Various schedule()-time debugging checks and statistics: -+ */ -+static inline void schedule_debug(struct task_struct *prev, bool preempt) -+{ -+#ifdef CONFIG_SCHED_STACK_END_CHECK -+ if (task_stack_end_corrupted(prev)) -+ panic("corrupted stack end detected inside scheduler\n"); -+ -+ if (task_scs_end_corrupted(prev)) -+ panic("corrupted shadow stack detected inside scheduler\n"); -+#endif -+ -+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP -+ if (!preempt && prev->state && prev->non_block_count) { -+ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", -+ prev->comm, prev->pid, prev->non_block_count); -+ dump_stack(); -+ 
add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+ } -+#endif -+ -+ if (unlikely(in_atomic_preempt_off())) { -+ __schedule_bug(prev); -+ preempt_count_set(PREEMPT_DISABLED); -+ } -+ rcu_sleep_check(); -+ -+ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); -+ -+ schedstat_inc(this_rq()->sched_count); -+} -+ -+/* -+ * The currently running task's information is all stored in rq local data -+ * which is only modified by the local CPU. -+ */ -+static inline void set_rq_task(struct rq *rq, struct task_struct *p) -+{ -+ if (p == rq->idle || p->policy == SCHED_FIFO) -+ hrexpiry_clear(rq); -+ else -+ hrexpiry_start(rq, US_TO_NS(p->time_slice)); -+ if (rq->clock - rq->last_tick > HALF_JIFFY_NS) -+ rq->dither = 0; -+ else -+ rq->dither = rq_dither(rq); -+ -+ rq->rq_deadline = p->deadline; -+ rq->rq_prio = p->prio; -+#ifdef CONFIG_SMT_NICE -+ rq->rq_mm = p->mm; -+ rq->rq_smt_bias = p->smt_bias; -+#endif -+} -+ -+#ifdef CONFIG_SMT_NICE -+static void check_no_siblings(struct rq __maybe_unused *this_rq) {} -+static void wake_no_siblings(struct rq __maybe_unused *this_rq) {} -+static void (*check_siblings)(struct rq *this_rq) = &check_no_siblings; -+static void (*wake_siblings)(struct rq *this_rq) = &wake_no_siblings; -+ -+/* Iterate over smt siblings when we've scheduled a process on cpu and decide -+ * whether they should continue running or be descheduled. 
*/ -+static void check_smt_siblings(struct rq *this_rq) -+{ -+ int other_cpu; -+ -+ for_each_cpu(other_cpu, &this_rq->thread_mask) { -+ struct task_struct *p; -+ struct rq *rq; -+ -+ rq = cpu_rq(other_cpu); -+ if (rq_idle(rq)) -+ continue; -+ p = rq->curr; -+ if (!smt_schedule(p, this_rq)) -+ resched_curr(rq); -+ } -+} -+ -+static void wake_smt_siblings(struct rq *this_rq) -+{ -+ int other_cpu; -+ -+ for_each_cpu(other_cpu, &this_rq->thread_mask) { -+ struct rq *rq; -+ -+ rq = cpu_rq(other_cpu); -+ if (rq_idle(rq)) -+ resched_idle(rq); -+ } -+} -+#else -+static void check_siblings(struct rq __maybe_unused *this_rq) {} -+static void wake_siblings(struct rq __maybe_unused *this_rq) {} -+#endif -+ -+/* -+ * schedule() is the main scheduler function. -+ * -+ * The main means of driving the scheduler and thus entering this function are: -+ * -+ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. -+ * -+ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return -+ * paths. For example, see arch/x86/entry_64.S. -+ * -+ * To drive preemption between tasks, the scheduler sets the flag in timer -+ * interrupt handler scheduler_tick(). -+ * -+ * 3. Wakeups don't really cause entry into schedule(). They add a -+ * task to the run-queue and that's it. -+ * -+ * Now, if the new task added to the run-queue preempts the current -+ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets -+ * called on the nearest possible occasion: -+ * -+ * - If the kernel is preemptible (CONFIG_PREEMPTION=y): -+ * -+ * - in syscall or exception context, at the next outmost -+ * preempt_enable(). (this might be as soon as the wake_up()'s -+ * spin_unlock()!) 
-+ * -+ * - in IRQ context, return from interrupt-handler to -+ * preemptible context -+ * -+ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set) -+ * then at the next: -+ * -+ * - cond_resched() call -+ * - explicit schedule() call -+ * - return from syscall or exception to user-space -+ * - return from interrupt-handler to user-space -+ * -+ * WARNING: must be called with preemption disabled! -+ */ -+static void __sched notrace __schedule(bool preempt) -+{ -+ struct task_struct *prev, *next, *idle; -+ unsigned long *switch_count; -+ unsigned long prev_state; -+ bool deactivate = false; -+ struct rq *rq; -+ u64 niffies; -+ int cpu; -+ -+ cpu = smp_processor_id(); -+ rq = cpu_rq(cpu); -+ prev = rq->curr; -+ idle = rq->idle; -+ -+ schedule_debug(prev, preempt); -+ -+ local_irq_disable(); -+ rcu_note_context_switch(preempt); -+ -+ /* -+ * Make sure that signal_pending_state()->signal_pending() below -+ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) -+ * done by the caller to avoid the race with signal_wake_up(): -+ * -+ * __set_current_state(@state) signal_wake_up() -+ * schedule() set_tsk_thread_flag(p, TIF_SIGPENDING) -+ * wake_up_state(p, state) -+ * LOCK rq->lock LOCK p->pi_state -+ * smp_mb__after_spinlock() smp_mb__after_spinlock() -+ * if (signal_pending_state()) if (p->state & @state) -+ * -+ * Also, the membarrier system call requires a full memory barrier -+ * after coming from user-space, before storing to rq->curr. -+ */ -+ rq_lock(rq); -+ smp_mb__after_spinlock(); -+#ifdef CONFIG_SMP -+ if (rq->preempt) { -+ /* -+ * Make sure resched_curr hasn't triggered a preemption -+ * locklessly on a task that has since scheduled away. Spurious -+ * wakeup of idle is okay though. 
-+ */ -+ if (unlikely(preempt && prev != idle && !test_tsk_need_resched(prev))) { -+ rq->preempt = NULL; -+ clear_preempt_need_resched(); -+ rq_unlock_irq(rq, NULL); -+ return; -+ } -+ rq->preempt = NULL; -+ } -+#endif -+ -+ switch_count = &prev->nivcsw; -+ -+ /* -+ * We must load prev->state once (task_struct::state is volatile), such -+ * that: -+ * -+ * - we form a control dependency vs deactivate_task() below. -+ * - ptrace_{,un}freeze_traced() can change ->state underneath us. -+ */ -+ prev_state = prev->state; -+ if (!preempt && prev_state) { -+ if (signal_pending_state(prev_state, prev)) { -+ prev->state = TASK_RUNNING; -+ } else { -+ prev->sched_contributes_to_load = -+ (prev_state & TASK_UNINTERRUPTIBLE) && -+ !(prev_state & TASK_NOLOAD) && -+ !(prev->flags & PF_FROZEN); -+ -+ if (prev->sched_contributes_to_load) -+ rq->nr_uninterruptible++; -+ -+ /* -+ * __schedule() ttwu() -+ * prev_state = prev->state; if (p->on_rq && ...) -+ * if (prev_state) goto out; -+ * p->on_rq = 0; smp_acquire__after_ctrl_dep(); -+ * p->state = TASK_WAKING -+ * -+ * Where __schedule() and ttwu() have matching control dependencies. -+ * -+ * After this, schedule() must not care about p->state any more. -+ */ -+ deactivate = true; -+ -+ if (prev->in_iowait) { -+ atomic_inc(&rq->nr_iowait); -+ delayacct_blkio_start(); -+ } -+ } -+ switch_count = &prev->nvcsw; -+ } -+ -+ /* -+ * Store the niffy value here for use by the next task's last_ran -+ * below to avoid losing niffies due to update_clocks being called -+ * again after this point. 
-+ */ -+ update_clocks(rq); -+ niffies = rq->niffies; -+ update_cpu_clock_switch(rq, prev); -+ -+ clear_tsk_need_resched(prev); -+ clear_preempt_need_resched(); -+ -+ if (idle != prev) { -+ check_deadline(prev, rq); -+ return_task(prev, rq, cpu, deactivate); -+ } -+ -+ next = earliest_deadline_task(rq, cpu, idle); -+ if (likely(next->prio != PRIO_LIMIT)) -+ clear_cpuidle_map(cpu); -+ else { -+ set_cpuidle_map(cpu); -+ update_load_avg(rq, 0); -+ } -+ -+ set_rq_task(rq, next); -+ next->last_ran = niffies; -+ -+ if (likely(prev != next)) { -+ /* -+ * Don't reschedule an idle task or deactivated tasks -+ */ -+ if (prev == idle) { -+ inc_nr_running(rq); -+ if (rt_task(next)) -+ rq->rt_nr_running++; -+ } else if (!deactivate) -+ resched_suitable_idle(prev); -+ if (unlikely(next == idle)) { -+ dec_nr_running(rq); -+ if (rt_task(prev)) -+ rq->rt_nr_running--; -+ wake_siblings(rq); -+ } else -+ check_siblings(rq); -+ rq->nr_switches++; -+ /* -+ * RCU users of rcu_dereference(rq->curr) may not see -+ * changes to task_struct made by pick_next_task(). -+ */ -+ RCU_INIT_POINTER(rq->curr, next); -+ /* -+ * The membarrier system call requires each architecture -+ * to have a full memory barrier after updating -+ * rq->curr, before returning to user-space. -+ * -+ * Here are the schemes providing that barrier on the -+ * various architectures: -+ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. -+ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. 
-+ * - finish_lock_switch() for weakly-ordered -+ * architectures where spin_unlock is a full barrier, -+ * - switch_to() for arm64 (weakly-ordered, spin_unlock -+ * is a RELEASE barrier), -+ */ -+ ++*switch_count; -+ -+ psi_sched_switch(prev, next, !task_on_rq_queued(prev)); -+ -+ trace_sched_switch(preempt, prev, next); -+ context_switch(rq, prev, next); /* unlocks the rq */ -+ } else { -+ check_siblings(rq); -+ rq_unlock(rq); -+ local_irq_enable(); -+ } -+} -+ -+void __noreturn do_task_dead(void) -+{ -+ /* Causes final put_task_struct in finish_task_switch(). */ -+ set_special_state(TASK_DEAD); -+ -+ /* Tell freezer to ignore us: */ -+ current->flags |= PF_NOFREEZE; -+ __schedule(false); -+ BUG(); -+ -+ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ -+ for (;;) -+ cpu_relax(); -+} -+ -+static inline void sched_submit_work(struct task_struct *tsk) -+{ -+ if (!tsk->state) -+ return; -+ -+ /* -+ * If a worker went to sleep, notify and ask workqueue whether -+ * it wants to wake up a task to maintain concurrency. -+ * As this function is called inside the schedule() context, -+ * we disable preemption to avoid it calling schedule() again -+ * in the possible wakeup of a kworker and because wq_worker_sleeping() -+ * requires it. -+ */ -+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { -+ preempt_disable(); -+ if (tsk->flags & PF_WQ_WORKER) -+ wq_worker_sleeping(tsk); -+ else -+ io_wq_worker_sleeping(tsk); -+ preempt_enable_no_resched(); -+ } -+ -+ if (tsk_is_pi_blocked(tsk)) -+ return; -+ -+ /* -+ * If we are going to sleep and we have plugged IO queued, -+ * make sure to submit it to avoid deadlocks. 
-+ */ -+ if (blk_needs_flush_plug(tsk)) -+ blk_schedule_flush_plug(tsk); -+} -+ -+static inline void sched_update_worker(struct task_struct *tsk) -+{ -+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { -+ if (tsk->flags & PF_WQ_WORKER) -+ wq_worker_running(tsk); -+ else -+ io_wq_worker_running(tsk); -+ } -+} -+ -+asmlinkage __visible void __sched schedule(void) -+{ -+ struct task_struct *tsk = current; -+ -+ sched_submit_work(tsk); -+ do { -+ preempt_disable(); -+ __schedule(false); -+ sched_preempt_enable_no_resched(); -+ } while (need_resched()); -+ sched_update_worker(tsk); -+} -+ -+EXPORT_SYMBOL(schedule); -+ -+/* -+ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted -+ * state (have scheduled out non-voluntarily) by making sure that all -+ * tasks have either left the run queue or have gone into user space. -+ * As idle tasks do not do either, they must not ever be preempted -+ * (schedule out non-voluntarily). -+ * -+ * schedule_idle() is similar to schedule_preempt_disable() except that it -+ * never enables preemption because it does not call sched_submit_work(). -+ */ -+void __sched schedule_idle(void) -+{ -+ /* -+ * As this skips calling sched_submit_work(), which the idle task does -+ * regardless because that function is a nop when the task is in a -+ * TASK_RUNNING state, make sure this isn't used someplace that the -+ * current task can be in any other state. Note, idle is always in the -+ * TASK_RUNNING state. -+ */ -+ WARN_ON_ONCE(current->state); -+ do { -+ __schedule(false); -+ } while (need_resched()); -+} -+ -+#ifdef CONFIG_CONTEXT_TRACKING -+asmlinkage __visible void __sched schedule_user(void) -+{ -+ /* -+ * If we come here after a random call to set_need_resched(), -+ * or we have been woken up remotely but the IPI has not yet arrived, -+ * we haven't yet exited the RCU idle mode. Do it here manually until -+ * we find a better solution. -+ * -+ * NB: There are buggy callers of this function. 
Ideally we -+ * should warn if prev_state != IN_USER, but that will trigger -+ * too frequently to make sense yet. -+ */ -+ enum ctx_state prev_state = exception_enter(); -+ schedule(); -+ exception_exit(prev_state); -+} -+#endif -+ -+/** -+ * schedule_preempt_disabled - called with preemption disabled -+ * -+ * Returns with preemption disabled. Note: preempt_count must be 1 -+ */ -+void __sched schedule_preempt_disabled(void) -+{ -+ sched_preempt_enable_no_resched(); -+ schedule(); -+ preempt_disable(); -+} -+ -+static void __sched notrace preempt_schedule_common(void) -+{ -+ do { -+ /* -+ * Because the function tracer can trace preempt_count_sub() -+ * and it also uses preempt_enable/disable_notrace(), if -+ * NEED_RESCHED is set, the preempt_enable_notrace() called -+ * by the function tracer will call this function again and -+ * cause infinite recursion. -+ * -+ * Preemption must be disabled here before the function -+ * tracer can trace. Break up preempt_disable() into two -+ * calls. One to disable preemption without fear of being -+ * traced. The other to still record the preemption latency, -+ * which can also be traced by the function tracer. -+ */ -+ preempt_disable_notrace(); -+ preempt_latency_start(1); -+ __schedule(true); -+ preempt_latency_stop(1); -+ preempt_enable_no_resched_notrace(); -+ -+ /* -+ * Check again in case we missed a preemption opportunity -+ * between schedule and now. -+ */ -+ } while (need_resched()); -+} -+ -+#ifdef CONFIG_PREEMPTION -+/* -+ * This is the entry point to schedule() from in-kernel preemption -+ * off of preempt_enable. -+ */ -+asmlinkage __visible void __sched notrace preempt_schedule(void) -+{ -+ /* -+ * If there is a non-zero preempt_count or interrupts are disabled, -+ * we do not want to preempt the current task. Just return.. 
-+ */ -+ if (likely(!preemptible())) -+ return; -+ -+ preempt_schedule_common(); -+} -+NOKPROBE_SYMBOL(preempt_schedule); -+EXPORT_SYMBOL(preempt_schedule); -+ -+/** -+ * preempt_schedule_notrace - preempt_schedule called by tracing -+ * -+ * The tracing infrastructure uses preempt_enable_notrace to prevent -+ * recursion and tracing preempt enabling caused by the tracing -+ * infrastructure itself. But as tracing can happen in areas coming -+ * from userspace or just about to enter userspace, a preempt enable -+ * can occur before user_exit() is called. This will cause the scheduler -+ * to be called when the system is still in usermode. -+ * -+ * To prevent this, the preempt_enable_notrace will use this function -+ * instead of preempt_schedule() to exit user context if needed before -+ * calling the scheduler. -+ */ -+asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) -+{ -+ enum ctx_state prev_ctx; -+ -+ if (likely(!preemptible())) -+ return; -+ -+ do { -+ /* -+ * Because the function tracer can trace preempt_count_sub() -+ * and it also uses preempt_enable/disable_notrace(), if -+ * NEED_RESCHED is set, the preempt_enable_notrace() called -+ * by the function tracer will call this function again and -+ * cause infinite recursion. -+ * -+ * Preemption must be disabled here before the function -+ * tracer can trace. Break up preempt_disable() into two -+ * calls. One to disable preemption without fear of being -+ * traced. The other to still record the preemption latency, -+ * which can also be traced by the function tracer. -+ */ -+ preempt_disable_notrace(); -+ preempt_latency_start(1); -+ /* -+ * Needs preempt disabled in case user_exit() is traced -+ * and the tracer calls preempt_enable_notrace() causing -+ * an infinite recursion. 
-+ */ -+ prev_ctx = exception_enter(); -+ __schedule(true); -+ exception_exit(prev_ctx); -+ -+ preempt_latency_stop(1); -+ preempt_enable_no_resched_notrace(); -+ } while (need_resched()); -+} -+EXPORT_SYMBOL_GPL(preempt_schedule_notrace); -+ -+#endif /* CONFIG_PREEMPTION */ -+ -+/* -+ * This is the entry point to schedule() from kernel preemption -+ * off of irq context. -+ * Note, that this is called and return with irqs disabled. This will -+ * protect us against recursive calling from irq. -+ */ -+asmlinkage __visible void __sched preempt_schedule_irq(void) -+{ -+ enum ctx_state prev_state; -+ -+ /* Catch callers which need to be fixed */ -+ BUG_ON(preempt_count() || !irqs_disabled()); -+ -+ prev_state = exception_enter(); -+ -+ do { -+ preempt_disable(); -+ local_irq_enable(); -+ __schedule(true); -+ local_irq_disable(); -+ sched_preempt_enable_no_resched(); -+ } while (need_resched()); -+ -+ exception_exit(prev_state); -+} -+ -+int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, -+ void *key) -+{ -+ WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC); -+ return try_to_wake_up(curr->private, mode, wake_flags); -+} -+EXPORT_SYMBOL(default_wake_function); -+ -+#ifdef CONFIG_RT_MUTEXES -+ -+static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) -+{ -+ if (pi_task) -+ prio = min(prio, pi_task->prio); -+ -+ return prio; -+} -+ -+static inline int rt_effective_prio(struct task_struct *p, int prio) -+{ -+ struct task_struct *pi_task = rt_mutex_get_top_task(p); -+ -+ return __rt_effective_prio(pi_task, prio); -+} -+ -+/* -+ * rt_mutex_setprio - set the current priority of a task -+ * @p: task to boost -+ * @pi_task: donor task -+ * -+ * This function changes the 'effective' priority of a task. It does -+ * not touch ->normal_prio like __setscheduler(). -+ * -+ * Used by the rt_mutex code to implement priority inheritance -+ * logic. Call site only calls if the priority of the task changed. 
-+ */ -+void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) -+{ -+ int prio, oldprio; -+ struct rq *rq; -+ -+ /* XXX used to be waiter->prio, not waiter->task->prio */ -+ prio = __rt_effective_prio(pi_task, p->normal_prio); -+ -+ /* -+ * If nothing changed; bail early. -+ */ -+ if (p->pi_top_task == pi_task && prio == p->prio) -+ return; -+ -+ rq = __task_rq_lock(p, NULL); -+ update_rq_clock(rq); -+ /* -+ * Set under pi_lock && rq->lock, such that the value can be used under -+ * either lock. -+ * -+ * Note that there is loads of tricky to make this pointer cache work -+ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to -+ * ensure a task is de-boosted (pi_task is set to NULL) before the -+ * task is allowed to run again (and can exit). This ensures the pointer -+ * points to a blocked task -- which guaratees the task is present. -+ */ -+ p->pi_top_task = pi_task; -+ -+ /* -+ * For FIFO/RR we only need to set prio, if that matches we're done. -+ */ -+ if (prio == p->prio) -+ goto out_unlock; -+ -+ /* -+ * Idle task boosting is a nono in general. There is one -+ * exception, when PREEMPT_RT and NOHZ is active: -+ * -+ * The idle task calls get_next_timer_interrupt() and holds -+ * the timer wheel base->lock on the CPU and another CPU wants -+ * to access the timer (probably to cancel it). We can safely -+ * ignore the boosting request, as the idle CPU runs this code -+ * with interrupts disabled and will complete the lock -+ * protected section without being interrupted. So there is no -+ * real need to boost. 
-+ */ -+ if (unlikely(p == rq->idle)) { -+ WARN_ON(p != rq->curr); -+ WARN_ON(p->pi_blocked_on); -+ goto out_unlock; -+ } -+ -+ trace_sched_pi_setprio(p, pi_task); -+ oldprio = p->prio; -+ p->prio = prio; -+ if (task_running(rq, p)){ -+ if (prio > oldprio) -+ resched_task(p); -+ } else if (task_queued(p)) { -+ dequeue_task(rq, p, DEQUEUE_SAVE); -+ enqueue_task(rq, p, ENQUEUE_RESTORE); -+ if (prio < oldprio) -+ try_preempt(p, rq); -+ } -+out_unlock: -+ __task_rq_unlock(rq, NULL); -+} -+#else -+static inline int rt_effective_prio(struct task_struct *p, int prio) -+{ -+ return prio; -+} -+#endif -+ -+/* -+ * Adjust the deadline for when the priority is to change, before it's -+ * changed. -+ */ -+static inline void adjust_deadline(struct task_struct *p, int new_prio) -+{ -+ p->deadline += static_deadline_diff(new_prio) - task_deadline_diff(p); -+} -+ -+void set_user_nice(struct task_struct *p, long nice) -+{ -+ int new_static, old_static; -+ struct rq_flags rf; -+ struct rq *rq; -+ -+ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) -+ return; -+ new_static = NICE_TO_PRIO(nice); -+ /* -+ * We have to be careful, if called from sys_setpriority(), -+ * the task might be in the middle of scheduling on another CPU. 
-+ */ -+ rq = task_rq_lock(p, &rf); -+ update_rq_clock(rq); -+ -+ /* -+ * The RT priorities are set via sched_setscheduler(), but we still -+ * allow the 'normal' nice value to be set - but as expected -+ * it wont have any effect on scheduling until the task is -+ * not SCHED_NORMAL/SCHED_BATCH: -+ */ -+ if (has_rt_policy(p)) { -+ p->static_prio = new_static; -+ goto out_unlock; -+ } -+ -+ adjust_deadline(p, new_static); -+ old_static = p->static_prio; -+ p->static_prio = new_static; -+ p->prio = effective_prio(p); -+ -+ if (task_queued(p)) { -+ dequeue_task(rq, p, DEQUEUE_SAVE); -+ enqueue_task(rq, p, ENQUEUE_RESTORE); -+ if (new_static < old_static) -+ try_preempt(p, rq); -+ } else if (task_running(rq, p)) { -+ set_rq_task(rq, p); -+ if (old_static < new_static) -+ resched_task(p); -+ } -+out_unlock: -+ task_rq_unlock(rq, p, &rf); -+} -+EXPORT_SYMBOL(set_user_nice); -+ -+/* -+ * can_nice - check if a task can reduce its nice value -+ * @p: task -+ * @nice: nice value -+ */ -+int can_nice(const struct task_struct *p, const int nice) -+{ -+ /* Convert nice value [19,-20] to rlimit style value [1,40] */ -+ int nice_rlim = nice_to_rlimit(nice); -+ -+ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || -+ capable(CAP_SYS_NICE)); -+} -+ -+#ifdef __ARCH_WANT_SYS_NICE -+ -+/* -+ * sys_nice - change the priority of the current process. -+ * @increment: priority increment -+ * -+ * sys_setpriority is a more generic, but much slower function that -+ * does similar things. -+ */ -+SYSCALL_DEFINE1(nice, int, increment) -+{ -+ long nice, retval; -+ -+ /* -+ * Setpriority might change our priority at the same moment. -+ * We don't have to worry. Conceptually one call occurs first -+ * and we have a single winner. 
-+ */ -+ -+ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); -+ nice = task_nice(current) + increment; -+ -+ nice = clamp_val(nice, MIN_NICE, MAX_NICE); -+ if (increment < 0 && !can_nice(current, nice)) -+ return -EPERM; -+ -+ retval = security_task_setnice(current, nice); -+ if (retval) -+ return retval; -+ -+ set_user_nice(current, nice); -+ return 0; -+} -+ -+#endif -+ -+/** -+ * task_prio - return the priority value of a given task. -+ * @p: the task in question. -+ * -+ * Return: The priority value as seen by users in /proc. -+ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes -+ * from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO). -+ */ -+int task_prio(const struct task_struct *p) -+{ -+ int delta, prio = p->prio - MAX_RT_PRIO; -+ -+ /* rt tasks and iso tasks */ -+ if (prio <= 0) -+ goto out; -+ -+ /* Convert to ms to avoid overflows */ -+ delta = NS_TO_MS(p->deadline - task_rq(p)->niffies); -+ if (unlikely(delta < 0)) -+ delta = 0; -+ delta = delta * 40 / ms_longest_deadline_diff(); -+ if (delta <= 80) -+ prio += delta; -+ if (idleprio_task(p)) -+ prio += 40; -+out: -+ return prio; -+} -+ -+/** -+ * idle_cpu - is a given CPU idle currently? -+ * @cpu: the processor in question. -+ * -+ * Return: 1 if the CPU is currently idle. 0 otherwise. -+ */ -+int idle_cpu(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ if (rq->curr != rq->idle) -+ return 0; -+ -+ if (rq->nr_running) -+ return 0; -+ -+#ifdef CONFIG_SMP -+ if (rq->ttwu_pending) -+ return 0; -+#endif -+ -+ return 1; -+} -+ -+/** -+ * available_idle_cpu - is a given CPU idle for enqueuing work. -+ * @cpu: the CPU in question. -+ * -+ * Return: 1 if the CPU is currently idle. 0 otherwise. -+ */ -+int available_idle_cpu(int cpu) -+{ -+ if (!idle_cpu(cpu)) -+ return 0; -+ -+ if (vcpu_is_preempted(cpu)) -+ return 0; -+ -+ return 1; -+} -+ -+/** -+ * idle_task - return the idle task for a given CPU. -+ * @cpu: the processor in question. 
-+ * -+ * Return: The idle task for the CPU @cpu. -+ */ -+struct task_struct *idle_task(int cpu) -+{ -+ return cpu_rq(cpu)->idle; -+} -+ -+/** -+ * find_process_by_pid - find a process with a matching PID value. -+ * @pid: the pid in question. -+ * -+ * The task of @pid, if found. %NULL otherwise. -+ */ -+static inline struct task_struct *find_process_by_pid(pid_t pid) -+{ -+ return pid ? find_task_by_vpid(pid) : current; -+} -+ -+/* Actually do priority change: must hold rq lock. */ -+static void __setscheduler(struct task_struct *p, struct rq *rq, int policy, -+ int prio, const struct sched_attr *attr, -+ bool keep_boost) -+{ -+ int oldrtprio, oldprio; -+ -+ /* -+ * If params can't change scheduling class changes aren't allowed -+ * either. -+ */ -+ if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS) -+ return; -+ -+ p->policy = policy; -+ oldrtprio = p->rt_priority; -+ p->rt_priority = prio; -+ p->normal_prio = normal_prio(p); -+ oldprio = p->prio; -+ /* -+ * Keep a potential priority boosting if called from -+ * sched_setscheduler(). 
-+ */ -+ p->prio = normal_prio(p); -+ if (keep_boost) -+ p->prio = rt_effective_prio(p, p->prio); -+ -+ if (task_running(rq, p)) { -+ set_rq_task(rq, p); -+ resched_task(p); -+ } else if (task_queued(p)) { -+ dequeue_task(rq, p, DEQUEUE_SAVE); -+ enqueue_task(rq, p, ENQUEUE_RESTORE); -+ if (p->prio < oldprio || p->rt_priority > oldrtprio) -+ try_preempt(p, rq); -+ } -+} -+ -+/* -+ * Check the target process has a UID that matches the current process's -+ */ -+static bool check_same_owner(struct task_struct *p) -+{ -+ const struct cred *cred = current_cred(), *pcred; -+ bool match; -+ -+ rcu_read_lock(); -+ pcred = __task_cred(p); -+ match = (uid_eq(cred->euid, pcred->euid) || -+ uid_eq(cred->euid, pcred->uid)); -+ rcu_read_unlock(); -+ return match; -+} -+ -+static int __sched_setscheduler(struct task_struct *p, -+ const struct sched_attr *attr, -+ bool user, bool pi) -+{ -+ int retval, policy = attr->sched_policy, oldpolicy = -1, priority = attr->sched_priority; -+ unsigned long rlim_rtprio = 0; -+ struct rq_flags rf; -+ int reset_on_fork; -+ struct rq *rq; -+ -+ /* The pi code expects interrupts enabled */ -+ BUG_ON(pi && in_interrupt()); -+ -+ if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) { -+ unsigned long lflags; -+ -+ if (!lock_task_sighand(p, &lflags)) -+ return -ESRCH; -+ rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); -+ unlock_task_sighand(p, &lflags); -+ if (rlim_rtprio) -+ goto recheck; -+ /* -+ * If the caller requested an RT policy without having the -+ * necessary rights, we downgrade the policy to SCHED_ISO. -+ * We also set the parameter to zero to pass the checks. 
-+ */ -+ policy = SCHED_ISO; -+ priority = 0; -+ } -+recheck: -+ /* Double check policy once rq lock held */ -+ if (policy < 0) { -+ reset_on_fork = p->sched_reset_on_fork; -+ policy = oldpolicy = p->policy; -+ } else { -+ reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); -+ policy &= ~SCHED_RESET_ON_FORK; -+ -+ if (!SCHED_RANGE(policy)) -+ return -EINVAL; -+ } -+ -+ if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV)) -+ return -EINVAL; -+ -+ /* -+ * Valid priorities for SCHED_FIFO and SCHED_RR are -+ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and -+ * SCHED_BATCH is 0. -+ */ -+ if (priority < 0 || -+ (p->mm && priority > MAX_USER_RT_PRIO - 1) || -+ (!p->mm && priority > MAX_RT_PRIO - 1)) -+ return -EINVAL; -+ if (is_rt_policy(policy) != (priority != 0)) -+ return -EINVAL; -+ -+ /* -+ * Allow unprivileged RT tasks to decrease priority: -+ */ -+ if (user && !capable(CAP_SYS_NICE)) { -+ if (is_rt_policy(policy)) { -+ unsigned long rlim_rtprio = -+ task_rlimit(p, RLIMIT_RTPRIO); -+ -+ /* Can't set/change the rt policy */ -+ if (policy != p->policy && !rlim_rtprio) -+ return -EPERM; -+ -+ /* Can't increase priority */ -+ if (priority > p->rt_priority && -+ priority > rlim_rtprio) -+ return -EPERM; -+ } else { -+ switch (p->policy) { -+ /* -+ * Can only downgrade policies but not back to -+ * SCHED_NORMAL -+ */ -+ case SCHED_ISO: -+ if (policy == SCHED_ISO) -+ goto out; -+ if (policy != SCHED_NORMAL) -+ return -EPERM; -+ break; -+ case SCHED_BATCH: -+ if (policy == SCHED_BATCH) -+ goto out; -+ if (policy != SCHED_IDLEPRIO) -+ return -EPERM; -+ break; -+ case SCHED_IDLEPRIO: -+ if (policy == SCHED_IDLEPRIO) -+ goto out; -+ return -EPERM; -+ default: -+ break; -+ } -+ } -+ -+ /* Can't change other user's priorities */ -+ if (!check_same_owner(p)) -+ return -EPERM; -+ -+ /* Normal users shall not reset the sched_reset_on_fork flag: */ -+ if (p->sched_reset_on_fork && !reset_on_fork) -+ return -EPERM; -+ } -+ -+ if (user) { -+ retval = 
security_task_setscheduler(p); -+ if (retval) -+ return retval; -+ } -+ -+ if (pi) -+ cpuset_read_lock(); -+ -+ /* -+ * Make sure no PI-waiters arrive (or leave) while we are -+ * changing the priority of the task: -+ * -+ * To be able to change p->policy safely, the runqueue lock must be -+ * held. -+ */ -+ rq = task_rq_lock(p, &rf); -+ update_rq_clock(rq); -+ -+ /* -+ * Changing the policy of the stop threads its a very bad idea: -+ */ -+ if (p == rq->stop) { -+ retval = -EINVAL; -+ goto unlock; -+ } -+ -+ /* -+ * If not changing anything there's no need to proceed further: -+ */ -+ if (unlikely(policy == p->policy && (!is_rt_policy(policy) || -+ priority == p->rt_priority))) { -+ retval = 0; -+ goto unlock; -+ } -+ -+ /* Re-check policy now with rq lock held */ -+ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { -+ policy = oldpolicy = -1; -+ task_rq_unlock(rq, p, &rf); -+ if (pi) -+ cpuset_read_unlock(); -+ goto recheck; -+ } -+ p->sched_reset_on_fork = reset_on_fork; -+ -+ __setscheduler(p, rq, policy, priority, attr, pi); -+ -+ /* Avoid rq from going away on us: */ -+ preempt_disable(); -+ task_rq_unlock(rq, p, &rf); -+ -+ if (pi) { -+ cpuset_read_unlock(); -+ rt_mutex_adjust_pi(p); -+ } -+ preempt_enable(); -+out: -+ return 0; -+ -+unlock: -+ task_rq_unlock(rq, p, &rf); -+ if (pi) -+ cpuset_read_unlock(); -+ return retval; -+} -+ -+static int _sched_setscheduler(struct task_struct *p, int policy, -+ const struct sched_param *param, bool check) -+{ -+ struct sched_attr attr = { -+ .sched_policy = policy, -+ .sched_priority = param->sched_priority, -+ .sched_nice = PRIO_TO_NICE(p->static_prio), -+ }; -+ -+ return __sched_setscheduler(p, &attr, check, true); -+} -+/** -+ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. -+ * @p: the task in question. -+ * @policy: new policy. -+ * @param: structure containing the new RT priority. -+ * -+ * Use sched_set_fifo(), read its comment. -+ * -+ * Return: 0 on success. 
An error code otherwise. -+ * -+ * NOTE that the task may be already dead. -+ */ -+int sched_setscheduler(struct task_struct *p, int policy, -+ const struct sched_param *param) -+{ -+ return _sched_setscheduler(p, policy, param, true); -+} -+ -+ -+int sched_setattr(struct task_struct *p, const struct sched_attr *attr) -+{ -+ return __sched_setscheduler(p, attr, true, true); -+} -+ -+int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) -+{ -+ return __sched_setscheduler(p, attr, false, true); -+} -+ -+/** -+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. -+ * @p: the task in question. -+ * @policy: new policy. -+ * @param: structure containing the new RT priority. -+ * -+ * Just like sched_setscheduler, only don't bother checking if the -+ * current context has permission. For example, this is needed in -+ * stop_machine(): we create temporary high priority worker threads, -+ * but our caller might not have that capability. -+ * -+ * Return: 0 on success. An error code otherwise. -+ */ -+int sched_setscheduler_nocheck(struct task_struct *p, int policy, -+ const struct sched_param *param) -+{ -+ return _sched_setscheduler(p, policy, param, false); -+} -+ -+/* -+ * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally -+ * incapable of resource management, which is the one thing an OS really should -+ * be doing. -+ * -+ * This is of course the reason it is limited to privileged users only. -+ * -+ * Worse still; it is fundamentally impossible to compose static priority -+ * workloads. You cannot take two correctly working static prio workloads -+ * and smash them together and still expect them to work. -+ * -+ * For this reason 'all' FIFO tasks the kernel creates are basically at: -+ * -+ * MAX_RT_PRIO / 2 -+ * -+ * The administrator _MUST_ configure the system, the kernel simply doesn't -+ * know enough information to make a sensible choice. 
-+ */ -+void sched_set_fifo(struct task_struct *p) -+{ -+ struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 }; -+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); -+} -+EXPORT_SYMBOL_GPL(sched_set_fifo); -+ -+/* -+ * For when you don't much care about FIFO, but want to be above SCHED_NORMAL. -+ */ -+void sched_set_fifo_low(struct task_struct *p) -+{ -+ struct sched_param sp = { .sched_priority = 1 }; -+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); -+} -+EXPORT_SYMBOL_GPL(sched_set_fifo_low); -+ -+void sched_set_normal(struct task_struct *p, int nice) -+{ -+ struct sched_attr attr = { -+ .sched_policy = SCHED_NORMAL, -+ .sched_nice = nice, -+ }; -+ WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0); -+} -+EXPORT_SYMBOL_GPL(sched_set_normal); -+ -+static int -+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) -+{ -+ struct sched_param lparam; -+ struct task_struct *p; -+ int retval; -+ -+ if (!param || pid < 0) -+ return -EINVAL; -+ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) -+ return -EFAULT; -+ -+ rcu_read_lock(); -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (likely(p)) -+ get_task_struct(p); -+ rcu_read_unlock(); -+ -+ if (likely(p)) { -+ retval = sched_setscheduler(p, policy, &lparam); -+ put_task_struct(p); -+ } -+ -+ return retval; -+} -+ -+/* -+ * Mimics kernel/events/core.c perf_copy_attr(). 
-+ */ -+static int sched_copy_attr(struct sched_attr __user *uattr, -+ struct sched_attr *attr) -+{ -+ u32 size; -+ int ret; -+ -+ /* Zero the full structure, so that a short copy will be nice: */ -+ memset(attr, 0, sizeof(*attr)); -+ -+ ret = get_user(size, &uattr->size); -+ if (ret) -+ return ret; -+ -+ /* ABI compatibility quirk: */ -+ if (!size) -+ size = SCHED_ATTR_SIZE_VER0; -+ -+ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) -+ goto err_size; -+ -+ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); -+ if (ret) { -+ if (ret == -E2BIG) -+ goto err_size; -+ return ret; -+ } -+ -+ /* -+ * XXX: Do we want to be lenient like existing syscalls; or do we want -+ * to be strict and return an error on out-of-bounds values? -+ */ -+ attr->sched_nice = clamp(attr->sched_nice, -20, 19); -+ -+ /* sched/core.c uses zero here but we already know ret is zero */ -+ return 0; -+ -+err_size: -+ put_user(sizeof(*attr), &uattr->size); -+ return -E2BIG; -+} -+ -+/* -+ * sched_setparam() passes in -1 for its policy, to let the functions -+ * it calls know not to change it. -+ */ -+#define SETPARAM_POLICY -1 -+ -+/** -+ * sys_sched_setscheduler - set/change the scheduler policy and RT priority -+ * @pid: the pid in question. -+ * @policy: new policy. -+ * @param: structure containing the new RT priority. -+ * -+ * Return: 0 on success. An error code otherwise. -+ */ -+SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) -+{ -+ if (policy < 0) -+ return -EINVAL; -+ -+ return do_sched_setscheduler(pid, policy, param); -+} -+ -+/** -+ * sys_sched_setparam - set/change the RT priority of a thread -+ * @pid: the pid in question. -+ * @param: structure containing the new RT priority. -+ * -+ * Return: 0 on success. An error code otherwise. 
-+ */ -+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) -+{ -+ return do_sched_setscheduler(pid, SETPARAM_POLICY, param); -+} -+ -+/** -+ * sys_sched_setattr - same as above, but with extended sched_attr -+ * @pid: the pid in question. -+ * @uattr: structure containing the extended parameters. -+ */ -+SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, -+ unsigned int, flags) -+{ -+ struct sched_attr attr; -+ struct task_struct *p; -+ int retval; -+ -+ if (!uattr || pid < 0 || flags) -+ return -EINVAL; -+ -+ retval = sched_copy_attr(uattr, &attr); -+ if (retval) -+ return retval; -+ -+ if ((int)attr.sched_policy < 0) -+ return -EINVAL; -+ if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY) -+ attr.sched_policy = SETPARAM_POLICY; -+ -+ rcu_read_lock(); -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (likely(p)) -+ get_task_struct(p); -+ rcu_read_unlock(); -+ -+ if (likely(p)) { -+ retval = sched_setattr(p, &attr); -+ put_task_struct(p); -+ } -+ -+ return retval; -+} -+ -+/** -+ * sys_sched_getscheduler - get the policy (scheduling class) of a thread -+ * @pid: the pid in question. -+ * -+ * Return: On success, the policy of the thread. Otherwise, a negative error -+ * code. -+ */ -+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) -+{ -+ struct task_struct *p; -+ int retval = -EINVAL; -+ -+ if (pid < 0) -+ goto out_nounlock; -+ -+ retval = -ESRCH; -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ if (p) { -+ retval = security_task_getscheduler(p); -+ if (!retval) -+ retval = p->policy; -+ } -+ rcu_read_unlock(); -+ -+out_nounlock: -+ return retval; -+} -+ -+/** -+ * sys_sched_getscheduler - get the RT priority of a thread -+ * @pid: the pid in question. -+ * @param: structure containing the RT priority. -+ * -+ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error -+ * code. 
-+ */ -+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) -+{ -+ struct sched_param lp = { .sched_priority = 0 }; -+ struct task_struct *p; -+ int retval = -EINVAL; -+ -+ if (!param || pid < 0) -+ goto out_nounlock; -+ -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ retval = -ESRCH; -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ if (has_rt_policy(p)) -+ lp.sched_priority = p->rt_priority; -+ rcu_read_unlock(); -+ -+ /* -+ * This one might sleep, we cannot do it with a spinlock held ... -+ */ -+ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; -+ -+out_nounlock: -+ return retval; -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+/* -+ * Copy the kernel size attribute structure (which might be larger -+ * than what user-space knows about) to user-space. -+ * -+ * Note that all cases are valid: user-space buffer can be larger or -+ * smaller than the kernel-space buffer. The usual case is that both -+ * have the same size. -+ */ -+static int -+sched_attr_copy_to_user(struct sched_attr __user *uattr, -+ struct sched_attr *kattr, -+ unsigned int usize) -+{ -+ unsigned int ksize = sizeof(*kattr); -+ -+ if (!access_ok(uattr, usize)) -+ return -EFAULT; -+ -+ /* -+ * sched_getattr() ABI forwards and backwards compatibility: -+ * -+ * If usize == ksize then we just copy everything to user-space and all is good. -+ * -+ * If usize < ksize then we only copy as much as user-space has space for, -+ * this keeps ABI compatibility as well. We skip the rest. -+ * -+ * If usize > ksize then user-space is using a newer version of the ABI, -+ * which part the kernel doesn't know about. Just ignore it - tooling can -+ * detect the kernel's knowledge of attributes from the attr->size value -+ * which is set to ksize in this case. 
-+ */ -+ kattr->size = min(usize, ksize); -+ -+ if (copy_to_user(uattr, kattr, kattr->size)) -+ return -EFAULT; -+ -+ return 0; -+} -+ -+/** -+ * sys_sched_getattr - similar to sched_getparam, but with sched_attr -+ * @pid: the pid in question. -+ * @uattr: structure containing the extended parameters. -+ * @usize: sizeof(attr) for fwd/bwd comp. -+ * @flags: for future extension. -+ */ -+SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, -+ unsigned int, usize, unsigned int, flags) -+{ -+ struct sched_attr kattr = { }; -+ struct task_struct *p; -+ int retval; -+ -+ if (!uattr || pid < 0 || usize > PAGE_SIZE || -+ usize < SCHED_ATTR_SIZE_VER0 || flags) -+ return -EINVAL; -+ -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ retval = -ESRCH; -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ kattr.sched_policy = p->policy; -+ if (rt_task(p)) -+ kattr.sched_priority = p->rt_priority; -+ else -+ kattr.sched_nice = task_nice(p); -+ -+ rcu_read_unlock(); -+ -+ return sched_attr_copy_to_user(uattr, &kattr, usize); -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) -+{ -+ cpumask_var_t cpus_allowed, new_mask; -+ struct task_struct *p; -+ int retval; -+ -+ rcu_read_lock(); -+ -+ p = find_process_by_pid(pid); -+ if (!p) { -+ rcu_read_unlock(); -+ return -ESRCH; -+ } -+ -+ /* Prevent p going away */ -+ get_task_struct(p); -+ rcu_read_unlock(); -+ -+ if (p->flags & PF_NO_SETAFFINITY) { -+ retval = -EINVAL; -+ goto out_put_task; -+ } -+ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { -+ retval = -ENOMEM; -+ goto out_put_task; -+ } -+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { -+ retval = -ENOMEM; -+ goto out_free_cpus_allowed; -+ } -+ retval = -EPERM; -+ if (!check_same_owner(p)) { -+ rcu_read_lock(); -+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { -+ rcu_read_unlock(); -+ goto 
out_unlock; -+ } -+ rcu_read_unlock(); -+ } -+ -+ retval = security_task_setscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ cpuset_cpus_allowed(p, cpus_allowed); -+ cpumask_and(new_mask, in_mask, cpus_allowed); -+again: -+ retval = __set_cpus_allowed_ptr(p, new_mask, true); -+ -+ if (!retval) { -+ cpuset_cpus_allowed(p, cpus_allowed); -+ if (!cpumask_subset(new_mask, cpus_allowed)) { -+ /* -+ * We must have raced with a concurrent cpuset -+ * update. Just reset the cpus_allowed to the -+ * cpuset's cpus_allowed -+ */ -+ cpumask_copy(new_mask, cpus_allowed); -+ goto again; -+ } -+ } -+out_unlock: -+ free_cpumask_var(new_mask); -+out_free_cpus_allowed: -+ free_cpumask_var(cpus_allowed); -+out_put_task: -+ put_task_struct(p); -+ return retval; -+} -+ -+static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, -+ cpumask_t *new_mask) -+{ -+ if (len < cpumask_size()) -+ cpumask_clear(new_mask); -+ else if (len > cpumask_size()) -+ len = cpumask_size(); -+ -+ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; -+} -+ -+ -+/** -+ * sys_sched_setaffinity - set the CPU affinity of a process -+ * @pid: pid of the process -+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr -+ * @user_mask_ptr: user-space pointer to the new CPU mask -+ * -+ * Return: 0 on success. An error code otherwise. 
-+ */ -+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, -+ unsigned long __user *, user_mask_ptr) -+{ -+ cpumask_var_t new_mask; -+ int retval; -+ -+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) -+ return -ENOMEM; -+ -+ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); -+ if (retval == 0) -+ retval = sched_setaffinity(pid, new_mask); -+ free_cpumask_var(new_mask); -+ return retval; -+} -+ -+long sched_getaffinity(pid_t pid, cpumask_t *mask) -+{ -+ struct task_struct *p; -+ unsigned long flags; -+ int retval; -+ -+ get_online_cpus(); -+ rcu_read_lock(); -+ -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ cpumask_and(mask, &p->cpus_mask, cpu_active_mask); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+out_unlock: -+ rcu_read_unlock(); -+ put_online_cpus(); -+ -+ return retval; -+} -+ -+/** -+ * sys_sched_getaffinity - get the CPU affinity of a process -+ * @pid: pid of the process -+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr -+ * @user_mask_ptr: user-space pointer to hold the current CPU mask -+ * -+ * Return: 0 on success. An error code otherwise. -+ */ -+SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, -+ unsigned long __user *, user_mask_ptr) -+{ -+ int ret; -+ cpumask_var_t mask; -+ -+ if ((len * BITS_PER_BYTE) < nr_cpu_ids) -+ return -EINVAL; -+ if (len & (sizeof(unsigned long)-1)) -+ return -EINVAL; -+ -+ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) -+ return -ENOMEM; -+ -+ ret = sched_getaffinity(pid, mask); -+ if (ret == 0) { -+ unsigned int retlen = min(len, cpumask_size()); -+ -+ if (copy_to_user(user_mask_ptr, mask, retlen)) -+ ret = -EFAULT; -+ else -+ ret = retlen; -+ } -+ free_cpumask_var(mask); -+ -+ return ret; -+} -+ -+/** -+ * sys_sched_yield - yield the current processor to other threads. 
-+ * -+ * This function yields the current CPU to other tasks. It does this by -+ * scheduling away the current task. If it still has the earliest deadline -+ * it will be scheduled again as the next task. -+ * -+ * Return: 0. -+ */ -+static void do_sched_yield(void) -+{ -+ struct rq *rq; -+ -+ if (!sched_yield_type) -+ return; -+ -+ local_irq_disable(); -+ rq = this_rq(); -+ rq_lock(rq); -+ -+ if (sched_yield_type > 1) -+ time_slice_expired(current, rq); -+ schedstat_inc(rq->yld_count); -+ -+ /* -+ * Since we are going to call schedule() anyway, there's -+ * no need to preempt or enable interrupts: -+ */ -+ preempt_disable(); -+ rq_unlock(rq); -+ sched_preempt_enable_no_resched(); -+ -+ schedule(); -+} -+ -+SYSCALL_DEFINE0(sched_yield) -+{ -+ do_sched_yield(); -+ return 0; -+} -+ -+#ifndef CONFIG_PREEMPTION -+int __sched _cond_resched(void) -+{ -+ if (should_resched(0)) { -+ preempt_schedule_common(); -+ return 1; -+ } -+ rcu_all_qs(); -+ return 0; -+} -+EXPORT_SYMBOL(_cond_resched); -+#endif -+ -+/* -+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, -+ * call schedule, and on return reacquire the lock. -+ * -+ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level -+ * operations here to prevent schedule() from being called twice (once via -+ * spin_unlock(), once by hand). -+ */ -+int __cond_resched_lock(spinlock_t *lock) -+{ -+ int resched = should_resched(PREEMPT_LOCK_OFFSET); -+ int ret = 0; -+ -+ lockdep_assert_held(lock); -+ -+ if (spin_needbreak(lock) || resched) { -+ spin_unlock(lock); -+ if (resched) -+ preempt_schedule_common(); -+ else -+ cpu_relax(); -+ ret = 1; -+ spin_lock(lock); -+ } -+ return ret; -+} -+EXPORT_SYMBOL(__cond_resched_lock); -+ -+/** -+ * yield - yield the current processor to other threads. -+ * -+ * Do not ever use this function, there's a 99% chance you're doing it wrong. 
-+ * -+ * The scheduler is at all times free to pick the calling task as the most -+ * eligible task to run, if removing the yield() call from your code breaks -+ * it, its already broken. -+ * -+ * Typical broken usage is: -+ * -+ * while (!event) -+ * yield(); -+ * -+ * where one assumes that yield() will let 'the other' process run that will -+ * make event true. If the current task is a SCHED_FIFO task that will never -+ * happen. Never use yield() as a progress guarantee!! -+ * -+ * If you want to use yield() to wait for something, use wait_event(). -+ * If you want to use yield() to be 'nice' for others, use cond_resched(). -+ * If you still want to use yield(), do not! -+ */ -+void __sched yield(void) -+{ -+ set_current_state(TASK_RUNNING); -+ do_sched_yield(); -+} -+EXPORT_SYMBOL(yield); -+ -+/** -+ * yield_to - yield the current processor to another thread in -+ * your thread group, or accelerate that thread toward the -+ * processor it's on. -+ * @p: target task -+ * @preempt: whether task preemption is allowed or not -+ * -+ * It's the caller's job to ensure that the target task struct -+ * can't go away on us before we can do any checks. -+ * -+ * Return: -+ * true (>0) if we indeed boosted the target task. -+ * false (0) if we failed to boost the target. -+ * -ESRCH if there's no task to yield to. -+ */ -+int __sched yield_to(struct task_struct *p, bool preempt) -+{ -+ struct task_struct *rq_p; -+ struct rq *rq, *p_rq; -+ unsigned long flags; -+ int yielded = 0; -+ -+ local_irq_save(flags); -+ rq = this_rq(); -+ -+again: -+ p_rq = task_rq(p); -+ /* -+ * If we're the only runnable task on the rq and target rq also -+ * has only one task, there's absolutely no point in yielding. 
-+ */ -+ if (task_running(p_rq, p) || p->state) { -+ yielded = -ESRCH; -+ goto out_irq; -+ } -+ -+ double_rq_lock(rq, p_rq); -+ if (unlikely(task_rq(p) != p_rq)) { -+ double_rq_unlock(rq, p_rq); -+ goto again; -+ } -+ -+ yielded = 1; -+ schedstat_inc(rq->yld_count); -+ rq_p = rq->curr; -+ if (p->deadline > rq_p->deadline) -+ p->deadline = rq_p->deadline; -+ p->time_slice += rq_p->time_slice; -+ if (p->time_slice > timeslice()) -+ p->time_slice = timeslice(); -+ time_slice_expired(rq_p, rq); -+ if (preempt && rq != p_rq) -+ resched_task(p_rq->curr); -+ double_rq_unlock(rq, p_rq); -+out_irq: -+ local_irq_restore(flags); -+ -+ if (yielded > 0) -+ schedule(); -+ return yielded; -+} -+EXPORT_SYMBOL_GPL(yield_to); -+ -+int io_schedule_prepare(void) -+{ -+ int old_iowait = current->in_iowait; -+ -+ current->in_iowait = 1; -+ blk_schedule_flush_plug(current); -+ -+ return old_iowait; -+} -+ -+void io_schedule_finish(int token) -+{ -+ current->in_iowait = token; -+} -+ -+/* -+ * This task is about to go to sleep on IO. Increment rq->nr_iowait so -+ * that process accounting knows that this is a task in IO wait state. -+ * -+ * But don't do that if it is a deliberate, throttling IO wait (this task -+ * has set its backing_dev_info: the queue against which it should throttle) -+ */ -+ -+long __sched io_schedule_timeout(long timeout) -+{ -+ int token; -+ long ret; -+ -+ token = io_schedule_prepare(); -+ ret = schedule_timeout(timeout); -+ io_schedule_finish(token); -+ -+ return ret; -+} -+EXPORT_SYMBOL(io_schedule_timeout); -+ -+void __sched io_schedule(void) -+{ -+ int token; -+ -+ token = io_schedule_prepare(); -+ schedule(); -+ io_schedule_finish(token); -+} -+EXPORT_SYMBOL(io_schedule); -+ -+/** -+ * sys_sched_get_priority_max - return maximum RT priority. -+ * @policy: scheduling class. -+ * -+ * Return: On success, this syscall returns the maximum -+ * rt_priority that can be used by a given scheduling class. -+ * On failure, a negative error code is returned. 
-+ */ -+SYSCALL_DEFINE1(sched_get_priority_max, int, policy) -+{ -+ int ret = -EINVAL; -+ -+ switch (policy) { -+ case SCHED_FIFO: -+ case SCHED_RR: -+ ret = MAX_USER_RT_PRIO-1; -+ break; -+ case SCHED_NORMAL: -+ case SCHED_BATCH: -+ case SCHED_ISO: -+ case SCHED_IDLEPRIO: -+ ret = 0; -+ break; -+ } -+ return ret; -+} -+ -+/** -+ * sys_sched_get_priority_min - return minimum RT priority. -+ * @policy: scheduling class. -+ * -+ * Return: On success, this syscall returns the minimum -+ * rt_priority that can be used by a given scheduling class. -+ * On failure, a negative error code is returned. -+ */ -+SYSCALL_DEFINE1(sched_get_priority_min, int, policy) -+{ -+ int ret = -EINVAL; -+ -+ switch (policy) { -+ case SCHED_FIFO: -+ case SCHED_RR: -+ ret = 1; -+ break; -+ case SCHED_NORMAL: -+ case SCHED_BATCH: -+ case SCHED_ISO: -+ case SCHED_IDLEPRIO: -+ ret = 0; -+ break; -+ } -+ return ret; -+} -+ -+static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) -+{ -+ struct task_struct *p; -+ unsigned int time_slice; -+ struct rq_flags rf; -+ struct rq *rq; -+ int retval; -+ -+ if (pid < 0) -+ return -EINVAL; -+ -+ retval = -ESRCH; -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ rq = task_rq_lock(p, &rf); -+ time_slice = p->policy == SCHED_FIFO ? 0 : MS_TO_NS(task_timeslice(p)); -+ task_rq_unlock(rq, p, &rf); -+ -+ rcu_read_unlock(); -+ *t = ns_to_timespec64(time_slice); -+ return 0; -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+/** -+ * sys_sched_rr_get_interval - return the default timeslice of a process. -+ * @pid: pid of the process. -+ * @interval: userspace pointer to the timeslice value. -+ * -+ * this syscall writes the default timeslice value of a given process -+ * into the user-space timespec buffer. A value of '0' means infinity. -+ * -+ * Return: On success, 0 and the timeslice is in @interval. 
Otherwise, -+ * an error code. -+ */ -+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, -+ struct __kernel_timespec __user *, interval) -+{ -+ struct timespec64 t; -+ int retval = sched_rr_get_interval(pid, &t); -+ -+ if (retval == 0) -+ retval = put_timespec64(&t, interval); -+ -+ return retval; -+} -+ -+#ifdef CONFIG_COMPAT_32BIT_TIME -+SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, -+ struct old_timespec32 __user *, interval) -+{ -+ struct timespec64 t; -+ int retval = sched_rr_get_interval(pid, &t); -+ -+ if (retval == 0) -+ retval = put_old_timespec32(&t, interval); -+ return retval; -+} -+#endif -+ -+void sched_show_task(struct task_struct *p) -+{ -+ unsigned long free = 0; -+ int ppid; -+ -+ if (!try_get_task_stack(p)) -+ return; -+ -+ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); -+ -+ if (p->state == TASK_RUNNING) -+ printk(KERN_CONT " running task "); -+#ifdef CONFIG_DEBUG_STACK_USAGE -+ free = stack_not_used(p); -+#endif -+ ppid = 0; -+ rcu_read_lock(); -+ if (pid_alive(p)) -+ ppid = task_pid_nr(rcu_dereference(p->real_parent)); -+ rcu_read_unlock(); -+ pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n", -+ free, task_pid_nr(p), ppid, -+ (unsigned long)task_thread_info(p)->flags); -+ -+ print_worker_info(KERN_INFO, p); -+ show_stack(p, NULL, KERN_INFO); -+ put_task_stack(p); -+} -+EXPORT_SYMBOL_GPL(sched_show_task); -+ -+static inline bool -+state_filter_match(unsigned long state_filter, struct task_struct *p) -+{ -+ /* no filter, everything matches */ -+ if (!state_filter) -+ return true; -+ -+ /* filter, but doesn't match */ -+ if (!(p->state & state_filter)) -+ return false; -+ -+ /* -+ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows -+ * TASK_KILLABLE). 
-+ */ -+ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) -+ return false; -+ -+ return true; -+} -+ -+void show_state_filter(unsigned long state_filter) -+{ -+ struct task_struct *g, *p; -+ -+ rcu_read_lock(); -+ for_each_process_thread(g, p) { -+ /* -+ * reset the NMI-timeout, listing all files on a slow -+ * console might take a lot of time: -+ * Also, reset softlockup watchdogs on all CPUs, because -+ * another CPU might be blocked waiting for us to process -+ * an IPI. -+ */ -+ touch_nmi_watchdog(); -+ touch_all_softlockup_watchdogs(); -+ if (state_filter_match(state_filter, p)) -+ sched_show_task(p); -+ } -+ -+ rcu_read_unlock(); -+ /* -+ * Only show locks if all tasks are dumped: -+ */ -+ if (!state_filter) -+ debug_show_all_locks(); -+} -+ -+void dump_cpu_task(int cpu) -+{ -+ pr_info("Task dump for CPU %d:\n", cpu); -+ sched_show_task(cpu_curr(cpu)); -+} -+ -+#ifdef CONFIG_SMP -+void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ cpumask_copy(&p->cpus_mask, new_mask); -+ p->nr_cpus_allowed = cpumask_weight(new_mask); -+} -+ -+void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ struct rq *rq = task_rq(p); -+ -+ lockdep_assert_held(&p->pi_lock); -+ -+ cpumask_copy(&p->cpus_mask, new_mask); -+ -+ if (task_queued(p)) { -+ /* -+ * Because __kthread_bind() calls this on blocked tasks without -+ * holding rq->lock. -+ */ -+ lockdep_assert_held(rq->lock); -+ } -+} -+ -+/* -+ * Calling do_set_cpus_allowed from outside the scheduler code should not be -+ * called on a running or queued task. We should be holding pi_lock. 
-+ */ -+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ __do_set_cpus_allowed(p, new_mask); -+ if (needs_other_cpu(p, task_cpu(p))) { -+ struct rq *rq; -+ -+ rq = __task_rq_lock(p, NULL); -+ set_task_cpu(p, valid_task_cpu(p)); -+ resched_task(p); -+ __task_rq_unlock(rq, NULL); -+ } -+} -+#endif -+ -+/** -+ * init_idle - set up an idle thread for a given CPU -+ * @idle: task in question -+ * @cpu: cpu the idle task belongs to -+ * -+ * NOTE: this function does not set the idle thread's NEED_RESCHED -+ * flag, to make booting more robust. -+ */ -+void init_idle(struct task_struct *idle, int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ raw_spin_lock_irqsave(&idle->pi_lock, flags); -+ raw_spin_lock(rq->lock); -+ idle->last_ran = rq->niffies; -+ time_slice_expired(idle, rq); -+ idle->state = TASK_RUNNING; -+ /* Setting prio to illegal value shouldn't matter when never queued */ -+ idle->prio = PRIO_LIMIT; -+ idle->flags |= PF_IDLE; -+ -+ scs_task_reset(idle); -+ kasan_unpoison_task_stack(idle); -+ -+#ifdef CONFIG_SMP -+ /* -+ * It's possible that init_idle() gets called multiple times on a task, -+ * in that case do_set_cpus_allowed() will not do the right thing. -+ * -+ * And since this is boot we can forgo the serialisation. -+ */ -+ set_cpus_allowed_common(idle, cpumask_of(cpu)); -+#ifdef CONFIG_SMT_NICE -+ idle->smt_bias = 0; -+#endif -+#endif -+ set_rq_task(rq, idle); -+ -+ /* Silence PROVE_RCU */ -+ rcu_read_lock(); -+ set_task_cpu(idle, cpu); -+ rcu_read_unlock(); -+ -+ rq->idle = idle; -+ rcu_assign_pointer(rq->curr, idle); -+ idle->on_rq = TASK_ON_RQ_QUEUED; -+ raw_spin_unlock(rq->lock); -+ raw_spin_unlock_irqrestore(&idle->pi_lock, flags); -+ -+ /* Set the preempt count _outside_ the spinlocks! 
*/ -+ init_idle_preempt_count(idle, cpu); -+ -+ ftrace_graph_init_idle_task(idle, cpu); -+ vtime_init_idle(idle, cpu); -+#ifdef CONFIG_SMP -+ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); -+#endif -+} -+ -+int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, -+ const struct cpumask __maybe_unused *trial) -+{ -+ return 1; -+} -+ -+int task_can_attach(struct task_struct *p, -+ const struct cpumask *cs_cpus_allowed) -+{ -+ int ret = 0; -+ -+ /* -+ * Kthreads which disallow setaffinity shouldn't be moved -+ * to a new cpuset; we don't want to change their CPU -+ * affinity and isolating such threads by their set of -+ * allowed nodes is unnecessary. Thus, cpusets are not -+ * applicable for such threads. This prevents checking for -+ * success of set_cpus_allowed_ptr() on all attached tasks -+ * before cpus_mask may be changed. -+ */ -+ if (p->flags & PF_NO_SETAFFINITY) -+ ret = -EINVAL; -+ -+ return ret; -+} -+ -+void resched_cpu(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ struct rq_flags rf; -+ -+ rq_lock_irqsave(rq, &rf); -+ if (cpu_online(cpu) || cpu == smp_processor_id()) -+ resched_curr(rq); -+ rq_unlock_irqrestore(rq, &rf); -+} -+ -+#ifdef CONFIG_SMP -+#ifdef CONFIG_NO_HZ_COMMON -+void select_nohz_load_balancer(int stop_tick) -+{ -+} -+ -+void set_cpu_sd_state_idle(void) {} -+void nohz_balance_enter_idle(int cpu) {} -+ -+/* -+ * In the semi idle case, use the nearest busy CPU for migrating timers -+ * from an idle CPU. This is good for power-savings. -+ * -+ * We don't do similar optimization for completely idle system, as -+ * selecting an idle CPU will add more delays to the timers than intended -+ * (as that CPU's timer base may not be uptodate wrt jiffies etc). 
-+ */ -+int get_nohz_timer_target(void) -+{ -+ int i, cpu = smp_processor_id(), default_cpu = -1; -+ struct sched_domain *sd; -+ -+ if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) { -+ if (!idle_cpu(cpu)) -+ return cpu; -+ default_cpu = cpu; -+ } -+ -+ rcu_read_lock(); -+ for_each_domain(cpu, sd) { -+ for_each_cpu_and(i, sched_domain_span(sd), -+ housekeeping_cpumask(HK_FLAG_TIMER)) { -+ if (cpu == i) -+ continue; -+ -+ if (!idle_cpu(i)) { -+ cpu = i; -+ goto unlock; -+ } -+ } -+ } -+ -+ if (default_cpu == -1) -+ default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER); -+ cpu = default_cpu; -+unlock: -+ rcu_read_unlock(); -+ return cpu; -+} -+ -+/* -+ * When add_timer_on() enqueues a timer into the timer wheel of an -+ * idle CPU then this timer might expire before the next timer event -+ * which is scheduled to wake up that CPU. In case of a completely -+ * idle system the next event might even be infinite time into the -+ * future. wake_up_idle_cpu() ensures that the CPU is woken up and -+ * leaves the inner idle loop so the newly added timer is taken into -+ * account when the CPU goes back to idle and evaluates the timer -+ * wheel for the next timer event. -+ */ -+void wake_up_idle_cpu(int cpu) -+{ -+ if (cpu == smp_processor_id()) -+ return; -+ -+ if (set_nr_and_not_polling(cpu_rq(cpu)->idle)) -+ smp_sched_reschedule(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); -+} -+ -+static bool wake_up_full_nohz_cpu(int cpu) -+{ -+ /* -+ * We just need the target to call irq_exit() and re-evaluate -+ * the next tick. The nohz full kick at least implies that. -+ * If needed we can still optimize that later with an -+ * empty IRQ. -+ */ -+ if (cpu_is_offline(cpu)) -+ return true; /* Don't try to wake offline CPUs. */ -+ if (tick_nohz_full_cpu(cpu)) { -+ if (cpu != smp_processor_id() || -+ tick_nohz_tick_stopped()) -+ tick_nohz_full_kick_cpu(cpu); -+ return true; -+ } -+ -+ return false; -+} -+ -+/* -+ * Wake up the specified CPU. 
If the CPU is going offline, it is the -+ * caller's responsibility to deal with the lost wakeup, for example, -+ * by hooking into the CPU_DEAD notifier like timers and hrtimers do. -+ */ -+void wake_up_nohz_cpu(int cpu) -+{ -+ if (!wake_up_full_nohz_cpu(cpu)) -+ wake_up_idle_cpu(cpu); -+} -+#endif /* CONFIG_NO_HZ_COMMON */ -+ -+/* -+ * Change a given task's CPU affinity. Migrate the thread to a -+ * proper CPU and schedule it away if the CPU it's executing on -+ * is removed from the allowed bitmask. -+ * -+ * NOTE: the caller must have a valid reference to the task, the -+ * task must not exit() & deallocate itself prematurely. The -+ * call is not atomic; no spinlocks may be held. -+ */ -+static int __set_cpus_allowed_ptr(struct task_struct *p, -+ const struct cpumask *new_mask, bool check) -+{ -+ const struct cpumask *cpu_valid_mask = cpu_active_mask; -+ bool queued = false, running_wrong = false, kthread; -+ unsigned int dest_cpu; -+ struct rq_flags rf; -+ struct rq *rq; -+ int ret = 0; -+ -+ rq = task_rq_lock(p, &rf); -+ update_rq_clock(rq); -+ -+ kthread = !!(p->flags & PF_KTHREAD); -+ if (kthread) { -+ /* -+ * Kernel threads are allowed on online && !active CPUs -+ */ -+ cpu_valid_mask = cpu_online_mask; -+ } -+ -+ /* -+ * Must re-check here, to close a race against __kthread_bind(), -+ * sched_setaffinity() is not guaranteed to observe the flag. -+ */ -+ if (check && (p->flags & PF_NO_SETAFFINITY)) { -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ if (cpumask_equal(&p->cpus_mask, new_mask)) -+ goto out; -+ -+ /* -+ * Picking a ~random cpu helps in cases where we are changing affinity -+ * for groups of tasks (ie. cpuset), so that load balancing is not -+ * immediately required to distribute the tasks within their new mask. 
-+ */ -+ dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask); -+ if (dest_cpu >= nr_cpu_ids) { -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ queued = task_queued(p); -+ __do_set_cpus_allowed(p, new_mask); -+ -+ if (kthread) { -+ /* -+ * For kernel threads that do indeed end up on online && -+ * !active we want to ensure they are strict per-CPU threads. -+ */ -+ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && -+ !cpumask_intersects(new_mask, cpu_active_mask) && -+ p->nr_cpus_allowed != 1); -+ } -+ -+ /* Can the task run on the task's current CPU? If so, we're done */ -+ if (cpumask_test_cpu(task_cpu(p), new_mask)) -+ goto out; -+ -+ if (task_running(rq, p)) { -+ /* Task is running on the wrong cpu now, reschedule it. */ -+ if (rq == this_rq()) { -+ set_task_cpu(p, dest_cpu); -+ set_tsk_need_resched(p); -+ running_wrong = true; -+ } else -+ resched_task(p); -+ } else { -+ if (queued) { -+ /* -+ * Switch runqueue locks after dequeueing the task -+ * here while still holding the pi_lock to be holding -+ * the correct lock for enqueueing. -+ */ -+ dequeue_task(rq, p, 0); -+ rq_unlock(rq); -+ -+ rq = cpu_rq(dest_cpu); -+ rq_lock(rq); -+ } -+ set_task_cpu(p, dest_cpu); -+ if (queued) -+ enqueue_task(rq, p, 0); -+ } -+ if (queued) -+ try_preempt(p, rq); -+ if (running_wrong) -+ preempt_disable(); -+out: -+ task_rq_unlock(rq, p, &rf); -+ -+ if (running_wrong) { -+ __schedule(true); -+ preempt_enable(); -+ } -+ -+ return ret; -+} -+ -+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ return __set_cpus_allowed_ptr(p, new_mask, false); -+} -+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); -+ -+#ifdef CONFIG_HOTPLUG_CPU -+/* -+ * Run through task list and find tasks affined to the dead cpu, then remove -+ * that cpu from the list, enable cpu0 and set the zerobound flag. Must hold -+ * cpu 0 and src_cpu's runqueue locks. We should be holding both rq lock and -+ * pi_lock to change cpus_mask but it's not going to matter here. 
-+ */ -+static void bind_zero(int src_cpu) -+{ -+ struct task_struct *p, *t; -+ struct rq *rq0; -+ int bound = 0; -+ -+ if (src_cpu == 0) -+ return; -+ -+ rq0 = cpu_rq(0); -+ -+ do_each_thread(t, p) { -+ if (cpumask_test_cpu(src_cpu, p->cpus_ptr)) { -+ bool local = (task_cpu(p) == src_cpu); -+ struct rq *rq = task_rq(p); -+ -+ /* task_running is the cpu stopper thread */ -+ if (local && task_running(rq, p)) -+ continue; -+ atomic_clear_cpu(src_cpu, &p->cpus_mask); -+ atomic_set_cpu(0, &p->cpus_mask); -+ p->zerobound = true; -+ bound++; -+ if (local) { -+ bool queued = task_queued(p); -+ -+ if (queued) -+ dequeue_task(rq, p, 0); -+ set_task_cpu(p, 0); -+ if (queued) -+ enqueue_task(rq0, p, 0); -+ } -+ } -+ } while_each_thread(t, p); -+ -+ if (bound) { -+ printk(KERN_INFO "MuQSS removed affinity for %d processes to cpu %d\n", -+ bound, src_cpu); -+ } -+} -+ -+/* Find processes with the zerobound flag and reenable their affinity for the -+ * CPU coming alive. */ -+static void unbind_zero(int src_cpu) -+{ -+ int unbound = 0, zerobound = 0; -+ struct task_struct *p, *t; -+ -+ if (src_cpu == 0) -+ return; -+ -+ do_each_thread(t, p) { -+ if (!p->mm) -+ p->zerobound = false; -+ if (p->zerobound) { -+ unbound++; -+ cpumask_set_cpu(src_cpu, &p->cpus_mask); -+ /* Once every CPU affinity has been re-enabled, remove -+ * the zerobound flag */ -+ if (cpumask_subset(cpu_possible_mask, p->cpus_ptr)) { -+ p->zerobound = false; -+ zerobound++; -+ } -+ } -+ } while_each_thread(t, p); -+ -+ if (unbound) { -+ printk(KERN_INFO "MuQSS added affinity for %d processes to cpu %d\n", -+ unbound, src_cpu); -+ } -+ if (zerobound) { -+ printk(KERN_INFO "MuQSS released forced binding to cpu0 for %d processes\n", -+ zerobound); -+ } -+} -+ -+/* -+ * Ensure that the idle task is using init_mm right before its cpu goes -+ * offline. 
-+ */ -+void idle_task_exit(void) -+{ -+ struct mm_struct *mm = current->active_mm; -+ -+ BUG_ON(cpu_online(smp_processor_id())); -+ BUG_ON(current != this_rq()->idle); -+ -+ if (mm != &init_mm) { -+ switch_mm(mm, &init_mm, current); -+ finish_arch_post_lock_switch(); -+ } -+ -+ /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ -+} -+#else /* CONFIG_HOTPLUG_CPU */ -+static void unbind_zero(int src_cpu) {} -+#endif /* CONFIG_HOTPLUG_CPU */ -+ -+void sched_set_stop_task(int cpu, struct task_struct *stop) -+{ -+ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; -+ struct sched_param start_param = { .sched_priority = 0 }; -+ struct task_struct *old_stop = cpu_rq(cpu)->stop; -+ -+ if (stop) { -+ /* -+ * Make it appear like a SCHED_FIFO task, its something -+ * userspace knows about and won't get confused about. -+ * -+ * Also, it will make PI more or less work without too -+ * much confusion -- but then, stop work should not -+ * rely on PI working anyway. -+ */ -+ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); -+ } -+ -+ cpu_rq(cpu)->stop = stop; -+ -+ if (old_stop) { -+ /* -+ * Reset it back to a normal scheduling policy so that -+ * it can die in pieces. 
-+ */ -+ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); -+ } -+} -+ -+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) -+ -+static struct ctl_table sd_ctl_dir[] = { -+ { -+ .procname = "sched_domain", -+ .mode = 0555, -+ }, -+ {} -+}; -+ -+static struct ctl_table sd_ctl_root[] = { -+ { -+ .procname = "kernel", -+ .mode = 0555, -+ .child = sd_ctl_dir, -+ }, -+ {} -+}; -+ -+static struct ctl_table *sd_alloc_ctl_entry(int n) -+{ -+ struct ctl_table *entry = -+ kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); -+ -+ return entry; -+} -+ -+static void sd_free_ctl_entry(struct ctl_table **tablep) -+{ -+ struct ctl_table *entry; -+ -+ /* -+ * In the intermediate directories, both the child directory and -+ * procname are dynamically allocated and could fail but the mode -+ * will always be set. In the lowest directory the names are -+ * static strings and all have proc handlers. -+ */ -+ for (entry = *tablep; entry->mode; entry++) { -+ if (entry->child) -+ sd_free_ctl_entry(&entry->child); -+ if (entry->proc_handler == NULL) -+ kfree(entry->procname); -+ } -+ -+ kfree(*tablep); -+ *tablep = NULL; -+} -+ -+static void -+set_table_entry(struct ctl_table *entry, -+ const char *procname, void *data, int maxlen, -+ umode_t mode, proc_handler *proc_handler) -+{ -+ entry->procname = procname; -+ entry->data = data; -+ entry->maxlen = maxlen; -+ entry->mode = mode; -+ entry->proc_handler = proc_handler; -+} -+ -+static struct ctl_table * -+sd_alloc_ctl_domain_table(struct sched_domain *sd) -+{ -+ struct ctl_table *table = sd_alloc_ctl_entry(9); -+ -+ if (table == NULL) -+ return NULL; -+ -+ set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax); -+ set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax); -+ set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); -+ set_table_entry(&table[3], "imbalance_pct", 
&sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); -+ set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); -+ set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); -+ set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax); -+ set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring); -+ /* &table[8] is terminator */ -+ -+ return table; -+} -+ -+static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) -+{ -+ struct ctl_table *entry, *table; -+ struct sched_domain *sd; -+ int domain_num = 0, i; -+ char buf[32]; -+ -+ for_each_domain(cpu, sd) -+ domain_num++; -+ entry = table = sd_alloc_ctl_entry(domain_num + 1); -+ if (table == NULL) -+ return NULL; -+ -+ i = 0; -+ for_each_domain(cpu, sd) { -+ snprintf(buf, 32, "domain%d", i); -+ entry->procname = kstrdup(buf, GFP_KERNEL); -+ entry->mode = 0555; -+ entry->child = sd_alloc_ctl_domain_table(sd); -+ entry++; -+ i++; -+ } -+ return table; -+} -+ -+static cpumask_var_t sd_sysctl_cpus; -+static struct ctl_table_header *sd_sysctl_header; -+ -+void register_sched_domain_sysctl(void) -+{ -+ static struct ctl_table *cpu_entries; -+ static struct ctl_table **cpu_idx; -+ char buf[32]; -+ int i; -+ -+ if (!cpu_entries) { -+ cpu_entries = sd_alloc_ctl_entry(num_possible_cpus() + 1); -+ if (!cpu_entries) -+ return; -+ -+ WARN_ON(sd_ctl_dir[0].child); -+ sd_ctl_dir[0].child = cpu_entries; -+ } -+ -+ if (!cpu_idx) { -+ struct ctl_table *e = cpu_entries; -+ -+ cpu_idx = kcalloc(nr_cpu_ids, sizeof(struct ctl_table*), GFP_KERNEL); -+ if (!cpu_idx) -+ return; -+ -+ /* deal with sparse possible map */ -+ for_each_possible_cpu(i) { -+ cpu_idx[i] = e; -+ e++; -+ } -+ } -+ -+ if (!cpumask_available(sd_sysctl_cpus)) { -+ if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL)) -+ return; -+ -+ /* init to possible to not have holes in @cpu_entries */ 
-+ cpumask_copy(sd_sysctl_cpus, cpu_possible_mask); -+ } -+ -+ for_each_cpu(i, sd_sysctl_cpus) { -+ struct ctl_table *e = cpu_idx[i]; -+ -+ if (e->child) -+ sd_free_ctl_entry(&e->child); -+ -+ if (!e->procname) { -+ snprintf(buf, 32, "cpu%d", i); -+ e->procname = kstrdup(buf, GFP_KERNEL); -+ } -+ e->mode = 0555; -+ e->child = sd_alloc_ctl_cpu_table(i); -+ -+ __cpumask_clear_cpu(i, sd_sysctl_cpus); -+ } -+ -+ WARN_ON(sd_sysctl_header); -+ sd_sysctl_header = register_sysctl_table(sd_ctl_root); -+} -+ -+void dirty_sched_domain_sysctl(int cpu) -+{ -+ if (cpumask_available(sd_sysctl_cpus)) -+ __cpumask_set_cpu(cpu, sd_sysctl_cpus); -+} -+ -+/* may be called multiple times per register */ -+void unregister_sched_domain_sysctl(void) -+{ -+ unregister_sysctl_table(sd_sysctl_header); -+ sd_sysctl_header = NULL; -+} -+#endif /* CONFIG_SYSCTL */ -+ -+void set_rq_online(struct rq *rq) -+{ -+ if (!rq->online) { -+ cpumask_set_cpu(cpu_of(rq), rq->rd->online); -+ rq->online = true; -+ } -+} -+ -+void set_rq_offline(struct rq *rq) -+{ -+ if (rq->online) { -+ int cpu = cpu_of(rq); -+ -+ cpumask_clear_cpu(cpu, rq->rd->online); -+ rq->online = false; -+ clear_cpuidle_map(cpu); -+ } -+} -+ -+/* -+ * used to mark begin/end of suspend/resume: -+ */ -+static int num_cpus_frozen; -+ -+/* -+ * Update cpusets according to cpu_active mask. If cpusets are -+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper -+ * around partition_sched_domains(). -+ * -+ * If we come here as part of a suspend/resume, don't touch cpusets because we -+ * want to restore it back to its original state upon resume anyway. -+ */ -+static void cpuset_cpu_active(void) -+{ -+ if (cpuhp_tasks_frozen) { -+ /* -+ * num_cpus_frozen tracks how many CPUs are involved in suspend -+ * resume sequence. As long as this is not the last online -+ * operation in the resume sequence, just build a single sched -+ * domain, ignoring cpusets. 
-+ */ -+ partition_sched_domains(1, NULL, NULL); -+ if (--num_cpus_frozen) -+ return; -+ /* -+ * This is the last CPU online operation. So fall through and -+ * restore the original sched domains by considering the -+ * cpuset configurations. -+ */ -+ cpuset_force_rebuild(); -+ } -+ -+ cpuset_update_active_cpus(); -+} -+ -+static int cpuset_cpu_inactive(unsigned int cpu) -+{ -+ if (!cpuhp_tasks_frozen) { -+ cpuset_update_active_cpus(); -+ } else { -+ num_cpus_frozen++; -+ partition_sched_domains(1, NULL, NULL); -+ } -+ return 0; -+} -+ -+int sched_cpu_activate(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ struct rq_flags rf; -+ -+#ifdef CONFIG_SCHED_SMT -+ /* -+ * When going up, increment the number of cores with SMT present. -+ */ -+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) -+ static_branch_inc_cpuslocked(&sched_smt_present); -+#endif -+ set_cpu_active(cpu, true); -+ -+ if (sched_smp_initialized) { -+ sched_domains_numa_masks_set(cpu); -+ cpuset_cpu_active(); -+ } -+ -+ /* -+ * Put the rq online, if not already. This happens: -+ * -+ * 1) In the early boot process, because we build the real domains -+ * after all CPUs have been brought up. -+ * -+ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the -+ * domains. -+ */ -+ rq_lock_irqsave(rq, &rf); -+ if (rq->rd) { -+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); -+ set_rq_online(rq); -+ } -+ unbind_zero(cpu); -+ rq_unlock_irqrestore(rq, &rf); -+ -+ return 0; -+} -+ -+int sched_cpu_deactivate(unsigned int cpu) -+{ -+ int ret; -+ -+ set_cpu_active(cpu, false); -+ /* -+ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU -+ * users of this state to go away such that all new such users will -+ * observe it. -+ * -+ * Do sync before park smpboot threads to take care the rcu boost case. -+ */ -+ synchronize_rcu(); -+ -+#ifdef CONFIG_SCHED_SMT -+ /* -+ * When going down, decrement the number of cores with SMT present. 
-+ */ -+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) -+ static_branch_dec_cpuslocked(&sched_smt_present); -+#endif -+ -+ if (!sched_smp_initialized) -+ return 0; -+ -+ ret = cpuset_cpu_inactive(cpu); -+ if (ret) { -+ set_cpu_active(cpu, true); -+ return ret; -+ } -+ sched_domains_numa_masks_clear(cpu); -+ return 0; -+} -+ -+int sched_cpu_starting(unsigned int cpu) -+{ -+ sched_tick_start(cpu); -+ return 0; -+} -+ -+#ifdef CONFIG_HOTPLUG_CPU -+int sched_cpu_dying(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ /* Handle pending wakeups and then migrate everything off */ -+ sched_tick_stop(cpu); -+ -+ local_irq_save(flags); -+ double_rq_lock(rq, cpu_rq(0)); -+ if (rq->rd) { -+ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); -+ set_rq_offline(rq); -+ } -+ bind_zero(cpu); -+ double_rq_unlock(rq, cpu_rq(0)); -+ sched_start_tick(rq, cpu); -+ hrexpiry_clear(rq); -+ local_irq_restore(flags); -+ -+ return 0; -+} -+#endif -+ -+#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) -+/* -+ * Cheaper version of the below functions in case support for SMT and MC is -+ * compiled in but CPUs have no siblings. 
-+ */ -+static bool sole_cpu_idle(struct rq *rq) -+{ -+ return rq_idle(rq); -+} -+#endif -+#ifdef CONFIG_SCHED_SMT -+static const cpumask_t *thread_cpumask(int cpu) -+{ -+ return topology_sibling_cpumask(cpu); -+} -+/* All this CPU's SMT siblings are idle */ -+static bool siblings_cpu_idle(struct rq *rq) -+{ -+ return cpumask_subset(&rq->thread_mask, &cpu_idle_map); -+} -+#endif -+#ifdef CONFIG_SCHED_MC -+static const cpumask_t *core_cpumask(int cpu) -+{ -+ return topology_core_cpumask(cpu); -+} -+/* All this CPU's shared cache siblings are idle */ -+static bool cache_cpu_idle(struct rq *rq) -+{ -+ return cpumask_subset(&rq->core_mask, &cpu_idle_map); -+} -+/* MC siblings CPU mask which share the same LLC */ -+static const cpumask_t *llc_core_cpumask(int cpu) -+{ -+#ifdef CONFIG_X86 -+ return per_cpu(cpu_llc_shared_map, cpu); -+#else -+ return topology_core_cpumask(cpu); -+#endif -+} -+#endif -+ -+enum sched_domain_level { -+ SD_LV_NONE = 0, -+ SD_LV_SIBLING, -+ SD_LV_MC, -+ SD_LV_BOOK, -+ SD_LV_CPU, -+ SD_LV_NODE, -+ SD_LV_ALLNODES, -+ SD_LV_MAX -+}; -+ -+/* -+ * Set up the relative cache distance of each online cpu from each -+ * other in a simple array for quick lookup. Locality is determined -+ * by the closest sched_domain that CPUs are separated by. CPUs with -+ * shared cache in SMT and MC are treated as local. Separate CPUs -+ * (within the same package or physically) within the same node are -+ * treated as not local. CPUs not even in the same domain (different -+ * nodes) are treated as very distant. 
-+ */ -+static void __init select_leaders(void) -+{ -+ struct rq *rq, *other_rq, *leader; -+ struct sched_domain *sd; -+ int cpu, other_cpu; -+#ifdef CONFIG_SCHED_SMT -+ bool smt_threads = false; -+#endif -+ -+ for (cpu = 0; cpu < num_online_cpus(); cpu++) { -+ rq = cpu_rq(cpu); -+ leader = NULL; -+ /* First check if this cpu is in the same node */ -+ for_each_domain(cpu, sd) { -+ if (sd->level > SD_LV_MC) -+ continue; -+ if (rqshare != RQSHARE_ALL) -+ leader = NULL; -+ /* Set locality to local node if not already found lower */ -+ for_each_cpu(other_cpu, sched_domain_span(sd)) { -+ if (rqshare >= RQSHARE_SMP) { -+ other_rq = cpu_rq(other_cpu); -+ -+ /* Set the smp_leader to the first CPU */ -+ if (!leader) -+ leader = rq; -+ if (!other_rq->smp_leader) -+ other_rq->smp_leader = leader; -+ } -+ if (rq->cpu_locality[other_cpu] > LOCALITY_SMP) -+ rq->cpu_locality[other_cpu] = LOCALITY_SMP; -+ } -+ } -+ -+ /* -+ * Each runqueue has its own function in case it doesn't have -+ * siblings of its own allowing mixed topologies. 
-+ */ -+#ifdef CONFIG_SCHED_MC -+ leader = NULL; -+ if (cpumask_weight(core_cpumask(cpu)) > 1) { -+ cpumask_copy(&rq->core_mask, llc_core_cpumask(cpu)); -+ cpumask_clear_cpu(cpu, &rq->core_mask); -+ for_each_cpu(other_cpu, core_cpumask(cpu)) { -+ if (rqshare == RQSHARE_MC || -+ (rqshare == RQSHARE_MC_LLC && cpumask_test_cpu(other_cpu, llc_core_cpumask(cpu)))) { -+ other_rq = cpu_rq(other_cpu); -+ -+ /* Set the mc_leader to the first CPU */ -+ if (!leader) -+ leader = rq; -+ if (!other_rq->mc_leader) -+ other_rq->mc_leader = leader; -+ } -+ if (rq->cpu_locality[other_cpu] > LOCALITY_MC) { -+ /* this is to get LLC into play even in case LLC sharing is not used */ -+ if (cpumask_test_cpu(other_cpu, llc_core_cpumask(cpu))) -+ rq->cpu_locality[other_cpu] = LOCALITY_MC_LLC; -+ else -+ rq->cpu_locality[other_cpu] = LOCALITY_MC; -+ } -+ } -+ rq->cache_idle = cache_cpu_idle; -+ } -+#endif -+#ifdef CONFIG_SCHED_SMT -+ leader = NULL; -+ if (cpumask_weight(thread_cpumask(cpu)) > 1) { -+ cpumask_copy(&rq->thread_mask, thread_cpumask(cpu)); -+ cpumask_clear_cpu(cpu, &rq->thread_mask); -+ for_each_cpu(other_cpu, thread_cpumask(cpu)) { -+ if (rqshare == RQSHARE_SMT) { -+ other_rq = cpu_rq(other_cpu); -+ -+ /* Set the smt_leader to the first CPU */ -+ if (!leader) -+ leader = rq; -+ if (!other_rq->smt_leader) -+ other_rq->smt_leader = leader; -+ } -+ if (rq->cpu_locality[other_cpu] > LOCALITY_SMT) -+ rq->cpu_locality[other_cpu] = LOCALITY_SMT; -+ } -+ rq->siblings_idle = siblings_cpu_idle; -+ smt_threads = true; -+ } -+#endif -+ } -+ -+#ifdef CONFIG_SMT_NICE -+ if (smt_threads) { -+ check_siblings = &check_smt_siblings; -+ wake_siblings = &wake_smt_siblings; -+ smt_schedule = &smt_should_schedule; -+ } -+#endif -+ -+ for_each_online_cpu(cpu) { -+ rq = cpu_rq(cpu); -+ for_each_online_cpu(other_cpu) { -+ printk(KERN_DEBUG "MuQSS locality CPU %d to %d: %d\n", cpu, other_cpu, rq->cpu_locality[other_cpu]); -+ } -+ } -+} -+ -+/* FIXME freeing locked spinlock */ -+static void __init 
share_and_free_rq(struct rq *leader, struct rq *rq) -+{ -+ WARN_ON(rq->nr_running > 0); -+ -+ kfree(rq->node); -+ kfree(rq->sl); -+ kfree(rq->lock); -+ rq->node = leader->node; -+ rq->sl = leader->sl; -+ rq->lock = leader->lock; -+ rq->is_leader = false; -+ barrier(); -+ /* To make up for not unlocking the freed runlock */ -+ preempt_enable(); -+} -+ -+static void __init share_rqs(void) -+{ -+ struct rq *rq, *leader; -+ int cpu; -+ -+ for_each_online_cpu(cpu) { -+ rq = cpu_rq(cpu); -+ leader = rq->smp_leader; -+ -+ rq_lock(rq); -+ if (leader && rq != leader) { -+ printk(KERN_INFO "MuQSS sharing SMP runqueue from CPU %d to CPU %d\n", -+ leader->cpu, rq->cpu); -+ share_and_free_rq(leader, rq); -+ } else -+ rq_unlock(rq); -+ } -+ -+#ifdef CONFIG_SCHED_MC -+ for_each_online_cpu(cpu) { -+ rq = cpu_rq(cpu); -+ leader = rq->mc_leader; -+ -+ rq_lock(rq); -+ if (leader && rq != leader) { -+ printk(KERN_INFO "MuQSS sharing MC runqueue from CPU %d to CPU %d\n", -+ leader->cpu, rq->cpu); -+ share_and_free_rq(leader, rq); -+ } else -+ rq_unlock(rq); -+ } -+#endif /* CONFIG_SCHED_MC */ -+ -+#ifdef CONFIG_SCHED_SMT -+ for_each_online_cpu(cpu) { -+ rq = cpu_rq(cpu); -+ leader = rq->smt_leader; -+ -+ rq_lock(rq); -+ if (leader && rq != leader) { -+ printk(KERN_INFO "MuQSS sharing SMT runqueue from CPU %d to CPU %d\n", -+ leader->cpu, rq->cpu); -+ share_and_free_rq(leader, rq); -+ } else -+ rq_unlock(rq); -+ } -+#endif /* CONFIG_SCHED_SMT */ -+} -+ -+static void __init setup_rq_orders(void) -+{ -+ int *selected_cpus, *ordered_cpus; -+ struct rq *rq, *other_rq; -+ int cpu, other_cpu, i; -+ -+ selected_cpus = kmalloc(sizeof(int) * NR_CPUS, GFP_ATOMIC); -+ ordered_cpus = kmalloc(sizeof(int) * NR_CPUS, GFP_ATOMIC); -+ -+ total_runqueues = 0; -+ for_each_online_cpu(cpu) { -+ int locality, total_rqs = 0, total_cpus = 0; -+ -+ rq = cpu_rq(cpu); -+ if (rq->is_leader) -+ total_runqueues++; -+ -+ for (locality = LOCALITY_SAME; locality <= LOCALITY_DISTANT; locality++) { -+ int 
selected_cpu_cnt, selected_cpu_idx, test_cpu_idx, cpu_idx, best_locality, test_cpu; -+ int ordered_cpus_idx; -+ -+ ordered_cpus_idx = -1; -+ selected_cpu_cnt = 0; -+ -+ for_each_online_cpu(test_cpu) { -+ if (cpu < num_online_cpus() / 2) -+ other_cpu = cpu + test_cpu; -+ else -+ other_cpu = cpu - test_cpu; -+ if (other_cpu < 0) -+ other_cpu += num_online_cpus(); -+ else -+ other_cpu %= num_online_cpus(); -+ /* gather CPUs of the same locality */ -+ if (rq->cpu_locality[other_cpu] == locality) { -+ selected_cpus[selected_cpu_cnt] = other_cpu; -+ selected_cpu_cnt++; -+ } -+ } -+ -+ /* reserve first CPU as starting point */ -+ if (selected_cpu_cnt > 0) { -+ ordered_cpus_idx++; -+ ordered_cpus[ordered_cpus_idx] = selected_cpus[ordered_cpus_idx]; -+ selected_cpus[ordered_cpus_idx] = -1; -+ } -+ -+ /* take each CPU and sort it within the same locality based on each inter-CPU localities */ -+ for (test_cpu_idx = 1; test_cpu_idx < selected_cpu_cnt; test_cpu_idx++) { -+ /* starting point with worst locality and current CPU */ -+ best_locality = LOCALITY_DISTANT; -+ selected_cpu_idx = test_cpu_idx; -+ -+ /* try to find the best locality within group */ -+ for (cpu_idx = 1; cpu_idx < selected_cpu_cnt; cpu_idx++) { -+ /* if CPU has not been used and locality is better */ -+ if (selected_cpus[cpu_idx] > -1) { -+ other_rq = cpu_rq(ordered_cpus[ordered_cpus_idx]); -+ if (best_locality > other_rq->cpu_locality[selected_cpus[cpu_idx]]) { -+ /* assign best locality and best CPU idx in array */ -+ best_locality = other_rq->cpu_locality[selected_cpus[cpu_idx]]; -+ selected_cpu_idx = cpu_idx; -+ } -+ } -+ } -+ -+ /* add our next best CPU to ordered list */ -+ ordered_cpus_idx++; -+ ordered_cpus[ordered_cpus_idx] = selected_cpus[selected_cpu_idx]; -+ /* mark this CPU as used */ -+ selected_cpus[selected_cpu_idx] = -1; -+ } -+ -+ /* set up RQ and CPU orders */ -+ for (test_cpu = 0; test_cpu <= ordered_cpus_idx; test_cpu++) { -+ other_rq = cpu_rq(ordered_cpus[test_cpu]); -+ /* set up cpu 
orders */ -+ rq->cpu_order[total_cpus++] = other_rq; -+ if (other_rq->is_leader) { -+ /* set up RQ orders */ -+ rq->rq_order[total_rqs++] = other_rq; -+ } -+ } -+ } -+ } -+ -+ kfree(selected_cpus); -+ kfree(ordered_cpus); -+ -+#ifdef CONFIG_X86 -+ for_each_online_cpu(cpu) { -+ rq = cpu_rq(cpu); -+ for (i = 0; i < total_runqueues; i++) { -+ printk(KERN_DEBUG "MuQSS CPU %d llc %d RQ order %d RQ %d llc %d\n", cpu, per_cpu(cpu_llc_id, cpu), i, -+ rq->rq_order[i]->cpu, per_cpu(cpu_llc_id, rq->rq_order[i]->cpu)); -+ } -+ } -+ -+ for_each_online_cpu(cpu) { -+ rq = cpu_rq(cpu); -+ for (i = 0; i < num_online_cpus(); i++) { -+ printk(KERN_DEBUG "MuQSS CPU %d llc %d CPU order %d RQ %d llc %d\n", cpu, per_cpu(cpu_llc_id, cpu), i, -+ rq->cpu_order[i]->cpu, per_cpu(cpu_llc_id, rq->cpu_order[i]->cpu)); -+ } -+ } -+#endif -+} -+ -+void __init sched_init_smp(void) -+{ -+ sched_init_numa(); -+ -+ /* -+ * There's no userspace yet to cause hotplug operations; hence all the -+ * cpu masks are stable and all blatant races in the below code cannot -+ * happen. 
-+ */ -+ mutex_lock(&sched_domains_mutex); -+ sched_init_domains(cpu_active_mask); -+ mutex_unlock(&sched_domains_mutex); -+ -+ /* Move init over to a non-isolated CPU */ -+ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) -+ BUG(); -+ -+ local_irq_disable(); -+ mutex_lock(&sched_domains_mutex); -+ lock_all_rqs(); -+ -+ printk(KERN_INFO "MuQSS possible/present/online CPUs: %d/%d/%d\n", -+ num_possible_cpus(), num_present_cpus(), num_online_cpus()); -+ -+ select_leaders(); -+ -+ unlock_all_rqs(); -+ mutex_unlock(&sched_domains_mutex); -+ -+ share_rqs(); -+ -+ local_irq_enable(); -+ -+ setup_rq_orders(); -+ -+ switch (rqshare) { -+ case RQSHARE_ALL: -+ /* This should only ever read 1 */ -+ printk(KERN_INFO "MuQSS runqueue share type ALL total runqueues: %d\n", -+ total_runqueues); -+ break; -+ case RQSHARE_SMP: -+ printk(KERN_INFO "MuQSS runqueue share type SMP total runqueues: %d\n", -+ total_runqueues); -+ break; -+ case RQSHARE_MC: -+ printk(KERN_INFO "MuQSS runqueue share type MC total runqueues: %d\n", -+ total_runqueues); -+ break; -+ case RQSHARE_MC_LLC: -+ printk(KERN_INFO "MuQSS runqueue share type LLC total runqueues: %d\n", -+ total_runqueues); -+ break; -+ case RQSHARE_SMT: -+ printk(KERN_INFO "MuQSS runqueue share type SMT total runqueues: %d\n", -+ total_runqueues); -+ break; -+ case RQSHARE_NONE: -+ printk(KERN_INFO "MuQSS runqueue share type NONE total runqueues: %d\n", -+ total_runqueues); -+ break; -+ } -+ -+ sched_smp_initialized = true; -+} -+#else -+void __init sched_init_smp(void) -+{ -+ sched_smp_initialized = true; -+} -+#endif /* CONFIG_SMP */ -+ -+int in_sched_functions(unsigned long addr) -+{ -+ return in_lock_functions(addr) || -+ (addr >= (unsigned long)__sched_text_start -+ && addr < (unsigned long)__sched_text_end); -+} -+ -+#ifdef CONFIG_CGROUP_SCHED -+/* task group related information */ -+struct task_group { -+ struct cgroup_subsys_state css; -+ -+ struct rcu_head rcu; -+ struct list_head list; -+ -+ 
struct task_group *parent; -+ struct list_head siblings; -+ struct list_head children; -+}; -+ -+/* -+ * Default task group. -+ * Every task in system belongs to this group at bootup. -+ */ -+struct task_group root_task_group; -+LIST_HEAD(task_groups); -+ -+/* Cacheline aligned slab cache for task_group */ -+static struct kmem_cache *task_group_cache __read_mostly; -+#endif /* CONFIG_CGROUP_SCHED */ -+ -+void __init sched_init(void) -+{ -+#ifdef CONFIG_SMP -+ int cpu_ids; -+#endif -+ int i; -+ struct rq *rq; -+ -+ wait_bit_init(); -+ -+ prio_ratios[0] = 128; -+ for (i = 1 ; i < NICE_WIDTH ; i++) -+ prio_ratios[i] = prio_ratios[i - 1] * 11 / 10; -+ -+ skiplist_node_init(&init_task.node); -+ -+#ifdef CONFIG_SMP -+ init_defrootdomain(); -+ cpumask_clear(&cpu_idle_map); -+#else -+ uprq = &per_cpu(runqueues, 0); -+#endif -+ -+#ifdef CONFIG_CGROUP_SCHED -+ task_group_cache = KMEM_CACHE(task_group, 0); -+ -+ list_add(&root_task_group.list, &task_groups); -+ INIT_LIST_HEAD(&root_task_group.children); -+ INIT_LIST_HEAD(&root_task_group.siblings); -+#endif /* CONFIG_CGROUP_SCHED */ -+ for_each_possible_cpu(i) { -+ rq = cpu_rq(i); -+ rq->node = kmalloc(sizeof(skiplist_node), GFP_ATOMIC); -+ skiplist_init(rq->node); -+ rq->sl = new_skiplist(rq->node); -+ rq->lock = kmalloc(sizeof(raw_spinlock_t), GFP_ATOMIC); -+ raw_spin_lock_init(rq->lock); -+ rq->nr_running = 0; -+ rq->nr_uninterruptible = 0; -+ rq->nr_switches = 0; -+ rq->clock = rq->old_clock = rq->last_niffy = rq->niffies = 0; -+ rq->last_jiffy = jiffies; -+ rq->user_ns = rq->nice_ns = rq->softirq_ns = rq->system_ns = -+ rq->iowait_ns = rq->idle_ns = 0; -+ rq->dither = 0; -+ set_rq_task(rq, &init_task); -+ rq->iso_ticks = 0; -+ rq->iso_refractory = false; -+#ifdef CONFIG_SMP -+ rq->is_leader = true; -+ rq->smp_leader = NULL; -+#ifdef CONFIG_SCHED_MC -+ rq->mc_leader = NULL; -+#endif -+#ifdef CONFIG_SCHED_SMT -+ rq->smt_leader = NULL; -+#endif -+ rq->sd = NULL; -+ rq->rd = NULL; -+ rq->online = false; -+ rq->cpu = i; -+ 
rq_attach_root(rq, &def_root_domain); -+#endif -+ init_rq_hrexpiry(rq); -+ atomic_set(&rq->nr_iowait, 0); -+ } -+ -+#ifdef CONFIG_SMP -+ cpu_ids = i; -+ /* -+ * Set the base locality for cpu cache distance calculation to -+ * "distant" (3). Make sure the distance from a CPU to itself is 0. -+ */ -+ for_each_possible_cpu(i) { -+ int j; -+ -+ rq = cpu_rq(i); -+#ifdef CONFIG_SCHED_SMT -+ rq->siblings_idle = sole_cpu_idle; -+#endif -+#ifdef CONFIG_SCHED_MC -+ rq->cache_idle = sole_cpu_idle; -+#endif -+ rq->cpu_locality = kmalloc(cpu_ids * sizeof(int *), GFP_ATOMIC); -+ for_each_possible_cpu(j) { -+ if (i == j) -+ rq->cpu_locality[j] = LOCALITY_SAME; -+ else -+ rq->cpu_locality[j] = LOCALITY_DISTANT; -+ } -+ rq->rq_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC); -+ rq->cpu_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC); -+ rq->rq_order[0] = rq->cpu_order[0] = rq; -+ for (j = 1; j < cpu_ids; j++) -+ rq->rq_order[j] = rq->cpu_order[j] = cpu_rq(j); -+ } -+#endif -+ -+ /* -+ * The boot idle thread does lazy MMU switching as well: -+ */ -+ mmgrab(&init_mm); -+ enter_lazy_tlb(&init_mm, current); -+ -+ /* -+ * Make us the idle thread. Technically, schedule() should not be -+ * called from this thread, however somewhere below it might be, -+ * but because we are the idle thread, we just pick up running again -+ * when this runqueue becomes "idle". 
-+ */ -+ init_idle(current, smp_processor_id()); -+ -+#ifdef CONFIG_SMP -+ idle_thread_set_boot_cpu(); -+#endif /* SMP */ -+ -+ init_schedstats(); -+ -+ psi_init(); -+} -+ -+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP -+static inline int preempt_count_equals(int preempt_offset) -+{ -+ int nested = preempt_count() + rcu_preempt_depth(); -+ -+ return (nested == preempt_offset); -+} -+ -+void __might_sleep(const char *file, int line, int preempt_offset) -+{ -+ /* -+ * Blocking primitives will set (and therefore destroy) current->state, -+ * since we will exit with TASK_RUNNING make sure we enter with it, -+ * otherwise we will destroy state. -+ */ -+ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, -+ "do not call blocking ops when !TASK_RUNNING; " -+ "state=%lx set at [<%p>] %pS\n", -+ current->state, -+ (void *)current->task_state_change, -+ (void *)current->task_state_change); -+ -+ ___might_sleep(file, line, preempt_offset); -+} -+EXPORT_SYMBOL(__might_sleep); -+ -+void __cant_sleep(const char *file, int line, int preempt_offset) -+{ -+ static unsigned long prev_jiffy; -+ -+ if (irqs_disabled()) -+ return; -+ -+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) -+ return; -+ -+ if (preempt_count() > preempt_offset) -+ return; -+ -+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) -+ return; -+ prev_jiffy = jiffies; -+ -+ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); -+ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", -+ in_atomic(), irqs_disabled(), -+ current->pid, current->comm); -+ -+ debug_show_held_locks(current); -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+EXPORT_SYMBOL_GPL(__cant_sleep); -+ -+void ___might_sleep(const char *file, int line, int preempt_offset) -+{ -+ /* Ratelimiting timestamp: */ -+ static unsigned long prev_jiffy; -+ -+ unsigned long preempt_disable_ip; -+ -+ /* WARN_ON_ONCE() by default, no rate limit required: */ -+ rcu_sleep_check(); -+ -+ if 
((preempt_count_equals(preempt_offset) && !irqs_disabled() && -+ !is_idle_task(current) && !current->non_block_count) || -+ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || -+ oops_in_progress) -+ return; -+ -+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) -+ return; -+ prev_jiffy = jiffies; -+ -+ /* Save this before calling printk(), since that will clobber it: */ -+ preempt_disable_ip = get_preempt_disable_ip(current); -+ -+ printk(KERN_ERR -+ "BUG: sleeping function called from invalid context at %s:%d\n", -+ file, line); -+ printk(KERN_ERR -+ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", -+ in_atomic(), irqs_disabled(), current->non_block_count, -+ current->pid, current->comm); -+ -+ if (task_stack_end_corrupted(current)) -+ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); -+ -+ debug_show_held_locks(current); -+ if (irqs_disabled()) -+ print_irqtrace_events(current); -+ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) -+ && !preempt_count_equals(preempt_offset)) { -+ pr_err("Preemption disabled at:"); -+ print_ip_sym(KERN_ERR, preempt_disable_ip); -+ } -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+EXPORT_SYMBOL(___might_sleep); -+#endif -+ -+#ifdef CONFIG_MAGIC_SYSRQ -+static inline void normalise_rt_tasks(void) -+{ -+ struct sched_attr attr = {}; -+ struct task_struct *g, *p; -+ struct rq_flags rf; -+ struct rq *rq; -+ -+ read_lock(&tasklist_lock); -+ for_each_process_thread(g, p) { -+ /* -+ * Only normalize user tasks: -+ */ -+ if (p->flags & PF_KTHREAD) -+ continue; -+ -+ if (!rt_task(p) && !iso_task(p)) -+ continue; -+ -+ rq = task_rq_lock(p, &rf); -+ __setscheduler(p, rq, SCHED_NORMAL, 0, &attr, false); -+ task_rq_unlock(rq, p, &rf); -+ } -+ read_unlock(&tasklist_lock); -+} -+ -+void normalize_rt_tasks(void) -+{ -+ normalise_rt_tasks(); -+} -+#endif /* CONFIG_MAGIC_SYSRQ */ -+ -+#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) -+/* -+ * These functions are only useful 
for the IA64 MCA handling, or kdb. -+ * -+ * They can only be called when the whole system has been -+ * stopped - every CPU needs to be quiescent, and no scheduling -+ * activity can take place. Using them for anything else would -+ * be a serious bug, and as a result, they aren't even visible -+ * under any other configuration. -+ */ -+ -+/** -+ * curr_task - return the current task for a given CPU. -+ * @cpu: the processor in question. -+ * -+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! -+ * -+ * Return: The current task for @cpu. -+ */ -+struct task_struct *curr_task(int cpu) -+{ -+ return cpu_curr(cpu); -+} -+ -+#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ -+ -+#ifdef CONFIG_IA64 -+/** -+ * ia64_set_curr_task - set the current task for a given CPU. -+ * @cpu: the processor in question. -+ * @p: the task pointer to set. -+ * -+ * Description: This function must only be used when non-maskable interrupts -+ * are serviced on a separate stack. It allows the architecture to switch the -+ * notion of the current task on a CPU in a non-blocking manner. This function -+ * must be called with all CPU's synchronised, and interrupts disabled, the -+ * and caller must save the original value of the current task (see -+ * curr_task() above) and restore that value before reenabling interrupts and -+ * re-starting the system. -+ * -+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 
-+ */ -+void ia64_set_curr_task(int cpu, struct task_struct *p) -+{ -+ cpu_curr(cpu) = p; -+} -+ -+#endif -+ -+void init_idle_bootup_task(struct task_struct *idle) -+{} -+ -+#ifdef CONFIG_SCHED_DEBUG -+__read_mostly bool sched_debug_enabled; -+ -+void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, -+ struct seq_file *m) -+{ -+ seq_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), -+ get_nr_threads(p)); -+} -+ -+void proc_sched_set_task(struct task_struct *p) -+{} -+#endif -+ -+#ifdef CONFIG_CGROUP_SCHED -+static void sched_free_group(struct task_group *tg) -+{ -+ kmem_cache_free(task_group_cache, tg); -+} -+ -+/* allocate runqueue etc for a new task group */ -+struct task_group *sched_create_group(struct task_group *parent) -+{ -+ struct task_group *tg; -+ -+ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); -+ if (!tg) -+ return ERR_PTR(-ENOMEM); -+ -+ return tg; -+} -+ -+void sched_online_group(struct task_group *tg, struct task_group *parent) -+{ -+} -+ -+/* rcu callback to free various structures associated with a task group */ -+static void sched_free_group_rcu(struct rcu_head *rhp) -+{ -+ /* Now it should be safe to free those cfs_rqs */ -+ sched_free_group(container_of(rhp, struct task_group, rcu)); -+} -+ -+void sched_destroy_group(struct task_group *tg) -+{ -+ /* Wait for possible concurrent references to cfs_rqs complete */ -+ call_rcu(&tg->rcu, sched_free_group_rcu); -+} -+ -+void sched_offline_group(struct task_group *tg) -+{ -+} -+ -+static inline struct task_group *css_tg(struct cgroup_subsys_state *css) -+{ -+ return css ? 
container_of(css, struct task_group, css) : NULL; -+} -+ -+static struct cgroup_subsys_state * -+cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) -+{ -+ struct task_group *parent = css_tg(parent_css); -+ struct task_group *tg; -+ -+ if (!parent) { -+ /* This is early initialization for the top cgroup */ -+ return &root_task_group.css; -+ } -+ -+ tg = sched_create_group(parent); -+ if (IS_ERR(tg)) -+ return ERR_PTR(-ENOMEM); -+ return &tg->css; -+} -+ -+/* Expose task group only after completing cgroup initialization */ -+static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ struct task_group *parent = css_tg(css->parent); -+ -+ if (parent) -+ sched_online_group(tg, parent); -+ return 0; -+} -+ -+static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ -+ sched_offline_group(tg); -+} -+ -+static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ -+ /* -+ * Relies on the RCU grace period between css_released() and this. 
-+ */ -+ sched_free_group(tg); -+} -+ -+static void cpu_cgroup_fork(struct task_struct *task) -+{ -+} -+ -+static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) -+{ -+ return 0; -+} -+ -+static void cpu_cgroup_attach(struct cgroup_taskset *tset) -+{ -+} -+ -+static struct cftype cpu_legacy_files[] = { -+ { } /* Terminate */ -+}; -+ -+static struct cftype cpu_files[] = { -+ { } /* terminate */ -+}; -+ -+static int cpu_extra_stat_show(struct seq_file *sf, -+ struct cgroup_subsys_state *css) -+{ -+ return 0; -+} -+ -+struct cgroup_subsys cpu_cgrp_subsys = { -+ .css_alloc = cpu_cgroup_css_alloc, -+ .css_online = cpu_cgroup_css_online, -+ .css_released = cpu_cgroup_css_released, -+ .css_free = cpu_cgroup_css_free, -+ .css_extra_stat_show = cpu_extra_stat_show, -+ .fork = cpu_cgroup_fork, -+ .can_attach = cpu_cgroup_can_attach, -+ .attach = cpu_cgroup_attach, -+ .legacy_cftypes = cpu_files, -+ .legacy_cftypes = cpu_legacy_files, -+ .dfl_cftypes = cpu_files, -+ .early_init = true, -+ .threaded = true, -+}; -+#endif /* CONFIG_CGROUP_SCHED */ -+ -+void call_trace_sched_update_nr_running(struct rq *rq, int count) -+{ -+ trace_sched_update_nr_running_tp(rq, count); -+} -+ -+/* CFS Compat */ -+#ifdef CONFIG_RCU_TORTURE_TEST -+int sysctl_sched_rt_runtime; -+#endif -diff --git a/kernel/sched/MuQSS.h b/kernel/sched/MuQSS.h -new file mode 100644 -index 000000000000..09a1f2fe64ba ---- /dev/null -+++ b/kernel/sched/MuQSS.h -@@ -0,0 +1,1070 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef MUQSS_SCHED_H -+#define MUQSS_SCHED_H -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include 
-+#include -+ -+#ifdef CONFIG_PARAVIRT -+#include -+#endif -+ -+#include "cpupri.h" -+ -+#include -+ -+#ifdef CONFIG_SCHED_DEBUG -+# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) -+#else -+# define SCHED_WARN_ON(x) ((void)(x)) -+#endif -+ -+/* -+ * wake flags -+ */ -+#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ -+#define WF_FORK 0x02 /* child wakeup after fork */ -+#define WF_MIGRATED 0x04 /* internal use, task got migrated */ -+#define WF_ON_CPU 0x08 /* Wakee is on_cpu */ -+ -+/* task_struct::on_rq states: */ -+#define TASK_ON_RQ_QUEUED 1 -+#define TASK_ON_RQ_MIGRATING 2 -+ -+extern void call_trace_sched_update_nr_running(struct rq *rq, int count); -+ -+struct rq; -+ -+#ifdef CONFIG_SMP -+ -+static inline bool sched_asym_prefer(int a, int b) -+{ -+ return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b); -+} -+ -+struct perf_domain { -+ struct em_perf_domain *em_pd; -+ struct perf_domain *next; -+ struct rcu_head rcu; -+}; -+ -+/* Scheduling group status flags */ -+#define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */ -+#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */ -+ -+/* -+ * We add the notion of a root-domain which will be used to define per-domain -+ * variables. Each exclusive cpuset essentially defines an island domain by -+ * fully partitioning the member cpus from any other cpuset. Whenever a new -+ * exclusive cpuset is created, we also create and attach a new root-domain -+ * object. 
-+ * -+ */ -+struct root_domain { -+ atomic_t refcount; -+ atomic_t rto_count; -+ struct rcu_head rcu; -+ cpumask_var_t span; -+ cpumask_var_t online; -+ -+ /* -+ * Indicate pullable load on at least one CPU, e.g: -+ * - More than one runnable task -+ * - Running task is misfit -+ */ -+ int overload; -+ -+ /* Indicate one or more cpus over-utilized (tipping point) */ -+ int overutilized; -+ -+ /* -+ * The bit corresponding to a CPU gets set here if such CPU has more -+ * than one runnable -deadline task (as it is below for RT tasks). -+ */ -+ cpumask_var_t dlo_mask; -+ atomic_t dlo_count; -+ /* Replace unused CFS structures with void */ -+ //struct dl_bw dl_bw; -+ //struct cpudl cpudl; -+ void *dl_bw; -+ void *cpudl; -+ -+ /* -+ * The "RT overload" flag: it gets set if a CPU has more than -+ * one runnable RT task. -+ */ -+ cpumask_var_t rto_mask; -+ //struct cpupri cpupri; -+ void *cpupri; -+ -+ unsigned long max_cpu_capacity; -+ -+ /* -+ * NULL-terminated list of performance domains intersecting with the -+ * CPUs of the rd. Protected by RCU. -+ */ -+ struct perf_domain *pd; -+}; -+ -+extern void init_defrootdomain(void); -+extern int sched_init_domains(const struct cpumask *cpu_map); -+extern void rq_attach_root(struct rq *rq, struct root_domain *rd); -+ -+static inline void cpupri_cleanup(void __maybe_unused *cpupri) -+{ -+} -+ -+static inline void cpudl_cleanup(void __maybe_unused *cpudl) -+{ -+} -+ -+static inline void init_dl_bw(void __maybe_unused *dl_bw) -+{ -+} -+ -+static inline int cpudl_init(void __maybe_unused *dl_bw) -+{ -+ return 0; -+} -+ -+static inline int cpupri_init(void __maybe_unused *cpupri) -+{ -+ return 0; -+} -+#endif /* CONFIG_SMP */ -+ -+/* -+ * This is the main, per-CPU runqueue data structure. -+ * This data should only be modified by the local cpu. 
-+ */ -+struct rq { -+ raw_spinlock_t *lock; -+ raw_spinlock_t *orig_lock; -+ -+ struct task_struct __rcu *curr; -+ struct task_struct *idle; -+ struct task_struct *stop; -+ struct mm_struct *prev_mm; -+ -+ unsigned int nr_running; -+ /* -+ * This is part of a global counter where only the total sum -+ * over all CPUs matters. A task can increase this counter on -+ * one CPU and if it got migrated afterwards it may decrease -+ * it on another CPU. Always updated under the runqueue lock: -+ */ -+ unsigned long nr_uninterruptible; -+#ifdef CONFIG_SMP -+ unsigned int ttwu_pending; -+#endif -+ u64 nr_switches; -+ -+ /* Stored data about rq->curr to work outside rq lock */ -+ u64 rq_deadline; -+ int rq_prio; -+ -+ /* Best queued id for use outside lock */ -+ u64 best_key; -+ -+ unsigned long last_scheduler_tick; /* Last jiffy this RQ ticked */ -+ unsigned long last_jiffy; /* Last jiffy this RQ updated rq clock */ -+ u64 niffies; /* Last time this RQ updated rq clock */ -+ u64 last_niffy; /* Last niffies as updated by local clock */ -+ u64 last_jiffy_niffies; /* Niffies @ last_jiffy */ -+ -+ u64 load_update; /* When we last updated load */ -+ unsigned long load_avg; /* Rolling load average */ -+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ -+ u64 irq_load_update; /* When we last updated IRQ load */ -+ unsigned long irq_load_avg; /* Rolling IRQ load average */ -+#endif -+#ifdef CONFIG_SMT_NICE -+ struct mm_struct *rq_mm; -+ int rq_smt_bias; /* Policy/nice level bias across smt siblings */ -+#endif -+ /* Accurate timekeeping data */ -+ unsigned long user_ns, nice_ns, irq_ns, softirq_ns, system_ns, -+ iowait_ns, idle_ns; -+ atomic_t nr_iowait; -+ -+#ifdef CONFIG_MEMBARRIER -+ int membarrier_state; -+#endif -+ -+ skiplist_node *node; -+ skiplist *sl; -+#ifdef CONFIG_SMP -+ struct task_struct *preempt; /* Preempt triggered on this task */ -+ struct task_struct *preempting; /* Hint only, what task is preempting */ -+ -+ int cpu; /* cpu of this runqueue */ -+ bool online; -+ -+ struct 
root_domain *rd; -+ struct sched_domain *sd; -+ -+ unsigned long cpu_capacity_orig; -+ -+ int *cpu_locality; /* CPU relative cache distance */ -+ struct rq **rq_order; /* Shared RQs ordered by relative cache distance */ -+ struct rq **cpu_order; /* RQs of discrete CPUs ordered by distance */ -+ -+ bool is_leader; -+ struct rq *smp_leader; /* First physical CPU per node */ -+#ifdef CONFIG_SCHED_THERMAL_PRESSURE -+ struct sched_avg avg_thermal; -+#endif /* CONFIG_SCHED_THERMAL_PRESSURE */ -+#ifdef CONFIG_SCHED_SMT -+ struct rq *smt_leader; /* First logical CPU in SMT siblings */ -+ cpumask_t thread_mask; -+ bool (*siblings_idle)(struct rq *rq); -+ /* See if all smt siblings are idle */ -+#endif /* CONFIG_SCHED_SMT */ -+#ifdef CONFIG_SCHED_MC -+ struct rq *mc_leader; /* First logical CPU in MC siblings */ -+ cpumask_t core_mask; -+ bool (*cache_idle)(struct rq *rq); -+ /* See if all cache siblings are idle */ -+#endif /* CONFIG_SCHED_MC */ -+#endif /* CONFIG_SMP */ -+ -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+ u64 prev_irq_time; -+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ -+#ifdef CONFIG_PARAVIRT -+ u64 prev_steal_time; -+#endif /* CONFIG_PARAVIRT */ -+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING -+ u64 prev_steal_time_rq; -+#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ -+ -+ u64 clock, old_clock, last_tick; -+ /* Ensure that all clocks are in the same cache line */ -+ u64 clock_task ____cacheline_aligned; -+ int dither; -+ -+ int iso_ticks; -+ bool iso_refractory; -+ -+#ifdef CONFIG_HIGH_RES_TIMERS -+ struct hrtimer hrexpiry_timer; -+#endif -+ -+ int rt_nr_running; /* Number real time tasks running */ -+#ifdef CONFIG_SCHEDSTATS -+ -+ /* latency stats */ -+ struct sched_info rq_sched_info; -+ unsigned long long rq_cpu_time; -+ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? 
*/ -+ -+ /* sys_sched_yield() stats */ -+ unsigned int yld_count; -+ -+ /* schedule() stats */ -+ unsigned int sched_switch; -+ unsigned int sched_count; -+ unsigned int sched_goidle; -+ -+ /* try_to_wake_up() stats */ -+ unsigned int ttwu_count; -+ unsigned int ttwu_local; -+#endif /* CONFIG_SCHEDSTATS */ -+ -+#ifdef CONFIG_CPU_IDLE -+ /* Must be inspected within a rcu lock section */ -+ struct cpuidle_state *idle_state; -+#endif -+}; -+ -+static inline u64 __rq_clock_broken(struct rq *rq) -+{ -+ return READ_ONCE(rq->clock); -+} -+ -+static inline u64 rq_clock(struct rq *rq) -+{ -+ lockdep_assert_held(rq->lock); -+ -+ return rq->clock; -+} -+ -+static inline u64 rq_clock_task(struct rq *rq) -+{ -+ lockdep_assert_held(rq->lock); -+ -+ return rq->clock_task; -+} -+ -+/** -+ * By default the decay is the default pelt decay period. -+ * The decay shift can change the decay period in -+ * multiples of 32. -+ * Decay shift Decay period(ms) -+ * 0 32 -+ * 1 64 -+ * 2 128 -+ * 3 256 -+ * 4 512 -+ */ -+extern int sched_thermal_decay_shift; -+ -+static inline u64 rq_clock_thermal(struct rq *rq) -+{ -+ return rq_clock_task(rq) >> sched_thermal_decay_shift; -+} -+ -+struct rq_flags { -+ unsigned long flags; -+}; -+ -+#ifdef CONFIG_SMP -+struct rq *cpu_rq(int cpu); -+#endif -+ -+#ifndef CONFIG_SMP -+extern struct rq *uprq; -+#define cpu_rq(cpu) (uprq) -+#define this_rq() (uprq) -+#define raw_rq() (uprq) -+#define task_rq(p) (uprq) -+#define cpu_curr(cpu) ((uprq)->curr) -+#else /* CONFIG_SMP */ -+DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -+#define this_rq() this_cpu_ptr(&runqueues) -+#define raw_rq() raw_cpu_ptr(&runqueues) -+#define task_rq(p) cpu_rq(task_cpu(p)) -+#endif /* CONFIG_SMP */ -+ -+static inline int task_current(struct rq *rq, struct task_struct *p) -+{ -+ return rq->curr == p; -+} -+ -+static inline int task_running(struct rq *rq, struct task_struct *p) -+{ -+#ifdef CONFIG_SMP -+ return p->on_cpu; -+#else -+ return task_current(rq, p); -+#endif -+} -+ 
-+static inline int task_on_rq_queued(struct task_struct *p) -+{ -+ return p->on_rq == TASK_ON_RQ_QUEUED; -+} -+ -+static inline int task_on_rq_migrating(struct task_struct *p) -+{ -+ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; -+} -+ -+static inline void rq_lock(struct rq *rq) -+ __acquires(rq->lock) -+{ -+ raw_spin_lock(rq->lock); -+} -+ -+static inline void rq_unlock(struct rq *rq) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock(rq->lock); -+} -+ -+static inline void rq_lock_irq(struct rq *rq) -+ __acquires(rq->lock) -+{ -+ raw_spin_lock_irq(rq->lock); -+} -+ -+static inline void rq_unlock_irq(struct rq *rq, struct rq_flags __always_unused *rf) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock_irq(rq->lock); -+} -+ -+static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) -+ __acquires(rq->lock) -+{ -+ raw_spin_lock_irqsave(rq->lock, rf->flags); -+} -+ -+static inline void rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock_irqrestore(rq->lock, rf->flags); -+} -+ -+static inline struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(p->pi_lock) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ while (42) { -+ raw_spin_lock_irqsave(&p->pi_lock, rf->flags); -+ rq = task_rq(p); -+ raw_spin_lock(rq->lock); -+ if (likely(rq == task_rq(p))) -+ break; -+ raw_spin_unlock(rq->lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); -+ } -+ return rq; -+} -+ -+static inline void task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) -+ __releases(rq->lock) -+ __releases(p->pi_lock) -+{ -+ rq_unlock(rq); -+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); -+} -+ -+static inline struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags __always_unused *rf) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ lockdep_assert_held(&p->pi_lock); -+ -+ while (42) { -+ rq = task_rq(p); -+ raw_spin_lock(rq->lock); -+ if (likely(rq == task_rq(p))) -+ 
break; -+ raw_spin_unlock(rq->lock); -+ } -+ return rq; -+} -+ -+static inline void __task_rq_unlock(struct rq *rq, struct rq_flags __always_unused *rf) -+{ -+ rq_unlock(rq); -+} -+ -+static inline struct rq * -+this_rq_lock_irq(struct rq_flags *rf) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ local_irq_disable(); -+ rq = this_rq(); -+ rq_lock(rq); -+ return rq; -+} -+ -+/* -+ * {de,en}queue flags: Most not used on MuQSS. -+ * -+ * DEQUEUE_SLEEP - task is no longer runnable -+ * ENQUEUE_WAKEUP - task just became runnable -+ * -+ * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks -+ * are in a known state which allows modification. Such pairs -+ * should preserve as much state as possible. -+ * -+ * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location -+ * in the runqueue. -+ * -+ * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) -+ * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) -+ * ENQUEUE_MIGRATED - the task was migrated during wakeup -+ * -+ */ -+ -+#define DEQUEUE_SLEEP 0x01 -+#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ -+ -+#define ENQUEUE_WAKEUP 0x01 -+#define ENQUEUE_RESTORE 0x02 -+ -+#ifdef CONFIG_SMP -+#define ENQUEUE_MIGRATED 0x40 -+#else -+#define ENQUEUE_MIGRATED 0x00 -+#endif -+ -+#ifdef CONFIG_NUMA -+enum numa_topology_type { -+ NUMA_DIRECT, -+ NUMA_GLUELESS_MESH, -+ NUMA_BACKPLANE, -+}; -+extern enum numa_topology_type sched_numa_topology_type; -+extern int sched_max_numa_distance; -+extern bool find_numa_distance(int distance); -+extern void sched_init_numa(void); -+extern void sched_domains_numa_masks_set(unsigned int cpu); -+extern void sched_domains_numa_masks_clear(unsigned int cpu); -+extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); -+#else -+static inline void sched_init_numa(void) { } -+static inline void sched_domains_numa_masks_set(unsigned int cpu) { } -+static inline void sched_domains_numa_masks_clear(unsigned 
int cpu) { } -+static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) -+{ -+ return nr_cpu_ids; -+} -+#endif -+ -+extern struct mutex sched_domains_mutex; -+extern struct static_key_false sched_schedstats; -+ -+#define rcu_dereference_check_sched_domain(p) \ -+ rcu_dereference_check((p), \ -+ lockdep_is_held(&sched_domains_mutex)) -+ -+#ifdef CONFIG_SMP -+ -+/* -+ * The domain tree (rq->sd) is protected by RCU's quiescent state transition. -+ * See destroy_sched_domains: call_rcu for details. -+ * -+ * The domain tree of any CPU may only be accessed from within -+ * preempt-disabled sections. -+ */ -+#define for_each_domain(cpu, __sd) \ -+ for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ -+ __sd; __sd = __sd->parent) -+ -+/** -+ * highest_flag_domain - Return highest sched_domain containing flag. -+ * @cpu: The cpu whose highest level of sched domain is to -+ * be returned. -+ * @flag: The flag to check for the highest sched_domain -+ * for the given cpu. -+ * -+ * Returns the highest sched_domain of a cpu which contains the given flag. 
-+ */ -+static inline struct sched_domain *highest_flag_domain(int cpu, int flag) -+{ -+ struct sched_domain *sd, *hsd = NULL; -+ -+ for_each_domain(cpu, sd) { -+ if (!(sd->flags & flag)) -+ break; -+ hsd = sd; -+ } -+ -+ return hsd; -+} -+ -+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) -+{ -+ struct sched_domain *sd; -+ -+ for_each_domain(cpu, sd) { -+ if (sd->flags & flag) -+ break; -+ } -+ -+ return sd; -+} -+ -+DECLARE_PER_CPU(struct sched_domain *, sd_llc); -+DECLARE_PER_CPU(int, sd_llc_size); -+DECLARE_PER_CPU(int, sd_llc_id); -+DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); -+DECLARE_PER_CPU(struct sched_domain *, sd_numa); -+DECLARE_PER_CPU(struct sched_domain *, sd_asym_packing); -+DECLARE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity); -+ -+struct sched_group_capacity { -+ atomic_t ref; -+ /* -+ * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity -+ * for a single CPU. -+ */ -+ unsigned long capacity; -+ unsigned long min_capacity; /* Min per-CPU capacity in group */ -+ unsigned long max_capacity; /* Max per-CPU capacity in group */ -+ unsigned long next_update; -+ int imbalance; /* XXX unrelated to capacity but shared group state */ -+ -+#ifdef CONFIG_SCHED_DEBUG -+ int id; -+#endif -+ -+ unsigned long cpumask[]; /* balance mask */ -+}; -+ -+struct sched_group { -+ struct sched_group *next; /* Must be a circular list */ -+ atomic_t ref; -+ -+ unsigned int group_weight; -+ struct sched_group_capacity *sgc; -+ int asym_prefer_cpu; /* cpu of highest priority in group */ -+ -+ /* -+ * The CPUs this group covers. -+ * -+ * NOTE: this field is variable length. 
(Allocated dynamically -+ * by attaching extra space to the end of the structure, -+ * depending on how many CPUs the kernel has booted up with) -+ */ -+ unsigned long cpumask[0]; -+}; -+ -+static inline struct cpumask *sched_group_span(struct sched_group *sg) -+{ -+ return to_cpumask(sg->cpumask); -+} -+ -+/* -+ * See build_balance_mask(). -+ */ -+static inline struct cpumask *group_balance_mask(struct sched_group *sg) -+{ -+ return to_cpumask(sg->sgc->cpumask); -+} -+ -+/** -+ * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. -+ * @group: The group whose first cpu is to be returned. -+ */ -+static inline unsigned int group_first_cpu(struct sched_group *group) -+{ -+ return cpumask_first(sched_group_span(group)); -+} -+ -+ -+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) -+void register_sched_domain_sysctl(void); -+void dirty_sched_domain_sysctl(int cpu); -+void unregister_sched_domain_sysctl(void); -+#else -+static inline void register_sched_domain_sysctl(void) -+{ -+} -+static inline void dirty_sched_domain_sysctl(int cpu) -+{ -+} -+static inline void unregister_sched_domain_sysctl(void) -+{ -+} -+#endif -+ -+extern void flush_smp_call_function_from_idle(void); -+ -+extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); -+extern void set_rq_online (struct rq *rq); -+extern void set_rq_offline(struct rq *rq); -+extern bool sched_smp_initialized; -+ -+static inline void update_group_capacity(struct sched_domain *sd, int cpu) -+{ -+} -+ -+static inline void trigger_load_balance(struct rq *rq) -+{ -+} -+ -+#define sched_feat(x) 0 -+ -+#else /* CONFIG_SMP */ -+ -+static inline void flush_smp_call_function_from_idle(void) { } -+ -+#endif /* CONFIG_SMP */ -+ -+#ifdef CONFIG_CPU_IDLE -+static inline void idle_set_state(struct rq *rq, -+ struct cpuidle_state *idle_state) -+{ -+ rq->idle_state = idle_state; -+} -+ -+static inline struct cpuidle_state *idle_get_state(struct rq *rq) -+{ -+ 
SCHED_WARN_ON(!rcu_read_lock_held()); -+ return rq->idle_state; -+} -+#else -+static inline void idle_set_state(struct rq *rq, -+ struct cpuidle_state *idle_state) -+{ -+} -+ -+static inline struct cpuidle_state *idle_get_state(struct rq *rq) -+{ -+ return NULL; -+} -+#endif -+ -+#ifdef CONFIG_SCHED_DEBUG -+extern bool sched_debug_enabled; -+#endif -+ -+extern void schedule_idle(void); -+ -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+struct irqtime { -+ u64 total; -+ u64 tick_delta; -+ u64 irq_start_time; -+ struct u64_stats_sync sync; -+}; -+ -+DECLARE_PER_CPU(struct irqtime, cpu_irqtime); -+ -+/* -+ * Returns the irqtime minus the softirq time computed by ksoftirqd. -+ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime -+ * and never move forward. -+ */ -+static inline u64 irq_time_read(int cpu) -+{ -+ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); -+ unsigned int seq; -+ u64 total; -+ -+ do { -+ seq = __u64_stats_fetch_begin(&irqtime->sync); -+ total = irqtime->total; -+ } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); -+ -+ return total; -+} -+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ -+ -+static inline bool sched_stop_runnable(struct rq *rq) -+{ -+ return rq->stop && task_on_rq_queued(rq->stop); -+} -+ -+#ifdef CONFIG_SMP -+static inline int cpu_of(struct rq *rq) -+{ -+ return rq->cpu; -+} -+#else /* CONFIG_SMP */ -+static inline int cpu_of(struct rq *rq) -+{ -+ return 0; -+} -+#endif -+ -+#ifdef CONFIG_CPU_FREQ -+DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); -+ -+static inline void cpufreq_trigger(struct rq *rq, unsigned int flags) -+{ -+ struct update_util_data *data; -+ -+ data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data, -+ cpu_of(rq))); -+ -+ if (data) -+ data->func(data, rq->niffies, flags); -+} -+#else -+static inline void cpufreq_trigger(struct rq *rq, unsigned int flag) -+{ -+} -+#endif /* CONFIG_CPU_FREQ */ -+ -+static __always_inline -+unsigned int uclamp_rq_util_with(struct rq 
__maybe_unused *rq, unsigned int util, -+ struct task_struct __maybe_unused *p) -+{ -+ return util; -+} -+ -+static inline bool uclamp_is_used(void) -+{ -+ return false; -+} -+ -+#ifndef arch_scale_freq_tick -+static __always_inline -+void arch_scale_freq_tick(void) -+{ -+} -+#endif -+ -+#ifdef arch_scale_freq_capacity -+#ifndef arch_scale_freq_invariant -+#define arch_scale_freq_invariant() (true) -+#endif -+#else /* arch_scale_freq_capacity */ -+#define arch_scale_freq_invariant() (false) -+#endif -+ -+#ifdef CONFIG_64BIT -+static inline u64 read_sum_exec_runtime(struct task_struct *t) -+{ -+ return tsk_seruntime(t); -+} -+#else -+static inline u64 read_sum_exec_runtime(struct task_struct *t) -+{ -+ struct rq_flags rf; -+ u64 ns; -+ struct rq *rq; -+ -+ rq = task_rq_lock(t, &rf); -+ ns = tsk_seruntime(t); -+ task_rq_unlock(rq, t, &rf); -+ -+ return ns; -+} -+#endif -+ -+#ifndef arch_scale_freq_capacity -+/** -+ * arch_scale_freq_capacity - get the frequency scale factor of a given CPU. -+ * @cpu: the CPU in question. -+ * -+ * Return: the frequency scale factor normalized against SCHED_CAPACITY_SCALE, i.e. -+ * -+ * f_curr -+ * ------ * SCHED_CAPACITY_SCALE -+ * f_max -+ */ -+static __always_inline -+unsigned long arch_scale_freq_capacity(int cpu) -+{ -+ return SCHED_CAPACITY_SCALE; -+} -+#endif -+ -+#ifdef CONFIG_NO_HZ_FULL -+extern bool sched_can_stop_tick(struct rq *rq); -+extern int __init sched_tick_offload_init(void); -+ -+/* -+ * Tick may be needed by tasks in the runqueue depending on their policy and -+ * requirements. If tick is needed, lets send the target an IPI to kick it out of -+ * nohz mode if necessary. 
-+ */ -+static inline void sched_update_tick_dependency(struct rq *rq) -+{ -+ int cpu = cpu_of(rq); -+ -+ if (!tick_nohz_full_cpu(cpu)) -+ return; -+ -+ if (sched_can_stop_tick(rq)) -+ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); -+ else -+ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); -+} -+#else -+static inline int sched_tick_offload_init(void) { return 0; } -+static inline void sched_update_tick_dependency(struct rq *rq) { } -+#endif -+ -+#define SCHED_FLAG_SUGOV 0x10000000 -+ -+static inline bool rt_rq_is_runnable(struct rq *rt_rq) -+{ -+ return rt_rq->rt_nr_running; -+} -+ -+/** -+ * enum schedutil_type - CPU utilization type -+ * @FREQUENCY_UTIL: Utilization used to select frequency -+ * @ENERGY_UTIL: Utilization used during energy calculation -+ * -+ * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time -+ * need to be aggregated differently depending on the usage made of them. This -+ * enum is used within schedutil_freq_util() to differentiate the types of -+ * utilization expected by the callers, and adjust the aggregation accordingly. 
-+ */ -+enum schedutil_type { -+ FREQUENCY_UTIL, -+ ENERGY_UTIL, -+}; -+ -+#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL -+ -+unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, -+ unsigned long max, enum schedutil_type type, -+ struct task_struct *p); -+ -+static inline unsigned long cpu_bw_dl(struct rq *rq) -+{ -+ return 0; -+} -+ -+static inline unsigned long cpu_util_dl(struct rq *rq) -+{ -+ return 0; -+} -+ -+static inline unsigned long cpu_util_cfs(struct rq *rq) -+{ -+ unsigned long ret = READ_ONCE(rq->load_avg); -+ -+ if (ret > SCHED_CAPACITY_SCALE) -+ ret = SCHED_CAPACITY_SCALE; -+ return ret; -+} -+ -+static inline unsigned long cpu_util_rt(struct rq *rq) -+{ -+ unsigned long ret = READ_ONCE(rq->rt_nr_running); -+ -+ if (ret > SCHED_CAPACITY_SCALE) -+ ret = SCHED_CAPACITY_SCALE; -+ return ret; -+} -+ -+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ -+static inline unsigned long cpu_util_irq(struct rq *rq) -+{ -+ unsigned long ret = READ_ONCE(rq->irq_load_avg); -+ -+ if (ret > SCHED_CAPACITY_SCALE) -+ ret = SCHED_CAPACITY_SCALE; -+ return ret; -+} -+ -+static inline -+unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) -+{ -+ util *= (max - irq); -+ util /= max; -+ -+ return util; -+ -+} -+#else -+static inline unsigned long cpu_util_irq(struct rq *rq) -+{ -+ return 0; -+} -+ -+static inline -+unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) -+{ -+ return util; -+} -+#endif -+#endif -+ -+#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) -+#define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus))) -+ -+DECLARE_STATIC_KEY_FALSE(sched_energy_present); -+ -+static inline bool sched_energy_enabled(void) -+{ -+ return static_branch_unlikely(&sched_energy_present); -+} -+ -+#else /* ! 
(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ -+ -+#define perf_domain_span(pd) NULL -+static inline bool sched_energy_enabled(void) { return false; } -+ -+#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ -+ -+#ifdef CONFIG_MEMBARRIER -+/* -+ * The scheduler provides memory barriers required by membarrier between: -+ * - prior user-space memory accesses and store to rq->membarrier_state, -+ * - store to rq->membarrier_state and following user-space memory accesses. -+ * In the same way it provides those guarantees around store to rq->curr. -+ */ -+static inline void membarrier_switch_mm(struct rq *rq, -+ struct mm_struct *prev_mm, -+ struct mm_struct *next_mm) -+{ -+ int membarrier_state; -+ -+ if (prev_mm == next_mm) -+ return; -+ -+ membarrier_state = atomic_read(&next_mm->membarrier_state); -+ if (READ_ONCE(rq->membarrier_state) == membarrier_state) -+ return; -+ -+ WRITE_ONCE(rq->membarrier_state, membarrier_state); -+} -+#else -+static inline void membarrier_switch_mm(struct rq *rq, -+ struct mm_struct *prev_mm, -+ struct mm_struct *next_mm) -+{ -+} -+#endif -+ -+#ifdef CONFIG_SMP -+static inline bool is_per_cpu_kthread(struct task_struct *p) -+{ -+ if (!(p->flags & PF_KTHREAD)) -+ return false; -+ -+ if (p->nr_cpus_allowed != 1) -+ return false; -+ -+ return true; -+} -+#endif -+ -+void swake_up_all_locked(struct swait_queue_head *q); -+void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); -+ -+/* pelt.h compat CONFIG_SCHED_THERMAL_PRESSURE impossible with MUQSS */ -+static inline int -+update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) -+{ -+ return 0; -+} -+ -+static inline u64 thermal_load_avg(struct rq *rq) -+{ -+ return 0; -+} -+ -+#ifdef CONFIG_RCU_TORTURE_TEST -+extern int sysctl_sched_rt_runtime; -+#endif -+ -+#endif /* MUQSS_SCHED_H */ -diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c -index e39008242cf4..146a3dfe626f 100644 ---- 
a/kernel/sched/cpufreq_schedutil.c -+++ b/kernel/sched/cpufreq_schedutil.c -@@ -183,6 +183,12 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, - return cpufreq_driver_resolve_freq(policy, freq); - } - -+#ifdef CONFIG_SCHED_MUQSS -+#define rt_rq_runnable(rq_rt) rt_rq_is_runnable(rq) -+#else -+#define rt_rq_runnable(rq_rt) rt_rq_is_runnable(&rq->rt) -+#endif -+ - /* - * This function computes an effective utilization for the given CPU, to be - * used for frequency selection given the linear relation: f = u * f_max. -@@ -211,7 +217,7 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, - struct rq *rq = cpu_rq(cpu); - - if (!uclamp_is_used() && -- type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { -+ type == FREQUENCY_UTIL && rt_rq_runnable(rq)) { - return max; - } - -@@ -656,7 +662,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) - struct task_struct *thread; - struct sched_attr attr = { - .size = sizeof(struct sched_attr), -+#ifdef CONFIG_SCHED_MUQSS -+ .sched_policy = SCHED_RR, -+#else - .sched_policy = SCHED_DEADLINE, -+#endif - .sched_flags = SCHED_FLAG_SUGOV, - .sched_nice = 0, - .sched_priority = 0, -diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h -index efbb492bb94c..f0288c32ab17 100644 ---- a/kernel/sched/cpupri.h -+++ b/kernel/sched/cpupri.h -@@ -17,6 +17,7 @@ struct cpupri { - int *cpu_to_pri; - }; - -+#ifndef CONFIG_SCHED_MUQSS - #ifdef CONFIG_SMP - int cpupri_find(struct cpupri *cp, struct task_struct *p, - struct cpumask *lowest_mask); -@@ -27,3 +28,4 @@ void cpupri_set(struct cpupri *cp, int cpu, int pri); - int cpupri_init(struct cpupri *cp); - void cpupri_cleanup(struct cpupri *cp); - #endif -+#endif -diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c -index 5a55d2300452..283a580754a7 100644 ---- a/kernel/sched/cputime.c -+++ b/kernel/sched/cputime.c -@@ -266,26 +266,6 @@ static inline u64 account_other_time(u64 max) - return accounted; - } - --#ifdef CONFIG_64BIT 
--static inline u64 read_sum_exec_runtime(struct task_struct *t) --{ -- return t->se.sum_exec_runtime; --} --#else --static u64 read_sum_exec_runtime(struct task_struct *t) --{ -- u64 ns; -- struct rq_flags rf; -- struct rq *rq; -- -- rq = task_rq_lock(t, &rf); -- ns = t->se.sum_exec_runtime; -- task_rq_unlock(rq, t, &rf); -- -- return ns; --} --#endif -- - /* - * Accumulate raw cputime values of dead tasks (sig->[us]time) and live - * tasks (sum on group iteration) belonging to @tsk's group. -@@ -614,7 +594,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, - void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) - { - struct task_cputime cputime = { -- .sum_exec_runtime = p->se.sum_exec_runtime, -+ .sum_exec_runtime = tsk_seruntime(p), - }; - - task_cputime(p, &cputime.utime, &cputime.stime); -diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c -index f324dc36fc43..43ca13ed9ab0 100644 ---- a/kernel/sched/idle.c -+++ b/kernel/sched/idle.c -@@ -369,6 +369,7 @@ void cpu_startup_entry(enum cpuhp_state state) - do_idle(); - } - -+#ifndef CONFIG_SCHED_MUQSS - /* - * idle-task scheduling class. 
- */ -@@ -482,3 +483,4 @@ const struct sched_class idle_sched_class - .switched_to = switched_to_idle, - .update_curr = update_curr_idle, - }; -+#endif /* CONFIG_SCHED_MUQSS */ -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index 28709f6b0975..4478c11cb51a 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -2,6 +2,19 @@ - /* - * Scheduler internal types and methods: - */ -+#ifdef CONFIG_SCHED_MUQSS -+#include "MuQSS.h" -+ -+/* Begin compatibility wrappers for MuQSS/CFS differences */ -+#define rq_rt_nr_running(rq) ((rq)->rt_nr_running) -+#define rq_h_nr_running(rq) ((rq)->nr_running) -+ -+#else /* CONFIG_SCHED_MUQSS */ -+ -+#define rq_rt_nr_running(rq) ((rq)->rt.rt_nr_running) -+#define rq_h_nr_running(rq) ((rq)->cfs.h_nr_running) -+ -+ - #include - - #include -@@ -2626,3 +2639,25 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) - - void swake_up_all_locked(struct swait_queue_head *q); - void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); -+ -+/* MuQSS compatibility functions */ -+#ifdef CONFIG_64BIT -+static inline u64 read_sum_exec_runtime(struct task_struct *t) -+{ -+ return t->se.sum_exec_runtime; -+} -+#else -+static inline u64 read_sum_exec_runtime(struct task_struct *t) -+{ -+ u64 ns; -+ struct rq_flags rf; -+ struct rq *rq; -+ -+ rq = task_rq_lock(t, &rf); -+ ns = t->se.sum_exec_runtime; -+ task_rq_unlock(rq, t, &rf); -+ -+ return ns; -+} -+#endif -+#endif /* CONFIG_SCHED_MUQSS */ -diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c -index 1bd7e3af904f..a1dc490c15e4 100644 ---- a/kernel/sched/topology.c -+++ b/kernel/sched/topology.c -@@ -440,7 +440,11 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) - struct root_domain *old_rd = NULL; - unsigned long flags; - -+#ifdef CONFIG_SCHED_MUQSS -+ raw_spin_lock_irqsave(rq->lock, flags); -+#else - raw_spin_lock_irqsave(&rq->lock, flags); -+#endif - - if (rq->rd) { - old_rd = rq->rd; -@@ -466,7 +470,11 @@ void 
rq_attach_root(struct rq *rq, struct root_domain *rd) - if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) - set_rq_online(rq); - -+#ifdef CONFIG_SCHED_MUQSS -+ raw_spin_unlock_irqrestore(rq->lock, flags); -+#else - raw_spin_unlock_irqrestore(&rq->lock, flags); -+#endif - - if (old_rd) - call_rcu(&old_rd->rcu, free_rootdomain); -diff --git a/kernel/skip_list.c b/kernel/skip_list.c -new file mode 100644 -index 000000000000..bf5c6e97e139 ---- /dev/null -+++ b/kernel/skip_list.c -@@ -0,0 +1,148 @@ -+/* -+ Copyright (C) 2011,2016 Con Kolivas. -+ -+ Code based on example originally by William Pugh. -+ -+Skip Lists are a probabilistic alternative to balanced trees, as -+described in the June 1990 issue of CACM and were invented by -+William Pugh in 1987. -+ -+A couple of comments about this implementation: -+The routine randomLevel has been hard-coded to generate random -+levels using p=0.25. It can be easily changed. -+ -+The insertion routine has been implemented so as to use the -+dirty hack described in the CACM paper: if a random level is -+generated that is more than the current maximum level, the -+current maximum level plus one is used instead. -+ -+Levels start at zero and go up to MaxLevel (which is equal to -+MaxNumberOfLevels-1). -+ -+The routines defined in this file are: -+ -+init: defines slnode -+ -+new_skiplist: returns a new, empty list -+ -+randomLevel: Returns a random level based on a u64 random seed passed to it. -+In MuQSS, the "niffy" time is used for this purpose. -+ -+insert(l,key, value): inserts the binding (key, value) into l. This operation -+occurs in O(log n) time. -+ -+delnode(slnode, l, node): deletes any binding of key from the l based on the -+actual node value. This operation occurs in O(k) time where k is the -+number of levels of the node in question (max 8). The original delete -+function occurred in O(log n) time and involved a search. 
-+ -+MuQSS Notes: In this implementation of skiplists, there are bidirectional -+next/prev pointers and the insert function returns a pointer to the actual -+node the value is stored. The key here is chosen by the scheduler so as to -+sort tasks according to the priority list requirements and is no longer used -+by the scheduler after insertion. The scheduler lookup, however, occurs in -+O(1) time because it is always the first item in the level 0 linked list. -+Since the task struct stores a copy of the node pointer upon skiplist_insert, -+it can also remove it much faster than the original implementation with the -+aid of prev<->next pointer manipulation and no searching. -+ -+*/ -+ -+#include -+#include -+ -+#define MaxNumberOfLevels 8 -+#define MaxLevel (MaxNumberOfLevels - 1) -+ -+void skiplist_init(skiplist_node *slnode) -+{ -+ int i; -+ -+ slnode->key = 0xFFFFFFFFFFFFFFFF; -+ slnode->level = 0; -+ slnode->value = NULL; -+ for (i = 0; i < MaxNumberOfLevels; i++) -+ slnode->next[i] = slnode->prev[i] = slnode; -+} -+ -+skiplist *new_skiplist(skiplist_node *slnode) -+{ -+ skiplist *l = kzalloc(sizeof(skiplist), GFP_ATOMIC); -+ -+ BUG_ON(!l); -+ l->header = slnode; -+ return l; -+} -+ -+void free_skiplist(skiplist *l) -+{ -+ skiplist_node *p, *q; -+ -+ p = l->header; -+ do { -+ q = p->next[0]; -+ p->next[0]->prev[0] = q->prev[0]; -+ skiplist_node_init(p); -+ p = q; -+ } while (p != l->header); -+ kfree(l); -+} -+ -+void skiplist_node_init(skiplist_node *node) -+{ -+ memset(node, 0, sizeof(skiplist_node)); -+} -+ -+static inline unsigned int randomLevel(const long unsigned int randseed) -+{ -+ return find_first_bit(&randseed, MaxLevel) / 2; -+} -+ -+void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed) -+{ -+ skiplist_node *update[MaxNumberOfLevels]; -+ skiplist_node *p, *q; -+ int k = l->level; -+ -+ p = l->header; -+ do { -+ while (q = p->next[k], q->key <= key) -+ p = q; -+ update[k] = p; -+ } while (--k >= 
0); -+ -+ ++l->entries; -+ k = randomLevel(randseed); -+ if (k > l->level) { -+ k = ++l->level; -+ update[k] = l->header; -+ } -+ -+ node->level = k; -+ node->key = key; -+ node->value = value; -+ do { -+ p = update[k]; -+ node->next[k] = p->next[k]; -+ p->next[k] = node; -+ node->prev[k] = p; -+ node->next[k]->prev[k] = node; -+ } while (--k >= 0); -+} -+ -+void skiplist_delete(skiplist *l, skiplist_node *node) -+{ -+ int k, m = node->level; -+ -+ for (k = 0; k <= m; k++) { -+ node->prev[k]->next[k] = node->next[k]; -+ node->next[k]->prev[k] = node->prev[k]; -+ } -+ skiplist_node_init(node); -+ if (m == l->level) { -+ while (l->header->next[m] == l->header && l->header->prev[m] == l->header && m > 0) -+ m--; -+ l->level = m; -+ } -+ l->entries--; -+} -diff --git a/kernel/sysctl.c b/kernel/sysctl.c -index afad085960b8..d2e35cd54f94 100644 ---- a/kernel/sysctl.c -+++ b/kernel/sysctl.c -@@ -120,7 +120,17 @@ static unsigned long long_max = LONG_MAX; - static int one_hundred = 100; - static int two_hundred = 200; - static int one_thousand = 1000; --#ifdef CONFIG_PRINTK -+static int zero = 0; -+static int one = 1; -+#ifdef CONFIG_SCHED_MUQSS -+extern int rr_interval; -+extern int sched_interactive; -+extern int sched_iso_cpu; -+extern int sched_yield_type; -+#endif -+extern int hrtimer_granularity_us; -+extern int hrtimeout_min_us; -+#if defined(CONFIG_PRINTK) || defined(CONFIG_SCHED_MUQSS) - static int ten_thousand = 10000; - #endif - #ifdef CONFIG_PERF_EVENTS -@@ -184,7 +194,7 @@ static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT; - int sysctl_legacy_va_layout; - #endif - --#ifdef CONFIG_SCHED_DEBUG -+#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_MUQSS) - static int min_sched_granularity_ns = 100000; /* 100 usecs */ - static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ - static int min_wakeup_granularity_ns; /* 0 usecs */ -@@ -193,7 +203,7 @@ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ - static 
int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; - static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; - #endif /* CONFIG_SMP */ --#endif /* CONFIG_SCHED_DEBUG */ -+#endif /* CONFIG_SCHED_DEBUG && !CONFIG_SCHED_MUQSS */ - - #ifdef CONFIG_COMPACTION - static int min_extfrag_threshold; -@@ -1652,6 +1662,7 @@ int proc_do_static_key(struct ctl_table *table, int write, - } - - static struct ctl_table kern_table[] = { -+#ifndef CONFIG_SCHED_MUQSS - { - .procname = "sched_child_runs_first", - .data = &sysctl_sched_child_runs_first, -@@ -1843,6 +1854,73 @@ static struct ctl_table kern_table[] = { - .extra1 = SYSCTL_ONE, - }, - #endif -+#elif defined(CONFIG_SCHED_MUQSS) -+ { -+ .procname = "rr_interval", -+ .data = &rr_interval, -+ .maxlen = sizeof (int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = &one, -+ .extra2 = &one_thousand, -+ }, -+ { -+ .procname = "interactive", -+ .data = &sched_interactive, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = &zero, -+ .extra2 = &one, -+ }, -+ { -+ .procname = "iso_cpu", -+ .data = &sched_iso_cpu, -+ .maxlen = sizeof (int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = &zero, -+ .extra2 = &one_hundred, -+ }, -+ { -+ .procname = "yield_type", -+ .data = &sched_yield_type, -+ .maxlen = sizeof (int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = &zero, -+ .extra2 = &two, -+ }, -+#if defined(CONFIG_SMP) && defined(CONFIG_SCHEDSTATS) -+ { -+ .procname = "sched_schedstats", -+ .data = NULL, -+ .maxlen = sizeof(unsigned int), -+ .mode = 0644, -+ .proc_handler = sysctl_schedstats, -+ .extra1 = SYSCTL_ZERO, -+ .extra2 = SYSCTL_ONE, -+ }, -+#endif /* CONFIG_SMP && CONFIG_SCHEDSTATS */ -+#endif /* CONFIG_SCHED_MUQSS */ -+ { -+ .procname = "hrtimer_granularity_us", -+ .data = &hrtimer_granularity_us, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ 
.extra1 = &one, -+ .extra2 = &ten_thousand, -+ }, -+ { -+ .procname = "hrtimeout_min_us", -+ .data = &hrtimeout_min_us, -+ .maxlen = sizeof(int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = &one, -+ .extra2 = &ten_thousand, -+ }, - #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) - { - .procname = "sched_energy_aware", -diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig -index a09b1d61df6a..e7662101fcc3 100644 ---- a/kernel/time/Kconfig -+++ b/kernel/time/Kconfig -@@ -75,6 +75,9 @@ config NO_HZ_COMMON - depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS - select TICK_ONESHOT - -+config NO_HZ_FULL -+ bool -+ - choice - prompt "Timer tick handling" - default NO_HZ_IDLE if NO_HZ -@@ -96,8 +99,9 @@ config NO_HZ_IDLE - - Most of the time you want to say Y here. - --config NO_HZ_FULL -+config NO_HZ_FULL_NODEF - bool "Full dynticks system (tickless)" -+ select NO_HZ_FULL - # NO_HZ_COMMON dependency - depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS - # We need at least one periodic CPU for timekeeping -@@ -123,6 +127,8 @@ config NO_HZ_FULL - transitions: syscalls, exceptions and interrupts. Even when it's - dynamically off. - -+ Not recommended for desktops,laptops, or mobile devices. -+ - Say N. 
- - endchoice -@@ -132,7 +138,7 @@ config CONTEXT_TRACKING - - config CONTEXT_TRACKING_FORCE - bool "Force context tracking" -- depends on CONTEXT_TRACKING -+ depends on CONTEXT_TRACKING && !SCHED_MUQSS - default y if !NO_HZ_FULL - help - The major pre-requirement for full dynticks to work is to -diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c -index f5490222e134..544c58c29267 100644 ---- a/kernel/time/clockevents.c -+++ b/kernel/time/clockevents.c -@@ -190,8 +190,9 @@ int clockevents_tick_resume(struct clock_event_device *dev) - - #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST - --/* Limit min_delta to a jiffie */ --#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) -+int __read_mostly hrtimer_granularity_us = 100; -+/* Limit min_delta to 100us */ -+#define MIN_DELTA_LIMIT (hrtimer_granularity_us * NSEC_PER_USEC) - - /** - * clockevents_increase_min_delta - raise minimum delta of a clock event device -diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c -index 95b6a708b040..19918cf649b0 100644 ---- a/kernel/time/hrtimer.c -+++ b/kernel/time/hrtimer.c -@@ -2223,3 +2223,113 @@ int __sched schedule_hrtimeout(ktime_t *expires, - return schedule_hrtimeout_range(expires, 0, mode); - } - EXPORT_SYMBOL_GPL(schedule_hrtimeout); -+ -+/* -+ * As per schedule_hrtimeout but taskes a millisecond value and returns how -+ * many milliseconds are left. -+ */ -+long __sched schedule_msec_hrtimeout(long timeout) -+{ -+ struct hrtimer_sleeper t; -+ int delta, jiffs; -+ ktime_t expires; -+ -+ if (!timeout) { -+ __set_current_state(TASK_RUNNING); -+ return 0; -+ } -+ -+ jiffs = msecs_to_jiffies(timeout); -+ /* -+ * If regular timer resolution is adequate or hrtimer resolution is not -+ * (yet) better than Hz, as would occur during startup, use regular -+ * timers. 
-+ */ -+ if (jiffs > 4 || hrtimer_resolution >= NSEC_PER_SEC / HZ || pm_freezing) -+ return schedule_timeout(jiffs); -+ -+ delta = (timeout % 1000) * NSEC_PER_MSEC; -+ expires = ktime_set(0, delta); -+ -+ hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); -+ hrtimer_set_expires_range_ns(&t.timer, expires, delta); -+ -+ hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_REL); -+ -+ if (likely(t.task)) -+ schedule(); -+ -+ hrtimer_cancel(&t.timer); -+ destroy_hrtimer_on_stack(&t.timer); -+ -+ __set_current_state(TASK_RUNNING); -+ -+ expires = hrtimer_expires_remaining(&t.timer); -+ timeout = ktime_to_ms(expires); -+ return timeout < 0 ? 0 : timeout; -+} -+ -+EXPORT_SYMBOL(schedule_msec_hrtimeout); -+ -+#define USECS_PER_SEC 1000000 -+extern int hrtimer_granularity_us; -+ -+static inline long schedule_usec_hrtimeout(long timeout) -+{ -+ struct hrtimer_sleeper t; -+ ktime_t expires; -+ int delta; -+ -+ if (!timeout) { -+ __set_current_state(TASK_RUNNING); -+ return 0; -+ } -+ -+ if (hrtimer_resolution >= NSEC_PER_SEC / HZ) -+ return schedule_timeout(usecs_to_jiffies(timeout)); -+ -+ if (timeout < hrtimer_granularity_us) -+ timeout = hrtimer_granularity_us; -+ delta = (timeout % USECS_PER_SEC) * NSEC_PER_USEC; -+ expires = ktime_set(0, delta); -+ -+ hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); -+ hrtimer_set_expires_range_ns(&t.timer, expires, delta); -+ -+ hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_REL); -+ -+ if (likely(t.task)) -+ schedule(); -+ -+ hrtimer_cancel(&t.timer); -+ destroy_hrtimer_on_stack(&t.timer); -+ -+ __set_current_state(TASK_RUNNING); -+ -+ expires = hrtimer_expires_remaining(&t.timer); -+ timeout = ktime_to_us(expires); -+ return timeout < 0 ? 
0 : timeout; -+} -+ -+int __read_mostly hrtimeout_min_us = 500; -+ -+long __sched schedule_min_hrtimeout(void) -+{ -+ return usecs_to_jiffies(schedule_usec_hrtimeout(hrtimeout_min_us)); -+} -+ -+EXPORT_SYMBOL(schedule_min_hrtimeout); -+ -+long __sched schedule_msec_hrtimeout_interruptible(long timeout) -+{ -+ __set_current_state(TASK_INTERRUPTIBLE); -+ return schedule_msec_hrtimeout(timeout); -+} -+EXPORT_SYMBOL(schedule_msec_hrtimeout_interruptible); -+ -+long __sched schedule_msec_hrtimeout_uninterruptible(long timeout) -+{ -+ __set_current_state(TASK_UNINTERRUPTIBLE); -+ return schedule_msec_hrtimeout(timeout); -+} -+EXPORT_SYMBOL(schedule_msec_hrtimeout_uninterruptible); -diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c -index a71758e34e45..ebb84a65d928 100644 ---- a/kernel/time/posix-cpu-timers.c -+++ b/kernel/time/posix-cpu-timers.c -@@ -216,7 +216,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples) - u64 stime, utime; - - task_cputime(p, &utime, &stime); -- store_samples(samples, stime, utime, p->se.sum_exec_runtime); -+ store_samples(samples, stime, utime, tsk_seruntime(p)); - } - - static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, -@@ -850,7 +850,7 @@ static void check_thread_timers(struct task_struct *tsk, - soft = task_rlimit(tsk, RLIMIT_RTTIME); - if (soft != RLIM_INFINITY) { - /* Task RT timeout is accounted in jiffies. RTTIME is usec */ -- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); -+ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ); - unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); - - /* At the hard limit, send SIGKILL. No further action. 
*/ -diff --git a/kernel/time/timer.c b/kernel/time/timer.c -index a50364df1054..a86e4530e530 100644 ---- a/kernel/time/timer.c -+++ b/kernel/time/timer.c -@@ -44,6 +44,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -1587,7 +1588,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) - * Check, if the next hrtimer event is before the next timer wheel - * event: - */ --static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) -+static u64 cmp_next_hrtimer_event(struct timer_base *base, u64 basem, u64 expires) - { - u64 nextevt = hrtimer_get_next_event(); - -@@ -1605,6 +1606,9 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) - if (nextevt <= basem) - return basem; - -+ if (nextevt < expires && nextevt - basem <= TICK_NSEC) -+ base->is_idle = false; -+ - /* - * Round up to the next jiffie. High resolution timers are - * off, so the hrtimers are expired in the tick and we need to -@@ -1674,7 +1678,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) - } - raw_spin_unlock(&base->lock); - -- return cmp_next_hrtimer_event(basem, expires); -+ return cmp_next_hrtimer_event(base, basem, expires); - } - - /** -@@ -1873,6 +1877,18 @@ signed long __sched schedule_timeout(signed long timeout) - - expire = timeout + jiffies; - -+#ifdef CONFIG_HIGH_RES_TIMERS -+ if (timeout == 1 && hrtimer_resolution < NSEC_PER_SEC / HZ) { -+ /* -+ * Special case 1 as being a request for the minimum timeout -+ * and use highres timers to timeout after 1ms to workaround -+ * the granularity of low Hz tick timers. 
-+ */ -+ if (!schedule_min_hrtimeout()) -+ return 0; -+ goto out_timeout; -+ } -+#endif - timer.task = current; - timer_setup_on_stack(&timer.timer, process_timeout, 0); - __mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING); -@@ -1881,10 +1897,10 @@ signed long __sched schedule_timeout(signed long timeout) - - /* Remove the timer from the object tracker */ - destroy_timer_on_stack(&timer.timer); -- -+out_timeout: - timeout = expire - jiffies; - -- out: -+out: - return timeout < 0 ? 0 : timeout; - } - EXPORT_SYMBOL(schedule_timeout); -@@ -2027,7 +2043,19 @@ void __init init_timers(void) - */ - void msleep(unsigned int msecs) - { -- unsigned long timeout = msecs_to_jiffies(msecs) + 1; -+ int jiffs = msecs_to_jiffies(msecs); -+ unsigned long timeout; -+ -+ /* -+ * Use high resolution timers where the resolution of tick based -+ * timers is inadequate. -+ */ -+ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ && !pm_freezing) { -+ while (msecs) -+ msecs = schedule_msec_hrtimeout_uninterruptible(msecs); -+ return; -+ } -+ timeout = jiffs + 1; - - while (timeout) - timeout = schedule_timeout_uninterruptible(timeout); -@@ -2041,7 +2069,15 @@ EXPORT_SYMBOL(msleep); - */ - unsigned long msleep_interruptible(unsigned int msecs) - { -- unsigned long timeout = msecs_to_jiffies(msecs) + 1; -+ int jiffs = msecs_to_jiffies(msecs); -+ unsigned long timeout; -+ -+ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ && !pm_freezing) { -+ while (msecs && !signal_pending(current)) -+ msecs = schedule_msec_hrtimeout_interruptible(msecs); -+ return msecs; -+ } -+ timeout = jiffs + 1; - - while (timeout && !signal_pending(current)) - timeout = schedule_timeout_interruptible(timeout); -diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c -index b5e3496cf803..68930e7f4d28 100644 ---- a/kernel/trace/trace_selftest.c -+++ b/kernel/trace/trace_selftest.c -@@ -1048,10 +1048,15 @@ static int trace_wakeup_test_thread(void *data) - { - /* Make this a -deadline 
thread */ - static const struct sched_attr attr = { -+#ifdef CONFIG_SCHED_MUQSS -+ /* No deadline on MuQSS, use RR */ -+ .sched_policy = SCHED_RR, -+#else - .sched_policy = SCHED_DEADLINE, - .sched_runtime = 100000ULL, - .sched_deadline = 10000000ULL, - .sched_period = 10000000ULL -+#endif - }; - struct wakeup_test_data *x = data; - -diff --git a/mm/vmscan.c b/mm/vmscan.c -index 466fc3144fff..27224c2d7674 100644 ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -169,7 +169,7 @@ struct scan_control { - /* - * From 0 .. 200. Higher means more swappy. - */ --int vm_swappiness = 60; -+int vm_swappiness = 33; - - static void set_task_reclaim_state(struct task_struct *task, - struct reclaim_state *rs) -diff --git a/net/core/pktgen.c b/net/core/pktgen.c -index 44fdbb9c6e53..ae0adfc677c2 100644 ---- a/net/core/pktgen.c -+++ b/net/core/pktgen.c -@@ -1894,7 +1894,7 @@ static void pktgen_mark_device(const struct pktgen_net *pn, const char *ifname) - mutex_unlock(&pktgen_thread_lock); - pr_debug("%s: waiting for %s to disappear....\n", - __func__, ifname); -- schedule_timeout_interruptible(msecs_to_jiffies(msec_per_try)); -+ schedule_msec_hrtimeout_interruptible((msec_per_try)); - mutex_lock(&pktgen_thread_lock); - - if (++i >= max_tries) { -diff --git a/sound/pci/maestro3.c b/sound/pci/maestro3.c -index 40232a278b1a..d87fae1113aa 100644 ---- a/sound/pci/maestro3.c -+++ b/sound/pci/maestro3.c -@@ -1995,7 +1995,7 @@ static void snd_m3_ac97_reset(struct snd_m3 *chip) - outw(0, io + GPIO_DATA); - outw(dir | GPO_PRIMARY_AC97, io + GPIO_DIRECTION); - -- schedule_timeout_uninterruptible(msecs_to_jiffies(delay1)); -+ schedule_msec_hrtimeout_uninterruptible((delay1)); - - outw(GPO_PRIMARY_AC97, io + GPIO_DATA); - udelay(5); -@@ -2003,7 +2003,7 @@ static void snd_m3_ac97_reset(struct snd_m3 *chip) - outw(IO_SRAM_ENABLE | SERIAL_AC_LINK_ENABLE, io + RING_BUS_CTRL_A); - outw(~0, io + GPIO_MASK); - -- schedule_timeout_uninterruptible(msecs_to_jiffies(delay2)); -+ 
schedule_msec_hrtimeout_uninterruptible((delay2)); - - if (! snd_m3_try_read_vendor(chip)) - break; -diff --git a/sound/soc/codecs/rt5631.c b/sound/soc/codecs/rt5631.c -index 653da3eaf355..d77d12902594 100644 ---- a/sound/soc/codecs/rt5631.c -+++ b/sound/soc/codecs/rt5631.c -@@ -417,7 +417,7 @@ static void onebit_depop_mute_stage(struct snd_soc_component *component, int ena - hp_zc = snd_soc_component_read(component, RT5631_INT_ST_IRQ_CTRL_2); - snd_soc_component_write(component, RT5631_INT_ST_IRQ_CTRL_2, hp_zc & 0xf7ff); - if (enable) { -- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout_uninterruptible((10)); - /* config one-bit depop parameter */ - rt5631_write_index(component, RT5631_SPK_INTL_CTRL, 0x307f); - snd_soc_component_update_bits(component, RT5631_HP_OUT_VOL, -@@ -529,7 +529,7 @@ static void depop_seq_mute_stage(struct snd_soc_component *component, int enable - hp_zc = snd_soc_component_read(component, RT5631_INT_ST_IRQ_CTRL_2); - snd_soc_component_write(component, RT5631_INT_ST_IRQ_CTRL_2, hp_zc & 0xf7ff); - if (enable) { -- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout_uninterruptible((10)); - - /* config depop sequence parameter */ - rt5631_write_index(component, RT5631_SPK_INTL_CTRL, 0x302f); -diff --git a/sound/soc/codecs/wm8350.c b/sound/soc/codecs/wm8350.c -index a6aa212fa0c8..8bfa549b38db 100644 ---- a/sound/soc/codecs/wm8350.c -+++ b/sound/soc/codecs/wm8350.c -@@ -233,10 +233,10 @@ static void wm8350_pga_work(struct work_struct *work) - out2->ramp == WM8350_RAMP_UP) { - /* delay is longer over 0dB as increases are larger */ - if (i >= WM8350_OUTn_0dB) -- schedule_timeout_interruptible(msecs_to_jiffies -+ schedule_msec_hrtimeout_interruptible( - (2)); - else -- schedule_timeout_interruptible(msecs_to_jiffies -+ schedule_msec_hrtimeout_interruptible( - (1)); - } else - udelay(50); /* doesn't matter if we delay longer */ -@@ -1120,7 +1120,7 @@ static int 
wm8350_set_bias_level(struct snd_soc_component *component, - (platform->dis_out4 << 6)); - - /* wait for discharge */ -- schedule_timeout_interruptible(msecs_to_jiffies -+ schedule_msec_hrtimeout_interruptible( - (platform-> - cap_discharge_msecs)); - -@@ -1136,7 +1136,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, - WM8350_VBUFEN); - - /* wait for vmid */ -- schedule_timeout_interruptible(msecs_to_jiffies -+ schedule_msec_hrtimeout_interruptible( - (platform-> - vmid_charge_msecs)); - -@@ -1187,7 +1187,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, - wm8350_reg_write(wm8350, WM8350_POWER_MGMT_1, pm1); - - /* wait */ -- schedule_timeout_interruptible(msecs_to_jiffies -+ schedule_msec_hrtimeout_interruptible( - (platform-> - vmid_discharge_msecs)); - -@@ -1205,7 +1205,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, - pm1 | WM8350_OUTPUT_DRAIN_EN); - - /* wait */ -- schedule_timeout_interruptible(msecs_to_jiffies -+ schedule_msec_hrtimeout_interruptible( - (platform->drain_msecs)); - - pm1 &= ~WM8350_BIASEN; -diff --git a/sound/soc/codecs/wm8900.c b/sound/soc/codecs/wm8900.c -index a9a6d766a176..45bf31de6282 100644 ---- a/sound/soc/codecs/wm8900.c -+++ b/sound/soc/codecs/wm8900.c -@@ -1104,7 +1104,7 @@ static int wm8900_set_bias_level(struct snd_soc_component *component, - /* Need to let things settle before stopping the clock - * to ensure that restart works, see "Stopping the - * master clock" in the datasheet. 
*/ -- schedule_timeout_interruptible(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout_interruptible(1); - snd_soc_component_write(component, WM8900_REG_POWER2, - WM8900_REG_POWER2_SYSCLK_ENA); - break; -diff --git a/sound/soc/codecs/wm9713.c b/sound/soc/codecs/wm9713.c -index 7072ffacbdfd..e8414ec4759c 100644 ---- a/sound/soc/codecs/wm9713.c -+++ b/sound/soc/codecs/wm9713.c -@@ -199,7 +199,7 @@ static int wm9713_voice_shutdown(struct snd_soc_dapm_widget *w, - - /* Gracefully shut down the voice interface. */ - snd_soc_component_update_bits(component, AC97_HANDSET_RATE, 0x0f00, 0x0200); -- schedule_timeout_interruptible(msecs_to_jiffies(1)); -+ schedule_msec_hrtimeout_interruptible(1); - snd_soc_component_update_bits(component, AC97_HANDSET_RATE, 0x0f00, 0x0f00); - snd_soc_component_update_bits(component, AC97_EXTENDED_MID, 0x1000, 0x1000); - -@@ -868,7 +868,7 @@ static int wm9713_set_pll(struct snd_soc_component *component, - wm9713->pll_in = freq_in; - - /* wait 10ms AC97 link frames for the link to stabilise */ -- schedule_timeout_interruptible(msecs_to_jiffies(10)); -+ schedule_msec_hrtimeout_interruptible((10)); - return 0; - } - -diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c -index 3273161e2787..7fb9b4c6dd7b 100644 ---- a/sound/soc/soc-dapm.c -+++ b/sound/soc/soc-dapm.c -@@ -154,7 +154,7 @@ static void dapm_assert_locked(struct snd_soc_dapm_context *dapm) - static void pop_wait(u32 pop_time) - { - if (pop_time) -- schedule_timeout_uninterruptible(msecs_to_jiffies(pop_time)); -+ schedule_msec_hrtimeout_uninterruptible((pop_time)); - } - - __printf(3, 4) -diff --git a/sound/usb/line6/pcm.c b/sound/usb/line6/pcm.c -index fdbdfb7bce92..fa8e8faf3eb3 100644 ---- a/sound/usb/line6/pcm.c -+++ b/sound/usb/line6/pcm.c -@@ -127,7 +127,7 @@ static void line6_wait_clear_audio_urbs(struct snd_line6_pcm *line6pcm, - if (!alive) - break; - set_current_state(TASK_UNINTERRUPTIBLE); -- schedule_timeout(1); -+ schedule_min_hrtimeout(); - } while (--timeout > 0); - if 
(alive) - dev_err(line6pcm->line6->ifcdev, diff --git a/linux59-tkg/linux59-tkg-patches/0004-glitched-muqss.patch b/linux59-tkg/linux59-tkg-patches/0004-glitched-muqss.patch deleted file mode 100644 index 46b094f..0000000 --- a/linux59-tkg/linux59-tkg-patches/0004-glitched-muqss.patch +++ /dev/null @@ -1,90 +0,0 @@ -From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 -From: Tk-Glitch -Date: Wed, 4 Jul 2018 04:30:08 +0200 -Subject: glitched - MuQSS - -diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c -index 84a1d08d68551..57c3036a68952 100644 ---- a/kernel/sched/MuQSS.c -+++ b/kernel/sched/MuQSS.c -@@ -163,7 +167,11 @@ int sched_interactive __read_mostly = 1; - * are allowed to run five seconds as real time tasks. This is the total over - * all online cpus. - */ -+#ifdef CONFIG_ZENIFY -+int sched_iso_cpu __read_mostly = 25; -+#else - int sched_iso_cpu __read_mostly = 70; -+#endif - - /* - * sched_yield_type - Choose what sort of yield sched_yield will perform. - -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 2a202a846757..1d9c7ed79b11 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -5,7 +5,7 @@ - choice - prompt "Timer frequency" - default HZ_100 if SCHED_MUQSS -- default HZ_250_NODEF if !SCHED_MUQSS -+ default HZ_500_NODEF if !SCHED_MUQSS - help - Allows the configuration of the timer frequency. It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -50,6 +50,20 @@ choice - on SMP and NUMA systems and exactly dividing by both PAL and - NTSC frame rates for video and multimedia work. - -+ config HZ_500_NODEF -+ bool "500 HZ" -+ help -+ 500 Hz is a good timer frequency for desktops. Provides fast -+ interactivity with great smoothness without sacrificing too -+ much throughput. -+ -+ config HZ_750_NODEF -+ bool "750 HZ" -+ help -+ 750 Hz is a good timer frequency for desktops. Provides fast -+ interactivity with great smoothness without sacrificing too -+ much throughput. 
-+ - config HZ_1000_NODEF - bool "1000 HZ" - help -@@ -63,6 +70,8 @@ config HZ - default 100 if HZ_100 - default 250 if HZ_250_NODEF - default 300 if HZ_300_NODEF -+ default 500 if HZ_500_NODEF -+ default 750 if HZ_750_NODEF - default 1000 if HZ_1000_NODEF - - config SCHED_HRTICK - -diff --git a/Makefile b/Makefile -index d4d36c61940b..4a9dfe471f1f 100644 ---- a/Makefile -+++ b/Makefile -@@ -15,7 +15,6 @@ NAME = Kleptomaniac Octopus - - CKVERSION = -ck1 - CKNAME = MuQSS Powered --EXTRAVERSION := $(EXTRAVERSION)$(CKVERSION) - - # We are using a recursive build, so we need to do a little thinking - # to get the ordering right. -diff --git a/scripts/headers_install.sh b/scripts/headers_install.sh -index dd554bd43..75030ad93 100755 ---- a/scripts/headers_install.sh -+++ b/scripts/headers_install.sh -@@ -89,6 +89,7 @@ include/uapi/linux/atmdev.h:CONFIG_COMPAT - include/uapi/linux/eventpoll.h:CONFIG_PM_SLEEP - include/uapi/linux/hw_breakpoint.h:CONFIG_HAVE_MIXED_BREAKPOINTS_REGS - include/uapi/linux/pktcdvd.h:CONFIG_CDROM_PKTCDVD_WCACHE -+include/uapi/linux/sched.h:CONFIG_SCHED_MUQSS - " - - for c in $configs diff --git a/linux59-tkg/linux59-tkg-patches/0004-glitched-ondemand-muqss.patch b/linux59-tkg/linux59-tkg-patches/0004-glitched-ondemand-muqss.patch deleted file mode 100644 index 02933e4..0000000 --- a/linux59-tkg/linux59-tkg-patches/0004-glitched-ondemand-muqss.patch +++ /dev/null @@ -1,18 +0,0 @@ -diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c -index 6b423eebfd5d..61e3271675d6 100644 ---- a/drivers/cpufreq/cpufreq_ondemand.c -+++ b/drivers/cpufreq/cpufreq_ondemand.c -@@ -21,10 +21,10 @@ - #include "cpufreq_ondemand.h" - - /* On-demand governor macros */ --#define DEF_FREQUENCY_UP_THRESHOLD (80) --#define DEF_SAMPLING_DOWN_FACTOR (1) -+#define DEF_FREQUENCY_UP_THRESHOLD (45) -+#define DEF_SAMPLING_DOWN_FACTOR (5) - #define MAX_SAMPLING_DOWN_FACTOR (100000) --#define MICRO_FREQUENCY_UP_THRESHOLD (95) -+#define 
MICRO_FREQUENCY_UP_THRESHOLD (45) - #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) - #define MIN_FREQUENCY_UP_THRESHOLD (1) - #define MAX_FREQUENCY_UP_THRESHOLD (100) diff --git a/linux59-tkg/linux59-tkg-patches/0005-glitched-pds.patch b/linux59-tkg/linux59-tkg-patches/0005-glitched-pds.patch deleted file mode 100644 index 08c9ef3..0000000 --- a/linux59-tkg/linux59-tkg-patches/0005-glitched-pds.patch +++ /dev/null @@ -1,90 +0,0 @@ -From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 -From: Tk-Glitch -Date: Wed, 4 Jul 2018 04:30:08 +0200 -Subject: glitched - PDS - -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 2a202a846757..1d9c7ed79b11 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -4,7 +4,7 @@ - - choice - prompt "Timer frequency" -- default HZ_250 -+ default HZ_500 - help - Allows the configuration of the timer frequency. It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -39,6 +39,13 @@ choice - on SMP and NUMA systems and exactly dividing by both PAL and - NTSC frame rates for video and multimedia work. - -+ config HZ_500 -+ bool "500 HZ" -+ help -+ 500 Hz is a balanced timer frequency. Provides fast interactivity -+ on desktops with great smoothness without increasing CPU power -+ consumption and sacrificing the battery life on laptops. -+ - config HZ_1000 - bool "1000 HZ" - help -@@ -52,6 +59,7 @@ config HZ - default 100 if HZ_100 - default 250 if HZ_250 - default 300 if HZ_300 -+ default 500 if HZ_500 - default 1000 if HZ_1000 - - config SCHED_HRTICK - -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 2a202a846757..1d9c7ed79b11 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -4,7 +4,7 @@ - - choice - prompt "Timer frequency" -- default HZ_500 -+ default HZ_750 - help - Allows the configuration of the timer frequency. 
It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -46,6 +46,13 @@ choice - on desktops with great smoothness without increasing CPU power - consumption and sacrificing the battery life on laptops. - -+ config HZ_750 -+ bool "750 HZ" -+ help -+ 750 Hz is a good timer frequency for desktops. Provides fast -+ interactivity with great smoothness without sacrificing too -+ much throughput. -+ - config HZ_1000 - bool "1000 HZ" - help -@@ -60,6 +67,7 @@ config HZ - default 250 if HZ_250 - default 300 if HZ_300 - default 500 if HZ_500 -+ default 750 if HZ_750 - default 1000 if HZ_1000 - - config SCHED_HRTICK - -diff --git a/mm/vmscan.c b/mm/vmscan.c -index 9270a4370d54..30d01e647417 100644 ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -169,7 +169,7 @@ - /* - * From 0 .. 200. Higher means more swappy. - */ --int vm_swappiness = 60; -+int vm_swappiness = 20; - - static void set_task_reclaim_state(struct task_struct *task, - struct reclaim_state *rs) diff --git a/linux59-tkg/linux59-tkg-patches/0006-add-acs-overrides_iommu.patch b/linux59-tkg/linux59-tkg-patches/0006-add-acs-overrides_iommu.patch deleted file mode 100644 index d1303a5..0000000 --- a/linux59-tkg/linux59-tkg-patches/0006-add-acs-overrides_iommu.patch +++ /dev/null @@ -1,193 +0,0 @@ -From cdeab384f48dd9c88e2dff2e9ad8d57dca1a1b1c Mon Sep 17 00:00:00 2001 -From: Mark Weiman -Date: Sun, 12 Aug 2018 11:36:21 -0400 -Subject: [PATCH] pci: Enable overrides for missing ACS capabilities - -This an updated version of Alex Williamson's patch from: -https://lkml.org/lkml/2013/5/30/513 - -Original commit message follows: - -PCIe ACS (Access Control Services) is the PCIe 2.0+ feature that -allows us to control whether transactions are allowed to be redirected -in various subnodes of a PCIe topology. For instance, if two -endpoints are below a root port or downsteam switch port, the -downstream port may optionally redirect transactions between the -devices, bypassing upstream devices. 
The same can happen internally -on multifunction devices. The transaction may never be visible to the -upstream devices. - -One upstream device that we particularly care about is the IOMMU. If -a redirection occurs in the topology below the IOMMU, then the IOMMU -cannot provide isolation between devices. This is why the PCIe spec -encourages topologies to include ACS support. Without it, we have to -assume peer-to-peer DMA within a hierarchy can bypass IOMMU isolation. - -Unfortunately, far too many topologies do not support ACS to make this -a steadfast requirement. Even the latest chipsets from Intel are only -sporadically supporting ACS. We have trouble getting interconnect -vendors to include the PCIe spec required PCIe capability, let alone -suggested features. - -Therefore, we need to add some flexibility. The pcie_acs_override= -boot option lets users opt-in specific devices or sets of devices to -assume ACS support. The "downstream" option assumes full ACS support -on root ports and downstream switch ports. The "multifunction" -option assumes the subset of ACS features available on multifunction -endpoints and upstream switch ports are supported. The "id:nnnn:nnnn" -option enables ACS support on devices matching the provided vendor -and device IDs, allowing more strategic ACS overrides. These options -may be combined in any order. A maximum of 16 id specific overrides -are available. It's suggested to use the most limited set of options -necessary to avoid completely disabling ACS across the topology. -Note to hardware vendors, we have facilities to permanently quirk -specific devices which enforce isolation but not provide an ACS -capability. Please contact me to have your devices added and save -your customers the hassle of this boot option. 
- -Signed-off-by: Mark Weiman ---- - .../admin-guide/kernel-parameters.txt | 9 ++ - drivers/pci/quirks.c | 101 ++++++++++++++++++ - 2 files changed, 110 insertions(+) - -diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index aefd358a5ca3..173b3596fd9e 100644 ---- a/Documentation/admin-guide/kernel-parameters.txt -+++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -3190,6 +3190,15 @@ - nomsi [MSI] If the PCI_MSI kernel config parameter is - enabled, this kernel boot option can be used to - disable the use of MSI interrupts system-wide. -+ pcie_acs_override = -+ [PCIE] Override missing PCIe ACS support for: -+ downstream -+ All downstream ports - full ACS capabilities -+ multifunction -+ All multifunction devices - multifunction ACS subset -+ id:nnnn:nnnn -+ Specific device - full ACS capabilities -+ Specified as vid:did (vendor/device ID) in hex - noioapicquirk [APIC] Disable all boot interrupt quirks. - Safety option to keep boot IRQs enabled. This - should never be necessary. 
-diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c -index 4700d24e5d55..8f7a3d7fd9c1 100644 ---- a/drivers/pci/quirks.c -+++ b/drivers/pci/quirks.c -@@ -3372,6 +3372,106 @@ static void quirk_no_bus_reset(struct pci_dev *dev) - dev->dev_flags |= PCI_DEV_FLAGS_NO_BUS_RESET; - } - -+static bool acs_on_downstream; -+static bool acs_on_multifunction; -+ -+#define NUM_ACS_IDS 16 -+struct acs_on_id { -+ unsigned short vendor; -+ unsigned short device; -+}; -+static struct acs_on_id acs_on_ids[NUM_ACS_IDS]; -+static u8 max_acs_id; -+ -+static __init int pcie_acs_override_setup(char *p) -+{ -+ if (!p) -+ return -EINVAL; -+ -+ while (*p) { -+ if (!strncmp(p, "downstream", 10)) -+ acs_on_downstream = true; -+ if (!strncmp(p, "multifunction", 13)) -+ acs_on_multifunction = true; -+ if (!strncmp(p, "id:", 3)) { -+ char opt[5]; -+ int ret; -+ long val; -+ -+ if (max_acs_id >= NUM_ACS_IDS - 1) { -+ pr_warn("Out of PCIe ACS override slots (%d)\n", -+ NUM_ACS_IDS); -+ goto next; -+ } -+ -+ p += 3; -+ snprintf(opt, 5, "%s", p); -+ ret = kstrtol(opt, 16, &val); -+ if (ret) { -+ pr_warn("PCIe ACS ID parse error %d\n", ret); -+ goto next; -+ } -+ acs_on_ids[max_acs_id].vendor = val; -+ -+ p += strcspn(p, ":"); -+ if (*p != ':') { -+ pr_warn("PCIe ACS invalid ID\n"); -+ goto next; -+ } -+ -+ p++; -+ snprintf(opt, 5, "%s", p); -+ ret = kstrtol(opt, 16, &val); -+ if (ret) { -+ pr_warn("PCIe ACS ID parse error %d\n", ret); -+ goto next; -+ } -+ acs_on_ids[max_acs_id].device = val; -+ max_acs_id++; -+ } -+next: -+ p += strcspn(p, ","); -+ if (*p == ',') -+ p++; -+ } -+ -+ if (acs_on_downstream || acs_on_multifunction || max_acs_id) -+ pr_warn("Warning: PCIe ACS overrides enabled; This may allow non-IOMMU protected peer-to-peer DMA\n"); -+ -+ return 0; -+} -+early_param("pcie_acs_override", pcie_acs_override_setup); -+ -+static int pcie_acs_overrides(struct pci_dev *dev, u16 acs_flags) -+{ -+ int i; -+ -+ /* Never override ACS for legacy devices or devices with ACS caps */ -+ if 
(!pci_is_pcie(dev) || -+ pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ACS)) -+ return -ENOTTY; -+ -+ for (i = 0; i < max_acs_id; i++) -+ if (acs_on_ids[i].vendor == dev->vendor && -+ acs_on_ids[i].device == dev->device) -+ return 1; -+ -+ switch (pci_pcie_type(dev)) { -+ case PCI_EXP_TYPE_DOWNSTREAM: -+ case PCI_EXP_TYPE_ROOT_PORT: -+ if (acs_on_downstream) -+ return 1; -+ break; -+ case PCI_EXP_TYPE_ENDPOINT: -+ case PCI_EXP_TYPE_UPSTREAM: -+ case PCI_EXP_TYPE_LEG_END: -+ case PCI_EXP_TYPE_RC_END: -+ if (acs_on_multifunction && dev->multifunction) -+ return 1; -+ } -+ -+ return -ENOTTY; -+} - /* - * Some Atheros AR9xxx and QCA988x chips do not behave after a bus reset. - * The device will throw a Link Down error on AER-capable systems and -@@ -4513,6 +4613,7 @@ static const struct pci_dev_acs_enabled { - { PCI_VENDOR_ID_ZHAOXIN, 0x9083, pci_quirk_mf_endpoint_acs }, - /* Zhaoxin Root/Downstream Ports */ - { PCI_VENDOR_ID_ZHAOXIN, PCI_ANY_ID, pci_quirk_zhaoxin_pcie_ports_acs }, -+ { PCI_ANY_ID, PCI_ANY_ID, pcie_acs_overrides }, - { 0 } - }; - - diff --git a/linux59-tkg/linux59-tkg-patches/0007-v5.9-fsync.patch b/linux59-tkg/linux59-tkg-patches/0007-v5.9-fsync.patch deleted file mode 100644 index 47badbb..0000000 --- a/linux59-tkg/linux59-tkg-patches/0007-v5.9-fsync.patch +++ /dev/null @@ -1,597 +0,0 @@ -From 7b5df0248ce255ef5b7204d65a7b3783ebb76a3d Mon Sep 17 00:00:00 2001 -From: Gabriel Krisman Bertazi -Date: Fri, 13 Dec 2019 11:08:02 -0300 -Subject: [PATCH 1/2] futex: Implement mechanism to wait on any of several - futexes -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -This is a new futex operation, called FUTEX_WAIT_MULTIPLE, which allows -a thread to wait on several futexes at the same time, and be awoken by -any of them. In a sense, it implements one of the features that was -supported by pooling on the old FUTEX_FD interface. 
- -The use case lies in the Wine implementation of the Windows NT interface -WaitMultipleObjects. This Windows API function allows a thread to sleep -waiting on the first of a set of event sources (mutexes, timers, signal, -console input, etc) to signal. Considering this is a primitive -synchronization operation for Windows applications, being able to quickly -signal events on the producer side, and quickly go to sleep on the -consumer side is essential for good performance of those running over Wine. - -Wine developers have an implementation that uses eventfd, but it suffers -from FD exhaustion (there is applications that go to the order of -multi-milion FDs), and higher CPU utilization than this new operation. - -The futex list is passed as an array of `struct futex_wait_block` -(pointer, value, bitset) to the kernel, which will enqueue all of them -and sleep if none was already triggered. It returns a hint of which -futex caused the wake up event to userspace, but the hint doesn't -guarantee that is the only futex triggered. Before calling the syscall -again, userspace should traverse the list, trying to re-acquire any of -the other futexes, to prevent an immediate -EWOULDBLOCK return code from -the kernel. - -This was tested using three mechanisms: - -1) By reimplementing FUTEX_WAIT in terms of FUTEX_WAIT_MULTIPLE and -running the unmodified tools/testing/selftests/futex and a full linux -distro on top of this kernel. - -2) By an example code that exercises the FUTEX_WAIT_MULTIPLE path on a -multi-threaded, event-handling setup. - -3) By running the Wine fsync implementation and executing multi-threaded -applications, in particular modern games, on top of this implementation. - -Changes were tested for the following ABIs: x86_64, i386 and x32. 
-Support for x32 applications is not implemented since it would -take a major rework adding a new entry point and splitting the current -futex 64 entry point in two and we can't change the current x32 syscall -number without breaking user space compatibility. - -CC: Steven Rostedt -Cc: Richard Yao -Cc: Thomas Gleixner -Cc: Peter Zijlstra -Co-developed-by: Zebediah Figura -Signed-off-by: Zebediah Figura -Co-developed-by: Steven Noonan -Signed-off-by: Steven Noonan -Co-developed-by: Pierre-Loup A. Griffais -Signed-off-by: Pierre-Loup A. Griffais -Signed-off-by: Gabriel Krisman Bertazi -[Added compatibility code] -Co-developed-by: André Almeida -Signed-off-by: André Almeida - -Adjusted for v5.9: Removed `put_futex_key` calls. ---- - include/uapi/linux/futex.h | 20 +++ - kernel/futex.c | 352 ++++++++++++++++++++++++++++++++++++- - 2 files changed, 370 insertions(+), 2 deletions(-) - -diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h -index a89eb0accd5e2..580001e89c6ca 100644 ---- a/include/uapi/linux/futex.h -+++ b/include/uapi/linux/futex.h -@@ -21,6 +21,7 @@ - #define FUTEX_WAKE_BITSET 10 - #define FUTEX_WAIT_REQUEUE_PI 11 - #define FUTEX_CMP_REQUEUE_PI 12 -+#define FUTEX_WAIT_MULTIPLE 13 - - #define FUTEX_PRIVATE_FLAG 128 - #define FUTEX_CLOCK_REALTIME 256 -@@ -40,6 +41,8 @@ - FUTEX_PRIVATE_FLAG) - #define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \ - FUTEX_PRIVATE_FLAG) -+#define FUTEX_WAIT_MULTIPLE_PRIVATE (FUTEX_WAIT_MULTIPLE | \ -+ FUTEX_PRIVATE_FLAG) - - /* - * Support for robust futexes: the kernel cleans up held futexes at -@@ -150,4 +153,21 @@ struct robust_list_head { - (((op & 0xf) << 28) | ((cmp & 0xf) << 24) \ - | ((oparg & 0xfff) << 12) | (cmparg & 0xfff)) - -+/* -+ * Maximum number of multiple futexes to wait for -+ */ -+#define FUTEX_MULTIPLE_MAX_COUNT 128 -+ -+/** -+ * struct futex_wait_block - Block of futexes to be waited for -+ * @uaddr: User address of the futex -+ * @val: Futex value expected by userspace -+ * 
@bitset: Bitset for the optional bitmasked wakeup -+ */ -+struct futex_wait_block { -+ __u32 __user *uaddr; -+ __u32 val; -+ __u32 bitset; -+}; -+ - #endif /* _UAPI_LINUX_FUTEX_H */ -diff --git a/kernel/futex.c b/kernel/futex.c -index a5876694a60eb..6f4bea76df460 100644 ---- a/kernel/futex.c -+++ b/kernel/futex.c -@@ -197,6 +197,8 @@ struct futex_pi_state { - * @rt_waiter: rt_waiter storage for use with requeue_pi - * @requeue_pi_key: the requeue_pi target futex key - * @bitset: bitset for the optional bitmasked wakeup -+ * @uaddr: userspace address of futex -+ * @uval: expected futex's value - * - * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so - * we can wake only the relevant ones (hashed queues may be shared). -@@ -219,6 +221,8 @@ struct futex_q { - struct rt_mutex_waiter *rt_waiter; - union futex_key *requeue_pi_key; - u32 bitset; -+ u32 __user *uaddr; -+ u32 uval; - } __randomize_layout; - - static const struct futex_q futex_q_init = { -@@ -2304,6 +2308,29 @@ static int unqueue_me(struct futex_q *q) - return ret; - } - -+/** -+ * unqueue_multiple() - Remove several futexes from their futex_hash_bucket -+ * @q: The list of futexes to unqueue -+ * @count: Number of futexes in the list -+ * -+ * Helper to unqueue a list of futexes. This can't fail. -+ * -+ * Return: -+ * - >=0 - Index of the last futex that was awoken; -+ * - -1 - If no futex was awoken -+ */ -+static int unqueue_multiple(struct futex_q *q, int count) -+{ -+ int ret = -1; -+ int i; -+ -+ for (i = 0; i < count; i++) { -+ if (!unqueue_me(&q[i])) -+ ret = i; -+ } -+ return ret; -+} -+ - /* - * PI futexes can not be requeued and must remove themself from the - * hash bucket. The hash bucket lock (i.e. 
lock_ptr) is held on entry -@@ -2662,6 +2689,205 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, - return ret; - } - -+/** -+ * futex_wait_multiple_setup() - Prepare to wait and enqueue multiple futexes -+ * @qs: The corresponding futex list -+ * @count: The size of the lists -+ * @flags: Futex flags (FLAGS_SHARED, etc.) -+ * @awaken: Index of the last awoken futex -+ * -+ * Prepare multiple futexes in a single step and enqueue them. This may fail if -+ * the futex list is invalid or if any futex was already awoken. On success the -+ * task is ready to interruptible sleep. -+ * -+ * Return: -+ * - 1 - One of the futexes was awaken by another thread -+ * - 0 - Success -+ * - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL -+ */ -+static int futex_wait_multiple_setup(struct futex_q *qs, int count, -+ unsigned int flags, int *awaken) -+{ -+ struct futex_hash_bucket *hb; -+ int ret, i; -+ u32 uval; -+ -+ /* -+ * Enqueuing multiple futexes is tricky, because we need to -+ * enqueue each futex in the list before dealing with the next -+ * one to avoid deadlocking on the hash bucket. But, before -+ * enqueuing, we need to make sure that current->state is -+ * TASK_INTERRUPTIBLE, so we don't absorb any awake events, which -+ * cannot be done before the get_futex_key of the next key, -+ * because it calls get_user_pages, which can sleep. Thus, we -+ * fetch the list of futexes keys in two steps, by first pinning -+ * all the memory keys in the futex key, and only then we read -+ * each key and queue the corresponding futex. 
-+ */ -+retry: -+ for (i = 0; i < count; i++) { -+ qs[i].key = FUTEX_KEY_INIT; -+ ret = get_futex_key(qs[i].uaddr, flags & FLAGS_SHARED, -+ &qs[i].key, FUTEX_READ); -+ if (unlikely(ret)) { -+ return ret; -+ } -+ } -+ -+ set_current_state(TASK_INTERRUPTIBLE); -+ -+ for (i = 0; i < count; i++) { -+ struct futex_q *q = &qs[i]; -+ -+ hb = queue_lock(q); -+ -+ ret = get_futex_value_locked(&uval, q->uaddr); -+ if (ret) { -+ /* -+ * We need to try to handle the fault, which -+ * cannot be done without sleep, so we need to -+ * undo all the work already done, to make sure -+ * we don't miss any wake ups. Therefore, clean -+ * up, handle the fault and retry from the -+ * beginning. -+ */ -+ queue_unlock(hb); -+ -+ /* -+ * Keys 0..(i-1) are implicitly put -+ * on unqueue_multiple. -+ */ -+ *awaken = unqueue_multiple(qs, i); -+ -+ __set_current_state(TASK_RUNNING); -+ -+ /* -+ * On a real fault, prioritize the error even if -+ * some other futex was awoken. Userspace gave -+ * us a bad address, -EFAULT them. -+ */ -+ ret = get_user(uval, q->uaddr); -+ if (ret) -+ return ret; -+ -+ /* -+ * Even if the page fault was handled, If -+ * something was already awaken, we can safely -+ * give up and succeed to give a hint for userspace to -+ * acquire the right futex faster. -+ */ -+ if (*awaken >= 0) -+ return 1; -+ -+ goto retry; -+ } -+ -+ if (uval != q->uval) { -+ queue_unlock(hb); -+ -+ /* -+ * If something was already awaken, we can -+ * safely ignore the error and succeed. -+ */ -+ *awaken = unqueue_multiple(qs, i); -+ __set_current_state(TASK_RUNNING); -+ if (*awaken >= 0) -+ return 1; -+ -+ return -EWOULDBLOCK; -+ } -+ -+ /* -+ * The bucket lock can't be held while dealing with the -+ * next futex. Queue each futex at this moment so hb can -+ * be unlocked. 
-+ */ -+ queue_me(&qs[i], hb); -+ } -+ return 0; -+} -+ -+/** -+ * futex_wait_multiple() - Prepare to wait on and enqueue several futexes -+ * @qs: The list of futexes to wait on -+ * @op: Operation code from futex's syscall -+ * @count: The number of objects -+ * @abs_time: Timeout before giving up and returning to userspace -+ * -+ * Entry point for the FUTEX_WAIT_MULTIPLE futex operation, this function -+ * sleeps on a group of futexes and returns on the first futex that -+ * triggered, or after the timeout has elapsed. -+ * -+ * Return: -+ * - >=0 - Hint to the futex that was awoken -+ * - <0 - On error -+ */ -+static int futex_wait_multiple(struct futex_q *qs, int op, -+ u32 count, ktime_t *abs_time) -+{ -+ struct hrtimer_sleeper timeout, *to; -+ int ret, flags = 0, hint = 0; -+ unsigned int i; -+ -+ if (!(op & FUTEX_PRIVATE_FLAG)) -+ flags |= FLAGS_SHARED; -+ -+ if (op & FUTEX_CLOCK_REALTIME) -+ flags |= FLAGS_CLOCKRT; -+ -+ to = futex_setup_timer(abs_time, &timeout, flags, 0); -+ while (1) { -+ ret = futex_wait_multiple_setup(qs, count, flags, &hint); -+ if (ret) { -+ if (ret > 0) { -+ /* A futex was awaken during setup */ -+ ret = hint; -+ } -+ break; -+ } -+ -+ if (to) -+ hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); -+ -+ /* -+ * Avoid sleeping if another thread already tried to -+ * wake us. -+ */ -+ for (i = 0; i < count; i++) { -+ if (plist_node_empty(&qs[i].list)) -+ break; -+ } -+ -+ if (i == count && (!to || to->task)) -+ freezable_schedule(); -+ -+ ret = unqueue_multiple(qs, count); -+ -+ __set_current_state(TASK_RUNNING); -+ -+ if (ret >= 0) -+ break; -+ if (to && !to->task) { -+ ret = -ETIMEDOUT; -+ break; -+ } else if (signal_pending(current)) { -+ ret = -ERESTARTSYS; -+ break; -+ } -+ /* -+ * The final case is a spurious wakeup, for -+ * which just retry. 
-+ */ -+ } -+ -+ if (to) { -+ hrtimer_cancel(&to->timer); -+ destroy_hrtimer_on_stack(&to->timer); -+ } -+ -+ return ret; -+} -+ - static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, - ktime_t *abs_time, u32 bitset) - { -@@ -3774,6 +4000,43 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, - return -ENOSYS; - } - -+/** -+ * futex_read_wait_block - Read an array of futex_wait_block from userspace -+ * @uaddr: Userspace address of the block -+ * @count: Number of blocks to be read -+ * -+ * This function creates and allocate an array of futex_q (we zero it to -+ * initialize the fields) and then, for each futex_wait_block element from -+ * userspace, fill a futex_q element with proper values. -+ */ -+inline struct futex_q *futex_read_wait_block(u32 __user *uaddr, u32 count) -+{ -+ unsigned int i; -+ struct futex_q *qs; -+ struct futex_wait_block fwb; -+ struct futex_wait_block __user *entry = -+ (struct futex_wait_block __user *)uaddr; -+ -+ if (!count || count > FUTEX_MULTIPLE_MAX_COUNT) -+ return ERR_PTR(-EINVAL); -+ -+ qs = kcalloc(count, sizeof(*qs), GFP_KERNEL); -+ if (!qs) -+ return ERR_PTR(-ENOMEM); -+ -+ for (i = 0; i < count; i++) { -+ if (copy_from_user(&fwb, &entry[i], sizeof(fwb))) { -+ kfree(qs); -+ return ERR_PTR(-EFAULT); -+ } -+ -+ qs[i].uaddr = fwb.uaddr; -+ qs[i].uval = fwb.val; -+ qs[i].bitset = fwb.bitset; -+ } -+ -+ return qs; -+} - - SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, - struct __kernel_timespec __user *, utime, u32 __user *, uaddr2, -@@ -3786,7 +4049,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, - - if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || - cmd == FUTEX_WAIT_BITSET || -- cmd == FUTEX_WAIT_REQUEUE_PI)) { -+ cmd == FUTEX_WAIT_REQUEUE_PI || -+ cmd == FUTEX_WAIT_MULTIPLE)) { - if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) - return -EFAULT; - if (get_timespec64(&ts, utime)) -@@ -3807,6 +4071,25 @@ SYSCALL_DEFINE6(futex, u32 
__user *, uaddr, int, op, u32, val, - cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) - val2 = (u32) (unsigned long) utime; - -+ if (cmd == FUTEX_WAIT_MULTIPLE) { -+ int ret; -+ struct futex_q *qs; -+ -+#ifdef CONFIG_X86_X32 -+ if (unlikely(in_x32_syscall())) -+ return -ENOSYS; -+#endif -+ qs = futex_read_wait_block(uaddr, val); -+ -+ if (IS_ERR(qs)) -+ return PTR_ERR(qs); -+ -+ ret = futex_wait_multiple(qs, op, val, tp); -+ kfree(qs); -+ -+ return ret; -+ } -+ - return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); - } - -@@ -3969,6 +4252,57 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, - #endif /* CONFIG_COMPAT */ - - #ifdef CONFIG_COMPAT_32BIT_TIME -+/** -+ * struct compat_futex_wait_block - Block of futexes to be waited for -+ * @uaddr: User address of the futex (compatible pointer) -+ * @val: Futex value expected by userspace -+ * @bitset: Bitset for the optional bitmasked wakeup -+ */ -+struct compat_futex_wait_block { -+ compat_uptr_t uaddr; -+ __u32 val; -+ __u32 bitset; -+}; -+ -+/** -+ * compat_futex_read_wait_block - Read an array of futex_wait_block from -+ * userspace -+ * @uaddr: Userspace address of the block -+ * @count: Number of blocks to be read -+ * -+ * This function does the same as futex_read_wait_block(), except that it -+ * converts the pointer to the futex from the compat version to the regular one. 
-+ */ -+inline struct futex_q *compat_futex_read_wait_block(u32 __user *uaddr, -+ u32 count) -+{ -+ unsigned int i; -+ struct futex_q *qs; -+ struct compat_futex_wait_block fwb; -+ struct compat_futex_wait_block __user *entry = -+ (struct compat_futex_wait_block __user *)uaddr; -+ -+ if (!count || count > FUTEX_MULTIPLE_MAX_COUNT) -+ return ERR_PTR(-EINVAL); -+ -+ qs = kcalloc(count, sizeof(*qs), GFP_KERNEL); -+ if (!qs) -+ return ERR_PTR(-ENOMEM); -+ -+ for (i = 0; i < count; i++) { -+ if (copy_from_user(&fwb, &entry[i], sizeof(fwb))) { -+ kfree(qs); -+ return ERR_PTR(-EFAULT); -+ } -+ -+ qs[i].uaddr = compat_ptr(fwb.uaddr); -+ qs[i].uval = fwb.val; -+ qs[i].bitset = fwb.bitset; -+ } -+ -+ return qs; -+} -+ - SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, - struct old_timespec32 __user *, utime, u32 __user *, uaddr2, - u32, val3) -@@ -3980,7 +4314,8 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, - - if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || - cmd == FUTEX_WAIT_BITSET || -- cmd == FUTEX_WAIT_REQUEUE_PI)) { -+ cmd == FUTEX_WAIT_REQUEUE_PI || -+ cmd == FUTEX_WAIT_MULTIPLE)) { - if (get_old_timespec32(&ts, utime)) - return -EFAULT; - if (!timespec64_valid(&ts)) -@@ -3995,6 +4330,19 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, - cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) - val2 = (int) (unsigned long) utime; - -+ if (cmd == FUTEX_WAIT_MULTIPLE) { -+ int ret; -+ struct futex_q *qs = compat_futex_read_wait_block(uaddr, val); -+ -+ if (IS_ERR(qs)) -+ return PTR_ERR(qs); -+ -+ ret = futex_wait_multiple(qs, op, val, tp); -+ kfree(qs); -+ -+ return ret; -+ } -+ - return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); - } - #endif /* CONFIG_COMPAT_32BIT_TIME */ - -From ccdddb50d330d2ee1a4d2cbfdd27bdd7fb10eec3 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Andr=C3=A9=20Almeida?= -Date: Fri, 7 Feb 2020 23:28:02 -0300 -Subject: [PATCH 2/2] futex: Add Proton compatibility code - 
---- - include/uapi/linux/futex.h | 2 +- - kernel/futex.c | 5 +++-- - 2 files changed, 4 insertions(+), 3 deletions(-) - -diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h -index 580001e89c6ca..a3e760886b8e7 100644 ---- a/include/uapi/linux/futex.h -+++ b/include/uapi/linux/futex.h -@@ -21,7 +21,7 @@ - #define FUTEX_WAKE_BITSET 10 - #define FUTEX_WAIT_REQUEUE_PI 11 - #define FUTEX_CMP_REQUEUE_PI 12 --#define FUTEX_WAIT_MULTIPLE 13 -+#define FUTEX_WAIT_MULTIPLE 31 - - #define FUTEX_PRIVATE_FLAG 128 - #define FUTEX_CLOCK_REALTIME 256 -diff --git a/kernel/futex.c b/kernel/futex.c -index 6f4bea76df460..03d89fe7b8392 100644 ---- a/kernel/futex.c -+++ b/kernel/futex.c -@@ -4059,7 +4059,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, - return -EINVAL; - - t = timespec64_to_ktime(ts); -- if (cmd == FUTEX_WAIT) -+ if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) - t = ktime_add_safe(ktime_get(), t); - tp = &t; - } -@@ -4260,6 +4260,7 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, - */ - struct compat_futex_wait_block { - compat_uptr_t uaddr; -+ __u32 pad; - __u32 val; - __u32 bitset; - }; -@@ -4322,7 +4323,7 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, - return -EINVAL; - - t = timespec64_to_ktime(ts); -- if (cmd == FUTEX_WAIT) -+ if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) - t = ktime_add_safe(ktime_get(), t); - tp = &t; - } diff --git a/linux59-tkg/linux59-tkg-patches/0008-5.9-bcachefs.patch b/linux59-tkg/linux59-tkg-patches/0008-5.9-bcachefs.patch deleted file mode 100644 index 5e81fb6..0000000 --- a/linux59-tkg/linux59-tkg-patches/0008-5.9-bcachefs.patch +++ /dev/null @@ -1,70821 +0,0 @@ -diff --git a/block/bio.c b/block/bio.c -index e865ea55b9f9..72a65c4113be 100644 ---- a/block/bio.c -+++ b/block/bio.c -@@ -1320,6 +1320,7 @@ void bio_set_pages_dirty(struct bio *bio) - set_page_dirty_lock(bvec->bv_page); - } - } -+EXPORT_SYMBOL_GPL(bio_set_pages_dirty); - - /* - * 
bio_check_pages_dirty() will check that all the BIO's pages are still dirty. -@@ -1379,6 +1380,7 @@ void bio_check_pages_dirty(struct bio *bio) - spin_unlock_irqrestore(&bio_dirty_lock, flags); - schedule_work(&bio_dirty_work); - } -+EXPORT_SYMBOL_GPL(bio_check_pages_dirty); - - static inline bool bio_remaining_done(struct bio *bio) - { -diff --git a/block/blk-core.c b/block/blk-core.c -index 10c08ac50697..d68f24a7ee48 100644 ---- a/block/blk-core.c -+++ b/block/blk-core.c -@@ -213,18 +213,23 @@ int blk_status_to_errno(blk_status_t status) - } - EXPORT_SYMBOL_GPL(blk_status_to_errno); - --static void print_req_error(struct request *req, blk_status_t status, -- const char *caller) -+const char *blk_status_to_str(blk_status_t status) - { - int idx = (__force int)status; - - if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) -- return; -+ return "(invalid error)"; -+ return blk_errors[idx].name; -+} -+EXPORT_SYMBOL_GPL(blk_status_to_str); - -+static void print_req_error(struct request *req, blk_status_t status, -+ const char *caller) -+{ - printk_ratelimited(KERN_ERR - "%s: %s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x " - "phys_seg %u prio class %u\n", -- caller, blk_errors[idx].name, -+ caller, blk_status_to_str(status), - req->rq_disk ? req->rq_disk->disk_name : "?", - blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)), - req->cmd_flags & ~REQ_OP_MASK, -diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig -index d1ca4d059c20..e63646b103c4 100644 ---- a/drivers/md/bcache/Kconfig -+++ b/drivers/md/bcache/Kconfig -@@ -3,6 +3,7 @@ - config BCACHE - tristate "Block device as cache" - select CRC64 -+ select CLOSURES - help - Allows a block device to be used as cache for other devices; uses - a btree for indexing and the layout is optimized for SSDs. -@@ -18,15 +19,6 @@ config BCACHE_DEBUG - Enables extra debugging tools, allows expensive runtime checks to be - turned on. 
- --config BCACHE_CLOSURES_DEBUG -- bool "Debug closures" -- depends on BCACHE -- select DEBUG_FS -- help -- Keeps all active closures in a linked list and provides a debugfs -- interface to list them, which makes it possible to see asynchronous -- operations that get stuck. -- - config BCACHE_ASYNC_REGISTRATION - bool "Asynchronous device registration (EXPERIMENTAL)" - depends on BCACHE -diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile -index 5b87e59676b8..054e8a33a7ab 100644 ---- a/drivers/md/bcache/Makefile -+++ b/drivers/md/bcache/Makefile -@@ -2,6 +2,6 @@ - - obj-$(CONFIG_BCACHE) += bcache.o - --bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\ -- io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ -+bcache-y := alloc.o bset.o btree.o debug.o extents.o io.o\ -+ journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ - util.o writeback.o features.o -diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h -index 4fd03d2496d8..498625095807 100644 ---- a/drivers/md/bcache/bcache.h -+++ b/drivers/md/bcache/bcache.h -@@ -180,6 +180,7 @@ - - #include - #include -+#include - #include - #include - #include -@@ -192,7 +193,6 @@ - - #include "bset.h" - #include "util.h" --#include "closure.h" - - struct bucket { - atomic_t pin; -diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c -deleted file mode 100644 -index 0164a1fe94a9..000000000000 ---- a/drivers/md/bcache/closure.c -+++ /dev/null -@@ -1,217 +0,0 @@ --// SPDX-License-Identifier: GPL-2.0 --/* -- * Asynchronous refcounty things -- * -- * Copyright 2010, 2011 Kent Overstreet -- * Copyright 2012 Google, Inc. 
-- */ -- --#include --#include --#include --#include -- --#include "closure.h" -- --static inline void closure_put_after_sub(struct closure *cl, int flags) --{ -- int r = flags & CLOSURE_REMAINING_MASK; -- -- BUG_ON(flags & CLOSURE_GUARD_MASK); -- BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); -- -- if (!r) { -- if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { -- atomic_set(&cl->remaining, -- CLOSURE_REMAINING_INITIALIZER); -- closure_queue(cl); -- } else { -- struct closure *parent = cl->parent; -- closure_fn *destructor = cl->fn; -- -- closure_debug_destroy(cl); -- -- if (destructor) -- destructor(cl); -- -- if (parent) -- closure_put(parent); -- } -- } --} -- --/* For clearing flags with the same atomic op as a put */ --void closure_sub(struct closure *cl, int v) --{ -- closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); --} -- --/* -- * closure_put - decrement a closure's refcount -- */ --void closure_put(struct closure *cl) --{ -- closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); --} -- --/* -- * closure_wake_up - wake up all closures on a wait list, without memory barrier -- */ --void __closure_wake_up(struct closure_waitlist *wait_list) --{ -- struct llist_node *list; -- struct closure *cl, *t; -- struct llist_node *reverse = NULL; -- -- list = llist_del_all(&wait_list->list); -- -- /* We first reverse the list to preserve FIFO ordering and fairness */ -- reverse = llist_reverse_order(list); -- -- /* Then do the wakeups */ -- llist_for_each_entry_safe(cl, t, reverse, list) { -- closure_set_waiting(cl, 0); -- closure_sub(cl, CLOSURE_WAITING + 1); -- } --} -- --/** -- * closure_wait - add a closure to a waitlist -- * @waitlist: will own a ref on @cl, which will be released when -- * closure_wake_up() is called on @waitlist. -- * @cl: closure pointer. 
-- * -- */ --bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) --{ -- if (atomic_read(&cl->remaining) & CLOSURE_WAITING) -- return false; -- -- closure_set_waiting(cl, _RET_IP_); -- atomic_add(CLOSURE_WAITING + 1, &cl->remaining); -- llist_add(&cl->list, &waitlist->list); -- -- return true; --} -- --struct closure_syncer { -- struct task_struct *task; -- int done; --}; -- --static void closure_sync_fn(struct closure *cl) --{ -- struct closure_syncer *s = cl->s; -- struct task_struct *p; -- -- rcu_read_lock(); -- p = READ_ONCE(s->task); -- s->done = 1; -- wake_up_process(p); -- rcu_read_unlock(); --} -- --void __sched __closure_sync(struct closure *cl) --{ -- struct closure_syncer s = { .task = current }; -- -- cl->s = &s; -- continue_at(cl, closure_sync_fn, NULL); -- -- while (1) { -- set_current_state(TASK_UNINTERRUPTIBLE); -- if (s.done) -- break; -- schedule(); -- } -- -- __set_current_state(TASK_RUNNING); --} -- --#ifdef CONFIG_BCACHE_CLOSURES_DEBUG -- --static LIST_HEAD(closure_list); --static DEFINE_SPINLOCK(closure_list_lock); -- --void closure_debug_create(struct closure *cl) --{ -- unsigned long flags; -- -- BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE); -- cl->magic = CLOSURE_MAGIC_ALIVE; -- -- spin_lock_irqsave(&closure_list_lock, flags); -- list_add(&cl->all, &closure_list); -- spin_unlock_irqrestore(&closure_list_lock, flags); --} -- --void closure_debug_destroy(struct closure *cl) --{ -- unsigned long flags; -- -- BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE); -- cl->magic = CLOSURE_MAGIC_DEAD; -- -- spin_lock_irqsave(&closure_list_lock, flags); -- list_del(&cl->all); -- spin_unlock_irqrestore(&closure_list_lock, flags); --} -- --static struct dentry *closure_debug; -- --static int debug_seq_show(struct seq_file *f, void *data) --{ -- struct closure *cl; -- -- spin_lock_irq(&closure_list_lock); -- -- list_for_each_entry(cl, &closure_list, all) { -- int r = atomic_read(&cl->remaining); -- -- seq_printf(f, "%p: %pS -> %pS p %p r %i ", -- 
cl, (void *) cl->ip, cl->fn, cl->parent, -- r & CLOSURE_REMAINING_MASK); -- -- seq_printf(f, "%s%s\n", -- test_bit(WORK_STRUCT_PENDING_BIT, -- work_data_bits(&cl->work)) ? "Q" : "", -- r & CLOSURE_RUNNING ? "R" : ""); -- -- if (r & CLOSURE_WAITING) -- seq_printf(f, " W %pS\n", -- (void *) cl->waiting_on); -- -- seq_printf(f, "\n"); -- } -- -- spin_unlock_irq(&closure_list_lock); -- return 0; --} -- --static int debug_seq_open(struct inode *inode, struct file *file) --{ -- return single_open(file, debug_seq_show, NULL); --} -- --static const struct file_operations debug_ops = { -- .owner = THIS_MODULE, -- .open = debug_seq_open, -- .read = seq_read, -- .release = single_release --}; -- --void __init closure_debug_init(void) --{ -- if (!IS_ERR_OR_NULL(bcache_debug)) -- /* -- * it is unnecessary to check return value of -- * debugfs_create_file(), we should not care -- * about this. -- */ -- closure_debug = debugfs_create_file( -- "closures", 0400, bcache_debug, NULL, &debug_ops); --} --#endif -- --MODULE_AUTHOR("Kent Overstreet "); --MODULE_LICENSE("GPL"); -diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h -deleted file mode 100644 -index c88cdc4ae4ec..000000000000 ---- a/drivers/md/bcache/closure.h -+++ /dev/null -@@ -1,378 +0,0 @@ --/* SPDX-License-Identifier: GPL-2.0 */ --#ifndef _LINUX_CLOSURE_H --#define _LINUX_CLOSURE_H -- --#include --#include --#include --#include -- --/* -- * Closure is perhaps the most overused and abused term in computer science, but -- * since I've been unable to come up with anything better you're stuck with it -- * again. -- * -- * What are closures? -- * -- * They embed a refcount. The basic idea is they count "things that are in -- * progress" - in flight bios, some other thread that's doing something else - -- * anything you might want to wait on. -- * -- * The refcount may be manipulated with closure_get() and closure_put(). 
-- * closure_put() is where many of the interesting things happen, when it causes -- * the refcount to go to 0. -- * -- * Closures can be used to wait on things both synchronously and asynchronously, -- * and synchronous and asynchronous use can be mixed without restriction. To -- * wait synchronously, use closure_sync() - you will sleep until your closure's -- * refcount hits 1. -- * -- * To wait asynchronously, use -- * continue_at(cl, next_function, workqueue); -- * -- * passing it, as you might expect, the function to run when nothing is pending -- * and the workqueue to run that function out of. -- * -- * continue_at() also, critically, requires a 'return' immediately following the -- * location where this macro is referenced, to return to the calling function. -- * There's good reason for this. -- * -- * To use safely closures asynchronously, they must always have a refcount while -- * they are running owned by the thread that is running them. Otherwise, suppose -- * you submit some bios and wish to have a function run when they all complete: -- * -- * foo_endio(struct bio *bio) -- * { -- * closure_put(cl); -- * } -- * -- * closure_init(cl); -- * -- * do_stuff(); -- * closure_get(cl); -- * bio1->bi_endio = foo_endio; -- * bio_submit(bio1); -- * -- * do_more_stuff(); -- * closure_get(cl); -- * bio2->bi_endio = foo_endio; -- * bio_submit(bio2); -- * -- * continue_at(cl, complete_some_read, system_wq); -- * -- * If closure's refcount started at 0, complete_some_read() could run before the -- * second bio was submitted - which is almost always not what you want! More -- * importantly, it wouldn't be possible to say whether the original thread or -- * complete_some_read()'s thread owned the closure - and whatever state it was -- * associated with! -- * -- * So, closure_init() initializes a closure's refcount to 1 - and when a -- * closure_fn is run, the refcount will be reset to 1 first. 
-- * -- * Then, the rule is - if you got the refcount with closure_get(), release it -- * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount -- * on a closure because you called closure_init() or you were run out of a -- * closure - _always_ use continue_at(). Doing so consistently will help -- * eliminate an entire class of particularly pernicious races. -- * -- * Lastly, you might have a wait list dedicated to a specific event, and have no -- * need for specifying the condition - you just want to wait until someone runs -- * closure_wake_up() on the appropriate wait list. In that case, just use -- * closure_wait(). It will return either true or false, depending on whether the -- * closure was already on a wait list or not - a closure can only be on one wait -- * list at a time. -- * -- * Parents: -- * -- * closure_init() takes two arguments - it takes the closure to initialize, and -- * a (possibly null) parent. -- * -- * If parent is non null, the new closure will have a refcount for its lifetime; -- * a closure is considered to be "finished" when its refcount hits 0 and the -- * function to run is null. Hence -- * -- * continue_at(cl, NULL, NULL); -- * -- * returns up the (spaghetti) stack of closures, precisely like normal return -- * returns up the C stack. continue_at() with non null fn is better thought of -- * as doing a tail call. -- * -- * All this implies that a closure should typically be embedded in a particular -- * struct (which its refcount will normally control the lifetime of), and that -- * struct can very much be thought of as a stack frame. -- */ -- --struct closure; --struct closure_syncer; --typedef void (closure_fn) (struct closure *); --extern struct dentry *bcache_debug; -- --struct closure_waitlist { -- struct llist_head list; --}; -- --enum closure_state { -- /* -- * CLOSURE_WAITING: Set iff the closure is on a waitlist. 
Must be set by -- * the thread that owns the closure, and cleared by the thread that's -- * waking up the closure. -- * -- * The rest are for debugging and don't affect behaviour: -- * -- * CLOSURE_RUNNING: Set when a closure is running (i.e. by -- * closure_init() and when closure_put() runs then next function), and -- * must be cleared before remaining hits 0. Primarily to help guard -- * against incorrect usage and accidentally transferring references. -- * continue_at() and closure_return() clear it for you, if you're doing -- * something unusual you can use closure_set_dead() which also helps -- * annotate where references are being transferred. -- */ -- -- CLOSURE_BITS_START = (1U << 26), -- CLOSURE_DESTRUCTOR = (1U << 26), -- CLOSURE_WAITING = (1U << 28), -- CLOSURE_RUNNING = (1U << 30), --}; -- --#define CLOSURE_GUARD_MASK \ -- ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1) -- --#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) --#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) -- --struct closure { -- union { -- struct { -- struct workqueue_struct *wq; -- struct closure_syncer *s; -- struct llist_node list; -- closure_fn *fn; -- }; -- struct work_struct work; -- }; -- -- struct closure *parent; -- -- atomic_t remaining; -- --#ifdef CONFIG_BCACHE_CLOSURES_DEBUG --#define CLOSURE_MAGIC_DEAD 0xc054dead --#define CLOSURE_MAGIC_ALIVE 0xc054a11e -- -- unsigned int magic; -- struct list_head all; -- unsigned long ip; -- unsigned long waiting_on; --#endif --}; -- --void closure_sub(struct closure *cl, int v); --void closure_put(struct closure *cl); --void __closure_wake_up(struct closure_waitlist *list); --bool closure_wait(struct closure_waitlist *list, struct closure *cl); --void __closure_sync(struct closure *cl); -- --/** -- * closure_sync - sleep until a closure a closure has nothing left to wait on -- * -- * Sleeps until the refcount hits 1 - the thread that's running the closure owns -- * the last refcount. 
-- */ --static inline void closure_sync(struct closure *cl) --{ -- if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1) -- __closure_sync(cl); --} -- --#ifdef CONFIG_BCACHE_CLOSURES_DEBUG -- --void closure_debug_init(void); --void closure_debug_create(struct closure *cl); --void closure_debug_destroy(struct closure *cl); -- --#else -- --static inline void closure_debug_init(void) {} --static inline void closure_debug_create(struct closure *cl) {} --static inline void closure_debug_destroy(struct closure *cl) {} -- --#endif -- --static inline void closure_set_ip(struct closure *cl) --{ --#ifdef CONFIG_BCACHE_CLOSURES_DEBUG -- cl->ip = _THIS_IP_; --#endif --} -- --static inline void closure_set_ret_ip(struct closure *cl) --{ --#ifdef CONFIG_BCACHE_CLOSURES_DEBUG -- cl->ip = _RET_IP_; --#endif --} -- --static inline void closure_set_waiting(struct closure *cl, unsigned long f) --{ --#ifdef CONFIG_BCACHE_CLOSURES_DEBUG -- cl->waiting_on = f; --#endif --} -- --static inline void closure_set_stopped(struct closure *cl) --{ -- atomic_sub(CLOSURE_RUNNING, &cl->remaining); --} -- --static inline void set_closure_fn(struct closure *cl, closure_fn *fn, -- struct workqueue_struct *wq) --{ -- closure_set_ip(cl); -- cl->fn = fn; -- cl->wq = wq; -- /* between atomic_dec() in closure_put() */ -- smp_mb__before_atomic(); --} -- --static inline void closure_queue(struct closure *cl) --{ -- struct workqueue_struct *wq = cl->wq; -- /** -- * Changes made to closure, work_struct, or a couple of other structs -- * may cause work.func not pointing to the right location. 
-- */ -- BUILD_BUG_ON(offsetof(struct closure, fn) -- != offsetof(struct work_struct, func)); -- if (wq) { -- INIT_WORK(&cl->work, cl->work.func); -- BUG_ON(!queue_work(wq, &cl->work)); -- } else -- cl->fn(cl); --} -- --/** -- * closure_get - increment a closure's refcount -- */ --static inline void closure_get(struct closure *cl) --{ --#ifdef CONFIG_BCACHE_CLOSURES_DEBUG -- BUG_ON((atomic_inc_return(&cl->remaining) & -- CLOSURE_REMAINING_MASK) <= 1); --#else -- atomic_inc(&cl->remaining); --#endif --} -- --/** -- * closure_init - Initialize a closure, setting the refcount to 1 -- * @cl: closure to initialize -- * @parent: parent of the new closure. cl will take a refcount on it for its -- * lifetime; may be NULL. -- */ --static inline void closure_init(struct closure *cl, struct closure *parent) --{ -- memset(cl, 0, sizeof(struct closure)); -- cl->parent = parent; -- if (parent) -- closure_get(parent); -- -- atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); -- -- closure_debug_create(cl); -- closure_set_ip(cl); --} -- --static inline void closure_init_stack(struct closure *cl) --{ -- memset(cl, 0, sizeof(struct closure)); -- atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); --} -- --/** -- * closure_wake_up - wake up all closures on a wait list, -- * with memory barrier -- */ --static inline void closure_wake_up(struct closure_waitlist *list) --{ -- /* Memory barrier for the wait list */ -- smp_mb(); -- __closure_wake_up(list); --} -- --/** -- * continue_at - jump to another function with barrier -- * -- * After @cl is no longer waiting on anything (i.e. all outstanding refs have -- * been dropped with closure_put()), it will resume execution at @fn running out -- * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly). 
-- * -- * This is because after calling continue_at() you no longer have a ref on @cl, -- * and whatever @cl owns may be freed out from under you - a running closure fn -- * has a ref on its own closure which continue_at() drops. -- * -- * Note you are expected to immediately return after using this macro. -- */ --#define continue_at(_cl, _fn, _wq) \ --do { \ -- set_closure_fn(_cl, _fn, _wq); \ -- closure_sub(_cl, CLOSURE_RUNNING + 1); \ --} while (0) -- --/** -- * closure_return - finish execution of a closure -- * -- * This is used to indicate that @cl is finished: when all outstanding refs on -- * @cl have been dropped @cl's ref on its parent closure (as passed to -- * closure_init()) will be dropped, if one was specified - thus this can be -- * thought of as returning to the parent closure. -- */ --#define closure_return(_cl) continue_at((_cl), NULL, NULL) -- --/** -- * continue_at_nobarrier - jump to another function without barrier -- * -- * Causes @fn to be executed out of @cl, in @wq context (or called directly if -- * @wq is NULL). -- * -- * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn, -- * thus it's not safe to touch anything protected by @cl after a -- * continue_at_nobarrier(). -- */ --#define continue_at_nobarrier(_cl, _fn, _wq) \ --do { \ -- set_closure_fn(_cl, _fn, _wq); \ -- closure_queue(_cl); \ --} while (0) -- --/** -- * closure_return_with_destructor - finish execution of a closure, -- * with destructor -- * -- * Works like closure_return(), except @destructor will be called when all -- * outstanding refs on @cl have been dropped; @destructor may be used to safely -- * free the memory occupied by @cl, and it is called with the ref on the parent -- * closure still held - so @destructor could safely return an item to a -- * freelist protected by @cl's parent. 
-- */ --#define closure_return_with_destructor(_cl, _destructor) \ --do { \ -- set_closure_fn(_cl, _destructor, NULL); \ -- closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \ --} while (0) -- --/** -- * closure_call - execute @fn out of a new, uninitialized closure -- * -- * Typically used when running out of one closure, and we want to run @fn -- * asynchronously out of a new closure - @parent will then wait for @cl to -- * finish. -- */ --static inline void closure_call(struct closure *cl, closure_fn fn, -- struct workqueue_struct *wq, -- struct closure *parent) --{ -- closure_init(cl, parent); -- continue_at_nobarrier(cl, fn, wq); --} -- --#endif /* _LINUX_CLOSURE_H */ -diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c -index 1bbdc410ee3c..3b9e991ea475 100644 ---- a/drivers/md/bcache/super.c -+++ b/drivers/md/bcache/super.c -@@ -2925,7 +2925,6 @@ static int __init bcache_init(void) - goto err; - - bch_debug_init(); -- closure_debug_init(); - - bcache_is_reboot = false; - -diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h -index c029f7443190..59093f9f1793 100644 ---- a/drivers/md/bcache/util.h -+++ b/drivers/md/bcache/util.h -@@ -4,6 +4,7 @@ - #define _BCACHE_UTIL_H - - #include -+#include - #include - #include - #include -@@ -13,8 +14,6 @@ - #include - #include - --#include "closure.h" -- - #define PAGE_SECTORS (PAGE_SIZE / 512) - - struct closure; -diff --git a/fs/Kconfig b/fs/Kconfig -index aa4c12282301..88082e3663cb 100644 ---- a/fs/Kconfig -+++ b/fs/Kconfig -@@ -40,6 +40,7 @@ source "fs/ocfs2/Kconfig" - source "fs/btrfs/Kconfig" - source "fs/nilfs2/Kconfig" - source "fs/f2fs/Kconfig" -+source "fs/bcachefs/Kconfig" - source "fs/zonefs/Kconfig" - - config FS_DAX -diff --git a/fs/Makefile b/fs/Makefile -index 1c7b0e3f6daa..8afa8e3bc14f 100644 ---- a/fs/Makefile -+++ b/fs/Makefile -@@ -130,6 +130,7 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/ - obj-$(CONFIG_BTRFS_FS) += btrfs/ - obj-$(CONFIG_GFS2_FS) += gfs2/ - 
obj-$(CONFIG_F2FS_FS) += f2fs/ -+obj-$(CONFIG_BCACHEFS_FS) += bcachefs/ - obj-$(CONFIG_CEPH_FS) += ceph/ - obj-$(CONFIG_PSTORE) += pstore/ - obj-$(CONFIG_EFIVAR_FS) += efivarfs/ -diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig -new file mode 100644 -index 000000000000..5594af719b2a ---- /dev/null -+++ b/fs/bcachefs/Kconfig -@@ -0,0 +1,50 @@ -+ -+config BCACHEFS_FS -+ tristate "bcachefs filesystem support" -+ depends on BLOCK -+ select EXPORTFS -+ select CLOSURES -+ select LIBCRC32C -+ select CRC64 -+ select FS_POSIX_ACL -+ select LZ4_COMPRESS -+ select LZ4_DECOMPRESS -+ select ZLIB_DEFLATE -+ select ZLIB_INFLATE -+ select ZSTD_COMPRESS -+ select ZSTD_DECOMPRESS -+ select CRYPTO_SHA256 -+ select CRYPTO_CHACHA20 -+ select CRYPTO_POLY1305 -+ select KEYS -+ select SIXLOCKS -+ select RAID6_PQ -+ select XOR_BLOCKS -+ help -+ The bcachefs filesystem - a modern, copy on write filesystem, with -+ support for multiple devices, compression, checksumming, etc. -+ -+config BCACHEFS_QUOTA -+ bool "bcachefs quota support" -+ depends on BCACHEFS_FS -+ select QUOTACTL -+ -+config BCACHEFS_POSIX_ACL -+ bool "bcachefs POSIX ACL support" -+ depends on BCACHEFS_FS -+ select FS_POSIX_ACL -+ -+config BCACHEFS_DEBUG -+ bool "bcachefs debugging" -+ depends on BCACHEFS_FS -+ help -+ Enables many extra debugging checks and assertions. -+ -+ The resulting code will be significantly slower than normal; you -+ probably shouldn't select this option unless you're a developer. 
-+ -+config BCACHEFS_TESTS -+ bool "bcachefs unit and performance tests" -+ depends on BCACHEFS_FS -+ help -+ Include some unit and performance tests for the core btree code -diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile -new file mode 100644 -index 000000000000..d85ced62c0dd ---- /dev/null -+++ b/fs/bcachefs/Makefile -@@ -0,0 +1,59 @@ -+ -+obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o -+ -+bcachefs-y := \ -+ acl.o \ -+ alloc_background.o \ -+ alloc_foreground.o \ -+ bkey.o \ -+ bkey_methods.o \ -+ bkey_sort.o \ -+ bset.o \ -+ btree_cache.o \ -+ btree_gc.o \ -+ btree_io.o \ -+ btree_iter.o \ -+ btree_key_cache.o \ -+ btree_update_interior.o \ -+ btree_update_leaf.o \ -+ buckets.o \ -+ chardev.o \ -+ checksum.o \ -+ clock.o \ -+ compress.o \ -+ debug.o \ -+ dirent.o \ -+ disk_groups.o \ -+ ec.o \ -+ error.o \ -+ extents.o \ -+ extent_update.o \ -+ fs.o \ -+ fs-common.o \ -+ fs-ioctl.o \ -+ fs-io.o \ -+ fsck.o \ -+ inode.o \ -+ io.o \ -+ journal.o \ -+ journal_io.o \ -+ journal_reclaim.o \ -+ journal_seq_blacklist.o \ -+ keylist.o \ -+ migrate.o \ -+ move.o \ -+ movinggc.o \ -+ opts.o \ -+ quota.o \ -+ rebalance.o \ -+ recovery.o \ -+ reflink.o \ -+ replicas.o \ -+ siphash.o \ -+ super.o \ -+ super-io.o \ -+ sysfs.o \ -+ tests.o \ -+ trace.o \ -+ util.o \ -+ xattr.o -diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c -new file mode 100644 -index 000000000000..76c98ddbf628 ---- /dev/null -+++ b/fs/bcachefs/acl.c -@@ -0,0 +1,388 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#ifdef CONFIG_BCACHEFS_POSIX_ACL -+ -+#include "bcachefs.h" -+ -+#include -+#include -+#include -+#include -+#include -+ -+#include "acl.h" -+#include "fs.h" -+#include "xattr.h" -+ -+static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long) -+{ -+ return sizeof(bch_acl_header) + -+ sizeof(bch_acl_entry_short) * nr_short + -+ sizeof(bch_acl_entry) * nr_long; -+} -+ -+static inline int acl_to_xattr_type(int type) -+{ -+ switch (type) { -+ case ACL_TYPE_ACCESS: -+ return 
KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS; -+ case ACL_TYPE_DEFAULT: -+ return KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT; -+ default: -+ BUG(); -+ } -+} -+ -+/* -+ * Convert from filesystem to in-memory representation. -+ */ -+static struct posix_acl *bch2_acl_from_disk(const void *value, size_t size) -+{ -+ const void *p, *end = value + size; -+ struct posix_acl *acl; -+ struct posix_acl_entry *out; -+ unsigned count = 0; -+ -+ if (!value) -+ return NULL; -+ if (size < sizeof(bch_acl_header)) -+ goto invalid; -+ if (((bch_acl_header *)value)->a_version != -+ cpu_to_le32(BCH_ACL_VERSION)) -+ goto invalid; -+ -+ p = value + sizeof(bch_acl_header); -+ while (p < end) { -+ const bch_acl_entry *entry = p; -+ -+ if (p + sizeof(bch_acl_entry_short) > end) -+ goto invalid; -+ -+ switch (le16_to_cpu(entry->e_tag)) { -+ case ACL_USER_OBJ: -+ case ACL_GROUP_OBJ: -+ case ACL_MASK: -+ case ACL_OTHER: -+ p += sizeof(bch_acl_entry_short); -+ break; -+ case ACL_USER: -+ case ACL_GROUP: -+ p += sizeof(bch_acl_entry); -+ break; -+ default: -+ goto invalid; -+ } -+ -+ count++; -+ } -+ -+ if (p > end) -+ goto invalid; -+ -+ if (!count) -+ return NULL; -+ -+ acl = posix_acl_alloc(count, GFP_KERNEL); -+ if (!acl) -+ return ERR_PTR(-ENOMEM); -+ -+ out = acl->a_entries; -+ -+ p = value + sizeof(bch_acl_header); -+ while (p < end) { -+ const bch_acl_entry *in = p; -+ -+ out->e_tag = le16_to_cpu(in->e_tag); -+ out->e_perm = le16_to_cpu(in->e_perm); -+ -+ switch (out->e_tag) { -+ case ACL_USER_OBJ: -+ case ACL_GROUP_OBJ: -+ case ACL_MASK: -+ case ACL_OTHER: -+ p += sizeof(bch_acl_entry_short); -+ break; -+ case ACL_USER: -+ out->e_uid = make_kuid(&init_user_ns, -+ le32_to_cpu(in->e_id)); -+ p += sizeof(bch_acl_entry); -+ break; -+ case ACL_GROUP: -+ out->e_gid = make_kgid(&init_user_ns, -+ le32_to_cpu(in->e_id)); -+ p += sizeof(bch_acl_entry); -+ break; -+ } -+ -+ out++; -+ } -+ -+ BUG_ON(out != acl->a_entries + acl->a_count); -+ -+ return acl; -+invalid: -+ pr_err("invalid acl entry"); -+ return 
ERR_PTR(-EINVAL); -+} -+ -+#define acl_for_each_entry(acl, acl_e) \ -+ for (acl_e = acl->a_entries; \ -+ acl_e < acl->a_entries + acl->a_count; \ -+ acl_e++) -+ -+/* -+ * Convert from in-memory to filesystem representation. -+ */ -+static struct bkey_i_xattr * -+bch2_acl_to_xattr(struct btree_trans *trans, -+ const struct posix_acl *acl, -+ int type) -+{ -+ struct bkey_i_xattr *xattr; -+ bch_acl_header *acl_header; -+ const struct posix_acl_entry *acl_e; -+ void *outptr; -+ unsigned nr_short = 0, nr_long = 0, acl_len, u64s; -+ -+ acl_for_each_entry(acl, acl_e) { -+ switch (acl_e->e_tag) { -+ case ACL_USER: -+ case ACL_GROUP: -+ nr_long++; -+ break; -+ case ACL_USER_OBJ: -+ case ACL_GROUP_OBJ: -+ case ACL_MASK: -+ case ACL_OTHER: -+ nr_short++; -+ break; -+ default: -+ return ERR_PTR(-EINVAL); -+ } -+ } -+ -+ acl_len = bch2_acl_size(nr_short, nr_long); -+ u64s = BKEY_U64s + xattr_val_u64s(0, acl_len); -+ -+ if (u64s > U8_MAX) -+ return ERR_PTR(-E2BIG); -+ -+ xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); -+ if (IS_ERR(xattr)) -+ return xattr; -+ -+ bkey_xattr_init(&xattr->k_i); -+ xattr->k.u64s = u64s; -+ xattr->v.x_type = acl_to_xattr_type(type); -+ xattr->v.x_name_len = 0, -+ xattr->v.x_val_len = cpu_to_le16(acl_len); -+ -+ acl_header = xattr_val(&xattr->v); -+ acl_header->a_version = cpu_to_le32(BCH_ACL_VERSION); -+ -+ outptr = (void *) acl_header + sizeof(*acl_header); -+ -+ acl_for_each_entry(acl, acl_e) { -+ bch_acl_entry *entry = outptr; -+ -+ entry->e_tag = cpu_to_le16(acl_e->e_tag); -+ entry->e_perm = cpu_to_le16(acl_e->e_perm); -+ switch (acl_e->e_tag) { -+ case ACL_USER: -+ entry->e_id = cpu_to_le32( -+ from_kuid(&init_user_ns, acl_e->e_uid)); -+ outptr += sizeof(bch_acl_entry); -+ break; -+ case ACL_GROUP: -+ entry->e_id = cpu_to_le32( -+ from_kgid(&init_user_ns, acl_e->e_gid)); -+ outptr += sizeof(bch_acl_entry); -+ break; -+ -+ case ACL_USER_OBJ: -+ case ACL_GROUP_OBJ: -+ case ACL_MASK: -+ case ACL_OTHER: -+ outptr += 
sizeof(bch_acl_entry_short); -+ break; -+ } -+ } -+ -+ BUG_ON(outptr != xattr_val(&xattr->v) + acl_len); -+ -+ return xattr; -+} -+ -+struct posix_acl *bch2_get_acl(struct inode *vinode, int type) -+{ -+ struct bch_inode_info *inode = to_bch_ei(vinode); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c_xattr xattr; -+ struct posix_acl *acl = NULL; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ -+ iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, -+ &inode->ei_str_hash, inode->v.i_ino, -+ &X_SEARCH(acl_to_xattr_type(type), "", 0), -+ 0); -+ if (IS_ERR(iter)) { -+ if (PTR_ERR(iter) == -EINTR) -+ goto retry; -+ -+ if (PTR_ERR(iter) != -ENOENT) -+ acl = ERR_CAST(iter); -+ goto out; -+ } -+ -+ xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); -+ -+ acl = bch2_acl_from_disk(xattr_val(xattr.v), -+ le16_to_cpu(xattr.v->x_val_len)); -+ -+ if (!IS_ERR(acl)) -+ set_cached_acl(&inode->v, type, acl); -+out: -+ bch2_trans_exit(&trans); -+ return acl; -+} -+ -+int bch2_set_acl_trans(struct btree_trans *trans, -+ struct bch_inode_unpacked *inode_u, -+ const struct bch_hash_info *hash_info, -+ struct posix_acl *acl, int type) -+{ -+ int ret; -+ -+ if (type == ACL_TYPE_DEFAULT && -+ !S_ISDIR(inode_u->bi_mode)) -+ return acl ? -EACCES : 0; -+ -+ if (acl) { -+ struct bkey_i_xattr *xattr = -+ bch2_acl_to_xattr(trans, acl, type); -+ if (IS_ERR(xattr)) -+ return PTR_ERR(xattr); -+ -+ ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, -+ inode_u->bi_inum, &xattr->k_i, 0); -+ } else { -+ struct xattr_search_key search = -+ X_SEARCH(acl_to_xattr_type(type), "", 0); -+ -+ ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, hash_info, -+ inode_u->bi_inum, &search); -+ } -+ -+ return ret == -ENOENT ? 
0 : ret; -+} -+ -+int bch2_set_acl(struct inode *vinode, struct posix_acl *_acl, int type) -+{ -+ struct bch_inode_info *inode = to_bch_ei(vinode); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct btree_trans trans; -+ struct btree_iter *inode_iter; -+ struct bch_inode_unpacked inode_u; -+ struct posix_acl *acl; -+ umode_t mode; -+ int ret; -+ -+ mutex_lock(&inode->ei_update_lock); -+ bch2_trans_init(&trans, c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ acl = _acl; -+ -+ inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, -+ BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(inode_iter); -+ if (ret) -+ goto btree_err; -+ -+ mode = inode_u.bi_mode; -+ -+ if (type == ACL_TYPE_ACCESS) { -+ ret = posix_acl_update_mode(&inode->v, &mode, &acl); -+ if (ret) -+ goto err; -+ } -+ -+ ret = bch2_set_acl_trans(&trans, &inode_u, -+ &inode->ei_str_hash, -+ acl, type); -+ if (ret) -+ goto btree_err; -+ -+ inode_u.bi_ctime = bch2_current_time(c); -+ inode_u.bi_mode = mode; -+ -+ ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?: -+ bch2_trans_commit(&trans, NULL, -+ &inode->ei_journal_seq, -+ BTREE_INSERT_NOUNLOCK); -+btree_err: -+ if (ret == -EINTR) -+ goto retry; -+ if (unlikely(ret)) -+ goto err; -+ -+ bch2_inode_update_after_write(c, inode, &inode_u, -+ ATTR_CTIME|ATTR_MODE); -+ -+ set_cached_acl(&inode->v, type, acl); -+err: -+ bch2_trans_exit(&trans); -+ mutex_unlock(&inode->ei_update_lock); -+ -+ return ret; -+} -+ -+int bch2_acl_chmod(struct btree_trans *trans, -+ struct bch_inode_info *inode, -+ umode_t mode, -+ struct posix_acl **new_acl) -+{ -+ struct btree_iter *iter; -+ struct bkey_s_c_xattr xattr; -+ struct bkey_i_xattr *new; -+ struct posix_acl *acl; -+ int ret = 0; -+ -+ iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc, -+ &inode->ei_str_hash, inode->v.i_ino, -+ &X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0), -+ BTREE_ITER_INTENT); -+ if (IS_ERR(iter)) -+ return PTR_ERR(iter) != -ENOENT ? 
PTR_ERR(iter) : 0; -+ -+ xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); -+ -+ acl = bch2_acl_from_disk(xattr_val(xattr.v), -+ le16_to_cpu(xattr.v->x_val_len)); -+ if (IS_ERR_OR_NULL(acl)) -+ return PTR_ERR(acl); -+ -+ ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode); -+ if (ret) -+ goto err; -+ -+ new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS); -+ if (IS_ERR(new)) { -+ ret = PTR_ERR(new); -+ goto err; -+ } -+ -+ new->k.p = iter->pos; -+ bch2_trans_update(trans, iter, &new->k_i, 0); -+ *new_acl = acl; -+ acl = NULL; -+err: -+ kfree(acl); -+ return ret; -+} -+ -+#endif /* CONFIG_BCACHEFS_POSIX_ACL */ -diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h -new file mode 100644 -index 000000000000..cb62d502a7ff ---- /dev/null -+++ b/fs/bcachefs/acl.h -@@ -0,0 +1,59 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_ACL_H -+#define _BCACHEFS_ACL_H -+ -+struct bch_inode_unpacked; -+struct bch_hash_info; -+struct bch_inode_info; -+struct posix_acl; -+ -+#ifdef CONFIG_BCACHEFS_POSIX_ACL -+ -+#define BCH_ACL_VERSION 0x0001 -+ -+typedef struct { -+ __le16 e_tag; -+ __le16 e_perm; -+ __le32 e_id; -+} bch_acl_entry; -+ -+typedef struct { -+ __le16 e_tag; -+ __le16 e_perm; -+} bch_acl_entry_short; -+ -+typedef struct { -+ __le32 a_version; -+} bch_acl_header; -+ -+struct posix_acl *bch2_get_acl(struct inode *, int); -+ -+int bch2_set_acl_trans(struct btree_trans *, -+ struct bch_inode_unpacked *, -+ const struct bch_hash_info *, -+ struct posix_acl *, int); -+int bch2_set_acl(struct inode *, struct posix_acl *, int); -+int bch2_acl_chmod(struct btree_trans *, struct bch_inode_info *, -+ umode_t, struct posix_acl **); -+ -+#else -+ -+static inline int bch2_set_acl_trans(struct btree_trans *trans, -+ struct bch_inode_unpacked *inode_u, -+ const struct bch_hash_info *hash_info, -+ struct posix_acl *acl, int type) -+{ -+ return 0; -+} -+ -+static inline int bch2_acl_chmod(struct btree_trans *trans, -+ struct bch_inode_info *inode, -+ umode_t mode, -+ 
struct posix_acl **new_acl) -+{ -+ return 0; -+} -+ -+#endif /* CONFIG_BCACHEFS_POSIX_ACL */ -+ -+#endif /* _BCACHEFS_ACL_H */ -diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c -new file mode 100644 -index 000000000000..97508de9f721 ---- /dev/null -+++ b/fs/bcachefs/alloc_background.c -@@ -0,0 +1,1477 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "alloc_background.h" -+#include "alloc_foreground.h" -+#include "btree_cache.h" -+#include "btree_io.h" -+#include "btree_key_cache.h" -+#include "btree_update.h" -+#include "btree_update_interior.h" -+#include "btree_gc.h" -+#include "buckets.h" -+#include "clock.h" -+#include "debug.h" -+#include "ec.h" -+#include "error.h" -+#include "recovery.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+static const char * const bch2_alloc_field_names[] = { -+#define x(name, bytes) #name, -+ BCH_ALLOC_FIELDS() -+#undef x -+ NULL -+}; -+ -+static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int); -+ -+/* Ratelimiting/PD controllers */ -+ -+static void pd_controllers_update(struct work_struct *work) -+{ -+ struct bch_fs *c = container_of(to_delayed_work(work), -+ struct bch_fs, -+ pd_controllers_update); -+ struct bch_dev *ca; -+ s64 free = 0, fragmented = 0; -+ unsigned i; -+ -+ for_each_member_device(ca, c, i) { -+ struct bch_dev_usage stats = bch2_dev_usage_read(ca); -+ -+ free += bucket_to_sector(ca, -+ __dev_buckets_free(ca, stats)) << 9; -+ /* -+ * Bytes of internal fragmentation, which can be -+ * reclaimed by copy GC -+ */ -+ fragmented += max_t(s64, 0, (bucket_to_sector(ca, -+ stats.buckets[BCH_DATA_user] + -+ stats.buckets[BCH_DATA_cached]) - -+ (stats.sectors[BCH_DATA_user] + -+ stats.sectors[BCH_DATA_cached])) << 9); -+ } -+ -+ bch2_pd_controller_update(&c->copygc_pd, free, fragmented, -1); -+ schedule_delayed_work(&c->pd_controllers_update, -+ c->pd_controllers_update_seconds * HZ); -+} -+ -+/* 
Persistent alloc info: */ -+ -+static inline u64 get_alloc_field(const struct bch_alloc *a, -+ const void **p, unsigned field) -+{ -+ unsigned bytes = BCH_ALLOC_FIELD_BYTES[field]; -+ u64 v; -+ -+ if (!(a->fields & (1 << field))) -+ return 0; -+ -+ switch (bytes) { -+ case 1: -+ v = *((const u8 *) *p); -+ break; -+ case 2: -+ v = le16_to_cpup(*p); -+ break; -+ case 4: -+ v = le32_to_cpup(*p); -+ break; -+ case 8: -+ v = le64_to_cpup(*p); -+ break; -+ default: -+ BUG(); -+ } -+ -+ *p += bytes; -+ return v; -+} -+ -+static inline void put_alloc_field(struct bkey_i_alloc *a, void **p, -+ unsigned field, u64 v) -+{ -+ unsigned bytes = BCH_ALLOC_FIELD_BYTES[field]; -+ -+ if (!v) -+ return; -+ -+ a->v.fields |= 1 << field; -+ -+ switch (bytes) { -+ case 1: -+ *((u8 *) *p) = v; -+ break; -+ case 2: -+ *((__le16 *) *p) = cpu_to_le16(v); -+ break; -+ case 4: -+ *((__le32 *) *p) = cpu_to_le32(v); -+ break; -+ case 8: -+ *((__le64 *) *p) = cpu_to_le64(v); -+ break; -+ default: -+ BUG(); -+ } -+ -+ *p += bytes; -+} -+ -+struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) -+{ -+ struct bkey_alloc_unpacked ret = { .gen = 0 }; -+ -+ if (k.k->type == KEY_TYPE_alloc) { -+ const struct bch_alloc *a = bkey_s_c_to_alloc(k).v; -+ const void *d = a->data; -+ unsigned idx = 0; -+ -+ ret.gen = a->gen; -+ -+#define x(_name, _bits) ret._name = get_alloc_field(a, &d, idx++); -+ BCH_ALLOC_FIELDS() -+#undef x -+ } -+ return ret; -+} -+ -+void bch2_alloc_pack(struct bkey_i_alloc *dst, -+ const struct bkey_alloc_unpacked src) -+{ -+ unsigned idx = 0; -+ void *d = dst->v.data; -+ unsigned bytes; -+ -+ dst->v.fields = 0; -+ dst->v.gen = src.gen; -+ -+#define x(_name, _bits) put_alloc_field(dst, &d, idx++, src._name); -+ BCH_ALLOC_FIELDS() -+#undef x -+ -+ bytes = (void *) d - (void *) &dst->v; -+ set_bkey_val_bytes(&dst->k, bytes); -+ memset_u64s_tail(&dst->v, 0, bytes); -+} -+ -+static unsigned bch_alloc_val_u64s(const struct bch_alloc *a) -+{ -+ unsigned i, bytes = offsetof(struct 
bch_alloc, data); -+ -+ for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_FIELD_BYTES); i++) -+ if (a->fields & (1 << i)) -+ bytes += BCH_ALLOC_FIELD_BYTES[i]; -+ -+ return DIV_ROUND_UP(bytes, sizeof(u64)); -+} -+ -+const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); -+ -+ if (k.k->p.inode >= c->sb.nr_devices || -+ !c->devs[k.k->p.inode]) -+ return "invalid device"; -+ -+ /* allow for unknown fields */ -+ if (bkey_val_u64s(a.k) < bch_alloc_val_u64s(a.v)) -+ return "incorrect value size"; -+ -+ return NULL; -+} -+ -+void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); -+ const void *d = a.v->data; -+ unsigned i; -+ -+ pr_buf(out, "gen %u", a.v->gen); -+ -+ for (i = 0; i < BCH_ALLOC_FIELD_NR; i++) -+ if (a.v->fields & (1 << i)) -+ pr_buf(out, " %s %llu", -+ bch2_alloc_field_names[i], -+ get_alloc_field(a.v, &d, i)); -+} -+ -+static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id, -+ unsigned level, struct bkey_s_c k) -+{ -+ struct bch_dev *ca; -+ struct bucket *g; -+ struct bkey_alloc_unpacked u; -+ -+ if (level || k.k->type != KEY_TYPE_alloc) -+ return 0; -+ -+ ca = bch_dev_bkey_exists(c, k.k->p.inode); -+ g = __bucket(ca, k.k->p.offset, 0); -+ u = bch2_alloc_unpack(k); -+ -+ g->_mark.gen = u.gen; -+ g->_mark.data_type = u.data_type; -+ g->_mark.dirty_sectors = u.dirty_sectors; -+ g->_mark.cached_sectors = u.cached_sectors; -+ g->io_time[READ] = u.read_time; -+ g->io_time[WRITE] = u.write_time; -+ g->oldest_gen = u.oldest_gen; -+ g->gen_valid = 1; -+ -+ return 0; -+} -+ -+int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) -+{ -+ struct bch_dev *ca; -+ unsigned i; -+ int ret = 0; -+ -+ down_read(&c->gc_lock); -+ ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_ALLOC, -+ NULL, bch2_alloc_read_fn); -+ up_read(&c->gc_lock); -+ -+ if (ret) { -+ bch_err(c, "error reading alloc 
info: %i", ret); -+ return ret; -+ } -+ -+ percpu_down_write(&c->mark_lock); -+ bch2_dev_usage_from_buckets(c); -+ percpu_up_write(&c->mark_lock); -+ -+ mutex_lock(&c->bucket_clock[READ].lock); -+ for_each_member_device(ca, c, i) { -+ down_read(&ca->bucket_lock); -+ bch2_recalc_oldest_io(c, ca, READ); -+ up_read(&ca->bucket_lock); -+ } -+ mutex_unlock(&c->bucket_clock[READ].lock); -+ -+ mutex_lock(&c->bucket_clock[WRITE].lock); -+ for_each_member_device(ca, c, i) { -+ down_read(&ca->bucket_lock); -+ bch2_recalc_oldest_io(c, ca, WRITE); -+ up_read(&ca->bucket_lock); -+ } -+ mutex_unlock(&c->bucket_clock[WRITE].lock); -+ -+ return 0; -+} -+ -+static int bch2_alloc_write_key(struct btree_trans *trans, -+ struct btree_iter *iter, -+ unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_s_c k; -+ struct bch_dev *ca; -+ struct bucket_array *ba; -+ struct bucket *g; -+ struct bucket_mark m; -+ struct bkey_alloc_unpacked old_u, new_u; -+ __BKEY_PADDED(k, 8) alloc_key; /* hack: */ -+ struct bkey_i_alloc *a; -+ int ret; -+retry: -+ bch2_trans_begin(trans); -+ -+ ret = bch2_btree_key_cache_flush(trans, -+ BTREE_ID_ALLOC, iter->pos); -+ if (ret) -+ goto err; -+ -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ old_u = bch2_alloc_unpack(k); -+ -+ percpu_down_read(&c->mark_lock); -+ ca = bch_dev_bkey_exists(c, iter->pos.inode); -+ ba = bucket_array(ca); -+ -+ g = &ba->b[iter->pos.offset]; -+ m = READ_ONCE(g->mark); -+ new_u = alloc_mem_to_key(g, m); -+ percpu_up_read(&c->mark_lock); -+ -+ if (!bkey_alloc_unpacked_cmp(old_u, new_u)) -+ return 0; -+ -+ a = bkey_alloc_init(&alloc_key.k); -+ a->k.p = iter->pos; -+ bch2_alloc_pack(a, new_u); -+ -+ bch2_trans_update(trans, iter, &a->k_i, -+ BTREE_TRIGGER_NORUN); -+ ret = bch2_trans_commit(trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_USE_RESERVE| -+ flags); -+err: -+ if (ret == -EINTR) -+ goto retry; -+ return ret; -+} -+ -+int bch2_dev_alloc_write(struct bch_fs *c, 
struct bch_dev *ca, unsigned flags) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ u64 first_bucket, nbuckets; -+ int ret = 0; -+ -+ percpu_down_read(&c->mark_lock); -+ first_bucket = bucket_array(ca)->first_bucket; -+ nbuckets = bucket_array(ca)->nbuckets; -+ percpu_up_read(&c->mark_lock); -+ -+ BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, -+ POS(ca->dev_idx, first_bucket), -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); -+ -+ while (iter->pos.offset < nbuckets) { -+ bch2_trans_cond_resched(&trans); -+ -+ ret = bch2_alloc_write_key(&trans, iter, flags); -+ if (ret) -+ break; -+ bch2_btree_iter_next_slot(iter); -+ } -+ -+ bch2_trans_exit(&trans); -+ -+ return ret; -+} -+ -+int bch2_alloc_write(struct bch_fs *c, unsigned flags) -+{ -+ struct bch_dev *ca; -+ unsigned i; -+ int ret = 0; -+ -+ for_each_rw_member(ca, c, i) { -+ bch2_dev_alloc_write(c, ca, flags); -+ if (ret) { -+ percpu_ref_put(&ca->io_ref); -+ break; -+ } -+ } -+ -+ return ret; -+} -+ -+/* Bucket IO clocks: */ -+ -+static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw) -+{ -+ struct bucket_clock *clock = &c->bucket_clock[rw]; -+ struct bucket_array *buckets = bucket_array(ca); -+ struct bucket *g; -+ u16 max_last_io = 0; -+ unsigned i; -+ -+ lockdep_assert_held(&c->bucket_clock[rw].lock); -+ -+ /* Recalculate max_last_io for this device: */ -+ for_each_bucket(g, buckets) -+ max_last_io = max(max_last_io, bucket_last_io(c, g, rw)); -+ -+ ca->max_last_bucket_io[rw] = max_last_io; -+ -+ /* Recalculate global max_last_io: */ -+ max_last_io = 0; -+ -+ for_each_member_device(ca, c, i) -+ max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]); -+ -+ clock->max_last_io = max_last_io; -+} -+ -+static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw) -+{ -+ struct bucket_clock *clock = &c->bucket_clock[rw]; -+ struct bucket_array *buckets; -+ struct bch_dev *ca; -+ 
struct bucket *g; -+ unsigned i; -+ -+ trace_rescale_prios(c); -+ -+ for_each_member_device(ca, c, i) { -+ down_read(&ca->bucket_lock); -+ buckets = bucket_array(ca); -+ -+ for_each_bucket(g, buckets) -+ g->io_time[rw] = clock->hand - -+ bucket_last_io(c, g, rw) / 2; -+ -+ bch2_recalc_oldest_io(c, ca, rw); -+ -+ up_read(&ca->bucket_lock); -+ } -+} -+ -+static inline u64 bucket_clock_freq(u64 capacity) -+{ -+ return max(capacity >> 10, 2028ULL); -+} -+ -+static void bch2_inc_clock_hand(struct io_timer *timer) -+{ -+ struct bucket_clock *clock = container_of(timer, -+ struct bucket_clock, rescale); -+ struct bch_fs *c = container_of(clock, -+ struct bch_fs, bucket_clock[clock->rw]); -+ struct bch_dev *ca; -+ u64 capacity; -+ unsigned i; -+ -+ mutex_lock(&clock->lock); -+ -+ /* if clock cannot be advanced more, rescale prio */ -+ if (clock->max_last_io >= U16_MAX - 2) -+ bch2_rescale_bucket_io_times(c, clock->rw); -+ -+ BUG_ON(clock->max_last_io >= U16_MAX - 2); -+ -+ for_each_member_device(ca, c, i) -+ ca->max_last_bucket_io[clock->rw]++; -+ clock->max_last_io++; -+ clock->hand++; -+ -+ mutex_unlock(&clock->lock); -+ -+ capacity = READ_ONCE(c->capacity); -+ -+ if (!capacity) -+ return; -+ -+ /* -+ * we only increment when 0.1% of the filesystem capacity has been read -+ * or written too, this determines if it's time -+ * -+ * XXX: we shouldn't really be going off of the capacity of devices in -+ * RW mode (that will be 0 when we're RO, yet we can still service -+ * reads) -+ */ -+ timer->expire += bucket_clock_freq(capacity); -+ -+ bch2_io_timer_add(&c->io_clock[clock->rw], timer); -+} -+ -+static void bch2_bucket_clock_init(struct bch_fs *c, int rw) -+{ -+ struct bucket_clock *clock = &c->bucket_clock[rw]; -+ -+ clock->hand = 1; -+ clock->rw = rw; -+ clock->rescale.fn = bch2_inc_clock_hand; -+ clock->rescale.expire = bucket_clock_freq(c->capacity); -+ mutex_init(&clock->lock); -+} -+ -+int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, -+ size_t 
bucket_nr, int rw) -+{ -+ struct bch_fs *c = trans->c; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, dev); -+ struct btree_iter *iter; -+ struct bucket *g; -+ struct bkey_i_alloc *a; -+ struct bkey_alloc_unpacked u; -+ u16 *time; -+ int ret = 0; -+ -+ iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, POS(dev, bucket_nr), -+ BTREE_ITER_CACHED| -+ BTREE_ITER_CACHED_NOFILL| -+ BTREE_ITER_INTENT); -+ if (IS_ERR(iter)) -+ return PTR_ERR(iter); -+ -+ a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); -+ ret = PTR_ERR_OR_ZERO(a); -+ if (ret) -+ goto out; -+ -+ percpu_down_read(&c->mark_lock); -+ g = bucket(ca, bucket_nr); -+ u = alloc_mem_to_key(g, READ_ONCE(g->mark)); -+ percpu_up_read(&c->mark_lock); -+ -+ bkey_alloc_init(&a->k_i); -+ a->k.p = iter->pos; -+ -+ time = rw == READ ? &u.read_time : &u.write_time; -+ if (*time == c->bucket_clock[rw].hand) -+ goto out; -+ -+ *time = c->bucket_clock[rw].hand; -+ -+ bch2_alloc_pack(a, u); -+ -+ ret = bch2_trans_update(trans, iter, &a->k_i, 0) ?: -+ bch2_trans_commit(trans, NULL, NULL, 0); -+out: -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+/* Background allocator thread: */ -+ -+/* -+ * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens -+ * (marking them as invalidated on disk), then optionally issues discard -+ * commands to the newly free buckets, then puts them on the various freelists. -+ */ -+ -+/** -+ * wait_buckets_available - wait on reclaimable buckets -+ * -+ * If there aren't enough available buckets to fill up free_inc, wait until -+ * there are. 
-+ */ -+static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) -+{ -+ unsigned long gc_count = c->gc_count; -+ u64 available; -+ int ret = 0; -+ -+ ca->allocator_state = ALLOCATOR_BLOCKED; -+ closure_wake_up(&c->freelist_wait); -+ -+ while (1) { -+ set_current_state(TASK_INTERRUPTIBLE); -+ if (kthread_should_stop()) { -+ ret = 1; -+ break; -+ } -+ -+ if (gc_count != c->gc_count) -+ ca->inc_gen_really_needs_gc = 0; -+ -+ available = max_t(s64, 0, dev_buckets_available(ca) - -+ ca->inc_gen_really_needs_gc); -+ -+ if (available > fifo_free(&ca->free_inc) || -+ (available && -+ (!fifo_full(&ca->free[RESERVE_BTREE]) || -+ !fifo_full(&ca->free[RESERVE_MOVINGGC])))) -+ break; -+ -+ up_read(&c->gc_lock); -+ schedule(); -+ try_to_freeze(); -+ down_read(&c->gc_lock); -+ } -+ -+ __set_current_state(TASK_RUNNING); -+ ca->allocator_state = ALLOCATOR_RUNNING; -+ closure_wake_up(&c->freelist_wait); -+ -+ return ret; -+} -+ -+static bool bch2_can_invalidate_bucket(struct bch_dev *ca, -+ size_t bucket, -+ struct bucket_mark mark) -+{ -+ u8 gc_gen; -+ -+ if (!is_available_bucket(mark)) -+ return false; -+ -+ if (ca->buckets_nouse && -+ test_bit(bucket, ca->buckets_nouse)) -+ return false; -+ -+ gc_gen = bucket_gc_gen(ca, bucket); -+ -+ if (gc_gen >= BUCKET_GC_GEN_MAX / 2) -+ ca->inc_gen_needs_gc++; -+ -+ if (gc_gen >= BUCKET_GC_GEN_MAX) -+ ca->inc_gen_really_needs_gc++; -+ -+ return gc_gen < BUCKET_GC_GEN_MAX; -+} -+ -+/* -+ * Determines what order we're going to reuse buckets, smallest bucket_key() -+ * first. -+ * -+ * -+ * - We take into account the read prio of the bucket, which gives us an -+ * indication of how hot the data is -- we scale the prio so that the prio -+ * farthest from the clock is worth 1/8th of the closest. -+ * -+ * - The number of sectors of cached data in the bucket, which gives us an -+ * indication of the cost in cache misses this eviction will cause. 
-+ * -+ * - If hotness * sectors used compares equal, we pick the bucket with the -+ * smallest bucket_gc_gen() - since incrementing the same bucket's generation -+ * number repeatedly forces us to run mark and sweep gc to avoid generation -+ * number wraparound. -+ */ -+ -+static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca, -+ size_t b, struct bucket_mark m) -+{ -+ unsigned last_io = bucket_last_io(c, bucket(ca, b), READ); -+ unsigned max_last_io = ca->max_last_bucket_io[READ]; -+ -+ /* -+ * Time since last read, scaled to [0, 8) where larger value indicates -+ * more recently read data: -+ */ -+ unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io; -+ -+ /* How much we want to keep the data in this bucket: */ -+ unsigned long data_wantness = -+ (hotness + 1) * bucket_sectors_used(m); -+ -+ unsigned long needs_journal_commit = -+ bucket_needs_journal_commit(m, c->journal.last_seq_ondisk); -+ -+ return (data_wantness << 9) | -+ (needs_journal_commit << 8) | -+ (bucket_gc_gen(ca, b) / 16); -+} -+ -+static inline int bucket_alloc_cmp(alloc_heap *h, -+ struct alloc_heap_entry l, -+ struct alloc_heap_entry r) -+{ -+ return cmp_int(l.key, r.key) ?: -+ cmp_int(r.nr, l.nr) ?: -+ cmp_int(l.bucket, r.bucket); -+} -+ -+static inline int bucket_idx_cmp(const void *_l, const void *_r) -+{ -+ const struct alloc_heap_entry *l = _l, *r = _r; -+ -+ return cmp_int(l->bucket, r->bucket); -+} -+ -+static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct bucket_array *buckets; -+ struct alloc_heap_entry e = { 0 }; -+ size_t b, i, nr = 0; -+ -+ ca->alloc_heap.used = 0; -+ -+ mutex_lock(&c->bucket_clock[READ].lock); -+ down_read(&ca->bucket_lock); -+ -+ buckets = bucket_array(ca); -+ -+ bch2_recalc_oldest_io(c, ca, READ); -+ -+ /* -+ * Find buckets with lowest read priority, by building a maxheap sorted -+ * by read priority and repeatedly replacing the maximum element until -+ * all buckets have been visited. 
-+ */ -+ for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) { -+ struct bucket_mark m = READ_ONCE(buckets->b[b].mark); -+ unsigned long key = bucket_sort_key(c, ca, b, m); -+ -+ if (!bch2_can_invalidate_bucket(ca, b, m)) -+ continue; -+ -+ if (e.nr && e.bucket + e.nr == b && e.key == key) { -+ e.nr++; -+ } else { -+ if (e.nr) -+ heap_add_or_replace(&ca->alloc_heap, e, -+ -bucket_alloc_cmp, NULL); -+ -+ e = (struct alloc_heap_entry) { -+ .bucket = b, -+ .nr = 1, -+ .key = key, -+ }; -+ } -+ -+ cond_resched(); -+ } -+ -+ if (e.nr) -+ heap_add_or_replace(&ca->alloc_heap, e, -+ -bucket_alloc_cmp, NULL); -+ -+ for (i = 0; i < ca->alloc_heap.used; i++) -+ nr += ca->alloc_heap.data[i].nr; -+ -+ while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) { -+ nr -= ca->alloc_heap.data[0].nr; -+ heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp, NULL); -+ } -+ -+ up_read(&ca->bucket_lock); -+ mutex_unlock(&c->bucket_clock[READ].lock); -+} -+ -+static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct bucket_array *buckets = bucket_array(ca); -+ struct bucket_mark m; -+ size_t b, start; -+ -+ if (ca->fifo_last_bucket < ca->mi.first_bucket || -+ ca->fifo_last_bucket >= ca->mi.nbuckets) -+ ca->fifo_last_bucket = ca->mi.first_bucket; -+ -+ start = ca->fifo_last_bucket; -+ -+ do { -+ ca->fifo_last_bucket++; -+ if (ca->fifo_last_bucket == ca->mi.nbuckets) -+ ca->fifo_last_bucket = ca->mi.first_bucket; -+ -+ b = ca->fifo_last_bucket; -+ m = READ_ONCE(buckets->b[b].mark); -+ -+ if (bch2_can_invalidate_bucket(ca, b, m)) { -+ struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; -+ -+ heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); -+ if (heap_full(&ca->alloc_heap)) -+ break; -+ } -+ -+ cond_resched(); -+ } while (ca->fifo_last_bucket != start); -+} -+ -+static void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct bucket_array *buckets = bucket_array(ca); -+ struct bucket_mark m; -+ size_t checked, i; 
-+ -+ for (checked = 0; -+ checked < ca->mi.nbuckets / 2; -+ checked++) { -+ size_t b = bch2_rand_range(ca->mi.nbuckets - -+ ca->mi.first_bucket) + -+ ca->mi.first_bucket; -+ -+ m = READ_ONCE(buckets->b[b].mark); -+ -+ if (bch2_can_invalidate_bucket(ca, b, m)) { -+ struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; -+ -+ heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); -+ if (heap_full(&ca->alloc_heap)) -+ break; -+ } -+ -+ cond_resched(); -+ } -+ -+ sort(ca->alloc_heap.data, -+ ca->alloc_heap.used, -+ sizeof(ca->alloc_heap.data[0]), -+ bucket_idx_cmp, NULL); -+ -+ /* remove duplicates: */ -+ for (i = 0; i + 1 < ca->alloc_heap.used; i++) -+ if (ca->alloc_heap.data[i].bucket == -+ ca->alloc_heap.data[i + 1].bucket) -+ ca->alloc_heap.data[i].nr = 0; -+} -+ -+static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) -+{ -+ size_t i, nr = 0; -+ -+ ca->inc_gen_needs_gc = 0; -+ -+ switch (ca->mi.replacement) { -+ case CACHE_REPLACEMENT_LRU: -+ find_reclaimable_buckets_lru(c, ca); -+ break; -+ case CACHE_REPLACEMENT_FIFO: -+ find_reclaimable_buckets_fifo(c, ca); -+ break; -+ case CACHE_REPLACEMENT_RANDOM: -+ find_reclaimable_buckets_random(c, ca); -+ break; -+ } -+ -+ heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL); -+ -+ for (i = 0; i < ca->alloc_heap.used; i++) -+ nr += ca->alloc_heap.data[i].nr; -+ -+ return nr; -+} -+ -+static inline long next_alloc_bucket(struct bch_dev *ca) -+{ -+ struct alloc_heap_entry e, *top = ca->alloc_heap.data; -+ -+ while (ca->alloc_heap.used) { -+ if (top->nr) { -+ size_t b = top->bucket; -+ -+ top->bucket++; -+ top->nr--; -+ return b; -+ } -+ -+ heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); -+ } -+ -+ return -1; -+} -+ -+/* -+ * returns sequence number of most recent journal entry that updated this -+ * bucket: -+ */ -+static u64 bucket_journal_seq(struct bch_fs *c, struct bucket_mark m) -+{ -+ if (m.journal_seq_valid) { -+ u64 journal_seq = atomic64_read(&c->journal.seq); -+ u64 bucket_seq = 
journal_seq; -+ -+ bucket_seq &= ~((u64) U16_MAX); -+ bucket_seq |= m.journal_seq; -+ -+ if (bucket_seq > journal_seq) -+ bucket_seq -= 1 << 16; -+ -+ return bucket_seq; -+ } else { -+ return 0; -+ } -+} -+ -+static int bch2_invalidate_one_bucket2(struct btree_trans *trans, -+ struct bch_dev *ca, -+ struct btree_iter *iter, -+ u64 *journal_seq, unsigned flags) -+{ -+#if 0 -+ __BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key; -+#else -+ /* hack: */ -+ __BKEY_PADDED(k, 8) alloc_key; -+#endif -+ struct bch_fs *c = trans->c; -+ struct bkey_i_alloc *a; -+ struct bkey_alloc_unpacked u; -+ struct bucket *g; -+ struct bucket_mark m; -+ bool invalidating_cached_data; -+ size_t b; -+ int ret = 0; -+ -+ BUG_ON(!ca->alloc_heap.used || -+ !ca->alloc_heap.data[0].nr); -+ b = ca->alloc_heap.data[0].bucket; -+ -+ /* first, put on free_inc and mark as owned by allocator: */ -+ percpu_down_read(&c->mark_lock); -+ spin_lock(&c->freelist_lock); -+ -+ verify_not_on_freelist(c, ca, b); -+ -+ BUG_ON(!fifo_push(&ca->free_inc, b)); -+ -+ g = bucket(ca, b); -+ m = READ_ONCE(g->mark); -+ -+ invalidating_cached_data = m.cached_sectors != 0; -+ -+ /* -+ * If we're not invalidating cached data, we only increment the bucket -+ * gen in memory here, the incremented gen will be updated in the btree -+ * by bch2_trans_mark_pointer(): -+ */ -+ -+ if (!invalidating_cached_data) -+ bch2_invalidate_bucket(c, ca, b, &m); -+ else -+ bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0); -+ -+ spin_unlock(&c->freelist_lock); -+ percpu_up_read(&c->mark_lock); -+ -+ if (!invalidating_cached_data) -+ goto out; -+ -+ /* -+ * If the read-only path is trying to shut down, we can't be generating -+ * new btree updates: -+ */ -+ if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags)) { -+ ret = 1; -+ goto out; -+ } -+ -+ BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); -+ -+ bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b)); -+retry: -+ ret = bch2_btree_iter_traverse(iter); -+ if (ret) -+ return ret; -+ -+ 
percpu_down_read(&c->mark_lock); -+ g = bucket(ca, iter->pos.offset); -+ m = READ_ONCE(g->mark); -+ u = alloc_mem_to_key(g, m); -+ -+ percpu_up_read(&c->mark_lock); -+ -+ invalidating_cached_data = u.cached_sectors != 0; -+ -+ u.gen++; -+ u.data_type = 0; -+ u.dirty_sectors = 0; -+ u.cached_sectors = 0; -+ u.read_time = c->bucket_clock[READ].hand; -+ u.write_time = c->bucket_clock[WRITE].hand; -+ -+ a = bkey_alloc_init(&alloc_key.k); -+ a->k.p = iter->pos; -+ bch2_alloc_pack(a, u); -+ -+ bch2_trans_update(trans, iter, &a->k_i, -+ BTREE_TRIGGER_BUCKET_INVALIDATE); -+ -+ /* -+ * XXX: -+ * when using deferred btree updates, we have journal reclaim doing -+ * btree updates and thus requiring the allocator to make forward -+ * progress, and here the allocator is requiring space in the journal - -+ * so we need a journal pre-reservation: -+ */ -+ ret = bch2_trans_commit(trans, NULL, -+ invalidating_cached_data ? journal_seq : NULL, -+ BTREE_INSERT_NOUNLOCK| -+ BTREE_INSERT_NOCHECK_RW| -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_USE_RESERVE| -+ BTREE_INSERT_USE_ALLOC_RESERVE| -+ flags); -+ if (ret == -EINTR) -+ goto retry; -+out: -+ if (!ret) { -+ /* remove from alloc_heap: */ -+ struct alloc_heap_entry e, *top = ca->alloc_heap.data; -+ -+ top->bucket++; -+ top->nr--; -+ -+ if (!top->nr) -+ heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); -+ -+ /* -+ * Make sure we flush the last journal entry that updated this -+ * bucket (i.e. deleting the last reference) before writing to -+ * this bucket again: -+ */ -+ *journal_seq = max(*journal_seq, bucket_journal_seq(c, m)); -+ } else { -+ size_t b2; -+ -+ /* remove from free_inc: */ -+ percpu_down_read(&c->mark_lock); -+ spin_lock(&c->freelist_lock); -+ -+ bch2_mark_alloc_bucket(c, ca, b, false, -+ gc_pos_alloc(c, NULL), 0); -+ -+ BUG_ON(!fifo_pop_back(&ca->free_inc, b2)); -+ BUG_ON(b != b2); -+ -+ spin_unlock(&c->freelist_lock); -+ percpu_up_read(&c->mark_lock); -+ } -+ -+ return ret < 0 ? 
ret : 0; -+} -+ -+/* -+ * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc: -+ */ -+static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ u64 journal_seq = 0; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, -+ POS(ca->dev_idx, 0), -+ BTREE_ITER_CACHED| -+ BTREE_ITER_CACHED_NOFILL| -+ BTREE_ITER_INTENT); -+ -+ /* Only use nowait if we've already invalidated at least one bucket: */ -+ while (!ret && -+ !fifo_full(&ca->free_inc) && -+ ca->alloc_heap.used) -+ ret = bch2_invalidate_one_bucket2(&trans, ca, iter, &journal_seq, -+ BTREE_INSERT_GC_LOCK_HELD| -+ (!fifo_empty(&ca->free_inc) -+ ? BTREE_INSERT_NOWAIT : 0)); -+ -+ bch2_trans_exit(&trans); -+ -+ /* If we used NOWAIT, don't return the error: */ -+ if (!fifo_empty(&ca->free_inc)) -+ ret = 0; -+ if (ret) { -+ bch_err(ca, "error invalidating buckets: %i", ret); -+ return ret; -+ } -+ -+ if (journal_seq) -+ ret = bch2_journal_flush_seq(&c->journal, journal_seq); -+ if (ret) { -+ bch_err(ca, "journal error: %i", ret); -+ return ret; -+ } -+ -+ return 0; -+} -+ -+static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket) -+{ -+ unsigned i; -+ int ret = 0; -+ -+ while (1) { -+ set_current_state(TASK_INTERRUPTIBLE); -+ -+ spin_lock(&c->freelist_lock); -+ for (i = 0; i < RESERVE_NR; i++) { -+ -+ /* -+ * Don't strand buckets on the copygc freelist until -+ * after recovery is finished: -+ */ -+ if (!test_bit(BCH_FS_STARTED, &c->flags) && -+ i == RESERVE_MOVINGGC) -+ continue; -+ -+ if (fifo_push(&ca->free[i], bucket)) { -+ fifo_pop(&ca->free_inc, bucket); -+ -+ closure_wake_up(&c->freelist_wait); -+ ca->allocator_state = ALLOCATOR_RUNNING; -+ -+ spin_unlock(&c->freelist_lock); -+ goto out; -+ } -+ } -+ -+ if (ca->allocator_state != ALLOCATOR_BLOCKED_FULL) { -+ ca->allocator_state = ALLOCATOR_BLOCKED_FULL; -+ 
closure_wake_up(&c->freelist_wait); -+ } -+ -+ spin_unlock(&c->freelist_lock); -+ -+ if ((current->flags & PF_KTHREAD) && -+ kthread_should_stop()) { -+ ret = 1; -+ break; -+ } -+ -+ schedule(); -+ try_to_freeze(); -+ } -+out: -+ __set_current_state(TASK_RUNNING); -+ return ret; -+} -+ -+/* -+ * Pulls buckets off free_inc, discards them (if enabled), then adds them to -+ * freelists, waiting until there's room if necessary: -+ */ -+static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca) -+{ -+ while (!fifo_empty(&ca->free_inc)) { -+ size_t bucket = fifo_peek(&ca->free_inc); -+ -+ if (ca->mi.discard && -+ blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) -+ blkdev_issue_discard(ca->disk_sb.bdev, -+ bucket_to_sector(ca, bucket), -+ ca->mi.bucket_size, GFP_NOIO, 0); -+ -+ if (push_invalidated_bucket(c, ca, bucket)) -+ return 1; -+ } -+ -+ return 0; -+} -+ -+/** -+ * bch_allocator_thread - move buckets from free_inc to reserves -+ * -+ * The free_inc FIFO is populated by find_reclaimable_buckets(), and -+ * the reserves are depleted by bucket allocation. When we run out -+ * of free_inc, try to invalidate some buckets and write out -+ * prios and gens. 
-+ */ -+static int bch2_allocator_thread(void *arg) -+{ -+ struct bch_dev *ca = arg; -+ struct bch_fs *c = ca->fs; -+ size_t nr; -+ int ret; -+ -+ set_freezable(); -+ ca->allocator_state = ALLOCATOR_RUNNING; -+ -+ while (1) { -+ cond_resched(); -+ if (kthread_should_stop()) -+ break; -+ -+ pr_debug("discarding %zu invalidated buckets", -+ fifo_used(&ca->free_inc)); -+ -+ ret = discard_invalidated_buckets(c, ca); -+ if (ret) -+ goto stop; -+ -+ down_read(&c->gc_lock); -+ -+ ret = bch2_invalidate_buckets(c, ca); -+ if (ret) { -+ up_read(&c->gc_lock); -+ goto stop; -+ } -+ -+ if (!fifo_empty(&ca->free_inc)) { -+ up_read(&c->gc_lock); -+ continue; -+ } -+ -+ pr_debug("free_inc now empty"); -+ -+ do { -+ /* -+ * Find some buckets that we can invalidate, either -+ * they're completely unused, or only contain clean data -+ * that's been written back to the backing device or -+ * another cache tier -+ */ -+ -+ pr_debug("scanning for reclaimable buckets"); -+ -+ nr = find_reclaimable_buckets(c, ca); -+ -+ pr_debug("found %zu buckets", nr); -+ -+ trace_alloc_batch(ca, nr, ca->alloc_heap.size); -+ -+ if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) || -+ ca->inc_gen_really_needs_gc) && -+ c->gc_thread) { -+ atomic_inc(&c->kick_gc); -+ wake_up_process(c->gc_thread); -+ } -+ -+ /* -+ * If we found any buckets, we have to invalidate them -+ * before we scan for more - but if we didn't find very -+ * many we may want to wait on more buckets being -+ * available so we don't spin: -+ */ -+ if (!nr || -+ (nr < ALLOC_SCAN_BATCH(ca) && -+ !fifo_empty(&ca->free[RESERVE_NONE]))) { -+ ret = wait_buckets_available(c, ca); -+ if (ret) { -+ up_read(&c->gc_lock); -+ goto stop; -+ } -+ } -+ } while (!nr); -+ -+ up_read(&c->gc_lock); -+ -+ pr_debug("%zu buckets to invalidate", nr); -+ -+ /* -+ * alloc_heap is now full of newly-invalidated buckets: next, -+ * write out the new bucket gens: -+ */ -+ } -+ -+stop: -+ pr_debug("alloc thread stopping (ret %i)", ret); -+ ca->allocator_state = 
ALLOCATOR_STOPPED; -+ closure_wake_up(&c->freelist_wait); -+ return 0; -+} -+ -+/* Startup/shutdown (ro/rw): */ -+ -+void bch2_recalc_capacity(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ u64 capacity = 0, reserved_sectors = 0, gc_reserve, copygc_threshold = 0; -+ unsigned bucket_size_max = 0; -+ unsigned long ra_pages = 0; -+ unsigned i, j; -+ -+ lockdep_assert_held(&c->state_lock); -+ -+ for_each_online_member(ca, c, i) { -+ struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_bdi; -+ -+ ra_pages += bdi->ra_pages; -+ } -+ -+ bch2_set_ra_pages(c, ra_pages); -+ -+ for_each_rw_member(ca, c, i) { -+ u64 dev_reserve = 0; -+ -+ /* -+ * We need to reserve buckets (from the number -+ * of currently available buckets) against -+ * foreground writes so that mainly copygc can -+ * make forward progress. -+ * -+ * We need enough to refill the various reserves -+ * from scratch - copygc will use its entire -+ * reserve all at once, then run against when -+ * its reserve is refilled (from the formerly -+ * available buckets). -+ * -+ * This reserve is just used when considering if -+ * allocations for foreground writes must wait - -+ * not -ENOSPC calculations. -+ */ -+ for (j = 0; j < RESERVE_NONE; j++) -+ dev_reserve += ca->free[j].size; -+ -+ dev_reserve += 1; /* btree write point */ -+ dev_reserve += 1; /* copygc write point */ -+ dev_reserve += 1; /* rebalance write point */ -+ -+ dev_reserve *= ca->mi.bucket_size; -+ -+ copygc_threshold += dev_reserve; -+ -+ capacity += bucket_to_sector(ca, ca->mi.nbuckets - -+ ca->mi.first_bucket); -+ -+ reserved_sectors += dev_reserve * 2; -+ -+ bucket_size_max = max_t(unsigned, bucket_size_max, -+ ca->mi.bucket_size); -+ } -+ -+ gc_reserve = c->opts.gc_reserve_bytes -+ ? 
c->opts.gc_reserve_bytes >> 9 -+ : div64_u64(capacity * c->opts.gc_reserve_percent, 100); -+ -+ reserved_sectors = max(gc_reserve, reserved_sectors); -+ -+ reserved_sectors = min(reserved_sectors, capacity); -+ -+ c->copygc_threshold = copygc_threshold; -+ c->capacity = capacity - reserved_sectors; -+ -+ c->bucket_size_max = bucket_size_max; -+ -+ /* Wake up case someone was waiting for buckets */ -+ closure_wake_up(&c->freelist_wait); -+} -+ -+static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct open_bucket *ob; -+ bool ret = false; -+ -+ for (ob = c->open_buckets; -+ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); -+ ob++) { -+ spin_lock(&ob->lock); -+ if (ob->valid && !ob->on_partial_list && -+ ob->ptr.dev == ca->dev_idx) -+ ret = true; -+ spin_unlock(&ob->lock); -+ } -+ -+ return ret; -+} -+ -+/* device goes ro: */ -+void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) -+{ -+ unsigned i; -+ -+ BUG_ON(ca->alloc_thread); -+ -+ /* First, remove device from allocation groups: */ -+ -+ for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) -+ clear_bit(ca->dev_idx, c->rw_devs[i].d); -+ -+ /* -+ * Capacity is calculated based off of devices in allocation groups: -+ */ -+ bch2_recalc_capacity(c); -+ -+ /* Next, close write points that point to this device... 
*/ -+ for (i = 0; i < ARRAY_SIZE(c->write_points); i++) -+ bch2_writepoint_stop(c, ca, &c->write_points[i]); -+ -+ bch2_writepoint_stop(c, ca, &c->copygc_write_point); -+ bch2_writepoint_stop(c, ca, &c->rebalance_write_point); -+ bch2_writepoint_stop(c, ca, &c->btree_write_point); -+ -+ mutex_lock(&c->btree_reserve_cache_lock); -+ while (c->btree_reserve_cache_nr) { -+ struct btree_alloc *a = -+ &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; -+ -+ bch2_open_buckets_put(c, &a->ob); -+ } -+ mutex_unlock(&c->btree_reserve_cache_lock); -+ -+ while (1) { -+ struct open_bucket *ob; -+ -+ spin_lock(&c->freelist_lock); -+ if (!ca->open_buckets_partial_nr) { -+ spin_unlock(&c->freelist_lock); -+ break; -+ } -+ ob = c->open_buckets + -+ ca->open_buckets_partial[--ca->open_buckets_partial_nr]; -+ ob->on_partial_list = false; -+ spin_unlock(&c->freelist_lock); -+ -+ bch2_open_bucket_put(c, ob); -+ } -+ -+ bch2_ec_stop_dev(c, ca); -+ -+ /* -+ * Wake up threads that were blocked on allocation, so they can notice -+ * the device can no longer be removed and the capacity has changed: -+ */ -+ closure_wake_up(&c->freelist_wait); -+ -+ /* -+ * journal_res_get() can block waiting for free space in the journal - -+ * it needs to notice there may not be devices to allocate from anymore: -+ */ -+ wake_up(&c->journal.wait); -+ -+ /* Now wait for any in flight writes: */ -+ -+ closure_wait_event(&c->open_buckets_wait, -+ !bch2_dev_has_open_write_point(c, ca)); -+} -+ -+/* device goes rw: */ -+void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) -+{ -+ unsigned i; -+ -+ for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) -+ if (ca->mi.data_allowed & (1 << i)) -+ set_bit(ca->dev_idx, c->rw_devs[i].d); -+} -+ -+void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca) -+{ -+ if (ca->alloc_thread) -+ closure_wait_event(&c->freelist_wait, -+ ca->allocator_state != ALLOCATOR_RUNNING); -+} -+ -+/* stop allocator thread: */ -+void bch2_dev_allocator_stop(struct bch_dev 
*ca) -+{ -+ struct task_struct *p; -+ -+ p = rcu_dereference_protected(ca->alloc_thread, 1); -+ ca->alloc_thread = NULL; -+ -+ /* -+ * We need an rcu barrier between setting ca->alloc_thread = NULL and -+ * the thread shutting down to avoid bch2_wake_allocator() racing: -+ * -+ * XXX: it would be better to have the rcu barrier be asynchronous -+ * instead of blocking us here -+ */ -+ synchronize_rcu(); -+ -+ if (p) { -+ kthread_stop(p); -+ put_task_struct(p); -+ } -+} -+ -+/* start allocator thread: */ -+int bch2_dev_allocator_start(struct bch_dev *ca) -+{ -+ struct task_struct *p; -+ -+ /* -+ * allocator thread already started? -+ */ -+ if (ca->alloc_thread) -+ return 0; -+ -+ p = kthread_create(bch2_allocator_thread, ca, -+ "bch_alloc[%s]", ca->name); -+ if (IS_ERR(p)) -+ return PTR_ERR(p); -+ -+ get_task_struct(p); -+ rcu_assign_pointer(ca->alloc_thread, p); -+ wake_up_process(p); -+ return 0; -+} -+ -+void bch2_fs_allocator_background_init(struct bch_fs *c) -+{ -+ spin_lock_init(&c->freelist_lock); -+ bch2_bucket_clock_init(c, READ); -+ bch2_bucket_clock_init(c, WRITE); -+ -+ c->pd_controllers_update_seconds = 5; -+ INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update); -+} -diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h -new file mode 100644 -index 000000000000..cbaff56f7473 ---- /dev/null -+++ b/fs/bcachefs/alloc_background.h -@@ -0,0 +1,105 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_ALLOC_BACKGROUND_H -+#define _BCACHEFS_ALLOC_BACKGROUND_H -+ -+#include "bcachefs.h" -+#include "alloc_types.h" -+#include "debug.h" -+ -+struct bkey_alloc_unpacked { -+ u8 gen; -+#define x(_name, _bits) u##_bits _name; -+ BCH_ALLOC_FIELDS() -+#undef x -+}; -+ -+/* How out of date a pointer gen is allowed to be: */ -+#define BUCKET_GC_GEN_MAX 96U -+ -+/* returns true if not equal */ -+static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l, -+ struct bkey_alloc_unpacked r) -+{ -+ return l.gen != 
r.gen -+#define x(_name, _bits) || l._name != r._name -+ BCH_ALLOC_FIELDS() -+#undef x -+ ; -+} -+ -+struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c); -+void bch2_alloc_pack(struct bkey_i_alloc *, -+ const struct bkey_alloc_unpacked); -+ -+int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); -+ -+static inline struct bkey_alloc_unpacked -+alloc_mem_to_key(struct bucket *g, struct bucket_mark m) -+{ -+ return (struct bkey_alloc_unpacked) { -+ .gen = m.gen, -+ .oldest_gen = g->oldest_gen, -+ .data_type = m.data_type, -+ .dirty_sectors = m.dirty_sectors, -+ .cached_sectors = m.cached_sectors, -+ .read_time = g->io_time[READ], -+ .write_time = g->io_time[WRITE], -+ }; -+} -+ -+#define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) -+ -+const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c); -+void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+ -+#define bch2_bkey_ops_alloc (struct bkey_ops) { \ -+ .key_invalid = bch2_alloc_invalid, \ -+ .val_to_text = bch2_alloc_to_text, \ -+} -+ -+struct journal_keys; -+int bch2_alloc_read(struct bch_fs *, struct journal_keys *); -+ -+static inline void bch2_wake_allocator(struct bch_dev *ca) -+{ -+ struct task_struct *p; -+ -+ rcu_read_lock(); -+ p = rcu_dereference(ca->alloc_thread); -+ if (p) { -+ wake_up_process(p); -+ ca->allocator_state = ALLOCATOR_RUNNING; -+ } -+ rcu_read_unlock(); -+} -+ -+static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca, -+ size_t bucket) -+{ -+ if (expensive_debug_checks(c)) { -+ size_t iter; -+ long i; -+ unsigned j; -+ -+ for (j = 0; j < RESERVE_NR; j++) -+ fifo_for_each_entry(i, &ca->free[j], iter) -+ BUG_ON(i == bucket); -+ fifo_for_each_entry(i, &ca->free_inc, iter) -+ BUG_ON(i == bucket); -+ } -+} -+ -+void bch2_recalc_capacity(struct bch_fs *); -+ -+void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); -+void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev 
*); -+ -+void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *); -+void bch2_dev_allocator_stop(struct bch_dev *); -+int bch2_dev_allocator_start(struct bch_dev *); -+ -+int bch2_dev_alloc_write(struct bch_fs *, struct bch_dev *, unsigned); -+int bch2_alloc_write(struct bch_fs *, unsigned); -+void bch2_fs_allocator_background_init(struct bch_fs *); -+ -+#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ -diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c -new file mode 100644 -index 000000000000..7a92e3d53254 ---- /dev/null -+++ b/fs/bcachefs/alloc_foreground.c -@@ -0,0 +1,990 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Primary bucket allocation code -+ * -+ * Copyright 2012 Google, Inc. -+ * -+ * Allocation in bcache is done in terms of buckets: -+ * -+ * Each bucket has associated an 8 bit gen; this gen corresponds to the gen in -+ * btree pointers - they must match for the pointer to be considered valid. -+ * -+ * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a -+ * bucket simply by incrementing its gen. -+ * -+ * The gens (along with the priorities; it's really the gens are important but -+ * the code is named as if it's the priorities) are written in an arbitrary list -+ * of buckets on disk, with a pointer to them in the journal header. -+ * -+ * When we invalidate a bucket, we have to write its new gen to disk and wait -+ * for that write to complete before we use it - otherwise after a crash we -+ * could have pointers that appeared to be good but pointed to data that had -+ * been overwritten. -+ * -+ * Since the gens and priorities are all stored contiguously on disk, we can -+ * batch this up: We fill up the free_inc list with freshly invalidated buckets, -+ * call prio_write(), and when prio_write() finishes we pull buckets off the -+ * free_inc list and optionally discard them. 
-+ * -+ * free_inc isn't the only freelist - if it was, we'd often have to sleep while -+ * priorities and gens were being written before we could allocate. c->free is a -+ * smaller freelist, and buckets on that list are always ready to be used. -+ * -+ * If we've got discards enabled, that happens when a bucket moves from the -+ * free_inc list to the free list. -+ * -+ * It's important to ensure that gens don't wrap around - with respect to -+ * either the oldest gen in the btree or the gen on disk. This is quite -+ * difficult to do in practice, but we explicitly guard against it anyways - if -+ * a bucket is in danger of wrapping around we simply skip invalidating it that -+ * time around, and we garbage collect or rewrite the priorities sooner than we -+ * would have otherwise. -+ * -+ * bch2_bucket_alloc() allocates a single bucket from a specific device. -+ * -+ * bch2_bucket_alloc_set() allocates one or more buckets from different devices -+ * in a given filesystem. -+ * -+ * invalidate_buckets() drives all the processes described above. It's called -+ * from bch2_bucket_alloc() and a few other places that need to make sure free -+ * buckets are ready. -+ * -+ * invalidate_buckets_(lru|fifo)() find buckets that are available to be -+ * invalidated, and then invalidate them and stick them on the free_inc list - -+ * in either lru or fifo order. -+ */ -+ -+#include "bcachefs.h" -+#include "alloc_background.h" -+#include "alloc_foreground.h" -+#include "btree_gc.h" -+#include "buckets.h" -+#include "clock.h" -+#include "debug.h" -+#include "disk_groups.h" -+#include "ec.h" -+#include "io.h" -+ -+#include -+#include -+#include -+#include -+ -+/* -+ * Open buckets represent a bucket that's currently being allocated from. 
They -+ * serve two purposes: -+ * -+ * - They track buckets that have been partially allocated, allowing for -+ * sub-bucket sized allocations - they're used by the sector allocator below -+ * -+ * - They provide a reference to the buckets they own that mark and sweep GC -+ * can find, until the new allocation has a pointer to it inserted into the -+ * btree -+ * -+ * When allocating some space with the sector allocator, the allocation comes -+ * with a reference to an open bucket - the caller is required to put that -+ * reference _after_ doing the index update that makes its allocation reachable. -+ */ -+ -+void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) -+{ -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); -+ -+ if (ob->ec) { -+ bch2_ec_bucket_written(c, ob); -+ return; -+ } -+ -+ percpu_down_read(&c->mark_lock); -+ spin_lock(&ob->lock); -+ -+ bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), -+ false, gc_pos_alloc(c, ob), 0); -+ ob->valid = false; -+ ob->type = 0; -+ -+ spin_unlock(&ob->lock); -+ percpu_up_read(&c->mark_lock); -+ -+ spin_lock(&c->freelist_lock); -+ ob->freelist = c->open_buckets_freelist; -+ c->open_buckets_freelist = ob - c->open_buckets; -+ c->open_buckets_nr_free++; -+ spin_unlock(&c->freelist_lock); -+ -+ closure_wake_up(&c->open_buckets_wait); -+} -+ -+void bch2_open_bucket_write_error(struct bch_fs *c, -+ struct open_buckets *obs, -+ unsigned dev) -+{ -+ struct open_bucket *ob; -+ unsigned i; -+ -+ open_bucket_for_each(c, obs, ob, i) -+ if (ob->ptr.dev == dev && -+ ob->ec) -+ bch2_ec_bucket_cancel(c, ob); -+} -+ -+static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) -+{ -+ struct open_bucket *ob; -+ -+ BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free); -+ -+ ob = c->open_buckets + c->open_buckets_freelist; -+ c->open_buckets_freelist = ob->freelist; -+ atomic_set(&ob->pin, 1); -+ ob->type = 0; -+ -+ c->open_buckets_nr_free--; -+ return ob; -+} -+ -+static void 
open_bucket_free_unused(struct bch_fs *c, -+ struct write_point *wp, -+ struct open_bucket *ob) -+{ -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); -+ bool may_realloc = wp->type == BCH_DATA_user; -+ -+ BUG_ON(ca->open_buckets_partial_nr > -+ ARRAY_SIZE(ca->open_buckets_partial)); -+ -+ if (ca->open_buckets_partial_nr < -+ ARRAY_SIZE(ca->open_buckets_partial) && -+ may_realloc) { -+ spin_lock(&c->freelist_lock); -+ ob->on_partial_list = true; -+ ca->open_buckets_partial[ca->open_buckets_partial_nr++] = -+ ob - c->open_buckets; -+ spin_unlock(&c->freelist_lock); -+ -+ closure_wake_up(&c->open_buckets_wait); -+ closure_wake_up(&c->freelist_wait); -+ } else { -+ bch2_open_bucket_put(c, ob); -+ } -+} -+ -+static void verify_not_stale(struct bch_fs *c, const struct open_buckets *obs) -+{ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ struct open_bucket *ob; -+ unsigned i; -+ -+ open_bucket_for_each(c, obs, ob, i) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); -+ -+ BUG_ON(ptr_stale(ca, &ob->ptr)); -+ } -+#endif -+} -+ -+/* _only_ for allocating the journal on a new device: */ -+long bch2_bucket_alloc_new_fs(struct bch_dev *ca) -+{ -+ struct bucket_array *buckets; -+ ssize_t b; -+ -+ rcu_read_lock(); -+ buckets = bucket_array(ca); -+ -+ for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) -+ if (is_available_bucket(buckets->b[b].mark)) -+ goto success; -+ b = -1; -+success: -+ rcu_read_unlock(); -+ return b; -+} -+ -+static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) -+{ -+ switch (reserve) { -+ case RESERVE_ALLOC: -+ return 0; -+ case RESERVE_BTREE: -+ return OPEN_BUCKETS_COUNT / 4; -+ default: -+ return OPEN_BUCKETS_COUNT / 2; -+ } -+} -+ -+/** -+ * bch_bucket_alloc - allocate a single bucket from a specific device -+ * -+ * Returns index of bucket on success, 0 on failure -+ * */ -+struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, -+ enum alloc_reserve reserve, -+ bool may_alloc_partial, -+ struct 
closure *cl) -+{ -+ struct bucket_array *buckets; -+ struct open_bucket *ob; -+ long bucket = 0; -+ -+ spin_lock(&c->freelist_lock); -+ -+ if (may_alloc_partial) { -+ int i; -+ -+ for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) { -+ ob = c->open_buckets + ca->open_buckets_partial[i]; -+ -+ if (reserve <= ob->alloc_reserve) { -+ array_remove_item(ca->open_buckets_partial, -+ ca->open_buckets_partial_nr, -+ i); -+ ob->on_partial_list = false; -+ ob->alloc_reserve = reserve; -+ spin_unlock(&c->freelist_lock); -+ return ob; -+ } -+ } -+ } -+ -+ if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) { -+ if (cl) -+ closure_wait(&c->open_buckets_wait, cl); -+ -+ if (!c->blocked_allocate_open_bucket) -+ c->blocked_allocate_open_bucket = local_clock(); -+ -+ spin_unlock(&c->freelist_lock); -+ trace_open_bucket_alloc_fail(ca, reserve); -+ return ERR_PTR(-OPEN_BUCKETS_EMPTY); -+ } -+ -+ if (likely(fifo_pop(&ca->free[RESERVE_NONE], bucket))) -+ goto out; -+ -+ switch (reserve) { -+ case RESERVE_ALLOC: -+ if (fifo_pop(&ca->free[RESERVE_BTREE], bucket)) -+ goto out; -+ break; -+ case RESERVE_BTREE: -+ if (fifo_used(&ca->free[RESERVE_BTREE]) * 2 >= -+ ca->free[RESERVE_BTREE].size && -+ fifo_pop(&ca->free[RESERVE_BTREE], bucket)) -+ goto out; -+ break; -+ case RESERVE_MOVINGGC: -+ if (fifo_pop(&ca->free[RESERVE_MOVINGGC], bucket)) -+ goto out; -+ break; -+ default: -+ break; -+ } -+ -+ if (cl) -+ closure_wait(&c->freelist_wait, cl); -+ -+ if (!c->blocked_allocate) -+ c->blocked_allocate = local_clock(); -+ -+ spin_unlock(&c->freelist_lock); -+ -+ trace_bucket_alloc_fail(ca, reserve); -+ return ERR_PTR(-FREELIST_EMPTY); -+out: -+ verify_not_on_freelist(c, ca, bucket); -+ -+ ob = bch2_open_bucket_alloc(c); -+ -+ spin_lock(&ob->lock); -+ buckets = bucket_array(ca); -+ -+ ob->valid = true; -+ ob->sectors_free = ca->mi.bucket_size; -+ ob->alloc_reserve = reserve; -+ ob->ptr = (struct bch_extent_ptr) { -+ .type = 1 << BCH_EXTENT_ENTRY_ptr, -+ .gen = 
buckets->b[bucket].mark.gen, -+ .offset = bucket_to_sector(ca, bucket), -+ .dev = ca->dev_idx, -+ }; -+ -+ spin_unlock(&ob->lock); -+ -+ if (c->blocked_allocate_open_bucket) { -+ bch2_time_stats_update( -+ &c->times[BCH_TIME_blocked_allocate_open_bucket], -+ c->blocked_allocate_open_bucket); -+ c->blocked_allocate_open_bucket = 0; -+ } -+ -+ if (c->blocked_allocate) { -+ bch2_time_stats_update( -+ &c->times[BCH_TIME_blocked_allocate], -+ c->blocked_allocate); -+ c->blocked_allocate = 0; -+ } -+ -+ spin_unlock(&c->freelist_lock); -+ -+ bch2_wake_allocator(ca); -+ -+ trace_bucket_alloc(ca, reserve); -+ return ob; -+} -+ -+static int __dev_stripe_cmp(struct dev_stripe_state *stripe, -+ unsigned l, unsigned r) -+{ -+ return ((stripe->next_alloc[l] > stripe->next_alloc[r]) - -+ (stripe->next_alloc[l] < stripe->next_alloc[r])); -+} -+ -+#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r) -+ -+struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, -+ struct dev_stripe_state *stripe, -+ struct bch_devs_mask *devs) -+{ -+ struct dev_alloc_list ret = { .nr = 0 }; -+ unsigned i; -+ -+ for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX) -+ ret.devs[ret.nr++] = i; -+ -+ bubble_sort(ret.devs, ret.nr, dev_stripe_cmp); -+ return ret; -+} -+ -+void bch2_dev_stripe_increment(struct bch_dev *ca, -+ struct dev_stripe_state *stripe) -+{ -+ u64 *v = stripe->next_alloc + ca->dev_idx; -+ u64 free_space = dev_buckets_free(ca); -+ u64 free_space_inv = free_space -+ ? div64_u64(1ULL << 48, free_space) -+ : 1ULL << 48; -+ u64 scale = *v / 4; -+ -+ if (*v + free_space_inv >= *v) -+ *v += free_space_inv; -+ else -+ *v = U64_MAX; -+ -+ for (v = stripe->next_alloc; -+ v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) -+ *v = *v < scale ? 
0 : *v - scale; -+} -+ -+#define BUCKET_MAY_ALLOC_PARTIAL (1 << 0) -+#define BUCKET_ALLOC_USE_DURABILITY (1 << 1) -+ -+static void add_new_bucket(struct bch_fs *c, -+ struct open_buckets *ptrs, -+ struct bch_devs_mask *devs_may_alloc, -+ unsigned *nr_effective, -+ bool *have_cache, -+ unsigned flags, -+ struct open_bucket *ob) -+{ -+ unsigned durability = -+ bch_dev_bkey_exists(c, ob->ptr.dev)->mi.durability; -+ -+ __clear_bit(ob->ptr.dev, devs_may_alloc->d); -+ *nr_effective += (flags & BUCKET_ALLOC_USE_DURABILITY) -+ ? durability : 1; -+ *have_cache |= !durability; -+ -+ ob_push(c, ptrs, ob); -+} -+ -+enum bucket_alloc_ret -+bch2_bucket_alloc_set(struct bch_fs *c, -+ struct open_buckets *ptrs, -+ struct dev_stripe_state *stripe, -+ struct bch_devs_mask *devs_may_alloc, -+ unsigned nr_replicas, -+ unsigned *nr_effective, -+ bool *have_cache, -+ enum alloc_reserve reserve, -+ unsigned flags, -+ struct closure *cl) -+{ -+ struct dev_alloc_list devs_sorted = -+ bch2_dev_alloc_list(c, stripe, devs_may_alloc); -+ struct bch_dev *ca; -+ enum bucket_alloc_ret ret = INSUFFICIENT_DEVICES; -+ unsigned i; -+ -+ BUG_ON(*nr_effective >= nr_replicas); -+ -+ for (i = 0; i < devs_sorted.nr; i++) { -+ struct open_bucket *ob; -+ -+ ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); -+ if (!ca) -+ continue; -+ -+ if (!ca->mi.durability && *have_cache) -+ continue; -+ -+ ob = bch2_bucket_alloc(c, ca, reserve, -+ flags & BUCKET_MAY_ALLOC_PARTIAL, cl); -+ if (IS_ERR(ob)) { -+ ret = -PTR_ERR(ob); -+ -+ if (cl) -+ return ret; -+ continue; -+ } -+ -+ add_new_bucket(c, ptrs, devs_may_alloc, -+ nr_effective, have_cache, flags, ob); -+ -+ bch2_dev_stripe_increment(ca, stripe); -+ -+ if (*nr_effective >= nr_replicas) -+ return ALLOC_SUCCESS; -+ } -+ -+ return ret; -+} -+ -+/* Allocate from stripes: */ -+ -+/* -+ * if we can't allocate a new stripe because there are already too many -+ * partially filled stripes, force allocating from an existing stripe even when -+ * it's to a device we 
don't want: -+ */ -+ -+static void bucket_alloc_from_stripe(struct bch_fs *c, -+ struct open_buckets *ptrs, -+ struct write_point *wp, -+ struct bch_devs_mask *devs_may_alloc, -+ u16 target, -+ unsigned erasure_code, -+ unsigned nr_replicas, -+ unsigned *nr_effective, -+ bool *have_cache, -+ unsigned flags) -+{ -+ struct dev_alloc_list devs_sorted; -+ struct ec_stripe_head *h; -+ struct open_bucket *ob; -+ struct bch_dev *ca; -+ unsigned i, ec_idx; -+ -+ if (!erasure_code) -+ return; -+ -+ if (nr_replicas < 2) -+ return; -+ -+ if (ec_open_bucket(c, ptrs)) -+ return; -+ -+ h = bch2_ec_stripe_head_get(c, target, 0, nr_replicas - 1); -+ if (!h) -+ return; -+ -+ devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); -+ -+ for (i = 0; i < devs_sorted.nr; i++) -+ open_bucket_for_each(c, &h->s->blocks, ob, ec_idx) -+ if (ob->ptr.dev == devs_sorted.devs[i] && -+ !test_and_set_bit(h->s->data_block_idx[ec_idx], -+ h->s->blocks_allocated)) -+ goto got_bucket; -+ goto out_put_head; -+got_bucket: -+ ca = bch_dev_bkey_exists(c, ob->ptr.dev); -+ -+ ob->ec_idx = h->s->data_block_idx[ec_idx]; -+ ob->ec = h->s; -+ -+ add_new_bucket(c, ptrs, devs_may_alloc, -+ nr_effective, have_cache, flags, ob); -+ atomic_inc(&h->s->pin); -+out_put_head: -+ bch2_ec_stripe_head_put(c, h); -+} -+ -+/* Sector allocator */ -+ -+static void get_buckets_from_writepoint(struct bch_fs *c, -+ struct open_buckets *ptrs, -+ struct write_point *wp, -+ struct bch_devs_mask *devs_may_alloc, -+ unsigned nr_replicas, -+ unsigned *nr_effective, -+ bool *have_cache, -+ unsigned flags, -+ bool need_ec) -+{ -+ struct open_buckets ptrs_skip = { .nr = 0 }; -+ struct open_bucket *ob; -+ unsigned i; -+ -+ open_bucket_for_each(c, &wp->ptrs, ob, i) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); -+ -+ if (*nr_effective < nr_replicas && -+ test_bit(ob->ptr.dev, devs_may_alloc->d) && -+ (ca->mi.durability || -+ (wp->type == BCH_DATA_user && !*have_cache)) && -+ (ob->ec || !need_ec)) { -+ 
add_new_bucket(c, ptrs, devs_may_alloc, -+ nr_effective, have_cache, -+ flags, ob); -+ } else { -+ ob_push(c, &ptrs_skip, ob); -+ } -+ } -+ wp->ptrs = ptrs_skip; -+} -+ -+static enum bucket_alloc_ret -+open_bucket_add_buckets(struct bch_fs *c, -+ struct open_buckets *ptrs, -+ struct write_point *wp, -+ struct bch_devs_list *devs_have, -+ u16 target, -+ unsigned erasure_code, -+ unsigned nr_replicas, -+ unsigned *nr_effective, -+ bool *have_cache, -+ enum alloc_reserve reserve, -+ unsigned flags, -+ struct closure *_cl) -+{ -+ struct bch_devs_mask devs; -+ struct open_bucket *ob; -+ struct closure *cl = NULL; -+ enum bucket_alloc_ret ret; -+ unsigned i; -+ -+ rcu_read_lock(); -+ devs = target_rw_devs(c, wp->type, target); -+ rcu_read_unlock(); -+ -+ /* Don't allocate from devices we already have pointers to: */ -+ for (i = 0; i < devs_have->nr; i++) -+ __clear_bit(devs_have->devs[i], devs.d); -+ -+ open_bucket_for_each(c, ptrs, ob, i) -+ __clear_bit(ob->ptr.dev, devs.d); -+ -+ if (erasure_code) { -+ if (!ec_open_bucket(c, ptrs)) { -+ get_buckets_from_writepoint(c, ptrs, wp, &devs, -+ nr_replicas, nr_effective, -+ have_cache, flags, true); -+ if (*nr_effective >= nr_replicas) -+ return 0; -+ } -+ -+ if (!ec_open_bucket(c, ptrs)) { -+ bucket_alloc_from_stripe(c, ptrs, wp, &devs, -+ target, erasure_code, -+ nr_replicas, nr_effective, -+ have_cache, flags); -+ if (*nr_effective >= nr_replicas) -+ return 0; -+ } -+ } -+ -+ get_buckets_from_writepoint(c, ptrs, wp, &devs, -+ nr_replicas, nr_effective, -+ have_cache, flags, false); -+ if (*nr_effective >= nr_replicas) -+ return 0; -+ -+ percpu_down_read(&c->mark_lock); -+ rcu_read_lock(); -+ -+retry_blocking: -+ /* -+ * Try nonblocking first, so that if one device is full we'll try from -+ * other devices: -+ */ -+ ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs, -+ nr_replicas, nr_effective, have_cache, -+ reserve, flags, cl); -+ if (ret && ret != INSUFFICIENT_DEVICES && !cl && _cl) { -+ cl = _cl; -+ goto 
retry_blocking; -+ } -+ -+ rcu_read_unlock(); -+ percpu_up_read(&c->mark_lock); -+ -+ return ret; -+} -+ -+void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca, -+ struct open_buckets *obs) -+{ -+ struct open_buckets ptrs = { .nr = 0 }; -+ struct open_bucket *ob, *ob2; -+ unsigned i, j; -+ -+ open_bucket_for_each(c, obs, ob, i) { -+ bool drop = !ca || ob->ptr.dev == ca->dev_idx; -+ -+ if (!drop && ob->ec) { -+ mutex_lock(&ob->ec->lock); -+ open_bucket_for_each(c, &ob->ec->blocks, ob2, j) -+ drop |= ob2->ptr.dev == ca->dev_idx; -+ open_bucket_for_each(c, &ob->ec->parity, ob2, j) -+ drop |= ob2->ptr.dev == ca->dev_idx; -+ mutex_unlock(&ob->ec->lock); -+ } -+ -+ if (drop) -+ bch2_open_bucket_put(c, ob); -+ else -+ ob_push(c, &ptrs, ob); -+ } -+ -+ *obs = ptrs; -+} -+ -+void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca, -+ struct write_point *wp) -+{ -+ mutex_lock(&wp->lock); -+ bch2_open_buckets_stop_dev(c, ca, &wp->ptrs); -+ mutex_unlock(&wp->lock); -+} -+ -+static inline struct hlist_head *writepoint_hash(struct bch_fs *c, -+ unsigned long write_point) -+{ -+ unsigned hash = -+ hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash))); -+ -+ return &c->write_points_hash[hash]; -+} -+ -+static struct write_point *__writepoint_find(struct hlist_head *head, -+ unsigned long write_point) -+{ -+ struct write_point *wp; -+ -+ hlist_for_each_entry_rcu(wp, head, node) -+ if (wp->write_point == write_point) -+ return wp; -+ -+ return NULL; -+} -+ -+static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor) -+{ -+ u64 stranded = c->write_points_nr * c->bucket_size_max; -+ u64 free = bch2_fs_usage_read_short(c).free; -+ -+ return stranded * factor > free; -+} -+ -+static bool try_increase_writepoints(struct bch_fs *c) -+{ -+ struct write_point *wp; -+ -+ if (c->write_points_nr == ARRAY_SIZE(c->write_points) || -+ too_many_writepoints(c, 32)) -+ return false; -+ -+ wp = c->write_points + c->write_points_nr++; -+ 
hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point)); -+ return true; -+} -+ -+static bool try_decrease_writepoints(struct bch_fs *c, -+ unsigned old_nr) -+{ -+ struct write_point *wp; -+ -+ mutex_lock(&c->write_points_hash_lock); -+ if (c->write_points_nr < old_nr) { -+ mutex_unlock(&c->write_points_hash_lock); -+ return true; -+ } -+ -+ if (c->write_points_nr == 1 || -+ !too_many_writepoints(c, 8)) { -+ mutex_unlock(&c->write_points_hash_lock); -+ return false; -+ } -+ -+ wp = c->write_points + --c->write_points_nr; -+ -+ hlist_del_rcu(&wp->node); -+ mutex_unlock(&c->write_points_hash_lock); -+ -+ bch2_writepoint_stop(c, NULL, wp); -+ return true; -+} -+ -+static struct write_point *writepoint_find(struct bch_fs *c, -+ unsigned long write_point) -+{ -+ struct write_point *wp, *oldest; -+ struct hlist_head *head; -+ -+ if (!(write_point & 1UL)) { -+ wp = (struct write_point *) write_point; -+ mutex_lock(&wp->lock); -+ return wp; -+ } -+ -+ head = writepoint_hash(c, write_point); -+restart_find: -+ wp = __writepoint_find(head, write_point); -+ if (wp) { -+lock_wp: -+ mutex_lock(&wp->lock); -+ if (wp->write_point == write_point) -+ goto out; -+ mutex_unlock(&wp->lock); -+ goto restart_find; -+ } -+restart_find_oldest: -+ oldest = NULL; -+ for (wp = c->write_points; -+ wp < c->write_points + c->write_points_nr; wp++) -+ if (!oldest || time_before64(wp->last_used, oldest->last_used)) -+ oldest = wp; -+ -+ mutex_lock(&oldest->lock); -+ mutex_lock(&c->write_points_hash_lock); -+ if (oldest >= c->write_points + c->write_points_nr || -+ try_increase_writepoints(c)) { -+ mutex_unlock(&c->write_points_hash_lock); -+ mutex_unlock(&oldest->lock); -+ goto restart_find_oldest; -+ } -+ -+ wp = __writepoint_find(head, write_point); -+ if (wp && wp != oldest) { -+ mutex_unlock(&c->write_points_hash_lock); -+ mutex_unlock(&oldest->lock); -+ goto lock_wp; -+ } -+ -+ wp = oldest; -+ hlist_del_rcu(&wp->node); -+ wp->write_point = write_point; -+ 
hlist_add_head_rcu(&wp->node, head); -+ mutex_unlock(&c->write_points_hash_lock); -+out: -+ wp->last_used = sched_clock(); -+ return wp; -+} -+ -+/* -+ * Get us an open_bucket we can allocate from, return with it locked: -+ */ -+struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, -+ unsigned target, -+ unsigned erasure_code, -+ struct write_point_specifier write_point, -+ struct bch_devs_list *devs_have, -+ unsigned nr_replicas, -+ unsigned nr_replicas_required, -+ enum alloc_reserve reserve, -+ unsigned flags, -+ struct closure *cl) -+{ -+ struct write_point *wp; -+ struct open_bucket *ob; -+ struct open_buckets ptrs; -+ unsigned nr_effective, write_points_nr; -+ unsigned ob_flags = 0; -+ bool have_cache; -+ enum bucket_alloc_ret ret; -+ int i; -+ -+ if (!(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) -+ ob_flags |= BUCKET_ALLOC_USE_DURABILITY; -+ -+ BUG_ON(!nr_replicas || !nr_replicas_required); -+retry: -+ ptrs.nr = 0; -+ nr_effective = 0; -+ write_points_nr = c->write_points_nr; -+ have_cache = false; -+ -+ wp = writepoint_find(c, write_point.v); -+ -+ if (wp->type == BCH_DATA_user) -+ ob_flags |= BUCKET_MAY_ALLOC_PARTIAL; -+ -+ /* metadata may not allocate on cache devices: */ -+ if (wp->type != BCH_DATA_user) -+ have_cache = true; -+ -+ if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { -+ ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, -+ target, erasure_code, -+ nr_replicas, &nr_effective, -+ &have_cache, reserve, -+ ob_flags, cl); -+ } else { -+ ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, -+ target, erasure_code, -+ nr_replicas, &nr_effective, -+ &have_cache, reserve, -+ ob_flags, NULL); -+ if (!ret) -+ goto alloc_done; -+ -+ ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, -+ 0, erasure_code, -+ nr_replicas, &nr_effective, -+ &have_cache, reserve, -+ ob_flags, cl); -+ } -+alloc_done: -+ BUG_ON(!ret && nr_effective < nr_replicas); -+ -+ if (erasure_code && !ec_open_bucket(c, &ptrs)) -+ pr_debug("failed to get ec bucket: 
ret %u", ret); -+ -+ if (ret == INSUFFICIENT_DEVICES && -+ nr_effective >= nr_replicas_required) -+ ret = 0; -+ -+ if (ret) -+ goto err; -+ -+ /* Free buckets we didn't use: */ -+ open_bucket_for_each(c, &wp->ptrs, ob, i) -+ open_bucket_free_unused(c, wp, ob); -+ -+ wp->ptrs = ptrs; -+ -+ wp->sectors_free = UINT_MAX; -+ -+ open_bucket_for_each(c, &wp->ptrs, ob, i) -+ wp->sectors_free = min(wp->sectors_free, ob->sectors_free); -+ -+ BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); -+ -+ verify_not_stale(c, &wp->ptrs); -+ -+ return wp; -+err: -+ open_bucket_for_each(c, &wp->ptrs, ob, i) -+ if (ptrs.nr < ARRAY_SIZE(ptrs.v)) -+ ob_push(c, &ptrs, ob); -+ else -+ open_bucket_free_unused(c, wp, ob); -+ wp->ptrs = ptrs; -+ -+ mutex_unlock(&wp->lock); -+ -+ if (ret == FREELIST_EMPTY && -+ try_decrease_writepoints(c, write_points_nr)) -+ goto retry; -+ -+ switch (ret) { -+ case OPEN_BUCKETS_EMPTY: -+ case FREELIST_EMPTY: -+ return cl ? ERR_PTR(-EAGAIN) : ERR_PTR(-ENOSPC); -+ case INSUFFICIENT_DEVICES: -+ return ERR_PTR(-EROFS); -+ default: -+ BUG(); -+ } -+} -+ -+/* -+ * Append pointers to the space we just allocated to @k, and mark @sectors space -+ * as allocated out of @ob -+ */ -+void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, -+ struct bkey_i *k, unsigned sectors) -+ -+{ -+ struct open_bucket *ob; -+ unsigned i; -+ -+ BUG_ON(sectors > wp->sectors_free); -+ wp->sectors_free -= sectors; -+ -+ open_bucket_for_each(c, &wp->ptrs, ob, i) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); -+ struct bch_extent_ptr tmp = ob->ptr; -+ -+ tmp.cached = !ca->mi.durability && -+ wp->type == BCH_DATA_user; -+ -+ tmp.offset += ca->mi.bucket_size - ob->sectors_free; -+ bch2_bkey_append_ptr(k, tmp); -+ -+ BUG_ON(sectors > ob->sectors_free); -+ ob->sectors_free -= sectors; -+ } -+} -+ -+/* -+ * Append pointers to the space we just allocated to @k, and mark @sectors space -+ * as allocated out of @ob -+ */ -+void 
bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp) -+{ -+ struct open_buckets ptrs = { .nr = 0 }, keep = { .nr = 0 }; -+ struct open_bucket *ob; -+ unsigned i; -+ -+ open_bucket_for_each(c, &wp->ptrs, ob, i) -+ ob_push(c, !ob->sectors_free ? &ptrs : &keep, ob); -+ wp->ptrs = keep; -+ -+ mutex_unlock(&wp->lock); -+ -+ bch2_open_buckets_put(c, &ptrs); -+} -+ -+static inline void writepoint_init(struct write_point *wp, -+ enum bch_data_type type) -+{ -+ mutex_init(&wp->lock); -+ wp->type = type; -+} -+ -+void bch2_fs_allocator_foreground_init(struct bch_fs *c) -+{ -+ struct open_bucket *ob; -+ struct write_point *wp; -+ -+ mutex_init(&c->write_points_hash_lock); -+ c->write_points_nr = ARRAY_SIZE(c->write_points); -+ -+ /* open bucket 0 is a sentinal NULL: */ -+ spin_lock_init(&c->open_buckets[0].lock); -+ -+ for (ob = c->open_buckets + 1; -+ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { -+ spin_lock_init(&ob->lock); -+ c->open_buckets_nr_free++; -+ -+ ob->freelist = c->open_buckets_freelist; -+ c->open_buckets_freelist = ob - c->open_buckets; -+ } -+ -+ writepoint_init(&c->btree_write_point, BCH_DATA_btree); -+ writepoint_init(&c->rebalance_write_point, BCH_DATA_user); -+ writepoint_init(&c->copygc_write_point, BCH_DATA_user); -+ -+ for (wp = c->write_points; -+ wp < c->write_points + c->write_points_nr; wp++) { -+ writepoint_init(wp, BCH_DATA_user); -+ -+ wp->last_used = sched_clock(); -+ wp->write_point = (unsigned long) wp; -+ hlist_add_head_rcu(&wp->node, -+ writepoint_hash(c, wp->write_point)); -+ } -+} -diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h -new file mode 100644 -index 000000000000..c658295cb8e0 ---- /dev/null -+++ b/fs/bcachefs/alloc_foreground.h -@@ -0,0 +1,138 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_ALLOC_FOREGROUND_H -+#define _BCACHEFS_ALLOC_FOREGROUND_H -+ -+#include "bcachefs.h" -+#include "alloc_types.h" -+ -+#include -+ -+struct bkey; -+struct bch_dev; -+struct 
bch_fs; -+struct bch_devs_List; -+ -+enum bucket_alloc_ret { -+ ALLOC_SUCCESS, -+ OPEN_BUCKETS_EMPTY, -+ FREELIST_EMPTY, /* Allocator thread not keeping up */ -+ INSUFFICIENT_DEVICES, -+}; -+ -+struct dev_alloc_list { -+ unsigned nr; -+ u8 devs[BCH_SB_MEMBERS_MAX]; -+}; -+ -+struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, -+ struct dev_stripe_state *, -+ struct bch_devs_mask *); -+void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *); -+ -+long bch2_bucket_alloc_new_fs(struct bch_dev *); -+ -+struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, -+ enum alloc_reserve, bool, -+ struct closure *); -+ -+static inline void ob_push(struct bch_fs *c, struct open_buckets *obs, -+ struct open_bucket *ob) -+{ -+ BUG_ON(obs->nr >= ARRAY_SIZE(obs->v)); -+ -+ obs->v[obs->nr++] = ob - c->open_buckets; -+} -+ -+#define open_bucket_for_each(_c, _obs, _ob, _i) \ -+ for ((_i) = 0; \ -+ (_i) < (_obs)->nr && \ -+ ((_ob) = (_c)->open_buckets + (_obs)->v[_i], true); \ -+ (_i)++) -+ -+static inline struct open_bucket *ec_open_bucket(struct bch_fs *c, -+ struct open_buckets *obs) -+{ -+ struct open_bucket *ob; -+ unsigned i; -+ -+ open_bucket_for_each(c, obs, ob, i) -+ if (ob->ec) -+ return ob; -+ -+ return NULL; -+} -+ -+void bch2_open_bucket_write_error(struct bch_fs *, -+ struct open_buckets *, unsigned); -+ -+void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *); -+ -+static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) -+{ -+ if (atomic_dec_and_test(&ob->pin)) -+ __bch2_open_bucket_put(c, ob); -+} -+ -+static inline void bch2_open_buckets_put(struct bch_fs *c, -+ struct open_buckets *ptrs) -+{ -+ struct open_bucket *ob; -+ unsigned i; -+ -+ open_bucket_for_each(c, ptrs, ob, i) -+ bch2_open_bucket_put(c, ob); -+ ptrs->nr = 0; -+} -+ -+static inline void bch2_open_bucket_get(struct bch_fs *c, -+ struct write_point *wp, -+ struct open_buckets *ptrs) -+{ -+ struct open_bucket *ob; -+ unsigned 
i; -+ -+ open_bucket_for_each(c, &wp->ptrs, ob, i) { -+ ob->type = wp->type; -+ atomic_inc(&ob->pin); -+ ob_push(c, ptrs, ob); -+ } -+} -+ -+enum bucket_alloc_ret -+bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *, -+ struct dev_stripe_state *, struct bch_devs_mask *, -+ unsigned, unsigned *, bool *, enum alloc_reserve, -+ unsigned, struct closure *); -+ -+struct write_point *bch2_alloc_sectors_start(struct bch_fs *, -+ unsigned, unsigned, -+ struct write_point_specifier, -+ struct bch_devs_list *, -+ unsigned, unsigned, -+ enum alloc_reserve, -+ unsigned, -+ struct closure *); -+ -+void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, -+ struct bkey_i *, unsigned); -+void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); -+ -+void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *, -+ struct open_buckets *); -+ -+void bch2_writepoint_stop(struct bch_fs *, struct bch_dev *, -+ struct write_point *); -+ -+static inline struct write_point_specifier writepoint_hashed(unsigned long v) -+{ -+ return (struct write_point_specifier) { .v = v | 1 }; -+} -+ -+static inline struct write_point_specifier writepoint_ptr(struct write_point *wp) -+{ -+ return (struct write_point_specifier) { .v = (unsigned long) wp }; -+} -+ -+void bch2_fs_allocator_foreground_init(struct bch_fs *); -+ -+#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ -diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h -new file mode 100644 -index 000000000000..20705460bb0a ---- /dev/null -+++ b/fs/bcachefs/alloc_types.h -@@ -0,0 +1,113 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_ALLOC_TYPES_H -+#define _BCACHEFS_ALLOC_TYPES_H -+ -+#include -+#include -+ -+#include "clock_types.h" -+#include "fifo.h" -+ -+struct ec_bucket_buf; -+ -+/* There's two of these clocks, one for reads and one for writes: */ -+struct bucket_clock { -+ /* -+ * "now" in (read/write) IO time - incremented whenever we do X amount -+ * of reads or writes. 
-+ * -+ * Goes with the bucket read/write prios: when we read or write to a -+ * bucket we reset the bucket's prio to the current hand; thus hand - -+ * prio = time since bucket was last read/written. -+ * -+ * The units are some amount (bytes/sectors) of data read/written, and -+ * the units can change on the fly if we need to rescale to fit -+ * everything in a u16 - your only guarantee is that the units are -+ * consistent. -+ */ -+ u16 hand; -+ u16 max_last_io; -+ -+ int rw; -+ -+ struct io_timer rescale; -+ struct mutex lock; -+}; -+ -+/* There is one reserve for each type of btree, one for prios and gens -+ * and one for moving GC */ -+enum alloc_reserve { -+ RESERVE_ALLOC = -1, -+ RESERVE_BTREE = 0, -+ RESERVE_MOVINGGC = 1, -+ RESERVE_NONE = 2, -+ RESERVE_NR = 3, -+}; -+ -+typedef FIFO(long) alloc_fifo; -+ -+#define OPEN_BUCKETS_COUNT 1024 -+ -+#define WRITE_POINT_HASH_NR 32 -+#define WRITE_POINT_MAX 32 -+ -+typedef u16 open_bucket_idx_t; -+ -+struct open_bucket { -+ spinlock_t lock; -+ atomic_t pin; -+ open_bucket_idx_t freelist; -+ -+ /* -+ * When an open bucket has an ec_stripe attached, this is the index of -+ * the block in the stripe this open_bucket corresponds to: -+ */ -+ u8 ec_idx; -+ u8 type; -+ unsigned valid:1; -+ unsigned on_partial_list:1; -+ int alloc_reserve:3; -+ unsigned sectors_free; -+ struct bch_extent_ptr ptr; -+ struct ec_stripe_new *ec; -+}; -+ -+#define OPEN_BUCKET_LIST_MAX 15 -+ -+struct open_buckets { -+ open_bucket_idx_t nr; -+ open_bucket_idx_t v[OPEN_BUCKET_LIST_MAX]; -+}; -+ -+struct dev_stripe_state { -+ u64 next_alloc[BCH_SB_MEMBERS_MAX]; -+}; -+ -+struct write_point { -+ struct hlist_node node; -+ struct mutex lock; -+ u64 last_used; -+ unsigned long write_point; -+ enum bch_data_type type; -+ bool is_ec; -+ -+ /* calculated based on how many pointers we're actually going to use: */ -+ unsigned sectors_free; -+ -+ struct open_buckets ptrs; -+ struct dev_stripe_state stripe; -+}; -+ -+struct write_point_specifier { -+ 
unsigned long v; -+}; -+ -+struct alloc_heap_entry { -+ size_t bucket; -+ size_t nr; -+ unsigned long key; -+}; -+ -+typedef HEAP(struct alloc_heap_entry) alloc_heap; -+ -+#endif /* _BCACHEFS_ALLOC_TYPES_H */ -diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h -new file mode 100644 -index 000000000000..29f411635f29 ---- /dev/null -+++ b/fs/bcachefs/bcachefs.h -@@ -0,0 +1,882 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_H -+#define _BCACHEFS_H -+ -+/* -+ * SOME HIGH LEVEL CODE DOCUMENTATION: -+ * -+ * Bcache mostly works with cache sets, cache devices, and backing devices. -+ * -+ * Support for multiple cache devices hasn't quite been finished off yet, but -+ * it's about 95% plumbed through. A cache set and its cache devices is sort of -+ * like a md raid array and its component devices. Most of the code doesn't care -+ * about individual cache devices, the main abstraction is the cache set. -+ * -+ * Multiple cache devices is intended to give us the ability to mirror dirty -+ * cached data and metadata, without mirroring clean cached data. -+ * -+ * Backing devices are different, in that they have a lifetime independent of a -+ * cache set. When you register a newly formatted backing device it'll come up -+ * in passthrough mode, and then you can attach and detach a backing device from -+ * a cache set at runtime - while it's mounted and in use. Detaching implicitly -+ * invalidates any cached data for that backing device. -+ * -+ * A cache set can have multiple (many) backing devices attached to it. -+ * -+ * There's also flash only volumes - this is the reason for the distinction -+ * between struct cached_dev and struct bcache_device. A flash only volume -+ * works much like a bcache device that has a backing device, except the -+ * "cached" data is always dirty. The end result is that we get thin -+ * provisioning with very little additional code. 
-+ * -+ * Flash only volumes work but they're not production ready because the moving -+ * garbage collector needs more work. More on that later. -+ * -+ * BUCKETS/ALLOCATION: -+ * -+ * Bcache is primarily designed for caching, which means that in normal -+ * operation all of our available space will be allocated. Thus, we need an -+ * efficient way of deleting things from the cache so we can write new things to -+ * it. -+ * -+ * To do this, we first divide the cache device up into buckets. A bucket is the -+ * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+ -+ * works efficiently. -+ * -+ * Each bucket has a 16 bit priority, and an 8 bit generation associated with -+ * it. The gens and priorities for all the buckets are stored contiguously and -+ * packed on disk (in a linked list of buckets - aside from the superblock, all -+ * of bcache's metadata is stored in buckets). -+ * -+ * The priority is used to implement an LRU. We reset a bucket's priority when -+ * we allocate it or on cache it, and every so often we decrement the priority -+ * of each bucket. It could be used to implement something more sophisticated, -+ * if anyone ever gets around to it. -+ * -+ * The generation is used for invalidating buckets. Each pointer also has an 8 -+ * bit generation embedded in it; for a pointer to be considered valid, its gen -+ * must match the gen of the bucket it points into. Thus, to reuse a bucket all -+ * we have to do is increment its gen (and write its new gen to disk; we batch -+ * this up). -+ * -+ * Bcache is entirely COW - we never write twice to a bucket, even buckets that -+ * contain metadata (including btree nodes). -+ * -+ * THE BTREE: -+ * -+ * Bcache is in large part design around the btree. -+ * -+ * At a high level, the btree is just an index of key -> ptr tuples. -+ * -+ * Keys represent extents, and thus have a size field. 
Keys also have a variable -+ * number of pointers attached to them (potentially zero, which is handy for -+ * invalidating the cache). -+ * -+ * The key itself is an inode:offset pair. The inode number corresponds to a -+ * backing device or a flash only volume. The offset is the ending offset of the -+ * extent within the inode - not the starting offset; this makes lookups -+ * slightly more convenient. -+ * -+ * Pointers contain the cache device id, the offset on that device, and an 8 bit -+ * generation number. More on the gen later. -+ * -+ * Index lookups are not fully abstracted - cache lookups in particular are -+ * still somewhat mixed in with the btree code, but things are headed in that -+ * direction. -+ * -+ * Updates are fairly well abstracted, though. There are two different ways of -+ * updating the btree; insert and replace. -+ * -+ * BTREE_INSERT will just take a list of keys and insert them into the btree - -+ * overwriting (possibly only partially) any extents they overlap with. This is -+ * used to update the index after a write. -+ * -+ * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is -+ * overwriting a key that matches another given key. This is used for inserting -+ * data into the cache after a cache miss, and for background writeback, and for -+ * the moving garbage collector. -+ * -+ * There is no "delete" operation; deleting things from the index is -+ * accomplished by either by invalidating pointers (by incrementing a bucket's -+ * gen) or by inserting a key with 0 pointers - which will overwrite anything -+ * previously present at that location in the index. -+ * -+ * This means that there are always stale/invalid keys in the btree. They're -+ * filtered out by the code that iterates through a btree node, and removed when -+ * a btree node is rewritten. 
-+ * -+ * BTREE NODES: -+ * -+ * Our unit of allocation is a bucket, and we we can't arbitrarily allocate and -+ * free smaller than a bucket - so, that's how big our btree nodes are. -+ * -+ * (If buckets are really big we'll only use part of the bucket for a btree node -+ * - no less than 1/4th - but a bucket still contains no more than a single -+ * btree node. I'd actually like to change this, but for now we rely on the -+ * bucket's gen for deleting btree nodes when we rewrite/split a node.) -+ * -+ * Anyways, btree nodes are big - big enough to be inefficient with a textbook -+ * btree implementation. -+ * -+ * The way this is solved is that btree nodes are internally log structured; we -+ * can append new keys to an existing btree node without rewriting it. This -+ * means each set of keys we write is sorted, but the node is not. -+ * -+ * We maintain this log structure in memory - keeping 1Mb of keys sorted would -+ * be expensive, and we have to distinguish between the keys we have written and -+ * the keys we haven't. So to do a lookup in a btree node, we have to search -+ * each sorted set. But we do merge written sets together lazily, so the cost of -+ * these extra searches is quite low (normally most of the keys in a btree node -+ * will be in one big set, and then there'll be one or two sets that are much -+ * smaller). -+ * -+ * This log structure makes bcache's btree more of a hybrid between a -+ * conventional btree and a compacting data structure, with some of the -+ * advantages of both. -+ * -+ * GARBAGE COLLECTION: -+ * -+ * We can't just invalidate any bucket - it might contain dirty data or -+ * metadata. If it once contained dirty data, other writes might overwrite it -+ * later, leaving no valid pointers into that bucket in the index. -+ * -+ * Thus, the primary purpose of garbage collection is to find buckets to reuse. 
-+ * It also counts how much valid data it each bucket currently contains, so that -+ * allocation can reuse buckets sooner when they've been mostly overwritten. -+ * -+ * It also does some things that are really internal to the btree -+ * implementation. If a btree node contains pointers that are stale by more than -+ * some threshold, it rewrites the btree node to avoid the bucket's generation -+ * wrapping around. It also merges adjacent btree nodes if they're empty enough. -+ * -+ * THE JOURNAL: -+ * -+ * Bcache's journal is not necessary for consistency; we always strictly -+ * order metadata writes so that the btree and everything else is consistent on -+ * disk in the event of an unclean shutdown, and in fact bcache had writeback -+ * caching (with recovery from unclean shutdown) before journalling was -+ * implemented. -+ * -+ * Rather, the journal is purely a performance optimization; we can't complete a -+ * write until we've updated the index on disk, otherwise the cache would be -+ * inconsistent in the event of an unclean shutdown. This means that without the -+ * journal, on random write workloads we constantly have to update all the leaf -+ * nodes in the btree, and those writes will be mostly empty (appending at most -+ * a few keys each) - highly inefficient in terms of amount of metadata writes, -+ * and it puts more strain on the various btree resorting/compacting code. -+ * -+ * The journal is just a log of keys we've inserted; on startup we just reinsert -+ * all the keys in the open journal entries. That means that when we're updating -+ * a node in the btree, we can wait until a 4k block of keys fills up before -+ * writing them out. 
-+ * -+ * For simplicity, we only journal updates to leaf nodes; updates to parent -+ * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth -+ * the complexity to deal with journalling them (in particular, journal replay) -+ * - updates to non leaf nodes just happen synchronously (see btree_split()). -+ */ -+ -+#undef pr_fmt -+#define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include "bcachefs_format.h" -+#include "fifo.h" -+#include "opts.h" -+#include "util.h" -+ -+#define dynamic_fault(...) 0 -+#define race_fault(...) 0 -+ -+#define bch2_fs_init_fault(name) \ -+ dynamic_fault("bcachefs:bch_fs_init:" name) -+#define bch2_meta_read_fault(name) \ -+ dynamic_fault("bcachefs:meta:read:" name) -+#define bch2_meta_write_fault(name) \ -+ dynamic_fault("bcachefs:meta:write:" name) -+ -+#ifdef __KERNEL__ -+#define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name) -+#else -+#define bch2_fmt(_c, fmt) fmt "\n" -+#endif -+ -+#define bch_info(c, fmt, ...) \ -+ printk(KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) -+#define bch_notice(c, fmt, ...) \ -+ printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__) -+#define bch_warn(c, fmt, ...) \ -+ printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) -+#define bch_warn_ratelimited(c, fmt, ...) \ -+ printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) -+#define bch_err(c, fmt, ...) \ -+ printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) -+#define bch_err_ratelimited(c, fmt, ...) \ -+ printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) -+ -+#define bch_verbose(c, fmt, ...) \ -+do { \ -+ if ((c)->opts.verbose) \ -+ bch_info(c, fmt, ##__VA_ARGS__); \ -+} while (0) -+ -+#define pr_verbose_init(opts, fmt, ...) 
\ -+do { \ -+ if (opt_get(opts, verbose)) \ -+ pr_info(fmt, ##__VA_ARGS__); \ -+} while (0) -+ -+/* Parameters that are useful for debugging, but should always be compiled in: */ -+#define BCH_DEBUG_PARAMS_ALWAYS() \ -+ BCH_DEBUG_PARAM(key_merging_disabled, \ -+ "Disables merging of extents") \ -+ BCH_DEBUG_PARAM(btree_gc_always_rewrite, \ -+ "Causes mark and sweep to compact and rewrite every " \ -+ "btree node it traverses") \ -+ BCH_DEBUG_PARAM(btree_gc_rewrite_disabled, \ -+ "Disables rewriting of btree nodes during mark and sweep")\ -+ BCH_DEBUG_PARAM(btree_shrinker_disabled, \ -+ "Disables the shrinker callback for the btree node cache") -+ -+/* Parameters that should only be compiled in in debug mode: */ -+#define BCH_DEBUG_PARAMS_DEBUG() \ -+ BCH_DEBUG_PARAM(expensive_debug_checks, \ -+ "Enables various runtime debugging checks that " \ -+ "significantly affect performance") \ -+ BCH_DEBUG_PARAM(debug_check_iterators, \ -+ "Enables extra verification for btree iterators") \ -+ BCH_DEBUG_PARAM(debug_check_bkeys, \ -+ "Run bkey_debugcheck (primarily checking GC/allocation "\ -+ "information) when iterating over keys") \ -+ BCH_DEBUG_PARAM(verify_btree_ondisk, \ -+ "Reread btree nodes at various points to verify the " \ -+ "mergesort in the read path against modifications " \ -+ "done in memory") \ -+ BCH_DEBUG_PARAM(journal_seq_verify, \ -+ "Store the journal sequence number in the version " \ -+ "number of every btree key, and verify that btree " \ -+ "update ordering is preserved during recovery") \ -+ BCH_DEBUG_PARAM(inject_invalid_keys, \ -+ "Store the journal sequence number in the version " \ -+ "number of every btree key, and verify that btree " \ -+ "update ordering is preserved during recovery") \ -+ BCH_DEBUG_PARAM(test_alloc_startup, \ -+ "Force allocator startup to use the slowpath where it" \ -+ "can't find enough free buckets without invalidating" \ -+ "cached data") \ -+ BCH_DEBUG_PARAM(force_reconstruct_read, \ -+ "Force reads to use the 
reconstruct path, when reading" \ -+ "from erasure coded extents") \ -+ BCH_DEBUG_PARAM(test_restart_gc, \ -+ "Test restarting mark and sweep gc when bucket gens change") -+ -+#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG() -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALL() -+#else -+#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS() -+#endif -+ -+#define BCH_TIME_STATS() \ -+ x(btree_node_mem_alloc) \ -+ x(btree_node_split) \ -+ x(btree_node_sort) \ -+ x(btree_node_read) \ -+ x(btree_gc) \ -+ x(btree_lock_contended_read) \ -+ x(btree_lock_contended_intent) \ -+ x(btree_lock_contended_write) \ -+ x(data_write) \ -+ x(data_read) \ -+ x(data_promote) \ -+ x(journal_write) \ -+ x(journal_delay) \ -+ x(journal_flush_seq) \ -+ x(blocked_journal) \ -+ x(blocked_allocate) \ -+ x(blocked_allocate_open_bucket) -+ -+enum bch_time_stats { -+#define x(name) BCH_TIME_##name, -+ BCH_TIME_STATS() -+#undef x -+ BCH_TIME_STAT_NR -+}; -+ -+#include "alloc_types.h" -+#include "btree_types.h" -+#include "buckets_types.h" -+#include "clock_types.h" -+#include "ec_types.h" -+#include "journal_types.h" -+#include "keylist_types.h" -+#include "quota_types.h" -+#include "rebalance_types.h" -+#include "replicas_types.h" -+#include "super_types.h" -+ -+/* Number of nodes btree coalesce will try to coalesce at once */ -+#define GC_MERGE_NODES 4U -+ -+/* Maximum number of nodes we might need to allocate atomically: */ -+#define BTREE_RESERVE_MAX (BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1)) -+ -+/* Size of the freelist we allocate btree nodes from: */ -+#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 4) -+ -+#define BTREE_NODE_OPEN_BUCKET_RESERVE (BTREE_RESERVE_MAX * BCH_REPLICAS_MAX) -+ -+struct btree; -+ -+enum gc_phase { -+ GC_PHASE_NOT_RUNNING, -+ GC_PHASE_START, -+ GC_PHASE_SB, -+ -+ GC_PHASE_BTREE_EC, -+ GC_PHASE_BTREE_EXTENTS, -+ GC_PHASE_BTREE_INODES, -+ GC_PHASE_BTREE_DIRENTS, -+ GC_PHASE_BTREE_XATTRS, -+ 
GC_PHASE_BTREE_ALLOC, -+ GC_PHASE_BTREE_QUOTAS, -+ GC_PHASE_BTREE_REFLINK, -+ -+ GC_PHASE_PENDING_DELETE, -+ GC_PHASE_ALLOC, -+}; -+ -+struct gc_pos { -+ enum gc_phase phase; -+ struct bpos pos; -+ unsigned level; -+}; -+ -+struct io_count { -+ u64 sectors[2][BCH_DATA_NR]; -+}; -+ -+struct bch_dev { -+ struct kobject kobj; -+ struct percpu_ref ref; -+ struct completion ref_completion; -+ struct percpu_ref io_ref; -+ struct completion io_ref_completion; -+ -+ struct bch_fs *fs; -+ -+ u8 dev_idx; -+ /* -+ * Cached version of this device's member info from superblock -+ * Committed by bch2_write_super() -> bch_fs_mi_update() -+ */ -+ struct bch_member_cpu mi; -+ uuid_le uuid; -+ char name[BDEVNAME_SIZE]; -+ -+ struct bch_sb_handle disk_sb; -+ struct bch_sb *sb_read_scratch; -+ int sb_write_error; -+ -+ struct bch_devs_mask self; -+ -+ /* biosets used in cloned bios for writing multiple replicas */ -+ struct bio_set replica_set; -+ -+ /* -+ * Buckets: -+ * Per-bucket arrays are protected by c->mark_lock, bucket_lock and -+ * gc_lock, for device resize - holding any is sufficient for access: -+ * Or rcu_read_lock(), but only for ptr_stale(): -+ */ -+ struct bucket_array __rcu *buckets[2]; -+ unsigned long *buckets_nouse; -+ struct rw_semaphore bucket_lock; -+ -+ struct bch_dev_usage __percpu *usage[2]; -+ -+ /* Allocator: */ -+ struct task_struct __rcu *alloc_thread; -+ -+ /* -+ * free: Buckets that are ready to be used -+ * -+ * free_inc: Incoming buckets - these are buckets that currently have -+ * cached data in them, and we can't reuse them until after we write -+ * their new gen to disk. 
After prio_write() finishes writing the new -+ * gens/prios, they'll be moved to the free list (and possibly discarded -+ * in the process) -+ */ -+ alloc_fifo free[RESERVE_NR]; -+ alloc_fifo free_inc; -+ -+ open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT]; -+ open_bucket_idx_t open_buckets_partial_nr; -+ -+ size_t fifo_last_bucket; -+ -+ /* last calculated minimum prio */ -+ u16 max_last_bucket_io[2]; -+ -+ size_t inc_gen_needs_gc; -+ size_t inc_gen_really_needs_gc; -+ -+ /* -+ * XXX: this should be an enum for allocator state, so as to include -+ * error state -+ */ -+ enum { -+ ALLOCATOR_STOPPED, -+ ALLOCATOR_RUNNING, -+ ALLOCATOR_BLOCKED, -+ ALLOCATOR_BLOCKED_FULL, -+ } allocator_state; -+ -+ alloc_heap alloc_heap; -+ -+ atomic64_t rebalance_work; -+ -+ struct journal_device journal; -+ -+ struct work_struct io_error_work; -+ -+ /* The rest of this all shows up in sysfs */ -+ atomic64_t cur_latency[2]; -+ struct time_stats io_latency[2]; -+ -+#define CONGESTED_MAX 1024 -+ atomic_t congested; -+ u64 congested_last; -+ -+ struct io_count __percpu *io_done; -+}; -+ -+enum { -+ /* startup: */ -+ BCH_FS_ALLOC_READ_DONE, -+ BCH_FS_ALLOC_CLEAN, -+ BCH_FS_ALLOCATOR_RUNNING, -+ BCH_FS_ALLOCATOR_STOPPING, -+ BCH_FS_INITIAL_GC_DONE, -+ BCH_FS_BTREE_INTERIOR_REPLAY_DONE, -+ BCH_FS_FSCK_DONE, -+ BCH_FS_STARTED, -+ BCH_FS_RW, -+ -+ /* shutdown: */ -+ BCH_FS_STOPPING, -+ BCH_FS_EMERGENCY_RO, -+ BCH_FS_WRITE_DISABLE_COMPLETE, -+ -+ /* errors: */ -+ BCH_FS_ERROR, -+ BCH_FS_ERRORS_FIXED, -+ -+ /* misc: */ -+ BCH_FS_FIXED_GENS, -+ BCH_FS_ALLOC_WRITTEN, -+ BCH_FS_REBUILD_REPLICAS, -+ BCH_FS_HOLD_BTREE_WRITES, -+}; -+ -+struct btree_debug { -+ unsigned id; -+ struct dentry *btree; -+ struct dentry *btree_format; -+ struct dentry *failed; -+}; -+ -+struct bch_fs_pcpu { -+ u64 sectors_available; -+}; -+ -+struct journal_seq_blacklist_table { -+ size_t nr; -+ struct journal_seq_blacklist_table_entry { -+ u64 start; -+ u64 end; -+ bool dirty; -+ } entries[0]; -+}; -+ -+struct 
journal_keys { -+ struct journal_key { -+ enum btree_id btree_id:8; -+ unsigned level:8; -+ struct bkey_i *k; -+ u32 journal_seq; -+ u32 journal_offset; -+ } *d; -+ size_t nr; -+ u64 journal_seq_base; -+}; -+ -+struct bch_fs { -+ struct closure cl; -+ -+ struct list_head list; -+ struct kobject kobj; -+ struct kobject internal; -+ struct kobject opts_dir; -+ struct kobject time_stats; -+ unsigned long flags; -+ -+ int minor; -+ struct device *chardev; -+ struct super_block *vfs_sb; -+ char name[40]; -+ -+ /* ro/rw, add/remove/resize devices: */ -+ struct rw_semaphore state_lock; -+ -+ /* Counts outstanding writes, for clean transition to read-only */ -+ struct percpu_ref writes; -+ struct work_struct read_only_work; -+ -+ struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX]; -+ -+ struct bch_replicas_cpu replicas; -+ struct bch_replicas_cpu replicas_gc; -+ struct mutex replicas_gc_lock; -+ -+ struct journal_entry_res replicas_journal_res; -+ -+ struct bch_disk_groups_cpu __rcu *disk_groups; -+ -+ struct bch_opts opts; -+ -+ /* Updated by bch2_sb_update():*/ -+ struct { -+ uuid_le uuid; -+ uuid_le user_uuid; -+ -+ u16 version; -+ u16 encoded_extent_max; -+ -+ u8 nr_devices; -+ u8 clean; -+ -+ u8 encryption_type; -+ -+ u64 time_base_lo; -+ u32 time_base_hi; -+ u32 time_precision; -+ u64 features; -+ u64 compat; -+ } sb; -+ -+ struct bch_sb_handle disk_sb; -+ -+ unsigned short block_bits; /* ilog2(block_size) */ -+ -+ u16 btree_foreground_merge_threshold; -+ -+ struct closure sb_write; -+ struct mutex sb_lock; -+ -+ /* BTREE CACHE */ -+ struct bio_set btree_bio; -+ -+ struct btree_root btree_roots[BTREE_ID_NR]; -+ struct mutex btree_root_lock; -+ -+ struct btree_cache btree_cache; -+ -+ /* -+ * Cache of allocated btree nodes - if we allocate a btree node and -+ * don't use it, if we free it that space can't be reused until going -+ * _all_ the way through the allocator (which exposes us to a livelock -+ * when allocating btree reserves fail halfway through) - instead, we -+ 
* can stick them here: -+ */ -+ struct btree_alloc btree_reserve_cache[BTREE_NODE_RESERVE * 2]; -+ unsigned btree_reserve_cache_nr; -+ struct mutex btree_reserve_cache_lock; -+ -+ mempool_t btree_interior_update_pool; -+ struct list_head btree_interior_update_list; -+ struct list_head btree_interior_updates_unwritten; -+ struct mutex btree_interior_update_lock; -+ struct closure_waitlist btree_interior_update_wait; -+ -+ struct workqueue_struct *btree_interior_update_worker; -+ struct work_struct btree_interior_update_work; -+ -+ /* btree_iter.c: */ -+ struct mutex btree_trans_lock; -+ struct list_head btree_trans_list; -+ mempool_t btree_iters_pool; -+ -+ struct btree_key_cache btree_key_cache; -+ -+ struct workqueue_struct *wq; -+ /* copygc needs its own workqueue for index updates.. */ -+ struct workqueue_struct *copygc_wq; -+ struct workqueue_struct *journal_reclaim_wq; -+ -+ /* ALLOCATION */ -+ struct delayed_work pd_controllers_update; -+ unsigned pd_controllers_update_seconds; -+ -+ struct bch_devs_mask rw_devs[BCH_DATA_NR]; -+ -+ u64 capacity; /* sectors */ -+ -+ /* -+ * When capacity _decreases_ (due to a disk being removed), we -+ * increment capacity_gen - this invalidates outstanding reservations -+ * and forces them to be revalidated -+ */ -+ u32 capacity_gen; -+ unsigned bucket_size_max; -+ -+ atomic64_t sectors_available; -+ -+ struct bch_fs_pcpu __percpu *pcpu; -+ -+ struct percpu_rw_semaphore mark_lock; -+ -+ seqcount_t usage_lock; -+ struct bch_fs_usage *usage_base; -+ struct bch_fs_usage __percpu *usage[2]; -+ struct bch_fs_usage __percpu *usage_gc; -+ -+ /* single element mempool: */ -+ struct mutex usage_scratch_lock; -+ struct bch_fs_usage *usage_scratch; -+ -+ /* -+ * When we invalidate buckets, we use both the priority and the amount -+ * of good data to determine which buckets to reuse first - to weight -+ * those together consistently we keep track of the smallest nonzero -+ * priority of any bucket. 
-+ */ -+ struct bucket_clock bucket_clock[2]; -+ -+ struct io_clock io_clock[2]; -+ -+ /* JOURNAL SEQ BLACKLIST */ -+ struct journal_seq_blacklist_table * -+ journal_seq_blacklist_table; -+ struct work_struct journal_seq_blacklist_gc_work; -+ -+ /* ALLOCATOR */ -+ spinlock_t freelist_lock; -+ struct closure_waitlist freelist_wait; -+ u64 blocked_allocate; -+ u64 blocked_allocate_open_bucket; -+ open_bucket_idx_t open_buckets_freelist; -+ open_bucket_idx_t open_buckets_nr_free; -+ struct closure_waitlist open_buckets_wait; -+ struct open_bucket open_buckets[OPEN_BUCKETS_COUNT]; -+ -+ struct write_point btree_write_point; -+ struct write_point rebalance_write_point; -+ -+ struct write_point write_points[WRITE_POINT_MAX]; -+ struct hlist_head write_points_hash[WRITE_POINT_HASH_NR]; -+ struct mutex write_points_hash_lock; -+ unsigned write_points_nr; -+ -+ /* GARBAGE COLLECTION */ -+ struct task_struct *gc_thread; -+ atomic_t kick_gc; -+ unsigned long gc_count; -+ -+ /* -+ * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos] -+ * has been marked by GC. -+ * -+ * gc_cur_phase is a superset of btree_ids (BTREE_ID_EXTENTS etc.) -+ * -+ * Protected by gc_pos_lock. Only written to by GC thread, so GC thread -+ * can read without a lock. -+ */ -+ seqcount_t gc_pos_lock; -+ struct gc_pos gc_pos; -+ -+ /* -+ * The allocation code needs gc_mark in struct bucket to be correct, but -+ * it's not while a gc is in progress. 
-+ */ -+ struct rw_semaphore gc_lock; -+ -+ /* IO PATH */ -+ struct semaphore io_in_flight; -+ struct bio_set bio_read; -+ struct bio_set bio_read_split; -+ struct bio_set bio_write; -+ struct mutex bio_bounce_pages_lock; -+ mempool_t bio_bounce_pages; -+ struct rhashtable promote_table; -+ -+ mempool_t compression_bounce[2]; -+ mempool_t compress_workspace[BCH_COMPRESSION_TYPE_NR]; -+ mempool_t decompress_workspace; -+ ZSTD_parameters zstd_params; -+ -+ struct crypto_shash *sha256; -+ struct crypto_sync_skcipher *chacha20; -+ struct crypto_shash *poly1305; -+ -+ atomic64_t key_version; -+ -+ mempool_t large_bkey_pool; -+ -+ /* REBALANCE */ -+ struct bch_fs_rebalance rebalance; -+ -+ /* COPYGC */ -+ struct task_struct *copygc_thread; -+ copygc_heap copygc_heap; -+ struct bch_pd_controller copygc_pd; -+ struct write_point copygc_write_point; -+ u64 copygc_threshold; -+ -+ /* STRIPES: */ -+ GENRADIX(struct stripe) stripes[2]; -+ -+ ec_stripes_heap ec_stripes_heap; -+ spinlock_t ec_stripes_heap_lock; -+ -+ /* ERASURE CODING */ -+ struct list_head ec_stripe_head_list; -+ struct mutex ec_stripe_head_lock; -+ -+ struct list_head ec_stripe_new_list; -+ struct mutex ec_stripe_new_lock; -+ -+ struct work_struct ec_stripe_create_work; -+ u64 ec_stripe_hint; -+ -+ struct bio_set ec_bioset; -+ -+ struct work_struct ec_stripe_delete_work; -+ struct llist_head ec_stripe_delete_list; -+ -+ /* REFLINK */ -+ u64 reflink_hint; -+ -+ /* VFS IO PATH - fs-io.c */ -+ struct bio_set writepage_bioset; -+ struct bio_set dio_write_bioset; -+ struct bio_set dio_read_bioset; -+ -+ struct bio_list btree_write_error_list; -+ struct work_struct btree_write_error_work; -+ spinlock_t btree_write_error_lock; -+ -+ /* ERRORS */ -+ struct list_head fsck_errors; -+ struct mutex fsck_error_lock; -+ bool fsck_alloc_err; -+ -+ /* QUOTAS */ -+ struct bch_memquota_type quotas[QTYP_NR]; -+ -+ /* DEBUG JUNK */ -+ struct dentry *debug; -+ struct btree_debug btree_debug[BTREE_ID_NR]; -+#ifdef 
CONFIG_BCACHEFS_DEBUG -+ struct btree *verify_data; -+ struct btree_node *verify_ondisk; -+ struct mutex verify_lock; -+#endif -+ -+ u64 unused_inode_hint; -+ -+ /* -+ * A btree node on disk could have too many bsets for an iterator to fit -+ * on the stack - have to dynamically allocate them -+ */ -+ mempool_t fill_iter; -+ -+ mempool_t btree_bounce_pool; -+ -+ struct journal journal; -+ struct list_head journal_entries; -+ struct journal_keys journal_keys; -+ -+ u64 last_bucket_seq_cleanup; -+ -+ /* The rest of this all shows up in sysfs */ -+ atomic_long_t read_realloc_races; -+ atomic_long_t extent_migrate_done; -+ atomic_long_t extent_migrate_raced; -+ -+ unsigned btree_gc_periodic:1; -+ unsigned copy_gc_enabled:1; -+ bool promote_whole_extents; -+ -+#define BCH_DEBUG_PARAM(name, description) bool name; -+ BCH_DEBUG_PARAMS_ALL() -+#undef BCH_DEBUG_PARAM -+ -+ struct time_stats times[BCH_TIME_STAT_NR]; -+}; -+ -+static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages) -+{ -+#ifndef NO_BCACHEFS_FS -+ if (c->vfs_sb) -+ c->vfs_sb->s_bdi->ra_pages = ra_pages; -+#endif -+} -+ -+static inline unsigned bucket_bytes(const struct bch_dev *ca) -+{ -+ return ca->mi.bucket_size << 9; -+} -+ -+static inline unsigned block_bytes(const struct bch_fs *c) -+{ -+ return c->opts.block_size << 9; -+} -+ -+static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, u64 time) -+{ -+ return ns_to_timespec64(time * c->sb.time_precision + c->sb.time_base_lo); -+} -+ -+static inline s64 timespec_to_bch2_time(struct bch_fs *c, struct timespec64 ts) -+{ -+ s64 ns = timespec64_to_ns(&ts) - c->sb.time_base_lo; -+ -+ if (c->sb.time_precision == 1) -+ return ns; -+ -+ return div_s64(ns, c->sb.time_precision); -+} -+ -+static inline s64 bch2_current_time(struct bch_fs *c) -+{ -+ struct timespec64 now; -+ -+ ktime_get_coarse_real_ts64(&now); -+ return timespec_to_bch2_time(c, now); -+} -+ -+static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev) 
-+{ -+ return dev < c->sb.nr_devices && c->devs[dev]; -+} -+ -+#endif /* _BCACHEFS_H */ -diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h -new file mode 100644 -index 000000000000..d5a2230e403c ---- /dev/null -+++ b/fs/bcachefs/bcachefs_format.h -@@ -0,0 +1,1671 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_FORMAT_H -+#define _BCACHEFS_FORMAT_H -+ -+/* -+ * bcachefs on disk data structures -+ * -+ * OVERVIEW: -+ * -+ * There are three main types of on disk data structures in bcachefs (this is -+ * reduced from 5 in bcache) -+ * -+ * - superblock -+ * - journal -+ * - btree -+ * -+ * The btree is the primary structure; most metadata exists as keys in the -+ * various btrees. There are only a small number of btrees, they're not -+ * sharded - we have one btree for extents, another for inodes, et cetera. -+ * -+ * SUPERBLOCK: -+ * -+ * The superblock contains the location of the journal, the list of devices in -+ * the filesystem, and in general any metadata we need in order to decide -+ * whether we can start a filesystem or prior to reading the journal/btree -+ * roots. -+ * -+ * The superblock is extensible, and most of the contents of the superblock are -+ * in variable length, type tagged fields; see struct bch_sb_field. -+ * -+ * Backup superblocks do not reside in a fixed location; also, superblocks do -+ * not have a fixed size. To locate backup superblocks we have struct -+ * bch_sb_layout; we store a copy of this inside every superblock, and also -+ * before the first superblock. -+ * -+ * JOURNAL: -+ * -+ * The journal primarily records btree updates in the order they occurred; -+ * journal replay consists of just iterating over all the keys in the open -+ * journal entries and re-inserting them into the btrees. -+ * -+ * The journal also contains entry types for the btree roots, and blacklisted -+ * journal sequence numbers (see journal_seq_blacklist.c). 
-+ * -+ * BTREE: -+ * -+ * bcachefs btrees are copy on write b+ trees, where nodes are big (typically -+ * 128k-256k) and log structured. We use struct btree_node for writing the first -+ * entry in a given node (offset 0), and struct btree_node_entry for all -+ * subsequent writes. -+ * -+ * After the header, btree node entries contain a list of keys in sorted order. -+ * Values are stored inline with the keys; since values are variable length (and -+ * keys effectively are variable length too, due to packing) we can't do random -+ * access without building up additional in memory tables in the btree node read -+ * path. -+ * -+ * BTREE KEYS (struct bkey): -+ * -+ * The various btrees share a common format for the key - so as to avoid -+ * switching in fastpath lookup/comparison code - but define their own -+ * structures for the key values. -+ * -+ * The size of a key/value pair is stored as a u8 in units of u64s, so the max -+ * size is just under 2k. The common part also contains a type tag for the -+ * value, and a format field indicating whether the key is packed or not (and -+ * also meant to allow adding new key fields in the future, if desired). -+ * -+ * bkeys, when stored within a btree node, may also be packed. In that case, the -+ * bkey_format in that node is used to unpack it. Packed bkeys mean that we can -+ * be generous with field sizes in the common part of the key format (64 bit -+ * inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost. 
-+ */ -+ -+#include -+#include -+#include -+#include -+ -+#define LE_BITMASK(_bits, name, type, field, offset, end) \ -+static const unsigned name##_OFFSET = offset; \ -+static const unsigned name##_BITS = (end - offset); \ -+static const __u##_bits name##_MAX = (1ULL << (end - offset)) - 1; \ -+ \ -+static inline __u64 name(const type *k) \ -+{ \ -+ return (__le##_bits##_to_cpu(k->field) >> offset) & \ -+ ~(~0ULL << (end - offset)); \ -+} \ -+ \ -+static inline void SET_##name(type *k, __u64 v) \ -+{ \ -+ __u##_bits new = __le##_bits##_to_cpu(k->field); \ -+ \ -+ new &= ~(~(~0ULL << (end - offset)) << offset); \ -+ new |= (v & ~(~0ULL << (end - offset))) << offset; \ -+ k->field = __cpu_to_le##_bits(new); \ -+} -+ -+#define LE16_BITMASK(n, t, f, o, e) LE_BITMASK(16, n, t, f, o, e) -+#define LE32_BITMASK(n, t, f, o, e) LE_BITMASK(32, n, t, f, o, e) -+#define LE64_BITMASK(n, t, f, o, e) LE_BITMASK(64, n, t, f, o, e) -+ -+struct bkey_format { -+ __u8 key_u64s; -+ __u8 nr_fields; -+ /* One unused slot for now: */ -+ __u8 bits_per_field[6]; -+ __le64 field_offset[6]; -+}; -+ -+/* Btree keys - all units are in sectors */ -+ -+struct bpos { -+ /* -+ * Word order matches machine byte order - btree code treats a bpos as a -+ * single large integer, for search/comparison purposes -+ * -+ * Note that wherever a bpos is embedded in another on disk data -+ * structure, it has to be byte swabbed when reading in metadata that -+ * wasn't written in native endian order: -+ */ -+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -+ __u32 snapshot; -+ __u64 offset; -+ __u64 inode; -+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -+ __u64 inode; -+ __u64 offset; /* Points to end of extent - sectors */ -+ __u32 snapshot; -+#else -+#error edit for your odd byteorder. 
-+#endif -+} __attribute__((packed, aligned(4))); -+ -+#define KEY_INODE_MAX ((__u64)~0ULL) -+#define KEY_OFFSET_MAX ((__u64)~0ULL) -+#define KEY_SNAPSHOT_MAX ((__u32)~0U) -+#define KEY_SIZE_MAX ((__u32)~0U) -+ -+static inline struct bpos POS(__u64 inode, __u64 offset) -+{ -+ struct bpos ret; -+ -+ ret.inode = inode; -+ ret.offset = offset; -+ ret.snapshot = 0; -+ -+ return ret; -+} -+ -+#define POS_MIN POS(0, 0) -+#define POS_MAX POS(KEY_INODE_MAX, KEY_OFFSET_MAX) -+ -+/* Empty placeholder struct, for container_of() */ -+struct bch_val { -+ __u64 __nothing[0]; -+}; -+ -+struct bversion { -+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -+ __u64 lo; -+ __u32 hi; -+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -+ __u32 hi; -+ __u64 lo; -+#endif -+} __attribute__((packed, aligned(4))); -+ -+struct bkey { -+ /* Size of combined key and value, in u64s */ -+ __u8 u64s; -+ -+ /* Format of key (0 for format local to btree node) */ -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u8 format:7, -+ needs_whiteout:1; -+#elif defined (__BIG_ENDIAN_BITFIELD) -+ __u8 needs_whiteout:1, -+ format:7; -+#else -+#error edit for your odd byteorder. 
-+#endif -+ -+ /* Type of the value */ -+ __u8 type; -+ -+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -+ __u8 pad[1]; -+ -+ struct bversion version; -+ __u32 size; /* extent size, in sectors */ -+ struct bpos p; -+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -+ struct bpos p; -+ __u32 size; /* extent size, in sectors */ -+ struct bversion version; -+ -+ __u8 pad[1]; -+#endif -+} __attribute__((packed, aligned(8))); -+ -+struct bkey_packed { -+ __u64 _data[0]; -+ -+ /* Size of combined key and value, in u64s */ -+ __u8 u64s; -+ -+ /* Format of key (0 for format local to btree node) */ -+ -+ /* -+ * XXX: next incompat on disk format change, switch format and -+ * needs_whiteout - bkey_packed() will be cheaper if format is the high -+ * bits of the bitfield -+ */ -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u8 format:7, -+ needs_whiteout:1; -+#elif defined (__BIG_ENDIAN_BITFIELD) -+ __u8 needs_whiteout:1, -+ format:7; -+#endif -+ -+ /* Type of the value */ -+ __u8 type; -+ __u8 key_start[0]; -+ -+ /* -+ * We copy bkeys with struct assignment in various places, and while -+ * that shouldn't be done with packed bkeys we can't disallow it in C, -+ * and it's legal to cast a bkey to a bkey_packed - so padding it out -+ * to the same size as struct bkey should hopefully be safest. 
-+ */ -+ __u8 pad[sizeof(struct bkey) - 3]; -+} __attribute__((packed, aligned(8))); -+ -+#define BKEY_U64s (sizeof(struct bkey) / sizeof(__u64)) -+#define BKEY_U64s_MAX U8_MAX -+#define BKEY_VAL_U64s_MAX (BKEY_U64s_MAX - BKEY_U64s) -+ -+#define KEY_PACKED_BITS_START 24 -+ -+#define KEY_FORMAT_LOCAL_BTREE 0 -+#define KEY_FORMAT_CURRENT 1 -+ -+enum bch_bkey_fields { -+ BKEY_FIELD_INODE, -+ BKEY_FIELD_OFFSET, -+ BKEY_FIELD_SNAPSHOT, -+ BKEY_FIELD_SIZE, -+ BKEY_FIELD_VERSION_HI, -+ BKEY_FIELD_VERSION_LO, -+ BKEY_NR_FIELDS, -+}; -+ -+#define bkey_format_field(name, field) \ -+ [BKEY_FIELD_##name] = (sizeof(((struct bkey *) NULL)->field) * 8) -+ -+#define BKEY_FORMAT_CURRENT \ -+((struct bkey_format) { \ -+ .key_u64s = BKEY_U64s, \ -+ .nr_fields = BKEY_NR_FIELDS, \ -+ .bits_per_field = { \ -+ bkey_format_field(INODE, p.inode), \ -+ bkey_format_field(OFFSET, p.offset), \ -+ bkey_format_field(SNAPSHOT, p.snapshot), \ -+ bkey_format_field(SIZE, size), \ -+ bkey_format_field(VERSION_HI, version.hi), \ -+ bkey_format_field(VERSION_LO, version.lo), \ -+ }, \ -+}) -+ -+/* bkey with inline value */ -+struct bkey_i { -+ __u64 _data[0]; -+ -+ union { -+ struct { -+ /* Size of combined key and value, in u64s */ -+ __u8 u64s; -+ }; -+ struct { -+ struct bkey k; -+ struct bch_val v; -+ }; -+ }; -+}; -+ -+#define KEY(_inode, _offset, _size) \ -+((struct bkey) { \ -+ .u64s = BKEY_U64s, \ -+ .format = KEY_FORMAT_CURRENT, \ -+ .p = POS(_inode, _offset), \ -+ .size = _size, \ -+}) -+ -+static inline void bkey_init(struct bkey *k) -+{ -+ *k = KEY(0, 0, 0); -+} -+ -+#define bkey_bytes(_k) ((_k)->u64s * sizeof(__u64)) -+ -+#define __BKEY_PADDED(key, pad) \ -+ struct { struct bkey_i key; __u64 key ## _pad[pad]; } -+ -+/* -+ * - DELETED keys are used internally to mark keys that should be ignored but -+ * override keys in composition order. Their version number is ignored. -+ * -+ * - DISCARDED keys indicate that the data is all 0s because it has been -+ * discarded. 
DISCARDs may have a version; if the version is nonzero the key -+ * will be persistent, otherwise the key will be dropped whenever the btree -+ * node is rewritten (like DELETED keys). -+ * -+ * - ERROR: any read of the data returns a read error, as the data was lost due -+ * to a failing device. Like DISCARDED keys, they can be removed (overridden) -+ * by new writes or cluster-wide GC. Node repair can also overwrite them with -+ * the same or a more recent version number, but not with an older version -+ * number. -+ * -+ * - WHITEOUT: for hash table btrees -+*/ -+#define BCH_BKEY_TYPES() \ -+ x(deleted, 0) \ -+ x(discard, 1) \ -+ x(error, 2) \ -+ x(cookie, 3) \ -+ x(whiteout, 4) \ -+ x(btree_ptr, 5) \ -+ x(extent, 6) \ -+ x(reservation, 7) \ -+ x(inode, 8) \ -+ x(inode_generation, 9) \ -+ x(dirent, 10) \ -+ x(xattr, 11) \ -+ x(alloc, 12) \ -+ x(quota, 13) \ -+ x(stripe, 14) \ -+ x(reflink_p, 15) \ -+ x(reflink_v, 16) \ -+ x(inline_data, 17) \ -+ x(btree_ptr_v2, 18) -+ -+enum bch_bkey_type { -+#define x(name, nr) KEY_TYPE_##name = nr, -+ BCH_BKEY_TYPES() -+#undef x -+ KEY_TYPE_MAX, -+}; -+ -+struct bch_cookie { -+ struct bch_val v; -+ __le64 cookie; -+}; -+ -+/* Extents */ -+ -+/* -+ * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally -+ * preceded by checksum/compression information (bch_extent_crc32 or -+ * bch_extent_crc64). -+ * -+ * One major determining factor in the format of extents is how we handle and -+ * represent extents that have been partially overwritten and thus trimmed: -+ * -+ * If an extent is not checksummed or compressed, when the extent is trimmed we -+ * don't have to remember the extent we originally allocated and wrote: we can -+ * merely adjust ptr->offset to point to the start of the data that is currently -+ * live. The size field in struct bkey records the current (live) size of the -+ * extent, and is also used to mean "size of region on disk that we point to" in -+ * this case. 
-+ * -+ * Thus an extent that is not checksummed or compressed will consist only of a -+ * list of bch_extent_ptrs, with none of the fields in -+ * bch_extent_crc32/bch_extent_crc64. -+ * -+ * When an extent is checksummed or compressed, it's not possible to read only -+ * the data that is currently live: we have to read the entire extent that was -+ * originally written, and then return only the part of the extent that is -+ * currently live. -+ * -+ * Thus, in addition to the current size of the extent in struct bkey, we need -+ * to store the size of the originally allocated space - this is the -+ * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also, -+ * when the extent is trimmed, instead of modifying the offset field of the -+ * pointer, we keep a second smaller offset field - "offset into the original -+ * extent of the currently live region". -+ * -+ * The other major determining factor is replication and data migration: -+ * -+ * Each pointer may have its own bch_extent_crc32/64. When doing a replicated -+ * write, we will initially write all the replicas in the same format, with the -+ * same checksum type and compression format - however, when copygc runs later (or -+ * tiering/cache promotion, anything that moves data), it is not in general -+ * going to rewrite all the pointers at once - one of the replicas may be in a -+ * bucket on one device that has very little fragmentation while another lives -+ * in a bucket that has become heavily fragmented, and thus is being rewritten -+ * sooner than the rest. -+ * -+ * Thus it will only move a subset of the pointers (or in the case of -+ * tiering/cache promotion perhaps add a single pointer without dropping any -+ * current pointers), and if the extent has been partially overwritten it must -+ * write only the currently live portion (or copygc would not be able to reduce -+ * fragmentation!) - which necessitates a different bch_extent_crc format for -+ * the new pointer. 
-+ * -+ * But in the interests of space efficiency, we don't want to store one -+ * bch_extent_crc for each pointer if we don't have to. -+ * -+ * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and -+ * bch_extent_ptrs appended arbitrarily one after the other. We determine the -+ * type of a given entry with a scheme similar to utf8 (except we're encoding a -+ * type, not a size), encoding the type in the position of the first set bit: -+ * -+ * bch_extent_crc32 - 0b1 -+ * bch_extent_ptr - 0b10 -+ * bch_extent_crc64 - 0b100 -+ * -+ * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and -+ * bch_extent_crc64 is the least constrained). -+ * -+ * Then, each bch_extent_crc32/64 applies to the pointers that follow after it, -+ * until the next bch_extent_crc32/64. -+ * -+ * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer -+ * is neither checksummed nor compressed. -+ */ -+ -+/* 128 bits, sufficient for cryptographic MACs: */ -+struct bch_csum { -+ __le64 lo; -+ __le64 hi; -+} __attribute__((packed, aligned(8))); -+ -+#define BCH_EXTENT_ENTRY_TYPES() \ -+ x(ptr, 0) \ -+ x(crc32, 1) \ -+ x(crc64, 2) \ -+ x(crc128, 3) \ -+ x(stripe_ptr, 4) -+#define BCH_EXTENT_ENTRY_MAX 5 -+ -+enum bch_extent_entry_type { -+#define x(f, n) BCH_EXTENT_ENTRY_##f = n, -+ BCH_EXTENT_ENTRY_TYPES() -+#undef x -+}; -+ -+/* Compressed/uncompressed size are stored biased by 1: */ -+struct bch_extent_crc32 { -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u32 type:2, -+ _compressed_size:7, -+ _uncompressed_size:7, -+ offset:7, -+ _unused:1, -+ csum_type:4, -+ compression_type:4; -+ __u32 csum; -+#elif defined (__BIG_ENDIAN_BITFIELD) -+ __u32 csum; -+ __u32 compression_type:4, -+ csum_type:4, -+ _unused:1, -+ offset:7, -+ _uncompressed_size:7, -+ _compressed_size:7, -+ type:2; -+#endif -+} __attribute__((packed, aligned(8))); -+ -+#define CRC32_SIZE_MAX (1U << 7) -+#define CRC32_NONCE_MAX 0 -+ -+struct bch_extent_crc64 { -+#if 
defined(__LITTLE_ENDIAN_BITFIELD) -+ __u64 type:3, -+ _compressed_size:9, -+ _uncompressed_size:9, -+ offset:9, -+ nonce:10, -+ csum_type:4, -+ compression_type:4, -+ csum_hi:16; -+#elif defined (__BIG_ENDIAN_BITFIELD) -+ __u64 csum_hi:16, -+ compression_type:4, -+ csum_type:4, -+ nonce:10, -+ offset:9, -+ _uncompressed_size:9, -+ _compressed_size:9, -+ type:3; -+#endif -+ __u64 csum_lo; -+} __attribute__((packed, aligned(8))); -+ -+#define CRC64_SIZE_MAX (1U << 9) -+#define CRC64_NONCE_MAX ((1U << 10) - 1) -+ -+struct bch_extent_crc128 { -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u64 type:4, -+ _compressed_size:13, -+ _uncompressed_size:13, -+ offset:13, -+ nonce:13, -+ csum_type:4, -+ compression_type:4; -+#elif defined (__BIG_ENDIAN_BITFIELD) -+ __u64 compression_type:4, -+ csum_type:4, -+ nonce:13, -+ offset:13, -+ _uncompressed_size:13, -+ _compressed_size:13, -+ type:4; -+#endif -+ struct bch_csum csum; -+} __attribute__((packed, aligned(8))); -+ -+#define CRC128_SIZE_MAX (1U << 13) -+#define CRC128_NONCE_MAX ((1U << 13) - 1) -+ -+/* -+ * @reservation - pointer hasn't been written to, just reserved -+ */ -+struct bch_extent_ptr { -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u64 type:1, -+ cached:1, -+ unused:1, -+ reservation:1, -+ offset:44, /* 8 petabytes */ -+ dev:8, -+ gen:8; -+#elif defined (__BIG_ENDIAN_BITFIELD) -+ __u64 gen:8, -+ dev:8, -+ offset:44, -+ reservation:1, -+ unused:1, -+ cached:1, -+ type:1; -+#endif -+} __attribute__((packed, aligned(8))); -+ -+struct bch_extent_stripe_ptr { -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u64 type:5, -+ block:8, -+ idx:51; -+#elif defined (__BIG_ENDIAN_BITFIELD) -+ __u64 idx:51, -+ block:8, -+ type:5; -+#endif -+}; -+ -+struct bch_extent_reservation { -+#if defined(__LITTLE_ENDIAN_BITFIELD) -+ __u64 type:6, -+ unused:22, -+ replicas:4, -+ generation:32; -+#elif defined (__BIG_ENDIAN_BITFIELD) -+ __u64 generation:32, -+ replicas:4, -+ unused:22, -+ type:6; -+#endif -+}; -+ -+union bch_extent_entry { -+#if 
__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64 -+ unsigned long type; -+#elif __BITS_PER_LONG == 32 -+ struct { -+ unsigned long pad; -+ unsigned long type; -+ }; -+#else -+#error edit for your odd byteorder. -+#endif -+ -+#define x(f, n) struct bch_extent_##f f; -+ BCH_EXTENT_ENTRY_TYPES() -+#undef x -+}; -+ -+struct bch_btree_ptr { -+ struct bch_val v; -+ -+ struct bch_extent_ptr start[0]; -+ __u64 _data[0]; -+} __attribute__((packed, aligned(8))); -+ -+struct bch_btree_ptr_v2 { -+ struct bch_val v; -+ -+ __u64 mem_ptr; -+ __le64 seq; -+ __le16 sectors_written; -+ /* In case we ever decide to do variable size btree nodes: */ -+ __le16 sectors; -+ struct bpos min_key; -+ struct bch_extent_ptr start[0]; -+ __u64 _data[0]; -+} __attribute__((packed, aligned(8))); -+ -+struct bch_extent { -+ struct bch_val v; -+ -+ union bch_extent_entry start[0]; -+ __u64 _data[0]; -+} __attribute__((packed, aligned(8))); -+ -+struct bch_reservation { -+ struct bch_val v; -+ -+ __le32 generation; -+ __u8 nr_replicas; -+ __u8 pad[3]; -+} __attribute__((packed, aligned(8))); -+ -+/* Maximum size (in u64s) a single pointer could be: */ -+#define BKEY_EXTENT_PTR_U64s_MAX\ -+ ((sizeof(struct bch_extent_crc128) + \ -+ sizeof(struct bch_extent_ptr)) / sizeof(u64)) -+ -+/* Maximum possible size of an entire extent value: */ -+#define BKEY_EXTENT_VAL_U64s_MAX \ -+ (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) -+ -+#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX) -+ -+/* * Maximum possible size of an entire extent, key + value: */ -+#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX) -+ -+/* Btree pointers don't carry around checksums: */ -+#define BKEY_BTREE_PTR_VAL_U64s_MAX \ -+ ((sizeof(struct bch_btree_ptr_v2) + \ -+ sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(u64)) -+#define BKEY_BTREE_PTR_U64s_MAX \ -+ (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) -+ -+/* Inodes */ -+ -+#define BLOCKDEV_INODE_MAX 4096 -+ 
-+#define BCACHEFS_ROOT_INO 4096 -+ -+struct bch_inode { -+ struct bch_val v; -+ -+ __le64 bi_hash_seed; -+ __le32 bi_flags; -+ __le16 bi_mode; -+ __u8 fields[0]; -+} __attribute__((packed, aligned(8))); -+ -+struct bch_inode_generation { -+ struct bch_val v; -+ -+ __le32 bi_generation; -+ __le32 pad; -+} __attribute__((packed, aligned(8))); -+ -+#define BCH_INODE_FIELDS() \ -+ x(bi_atime, 64) \ -+ x(bi_ctime, 64) \ -+ x(bi_mtime, 64) \ -+ x(bi_otime, 64) \ -+ x(bi_size, 64) \ -+ x(bi_sectors, 64) \ -+ x(bi_uid, 32) \ -+ x(bi_gid, 32) \ -+ x(bi_nlink, 32) \ -+ x(bi_generation, 32) \ -+ x(bi_dev, 32) \ -+ x(bi_data_checksum, 8) \ -+ x(bi_compression, 8) \ -+ x(bi_project, 32) \ -+ x(bi_background_compression, 8) \ -+ x(bi_data_replicas, 8) \ -+ x(bi_promote_target, 16) \ -+ x(bi_foreground_target, 16) \ -+ x(bi_background_target, 16) \ -+ x(bi_erasure_code, 16) \ -+ x(bi_fields_set, 16) -+ -+/* subset of BCH_INODE_FIELDS */ -+#define BCH_INODE_OPTS() \ -+ x(data_checksum, 8) \ -+ x(compression, 8) \ -+ x(project, 32) \ -+ x(background_compression, 8) \ -+ x(data_replicas, 8) \ -+ x(promote_target, 16) \ -+ x(foreground_target, 16) \ -+ x(background_target, 16) \ -+ x(erasure_code, 16) -+ -+enum inode_opt_id { -+#define x(name, ...) 
\ -+ Inode_opt_##name, -+ BCH_INODE_OPTS() -+#undef x -+ Inode_opt_nr, -+}; -+ -+enum { -+ /* -+ * User flags (get/settable with FS_IOC_*FLAGS, correspond to FS_*_FL -+ * flags) -+ */ -+ __BCH_INODE_SYNC = 0, -+ __BCH_INODE_IMMUTABLE = 1, -+ __BCH_INODE_APPEND = 2, -+ __BCH_INODE_NODUMP = 3, -+ __BCH_INODE_NOATIME = 4, -+ -+ __BCH_INODE_I_SIZE_DIRTY= 5, -+ __BCH_INODE_I_SECTORS_DIRTY= 6, -+ __BCH_INODE_UNLINKED = 7, -+ -+ /* bits 20+ reserved for packed fields below: */ -+}; -+ -+#define BCH_INODE_SYNC (1 << __BCH_INODE_SYNC) -+#define BCH_INODE_IMMUTABLE (1 << __BCH_INODE_IMMUTABLE) -+#define BCH_INODE_APPEND (1 << __BCH_INODE_APPEND) -+#define BCH_INODE_NODUMP (1 << __BCH_INODE_NODUMP) -+#define BCH_INODE_NOATIME (1 << __BCH_INODE_NOATIME) -+#define BCH_INODE_I_SIZE_DIRTY (1 << __BCH_INODE_I_SIZE_DIRTY) -+#define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY) -+#define BCH_INODE_UNLINKED (1 << __BCH_INODE_UNLINKED) -+ -+LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); -+LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 32); -+ -+/* Dirents */ -+ -+/* -+ * Dirents (and xattrs) have to implement string lookups; since our b-tree -+ * doesn't support arbitrary length strings for the key, we instead index by a -+ * 64 bit hash (currently truncated sha1) of the string, stored in the offset -+ * field of the key - using linear probing to resolve hash collisions. This also -+ * provides us with the readdir cookie posix requires. 
-+ * -+ * Linear probing requires us to use whiteouts for deletions, in the event of a -+ * collision: -+ */ -+ -+struct bch_dirent { -+ struct bch_val v; -+ -+ /* Target inode number: */ -+ __le64 d_inum; -+ -+ /* -+ * Copy of mode bits 12-15 from the target inode - so userspace can get -+ * the filetype without having to do a stat() -+ */ -+ __u8 d_type; -+ -+ __u8 d_name[]; -+} __attribute__((packed, aligned(8))); -+ -+#define BCH_NAME_MAX (U8_MAX * sizeof(u64) - \ -+ sizeof(struct bkey) - \ -+ offsetof(struct bch_dirent, d_name)) -+ -+ -+/* Xattrs */ -+ -+#define KEY_TYPE_XATTR_INDEX_USER 0 -+#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1 -+#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2 -+#define KEY_TYPE_XATTR_INDEX_TRUSTED 3 -+#define KEY_TYPE_XATTR_INDEX_SECURITY 4 -+ -+struct bch_xattr { -+ struct bch_val v; -+ __u8 x_type; -+ __u8 x_name_len; -+ __le16 x_val_len; -+ __u8 x_name[]; -+} __attribute__((packed, aligned(8))); -+ -+/* Bucket/allocation information: */ -+ -+struct bch_alloc { -+ struct bch_val v; -+ __u8 fields; -+ __u8 gen; -+ __u8 data[]; -+} __attribute__((packed, aligned(8))); -+ -+#define BCH_ALLOC_FIELDS() \ -+ x(read_time, 16) \ -+ x(write_time, 16) \ -+ x(data_type, 8) \ -+ x(dirty_sectors, 16) \ -+ x(cached_sectors, 16) \ -+ x(oldest_gen, 8) -+ -+enum { -+#define x(name, bytes) BCH_ALLOC_FIELD_##name, -+ BCH_ALLOC_FIELDS() -+#undef x -+ BCH_ALLOC_FIELD_NR -+}; -+ -+static const unsigned BCH_ALLOC_FIELD_BYTES[] = { -+#define x(name, bits) [BCH_ALLOC_FIELD_##name] = bits / 8, -+ BCH_ALLOC_FIELDS() -+#undef x -+}; -+ -+#define x(name, bits) + (bits / 8) -+static const unsigned BKEY_ALLOC_VAL_U64s_MAX = -+ DIV_ROUND_UP(offsetof(struct bch_alloc, data) -+ BCH_ALLOC_FIELDS(), sizeof(u64)); -+#undef x -+ -+#define BKEY_ALLOC_U64s_MAX (BKEY_U64s + BKEY_ALLOC_VAL_U64s_MAX) -+ -+/* Quotas: */ -+ -+enum quota_types { -+ QTYP_USR = 0, -+ QTYP_GRP = 1, -+ QTYP_PRJ = 2, -+ QTYP_NR = 3, -+}; -+ -+enum quota_counters { -+ Q_SPC = 0, -+ Q_INO = 1, -+ 
Q_COUNTERS = 2, -+}; -+ -+struct bch_quota_counter { -+ __le64 hardlimit; -+ __le64 softlimit; -+}; -+ -+struct bch_quota { -+ struct bch_val v; -+ struct bch_quota_counter c[Q_COUNTERS]; -+} __attribute__((packed, aligned(8))); -+ -+/* Erasure coding */ -+ -+struct bch_stripe { -+ struct bch_val v; -+ __le16 sectors; -+ __u8 algorithm; -+ __u8 nr_blocks; -+ __u8 nr_redundant; -+ -+ __u8 csum_granularity_bits; -+ __u8 csum_type; -+ __u8 pad; -+ -+ struct bch_extent_ptr ptrs[0]; -+} __attribute__((packed, aligned(8))); -+ -+/* Reflink: */ -+ -+struct bch_reflink_p { -+ struct bch_val v; -+ __le64 idx; -+ -+ __le32 reservation_generation; -+ __u8 nr_replicas; -+ __u8 pad[3]; -+}; -+ -+struct bch_reflink_v { -+ struct bch_val v; -+ __le64 refcount; -+ union bch_extent_entry start[0]; -+ __u64 _data[0]; -+}; -+ -+/* Inline data */ -+ -+struct bch_inline_data { -+ struct bch_val v; -+ u8 data[0]; -+}; -+ -+/* Optional/variable size superblock sections: */ -+ -+struct bch_sb_field { -+ __u64 _data[0]; -+ __le32 u64s; -+ __le32 type; -+}; -+ -+#define BCH_SB_FIELDS() \ -+ x(journal, 0) \ -+ x(members, 1) \ -+ x(crypt, 2) \ -+ x(replicas_v0, 3) \ -+ x(quota, 4) \ -+ x(disk_groups, 5) \ -+ x(clean, 6) \ -+ x(replicas, 7) \ -+ x(journal_seq_blacklist, 8) -+ -+enum bch_sb_field_type { -+#define x(f, nr) BCH_SB_FIELD_##f = nr, -+ BCH_SB_FIELDS() -+#undef x -+ BCH_SB_FIELD_NR -+}; -+ -+/* BCH_SB_FIELD_journal: */ -+ -+struct bch_sb_field_journal { -+ struct bch_sb_field field; -+ __le64 buckets[0]; -+}; -+ -+/* BCH_SB_FIELD_members: */ -+ -+#define BCH_MIN_NR_NBUCKETS (1 << 6) -+ -+struct bch_member { -+ uuid_le uuid; -+ __le64 nbuckets; /* device size */ -+ __le16 first_bucket; /* index of first bucket used */ -+ __le16 bucket_size; /* sectors */ -+ __le32 pad; -+ __le64 last_mount; /* time_t */ -+ -+ __le64 flags[2]; -+}; -+ -+LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags[0], 0, 4) -+/* 4-10 unused, was TIER, HAS_(META)DATA */ -+LE64_BITMASK(BCH_MEMBER_REPLACEMENT, 
struct bch_member, flags[0], 10, 14) -+LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15) -+LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20) -+LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28) -+LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags[0], 28, 30) -+ -+#define BCH_TIER_MAX 4U -+ -+#if 0 -+LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); -+LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40); -+#endif -+ -+enum bch_member_state { -+ BCH_MEMBER_STATE_RW = 0, -+ BCH_MEMBER_STATE_RO = 1, -+ BCH_MEMBER_STATE_FAILED = 2, -+ BCH_MEMBER_STATE_SPARE = 3, -+ BCH_MEMBER_STATE_NR = 4, -+}; -+ -+enum cache_replacement { -+ CACHE_REPLACEMENT_LRU = 0, -+ CACHE_REPLACEMENT_FIFO = 1, -+ CACHE_REPLACEMENT_RANDOM = 2, -+ CACHE_REPLACEMENT_NR = 3, -+}; -+ -+struct bch_sb_field_members { -+ struct bch_sb_field field; -+ struct bch_member members[0]; -+}; -+ -+/* BCH_SB_FIELD_crypt: */ -+ -+struct nonce { -+ __le32 d[4]; -+}; -+ -+struct bch_key { -+ __le64 key[4]; -+}; -+ -+#define BCH_KEY_MAGIC \ -+ (((u64) 'b' << 0)|((u64) 'c' << 8)| \ -+ ((u64) 'h' << 16)|((u64) '*' << 24)| \ -+ ((u64) '*' << 32)|((u64) 'k' << 40)| \ -+ ((u64) 'e' << 48)|((u64) 'y' << 56)) -+ -+struct bch_encrypted_key { -+ __le64 magic; -+ struct bch_key key; -+}; -+ -+/* -+ * If this field is present in the superblock, it stores an encryption key which -+ * is used encrypt all other data/metadata. The key will normally be encrypted -+ * with the key userspace provides, but if encryption has been turned off we'll -+ * just store the master key unencrypted in the superblock so we can access the -+ * previously encrypted data. 
-+ */ -+struct bch_sb_field_crypt { -+ struct bch_sb_field field; -+ -+ __le64 flags; -+ __le64 kdf_flags; -+ struct bch_encrypted_key key; -+}; -+ -+LE64_BITMASK(BCH_CRYPT_KDF_TYPE, struct bch_sb_field_crypt, flags, 0, 4); -+ -+enum bch_kdf_types { -+ BCH_KDF_SCRYPT = 0, -+ BCH_KDF_NR = 1, -+}; -+ -+/* stored as base 2 log of scrypt params: */ -+LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16); -+LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32); -+LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48); -+ -+/* BCH_SB_FIELD_replicas: */ -+ -+#define BCH_DATA_TYPES() \ -+ x(none, 0) \ -+ x(sb, 1) \ -+ x(journal, 2) \ -+ x(btree, 3) \ -+ x(user, 4) \ -+ x(cached, 5) -+ -+enum bch_data_type { -+#define x(t, n) BCH_DATA_##t, -+ BCH_DATA_TYPES() -+#undef x -+ BCH_DATA_NR -+}; -+ -+struct bch_replicas_entry_v0 { -+ __u8 data_type; -+ __u8 nr_devs; -+ __u8 devs[0]; -+} __attribute__((packed)); -+ -+struct bch_sb_field_replicas_v0 { -+ struct bch_sb_field field; -+ struct bch_replicas_entry_v0 entries[0]; -+} __attribute__((packed, aligned(8))); -+ -+struct bch_replicas_entry { -+ __u8 data_type; -+ __u8 nr_devs; -+ __u8 nr_required; -+ __u8 devs[0]; -+} __attribute__((packed)); -+ -+#define replicas_entry_bytes(_i) \ -+ (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs) -+ -+struct bch_sb_field_replicas { -+ struct bch_sb_field field; -+ struct bch_replicas_entry entries[0]; -+} __attribute__((packed, aligned(8))); -+ -+/* BCH_SB_FIELD_quota: */ -+ -+struct bch_sb_quota_counter { -+ __le32 timelimit; -+ __le32 warnlimit; -+}; -+ -+struct bch_sb_quota_type { -+ __le64 flags; -+ struct bch_sb_quota_counter c[Q_COUNTERS]; -+}; -+ -+struct bch_sb_field_quota { -+ struct bch_sb_field field; -+ struct bch_sb_quota_type q[QTYP_NR]; -+} __attribute__((packed, aligned(8))); -+ -+/* BCH_SB_FIELD_disk_groups: */ -+ -+#define BCH_SB_LABEL_SIZE 32 -+ -+struct bch_disk_group { -+ __u8 label[BCH_SB_LABEL_SIZE]; -+ 
__le64 flags[2]; -+} __attribute__((packed, aligned(8))); -+ -+LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1) -+LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6) -+LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24) -+ -+struct bch_sb_field_disk_groups { -+ struct bch_sb_field field; -+ struct bch_disk_group entries[0]; -+} __attribute__((packed, aligned(8))); -+ -+/* -+ * On clean shutdown, store btree roots and current journal sequence number in -+ * the superblock: -+ */ -+struct jset_entry { -+ __le16 u64s; -+ __u8 btree_id; -+ __u8 level; -+ __u8 type; /* designates what this jset holds */ -+ __u8 pad[3]; -+ -+ union { -+ struct bkey_i start[0]; -+ __u64 _data[0]; -+ }; -+}; -+ -+struct bch_sb_field_clean { -+ struct bch_sb_field field; -+ -+ __le32 flags; -+ __le16 read_clock; -+ __le16 write_clock; -+ __le64 journal_seq; -+ -+ union { -+ struct jset_entry start[0]; -+ __u64 _data[0]; -+ }; -+}; -+ -+struct journal_seq_blacklist_entry { -+ __le64 start; -+ __le64 end; -+}; -+ -+struct bch_sb_field_journal_seq_blacklist { -+ struct bch_sb_field field; -+ -+ union { -+ struct journal_seq_blacklist_entry start[0]; -+ __u64 _data[0]; -+ }; -+}; -+ -+/* Superblock: */ -+ -+/* -+ * New versioning scheme: -+ * One common version number for all on disk data structures - superblock, btree -+ * nodes, journal entries -+ */ -+#define BCH_JSET_VERSION_OLD 2 -+#define BCH_BSET_VERSION_OLD 3 -+ -+enum bcachefs_metadata_version { -+ bcachefs_metadata_version_min = 9, -+ bcachefs_metadata_version_new_versioning = 10, -+ bcachefs_metadata_version_bkey_renumber = 10, -+ bcachefs_metadata_version_inode_btree_change = 11, -+ bcachefs_metadata_version_max = 12, -+}; -+ -+#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) -+ -+#define BCH_SB_SECTOR 8 -+#define BCH_SB_MEMBERS_MAX 64 /* XXX kill */ -+ -+struct bch_sb_layout { -+ uuid_le magic; /* bcachefs superblock UUID */ -+ __u8 
layout_type; -+ __u8 sb_max_size_bits; /* base 2 of 512 byte sectors */ -+ __u8 nr_superblocks; -+ __u8 pad[5]; -+ __le64 sb_offset[61]; -+} __attribute__((packed, aligned(8))); -+ -+#define BCH_SB_LAYOUT_SECTOR 7 -+ -+/* -+ * @offset - sector where this sb was written -+ * @version - on disk format version -+ * @version_min - Oldest metadata version this filesystem contains; so we can -+ * safely drop compatibility code and refuse to mount filesystems -+ * we'd need it for -+ * @magic - identifies as a bcachefs superblock (BCACHE_MAGIC) -+ * @seq - incremented each time superblock is written -+ * @uuid - used for generating various magic numbers and identifying -+ * member devices, never changes -+ * @user_uuid - user visible UUID, may be changed -+ * @label - filesystem label -+ * @seq - identifies most recent superblock, incremented each time -+ * superblock is written -+ * @features - enabled incompatible features -+ */ -+struct bch_sb { -+ struct bch_csum csum; -+ __le16 version; -+ __le16 version_min; -+ __le16 pad[2]; -+ uuid_le magic; -+ uuid_le uuid; -+ uuid_le user_uuid; -+ __u8 label[BCH_SB_LABEL_SIZE]; -+ __le64 offset; -+ __le64 seq; -+ -+ __le16 block_size; -+ __u8 dev_idx; -+ __u8 nr_devices; -+ __le32 u64s; -+ -+ __le64 time_base_lo; -+ __le32 time_base_hi; -+ __le32 time_precision; -+ -+ __le64 flags[8]; -+ __le64 features[2]; -+ __le64 compat[2]; -+ -+ struct bch_sb_layout layout; -+ -+ union { -+ struct bch_sb_field start[0]; -+ __le64 _data[0]; -+ }; -+} __attribute__((packed, aligned(8))); -+ -+/* -+ * Flags: -+ * BCH_SB_INITALIZED - set on first mount -+ * BCH_SB_CLEAN - did we shut down cleanly? Just a hint, doesn't affect -+ * behaviour of mount/recovery path: -+ * BCH_SB_INODE_32BIT - limit inode numbers to 32 bits -+ * BCH_SB_128_BIT_MACS - 128 bit macs instead of 80 -+ * BCH_SB_ENCRYPTION_TYPE - if nonzero encryption is enabled; overrides -+ * DATA/META_CSUM_TYPE. 
Also indicates encryption -+ * algorithm in use, if/when we get more than one -+ */ -+ -+LE16_BITMASK(BCH_SB_BLOCK_SIZE, struct bch_sb, block_size, 0, 16); -+ -+LE64_BITMASK(BCH_SB_INITIALIZED, struct bch_sb, flags[0], 0, 1); -+LE64_BITMASK(BCH_SB_CLEAN, struct bch_sb, flags[0], 1, 2); -+LE64_BITMASK(BCH_SB_CSUM_TYPE, struct bch_sb, flags[0], 2, 8); -+LE64_BITMASK(BCH_SB_ERROR_ACTION, struct bch_sb, flags[0], 8, 12); -+ -+LE64_BITMASK(BCH_SB_BTREE_NODE_SIZE, struct bch_sb, flags[0], 12, 28); -+ -+LE64_BITMASK(BCH_SB_GC_RESERVE, struct bch_sb, flags[0], 28, 33); -+LE64_BITMASK(BCH_SB_ROOT_RESERVE, struct bch_sb, flags[0], 33, 40); -+ -+LE64_BITMASK(BCH_SB_META_CSUM_TYPE, struct bch_sb, flags[0], 40, 44); -+LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE, struct bch_sb, flags[0], 44, 48); -+ -+LE64_BITMASK(BCH_SB_META_REPLICAS_WANT, struct bch_sb, flags[0], 48, 52); -+LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT, struct bch_sb, flags[0], 52, 56); -+ -+LE64_BITMASK(BCH_SB_POSIX_ACL, struct bch_sb, flags[0], 56, 57); -+LE64_BITMASK(BCH_SB_USRQUOTA, struct bch_sb, flags[0], 57, 58); -+LE64_BITMASK(BCH_SB_GRPQUOTA, struct bch_sb, flags[0], 58, 59); -+LE64_BITMASK(BCH_SB_PRJQUOTA, struct bch_sb, flags[0], 59, 60); -+ -+LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61); -+ -+LE64_BITMASK(BCH_SB_REFLINK, struct bch_sb, flags[0], 61, 62); -+ -+/* 61-64 unused */ -+ -+LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4); -+LE64_BITMASK(BCH_SB_COMPRESSION_TYPE, struct bch_sb, flags[1], 4, 8); -+LE64_BITMASK(BCH_SB_INODE_32BIT, struct bch_sb, flags[1], 8, 9); -+ -+LE64_BITMASK(BCH_SB_128_BIT_MACS, struct bch_sb, flags[1], 9, 10); -+LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE, struct bch_sb, flags[1], 10, 14); -+ -+/* -+ * Max size of an extent that may require bouncing to read or write -+ * (checksummed, compressed): 64k -+ */ -+LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS, -+ struct bch_sb, flags[1], 14, 20); -+ -+LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 
20, 24); -+LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28); -+ -+LE64_BITMASK(BCH_SB_PROMOTE_TARGET, struct bch_sb, flags[1], 28, 40); -+LE64_BITMASK(BCH_SB_FOREGROUND_TARGET, struct bch_sb, flags[1], 40, 52); -+LE64_BITMASK(BCH_SB_BACKGROUND_TARGET, struct bch_sb, flags[1], 52, 64); -+ -+LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE, -+ struct bch_sb, flags[2], 0, 4); -+LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64); -+ -+LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); -+ -+/* -+ * Features: -+ * -+ * journal_seq_blacklist_v3: gates BCH_SB_FIELD_journal_seq_blacklist -+ * reflink: gates KEY_TYPE_reflink -+ * inline_data: gates KEY_TYPE_inline_data -+ * new_siphash: gates BCH_STR_HASH_SIPHASH -+ * new_extent_overwrite: gates BTREE_NODE_NEW_EXTENT_OVERWRITE -+ */ -+#define BCH_SB_FEATURES() \ -+ x(lz4, 0) \ -+ x(gzip, 1) \ -+ x(zstd, 2) \ -+ x(atomic_nlink, 3) \ -+ x(ec, 4) \ -+ x(journal_seq_blacklist_v3, 5) \ -+ x(reflink, 6) \ -+ x(new_siphash, 7) \ -+ x(inline_data, 8) \ -+ x(new_extent_overwrite, 9) \ -+ x(incompressible, 10) \ -+ x(btree_ptr_v2, 11) \ -+ x(extents_above_btree_updates, 12) \ -+ x(btree_updates_journalled, 13) -+ -+#define BCH_SB_FEATURES_ALL \ -+ ((1ULL << BCH_FEATURE_new_siphash)| \ -+ (1ULL << BCH_FEATURE_new_extent_overwrite)| \ -+ (1ULL << BCH_FEATURE_btree_ptr_v2)| \ -+ (1ULL << BCH_FEATURE_extents_above_btree_updates)) -+ -+enum bch_sb_feature { -+#define x(f, n) BCH_FEATURE_##f, -+ BCH_SB_FEATURES() -+#undef x -+ BCH_FEATURE_NR, -+}; -+ -+enum bch_sb_compat { -+ BCH_COMPAT_FEAT_ALLOC_INFO = 0, -+ BCH_COMPAT_FEAT_ALLOC_METADATA = 1, -+}; -+ -+/* options: */ -+ -+#define BCH_REPLICAS_MAX 4U -+ -+enum bch_error_actions { -+ BCH_ON_ERROR_CONTINUE = 0, -+ BCH_ON_ERROR_RO = 1, -+ BCH_ON_ERROR_PANIC = 2, -+ BCH_NR_ERROR_ACTIONS = 3, -+}; -+ -+enum bch_str_hash_type { -+ BCH_STR_HASH_CRC32C = 0, -+ BCH_STR_HASH_CRC64 = 1, -+ BCH_STR_HASH_SIPHASH_OLD = 2, -+ BCH_STR_HASH_SIPHASH 
= 3, -+ BCH_STR_HASH_NR = 4, -+}; -+ -+enum bch_str_hash_opts { -+ BCH_STR_HASH_OPT_CRC32C = 0, -+ BCH_STR_HASH_OPT_CRC64 = 1, -+ BCH_STR_HASH_OPT_SIPHASH = 2, -+ BCH_STR_HASH_OPT_NR = 3, -+}; -+ -+enum bch_csum_type { -+ BCH_CSUM_NONE = 0, -+ BCH_CSUM_CRC32C_NONZERO = 1, -+ BCH_CSUM_CRC64_NONZERO = 2, -+ BCH_CSUM_CHACHA20_POLY1305_80 = 3, -+ BCH_CSUM_CHACHA20_POLY1305_128 = 4, -+ BCH_CSUM_CRC32C = 5, -+ BCH_CSUM_CRC64 = 6, -+ BCH_CSUM_NR = 7, -+}; -+ -+static const unsigned bch_crc_bytes[] = { -+ [BCH_CSUM_NONE] = 0, -+ [BCH_CSUM_CRC32C_NONZERO] = 4, -+ [BCH_CSUM_CRC32C] = 4, -+ [BCH_CSUM_CRC64_NONZERO] = 8, -+ [BCH_CSUM_CRC64] = 8, -+ [BCH_CSUM_CHACHA20_POLY1305_80] = 10, -+ [BCH_CSUM_CHACHA20_POLY1305_128] = 16, -+}; -+ -+static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) -+{ -+ switch (type) { -+ case BCH_CSUM_CHACHA20_POLY1305_80: -+ case BCH_CSUM_CHACHA20_POLY1305_128: -+ return true; -+ default: -+ return false; -+ } -+} -+ -+enum bch_csum_opts { -+ BCH_CSUM_OPT_NONE = 0, -+ BCH_CSUM_OPT_CRC32C = 1, -+ BCH_CSUM_OPT_CRC64 = 2, -+ BCH_CSUM_OPT_NR = 3, -+}; -+ -+#define BCH_COMPRESSION_TYPES() \ -+ x(none, 0) \ -+ x(lz4_old, 1) \ -+ x(gzip, 2) \ -+ x(lz4, 3) \ -+ x(zstd, 4) \ -+ x(incompressible, 5) -+ -+enum bch_compression_type { -+#define x(t, n) BCH_COMPRESSION_TYPE_##t, -+ BCH_COMPRESSION_TYPES() -+#undef x -+ BCH_COMPRESSION_TYPE_NR -+}; -+ -+#define BCH_COMPRESSION_OPTS() \ -+ x(none, 0) \ -+ x(lz4, 1) \ -+ x(gzip, 2) \ -+ x(zstd, 3) -+ -+enum bch_compression_opts { -+#define x(t, n) BCH_COMPRESSION_OPT_##t, -+ BCH_COMPRESSION_OPTS() -+#undef x -+ BCH_COMPRESSION_OPT_NR -+}; -+ -+/* -+ * Magic numbers -+ * -+ * The various other data structures have their own magic numbers, which are -+ * xored with the first part of the cache set's UUID -+ */ -+ -+#define BCACHE_MAGIC \ -+ UUID_LE(0xf67385c6, 0x1a4e, 0xca45, \ -+ 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81) -+ -+#define BCACHEFS_STATFS_MAGIC 0xca451a4e -+ -+#define JSET_MAGIC 
__cpu_to_le64(0x245235c1a3625032ULL) -+#define BSET_MAGIC __cpu_to_le64(0x90135c78b99e07f5ULL) -+ -+static inline __le64 __bch2_sb_magic(struct bch_sb *sb) -+{ -+ __le64 ret; -+ memcpy(&ret, &sb->uuid, sizeof(ret)); -+ return ret; -+} -+ -+static inline __u64 __jset_magic(struct bch_sb *sb) -+{ -+ return __le64_to_cpu(__bch2_sb_magic(sb) ^ JSET_MAGIC); -+} -+ -+static inline __u64 __bset_magic(struct bch_sb *sb) -+{ -+ return __le64_to_cpu(__bch2_sb_magic(sb) ^ BSET_MAGIC); -+} -+ -+/* Journal */ -+ -+#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64)) -+ -+#define BCH_JSET_ENTRY_TYPES() \ -+ x(btree_keys, 0) \ -+ x(btree_root, 1) \ -+ x(prio_ptrs, 2) \ -+ x(blacklist, 3) \ -+ x(blacklist_v2, 4) \ -+ x(usage, 5) \ -+ x(data_usage, 6) -+ -+enum { -+#define x(f, nr) BCH_JSET_ENTRY_##f = nr, -+ BCH_JSET_ENTRY_TYPES() -+#undef x -+ BCH_JSET_ENTRY_NR -+}; -+ -+/* -+ * Journal sequence numbers can be blacklisted: bsets record the max sequence -+ * number of all the journal entries they contain updates for, so that on -+ * recovery we can ignore those bsets that contain index updates newer that what -+ * made it into the journal. -+ * -+ * This means that we can't reuse that journal_seq - we have to skip it, and -+ * then record that we skipped it so that the next time we crash and recover we -+ * don't think there was a missing journal entry. 
-+ */ -+struct jset_entry_blacklist { -+ struct jset_entry entry; -+ __le64 seq; -+}; -+ -+struct jset_entry_blacklist_v2 { -+ struct jset_entry entry; -+ __le64 start; -+ __le64 end; -+}; -+ -+enum { -+ FS_USAGE_RESERVED = 0, -+ FS_USAGE_INODES = 1, -+ FS_USAGE_KEY_VERSION = 2, -+ FS_USAGE_NR = 3 -+}; -+ -+struct jset_entry_usage { -+ struct jset_entry entry; -+ __le64 v; -+} __attribute__((packed)); -+ -+struct jset_entry_data_usage { -+ struct jset_entry entry; -+ __le64 v; -+ struct bch_replicas_entry r; -+} __attribute__((packed)); -+ -+/* -+ * On disk format for a journal entry: -+ * seq is monotonically increasing; every journal entry has its own unique -+ * sequence number. -+ * -+ * last_seq is the oldest journal entry that still has keys the btree hasn't -+ * flushed to disk yet. -+ * -+ * version is for on disk format changes. -+ */ -+struct jset { -+ struct bch_csum csum; -+ -+ __le64 magic; -+ __le64 seq; -+ __le32 version; -+ __le32 flags; -+ -+ __le32 u64s; /* size of d[] in u64s */ -+ -+ __u8 encrypted_start[0]; -+ -+ __le16 read_clock; -+ __le16 write_clock; -+ -+ /* Sequence number of oldest dirty journal entry */ -+ __le64 last_seq; -+ -+ -+ union { -+ struct jset_entry start[0]; -+ __u64 _data[0]; -+ }; -+} __attribute__((packed, aligned(8))); -+ -+LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4); -+LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); -+ -+#define BCH_JOURNAL_BUCKETS_MIN 8 -+ -+/* Btree: */ -+ -+#define BCH_BTREE_IDS() \ -+ x(EXTENTS, 0, "extents") \ -+ x(INODES, 1, "inodes") \ -+ x(DIRENTS, 2, "dirents") \ -+ x(XATTRS, 3, "xattrs") \ -+ x(ALLOC, 4, "alloc") \ -+ x(QUOTAS, 5, "quotas") \ -+ x(EC, 6, "stripes") \ -+ x(REFLINK, 7, "reflink") -+ -+enum btree_id { -+#define x(kwd, val, name) BTREE_ID_##kwd = val, -+ BCH_BTREE_IDS() -+#undef x -+ BTREE_ID_NR -+}; -+ -+#define BTREE_MAX_DEPTH 4U -+ -+/* Btree nodes */ -+ -+/* -+ * Btree nodes -+ * -+ * On disk a btree node is a list/log of these; within each set the keys are 
-+ * sorted -+ */ -+struct bset { -+ __le64 seq; -+ -+ /* -+ * Highest journal entry this bset contains keys for. -+ * If on recovery we don't see that journal entry, this bset is ignored: -+ * this allows us to preserve the order of all index updates after a -+ * crash, since the journal records a total order of all index updates -+ * and anything that didn't make it to the journal doesn't get used. -+ */ -+ __le64 journal_seq; -+ -+ __le32 flags; -+ __le16 version; -+ __le16 u64s; /* count of d[] in u64s */ -+ -+ union { -+ struct bkey_packed start[0]; -+ __u64 _data[0]; -+ }; -+} __attribute__((packed, aligned(8))); -+ -+LE32_BITMASK(BSET_CSUM_TYPE, struct bset, flags, 0, 4); -+ -+LE32_BITMASK(BSET_BIG_ENDIAN, struct bset, flags, 4, 5); -+LE32_BITMASK(BSET_SEPARATE_WHITEOUTS, -+ struct bset, flags, 5, 6); -+ -+struct btree_node { -+ struct bch_csum csum; -+ __le64 magic; -+ -+ /* this flags field is encrypted, unlike bset->flags: */ -+ __le64 flags; -+ -+ /* Closed interval: */ -+ struct bpos min_key; -+ struct bpos max_key; -+ struct bch_extent_ptr ptr; -+ struct bkey_format format; -+ -+ union { -+ struct bset keys; -+ struct { -+ __u8 pad[22]; -+ __le16 u64s; -+ __u64 _data[0]; -+ -+ }; -+ }; -+} __attribute__((packed, aligned(8))); -+ -+LE64_BITMASK(BTREE_NODE_ID, struct btree_node, flags, 0, 4); -+LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8); -+LE64_BITMASK(BTREE_NODE_NEW_EXTENT_OVERWRITE, -+ struct btree_node, flags, 8, 9); -+/* 9-32 unused */ -+LE64_BITMASK(BTREE_NODE_SEQ, struct btree_node, flags, 32, 64); -+ -+struct btree_node_entry { -+ struct bch_csum csum; -+ -+ union { -+ struct bset keys; -+ struct { -+ __u8 pad[22]; -+ __le16 u64s; -+ __u64 _data[0]; -+ -+ }; -+ }; -+} __attribute__((packed, aligned(8))); -+ -+#endif /* _BCACHEFS_FORMAT_H */ -diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h -new file mode 100644 -index 000000000000..d71157a3e073 ---- /dev/null -+++ b/fs/bcachefs/bcachefs_ioctl.h -@@ -0,0 
+1,332 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_IOCTL_H -+#define _BCACHEFS_IOCTL_H -+ -+#include -+#include -+#include "bcachefs_format.h" -+ -+/* -+ * Flags common to multiple ioctls: -+ */ -+#define BCH_FORCE_IF_DATA_LOST (1 << 0) -+#define BCH_FORCE_IF_METADATA_LOST (1 << 1) -+#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2) -+#define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3) -+ -+#define BCH_FORCE_IF_DEGRADED \ -+ (BCH_FORCE_IF_DATA_DEGRADED| \ -+ BCH_FORCE_IF_METADATA_DEGRADED) -+ -+/* -+ * If cleared, ioctl that refer to a device pass it as a pointer to a pathname -+ * (e.g. /dev/sda1); if set, the dev field is the device's index within the -+ * filesystem: -+ */ -+#define BCH_BY_INDEX (1 << 4) -+ -+/* -+ * For BCH_IOCTL_READ_SUPER: get superblock of a specific device, not filesystem -+ * wide superblock: -+ */ -+#define BCH_READ_DEV (1 << 5) -+ -+/* global control dev: */ -+ -+/* These are currently broken, and probably unnecessary: */ -+#if 0 -+#define BCH_IOCTL_ASSEMBLE _IOW(0xbc, 1, struct bch_ioctl_assemble) -+#define BCH_IOCTL_INCREMENTAL _IOW(0xbc, 2, struct bch_ioctl_incremental) -+ -+struct bch_ioctl_assemble { -+ __u32 flags; -+ __u32 nr_devs; -+ __u64 pad; -+ __u64 devs[]; -+}; -+ -+struct bch_ioctl_incremental { -+ __u32 flags; -+ __u64 pad; -+ __u64 dev; -+}; -+#endif -+ -+/* filesystem ioctls: */ -+ -+#define BCH_IOCTL_QUERY_UUID _IOR(0xbc, 1, struct bch_ioctl_query_uuid) -+ -+/* These only make sense when we also have incremental assembly */ -+#if 0 -+#define BCH_IOCTL_START _IOW(0xbc, 2, struct bch_ioctl_start) -+#define BCH_IOCTL_STOP _IO(0xbc, 3) -+#endif -+ -+#define BCH_IOCTL_DISK_ADD _IOW(0xbc, 4, struct bch_ioctl_disk) -+#define BCH_IOCTL_DISK_REMOVE _IOW(0xbc, 5, struct bch_ioctl_disk) -+#define BCH_IOCTL_DISK_ONLINE _IOW(0xbc, 6, struct bch_ioctl_disk) -+#define BCH_IOCTL_DISK_OFFLINE _IOW(0xbc, 7, struct bch_ioctl_disk) -+#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc, 8, struct bch_ioctl_disk_set_state) -+#define 
BCH_IOCTL_DATA _IOW(0xbc, 10, struct bch_ioctl_data) -+#define BCH_IOCTL_FS_USAGE _IOWR(0xbc, 11, struct bch_ioctl_fs_usage) -+#define BCH_IOCTL_DEV_USAGE _IOWR(0xbc, 11, struct bch_ioctl_dev_usage) -+#define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super) -+#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx) -+#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize) -+ -+/* ioctl below act on a particular file, not the filesystem as a whole: */ -+ -+#define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *) -+ -+/* -+ * BCH_IOCTL_QUERY_UUID: get filesystem UUID -+ * -+ * Returns user visible UUID, not internal UUID (which may not ever be changed); -+ * the filesystem's sysfs directory may be found under /sys/fs/bcachefs with -+ * this UUID. -+ */ -+struct bch_ioctl_query_uuid { -+ uuid_le uuid; -+}; -+ -+#if 0 -+struct bch_ioctl_start { -+ __u32 flags; -+ __u32 pad; -+}; -+#endif -+ -+/* -+ * BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem -+ * -+ * The specified device must not be open or in use. On success, the new device -+ * will be an online member of the filesystem just like any other member. -+ * -+ * The device must first be prepared by userspace by formatting with a bcachefs -+ * superblock, which is only used for passing in superblock options/parameters -+ * for that device (in struct bch_member). The new device's superblock should -+ * not claim to be a member of any existing filesystem - UUIDs on it will be -+ * ignored. -+ */ -+ -+/* -+ * BCH_IOCTL_DISK_REMOVE: permanently remove a member device from a filesystem -+ * -+ * Any data present on @dev will be permanently deleted, and @dev will be -+ * removed from its slot in the filesystem's list of member devices. The device -+ * may be either offline or offline. 
-+ * -+ * Will fail removing @dev would leave us with insufficient read write devices -+ * or degraded/unavailable data, unless the approprate BCH_FORCE_IF_* flags are -+ * set. -+ */ -+ -+/* -+ * BCH_IOCTL_DISK_ONLINE: given a disk that is already a member of a filesystem -+ * but is not open (e.g. because we started in degraded mode), bring it online -+ * -+ * all existing data on @dev will be available once the device is online, -+ * exactly as if @dev was present when the filesystem was first mounted -+ */ -+ -+/* -+ * BCH_IOCTL_DISK_OFFLINE: offline a disk, causing the kernel to close that -+ * block device, without removing it from the filesystem (so it can be brought -+ * back online later) -+ * -+ * Data present on @dev will be unavailable while @dev is offline (unless -+ * replicated), but will still be intact and untouched if @dev is brought back -+ * online -+ * -+ * Will fail (similarly to BCH_IOCTL_DISK_SET_STATE) if offlining @dev would -+ * leave us with insufficient read write devices or degraded/unavailable data, -+ * unless the approprate BCH_FORCE_IF_* flags are set. -+ */ -+ -+struct bch_ioctl_disk { -+ __u32 flags; -+ __u32 pad; -+ __u64 dev; -+}; -+ -+/* -+ * BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem -+ * -+ * @new_state - one of the bch_member_state states (rw, ro, failed, -+ * spare) -+ * -+ * Will refuse to change member state if we would then have insufficient devices -+ * to write to, or if it would result in degraded data (when @new_state is -+ * failed or spare) unless the appropriate BCH_FORCE_IF_* flags are set. -+ */ -+struct bch_ioctl_disk_set_state { -+ __u32 flags; -+ __u8 new_state; -+ __u8 pad[3]; -+ __u64 dev; -+}; -+ -+enum bch_data_ops { -+ BCH_DATA_OP_SCRUB = 0, -+ BCH_DATA_OP_REREPLICATE = 1, -+ BCH_DATA_OP_MIGRATE = 2, -+ BCH_DATA_OP_NR = 3, -+}; -+ -+/* -+ * BCH_IOCTL_DATA: operations that walk and manipulate filesystem data (e.g. -+ * scrub, rereplicate, migrate). 
-+ * -+ * This ioctl kicks off a job in the background, and returns a file descriptor. -+ * Reading from the file descriptor returns a struct bch_ioctl_data_event, -+ * indicating current progress, and closing the file descriptor will stop the -+ * job. The file descriptor is O_CLOEXEC. -+ */ -+struct bch_ioctl_data { -+ __u32 op; -+ __u32 flags; -+ -+ struct bpos start; -+ struct bpos end; -+ -+ union { -+ struct { -+ __u32 dev; -+ __u32 pad; -+ } migrate; -+ struct { -+ __u64 pad[8]; -+ }; -+ }; -+} __attribute__((packed, aligned(8))); -+ -+enum bch_data_event { -+ BCH_DATA_EVENT_PROGRESS = 0, -+ /* XXX: add an event for reporting errors */ -+ BCH_DATA_EVENT_NR = 1, -+}; -+ -+struct bch_ioctl_data_progress { -+ __u8 data_type; -+ __u8 btree_id; -+ __u8 pad[2]; -+ struct bpos pos; -+ -+ __u64 sectors_done; -+ __u64 sectors_total; -+} __attribute__((packed, aligned(8))); -+ -+struct bch_ioctl_data_event { -+ __u8 type; -+ __u8 pad[7]; -+ union { -+ struct bch_ioctl_data_progress p; -+ __u64 pad2[15]; -+ }; -+} __attribute__((packed, aligned(8))); -+ -+struct bch_replicas_usage { -+ __u64 sectors; -+ struct bch_replicas_entry r; -+} __attribute__((packed)); -+ -+static inline struct bch_replicas_usage * -+replicas_usage_next(struct bch_replicas_usage *u) -+{ -+ return (void *) u + replicas_entry_bytes(&u->r) + 8; -+} -+ -+/* -+ * BCH_IOCTL_FS_USAGE: query filesystem disk space usage -+ * -+ * Returns disk space usage broken out by data type, number of replicas, and -+ * by component device -+ * -+ * @replica_entries_bytes - size, in bytes, allocated for replica usage entries -+ * -+ * On success, @replica_entries_bytes will be changed to indicate the number of -+ * bytes actually used. 
-+ * -+ * Returns -ERANGE if @replica_entries_bytes was too small -+ */ -+struct bch_ioctl_fs_usage { -+ __u64 capacity; -+ __u64 used; -+ __u64 online_reserved; -+ __u64 persistent_reserved[BCH_REPLICAS_MAX]; -+ -+ __u32 replica_entries_bytes; -+ __u32 pad; -+ -+ struct bch_replicas_usage replicas[0]; -+}; -+ -+/* -+ * BCH_IOCTL_DEV_USAGE: query device disk space usage -+ * -+ * Returns disk space usage broken out by data type - both by buckets and -+ * sectors. -+ */ -+struct bch_ioctl_dev_usage { -+ __u64 dev; -+ __u32 flags; -+ __u8 state; -+ __u8 pad[7]; -+ -+ __u32 bucket_size; -+ __u64 nr_buckets; -+ __u64 available_buckets; -+ -+ __u64 buckets[BCH_DATA_NR]; -+ __u64 sectors[BCH_DATA_NR]; -+ -+ __u64 ec_buckets; -+ __u64 ec_sectors; -+}; -+ -+/* -+ * BCH_IOCTL_READ_SUPER: read filesystem superblock -+ * -+ * Equivalent to reading the superblock directly from the block device, except -+ * avoids racing with the kernel writing the superblock or having to figure out -+ * which block device to read -+ * -+ * @sb - buffer to read into -+ * @size - size of userspace allocated buffer -+ * @dev - device to read superblock for, if BCH_READ_DEV flag is -+ * specified -+ * -+ * Returns -ERANGE if buffer provided is too small -+ */ -+struct bch_ioctl_read_super { -+ __u32 flags; -+ __u32 pad; -+ __u64 dev; -+ __u64 size; -+ __u64 sb; -+}; -+ -+/* -+ * BCH_IOCTL_DISK_GET_IDX: give a path to a block device, query filesystem to -+ * determine if disk is a (online) member - if so, returns device's index -+ * -+ * Returns -ENOENT if not found -+ */ -+struct bch_ioctl_disk_get_idx { -+ __u64 dev; -+}; -+ -+/* -+ * BCH_IOCTL_DISK_RESIZE: resize filesystem on a device -+ * -+ * @dev - member to resize -+ * @nbuckets - new number of buckets -+ */ -+struct bch_ioctl_disk_resize { -+ __u32 flags; -+ __u32 pad; -+ __u64 dev; -+ __u64 nbuckets; -+}; -+ -+#endif /* _BCACHEFS_IOCTL_H */ -diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c -new file mode 100644 -index 
000000000000..4d0c9129cd4a ---- /dev/null -+++ b/fs/bcachefs/bkey.c -@@ -0,0 +1,1154 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "bkey.h" -+#include "bkey_methods.h" -+#include "bset.h" -+#include "util.h" -+ -+#undef EBUG_ON -+ -+#ifdef DEBUG_BKEYS -+#define EBUG_ON(cond) BUG_ON(cond) -+#else -+#define EBUG_ON(cond) -+#endif -+ -+const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT; -+ -+struct bkey __bch2_bkey_unpack_key(const struct bkey_format *, -+ const struct bkey_packed *); -+ -+void bch2_to_binary(char *out, const u64 *p, unsigned nr_bits) -+{ -+ unsigned bit = high_bit_offset, done = 0; -+ -+ while (1) { -+ while (bit < 64) { -+ if (done && !(done % 8)) -+ *out++ = ' '; -+ *out++ = *p & (1ULL << (63 - bit)) ? '1' : '0'; -+ bit++; -+ done++; -+ if (done == nr_bits) { -+ *out++ = '\0'; -+ return; -+ } -+ } -+ -+ p = next_word(p); -+ bit = 0; -+ } -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ -+static void bch2_bkey_pack_verify(const struct bkey_packed *packed, -+ const struct bkey *unpacked, -+ const struct bkey_format *format) -+{ -+ struct bkey tmp; -+ -+ BUG_ON(bkeyp_val_u64s(format, packed) != -+ bkey_val_u64s(unpacked)); -+ -+ BUG_ON(packed->u64s < bkeyp_key_u64s(format, packed)); -+ -+ tmp = __bch2_bkey_unpack_key(format, packed); -+ -+ if (memcmp(&tmp, unpacked, sizeof(struct bkey))) { -+ char buf1[160], buf2[160]; -+ char buf3[160], buf4[160]; -+ -+ bch2_bkey_to_text(&PBUF(buf1), unpacked); -+ bch2_bkey_to_text(&PBUF(buf2), &tmp); -+ bch2_to_binary(buf3, (void *) unpacked, 80); -+ bch2_to_binary(buf4, high_word(format, packed), 80); -+ -+ panic("keys differ: format u64s %u fields %u %u %u %u %u\n%s\n%s\n%s\n%s\n", -+ format->key_u64s, -+ format->bits_per_field[0], -+ format->bits_per_field[1], -+ format->bits_per_field[2], -+ format->bits_per_field[3], -+ format->bits_per_field[4], -+ buf1, buf2, buf3, buf4); -+ } -+} -+ -+#else -+static inline void bch2_bkey_pack_verify(const struct bkey_packed 
*packed, -+ const struct bkey *unpacked, -+ const struct bkey_format *format) {} -+#endif -+ -+struct pack_state { -+ const struct bkey_format *format; -+ unsigned bits; /* bits remaining in current word */ -+ u64 w; /* current word */ -+ u64 *p; /* pointer to next word */ -+}; -+ -+__always_inline -+static struct pack_state pack_state_init(const struct bkey_format *format, -+ struct bkey_packed *k) -+{ -+ u64 *p = high_word(format, k); -+ -+ return (struct pack_state) { -+ .format = format, -+ .bits = 64 - high_bit_offset, -+ .w = 0, -+ .p = p, -+ }; -+} -+ -+__always_inline -+static void pack_state_finish(struct pack_state *state, -+ struct bkey_packed *k) -+{ -+ EBUG_ON(state->p < k->_data); -+ EBUG_ON(state->p >= k->_data + state->format->key_u64s); -+ -+ *state->p = state->w; -+} -+ -+struct unpack_state { -+ const struct bkey_format *format; -+ unsigned bits; /* bits remaining in current word */ -+ u64 w; /* current word */ -+ const u64 *p; /* pointer to next word */ -+}; -+ -+__always_inline -+static struct unpack_state unpack_state_init(const struct bkey_format *format, -+ const struct bkey_packed *k) -+{ -+ const u64 *p = high_word(format, k); -+ -+ return (struct unpack_state) { -+ .format = format, -+ .bits = 64 - high_bit_offset, -+ .w = *p << high_bit_offset, -+ .p = p, -+ }; -+} -+ -+__always_inline -+static u64 get_inc_field(struct unpack_state *state, unsigned field) -+{ -+ unsigned bits = state->format->bits_per_field[field]; -+ u64 v = 0, offset = le64_to_cpu(state->format->field_offset[field]); -+ -+ if (bits >= state->bits) { -+ v = state->w >> (64 - bits); -+ bits -= state->bits; -+ -+ state->p = next_word(state->p); -+ state->w = *state->p; -+ state->bits = 64; -+ } -+ -+ /* avoid shift by 64 if bits is 0 - bits is never 64 here: */ -+ v |= (state->w >> 1) >> (63 - bits); -+ state->w <<= bits; -+ state->bits -= bits; -+ -+ return v + offset; -+} -+ -+__always_inline -+static bool set_inc_field(struct pack_state *state, unsigned field, u64 v) 
-+{ -+ unsigned bits = state->format->bits_per_field[field]; -+ u64 offset = le64_to_cpu(state->format->field_offset[field]); -+ -+ if (v < offset) -+ return false; -+ -+ v -= offset; -+ -+ if (fls64(v) > bits) -+ return false; -+ -+ if (bits > state->bits) { -+ bits -= state->bits; -+ /* avoid shift by 64 if bits is 0 - bits is never 64 here: */ -+ state->w |= (v >> 1) >> (bits - 1); -+ -+ *state->p = state->w; -+ state->p = next_word(state->p); -+ state->w = 0; -+ state->bits = 64; -+ } -+ -+ state->bits -= bits; -+ state->w |= v << state->bits; -+ -+ return true; -+} -+ -+/* -+ * Note: does NOT set out->format (we don't know what it should be here!) -+ * -+ * Also: doesn't work on extents - it doesn't preserve the invariant that -+ * if k is packed bkey_start_pos(k) will successfully pack -+ */ -+static bool bch2_bkey_transform_key(const struct bkey_format *out_f, -+ struct bkey_packed *out, -+ const struct bkey_format *in_f, -+ const struct bkey_packed *in) -+{ -+ struct pack_state out_s = pack_state_init(out_f, out); -+ struct unpack_state in_s = unpack_state_init(in_f, in); -+ unsigned i; -+ -+ out->_data[0] = 0; -+ -+ for (i = 0; i < BKEY_NR_FIELDS; i++) -+ if (!set_inc_field(&out_s, i, get_inc_field(&in_s, i))) -+ return false; -+ -+ /* Can't happen because the val would be too big to unpack: */ -+ EBUG_ON(in->u64s - in_f->key_u64s + out_f->key_u64s > U8_MAX); -+ -+ pack_state_finish(&out_s, out); -+ out->u64s = out_f->key_u64s + in->u64s - in_f->key_u64s; -+ out->needs_whiteout = in->needs_whiteout; -+ out->type = in->type; -+ -+ return true; -+} -+ -+bool bch2_bkey_transform(const struct bkey_format *out_f, -+ struct bkey_packed *out, -+ const struct bkey_format *in_f, -+ const struct bkey_packed *in) -+{ -+ if (!bch2_bkey_transform_key(out_f, out, in_f, in)) -+ return false; -+ -+ memcpy_u64s((u64 *) out + out_f->key_u64s, -+ (u64 *) in + in_f->key_u64s, -+ (in->u64s - in_f->key_u64s)); -+ return true; -+} -+ -+#define bkey_fields() \ -+ 
x(BKEY_FIELD_INODE, p.inode) \ -+ x(BKEY_FIELD_OFFSET, p.offset) \ -+ x(BKEY_FIELD_SNAPSHOT, p.snapshot) \ -+ x(BKEY_FIELD_SIZE, size) \ -+ x(BKEY_FIELD_VERSION_HI, version.hi) \ -+ x(BKEY_FIELD_VERSION_LO, version.lo) -+ -+struct bkey __bch2_bkey_unpack_key(const struct bkey_format *format, -+ const struct bkey_packed *in) -+{ -+ struct unpack_state state = unpack_state_init(format, in); -+ struct bkey out; -+ -+ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); -+ EBUG_ON(in->u64s < format->key_u64s); -+ EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE); -+ EBUG_ON(in->u64s - format->key_u64s + BKEY_U64s > U8_MAX); -+ -+ out.u64s = BKEY_U64s + in->u64s - format->key_u64s; -+ out.format = KEY_FORMAT_CURRENT; -+ out.needs_whiteout = in->needs_whiteout; -+ out.type = in->type; -+ out.pad[0] = 0; -+ -+#define x(id, field) out.field = get_inc_field(&state, id); -+ bkey_fields() -+#undef x -+ -+ return out; -+} -+ -+#ifndef HAVE_BCACHEFS_COMPILED_UNPACK -+struct bpos __bkey_unpack_pos(const struct bkey_format *format, -+ const struct bkey_packed *in) -+{ -+ struct unpack_state state = unpack_state_init(format, in); -+ struct bpos out; -+ -+ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); -+ EBUG_ON(in->u64s < format->key_u64s); -+ EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE); -+ -+ out.inode = get_inc_field(&state, BKEY_FIELD_INODE); -+ out.offset = get_inc_field(&state, BKEY_FIELD_OFFSET); -+ out.snapshot = get_inc_field(&state, BKEY_FIELD_SNAPSHOT); -+ -+ return out; -+} -+#endif -+ -+/** -+ * bch2_bkey_pack_key -- pack just the key, not the value -+ */ -+bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in, -+ const struct bkey_format *format) -+{ -+ struct pack_state state = pack_state_init(format, out); -+ -+ EBUG_ON((void *) in == (void *) out); -+ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); -+ EBUG_ON(in->format != KEY_FORMAT_CURRENT); -+ -+ out->_data[0] = 0; -+ -+#define x(id, field) if (!set_inc_field(&state, id, in->field)) return false; -+ 
bkey_fields() -+#undef x -+ -+ /* -+ * Extents - we have to guarantee that if an extent is packed, a trimmed -+ * version will also pack: -+ */ -+ if (bkey_start_offset(in) < -+ le64_to_cpu(format->field_offset[BKEY_FIELD_OFFSET])) -+ return false; -+ -+ pack_state_finish(&state, out); -+ out->u64s = format->key_u64s + in->u64s - BKEY_U64s; -+ out->format = KEY_FORMAT_LOCAL_BTREE; -+ out->needs_whiteout = in->needs_whiteout; -+ out->type = in->type; -+ -+ bch2_bkey_pack_verify(out, in, format); -+ return true; -+} -+ -+/** -+ * bch2_bkey_unpack -- unpack the key and the value -+ */ -+void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst, -+ const struct bkey_packed *src) -+{ -+ __bkey_unpack_key(b, &dst->k, src); -+ -+ memcpy_u64s(&dst->v, -+ bkeyp_val(&b->format, src), -+ bkeyp_val_u64s(&b->format, src)); -+} -+ -+/** -+ * bch2_bkey_pack -- pack the key and the value -+ */ -+bool bch2_bkey_pack(struct bkey_packed *out, const struct bkey_i *in, -+ const struct bkey_format *format) -+{ -+ struct bkey_packed tmp; -+ -+ if (!bch2_bkey_pack_key(&tmp, &in->k, format)) -+ return false; -+ -+ memmove_u64s((u64 *) out + format->key_u64s, -+ &in->v, -+ bkey_val_u64s(&in->k)); -+ memcpy_u64s(out, &tmp, format->key_u64s); -+ -+ return true; -+} -+ -+__always_inline -+static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v) -+{ -+ unsigned bits = state->format->bits_per_field[field]; -+ u64 offset = le64_to_cpu(state->format->field_offset[field]); -+ bool ret = true; -+ -+ EBUG_ON(v < offset); -+ v -= offset; -+ -+ if (fls64(v) > bits) { -+ v = ~(~0ULL << bits); -+ ret = false; -+ } -+ -+ if (bits > state->bits) { -+ bits -= state->bits; -+ state->w |= (v >> 1) >> (bits - 1); -+ -+ *state->p = state->w; -+ state->p = next_word(state->p); -+ state->w = 0; -+ state->bits = 64; -+ } -+ -+ state->bits -= bits; -+ state->w |= v << state->bits; -+ -+ return ret; -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+static bool bkey_packed_successor(struct 
bkey_packed *out, -+ const struct btree *b, -+ struct bkey_packed k) -+{ -+ const struct bkey_format *f = &b->format; -+ unsigned nr_key_bits = b->nr_key_bits; -+ unsigned first_bit, offset; -+ u64 *p; -+ -+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); -+ -+ if (!nr_key_bits) -+ return false; -+ -+ *out = k; -+ -+ first_bit = high_bit_offset + nr_key_bits - 1; -+ p = nth_word(high_word(f, out), first_bit >> 6); -+ offset = 63 - (first_bit & 63); -+ -+ while (nr_key_bits) { -+ unsigned bits = min(64 - offset, nr_key_bits); -+ u64 mask = (~0ULL >> (64 - bits)) << offset; -+ -+ if ((*p & mask) != mask) { -+ *p += 1ULL << offset; -+ EBUG_ON(bkey_cmp_packed(b, out, &k) <= 0); -+ return true; -+ } -+ -+ *p &= ~mask; -+ p = prev_word(p); -+ nr_key_bits -= bits; -+ offset = 0; -+ } -+ -+ return false; -+} -+#endif -+ -+/* -+ * Returns a packed key that compares <= in -+ * -+ * This is used in bset_search_tree(), where we need a packed pos in order to be -+ * able to compare against the keys in the auxiliary search tree - and it's -+ * legal to use a packed pos that isn't equivalent to the original pos, -+ * _provided_ it compares <= to the original pos. 
-+ */ -+enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out, -+ struct bpos in, -+ const struct btree *b) -+{ -+ const struct bkey_format *f = &b->format; -+ struct pack_state state = pack_state_init(f, out); -+#ifdef CONFIG_BCACHEFS_DEBUG -+ struct bpos orig = in; -+#endif -+ bool exact = true; -+ -+ out->_data[0] = 0; -+ -+ if (unlikely(in.snapshot < -+ le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]))) { -+ if (!in.offset-- && -+ !in.inode--) -+ return BKEY_PACK_POS_FAIL; -+ in.snapshot = KEY_SNAPSHOT_MAX; -+ exact = false; -+ } -+ -+ if (unlikely(in.offset < -+ le64_to_cpu(f->field_offset[BKEY_FIELD_OFFSET]))) { -+ if (!in.inode--) -+ return BKEY_PACK_POS_FAIL; -+ in.offset = KEY_OFFSET_MAX; -+ in.snapshot = KEY_SNAPSHOT_MAX; -+ exact = false; -+ } -+ -+ if (unlikely(in.inode < -+ le64_to_cpu(f->field_offset[BKEY_FIELD_INODE]))) -+ return BKEY_PACK_POS_FAIL; -+ -+ if (!set_inc_field_lossy(&state, BKEY_FIELD_INODE, in.inode)) { -+ in.offset = KEY_OFFSET_MAX; -+ in.snapshot = KEY_SNAPSHOT_MAX; -+ exact = false; -+ } -+ -+ if (!set_inc_field_lossy(&state, BKEY_FIELD_OFFSET, in.offset)) { -+ in.snapshot = KEY_SNAPSHOT_MAX; -+ exact = false; -+ } -+ -+ if (!set_inc_field_lossy(&state, BKEY_FIELD_SNAPSHOT, in.snapshot)) -+ exact = false; -+ -+ pack_state_finish(&state, out); -+ out->u64s = f->key_u64s; -+ out->format = KEY_FORMAT_LOCAL_BTREE; -+ out->type = KEY_TYPE_deleted; -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ if (exact) { -+ BUG_ON(bkey_cmp_left_packed(b, out, &orig)); -+ } else { -+ struct bkey_packed successor; -+ -+ BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0); -+ BUG_ON(bkey_packed_successor(&successor, b, *out) && -+ bkey_cmp_left_packed(b, &successor, &orig) < 0); -+ } -+#endif -+ -+ return exact ? 
BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER; -+} -+ -+void bch2_bkey_format_init(struct bkey_format_state *s) -+{ -+ unsigned i; -+ -+ for (i = 0; i < ARRAY_SIZE(s->field_min); i++) -+ s->field_min[i] = U64_MAX; -+ -+ for (i = 0; i < ARRAY_SIZE(s->field_max); i++) -+ s->field_max[i] = 0; -+ -+ /* Make sure we can store a size of 0: */ -+ s->field_min[BKEY_FIELD_SIZE] = 0; -+} -+ -+static void __bkey_format_add(struct bkey_format_state *s, -+ unsigned field, u64 v) -+{ -+ s->field_min[field] = min(s->field_min[field], v); -+ s->field_max[field] = max(s->field_max[field], v); -+} -+ -+/* -+ * Changes @format so that @k can be successfully packed with @format -+ */ -+void bch2_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k) -+{ -+#define x(id, field) __bkey_format_add(s, id, k->field); -+ bkey_fields() -+#undef x -+ __bkey_format_add(s, BKEY_FIELD_OFFSET, bkey_start_offset(k)); -+} -+ -+void bch2_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p) -+{ -+ unsigned field = 0; -+ -+ __bkey_format_add(s, field++, p.inode); -+ __bkey_format_add(s, field++, p.offset); -+ __bkey_format_add(s, field++, p.snapshot); -+} -+ -+/* -+ * We don't want it to be possible for the packed format to represent fields -+ * bigger than a u64... that will cause confusion and issues (like with -+ * bkey_packed_successor()) -+ */ -+static void set_format_field(struct bkey_format *f, enum bch_bkey_fields i, -+ unsigned bits, u64 offset) -+{ -+ offset = bits == 64 ? 
0 : min(offset, U64_MAX - ((1ULL << bits) - 1)); -+ -+ f->bits_per_field[i] = bits; -+ f->field_offset[i] = cpu_to_le64(offset); -+} -+ -+struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s) -+{ -+ unsigned i, bits = KEY_PACKED_BITS_START; -+ struct bkey_format ret = { -+ .nr_fields = BKEY_NR_FIELDS, -+ }; -+ -+ for (i = 0; i < ARRAY_SIZE(s->field_min); i++) { -+ s->field_min[i] = min(s->field_min[i], s->field_max[i]); -+ -+ set_format_field(&ret, i, -+ fls64(s->field_max[i] - s->field_min[i]), -+ s->field_min[i]); -+ -+ bits += ret.bits_per_field[i]; -+ } -+ -+ /* allow for extent merging: */ -+ if (ret.bits_per_field[BKEY_FIELD_SIZE]) { -+ ret.bits_per_field[BKEY_FIELD_SIZE] += 4; -+ bits += 4; -+ } -+ -+ ret.key_u64s = DIV_ROUND_UP(bits, 64); -+ -+ /* if we have enough spare bits, round fields up to nearest byte */ -+ bits = ret.key_u64s * 64 - bits; -+ -+ for (i = 0; i < ARRAY_SIZE(ret.bits_per_field); i++) { -+ unsigned r = round_up(ret.bits_per_field[i], 8) - -+ ret.bits_per_field[i]; -+ -+ if (r <= bits) { -+ set_format_field(&ret, i, -+ ret.bits_per_field[i] + r, -+ le64_to_cpu(ret.field_offset[i])); -+ bits -= r; -+ } -+ } -+ -+ EBUG_ON(bch2_bkey_format_validate(&ret)); -+ return ret; -+} -+ -+const char *bch2_bkey_format_validate(struct bkey_format *f) -+{ -+ unsigned i, bits = KEY_PACKED_BITS_START; -+ -+ if (f->nr_fields != BKEY_NR_FIELDS) -+ return "incorrect number of fields"; -+ -+ for (i = 0; i < f->nr_fields; i++) { -+ u64 field_offset = le64_to_cpu(f->field_offset[i]); -+ -+ if (f->bits_per_field[i] > 64) -+ return "field too large"; -+ -+ if (field_offset && -+ (f->bits_per_field[i] == 64 || -+ (field_offset + ((1ULL << f->bits_per_field[i]) - 1) < -+ field_offset))) -+ return "offset + bits overflow"; -+ -+ bits += f->bits_per_field[i]; -+ } -+ -+ if (f->key_u64s != DIV_ROUND_UP(bits, 64)) -+ return "incorrect key_u64s"; -+ -+ return NULL; -+} -+ -+/* -+ * Most significant differing bit -+ * Bits are indexed from 0 - return is 
[0, nr_key_bits) -+ */ -+__pure -+unsigned bch2_bkey_greatest_differing_bit(const struct btree *b, -+ const struct bkey_packed *l_k, -+ const struct bkey_packed *r_k) -+{ -+ const u64 *l = high_word(&b->format, l_k); -+ const u64 *r = high_word(&b->format, r_k); -+ unsigned nr_key_bits = b->nr_key_bits; -+ unsigned word_bits = 64 - high_bit_offset; -+ u64 l_v, r_v; -+ -+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format)); -+ -+ /* for big endian, skip past header */ -+ l_v = *l & (~0ULL >> high_bit_offset); -+ r_v = *r & (~0ULL >> high_bit_offset); -+ -+ while (nr_key_bits) { -+ if (nr_key_bits < word_bits) { -+ l_v >>= word_bits - nr_key_bits; -+ r_v >>= word_bits - nr_key_bits; -+ nr_key_bits = 0; -+ } else { -+ nr_key_bits -= word_bits; -+ } -+ -+ if (l_v != r_v) -+ return fls64(l_v ^ r_v) - 1 + nr_key_bits; -+ -+ l = next_word(l); -+ r = next_word(r); -+ -+ l_v = *l; -+ r_v = *r; -+ word_bits = 64; -+ } -+ -+ return 0; -+} -+ -+/* -+ * First set bit -+ * Bits are indexed from 0 - return is [0, nr_key_bits) -+ */ -+__pure -+unsigned bch2_bkey_ffs(const struct btree *b, const struct bkey_packed *k) -+{ -+ const u64 *p = high_word(&b->format, k); -+ unsigned nr_key_bits = b->nr_key_bits; -+ unsigned ret = 0, offset; -+ -+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format)); -+ -+ offset = nr_key_bits; -+ while (offset > 64) { -+ p = next_word(p); -+ offset -= 64; -+ } -+ -+ offset = 64 - offset; -+ -+ while (nr_key_bits) { -+ unsigned bits = nr_key_bits + offset < 64 -+ ? 
nr_key_bits -+ : 64 - offset; -+ -+ u64 mask = (~0ULL >> (64 - bits)) << offset; -+ -+ if (*p & mask) -+ return ret + __ffs64(*p & mask) - offset; -+ -+ p = prev_word(p); -+ nr_key_bits -= bits; -+ ret += bits; -+ offset = 0; -+ } -+ -+ return 0; -+} -+ -+#ifdef CONFIG_X86_64 -+ -+static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, -+ unsigned nr_key_bits) -+{ -+ long d0, d1, d2, d3; -+ int cmp; -+ -+ /* we shouldn't need asm for this, but gcc is being retarded: */ -+ -+ asm(".intel_syntax noprefix;" -+ "xor eax, eax;" -+ "xor edx, edx;" -+ "1:;" -+ "mov r8, [rdi];" -+ "mov r9, [rsi];" -+ "sub ecx, 64;" -+ "jl 2f;" -+ -+ "cmp r8, r9;" -+ "jnz 3f;" -+ -+ "lea rdi, [rdi - 8];" -+ "lea rsi, [rsi - 8];" -+ "jmp 1b;" -+ -+ "2:;" -+ "not ecx;" -+ "shr r8, 1;" -+ "shr r9, 1;" -+ "shr r8, cl;" -+ "shr r9, cl;" -+ "cmp r8, r9;" -+ -+ "3:\n" -+ "seta al;" -+ "setb dl;" -+ "sub eax, edx;" -+ ".att_syntax prefix;" -+ : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp) -+ : "0" (l), "1" (r), "3" (nr_key_bits) -+ : "r8", "r9", "cc", "memory"); -+ -+ return cmp; -+} -+ -+#define I(_x) (*(out)++ = (_x)) -+#define I1(i0) I(i0) -+#define I2(i0, i1) (I1(i0), I(i1)) -+#define I3(i0, i1, i2) (I2(i0, i1), I(i2)) -+#define I4(i0, i1, i2, i3) (I3(i0, i1, i2), I(i3)) -+#define I5(i0, i1, i2, i3, i4) (I4(i0, i1, i2, i3), I(i4)) -+ -+static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out, -+ enum bch_bkey_fields field, -+ unsigned dst_offset, unsigned dst_size, -+ bool *eax_zeroed) -+{ -+ unsigned bits = format->bits_per_field[field]; -+ u64 offset = le64_to_cpu(format->field_offset[field]); -+ unsigned i, byte, bit_offset, align, shl, shr; -+ -+ if (!bits && !offset) { -+ if (!*eax_zeroed) { -+ /* xor eax, eax */ -+ I2(0x31, 0xc0); -+ } -+ -+ *eax_zeroed = true; -+ goto set_field; -+ } -+ -+ if (!bits) { -+ /* just return offset: */ -+ -+ switch (dst_size) { -+ case 8: -+ if (offset > S32_MAX) { -+ /* mov [rdi + dst_offset], offset */ -+ I3(0xc7, 0x47, 
dst_offset); -+ memcpy(out, &offset, 4); -+ out += 4; -+ -+ I3(0xc7, 0x47, dst_offset + 4); -+ memcpy(out, (void *) &offset + 4, 4); -+ out += 4; -+ } else { -+ /* mov [rdi + dst_offset], offset */ -+ /* sign extended */ -+ I4(0x48, 0xc7, 0x47, dst_offset); -+ memcpy(out, &offset, 4); -+ out += 4; -+ } -+ break; -+ case 4: -+ /* mov [rdi + dst_offset], offset */ -+ I3(0xc7, 0x47, dst_offset); -+ memcpy(out, &offset, 4); -+ out += 4; -+ break; -+ default: -+ BUG(); -+ } -+ -+ return out; -+ } -+ -+ bit_offset = format->key_u64s * 64; -+ for (i = 0; i <= field; i++) -+ bit_offset -= format->bits_per_field[i]; -+ -+ byte = bit_offset / 8; -+ bit_offset -= byte * 8; -+ -+ *eax_zeroed = false; -+ -+ if (bit_offset == 0 && bits == 8) { -+ /* movzx eax, BYTE PTR [rsi + imm8] */ -+ I4(0x0f, 0xb6, 0x46, byte); -+ } else if (bit_offset == 0 && bits == 16) { -+ /* movzx eax, WORD PTR [rsi + imm8] */ -+ I4(0x0f, 0xb7, 0x46, byte); -+ } else if (bit_offset + bits <= 32) { -+ align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3); -+ byte -= align; -+ bit_offset += align * 8; -+ -+ BUG_ON(bit_offset + bits > 32); -+ -+ /* mov eax, [rsi + imm8] */ -+ I3(0x8b, 0x46, byte); -+ -+ if (bit_offset) { -+ /* shr eax, imm8 */ -+ I3(0xc1, 0xe8, bit_offset); -+ } -+ -+ if (bit_offset + bits < 32) { -+ unsigned mask = ~0U >> (32 - bits); -+ -+ /* and eax, imm32 */ -+ I1(0x25); -+ memcpy(out, &mask, 4); -+ out += 4; -+ } -+ } else if (bit_offset + bits <= 64) { -+ align = min(8 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 7); -+ byte -= align; -+ bit_offset += align * 8; -+ -+ BUG_ON(bit_offset + bits > 64); -+ -+ /* mov rax, [rsi + imm8] */ -+ I4(0x48, 0x8b, 0x46, byte); -+ -+ shl = 64 - bit_offset - bits; -+ shr = bit_offset + shl; -+ -+ if (shl) { -+ /* shl rax, imm8 */ -+ I4(0x48, 0xc1, 0xe0, shl); -+ } -+ -+ if (shr) { -+ /* shr rax, imm8 */ -+ I4(0x48, 0xc1, 0xe8, shr); -+ } -+ } else { -+ align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3); -+ byte -= align; -+ 
bit_offset += align * 8; -+ -+ BUG_ON(bit_offset + bits > 96); -+ -+ /* mov rax, [rsi + byte] */ -+ I4(0x48, 0x8b, 0x46, byte); -+ -+ /* mov edx, [rsi + byte + 8] */ -+ I3(0x8b, 0x56, byte + 8); -+ -+ /* bits from next word: */ -+ shr = bit_offset + bits - 64; -+ BUG_ON(shr > bit_offset); -+ -+ /* shr rax, bit_offset */ -+ I4(0x48, 0xc1, 0xe8, shr); -+ -+ /* shl rdx, imm8 */ -+ I4(0x48, 0xc1, 0xe2, 64 - shr); -+ -+ /* or rax, rdx */ -+ I3(0x48, 0x09, 0xd0); -+ -+ shr = bit_offset - shr; -+ -+ if (shr) { -+ /* shr rax, imm8 */ -+ I4(0x48, 0xc1, 0xe8, shr); -+ } -+ } -+ -+ /* rax += offset: */ -+ if (offset > S32_MAX) { -+ /* mov rdx, imm64 */ -+ I2(0x48, 0xba); -+ memcpy(out, &offset, 8); -+ out += 8; -+ /* add %rdx, %rax */ -+ I3(0x48, 0x01, 0xd0); -+ } else if (offset + (~0ULL >> (64 - bits)) > U32_MAX) { -+ /* add rax, imm32 */ -+ I2(0x48, 0x05); -+ memcpy(out, &offset, 4); -+ out += 4; -+ } else if (offset) { -+ /* add eax, imm32 */ -+ I1(0x05); -+ memcpy(out, &offset, 4); -+ out += 4; -+ } -+set_field: -+ switch (dst_size) { -+ case 8: -+ /* mov [rdi + dst_offset], rax */ -+ I4(0x48, 0x89, 0x47, dst_offset); -+ break; -+ case 4: -+ /* mov [rdi + dst_offset], eax */ -+ I3(0x89, 0x47, dst_offset); -+ break; -+ default: -+ BUG(); -+ } -+ -+ return out; -+} -+ -+int bch2_compile_bkey_format(const struct bkey_format *format, void *_out) -+{ -+ bool eax_zeroed = false; -+ u8 *out = _out; -+ -+ /* -+ * rdi: dst - unpacked key -+ * rsi: src - packed key -+ */ -+ -+ /* k->u64s, k->format, k->type */ -+ -+ /* mov eax, [rsi] */ -+ I2(0x8b, 0x06); -+ -+ /* add eax, BKEY_U64s - format->key_u64s */ -+ I5(0x05, BKEY_U64s - format->key_u64s, KEY_FORMAT_CURRENT, 0, 0); -+ -+ /* and eax, imm32: mask out k->pad: */ -+ I5(0x25, 0xff, 0xff, 0xff, 0); -+ -+ /* mov [rdi], eax */ -+ I2(0x89, 0x07); -+ -+#define x(id, field) \ -+ out = compile_bkey_field(format, out, id, \ -+ offsetof(struct bkey, field), \ -+ sizeof(((struct bkey *) NULL)->field), \ -+ &eax_zeroed); -+ bkey_fields() 
-+#undef x -+ -+ /* retq */ -+ I1(0xc3); -+ -+ return (void *) out - _out; -+} -+ -+#else -+static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, -+ unsigned nr_key_bits) -+{ -+ u64 l_v, r_v; -+ -+ if (!nr_key_bits) -+ return 0; -+ -+ /* for big endian, skip past header */ -+ nr_key_bits += high_bit_offset; -+ l_v = *l & (~0ULL >> high_bit_offset); -+ r_v = *r & (~0ULL >> high_bit_offset); -+ -+ while (1) { -+ if (nr_key_bits < 64) { -+ l_v >>= 64 - nr_key_bits; -+ r_v >>= 64 - nr_key_bits; -+ nr_key_bits = 0; -+ } else { -+ nr_key_bits -= 64; -+ } -+ -+ if (!nr_key_bits || l_v != r_v) -+ break; -+ -+ l = next_word(l); -+ r = next_word(r); -+ -+ l_v = *l; -+ r_v = *r; -+ } -+ -+ return cmp_int(l_v, r_v); -+} -+#endif -+ -+__pure -+int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l, -+ const struct bkey_packed *r, -+ const struct btree *b) -+{ -+ const struct bkey_format *f = &b->format; -+ int ret; -+ -+ EBUG_ON(!bkey_packed(l) || !bkey_packed(r)); -+ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); -+ -+ ret = __bkey_cmp_bits(high_word(f, l), -+ high_word(f, r), -+ b->nr_key_bits); -+ -+ EBUG_ON(ret != bkey_cmp(bkey_unpack_pos(b, l), -+ bkey_unpack_pos(b, r))); -+ return ret; -+} -+ -+__pure __flatten -+int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b, -+ const struct bkey_packed *l, -+ const struct bpos *r) -+{ -+ return bkey_cmp(bkey_unpack_pos_format_checked(b, l), *r); -+} -+ -+__pure __flatten -+int __bch2_bkey_cmp_packed(const struct bkey_packed *l, -+ const struct bkey_packed *r, -+ const struct btree *b) -+{ -+ struct bkey unpacked; -+ -+ if (likely(bkey_packed(l) && bkey_packed(r))) -+ return __bch2_bkey_cmp_packed_format_checked(l, r, b); -+ -+ if (bkey_packed(l)) { -+ __bkey_unpack_key_format_checked(b, &unpacked, l); -+ l = (void*) &unpacked; -+ } else if (bkey_packed(r)) { -+ __bkey_unpack_key_format_checked(b, &unpacked, r); -+ r = (void*) &unpacked; -+ } -+ -+ return bkey_cmp(((struct bkey *) l)->p, 
((struct bkey *) r)->p); -+} -+ -+__pure __flatten -+int __bch2_bkey_cmp_left_packed(const struct btree *b, -+ const struct bkey_packed *l, -+ const struct bpos *r) -+{ -+ const struct bkey *l_unpacked; -+ -+ return unlikely(l_unpacked = packed_to_bkey_c(l)) -+ ? bkey_cmp(l_unpacked->p, *r) -+ : __bch2_bkey_cmp_left_packed_format_checked(b, l, r); -+} -+ -+void bch2_bpos_swab(struct bpos *p) -+{ -+ u8 *l = (u8 *) p; -+ u8 *h = ((u8 *) &p[1]) - 1; -+ -+ while (l < h) { -+ swap(*l, *h); -+ l++; -+ --h; -+ } -+} -+ -+void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k) -+{ -+ const struct bkey_format *f = bkey_packed(k) ? _f : &bch2_bkey_format_current; -+ u8 *l = k->key_start; -+ u8 *h = (u8 *) (k->_data + f->key_u64s) - 1; -+ -+ while (l < h) { -+ swap(*l, *h); -+ l++; -+ --h; -+ } -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_bkey_pack_test(void) -+{ -+ struct bkey t = KEY(4134ULL, 1250629070527416633ULL, 0); -+ struct bkey_packed p; -+ -+ struct bkey_format test_format = { -+ .key_u64s = 2, -+ .nr_fields = BKEY_NR_FIELDS, -+ .bits_per_field = { -+ 13, -+ 64, -+ }, -+ }; -+ -+ struct unpack_state in_s = -+ unpack_state_init(&bch2_bkey_format_current, (void *) &t); -+ struct pack_state out_s = pack_state_init(&test_format, &p); -+ unsigned i; -+ -+ for (i = 0; i < out_s.format->nr_fields; i++) { -+ u64 a, v = get_inc_field(&in_s, i); -+ -+ switch (i) { -+#define x(id, field) case id: a = t.field; break; -+ bkey_fields() -+#undef x -+ default: -+ BUG(); -+ } -+ -+ if (a != v) -+ panic("got %llu actual %llu i %u\n", v, a, i); -+ -+ if (!set_inc_field(&out_s, i, v)) -+ panic("failed at %u\n", i); -+ } -+ -+ BUG_ON(!bch2_bkey_pack_key(&p, &t, &test_format)); -+} -+#endif -diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h -new file mode 100644 -index 000000000000..cbcfbd26bc58 ---- /dev/null -+++ b/fs/bcachefs/bkey.h -@@ -0,0 +1,605 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BKEY_H -+#define _BCACHEFS_BKEY_H -+ 
-+#include -+#include "bcachefs_format.h" -+ -+#include "util.h" -+#include "vstructs.h" -+ -+#ifdef CONFIG_X86_64 -+#define HAVE_BCACHEFS_COMPILED_UNPACK 1 -+#endif -+ -+void bch2_to_binary(char *, const u64 *, unsigned); -+ -+/* bkey with split value, const */ -+struct bkey_s_c { -+ const struct bkey *k; -+ const struct bch_val *v; -+}; -+ -+/* bkey with split value */ -+struct bkey_s { -+ union { -+ struct { -+ struct bkey *k; -+ struct bch_val *v; -+ }; -+ struct bkey_s_c s_c; -+ }; -+}; -+ -+#define bkey_next(_k) vstruct_next(_k) -+ -+static inline struct bkey_packed *bkey_next_skip_noops(struct bkey_packed *k, -+ struct bkey_packed *end) -+{ -+ k = bkey_next(k); -+ -+ while (k != end && !k->u64s) -+ k = (void *) ((u64 *) k + 1); -+ return k; -+} -+ -+#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) -+ -+static inline size_t bkey_val_bytes(const struct bkey *k) -+{ -+ return bkey_val_u64s(k) * sizeof(u64); -+} -+ -+static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s) -+{ -+ k->u64s = BKEY_U64s + val_u64s; -+} -+ -+static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) -+{ -+ k->u64s = BKEY_U64s + DIV_ROUND_UP(bytes, sizeof(u64)); -+} -+ -+#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k))) -+ -+#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted) -+ -+#define bkey_whiteout(_k) \ -+ ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_discard) -+ -+#define bkey_packed_typecheck(_k) \ -+({ \ -+ BUILD_BUG_ON(!type_is(_k, struct bkey *) && \ -+ !type_is(_k, struct bkey_packed *)); \ -+ type_is(_k, struct bkey_packed *); \ -+}) -+ -+enum bkey_lr_packed { -+ BKEY_PACKED_BOTH, -+ BKEY_PACKED_RIGHT, -+ BKEY_PACKED_LEFT, -+ BKEY_PACKED_NONE, -+}; -+ -+#define bkey_lr_packed_typecheck(_l, _r) \ -+ (!bkey_packed_typecheck(_l) + ((!bkey_packed_typecheck(_r)) << 1)) -+ -+#define bkey_lr_packed(_l, _r) \ -+ ((_l)->format + ((_r)->format << 1)) -+ -+#define bkey_copy(_dst, _src) \ -+do { \ -+ 
BUILD_BUG_ON(!type_is(_dst, struct bkey_i *) && \ -+ !type_is(_dst, struct bkey_packed *)); \ -+ BUILD_BUG_ON(!type_is(_src, struct bkey_i *) && \ -+ !type_is(_src, struct bkey_packed *)); \ -+ EBUG_ON((u64 *) (_dst) > (u64 *) (_src) && \ -+ (u64 *) (_dst) < (u64 *) (_src) + \ -+ ((struct bkey *) (_src))->u64s); \ -+ \ -+ memcpy_u64s_small((_dst), (_src), \ -+ ((struct bkey *) (_src))->u64s); \ -+} while (0) -+ -+struct btree; -+ -+struct bkey_format_state { -+ u64 field_min[BKEY_NR_FIELDS]; -+ u64 field_max[BKEY_NR_FIELDS]; -+}; -+ -+void bch2_bkey_format_init(struct bkey_format_state *); -+void bch2_bkey_format_add_key(struct bkey_format_state *, const struct bkey *); -+void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos); -+struct bkey_format bch2_bkey_format_done(struct bkey_format_state *); -+const char *bch2_bkey_format_validate(struct bkey_format *); -+ -+__pure -+unsigned bch2_bkey_greatest_differing_bit(const struct btree *, -+ const struct bkey_packed *, -+ const struct bkey_packed *); -+__pure -+unsigned bch2_bkey_ffs(const struct btree *, const struct bkey_packed *); -+ -+__pure -+int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *, -+ const struct bkey_packed *, -+ const struct btree *); -+ -+__pure -+int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *, -+ const struct bkey_packed *, -+ const struct bpos *); -+ -+__pure -+int __bch2_bkey_cmp_packed(const struct bkey_packed *, -+ const struct bkey_packed *, -+ const struct btree *); -+ -+__pure -+int __bch2_bkey_cmp_left_packed(const struct btree *, -+ const struct bkey_packed *, -+ const struct bpos *); -+ -+static inline __pure -+int bkey_cmp_left_packed(const struct btree *b, -+ const struct bkey_packed *l, const struct bpos *r) -+{ -+ return __bch2_bkey_cmp_left_packed(b, l, r); -+} -+ -+/* -+ * we prefer to pass bpos by ref, but it's often enough terribly convenient to -+ * pass it by by val... 
as much as I hate c++, const ref would be nice here: -+ */ -+__pure __flatten -+static inline int bkey_cmp_left_packed_byval(const struct btree *b, -+ const struct bkey_packed *l, -+ struct bpos r) -+{ -+ return bkey_cmp_left_packed(b, l, &r); -+} -+ -+/* -+ * If @_l or @_r are struct bkey * (not bkey_packed *), uses type information to -+ * skip dispatching on k->format: -+ */ -+#define bkey_cmp_packed(_b, _l, _r) \ -+({ \ -+ int _cmp; \ -+ \ -+ switch (bkey_lr_packed_typecheck(_l, _r)) { \ -+ case BKEY_PACKED_NONE: \ -+ _cmp = bkey_cmp(((struct bkey *) (_l))->p, \ -+ ((struct bkey *) (_r))->p); \ -+ break; \ -+ case BKEY_PACKED_LEFT: \ -+ _cmp = bkey_cmp_left_packed((_b), \ -+ (struct bkey_packed *) (_l), \ -+ &((struct bkey *) (_r))->p); \ -+ break; \ -+ case BKEY_PACKED_RIGHT: \ -+ _cmp = -bkey_cmp_left_packed((_b), \ -+ (struct bkey_packed *) (_r), \ -+ &((struct bkey *) (_l))->p); \ -+ break; \ -+ case BKEY_PACKED_BOTH: \ -+ _cmp = __bch2_bkey_cmp_packed((void *) (_l), \ -+ (void *) (_r), (_b)); \ -+ break; \ -+ } \ -+ _cmp; \ -+}) -+ -+#if 1 -+static __always_inline int bkey_cmp(struct bpos l, struct bpos r) -+{ -+ if (l.inode != r.inode) -+ return l.inode < r.inode ? -1 : 1; -+ if (l.offset != r.offset) -+ return l.offset < r.offset ? -1 : 1; -+ if (l.snapshot != r.snapshot) -+ return l.snapshot < r.snapshot ? -1 : 1; -+ return 0; -+} -+#else -+int bkey_cmp(struct bpos l, struct bpos r); -+#endif -+ -+static inline struct bpos bpos_min(struct bpos l, struct bpos r) -+{ -+ return bkey_cmp(l, r) < 0 ? 
l : r; -+} -+ -+void bch2_bpos_swab(struct bpos *); -+void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); -+ -+static __always_inline int bversion_cmp(struct bversion l, struct bversion r) -+{ -+ return cmp_int(l.hi, r.hi) ?: -+ cmp_int(l.lo, r.lo); -+} -+ -+#define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 }) -+#define MAX_VERSION ((struct bversion) { .hi = ~0, .lo = ~0ULL }) -+ -+static __always_inline int bversion_zero(struct bversion v) -+{ -+ return !bversion_cmp(v, ZERO_VERSION); -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+/* statement expressions confusing unlikely()? */ -+#define bkey_packed(_k) \ -+ ({ EBUG_ON((_k)->format > KEY_FORMAT_CURRENT); \ -+ (_k)->format != KEY_FORMAT_CURRENT; }) -+#else -+#define bkey_packed(_k) ((_k)->format != KEY_FORMAT_CURRENT) -+#endif -+ -+/* -+ * It's safe to treat an unpacked bkey as a packed one, but not the reverse -+ */ -+static inline struct bkey_packed *bkey_to_packed(struct bkey_i *k) -+{ -+ return (struct bkey_packed *) k; -+} -+ -+static inline const struct bkey_packed *bkey_to_packed_c(const struct bkey_i *k) -+{ -+ return (const struct bkey_packed *) k; -+} -+ -+static inline struct bkey_i *packed_to_bkey(struct bkey_packed *k) -+{ -+ return bkey_packed(k) ? NULL : (struct bkey_i *) k; -+} -+ -+static inline const struct bkey *packed_to_bkey_c(const struct bkey_packed *k) -+{ -+ return bkey_packed(k) ? 
NULL : (const struct bkey *) k; -+} -+ -+static inline unsigned bkey_format_key_bits(const struct bkey_format *format) -+{ -+ return format->bits_per_field[BKEY_FIELD_INODE] + -+ format->bits_per_field[BKEY_FIELD_OFFSET] + -+ format->bits_per_field[BKEY_FIELD_SNAPSHOT]; -+} -+ -+static inline struct bpos bkey_successor(struct bpos p) -+{ -+ struct bpos ret = p; -+ -+ if (!++ret.offset) -+ BUG_ON(!++ret.inode); -+ -+ return ret; -+} -+ -+static inline struct bpos bkey_predecessor(struct bpos p) -+{ -+ struct bpos ret = p; -+ -+ if (!ret.offset--) -+ BUG_ON(!ret.inode--); -+ -+ return ret; -+} -+ -+static inline u64 bkey_start_offset(const struct bkey *k) -+{ -+ return k->p.offset - k->size; -+} -+ -+static inline struct bpos bkey_start_pos(const struct bkey *k) -+{ -+ return (struct bpos) { -+ .inode = k->p.inode, -+ .offset = bkey_start_offset(k), -+ .snapshot = k->p.snapshot, -+ }; -+} -+ -+/* Packed helpers */ -+ -+static inline unsigned bkeyp_key_u64s(const struct bkey_format *format, -+ const struct bkey_packed *k) -+{ -+ unsigned ret = bkey_packed(k) ? 
format->key_u64s : BKEY_U64s; -+ -+ EBUG_ON(k->u64s < ret); -+ return ret; -+} -+ -+static inline unsigned bkeyp_key_bytes(const struct bkey_format *format, -+ const struct bkey_packed *k) -+{ -+ return bkeyp_key_u64s(format, k) * sizeof(u64); -+} -+ -+static inline unsigned bkeyp_val_u64s(const struct bkey_format *format, -+ const struct bkey_packed *k) -+{ -+ return k->u64s - bkeyp_key_u64s(format, k); -+} -+ -+static inline size_t bkeyp_val_bytes(const struct bkey_format *format, -+ const struct bkey_packed *k) -+{ -+ return bkeyp_val_u64s(format, k) * sizeof(u64); -+} -+ -+static inline void set_bkeyp_val_u64s(const struct bkey_format *format, -+ struct bkey_packed *k, unsigned val_u64s) -+{ -+ k->u64s = bkeyp_key_u64s(format, k) + val_u64s; -+} -+ -+#define bkeyp_val(_format, _k) \ -+ ((struct bch_val *) ((_k)->_data + bkeyp_key_u64s(_format, _k))) -+ -+extern const struct bkey_format bch2_bkey_format_current; -+ -+bool bch2_bkey_transform(const struct bkey_format *, -+ struct bkey_packed *, -+ const struct bkey_format *, -+ const struct bkey_packed *); -+ -+struct bkey __bch2_bkey_unpack_key(const struct bkey_format *, -+ const struct bkey_packed *); -+ -+#ifndef HAVE_BCACHEFS_COMPILED_UNPACK -+struct bpos __bkey_unpack_pos(const struct bkey_format *, -+ const struct bkey_packed *); -+#endif -+ -+bool bch2_bkey_pack_key(struct bkey_packed *, const struct bkey *, -+ const struct bkey_format *); -+ -+enum bkey_pack_pos_ret { -+ BKEY_PACK_POS_EXACT, -+ BKEY_PACK_POS_SMALLER, -+ BKEY_PACK_POS_FAIL, -+}; -+ -+enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *, struct bpos, -+ const struct btree *); -+ -+static inline bool bkey_pack_pos(struct bkey_packed *out, struct bpos in, -+ const struct btree *b) -+{ -+ return bch2_bkey_pack_pos_lossy(out, in, b) == BKEY_PACK_POS_EXACT; -+} -+ -+void bch2_bkey_unpack(const struct btree *, struct bkey_i *, -+ const struct bkey_packed *); -+bool bch2_bkey_pack(struct bkey_packed *, const struct bkey_i *, -+ 
const struct bkey_format *); -+ -+static inline u64 bkey_field_max(const struct bkey_format *f, -+ enum bch_bkey_fields nr) -+{ -+ return f->bits_per_field[nr] < 64 -+ ? (le64_to_cpu(f->field_offset[nr]) + -+ ~(~0ULL << f->bits_per_field[nr])) -+ : U64_MAX; -+} -+ -+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK -+ -+int bch2_compile_bkey_format(const struct bkey_format *, void *); -+ -+#else -+ -+static inline int bch2_compile_bkey_format(const struct bkey_format *format, -+ void *out) { return 0; } -+ -+#endif -+ -+static inline void bkey_reassemble(struct bkey_i *dst, -+ struct bkey_s_c src) -+{ -+ dst->k = *src.k; -+ memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k)); -+} -+ -+#define bkey_s_null ((struct bkey_s) { .k = NULL }) -+#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL }) -+ -+#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) }) -+#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) }) -+ -+static inline struct bkey_s bkey_to_s(struct bkey *k) -+{ -+ return (struct bkey_s) { .k = k, .v = NULL }; -+} -+ -+static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k) -+{ -+ return (struct bkey_s_c) { .k = k, .v = NULL }; -+} -+ -+static inline struct bkey_s bkey_i_to_s(struct bkey_i *k) -+{ -+ return (struct bkey_s) { .k = &k->k, .v = &k->v }; -+} -+ -+static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k) -+{ -+ return (struct bkey_s_c) { .k = &k->k, .v = &k->v }; -+} -+ -+/* -+ * For a given type of value (e.g. struct bch_extent), generates the types for -+ * bkey + bch_extent - inline, split, split const - and also all the conversion -+ * functions, which also check that the value is of the correct type. -+ * -+ * We use anonymous unions for upcasting - e.g. converting from e.g. a -+ * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion -+ * functions. 
-+ */ -+#define BKEY_VAL_ACCESSORS(name) \ -+struct bkey_i_##name { \ -+ union { \ -+ struct bkey k; \ -+ struct bkey_i k_i; \ -+ }; \ -+ struct bch_##name v; \ -+}; \ -+ \ -+struct bkey_s_c_##name { \ -+ union { \ -+ struct { \ -+ const struct bkey *k; \ -+ const struct bch_##name *v; \ -+ }; \ -+ struct bkey_s_c s_c; \ -+ }; \ -+}; \ -+ \ -+struct bkey_s_##name { \ -+ union { \ -+ struct { \ -+ struct bkey *k; \ -+ struct bch_##name *v; \ -+ }; \ -+ struct bkey_s_c_##name c; \ -+ struct bkey_s s; \ -+ struct bkey_s_c s_c; \ -+ }; \ -+}; \ -+ \ -+static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \ -+{ \ -+ EBUG_ON(k->k.type != KEY_TYPE_##name); \ -+ return container_of(&k->k, struct bkey_i_##name, k); \ -+} \ -+ \ -+static inline const struct bkey_i_##name * \ -+bkey_i_to_##name##_c(const struct bkey_i *k) \ -+{ \ -+ EBUG_ON(k->k.type != KEY_TYPE_##name); \ -+ return container_of(&k->k, struct bkey_i_##name, k); \ -+} \ -+ \ -+static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \ -+{ \ -+ EBUG_ON(k.k->type != KEY_TYPE_##name); \ -+ return (struct bkey_s_##name) { \ -+ .k = k.k, \ -+ .v = container_of(k.v, struct bch_##name, v), \ -+ }; \ -+} \ -+ \ -+static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\ -+{ \ -+ EBUG_ON(k.k->type != KEY_TYPE_##name); \ -+ return (struct bkey_s_c_##name) { \ -+ .k = k.k, \ -+ .v = container_of(k.v, struct bch_##name, v), \ -+ }; \ -+} \ -+ \ -+static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\ -+{ \ -+ return (struct bkey_s_##name) { \ -+ .k = &k->k, \ -+ .v = &k->v, \ -+ }; \ -+} \ -+ \ -+static inline struct bkey_s_c_##name \ -+name##_i_to_s_c(const struct bkey_i_##name *k) \ -+{ \ -+ return (struct bkey_s_c_##name) { \ -+ .k = &k->k, \ -+ .v = &k->v, \ -+ }; \ -+} \ -+ \ -+static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \ -+{ \ -+ EBUG_ON(k->k.type != KEY_TYPE_##name); \ -+ return (struct bkey_s_##name) { \ -+ .k = &k->k, 
\ -+ .v = container_of(&k->v, struct bch_##name, v), \ -+ }; \ -+} \ -+ \ -+static inline struct bkey_s_c_##name \ -+bkey_i_to_s_c_##name(const struct bkey_i *k) \ -+{ \ -+ EBUG_ON(k->k.type != KEY_TYPE_##name); \ -+ return (struct bkey_s_c_##name) { \ -+ .k = &k->k, \ -+ .v = container_of(&k->v, struct bch_##name, v), \ -+ }; \ -+} \ -+ \ -+static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ -+{ \ -+ struct bkey_i_##name *k = \ -+ container_of(&_k->k, struct bkey_i_##name, k); \ -+ \ -+ bkey_init(&k->k); \ -+ memset(&k->v, 0, sizeof(k->v)); \ -+ k->k.type = KEY_TYPE_##name; \ -+ set_bkey_val_bytes(&k->k, sizeof(k->v)); \ -+ \ -+ return k; \ -+} -+ -+BKEY_VAL_ACCESSORS(cookie); -+BKEY_VAL_ACCESSORS(btree_ptr); -+BKEY_VAL_ACCESSORS(extent); -+BKEY_VAL_ACCESSORS(reservation); -+BKEY_VAL_ACCESSORS(inode); -+BKEY_VAL_ACCESSORS(inode_generation); -+BKEY_VAL_ACCESSORS(dirent); -+BKEY_VAL_ACCESSORS(xattr); -+BKEY_VAL_ACCESSORS(alloc); -+BKEY_VAL_ACCESSORS(quota); -+BKEY_VAL_ACCESSORS(stripe); -+BKEY_VAL_ACCESSORS(reflink_p); -+BKEY_VAL_ACCESSORS(reflink_v); -+BKEY_VAL_ACCESSORS(inline_data); -+BKEY_VAL_ACCESSORS(btree_ptr_v2); -+ -+/* byte order helpers */ -+ -+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -+ -+static inline unsigned high_word_offset(const struct bkey_format *f) -+{ -+ return f->key_u64s - 1; -+} -+ -+#define high_bit_offset 0 -+#define nth_word(p, n) ((p) - (n)) -+ -+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -+ -+static inline unsigned high_word_offset(const struct bkey_format *f) -+{ -+ return 0; -+} -+ -+#define high_bit_offset KEY_PACKED_BITS_START -+#define nth_word(p, n) ((p) + (n)) -+ -+#else -+#error edit for your odd byteorder. 
-+#endif -+ -+#define high_word(f, k) ((k)->_data + high_word_offset(f)) -+#define next_word(p) nth_word(p, 1) -+#define prev_word(p) nth_word(p, -1) -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_bkey_pack_test(void); -+#else -+static inline void bch2_bkey_pack_test(void) {} -+#endif -+ -+#endif /* _BCACHEFS_BKEY_H */ -diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c -new file mode 100644 -index 000000000000..36e0c5152b47 ---- /dev/null -+++ b/fs/bcachefs/bkey_methods.c -@@ -0,0 +1,353 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "bkey_methods.h" -+#include "btree_types.h" -+#include "alloc_background.h" -+#include "dirent.h" -+#include "ec.h" -+#include "error.h" -+#include "extents.h" -+#include "inode.h" -+#include "quota.h" -+#include "reflink.h" -+#include "xattr.h" -+ -+const char * const bch2_bkey_types[] = { -+#define x(name, nr) #name, -+ BCH_BKEY_TYPES() -+#undef x -+ NULL -+}; -+ -+static const char *deleted_key_invalid(const struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ return NULL; -+} -+ -+#define bch2_bkey_ops_deleted (struct bkey_ops) { \ -+ .key_invalid = deleted_key_invalid, \ -+} -+ -+#define bch2_bkey_ops_discard (struct bkey_ops) { \ -+ .key_invalid = deleted_key_invalid, \ -+} -+ -+static const char *empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k) -+{ -+ if (bkey_val_bytes(k.k)) -+ return "value size should be zero"; -+ -+ return NULL; -+} -+ -+#define bch2_bkey_ops_error (struct bkey_ops) { \ -+ .key_invalid = empty_val_key_invalid, \ -+} -+ -+static const char *key_type_cookie_invalid(const struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ if (bkey_val_bytes(k.k) != sizeof(struct bch_cookie)) -+ return "incorrect value size"; -+ -+ return NULL; -+} -+ -+#define bch2_bkey_ops_cookie (struct bkey_ops) { \ -+ .key_invalid = key_type_cookie_invalid, \ -+} -+ -+#define bch2_bkey_ops_whiteout (struct bkey_ops) { \ -+ .key_invalid = empty_val_key_invalid, \ -+} -+ -+static const 
char *key_type_inline_data_invalid(const struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ return NULL; -+} -+ -+static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ pr_buf(out, "(%zu bytes)", bkey_val_bytes(k.k)); -+} -+ -+#define bch2_bkey_ops_inline_data (struct bkey_ops) { \ -+ .key_invalid = key_type_inline_data_invalid, \ -+ .val_to_text = key_type_inline_data_to_text, \ -+} -+ -+static const struct bkey_ops bch2_bkey_ops[] = { -+#define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name, -+ BCH_BKEY_TYPES() -+#undef x -+}; -+ -+const char *bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k) -+{ -+ if (k.k->type >= KEY_TYPE_MAX) -+ return "invalid type"; -+ -+ return bch2_bkey_ops[k.k->type].key_invalid(c, k); -+} -+ -+const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, -+ enum btree_node_type type) -+{ -+ if (k.k->u64s < BKEY_U64s) -+ return "u64s too small"; -+ -+ if (type == BKEY_TYPE_BTREE && -+ bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) -+ return "value too big"; -+ -+ if (btree_node_type_is_extents(type)) { -+ if ((k.k->size == 0) != bkey_deleted(k.k)) -+ return "bad size field"; -+ -+ if (k.k->size > k.k->p.offset) -+ return "size greater than offset"; -+ } else { -+ if (k.k->size) -+ return "nonzero size field"; -+ } -+ -+ if (k.k->p.snapshot) -+ return "nonzero snapshot"; -+ -+ if (type != BKEY_TYPE_BTREE && -+ !bkey_cmp(k.k->p, POS_MAX)) -+ return "POS_MAX key"; -+ -+ return NULL; -+} -+ -+const char *bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, -+ enum btree_node_type type) -+{ -+ return __bch2_bkey_invalid(c, k, type) ?: -+ bch2_bkey_val_invalid(c, k); -+} -+ -+const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k) -+{ -+ if (bkey_cmp(k.k->p, b->data->min_key) < 0) -+ return "key before start of btree node"; -+ -+ if (bkey_cmp(k.k->p, b->data->max_key) > 0) -+ return "key past end of btree node"; -+ -+ return NULL; -+} -+ -+void 
bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) -+{ -+ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; -+ const char *invalid; -+ -+ BUG_ON(!k.k->u64s); -+ -+ invalid = bch2_bkey_invalid(c, k, btree_node_type(b)) ?: -+ bch2_bkey_in_btree_node(b, k); -+ if (invalid) { -+ char buf[160]; -+ -+ bch2_bkey_val_to_text(&PBUF(buf), c, k); -+ bch2_fs_inconsistent(c, "invalid bkey %s: %s", buf, invalid); -+ return; -+ } -+ -+ if (ops->key_debugcheck) -+ ops->key_debugcheck(c, k); -+} -+ -+void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) -+{ -+ if (!bkey_cmp(pos, POS_MIN)) -+ pr_buf(out, "POS_MIN"); -+ else if (!bkey_cmp(pos, POS_MAX)) -+ pr_buf(out, "POS_MAX"); -+ else -+ pr_buf(out, "%llu:%llu", pos.inode, pos.offset); -+} -+ -+void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) -+{ -+ if (k) { -+ pr_buf(out, "u64s %u type %s ", k->u64s, -+ bch2_bkey_types[k->type]); -+ -+ bch2_bpos_to_text(out, k->p); -+ -+ pr_buf(out, " snap %u len %u ver %llu", -+ k->p.snapshot, k->size, k->version.lo); -+ } else { -+ pr_buf(out, "(null)"); -+ } -+} -+ -+void bch2_val_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; -+ -+ if (likely(ops->val_to_text)) -+ ops->val_to_text(out, c, k); -+} -+ -+void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ bch2_bkey_to_text(out, k.k); -+ -+ if (k.k) { -+ pr_buf(out, ": "); -+ bch2_val_to_text(out, c, k); -+ } -+} -+ -+void bch2_bkey_swab_val(struct bkey_s k) -+{ -+ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; -+ -+ if (ops->swab) -+ ops->swab(k); -+} -+ -+bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k) -+{ -+ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; -+ -+ return ops->key_normalize -+ ? 
ops->key_normalize(c, k) -+ : false; -+} -+ -+enum merge_result bch2_bkey_merge(struct bch_fs *c, -+ struct bkey_s l, struct bkey_s r) -+{ -+ const struct bkey_ops *ops = &bch2_bkey_ops[l.k->type]; -+ enum merge_result ret; -+ -+ if (key_merging_disabled(c) || -+ !ops->key_merge || -+ l.k->type != r.k->type || -+ bversion_cmp(l.k->version, r.k->version) || -+ bkey_cmp(l.k->p, bkey_start_pos(r.k))) -+ return BCH_MERGE_NOMERGE; -+ -+ ret = ops->key_merge(c, l, r); -+ -+ if (ret != BCH_MERGE_NOMERGE) -+ l.k->needs_whiteout |= r.k->needs_whiteout; -+ return ret; -+} -+ -+static const struct old_bkey_type { -+ u8 btree_node_type; -+ u8 old; -+ u8 new; -+} bkey_renumber_table[] = { -+ {BKEY_TYPE_BTREE, 128, KEY_TYPE_btree_ptr }, -+ {BKEY_TYPE_EXTENTS, 128, KEY_TYPE_extent }, -+ {BKEY_TYPE_EXTENTS, 129, KEY_TYPE_extent }, -+ {BKEY_TYPE_EXTENTS, 130, KEY_TYPE_reservation }, -+ {BKEY_TYPE_INODES, 128, KEY_TYPE_inode }, -+ {BKEY_TYPE_INODES, 130, KEY_TYPE_inode_generation }, -+ {BKEY_TYPE_DIRENTS, 128, KEY_TYPE_dirent }, -+ {BKEY_TYPE_DIRENTS, 129, KEY_TYPE_whiteout }, -+ {BKEY_TYPE_XATTRS, 128, KEY_TYPE_xattr }, -+ {BKEY_TYPE_XATTRS, 129, KEY_TYPE_whiteout }, -+ {BKEY_TYPE_ALLOC, 128, KEY_TYPE_alloc }, -+ {BKEY_TYPE_QUOTAS, 128, KEY_TYPE_quota }, -+}; -+ -+void bch2_bkey_renumber(enum btree_node_type btree_node_type, -+ struct bkey_packed *k, -+ int write) -+{ -+ const struct old_bkey_type *i; -+ -+ for (i = bkey_renumber_table; -+ i < bkey_renumber_table + ARRAY_SIZE(bkey_renumber_table); -+ i++) -+ if (btree_node_type == i->btree_node_type && -+ k->type == (write ? i->new : i->old)) { -+ k->type = write ? 
i->old : i->new; -+ break; -+ } -+} -+ -+void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, -+ unsigned version, unsigned big_endian, -+ int write, -+ struct bkey_format *f, -+ struct bkey_packed *k) -+{ -+ const struct bkey_ops *ops; -+ struct bkey uk; -+ struct bkey_s u; -+ int i; -+ -+ /* -+ * Do these operations in reverse order in the write path: -+ */ -+ -+ for (i = 0; i < 4; i++) -+ switch (!write ? i : 3 - i) { -+ case 0: -+ if (big_endian != CPU_BIG_ENDIAN) -+ bch2_bkey_swab_key(f, k); -+ break; -+ case 1: -+ if (version < bcachefs_metadata_version_bkey_renumber) -+ bch2_bkey_renumber(__btree_node_type(level, btree_id), k, write); -+ break; -+ case 2: -+ if (version < bcachefs_metadata_version_inode_btree_change && -+ btree_id == BTREE_ID_INODES) { -+ if (!bkey_packed(k)) { -+ struct bkey_i *u = packed_to_bkey(k); -+ swap(u->k.p.inode, u->k.p.offset); -+ } else if (f->bits_per_field[BKEY_FIELD_INODE] && -+ f->bits_per_field[BKEY_FIELD_OFFSET]) { -+ struct bkey_format tmp = *f, *in = f, *out = &tmp; -+ -+ swap(tmp.bits_per_field[BKEY_FIELD_INODE], -+ tmp.bits_per_field[BKEY_FIELD_OFFSET]); -+ swap(tmp.field_offset[BKEY_FIELD_INODE], -+ tmp.field_offset[BKEY_FIELD_OFFSET]); -+ -+ if (!write) -+ swap(in, out); -+ -+ uk = __bch2_bkey_unpack_key(in, k); -+ swap(uk.p.inode, uk.p.offset); -+ BUG_ON(!bch2_bkey_pack_key(k, &uk, out)); -+ } -+ } -+ break; -+ case 3: -+ if (!bkey_packed(k)) { -+ u = bkey_i_to_s(packed_to_bkey(k)); -+ } else { -+ uk = __bch2_bkey_unpack_key(f, k); -+ u.k = &uk; -+ u.v = bkeyp_val(f, k); -+ } -+ -+ if (big_endian != CPU_BIG_ENDIAN) -+ bch2_bkey_swab_val(u); -+ -+ ops = &bch2_bkey_ops[k->type]; -+ -+ if (ops->compat) -+ ops->compat(btree_id, version, big_endian, write, u); -+ break; -+ default: -+ BUG(); -+ } -+} -diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h -new file mode 100644 -index 000000000000..0bca725ae3b8 ---- /dev/null -+++ b/fs/bcachefs/bkey_methods.h -@@ -0,0 +1,82 @@ -+/* 
SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BKEY_METHODS_H -+#define _BCACHEFS_BKEY_METHODS_H -+ -+#include "bkey.h" -+ -+struct bch_fs; -+struct btree; -+struct bkey; -+enum btree_node_type; -+ -+extern const char * const bch2_bkey_types[]; -+ -+enum merge_result { -+ BCH_MERGE_NOMERGE, -+ -+ /* -+ * The keys were mergeable, but would have overflowed size - so instead -+ * l was changed to the maximum size, and both keys were modified: -+ */ -+ BCH_MERGE_PARTIAL, -+ BCH_MERGE_MERGE, -+}; -+ -+struct bkey_ops { -+ /* Returns reason for being invalid if invalid, else NULL: */ -+ const char * (*key_invalid)(const struct bch_fs *, -+ struct bkey_s_c); -+ void (*key_debugcheck)(struct bch_fs *, struct bkey_s_c); -+ void (*val_to_text)(struct printbuf *, struct bch_fs *, -+ struct bkey_s_c); -+ void (*swab)(struct bkey_s); -+ bool (*key_normalize)(struct bch_fs *, struct bkey_s); -+ enum merge_result (*key_merge)(struct bch_fs *, -+ struct bkey_s, struct bkey_s); -+ void (*compat)(enum btree_id id, unsigned version, -+ unsigned big_endian, int write, -+ struct bkey_s); -+}; -+ -+const char *bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c); -+const char *__bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, -+ enum btree_node_type); -+const char *bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, -+ enum btree_node_type); -+const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c); -+ -+void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c); -+ -+void bch2_bpos_to_text(struct printbuf *, struct bpos); -+void bch2_bkey_to_text(struct printbuf *, const struct bkey *); -+void bch2_val_to_text(struct printbuf *, struct bch_fs *, -+ struct bkey_s_c); -+void bch2_bkey_val_to_text(struct printbuf *, struct bch_fs *, -+ struct bkey_s_c); -+ -+void bch2_bkey_swab_val(struct bkey_s); -+ -+bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s); -+ -+enum merge_result bch2_bkey_merge(struct bch_fs *, -+ struct bkey_s, struct 
bkey_s); -+ -+void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int); -+ -+void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned, -+ int, struct bkey_format *, struct bkey_packed *); -+ -+static inline void bch2_bkey_compat(unsigned level, enum btree_id btree_id, -+ unsigned version, unsigned big_endian, -+ int write, -+ struct bkey_format *f, -+ struct bkey_packed *k) -+{ -+ if (version < bcachefs_metadata_version_current || -+ big_endian != CPU_BIG_ENDIAN) -+ __bch2_bkey_compat(level, btree_id, version, -+ big_endian, write, f, k); -+ -+} -+ -+#endif /* _BCACHEFS_BKEY_METHODS_H */ -diff --git a/fs/bcachefs/bkey_on_stack.h b/fs/bcachefs/bkey_on_stack.h -new file mode 100644 -index 000000000000..f607a0cb37ed ---- /dev/null -+++ b/fs/bcachefs/bkey_on_stack.h -@@ -0,0 +1,43 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BKEY_ON_STACK_H -+#define _BCACHEFS_BKEY_ON_STACK_H -+ -+#include "bcachefs.h" -+ -+struct bkey_on_stack { -+ struct bkey_i *k; -+ u64 onstack[12]; -+}; -+ -+static inline void bkey_on_stack_realloc(struct bkey_on_stack *s, -+ struct bch_fs *c, unsigned u64s) -+{ -+ if (s->k == (void *) s->onstack && -+ u64s > ARRAY_SIZE(s->onstack)) { -+ s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS); -+ memcpy(s->k, s->onstack, sizeof(s->onstack)); -+ } -+} -+ -+static inline void bkey_on_stack_reassemble(struct bkey_on_stack *s, -+ struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ bkey_on_stack_realloc(s, c, k.k->u64s); -+ bkey_reassemble(s->k, k); -+} -+ -+static inline void bkey_on_stack_init(struct bkey_on_stack *s) -+{ -+ s->k = (void *) s->onstack; -+} -+ -+static inline void bkey_on_stack_exit(struct bkey_on_stack *s, -+ struct bch_fs *c) -+{ -+ if (s->k != (void *) s->onstack) -+ mempool_free(s->k, &c->large_bkey_pool); -+ s->k = NULL; -+} -+ -+#endif /* _BCACHEFS_BKEY_ON_STACK_H */ -diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c -new file mode 100644 -index 000000000000..839e78d1dc35 ---- 
/dev/null -+++ b/fs/bcachefs/bkey_sort.c -@@ -0,0 +1,515 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "bkey_on_stack.h" -+#include "bkey_sort.h" -+#include "bset.h" -+#include "extents.h" -+ -+typedef int (*sort_cmp_fn)(struct btree *, -+ struct bkey_packed *, -+ struct bkey_packed *); -+ -+static inline bool sort_iter_end(struct sort_iter *iter) -+{ -+ return !iter->used; -+} -+ -+static inline void __sort_iter_sift(struct sort_iter *iter, -+ unsigned from, -+ sort_cmp_fn cmp) -+{ -+ unsigned i; -+ -+ for (i = from; -+ i + 1 < iter->used && -+ cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0; -+ i++) -+ swap(iter->data[i], iter->data[i + 1]); -+} -+ -+static inline void sort_iter_sift(struct sort_iter *iter, sort_cmp_fn cmp) -+{ -+ -+ __sort_iter_sift(iter, 0, cmp); -+} -+ -+static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp) -+{ -+ unsigned i = iter->used; -+ -+ while (i--) -+ __sort_iter_sift(iter, i, cmp); -+} -+ -+static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter) -+{ -+ return !sort_iter_end(iter) ? 
iter->data->k : NULL; -+} -+ -+static inline void __sort_iter_advance(struct sort_iter *iter, -+ unsigned idx, sort_cmp_fn cmp) -+{ -+ struct sort_iter_set *i = iter->data + idx; -+ -+ BUG_ON(idx >= iter->used); -+ -+ i->k = bkey_next_skip_noops(i->k, i->end); -+ -+ BUG_ON(i->k > i->end); -+ -+ if (i->k == i->end) -+ array_remove_item(iter->data, iter->used, idx); -+ else -+ __sort_iter_sift(iter, idx, cmp); -+} -+ -+static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) -+{ -+ __sort_iter_advance(iter, 0, cmp); -+} -+ -+static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter, -+ sort_cmp_fn cmp) -+{ -+ struct bkey_packed *ret = sort_iter_peek(iter); -+ -+ if (ret) -+ sort_iter_advance(iter, cmp); -+ -+ return ret; -+} -+ -+/* -+ * If keys compare equal, compare by pointer order: -+ */ -+static inline int key_sort_fix_overlapping_cmp(struct btree *b, -+ struct bkey_packed *l, -+ struct bkey_packed *r) -+{ -+ return bkey_cmp_packed(b, l, r) ?: -+ cmp_int((unsigned long) l, (unsigned long) r); -+} -+ -+static inline bool should_drop_next_key(struct sort_iter *iter) -+{ -+ /* -+ * key_sort_cmp() ensures that when keys compare equal the older key -+ * comes first; so if l->k compares equal to r->k then l->k is older -+ * and should be dropped. 
-+ */ -+ return iter->used >= 2 && -+ !bkey_cmp_packed(iter->b, -+ iter->data[0].k, -+ iter->data[1].k); -+} -+ -+struct btree_nr_keys -+bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, -+ struct sort_iter *iter) -+{ -+ struct bkey_packed *out = dst->start; -+ struct bkey_packed *k; -+ struct btree_nr_keys nr; -+ -+ memset(&nr, 0, sizeof(nr)); -+ -+ sort_iter_sort(iter, key_sort_fix_overlapping_cmp); -+ -+ while ((k = sort_iter_peek(iter))) { -+ if (!bkey_whiteout(k) && -+ !should_drop_next_key(iter)) { -+ bkey_copy(out, k); -+ btree_keys_account_key_add(&nr, 0, out); -+ out = bkey_next(out); -+ } -+ -+ sort_iter_advance(iter, key_sort_fix_overlapping_cmp); -+ } -+ -+ dst->u64s = cpu_to_le16((u64 *) out - dst->_data); -+ return nr; -+} -+ -+static void extent_sort_append(struct bch_fs *c, -+ struct bkey_format *f, -+ struct btree_nr_keys *nr, -+ struct bkey_packed **out, -+ struct bkey_s k) -+{ -+ if (!bkey_whiteout(k.k)) { -+ if (!bch2_bkey_pack_key(*out, k.k, f)) -+ memcpy_u64s_small(*out, k.k, BKEY_U64s); -+ -+ memcpy_u64s_small(bkeyp_val(f, *out), k.v, bkey_val_u64s(k.k)); -+ -+ btree_keys_account_key_add(nr, 0, *out); -+ *out = bkey_next(*out); -+ } -+} -+ -+/* Sort + repack in a new format: */ -+struct btree_nr_keys -+bch2_sort_repack(struct bset *dst, struct btree *src, -+ struct btree_node_iter *src_iter, -+ struct bkey_format *out_f, -+ bool filter_whiteouts) -+{ -+ struct bkey_format *in_f = &src->format; -+ struct bkey_packed *in, *out = vstruct_last(dst); -+ struct btree_nr_keys nr; -+ -+ memset(&nr, 0, sizeof(nr)); -+ -+ while ((in = bch2_btree_node_iter_next_all(src_iter, src))) { -+ if (filter_whiteouts && bkey_whiteout(in)) -+ continue; -+ -+ if (bch2_bkey_transform(out_f, out, bkey_packed(in) -+ ? 
in_f : &bch2_bkey_format_current, in)) -+ out->format = KEY_FORMAT_LOCAL_BTREE; -+ else -+ bch2_bkey_unpack(src, (void *) out, in); -+ -+ btree_keys_account_key_add(&nr, 0, out); -+ out = bkey_next(out); -+ } -+ -+ dst->u64s = cpu_to_le16((u64 *) out - dst->_data); -+ return nr; -+} -+ -+/* Sort, repack, and call bch2_bkey_normalize() to drop stale pointers: */ -+struct btree_nr_keys -+bch2_sort_repack_merge(struct bch_fs *c, -+ struct bset *dst, struct btree *src, -+ struct btree_node_iter *iter, -+ struct bkey_format *out_f, -+ bool filter_whiteouts) -+{ -+ struct bkey_packed *out = vstruct_last(dst), *k_packed; -+ struct bkey_on_stack k; -+ struct btree_nr_keys nr; -+ -+ memset(&nr, 0, sizeof(nr)); -+ bkey_on_stack_init(&k); -+ -+ while ((k_packed = bch2_btree_node_iter_next_all(iter, src))) { -+ if (filter_whiteouts && bkey_whiteout(k_packed)) -+ continue; -+ -+ /* -+ * NOTE: -+ * bch2_bkey_normalize may modify the key we pass it (dropping -+ * stale pointers) and we don't have a write lock on the src -+ * node; we have to make a copy of the entire key before calling -+ * normalize -+ */ -+ bkey_on_stack_realloc(&k, c, k_packed->u64s + BKEY_U64s); -+ bch2_bkey_unpack(src, k.k, k_packed); -+ -+ if (filter_whiteouts && -+ bch2_bkey_normalize(c, bkey_i_to_s(k.k))) -+ continue; -+ -+ extent_sort_append(c, out_f, &nr, &out, bkey_i_to_s(k.k)); -+ } -+ -+ dst->u64s = cpu_to_le16((u64 *) out - dst->_data); -+ bkey_on_stack_exit(&k, c); -+ return nr; -+} -+ -+static inline int sort_keys_cmp(struct btree *b, -+ struct bkey_packed *l, -+ struct bkey_packed *r) -+{ -+ return bkey_cmp_packed(b, l, r) ?: -+ (int) bkey_deleted(r) - (int) bkey_deleted(l) ?: -+ (int) l->needs_whiteout - (int) r->needs_whiteout; -+} -+ -+unsigned bch2_sort_keys(struct bkey_packed *dst, -+ struct sort_iter *iter, -+ bool filter_whiteouts) -+{ -+ const struct bkey_format *f = &iter->b->format; -+ struct bkey_packed *in, *next, *out = dst; -+ -+ sort_iter_sort(iter, sort_keys_cmp); -+ -+ while ((in 
= sort_iter_next(iter, sort_keys_cmp))) { -+ bool needs_whiteout = false; -+ -+ if (bkey_whiteout(in) && -+ (filter_whiteouts || !in->needs_whiteout)) -+ continue; -+ -+ while ((next = sort_iter_peek(iter)) && -+ !bkey_cmp_packed(iter->b, in, next)) { -+ BUG_ON(in->needs_whiteout && -+ next->needs_whiteout); -+ needs_whiteout |= in->needs_whiteout; -+ in = sort_iter_next(iter, sort_keys_cmp); -+ } -+ -+ if (bkey_whiteout(in)) { -+ memcpy_u64s(out, in, bkeyp_key_u64s(f, in)); -+ set_bkeyp_val_u64s(f, out, 0); -+ } else { -+ bkey_copy(out, in); -+ } -+ out->needs_whiteout |= needs_whiteout; -+ out = bkey_next(out); -+ } -+ -+ return (u64 *) out - (u64 *) dst; -+} -+ -+/* Compat code for btree_node_old_extent_overwrite: */ -+ -+/* -+ * If keys compare equal, compare by pointer order: -+ * -+ * Necessary for sort_fix_overlapping() - if there are multiple keys that -+ * compare equal in different sets, we have to process them newest to oldest. -+ */ -+static inline int extent_sort_fix_overlapping_cmp(struct btree *b, -+ struct bkey_packed *l, -+ struct bkey_packed *r) -+{ -+ struct bkey ul = bkey_unpack_key(b, l); -+ struct bkey ur = bkey_unpack_key(b, r); -+ -+ return bkey_cmp(bkey_start_pos(&ul), -+ bkey_start_pos(&ur)) ?: -+ cmp_int((unsigned long) r, (unsigned long) l); -+} -+ -+/* -+ * The algorithm in extent_sort_fix_overlapping() relies on keys in the same -+ * bset being ordered by start offset - but 0 size whiteouts (which are always -+ * KEY_TYPE_deleted) break this ordering, so we need to skip over them: -+ */ -+static void extent_iter_advance(struct sort_iter *iter, unsigned idx) -+{ -+ struct sort_iter_set *i = iter->data + idx; -+ -+ do { -+ i->k = bkey_next_skip_noops(i->k, i->end); -+ } while (i->k != i->end && bkey_deleted(i->k)); -+ -+ if (i->k == i->end) -+ array_remove_item(iter->data, iter->used, idx); -+ else -+ __sort_iter_sift(iter, idx, extent_sort_fix_overlapping_cmp); -+} -+ -+struct btree_nr_keys -+bch2_extent_sort_fix_overlapping(struct 
bch_fs *c, struct bset *dst, -+ struct sort_iter *iter) -+{ -+ struct btree *b = iter->b; -+ struct bkey_format *f = &b->format; -+ struct sort_iter_set *_l = iter->data, *_r = iter->data + 1; -+ struct bkey_packed *out = dst->start; -+ struct bkey l_unpacked, r_unpacked; -+ struct bkey_s l, r; -+ struct btree_nr_keys nr; -+ struct bkey_on_stack split; -+ unsigned i; -+ -+ memset(&nr, 0, sizeof(nr)); -+ bkey_on_stack_init(&split); -+ -+ sort_iter_sort(iter, extent_sort_fix_overlapping_cmp); -+ for (i = 0; i < iter->used;) { -+ if (bkey_deleted(iter->data[i].k)) -+ __sort_iter_advance(iter, i, -+ extent_sort_fix_overlapping_cmp); -+ else -+ i++; -+ } -+ -+ while (!sort_iter_end(iter)) { -+ l = __bkey_disassemble(b, _l->k, &l_unpacked); -+ -+ if (iter->used == 1) { -+ extent_sort_append(c, f, &nr, &out, l); -+ extent_iter_advance(iter, 0); -+ continue; -+ } -+ -+ r = __bkey_disassemble(b, _r->k, &r_unpacked); -+ -+ /* If current key and next key don't overlap, just append */ -+ if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) { -+ extent_sort_append(c, f, &nr, &out, l); -+ extent_iter_advance(iter, 0); -+ continue; -+ } -+ -+ /* Skip 0 size keys */ -+ if (!r.k->size) { -+ extent_iter_advance(iter, 1); -+ continue; -+ } -+ -+ /* -+ * overlap: keep the newer key and trim the older key so they -+ * don't overlap. comparing pointers tells us which one is -+ * newer, since the bsets are appended one after the other. 
-+ */ -+ -+ /* can't happen because of comparison func */ -+ BUG_ON(_l->k < _r->k && -+ !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k))); -+ -+ if (_l->k > _r->k) { -+ /* l wins, trim r */ -+ if (bkey_cmp(l.k->p, r.k->p) >= 0) { -+ extent_iter_advance(iter, 1); -+ } else { -+ bch2_cut_front_s(l.k->p, r); -+ extent_save(b, _r->k, r.k); -+ __sort_iter_sift(iter, 1, -+ extent_sort_fix_overlapping_cmp); -+ } -+ } else if (bkey_cmp(l.k->p, r.k->p) > 0) { -+ -+ /* -+ * r wins, but it overlaps in the middle of l - split l: -+ */ -+ bkey_on_stack_reassemble(&split, c, l.s_c); -+ bch2_cut_back(bkey_start_pos(r.k), split.k); -+ -+ bch2_cut_front_s(r.k->p, l); -+ extent_save(b, _l->k, l.k); -+ -+ __sort_iter_sift(iter, 0, -+ extent_sort_fix_overlapping_cmp); -+ -+ extent_sort_append(c, f, &nr, &out, -+ bkey_i_to_s(split.k)); -+ } else { -+ bch2_cut_back_s(bkey_start_pos(r.k), l); -+ extent_save(b, _l->k, l.k); -+ } -+ } -+ -+ dst->u64s = cpu_to_le16((u64 *) out - dst->_data); -+ -+ bkey_on_stack_exit(&split, c); -+ return nr; -+} -+ -+static inline int sort_extents_cmp(struct btree *b, -+ struct bkey_packed *l, -+ struct bkey_packed *r) -+{ -+ return bkey_cmp_packed(b, l, r) ?: -+ (int) bkey_deleted(l) - (int) bkey_deleted(r); -+} -+ -+unsigned bch2_sort_extents(struct bkey_packed *dst, -+ struct sort_iter *iter, -+ bool filter_whiteouts) -+{ -+ struct bkey_packed *in, *out = dst; -+ -+ sort_iter_sort(iter, sort_extents_cmp); -+ -+ while ((in = sort_iter_next(iter, sort_extents_cmp))) { -+ if (bkey_deleted(in)) -+ continue; -+ -+ if (bkey_whiteout(in) && -+ (filter_whiteouts || !in->needs_whiteout)) -+ continue; -+ -+ bkey_copy(out, in); -+ out = bkey_next(out); -+ } -+ -+ return (u64 *) out - (u64 *) dst; -+} -+ -+static inline int sort_extent_whiteouts_cmp(struct btree *b, -+ struct bkey_packed *l, -+ struct bkey_packed *r) -+{ -+ struct bkey ul = bkey_unpack_key(b, l); -+ struct bkey ur = bkey_unpack_key(b, r); -+ -+ return bkey_cmp(bkey_start_pos(&ul), 
bkey_start_pos(&ur)); -+} -+ -+unsigned bch2_sort_extent_whiteouts(struct bkey_packed *dst, -+ struct sort_iter *iter) -+{ -+ const struct bkey_format *f = &iter->b->format; -+ struct bkey_packed *in, *out = dst; -+ struct bkey_i l, r; -+ bool prev = false, l_packed = false; -+ u64 max_packed_size = bkey_field_max(f, BKEY_FIELD_SIZE); -+ u64 max_packed_offset = bkey_field_max(f, BKEY_FIELD_OFFSET); -+ u64 new_size; -+ -+ max_packed_size = min_t(u64, max_packed_size, KEY_SIZE_MAX); -+ -+ sort_iter_sort(iter, sort_extent_whiteouts_cmp); -+ -+ while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) { -+ if (bkey_deleted(in)) -+ continue; -+ -+ EBUG_ON(bkeyp_val_u64s(f, in)); -+ EBUG_ON(in->type != KEY_TYPE_discard); -+ -+ r.k = bkey_unpack_key(iter->b, in); -+ -+ if (prev && -+ bkey_cmp(l.k.p, bkey_start_pos(&r.k)) >= 0) { -+ if (bkey_cmp(l.k.p, r.k.p) >= 0) -+ continue; -+ -+ new_size = l_packed -+ ? min(max_packed_size, max_packed_offset - -+ bkey_start_offset(&l.k)) -+ : KEY_SIZE_MAX; -+ -+ new_size = min(new_size, r.k.p.offset - -+ bkey_start_offset(&l.k)); -+ -+ BUG_ON(new_size < l.k.size); -+ -+ bch2_key_resize(&l.k, new_size); -+ -+ if (bkey_cmp(l.k.p, r.k.p) >= 0) -+ continue; -+ -+ bch2_cut_front(l.k.p, &r); -+ } -+ -+ if (prev) { -+ if (!bch2_bkey_pack(out, &l, f)) { -+ BUG_ON(l_packed); -+ bkey_copy(out, &l); -+ } -+ out = bkey_next(out); -+ } -+ -+ l = r; -+ prev = true; -+ l_packed = bkey_packed(in); -+ } -+ -+ if (prev) { -+ if (!bch2_bkey_pack(out, &l, f)) { -+ BUG_ON(l_packed); -+ bkey_copy(out, &l); -+ } -+ out = bkey_next(out); -+ } -+ -+ return (u64 *) out - (u64 *) dst; -+} -diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h -new file mode 100644 -index 000000000000..458a051fdac5 ---- /dev/null -+++ b/fs/bcachefs/bkey_sort.h -@@ -0,0 +1,57 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BKEY_SORT_H -+#define _BCACHEFS_BKEY_SORT_H -+ -+struct sort_iter { -+ struct btree *b; -+ unsigned used; -+ unsigned size; -+ -+ 
struct sort_iter_set { -+ struct bkey_packed *k, *end; -+ } data[MAX_BSETS + 1]; -+}; -+ -+static inline void sort_iter_init(struct sort_iter *iter, struct btree *b) -+{ -+ iter->b = b; -+ iter->used = 0; -+ iter->size = ARRAY_SIZE(iter->data); -+} -+ -+static inline void sort_iter_add(struct sort_iter *iter, -+ struct bkey_packed *k, -+ struct bkey_packed *end) -+{ -+ BUG_ON(iter->used >= iter->size); -+ -+ if (k != end) -+ iter->data[iter->used++] = (struct sort_iter_set) { k, end }; -+} -+ -+struct btree_nr_keys -+bch2_key_sort_fix_overlapping(struct bch_fs *, struct bset *, -+ struct sort_iter *); -+struct btree_nr_keys -+bch2_extent_sort_fix_overlapping(struct bch_fs *, struct bset *, -+ struct sort_iter *); -+ -+struct btree_nr_keys -+bch2_sort_repack(struct bset *, struct btree *, -+ struct btree_node_iter *, -+ struct bkey_format *, bool); -+struct btree_nr_keys -+bch2_sort_repack_merge(struct bch_fs *, -+ struct bset *, struct btree *, -+ struct btree_node_iter *, -+ struct bkey_format *, bool); -+ -+unsigned bch2_sort_keys(struct bkey_packed *, -+ struct sort_iter *, bool); -+unsigned bch2_sort_extents(struct bkey_packed *, -+ struct sort_iter *, bool); -+ -+unsigned bch2_sort_extent_whiteouts(struct bkey_packed *, -+ struct sort_iter *); -+ -+#endif /* _BCACHEFS_BKEY_SORT_H */ -diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c -new file mode 100644 -index 000000000000..f7c2841ed8a7 ---- /dev/null -+++ b/fs/bcachefs/bset.c -@@ -0,0 +1,1742 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Code for working with individual keys, and sorted sets of keys with in a -+ * btree node -+ * -+ * Copyright 2012 Google, Inc. -+ */ -+ -+#include "bcachefs.h" -+#include "btree_cache.h" -+#include "bset.h" -+#include "eytzinger.h" -+#include "util.h" -+ -+#include -+#include -+#include -+#include -+ -+/* hack.. 
*/ -+#include "alloc_types.h" -+#include -+ -+static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *, -+ struct btree *); -+ -+static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter) -+{ -+ unsigned n = ARRAY_SIZE(iter->data); -+ -+ while (n && __btree_node_iter_set_end(iter, n - 1)) -+ --n; -+ -+ return n; -+} -+ -+struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k) -+{ -+ unsigned offset = __btree_node_key_to_offset(b, k); -+ struct bset_tree *t; -+ -+ for_each_bset(b, t) -+ if (offset <= t->end_offset) { -+ EBUG_ON(offset < btree_bkey_first_offset(t)); -+ return t; -+ } -+ -+ BUG(); -+} -+ -+/* -+ * There are never duplicate live keys in the btree - but including keys that -+ * have been flagged as deleted (and will be cleaned up later) we _will_ see -+ * duplicates. -+ * -+ * Thus the sort order is: usual key comparison first, but for keys that compare -+ * equal the deleted key(s) come first, and the (at most one) live version comes -+ * last. -+ * -+ * The main reason for this is insertion: to handle overwrites, we first iterate -+ * over keys that compare equal to our insert key, and then insert immediately -+ * prior to the first key greater than the key we're inserting - our insert -+ * position will be after all keys that compare equal to our insert key, which -+ * by the time we actually do the insert will all be deleted. 
-+ */ -+ -+void bch2_dump_bset(struct bch_fs *c, struct btree *b, -+ struct bset *i, unsigned set) -+{ -+ struct bkey_packed *_k, *_n; -+ struct bkey uk, n; -+ struct bkey_s_c k; -+ char buf[200]; -+ -+ if (!i->u64s) -+ return; -+ -+ for (_k = i->start; -+ _k < vstruct_last(i); -+ _k = _n) { -+ _n = bkey_next_skip_noops(_k, vstruct_last(i)); -+ -+ k = bkey_disassemble(b, _k, &uk); -+ if (c) -+ bch2_bkey_val_to_text(&PBUF(buf), c, k); -+ else -+ bch2_bkey_to_text(&PBUF(buf), k.k); -+ printk(KERN_ERR "block %u key %5zu: %s\n", set, -+ _k->_data - i->_data, buf); -+ -+ if (_n == vstruct_last(i)) -+ continue; -+ -+ n = bkey_unpack_key(b, _n); -+ -+ if (bkey_cmp(bkey_start_pos(&n), k.k->p) < 0) { -+ printk(KERN_ERR "Key skipped backwards\n"); -+ continue; -+ } -+ -+ if (!bkey_deleted(k.k) && -+ !bkey_cmp(n.p, k.k->p)) -+ printk(KERN_ERR "Duplicate keys\n"); -+ } -+} -+ -+void bch2_dump_btree_node(struct bch_fs *c, struct btree *b) -+{ -+ struct bset_tree *t; -+ -+ console_lock(); -+ for_each_bset(b, t) -+ bch2_dump_bset(c, b, bset(b, t), t - b->set); -+ console_unlock(); -+} -+ -+void bch2_dump_btree_node_iter(struct btree *b, -+ struct btree_node_iter *iter) -+{ -+ struct btree_node_iter_set *set; -+ -+ printk(KERN_ERR "btree node iter with %u/%u sets:\n", -+ __btree_node_iter_used(iter), b->nsets); -+ -+ btree_node_iter_for_each(iter, set) { -+ struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); -+ struct bset_tree *t = bch2_bkey_to_bset(b, k); -+ struct bkey uk = bkey_unpack_key(b, k); -+ char buf[100]; -+ -+ bch2_bkey_to_text(&PBUF(buf), &uk); -+ printk(KERN_ERR "set %zu key %u: %s\n", -+ t - b->set, set->k, buf); -+ } -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ -+void __bch2_verify_btree_nr_keys(struct btree *b) -+{ -+ struct bset_tree *t; -+ struct bkey_packed *k; -+ struct btree_nr_keys nr = { 0 }; -+ -+ for_each_bset(b, t) -+ bset_tree_for_each_key(b, t, k) -+ if (!bkey_whiteout(k)) -+ btree_keys_account_key_add(&nr, t - b->set, k); -+ -+ 
BUG_ON(memcmp(&nr, &b->nr, sizeof(nr))); -+} -+ -+static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, -+ struct btree *b) -+{ -+ struct btree_node_iter iter = *_iter; -+ const struct bkey_packed *k, *n; -+ -+ k = bch2_btree_node_iter_peek_all(&iter, b); -+ __bch2_btree_node_iter_advance(&iter, b); -+ n = bch2_btree_node_iter_peek_all(&iter, b); -+ -+ bkey_unpack_key(b, k); -+ -+ if (n && -+ bkey_iter_cmp(b, k, n) > 0) { -+ struct btree_node_iter_set *set; -+ struct bkey ku = bkey_unpack_key(b, k); -+ struct bkey nu = bkey_unpack_key(b, n); -+ char buf1[80], buf2[80]; -+ -+ bch2_dump_btree_node(NULL, b); -+ bch2_bkey_to_text(&PBUF(buf1), &ku); -+ bch2_bkey_to_text(&PBUF(buf2), &nu); -+ printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n", -+ buf1, buf2); -+ printk(KERN_ERR "iter was:"); -+ -+ btree_node_iter_for_each(_iter, set) { -+ struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); -+ struct bset_tree *t = bch2_bkey_to_bset(b, k); -+ printk(" [%zi %zi]", t - b->set, -+ k->_data - bset(b, t)->_data); -+ } -+ panic("\n"); -+ } -+} -+ -+void bch2_btree_node_iter_verify(struct btree_node_iter *iter, -+ struct btree *b) -+{ -+ struct btree_node_iter_set *set, *s2; -+ struct bkey_packed *k, *p; -+ struct bset_tree *t; -+ -+ if (bch2_btree_node_iter_end(iter)) -+ return; -+ -+ /* Verify no duplicates: */ -+ btree_node_iter_for_each(iter, set) -+ btree_node_iter_for_each(iter, s2) -+ BUG_ON(set != s2 && set->end == s2->end); -+ -+ /* Verify that set->end is correct: */ -+ btree_node_iter_for_each(iter, set) { -+ for_each_bset(b, t) -+ if (set->end == t->end_offset) -+ goto found; -+ BUG(); -+found: -+ BUG_ON(set->k < btree_bkey_first_offset(t) || -+ set->k >= t->end_offset); -+ } -+ -+ /* Verify iterator is sorted: */ -+ btree_node_iter_for_each(iter, set) -+ BUG_ON(set != iter->data && -+ btree_node_iter_cmp(b, set[-1], set[0]) > 0); -+ -+ k = bch2_btree_node_iter_peek_all(iter, b); -+ -+ for_each_bset(b, t) { -+ if (iter->data[0].end 
== t->end_offset) -+ continue; -+ -+ p = bch2_bkey_prev_all(b, t, -+ bch2_btree_node_iter_bset_pos(iter, b, t)); -+ -+ BUG_ON(p && bkey_iter_cmp(b, k, p) < 0); -+ } -+} -+ -+void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, -+ struct bkey_packed *insert, unsigned clobber_u64s) -+{ -+ struct bset_tree *t = bch2_bkey_to_bset(b, where); -+ struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where); -+ struct bkey_packed *next = (void *) (where->_data + clobber_u64s); -+#if 0 -+ BUG_ON(prev && -+ bkey_iter_cmp(b, prev, insert) > 0); -+#else -+ if (prev && -+ bkey_iter_cmp(b, prev, insert) > 0) { -+ struct bkey k1 = bkey_unpack_key(b, prev); -+ struct bkey k2 = bkey_unpack_key(b, insert); -+ char buf1[100]; -+ char buf2[100]; -+ -+ bch2_dump_btree_node(NULL, b); -+ bch2_bkey_to_text(&PBUF(buf1), &k1); -+ bch2_bkey_to_text(&PBUF(buf2), &k2); -+ -+ panic("prev > insert:\n" -+ "prev key %s\n" -+ "insert key %s\n", -+ buf1, buf2); -+ } -+#endif -+#if 0 -+ BUG_ON(next != btree_bkey_last(b, t) && -+ bkey_iter_cmp(b, insert, next) > 0); -+#else -+ if (next != btree_bkey_last(b, t) && -+ bkey_iter_cmp(b, insert, next) > 0) { -+ struct bkey k1 = bkey_unpack_key(b, insert); -+ struct bkey k2 = bkey_unpack_key(b, next); -+ char buf1[100]; -+ char buf2[100]; -+ -+ bch2_dump_btree_node(NULL, b); -+ bch2_bkey_to_text(&PBUF(buf1), &k1); -+ bch2_bkey_to_text(&PBUF(buf2), &k2); -+ -+ panic("insert > next:\n" -+ "insert key %s\n" -+ "next key %s\n", -+ buf1, buf2); -+ } -+#endif -+} -+ -+#else -+ -+static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, -+ struct btree *b) {} -+ -+#endif -+ -+/* Auxiliary search trees */ -+ -+#define BFLOAT_FAILED_UNPACKED U8_MAX -+#define BFLOAT_FAILED U8_MAX -+ -+struct bkey_float { -+ u8 exponent; -+ u8 key_offset; -+ u16 mantissa; -+}; -+#define BKEY_MANTISSA_BITS 16 -+ -+static unsigned bkey_float_byte_offset(unsigned idx) -+{ -+ return idx * sizeof(struct bkey_float); -+} -+ -+struct ro_aux_tree { -+ 
struct bkey_float f[0]; -+}; -+ -+struct rw_aux_tree { -+ u16 offset; -+ struct bpos k; -+}; -+ -+static unsigned bset_aux_tree_buf_end(const struct bset_tree *t) -+{ -+ BUG_ON(t->aux_data_offset == U16_MAX); -+ -+ switch (bset_aux_tree_type(t)) { -+ case BSET_NO_AUX_TREE: -+ return t->aux_data_offset; -+ case BSET_RO_AUX_TREE: -+ return t->aux_data_offset + -+ DIV_ROUND_UP(t->size * sizeof(struct bkey_float) + -+ t->size * sizeof(u8), 8); -+ case BSET_RW_AUX_TREE: -+ return t->aux_data_offset + -+ DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8); -+ default: -+ BUG(); -+ } -+} -+ -+static unsigned bset_aux_tree_buf_start(const struct btree *b, -+ const struct bset_tree *t) -+{ -+ return t == b->set -+ ? DIV_ROUND_UP(b->unpack_fn_len, 8) -+ : bset_aux_tree_buf_end(t - 1); -+} -+ -+static void *__aux_tree_base(const struct btree *b, -+ const struct bset_tree *t) -+{ -+ return b->aux_data + t->aux_data_offset * 8; -+} -+ -+static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b, -+ const struct bset_tree *t) -+{ -+ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); -+ -+ return __aux_tree_base(b, t); -+} -+ -+static u8 *ro_aux_tree_prev(const struct btree *b, -+ const struct bset_tree *t) -+{ -+ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); -+ -+ return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size); -+} -+ -+static struct bkey_float *bkey_float(const struct btree *b, -+ const struct bset_tree *t, -+ unsigned idx) -+{ -+ return ro_aux_tree_base(b, t)->f + idx; -+} -+ -+static void bset_aux_tree_verify(struct btree *b) -+{ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ struct bset_tree *t; -+ -+ for_each_bset(b, t) { -+ if (t->aux_data_offset == U16_MAX) -+ continue; -+ -+ BUG_ON(t != b->set && -+ t[-1].aux_data_offset == U16_MAX); -+ -+ BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t)); -+ BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b)); -+ BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b)); -+ } -+#endif -+} -+ -+void 
bch2_btree_keys_init(struct btree *b, bool *expensive_debug_checks) -+{ -+ unsigned i; -+ -+ b->nsets = 0; -+ memset(&b->nr, 0, sizeof(b->nr)); -+#ifdef CONFIG_BCACHEFS_DEBUG -+ b->expensive_debug_checks = expensive_debug_checks; -+#endif -+ for (i = 0; i < MAX_BSETS; i++) -+ b->set[i].data_offset = U16_MAX; -+ -+ bch2_bset_set_no_aux_tree(b, b->set); -+} -+ -+/* Binary tree stuff for auxiliary search trees */ -+ -+/* -+ * Cacheline/offset <-> bkey pointer arithmetic: -+ * -+ * t->tree is a binary search tree in an array; each node corresponds to a key -+ * in one cacheline in t->set (BSET_CACHELINE bytes). -+ * -+ * This means we don't have to store the full index of the key that a node in -+ * the binary tree points to; eytzinger1_to_inorder() gives us the cacheline, and -+ * then bkey_float->m gives us the offset within that cacheline, in units of 8 -+ * bytes. -+ * -+ * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to -+ * make this work. -+ * -+ * To construct the bfloat for an arbitrary key we need to know what the key -+ * immediately preceding it is: we have to check if the two keys differ in the -+ * bits we're going to store in bkey_float->mantissa. t->prev[j] stores the size -+ * of the previous key so we can walk backwards to it from t->tree[j]'s key. 
-+ */ -+ -+static inline void *bset_cacheline(const struct btree *b, -+ const struct bset_tree *t, -+ unsigned cacheline) -+{ -+ return (void *) round_down((unsigned long) btree_bkey_first(b, t), -+ L1_CACHE_BYTES) + -+ cacheline * BSET_CACHELINE; -+} -+ -+static struct bkey_packed *cacheline_to_bkey(const struct btree *b, -+ const struct bset_tree *t, -+ unsigned cacheline, -+ unsigned offset) -+{ -+ return bset_cacheline(b, t, cacheline) + offset * 8; -+} -+ -+static unsigned bkey_to_cacheline(const struct btree *b, -+ const struct bset_tree *t, -+ const struct bkey_packed *k) -+{ -+ return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE; -+} -+ -+static ssize_t __bkey_to_cacheline_offset(const struct btree *b, -+ const struct bset_tree *t, -+ unsigned cacheline, -+ const struct bkey_packed *k) -+{ -+ return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline); -+} -+ -+static unsigned bkey_to_cacheline_offset(const struct btree *b, -+ const struct bset_tree *t, -+ unsigned cacheline, -+ const struct bkey_packed *k) -+{ -+ size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k); -+ -+ EBUG_ON(m > U8_MAX); -+ return m; -+} -+ -+static inline struct bkey_packed *tree_to_bkey(const struct btree *b, -+ const struct bset_tree *t, -+ unsigned j) -+{ -+ return cacheline_to_bkey(b, t, -+ __eytzinger1_to_inorder(j, t->size, t->extra), -+ bkey_float(b, t, j)->key_offset); -+} -+ -+static struct bkey_packed *tree_to_prev_bkey(const struct btree *b, -+ const struct bset_tree *t, -+ unsigned j) -+{ -+ unsigned prev_u64s = ro_aux_tree_prev(b, t)[j]; -+ -+ return (void *) (tree_to_bkey(b, t, j)->_data - prev_u64s); -+} -+ -+static struct rw_aux_tree *rw_aux_tree(const struct btree *b, -+ const struct bset_tree *t) -+{ -+ EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); -+ -+ return __aux_tree_base(b, t); -+} -+ -+/* -+ * For the write set - the one we're currently inserting keys into - we don't -+ * maintain a full search tree, we just keep a simple lookup table in 
t->prev. -+ */ -+static struct bkey_packed *rw_aux_to_bkey(const struct btree *b, -+ struct bset_tree *t, -+ unsigned j) -+{ -+ return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset); -+} -+ -+static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t, -+ unsigned j, struct bkey_packed *k) -+{ -+ EBUG_ON(k >= btree_bkey_last(b, t)); -+ -+ rw_aux_tree(b, t)[j] = (struct rw_aux_tree) { -+ .offset = __btree_node_key_to_offset(b, k), -+ .k = bkey_unpack_pos(b, k), -+ }; -+} -+ -+static void bch2_bset_verify_rw_aux_tree(struct btree *b, -+ struct bset_tree *t) -+{ -+ struct bkey_packed *k = btree_bkey_first(b, t); -+ unsigned j = 0; -+ -+ if (!btree_keys_expensive_checks(b)) -+ return; -+ -+ BUG_ON(bset_has_ro_aux_tree(t)); -+ -+ if (!bset_has_rw_aux_tree(t)) -+ return; -+ -+ BUG_ON(t->size < 1); -+ BUG_ON(rw_aux_to_bkey(b, t, j) != k); -+ -+ goto start; -+ while (1) { -+ if (rw_aux_to_bkey(b, t, j) == k) { -+ BUG_ON(bkey_cmp(rw_aux_tree(b, t)[j].k, -+ bkey_unpack_pos(b, k))); -+start: -+ if (++j == t->size) -+ break; -+ -+ BUG_ON(rw_aux_tree(b, t)[j].offset <= -+ rw_aux_tree(b, t)[j - 1].offset); -+ } -+ -+ k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); -+ BUG_ON(k >= btree_bkey_last(b, t)); -+ } -+} -+ -+/* returns idx of first entry >= offset: */ -+static unsigned rw_aux_tree_bsearch(struct btree *b, -+ struct bset_tree *t, -+ unsigned offset) -+{ -+ unsigned bset_offs = offset - btree_bkey_first_offset(t); -+ unsigned bset_u64s = t->end_offset - btree_bkey_first_offset(t); -+ unsigned idx = bset_u64s ? 
bset_offs * t->size / bset_u64s : 0; -+ -+ EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); -+ EBUG_ON(!t->size); -+ EBUG_ON(idx > t->size); -+ -+ while (idx < t->size && -+ rw_aux_tree(b, t)[idx].offset < offset) -+ idx++; -+ -+ while (idx && -+ rw_aux_tree(b, t)[idx - 1].offset >= offset) -+ idx--; -+ -+ EBUG_ON(idx < t->size && -+ rw_aux_tree(b, t)[idx].offset < offset); -+ EBUG_ON(idx && rw_aux_tree(b, t)[idx - 1].offset >= offset); -+ EBUG_ON(idx + 1 < t->size && -+ rw_aux_tree(b, t)[idx].offset == -+ rw_aux_tree(b, t)[idx + 1].offset); -+ -+ return idx; -+} -+ -+static inline unsigned bkey_mantissa(const struct bkey_packed *k, -+ const struct bkey_float *f, -+ unsigned idx) -+{ -+ u64 v; -+ -+ EBUG_ON(!bkey_packed(k)); -+ -+ v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3))); -+ -+ /* -+ * In little endian, we're shifting off low bits (and then the bits we -+ * want are at the low end), in big endian we're shifting off high bits -+ * (and then the bits we want are at the high end, so we shift them -+ * back down): -+ */ -+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -+ v >>= f->exponent & 7; -+#else -+ v >>= 64 - (f->exponent & 7) - BKEY_MANTISSA_BITS; -+#endif -+ return (u16) v; -+} -+ -+static void make_bfloat(struct btree *b, struct bset_tree *t, -+ unsigned j, -+ struct bkey_packed *min_key, -+ struct bkey_packed *max_key) -+{ -+ struct bkey_float *f = bkey_float(b, t, j); -+ struct bkey_packed *m = tree_to_bkey(b, t, j); -+ struct bkey_packed *l, *r; -+ unsigned mantissa; -+ int shift, exponent, high_bit; -+ -+ if (is_power_of_2(j)) { -+ l = min_key; -+ -+ if (!l->u64s) { -+ if (!bkey_pack_pos(l, b->data->min_key, b)) { -+ struct bkey_i tmp; -+ -+ bkey_init(&tmp.k); -+ tmp.k.p = b->data->min_key; -+ bkey_copy(l, &tmp); -+ } -+ } -+ } else { -+ l = tree_to_prev_bkey(b, t, j >> ffs(j)); -+ -+ EBUG_ON(m < l); -+ } -+ -+ if (is_power_of_2(j + 1)) { -+ r = max_key; -+ -+ if (!r->u64s) { -+ if (!bkey_pack_pos(r, t->max_key, b)) { -+ struct 
bkey_i tmp; -+ -+ bkey_init(&tmp.k); -+ tmp.k.p = t->max_key; -+ bkey_copy(r, &tmp); -+ } -+ } -+ } else { -+ r = tree_to_bkey(b, t, j >> (ffz(j) + 1)); -+ -+ EBUG_ON(m > r); -+ } -+ -+ /* -+ * for failed bfloats, the lookup code falls back to comparing against -+ * the original key. -+ */ -+ -+ if (!bkey_packed(l) || !bkey_packed(r) || !bkey_packed(m) || -+ !b->nr_key_bits) { -+ f->exponent = BFLOAT_FAILED_UNPACKED; -+ return; -+ } -+ -+ /* -+ * The greatest differing bit of l and r is the first bit we must -+ * include in the bfloat mantissa we're creating in order to do -+ * comparisons - that bit always becomes the high bit of -+ * bfloat->mantissa, and thus the exponent we're calculating here is -+ * the position of what will become the low bit in bfloat->mantissa: -+ * -+ * Note that this may be negative - we may be running off the low end -+ * of the key: we handle this later: -+ */ -+ high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r), -+ min_t(unsigned, BKEY_MANTISSA_BITS, b->nr_key_bits) - 1); -+ exponent = high_bit - (BKEY_MANTISSA_BITS - 1); -+ -+ /* -+ * Then we calculate the actual shift value, from the start of the key -+ * (k->_data), to get the key bits starting at exponent: -+ */ -+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -+ shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent; -+ -+ EBUG_ON(shift + BKEY_MANTISSA_BITS > b->format.key_u64s * 64); -+#else -+ shift = high_bit_offset + -+ b->nr_key_bits - -+ exponent - -+ BKEY_MANTISSA_BITS; -+ -+ EBUG_ON(shift < KEY_PACKED_BITS_START); -+#endif -+ EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED); -+ -+ f->exponent = shift; -+ mantissa = bkey_mantissa(m, f, j); -+ -+ /* -+ * If we've got garbage bits, set them to all 1s - it's legal for the -+ * bfloat to compare larger than the original key, but not smaller: -+ */ -+ if (exponent < 0) -+ mantissa |= ~(~0U << -exponent); -+ -+ f->mantissa = mantissa; -+} -+ -+/* bytes remaining - only valid for last bset: */ -+static unsigned 
__bset_tree_capacity(struct btree *b, struct bset_tree *t) -+{ -+ bset_aux_tree_verify(b); -+ -+ return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64); -+} -+ -+static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t) -+{ -+ return __bset_tree_capacity(b, t) / -+ (sizeof(struct bkey_float) + sizeof(u8)); -+} -+ -+static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t) -+{ -+ return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree); -+} -+ -+static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) -+{ -+ struct bkey_packed *k; -+ -+ t->size = 1; -+ t->extra = BSET_RW_AUX_TREE_VAL; -+ rw_aux_tree(b, t)[0].offset = -+ __btree_node_key_to_offset(b, btree_bkey_first(b, t)); -+ -+ bset_tree_for_each_key(b, t, k) { -+ if (t->size == bset_rw_tree_capacity(b, t)) -+ break; -+ -+ if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) > -+ L1_CACHE_BYTES) -+ rw_aux_tree_set(b, t, t->size++, k); -+ } -+} -+ -+static void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) -+{ -+ struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t); -+ struct bkey_packed min_key, max_key; -+ unsigned j, cacheline = 1; -+ -+ /* signal to make_bfloat() that they're uninitialized: */ -+ min_key.u64s = max_key.u64s = 0; -+ -+ t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)), -+ bset_ro_tree_capacity(b, t)); -+retry: -+ if (t->size < 2) { -+ t->size = 0; -+ t->extra = BSET_NO_AUX_TREE_VAL; -+ return; -+ } -+ -+ t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1; -+ -+ /* First we figure out where the first key in each cacheline is */ -+ eytzinger1_for_each(j, t->size) { -+ while (bkey_to_cacheline(b, t, k) < cacheline) -+ prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); -+ -+ if (k >= btree_bkey_last(b, t)) { -+ /* XXX: this path sucks */ -+ t->size--; -+ goto retry; -+ } -+ -+ ro_aux_tree_prev(b, t)[j] = prev->u64s; -+ bkey_float(b, t, j)->key_offset = -+ 
bkey_to_cacheline_offset(b, t, cacheline++, k); -+ -+ EBUG_ON(tree_to_prev_bkey(b, t, j) != prev); -+ EBUG_ON(tree_to_bkey(b, t, j) != k); -+ } -+ -+ while (k != btree_bkey_last(b, t)) -+ prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); -+ -+ t->max_key = bkey_unpack_pos(b, prev); -+ -+ /* Then we build the tree */ -+ eytzinger1_for_each(j, t->size) -+ make_bfloat(b, t, j, &min_key, &max_key); -+} -+ -+static void bset_alloc_tree(struct btree *b, struct bset_tree *t) -+{ -+ struct bset_tree *i; -+ -+ for (i = b->set; i != t; i++) -+ BUG_ON(bset_has_rw_aux_tree(i)); -+ -+ bch2_bset_set_no_aux_tree(b, t); -+ -+ /* round up to next cacheline: */ -+ t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t), -+ SMP_CACHE_BYTES / sizeof(u64)); -+ -+ bset_aux_tree_verify(b); -+} -+ -+void bch2_bset_build_aux_tree(struct btree *b, struct bset_tree *t, -+ bool writeable) -+{ -+ if (writeable -+ ? bset_has_rw_aux_tree(t) -+ : bset_has_ro_aux_tree(t)) -+ return; -+ -+ bset_alloc_tree(b, t); -+ -+ if (!__bset_tree_capacity(b, t)) -+ return; -+ -+ if (writeable) -+ __build_rw_aux_tree(b, t); -+ else -+ __build_ro_aux_tree(b, t); -+ -+ bset_aux_tree_verify(b); -+} -+ -+void bch2_bset_init_first(struct btree *b, struct bset *i) -+{ -+ struct bset_tree *t; -+ -+ BUG_ON(b->nsets); -+ -+ memset(i, 0, sizeof(*i)); -+ get_random_bytes(&i->seq, sizeof(i->seq)); -+ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); -+ -+ t = &b->set[b->nsets++]; -+ set_btree_bset(b, t, i); -+} -+ -+void bch2_bset_init_next(struct bch_fs *c, struct btree *b, -+ struct btree_node_entry *bne) -+{ -+ struct bset *i = &bne->keys; -+ struct bset_tree *t; -+ -+ BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c)); -+ BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b))); -+ BUG_ON(b->nsets >= MAX_BSETS); -+ -+ memset(i, 0, sizeof(*i)); -+ i->seq = btree_bset_first(b)->seq; -+ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); -+ -+ t = &b->set[b->nsets++]; -+ set_btree_bset(b, t, i); -+} -+ -+/* -+ * 
find _some_ key in the same bset as @k that precedes @k - not necessarily the -+ * immediate predecessor: -+ */ -+static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t, -+ struct bkey_packed *k) -+{ -+ struct bkey_packed *p; -+ unsigned offset; -+ int j; -+ -+ EBUG_ON(k < btree_bkey_first(b, t) || -+ k > btree_bkey_last(b, t)); -+ -+ if (k == btree_bkey_first(b, t)) -+ return NULL; -+ -+ switch (bset_aux_tree_type(t)) { -+ case BSET_NO_AUX_TREE: -+ p = btree_bkey_first(b, t); -+ break; -+ case BSET_RO_AUX_TREE: -+ j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k)); -+ -+ do { -+ p = j ? tree_to_bkey(b, t, -+ __inorder_to_eytzinger1(j--, -+ t->size, t->extra)) -+ : btree_bkey_first(b, t); -+ } while (p >= k); -+ break; -+ case BSET_RW_AUX_TREE: -+ offset = __btree_node_key_to_offset(b, k); -+ j = rw_aux_tree_bsearch(b, t, offset); -+ p = j ? rw_aux_to_bkey(b, t, j - 1) -+ : btree_bkey_first(b, t); -+ break; -+ } -+ -+ return p; -+} -+ -+struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, -+ struct bset_tree *t, -+ struct bkey_packed *k, -+ unsigned min_key_type) -+{ -+ struct bkey_packed *p, *i, *ret = NULL, *orig_k = k; -+ -+ while ((p = __bkey_prev(b, t, k)) && !ret) { -+ for (i = p; i != k; i = bkey_next_skip_noops(i, k)) -+ if (i->type >= min_key_type) -+ ret = i; -+ -+ k = p; -+ } -+ -+ if (btree_keys_expensive_checks(b)) { -+ BUG_ON(ret >= orig_k); -+ -+ for (i = ret -+ ? 
bkey_next_skip_noops(ret, orig_k) -+ : btree_bkey_first(b, t); -+ i != orig_k; -+ i = bkey_next_skip_noops(i, orig_k)) -+ BUG_ON(i->type >= min_key_type); -+ } -+ -+ return ret; -+} -+ -+/* Insert */ -+ -+static void rw_aux_tree_fix_invalidated_key(struct btree *b, -+ struct bset_tree *t, -+ struct bkey_packed *k) -+{ -+ unsigned offset = __btree_node_key_to_offset(b, k); -+ unsigned j = rw_aux_tree_bsearch(b, t, offset); -+ -+ if (j < t->size && -+ rw_aux_tree(b, t)[j].offset == offset) -+ rw_aux_tree_set(b, t, j, k); -+ -+ bch2_bset_verify_rw_aux_tree(b, t); -+} -+ -+static void ro_aux_tree_fix_invalidated_key(struct btree *b, -+ struct bset_tree *t, -+ struct bkey_packed *k) -+{ -+ struct bkey_packed min_key, max_key; -+ unsigned inorder, j; -+ -+ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); -+ -+ /* signal to make_bfloat() that they're uninitialized: */ -+ min_key.u64s = max_key.u64s = 0; -+ -+ if (bkey_next_skip_noops(k, btree_bkey_last(b, t)) == btree_bkey_last(b, t)) { -+ t->max_key = bkey_unpack_pos(b, k); -+ -+ for (j = 1; j < t->size; j = j * 2 + 1) -+ make_bfloat(b, t, j, &min_key, &max_key); -+ } -+ -+ inorder = bkey_to_cacheline(b, t, k); -+ -+ if (inorder && -+ inorder < t->size) { -+ j = __inorder_to_eytzinger1(inorder, t->size, t->extra); -+ -+ if (k == tree_to_bkey(b, t, j)) { -+ /* Fix the node this key corresponds to */ -+ make_bfloat(b, t, j, &min_key, &max_key); -+ -+ /* Children for which this key is the right boundary */ -+ for (j = eytzinger1_left_child(j); -+ j < t->size; -+ j = eytzinger1_right_child(j)) -+ make_bfloat(b, t, j, &min_key, &max_key); -+ } -+ } -+ -+ if (inorder + 1 < t->size) { -+ j = __inorder_to_eytzinger1(inorder + 1, t->size, t->extra); -+ -+ if (k == tree_to_prev_bkey(b, t, j)) { -+ make_bfloat(b, t, j, &min_key, &max_key); -+ -+ /* Children for which this key is the left boundary */ -+ for (j = eytzinger1_right_child(j); -+ j < t->size; -+ j = eytzinger1_left_child(j)) -+ make_bfloat(b, t, j, &min_key, 
&max_key); -+ } -+ } -+} -+ -+/** -+ * bch2_bset_fix_invalidated_key() - given an existing key @k that has been -+ * modified, fix any auxiliary search tree by remaking all the nodes in the -+ * auxiliary search tree that @k corresponds to -+ */ -+void bch2_bset_fix_invalidated_key(struct btree *b, struct bkey_packed *k) -+{ -+ struct bset_tree *t = bch2_bkey_to_bset(b, k); -+ -+ switch (bset_aux_tree_type(t)) { -+ case BSET_NO_AUX_TREE: -+ break; -+ case BSET_RO_AUX_TREE: -+ ro_aux_tree_fix_invalidated_key(b, t, k); -+ break; -+ case BSET_RW_AUX_TREE: -+ rw_aux_tree_fix_invalidated_key(b, t, k); -+ break; -+ } -+} -+ -+static void bch2_bset_fix_lookup_table(struct btree *b, -+ struct bset_tree *t, -+ struct bkey_packed *_where, -+ unsigned clobber_u64s, -+ unsigned new_u64s) -+{ -+ int shift = new_u64s - clobber_u64s; -+ unsigned l, j, where = __btree_node_key_to_offset(b, _where); -+ -+ EBUG_ON(bset_has_ro_aux_tree(t)); -+ -+ if (!bset_has_rw_aux_tree(t)) -+ return; -+ -+ /* returns first entry >= where */ -+ l = rw_aux_tree_bsearch(b, t, where); -+ -+ if (!l) /* never delete first entry */ -+ l++; -+ else if (l < t->size && -+ where < t->end_offset && -+ rw_aux_tree(b, t)[l].offset == where) -+ rw_aux_tree_set(b, t, l++, _where); -+ -+ /* l now > where */ -+ -+ for (j = l; -+ j < t->size && -+ rw_aux_tree(b, t)[j].offset < where + clobber_u64s; -+ j++) -+ ; -+ -+ if (j < t->size && -+ rw_aux_tree(b, t)[j].offset + shift == -+ rw_aux_tree(b, t)[l - 1].offset) -+ j++; -+ -+ memmove(&rw_aux_tree(b, t)[l], -+ &rw_aux_tree(b, t)[j], -+ (void *) &rw_aux_tree(b, t)[t->size] - -+ (void *) &rw_aux_tree(b, t)[j]); -+ t->size -= j - l; -+ -+ for (j = l; j < t->size; j++) -+ rw_aux_tree(b, t)[j].offset += shift; -+ -+ EBUG_ON(l < t->size && -+ rw_aux_tree(b, t)[l].offset == -+ rw_aux_tree(b, t)[l - 1].offset); -+ -+ if (t->size < bset_rw_tree_capacity(b, t) && -+ (l < t->size -+ ? 
rw_aux_tree(b, t)[l].offset -+ : t->end_offset) - -+ rw_aux_tree(b, t)[l - 1].offset > -+ L1_CACHE_BYTES / sizeof(u64)) { -+ struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1); -+ struct bkey_packed *end = l < t->size -+ ? rw_aux_to_bkey(b, t, l) -+ : btree_bkey_last(b, t); -+ struct bkey_packed *k = start; -+ -+ while (1) { -+ k = bkey_next_skip_noops(k, end); -+ if (k == end) -+ break; -+ -+ if ((void *) k - (void *) start >= L1_CACHE_BYTES) { -+ memmove(&rw_aux_tree(b, t)[l + 1], -+ &rw_aux_tree(b, t)[l], -+ (void *) &rw_aux_tree(b, t)[t->size] - -+ (void *) &rw_aux_tree(b, t)[l]); -+ t->size++; -+ rw_aux_tree_set(b, t, l, k); -+ break; -+ } -+ } -+ } -+ -+ bch2_bset_verify_rw_aux_tree(b, t); -+ bset_aux_tree_verify(b); -+} -+ -+void bch2_bset_insert(struct btree *b, -+ struct btree_node_iter *iter, -+ struct bkey_packed *where, -+ struct bkey_i *insert, -+ unsigned clobber_u64s) -+{ -+ struct bkey_format *f = &b->format; -+ struct bset_tree *t = bset_tree_last(b); -+ struct bkey_packed packed, *src = bkey_to_packed(insert); -+ -+ bch2_bset_verify_rw_aux_tree(b, t); -+ bch2_verify_insert_pos(b, where, bkey_to_packed(insert), clobber_u64s); -+ -+ if (bch2_bkey_pack_key(&packed, &insert->k, f)) -+ src = &packed; -+ -+ if (!bkey_whiteout(&insert->k)) -+ btree_keys_account_key_add(&b->nr, t - b->set, src); -+ -+ if (src->u64s != clobber_u64s) { -+ u64 *src_p = where->_data + clobber_u64s; -+ u64 *dst_p = where->_data + src->u64s; -+ -+ EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) < -+ (int) clobber_u64s - src->u64s); -+ -+ memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); -+ le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s); -+ set_btree_bset_end(b, t); -+ } -+ -+ memcpy_u64s(where, src, -+ bkeyp_key_u64s(f, src)); -+ memcpy_u64s(bkeyp_val(f, where), &insert->v, -+ bkeyp_val_u64s(f, src)); -+ -+ if (src->u64s != clobber_u64s) -+ bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s); -+ -+ bch2_verify_btree_nr_keys(b); -+} -+ 
-+void bch2_bset_delete(struct btree *b, -+ struct bkey_packed *where, -+ unsigned clobber_u64s) -+{ -+ struct bset_tree *t = bset_tree_last(b); -+ u64 *src_p = where->_data + clobber_u64s; -+ u64 *dst_p = where->_data; -+ -+ bch2_bset_verify_rw_aux_tree(b, t); -+ -+ EBUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s); -+ -+ memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); -+ le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s); -+ set_btree_bset_end(b, t); -+ -+ bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, 0); -+} -+ -+/* Lookup */ -+ -+__flatten -+static struct bkey_packed *bset_search_write_set(const struct btree *b, -+ struct bset_tree *t, -+ struct bpos *search, -+ const struct bkey_packed *packed_search) -+{ -+ unsigned l = 0, r = t->size; -+ -+ while (l + 1 != r) { -+ unsigned m = (l + r) >> 1; -+ -+ if (bkey_cmp(rw_aux_tree(b, t)[m].k, *search) < 0) -+ l = m; -+ else -+ r = m; -+ } -+ -+ return rw_aux_to_bkey(b, t, l); -+} -+ -+static inline void prefetch_four_cachelines(void *p) -+{ -+#ifdef CONFIG_X86_64 -+ asm(".intel_syntax noprefix;" -+ "prefetcht0 [%0 - 127 + 64 * 0];" -+ "prefetcht0 [%0 - 127 + 64 * 1];" -+ "prefetcht0 [%0 - 127 + 64 * 2];" -+ "prefetcht0 [%0 - 127 + 64 * 3];" -+ ".att_syntax prefix;" -+ : -+ : "r" (p + 127)); -+#else -+ prefetch(p + L1_CACHE_BYTES * 0); -+ prefetch(p + L1_CACHE_BYTES * 1); -+ prefetch(p + L1_CACHE_BYTES * 2); -+ prefetch(p + L1_CACHE_BYTES * 3); -+#endif -+} -+ -+static inline bool bkey_mantissa_bits_dropped(const struct btree *b, -+ const struct bkey_float *f, -+ unsigned idx) -+{ -+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -+ unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits; -+ -+ return f->exponent > key_bits_start; -+#else -+ unsigned key_bits_end = high_bit_offset + b->nr_key_bits; -+ -+ return f->exponent + BKEY_MANTISSA_BITS < key_bits_end; -+#endif -+} -+ -+__flatten -+static struct bkey_packed *bset_search_tree(const struct btree *b, -+ struct bset_tree *t, 
-+ struct bpos *search, -+ const struct bkey_packed *packed_search) -+{ -+ struct ro_aux_tree *base = ro_aux_tree_base(b, t); -+ struct bkey_float *f; -+ struct bkey_packed *k; -+ unsigned inorder, n = 1, l, r; -+ int cmp; -+ -+ do { -+ if (likely(n << 4 < t->size)) -+ prefetch(&base->f[n << 4]); -+ -+ f = &base->f[n]; -+ -+ if (!unlikely(packed_search)) -+ goto slowpath; -+ if (unlikely(f->exponent >= BFLOAT_FAILED)) -+ goto slowpath; -+ -+ l = f->mantissa; -+ r = bkey_mantissa(packed_search, f, n); -+ -+ if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f, n)) -+ goto slowpath; -+ -+ n = n * 2 + (l < r); -+ continue; -+slowpath: -+ k = tree_to_bkey(b, t, n); -+ cmp = bkey_cmp_p_or_unp(b, k, packed_search, search); -+ if (!cmp) -+ return k; -+ -+ n = n * 2 + (cmp < 0); -+ } while (n < t->size); -+ -+ inorder = __eytzinger1_to_inorder(n >> 1, t->size, t->extra); -+ -+ /* -+ * n would have been the node we recursed to - the low bit tells us if -+ * we recursed left or recursed right. -+ */ -+ if (likely(!(n & 1))) { -+ --inorder; -+ if (unlikely(!inorder)) -+ return btree_bkey_first(b, t); -+ -+ f = &base->f[eytzinger1_prev(n >> 1, t->size)]; -+ } -+ -+ return cacheline_to_bkey(b, t, inorder, f->key_offset); -+} -+ -+static __always_inline __flatten -+struct bkey_packed *__bch2_bset_search(struct btree *b, -+ struct bset_tree *t, -+ struct bpos *search, -+ const struct bkey_packed *lossy_packed_search) -+{ -+ -+ /* -+ * First, we search for a cacheline, then lastly we do a linear search -+ * within that cacheline. -+ * -+ * To search for the cacheline, there's three different possibilities: -+ * * The set is too small to have a search tree, so we just do a linear -+ * search over the whole set. -+ * * The set is the one we're currently inserting into; keeping a full -+ * auxiliary search tree up to date would be too expensive, so we -+ * use a much simpler lookup table to do a binary search - -+ * bset_search_write_set(). 
-+ * * Or we use the auxiliary search tree we constructed earlier - -+ * bset_search_tree() -+ */ -+ -+ switch (bset_aux_tree_type(t)) { -+ case BSET_NO_AUX_TREE: -+ return btree_bkey_first(b, t); -+ case BSET_RW_AUX_TREE: -+ return bset_search_write_set(b, t, search, lossy_packed_search); -+ case BSET_RO_AUX_TREE: -+ /* -+ * Each node in the auxiliary search tree covers a certain range -+ * of bits, and keys above and below the set it covers might -+ * differ outside those bits - so we have to special case the -+ * start and end - handle that here: -+ */ -+ -+ if (bkey_cmp(*search, t->max_key) > 0) -+ return btree_bkey_last(b, t); -+ -+ return bset_search_tree(b, t, search, lossy_packed_search); -+ default: -+ unreachable(); -+ } -+} -+ -+static __always_inline __flatten -+struct bkey_packed *bch2_bset_search_linear(struct btree *b, -+ struct bset_tree *t, -+ struct bpos *search, -+ struct bkey_packed *packed_search, -+ const struct bkey_packed *lossy_packed_search, -+ struct bkey_packed *m) -+{ -+ if (lossy_packed_search) -+ while (m != btree_bkey_last(b, t) && -+ bkey_iter_cmp_p_or_unp(b, m, -+ lossy_packed_search, search) < 0) -+ m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); -+ -+ if (!packed_search) -+ while (m != btree_bkey_last(b, t) && -+ bkey_iter_pos_cmp(b, m, search) < 0) -+ m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); -+ -+ if (btree_keys_expensive_checks(b)) { -+ struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); -+ -+ BUG_ON(prev && -+ bkey_iter_cmp_p_or_unp(b, prev, -+ packed_search, search) >= 0); -+ } -+ -+ return m; -+} -+ -+/* -+ * Returns the first key greater than or equal to @search -+ */ -+static __always_inline __flatten -+struct bkey_packed *bch2_bset_search(struct btree *b, -+ struct bset_tree *t, -+ struct bpos *search, -+ struct bkey_packed *packed_search, -+ const struct bkey_packed *lossy_packed_search) -+{ -+ struct bkey_packed *m = __bch2_bset_search(b, t, search, -+ lossy_packed_search); -+ -+ return 
bch2_bset_search_linear(b, t, search, -+ packed_search, lossy_packed_search, m); -+} -+ -+/* Btree node iterator */ -+ -+static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter, -+ struct btree *b, -+ const struct bkey_packed *k, -+ const struct bkey_packed *end) -+{ -+ if (k != end) { -+ struct btree_node_iter_set *pos; -+ -+ btree_node_iter_for_each(iter, pos) -+ ; -+ -+ BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data)); -+ *pos = (struct btree_node_iter_set) { -+ __btree_node_key_to_offset(b, k), -+ __btree_node_key_to_offset(b, end) -+ }; -+ } -+} -+ -+void bch2_btree_node_iter_push(struct btree_node_iter *iter, -+ struct btree *b, -+ const struct bkey_packed *k, -+ const struct bkey_packed *end) -+{ -+ __bch2_btree_node_iter_push(iter, b, k, end); -+ bch2_btree_node_iter_sort(iter, b); -+} -+ -+noinline __flatten __attribute__((cold)) -+static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, -+ struct btree *b, struct bpos *search) -+{ -+ struct bset_tree *t; -+ -+ trace_bkey_pack_pos_fail(search); -+ -+ for_each_bset(b, t) -+ __bch2_btree_node_iter_push(iter, b, -+ bch2_bset_search(b, t, search, NULL, NULL), -+ btree_bkey_last(b, t)); -+ -+ bch2_btree_node_iter_sort(iter, b); -+} -+ -+/** -+ * bch_btree_node_iter_init - initialize a btree node iterator, starting from a -+ * given position -+ * -+ * Main entry point to the lookup code for individual btree nodes: -+ * -+ * NOTE: -+ * -+ * When you don't filter out deleted keys, btree nodes _do_ contain duplicate -+ * keys. This doesn't matter for most code, but it does matter for lookups. -+ * -+ * Some adjacent keys with a string of equal keys: -+ * i j k k k k l m -+ * -+ * If you search for k, the lookup code isn't guaranteed to return you any -+ * specific k. The lookup code is conceptually doing a binary search and -+ * iterating backwards is very expensive so if the pivot happens to land at the -+ * last k that's what you'll get. 
-+ * -+ * This works out ok, but it's something to be aware of: -+ * -+ * - For non extents, we guarantee that the live key comes last - see -+ * btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't -+ * see will only be deleted keys you don't care about. -+ * -+ * - For extents, deleted keys sort last (see the comment at the top of this -+ * file). But when you're searching for extents, you actually want the first -+ * key strictly greater than your search key - an extent that compares equal -+ * to the search key is going to have 0 sectors after the search key. -+ * -+ * But this does mean that we can't just search for -+ * bkey_successor(start_of_range) to get the first extent that overlaps with -+ * the range we want - if we're unlucky and there's an extent that ends -+ * exactly where we searched, then there could be a deleted key at the same -+ * position and we'd get that when we search instead of the preceding extent -+ * we needed. -+ * -+ * So we've got to search for start_of_range, then after the lookup iterate -+ * past any extents that compare equal to the position we searched for. 
-+ */ -+__flatten -+void bch2_btree_node_iter_init(struct btree_node_iter *iter, -+ struct btree *b, struct bpos *search) -+{ -+ struct bkey_packed p, *packed_search = NULL; -+ struct btree_node_iter_set *pos = iter->data; -+ struct bkey_packed *k[MAX_BSETS]; -+ unsigned i; -+ -+ EBUG_ON(bkey_cmp(*search, b->data->min_key) < 0); -+ bset_aux_tree_verify(b); -+ -+ memset(iter, 0, sizeof(*iter)); -+ -+ switch (bch2_bkey_pack_pos_lossy(&p, *search, b)) { -+ case BKEY_PACK_POS_EXACT: -+ packed_search = &p; -+ break; -+ case BKEY_PACK_POS_SMALLER: -+ packed_search = NULL; -+ break; -+ case BKEY_PACK_POS_FAIL: -+ btree_node_iter_init_pack_failed(iter, b, search); -+ return; -+ } -+ -+ for (i = 0; i < b->nsets; i++) { -+ k[i] = __bch2_bset_search(b, b->set + i, search, &p); -+ prefetch_four_cachelines(k[i]); -+ } -+ -+ for (i = 0; i < b->nsets; i++) { -+ struct bset_tree *t = b->set + i; -+ struct bkey_packed *end = btree_bkey_last(b, t); -+ -+ k[i] = bch2_bset_search_linear(b, t, search, -+ packed_search, &p, k[i]); -+ if (k[i] != end) -+ *pos++ = (struct btree_node_iter_set) { -+ __btree_node_key_to_offset(b, k[i]), -+ __btree_node_key_to_offset(b, end) -+ }; -+ } -+ -+ bch2_btree_node_iter_sort(iter, b); -+} -+ -+void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter, -+ struct btree *b) -+{ -+ struct bset_tree *t; -+ -+ memset(iter, 0, sizeof(*iter)); -+ -+ for_each_bset(b, t) -+ __bch2_btree_node_iter_push(iter, b, -+ btree_bkey_first(b, t), -+ btree_bkey_last(b, t)); -+ bch2_btree_node_iter_sort(iter, b); -+} -+ -+struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter, -+ struct btree *b, -+ struct bset_tree *t) -+{ -+ struct btree_node_iter_set *set; -+ -+ btree_node_iter_for_each(iter, set) -+ if (set->end == t->end_offset) -+ return __btree_node_offset_to_key(b, set->k); -+ -+ return btree_bkey_last(b, t); -+} -+ -+static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter, -+ struct btree *b, -+ unsigned 
first) -+{ -+ bool ret; -+ -+ if ((ret = (btree_node_iter_cmp(b, -+ iter->data[first], -+ iter->data[first + 1]) > 0))) -+ swap(iter->data[first], iter->data[first + 1]); -+ return ret; -+} -+ -+void bch2_btree_node_iter_sort(struct btree_node_iter *iter, -+ struct btree *b) -+{ -+ /* unrolled bubble sort: */ -+ -+ if (!__btree_node_iter_set_end(iter, 2)) { -+ btree_node_iter_sort_two(iter, b, 0); -+ btree_node_iter_sort_two(iter, b, 1); -+ } -+ -+ if (!__btree_node_iter_set_end(iter, 1)) -+ btree_node_iter_sort_two(iter, b, 0); -+} -+ -+void bch2_btree_node_iter_set_drop(struct btree_node_iter *iter, -+ struct btree_node_iter_set *set) -+{ -+ struct btree_node_iter_set *last = -+ iter->data + ARRAY_SIZE(iter->data) - 1; -+ -+ memmove(&set[0], &set[1], (void *) last - (void *) set); -+ *last = (struct btree_node_iter_set) { 0, 0 }; -+} -+ -+static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, -+ struct btree *b) -+{ -+ iter->data->k += __bch2_btree_node_iter_peek_all(iter, b)->u64s; -+ -+ EBUG_ON(iter->data->k > iter->data->end); -+ -+ while (!__btree_node_iter_set_end(iter, 0) && -+ !__bch2_btree_node_iter_peek_all(iter, b)->u64s) -+ iter->data->k++; -+ -+ if (unlikely(__btree_node_iter_set_end(iter, 0))) { -+ bch2_btree_node_iter_set_drop(iter, iter->data); -+ return; -+ } -+ -+ if (__btree_node_iter_set_end(iter, 1)) -+ return; -+ -+ if (!btree_node_iter_sort_two(iter, b, 0)) -+ return; -+ -+ if (__btree_node_iter_set_end(iter, 2)) -+ return; -+ -+ btree_node_iter_sort_two(iter, b, 1); -+} -+ -+void bch2_btree_node_iter_advance(struct btree_node_iter *iter, -+ struct btree *b) -+{ -+ if (btree_keys_expensive_checks(b)) { -+ bch2_btree_node_iter_verify(iter, b); -+ bch2_btree_node_iter_next_check(iter, b); -+ } -+ -+ __bch2_btree_node_iter_advance(iter, b); -+} -+ -+/* -+ * Expensive: -+ */ -+struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, -+ struct btree *b) -+{ -+ struct bkey_packed *k, *prev = NULL; -+ 
struct btree_node_iter_set *set; -+ struct bset_tree *t; -+ unsigned end = 0; -+ -+ if (btree_keys_expensive_checks(b)) -+ bch2_btree_node_iter_verify(iter, b); -+ -+ for_each_bset(b, t) { -+ k = bch2_bkey_prev_all(b, t, -+ bch2_btree_node_iter_bset_pos(iter, b, t)); -+ if (k && -+ (!prev || bkey_iter_cmp(b, k, prev) > 0)) { -+ prev = k; -+ end = t->end_offset; -+ } -+ } -+ -+ if (!prev) -+ return NULL; -+ -+ /* -+ * We're manually memmoving instead of just calling sort() to ensure the -+ * prev we picked ends up in slot 0 - sort won't necessarily put it -+ * there because of duplicate deleted keys: -+ */ -+ btree_node_iter_for_each(iter, set) -+ if (set->end == end) -+ goto found; -+ -+ BUG_ON(set != &iter->data[__btree_node_iter_used(iter)]); -+found: -+ BUG_ON(set >= iter->data + ARRAY_SIZE(iter->data)); -+ -+ memmove(&iter->data[1], -+ &iter->data[0], -+ (void *) set - (void *) &iter->data[0]); -+ -+ iter->data[0].k = __btree_node_key_to_offset(b, prev); -+ iter->data[0].end = end; -+ -+ if (btree_keys_expensive_checks(b)) -+ bch2_btree_node_iter_verify(iter, b); -+ return prev; -+} -+ -+struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *iter, -+ struct btree *b, -+ unsigned min_key_type) -+{ -+ struct bkey_packed *prev; -+ -+ do { -+ prev = bch2_btree_node_iter_prev_all(iter, b); -+ } while (prev && prev->type < min_key_type); -+ -+ return prev; -+} -+ -+struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter, -+ struct btree *b, -+ struct bkey *u) -+{ -+ struct bkey_packed *k = bch2_btree_node_iter_peek(iter, b); -+ -+ return k ? 
bkey_disassemble(b, k, u) : bkey_s_c_null; -+} -+ -+/* Mergesort */ -+ -+void bch2_btree_keys_stats(struct btree *b, struct bset_stats *stats) -+{ -+ struct bset_tree *t; -+ -+ for_each_bset(b, t) { -+ enum bset_aux_tree_type type = bset_aux_tree_type(t); -+ size_t j; -+ -+ stats->sets[type].nr++; -+ stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) * -+ sizeof(u64); -+ -+ if (bset_has_ro_aux_tree(t)) { -+ stats->floats += t->size - 1; -+ -+ for (j = 1; j < t->size; j++) -+ stats->failed += -+ bkey_float(b, t, j)->exponent == -+ BFLOAT_FAILED; -+ } -+ } -+} -+ -+void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, -+ struct bkey_packed *k) -+{ -+ struct bset_tree *t = bch2_bkey_to_bset(b, k); -+ struct bkey uk; -+ unsigned j, inorder; -+ -+ if (out->pos != out->end) -+ *out->pos = '\0'; -+ -+ if (!bset_has_ro_aux_tree(t)) -+ return; -+ -+ inorder = bkey_to_cacheline(b, t, k); -+ if (!inorder || inorder >= t->size) -+ return; -+ -+ j = __inorder_to_eytzinger1(inorder, t->size, t->extra); -+ if (k != tree_to_bkey(b, t, j)) -+ return; -+ -+ switch (bkey_float(b, t, j)->exponent) { -+ case BFLOAT_FAILED: -+ uk = bkey_unpack_key(b, k); -+ pr_buf(out, -+ " failed unpacked at depth %u\n" -+ "\t%llu:%llu\n", -+ ilog2(j), -+ uk.p.inode, uk.p.offset); -+ break; -+ } -+} -diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h -new file mode 100644 -index 000000000000..5921cf689105 ---- /dev/null -+++ b/fs/bcachefs/bset.h -@@ -0,0 +1,661 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BSET_H -+#define _BCACHEFS_BSET_H -+ -+#include -+#include -+ -+#include "bcachefs_format.h" -+#include "bkey.h" -+#include "bkey_methods.h" -+#include "btree_types.h" -+#include "util.h" /* for time_stats */ -+#include "vstructs.h" -+ -+/* -+ * BKEYS: -+ * -+ * A bkey contains a key, a size field, a variable number of pointers, and some -+ * ancillary flag bits. -+ * -+ * We use two different functions for validating bkeys, bkey_invalid and -+ * bkey_deleted(). 
-+ * -+ * The one exception to the rule that ptr_invalid() filters out invalid keys is -+ * that it also filters out keys of size 0 - these are keys that have been -+ * completely overwritten. It'd be safe to delete these in memory while leaving -+ * them on disk, just unnecessary work - so we filter them out when resorting -+ * instead. -+ * -+ * We can't filter out stale keys when we're resorting, because garbage -+ * collection needs to find them to ensure bucket gens don't wrap around - -+ * unless we're rewriting the btree node those stale keys still exist on disk. -+ * -+ * We also implement functions here for removing some number of sectors from the -+ * front or the back of a bkey - this is mainly used for fixing overlapping -+ * extents, by removing the overlapping sectors from the older key. -+ * -+ * BSETS: -+ * -+ * A bset is an array of bkeys laid out contiguously in memory in sorted order, -+ * along with a header. A btree node is made up of a number of these, written at -+ * different times. -+ * -+ * There could be many of them on disk, but we never allow there to be more than -+ * 4 in memory - we lazily resort as needed. -+ * -+ * We implement code here for creating and maintaining auxiliary search trees -+ * (described below) for searching an individial bset, and on top of that we -+ * implement a btree iterator. -+ * -+ * BTREE ITERATOR: -+ * -+ * Most of the code in bcache doesn't care about an individual bset - it needs -+ * to search entire btree nodes and iterate over them in sorted order. -+ * -+ * The btree iterator code serves both functions; it iterates through the keys -+ * in a btree node in sorted order, starting from either keys after a specific -+ * point (if you pass it a search key) or the start of the btree node. -+ * -+ * AUXILIARY SEARCH TREES: -+ * -+ * Since keys are variable length, we can't use a binary search on a bset - we -+ * wouldn't be able to find the start of the next key. 
But binary searches are -+ * slow anyways, due to terrible cache behaviour; bcache originally used binary -+ * searches and that code topped out at under 50k lookups/second. -+ * -+ * So we need to construct some sort of lookup table. Since we only insert keys -+ * into the last (unwritten) set, most of the keys within a given btree node are -+ * usually in sets that are mostly constant. We use two different types of -+ * lookup tables to take advantage of this. -+ * -+ * Both lookup tables share in common that they don't index every key in the -+ * set; they index one key every BSET_CACHELINE bytes, and then a linear search -+ * is used for the rest. -+ * -+ * For sets that have been written to disk and are no longer being inserted -+ * into, we construct a binary search tree in an array - traversing a binary -+ * search tree in an array gives excellent locality of reference and is very -+ * fast, since both children of any node are adjacent to each other in memory -+ * (and their grandchildren, and great grandchildren...) - this means -+ * prefetching can be used to great effect. -+ * -+ * It's quite useful performance wise to keep these nodes small - not just -+ * because they're more likely to be in L2, but also because we can prefetch -+ * more nodes on a single cacheline and thus prefetch more iterations in advance -+ * when traversing this tree. -+ * -+ * Nodes in the auxiliary search tree must contain both a key to compare against -+ * (we don't want to fetch the key from the set, that would defeat the purpose), -+ * and a pointer to the key. We use a few tricks to compress both of these. -+ * -+ * To compress the pointer, we take advantage of the fact that one node in the -+ * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have -+ * a function (to_inorder()) that takes the index of a node in a binary tree and -+ * returns what its index would be in an inorder traversal, so we only have to -+ * store the low bits of the offset. 
-+ * -+ * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To -+ * compress that, we take advantage of the fact that when we're traversing the -+ * search tree at every iteration we know that both our search key and the key -+ * we're looking for lie within some range - bounded by our previous -+ * comparisons. (We special case the start of a search so that this is true even -+ * at the root of the tree). -+ * -+ * So we know the key we're looking for is between a and b, and a and b don't -+ * differ higher than bit 50, we don't need to check anything higher than bit -+ * 50. -+ * -+ * We don't usually need the rest of the bits, either; we only need enough bits -+ * to partition the key range we're currently checking. Consider key n - the -+ * key our auxiliary search tree node corresponds to, and key p, the key -+ * immediately preceding n. The lowest bit we need to store in the auxiliary -+ * search tree is the highest bit that differs between n and p. -+ * -+ * Note that this could be bit 0 - we might sometimes need all 80 bits to do the -+ * comparison. But we'd really like our nodes in the auxiliary search tree to be -+ * of fixed size. -+ * -+ * The solution is to make them fixed size, and when we're constructing a node -+ * check if p and n differed in the bits we needed them to. If they don't we -+ * flag that node, and when doing lookups we fallback to comparing against the -+ * real key. As long as this doesn't happen to often (and it seems to reliably -+ * happen a bit less than 1% of the time), we win - even on failures, that key -+ * is then more likely to be in cache than if we were doing binary searches all -+ * the way, since we're touching so much less memory. -+ * -+ * The keys in the auxiliary search tree are stored in (software) floating -+ * point, with an exponent and a mantissa. 
The exponent needs to be big enough -+ * to address all the bits in the original key, but the number of bits in the -+ * mantissa is somewhat arbitrary; more bits just gets us fewer failures. -+ * -+ * We need 7 bits for the exponent and 3 bits for the key's offset (since keys -+ * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes. -+ * We need one node per 128 bytes in the btree node, which means the auxiliary -+ * search trees take up 3% as much memory as the btree itself. -+ * -+ * Constructing these auxiliary search trees is moderately expensive, and we -+ * don't want to be constantly rebuilding the search tree for the last set -+ * whenever we insert another key into it. For the unwritten set, we use a much -+ * simpler lookup table - it's just a flat array, so index i in the lookup table -+ * corresponds to the i range of BSET_CACHELINE bytes in the set. Indexing -+ * within each byte range works the same as with the auxiliary search trees. -+ * -+ * These are much easier to keep up to date when we insert a key - we do it -+ * somewhat lazily; when we shift a key up we usually just increment the pointer -+ * to it, only when it would overflow do we go to the trouble of finding the -+ * first key in that range of bytes again. 
-+ */ -+ -+extern bool bch2_expensive_debug_checks; -+ -+static inline bool btree_keys_expensive_checks(const struct btree *b) -+{ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ return bch2_expensive_debug_checks || *b->expensive_debug_checks; -+#else -+ return false; -+#endif -+} -+ -+enum bset_aux_tree_type { -+ BSET_NO_AUX_TREE, -+ BSET_RO_AUX_TREE, -+ BSET_RW_AUX_TREE, -+}; -+ -+#define BSET_TREE_NR_TYPES 3 -+ -+#define BSET_NO_AUX_TREE_VAL (U16_MAX) -+#define BSET_RW_AUX_TREE_VAL (U16_MAX - 1) -+ -+static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree *t) -+{ -+ switch (t->extra) { -+ case BSET_NO_AUX_TREE_VAL: -+ EBUG_ON(t->size); -+ return BSET_NO_AUX_TREE; -+ case BSET_RW_AUX_TREE_VAL: -+ EBUG_ON(!t->size); -+ return BSET_RW_AUX_TREE; -+ default: -+ EBUG_ON(!t->size); -+ return BSET_RO_AUX_TREE; -+ } -+} -+ -+/* -+ * BSET_CACHELINE was originally intended to match the hardware cacheline size - -+ * it used to be 64, but I realized the lookup code would touch slightly less -+ * memory if it was 128. -+ * -+ * It definites the number of bytes (in struct bset) per struct bkey_float in -+ * the auxiliar search tree - when we're done searching the bset_float tree we -+ * have this many bytes left that we do a linear search over. -+ * -+ * Since (after level 5) every level of the bset_tree is on a new cacheline, -+ * we're touching one fewer cacheline in the bset tree in exchange for one more -+ * cacheline in the linear search - but the linear search might stop before it -+ * gets to the second cacheline. 
-+ */ -+ -+#define BSET_CACHELINE 128 -+ -+static inline size_t btree_keys_cachelines(struct btree *b) -+{ -+ return (1U << b->byte_order) / BSET_CACHELINE; -+} -+ -+static inline size_t btree_aux_data_bytes(struct btree *b) -+{ -+ return btree_keys_cachelines(b) * 8; -+} -+ -+static inline size_t btree_aux_data_u64s(struct btree *b) -+{ -+ return btree_aux_data_bytes(b) / sizeof(u64); -+} -+ -+typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *); -+ -+static inline void -+__bkey_unpack_key_format_checked(const struct btree *b, -+ struct bkey *dst, -+ const struct bkey_packed *src) -+{ -+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK -+ { -+ compiled_unpack_fn unpack_fn = b->aux_data; -+ unpack_fn(dst, src); -+ -+ if (btree_keys_expensive_checks(b)) { -+ struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); -+ -+ BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); -+ } -+ } -+#else -+ *dst = __bch2_bkey_unpack_key(&b->format, src); -+#endif -+} -+ -+static inline struct bkey -+bkey_unpack_key_format_checked(const struct btree *b, -+ const struct bkey_packed *src) -+{ -+ struct bkey dst; -+ -+ __bkey_unpack_key_format_checked(b, &dst, src); -+ return dst; -+} -+ -+static inline void __bkey_unpack_key(const struct btree *b, -+ struct bkey *dst, -+ const struct bkey_packed *src) -+{ -+ if (likely(bkey_packed(src))) -+ __bkey_unpack_key_format_checked(b, dst, src); -+ else -+ *dst = *packed_to_bkey_c(src); -+} -+ -+/** -+ * bkey_unpack_key -- unpack just the key, not the value -+ */ -+static inline struct bkey bkey_unpack_key(const struct btree *b, -+ const struct bkey_packed *src) -+{ -+ return likely(bkey_packed(src)) -+ ? 
bkey_unpack_key_format_checked(b, src) -+ : *packed_to_bkey_c(src); -+} -+ -+static inline struct bpos -+bkey_unpack_pos_format_checked(const struct btree *b, -+ const struct bkey_packed *src) -+{ -+#ifdef HAVE_BCACHEFS_COMPILED_UNPACK -+ return bkey_unpack_key_format_checked(b, src).p; -+#else -+ return __bkey_unpack_pos(&b->format, src); -+#endif -+} -+ -+static inline struct bpos bkey_unpack_pos(const struct btree *b, -+ const struct bkey_packed *src) -+{ -+ return likely(bkey_packed(src)) -+ ? bkey_unpack_pos_format_checked(b, src) -+ : packed_to_bkey_c(src)->p; -+} -+ -+/* Disassembled bkeys */ -+ -+static inline struct bkey_s_c bkey_disassemble(struct btree *b, -+ const struct bkey_packed *k, -+ struct bkey *u) -+{ -+ __bkey_unpack_key(b, u, k); -+ -+ return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), }; -+} -+ -+/* non const version: */ -+static inline struct bkey_s __bkey_disassemble(struct btree *b, -+ struct bkey_packed *k, -+ struct bkey *u) -+{ -+ __bkey_unpack_key(b, u, k); -+ -+ return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), }; -+} -+ -+#define for_each_bset(_b, _t) \ -+ for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) -+ -+#define bset_tree_for_each_key(_b, _t, _k) \ -+ for (_k = btree_bkey_first(_b, _t); \ -+ _k != btree_bkey_last(_b, _t); \ -+ _k = bkey_next_skip_noops(_k, btree_bkey_last(_b, _t))) -+ -+static inline bool bset_has_ro_aux_tree(struct bset_tree *t) -+{ -+ return bset_aux_tree_type(t) == BSET_RO_AUX_TREE; -+} -+ -+static inline bool bset_has_rw_aux_tree(struct bset_tree *t) -+{ -+ return bset_aux_tree_type(t) == BSET_RW_AUX_TREE; -+} -+ -+static inline void bch2_bset_set_no_aux_tree(struct btree *b, -+ struct bset_tree *t) -+{ -+ BUG_ON(t < b->set); -+ -+ for (; t < b->set + ARRAY_SIZE(b->set); t++) { -+ t->size = 0; -+ t->extra = BSET_NO_AUX_TREE_VAL; -+ t->aux_data_offset = U16_MAX; -+ } -+} -+ -+static inline void btree_node_set_format(struct btree *b, -+ struct bkey_format f) -+{ -+ int len; -+ -+ 
b->format = f; -+ b->nr_key_bits = bkey_format_key_bits(&f); -+ -+ len = bch2_compile_bkey_format(&b->format, b->aux_data); -+ BUG_ON(len < 0 || len > U8_MAX); -+ -+ b->unpack_fn_len = len; -+ -+ bch2_bset_set_no_aux_tree(b, b->set); -+} -+ -+static inline struct bset *bset_next_set(struct btree *b, -+ unsigned block_bytes) -+{ -+ struct bset *i = btree_bset_last(b); -+ -+ EBUG_ON(!is_power_of_2(block_bytes)); -+ -+ return ((void *) i) + round_up(vstruct_bytes(i), block_bytes); -+} -+ -+void bch2_btree_keys_init(struct btree *, bool *); -+ -+void bch2_bset_init_first(struct btree *, struct bset *); -+void bch2_bset_init_next(struct bch_fs *, struct btree *, -+ struct btree_node_entry *); -+void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool); -+void bch2_bset_fix_invalidated_key(struct btree *, struct bkey_packed *); -+ -+void bch2_bset_insert(struct btree *, struct btree_node_iter *, -+ struct bkey_packed *, struct bkey_i *, unsigned); -+void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned); -+ -+/* Bkey utility code */ -+ -+/* packed or unpacked */ -+static inline int bkey_cmp_p_or_unp(const struct btree *b, -+ const struct bkey_packed *l, -+ const struct bkey_packed *r_packed, -+ const struct bpos *r) -+{ -+ EBUG_ON(r_packed && !bkey_packed(r_packed)); -+ -+ if (unlikely(!bkey_packed(l))) -+ return bkey_cmp(packed_to_bkey_c(l)->p, *r); -+ -+ if (likely(r_packed)) -+ return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b); -+ -+ return __bch2_bkey_cmp_left_packed_format_checked(b, l, r); -+} -+ -+struct bset_tree *bch2_bkey_to_bset(struct btree *, struct bkey_packed *); -+ -+struct bkey_packed *bch2_bkey_prev_filter(struct btree *, struct bset_tree *, -+ struct bkey_packed *, unsigned); -+ -+static inline struct bkey_packed * -+bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, struct bkey_packed *k) -+{ -+ return bch2_bkey_prev_filter(b, t, k, 0); -+} -+ -+static inline struct bkey_packed * -+bch2_bkey_prev(struct 
btree *b, struct bset_tree *t, struct bkey_packed *k) -+{ -+ return bch2_bkey_prev_filter(b, t, k, KEY_TYPE_discard + 1); -+} -+ -+enum bch_extent_overlap { -+ BCH_EXTENT_OVERLAP_ALL = 0, -+ BCH_EXTENT_OVERLAP_BACK = 1, -+ BCH_EXTENT_OVERLAP_FRONT = 2, -+ BCH_EXTENT_OVERLAP_MIDDLE = 3, -+}; -+ -+/* Returns how k overlaps with m */ -+static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k, -+ const struct bkey *m) -+{ -+ int cmp1 = bkey_cmp(k->p, m->p) < 0; -+ int cmp2 = bkey_cmp(bkey_start_pos(k), -+ bkey_start_pos(m)) > 0; -+ -+ return (cmp1 << 1) + cmp2; -+} -+ -+/* Btree key iteration */ -+ -+void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *, -+ const struct bkey_packed *, -+ const struct bkey_packed *); -+void bch2_btree_node_iter_init(struct btree_node_iter *, struct btree *, -+ struct bpos *); -+void bch2_btree_node_iter_init_from_start(struct btree_node_iter *, -+ struct btree *); -+struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *, -+ struct btree *, -+ struct bset_tree *); -+ -+void bch2_btree_node_iter_sort(struct btree_node_iter *, struct btree *); -+void bch2_btree_node_iter_set_drop(struct btree_node_iter *, -+ struct btree_node_iter_set *); -+void bch2_btree_node_iter_advance(struct btree_node_iter *, struct btree *); -+ -+#define btree_node_iter_for_each(_iter, _set) \ -+ for (_set = (_iter)->data; \ -+ _set < (_iter)->data + ARRAY_SIZE((_iter)->data) && \ -+ (_set)->k != (_set)->end; \ -+ _set++) -+ -+static inline bool __btree_node_iter_set_end(struct btree_node_iter *iter, -+ unsigned i) -+{ -+ return iter->data[i].k == iter->data[i].end; -+} -+ -+static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter) -+{ -+ return __btree_node_iter_set_end(iter, 0); -+} -+ -+/* -+ * When keys compare equal, deleted keys compare first: -+ * -+ * XXX: only need to compare pointers for keys that are both within a -+ * btree_node_iterator - we need to break ties for prev() to 
work correctly -+ */ -+static inline int bkey_iter_cmp(const struct btree *b, -+ const struct bkey_packed *l, -+ const struct bkey_packed *r) -+{ -+ return bkey_cmp_packed(b, l, r) -+ ?: (int) bkey_deleted(r) - (int) bkey_deleted(l) -+ ?: cmp_int(l, r); -+} -+ -+static inline int btree_node_iter_cmp(const struct btree *b, -+ struct btree_node_iter_set l, -+ struct btree_node_iter_set r) -+{ -+ return bkey_iter_cmp(b, -+ __btree_node_offset_to_key(b, l.k), -+ __btree_node_offset_to_key(b, r.k)); -+} -+ -+/* These assume r (the search key) is not a deleted key: */ -+static inline int bkey_iter_pos_cmp(const struct btree *b, -+ const struct bkey_packed *l, -+ const struct bpos *r) -+{ -+ return bkey_cmp_left_packed(b, l, r) -+ ?: -((int) bkey_deleted(l)); -+} -+ -+static inline int bkey_iter_cmp_p_or_unp(const struct btree *b, -+ const struct bkey_packed *l, -+ const struct bkey_packed *r_packed, -+ const struct bpos *r) -+{ -+ return bkey_cmp_p_or_unp(b, l, r_packed, r) -+ ?: -((int) bkey_deleted(l)); -+} -+ -+static inline struct bkey_packed * -+__bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, -+ struct btree *b) -+{ -+ return __btree_node_offset_to_key(b, iter->data->k); -+} -+ -+static inline struct bkey_packed * -+bch2_btree_node_iter_peek_filter(struct btree_node_iter *iter, -+ struct btree *b, -+ unsigned min_key_type) -+{ -+ while (!bch2_btree_node_iter_end(iter)) { -+ struct bkey_packed *k = __bch2_btree_node_iter_peek_all(iter, b); -+ -+ if (k->type >= min_key_type) -+ return k; -+ -+ bch2_btree_node_iter_advance(iter, b); -+ } -+ -+ return NULL; -+} -+ -+static inline struct bkey_packed * -+bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, -+ struct btree *b) -+{ -+ return bch2_btree_node_iter_peek_filter(iter, b, 0); -+} -+ -+static inline struct bkey_packed * -+bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b) -+{ -+ return bch2_btree_node_iter_peek_filter(iter, b, KEY_TYPE_discard + 1); -+} -+ -+static inline 
struct bkey_packed * -+bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b) -+{ -+ struct bkey_packed *ret = bch2_btree_node_iter_peek_all(iter, b); -+ -+ if (ret) -+ bch2_btree_node_iter_advance(iter, b); -+ -+ return ret; -+} -+ -+struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *, -+ struct btree *); -+struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *, -+ struct btree *, unsigned); -+ -+static inline struct bkey_packed * -+bch2_btree_node_iter_prev(struct btree_node_iter *iter, struct btree *b) -+{ -+ return bch2_btree_node_iter_prev_filter(iter, b, KEY_TYPE_discard + 1); -+} -+ -+struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *, -+ struct btree *, -+ struct bkey *); -+ -+#define for_each_btree_node_key_unpack(b, k, iter, unpacked) \ -+ for (bch2_btree_node_iter_init_from_start((iter), (b)); \ -+ (k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\ -+ bch2_btree_node_iter_advance(iter, b)) -+ -+/* Accounting: */ -+ -+static inline void btree_keys_account_key(struct btree_nr_keys *n, -+ unsigned bset, -+ struct bkey_packed *k, -+ int sign) -+{ -+ n->live_u64s += k->u64s * sign; -+ n->bset_u64s[bset] += k->u64s * sign; -+ -+ if (bkey_packed(k)) -+ n->packed_keys += sign; -+ else -+ n->unpacked_keys += sign; -+} -+ -+static inline void btree_keys_account_val_delta(struct btree *b, -+ struct bkey_packed *k, -+ int delta) -+{ -+ struct bset_tree *t = bch2_bkey_to_bset(b, k); -+ -+ b->nr.live_u64s += delta; -+ b->nr.bset_u64s[t - b->set] += delta; -+} -+ -+#define btree_keys_account_key_add(_nr, _bset_idx, _k) \ -+ btree_keys_account_key(_nr, _bset_idx, _k, 1) -+#define btree_keys_account_key_drop(_nr, _bset_idx, _k) \ -+ btree_keys_account_key(_nr, _bset_idx, _k, -1) -+ -+#define btree_account_key_add(_b, _k) \ -+ btree_keys_account_key(&(_b)->nr, \ -+ bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, 1) -+#define btree_account_key_drop(_b, _k) \ -+ 
btree_keys_account_key(&(_b)->nr, \ -+ bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, -1) -+ -+struct bset_stats { -+ struct { -+ size_t nr, bytes; -+ } sets[BSET_TREE_NR_TYPES]; -+ -+ size_t floats; -+ size_t failed; -+}; -+ -+void bch2_btree_keys_stats(struct btree *, struct bset_stats *); -+void bch2_bfloat_to_text(struct printbuf *, struct btree *, -+ struct bkey_packed *); -+ -+/* Debug stuff */ -+ -+void bch2_dump_bset(struct bch_fs *, struct btree *, struct bset *, unsigned); -+void bch2_dump_btree_node(struct bch_fs *, struct btree *); -+void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *); -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ -+void __bch2_verify_btree_nr_keys(struct btree *); -+void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *); -+void bch2_verify_insert_pos(struct btree *, struct bkey_packed *, -+ struct bkey_packed *, unsigned); -+ -+#else -+ -+static inline void __bch2_verify_btree_nr_keys(struct btree *b) {} -+static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter, -+ struct btree *b) {} -+static inline void bch2_verify_insert_pos(struct btree *b, -+ struct bkey_packed *where, -+ struct bkey_packed *insert, -+ unsigned clobber_u64s) {} -+#endif -+ -+static inline void bch2_verify_btree_nr_keys(struct btree *b) -+{ -+ if (btree_keys_expensive_checks(b)) -+ __bch2_verify_btree_nr_keys(b); -+} -+ -+#endif /* _BCACHEFS_BSET_H */ -diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c -new file mode 100644 -index 000000000000..bb94fa2341ee ---- /dev/null -+++ b/fs/bcachefs/btree_cache.c -@@ -0,0 +1,1063 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "btree_cache.h" -+#include "btree_io.h" -+#include "btree_iter.h" -+#include "btree_locking.h" -+#include "debug.h" -+ -+#include -+#include -+#include -+ -+const char * const bch2_btree_ids[] = { -+#define x(kwd, val, name) name, -+ BCH_BTREE_IDS() -+#undef x -+ NULL -+}; -+ -+void 
bch2_recalc_btree_reserve(struct bch_fs *c) -+{ -+ unsigned i, reserve = 16; -+ -+ if (!c->btree_roots[0].b) -+ reserve += 8; -+ -+ for (i = 0; i < BTREE_ID_NR; i++) -+ if (c->btree_roots[i].b) -+ reserve += min_t(unsigned, 1, -+ c->btree_roots[i].b->c.level) * 8; -+ -+ c->btree_cache.reserve = reserve; -+} -+ -+static inline unsigned btree_cache_can_free(struct btree_cache *bc) -+{ -+ return max_t(int, 0, bc->used - bc->reserve); -+} -+ -+static void __btree_node_data_free(struct bch_fs *c, struct btree *b) -+{ -+ EBUG_ON(btree_node_write_in_flight(b)); -+ -+ kvpfree(b->data, btree_bytes(c)); -+ b->data = NULL; -+ vfree(b->aux_data); -+ b->aux_data = NULL; -+} -+ -+static void btree_node_data_free(struct bch_fs *c, struct btree *b) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ -+ __btree_node_data_free(c, b); -+ bc->used--; -+ list_move(&b->list, &bc->freed); -+} -+ -+static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, -+ const void *obj) -+{ -+ const struct btree *b = obj; -+ const u64 *v = arg->key; -+ -+ return b->hash_val == *v ? 
0 : 1; -+} -+ -+static const struct rhashtable_params bch_btree_cache_params = { -+ .head_offset = offsetof(struct btree, hash), -+ .key_offset = offsetof(struct btree, hash_val), -+ .key_len = sizeof(u64), -+ .obj_cmpfn = bch2_btree_cache_cmp_fn, -+}; -+ -+static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) -+{ -+ BUG_ON(b->data || b->aux_data); -+ -+ b->data = kvpmalloc(btree_bytes(c), gfp); -+ if (!b->data) -+ return -ENOMEM; -+ -+ b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp); -+ if (!b->aux_data) { -+ kvpfree(b->data, btree_bytes(c)); -+ b->data = NULL; -+ return -ENOMEM; -+ } -+ -+ return 0; -+} -+ -+static struct btree *__btree_node_mem_alloc(struct bch_fs *c) -+{ -+ struct btree *b = kzalloc(sizeof(struct btree), GFP_KERNEL); -+ if (!b) -+ return NULL; -+ -+ bkey_btree_ptr_init(&b->key); -+ six_lock_init(&b->c.lock); -+ INIT_LIST_HEAD(&b->list); -+ INIT_LIST_HEAD(&b->write_blocked); -+ b->byte_order = ilog2(btree_bytes(c)); -+ return b; -+} -+ -+static struct btree *btree_node_mem_alloc(struct bch_fs *c) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ struct btree *b = __btree_node_mem_alloc(c); -+ if (!b) -+ return NULL; -+ -+ if (btree_node_data_alloc(c, b, GFP_KERNEL)) { -+ kfree(b); -+ return NULL; -+ } -+ -+ bc->used++; -+ list_add(&b->list, &bc->freeable); -+ return b; -+} -+ -+/* Btree in memory cache - hash table */ -+ -+void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) -+{ -+ rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); -+ -+ /* Cause future lookups for this node to fail: */ -+ b->hash_val = 0; -+ -+ six_lock_wakeup_all(&b->c.lock); -+} -+ -+int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) -+{ -+ BUG_ON(b->hash_val); -+ b->hash_val = btree_ptr_hash_val(&b->key); -+ -+ return rhashtable_lookup_insert_fast(&bc->table, &b->hash, -+ bch_btree_cache_params); -+} -+ -+int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, 
-+ unsigned level, enum btree_id id) -+{ -+ int ret; -+ -+ b->c.level = level; -+ b->c.btree_id = id; -+ -+ mutex_lock(&bc->lock); -+ ret = __bch2_btree_node_hash_insert(bc, b); -+ if (!ret) -+ list_add(&b->list, &bc->live); -+ mutex_unlock(&bc->lock); -+ -+ return ret; -+} -+ -+__flatten -+static inline struct btree *btree_cache_find(struct btree_cache *bc, -+ const struct bkey_i *k) -+{ -+ u64 v = btree_ptr_hash_val(k); -+ -+ return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params); -+} -+ -+/* -+ * this version is for btree nodes that have already been freed (we're not -+ * reaping a real btree node) -+ */ -+static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ int ret = 0; -+ -+ lockdep_assert_held(&bc->lock); -+ -+ if (!six_trylock_intent(&b->c.lock)) -+ return -ENOMEM; -+ -+ if (!six_trylock_write(&b->c.lock)) -+ goto out_unlock_intent; -+ -+ if (btree_node_noevict(b)) -+ goto out_unlock; -+ -+ if (!btree_node_may_write(b)) -+ goto out_unlock; -+ -+ if (btree_node_dirty(b) && -+ test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) -+ goto out_unlock; -+ -+ if (btree_node_dirty(b) || -+ btree_node_write_in_flight(b) || -+ btree_node_read_in_flight(b)) { -+ if (!flush) -+ goto out_unlock; -+ -+ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, -+ TASK_UNINTERRUPTIBLE); -+ -+ /* -+ * Using the underscore version because we don't want to compact -+ * bsets after the write, since this node is about to be evicted -+ * - unless btree verify mode is enabled, since it runs out of -+ * the post write cleanup: -+ */ -+ if (verify_btree_ondisk(c)) -+ bch2_btree_node_write(c, b, SIX_LOCK_intent); -+ else -+ __bch2_btree_node_write(c, b, SIX_LOCK_read); -+ -+ /* wait for any in flight btree write */ -+ btree_node_wait_on_io(b); -+ } -+out: -+ if (b->hash_val && !ret) -+ trace_btree_node_reap(c, b); -+ return ret; -+out_unlock: -+ six_unlock_write(&b->c.lock); -+out_unlock_intent: -+ 
six_unlock_intent(&b->c.lock); -+ ret = -ENOMEM; -+ goto out; -+} -+ -+static int btree_node_reclaim(struct bch_fs *c, struct btree *b) -+{ -+ return __btree_node_reclaim(c, b, false); -+} -+ -+static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b) -+{ -+ return __btree_node_reclaim(c, b, true); -+} -+ -+static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, -+ struct shrink_control *sc) -+{ -+ struct bch_fs *c = container_of(shrink, struct bch_fs, -+ btree_cache.shrink); -+ struct btree_cache *bc = &c->btree_cache; -+ struct btree *b, *t; -+ unsigned long nr = sc->nr_to_scan; -+ unsigned long can_free; -+ unsigned long touched = 0; -+ unsigned long freed = 0; -+ unsigned i, flags; -+ -+ if (btree_shrinker_disabled(c)) -+ return SHRINK_STOP; -+ -+ /* Return -1 if we can't do anything right now */ -+ if (sc->gfp_mask & __GFP_FS) -+ mutex_lock(&bc->lock); -+ else if (!mutex_trylock(&bc->lock)) -+ return -1; -+ -+ flags = memalloc_nofs_save(); -+ -+ /* -+ * It's _really_ critical that we don't free too many btree nodes - we -+ * have to always leave ourselves a reserve. 
The reserve is how we -+ * guarantee that allocating memory for a new btree node can always -+ * succeed, so that inserting keys into the btree can always succeed and -+ * IO can always make forward progress: -+ */ -+ nr /= btree_pages(c); -+ can_free = btree_cache_can_free(bc); -+ nr = min_t(unsigned long, nr, can_free); -+ -+ i = 0; -+ list_for_each_entry_safe(b, t, &bc->freeable, list) { -+ touched++; -+ -+ if (freed >= nr) -+ break; -+ -+ if (++i > 3 && -+ !btree_node_reclaim(c, b)) { -+ btree_node_data_free(c, b); -+ six_unlock_write(&b->c.lock); -+ six_unlock_intent(&b->c.lock); -+ freed++; -+ } -+ } -+restart: -+ list_for_each_entry_safe(b, t, &bc->live, list) { -+ touched++; -+ -+ if (freed >= nr) { -+ /* Save position */ -+ if (&t->list != &bc->live) -+ list_move_tail(&bc->live, &t->list); -+ break; -+ } -+ -+ if (!btree_node_accessed(b) && -+ !btree_node_reclaim(c, b)) { -+ /* can't call bch2_btree_node_hash_remove under lock */ -+ freed++; -+ if (&t->list != &bc->live) -+ list_move_tail(&bc->live, &t->list); -+ -+ btree_node_data_free(c, b); -+ mutex_unlock(&bc->lock); -+ -+ bch2_btree_node_hash_remove(bc, b); -+ six_unlock_write(&b->c.lock); -+ six_unlock_intent(&b->c.lock); -+ -+ if (freed >= nr) -+ goto out; -+ -+ if (sc->gfp_mask & __GFP_FS) -+ mutex_lock(&bc->lock); -+ else if (!mutex_trylock(&bc->lock)) -+ goto out; -+ goto restart; -+ } else -+ clear_btree_node_accessed(b); -+ } -+ -+ memalloc_nofs_restore(flags); -+ mutex_unlock(&bc->lock); -+out: -+ return (unsigned long) freed * btree_pages(c); -+} -+ -+static unsigned long bch2_btree_cache_count(struct shrinker *shrink, -+ struct shrink_control *sc) -+{ -+ struct bch_fs *c = container_of(shrink, struct bch_fs, -+ btree_cache.shrink); -+ struct btree_cache *bc = &c->btree_cache; -+ -+ if (btree_shrinker_disabled(c)) -+ return 0; -+ -+ return btree_cache_can_free(bc) * btree_pages(c); -+} -+ -+void bch2_fs_btree_cache_exit(struct bch_fs *c) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ 
struct btree *b; -+ unsigned i, flags; -+ -+ if (bc->shrink.list.next) -+ unregister_shrinker(&bc->shrink); -+ -+ /* vfree() can allocate memory: */ -+ flags = memalloc_nofs_save(); -+ mutex_lock(&bc->lock); -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ if (c->verify_data) -+ list_move(&c->verify_data->list, &bc->live); -+ -+ kvpfree(c->verify_ondisk, btree_bytes(c)); -+#endif -+ -+ for (i = 0; i < BTREE_ID_NR; i++) -+ if (c->btree_roots[i].b) -+ list_add(&c->btree_roots[i].b->list, &bc->live); -+ -+ list_splice(&bc->freeable, &bc->live); -+ -+ while (!list_empty(&bc->live)) { -+ b = list_first_entry(&bc->live, struct btree, list); -+ -+ BUG_ON(btree_node_read_in_flight(b) || -+ btree_node_write_in_flight(b)); -+ -+ if (btree_node_dirty(b)) -+ bch2_btree_complete_write(c, b, btree_current_write(b)); -+ clear_btree_node_dirty(b); -+ -+ btree_node_data_free(c, b); -+ } -+ -+ while (!list_empty(&bc->freed)) { -+ b = list_first_entry(&bc->freed, struct btree, list); -+ list_del(&b->list); -+ kfree(b); -+ } -+ -+ mutex_unlock(&bc->lock); -+ memalloc_nofs_restore(flags); -+ -+ if (bc->table_init_done) -+ rhashtable_destroy(&bc->table); -+} -+ -+int bch2_fs_btree_cache_init(struct bch_fs *c) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ unsigned i; -+ int ret = 0; -+ -+ pr_verbose_init(c->opts, ""); -+ -+ ret = rhashtable_init(&bc->table, &bch_btree_cache_params); -+ if (ret) -+ goto out; -+ -+ bc->table_init_done = true; -+ -+ bch2_recalc_btree_reserve(c); -+ -+ for (i = 0; i < bc->reserve; i++) -+ if (!btree_node_mem_alloc(c)) { -+ ret = -ENOMEM; -+ goto out; -+ } -+ -+ list_splice_init(&bc->live, &bc->freeable); -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ mutex_init(&c->verify_lock); -+ -+ c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL); -+ if (!c->verify_ondisk) { -+ ret = -ENOMEM; -+ goto out; -+ } -+ -+ c->verify_data = btree_node_mem_alloc(c); -+ if (!c->verify_data) { -+ ret = -ENOMEM; -+ goto out; -+ } -+ -+ list_del_init(&c->verify_data->list); -+#endif -+ -+ 
bc->shrink.count_objects = bch2_btree_cache_count; -+ bc->shrink.scan_objects = bch2_btree_cache_scan; -+ bc->shrink.seeks = 4; -+ bc->shrink.batch = btree_pages(c) * 2; -+ register_shrinker(&bc->shrink); -+out: -+ pr_verbose_init(c->opts, "ret %i", ret); -+ return ret; -+} -+ -+void bch2_fs_btree_cache_init_early(struct btree_cache *bc) -+{ -+ mutex_init(&bc->lock); -+ INIT_LIST_HEAD(&bc->live); -+ INIT_LIST_HEAD(&bc->freeable); -+ INIT_LIST_HEAD(&bc->freed); -+} -+ -+/* -+ * We can only have one thread cannibalizing other cached btree nodes at a time, -+ * or we'll deadlock. We use an open coded mutex to ensure that, which a -+ * cannibalize_bucket() will take. This means every time we unlock the root of -+ * the btree, we need to release this lock if we have it held. -+ */ -+void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ -+ if (bc->alloc_lock == current) { -+ trace_btree_node_cannibalize_unlock(c); -+ bc->alloc_lock = NULL; -+ closure_wake_up(&bc->alloc_wait); -+ } -+} -+ -+int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ struct task_struct *old; -+ -+ old = cmpxchg(&bc->alloc_lock, NULL, current); -+ if (old == NULL || old == current) -+ goto success; -+ -+ if (!cl) { -+ trace_btree_node_cannibalize_lock_fail(c); -+ return -ENOMEM; -+ } -+ -+ closure_wait(&bc->alloc_wait, cl); -+ -+ /* Try again, after adding ourselves to waitlist */ -+ old = cmpxchg(&bc->alloc_lock, NULL, current); -+ if (old == NULL || old == current) { -+ /* We raced */ -+ closure_wake_up(&bc->alloc_wait); -+ goto success; -+ } -+ -+ trace_btree_node_cannibalize_lock_fail(c); -+ return -EAGAIN; -+ -+success: -+ trace_btree_node_cannibalize_lock(c); -+ return 0; -+} -+ -+static struct btree *btree_node_cannibalize(struct bch_fs *c) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ struct btree *b; -+ -+ list_for_each_entry_reverse(b, &bc->live, list) 
-+ if (!btree_node_reclaim(c, b)) -+ return b; -+ -+ while (1) { -+ list_for_each_entry_reverse(b, &bc->live, list) -+ if (!btree_node_write_and_reclaim(c, b)) -+ return b; -+ -+ /* -+ * Rare case: all nodes were intent-locked. -+ * Just busy-wait. -+ */ -+ WARN_ONCE(1, "btree cache cannibalize failed\n"); -+ cond_resched(); -+ } -+} -+ -+struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ struct btree *b; -+ u64 start_time = local_clock(); -+ unsigned flags; -+ -+ flags = memalloc_nofs_save(); -+ mutex_lock(&bc->lock); -+ -+ /* -+ * btree_free() doesn't free memory; it sticks the node on the end of -+ * the list. Check if there's any freed nodes there: -+ */ -+ list_for_each_entry(b, &bc->freeable, list) -+ if (!btree_node_reclaim(c, b)) -+ goto got_node; -+ -+ /* -+ * We never free struct btree itself, just the memory that holds the on -+ * disk node. Check the freed list before allocating a new one: -+ */ -+ list_for_each_entry(b, &bc->freed, list) -+ if (!btree_node_reclaim(c, b)) -+ goto got_node; -+ -+ b = NULL; -+got_node: -+ if (b) -+ list_del_init(&b->list); -+ mutex_unlock(&bc->lock); -+ -+ if (!b) { -+ b = __btree_node_mem_alloc(c); -+ if (!b) -+ goto err; -+ -+ BUG_ON(!six_trylock_intent(&b->c.lock)); -+ BUG_ON(!six_trylock_write(&b->c.lock)); -+ } -+ -+ if (!b->data) { -+ if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL)) -+ goto err; -+ -+ mutex_lock(&bc->lock); -+ bc->used++; -+ mutex_unlock(&bc->lock); -+ } -+ -+ BUG_ON(btree_node_hashed(b)); -+ BUG_ON(btree_node_write_in_flight(b)); -+out: -+ b->flags = 0; -+ b->written = 0; -+ b->nsets = 0; -+ b->sib_u64s[0] = 0; -+ b->sib_u64s[1] = 0; -+ b->whiteout_u64s = 0; -+ bch2_btree_keys_init(b, &c->expensive_debug_checks); -+ -+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], -+ start_time); -+ -+ memalloc_nofs_restore(flags); -+ return b; -+err: -+ mutex_lock(&bc->lock); -+ -+ if (b) { -+ list_add(&b->list, &bc->freed); -+ 
six_unlock_write(&b->c.lock); -+ six_unlock_intent(&b->c.lock); -+ } -+ -+ /* Try to cannibalize another cached btree node: */ -+ if (bc->alloc_lock == current) { -+ b = btree_node_cannibalize(c); -+ list_del_init(&b->list); -+ mutex_unlock(&bc->lock); -+ -+ bch2_btree_node_hash_remove(bc, b); -+ -+ trace_btree_node_cannibalize(c); -+ goto out; -+ } -+ -+ mutex_unlock(&bc->lock); -+ memalloc_nofs_restore(flags); -+ return ERR_PTR(-ENOMEM); -+} -+ -+/* Slowpath, don't want it inlined into btree_iter_traverse() */ -+static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, -+ struct btree_iter *iter, -+ const struct bkey_i *k, -+ enum btree_id btree_id, -+ unsigned level, -+ enum six_lock_type lock_type, -+ bool sync) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ struct btree *b; -+ -+ BUG_ON(level + 1 >= BTREE_MAX_DEPTH); -+ /* -+ * Parent node must be locked, else we could read in a btree node that's -+ * been freed: -+ */ -+ if (iter && !bch2_btree_node_relock(iter, level + 1)) -+ return ERR_PTR(-EINTR); -+ -+ b = bch2_btree_node_mem_alloc(c); -+ if (IS_ERR(b)) -+ return b; -+ -+ bkey_copy(&b->key, k); -+ if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) { -+ /* raced with another fill: */ -+ -+ /* mark as unhashed... 
*/ -+ b->hash_val = 0; -+ -+ mutex_lock(&bc->lock); -+ list_add(&b->list, &bc->freeable); -+ mutex_unlock(&bc->lock); -+ -+ six_unlock_write(&b->c.lock); -+ six_unlock_intent(&b->c.lock); -+ return NULL; -+ } -+ -+ /* -+ * Unlock before doing IO: -+ * -+ * XXX: ideally should be dropping all btree node locks here -+ */ -+ if (iter && btree_node_read_locked(iter, level + 1)) -+ btree_node_unlock(iter, level + 1); -+ -+ bch2_btree_node_read(c, b, sync); -+ -+ six_unlock_write(&b->c.lock); -+ -+ if (!sync) { -+ six_unlock_intent(&b->c.lock); -+ return NULL; -+ } -+ -+ if (lock_type == SIX_LOCK_read) -+ six_lock_downgrade(&b->c.lock); -+ -+ return b; -+} -+ -+static int lock_node_check_fn(struct six_lock *lock, void *p) -+{ -+ struct btree *b = container_of(lock, struct btree, c.lock); -+ const struct bkey_i *k = p; -+ -+ return b->hash_val == btree_ptr_hash_val(k) ? 0 : -1; -+} -+ -+/** -+ * bch_btree_node_get - find a btree node in the cache and lock it, reading it -+ * in from disk if necessary. -+ * -+ * If IO is necessary and running under generic_make_request, returns -EAGAIN. -+ * -+ * The btree node will have either a read or a write lock held, depending on -+ * the @write parameter. 
-+ */ -+struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter, -+ const struct bkey_i *k, unsigned level, -+ enum six_lock_type lock_type) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ struct btree *b; -+ struct bset_tree *t; -+ -+ EBUG_ON(level >= BTREE_MAX_DEPTH); -+ -+ b = btree_node_mem_ptr(k); -+ if (b) -+ goto lock_node; -+retry: -+ b = btree_cache_find(bc, k); -+ if (unlikely(!b)) { -+ /* -+ * We must have the parent locked to call bch2_btree_node_fill(), -+ * else we could read in a btree node from disk that's been -+ * freed: -+ */ -+ b = bch2_btree_node_fill(c, iter, k, iter->btree_id, -+ level, lock_type, true); -+ -+ /* We raced and found the btree node in the cache */ -+ if (!b) -+ goto retry; -+ -+ if (IS_ERR(b)) -+ return b; -+ } else { -+lock_node: -+ /* -+ * There's a potential deadlock with splits and insertions into -+ * interior nodes we have to avoid: -+ * -+ * The other thread might be holding an intent lock on the node -+ * we want, and they want to update its parent node so they're -+ * going to upgrade their intent lock on the parent node to a -+ * write lock. -+ * -+ * But if we're holding a read lock on the parent, and we're -+ * trying to get the intent lock they're holding, we deadlock. -+ * -+ * So to avoid this we drop the read locks on parent nodes when -+ * we're starting to take intent locks - and handle the race. -+ * -+ * The race is that they might be about to free the node we -+ * want, and dropping our read lock on the parent node lets them -+ * update the parent marking the node we want as freed, and then -+ * free it: -+ * -+ * To guard against this, btree nodes are evicted from the cache -+ * when they're freed - and b->hash_val is zeroed out, which we -+ * check for after we lock the node. 
-+ * -+ * Then, bch2_btree_node_relock() on the parent will fail - because -+ * the parent was modified, when the pointer to the node we want -+ * was removed - and we'll bail out: -+ */ -+ if (btree_node_read_locked(iter, level + 1)) -+ btree_node_unlock(iter, level + 1); -+ -+ if (!btree_node_lock(b, k->k.p, level, iter, lock_type, -+ lock_node_check_fn, (void *) k)) { -+ if (b->hash_val != btree_ptr_hash_val(k)) -+ goto retry; -+ return ERR_PTR(-EINTR); -+ } -+ -+ if (unlikely(b->hash_val != btree_ptr_hash_val(k) || -+ b->c.level != level || -+ race_fault())) { -+ six_unlock_type(&b->c.lock, lock_type); -+ if (bch2_btree_node_relock(iter, level + 1)) -+ goto retry; -+ -+ trace_trans_restart_btree_node_reused(iter->trans->ip); -+ return ERR_PTR(-EINTR); -+ } -+ } -+ -+ /* XXX: waiting on IO with btree locks held: */ -+ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, -+ TASK_UNINTERRUPTIBLE); -+ -+ prefetch(b->aux_data); -+ -+ for_each_bset(b, t) { -+ void *p = (u64 *) b->aux_data + t->aux_data_offset; -+ -+ prefetch(p + L1_CACHE_BYTES * 0); -+ prefetch(p + L1_CACHE_BYTES * 1); -+ prefetch(p + L1_CACHE_BYTES * 2); -+ } -+ -+ /* avoid atomic set bit if it's not needed: */ -+ if (!btree_node_accessed(b)) -+ set_btree_node_accessed(b); -+ -+ if (unlikely(btree_node_read_error(b))) { -+ six_unlock_type(&b->c.lock, lock_type); -+ return ERR_PTR(-EIO); -+ } -+ -+ EBUG_ON(b->c.btree_id != iter->btree_id || -+ BTREE_NODE_LEVEL(b->data) != level || -+ bkey_cmp(b->data->max_key, k->k.p)); -+ -+ return b; -+} -+ -+struct btree *bch2_btree_node_get_noiter(struct bch_fs *c, -+ const struct bkey_i *k, -+ enum btree_id btree_id, -+ unsigned level) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ struct btree *b; -+ struct bset_tree *t; -+ int ret; -+ -+ EBUG_ON(level >= BTREE_MAX_DEPTH); -+ -+ b = btree_node_mem_ptr(k); -+ if (b) -+ goto lock_node; -+retry: -+ b = btree_cache_find(bc, k); -+ if (unlikely(!b)) { -+ b = bch2_btree_node_fill(c, NULL, k, btree_id, -+ level, 
SIX_LOCK_read, true); -+ -+ /* We raced and found the btree node in the cache */ -+ if (!b) -+ goto retry; -+ -+ if (IS_ERR(b)) -+ return b; -+ } else { -+lock_node: -+ ret = six_lock_read(&b->c.lock, lock_node_check_fn, (void *) k); -+ if (ret) -+ goto retry; -+ -+ if (unlikely(b->hash_val != btree_ptr_hash_val(k) || -+ b->c.btree_id != btree_id || -+ b->c.level != level)) { -+ six_unlock_read(&b->c.lock); -+ goto retry; -+ } -+ } -+ -+ /* XXX: waiting on IO with btree locks held: */ -+ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, -+ TASK_UNINTERRUPTIBLE); -+ -+ prefetch(b->aux_data); -+ -+ for_each_bset(b, t) { -+ void *p = (u64 *) b->aux_data + t->aux_data_offset; -+ -+ prefetch(p + L1_CACHE_BYTES * 0); -+ prefetch(p + L1_CACHE_BYTES * 1); -+ prefetch(p + L1_CACHE_BYTES * 2); -+ } -+ -+ /* avoid atomic set bit if it's not needed: */ -+ if (!btree_node_accessed(b)) -+ set_btree_node_accessed(b); -+ -+ if (unlikely(btree_node_read_error(b))) { -+ six_unlock_read(&b->c.lock); -+ return ERR_PTR(-EIO); -+ } -+ -+ EBUG_ON(b->c.btree_id != btree_id || -+ BTREE_NODE_LEVEL(b->data) != level || -+ bkey_cmp(b->data->max_key, k->k.p)); -+ -+ return b; -+} -+ -+struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, -+ struct btree_iter *iter, -+ struct btree *b, -+ enum btree_node_sibling sib) -+{ -+ struct btree_trans *trans = iter->trans; -+ struct btree *parent; -+ struct btree_node_iter node_iter; -+ struct bkey_packed *k; -+ BKEY_PADDED(k) tmp; -+ struct btree *ret = NULL; -+ unsigned level = b->c.level; -+ -+ parent = btree_iter_node(iter, level + 1); -+ if (!parent) -+ return NULL; -+ -+ /* -+ * There's a corner case where a btree_iter might have a node locked -+ * that is just outside its current pos - when -+ * bch2_btree_iter_set_pos_same_leaf() gets to the end of the node. 
-+ * -+ * But the lock ordering checks in __bch2_btree_node_lock() go off of -+ * iter->pos, not the node's key: so if the iterator is marked as -+ * needing to be traversed, we risk deadlock if we don't bail out here: -+ */ -+ if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE) -+ return ERR_PTR(-EINTR); -+ -+ if (!bch2_btree_node_relock(iter, level + 1)) { -+ ret = ERR_PTR(-EINTR); -+ goto out; -+ } -+ -+ node_iter = iter->l[parent->c.level].iter; -+ -+ k = bch2_btree_node_iter_peek_all(&node_iter, parent); -+ BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p)); -+ -+ k = sib == btree_prev_sib -+ ? bch2_btree_node_iter_prev(&node_iter, parent) -+ : (bch2_btree_node_iter_advance(&node_iter, parent), -+ bch2_btree_node_iter_peek(&node_iter, parent)); -+ if (!k) -+ goto out; -+ -+ bch2_bkey_unpack(parent, &tmp.k, k); -+ -+ ret = bch2_btree_node_get(c, iter, &tmp.k, level, -+ SIX_LOCK_intent); -+ -+ if (PTR_ERR_OR_ZERO(ret) == -EINTR && !trans->nounlock) { -+ struct btree_iter *linked; -+ -+ if (!bch2_btree_node_relock(iter, level + 1)) -+ goto out; -+ -+ /* -+ * We might have got -EINTR because trylock failed, and we're -+ * holding other locks that would cause us to deadlock: -+ */ -+ trans_for_each_iter(trans, linked) -+ if (btree_iter_cmp(iter, linked) < 0) -+ __bch2_btree_iter_unlock(linked); -+ -+ if (sib == btree_prev_sib) -+ btree_node_unlock(iter, level); -+ -+ ret = bch2_btree_node_get(c, iter, &tmp.k, level, -+ SIX_LOCK_intent); -+ -+ /* -+ * before btree_iter_relock() calls btree_iter_verify_locks(): -+ */ -+ if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED) -+ btree_node_unlock(iter, level + 1); -+ -+ if (!bch2_btree_node_relock(iter, level)) { -+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); -+ -+ if (!IS_ERR(ret)) { -+ six_unlock_intent(&ret->c.lock); -+ ret = ERR_PTR(-EINTR); -+ } -+ } -+ -+ bch2_trans_relock(trans); -+ } -+out: -+ if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED) -+ btree_node_unlock(iter, level + 1); -+ -+ 
if (PTR_ERR_OR_ZERO(ret) == -EINTR) -+ bch2_btree_iter_upgrade(iter, level + 2); -+ -+ BUG_ON(!IS_ERR(ret) && !btree_node_locked(iter, level)); -+ -+ if (!IS_ERR_OR_NULL(ret)) { -+ struct btree *n1 = ret, *n2 = b; -+ -+ if (sib != btree_prev_sib) -+ swap(n1, n2); -+ -+ BUG_ON(bkey_cmp(bkey_successor(n1->key.k.p), -+ n2->data->min_key)); -+ } -+ -+ bch2_btree_trans_verify_locks(trans); -+ -+ return ret; -+} -+ -+void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, -+ const struct bkey_i *k, unsigned level) -+{ -+ struct btree_cache *bc = &c->btree_cache; -+ struct btree *b; -+ -+ BUG_ON(!btree_node_locked(iter, level + 1)); -+ BUG_ON(level >= BTREE_MAX_DEPTH); -+ -+ b = btree_cache_find(bc, k); -+ if (b) -+ return; -+ -+ bch2_btree_node_fill(c, iter, k, iter->btree_id, -+ level, SIX_LOCK_read, false); -+} -+ -+void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, -+ struct btree *b) -+{ -+ const struct bkey_format *f = &b->format; -+ struct bset_stats stats; -+ -+ memset(&stats, 0, sizeof(stats)); -+ -+ bch2_btree_keys_stats(b, &stats); -+ -+ pr_buf(out, -+ "l %u %llu:%llu - %llu:%llu:\n" -+ " ptrs: ", -+ b->c.level, -+ b->data->min_key.inode, -+ b->data->min_key.offset, -+ b->data->max_key.inode, -+ b->data->max_key.offset); -+ bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key)); -+ pr_buf(out, "\n" -+ " format: u64s %u fields %u %u %u %u %u\n" -+ " unpack fn len: %u\n" -+ " bytes used %zu/%zu (%zu%% full)\n" -+ " sib u64s: %u, %u (merge threshold %zu)\n" -+ " nr packed keys %u\n" -+ " nr unpacked keys %u\n" -+ " floats %zu\n" -+ " failed unpacked %zu\n", -+ f->key_u64s, -+ f->bits_per_field[0], -+ f->bits_per_field[1], -+ f->bits_per_field[2], -+ f->bits_per_field[3], -+ f->bits_per_field[4], -+ b->unpack_fn_len, -+ b->nr.live_u64s * sizeof(u64), -+ btree_bytes(c) - sizeof(struct btree_node), -+ b->nr.live_u64s * 100 / btree_max_u64s(c), -+ b->sib_u64s[0], -+ b->sib_u64s[1], -+ BTREE_FOREGROUND_MERGE_THRESHOLD(c), -+ 
b->nr.packed_keys, -+ b->nr.unpacked_keys, -+ stats.floats, -+ stats.failed); -+} -diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h -new file mode 100644 -index 000000000000..d0d3a85bb8be ---- /dev/null -+++ b/fs/bcachefs/btree_cache.h -@@ -0,0 +1,104 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BTREE_CACHE_H -+#define _BCACHEFS_BTREE_CACHE_H -+ -+#include "bcachefs.h" -+#include "btree_types.h" -+ -+struct btree_iter; -+ -+extern const char * const bch2_btree_ids[]; -+ -+void bch2_recalc_btree_reserve(struct bch_fs *); -+ -+void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *); -+int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *); -+int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *, -+ unsigned, enum btree_id); -+ -+void bch2_btree_cache_cannibalize_unlock(struct bch_fs *); -+int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *); -+ -+struct btree *bch2_btree_node_mem_alloc(struct bch_fs *); -+ -+struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *, -+ const struct bkey_i *, unsigned, -+ enum six_lock_type); -+ -+struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *, -+ enum btree_id, unsigned); -+ -+struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *, -+ struct btree *, enum btree_node_sibling); -+ -+void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *, -+ const struct bkey_i *, unsigned); -+ -+void bch2_fs_btree_cache_exit(struct bch_fs *); -+int bch2_fs_btree_cache_init(struct bch_fs *); -+void bch2_fs_btree_cache_init_early(struct btree_cache *); -+ -+static inline u64 btree_ptr_hash_val(const struct bkey_i *k) -+{ -+ switch (k->k.type) { -+ case KEY_TYPE_btree_ptr: -+ return *((u64 *) bkey_i_to_btree_ptr_c(k)->v.start); -+ case KEY_TYPE_btree_ptr_v2: -+ return bkey_i_to_btree_ptr_v2_c(k)->v.seq; -+ default: -+ return 0; -+ } -+} -+ -+static inline struct btree 
*btree_node_mem_ptr(const struct bkey_i *k) -+{ -+ return k->k.type == KEY_TYPE_btree_ptr_v2 -+ ? (void *)(unsigned long)bkey_i_to_btree_ptr_v2_c(k)->v.mem_ptr -+ : NULL; -+} -+ -+/* is btree node in hash table? */ -+static inline bool btree_node_hashed(struct btree *b) -+{ -+ return b->hash_val != 0; -+} -+ -+#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \ -+ for ((_tbl) = rht_dereference_rcu((_c)->btree_cache.table.tbl, \ -+ &(_c)->btree_cache.table), \ -+ _iter = 0; _iter < (_tbl)->size; _iter++) \ -+ rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash) -+ -+static inline size_t btree_bytes(struct bch_fs *c) -+{ -+ return c->opts.btree_node_size << 9; -+} -+ -+static inline size_t btree_max_u64s(struct bch_fs *c) -+{ -+ return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64); -+} -+ -+static inline size_t btree_pages(struct bch_fs *c) -+{ -+ return btree_bytes(c) / PAGE_SIZE; -+} -+ -+static inline unsigned btree_blocks(struct bch_fs *c) -+{ -+ return c->opts.btree_node_size >> c->block_bits; -+} -+ -+#define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 2 / 3) -+ -+#define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3) -+#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \ -+ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \ -+ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) << 2)) -+ -+#define btree_node_root(_c, _b) ((_c)->btree_roots[(_b)->c.btree_id].b) -+ -+void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, -+ struct btree *); -+ -+#endif /* _BCACHEFS_BTREE_CACHE_H */ -diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c -new file mode 100644 -index 000000000000..e8c1e752a25d ---- /dev/null -+++ b/fs/bcachefs/btree_gc.c -@@ -0,0 +1,1438 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Copyright (C) 2010 Kent Overstreet -+ * Copyright (C) 2014 Datera Inc. 
-+ */ -+ -+#include "bcachefs.h" -+#include "alloc_background.h" -+#include "alloc_foreground.h" -+#include "bkey_methods.h" -+#include "bkey_on_stack.h" -+#include "btree_locking.h" -+#include "btree_update_interior.h" -+#include "btree_io.h" -+#include "btree_gc.h" -+#include "buckets.h" -+#include "clock.h" -+#include "debug.h" -+#include "ec.h" -+#include "error.h" -+#include "extents.h" -+#include "journal.h" -+#include "keylist.h" -+#include "move.h" -+#include "recovery.h" -+#include "replicas.h" -+#include "super-io.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) -+{ -+ preempt_disable(); -+ write_seqcount_begin(&c->gc_pos_lock); -+ c->gc_pos = new_pos; -+ write_seqcount_end(&c->gc_pos_lock); -+ preempt_enable(); -+} -+ -+static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) -+{ -+ BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0); -+ __gc_pos_set(c, new_pos); -+} -+ -+static int bch2_gc_check_topology(struct bch_fs *c, -+ struct bkey_s_c k, -+ struct bpos *expected_start, -+ struct bpos expected_end, -+ bool is_last) -+{ -+ int ret = 0; -+ -+ if (k.k->type == KEY_TYPE_btree_ptr_v2) { -+ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); -+ -+ if (fsck_err_on(bkey_cmp(*expected_start, bp.v->min_key), c, -+ "btree node with incorrect min_key: got %llu:%llu, should be %llu:%llu", -+ bp.v->min_key.inode, -+ bp.v->min_key.offset, -+ expected_start->inode, -+ expected_start->offset)) { -+ BUG(); -+ } -+ } -+ -+ *expected_start = bkey_cmp(k.k->p, POS_MAX) -+ ? 
bkey_successor(k.k->p) -+ : k.k->p; -+ -+ if (fsck_err_on(is_last && -+ bkey_cmp(k.k->p, expected_end), c, -+ "btree node with incorrect max_key: got %llu:%llu, should be %llu:%llu", -+ k.k->p.inode, -+ k.k->p.offset, -+ expected_end.inode, -+ expected_end.offset)) { -+ BUG(); -+ } -+fsck_err: -+ return ret; -+} -+ -+/* marking of btree keys/nodes: */ -+ -+static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, -+ u8 *max_stale, bool initial) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const struct bch_extent_ptr *ptr; -+ unsigned flags = -+ BTREE_TRIGGER_GC| -+ (initial ? BTREE_TRIGGER_NOATOMIC : 0); -+ int ret = 0; -+ -+ if (initial) { -+ BUG_ON(journal_seq_verify(c) && -+ k.k->version.lo > journal_cur_seq(&c->journal)); -+ -+ /* XXX change to fsck check */ -+ if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c, -+ "key version number higher than recorded: %llu > %llu", -+ k.k->version.lo, -+ atomic64_read(&c->key_version))) -+ atomic64_set(&c->key_version, k.k->version.lo); -+ -+ if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || -+ fsck_err_on(!bch2_bkey_replicas_marked(c, k), c, -+ "superblock not marked as containing replicas (type %u)", -+ k.k->type)) { -+ ret = bch2_mark_bkey_replicas(c, k); -+ if (ret) -+ return ret; -+ } -+ -+ bkey_for_each_ptr(ptrs, ptr) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ struct bucket *g = PTR_BUCKET(ca, ptr, true); -+ struct bucket *g2 = PTR_BUCKET(ca, ptr, false); -+ -+ if (mustfix_fsck_err_on(!g->gen_valid, c, -+ "bucket %u:%zu data type %s ptr gen %u missing in alloc btree", -+ ptr->dev, PTR_BUCKET_NR(ca, ptr), -+ bch2_data_types[ptr_data_type(k.k, ptr)], -+ ptr->gen)) { -+ g2->_mark.gen = g->_mark.gen = ptr->gen; -+ g2->gen_valid = g->gen_valid = true; -+ } -+ -+ if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c, -+ "bucket %u:%zu data type %s ptr gen in the future: %u > %u", -+ ptr->dev, PTR_BUCKET_NR(ca, ptr), -+ bch2_data_types[ptr_data_type(k.k, 
ptr)], -+ ptr->gen, g->mark.gen)) { -+ g2->_mark.gen = g->_mark.gen = ptr->gen; -+ g2->gen_valid = g->gen_valid = true; -+ g2->_mark.data_type = 0; -+ g2->_mark.dirty_sectors = 0; -+ g2->_mark.cached_sectors = 0; -+ set_bit(BCH_FS_FIXED_GENS, &c->flags); -+ } -+ } -+ } -+ -+ bkey_for_each_ptr(ptrs, ptr) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ struct bucket *g = PTR_BUCKET(ca, ptr, true); -+ -+ if (gen_after(g->oldest_gen, ptr->gen)) -+ g->oldest_gen = ptr->gen; -+ -+ *max_stale = max(*max_stale, ptr_stale(ca, ptr)); -+ } -+ -+ bch2_mark_key(c, k, 0, k.k->size, NULL, 0, flags); -+fsck_err: -+ return ret; -+} -+ -+static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, -+ bool initial) -+{ -+ struct bpos next_node_start = b->data->min_key; -+ struct btree_node_iter iter; -+ struct bkey unpacked; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ *max_stale = 0; -+ -+ if (!btree_node_type_needs_gc(btree_node_type(b))) -+ return 0; -+ -+ bch2_btree_node_iter_init_from_start(&iter, b); -+ -+ while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { -+ bch2_bkey_debugcheck(c, b, k); -+ -+ ret = bch2_gc_mark_key(c, k, max_stale, initial); -+ if (ret) -+ break; -+ -+ bch2_btree_node_iter_advance(&iter, b); -+ -+ if (b->c.level) { -+ ret = bch2_gc_check_topology(c, k, -+ &next_node_start, -+ b->data->max_key, -+ bch2_btree_node_iter_end(&iter)); -+ if (ret) -+ break; -+ } -+ } -+ -+ return ret; -+} -+ -+static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, -+ bool initial, bool metadata_only) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct btree *b; -+ unsigned depth = metadata_only ? 1 -+ : expensive_debug_checks(c) ? 0 -+ : !btree_node_type_needs_gc(btree_id) ? 
1 -+ : 0; -+ u8 max_stale = 0; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); -+ -+ __for_each_btree_node(&trans, iter, btree_id, POS_MIN, -+ 0, depth, BTREE_ITER_PREFETCH, b) { -+ bch2_verify_btree_nr_keys(b); -+ -+ gc_pos_set(c, gc_pos_btree_node(b)); -+ -+ ret = btree_gc_mark_node(c, b, &max_stale, initial); -+ if (ret) -+ break; -+ -+ if (!initial) { -+ if (max_stale > 64) -+ bch2_btree_node_rewrite(c, iter, -+ b->data->keys.seq, -+ BTREE_INSERT_USE_RESERVE| -+ BTREE_INSERT_NOWAIT| -+ BTREE_INSERT_GC_LOCK_HELD); -+ else if (!btree_gc_rewrite_disabled(c) && -+ (btree_gc_always_rewrite(c) || max_stale > 16)) -+ bch2_btree_node_rewrite(c, iter, -+ b->data->keys.seq, -+ BTREE_INSERT_NOWAIT| -+ BTREE_INSERT_GC_LOCK_HELD); -+ } -+ -+ bch2_trans_cond_resched(&trans); -+ } -+ ret = bch2_trans_exit(&trans) ?: ret; -+ if (ret) -+ return ret; -+ -+ mutex_lock(&c->btree_root_lock); -+ b = c->btree_roots[btree_id].b; -+ if (!btree_node_fake(b)) -+ ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key), -+ &max_stale, initial); -+ gc_pos_set(c, gc_pos_btree_root(b->c.btree_id)); -+ mutex_unlock(&c->btree_root_lock); -+ -+ return ret; -+} -+ -+static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, -+ struct journal_keys *journal_keys, -+ unsigned target_depth) -+{ -+ struct btree_and_journal_iter iter; -+ struct bkey_s_c k; -+ struct bpos next_node_start = b->data->min_key; -+ u8 max_stale = 0; -+ int ret = 0; -+ -+ bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b); -+ -+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { -+ bch2_bkey_debugcheck(c, b, k); -+ -+ BUG_ON(bkey_cmp(k.k->p, b->data->min_key) < 0); -+ BUG_ON(bkey_cmp(k.k->p, b->data->max_key) > 0); -+ -+ ret = bch2_gc_mark_key(c, k, &max_stale, true); -+ if (ret) -+ break; -+ -+ if (b->c.level) { -+ struct btree *child; -+ BKEY_PADDED(k) tmp; -+ -+ bkey_reassemble(&tmp.k, k); -+ k = bkey_i_to_s_c(&tmp.k); -+ -+ 
bch2_btree_and_journal_iter_advance(&iter); -+ -+ ret = bch2_gc_check_topology(c, k, -+ &next_node_start, -+ b->data->max_key, -+ !bch2_btree_and_journal_iter_peek(&iter).k); -+ if (ret) -+ break; -+ -+ if (b->c.level > target_depth) { -+ child = bch2_btree_node_get_noiter(c, &tmp.k, -+ b->c.btree_id, b->c.level - 1); -+ ret = PTR_ERR_OR_ZERO(child); -+ if (ret) -+ break; -+ -+ ret = bch2_gc_btree_init_recurse(c, child, -+ journal_keys, target_depth); -+ six_unlock_read(&child->c.lock); -+ -+ if (ret) -+ break; -+ } -+ } else { -+ bch2_btree_and_journal_iter_advance(&iter); -+ } -+ } -+ -+ return ret; -+} -+ -+static int bch2_gc_btree_init(struct bch_fs *c, -+ struct journal_keys *journal_keys, -+ enum btree_id btree_id, -+ bool metadata_only) -+{ -+ struct btree *b; -+ unsigned target_depth = metadata_only ? 1 -+ : expensive_debug_checks(c) ? 0 -+ : !btree_node_type_needs_gc(btree_id) ? 1 -+ : 0; -+ u8 max_stale = 0; -+ int ret = 0; -+ -+ b = c->btree_roots[btree_id].b; -+ -+ if (btree_node_fake(b)) -+ return 0; -+ -+ six_lock_read(&b->c.lock, NULL, NULL); -+ if (fsck_err_on(bkey_cmp(b->data->min_key, POS_MIN), c, -+ "btree root with incorrect min_key: %llu:%llu", -+ b->data->min_key.inode, -+ b->data->min_key.offset)) { -+ BUG(); -+ } -+ -+ if (fsck_err_on(bkey_cmp(b->data->max_key, POS_MAX), c, -+ "btree root with incorrect min_key: %llu:%llu", -+ b->data->max_key.inode, -+ b->data->max_key.offset)) { -+ BUG(); -+ } -+ -+ if (b->c.level >= target_depth) -+ ret = bch2_gc_btree_init_recurse(c, b, -+ journal_keys, target_depth); -+ -+ if (!ret) -+ ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key), -+ &max_stale, true); -+fsck_err: -+ six_unlock_read(&b->c.lock); -+ -+ return ret; -+} -+ -+static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) -+{ -+ return (int) btree_id_to_gc_phase(l) - -+ (int) btree_id_to_gc_phase(r); -+} -+ -+static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, -+ bool initial, bool metadata_only) -+{ 
-+ enum btree_id ids[BTREE_ID_NR]; -+ unsigned i; -+ -+ for (i = 0; i < BTREE_ID_NR; i++) -+ ids[i] = i; -+ bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); -+ -+ for (i = 0; i < BTREE_ID_NR; i++) { -+ enum btree_id id = ids[i]; -+ int ret = initial -+ ? bch2_gc_btree_init(c, journal_keys, -+ id, metadata_only) -+ : bch2_gc_btree(c, id, initial, metadata_only); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca, -+ u64 start, u64 end, -+ enum bch_data_type type, -+ unsigned flags) -+{ -+ u64 b = sector_to_bucket(ca, start); -+ -+ do { -+ unsigned sectors = -+ min_t(u64, bucket_to_sector(ca, b + 1), end) - start; -+ -+ bch2_mark_metadata_bucket(c, ca, b, type, sectors, -+ gc_phase(GC_PHASE_SB), flags); -+ b++; -+ start += sectors; -+ } while (start < end); -+} -+ -+void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, -+ unsigned flags) -+{ -+ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; -+ unsigned i; -+ u64 b; -+ -+ /* -+ * This conditional is kind of gross, but we may be called from the -+ * device add path, before the new device has actually been added to the -+ * running filesystem: -+ */ -+ if (c) { -+ lockdep_assert_held(&c->sb_lock); -+ percpu_down_read(&c->mark_lock); -+ } -+ -+ for (i = 0; i < layout->nr_superblocks; i++) { -+ u64 offset = le64_to_cpu(layout->sb_offset[i]); -+ -+ if (offset == BCH_SB_SECTOR) -+ mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR, -+ BCH_DATA_sb, flags); -+ -+ mark_metadata_sectors(c, ca, offset, -+ offset + (1 << layout->sb_max_size_bits), -+ BCH_DATA_sb, flags); -+ } -+ -+ for (i = 0; i < ca->journal.nr; i++) { -+ b = ca->journal.buckets[i]; -+ bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal, -+ ca->mi.bucket_size, -+ gc_phase(GC_PHASE_SB), flags); -+ } -+ -+ if (c) -+ percpu_up_read(&c->mark_lock); -+} -+ -+static void bch2_mark_superblocks(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ unsigned i; -+ -+ 
mutex_lock(&c->sb_lock); -+ gc_pos_set(c, gc_phase(GC_PHASE_SB)); -+ -+ for_each_online_member(ca, c, i) -+ bch2_mark_dev_superblock(c, ca, BTREE_TRIGGER_GC); -+ mutex_unlock(&c->sb_lock); -+} -+ -+#if 0 -+/* Also see bch2_pending_btree_node_free_insert_done() */ -+static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) -+{ -+ struct btree_update *as; -+ struct pending_btree_node_free *d; -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE)); -+ -+ for_each_pending_btree_node_free(c, as, d) -+ if (d->index_update_done) -+ bch2_mark_key(c, bkey_i_to_s_c(&d->key), -+ 0, 0, NULL, 0, -+ BTREE_TRIGGER_GC); -+ -+ mutex_unlock(&c->btree_interior_update_lock); -+} -+#endif -+ -+static void bch2_mark_allocator_buckets(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ struct open_bucket *ob; -+ size_t i, j, iter; -+ unsigned ci; -+ -+ percpu_down_read(&c->mark_lock); -+ -+ spin_lock(&c->freelist_lock); -+ gc_pos_set(c, gc_pos_alloc(c, NULL)); -+ -+ for_each_member_device(ca, c, ci) { -+ fifo_for_each_entry(i, &ca->free_inc, iter) -+ bch2_mark_alloc_bucket(c, ca, i, true, -+ gc_pos_alloc(c, NULL), -+ BTREE_TRIGGER_GC); -+ -+ -+ -+ for (j = 0; j < RESERVE_NR; j++) -+ fifo_for_each_entry(i, &ca->free[j], iter) -+ bch2_mark_alloc_bucket(c, ca, i, true, -+ gc_pos_alloc(c, NULL), -+ BTREE_TRIGGER_GC); -+ } -+ -+ spin_unlock(&c->freelist_lock); -+ -+ for (ob = c->open_buckets; -+ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); -+ ob++) { -+ spin_lock(&ob->lock); -+ if (ob->valid) { -+ gc_pos_set(c, gc_pos_alloc(c, ob)); -+ ca = bch_dev_bkey_exists(c, ob->ptr.dev); -+ bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true, -+ gc_pos_alloc(c, ob), -+ BTREE_TRIGGER_GC); -+ } -+ spin_unlock(&ob->lock); -+ } -+ -+ percpu_up_read(&c->mark_lock); -+} -+ -+static void bch2_gc_free(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ unsigned i; -+ -+ genradix_free(&c->stripes[1]); -+ -+ for_each_member_device(ca, c, i) { -+ 
kvpfree(rcu_dereference_protected(ca->buckets[1], 1), -+ sizeof(struct bucket_array) + -+ ca->mi.nbuckets * sizeof(struct bucket)); -+ ca->buckets[1] = NULL; -+ -+ free_percpu(ca->usage[1]); -+ ca->usage[1] = NULL; -+ } -+ -+ free_percpu(c->usage_gc); -+ c->usage_gc = NULL; -+} -+ -+static int bch2_gc_done(struct bch_fs *c, -+ bool initial, bool metadata_only) -+{ -+ struct bch_dev *ca; -+ bool verify = !metadata_only && -+ (!initial || -+ (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO))); -+ unsigned i; -+ int ret = 0; -+ -+#define copy_field(_f, _msg, ...) \ -+ if (dst->_f != src->_f) { \ -+ if (verify) \ -+ fsck_err(c, _msg ": got %llu, should be %llu" \ -+ , ##__VA_ARGS__, dst->_f, src->_f); \ -+ dst->_f = src->_f; \ -+ ret = 1; \ -+ } -+#define copy_stripe_field(_f, _msg, ...) \ -+ if (dst->_f != src->_f) { \ -+ if (verify) \ -+ fsck_err(c, "stripe %zu has wrong "_msg \ -+ ": got %u, should be %u", \ -+ dst_iter.pos, ##__VA_ARGS__, \ -+ dst->_f, src->_f); \ -+ dst->_f = src->_f; \ -+ dst->dirty = true; \ -+ ret = 1; \ -+ } -+#define copy_bucket_field(_f) \ -+ if (dst->b[b].mark._f != src->b[b].mark._f) { \ -+ if (verify) \ -+ fsck_err(c, "bucket %u:%zu gen %u data type %s has wrong " #_f \ -+ ": got %u, should be %u", i, b, \ -+ dst->b[b].mark.gen, \ -+ bch2_data_types[dst->b[b].mark.data_type],\ -+ dst->b[b].mark._f, src->b[b].mark._f); \ -+ dst->b[b]._mark._f = src->b[b].mark._f; \ -+ ret = 1; \ -+ } -+#define copy_dev_field(_f, _msg, ...) \ -+ copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__) -+#define copy_fs_field(_f, _msg, ...) 
\ -+ copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) -+ -+ if (!metadata_only) { -+ struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0); -+ struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0); -+ struct stripe *dst, *src; -+ unsigned i; -+ -+ c->ec_stripes_heap.used = 0; -+ -+ while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) && -+ (src = genradix_iter_peek(&src_iter, &c->stripes[1]))) { -+ BUG_ON(src_iter.pos != dst_iter.pos); -+ -+ copy_stripe_field(alive, "alive"); -+ copy_stripe_field(sectors, "sectors"); -+ copy_stripe_field(algorithm, "algorithm"); -+ copy_stripe_field(nr_blocks, "nr_blocks"); -+ copy_stripe_field(nr_redundant, "nr_redundant"); -+ copy_stripe_field(blocks_nonempty, -+ "blocks_nonempty"); -+ -+ for (i = 0; i < ARRAY_SIZE(dst->block_sectors); i++) -+ copy_stripe_field(block_sectors[i], -+ "block_sectors[%u]", i); -+ -+ if (dst->alive) { -+ spin_lock(&c->ec_stripes_heap_lock); -+ bch2_stripes_heap_insert(c, dst, dst_iter.pos); -+ spin_unlock(&c->ec_stripes_heap_lock); -+ } -+ -+ genradix_iter_advance(&dst_iter, &c->stripes[0]); -+ genradix_iter_advance(&src_iter, &c->stripes[1]); -+ } -+ } -+ -+ for_each_member_device(ca, c, i) { -+ struct bucket_array *dst = __bucket_array(ca, 0); -+ struct bucket_array *src = __bucket_array(ca, 1); -+ size_t b; -+ -+ for (b = 0; b < src->nbuckets; b++) { -+ copy_bucket_field(gen); -+ copy_bucket_field(data_type); -+ copy_bucket_field(owned_by_allocator); -+ copy_bucket_field(stripe); -+ copy_bucket_field(dirty_sectors); -+ copy_bucket_field(cached_sectors); -+ -+ dst->b[b].oldest_gen = src->b[b].oldest_gen; -+ } -+ }; -+ -+ bch2_fs_usage_acc_to_base(c, 0); -+ bch2_fs_usage_acc_to_base(c, 1); -+ -+ bch2_dev_usage_from_buckets(c); -+ -+ { -+ unsigned nr = fs_usage_u64s(c); -+ struct bch_fs_usage *dst = c->usage_base; -+ struct bch_fs_usage *src = (void *) -+ bch2_acc_percpu_u64s((void *) c->usage_gc, nr); -+ -+ copy_fs_field(hidden, "hidden"); -+ 
copy_fs_field(btree, "btree"); -+ -+ if (!metadata_only) { -+ copy_fs_field(data, "data"); -+ copy_fs_field(cached, "cached"); -+ copy_fs_field(reserved, "reserved"); -+ copy_fs_field(nr_inodes,"nr_inodes"); -+ -+ for (i = 0; i < BCH_REPLICAS_MAX; i++) -+ copy_fs_field(persistent_reserved[i], -+ "persistent_reserved[%i]", i); -+ } -+ -+ for (i = 0; i < c->replicas.nr; i++) { -+ struct bch_replicas_entry *e = -+ cpu_replicas_entry(&c->replicas, i); -+ char buf[80]; -+ -+ if (metadata_only && -+ (e->data_type == BCH_DATA_user || -+ e->data_type == BCH_DATA_cached)) -+ continue; -+ -+ bch2_replicas_entry_to_text(&PBUF(buf), e); -+ -+ copy_fs_field(replicas[i], "%s", buf); -+ } -+ } -+ -+#undef copy_fs_field -+#undef copy_dev_field -+#undef copy_bucket_field -+#undef copy_stripe_field -+#undef copy_field -+fsck_err: -+ return ret; -+} -+ -+static int bch2_gc_start(struct bch_fs *c, -+ bool metadata_only) -+{ -+ struct bch_dev *ca; -+ unsigned i; -+ int ret; -+ -+ BUG_ON(c->usage_gc); -+ -+ c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64), -+ sizeof(u64), GFP_KERNEL); -+ if (!c->usage_gc) { -+ bch_err(c, "error allocating c->usage_gc"); -+ return -ENOMEM; -+ } -+ -+ for_each_member_device(ca, c, i) { -+ BUG_ON(ca->buckets[1]); -+ BUG_ON(ca->usage[1]); -+ -+ ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) + -+ ca->mi.nbuckets * sizeof(struct bucket), -+ GFP_KERNEL|__GFP_ZERO); -+ if (!ca->buckets[1]) { -+ percpu_ref_put(&ca->ref); -+ bch_err(c, "error allocating ca->buckets[gc]"); -+ return -ENOMEM; -+ } -+ -+ ca->usage[1] = alloc_percpu(struct bch_dev_usage); -+ if (!ca->usage[1]) { -+ bch_err(c, "error allocating ca->usage[gc]"); -+ percpu_ref_put(&ca->ref); -+ return -ENOMEM; -+ } -+ } -+ -+ ret = bch2_ec_mem_alloc(c, true); -+ if (ret) { -+ bch_err(c, "error allocating ec gc mem"); -+ return ret; -+ } -+ -+ percpu_down_write(&c->mark_lock); -+ -+ /* -+ * indicate to stripe code that we need to allocate for the gc stripes -+ * radix tree, too -+ 
*/ -+ gc_pos_set(c, gc_phase(GC_PHASE_START)); -+ -+ for_each_member_device(ca, c, i) { -+ struct bucket_array *dst = __bucket_array(ca, 1); -+ struct bucket_array *src = __bucket_array(ca, 0); -+ size_t b; -+ -+ dst->first_bucket = src->first_bucket; -+ dst->nbuckets = src->nbuckets; -+ -+ for (b = 0; b < src->nbuckets; b++) { -+ struct bucket *d = &dst->b[b]; -+ struct bucket *s = &src->b[b]; -+ -+ d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen; -+ d->gen_valid = s->gen_valid; -+ -+ if (metadata_only && -+ (s->mark.data_type == BCH_DATA_user || -+ s->mark.data_type == BCH_DATA_cached)) { -+ d->_mark = s->mark; -+ d->_mark.owned_by_allocator = 0; -+ } -+ } -+ }; -+ -+ percpu_up_write(&c->mark_lock); -+ -+ return 0; -+} -+ -+/** -+ * bch2_gc - walk _all_ references to buckets, and recompute them: -+ * -+ * Order matters here: -+ * - Concurrent GC relies on the fact that we have a total ordering for -+ * everything that GC walks - see gc_will_visit_node(), -+ * gc_will_visit_root() -+ * -+ * - also, references move around in the course of index updates and -+ * various other crap: everything needs to agree on the ordering -+ * references are allowed to move around in - e.g., we're allowed to -+ * start with a reference owned by an open_bucket (the allocator) and -+ * move it to the btree, but not the reverse. 
-+ * -+ * This is necessary to ensure that gc doesn't miss references that -+ * move around - if references move backwards in the ordering GC -+ * uses, GC could skip past them -+ */ -+int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys, -+ bool initial, bool metadata_only) -+{ -+ struct bch_dev *ca; -+ u64 start_time = local_clock(); -+ unsigned i, iter = 0; -+ int ret; -+ -+ lockdep_assert_held(&c->state_lock); -+ trace_gc_start(c); -+ -+ down_write(&c->gc_lock); -+ -+ /* flush interior btree updates: */ -+ closure_wait_event(&c->btree_interior_update_wait, -+ !bch2_btree_interior_updates_nr_pending(c)); -+again: -+ ret = bch2_gc_start(c, metadata_only); -+ if (ret) -+ goto out; -+ -+ bch2_mark_superblocks(c); -+ -+ ret = bch2_gc_btrees(c, journal_keys, initial, metadata_only); -+ if (ret) -+ goto out; -+ -+#if 0 -+ bch2_mark_pending_btree_node_frees(c); -+#endif -+ bch2_mark_allocator_buckets(c); -+ -+ c->gc_count++; -+out: -+ if (!ret && -+ (test_bit(BCH_FS_FIXED_GENS, &c->flags) || -+ (!iter && test_restart_gc(c)))) { -+ /* -+ * XXX: make sure gens we fixed got saved -+ */ -+ if (iter++ <= 2) { -+ bch_info(c, "Fixed gens, restarting mark and sweep:"); -+ clear_bit(BCH_FS_FIXED_GENS, &c->flags); -+ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); -+ -+ percpu_down_write(&c->mark_lock); -+ bch2_gc_free(c); -+ percpu_up_write(&c->mark_lock); -+ /* flush fsck errors, reset counters */ -+ bch2_flush_fsck_errs(c); -+ -+ goto again; -+ } -+ -+ bch_info(c, "Unable to fix bucket gens, looping"); -+ ret = -EINVAL; -+ } -+ -+ if (!ret) { -+ bch2_journal_block(&c->journal); -+ -+ percpu_down_write(&c->mark_lock); -+ ret = bch2_gc_done(c, initial, metadata_only); -+ -+ bch2_journal_unblock(&c->journal); -+ } else { -+ percpu_down_write(&c->mark_lock); -+ } -+ -+ /* Indicates that gc is no longer in progress: */ -+ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); -+ -+ bch2_gc_free(c); -+ percpu_up_write(&c->mark_lock); -+ -+ up_write(&c->gc_lock); -+ -+ 
trace_gc_end(c); -+ bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); -+ -+ /* -+ * Wake up allocator in case it was waiting for buckets -+ * because of not being able to inc gens -+ */ -+ for_each_member_device(ca, c, i) -+ bch2_wake_allocator(ca); -+ -+ /* -+ * At startup, allocations can happen directly instead of via the -+ * allocator thread - issue wakeup in case they blocked on gc_lock: -+ */ -+ closure_wake_up(&c->freelist_wait); -+ return ret; -+} -+ -+static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const struct bch_extent_ptr *ptr; -+ -+ percpu_down_read(&c->mark_lock); -+ bkey_for_each_ptr(ptrs, ptr) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ struct bucket *g = PTR_BUCKET(ca, ptr, false); -+ -+ if (gen_after(g->mark.gen, ptr->gen) > 16) { -+ percpu_up_read(&c->mark_lock); -+ return true; -+ } -+ } -+ -+ bkey_for_each_ptr(ptrs, ptr) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ struct bucket *g = PTR_BUCKET(ca, ptr, false); -+ -+ if (gen_after(g->gc_gen, ptr->gen)) -+ g->gc_gen = ptr->gen; -+ } -+ percpu_up_read(&c->mark_lock); -+ -+ return false; -+} -+ -+/* -+ * For recalculating oldest gen, we only need to walk keys in leaf nodes; btree -+ * node pointers currently never have cached pointers that can become stale: -+ */ -+static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct bkey_on_stack sk; -+ int ret = 0; -+ -+ bkey_on_stack_init(&sk); -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN, -+ BTREE_ITER_PREFETCH); -+ -+ while ((k = bch2_btree_iter_peek(iter)).k && -+ !(ret = bkey_err(k))) { -+ if (gc_btree_gens_key(c, k)) { -+ bkey_on_stack_reassemble(&sk, c, k); -+ bch2_extent_normalize(c, bkey_i_to_s(sk.k)); -+ -+ bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); 
-+ -+ bch2_trans_update(&trans, iter, sk.k, 0); -+ -+ ret = bch2_trans_commit(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL); -+ if (ret == -EINTR) -+ continue; -+ if (ret) { -+ break; -+ } -+ } -+ -+ bch2_btree_iter_next(iter); -+ } -+ -+ bch2_trans_exit(&trans); -+ bkey_on_stack_exit(&sk, c); -+ -+ return ret; -+} -+ -+int bch2_gc_gens(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ struct bucket_array *buckets; -+ struct bucket *g; -+ unsigned i; -+ int ret; -+ -+ /* -+ * Ideally we would be using state_lock and not gc_lock here, but that -+ * introduces a deadlock in the RO path - we currently take the state -+ * lock at the start of going RO, thus the gc thread may get stuck: -+ */ -+ down_read(&c->gc_lock); -+ -+ for_each_member_device(ca, c, i) { -+ down_read(&ca->bucket_lock); -+ buckets = bucket_array(ca); -+ -+ for_each_bucket(g, buckets) -+ g->gc_gen = g->mark.gen; -+ up_read(&ca->bucket_lock); -+ } -+ -+ for (i = 0; i < BTREE_ID_NR; i++) -+ if (btree_node_type_needs_gc(i)) { -+ ret = bch2_gc_btree_gens(c, i); -+ if (ret) { -+ bch_err(c, "error recalculating oldest_gen: %i", ret); -+ goto err; -+ } -+ } -+ -+ for_each_member_device(ca, c, i) { -+ down_read(&ca->bucket_lock); -+ buckets = bucket_array(ca); -+ -+ for_each_bucket(g, buckets) -+ g->oldest_gen = g->gc_gen; -+ up_read(&ca->bucket_lock); -+ } -+ -+ c->gc_count++; -+err: -+ up_read(&c->gc_lock); -+ return ret; -+} -+ -+/* Btree coalescing */ -+ -+static void recalc_packed_keys(struct btree *b) -+{ -+ struct bset *i = btree_bset_first(b); -+ struct bkey_packed *k; -+ -+ memset(&b->nr, 0, sizeof(b->nr)); -+ -+ BUG_ON(b->nsets != 1); -+ -+ vstruct_for_each(i, k) -+ btree_keys_account_key_add(&b->nr, 0, k); -+} -+ -+static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, -+ struct btree *old_nodes[GC_MERGE_NODES]) -+{ -+ struct btree *parent = btree_node_parent(iter, old_nodes[0]); -+ unsigned i, nr_old_nodes, nr_new_nodes, u64s = 0; -+ unsigned blocks = btree_blocks(c) * 2 / 3; -+ 
struct btree *new_nodes[GC_MERGE_NODES]; -+ struct btree_update *as; -+ struct keylist keylist; -+ struct bkey_format_state format_state; -+ struct bkey_format new_format; -+ -+ memset(new_nodes, 0, sizeof(new_nodes)); -+ bch2_keylist_init(&keylist, NULL); -+ -+ /* Count keys that are not deleted */ -+ for (i = 0; i < GC_MERGE_NODES && old_nodes[i]; i++) -+ u64s += old_nodes[i]->nr.live_u64s; -+ -+ nr_old_nodes = nr_new_nodes = i; -+ -+ /* Check if all keys in @old_nodes could fit in one fewer node */ -+ if (nr_old_nodes <= 1 || -+ __vstruct_blocks(struct btree_node, c->block_bits, -+ DIV_ROUND_UP(u64s, nr_old_nodes - 1)) > blocks) -+ return; -+ -+ /* Find a format that all keys in @old_nodes can pack into */ -+ bch2_bkey_format_init(&format_state); -+ -+ for (i = 0; i < nr_old_nodes; i++) -+ __bch2_btree_calc_format(&format_state, old_nodes[i]); -+ -+ new_format = bch2_bkey_format_done(&format_state); -+ -+ /* Check if repacking would make any nodes too big to fit */ -+ for (i = 0; i < nr_old_nodes; i++) -+ if (!bch2_btree_node_format_fits(c, old_nodes[i], &new_format)) { -+ trace_btree_gc_coalesce_fail(c, -+ BTREE_GC_COALESCE_FAIL_FORMAT_FITS); -+ return; -+ } -+ -+ if (bch2_keylist_realloc(&keylist, NULL, 0, -+ (BKEY_U64s + BKEY_EXTENT_U64s_MAX) * nr_old_nodes)) { -+ trace_btree_gc_coalesce_fail(c, -+ BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC); -+ return; -+ } -+ -+ as = bch2_btree_update_start(iter->trans, iter->btree_id, -+ btree_update_reserve_required(c, parent) + nr_old_nodes, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_USE_RESERVE, -+ NULL); -+ if (IS_ERR(as)) { -+ trace_btree_gc_coalesce_fail(c, -+ BTREE_GC_COALESCE_FAIL_RESERVE_GET); -+ bch2_keylist_free(&keylist, NULL); -+ return; -+ } -+ -+ trace_btree_gc_coalesce(c, old_nodes[0]); -+ -+ for (i = 0; i < nr_old_nodes; i++) -+ bch2_btree_interior_update_will_free_node(as, old_nodes[i]); -+ -+ /* Repack everything with @new_format and sort down to one bset */ -+ for (i = 0; i < nr_old_nodes; i++) -+ new_nodes[i] 
= -+ __bch2_btree_node_alloc_replacement(as, old_nodes[i], -+ new_format); -+ -+ /* -+ * Conceptually we concatenate the nodes together and slice them -+ * up at different boundaries. -+ */ -+ for (i = nr_new_nodes - 1; i > 0; --i) { -+ struct btree *n1 = new_nodes[i]; -+ struct btree *n2 = new_nodes[i - 1]; -+ -+ struct bset *s1 = btree_bset_first(n1); -+ struct bset *s2 = btree_bset_first(n2); -+ struct bkey_packed *k, *last = NULL; -+ -+ /* Calculate how many keys from @n2 we could fit inside @n1 */ -+ u64s = 0; -+ -+ for (k = s2->start; -+ k < vstruct_last(s2) && -+ vstruct_blocks_plus(n1->data, c->block_bits, -+ u64s + k->u64s) <= blocks; -+ k = bkey_next_skip_noops(k, vstruct_last(s2))) { -+ last = k; -+ u64s += k->u64s; -+ } -+ -+ if (u64s == le16_to_cpu(s2->u64s)) { -+ /* n2 fits entirely in n1 */ -+ n1->key.k.p = n1->data->max_key = n2->data->max_key; -+ -+ memcpy_u64s(vstruct_last(s1), -+ s2->start, -+ le16_to_cpu(s2->u64s)); -+ le16_add_cpu(&s1->u64s, le16_to_cpu(s2->u64s)); -+ -+ set_btree_bset_end(n1, n1->set); -+ -+ six_unlock_write(&n2->c.lock); -+ bch2_btree_node_free_never_inserted(c, n2); -+ six_unlock_intent(&n2->c.lock); -+ -+ memmove(new_nodes + i - 1, -+ new_nodes + i, -+ sizeof(new_nodes[0]) * (nr_new_nodes - i)); -+ new_nodes[--nr_new_nodes] = NULL; -+ } else if (u64s) { -+ /* move part of n2 into n1 */ -+ n1->key.k.p = n1->data->max_key = -+ bkey_unpack_pos(n1, last); -+ -+ n2->data->min_key = bkey_successor(n1->data->max_key); -+ -+ memcpy_u64s(vstruct_last(s1), -+ s2->start, u64s); -+ le16_add_cpu(&s1->u64s, u64s); -+ -+ memmove(s2->start, -+ vstruct_idx(s2, u64s), -+ (le16_to_cpu(s2->u64s) - u64s) * sizeof(u64)); -+ s2->u64s = cpu_to_le16(le16_to_cpu(s2->u64s) - u64s); -+ -+ set_btree_bset_end(n1, n1->set); -+ set_btree_bset_end(n2, n2->set); -+ } -+ } -+ -+ for (i = 0; i < nr_new_nodes; i++) { -+ struct btree *n = new_nodes[i]; -+ -+ recalc_packed_keys(n); -+ btree_node_reset_sib_u64s(n); -+ -+ bch2_btree_build_aux_trees(n); -+ -+ 
bch2_btree_update_add_new_node(as, n); -+ six_unlock_write(&n->c.lock); -+ -+ bch2_btree_node_write(c, n, SIX_LOCK_intent); -+ } -+ -+ /* -+ * The keys for the old nodes get deleted. We don't want to insert keys -+ * that compare equal to the keys for the new nodes we'll also be -+ * inserting - we can't because keys on a keylist must be strictly -+ * greater than the previous keys, and we also don't need to since the -+ * key for the new node will serve the same purpose (overwriting the key -+ * for the old node). -+ */ -+ for (i = 0; i < nr_old_nodes; i++) { -+ struct bkey_i delete; -+ unsigned j; -+ -+ for (j = 0; j < nr_new_nodes; j++) -+ if (!bkey_cmp(old_nodes[i]->key.k.p, -+ new_nodes[j]->key.k.p)) -+ goto next; -+ -+ bkey_init(&delete.k); -+ delete.k.p = old_nodes[i]->key.k.p; -+ bch2_keylist_add_in_order(&keylist, &delete); -+next: -+ i = i; -+ } -+ -+ /* -+ * Keys for the new nodes get inserted: bch2_btree_insert_keys() only -+ * does the lookup once and thus expects the keys to be in sorted order -+ * so we have to make sure the new keys are correctly ordered with -+ * respect to the deleted keys added in the previous loop -+ */ -+ for (i = 0; i < nr_new_nodes; i++) -+ bch2_keylist_add_in_order(&keylist, &new_nodes[i]->key); -+ -+ /* Insert the newly coalesced nodes */ -+ bch2_btree_insert_node(as, parent, iter, &keylist, 0); -+ -+ BUG_ON(!bch2_keylist_empty(&keylist)); -+ -+ BUG_ON(iter->l[old_nodes[0]->c.level].b != old_nodes[0]); -+ -+ bch2_btree_iter_node_replace(iter, new_nodes[0]); -+ -+ for (i = 0; i < nr_new_nodes; i++) -+ bch2_btree_update_get_open_buckets(as, new_nodes[i]); -+ -+ /* Free the old nodes and update our sliding window */ -+ for (i = 0; i < nr_old_nodes; i++) { -+ bch2_btree_node_free_inmem(c, old_nodes[i], iter); -+ -+ /* -+ * the index update might have triggered a split, in which case -+ * the nodes we coalesced - the new nodes we just created - -+ * might not be sibling nodes anymore - don't add them to the -+ * sliding window 
(except the first): -+ */ -+ if (!i) { -+ old_nodes[i] = new_nodes[i]; -+ } else { -+ old_nodes[i] = NULL; -+ } -+ } -+ -+ for (i = 0; i < nr_new_nodes; i++) -+ six_unlock_intent(&new_nodes[i]->c.lock); -+ -+ bch2_btree_update_done(as); -+ bch2_keylist_free(&keylist, NULL); -+} -+ -+static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct btree *b; -+ bool kthread = (current->flags & PF_KTHREAD) != 0; -+ unsigned i; -+ -+ /* Sliding window of adjacent btree nodes */ -+ struct btree *merge[GC_MERGE_NODES]; -+ u32 lock_seq[GC_MERGE_NODES]; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ /* -+ * XXX: We don't have a good way of positively matching on sibling nodes -+ * that have the same parent - this code works by handling the cases -+ * where they might not have the same parent, and is thus fragile. Ugh. -+ * -+ * Perhaps redo this to use multiple linked iterators? -+ */ -+ memset(merge, 0, sizeof(merge)); -+ -+ __for_each_btree_node(&trans, iter, btree_id, POS_MIN, -+ BTREE_MAX_DEPTH, 0, -+ BTREE_ITER_PREFETCH, b) { -+ memmove(merge + 1, merge, -+ sizeof(merge) - sizeof(merge[0])); -+ memmove(lock_seq + 1, lock_seq, -+ sizeof(lock_seq) - sizeof(lock_seq[0])); -+ -+ merge[0] = b; -+ -+ for (i = 1; i < GC_MERGE_NODES; i++) { -+ if (!merge[i] || -+ !six_relock_intent(&merge[i]->c.lock, lock_seq[i])) -+ break; -+ -+ if (merge[i]->c.level != merge[0]->c.level) { -+ six_unlock_intent(&merge[i]->c.lock); -+ break; -+ } -+ } -+ memset(merge + i, 0, (GC_MERGE_NODES - i) * sizeof(merge[0])); -+ -+ bch2_coalesce_nodes(c, iter, merge); -+ -+ for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) { -+ lock_seq[i] = merge[i]->c.lock.state.seq; -+ six_unlock_intent(&merge[i]->c.lock); -+ } -+ -+ lock_seq[0] = merge[0]->c.lock.state.seq; -+ -+ if (kthread && kthread_should_stop()) { -+ bch2_trans_exit(&trans); -+ return -ESHUTDOWN; -+ } -+ -+ bch2_trans_cond_resched(&trans); -+ -+ /* -+ * If the parent 
node wasn't relocked, it might have been split -+ * and the nodes in our sliding window might not have the same -+ * parent anymore - blow away the sliding window: -+ */ -+ if (btree_iter_node(iter, iter->level + 1) && -+ !btree_node_intent_locked(iter, iter->level + 1)) -+ memset(merge + 1, 0, -+ (GC_MERGE_NODES - 1) * sizeof(merge[0])); -+ } -+ return bch2_trans_exit(&trans); -+} -+ -+/** -+ * bch_coalesce - coalesce adjacent nodes with low occupancy -+ */ -+void bch2_coalesce(struct bch_fs *c) -+{ -+ enum btree_id id; -+ -+ down_read(&c->gc_lock); -+ trace_gc_coalesce_start(c); -+ -+ for (id = 0; id < BTREE_ID_NR; id++) { -+ int ret = c->btree_roots[id].b -+ ? bch2_coalesce_btree(c, id) -+ : 0; -+ -+ if (ret) { -+ if (ret != -ESHUTDOWN) -+ bch_err(c, "btree coalescing failed: %d", ret); -+ return; -+ } -+ } -+ -+ trace_gc_coalesce_end(c); -+ up_read(&c->gc_lock); -+} -+ -+static int bch2_gc_thread(void *arg) -+{ -+ struct bch_fs *c = arg; -+ struct io_clock *clock = &c->io_clock[WRITE]; -+ unsigned long last = atomic_long_read(&clock->now); -+ unsigned last_kick = atomic_read(&c->kick_gc); -+ int ret; -+ -+ set_freezable(); -+ -+ while (1) { -+ while (1) { -+ set_current_state(TASK_INTERRUPTIBLE); -+ -+ if (kthread_should_stop()) { -+ __set_current_state(TASK_RUNNING); -+ return 0; -+ } -+ -+ if (atomic_read(&c->kick_gc) != last_kick) -+ break; -+ -+ if (c->btree_gc_periodic) { -+ unsigned long next = last + c->capacity / 16; -+ -+ if (atomic_long_read(&clock->now) >= next) -+ break; -+ -+ bch2_io_clock_schedule_timeout(clock, next); -+ } else { -+ schedule(); -+ } -+ -+ try_to_freeze(); -+ } -+ __set_current_state(TASK_RUNNING); -+ -+ last = atomic_long_read(&clock->now); -+ last_kick = atomic_read(&c->kick_gc); -+ -+ /* -+ * Full gc is currently incompatible with btree key cache: -+ */ -+#if 0 -+ ret = bch2_gc(c, NULL, false, false); -+#else -+ ret = bch2_gc_gens(c); -+#endif -+ if (ret < 0) -+ bch_err(c, "btree gc failed: %i", ret); -+ -+ 
debug_check_no_locks_held(); -+ } -+ -+ return 0; -+} -+ -+void bch2_gc_thread_stop(struct bch_fs *c) -+{ -+ struct task_struct *p; -+ -+ p = c->gc_thread; -+ c->gc_thread = NULL; -+ -+ if (p) { -+ kthread_stop(p); -+ put_task_struct(p); -+ } -+} -+ -+int bch2_gc_thread_start(struct bch_fs *c) -+{ -+ struct task_struct *p; -+ -+ BUG_ON(c->gc_thread); -+ -+ p = kthread_create(bch2_gc_thread, c, "bch_gc"); -+ if (IS_ERR(p)) -+ return PTR_ERR(p); -+ -+ get_task_struct(p); -+ c->gc_thread = p; -+ wake_up_process(p); -+ return 0; -+} -diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h -new file mode 100644 -index 000000000000..3694a3df62a8 ---- /dev/null -+++ b/fs/bcachefs/btree_gc.h -@@ -0,0 +1,121 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BTREE_GC_H -+#define _BCACHEFS_BTREE_GC_H -+ -+#include "btree_types.h" -+ -+void bch2_coalesce(struct bch_fs *); -+ -+struct journal_keys; -+int bch2_gc(struct bch_fs *, struct journal_keys *, bool, bool); -+int bch2_gc_gens(struct bch_fs *); -+void bch2_gc_thread_stop(struct bch_fs *); -+int bch2_gc_thread_start(struct bch_fs *); -+void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned); -+ -+/* -+ * For concurrent mark and sweep (with other index updates), we define a total -+ * ordering of _all_ references GC walks: -+ * -+ * Note that some references will have the same GC position as others - e.g. -+ * everything within the same btree node; in those cases we're relying on -+ * whatever locking exists for where those references live, i.e. the write lock -+ * on a btree node. -+ * -+ * That locking is also required to ensure GC doesn't pass the updater in -+ * between the updater adding/removing the reference and updating the GC marks; -+ * without that, we would at best double count sometimes. -+ * -+ * That part is important - whenever calling bch2_mark_pointers(), a lock _must_ -+ * be held that prevents GC from passing the position the updater is at. 
-+ * -+ * (What about the start of gc, when we're clearing all the marks? GC clears the -+ * mark with the gc pos seqlock held, and bch_mark_bucket checks against the gc -+ * position inside its cmpxchg loop, so crap magically works). -+ */ -+ -+/* Position of (the start of) a gc phase: */ -+static inline struct gc_pos gc_phase(enum gc_phase phase) -+{ -+ return (struct gc_pos) { -+ .phase = phase, -+ .pos = POS_MIN, -+ .level = 0, -+ }; -+} -+ -+static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) -+{ -+ if (l.phase != r.phase) -+ return l.phase < r.phase ? -1 : 1; -+ if (bkey_cmp(l.pos, r.pos)) -+ return bkey_cmp(l.pos, r.pos); -+ if (l.level != r.level) -+ return l.level < r.level ? -1 : 1; -+ return 0; -+} -+ -+static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id) -+{ -+ switch (id) { -+#define x(n, v, s) case BTREE_ID_##n: return GC_PHASE_BTREE_##n; -+ BCH_BTREE_IDS() -+#undef x -+ default: -+ BUG(); -+ } -+} -+ -+static inline struct gc_pos gc_pos_btree(enum btree_id id, -+ struct bpos pos, unsigned level) -+{ -+ return (struct gc_pos) { -+ .phase = btree_id_to_gc_phase(id), -+ .pos = pos, -+ .level = level, -+ }; -+} -+ -+/* -+ * GC position of the pointers within a btree node: note, _not_ for &b->key -+ * itself, that lives in the parent node: -+ */ -+static inline struct gc_pos gc_pos_btree_node(struct btree *b) -+{ -+ return gc_pos_btree(b->c.btree_id, b->key.k.p, b->c.level); -+} -+ -+/* -+ * GC position of the pointer to a btree root: we don't use -+ * gc_pos_pointer_to_btree_node() here to avoid a potential race with -+ * btree_split() increasing the tree depth - the new root will have level > the -+ * old root and thus have a greater gc position than the old root, but that -+ * would be incorrect since once gc has marked the root it's not coming back. 
-+ */ -+static inline struct gc_pos gc_pos_btree_root(enum btree_id id) -+{ -+ return gc_pos_btree(id, POS_MAX, BTREE_MAX_DEPTH); -+} -+ -+static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *ob) -+{ -+ return (struct gc_pos) { -+ .phase = GC_PHASE_ALLOC, -+ .pos = POS(ob ? ob - c->open_buckets : 0, 0), -+ }; -+} -+ -+static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) -+{ -+ unsigned seq; -+ bool ret; -+ -+ do { -+ seq = read_seqcount_begin(&c->gc_pos_lock); -+ ret = gc_pos_cmp(pos, c->gc_pos) <= 0; -+ } while (read_seqcount_retry(&c->gc_pos_lock, seq)); -+ -+ return ret; -+} -+ -+#endif /* _BCACHEFS_BTREE_GC_H */ -diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c -new file mode 100644 -index 000000000000..2f5097218f9c ---- /dev/null -+++ b/fs/bcachefs/btree_io.c -@@ -0,0 +1,1834 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "bkey_methods.h" -+#include "bkey_sort.h" -+#include "btree_cache.h" -+#include "btree_io.h" -+#include "btree_iter.h" -+#include "btree_locking.h" -+#include "btree_update.h" -+#include "btree_update_interior.h" -+#include "buckets.h" -+#include "checksum.h" -+#include "debug.h" -+#include "error.h" -+#include "extents.h" -+#include "io.h" -+#include "journal_reclaim.h" -+#include "journal_seq_blacklist.h" -+#include "super-io.h" -+ -+#include -+#include -+ -+static void verify_no_dups(struct btree *b, -+ struct bkey_packed *start, -+ struct bkey_packed *end, -+ bool extents) -+{ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ struct bkey_packed *k, *p; -+ -+ if (start == end) -+ return; -+ -+ for (p = start, k = bkey_next_skip_noops(start, end); -+ k != end; -+ p = k, k = bkey_next_skip_noops(k, end)) { -+ struct bkey l = bkey_unpack_key(b, p); -+ struct bkey r = bkey_unpack_key(b, k); -+ -+ BUG_ON(extents -+ ? 
bkey_cmp(l.p, bkey_start_pos(&r)) > 0 -+ : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0); -+ //BUG_ON(bkey_cmp_packed(&b->format, p, k) >= 0); -+ } -+#endif -+} -+ -+static void set_needs_whiteout(struct bset *i, int v) -+{ -+ struct bkey_packed *k; -+ -+ for (k = i->start; -+ k != vstruct_last(i); -+ k = bkey_next_skip_noops(k, vstruct_last(i))) -+ k->needs_whiteout = v; -+} -+ -+static void btree_bounce_free(struct bch_fs *c, size_t size, -+ bool used_mempool, void *p) -+{ -+ if (used_mempool) -+ mempool_free(p, &c->btree_bounce_pool); -+ else -+ vpfree(p, size); -+} -+ -+static void *btree_bounce_alloc(struct bch_fs *c, size_t size, -+ bool *used_mempool) -+{ -+ unsigned flags = memalloc_nofs_save(); -+ void *p; -+ -+ BUG_ON(size > btree_bytes(c)); -+ -+ *used_mempool = false; -+ p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT); -+ if (!p) { -+ *used_mempool = true; -+ p = mempool_alloc(&c->btree_bounce_pool, GFP_NOIO); -+ } -+ memalloc_nofs_restore(flags); -+ return p; -+} -+ -+static void sort_bkey_ptrs(const struct btree *bt, -+ struct bkey_packed **ptrs, unsigned nr) -+{ -+ unsigned n = nr, a = nr / 2, b, c, d; -+ -+ if (!a) -+ return; -+ -+ /* Heap sort: see lib/sort.c: */ -+ while (1) { -+ if (a) -+ a--; -+ else if (--n) -+ swap(ptrs[0], ptrs[n]); -+ else -+ break; -+ -+ for (b = a; c = 2 * b + 1, (d = c + 1) < n;) -+ b = bkey_cmp_packed(bt, -+ ptrs[c], -+ ptrs[d]) >= 0 ? 
c : d; -+ if (d == n) -+ b = c; -+ -+ while (b != a && -+ bkey_cmp_packed(bt, -+ ptrs[a], -+ ptrs[b]) >= 0) -+ b = (b - 1) / 2; -+ c = b; -+ while (b != a) { -+ b = (b - 1) / 2; -+ swap(ptrs[b], ptrs[c]); -+ } -+ } -+} -+ -+static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) -+{ -+ struct bkey_packed *new_whiteouts, **ptrs, **ptrs_end, *k; -+ bool used_mempool = false; -+ size_t bytes = b->whiteout_u64s * sizeof(u64); -+ -+ if (!b->whiteout_u64s) -+ return; -+ -+ new_whiteouts = btree_bounce_alloc(c, bytes, &used_mempool); -+ -+ ptrs = ptrs_end = ((void *) new_whiteouts + bytes); -+ -+ for (k = unwritten_whiteouts_start(c, b); -+ k != unwritten_whiteouts_end(c, b); -+ k = bkey_next(k)) -+ *--ptrs = k; -+ -+ sort_bkey_ptrs(b, ptrs, ptrs_end - ptrs); -+ -+ k = new_whiteouts; -+ -+ while (ptrs != ptrs_end) { -+ bkey_copy(k, *ptrs); -+ k = bkey_next(k); -+ ptrs++; -+ } -+ -+ verify_no_dups(b, new_whiteouts, -+ (void *) ((u64 *) new_whiteouts + b->whiteout_u64s), -+ btree_node_old_extent_overwrite(b)); -+ -+ memcpy_u64s(unwritten_whiteouts_start(c, b), -+ new_whiteouts, b->whiteout_u64s); -+ -+ btree_bounce_free(c, bytes, used_mempool, new_whiteouts); -+} -+ -+static bool should_compact_bset(struct btree *b, struct bset_tree *t, -+ bool compacting, enum compact_mode mode) -+{ -+ if (!bset_dead_u64s(b, t)) -+ return false; -+ -+ switch (mode) { -+ case COMPACT_LAZY: -+ return should_compact_bset_lazy(b, t) || -+ (compacting && !bset_written(b, bset(b, t))); -+ case COMPACT_ALL: -+ return true; -+ default: -+ BUG(); -+ } -+} -+ -+static bool bch2_compact_extent_whiteouts(struct bch_fs *c, -+ struct btree *b, -+ enum compact_mode mode) -+{ -+ const struct bkey_format *f = &b->format; -+ struct bset_tree *t; -+ struct bkey_packed *whiteouts = NULL; -+ struct bkey_packed *u_start, *u_pos; -+ struct sort_iter sort_iter; -+ unsigned bytes, whiteout_u64s = 0, u64s; -+ bool used_mempool, compacting = false; -+ -+ BUG_ON(!btree_node_is_extents(b)); -+ -+ 
for_each_bset(b, t) -+ if (should_compact_bset(b, t, whiteout_u64s != 0, mode)) -+ whiteout_u64s += bset_dead_u64s(b, t); -+ -+ if (!whiteout_u64s) -+ return false; -+ -+ bch2_sort_whiteouts(c, b); -+ -+ sort_iter_init(&sort_iter, b); -+ -+ whiteout_u64s += b->whiteout_u64s; -+ bytes = whiteout_u64s * sizeof(u64); -+ -+ whiteouts = btree_bounce_alloc(c, bytes, &used_mempool); -+ u_start = u_pos = whiteouts; -+ -+ memcpy_u64s(u_pos, unwritten_whiteouts_start(c, b), -+ b->whiteout_u64s); -+ u_pos = (void *) u_pos + b->whiteout_u64s * sizeof(u64); -+ -+ sort_iter_add(&sort_iter, u_start, u_pos); -+ -+ for_each_bset(b, t) { -+ struct bset *i = bset(b, t); -+ struct bkey_packed *k, *n, *out, *start, *end; -+ struct btree_node_entry *src = NULL, *dst = NULL; -+ -+ if (t != b->set && !bset_written(b, i)) { -+ src = container_of(i, struct btree_node_entry, keys); -+ dst = max(write_block(b), -+ (void *) btree_bkey_last(b, t - 1)); -+ } -+ -+ if (src != dst) -+ compacting = true; -+ -+ if (!should_compact_bset(b, t, compacting, mode)) { -+ if (src != dst) { -+ memmove(dst, src, sizeof(*src) + -+ le16_to_cpu(src->keys.u64s) * -+ sizeof(u64)); -+ i = &dst->keys; -+ set_btree_bset(b, t, i); -+ } -+ continue; -+ } -+ -+ compacting = true; -+ u_start = u_pos; -+ start = i->start; -+ end = vstruct_last(i); -+ -+ if (src != dst) { -+ memmove(dst, src, sizeof(*src)); -+ i = &dst->keys; -+ set_btree_bset(b, t, i); -+ } -+ -+ out = i->start; -+ -+ for (k = start; k != end; k = n) { -+ n = bkey_next_skip_noops(k, end); -+ -+ if (bkey_deleted(k)) -+ continue; -+ -+ BUG_ON(bkey_whiteout(k) && -+ k->needs_whiteout && -+ bkey_written(b, k)); -+ -+ if (bkey_whiteout(k) && !k->needs_whiteout) -+ continue; -+ -+ if (bkey_whiteout(k)) { -+ memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k)); -+ set_bkeyp_val_u64s(f, u_pos, 0); -+ u_pos = bkey_next(u_pos); -+ } else { -+ bkey_copy(out, k); -+ out = bkey_next(out); -+ } -+ } -+ -+ sort_iter_add(&sort_iter, u_start, u_pos); -+ -+ i->u64s = 
cpu_to_le16((u64 *) out - i->_data); -+ set_btree_bset_end(b, t); -+ bch2_bset_set_no_aux_tree(b, t); -+ } -+ -+ b->whiteout_u64s = (u64 *) u_pos - (u64 *) whiteouts; -+ -+ BUG_ON((void *) unwritten_whiteouts_start(c, b) < -+ (void *) btree_bkey_last(b, bset_tree_last(b))); -+ -+ u64s = bch2_sort_extent_whiteouts(unwritten_whiteouts_start(c, b), -+ &sort_iter); -+ -+ BUG_ON(u64s > b->whiteout_u64s); -+ BUG_ON(u_pos != whiteouts && !u64s); -+ -+ if (u64s != b->whiteout_u64s) { -+ void *src = unwritten_whiteouts_start(c, b); -+ -+ b->whiteout_u64s = u64s; -+ memmove_u64s_up(unwritten_whiteouts_start(c, b), src, u64s); -+ } -+ -+ verify_no_dups(b, -+ unwritten_whiteouts_start(c, b), -+ unwritten_whiteouts_end(c, b), -+ true); -+ -+ btree_bounce_free(c, bytes, used_mempool, whiteouts); -+ -+ bch2_btree_build_aux_trees(b); -+ -+ bch_btree_keys_u64s_remaining(c, b); -+ bch2_verify_btree_nr_keys(b); -+ -+ return true; -+} -+ -+static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) -+{ -+ struct bset_tree *t; -+ bool ret = false; -+ -+ for_each_bset(b, t) { -+ struct bset *i = bset(b, t); -+ struct bkey_packed *k, *n, *out, *start, *end; -+ struct btree_node_entry *src = NULL, *dst = NULL; -+ -+ if (t != b->set && !bset_written(b, i)) { -+ src = container_of(i, struct btree_node_entry, keys); -+ dst = max(write_block(b), -+ (void *) btree_bkey_last(b, t - 1)); -+ } -+ -+ if (src != dst) -+ ret = true; -+ -+ if (!should_compact_bset(b, t, ret, mode)) { -+ if (src != dst) { -+ memmove(dst, src, sizeof(*src) + -+ le16_to_cpu(src->keys.u64s) * -+ sizeof(u64)); -+ i = &dst->keys; -+ set_btree_bset(b, t, i); -+ } -+ continue; -+ } -+ -+ start = btree_bkey_first(b, t); -+ end = btree_bkey_last(b, t); -+ -+ if (src != dst) { -+ memmove(dst, src, sizeof(*src)); -+ i = &dst->keys; -+ set_btree_bset(b, t, i); -+ } -+ -+ out = i->start; -+ -+ for (k = start; k != end; k = n) { -+ n = bkey_next_skip_noops(k, end); -+ -+ if (!bkey_whiteout(k)) { -+ bkey_copy(out, k); 
-+ out = bkey_next(out); -+ } else { -+ BUG_ON(k->needs_whiteout); -+ } -+ } -+ -+ i->u64s = cpu_to_le16((u64 *) out - i->_data); -+ set_btree_bset_end(b, t); -+ bch2_bset_set_no_aux_tree(b, t); -+ ret = true; -+ } -+ -+ bch2_verify_btree_nr_keys(b); -+ -+ bch2_btree_build_aux_trees(b); -+ -+ return ret; -+} -+ -+bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, -+ enum compact_mode mode) -+{ -+ return !btree_node_old_extent_overwrite(b) -+ ? bch2_drop_whiteouts(b, mode) -+ : bch2_compact_extent_whiteouts(c, b, mode); -+} -+ -+static void btree_node_sort(struct bch_fs *c, struct btree *b, -+ struct btree_iter *iter, -+ unsigned start_idx, -+ unsigned end_idx, -+ bool filter_whiteouts) -+{ -+ struct btree_node *out; -+ struct sort_iter sort_iter; -+ struct bset_tree *t; -+ struct bset *start_bset = bset(b, &b->set[start_idx]); -+ bool used_mempool = false; -+ u64 start_time, seq = 0; -+ unsigned i, u64s = 0, bytes, shift = end_idx - start_idx - 1; -+ bool sorting_entire_node = start_idx == 0 && -+ end_idx == b->nsets; -+ -+ sort_iter_init(&sort_iter, b); -+ -+ for (t = b->set + start_idx; -+ t < b->set + end_idx; -+ t++) { -+ u64s += le16_to_cpu(bset(b, t)->u64s); -+ sort_iter_add(&sort_iter, -+ btree_bkey_first(b, t), -+ btree_bkey_last(b, t)); -+ } -+ -+ bytes = sorting_entire_node -+ ? btree_bytes(c) -+ : __vstruct_bytes(struct btree_node, u64s); -+ -+ out = btree_bounce_alloc(c, bytes, &used_mempool); -+ -+ start_time = local_clock(); -+ -+ if (btree_node_old_extent_overwrite(b)) -+ filter_whiteouts = bset_written(b, start_bset); -+ -+ u64s = (btree_node_old_extent_overwrite(b) -+ ? 
bch2_sort_extents -+ : bch2_sort_keys)(out->keys.start, -+ &sort_iter, -+ filter_whiteouts); -+ -+ out->keys.u64s = cpu_to_le16(u64s); -+ -+ BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes); -+ -+ if (sorting_entire_node) -+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], -+ start_time); -+ -+ /* Make sure we preserve bset journal_seq: */ -+ for (t = b->set + start_idx; t < b->set + end_idx; t++) -+ seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq)); -+ start_bset->journal_seq = cpu_to_le64(seq); -+ -+ if (sorting_entire_node) { -+ unsigned u64s = le16_to_cpu(out->keys.u64s); -+ -+ BUG_ON(bytes != btree_bytes(c)); -+ -+ /* -+ * Our temporary buffer is the same size as the btree node's -+ * buffer, we can just swap buffers instead of doing a big -+ * memcpy() -+ */ -+ *out = *b->data; -+ out->keys.u64s = cpu_to_le16(u64s); -+ swap(out, b->data); -+ set_btree_bset(b, b->set, &b->data->keys); -+ } else { -+ start_bset->u64s = out->keys.u64s; -+ memcpy_u64s(start_bset->start, -+ out->keys.start, -+ le16_to_cpu(out->keys.u64s)); -+ } -+ -+ for (i = start_idx + 1; i < end_idx; i++) -+ b->nr.bset_u64s[start_idx] += -+ b->nr.bset_u64s[i]; -+ -+ b->nsets -= shift; -+ -+ for (i = start_idx + 1; i < b->nsets; i++) { -+ b->nr.bset_u64s[i] = b->nr.bset_u64s[i + shift]; -+ b->set[i] = b->set[i + shift]; -+ } -+ -+ for (i = b->nsets; i < MAX_BSETS; i++) -+ b->nr.bset_u64s[i] = 0; -+ -+ set_btree_bset_end(b, &b->set[start_idx]); -+ bch2_bset_set_no_aux_tree(b, &b->set[start_idx]); -+ -+ btree_bounce_free(c, bytes, used_mempool, out); -+ -+ bch2_verify_btree_nr_keys(b); -+} -+ -+void bch2_btree_sort_into(struct bch_fs *c, -+ struct btree *dst, -+ struct btree *src) -+{ -+ struct btree_nr_keys nr; -+ struct btree_node_iter src_iter; -+ u64 start_time = local_clock(); -+ -+ BUG_ON(dst->nsets != 1); -+ -+ bch2_bset_set_no_aux_tree(dst, dst->set); -+ -+ bch2_btree_node_iter_init_from_start(&src_iter, src); -+ -+ if (btree_node_is_extents(src)) -+ nr = 
bch2_sort_repack_merge(c, btree_bset_first(dst), -+ src, &src_iter, -+ &dst->format, -+ true); -+ else -+ nr = bch2_sort_repack(btree_bset_first(dst), -+ src, &src_iter, -+ &dst->format, -+ true); -+ -+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], -+ start_time); -+ -+ set_btree_bset_end(dst, dst->set); -+ -+ dst->nr.live_u64s += nr.live_u64s; -+ dst->nr.bset_u64s[0] += nr.bset_u64s[0]; -+ dst->nr.packed_keys += nr.packed_keys; -+ dst->nr.unpacked_keys += nr.unpacked_keys; -+ -+ bch2_verify_btree_nr_keys(dst); -+} -+ -+#define SORT_CRIT (4096 / sizeof(u64)) -+ -+/* -+ * We're about to add another bset to the btree node, so if there's currently -+ * too many bsets - sort some of them together: -+ */ -+static bool btree_node_compact(struct bch_fs *c, struct btree *b, -+ struct btree_iter *iter) -+{ -+ unsigned unwritten_idx; -+ bool ret = false; -+ -+ for (unwritten_idx = 0; -+ unwritten_idx < b->nsets; -+ unwritten_idx++) -+ if (!bset_written(b, bset(b, &b->set[unwritten_idx]))) -+ break; -+ -+ if (b->nsets - unwritten_idx > 1) { -+ btree_node_sort(c, b, iter, unwritten_idx, -+ b->nsets, false); -+ ret = true; -+ } -+ -+ if (unwritten_idx > 1) { -+ btree_node_sort(c, b, iter, 0, unwritten_idx, false); -+ ret = true; -+ } -+ -+ return ret; -+} -+ -+void bch2_btree_build_aux_trees(struct btree *b) -+{ -+ struct bset_tree *t; -+ -+ for_each_bset(b, t) -+ bch2_bset_build_aux_tree(b, t, -+ !bset_written(b, bset(b, t)) && -+ t == bset_tree_last(b)); -+} -+ -+/* -+ * @bch_btree_init_next - initialize a new (unwritten) bset that can then be -+ * inserted into -+ * -+ * Safe to call if there already is an unwritten bset - will only add a new bset -+ * if @b doesn't already have one. -+ * -+ * Returns true if we sorted (i.e. 
invalidated iterators -+ */ -+void bch2_btree_init_next(struct bch_fs *c, struct btree *b, -+ struct btree_iter *iter) -+{ -+ struct btree_node_entry *bne; -+ bool did_sort; -+ -+ EBUG_ON(!(b->c.lock.state.seq & 1)); -+ EBUG_ON(iter && iter->l[b->c.level].b != b); -+ -+ did_sort = btree_node_compact(c, b, iter); -+ -+ bne = want_new_bset(c, b); -+ if (bne) -+ bch2_bset_init_next(c, b, bne); -+ -+ bch2_btree_build_aux_trees(b); -+ -+ if (iter && did_sort) -+ bch2_btree_iter_reinit_node(iter, b); -+} -+ -+static void btree_err_msg(struct printbuf *out, struct bch_fs *c, -+ struct btree *b, struct bset *i, -+ unsigned offset, int write) -+{ -+ pr_buf(out, "error validating btree node %sat btree %u level %u/%u\n" -+ "pos ", -+ write ? "before write " : "", -+ b->c.btree_id, b->c.level, -+ c->btree_roots[b->c.btree_id].level); -+ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); -+ -+ pr_buf(out, " node offset %u", b->written); -+ if (i) -+ pr_buf(out, " bset u64s %u", le16_to_cpu(i->u64s)); -+} -+ -+enum btree_err_type { -+ BTREE_ERR_FIXABLE, -+ BTREE_ERR_WANT_RETRY, -+ BTREE_ERR_MUST_RETRY, -+ BTREE_ERR_FATAL, -+}; -+ -+enum btree_validate_ret { -+ BTREE_RETRY_READ = 64, -+}; -+ -+#define btree_err(type, c, b, i, msg, ...) 
\ -+({ \ -+ __label__ out; \ -+ char _buf[300]; \ -+ struct printbuf out = PBUF(_buf); \ -+ \ -+ btree_err_msg(&out, c, b, i, b->written, write); \ -+ pr_buf(&out, ": " msg, ##__VA_ARGS__); \ -+ \ -+ if (type == BTREE_ERR_FIXABLE && \ -+ write == READ && \ -+ !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \ -+ mustfix_fsck_err(c, "%s", _buf); \ -+ goto out; \ -+ } \ -+ \ -+ switch (write) { \ -+ case READ: \ -+ bch_err(c, "%s", _buf); \ -+ \ -+ switch (type) { \ -+ case BTREE_ERR_FIXABLE: \ -+ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ -+ goto fsck_err; \ -+ case BTREE_ERR_WANT_RETRY: \ -+ if (have_retry) { \ -+ ret = BTREE_RETRY_READ; \ -+ goto fsck_err; \ -+ } \ -+ break; \ -+ case BTREE_ERR_MUST_RETRY: \ -+ ret = BTREE_RETRY_READ; \ -+ goto fsck_err; \ -+ case BTREE_ERR_FATAL: \ -+ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ -+ goto fsck_err; \ -+ } \ -+ break; \ -+ case WRITE: \ -+ bch_err(c, "corrupt metadata before write: %s", _buf); \ -+ \ -+ if (bch2_fs_inconsistent(c)) { \ -+ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ -+ goto fsck_err; \ -+ } \ -+ break; \ -+ } \ -+out: \ -+ true; \ -+}) -+ -+#define btree_err_on(cond, ...) ((cond) ? 
btree_err(__VA_ARGS__) : false) -+ -+static int validate_bset(struct bch_fs *c, struct btree *b, -+ struct bset *i, unsigned sectors, -+ int write, bool have_retry) -+{ -+ unsigned version = le16_to_cpu(i->version); -+ const char *err; -+ int ret = 0; -+ -+ btree_err_on((version != BCH_BSET_VERSION_OLD && -+ version < bcachefs_metadata_version_min) || -+ version >= bcachefs_metadata_version_max, -+ BTREE_ERR_FATAL, c, b, i, -+ "unsupported bset version"); -+ -+ if (btree_err_on(b->written + sectors > c->opts.btree_node_size, -+ BTREE_ERR_FIXABLE, c, b, i, -+ "bset past end of btree node")) { -+ i->u64s = 0; -+ return 0; -+ } -+ -+ btree_err_on(b->written && !i->u64s, -+ BTREE_ERR_FIXABLE, c, b, i, -+ "empty bset"); -+ -+ if (!b->written) { -+ struct btree_node *bn = -+ container_of(i, struct btree_node, keys); -+ /* These indicate that we read the wrong btree node: */ -+ -+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { -+ struct bch_btree_ptr_v2 *bp = -+ &bkey_i_to_btree_ptr_v2(&b->key)->v; -+ -+ /* XXX endianness */ -+ btree_err_on(bp->seq != bn->keys.seq, -+ BTREE_ERR_MUST_RETRY, c, b, NULL, -+ "incorrect sequence number (wrong btree node)"); -+ } -+ -+ btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id, -+ BTREE_ERR_MUST_RETRY, c, b, i, -+ "incorrect btree id"); -+ -+ btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level, -+ BTREE_ERR_MUST_RETRY, c, b, i, -+ "incorrect level"); -+ -+ if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) { -+ u64 *p = (u64 *) &bn->ptr; -+ -+ *p = swab64(*p); -+ } -+ -+ if (!write) -+ compat_btree_node(b->c.level, b->c.btree_id, version, -+ BSET_BIG_ENDIAN(i), write, bn); -+ -+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { -+ struct bch_btree_ptr_v2 *bp = -+ &bkey_i_to_btree_ptr_v2(&b->key)->v; -+ -+ btree_err_on(bkey_cmp(b->data->min_key, bp->min_key), -+ BTREE_ERR_MUST_RETRY, c, b, NULL, -+ "incorrect min_key: got %llu:%llu should be %llu:%llu", -+ b->data->min_key.inode, -+ b->data->min_key.offset, -+ bp->min_key.inode, -+ bp->min_key.offset); -+ } 
-+ -+ btree_err_on(bkey_cmp(bn->max_key, b->key.k.p), -+ BTREE_ERR_MUST_RETRY, c, b, i, -+ "incorrect max key"); -+ -+ if (write) -+ compat_btree_node(b->c.level, b->c.btree_id, version, -+ BSET_BIG_ENDIAN(i), write, bn); -+ -+ /* XXX: ideally we would be validating min_key too */ -+#if 0 -+ /* -+ * not correct anymore, due to btree node write error -+ * handling -+ * -+ * need to add bn->seq to btree keys and verify -+ * against that -+ */ -+ btree_err_on(!extent_contains_ptr(bkey_i_to_s_c_extent(&b->key), -+ bn->ptr), -+ BTREE_ERR_FATAL, c, b, i, -+ "incorrect backpointer"); -+#endif -+ err = bch2_bkey_format_validate(&bn->format); -+ btree_err_on(err, -+ BTREE_ERR_FATAL, c, b, i, -+ "invalid bkey format: %s", err); -+ -+ compat_bformat(b->c.level, b->c.btree_id, version, -+ BSET_BIG_ENDIAN(i), write, -+ &bn->format); -+ } -+fsck_err: -+ return ret; -+} -+ -+static int validate_bset_keys(struct bch_fs *c, struct btree *b, -+ struct bset *i, unsigned *whiteout_u64s, -+ int write, bool have_retry) -+{ -+ unsigned version = le16_to_cpu(i->version); -+ struct bkey_packed *k, *prev = NULL; -+ bool seen_non_whiteout = false; -+ int ret = 0; -+ -+ if (!BSET_SEPARATE_WHITEOUTS(i)) { -+ seen_non_whiteout = true; -+ *whiteout_u64s = 0; -+ } -+ -+ for (k = i->start; -+ k != vstruct_last(i);) { -+ struct bkey_s u; -+ struct bkey tmp; -+ const char *invalid; -+ -+ if (btree_err_on(bkey_next(k) > vstruct_last(i), -+ BTREE_ERR_FIXABLE, c, b, i, -+ "key extends past end of bset")) { -+ i->u64s = cpu_to_le16((u64 *) k - i->_data); -+ break; -+ } -+ -+ if (btree_err_on(k->format > KEY_FORMAT_CURRENT, -+ BTREE_ERR_FIXABLE, c, b, i, -+ "invalid bkey format %u", k->format)) { -+ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); -+ memmove_u64s_down(k, bkey_next(k), -+ (u64 *) vstruct_end(i) - (u64 *) k); -+ continue; -+ } -+ -+ /* XXX: validate k->u64s */ -+ if (!write) -+ bch2_bkey_compat(b->c.level, b->c.btree_id, version, -+ BSET_BIG_ENDIAN(i), write, -+ &b->format, k); -+ -+ 
u = __bkey_disassemble(b, k, &tmp); -+ -+ invalid = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b)) ?: -+ bch2_bkey_in_btree_node(b, u.s_c) ?: -+ (write ? bch2_bkey_val_invalid(c, u.s_c) : NULL); -+ if (invalid) { -+ char buf[160]; -+ -+ bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); -+ btree_err(BTREE_ERR_FIXABLE, c, b, i, -+ "invalid bkey:\n%s\n%s", invalid, buf); -+ -+ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); -+ memmove_u64s_down(k, bkey_next(k), -+ (u64 *) vstruct_end(i) - (u64 *) k); -+ continue; -+ } -+ -+ if (write) -+ bch2_bkey_compat(b->c.level, b->c.btree_id, version, -+ BSET_BIG_ENDIAN(i), write, -+ &b->format, k); -+ -+ /* -+ * with the separate whiteouts thing (used for extents), the -+ * second set of keys actually can have whiteouts too, so we -+ * can't solely go off bkey_whiteout()... -+ */ -+ -+ if (!seen_non_whiteout && -+ (!bkey_whiteout(k) || -+ (prev && bkey_iter_cmp(b, prev, k) > 0))) { -+ *whiteout_u64s = k->_data - i->_data; -+ seen_non_whiteout = true; -+ } else if (prev && bkey_iter_cmp(b, prev, k) > 0) { -+ char buf1[80]; -+ char buf2[80]; -+ struct bkey up = bkey_unpack_key(b, prev); -+ -+ bch2_bkey_to_text(&PBUF(buf1), &up); -+ bch2_bkey_to_text(&PBUF(buf2), u.k); -+ -+ bch2_dump_bset(c, b, i, 0); -+ btree_err(BTREE_ERR_FATAL, c, b, i, -+ "keys out of order: %s > %s", -+ buf1, buf2); -+ /* XXX: repair this */ -+ } -+ -+ prev = k; -+ k = bkey_next_skip_noops(k, vstruct_last(i)); -+ } -+fsck_err: -+ return ret; -+} -+ -+int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry) -+{ -+ struct btree_node_entry *bne; -+ struct sort_iter *iter; -+ struct btree_node *sorted; -+ struct bkey_packed *k; -+ struct bch_extent_ptr *ptr; -+ struct bset *i; -+ bool used_mempool, blacklisted; -+ unsigned u64s; -+ int ret, retry_read = 0, write = READ; -+ -+ iter = mempool_alloc(&c->fill_iter, GFP_NOIO); -+ sort_iter_init(iter, b); -+ iter->size = (btree_blocks(c) + 1) * 2; -+ -+ if (bch2_meta_read_fault("btree")) 
-+ btree_err(BTREE_ERR_MUST_RETRY, c, b, NULL, -+ "dynamic fault"); -+ -+ btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c), -+ BTREE_ERR_MUST_RETRY, c, b, NULL, -+ "bad magic"); -+ -+ btree_err_on(!b->data->keys.seq, -+ BTREE_ERR_MUST_RETRY, c, b, NULL, -+ "bad btree header"); -+ -+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { -+ struct bch_btree_ptr_v2 *bp = -+ &bkey_i_to_btree_ptr_v2(&b->key)->v; -+ -+ btree_err_on(b->data->keys.seq != bp->seq, -+ BTREE_ERR_MUST_RETRY, c, b, NULL, -+ "got wrong btree node (seq %llx want %llx)", -+ b->data->keys.seq, bp->seq); -+ } -+ -+ while (b->written < c->opts.btree_node_size) { -+ unsigned sectors, whiteout_u64s = 0; -+ struct nonce nonce; -+ struct bch_csum csum; -+ bool first = !b->written; -+ -+ if (!b->written) { -+ i = &b->data->keys; -+ -+ btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), -+ BTREE_ERR_WANT_RETRY, c, b, i, -+ "unknown checksum type"); -+ -+ nonce = btree_nonce(i, b->written << 9); -+ csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); -+ -+ btree_err_on(bch2_crc_cmp(csum, b->data->csum), -+ BTREE_ERR_WANT_RETRY, c, b, i, -+ "invalid checksum"); -+ -+ bset_encrypt(c, i, b->written << 9); -+ -+ if (btree_node_is_extents(b) && -+ !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) { -+ set_btree_node_old_extent_overwrite(b); -+ set_btree_node_need_rewrite(b); -+ } -+ -+ sectors = vstruct_sectors(b->data, c->block_bits); -+ } else { -+ bne = write_block(b); -+ i = &bne->keys; -+ -+ if (i->seq != b->data->keys.seq) -+ break; -+ -+ btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), -+ BTREE_ERR_WANT_RETRY, c, b, i, -+ "unknown checksum type"); -+ -+ nonce = btree_nonce(i, b->written << 9); -+ csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); -+ -+ btree_err_on(bch2_crc_cmp(csum, bne->csum), -+ BTREE_ERR_WANT_RETRY, c, b, i, -+ "invalid checksum"); -+ -+ bset_encrypt(c, i, b->written << 9); -+ -+ sectors = vstruct_sectors(bne, c->block_bits); -+ } -+ -+ ret = 
validate_bset(c, b, i, sectors, -+ READ, have_retry); -+ if (ret) -+ goto fsck_err; -+ -+ if (!b->written) -+ btree_node_set_format(b, b->data->format); -+ -+ ret = validate_bset_keys(c, b, i, &whiteout_u64s, -+ READ, have_retry); -+ if (ret) -+ goto fsck_err; -+ -+ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); -+ -+ b->written += sectors; -+ -+ blacklisted = bch2_journal_seq_is_blacklisted(c, -+ le64_to_cpu(i->journal_seq), -+ true); -+ -+ btree_err_on(blacklisted && first, -+ BTREE_ERR_FIXABLE, c, b, i, -+ "first btree node bset has blacklisted journal seq"); -+ if (blacklisted && !first) -+ continue; -+ -+ sort_iter_add(iter, i->start, -+ vstruct_idx(i, whiteout_u64s)); -+ -+ sort_iter_add(iter, -+ vstruct_idx(i, whiteout_u64s), -+ vstruct_last(i)); -+ } -+ -+ for (bne = write_block(b); -+ bset_byte_offset(b, bne) < btree_bytes(c); -+ bne = (void *) bne + block_bytes(c)) -+ btree_err_on(bne->keys.seq == b->data->keys.seq, -+ BTREE_ERR_WANT_RETRY, c, b, NULL, -+ "found bset signature after last bset"); -+ -+ sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool); -+ sorted->keys.u64s = 0; -+ -+ set_btree_bset(b, b->set, &b->data->keys); -+ -+ b->nr = (btree_node_old_extent_overwrite(b) -+ ? 
bch2_extent_sort_fix_overlapping -+ : bch2_key_sort_fix_overlapping)(c, &sorted->keys, iter); -+ -+ u64s = le16_to_cpu(sorted->keys.u64s); -+ *sorted = *b->data; -+ sorted->keys.u64s = cpu_to_le16(u64s); -+ swap(sorted, b->data); -+ set_btree_bset(b, b->set, &b->data->keys); -+ b->nsets = 1; -+ -+ BUG_ON(b->nr.live_u64s != u64s); -+ -+ btree_bounce_free(c, btree_bytes(c), used_mempool, sorted); -+ -+ i = &b->data->keys; -+ for (k = i->start; k != vstruct_last(i);) { -+ struct bkey tmp; -+ struct bkey_s u = __bkey_disassemble(b, k, &tmp); -+ const char *invalid = bch2_bkey_val_invalid(c, u.s_c); -+ -+ if (invalid || -+ (inject_invalid_keys(c) && -+ !bversion_cmp(u.k->version, MAX_VERSION))) { -+ char buf[160]; -+ -+ bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); -+ btree_err(BTREE_ERR_FIXABLE, c, b, i, -+ "invalid bkey %s: %s", buf, invalid); -+ -+ btree_keys_account_key_drop(&b->nr, 0, k); -+ -+ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); -+ memmove_u64s_down(k, bkey_next(k), -+ (u64 *) vstruct_end(i) - (u64 *) k); -+ set_btree_bset_end(b, b->set); -+ continue; -+ } -+ -+ if (u.k->type == KEY_TYPE_btree_ptr_v2) { -+ struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u); -+ -+ bp.v->mem_ptr = 0; -+ } -+ -+ k = bkey_next_skip_noops(k, vstruct_last(i)); -+ } -+ -+ bch2_bset_build_aux_tree(b, b->set, false); -+ -+ set_needs_whiteout(btree_bset_first(b), true); -+ -+ btree_node_reset_sib_u64s(b); -+ -+ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ -+ if (ca->mi.state != BCH_MEMBER_STATE_RW) -+ set_btree_node_need_rewrite(b); -+ } -+out: -+ mempool_free(iter, &c->fill_iter); -+ return retry_read; -+fsck_err: -+ if (ret == BTREE_RETRY_READ) { -+ retry_read = 1; -+ } else { -+ bch2_inconsistent_error(c); -+ set_btree_node_read_error(b); -+ } -+ goto out; -+} -+ -+static void btree_node_read_work(struct work_struct *work) -+{ -+ struct btree_read_bio *rb = -+ container_of(work, struct 
btree_read_bio, work); -+ struct bch_fs *c = rb->c; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); -+ struct btree *b = rb->bio.bi_private; -+ struct bio *bio = &rb->bio; -+ struct bch_io_failures failed = { .nr = 0 }; -+ bool can_retry; -+ -+ goto start; -+ while (1) { -+ bch_info(c, "retrying read"); -+ ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); -+ rb->have_ioref = bch2_dev_get_ioref(ca, READ); -+ bio_reset(bio); -+ bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META; -+ bio->bi_iter.bi_sector = rb->pick.ptr.offset; -+ bio->bi_iter.bi_size = btree_bytes(c); -+ -+ if (rb->have_ioref) { -+ bio_set_dev(bio, ca->disk_sb.bdev); -+ submit_bio_wait(bio); -+ } else { -+ bio->bi_status = BLK_STS_REMOVED; -+ } -+start: -+ bch2_dev_io_err_on(bio->bi_status, ca, "btree read: %s", -+ bch2_blk_status_to_str(bio->bi_status)); -+ if (rb->have_ioref) -+ percpu_ref_put(&ca->io_ref); -+ rb->have_ioref = false; -+ -+ bch2_mark_io_failure(&failed, &rb->pick); -+ -+ can_retry = bch2_bkey_pick_read_device(c, -+ bkey_i_to_s_c(&b->key), -+ &failed, &rb->pick) > 0; -+ -+ if (!bio->bi_status && -+ !bch2_btree_node_read_done(c, b, can_retry)) -+ break; -+ -+ if (!can_retry) { -+ set_btree_node_read_error(b); -+ break; -+ } -+ } -+ -+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], -+ rb->start_time); -+ bio_put(&rb->bio); -+ clear_btree_node_read_in_flight(b); -+ wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); -+} -+ -+static void btree_node_read_endio(struct bio *bio) -+{ -+ struct btree_read_bio *rb = -+ container_of(bio, struct btree_read_bio, bio); -+ struct bch_fs *c = rb->c; -+ -+ if (rb->have_ioref) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); -+ bch2_latency_acct(ca, rb->start_time, READ); -+ } -+ -+ queue_work(system_unbound_wq, &rb->work); -+} -+ -+void bch2_btree_node_read(struct bch_fs *c, struct btree *b, -+ bool sync) -+{ -+ struct extent_ptr_decoded pick; -+ struct btree_read_bio *rb; -+ struct bch_dev *ca; -+ struct bio 
*bio; -+ int ret; -+ -+ trace_btree_read(c, b); -+ -+ ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), -+ NULL, &pick); -+ if (bch2_fs_fatal_err_on(ret <= 0, c, -+ "btree node read error: no device to read from")) { -+ set_btree_node_read_error(b); -+ return; -+ } -+ -+ ca = bch_dev_bkey_exists(c, pick.ptr.dev); -+ -+ bio = bio_alloc_bioset(GFP_NOIO, buf_pages(b->data, -+ btree_bytes(c)), -+ &c->btree_bio); -+ rb = container_of(bio, struct btree_read_bio, bio); -+ rb->c = c; -+ rb->start_time = local_clock(); -+ rb->have_ioref = bch2_dev_get_ioref(ca, READ); -+ rb->pick = pick; -+ INIT_WORK(&rb->work, btree_node_read_work); -+ bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META; -+ bio->bi_iter.bi_sector = pick.ptr.offset; -+ bio->bi_end_io = btree_node_read_endio; -+ bio->bi_private = b; -+ bch2_bio_map(bio, b->data, btree_bytes(c)); -+ -+ set_btree_node_read_in_flight(b); -+ -+ if (rb->have_ioref) { -+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], -+ bio_sectors(bio)); -+ bio_set_dev(bio, ca->disk_sb.bdev); -+ -+ if (sync) { -+ submit_bio_wait(bio); -+ -+ bio->bi_private = b; -+ btree_node_read_work(&rb->work); -+ } else { -+ submit_bio(bio); -+ } -+ } else { -+ bio->bi_status = BLK_STS_REMOVED; -+ -+ if (sync) -+ btree_node_read_work(&rb->work); -+ else -+ queue_work(system_unbound_wq, &rb->work); -+ -+ } -+} -+ -+int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, -+ const struct bkey_i *k, unsigned level) -+{ -+ struct closure cl; -+ struct btree *b; -+ int ret; -+ -+ closure_init_stack(&cl); -+ -+ do { -+ ret = bch2_btree_cache_cannibalize_lock(c, &cl); -+ closure_sync(&cl); -+ } while (ret); -+ -+ b = bch2_btree_node_mem_alloc(c); -+ bch2_btree_cache_cannibalize_unlock(c); -+ -+ BUG_ON(IS_ERR(b)); -+ -+ bkey_copy(&b->key, k); -+ BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id)); -+ -+ bch2_btree_node_read(c, b, true); -+ -+ if (btree_node_read_error(b)) { -+ bch2_btree_node_hash_remove(&c->btree_cache, b); -+ -+ 
mutex_lock(&c->btree_cache.lock); -+ list_move(&b->list, &c->btree_cache.freeable); -+ mutex_unlock(&c->btree_cache.lock); -+ -+ ret = -EIO; -+ goto err; -+ } -+ -+ bch2_btree_set_root_for_read(c, b); -+err: -+ six_unlock_write(&b->c.lock); -+ six_unlock_intent(&b->c.lock); -+ -+ return ret; -+} -+ -+void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, -+ struct btree_write *w) -+{ -+ unsigned long old, new, v = READ_ONCE(b->will_make_reachable); -+ -+ do { -+ old = new = v; -+ if (!(old & 1)) -+ break; -+ -+ new &= ~1UL; -+ } while ((v = cmpxchg(&b->will_make_reachable, old, new)) != old); -+ -+ if (old & 1) -+ closure_put(&((struct btree_update *) new)->cl); -+ -+ bch2_journal_pin_drop(&c->journal, &w->journal); -+} -+ -+static void btree_node_write_done(struct bch_fs *c, struct btree *b) -+{ -+ struct btree_write *w = btree_prev_write(b); -+ -+ bch2_btree_complete_write(c, b, w); -+ btree_node_io_unlock(b); -+} -+ -+static void bch2_btree_node_write_error(struct bch_fs *c, -+ struct btree_write_bio *wbio) -+{ -+ struct btree *b = wbio->wbio.bio.bi_private; -+ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; -+ struct bch_extent_ptr *ptr; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_trans_get_node_iter(&trans, b->c.btree_id, b->key.k.p, -+ BTREE_MAX_DEPTH, b->c.level, 0); -+retry: -+ ret = bch2_btree_iter_traverse(iter); -+ if (ret) -+ goto err; -+ -+ /* has node been freed? 
*/ -+ if (iter->l[b->c.level].b != b) { -+ /* node has been freed: */ -+ BUG_ON(!btree_node_dying(b)); -+ goto out; -+ } -+ -+ BUG_ON(!btree_node_hashed(b)); -+ -+ bkey_copy(&tmp.k, &b->key); -+ -+ bch2_bkey_drop_ptrs(bkey_i_to_s(&tmp.k), ptr, -+ bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); -+ -+ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&tmp.k))) -+ goto err; -+ -+ ret = bch2_btree_node_update_key(c, iter, b, &tmp.k); -+ if (ret == -EINTR) -+ goto retry; -+ if (ret) -+ goto err; -+out: -+ bch2_trans_exit(&trans); -+ bio_put(&wbio->wbio.bio); -+ btree_node_write_done(c, b); -+ return; -+err: -+ set_btree_node_noevict(b); -+ bch2_fs_fatal_error(c, "fatal error writing btree node"); -+ goto out; -+} -+ -+void bch2_btree_write_error_work(struct work_struct *work) -+{ -+ struct bch_fs *c = container_of(work, struct bch_fs, -+ btree_write_error_work); -+ struct bio *bio; -+ -+ while (1) { -+ spin_lock_irq(&c->btree_write_error_lock); -+ bio = bio_list_pop(&c->btree_write_error_list); -+ spin_unlock_irq(&c->btree_write_error_lock); -+ -+ if (!bio) -+ break; -+ -+ bch2_btree_node_write_error(c, -+ container_of(bio, struct btree_write_bio, wbio.bio)); -+ } -+} -+ -+static void btree_node_write_work(struct work_struct *work) -+{ -+ struct btree_write_bio *wbio = -+ container_of(work, struct btree_write_bio, work); -+ struct bch_fs *c = wbio->wbio.c; -+ struct btree *b = wbio->wbio.bio.bi_private; -+ -+ btree_bounce_free(c, -+ wbio->bytes, -+ wbio->wbio.used_mempool, -+ wbio->data); -+ -+ if (wbio->wbio.failed.nr) { -+ unsigned long flags; -+ -+ spin_lock_irqsave(&c->btree_write_error_lock, flags); -+ bio_list_add(&c->btree_write_error_list, &wbio->wbio.bio); -+ spin_unlock_irqrestore(&c->btree_write_error_lock, flags); -+ -+ queue_work(c->wq, &c->btree_write_error_work); -+ return; -+ } -+ -+ bio_put(&wbio->wbio.bio); -+ btree_node_write_done(c, b); -+} -+ -+static void btree_node_write_endio(struct bio *bio) -+{ -+ struct bch_write_bio *wbio = to_wbio(bio); -+ struct 
bch_write_bio *parent = wbio->split ? wbio->parent : NULL; -+ struct bch_write_bio *orig = parent ?: wbio; -+ struct bch_fs *c = wbio->c; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); -+ unsigned long flags; -+ -+ if (wbio->have_ioref) -+ bch2_latency_acct(ca, wbio->submit_time, WRITE); -+ -+ if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write: %s", -+ bch2_blk_status_to_str(bio->bi_status)) || -+ bch2_meta_write_fault("btree")) { -+ spin_lock_irqsave(&c->btree_write_error_lock, flags); -+ bch2_dev_list_add_dev(&orig->failed, wbio->dev); -+ spin_unlock_irqrestore(&c->btree_write_error_lock, flags); -+ } -+ -+ if (wbio->have_ioref) -+ percpu_ref_put(&ca->io_ref); -+ -+ if (parent) { -+ bio_put(bio); -+ bio_endio(&parent->bio); -+ } else { -+ struct btree_write_bio *wb = -+ container_of(orig, struct btree_write_bio, wbio); -+ -+ INIT_WORK(&wb->work, btree_node_write_work); -+ queue_work(system_unbound_wq, &wb->work); -+ } -+} -+ -+static int validate_bset_for_write(struct bch_fs *c, struct btree *b, -+ struct bset *i, unsigned sectors) -+{ -+ unsigned whiteout_u64s = 0; -+ int ret; -+ -+ if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_BTREE)) -+ return -1; -+ -+ ret = validate_bset(c, b, i, sectors, WRITE, false) ?: -+ validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false); -+ if (ret) -+ bch2_inconsistent_error(c); -+ -+ return ret; -+} -+ -+void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, -+ enum six_lock_type lock_type_held) -+{ -+ struct btree_write_bio *wbio; -+ struct bset_tree *t; -+ struct bset *i; -+ struct btree_node *bn = NULL; -+ struct btree_node_entry *bne = NULL; -+ BKEY_PADDED(key) k; -+ struct bch_extent_ptr *ptr; -+ struct sort_iter sort_iter; -+ struct nonce nonce; -+ unsigned bytes_to_write, sectors_to_write, bytes, u64s; -+ u64 seq = 0; -+ bool used_mempool; -+ unsigned long old, new; -+ bool validate_before_checksum = false; -+ void *data; -+ -+ if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) -+ 
return; -+ -+ /* -+ * We may only have a read lock on the btree node - the dirty bit is our -+ * "lock" against racing with other threads that may be trying to start -+ * a write, we do a write iff we clear the dirty bit. Since setting the -+ * dirty bit requires a write lock, we can't race with other threads -+ * redirtying it: -+ */ -+ do { -+ old = new = READ_ONCE(b->flags); -+ -+ if (!(old & (1 << BTREE_NODE_dirty))) -+ return; -+ -+ if (!btree_node_may_write(b)) -+ return; -+ -+ if (old & (1 << BTREE_NODE_write_in_flight)) { -+ btree_node_wait_on_io(b); -+ continue; -+ } -+ -+ new &= ~(1 << BTREE_NODE_dirty); -+ new &= ~(1 << BTREE_NODE_need_write); -+ new |= (1 << BTREE_NODE_write_in_flight); -+ new |= (1 << BTREE_NODE_just_written); -+ new ^= (1 << BTREE_NODE_write_idx); -+ } while (cmpxchg_acquire(&b->flags, old, new) != old); -+ -+ BUG_ON(btree_node_fake(b)); -+ BUG_ON((b->will_make_reachable != 0) != !b->written); -+ -+ BUG_ON(b->written >= c->opts.btree_node_size); -+ BUG_ON(b->written & (c->opts.block_size - 1)); -+ BUG_ON(bset_written(b, btree_bset_last(b))); -+ BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c)); -+ BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format))); -+ -+ bch2_sort_whiteouts(c, b); -+ -+ sort_iter_init(&sort_iter, b); -+ -+ bytes = !b->written -+ ? 
sizeof(struct btree_node) -+ : sizeof(struct btree_node_entry); -+ -+ bytes += b->whiteout_u64s * sizeof(u64); -+ -+ for_each_bset(b, t) { -+ i = bset(b, t); -+ -+ if (bset_written(b, i)) -+ continue; -+ -+ bytes += le16_to_cpu(i->u64s) * sizeof(u64); -+ sort_iter_add(&sort_iter, -+ btree_bkey_first(b, t), -+ btree_bkey_last(b, t)); -+ seq = max(seq, le64_to_cpu(i->journal_seq)); -+ } -+ -+ data = btree_bounce_alloc(c, bytes, &used_mempool); -+ -+ if (!b->written) { -+ bn = data; -+ *bn = *b->data; -+ i = &bn->keys; -+ } else { -+ bne = data; -+ bne->keys = b->data->keys; -+ i = &bne->keys; -+ } -+ -+ i->journal_seq = cpu_to_le64(seq); -+ i->u64s = 0; -+ -+ if (!btree_node_old_extent_overwrite(b)) { -+ sort_iter_add(&sort_iter, -+ unwritten_whiteouts_start(c, b), -+ unwritten_whiteouts_end(c, b)); -+ SET_BSET_SEPARATE_WHITEOUTS(i, false); -+ } else { -+ memcpy_u64s(i->start, -+ unwritten_whiteouts_start(c, b), -+ b->whiteout_u64s); -+ i->u64s = cpu_to_le16(b->whiteout_u64s); -+ SET_BSET_SEPARATE_WHITEOUTS(i, true); -+ } -+ -+ b->whiteout_u64s = 0; -+ -+ u64s = btree_node_old_extent_overwrite(b) -+ ? bch2_sort_extents(vstruct_last(i), &sort_iter, false) -+ : bch2_sort_keys(i->start, &sort_iter, false); -+ le16_add_cpu(&i->u64s, u64s); -+ -+ set_needs_whiteout(i, false); -+ -+ /* do we have data to write? */ -+ if (b->written && !i->u64s) -+ goto nowrite; -+ -+ bytes_to_write = vstruct_end(i) - data; -+ sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9; -+ -+ memset(data + bytes_to_write, 0, -+ (sectors_to_write << 9) - bytes_to_write); -+ -+ BUG_ON(b->written + sectors_to_write > c->opts.btree_node_size); -+ BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN); -+ BUG_ON(i->seq != b->data->keys.seq); -+ -+ i->version = c->sb.version < bcachefs_metadata_version_new_versioning -+ ? 
cpu_to_le16(BCH_BSET_VERSION_OLD) -+ : cpu_to_le16(c->sb.version); -+ SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c)); -+ -+ if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i))) -+ validate_before_checksum = true; -+ -+ /* validate_bset will be modifying: */ -+ if (le16_to_cpu(i->version) < bcachefs_metadata_version_max) -+ validate_before_checksum = true; -+ -+ /* if we're going to be encrypting, check metadata validity first: */ -+ if (validate_before_checksum && -+ validate_bset_for_write(c, b, i, sectors_to_write)) -+ goto err; -+ -+ bset_encrypt(c, i, b->written << 9); -+ -+ nonce = btree_nonce(i, b->written << 9); -+ -+ if (bn) -+ bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn); -+ else -+ bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); -+ -+ /* if we're not encrypting, check metadata after checksumming: */ -+ if (!validate_before_checksum && -+ validate_bset_for_write(c, b, i, sectors_to_write)) -+ goto err; -+ -+ /* -+ * We handle btree write errors by immediately halting the journal - -+ * after we've done that, we can't issue any subsequent btree writes -+ * because they might have pointers to new nodes that failed to write. 
-+ * -+ * Furthermore, there's no point in doing any more btree writes because -+ * with the journal stopped, we're never going to update the journal to -+ * reflect that those writes were done and the data flushed from the -+ * journal: -+ * -+ * Also on journal error, the pending write may have updates that were -+ * never journalled (interior nodes, see btree_update_nodes_written()) - -+ * it's critical that we don't do the write in that case otherwise we -+ * will have updates visible that weren't in the journal: -+ * -+ * Make sure to update b->written so bch2_btree_init_next() doesn't -+ * break: -+ */ -+ if (bch2_journal_error(&c->journal) || -+ c->opts.nochanges) -+ goto err; -+ -+ trace_btree_write(b, bytes_to_write, sectors_to_write); -+ -+ wbio = container_of(bio_alloc_bioset(GFP_NOIO, -+ buf_pages(data, sectors_to_write << 9), -+ &c->btree_bio), -+ struct btree_write_bio, wbio.bio); -+ wbio_init(&wbio->wbio.bio); -+ wbio->data = data; -+ wbio->bytes = bytes; -+ wbio->wbio.used_mempool = used_mempool; -+ wbio->wbio.bio.bi_opf = REQ_OP_WRITE|REQ_META; -+ wbio->wbio.bio.bi_end_io = btree_node_write_endio; -+ wbio->wbio.bio.bi_private = b; -+ -+ bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9); -+ -+ /* -+ * If we're appending to a leaf node, we don't technically need FUA - -+ * this write just needs to be persisted before the next journal write, -+ * which will be marked FLUSH|FUA. -+ * -+ * Similarly if we're writing a new btree root - the pointer is going to -+ * be in the next journal entry. -+ * -+ * But if we're writing a new btree node (that isn't a root) or -+ * appending to a non leaf btree node, we need either FUA or a flush -+ * when we write the parent with the new pointer. FUA is cheaper than a -+ * flush, and writes appending to leaf nodes aren't blocking anything so -+ * just make all btree node writes FUA to keep things sane. 
-+ */ -+ -+ bkey_copy(&k.key, &b->key); -+ -+ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&k.key)), ptr) -+ ptr->offset += b->written; -+ -+ b->written += sectors_to_write; -+ -+ /* XXX: submitting IO with btree locks held: */ -+ bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_btree, &k.key); -+ return; -+err: -+ set_btree_node_noevict(b); -+ b->written += sectors_to_write; -+nowrite: -+ btree_bounce_free(c, bytes, used_mempool, data); -+ btree_node_write_done(c, b); -+} -+ -+/* -+ * Work that must be done with write lock held: -+ */ -+bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) -+{ -+ bool invalidated_iter = false; -+ struct btree_node_entry *bne; -+ struct bset_tree *t; -+ -+ if (!btree_node_just_written(b)) -+ return false; -+ -+ BUG_ON(b->whiteout_u64s); -+ -+ clear_btree_node_just_written(b); -+ -+ /* -+ * Note: immediately after write, bset_written() doesn't work - the -+ * amount of data we had to write after compaction might have been -+ * smaller than the offset of the last bset. 
-+ * -+ * However, we know that all bsets have been written here, as long as -+ * we're still holding the write lock: -+ */ -+ -+ /* -+ * XXX: decide if we really want to unconditionally sort down to a -+ * single bset: -+ */ -+ if (b->nsets > 1) { -+ btree_node_sort(c, b, NULL, 0, b->nsets, true); -+ invalidated_iter = true; -+ } else { -+ invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL); -+ } -+ -+ for_each_bset(b, t) -+ set_needs_whiteout(bset(b, t), true); -+ -+ bch2_btree_verify(c, b); -+ -+ /* -+ * If later we don't unconditionally sort down to a single bset, we have -+ * to ensure this is still true: -+ */ -+ BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b)); -+ -+ bne = want_new_bset(c, b); -+ if (bne) -+ bch2_bset_init_next(c, b, bne); -+ -+ bch2_btree_build_aux_trees(b); -+ -+ return invalidated_iter; -+} -+ -+/* -+ * Use this one if the node is intent locked: -+ */ -+void bch2_btree_node_write(struct bch_fs *c, struct btree *b, -+ enum six_lock_type lock_type_held) -+{ -+ BUG_ON(lock_type_held == SIX_LOCK_write); -+ -+ if (lock_type_held == SIX_LOCK_intent || -+ six_lock_tryupgrade(&b->c.lock)) { -+ __bch2_btree_node_write(c, b, SIX_LOCK_intent); -+ -+ /* don't cycle lock unnecessarily: */ -+ if (btree_node_just_written(b) && -+ six_trylock_write(&b->c.lock)) { -+ bch2_btree_post_write_cleanup(c, b); -+ six_unlock_write(&b->c.lock); -+ } -+ -+ if (lock_type_held == SIX_LOCK_read) -+ six_lock_downgrade(&b->c.lock); -+ } else { -+ __bch2_btree_node_write(c, b, SIX_LOCK_read); -+ } -+} -+ -+static void __bch2_btree_flush_all(struct bch_fs *c, unsigned flag) -+{ -+ struct bucket_table *tbl; -+ struct rhash_head *pos; -+ struct btree *b; -+ unsigned i; -+restart: -+ rcu_read_lock(); -+ for_each_cached_btree(b, c, tbl, i, pos) -+ if (test_bit(flag, &b->flags)) { -+ rcu_read_unlock(); -+ wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE); -+ goto restart; -+ -+ } -+ rcu_read_unlock(); -+} -+ -+void 
bch2_btree_flush_all_reads(struct bch_fs *c) -+{ -+ __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight); -+} -+ -+void bch2_btree_flush_all_writes(struct bch_fs *c) -+{ -+ __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); -+} -+ -+void bch2_btree_verify_flushed(struct bch_fs *c) -+{ -+ struct bucket_table *tbl; -+ struct rhash_head *pos; -+ struct btree *b; -+ unsigned i; -+ -+ rcu_read_lock(); -+ for_each_cached_btree(b, c, tbl, i, pos) { -+ unsigned long flags = READ_ONCE(b->flags); -+ -+ BUG_ON((flags & (1 << BTREE_NODE_dirty)) || -+ (flags & (1 << BTREE_NODE_write_in_flight))); -+ } -+ rcu_read_unlock(); -+} -+ -+void bch2_dirty_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+ struct bucket_table *tbl; -+ struct rhash_head *pos; -+ struct btree *b; -+ unsigned i; -+ -+ rcu_read_lock(); -+ for_each_cached_btree(b, c, tbl, i, pos) { -+ unsigned long flags = READ_ONCE(b->flags); -+ -+ if (!(flags & (1 << BTREE_NODE_dirty))) -+ continue; -+ -+ pr_buf(out, "%p d %u n %u l %u w %u b %u r %u:%lu\n", -+ b, -+ (flags & (1 << BTREE_NODE_dirty)) != 0, -+ (flags & (1 << BTREE_NODE_need_write)) != 0, -+ b->c.level, -+ b->written, -+ !list_empty_careful(&b->write_blocked), -+ b->will_make_reachable != 0, -+ b->will_make_reachable & 1); -+ } -+ rcu_read_unlock(); -+} -diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h -new file mode 100644 -index 000000000000..626d0f071b70 ---- /dev/null -+++ b/fs/bcachefs/btree_io.h -@@ -0,0 +1,220 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BTREE_IO_H -+#define _BCACHEFS_BTREE_IO_H -+ -+#include "bkey_methods.h" -+#include "bset.h" -+#include "btree_locking.h" -+#include "checksum.h" -+#include "extents.h" -+#include "io_types.h" -+ -+struct bch_fs; -+struct btree_write; -+struct btree; -+struct btree_iter; -+ -+struct btree_read_bio { -+ struct bch_fs *c; -+ u64 start_time; -+ unsigned have_ioref:1; -+ struct extent_ptr_decoded pick; -+ struct work_struct work; -+ struct bio bio; -+}; 
-+ -+struct btree_write_bio { -+ struct work_struct work; -+ void *data; -+ unsigned bytes; -+ struct bch_write_bio wbio; -+}; -+ -+static inline void btree_node_io_unlock(struct btree *b) -+{ -+ EBUG_ON(!btree_node_write_in_flight(b)); -+ clear_btree_node_write_in_flight(b); -+ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); -+} -+ -+static inline void btree_node_io_lock(struct btree *b) -+{ -+ wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight, -+ TASK_UNINTERRUPTIBLE); -+} -+ -+static inline void btree_node_wait_on_io(struct btree *b) -+{ -+ wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, -+ TASK_UNINTERRUPTIBLE); -+} -+ -+static inline bool btree_node_may_write(struct btree *b) -+{ -+ return list_empty_careful(&b->write_blocked) && -+ (!b->written || !b->will_make_reachable); -+} -+ -+enum compact_mode { -+ COMPACT_LAZY, -+ COMPACT_ALL, -+}; -+ -+bool bch2_compact_whiteouts(struct bch_fs *, struct btree *, -+ enum compact_mode); -+ -+static inline bool should_compact_bset_lazy(struct btree *b, -+ struct bset_tree *t) -+{ -+ unsigned total_u64s = bset_u64s(t); -+ unsigned dead_u64s = bset_dead_u64s(b, t); -+ -+ return dead_u64s > 64 && dead_u64s * 3 > total_u64s; -+} -+ -+static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b) -+{ -+ struct bset_tree *t; -+ -+ for_each_bset(b, t) -+ if (should_compact_bset_lazy(b, t)) -+ return bch2_compact_whiteouts(c, b, COMPACT_LAZY); -+ -+ return false; -+} -+ -+static inline struct nonce btree_nonce(struct bset *i, unsigned offset) -+{ -+ return (struct nonce) {{ -+ [0] = cpu_to_le32(offset), -+ [1] = ((__le32 *) &i->seq)[0], -+ [2] = ((__le32 *) &i->seq)[1], -+ [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE, -+ }}; -+} -+ -+static inline void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) -+{ -+ struct nonce nonce = btree_nonce(i, offset); -+ -+ if (!offset) { -+ struct btree_node *bn = container_of(i, struct btree_node, keys); -+ unsigned bytes = (void *) 
&bn->keys - (void *) &bn->flags; -+ -+ bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags, -+ bytes); -+ -+ nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE)); -+ } -+ -+ bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, -+ vstruct_end(i) - (void *) i->_data); -+} -+ -+void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *); -+ -+void bch2_btree_build_aux_trees(struct btree *); -+void bch2_btree_init_next(struct bch_fs *, struct btree *, -+ struct btree_iter *); -+ -+int bch2_btree_node_read_done(struct bch_fs *, struct btree *, bool); -+void bch2_btree_node_read(struct bch_fs *, struct btree *, bool); -+int bch2_btree_root_read(struct bch_fs *, enum btree_id, -+ const struct bkey_i *, unsigned); -+ -+void bch2_btree_complete_write(struct bch_fs *, struct btree *, -+ struct btree_write *); -+void bch2_btree_write_error_work(struct work_struct *); -+ -+void __bch2_btree_node_write(struct bch_fs *, struct btree *, -+ enum six_lock_type); -+bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); -+ -+void bch2_btree_node_write(struct bch_fs *, struct btree *, -+ enum six_lock_type); -+ -+static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b, -+ enum six_lock_type lock_held) -+{ -+ while (b->written && -+ btree_node_need_write(b) && -+ btree_node_may_write(b)) { -+ if (!btree_node_write_in_flight(b)) { -+ bch2_btree_node_write(c, b, lock_held); -+ break; -+ } -+ -+ six_unlock_type(&b->c.lock, lock_held); -+ btree_node_wait_on_io(b); -+ btree_node_lock_type(c, b, lock_held); -+ } -+} -+ -+#define bch2_btree_node_write_cond(_c, _b, cond) \ -+do { \ -+ unsigned long old, new, v = READ_ONCE((_b)->flags); \ -+ \ -+ do { \ -+ old = new = v; \ -+ \ -+ if (!(old & (1 << BTREE_NODE_dirty)) || !(cond)) \ -+ break; \ -+ \ -+ new |= (1 << BTREE_NODE_need_write); \ -+ } while ((v = cmpxchg(&(_b)->flags, old, new)) != old); \ -+ \ -+ btree_node_write_if_need(_c, _b, SIX_LOCK_read); \ -+} while (0) -+ -+void 
bch2_btree_flush_all_reads(struct bch_fs *); -+void bch2_btree_flush_all_writes(struct bch_fs *); -+void bch2_btree_verify_flushed(struct bch_fs *); -+void bch2_dirty_btree_nodes_to_text(struct printbuf *, struct bch_fs *); -+ -+static inline void compat_bformat(unsigned level, enum btree_id btree_id, -+ unsigned version, unsigned big_endian, -+ int write, struct bkey_format *f) -+{ -+ if (version < bcachefs_metadata_version_inode_btree_change && -+ btree_id == BTREE_ID_INODES) { -+ swap(f->bits_per_field[BKEY_FIELD_INODE], -+ f->bits_per_field[BKEY_FIELD_OFFSET]); -+ swap(f->field_offset[BKEY_FIELD_INODE], -+ f->field_offset[BKEY_FIELD_OFFSET]); -+ } -+} -+ -+static inline void compat_bpos(unsigned level, enum btree_id btree_id, -+ unsigned version, unsigned big_endian, -+ int write, struct bpos *p) -+{ -+ if (big_endian != CPU_BIG_ENDIAN) -+ bch2_bpos_swab(p); -+ -+ if (version < bcachefs_metadata_version_inode_btree_change && -+ btree_id == BTREE_ID_INODES) -+ swap(p->inode, p->offset); -+} -+ -+static inline void compat_btree_node(unsigned level, enum btree_id btree_id, -+ unsigned version, unsigned big_endian, -+ int write, -+ struct btree_node *bn) -+{ -+ if (version < bcachefs_metadata_version_inode_btree_change && -+ btree_node_type_is_extents(btree_id) && -+ bkey_cmp(bn->min_key, POS_MIN) && -+ write) -+ bn->min_key = bkey_predecessor(bn->min_key); -+ -+ compat_bpos(level, btree_id, version, big_endian, write, &bn->min_key); -+ compat_bpos(level, btree_id, version, big_endian, write, &bn->max_key); -+ -+ if (version < bcachefs_metadata_version_inode_btree_change && -+ btree_node_type_is_extents(btree_id) && -+ bkey_cmp(bn->min_key, POS_MIN) && -+ !write) -+ bn->min_key = bkey_successor(bn->min_key); -+} -+ -+#endif /* _BCACHEFS_BTREE_IO_H */ -diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c -new file mode 100644 -index 000000000000..6fab76c3220c ---- /dev/null -+++ b/fs/bcachefs/btree_iter.c -@@ -0,0 +1,2445 @@ -+// SPDX-License-Identifier: 
GPL-2.0 -+ -+#include "bcachefs.h" -+#include "bkey_methods.h" -+#include "btree_cache.h" -+#include "btree_iter.h" -+#include "btree_key_cache.h" -+#include "btree_locking.h" -+#include "btree_update.h" -+#include "debug.h" -+#include "extents.h" -+#include "journal.h" -+ -+#include -+#include -+ -+static inline bool is_btree_node(struct btree_iter *iter, unsigned l) -+{ -+ return l < BTREE_MAX_DEPTH && -+ (unsigned long) iter->l[l].b >= 128; -+} -+ -+static inline struct bpos btree_iter_search_key(struct btree_iter *iter) -+{ -+ struct bpos pos = iter->pos; -+ -+ if ((iter->flags & BTREE_ITER_IS_EXTENTS) && -+ bkey_cmp(pos, POS_MAX)) -+ pos = bkey_successor(pos); -+ return pos; -+} -+ -+static inline bool btree_iter_pos_before_node(struct btree_iter *iter, -+ struct btree *b) -+{ -+ return bkey_cmp(btree_iter_search_key(iter), b->data->min_key) < 0; -+} -+ -+static inline bool btree_iter_pos_after_node(struct btree_iter *iter, -+ struct btree *b) -+{ -+ return bkey_cmp(b->key.k.p, btree_iter_search_key(iter)) < 0; -+} -+ -+static inline bool btree_iter_pos_in_node(struct btree_iter *iter, -+ struct btree *b) -+{ -+ return iter->btree_id == b->c.btree_id && -+ !btree_iter_pos_before_node(iter, b) && -+ !btree_iter_pos_after_node(iter, b); -+} -+ -+/* Btree node locking: */ -+ -+void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter) -+{ -+ bch2_btree_node_unlock_write_inlined(b, iter); -+} -+ -+void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) -+{ -+ struct btree_iter *linked; -+ unsigned readers = 0; -+ -+ EBUG_ON(!btree_node_intent_locked(iter, b->c.level)); -+ -+ trans_for_each_iter(iter->trans, linked) -+ if (linked->l[b->c.level].b == b && -+ btree_node_read_locked(linked, b->c.level)) -+ readers++; -+ -+ /* -+ * Must drop our read locks before calling six_lock_write() - -+ * six_unlock() won't do wakeups until the reader count -+ * goes to 0, and it's safe because we have the node intent -+ * locked: -+ */ -+ 
atomic64_sub(__SIX_VAL(read_lock, readers), -+ &b->c.lock.state.counter); -+ btree_node_lock_type(iter->trans->c, b, SIX_LOCK_write); -+ atomic64_add(__SIX_VAL(read_lock, readers), -+ &b->c.lock.state.counter); -+} -+ -+bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level) -+{ -+ struct btree *b = btree_iter_node(iter, level); -+ int want = __btree_lock_want(iter, level); -+ -+ if (!is_btree_node(iter, level)) -+ return false; -+ -+ if (race_fault()) -+ return false; -+ -+ if (six_relock_type(&b->c.lock, want, iter->l[level].lock_seq) || -+ (btree_node_lock_seq_matches(iter, b, level) && -+ btree_node_lock_increment(iter->trans, b, level, want))) { -+ mark_btree_node_locked(iter, level, want); -+ return true; -+ } else { -+ return false; -+ } -+} -+ -+static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level) -+{ -+ struct btree *b = iter->l[level].b; -+ -+ EBUG_ON(btree_lock_want(iter, level) != BTREE_NODE_INTENT_LOCKED); -+ -+ if (!is_btree_node(iter, level)) -+ return false; -+ -+ if (btree_node_intent_locked(iter, level)) -+ return true; -+ -+ if (race_fault()) -+ return false; -+ -+ if (btree_node_locked(iter, level) -+ ? six_lock_tryupgrade(&b->c.lock) -+ : six_relock_type(&b->c.lock, SIX_LOCK_intent, iter->l[level].lock_seq)) -+ goto success; -+ -+ if (btree_node_lock_seq_matches(iter, b, level) && -+ btree_node_lock_increment(iter->trans, b, level, BTREE_NODE_INTENT_LOCKED)) { -+ btree_node_unlock(iter, level); -+ goto success; -+ } -+ -+ return false; -+success: -+ mark_btree_node_intent_locked(iter, level); -+ return true; -+} -+ -+static inline bool btree_iter_get_locks(struct btree_iter *iter, -+ bool upgrade, bool trace) -+{ -+ unsigned l = iter->level; -+ int fail_idx = -1; -+ -+ do { -+ if (!btree_iter_node(iter, l)) -+ break; -+ -+ if (!(upgrade -+ ? bch2_btree_node_upgrade(iter, l) -+ : bch2_btree_node_relock(iter, l))) { -+ if (trace) -+ (upgrade -+ ? 
trace_node_upgrade_fail -+ : trace_node_relock_fail)(l, iter->l[l].lock_seq, -+ is_btree_node(iter, l) -+ ? 0 -+ : (unsigned long) iter->l[l].b, -+ is_btree_node(iter, l) -+ ? iter->l[l].b->c.lock.state.seq -+ : 0); -+ -+ fail_idx = l; -+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); -+ } -+ -+ l++; -+ } while (l < iter->locks_want); -+ -+ /* -+ * When we fail to get a lock, we have to ensure that any child nodes -+ * can't be relocked so bch2_btree_iter_traverse has to walk back up to -+ * the node that we failed to relock: -+ */ -+ while (fail_idx >= 0) { -+ btree_node_unlock(iter, fail_idx); -+ iter->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS; -+ --fail_idx; -+ } -+ -+ if (iter->uptodate == BTREE_ITER_NEED_RELOCK) -+ iter->uptodate = BTREE_ITER_NEED_PEEK; -+ -+ bch2_btree_trans_verify_locks(iter->trans); -+ -+ return iter->uptodate < BTREE_ITER_NEED_RELOCK; -+} -+ -+static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b, -+ enum btree_iter_type type) -+{ -+ return type != BTREE_ITER_CACHED -+ ? 
container_of(_b, struct btree, c)->key.k.p -+ : container_of(_b, struct bkey_cached, c)->key.pos; -+} -+ -+/* Slowpath: */ -+bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, -+ unsigned level, struct btree_iter *iter, -+ enum six_lock_type type, -+ six_lock_should_sleep_fn should_sleep_fn, -+ void *p) -+{ -+ struct btree_trans *trans = iter->trans; -+ struct btree_iter *linked; -+ u64 start_time = local_clock(); -+ bool ret = true; -+ -+ /* Check if it's safe to block: */ -+ trans_for_each_iter(trans, linked) { -+ if (!linked->nodes_locked) -+ continue; -+ -+ /* -+ * Can't block taking an intent lock if we have _any_ nodes read -+ * locked: -+ * -+ * - Our read lock blocks another thread with an intent lock on -+ * the same node from getting a write lock, and thus from -+ * dropping its intent lock -+ * -+ * - And the other thread may have multiple nodes intent locked: -+ * both the node we want to intent lock, and the node we -+ * already have read locked - deadlock: -+ */ -+ if (type == SIX_LOCK_intent && -+ linked->nodes_locked != linked->nodes_intent_locked) { -+ if (!(trans->nounlock)) { -+ linked->locks_want = max_t(unsigned, -+ linked->locks_want, -+ __fls(linked->nodes_locked) + 1); -+ if (!btree_iter_get_locks(linked, true, false)) -+ ret = false; -+ } else { -+ ret = false; -+ } -+ } -+ -+ /* -+ * Interior nodes must be locked before their descendants: if -+ * another iterator has possible descendants locked of the node -+ * we're about to lock, it must have the ancestors locked too: -+ */ -+ if (linked->btree_id == iter->btree_id && -+ level > __fls(linked->nodes_locked)) { -+ if (!(trans->nounlock)) { -+ linked->locks_want = -+ max(level + 1, max_t(unsigned, -+ linked->locks_want, -+ iter->locks_want)); -+ if (!btree_iter_get_locks(linked, true, false)) -+ ret = false; -+ } else { -+ ret = false; -+ } -+ } -+ -+ /* Must lock btree nodes in key order: */ -+ if ((cmp_int(iter->btree_id, linked->btree_id) ?: -+ -cmp_int(btree_iter_type(iter), 
btree_iter_type(linked))) < 0) -+ ret = false; -+ -+ if (iter->btree_id == linked->btree_id && -+ btree_node_locked(linked, level) && -+ bkey_cmp(pos, btree_node_pos((void *) linked->l[level].b, -+ btree_iter_type(linked))) <= 0) -+ ret = false; -+ -+ /* -+ * Recheck if this is a node we already have locked - since one -+ * of the get_locks() calls might've successfully -+ * upgraded/relocked it: -+ */ -+ if (linked->l[level].b == b && -+ btree_node_locked_type(linked, level) >= type) { -+ six_lock_increment(&b->c.lock, type); -+ return true; -+ } -+ } -+ -+ if (unlikely(!ret)) { -+ trace_trans_restart_would_deadlock(iter->trans->ip); -+ return false; -+ } -+ -+ if (six_trylock_type(&b->c.lock, type)) -+ return true; -+ -+ if (six_lock_type(&b->c.lock, type, should_sleep_fn, p)) -+ return false; -+ -+ bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)], -+ start_time); -+ return true; -+} -+ -+/* Btree iterator locking: */ -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+static void bch2_btree_iter_verify_locks(struct btree_iter *iter) -+{ -+ unsigned l; -+ -+ if (!(iter->trans->iters_linked & (1ULL << iter->idx))) { -+ BUG_ON(iter->nodes_locked); -+ return; -+ } -+ -+ for (l = 0; is_btree_node(iter, l); l++) { -+ if (iter->uptodate >= BTREE_ITER_NEED_RELOCK && -+ !btree_node_locked(iter, l)) -+ continue; -+ -+ BUG_ON(btree_lock_want(iter, l) != -+ btree_node_locked_type(iter, l)); -+ } -+} -+ -+void bch2_btree_trans_verify_locks(struct btree_trans *trans) -+{ -+ struct btree_iter *iter; -+ -+ trans_for_each_iter_all(trans, iter) -+ bch2_btree_iter_verify_locks(iter); -+} -+#else -+static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {} -+#endif -+ -+__flatten -+bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace) -+{ -+ return btree_iter_get_locks(iter, false, trace); -+} -+ -+bool __bch2_btree_iter_upgrade(struct btree_iter *iter, -+ unsigned new_locks_want) -+{ -+ struct btree_iter *linked; -+ -+ EBUG_ON(iter->locks_want >= 
new_locks_want); -+ -+ iter->locks_want = new_locks_want; -+ -+ if (btree_iter_get_locks(iter, true, true)) -+ return true; -+ -+ /* -+ * Ancestor nodes must be locked before child nodes, so set locks_want -+ * on iterators that might lock ancestors before us to avoid getting -+ * -EINTR later: -+ */ -+ trans_for_each_iter(iter->trans, linked) -+ if (linked != iter && -+ linked->btree_id == iter->btree_id && -+ linked->locks_want < new_locks_want) { -+ linked->locks_want = new_locks_want; -+ btree_iter_get_locks(linked, true, false); -+ } -+ -+ return false; -+} -+ -+bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *iter, -+ unsigned new_locks_want) -+{ -+ unsigned l = iter->level; -+ -+ EBUG_ON(iter->locks_want >= new_locks_want); -+ -+ iter->locks_want = new_locks_want; -+ -+ do { -+ if (!btree_iter_node(iter, l)) -+ break; -+ -+ if (!bch2_btree_node_upgrade(iter, l)) { -+ iter->locks_want = l; -+ return false; -+ } -+ -+ l++; -+ } while (l < iter->locks_want); -+ -+ return true; -+} -+ -+void __bch2_btree_iter_downgrade(struct btree_iter *iter, -+ unsigned downgrade_to) -+{ -+ unsigned l, new_locks_want = downgrade_to ?: -+ (iter->flags & BTREE_ITER_INTENT ? 
1 : 0); -+ -+ if (iter->locks_want < downgrade_to) { -+ iter->locks_want = new_locks_want; -+ -+ while (iter->nodes_locked && -+ (l = __fls(iter->nodes_locked)) >= iter->locks_want) { -+ if (l > iter->level) { -+ btree_node_unlock(iter, l); -+ } else { -+ if (btree_node_intent_locked(iter, l)) { -+ six_lock_downgrade(&iter->l[l].b->c.lock); -+ iter->nodes_intent_locked ^= 1 << l; -+ } -+ break; -+ } -+ } -+ } -+ -+ bch2_btree_trans_verify_locks(iter->trans); -+} -+ -+void bch2_trans_downgrade(struct btree_trans *trans) -+{ -+ struct btree_iter *iter; -+ -+ trans_for_each_iter(trans, iter) -+ bch2_btree_iter_downgrade(iter); -+} -+ -+/* Btree transaction locking: */ -+ -+bool bch2_trans_relock(struct btree_trans *trans) -+{ -+ struct btree_iter *iter; -+ bool ret = true; -+ -+ trans_for_each_iter(trans, iter) -+ if (iter->uptodate == BTREE_ITER_NEED_RELOCK) -+ ret &= bch2_btree_iter_relock(iter, true); -+ -+ return ret; -+} -+ -+void bch2_trans_unlock(struct btree_trans *trans) -+{ -+ struct btree_iter *iter; -+ -+ trans_for_each_iter(trans, iter) -+ __bch2_btree_iter_unlock(iter); -+} -+ -+/* Btree iterator: */ -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ -+static void bch2_btree_iter_verify_cached(struct btree_iter *iter) -+{ -+ struct bkey_cached *ck; -+ bool locked = btree_node_locked(iter, 0); -+ -+ if (!bch2_btree_node_relock(iter, 0)) -+ return; -+ -+ ck = (void *) iter->l[0].b; -+ BUG_ON(ck->key.btree_id != iter->btree_id || -+ bkey_cmp(ck->key.pos, iter->pos)); -+ -+ if (!locked) -+ btree_node_unlock(iter, 0); -+} -+ -+static void bch2_btree_iter_verify_level(struct btree_iter *iter, -+ unsigned level) -+{ -+ struct bpos pos = btree_iter_search_key(iter); -+ struct btree_iter_level *l = &iter->l[level]; -+ struct btree_node_iter tmp = l->iter; -+ bool locked = btree_node_locked(iter, level); -+ struct bkey_packed *p, *k; -+ char buf1[100], buf2[100]; -+ const char *msg; -+ -+ if (!debug_check_iterators(iter->trans->c)) -+ return; -+ -+ if (btree_iter_type(iter) == 
BTREE_ITER_CACHED) { -+ if (!level) -+ bch2_btree_iter_verify_cached(iter); -+ return; -+ } -+ -+ BUG_ON(iter->level < iter->min_depth); -+ -+ if (!btree_iter_node(iter, level)) -+ return; -+ -+ if (!bch2_btree_node_relock(iter, level)) -+ return; -+ -+ /* -+ * Ideally this invariant would always be true, and hopefully in the -+ * future it will be, but for now set_pos_same_leaf() breaks it: -+ */ -+ BUG_ON(iter->uptodate < BTREE_ITER_NEED_TRAVERSE && -+ !btree_iter_pos_in_node(iter, l->b)); -+ -+ /* -+ * node iterators don't use leaf node iterator: -+ */ -+ if (btree_iter_type(iter) == BTREE_ITER_NODES && -+ level <= iter->min_depth) -+ goto unlock; -+ -+ bch2_btree_node_iter_verify(&l->iter, l->b); -+ -+ /* -+ * For interior nodes, the iterator will have skipped past -+ * deleted keys: -+ * -+ * For extents, the iterator may have skipped past deleted keys (but not -+ * whiteouts) -+ */ -+ p = level || btree_node_type_is_extents(iter->btree_id) -+ ? bch2_btree_node_iter_prev_filter(&tmp, l->b, KEY_TYPE_discard) -+ : bch2_btree_node_iter_prev_all(&tmp, l->b); -+ k = bch2_btree_node_iter_peek_all(&l->iter, l->b); -+ -+ if (p && bkey_iter_pos_cmp(l->b, p, &pos) >= 0) { -+ msg = "before"; -+ goto err; -+ } -+ -+ if (k && bkey_iter_pos_cmp(l->b, k, &pos) < 0) { -+ msg = "after"; -+ goto err; -+ } -+unlock: -+ if (!locked) -+ btree_node_unlock(iter, level); -+ return; -+err: -+ strcpy(buf1, "(none)"); -+ strcpy(buf2, "(none)"); -+ -+ if (p) { -+ struct bkey uk = bkey_unpack_key(l->b, p); -+ bch2_bkey_to_text(&PBUF(buf1), &uk); -+ } -+ -+ if (k) { -+ struct bkey uk = bkey_unpack_key(l->b, k); -+ bch2_bkey_to_text(&PBUF(buf2), &uk); -+ } -+ -+ panic("iterator should be %s key at level %u:\n" -+ "iter pos %s %llu:%llu\n" -+ "prev key %s\n" -+ "cur key %s\n", -+ msg, level, -+ iter->flags & BTREE_ITER_IS_EXTENTS ? 
">" : "=>", -+ iter->pos.inode, iter->pos.offset, -+ buf1, buf2); -+} -+ -+static void bch2_btree_iter_verify(struct btree_iter *iter) -+{ -+ unsigned i; -+ -+ bch2_btree_trans_verify_locks(iter->trans); -+ -+ for (i = 0; i < BTREE_MAX_DEPTH; i++) -+ bch2_btree_iter_verify_level(iter, i); -+} -+ -+void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b) -+{ -+ struct btree_iter *iter; -+ -+ if (!debug_check_iterators(trans->c)) -+ return; -+ -+ trans_for_each_iter_with_node(trans, b, iter) -+ bch2_btree_iter_verify_level(iter, b->c.level); -+} -+ -+#else -+ -+static inline void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned l) {} -+static inline void bch2_btree_iter_verify(struct btree_iter *iter) {} -+ -+#endif -+ -+static void btree_node_iter_set_set_pos(struct btree_node_iter *iter, -+ struct btree *b, -+ struct bset_tree *t, -+ struct bkey_packed *k) -+{ -+ struct btree_node_iter_set *set; -+ -+ btree_node_iter_for_each(iter, set) -+ if (set->end == t->end_offset) { -+ set->k = __btree_node_key_to_offset(b, k); -+ bch2_btree_node_iter_sort(iter, b); -+ return; -+ } -+ -+ bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t)); -+} -+ -+static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter, -+ struct btree *b, -+ struct bkey_packed *where) -+{ -+ struct btree_iter_level *l = &iter->l[b->c.level]; -+ struct bpos pos = btree_iter_search_key(iter); -+ -+ if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b)) -+ return; -+ -+ if (bkey_iter_pos_cmp(l->b, where, &pos) < 0) -+ bch2_btree_node_iter_advance(&l->iter, l->b); -+ -+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); -+} -+ -+void bch2_btree_iter_fix_key_modified(struct btree_iter *iter, -+ struct btree *b, -+ struct bkey_packed *where) -+{ -+ struct btree_iter *linked; -+ -+ trans_for_each_iter_with_node(iter->trans, b, linked) { -+ __bch2_btree_iter_fix_key_modified(linked, b, where); -+ bch2_btree_iter_verify_level(linked, b->c.level); -+ } 
-+} -+ -+static void __bch2_btree_node_iter_fix(struct btree_iter *iter, -+ struct btree *b, -+ struct btree_node_iter *node_iter, -+ struct bset_tree *t, -+ struct bkey_packed *where, -+ unsigned clobber_u64s, -+ unsigned new_u64s) -+{ -+ const struct bkey_packed *end = btree_bkey_last(b, t); -+ struct btree_node_iter_set *set; -+ unsigned offset = __btree_node_key_to_offset(b, where); -+ int shift = new_u64s - clobber_u64s; -+ unsigned old_end = t->end_offset - shift; -+ unsigned orig_iter_pos = node_iter->data[0].k; -+ bool iter_current_key_modified = -+ orig_iter_pos >= offset && -+ orig_iter_pos <= offset + clobber_u64s; -+ struct bpos iter_pos = btree_iter_search_key(iter); -+ -+ btree_node_iter_for_each(node_iter, set) -+ if (set->end == old_end) -+ goto found; -+ -+ /* didn't find the bset in the iterator - might have to readd it: */ -+ if (new_u64s && -+ bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) { -+ bch2_btree_node_iter_push(node_iter, b, where, end); -+ goto fixup_done; -+ } else { -+ /* Iterator is after key that changed */ -+ return; -+ } -+found: -+ set->end = t->end_offset; -+ -+ /* Iterator hasn't gotten to the key that changed yet: */ -+ if (set->k < offset) -+ return; -+ -+ if (new_u64s && -+ bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) { -+ set->k = offset; -+ } else if (set->k < offset + clobber_u64s) { -+ set->k = offset + new_u64s; -+ if (set->k == set->end) -+ bch2_btree_node_iter_set_drop(node_iter, set); -+ } else { -+ /* Iterator is after key that changed */ -+ set->k = (int) set->k + shift; -+ return; -+ } -+ -+ bch2_btree_node_iter_sort(node_iter, b); -+fixup_done: -+ if (node_iter->data[0].k != orig_iter_pos) -+ iter_current_key_modified = true; -+ -+ /* -+ * When a new key is added, and the node iterator now points to that -+ * key, the iterator might have skipped past deleted keys that should -+ * come after the key the iterator now points to. 
We have to rewind to -+ * before those deleted keys - otherwise -+ * bch2_btree_node_iter_prev_all() breaks: -+ */ -+ if (!bch2_btree_node_iter_end(node_iter) && -+ iter_current_key_modified && -+ (b->c.level || -+ btree_node_type_is_extents(iter->btree_id))) { -+ struct bset_tree *t; -+ struct bkey_packed *k, *k2, *p; -+ -+ k = bch2_btree_node_iter_peek_all(node_iter, b); -+ -+ for_each_bset(b, t) { -+ bool set_pos = false; -+ -+ if (node_iter->data[0].end == t->end_offset) -+ continue; -+ -+ k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t); -+ -+ while ((p = bch2_bkey_prev_all(b, t, k2)) && -+ bkey_iter_cmp(b, k, p) < 0) { -+ k2 = p; -+ set_pos = true; -+ } -+ -+ if (set_pos) -+ btree_node_iter_set_set_pos(node_iter, -+ b, t, k2); -+ } -+ } -+ -+ if (!b->c.level && -+ node_iter == &iter->l[0].iter && -+ iter_current_key_modified) -+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); -+} -+ -+void bch2_btree_node_iter_fix(struct btree_iter *iter, -+ struct btree *b, -+ struct btree_node_iter *node_iter, -+ struct bkey_packed *where, -+ unsigned clobber_u64s, -+ unsigned new_u64s) -+{ -+ struct bset_tree *t = bch2_bkey_to_bset(b, where); -+ struct btree_iter *linked; -+ -+ if (node_iter != &iter->l[b->c.level].iter) { -+ __bch2_btree_node_iter_fix(iter, b, node_iter, t, -+ where, clobber_u64s, new_u64s); -+ -+ if (debug_check_iterators(iter->trans->c)) -+ bch2_btree_node_iter_verify(node_iter, b); -+ } -+ -+ trans_for_each_iter_with_node(iter->trans, b, linked) { -+ __bch2_btree_node_iter_fix(linked, b, -+ &linked->l[b->c.level].iter, t, -+ where, clobber_u64s, new_u64s); -+ bch2_btree_iter_verify_level(linked, b->c.level); -+ } -+} -+ -+static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter, -+ struct btree_iter_level *l, -+ struct bkey *u, -+ struct bkey_packed *k) -+{ -+ struct bkey_s_c ret; -+ -+ if (unlikely(!k)) { -+ /* -+ * signal to bch2_btree_iter_peek_slot() that we're currently at -+ * a hole -+ */ -+ u->type = KEY_TYPE_deleted; -+ 
return bkey_s_c_null; -+ } -+ -+ ret = bkey_disassemble(l->b, k, u); -+ -+ if (debug_check_bkeys(iter->trans->c)) -+ bch2_bkey_debugcheck(iter->trans->c, l->b, ret); -+ -+ return ret; -+} -+ -+/* peek_all() doesn't skip deleted keys */ -+static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *iter, -+ struct btree_iter_level *l, -+ struct bkey *u) -+{ -+ return __btree_iter_unpack(iter, l, u, -+ bch2_btree_node_iter_peek_all(&l->iter, l->b)); -+} -+ -+static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, -+ struct btree_iter_level *l) -+{ -+ return __btree_iter_unpack(iter, l, &iter->k, -+ bch2_btree_node_iter_peek(&l->iter, l->b)); -+} -+ -+static inline struct bkey_s_c __btree_iter_prev(struct btree_iter *iter, -+ struct btree_iter_level *l) -+{ -+ return __btree_iter_unpack(iter, l, &iter->k, -+ bch2_btree_node_iter_prev(&l->iter, l->b)); -+} -+ -+static inline bool btree_iter_advance_to_pos(struct btree_iter *iter, -+ struct btree_iter_level *l, -+ int max_advance) -+{ -+ struct bpos pos = btree_iter_search_key(iter); -+ struct bkey_packed *k; -+ int nr_advanced = 0; -+ -+ while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) && -+ bkey_iter_pos_cmp(l->b, k, &pos) < 0) { -+ if (max_advance > 0 && nr_advanced >= max_advance) -+ return false; -+ -+ bch2_btree_node_iter_advance(&l->iter, l->b); -+ nr_advanced++; -+ } -+ -+ return true; -+} -+ -+/* -+ * Verify that iterator for parent node points to child node: -+ */ -+static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) -+{ -+ struct btree_iter_level *l; -+ unsigned plevel; -+ bool parent_locked; -+ struct bkey_packed *k; -+ -+ if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) -+ return; -+ -+ plevel = b->c.level + 1; -+ if (!btree_iter_node(iter, plevel)) -+ return; -+ -+ parent_locked = btree_node_locked(iter, plevel); -+ -+ if (!bch2_btree_node_relock(iter, plevel)) -+ return; -+ -+ l = &iter->l[plevel]; -+ k = bch2_btree_node_iter_peek_all(&l->iter, 
l->b); -+ if (!k || -+ bkey_deleted(k) || -+ bkey_cmp_left_packed(l->b, k, &b->key.k.p)) { -+ char buf[100]; -+ struct bkey uk = bkey_unpack_key(b, k); -+ -+ bch2_bkey_to_text(&PBUF(buf), &uk); -+ panic("parent iter doesn't point to new node:\n%s\n%llu:%llu\n", -+ buf, b->key.k.p.inode, b->key.k.p.offset); -+ } -+ -+ if (!parent_locked) -+ btree_node_unlock(iter, b->c.level + 1); -+} -+ -+static inline void __btree_iter_init(struct btree_iter *iter, -+ unsigned level) -+{ -+ struct bpos pos = btree_iter_search_key(iter); -+ struct btree_iter_level *l = &iter->l[level]; -+ -+ bch2_btree_node_iter_init(&l->iter, l->b, &pos); -+ -+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); -+} -+ -+static inline void btree_iter_node_set(struct btree_iter *iter, -+ struct btree *b) -+{ -+ BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED); -+ -+ btree_iter_verify_new_node(iter, b); -+ -+ EBUG_ON(!btree_iter_pos_in_node(iter, b)); -+ EBUG_ON(b->c.lock.state.seq & 1); -+ -+ iter->l[b->c.level].lock_seq = b->c.lock.state.seq; -+ iter->l[b->c.level].b = b; -+ __btree_iter_init(iter, b->c.level); -+} -+ -+/* -+ * A btree node is being replaced - update the iterator to point to the new -+ * node: -+ */ -+void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b) -+{ -+ enum btree_node_locked_type t; -+ struct btree_iter *linked; -+ -+ trans_for_each_iter(iter->trans, linked) -+ if (btree_iter_type(linked) != BTREE_ITER_CACHED && -+ btree_iter_pos_in_node(linked, b)) { -+ /* -+ * bch2_btree_iter_node_drop() has already been called - -+ * the old node we're replacing has already been -+ * unlocked and the pointer invalidated -+ */ -+ BUG_ON(btree_node_locked(linked, b->c.level)); -+ -+ t = btree_lock_want(linked, b->c.level); -+ if (t != BTREE_NODE_UNLOCKED) { -+ six_lock_increment(&b->c.lock, t); -+ mark_btree_node_locked(linked, b->c.level, t); -+ } -+ -+ btree_iter_node_set(linked, b); -+ } -+} -+ -+void bch2_btree_iter_node_drop(struct btree_iter *iter, struct 
btree *b) -+{ -+ struct btree_iter *linked; -+ unsigned level = b->c.level; -+ -+ trans_for_each_iter(iter->trans, linked) -+ if (linked->l[level].b == b) { -+ __btree_node_unlock(linked, level); -+ linked->l[level].b = BTREE_ITER_NO_NODE_DROP; -+ } -+} -+ -+/* -+ * A btree node has been modified in such a way as to invalidate iterators - fix -+ * them: -+ */ -+void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b) -+{ -+ struct btree_iter *linked; -+ -+ trans_for_each_iter_with_node(iter->trans, b, linked) -+ __btree_iter_init(linked, b->c.level); -+} -+ -+static int lock_root_check_fn(struct six_lock *lock, void *p) -+{ -+ struct btree *b = container_of(lock, struct btree, c.lock); -+ struct btree **rootp = p; -+ -+ return b == *rootp ? 0 : -1; -+} -+ -+static inline int btree_iter_lock_root(struct btree_iter *iter, -+ unsigned depth_want) -+{ -+ struct bch_fs *c = iter->trans->c; -+ struct btree *b, **rootp = &c->btree_roots[iter->btree_id].b; -+ enum six_lock_type lock_type; -+ unsigned i; -+ -+ EBUG_ON(iter->nodes_locked); -+ -+ while (1) { -+ b = READ_ONCE(*rootp); -+ iter->level = READ_ONCE(b->c.level); -+ -+ if (unlikely(iter->level < depth_want)) { -+ /* -+ * the root is at a lower depth than the depth we want: -+ * got to the end of the btree, or we're walking nodes -+ * greater than some depth and there are no nodes >= -+ * that depth -+ */ -+ iter->level = depth_want; -+ for (i = iter->level; i < BTREE_MAX_DEPTH; i++) -+ iter->l[i].b = NULL; -+ return 1; -+ } -+ -+ lock_type = __btree_lock_want(iter, iter->level); -+ if (unlikely(!btree_node_lock(b, POS_MAX, iter->level, -+ iter, lock_type, -+ lock_root_check_fn, rootp))) -+ return -EINTR; -+ -+ if (likely(b == READ_ONCE(*rootp) && -+ b->c.level == iter->level && -+ !race_fault())) { -+ for (i = 0; i < iter->level; i++) -+ iter->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT; -+ iter->l[iter->level].b = b; -+ for (i = iter->level + 1; i < BTREE_MAX_DEPTH; i++) -+ iter->l[i].b = NULL; -+ -+ 
mark_btree_node_locked(iter, iter->level, lock_type); -+ btree_iter_node_set(iter, b); -+ return 0; -+ } -+ -+ six_unlock_type(&b->c.lock, lock_type); -+ } -+} -+ -+noinline -+static void btree_iter_prefetch(struct btree_iter *iter) -+{ -+ struct bch_fs *c = iter->trans->c; -+ struct btree_iter_level *l = &iter->l[iter->level]; -+ struct btree_node_iter node_iter = l->iter; -+ struct bkey_packed *k; -+ BKEY_PADDED(k) tmp; -+ unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) -+ ? (iter->level > 1 ? 0 : 2) -+ : (iter->level > 1 ? 1 : 16); -+ bool was_locked = btree_node_locked(iter, iter->level); -+ -+ while (nr) { -+ if (!bch2_btree_node_relock(iter, iter->level)) -+ return; -+ -+ bch2_btree_node_iter_advance(&node_iter, l->b); -+ k = bch2_btree_node_iter_peek(&node_iter, l->b); -+ if (!k) -+ break; -+ -+ bch2_bkey_unpack(l->b, &tmp.k, k); -+ bch2_btree_node_prefetch(c, iter, &tmp.k, iter->level - 1); -+ } -+ -+ if (!was_locked) -+ btree_node_unlock(iter, iter->level); -+} -+ -+static noinline void btree_node_mem_ptr_set(struct btree_iter *iter, -+ unsigned plevel, struct btree *b) -+{ -+ struct btree_iter_level *l = &iter->l[plevel]; -+ bool locked = btree_node_locked(iter, plevel); -+ struct bkey_packed *k; -+ struct bch_btree_ptr_v2 *bp; -+ -+ if (!bch2_btree_node_relock(iter, plevel)) -+ return; -+ -+ k = bch2_btree_node_iter_peek_all(&l->iter, l->b); -+ BUG_ON(k->type != KEY_TYPE_btree_ptr_v2); -+ -+ bp = (void *) bkeyp_val(&l->b->format, k); -+ bp->mem_ptr = (unsigned long)b; -+ -+ if (!locked) -+ btree_node_unlock(iter, plevel); -+} -+ -+static __always_inline int btree_iter_down(struct btree_iter *iter) -+{ -+ struct bch_fs *c = iter->trans->c; -+ struct btree_iter_level *l = &iter->l[iter->level]; -+ struct btree *b; -+ unsigned level = iter->level - 1; -+ enum six_lock_type lock_type = __btree_lock_want(iter, level); -+ BKEY_PADDED(k) tmp; -+ -+ EBUG_ON(!btree_node_locked(iter, iter->level)); -+ -+ bch2_bkey_unpack(l->b, &tmp.k, -+ 
bch2_btree_node_iter_peek(&l->iter, l->b)); -+ -+ b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type); -+ if (unlikely(IS_ERR(b))) -+ return PTR_ERR(b); -+ -+ mark_btree_node_locked(iter, level, lock_type); -+ btree_iter_node_set(iter, b); -+ -+ if (tmp.k.k.type == KEY_TYPE_btree_ptr_v2 && -+ unlikely(b != btree_node_mem_ptr(&tmp.k))) -+ btree_node_mem_ptr_set(iter, level + 1, b); -+ -+ if (iter->flags & BTREE_ITER_PREFETCH) -+ btree_iter_prefetch(iter); -+ -+ iter->level = level; -+ -+ return 0; -+} -+ -+static void btree_iter_up(struct btree_iter *iter) -+{ -+ btree_node_unlock(iter, iter->level++); -+} -+ -+static int btree_iter_traverse_one(struct btree_iter *); -+ -+static int __btree_iter_traverse_all(struct btree_trans *trans, int ret) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter *iter; -+ u8 sorted[BTREE_ITER_MAX]; -+ unsigned i, nr_sorted = 0; -+ -+ if (trans->in_traverse_all) -+ return -EINTR; -+ -+ trans->in_traverse_all = true; -+retry_all: -+ nr_sorted = 0; -+ -+ trans_for_each_iter(trans, iter) -+ sorted[nr_sorted++] = iter->idx; -+ -+#define btree_iter_cmp_by_idx(_l, _r) \ -+ btree_iter_cmp(&trans->iters[_l], &trans->iters[_r]) -+ -+ bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx); -+#undef btree_iter_cmp_by_idx -+ bch2_trans_unlock(trans); -+ -+ if (unlikely(ret == -ENOMEM)) { -+ struct closure cl; -+ -+ closure_init_stack(&cl); -+ -+ do { -+ ret = bch2_btree_cache_cannibalize_lock(c, &cl); -+ closure_sync(&cl); -+ } while (ret); -+ } -+ -+ if (unlikely(ret == -EIO)) { -+ trans->error = true; -+ goto out; -+ } -+ -+ BUG_ON(ret && ret != -EINTR); -+ -+ /* Now, redo traversals in correct order: */ -+ for (i = 0; i < nr_sorted; i++) { -+ unsigned idx = sorted[i]; -+ -+ /* -+ * sucessfully traversing one iterator can cause another to be -+ * unlinked, in btree_key_cache_fill() -+ */ -+ if (!(trans->iters_linked & (1ULL << idx))) -+ continue; -+ -+ ret = btree_iter_traverse_one(&trans->iters[idx]); -+ if (ret) -+ goto retry_all; 
-+ } -+ -+ if (hweight64(trans->iters_live) > 1) -+ ret = -EINTR; -+ else -+ trans_for_each_iter(trans, iter) -+ if (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) { -+ ret = -EINTR; -+ break; -+ } -+out: -+ bch2_btree_cache_cannibalize_unlock(c); -+ -+ trans->in_traverse_all = false; -+ return ret; -+} -+ -+int bch2_btree_iter_traverse_all(struct btree_trans *trans) -+{ -+ return __btree_iter_traverse_all(trans, 0); -+} -+ -+static inline bool btree_iter_good_node(struct btree_iter *iter, -+ unsigned l, int check_pos) -+{ -+ if (!is_btree_node(iter, l) || -+ !bch2_btree_node_relock(iter, l)) -+ return false; -+ -+ if (check_pos <= 0 && btree_iter_pos_before_node(iter, iter->l[l].b)) -+ return false; -+ if (check_pos >= 0 && btree_iter_pos_after_node(iter, iter->l[l].b)) -+ return false; -+ return true; -+} -+ -+static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter, -+ int check_pos) -+{ -+ unsigned l = iter->level; -+ -+ while (btree_iter_node(iter, l) && -+ !btree_iter_good_node(iter, l, check_pos)) { -+ btree_node_unlock(iter, l); -+ iter->l[l].b = BTREE_ITER_NO_NODE_UP; -+ l++; -+ } -+ -+ return l; -+} -+ -+/* -+ * This is the main state machine for walking down the btree - walks down to a -+ * specified depth -+ * -+ * Returns 0 on success, -EIO on error (error reading in a btree node). -+ * -+ * On error, caller (peek_node()/peek_key()) must return NULL; the error is -+ * stashed in the iterator and returned from bch2_trans_exit(). 
-+ */ -+static int btree_iter_traverse_one(struct btree_iter *iter) -+{ -+ unsigned depth_want = iter->level; -+ -+ /* -+ * if we need interior nodes locked, call btree_iter_relock() to make -+ * sure we walk back up enough that we lock them: -+ */ -+ if (iter->uptodate == BTREE_ITER_NEED_RELOCK || -+ iter->locks_want > 1) -+ bch2_btree_iter_relock(iter, false); -+ -+ if (btree_iter_type(iter) == BTREE_ITER_CACHED) -+ return bch2_btree_iter_traverse_cached(iter); -+ -+ if (iter->uptodate < BTREE_ITER_NEED_RELOCK) -+ return 0; -+ -+ if (unlikely(iter->level >= BTREE_MAX_DEPTH)) -+ return 0; -+ -+ /* -+ * XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos -+ * here unnecessary -+ */ -+ iter->level = btree_iter_up_until_good_node(iter, 0); -+ -+ /* -+ * If we've got a btree node locked (i.e. we aren't about to relock the -+ * root) - advance its node iterator if necessary: -+ * -+ * XXX correctly using BTREE_ITER_UPTODATE should make this unnecessary -+ */ -+ if (is_btree_node(iter, iter->level)) { -+ BUG_ON(!btree_iter_pos_in_node(iter, iter->l[iter->level].b)); -+ -+ btree_iter_advance_to_pos(iter, &iter->l[iter->level], -1); -+ } -+ -+ /* -+ * Note: iter->nodes[iter->level] may be temporarily NULL here - that -+ * would indicate to other code that we got to the end of the btree, -+ * here it indicates that relocking the root failed - it's critical that -+ * btree_iter_lock_root() comes next and that it can't fail -+ */ -+ while (iter->level > depth_want) { -+ int ret = btree_iter_node(iter, iter->level) -+ ? 
btree_iter_down(iter) -+ : btree_iter_lock_root(iter, depth_want); -+ if (unlikely(ret)) { -+ if (ret == 1) -+ return 0; -+ -+ iter->level = depth_want; -+ -+ if (ret == -EIO) { -+ iter->flags |= BTREE_ITER_ERROR; -+ iter->l[iter->level].b = -+ BTREE_ITER_NO_NODE_ERROR; -+ } else { -+ iter->l[iter->level].b = -+ BTREE_ITER_NO_NODE_DOWN; -+ } -+ return ret; -+ } -+ } -+ -+ iter->uptodate = BTREE_ITER_NEED_PEEK; -+ -+ bch2_btree_iter_verify(iter); -+ return 0; -+} -+ -+int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) -+{ -+ struct btree_trans *trans = iter->trans; -+ int ret; -+ -+ ret = bch2_trans_cond_resched(trans) ?: -+ btree_iter_traverse_one(iter); -+ if (unlikely(ret)) -+ ret = __btree_iter_traverse_all(trans, ret); -+ -+ return ret; -+} -+ -+static inline void bch2_btree_iter_checks(struct btree_iter *iter) -+{ -+ enum btree_iter_type type = btree_iter_type(iter); -+ -+ EBUG_ON(iter->btree_id >= BTREE_ID_NR); -+ -+ BUG_ON((type == BTREE_ITER_KEYS || -+ type == BTREE_ITER_CACHED) && -+ (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 || -+ bkey_cmp(iter->pos, iter->k.p) > 0)); -+ -+ bch2_btree_iter_verify_locks(iter); -+ bch2_btree_iter_verify_level(iter, iter->level); -+} -+ -+/* Iterate across nodes (leaf and interior nodes) */ -+ -+struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) -+{ -+ struct btree *b; -+ int ret; -+ -+ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES); -+ bch2_btree_iter_checks(iter); -+ -+ if (iter->uptodate == BTREE_ITER_UPTODATE) -+ return iter->l[iter->level].b; -+ -+ ret = bch2_btree_iter_traverse(iter); -+ if (ret) -+ return NULL; -+ -+ b = btree_iter_node(iter, iter->level); -+ if (!b) -+ return NULL; -+ -+ BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0); -+ -+ iter->pos = b->key.k.p; -+ iter->uptodate = BTREE_ITER_UPTODATE; -+ -+ bch2_btree_iter_verify(iter); -+ -+ return b; -+} -+ -+struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) -+{ -+ struct btree *b; -+ int ret; -+ -+ 
EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES); -+ bch2_btree_iter_checks(iter); -+ -+ /* already got to end? */ -+ if (!btree_iter_node(iter, iter->level)) -+ return NULL; -+ -+ bch2_trans_cond_resched(iter->trans); -+ -+ btree_iter_up(iter); -+ -+ if (!bch2_btree_node_relock(iter, iter->level)) -+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); -+ -+ ret = bch2_btree_iter_traverse(iter); -+ if (ret) -+ return NULL; -+ -+ /* got to end? */ -+ b = btree_iter_node(iter, iter->level); -+ if (!b) -+ return NULL; -+ -+ if (bkey_cmp(iter->pos, b->key.k.p) < 0) { -+ /* -+ * Haven't gotten to the end of the parent node: go back down to -+ * the next child node -+ */ -+ -+ /* -+ * We don't really want to be unlocking here except we can't -+ * directly tell btree_iter_traverse() "traverse to this level" -+ * except by setting iter->level, so we have to unlock so we -+ * don't screw up our lock invariants: -+ */ -+ if (btree_node_read_locked(iter, iter->level)) -+ btree_node_unlock(iter, iter->level); -+ -+ iter->pos = bkey_successor(iter->pos); -+ iter->level = iter->min_depth; -+ -+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); -+ ret = bch2_btree_iter_traverse(iter); -+ if (ret) -+ return NULL; -+ -+ b = iter->l[iter->level].b; -+ } -+ -+ iter->pos = b->key.k.p; -+ iter->uptodate = BTREE_ITER_UPTODATE; -+ -+ bch2_btree_iter_verify(iter); -+ -+ return b; -+} -+ -+/* Iterate across keys (in leaf nodes only) */ -+ -+void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_pos) -+{ -+ struct btree_iter_level *l = &iter->l[0]; -+ -+ EBUG_ON(iter->level != 0); -+ EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0); -+ EBUG_ON(!btree_node_locked(iter, 0)); -+ EBUG_ON(bkey_cmp(new_pos, l->b->key.k.p) > 0); -+ -+ bkey_init(&iter->k); -+ iter->k.p = iter->pos = new_pos; -+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); -+ -+ btree_iter_advance_to_pos(iter, l, -1); -+ -+ /* -+ * XXX: -+ * keeping a node locked that's outside (even just outside) 
iter->pos -+ * breaks __bch2_btree_node_lock(). This seems to only affect -+ * bch2_btree_node_get_sibling so for now it's fixed there, but we -+ * should try to get rid of this corner case. -+ * -+ * (this behaviour is currently needed for BTREE_INSERT_NOUNLOCK) -+ */ -+ -+ if (bch2_btree_node_iter_end(&l->iter) && -+ btree_iter_pos_after_node(iter, l->b)) -+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); -+} -+ -+static void btree_iter_pos_changed(struct btree_iter *iter, int cmp) -+{ -+ unsigned l = iter->level; -+ -+ if (!cmp) -+ goto out; -+ -+ if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) { -+ btree_node_unlock(iter, 0); -+ iter->l[0].b = BTREE_ITER_NO_NODE_UP; -+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); -+ return; -+ } -+ -+ l = btree_iter_up_until_good_node(iter, cmp); -+ -+ if (btree_iter_node(iter, l)) { -+ /* -+ * We might have to skip over many keys, or just a few: try -+ * advancing the node iterator, and if we have to skip over too -+ * many keys just reinit it (or if we're rewinding, since that -+ * is expensive). -+ */ -+ if (cmp < 0 || -+ !btree_iter_advance_to_pos(iter, &iter->l[l], 8)) -+ __btree_iter_init(iter, l); -+ -+ /* Don't leave it locked if we're not supposed to: */ -+ if (btree_lock_want(iter, l) == BTREE_NODE_UNLOCKED) -+ btree_node_unlock(iter, l); -+ } -+out: -+ if (l != iter->level) -+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); -+ else -+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); -+} -+ -+void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos, -+ bool strictly_greater) -+{ -+ struct bpos old = btree_iter_search_key(iter); -+ int cmp; -+ -+ iter->flags &= ~BTREE_ITER_IS_EXTENTS; -+ iter->flags |= strictly_greater ? 
BTREE_ITER_IS_EXTENTS : 0; -+ -+ bkey_init(&iter->k); -+ iter->k.p = iter->pos = new_pos; -+ -+ cmp = bkey_cmp(btree_iter_search_key(iter), old); -+ -+ btree_iter_pos_changed(iter, cmp); -+} -+ -+void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) -+{ -+ int cmp = bkey_cmp(new_pos, iter->pos); -+ -+ bkey_init(&iter->k); -+ iter->k.p = iter->pos = new_pos; -+ -+ btree_iter_pos_changed(iter, cmp); -+} -+ -+static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) -+{ -+ struct btree_iter_level *l = &iter->l[0]; -+ bool ret; -+ -+ bkey_init(&iter->k); -+ iter->k.p = iter->pos = l->b->key.k.p; -+ -+ ret = bkey_cmp(iter->pos, POS_MAX) != 0; -+ if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) -+ iter->k.p = iter->pos = bkey_successor(iter->pos); -+ -+ btree_iter_pos_changed(iter, 1); -+ return ret; -+} -+ -+static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) -+{ -+ struct btree_iter_level *l = &iter->l[0]; -+ bool ret; -+ -+ bkey_init(&iter->k); -+ iter->k.p = iter->pos = l->b->data->min_key; -+ iter->uptodate = BTREE_ITER_NEED_TRAVERSE; -+ -+ ret = bkey_cmp(iter->pos, POS_MIN) != 0; -+ if (ret) { -+ iter->k.p = iter->pos = bkey_predecessor(iter->pos); -+ -+ if (iter->flags & BTREE_ITER_IS_EXTENTS) -+ iter->k.p = iter->pos = bkey_predecessor(iter->pos); -+ } -+ -+ btree_iter_pos_changed(iter, -1); -+ return ret; -+} -+ -+/** -+ * btree_iter_peek_uptodate - given an iterator that is uptodate, return the key -+ * it currently points to -+ */ -+static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter) -+{ -+ struct btree_iter_level *l = &iter->l[0]; -+ struct bkey_s_c ret = { .k = &iter->k }; -+ -+ if (!bkey_deleted(&iter->k)) { -+ struct bkey_packed *_k = -+ __bch2_btree_node_iter_peek_all(&l->iter, l->b); -+ -+ ret.v = bkeyp_val(&l->b->format, _k); -+ -+ if (debug_check_iterators(iter->trans->c)) { -+ struct bkey k = bkey_unpack_key(l->b, _k); -+ -+ BUG_ON(memcmp(&k, &iter->k, 
sizeof(k))); -+ } -+ -+ if (debug_check_bkeys(iter->trans->c)) -+ bch2_bkey_debugcheck(iter->trans->c, l->b, ret); -+ } -+ -+ return ret; -+} -+ -+/** -+ * bch2_btree_iter_peek: returns first key greater than or equal to iterator's -+ * current position -+ */ -+struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) -+{ -+ struct btree_iter_level *l = &iter->l[0]; -+ struct bkey_s_c k; -+ int ret; -+ -+ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); -+ bch2_btree_iter_checks(iter); -+ -+ if (iter->uptodate == BTREE_ITER_UPTODATE && -+ !bkey_deleted(&iter->k)) -+ return btree_iter_peek_uptodate(iter); -+ -+ while (1) { -+ ret = bch2_btree_iter_traverse(iter); -+ if (unlikely(ret)) -+ return bkey_s_c_err(ret); -+ -+ k = __btree_iter_peek(iter, l); -+ if (likely(k.k)) -+ break; -+ -+ if (!btree_iter_set_pos_to_next_leaf(iter)) -+ return bkey_s_c_null; -+ } -+ -+ /* -+ * iter->pos should always be equal to the key we just -+ * returned - except extents can straddle iter->pos: -+ */ -+ if (!(iter->flags & BTREE_ITER_IS_EXTENTS) || -+ bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) -+ iter->pos = bkey_start_pos(k.k); -+ -+ iter->uptodate = BTREE_ITER_UPTODATE; -+ -+ bch2_btree_iter_verify_level(iter, 0); -+ return k; -+} -+ -+/** -+ * bch2_btree_iter_next: returns first key greater than iterator's current -+ * position -+ */ -+struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) -+{ -+ if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) -+ return bkey_s_c_null; -+ -+ bch2_btree_iter_set_pos(iter, -+ (iter->flags & BTREE_ITER_IS_EXTENTS) -+ ? 
iter->k.p -+ : bkey_successor(iter->k.p)); -+ -+ return bch2_btree_iter_peek(iter); -+} -+ -+static struct bkey_s_c __btree_trans_updates_peek(struct btree_iter *iter) -+{ -+ struct bpos pos = btree_iter_search_key(iter); -+ struct btree_trans *trans = iter->trans; -+ struct btree_insert_entry *i; -+ -+ trans_for_each_update2(trans, i) -+ if ((cmp_int(iter->btree_id, i->iter->btree_id) ?: -+ bkey_cmp(pos, i->k->k.p)) <= 0) -+ break; -+ -+ return i < trans->updates2 + trans->nr_updates2 && -+ iter->btree_id == i->iter->btree_id -+ ? bkey_i_to_s_c(i->k) -+ : bkey_s_c_null; -+} -+ -+static struct bkey_s_c __bch2_btree_iter_peek_with_updates(struct btree_iter *iter) -+{ -+ struct btree_iter_level *l = &iter->l[0]; -+ struct bkey_s_c k = __btree_iter_peek(iter, l); -+ struct bkey_s_c u = __btree_trans_updates_peek(iter); -+ -+ if (k.k && (!u.k || bkey_cmp(k.k->p, u.k->p) < 0)) -+ return k; -+ if (u.k && bkey_cmp(u.k->p, l->b->key.k.p) <= 0) { -+ iter->k = *u.k; -+ return u; -+ } -+ return bkey_s_c_null; -+} -+ -+struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) -+{ -+ struct bkey_s_c k; -+ int ret; -+ -+ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); -+ bch2_btree_iter_checks(iter); -+ -+ while (1) { -+ ret = bch2_btree_iter_traverse(iter); -+ if (unlikely(ret)) -+ return bkey_s_c_err(ret); -+ -+ k = __bch2_btree_iter_peek_with_updates(iter); -+ -+ if (k.k && bkey_deleted(k.k)) { -+ bch2_btree_iter_set_pos(iter, -+ (iter->flags & BTREE_ITER_IS_EXTENTS) -+ ? 
iter->k.p -+ : bkey_successor(iter->k.p)); -+ continue; -+ } -+ -+ if (likely(k.k)) -+ break; -+ -+ if (!btree_iter_set_pos_to_next_leaf(iter)) -+ return bkey_s_c_null; -+ } -+ -+ /* -+ * iter->pos should always be equal to the key we just -+ * returned - except extents can straddle iter->pos: -+ */ -+ if (!(iter->flags & BTREE_ITER_IS_EXTENTS) || -+ bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) -+ iter->pos = bkey_start_pos(k.k); -+ -+ iter->uptodate = BTREE_ITER_UPTODATE; -+ return k; -+} -+ -+struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *iter) -+{ -+ if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) -+ return bkey_s_c_null; -+ -+ bch2_btree_iter_set_pos(iter, -+ (iter->flags & BTREE_ITER_IS_EXTENTS) -+ ? iter->k.p -+ : bkey_successor(iter->k.p)); -+ -+ return bch2_btree_iter_peek_with_updates(iter); -+} -+ -+/** -+ * bch2_btree_iter_peek_prev: returns first key less than or equal to -+ * iterator's current position -+ */ -+struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) -+{ -+ struct bpos pos = iter->pos; -+ struct btree_iter_level *l = &iter->l[0]; -+ struct bkey_s_c k; -+ int ret; -+ -+ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); -+ bch2_btree_iter_checks(iter); -+ -+ if (iter->uptodate == BTREE_ITER_UPTODATE && -+ !bkey_deleted(&iter->k)) -+ return btree_iter_peek_uptodate(iter); -+ -+ while (1) { -+ ret = bch2_btree_iter_traverse(iter); -+ if (unlikely(ret)) -+ return bkey_s_c_err(ret); -+ -+ k = __btree_iter_peek(iter, l); -+ if (!k.k || bkey_cmp(bkey_start_pos(k.k), pos) > 0) -+ k = __btree_iter_prev(iter, l); -+ -+ if (likely(k.k)) -+ break; -+ -+ if (!btree_iter_set_pos_to_prev_leaf(iter)) -+ return bkey_s_c_null; -+ } -+ -+ EBUG_ON(bkey_cmp(bkey_start_pos(k.k), pos) > 0); -+ iter->pos = bkey_start_pos(k.k); -+ iter->uptodate = BTREE_ITER_UPTODATE; -+ return k; -+} -+ -+/** -+ * bch2_btree_iter_prev: returns first key less than iterator's current -+ * position -+ */ -+struct bkey_s_c bch2_btree_iter_prev(struct 
btree_iter *iter) -+{ -+ struct bpos pos = bkey_start_pos(&iter->k); -+ -+ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); -+ bch2_btree_iter_checks(iter); -+ -+ if (unlikely(!bkey_cmp(pos, POS_MIN))) -+ return bkey_s_c_null; -+ -+ bch2_btree_iter_set_pos(iter, bkey_predecessor(pos)); -+ -+ return bch2_btree_iter_peek_prev(iter); -+} -+ -+static inline struct bkey_s_c -+__bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) -+{ -+ struct btree_iter_level *l = &iter->l[0]; -+ struct btree_node_iter node_iter; -+ struct bkey_s_c k; -+ struct bkey n; -+ int ret; -+ -+ /* keys & holes can't span inode numbers: */ -+ if (iter->pos.offset == KEY_OFFSET_MAX) { -+ if (iter->pos.inode == KEY_INODE_MAX) -+ return bkey_s_c_null; -+ -+ bch2_btree_iter_set_pos(iter, bkey_successor(iter->pos)); -+ -+ ret = bch2_btree_iter_traverse(iter); -+ if (unlikely(ret)) -+ return bkey_s_c_err(ret); -+ } -+ -+ /* -+ * iterator is now at the correct position for inserting at iter->pos, -+ * but we need to keep iterating until we find the first non whiteout so -+ * we know how big a hole we have, if any: -+ */ -+ -+ node_iter = l->iter; -+ k = __btree_iter_unpack(iter, l, &iter->k, -+ bch2_btree_node_iter_peek(&node_iter, l->b)); -+ -+ if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) { -+ /* -+ * We're not setting iter->uptodate because the node iterator -+ * doesn't necessarily point at the key we're returning: -+ */ -+ -+ EBUG_ON(bkey_cmp(k.k->p, iter->pos) <= 0); -+ bch2_btree_iter_verify_level(iter, 0); -+ return k; -+ } -+ -+ /* hole */ -+ -+ if (!k.k) -+ k.k = &l->b->key.k; -+ -+ bkey_init(&n); -+ n.p = iter->pos; -+ bch2_key_resize(&n, -+ min_t(u64, KEY_SIZE_MAX, -+ (k.k->p.inode == n.p.inode -+ ? 
bkey_start_offset(k.k) -+ : KEY_OFFSET_MAX) - -+ n.p.offset)); -+ -+ EBUG_ON(!n.size); -+ -+ iter->k = n; -+ iter->uptodate = BTREE_ITER_UPTODATE; -+ -+ bch2_btree_iter_verify_level(iter, 0); -+ return (struct bkey_s_c) { &iter->k, NULL }; -+} -+ -+struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) -+{ -+ struct btree_iter_level *l = &iter->l[0]; -+ struct bkey_s_c k; -+ int ret; -+ -+ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); -+ bch2_btree_iter_checks(iter); -+ -+ if (iter->uptodate == BTREE_ITER_UPTODATE) -+ return btree_iter_peek_uptodate(iter); -+ -+ ret = bch2_btree_iter_traverse(iter); -+ if (unlikely(ret)) -+ return bkey_s_c_err(ret); -+ -+ if (iter->flags & BTREE_ITER_IS_EXTENTS) -+ return __bch2_btree_iter_peek_slot_extents(iter); -+ -+ k = __btree_iter_peek_all(iter, l, &iter->k); -+ -+ EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0); -+ -+ if (!k.k || bkey_cmp(iter->pos, k.k->p)) { -+ /* hole */ -+ bkey_init(&iter->k); -+ iter->k.p = iter->pos; -+ k = (struct bkey_s_c) { &iter->k, NULL }; -+ } -+ -+ iter->uptodate = BTREE_ITER_UPTODATE; -+ bch2_btree_iter_verify_level(iter, 0); -+ return k; -+} -+ -+struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) -+{ -+ if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) -+ return bkey_s_c_null; -+ -+ bch2_btree_iter_set_pos(iter, -+ (iter->flags & BTREE_ITER_IS_EXTENTS) -+ ? 
iter->k.p -+ : bkey_successor(iter->k.p)); -+ -+ return bch2_btree_iter_peek_slot(iter); -+} -+ -+struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *iter) -+{ -+ struct bkey_cached *ck; -+ int ret; -+ -+ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_CACHED); -+ bch2_btree_iter_checks(iter); -+ -+ ret = bch2_btree_iter_traverse(iter); -+ if (unlikely(ret)) -+ return bkey_s_c_err(ret); -+ -+ ck = (void *) iter->l[0].b; -+ -+ EBUG_ON(iter->btree_id != ck->key.btree_id || -+ bkey_cmp(iter->pos, ck->key.pos)); -+ BUG_ON(!ck->valid); -+ -+ return bkey_i_to_s_c(ck->k); -+} -+ -+static inline void bch2_btree_iter_init(struct btree_trans *trans, -+ struct btree_iter *iter, enum btree_id btree_id, -+ struct bpos pos, unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ unsigned i; -+ -+ if (btree_node_type_is_extents(btree_id) && -+ !(flags & BTREE_ITER_NODES)) -+ flags |= BTREE_ITER_IS_EXTENTS; -+ -+ iter->trans = trans; -+ iter->pos = pos; -+ bkey_init(&iter->k); -+ iter->k.p = pos; -+ iter->flags = flags; -+ iter->uptodate = BTREE_ITER_NEED_TRAVERSE; -+ iter->btree_id = btree_id; -+ iter->level = 0; -+ iter->min_depth = 0; -+ iter->locks_want = flags & BTREE_ITER_INTENT ? 
1 : 0; -+ iter->nodes_locked = 0; -+ iter->nodes_intent_locked = 0; -+ for (i = 0; i < ARRAY_SIZE(iter->l); i++) -+ iter->l[i].b = BTREE_ITER_NO_NODE_INIT; -+ -+ prefetch(c->btree_roots[btree_id].b); -+} -+ -+/* new transactional stuff: */ -+ -+static inline void __bch2_trans_iter_free(struct btree_trans *trans, -+ unsigned idx) -+{ -+ __bch2_btree_iter_unlock(&trans->iters[idx]); -+ trans->iters_linked &= ~(1ULL << idx); -+ trans->iters_live &= ~(1ULL << idx); -+ trans->iters_touched &= ~(1ULL << idx); -+} -+ -+int bch2_trans_iter_put(struct btree_trans *trans, -+ struct btree_iter *iter) -+{ -+ int ret; -+ -+ if (IS_ERR_OR_NULL(iter)) -+ return 0; -+ -+ BUG_ON(trans->iters + iter->idx != iter); -+ -+ ret = btree_iter_err(iter); -+ -+ if (!(trans->iters_touched & (1ULL << iter->idx)) && -+ !(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) -+ __bch2_trans_iter_free(trans, iter->idx); -+ -+ trans->iters_live &= ~(1ULL << iter->idx); -+ return ret; -+} -+ -+int bch2_trans_iter_free(struct btree_trans *trans, -+ struct btree_iter *iter) -+{ -+ if (IS_ERR_OR_NULL(iter)) -+ return 0; -+ -+ trans->iters_touched &= ~(1ULL << iter->idx); -+ -+ return bch2_trans_iter_put(trans, iter); -+} -+ -+static int bch2_trans_realloc_iters(struct btree_trans *trans, -+ unsigned new_size) -+{ -+ void *p, *new_iters, *new_updates, *new_updates2; -+ size_t iters_bytes; -+ size_t updates_bytes; -+ -+ new_size = roundup_pow_of_two(new_size); -+ -+ BUG_ON(new_size > BTREE_ITER_MAX); -+ -+ if (new_size <= trans->size) -+ return 0; -+ -+ BUG_ON(trans->used_mempool); -+ -+ bch2_trans_unlock(trans); -+ -+ iters_bytes = sizeof(struct btree_iter) * new_size; -+ updates_bytes = sizeof(struct btree_insert_entry) * new_size; -+ -+ p = kmalloc(iters_bytes + -+ updates_bytes + -+ updates_bytes, GFP_NOFS); -+ if (p) -+ goto success; -+ -+ p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); -+ new_size = BTREE_ITER_MAX; -+ -+ trans->used_mempool = true; -+success: -+ new_iters = p; p += iters_bytes; 
-+ new_updates = p; p += updates_bytes; -+ new_updates2 = p; p += updates_bytes; -+ -+ memcpy(new_iters, trans->iters, -+ sizeof(struct btree_iter) * trans->nr_iters); -+ memcpy(new_updates, trans->updates, -+ sizeof(struct btree_insert_entry) * trans->nr_updates); -+ memcpy(new_updates2, trans->updates2, -+ sizeof(struct btree_insert_entry) * trans->nr_updates2); -+ -+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) -+ memset(trans->iters, POISON_FREE, -+ sizeof(struct btree_iter) * trans->nr_iters + -+ sizeof(struct btree_insert_entry) * trans->nr_iters); -+ -+ if (trans->iters != trans->iters_onstack) -+ kfree(trans->iters); -+ -+ trans->iters = new_iters; -+ trans->updates = new_updates; -+ trans->updates2 = new_updates2; -+ trans->size = new_size; -+ -+ if (trans->iters_live) { -+ trace_trans_restart_iters_realloced(trans->ip, trans->size); -+ return -EINTR; -+ } -+ -+ return 0; -+} -+ -+static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) -+{ -+ unsigned idx = __ffs64(~trans->iters_linked); -+ -+ if (idx < trans->nr_iters) -+ goto got_slot; -+ -+ if (trans->nr_iters == trans->size) { -+ int ret; -+ -+ if (trans->nr_iters >= BTREE_ITER_MAX) { -+ struct btree_iter *iter; -+ -+ trans_for_each_iter(trans, iter) { -+ pr_err("iter: btree %s pos %llu:%llu%s%s%s %ps", -+ bch2_btree_ids[iter->btree_id], -+ iter->pos.inode, -+ iter->pos.offset, -+ (trans->iters_live & (1ULL << iter->idx)) ? " live" : "", -+ (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", -+ iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? 
" keep" : "", -+ (void *) iter->ip_allocated); -+ } -+ -+ panic("trans iter oveflow\n"); -+ } -+ -+ ret = bch2_trans_realloc_iters(trans, trans->size * 2); -+ if (ret) -+ return ERR_PTR(ret); -+ } -+ -+ idx = trans->nr_iters++; -+ BUG_ON(trans->nr_iters > trans->size); -+ -+ trans->iters[idx].idx = idx; -+got_slot: -+ BUG_ON(trans->iters_linked & (1ULL << idx)); -+ trans->iters_linked |= 1ULL << idx; -+ trans->iters[idx].flags = 0; -+ return &trans->iters[idx]; -+} -+ -+static inline void btree_iter_copy(struct btree_iter *dst, -+ struct btree_iter *src) -+{ -+ unsigned i, idx = dst->idx; -+ -+ *dst = *src; -+ dst->idx = idx; -+ dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; -+ -+ for (i = 0; i < BTREE_MAX_DEPTH; i++) -+ if (btree_node_locked(dst, i)) -+ six_lock_increment(&dst->l[i].b->c.lock, -+ __btree_lock_want(dst, i)); -+ -+ dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; -+ dst->flags &= ~BTREE_ITER_SET_POS_AFTER_COMMIT; -+} -+ -+static inline struct bpos bpos_diff(struct bpos l, struct bpos r) -+{ -+ if (bkey_cmp(l, r) > 0) -+ swap(l, r); -+ -+ return POS(r.inode - l.inode, r.offset - l.offset); -+} -+ -+static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, -+ unsigned btree_id, struct bpos pos, -+ unsigned flags) -+{ -+ struct btree_iter *iter, *best = NULL; -+ -+ BUG_ON(trans->nr_iters > BTREE_ITER_MAX); -+ -+ trans_for_each_iter(trans, iter) { -+ if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE)) -+ continue; -+ -+ if (iter->btree_id != btree_id) -+ continue; -+ -+ if (best && -+ bkey_cmp(bpos_diff(best->pos, pos), -+ bpos_diff(iter->pos, pos)) < 0) -+ continue; -+ -+ best = iter; -+ } -+ -+ if (!best) { -+ iter = btree_trans_iter_alloc(trans); -+ if (IS_ERR(iter)) -+ return iter; -+ -+ bch2_btree_iter_init(trans, iter, btree_id, pos, flags); -+ } else if ((trans->iters_live & (1ULL << best->idx)) || -+ (best->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) { -+ iter = btree_trans_iter_alloc(trans); -+ if (IS_ERR(iter)) -+ return iter; -+ -+ 
btree_iter_copy(iter, best); -+ } else { -+ iter = best; -+ } -+ -+ iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; -+ iter->flags &= ~BTREE_ITER_USER_FLAGS; -+ iter->flags |= flags & BTREE_ITER_USER_FLAGS; -+ -+ if (iter->flags & BTREE_ITER_INTENT) -+ bch2_btree_iter_upgrade(iter, 1); -+ else -+ bch2_btree_iter_downgrade(iter); -+ -+ BUG_ON(iter->btree_id != btree_id); -+ BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE); -+ BUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); -+ BUG_ON(iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT); -+ BUG_ON(trans->iters_live & (1ULL << iter->idx)); -+ -+ trans->iters_live |= 1ULL << iter->idx; -+ trans->iters_touched |= 1ULL << iter->idx; -+ -+ return iter; -+} -+ -+struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, -+ enum btree_id btree_id, -+ struct bpos pos, unsigned flags) -+{ -+ struct btree_iter *iter = -+ __btree_trans_get_iter(trans, btree_id, pos, flags); -+ -+ if (!IS_ERR(iter)) -+ __bch2_btree_iter_set_pos(iter, pos, -+ btree_node_type_is_extents(btree_id)); -+ return iter; -+} -+ -+struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, -+ enum btree_id btree_id, -+ struct bpos pos, -+ unsigned locks_want, -+ unsigned depth, -+ unsigned flags) -+{ -+ struct btree_iter *iter = -+ __btree_trans_get_iter(trans, btree_id, pos, -+ flags|BTREE_ITER_NODES); -+ unsigned i; -+ -+ BUG_ON(IS_ERR(iter)); -+ BUG_ON(bkey_cmp(iter->pos, pos)); -+ -+ iter->locks_want = locks_want; -+ iter->level = depth; -+ iter->min_depth = depth; -+ -+ for (i = 0; i < ARRAY_SIZE(iter->l); i++) -+ iter->l[i].b = NULL; -+ iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT; -+ -+ return iter; -+} -+ -+struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans, -+ struct btree_iter *src) -+{ -+ struct btree_iter *iter; -+ -+ iter = btree_trans_iter_alloc(trans); -+ if (IS_ERR(iter)) -+ return iter; -+ -+ btree_iter_copy(iter, src); -+ -+ trans->iters_live |= 1ULL << iter->idx; -+ /* -+ * We don't need to preserve 
this iter since it's cheap to copy it -+ * again - this will cause trans_iter_put() to free it right away: -+ */ -+ trans->iters_touched &= ~(1ULL << iter->idx); -+ -+ return iter; -+} -+ -+static int bch2_trans_preload_mem(struct btree_trans *trans, size_t size) -+{ -+ if (size > trans->mem_bytes) { -+ size_t old_bytes = trans->mem_bytes; -+ size_t new_bytes = roundup_pow_of_two(size); -+ void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS); -+ -+ if (!new_mem) -+ return -ENOMEM; -+ -+ trans->mem = new_mem; -+ trans->mem_bytes = new_bytes; -+ -+ if (old_bytes) { -+ trace_trans_restart_mem_realloced(trans->ip, new_bytes); -+ return -EINTR; -+ } -+ } -+ -+ return 0; -+} -+ -+void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) -+{ -+ void *p; -+ int ret; -+ -+ ret = bch2_trans_preload_mem(trans, trans->mem_top + size); -+ if (ret) -+ return ERR_PTR(ret); -+ -+ p = trans->mem + trans->mem_top; -+ trans->mem_top += size; -+ return p; -+} -+ -+inline void bch2_trans_unlink_iters(struct btree_trans *trans) -+{ -+ u64 iters = trans->iters_linked & -+ ~trans->iters_touched & -+ ~trans->iters_live; -+ -+ while (iters) { -+ unsigned idx = __ffs64(iters); -+ -+ iters &= ~(1ULL << idx); -+ __bch2_trans_iter_free(trans, idx); -+ } -+} -+ -+void bch2_trans_reset(struct btree_trans *trans, unsigned flags) -+{ -+ struct btree_iter *iter; -+ -+ trans_for_each_iter(trans, iter) -+ iter->flags &= ~(BTREE_ITER_KEEP_UNTIL_COMMIT| -+ BTREE_ITER_SET_POS_AFTER_COMMIT); -+ -+ bch2_trans_unlink_iters(trans); -+ -+ trans->iters_touched &= trans->iters_live; -+ -+ trans->need_reset = 0; -+ trans->nr_updates = 0; -+ trans->nr_updates2 = 0; -+ trans->mem_top = 0; -+ -+ trans->extra_journal_entries = NULL; -+ trans->extra_journal_entry_u64s = 0; -+ -+ if (trans->fs_usage_deltas) { -+ trans->fs_usage_deltas->used = 0; -+ memset(&trans->fs_usage_deltas->memset_start, 0, -+ (void *) &trans->fs_usage_deltas->memset_end - -+ (void *) &trans->fs_usage_deltas->memset_start); -+ } -+ 
-+ if (!(flags & TRANS_RESET_NOTRAVERSE)) -+ bch2_btree_iter_traverse_all(trans); -+} -+ -+void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, -+ unsigned expected_nr_iters, -+ size_t expected_mem_bytes) -+{ -+ memset(trans, 0, offsetof(struct btree_trans, iters_onstack)); -+ -+ /* -+ * reallocating iterators currently completely breaks -+ * bch2_trans_iter_put(): -+ */ -+ expected_nr_iters = BTREE_ITER_MAX; -+ -+ trans->c = c; -+ trans->ip = _RET_IP_; -+ trans->size = ARRAY_SIZE(trans->iters_onstack); -+ trans->iters = trans->iters_onstack; -+ trans->updates = trans->updates_onstack; -+ trans->updates2 = trans->updates2_onstack; -+ trans->fs_usage_deltas = NULL; -+ -+ if (expected_nr_iters > trans->size) -+ bch2_trans_realloc_iters(trans, expected_nr_iters); -+ -+ if (expected_mem_bytes) -+ bch2_trans_preload_mem(trans, expected_mem_bytes); -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ trans->pid = current->pid; -+ mutex_lock(&c->btree_trans_lock); -+ list_add(&trans->list, &c->btree_trans_list); -+ mutex_unlock(&c->btree_trans_lock); -+#endif -+} -+ -+int bch2_trans_exit(struct btree_trans *trans) -+{ -+ bch2_trans_unlock(trans); -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ mutex_lock(&trans->c->btree_trans_lock); -+ list_del(&trans->list); -+ mutex_unlock(&trans->c->btree_trans_lock); -+#endif -+ -+ bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); -+ -+ kfree(trans->fs_usage_deltas); -+ kfree(trans->mem); -+ if (trans->used_mempool) -+ mempool_free(trans->iters, &trans->c->btree_iters_pool); -+ else if (trans->iters != trans->iters_onstack) -+ kfree(trans->iters); -+ trans->mem = (void *) 0x1; -+ trans->iters = (void *) 0x1; -+ -+ return trans->error ? 
-EIO : 0; -+} -+ -+static void bch2_btree_iter_node_to_text(struct printbuf *out, -+ struct btree_bkey_cached_common *_b, -+ enum btree_iter_type type) -+{ -+ pr_buf(out, " %px l=%u %s:", -+ _b, _b->level, bch2_btree_ids[_b->btree_id]); -+ bch2_bpos_to_text(out, btree_node_pos(_b, type)); -+} -+ -+void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ struct btree_trans *trans; -+ struct btree_iter *iter; -+ struct btree *b; -+ unsigned l; -+ -+ mutex_lock(&c->btree_trans_lock); -+ list_for_each_entry(trans, &c->btree_trans_list, list) { -+ pr_buf(out, "%i %px %ps\n", trans->pid, trans, (void *) trans->ip); -+ -+ trans_for_each_iter(trans, iter) { -+ if (!iter->nodes_locked) -+ continue; -+ -+ pr_buf(out, " iter %u %s:", -+ iter->idx, -+ bch2_btree_ids[iter->btree_id]); -+ bch2_bpos_to_text(out, iter->pos); -+ pr_buf(out, "\n"); -+ -+ for (l = 0; l < BTREE_MAX_DEPTH; l++) { -+ if (btree_node_locked(iter, l)) { -+ pr_buf(out, " %s l=%u ", -+ btree_node_intent_locked(iter, l) ? 
"i" : "r", l); -+ bch2_btree_iter_node_to_text(out, -+ (void *) iter->l[l].b, -+ btree_iter_type(iter)); -+ pr_buf(out, "\n"); -+ } -+ } -+ } -+ -+ b = READ_ONCE(trans->locking); -+ if (b) { -+ pr_buf(out, " locking iter %u l=%u %s:", -+ trans->locking_iter_idx, -+ trans->locking_level, -+ bch2_btree_ids[trans->locking_btree_id]); -+ bch2_bpos_to_text(out, trans->locking_pos); -+ -+ -+ pr_buf(out, " node "); -+ bch2_btree_iter_node_to_text(out, -+ (void *) b, -+ btree_iter_type(&trans->iters[trans->locking_iter_idx])); -+ pr_buf(out, "\n"); -+ } -+ } -+ mutex_unlock(&c->btree_trans_lock); -+#endif -+} -+ -+void bch2_fs_btree_iter_exit(struct bch_fs *c) -+{ -+ mempool_exit(&c->btree_iters_pool); -+} -+ -+int bch2_fs_btree_iter_init(struct bch_fs *c) -+{ -+ unsigned nr = BTREE_ITER_MAX; -+ -+ INIT_LIST_HEAD(&c->btree_trans_list); -+ mutex_init(&c->btree_trans_lock); -+ -+ return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, -+ sizeof(struct btree_iter) * nr + -+ sizeof(struct btree_insert_entry) * nr + -+ sizeof(struct btree_insert_entry) * nr); -+} -diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h -new file mode 100644 -index 000000000000..bd9ec3ec9a92 ---- /dev/null -+++ b/fs/bcachefs/btree_iter.h -@@ -0,0 +1,314 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BTREE_ITER_H -+#define _BCACHEFS_BTREE_ITER_H -+ -+#include "bset.h" -+#include "btree_types.h" -+ -+static inline void btree_iter_set_dirty(struct btree_iter *iter, -+ enum btree_iter_uptodate u) -+{ -+ iter->uptodate = max_t(unsigned, iter->uptodate, u); -+} -+ -+static inline struct btree *btree_iter_node(struct btree_iter *iter, -+ unsigned level) -+{ -+ return level < BTREE_MAX_DEPTH ? 
iter->l[level].b : NULL; -+} -+ -+static inline bool btree_node_lock_seq_matches(const struct btree_iter *iter, -+ const struct btree *b, unsigned level) -+{ -+ /* -+ * We don't compare the low bits of the lock sequence numbers because -+ * @iter might have taken a write lock on @b, and we don't want to skip -+ * the linked iterator if the sequence numbers were equal before taking -+ * that write lock. The lock sequence number is incremented by taking -+ * and releasing write locks and is even when unlocked: -+ */ -+ return iter->l[level].lock_seq >> 1 == b->c.lock.state.seq >> 1; -+} -+ -+static inline struct btree *btree_node_parent(struct btree_iter *iter, -+ struct btree *b) -+{ -+ return btree_iter_node(iter, b->c.level + 1); -+} -+ -+static inline bool btree_trans_has_multiple_iters(const struct btree_trans *trans) -+{ -+ return hweight64(trans->iters_linked) > 1; -+} -+ -+static inline int btree_iter_err(const struct btree_iter *iter) -+{ -+ return iter->flags & BTREE_ITER_ERROR ? 
-EIO : 0; -+} -+ -+/* Iterate over iters within a transaction: */ -+ -+#define trans_for_each_iter_all(_trans, _iter) \ -+ for (_iter = (_trans)->iters; \ -+ _iter < (_trans)->iters + (_trans)->nr_iters; \ -+ _iter++) -+ -+static inline struct btree_iter * -+__trans_next_iter(struct btree_trans *trans, unsigned idx) -+{ -+ EBUG_ON(idx < trans->nr_iters && trans->iters[idx].idx != idx); -+ -+ for (; idx < trans->nr_iters; idx++) -+ if (trans->iters_linked & (1ULL << idx)) -+ return &trans->iters[idx]; -+ -+ return NULL; -+} -+ -+#define trans_for_each_iter(_trans, _iter) \ -+ for (_iter = __trans_next_iter((_trans), 0); \ -+ (_iter); \ -+ _iter = __trans_next_iter((_trans), (_iter)->idx + 1)) -+ -+static inline bool __iter_has_node(const struct btree_iter *iter, -+ const struct btree *b) -+{ -+ return iter->l[b->c.level].b == b && -+ btree_node_lock_seq_matches(iter, b, b->c.level); -+} -+ -+static inline struct btree_iter * -+__trans_next_iter_with_node(struct btree_trans *trans, struct btree *b, -+ unsigned idx) -+{ -+ struct btree_iter *iter = __trans_next_iter(trans, idx); -+ -+ while (iter && !__iter_has_node(iter, b)) -+ iter = __trans_next_iter(trans, iter->idx + 1); -+ -+ return iter; -+} -+ -+#define trans_for_each_iter_with_node(_trans, _b, _iter) \ -+ for (_iter = __trans_next_iter_with_node((_trans), (_b), 0); \ -+ (_iter); \ -+ _iter = __trans_next_iter_with_node((_trans), (_b), \ -+ (_iter)->idx + 1)) -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_btree_trans_verify_iters(struct btree_trans *, struct btree *); -+void bch2_btree_trans_verify_locks(struct btree_trans *); -+#else -+static inline void bch2_btree_trans_verify_iters(struct btree_trans *trans, -+ struct btree *b) {} -+static inline void bch2_btree_trans_verify_locks(struct btree_trans *iter) {} -+#endif -+ -+void bch2_btree_iter_fix_key_modified(struct btree_iter *, struct btree *, -+ struct bkey_packed *); -+void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *, -+ struct 
btree_node_iter *, struct bkey_packed *, -+ unsigned, unsigned); -+ -+bool bch2_btree_iter_relock(struct btree_iter *, bool); -+bool bch2_trans_relock(struct btree_trans *); -+void bch2_trans_unlock(struct btree_trans *); -+ -+bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned); -+bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *, unsigned); -+ -+static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter, -+ unsigned new_locks_want) -+{ -+ new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); -+ -+ return iter->locks_want < new_locks_want -+ ? (!iter->trans->nounlock -+ ? __bch2_btree_iter_upgrade(iter, new_locks_want) -+ : __bch2_btree_iter_upgrade_nounlock(iter, new_locks_want)) -+ : iter->uptodate <= BTREE_ITER_NEED_PEEK; -+} -+ -+void __bch2_btree_iter_downgrade(struct btree_iter *, unsigned); -+ -+static inline void bch2_btree_iter_downgrade(struct btree_iter *iter) -+{ -+ if (iter->locks_want > (iter->flags & BTREE_ITER_INTENT) ? 1 : 0) -+ __bch2_btree_iter_downgrade(iter, 0); -+} -+ -+void bch2_trans_downgrade(struct btree_trans *); -+ -+void bch2_btree_iter_node_replace(struct btree_iter *, struct btree *); -+void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *); -+ -+void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *); -+ -+int __must_check __bch2_btree_iter_traverse(struct btree_iter *); -+ -+static inline int __must_check -+bch2_btree_iter_traverse(struct btree_iter *iter) -+{ -+ return iter->uptodate >= BTREE_ITER_NEED_RELOCK -+ ? 
__bch2_btree_iter_traverse(iter) -+ : 0; -+} -+ -+int bch2_btree_iter_traverse_all(struct btree_trans *); -+ -+struct btree *bch2_btree_iter_peek_node(struct btree_iter *); -+struct btree *bch2_btree_iter_next_node(struct btree_iter *); -+ -+struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *); -+struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); -+ -+struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *); -+struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *); -+ -+struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *); -+struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); -+ -+struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); -+struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *); -+ -+struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *); -+ -+void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos); -+void __bch2_btree_iter_set_pos(struct btree_iter *, struct bpos, bool); -+void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos); -+ -+static inline int btree_iter_cmp(const struct btree_iter *l, -+ const struct btree_iter *r) -+{ -+ return cmp_int(l->btree_id, r->btree_id) ?: -+ -cmp_int(btree_iter_type(l), btree_iter_type(r)) ?: -+ bkey_cmp(l->pos, r->pos); -+} -+ -+/* -+ * Unlocks before scheduling -+ * Note: does not revalidate iterator -+ */ -+static inline int bch2_trans_cond_resched(struct btree_trans *trans) -+{ -+ if (need_resched() || race_fault()) { -+ bch2_trans_unlock(trans); -+ schedule(); -+ return bch2_trans_relock(trans) ? 
0 : -EINTR; -+ } else { -+ return 0; -+ } -+} -+ -+#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \ -+ _locks_want, _depth, _flags, _b) \ -+ for (iter = bch2_trans_get_node_iter((_trans), (_btree_id), \ -+ _start, _locks_want, _depth, _flags), \ -+ _b = bch2_btree_iter_peek_node(_iter); \ -+ (_b); \ -+ (_b) = bch2_btree_iter_next_node(_iter)) -+ -+#define for_each_btree_node(_trans, _iter, _btree_id, _start, \ -+ _flags, _b) \ -+ __for_each_btree_node(_trans, _iter, _btree_id, _start, \ -+ 0, 0, _flags, _b) -+ -+static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, -+ unsigned flags) -+{ -+ if ((flags & BTREE_ITER_TYPE) == BTREE_ITER_CACHED) -+ return bch2_btree_iter_peek_cached(iter); -+ else -+ return flags & BTREE_ITER_SLOTS -+ ? bch2_btree_iter_peek_slot(iter) -+ : bch2_btree_iter_peek(iter); -+} -+ -+static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter, -+ unsigned flags) -+{ -+ return flags & BTREE_ITER_SLOTS -+ ? bch2_btree_iter_next_slot(iter) -+ : bch2_btree_iter_next(iter); -+} -+ -+static inline int bkey_err(struct bkey_s_c k) -+{ -+ return PTR_ERR_OR_ZERO(k.k); -+} -+ -+#define for_each_btree_key(_trans, _iter, _btree_id, \ -+ _start, _flags, _k, _ret) \ -+ for ((_ret) = PTR_ERR_OR_ZERO((_iter) = \ -+ bch2_trans_get_iter((_trans), (_btree_id), \ -+ (_start), (_flags))) ?: \ -+ PTR_ERR_OR_ZERO(((_k) = \ -+ __bch2_btree_iter_peek(_iter, _flags)).k); \ -+ !_ret && (_k).k; \ -+ (_ret) = PTR_ERR_OR_ZERO(((_k) = \ -+ __bch2_btree_iter_next(_iter, _flags)).k)) -+ -+#define for_each_btree_key_continue(_iter, _flags, _k, _ret) \ -+ for ((_k) = __bch2_btree_iter_peek(_iter, _flags); \ -+ !((_ret) = bkey_err(_k)) && (_k).k; \ -+ (_k) = __bch2_btree_iter_next(_iter, _flags)) -+ -+/* new multiple iterator interface: */ -+ -+int bch2_trans_iter_put(struct btree_trans *, struct btree_iter *); -+int bch2_trans_iter_free(struct btree_trans *, struct btree_iter *); -+ -+void bch2_trans_unlink_iters(struct 
btree_trans *); -+ -+struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id, -+ struct bpos, unsigned); -+ -+static inline struct btree_iter * -+bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id, -+ struct bpos pos, unsigned flags) -+{ -+ struct btree_iter *iter = -+ __bch2_trans_get_iter(trans, btree_id, pos, flags); -+ -+ if (!IS_ERR(iter)) -+ iter->ip_allocated = _THIS_IP_; -+ return iter; -+} -+ -+struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *, -+ struct btree_iter *); -+static inline struct btree_iter * -+bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src) -+{ -+ struct btree_iter *iter = -+ __bch2_trans_copy_iter(trans, src); -+ -+ if (!IS_ERR(iter)) -+ iter->ip_allocated = _THIS_IP_; -+ return iter; -+ -+} -+ -+struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *, -+ enum btree_id, struct bpos, -+ unsigned, unsigned, unsigned); -+ -+#define TRANS_RESET_NOTRAVERSE (1 << 0) -+ -+void bch2_trans_reset(struct btree_trans *, unsigned); -+ -+static inline void bch2_trans_begin(struct btree_trans *trans) -+{ -+ return bch2_trans_reset(trans, 0); -+} -+ -+void *bch2_trans_kmalloc(struct btree_trans *, size_t); -+void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t); -+int bch2_trans_exit(struct btree_trans *); -+ -+void bch2_btree_trans_to_text(struct printbuf *, struct bch_fs *); -+ -+void bch2_fs_btree_iter_exit(struct bch_fs *); -+int bch2_fs_btree_iter_init(struct bch_fs *); -+ -+#endif /* _BCACHEFS_BTREE_ITER_H */ -diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c -new file mode 100644 -index 000000000000..61662750dfc0 ---- /dev/null -+++ b/fs/bcachefs/btree_key_cache.c -@@ -0,0 +1,519 @@ -+ -+#include "bcachefs.h" -+#include "btree_cache.h" -+#include "btree_iter.h" -+#include "btree_key_cache.h" -+#include "btree_locking.h" -+#include "btree_update.h" -+#include "error.h" -+#include "journal.h" -+#include 
"journal_reclaim.h" -+ -+#include -+ -+static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, -+ const void *obj) -+{ -+ const struct bkey_cached *ck = obj; -+ const struct bkey_cached_key *key = arg->key; -+ -+ return cmp_int(ck->key.btree_id, key->btree_id) ?: -+ bkey_cmp(ck->key.pos, key->pos); -+} -+ -+static const struct rhashtable_params bch2_btree_key_cache_params = { -+ .head_offset = offsetof(struct bkey_cached, hash), -+ .key_offset = offsetof(struct bkey_cached, key), -+ .key_len = sizeof(struct bkey_cached_key), -+ .obj_cmpfn = bch2_btree_key_cache_cmp_fn, -+}; -+ -+__flatten -+static inline struct bkey_cached * -+btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos) -+{ -+ struct bkey_cached_key key = { -+ .btree_id = btree_id, -+ .pos = pos, -+ }; -+ -+ return rhashtable_lookup_fast(&c->btree_key_cache.table, &key, -+ bch2_btree_key_cache_params); -+} -+ -+static bool bkey_cached_lock_for_evict(struct bkey_cached *ck) -+{ -+ if (!six_trylock_intent(&ck->c.lock)) -+ return false; -+ -+ if (!six_trylock_write(&ck->c.lock)) { -+ six_unlock_intent(&ck->c.lock); -+ return false; -+ } -+ -+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { -+ six_unlock_write(&ck->c.lock); -+ six_unlock_intent(&ck->c.lock); -+ return false; -+ } -+ -+ return true; -+} -+ -+static void bkey_cached_evict(struct btree_key_cache *c, -+ struct bkey_cached *ck) -+{ -+ BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash, -+ bch2_btree_key_cache_params)); -+ memset(&ck->key, ~0, sizeof(ck->key)); -+} -+ -+static void bkey_cached_free(struct btree_key_cache *c, -+ struct bkey_cached *ck) -+{ -+ list_move(&ck->list, &c->freed); -+ -+ kfree(ck->k); -+ ck->k = NULL; -+ ck->u64s = 0; -+ -+ six_unlock_write(&ck->c.lock); -+ six_unlock_intent(&ck->c.lock); -+} -+ -+static struct bkey_cached * -+bkey_cached_alloc(struct btree_key_cache *c) -+{ -+ struct bkey_cached *ck; -+ -+ list_for_each_entry(ck, &c->freed, list) -+ if 
(bkey_cached_lock_for_evict(ck)) -+ return ck; -+ -+ list_for_each_entry(ck, &c->clean, list) -+ if (bkey_cached_lock_for_evict(ck)) { -+ bkey_cached_evict(c, ck); -+ return ck; -+ } -+ -+ ck = kzalloc(sizeof(*ck), GFP_NOFS); -+ if (!ck) -+ return NULL; -+ -+ INIT_LIST_HEAD(&ck->list); -+ six_lock_init(&ck->c.lock); -+ BUG_ON(!six_trylock_intent(&ck->c.lock)); -+ BUG_ON(!six_trylock_write(&ck->c.lock)); -+ -+ return ck; -+} -+ -+static struct bkey_cached * -+btree_key_cache_create(struct btree_key_cache *c, -+ enum btree_id btree_id, -+ struct bpos pos) -+{ -+ struct bkey_cached *ck; -+ -+ ck = bkey_cached_alloc(c); -+ if (!ck) -+ return ERR_PTR(-ENOMEM); -+ -+ ck->c.level = 0; -+ ck->c.btree_id = btree_id; -+ ck->key.btree_id = btree_id; -+ ck->key.pos = pos; -+ ck->valid = false; -+ -+ BUG_ON(ck->flags); -+ -+ if (rhashtable_lookup_insert_fast(&c->table, -+ &ck->hash, -+ bch2_btree_key_cache_params)) { -+ /* We raced with another fill: */ -+ bkey_cached_free(c, ck); -+ return NULL; -+ } -+ -+ list_move(&ck->list, &c->clean); -+ six_unlock_write(&ck->c.lock); -+ -+ return ck; -+} -+ -+static int btree_key_cache_fill(struct btree_trans *trans, -+ struct btree_iter *ck_iter, -+ struct bkey_cached *ck) -+{ -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ unsigned new_u64s = 0; -+ struct bkey_i *new_k = NULL; -+ int ret; -+ -+ iter = bch2_trans_get_iter(trans, ck->key.btree_id, -+ ck->key.pos, BTREE_ITER_SLOTS); -+ if (IS_ERR(iter)) -+ return PTR_ERR(iter); -+ -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) { -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+ } -+ -+ if (!bch2_btree_node_relock(ck_iter, 0)) { -+ bch2_trans_iter_put(trans, iter); -+ trace_transaction_restart_ip(trans->ip, _THIS_IP_); -+ return -EINTR; -+ } -+ -+ if (k.k->u64s > ck->u64s) { -+ new_u64s = roundup_pow_of_two(k.k->u64s); -+ new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS); -+ if (!new_k) { -+ bch2_trans_iter_put(trans, iter); -+ return -ENOMEM; -+ } -+ } 
-+ -+ bch2_btree_node_lock_write(ck_iter->l[0].b, ck_iter); -+ if (new_k) { -+ kfree(ck->k); -+ ck->u64s = new_u64s; -+ ck->k = new_k; -+ } -+ -+ bkey_reassemble(ck->k, k); -+ ck->valid = true; -+ bch2_btree_node_unlock_write(ck_iter->l[0].b, ck_iter); -+ -+ /* We're not likely to need this iterator again: */ -+ bch2_trans_iter_free(trans, iter); -+ -+ return 0; -+} -+ -+static int bkey_cached_check_fn(struct six_lock *lock, void *p) -+{ -+ struct bkey_cached *ck = container_of(lock, struct bkey_cached, c.lock); -+ const struct btree_iter *iter = p; -+ -+ return ck->key.btree_id == iter->btree_id && -+ !bkey_cmp(ck->key.pos, iter->pos) ? 0 : -1; -+} -+ -+int bch2_btree_iter_traverse_cached(struct btree_iter *iter) -+{ -+ struct btree_trans *trans = iter->trans; -+ struct bch_fs *c = trans->c; -+ struct bkey_cached *ck; -+ int ret = 0; -+ -+ BUG_ON(iter->level); -+ -+ if (btree_node_locked(iter, 0)) { -+ ck = (void *) iter->l[0].b; -+ goto fill; -+ } -+retry: -+ ck = btree_key_cache_find(c, iter->btree_id, iter->pos); -+ if (!ck) { -+ if (iter->flags & BTREE_ITER_CACHED_NOCREATE) { -+ iter->l[0].b = NULL; -+ return 0; -+ } -+ -+ mutex_lock(&c->btree_key_cache.lock); -+ ck = btree_key_cache_create(&c->btree_key_cache, -+ iter->btree_id, iter->pos); -+ mutex_unlock(&c->btree_key_cache.lock); -+ -+ ret = PTR_ERR_OR_ZERO(ck); -+ if (ret) -+ goto err; -+ if (!ck) -+ goto retry; -+ -+ mark_btree_node_locked(iter, 0, SIX_LOCK_intent); -+ iter->locks_want = 1; -+ } else { -+ enum six_lock_type lock_want = __btree_lock_want(iter, 0); -+ -+ if (!btree_node_lock((void *) ck, iter->pos, 0, iter, lock_want, -+ bkey_cached_check_fn, iter)) { -+ if (ck->key.btree_id != iter->btree_id || -+ bkey_cmp(ck->key.pos, iter->pos)) { -+ goto retry; -+ } -+ -+ trace_transaction_restart_ip(trans->ip, _THIS_IP_); -+ ret = -EINTR; -+ goto err; -+ } -+ -+ if (ck->key.btree_id != iter->btree_id || -+ bkey_cmp(ck->key.pos, iter->pos)) { -+ six_unlock_type(&ck->c.lock, lock_want); -+ goto retry; 
-+ } -+ -+ mark_btree_node_locked(iter, 0, lock_want); -+ } -+ -+ iter->l[0].lock_seq = ck->c.lock.state.seq; -+ iter->l[0].b = (void *) ck; -+fill: -+ if (!ck->valid && !(iter->flags & BTREE_ITER_CACHED_NOFILL)) { -+ if (!btree_node_intent_locked(iter, 0)) -+ bch2_btree_iter_upgrade(iter, 1); -+ if (!btree_node_intent_locked(iter, 0)) { -+ trace_transaction_restart_ip(trans->ip, _THIS_IP_); -+ ret = -EINTR; -+ goto err; -+ } -+ -+ ret = btree_key_cache_fill(trans, iter, ck); -+ if (ret) -+ goto err; -+ } -+ -+ iter->uptodate = BTREE_ITER_NEED_PEEK; -+ bch2_btree_iter_downgrade(iter); -+ return ret; -+err: -+ if (ret != -EINTR) { -+ btree_node_unlock(iter, 0); -+ iter->flags |= BTREE_ITER_ERROR; -+ iter->l[0].b = BTREE_ITER_NO_NODE_ERROR; -+ } -+ return ret; -+} -+ -+static int btree_key_cache_flush_pos(struct btree_trans *trans, -+ struct bkey_cached_key key, -+ u64 journal_seq, -+ bool evict) -+{ -+ struct bch_fs *c = trans->c; -+ struct journal *j = &c->journal; -+ struct btree_iter *c_iter = NULL, *b_iter = NULL; -+ struct bkey_cached *ck; -+ int ret; -+ -+ b_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos, -+ BTREE_ITER_SLOTS| -+ BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(b_iter); -+ if (ret) -+ goto out; -+ -+ c_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos, -+ BTREE_ITER_CACHED| -+ BTREE_ITER_CACHED_NOFILL| -+ BTREE_ITER_CACHED_NOCREATE| -+ BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(c_iter); -+ if (ret) -+ goto out; -+retry: -+ ret = bch2_btree_iter_traverse(c_iter); -+ if (ret) -+ goto err; -+ -+ ck = (void *) c_iter->l[0].b; -+ if (!ck || -+ (journal_seq && ck->journal.seq != journal_seq)) -+ goto out; -+ -+ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { -+ if (!evict) -+ goto out; -+ goto evict; -+ } -+ -+ ret = bch2_btree_iter_traverse(b_iter) ?: -+ bch2_trans_update(trans, b_iter, ck->k, BTREE_TRIGGER_NORUN) ?: -+ bch2_trans_commit(trans, NULL, NULL, -+ BTREE_INSERT_NOUNLOCK| -+ BTREE_INSERT_NOCHECK_RW| -+ BTREE_INSERT_NOFAIL| -+ 
BTREE_INSERT_USE_RESERVE| -+ BTREE_INSERT_USE_ALLOC_RESERVE| -+ BTREE_INSERT_JOURNAL_RESERVED| -+ BTREE_INSERT_JOURNAL_RECLAIM); -+err: -+ if (ret == -EINTR) -+ goto retry; -+ -+ BUG_ON(ret && !bch2_journal_error(j)); -+ -+ if (ret) -+ goto out; -+ -+ bch2_journal_pin_drop(j, &ck->journal); -+ bch2_journal_preres_put(j, &ck->res); -+ clear_bit(BKEY_CACHED_DIRTY, &ck->flags); -+ -+ if (!evict) { -+ mutex_lock(&c->btree_key_cache.lock); -+ list_move_tail(&ck->list, &c->btree_key_cache.clean); -+ mutex_unlock(&c->btree_key_cache.lock); -+ } else { -+evict: -+ BUG_ON(!btree_node_intent_locked(c_iter, 0)); -+ -+ mark_btree_node_unlocked(c_iter, 0); -+ c_iter->l[0].b = NULL; -+ -+ six_lock_write(&ck->c.lock, NULL, NULL); -+ -+ mutex_lock(&c->btree_key_cache.lock); -+ bkey_cached_evict(&c->btree_key_cache, ck); -+ bkey_cached_free(&c->btree_key_cache, ck); -+ mutex_unlock(&c->btree_key_cache.lock); -+ } -+out: -+ bch2_trans_iter_put(trans, b_iter); -+ bch2_trans_iter_put(trans, c_iter); -+ return ret; -+} -+ -+static void btree_key_cache_journal_flush(struct journal *j, -+ struct journal_entry_pin *pin, -+ u64 seq) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct bkey_cached *ck = -+ container_of(pin, struct bkey_cached, journal); -+ struct bkey_cached_key key; -+ struct btree_trans trans; -+ -+ six_lock_read(&ck->c.lock, NULL, NULL); -+ key = ck->key; -+ -+ if (ck->journal.seq != seq || -+ !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { -+ six_unlock_read(&ck->c.lock); -+ return; -+ } -+ six_unlock_read(&ck->c.lock); -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ btree_key_cache_flush_pos(&trans, key, seq, false); -+ bch2_trans_exit(&trans); -+} -+ -+/* -+ * Flush and evict a key from the key cache: -+ */ -+int bch2_btree_key_cache_flush(struct btree_trans *trans, -+ enum btree_id id, struct bpos pos) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_cached_key key = { id, pos }; -+ -+ /* Fastpath - assume it won't be found: */ -+ if 
(!btree_key_cache_find(c, id, pos)) -+ return 0; -+ -+ return btree_key_cache_flush_pos(trans, key, 0, true); -+} -+ -+bool bch2_btree_insert_key_cached(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_i *insert) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_cached *ck = (void *) iter->l[0].b; -+ -+ BUG_ON(insert->u64s > ck->u64s); -+ -+ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { -+ int difference; -+ -+ BUG_ON(jset_u64s(insert->u64s) > trans->journal_preres.u64s); -+ -+ difference = jset_u64s(insert->u64s) - ck->res.u64s; -+ if (difference > 0) { -+ trans->journal_preres.u64s -= difference; -+ ck->res.u64s += difference; -+ } -+ } -+ -+ bkey_copy(ck->k, insert); -+ ck->valid = true; -+ -+ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { -+ mutex_lock(&c->btree_key_cache.lock); -+ list_del_init(&ck->list); -+ -+ set_bit(BKEY_CACHED_DIRTY, &ck->flags); -+ mutex_unlock(&c->btree_key_cache.lock); -+ } -+ -+ bch2_journal_pin_update(&c->journal, trans->journal_res.seq, -+ &ck->journal, btree_key_cache_journal_flush); -+ return true; -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_btree_key_cache_verify_clean(struct btree_trans *trans, -+ enum btree_id id, struct bpos pos) -+{ -+ BUG_ON(btree_key_cache_find(trans->c, id, pos)); -+} -+#endif -+ -+void bch2_fs_btree_key_cache_exit(struct btree_key_cache *c) -+{ -+ struct bkey_cached *ck, *n; -+ -+ mutex_lock(&c->lock); -+ list_for_each_entry_safe(ck, n, &c->clean, list) { -+ kfree(ck->k); -+ kfree(ck); -+ } -+ list_for_each_entry_safe(ck, n, &c->freed, list) -+ kfree(ck); -+ mutex_unlock(&c->lock); -+ -+ rhashtable_destroy(&c->table); -+} -+ -+void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) -+{ -+ mutex_init(&c->lock); -+ INIT_LIST_HEAD(&c->freed); -+ INIT_LIST_HEAD(&c->clean); -+} -+ -+int bch2_fs_btree_key_cache_init(struct btree_key_cache *c) -+{ -+ return rhashtable_init(&c->table, &bch2_btree_key_cache_params); -+} -+ -+void 
bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) -+{ -+ struct bucket_table *tbl; -+ struct bkey_cached *ck; -+ struct rhash_head *pos; -+ size_t i; -+ -+ mutex_lock(&c->lock); -+ tbl = rht_dereference_rcu(c->table.tbl, &c->table); -+ -+ for (i = 0; i < tbl->size; i++) { -+ rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { -+ pr_buf(out, "%s:", -+ bch2_btree_ids[ck->key.btree_id]); -+ bch2_bpos_to_text(out, ck->key.pos); -+ -+ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) -+ pr_buf(out, " journal seq %llu", ck->journal.seq); -+ pr_buf(out, "\n"); -+ } -+ } -+ mutex_unlock(&c->lock); -+} -diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h -new file mode 100644 -index 000000000000..b1756c6c622c ---- /dev/null -+++ b/fs/bcachefs/btree_key_cache.h -@@ -0,0 +1,25 @@ -+#ifndef _BCACHEFS_BTREE_KEY_CACHE_H -+#define _BCACHEFS_BTREE_KEY_CACHE_H -+ -+int bch2_btree_iter_traverse_cached(struct btree_iter *); -+ -+bool bch2_btree_insert_key_cached(struct btree_trans *, -+ struct btree_iter *, struct bkey_i *); -+int bch2_btree_key_cache_flush(struct btree_trans *, -+ enum btree_id, struct bpos); -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_btree_key_cache_verify_clean(struct btree_trans *, -+ enum btree_id, struct bpos); -+#else -+static inline void -+bch2_btree_key_cache_verify_clean(struct btree_trans *trans, -+ enum btree_id id, struct bpos pos) {} -+#endif -+ -+void bch2_fs_btree_key_cache_exit(struct btree_key_cache *); -+void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *); -+int bch2_fs_btree_key_cache_init(struct btree_key_cache *); -+ -+void bch2_btree_key_cache_to_text(struct printbuf *, struct btree_key_cache *); -+ -+#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */ -diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h -new file mode 100644 -index 000000000000..81fbf3e18647 ---- /dev/null -+++ b/fs/bcachefs/btree_locking.h -@@ -0,0 +1,257 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef 
_BCACHEFS_BTREE_LOCKING_H -+#define _BCACHEFS_BTREE_LOCKING_H -+ -+/* -+ * Only for internal btree use: -+ * -+ * The btree iterator tracks what locks it wants to take, and what locks it -+ * currently has - here we have wrappers for locking/unlocking btree nodes and -+ * updating the iterator state -+ */ -+ -+#include -+ -+#include "btree_iter.h" -+ -+/* matches six lock types */ -+enum btree_node_locked_type { -+ BTREE_NODE_UNLOCKED = -1, -+ BTREE_NODE_READ_LOCKED = SIX_LOCK_read, -+ BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent, -+}; -+ -+static inline int btree_node_locked_type(struct btree_iter *iter, -+ unsigned level) -+{ -+ /* -+ * We're relying on the fact that if nodes_intent_locked is set -+ * nodes_locked must be set as well, so that we can compute without -+ * branches: -+ */ -+ return BTREE_NODE_UNLOCKED + -+ ((iter->nodes_locked >> level) & 1) + -+ ((iter->nodes_intent_locked >> level) & 1); -+} -+ -+static inline bool btree_node_intent_locked(struct btree_iter *iter, -+ unsigned level) -+{ -+ return btree_node_locked_type(iter, level) == BTREE_NODE_INTENT_LOCKED; -+} -+ -+static inline bool btree_node_read_locked(struct btree_iter *iter, -+ unsigned level) -+{ -+ return btree_node_locked_type(iter, level) == BTREE_NODE_READ_LOCKED; -+} -+ -+static inline bool btree_node_locked(struct btree_iter *iter, unsigned level) -+{ -+ return iter->nodes_locked & (1 << level); -+} -+ -+static inline void mark_btree_node_unlocked(struct btree_iter *iter, -+ unsigned level) -+{ -+ iter->nodes_locked &= ~(1 << level); -+ iter->nodes_intent_locked &= ~(1 << level); -+} -+ -+static inline void mark_btree_node_locked(struct btree_iter *iter, -+ unsigned level, -+ enum six_lock_type type) -+{ -+ /* relying on this to avoid a branch */ -+ BUILD_BUG_ON(SIX_LOCK_read != 0); -+ BUILD_BUG_ON(SIX_LOCK_intent != 1); -+ -+ iter->nodes_locked |= 1 << level; -+ iter->nodes_intent_locked |= type << level; -+} -+ -+static inline void mark_btree_node_intent_locked(struct btree_iter 
*iter, -+ unsigned level) -+{ -+ mark_btree_node_locked(iter, level, SIX_LOCK_intent); -+} -+ -+static inline enum six_lock_type __btree_lock_want(struct btree_iter *iter, int level) -+{ -+ return level < iter->locks_want -+ ? SIX_LOCK_intent -+ : SIX_LOCK_read; -+} -+ -+static inline enum btree_node_locked_type -+btree_lock_want(struct btree_iter *iter, int level) -+{ -+ if (level < iter->level) -+ return BTREE_NODE_UNLOCKED; -+ if (level < iter->locks_want) -+ return BTREE_NODE_INTENT_LOCKED; -+ if (level == iter->level) -+ return BTREE_NODE_READ_LOCKED; -+ return BTREE_NODE_UNLOCKED; -+} -+ -+static inline void __btree_node_unlock(struct btree_iter *iter, unsigned level) -+{ -+ int lock_type = btree_node_locked_type(iter, level); -+ -+ EBUG_ON(level >= BTREE_MAX_DEPTH); -+ -+ if (lock_type != BTREE_NODE_UNLOCKED) -+ six_unlock_type(&iter->l[level].b->c.lock, lock_type); -+ mark_btree_node_unlocked(iter, level); -+} -+ -+static inline void btree_node_unlock(struct btree_iter *iter, unsigned level) -+{ -+ EBUG_ON(!level && iter->trans->nounlock); -+ -+ __btree_node_unlock(iter, level); -+} -+ -+static inline void __bch2_btree_iter_unlock(struct btree_iter *iter) -+{ -+ btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); -+ -+ while (iter->nodes_locked) -+ btree_node_unlock(iter, __ffs(iter->nodes_locked)); -+} -+ -+static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type) -+{ -+ switch (type) { -+ case SIX_LOCK_read: -+ return BCH_TIME_btree_lock_contended_read; -+ case SIX_LOCK_intent: -+ return BCH_TIME_btree_lock_contended_intent; -+ case SIX_LOCK_write: -+ return BCH_TIME_btree_lock_contended_write; -+ default: -+ BUG(); -+ } -+} -+ -+/* -+ * wrapper around six locks that just traces lock contended time -+ */ -+static inline void __btree_node_lock_type(struct bch_fs *c, struct btree *b, -+ enum six_lock_type type) -+{ -+ u64 start_time = local_clock(); -+ -+ six_lock_type(&b->c.lock, type, NULL, NULL); -+ 
bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time); -+} -+ -+static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b, -+ enum six_lock_type type) -+{ -+ if (!six_trylock_type(&b->c.lock, type)) -+ __btree_node_lock_type(c, b, type); -+} -+ -+/* -+ * Lock a btree node if we already have it locked on one of our linked -+ * iterators: -+ */ -+static inline bool btree_node_lock_increment(struct btree_trans *trans, -+ struct btree *b, unsigned level, -+ enum btree_node_locked_type want) -+{ -+ struct btree_iter *iter; -+ -+ trans_for_each_iter(trans, iter) -+ if (iter->l[level].b == b && -+ btree_node_locked_type(iter, level) >= want) { -+ six_lock_increment(&b->c.lock, want); -+ return true; -+ } -+ -+ return false; -+} -+ -+bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned, -+ struct btree_iter *, enum six_lock_type, -+ six_lock_should_sleep_fn, void *); -+ -+static inline bool btree_node_lock(struct btree *b, -+ struct bpos pos, unsigned level, -+ struct btree_iter *iter, -+ enum six_lock_type type, -+ six_lock_should_sleep_fn should_sleep_fn, void *p) -+{ -+ struct btree_trans *trans = iter->trans; -+ bool ret; -+ -+ EBUG_ON(level >= BTREE_MAX_DEPTH); -+ EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx))); -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ trans->locking = b; -+ trans->locking_iter_idx = iter->idx; -+ trans->locking_pos = pos; -+ trans->locking_btree_id = iter->btree_id; -+ trans->locking_level = level; -+#endif -+ ret = likely(six_trylock_type(&b->c.lock, type)) || -+ btree_node_lock_increment(trans, b, level, type) || -+ __bch2_btree_node_lock(b, pos, level, iter, type, -+ should_sleep_fn, p); -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ trans->locking = NULL; -+#endif -+ return ret; -+} -+ -+bool __bch2_btree_node_relock(struct btree_iter *, unsigned); -+ -+static inline bool bch2_btree_node_relock(struct btree_iter *iter, -+ unsigned level) -+{ -+ EBUG_ON(btree_node_locked(iter, level) && -+ 
btree_node_locked_type(iter, level) != -+ __btree_lock_want(iter, level)); -+ -+ return likely(btree_node_locked(iter, level)) || -+ __bch2_btree_node_relock(iter, level); -+} -+ -+/* -+ * Updates the saved lock sequence number, so that bch2_btree_node_relock() will -+ * succeed: -+ */ -+static inline void -+bch2_btree_node_unlock_write_inlined(struct btree *b, struct btree_iter *iter) -+{ -+ struct btree_iter *linked; -+ -+ EBUG_ON(iter->l[b->c.level].b != b); -+ EBUG_ON(iter->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq); -+ -+ trans_for_each_iter_with_node(iter->trans, b, linked) -+ linked->l[b->c.level].lock_seq += 2; -+ -+ six_unlock_write(&b->c.lock); -+} -+ -+void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *); -+ -+void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *); -+ -+static inline void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) -+{ -+ EBUG_ON(iter->l[b->c.level].b != b); -+ EBUG_ON(iter->l[b->c.level].lock_seq != b->c.lock.state.seq); -+ -+ if (unlikely(!six_trylock_write(&b->c.lock))) -+ __bch2_btree_node_lock_write(b, iter); -+} -+ -+#endif /* _BCACHEFS_BTREE_LOCKING_H */ -+ -+ -diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h -new file mode 100644 -index 000000000000..cc01baeec138 ---- /dev/null -+++ b/fs/bcachefs/btree_types.h -@@ -0,0 +1,663 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BTREE_TYPES_H -+#define _BCACHEFS_BTREE_TYPES_H -+ -+#include -+#include -+#include -+ -+#include "bkey_methods.h" -+#include "buckets_types.h" -+#include "journal_types.h" -+ -+struct open_bucket; -+struct btree_update; -+struct btree_trans; -+ -+#define MAX_BSETS 3U -+ -+struct btree_nr_keys { -+ -+ /* -+ * Amount of live metadata (i.e. 
size of node after a compaction) in -+ * units of u64s -+ */ -+ u16 live_u64s; -+ u16 bset_u64s[MAX_BSETS]; -+ -+ /* live keys only: */ -+ u16 packed_keys; -+ u16 unpacked_keys; -+}; -+ -+struct bset_tree { -+ /* -+ * We construct a binary tree in an array as if the array -+ * started at 1, so that things line up on the same cachelines -+ * better: see comments in bset.c at cacheline_to_bkey() for -+ * details -+ */ -+ -+ /* size of the binary tree and prev array */ -+ u16 size; -+ -+ /* function of size - precalculated for to_inorder() */ -+ u16 extra; -+ -+ u16 data_offset; -+ u16 aux_data_offset; -+ u16 end_offset; -+ -+ struct bpos max_key; -+}; -+ -+struct btree_write { -+ struct journal_entry_pin journal; -+}; -+ -+struct btree_alloc { -+ struct open_buckets ob; -+ BKEY_PADDED(k); -+}; -+ -+struct btree_bkey_cached_common { -+ struct six_lock lock; -+ u8 level; -+ u8 btree_id; -+}; -+ -+struct btree { -+ struct btree_bkey_cached_common c; -+ -+ struct rhash_head hash; -+ u64 hash_val; -+ -+ unsigned long flags; -+ u16 written; -+ u8 nsets; -+ u8 nr_key_bits; -+ -+ struct bkey_format format; -+ -+ struct btree_node *data; -+ void *aux_data; -+ -+ /* -+ * Sets of sorted keys - the real btree node - plus a binary search tree -+ * -+ * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point -+ * to the memory we have allocated for this btree node. Additionally, -+ * set[0]->data points to the entire btree node as it exists on disk. -+ */ -+ struct bset_tree set[MAX_BSETS]; -+ -+ struct btree_nr_keys nr; -+ u16 sib_u64s[2]; -+ u16 whiteout_u64s; -+ u8 byte_order; -+ u8 unpack_fn_len; -+ -+ /* -+ * XXX: add a delete sequence number, so when bch2_btree_node_relock() -+ * fails because the lock sequence number has changed - i.e. 
the -+ * contents were modified - we can still relock the node if it's still -+ * the one we want, without redoing the traversal -+ */ -+ -+ /* -+ * For asynchronous splits/interior node updates: -+ * When we do a split, we allocate new child nodes and update the parent -+ * node to point to them: we update the parent in memory immediately, -+ * but then we must wait until the children have been written out before -+ * the update to the parent can be written - this is a list of the -+ * btree_updates that are blocking this node from being -+ * written: -+ */ -+ struct list_head write_blocked; -+ -+ /* -+ * Also for asynchronous splits/interior node updates: -+ * If a btree node isn't reachable yet, we don't want to kick off -+ * another write - because that write also won't yet be reachable and -+ * marking it as completed before it's reachable would be incorrect: -+ */ -+ unsigned long will_make_reachable; -+ -+ struct open_buckets ob; -+ -+ /* lru list */ -+ struct list_head list; -+ -+ struct btree_write writes[2]; -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ bool *expensive_debug_checks; -+#endif -+ -+ /* Key/pointer for this btree node */ -+ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); -+}; -+ -+struct btree_cache { -+ struct rhashtable table; -+ bool table_init_done; -+ /* -+ * We never free a struct btree, except on shutdown - we just put it on -+ * the btree_cache_freed list and reuse it later. This simplifies the -+ * code, and it doesn't cost us much memory as the memory usage is -+ * dominated by buffers that hold the actual btree node data and those -+ * can be freed - and the number of struct btrees allocated is -+ * effectively bounded. -+ * -+ * btree_cache_freeable effectively is a small cache - we use it because -+ * high order page allocations can be rather expensive, and it's quite -+ * common to delete and allocate btree nodes in quick succession. It -+ * should never grow past ~2-3 nodes in practice. 
-+ */ -+ struct mutex lock; -+ struct list_head live; -+ struct list_head freeable; -+ struct list_head freed; -+ -+ /* Number of elements in live + freeable lists */ -+ unsigned used; -+ unsigned reserve; -+ struct shrinker shrink; -+ -+ /* -+ * If we need to allocate memory for a new btree node and that -+ * allocation fails, we can cannibalize another node in the btree cache -+ * to satisfy the allocation - lock to guarantee only one thread does -+ * this at a time: -+ */ -+ struct task_struct *alloc_lock; -+ struct closure_waitlist alloc_wait; -+}; -+ -+struct btree_node_iter { -+ struct btree_node_iter_set { -+ u16 k, end; -+ } data[MAX_BSETS]; -+}; -+ -+enum btree_iter_type { -+ BTREE_ITER_KEYS, -+ BTREE_ITER_NODES, -+ BTREE_ITER_CACHED, -+}; -+ -+#define BTREE_ITER_TYPE ((1 << 2) - 1) -+ -+/* -+ * Iterate over all possible positions, synthesizing deleted keys for holes: -+ */ -+#define BTREE_ITER_SLOTS (1 << 2) -+/* -+ * Indicates that intent locks should be taken on leaf nodes, because we expect -+ * to be doing updates: -+ */ -+#define BTREE_ITER_INTENT (1 << 3) -+/* -+ * Causes the btree iterator code to prefetch additional btree nodes from disk: -+ */ -+#define BTREE_ITER_PREFETCH (1 << 4) -+/* -+ * Indicates that this iterator should not be reused until transaction commit, -+ * either because a pending update references it or because the update depends -+ * on that particular key being locked (e.g. 
by the str_hash code, for hash -+ * table consistency) -+ */ -+#define BTREE_ITER_KEEP_UNTIL_COMMIT (1 << 5) -+/* -+ * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for -+ * @pos or the first key strictly greater than @pos -+ */ -+#define BTREE_ITER_IS_EXTENTS (1 << 6) -+#define BTREE_ITER_ERROR (1 << 7) -+#define BTREE_ITER_SET_POS_AFTER_COMMIT (1 << 8) -+#define BTREE_ITER_CACHED_NOFILL (1 << 9) -+#define BTREE_ITER_CACHED_NOCREATE (1 << 10) -+ -+#define BTREE_ITER_USER_FLAGS \ -+ (BTREE_ITER_SLOTS \ -+ |BTREE_ITER_INTENT \ -+ |BTREE_ITER_PREFETCH \ -+ |BTREE_ITER_CACHED_NOFILL \ -+ |BTREE_ITER_CACHED_NOCREATE) -+ -+enum btree_iter_uptodate { -+ BTREE_ITER_UPTODATE = 0, -+ BTREE_ITER_NEED_PEEK = 1, -+ BTREE_ITER_NEED_RELOCK = 2, -+ BTREE_ITER_NEED_TRAVERSE = 3, -+}; -+ -+#define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1) -+#define BTREE_ITER_NO_NODE_DROP ((struct btree *) 2) -+#define BTREE_ITER_NO_NODE_LOCK_ROOT ((struct btree *) 3) -+#define BTREE_ITER_NO_NODE_UP ((struct btree *) 4) -+#define BTREE_ITER_NO_NODE_DOWN ((struct btree *) 5) -+#define BTREE_ITER_NO_NODE_INIT ((struct btree *) 6) -+#define BTREE_ITER_NO_NODE_ERROR ((struct btree *) 7) -+ -+/* -+ * @pos - iterator's current position -+ * @level - current btree depth -+ * @locks_want - btree level below which we start taking intent locks -+ * @nodes_locked - bitmask indicating which nodes in @nodes are locked -+ * @nodes_intent_locked - bitmask indicating which locks are intent locks -+ */ -+struct btree_iter { -+ struct btree_trans *trans; -+ struct bpos pos; -+ struct bpos pos_after_commit; -+ -+ u16 flags; -+ u8 idx; -+ -+ enum btree_id btree_id:4; -+ enum btree_iter_uptodate uptodate:4; -+ unsigned level:4, -+ min_depth:4, -+ locks_want:4, -+ nodes_locked:4, -+ nodes_intent_locked:4; -+ -+ struct btree_iter_level { -+ struct btree *b; -+ struct btree_node_iter iter; -+ u32 lock_seq; -+ } l[BTREE_MAX_DEPTH]; -+ -+ /* -+ * Current unpacked key - so that 
bch2_btree_iter_next()/ -+ * bch2_btree_iter_next_slot() can correctly advance pos. -+ */ -+ struct bkey k; -+ unsigned long ip_allocated; -+}; -+ -+static inline enum btree_iter_type -+btree_iter_type(const struct btree_iter *iter) -+{ -+ return iter->flags & BTREE_ITER_TYPE; -+} -+ -+static inline struct btree_iter_level *iter_l(struct btree_iter *iter) -+{ -+ return iter->l + iter->level; -+} -+ -+struct btree_key_cache { -+ struct mutex lock; -+ struct rhashtable table; -+ struct list_head freed; -+ struct list_head clean; -+}; -+ -+struct bkey_cached_key { -+ u32 btree_id; -+ struct bpos pos; -+} __attribute__((packed, aligned(4))); -+ -+#define BKEY_CACHED_DIRTY 0 -+ -+struct bkey_cached { -+ struct btree_bkey_cached_common c; -+ -+ unsigned long flags; -+ u8 u64s; -+ bool valid; -+ struct bkey_cached_key key; -+ -+ struct rhash_head hash; -+ struct list_head list; -+ -+ struct journal_preres res; -+ struct journal_entry_pin journal; -+ -+ struct bkey_i *k; -+}; -+ -+struct btree_insert_entry { -+ unsigned trigger_flags; -+ unsigned trans_triggers_run:1; -+ struct bkey_i *k; -+ struct btree_iter *iter; -+}; -+ -+#ifndef CONFIG_LOCKDEP -+#define BTREE_ITER_MAX 64 -+#else -+#define BTREE_ITER_MAX 32 -+#endif -+ -+struct btree_trans { -+ struct bch_fs *c; -+#ifdef CONFIG_BCACHEFS_DEBUG -+ struct list_head list; -+ struct btree *locking; -+ unsigned locking_iter_idx; -+ struct bpos locking_pos; -+ u8 locking_btree_id; -+ u8 locking_level; -+ pid_t pid; -+#endif -+ unsigned long ip; -+ -+ u64 iters_linked; -+ u64 iters_live; -+ u64 iters_touched; -+ -+ u8 nr_iters; -+ u8 nr_updates; -+ u8 nr_updates2; -+ u8 size; -+ unsigned used_mempool:1; -+ unsigned error:1; -+ unsigned nounlock:1; -+ unsigned need_reset:1; -+ unsigned in_traverse_all:1; -+ -+ unsigned mem_top; -+ unsigned mem_bytes; -+ void *mem; -+ -+ struct btree_iter *iters; -+ struct btree_insert_entry *updates; -+ struct btree_insert_entry *updates2; -+ -+ /* update path: */ -+ struct jset_entry 
*extra_journal_entries; -+ unsigned extra_journal_entry_u64s; -+ struct journal_entry_pin *journal_pin; -+ -+ struct journal_res journal_res; -+ struct journal_preres journal_preres; -+ u64 *journal_seq; -+ struct disk_reservation *disk_res; -+ unsigned flags; -+ unsigned journal_u64s; -+ unsigned journal_preres_u64s; -+ struct replicas_delta_list *fs_usage_deltas; -+ -+ struct btree_iter iters_onstack[2]; -+ struct btree_insert_entry updates_onstack[2]; -+ struct btree_insert_entry updates2_onstack[2]; -+}; -+ -+#define BTREE_FLAG(flag) \ -+static inline bool btree_node_ ## flag(struct btree *b) \ -+{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ -+ \ -+static inline void set_btree_node_ ## flag(struct btree *b) \ -+{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \ -+ \ -+static inline void clear_btree_node_ ## flag(struct btree *b) \ -+{ clear_bit(BTREE_NODE_ ## flag, &b->flags); } -+ -+enum btree_flags { -+ BTREE_NODE_read_in_flight, -+ BTREE_NODE_read_error, -+ BTREE_NODE_dirty, -+ BTREE_NODE_need_write, -+ BTREE_NODE_noevict, -+ BTREE_NODE_write_idx, -+ BTREE_NODE_accessed, -+ BTREE_NODE_write_in_flight, -+ BTREE_NODE_just_written, -+ BTREE_NODE_dying, -+ BTREE_NODE_fake, -+ BTREE_NODE_old_extent_overwrite, -+ BTREE_NODE_need_rewrite, -+}; -+ -+BTREE_FLAG(read_in_flight); -+BTREE_FLAG(read_error); -+BTREE_FLAG(dirty); -+BTREE_FLAG(need_write); -+BTREE_FLAG(noevict); -+BTREE_FLAG(write_idx); -+BTREE_FLAG(accessed); -+BTREE_FLAG(write_in_flight); -+BTREE_FLAG(just_written); -+BTREE_FLAG(dying); -+BTREE_FLAG(fake); -+BTREE_FLAG(old_extent_overwrite); -+BTREE_FLAG(need_rewrite); -+ -+static inline struct btree_write *btree_current_write(struct btree *b) -+{ -+ return b->writes + btree_node_write_idx(b); -+} -+ -+static inline struct btree_write *btree_prev_write(struct btree *b) -+{ -+ return b->writes + (btree_node_write_idx(b) ^ 1); -+} -+ -+static inline struct bset_tree *bset_tree_last(struct btree *b) -+{ -+ EBUG_ON(!b->nsets); -+ return b->set + b->nsets 
- 1; -+} -+ -+static inline void * -+__btree_node_offset_to_ptr(const struct btree *b, u16 offset) -+{ -+ return (void *) ((u64 *) b->data + 1 + offset); -+} -+ -+static inline u16 -+__btree_node_ptr_to_offset(const struct btree *b, const void *p) -+{ -+ u16 ret = (u64 *) p - 1 - (u64 *) b->data; -+ -+ EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p); -+ return ret; -+} -+ -+static inline struct bset *bset(const struct btree *b, -+ const struct bset_tree *t) -+{ -+ return __btree_node_offset_to_ptr(b, t->data_offset); -+} -+ -+static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t) -+{ -+ t->end_offset = -+ __btree_node_ptr_to_offset(b, vstruct_last(bset(b, t))); -+} -+ -+static inline void set_btree_bset(struct btree *b, struct bset_tree *t, -+ const struct bset *i) -+{ -+ t->data_offset = __btree_node_ptr_to_offset(b, i); -+ set_btree_bset_end(b, t); -+} -+ -+static inline struct bset *btree_bset_first(struct btree *b) -+{ -+ return bset(b, b->set); -+} -+ -+static inline struct bset *btree_bset_last(struct btree *b) -+{ -+ return bset(b, bset_tree_last(b)); -+} -+ -+static inline u16 -+__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k) -+{ -+ return __btree_node_ptr_to_offset(b, k); -+} -+ -+static inline struct bkey_packed * -+__btree_node_offset_to_key(const struct btree *b, u16 k) -+{ -+ return __btree_node_offset_to_ptr(b, k); -+} -+ -+static inline unsigned btree_bkey_first_offset(const struct bset_tree *t) -+{ -+ return t->data_offset + offsetof(struct bset, _data) / sizeof(u64); -+} -+ -+#define btree_bkey_first(_b, _t) \ -+({ \ -+ EBUG_ON(bset(_b, _t)->start != \ -+ __btree_node_offset_to_key(_b, btree_bkey_first_offset(_t)));\ -+ \ -+ bset(_b, _t)->start; \ -+}) -+ -+#define btree_bkey_last(_b, _t) \ -+({ \ -+ EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) != \ -+ vstruct_last(bset(_b, _t))); \ -+ \ -+ __btree_node_offset_to_key(_b, (_t)->end_offset); \ -+}) -+ -+static inline unsigned 
bset_u64s(struct bset_tree *t) -+{ -+ return t->end_offset - t->data_offset - -+ sizeof(struct bset) / sizeof(u64); -+} -+ -+static inline unsigned bset_dead_u64s(struct btree *b, struct bset_tree *t) -+{ -+ return bset_u64s(t) - b->nr.bset_u64s[t - b->set]; -+} -+ -+static inline unsigned bset_byte_offset(struct btree *b, void *i) -+{ -+ return i - (void *) b->data; -+} -+ -+enum btree_node_type { -+#define x(kwd, val, name) BKEY_TYPE_##kwd = val, -+ BCH_BTREE_IDS() -+#undef x -+ BKEY_TYPE_BTREE, -+}; -+ -+/* Type of a key in btree @id at level @level: */ -+static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id) -+{ -+ return level ? BKEY_TYPE_BTREE : (enum btree_node_type) id; -+} -+ -+/* Type of keys @b contains: */ -+static inline enum btree_node_type btree_node_type(struct btree *b) -+{ -+ return __btree_node_type(b->c.level, b->c.btree_id); -+} -+ -+static inline bool btree_node_type_is_extents(enum btree_node_type type) -+{ -+ switch (type) { -+ case BKEY_TYPE_EXTENTS: -+ case BKEY_TYPE_REFLINK: -+ return true; -+ default: -+ return false; -+ } -+} -+ -+static inline bool btree_node_is_extents(struct btree *b) -+{ -+ return btree_node_type_is_extents(btree_node_type(b)); -+} -+ -+static inline enum btree_node_type btree_iter_key_type(struct btree_iter *iter) -+{ -+ return __btree_node_type(iter->level, iter->btree_id); -+} -+ -+static inline bool btree_iter_is_extents(struct btree_iter *iter) -+{ -+ return btree_node_type_is_extents(btree_iter_key_type(iter)); -+} -+ -+#define BTREE_NODE_TYPE_HAS_TRIGGERS \ -+ ((1U << BKEY_TYPE_EXTENTS)| \ -+ (1U << BKEY_TYPE_ALLOC)| \ -+ (1U << BKEY_TYPE_INODES)| \ -+ (1U << BKEY_TYPE_REFLINK)| \ -+ (1U << BKEY_TYPE_EC)| \ -+ (1U << BKEY_TYPE_BTREE)) -+ -+#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ -+ ((1U << BKEY_TYPE_EXTENTS)| \ -+ (1U << BKEY_TYPE_INODES)| \ -+ (1U << BKEY_TYPE_EC)| \ -+ (1U << BKEY_TYPE_REFLINK)) -+ -+enum btree_trigger_flags { -+ __BTREE_TRIGGER_NORUN, /* Don't run 
triggers at all */ -+ -+ __BTREE_TRIGGER_INSERT, -+ __BTREE_TRIGGER_OVERWRITE, -+ __BTREE_TRIGGER_OVERWRITE_SPLIT, -+ -+ __BTREE_TRIGGER_GC, -+ __BTREE_TRIGGER_BUCKET_INVALIDATE, -+ __BTREE_TRIGGER_NOATOMIC, -+}; -+ -+#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) -+ -+#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT) -+#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE) -+#define BTREE_TRIGGER_OVERWRITE_SPLIT (1U << __BTREE_TRIGGER_OVERWRITE_SPLIT) -+ -+#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) -+#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) -+#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC) -+ -+static inline bool btree_node_type_needs_gc(enum btree_node_type type) -+{ -+ return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type); -+} -+ -+struct btree_root { -+ struct btree *b; -+ -+ /* On disk root - see async splits: */ -+ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); -+ u8 level; -+ u8 alive; -+ s8 error; -+}; -+ -+/* -+ * Optional hook that will be called just prior to a btree node update, when -+ * we're holding the write lock and we know what key is about to be overwritten: -+ */ -+ -+enum btree_insert_ret { -+ BTREE_INSERT_OK, -+ /* leaf node needs to be split */ -+ BTREE_INSERT_BTREE_NODE_FULL, -+ BTREE_INSERT_ENOSPC, -+ BTREE_INSERT_NEED_MARK_REPLICAS, -+ BTREE_INSERT_NEED_JOURNAL_RES, -+}; -+ -+enum btree_gc_coalesce_fail_reason { -+ BTREE_GC_COALESCE_FAIL_RESERVE_GET, -+ BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC, -+ BTREE_GC_COALESCE_FAIL_FORMAT_FITS, -+}; -+ -+enum btree_node_sibling { -+ btree_prev_sib, -+ btree_next_sib, -+}; -+ -+typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *, -+ struct btree *, -+ struct btree_node_iter *); -+ -+#endif /* _BCACHEFS_BTREE_TYPES_H */ -diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h -new file mode 100644 -index 000000000000..e0b1bde37484 ---- /dev/null -+++ b/fs/bcachefs/btree_update.h 
-@@ -0,0 +1,144 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BTREE_UPDATE_H -+#define _BCACHEFS_BTREE_UPDATE_H -+ -+#include "btree_iter.h" -+#include "journal.h" -+ -+struct bch_fs; -+struct btree; -+ -+void bch2_btree_node_lock_for_insert(struct bch_fs *, struct btree *, -+ struct btree_iter *); -+bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *, -+ struct btree_node_iter *, struct bkey_i *); -+void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); -+ -+enum btree_insert_flags { -+ __BTREE_INSERT_NOUNLOCK, -+ __BTREE_INSERT_NOFAIL, -+ __BTREE_INSERT_NOCHECK_RW, -+ __BTREE_INSERT_LAZY_RW, -+ __BTREE_INSERT_USE_RESERVE, -+ __BTREE_INSERT_USE_ALLOC_RESERVE, -+ __BTREE_INSERT_JOURNAL_REPLAY, -+ __BTREE_INSERT_JOURNAL_RESERVED, -+ __BTREE_INSERT_JOURNAL_RECLAIM, -+ __BTREE_INSERT_NOWAIT, -+ __BTREE_INSERT_GC_LOCK_HELD, -+ __BCH_HASH_SET_MUST_CREATE, -+ __BCH_HASH_SET_MUST_REPLACE, -+}; -+ -+/* -+ * Don't drop locks _after_ successfully updating btree: -+ */ -+#define BTREE_INSERT_NOUNLOCK (1 << __BTREE_INSERT_NOUNLOCK) -+ -+/* Don't check for -ENOSPC: */ -+#define BTREE_INSERT_NOFAIL (1 << __BTREE_INSERT_NOFAIL) -+ -+#define BTREE_INSERT_NOCHECK_RW (1 << __BTREE_INSERT_NOCHECK_RW) -+#define BTREE_INSERT_LAZY_RW (1 << __BTREE_INSERT_LAZY_RW) -+ -+/* for copygc, or when merging btree nodes */ -+#define BTREE_INSERT_USE_RESERVE (1 << __BTREE_INSERT_USE_RESERVE) -+#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << __BTREE_INSERT_USE_ALLOC_RESERVE) -+ -+/* Insert is for journal replay - don't get journal reservations: */ -+#define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY) -+ -+/* Indicates that we have pre-reserved space in the journal: */ -+#define BTREE_INSERT_JOURNAL_RESERVED (1 << __BTREE_INSERT_JOURNAL_RESERVED) -+ -+/* Insert is being called from journal reclaim path: */ -+#define BTREE_INSERT_JOURNAL_RECLAIM (1 << __BTREE_INSERT_JOURNAL_RECLAIM) -+ -+/* Don't block on allocation failure (for new 
btree nodes: */ -+#define BTREE_INSERT_NOWAIT (1 << __BTREE_INSERT_NOWAIT) -+#define BTREE_INSERT_GC_LOCK_HELD (1 << __BTREE_INSERT_GC_LOCK_HELD) -+ -+#define BCH_HASH_SET_MUST_CREATE (1 << __BCH_HASH_SET_MUST_CREATE) -+#define BCH_HASH_SET_MUST_REPLACE (1 << __BCH_HASH_SET_MUST_REPLACE) -+ -+int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); -+ -+int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *); -+int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, -+ struct disk_reservation *, u64 *, int flags); -+ -+int bch2_btree_delete_at_range(struct btree_trans *, struct btree_iter *, -+ struct bpos, u64 *); -+int bch2_btree_delete_range(struct bch_fs *, enum btree_id, -+ struct bpos, struct bpos, u64 *); -+ -+int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *, -+ __le64, unsigned); -+int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *, -+ struct btree *, struct bkey_i *); -+ -+int bch2_trans_update(struct btree_trans *, struct btree_iter *, -+ struct bkey_i *, enum btree_trigger_flags); -+int __bch2_trans_commit(struct btree_trans *); -+ -+/** -+ * bch2_trans_commit - insert keys at given iterator positions -+ * -+ * This is main entry point for btree updates. -+ * -+ * Return values: -+ * -EINTR: locking changed, this function should be called again. 
-+ * -EROFS: filesystem read only -+ * -EIO: journal or btree node IO error -+ */ -+static inline int bch2_trans_commit(struct btree_trans *trans, -+ struct disk_reservation *disk_res, -+ u64 *journal_seq, -+ unsigned flags) -+{ -+ trans->disk_res = disk_res; -+ trans->journal_seq = journal_seq; -+ trans->flags = flags; -+ -+ return __bch2_trans_commit(trans); -+} -+ -+#define __bch2_trans_do(_trans, _disk_res, _journal_seq, _flags, _do) \ -+({ \ -+ int _ret; \ -+ \ -+ while (1) { \ -+ _ret = (_do) ?: bch2_trans_commit(_trans, (_disk_res), \ -+ (_journal_seq), (_flags)); \ -+ if (_ret != -EINTR) \ -+ break; \ -+ bch2_trans_reset(_trans, 0); \ -+ } \ -+ \ -+ _ret; \ -+}) -+ -+#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \ -+({ \ -+ struct btree_trans trans; \ -+ int _ret, _ret2; \ -+ \ -+ bch2_trans_init(&trans, (_c), 0, 0); \ -+ _ret = __bch2_trans_do(&trans, _disk_res, _journal_seq, _flags, \ -+ _do); \ -+ _ret2 = bch2_trans_exit(&trans); \ -+ \ -+ _ret ?: _ret2; \ -+}) -+ -+#define trans_for_each_update(_trans, _i) \ -+ for ((_i) = (_trans)->updates; \ -+ (_i) < (_trans)->updates + (_trans)->nr_updates; \ -+ (_i)++) -+ -+#define trans_for_each_update2(_trans, _i) \ -+ for ((_i) = (_trans)->updates2; \ -+ (_i) < (_trans)->updates2 + (_trans)->nr_updates2; \ -+ (_i)++) -+ -+#endif /* _BCACHEFS_BTREE_UPDATE_H */ -diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c -new file mode 100644 -index 000000000000..a2604b0ce2d8 ---- /dev/null -+++ b/fs/bcachefs/btree_update_interior.c -@@ -0,0 +1,2075 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "alloc_foreground.h" -+#include "bkey_methods.h" -+#include "btree_cache.h" -+#include "btree_gc.h" -+#include "btree_update.h" -+#include "btree_update_interior.h" -+#include "btree_io.h" -+#include "btree_iter.h" -+#include "btree_locking.h" -+#include "buckets.h" -+#include "extents.h" -+#include "journal.h" -+#include "journal_reclaim.h" 
-+#include "keylist.h" -+#include "replicas.h" -+#include "super-io.h" -+ -+#include -+#include -+ -+/* Debug code: */ -+ -+/* -+ * Verify that child nodes correctly span parent node's range: -+ */ -+static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) -+{ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ struct bpos next_node = b->data->min_key; -+ struct btree_node_iter iter; -+ struct bkey_s_c k; -+ struct bkey_s_c_btree_ptr_v2 bp; -+ struct bkey unpacked; -+ -+ BUG_ON(!b->c.level); -+ -+ if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)) -+ return; -+ -+ bch2_btree_node_iter_init_from_start(&iter, b); -+ -+ while (1) { -+ k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked); -+ if (k.k->type != KEY_TYPE_btree_ptr_v2) -+ break; -+ bp = bkey_s_c_to_btree_ptr_v2(k); -+ -+ BUG_ON(bkey_cmp(next_node, bp.v->min_key)); -+ -+ bch2_btree_node_iter_advance(&iter, b); -+ -+ if (bch2_btree_node_iter_end(&iter)) { -+ BUG_ON(bkey_cmp(k.k->p, b->key.k.p)); -+ break; -+ } -+ -+ next_node = bkey_successor(k.k->p); -+ } -+#endif -+} -+ -+/* Calculate ideal packed bkey format for new btree nodes: */ -+ -+void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b) -+{ -+ struct bkey_packed *k; -+ struct bset_tree *t; -+ struct bkey uk; -+ -+ bch2_bkey_format_add_pos(s, b->data->min_key); -+ -+ for_each_bset(b, t) -+ bset_tree_for_each_key(b, t, k) -+ if (!bkey_whiteout(k)) { -+ uk = bkey_unpack_key(b, k); -+ bch2_bkey_format_add_key(s, &uk); -+ } -+} -+ -+static struct bkey_format bch2_btree_calc_format(struct btree *b) -+{ -+ struct bkey_format_state s; -+ -+ bch2_bkey_format_init(&s); -+ __bch2_btree_calc_format(&s, b); -+ -+ return bch2_bkey_format_done(&s); -+} -+ -+static size_t btree_node_u64s_with_format(struct btree *b, -+ struct bkey_format *new_f) -+{ -+ struct bkey_format *old_f = &b->format; -+ -+ /* stupid integer promotion rules */ -+ ssize_t delta = -+ (((int) new_f->key_u64s - old_f->key_u64s) * -+ (int) b->nr.packed_keys) + -+ 
(((int) new_f->key_u64s - BKEY_U64s) * -+ (int) b->nr.unpacked_keys); -+ -+ BUG_ON(delta + b->nr.live_u64s < 0); -+ -+ return b->nr.live_u64s + delta; -+} -+ -+/** -+ * btree_node_format_fits - check if we could rewrite node with a new format -+ * -+ * This assumes all keys can pack with the new format -- it just checks if -+ * the re-packed keys would fit inside the node itself. -+ */ -+bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b, -+ struct bkey_format *new_f) -+{ -+ size_t u64s = btree_node_u64s_with_format(b, new_f); -+ -+ return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c); -+} -+ -+/* Btree node freeing/allocation: */ -+ -+static void __btree_node_free(struct bch_fs *c, struct btree *b) -+{ -+ trace_btree_node_free(c, b); -+ -+ BUG_ON(btree_node_dirty(b)); -+ BUG_ON(btree_node_need_write(b)); -+ BUG_ON(b == btree_node_root(c, b)); -+ BUG_ON(b->ob.nr); -+ BUG_ON(!list_empty(&b->write_blocked)); -+ BUG_ON(b->will_make_reachable); -+ -+ clear_btree_node_noevict(b); -+ -+ bch2_btree_node_hash_remove(&c->btree_cache, b); -+ -+ mutex_lock(&c->btree_cache.lock); -+ list_move(&b->list, &c->btree_cache.freeable); -+ mutex_unlock(&c->btree_cache.lock); -+} -+ -+void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b) -+{ -+ struct open_buckets ob = b->ob; -+ -+ b->ob.nr = 0; -+ -+ clear_btree_node_dirty(b); -+ -+ btree_node_lock_type(c, b, SIX_LOCK_write); -+ __btree_node_free(c, b); -+ six_unlock_write(&b->c.lock); -+ -+ bch2_open_buckets_put(c, &ob); -+} -+ -+void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b, -+ struct btree_iter *iter) -+{ -+ struct btree_iter *linked; -+ -+ trans_for_each_iter(iter->trans, linked) -+ BUG_ON(linked->l[b->c.level].b == b); -+ -+ six_lock_write(&b->c.lock, NULL, NULL); -+ __btree_node_free(c, b); -+ six_unlock_write(&b->c.lock); -+ six_unlock_intent(&b->c.lock); -+} -+ -+static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, -+ struct disk_reservation *res, 
-+ struct closure *cl, -+ unsigned flags) -+{ -+ struct write_point *wp; -+ struct btree *b; -+ BKEY_PADDED(k) tmp; -+ struct open_buckets ob = { .nr = 0 }; -+ struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; -+ unsigned nr_reserve; -+ enum alloc_reserve alloc_reserve; -+ -+ if (flags & BTREE_INSERT_USE_ALLOC_RESERVE) { -+ nr_reserve = 0; -+ alloc_reserve = RESERVE_ALLOC; -+ } else if (flags & BTREE_INSERT_USE_RESERVE) { -+ nr_reserve = BTREE_NODE_RESERVE / 2; -+ alloc_reserve = RESERVE_BTREE; -+ } else { -+ nr_reserve = BTREE_NODE_RESERVE; -+ alloc_reserve = RESERVE_NONE; -+ } -+ -+ mutex_lock(&c->btree_reserve_cache_lock); -+ if (c->btree_reserve_cache_nr > nr_reserve) { -+ struct btree_alloc *a = -+ &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; -+ -+ ob = a->ob; -+ bkey_copy(&tmp.k, &a->k); -+ mutex_unlock(&c->btree_reserve_cache_lock); -+ goto mem_alloc; -+ } -+ mutex_unlock(&c->btree_reserve_cache_lock); -+ -+retry: -+ wp = bch2_alloc_sectors_start(c, c->opts.foreground_target, 0, -+ writepoint_ptr(&c->btree_write_point), -+ &devs_have, -+ res->nr_replicas, -+ c->opts.metadata_replicas_required, -+ alloc_reserve, 0, cl); -+ if (IS_ERR(wp)) -+ return ERR_CAST(wp); -+ -+ if (wp->sectors_free < c->opts.btree_node_size) { -+ struct open_bucket *ob; -+ unsigned i; -+ -+ open_bucket_for_each(c, &wp->ptrs, ob, i) -+ if (ob->sectors_free < c->opts.btree_node_size) -+ ob->sectors_free = 0; -+ -+ bch2_alloc_sectors_done(c, wp); -+ goto retry; -+ } -+ -+ if (c->sb.features & (1ULL << BCH_FEATURE_btree_ptr_v2)) -+ bkey_btree_ptr_v2_init(&tmp.k); -+ else -+ bkey_btree_ptr_init(&tmp.k); -+ -+ bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, c->opts.btree_node_size); -+ -+ bch2_open_bucket_get(c, wp, &ob); -+ bch2_alloc_sectors_done(c, wp); -+mem_alloc: -+ b = bch2_btree_node_mem_alloc(c); -+ -+ /* we hold cannibalize_lock: */ -+ BUG_ON(IS_ERR(b)); -+ BUG_ON(b->ob.nr); -+ -+ bkey_copy(&b->key, &tmp.k); -+ b->ob = ob; -+ -+ return b; -+} -+ -+static struct 
btree *bch2_btree_node_alloc(struct btree_update *as, unsigned level) -+{ -+ struct bch_fs *c = as->c; -+ struct btree *b; -+ int ret; -+ -+ BUG_ON(level >= BTREE_MAX_DEPTH); -+ BUG_ON(!as->nr_prealloc_nodes); -+ -+ b = as->prealloc_nodes[--as->nr_prealloc_nodes]; -+ -+ set_btree_node_accessed(b); -+ set_btree_node_dirty(b); -+ set_btree_node_need_write(b); -+ -+ bch2_bset_init_first(b, &b->data->keys); -+ b->c.level = level; -+ b->c.btree_id = as->btree_id; -+ -+ memset(&b->nr, 0, sizeof(b->nr)); -+ b->data->magic = cpu_to_le64(bset_magic(c)); -+ b->data->flags = 0; -+ SET_BTREE_NODE_ID(b->data, as->btree_id); -+ SET_BTREE_NODE_LEVEL(b->data, level); -+ b->data->ptr = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)).start->ptr; -+ -+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { -+ struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key); -+ -+ bp->v.mem_ptr = 0; -+ bp->v.seq = b->data->keys.seq; -+ bp->v.sectors_written = 0; -+ bp->v.sectors = cpu_to_le16(c->opts.btree_node_size); -+ } -+ -+ if (c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite)) -+ SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true); -+ -+ if (btree_node_is_extents(b) && -+ !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) { -+ set_btree_node_old_extent_overwrite(b); -+ set_btree_node_need_rewrite(b); -+ } -+ -+ bch2_btree_build_aux_trees(b); -+ -+ ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id); -+ BUG_ON(ret); -+ -+ trace_btree_node_alloc(c, b); -+ return b; -+} -+ -+static void btree_set_min(struct btree *b, struct bpos pos) -+{ -+ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) -+ bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos; -+ b->data->min_key = pos; -+} -+ -+static void btree_set_max(struct btree *b, struct bpos pos) -+{ -+ b->key.k.p = pos; -+ b->data->max_key = pos; -+} -+ -+struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as, -+ struct btree *b, -+ struct bkey_format format) -+{ -+ struct btree *n; -+ -+ n = bch2_btree_node_alloc(as, 
b->c.level); -+ -+ SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1); -+ -+ btree_set_min(n, b->data->min_key); -+ btree_set_max(n, b->data->max_key); -+ -+ n->data->format = format; -+ btree_node_set_format(n, format); -+ -+ bch2_btree_sort_into(as->c, n, b); -+ -+ btree_node_reset_sib_u64s(n); -+ -+ n->key.k.p = b->key.k.p; -+ return n; -+} -+ -+static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as, -+ struct btree *b) -+{ -+ struct bkey_format new_f = bch2_btree_calc_format(b); -+ -+ /* -+ * The keys might expand with the new format - if they wouldn't fit in -+ * the btree node anymore, use the old format for now: -+ */ -+ if (!bch2_btree_node_format_fits(as->c, b, &new_f)) -+ new_f = b->format; -+ -+ return __bch2_btree_node_alloc_replacement(as, b, new_f); -+} -+ -+static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level) -+{ -+ struct btree *b = bch2_btree_node_alloc(as, level); -+ -+ btree_set_min(b, POS_MIN); -+ btree_set_max(b, POS_MAX); -+ b->data->format = bch2_btree_calc_format(b); -+ -+ btree_node_set_format(b, b->data->format); -+ bch2_btree_build_aux_trees(b); -+ -+ bch2_btree_update_add_new_node(as, b); -+ six_unlock_write(&b->c.lock); -+ -+ return b; -+} -+ -+static void bch2_btree_reserve_put(struct btree_update *as) -+{ -+ struct bch_fs *c = as->c; -+ -+ mutex_lock(&c->btree_reserve_cache_lock); -+ -+ while (as->nr_prealloc_nodes) { -+ struct btree *b = as->prealloc_nodes[--as->nr_prealloc_nodes]; -+ -+ six_unlock_write(&b->c.lock); -+ -+ if (c->btree_reserve_cache_nr < -+ ARRAY_SIZE(c->btree_reserve_cache)) { -+ struct btree_alloc *a = -+ &c->btree_reserve_cache[c->btree_reserve_cache_nr++]; -+ -+ a->ob = b->ob; -+ b->ob.nr = 0; -+ bkey_copy(&a->k, &b->key); -+ } else { -+ bch2_open_buckets_put(c, &b->ob); -+ } -+ -+ btree_node_lock_type(c, b, SIX_LOCK_write); -+ __btree_node_free(c, b); -+ six_unlock_write(&b->c.lock); -+ -+ six_unlock_intent(&b->c.lock); -+ } -+ -+ 
mutex_unlock(&c->btree_reserve_cache_lock); -+} -+ -+static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes, -+ unsigned flags, struct closure *cl) -+{ -+ struct bch_fs *c = as->c; -+ struct btree *b; -+ int ret; -+ -+ BUG_ON(nr_nodes > BTREE_RESERVE_MAX); -+ -+ /* -+ * Protects reaping from the btree node cache and using the btree node -+ * open bucket reserve: -+ */ -+ ret = bch2_btree_cache_cannibalize_lock(c, cl); -+ if (ret) -+ return ret; -+ -+ while (as->nr_prealloc_nodes < nr_nodes) { -+ b = __bch2_btree_node_alloc(c, &as->disk_res, -+ flags & BTREE_INSERT_NOWAIT -+ ? NULL : cl, flags); -+ if (IS_ERR(b)) { -+ ret = PTR_ERR(b); -+ goto err_free; -+ } -+ -+ ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&b->key)); -+ if (ret) -+ goto err_free; -+ -+ as->prealloc_nodes[as->nr_prealloc_nodes++] = b; -+ } -+ -+ bch2_btree_cache_cannibalize_unlock(c); -+ return 0; -+err_free: -+ bch2_btree_cache_cannibalize_unlock(c); -+ trace_btree_reserve_get_fail(c, nr_nodes, cl); -+ return ret; -+} -+ -+/* Asynchronous interior node update machinery */ -+ -+static void bch2_btree_update_free(struct btree_update *as) -+{ -+ struct bch_fs *c = as->c; -+ -+ bch2_journal_preres_put(&c->journal, &as->journal_preres); -+ -+ bch2_journal_pin_drop(&c->journal, &as->journal); -+ bch2_journal_pin_flush(&c->journal, &as->journal); -+ bch2_disk_reservation_put(c, &as->disk_res); -+ bch2_btree_reserve_put(as); -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ list_del(&as->unwritten_list); -+ list_del(&as->list); -+ mutex_unlock(&c->btree_interior_update_lock); -+ -+ closure_debug_destroy(&as->cl); -+ mempool_free(as, &c->btree_interior_update_pool); -+ -+ closure_wake_up(&c->btree_interior_update_wait); -+} -+ -+static void btree_update_will_delete_key(struct btree_update *as, -+ struct bkey_i *k) -+{ -+ BUG_ON(bch2_keylist_u64s(&as->old_keys) + k->k.u64s > -+ ARRAY_SIZE(as->_old_keys)); -+ bch2_keylist_add(&as->old_keys, k); -+} -+ -+static void 
btree_update_will_add_key(struct btree_update *as, -+ struct bkey_i *k) -+{ -+ BUG_ON(bch2_keylist_u64s(&as->new_keys) + k->k.u64s > -+ ARRAY_SIZE(as->_new_keys)); -+ bch2_keylist_add(&as->new_keys, k); -+} -+ -+/* -+ * The transactional part of an interior btree node update, where we journal the -+ * update we did to the interior node and update alloc info: -+ */ -+static int btree_update_nodes_written_trans(struct btree_trans *trans, -+ struct btree_update *as) -+{ -+ struct bkey_i *k; -+ int ret; -+ -+ trans->extra_journal_entries = (void *) &as->journal_entries[0]; -+ trans->extra_journal_entry_u64s = as->journal_u64s; -+ trans->journal_pin = &as->journal; -+ -+ for_each_keylist_key(&as->new_keys, k) { -+ ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(k), -+ 0, 0, BTREE_TRIGGER_INSERT); -+ if (ret) -+ return ret; -+ } -+ -+ for_each_keylist_key(&as->old_keys, k) { -+ ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(k), -+ 0, 0, BTREE_TRIGGER_OVERWRITE); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+static void btree_update_nodes_written(struct btree_update *as) -+{ -+ struct bch_fs *c = as->c; -+ struct btree *b = as->b; -+ u64 journal_seq = 0; -+ unsigned i; -+ int ret; -+ -+ /* -+ * We did an update to a parent node where the pointers we added pointed -+ * to child nodes that weren't written yet: now, the child nodes have -+ * been written so we can write out the update to the interior node. -+ */ -+ -+ /* -+ * We can't call into journal reclaim here: we'd block on the journal -+ * reclaim lock, but we may need to release the open buckets we have -+ * pinned in order for other btree updates to make forward progress, and -+ * journal reclaim does btree updates when flushing bkey_cached entries, -+ * which may require allocations as well. 
-+ */ -+ ret = bch2_trans_do(c, &as->disk_res, &journal_seq, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_USE_RESERVE| -+ BTREE_INSERT_USE_ALLOC_RESERVE| -+ BTREE_INSERT_NOCHECK_RW| -+ BTREE_INSERT_JOURNAL_RECLAIM| -+ BTREE_INSERT_JOURNAL_RESERVED, -+ btree_update_nodes_written_trans(&trans, as)); -+ BUG_ON(ret && !bch2_journal_error(&c->journal)); -+ -+ if (b) { -+ /* -+ * @b is the node we did the final insert into: -+ * -+ * On failure to get a journal reservation, we still have to -+ * unblock the write and allow most of the write path to happen -+ * so that shutdown works, but the i->journal_seq mechanism -+ * won't work to prevent the btree write from being visible (we -+ * didn't get a journal sequence number) - instead -+ * __bch2_btree_node_write() doesn't do the actual write if -+ * we're in journal error state: -+ */ -+ -+ btree_node_lock_type(c, b, SIX_LOCK_intent); -+ btree_node_lock_type(c, b, SIX_LOCK_write); -+ mutex_lock(&c->btree_interior_update_lock); -+ -+ list_del(&as->write_blocked_list); -+ -+ if (!ret && as->b == b) { -+ struct bset *i = btree_bset_last(b); -+ -+ BUG_ON(!b->c.level); -+ BUG_ON(!btree_node_dirty(b)); -+ -+ i->journal_seq = cpu_to_le64( -+ max(journal_seq, -+ le64_to_cpu(i->journal_seq))); -+ -+ bch2_btree_add_journal_pin(c, b, journal_seq); -+ } -+ -+ mutex_unlock(&c->btree_interior_update_lock); -+ six_unlock_write(&b->c.lock); -+ -+ btree_node_write_if_need(c, b, SIX_LOCK_intent); -+ six_unlock_intent(&b->c.lock); -+ } -+ -+ bch2_journal_pin_drop(&c->journal, &as->journal); -+ -+ bch2_journal_preres_put(&c->journal, &as->journal_preres); -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ for (i = 0; i < as->nr_new_nodes; i++) { -+ b = as->new_nodes[i]; -+ -+ BUG_ON(b->will_make_reachable != (unsigned long) as); -+ b->will_make_reachable = 0; -+ } -+ mutex_unlock(&c->btree_interior_update_lock); -+ -+ for (i = 0; i < as->nr_new_nodes; i++) { -+ b = as->new_nodes[i]; -+ -+ btree_node_lock_type(c, b, SIX_LOCK_read); -+ 
btree_node_write_if_need(c, b, SIX_LOCK_read); -+ six_unlock_read(&b->c.lock); -+ } -+ -+ for (i = 0; i < as->nr_open_buckets; i++) -+ bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]); -+ -+ bch2_btree_update_free(as); -+} -+ -+static void btree_interior_update_work(struct work_struct *work) -+{ -+ struct bch_fs *c = -+ container_of(work, struct bch_fs, btree_interior_update_work); -+ struct btree_update *as; -+ -+ while (1) { -+ mutex_lock(&c->btree_interior_update_lock); -+ as = list_first_entry_or_null(&c->btree_interior_updates_unwritten, -+ struct btree_update, unwritten_list); -+ if (as && !as->nodes_written) -+ as = NULL; -+ mutex_unlock(&c->btree_interior_update_lock); -+ -+ if (!as) -+ break; -+ -+ btree_update_nodes_written(as); -+ } -+} -+ -+static void btree_update_set_nodes_written(struct closure *cl) -+{ -+ struct btree_update *as = container_of(cl, struct btree_update, cl); -+ struct bch_fs *c = as->c; -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ as->nodes_written = true; -+ mutex_unlock(&c->btree_interior_update_lock); -+ -+ queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work); -+} -+ -+/* -+ * We're updating @b with pointers to nodes that haven't finished writing yet: -+ * block @b from being written until @as completes -+ */ -+static void btree_update_updated_node(struct btree_update *as, struct btree *b) -+{ -+ struct bch_fs *c = as->c; -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); -+ -+ BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); -+ BUG_ON(!btree_node_dirty(b)); -+ -+ as->mode = BTREE_INTERIOR_UPDATING_NODE; -+ as->b = b; -+ list_add(&as->write_blocked_list, &b->write_blocked); -+ -+ mutex_unlock(&c->btree_interior_update_lock); -+} -+ -+static void btree_update_reparent(struct btree_update *as, -+ struct btree_update *child) -+{ -+ struct bch_fs *c = as->c; -+ -+ lockdep_assert_held(&c->btree_interior_update_lock); -+ 
-+ child->b = NULL; -+ child->mode = BTREE_INTERIOR_UPDATING_AS; -+ -+ /* -+ * When we write a new btree root, we have to drop our journal pin -+ * _before_ the new nodes are technically reachable; see -+ * btree_update_nodes_written(). -+ * -+ * This goes for journal pins that are recursively blocked on us - so, -+ * just transfer the journal pin to the new interior update so -+ * btree_update_nodes_written() can drop it. -+ */ -+ bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL); -+ bch2_journal_pin_drop(&c->journal, &child->journal); -+} -+ -+static void btree_update_updated_root(struct btree_update *as, struct btree *b) -+{ -+ struct bkey_i *insert = &b->key; -+ struct bch_fs *c = as->c; -+ -+ BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); -+ -+ BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > -+ ARRAY_SIZE(as->journal_entries)); -+ -+ as->journal_u64s += -+ journal_entry_set((void *) &as->journal_entries[as->journal_u64s], -+ BCH_JSET_ENTRY_btree_root, -+ b->c.btree_id, b->c.level, -+ insert, insert->k.u64s); -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); -+ -+ as->mode = BTREE_INTERIOR_UPDATING_ROOT; -+ mutex_unlock(&c->btree_interior_update_lock); -+} -+ -+/* -+ * bch2_btree_update_add_new_node: -+ * -+ * This causes @as to wait on @b to be written, before it gets to -+ * bch2_btree_update_nodes_written -+ * -+ * Additionally, it sets b->will_make_reachable to prevent any additional writes -+ * to @b from happening besides the first until @b is reachable on disk -+ * -+ * And it adds @b to the list of @as's new nodes, so that we can update sector -+ * counts in bch2_btree_update_nodes_written: -+ */ -+void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b) -+{ -+ struct bch_fs *c = as->c; -+ -+ closure_get(&as->cl); -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes)); -+ 
BUG_ON(b->will_make_reachable); -+ -+ as->new_nodes[as->nr_new_nodes++] = b; -+ b->will_make_reachable = 1UL|(unsigned long) as; -+ -+ mutex_unlock(&c->btree_interior_update_lock); -+ -+ btree_update_will_add_key(as, &b->key); -+} -+ -+/* -+ * returns true if @b was a new node -+ */ -+static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b) -+{ -+ struct btree_update *as; -+ unsigned long v; -+ unsigned i; -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ /* -+ * When b->will_make_reachable != 0, it owns a ref on as->cl that's -+ * dropped when it gets written by bch2_btree_complete_write - the -+ * xchg() is for synchronization with bch2_btree_complete_write: -+ */ -+ v = xchg(&b->will_make_reachable, 0); -+ as = (struct btree_update *) (v & ~1UL); -+ -+ if (!as) { -+ mutex_unlock(&c->btree_interior_update_lock); -+ return; -+ } -+ -+ for (i = 0; i < as->nr_new_nodes; i++) -+ if (as->new_nodes[i] == b) -+ goto found; -+ -+ BUG(); -+found: -+ array_remove_item(as->new_nodes, as->nr_new_nodes, i); -+ mutex_unlock(&c->btree_interior_update_lock); -+ -+ if (v & 1) -+ closure_put(&as->cl); -+} -+ -+void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b) -+{ -+ while (b->ob.nr) -+ as->open_buckets[as->nr_open_buckets++] = -+ b->ob.v[--b->ob.nr]; -+} -+ -+/* -+ * @b is being split/rewritten: it may have pointers to not-yet-written btree -+ * nodes and thus outstanding btree_updates - redirect @b's -+ * btree_updates to point to this btree_update: -+ */ -+void bch2_btree_interior_update_will_free_node(struct btree_update *as, -+ struct btree *b) -+{ -+ struct bch_fs *c = as->c; -+ struct btree_update *p, *n; -+ struct btree_write *w; -+ -+ set_btree_node_dying(b); -+ -+ if (btree_node_fake(b)) -+ return; -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ -+ /* -+ * Does this node have any btree_update operations preventing -+ * it from being written? 
-+ * -+ * If so, redirect them to point to this btree_update: we can -+ * write out our new nodes, but we won't make them visible until those -+ * operations complete -+ */ -+ list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) { -+ list_del_init(&p->write_blocked_list); -+ btree_update_reparent(as, p); -+ -+ /* -+ * for flush_held_btree_writes() waiting on updates to flush or -+ * nodes to be writeable: -+ */ -+ closure_wake_up(&c->btree_interior_update_wait); -+ } -+ -+ clear_btree_node_dirty(b); -+ clear_btree_node_need_write(b); -+ -+ /* -+ * Does this node have unwritten data that has a pin on the journal? -+ * -+ * If so, transfer that pin to the btree_update operation - -+ * note that if we're freeing multiple nodes, we only need to keep the -+ * oldest pin of any of the nodes we're freeing. We'll release the pin -+ * when the new nodes are persistent and reachable on disk: -+ */ -+ w = btree_current_write(b); -+ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL); -+ bch2_journal_pin_drop(&c->journal, &w->journal); -+ -+ w = btree_prev_write(b); -+ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL); -+ bch2_journal_pin_drop(&c->journal, &w->journal); -+ -+ mutex_unlock(&c->btree_interior_update_lock); -+ -+ /* -+ * Is this a node that isn't reachable on disk yet? 
-+ * -+ * Nodes that aren't reachable yet have writes blocked until they're -+ * reachable - now that we've cancelled any pending writes and moved -+ * things waiting on that write to wait on this update, we can drop this -+ * node from the list of nodes that the other update is making -+ * reachable, prior to freeing it: -+ */ -+ btree_update_drop_new_node(c, b); -+ -+ btree_update_will_delete_key(as, &b->key); -+} -+ -+void bch2_btree_update_done(struct btree_update *as) -+{ -+ BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE); -+ -+ bch2_btree_reserve_put(as); -+ -+ continue_at(&as->cl, btree_update_set_nodes_written, system_freezable_wq); -+} -+ -+struct btree_update * -+bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, -+ unsigned nr_nodes, unsigned flags, -+ struct closure *cl) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_update *as; -+ int disk_res_flags = (flags & BTREE_INSERT_NOFAIL) -+ ? BCH_DISK_RESERVATION_NOFAIL : 0; -+ int journal_flags = (flags & BTREE_INSERT_JOURNAL_RESERVED) -+ ? 
JOURNAL_RES_GET_RECLAIM : 0; -+ int ret = 0; -+ -+ /* -+ * This check isn't necessary for correctness - it's just to potentially -+ * prevent us from doing a lot of work that'll end up being wasted: -+ */ -+ ret = bch2_journal_error(&c->journal); -+ if (ret) -+ return ERR_PTR(ret); -+ -+ as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO); -+ memset(as, 0, sizeof(*as)); -+ closure_init(&as->cl, NULL); -+ as->c = c; -+ as->mode = BTREE_INTERIOR_NO_UPDATE; -+ as->btree_id = id; -+ INIT_LIST_HEAD(&as->list); -+ INIT_LIST_HEAD(&as->unwritten_list); -+ INIT_LIST_HEAD(&as->write_blocked_list); -+ bch2_keylist_init(&as->old_keys, as->_old_keys); -+ bch2_keylist_init(&as->new_keys, as->_new_keys); -+ bch2_keylist_init(&as->parent_keys, as->inline_keys); -+ -+ ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, -+ BTREE_UPDATE_JOURNAL_RES, -+ journal_flags|JOURNAL_RES_GET_NONBLOCK); -+ if (ret == -EAGAIN) { -+ if (flags & BTREE_INSERT_NOUNLOCK) -+ return ERR_PTR(-EINTR); -+ -+ bch2_trans_unlock(trans); -+ -+ ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, -+ BTREE_UPDATE_JOURNAL_RES, -+ journal_flags); -+ if (ret) -+ return ERR_PTR(ret); -+ -+ if (!bch2_trans_relock(trans)) { -+ ret = -EINTR; -+ goto err; -+ } -+ } -+ -+ ret = bch2_disk_reservation_get(c, &as->disk_res, -+ nr_nodes * c->opts.btree_node_size, -+ c->opts.metadata_replicas, -+ disk_res_flags); -+ if (ret) -+ goto err; -+ -+ ret = bch2_btree_reserve_get(as, nr_nodes, flags, cl); -+ if (ret) -+ goto err; -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ list_add_tail(&as->list, &c->btree_interior_update_list); -+ mutex_unlock(&c->btree_interior_update_lock); -+ -+ return as; -+err: -+ bch2_btree_update_free(as); -+ return ERR_PTR(ret); -+} -+ -+/* Btree root updates: */ -+ -+static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) -+{ -+ /* Root nodes cannot be reaped */ -+ mutex_lock(&c->btree_cache.lock); -+ list_del_init(&b->list); -+ 
mutex_unlock(&c->btree_cache.lock); -+ -+ mutex_lock(&c->btree_root_lock); -+ BUG_ON(btree_node_root(c, b) && -+ (b->c.level < btree_node_root(c, b)->c.level || -+ !btree_node_dying(btree_node_root(c, b)))); -+ -+ btree_node_root(c, b) = b; -+ mutex_unlock(&c->btree_root_lock); -+ -+ bch2_recalc_btree_reserve(c); -+} -+ -+/** -+ * bch_btree_set_root - update the root in memory and on disk -+ * -+ * To ensure forward progress, the current task must not be holding any -+ * btree node write locks. However, you must hold an intent lock on the -+ * old root. -+ * -+ * Note: This allocates a journal entry but doesn't add any keys to -+ * it. All the btree roots are part of every journal write, so there -+ * is nothing new to be done. This just guarantees that there is a -+ * journal write. -+ */ -+static void bch2_btree_set_root(struct btree_update *as, struct btree *b, -+ struct btree_iter *iter) -+{ -+ struct bch_fs *c = as->c; -+ struct btree *old; -+ -+ trace_btree_set_root(c, b); -+ BUG_ON(!b->written && -+ !test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)); -+ -+ old = btree_node_root(c, b); -+ -+ /* -+ * Ensure no one is using the old root while we switch to the -+ * new root: -+ */ -+ bch2_btree_node_lock_write(old, iter); -+ -+ bch2_btree_set_root_inmem(c, b); -+ -+ btree_update_updated_root(as, b); -+ -+ /* -+ * Unlock old root after new root is visible: -+ * -+ * The new root isn't persistent, but that's ok: we still have -+ * an intent lock on the new root, and any updates that would -+ * depend on the new root would have to update the new root. 
-+ */ -+ bch2_btree_node_unlock_write(old, iter); -+} -+ -+/* Interior node updates: */ -+ -+static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b, -+ struct btree_iter *iter, -+ struct bkey_i *insert, -+ struct btree_node_iter *node_iter) -+{ -+ struct bkey_packed *k; -+ -+ BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > -+ ARRAY_SIZE(as->journal_entries)); -+ -+ as->journal_u64s += -+ journal_entry_set((void *) &as->journal_entries[as->journal_u64s], -+ BCH_JSET_ENTRY_btree_keys, -+ b->c.btree_id, b->c.level, -+ insert, insert->k.u64s); -+ -+ while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && -+ bkey_iter_pos_cmp(b, k, &insert->k.p) < 0) -+ bch2_btree_node_iter_advance(node_iter, b); -+ -+ bch2_btree_bset_insert_key(iter, b, node_iter, insert); -+ set_btree_node_dirty(b); -+ set_btree_node_need_write(b); -+} -+ -+/* -+ * Move keys from n1 (original replacement node, now lower node) to n2 (higher -+ * node) -+ */ -+static struct btree *__btree_split_node(struct btree_update *as, -+ struct btree *n1, -+ struct btree_iter *iter) -+{ -+ size_t nr_packed = 0, nr_unpacked = 0; -+ struct btree *n2; -+ struct bset *set1, *set2; -+ struct bkey_packed *k, *prev = NULL; -+ -+ n2 = bch2_btree_node_alloc(as, n1->c.level); -+ bch2_btree_update_add_new_node(as, n2); -+ -+ n2->data->max_key = n1->data->max_key; -+ n2->data->format = n1->format; -+ SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data)); -+ n2->key.k.p = n1->key.k.p; -+ -+ btree_node_set_format(n2, n2->data->format); -+ -+ set1 = btree_bset_first(n1); -+ set2 = btree_bset_first(n2); -+ -+ /* -+ * Has to be a linear search because we don't have an auxiliary -+ * search tree yet -+ */ -+ k = set1->start; -+ while (1) { -+ struct bkey_packed *n = bkey_next_skip_noops(k, vstruct_last(set1)); -+ -+ if (n == vstruct_last(set1)) -+ break; -+ if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5) -+ break; -+ -+ if (bkey_packed(k)) -+ nr_packed++; -+ else -+ 
nr_unpacked++; -+ -+ prev = k; -+ k = n; -+ } -+ -+ BUG_ON(!prev); -+ -+ btree_set_max(n1, bkey_unpack_pos(n1, prev)); -+ btree_set_min(n2, bkey_successor(n1->key.k.p)); -+ -+ set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k); -+ set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s)); -+ -+ set_btree_bset_end(n1, n1->set); -+ set_btree_bset_end(n2, n2->set); -+ -+ n2->nr.live_u64s = le16_to_cpu(set2->u64s); -+ n2->nr.bset_u64s[0] = le16_to_cpu(set2->u64s); -+ n2->nr.packed_keys = n1->nr.packed_keys - nr_packed; -+ n2->nr.unpacked_keys = n1->nr.unpacked_keys - nr_unpacked; -+ -+ n1->nr.live_u64s = le16_to_cpu(set1->u64s); -+ n1->nr.bset_u64s[0] = le16_to_cpu(set1->u64s); -+ n1->nr.packed_keys = nr_packed; -+ n1->nr.unpacked_keys = nr_unpacked; -+ -+ BUG_ON(!set1->u64s); -+ BUG_ON(!set2->u64s); -+ -+ memcpy_u64s(set2->start, -+ vstruct_end(set1), -+ le16_to_cpu(set2->u64s)); -+ -+ btree_node_reset_sib_u64s(n1); -+ btree_node_reset_sib_u64s(n2); -+ -+ bch2_verify_btree_nr_keys(n1); -+ bch2_verify_btree_nr_keys(n2); -+ -+ if (n1->c.level) { -+ btree_node_interior_verify(as->c, n1); -+ btree_node_interior_verify(as->c, n2); -+ } -+ -+ return n2; -+} -+ -+/* -+ * For updates to interior nodes, we've got to do the insert before we split -+ * because the stuff we're inserting has to be inserted atomically. Post split, -+ * the keys might have to go in different nodes and the split would no longer be -+ * atomic. 
-+ * -+ * Worse, if the insert is from btree node coalescing, if we do the insert after -+ * we do the split (and pick the pivot) - the pivot we pick might be between -+ * nodes that were coalesced, and thus in the middle of a child node post -+ * coalescing: -+ */ -+static void btree_split_insert_keys(struct btree_update *as, struct btree *b, -+ struct btree_iter *iter, -+ struct keylist *keys) -+{ -+ struct btree_node_iter node_iter; -+ struct bkey_i *k = bch2_keylist_front(keys); -+ struct bkey_packed *src, *dst, *n; -+ struct bset *i; -+ -+ BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE); -+ -+ bch2_btree_node_iter_init(&node_iter, b, &k->k.p); -+ -+ while (!bch2_keylist_empty(keys)) { -+ k = bch2_keylist_front(keys); -+ -+ bch2_insert_fixup_btree_ptr(as, b, iter, k, &node_iter); -+ bch2_keylist_pop_front(keys); -+ } -+ -+ /* -+ * We can't tolerate whiteouts here - with whiteouts there can be -+ * duplicate keys, and it would be rather bad if we picked a duplicate -+ * for the pivot: -+ */ -+ i = btree_bset_first(b); -+ src = dst = i->start; -+ while (src != vstruct_last(i)) { -+ n = bkey_next_skip_noops(src, vstruct_last(i)); -+ if (!bkey_deleted(src)) { -+ memmove_u64s_down(dst, src, src->u64s); -+ dst = bkey_next(dst); -+ } -+ src = n; -+ } -+ -+ i->u64s = cpu_to_le16((u64 *) dst - i->_data); -+ set_btree_bset_end(b, b->set); -+ -+ BUG_ON(b->nsets != 1 || -+ b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s)); -+ -+ btree_node_interior_verify(as->c, b); -+} -+ -+static void btree_split(struct btree_update *as, struct btree *b, -+ struct btree_iter *iter, struct keylist *keys, -+ unsigned flags) -+{ -+ struct bch_fs *c = as->c; -+ struct btree *parent = btree_node_parent(iter, b); -+ struct btree *n1, *n2 = NULL, *n3 = NULL; -+ u64 start_time = local_clock(); -+ -+ BUG_ON(!parent && (b != btree_node_root(c, b))); -+ BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level)); -+ -+ bch2_btree_interior_update_will_free_node(as, b); -+ -+ n1 = 
bch2_btree_node_alloc_replacement(as, b); -+ bch2_btree_update_add_new_node(as, n1); -+ -+ if (keys) -+ btree_split_insert_keys(as, n1, iter, keys); -+ -+ if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) { -+ trace_btree_split(c, b); -+ -+ n2 = __btree_split_node(as, n1, iter); -+ -+ bch2_btree_build_aux_trees(n2); -+ bch2_btree_build_aux_trees(n1); -+ six_unlock_write(&n2->c.lock); -+ six_unlock_write(&n1->c.lock); -+ -+ bch2_btree_node_write(c, n2, SIX_LOCK_intent); -+ -+ /* -+ * Note that on recursive parent_keys == keys, so we -+ * can't start adding new keys to parent_keys before emptying it -+ * out (which we did with btree_split_insert_keys() above) -+ */ -+ bch2_keylist_add(&as->parent_keys, &n1->key); -+ bch2_keylist_add(&as->parent_keys, &n2->key); -+ -+ if (!parent) { -+ /* Depth increases, make a new root */ -+ n3 = __btree_root_alloc(as, b->c.level + 1); -+ -+ n3->sib_u64s[0] = U16_MAX; -+ n3->sib_u64s[1] = U16_MAX; -+ -+ btree_split_insert_keys(as, n3, iter, &as->parent_keys); -+ -+ bch2_btree_node_write(c, n3, SIX_LOCK_intent); -+ } -+ } else { -+ trace_btree_compact(c, b); -+ -+ bch2_btree_build_aux_trees(n1); -+ six_unlock_write(&n1->c.lock); -+ -+ if (parent) -+ bch2_keylist_add(&as->parent_keys, &n1->key); -+ } -+ -+ bch2_btree_node_write(c, n1, SIX_LOCK_intent); -+ -+ /* New nodes all written, now make them visible: */ -+ -+ if (parent) { -+ /* Split a non root node */ -+ bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); -+ } else if (n3) { -+ bch2_btree_set_root(as, n3, iter); -+ } else { -+ /* Root filled up but didn't need to be split */ -+ bch2_btree_set_root(as, n1, iter); -+ } -+ -+ bch2_btree_update_get_open_buckets(as, n1); -+ if (n2) -+ bch2_btree_update_get_open_buckets(as, n2); -+ if (n3) -+ bch2_btree_update_get_open_buckets(as, n3); -+ -+ /* Successful split, update the iterator to point to the new nodes: */ -+ -+ six_lock_increment(&b->c.lock, SIX_LOCK_intent); -+ bch2_btree_iter_node_drop(iter, b); -+ if 
(n3) -+ bch2_btree_iter_node_replace(iter, n3); -+ if (n2) -+ bch2_btree_iter_node_replace(iter, n2); -+ bch2_btree_iter_node_replace(iter, n1); -+ -+ /* -+ * The old node must be freed (in memory) _before_ unlocking the new -+ * nodes - else another thread could re-acquire a read lock on the old -+ * node after another thread has locked and updated the new node, thus -+ * seeing stale data: -+ */ -+ bch2_btree_node_free_inmem(c, b, iter); -+ -+ if (n3) -+ six_unlock_intent(&n3->c.lock); -+ if (n2) -+ six_unlock_intent(&n2->c.lock); -+ six_unlock_intent(&n1->c.lock); -+ -+ bch2_btree_trans_verify_locks(iter->trans); -+ -+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_split], -+ start_time); -+} -+ -+static void -+bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, -+ struct btree_iter *iter, struct keylist *keys) -+{ -+ struct btree_iter *linked; -+ struct btree_node_iter node_iter; -+ struct bkey_i *insert = bch2_keylist_front(keys); -+ struct bkey_packed *k; -+ -+ /* Don't screw up @iter's position: */ -+ node_iter = iter->l[b->c.level].iter; -+ -+ /* -+ * btree_split(), btree_gc_coalesce() will insert keys before -+ * the iterator's current position - they know the keys go in -+ * the node the iterator points to: -+ */ -+ while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) && -+ (bkey_cmp_packed(b, k, &insert->k) >= 0)) -+ ; -+ -+ for_each_keylist_key(keys, insert) -+ bch2_insert_fixup_btree_ptr(as, b, iter, insert, &node_iter); -+ -+ btree_update_updated_node(as, b); -+ -+ trans_for_each_iter_with_node(iter->trans, b, linked) -+ bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); -+ -+ bch2_btree_trans_verify_iters(iter->trans, b); -+} -+ -+/** -+ * bch_btree_insert_node - insert bkeys into a given btree node -+ * -+ * @iter: btree iterator -+ * @keys: list of keys to insert -+ * @hook: insert callback -+ * @persistent: if not null, @persistent will wait on journal write -+ * -+ * Inserts as many keys as it can into a 
given btree node, splitting it if full. -+ * If a split occurred, this function will return early. This can only happen -+ * for leaf nodes -- inserts into interior nodes have to be atomic. -+ */ -+void bch2_btree_insert_node(struct btree_update *as, struct btree *b, -+ struct btree_iter *iter, struct keylist *keys, -+ unsigned flags) -+{ -+ struct bch_fs *c = as->c; -+ int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); -+ int old_live_u64s = b->nr.live_u64s; -+ int live_u64s_added, u64s_added; -+ -+ BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level)); -+ BUG_ON(!b->c.level); -+ BUG_ON(!as || as->b); -+ bch2_verify_keylist_sorted(keys); -+ -+ if (as->must_rewrite) -+ goto split; -+ -+ bch2_btree_node_lock_for_insert(c, b, iter); -+ -+ if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) { -+ bch2_btree_node_unlock_write(b, iter); -+ goto split; -+ } -+ -+ bch2_btree_insert_keys_interior(as, b, iter, keys); -+ -+ live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; -+ u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s; -+ -+ if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) -+ b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); -+ if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) -+ b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); -+ -+ if (u64s_added > live_u64s_added && -+ bch2_maybe_compact_whiteouts(c, b)) -+ bch2_btree_iter_reinit_node(iter, b); -+ -+ bch2_btree_node_unlock_write(b, iter); -+ -+ btree_node_interior_verify(c, b); -+ -+ /* -+ * when called from the btree_split path the new nodes aren't added to -+ * the btree iterator yet, so the merge path's unlock/wait/relock dance -+ * won't work: -+ */ -+ bch2_foreground_maybe_merge(c, iter, b->c.level, -+ flags|BTREE_INSERT_NOUNLOCK); -+ return; -+split: -+ btree_split(as, b, iter, keys, flags); -+} -+ -+int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, -+ unsigned flags) -+{ -+ struct btree_trans 
*trans = iter->trans; -+ struct btree *b = iter_l(iter)->b; -+ struct btree_update *as; -+ struct closure cl; -+ int ret = 0; -+ struct btree_insert_entry *i; -+ -+ /* -+ * We already have a disk reservation and open buckets pinned; this -+ * allocation must not block: -+ */ -+ trans_for_each_update(trans, i) -+ if (btree_node_type_needs_gc(i->iter->btree_id)) -+ flags |= BTREE_INSERT_USE_RESERVE; -+ -+ closure_init_stack(&cl); -+ -+ /* Hack, because gc and splitting nodes doesn't mix yet: */ -+ if (!(flags & BTREE_INSERT_GC_LOCK_HELD) && -+ !down_read_trylock(&c->gc_lock)) { -+ if (flags & BTREE_INSERT_NOUNLOCK) { -+ trace_transaction_restart_ip(trans->ip, _THIS_IP_); -+ return -EINTR; -+ } -+ -+ bch2_trans_unlock(trans); -+ down_read(&c->gc_lock); -+ -+ if (!bch2_trans_relock(trans)) -+ ret = -EINTR; -+ } -+ -+ /* -+ * XXX: figure out how far we might need to split, -+ * instead of locking/reserving all the way to the root: -+ */ -+ if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { -+ trace_trans_restart_iter_upgrade(trans->ip); -+ ret = -EINTR; -+ goto out; -+ } -+ -+ as = bch2_btree_update_start(trans, iter->btree_id, -+ btree_update_reserve_required(c, b), flags, -+ !(flags & BTREE_INSERT_NOUNLOCK) ? 
&cl : NULL); -+ if (IS_ERR(as)) { -+ ret = PTR_ERR(as); -+ if (ret == -EAGAIN) { -+ BUG_ON(flags & BTREE_INSERT_NOUNLOCK); -+ bch2_trans_unlock(trans); -+ ret = -EINTR; -+ -+ trace_transaction_restart_ip(trans->ip, _THIS_IP_); -+ } -+ goto out; -+ } -+ -+ btree_split(as, b, iter, NULL, flags); -+ bch2_btree_update_done(as); -+ -+ /* -+ * We haven't successfully inserted yet, so don't downgrade all the way -+ * back to read locks; -+ */ -+ __bch2_btree_iter_downgrade(iter, 1); -+out: -+ if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) -+ up_read(&c->gc_lock); -+ closure_sync(&cl); -+ return ret; -+} -+ -+void __bch2_foreground_maybe_merge(struct bch_fs *c, -+ struct btree_iter *iter, -+ unsigned level, -+ unsigned flags, -+ enum btree_node_sibling sib) -+{ -+ struct btree_trans *trans = iter->trans; -+ struct btree_update *as; -+ struct bkey_format_state new_s; -+ struct bkey_format new_f; -+ struct bkey_i delete; -+ struct btree *b, *m, *n, *prev, *next, *parent; -+ struct closure cl; -+ size_t sib_u64s; -+ int ret = 0; -+ -+ BUG_ON(!btree_node_locked(iter, level)); -+ -+ closure_init_stack(&cl); -+retry: -+ BUG_ON(!btree_node_locked(iter, level)); -+ -+ b = iter->l[level].b; -+ -+ parent = btree_node_parent(iter, b); -+ if (!parent) -+ goto out; -+ -+ if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) -+ goto out; -+ -+ /* XXX: can't be holding read locks */ -+ m = bch2_btree_node_get_sibling(c, iter, b, sib); -+ if (IS_ERR(m)) { -+ ret = PTR_ERR(m); -+ goto err; -+ } -+ -+ /* NULL means no sibling: */ -+ if (!m) { -+ b->sib_u64s[sib] = U16_MAX; -+ goto out; -+ } -+ -+ if (sib == btree_prev_sib) { -+ prev = m; -+ next = b; -+ } else { -+ prev = b; -+ next = m; -+ } -+ -+ bch2_bkey_format_init(&new_s); -+ __bch2_btree_calc_format(&new_s, b); -+ __bch2_btree_calc_format(&new_s, m); -+ new_f = bch2_bkey_format_done(&new_s); -+ -+ sib_u64s = btree_node_u64s_with_format(b, &new_f) + -+ btree_node_u64s_with_format(m, &new_f); -+ -+ if (sib_u64s > 
BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) { -+ sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c); -+ sib_u64s /= 2; -+ sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c); -+ } -+ -+ sib_u64s = min(sib_u64s, btree_max_u64s(c)); -+ b->sib_u64s[sib] = sib_u64s; -+ -+ if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) { -+ six_unlock_intent(&m->c.lock); -+ goto out; -+ } -+ -+ /* We're changing btree topology, doesn't mix with gc: */ -+ if (!(flags & BTREE_INSERT_GC_LOCK_HELD) && -+ !down_read_trylock(&c->gc_lock)) -+ goto err_cycle_gc_lock; -+ -+ if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { -+ ret = -EINTR; -+ goto err_unlock; -+ } -+ -+ as = bch2_btree_update_start(trans, iter->btree_id, -+ btree_update_reserve_required(c, parent) + 1, -+ flags| -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_USE_RESERVE, -+ !(flags & BTREE_INSERT_NOUNLOCK) ? &cl : NULL); -+ if (IS_ERR(as)) { -+ ret = PTR_ERR(as); -+ goto err_unlock; -+ } -+ -+ trace_btree_merge(c, b); -+ -+ bch2_btree_interior_update_will_free_node(as, b); -+ bch2_btree_interior_update_will_free_node(as, m); -+ -+ n = bch2_btree_node_alloc(as, b->c.level); -+ bch2_btree_update_add_new_node(as, n); -+ -+ btree_set_min(n, prev->data->min_key); -+ btree_set_max(n, next->data->max_key); -+ n->data->format = new_f; -+ -+ btree_node_set_format(n, new_f); -+ -+ bch2_btree_sort_into(c, n, prev); -+ bch2_btree_sort_into(c, n, next); -+ -+ bch2_btree_build_aux_trees(n); -+ six_unlock_write(&n->c.lock); -+ -+ bkey_init(&delete.k); -+ delete.k.p = prev->key.k.p; -+ bch2_keylist_add(&as->parent_keys, &delete); -+ bch2_keylist_add(&as->parent_keys, &n->key); -+ -+ bch2_btree_node_write(c, n, SIX_LOCK_intent); -+ -+ bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); -+ -+ bch2_btree_update_get_open_buckets(as, n); -+ -+ six_lock_increment(&b->c.lock, SIX_LOCK_intent); -+ bch2_btree_iter_node_drop(iter, b); -+ bch2_btree_iter_node_drop(iter, m); -+ -+ bch2_btree_iter_node_replace(iter, n); -+ -+ 
bch2_btree_trans_verify_iters(trans, n); -+ -+ bch2_btree_node_free_inmem(c, b, iter); -+ bch2_btree_node_free_inmem(c, m, iter); -+ -+ six_unlock_intent(&n->c.lock); -+ -+ bch2_btree_update_done(as); -+ -+ if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) -+ up_read(&c->gc_lock); -+out: -+ bch2_btree_trans_verify_locks(trans); -+ -+ /* -+ * Don't downgrade locks here: we're called after successful insert, -+ * and the caller will downgrade locks after a successful insert -+ * anyways (in case e.g. a split was required first) -+ * -+ * And we're also called when inserting into interior nodes in the -+ * split path, and downgrading to read locks in there is potentially -+ * confusing: -+ */ -+ closure_sync(&cl); -+ return; -+ -+err_cycle_gc_lock: -+ six_unlock_intent(&m->c.lock); -+ -+ if (flags & BTREE_INSERT_NOUNLOCK) -+ goto out; -+ -+ bch2_trans_unlock(trans); -+ -+ down_read(&c->gc_lock); -+ up_read(&c->gc_lock); -+ ret = -EINTR; -+ goto err; -+ -+err_unlock: -+ six_unlock_intent(&m->c.lock); -+ if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) -+ up_read(&c->gc_lock); -+err: -+ BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK)); -+ -+ if ((ret == -EAGAIN || ret == -EINTR) && -+ !(flags & BTREE_INSERT_NOUNLOCK)) { -+ bch2_trans_unlock(trans); -+ closure_sync(&cl); -+ ret = bch2_btree_iter_traverse(iter); -+ if (ret) -+ goto out; -+ -+ goto retry; -+ } -+ -+ goto out; -+} -+ -+static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, -+ struct btree *b, unsigned flags, -+ struct closure *cl) -+{ -+ struct btree *n, *parent = btree_node_parent(iter, b); -+ struct btree_update *as; -+ -+ as = bch2_btree_update_start(iter->trans, iter->btree_id, -+ (parent -+ ? 
btree_update_reserve_required(c, parent) -+ : 0) + 1, -+ flags, cl); -+ if (IS_ERR(as)) { -+ trace_btree_gc_rewrite_node_fail(c, b); -+ return PTR_ERR(as); -+ } -+ -+ bch2_btree_interior_update_will_free_node(as, b); -+ -+ n = bch2_btree_node_alloc_replacement(as, b); -+ bch2_btree_update_add_new_node(as, n); -+ -+ bch2_btree_build_aux_trees(n); -+ six_unlock_write(&n->c.lock); -+ -+ trace_btree_gc_rewrite_node(c, b); -+ -+ bch2_btree_node_write(c, n, SIX_LOCK_intent); -+ -+ if (parent) { -+ bch2_keylist_add(&as->parent_keys, &n->key); -+ bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); -+ } else { -+ bch2_btree_set_root(as, n, iter); -+ } -+ -+ bch2_btree_update_get_open_buckets(as, n); -+ -+ six_lock_increment(&b->c.lock, SIX_LOCK_intent); -+ bch2_btree_iter_node_drop(iter, b); -+ bch2_btree_iter_node_replace(iter, n); -+ bch2_btree_node_free_inmem(c, b, iter); -+ six_unlock_intent(&n->c.lock); -+ -+ bch2_btree_update_done(as); -+ return 0; -+} -+ -+/** -+ * bch_btree_node_rewrite - Rewrite/move a btree node -+ * -+ * Returns 0 on success, -EINTR or -EAGAIN on failure (i.e. 
-+ * btree_check_reserve() has to wait) -+ */ -+int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, -+ __le64 seq, unsigned flags) -+{ -+ struct btree_trans *trans = iter->trans; -+ struct closure cl; -+ struct btree *b; -+ int ret; -+ -+ flags |= BTREE_INSERT_NOFAIL; -+ -+ closure_init_stack(&cl); -+ -+ bch2_btree_iter_upgrade(iter, U8_MAX); -+ -+ if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) { -+ if (!down_read_trylock(&c->gc_lock)) { -+ bch2_trans_unlock(trans); -+ down_read(&c->gc_lock); -+ } -+ } -+ -+ while (1) { -+ ret = bch2_btree_iter_traverse(iter); -+ if (ret) -+ break; -+ -+ b = bch2_btree_iter_peek_node(iter); -+ if (!b || b->data->keys.seq != seq) -+ break; -+ -+ ret = __btree_node_rewrite(c, iter, b, flags, &cl); -+ if (ret != -EAGAIN && -+ ret != -EINTR) -+ break; -+ -+ bch2_trans_unlock(trans); -+ closure_sync(&cl); -+ } -+ -+ bch2_btree_iter_downgrade(iter); -+ -+ if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) -+ up_read(&c->gc_lock); -+ -+ closure_sync(&cl); -+ return ret; -+} -+ -+static void __bch2_btree_node_update_key(struct bch_fs *c, -+ struct btree_update *as, -+ struct btree_iter *iter, -+ struct btree *b, struct btree *new_hash, -+ struct bkey_i *new_key) -+{ -+ struct btree *parent; -+ int ret; -+ -+ btree_update_will_delete_key(as, &b->key); -+ btree_update_will_add_key(as, new_key); -+ -+ parent = btree_node_parent(iter, b); -+ if (parent) { -+ if (new_hash) { -+ bkey_copy(&new_hash->key, new_key); -+ ret = bch2_btree_node_hash_insert(&c->btree_cache, -+ new_hash, b->c.level, b->c.btree_id); -+ BUG_ON(ret); -+ } -+ -+ bch2_keylist_add(&as->parent_keys, new_key); -+ bch2_btree_insert_node(as, parent, iter, &as->parent_keys, 0); -+ -+ if (new_hash) { -+ mutex_lock(&c->btree_cache.lock); -+ bch2_btree_node_hash_remove(&c->btree_cache, new_hash); -+ -+ bch2_btree_node_hash_remove(&c->btree_cache, b); -+ -+ bkey_copy(&b->key, new_key); -+ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); -+ BUG_ON(ret); -+ 
mutex_unlock(&c->btree_cache.lock); -+ } else { -+ bkey_copy(&b->key, new_key); -+ } -+ } else { -+ BUG_ON(btree_node_root(c, b) != b); -+ -+ bch2_btree_node_lock_write(b, iter); -+ bkey_copy(&b->key, new_key); -+ -+ if (btree_ptr_hash_val(&b->key) != b->hash_val) { -+ mutex_lock(&c->btree_cache.lock); -+ bch2_btree_node_hash_remove(&c->btree_cache, b); -+ -+ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); -+ BUG_ON(ret); -+ mutex_unlock(&c->btree_cache.lock); -+ } -+ -+ btree_update_updated_root(as, b); -+ bch2_btree_node_unlock_write(b, iter); -+ } -+ -+ bch2_btree_update_done(as); -+} -+ -+int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, -+ struct btree *b, -+ struct bkey_i *new_key) -+{ -+ struct btree *parent = btree_node_parent(iter, b); -+ struct btree_update *as = NULL; -+ struct btree *new_hash = NULL; -+ struct closure cl; -+ int ret; -+ -+ closure_init_stack(&cl); -+ -+ if (!bch2_btree_iter_upgrade(iter, U8_MAX)) -+ return -EINTR; -+ -+ if (!down_read_trylock(&c->gc_lock)) { -+ bch2_trans_unlock(iter->trans); -+ down_read(&c->gc_lock); -+ -+ if (!bch2_trans_relock(iter->trans)) { -+ ret = -EINTR; -+ goto err; -+ } -+ } -+ -+ /* -+ * check btree_ptr_hash_val() after @b is locked by -+ * btree_iter_traverse(): -+ */ -+ if (btree_ptr_hash_val(new_key) != b->hash_val) { -+ /* bch2_btree_reserve_get will unlock */ -+ ret = bch2_btree_cache_cannibalize_lock(c, &cl); -+ if (ret) { -+ bch2_trans_unlock(iter->trans); -+ up_read(&c->gc_lock); -+ closure_sync(&cl); -+ down_read(&c->gc_lock); -+ -+ if (!bch2_trans_relock(iter->trans)) { -+ ret = -EINTR; -+ goto err; -+ } -+ } -+ -+ new_hash = bch2_btree_node_mem_alloc(c); -+ } -+retry: -+ as = bch2_btree_update_start(iter->trans, iter->btree_id, -+ parent ? 
btree_update_reserve_required(c, parent) : 0, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_USE_RESERVE| -+ BTREE_INSERT_USE_ALLOC_RESERVE, -+ &cl); -+ -+ if (IS_ERR(as)) { -+ ret = PTR_ERR(as); -+ if (ret == -EAGAIN) -+ ret = -EINTR; -+ -+ if (ret == -EINTR) { -+ bch2_trans_unlock(iter->trans); -+ up_read(&c->gc_lock); -+ closure_sync(&cl); -+ down_read(&c->gc_lock); -+ -+ if (bch2_trans_relock(iter->trans)) -+ goto retry; -+ } -+ -+ goto err; -+ } -+ -+ ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(new_key)); -+ if (ret) -+ goto err_free_update; -+ -+ __bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key); -+ -+ bch2_btree_iter_downgrade(iter); -+err: -+ if (new_hash) { -+ mutex_lock(&c->btree_cache.lock); -+ list_move(&new_hash->list, &c->btree_cache.freeable); -+ mutex_unlock(&c->btree_cache.lock); -+ -+ six_unlock_write(&new_hash->c.lock); -+ six_unlock_intent(&new_hash->c.lock); -+ } -+ up_read(&c->gc_lock); -+ closure_sync(&cl); -+ return ret; -+err_free_update: -+ bch2_btree_update_free(as); -+ goto err; -+} -+ -+/* Init code: */ -+ -+/* -+ * Only for filesystem bringup, when first reading the btree roots or allocating -+ * btree roots when initializing a new filesystem: -+ */ -+void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b) -+{ -+ BUG_ON(btree_node_root(c, b)); -+ -+ bch2_btree_set_root_inmem(c, b); -+} -+ -+void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) -+{ -+ struct closure cl; -+ struct btree *b; -+ int ret; -+ -+ closure_init_stack(&cl); -+ -+ do { -+ ret = bch2_btree_cache_cannibalize_lock(c, &cl); -+ closure_sync(&cl); -+ } while (ret); -+ -+ b = bch2_btree_node_mem_alloc(c); -+ bch2_btree_cache_cannibalize_unlock(c); -+ -+ set_btree_node_fake(b); -+ set_btree_node_need_rewrite(b); -+ b->c.level = 0; -+ b->c.btree_id = id; -+ -+ bkey_btree_ptr_init(&b->key); -+ b->key.k.p = POS_MAX; -+ *((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id; -+ -+ bch2_bset_init_first(b, &b->data->keys); -+ 
bch2_btree_build_aux_trees(b); -+ -+ b->data->flags = 0; -+ btree_set_min(b, POS_MIN); -+ btree_set_max(b, POS_MAX); -+ b->data->format = bch2_btree_calc_format(b); -+ btree_node_set_format(b, b->data->format); -+ -+ ret = bch2_btree_node_hash_insert(&c->btree_cache, b, -+ b->c.level, b->c.btree_id); -+ BUG_ON(ret); -+ -+ bch2_btree_set_root_inmem(c, b); -+ -+ six_unlock_write(&b->c.lock); -+ six_unlock_intent(&b->c.lock); -+} -+ -+void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+ struct btree_update *as; -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ list_for_each_entry(as, &c->btree_interior_update_list, list) -+ pr_buf(out, "%p m %u w %u r %u j %llu\n", -+ as, -+ as->mode, -+ as->nodes_written, -+ atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK, -+ as->journal.seq); -+ mutex_unlock(&c->btree_interior_update_lock); -+} -+ -+size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c) -+{ -+ size_t ret = 0; -+ struct list_head *i; -+ -+ mutex_lock(&c->btree_interior_update_lock); -+ list_for_each(i, &c->btree_interior_update_list) -+ ret++; -+ mutex_unlock(&c->btree_interior_update_lock); -+ -+ return ret; -+} -+ -+void bch2_journal_entries_to_btree_roots(struct bch_fs *c, struct jset *jset) -+{ -+ struct btree_root *r; -+ struct jset_entry *entry; -+ -+ mutex_lock(&c->btree_root_lock); -+ -+ vstruct_for_each(jset, entry) -+ if (entry->type == BCH_JSET_ENTRY_btree_root) { -+ r = &c->btree_roots[entry->btree_id]; -+ r->level = entry->level; -+ r->alive = true; -+ bkey_copy(&r->key, &entry->start[0]); -+ } -+ -+ mutex_unlock(&c->btree_root_lock); -+} -+ -+struct jset_entry * -+bch2_btree_roots_to_journal_entries(struct bch_fs *c, -+ struct jset_entry *start, -+ struct jset_entry *end) -+{ -+ struct jset_entry *entry; -+ unsigned long have = 0; -+ unsigned i; -+ -+ for (entry = start; entry < end; entry = vstruct_next(entry)) -+ if (entry->type == BCH_JSET_ENTRY_btree_root) -+ __set_bit(entry->btree_id, &have); -+ -+ 
mutex_lock(&c->btree_root_lock); -+ -+ for (i = 0; i < BTREE_ID_NR; i++) -+ if (c->btree_roots[i].alive && !test_bit(i, &have)) { -+ journal_entry_set(end, -+ BCH_JSET_ENTRY_btree_root, -+ i, c->btree_roots[i].level, -+ &c->btree_roots[i].key, -+ c->btree_roots[i].key.u64s); -+ end = vstruct_next(end); -+ } -+ -+ mutex_unlock(&c->btree_root_lock); -+ -+ return end; -+} -+ -+void bch2_fs_btree_interior_update_exit(struct bch_fs *c) -+{ -+ if (c->btree_interior_update_worker) -+ destroy_workqueue(c->btree_interior_update_worker); -+ mempool_exit(&c->btree_interior_update_pool); -+} -+ -+int bch2_fs_btree_interior_update_init(struct bch_fs *c) -+{ -+ mutex_init(&c->btree_reserve_cache_lock); -+ INIT_LIST_HEAD(&c->btree_interior_update_list); -+ INIT_LIST_HEAD(&c->btree_interior_updates_unwritten); -+ mutex_init(&c->btree_interior_update_lock); -+ INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work); -+ -+ c->btree_interior_update_worker = -+ alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1); -+ if (!c->btree_interior_update_worker) -+ return -ENOMEM; -+ -+ return mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, -+ sizeof(struct btree_update)); -+} -diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h -new file mode 100644 -index 000000000000..7668225e72c6 ---- /dev/null -+++ b/fs/bcachefs/btree_update_interior.h -@@ -0,0 +1,331 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H -+#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H -+ -+#include "btree_cache.h" -+#include "btree_locking.h" -+#include "btree_update.h" -+ -+void __bch2_btree_calc_format(struct bkey_format_state *, struct btree *); -+bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *, -+ struct bkey_format *); -+ -+#define BTREE_UPDATE_NODES_MAX ((BTREE_MAX_DEPTH - 2) * 2 + GC_MERGE_NODES) -+ -+#define BTREE_UPDATE_JOURNAL_RES (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1)) -+ -+/* 
-+ * Tracks an in progress split/rewrite of a btree node and the update to the -+ * parent node: -+ * -+ * When we split/rewrite a node, we do all the updates in memory without -+ * waiting for any writes to complete - we allocate the new node(s) and update -+ * the parent node, possibly recursively up to the root. -+ * -+ * The end result is that we have one or more new nodes being written - -+ * possibly several, if there were multiple splits - and then a write (updating -+ * an interior node) which will make all these new nodes visible. -+ * -+ * Additionally, as we split/rewrite nodes we free the old nodes - but the old -+ * nodes can't be freed (their space on disk can't be reclaimed) until the -+ * update to the interior node that makes the new node visible completes - -+ * until then, the old nodes are still reachable on disk. -+ * -+ */ -+struct btree_update { -+ struct closure cl; -+ struct bch_fs *c; -+ -+ struct list_head list; -+ struct list_head unwritten_list; -+ -+ /* What kind of update are we doing? */ -+ enum { -+ BTREE_INTERIOR_NO_UPDATE, -+ BTREE_INTERIOR_UPDATING_NODE, -+ BTREE_INTERIOR_UPDATING_ROOT, -+ BTREE_INTERIOR_UPDATING_AS, -+ } mode; -+ -+ unsigned must_rewrite:1; -+ unsigned nodes_written:1; -+ -+ enum btree_id btree_id; -+ -+ struct disk_reservation disk_res; -+ struct journal_preres journal_preres; -+ -+ /* -+ * BTREE_INTERIOR_UPDATING_NODE: -+ * The update that made the new nodes visible was a regular update to an -+ * existing interior node - @b. 
We can't write out the update to @b -+ * until the new nodes we created are finished writing, so we block @b -+ * from writing by putting this btree_interior update on the -+ * @b->write_blocked list with @write_blocked_list: -+ */ -+ struct btree *b; -+ struct list_head write_blocked_list; -+ -+ /* -+ * We may be freeing nodes that were dirty, and thus had journal entries -+ * pinned: we need to transfer the oldest of those pins to the -+ * btree_update operation, and release it when the new node(s) -+ * are all persistent and reachable: -+ */ -+ struct journal_entry_pin journal; -+ -+ /* Preallocated nodes we reserve when we start the update: */ -+ struct btree *prealloc_nodes[BTREE_UPDATE_NODES_MAX]; -+ unsigned nr_prealloc_nodes; -+ -+ /* Nodes being freed: */ -+ struct keylist old_keys; -+ u64 _old_keys[BTREE_UPDATE_NODES_MAX * -+ BKEY_BTREE_PTR_VAL_U64s_MAX]; -+ -+ /* Nodes being added: */ -+ struct keylist new_keys; -+ u64 _new_keys[BTREE_UPDATE_NODES_MAX * -+ BKEY_BTREE_PTR_VAL_U64s_MAX]; -+ -+ /* New nodes, that will be made reachable by this update: */ -+ struct btree *new_nodes[BTREE_UPDATE_NODES_MAX]; -+ unsigned nr_new_nodes; -+ -+ open_bucket_idx_t open_buckets[BTREE_UPDATE_NODES_MAX * -+ BCH_REPLICAS_MAX]; -+ open_bucket_idx_t nr_open_buckets; -+ -+ unsigned journal_u64s; -+ u64 journal_entries[BTREE_UPDATE_JOURNAL_RES]; -+ -+ /* Only here to reduce stack usage on recursive splits: */ -+ struct keylist parent_keys; -+ /* -+ * Enough room for btree_split's keys without realloc - btree node -+ * pointers never have crc/compression info, so we only need to acount -+ * for the pointers for three keys -+ */ -+ u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3]; -+}; -+ -+void bch2_btree_node_free_inmem(struct bch_fs *, struct btree *, -+ struct btree_iter *); -+void bch2_btree_node_free_never_inserted(struct bch_fs *, struct btree *); -+ -+void bch2_btree_update_get_open_buckets(struct btree_update *, struct btree *); -+ -+struct btree 
*__bch2_btree_node_alloc_replacement(struct btree_update *, -+ struct btree *, -+ struct bkey_format); -+ -+void bch2_btree_update_done(struct btree_update *); -+struct btree_update * -+bch2_btree_update_start(struct btree_trans *, enum btree_id, unsigned, -+ unsigned, struct closure *); -+ -+void bch2_btree_interior_update_will_free_node(struct btree_update *, -+ struct btree *); -+void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); -+ -+void bch2_btree_insert_node(struct btree_update *, struct btree *, -+ struct btree_iter *, struct keylist *, -+ unsigned); -+int bch2_btree_split_leaf(struct bch_fs *, struct btree_iter *, unsigned); -+ -+void __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *, -+ unsigned, unsigned, enum btree_node_sibling); -+ -+static inline void bch2_foreground_maybe_merge_sibling(struct bch_fs *c, -+ struct btree_iter *iter, -+ unsigned level, unsigned flags, -+ enum btree_node_sibling sib) -+{ -+ struct btree *b; -+ -+ if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE) -+ return; -+ -+ if (!bch2_btree_node_relock(iter, level)) -+ return; -+ -+ b = iter->l[level].b; -+ if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold) -+ return; -+ -+ __bch2_foreground_maybe_merge(c, iter, level, flags, sib); -+} -+ -+static inline void bch2_foreground_maybe_merge(struct bch_fs *c, -+ struct btree_iter *iter, -+ unsigned level, -+ unsigned flags) -+{ -+ bch2_foreground_maybe_merge_sibling(c, iter, level, flags, -+ btree_prev_sib); -+ bch2_foreground_maybe_merge_sibling(c, iter, level, flags, -+ btree_next_sib); -+} -+ -+void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *); -+void bch2_btree_root_alloc(struct bch_fs *, enum btree_id); -+ -+static inline unsigned btree_update_reserve_required(struct bch_fs *c, -+ struct btree *b) -+{ -+ unsigned depth = btree_node_root(c, b)->c.level + 1; -+ -+ /* -+ * Number of nodes we might have to allocate in a worst case btree -+ * split operation - we split all 
the way up to the root, then allocate -+ * a new root, unless we're already at max depth: -+ */ -+ if (depth < BTREE_MAX_DEPTH) -+ return (depth - b->c.level) * 2 + 1; -+ else -+ return (depth - b->c.level) * 2 - 1; -+} -+ -+static inline void btree_node_reset_sib_u64s(struct btree *b) -+{ -+ b->sib_u64s[0] = b->nr.live_u64s; -+ b->sib_u64s[1] = b->nr.live_u64s; -+} -+ -+static inline void *btree_data_end(struct bch_fs *c, struct btree *b) -+{ -+ return (void *) b->data + btree_bytes(c); -+} -+ -+static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c, -+ struct btree *b) -+{ -+ return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s); -+} -+ -+static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c, -+ struct btree *b) -+{ -+ return btree_data_end(c, b); -+} -+ -+static inline void *write_block(struct btree *b) -+{ -+ return (void *) b->data + (b->written << 9); -+} -+ -+static inline bool __btree_addr_written(struct btree *b, void *p) -+{ -+ return p < write_block(b); -+} -+ -+static inline bool bset_written(struct btree *b, struct bset *i) -+{ -+ return __btree_addr_written(b, i); -+} -+ -+static inline bool bkey_written(struct btree *b, struct bkey_packed *k) -+{ -+ return __btree_addr_written(b, k); -+} -+ -+static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c, -+ struct btree *b, -+ void *end) -+{ -+ ssize_t used = bset_byte_offset(b, end) / sizeof(u64) + -+ b->whiteout_u64s; -+ ssize_t total = c->opts.btree_node_size << 6; -+ -+ return total - used; -+} -+ -+static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c, -+ struct btree *b) -+{ -+ ssize_t remaining = __bch_btree_u64s_remaining(c, b, -+ btree_bkey_last(b, bset_tree_last(b))); -+ -+ BUG_ON(remaining < 0); -+ -+ if (bset_written(b, btree_bset_last(b))) -+ return 0; -+ -+ return remaining; -+} -+ -+static inline unsigned btree_write_set_buffer(struct btree *b) -+{ -+ /* -+ * Could buffer up larger amounts of keys for btrees with 
larger keys, -+ * pending benchmarking: -+ */ -+ return 4 << 10; -+} -+ -+static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, -+ struct btree *b) -+{ -+ struct bset_tree *t = bset_tree_last(b); -+ struct btree_node_entry *bne = max(write_block(b), -+ (void *) btree_bkey_last(b, bset_tree_last(b))); -+ ssize_t remaining_space = -+ __bch_btree_u64s_remaining(c, b, &bne->keys.start[0]); -+ -+ if (unlikely(bset_written(b, bset(b, t)))) { -+ if (remaining_space > (ssize_t) (block_bytes(c) >> 3)) -+ return bne; -+ } else { -+ if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) && -+ remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3)) -+ return bne; -+ } -+ -+ return NULL; -+} -+ -+static inline void push_whiteout(struct bch_fs *c, struct btree *b, -+ struct bpos pos) -+{ -+ struct bkey_packed k; -+ -+ BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s); -+ -+ if (!bkey_pack_pos(&k, pos, b)) { -+ struct bkey *u = (void *) &k; -+ -+ bkey_init(u); -+ u->p = pos; -+ } -+ -+ k.needs_whiteout = true; -+ -+ b->whiteout_u64s += k.u64s; -+ bkey_copy(unwritten_whiteouts_start(c, b), &k); -+} -+ -+/* -+ * write lock must be held on @b (else the dirty bset that we were going to -+ * insert into could be written out from under us) -+ */ -+static inline bool bch2_btree_node_insert_fits(struct bch_fs *c, -+ struct btree *b, unsigned u64s) -+{ -+ if (unlikely(btree_node_need_rewrite(b))) -+ return false; -+ -+ return u64s <= bch_btree_keys_u64s_remaining(c, b); -+} -+ -+void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *); -+ -+size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *); -+ -+void bch2_journal_entries_to_btree_roots(struct bch_fs *, struct jset *); -+struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *, -+ struct jset_entry *, struct jset_entry *); -+ -+void bch2_fs_btree_interior_update_exit(struct bch_fs *); -+int bch2_fs_btree_interior_update_init(struct bch_fs *); -+ -+#endif /* 
_BCACHEFS_BTREE_UPDATE_INTERIOR_H */ -diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c -new file mode 100644 -index 000000000000..49995cd00c16 ---- /dev/null -+++ b/fs/bcachefs/btree_update_leaf.c -@@ -0,0 +1,1172 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "btree_update.h" -+#include "btree_update_interior.h" -+#include "btree_gc.h" -+#include "btree_io.h" -+#include "btree_iter.h" -+#include "btree_key_cache.h" -+#include "btree_locking.h" -+#include "buckets.h" -+#include "debug.h" -+#include "error.h" -+#include "extent_update.h" -+#include "journal.h" -+#include "journal_reclaim.h" -+#include "keylist.h" -+#include "replicas.h" -+ -+#include -+#include -+#include -+ -+static inline bool same_leaf_as_prev(struct btree_trans *trans, -+ struct btree_insert_entry *i) -+{ -+ return i != trans->updates2 && -+ iter_l(i[0].iter)->b == iter_l(i[-1].iter)->b; -+} -+ -+inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, -+ struct btree_iter *iter) -+{ -+ bch2_btree_node_lock_write(b, iter); -+ -+ if (btree_iter_type(iter) == BTREE_ITER_CACHED) -+ return; -+ -+ if (unlikely(btree_node_just_written(b)) && -+ bch2_btree_post_write_cleanup(c, b)) -+ bch2_btree_iter_reinit_node(iter, b); -+ -+ /* -+ * If the last bset has been written, or if it's gotten too big - start -+ * a new bset to insert into: -+ */ -+ if (want_new_bset(c, b)) -+ bch2_btree_init_next(c, b, iter); -+} -+ -+/* Inserting into a given leaf node (last stage of insert): */ -+ -+/* Handle overwrites and do insert, for non extents: */ -+bool bch2_btree_bset_insert_key(struct btree_iter *iter, -+ struct btree *b, -+ struct btree_node_iter *node_iter, -+ struct bkey_i *insert) -+{ -+ struct bkey_packed *k; -+ unsigned clobber_u64s = 0, new_u64s = 0; -+ -+ EBUG_ON(btree_node_just_written(b)); -+ EBUG_ON(bset_written(b, btree_bset_last(b))); -+ EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); -+ 
EBUG_ON(bkey_cmp(b->data->min_key, POS_MIN) && -+ bkey_cmp(bkey_start_pos(&insert->k), -+ bkey_predecessor(b->data->min_key)) < 0); -+ EBUG_ON(bkey_cmp(insert->k.p, b->data->min_key) < 0); -+ EBUG_ON(bkey_cmp(insert->k.p, b->data->max_key) > 0); -+ EBUG_ON(insert->k.u64s > -+ bch_btree_keys_u64s_remaining(iter->trans->c, b)); -+ EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); -+ -+ k = bch2_btree_node_iter_peek_all(node_iter, b); -+ if (k && bkey_cmp_packed(b, k, &insert->k)) -+ k = NULL; -+ -+ /* @k is the key being overwritten/deleted, if any: */ -+ EBUG_ON(k && bkey_whiteout(k)); -+ -+ /* Deleting, but not found? nothing to do: */ -+ if (bkey_whiteout(&insert->k) && !k) -+ return false; -+ -+ if (bkey_whiteout(&insert->k)) { -+ /* Deleting: */ -+ btree_account_key_drop(b, k); -+ k->type = KEY_TYPE_deleted; -+ -+ if (k->needs_whiteout) -+ push_whiteout(iter->trans->c, b, insert->k.p); -+ k->needs_whiteout = false; -+ -+ if (k >= btree_bset_last(b)->start) { -+ clobber_u64s = k->u64s; -+ bch2_bset_delete(b, k, clobber_u64s); -+ goto fix_iter; -+ } else { -+ bch2_btree_iter_fix_key_modified(iter, b, k); -+ } -+ -+ return true; -+ } -+ -+ if (k) { -+ /* Overwriting: */ -+ btree_account_key_drop(b, k); -+ k->type = KEY_TYPE_deleted; -+ -+ insert->k.needs_whiteout = k->needs_whiteout; -+ k->needs_whiteout = false; -+ -+ if (k >= btree_bset_last(b)->start) { -+ clobber_u64s = k->u64s; -+ goto overwrite; -+ } else { -+ bch2_btree_iter_fix_key_modified(iter, b, k); -+ } -+ } -+ -+ k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b)); -+overwrite: -+ bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); -+ new_u64s = k->u64s; -+fix_iter: -+ if (clobber_u64s != new_u64s) -+ bch2_btree_node_iter_fix(iter, b, node_iter, k, -+ clobber_u64s, new_u64s); -+ return true; -+} -+ -+static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, -+ unsigned i, u64 seq) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct 
btree_write *w = container_of(pin, struct btree_write, journal); -+ struct btree *b = container_of(w, struct btree, writes[i]); -+ -+ btree_node_lock_type(c, b, SIX_LOCK_read); -+ bch2_btree_node_write_cond(c, b, -+ (btree_current_write(b) == w && w->journal.seq == seq)); -+ six_unlock_read(&b->c.lock); -+} -+ -+static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) -+{ -+ return __btree_node_flush(j, pin, 0, seq); -+} -+ -+static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) -+{ -+ return __btree_node_flush(j, pin, 1, seq); -+} -+ -+inline void bch2_btree_add_journal_pin(struct bch_fs *c, -+ struct btree *b, u64 seq) -+{ -+ struct btree_write *w = btree_current_write(b); -+ -+ bch2_journal_pin_add(&c->journal, seq, &w->journal, -+ btree_node_write_idx(b) == 0 -+ ? btree_node_flush0 -+ : btree_node_flush1); -+} -+ -+/** -+ * btree_insert_key - insert a key one key into a leaf node -+ */ -+static bool btree_insert_key_leaf(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_i *insert) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree *b = iter_l(iter)->b; -+ struct bset_tree *t = bset_tree_last(b); -+ struct bset *i = bset(b, t); -+ int old_u64s = bset_u64s(t); -+ int old_live_u64s = b->nr.live_u64s; -+ int live_u64s_added, u64s_added; -+ -+ EBUG_ON(!iter->level && -+ !test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)); -+ -+ if (unlikely(!bch2_btree_bset_insert_key(iter, b, -+ &iter_l(iter)->iter, insert))) -+ return false; -+ -+ i->journal_seq = cpu_to_le64(max(trans->journal_res.seq, -+ le64_to_cpu(i->journal_seq))); -+ -+ bch2_btree_add_journal_pin(c, b, trans->journal_res.seq); -+ -+ if (unlikely(!btree_node_dirty(b))) -+ set_btree_node_dirty(b); -+ -+ live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; -+ u64s_added = (int) bset_u64s(t) - old_u64s; -+ -+ if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) -+ b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + 
live_u64s_added); -+ if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) -+ b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); -+ -+ if (u64s_added > live_u64s_added && -+ bch2_maybe_compact_whiteouts(c, b)) -+ bch2_btree_iter_reinit_node(iter, b); -+ -+ trace_btree_insert_key(c, b, insert); -+ return true; -+} -+ -+/* Cached btree updates: */ -+ -+/* Normal update interface: */ -+ -+static inline void btree_insert_entry_checks(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_i *insert) -+{ -+ struct bch_fs *c = trans->c; -+ -+ BUG_ON(bkey_cmp(insert->k.p, iter->pos)); -+ BUG_ON(debug_check_bkeys(c) && -+ bch2_bkey_invalid(c, bkey_i_to_s_c(insert), -+ __btree_node_type(iter->level, iter->btree_id))); -+} -+ -+static noinline int -+bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s) -+{ -+ struct bch_fs *c = trans->c; -+ int ret; -+ -+ bch2_trans_unlock(trans); -+ -+ ret = bch2_journal_preres_get(&c->journal, -+ &trans->journal_preres, u64s, 0); -+ if (ret) -+ return ret; -+ -+ if (!bch2_trans_relock(trans)) { -+ trace_trans_restart_journal_preres_get(trans->ip); -+ return -EINTR; -+ } -+ -+ return 0; -+} -+ -+static inline int bch2_trans_journal_res_get(struct btree_trans *trans, -+ unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ int ret; -+ -+ if (trans->flags & BTREE_INSERT_JOURNAL_RESERVED) -+ flags |= JOURNAL_RES_GET_RESERVED; -+ -+ ret = bch2_journal_res_get(&c->journal, &trans->journal_res, -+ trans->journal_u64s, flags); -+ -+ return ret == -EAGAIN ? 
BTREE_INSERT_NEED_JOURNAL_RES : ret; -+} -+ -+static enum btree_insert_ret -+btree_key_can_insert(struct btree_trans *trans, -+ struct btree_iter *iter, -+ unsigned u64s) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree *b = iter_l(iter)->b; -+ -+ if (!bch2_btree_node_insert_fits(c, b, u64s)) -+ return BTREE_INSERT_BTREE_NODE_FULL; -+ -+ return BTREE_INSERT_OK; -+} -+ -+static enum btree_insert_ret -+btree_key_can_insert_cached(struct btree_trans *trans, -+ struct btree_iter *iter, -+ unsigned u64s) -+{ -+ struct bkey_cached *ck = (void *) iter->l[0].b; -+ unsigned new_u64s; -+ struct bkey_i *new_k; -+ -+ BUG_ON(iter->level); -+ -+ if (u64s <= ck->u64s) -+ return BTREE_INSERT_OK; -+ -+ new_u64s = roundup_pow_of_two(u64s); -+ new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS); -+ if (!new_k) -+ return -ENOMEM; -+ -+ ck->u64s = new_u64s; -+ ck->k = new_k; -+ return BTREE_INSERT_OK; -+} -+ -+static inline void do_btree_insert_one(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_i *insert) -+{ -+ struct bch_fs *c = trans->c; -+ struct journal *j = &c->journal; -+ bool did_work; -+ -+ EBUG_ON(trans->journal_res.ref != -+ !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); -+ -+ insert->k.needs_whiteout = false; -+ -+ did_work = (btree_iter_type(iter) != BTREE_ITER_CACHED) -+ ? 
btree_insert_key_leaf(trans, iter, insert) -+ : bch2_btree_insert_key_cached(trans, iter, insert); -+ if (!did_work) -+ return; -+ -+ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { -+ bch2_journal_add_keys(j, &trans->journal_res, -+ iter->btree_id, insert); -+ -+ bch2_journal_set_has_inode(j, &trans->journal_res, -+ insert->k.p.inode); -+ -+ if (trans->journal_seq) -+ *trans->journal_seq = trans->journal_res.seq; -+ } -+} -+ -+static inline bool iter_has_trans_triggers(struct btree_iter *iter) -+{ -+ return BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << iter->btree_id); -+} -+ -+static inline bool iter_has_nontrans_triggers(struct btree_iter *iter) -+{ -+ return (((BTREE_NODE_TYPE_HAS_TRIGGERS & -+ ~BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS)) | -+ (1U << BTREE_ID_EC)) & -+ (1U << iter->btree_id); -+} -+ -+static noinline void bch2_btree_iter_unlock_noinline(struct btree_iter *iter) -+{ -+ __bch2_btree_iter_unlock(iter); -+} -+ -+static noinline void bch2_trans_mark_gc(struct btree_trans *trans) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_insert_entry *i; -+ -+ trans_for_each_update(trans, i) { -+ /* -+ * XXX: synchronization of cached update triggers with gc -+ */ -+ BUG_ON(btree_iter_type(i->iter) == BTREE_ITER_CACHED); -+ -+ if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) -+ bch2_mark_update(trans, i->iter, i->k, NULL, -+ i->trigger_flags|BTREE_TRIGGER_GC); -+ } -+} -+ -+static inline int -+bch2_trans_commit_write_locked(struct btree_trans *trans, -+ struct btree_insert_entry **stopped_at) -+{ -+ struct bch_fs *c = trans->c; -+ struct bch_fs_usage *fs_usage = NULL; -+ struct btree_insert_entry *i; -+ unsigned u64s = 0; -+ bool marking = false; -+ int ret; -+ -+ if (race_fault()) { -+ trace_trans_restart_fault_inject(trans->ip); -+ return -EINTR; -+ } -+ -+ /* -+ * Check if the insert will fit in the leaf node with the write lock -+ * held, otherwise another thread could write the node changing the -+ * amount of space available: -+ */ -+ -+ 
prefetch(&trans->c->journal.flags); -+ -+ trans_for_each_update2(trans, i) { -+ /* Multiple inserts might go to same leaf: */ -+ if (!same_leaf_as_prev(trans, i)) -+ u64s = 0; -+ -+ u64s += i->k->k.u64s; -+ ret = btree_iter_type(i->iter) != BTREE_ITER_CACHED -+ ? btree_key_can_insert(trans, i->iter, u64s) -+ : btree_key_can_insert_cached(trans, i->iter, u64s); -+ if (ret) { -+ *stopped_at = i; -+ return ret; -+ } -+ -+ if (btree_node_type_needs_gc(i->iter->btree_id)) -+ marking = true; -+ } -+ -+ if (marking) { -+ percpu_down_read(&c->mark_lock); -+ fs_usage = bch2_fs_usage_scratch_get(c); -+ } -+ -+ /* -+ * Don't get journal reservation until after we know insert will -+ * succeed: -+ */ -+ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { -+ ret = bch2_trans_journal_res_get(trans, -+ JOURNAL_RES_GET_NONBLOCK); -+ if (ret) -+ goto err; -+ } else { -+ trans->journal_res.seq = c->journal.replay_journal_seq; -+ } -+ -+ if (unlikely(trans->extra_journal_entry_u64s)) { -+ memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), -+ trans->extra_journal_entries, -+ trans->extra_journal_entry_u64s); -+ -+ trans->journal_res.offset += trans->extra_journal_entry_u64s; -+ trans->journal_res.u64s -= trans->extra_journal_entry_u64s; -+ } -+ -+ /* -+ * Not allowed to fail after we've gotten our journal reservation - we -+ * have to use it: -+ */ -+ -+ if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { -+ if (journal_seq_verify(c)) -+ trans_for_each_update2(trans, i) -+ i->k->k.version.lo = trans->journal_res.seq; -+ else if (inject_invalid_keys(c)) -+ trans_for_each_update2(trans, i) -+ i->k->k.version = MAX_VERSION; -+ } -+ -+ /* Must be called under mark_lock: */ -+ if (marking && trans->fs_usage_deltas && -+ bch2_replicas_delta_list_apply(c, fs_usage, -+ trans->fs_usage_deltas)) { -+ ret = BTREE_INSERT_NEED_MARK_REPLICAS; -+ goto err; -+ } -+ -+ trans_for_each_update(trans, i) -+ if (iter_has_nontrans_triggers(i->iter)) -+ bch2_mark_update(trans, 
i->iter, i->k, -+ fs_usage, i->trigger_flags); -+ -+ if (marking) -+ bch2_trans_fs_usage_apply(trans, fs_usage); -+ -+ if (unlikely(c->gc_pos.phase)) -+ bch2_trans_mark_gc(trans); -+ -+ trans_for_each_update2(trans, i) -+ do_btree_insert_one(trans, i->iter, i->k); -+err: -+ if (marking) { -+ bch2_fs_usage_scratch_put(c, fs_usage); -+ percpu_up_read(&c->mark_lock); -+ } -+ -+ return ret; -+} -+ -+/* -+ * Get journal reservation, take write locks, and attempt to do btree update(s): -+ */ -+static inline int do_bch2_trans_commit(struct btree_trans *trans, -+ struct btree_insert_entry **stopped_at) -+{ -+ struct btree_insert_entry *i; -+ struct btree_iter *iter; -+ int ret; -+ -+ trans_for_each_update2(trans, i) -+ BUG_ON(!btree_node_intent_locked(i->iter, i->iter->level)); -+ -+ ret = bch2_journal_preres_get(&trans->c->journal, -+ &trans->journal_preres, trans->journal_preres_u64s, -+ JOURNAL_RES_GET_NONBLOCK| -+ ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) -+ ? JOURNAL_RES_GET_RECLAIM : 0)); -+ if (unlikely(ret == -EAGAIN)) -+ ret = bch2_trans_journal_preres_get_cold(trans, -+ trans->journal_preres_u64s); -+ if (unlikely(ret)) -+ return ret; -+ -+ /* -+ * Can't be holding any read locks when we go to take write locks: -+ * -+ * note - this must be done after bch2_trans_journal_preres_get_cold() -+ * or anything else that might call bch2_trans_relock(), since that -+ * would just retake the read locks: -+ */ -+ trans_for_each_iter(trans, iter) { -+ if (iter->nodes_locked != iter->nodes_intent_locked) { -+ EBUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); -+ EBUG_ON(trans->iters_live & (1ULL << iter->idx)); -+ bch2_btree_iter_unlock_noinline(iter); -+ } -+ } -+ -+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) -+ trans_for_each_update2(trans, i) -+ btree_insert_entry_checks(trans, i->iter, i->k); -+ bch2_btree_trans_verify_locks(trans); -+ -+ trans_for_each_update2(trans, i) -+ if (!same_leaf_as_prev(trans, i)) -+ bch2_btree_node_lock_for_insert(trans->c, -+ 
iter_l(i->iter)->b, i->iter); -+ -+ ret = bch2_trans_commit_write_locked(trans, stopped_at); -+ -+ trans_for_each_update2(trans, i) -+ if (!same_leaf_as_prev(trans, i)) -+ bch2_btree_node_unlock_write_inlined(iter_l(i->iter)->b, -+ i->iter); -+ -+ if (!ret && trans->journal_pin) -+ bch2_journal_pin_add(&trans->c->journal, trans->journal_res.seq, -+ trans->journal_pin, NULL); -+ -+ /* -+ * Drop journal reservation after dropping write locks, since dropping -+ * the journal reservation may kick off a journal write: -+ */ -+ bch2_journal_res_put(&trans->c->journal, &trans->journal_res); -+ -+ if (unlikely(ret)) -+ return ret; -+ -+ if (trans->flags & BTREE_INSERT_NOUNLOCK) -+ trans->nounlock = true; -+ -+ trans_for_each_update2(trans, i) -+ if (btree_iter_type(i->iter) != BTREE_ITER_CACHED && -+ !same_leaf_as_prev(trans, i)) -+ bch2_foreground_maybe_merge(trans->c, i->iter, -+ 0, trans->flags); -+ -+ trans->nounlock = false; -+ -+ bch2_trans_downgrade(trans); -+ -+ return 0; -+} -+ -+static noinline -+int bch2_trans_commit_error(struct btree_trans *trans, -+ struct btree_insert_entry *i, -+ int ret) -+{ -+ struct bch_fs *c = trans->c; -+ unsigned flags = trans->flags; -+ -+ /* -+ * BTREE_INSERT_NOUNLOCK means don't unlock _after_ successful btree -+ * update; if we haven't done anything yet it doesn't apply -+ */ -+ flags &= ~BTREE_INSERT_NOUNLOCK; -+ -+ switch (ret) { -+ case BTREE_INSERT_BTREE_NODE_FULL: -+ ret = bch2_btree_split_leaf(c, i->iter, flags); -+ -+ /* -+ * if the split succeeded without dropping locks the insert will -+ * still be atomic (what the caller peeked() and is overwriting -+ * won't have changed) -+ */ -+#if 0 -+ /* -+ * XXX: -+ * split -> btree node merging (of parent node) might still drop -+ * locks when we're not passing it BTREE_INSERT_NOUNLOCK -+ * -+ * we don't want to pass BTREE_INSERT_NOUNLOCK to split as that -+ * will inhibit merging - but we don't have a reliable way yet -+ * (do we?) 
of checking if we dropped locks in this path -+ */ -+ if (!ret) -+ goto retry; -+#endif -+ -+ /* -+ * don't care if we got ENOSPC because we told split it -+ * couldn't block: -+ */ -+ if (!ret || -+ ret == -EINTR || -+ (flags & BTREE_INSERT_NOUNLOCK)) { -+ trace_trans_restart_btree_node_split(trans->ip); -+ ret = -EINTR; -+ } -+ break; -+ case BTREE_INSERT_ENOSPC: -+ ret = -ENOSPC; -+ break; -+ case BTREE_INSERT_NEED_MARK_REPLICAS: -+ bch2_trans_unlock(trans); -+ -+ trans_for_each_update(trans, i) { -+ ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(i->k)); -+ if (ret) -+ return ret; -+ } -+ -+ if (bch2_trans_relock(trans)) -+ return 0; -+ -+ trace_trans_restart_mark_replicas(trans->ip); -+ ret = -EINTR; -+ break; -+ case BTREE_INSERT_NEED_JOURNAL_RES: -+ bch2_trans_unlock(trans); -+ -+ ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_CHECK); -+ if (ret) -+ return ret; -+ -+ if (bch2_trans_relock(trans)) -+ return 0; -+ -+ trace_trans_restart_journal_res_get(trans->ip); -+ ret = -EINTR; -+ break; -+ default: -+ BUG_ON(ret >= 0); -+ break; -+ } -+ -+ if (ret == -EINTR) { -+ int ret2 = bch2_btree_iter_traverse_all(trans); -+ -+ if (ret2) { -+ trace_trans_restart_traverse(trans->ip); -+ return ret2; -+ } -+ -+ trace_trans_restart_atomic(trans->ip); -+ } -+ -+ return ret; -+} -+ -+static noinline int -+bch2_trans_commit_get_rw_cold(struct btree_trans *trans) -+{ -+ struct bch_fs *c = trans->c; -+ int ret; -+ -+ if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW))) -+ return -EROFS; -+ -+ bch2_trans_unlock(trans); -+ -+ ret = bch2_fs_read_write_early(c); -+ if (ret) -+ return ret; -+ -+ percpu_ref_get(&c->writes); -+ return 0; -+} -+ -+static void bch2_trans_update2(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_i *insert) -+{ -+ struct btree_insert_entry *i, n = (struct btree_insert_entry) { -+ .iter = iter, .k = insert -+ }; -+ -+ btree_insert_entry_checks(trans, n.iter, n.k); -+ -+ BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); -+ -+ 
EBUG_ON(trans->nr_updates2 >= trans->nr_iters); -+ -+ iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; -+ -+ trans_for_each_update2(trans, i) { -+ if (btree_iter_cmp(n.iter, i->iter) == 0) { -+ *i = n; -+ return; -+ } -+ -+ if (btree_iter_cmp(n.iter, i->iter) <= 0) -+ break; -+ } -+ -+ array_insert_item(trans->updates2, trans->nr_updates2, -+ i - trans->updates2, n); -+} -+ -+static int extent_update_to_keys(struct btree_trans *trans, -+ struct btree_iter *orig_iter, -+ struct bkey_i *insert) -+{ -+ struct btree_iter *iter; -+ int ret; -+ -+ ret = bch2_extent_can_insert(trans, orig_iter, insert); -+ if (ret) -+ return ret; -+ -+ if (bkey_deleted(&insert->k)) -+ return 0; -+ -+ iter = bch2_trans_copy_iter(trans, orig_iter); -+ if (IS_ERR(iter)) -+ return PTR_ERR(iter); -+ -+ iter->flags |= BTREE_ITER_INTENT; -+ __bch2_btree_iter_set_pos(iter, insert->k.p, false); -+ bch2_trans_update2(trans, iter, insert); -+ bch2_trans_iter_put(trans, iter); -+ return 0; -+} -+ -+static int extent_handle_overwrites(struct btree_trans *trans, -+ enum btree_id btree_id, -+ struct bpos start, struct bpos end) -+{ -+ struct btree_iter *iter = NULL, *update_iter; -+ struct bkey_i *update; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ iter = bch2_trans_get_iter(trans, btree_id, start, BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(iter); -+ if (ret) -+ return ret; -+ -+ k = bch2_btree_iter_peek_with_updates(iter); -+ -+ while (k.k && !(ret = bkey_err(k))) { -+ if (bkey_cmp(end, bkey_start_pos(k.k)) <= 0) -+ break; -+ -+ if (bkey_cmp(bkey_start_pos(k.k), start) < 0) { -+ update_iter = bch2_trans_copy_iter(trans, iter); -+ if ((ret = PTR_ERR_OR_ZERO(update_iter))) -+ goto err; -+ -+ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); -+ if ((ret = PTR_ERR_OR_ZERO(update))) -+ goto err; -+ -+ bkey_reassemble(update, k); -+ bch2_cut_back(start, update); -+ -+ __bch2_btree_iter_set_pos(update_iter, update->k.p, false); -+ bch2_trans_update2(trans, update_iter, update); -+ bch2_trans_iter_put(trans, 
update_iter); -+ } -+ -+ if (bkey_cmp(k.k->p, end) > 0) { -+ update_iter = bch2_trans_copy_iter(trans, iter); -+ if ((ret = PTR_ERR_OR_ZERO(update_iter))) -+ goto err; -+ -+ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); -+ if ((ret = PTR_ERR_OR_ZERO(update))) -+ goto err; -+ -+ bkey_reassemble(update, k); -+ bch2_cut_front(end, update); -+ -+ __bch2_btree_iter_set_pos(update_iter, update->k.p, false); -+ bch2_trans_update2(trans, update_iter, update); -+ bch2_trans_iter_put(trans, update_iter); -+ } else { -+ update_iter = bch2_trans_copy_iter(trans, iter); -+ if ((ret = PTR_ERR_OR_ZERO(update_iter))) -+ goto err; -+ -+ update = bch2_trans_kmalloc(trans, sizeof(struct bkey)); -+ if ((ret = PTR_ERR_OR_ZERO(update))) -+ goto err; -+ -+ update->k = *k.k; -+ set_bkey_val_u64s(&update->k, 0); -+ update->k.type = KEY_TYPE_deleted; -+ update->k.size = 0; -+ -+ __bch2_btree_iter_set_pos(update_iter, update->k.p, false); -+ bch2_trans_update2(trans, update_iter, update); -+ bch2_trans_iter_put(trans, update_iter); -+ } -+ -+ k = bch2_btree_iter_next_with_updates(iter); -+ } -+err: -+ if (!IS_ERR_OR_NULL(iter)) -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+int __bch2_trans_commit(struct btree_trans *trans) -+{ -+ struct btree_insert_entry *i = NULL; -+ struct btree_iter *iter; -+ bool trans_trigger_run; -+ unsigned u64s; -+ int ret = 0; -+ -+ BUG_ON(trans->need_reset); -+ -+ if (!trans->nr_updates) -+ goto out_noupdates; -+ -+ if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) -+ lockdep_assert_held(&trans->c->gc_lock); -+ -+ memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); -+ -+ trans->journal_u64s = trans->extra_journal_entry_u64s; -+ trans->journal_preres_u64s = 0; -+ -+ if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) && -+ unlikely(!percpu_ref_tryget(&trans->c->writes))) { -+ ret = bch2_trans_commit_get_rw_cold(trans); -+ if (ret) -+ return ret; -+ } -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ trans_for_each_update(trans, i) -+ if 
(btree_iter_type(i->iter) != BTREE_ITER_CACHED && -+ !(i->trigger_flags & BTREE_TRIGGER_NORUN)) -+ bch2_btree_key_cache_verify_clean(trans, -+ i->iter->btree_id, i->iter->pos); -+#endif -+ -+ /* -+ * Running triggers will append more updates to the list of updates as -+ * we're walking it: -+ */ -+ do { -+ trans_trigger_run = false; -+ -+ trans_for_each_update(trans, i) { -+ if (unlikely(i->iter->uptodate > BTREE_ITER_NEED_PEEK && -+ (ret = bch2_btree_iter_traverse(i->iter)))) { -+ trace_trans_restart_traverse(trans->ip); -+ goto out; -+ } -+ -+ /* -+ * We're not using bch2_btree_iter_upgrade here because -+ * we know trans->nounlock can't be set: -+ */ -+ if (unlikely(i->iter->locks_want < 1 && -+ !__bch2_btree_iter_upgrade(i->iter, 1))) { -+ trace_trans_restart_upgrade(trans->ip); -+ ret = -EINTR; -+ goto out; -+ } -+ -+ if (iter_has_trans_triggers(i->iter) && -+ !i->trans_triggers_run) { -+ i->trans_triggers_run = true; -+ trans_trigger_run = true; -+ -+ ret = bch2_trans_mark_update(trans, i->iter, i->k, -+ i->trigger_flags); -+ if (unlikely(ret)) { -+ if (ret == -EINTR) -+ trace_trans_restart_mark(trans->ip); -+ goto out; -+ } -+ } -+ } -+ } while (trans_trigger_run); -+ -+ /* Turn extents updates into keys: */ -+ trans_for_each_update(trans, i) -+ if (i->iter->flags & BTREE_ITER_IS_EXTENTS) { -+ struct bpos start = bkey_start_pos(&i->k->k); -+ -+ while (i + 1 < trans->updates + trans->nr_updates && -+ i[0].iter->btree_id == i[1].iter->btree_id && -+ !bkey_cmp(i[0].k->k.p, bkey_start_pos(&i[1].k->k))) -+ i++; -+ -+ ret = extent_handle_overwrites(trans, i->iter->btree_id, -+ start, i->k->k.p); -+ if (ret) -+ goto out; -+ } -+ -+ trans_for_each_update(trans, i) { -+ if (i->iter->flags & BTREE_ITER_IS_EXTENTS) { -+ ret = extent_update_to_keys(trans, i->iter, i->k); -+ if (ret) -+ goto out; -+ } else { -+ bch2_trans_update2(trans, i->iter, i->k); -+ } -+ } -+ -+ trans_for_each_update2(trans, i) { -+ BUG_ON(i->iter->uptodate > BTREE_ITER_NEED_PEEK); -+ 
BUG_ON(i->iter->locks_want < 1); -+ -+ u64s = jset_u64s(i->k->k.u64s); -+ if (btree_iter_type(i->iter) == BTREE_ITER_CACHED && -+ likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) -+ trans->journal_preres_u64s += u64s; -+ trans->journal_u64s += u64s; -+ } -+retry: -+ memset(&trans->journal_res, 0, sizeof(trans->journal_res)); -+ -+ ret = do_bch2_trans_commit(trans, &i); -+ -+ /* make sure we didn't drop or screw up locks: */ -+ bch2_btree_trans_verify_locks(trans); -+ -+ if (ret) -+ goto err; -+ -+ trans_for_each_iter(trans, iter) -+ if ((trans->iters_live & (1ULL << iter->idx)) && -+ (iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT)) { -+ if (trans->flags & BTREE_INSERT_NOUNLOCK) -+ bch2_btree_iter_set_pos_same_leaf(iter, iter->pos_after_commit); -+ else -+ bch2_btree_iter_set_pos(iter, iter->pos_after_commit); -+ } -+out: -+ bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); -+ -+ if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) -+ percpu_ref_put(&trans->c->writes); -+out_noupdates: -+ bch2_trans_reset(trans, !ret ? TRANS_RESET_NOTRAVERSE : 0); -+ -+ return ret; -+err: -+ ret = bch2_trans_commit_error(trans, i, ret); -+ if (ret) -+ goto out; -+ -+ goto retry; -+} -+ -+int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, -+ struct bkey_i *k, enum btree_trigger_flags flags) -+{ -+ struct btree_insert_entry *i, n = (struct btree_insert_entry) { -+ .trigger_flags = flags, .iter = iter, .k = k -+ }; -+ -+ EBUG_ON(bkey_cmp(iter->pos, -+ (iter->flags & BTREE_ITER_IS_EXTENTS) -+ ? 
bkey_start_pos(&k->k) -+ : k->k.p)); -+ -+ iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; -+ -+ if (btree_node_type_is_extents(iter->btree_id)) { -+ iter->pos_after_commit = k->k.p; -+ iter->flags |= BTREE_ITER_SET_POS_AFTER_COMMIT; -+ } -+ -+ /* -+ * Pending updates are kept sorted: first, find position of new update: -+ */ -+ trans_for_each_update(trans, i) -+ if (btree_iter_cmp(iter, i->iter) <= 0) -+ break; -+ -+ /* -+ * Now delete/trim any updates the new update overwrites: -+ */ -+ if (i > trans->updates && -+ i[-1].iter->btree_id == iter->btree_id && -+ bkey_cmp(iter->pos, i[-1].k->k.p) < 0) -+ bch2_cut_back(n.iter->pos, i[-1].k); -+ -+ while (i < trans->updates + trans->nr_updates && -+ iter->btree_id == i->iter->btree_id && -+ bkey_cmp(n.k->k.p, i->k->k.p) >= 0) -+ array_remove_item(trans->updates, trans->nr_updates, -+ i - trans->updates); -+ -+ if (i < trans->updates + trans->nr_updates && -+ iter->btree_id == i->iter->btree_id && -+ bkey_cmp(n.k->k.p, i->iter->pos) > 0) { -+ /* -+ * When we have an extent that overwrites the start of another -+ * update, trimming that extent will mean the iterator's -+ * position has to change since the iterator position has to -+ * match the extent's start pos - but we don't want to change -+ * the iterator pos if some other code is using it, so we may -+ * need to clone it: -+ */ -+ if (trans->iters_live & (1ULL << i->iter->idx)) { -+ i->iter = bch2_trans_copy_iter(trans, i->iter); -+ if (IS_ERR(i->iter)) { -+ trans->need_reset = true; -+ return PTR_ERR(i->iter); -+ } -+ -+ i->iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; -+ bch2_trans_iter_put(trans, i->iter); -+ } -+ -+ bch2_cut_front(n.k->k.p, i->k); -+ bch2_btree_iter_set_pos(i->iter, n.k->k.p); -+ } -+ -+ EBUG_ON(trans->nr_updates >= trans->nr_iters); -+ -+ array_insert_item(trans->updates, trans->nr_updates, -+ i - trans->updates, n); -+ return 0; -+} -+ -+int __bch2_btree_insert(struct btree_trans *trans, -+ enum btree_id id, struct bkey_i *k) -+{ -+ struct 
btree_iter *iter; -+ int ret; -+ -+ iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k), -+ BTREE_ITER_INTENT); -+ if (IS_ERR(iter)) -+ return PTR_ERR(iter); -+ -+ ret = bch2_btree_iter_traverse(iter) ?: -+ bch2_trans_update(trans, iter, k, 0); -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+/** -+ * bch2_btree_insert - insert keys into the extent btree -+ * @c: pointer to struct bch_fs -+ * @id: btree to insert into -+ * @insert_keys: list of keys to insert -+ * @hook: insert callback -+ */ -+int bch2_btree_insert(struct bch_fs *c, enum btree_id id, -+ struct bkey_i *k, -+ struct disk_reservation *disk_res, -+ u64 *journal_seq, int flags) -+{ -+ return bch2_trans_do(c, disk_res, journal_seq, flags, -+ __bch2_btree_insert(&trans, id, k)); -+} -+ -+int bch2_btree_delete_at_range(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bpos end, -+ u64 *journal_seq) -+{ -+ struct bkey_s_c k; -+ int ret = 0; -+retry: -+ while ((k = bch2_btree_iter_peek(iter)).k && -+ !(ret = bkey_err(k)) && -+ bkey_cmp(iter->pos, end) < 0) { -+ struct bkey_i delete; -+ -+ bch2_trans_begin(trans); -+ -+ bkey_init(&delete.k); -+ -+ /* -+ * For extents, iter.pos won't necessarily be the same as -+ * bkey_start_pos(k.k) (for non extents they always will be the -+ * same). It's important that we delete starting from iter.pos -+ * because the range we want to delete could start in the middle -+ * of k. -+ * -+ * (bch2_btree_iter_peek() does guarantee that iter.pos >= -+ * bkey_start_pos(k.k)). 
-+ */ -+ delete.k.p = iter->pos; -+ -+ if (btree_node_type_is_extents(iter->btree_id)) { -+ unsigned max_sectors = -+ KEY_SIZE_MAX & (~0 << trans->c->block_bits); -+ -+ /* create the biggest key we can */ -+ bch2_key_resize(&delete.k, max_sectors); -+ bch2_cut_back(end, &delete); -+ -+ ret = bch2_extent_trim_atomic(&delete, iter); -+ if (ret) -+ break; -+ } -+ -+ bch2_trans_update(trans, iter, &delete, 0); -+ ret = bch2_trans_commit(trans, NULL, journal_seq, -+ BTREE_INSERT_NOFAIL); -+ if (ret) -+ break; -+ -+ bch2_trans_cond_resched(trans); -+ } -+ -+ if (ret == -EINTR) { -+ ret = 0; -+ goto retry; -+ } -+ -+ return ret; -+ -+} -+ -+int bch2_btree_delete_at(struct btree_trans *trans, -+ struct btree_iter *iter, unsigned flags) -+{ -+ struct bkey_i k; -+ -+ bkey_init(&k.k); -+ k.k.p = iter->pos; -+ -+ bch2_trans_update(trans, iter, &k, 0); -+ return bch2_trans_commit(trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_USE_RESERVE|flags); -+} -+ -+/* -+ * bch_btree_delete_range - delete everything within a given range -+ * -+ * Range is a half open interval - [start, end) -+ */ -+int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, -+ struct bpos start, struct bpos end, -+ u64 *journal_seq) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ int ret = 0; -+ -+ /* -+ * XXX: whether we need mem/more iters depends on whether this btree id -+ * has triggers -+ */ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512); -+ -+ iter = bch2_trans_get_iter(&trans, id, start, BTREE_ITER_INTENT); -+ -+ ret = bch2_btree_delete_at_range(&trans, iter, end, journal_seq); -+ ret = bch2_trans_exit(&trans) ?: ret; -+ -+ BUG_ON(ret == -EINTR); -+ return ret; -+} -diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c -new file mode 100644 -index 000000000000..2a3b95968a86 ---- /dev/null -+++ b/fs/bcachefs/buckets.c -@@ -0,0 +1,2230 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Code for manipulating bucket marks for garbage collection. 
-+ * -+ * Copyright 2014 Datera, Inc. -+ * -+ * Bucket states: -+ * - free bucket: mark == 0 -+ * The bucket contains no data and will not be read -+ * -+ * - allocator bucket: owned_by_allocator == 1 -+ * The bucket is on a free list, or it is an open bucket -+ * -+ * - cached bucket: owned_by_allocator == 0 && -+ * dirty_sectors == 0 && -+ * cached_sectors > 0 -+ * The bucket contains data but may be safely discarded as there are -+ * enough replicas of the data on other cache devices, or it has been -+ * written back to the backing device -+ * -+ * - dirty bucket: owned_by_allocator == 0 && -+ * dirty_sectors > 0 -+ * The bucket contains data that we must not discard (either only copy, -+ * or one of the 'main copies' for data requiring multiple replicas) -+ * -+ * - metadata bucket: owned_by_allocator == 0 && is_metadata == 1 -+ * This is a btree node, journal or gen/prio bucket -+ * -+ * Lifecycle: -+ * -+ * bucket invalidated => bucket on freelist => open bucket => -+ * [dirty bucket =>] cached bucket => bucket invalidated => ... -+ * -+ * Note that cache promotion can skip the dirty bucket step, as data -+ * is copied from a deeper tier to a shallower tier, onto a cached -+ * bucket. -+ * Note also that a cached bucket can spontaneously become dirty -- -+ * see below. -+ * -+ * Only a traversal of the key space can determine whether a bucket is -+ * truly dirty or cached. 
-+ * -+ * Transitions: -+ * -+ * - free => allocator: bucket was invalidated -+ * - cached => allocator: bucket was invalidated -+ * -+ * - allocator => dirty: open bucket was filled up -+ * - allocator => cached: open bucket was filled up -+ * - allocator => metadata: metadata was allocated -+ * -+ * - dirty => cached: dirty sectors were copied to a deeper tier -+ * - dirty => free: dirty sectors were overwritten or moved (copy gc) -+ * - cached => free: cached sectors were overwritten -+ * -+ * - metadata => free: metadata was freed -+ * -+ * Oddities: -+ * - cached => dirty: a device was removed so formerly replicated data -+ * is no longer sufficiently replicated -+ * - free => cached: cannot happen -+ * - free => dirty: cannot happen -+ * - free => metadata: cannot happen -+ */ -+ -+#include "bcachefs.h" -+#include "alloc_background.h" -+#include "bset.h" -+#include "btree_gc.h" -+#include "btree_update.h" -+#include "buckets.h" -+#include "ec.h" -+#include "error.h" -+#include "movinggc.h" -+#include "replicas.h" -+ -+#include -+#include -+ -+/* -+ * Clear journal_seq_valid for buckets for which it's not needed, to prevent -+ * wraparound: -+ */ -+void bch2_bucket_seq_cleanup(struct bch_fs *c) -+{ -+ u64 journal_seq = atomic64_read(&c->journal.seq); -+ u16 last_seq_ondisk = c->journal.last_seq_ondisk; -+ struct bch_dev *ca; -+ struct bucket_array *buckets; -+ struct bucket *g; -+ struct bucket_mark m; -+ unsigned i; -+ -+ if (journal_seq - c->last_bucket_seq_cleanup < -+ (1U << (BUCKET_JOURNAL_SEQ_BITS - 2))) -+ return; -+ -+ c->last_bucket_seq_cleanup = journal_seq; -+ -+ for_each_member_device(ca, c, i) { -+ down_read(&ca->bucket_lock); -+ buckets = bucket_array(ca); -+ -+ for_each_bucket(g, buckets) { -+ bucket_cmpxchg(g, m, ({ -+ if (!m.journal_seq_valid || -+ bucket_needs_journal_commit(m, last_seq_ondisk)) -+ break; -+ -+ m.journal_seq_valid = 0; -+ })); -+ } -+ up_read(&ca->bucket_lock); -+ } -+} -+ -+void bch2_fs_usage_initialize(struct bch_fs *c) -+{ 
-+ struct bch_fs_usage *usage; -+ unsigned i; -+ -+ percpu_down_write(&c->mark_lock); -+ usage = c->usage_base; -+ -+ bch2_fs_usage_acc_to_base(c, 0); -+ bch2_fs_usage_acc_to_base(c, 1); -+ -+ for (i = 0; i < BCH_REPLICAS_MAX; i++) -+ usage->reserved += usage->persistent_reserved[i]; -+ -+ for (i = 0; i < c->replicas.nr; i++) { -+ struct bch_replicas_entry *e = -+ cpu_replicas_entry(&c->replicas, i); -+ -+ switch (e->data_type) { -+ case BCH_DATA_btree: -+ usage->btree += usage->replicas[i]; -+ break; -+ case BCH_DATA_user: -+ usage->data += usage->replicas[i]; -+ break; -+ case BCH_DATA_cached: -+ usage->cached += usage->replicas[i]; -+ break; -+ } -+ } -+ -+ percpu_up_write(&c->mark_lock); -+} -+ -+void bch2_fs_usage_scratch_put(struct bch_fs *c, struct bch_fs_usage *fs_usage) -+{ -+ if (fs_usage == c->usage_scratch) -+ mutex_unlock(&c->usage_scratch_lock); -+ else -+ kfree(fs_usage); -+} -+ -+struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *c) -+{ -+ struct bch_fs_usage *ret; -+ unsigned bytes = fs_usage_u64s(c) * sizeof(u64); -+ -+ ret = kzalloc(bytes, GFP_NOWAIT|__GFP_NOWARN); -+ if (ret) -+ return ret; -+ -+ if (mutex_trylock(&c->usage_scratch_lock)) -+ goto out_pool; -+ -+ ret = kzalloc(bytes, GFP_NOFS); -+ if (ret) -+ return ret; -+ -+ mutex_lock(&c->usage_scratch_lock); -+out_pool: -+ ret = c->usage_scratch; -+ memset(ret, 0, bytes); -+ return ret; -+} -+ -+struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) -+{ -+ struct bch_dev_usage ret; -+ -+ memset(&ret, 0, sizeof(ret)); -+ acc_u64s_percpu((u64 *) &ret, -+ (u64 __percpu *) ca->usage[0], -+ sizeof(ret) / sizeof(u64)); -+ -+ return ret; -+} -+ -+static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, -+ unsigned journal_seq, -+ bool gc) -+{ -+ return this_cpu_ptr(gc -+ ? 
c->usage_gc -+ : c->usage[journal_seq & 1]); -+} -+ -+u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) -+{ -+ ssize_t offset = v - (u64 *) c->usage_base; -+ unsigned seq; -+ u64 ret; -+ -+ BUG_ON(offset < 0 || offset >= fs_usage_u64s(c)); -+ percpu_rwsem_assert_held(&c->mark_lock); -+ -+ do { -+ seq = read_seqcount_begin(&c->usage_lock); -+ ret = *v + -+ percpu_u64_get((u64 __percpu *) c->usage[0] + offset) + -+ percpu_u64_get((u64 __percpu *) c->usage[1] + offset); -+ } while (read_seqcount_retry(&c->usage_lock, seq)); -+ -+ return ret; -+} -+ -+struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c) -+{ -+ struct bch_fs_usage *ret; -+ unsigned seq, v, u64s = fs_usage_u64s(c); -+retry: -+ ret = kmalloc(u64s * sizeof(u64), GFP_NOFS); -+ if (unlikely(!ret)) -+ return NULL; -+ -+ percpu_down_read(&c->mark_lock); -+ -+ v = fs_usage_u64s(c); -+ if (unlikely(u64s != v)) { -+ u64s = v; -+ percpu_up_read(&c->mark_lock); -+ kfree(ret); -+ goto retry; -+ } -+ -+ do { -+ seq = read_seqcount_begin(&c->usage_lock); -+ memcpy(ret, c->usage_base, u64s * sizeof(u64)); -+ acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[0], u64s); -+ acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[1], u64s); -+ } while (read_seqcount_retry(&c->usage_lock, seq)); -+ -+ return ret; -+} -+ -+void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) -+{ -+ unsigned u64s = fs_usage_u64s(c); -+ -+ BUG_ON(idx >= 2); -+ -+ preempt_disable(); -+ write_seqcount_begin(&c->usage_lock); -+ -+ acc_u64s_percpu((u64 *) c->usage_base, -+ (u64 __percpu *) c->usage[idx], u64s); -+ percpu_memset(c->usage[idx], 0, u64s * sizeof(u64)); -+ -+ write_seqcount_end(&c->usage_lock); -+ preempt_enable(); -+} -+ -+void bch2_fs_usage_to_text(struct printbuf *out, -+ struct bch_fs *c, -+ struct bch_fs_usage *fs_usage) -+{ -+ unsigned i; -+ -+ pr_buf(out, "capacity:\t\t\t%llu\n", c->capacity); -+ -+ pr_buf(out, "hidden:\t\t\t\t%llu\n", -+ fs_usage->hidden); -+ pr_buf(out, "data:\t\t\t\t%llu\n", -+ 
fs_usage->data); -+ pr_buf(out, "cached:\t\t\t\t%llu\n", -+ fs_usage->cached); -+ pr_buf(out, "reserved:\t\t\t%llu\n", -+ fs_usage->reserved); -+ pr_buf(out, "nr_inodes:\t\t\t%llu\n", -+ fs_usage->nr_inodes); -+ pr_buf(out, "online reserved:\t\t%llu\n", -+ fs_usage->online_reserved); -+ -+ for (i = 0; -+ i < ARRAY_SIZE(fs_usage->persistent_reserved); -+ i++) { -+ pr_buf(out, "%u replicas:\n", i + 1); -+ pr_buf(out, "\treserved:\t\t%llu\n", -+ fs_usage->persistent_reserved[i]); -+ } -+ -+ for (i = 0; i < c->replicas.nr; i++) { -+ struct bch_replicas_entry *e = -+ cpu_replicas_entry(&c->replicas, i); -+ -+ pr_buf(out, "\t"); -+ bch2_replicas_entry_to_text(out, e); -+ pr_buf(out, ":\t%llu\n", fs_usage->replicas[i]); -+ } -+} -+ -+#define RESERVE_FACTOR 6 -+ -+static u64 reserve_factor(u64 r) -+{ -+ return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR); -+} -+ -+static u64 avail_factor(u64 r) -+{ -+ return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1); -+} -+ -+u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage *fs_usage) -+{ -+ return min(fs_usage->hidden + -+ fs_usage->btree + -+ fs_usage->data + -+ reserve_factor(fs_usage->reserved + -+ fs_usage->online_reserved), -+ c->capacity); -+} -+ -+static struct bch_fs_usage_short -+__bch2_fs_usage_read_short(struct bch_fs *c) -+{ -+ struct bch_fs_usage_short ret; -+ u64 data, reserved; -+ -+ ret.capacity = c->capacity - -+ bch2_fs_usage_read_one(c, &c->usage_base->hidden); -+ -+ data = bch2_fs_usage_read_one(c, &c->usage_base->data) + -+ bch2_fs_usage_read_one(c, &c->usage_base->btree); -+ reserved = bch2_fs_usage_read_one(c, &c->usage_base->reserved) + -+ bch2_fs_usage_read_one(c, &c->usage_base->online_reserved); -+ -+ ret.used = min(ret.capacity, data + reserve_factor(reserved)); -+ ret.free = ret.capacity - ret.used; -+ -+ ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->nr_inodes); -+ -+ return ret; -+} -+ -+struct bch_fs_usage_short -+bch2_fs_usage_read_short(struct bch_fs *c) 
-+{ -+ struct bch_fs_usage_short ret; -+ -+ percpu_down_read(&c->mark_lock); -+ ret = __bch2_fs_usage_read_short(c); -+ percpu_up_read(&c->mark_lock); -+ -+ return ret; -+} -+ -+static inline int is_unavailable_bucket(struct bucket_mark m) -+{ -+ return !is_available_bucket(m); -+} -+ -+static inline int is_fragmented_bucket(struct bucket_mark m, -+ struct bch_dev *ca) -+{ -+ if (!m.owned_by_allocator && -+ m.data_type == BCH_DATA_user && -+ bucket_sectors_used(m)) -+ return max_t(int, 0, (int) ca->mi.bucket_size - -+ bucket_sectors_used(m)); -+ return 0; -+} -+ -+static inline int bucket_stripe_sectors(struct bucket_mark m) -+{ -+ return m.stripe ? m.dirty_sectors : 0; -+} -+ -+static inline enum bch_data_type bucket_type(struct bucket_mark m) -+{ -+ return m.cached_sectors && !m.dirty_sectors -+ ? BCH_DATA_cached -+ : m.data_type; -+} -+ -+static bool bucket_became_unavailable(struct bucket_mark old, -+ struct bucket_mark new) -+{ -+ return is_available_bucket(old) && -+ !is_available_bucket(new); -+} -+ -+int bch2_fs_usage_apply(struct bch_fs *c, -+ struct bch_fs_usage *fs_usage, -+ struct disk_reservation *disk_res, -+ unsigned journal_seq) -+{ -+ s64 added = fs_usage->data + fs_usage->reserved; -+ s64 should_not_have_added; -+ int ret = 0; -+ -+ percpu_rwsem_assert_held(&c->mark_lock); -+ -+ /* -+ * Not allowed to reduce sectors_available except by getting a -+ * reservation: -+ */ -+ should_not_have_added = added - (s64) (disk_res ? 
disk_res->sectors : 0); -+ if (WARN_ONCE(should_not_have_added > 0, -+ "disk usage increased by %lli without a reservation", -+ should_not_have_added)) { -+ atomic64_sub(should_not_have_added, &c->sectors_available); -+ added -= should_not_have_added; -+ ret = -1; -+ } -+ -+ if (added > 0) { -+ disk_res->sectors -= added; -+ fs_usage->online_reserved -= added; -+ } -+ -+ preempt_disable(); -+ acc_u64s((u64 *) fs_usage_ptr(c, journal_seq, false), -+ (u64 *) fs_usage, fs_usage_u64s(c)); -+ preempt_enable(); -+ -+ return ret; -+} -+ -+static inline void account_bucket(struct bch_fs_usage *fs_usage, -+ struct bch_dev_usage *dev_usage, -+ enum bch_data_type type, -+ int nr, s64 size) -+{ -+ if (type == BCH_DATA_sb || type == BCH_DATA_journal) -+ fs_usage->hidden += size; -+ -+ dev_usage->buckets[type] += nr; -+} -+ -+static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, -+ struct bch_fs_usage *fs_usage, -+ struct bucket_mark old, struct bucket_mark new, -+ bool gc) -+{ -+ struct bch_dev_usage *u; -+ -+ percpu_rwsem_assert_held(&c->mark_lock); -+ -+ preempt_disable(); -+ u = this_cpu_ptr(ca->usage[gc]); -+ -+ if (bucket_type(old)) -+ account_bucket(fs_usage, u, bucket_type(old), -+ -1, -ca->mi.bucket_size); -+ -+ if (bucket_type(new)) -+ account_bucket(fs_usage, u, bucket_type(new), -+ 1, ca->mi.bucket_size); -+ -+ u->buckets_alloc += -+ (int) new.owned_by_allocator - (int) old.owned_by_allocator; -+ u->buckets_unavailable += -+ is_unavailable_bucket(new) - is_unavailable_bucket(old); -+ -+ u->buckets_ec += (int) new.stripe - (int) old.stripe; -+ u->sectors_ec += bucket_stripe_sectors(new) - -+ bucket_stripe_sectors(old); -+ -+ u->sectors[old.data_type] -= old.dirty_sectors; -+ u->sectors[new.data_type] += new.dirty_sectors; -+ u->sectors[BCH_DATA_cached] += -+ (int) new.cached_sectors - (int) old.cached_sectors; -+ u->sectors_fragmented += -+ is_fragmented_bucket(new, ca) - is_fragmented_bucket(old, ca); -+ preempt_enable(); -+ -+ if 
(!is_available_bucket(old) && is_available_bucket(new)) -+ bch2_wake_allocator(ca); -+} -+ -+__flatten -+void bch2_dev_usage_from_buckets(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ struct bucket_mark old = { .v.counter = 0 }; -+ struct bucket_array *buckets; -+ struct bucket *g; -+ unsigned i; -+ int cpu; -+ -+ c->usage_base->hidden = 0; -+ -+ for_each_member_device(ca, c, i) { -+ for_each_possible_cpu(cpu) -+ memset(per_cpu_ptr(ca->usage[0], cpu), 0, -+ sizeof(*ca->usage[0])); -+ -+ buckets = bucket_array(ca); -+ -+ for_each_bucket(g, buckets) -+ bch2_dev_usage_update(c, ca, c->usage_base, -+ old, g->mark, false); -+ } -+} -+ -+static inline int update_replicas(struct bch_fs *c, -+ struct bch_fs_usage *fs_usage, -+ struct bch_replicas_entry *r, -+ s64 sectors) -+{ -+ int idx = bch2_replicas_entry_idx(c, r); -+ -+ if (idx < 0) -+ return -1; -+ -+ if (!fs_usage) -+ return 0; -+ -+ switch (r->data_type) { -+ case BCH_DATA_btree: -+ fs_usage->btree += sectors; -+ break; -+ case BCH_DATA_user: -+ fs_usage->data += sectors; -+ break; -+ case BCH_DATA_cached: -+ fs_usage->cached += sectors; -+ break; -+ } -+ fs_usage->replicas[idx] += sectors; -+ return 0; -+} -+ -+static inline void update_cached_sectors(struct bch_fs *c, -+ struct bch_fs_usage *fs_usage, -+ unsigned dev, s64 sectors) -+{ -+ struct bch_replicas_padded r; -+ -+ bch2_replicas_entry_cached(&r.e, dev); -+ -+ update_replicas(c, fs_usage, &r.e, sectors); -+} -+ -+static struct replicas_delta_list * -+replicas_deltas_realloc(struct btree_trans *trans, unsigned more) -+{ -+ struct replicas_delta_list *d = trans->fs_usage_deltas; -+ unsigned new_size = d ? 
(d->size + more) * 2 : 128; -+ -+ if (!d || d->used + more > d->size) { -+ d = krealloc(d, sizeof(*d) + new_size, GFP_NOIO|__GFP_ZERO); -+ BUG_ON(!d); -+ -+ d->size = new_size; -+ trans->fs_usage_deltas = d; -+ } -+ return d; -+} -+ -+static inline void update_replicas_list(struct btree_trans *trans, -+ struct bch_replicas_entry *r, -+ s64 sectors) -+{ -+ struct replicas_delta_list *d; -+ struct replicas_delta *n; -+ unsigned b; -+ -+ if (!sectors) -+ return; -+ -+ b = replicas_entry_bytes(r) + 8; -+ d = replicas_deltas_realloc(trans, b); -+ -+ n = (void *) d->d + d->used; -+ n->delta = sectors; -+ memcpy(&n->r, r, replicas_entry_bytes(r)); -+ d->used += b; -+} -+ -+static inline void update_cached_sectors_list(struct btree_trans *trans, -+ unsigned dev, s64 sectors) -+{ -+ struct bch_replicas_padded r; -+ -+ bch2_replicas_entry_cached(&r.e, dev); -+ -+ update_replicas_list(trans, &r.e, sectors); -+} -+ -+static inline struct replicas_delta * -+replicas_delta_next(struct replicas_delta *d) -+{ -+ return (void *) d + replicas_entry_bytes(&d->r) + 8; -+} -+ -+int bch2_replicas_delta_list_apply(struct bch_fs *c, -+ struct bch_fs_usage *fs_usage, -+ struct replicas_delta_list *r) -+{ -+ struct replicas_delta *d = r->d; -+ struct replicas_delta *top = (void *) r->d + r->used; -+ unsigned i; -+ -+ for (d = r->d; d != top; d = replicas_delta_next(d)) -+ if (update_replicas(c, fs_usage, &d->r, d->delta)) { -+ top = d; -+ goto unwind; -+ } -+ -+ if (!fs_usage) -+ return 0; -+ -+ fs_usage->nr_inodes += r->nr_inodes; -+ -+ for (i = 0; i < BCH_REPLICAS_MAX; i++) { -+ fs_usage->reserved += r->persistent_reserved[i]; -+ fs_usage->persistent_reserved[i] += r->persistent_reserved[i]; -+ } -+ -+ return 0; -+unwind: -+ for (d = r->d; d != top; d = replicas_delta_next(d)) -+ update_replicas(c, fs_usage, &d->r, -d->delta); -+ return -1; -+} -+ -+#define do_mark_fn(fn, c, pos, flags, ...) 
\ -+({ \ -+ int gc, ret = 0; \ -+ \ -+ percpu_rwsem_assert_held(&c->mark_lock); \ -+ \ -+ for (gc = 0; gc < 2 && !ret; gc++) \ -+ if (!gc == !(flags & BTREE_TRIGGER_GC) || \ -+ (gc && gc_visited(c, pos))) \ -+ ret = fn(c, __VA_ARGS__, gc); \ -+ ret; \ -+}) -+ -+static int __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, -+ size_t b, struct bucket_mark *ret, -+ bool gc) -+{ -+ struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc); -+ struct bucket *g = __bucket(ca, b, gc); -+ struct bucket_mark old, new; -+ -+ old = bucket_cmpxchg(g, new, ({ -+ BUG_ON(!is_available_bucket(new)); -+ -+ new.owned_by_allocator = true; -+ new.data_type = 0; -+ new.cached_sectors = 0; -+ new.dirty_sectors = 0; -+ new.gen++; -+ })); -+ -+ bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); -+ -+ if (old.cached_sectors) -+ update_cached_sectors(c, fs_usage, ca->dev_idx, -+ -((s64) old.cached_sectors)); -+ -+ if (!gc) -+ *ret = old; -+ return 0; -+} -+ -+void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, -+ size_t b, struct bucket_mark *old) -+{ -+ do_mark_fn(__bch2_invalidate_bucket, c, gc_phase(GC_PHASE_START), 0, -+ ca, b, old); -+ -+ if (!old->owned_by_allocator && old->cached_sectors) -+ trace_invalidate(ca, bucket_to_sector(ca, b), -+ old->cached_sectors); -+} -+ -+static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, -+ size_t b, bool owned_by_allocator, -+ bool gc) -+{ -+ struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc); -+ struct bucket *g = __bucket(ca, b, gc); -+ struct bucket_mark old, new; -+ -+ old = bucket_cmpxchg(g, new, ({ -+ new.owned_by_allocator = owned_by_allocator; -+ })); -+ -+ bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); -+ -+ BUG_ON(!gc && -+ !owned_by_allocator && !old.owned_by_allocator); -+ -+ return 0; -+} -+ -+void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, -+ size_t b, bool owned_by_allocator, -+ struct gc_pos pos, unsigned flags) -+{ -+ preempt_disable(); -+ -+ 
do_mark_fn(__bch2_mark_alloc_bucket, c, pos, flags, -+ ca, b, owned_by_allocator); -+ -+ preempt_enable(); -+} -+ -+static int bch2_mark_alloc(struct bch_fs *c, -+ struct bkey_s_c old, struct bkey_s_c new, -+ struct bch_fs_usage *fs_usage, -+ u64 journal_seq, unsigned flags) -+{ -+ bool gc = flags & BTREE_TRIGGER_GC; -+ struct bkey_alloc_unpacked u; -+ struct bch_dev *ca; -+ struct bucket *g; -+ struct bucket_mark old_m, m; -+ -+ /* We don't do anything for deletions - do we?: */ -+ if (new.k->type != KEY_TYPE_alloc) -+ return 0; -+ -+ /* -+ * alloc btree is read in by bch2_alloc_read, not gc: -+ */ -+ if ((flags & BTREE_TRIGGER_GC) && -+ !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) -+ return 0; -+ -+ ca = bch_dev_bkey_exists(c, new.k->p.inode); -+ -+ if (new.k->p.offset >= ca->mi.nbuckets) -+ return 0; -+ -+ g = __bucket(ca, new.k->p.offset, gc); -+ u = bch2_alloc_unpack(new); -+ -+ old_m = bucket_cmpxchg(g, m, ({ -+ m.gen = u.gen; -+ m.data_type = u.data_type; -+ m.dirty_sectors = u.dirty_sectors; -+ m.cached_sectors = u.cached_sectors; -+ -+ if (journal_seq) { -+ m.journal_seq_valid = 1; -+ m.journal_seq = journal_seq; -+ } -+ })); -+ -+ bch2_dev_usage_update(c, ca, fs_usage, old_m, m, gc); -+ -+ g->io_time[READ] = u.read_time; -+ g->io_time[WRITE] = u.write_time; -+ g->oldest_gen = u.oldest_gen; -+ g->gen_valid = 1; -+ -+ /* -+ * need to know if we're getting called from the invalidate path or -+ * not: -+ */ -+ -+ if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && -+ old_m.cached_sectors) { -+ update_cached_sectors(c, fs_usage, ca->dev_idx, -+ -old_m.cached_sectors); -+ trace_invalidate(ca, bucket_to_sector(ca, new.k->p.offset), -+ old_m.cached_sectors); -+ } -+ -+ return 0; -+} -+ -+#define checked_add(a, b) \ -+({ \ -+ unsigned _res = (unsigned) (a) + (b); \ -+ bool overflow = _res > U16_MAX; \ -+ if (overflow) \ -+ _res = U16_MAX; \ -+ (a) = _res; \ -+ overflow; \ -+}) -+ -+static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, -+ size_t 
b, enum bch_data_type data_type, -+ unsigned sectors, bool gc) -+{ -+ struct bucket *g = __bucket(ca, b, gc); -+ struct bucket_mark old, new; -+ bool overflow; -+ -+ BUG_ON(data_type != BCH_DATA_sb && -+ data_type != BCH_DATA_journal); -+ -+ old = bucket_cmpxchg(g, new, ({ -+ new.data_type = data_type; -+ overflow = checked_add(new.dirty_sectors, sectors); -+ })); -+ -+ bch2_fs_inconsistent_on(old.data_type && -+ old.data_type != data_type, c, -+ "different types of data in same bucket: %s, %s", -+ bch2_data_types[old.data_type], -+ bch2_data_types[data_type]); -+ -+ bch2_fs_inconsistent_on(overflow, c, -+ "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > U16_MAX", -+ ca->dev_idx, b, new.gen, -+ bch2_data_types[old.data_type ?: data_type], -+ old.dirty_sectors, sectors); -+ -+ if (c) -+ bch2_dev_usage_update(c, ca, fs_usage_ptr(c, 0, gc), -+ old, new, gc); -+ -+ return 0; -+} -+ -+void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, -+ size_t b, enum bch_data_type type, -+ unsigned sectors, struct gc_pos pos, -+ unsigned flags) -+{ -+ BUG_ON(type != BCH_DATA_sb && -+ type != BCH_DATA_journal); -+ -+ preempt_disable(); -+ -+ if (likely(c)) { -+ do_mark_fn(__bch2_mark_metadata_bucket, c, pos, flags, -+ ca, b, type, sectors); -+ } else { -+ __bch2_mark_metadata_bucket(c, ca, b, type, sectors, 0); -+ } -+ -+ preempt_enable(); -+} -+ -+static s64 disk_sectors_scaled(unsigned n, unsigned d, unsigned sectors) -+{ -+ return DIV_ROUND_UP(sectors * n, d); -+} -+ -+static s64 __ptr_disk_sectors_delta(unsigned old_size, -+ unsigned offset, s64 delta, -+ unsigned flags, -+ unsigned n, unsigned d) -+{ -+ BUG_ON(!n || !d); -+ -+ if (flags & BTREE_TRIGGER_OVERWRITE_SPLIT) { -+ BUG_ON(offset + -delta > old_size); -+ -+ return -disk_sectors_scaled(n, d, old_size) + -+ disk_sectors_scaled(n, d, offset) + -+ disk_sectors_scaled(n, d, old_size - offset + delta); -+ } else if (flags & BTREE_TRIGGER_OVERWRITE) { -+ BUG_ON(offset + -delta > old_size); -+ 
-+ return -disk_sectors_scaled(n, d, old_size) + -+ disk_sectors_scaled(n, d, old_size + delta); -+ } else { -+ return disk_sectors_scaled(n, d, delta); -+ } -+} -+ -+static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p, -+ unsigned offset, s64 delta, -+ unsigned flags) -+{ -+ return __ptr_disk_sectors_delta(p.crc.live_size, -+ offset, delta, flags, -+ p.crc.compressed_size, -+ p.crc.uncompressed_size); -+} -+ -+static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k, -+ const struct bch_extent_ptr *ptr, -+ s64 sectors, enum bch_data_type ptr_data_type, -+ u8 bucket_gen, u8 bucket_data_type, -+ u16 dirty_sectors, u16 cached_sectors) -+{ -+ size_t bucket_nr = PTR_BUCKET_NR(bch_dev_bkey_exists(c, ptr->dev), ptr); -+ u16 bucket_sectors = !ptr->cached -+ ? dirty_sectors -+ : cached_sectors; -+ char buf[200]; -+ -+ if (gen_after(ptr->gen, bucket_gen)) { -+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, -+ "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" -+ "while marking %s", -+ ptr->dev, bucket_nr, bucket_gen, -+ bch2_data_types[bucket_data_type ?: ptr_data_type], -+ ptr->gen, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); -+ return -EIO; -+ } -+ -+ if (gen_cmp(bucket_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { -+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, -+ "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" -+ "while marking %s", -+ ptr->dev, bucket_nr, bucket_gen, -+ bch2_data_types[bucket_data_type ?: ptr_data_type], -+ ptr->gen, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); -+ return -EIO; -+ } -+ -+ if (bucket_gen != ptr->gen && !ptr->cached) { -+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, -+ "bucket %u:%zu gen %u data type %s: stale dirty ptr (gen %u)\n" -+ "while marking %s", -+ ptr->dev, bucket_nr, bucket_gen, -+ bch2_data_types[bucket_data_type ?: ptr_data_type], -+ ptr->gen, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); -+ return -EIO; -+ } -+ -+ if (bucket_gen != ptr->gen) -+ return 
1; -+ -+ if (bucket_data_type && ptr_data_type && -+ bucket_data_type != ptr_data_type) { -+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, -+ "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" -+ "while marking %s", -+ ptr->dev, bucket_nr, bucket_gen, -+ bch2_data_types[bucket_data_type], -+ bch2_data_types[ptr_data_type], -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); -+ return -EIO; -+ } -+ -+ if ((unsigned) (bucket_sectors + sectors) > U16_MAX) { -+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, -+ "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n" -+ "while marking %s", -+ ptr->dev, bucket_nr, bucket_gen, -+ bch2_data_types[bucket_data_type ?: ptr_data_type], -+ bucket_sectors, sectors, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); -+ return -EIO; -+ } -+ -+ return 0; -+} -+ -+static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k, -+ const struct bch_extent_ptr *ptr, -+ struct bch_fs_usage *fs_usage, -+ u64 journal_seq, -+ unsigned flags, -+ bool enabled) -+{ -+ bool gc = flags & BTREE_TRIGGER_GC; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ struct bucket *g = PTR_BUCKET(ca, ptr, gc); -+ struct bucket_mark new, old; -+ char buf[200]; -+ int ret; -+ -+ old = bucket_cmpxchg(g, new, ({ -+ ret = check_bucket_ref(c, k, ptr, 0, 0, new.gen, new.data_type, -+ new.dirty_sectors, new.cached_sectors); -+ if (ret) -+ return ret; -+ -+ if (new.stripe && enabled) -+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, -+ "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", -+ ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); -+ -+ if (!new.stripe && !enabled) -+ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, -+ "bucket %u:%zu gen %u: deleting stripe but not marked\n%s", -+ ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); -+ -+ new.stripe = enabled; -+ if (journal_seq) { -+ 
new.journal_seq_valid = 1; -+ new.journal_seq = journal_seq; -+ } -+ })); -+ -+ bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); -+ return 0; -+} -+ -+static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k, -+ const struct bch_extent_ptr *ptr, -+ s64 sectors, enum bch_data_type ptr_data_type, -+ u8 bucket_gen, u8 *bucket_data_type, -+ u16 *dirty_sectors, u16 *cached_sectors) -+{ -+ u16 *dst_sectors = !ptr->cached -+ ? dirty_sectors -+ : cached_sectors; -+ int ret = check_bucket_ref(c, k, ptr, sectors, ptr_data_type, -+ bucket_gen, *bucket_data_type, -+ *dirty_sectors, *cached_sectors); -+ -+ if (ret) -+ return ret; -+ -+ *dst_sectors += sectors; -+ *bucket_data_type = *dirty_sectors || *cached_sectors -+ ? ptr_data_type : 0; -+ return 0; -+} -+ -+static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k, -+ struct extent_ptr_decoded p, -+ s64 sectors, enum bch_data_type data_type, -+ struct bch_fs_usage *fs_usage, -+ u64 journal_seq, unsigned flags) -+{ -+ bool gc = flags & BTREE_TRIGGER_GC; -+ struct bucket_mark old, new; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); -+ struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc); -+ u8 bucket_data_type; -+ u64 v; -+ int ret; -+ -+ v = atomic64_read(&g->_mark.v); -+ do { -+ new.v.counter = old.v.counter = v; -+ bucket_data_type = new.data_type; -+ -+ ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, new.gen, -+ &bucket_data_type, -+ &new.dirty_sectors, -+ &new.cached_sectors); -+ if (ret) -+ return ret; -+ -+ new.data_type = bucket_data_type; -+ -+ if (journal_seq) { -+ new.journal_seq_valid = 1; -+ new.journal_seq = journal_seq; -+ } -+ -+ if (flags & BTREE_TRIGGER_NOATOMIC) { -+ g->_mark = new; -+ break; -+ } -+ } while ((v = atomic64_cmpxchg(&g->_mark.v, -+ old.v.counter, -+ new.v.counter)) != old.v.counter); -+ -+ bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); -+ -+ BUG_ON(!gc && bucket_became_unavailable(old, new)); -+ -+ return 0; -+} -+ -+static int bch2_mark_stripe_ptr(struct 
bch_fs *c, -+ struct bch_extent_stripe_ptr p, -+ enum bch_data_type data_type, -+ struct bch_fs_usage *fs_usage, -+ s64 sectors, unsigned flags, -+ struct bch_replicas_padded *r, -+ unsigned *nr_data, -+ unsigned *nr_parity) -+{ -+ bool gc = flags & BTREE_TRIGGER_GC; -+ struct stripe *m; -+ unsigned i, blocks_nonempty = 0; -+ -+ m = genradix_ptr(&c->stripes[gc], p.idx); -+ -+ spin_lock(&c->ec_stripes_heap_lock); -+ -+ if (!m || !m->alive) { -+ spin_unlock(&c->ec_stripes_heap_lock); -+ bch_err_ratelimited(c, "pointer to nonexistent stripe %llu", -+ (u64) p.idx); -+ return -EIO; -+ } -+ -+ BUG_ON(m->r.e.data_type != data_type); -+ -+ *nr_data = m->nr_blocks - m->nr_redundant; -+ *nr_parity = m->nr_redundant; -+ *r = m->r; -+ -+ m->block_sectors[p.block] += sectors; -+ -+ for (i = 0; i < m->nr_blocks; i++) -+ blocks_nonempty += m->block_sectors[i] != 0; -+ -+ if (m->blocks_nonempty != blocks_nonempty) { -+ m->blocks_nonempty = blocks_nonempty; -+ if (!gc) -+ bch2_stripes_heap_update(c, m, p.idx); -+ } -+ -+ spin_unlock(&c->ec_stripes_heap_lock); -+ -+ return 0; -+} -+ -+static int bch2_mark_extent(struct bch_fs *c, -+ struct bkey_s_c old, struct bkey_s_c new, -+ unsigned offset, s64 sectors, -+ enum bch_data_type data_type, -+ struct bch_fs_usage *fs_usage, -+ unsigned journal_seq, unsigned flags) -+{ -+ struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ struct bch_replicas_padded r; -+ s64 dirty_sectors = 0; -+ bool stale; -+ int ret; -+ -+ r.e.data_type = data_type; -+ r.e.nr_devs = 0; -+ r.e.nr_required = 1; -+ -+ BUG_ON(!sectors); -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ s64 disk_sectors = data_type == BCH_DATA_btree -+ ? 
sectors -+ : ptr_disk_sectors_delta(p, offset, sectors, flags); -+ -+ ret = bch2_mark_pointer(c, k, p, disk_sectors, data_type, -+ fs_usage, journal_seq, flags); -+ if (ret < 0) -+ return ret; -+ -+ stale = ret > 0; -+ -+ if (p.ptr.cached) { -+ if (!stale) -+ update_cached_sectors(c, fs_usage, p.ptr.dev, -+ disk_sectors); -+ } else if (!p.has_ec) { -+ dirty_sectors += disk_sectors; -+ r.e.devs[r.e.nr_devs++] = p.ptr.dev; -+ } else { -+ struct bch_replicas_padded ec_r; -+ unsigned nr_data, nr_parity; -+ s64 parity_sectors; -+ -+ ret = bch2_mark_stripe_ptr(c, p.ec, data_type, -+ fs_usage, disk_sectors, flags, -+ &ec_r, &nr_data, &nr_parity); -+ if (ret) -+ return ret; -+ -+ parity_sectors = -+ __ptr_disk_sectors_delta(p.crc.live_size, -+ offset, sectors, flags, -+ p.crc.compressed_size * nr_parity, -+ p.crc.uncompressed_size * nr_data); -+ -+ update_replicas(c, fs_usage, &ec_r.e, -+ disk_sectors + parity_sectors); -+ -+ /* -+ * There may be other dirty pointers in this extent, but -+ * if so they're not required for mounting if we have an -+ * erasure coded pointer in this extent: -+ */ -+ r.e.nr_required = 0; -+ } -+ } -+ -+ if (r.e.nr_devs) -+ update_replicas(c, fs_usage, &r.e, dirty_sectors); -+ -+ return 0; -+} -+ -+static int bch2_mark_stripe(struct bch_fs *c, -+ struct bkey_s_c old, struct bkey_s_c new, -+ struct bch_fs_usage *fs_usage, -+ u64 journal_seq, unsigned flags) -+{ -+ bool gc = flags & BTREE_TRIGGER_GC; -+ size_t idx = new.k->p.offset; -+ const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe -+ ? bkey_s_c_to_stripe(old).v : NULL; -+ const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe -+ ? 
bkey_s_c_to_stripe(new).v : NULL; -+ struct stripe *m = genradix_ptr(&c->stripes[gc], idx); -+ unsigned i; -+ int ret; -+ -+ if (!m || (old_s && !m->alive)) { -+ bch_err_ratelimited(c, "error marking nonexistent stripe %zu", -+ idx); -+ return -1; -+ } -+ -+ if (!new_s) { -+ /* Deleting: */ -+ for (i = 0; i < old_s->nr_blocks; i++) { -+ ret = bucket_set_stripe(c, old, old_s->ptrs + i, fs_usage, -+ journal_seq, flags, false); -+ if (ret) -+ return ret; -+ } -+ -+ if (!gc && m->on_heap) { -+ spin_lock(&c->ec_stripes_heap_lock); -+ bch2_stripes_heap_del(c, m, idx); -+ spin_unlock(&c->ec_stripes_heap_lock); -+ } -+ -+ memset(m, 0, sizeof(*m)); -+ } else { -+ BUG_ON(old_s && new_s->nr_blocks != old_s->nr_blocks); -+ BUG_ON(old_s && new_s->nr_redundant != old_s->nr_redundant); -+ -+ for (i = 0; i < new_s->nr_blocks; i++) { -+ if (!old_s || -+ memcmp(new_s->ptrs + i, -+ old_s->ptrs + i, -+ sizeof(struct bch_extent_ptr))) { -+ -+ if (old_s) { -+ bucket_set_stripe(c, old, old_s->ptrs + i, fs_usage, -+ journal_seq, flags, false); -+ if (ret) -+ return ret; -+ } -+ ret = bucket_set_stripe(c, new, new_s->ptrs + i, fs_usage, -+ journal_seq, flags, true); -+ if (ret) -+ return ret; -+ } -+ } -+ -+ m->alive = true; -+ m->sectors = le16_to_cpu(new_s->sectors); -+ m->algorithm = new_s->algorithm; -+ m->nr_blocks = new_s->nr_blocks; -+ m->nr_redundant = new_s->nr_redundant; -+ -+ bch2_bkey_to_replicas(&m->r.e, new); -+ -+ /* gc recalculates these fields: */ -+ if (!(flags & BTREE_TRIGGER_GC)) { -+ m->blocks_nonempty = 0; -+ -+ for (i = 0; i < new_s->nr_blocks; i++) { -+ m->block_sectors[i] = -+ stripe_blockcount_get(new_s, i); -+ m->blocks_nonempty += !!m->block_sectors[i]; -+ } -+ } -+ -+ if (!gc) { -+ spin_lock(&c->ec_stripes_heap_lock); -+ bch2_stripes_heap_update(c, m, idx); -+ spin_unlock(&c->ec_stripes_heap_lock); -+ } -+ } -+ -+ return 0; -+} -+ -+static int bch2_mark_key_locked(struct bch_fs *c, -+ struct bkey_s_c old, -+ struct bkey_s_c new, -+ unsigned offset, s64 sectors, 
-+ struct bch_fs_usage *fs_usage, -+ u64 journal_seq, unsigned flags) -+{ -+ struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; -+ int ret = 0; -+ -+ BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE))); -+ -+ preempt_disable(); -+ -+ if (!fs_usage || (flags & BTREE_TRIGGER_GC)) -+ fs_usage = fs_usage_ptr(c, journal_seq, -+ flags & BTREE_TRIGGER_GC); -+ -+ switch (k.k->type) { -+ case KEY_TYPE_alloc: -+ ret = bch2_mark_alloc(c, old, new, fs_usage, journal_seq, flags); -+ break; -+ case KEY_TYPE_btree_ptr: -+ case KEY_TYPE_btree_ptr_v2: -+ sectors = !(flags & BTREE_TRIGGER_OVERWRITE) -+ ? c->opts.btree_node_size -+ : -c->opts.btree_node_size; -+ -+ ret = bch2_mark_extent(c, old, new, offset, sectors, -+ BCH_DATA_btree, fs_usage, journal_seq, flags); -+ break; -+ case KEY_TYPE_extent: -+ case KEY_TYPE_reflink_v: -+ ret = bch2_mark_extent(c, old, new, offset, sectors, -+ BCH_DATA_user, fs_usage, journal_seq, flags); -+ break; -+ case KEY_TYPE_stripe: -+ ret = bch2_mark_stripe(c, old, new, fs_usage, journal_seq, flags); -+ break; -+ case KEY_TYPE_inode: -+ if (!(flags & BTREE_TRIGGER_OVERWRITE)) -+ fs_usage->nr_inodes++; -+ else -+ fs_usage->nr_inodes--; -+ break; -+ case KEY_TYPE_reservation: { -+ unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; -+ -+ sectors *= replicas; -+ replicas = clamp_t(unsigned, replicas, 1, -+ ARRAY_SIZE(fs_usage->persistent_reserved)); -+ -+ fs_usage->reserved += sectors; -+ fs_usage->persistent_reserved[replicas - 1] += sectors; -+ break; -+ } -+ } -+ -+ preempt_enable(); -+ -+ return ret; -+} -+ -+int bch2_mark_key(struct bch_fs *c, struct bkey_s_c new, -+ unsigned offset, s64 sectors, -+ struct bch_fs_usage *fs_usage, -+ u64 journal_seq, unsigned flags) -+{ -+ struct bkey deleted; -+ struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; -+ int ret; -+ -+ bkey_init(&deleted); -+ -+ percpu_down_read(&c->mark_lock); -+ ret = bch2_mark_key_locked(c, old, new, offset, sectors, -+ fs_usage, 
journal_seq, -+ BTREE_TRIGGER_INSERT|flags); -+ percpu_up_read(&c->mark_lock); -+ -+ return ret; -+} -+ -+int bch2_mark_update(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_i *new, -+ struct bch_fs_usage *fs_usage, -+ unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree *b = iter_l(iter)->b; -+ struct btree_node_iter node_iter = iter_l(iter)->iter; -+ struct bkey_packed *_old; -+ struct bkey_s_c old; -+ struct bkey unpacked; -+ int ret = 0; -+ -+ if (unlikely(flags & BTREE_TRIGGER_NORUN)) -+ return 0; -+ -+ if (!btree_node_type_needs_gc(iter->btree_id)) -+ return 0; -+ -+ bkey_init(&unpacked); -+ old = (struct bkey_s_c) { &unpacked, NULL }; -+ -+ if (!btree_node_type_is_extents(iter->btree_id)) { -+ if (btree_iter_type(iter) != BTREE_ITER_CACHED) { -+ _old = bch2_btree_node_iter_peek(&node_iter, b); -+ if (_old) -+ old = bkey_disassemble(b, _old, &unpacked); -+ } else { -+ struct bkey_cached *ck = (void *) iter->l[0].b; -+ -+ if (ck->valid) -+ old = bkey_i_to_s_c(ck->k); -+ } -+ -+ if (old.k->type == new->k.type) { -+ bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0, -+ fs_usage, trans->journal_res.seq, -+ BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); -+ -+ } else { -+ bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0, -+ fs_usage, trans->journal_res.seq, -+ BTREE_TRIGGER_INSERT|flags); -+ bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0, -+ fs_usage, trans->journal_res.seq, -+ BTREE_TRIGGER_OVERWRITE|flags); -+ } -+ } else { -+ BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED); -+ bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), -+ 0, new->k.size, -+ fs_usage, trans->journal_res.seq, -+ BTREE_TRIGGER_INSERT|flags); -+ -+ while ((_old = bch2_btree_node_iter_peek(&node_iter, b))) { -+ unsigned offset = 0; -+ s64 sectors; -+ -+ old = bkey_disassemble(b, _old, &unpacked); -+ sectors = -((s64) old.k->size); -+ -+ flags |= BTREE_TRIGGER_OVERWRITE; -+ -+ if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0) -+ 
return 0; -+ -+ switch (bch2_extent_overlap(&new->k, old.k)) { -+ case BCH_EXTENT_OVERLAP_ALL: -+ offset = 0; -+ sectors = -((s64) old.k->size); -+ break; -+ case BCH_EXTENT_OVERLAP_BACK: -+ offset = bkey_start_offset(&new->k) - -+ bkey_start_offset(old.k); -+ sectors = bkey_start_offset(&new->k) - -+ old.k->p.offset; -+ break; -+ case BCH_EXTENT_OVERLAP_FRONT: -+ offset = 0; -+ sectors = bkey_start_offset(old.k) - -+ new->k.p.offset; -+ break; -+ case BCH_EXTENT_OVERLAP_MIDDLE: -+ offset = bkey_start_offset(&new->k) - -+ bkey_start_offset(old.k); -+ sectors = -((s64) new->k.size); -+ flags |= BTREE_TRIGGER_OVERWRITE_SPLIT; -+ break; -+ } -+ -+ BUG_ON(sectors >= 0); -+ -+ ret = bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), -+ offset, sectors, fs_usage, -+ trans->journal_res.seq, flags) ?: 1; -+ if (ret <= 0) -+ break; -+ -+ bch2_btree_node_iter_advance(&node_iter, b); -+ } -+ } -+ -+ return ret; -+} -+ -+void bch2_trans_fs_usage_apply(struct btree_trans *trans, -+ struct bch_fs_usage *fs_usage) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_insert_entry *i; -+ static int warned_disk_usage = 0; -+ u64 disk_res_sectors = trans->disk_res ? 
trans->disk_res->sectors : 0; -+ char buf[200]; -+ -+ if (!bch2_fs_usage_apply(c, fs_usage, trans->disk_res, -+ trans->journal_res.seq) || -+ warned_disk_usage || -+ xchg(&warned_disk_usage, 1)) -+ return; -+ -+ bch_err(c, "disk usage increased more than %llu sectors reserved", -+ disk_res_sectors); -+ -+ trans_for_each_update(trans, i) { -+ pr_err("while inserting"); -+ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); -+ pr_err("%s", buf); -+ pr_err("overlapping with"); -+ -+ if (btree_iter_type(i->iter) != BTREE_ITER_CACHED) { -+ struct btree *b = iter_l(i->iter)->b; -+ struct btree_node_iter node_iter = iter_l(i->iter)->iter; -+ struct bkey_packed *_k; -+ -+ while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { -+ struct bkey unpacked; -+ struct bkey_s_c k; -+ -+ pr_info("_k %px format %u", _k, _k->format); -+ k = bkey_disassemble(b, _k, &unpacked); -+ -+ if (btree_node_is_extents(b) -+ ? bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) <= 0 -+ : bkey_cmp(i->k->k.p, k.k->p)) -+ break; -+ -+ bch2_bkey_val_to_text(&PBUF(buf), c, k); -+ pr_err("%s", buf); -+ -+ bch2_btree_node_iter_advance(&node_iter, b); -+ } -+ } else { -+ struct bkey_cached *ck = (void *) i->iter->l[0].b; -+ -+ if (ck->valid) { -+ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(ck->k)); -+ pr_err("%s", buf); -+ } -+ } -+ } -+} -+ -+/* trans_mark: */ -+ -+static struct btree_iter *trans_get_update(struct btree_trans *trans, -+ enum btree_id btree_id, struct bpos pos, -+ struct bkey_s_c *k) -+{ -+ struct btree_insert_entry *i; -+ -+ trans_for_each_update(trans, i) -+ if (i->iter->btree_id == btree_id && -+ (btree_node_type_is_extents(btree_id) -+ ? 
bkey_cmp(pos, bkey_start_pos(&i->k->k)) >= 0 && -+ bkey_cmp(pos, i->k->k.p) < 0 -+ : !bkey_cmp(pos, i->iter->pos))) { -+ *k = bkey_i_to_s_c(i->k); -+ return i->iter; -+ } -+ -+ return NULL; -+} -+ -+static int trans_get_key(struct btree_trans *trans, -+ enum btree_id btree_id, struct bpos pos, -+ struct btree_iter **iter, -+ struct bkey_s_c *k) -+{ -+ unsigned flags = btree_id != BTREE_ID_ALLOC -+ ? BTREE_ITER_SLOTS -+ : BTREE_ITER_CACHED; -+ int ret; -+ -+ *iter = trans_get_update(trans, btree_id, pos, k); -+ if (*iter) -+ return 1; -+ -+ *iter = bch2_trans_get_iter(trans, btree_id, pos, -+ flags|BTREE_ITER_INTENT); -+ if (IS_ERR(*iter)) -+ return PTR_ERR(*iter); -+ -+ *k = __bch2_btree_iter_peek(*iter, flags); -+ ret = bkey_err(*k); -+ if (ret) -+ bch2_trans_iter_put(trans, *iter); -+ return ret; -+} -+ -+static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter, -+ const struct bch_extent_ptr *ptr, -+ struct bkey_alloc_unpacked *u) -+{ -+ struct bch_fs *c = trans->c; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ struct bpos pos = POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)); -+ struct bucket *g; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ iter = trans_get_update(trans, BTREE_ID_ALLOC, pos, &k); -+ if (iter) { -+ *u = bch2_alloc_unpack(k); -+ } else { -+ iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, pos, -+ BTREE_ITER_CACHED| -+ BTREE_ITER_CACHED_NOFILL| -+ BTREE_ITER_INTENT); -+ if (IS_ERR(iter)) -+ return PTR_ERR(iter); -+ -+ ret = bch2_btree_iter_traverse(iter); -+ if (ret) { -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+ } -+ -+ percpu_down_read(&c->mark_lock); -+ g = bucket(ca, pos.offset); -+ *u = alloc_mem_to_key(g, READ_ONCE(g->mark)); -+ percpu_up_read(&c->mark_lock); -+ } -+ -+ *_iter = iter; -+ return 0; -+} -+ -+static int bch2_trans_mark_pointer(struct btree_trans *trans, -+ struct bkey_s_c k, struct extent_ptr_decoded p, -+ s64 sectors, enum bch_data_type data_type) -+{ 
-+ struct bch_fs *c = trans->c; -+ struct btree_iter *iter; -+ struct bkey_alloc_unpacked u; -+ struct bkey_i_alloc *a; -+ int ret; -+ -+ ret = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u); -+ if (ret) -+ return ret; -+ -+ ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, u.gen, &u.data_type, -+ &u.dirty_sectors, &u.cached_sectors); -+ if (ret) -+ goto out; -+ -+ a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); -+ ret = PTR_ERR_OR_ZERO(a); -+ if (ret) -+ goto out; -+ -+ bkey_alloc_init(&a->k_i); -+ a->k.p = iter->pos; -+ bch2_alloc_pack(a, u); -+ bch2_trans_update(trans, iter, &a->k_i, 0); -+out: -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, -+ struct bch_extent_stripe_ptr p, -+ s64 sectors, enum bch_data_type data_type, -+ struct bch_replicas_padded *r, -+ unsigned *nr_data, -+ unsigned *nr_parity) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct bkey_i_stripe *s; -+ int ret = 0; -+ -+ ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.idx), &iter, &k); -+ if (ret < 0) -+ return ret; -+ -+ if (k.k->type != KEY_TYPE_stripe) { -+ bch2_fs_inconsistent(c, -+ "pointer to nonexistent stripe %llu", -+ (u64) p.idx); -+ ret = -EIO; -+ goto out; -+ } -+ -+ s = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); -+ ret = PTR_ERR_OR_ZERO(s); -+ if (ret) -+ goto out; -+ -+ bkey_reassemble(&s->k_i, k); -+ -+ stripe_blockcount_set(&s->v, p.block, -+ stripe_blockcount_get(&s->v, p.block) + -+ sectors); -+ -+ *nr_data = s->v.nr_blocks - s->v.nr_redundant; -+ *nr_parity = s->v.nr_redundant; -+ bch2_bkey_to_replicas(&r->e, bkey_i_to_s_c(&s->k_i)); -+ bch2_trans_update(trans, iter, &s->k_i, 0); -+out: -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+static int bch2_trans_mark_extent(struct btree_trans *trans, -+ struct bkey_s_c k, unsigned offset, -+ s64 sectors, unsigned flags, -+ enum bch_data_type data_type) -+{ -+ struct 
bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ struct bch_replicas_padded r; -+ s64 dirty_sectors = 0; -+ bool stale; -+ int ret; -+ -+ r.e.data_type = data_type; -+ r.e.nr_devs = 0; -+ r.e.nr_required = 1; -+ -+ BUG_ON(!sectors); -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ s64 disk_sectors = data_type == BCH_DATA_btree -+ ? sectors -+ : ptr_disk_sectors_delta(p, offset, sectors, flags); -+ -+ ret = bch2_trans_mark_pointer(trans, k, p, disk_sectors, -+ data_type); -+ if (ret < 0) -+ return ret; -+ -+ stale = ret > 0; -+ -+ if (p.ptr.cached) { -+ if (!stale) -+ update_cached_sectors_list(trans, p.ptr.dev, -+ disk_sectors); -+ } else if (!p.has_ec) { -+ dirty_sectors += disk_sectors; -+ r.e.devs[r.e.nr_devs++] = p.ptr.dev; -+ } else { -+ struct bch_replicas_padded ec_r; -+ unsigned nr_data, nr_parity; -+ s64 parity_sectors; -+ -+ ret = bch2_trans_mark_stripe_ptr(trans, p.ec, -+ disk_sectors, data_type, -+ &ec_r, &nr_data, &nr_parity); -+ if (ret) -+ return ret; -+ -+ parity_sectors = -+ __ptr_disk_sectors_delta(p.crc.live_size, -+ offset, sectors, flags, -+ p.crc.compressed_size * nr_parity, -+ p.crc.uncompressed_size * nr_data); -+ -+ update_replicas_list(trans, &ec_r.e, -+ disk_sectors + parity_sectors); -+ -+ r.e.nr_required = 0; -+ } -+ } -+ -+ if (r.e.nr_devs) -+ update_replicas_list(trans, &r.e, dirty_sectors); -+ -+ return 0; -+} -+ -+static int bch2_trans_mark_stripe(struct btree_trans *trans, -+ struct bkey_s_c k) -+{ -+ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; -+ struct bkey_alloc_unpacked u; -+ struct bkey_i_alloc *a; -+ struct btree_iter *iter; -+ unsigned i; -+ int ret = 0; -+ -+ /* -+ * The allocator code doesn't necessarily update bucket gens in the -+ * btree when incrementing them, right before handing out new buckets - -+ * we just need to persist those updates here along with the new stripe: -+ */ -+ -+ for (i = 0; i < s->nr_blocks && !ret; i++) { -+ ret = 
bch2_trans_start_alloc_update(trans, &iter, -+ &s->ptrs[i], &u); -+ if (ret) -+ break; -+ -+ a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); -+ ret = PTR_ERR_OR_ZERO(a); -+ if (ret) -+ goto put_iter; -+ -+ bkey_alloc_init(&a->k_i); -+ a->k.p = iter->pos; -+ bch2_alloc_pack(a, u); -+ bch2_trans_update(trans, iter, &a->k_i, 0); -+put_iter: -+ bch2_trans_iter_put(trans, iter); -+ } -+ -+ return ret; -+} -+ -+static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, -+ struct bkey_s_c_reflink_p p, -+ u64 idx, unsigned sectors, -+ unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct bkey_i_reflink_v *r_v; -+ s64 ret; -+ -+ ret = trans_get_key(trans, BTREE_ID_REFLINK, -+ POS(0, idx), &iter, &k); -+ if (ret < 0) -+ return ret; -+ -+ if (k.k->type != KEY_TYPE_reflink_v) { -+ bch2_fs_inconsistent(c, -+ "%llu:%llu len %u points to nonexistent indirect extent %llu", -+ p.k->p.inode, p.k->p.offset, p.k->size, idx); -+ ret = -EIO; -+ goto err; -+ } -+ -+ if ((flags & BTREE_TRIGGER_OVERWRITE) && -+ (bkey_start_offset(k.k) < idx || -+ k.k->p.offset > idx + sectors)) -+ goto out; -+ -+ sectors = k.k->p.offset - idx; -+ -+ r_v = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); -+ ret = PTR_ERR_OR_ZERO(r_v); -+ if (ret) -+ goto err; -+ -+ bkey_reassemble(&r_v->k_i, k); -+ -+ le64_add_cpu(&r_v->v.refcount, -+ !(flags & BTREE_TRIGGER_OVERWRITE) ? 
1 : -1); -+ -+ if (!r_v->v.refcount) { -+ r_v->k.type = KEY_TYPE_deleted; -+ set_bkey_val_u64s(&r_v->k, 0); -+ } -+ -+ bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); -+ BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); -+ -+ bch2_trans_update(trans, iter, &r_v->k_i, 0); -+out: -+ ret = sectors; -+err: -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+static int bch2_trans_mark_reflink_p(struct btree_trans *trans, -+ struct bkey_s_c_reflink_p p, unsigned offset, -+ s64 sectors, unsigned flags) -+{ -+ u64 idx = le64_to_cpu(p.v->idx) + offset; -+ s64 ret = 0; -+ -+ sectors = abs(sectors); -+ BUG_ON(offset + sectors > p.k->size); -+ -+ while (sectors) { -+ ret = __bch2_trans_mark_reflink_p(trans, p, idx, sectors, flags); -+ if (ret < 0) -+ break; -+ -+ idx += ret; -+ sectors = max_t(s64, 0LL, sectors - ret); -+ ret = 0; -+ } -+ -+ return ret; -+} -+ -+int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, -+ unsigned offset, s64 sectors, unsigned flags) -+{ -+ struct replicas_delta_list *d; -+ struct bch_fs *c = trans->c; -+ -+ switch (k.k->type) { -+ case KEY_TYPE_btree_ptr: -+ case KEY_TYPE_btree_ptr_v2: -+ sectors = !(flags & BTREE_TRIGGER_OVERWRITE) -+ ? 
c->opts.btree_node_size -+ : -c->opts.btree_node_size; -+ -+ return bch2_trans_mark_extent(trans, k, offset, sectors, -+ flags, BCH_DATA_btree); -+ case KEY_TYPE_extent: -+ case KEY_TYPE_reflink_v: -+ return bch2_trans_mark_extent(trans, k, offset, sectors, -+ flags, BCH_DATA_user); -+ case KEY_TYPE_stripe: -+ return bch2_trans_mark_stripe(trans, k); -+ case KEY_TYPE_inode: -+ d = replicas_deltas_realloc(trans, 0); -+ -+ if (!(flags & BTREE_TRIGGER_OVERWRITE)) -+ d->nr_inodes++; -+ else -+ d->nr_inodes--; -+ return 0; -+ case KEY_TYPE_reservation: { -+ unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; -+ -+ d = replicas_deltas_realloc(trans, 0); -+ -+ sectors *= replicas; -+ replicas = clamp_t(unsigned, replicas, 1, -+ ARRAY_SIZE(d->persistent_reserved)); -+ -+ d->persistent_reserved[replicas - 1] += sectors; -+ return 0; -+ } -+ case KEY_TYPE_reflink_p: -+ return bch2_trans_mark_reflink_p(trans, -+ bkey_s_c_to_reflink_p(k), -+ offset, sectors, flags); -+ default: -+ return 0; -+ } -+} -+ -+int bch2_trans_mark_update(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_i *insert, -+ unsigned flags) -+{ -+ struct btree *b = iter_l(iter)->b; -+ struct btree_node_iter node_iter = iter_l(iter)->iter; -+ struct bkey_packed *_k; -+ int ret; -+ -+ if (unlikely(flags & BTREE_TRIGGER_NORUN)) -+ return 0; -+ -+ if (!btree_node_type_needs_gc(iter->btree_id)) -+ return 0; -+ -+ ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(insert), -+ 0, insert->k.size, BTREE_TRIGGER_INSERT); -+ if (ret) -+ return ret; -+ -+ if (btree_iter_type(iter) == BTREE_ITER_CACHED) { -+ struct bkey_cached *ck = (void *) iter->l[0].b; -+ -+ return bch2_trans_mark_key(trans, bkey_i_to_s_c(ck->k), -+ 0, 0, BTREE_TRIGGER_OVERWRITE); -+ } -+ -+ while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { -+ struct bkey unpacked; -+ struct bkey_s_c k; -+ unsigned offset = 0; -+ s64 sectors = 0; -+ unsigned flags = BTREE_TRIGGER_OVERWRITE; -+ -+ k = bkey_disassemble(b, _k, 
&unpacked); -+ -+ if (btree_node_is_extents(b) -+ ? bkey_cmp(insert->k.p, bkey_start_pos(k.k)) <= 0 -+ : bkey_cmp(insert->k.p, k.k->p)) -+ break; -+ -+ if (btree_node_is_extents(b)) { -+ switch (bch2_extent_overlap(&insert->k, k.k)) { -+ case BCH_EXTENT_OVERLAP_ALL: -+ offset = 0; -+ sectors = -((s64) k.k->size); -+ break; -+ case BCH_EXTENT_OVERLAP_BACK: -+ offset = bkey_start_offset(&insert->k) - -+ bkey_start_offset(k.k); -+ sectors = bkey_start_offset(&insert->k) - -+ k.k->p.offset; -+ break; -+ case BCH_EXTENT_OVERLAP_FRONT: -+ offset = 0; -+ sectors = bkey_start_offset(k.k) - -+ insert->k.p.offset; -+ break; -+ case BCH_EXTENT_OVERLAP_MIDDLE: -+ offset = bkey_start_offset(&insert->k) - -+ bkey_start_offset(k.k); -+ sectors = -((s64) insert->k.size); -+ flags |= BTREE_TRIGGER_OVERWRITE_SPLIT; -+ break; -+ } -+ -+ BUG_ON(sectors >= 0); -+ } -+ -+ ret = bch2_trans_mark_key(trans, k, offset, sectors, flags); -+ if (ret) -+ return ret; -+ -+ bch2_btree_node_iter_advance(&node_iter, b); -+ } -+ -+ return 0; -+} -+ -+/* Disk reservations: */ -+ -+static u64 bch2_recalc_sectors_available(struct bch_fs *c) -+{ -+ percpu_u64_set(&c->pcpu->sectors_available, 0); -+ -+ return avail_factor(__bch2_fs_usage_read_short(c).free); -+} -+ -+void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) -+{ -+ percpu_down_read(&c->mark_lock); -+ this_cpu_sub(c->usage[0]->online_reserved, -+ res->sectors); -+ percpu_up_read(&c->mark_lock); -+ -+ res->sectors = 0; -+} -+ -+#define SECTORS_CACHE 1024 -+ -+int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, -+ unsigned sectors, int flags) -+{ -+ struct bch_fs_pcpu *pcpu; -+ u64 old, v, get; -+ s64 sectors_available; -+ int ret; -+ -+ percpu_down_read(&c->mark_lock); -+ preempt_disable(); -+ pcpu = this_cpu_ptr(c->pcpu); -+ -+ if (sectors <= pcpu->sectors_available) -+ goto out; -+ -+ v = atomic64_read(&c->sectors_available); -+ do { -+ old = v; -+ get = min((u64) sectors + SECTORS_CACHE, 
old); -+ -+ if (get < sectors) { -+ preempt_enable(); -+ percpu_up_read(&c->mark_lock); -+ goto recalculate; -+ } -+ } while ((v = atomic64_cmpxchg(&c->sectors_available, -+ old, old - get)) != old); -+ -+ pcpu->sectors_available += get; -+ -+out: -+ pcpu->sectors_available -= sectors; -+ this_cpu_add(c->usage[0]->online_reserved, sectors); -+ res->sectors += sectors; -+ -+ preempt_enable(); -+ percpu_up_read(&c->mark_lock); -+ return 0; -+ -+recalculate: -+ percpu_down_write(&c->mark_lock); -+ -+ sectors_available = bch2_recalc_sectors_available(c); -+ -+ if (sectors <= sectors_available || -+ (flags & BCH_DISK_RESERVATION_NOFAIL)) { -+ atomic64_set(&c->sectors_available, -+ max_t(s64, 0, sectors_available - sectors)); -+ this_cpu_add(c->usage[0]->online_reserved, sectors); -+ res->sectors += sectors; -+ ret = 0; -+ } else { -+ atomic64_set(&c->sectors_available, sectors_available); -+ ret = -ENOSPC; -+ } -+ -+ percpu_up_write(&c->mark_lock); -+ -+ return ret; -+} -+ -+/* Startup/shutdown: */ -+ -+static void buckets_free_rcu(struct rcu_head *rcu) -+{ -+ struct bucket_array *buckets = -+ container_of(rcu, struct bucket_array, rcu); -+ -+ kvpfree(buckets, -+ sizeof(struct bucket_array) + -+ buckets->nbuckets * sizeof(struct bucket)); -+} -+ -+int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) -+{ -+ struct bucket_array *buckets = NULL, *old_buckets = NULL; -+ unsigned long *buckets_nouse = NULL; -+ alloc_fifo free[RESERVE_NR]; -+ alloc_fifo free_inc; -+ alloc_heap alloc_heap; -+ -+ size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, -+ ca->mi.bucket_size / c->opts.btree_node_size); -+ /* XXX: these should be tunable */ -+ size_t reserve_none = max_t(size_t, 1, nbuckets >> 9); -+ size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 7); -+ size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12), -+ btree_reserve * 2); -+ bool resize = ca->buckets[0] != NULL; -+ int ret = -ENOMEM; -+ unsigned i; -+ -+ memset(&free, 0, sizeof(free)); 
-+ memset(&free_inc, 0, sizeof(free_inc)); -+ memset(&alloc_heap, 0, sizeof(alloc_heap)); -+ -+ if (!(buckets = kvpmalloc(sizeof(struct bucket_array) + -+ nbuckets * sizeof(struct bucket), -+ GFP_KERNEL|__GFP_ZERO)) || -+ !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * -+ sizeof(unsigned long), -+ GFP_KERNEL|__GFP_ZERO)) || -+ !init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) || -+ !init_fifo(&free[RESERVE_MOVINGGC], -+ copygc_reserve, GFP_KERNEL) || -+ !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) || -+ !init_fifo(&free_inc, free_inc_nr, GFP_KERNEL) || -+ !init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL)) -+ goto err; -+ -+ buckets->first_bucket = ca->mi.first_bucket; -+ buckets->nbuckets = nbuckets; -+ -+ bch2_copygc_stop(c); -+ -+ if (resize) { -+ down_write(&c->gc_lock); -+ down_write(&ca->bucket_lock); -+ percpu_down_write(&c->mark_lock); -+ } -+ -+ old_buckets = bucket_array(ca); -+ -+ if (resize) { -+ size_t n = min(buckets->nbuckets, old_buckets->nbuckets); -+ -+ memcpy(buckets->b, -+ old_buckets->b, -+ n * sizeof(struct bucket)); -+ memcpy(buckets_nouse, -+ ca->buckets_nouse, -+ BITS_TO_LONGS(n) * sizeof(unsigned long)); -+ } -+ -+ rcu_assign_pointer(ca->buckets[0], buckets); -+ buckets = old_buckets; -+ -+ swap(ca->buckets_nouse, buckets_nouse); -+ -+ if (resize) { -+ percpu_up_write(&c->mark_lock); -+ up_write(&c->gc_lock); -+ } -+ -+ spin_lock(&c->freelist_lock); -+ for (i = 0; i < RESERVE_NR; i++) { -+ fifo_move(&free[i], &ca->free[i]); -+ swap(ca->free[i], free[i]); -+ } -+ fifo_move(&free_inc, &ca->free_inc); -+ swap(ca->free_inc, free_inc); -+ spin_unlock(&c->freelist_lock); -+ -+ /* with gc lock held, alloc_heap can't be in use: */ -+ swap(ca->alloc_heap, alloc_heap); -+ -+ nbuckets = ca->mi.nbuckets; -+ -+ if (resize) -+ up_write(&ca->bucket_lock); -+ -+ ret = 0; -+err: -+ free_heap(&alloc_heap); -+ free_fifo(&free_inc); -+ for (i = 0; i < RESERVE_NR; i++) -+ free_fifo(&free[i]); -+ kvpfree(buckets_nouse, 
-+ BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); -+ if (buckets) -+ call_rcu(&old_buckets->rcu, buckets_free_rcu); -+ -+ return ret; -+} -+ -+void bch2_dev_buckets_free(struct bch_dev *ca) -+{ -+ unsigned i; -+ -+ free_heap(&ca->alloc_heap); -+ free_fifo(&ca->free_inc); -+ for (i = 0; i < RESERVE_NR; i++) -+ free_fifo(&ca->free[i]); -+ kvpfree(ca->buckets_nouse, -+ BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); -+ kvpfree(rcu_dereference_protected(ca->buckets[0], 1), -+ sizeof(struct bucket_array) + -+ ca->mi.nbuckets * sizeof(struct bucket)); -+ -+ free_percpu(ca->usage[0]); -+} -+ -+int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) -+{ -+ if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage))) -+ return -ENOMEM; -+ -+ return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);; -+} -diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h -new file mode 100644 -index 000000000000..a3873becbb70 ---- /dev/null -+++ b/fs/bcachefs/buckets.h -@@ -0,0 +1,318 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Code for manipulating bucket marks for garbage collection. -+ * -+ * Copyright 2014 Datera, Inc. 
-+ */ -+ -+#ifndef _BUCKETS_H -+#define _BUCKETS_H -+ -+#include "buckets_types.h" -+#include "super.h" -+ -+#define for_each_bucket(_b, _buckets) \ -+ for (_b = (_buckets)->b + (_buckets)->first_bucket; \ -+ _b < (_buckets)->b + (_buckets)->nbuckets; _b++) -+ -+#define bucket_cmpxchg(g, new, expr) \ -+({ \ -+ struct bucket *_g = g; \ -+ u64 _v = atomic64_read(&(g)->_mark.v); \ -+ struct bucket_mark _old; \ -+ \ -+ do { \ -+ (new).v.counter = _old.v.counter = _v; \ -+ expr; \ -+ } while ((_v = atomic64_cmpxchg(&(_g)->_mark.v, \ -+ _old.v.counter, \ -+ (new).v.counter)) != _old.v.counter);\ -+ _old; \ -+}) -+ -+static inline struct bucket_array *__bucket_array(struct bch_dev *ca, -+ bool gc) -+{ -+ return rcu_dereference_check(ca->buckets[gc], -+ !ca->fs || -+ percpu_rwsem_is_held(&ca->fs->mark_lock) || -+ lockdep_is_held(&ca->fs->gc_lock) || -+ lockdep_is_held(&ca->bucket_lock)); -+} -+ -+static inline struct bucket_array *bucket_array(struct bch_dev *ca) -+{ -+ return __bucket_array(ca, false); -+} -+ -+static inline struct bucket *__bucket(struct bch_dev *ca, size_t b, bool gc) -+{ -+ struct bucket_array *buckets = __bucket_array(ca, gc); -+ -+ BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets); -+ return buckets->b + b; -+} -+ -+static inline struct bucket *bucket(struct bch_dev *ca, size_t b) -+{ -+ return __bucket(ca, b, false); -+} -+ -+static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw) -+{ -+ return c->bucket_clock[rw].hand - g->io_time[rw]; -+} -+ -+/* -+ * bucket_gc_gen() returns the difference between the bucket's current gen and -+ * the oldest gen of any pointer into that bucket in the btree. 
-+ */ -+ -+static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b) -+{ -+ struct bucket *g = bucket(ca, b); -+ -+ return g->mark.gen - g->oldest_gen; -+} -+ -+static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, -+ const struct bch_extent_ptr *ptr) -+{ -+ return sector_to_bucket(ca, ptr->offset); -+} -+ -+static inline struct bucket *PTR_BUCKET(struct bch_dev *ca, -+ const struct bch_extent_ptr *ptr, -+ bool gc) -+{ -+ return __bucket(ca, PTR_BUCKET_NR(ca, ptr), gc); -+} -+ -+static inline enum bch_data_type ptr_data_type(const struct bkey *k, -+ const struct bch_extent_ptr *ptr) -+{ -+ if (k->type == KEY_TYPE_btree_ptr || -+ k->type == KEY_TYPE_btree_ptr_v2) -+ return BCH_DATA_btree; -+ -+ return ptr->cached ? BCH_DATA_cached : BCH_DATA_user; -+} -+ -+static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca, -+ const struct bch_extent_ptr *ptr) -+{ -+ struct bucket_mark m; -+ -+ rcu_read_lock(); -+ m = READ_ONCE(PTR_BUCKET(ca, ptr, 0)->mark); -+ rcu_read_unlock(); -+ -+ return m; -+} -+ -+static inline int gen_cmp(u8 a, u8 b) -+{ -+ return (s8) (a - b); -+} -+ -+static inline int gen_after(u8 a, u8 b) -+{ -+ int r = gen_cmp(a, b); -+ -+ return r > 0 ? r : 0; -+} -+ -+/** -+ * ptr_stale() - check if a pointer points into a bucket that has been -+ * invalidated. -+ */ -+static inline u8 ptr_stale(struct bch_dev *ca, -+ const struct bch_extent_ptr *ptr) -+{ -+ return gen_after(ptr_bucket_mark(ca, ptr).gen, ptr->gen); -+} -+ -+static inline s64 __ptr_disk_sectors(struct extent_ptr_decoded p, -+ unsigned live_size) -+{ -+ return live_size && p.crc.compression_type -+ ? 
max(1U, DIV_ROUND_UP(live_size * p.crc.compressed_size, -+ p.crc.uncompressed_size)) -+ : live_size; -+} -+ -+static inline s64 ptr_disk_sectors(struct extent_ptr_decoded p) -+{ -+ return __ptr_disk_sectors(p, p.crc.live_size); -+} -+ -+/* bucket gc marks */ -+ -+static inline unsigned bucket_sectors_used(struct bucket_mark mark) -+{ -+ return mark.dirty_sectors + mark.cached_sectors; -+} -+ -+static inline bool bucket_unused(struct bucket_mark mark) -+{ -+ return !mark.owned_by_allocator && -+ !mark.data_type && -+ !bucket_sectors_used(mark); -+} -+ -+static inline bool is_available_bucket(struct bucket_mark mark) -+{ -+ return (!mark.owned_by_allocator && -+ !mark.dirty_sectors && -+ !mark.stripe); -+} -+ -+static inline bool bucket_needs_journal_commit(struct bucket_mark m, -+ u16 last_seq_ondisk) -+{ -+ return m.journal_seq_valid && -+ ((s16) m.journal_seq - (s16) last_seq_ondisk > 0); -+} -+ -+/* Device usage: */ -+ -+struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *); -+ -+void bch2_dev_usage_from_buckets(struct bch_fs *); -+ -+static inline u64 __dev_buckets_available(struct bch_dev *ca, -+ struct bch_dev_usage stats) -+{ -+ u64 total = ca->mi.nbuckets - ca->mi.first_bucket; -+ -+ if (WARN_ONCE(stats.buckets_unavailable > total, -+ "buckets_unavailable overflow (%llu > %llu)\n", -+ stats.buckets_unavailable, total)) -+ return 0; -+ -+ return total - stats.buckets_unavailable; -+} -+ -+/* -+ * Number of reclaimable buckets - only for use by the allocator thread: -+ */ -+static inline u64 dev_buckets_available(struct bch_dev *ca) -+{ -+ return __dev_buckets_available(ca, bch2_dev_usage_read(ca)); -+} -+ -+static inline u64 __dev_buckets_free(struct bch_dev *ca, -+ struct bch_dev_usage stats) -+{ -+ return __dev_buckets_available(ca, stats) + -+ fifo_used(&ca->free[RESERVE_NONE]) + -+ fifo_used(&ca->free_inc); -+} -+ -+static inline u64 dev_buckets_free(struct bch_dev *ca) -+{ -+ return __dev_buckets_free(ca, bch2_dev_usage_read(ca)); -+} -+ -+/* 
Filesystem usage: */ -+ -+static inline unsigned fs_usage_u64s(struct bch_fs *c) -+{ -+ -+ return sizeof(struct bch_fs_usage) / sizeof(u64) + -+ READ_ONCE(c->replicas.nr); -+} -+ -+void bch2_fs_usage_scratch_put(struct bch_fs *, struct bch_fs_usage *); -+struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *); -+ -+u64 bch2_fs_usage_read_one(struct bch_fs *, u64 *); -+ -+struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *); -+ -+void bch2_fs_usage_acc_to_base(struct bch_fs *, unsigned); -+ -+void bch2_fs_usage_to_text(struct printbuf *, -+ struct bch_fs *, struct bch_fs_usage *); -+ -+u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage *); -+ -+struct bch_fs_usage_short -+bch2_fs_usage_read_short(struct bch_fs *); -+ -+/* key/bucket marking: */ -+ -+void bch2_bucket_seq_cleanup(struct bch_fs *); -+void bch2_fs_usage_initialize(struct bch_fs *); -+ -+void bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *, -+ size_t, struct bucket_mark *); -+void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, -+ size_t, bool, struct gc_pos, unsigned); -+void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, -+ size_t, enum bch_data_type, unsigned, -+ struct gc_pos, unsigned); -+ -+int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, -+ s64, struct bch_fs_usage *, u64, unsigned); -+int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, -+ struct disk_reservation *, unsigned); -+ -+int bch2_mark_update(struct btree_trans *, struct btree_iter *, -+ struct bkey_i *, struct bch_fs_usage *, unsigned); -+ -+int bch2_replicas_delta_list_apply(struct bch_fs *, -+ struct bch_fs_usage *, -+ struct replicas_delta_list *); -+int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, -+ unsigned, s64, unsigned); -+int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter, -+ struct bkey_i *insert, unsigned); -+void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage *); -+ -+/* disk 
reservations: */ -+ -+void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *); -+ -+static inline void bch2_disk_reservation_put(struct bch_fs *c, -+ struct disk_reservation *res) -+{ -+ if (res->sectors) -+ __bch2_disk_reservation_put(c, res); -+} -+ -+#define BCH_DISK_RESERVATION_NOFAIL (1 << 0) -+ -+int bch2_disk_reservation_add(struct bch_fs *, -+ struct disk_reservation *, -+ unsigned, int); -+ -+static inline struct disk_reservation -+bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas) -+{ -+ return (struct disk_reservation) { -+ .sectors = 0, -+#if 0 -+ /* not used yet: */ -+ .gen = c->capacity_gen, -+#endif -+ .nr_replicas = nr_replicas, -+ }; -+} -+ -+static inline int bch2_disk_reservation_get(struct bch_fs *c, -+ struct disk_reservation *res, -+ unsigned sectors, -+ unsigned nr_replicas, -+ int flags) -+{ -+ *res = bch2_disk_reservation_init(c, nr_replicas); -+ -+ return bch2_disk_reservation_add(c, res, sectors * nr_replicas, flags); -+} -+ -+int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64); -+void bch2_dev_buckets_free(struct bch_dev *); -+int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *); -+ -+#endif /* _BUCKETS_H */ -diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h -new file mode 100644 -index 000000000000..d5215b14d7d9 ---- /dev/null -+++ b/fs/bcachefs/buckets_types.h -@@ -0,0 +1,135 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BUCKETS_TYPES_H -+#define _BUCKETS_TYPES_H -+ -+#include "bcachefs_format.h" -+#include "util.h" -+ -+#define BUCKET_JOURNAL_SEQ_BITS 16 -+ -+struct bucket_mark { -+ union { -+ atomic64_t v; -+ -+ struct { -+ u8 gen; -+ u8 data_type:3, -+ owned_by_allocator:1, -+ journal_seq_valid:1, -+ stripe:1; -+ u16 dirty_sectors; -+ u16 cached_sectors; -+ -+ /* -+ * low bits of journal sequence number when this bucket was most -+ * recently modified: if journal_seq_valid is set, this bucket can't be -+ * reused until the journal sequence 
number written to disk is >= the -+ * bucket's journal sequence number: -+ */ -+ u16 journal_seq; -+ }; -+ }; -+}; -+ -+struct bucket { -+ union { -+ struct bucket_mark _mark; -+ const struct bucket_mark mark; -+ }; -+ -+ u16 io_time[2]; -+ u8 oldest_gen; -+ u8 gc_gen; -+ unsigned gen_valid:1; -+}; -+ -+struct bucket_array { -+ struct rcu_head rcu; -+ u16 first_bucket; -+ size_t nbuckets; -+ struct bucket b[]; -+}; -+ -+struct bch_dev_usage { -+ u64 buckets[BCH_DATA_NR]; -+ u64 buckets_alloc; -+ u64 buckets_unavailable; -+ -+ /* _compressed_ sectors: */ -+ u64 sectors[BCH_DATA_NR]; -+ u64 sectors_fragmented; -+ -+ u64 buckets_ec; -+ u64 sectors_ec; -+}; -+ -+struct bch_fs_usage { -+ /* all fields are in units of 512 byte sectors: */ -+ -+ u64 online_reserved; -+ -+ /* fields after online_reserved are cleared/recalculated by gc: */ -+ u64 gc_start[0]; -+ -+ u64 hidden; -+ u64 btree; -+ u64 data; -+ u64 cached; -+ u64 reserved; -+ u64 nr_inodes; -+ -+ /* XXX: add stats for compression ratio */ -+#if 0 -+ u64 uncompressed; -+ u64 compressed; -+#endif -+ -+ /* broken out: */ -+ -+ u64 persistent_reserved[BCH_REPLICAS_MAX]; -+ u64 replicas[]; -+}; -+ -+struct bch_fs_usage_short { -+ u64 capacity; -+ u64 used; -+ u64 free; -+ u64 nr_inodes; -+}; -+ -+struct replicas_delta { -+ s64 delta; -+ struct bch_replicas_entry r; -+} __packed; -+ -+struct replicas_delta_list { -+ unsigned size; -+ unsigned used; -+ -+ struct {} memset_start; -+ u64 nr_inodes; -+ u64 persistent_reserved[BCH_REPLICAS_MAX]; -+ struct {} memset_end; -+ struct replicas_delta d[0]; -+}; -+ -+/* -+ * A reservation for space on disk: -+ */ -+struct disk_reservation { -+ u64 sectors; -+ u32 gen; -+ unsigned nr_replicas; -+}; -+ -+struct copygc_heap_entry { -+ u8 dev; -+ u8 gen; -+ u16 fragmentation; -+ u32 sectors; -+ u64 offset; -+}; -+ -+typedef HEAP(struct copygc_heap_entry) copygc_heap; -+ -+#endif /* _BUCKETS_TYPES_H */ -diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c -new file mode 100644 
-index 000000000000..0377f9018d27 ---- /dev/null -+++ b/fs/bcachefs/chardev.c -@@ -0,0 +1,704 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#ifndef NO_BCACHEFS_CHARDEV -+ -+#include "bcachefs.h" -+#include "bcachefs_ioctl.h" -+#include "buckets.h" -+#include "chardev.h" -+#include "move.h" -+#include "replicas.h" -+#include "super.h" -+#include "super-io.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* returns with ref on ca->ref */ -+static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev, -+ unsigned flags) -+{ -+ struct bch_dev *ca; -+ -+ if (flags & BCH_BY_INDEX) { -+ if (dev >= c->sb.nr_devices) -+ return ERR_PTR(-EINVAL); -+ -+ rcu_read_lock(); -+ ca = rcu_dereference(c->devs[dev]); -+ if (ca) -+ percpu_ref_get(&ca->ref); -+ rcu_read_unlock(); -+ -+ if (!ca) -+ return ERR_PTR(-EINVAL); -+ } else { -+ char *path; -+ -+ path = strndup_user((const char __user *) -+ (unsigned long) dev, PATH_MAX); -+ if (IS_ERR(path)) -+ return ERR_CAST(path); -+ -+ ca = bch2_dev_lookup(c, path); -+ kfree(path); -+ } -+ -+ return ca; -+} -+ -+#if 0 -+static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg) -+{ -+ struct bch_ioctl_assemble arg; -+ struct bch_fs *c; -+ u64 *user_devs = NULL; -+ char **devs = NULL; -+ unsigned i; -+ int ret = -EFAULT; -+ -+ if (copy_from_user(&arg, user_arg, sizeof(arg))) -+ return -EFAULT; -+ -+ if (arg.flags || arg.pad) -+ return -EINVAL; -+ -+ user_devs = kmalloc_array(arg.nr_devs, sizeof(u64), GFP_KERNEL); -+ if (!user_devs) -+ return -ENOMEM; -+ -+ devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL); -+ -+ if (copy_from_user(user_devs, user_arg->devs, -+ sizeof(u64) * arg.nr_devs)) -+ goto err; -+ -+ for (i = 0; i < arg.nr_devs; i++) { -+ devs[i] = strndup_user((const char __user *)(unsigned long) -+ user_devs[i], -+ PATH_MAX); -+ if (!devs[i]) { -+ ret = -ENOMEM; -+ goto err; -+ } -+ } -+ -+ c = bch2_fs_open(devs, arg.nr_devs, 
bch2_opts_empty()); -+ ret = PTR_ERR_OR_ZERO(c); -+ if (!ret) -+ closure_put(&c->cl); -+err: -+ if (devs) -+ for (i = 0; i < arg.nr_devs; i++) -+ kfree(devs[i]); -+ kfree(devs); -+ return ret; -+} -+ -+static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg) -+{ -+ struct bch_ioctl_incremental arg; -+ const char *err; -+ char *path; -+ -+ if (copy_from_user(&arg, user_arg, sizeof(arg))) -+ return -EFAULT; -+ -+ if (arg.flags || arg.pad) -+ return -EINVAL; -+ -+ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); -+ if (!path) -+ return -ENOMEM; -+ -+ err = bch2_fs_open_incremental(path); -+ kfree(path); -+ -+ if (err) { -+ pr_err("Could not register bcachefs devices: %s", err); -+ return -EINVAL; -+ } -+ -+ return 0; -+} -+#endif -+ -+static long bch2_global_ioctl(unsigned cmd, void __user *arg) -+{ -+ switch (cmd) { -+#if 0 -+ case BCH_IOCTL_ASSEMBLE: -+ return bch2_ioctl_assemble(arg); -+ case BCH_IOCTL_INCREMENTAL: -+ return bch2_ioctl_incremental(arg); -+#endif -+ default: -+ return -ENOTTY; -+ } -+} -+ -+static long bch2_ioctl_query_uuid(struct bch_fs *c, -+ struct bch_ioctl_query_uuid __user *user_arg) -+{ -+ return copy_to_user(&user_arg->uuid, -+ &c->sb.user_uuid, -+ sizeof(c->sb.user_uuid)); -+} -+ -+#if 0 -+static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg) -+{ -+ if (arg.flags || arg.pad) -+ return -EINVAL; -+ -+ return bch2_fs_start(c); -+} -+ -+static long bch2_ioctl_stop(struct bch_fs *c) -+{ -+ bch2_fs_stop(c); -+ return 0; -+} -+#endif -+ -+static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg) -+{ -+ char *path; -+ int ret; -+ -+ if (arg.flags || arg.pad) -+ return -EINVAL; -+ -+ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); -+ if (!path) -+ return -ENOMEM; -+ -+ ret = bch2_dev_add(c, path); -+ kfree(path); -+ -+ return ret; -+} -+ -+static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg) -+{ -+ 
struct bch_dev *ca; -+ -+ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| -+ BCH_FORCE_IF_METADATA_LOST| -+ BCH_FORCE_IF_DEGRADED| -+ BCH_BY_INDEX)) || -+ arg.pad) -+ return -EINVAL; -+ -+ ca = bch2_device_lookup(c, arg.dev, arg.flags); -+ if (IS_ERR(ca)) -+ return PTR_ERR(ca); -+ -+ return bch2_dev_remove(c, ca, arg.flags); -+} -+ -+static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg) -+{ -+ char *path; -+ int ret; -+ -+ if (arg.flags || arg.pad) -+ return -EINVAL; -+ -+ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); -+ if (!path) -+ return -ENOMEM; -+ -+ ret = bch2_dev_online(c, path); -+ kfree(path); -+ return ret; -+} -+ -+static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg) -+{ -+ struct bch_dev *ca; -+ int ret; -+ -+ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| -+ BCH_FORCE_IF_METADATA_LOST| -+ BCH_FORCE_IF_DEGRADED| -+ BCH_BY_INDEX)) || -+ arg.pad) -+ return -EINVAL; -+ -+ ca = bch2_device_lookup(c, arg.dev, arg.flags); -+ if (IS_ERR(ca)) -+ return PTR_ERR(ca); -+ -+ ret = bch2_dev_offline(c, ca, arg.flags); -+ percpu_ref_put(&ca->ref); -+ return ret; -+} -+ -+static long bch2_ioctl_disk_set_state(struct bch_fs *c, -+ struct bch_ioctl_disk_set_state arg) -+{ -+ struct bch_dev *ca; -+ int ret; -+ -+ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| -+ BCH_FORCE_IF_METADATA_LOST| -+ BCH_FORCE_IF_DEGRADED| -+ BCH_BY_INDEX)) || -+ arg.pad[0] || arg.pad[1] || arg.pad[2]) -+ return -EINVAL; -+ -+ ca = bch2_device_lookup(c, arg.dev, arg.flags); -+ if (IS_ERR(ca)) -+ return PTR_ERR(ca); -+ -+ ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags); -+ -+ percpu_ref_put(&ca->ref); -+ return ret; -+} -+ -+struct bch_data_ctx { -+ struct bch_fs *c; -+ struct bch_ioctl_data arg; -+ struct bch_move_stats stats; -+ -+ int ret; -+ -+ struct task_struct *thread; -+}; -+ -+static int bch2_data_thread(void *arg) -+{ -+ struct bch_data_ctx *ctx = arg; -+ -+ ctx->ret = bch2_data_job(ctx->c, &ctx->stats, 
ctx->arg); -+ -+ ctx->stats.data_type = U8_MAX; -+ return 0; -+} -+ -+static int bch2_data_job_release(struct inode *inode, struct file *file) -+{ -+ struct bch_data_ctx *ctx = file->private_data; -+ -+ kthread_stop(ctx->thread); -+ put_task_struct(ctx->thread); -+ kfree(ctx); -+ return 0; -+} -+ -+static ssize_t bch2_data_job_read(struct file *file, char __user *buf, -+ size_t len, loff_t *ppos) -+{ -+ struct bch_data_ctx *ctx = file->private_data; -+ struct bch_fs *c = ctx->c; -+ struct bch_ioctl_data_event e = { -+ .type = BCH_DATA_EVENT_PROGRESS, -+ .p.data_type = ctx->stats.data_type, -+ .p.btree_id = ctx->stats.btree_id, -+ .p.pos = ctx->stats.pos, -+ .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), -+ .p.sectors_total = bch2_fs_usage_read_short(c).used, -+ }; -+ -+ if (len < sizeof(e)) -+ return -EINVAL; -+ -+ return copy_to_user(buf, &e, sizeof(e)) ?: sizeof(e); -+} -+ -+static const struct file_operations bcachefs_data_ops = { -+ .release = bch2_data_job_release, -+ .read = bch2_data_job_read, -+ .llseek = no_llseek, -+}; -+ -+static long bch2_ioctl_data(struct bch_fs *c, -+ struct bch_ioctl_data arg) -+{ -+ struct bch_data_ctx *ctx = NULL; -+ struct file *file = NULL; -+ unsigned flags = O_RDONLY|O_CLOEXEC|O_NONBLOCK; -+ int ret, fd = -1; -+ -+ if (arg.op >= BCH_DATA_OP_NR || arg.flags) -+ return -EINVAL; -+ -+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); -+ if (!ctx) -+ return -ENOMEM; -+ -+ ctx->c = c; -+ ctx->arg = arg; -+ -+ ctx->thread = kthread_create(bch2_data_thread, ctx, "[bcachefs]"); -+ if (IS_ERR(ctx->thread)) { -+ ret = PTR_ERR(ctx->thread); -+ goto err; -+ } -+ -+ ret = get_unused_fd_flags(flags); -+ if (ret < 0) -+ goto err; -+ fd = ret; -+ -+ file = anon_inode_getfile("[bcachefs]", &bcachefs_data_ops, ctx, flags); -+ if (IS_ERR(file)) { -+ ret = PTR_ERR(file); -+ goto err; -+ } -+ -+ fd_install(fd, file); -+ -+ get_task_struct(ctx->thread); -+ wake_up_process(ctx->thread); -+ -+ return fd; -+err: -+ if (fd >= 0) -+ put_unused_fd(fd); 
-+ if (!IS_ERR_OR_NULL(ctx->thread)) -+ kthread_stop(ctx->thread); -+ kfree(ctx); -+ return ret; -+} -+ -+static long bch2_ioctl_fs_usage(struct bch_fs *c, -+ struct bch_ioctl_fs_usage __user *user_arg) -+{ -+ struct bch_ioctl_fs_usage *arg = NULL; -+ struct bch_replicas_usage *dst_e, *dst_end; -+ struct bch_fs_usage *src; -+ u32 replica_entries_bytes; -+ unsigned i; -+ int ret = 0; -+ -+ if (!test_bit(BCH_FS_STARTED, &c->flags)) -+ return -EINVAL; -+ -+ if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes)) -+ return -EFAULT; -+ -+ arg = kzalloc(sizeof(*arg) + replica_entries_bytes, GFP_KERNEL); -+ if (!arg) -+ return -ENOMEM; -+ -+ src = bch2_fs_usage_read(c); -+ if (!src) { -+ ret = -ENOMEM; -+ goto err; -+ } -+ -+ arg->capacity = c->capacity; -+ arg->used = bch2_fs_sectors_used(c, src); -+ arg->online_reserved = src->online_reserved; -+ -+ for (i = 0; i < BCH_REPLICAS_MAX; i++) -+ arg->persistent_reserved[i] = src->persistent_reserved[i]; -+ -+ dst_e = arg->replicas; -+ dst_end = (void *) arg->replicas + replica_entries_bytes; -+ -+ for (i = 0; i < c->replicas.nr; i++) { -+ struct bch_replicas_entry *src_e = -+ cpu_replicas_entry(&c->replicas, i); -+ -+ if (replicas_usage_next(dst_e) > dst_end) { -+ ret = -ERANGE; -+ break; -+ } -+ -+ dst_e->sectors = src->replicas[i]; -+ dst_e->r = *src_e; -+ -+ /* recheck after setting nr_devs: */ -+ if (replicas_usage_next(dst_e) > dst_end) { -+ ret = -ERANGE; -+ break; -+ } -+ -+ memcpy(dst_e->r.devs, src_e->devs, src_e->nr_devs); -+ -+ dst_e = replicas_usage_next(dst_e); -+ } -+ -+ arg->replica_entries_bytes = (void *) dst_e - (void *) arg->replicas; -+ -+ percpu_up_read(&c->mark_lock); -+ kfree(src); -+ -+ if (!ret) -+ ret = copy_to_user(user_arg, arg, -+ sizeof(*arg) + arg->replica_entries_bytes); -+err: -+ kfree(arg); -+ return ret; -+} -+ -+static long bch2_ioctl_dev_usage(struct bch_fs *c, -+ struct bch_ioctl_dev_usage __user *user_arg) -+{ -+ struct bch_ioctl_dev_usage arg; -+ struct bch_dev_usage 
src; -+ struct bch_dev *ca; -+ unsigned i; -+ -+ if (!test_bit(BCH_FS_STARTED, &c->flags)) -+ return -EINVAL; -+ -+ if (copy_from_user(&arg, user_arg, sizeof(arg))) -+ return -EFAULT; -+ -+ if ((arg.flags & ~BCH_BY_INDEX) || -+ arg.pad[0] || -+ arg.pad[1] || -+ arg.pad[2]) -+ return -EINVAL; -+ -+ ca = bch2_device_lookup(c, arg.dev, arg.flags); -+ if (IS_ERR(ca)) -+ return PTR_ERR(ca); -+ -+ src = bch2_dev_usage_read(ca); -+ -+ arg.state = ca->mi.state; -+ arg.bucket_size = ca->mi.bucket_size; -+ arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; -+ arg.available_buckets = arg.nr_buckets - src.buckets_unavailable; -+ arg.ec_buckets = src.buckets_ec; -+ arg.ec_sectors = src.sectors_ec; -+ -+ for (i = 0; i < BCH_DATA_NR; i++) { -+ arg.buckets[i] = src.buckets[i]; -+ arg.sectors[i] = src.sectors[i]; -+ } -+ -+ percpu_ref_put(&ca->ref); -+ -+ return copy_to_user(user_arg, &arg, sizeof(arg)); -+} -+ -+static long bch2_ioctl_read_super(struct bch_fs *c, -+ struct bch_ioctl_read_super arg) -+{ -+ struct bch_dev *ca = NULL; -+ struct bch_sb *sb; -+ int ret = 0; -+ -+ if ((arg.flags & ~(BCH_BY_INDEX|BCH_READ_DEV)) || -+ arg.pad) -+ return -EINVAL; -+ -+ mutex_lock(&c->sb_lock); -+ -+ if (arg.flags & BCH_READ_DEV) { -+ ca = bch2_device_lookup(c, arg.dev, arg.flags); -+ -+ if (IS_ERR(ca)) { -+ ret = PTR_ERR(ca); -+ goto err; -+ } -+ -+ sb = ca->disk_sb.sb; -+ } else { -+ sb = c->disk_sb.sb; -+ } -+ -+ if (vstruct_bytes(sb) > arg.size) { -+ ret = -ERANGE; -+ goto err; -+ } -+ -+ ret = copy_to_user((void __user *)(unsigned long)arg.sb, -+ sb, vstruct_bytes(sb)); -+err: -+ if (ca) -+ percpu_ref_put(&ca->ref); -+ mutex_unlock(&c->sb_lock); -+ return ret; -+} -+ -+static long bch2_ioctl_disk_get_idx(struct bch_fs *c, -+ struct bch_ioctl_disk_get_idx arg) -+{ -+ dev_t dev = huge_decode_dev(arg.dev); -+ struct bch_dev *ca; -+ unsigned i; -+ -+ for_each_online_member(ca, c, i) -+ if (ca->disk_sb.bdev->bd_dev == dev) { -+ percpu_ref_put(&ca->io_ref); -+ return i; -+ } -+ -+ 
return -ENOENT; -+} -+ -+static long bch2_ioctl_disk_resize(struct bch_fs *c, -+ struct bch_ioctl_disk_resize arg) -+{ -+ struct bch_dev *ca; -+ int ret; -+ -+ if ((arg.flags & ~BCH_BY_INDEX) || -+ arg.pad) -+ return -EINVAL; -+ -+ ca = bch2_device_lookup(c, arg.dev, arg.flags); -+ if (IS_ERR(ca)) -+ return PTR_ERR(ca); -+ -+ ret = bch2_dev_resize(c, ca, arg.nbuckets); -+ -+ percpu_ref_put(&ca->ref); -+ return ret; -+} -+ -+#define BCH_IOCTL(_name, _argtype) \ -+do { \ -+ _argtype i; \ -+ \ -+ if (copy_from_user(&i, arg, sizeof(i))) \ -+ return -EFAULT; \ -+ return bch2_ioctl_##_name(c, i); \ -+} while (0) -+ -+long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) -+{ -+ /* ioctls that don't require admin cap: */ -+ switch (cmd) { -+ case BCH_IOCTL_QUERY_UUID: -+ return bch2_ioctl_query_uuid(c, arg); -+ case BCH_IOCTL_FS_USAGE: -+ return bch2_ioctl_fs_usage(c, arg); -+ case BCH_IOCTL_DEV_USAGE: -+ return bch2_ioctl_dev_usage(c, arg); -+ } -+ -+ if (!capable(CAP_SYS_ADMIN)) -+ return -EPERM; -+ -+ switch (cmd) { -+#if 0 -+ case BCH_IOCTL_START: -+ BCH_IOCTL(start, struct bch_ioctl_start); -+ case BCH_IOCTL_STOP: -+ return bch2_ioctl_stop(c); -+#endif -+ case BCH_IOCTL_READ_SUPER: -+ BCH_IOCTL(read_super, struct bch_ioctl_read_super); -+ case BCH_IOCTL_DISK_GET_IDX: -+ BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx); -+ } -+ -+ if (!test_bit(BCH_FS_STARTED, &c->flags)) -+ return -EINVAL; -+ -+ /* ioctls that do require admin cap: */ -+ switch (cmd) { -+ case BCH_IOCTL_DISK_ADD: -+ BCH_IOCTL(disk_add, struct bch_ioctl_disk); -+ case BCH_IOCTL_DISK_REMOVE: -+ BCH_IOCTL(disk_remove, struct bch_ioctl_disk); -+ case BCH_IOCTL_DISK_ONLINE: -+ BCH_IOCTL(disk_online, struct bch_ioctl_disk); -+ case BCH_IOCTL_DISK_OFFLINE: -+ BCH_IOCTL(disk_offline, struct bch_ioctl_disk); -+ case BCH_IOCTL_DISK_SET_STATE: -+ BCH_IOCTL(disk_set_state, struct bch_ioctl_disk_set_state); -+ case BCH_IOCTL_DATA: -+ BCH_IOCTL(data, struct bch_ioctl_data); -+ case 
BCH_IOCTL_DISK_RESIZE: -+ BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize); -+ -+ default: -+ return -ENOTTY; -+ } -+} -+ -+static DEFINE_IDR(bch_chardev_minor); -+ -+static long bch2_chardev_ioctl(struct file *filp, unsigned cmd, unsigned long v) -+{ -+ unsigned minor = iminor(file_inode(filp)); -+ struct bch_fs *c = minor < U8_MAX ? idr_find(&bch_chardev_minor, minor) : NULL; -+ void __user *arg = (void __user *) v; -+ -+ return c -+ ? bch2_fs_ioctl(c, cmd, arg) -+ : bch2_global_ioctl(cmd, arg); -+} -+ -+static const struct file_operations bch_chardev_fops = { -+ .owner = THIS_MODULE, -+ .unlocked_ioctl = bch2_chardev_ioctl, -+ .open = nonseekable_open, -+}; -+ -+static int bch_chardev_major; -+static struct class *bch_chardev_class; -+static struct device *bch_chardev; -+ -+void bch2_fs_chardev_exit(struct bch_fs *c) -+{ -+ if (!IS_ERR_OR_NULL(c->chardev)) -+ device_unregister(c->chardev); -+ if (c->minor >= 0) -+ idr_remove(&bch_chardev_minor, c->minor); -+} -+ -+int bch2_fs_chardev_init(struct bch_fs *c) -+{ -+ c->minor = idr_alloc(&bch_chardev_minor, c, 0, 0, GFP_KERNEL); -+ if (c->minor < 0) -+ return c->minor; -+ -+ c->chardev = device_create(bch_chardev_class, NULL, -+ MKDEV(bch_chardev_major, c->minor), c, -+ "bcachefs%u-ctl", c->minor); -+ if (IS_ERR(c->chardev)) -+ return PTR_ERR(c->chardev); -+ -+ return 0; -+} -+ -+void bch2_chardev_exit(void) -+{ -+ if (!IS_ERR_OR_NULL(bch_chardev_class)) -+ device_destroy(bch_chardev_class, -+ MKDEV(bch_chardev_major, U8_MAX)); -+ if (!IS_ERR_OR_NULL(bch_chardev_class)) -+ class_destroy(bch_chardev_class); -+ if (bch_chardev_major > 0) -+ unregister_chrdev(bch_chardev_major, "bcachefs"); -+} -+ -+int __init bch2_chardev_init(void) -+{ -+ bch_chardev_major = register_chrdev(0, "bcachefs-ctl", &bch_chardev_fops); -+ if (bch_chardev_major < 0) -+ return bch_chardev_major; -+ -+ bch_chardev_class = class_create(THIS_MODULE, "bcachefs"); -+ if (IS_ERR(bch_chardev_class)) -+ return PTR_ERR(bch_chardev_class); -+ -+ 
bch_chardev = device_create(bch_chardev_class, NULL, -+ MKDEV(bch_chardev_major, U8_MAX), -+ NULL, "bcachefs-ctl"); -+ if (IS_ERR(bch_chardev)) -+ return PTR_ERR(bch_chardev); -+ -+ return 0; -+} -+ -+#endif /* NO_BCACHEFS_CHARDEV */ -diff --git a/fs/bcachefs/chardev.h b/fs/bcachefs/chardev.h -new file mode 100644 -index 000000000000..3a4890d39ff9 ---- /dev/null -+++ b/fs/bcachefs/chardev.h -@@ -0,0 +1,31 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_CHARDEV_H -+#define _BCACHEFS_CHARDEV_H -+ -+#ifndef NO_BCACHEFS_FS -+ -+long bch2_fs_ioctl(struct bch_fs *, unsigned, void __user *); -+ -+void bch2_fs_chardev_exit(struct bch_fs *); -+int bch2_fs_chardev_init(struct bch_fs *); -+ -+void bch2_chardev_exit(void); -+int __init bch2_chardev_init(void); -+ -+#else -+ -+static inline long bch2_fs_ioctl(struct bch_fs *c, -+ unsigned cmd, void __user * arg) -+{ -+ return -ENOSYS; -+} -+ -+static inline void bch2_fs_chardev_exit(struct bch_fs *c) {} -+static inline int bch2_fs_chardev_init(struct bch_fs *c) { return 0; } -+ -+static inline void bch2_chardev_exit(void) {} -+static inline int __init bch2_chardev_init(void) { return 0; } -+ -+#endif /* NO_BCACHEFS_FS */ -+ -+#endif /* _BCACHEFS_CHARDEV_H */ -diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c -new file mode 100644 -index 000000000000..3d88719ba86c ---- /dev/null -+++ b/fs/bcachefs/checksum.c -@@ -0,0 +1,618 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "checksum.h" -+#include "super.h" -+#include "super-io.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+static u64 bch2_checksum_init(unsigned type) -+{ -+ switch (type) { -+ case BCH_CSUM_NONE: -+ return 0; -+ case BCH_CSUM_CRC32C_NONZERO: -+ return U32_MAX; -+ case BCH_CSUM_CRC64_NONZERO: -+ return U64_MAX; -+ case BCH_CSUM_CRC32C: -+ return 0; -+ case BCH_CSUM_CRC64: -+ return 0; -+ default: -+ BUG(); -+ } -+} -+ -+static 
u64 bch2_checksum_final(unsigned type, u64 crc) -+{ -+ switch (type) { -+ case BCH_CSUM_NONE: -+ return 0; -+ case BCH_CSUM_CRC32C_NONZERO: -+ return crc ^ U32_MAX; -+ case BCH_CSUM_CRC64_NONZERO: -+ return crc ^ U64_MAX; -+ case BCH_CSUM_CRC32C: -+ return crc; -+ case BCH_CSUM_CRC64: -+ return crc; -+ default: -+ BUG(); -+ } -+} -+ -+static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t len) -+{ -+ switch (type) { -+ case BCH_CSUM_NONE: -+ return 0; -+ case BCH_CSUM_CRC32C_NONZERO: -+ case BCH_CSUM_CRC32C: -+ return crc32c(crc, data, len); -+ case BCH_CSUM_CRC64_NONZERO: -+ case BCH_CSUM_CRC64: -+ return crc64_be(crc, data, len); -+ default: -+ BUG(); -+ } -+} -+ -+static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm, -+ struct nonce nonce, -+ struct scatterlist *sg, size_t len) -+{ -+ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); -+ int ret; -+ -+ skcipher_request_set_sync_tfm(req, tfm); -+ skcipher_request_set_crypt(req, sg, sg, len, nonce.d); -+ -+ ret = crypto_skcipher_encrypt(req); -+ BUG_ON(ret); -+} -+ -+static inline void do_encrypt(struct crypto_sync_skcipher *tfm, -+ struct nonce nonce, -+ void *buf, size_t len) -+{ -+ struct scatterlist sg; -+ -+ sg_init_one(&sg, buf, len); -+ do_encrypt_sg(tfm, nonce, &sg, len); -+} -+ -+int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, -+ void *buf, size_t len) -+{ -+ struct crypto_sync_skcipher *chacha20 = -+ crypto_alloc_sync_skcipher("chacha20", 0, 0); -+ int ret; -+ -+ if (!chacha20) { -+ pr_err("error requesting chacha20 module: %li", PTR_ERR(chacha20)); -+ return PTR_ERR(chacha20); -+ } -+ -+ ret = crypto_skcipher_setkey(&chacha20->base, -+ (void *) key, sizeof(*key)); -+ if (ret) { -+ pr_err("crypto_skcipher_setkey() error: %i", ret); -+ goto err; -+ } -+ -+ do_encrypt(chacha20, nonce, buf, len); -+err: -+ crypto_free_sync_skcipher(chacha20); -+ return ret; -+} -+ -+static void gen_poly_key(struct bch_fs *c, struct shash_desc *desc, -+ struct nonce nonce) 
-+{ -+ u8 key[POLY1305_KEY_SIZE]; -+ -+ nonce.d[3] ^= BCH_NONCE_POLY; -+ -+ memset(key, 0, sizeof(key)); -+ do_encrypt(c->chacha20, nonce, key, sizeof(key)); -+ -+ desc->tfm = c->poly1305; -+ crypto_shash_init(desc); -+ crypto_shash_update(desc, key, sizeof(key)); -+} -+ -+struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, -+ struct nonce nonce, const void *data, size_t len) -+{ -+ switch (type) { -+ case BCH_CSUM_NONE: -+ case BCH_CSUM_CRC32C_NONZERO: -+ case BCH_CSUM_CRC64_NONZERO: -+ case BCH_CSUM_CRC32C: -+ case BCH_CSUM_CRC64: { -+ u64 crc = bch2_checksum_init(type); -+ -+ crc = bch2_checksum_update(type, crc, data, len); -+ crc = bch2_checksum_final(type, crc); -+ -+ return (struct bch_csum) { .lo = cpu_to_le64(crc) }; -+ } -+ -+ case BCH_CSUM_CHACHA20_POLY1305_80: -+ case BCH_CSUM_CHACHA20_POLY1305_128: { -+ SHASH_DESC_ON_STACK(desc, c->poly1305); -+ u8 digest[POLY1305_DIGEST_SIZE]; -+ struct bch_csum ret = { 0 }; -+ -+ gen_poly_key(c, desc, nonce); -+ -+ crypto_shash_update(desc, data, len); -+ crypto_shash_final(desc, digest); -+ -+ memcpy(&ret, digest, bch_crc_bytes[type]); -+ return ret; -+ } -+ default: -+ BUG(); -+ } -+} -+ -+void bch2_encrypt(struct bch_fs *c, unsigned type, -+ struct nonce nonce, void *data, size_t len) -+{ -+ if (!bch2_csum_type_is_encryption(type)) -+ return; -+ -+ do_encrypt(c->chacha20, nonce, data, len); -+} -+ -+static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, -+ struct nonce nonce, struct bio *bio, -+ struct bvec_iter *iter) -+{ -+ struct bio_vec bv; -+ -+ switch (type) { -+ case BCH_CSUM_NONE: -+ return (struct bch_csum) { 0 }; -+ case BCH_CSUM_CRC32C_NONZERO: -+ case BCH_CSUM_CRC64_NONZERO: -+ case BCH_CSUM_CRC32C: -+ case BCH_CSUM_CRC64: { -+ u64 crc = bch2_checksum_init(type); -+ -+#ifdef CONFIG_HIGHMEM -+ __bio_for_each_segment(bv, bio, *iter, *iter) { -+ void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; -+ crc = bch2_checksum_update(type, -+ crc, p, bv.bv_len); -+ kunmap_atomic(p); 
-+ } -+#else -+ __bio_for_each_bvec(bv, bio, *iter, *iter) -+ crc = bch2_checksum_update(type, crc, -+ page_address(bv.bv_page) + bv.bv_offset, -+ bv.bv_len); -+#endif -+ crc = bch2_checksum_final(type, crc); -+ return (struct bch_csum) { .lo = cpu_to_le64(crc) }; -+ } -+ -+ case BCH_CSUM_CHACHA20_POLY1305_80: -+ case BCH_CSUM_CHACHA20_POLY1305_128: { -+ SHASH_DESC_ON_STACK(desc, c->poly1305); -+ u8 digest[POLY1305_DIGEST_SIZE]; -+ struct bch_csum ret = { 0 }; -+ -+ gen_poly_key(c, desc, nonce); -+ -+#ifdef CONFIG_HIGHMEM -+ __bio_for_each_segment(bv, bio, *iter, *iter) { -+ void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; -+ -+ crypto_shash_update(desc, p, bv.bv_len); -+ kunmap_atomic(p); -+ } -+#else -+ __bio_for_each_bvec(bv, bio, *iter, *iter) -+ crypto_shash_update(desc, -+ page_address(bv.bv_page) + bv.bv_offset, -+ bv.bv_len); -+#endif -+ crypto_shash_final(desc, digest); -+ -+ memcpy(&ret, digest, bch_crc_bytes[type]); -+ return ret; -+ } -+ default: -+ BUG(); -+ } -+} -+ -+struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type, -+ struct nonce nonce, struct bio *bio) -+{ -+ struct bvec_iter iter = bio->bi_iter; -+ -+ return __bch2_checksum_bio(c, type, nonce, bio, &iter); -+} -+ -+void bch2_encrypt_bio(struct bch_fs *c, unsigned type, -+ struct nonce nonce, struct bio *bio) -+{ -+ struct bio_vec bv; -+ struct bvec_iter iter; -+ struct scatterlist sgl[16], *sg = sgl; -+ size_t bytes = 0; -+ -+ if (!bch2_csum_type_is_encryption(type)) -+ return; -+ -+ sg_init_table(sgl, ARRAY_SIZE(sgl)); -+ -+ bio_for_each_segment(bv, bio, iter) { -+ if (sg == sgl + ARRAY_SIZE(sgl)) { -+ sg_mark_end(sg - 1); -+ do_encrypt_sg(c->chacha20, nonce, sgl, bytes); -+ -+ nonce = nonce_add(nonce, bytes); -+ bytes = 0; -+ -+ sg_init_table(sgl, ARRAY_SIZE(sgl)); -+ sg = sgl; -+ } -+ -+ sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset); -+ bytes += bv.bv_len; -+ } -+ -+ sg_mark_end(sg - 1); -+ do_encrypt_sg(c->chacha20, nonce, sgl, bytes); -+} -+ -+struct bch_csum 
bch2_checksum_merge(unsigned type, struct bch_csum a, -+ struct bch_csum b, size_t b_len) -+{ -+ BUG_ON(!bch2_checksum_mergeable(type)); -+ -+ while (b_len) { -+ unsigned b = min_t(unsigned, b_len, PAGE_SIZE); -+ -+ a.lo = bch2_checksum_update(type, a.lo, -+ page_address(ZERO_PAGE(0)), b); -+ b_len -= b; -+ } -+ -+ a.lo ^= b.lo; -+ a.hi ^= b.hi; -+ return a; -+} -+ -+int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, -+ struct bversion version, -+ struct bch_extent_crc_unpacked crc_old, -+ struct bch_extent_crc_unpacked *crc_a, -+ struct bch_extent_crc_unpacked *crc_b, -+ unsigned len_a, unsigned len_b, -+ unsigned new_csum_type) -+{ -+ struct bvec_iter iter = bio->bi_iter; -+ struct nonce nonce = extent_nonce(version, crc_old); -+ struct bch_csum merged = { 0 }; -+ struct crc_split { -+ struct bch_extent_crc_unpacked *crc; -+ unsigned len; -+ unsigned csum_type; -+ struct bch_csum csum; -+ } splits[3] = { -+ { crc_a, len_a, new_csum_type }, -+ { crc_b, len_b, new_csum_type }, -+ { NULL, bio_sectors(bio) - len_a - len_b, new_csum_type }, -+ }, *i; -+ bool mergeable = crc_old.csum_type == new_csum_type && -+ bch2_checksum_mergeable(new_csum_type); -+ unsigned crc_nonce = crc_old.nonce; -+ -+ BUG_ON(len_a + len_b > bio_sectors(bio)); -+ BUG_ON(crc_old.uncompressed_size != bio_sectors(bio)); -+ BUG_ON(crc_is_compressed(crc_old)); -+ BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) != -+ bch2_csum_type_is_encryption(new_csum_type)); -+ -+ for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { -+ iter.bi_size = i->len << 9; -+ if (mergeable || i->crc) -+ i->csum = __bch2_checksum_bio(c, i->csum_type, -+ nonce, bio, &iter); -+ else -+ bio_advance_iter(bio, &iter, i->len << 9); -+ nonce = nonce_add(nonce, i->len << 9); -+ } -+ -+ if (mergeable) -+ for (i = splits; i < splits + ARRAY_SIZE(splits); i++) -+ merged = bch2_checksum_merge(new_csum_type, merged, -+ i->csum, i->len << 9); -+ else -+ merged = bch2_checksum_bio(c, crc_old.csum_type, -+ 
extent_nonce(version, crc_old), bio); -+ -+ if (bch2_crc_cmp(merged, crc_old.csum)) -+ return -EIO; -+ -+ for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { -+ if (i->crc) -+ *i->crc = (struct bch_extent_crc_unpacked) { -+ .csum_type = i->csum_type, -+ .compression_type = crc_old.compression_type, -+ .compressed_size = i->len, -+ .uncompressed_size = i->len, -+ .offset = 0, -+ .live_size = i->len, -+ .nonce = crc_nonce, -+ .csum = i->csum, -+ }; -+ -+ if (bch2_csum_type_is_encryption(new_csum_type)) -+ crc_nonce += i->len; -+ } -+ -+ return 0; -+} -+ -+#ifdef __KERNEL__ -+int bch2_request_key(struct bch_sb *sb, struct bch_key *key) -+{ -+ char key_description[60]; -+ struct key *keyring_key; -+ const struct user_key_payload *ukp; -+ int ret; -+ -+ snprintf(key_description, sizeof(key_description), -+ "bcachefs:%pUb", &sb->user_uuid); -+ -+ keyring_key = request_key(&key_type_logon, key_description, NULL); -+ if (IS_ERR(keyring_key)) -+ return PTR_ERR(keyring_key); -+ -+ down_read(&keyring_key->sem); -+ ukp = dereference_key_locked(keyring_key); -+ if (ukp->datalen == sizeof(*key)) { -+ memcpy(key, ukp->data, ukp->datalen); -+ ret = 0; -+ } else { -+ ret = -EINVAL; -+ } -+ up_read(&keyring_key->sem); -+ key_put(keyring_key); -+ -+ return ret; -+} -+#else -+#include -+#include -+ -+int bch2_request_key(struct bch_sb *sb, struct bch_key *key) -+{ -+ key_serial_t key_id; -+ char key_description[60]; -+ char uuid[40]; -+ -+ uuid_unparse_lower(sb->user_uuid.b, uuid); -+ sprintf(key_description, "bcachefs:%s", uuid); -+ -+ key_id = request_key("user", key_description, NULL, -+ KEY_SPEC_USER_KEYRING); -+ if (key_id < 0) -+ return -errno; -+ -+ if (keyctl_read(key_id, (void *) key, sizeof(*key)) != sizeof(*key)) -+ return -1; -+ -+ return 0; -+} -+#endif -+ -+int bch2_decrypt_sb_key(struct bch_fs *c, -+ struct bch_sb_field_crypt *crypt, -+ struct bch_key *key) -+{ -+ struct bch_encrypted_key sb_key = crypt->key; -+ struct bch_key user_key; -+ int ret = 0; -+ -+ /* is 
key encrypted? */ -+ if (!bch2_key_is_encrypted(&sb_key)) -+ goto out; -+ -+ ret = bch2_request_key(c->disk_sb.sb, &user_key); -+ if (ret) { -+ bch_err(c, "error requesting encryption key: %i", ret); -+ goto err; -+ } -+ -+ /* decrypt real key: */ -+ ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c), -+ &sb_key, sizeof(sb_key)); -+ if (ret) -+ goto err; -+ -+ if (bch2_key_is_encrypted(&sb_key)) { -+ bch_err(c, "incorrect encryption key"); -+ ret = -EINVAL; -+ goto err; -+ } -+out: -+ *key = sb_key.key; -+err: -+ memzero_explicit(&sb_key, sizeof(sb_key)); -+ memzero_explicit(&user_key, sizeof(user_key)); -+ return ret; -+} -+ -+static int bch2_alloc_ciphers(struct bch_fs *c) -+{ -+ if (!c->chacha20) -+ c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0); -+ if (IS_ERR(c->chacha20)) { -+ bch_err(c, "error requesting chacha20 module: %li", -+ PTR_ERR(c->chacha20)); -+ return PTR_ERR(c->chacha20); -+ } -+ -+ if (!c->poly1305) -+ c->poly1305 = crypto_alloc_shash("poly1305", 0, 0); -+ if (IS_ERR(c->poly1305)) { -+ bch_err(c, "error requesting poly1305 module: %li", -+ PTR_ERR(c->poly1305)); -+ return PTR_ERR(c->poly1305); -+ } -+ -+ return 0; -+} -+ -+int bch2_disable_encryption(struct bch_fs *c) -+{ -+ struct bch_sb_field_crypt *crypt; -+ struct bch_key key; -+ int ret = -EINVAL; -+ -+ mutex_lock(&c->sb_lock); -+ -+ crypt = bch2_sb_get_crypt(c->disk_sb.sb); -+ if (!crypt) -+ goto out; -+ -+ /* is key encrypted? 
*/ -+ ret = 0; -+ if (bch2_key_is_encrypted(&crypt->key)) -+ goto out; -+ -+ ret = bch2_decrypt_sb_key(c, crypt, &key); -+ if (ret) -+ goto out; -+ -+ crypt->key.magic = BCH_KEY_MAGIC; -+ crypt->key.key = key; -+ -+ SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0); -+ bch2_write_super(c); -+out: -+ mutex_unlock(&c->sb_lock); -+ -+ return ret; -+} -+ -+int bch2_enable_encryption(struct bch_fs *c, bool keyed) -+{ -+ struct bch_encrypted_key key; -+ struct bch_key user_key; -+ struct bch_sb_field_crypt *crypt; -+ int ret = -EINVAL; -+ -+ mutex_lock(&c->sb_lock); -+ -+ /* Do we already have an encryption key? */ -+ if (bch2_sb_get_crypt(c->disk_sb.sb)) -+ goto err; -+ -+ ret = bch2_alloc_ciphers(c); -+ if (ret) -+ goto err; -+ -+ key.magic = BCH_KEY_MAGIC; -+ get_random_bytes(&key.key, sizeof(key.key)); -+ -+ if (keyed) { -+ ret = bch2_request_key(c->disk_sb.sb, &user_key); -+ if (ret) { -+ bch_err(c, "error requesting encryption key: %i", ret); -+ goto err; -+ } -+ -+ ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c), -+ &key, sizeof(key)); -+ if (ret) -+ goto err; -+ } -+ -+ ret = crypto_skcipher_setkey(&c->chacha20->base, -+ (void *) &key.key, sizeof(key.key)); -+ if (ret) -+ goto err; -+ -+ crypt = bch2_sb_resize_crypt(&c->disk_sb, sizeof(*crypt) / sizeof(u64)); -+ if (!crypt) { -+ ret = -ENOMEM; /* XXX this technically could be -ENOSPC */ -+ goto err; -+ } -+ -+ crypt->key = key; -+ -+ /* write superblock */ -+ SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 1); -+ bch2_write_super(c); -+err: -+ mutex_unlock(&c->sb_lock); -+ memzero_explicit(&user_key, sizeof(user_key)); -+ memzero_explicit(&key, sizeof(key)); -+ return ret; -+} -+ -+void bch2_fs_encryption_exit(struct bch_fs *c) -+{ -+ if (!IS_ERR_OR_NULL(c->poly1305)) -+ crypto_free_shash(c->poly1305); -+ if (!IS_ERR_OR_NULL(c->chacha20)) -+ crypto_free_sync_skcipher(c->chacha20); -+ if (!IS_ERR_OR_NULL(c->sha256)) -+ crypto_free_shash(c->sha256); -+} -+ -+int bch2_fs_encryption_init(struct bch_fs *c) -+{ -+ 
struct bch_sb_field_crypt *crypt; -+ struct bch_key key; -+ int ret = 0; -+ -+ pr_verbose_init(c->opts, ""); -+ -+ c->sha256 = crypto_alloc_shash("sha256", 0, 0); -+ if (IS_ERR(c->sha256)) { -+ bch_err(c, "error requesting sha256 module"); -+ ret = PTR_ERR(c->sha256); -+ goto out; -+ } -+ -+ crypt = bch2_sb_get_crypt(c->disk_sb.sb); -+ if (!crypt) -+ goto out; -+ -+ ret = bch2_alloc_ciphers(c); -+ if (ret) -+ goto out; -+ -+ ret = bch2_decrypt_sb_key(c, crypt, &key); -+ if (ret) -+ goto out; -+ -+ ret = crypto_skcipher_setkey(&c->chacha20->base, -+ (void *) &key.key, sizeof(key.key)); -+ if (ret) -+ goto out; -+out: -+ memzero_explicit(&key, sizeof(key)); -+ pr_verbose_init(c->opts, "ret %i", ret); -+ return ret; -+} -diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h -new file mode 100644 -index 000000000000..24dee8039d57 ---- /dev/null -+++ b/fs/bcachefs/checksum.h -@@ -0,0 +1,202 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_CHECKSUM_H -+#define _BCACHEFS_CHECKSUM_H -+ -+#include "bcachefs.h" -+#include "extents_types.h" -+#include "super-io.h" -+ -+#include -+#include -+ -+static inline bool bch2_checksum_mergeable(unsigned type) -+{ -+ -+ switch (type) { -+ case BCH_CSUM_NONE: -+ case BCH_CSUM_CRC32C: -+ case BCH_CSUM_CRC64: -+ return true; -+ default: -+ return false; -+ } -+} -+ -+struct bch_csum bch2_checksum_merge(unsigned, struct bch_csum, -+ struct bch_csum, size_t); -+ -+#define BCH_NONCE_EXTENT cpu_to_le32(1 << 28) -+#define BCH_NONCE_BTREE cpu_to_le32(2 << 28) -+#define BCH_NONCE_JOURNAL cpu_to_le32(3 << 28) -+#define BCH_NONCE_PRIO cpu_to_le32(4 << 28) -+#define BCH_NONCE_POLY cpu_to_le32(1 << 31) -+ -+struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce, -+ const void *, size_t); -+ -+/* -+ * This is used for various on disk data structures - bch_sb, prio_set, bset, -+ * jset: The checksum is _always_ the first field of these structs -+ */ -+#define csum_vstruct(_c, _type, _nonce, _i) \ -+({ \ -+ const 
void *start = ((const void *) (_i)) + sizeof((_i)->csum); \ -+ const void *end = vstruct_end(_i); \ -+ \ -+ bch2_checksum(_c, _type, _nonce, start, end - start); \ -+}) -+ -+int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t); -+int bch2_request_key(struct bch_sb *, struct bch_key *); -+ -+void bch2_encrypt(struct bch_fs *, unsigned, struct nonce, -+ void *data, size_t); -+ -+struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned, -+ struct nonce, struct bio *); -+ -+int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion, -+ struct bch_extent_crc_unpacked, -+ struct bch_extent_crc_unpacked *, -+ struct bch_extent_crc_unpacked *, -+ unsigned, unsigned, unsigned); -+ -+void bch2_encrypt_bio(struct bch_fs *, unsigned, -+ struct nonce, struct bio *); -+ -+int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *, -+ struct bch_key *); -+ -+int bch2_disable_encryption(struct bch_fs *); -+int bch2_enable_encryption(struct bch_fs *, bool); -+ -+void bch2_fs_encryption_exit(struct bch_fs *); -+int bch2_fs_encryption_init(struct bch_fs *); -+ -+static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type, -+ bool data) -+{ -+ switch (type) { -+ case BCH_CSUM_OPT_NONE: -+ return BCH_CSUM_NONE; -+ case BCH_CSUM_OPT_CRC32C: -+ return data ? BCH_CSUM_CRC32C : BCH_CSUM_CRC32C_NONZERO; -+ case BCH_CSUM_OPT_CRC64: -+ return data ? BCH_CSUM_CRC64 : BCH_CSUM_CRC64_NONZERO; -+ default: -+ BUG(); -+ } -+} -+ -+static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c, -+ unsigned opt) -+{ -+ if (c->sb.encryption_type) -+ return c->opts.wide_macs -+ ? 
BCH_CSUM_CHACHA20_POLY1305_128 -+ : BCH_CSUM_CHACHA20_POLY1305_80; -+ -+ return bch2_csum_opt_to_type(opt, true); -+} -+ -+static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c) -+{ -+ if (c->sb.encryption_type) -+ return BCH_CSUM_CHACHA20_POLY1305_128; -+ -+ return bch2_csum_opt_to_type(c->opts.metadata_checksum, false); -+} -+ -+static const unsigned bch2_compression_opt_to_type[] = { -+#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t, -+ BCH_COMPRESSION_OPTS() -+#undef x -+}; -+ -+static inline bool bch2_checksum_type_valid(const struct bch_fs *c, -+ unsigned type) -+{ -+ if (type >= BCH_CSUM_NR) -+ return false; -+ -+ if (bch2_csum_type_is_encryption(type) && !c->chacha20) -+ return false; -+ -+ return true; -+} -+ -+/* returns true if not equal */ -+static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r) -+{ -+ /* -+ * XXX: need some way of preventing the compiler from optimizing this -+ * into a form that isn't constant time.. -+ */ -+ return ((l.lo ^ r.lo) | (l.hi ^ r.hi)) != 0; -+} -+ -+/* for skipping ahead and encrypting/decrypting at an offset: */ -+static inline struct nonce nonce_add(struct nonce nonce, unsigned offset) -+{ -+ EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1)); -+ -+ le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE); -+ return nonce; -+} -+ -+static inline struct nonce null_nonce(void) -+{ -+ struct nonce ret; -+ -+ memset(&ret, 0, sizeof(ret)); -+ return ret; -+} -+ -+static inline struct nonce extent_nonce(struct bversion version, -+ struct bch_extent_crc_unpacked crc) -+{ -+ unsigned compression_type = crc_is_compressed(crc) -+ ? crc.compression_type -+ : 0; -+ unsigned size = compression_type ? 
crc.uncompressed_size : 0; -+ struct nonce nonce = (struct nonce) {{ -+ [0] = cpu_to_le32(size << 22), -+ [1] = cpu_to_le32(version.lo), -+ [2] = cpu_to_le32(version.lo >> 32), -+ [3] = cpu_to_le32(version.hi| -+ (compression_type << 24))^BCH_NONCE_EXTENT, -+ }}; -+ -+ return nonce_add(nonce, crc.nonce << 9); -+} -+ -+static inline bool bch2_key_is_encrypted(struct bch_encrypted_key *key) -+{ -+ return le64_to_cpu(key->magic) != BCH_KEY_MAGIC; -+} -+ -+static inline struct nonce __bch2_sb_key_nonce(struct bch_sb *sb) -+{ -+ __le64 magic = __bch2_sb_magic(sb); -+ -+ return (struct nonce) {{ -+ [0] = 0, -+ [1] = 0, -+ [2] = ((__le32 *) &magic)[0], -+ [3] = ((__le32 *) &magic)[1], -+ }}; -+} -+ -+static inline struct nonce bch2_sb_key_nonce(struct bch_fs *c) -+{ -+ __le64 magic = bch2_sb_magic(c); -+ -+ return (struct nonce) {{ -+ [0] = 0, -+ [1] = 0, -+ [2] = ((__le32 *) &magic)[0], -+ [3] = ((__le32 *) &magic)[1], -+ }}; -+} -+ -+#endif /* _BCACHEFS_CHECKSUM_H */ -diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c -new file mode 100644 -index 000000000000..1d1590de55e8 ---- /dev/null -+++ b/fs/bcachefs/clock.c -@@ -0,0 +1,191 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "clock.h" -+ -+#include -+#include -+#include -+ -+static inline long io_timer_cmp(io_timer_heap *h, -+ struct io_timer *l, -+ struct io_timer *r) -+{ -+ return l->expire - r->expire; -+} -+ -+void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) -+{ -+ size_t i; -+ -+ spin_lock(&clock->timer_lock); -+ -+ if (time_after_eq((unsigned long) atomic_long_read(&clock->now), -+ timer->expire)) { -+ spin_unlock(&clock->timer_lock); -+ timer->fn(timer); -+ return; -+ } -+ -+ for (i = 0; i < clock->timers.used; i++) -+ if (clock->timers.data[i] == timer) -+ goto out; -+ -+ BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp, NULL)); -+out: -+ spin_unlock(&clock->timer_lock); -+} -+ -+void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer) -+{ 
-+ size_t i; -+ -+ spin_lock(&clock->timer_lock); -+ -+ for (i = 0; i < clock->timers.used; i++) -+ if (clock->timers.data[i] == timer) { -+ heap_del(&clock->timers, i, io_timer_cmp, NULL); -+ break; -+ } -+ -+ spin_unlock(&clock->timer_lock); -+} -+ -+struct io_clock_wait { -+ struct io_timer io_timer; -+ struct timer_list cpu_timer; -+ struct task_struct *task; -+ int expired; -+}; -+ -+static void io_clock_wait_fn(struct io_timer *timer) -+{ -+ struct io_clock_wait *wait = container_of(timer, -+ struct io_clock_wait, io_timer); -+ -+ wait->expired = 1; -+ wake_up_process(wait->task); -+} -+ -+static void io_clock_cpu_timeout(struct timer_list *timer) -+{ -+ struct io_clock_wait *wait = container_of(timer, -+ struct io_clock_wait, cpu_timer); -+ -+ wait->expired = 1; -+ wake_up_process(wait->task); -+} -+ -+void bch2_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until) -+{ -+ struct io_clock_wait wait; -+ -+ /* XXX: calculate sleep time rigorously */ -+ wait.io_timer.expire = until; -+ wait.io_timer.fn = io_clock_wait_fn; -+ wait.task = current; -+ wait.expired = 0; -+ bch2_io_timer_add(clock, &wait.io_timer); -+ -+ schedule(); -+ -+ bch2_io_timer_del(clock, &wait.io_timer); -+} -+ -+void bch2_kthread_io_clock_wait(struct io_clock *clock, -+ unsigned long io_until, -+ unsigned long cpu_timeout) -+{ -+ bool kthread = (current->flags & PF_KTHREAD) != 0; -+ struct io_clock_wait wait; -+ -+ wait.io_timer.expire = io_until; -+ wait.io_timer.fn = io_clock_wait_fn; -+ wait.task = current; -+ wait.expired = 0; -+ bch2_io_timer_add(clock, &wait.io_timer); -+ -+ timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0); -+ -+ if (cpu_timeout != MAX_SCHEDULE_TIMEOUT) -+ mod_timer(&wait.cpu_timer, cpu_timeout + jiffies); -+ -+ while (1) { -+ set_current_state(TASK_INTERRUPTIBLE); -+ if (kthread && kthread_should_stop()) -+ break; -+ -+ if (wait.expired) -+ break; -+ -+ schedule(); -+ try_to_freeze(); -+ } -+ -+ __set_current_state(TASK_RUNNING); -+ 
del_singleshot_timer_sync(&wait.cpu_timer); -+ destroy_timer_on_stack(&wait.cpu_timer); -+ bch2_io_timer_del(clock, &wait.io_timer); -+} -+ -+static struct io_timer *get_expired_timer(struct io_clock *clock, -+ unsigned long now) -+{ -+ struct io_timer *ret = NULL; -+ -+ spin_lock(&clock->timer_lock); -+ -+ if (clock->timers.used && -+ time_after_eq(now, clock->timers.data[0]->expire)) -+ heap_pop(&clock->timers, ret, io_timer_cmp, NULL); -+ -+ spin_unlock(&clock->timer_lock); -+ -+ return ret; -+} -+ -+void __bch2_increment_clock(struct io_clock *clock, unsigned sectors) -+{ -+ struct io_timer *timer; -+ unsigned long now = atomic_long_add_return(sectors, &clock->now); -+ -+ while ((timer = get_expired_timer(clock, now))) -+ timer->fn(timer); -+} -+ -+void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) -+{ -+ unsigned long now; -+ unsigned i; -+ -+ spin_lock(&clock->timer_lock); -+ now = atomic_long_read(&clock->now); -+ -+ for (i = 0; i < clock->timers.used; i++) -+ pr_buf(out, "%ps:\t%li\n", -+ clock->timers.data[i]->fn, -+ clock->timers.data[i]->expire - now); -+ spin_unlock(&clock->timer_lock); -+} -+ -+void bch2_io_clock_exit(struct io_clock *clock) -+{ -+ free_heap(&clock->timers); -+ free_percpu(clock->pcpu_buf); -+} -+ -+int bch2_io_clock_init(struct io_clock *clock) -+{ -+ atomic_long_set(&clock->now, 0); -+ spin_lock_init(&clock->timer_lock); -+ -+ clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus(); -+ -+ clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf); -+ if (!clock->pcpu_buf) -+ return -ENOMEM; -+ -+ if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL)) -+ return -ENOMEM; -+ -+ return 0; -+} -diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h -new file mode 100644 -index 000000000000..70a0f7436c84 ---- /dev/null -+++ b/fs/bcachefs/clock.h -@@ -0,0 +1,38 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_CLOCK_H -+#define _BCACHEFS_CLOCK_H -+ -+void bch2_io_timer_add(struct io_clock *, struct 
io_timer *); -+void bch2_io_timer_del(struct io_clock *, struct io_timer *); -+void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long, -+ unsigned long); -+ -+void __bch2_increment_clock(struct io_clock *, unsigned); -+ -+static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors, -+ int rw) -+{ -+ struct io_clock *clock = &c->io_clock[rw]; -+ -+ if (unlikely(this_cpu_add_return(*clock->pcpu_buf, sectors) >= -+ IO_CLOCK_PCPU_SECTORS)) -+ __bch2_increment_clock(clock, this_cpu_xchg(*clock->pcpu_buf, 0)); -+} -+ -+void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long); -+ -+#define bch2_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\ -+({ \ -+ long __ret = timeout; \ -+ might_sleep(); \ -+ if (!___wait_cond_timeout(condition)) \ -+ __ret = __wait_event_timeout(wq, condition, timeout); \ -+ __ret; \ -+}) -+ -+void bch2_io_timers_to_text(struct printbuf *, struct io_clock *); -+ -+void bch2_io_clock_exit(struct io_clock *); -+int bch2_io_clock_init(struct io_clock *); -+ -+#endif /* _BCACHEFS_CLOCK_H */ -diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h -new file mode 100644 -index 000000000000..92c740a47565 ---- /dev/null -+++ b/fs/bcachefs/clock_types.h -@@ -0,0 +1,37 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_CLOCK_TYPES_H -+#define _BCACHEFS_CLOCK_TYPES_H -+ -+#include "util.h" -+ -+#define NR_IO_TIMERS (BCH_SB_MEMBERS_MAX * 3) -+ -+/* -+ * Clocks/timers in units of sectors of IO: -+ * -+ * Note - they use percpu batching, so they're only approximate. 
-+ */ -+ -+struct io_timer; -+typedef void (*io_timer_fn)(struct io_timer *); -+ -+struct io_timer { -+ io_timer_fn fn; -+ unsigned long expire; -+}; -+ -+/* Amount to buffer up on a percpu counter */ -+#define IO_CLOCK_PCPU_SECTORS 128 -+ -+typedef HEAP(struct io_timer *) io_timer_heap; -+ -+struct io_clock { -+ atomic_long_t now; -+ u16 __percpu *pcpu_buf; -+ unsigned max_slop; -+ -+ spinlock_t timer_lock; -+ io_timer_heap timers; -+}; -+ -+#endif /* _BCACHEFS_CLOCK_TYPES_H */ -diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c -new file mode 100644 -index 000000000000..b50d2b0d5fd3 ---- /dev/null -+++ b/fs/bcachefs/compress.c -@@ -0,0 +1,629 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "checksum.h" -+#include "compress.h" -+#include "extents.h" -+#include "io.h" -+#include "super-io.h" -+ -+#include -+#include -+#include -+ -+/* Bounce buffer: */ -+struct bbuf { -+ void *b; -+ enum { -+ BB_NONE, -+ BB_VMAP, -+ BB_KMALLOC, -+ BB_MEMPOOL, -+ } type; -+ int rw; -+}; -+ -+static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw) -+{ -+ void *b; -+ -+ BUG_ON(size > c->sb.encoded_extent_max << 9); -+ -+ b = kmalloc(size, GFP_NOIO|__GFP_NOWARN); -+ if (b) -+ return (struct bbuf) { .b = b, .type = BB_KMALLOC, .rw = rw }; -+ -+ b = mempool_alloc(&c->compression_bounce[rw], GFP_NOIO); -+ if (b) -+ return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw }; -+ -+ BUG(); -+} -+ -+static bool bio_phys_contig(struct bio *bio, struct bvec_iter start) -+{ -+ struct bio_vec bv; -+ struct bvec_iter iter; -+ void *expected_start = NULL; -+ -+ __bio_for_each_bvec(bv, bio, iter, start) { -+ if (expected_start && -+ expected_start != page_address(bv.bv_page) + bv.bv_offset) -+ return false; -+ -+ expected_start = page_address(bv.bv_page) + -+ bv.bv_offset + bv.bv_len; -+ } -+ -+ return true; -+} -+ -+static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, -+ struct bvec_iter start, int rw) -+{ -+ struct 
bbuf ret; -+ struct bio_vec bv; -+ struct bvec_iter iter; -+ unsigned nr_pages = 0; -+ struct page *stack_pages[16]; -+ struct page **pages = NULL; -+ void *data; -+ -+ BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max); -+ -+ if (!IS_ENABLED(CONFIG_HIGHMEM) && -+ bio_phys_contig(bio, start)) -+ return (struct bbuf) { -+ .b = page_address(bio_iter_page(bio, start)) + -+ bio_iter_offset(bio, start), -+ .type = BB_NONE, .rw = rw -+ }; -+ -+ /* check if we can map the pages contiguously: */ -+ __bio_for_each_segment(bv, bio, iter, start) { -+ if (iter.bi_size != start.bi_size && -+ bv.bv_offset) -+ goto bounce; -+ -+ if (bv.bv_len < iter.bi_size && -+ bv.bv_offset + bv.bv_len < PAGE_SIZE) -+ goto bounce; -+ -+ nr_pages++; -+ } -+ -+ BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr_pages); -+ -+ pages = nr_pages > ARRAY_SIZE(stack_pages) -+ ? kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOIO) -+ : stack_pages; -+ if (!pages) -+ goto bounce; -+ -+ nr_pages = 0; -+ __bio_for_each_segment(bv, bio, iter, start) -+ pages[nr_pages++] = bv.bv_page; -+ -+ data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); -+ if (pages != stack_pages) -+ kfree(pages); -+ -+ if (data) -+ return (struct bbuf) { -+ .b = data + bio_iter_offset(bio, start), -+ .type = BB_VMAP, .rw = rw -+ }; -+bounce: -+ ret = __bounce_alloc(c, start.bi_size, rw); -+ -+ if (rw == READ) -+ memcpy_from_bio(ret.b, bio, start); -+ -+ return ret; -+} -+ -+static struct bbuf bio_map_or_bounce(struct bch_fs *c, struct bio *bio, int rw) -+{ -+ return __bio_map_or_bounce(c, bio, bio->bi_iter, rw); -+} -+ -+static void bio_unmap_or_unbounce(struct bch_fs *c, struct bbuf buf) -+{ -+ switch (buf.type) { -+ case BB_NONE: -+ break; -+ case BB_VMAP: -+ vunmap((void *) ((unsigned long) buf.b & PAGE_MASK)); -+ break; -+ case BB_KMALLOC: -+ kfree(buf.b); -+ break; -+ case BB_MEMPOOL: -+ mempool_free(buf.b, &c->compression_bounce[buf.rw]); -+ break; -+ } -+} -+ -+static inline void zlib_set_workspace(z_stream *strm, 
void *workspace) -+{ -+#ifdef __KERNEL__ -+ strm->workspace = workspace; -+#endif -+} -+ -+static int __bio_uncompress(struct bch_fs *c, struct bio *src, -+ void *dst_data, struct bch_extent_crc_unpacked crc) -+{ -+ struct bbuf src_data = { NULL }; -+ size_t src_len = src->bi_iter.bi_size; -+ size_t dst_len = crc.uncompressed_size << 9; -+ void *workspace; -+ int ret; -+ -+ src_data = bio_map_or_bounce(c, src, READ); -+ -+ switch (crc.compression_type) { -+ case BCH_COMPRESSION_TYPE_lz4_old: -+ case BCH_COMPRESSION_TYPE_lz4: -+ ret = LZ4_decompress_safe_partial(src_data.b, dst_data, -+ src_len, dst_len, dst_len); -+ if (ret != dst_len) -+ goto err; -+ break; -+ case BCH_COMPRESSION_TYPE_gzip: { -+ z_stream strm = { -+ .next_in = src_data.b, -+ .avail_in = src_len, -+ .next_out = dst_data, -+ .avail_out = dst_len, -+ }; -+ -+ workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO); -+ -+ zlib_set_workspace(&strm, workspace); -+ zlib_inflateInit2(&strm, -MAX_WBITS); -+ ret = zlib_inflate(&strm, Z_FINISH); -+ -+ mempool_free(workspace, &c->decompress_workspace); -+ -+ if (ret != Z_STREAM_END) -+ goto err; -+ break; -+ } -+ case BCH_COMPRESSION_TYPE_zstd: { -+ ZSTD_DCtx *ctx; -+ size_t real_src_len = le32_to_cpup(src_data.b); -+ -+ if (real_src_len > src_len - 4) -+ goto err; -+ -+ workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO); -+ ctx = ZSTD_initDCtx(workspace, ZSTD_DCtxWorkspaceBound()); -+ -+ ret = ZSTD_decompressDCtx(ctx, -+ dst_data, dst_len, -+ src_data.b + 4, real_src_len); -+ -+ mempool_free(workspace, &c->decompress_workspace); -+ -+ if (ret != dst_len) -+ goto err; -+ break; -+ } -+ default: -+ BUG(); -+ } -+ ret = 0; -+out: -+ bio_unmap_or_unbounce(c, src_data); -+ return ret; -+err: -+ ret = -EIO; -+ goto out; -+} -+ -+int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, -+ struct bch_extent_crc_unpacked *crc) -+{ -+ struct bbuf data = { NULL }; -+ size_t dst_len = crc->uncompressed_size << 9; -+ -+ /* bio must own its 
pages: */ -+ BUG_ON(!bio->bi_vcnt); -+ BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs); -+ -+ if (crc->uncompressed_size > c->sb.encoded_extent_max || -+ crc->compressed_size > c->sb.encoded_extent_max) { -+ bch_err(c, "error rewriting existing data: extent too big"); -+ return -EIO; -+ } -+ -+ data = __bounce_alloc(c, dst_len, WRITE); -+ -+ if (__bio_uncompress(c, bio, data.b, *crc)) { -+ bch_err(c, "error rewriting existing data: decompression error"); -+ bio_unmap_or_unbounce(c, data); -+ return -EIO; -+ } -+ -+ /* -+ * XXX: don't have a good way to assert that the bio was allocated with -+ * enough space, we depend on bch2_move_extent doing the right thing -+ */ -+ bio->bi_iter.bi_size = crc->live_size << 9; -+ -+ memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9)); -+ -+ crc->csum_type = 0; -+ crc->compression_type = 0; -+ crc->compressed_size = crc->live_size; -+ crc->uncompressed_size = crc->live_size; -+ crc->offset = 0; -+ crc->csum = (struct bch_csum) { 0, 0 }; -+ -+ bio_unmap_or_unbounce(c, data); -+ return 0; -+} -+ -+int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, -+ struct bio *dst, struct bvec_iter dst_iter, -+ struct bch_extent_crc_unpacked crc) -+{ -+ struct bbuf dst_data = { NULL }; -+ size_t dst_len = crc.uncompressed_size << 9; -+ int ret = -ENOMEM; -+ -+ if (crc.uncompressed_size > c->sb.encoded_extent_max || -+ crc.compressed_size > c->sb.encoded_extent_max) -+ return -EIO; -+ -+ dst_data = dst_len == dst_iter.bi_size -+ ? 
__bio_map_or_bounce(c, dst, dst_iter, WRITE) -+ : __bounce_alloc(c, dst_len, WRITE); -+ -+ ret = __bio_uncompress(c, src, dst_data.b, crc); -+ if (ret) -+ goto err; -+ -+ if (dst_data.type != BB_NONE && -+ dst_data.type != BB_VMAP) -+ memcpy_to_bio(dst, dst_iter, dst_data.b + (crc.offset << 9)); -+err: -+ bio_unmap_or_unbounce(c, dst_data); -+ return ret; -+} -+ -+static int attempt_compress(struct bch_fs *c, -+ void *workspace, -+ void *dst, size_t dst_len, -+ void *src, size_t src_len, -+ enum bch_compression_type compression_type) -+{ -+ switch (compression_type) { -+ case BCH_COMPRESSION_TYPE_lz4: { -+ int len = src_len; -+ int ret = LZ4_compress_destSize( -+ src, dst, -+ &len, dst_len, -+ workspace); -+ -+ if (len < src_len) -+ return -len; -+ -+ return ret; -+ } -+ case BCH_COMPRESSION_TYPE_gzip: { -+ z_stream strm = { -+ .next_in = src, -+ .avail_in = src_len, -+ .next_out = dst, -+ .avail_out = dst_len, -+ }; -+ -+ zlib_set_workspace(&strm, workspace); -+ zlib_deflateInit2(&strm, Z_DEFAULT_COMPRESSION, -+ Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL, -+ Z_DEFAULT_STRATEGY); -+ -+ if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END) -+ return 0; -+ -+ if (zlib_deflateEnd(&strm) != Z_OK) -+ return 0; -+ -+ return strm.total_out; -+ } -+ case BCH_COMPRESSION_TYPE_zstd: { -+ ZSTD_CCtx *ctx = ZSTD_initCCtx(workspace, -+ ZSTD_CCtxWorkspaceBound(c->zstd_params.cParams)); -+ -+ size_t len = ZSTD_compressCCtx(ctx, -+ dst + 4, dst_len - 4, -+ src, src_len, -+ c->zstd_params); -+ if (ZSTD_isError(len)) -+ return 0; -+ -+ *((__le32 *) dst) = cpu_to_le32(len); -+ return len + 4; -+ } -+ default: -+ BUG(); -+ } -+} -+ -+static unsigned __bio_compress(struct bch_fs *c, -+ struct bio *dst, size_t *dst_len, -+ struct bio *src, size_t *src_len, -+ enum bch_compression_type compression_type) -+{ -+ struct bbuf src_data = { NULL }, dst_data = { NULL }; -+ void *workspace; -+ unsigned pad; -+ int ret = 0; -+ -+ BUG_ON(compression_type >= BCH_COMPRESSION_TYPE_NR); -+ 
BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type])); -+ -+ /* If it's only one block, don't bother trying to compress: */ -+ if (bio_sectors(src) <= c->opts.block_size) -+ return 0; -+ -+ dst_data = bio_map_or_bounce(c, dst, WRITE); -+ src_data = bio_map_or_bounce(c, src, READ); -+ -+ workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOIO); -+ -+ *src_len = src->bi_iter.bi_size; -+ *dst_len = dst->bi_iter.bi_size; -+ -+ /* -+ * XXX: this algorithm sucks when the compression code doesn't tell us -+ * how much would fit, like LZ4 does: -+ */ -+ while (1) { -+ if (*src_len <= block_bytes(c)) { -+ ret = -1; -+ break; -+ } -+ -+ ret = attempt_compress(c, workspace, -+ dst_data.b, *dst_len, -+ src_data.b, *src_len, -+ compression_type); -+ if (ret > 0) { -+ *dst_len = ret; -+ ret = 0; -+ break; -+ } -+ -+ /* Didn't fit: should we retry with a smaller amount? */ -+ if (*src_len <= *dst_len) { -+ ret = -1; -+ break; -+ } -+ -+ /* -+ * If ret is negative, it's a hint as to how much data would fit -+ */ -+ BUG_ON(-ret >= *src_len); -+ -+ if (ret < 0) -+ *src_len = -ret; -+ else -+ *src_len -= (*src_len - *dst_len) / 2; -+ *src_len = round_down(*src_len, block_bytes(c)); -+ } -+ -+ mempool_free(workspace, &c->compress_workspace[compression_type]); -+ -+ if (ret) -+ goto err; -+ -+ /* Didn't get smaller: */ -+ if (round_up(*dst_len, block_bytes(c)) >= *src_len) -+ goto err; -+ -+ pad = round_up(*dst_len, block_bytes(c)) - *dst_len; -+ -+ memset(dst_data.b + *dst_len, 0, pad); -+ *dst_len += pad; -+ -+ if (dst_data.type != BB_NONE && -+ dst_data.type != BB_VMAP) -+ memcpy_to_bio(dst, dst->bi_iter, dst_data.b); -+ -+ BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size); -+ BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size); -+ BUG_ON(*dst_len & (block_bytes(c) - 1)); -+ BUG_ON(*src_len & (block_bytes(c) - 1)); -+out: -+ bio_unmap_or_unbounce(c, src_data); -+ bio_unmap_or_unbounce(c, dst_data); -+ return compression_type; -+err: -+ 
compression_type = BCH_COMPRESSION_TYPE_incompressible; -+ goto out; -+} -+ -+unsigned bch2_bio_compress(struct bch_fs *c, -+ struct bio *dst, size_t *dst_len, -+ struct bio *src, size_t *src_len, -+ unsigned compression_type) -+{ -+ unsigned orig_dst = dst->bi_iter.bi_size; -+ unsigned orig_src = src->bi_iter.bi_size; -+ -+ /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */ -+ src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size, -+ c->sb.encoded_extent_max << 9); -+ /* Don't generate a bigger output than input: */ -+ dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); -+ -+ if (compression_type == BCH_COMPRESSION_TYPE_lz4_old) -+ compression_type = BCH_COMPRESSION_TYPE_lz4; -+ -+ compression_type = -+ __bio_compress(c, dst, dst_len, src, src_len, compression_type); -+ -+ dst->bi_iter.bi_size = orig_dst; -+ src->bi_iter.bi_size = orig_src; -+ return compression_type; -+} -+ -+static int __bch2_fs_compress_init(struct bch_fs *, u64); -+ -+#define BCH_FEATURE_none 0 -+ -+static const unsigned bch2_compression_opt_to_feature[] = { -+#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t, -+ BCH_COMPRESSION_OPTS() -+#undef x -+}; -+ -+#undef BCH_FEATURE_none -+ -+static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f) -+{ -+ int ret = 0; -+ -+ if ((c->sb.features & f) == f) -+ return 0; -+ -+ mutex_lock(&c->sb_lock); -+ -+ if ((c->sb.features & f) == f) { -+ mutex_unlock(&c->sb_lock); -+ return 0; -+ } -+ -+ ret = __bch2_fs_compress_init(c, c->sb.features|f); -+ if (ret) { -+ mutex_unlock(&c->sb_lock); -+ return ret; -+ } -+ -+ c->disk_sb.sb->features[0] |= cpu_to_le64(f); -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ return 0; -+} -+ -+int bch2_check_set_has_compressed_data(struct bch_fs *c, -+ unsigned compression_type) -+{ -+ BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature)); -+ -+ return compression_type -+ ? 
__bch2_check_set_has_compressed_data(c, -+ 1ULL << bch2_compression_opt_to_feature[compression_type]) -+ : 0; -+} -+ -+void bch2_fs_compress_exit(struct bch_fs *c) -+{ -+ unsigned i; -+ -+ mempool_exit(&c->decompress_workspace); -+ for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++) -+ mempool_exit(&c->compress_workspace[i]); -+ mempool_exit(&c->compression_bounce[WRITE]); -+ mempool_exit(&c->compression_bounce[READ]); -+} -+ -+static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) -+{ -+ size_t max_extent = c->sb.encoded_extent_max << 9; -+ size_t decompress_workspace_size = 0; -+ bool decompress_workspace_needed; -+ ZSTD_parameters params = ZSTD_getParams(0, max_extent, 0); -+ struct { -+ unsigned feature; -+ unsigned type; -+ size_t compress_workspace; -+ size_t decompress_workspace; -+ } compression_types[] = { -+ { BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4, LZ4_MEM_COMPRESS, 0 }, -+ { BCH_FEATURE_gzip, BCH_COMPRESSION_TYPE_gzip, -+ zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), -+ zlib_inflate_workspacesize(), }, -+ { BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd, -+ ZSTD_CCtxWorkspaceBound(params.cParams), -+ ZSTD_DCtxWorkspaceBound() }, -+ }, *i; -+ int ret = 0; -+ -+ pr_verbose_init(c->opts, ""); -+ -+ c->zstd_params = params; -+ -+ for (i = compression_types; -+ i < compression_types + ARRAY_SIZE(compression_types); -+ i++) -+ if (features & (1 << i->feature)) -+ goto have_compressed; -+ -+ goto out; -+have_compressed: -+ -+ if (!mempool_initialized(&c->compression_bounce[READ])) { -+ ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[READ], -+ 1, max_extent); -+ if (ret) -+ goto out; -+ } -+ -+ if (!mempool_initialized(&c->compression_bounce[WRITE])) { -+ ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE], -+ 1, max_extent); -+ if (ret) -+ goto out; -+ } -+ -+ for (i = compression_types; -+ i < compression_types + ARRAY_SIZE(compression_types); -+ i++) { -+ decompress_workspace_size = -+ 
max(decompress_workspace_size, i->decompress_workspace); -+ -+ if (!(features & (1 << i->feature))) -+ continue; -+ -+ if (i->decompress_workspace) -+ decompress_workspace_needed = true; -+ -+ if (mempool_initialized(&c->compress_workspace[i->type])) -+ continue; -+ -+ ret = mempool_init_kvpmalloc_pool( -+ &c->compress_workspace[i->type], -+ 1, i->compress_workspace); -+ if (ret) -+ goto out; -+ } -+ -+ if (!mempool_initialized(&c->decompress_workspace)) { -+ ret = mempool_init_kvpmalloc_pool( -+ &c->decompress_workspace, -+ 1, decompress_workspace_size); -+ if (ret) -+ goto out; -+ } -+out: -+ pr_verbose_init(c->opts, "ret %i", ret); -+ return ret; -+} -+ -+int bch2_fs_compress_init(struct bch_fs *c) -+{ -+ u64 f = c->sb.features; -+ -+ if (c->opts.compression) -+ f |= 1ULL << bch2_compression_opt_to_feature[c->opts.compression]; -+ -+ if (c->opts.background_compression) -+ f |= 1ULL << bch2_compression_opt_to_feature[c->opts.background_compression]; -+ -+ return __bch2_fs_compress_init(c, f); -+ -+} -diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h -new file mode 100644 -index 000000000000..4bab1f61b3b5 ---- /dev/null -+++ b/fs/bcachefs/compress.h -@@ -0,0 +1,18 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_COMPRESS_H -+#define _BCACHEFS_COMPRESS_H -+ -+#include "extents_types.h" -+ -+int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *, -+ struct bch_extent_crc_unpacked *); -+int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, -+ struct bvec_iter, struct bch_extent_crc_unpacked); -+unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *, -+ struct bio *, size_t *, unsigned); -+ -+int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned); -+void bch2_fs_compress_exit(struct bch_fs *); -+int bch2_fs_compress_init(struct bch_fs *); -+ -+#endif /* _BCACHEFS_COMPRESS_H */ -diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c -new file mode 100644 -index 000000000000..aa10591a3b1a ---- 
/dev/null -+++ b/fs/bcachefs/debug.c -@@ -0,0 +1,432 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Assorted bcachefs debug code -+ * -+ * Copyright 2010, 2011 Kent Overstreet -+ * Copyright 2012 Google, Inc. -+ */ -+ -+#include "bcachefs.h" -+#include "bkey_methods.h" -+#include "btree_cache.h" -+#include "btree_io.h" -+#include "btree_iter.h" -+#include "btree_update.h" -+#include "buckets.h" -+#include "debug.h" -+#include "error.h" -+#include "extents.h" -+#include "fsck.h" -+#include "inode.h" -+#include "io.h" -+#include "super.h" -+ -+#include -+#include -+#include -+#include -+#include -+ -+static struct dentry *bch_debug; -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ -+void __bch2_btree_verify(struct bch_fs *c, struct btree *b) -+{ -+ struct btree *v = c->verify_data; -+ struct btree_node *n_ondisk, *n_sorted, *n_inmemory; -+ struct bset *sorted, *inmemory; -+ struct extent_ptr_decoded pick; -+ struct bch_dev *ca; -+ struct bio *bio; -+ -+ if (c->opts.nochanges) -+ return; -+ -+ btree_node_io_lock(b); -+ mutex_lock(&c->verify_lock); -+ -+ n_ondisk = c->verify_ondisk; -+ n_sorted = c->verify_data->data; -+ n_inmemory = b->data; -+ -+ bkey_copy(&v->key, &b->key); -+ v->written = 0; -+ v->c.level = b->c.level; -+ v->c.btree_id = b->c.btree_id; -+ bch2_btree_keys_init(v, &c->expensive_debug_checks); -+ -+ if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), -+ NULL, &pick) <= 0) -+ return; -+ -+ ca = bch_dev_bkey_exists(c, pick.ptr.dev); -+ if (!bch2_dev_get_ioref(ca, READ)) -+ return; -+ -+ bio = bio_alloc_bioset(GFP_NOIO, -+ buf_pages(n_sorted, btree_bytes(c)), -+ &c->btree_bio); -+ bio_set_dev(bio, ca->disk_sb.bdev); -+ bio->bi_opf = REQ_OP_READ|REQ_META; -+ bio->bi_iter.bi_sector = pick.ptr.offset; -+ bch2_bio_map(bio, n_sorted, btree_bytes(c)); -+ -+ submit_bio_wait(bio); -+ -+ bio_put(bio); -+ percpu_ref_put(&ca->io_ref); -+ -+ memcpy(n_ondisk, n_sorted, btree_bytes(c)); -+ -+ if (bch2_btree_node_read_done(c, v, false)) -+ goto out; -+ -+ n_sorted = 
c->verify_data->data; -+ sorted = &n_sorted->keys; -+ inmemory = &n_inmemory->keys; -+ -+ if (inmemory->u64s != sorted->u64s || -+ memcmp(inmemory->start, -+ sorted->start, -+ vstruct_end(inmemory) - (void *) inmemory->start)) { -+ unsigned offset = 0, sectors; -+ struct bset *i; -+ unsigned j; -+ -+ console_lock(); -+ -+ printk(KERN_ERR "*** in memory:\n"); -+ bch2_dump_bset(c, b, inmemory, 0); -+ -+ printk(KERN_ERR "*** read back in:\n"); -+ bch2_dump_bset(c, v, sorted, 0); -+ -+ while (offset < b->written) { -+ if (!offset ) { -+ i = &n_ondisk->keys; -+ sectors = vstruct_blocks(n_ondisk, c->block_bits) << -+ c->block_bits; -+ } else { -+ struct btree_node_entry *bne = -+ (void *) n_ondisk + (offset << 9); -+ i = &bne->keys; -+ -+ sectors = vstruct_blocks(bne, c->block_bits) << -+ c->block_bits; -+ } -+ -+ printk(KERN_ERR "*** on disk block %u:\n", offset); -+ bch2_dump_bset(c, b, i, offset); -+ -+ offset += sectors; -+ } -+ -+ printk(KERN_ERR "*** block %u/%u not written\n", -+ offset >> c->block_bits, btree_blocks(c)); -+ -+ for (j = 0; j < le16_to_cpu(inmemory->u64s); j++) -+ if (inmemory->_data[j] != sorted->_data[j]) -+ break; -+ -+ printk(KERN_ERR "b->written %u\n", b->written); -+ -+ console_unlock(); -+ panic("verify failed at %u\n", j); -+ } -+out: -+ mutex_unlock(&c->verify_lock); -+ btree_node_io_unlock(b); -+} -+ -+#endif -+ -+#ifdef CONFIG_DEBUG_FS -+ -+/* XXX: bch_fs refcounting */ -+ -+struct dump_iter { -+ struct bpos from; -+ struct bch_fs *c; -+ enum btree_id id; -+ -+ char buf[PAGE_SIZE]; -+ size_t bytes; /* what's currently in buf */ -+ -+ char __user *ubuf; /* destination user buffer */ -+ size_t size; /* size of requested read */ -+ ssize_t ret; /* bytes read so far */ -+}; -+ -+static int flush_buf(struct dump_iter *i) -+{ -+ if (i->bytes) { -+ size_t bytes = min(i->bytes, i->size); -+ int err = copy_to_user(i->ubuf, i->buf, bytes); -+ -+ if (err) -+ return err; -+ -+ i->ret += bytes; -+ i->ubuf += bytes; -+ i->size -= bytes; -+ i->bytes -= 
bytes; -+ memmove(i->buf, i->buf + bytes, i->bytes); -+ } -+ -+ return 0; -+} -+ -+static int bch2_dump_open(struct inode *inode, struct file *file) -+{ -+ struct btree_debug *bd = inode->i_private; -+ struct dump_iter *i; -+ -+ i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); -+ if (!i) -+ return -ENOMEM; -+ -+ file->private_data = i; -+ i->from = POS_MIN; -+ i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]); -+ i->id = bd->id; -+ -+ return 0; -+} -+ -+static int bch2_dump_release(struct inode *inode, struct file *file) -+{ -+ kfree(file->private_data); -+ return 0; -+} -+ -+static ssize_t bch2_read_btree(struct file *file, char __user *buf, -+ size_t size, loff_t *ppos) -+{ -+ struct dump_iter *i = file->private_data; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int err; -+ -+ i->ubuf = buf; -+ i->size = size; -+ i->ret = 0; -+ -+ err = flush_buf(i); -+ if (err) -+ return err; -+ -+ if (!i->size) -+ return i->ret; -+ -+ bch2_trans_init(&trans, i->c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH); -+ k = bch2_btree_iter_peek(iter); -+ -+ while (k.k && !(err = bkey_err(k))) { -+ bch2_bkey_val_to_text(&PBUF(i->buf), i->c, k); -+ i->bytes = strlen(i->buf); -+ BUG_ON(i->bytes >= PAGE_SIZE); -+ i->buf[i->bytes] = '\n'; -+ i->bytes++; -+ -+ k = bch2_btree_iter_next(iter); -+ i->from = iter->pos; -+ -+ err = flush_buf(i); -+ if (err) -+ break; -+ -+ if (!i->size) -+ break; -+ } -+ bch2_trans_exit(&trans); -+ -+ return err < 0 ? 
err : i->ret; -+} -+ -+static const struct file_operations btree_debug_ops = { -+ .owner = THIS_MODULE, -+ .open = bch2_dump_open, -+ .release = bch2_dump_release, -+ .read = bch2_read_btree, -+}; -+ -+static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, -+ size_t size, loff_t *ppos) -+{ -+ struct dump_iter *i = file->private_data; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct btree *b; -+ int err; -+ -+ i->ubuf = buf; -+ i->size = size; -+ i->ret = 0; -+ -+ err = flush_buf(i); -+ if (err) -+ return err; -+ -+ if (!i->size || !bkey_cmp(POS_MAX, i->from)) -+ return i->ret; -+ -+ bch2_trans_init(&trans, i->c, 0, 0); -+ -+ for_each_btree_node(&trans, iter, i->id, i->from, 0, b) { -+ bch2_btree_node_to_text(&PBUF(i->buf), i->c, b); -+ i->bytes = strlen(i->buf); -+ err = flush_buf(i); -+ if (err) -+ break; -+ -+ /* -+ * can't easily correctly restart a btree node traversal across -+ * all nodes, meh -+ */ -+ i->from = bkey_cmp(POS_MAX, b->key.k.p) -+ ? bkey_successor(b->key.k.p) -+ : b->key.k.p; -+ -+ if (!i->size) -+ break; -+ } -+ bch2_trans_exit(&trans); -+ -+ return err < 0 ? 
err : i->ret; -+} -+ -+static const struct file_operations btree_format_debug_ops = { -+ .owner = THIS_MODULE, -+ .open = bch2_dump_open, -+ .release = bch2_dump_release, -+ .read = bch2_read_btree_formats, -+}; -+ -+static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, -+ size_t size, loff_t *ppos) -+{ -+ struct dump_iter *i = file->private_data; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct btree *prev_node = NULL; -+ int err; -+ -+ i->ubuf = buf; -+ i->size = size; -+ i->ret = 0; -+ -+ err = flush_buf(i); -+ if (err) -+ return err; -+ -+ if (!i->size) -+ return i->ret; -+ -+ bch2_trans_init(&trans, i->c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH); -+ -+ while ((k = bch2_btree_iter_peek(iter)).k && -+ !(err = bkey_err(k))) { -+ struct btree_iter_level *l = &iter->l[0]; -+ struct bkey_packed *_k = -+ bch2_btree_node_iter_peek(&l->iter, l->b); -+ -+ if (l->b != prev_node) { -+ bch2_btree_node_to_text(&PBUF(i->buf), i->c, l->b); -+ i->bytes = strlen(i->buf); -+ err = flush_buf(i); -+ if (err) -+ break; -+ } -+ prev_node = l->b; -+ -+ bch2_bfloat_to_text(&PBUF(i->buf), l->b, _k); -+ i->bytes = strlen(i->buf); -+ err = flush_buf(i); -+ if (err) -+ break; -+ -+ bch2_btree_iter_next(iter); -+ i->from = iter->pos; -+ -+ err = flush_buf(i); -+ if (err) -+ break; -+ -+ if (!i->size) -+ break; -+ } -+ bch2_trans_exit(&trans); -+ -+ return err < 0 ? 
err : i->ret; -+} -+ -+static const struct file_operations bfloat_failed_debug_ops = { -+ .owner = THIS_MODULE, -+ .open = bch2_dump_open, -+ .release = bch2_dump_release, -+ .read = bch2_read_bfloat_failed, -+}; -+ -+void bch2_fs_debug_exit(struct bch_fs *c) -+{ -+ if (!IS_ERR_OR_NULL(c->debug)) -+ debugfs_remove_recursive(c->debug); -+} -+ -+void bch2_fs_debug_init(struct bch_fs *c) -+{ -+ struct btree_debug *bd; -+ char name[100]; -+ -+ if (IS_ERR_OR_NULL(bch_debug)) -+ return; -+ -+ snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b); -+ c->debug = debugfs_create_dir(name, bch_debug); -+ if (IS_ERR_OR_NULL(c->debug)) -+ return; -+ -+ for (bd = c->btree_debug; -+ bd < c->btree_debug + ARRAY_SIZE(c->btree_debug); -+ bd++) { -+ bd->id = bd - c->btree_debug; -+ bd->btree = debugfs_create_file(bch2_btree_ids[bd->id], -+ 0400, c->debug, bd, -+ &btree_debug_ops); -+ -+ snprintf(name, sizeof(name), "%s-formats", -+ bch2_btree_ids[bd->id]); -+ -+ bd->btree_format = debugfs_create_file(name, 0400, c->debug, bd, -+ &btree_format_debug_ops); -+ -+ snprintf(name, sizeof(name), "%s-bfloat-failed", -+ bch2_btree_ids[bd->id]); -+ -+ bd->failed = debugfs_create_file(name, 0400, c->debug, bd, -+ &bfloat_failed_debug_ops); -+ } -+} -+ -+#endif -+ -+void bch2_debug_exit(void) -+{ -+ if (!IS_ERR_OR_NULL(bch_debug)) -+ debugfs_remove_recursive(bch_debug); -+} -+ -+int __init bch2_debug_init(void) -+{ -+ int ret = 0; -+ -+ bch_debug = debugfs_create_dir("bcachefs", NULL); -+ return ret; -+} -diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h -new file mode 100644 -index 000000000000..56c2d1ab5f63 ---- /dev/null -+++ b/fs/bcachefs/debug.h -@@ -0,0 +1,63 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_DEBUG_H -+#define _BCACHEFS_DEBUG_H -+ -+#include "bcachefs.h" -+ -+struct bio; -+struct btree; -+struct bch_fs; -+ -+#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name; -+BCH_DEBUG_PARAMS() -+#undef BCH_DEBUG_PARAM -+ -+#define BCH_DEBUG_PARAM(name, 
description) \ -+ static inline bool name(struct bch_fs *c) \ -+ { return bch2_##name || c->name; } -+BCH_DEBUG_PARAMS_ALWAYS() -+#undef BCH_DEBUG_PARAM -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ -+#define BCH_DEBUG_PARAM(name, description) \ -+ static inline bool name(struct bch_fs *c) \ -+ { return bch2_##name || c->name; } -+BCH_DEBUG_PARAMS_DEBUG() -+#undef BCH_DEBUG_PARAM -+ -+void __bch2_btree_verify(struct bch_fs *, struct btree *); -+ -+#define bypass_torture_test(d) ((d)->bypass_torture_test) -+ -+#else /* DEBUG */ -+ -+#define BCH_DEBUG_PARAM(name, description) \ -+ static inline bool name(struct bch_fs *c) { return false; } -+BCH_DEBUG_PARAMS_DEBUG() -+#undef BCH_DEBUG_PARAM -+ -+static inline void __bch2_btree_verify(struct bch_fs *c, struct btree *b) {} -+ -+#define bypass_torture_test(d) 0 -+ -+#endif -+ -+static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b) -+{ -+ if (verify_btree_ondisk(c)) -+ __bch2_btree_verify(c, b); -+} -+ -+#ifdef CONFIG_DEBUG_FS -+void bch2_fs_debug_exit(struct bch_fs *); -+void bch2_fs_debug_init(struct bch_fs *); -+#else -+static inline void bch2_fs_debug_exit(struct bch_fs *c) {} -+static inline void bch2_fs_debug_init(struct bch_fs *c) {} -+#endif -+ -+void bch2_debug_exit(void); -+int bch2_debug_init(void); -+ -+#endif /* _BCACHEFS_DEBUG_H */ -diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c -new file mode 100644 -index 000000000000..f34bfda8ab0d ---- /dev/null -+++ b/fs/bcachefs/dirent.c -@@ -0,0 +1,385 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "bkey_methods.h" -+#include "btree_update.h" -+#include "extents.h" -+#include "dirent.h" -+#include "fs.h" -+#include "keylist.h" -+#include "str_hash.h" -+ -+#include -+ -+unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) -+{ -+ unsigned len = bkey_val_bytes(d.k) - -+ offsetof(struct bch_dirent, d_name); -+ -+ return strnlen(d.v->d_name, len); -+} -+ -+static u64 bch2_dirent_hash(const struct bch_hash_info *info, 
-+ const struct qstr *name) -+{ -+ struct bch_str_hash_ctx ctx; -+ -+ bch2_str_hash_init(&ctx, info); -+ bch2_str_hash_update(&ctx, info, name->name, name->len); -+ -+ /* [0,2) reserved for dots */ -+ return max_t(u64, bch2_str_hash_end(&ctx, info), 2); -+} -+ -+static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key) -+{ -+ return bch2_dirent_hash(info, key); -+} -+ -+static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) -+{ -+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); -+ struct qstr name = QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d)); -+ -+ return bch2_dirent_hash(info, &name); -+} -+ -+static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r) -+{ -+ struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); -+ int len = bch2_dirent_name_bytes(l); -+ const struct qstr *r = _r; -+ -+ return len - r->len ?: memcmp(l.v->d_name, r->name, len); -+} -+ -+static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) -+{ -+ struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); -+ struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r); -+ int l_len = bch2_dirent_name_bytes(l); -+ int r_len = bch2_dirent_name_bytes(r); -+ -+ return l_len - r_len ?: memcmp(l.v->d_name, r.v->d_name, l_len); -+} -+ -+const struct bch_hash_desc bch2_dirent_hash_desc = { -+ .btree_id = BTREE_ID_DIRENTS, -+ .key_type = KEY_TYPE_dirent, -+ .hash_key = dirent_hash_key, -+ .hash_bkey = dirent_hash_bkey, -+ .cmp_key = dirent_cmp_key, -+ .cmp_bkey = dirent_cmp_bkey, -+}; -+ -+const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); -+ unsigned len; -+ -+ if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent)) -+ return "value too small"; -+ -+ len = bch2_dirent_name_bytes(d); -+ if (!len) -+ return "empty name"; -+ -+ /* -+ * older versions of bcachefs were buggy and creating dirent -+ * keys that were bigger than necessary: -+ */ -+ if (bkey_val_u64s(k.k) > 
dirent_val_u64s(len + 7)) -+ return "value too big"; -+ -+ if (len > BCH_NAME_MAX) -+ return "dirent name too big"; -+ -+ return NULL; -+} -+ -+void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); -+ -+ bch_scnmemcpy(out, d.v->d_name, -+ bch2_dirent_name_bytes(d)); -+ pr_buf(out, " -> %llu type %u", d.v->d_inum, d.v->d_type); -+} -+ -+static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, -+ u8 type, const struct qstr *name, u64 dst) -+{ -+ struct bkey_i_dirent *dirent; -+ unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len); -+ -+ if (name->len > BCH_NAME_MAX) -+ return ERR_PTR(-ENAMETOOLONG); -+ -+ BUG_ON(u64s > U8_MAX); -+ -+ dirent = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); -+ if (IS_ERR(dirent)) -+ return dirent; -+ -+ bkey_dirent_init(&dirent->k_i); -+ dirent->k.u64s = u64s; -+ dirent->v.d_inum = cpu_to_le64(dst); -+ dirent->v.d_type = type; -+ -+ memcpy(dirent->v.d_name, name->name, name->len); -+ memset(dirent->v.d_name + name->len, 0, -+ bkey_val_bytes(&dirent->k) - -+ offsetof(struct bch_dirent, d_name) - -+ name->len); -+ -+ EBUG_ON(bch2_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len); -+ -+ return dirent; -+} -+ -+int bch2_dirent_create(struct btree_trans *trans, -+ u64 dir_inum, const struct bch_hash_info *hash_info, -+ u8 type, const struct qstr *name, u64 dst_inum, -+ int flags) -+{ -+ struct bkey_i_dirent *dirent; -+ int ret; -+ -+ dirent = dirent_create_key(trans, type, name, dst_inum); -+ ret = PTR_ERR_OR_ZERO(dirent); -+ if (ret) -+ return ret; -+ -+ return bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, -+ dir_inum, &dirent->k_i, flags); -+} -+ -+static void dirent_copy_target(struct bkey_i_dirent *dst, -+ struct bkey_s_c_dirent src) -+{ -+ dst->v.d_inum = src.v->d_inum; -+ dst->v.d_type = src.v->d_type; -+} -+ -+int bch2_dirent_rename(struct btree_trans *trans, -+ u64 src_dir, struct bch_hash_info *src_hash, -+ 
u64 dst_dir, struct bch_hash_info *dst_hash, -+ const struct qstr *src_name, u64 *src_inum, -+ const struct qstr *dst_name, u64 *dst_inum, -+ enum bch_rename_mode mode) -+{ -+ struct btree_iter *src_iter = NULL, *dst_iter = NULL; -+ struct bkey_s_c old_src, old_dst; -+ struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; -+ struct bpos dst_pos = -+ POS(dst_dir, bch2_dirent_hash(dst_hash, dst_name)); -+ int ret = 0; -+ -+ *src_inum = *dst_inum = 0; -+ -+ /* -+ * Lookup dst: -+ * -+ * Note that in BCH_RENAME mode, we're _not_ checking if -+ * the target already exists - we're relying on the VFS -+ * to do that check for us for correctness: -+ */ -+ dst_iter = mode == BCH_RENAME -+ ? bch2_hash_hole(trans, bch2_dirent_hash_desc, -+ dst_hash, dst_dir, dst_name) -+ : bch2_hash_lookup(trans, bch2_dirent_hash_desc, -+ dst_hash, dst_dir, dst_name, -+ BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(dst_iter); -+ if (ret) -+ goto out; -+ -+ old_dst = bch2_btree_iter_peek_slot(dst_iter); -+ -+ if (mode != BCH_RENAME) -+ *dst_inum = le64_to_cpu(bkey_s_c_to_dirent(old_dst).v->d_inum); -+ -+ /* Lookup src: */ -+ src_iter = bch2_hash_lookup(trans, bch2_dirent_hash_desc, -+ src_hash, src_dir, src_name, -+ BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(src_iter); -+ if (ret) -+ goto out; -+ -+ old_src = bch2_btree_iter_peek_slot(src_iter); -+ *src_inum = le64_to_cpu(bkey_s_c_to_dirent(old_src).v->d_inum); -+ -+ /* Create new dst key: */ -+ new_dst = dirent_create_key(trans, 0, dst_name, 0); -+ ret = PTR_ERR_OR_ZERO(new_dst); -+ if (ret) -+ goto out; -+ -+ dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); -+ new_dst->k.p = dst_iter->pos; -+ -+ /* Create new src key: */ -+ if (mode == BCH_RENAME_EXCHANGE) { -+ new_src = dirent_create_key(trans, 0, src_name, 0); -+ ret = PTR_ERR_OR_ZERO(new_src); -+ if (ret) -+ goto out; -+ -+ dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst)); -+ new_src->k.p = src_iter->pos; -+ } else { -+ new_src = bch2_trans_kmalloc(trans, sizeof(struct 
bkey_i)); -+ ret = PTR_ERR_OR_ZERO(new_src); -+ if (ret) -+ goto out; -+ -+ bkey_init(&new_src->k); -+ new_src->k.p = src_iter->pos; -+ -+ if (bkey_cmp(dst_pos, src_iter->pos) <= 0 && -+ bkey_cmp(src_iter->pos, dst_iter->pos) < 0) { -+ /* -+ * We have a hash collision for the new dst key, -+ * and new_src - the key we're deleting - is between -+ * new_dst's hashed slot and the slot we're going to be -+ * inserting it into - oops. This will break the hash -+ * table if we don't deal with it: -+ */ -+ if (mode == BCH_RENAME) { -+ /* -+ * If we're not overwriting, we can just insert -+ * new_dst at the src position: -+ */ -+ new_dst->k.p = src_iter->pos; -+ bch2_trans_update(trans, src_iter, -+ &new_dst->k_i, 0); -+ goto out; -+ } else { -+ /* If we're overwriting, we can't insert new_dst -+ * at a different slot because it has to -+ * overwrite old_dst - just make sure to use a -+ * whiteout when deleting src: -+ */ -+ new_src->k.type = KEY_TYPE_whiteout; -+ } -+ } else { -+ /* Check if we need a whiteout to delete src: */ -+ ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc, -+ src_hash, src_iter); -+ if (ret < 0) -+ goto out; -+ -+ if (ret) -+ new_src->k.type = KEY_TYPE_whiteout; -+ } -+ } -+ -+ bch2_trans_update(trans, src_iter, &new_src->k_i, 0); -+ bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0); -+out: -+ bch2_trans_iter_put(trans, src_iter); -+ bch2_trans_iter_put(trans, dst_iter); -+ return ret; -+} -+ -+int bch2_dirent_delete_at(struct btree_trans *trans, -+ const struct bch_hash_info *hash_info, -+ struct btree_iter *iter) -+{ -+ return bch2_hash_delete_at(trans, bch2_dirent_hash_desc, -+ hash_info, iter); -+} -+ -+struct btree_iter * -+__bch2_dirent_lookup_trans(struct btree_trans *trans, u64 dir_inum, -+ const struct bch_hash_info *hash_info, -+ const struct qstr *name, unsigned flags) -+{ -+ return bch2_hash_lookup(trans, bch2_dirent_hash_desc, -+ hash_info, dir_inum, name, flags); -+} -+ -+u64 bch2_dirent_lookup(struct bch_fs *c, u64 
dir_inum, -+ const struct bch_hash_info *hash_info, -+ const struct qstr *name) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ u64 inum = 0; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = __bch2_dirent_lookup_trans(&trans, dir_inum, -+ hash_info, name, 0); -+ if (IS_ERR(iter)) { -+ BUG_ON(PTR_ERR(iter) == -EINTR); -+ goto out; -+ } -+ -+ k = bch2_btree_iter_peek_slot(iter); -+ inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); -+out: -+ bch2_trans_exit(&trans); -+ return inum; -+} -+ -+int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) -+{ -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ for_each_btree_key(trans, iter, BTREE_ID_DIRENTS, -+ POS(dir_inum, 0), 0, k, ret) { -+ if (k.k->p.inode > dir_inum) -+ break; -+ -+ if (k.k->type == KEY_TYPE_dirent) { -+ ret = -ENOTEMPTY; -+ break; -+ } -+ } -+ bch2_trans_iter_put(trans, iter); -+ -+ return ret; -+} -+ -+int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct bkey_s_c_dirent dirent; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, -+ POS(inum, ctx->pos), 0, k, ret) { -+ if (k.k->p.inode > inum) -+ break; -+ -+ if (k.k->type != KEY_TYPE_dirent) -+ continue; -+ -+ dirent = bkey_s_c_to_dirent(k); -+ -+ /* -+ * XXX: dir_emit() can fault and block, while we're holding -+ * locks -+ */ -+ ctx->pos = dirent.k->p.offset; -+ if (!dir_emit(ctx, dirent.v->d_name, -+ bch2_dirent_name_bytes(dirent), -+ le64_to_cpu(dirent.v->d_inum), -+ dirent.v->d_type)) -+ break; -+ ctx->pos = dirent.k->p.offset + 1; -+ } -+ ret = bch2_trans_exit(&trans) ?: ret; -+ -+ return ret; -+} -diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h -new file mode 100644 -index 000000000000..34769371dd13 ---- /dev/null -+++ b/fs/bcachefs/dirent.h -@@ -0,0 +1,63 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ 
-+#ifndef _BCACHEFS_DIRENT_H -+#define _BCACHEFS_DIRENT_H -+ -+#include "str_hash.h" -+ -+extern const struct bch_hash_desc bch2_dirent_hash_desc; -+ -+const char *bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c); -+void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+ -+#define bch2_bkey_ops_dirent (struct bkey_ops) { \ -+ .key_invalid = bch2_dirent_invalid, \ -+ .val_to_text = bch2_dirent_to_text, \ -+} -+ -+struct qstr; -+struct file; -+struct dir_context; -+struct bch_fs; -+struct bch_hash_info; -+struct bch_inode_info; -+ -+unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent); -+ -+static inline unsigned dirent_val_u64s(unsigned len) -+{ -+ return DIV_ROUND_UP(offsetof(struct bch_dirent, d_name) + len, -+ sizeof(u64)); -+} -+ -+int bch2_dirent_create(struct btree_trans *, u64, -+ const struct bch_hash_info *, u8, -+ const struct qstr *, u64, int); -+ -+int bch2_dirent_delete_at(struct btree_trans *, -+ const struct bch_hash_info *, -+ struct btree_iter *); -+ -+enum bch_rename_mode { -+ BCH_RENAME, -+ BCH_RENAME_OVERWRITE, -+ BCH_RENAME_EXCHANGE, -+}; -+ -+int bch2_dirent_rename(struct btree_trans *, -+ u64, struct bch_hash_info *, -+ u64, struct bch_hash_info *, -+ const struct qstr *, u64 *, -+ const struct qstr *, u64 *, -+ enum bch_rename_mode); -+ -+struct btree_iter * -+__bch2_dirent_lookup_trans(struct btree_trans *, u64, -+ const struct bch_hash_info *, -+ const struct qstr *, unsigned); -+u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *, -+ const struct qstr *); -+ -+int bch2_empty_dir_trans(struct btree_trans *, u64); -+int bch2_readdir(struct bch_fs *, u64, struct dir_context *); -+ -+#endif /* _BCACHEFS_DIRENT_H */ -diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c -new file mode 100644 -index 000000000000..c52b6faac9b4 ---- /dev/null -+++ b/fs/bcachefs/disk_groups.c -@@ -0,0 +1,486 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include 
"disk_groups.h" -+#include "super-io.h" -+ -+#include -+ -+static int group_cmp(const void *_l, const void *_r) -+{ -+ const struct bch_disk_group *l = _l; -+ const struct bch_disk_group *r = _r; -+ -+ return ((BCH_GROUP_DELETED(l) > BCH_GROUP_DELETED(r)) - -+ (BCH_GROUP_DELETED(l) < BCH_GROUP_DELETED(r))) ?: -+ ((BCH_GROUP_PARENT(l) > BCH_GROUP_PARENT(r)) - -+ (BCH_GROUP_PARENT(l) < BCH_GROUP_PARENT(r))) ?: -+ strncmp(l->label, r->label, sizeof(l->label)); -+} -+ -+static const char *bch2_sb_disk_groups_validate(struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_disk_groups *groups = -+ field_to_type(f, disk_groups); -+ struct bch_disk_group *g, *sorted = NULL; -+ struct bch_sb_field_members *mi; -+ struct bch_member *m; -+ unsigned i, nr_groups, len; -+ const char *err = NULL; -+ -+ mi = bch2_sb_get_members(sb); -+ groups = bch2_sb_get_disk_groups(sb); -+ nr_groups = disk_groups_nr(groups); -+ -+ for (m = mi->members; -+ m < mi->members + sb->nr_devices; -+ m++) { -+ unsigned g; -+ -+ if (!BCH_MEMBER_GROUP(m)) -+ continue; -+ -+ g = BCH_MEMBER_GROUP(m) - 1; -+ -+ if (g >= nr_groups || -+ BCH_GROUP_DELETED(&groups->entries[g])) -+ return "disk has invalid group"; -+ } -+ -+ if (!nr_groups) -+ return NULL; -+ -+ for (g = groups->entries; -+ g < groups->entries + nr_groups; -+ g++) { -+ if (BCH_GROUP_DELETED(g)) -+ continue; -+ -+ len = strnlen(g->label, sizeof(g->label)); -+ if (!len) { -+ err = "group with empty label"; -+ goto err; -+ } -+ } -+ -+ sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL); -+ if (!sorted) -+ return "cannot allocate memory"; -+ -+ memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted)); -+ sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL); -+ -+ for (i = 0; i + 1 < nr_groups; i++) -+ if (!BCH_GROUP_DELETED(sorted + i) && -+ !group_cmp(sorted + i, sorted + i + 1)) { -+ err = "duplicate groups"; -+ goto err; -+ } -+ -+ err = NULL; -+err: -+ kfree(sorted); -+ return err; -+} -+ -+static void 
bch2_sb_disk_groups_to_text(struct printbuf *out, -+ struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_disk_groups *groups = -+ field_to_type(f, disk_groups); -+ struct bch_disk_group *g; -+ unsigned nr_groups = disk_groups_nr(groups); -+ -+ for (g = groups->entries; -+ g < groups->entries + nr_groups; -+ g++) { -+ if (g != groups->entries) -+ pr_buf(out, " "); -+ -+ if (BCH_GROUP_DELETED(g)) -+ pr_buf(out, "[deleted]"); -+ else -+ pr_buf(out, "[parent %llu name %s]", -+ BCH_GROUP_PARENT(g), g->label); -+ } -+} -+ -+const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = { -+ .validate = bch2_sb_disk_groups_validate, -+ .to_text = bch2_sb_disk_groups_to_text -+}; -+ -+int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) -+{ -+ struct bch_sb_field_members *mi; -+ struct bch_sb_field_disk_groups *groups; -+ struct bch_disk_groups_cpu *cpu_g, *old_g; -+ unsigned i, g, nr_groups; -+ -+ lockdep_assert_held(&c->sb_lock); -+ -+ mi = bch2_sb_get_members(c->disk_sb.sb); -+ groups = bch2_sb_get_disk_groups(c->disk_sb.sb); -+ nr_groups = disk_groups_nr(groups); -+ -+ if (!groups) -+ return 0; -+ -+ cpu_g = kzalloc(sizeof(*cpu_g) + -+ sizeof(cpu_g->entries[0]) * nr_groups, GFP_KERNEL); -+ if (!cpu_g) -+ return -ENOMEM; -+ -+ cpu_g->nr = nr_groups; -+ -+ for (i = 0; i < nr_groups; i++) { -+ struct bch_disk_group *src = &groups->entries[i]; -+ struct bch_disk_group_cpu *dst = &cpu_g->entries[i]; -+ -+ dst->deleted = BCH_GROUP_DELETED(src); -+ dst->parent = BCH_GROUP_PARENT(src); -+ } -+ -+ for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { -+ struct bch_member *m = mi->members + i; -+ struct bch_disk_group_cpu *dst = -+ &cpu_g->entries[BCH_MEMBER_GROUP(m)]; -+ -+ if (!bch2_member_exists(m)) -+ continue; -+ -+ g = BCH_MEMBER_GROUP(m); -+ while (g) { -+ dst = &cpu_g->entries[g - 1]; -+ __set_bit(i, dst->devs.d); -+ g = dst->parent; -+ } -+ } -+ -+ old_g = rcu_dereference_protected(c->disk_groups, -+ lockdep_is_held(&c->sb_lock)); -+ 
rcu_assign_pointer(c->disk_groups, cpu_g); -+ if (old_g) -+ kfree_rcu(old_g, rcu); -+ -+ return 0; -+} -+ -+const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target) -+{ -+ struct target t = target_decode(target); -+ -+ switch (t.type) { -+ case TARGET_NULL: -+ return NULL; -+ case TARGET_DEV: { -+ struct bch_dev *ca = t.dev < c->sb.nr_devices -+ ? rcu_dereference(c->devs[t.dev]) -+ : NULL; -+ return ca ? &ca->self : NULL; -+ } -+ case TARGET_GROUP: { -+ struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); -+ -+ return g && t.group < g->nr && !g->entries[t.group].deleted -+ ? &g->entries[t.group].devs -+ : NULL; -+ } -+ default: -+ BUG(); -+ } -+} -+ -+bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target) -+{ -+ struct target t = target_decode(target); -+ -+ switch (t.type) { -+ case TARGET_NULL: -+ return false; -+ case TARGET_DEV: -+ return dev == t.dev; -+ case TARGET_GROUP: { -+ struct bch_disk_groups_cpu *g; -+ const struct bch_devs_mask *m; -+ bool ret; -+ -+ rcu_read_lock(); -+ g = rcu_dereference(c->disk_groups); -+ m = g && t.group < g->nr && !g->entries[t.group].deleted -+ ? &g->entries[t.group].devs -+ : NULL; -+ -+ ret = m ? 
test_bit(dev, m->d) : false; -+ rcu_read_unlock(); -+ -+ return ret; -+ } -+ default: -+ BUG(); -+ } -+} -+ -+static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups, -+ unsigned parent, -+ const char *name, unsigned namelen) -+{ -+ unsigned i, nr_groups = disk_groups_nr(groups); -+ -+ if (!namelen || namelen > BCH_SB_LABEL_SIZE) -+ return -EINVAL; -+ -+ for (i = 0; i < nr_groups; i++) { -+ struct bch_disk_group *g = groups->entries + i; -+ -+ if (BCH_GROUP_DELETED(g)) -+ continue; -+ -+ if (!BCH_GROUP_DELETED(g) && -+ BCH_GROUP_PARENT(g) == parent && -+ strnlen(g->label, sizeof(g->label)) == namelen && -+ !memcmp(name, g->label, namelen)) -+ return i; -+ } -+ -+ return -1; -+} -+ -+static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent, -+ const char *name, unsigned namelen) -+{ -+ struct bch_sb_field_disk_groups *groups = -+ bch2_sb_get_disk_groups(sb->sb); -+ unsigned i, nr_groups = disk_groups_nr(groups); -+ struct bch_disk_group *g; -+ -+ if (!namelen || namelen > BCH_SB_LABEL_SIZE) -+ return -EINVAL; -+ -+ for (i = 0; -+ i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]); -+ i++) -+ ; -+ -+ if (i == nr_groups) { -+ unsigned u64s = -+ (sizeof(struct bch_sb_field_disk_groups) + -+ sizeof(struct bch_disk_group) * (nr_groups + 1)) / -+ sizeof(u64); -+ -+ groups = bch2_sb_resize_disk_groups(sb, u64s); -+ if (!groups) -+ return -ENOSPC; -+ -+ nr_groups = disk_groups_nr(groups); -+ } -+ -+ BUG_ON(i >= nr_groups); -+ -+ g = &groups->entries[i]; -+ -+ memcpy(g->label, name, namelen); -+ if (namelen < sizeof(g->label)) -+ g->label[namelen] = '\0'; -+ SET_BCH_GROUP_DELETED(g, 0); -+ SET_BCH_GROUP_PARENT(g, parent); -+ SET_BCH_GROUP_DATA_ALLOWED(g, ~0); -+ -+ return i; -+} -+ -+int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name) -+{ -+ struct bch_sb_field_disk_groups *groups = -+ bch2_sb_get_disk_groups(sb->sb); -+ int v = -1; -+ -+ do { -+ const char *next = strchrnul(name, '.'); -+ unsigned len = next - name; 
-+ -+ if (*next == '.') -+ next++; -+ -+ v = __bch2_disk_group_find(groups, v + 1, name, len); -+ name = next; -+ } while (*name && v >= 0); -+ -+ return v; -+} -+ -+int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name) -+{ -+ struct bch_sb_field_disk_groups *groups; -+ unsigned parent = 0; -+ int v = -1; -+ -+ do { -+ const char *next = strchrnul(name, '.'); -+ unsigned len = next - name; -+ -+ if (*next == '.') -+ next++; -+ -+ groups = bch2_sb_get_disk_groups(sb->sb); -+ -+ v = __bch2_disk_group_find(groups, parent, name, len); -+ if (v < 0) -+ v = __bch2_disk_group_add(sb, parent, name, len); -+ if (v < 0) -+ return v; -+ -+ parent = v + 1; -+ name = next; -+ } while (*name && v >= 0); -+ -+ return v; -+} -+ -+void bch2_disk_path_to_text(struct printbuf *out, -+ struct bch_sb_handle *sb, -+ unsigned v) -+{ -+ struct bch_sb_field_disk_groups *groups = -+ bch2_sb_get_disk_groups(sb->sb); -+ struct bch_disk_group *g; -+ unsigned nr = 0; -+ u16 path[32]; -+ -+ while (1) { -+ if (nr == ARRAY_SIZE(path)) -+ goto inval; -+ -+ if (v >= disk_groups_nr(groups)) -+ goto inval; -+ -+ g = groups->entries + v; -+ -+ if (BCH_GROUP_DELETED(g)) -+ goto inval; -+ -+ path[nr++] = v; -+ -+ if (!BCH_GROUP_PARENT(g)) -+ break; -+ -+ v = BCH_GROUP_PARENT(g) - 1; -+ } -+ -+ while (nr) { -+ v = path[--nr]; -+ g = groups->entries + v; -+ -+ bch_scnmemcpy(out, g->label, -+ strnlen(g->label, sizeof(g->label))); -+ -+ if (nr) -+ pr_buf(out, "."); -+ } -+ return; -+inval: -+ pr_buf(out, "invalid group %u", v); -+} -+ -+int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) -+{ -+ struct bch_member *mi; -+ int v = -1; -+ int ret = 0; -+ -+ mutex_lock(&c->sb_lock); -+ -+ if (!strlen(name) || !strcmp(name, "none")) -+ goto write_sb; -+ -+ v = bch2_disk_path_find_or_create(&c->disk_sb, name); -+ if (v < 0) { -+ mutex_unlock(&c->sb_lock); -+ return v; -+ } -+ -+ ret = bch2_sb_disk_groups_to_cpu(c); -+ if (ret) -+ goto unlock; -+write_sb: -+ mi = 
&bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; -+ SET_BCH_MEMBER_GROUP(mi, v + 1); -+ -+ bch2_write_super(c); -+unlock: -+ mutex_unlock(&c->sb_lock); -+ -+ return ret; -+} -+ -+int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v) -+{ -+ struct bch_dev *ca; -+ int g; -+ -+ if (!strlen(buf) || !strcmp(buf, "none")) { -+ *v = 0; -+ return 0; -+ } -+ -+ /* Is it a device? */ -+ ca = bch2_dev_lookup(c, buf); -+ if (!IS_ERR(ca)) { -+ *v = dev_to_target(ca->dev_idx); -+ percpu_ref_put(&ca->ref); -+ return 0; -+ } -+ -+ mutex_lock(&c->sb_lock); -+ g = bch2_disk_path_find(&c->disk_sb, buf); -+ mutex_unlock(&c->sb_lock); -+ -+ if (g >= 0) { -+ *v = group_to_target(g); -+ return 0; -+ } -+ -+ return -EINVAL; -+} -+ -+void bch2_opt_target_to_text(struct printbuf *out, struct bch_fs *c, u64 v) -+{ -+ struct target t = target_decode(v); -+ -+ switch (t.type) { -+ case TARGET_NULL: -+ pr_buf(out, "none"); -+ break; -+ case TARGET_DEV: { -+ struct bch_dev *ca; -+ -+ rcu_read_lock(); -+ ca = t.dev < c->sb.nr_devices -+ ? 
rcu_dereference(c->devs[t.dev]) -+ : NULL; -+ -+ if (ca && percpu_ref_tryget(&ca->io_ref)) { -+ char b[BDEVNAME_SIZE]; -+ -+ pr_buf(out, "/dev/%s", -+ bdevname(ca->disk_sb.bdev, b)); -+ percpu_ref_put(&ca->io_ref); -+ } else if (ca) { -+ pr_buf(out, "offline device %u", t.dev); -+ } else { -+ pr_buf(out, "invalid device %u", t.dev); -+ } -+ -+ rcu_read_unlock(); -+ break; -+ } -+ case TARGET_GROUP: -+ mutex_lock(&c->sb_lock); -+ bch2_disk_path_to_text(out, &c->disk_sb, t.group); -+ mutex_unlock(&c->sb_lock); -+ break; -+ default: -+ BUG(); -+ } -+} -diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h -new file mode 100644 -index 000000000000..3d84f23c34ed ---- /dev/null -+++ b/fs/bcachefs/disk_groups.h -@@ -0,0 +1,91 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_DISK_GROUPS_H -+#define _BCACHEFS_DISK_GROUPS_H -+ -+extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups; -+ -+static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups) -+{ -+ return groups -+ ? 
(vstruct_end(&groups->field) - -+ (void *) &groups->entries[0]) / sizeof(struct bch_disk_group) -+ : 0; -+} -+ -+struct target { -+ enum { -+ TARGET_NULL, -+ TARGET_DEV, -+ TARGET_GROUP, -+ } type; -+ union { -+ unsigned dev; -+ unsigned group; -+ }; -+}; -+ -+#define TARGET_DEV_START 1 -+#define TARGET_GROUP_START (256 + TARGET_DEV_START) -+ -+static inline u16 dev_to_target(unsigned dev) -+{ -+ return TARGET_DEV_START + dev; -+} -+ -+static inline u16 group_to_target(unsigned group) -+{ -+ return TARGET_GROUP_START + group; -+} -+ -+static inline struct target target_decode(unsigned target) -+{ -+ if (target >= TARGET_GROUP_START) -+ return (struct target) { -+ .type = TARGET_GROUP, -+ .group = target - TARGET_GROUP_START -+ }; -+ -+ if (target >= TARGET_DEV_START) -+ return (struct target) { -+ .type = TARGET_DEV, -+ .group = target - TARGET_DEV_START -+ }; -+ -+ return (struct target) { .type = TARGET_NULL }; -+} -+ -+const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned); -+ -+static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c, -+ enum bch_data_type data_type, -+ u16 target) -+{ -+ struct bch_devs_mask devs = c->rw_devs[data_type]; -+ const struct bch_devs_mask *t = bch2_target_to_mask(c, target); -+ -+ if (t) -+ bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX); -+ return devs; -+} -+ -+bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned); -+ -+int bch2_disk_path_find(struct bch_sb_handle *, const char *); -+ -+/* Exported for userspace bcachefs-tools: */ -+int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *); -+ -+void bch2_disk_path_to_text(struct printbuf *, struct bch_sb_handle *, -+ unsigned); -+ -+int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *); -+void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, u64); -+ -+int bch2_sb_disk_groups_to_cpu(struct bch_fs *); -+ -+int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *); -+ -+const char 
*bch2_sb_validate_disk_groups(struct bch_sb *, -+ struct bch_sb_field *); -+ -+#endif /* _BCACHEFS_DISK_GROUPS_H */ -diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c -new file mode 100644 -index 000000000000..eac750ad2240 ---- /dev/null -+++ b/fs/bcachefs/ec.c -@@ -0,0 +1,1636 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+/* erasure coding */ -+ -+#include "bcachefs.h" -+#include "alloc_foreground.h" -+#include "bkey_on_stack.h" -+#include "bset.h" -+#include "btree_gc.h" -+#include "btree_update.h" -+#include "buckets.h" -+#include "disk_groups.h" -+#include "ec.h" -+#include "error.h" -+#include "io.h" -+#include "keylist.h" -+#include "recovery.h" -+#include "super-io.h" -+#include "util.h" -+ -+#include -+ -+#ifdef __KERNEL__ -+ -+#include -+#include -+ -+static void raid5_recov(unsigned disks, unsigned failed_idx, -+ size_t size, void **data) -+{ -+ unsigned i = 2, nr; -+ -+ BUG_ON(failed_idx >= disks); -+ -+ swap(data[0], data[failed_idx]); -+ memcpy(data[0], data[1], size); -+ -+ while (i < disks) { -+ nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS); -+ xor_blocks(nr, size, data[0], data + i); -+ i += nr; -+ } -+ -+ swap(data[0], data[failed_idx]); -+} -+ -+static void raid_gen(int nd, int np, size_t size, void **v) -+{ -+ if (np >= 1) -+ raid5_recov(nd + np, nd, size, v); -+ if (np >= 2) -+ raid6_call.gen_syndrome(nd + np, size, v); -+ BUG_ON(np > 2); -+} -+ -+static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v) -+{ -+ switch (nr) { -+ case 0: -+ break; -+ case 1: -+ if (ir[0] < nd + 1) -+ raid5_recov(nd + 1, ir[0], size, v); -+ else -+ raid6_call.gen_syndrome(nd + np, size, v); -+ break; -+ case 2: -+ if (ir[1] < nd) { -+ /* data+data failure. 
*/ -+ raid6_2data_recov(nd + np, size, ir[0], ir[1], v); -+ } else if (ir[0] < nd) { -+ /* data + p/q failure */ -+ -+ if (ir[1] == nd) /* data + p failure */ -+ raid6_datap_recov(nd + np, size, ir[0], v); -+ else { /* data + q failure */ -+ raid5_recov(nd + 1, ir[0], size, v); -+ raid6_call.gen_syndrome(nd + np, size, v); -+ } -+ } else { -+ raid_gen(nd, np, size, v); -+ } -+ break; -+ default: -+ BUG(); -+ } -+} -+ -+#else -+ -+#include -+ -+#endif -+ -+struct ec_bio { -+ struct bch_dev *ca; -+ struct ec_stripe_buf *buf; -+ size_t idx; -+ struct bio bio; -+}; -+ -+/* Stripes btree keys: */ -+ -+const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k) -+{ -+ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; -+ -+ if (k.k->p.inode) -+ return "invalid stripe key"; -+ -+ if (bkey_val_bytes(k.k) < sizeof(*s)) -+ return "incorrect value size"; -+ -+ if (bkey_val_bytes(k.k) < sizeof(*s) || -+ bkey_val_u64s(k.k) < stripe_val_u64s(s)) -+ return "incorrect value size"; -+ -+ return bch2_bkey_ptrs_invalid(c, k); -+} -+ -+void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; -+ unsigned i; -+ -+ pr_buf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u", -+ s->algorithm, -+ le16_to_cpu(s->sectors), -+ s->nr_blocks - s->nr_redundant, -+ s->nr_redundant, -+ s->csum_type, -+ 1U << s->csum_granularity_bits); -+ -+ for (i = 0; i < s->nr_blocks; i++) -+ pr_buf(out, " %u:%llu:%u", s->ptrs[i].dev, -+ (u64) s->ptrs[i].offset, -+ stripe_blockcount_get(s, i)); -+} -+ -+static int ptr_matches_stripe(struct bch_fs *c, -+ struct bch_stripe *v, -+ const struct bch_extent_ptr *ptr) -+{ -+ unsigned i; -+ -+ for (i = 0; i < v->nr_blocks - v->nr_redundant; i++) { -+ const struct bch_extent_ptr *ptr2 = v->ptrs + i; -+ -+ if (ptr->dev == ptr2->dev && -+ ptr->gen == ptr2->gen && -+ ptr->offset >= ptr2->offset && -+ ptr->offset < ptr2->offset + le16_to_cpu(v->sectors)) -+ return 
i; -+ } -+ -+ return -1; -+} -+ -+static int extent_matches_stripe(struct bch_fs *c, -+ struct bch_stripe *v, -+ struct bkey_s_c k) -+{ -+ -+ switch (k.k->type) { -+ case KEY_TYPE_extent: { -+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); -+ const struct bch_extent_ptr *ptr; -+ int idx; -+ -+ extent_for_each_ptr(e, ptr) { -+ idx = ptr_matches_stripe(c, v, ptr); -+ if (idx >= 0) -+ return idx; -+ } -+ break; -+ } -+ } -+ -+ return -1; -+} -+ -+static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) -+{ -+ switch (k.k->type) { -+ case KEY_TYPE_extent: { -+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); -+ const union bch_extent_entry *entry; -+ -+ extent_for_each_entry(e, entry) -+ if (extent_entry_type(entry) == -+ BCH_EXTENT_ENTRY_stripe_ptr && -+ entry->stripe_ptr.idx == idx) -+ return true; -+ -+ break; -+ } -+ } -+ -+ return false; -+} -+ -+/* Checksumming: */ -+ -+static void ec_generate_checksums(struct ec_stripe_buf *buf) -+{ -+ struct bch_stripe *v = &buf->key.v; -+ unsigned csum_granularity = 1 << v->csum_granularity_bits; -+ unsigned csums_per_device = stripe_csums_per_device(v); -+ unsigned csum_bytes = bch_crc_bytes[v->csum_type]; -+ unsigned i, j; -+ -+ if (!csum_bytes) -+ return; -+ -+ BUG_ON(buf->offset); -+ BUG_ON(buf->size != le16_to_cpu(v->sectors)); -+ -+ for (i = 0; i < v->nr_blocks; i++) { -+ for (j = 0; j < csums_per_device; j++) { -+ unsigned offset = j << v->csum_granularity_bits; -+ unsigned len = min(csum_granularity, buf->size - offset); -+ -+ struct bch_csum csum = -+ bch2_checksum(NULL, v->csum_type, -+ null_nonce(), -+ buf->data[i] + (offset << 9), -+ len << 9); -+ -+ memcpy(stripe_csum(v, i, j), &csum, csum_bytes); -+ } -+ } -+} -+ -+static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) -+{ -+ struct bch_stripe *v = &buf->key.v; -+ unsigned csum_granularity = 1 << v->csum_granularity_bits; -+ unsigned csum_bytes = bch_crc_bytes[v->csum_type]; -+ unsigned i; -+ -+ if (!csum_bytes) -+ return; -+ -+ 
for (i = 0; i < v->nr_blocks; i++) { -+ unsigned offset = buf->offset; -+ unsigned end = buf->offset + buf->size; -+ -+ if (!test_bit(i, buf->valid)) -+ continue; -+ -+ while (offset < end) { -+ unsigned j = offset >> v->csum_granularity_bits; -+ unsigned len = min(csum_granularity, end - offset); -+ struct bch_csum csum; -+ -+ BUG_ON(offset & (csum_granularity - 1)); -+ BUG_ON(offset + len != le16_to_cpu(v->sectors) && -+ ((offset + len) & (csum_granularity - 1))); -+ -+ csum = bch2_checksum(NULL, v->csum_type, -+ null_nonce(), -+ buf->data[i] + ((offset - buf->offset) << 9), -+ len << 9); -+ -+ if (memcmp(stripe_csum(v, i, j), &csum, csum_bytes)) { -+ __bcache_io_error(c, -+ "checksum error while doing reconstruct read (%u:%u)", -+ i, j); -+ clear_bit(i, buf->valid); -+ break; -+ } -+ -+ offset += len; -+ } -+ } -+} -+ -+/* Erasure coding: */ -+ -+static void ec_generate_ec(struct ec_stripe_buf *buf) -+{ -+ struct bch_stripe *v = &buf->key.v; -+ unsigned nr_data = v->nr_blocks - v->nr_redundant; -+ unsigned bytes = le16_to_cpu(v->sectors) << 9; -+ -+ raid_gen(nr_data, v->nr_redundant, bytes, buf->data); -+} -+ -+static unsigned __ec_nr_failed(struct ec_stripe_buf *buf, unsigned nr) -+{ -+ return nr - bitmap_weight(buf->valid, nr); -+} -+ -+static unsigned ec_nr_failed(struct ec_stripe_buf *buf) -+{ -+ return __ec_nr_failed(buf, buf->key.v.nr_blocks); -+} -+ -+static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) -+{ -+ struct bch_stripe *v = &buf->key.v; -+ unsigned i, failed[EC_STRIPE_MAX], nr_failed = 0; -+ unsigned nr_data = v->nr_blocks - v->nr_redundant; -+ unsigned bytes = buf->size << 9; -+ -+ if (ec_nr_failed(buf) > v->nr_redundant) { -+ __bcache_io_error(c, -+ "error doing reconstruct read: unable to read enough blocks"); -+ return -1; -+ } -+ -+ for (i = 0; i < nr_data; i++) -+ if (!test_bit(i, buf->valid)) -+ failed[nr_failed++] = i; -+ -+ raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data); -+ return 0; -+} -+ -+/* 
IO: */ -+ -+static void ec_block_endio(struct bio *bio) -+{ -+ struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio); -+ struct bch_dev *ca = ec_bio->ca; -+ struct closure *cl = bio->bi_private; -+ -+ if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s: %s", -+ bio_data_dir(bio) ? "write" : "read", -+ bch2_blk_status_to_str(bio->bi_status))) -+ clear_bit(ec_bio->idx, ec_bio->buf->valid); -+ -+ bio_put(&ec_bio->bio); -+ percpu_ref_put(&ca->io_ref); -+ closure_put(cl); -+} -+ -+static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, -+ unsigned rw, unsigned idx, struct closure *cl) -+{ -+ struct bch_stripe *v = &buf->key.v; -+ unsigned offset = 0, bytes = buf->size << 9; -+ struct bch_extent_ptr *ptr = &v->ptrs[idx]; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ -+ if (!bch2_dev_get_ioref(ca, rw)) { -+ clear_bit(idx, buf->valid); -+ return; -+ } -+ -+ while (offset < bytes) { -+ unsigned nr_iovecs = min_t(size_t, BIO_MAX_PAGES, -+ DIV_ROUND_UP(bytes, PAGE_SIZE)); -+ unsigned b = min_t(size_t, bytes - offset, -+ nr_iovecs << PAGE_SHIFT); -+ struct ec_bio *ec_bio; -+ -+ ec_bio = container_of(bio_alloc_bioset(GFP_KERNEL, nr_iovecs, -+ &c->ec_bioset), -+ struct ec_bio, bio); -+ -+ ec_bio->ca = ca; -+ ec_bio->buf = buf; -+ ec_bio->idx = idx; -+ -+ bio_set_dev(&ec_bio->bio, ca->disk_sb.bdev); -+ bio_set_op_attrs(&ec_bio->bio, rw, 0); -+ -+ ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9); -+ ec_bio->bio.bi_end_io = ec_block_endio; -+ ec_bio->bio.bi_private = cl; -+ -+ bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b); -+ -+ closure_get(cl); -+ percpu_ref_get(&ca->io_ref); -+ -+ submit_bio(&ec_bio->bio); -+ -+ offset += b; -+ } -+ -+ percpu_ref_put(&ca->io_ref); -+} -+ -+/* recovery read path: */ -+int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct ec_stripe_buf *buf; -+ struct closure cl; -+ struct bkey_s_c k; 
-+ struct bch_stripe *v; -+ unsigned stripe_idx; -+ unsigned offset, end; -+ unsigned i, nr_data, csum_granularity; -+ int ret = 0, idx; -+ -+ closure_init_stack(&cl); -+ -+ BUG_ON(!rbio->pick.has_ec); -+ -+ stripe_idx = rbio->pick.ec.idx; -+ -+ buf = kzalloc(sizeof(*buf), GFP_NOIO); -+ if (!buf) -+ return -ENOMEM; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, -+ POS(0, stripe_idx), -+ BTREE_ITER_SLOTS); -+ k = bch2_btree_iter_peek_slot(iter); -+ if (bkey_err(k) || k.k->type != KEY_TYPE_stripe) { -+ __bcache_io_error(c, -+ "error doing reconstruct read: stripe not found"); -+ kfree(buf); -+ return bch2_trans_exit(&trans) ?: -EIO; -+ } -+ -+ bkey_reassemble(&buf->key.k_i, k); -+ bch2_trans_exit(&trans); -+ -+ v = &buf->key.v; -+ -+ nr_data = v->nr_blocks - v->nr_redundant; -+ -+ idx = ptr_matches_stripe(c, v, &rbio->pick.ptr); -+ BUG_ON(idx < 0); -+ -+ csum_granularity = 1U << v->csum_granularity_bits; -+ -+ offset = rbio->bio.bi_iter.bi_sector - v->ptrs[idx].offset; -+ end = offset + bio_sectors(&rbio->bio); -+ -+ BUG_ON(end > le16_to_cpu(v->sectors)); -+ -+ buf->offset = round_down(offset, csum_granularity); -+ buf->size = min_t(unsigned, le16_to_cpu(v->sectors), -+ round_up(end, csum_granularity)) - buf->offset; -+ -+ for (i = 0; i < v->nr_blocks; i++) { -+ buf->data[i] = kmalloc(buf->size << 9, GFP_NOIO); -+ if (!buf->data[i]) { -+ ret = -ENOMEM; -+ goto err; -+ } -+ } -+ -+ memset(buf->valid, 0xFF, sizeof(buf->valid)); -+ -+ for (i = 0; i < v->nr_blocks; i++) { -+ struct bch_extent_ptr *ptr = v->ptrs + i; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ -+ if (ptr_stale(ca, ptr)) { -+ __bcache_io_error(c, -+ "error doing reconstruct read: stale pointer"); -+ clear_bit(i, buf->valid); -+ continue; -+ } -+ -+ ec_block_io(c, buf, REQ_OP_READ, i, &cl); -+ } -+ -+ closure_sync(&cl); -+ -+ if (ec_nr_failed(buf) > v->nr_redundant) { -+ __bcache_io_error(c, -+ "error doing reconstruct read: unable to read 
enough blocks"); -+ ret = -EIO; -+ goto err; -+ } -+ -+ ec_validate_checksums(c, buf); -+ -+ ret = ec_do_recov(c, buf); -+ if (ret) -+ goto err; -+ -+ memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter, -+ buf->data[idx] + ((offset - buf->offset) << 9)); -+err: -+ for (i = 0; i < v->nr_blocks; i++) -+ kfree(buf->data[i]); -+ kfree(buf); -+ return ret; -+} -+ -+/* stripe bucket accounting: */ -+ -+static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) -+{ -+ ec_stripes_heap n, *h = &c->ec_stripes_heap; -+ -+ if (idx >= h->size) { -+ if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp)) -+ return -ENOMEM; -+ -+ spin_lock(&c->ec_stripes_heap_lock); -+ if (n.size > h->size) { -+ memcpy(n.data, h->data, h->used * sizeof(h->data[0])); -+ n.used = h->used; -+ swap(*h, n); -+ } -+ spin_unlock(&c->ec_stripes_heap_lock); -+ -+ free_heap(&n); -+ } -+ -+ if (!genradix_ptr_alloc(&c->stripes[0], idx, gfp)) -+ return -ENOMEM; -+ -+ if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING && -+ !genradix_ptr_alloc(&c->stripes[1], idx, gfp)) -+ return -ENOMEM; -+ -+ return 0; -+} -+ -+static int ec_stripe_mem_alloc(struct bch_fs *c, -+ struct btree_iter *iter) -+{ -+ size_t idx = iter->pos.offset; -+ int ret = 0; -+ -+ if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT|__GFP_NOWARN)) -+ return ret; -+ -+ bch2_trans_unlock(iter->trans); -+ ret = -EINTR; -+ -+ if (!__ec_stripe_mem_alloc(c, idx, GFP_KERNEL)) -+ return ret; -+ -+ return -ENOMEM; -+} -+ -+static ssize_t stripe_idx_to_delete(struct bch_fs *c) -+{ -+ ec_stripes_heap *h = &c->ec_stripes_heap; -+ -+ return h->used && h->data[0].blocks_nonempty == 0 -+ ? 
h->data[0].idx : -1; -+} -+ -+static inline int ec_stripes_heap_cmp(ec_stripes_heap *h, -+ struct ec_stripe_heap_entry l, -+ struct ec_stripe_heap_entry r) -+{ -+ return ((l.blocks_nonempty > r.blocks_nonempty) - -+ (l.blocks_nonempty < r.blocks_nonempty)); -+} -+ -+static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h, -+ size_t i) -+{ -+ struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap); -+ -+ genradix_ptr(&c->stripes[0], h->data[i].idx)->heap_idx = i; -+} -+ -+static void heap_verify_backpointer(struct bch_fs *c, size_t idx) -+{ -+ ec_stripes_heap *h = &c->ec_stripes_heap; -+ struct stripe *m = genradix_ptr(&c->stripes[0], idx); -+ -+ BUG_ON(!m->alive); -+ BUG_ON(m->heap_idx >= h->used); -+ BUG_ON(h->data[m->heap_idx].idx != idx); -+} -+ -+void bch2_stripes_heap_del(struct bch_fs *c, -+ struct stripe *m, size_t idx) -+{ -+ if (!m->on_heap) -+ return; -+ -+ m->on_heap = false; -+ -+ heap_verify_backpointer(c, idx); -+ -+ heap_del(&c->ec_stripes_heap, m->heap_idx, -+ ec_stripes_heap_cmp, -+ ec_stripes_heap_set_backpointer); -+} -+ -+void bch2_stripes_heap_insert(struct bch_fs *c, -+ struct stripe *m, size_t idx) -+{ -+ if (m->on_heap) -+ return; -+ -+ BUG_ON(heap_full(&c->ec_stripes_heap)); -+ -+ m->on_heap = true; -+ -+ heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) { -+ .idx = idx, -+ .blocks_nonempty = m->blocks_nonempty, -+ }), -+ ec_stripes_heap_cmp, -+ ec_stripes_heap_set_backpointer); -+ -+ heap_verify_backpointer(c, idx); -+} -+ -+void bch2_stripes_heap_update(struct bch_fs *c, -+ struct stripe *m, size_t idx) -+{ -+ ec_stripes_heap *h = &c->ec_stripes_heap; -+ size_t i; -+ -+ if (!m->on_heap) -+ return; -+ -+ heap_verify_backpointer(c, idx); -+ -+ h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty; -+ -+ i = m->heap_idx; -+ heap_sift_up(h, i, ec_stripes_heap_cmp, -+ ec_stripes_heap_set_backpointer); -+ heap_sift_down(h, i, ec_stripes_heap_cmp, -+ ec_stripes_heap_set_backpointer); -+ -+ 
heap_verify_backpointer(c, idx); -+ -+ if (stripe_idx_to_delete(c) >= 0 && -+ !percpu_ref_is_dying(&c->writes)) -+ schedule_work(&c->ec_stripe_delete_work); -+} -+ -+/* stripe deletion */ -+ -+static int ec_stripe_delete(struct bch_fs *c, size_t idx) -+{ -+ //pr_info("deleting stripe %zu", idx); -+ return bch2_btree_delete_range(c, BTREE_ID_EC, -+ POS(0, idx), -+ POS(0, idx + 1), -+ NULL); -+} -+ -+static void ec_stripe_delete_work(struct work_struct *work) -+{ -+ struct bch_fs *c = -+ container_of(work, struct bch_fs, ec_stripe_delete_work); -+ ssize_t idx; -+ -+ while (1) { -+ spin_lock(&c->ec_stripes_heap_lock); -+ idx = stripe_idx_to_delete(c); -+ if (idx < 0) { -+ spin_unlock(&c->ec_stripes_heap_lock); -+ break; -+ } -+ -+ bch2_stripes_heap_del(c, genradix_ptr(&c->stripes[0], idx), idx); -+ spin_unlock(&c->ec_stripes_heap_lock); -+ -+ if (ec_stripe_delete(c, idx)) -+ break; -+ } -+} -+ -+/* stripe creation: */ -+ -+static int ec_stripe_bkey_insert(struct bch_fs *c, -+ struct bkey_i_stripe *stripe) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct bpos start_pos = POS(0, c->ec_stripe_hint); -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_EC, start_pos, -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { -+ if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) { -+ if (start_pos.offset) { -+ start_pos = POS_MIN; -+ bch2_btree_iter_set_pos(iter, start_pos); -+ continue; -+ } -+ -+ ret = -ENOSPC; -+ break; -+ } -+ -+ if (bkey_deleted(k.k)) -+ goto found_slot; -+ } -+ -+ goto err; -+found_slot: -+ start_pos = iter->pos; -+ -+ ret = ec_stripe_mem_alloc(c, iter); -+ if (ret) -+ goto err; -+ -+ stripe->k.p = iter->pos; -+ -+ bch2_trans_update(&trans, iter, &stripe->k_i, 0); -+ -+ ret = bch2_trans_commit(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL); -+err: -+ bch2_trans_iter_put(&trans, iter); -+ -+ if (ret == -EINTR) -+ goto retry; -+ -+ 
c->ec_stripe_hint = ret ? start_pos.offset : start_pos.offset + 1; -+ bch2_trans_exit(&trans); -+ -+ return ret; -+} -+ -+static void extent_stripe_ptr_add(struct bkey_s_extent e, -+ struct ec_stripe_buf *s, -+ struct bch_extent_ptr *ptr, -+ unsigned block) -+{ -+ struct bch_extent_stripe_ptr *dst = (void *) ptr; -+ union bch_extent_entry *end = extent_entry_last(e); -+ -+ memmove_u64s_up(dst + 1, dst, (u64 *) end - (u64 *) dst); -+ e.k->u64s += sizeof(*dst) / sizeof(u64); -+ -+ *dst = (struct bch_extent_stripe_ptr) { -+ .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr, -+ .block = block, -+ .idx = s->key.k.p.offset, -+ }; -+} -+ -+static int ec_stripe_update_ptrs(struct bch_fs *c, -+ struct ec_stripe_buf *s, -+ struct bkey *pos) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct bkey_s_extent e; -+ struct bkey_on_stack sk; -+ int ret = 0, dev, idx; -+ -+ bkey_on_stack_init(&sk); -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ /* XXX this doesn't support the reflink btree */ -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, -+ bkey_start_pos(pos), -+ BTREE_ITER_INTENT); -+ -+ while ((k = bch2_btree_iter_peek(iter)).k && -+ !(ret = bkey_err(k)) && -+ bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) { -+ struct bch_extent_ptr *ptr, *ec_ptr = NULL; -+ -+ if (extent_has_stripe_ptr(k, s->key.k.p.offset)) { -+ bch2_btree_iter_next(iter); -+ continue; -+ } -+ -+ idx = extent_matches_stripe(c, &s->key.v, k); -+ if (idx < 0) { -+ bch2_btree_iter_next(iter); -+ continue; -+ } -+ -+ dev = s->key.v.ptrs[idx].dev; -+ -+ bkey_on_stack_reassemble(&sk, c, k); -+ e = bkey_i_to_s_extent(sk.k); -+ -+ bch2_bkey_drop_ptrs(e.s, ptr, ptr->dev != dev); -+ ec_ptr = (void *) bch2_bkey_has_device(e.s_c, dev); -+ BUG_ON(!ec_ptr); -+ -+ extent_stripe_ptr_add(e, s, ec_ptr, idx); -+ -+ bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); -+ bch2_trans_update(&trans, iter, sk.k, 0); -+ -+ ret = bch2_trans_commit(&trans, NULL, NULL, -+ 
BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_USE_RESERVE); -+ if (ret == -EINTR) -+ ret = 0; -+ if (ret) -+ break; -+ } -+ -+ bch2_trans_exit(&trans); -+ bkey_on_stack_exit(&sk, c); -+ -+ return ret; -+} -+ -+/* -+ * data buckets of new stripe all written: create the stripe -+ */ -+static void ec_stripe_create(struct ec_stripe_new *s) -+{ -+ struct bch_fs *c = s->c; -+ struct open_bucket *ob; -+ struct bkey_i *k; -+ struct stripe *m; -+ struct bch_stripe *v = &s->stripe.key.v; -+ unsigned i, nr_data = v->nr_blocks - v->nr_redundant; -+ struct closure cl; -+ int ret; -+ -+ BUG_ON(s->h->s == s); -+ -+ closure_init_stack(&cl); -+ -+ if (s->err) { -+ if (s->err != -EROFS) -+ bch_err(c, "error creating stripe: error writing data buckets"); -+ goto err; -+ } -+ -+ BUG_ON(!s->allocated); -+ -+ if (!percpu_ref_tryget(&c->writes)) -+ goto err; -+ -+ BUG_ON(bitmap_weight(s->blocks_allocated, -+ s->blocks.nr) != s->blocks.nr); -+ -+ ec_generate_ec(&s->stripe); -+ -+ ec_generate_checksums(&s->stripe); -+ -+ /* write p/q: */ -+ for (i = nr_data; i < v->nr_blocks; i++) -+ ec_block_io(c, &s->stripe, REQ_OP_WRITE, i, &cl); -+ -+ closure_sync(&cl); -+ -+ for (i = nr_data; i < v->nr_blocks; i++) -+ if (!test_bit(i, s->stripe.valid)) { -+ bch_err(c, "error creating stripe: error writing redundancy buckets"); -+ goto err_put_writes; -+ } -+ -+ ret = s->existing_stripe -+ ? bch2_btree_insert(c, BTREE_ID_EC, &s->stripe.key.k_i, -+ NULL, NULL, BTREE_INSERT_NOFAIL) -+ : ec_stripe_bkey_insert(c, &s->stripe.key); -+ if (ret) { -+ bch_err(c, "error creating stripe: error creating stripe key"); -+ goto err_put_writes; -+ } -+ -+ for_each_keylist_key(&s->keys, k) { -+ ret = ec_stripe_update_ptrs(c, &s->stripe, &k->k); -+ if (ret) { -+ bch_err(c, "error creating stripe: error updating pointers"); -+ break; -+ } -+ } -+ -+ spin_lock(&c->ec_stripes_heap_lock); -+ m = genradix_ptr(&c->stripes[0], s->stripe.key.k.p.offset); -+#if 0 -+ pr_info("created a %s stripe %llu", -+ s->existing_stripe ? 
"existing" : "new", -+ s->stripe.key.k.p.offset); -+#endif -+ BUG_ON(m->on_heap); -+ bch2_stripes_heap_insert(c, m, s->stripe.key.k.p.offset); -+ spin_unlock(&c->ec_stripes_heap_lock); -+err_put_writes: -+ percpu_ref_put(&c->writes); -+err: -+ open_bucket_for_each(c, &s->blocks, ob, i) { -+ ob->ec = NULL; -+ __bch2_open_bucket_put(c, ob); -+ } -+ -+ bch2_open_buckets_put(c, &s->parity); -+ -+ bch2_keylist_free(&s->keys, s->inline_keys); -+ -+ for (i = 0; i < s->stripe.key.v.nr_blocks; i++) -+ kvpfree(s->stripe.data[i], s->stripe.size << 9); -+ kfree(s); -+} -+ -+static void ec_stripe_create_work(struct work_struct *work) -+{ -+ struct bch_fs *c = container_of(work, -+ struct bch_fs, ec_stripe_create_work); -+ struct ec_stripe_new *s, *n; -+restart: -+ mutex_lock(&c->ec_stripe_new_lock); -+ list_for_each_entry_safe(s, n, &c->ec_stripe_new_list, list) -+ if (!atomic_read(&s->pin)) { -+ list_del(&s->list); -+ mutex_unlock(&c->ec_stripe_new_lock); -+ ec_stripe_create(s); -+ goto restart; -+ } -+ mutex_unlock(&c->ec_stripe_new_lock); -+} -+ -+static void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s) -+{ -+ BUG_ON(atomic_read(&s->pin) <= 0); -+ -+ if (atomic_dec_and_test(&s->pin)) { -+ BUG_ON(!s->pending); -+ queue_work(system_long_wq, &c->ec_stripe_create_work); -+ } -+} -+ -+static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h) -+{ -+ struct ec_stripe_new *s = h->s; -+ -+ BUG_ON(!s->allocated && !s->err); -+ -+ h->s = NULL; -+ s->pending = true; -+ -+ mutex_lock(&c->ec_stripe_new_lock); -+ list_add(&s->list, &c->ec_stripe_new_list); -+ mutex_unlock(&c->ec_stripe_new_lock); -+ -+ ec_stripe_new_put(c, s); -+} -+ -+/* have a full bucket - hand it off to be erasure coded: */ -+void bch2_ec_bucket_written(struct bch_fs *c, struct open_bucket *ob) -+{ -+ struct ec_stripe_new *s = ob->ec; -+ -+ if (ob->sectors_free) -+ s->err = -1; -+ -+ ec_stripe_new_put(c, s); -+} -+ -+void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket 
*ob) -+{ -+ struct ec_stripe_new *s = ob->ec; -+ -+ s->err = -EIO; -+} -+ -+void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) -+{ -+ struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); -+ struct bch_dev *ca; -+ unsigned offset; -+ -+ if (!ob) -+ return NULL; -+ -+ ca = bch_dev_bkey_exists(c, ob->ptr.dev); -+ offset = ca->mi.bucket_size - ob->sectors_free; -+ -+ return ob->ec->stripe.data[ob->ec_idx] + (offset << 9); -+} -+ -+void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp, -+ struct bpos pos, unsigned sectors) -+{ -+ struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); -+ struct ec_stripe_new *ec; -+ -+ if (!ob) -+ return; -+ -+ //pr_info("adding backpointer at %llu:%llu", pos.inode, pos.offset); -+ -+ ec = ob->ec; -+ mutex_lock(&ec->lock); -+ -+ if (bch2_keylist_realloc(&ec->keys, ec->inline_keys, -+ ARRAY_SIZE(ec->inline_keys), -+ BKEY_U64s)) { -+ BUG(); -+ } -+ -+ bkey_init(&ec->keys.top->k); -+ ec->keys.top->k.p = pos; -+ bch2_key_resize(&ec->keys.top->k, sectors); -+ bch2_keylist_push(&ec->keys); -+ -+ mutex_unlock(&ec->lock); -+} -+ -+static int unsigned_cmp(const void *_l, const void *_r) -+{ -+ unsigned l = *((const unsigned *) _l); -+ unsigned r = *((const unsigned *) _r); -+ -+ return cmp_int(l, r); -+} -+ -+/* pick most common bucket size: */ -+static unsigned pick_blocksize(struct bch_fs *c, -+ struct bch_devs_mask *devs) -+{ -+ struct bch_dev *ca; -+ unsigned i, nr = 0, sizes[BCH_SB_MEMBERS_MAX]; -+ struct { -+ unsigned nr, size; -+ } cur = { 0, 0 }, best = { 0, 0 }; -+ -+ for_each_member_device_rcu(ca, c, i, devs) -+ sizes[nr++] = ca->mi.bucket_size; -+ -+ sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL); -+ -+ for (i = 0; i < nr; i++) { -+ if (sizes[i] != cur.size) { -+ if (cur.nr > best.nr) -+ best = cur; -+ -+ cur.nr = 0; -+ cur.size = sizes[i]; -+ } -+ -+ cur.nr++; -+ } -+ -+ if (cur.nr > best.nr) -+ best = cur; -+ -+ return best.size; -+} -+ -+static bool may_create_new_stripe(struct bch_fs *c) 
-+{ -+ return false; -+} -+ -+static void ec_stripe_key_init(struct bch_fs *c, -+ struct bkey_i_stripe *s, -+ unsigned nr_data, -+ unsigned nr_parity, -+ unsigned stripe_size) -+{ -+ unsigned u64s; -+ -+ bkey_stripe_init(&s->k_i); -+ s->v.sectors = cpu_to_le16(stripe_size); -+ s->v.algorithm = 0; -+ s->v.nr_blocks = nr_data + nr_parity; -+ s->v.nr_redundant = nr_parity; -+ s->v.csum_granularity_bits = ilog2(c->sb.encoded_extent_max); -+ s->v.csum_type = BCH_CSUM_CRC32C; -+ s->v.pad = 0; -+ -+ while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) { -+ BUG_ON(1 << s->v.csum_granularity_bits >= -+ le16_to_cpu(s->v.sectors) || -+ s->v.csum_granularity_bits == U8_MAX); -+ s->v.csum_granularity_bits++; -+ } -+ -+ set_bkey_val_u64s(&s->k, u64s); -+} -+ -+static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) -+{ -+ struct ec_stripe_new *s; -+ unsigned i; -+ -+ lockdep_assert_held(&h->lock); -+ -+ s = kzalloc(sizeof(*s), GFP_KERNEL); -+ if (!s) -+ return -ENOMEM; -+ -+ mutex_init(&s->lock); -+ atomic_set(&s->pin, 1); -+ s->c = c; -+ s->h = h; -+ s->nr_data = min_t(unsigned, h->nr_active_devs, -+ EC_STRIPE_MAX) - h->redundancy; -+ s->nr_parity = h->redundancy; -+ -+ bch2_keylist_init(&s->keys, s->inline_keys); -+ -+ s->stripe.offset = 0; -+ s->stripe.size = h->blocksize; -+ memset(s->stripe.valid, 0xFF, sizeof(s->stripe.valid)); -+ -+ ec_stripe_key_init(c, &s->stripe.key, s->nr_data, -+ s->nr_parity, h->blocksize); -+ -+ for (i = 0; i < s->stripe.key.v.nr_blocks; i++) { -+ s->stripe.data[i] = kvpmalloc(s->stripe.size << 9, GFP_KERNEL); -+ if (!s->stripe.data[i]) -+ goto err; -+ } -+ -+ h->s = s; -+ -+ return 0; -+err: -+ for (i = 0; i < s->stripe.key.v.nr_blocks; i++) -+ kvpfree(s->stripe.data[i], s->stripe.size << 9); -+ kfree(s); -+ return -ENOMEM; -+} -+ -+static struct ec_stripe_head * -+ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, -+ unsigned algo, unsigned redundancy) -+{ -+ struct ec_stripe_head *h; -+ struct bch_dev *ca; -+ 
unsigned i; -+ -+ h = kzalloc(sizeof(*h), GFP_KERNEL); -+ if (!h) -+ return NULL; -+ -+ mutex_init(&h->lock); -+ mutex_lock(&h->lock); -+ -+ h->target = target; -+ h->algo = algo; -+ h->redundancy = redundancy; -+ -+ rcu_read_lock(); -+ h->devs = target_rw_devs(c, BCH_DATA_user, target); -+ -+ for_each_member_device_rcu(ca, c, i, &h->devs) -+ if (!ca->mi.durability) -+ __clear_bit(i, h->devs.d); -+ -+ h->blocksize = pick_blocksize(c, &h->devs); -+ -+ for_each_member_device_rcu(ca, c, i, &h->devs) -+ if (ca->mi.bucket_size == h->blocksize) -+ h->nr_active_devs++; -+ -+ rcu_read_unlock(); -+ list_add(&h->list, &c->ec_stripe_head_list); -+ return h; -+} -+ -+void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h) -+{ -+ if (h->s && -+ h->s->allocated && -+ bitmap_weight(h->s->blocks_allocated, -+ h->s->blocks.nr) == h->s->blocks.nr) -+ ec_stripe_set_pending(c, h); -+ -+ mutex_unlock(&h->lock); -+} -+ -+struct ec_stripe_head *__bch2_ec_stripe_head_get(struct bch_fs *c, -+ unsigned target, -+ unsigned algo, -+ unsigned redundancy) -+{ -+ struct ec_stripe_head *h; -+ -+ if (!redundancy) -+ return NULL; -+ -+ mutex_lock(&c->ec_stripe_head_lock); -+ list_for_each_entry(h, &c->ec_stripe_head_list, list) -+ if (h->target == target && -+ h->algo == algo && -+ h->redundancy == redundancy) { -+ mutex_lock(&h->lock); -+ goto found; -+ } -+ -+ h = ec_new_stripe_head_alloc(c, target, algo, redundancy); -+found: -+ mutex_unlock(&c->ec_stripe_head_lock); -+ return h; -+} -+ -+/* -+ * XXX: use a higher watermark for allocating open buckets here: -+ */ -+static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h) -+{ -+ struct bch_devs_mask devs; -+ struct open_bucket *ob; -+ unsigned i, nr_have, nr_data = -+ min_t(unsigned, h->nr_active_devs, -+ EC_STRIPE_MAX) - h->redundancy; -+ bool have_cache = true; -+ int ret = 0; -+ -+ devs = h->devs; -+ -+ for_each_set_bit(i, h->s->blocks_allocated, EC_STRIPE_MAX) { -+ 
__clear_bit(h->s->stripe.key.v.ptrs[i].dev, devs.d); -+ --nr_data; -+ } -+ -+ BUG_ON(h->s->blocks.nr > nr_data); -+ BUG_ON(h->s->parity.nr > h->redundancy); -+ -+ open_bucket_for_each(c, &h->s->parity, ob, i) -+ __clear_bit(ob->ptr.dev, devs.d); -+ open_bucket_for_each(c, &h->s->blocks, ob, i) -+ __clear_bit(ob->ptr.dev, devs.d); -+ -+ percpu_down_read(&c->mark_lock); -+ rcu_read_lock(); -+ -+ if (h->s->parity.nr < h->redundancy) { -+ nr_have = h->s->parity.nr; -+ -+ ret = bch2_bucket_alloc_set(c, &h->s->parity, -+ &h->parity_stripe, -+ &devs, -+ h->redundancy, -+ &nr_have, -+ &have_cache, -+ RESERVE_NONE, -+ 0, -+ NULL); -+ if (ret) -+ goto err; -+ } -+ -+ if (h->s->blocks.nr < nr_data) { -+ nr_have = h->s->blocks.nr; -+ -+ ret = bch2_bucket_alloc_set(c, &h->s->blocks, -+ &h->block_stripe, -+ &devs, -+ nr_data, -+ &nr_have, -+ &have_cache, -+ RESERVE_NONE, -+ 0, -+ NULL); -+ if (ret) -+ goto err; -+ } -+err: -+ rcu_read_unlock(); -+ percpu_up_read(&c->mark_lock); -+ return ret; -+} -+ -+/* XXX: doesn't obey target: */ -+static s64 get_existing_stripe(struct bch_fs *c, -+ unsigned target, -+ unsigned algo, -+ unsigned redundancy) -+{ -+ ec_stripes_heap *h = &c->ec_stripes_heap; -+ struct stripe *m; -+ size_t heap_idx; -+ u64 stripe_idx; -+ -+ if (may_create_new_stripe(c)) -+ return -1; -+ -+ spin_lock(&c->ec_stripes_heap_lock); -+ for (heap_idx = 0; heap_idx < h->used; heap_idx++) { -+ if (!h->data[heap_idx].blocks_nonempty) -+ continue; -+ -+ stripe_idx = h->data[heap_idx].idx; -+ m = genradix_ptr(&c->stripes[0], stripe_idx); -+ -+ if (m->algorithm == algo && -+ m->nr_redundant == redundancy && -+ m->blocks_nonempty < m->nr_blocks - m->nr_redundant) { -+ bch2_stripes_heap_del(c, m, stripe_idx); -+ spin_unlock(&c->ec_stripes_heap_lock); -+ return stripe_idx; -+ } -+ } -+ -+ spin_unlock(&c->ec_stripes_heap_lock); -+ return -1; -+} -+ -+static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe) -+{ -+ struct btree_trans trans; -+ struct 
btree_iter *iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, idx), BTREE_ITER_SLOTS); -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (!ret) -+ bkey_reassemble(&stripe->key.k_i, k); -+ bch2_trans_exit(&trans); -+ -+ return ret; -+} -+ -+struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, -+ unsigned target, -+ unsigned algo, -+ unsigned redundancy) -+{ -+ struct closure cl; -+ struct ec_stripe_head *h; -+ struct open_bucket *ob; -+ unsigned i, data_idx = 0; -+ s64 idx; -+ -+ closure_init_stack(&cl); -+ -+ h = __bch2_ec_stripe_head_get(c, target, algo, redundancy); -+ if (!h) -+ return NULL; -+ -+ if (!h->s && ec_new_stripe_alloc(c, h)) { -+ bch2_ec_stripe_head_put(c, h); -+ return NULL; -+ } -+ -+ if (!h->s->allocated) { -+ if (!h->s->existing_stripe && -+ (idx = get_existing_stripe(c, target, algo, redundancy)) >= 0) { -+ //pr_info("got existing stripe %llu", idx); -+ -+ h->s->existing_stripe = true; -+ h->s->existing_stripe_idx = idx; -+ if (get_stripe_key(c, idx, &h->s->stripe)) { -+ /* btree error */ -+ BUG(); -+ } -+ -+ for (i = 0; i < h->s->stripe.key.v.nr_blocks; i++) -+ if (stripe_blockcount_get(&h->s->stripe.key.v, i)) { -+ __set_bit(i, h->s->blocks_allocated); -+ ec_block_io(c, &h->s->stripe, READ, i, &cl); -+ } -+ } -+ -+ if (new_stripe_alloc_buckets(c, h)) { -+ bch2_ec_stripe_head_put(c, h); -+ h = NULL; -+ goto out; -+ } -+ -+ open_bucket_for_each(c, &h->s->blocks, ob, i) { -+ data_idx = find_next_zero_bit(h->s->blocks_allocated, -+ h->s->nr_data, data_idx); -+ BUG_ON(data_idx >= h->s->nr_data); -+ -+ h->s->stripe.key.v.ptrs[data_idx] = ob->ptr; -+ h->s->data_block_idx[i] = data_idx; -+ data_idx++; -+ } -+ -+ open_bucket_for_each(c, &h->s->parity, ob, i) -+ h->s->stripe.key.v.ptrs[h->s->nr_data + i] = ob->ptr; -+ -+ //pr_info("new stripe, blocks_allocated %lx", h->s->blocks_allocated[0]); -+ h->s->allocated = true; -+ } -+out: -+ 
closure_sync(&cl); -+ return h; -+} -+ -+void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct ec_stripe_head *h; -+ struct open_bucket *ob; -+ unsigned i; -+ -+ mutex_lock(&c->ec_stripe_head_lock); -+ list_for_each_entry(h, &c->ec_stripe_head_list, list) { -+ -+ mutex_lock(&h->lock); -+ if (!h->s) -+ goto unlock; -+ -+ open_bucket_for_each(c, &h->s->blocks, ob, i) -+ if (ob->ptr.dev == ca->dev_idx) -+ goto found; -+ open_bucket_for_each(c, &h->s->parity, ob, i) -+ if (ob->ptr.dev == ca->dev_idx) -+ goto found; -+ goto unlock; -+found: -+ h->s->err = -EROFS; -+ ec_stripe_set_pending(c, h); -+unlock: -+ mutex_unlock(&h->lock); -+ } -+ mutex_unlock(&c->ec_stripe_head_lock); -+} -+ -+static int __bch2_stripe_write_key(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct stripe *m, -+ size_t idx, -+ struct bkey_i_stripe *new_key) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_s_c k; -+ unsigned i; -+ int ret; -+ -+ bch2_btree_iter_set_pos(iter, POS(0, idx)); -+ -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) -+ return ret; -+ -+ if (k.k->type != KEY_TYPE_stripe) -+ return -EIO; -+ -+ bkey_reassemble(&new_key->k_i, k); -+ -+ spin_lock(&c->ec_stripes_heap_lock); -+ -+ for (i = 0; i < new_key->v.nr_blocks; i++) -+ stripe_blockcount_set(&new_key->v, i, -+ m->block_sectors[i]); -+ m->dirty = false; -+ -+ spin_unlock(&c->ec_stripes_heap_lock); -+ -+ bch2_trans_update(trans, iter, &new_key->k_i, 0); -+ return 0; -+} -+ -+int bch2_stripes_write(struct bch_fs *c, unsigned flags) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct genradix_iter giter; -+ struct bkey_i_stripe *new_key; -+ struct stripe *m; -+ int ret = 0; -+ -+ new_key = kmalloc(255 * sizeof(u64), GFP_KERNEL); -+ BUG_ON(!new_key); -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN, -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); -+ -+ genradix_for_each(&c->stripes[0], giter, m) { -+ if 
(!m->dirty) -+ continue; -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL|flags, -+ __bch2_stripe_write_key(&trans, iter, m, -+ giter.pos, new_key)); -+ -+ if (ret) -+ break; -+ } -+ -+ bch2_trans_exit(&trans); -+ -+ kfree(new_key); -+ -+ return ret; -+} -+ -+static int bch2_stripes_read_fn(struct bch_fs *c, enum btree_id id, -+ unsigned level, struct bkey_s_c k) -+{ -+ int ret = 0; -+ -+ if (k.k->type == KEY_TYPE_stripe) { -+ struct stripe *m; -+ -+ ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?: -+ bch2_mark_key(c, k, 0, 0, NULL, 0, -+ BTREE_TRIGGER_NOATOMIC); -+ if (ret) -+ return ret; -+ -+ spin_lock(&c->ec_stripes_heap_lock); -+ m = genradix_ptr(&c->stripes[0], k.k->p.offset); -+ bch2_stripes_heap_insert(c, m, k.k->p.offset); -+ spin_unlock(&c->ec_stripes_heap_lock); -+ } -+ -+ return ret; -+} -+ -+int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) -+{ -+ int ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_EC, -+ NULL, bch2_stripes_read_fn); -+ if (ret) -+ bch_err(c, "error reading stripes: %i", ret); -+ -+ return ret; -+} -+ -+int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ size_t i, idx = 0; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, U64_MAX), 0); -+ -+ k = bch2_btree_iter_prev(iter); -+ if (!IS_ERR_OR_NULL(k.k)) -+ idx = k.k->p.offset + 1; -+ ret = bch2_trans_exit(&trans); -+ if (ret) -+ return ret; -+ -+ if (!idx) -+ return 0; -+ -+ if (!gc && -+ !init_heap(&c->ec_stripes_heap, roundup_pow_of_two(idx), -+ GFP_KERNEL)) -+ return -ENOMEM; -+#if 0 -+ ret = genradix_prealloc(&c->stripes[gc], idx, GFP_KERNEL); -+#else -+ for (i = 0; i < idx; i++) -+ if (!genradix_ptr_alloc(&c->stripes[gc], i, GFP_KERNEL)) -+ return -ENOMEM; -+#endif -+ return 0; -+} -+ -+void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+ 
ec_stripes_heap *h = &c->ec_stripes_heap; -+ struct stripe *m; -+ size_t i; -+ -+ spin_lock(&c->ec_stripes_heap_lock); -+ for (i = 0; i < min(h->used, 20UL); i++) { -+ m = genradix_ptr(&c->stripes[0], h->data[i].idx); -+ -+ pr_buf(out, "%zu %u/%u+%u\n", h->data[i].idx, -+ h->data[i].blocks_nonempty, -+ m->nr_blocks - m->nr_redundant, -+ m->nr_redundant); -+ } -+ spin_unlock(&c->ec_stripes_heap_lock); -+} -+ -+void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+ struct ec_stripe_head *h; -+ struct ec_stripe_new *s; -+ -+ mutex_lock(&c->ec_stripe_head_lock); -+ list_for_each_entry(h, &c->ec_stripe_head_list, list) { -+ pr_buf(out, "target %u algo %u redundancy %u:\n", -+ h->target, h->algo, h->redundancy); -+ -+ if (h->s) -+ pr_buf(out, "\tpending: blocks %u allocated %u\n", -+ h->s->blocks.nr, -+ bitmap_weight(h->s->blocks_allocated, -+ h->s->blocks.nr)); -+ } -+ mutex_unlock(&c->ec_stripe_head_lock); -+ -+ mutex_lock(&c->ec_stripe_new_lock); -+ list_for_each_entry(s, &c->ec_stripe_new_list, list) { -+ pr_buf(out, "\tin flight: blocks %u allocated %u pin %u\n", -+ s->blocks.nr, -+ bitmap_weight(s->blocks_allocated, -+ s->blocks.nr), -+ atomic_read(&s->pin)); -+ } -+ mutex_unlock(&c->ec_stripe_new_lock); -+} -+ -+void bch2_fs_ec_exit(struct bch_fs *c) -+{ -+ struct ec_stripe_head *h; -+ -+ while (1) { -+ mutex_lock(&c->ec_stripe_head_lock); -+ h = list_first_entry_or_null(&c->ec_stripe_head_list, -+ struct ec_stripe_head, list); -+ if (h) -+ list_del(&h->list); -+ mutex_unlock(&c->ec_stripe_head_lock); -+ if (!h) -+ break; -+ -+ BUG_ON(h->s); -+ kfree(h); -+ } -+ -+ BUG_ON(!list_empty(&c->ec_stripe_new_list)); -+ -+ free_heap(&c->ec_stripes_heap); -+ genradix_free(&c->stripes[0]); -+ bioset_exit(&c->ec_bioset); -+} -+ -+int bch2_fs_ec_init(struct bch_fs *c) -+{ -+ INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work); -+ INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work); -+ -+ return bioset_init(&c->ec_bioset, 1, 
offsetof(struct ec_bio, bio), -+ BIOSET_NEED_BVECS); -+} -diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h -new file mode 100644 -index 000000000000..6db16cf768da ---- /dev/null -+++ b/fs/bcachefs/ec.h -@@ -0,0 +1,169 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_EC_H -+#define _BCACHEFS_EC_H -+ -+#include "ec_types.h" -+#include "keylist_types.h" -+ -+const char *bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c); -+void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, -+ struct bkey_s_c); -+ -+#define bch2_bkey_ops_stripe (struct bkey_ops) { \ -+ .key_invalid = bch2_stripe_invalid, \ -+ .val_to_text = bch2_stripe_to_text, \ -+ .swab = bch2_ptr_swab, \ -+} -+ -+static inline unsigned stripe_csums_per_device(const struct bch_stripe *s) -+{ -+ return DIV_ROUND_UP(le16_to_cpu(s->sectors), -+ 1 << s->csum_granularity_bits); -+} -+ -+static inline unsigned stripe_csum_offset(const struct bch_stripe *s, -+ unsigned dev, unsigned csum_idx) -+{ -+ unsigned csum_bytes = bch_crc_bytes[s->csum_type]; -+ -+ return sizeof(struct bch_stripe) + -+ sizeof(struct bch_extent_ptr) * s->nr_blocks + -+ (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes; -+} -+ -+static inline unsigned stripe_blockcount_offset(const struct bch_stripe *s, -+ unsigned idx) -+{ -+ return stripe_csum_offset(s, s->nr_blocks, 0) + -+ sizeof(u16) * idx; -+} -+ -+static inline unsigned stripe_blockcount_get(const struct bch_stripe *s, -+ unsigned idx) -+{ -+ return le16_to_cpup((void *) s + stripe_blockcount_offset(s, idx)); -+} -+ -+static inline void stripe_blockcount_set(struct bch_stripe *s, -+ unsigned idx, unsigned v) -+{ -+ __le16 *p = (void *) s + stripe_blockcount_offset(s, idx); -+ -+ *p = cpu_to_le16(v); -+} -+ -+static inline unsigned stripe_val_u64s(const struct bch_stripe *s) -+{ -+ return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks), -+ sizeof(u64)); -+} -+ -+static inline void *stripe_csum(struct bch_stripe *s, -+ unsigned dev, unsigned 
csum_idx) -+{ -+ return (void *) s + stripe_csum_offset(s, dev, csum_idx); -+} -+ -+struct bch_read_bio; -+ -+struct ec_stripe_buf { -+ /* might not be buffering the entire stripe: */ -+ unsigned offset; -+ unsigned size; -+ unsigned long valid[BITS_TO_LONGS(EC_STRIPE_MAX)]; -+ -+ void *data[EC_STRIPE_MAX]; -+ -+ union { -+ struct bkey_i_stripe key; -+ u64 pad[255]; -+ }; -+}; -+ -+struct ec_stripe_head; -+ -+struct ec_stripe_new { -+ struct bch_fs *c; -+ struct ec_stripe_head *h; -+ struct mutex lock; -+ struct list_head list; -+ -+ /* counts in flight writes, stripe is created when pin == 0 */ -+ atomic_t pin; -+ -+ int err; -+ -+ u8 nr_data; -+ u8 nr_parity; -+ bool allocated; -+ bool pending; -+ bool existing_stripe; -+ u64 existing_stripe_idx; -+ -+ unsigned long blocks_allocated[BITS_TO_LONGS(EC_STRIPE_MAX)]; -+ -+ struct open_buckets blocks; -+ u8 data_block_idx[EC_STRIPE_MAX]; -+ struct open_buckets parity; -+ -+ struct keylist keys; -+ u64 inline_keys[BKEY_U64s * 8]; -+ -+ struct ec_stripe_buf stripe; -+}; -+ -+struct ec_stripe_head { -+ struct list_head list; -+ struct mutex lock; -+ -+ unsigned target; -+ unsigned algo; -+ unsigned redundancy; -+ -+ struct bch_devs_mask devs; -+ unsigned nr_active_devs; -+ -+ unsigned blocksize; -+ -+ struct dev_stripe_state block_stripe; -+ struct dev_stripe_state parity_stripe; -+ -+ struct ec_stripe_new *s; -+}; -+ -+int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *); -+ -+void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); -+void bch2_ec_add_backpointer(struct bch_fs *, struct write_point *, -+ struct bpos, unsigned); -+ -+void bch2_ec_bucket_written(struct bch_fs *, struct open_bucket *); -+void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); -+ -+int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); -+ -+void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *); -+struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, unsigned, -+ 
unsigned, unsigned); -+ -+void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t); -+void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t); -+void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t); -+ -+void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); -+ -+void bch2_ec_flush_new_stripes(struct bch_fs *); -+ -+struct journal_keys; -+int bch2_stripes_read(struct bch_fs *, struct journal_keys *); -+int bch2_stripes_write(struct bch_fs *, unsigned); -+ -+int bch2_ec_mem_alloc(struct bch_fs *, bool); -+ -+void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *); -+void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *); -+ -+void bch2_fs_ec_exit(struct bch_fs *); -+int bch2_fs_ec_init(struct bch_fs *); -+ -+#endif /* _BCACHEFS_EC_H */ -diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h -new file mode 100644 -index 000000000000..e4d633fca5bf ---- /dev/null -+++ b/fs/bcachefs/ec_types.h -@@ -0,0 +1,39 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_EC_TYPES_H -+#define _BCACHEFS_EC_TYPES_H -+ -+#include -+ -+#define EC_STRIPE_MAX 16 -+ -+struct bch_replicas_padded { -+ struct bch_replicas_entry e; -+ u8 pad[EC_STRIPE_MAX]; -+}; -+ -+struct stripe { -+ size_t heap_idx; -+ -+ u16 sectors; -+ u8 algorithm; -+ -+ u8 nr_blocks; -+ u8 nr_redundant; -+ -+ unsigned alive:1; -+ unsigned dirty:1; -+ unsigned on_heap:1; -+ u8 blocks_nonempty; -+ u16 block_sectors[EC_STRIPE_MAX]; -+ -+ struct bch_replicas_padded r; -+}; -+ -+struct ec_stripe_heap_entry { -+ size_t idx; -+ unsigned blocks_nonempty; -+}; -+ -+typedef HEAP(struct ec_stripe_heap_entry) ec_stripes_heap; -+ -+#endif /* _BCACHEFS_EC_TYPES_H */ -diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c -new file mode 100644 -index 000000000000..cd46706fb6f5 ---- /dev/null -+++ b/fs/bcachefs/error.c -@@ -0,0 +1,172 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "error.h" -+#include "io.h" -+#include 
"super.h" -+ -+#define FSCK_ERR_RATELIMIT_NR 10 -+ -+bool bch2_inconsistent_error(struct bch_fs *c) -+{ -+ set_bit(BCH_FS_ERROR, &c->flags); -+ -+ switch (c->opts.errors) { -+ case BCH_ON_ERROR_CONTINUE: -+ return false; -+ case BCH_ON_ERROR_RO: -+ if (bch2_fs_emergency_read_only(c)) -+ bch_err(c, "emergency read only"); -+ return true; -+ case BCH_ON_ERROR_PANIC: -+ panic(bch2_fmt(c, "panic after error")); -+ return true; -+ default: -+ BUG(); -+ } -+} -+ -+void bch2_fatal_error(struct bch_fs *c) -+{ -+ if (bch2_fs_emergency_read_only(c)) -+ bch_err(c, "emergency read only"); -+} -+ -+void bch2_io_error_work(struct work_struct *work) -+{ -+ struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work); -+ struct bch_fs *c = ca->fs; -+ bool dev; -+ -+ down_write(&c->state_lock); -+ dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_RO, -+ BCH_FORCE_IF_DEGRADED); -+ if (dev -+ ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_RO, -+ BCH_FORCE_IF_DEGRADED) -+ : bch2_fs_emergency_read_only(c)) -+ bch_err(ca, -+ "too many IO errors, setting %s RO", -+ dev ? "device" : "filesystem"); -+ up_write(&c->state_lock); -+} -+ -+void bch2_io_error(struct bch_dev *ca) -+{ -+ //queue_work(system_long_wq, &ca->io_error_work); -+} -+ -+#ifdef __KERNEL__ -+#define ask_yn() false -+#else -+#include "tools-util.h" -+#endif -+ -+enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags, -+ const char *fmt, ...) -+{ -+ struct fsck_err_state *s = NULL; -+ va_list args; -+ bool fix = false, print = true, suppressing = false; -+ char _buf[sizeof(s->buf)], *buf = _buf; -+ -+ if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) { -+ va_start(args, fmt); -+ vprintk(fmt, args); -+ va_end(args); -+ -+ return bch2_inconsistent_error(c) -+ ? 
FSCK_ERR_EXIT -+ : FSCK_ERR_FIX; -+ } -+ -+ mutex_lock(&c->fsck_error_lock); -+ -+ list_for_each_entry(s, &c->fsck_errors, list) -+ if (s->fmt == fmt) -+ goto found; -+ -+ s = kzalloc(sizeof(*s), GFP_NOFS); -+ if (!s) { -+ if (!c->fsck_alloc_err) -+ bch_err(c, "kmalloc err, cannot ratelimit fsck errs"); -+ c->fsck_alloc_err = true; -+ buf = _buf; -+ goto print; -+ } -+ -+ INIT_LIST_HEAD(&s->list); -+ s->fmt = fmt; -+found: -+ list_move(&s->list, &c->fsck_errors); -+ s->nr++; -+ if (c->opts.ratelimit_errors && -+ s->nr >= FSCK_ERR_RATELIMIT_NR) { -+ if (s->nr == FSCK_ERR_RATELIMIT_NR) -+ suppressing = true; -+ else -+ print = false; -+ } -+ buf = s->buf; -+print: -+ va_start(args, fmt); -+ vscnprintf(buf, sizeof(_buf), fmt, args); -+ va_end(args); -+ -+ if (c->opts.fix_errors == FSCK_OPT_EXIT) { -+ bch_err(c, "%s, exiting", buf); -+ } else if (flags & FSCK_CAN_FIX) { -+ if (c->opts.fix_errors == FSCK_OPT_ASK) { -+ printk(KERN_ERR "%s: fix?", buf); -+ fix = ask_yn(); -+ } else if (c->opts.fix_errors == FSCK_OPT_YES || -+ (c->opts.nochanges && -+ !(flags & FSCK_CAN_IGNORE))) { -+ if (print) -+ bch_err(c, "%s, fixing", buf); -+ fix = true; -+ } else { -+ if (print) -+ bch_err(c, "%s, not fixing", buf); -+ fix = false; -+ } -+ } else if (flags & FSCK_NEED_FSCK) { -+ if (print) -+ bch_err(c, "%s (run fsck to correct)", buf); -+ } else { -+ if (print) -+ bch_err(c, "%s (repair unimplemented)", buf); -+ } -+ -+ if (suppressing) -+ bch_err(c, "Ratelimiting new instances of previous error"); -+ -+ mutex_unlock(&c->fsck_error_lock); -+ -+ if (fix) { -+ set_bit(BCH_FS_ERRORS_FIXED, &c->flags); -+ return FSCK_ERR_FIX; -+ } else { -+ set_bit(BCH_FS_ERROR, &c->flags); -+ return c->opts.fix_errors == FSCK_OPT_EXIT || -+ !(flags & FSCK_CAN_IGNORE) -+ ? 
FSCK_ERR_EXIT -+ : FSCK_ERR_IGNORE; -+ } -+} -+ -+void bch2_flush_fsck_errs(struct bch_fs *c) -+{ -+ struct fsck_err_state *s, *n; -+ -+ mutex_lock(&c->fsck_error_lock); -+ -+ list_for_each_entry_safe(s, n, &c->fsck_errors, list) { -+ if (s->ratelimited) -+ bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->buf); -+ -+ list_del(&s->list); -+ kfree(s); -+ } -+ -+ mutex_unlock(&c->fsck_error_lock); -+} -diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h -new file mode 100644 -index 000000000000..94b53312fbbd ---- /dev/null -+++ b/fs/bcachefs/error.h -@@ -0,0 +1,211 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_ERROR_H -+#define _BCACHEFS_ERROR_H -+ -+#include -+#include -+ -+struct bch_dev; -+struct bch_fs; -+struct work_struct; -+ -+/* -+ * XXX: separate out errors that indicate on disk data is inconsistent, and flag -+ * superblock as such -+ */ -+ -+/* Error messages: */ -+ -+/* -+ * Inconsistency errors: The on disk data is inconsistent. If these occur during -+ * initial recovery, they don't indicate a bug in the running code - we walk all -+ * the metadata before modifying anything. If they occur at runtime, they -+ * indicate either a bug in the running code or (less likely) data is being -+ * silently corrupted under us. -+ * -+ * XXX: audit all inconsistent errors and make sure they're all recoverable, in -+ * BCH_ON_ERROR_CONTINUE mode -+ */ -+ -+bool bch2_inconsistent_error(struct bch_fs *); -+ -+#define bch2_fs_inconsistent(c, ...) \ -+({ \ -+ bch_err(c, __VA_ARGS__); \ -+ bch2_inconsistent_error(c); \ -+}) -+ -+#define bch2_fs_inconsistent_on(cond, c, ...) \ -+({ \ -+ int _ret = !!(cond); \ -+ \ -+ if (_ret) \ -+ bch2_fs_inconsistent(c, __VA_ARGS__); \ -+ _ret; \ -+}) -+ -+/* -+ * Later we might want to mark only the particular device inconsistent, not the -+ * entire filesystem: -+ */ -+ -+#define bch2_dev_inconsistent(ca, ...) 
\ -+do { \ -+ bch_err(ca, __VA_ARGS__); \ -+ bch2_inconsistent_error((ca)->fs); \ -+} while (0) -+ -+#define bch2_dev_inconsistent_on(cond, ca, ...) \ -+({ \ -+ int _ret = !!(cond); \ -+ \ -+ if (_ret) \ -+ bch2_dev_inconsistent(ca, __VA_ARGS__); \ -+ _ret; \ -+}) -+ -+/* -+ * Fsck errors: inconsistency errors we detect at mount time, and should ideally -+ * be able to repair: -+ */ -+ -+enum { -+ BCH_FSCK_OK = 0, -+ BCH_FSCK_ERRORS_NOT_FIXED = 1, -+ BCH_FSCK_REPAIR_UNIMPLEMENTED = 2, -+ BCH_FSCK_REPAIR_IMPOSSIBLE = 3, -+ BCH_FSCK_UNKNOWN_VERSION = 4, -+}; -+ -+enum fsck_err_opts { -+ FSCK_OPT_EXIT, -+ FSCK_OPT_YES, -+ FSCK_OPT_NO, -+ FSCK_OPT_ASK, -+}; -+ -+enum fsck_err_ret { -+ FSCK_ERR_IGNORE = 0, -+ FSCK_ERR_FIX = 1, -+ FSCK_ERR_EXIT = 2, -+}; -+ -+struct fsck_err_state { -+ struct list_head list; -+ const char *fmt; -+ u64 nr; -+ bool ratelimited; -+ char buf[512]; -+}; -+ -+#define FSCK_CAN_FIX (1 << 0) -+#define FSCK_CAN_IGNORE (1 << 1) -+#define FSCK_NEED_FSCK (1 << 2) -+ -+__printf(3, 4) __cold -+enum fsck_err_ret bch2_fsck_err(struct bch_fs *, -+ unsigned, const char *, ...); -+void bch2_flush_fsck_errs(struct bch_fs *); -+ -+#define __fsck_err(c, _flags, msg, ...) \ -+({ \ -+ int _fix = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__);\ -+ \ -+ if (_fix == FSCK_ERR_EXIT) { \ -+ bch_err(c, "Unable to continue, halting"); \ -+ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ -+ goto fsck_err; \ -+ } \ -+ \ -+ _fix; \ -+}) -+ -+/* These macros return true if error should be fixed: */ -+ -+/* XXX: mark in superblock that filesystem contains errors, if we ignore: */ -+ -+#define __fsck_err_on(cond, c, _flags, ...) \ -+ ((cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false) -+ -+#define need_fsck_err_on(cond, c, ...) \ -+ __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) -+ -+#define need_fsck_err(c, ...) \ -+ __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) -+ -+#define mustfix_fsck_err(c, ...) 
\ -+ __fsck_err(c, FSCK_CAN_FIX, ##__VA_ARGS__) -+ -+#define mustfix_fsck_err_on(cond, c, ...) \ -+ __fsck_err_on(cond, c, FSCK_CAN_FIX, ##__VA_ARGS__) -+ -+#define fsck_err(c, ...) \ -+ __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__) -+ -+#define fsck_err_on(cond, c, ...) \ -+ __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__) -+ -+/* -+ * Fatal errors: these don't indicate a bug, but we can't continue running in RW -+ * mode - pretty much just due to metadata IO errors: -+ */ -+ -+void bch2_fatal_error(struct bch_fs *); -+ -+#define bch2_fs_fatal_error(c, ...) \ -+do { \ -+ bch_err(c, __VA_ARGS__); \ -+ bch2_fatal_error(c); \ -+} while (0) -+ -+#define bch2_fs_fatal_err_on(cond, c, ...) \ -+({ \ -+ int _ret = !!(cond); \ -+ \ -+ if (_ret) \ -+ bch2_fs_fatal_error(c, __VA_ARGS__); \ -+ _ret; \ -+}) -+ -+/* -+ * IO errors: either recoverable metadata IO (because we have replicas), or data -+ * IO - we need to log it and print out a message, but we don't (necessarily) -+ * want to shut down the fs: -+ */ -+ -+void bch2_io_error_work(struct work_struct *); -+ -+/* Does the error handling without logging a message */ -+void bch2_io_error(struct bch_dev *); -+ -+/* Logs message and handles the error: */ -+#define bch2_dev_io_error(ca, fmt, ...) \ -+do { \ -+ printk_ratelimited(KERN_ERR bch2_fmt((ca)->fs, \ -+ "IO error on %s for " fmt), \ -+ (ca)->name, ##__VA_ARGS__); \ -+ bch2_io_error(ca); \ -+} while (0) -+ -+#define bch2_dev_io_err_on(cond, ca, ...) \ -+({ \ -+ bool _ret = (cond); \ -+ \ -+ if (_ret) \ -+ bch2_dev_io_error(ca, __VA_ARGS__); \ -+ _ret; \ -+}) -+ -+/* kill? */ -+ -+#define __bcache_io_error(c, fmt, ...) \ -+ printk_ratelimited(KERN_ERR bch2_fmt(c, \ -+ "IO error: " fmt), ##__VA_ARGS__) -+ -+#define bcache_io_error(c, bio, fmt, ...) 
\ -+do { \ -+ __bcache_io_error(c, fmt, ##__VA_ARGS__); \ -+ (bio)->bi_status = BLK_STS_IOERR; \ -+} while (0) -+ -+#endif /* _BCACHEFS_ERROR_H */ -diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c -new file mode 100644 -index 000000000000..fd011df3cb99 ---- /dev/null -+++ b/fs/bcachefs/extent_update.c -@@ -0,0 +1,229 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "bkey_on_stack.h" -+#include "btree_update.h" -+#include "btree_update_interior.h" -+#include "buckets.h" -+#include "debug.h" -+#include "extents.h" -+#include "extent_update.h" -+ -+/* -+ * This counts the number of iterators to the alloc & ec btrees we'll need -+ * inserting/removing this extent: -+ */ -+static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ unsigned ret = 0; -+ -+ bkey_extent_entry_for_each(ptrs, entry) { -+ switch (__extent_entry_type(entry)) { -+ case BCH_EXTENT_ENTRY_ptr: -+ case BCH_EXTENT_ENTRY_stripe_ptr: -+ ret++; -+ } -+ } -+ -+ return ret; -+} -+ -+static int count_iters_for_insert(struct btree_trans *trans, -+ struct bkey_s_c k, -+ unsigned offset, -+ struct bpos *end, -+ unsigned *nr_iters, -+ unsigned max_iters) -+{ -+ int ret = 0, ret2 = 0; -+ -+ if (*nr_iters >= max_iters) { -+ *end = bpos_min(*end, k.k->p); -+ ret = 1; -+ } -+ -+ switch (k.k->type) { -+ case KEY_TYPE_extent: -+ case KEY_TYPE_reflink_v: -+ *nr_iters += bch2_bkey_nr_alloc_ptrs(k); -+ -+ if (*nr_iters >= max_iters) { -+ *end = bpos_min(*end, k.k->p); -+ ret = 1; -+ } -+ -+ break; -+ case KEY_TYPE_reflink_p: { -+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); -+ u64 idx = le64_to_cpu(p.v->idx); -+ unsigned sectors = bpos_min(*end, p.k->p).offset - -+ bkey_start_offset(p.k); -+ struct btree_iter *iter; -+ struct bkey_s_c r_k; -+ -+ for_each_btree_key(trans, iter, -+ BTREE_ID_REFLINK, POS(0, idx + offset), -+ BTREE_ITER_SLOTS, r_k, ret2) { -+ if 
(bkey_cmp(bkey_start_pos(r_k.k), -+ POS(0, idx + sectors)) >= 0) -+ break; -+ -+ /* extent_update_to_keys(), for the reflink_v update */ -+ *nr_iters += 1; -+ -+ *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k); -+ -+ if (*nr_iters >= max_iters) { -+ struct bpos pos = bkey_start_pos(k.k); -+ pos.offset += min_t(u64, k.k->size, -+ r_k.k->p.offset - idx); -+ -+ *end = bpos_min(*end, pos); -+ ret = 1; -+ break; -+ } -+ } -+ -+ bch2_trans_iter_put(trans, iter); -+ break; -+ } -+ } -+ -+ return ret2 ?: ret; -+} -+ -+#define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3) -+ -+int bch2_extent_atomic_end(struct btree_iter *iter, -+ struct bkey_i *insert, -+ struct bpos *end) -+{ -+ struct btree_trans *trans = iter->trans; -+ struct btree *b; -+ struct btree_node_iter node_iter; -+ struct bkey_packed *_k; -+ unsigned nr_iters = 0; -+ int ret; -+ -+ ret = bch2_btree_iter_traverse(iter); -+ if (ret) -+ return ret; -+ -+ b = iter->l[0].b; -+ node_iter = iter->l[0].iter; -+ -+ BUG_ON(bkey_cmp(b->data->min_key, POS_MIN) && -+ bkey_cmp(bkey_start_pos(&insert->k), -+ bkey_predecessor(b->data->min_key)) < 0); -+ -+ *end = bpos_min(insert->k.p, b->key.k.p); -+ -+ /* extent_update_to_keys(): */ -+ nr_iters += 1; -+ -+ ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end, -+ &nr_iters, EXTENT_ITERS_MAX / 2); -+ if (ret < 0) -+ return ret; -+ -+ while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { -+ struct bkey unpacked; -+ struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); -+ unsigned offset = 0; -+ -+ if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0) -+ break; -+ -+ if (bkey_cmp(bkey_start_pos(&insert->k), -+ bkey_start_pos(k.k)) > 0) -+ offset = bkey_start_offset(&insert->k) - -+ bkey_start_offset(k.k); -+ -+ /* extent_handle_overwrites(): */ -+ switch (bch2_extent_overlap(&insert->k, k.k)) { -+ case BCH_EXTENT_OVERLAP_ALL: -+ case BCH_EXTENT_OVERLAP_FRONT: -+ nr_iters += 1; -+ break; -+ case BCH_EXTENT_OVERLAP_BACK: -+ case BCH_EXTENT_OVERLAP_MIDDLE: -+ nr_iters += 2; -+ 
break; -+ } -+ -+ ret = count_iters_for_insert(trans, k, offset, end, -+ &nr_iters, EXTENT_ITERS_MAX); -+ if (ret) -+ break; -+ -+ bch2_btree_node_iter_advance(&node_iter, b); -+ } -+ -+ return ret < 0 ? ret : 0; -+} -+ -+int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) -+{ -+ struct bpos end; -+ int ret; -+ -+ ret = bch2_extent_atomic_end(iter, k, &end); -+ if (ret) -+ return ret; -+ -+ bch2_cut_back(end, k); -+ return 0; -+} -+ -+int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) -+{ -+ struct bpos end; -+ int ret; -+ -+ ret = bch2_extent_atomic_end(iter, k, &end); -+ if (ret) -+ return ret; -+ -+ return !bkey_cmp(end, k->k.p); -+} -+ -+enum btree_insert_ret -+bch2_extent_can_insert(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_i *insert) -+{ -+ struct btree_iter_level *l = &iter->l[0]; -+ struct btree_node_iter node_iter = l->iter; -+ struct bkey_packed *_k; -+ struct bkey_s_c k; -+ struct bkey unpacked; -+ int sectors; -+ -+ _k = bch2_btree_node_iter_peek(&node_iter, l->b); -+ if (!_k) -+ return BTREE_INSERT_OK; -+ -+ k = bkey_disassemble(l->b, _k, &unpacked); -+ -+ /* Check if we're splitting a compressed extent: */ -+ -+ if (bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k)) > 0 && -+ bkey_cmp(insert->k.p, k.k->p) < 0 && -+ (sectors = bch2_bkey_sectors_compressed(k))) { -+ int flags = trans->flags & BTREE_INSERT_NOFAIL -+ ? 
BCH_DISK_RESERVATION_NOFAIL : 0; -+ -+ switch (bch2_disk_reservation_add(trans->c, trans->disk_res, -+ sectors, flags)) { -+ case 0: -+ break; -+ case -ENOSPC: -+ return BTREE_INSERT_ENOSPC; -+ default: -+ BUG(); -+ } -+ } -+ -+ return BTREE_INSERT_OK; -+} -diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h -new file mode 100644 -index 000000000000..38dc084627d2 ---- /dev/null -+++ b/fs/bcachefs/extent_update.h -@@ -0,0 +1,16 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_EXTENT_UPDATE_H -+#define _BCACHEFS_EXTENT_UPDATE_H -+ -+#include "bcachefs.h" -+ -+int bch2_extent_atomic_end(struct btree_iter *, struct bkey_i *, -+ struct bpos *); -+int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); -+int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *); -+ -+enum btree_insert_ret -+bch2_extent_can_insert(struct btree_trans *, struct btree_iter *, -+ struct bkey_i *); -+ -+#endif /* _BCACHEFS_EXTENT_UPDATE_H */ -diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c -new file mode 100644 -index 000000000000..568f039edcff ---- /dev/null -+++ b/fs/bcachefs/extents.c -@@ -0,0 +1,1258 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Copyright (C) 2010 Kent Overstreet -+ * -+ * Code for managing the extent btree and dynamically updating the writeback -+ * dirty sector count. 
-+ */ -+ -+#include "bcachefs.h" -+#include "bkey_methods.h" -+#include "btree_gc.h" -+#include "btree_io.h" -+#include "btree_iter.h" -+#include "buckets.h" -+#include "checksum.h" -+#include "debug.h" -+#include "disk_groups.h" -+#include "error.h" -+#include "extents.h" -+#include "inode.h" -+#include "journal.h" -+#include "replicas.h" -+#include "super.h" -+#include "super-io.h" -+#include "util.h" -+ -+#include -+ -+static unsigned bch2_crc_field_size_max[] = { -+ [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, -+ [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, -+ [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX, -+}; -+ -+static void bch2_extent_crc_pack(union bch_extent_crc *, -+ struct bch_extent_crc_unpacked, -+ enum bch_extent_entry_type); -+ -+static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f, -+ unsigned dev) -+{ -+ struct bch_dev_io_failures *i; -+ -+ for (i = f->devs; i < f->devs + f->nr; i++) -+ if (i->dev == dev) -+ return i; -+ -+ return NULL; -+} -+ -+void bch2_mark_io_failure(struct bch_io_failures *failed, -+ struct extent_ptr_decoded *p) -+{ -+ struct bch_dev_io_failures *f = dev_io_failures(failed, p->ptr.dev); -+ -+ if (!f) { -+ BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); -+ -+ f = &failed->devs[failed->nr++]; -+ f->dev = p->ptr.dev; -+ f->idx = p->idx; -+ f->nr_failed = 1; -+ f->nr_retries = 0; -+ } else if (p->idx != f->idx) { -+ f->idx = p->idx; -+ f->nr_failed = 1; -+ f->nr_retries = 0; -+ } else { -+ f->nr_failed++; -+ } -+} -+ -+/* -+ * returns true if p1 is better than p2: -+ */ -+static inline bool ptr_better(struct bch_fs *c, -+ const struct extent_ptr_decoded p1, -+ const struct extent_ptr_decoded p2) -+{ -+ if (likely(!p1.idx && !p2.idx)) { -+ struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev); -+ struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev); -+ -+ u64 l1 = atomic64_read(&dev1->cur_latency[READ]); -+ u64 l2 = atomic64_read(&dev2->cur_latency[READ]); -+ -+ /* Pick at random, biased in favor of 
the faster device: */ -+ -+ return bch2_rand_range(l1 + l2) > l1; -+ } -+ -+ if (force_reconstruct_read(c)) -+ return p1.idx > p2.idx; -+ -+ return p1.idx < p2.idx; -+} -+ -+/* -+ * This picks a non-stale pointer, preferably from a device other than @avoid. -+ * Avoid can be NULL, meaning pick any. If there are no non-stale pointers to -+ * other devices, it will still pick a pointer from avoid. -+ */ -+int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, -+ struct bch_io_failures *failed, -+ struct extent_ptr_decoded *pick) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ struct bch_dev_io_failures *f; -+ struct bch_dev *ca; -+ int ret = 0; -+ -+ if (k.k->type == KEY_TYPE_error) -+ return -EIO; -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ ca = bch_dev_bkey_exists(c, p.ptr.dev); -+ -+ /* -+ * If there are any dirty pointers it's an error if we can't -+ * read: -+ */ -+ if (!ret && !p.ptr.cached) -+ ret = -EIO; -+ -+ if (p.ptr.cached && ptr_stale(ca, &p.ptr)) -+ continue; -+ -+ f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL; -+ if (f) -+ p.idx = f->nr_failed < f->nr_retries -+ ? 
f->idx -+ : f->idx + 1; -+ -+ if (!p.idx && -+ !bch2_dev_is_readable(ca)) -+ p.idx++; -+ -+ if (force_reconstruct_read(c) && -+ !p.idx && p.has_ec) -+ p.idx++; -+ -+ if (p.idx >= (unsigned) p.has_ec + 1) -+ continue; -+ -+ if (ret > 0 && !ptr_better(c, p, *pick)) -+ continue; -+ -+ *pick = p; -+ ret = 1; -+ } -+ -+ return ret; -+} -+ -+/* KEY_TYPE_btree_ptr: */ -+ -+const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) -+{ -+ if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) -+ return "value too big"; -+ -+ return bch2_bkey_ptrs_invalid(c, k); -+} -+ -+void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const struct bch_extent_ptr *ptr; -+ const char *err; -+ char buf[160]; -+ struct bucket_mark mark; -+ struct bch_dev *ca; -+ -+ if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) -+ return; -+ -+ if (!percpu_down_read_trylock(&c->mark_lock)) -+ return; -+ -+ bkey_for_each_ptr(ptrs, ptr) { -+ ca = bch_dev_bkey_exists(c, ptr->dev); -+ -+ mark = ptr_bucket_mark(ca, ptr); -+ -+ err = "stale"; -+ if (gen_after(mark.gen, ptr->gen)) -+ goto err; -+ -+ err = "inconsistent"; -+ if (mark.data_type != BCH_DATA_btree || -+ mark.dirty_sectors < c->opts.btree_node_size) -+ goto err; -+ } -+out: -+ percpu_up_read(&c->mark_lock); -+ return; -+err: -+ bch2_fs_inconsistent(c, "%s btree pointer %s: bucket %zi gen %i mark %08x", -+ err, (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), -+ PTR_BUCKET_NR(ca, ptr), -+ mark.gen, (unsigned) mark.v.counter); -+ goto out; -+} -+ -+void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ bch2_bkey_ptrs_to_text(out, c, k); -+} -+ -+void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); -+ -+ pr_buf(out, "seq %llx sectors %u written %u min_key ", -+ le64_to_cpu(bp.v->seq), -+ le16_to_cpu(bp.v->sectors), 
-+ le16_to_cpu(bp.v->sectors_written)); -+ -+ bch2_bpos_to_text(out, bp.v->min_key); -+ pr_buf(out, " "); -+ bch2_bkey_ptrs_to_text(out, c, k); -+} -+ -+void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version, -+ unsigned big_endian, int write, -+ struct bkey_s k) -+{ -+ struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(k); -+ -+ compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key); -+ -+ if (version < bcachefs_metadata_version_inode_btree_change && -+ btree_node_type_is_extents(btree_id) && -+ bkey_cmp(bp.v->min_key, POS_MIN)) -+ bp.v->min_key = write -+ ? bkey_predecessor(bp.v->min_key) -+ : bkey_successor(bp.v->min_key); -+} -+ -+/* KEY_TYPE_extent: */ -+ -+const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) -+{ -+ return bch2_bkey_ptrs_invalid(c, k); -+} -+ -+void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ char buf[160]; -+ -+ if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) || -+ !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) -+ return; -+ -+ if (!percpu_down_read_trylock(&c->mark_lock)) -+ return; -+ -+ extent_for_each_ptr_decode(e, p, entry) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); -+ struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr); -+ unsigned stale = gen_after(mark.gen, p.ptr.gen); -+ unsigned disk_sectors = ptr_disk_sectors(p); -+ unsigned mark_sectors = p.ptr.cached -+ ? 
mark.cached_sectors -+ : mark.dirty_sectors; -+ -+ bch2_fs_inconsistent_on(stale && !p.ptr.cached, c, -+ "stale dirty pointer (ptr gen %u bucket %u", -+ p.ptr.gen, mark.gen); -+ -+ bch2_fs_inconsistent_on(stale > 96, c, -+ "key too stale: %i", stale); -+ -+ bch2_fs_inconsistent_on(!stale && -+ (mark.data_type != BCH_DATA_user || -+ mark_sectors < disk_sectors), c, -+ "extent pointer not marked: %s:\n" -+ "type %u sectors %u < %u", -+ (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf), -+ mark.data_type, -+ mark_sectors, disk_sectors); -+ } -+ -+ percpu_up_read(&c->mark_lock); -+} -+ -+void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ bch2_bkey_ptrs_to_text(out, c, k); -+} -+ -+enum merge_result bch2_extent_merge(struct bch_fs *c, -+ struct bkey_s _l, struct bkey_s _r) -+{ -+ struct bkey_s_extent l = bkey_s_to_extent(_l); -+ struct bkey_s_extent r = bkey_s_to_extent(_r); -+ union bch_extent_entry *en_l = l.v->start; -+ union bch_extent_entry *en_r = r.v->start; -+ struct bch_extent_crc_unpacked crc_l, crc_r; -+ -+ if (bkey_val_u64s(l.k) != bkey_val_u64s(r.k)) -+ return BCH_MERGE_NOMERGE; -+ -+ crc_l = bch2_extent_crc_unpack(l.k, NULL); -+ -+ extent_for_each_entry(l, en_l) { -+ en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); -+ -+ if (extent_entry_type(en_l) != extent_entry_type(en_r)) -+ return BCH_MERGE_NOMERGE; -+ -+ switch (extent_entry_type(en_l)) { -+ case BCH_EXTENT_ENTRY_ptr: { -+ const struct bch_extent_ptr *lp = &en_l->ptr; -+ const struct bch_extent_ptr *rp = &en_r->ptr; -+ struct bch_dev *ca; -+ -+ if (lp->offset + crc_l.compressed_size != rp->offset || -+ lp->dev != rp->dev || -+ lp->gen != rp->gen) -+ return BCH_MERGE_NOMERGE; -+ -+ /* We don't allow extents to straddle buckets: */ -+ ca = bch_dev_bkey_exists(c, lp->dev); -+ -+ if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp)) -+ return BCH_MERGE_NOMERGE; -+ -+ break; -+ } -+ case BCH_EXTENT_ENTRY_stripe_ptr: -+ if (en_l->stripe_ptr.block != 
en_r->stripe_ptr.block || -+ en_l->stripe_ptr.idx != en_r->stripe_ptr.idx) -+ return BCH_MERGE_NOMERGE; -+ break; -+ case BCH_EXTENT_ENTRY_crc32: -+ case BCH_EXTENT_ENTRY_crc64: -+ case BCH_EXTENT_ENTRY_crc128: -+ crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); -+ crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); -+ -+ if (crc_l.csum_type != crc_r.csum_type || -+ crc_l.compression_type != crc_r.compression_type || -+ crc_l.nonce != crc_r.nonce) -+ return BCH_MERGE_NOMERGE; -+ -+ if (crc_l.offset + crc_l.live_size != crc_l.compressed_size || -+ crc_r.offset) -+ return BCH_MERGE_NOMERGE; -+ -+ if (!bch2_checksum_mergeable(crc_l.csum_type)) -+ return BCH_MERGE_NOMERGE; -+ -+ if (crc_is_compressed(crc_l)) -+ return BCH_MERGE_NOMERGE; -+ -+ if (crc_l.csum_type && -+ crc_l.uncompressed_size + -+ crc_r.uncompressed_size > c->sb.encoded_extent_max) -+ return BCH_MERGE_NOMERGE; -+ -+ if (crc_l.uncompressed_size + crc_r.uncompressed_size > -+ bch2_crc_field_size_max[extent_entry_type(en_l)]) -+ return BCH_MERGE_NOMERGE; -+ -+ break; -+ default: -+ return BCH_MERGE_NOMERGE; -+ } -+ } -+ -+ extent_for_each_entry(l, en_l) { -+ struct bch_extent_crc_unpacked crc_l, crc_r; -+ -+ en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); -+ -+ if (!extent_entry_is_crc(en_l)) -+ continue; -+ -+ crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); -+ crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); -+ -+ crc_l.csum = bch2_checksum_merge(crc_l.csum_type, -+ crc_l.csum, -+ crc_r.csum, -+ crc_r.uncompressed_size << 9); -+ -+ crc_l.uncompressed_size += crc_r.uncompressed_size; -+ crc_l.compressed_size += crc_r.compressed_size; -+ -+ bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, -+ extent_entry_type(en_l)); -+ } -+ -+ bch2_key_resize(l.k, l.k->size + r.k->size); -+ -+ return BCH_MERGE_MERGE; -+} -+ -+/* KEY_TYPE_reservation: */ -+ -+const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bkey_s_c_reservation r = 
bkey_s_c_to_reservation(k); -+ -+ if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) -+ return "incorrect value size"; -+ -+ if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) -+ return "invalid nr_replicas"; -+ -+ return NULL; -+} -+ -+void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); -+ -+ pr_buf(out, "generation %u replicas %u", -+ le32_to_cpu(r.v->generation), -+ r.v->nr_replicas); -+} -+ -+enum merge_result bch2_reservation_merge(struct bch_fs *c, -+ struct bkey_s _l, struct bkey_s _r) -+{ -+ struct bkey_s_reservation l = bkey_s_to_reservation(_l); -+ struct bkey_s_reservation r = bkey_s_to_reservation(_r); -+ -+ if (l.v->generation != r.v->generation || -+ l.v->nr_replicas != r.v->nr_replicas) -+ return BCH_MERGE_NOMERGE; -+ -+ if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { -+ bch2_key_resize(l.k, KEY_SIZE_MAX); -+ bch2_cut_front_s(l.k->p, r.s); -+ return BCH_MERGE_PARTIAL; -+ } -+ -+ bch2_key_resize(l.k, l.k->size + r.k->size); -+ -+ return BCH_MERGE_MERGE; -+} -+ -+/* Extent checksum entries: */ -+ -+/* returns true if not equal */ -+static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, -+ struct bch_extent_crc_unpacked r) -+{ -+ return (l.csum_type != r.csum_type || -+ l.compression_type != r.compression_type || -+ l.compressed_size != r.compressed_size || -+ l.uncompressed_size != r.uncompressed_size || -+ l.offset != r.offset || -+ l.live_size != r.live_size || -+ l.nonce != r.nonce || -+ bch2_crc_cmp(l.csum, r.csum)); -+} -+ -+static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u, -+ struct bch_extent_crc_unpacked n) -+{ -+ return !crc_is_compressed(u) && -+ u.csum_type && -+ u.uncompressed_size > u.live_size && -+ bch2_csum_type_is_encryption(u.csum_type) == -+ bch2_csum_type_is_encryption(n.csum_type); -+} -+ -+bool bch2_can_narrow_extent_crcs(struct bkey_s_c k, -+ struct 
bch_extent_crc_unpacked n) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ struct bch_extent_crc_unpacked crc; -+ const union bch_extent_entry *i; -+ -+ if (!n.csum_type) -+ return false; -+ -+ bkey_for_each_crc(k.k, ptrs, crc, i) -+ if (can_narrow_crc(crc, n)) -+ return true; -+ -+ return false; -+} -+ -+/* -+ * We're writing another replica for this extent, so while we've got the data in -+ * memory we'll be computing a new checksum for the currently live data. -+ * -+ * If there are other replicas we aren't moving, and they are checksummed but -+ * not compressed, we can modify them to point to only the data that is -+ * currently live (so that readers won't have to bounce) while we've got the -+ * checksum we need: -+ */ -+bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n) -+{ -+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); -+ struct bch_extent_crc_unpacked u; -+ struct extent_ptr_decoded p; -+ union bch_extent_entry *i; -+ bool ret = false; -+ -+ /* Find a checksum entry that covers only live data: */ -+ if (!n.csum_type) { -+ bkey_for_each_crc(&k->k, ptrs, u, i) -+ if (!crc_is_compressed(u) && -+ u.csum_type && -+ u.live_size == u.uncompressed_size) { -+ n = u; -+ goto found; -+ } -+ return false; -+ } -+found: -+ BUG_ON(crc_is_compressed(n)); -+ BUG_ON(n.offset); -+ BUG_ON(n.live_size != k->k.size); -+ -+restart_narrow_pointers: -+ ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); -+ -+ bkey_for_each_ptr_decode(&k->k, ptrs, p, i) -+ if (can_narrow_crc(p.crc, n)) { -+ bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr); -+ p.ptr.offset += p.crc.offset; -+ p.crc = n; -+ bch2_extent_ptr_decoded_append(k, &p); -+ ret = true; -+ goto restart_narrow_pointers; -+ } -+ -+ return ret; -+} -+ -+static void bch2_extent_crc_pack(union bch_extent_crc *dst, -+ struct bch_extent_crc_unpacked src, -+ enum bch_extent_entry_type type) -+{ -+#define set_common_fields(_dst, _src) \ -+ _dst.type = 1 << type; \ -+ _dst.csum_type = _src.csum_type, \ -+ 
_dst.compression_type = _src.compression_type, \ -+ _dst._compressed_size = _src.compressed_size - 1, \ -+ _dst._uncompressed_size = _src.uncompressed_size - 1, \ -+ _dst.offset = _src.offset -+ -+ switch (type) { -+ case BCH_EXTENT_ENTRY_crc32: -+ set_common_fields(dst->crc32, src); -+ dst->crc32.csum = *((__le32 *) &src.csum.lo); -+ break; -+ case BCH_EXTENT_ENTRY_crc64: -+ set_common_fields(dst->crc64, src); -+ dst->crc64.nonce = src.nonce; -+ dst->crc64.csum_lo = src.csum.lo; -+ dst->crc64.csum_hi = *((__le16 *) &src.csum.hi); -+ break; -+ case BCH_EXTENT_ENTRY_crc128: -+ set_common_fields(dst->crc128, src); -+ dst->crc128.nonce = src.nonce; -+ dst->crc128.csum = src.csum; -+ break; -+ default: -+ BUG(); -+ } -+#undef set_common_fields -+} -+ -+void bch2_extent_crc_append(struct bkey_i *k, -+ struct bch_extent_crc_unpacked new) -+{ -+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); -+ union bch_extent_crc *crc = (void *) ptrs.end; -+ enum bch_extent_entry_type type; -+ -+ if (bch_crc_bytes[new.csum_type] <= 4 && -+ new.uncompressed_size <= CRC32_SIZE_MAX && -+ new.nonce <= CRC32_NONCE_MAX) -+ type = BCH_EXTENT_ENTRY_crc32; -+ else if (bch_crc_bytes[new.csum_type] <= 10 && -+ new.uncompressed_size <= CRC64_SIZE_MAX && -+ new.nonce <= CRC64_NONCE_MAX) -+ type = BCH_EXTENT_ENTRY_crc64; -+ else if (bch_crc_bytes[new.csum_type] <= 16 && -+ new.uncompressed_size <= CRC128_SIZE_MAX && -+ new.nonce <= CRC128_NONCE_MAX) -+ type = BCH_EXTENT_ENTRY_crc128; -+ else -+ BUG(); -+ -+ bch2_extent_crc_pack(crc, new, type); -+ -+ k->k.u64s += extent_entry_u64s(ptrs.end); -+ -+ EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX); -+} -+ -+/* Generic code for keys with pointers: */ -+ -+unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k) -+{ -+ return bch2_bkey_devs(k).nr; -+} -+ -+unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k) -+{ -+ return k.k->type == KEY_TYPE_reservation -+ ? 
bkey_s_c_to_reservation(k).v->nr_replicas -+ : bch2_bkey_dirty_devs(k).nr; -+} -+ -+unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k) -+{ -+ unsigned ret = 0; -+ -+ if (k.k->type == KEY_TYPE_reservation) { -+ ret = bkey_s_c_to_reservation(k).v->nr_replicas; -+ } else { -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) -+ ret += !p.ptr.cached && !crc_is_compressed(p.crc); -+ } -+ -+ return ret; -+} -+ -+unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ unsigned ret = 0; -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) -+ if (!p.ptr.cached && crc_is_compressed(p.crc)) -+ ret += p.crc.compressed_size; -+ -+ return ret; -+} -+ -+bool bch2_bkey_is_incompressible(struct bkey_s_c k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct bch_extent_crc_unpacked crc; -+ -+ bkey_for_each_crc(k.k, ptrs, crc, entry) -+ if (crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) -+ return true; -+ return false; -+} -+ -+bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, -+ unsigned nr_replicas) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bpos end = pos; -+ struct bkey_s_c k; -+ bool ret = true; -+ int err; -+ -+ end.offset += size; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, pos, -+ BTREE_ITER_SLOTS, k, err) { -+ if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) -+ break; -+ -+ if (nr_replicas > bch2_bkey_nr_ptrs_fully_allocated(k)) { -+ ret = false; -+ break; -+ } -+ } -+ bch2_trans_exit(&trans); -+ -+ return ret; -+} -+ -+static unsigned bch2_extent_ptr_durability(struct bch_fs *c, -+ struct extent_ptr_decoded p) -+{ -+ unsigned durability = 0; -+ struct 
bch_dev *ca; -+ -+ if (p.ptr.cached) -+ return 0; -+ -+ ca = bch_dev_bkey_exists(c, p.ptr.dev); -+ -+ if (ca->mi.state != BCH_MEMBER_STATE_FAILED) -+ durability = max_t(unsigned, durability, ca->mi.durability); -+ -+ if (p.has_ec) { -+ struct stripe *s = -+ genradix_ptr(&c->stripes[0], p.ec.idx); -+ -+ if (WARN_ON(!s)) -+ goto out; -+ -+ durability += s->nr_redundant; -+ } -+out: -+ return durability; -+} -+ -+unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ unsigned durability = 0; -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) -+ durability += bch2_extent_ptr_durability(c, p); -+ -+ return durability; -+} -+ -+void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k, -+ unsigned target, -+ unsigned nr_desired_replicas) -+{ -+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); -+ union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas; -+ -+ if (target && extra > 0) -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ int n = bch2_extent_ptr_durability(c, p); -+ -+ if (n && n <= extra && -+ !bch2_dev_in_target(c, p.ptr.dev, target)) { -+ entry->ptr.cached = true; -+ extra -= n; -+ } -+ } -+ -+ if (extra > 0) -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ int n = bch2_extent_ptr_durability(c, p); -+ -+ if (n && n <= extra) { -+ entry->ptr.cached = true; -+ extra -= n; -+ } -+ } -+} -+ -+void bch2_bkey_append_ptr(struct bkey_i *k, -+ struct bch_extent_ptr ptr) -+{ -+ EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev)); -+ -+ switch (k->k.type) { -+ case KEY_TYPE_btree_ptr: -+ case KEY_TYPE_btree_ptr_v2: -+ case KEY_TYPE_extent: -+ EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); -+ -+ ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; -+ -+ memcpy((void *) &k->v + bkey_val_bytes(&k->k), -+ &ptr, -+ sizeof(ptr)); -+ k->u64s++; -+ 
break; -+ default: -+ BUG(); -+ } -+} -+ -+static inline void __extent_entry_insert(struct bkey_i *k, -+ union bch_extent_entry *dst, -+ union bch_extent_entry *new) -+{ -+ union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); -+ -+ memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new), -+ dst, (u64 *) end - (u64 *) dst); -+ k->k.u64s += extent_entry_u64s(new); -+ memcpy(dst, new, extent_entry_bytes(new)); -+} -+ -+void bch2_extent_ptr_decoded_append(struct bkey_i *k, -+ struct extent_ptr_decoded *p) -+{ -+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); -+ struct bch_extent_crc_unpacked crc = -+ bch2_extent_crc_unpack(&k->k, NULL); -+ union bch_extent_entry *pos; -+ -+ if (!bch2_crc_unpacked_cmp(crc, p->crc)) { -+ pos = ptrs.start; -+ goto found; -+ } -+ -+ bkey_for_each_crc(&k->k, ptrs, crc, pos) -+ if (!bch2_crc_unpacked_cmp(crc, p->crc)) { -+ pos = extent_entry_next(pos); -+ goto found; -+ } -+ -+ bch2_extent_crc_append(k, p->crc); -+ pos = bkey_val_end(bkey_i_to_s(k)); -+found: -+ p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; -+ __extent_entry_insert(k, pos, to_entry(&p->ptr)); -+ -+ if (p->has_ec) { -+ p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr; -+ __extent_entry_insert(k, pos, to_entry(&p->ec)); -+ } -+} -+ -+static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, -+ union bch_extent_entry *entry) -+{ -+ union bch_extent_entry *i = ptrs.start; -+ -+ if (i == entry) -+ return NULL; -+ -+ while (extent_entry_next(i) != entry) -+ i = extent_entry_next(i); -+ return i; -+} -+ -+union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, -+ struct bch_extent_ptr *ptr) -+{ -+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); -+ union bch_extent_entry *dst, *src, *prev; -+ bool drop_crc = true; -+ -+ EBUG_ON(ptr < &ptrs.start->ptr || -+ ptr >= &ptrs.end->ptr); -+ EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); -+ -+ src = extent_entry_next(to_entry(ptr)); -+ if (src != ptrs.end && -+ !extent_entry_is_crc(src)) -+ drop_crc = false; -+ -+ 
dst = to_entry(ptr); -+ while ((prev = extent_entry_prev(ptrs, dst))) { -+ if (extent_entry_is_ptr(prev)) -+ break; -+ -+ if (extent_entry_is_crc(prev)) { -+ if (drop_crc) -+ dst = prev; -+ break; -+ } -+ -+ dst = prev; -+ } -+ -+ memmove_u64s_down(dst, src, -+ (u64 *) ptrs.end - (u64 *) src); -+ k.k->u64s -= (u64 *) src - (u64 *) dst; -+ -+ return dst; -+} -+ -+void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) -+{ -+ struct bch_extent_ptr *ptr; -+ -+ bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); -+} -+ -+const struct bch_extent_ptr * -+bch2_bkey_has_device(struct bkey_s_c k, unsigned dev) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const struct bch_extent_ptr *ptr; -+ -+ bkey_for_each_ptr(ptrs, ptr) -+ if (ptr->dev == dev) -+ return ptr; -+ -+ return NULL; -+} -+ -+bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const struct bch_extent_ptr *ptr; -+ -+ bkey_for_each_ptr(ptrs, ptr) -+ if (bch2_dev_in_target(c, ptr->dev, target) && -+ (!ptr->cached || -+ !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) -+ return true; -+ -+ return false; -+} -+ -+bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, -+ struct bch_extent_ptr m, u64 offset) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) -+ if (p.ptr.dev == m.dev && -+ p.ptr.gen == m.gen && -+ (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) == -+ (s64) m.offset - offset) -+ return true; -+ -+ return false; -+} -+ -+/* -+ * bch_extent_normalize - clean up an extent, dropping stale pointers etc. -+ * -+ * Returns true if @k should be dropped entirely -+ * -+ * For existing keys, only called when btree nodes are being rewritten, not when -+ * they're merely being compacted/resorted in memory. 
-+ */ -+bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) -+{ -+ struct bch_extent_ptr *ptr; -+ -+ bch2_bkey_drop_ptrs(k, ptr, -+ ptr->cached && -+ ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)); -+ -+ /* will only happen if all pointers were cached: */ -+ if (!bch2_bkey_nr_ptrs(k.s_c)) -+ k.k->type = KEY_TYPE_discard; -+ -+ return bkey_whiteout(k.k); -+} -+ -+void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct bch_extent_crc_unpacked crc; -+ const struct bch_extent_ptr *ptr; -+ const struct bch_extent_stripe_ptr *ec; -+ struct bch_dev *ca; -+ bool first = true; -+ -+ bkey_extent_entry_for_each(ptrs, entry) { -+ if (!first) -+ pr_buf(out, " "); -+ -+ switch (__extent_entry_type(entry)) { -+ case BCH_EXTENT_ENTRY_ptr: -+ ptr = entry_to_ptr(entry); -+ ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] -+ ? bch_dev_bkey_exists(c, ptr->dev) -+ : NULL; -+ -+ pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev, -+ (u64) ptr->offset, ptr->gen, -+ ptr->cached ? " cached" : "", -+ ca && ptr_stale(ca, ptr) -+ ? 
" stale" : ""); -+ break; -+ case BCH_EXTENT_ENTRY_crc32: -+ case BCH_EXTENT_ENTRY_crc64: -+ case BCH_EXTENT_ENTRY_crc128: -+ crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); -+ -+ pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %u compress %u", -+ crc.compressed_size, -+ crc.uncompressed_size, -+ crc.offset, crc.nonce, -+ crc.csum_type, -+ crc.compression_type); -+ break; -+ case BCH_EXTENT_ENTRY_stripe_ptr: -+ ec = &entry->stripe_ptr; -+ -+ pr_buf(out, "ec: idx %llu block %u", -+ (u64) ec->idx, ec->block); -+ break; -+ default: -+ pr_buf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); -+ return; -+ } -+ -+ first = false; -+ } -+} -+ -+static const char *extent_ptr_invalid(const struct bch_fs *c, -+ struct bkey_s_c k, -+ const struct bch_extent_ptr *ptr, -+ unsigned size_ondisk, -+ bool metadata) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const struct bch_extent_ptr *ptr2; -+ struct bch_dev *ca; -+ -+ if (!bch2_dev_exists2(c, ptr->dev)) -+ return "pointer to invalid device"; -+ -+ ca = bch_dev_bkey_exists(c, ptr->dev); -+ if (!ca) -+ return "pointer to invalid device"; -+ -+ bkey_for_each_ptr(ptrs, ptr2) -+ if (ptr != ptr2 && ptr->dev == ptr2->dev) -+ return "multiple pointers to same device"; -+ -+ if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets)) -+ return "offset past end of device"; -+ -+ if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) -+ return "offset before first bucket"; -+ -+ if (bucket_remainder(ca, ptr->offset) + -+ size_ondisk > ca->mi.bucket_size) -+ return "spans multiple buckets"; -+ -+ return NULL; -+} -+ -+const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct bch_extent_crc_unpacked crc; -+ unsigned size_ondisk = k.k->size; -+ const char *reason; -+ unsigned nonce = UINT_MAX; -+ -+ if (k.k->type == KEY_TYPE_btree_ptr) -+ size_ondisk = 
c->opts.btree_node_size; -+ if (k.k->type == KEY_TYPE_btree_ptr_v2) -+ size_ondisk = le16_to_cpu(bkey_s_c_to_btree_ptr_v2(k).v->sectors); -+ -+ bkey_extent_entry_for_each(ptrs, entry) { -+ if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) -+ return "invalid extent entry type"; -+ -+ if (k.k->type == KEY_TYPE_btree_ptr && -+ !extent_entry_is_ptr(entry)) -+ return "has non ptr field"; -+ -+ switch (extent_entry_type(entry)) { -+ case BCH_EXTENT_ENTRY_ptr: -+ reason = extent_ptr_invalid(c, k, &entry->ptr, -+ size_ondisk, false); -+ if (reason) -+ return reason; -+ break; -+ case BCH_EXTENT_ENTRY_crc32: -+ case BCH_EXTENT_ENTRY_crc64: -+ case BCH_EXTENT_ENTRY_crc128: -+ crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); -+ -+ if (crc.offset + crc.live_size > -+ crc.uncompressed_size) -+ return "checksum offset + key size > uncompressed size"; -+ -+ size_ondisk = crc.compressed_size; -+ -+ if (!bch2_checksum_type_valid(c, crc.csum_type)) -+ return "invalid checksum type"; -+ -+ if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR) -+ return "invalid compression type"; -+ -+ if (bch2_csum_type_is_encryption(crc.csum_type)) { -+ if (nonce == UINT_MAX) -+ nonce = crc.offset + crc.nonce; -+ else if (nonce != crc.offset + crc.nonce) -+ return "incorrect nonce"; -+ } -+ break; -+ case BCH_EXTENT_ENTRY_stripe_ptr: -+ break; -+ } -+ } -+ -+ return NULL; -+} -+ -+void bch2_ptr_swab(struct bkey_s k) -+{ -+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); -+ union bch_extent_entry *entry; -+ u64 *d; -+ -+ for (d = (u64 *) ptrs.start; -+ d != (u64 *) ptrs.end; -+ d++) -+ *d = swab64(*d); -+ -+ for (entry = ptrs.start; -+ entry < ptrs.end; -+ entry = extent_entry_next(entry)) { -+ switch (extent_entry_type(entry)) { -+ case BCH_EXTENT_ENTRY_ptr: -+ break; -+ case BCH_EXTENT_ENTRY_crc32: -+ entry->crc32.csum = swab32(entry->crc32.csum); -+ break; -+ case BCH_EXTENT_ENTRY_crc64: -+ entry->crc64.csum_hi = swab16(entry->crc64.csum_hi); -+ entry->crc64.csum_lo = 
swab64(entry->crc64.csum_lo); -+ break; -+ case BCH_EXTENT_ENTRY_crc128: -+ entry->crc128.csum.hi = (__force __le64) -+ swab64((__force u64) entry->crc128.csum.hi); -+ entry->crc128.csum.lo = (__force __le64) -+ swab64((__force u64) entry->crc128.csum.lo); -+ break; -+ case BCH_EXTENT_ENTRY_stripe_ptr: -+ break; -+ } -+ } -+} -+ -+/* Generic extent code: */ -+ -+int bch2_cut_front_s(struct bpos where, struct bkey_s k) -+{ -+ unsigned new_val_u64s = bkey_val_u64s(k.k); -+ int val_u64s_delta; -+ u64 sub; -+ -+ if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0) -+ return 0; -+ -+ EBUG_ON(bkey_cmp(where, k.k->p) > 0); -+ -+ sub = where.offset - bkey_start_offset(k.k); -+ -+ k.k->size -= sub; -+ -+ if (!k.k->size) { -+ k.k->type = KEY_TYPE_deleted; -+ new_val_u64s = 0; -+ } -+ -+ switch (k.k->type) { -+ case KEY_TYPE_extent: -+ case KEY_TYPE_reflink_v: { -+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); -+ union bch_extent_entry *entry; -+ bool seen_crc = false; -+ -+ bkey_extent_entry_for_each(ptrs, entry) { -+ switch (extent_entry_type(entry)) { -+ case BCH_EXTENT_ENTRY_ptr: -+ if (!seen_crc) -+ entry->ptr.offset += sub; -+ break; -+ case BCH_EXTENT_ENTRY_crc32: -+ entry->crc32.offset += sub; -+ break; -+ case BCH_EXTENT_ENTRY_crc64: -+ entry->crc64.offset += sub; -+ break; -+ case BCH_EXTENT_ENTRY_crc128: -+ entry->crc128.offset += sub; -+ break; -+ case BCH_EXTENT_ENTRY_stripe_ptr: -+ break; -+ } -+ -+ if (extent_entry_is_crc(entry)) -+ seen_crc = true; -+ } -+ -+ break; -+ } -+ case KEY_TYPE_reflink_p: { -+ struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k); -+ -+ le64_add_cpu(&p.v->idx, sub); -+ break; -+ } -+ case KEY_TYPE_inline_data: { -+ struct bkey_s_inline_data d = bkey_s_to_inline_data(k); -+ -+ sub = min_t(u64, sub << 9, bkey_val_bytes(d.k)); -+ -+ memmove(d.v->data, -+ d.v->data + sub, -+ bkey_val_bytes(d.k) - sub); -+ -+ new_val_u64s -= sub >> 3; -+ break; -+ } -+ } -+ -+ val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; -+ BUG_ON(val_u64s_delta < 0); -+ -+ 
set_bkey_val_u64s(k.k, new_val_u64s); -+ memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); -+ return -val_u64s_delta; -+} -+ -+int bch2_cut_back_s(struct bpos where, struct bkey_s k) -+{ -+ unsigned new_val_u64s = bkey_val_u64s(k.k); -+ int val_u64s_delta; -+ u64 len = 0; -+ -+ if (bkey_cmp(where, k.k->p) >= 0) -+ return 0; -+ -+ EBUG_ON(bkey_cmp(where, bkey_start_pos(k.k)) < 0); -+ -+ len = where.offset - bkey_start_offset(k.k); -+ -+ k.k->p = where; -+ k.k->size = len; -+ -+ if (!len) { -+ k.k->type = KEY_TYPE_deleted; -+ new_val_u64s = 0; -+ } -+ -+ switch (k.k->type) { -+ case KEY_TYPE_inline_data: -+ new_val_u64s = min(new_val_u64s, k.k->size << 6); -+ break; -+ } -+ -+ val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; -+ BUG_ON(val_u64s_delta < 0); -+ -+ set_bkey_val_u64s(k.k, new_val_u64s); -+ memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); -+ return -val_u64s_delta; -+} -diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h -new file mode 100644 -index 000000000000..29b15365d19c ---- /dev/null -+++ b/fs/bcachefs/extents.h -@@ -0,0 +1,603 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_EXTENTS_H -+#define _BCACHEFS_EXTENTS_H -+ -+#include "bcachefs.h" -+#include "bkey.h" -+#include "extents_types.h" -+ -+struct bch_fs; -+struct btree_trans; -+ -+/* extent entries: */ -+ -+#define extent_entry_last(_e) \ -+ ((typeof(&(_e).v->start[0])) bkey_val_end(_e)) -+ -+#define entry_to_ptr(_entry) \ -+({ \ -+ EBUG_ON((_entry) && !extent_entry_is_ptr(_entry)); \ -+ \ -+ __builtin_choose_expr( \ -+ type_is_exact(_entry, const union bch_extent_entry *), \ -+ (const struct bch_extent_ptr *) (_entry), \ -+ (struct bch_extent_ptr *) (_entry)); \ -+}) -+ -+/* downcast, preserves const */ -+#define to_entry(_entry) \ -+({ \ -+ BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) && \ -+ !type_is(_entry, struct bch_extent_ptr *) && \ -+ !type_is(_entry, struct bch_extent_stripe_ptr *)); \ -+ \ -+ __builtin_choose_expr( \ -+ 
(type_is_exact(_entry, const union bch_extent_crc *) || \ -+ type_is_exact(_entry, const struct bch_extent_ptr *) ||\ -+ type_is_exact(_entry, const struct bch_extent_stripe_ptr *)),\ -+ (const union bch_extent_entry *) (_entry), \ -+ (union bch_extent_entry *) (_entry)); \ -+}) -+ -+#define extent_entry_next(_entry) \ -+ ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry))) -+ -+static inline unsigned -+__extent_entry_type(const union bch_extent_entry *e) -+{ -+ return e->type ? __ffs(e->type) : BCH_EXTENT_ENTRY_MAX; -+} -+ -+static inline enum bch_extent_entry_type -+extent_entry_type(const union bch_extent_entry *e) -+{ -+ int ret = __ffs(e->type); -+ -+ EBUG_ON(ret < 0 || ret >= BCH_EXTENT_ENTRY_MAX); -+ -+ return ret; -+} -+ -+static inline size_t extent_entry_bytes(const union bch_extent_entry *entry) -+{ -+ switch (extent_entry_type(entry)) { -+#define x(f, n) \ -+ case BCH_EXTENT_ENTRY_##f: \ -+ return sizeof(struct bch_extent_##f); -+ BCH_EXTENT_ENTRY_TYPES() -+#undef x -+ default: -+ BUG(); -+ } -+} -+ -+static inline size_t extent_entry_u64s(const union bch_extent_entry *entry) -+{ -+ return extent_entry_bytes(entry) / sizeof(u64); -+} -+ -+static inline bool extent_entry_is_ptr(const union bch_extent_entry *e) -+{ -+ switch (extent_entry_type(e)) { -+ case BCH_EXTENT_ENTRY_ptr: -+ return true; -+ default: -+ return false; -+ } -+} -+ -+static inline bool extent_entry_is_crc(const union bch_extent_entry *e) -+{ -+ switch (extent_entry_type(e)) { -+ case BCH_EXTENT_ENTRY_crc32: -+ case BCH_EXTENT_ENTRY_crc64: -+ case BCH_EXTENT_ENTRY_crc128: -+ return true; -+ default: -+ return false; -+ } -+} -+ -+union bch_extent_crc { -+ u8 type; -+ struct bch_extent_crc32 crc32; -+ struct bch_extent_crc64 crc64; -+ struct bch_extent_crc128 crc128; -+}; -+ -+#define __entry_to_crc(_entry) \ -+ __builtin_choose_expr( \ -+ type_is_exact(_entry, const union bch_extent_entry *), \ -+ (const union bch_extent_crc *) (_entry), \ -+ (union bch_extent_crc *) 
(_entry)) -+ -+#define entry_to_crc(_entry) \ -+({ \ -+ EBUG_ON((_entry) && !extent_entry_is_crc(_entry)); \ -+ \ -+ __entry_to_crc(_entry); \ -+}) -+ -+static inline struct bch_extent_crc_unpacked -+bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) -+{ -+#define common_fields(_crc) \ -+ .csum_type = _crc.csum_type, \ -+ .compression_type = _crc.compression_type, \ -+ .compressed_size = _crc._compressed_size + 1, \ -+ .uncompressed_size = _crc._uncompressed_size + 1, \ -+ .offset = _crc.offset, \ -+ .live_size = k->size -+ -+ if (!crc) -+ return (struct bch_extent_crc_unpacked) { -+ .compressed_size = k->size, -+ .uncompressed_size = k->size, -+ .live_size = k->size, -+ }; -+ -+ switch (extent_entry_type(to_entry(crc))) { -+ case BCH_EXTENT_ENTRY_crc32: { -+ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { -+ common_fields(crc->crc32), -+ }; -+ -+ *((__le32 *) &ret.csum.lo) = crc->crc32.csum; -+ -+ memcpy(&ret.csum.lo, &crc->crc32.csum, -+ sizeof(crc->crc32.csum)); -+ -+ return ret; -+ } -+ case BCH_EXTENT_ENTRY_crc64: { -+ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { -+ common_fields(crc->crc64), -+ .nonce = crc->crc64.nonce, -+ .csum.lo = (__force __le64) crc->crc64.csum_lo, -+ }; -+ -+ *((__le16 *) &ret.csum.hi) = crc->crc64.csum_hi; -+ -+ return ret; -+ } -+ case BCH_EXTENT_ENTRY_crc128: { -+ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { -+ common_fields(crc->crc128), -+ .nonce = crc->crc128.nonce, -+ .csum = crc->crc128.csum, -+ }; -+ -+ return ret; -+ } -+ default: -+ BUG(); -+ } -+#undef common_fields -+} -+ -+static inline bool crc_is_compressed(struct bch_extent_crc_unpacked crc) -+{ -+ return (crc.compression_type != BCH_COMPRESSION_TYPE_none && -+ crc.compression_type != BCH_COMPRESSION_TYPE_incompressible); -+} -+ -+/* bkey_ptrs: generically over any key type that has ptrs */ -+ -+struct bkey_ptrs_c { -+ const union bch_extent_entry *start; -+ const 
union bch_extent_entry *end; -+}; -+ -+struct bkey_ptrs { -+ union bch_extent_entry *start; -+ union bch_extent_entry *end; -+}; -+ -+static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) -+{ -+ switch (k.k->type) { -+ case KEY_TYPE_btree_ptr: { -+ struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k); -+ return (struct bkey_ptrs_c) { -+ to_entry(&e.v->start[0]), -+ to_entry(extent_entry_last(e)) -+ }; -+ } -+ case KEY_TYPE_extent: { -+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); -+ return (struct bkey_ptrs_c) { -+ e.v->start, -+ extent_entry_last(e) -+ }; -+ } -+ case KEY_TYPE_stripe: { -+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); -+ return (struct bkey_ptrs_c) { -+ to_entry(&s.v->ptrs[0]), -+ to_entry(&s.v->ptrs[s.v->nr_blocks]), -+ }; -+ } -+ case KEY_TYPE_reflink_v: { -+ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); -+ -+ return (struct bkey_ptrs_c) { -+ r.v->start, -+ bkey_val_end(r), -+ }; -+ } -+ case KEY_TYPE_btree_ptr_v2: { -+ struct bkey_s_c_btree_ptr_v2 e = bkey_s_c_to_btree_ptr_v2(k); -+ return (struct bkey_ptrs_c) { -+ to_entry(&e.v->start[0]), -+ to_entry(extent_entry_last(e)) -+ }; -+ } -+ default: -+ return (struct bkey_ptrs_c) { NULL, NULL }; -+ } -+} -+ -+static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) -+{ -+ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c); -+ -+ return (struct bkey_ptrs) { -+ (void *) p.start, -+ (void *) p.end -+ }; -+} -+ -+#define __bkey_extent_entry_for_each_from(_start, _end, _entry) \ -+ for ((_entry) = (_start); \ -+ (_entry) < (_end); \ -+ (_entry) = extent_entry_next(_entry)) -+ -+#define __bkey_ptr_next(_ptr, _end) \ -+({ \ -+ typeof(_end) _entry; \ -+ \ -+ __bkey_extent_entry_for_each_from(to_entry(_ptr), _end, _entry) \ -+ if (extent_entry_is_ptr(_entry)) \ -+ break; \ -+ \ -+ _entry < (_end) ? 
entry_to_ptr(_entry) : NULL; \ -+}) -+ -+#define bkey_extent_entry_for_each_from(_p, _entry, _start) \ -+ __bkey_extent_entry_for_each_from(_start, (_p).end, _entry) -+ -+#define bkey_extent_entry_for_each(_p, _entry) \ -+ bkey_extent_entry_for_each_from(_p, _entry, _p.start) -+ -+#define __bkey_for_each_ptr(_start, _end, _ptr) \ -+ for ((_ptr) = (_start); \ -+ ((_ptr) = __bkey_ptr_next(_ptr, _end)); \ -+ (_ptr)++) -+ -+#define bkey_ptr_next(_p, _ptr) \ -+ __bkey_ptr_next(_ptr, (_p).end) -+ -+#define bkey_for_each_ptr(_p, _ptr) \ -+ __bkey_for_each_ptr(&(_p).start->ptr, (_p).end, _ptr) -+ -+#define __bkey_ptr_next_decode(_k, _end, _ptr, _entry) \ -+({ \ -+ __label__ out; \ -+ \ -+ (_ptr).idx = 0; \ -+ (_ptr).has_ec = false; \ -+ \ -+ __bkey_extent_entry_for_each_from(_entry, _end, _entry) \ -+ switch (extent_entry_type(_entry)) { \ -+ case BCH_EXTENT_ENTRY_ptr: \ -+ (_ptr).ptr = _entry->ptr; \ -+ goto out; \ -+ case BCH_EXTENT_ENTRY_crc32: \ -+ case BCH_EXTENT_ENTRY_crc64: \ -+ case BCH_EXTENT_ENTRY_crc128: \ -+ (_ptr).crc = bch2_extent_crc_unpack(_k, \ -+ entry_to_crc(_entry)); \ -+ break; \ -+ case BCH_EXTENT_ENTRY_stripe_ptr: \ -+ (_ptr).ec = _entry->stripe_ptr; \ -+ (_ptr).has_ec = true; \ -+ break; \ -+ } \ -+out: \ -+ _entry < (_end); \ -+}) -+ -+#define __bkey_for_each_ptr_decode(_k, _start, _end, _ptr, _entry) \ -+ for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL), \ -+ (_entry) = _start; \ -+ __bkey_ptr_next_decode(_k, _end, _ptr, _entry); \ -+ (_entry) = extent_entry_next(_entry)) -+ -+#define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry) \ -+ __bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \ -+ _ptr, _entry) -+ -+#define bkey_crc_next(_k, _start, _end, _crc, _iter) \ -+({ \ -+ __bkey_extent_entry_for_each_from(_iter, _end, _iter) \ -+ if (extent_entry_is_crc(_iter)) { \ -+ (_crc) = bch2_extent_crc_unpack(_k, \ -+ entry_to_crc(_iter)); \ -+ break; \ -+ } \ -+ \ -+ (_iter) < (_end); \ -+}) -+ -+#define __bkey_for_each_crc(_k, _start, _end, _crc, 
_iter) \ -+ for ((_crc) = bch2_extent_crc_unpack(_k, NULL), \ -+ (_iter) = (_start); \ -+ bkey_crc_next(_k, _start, _end, _crc, _iter); \ -+ (_iter) = extent_entry_next(_iter)) -+ -+#define bkey_for_each_crc(_k, _p, _crc, _iter) \ -+ __bkey_for_each_crc(_k, (_p).start, (_p).end, _crc, _iter) -+ -+/* Iterate over pointers in KEY_TYPE_extent: */ -+ -+#define extent_for_each_entry_from(_e, _entry, _start) \ -+ __bkey_extent_entry_for_each_from(_start, \ -+ extent_entry_last(_e),_entry) -+ -+#define extent_for_each_entry(_e, _entry) \ -+ extent_for_each_entry_from(_e, _entry, (_e).v->start) -+ -+#define extent_ptr_next(_e, _ptr) \ -+ __bkey_ptr_next(_ptr, extent_entry_last(_e)) -+ -+#define extent_for_each_ptr(_e, _ptr) \ -+ __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr) -+ -+#define extent_for_each_ptr_decode(_e, _ptr, _entry) \ -+ __bkey_for_each_ptr_decode((_e).k, (_e).v->start, \ -+ extent_entry_last(_e), _ptr, _entry) -+ -+/* utility code common to all keys with pointers: */ -+ -+void bch2_mark_io_failure(struct bch_io_failures *, -+ struct extent_ptr_decoded *); -+int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, -+ struct bch_io_failures *, -+ struct extent_ptr_decoded *); -+ -+/* KEY_TYPE_btree_ptr: */ -+ -+const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c); -+void bch2_btree_ptr_debugcheck(struct bch_fs *, struct bkey_s_c); -+void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, -+ struct bkey_s_c); -+ -+void bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, -+ struct bkey_s_c); -+void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, -+ int, struct bkey_s); -+ -+#define bch2_bkey_ops_btree_ptr (struct bkey_ops) { \ -+ .key_invalid = bch2_btree_ptr_invalid, \ -+ .key_debugcheck = bch2_btree_ptr_debugcheck, \ -+ .val_to_text = bch2_btree_ptr_to_text, \ -+ .swab = bch2_ptr_swab, \ -+} -+ -+#define bch2_bkey_ops_btree_ptr_v2 (struct bkey_ops) { \ -+ .key_invalid = 
bch2_btree_ptr_invalid, \ -+ .key_debugcheck = bch2_btree_ptr_debugcheck, \ -+ .val_to_text = bch2_btree_ptr_v2_to_text, \ -+ .swab = bch2_ptr_swab, \ -+ .compat = bch2_btree_ptr_v2_compat, \ -+} -+ -+/* KEY_TYPE_extent: */ -+ -+const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c); -+void bch2_extent_debugcheck(struct bch_fs *, struct bkey_s_c); -+void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+enum merge_result bch2_extent_merge(struct bch_fs *, -+ struct bkey_s, struct bkey_s); -+ -+#define bch2_bkey_ops_extent (struct bkey_ops) { \ -+ .key_invalid = bch2_extent_invalid, \ -+ .key_debugcheck = bch2_extent_debugcheck, \ -+ .val_to_text = bch2_extent_to_text, \ -+ .swab = bch2_ptr_swab, \ -+ .key_normalize = bch2_extent_normalize, \ -+ .key_merge = bch2_extent_merge, \ -+} -+ -+/* KEY_TYPE_reservation: */ -+ -+const char *bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c); -+void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+enum merge_result bch2_reservation_merge(struct bch_fs *, -+ struct bkey_s, struct bkey_s); -+ -+#define bch2_bkey_ops_reservation (struct bkey_ops) { \ -+ .key_invalid = bch2_reservation_invalid, \ -+ .val_to_text = bch2_reservation_to_text, \ -+ .key_merge = bch2_reservation_merge, \ -+} -+ -+/* Extent checksum entries: */ -+ -+bool bch2_can_narrow_extent_crcs(struct bkey_s_c, -+ struct bch_extent_crc_unpacked); -+bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked); -+void bch2_extent_crc_append(struct bkey_i *, -+ struct bch_extent_crc_unpacked); -+ -+/* Generic code for keys with pointers: */ -+ -+static inline bool bkey_extent_is_direct_data(const struct bkey *k) -+{ -+ switch (k->type) { -+ case KEY_TYPE_btree_ptr: -+ case KEY_TYPE_btree_ptr_v2: -+ case KEY_TYPE_extent: -+ case KEY_TYPE_reflink_v: -+ return true; -+ default: -+ return false; -+ } -+} -+ -+static inline bool bkey_extent_is_data(const struct bkey *k) 
-+{ -+ return bkey_extent_is_direct_data(k) || -+ k->type == KEY_TYPE_inline_data || -+ k->type == KEY_TYPE_reflink_p; -+} -+ -+/* -+ * Should extent be counted under inode->i_sectors? -+ */ -+static inline bool bkey_extent_is_allocation(const struct bkey *k) -+{ -+ switch (k->type) { -+ case KEY_TYPE_extent: -+ case KEY_TYPE_reservation: -+ case KEY_TYPE_reflink_p: -+ case KEY_TYPE_reflink_v: -+ case KEY_TYPE_inline_data: -+ return true; -+ default: -+ return false; -+ } -+} -+ -+static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k) -+{ -+ struct bch_devs_list ret = (struct bch_devs_list) { 0 }; -+ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); -+ const struct bch_extent_ptr *ptr; -+ -+ bkey_for_each_ptr(p, ptr) -+ ret.devs[ret.nr++] = ptr->dev; -+ -+ return ret; -+} -+ -+static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k) -+{ -+ struct bch_devs_list ret = (struct bch_devs_list) { 0 }; -+ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); -+ const struct bch_extent_ptr *ptr; -+ -+ bkey_for_each_ptr(p, ptr) -+ if (!ptr->cached) -+ ret.devs[ret.nr++] = ptr->dev; -+ -+ return ret; -+} -+ -+static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) -+{ -+ struct bch_devs_list ret = (struct bch_devs_list) { 0 }; -+ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); -+ const struct bch_extent_ptr *ptr; -+ -+ bkey_for_each_ptr(p, ptr) -+ if (ptr->cached) -+ ret.devs[ret.nr++] = ptr->dev; -+ -+ return ret; -+} -+ -+unsigned bch2_bkey_nr_ptrs(struct bkey_s_c); -+unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); -+unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c); -+bool bch2_bkey_is_incompressible(struct bkey_s_c); -+unsigned bch2_bkey_sectors_compressed(struct bkey_s_c); -+bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned); -+unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); -+ -+void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s, -+ unsigned, unsigned); -+ -+void 
bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr); -+void bch2_extent_ptr_decoded_append(struct bkey_i *, -+ struct extent_ptr_decoded *); -+union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s, -+ struct bch_extent_ptr *); -+ -+#define bch2_bkey_drop_ptrs(_k, _ptr, _cond) \ -+do { \ -+ struct bkey_ptrs _ptrs = bch2_bkey_ptrs(_k); \ -+ \ -+ _ptr = &_ptrs.start->ptr; \ -+ \ -+ while ((_ptr = bkey_ptr_next(_ptrs, _ptr))) { \ -+ if (_cond) { \ -+ _ptr = (void *) bch2_bkey_drop_ptr(_k, _ptr); \ -+ _ptrs = bch2_bkey_ptrs(_k); \ -+ continue; \ -+ } \ -+ \ -+ (_ptr)++; \ -+ } \ -+} while (0) -+ -+void bch2_bkey_drop_device(struct bkey_s, unsigned); -+const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned); -+bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned); -+ -+bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, -+ struct bch_extent_ptr, u64); -+ -+bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); -+void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, -+ struct bkey_s_c); -+const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c); -+ -+void bch2_ptr_swab(struct bkey_s); -+ -+/* Generic extent code: */ -+ -+int bch2_cut_front_s(struct bpos, struct bkey_s); -+int bch2_cut_back_s(struct bpos, struct bkey_s); -+ -+static inline void bch2_cut_front(struct bpos where, struct bkey_i *k) -+{ -+ bch2_cut_front_s(where, bkey_i_to_s(k)); -+} -+ -+static inline void bch2_cut_back(struct bpos where, struct bkey_i *k) -+{ -+ bch2_cut_back_s(where, bkey_i_to_s(k)); -+} -+ -+/** -+ * bch_key_resize - adjust size of @k -+ * -+ * bkey_start_offset(k) will be preserved, modifies where the extent ends -+ */ -+static inline void bch2_key_resize(struct bkey *k, unsigned new_size) -+{ -+ k->p.offset -= k->size; -+ k->p.offset += new_size; -+ k->size = new_size; -+} -+ -+/* -+ * In extent_sort_fix_overlapping(), insert_fixup_extent(), -+ * extent_merge_inline() - we're modifying keys in 
place that are packed. To do -+ * that we have to unpack the key, modify the unpacked key - then this -+ * copies/repacks the unpacked to the original as necessary. -+ */ -+static inline void extent_save(struct btree *b, struct bkey_packed *dst, -+ struct bkey *src) -+{ -+ struct bkey_format *f = &b->format; -+ struct bkey_i *dst_unpacked; -+ -+ if ((dst_unpacked = packed_to_bkey(dst))) -+ dst_unpacked->k = *src; -+ else -+ BUG_ON(!bch2_bkey_pack_key(dst, src, f)); -+} -+ -+#endif /* _BCACHEFS_EXTENTS_H */ -diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h -new file mode 100644 -index 000000000000..43d6c341ecca ---- /dev/null -+++ b/fs/bcachefs/extents_types.h -@@ -0,0 +1,40 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_EXTENTS_TYPES_H -+#define _BCACHEFS_EXTENTS_TYPES_H -+ -+#include "bcachefs_format.h" -+ -+struct bch_extent_crc_unpacked { -+ u32 compressed_size; -+ u32 uncompressed_size; -+ u32 live_size; -+ -+ u8 csum_type; -+ u8 compression_type; -+ -+ u16 offset; -+ -+ u16 nonce; -+ -+ struct bch_csum csum; -+}; -+ -+struct extent_ptr_decoded { -+ unsigned idx; -+ bool has_ec; -+ struct bch_extent_crc_unpacked crc; -+ struct bch_extent_ptr ptr; -+ struct bch_extent_stripe_ptr ec; -+}; -+ -+struct bch_io_failures { -+ u8 nr; -+ struct bch_dev_io_failures { -+ u8 dev; -+ u8 idx; -+ u8 nr_failed; -+ u8 nr_retries; -+ } devs[BCH_REPLICAS_MAX]; -+}; -+ -+#endif /* _BCACHEFS_EXTENTS_TYPES_H */ -diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h -new file mode 100644 -index 000000000000..26d5cad7e6a5 ---- /dev/null -+++ b/fs/bcachefs/eytzinger.h -@@ -0,0 +1,285 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _EYTZINGER_H -+#define _EYTZINGER_H -+ -+#include -+#include -+ -+#include "util.h" -+ -+/* -+ * Traversal for trees in eytzinger layout - a full binary tree layed out in an -+ * array -+ */ -+ -+/* -+ * One based indexing version: -+ * -+ * With one based indexing each level of the tree starts at a power 
of two - -+ * good for cacheline alignment: -+ * -+ * Size parameter is treated as if we were using 0 based indexing, however: -+ * valid nodes, and inorder indices, are in the range [1..size) - that is, there -+ * are actually size - 1 elements -+ */ -+ -+static inline unsigned eytzinger1_child(unsigned i, unsigned child) -+{ -+ EBUG_ON(child > 1); -+ -+ return (i << 1) + child; -+} -+ -+static inline unsigned eytzinger1_left_child(unsigned i) -+{ -+ return eytzinger1_child(i, 0); -+} -+ -+static inline unsigned eytzinger1_right_child(unsigned i) -+{ -+ return eytzinger1_child(i, 1); -+} -+ -+static inline unsigned eytzinger1_first(unsigned size) -+{ -+ return rounddown_pow_of_two(size - 1); -+} -+ -+static inline unsigned eytzinger1_last(unsigned size) -+{ -+ return rounddown_pow_of_two(size) - 1; -+} -+ -+/* -+ * eytzinger1_next() and eytzinger1_prev() have the nice properties that -+ * -+ * eytzinger1_next(0) == eytzinger1_first()) -+ * eytzinger1_prev(0) == eytzinger1_last()) -+ * -+ * eytzinger1_prev(eytzinger1_first()) == 0 -+ * eytzinger1_next(eytzinger1_last()) == 0 -+ */ -+ -+static inline unsigned eytzinger1_next(unsigned i, unsigned size) -+{ -+ EBUG_ON(i >= size); -+ -+ if (eytzinger1_right_child(i) < size) { -+ i = eytzinger1_right_child(i); -+ -+ i <<= __fls(size) - __fls(i); -+ i >>= i >= size; -+ } else { -+ i >>= ffz(i) + 1; -+ } -+ -+ return i; -+} -+ -+static inline unsigned eytzinger1_prev(unsigned i, unsigned size) -+{ -+ EBUG_ON(i >= size); -+ -+ if (eytzinger1_left_child(i) < size) { -+ i = eytzinger1_left_child(i) + 1; -+ -+ i <<= __fls(size) - __fls(i); -+ i -= 1; -+ i >>= i >= size; -+ } else { -+ i >>= __ffs(i) + 1; -+ } -+ -+ return i; -+} -+ -+static inline unsigned eytzinger1_extra(unsigned size) -+{ -+ return (size - rounddown_pow_of_two(size - 1)) << 1; -+} -+ -+static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size, -+ unsigned extra) -+{ -+ unsigned b = __fls(i); -+ unsigned shift = __fls(size - 1) - b; -+ int s; 
-+ -+ EBUG_ON(!i || i >= size); -+ -+ i ^= 1U << b; -+ i <<= 1; -+ i |= 1; -+ i <<= shift; -+ -+ /* -+ * sign bit trick: -+ * -+ * if (i > extra) -+ * i -= (i - extra) >> 1; -+ */ -+ s = extra - i; -+ i += (s >> 1) & (s >> 31); -+ -+ return i; -+} -+ -+static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size, -+ unsigned extra) -+{ -+ unsigned shift; -+ int s; -+ -+ EBUG_ON(!i || i >= size); -+ -+ /* -+ * sign bit trick: -+ * -+ * if (i > extra) -+ * i += i - extra; -+ */ -+ s = extra - i; -+ i -= s & (s >> 31); -+ -+ shift = __ffs(i); -+ -+ i >>= shift + 1; -+ i |= 1U << (__fls(size - 1) - shift); -+ -+ return i; -+} -+ -+static inline unsigned eytzinger1_to_inorder(unsigned i, unsigned size) -+{ -+ return __eytzinger1_to_inorder(i, size, eytzinger1_extra(size)); -+} -+ -+static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size) -+{ -+ return __inorder_to_eytzinger1(i, size, eytzinger1_extra(size)); -+} -+ -+#define eytzinger1_for_each(_i, _size) \ -+ for ((_i) = eytzinger1_first((_size)); \ -+ (_i) != 0; \ -+ (_i) = eytzinger1_next((_i), (_size))) -+ -+/* Zero based indexing version: */ -+ -+static inline unsigned eytzinger0_child(unsigned i, unsigned child) -+{ -+ EBUG_ON(child > 1); -+ -+ return (i << 1) + 1 + child; -+} -+ -+static inline unsigned eytzinger0_left_child(unsigned i) -+{ -+ return eytzinger0_child(i, 0); -+} -+ -+static inline unsigned eytzinger0_right_child(unsigned i) -+{ -+ return eytzinger0_child(i, 1); -+} -+ -+static inline unsigned eytzinger0_first(unsigned size) -+{ -+ return eytzinger1_first(size + 1) - 1; -+} -+ -+static inline unsigned eytzinger0_last(unsigned size) -+{ -+ return eytzinger1_last(size + 1) - 1; -+} -+ -+static inline unsigned eytzinger0_next(unsigned i, unsigned size) -+{ -+ return eytzinger1_next(i + 1, size + 1) - 1; -+} -+ -+static inline unsigned eytzinger0_prev(unsigned i, unsigned size) -+{ -+ return eytzinger1_prev(i + 1, size + 1) - 1; -+} -+ -+static inline unsigned 
eytzinger0_extra(unsigned size) -+{ -+ return eytzinger1_extra(size + 1); -+} -+ -+static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size, -+ unsigned extra) -+{ -+ return __eytzinger1_to_inorder(i + 1, size + 1, extra) - 1; -+} -+ -+static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size, -+ unsigned extra) -+{ -+ return __inorder_to_eytzinger1(i + 1, size + 1, extra) - 1; -+} -+ -+static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size) -+{ -+ return __eytzinger0_to_inorder(i, size, eytzinger0_extra(size)); -+} -+ -+static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) -+{ -+ return __inorder_to_eytzinger0(i, size, eytzinger0_extra(size)); -+} -+ -+#define eytzinger0_for_each(_i, _size) \ -+ for ((_i) = eytzinger0_first((_size)); \ -+ (_i) != -1; \ -+ (_i) = eytzinger0_next((_i), (_size))) -+ -+typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size); -+ -+/* return greatest node <= @search, or -1 if not found */ -+static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, -+ eytzinger_cmp_fn cmp, const void *search) -+{ -+ unsigned i, n = 0; -+ -+ if (!nr) -+ return -1; -+ -+ do { -+ i = n; -+ n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0); -+ } while (n < nr); -+ -+ if (n & 1) { -+ /* @i was greater than @search, return previous node: */ -+ -+ if (i == eytzinger0_first(nr)) -+ return -1; -+ -+ return eytzinger0_prev(i, nr); -+ } else { -+ return i; -+ } -+} -+ -+#define eytzinger0_find(base, nr, size, _cmp, search) \ -+({ \ -+ void *_base = (base); \ -+ void *_search = (search); \ -+ size_t _nr = (nr); \ -+ size_t _size = (size); \ -+ size_t _i = 0; \ -+ int _res; \ -+ \ -+ while (_i < _nr && \ -+ (_res = _cmp(_search, _base + _i * _size, _size))) \ -+ _i = eytzinger0_child(_i, _res > 0); \ -+ _i; \ -+}) -+ -+void eytzinger0_sort(void *, size_t, size_t, -+ int (*cmp_func)(const void *, const void *, size_t), -+ void (*swap_func)(void *, void 
*, size_t)); -+ -+#endif /* _EYTZINGER_H */ -diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h -new file mode 100644 -index 000000000000..cdb272708a4b ---- /dev/null -+++ b/fs/bcachefs/fifo.h -@@ -0,0 +1,127 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_FIFO_H -+#define _BCACHEFS_FIFO_H -+ -+#include "util.h" -+ -+#define FIFO(type) \ -+struct { \ -+ size_t front, back, size, mask; \ -+ type *data; \ -+} -+ -+#define DECLARE_FIFO(type, name) FIFO(type) name -+ -+#define fifo_buf_size(fifo) \ -+ ((fifo)->size \ -+ ? roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0]) \ -+ : 0) -+ -+#define init_fifo(fifo, _size, _gfp) \ -+({ \ -+ (fifo)->front = (fifo)->back = 0; \ -+ (fifo)->size = (_size); \ -+ (fifo)->mask = (fifo)->size \ -+ ? roundup_pow_of_two((fifo)->size) - 1 \ -+ : 0; \ -+ (fifo)->data = kvpmalloc(fifo_buf_size(fifo), (_gfp)); \ -+}) -+ -+#define free_fifo(fifo) \ -+do { \ -+ kvpfree((fifo)->data, fifo_buf_size(fifo)); \ -+ (fifo)->data = NULL; \ -+} while (0) -+ -+#define fifo_swap(l, r) \ -+do { \ -+ swap((l)->front, (r)->front); \ -+ swap((l)->back, (r)->back); \ -+ swap((l)->size, (r)->size); \ -+ swap((l)->mask, (r)->mask); \ -+ swap((l)->data, (r)->data); \ -+} while (0) -+ -+#define fifo_move(dest, src) \ -+do { \ -+ typeof(*((dest)->data)) _t; \ -+ while (!fifo_full(dest) && \ -+ fifo_pop(src, _t)) \ -+ fifo_push(dest, _t); \ -+} while (0) -+ -+#define fifo_used(fifo) (((fifo)->back - (fifo)->front)) -+#define fifo_free(fifo) ((fifo)->size - fifo_used(fifo)) -+ -+#define fifo_empty(fifo) ((fifo)->front == (fifo)->back) -+#define fifo_full(fifo) (fifo_used(fifo) == (fifo)->size) -+ -+#define fifo_peek_front(fifo) ((fifo)->data[(fifo)->front & (fifo)->mask]) -+#define fifo_peek_back(fifo) ((fifo)->data[((fifo)->back - 1) & (fifo)->mask]) -+ -+#define fifo_entry_idx_abs(fifo, p) \ -+ ((((p) >= &fifo_peek_front(fifo) \ -+ ? 
(fifo)->front : (fifo)->back) & ~(fifo)->mask) + \ -+ (((p) - (fifo)->data))) -+ -+#define fifo_entry_idx(fifo, p) (((p) - &fifo_peek_front(fifo)) & (fifo)->mask) -+#define fifo_idx_entry(fifo, i) (fifo)->data[((fifo)->front + (i)) & (fifo)->mask] -+ -+#define fifo_push_back_ref(f) \ -+ (fifo_full((f)) ? NULL : &(f)->data[(f)->back++ & (f)->mask]) -+ -+#define fifo_push_front_ref(f) \ -+ (fifo_full((f)) ? NULL : &(f)->data[--(f)->front & (f)->mask]) -+ -+#define fifo_push_back(fifo, new) \ -+({ \ -+ typeof((fifo)->data) _r = fifo_push_back_ref(fifo); \ -+ if (_r) \ -+ *_r = (new); \ -+ _r != NULL; \ -+}) -+ -+#define fifo_push_front(fifo, new) \ -+({ \ -+ typeof((fifo)->data) _r = fifo_push_front_ref(fifo); \ -+ if (_r) \ -+ *_r = (new); \ -+ _r != NULL; \ -+}) -+ -+#define fifo_pop_front(fifo, i) \ -+({ \ -+ bool _r = !fifo_empty((fifo)); \ -+ if (_r) \ -+ (i) = (fifo)->data[(fifo)->front++ & (fifo)->mask]; \ -+ _r; \ -+}) -+ -+#define fifo_pop_back(fifo, i) \ -+({ \ -+ bool _r = !fifo_empty((fifo)); \ -+ if (_r) \ -+ (i) = (fifo)->data[--(fifo)->back & (fifo)->mask]; \ -+ _r; \ -+}) -+ -+#define fifo_push_ref(fifo) fifo_push_back_ref(fifo) -+#define fifo_push(fifo, i) fifo_push_back(fifo, (i)) -+#define fifo_pop(fifo, i) fifo_pop_front(fifo, (i)) -+#define fifo_peek(fifo) fifo_peek_front(fifo) -+ -+#define fifo_for_each_entry(_entry, _fifo, _iter) \ -+ for (typecheck(typeof((_fifo)->front), _iter), \ -+ (_iter) = (_fifo)->front; \ -+ ((_iter != (_fifo)->back) && \ -+ (_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true)); \ -+ (_iter)++) -+ -+#define fifo_for_each_entry_ptr(_ptr, _fifo, _iter) \ -+ for (typecheck(typeof((_fifo)->front), _iter), \ -+ (_iter) = (_fifo)->front; \ -+ ((_iter != (_fifo)->back) && \ -+ (_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true)); \ -+ (_iter)++) -+ -+#endif /* _BCACHEFS_FIFO_H */ -diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c -new file mode 100644 -index 000000000000..878419d40992 ---- /dev/null -+++ 
b/fs/bcachefs/fs-common.c -@@ -0,0 +1,317 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "acl.h" -+#include "btree_update.h" -+#include "dirent.h" -+#include "fs-common.h" -+#include "inode.h" -+#include "xattr.h" -+ -+#include -+ -+int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, -+ struct bch_inode_unpacked *dir_u, -+ struct bch_inode_unpacked *new_inode, -+ const struct qstr *name, -+ uid_t uid, gid_t gid, umode_t mode, dev_t rdev, -+ struct posix_acl *default_acl, -+ struct posix_acl *acl) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter *dir_iter = NULL; -+ struct bch_hash_info hash = bch2_hash_info_init(c, new_inode); -+ u64 now = bch2_current_time(trans->c); -+ int ret; -+ -+ dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(dir_iter); -+ if (ret) -+ goto err; -+ -+ bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); -+ -+ if (!name) -+ new_inode->bi_flags |= BCH_INODE_UNLINKED; -+ -+ ret = bch2_inode_create(trans, new_inode, -+ BLOCKDEV_INODE_MAX, 0, -+ &c->unused_inode_hint); -+ if (ret) -+ goto err; -+ -+ if (default_acl) { -+ ret = bch2_set_acl_trans(trans, new_inode, &hash, -+ default_acl, ACL_TYPE_DEFAULT); -+ if (ret) -+ goto err; -+ } -+ -+ if (acl) { -+ ret = bch2_set_acl_trans(trans, new_inode, &hash, -+ acl, ACL_TYPE_ACCESS); -+ if (ret) -+ goto err; -+ } -+ -+ if (name) { -+ struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u); -+ dir_u->bi_mtime = dir_u->bi_ctime = now; -+ -+ if (S_ISDIR(new_inode->bi_mode)) -+ dir_u->bi_nlink++; -+ -+ ret = bch2_inode_write(trans, dir_iter, dir_u); -+ if (ret) -+ goto err; -+ -+ ret = bch2_dirent_create(trans, dir_inum, &dir_hash, -+ mode_to_type(new_inode->bi_mode), -+ name, new_inode->bi_inum, -+ BCH_HASH_SET_MUST_CREATE); -+ if (ret) -+ goto err; -+ } -+err: -+ bch2_trans_iter_put(trans, dir_iter); -+ return ret; -+} -+ -+int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, -+ 
u64 inum, struct bch_inode_unpacked *dir_u, -+ struct bch_inode_unpacked *inode_u, const struct qstr *name) -+{ -+ struct btree_iter *dir_iter = NULL, *inode_iter = NULL; -+ struct bch_hash_info dir_hash; -+ u64 now = bch2_current_time(trans->c); -+ int ret; -+ -+ inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(inode_iter); -+ if (ret) -+ goto err; -+ -+ inode_u->bi_ctime = now; -+ bch2_inode_nlink_inc(inode_u); -+ -+ dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, 0); -+ ret = PTR_ERR_OR_ZERO(dir_iter); -+ if (ret) -+ goto err; -+ -+ dir_u->bi_mtime = dir_u->bi_ctime = now; -+ -+ dir_hash = bch2_hash_info_init(trans->c, dir_u); -+ -+ ret = bch2_dirent_create(trans, dir_inum, &dir_hash, -+ mode_to_type(inode_u->bi_mode), -+ name, inum, BCH_HASH_SET_MUST_CREATE) ?: -+ bch2_inode_write(trans, dir_iter, dir_u) ?: -+ bch2_inode_write(trans, inode_iter, inode_u); -+err: -+ bch2_trans_iter_put(trans, dir_iter); -+ bch2_trans_iter_put(trans, inode_iter); -+ return ret; -+} -+ -+int bch2_unlink_trans(struct btree_trans *trans, -+ u64 dir_inum, struct bch_inode_unpacked *dir_u, -+ struct bch_inode_unpacked *inode_u, -+ const struct qstr *name) -+{ -+ struct btree_iter *dir_iter = NULL, *dirent_iter = NULL, -+ *inode_iter = NULL; -+ struct bch_hash_info dir_hash; -+ u64 inum, now = bch2_current_time(trans->c); -+ struct bkey_s_c k; -+ int ret; -+ -+ dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(dir_iter); -+ if (ret) -+ goto err; -+ -+ dir_hash = bch2_hash_info_init(trans->c, dir_u); -+ -+ dirent_iter = __bch2_dirent_lookup_trans(trans, dir_inum, &dir_hash, -+ name, BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(dirent_iter); -+ if (ret) -+ goto err; -+ -+ k = bch2_btree_iter_peek_slot(dirent_iter); -+ inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); -+ -+ inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(inode_iter); -+ if 
(ret) -+ goto err; -+ -+ dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; -+ dir_u->bi_nlink -= S_ISDIR(inode_u->bi_mode); -+ bch2_inode_nlink_dec(inode_u); -+ -+ ret = (S_ISDIR(inode_u->bi_mode) -+ ? bch2_empty_dir_trans(trans, inum) -+ : 0) ?: -+ bch2_dirent_delete_at(trans, &dir_hash, dirent_iter) ?: -+ bch2_inode_write(trans, dir_iter, dir_u) ?: -+ bch2_inode_write(trans, inode_iter, inode_u); -+err: -+ bch2_trans_iter_put(trans, inode_iter); -+ bch2_trans_iter_put(trans, dirent_iter); -+ bch2_trans_iter_put(trans, dir_iter); -+ return ret; -+} -+ -+bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u, -+ struct bch_inode_unpacked *src_u) -+{ -+ u64 src, dst; -+ unsigned id; -+ bool ret = false; -+ -+ for (id = 0; id < Inode_opt_nr; id++) { -+ if (dst_u->bi_fields_set & (1 << id)) -+ continue; -+ -+ src = bch2_inode_opt_get(src_u, id); -+ dst = bch2_inode_opt_get(dst_u, id); -+ -+ if (src == dst) -+ continue; -+ -+ bch2_inode_opt_set(dst_u, id, src); -+ ret = true; -+ } -+ -+ return ret; -+} -+ -+int bch2_rename_trans(struct btree_trans *trans, -+ u64 src_dir, struct bch_inode_unpacked *src_dir_u, -+ u64 dst_dir, struct bch_inode_unpacked *dst_dir_u, -+ struct bch_inode_unpacked *src_inode_u, -+ struct bch_inode_unpacked *dst_inode_u, -+ const struct qstr *src_name, -+ const struct qstr *dst_name, -+ enum bch_rename_mode mode) -+{ -+ struct btree_iter *src_dir_iter = NULL, *dst_dir_iter = NULL; -+ struct btree_iter *src_inode_iter = NULL, *dst_inode_iter = NULL; -+ struct bch_hash_info src_hash, dst_hash; -+ u64 src_inode, dst_inode, now = bch2_current_time(trans->c); -+ int ret; -+ -+ src_dir_iter = bch2_inode_peek(trans, src_dir_u, src_dir, -+ BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(src_dir_iter); -+ if (ret) -+ goto err; -+ -+ src_hash = bch2_hash_info_init(trans->c, src_dir_u); -+ -+ if (dst_dir != src_dir) { -+ dst_dir_iter = bch2_inode_peek(trans, dst_dir_u, dst_dir, -+ BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(dst_dir_iter); -+ 
if (ret) -+ goto err; -+ -+ dst_hash = bch2_hash_info_init(trans->c, dst_dir_u); -+ } else { -+ dst_dir_u = src_dir_u; -+ dst_hash = src_hash; -+ } -+ -+ ret = bch2_dirent_rename(trans, -+ src_dir, &src_hash, -+ dst_dir, &dst_hash, -+ src_name, &src_inode, -+ dst_name, &dst_inode, -+ mode); -+ if (ret) -+ goto err; -+ -+ src_inode_iter = bch2_inode_peek(trans, src_inode_u, src_inode, -+ BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(src_inode_iter); -+ if (ret) -+ goto err; -+ -+ if (dst_inode) { -+ dst_inode_iter = bch2_inode_peek(trans, dst_inode_u, dst_inode, -+ BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(dst_inode_iter); -+ if (ret) -+ goto err; -+ } -+ -+ if (mode == BCH_RENAME_OVERWRITE) { -+ if (S_ISDIR(src_inode_u->bi_mode) != -+ S_ISDIR(dst_inode_u->bi_mode)) { -+ ret = -ENOTDIR; -+ goto err; -+ } -+ -+ if (S_ISDIR(dst_inode_u->bi_mode) && -+ bch2_empty_dir_trans(trans, dst_inode)) { -+ ret = -ENOTEMPTY; -+ goto err; -+ } -+ } -+ -+ if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) && -+ S_ISDIR(src_inode_u->bi_mode)) { -+ ret = -EXDEV; -+ goto err; -+ } -+ -+ if (mode == BCH_RENAME_EXCHANGE && -+ bch2_reinherit_attrs(dst_inode_u, src_dir_u) && -+ S_ISDIR(dst_inode_u->bi_mode)) { -+ ret = -EXDEV; -+ goto err; -+ } -+ -+ if (S_ISDIR(src_inode_u->bi_mode)) { -+ src_dir_u->bi_nlink--; -+ dst_dir_u->bi_nlink++; -+ } -+ -+ if (dst_inode && S_ISDIR(dst_inode_u->bi_mode)) { -+ dst_dir_u->bi_nlink--; -+ src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE; -+ } -+ -+ if (mode == BCH_RENAME_OVERWRITE) -+ bch2_inode_nlink_dec(dst_inode_u); -+ -+ src_dir_u->bi_mtime = now; -+ src_dir_u->bi_ctime = now; -+ -+ if (src_dir != dst_dir) { -+ dst_dir_u->bi_mtime = now; -+ dst_dir_u->bi_ctime = now; -+ } -+ -+ src_inode_u->bi_ctime = now; -+ -+ if (dst_inode) -+ dst_inode_u->bi_ctime = now; -+ -+ ret = bch2_inode_write(trans, src_dir_iter, src_dir_u) ?: -+ (src_dir != dst_dir -+ ? 
bch2_inode_write(trans, dst_dir_iter, dst_dir_u) -+ : 0 ) ?: -+ bch2_inode_write(trans, src_inode_iter, src_inode_u) ?: -+ (dst_inode -+ ? bch2_inode_write(trans, dst_inode_iter, dst_inode_u) -+ : 0 ); -+err: -+ bch2_trans_iter_put(trans, dst_inode_iter); -+ bch2_trans_iter_put(trans, src_inode_iter); -+ bch2_trans_iter_put(trans, dst_dir_iter); -+ bch2_trans_iter_put(trans, src_dir_iter); -+ return ret; -+} -diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h -new file mode 100644 -index 000000000000..2273b7961c9b ---- /dev/null -+++ b/fs/bcachefs/fs-common.h -@@ -0,0 +1,37 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_FS_COMMON_H -+#define _BCACHEFS_FS_COMMON_H -+ -+struct posix_acl; -+ -+int bch2_create_trans(struct btree_trans *, u64, -+ struct bch_inode_unpacked *, -+ struct bch_inode_unpacked *, -+ const struct qstr *, -+ uid_t, gid_t, umode_t, dev_t, -+ struct posix_acl *, -+ struct posix_acl *); -+ -+int bch2_link_trans(struct btree_trans *, u64, -+ u64, struct bch_inode_unpacked *, -+ struct bch_inode_unpacked *, -+ const struct qstr *); -+ -+int bch2_unlink_trans(struct btree_trans *, -+ u64, struct bch_inode_unpacked *, -+ struct bch_inode_unpacked *, -+ const struct qstr *); -+ -+int bch2_rename_trans(struct btree_trans *, -+ u64, struct bch_inode_unpacked *, -+ u64, struct bch_inode_unpacked *, -+ struct bch_inode_unpacked *, -+ struct bch_inode_unpacked *, -+ const struct qstr *, -+ const struct qstr *, -+ enum bch_rename_mode); -+ -+bool bch2_reinherit_attrs(struct bch_inode_unpacked *, -+ struct bch_inode_unpacked *); -+ -+#endif /* _BCACHEFS_FS_COMMON_H */ -diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c -new file mode 100644 -index 000000000000..4ceeafcfa33c ---- /dev/null -+++ b/fs/bcachefs/fs-io.c -@@ -0,0 +1,3140 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#ifndef NO_BCACHEFS_FS -+ -+#include "bcachefs.h" -+#include "alloc_foreground.h" -+#include "bkey_on_stack.h" -+#include "btree_update.h" -+#include 
"buckets.h" -+#include "clock.h" -+#include "error.h" -+#include "extents.h" -+#include "extent_update.h" -+#include "fs.h" -+#include "fs-io.h" -+#include "fsck.h" -+#include "inode.h" -+#include "journal.h" -+#include "io.h" -+#include "keylist.h" -+#include "quota.h" -+#include "reflink.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+ -+struct quota_res { -+ u64 sectors; -+}; -+ -+struct bch_writepage_io { -+ struct closure cl; -+ struct bch_inode_info *inode; -+ -+ /* must be last: */ -+ struct bch_write_op op; -+}; -+ -+struct dio_write { -+ struct completion done; -+ struct kiocb *req; -+ struct mm_struct *mm; -+ unsigned loop:1, -+ sync:1, -+ free_iov:1; -+ struct quota_res quota_res; -+ u64 written; -+ -+ struct iov_iter iter; -+ struct iovec inline_vecs[2]; -+ -+ /* must be last: */ -+ struct bch_write_op op; -+}; -+ -+struct dio_read { -+ struct closure cl; -+ struct kiocb *req; -+ long ret; -+ struct bch_read_bio rbio; -+}; -+ -+/* pagecache_block must be held */ -+static int write_invalidate_inode_pages_range(struct address_space *mapping, -+ loff_t start, loff_t end) -+{ -+ int ret; -+ -+ /* -+ * XXX: the way this is currently implemented, we can spin if a process -+ * is continually redirtying a specific page -+ */ -+ do { -+ if (!mapping->nrpages && -+ !mapping->nrexceptional) -+ return 0; -+ -+ ret = filemap_write_and_wait_range(mapping, start, end); -+ if (ret) -+ break; -+ -+ if (!mapping->nrpages) -+ return 0; -+ -+ ret = invalidate_inode_pages2_range(mapping, -+ start >> PAGE_SHIFT, -+ end >> PAGE_SHIFT); -+ } while (ret == -EBUSY); -+ -+ return ret; -+} -+ -+/* quotas */ -+ -+#ifdef CONFIG_BCACHEFS_QUOTA -+ -+static void bch2_quota_reservation_put(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct quota_res *res) -+{ -+ if (!res->sectors) -+ return; -+ -+ mutex_lock(&inode->ei_quota_lock); -+ BUG_ON(res->sectors > 
inode->ei_quota_reserved); -+ -+ bch2_quota_acct(c, inode->ei_qid, Q_SPC, -+ -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC); -+ inode->ei_quota_reserved -= res->sectors; -+ mutex_unlock(&inode->ei_quota_lock); -+ -+ res->sectors = 0; -+} -+ -+static int bch2_quota_reservation_add(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct quota_res *res, -+ unsigned sectors, -+ bool check_enospc) -+{ -+ int ret; -+ -+ mutex_lock(&inode->ei_quota_lock); -+ ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, -+ check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK); -+ if (likely(!ret)) { -+ inode->ei_quota_reserved += sectors; -+ res->sectors += sectors; -+ } -+ mutex_unlock(&inode->ei_quota_lock); -+ -+ return ret; -+} -+ -+#else -+ -+static void bch2_quota_reservation_put(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct quota_res *res) -+{ -+} -+ -+static int bch2_quota_reservation_add(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct quota_res *res, -+ unsigned sectors, -+ bool check_enospc) -+{ -+ return 0; -+} -+ -+#endif -+ -+/* i_size updates: */ -+ -+struct inode_new_size { -+ loff_t new_size; -+ u64 now; -+ unsigned fields; -+}; -+ -+static int inode_set_size(struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, -+ void *p) -+{ -+ struct inode_new_size *s = p; -+ -+ bi->bi_size = s->new_size; -+ if (s->fields & ATTR_ATIME) -+ bi->bi_atime = s->now; -+ if (s->fields & ATTR_MTIME) -+ bi->bi_mtime = s->now; -+ if (s->fields & ATTR_CTIME) -+ bi->bi_ctime = s->now; -+ -+ return 0; -+} -+ -+int __must_check bch2_write_inode_size(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ loff_t new_size, unsigned fields) -+{ -+ struct inode_new_size s = { -+ .new_size = new_size, -+ .now = bch2_current_time(c), -+ .fields = fields, -+ }; -+ -+ return bch2_write_inode(c, inode, inode_set_size, &s, fields); -+} -+ -+static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, -+ struct quota_res *quota_res, 
s64 sectors) -+{ -+ if (!sectors) -+ return; -+ -+ mutex_lock(&inode->ei_quota_lock); -+#ifdef CONFIG_BCACHEFS_QUOTA -+ if (quota_res && sectors > 0) { -+ BUG_ON(sectors > quota_res->sectors); -+ BUG_ON(sectors > inode->ei_quota_reserved); -+ -+ quota_res->sectors -= sectors; -+ inode->ei_quota_reserved -= sectors; -+ } else { -+ bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN); -+ } -+#endif -+ inode->v.i_blocks += sectors; -+ mutex_unlock(&inode->ei_quota_lock); -+} -+ -+/* page state: */ -+ -+/* stored in page->private: */ -+ -+struct bch_page_sector { -+ /* Uncompressed, fully allocated replicas: */ -+ unsigned nr_replicas:3; -+ -+ /* Owns PAGE_SECTORS * replicas_reserved sized reservation: */ -+ unsigned replicas_reserved:3; -+ -+ /* i_sectors: */ -+ enum { -+ SECTOR_UNALLOCATED, -+ SECTOR_RESERVED, -+ SECTOR_DIRTY, -+ SECTOR_ALLOCATED, -+ } state:2; -+}; -+ -+struct bch_page_state { -+ spinlock_t lock; -+ atomic_t write_count; -+ struct bch_page_sector s[PAGE_SECTORS]; -+}; -+ -+static inline struct bch_page_state *__bch2_page_state(struct page *page) -+{ -+ return page_has_private(page) -+ ? 
(struct bch_page_state *) page_private(page) -+ : NULL; -+} -+ -+static inline struct bch_page_state *bch2_page_state(struct page *page) -+{ -+ EBUG_ON(!PageLocked(page)); -+ -+ return __bch2_page_state(page); -+} -+ -+/* for newly allocated pages: */ -+static void __bch2_page_state_release(struct page *page) -+{ -+ struct bch_page_state *s = __bch2_page_state(page); -+ -+ if (!s) -+ return; -+ -+ ClearPagePrivate(page); -+ set_page_private(page, 0); -+ put_page(page); -+ kfree(s); -+} -+ -+static void bch2_page_state_release(struct page *page) -+{ -+ struct bch_page_state *s = bch2_page_state(page); -+ -+ if (!s) -+ return; -+ -+ ClearPagePrivate(page); -+ set_page_private(page, 0); -+ put_page(page); -+ kfree(s); -+} -+ -+/* for newly allocated pages: */ -+static struct bch_page_state *__bch2_page_state_create(struct page *page, -+ gfp_t gfp) -+{ -+ struct bch_page_state *s; -+ -+ s = kzalloc(sizeof(*s), GFP_NOFS|gfp); -+ if (!s) -+ return NULL; -+ -+ spin_lock_init(&s->lock); -+ /* -+ * migrate_page_move_mapping() assumes that pages with private data -+ * have their count elevated by 1. -+ */ -+ get_page(page); -+ set_page_private(page, (unsigned long) s); -+ SetPagePrivate(page); -+ return s; -+} -+ -+static struct bch_page_state *bch2_page_state_create(struct page *page, -+ gfp_t gfp) -+{ -+ return bch2_page_state(page) ?: __bch2_page_state_create(page, gfp); -+} -+ -+static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) -+{ -+ /* XXX: this should not be open coded */ -+ return inode->ei_inode.bi_data_replicas -+ ? 
inode->ei_inode.bi_data_replicas - 1 -+ : c->opts.data_replicas; -+} -+ -+static inline unsigned sectors_to_reserve(struct bch_page_sector *s, -+ unsigned nr_replicas) -+{ -+ return max(0, (int) nr_replicas - -+ s->nr_replicas - -+ s->replicas_reserved); -+} -+ -+static int bch2_get_page_disk_reservation(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct page *page, bool check_enospc) -+{ -+ struct bch_page_state *s = bch2_page_state_create(page, 0); -+ unsigned nr_replicas = inode_nr_replicas(c, inode); -+ struct disk_reservation disk_res = { 0 }; -+ unsigned i, disk_res_sectors = 0; -+ int ret; -+ -+ if (!s) -+ return -ENOMEM; -+ -+ for (i = 0; i < ARRAY_SIZE(s->s); i++) -+ disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas); -+ -+ if (!disk_res_sectors) -+ return 0; -+ -+ ret = bch2_disk_reservation_get(c, &disk_res, -+ disk_res_sectors, 1, -+ !check_enospc -+ ? BCH_DISK_RESERVATION_NOFAIL -+ : 0); -+ if (unlikely(ret)) -+ return ret; -+ -+ for (i = 0; i < ARRAY_SIZE(s->s); i++) -+ s->s[i].replicas_reserved += -+ sectors_to_reserve(&s->s[i], nr_replicas); -+ -+ return 0; -+} -+ -+struct bch2_page_reservation { -+ struct disk_reservation disk; -+ struct quota_res quota; -+}; -+ -+static void bch2_page_reservation_init(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct bch2_page_reservation *res) -+{ -+ memset(res, 0, sizeof(*res)); -+ -+ res->disk.nr_replicas = inode_nr_replicas(c, inode); -+} -+ -+static void bch2_page_reservation_put(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct bch2_page_reservation *res) -+{ -+ bch2_disk_reservation_put(c, &res->disk); -+ bch2_quota_reservation_put(c, inode, &res->quota); -+} -+ -+static int bch2_page_reservation_get(struct bch_fs *c, -+ struct bch_inode_info *inode, struct page *page, -+ struct bch2_page_reservation *res, -+ unsigned offset, unsigned len, bool check_enospc) -+{ -+ struct bch_page_state *s = bch2_page_state_create(page, 0); -+ unsigned i, disk_sectors = 0, 
quota_sectors = 0; -+ int ret; -+ -+ if (!s) -+ return -ENOMEM; -+ -+ for (i = round_down(offset, block_bytes(c)) >> 9; -+ i < round_up(offset + len, block_bytes(c)) >> 9; -+ i++) { -+ disk_sectors += sectors_to_reserve(&s->s[i], -+ res->disk.nr_replicas); -+ quota_sectors += s->s[i].state == SECTOR_UNALLOCATED; -+ } -+ -+ if (disk_sectors) { -+ ret = bch2_disk_reservation_add(c, &res->disk, -+ disk_sectors, -+ !check_enospc -+ ? BCH_DISK_RESERVATION_NOFAIL -+ : 0); -+ if (unlikely(ret)) -+ return ret; -+ } -+ -+ if (quota_sectors) { -+ ret = bch2_quota_reservation_add(c, inode, &res->quota, -+ quota_sectors, -+ check_enospc); -+ if (unlikely(ret)) { -+ struct disk_reservation tmp = { -+ .sectors = disk_sectors -+ }; -+ -+ bch2_disk_reservation_put(c, &tmp); -+ res->disk.sectors -= disk_sectors; -+ return ret; -+ } -+ } -+ -+ return 0; -+} -+ -+static void bch2_clear_page_bits(struct page *page) -+{ -+ struct bch_inode_info *inode = to_bch_ei(page->mapping->host); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch_page_state *s = bch2_page_state(page); -+ struct disk_reservation disk_res = { 0 }; -+ int i, dirty_sectors = 0; -+ -+ if (!s) -+ return; -+ -+ EBUG_ON(!PageLocked(page)); -+ EBUG_ON(PageWriteback(page)); -+ -+ for (i = 0; i < ARRAY_SIZE(s->s); i++) { -+ disk_res.sectors += s->s[i].replicas_reserved; -+ s->s[i].replicas_reserved = 0; -+ -+ if (s->s[i].state == SECTOR_DIRTY) { -+ dirty_sectors++; -+ s->s[i].state = SECTOR_UNALLOCATED; -+ } -+ } -+ -+ bch2_disk_reservation_put(c, &disk_res); -+ -+ if (dirty_sectors) -+ i_sectors_acct(c, inode, NULL, -dirty_sectors); -+ -+ bch2_page_state_release(page); -+} -+ -+static void bch2_set_page_dirty(struct bch_fs *c, -+ struct bch_inode_info *inode, struct page *page, -+ struct bch2_page_reservation *res, -+ unsigned offset, unsigned len) -+{ -+ struct bch_page_state *s = bch2_page_state(page); -+ unsigned i, dirty_sectors = 0; -+ -+ WARN_ON((u64) page_offset(page) + offset + len > -+ round_up((u64) 
i_size_read(&inode->v), block_bytes(c))); -+ -+ spin_lock(&s->lock); -+ -+ for (i = round_down(offset, block_bytes(c)) >> 9; -+ i < round_up(offset + len, block_bytes(c)) >> 9; -+ i++) { -+ unsigned sectors = sectors_to_reserve(&s->s[i], -+ res->disk.nr_replicas); -+ -+ /* -+ * This can happen if we race with the error path in -+ * bch2_writepage_io_done(): -+ */ -+ sectors = min_t(unsigned, sectors, res->disk.sectors); -+ -+ s->s[i].replicas_reserved += sectors; -+ res->disk.sectors -= sectors; -+ -+ if (s->s[i].state == SECTOR_UNALLOCATED) -+ dirty_sectors++; -+ -+ s->s[i].state = max_t(unsigned, s->s[i].state, SECTOR_DIRTY); -+ } -+ -+ spin_unlock(&s->lock); -+ -+ if (dirty_sectors) -+ i_sectors_acct(c, inode, &res->quota, dirty_sectors); -+ -+ if (!PageDirty(page)) -+ __set_page_dirty_nobuffers(page); -+} -+ -+vm_fault_t bch2_page_fault(struct vm_fault *vmf) -+{ -+ struct file *file = vmf->vma->vm_file; -+ struct bch_inode_info *inode = file_bch_inode(file); -+ int ret; -+ -+ bch2_pagecache_add_get(&inode->ei_pagecache_lock); -+ ret = filemap_fault(vmf); -+ bch2_pagecache_add_put(&inode->ei_pagecache_lock); -+ -+ return ret; -+} -+ -+vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) -+{ -+ struct page *page = vmf->page; -+ struct file *file = vmf->vma->vm_file; -+ struct bch_inode_info *inode = file_bch_inode(file); -+ struct address_space *mapping = file->f_mapping; -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch2_page_reservation res; -+ unsigned len; -+ loff_t isize; -+ int ret = VM_FAULT_LOCKED; -+ -+ bch2_page_reservation_init(c, inode, &res); -+ -+ sb_start_pagefault(inode->v.i_sb); -+ file_update_time(file); -+ -+ /* -+ * Not strictly necessary, but helps avoid dio writes livelocking in -+ * write_invalidate_inode_pages_range() - can drop this if/when we get -+ * a write_invalidate_inode_pages_range() that works without dropping -+ * page lock before invalidating page -+ */ -+ bch2_pagecache_add_get(&inode->ei_pagecache_lock); -+ -+ 
lock_page(page); -+ isize = i_size_read(&inode->v); -+ -+ if (page->mapping != mapping || page_offset(page) >= isize) { -+ unlock_page(page); -+ ret = VM_FAULT_NOPAGE; -+ goto out; -+ } -+ -+ len = min_t(loff_t, PAGE_SIZE, isize - page_offset(page)); -+ -+ if (bch2_page_reservation_get(c, inode, page, &res, 0, len, true)) { -+ unlock_page(page); -+ ret = VM_FAULT_SIGBUS; -+ goto out; -+ } -+ -+ bch2_set_page_dirty(c, inode, page, &res, 0, len); -+ bch2_page_reservation_put(c, inode, &res); -+ -+ wait_for_stable_page(page); -+out: -+ bch2_pagecache_add_put(&inode->ei_pagecache_lock); -+ sb_end_pagefault(inode->v.i_sb); -+ -+ return ret; -+} -+ -+void bch2_invalidatepage(struct page *page, unsigned int offset, -+ unsigned int length) -+{ -+ if (offset || length < PAGE_SIZE) -+ return; -+ -+ bch2_clear_page_bits(page); -+} -+ -+int bch2_releasepage(struct page *page, gfp_t gfp_mask) -+{ -+ if (PageDirty(page)) -+ return 0; -+ -+ bch2_clear_page_bits(page); -+ return 1; -+} -+ -+#ifdef CONFIG_MIGRATION -+int bch2_migrate_page(struct address_space *mapping, struct page *newpage, -+ struct page *page, enum migrate_mode mode) -+{ -+ int ret; -+ -+ EBUG_ON(!PageLocked(page)); -+ EBUG_ON(!PageLocked(newpage)); -+ -+ ret = migrate_page_move_mapping(mapping, newpage, page, 0); -+ if (ret != MIGRATEPAGE_SUCCESS) -+ return ret; -+ -+ if (PagePrivate(page)) { -+ ClearPagePrivate(page); -+ get_page(newpage); -+ set_page_private(newpage, page_private(page)); -+ set_page_private(page, 0); -+ put_page(page); -+ SetPagePrivate(newpage); -+ } -+ -+ if (mode != MIGRATE_SYNC_NO_COPY) -+ migrate_page_copy(newpage, page); -+ else -+ migrate_page_states(newpage, page); -+ return MIGRATEPAGE_SUCCESS; -+} -+#endif -+ -+/* readpage(s): */ -+ -+static void bch2_readpages_end_io(struct bio *bio) -+{ -+ struct bvec_iter_all iter; -+ struct bio_vec *bv; -+ -+ bio_for_each_segment_all(bv, bio, iter) { -+ struct page *page = bv->bv_page; -+ -+ if (!bio->bi_status) { -+ SetPageUptodate(page); -+ } 
else { -+ ClearPageUptodate(page); -+ SetPageError(page); -+ } -+ unlock_page(page); -+ } -+ -+ bio_put(bio); -+} -+ -+static inline void page_state_init_for_read(struct page *page) -+{ -+ SetPagePrivate(page); -+ page->private = 0; -+} -+ -+struct readpages_iter { -+ struct address_space *mapping; -+ struct page **pages; -+ unsigned nr_pages; -+ unsigned nr_added; -+ unsigned idx; -+ pgoff_t offset; -+}; -+ -+static int readpages_iter_init(struct readpages_iter *iter, -+ struct address_space *mapping, -+ struct list_head *pages, unsigned nr_pages) -+{ -+ memset(iter, 0, sizeof(*iter)); -+ -+ iter->mapping = mapping; -+ iter->offset = list_last_entry(pages, struct page, lru)->index; -+ -+ iter->pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS); -+ if (!iter->pages) -+ return -ENOMEM; -+ -+ while (!list_empty(pages)) { -+ struct page *page = list_last_entry(pages, struct page, lru); -+ -+ __bch2_page_state_create(page, __GFP_NOFAIL); -+ -+ iter->pages[iter->nr_pages++] = page; -+ list_del(&page->lru); -+ } -+ -+ return 0; -+} -+ -+static inline struct page *readpage_iter_next(struct readpages_iter *iter) -+{ -+ struct page *page; -+ unsigned i; -+ int ret; -+ -+ BUG_ON(iter->idx > iter->nr_added); -+ BUG_ON(iter->nr_added > iter->nr_pages); -+ -+ if (iter->idx < iter->nr_added) -+ goto out; -+ -+ while (1) { -+ if (iter->idx == iter->nr_pages) -+ return NULL; -+ -+ ret = add_to_page_cache_lru_vec(iter->mapping, -+ iter->pages + iter->nr_added, -+ iter->nr_pages - iter->nr_added, -+ iter->offset + iter->nr_added, -+ GFP_NOFS); -+ if (ret > 0) -+ break; -+ -+ page = iter->pages[iter->nr_added]; -+ iter->idx++; -+ iter->nr_added++; -+ -+ __bch2_page_state_release(page); -+ put_page(page); -+ } -+ -+ iter->nr_added += ret; -+ -+ for (i = iter->idx; i < iter->nr_added; i++) -+ put_page(iter->pages[i]); -+out: -+ EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx); -+ -+ return iter->pages[iter->idx]; -+} -+ -+static void 
bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) -+{ -+ struct bvec_iter iter; -+ struct bio_vec bv; -+ unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v -+ ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k); -+ unsigned state = k.k->type == KEY_TYPE_reservation -+ ? SECTOR_RESERVED -+ : SECTOR_ALLOCATED; -+ -+ bio_for_each_segment(bv, bio, iter) { -+ struct bch_page_state *s = bch2_page_state(bv.bv_page); -+ unsigned i; -+ -+ for (i = bv.bv_offset >> 9; -+ i < (bv.bv_offset + bv.bv_len) >> 9; -+ i++) { -+ s->s[i].nr_replicas = nr_ptrs; -+ s->s[i].state = state; -+ } -+ } -+} -+ -+static bool extent_partial_reads_expensive(struct bkey_s_c k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ struct bch_extent_crc_unpacked crc; -+ const union bch_extent_entry *i; -+ -+ bkey_for_each_crc(k.k, ptrs, crc, i) -+ if (crc.csum_type || crc.compression_type) -+ return true; -+ return false; -+} -+ -+static void readpage_bio_extend(struct readpages_iter *iter, -+ struct bio *bio, -+ unsigned sectors_this_extent, -+ bool get_more) -+{ -+ while (bio_sectors(bio) < sectors_this_extent && -+ bio->bi_vcnt < bio->bi_max_vecs) { -+ pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTOR_SHIFT; -+ struct page *page = readpage_iter_next(iter); -+ int ret; -+ -+ if (page) { -+ if (iter->offset + iter->idx != page_offset) -+ break; -+ -+ iter->idx++; -+ } else { -+ if (!get_more) -+ break; -+ -+ page = xa_load(&iter->mapping->i_pages, page_offset); -+ if (page && !xa_is_value(page)) -+ break; -+ -+ page = __page_cache_alloc(readahead_gfp_mask(iter->mapping)); -+ if (!page) -+ break; -+ -+ if (!__bch2_page_state_create(page, 0)) { -+ put_page(page); -+ break; -+ } -+ -+ ret = add_to_page_cache_lru(page, iter->mapping, -+ page_offset, GFP_NOFS); -+ if (ret) { -+ __bch2_page_state_release(page); -+ put_page(page); -+ break; -+ } -+ -+ put_page(page); -+ } -+ -+ BUG_ON(!bio_add_page(bio, page, PAGE_SIZE, 0)); -+ } -+} -+ -+static void bchfs_read(struct btree_trans *trans, struct 
btree_iter *iter, -+ struct bch_read_bio *rbio, u64 inum, -+ struct readpages_iter *readpages_iter) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_on_stack sk; -+ int flags = BCH_READ_RETRY_IF_STALE| -+ BCH_READ_MAY_PROMOTE; -+ int ret = 0; -+ -+ rbio->c = c; -+ rbio->start_time = local_clock(); -+ -+ bkey_on_stack_init(&sk); -+retry: -+ while (1) { -+ struct bkey_s_c k; -+ unsigned bytes, sectors, offset_into_extent; -+ -+ bch2_btree_iter_set_pos(iter, -+ POS(inum, rbio->bio.bi_iter.bi_sector)); -+ -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) -+ break; -+ -+ bkey_on_stack_reassemble(&sk, c, k); -+ k = bkey_i_to_s_c(sk.k); -+ -+ offset_into_extent = iter->pos.offset - -+ bkey_start_offset(k.k); -+ sectors = k.k->size - offset_into_extent; -+ -+ ret = bch2_read_indirect_extent(trans, -+ &offset_into_extent, &sk); -+ if (ret) -+ break; -+ -+ sectors = min(sectors, k.k->size - offset_into_extent); -+ -+ bch2_trans_unlock(trans); -+ -+ if (readpages_iter) -+ readpage_bio_extend(readpages_iter, &rbio->bio, sectors, -+ extent_partial_reads_expensive(k)); -+ -+ bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; -+ swap(rbio->bio.bi_iter.bi_size, bytes); -+ -+ if (rbio->bio.bi_iter.bi_size == bytes) -+ flags |= BCH_READ_LAST_FRAGMENT; -+ -+ if (bkey_extent_is_allocation(k.k)) -+ bch2_add_page_sectors(&rbio->bio, k); -+ -+ bch2_read_extent(trans, rbio, k, offset_into_extent, flags); -+ -+ if (flags & BCH_READ_LAST_FRAGMENT) -+ break; -+ -+ swap(rbio->bio.bi_iter.bi_size, bytes); -+ bio_advance(&rbio->bio, bytes); -+ } -+ -+ if (ret == -EINTR) -+ goto retry; -+ -+ if (ret) { -+ bcache_io_error(c, &rbio->bio, "btree IO error %i", ret); -+ bio_endio(&rbio->bio); -+ } -+ -+ bkey_on_stack_exit(&sk, c); -+} -+ -+int bch2_readpages(struct file *file, struct address_space *mapping, -+ struct list_head *pages, unsigned nr_pages) -+{ -+ struct bch_inode_info *inode = to_bch_ei(mapping->host); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct 
bch_io_opts opts = io_opts(c, &inode->ei_inode); -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct page *page; -+ struct readpages_iter readpages_iter; -+ int ret; -+ -+ ret = readpages_iter_init(&readpages_iter, mapping, pages, nr_pages); -+ BUG_ON(ret); -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, -+ BTREE_ITER_SLOTS); -+ -+ bch2_pagecache_add_get(&inode->ei_pagecache_lock); -+ -+ while ((page = readpage_iter_next(&readpages_iter))) { -+ pgoff_t index = readpages_iter.offset + readpages_iter.idx; -+ unsigned n = min_t(unsigned, -+ readpages_iter.nr_pages - -+ readpages_iter.idx, -+ BIO_MAX_PAGES); -+ struct bch_read_bio *rbio = -+ rbio_init(bio_alloc_bioset(GFP_NOFS, n, &c->bio_read), -+ opts); -+ -+ readpages_iter.idx++; -+ -+ bio_set_op_attrs(&rbio->bio, REQ_OP_READ, 0); -+ rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTOR_SHIFT; -+ rbio->bio.bi_end_io = bch2_readpages_end_io; -+ BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); -+ -+ bchfs_read(&trans, iter, rbio, inode->v.i_ino, -+ &readpages_iter); -+ } -+ -+ bch2_pagecache_add_put(&inode->ei_pagecache_lock); -+ -+ bch2_trans_exit(&trans); -+ kfree(readpages_iter.pages); -+ -+ return 0; -+} -+ -+static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, -+ u64 inum, struct page *page) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ -+ bch2_page_state_create(page, __GFP_NOFAIL); -+ -+ bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC); -+ rbio->bio.bi_iter.bi_sector = -+ (sector_t) page->index << PAGE_SECTOR_SHIFT; -+ BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, -+ BTREE_ITER_SLOTS); -+ -+ bchfs_read(&trans, iter, rbio, inum, NULL); -+ -+ bch2_trans_exit(&trans); -+} -+ -+int bch2_readpage(struct file *file, struct page *page) -+{ -+ struct bch_inode_info *inode = 
to_bch_ei(page->mapping->host); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch_io_opts opts = io_opts(c, &inode->ei_inode); -+ struct bch_read_bio *rbio; -+ -+ rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), opts); -+ rbio->bio.bi_end_io = bch2_readpages_end_io; -+ -+ __bchfs_readpage(c, rbio, inode->v.i_ino, page); -+ return 0; -+} -+ -+static void bch2_read_single_page_end_io(struct bio *bio) -+{ -+ complete(bio->bi_private); -+} -+ -+static int bch2_read_single_page(struct page *page, -+ struct address_space *mapping) -+{ -+ struct bch_inode_info *inode = to_bch_ei(mapping->host); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch_read_bio *rbio; -+ int ret; -+ DECLARE_COMPLETION_ONSTACK(done); -+ -+ rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), -+ io_opts(c, &inode->ei_inode)); -+ rbio->bio.bi_private = &done; -+ rbio->bio.bi_end_io = bch2_read_single_page_end_io; -+ -+ __bchfs_readpage(c, rbio, inode->v.i_ino, page); -+ wait_for_completion(&done); -+ -+ ret = blk_status_to_errno(rbio->bio.bi_status); -+ bio_put(&rbio->bio); -+ -+ if (ret < 0) -+ return ret; -+ -+ SetPageUptodate(page); -+ return 0; -+} -+ -+/* writepages: */ -+ -+struct bch_writepage_state { -+ struct bch_writepage_io *io; -+ struct bch_io_opts opts; -+}; -+ -+static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c, -+ struct bch_inode_info *inode) -+{ -+ return (struct bch_writepage_state) { -+ .opts = io_opts(c, &inode->ei_inode) -+ }; -+} -+ -+static void bch2_writepage_io_free(struct closure *cl) -+{ -+ struct bch_writepage_io *io = container_of(cl, -+ struct bch_writepage_io, cl); -+ -+ bio_put(&io->op.wbio.bio); -+} -+ -+static void bch2_writepage_io_done(struct closure *cl) -+{ -+ struct bch_writepage_io *io = container_of(cl, -+ struct bch_writepage_io, cl); -+ struct bch_fs *c = io->op.c; -+ struct bio *bio = &io->op.wbio.bio; -+ struct bvec_iter_all iter; -+ struct bio_vec *bvec; -+ unsigned i; -+ -+ 
if (io->op.error) { -+ bio_for_each_segment_all(bvec, bio, iter) { -+ struct bch_page_state *s; -+ -+ SetPageError(bvec->bv_page); -+ mapping_set_error(bvec->bv_page->mapping, -EIO); -+ -+ s = __bch2_page_state(bvec->bv_page); -+ spin_lock(&s->lock); -+ for (i = 0; i < PAGE_SECTORS; i++) -+ s->s[i].nr_replicas = 0; -+ spin_unlock(&s->lock); -+ } -+ } -+ -+ if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { -+ bio_for_each_segment_all(bvec, bio, iter) { -+ struct bch_page_state *s; -+ -+ s = __bch2_page_state(bvec->bv_page); -+ spin_lock(&s->lock); -+ for (i = 0; i < PAGE_SECTORS; i++) -+ s->s[i].nr_replicas = 0; -+ spin_unlock(&s->lock); -+ } -+ } -+ -+ /* -+ * racing with fallocate can cause us to add fewer sectors than -+ * expected - but we shouldn't add more sectors than expected: -+ */ -+ BUG_ON(io->op.i_sectors_delta > 0); -+ -+ /* -+ * (error (due to going RO) halfway through a page can screw that up -+ * slightly) -+ * XXX wtf? -+ BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS); -+ */ -+ -+ /* -+ * PageWriteback is effectively our ref on the inode - fixup i_blocks -+ * before calling end_page_writeback: -+ */ -+ i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); -+ -+ bio_for_each_segment_all(bvec, bio, iter) { -+ struct bch_page_state *s = __bch2_page_state(bvec->bv_page); -+ -+ if (atomic_dec_and_test(&s->write_count)) -+ end_page_writeback(bvec->bv_page); -+ } -+ -+ closure_return_with_destructor(&io->cl, bch2_writepage_io_free); -+} -+ -+static void bch2_writepage_do_io(struct bch_writepage_state *w) -+{ -+ struct bch_writepage_io *io = w->io; -+ -+ w->io = NULL; -+ closure_call(&io->op.cl, bch2_write, NULL, &io->cl); -+ continue_at(&io->cl, bch2_writepage_io_done, NULL); -+} -+ -+/* -+ * Get a bch_writepage_io and add @page to it - appending to an existing one if -+ * possible, else allocating a new one: -+ */ -+static void bch2_writepage_io_alloc(struct bch_fs *c, -+ struct writeback_control *wbc, -+ struct bch_writepage_state *w, -+ struct 
bch_inode_info *inode, -+ u64 sector, -+ unsigned nr_replicas) -+{ -+ struct bch_write_op *op; -+ -+ w->io = container_of(bio_alloc_bioset(GFP_NOFS, -+ BIO_MAX_PAGES, -+ &c->writepage_bioset), -+ struct bch_writepage_io, op.wbio.bio); -+ -+ closure_init(&w->io->cl, NULL); -+ w->io->inode = inode; -+ -+ op = &w->io->op; -+ bch2_write_op_init(op, c, w->opts); -+ op->target = w->opts.foreground_target; -+ op_journal_seq_set(op, &inode->ei_journal_seq); -+ op->nr_replicas = nr_replicas; -+ op->res.nr_replicas = nr_replicas; -+ op->write_point = writepoint_hashed(inode->ei_last_dirtied); -+ op->pos = POS(inode->v.i_ino, sector); -+ op->wbio.bio.bi_iter.bi_sector = sector; -+ op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); -+} -+ -+static int __bch2_writepage(struct page *page, -+ struct writeback_control *wbc, -+ void *data) -+{ -+ struct bch_inode_info *inode = to_bch_ei(page->mapping->host); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch_writepage_state *w = data; -+ struct bch_page_state *s, orig; -+ unsigned i, offset, nr_replicas_this_write = U32_MAX; -+ loff_t i_size = i_size_read(&inode->v); -+ pgoff_t end_index = i_size >> PAGE_SHIFT; -+ int ret; -+ -+ EBUG_ON(!PageUptodate(page)); -+ -+ /* Is the page fully inside i_size? */ -+ if (page->index < end_index) -+ goto do_io; -+ -+ /* Is the page fully outside i_size? (truncate in progress) */ -+ offset = i_size & (PAGE_SIZE - 1); -+ if (page->index > end_index || !offset) { -+ unlock_page(page); -+ return 0; -+ } -+ -+ /* -+ * The page straddles i_size. It must be zeroed out on each and every -+ * writepage invocation because it may be mmapped. "A file is mapped -+ * in multiples of the page size. For a file that is not a multiple of -+ * the page size, the remaining memory is zeroed when mapped, and -+ * writes to that region are not written out to the file." 
-+ */ -+ zero_user_segment(page, offset, PAGE_SIZE); -+do_io: -+ s = bch2_page_state_create(page, __GFP_NOFAIL); -+ -+ ret = bch2_get_page_disk_reservation(c, inode, page, true); -+ if (ret) { -+ SetPageError(page); -+ mapping_set_error(page->mapping, ret); -+ unlock_page(page); -+ return 0; -+ } -+ -+ /* Before unlocking the page, get copy of reservations: */ -+ orig = *s; -+ -+ for (i = 0; i < PAGE_SECTORS; i++) { -+ if (s->s[i].state < SECTOR_DIRTY) -+ continue; -+ -+ nr_replicas_this_write = -+ min_t(unsigned, nr_replicas_this_write, -+ s->s[i].nr_replicas + -+ s->s[i].replicas_reserved); -+ } -+ -+ for (i = 0; i < PAGE_SECTORS; i++) { -+ if (s->s[i].state < SECTOR_DIRTY) -+ continue; -+ -+ s->s[i].nr_replicas = w->opts.compression -+ ? 0 : nr_replicas_this_write; -+ -+ s->s[i].replicas_reserved = 0; -+ s->s[i].state = SECTOR_ALLOCATED; -+ } -+ -+ BUG_ON(atomic_read(&s->write_count)); -+ atomic_set(&s->write_count, 1); -+ -+ BUG_ON(PageWriteback(page)); -+ set_page_writeback(page); -+ -+ unlock_page(page); -+ -+ offset = 0; -+ while (1) { -+ unsigned sectors = 1, dirty_sectors = 0, reserved_sectors = 0; -+ u64 sector; -+ -+ while (offset < PAGE_SECTORS && -+ orig.s[offset].state < SECTOR_DIRTY) -+ offset++; -+ -+ if (offset == PAGE_SECTORS) -+ break; -+ -+ sector = ((u64) page->index << PAGE_SECTOR_SHIFT) + offset; -+ -+ while (offset + sectors < PAGE_SECTORS && -+ orig.s[offset + sectors].state >= SECTOR_DIRTY) -+ sectors++; -+ -+ for (i = offset; i < offset + sectors; i++) { -+ reserved_sectors += orig.s[i].replicas_reserved; -+ dirty_sectors += orig.s[i].state == SECTOR_DIRTY; -+ } -+ -+ if (w->io && -+ (w->io->op.res.nr_replicas != nr_replicas_this_write || -+ bio_full(&w->io->op.wbio.bio, PAGE_SIZE) || -+ w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >= -+ (BIO_MAX_PAGES * PAGE_SIZE) || -+ bio_end_sector(&w->io->op.wbio.bio) != sector)) -+ bch2_writepage_do_io(w); -+ -+ if (!w->io) -+ bch2_writepage_io_alloc(c, wbc, w, inode, sector, -+ 
nr_replicas_this_write); -+ -+ atomic_inc(&s->write_count); -+ -+ BUG_ON(inode != w->io->inode); -+ BUG_ON(!bio_add_page(&w->io->op.wbio.bio, page, -+ sectors << 9, offset << 9)); -+ -+ /* Check for writing past i_size: */ -+ WARN_ON((bio_end_sector(&w->io->op.wbio.bio) << 9) > -+ round_up(i_size, block_bytes(c))); -+ -+ w->io->op.res.sectors += reserved_sectors; -+ w->io->op.i_sectors_delta -= dirty_sectors; -+ w->io->op.new_i_size = i_size; -+ -+ offset += sectors; -+ } -+ -+ if (atomic_dec_and_test(&s->write_count)) -+ end_page_writeback(page); -+ -+ return 0; -+} -+ -+int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc) -+{ -+ struct bch_fs *c = mapping->host->i_sb->s_fs_info; -+ struct bch_writepage_state w = -+ bch_writepage_state_init(c, to_bch_ei(mapping->host)); -+ struct blk_plug plug; -+ int ret; -+ -+ blk_start_plug(&plug); -+ ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w); -+ if (w.io) -+ bch2_writepage_do_io(&w); -+ blk_finish_plug(&plug); -+ return ret; -+} -+ -+int bch2_writepage(struct page *page, struct writeback_control *wbc) -+{ -+ struct bch_fs *c = page->mapping->host->i_sb->s_fs_info; -+ struct bch_writepage_state w = -+ bch_writepage_state_init(c, to_bch_ei(page->mapping->host)); -+ int ret; -+ -+ ret = __bch2_writepage(page, wbc, &w); -+ if (w.io) -+ bch2_writepage_do_io(&w); -+ -+ return ret; -+} -+ -+/* buffered writes: */ -+ -+int bch2_write_begin(struct file *file, struct address_space *mapping, -+ loff_t pos, unsigned len, unsigned flags, -+ struct page **pagep, void **fsdata) -+{ -+ struct bch_inode_info *inode = to_bch_ei(mapping->host); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch2_page_reservation *res; -+ pgoff_t index = pos >> PAGE_SHIFT; -+ unsigned offset = pos & (PAGE_SIZE - 1); -+ struct page *page; -+ int ret = -ENOMEM; -+ -+ res = kmalloc(sizeof(*res), GFP_KERNEL); -+ if (!res) -+ return -ENOMEM; -+ -+ bch2_page_reservation_init(c, inode, res); -+ *fsdata = res; -+ 
-+ bch2_pagecache_add_get(&inode->ei_pagecache_lock); -+ -+ page = grab_cache_page_write_begin(mapping, index, flags); -+ if (!page) -+ goto err_unlock; -+ -+ if (PageUptodate(page)) -+ goto out; -+ -+ /* If we're writing entire page, don't need to read it in first: */ -+ if (len == PAGE_SIZE) -+ goto out; -+ -+ if (!offset && pos + len >= inode->v.i_size) { -+ zero_user_segment(page, len, PAGE_SIZE); -+ flush_dcache_page(page); -+ goto out; -+ } -+ -+ if (index > inode->v.i_size >> PAGE_SHIFT) { -+ zero_user_segments(page, 0, offset, offset + len, PAGE_SIZE); -+ flush_dcache_page(page); -+ goto out; -+ } -+readpage: -+ ret = bch2_read_single_page(page, mapping); -+ if (ret) -+ goto err; -+out: -+ ret = bch2_page_reservation_get(c, inode, page, res, -+ offset, len, true); -+ if (ret) { -+ if (!PageUptodate(page)) { -+ /* -+ * If the page hasn't been read in, we won't know if we -+ * actually need a reservation - we don't actually need -+ * to read here, we just need to check if the page is -+ * fully backed by uncompressed data: -+ */ -+ goto readpage; -+ } -+ -+ goto err; -+ } -+ -+ *pagep = page; -+ return 0; -+err: -+ unlock_page(page); -+ put_page(page); -+ *pagep = NULL; -+err_unlock: -+ bch2_pagecache_add_put(&inode->ei_pagecache_lock); -+ kfree(res); -+ *fsdata = NULL; -+ return ret; -+} -+ -+int bch2_write_end(struct file *file, struct address_space *mapping, -+ loff_t pos, unsigned len, unsigned copied, -+ struct page *page, void *fsdata) -+{ -+ struct bch_inode_info *inode = to_bch_ei(mapping->host); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch2_page_reservation *res = fsdata; -+ unsigned offset = pos & (PAGE_SIZE - 1); -+ -+ lockdep_assert_held(&inode->v.i_rwsem); -+ -+ if (unlikely(copied < len && !PageUptodate(page))) { -+ /* -+ * The page needs to be read in, but that would destroy -+ * our partial write - simplest thing is to just force -+ * userspace to redo the write: -+ */ -+ zero_user(page, 0, PAGE_SIZE); -+ 
flush_dcache_page(page); -+ copied = 0; -+ } -+ -+ spin_lock(&inode->v.i_lock); -+ if (pos + copied > inode->v.i_size) -+ i_size_write(&inode->v, pos + copied); -+ spin_unlock(&inode->v.i_lock); -+ -+ if (copied) { -+ if (!PageUptodate(page)) -+ SetPageUptodate(page); -+ -+ bch2_set_page_dirty(c, inode, page, res, offset, copied); -+ -+ inode->ei_last_dirtied = (unsigned long) current; -+ } -+ -+ unlock_page(page); -+ put_page(page); -+ bch2_pagecache_add_put(&inode->ei_pagecache_lock); -+ -+ bch2_page_reservation_put(c, inode, res); -+ kfree(res); -+ -+ return copied; -+} -+ -+#define WRITE_BATCH_PAGES 32 -+ -+static int __bch2_buffered_write(struct bch_inode_info *inode, -+ struct address_space *mapping, -+ struct iov_iter *iter, -+ loff_t pos, unsigned len) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct page *pages[WRITE_BATCH_PAGES]; -+ struct bch2_page_reservation res; -+ unsigned long index = pos >> PAGE_SHIFT; -+ unsigned offset = pos & (PAGE_SIZE - 1); -+ unsigned nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); -+ unsigned i, reserved = 0, set_dirty = 0; -+ unsigned copied = 0, nr_pages_copied = 0; -+ int ret = 0; -+ -+ BUG_ON(!len); -+ BUG_ON(nr_pages > ARRAY_SIZE(pages)); -+ -+ bch2_page_reservation_init(c, inode, &res); -+ -+ for (i = 0; i < nr_pages; i++) { -+ pages[i] = grab_cache_page_write_begin(mapping, index + i, 0); -+ if (!pages[i]) { -+ nr_pages = i; -+ if (!i) { -+ ret = -ENOMEM; -+ goto out; -+ } -+ len = min_t(unsigned, len, -+ nr_pages * PAGE_SIZE - offset); -+ break; -+ } -+ } -+ -+ if (offset && !PageUptodate(pages[0])) { -+ ret = bch2_read_single_page(pages[0], mapping); -+ if (ret) -+ goto out; -+ } -+ -+ if ((pos + len) & (PAGE_SIZE - 1) && -+ !PageUptodate(pages[nr_pages - 1])) { -+ if ((index + nr_pages - 1) << PAGE_SHIFT >= inode->v.i_size) { -+ zero_user(pages[nr_pages - 1], 0, PAGE_SIZE); -+ } else { -+ ret = bch2_read_single_page(pages[nr_pages - 1], mapping); -+ if (ret) -+ goto out; -+ } -+ } -+ -+ while 
(reserved < len) { -+ struct page *page = pages[(offset + reserved) >> PAGE_SHIFT]; -+ unsigned pg_offset = (offset + reserved) & (PAGE_SIZE - 1); -+ unsigned pg_len = min_t(unsigned, len - reserved, -+ PAGE_SIZE - pg_offset); -+retry_reservation: -+ ret = bch2_page_reservation_get(c, inode, page, &res, -+ pg_offset, pg_len, true); -+ -+ if (ret && !PageUptodate(page)) { -+ ret = bch2_read_single_page(page, mapping); -+ if (!ret) -+ goto retry_reservation; -+ } -+ -+ if (ret) -+ goto out; -+ -+ reserved += pg_len; -+ } -+ -+ if (mapping_writably_mapped(mapping)) -+ for (i = 0; i < nr_pages; i++) -+ flush_dcache_page(pages[i]); -+ -+ while (copied < len) { -+ struct page *page = pages[(offset + copied) >> PAGE_SHIFT]; -+ unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1); -+ unsigned pg_len = min_t(unsigned, len - copied, -+ PAGE_SIZE - pg_offset); -+ unsigned pg_copied = iov_iter_copy_from_user_atomic(page, -+ iter, pg_offset, pg_len); -+ -+ if (!pg_copied) -+ break; -+ -+ if (!PageUptodate(page) && -+ pg_copied != PAGE_SIZE && -+ pos + copied + pg_copied < inode->v.i_size) { -+ zero_user(page, 0, PAGE_SIZE); -+ break; -+ } -+ -+ flush_dcache_page(page); -+ iov_iter_advance(iter, pg_copied); -+ copied += pg_copied; -+ -+ if (pg_copied != pg_len) -+ break; -+ } -+ -+ if (!copied) -+ goto out; -+ -+ spin_lock(&inode->v.i_lock); -+ if (pos + copied > inode->v.i_size) -+ i_size_write(&inode->v, pos + copied); -+ spin_unlock(&inode->v.i_lock); -+ -+ while (set_dirty < copied) { -+ struct page *page = pages[(offset + set_dirty) >> PAGE_SHIFT]; -+ unsigned pg_offset = (offset + set_dirty) & (PAGE_SIZE - 1); -+ unsigned pg_len = min_t(unsigned, copied - set_dirty, -+ PAGE_SIZE - pg_offset); -+ -+ if (!PageUptodate(page)) -+ SetPageUptodate(page); -+ -+ bch2_set_page_dirty(c, inode, page, &res, pg_offset, pg_len); -+ unlock_page(page); -+ put_page(page); -+ -+ set_dirty += pg_len; -+ } -+ -+ nr_pages_copied = DIV_ROUND_UP(offset + copied, PAGE_SIZE); -+ 
inode->ei_last_dirtied = (unsigned long) current; -+out: -+ for (i = nr_pages_copied; i < nr_pages; i++) { -+ unlock_page(pages[i]); -+ put_page(pages[i]); -+ } -+ -+ bch2_page_reservation_put(c, inode, &res); -+ -+ return copied ?: ret; -+} -+ -+static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) -+{ -+ struct file *file = iocb->ki_filp; -+ struct address_space *mapping = file->f_mapping; -+ struct bch_inode_info *inode = file_bch_inode(file); -+ loff_t pos = iocb->ki_pos; -+ ssize_t written = 0; -+ int ret = 0; -+ -+ bch2_pagecache_add_get(&inode->ei_pagecache_lock); -+ -+ do { -+ unsigned offset = pos & (PAGE_SIZE - 1); -+ unsigned bytes = min_t(unsigned long, iov_iter_count(iter), -+ PAGE_SIZE * WRITE_BATCH_PAGES - offset); -+again: -+ /* -+ * Bring in the user page that we will copy from _first_. -+ * Otherwise there's a nasty deadlock on copying from the -+ * same page as we're writing to, without it being marked -+ * up-to-date. -+ * -+ * Not only is this an optimisation, but it is also required -+ * to check that the address is actually valid, when atomic -+ * usercopies are used, below. -+ */ -+ if (unlikely(iov_iter_fault_in_readable(iter, bytes))) { -+ bytes = min_t(unsigned long, iov_iter_count(iter), -+ PAGE_SIZE - offset); -+ -+ if (unlikely(iov_iter_fault_in_readable(iter, bytes))) { -+ ret = -EFAULT; -+ break; -+ } -+ } -+ -+ if (unlikely(fatal_signal_pending(current))) { -+ ret = -EINTR; -+ break; -+ } -+ -+ ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); -+ if (unlikely(ret < 0)) -+ break; -+ -+ cond_resched(); -+ -+ if (unlikely(ret == 0)) { -+ /* -+ * If we were unable to copy any data at all, we must -+ * fall back to a single segment length write. -+ * -+ * If we didn't fallback here, we could livelock -+ * because not all segments in the iov can be copied at -+ * once without a pagefault. 
-+ */ -+ bytes = min_t(unsigned long, PAGE_SIZE - offset, -+ iov_iter_single_seg_count(iter)); -+ goto again; -+ } -+ pos += ret; -+ written += ret; -+ ret = 0; -+ -+ balance_dirty_pages_ratelimited(mapping); -+ } while (iov_iter_count(iter)); -+ -+ bch2_pagecache_add_put(&inode->ei_pagecache_lock); -+ -+ return written ? written : ret; -+} -+ -+/* O_DIRECT reads */ -+ -+static void bch2_dio_read_complete(struct closure *cl) -+{ -+ struct dio_read *dio = container_of(cl, struct dio_read, cl); -+ -+ dio->req->ki_complete(dio->req, dio->ret, 0); -+ bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */ -+} -+ -+static void bch2_direct_IO_read_endio(struct bio *bio) -+{ -+ struct dio_read *dio = bio->bi_private; -+ -+ if (bio->bi_status) -+ dio->ret = blk_status_to_errno(bio->bi_status); -+ -+ closure_put(&dio->cl); -+} -+ -+static void bch2_direct_IO_read_split_endio(struct bio *bio) -+{ -+ bch2_direct_IO_read_endio(bio); -+ bio_check_pages_dirty(bio); /* transfers ownership */ -+} -+ -+static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) -+{ -+ struct file *file = req->ki_filp; -+ struct bch_inode_info *inode = file_bch_inode(file); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch_io_opts opts = io_opts(c, &inode->ei_inode); -+ struct dio_read *dio; -+ struct bio *bio; -+ loff_t offset = req->ki_pos; -+ bool sync = is_sync_kiocb(req); -+ size_t shorten; -+ ssize_t ret; -+ -+ if ((offset|iter->count) & (block_bytes(c) - 1)) -+ return -EINVAL; -+ -+ ret = min_t(loff_t, iter->count, -+ max_t(loff_t, 0, i_size_read(&inode->v) - offset)); -+ -+ if (!ret) -+ return ret; -+ -+ shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c)); -+ iter->count -= shorten; -+ -+ bio = bio_alloc_bioset(GFP_KERNEL, -+ iov_iter_npages(iter, BIO_MAX_PAGES), -+ &c->dio_read_bioset); -+ -+ bio->bi_end_io = bch2_direct_IO_read_endio; -+ -+ dio = container_of(bio, struct dio_read, rbio.bio); -+ closure_init(&dio->cl, NULL); -+ -+ /* -+ * this is 
a _really_ horrible hack just to avoid an atomic sub at the -+ * end: -+ */ -+ if (!sync) { -+ set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL); -+ atomic_set(&dio->cl.remaining, -+ CLOSURE_REMAINING_INITIALIZER - -+ CLOSURE_RUNNING + -+ CLOSURE_DESTRUCTOR); -+ } else { -+ atomic_set(&dio->cl.remaining, -+ CLOSURE_REMAINING_INITIALIZER + 1); -+ } -+ -+ dio->req = req; -+ dio->ret = ret; -+ -+ goto start; -+ while (iter->count) { -+ bio = bio_alloc_bioset(GFP_KERNEL, -+ iov_iter_npages(iter, BIO_MAX_PAGES), -+ &c->bio_read); -+ bio->bi_end_io = bch2_direct_IO_read_split_endio; -+start: -+ bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC); -+ bio->bi_iter.bi_sector = offset >> 9; -+ bio->bi_private = dio; -+ -+ ret = bio_iov_iter_get_pages(bio, iter); -+ if (ret < 0) { -+ /* XXX: fault inject this path */ -+ bio->bi_status = BLK_STS_RESOURCE; -+ bio_endio(bio); -+ break; -+ } -+ -+ offset += bio->bi_iter.bi_size; -+ bio_set_pages_dirty(bio); -+ -+ if (iter->count) -+ closure_get(&dio->cl); -+ -+ bch2_read(c, rbio_init(bio, opts), inode->v.i_ino); -+ } -+ -+ iter->count += shorten; -+ -+ if (sync) { -+ closure_sync(&dio->cl); -+ closure_debug_destroy(&dio->cl); -+ ret = dio->ret; -+ bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */ -+ return ret; -+ } else { -+ return -EIOCBQUEUED; -+ } -+} -+ -+ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) -+{ -+ struct file *file = iocb->ki_filp; -+ struct bch_inode_info *inode = file_bch_inode(file); -+ struct address_space *mapping = file->f_mapping; -+ size_t count = iov_iter_count(iter); -+ ssize_t ret; -+ -+ if (!count) -+ return 0; /* skip atime */ -+ -+ if (iocb->ki_flags & IOCB_DIRECT) { -+ struct blk_plug plug; -+ -+ ret = filemap_write_and_wait_range(mapping, -+ iocb->ki_pos, -+ iocb->ki_pos + count - 1); -+ if (ret < 0) -+ return ret; -+ -+ file_accessed(file); -+ -+ blk_start_plug(&plug); -+ ret = bch2_direct_IO_read(iocb, iter); -+ blk_finish_plug(&plug); -+ -+ if (ret >= 0) -+ 
iocb->ki_pos += ret; -+ } else { -+ bch2_pagecache_add_get(&inode->ei_pagecache_lock); -+ ret = generic_file_read_iter(iocb, iter); -+ bch2_pagecache_add_put(&inode->ei_pagecache_lock); -+ } -+ -+ return ret; -+} -+ -+/* O_DIRECT writes */ -+ -+static void bch2_dio_write_loop_async(struct bch_write_op *); -+ -+static long bch2_dio_write_loop(struct dio_write *dio) -+{ -+ bool kthread = (current->flags & PF_KTHREAD) != 0; -+ struct kiocb *req = dio->req; -+ struct address_space *mapping = req->ki_filp->f_mapping; -+ struct bch_inode_info *inode = file_bch_inode(req->ki_filp); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bio *bio = &dio->op.wbio.bio; -+ struct bvec_iter_all iter; -+ struct bio_vec *bv; -+ unsigned unaligned; -+ bool sync = dio->sync; -+ long ret; -+ -+ if (dio->loop) -+ goto loop; -+ -+ while (1) { -+ if (kthread) -+ kthread_use_mm(dio->mm); -+ BUG_ON(current->faults_disabled_mapping); -+ current->faults_disabled_mapping = mapping; -+ -+ ret = bio_iov_iter_get_pages(bio, &dio->iter); -+ -+ current->faults_disabled_mapping = NULL; -+ if (kthread) -+ kthread_unuse_mm(dio->mm); -+ -+ if (unlikely(ret < 0)) -+ goto err; -+ -+ unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1); -+ bio->bi_iter.bi_size -= unaligned; -+ iov_iter_revert(&dio->iter, unaligned); -+ -+ if (!bio->bi_iter.bi_size) { -+ /* -+ * bio_iov_iter_get_pages was only able to get < -+ * blocksize worth of pages: -+ */ -+ bio_for_each_segment_all(bv, bio, iter) -+ put_page(bv->bv_page); -+ ret = -EFAULT; -+ goto err; -+ } -+ -+ bch2_write_op_init(&dio->op, c, io_opts(c, &inode->ei_inode)); -+ dio->op.end_io = bch2_dio_write_loop_async; -+ dio->op.target = dio->op.opts.foreground_target; -+ op_journal_seq_set(&dio->op, &inode->ei_journal_seq); -+ dio->op.write_point = writepoint_hashed((unsigned long) current); -+ dio->op.nr_replicas = dio->op.opts.data_replicas; -+ dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); -+ -+ if ((req->ki_flags & IOCB_DSYNC) && -+ 
!c->opts.journal_flush_disabled) -+ dio->op.flags |= BCH_WRITE_FLUSH; -+ -+ ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), -+ dio->op.opts.data_replicas, 0); -+ if (unlikely(ret) && -+ !bch2_check_range_allocated(c, dio->op.pos, -+ bio_sectors(bio), dio->op.opts.data_replicas)) -+ goto err; -+ -+ task_io_account_write(bio->bi_iter.bi_size); -+ -+ if (!dio->sync && !dio->loop && dio->iter.count) { -+ struct iovec *iov = dio->inline_vecs; -+ -+ if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { -+ iov = kmalloc(dio->iter.nr_segs * sizeof(*iov), -+ GFP_KERNEL); -+ if (unlikely(!iov)) { -+ dio->sync = sync = true; -+ goto do_io; -+ } -+ -+ dio->free_iov = true; -+ } -+ -+ memcpy(iov, dio->iter.iov, dio->iter.nr_segs * sizeof(*iov)); -+ dio->iter.iov = iov; -+ } -+do_io: -+ dio->loop = true; -+ closure_call(&dio->op.cl, bch2_write, NULL, NULL); -+ -+ if (sync) -+ wait_for_completion(&dio->done); -+ else -+ return -EIOCBQUEUED; -+loop: -+ i_sectors_acct(c, inode, &dio->quota_res, -+ dio->op.i_sectors_delta); -+ req->ki_pos += (u64) dio->op.written << 9; -+ dio->written += dio->op.written; -+ -+ spin_lock(&inode->v.i_lock); -+ if (req->ki_pos > inode->v.i_size) -+ i_size_write(&inode->v, req->ki_pos); -+ spin_unlock(&inode->v.i_lock); -+ -+ bio_for_each_segment_all(bv, bio, iter) -+ put_page(bv->bv_page); -+ if (!dio->iter.count || dio->op.error) -+ break; -+ -+ bio_reset(bio); -+ reinit_completion(&dio->done); -+ } -+ -+ ret = dio->op.error ?: ((long) dio->written << 9); -+err: -+ bch2_pagecache_block_put(&inode->ei_pagecache_lock); -+ bch2_quota_reservation_put(c, inode, &dio->quota_res); -+ -+ if (dio->free_iov) -+ kfree(dio->iter.iov); -+ -+ bio_put(bio); -+ -+ /* inode->i_dio_count is our ref on inode and thus bch_fs */ -+ inode_dio_end(&inode->v); -+ -+ if (!sync) { -+ req->ki_complete(req, ret, 0); -+ ret = -EIOCBQUEUED; -+ } -+ return ret; -+} -+ -+static void bch2_dio_write_loop_async(struct bch_write_op *op) -+{ -+ struct dio_write *dio 
= container_of(op, struct dio_write, op); -+ -+ if (dio->sync) -+ complete(&dio->done); -+ else -+ bch2_dio_write_loop(dio); -+} -+ -+static noinline -+ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) -+{ -+ struct file *file = req->ki_filp; -+ struct address_space *mapping = file->f_mapping; -+ struct bch_inode_info *inode = file_bch_inode(file); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct dio_write *dio; -+ struct bio *bio; -+ bool locked = true, extending; -+ ssize_t ret; -+ -+ prefetch(&c->opts); -+ prefetch((void *) &c->opts + 64); -+ prefetch(&inode->ei_inode); -+ prefetch((void *) &inode->ei_inode + 64); -+ -+ inode_lock(&inode->v); -+ -+ ret = generic_write_checks(req, iter); -+ if (unlikely(ret <= 0)) -+ goto err; -+ -+ ret = file_remove_privs(file); -+ if (unlikely(ret)) -+ goto err; -+ -+ ret = file_update_time(file); -+ if (unlikely(ret)) -+ goto err; -+ -+ if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) -+ goto err; -+ -+ inode_dio_begin(&inode->v); -+ bch2_pagecache_block_get(&inode->ei_pagecache_lock); -+ -+ extending = req->ki_pos + iter->count > inode->v.i_size; -+ if (!extending) { -+ inode_unlock(&inode->v); -+ locked = false; -+ } -+ -+ bio = bio_alloc_bioset(GFP_KERNEL, -+ iov_iter_npages(iter, BIO_MAX_PAGES), -+ &c->dio_write_bioset); -+ dio = container_of(bio, struct dio_write, op.wbio.bio); -+ init_completion(&dio->done); -+ dio->req = req; -+ dio->mm = current->mm; -+ dio->loop = false; -+ dio->sync = is_sync_kiocb(req) || extending; -+ dio->free_iov = false; -+ dio->quota_res.sectors = 0; -+ dio->written = 0; -+ dio->iter = *iter; -+ -+ ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, -+ iter->count >> 9, true); -+ if (unlikely(ret)) -+ goto err_put_bio; -+ -+ ret = write_invalidate_inode_pages_range(mapping, -+ req->ki_pos, -+ req->ki_pos + iter->count - 1); -+ if (unlikely(ret)) -+ goto err_put_bio; -+ -+ ret = bch2_dio_write_loop(dio); -+err: -+ if (locked) -+ 
inode_unlock(&inode->v); -+ return ret; -+err_put_bio: -+ bch2_pagecache_block_put(&inode->ei_pagecache_lock); -+ bch2_quota_reservation_put(c, inode, &dio->quota_res); -+ bio_put(bio); -+ inode_dio_end(&inode->v); -+ goto err; -+} -+ -+ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) -+{ -+ struct file *file = iocb->ki_filp; -+ struct bch_inode_info *inode = file_bch_inode(file); -+ ssize_t ret; -+ -+ if (iocb->ki_flags & IOCB_DIRECT) -+ return bch2_direct_write(iocb, from); -+ -+ /* We can write back this queue in page reclaim */ -+ current->backing_dev_info = inode_to_bdi(&inode->v); -+ inode_lock(&inode->v); -+ -+ ret = generic_write_checks(iocb, from); -+ if (ret <= 0) -+ goto unlock; -+ -+ ret = file_remove_privs(file); -+ if (ret) -+ goto unlock; -+ -+ ret = file_update_time(file); -+ if (ret) -+ goto unlock; -+ -+ ret = bch2_buffered_write(iocb, from); -+ if (likely(ret > 0)) -+ iocb->ki_pos += ret; -+unlock: -+ inode_unlock(&inode->v); -+ current->backing_dev_info = NULL; -+ -+ if (ret > 0) -+ ret = generic_write_sync(iocb, ret); -+ -+ return ret; -+} -+ -+/* fsync: */ -+ -+int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) -+{ -+ struct bch_inode_info *inode = file_bch_inode(file); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ int ret, ret2; -+ -+ ret = file_write_and_wait_range(file, start, end); -+ if (ret) -+ return ret; -+ -+ if (datasync && !(inode->v.i_state & I_DIRTY_DATASYNC)) -+ goto out; -+ -+ ret = sync_inode_metadata(&inode->v, 1); -+ if (ret) -+ return ret; -+out: -+ if (!c->opts.journal_flush_disabled) -+ ret = bch2_journal_flush_seq(&c->journal, -+ inode->ei_journal_seq); -+ ret2 = file_check_and_advance_wb_err(file); -+ -+ return ret ?: ret2; -+} -+ -+/* truncate: */ -+ -+static inline int range_has_data(struct bch_fs *c, -+ struct bpos start, -+ struct bpos end) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, 
0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, start, 0, k, ret) { -+ if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) -+ break; -+ -+ if (bkey_extent_is_data(k.k)) { -+ ret = 1; -+ break; -+ } -+ } -+ -+ return bch2_trans_exit(&trans) ?: ret; -+} -+ -+static int __bch2_truncate_page(struct bch_inode_info *inode, -+ pgoff_t index, loff_t start, loff_t end) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct address_space *mapping = inode->v.i_mapping; -+ struct bch_page_state *s; -+ unsigned start_offset = start & (PAGE_SIZE - 1); -+ unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1; -+ unsigned i; -+ struct page *page; -+ int ret = 0; -+ -+ /* Page boundary? Nothing to do */ -+ if (!((index == start >> PAGE_SHIFT && start_offset) || -+ (index == end >> PAGE_SHIFT && end_offset != PAGE_SIZE))) -+ return 0; -+ -+ /* Above i_size? */ -+ if (index << PAGE_SHIFT >= inode->v.i_size) -+ return 0; -+ -+ page = find_lock_page(mapping, index); -+ if (!page) { -+ /* -+ * XXX: we're doing two index lookups when we end up reading the -+ * page -+ */ -+ ret = range_has_data(c, -+ POS(inode->v.i_ino, index << PAGE_SECTOR_SHIFT), -+ POS(inode->v.i_ino, (index + 1) << PAGE_SECTOR_SHIFT)); -+ if (ret <= 0) -+ return ret; -+ -+ page = find_or_create_page(mapping, index, GFP_KERNEL); -+ if (unlikely(!page)) { -+ ret = -ENOMEM; -+ goto out; -+ } -+ } -+ -+ s = bch2_page_state_create(page, 0); -+ if (!s) { -+ ret = -ENOMEM; -+ goto unlock; -+ } -+ -+ if (!PageUptodate(page)) { -+ ret = bch2_read_single_page(page, mapping); -+ if (ret) -+ goto unlock; -+ } -+ -+ if (index != start >> PAGE_SHIFT) -+ start_offset = 0; -+ if (index != end >> PAGE_SHIFT) -+ end_offset = PAGE_SIZE; -+ -+ for (i = round_up(start_offset, block_bytes(c)) >> 9; -+ i < round_down(end_offset, block_bytes(c)) >> 9; -+ i++) { -+ s->s[i].nr_replicas = 0; -+ s->s[i].state = SECTOR_UNALLOCATED; -+ } -+ -+ zero_user_segment(page, start_offset, end_offset); -+ -+ /* -+ * Bit of a hack - we 
don't want truncate to fail due to -ENOSPC. -+ * -+ * XXX: because we aren't currently tracking whether the page has actual -+ * data in it (vs. just 0s, or only partially written) this wrong. ick. -+ */ -+ ret = bch2_get_page_disk_reservation(c, inode, page, false); -+ BUG_ON(ret); -+ -+ /* -+ * This removes any writeable userspace mappings; we need to force -+ * .page_mkwrite to be called again before any mmapped writes, to -+ * redirty the full page: -+ */ -+ page_mkclean(page); -+ __set_page_dirty_nobuffers(page); -+unlock: -+ unlock_page(page); -+ put_page(page); -+out: -+ return ret; -+} -+ -+static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from) -+{ -+ return __bch2_truncate_page(inode, from >> PAGE_SHIFT, -+ from, round_up(from, PAGE_SIZE)); -+} -+ -+static int bch2_extend(struct bch_inode_info *inode, -+ struct bch_inode_unpacked *inode_u, -+ struct iattr *iattr) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct address_space *mapping = inode->v.i_mapping; -+ int ret; -+ -+ /* -+ * sync appends: -+ * -+ * this has to be done _before_ extending i_size: -+ */ -+ ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX); -+ if (ret) -+ return ret; -+ -+ truncate_setsize(&inode->v, iattr->ia_size); -+ setattr_copy(&inode->v, iattr); -+ -+ mutex_lock(&inode->ei_update_lock); -+ ret = bch2_write_inode_size(c, inode, inode->v.i_size, -+ ATTR_MTIME|ATTR_CTIME); -+ mutex_unlock(&inode->ei_update_lock); -+ -+ return ret; -+} -+ -+static int bch2_truncate_finish_fn(struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, -+ void *p) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ -+ bi->bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; -+ bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); -+ return 0; -+} -+ -+static int bch2_truncate_start_fn(struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, void *p) -+{ -+ u64 *new_i_size = p; -+ -+ bi->bi_flags |= BCH_INODE_I_SIZE_DIRTY; -+ bi->bi_size = *new_i_size; -+ 
return 0; -+} -+ -+int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct address_space *mapping = inode->v.i_mapping; -+ struct bch_inode_unpacked inode_u; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ u64 new_i_size = iattr->ia_size; -+ s64 i_sectors_delta = 0; -+ int ret = 0; -+ -+ inode_dio_wait(&inode->v); -+ bch2_pagecache_block_get(&inode->ei_pagecache_lock); -+ -+ /* -+ * fetch current on disk i_size: inode is locked, i_size can only -+ * increase underneath us: -+ */ -+ bch2_trans_init(&trans, c, 0, 0); -+ iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, 0); -+ ret = PTR_ERR_OR_ZERO(iter); -+ bch2_trans_exit(&trans); -+ -+ if (ret) -+ goto err; -+ -+ /* -+ * check this before next assertion; on filesystem error our normal -+ * invariants are a bit broken (truncate has to truncate the page cache -+ * before the inode). -+ */ -+ ret = bch2_journal_error(&c->journal); -+ if (ret) -+ goto err; -+ -+ BUG_ON(inode->v.i_size < inode_u.bi_size); -+ -+ if (iattr->ia_size > inode->v.i_size) { -+ ret = bch2_extend(inode, &inode_u, iattr); -+ goto err; -+ } -+ -+ ret = bch2_truncate_page(inode, iattr->ia_size); -+ if (unlikely(ret)) -+ goto err; -+ -+ /* -+ * When extending, we're going to write the new i_size to disk -+ * immediately so we need to flush anything above the current on disk -+ * i_size first: -+ * -+ * Also, when extending we need to flush the page that i_size currently -+ * straddles - if it's mapped to userspace, we need to ensure that -+ * userspace has to redirty it and call .mkwrite -> set_page_dirty -+ * again to allocate the part of the page that was extended. 
-+ */ -+ if (iattr->ia_size > inode_u.bi_size) -+ ret = filemap_write_and_wait_range(mapping, -+ inode_u.bi_size, -+ iattr->ia_size - 1); -+ else if (iattr->ia_size & (PAGE_SIZE - 1)) -+ ret = filemap_write_and_wait_range(mapping, -+ round_down(iattr->ia_size, PAGE_SIZE), -+ iattr->ia_size - 1); -+ if (ret) -+ goto err; -+ -+ mutex_lock(&inode->ei_update_lock); -+ ret = bch2_write_inode(c, inode, bch2_truncate_start_fn, -+ &new_i_size, 0); -+ mutex_unlock(&inode->ei_update_lock); -+ -+ if (unlikely(ret)) -+ goto err; -+ -+ truncate_setsize(&inode->v, iattr->ia_size); -+ -+ ret = bch2_fpunch(c, inode->v.i_ino, -+ round_up(iattr->ia_size, block_bytes(c)) >> 9, -+ U64_MAX, &inode->ei_journal_seq, &i_sectors_delta); -+ i_sectors_acct(c, inode, NULL, i_sectors_delta); -+ -+ if (unlikely(ret)) -+ goto err; -+ -+ setattr_copy(&inode->v, iattr); -+ -+ mutex_lock(&inode->ei_update_lock); -+ ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL, -+ ATTR_MTIME|ATTR_CTIME); -+ mutex_unlock(&inode->ei_update_lock); -+err: -+ bch2_pagecache_block_put(&inode->ei_pagecache_lock); -+ return ret; -+} -+ -+/* fallocate: */ -+ -+static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ u64 discard_start = round_up(offset, block_bytes(c)) >> 9; -+ u64 discard_end = round_down(offset + len, block_bytes(c)) >> 9; -+ int ret = 0; -+ -+ inode_lock(&inode->v); -+ inode_dio_wait(&inode->v); -+ bch2_pagecache_block_get(&inode->ei_pagecache_lock); -+ -+ ret = __bch2_truncate_page(inode, -+ offset >> PAGE_SHIFT, -+ offset, offset + len); -+ if (unlikely(ret)) -+ goto err; -+ -+ if (offset >> PAGE_SHIFT != -+ (offset + len) >> PAGE_SHIFT) { -+ ret = __bch2_truncate_page(inode, -+ (offset + len) >> PAGE_SHIFT, -+ offset, offset + len); -+ if (unlikely(ret)) -+ goto err; -+ } -+ -+ truncate_pagecache_range(&inode->v, offset, offset + len - 1); -+ -+ if (discard_start < discard_end) { -+ s64 i_sectors_delta = 0; 
-+ -+ ret = bch2_fpunch(c, inode->v.i_ino, -+ discard_start, discard_end, -+ &inode->ei_journal_seq, -+ &i_sectors_delta); -+ i_sectors_acct(c, inode, NULL, i_sectors_delta); -+ } -+err: -+ bch2_pagecache_block_put(&inode->ei_pagecache_lock); -+ inode_unlock(&inode->v); -+ -+ return ret; -+} -+ -+static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, -+ loff_t offset, loff_t len, -+ bool insert) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct address_space *mapping = inode->v.i_mapping; -+ struct bkey_on_stack copy; -+ struct btree_trans trans; -+ struct btree_iter *src, *dst; -+ loff_t shift, new_size; -+ u64 src_start; -+ int ret; -+ -+ if ((offset | len) & (block_bytes(c) - 1)) -+ return -EINVAL; -+ -+ bkey_on_stack_init(©); -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256); -+ -+ /* -+ * We need i_mutex to keep the page cache consistent with the extents -+ * btree, and the btree consistent with i_size - we don't need outside -+ * locking for the extents btree itself, because we're using linked -+ * iterators -+ */ -+ inode_lock(&inode->v); -+ inode_dio_wait(&inode->v); -+ bch2_pagecache_block_get(&inode->ei_pagecache_lock); -+ -+ if (insert) { -+ ret = -EFBIG; -+ if (inode->v.i_sb->s_maxbytes - inode->v.i_size < len) -+ goto err; -+ -+ ret = -EINVAL; -+ if (offset >= inode->v.i_size) -+ goto err; -+ -+ src_start = U64_MAX; -+ shift = len; -+ } else { -+ ret = -EINVAL; -+ if (offset + len >= inode->v.i_size) -+ goto err; -+ -+ src_start = offset + len; -+ shift = -len; -+ } -+ -+ new_size = inode->v.i_size + shift; -+ -+ ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); -+ if (ret) -+ goto err; -+ -+ if (insert) { -+ i_size_write(&inode->v, new_size); -+ mutex_lock(&inode->ei_update_lock); -+ ret = bch2_write_inode_size(c, inode, new_size, -+ ATTR_MTIME|ATTR_CTIME); -+ mutex_unlock(&inode->ei_update_lock); -+ } else { -+ s64 i_sectors_delta = 0; -+ -+ ret = bch2_fpunch(c, inode->v.i_ino, -+ offset >> 9, (offset + 
len) >> 9, -+ &inode->ei_journal_seq, -+ &i_sectors_delta); -+ i_sectors_acct(c, inode, NULL, i_sectors_delta); -+ -+ if (ret) -+ goto err; -+ } -+ -+ src = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, -+ POS(inode->v.i_ino, src_start >> 9), -+ BTREE_ITER_INTENT); -+ BUG_ON(IS_ERR_OR_NULL(src)); -+ -+ dst = bch2_trans_copy_iter(&trans, src); -+ BUG_ON(IS_ERR_OR_NULL(dst)); -+ -+ while (1) { -+ struct disk_reservation disk_res = -+ bch2_disk_reservation_init(c, 0); -+ struct bkey_i delete; -+ struct bkey_s_c k; -+ struct bpos next_pos; -+ struct bpos move_pos = POS(inode->v.i_ino, offset >> 9); -+ struct bpos atomic_end; -+ unsigned trigger_flags = 0; -+ -+ k = insert -+ ? bch2_btree_iter_peek_prev(src) -+ : bch2_btree_iter_peek(src); -+ if ((ret = bkey_err(k))) -+ goto bkey_err; -+ -+ if (!k.k || k.k->p.inode != inode->v.i_ino) -+ break; -+ -+ BUG_ON(bkey_cmp(src->pos, bkey_start_pos(k.k))); -+ -+ if (insert && -+ bkey_cmp(k.k->p, POS(inode->v.i_ino, offset >> 9)) <= 0) -+ break; -+reassemble: -+ bkey_on_stack_reassemble(©, c, k); -+ -+ if (insert && -+ bkey_cmp(bkey_start_pos(k.k), move_pos) < 0) -+ bch2_cut_front(move_pos, copy.k); -+ -+ copy.k->k.p.offset += shift >> 9; -+ bch2_btree_iter_set_pos(dst, bkey_start_pos(©.k->k)); -+ -+ ret = bch2_extent_atomic_end(dst, copy.k, &atomic_end); -+ if (ret) -+ goto bkey_err; -+ -+ if (bkey_cmp(atomic_end, copy.k->k.p)) { -+ if (insert) { -+ move_pos = atomic_end; -+ move_pos.offset -= shift >> 9; -+ goto reassemble; -+ } else { -+ bch2_cut_back(atomic_end, copy.k); -+ } -+ } -+ -+ bkey_init(&delete.k); -+ delete.k.p = copy.k->k.p; -+ delete.k.size = copy.k->k.size; -+ delete.k.p.offset -= shift >> 9; -+ -+ next_pos = insert ? 
bkey_start_pos(&delete.k) : delete.k.p; -+ -+ if (copy.k->k.size == k.k->size) { -+ /* -+ * If we're moving the entire extent, we can skip -+ * running triggers: -+ */ -+ trigger_flags |= BTREE_TRIGGER_NORUN; -+ } else { -+ /* We might end up splitting compressed extents: */ -+ unsigned nr_ptrs = -+ bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy.k)); -+ -+ ret = bch2_disk_reservation_get(c, &disk_res, -+ copy.k->k.size, nr_ptrs, -+ BCH_DISK_RESERVATION_NOFAIL); -+ BUG_ON(ret); -+ } -+ -+ bch2_btree_iter_set_pos(src, bkey_start_pos(&delete.k)); -+ -+ ret = bch2_trans_update(&trans, src, &delete, trigger_flags) ?: -+ bch2_trans_update(&trans, dst, copy.k, trigger_flags) ?: -+ bch2_trans_commit(&trans, &disk_res, -+ &inode->ei_journal_seq, -+ BTREE_INSERT_NOFAIL); -+ bch2_disk_reservation_put(c, &disk_res); -+bkey_err: -+ if (!ret) -+ bch2_btree_iter_set_pos(src, next_pos); -+ -+ if (ret == -EINTR) -+ ret = 0; -+ if (ret) -+ goto err; -+ -+ bch2_trans_cond_resched(&trans); -+ } -+ bch2_trans_unlock(&trans); -+ -+ if (!insert) { -+ i_size_write(&inode->v, new_size); -+ mutex_lock(&inode->ei_update_lock); -+ ret = bch2_write_inode_size(c, inode, new_size, -+ ATTR_MTIME|ATTR_CTIME); -+ mutex_unlock(&inode->ei_update_lock); -+ } -+err: -+ bch2_trans_exit(&trans); -+ bkey_on_stack_exit(©, c); -+ bch2_pagecache_block_put(&inode->ei_pagecache_lock); -+ inode_unlock(&inode->v); -+ return ret; -+} -+ -+static long bchfs_fallocate(struct bch_inode_info *inode, int mode, -+ loff_t offset, loff_t len) -+{ -+ struct address_space *mapping = inode->v.i_mapping; -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bpos end_pos; -+ loff_t end = offset + len; -+ loff_t block_start = round_down(offset, block_bytes(c)); -+ loff_t block_end = round_up(end, block_bytes(c)); -+ unsigned sectors; -+ unsigned replicas = io_opts(c, &inode->ei_inode).data_replicas; -+ int ret; -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ 
-+ inode_lock(&inode->v); -+ inode_dio_wait(&inode->v); -+ bch2_pagecache_block_get(&inode->ei_pagecache_lock); -+ -+ if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) { -+ ret = inode_newsize_ok(&inode->v, end); -+ if (ret) -+ goto err; -+ } -+ -+ if (mode & FALLOC_FL_ZERO_RANGE) { -+ ret = __bch2_truncate_page(inode, -+ offset >> PAGE_SHIFT, -+ offset, end); -+ -+ if (!ret && -+ offset >> PAGE_SHIFT != end >> PAGE_SHIFT) -+ ret = __bch2_truncate_page(inode, -+ end >> PAGE_SHIFT, -+ offset, end); -+ -+ if (unlikely(ret)) -+ goto err; -+ -+ truncate_pagecache_range(&inode->v, offset, end - 1); -+ } -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, -+ POS(inode->v.i_ino, block_start >> 9), -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); -+ end_pos = POS(inode->v.i_ino, block_end >> 9); -+ -+ while (bkey_cmp(iter->pos, end_pos) < 0) { -+ s64 i_sectors_delta = 0; -+ struct disk_reservation disk_res = { 0 }; -+ struct quota_res quota_res = { 0 }; -+ struct bkey_i_reservation reservation; -+ struct bkey_s_c k; -+ -+ bch2_trans_begin(&trans); -+ -+ k = bch2_btree_iter_peek_slot(iter); -+ if ((ret = bkey_err(k))) -+ goto bkey_err; -+ -+ /* already reserved */ -+ if (k.k->type == KEY_TYPE_reservation && -+ bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) { -+ bch2_btree_iter_next_slot(iter); -+ continue; -+ } -+ -+ if (bkey_extent_is_data(k.k) && -+ !(mode & FALLOC_FL_ZERO_RANGE)) { -+ bch2_btree_iter_next_slot(iter); -+ continue; -+ } -+ -+ bkey_reservation_init(&reservation.k_i); -+ reservation.k.type = KEY_TYPE_reservation; -+ reservation.k.p = k.k->p; -+ reservation.k.size = k.k->size; -+ -+ bch2_cut_front(iter->pos, &reservation.k_i); -+ bch2_cut_back(end_pos, &reservation.k_i); -+ -+ sectors = reservation.k.size; -+ reservation.v.nr_replicas = bch2_bkey_nr_ptrs_allocated(k); -+ -+ if (!bkey_extent_is_allocation(k.k)) { -+ ret = bch2_quota_reservation_add(c, inode, -+ "a_res, -+ sectors, true); -+ if (unlikely(ret)) -+ goto bkey_err; -+ } -+ -+ if 
(reservation.v.nr_replicas < replicas || -+ bch2_bkey_sectors_compressed(k)) { -+ ret = bch2_disk_reservation_get(c, &disk_res, sectors, -+ replicas, 0); -+ if (unlikely(ret)) -+ goto bkey_err; -+ -+ reservation.v.nr_replicas = disk_res.nr_replicas; -+ } -+ -+ ret = bch2_extent_update(&trans, iter, &reservation.k_i, -+ &disk_res, &inode->ei_journal_seq, -+ 0, &i_sectors_delta); -+ i_sectors_acct(c, inode, "a_res, i_sectors_delta); -+bkey_err: -+ bch2_quota_reservation_put(c, inode, "a_res); -+ bch2_disk_reservation_put(c, &disk_res); -+ if (ret == -EINTR) -+ ret = 0; -+ if (ret) -+ goto err; -+ } -+ -+ /* -+ * Do we need to extend the file? -+ * -+ * If we zeroed up to the end of the file, we dropped whatever writes -+ * were going to write out the current i_size, so we have to extend -+ * manually even if FL_KEEP_SIZE was set: -+ */ -+ if (end >= inode->v.i_size && -+ (!(mode & FALLOC_FL_KEEP_SIZE) || -+ (mode & FALLOC_FL_ZERO_RANGE))) { -+ struct btree_iter *inode_iter; -+ struct bch_inode_unpacked inode_u; -+ -+ do { -+ bch2_trans_begin(&trans); -+ inode_iter = bch2_inode_peek(&trans, &inode_u, -+ inode->v.i_ino, 0); -+ ret = PTR_ERR_OR_ZERO(inode_iter); -+ } while (ret == -EINTR); -+ -+ bch2_trans_unlock(&trans); -+ -+ if (ret) -+ goto err; -+ -+ /* -+ * Sync existing appends before extending i_size, -+ * as in bch2_extend(): -+ */ -+ ret = filemap_write_and_wait_range(mapping, -+ inode_u.bi_size, S64_MAX); -+ if (ret) -+ goto err; -+ -+ if (mode & FALLOC_FL_KEEP_SIZE) -+ end = inode->v.i_size; -+ else -+ i_size_write(&inode->v, end); -+ -+ mutex_lock(&inode->ei_update_lock); -+ ret = bch2_write_inode_size(c, inode, end, 0); -+ mutex_unlock(&inode->ei_update_lock); -+ } -+err: -+ bch2_trans_exit(&trans); -+ bch2_pagecache_block_put(&inode->ei_pagecache_lock); -+ inode_unlock(&inode->v); -+ return ret; -+} -+ -+long bch2_fallocate_dispatch(struct file *file, int mode, -+ loff_t offset, loff_t len) -+{ -+ struct bch_inode_info *inode = file_bch_inode(file); -+ 
struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ long ret; -+ -+ if (!percpu_ref_tryget(&c->writes)) -+ return -EROFS; -+ -+ if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) -+ ret = bchfs_fallocate(inode, mode, offset, len); -+ else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) -+ ret = bchfs_fpunch(inode, offset, len); -+ else if (mode == FALLOC_FL_INSERT_RANGE) -+ ret = bchfs_fcollapse_finsert(inode, offset, len, true); -+ else if (mode == FALLOC_FL_COLLAPSE_RANGE) -+ ret = bchfs_fcollapse_finsert(inode, offset, len, false); -+ else -+ ret = -EOPNOTSUPP; -+ -+ percpu_ref_put(&c->writes); -+ -+ return ret; -+} -+ -+static void mark_range_unallocated(struct bch_inode_info *inode, -+ loff_t start, loff_t end) -+{ -+ pgoff_t index = start >> PAGE_SHIFT; -+ pgoff_t end_index = (end - 1) >> PAGE_SHIFT; -+ struct pagevec pvec; -+ -+ pagevec_init(&pvec); -+ -+ do { -+ unsigned nr_pages, i, j; -+ -+ nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping, -+ &index, end_index); -+ if (nr_pages == 0) -+ break; -+ -+ for (i = 0; i < nr_pages; i++) { -+ struct page *page = pvec.pages[i]; -+ struct bch_page_state *s; -+ -+ lock_page(page); -+ s = bch2_page_state(page); -+ -+ if (s) { -+ spin_lock(&s->lock); -+ for (j = 0; j < PAGE_SECTORS; j++) -+ s->s[j].nr_replicas = 0; -+ spin_unlock(&s->lock); -+ } -+ -+ unlock_page(page); -+ } -+ pagevec_release(&pvec); -+ } while (index <= end_index); -+} -+ -+loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, -+ struct file *file_dst, loff_t pos_dst, -+ loff_t len, unsigned remap_flags) -+{ -+ struct bch_inode_info *src = file_bch_inode(file_src); -+ struct bch_inode_info *dst = file_bch_inode(file_dst); -+ struct bch_fs *c = src->v.i_sb->s_fs_info; -+ s64 i_sectors_delta = 0; -+ u64 aligned_len; -+ loff_t ret = 0; -+ -+ if (!c->opts.reflink) -+ return -EOPNOTSUPP; -+ -+ if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY)) -+ return -EINVAL; -+ -+ if (remap_flags & REMAP_FILE_DEDUP) -+ 
return -EOPNOTSUPP; -+ -+ if ((pos_src & (block_bytes(c) - 1)) || -+ (pos_dst & (block_bytes(c) - 1))) -+ return -EINVAL; -+ -+ if (src == dst && -+ abs(pos_src - pos_dst) < len) -+ return -EINVAL; -+ -+ bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); -+ -+ file_update_time(file_dst); -+ -+ inode_dio_wait(&src->v); -+ inode_dio_wait(&dst->v); -+ -+ ret = generic_remap_file_range_prep(file_src, pos_src, -+ file_dst, pos_dst, -+ &len, remap_flags); -+ if (ret < 0 || len == 0) -+ goto err; -+ -+ aligned_len = round_up((u64) len, block_bytes(c)); -+ -+ ret = write_invalidate_inode_pages_range(dst->v.i_mapping, -+ pos_dst, pos_dst + len - 1); -+ if (ret) -+ goto err; -+ -+ mark_range_unallocated(src, pos_src, pos_src + aligned_len); -+ -+ ret = bch2_remap_range(c, -+ POS(dst->v.i_ino, pos_dst >> 9), -+ POS(src->v.i_ino, pos_src >> 9), -+ aligned_len >> 9, -+ &dst->ei_journal_seq, -+ pos_dst + len, &i_sectors_delta); -+ if (ret < 0) -+ goto err; -+ -+ /* -+ * due to alignment, we might have remapped slightly more than requsted -+ */ -+ ret = min((u64) ret << 9, (u64) len); -+ -+ /* XXX get a quota reservation */ -+ i_sectors_acct(c, dst, NULL, i_sectors_delta); -+ -+ spin_lock(&dst->v.i_lock); -+ if (pos_dst + ret > dst->v.i_size) -+ i_size_write(&dst->v, pos_dst + ret); -+ spin_unlock(&dst->v.i_lock); -+err: -+ bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); -+ -+ return ret; -+} -+ -+/* fseek: */ -+ -+static int page_data_offset(struct page *page, unsigned offset) -+{ -+ struct bch_page_state *s = bch2_page_state(page); -+ unsigned i; -+ -+ if (s) -+ for (i = offset >> 9; i < PAGE_SECTORS; i++) -+ if (s->s[i].state >= SECTOR_DIRTY) -+ return i << 9; -+ -+ return -1; -+} -+ -+static loff_t bch2_seek_pagecache_data(struct inode *vinode, -+ loff_t start_offset, -+ loff_t end_offset) -+{ -+ struct address_space *mapping = vinode->i_mapping; -+ struct page *page; -+ pgoff_t start_index = start_offset >> PAGE_SHIFT; -+ pgoff_t end_index = 
end_offset >> PAGE_SHIFT; -+ pgoff_t index = start_index; -+ loff_t ret; -+ int offset; -+ -+ while (index <= end_index) { -+ if (find_get_pages_range(mapping, &index, end_index, 1, &page)) { -+ lock_page(page); -+ -+ offset = page_data_offset(page, -+ page->index == start_index -+ ? start_offset & (PAGE_SIZE - 1) -+ : 0); -+ if (offset >= 0) { -+ ret = clamp(((loff_t) page->index << PAGE_SHIFT) + -+ offset, -+ start_offset, end_offset); -+ unlock_page(page); -+ put_page(page); -+ return ret; -+ } -+ -+ unlock_page(page); -+ put_page(page); -+ } else { -+ break; -+ } -+ } -+ -+ return end_offset; -+} -+ -+static loff_t bch2_seek_data(struct file *file, u64 offset) -+{ -+ struct bch_inode_info *inode = file_bch_inode(file); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ u64 isize, next_data = MAX_LFS_FILESIZE; -+ int ret; -+ -+ isize = i_size_read(&inode->v); -+ if (offset >= isize) -+ return -ENXIO; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, -+ POS(inode->v.i_ino, offset >> 9), 0, k, ret) { -+ if (k.k->p.inode != inode->v.i_ino) { -+ break; -+ } else if (bkey_extent_is_data(k.k)) { -+ next_data = max(offset, bkey_start_offset(k.k) << 9); -+ break; -+ } else if (k.k->p.offset >> 9 > isize) -+ break; -+ } -+ -+ ret = bch2_trans_exit(&trans) ?: ret; -+ if (ret) -+ return ret; -+ -+ if (next_data > offset) -+ next_data = bch2_seek_pagecache_data(&inode->v, -+ offset, next_data); -+ -+ if (next_data >= isize) -+ return -ENXIO; -+ -+ return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); -+} -+ -+static int __page_hole_offset(struct page *page, unsigned offset) -+{ -+ struct bch_page_state *s = bch2_page_state(page); -+ unsigned i; -+ -+ if (!s) -+ return 0; -+ -+ for (i = offset >> 9; i < PAGE_SECTORS; i++) -+ if (s->s[i].state < SECTOR_DIRTY) -+ return i << 9; -+ -+ return -1; -+} -+ -+static loff_t page_hole_offset(struct 
address_space *mapping, loff_t offset) -+{ -+ pgoff_t index = offset >> PAGE_SHIFT; -+ struct page *page; -+ int pg_offset; -+ loff_t ret = -1; -+ -+ page = find_lock_entry(mapping, index); -+ if (!page || xa_is_value(page)) -+ return offset; -+ -+ pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1)); -+ if (pg_offset >= 0) -+ ret = ((loff_t) index << PAGE_SHIFT) + pg_offset; -+ -+ unlock_page(page); -+ -+ return ret; -+} -+ -+static loff_t bch2_seek_pagecache_hole(struct inode *vinode, -+ loff_t start_offset, -+ loff_t end_offset) -+{ -+ struct address_space *mapping = vinode->i_mapping; -+ loff_t offset = start_offset, hole; -+ -+ while (offset < end_offset) { -+ hole = page_hole_offset(mapping, offset); -+ if (hole >= 0 && hole <= end_offset) -+ return max(start_offset, hole); -+ -+ offset += PAGE_SIZE; -+ offset &= PAGE_MASK; -+ } -+ -+ return end_offset; -+} -+ -+static loff_t bch2_seek_hole(struct file *file, u64 offset) -+{ -+ struct bch_inode_info *inode = file_bch_inode(file); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ u64 isize, next_hole = MAX_LFS_FILESIZE; -+ int ret; -+ -+ isize = i_size_read(&inode->v); -+ if (offset >= isize) -+ return -ENXIO; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, -+ POS(inode->v.i_ino, offset >> 9), -+ BTREE_ITER_SLOTS, k, ret) { -+ if (k.k->p.inode != inode->v.i_ino) { -+ next_hole = bch2_seek_pagecache_hole(&inode->v, -+ offset, MAX_LFS_FILESIZE); -+ break; -+ } else if (!bkey_extent_is_data(k.k)) { -+ next_hole = bch2_seek_pagecache_hole(&inode->v, -+ max(offset, bkey_start_offset(k.k) << 9), -+ k.k->p.offset << 9); -+ -+ if (next_hole < k.k->p.offset << 9) -+ break; -+ } else { -+ offset = max(offset, bkey_start_offset(k.k) << 9); -+ } -+ } -+ -+ ret = bch2_trans_exit(&trans) ?: ret; -+ if (ret) -+ return ret; -+ -+ if (next_hole > isize) -+ next_hole = isize; -+ -+ return 
vfs_setpos(file, next_hole, MAX_LFS_FILESIZE); -+} -+ -+loff_t bch2_llseek(struct file *file, loff_t offset, int whence) -+{ -+ switch (whence) { -+ case SEEK_SET: -+ case SEEK_CUR: -+ case SEEK_END: -+ return generic_file_llseek(file, offset, whence); -+ case SEEK_DATA: -+ return bch2_seek_data(file, offset); -+ case SEEK_HOLE: -+ return bch2_seek_hole(file, offset); -+ } -+ -+ return -EINVAL; -+} -+ -+void bch2_fs_fsio_exit(struct bch_fs *c) -+{ -+ bioset_exit(&c->dio_write_bioset); -+ bioset_exit(&c->dio_read_bioset); -+ bioset_exit(&c->writepage_bioset); -+} -+ -+int bch2_fs_fsio_init(struct bch_fs *c) -+{ -+ int ret = 0; -+ -+ pr_verbose_init(c->opts, ""); -+ -+ if (bioset_init(&c->writepage_bioset, -+ 4, offsetof(struct bch_writepage_io, op.wbio.bio), -+ BIOSET_NEED_BVECS) || -+ bioset_init(&c->dio_read_bioset, -+ 4, offsetof(struct dio_read, rbio.bio), -+ BIOSET_NEED_BVECS) || -+ bioset_init(&c->dio_write_bioset, -+ 4, offsetof(struct dio_write, op.wbio.bio), -+ BIOSET_NEED_BVECS)) -+ ret = -ENOMEM; -+ -+ pr_verbose_init(c->opts, "ret %i", ret); -+ return ret; -+} -+ -+#endif /* NO_BCACHEFS_FS */ -diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h -new file mode 100644 -index 000000000000..7063556d289b ---- /dev/null -+++ b/fs/bcachefs/fs-io.h -@@ -0,0 +1,57 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_FS_IO_H -+#define _BCACHEFS_FS_IO_H -+ -+#ifndef NO_BCACHEFS_FS -+ -+#include "buckets.h" -+#include "io_types.h" -+ -+#include -+ -+struct quota_res; -+ -+int __must_check bch2_write_inode_size(struct bch_fs *, -+ struct bch_inode_info *, -+ loff_t, unsigned); -+ -+int bch2_writepage(struct page *, struct writeback_control *); -+int bch2_readpage(struct file *, struct page *); -+ -+int bch2_writepages(struct address_space *, struct writeback_control *); -+int bch2_readpages(struct file *, struct address_space *, -+ struct list_head *, unsigned); -+ -+int bch2_write_begin(struct file *, struct address_space *, loff_t, -+ unsigned, 
unsigned, struct page **, void **); -+int bch2_write_end(struct file *, struct address_space *, loff_t, -+ unsigned, unsigned, struct page *, void *); -+ -+ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *); -+ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); -+ -+int bch2_fsync(struct file *, loff_t, loff_t, int); -+ -+int bch2_truncate(struct bch_inode_info *, struct iattr *); -+long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t); -+ -+loff_t bch2_remap_file_range(struct file *, loff_t, struct file *, -+ loff_t, loff_t, unsigned); -+ -+loff_t bch2_llseek(struct file *, loff_t, int); -+ -+vm_fault_t bch2_page_fault(struct vm_fault *); -+vm_fault_t bch2_page_mkwrite(struct vm_fault *); -+void bch2_invalidatepage(struct page *, unsigned int, unsigned int); -+int bch2_releasepage(struct page *, gfp_t); -+int bch2_migrate_page(struct address_space *, struct page *, -+ struct page *, enum migrate_mode); -+ -+void bch2_fs_fsio_exit(struct bch_fs *); -+int bch2_fs_fsio_init(struct bch_fs *); -+#else -+static inline void bch2_fs_fsio_exit(struct bch_fs *c) {} -+static inline int bch2_fs_fsio_init(struct bch_fs *c) { return 0; } -+#endif -+ -+#endif /* _BCACHEFS_FS_IO_H */ -diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c -new file mode 100644 -index 000000000000..0873d2f0928c ---- /dev/null -+++ b/fs/bcachefs/fs-ioctl.c -@@ -0,0 +1,312 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#ifndef NO_BCACHEFS_FS -+ -+#include "bcachefs.h" -+#include "chardev.h" -+#include "dirent.h" -+#include "fs.h" -+#include "fs-common.h" -+#include "fs-ioctl.h" -+#include "quota.h" -+ -+#include -+#include -+ -+#define FS_IOC_GOINGDOWN _IOR('X', 125, __u32) -+ -+struct flags_set { -+ unsigned mask; -+ unsigned flags; -+ -+ unsigned projid; -+}; -+ -+static int bch2_inode_flags_set(struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, -+ void *p) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ /* -+ * We're relying on btree locking here 
for exclusion with other ioctl -+ * calls - use the flags in the btree (@bi), not inode->i_flags: -+ */ -+ struct flags_set *s = p; -+ unsigned newflags = s->flags; -+ unsigned oldflags = bi->bi_flags & s->mask; -+ -+ if (((newflags ^ oldflags) & (BCH_INODE_APPEND|BCH_INODE_IMMUTABLE)) && -+ !capable(CAP_LINUX_IMMUTABLE)) -+ return -EPERM; -+ -+ if (!S_ISREG(bi->bi_mode) && -+ !S_ISDIR(bi->bi_mode) && -+ (newflags & (BCH_INODE_NODUMP|BCH_INODE_NOATIME)) != newflags) -+ return -EINVAL; -+ -+ bi->bi_flags &= ~s->mask; -+ bi->bi_flags |= newflags; -+ -+ bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v)); -+ return 0; -+} -+ -+static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg) -+{ -+ unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags); -+ -+ return put_user(flags, arg); -+} -+ -+static int bch2_ioc_setflags(struct bch_fs *c, -+ struct file *file, -+ struct bch_inode_info *inode, -+ void __user *arg) -+{ -+ struct flags_set s = { .mask = map_defined(bch_flags_to_uflags) }; -+ unsigned uflags; -+ int ret; -+ -+ if (get_user(uflags, (int __user *) arg)) -+ return -EFAULT; -+ -+ s.flags = map_flags_rev(bch_flags_to_uflags, uflags); -+ if (uflags) -+ return -EOPNOTSUPP; -+ -+ ret = mnt_want_write_file(file); -+ if (ret) -+ return ret; -+ -+ inode_lock(&inode->v); -+ if (!inode_owner_or_capable(&inode->v)) { -+ ret = -EACCES; -+ goto setflags_out; -+ } -+ -+ mutex_lock(&inode->ei_update_lock); -+ ret = bch2_write_inode(c, inode, bch2_inode_flags_set, &s, -+ ATTR_CTIME); -+ mutex_unlock(&inode->ei_update_lock); -+ -+setflags_out: -+ inode_unlock(&inode->v); -+ mnt_drop_write_file(file); -+ return ret; -+} -+ -+static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode, -+ struct fsxattr __user *arg) -+{ -+ struct fsxattr fa = { 0 }; -+ -+ fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags); -+ fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ]; -+ -+ return copy_to_user(arg, &fa, sizeof(fa)); -+} -+ 
-+static int fssetxattr_inode_update_fn(struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, -+ void *p) -+{ -+ struct flags_set *s = p; -+ -+ if (s->projid != bi->bi_project) { -+ bi->bi_fields_set |= 1U << Inode_opt_project; -+ bi->bi_project = s->projid; -+ } -+ -+ return bch2_inode_flags_set(inode, bi, p); -+} -+ -+static int bch2_ioc_fssetxattr(struct bch_fs *c, -+ struct file *file, -+ struct bch_inode_info *inode, -+ struct fsxattr __user *arg) -+{ -+ struct flags_set s = { .mask = map_defined(bch_flags_to_xflags) }; -+ struct fsxattr fa; -+ int ret; -+ -+ if (copy_from_user(&fa, arg, sizeof(fa))) -+ return -EFAULT; -+ -+ s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags); -+ if (fa.fsx_xflags) -+ return -EOPNOTSUPP; -+ -+ if (fa.fsx_projid >= U32_MAX) -+ return -EINVAL; -+ -+ /* -+ * inode fields accessible via the xattr interface are stored with a +1 -+ * bias, so that 0 means unset: -+ */ -+ s.projid = fa.fsx_projid + 1; -+ -+ ret = mnt_want_write_file(file); -+ if (ret) -+ return ret; -+ -+ inode_lock(&inode->v); -+ if (!inode_owner_or_capable(&inode->v)) { -+ ret = -EACCES; -+ goto err; -+ } -+ -+ mutex_lock(&inode->ei_update_lock); -+ ret = bch2_set_projid(c, inode, fa.fsx_projid); -+ if (ret) -+ goto err_unlock; -+ -+ ret = bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, -+ ATTR_CTIME); -+err_unlock: -+ mutex_unlock(&inode->ei_update_lock); -+err: -+ inode_unlock(&inode->v); -+ mnt_drop_write_file(file); -+ return ret; -+} -+ -+static int bch2_reinherit_attrs_fn(struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, -+ void *p) -+{ -+ struct bch_inode_info *dir = p; -+ -+ return !bch2_reinherit_attrs(bi, &dir->ei_inode); -+} -+ -+static int bch2_ioc_reinherit_attrs(struct bch_fs *c, -+ struct file *file, -+ struct bch_inode_info *src, -+ const char __user *name) -+{ -+ struct bch_inode_info *dst; -+ struct inode *vinode = NULL; -+ char *kname = NULL; -+ struct qstr qstr; -+ int ret = 0; -+ u64 inum; -+ -+ kname 
= kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL); -+ if (!kname) -+ return -ENOMEM; -+ -+ ret = strncpy_from_user(kname, name, BCH_NAME_MAX); -+ if (unlikely(ret < 0)) -+ goto err1; -+ -+ qstr.len = ret; -+ qstr.name = kname; -+ -+ ret = -ENOENT; -+ inum = bch2_dirent_lookup(c, src->v.i_ino, -+ &src->ei_str_hash, -+ &qstr); -+ if (!inum) -+ goto err1; -+ -+ vinode = bch2_vfs_inode_get(c, inum); -+ ret = PTR_ERR_OR_ZERO(vinode); -+ if (ret) -+ goto err1; -+ -+ dst = to_bch_ei(vinode); -+ -+ ret = mnt_want_write_file(file); -+ if (ret) -+ goto err2; -+ -+ bch2_lock_inodes(INODE_UPDATE_LOCK, src, dst); -+ -+ if (inode_attr_changing(src, dst, Inode_opt_project)) { -+ ret = bch2_fs_quota_transfer(c, dst, -+ src->ei_qid, -+ 1 << QTYP_PRJ, -+ KEY_TYPE_QUOTA_PREALLOC); -+ if (ret) -+ goto err3; -+ } -+ -+ ret = bch2_write_inode(c, dst, bch2_reinherit_attrs_fn, src, 0); -+err3: -+ bch2_unlock_inodes(INODE_UPDATE_LOCK, src, dst); -+ -+ /* return true if we did work */ -+ if (ret >= 0) -+ ret = !ret; -+ -+ mnt_drop_write_file(file); -+err2: -+ iput(vinode); -+err1: -+ kfree(kname); -+ -+ return ret; -+} -+ -+long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) -+{ -+ struct bch_inode_info *inode = file_bch_inode(file); -+ struct super_block *sb = inode->v.i_sb; -+ struct bch_fs *c = sb->s_fs_info; -+ -+ switch (cmd) { -+ case FS_IOC_GETFLAGS: -+ return bch2_ioc_getflags(inode, (int __user *) arg); -+ -+ case FS_IOC_SETFLAGS: -+ return bch2_ioc_setflags(c, file, inode, (int __user *) arg); -+ -+ case FS_IOC_FSGETXATTR: -+ return bch2_ioc_fsgetxattr(inode, (void __user *) arg); -+ case FS_IOC_FSSETXATTR: -+ return bch2_ioc_fssetxattr(c, file, inode, -+ (void __user *) arg); -+ -+ case BCHFS_IOC_REINHERIT_ATTRS: -+ return bch2_ioc_reinherit_attrs(c, file, inode, -+ (void __user *) arg); -+ -+ case FS_IOC_GETVERSION: -+ return -ENOTTY; -+ case FS_IOC_SETVERSION: -+ return -ENOTTY; -+ -+ case FS_IOC_GOINGDOWN: -+ if (!capable(CAP_SYS_ADMIN)) -+ return -EPERM; -+ -+ 
down_write(&sb->s_umount); -+ sb->s_flags |= SB_RDONLY; -+ if (bch2_fs_emergency_read_only(c)) -+ bch_err(c, "emergency read only due to ioctl"); -+ up_write(&sb->s_umount); -+ return 0; -+ -+ default: -+ return bch2_fs_ioctl(c, cmd, (void __user *) arg); -+ } -+} -+ -+#ifdef CONFIG_COMPAT -+long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg) -+{ -+ /* These are just misnamed, they actually get/put from/to user an int */ -+ switch (cmd) { -+ case FS_IOC_GETFLAGS: -+ cmd = FS_IOC_GETFLAGS; -+ break; -+ case FS_IOC32_SETFLAGS: -+ cmd = FS_IOC_SETFLAGS; -+ break; -+ default: -+ return -ENOIOCTLCMD; -+ } -+ return bch2_fs_file_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); -+} -+#endif -+ -+#endif /* NO_BCACHEFS_FS */ -diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h -new file mode 100644 -index 000000000000..f201980ef2c3 ---- /dev/null -+++ b/fs/bcachefs/fs-ioctl.h -@@ -0,0 +1,81 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_FS_IOCTL_H -+#define _BCACHEFS_FS_IOCTL_H -+ -+/* Inode flags: */ -+ -+/* bcachefs inode flags -> vfs inode flags: */ -+static const unsigned bch_flags_to_vfs[] = { -+ [__BCH_INODE_SYNC] = S_SYNC, -+ [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE, -+ [__BCH_INODE_APPEND] = S_APPEND, -+ [__BCH_INODE_NOATIME] = S_NOATIME, -+}; -+ -+/* bcachefs inode flags -> FS_IOC_GETFLAGS: */ -+static const unsigned bch_flags_to_uflags[] = { -+ [__BCH_INODE_SYNC] = FS_SYNC_FL, -+ [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL, -+ [__BCH_INODE_APPEND] = FS_APPEND_FL, -+ [__BCH_INODE_NODUMP] = FS_NODUMP_FL, -+ [__BCH_INODE_NOATIME] = FS_NOATIME_FL, -+}; -+ -+/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ -+static const unsigned bch_flags_to_xflags[] = { -+ [__BCH_INODE_SYNC] = FS_XFLAG_SYNC, -+ [__BCH_INODE_IMMUTABLE] = FS_XFLAG_IMMUTABLE, -+ [__BCH_INODE_APPEND] = FS_XFLAG_APPEND, -+ [__BCH_INODE_NODUMP] = FS_XFLAG_NODUMP, -+ [__BCH_INODE_NOATIME] = FS_XFLAG_NOATIME, -+ //[__BCH_INODE_PROJINHERIT] = 
FS_XFLAG_PROJINHERIT; -+}; -+ -+#define set_flags(_map, _in, _out) \ -+do { \ -+ unsigned _i; \ -+ \ -+ for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ -+ if ((_in) & (1 << _i)) \ -+ (_out) |= _map[_i]; \ -+ else \ -+ (_out) &= ~_map[_i]; \ -+} while (0) -+ -+#define map_flags(_map, _in) \ -+({ \ -+ unsigned _out = 0; \ -+ \ -+ set_flags(_map, _in, _out); \ -+ _out; \ -+}) -+ -+#define map_flags_rev(_map, _in) \ -+({ \ -+ unsigned _i, _out = 0; \ -+ \ -+ for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ -+ if ((_in) & _map[_i]) { \ -+ (_out) |= 1 << _i; \ -+ (_in) &= ~_map[_i]; \ -+ } \ -+ (_out); \ -+}) -+ -+#define map_defined(_map) \ -+({ \ -+ unsigned _in = ~0; \ -+ \ -+ map_flags_rev(_map, _in); \ -+}) -+ -+/* Set VFS inode flags from bcachefs inode: */ -+static inline void bch2_inode_flags_to_vfs(struct bch_inode_info *inode) -+{ -+ set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags); -+} -+ -+long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long); -+long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long); -+ -+#endif /* _BCACHEFS_FS_IOCTL_H */ -diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c -new file mode 100644 -index 000000000000..6a9820e83db7 ---- /dev/null -+++ b/fs/bcachefs/fs.c -@@ -0,0 +1,1614 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#ifndef NO_BCACHEFS_FS -+ -+#include "bcachefs.h" -+#include "acl.h" -+#include "bkey_on_stack.h" -+#include "btree_update.h" -+#include "buckets.h" -+#include "chardev.h" -+#include "dirent.h" -+#include "extents.h" -+#include "fs.h" -+#include "fs-common.h" -+#include "fs-io.h" -+#include "fs-ioctl.h" -+#include "fsck.h" -+#include "inode.h" -+#include "io.h" -+#include "journal.h" -+#include "keylist.h" -+#include "quota.h" -+#include "super.h" -+#include "xattr.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+static struct kmem_cache *bch2_inode_cache; -+ -+static void bch2_vfs_inode_init(struct bch_fs *, -+ struct 
bch_inode_info *, -+ struct bch_inode_unpacked *); -+ -+static void journal_seq_copy(struct bch_fs *c, -+ struct bch_inode_info *dst, -+ u64 journal_seq) -+{ -+ u64 old, v = READ_ONCE(dst->ei_journal_seq); -+ -+ do { -+ old = v; -+ -+ if (old >= journal_seq) -+ break; -+ } while ((v = cmpxchg(&dst->ei_journal_seq, old, journal_seq)) != old); -+ -+ bch2_journal_set_has_inum(&c->journal, dst->v.i_ino, journal_seq); -+} -+ -+static void __pagecache_lock_put(struct pagecache_lock *lock, long i) -+{ -+ BUG_ON(atomic_long_read(&lock->v) == 0); -+ -+ if (atomic_long_sub_return_release(i, &lock->v) == 0) -+ wake_up_all(&lock->wait); -+} -+ -+static bool __pagecache_lock_tryget(struct pagecache_lock *lock, long i) -+{ -+ long v = atomic_long_read(&lock->v), old; -+ -+ do { -+ old = v; -+ -+ if (i > 0 ? v < 0 : v > 0) -+ return false; -+ } while ((v = atomic_long_cmpxchg_acquire(&lock->v, -+ old, old + i)) != old); -+ return true; -+} -+ -+static void __pagecache_lock_get(struct pagecache_lock *lock, long i) -+{ -+ wait_event(lock->wait, __pagecache_lock_tryget(lock, i)); -+} -+ -+void bch2_pagecache_add_put(struct pagecache_lock *lock) -+{ -+ __pagecache_lock_put(lock, 1); -+} -+ -+void bch2_pagecache_add_get(struct pagecache_lock *lock) -+{ -+ __pagecache_lock_get(lock, 1); -+} -+ -+void bch2_pagecache_block_put(struct pagecache_lock *lock) -+{ -+ __pagecache_lock_put(lock, -1); -+} -+ -+void bch2_pagecache_block_get(struct pagecache_lock *lock) -+{ -+ __pagecache_lock_get(lock, -1); -+} -+ -+void bch2_inode_update_after_write(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, -+ unsigned fields) -+{ -+ set_nlink(&inode->v, bch2_inode_nlink_get(bi)); -+ i_uid_write(&inode->v, bi->bi_uid); -+ i_gid_write(&inode->v, bi->bi_gid); -+ inode->v.i_mode = bi->bi_mode; -+ -+ if (fields & ATTR_ATIME) -+ inode->v.i_atime = bch2_time_to_timespec(c, bi->bi_atime); -+ if (fields & ATTR_MTIME) -+ inode->v.i_mtime = bch2_time_to_timespec(c, bi->bi_mtime); 
-+ if (fields & ATTR_CTIME) -+ inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime); -+ -+ inode->ei_inode = *bi; -+ -+ bch2_inode_flags_to_vfs(inode); -+} -+ -+int __must_check bch2_write_inode(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ inode_set_fn set, -+ void *p, unsigned fields) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bch_inode_unpacked inode_u; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ -+ iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, -+ BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(iter) ?: -+ (set ? set(inode, &inode_u, p) : 0) ?: -+ bch2_inode_write(&trans, iter, &inode_u) ?: -+ bch2_trans_commit(&trans, NULL, -+ &inode->ei_journal_seq, -+ BTREE_INSERT_NOUNLOCK| -+ BTREE_INSERT_NOFAIL); -+ -+ /* -+ * the btree node lock protects inode->ei_inode, not ei_update_lock; -+ * this is important for inode updates via bchfs_write_index_update -+ */ -+ if (!ret) -+ bch2_inode_update_after_write(c, inode, &inode_u, fields); -+ -+ bch2_trans_iter_put(&trans, iter); -+ -+ if (ret == -EINTR) -+ goto retry; -+ -+ bch2_trans_exit(&trans); -+ return ret < 0 ? 
ret : 0; -+} -+ -+int bch2_fs_quota_transfer(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct bch_qid new_qid, -+ unsigned qtypes, -+ enum quota_acct_mode mode) -+{ -+ unsigned i; -+ int ret; -+ -+ qtypes &= enabled_qtypes(c); -+ -+ for (i = 0; i < QTYP_NR; i++) -+ if (new_qid.q[i] == inode->ei_qid.q[i]) -+ qtypes &= ~(1U << i); -+ -+ if (!qtypes) -+ return 0; -+ -+ mutex_lock(&inode->ei_quota_lock); -+ -+ ret = bch2_quota_transfer(c, qtypes, new_qid, -+ inode->ei_qid, -+ inode->v.i_blocks + -+ inode->ei_quota_reserved, -+ mode); -+ if (!ret) -+ for (i = 0; i < QTYP_NR; i++) -+ if (qtypes & (1 << i)) -+ inode->ei_qid.q[i] = new_qid.q[i]; -+ -+ mutex_unlock(&inode->ei_quota_lock); -+ -+ return ret; -+} -+ -+struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum) -+{ -+ struct bch_inode_unpacked inode_u; -+ struct bch_inode_info *inode; -+ int ret; -+ -+ inode = to_bch_ei(iget_locked(c->vfs_sb, inum)); -+ if (unlikely(!inode)) -+ return ERR_PTR(-ENOMEM); -+ if (!(inode->v.i_state & I_NEW)) -+ return &inode->v; -+ -+ ret = bch2_inode_find_by_inum(c, inum, &inode_u); -+ if (ret) { -+ iget_failed(&inode->v); -+ return ERR_PTR(ret); -+ } -+ -+ bch2_vfs_inode_init(c, inode, &inode_u); -+ -+ inode->ei_journal_seq = bch2_inode_journal_seq(&c->journal, inum); -+ -+ unlock_new_inode(&inode->v); -+ -+ return &inode->v; -+} -+ -+static struct bch_inode_info * -+__bch2_create(struct bch_inode_info *dir, struct dentry *dentry, -+ umode_t mode, dev_t rdev, bool tmpfile) -+{ -+ struct bch_fs *c = dir->v.i_sb->s_fs_info; -+ struct user_namespace *ns = dir->v.i_sb->s_user_ns; -+ struct btree_trans trans; -+ struct bch_inode_unpacked dir_u; -+ struct bch_inode_info *inode, *old; -+ struct bch_inode_unpacked inode_u; -+ struct posix_acl *default_acl = NULL, *acl = NULL; -+ u64 journal_seq = 0; -+ int ret; -+ -+ /* -+ * preallocate acls + vfs inode before btree transaction, so that -+ * nothing can fail after the transaction succeeds: -+ */ -+#ifdef 
CONFIG_BCACHEFS_POSIX_ACL -+ ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl); -+ if (ret) -+ return ERR_PTR(ret); -+#endif -+ inode = to_bch_ei(new_inode(c->vfs_sb)); -+ if (unlikely(!inode)) { -+ inode = ERR_PTR(-ENOMEM); -+ goto err; -+ } -+ -+ bch2_inode_init_early(c, &inode_u); -+ -+ if (!tmpfile) -+ mutex_lock(&dir->ei_update_lock); -+ -+ bch2_trans_init(&trans, c, 8, 1024); -+retry: -+ bch2_trans_begin(&trans); -+ -+ ret = bch2_create_trans(&trans, dir->v.i_ino, &dir_u, &inode_u, -+ !tmpfile ? &dentry->d_name : NULL, -+ from_kuid(ns, current_fsuid()), -+ from_kgid(ns, current_fsgid()), -+ mode, rdev, -+ default_acl, acl) ?: -+ bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, -+ KEY_TYPE_QUOTA_PREALLOC); -+ if (unlikely(ret)) -+ goto err_before_quota; -+ -+ ret = bch2_trans_commit(&trans, NULL, &journal_seq, -+ BTREE_INSERT_NOUNLOCK); -+ if (unlikely(ret)) { -+ bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, -+ KEY_TYPE_QUOTA_WARN); -+err_before_quota: -+ if (ret == -EINTR) -+ goto retry; -+ goto err_trans; -+ } -+ -+ if (!tmpfile) { -+ bch2_inode_update_after_write(c, dir, &dir_u, -+ ATTR_MTIME|ATTR_CTIME); -+ journal_seq_copy(c, dir, journal_seq); -+ mutex_unlock(&dir->ei_update_lock); -+ } -+ -+ bch2_vfs_inode_init(c, inode, &inode_u); -+ journal_seq_copy(c, inode, journal_seq); -+ -+ set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); -+ set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); -+ -+ /* -+ * we must insert the new inode into the inode cache before calling -+ * bch2_trans_exit() and dropping locks, else we could race with another -+ * thread pulling the inode in and modifying it: -+ */ -+ -+ old = to_bch_ei(insert_inode_locked2(&inode->v)); -+ if (unlikely(old)) { -+ /* -+ * We raced, another process pulled the new inode into cache -+ * before us: -+ */ -+ journal_seq_copy(c, old, journal_seq); -+ make_bad_inode(&inode->v); -+ iput(&inode->v); -+ -+ inode = old; -+ } else { -+ /* -+ * we really don't want insert_inode_locked2() to 
be setting -+ * I_NEW... -+ */ -+ unlock_new_inode(&inode->v); -+ } -+ -+ bch2_trans_exit(&trans); -+err: -+ posix_acl_release(default_acl); -+ posix_acl_release(acl); -+ return inode; -+err_trans: -+ if (!tmpfile) -+ mutex_unlock(&dir->ei_update_lock); -+ -+ bch2_trans_exit(&trans); -+ make_bad_inode(&inode->v); -+ iput(&inode->v); -+ inode = ERR_PTR(ret); -+ goto err; -+} -+ -+/* methods */ -+ -+static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, -+ unsigned int flags) -+{ -+ struct bch_fs *c = vdir->i_sb->s_fs_info; -+ struct bch_inode_info *dir = to_bch_ei(vdir); -+ struct inode *vinode = NULL; -+ u64 inum; -+ -+ inum = bch2_dirent_lookup(c, dir->v.i_ino, -+ &dir->ei_str_hash, -+ &dentry->d_name); -+ -+ if (inum) -+ vinode = bch2_vfs_inode_get(c, inum); -+ -+ return d_splice_alias(vinode, dentry); -+} -+ -+static int bch2_mknod(struct inode *vdir, struct dentry *dentry, -+ umode_t mode, dev_t rdev) -+{ -+ struct bch_inode_info *inode = -+ __bch2_create(to_bch_ei(vdir), dentry, mode, rdev, false); -+ -+ if (IS_ERR(inode)) -+ return PTR_ERR(inode); -+ -+ d_instantiate(dentry, &inode->v); -+ return 0; -+} -+ -+static int bch2_create(struct inode *vdir, struct dentry *dentry, -+ umode_t mode, bool excl) -+{ -+ return bch2_mknod(vdir, dentry, mode|S_IFREG, 0); -+} -+ -+static int __bch2_link(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct bch_inode_info *dir, -+ struct dentry *dentry) -+{ -+ struct btree_trans trans; -+ struct bch_inode_unpacked dir_u, inode_u; -+ int ret; -+ -+ mutex_lock(&inode->ei_update_lock); -+ bch2_trans_init(&trans, c, 4, 1024); -+ -+ do { -+ bch2_trans_begin(&trans); -+ ret = bch2_link_trans(&trans, -+ dir->v.i_ino, -+ inode->v.i_ino, &dir_u, &inode_u, -+ &dentry->d_name) ?: -+ bch2_trans_commit(&trans, NULL, -+ &inode->ei_journal_seq, -+ BTREE_INSERT_NOUNLOCK); -+ } while (ret == -EINTR); -+ -+ if (likely(!ret)) { -+ BUG_ON(inode_u.bi_inum != inode->v.i_ino); -+ -+ journal_seq_copy(c, inode, 
dir->ei_journal_seq); -+ bch2_inode_update_after_write(c, dir, &dir_u, -+ ATTR_MTIME|ATTR_CTIME); -+ bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME); -+ } -+ -+ bch2_trans_exit(&trans); -+ mutex_unlock(&inode->ei_update_lock); -+ return ret; -+} -+ -+static int bch2_link(struct dentry *old_dentry, struct inode *vdir, -+ struct dentry *dentry) -+{ -+ struct bch_fs *c = vdir->i_sb->s_fs_info; -+ struct bch_inode_info *dir = to_bch_ei(vdir); -+ struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode); -+ int ret; -+ -+ lockdep_assert_held(&inode->v.i_rwsem); -+ -+ ret = __bch2_link(c, inode, dir, dentry); -+ if (unlikely(ret)) -+ return ret; -+ -+ ihold(&inode->v); -+ d_instantiate(dentry, &inode->v); -+ return 0; -+} -+ -+static int bch2_unlink(struct inode *vdir, struct dentry *dentry) -+{ -+ struct bch_fs *c = vdir->i_sb->s_fs_info; -+ struct bch_inode_info *dir = to_bch_ei(vdir); -+ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); -+ struct bch_inode_unpacked dir_u, inode_u; -+ struct btree_trans trans; -+ int ret; -+ -+ bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); -+ bch2_trans_init(&trans, c, 4, 1024); -+ -+ do { -+ bch2_trans_begin(&trans); -+ -+ ret = bch2_unlink_trans(&trans, -+ dir->v.i_ino, &dir_u, -+ &inode_u, &dentry->d_name) ?: -+ bch2_trans_commit(&trans, NULL, -+ &dir->ei_journal_seq, -+ BTREE_INSERT_NOUNLOCK| -+ BTREE_INSERT_NOFAIL); -+ } while (ret == -EINTR); -+ -+ if (likely(!ret)) { -+ BUG_ON(inode_u.bi_inum != inode->v.i_ino); -+ -+ journal_seq_copy(c, inode, dir->ei_journal_seq); -+ bch2_inode_update_after_write(c, dir, &dir_u, -+ ATTR_MTIME|ATTR_CTIME); -+ bch2_inode_update_after_write(c, inode, &inode_u, -+ ATTR_MTIME); -+ } -+ -+ bch2_trans_exit(&trans); -+ bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); -+ -+ return ret; -+} -+ -+static int bch2_symlink(struct inode *vdir, struct dentry *dentry, -+ const char *symname) -+{ -+ struct bch_fs *c = vdir->i_sb->s_fs_info; -+ struct bch_inode_info *dir = 
to_bch_ei(vdir), *inode; -+ int ret; -+ -+ inode = __bch2_create(dir, dentry, S_IFLNK|S_IRWXUGO, 0, true); -+ if (unlikely(IS_ERR(inode))) -+ return PTR_ERR(inode); -+ -+ inode_lock(&inode->v); -+ ret = page_symlink(&inode->v, symname, strlen(symname) + 1); -+ inode_unlock(&inode->v); -+ -+ if (unlikely(ret)) -+ goto err; -+ -+ ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX); -+ if (unlikely(ret)) -+ goto err; -+ -+ journal_seq_copy(c, dir, inode->ei_journal_seq); -+ -+ ret = __bch2_link(c, inode, dir, dentry); -+ if (unlikely(ret)) -+ goto err; -+ -+ d_instantiate(dentry, &inode->v); -+ return 0; -+err: -+ iput(&inode->v); -+ return ret; -+} -+ -+static int bch2_mkdir(struct inode *vdir, struct dentry *dentry, umode_t mode) -+{ -+ return bch2_mknod(vdir, dentry, mode|S_IFDIR, 0); -+} -+ -+static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry, -+ struct inode *dst_vdir, struct dentry *dst_dentry, -+ unsigned flags) -+{ -+ struct bch_fs *c = src_vdir->i_sb->s_fs_info; -+ struct bch_inode_info *src_dir = to_bch_ei(src_vdir); -+ struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir); -+ struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode); -+ struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode); -+ struct bch_inode_unpacked dst_dir_u, src_dir_u; -+ struct bch_inode_unpacked src_inode_u, dst_inode_u; -+ struct btree_trans trans; -+ enum bch_rename_mode mode = flags & RENAME_EXCHANGE -+ ? BCH_RENAME_EXCHANGE -+ : dst_dentry->d_inode -+ ? 
BCH_RENAME_OVERWRITE : BCH_RENAME; -+ u64 journal_seq = 0; -+ int ret; -+ -+ if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE)) -+ return -EINVAL; -+ -+ if (mode == BCH_RENAME_OVERWRITE) { -+ ret = filemap_write_and_wait_range(src_inode->v.i_mapping, -+ 0, LLONG_MAX); -+ if (ret) -+ return ret; -+ } -+ -+ bch2_trans_init(&trans, c, 8, 2048); -+ -+ bch2_lock_inodes(INODE_UPDATE_LOCK, -+ src_dir, -+ dst_dir, -+ src_inode, -+ dst_inode); -+ -+ if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) { -+ ret = bch2_fs_quota_transfer(c, src_inode, -+ dst_dir->ei_qid, -+ 1 << QTYP_PRJ, -+ KEY_TYPE_QUOTA_PREALLOC); -+ if (ret) -+ goto err; -+ } -+ -+ if (mode == BCH_RENAME_EXCHANGE && -+ inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) { -+ ret = bch2_fs_quota_transfer(c, dst_inode, -+ src_dir->ei_qid, -+ 1 << QTYP_PRJ, -+ KEY_TYPE_QUOTA_PREALLOC); -+ if (ret) -+ goto err; -+ } -+ -+retry: -+ bch2_trans_begin(&trans); -+ ret = bch2_rename_trans(&trans, -+ src_dir->v.i_ino, &src_dir_u, -+ dst_dir->v.i_ino, &dst_dir_u, -+ &src_inode_u, -+ &dst_inode_u, -+ &src_dentry->d_name, -+ &dst_dentry->d_name, -+ mode) ?: -+ bch2_trans_commit(&trans, NULL, -+ &journal_seq, -+ BTREE_INSERT_NOUNLOCK); -+ if (ret == -EINTR) -+ goto retry; -+ if (unlikely(ret)) -+ goto err; -+ -+ BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum); -+ BUG_ON(dst_inode && -+ dst_inode->v.i_ino != dst_inode_u.bi_inum); -+ -+ bch2_inode_update_after_write(c, src_dir, &src_dir_u, -+ ATTR_MTIME|ATTR_CTIME); -+ journal_seq_copy(c, src_dir, journal_seq); -+ -+ if (src_dir != dst_dir) { -+ bch2_inode_update_after_write(c, dst_dir, &dst_dir_u, -+ ATTR_MTIME|ATTR_CTIME); -+ journal_seq_copy(c, dst_dir, journal_seq); -+ } -+ -+ bch2_inode_update_after_write(c, src_inode, &src_inode_u, -+ ATTR_CTIME); -+ journal_seq_copy(c, src_inode, journal_seq); -+ -+ if (dst_inode) { -+ bch2_inode_update_after_write(c, dst_inode, &dst_inode_u, -+ ATTR_CTIME); -+ journal_seq_copy(c, dst_inode, journal_seq); -+ } 
-+err: -+ bch2_trans_exit(&trans); -+ -+ bch2_fs_quota_transfer(c, src_inode, -+ bch_qid(&src_inode->ei_inode), -+ 1 << QTYP_PRJ, -+ KEY_TYPE_QUOTA_NOCHECK); -+ if (dst_inode) -+ bch2_fs_quota_transfer(c, dst_inode, -+ bch_qid(&dst_inode->ei_inode), -+ 1 << QTYP_PRJ, -+ KEY_TYPE_QUOTA_NOCHECK); -+ -+ bch2_unlock_inodes(INODE_UPDATE_LOCK, -+ src_dir, -+ dst_dir, -+ src_inode, -+ dst_inode); -+ -+ return ret; -+} -+ -+void bch2_setattr_copy(struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, -+ struct iattr *attr) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ unsigned int ia_valid = attr->ia_valid; -+ -+ if (ia_valid & ATTR_UID) -+ bi->bi_uid = from_kuid(c->vfs_sb->s_user_ns, attr->ia_uid); -+ if (ia_valid & ATTR_GID) -+ bi->bi_gid = from_kgid(c->vfs_sb->s_user_ns, attr->ia_gid); -+ -+ if (ia_valid & ATTR_ATIME) -+ bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime); -+ if (ia_valid & ATTR_MTIME) -+ bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime); -+ if (ia_valid & ATTR_CTIME) -+ bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime); -+ -+ if (ia_valid & ATTR_MODE) { -+ umode_t mode = attr->ia_mode; -+ kgid_t gid = ia_valid & ATTR_GID -+ ? 
attr->ia_gid -+ : inode->v.i_gid; -+ -+ if (!in_group_p(gid) && -+ !capable_wrt_inode_uidgid(&inode->v, CAP_FSETID)) -+ mode &= ~S_ISGID; -+ bi->bi_mode = mode; -+ } -+} -+ -+static int bch2_setattr_nonsize(struct bch_inode_info *inode, -+ struct iattr *attr) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch_qid qid; -+ struct btree_trans trans; -+ struct btree_iter *inode_iter; -+ struct bch_inode_unpacked inode_u; -+ struct posix_acl *acl = NULL; -+ int ret; -+ -+ mutex_lock(&inode->ei_update_lock); -+ -+ qid = inode->ei_qid; -+ -+ if (attr->ia_valid & ATTR_UID) -+ qid.q[QTYP_USR] = from_kuid(&init_user_ns, attr->ia_uid); -+ -+ if (attr->ia_valid & ATTR_GID) -+ qid.q[QTYP_GRP] = from_kgid(&init_user_ns, attr->ia_gid); -+ -+ ret = bch2_fs_quota_transfer(c, inode, qid, ~0, -+ KEY_TYPE_QUOTA_PREALLOC); -+ if (ret) -+ goto err; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ kfree(acl); -+ acl = NULL; -+ -+ inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, -+ BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(inode_iter); -+ if (ret) -+ goto btree_err; -+ -+ bch2_setattr_copy(inode, &inode_u, attr); -+ -+ if (attr->ia_valid & ATTR_MODE) { -+ ret = bch2_acl_chmod(&trans, inode, inode_u.bi_mode, &acl); -+ if (ret) -+ goto btree_err; -+ } -+ -+ ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?: -+ bch2_trans_commit(&trans, NULL, -+ &inode->ei_journal_seq, -+ BTREE_INSERT_NOUNLOCK| -+ BTREE_INSERT_NOFAIL); -+btree_err: -+ if (ret == -EINTR) -+ goto retry; -+ if (unlikely(ret)) -+ goto err_trans; -+ -+ bch2_inode_update_after_write(c, inode, &inode_u, attr->ia_valid); -+ -+ if (acl) -+ set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); -+err_trans: -+ bch2_trans_exit(&trans); -+err: -+ mutex_unlock(&inode->ei_update_lock); -+ -+ return ret; -+} -+ -+static int bch2_getattr(const struct path *path, struct kstat *stat, -+ u32 request_mask, unsigned query_flags) -+{ -+ struct bch_inode_info *inode = 
to_bch_ei(d_inode(path->dentry)); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ -+ stat->dev = inode->v.i_sb->s_dev; -+ stat->ino = inode->v.i_ino; -+ stat->mode = inode->v.i_mode; -+ stat->nlink = inode->v.i_nlink; -+ stat->uid = inode->v.i_uid; -+ stat->gid = inode->v.i_gid; -+ stat->rdev = inode->v.i_rdev; -+ stat->size = i_size_read(&inode->v); -+ stat->atime = inode->v.i_atime; -+ stat->mtime = inode->v.i_mtime; -+ stat->ctime = inode->v.i_ctime; -+ stat->blksize = block_bytes(c); -+ stat->blocks = inode->v.i_blocks; -+ -+ if (request_mask & STATX_BTIME) { -+ stat->result_mask |= STATX_BTIME; -+ stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime); -+ } -+ -+ if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE) -+ stat->attributes |= STATX_ATTR_IMMUTABLE; -+ stat->attributes_mask |= STATX_ATTR_IMMUTABLE; -+ -+ if (inode->ei_inode.bi_flags & BCH_INODE_APPEND) -+ stat->attributes |= STATX_ATTR_APPEND; -+ stat->attributes_mask |= STATX_ATTR_APPEND; -+ -+ if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP) -+ stat->attributes |= STATX_ATTR_NODUMP; -+ stat->attributes_mask |= STATX_ATTR_NODUMP; -+ -+ return 0; -+} -+ -+static int bch2_setattr(struct dentry *dentry, struct iattr *iattr) -+{ -+ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); -+ int ret; -+ -+ lockdep_assert_held(&inode->v.i_rwsem); -+ -+ ret = setattr_prepare(dentry, iattr); -+ if (ret) -+ return ret; -+ -+ return iattr->ia_valid & ATTR_SIZE -+ ? 
bch2_truncate(inode, iattr) -+ : bch2_setattr_nonsize(inode, iattr); -+} -+ -+static int bch2_tmpfile(struct inode *vdir, struct dentry *dentry, umode_t mode) -+{ -+ struct bch_inode_info *inode = -+ __bch2_create(to_bch_ei(vdir), dentry, mode, 0, true); -+ -+ if (IS_ERR(inode)) -+ return PTR_ERR(inode); -+ -+ d_mark_tmpfile(dentry, &inode->v); -+ d_instantiate(dentry, &inode->v); -+ return 0; -+} -+ -+static int bch2_fill_extent(struct bch_fs *c, -+ struct fiemap_extent_info *info, -+ struct bkey_s_c k, unsigned flags) -+{ -+ if (bkey_extent_is_data(k.k)) { -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ int ret; -+ -+ if (k.k->type == KEY_TYPE_reflink_v) -+ flags |= FIEMAP_EXTENT_SHARED; -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ int flags2 = 0; -+ u64 offset = p.ptr.offset; -+ -+ if (p.crc.compression_type) -+ flags2 |= FIEMAP_EXTENT_ENCODED; -+ else -+ offset += p.crc.offset; -+ -+ if ((offset & (c->opts.block_size - 1)) || -+ (k.k->size & (c->opts.block_size - 1))) -+ flags2 |= FIEMAP_EXTENT_NOT_ALIGNED; -+ -+ ret = fiemap_fill_next_extent(info, -+ bkey_start_offset(k.k) << 9, -+ offset << 9, -+ k.k->size << 9, flags|flags2); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+ } else if (k.k->type == KEY_TYPE_reservation) { -+ return fiemap_fill_next_extent(info, -+ bkey_start_offset(k.k) << 9, -+ 0, k.k->size << 9, -+ flags| -+ FIEMAP_EXTENT_DELALLOC| -+ FIEMAP_EXTENT_UNWRITTEN); -+ } else { -+ BUG(); -+ } -+} -+ -+static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, -+ u64 start, u64 len) -+{ -+ struct bch_fs *c = vinode->i_sb->s_fs_info; -+ struct bch_inode_info *ei = to_bch_ei(vinode); -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct bkey_on_stack cur, prev; -+ struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); -+ unsigned offset_into_extent, sectors; -+ bool have_extent = false; -+ int ret = 0; -+ 
-+ ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC); -+ if (ret) -+ return ret; -+ -+ if (start + len < start) -+ return -EINVAL; -+ -+ bkey_on_stack_init(&cur); -+ bkey_on_stack_init(&prev); -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, -+ POS(ei->v.i_ino, start >> 9), 0); -+retry: -+ while ((k = bch2_btree_iter_peek(iter)).k && -+ !(ret = bkey_err(k)) && -+ bkey_cmp(iter->pos, end) < 0) { -+ if (!bkey_extent_is_data(k.k) && -+ k.k->type != KEY_TYPE_reservation) { -+ bch2_btree_iter_next(iter); -+ continue; -+ } -+ -+ bkey_on_stack_realloc(&cur, c, k.k->u64s); -+ bkey_on_stack_realloc(&prev, c, k.k->u64s); -+ bkey_reassemble(cur.k, k); -+ k = bkey_i_to_s_c(cur.k); -+ -+ offset_into_extent = iter->pos.offset - -+ bkey_start_offset(k.k); -+ sectors = k.k->size - offset_into_extent; -+ -+ ret = bch2_read_indirect_extent(&trans, -+ &offset_into_extent, &cur); -+ if (ret) -+ break; -+ -+ sectors = min(sectors, k.k->size - offset_into_extent); -+ -+ if (offset_into_extent) -+ bch2_cut_front(POS(k.k->p.inode, -+ bkey_start_offset(k.k) + -+ offset_into_extent), -+ cur.k); -+ bch2_key_resize(&cur.k->k, sectors); -+ cur.k->k.p = iter->pos; -+ cur.k->k.p.offset += cur.k->k.size; -+ -+ if (have_extent) { -+ ret = bch2_fill_extent(c, info, -+ bkey_i_to_s_c(prev.k), 0); -+ if (ret) -+ break; -+ } -+ -+ bkey_copy(prev.k, cur.k); -+ have_extent = true; -+ -+ if (k.k->type == KEY_TYPE_reflink_v) -+ bch2_btree_iter_set_pos(iter, k.k->p); -+ else -+ bch2_btree_iter_next(iter); -+ } -+ -+ if (ret == -EINTR) -+ goto retry; -+ -+ if (!ret && have_extent) -+ ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), -+ FIEMAP_EXTENT_LAST); -+ -+ ret = bch2_trans_exit(&trans) ?: ret; -+ bkey_on_stack_exit(&cur, c); -+ bkey_on_stack_exit(&prev, c); -+ return ret < 0 ? 
ret : 0; -+} -+ -+static const struct vm_operations_struct bch_vm_ops = { -+ .fault = bch2_page_fault, -+ .map_pages = filemap_map_pages, -+ .page_mkwrite = bch2_page_mkwrite, -+}; -+ -+static int bch2_mmap(struct file *file, struct vm_area_struct *vma) -+{ -+ file_accessed(file); -+ -+ vma->vm_ops = &bch_vm_ops; -+ return 0; -+} -+ -+/* Directories: */ -+ -+static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence) -+{ -+ return generic_file_llseek_size(file, offset, whence, -+ S64_MAX, S64_MAX); -+} -+ -+static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) -+{ -+ struct bch_inode_info *inode = file_bch_inode(file); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ -+ if (!dir_emit_dots(file, ctx)) -+ return 0; -+ -+ return bch2_readdir(c, inode->v.i_ino, ctx); -+} -+ -+static const struct file_operations bch_file_operations = { -+ .llseek = bch2_llseek, -+ .read_iter = bch2_read_iter, -+ .write_iter = bch2_write_iter, -+ .mmap = bch2_mmap, -+ .open = generic_file_open, -+ .fsync = bch2_fsync, -+ .splice_read = generic_file_splice_read, -+ /* -+ * Broken, on v5.3: -+ .splice_write = iter_file_splice_write, -+ */ -+ .fallocate = bch2_fallocate_dispatch, -+ .unlocked_ioctl = bch2_fs_file_ioctl, -+#ifdef CONFIG_COMPAT -+ .compat_ioctl = bch2_compat_fs_ioctl, -+#endif -+ .remap_file_range = bch2_remap_file_range, -+}; -+ -+static const struct inode_operations bch_file_inode_operations = { -+ .getattr = bch2_getattr, -+ .setattr = bch2_setattr, -+ .fiemap = bch2_fiemap, -+ .listxattr = bch2_xattr_list, -+#ifdef CONFIG_BCACHEFS_POSIX_ACL -+ .get_acl = bch2_get_acl, -+ .set_acl = bch2_set_acl, -+#endif -+}; -+ -+static const struct inode_operations bch_dir_inode_operations = { -+ .lookup = bch2_lookup, -+ .create = bch2_create, -+ .link = bch2_link, -+ .unlink = bch2_unlink, -+ .symlink = bch2_symlink, -+ .mkdir = bch2_mkdir, -+ .rmdir = bch2_unlink, -+ .mknod = bch2_mknod, -+ .rename = bch2_rename2, -+ .getattr = bch2_getattr, -+ 
.setattr = bch2_setattr, -+ .tmpfile = bch2_tmpfile, -+ .listxattr = bch2_xattr_list, -+#ifdef CONFIG_BCACHEFS_POSIX_ACL -+ .get_acl = bch2_get_acl, -+ .set_acl = bch2_set_acl, -+#endif -+}; -+ -+static const struct file_operations bch_dir_file_operations = { -+ .llseek = bch2_dir_llseek, -+ .read = generic_read_dir, -+ .iterate_shared = bch2_vfs_readdir, -+ .fsync = bch2_fsync, -+ .unlocked_ioctl = bch2_fs_file_ioctl, -+#ifdef CONFIG_COMPAT -+ .compat_ioctl = bch2_compat_fs_ioctl, -+#endif -+}; -+ -+static const struct inode_operations bch_symlink_inode_operations = { -+ .get_link = page_get_link, -+ .getattr = bch2_getattr, -+ .setattr = bch2_setattr, -+ .listxattr = bch2_xattr_list, -+#ifdef CONFIG_BCACHEFS_POSIX_ACL -+ .get_acl = bch2_get_acl, -+ .set_acl = bch2_set_acl, -+#endif -+}; -+ -+static const struct inode_operations bch_special_inode_operations = { -+ .getattr = bch2_getattr, -+ .setattr = bch2_setattr, -+ .listxattr = bch2_xattr_list, -+#ifdef CONFIG_BCACHEFS_POSIX_ACL -+ .get_acl = bch2_get_acl, -+ .set_acl = bch2_set_acl, -+#endif -+}; -+ -+static const struct address_space_operations bch_address_space_operations = { -+ .writepage = bch2_writepage, -+ .readpage = bch2_readpage, -+ .writepages = bch2_writepages, -+ .readpages = bch2_readpages, -+ .set_page_dirty = __set_page_dirty_nobuffers, -+ .write_begin = bch2_write_begin, -+ .write_end = bch2_write_end, -+ .invalidatepage = bch2_invalidatepage, -+ .releasepage = bch2_releasepage, -+ .direct_IO = noop_direct_IO, -+#ifdef CONFIG_MIGRATION -+ .migratepage = bch2_migrate_page, -+#endif -+ .error_remove_page = generic_error_remove_page, -+}; -+ -+static struct inode *bch2_nfs_get_inode(struct super_block *sb, -+ u64 ino, u32 generation) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ struct inode *vinode; -+ -+ if (ino < BCACHEFS_ROOT_INO) -+ return ERR_PTR(-ESTALE); -+ -+ vinode = bch2_vfs_inode_get(c, ino); -+ if (IS_ERR(vinode)) -+ return ERR_CAST(vinode); -+ if (generation && vinode->i_generation != 
generation) { -+ /* we didn't find the right inode.. */ -+ iput(vinode); -+ return ERR_PTR(-ESTALE); -+ } -+ return vinode; -+} -+ -+static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *fid, -+ int fh_len, int fh_type) -+{ -+ return generic_fh_to_dentry(sb, fid, fh_len, fh_type, -+ bch2_nfs_get_inode); -+} -+ -+static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *fid, -+ int fh_len, int fh_type) -+{ -+ return generic_fh_to_parent(sb, fid, fh_len, fh_type, -+ bch2_nfs_get_inode); -+} -+ -+static const struct export_operations bch_export_ops = { -+ .fh_to_dentry = bch2_fh_to_dentry, -+ .fh_to_parent = bch2_fh_to_parent, -+ //.get_parent = bch2_get_parent, -+}; -+ -+static void bch2_vfs_inode_init(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi) -+{ -+ bch2_inode_update_after_write(c, inode, bi, ~0); -+ -+ inode->v.i_blocks = bi->bi_sectors; -+ inode->v.i_ino = bi->bi_inum; -+ inode->v.i_rdev = bi->bi_dev; -+ inode->v.i_generation = bi->bi_generation; -+ inode->v.i_size = bi->bi_size; -+ -+ inode->ei_journal_seq = 0; -+ inode->ei_quota_reserved = 0; -+ inode->ei_str_hash = bch2_hash_info_init(c, bi); -+ inode->ei_qid = bch_qid(bi); -+ -+ inode->v.i_mapping->a_ops = &bch_address_space_operations; -+ -+ switch (inode->v.i_mode & S_IFMT) { -+ case S_IFREG: -+ inode->v.i_op = &bch_file_inode_operations; -+ inode->v.i_fop = &bch_file_operations; -+ break; -+ case S_IFDIR: -+ inode->v.i_op = &bch_dir_inode_operations; -+ inode->v.i_fop = &bch_dir_file_operations; -+ break; -+ case S_IFLNK: -+ inode_nohighmem(&inode->v); -+ inode->v.i_op = &bch_symlink_inode_operations; -+ break; -+ default: -+ init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev); -+ inode->v.i_op = &bch_special_inode_operations; -+ break; -+ } -+} -+ -+static struct inode *bch2_alloc_inode(struct super_block *sb) -+{ -+ struct bch_inode_info *inode; -+ -+ inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS); -+ if 
(!inode) -+ return NULL; -+ -+ inode_init_once(&inode->v); -+ mutex_init(&inode->ei_update_lock); -+ pagecache_lock_init(&inode->ei_pagecache_lock); -+ mutex_init(&inode->ei_quota_lock); -+ inode->ei_journal_seq = 0; -+ -+ return &inode->v; -+} -+ -+static void bch2_i_callback(struct rcu_head *head) -+{ -+ struct inode *vinode = container_of(head, struct inode, i_rcu); -+ struct bch_inode_info *inode = to_bch_ei(vinode); -+ -+ kmem_cache_free(bch2_inode_cache, inode); -+} -+ -+static void bch2_destroy_inode(struct inode *vinode) -+{ -+ call_rcu(&vinode->i_rcu, bch2_i_callback); -+} -+ -+static int inode_update_times_fn(struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, -+ void *p) -+{ -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ -+ bi->bi_atime = timespec_to_bch2_time(c, inode->v.i_atime); -+ bi->bi_mtime = timespec_to_bch2_time(c, inode->v.i_mtime); -+ bi->bi_ctime = timespec_to_bch2_time(c, inode->v.i_ctime); -+ -+ return 0; -+} -+ -+static int bch2_vfs_write_inode(struct inode *vinode, -+ struct writeback_control *wbc) -+{ -+ struct bch_fs *c = vinode->i_sb->s_fs_info; -+ struct bch_inode_info *inode = to_bch_ei(vinode); -+ int ret; -+ -+ mutex_lock(&inode->ei_update_lock); -+ ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, -+ ATTR_ATIME|ATTR_MTIME|ATTR_CTIME); -+ mutex_unlock(&inode->ei_update_lock); -+ -+ return ret; -+} -+ -+static void bch2_evict_inode(struct inode *vinode) -+{ -+ struct bch_fs *c = vinode->i_sb->s_fs_info; -+ struct bch_inode_info *inode = to_bch_ei(vinode); -+ -+ truncate_inode_pages_final(&inode->v.i_data); -+ -+ clear_inode(&inode->v); -+ -+ BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved); -+ -+ if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) { -+ bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks), -+ KEY_TYPE_QUOTA_WARN); -+ bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, -+ KEY_TYPE_QUOTA_WARN); -+ bch2_inode_rm(c, inode->v.i_ino); -+ } -+} -+ -+static int 
bch2_statfs(struct dentry *dentry, struct kstatfs *buf) -+{ -+ struct super_block *sb = dentry->d_sb; -+ struct bch_fs *c = sb->s_fs_info; -+ struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c); -+ unsigned shift = sb->s_blocksize_bits - 9; -+ u64 fsid; -+ -+ buf->f_type = BCACHEFS_STATFS_MAGIC; -+ buf->f_bsize = sb->s_blocksize; -+ buf->f_blocks = usage.capacity >> shift; -+ buf->f_bfree = (usage.capacity - usage.used) >> shift; -+ buf->f_bavail = buf->f_bfree; -+ buf->f_files = 0; -+ buf->f_ffree = 0; -+ -+ fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^ -+ le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64)); -+ buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; -+ buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; -+ buf->f_namelen = BCH_NAME_MAX; -+ -+ return 0; -+} -+ -+static int bch2_sync_fs(struct super_block *sb, int wait) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ -+ if (c->opts.journal_flush_disabled) -+ return 0; -+ -+ if (!wait) { -+ bch2_journal_flush_async(&c->journal, NULL); -+ return 0; -+ } -+ -+ return bch2_journal_flush(&c->journal); -+} -+ -+static struct bch_fs *bch2_path_to_fs(const char *dev) -+{ -+ struct bch_fs *c; -+ struct block_device *bdev = lookup_bdev(dev); -+ -+ if (IS_ERR(bdev)) -+ return ERR_CAST(bdev); -+ -+ c = bch2_bdev_to_fs(bdev); -+ bdput(bdev); -+ if (c) -+ closure_put(&c->cl); -+ return c ?: ERR_PTR(-ENOENT); -+} -+ -+static char **split_devs(const char *_dev_name, unsigned *nr) -+{ -+ char *dev_name = NULL, **devs = NULL, *s; -+ size_t i, nr_devs = 0; -+ -+ dev_name = kstrdup(_dev_name, GFP_KERNEL); -+ if (!dev_name) -+ return NULL; -+ -+ for (s = dev_name; s; s = strchr(s + 1, ':')) -+ nr_devs++; -+ -+ devs = kcalloc(nr_devs + 1, sizeof(const char *), GFP_KERNEL); -+ if (!devs) { -+ kfree(dev_name); -+ return NULL; -+ } -+ -+ for (i = 0, s = dev_name; -+ s; -+ (s = strchr(s, ':')) && (*s++ = '\0')) -+ devs[i++] = s; -+ -+ *nr = nr_devs; -+ return devs; -+} -+ -+static int bch2_remount(struct super_block *sb, int 
*flags, char *data) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ struct bch_opts opts = bch2_opts_empty(); -+ int ret; -+ -+ opt_set(opts, read_only, (*flags & SB_RDONLY) != 0); -+ -+ ret = bch2_parse_mount_opts(&opts, data); -+ if (ret) -+ return ret; -+ -+ if (opts.read_only != c->opts.read_only) { -+ down_write(&c->state_lock); -+ -+ if (opts.read_only) { -+ bch2_fs_read_only(c); -+ -+ sb->s_flags |= SB_RDONLY; -+ } else { -+ ret = bch2_fs_read_write(c); -+ if (ret) { -+ bch_err(c, "error going rw: %i", ret); -+ up_write(&c->state_lock); -+ return -EINVAL; -+ } -+ -+ sb->s_flags &= ~SB_RDONLY; -+ } -+ -+ c->opts.read_only = opts.read_only; -+ -+ up_write(&c->state_lock); -+ } -+ -+ if (opts.errors >= 0) -+ c->opts.errors = opts.errors; -+ -+ return ret; -+} -+ -+static int bch2_show_devname(struct seq_file *seq, struct dentry *root) -+{ -+ struct bch_fs *c = root->d_sb->s_fs_info; -+ struct bch_dev *ca; -+ unsigned i; -+ bool first = true; -+ -+ for_each_online_member(ca, c, i) { -+ if (!first) -+ seq_putc(seq, ':'); -+ first = false; -+ seq_puts(seq, "/dev/"); -+ seq_puts(seq, ca->name); -+ } -+ -+ return 0; -+} -+ -+static int bch2_show_options(struct seq_file *seq, struct dentry *root) -+{ -+ struct bch_fs *c = root->d_sb->s_fs_info; -+ enum bch_opt_id i; -+ char buf[512]; -+ -+ for (i = 0; i < bch2_opts_nr; i++) { -+ const struct bch_option *opt = &bch2_opt_table[i]; -+ u64 v = bch2_opt_get_by_id(&c->opts, i); -+ -+ if (!(opt->mode & OPT_MOUNT)) -+ continue; -+ -+ if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) -+ continue; -+ -+ bch2_opt_to_text(&PBUF(buf), c, opt, v, -+ OPT_SHOW_MOUNT_STYLE); -+ seq_putc(seq, ','); -+ seq_puts(seq, buf); -+ } -+ -+ return 0; -+} -+ -+static void bch2_put_super(struct super_block *sb) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ -+ __bch2_fs_stop(c); -+} -+ -+static const struct super_operations bch_super_operations = { -+ .alloc_inode = bch2_alloc_inode, -+ .destroy_inode = bch2_destroy_inode, -+ .write_inode = 
bch2_vfs_write_inode, -+ .evict_inode = bch2_evict_inode, -+ .sync_fs = bch2_sync_fs, -+ .statfs = bch2_statfs, -+ .show_devname = bch2_show_devname, -+ .show_options = bch2_show_options, -+ .remount_fs = bch2_remount, -+ .put_super = bch2_put_super, -+#if 0 -+ .freeze_fs = bch2_freeze, -+ .unfreeze_fs = bch2_unfreeze, -+#endif -+}; -+ -+static int bch2_set_super(struct super_block *s, void *data) -+{ -+ s->s_fs_info = data; -+ return 0; -+} -+ -+static int bch2_noset_super(struct super_block *s, void *data) -+{ -+ return -EBUSY; -+} -+ -+static int bch2_test_super(struct super_block *s, void *data) -+{ -+ struct bch_fs *c = s->s_fs_info; -+ struct bch_fs **devs = data; -+ unsigned i; -+ -+ if (!c) -+ return false; -+ -+ for (i = 0; devs[i]; i++) -+ if (c != devs[i]) -+ return false; -+ return true; -+} -+ -+static struct dentry *bch2_mount(struct file_system_type *fs_type, -+ int flags, const char *dev_name, void *data) -+{ -+ struct bch_fs *c; -+ struct bch_dev *ca; -+ struct super_block *sb; -+ struct inode *vinode; -+ struct bch_opts opts = bch2_opts_empty(); -+ char **devs; -+ struct bch_fs **devs_to_fs = NULL; -+ unsigned i, nr_devs; -+ int ret; -+ -+ opt_set(opts, read_only, (flags & SB_RDONLY) != 0); -+ -+ ret = bch2_parse_mount_opts(&opts, data); -+ if (ret) -+ return ERR_PTR(ret); -+ -+ devs = split_devs(dev_name, &nr_devs); -+ if (!devs) -+ return ERR_PTR(-ENOMEM); -+ -+ devs_to_fs = kcalloc(nr_devs + 1, sizeof(void *), GFP_KERNEL); -+ if (!devs_to_fs) { -+ sb = ERR_PTR(-ENOMEM); -+ goto got_sb; -+ } -+ -+ for (i = 0; i < nr_devs; i++) -+ devs_to_fs[i] = bch2_path_to_fs(devs[i]); -+ -+ sb = sget(fs_type, bch2_test_super, bch2_noset_super, -+ flags|SB_NOSEC, devs_to_fs); -+ if (!IS_ERR(sb)) -+ goto got_sb; -+ -+ c = bch2_fs_open(devs, nr_devs, opts); -+ -+ if (!IS_ERR(c)) -+ sb = sget(fs_type, NULL, bch2_set_super, flags|SB_NOSEC, c); -+ else -+ sb = ERR_CAST(c); -+got_sb: -+ kfree(devs_to_fs); -+ kfree(devs[0]); -+ kfree(devs); -+ -+ if (IS_ERR(sb)) -+ 
return ERR_CAST(sb); -+ -+ c = sb->s_fs_info; -+ -+ if (sb->s_root) { -+ if ((flags ^ sb->s_flags) & SB_RDONLY) { -+ ret = -EBUSY; -+ goto err_put_super; -+ } -+ goto out; -+ } -+ -+ sb->s_blocksize = block_bytes(c); -+ sb->s_blocksize_bits = ilog2(block_bytes(c)); -+ sb->s_maxbytes = MAX_LFS_FILESIZE; -+ sb->s_op = &bch_super_operations; -+ sb->s_export_op = &bch_export_ops; -+#ifdef CONFIG_BCACHEFS_QUOTA -+ sb->s_qcop = &bch2_quotactl_operations; -+ sb->s_quota_types = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ; -+#endif -+ sb->s_xattr = bch2_xattr_handlers; -+ sb->s_magic = BCACHEFS_STATFS_MAGIC; -+ sb->s_time_gran = c->sb.time_precision; -+ c->vfs_sb = sb; -+ strlcpy(sb->s_id, c->name, sizeof(sb->s_id)); -+ -+ ret = super_setup_bdi(sb); -+ if (ret) -+ goto err_put_super; -+ -+ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; -+ -+ for_each_online_member(ca, c, i) { -+ struct block_device *bdev = ca->disk_sb.bdev; -+ -+ /* XXX: create an anonymous device for multi device filesystems */ -+ sb->s_bdev = bdev; -+ sb->s_dev = bdev->bd_dev; -+ percpu_ref_put(&ca->io_ref); -+ break; -+ } -+ -+#ifdef CONFIG_BCACHEFS_POSIX_ACL -+ if (c->opts.acl) -+ sb->s_flags |= SB_POSIXACL; -+#endif -+ -+ vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_INO); -+ if (IS_ERR(vinode)) { -+ bch_err(c, "error mounting: error getting root inode %i", -+ (int) PTR_ERR(vinode)); -+ ret = PTR_ERR(vinode); -+ goto err_put_super; -+ } -+ -+ sb->s_root = d_make_root(vinode); -+ if (!sb->s_root) { -+ bch_err(c, "error mounting: error allocating root dentry"); -+ ret = -ENOMEM; -+ goto err_put_super; -+ } -+ -+ sb->s_flags |= SB_ACTIVE; -+out: -+ return dget(sb->s_root); -+ -+err_put_super: -+ deactivate_locked_super(sb); -+ return ERR_PTR(ret); -+} -+ -+static void bch2_kill_sb(struct super_block *sb) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ -+ generic_shutdown_super(sb); -+ bch2_fs_free(c); -+} -+ -+static struct file_system_type bcache_fs_type = { -+ .owner = THIS_MODULE, -+ .name = "bcachefs", -+ 
.mount = bch2_mount, -+ .kill_sb = bch2_kill_sb, -+ .fs_flags = FS_REQUIRES_DEV, -+}; -+ -+MODULE_ALIAS_FS("bcachefs"); -+ -+void bch2_vfs_exit(void) -+{ -+ unregister_filesystem(&bcache_fs_type); -+ if (bch2_inode_cache) -+ kmem_cache_destroy(bch2_inode_cache); -+} -+ -+int __init bch2_vfs_init(void) -+{ -+ int ret = -ENOMEM; -+ -+ bch2_inode_cache = KMEM_CACHE(bch_inode_info, 0); -+ if (!bch2_inode_cache) -+ goto err; -+ -+ ret = register_filesystem(&bcache_fs_type); -+ if (ret) -+ goto err; -+ -+ return 0; -+err: -+ bch2_vfs_exit(); -+ return ret; -+} -+ -+#endif /* NO_BCACHEFS_FS */ -diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h -new file mode 100644 -index 000000000000..eda903a45325 ---- /dev/null -+++ b/fs/bcachefs/fs.h -@@ -0,0 +1,174 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_FS_H -+#define _BCACHEFS_FS_H -+ -+#include "inode.h" -+#include "opts.h" -+#include "str_hash.h" -+#include "quota_types.h" -+ -+#include -+#include -+ -+/* -+ * Two-state lock - can be taken for add or block - both states are shared, -+ * like read side of rwsem, but conflict with other state: -+ */ -+struct pagecache_lock { -+ atomic_long_t v; -+ wait_queue_head_t wait; -+}; -+ -+static inline void pagecache_lock_init(struct pagecache_lock *lock) -+{ -+ atomic_long_set(&lock->v, 0); -+ init_waitqueue_head(&lock->wait); -+} -+ -+void bch2_pagecache_add_put(struct pagecache_lock *); -+void bch2_pagecache_add_get(struct pagecache_lock *); -+void bch2_pagecache_block_put(struct pagecache_lock *); -+void bch2_pagecache_block_get(struct pagecache_lock *); -+ -+struct bch_inode_info { -+ struct inode v; -+ -+ struct mutex ei_update_lock; -+ u64 ei_journal_seq; -+ u64 ei_quota_reserved; -+ unsigned long ei_last_dirtied; -+ -+ struct pagecache_lock ei_pagecache_lock; -+ -+ struct mutex ei_quota_lock; -+ struct bch_qid ei_qid; -+ -+ struct bch_hash_info ei_str_hash; -+ -+ /* copy of inode in btree: */ -+ struct bch_inode_unpacked ei_inode; -+}; -+ -+#define 
to_bch_ei(_inode) \ -+ container_of_or_null(_inode, struct bch_inode_info, v) -+ -+static inline int ptrcmp(void *l, void *r) -+{ -+ return cmp_int(l, r); -+} -+ -+enum bch_inode_lock_op { -+ INODE_LOCK = (1U << 0), -+ INODE_PAGECACHE_BLOCK = (1U << 1), -+ INODE_UPDATE_LOCK = (1U << 2), -+}; -+ -+#define bch2_lock_inodes(_locks, ...) \ -+do { \ -+ struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \ -+ unsigned i; \ -+ \ -+ bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \ -+ \ -+ for (i = 1; i < ARRAY_SIZE(a); i++) \ -+ if (a[i] != a[i - 1]) { \ -+ if ((_locks) & INODE_LOCK) \ -+ down_write_nested(&a[i]->v.i_rwsem, i); \ -+ if ((_locks) & INODE_PAGECACHE_BLOCK) \ -+ bch2_pagecache_block_get(&a[i]->ei_pagecache_lock);\ -+ if ((_locks) & INODE_UPDATE_LOCK) \ -+ mutex_lock_nested(&a[i]->ei_update_lock, i);\ -+ } \ -+} while (0) -+ -+#define bch2_unlock_inodes(_locks, ...) \ -+do { \ -+ struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \ -+ unsigned i; \ -+ \ -+ bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \ -+ \ -+ for (i = 1; i < ARRAY_SIZE(a); i++) \ -+ if (a[i] != a[i - 1]) { \ -+ if ((_locks) & INODE_LOCK) \ -+ up_write(&a[i]->v.i_rwsem); \ -+ if ((_locks) & INODE_PAGECACHE_BLOCK) \ -+ bch2_pagecache_block_put(&a[i]->ei_pagecache_lock);\ -+ if ((_locks) & INODE_UPDATE_LOCK) \ -+ mutex_unlock(&a[i]->ei_update_lock); \ -+ } \ -+} while (0) -+ -+static inline struct bch_inode_info *file_bch_inode(struct file *file) -+{ -+ return to_bch_ei(file_inode(file)); -+} -+ -+static inline bool inode_attr_changing(struct bch_inode_info *dir, -+ struct bch_inode_info *inode, -+ enum inode_opt_id id) -+{ -+ return !(inode->ei_inode.bi_fields_set & (1 << id)) && -+ bch2_inode_opt_get(&dir->ei_inode, id) != -+ bch2_inode_opt_get(&inode->ei_inode, id); -+} -+ -+static inline bool inode_attrs_changing(struct bch_inode_info *dir, -+ struct bch_inode_info *inode) -+{ -+ unsigned id; -+ -+ for (id = 0; id < Inode_opt_nr; id++) -+ if (inode_attr_changing(dir, inode, id)) -+ return 
true; -+ -+ return false; -+} -+ -+struct bch_inode_unpacked; -+ -+#ifndef NO_BCACHEFS_FS -+ -+int bch2_fs_quota_transfer(struct bch_fs *, -+ struct bch_inode_info *, -+ struct bch_qid, -+ unsigned, -+ enum quota_acct_mode); -+ -+static inline int bch2_set_projid(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ u32 projid) -+{ -+ struct bch_qid qid = inode->ei_qid; -+ -+ qid.q[QTYP_PRJ] = projid; -+ -+ return bch2_fs_quota_transfer(c, inode, qid, -+ 1 << QTYP_PRJ, -+ KEY_TYPE_QUOTA_PREALLOC); -+} -+ -+struct inode *bch2_vfs_inode_get(struct bch_fs *, u64); -+ -+/* returns 0 if we want to do the update, or error is passed up */ -+typedef int (*inode_set_fn)(struct bch_inode_info *, -+ struct bch_inode_unpacked *, void *); -+ -+void bch2_inode_update_after_write(struct bch_fs *, -+ struct bch_inode_info *, -+ struct bch_inode_unpacked *, -+ unsigned); -+int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *, -+ inode_set_fn, void *, unsigned); -+ -+void bch2_vfs_exit(void); -+int bch2_vfs_init(void); -+ -+#else -+ -+static inline void bch2_vfs_exit(void) {} -+static inline int bch2_vfs_init(void) { return 0; } -+ -+#endif /* NO_BCACHEFS_FS */ -+ -+#endif /* _BCACHEFS_FS_H */ -diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c -new file mode 100644 -index 000000000000..5a6df3d1973a ---- /dev/null -+++ b/fs/bcachefs/fsck.c -@@ -0,0 +1,1502 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "bkey_on_stack.h" -+#include "btree_update.h" -+#include "dirent.h" -+#include "error.h" -+#include "fs-common.h" -+#include "fsck.h" -+#include "inode.h" -+#include "keylist.h" -+#include "super.h" -+#include "xattr.h" -+ -+#include /* struct qstr */ -+#include -+ -+#define QSTR(n) { { { .len = strlen(n) } }, .name = n } -+ -+static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) -+{ -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ u64 sectors = 0; -+ int ret; -+ -+ for_each_btree_key(trans, iter, 
BTREE_ID_EXTENTS, -+ POS(inum, 0), 0, k, ret) { -+ if (k.k->p.inode != inum) -+ break; -+ -+ if (bkey_extent_is_allocation(k.k)) -+ sectors += k.k->size; -+ } -+ -+ bch2_trans_iter_free(trans, iter); -+ -+ return ret ?: sectors; -+} -+ -+static int __remove_dirent(struct btree_trans *trans, -+ struct bkey_s_c_dirent dirent) -+{ -+ struct bch_fs *c = trans->c; -+ struct qstr name; -+ struct bch_inode_unpacked dir_inode; -+ struct bch_hash_info dir_hash_info; -+ u64 dir_inum = dirent.k->p.inode; -+ int ret; -+ char *buf; -+ -+ name.len = bch2_dirent_name_bytes(dirent); -+ buf = bch2_trans_kmalloc(trans, name.len + 1); -+ if (IS_ERR(buf)) -+ return PTR_ERR(buf); -+ -+ memcpy(buf, dirent.v->d_name, name.len); -+ buf[name.len] = '\0'; -+ name.name = buf; -+ -+ ret = bch2_inode_find_by_inum_trans(trans, dir_inum, &dir_inode); -+ if (ret && ret != -EINTR) -+ bch_err(c, "remove_dirent: err %i looking up directory inode", ret); -+ if (ret) -+ return ret; -+ -+ dir_hash_info = bch2_hash_info_init(c, &dir_inode); -+ -+ ret = bch2_hash_delete(trans, bch2_dirent_hash_desc, -+ &dir_hash_info, dir_inum, &name); -+ if (ret && ret != -EINTR) -+ bch_err(c, "remove_dirent: err %i deleting dirent", ret); -+ if (ret) -+ return ret; -+ -+ return 0; -+} -+ -+static int remove_dirent(struct btree_trans *trans, -+ struct bkey_s_c_dirent dirent) -+{ -+ return __bch2_trans_do(trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW, -+ __remove_dirent(trans, dirent)); -+} -+ -+static int reattach_inode(struct bch_fs *c, -+ struct bch_inode_unpacked *lostfound_inode, -+ u64 inum) -+{ -+ struct bch_inode_unpacked dir_u, inode_u; -+ char name_buf[20]; -+ struct qstr name; -+ int ret; -+ -+ snprintf(name_buf, sizeof(name_buf), "%llu", inum); -+ name = (struct qstr) QSTR(name_buf); -+ -+ ret = bch2_trans_do(c, NULL, NULL, -+ BTREE_INSERT_LAZY_RW, -+ bch2_link_trans(&trans, lostfound_inode->bi_inum, -+ inum, &dir_u, &inode_u, &name)); -+ if (ret) -+ bch_err(c, "error %i reattaching inode 
%llu", ret, inum); -+ -+ return ret; -+} -+ -+struct inode_walker { -+ bool first_this_inode; -+ bool have_inode; -+ u64 cur_inum; -+ struct bch_inode_unpacked inode; -+}; -+ -+static struct inode_walker inode_walker_init(void) -+{ -+ return (struct inode_walker) { -+ .cur_inum = -1, -+ .have_inode = false, -+ }; -+} -+ -+static int walk_inode(struct btree_trans *trans, -+ struct inode_walker *w, u64 inum) -+{ -+ if (inum != w->cur_inum) { -+ int ret = bch2_inode_find_by_inum_trans(trans, inum, -+ &w->inode); -+ -+ if (ret && ret != -ENOENT) -+ return ret; -+ -+ w->have_inode = !ret; -+ w->cur_inum = inum; -+ w->first_this_inode = true; -+ } else { -+ w->first_this_inode = false; -+ } -+ -+ return 0; -+} -+ -+struct hash_check { -+ struct bch_hash_info info; -+ -+ /* start of current chain of hash collisions: */ -+ struct btree_iter *chain; -+ -+ /* next offset in current chain of hash collisions: */ -+ u64 chain_end; -+}; -+ -+static void hash_check_init(struct hash_check *h) -+{ -+ h->chain = NULL; -+ h->chain_end = 0; -+} -+ -+static void hash_stop_chain(struct btree_trans *trans, -+ struct hash_check *h) -+{ -+ if (h->chain) -+ bch2_trans_iter_free(trans, h->chain); -+ h->chain = NULL; -+} -+ -+static void hash_check_set_inode(struct btree_trans *trans, -+ struct hash_check *h, -+ const struct bch_inode_unpacked *bi) -+{ -+ h->info = bch2_hash_info_init(trans->c, bi); -+ hash_stop_chain(trans, h); -+} -+ -+static int hash_redo_key(const struct bch_hash_desc desc, -+ struct btree_trans *trans, struct hash_check *h, -+ struct btree_iter *k_iter, struct bkey_s_c k, -+ u64 hashed) -+{ -+ struct bkey_i delete; -+ struct bkey_i *tmp; -+ -+ tmp = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); -+ if (IS_ERR(tmp)) -+ return PTR_ERR(tmp); -+ -+ bkey_reassemble(tmp, k); -+ -+ bkey_init(&delete.k); -+ delete.k.p = k_iter->pos; -+ bch2_trans_update(trans, k_iter, &delete, 0); -+ -+ return bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode, -+ tmp, 
BCH_HASH_SET_MUST_CREATE); -+} -+ -+static int fsck_hash_delete_at(struct btree_trans *trans, -+ const struct bch_hash_desc desc, -+ struct bch_hash_info *info, -+ struct btree_iter *iter) -+{ -+ int ret; -+retry: -+ ret = bch2_hash_delete_at(trans, desc, info, iter) ?: -+ bch2_trans_commit(trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW); -+ if (ret == -EINTR) { -+ ret = bch2_btree_iter_traverse(iter); -+ if (!ret) -+ goto retry; -+ } -+ -+ return ret; -+} -+ -+static int hash_check_duplicates(struct btree_trans *trans, -+ const struct bch_hash_desc desc, struct hash_check *h, -+ struct btree_iter *k_iter, struct bkey_s_c k) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter *iter; -+ struct bkey_s_c k2; -+ char buf[200]; -+ int ret = 0; -+ -+ if (!bkey_cmp(h->chain->pos, k_iter->pos)) -+ return 0; -+ -+ iter = bch2_trans_copy_iter(trans, h->chain); -+ BUG_ON(IS_ERR(iter)); -+ -+ for_each_btree_key_continue(iter, 0, k2, ret) { -+ if (bkey_cmp(k2.k->p, k.k->p) >= 0) -+ break; -+ -+ if (fsck_err_on(k2.k->type == desc.key_type && -+ !desc.cmp_bkey(k, k2), c, -+ "duplicate hash table keys:\n%s", -+ (bch2_bkey_val_to_text(&PBUF(buf), c, -+ k), buf))) { -+ ret = fsck_hash_delete_at(trans, desc, &h->info, k_iter); -+ if (ret) -+ return ret; -+ ret = 1; -+ break; -+ } -+ } -+fsck_err: -+ bch2_trans_iter_free(trans, iter); -+ return ret; -+} -+ -+static void hash_set_chain_start(struct btree_trans *trans, -+ const struct bch_hash_desc desc, -+ struct hash_check *h, -+ struct btree_iter *k_iter, struct bkey_s_c k) -+{ -+ bool hole = (k.k->type != KEY_TYPE_whiteout && -+ k.k->type != desc.key_type); -+ -+ if (hole || k.k->p.offset > h->chain_end + 1) -+ hash_stop_chain(trans, h); -+ -+ if (!hole) { -+ if (!h->chain) { -+ h->chain = bch2_trans_copy_iter(trans, k_iter); -+ BUG_ON(IS_ERR(h->chain)); -+ } -+ -+ h->chain_end = k.k->p.offset; -+ } -+} -+ -+static bool key_has_correct_hash(struct btree_trans *trans, -+ const struct bch_hash_desc desc, -+ 
struct hash_check *h, -+ struct btree_iter *k_iter, struct bkey_s_c k) -+{ -+ u64 hash; -+ -+ hash_set_chain_start(trans, desc, h, k_iter, k); -+ -+ if (k.k->type != desc.key_type) -+ return true; -+ -+ hash = desc.hash_bkey(&h->info, k); -+ -+ return hash >= h->chain->pos.offset && -+ hash <= k.k->p.offset; -+} -+ -+static int hash_check_key(struct btree_trans *trans, -+ const struct bch_hash_desc desc, struct hash_check *h, -+ struct btree_iter *k_iter, struct bkey_s_c k) -+{ -+ struct bch_fs *c = trans->c; -+ char buf[200]; -+ u64 hashed; -+ int ret = 0; -+ -+ hash_set_chain_start(trans, desc, h, k_iter, k); -+ -+ if (k.k->type != desc.key_type) -+ return 0; -+ -+ hashed = desc.hash_bkey(&h->info, k); -+ -+ if (fsck_err_on(hashed < h->chain->pos.offset || -+ hashed > k.k->p.offset, c, -+ "hash table key at wrong offset: btree %u, %llu, " -+ "hashed to %llu chain starts at %llu\n%s", -+ desc.btree_id, k.k->p.offset, -+ hashed, h->chain->pos.offset, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) { -+ ret = __bch2_trans_do(trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, -+ hash_redo_key(desc, trans, h, k_iter, k, hashed)); -+ if (ret) { -+ bch_err(c, "hash_redo_key err %i", ret); -+ return ret; -+ } -+ return 1; -+ } -+ -+ ret = hash_check_duplicates(trans, desc, h, k_iter, k); -+fsck_err: -+ return ret; -+} -+ -+static int check_dirent_hash(struct btree_trans *trans, struct hash_check *h, -+ struct btree_iter *iter, struct bkey_s_c *k) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_i_dirent *d = NULL; -+ int ret = -EINVAL; -+ char buf[200]; -+ unsigned len; -+ u64 hash; -+ -+ if (key_has_correct_hash(trans, bch2_dirent_hash_desc, h, iter, *k)) -+ return 0; -+ -+ len = bch2_dirent_name_bytes(bkey_s_c_to_dirent(*k)); -+ BUG_ON(!len); -+ -+ memcpy(buf, bkey_s_c_to_dirent(*k).v->d_name, len); -+ buf[len] = '\0'; -+ -+ d = kmalloc(bkey_bytes(k->k), GFP_KERNEL); -+ if (!d) { -+ bch_err(c, "memory allocation failure"); -+ return -ENOMEM; -+ } -+ 
-+ bkey_reassemble(&d->k_i, *k); -+ -+ do { -+ --len; -+ if (!len) -+ goto err_redo; -+ -+ d->k.u64s = BKEY_U64s + dirent_val_u64s(len); -+ -+ BUG_ON(bkey_val_bytes(&d->k) < -+ offsetof(struct bch_dirent, d_name) + len); -+ -+ memset(d->v.d_name + len, 0, -+ bkey_val_bytes(&d->k) - -+ offsetof(struct bch_dirent, d_name) - len); -+ -+ hash = bch2_dirent_hash_desc.hash_bkey(&h->info, -+ bkey_i_to_s_c(&d->k_i)); -+ } while (hash < h->chain->pos.offset || -+ hash > k->k->p.offset); -+ -+ if (fsck_err(c, "dirent with junk at end, was %s (%zu) now %s (%u)", -+ buf, strlen(buf), d->v.d_name, len)) { -+ ret = __bch2_trans_do(trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW, -+ (bch2_trans_update(trans, iter, &d->k_i, 0), 0)); -+ if (ret) -+ goto err; -+ -+ *k = bch2_btree_iter_peek(iter); -+ -+ BUG_ON(k->k->type != KEY_TYPE_dirent); -+ } -+err: -+fsck_err: -+ kfree(d); -+ return ret; -+err_redo: -+ hash = bch2_dirent_hash_desc.hash_bkey(&h->info, *k); -+ -+ if (fsck_err(c, "cannot fix dirent by removing trailing garbage %s (%zu)\n" -+ "hash table key at wrong offset: btree %u, offset %llu, " -+ "hashed to %llu chain starts at %llu\n%s", -+ buf, strlen(buf), BTREE_ID_DIRENTS, -+ k->k->p.offset, hash, h->chain->pos.offset, -+ (bch2_bkey_val_to_text(&PBUF(buf), c, -+ *k), buf))) { -+ ret = __bch2_trans_do(trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, -+ hash_redo_key(bch2_dirent_hash_desc, trans, -+ h, iter, *k, hash)); -+ if (ret) -+ bch_err(c, "hash_redo_key err %i", ret); -+ else -+ ret = 1; -+ } -+ -+ goto err; -+} -+ -+static int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size) -+{ -+ return bch2_btree_delete_range(c, BTREE_ID_EXTENTS, -+ POS(inode_nr, round_up(new_size, block_bytes(c)) >> 9), -+ POS(inode_nr + 1, 0), NULL); -+} -+ -+static int bch2_fix_overlapping_extent(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_s_c k, struct bpos cut_at) -+{ -+ struct btree_iter *u_iter; -+ struct bkey_i 
*u; -+ int ret; -+ -+ u = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); -+ ret = PTR_ERR_OR_ZERO(u); -+ if (ret) -+ return ret; -+ -+ bkey_reassemble(u, k); -+ bch2_cut_front(cut_at, u); -+ -+ u_iter = bch2_trans_copy_iter(trans, iter); -+ ret = PTR_ERR_OR_ZERO(u_iter); -+ if (ret) -+ return ret; -+ -+ /* -+ * We don't want to go through the -+ * extent_handle_overwrites path: -+ */ -+ __bch2_btree_iter_set_pos(u_iter, u->k.p, false); -+ -+ /* -+ * XXX: this is going to leave disk space -+ * accounting slightly wrong -+ */ -+ ret = bch2_trans_update(trans, u_iter, u, 0); -+ bch2_trans_iter_put(trans, u_iter); -+ return ret; -+} -+ -+/* -+ * Walk extents: verify that extents have a corresponding S_ISREG inode, and -+ * that i_size an i_sectors are consistent -+ */ -+noinline_for_stack -+static int check_extents(struct bch_fs *c) -+{ -+ struct inode_walker w = inode_walker_init(); -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct bkey_on_stack prev; -+ u64 i_sectors; -+ int ret = 0; -+ -+ bkey_on_stack_init(&prev); -+ prev.k->k = KEY(0, 0, 0); -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ bch_verbose(c, "checking extents"); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, -+ POS(BCACHEFS_ROOT_INO, 0), -+ BTREE_ITER_INTENT); -+retry: -+ for_each_btree_key_continue(iter, 0, k, ret) { -+ if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { -+ char buf1[200]; -+ char buf2[200]; -+ -+ bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k)); -+ bch2_bkey_val_to_text(&PBUF(buf2), c, k); -+ -+ if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) { -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW, -+ bch2_fix_overlapping_extent(&trans, -+ iter, k, prev.k->k.p)); -+ if (ret) -+ goto err; -+ } -+ } -+ bkey_on_stack_reassemble(&prev, c, k); -+ -+ ret = walk_inode(&trans, &w, k.k->p.inode); -+ if (ret) -+ break; -+ -+ if (fsck_err_on(!w.have_inode, c, -+ "extent type 
%u for missing inode %llu", -+ k.k->type, k.k->p.inode) || -+ fsck_err_on(w.have_inode && -+ !S_ISREG(w.inode.bi_mode) && !S_ISLNK(w.inode.bi_mode), c, -+ "extent type %u for non regular file, inode %llu mode %o", -+ k.k->type, k.k->p.inode, w.inode.bi_mode)) { -+ bch2_trans_unlock(&trans); -+ -+ ret = bch2_inode_truncate(c, k.k->p.inode, 0); -+ if (ret) -+ goto err; -+ continue; -+ } -+ -+ if (fsck_err_on(w.first_this_inode && -+ w.have_inode && -+ !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) && -+ w.inode.bi_sectors != -+ (i_sectors = bch2_count_inode_sectors(&trans, w.cur_inum)), -+ c, "inode %llu has incorrect i_sectors: got %llu, should be %llu", -+ w.inode.bi_inum, -+ w.inode.bi_sectors, i_sectors)) { -+ struct bkey_inode_buf p; -+ -+ w.inode.bi_sectors = i_sectors; -+ -+ bch2_trans_unlock(&trans); -+ -+ bch2_inode_pack(&p, &w.inode); -+ -+ ret = bch2_btree_insert(c, BTREE_ID_INODES, -+ &p.inode.k_i, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW); -+ if (ret) { -+ bch_err(c, "error in fsck: error %i updating inode", ret); -+ goto err; -+ } -+ -+ /* revalidate iterator: */ -+ k = bch2_btree_iter_peek(iter); -+ } -+ -+ if (fsck_err_on(w.have_inode && -+ !(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && -+ k.k->type != KEY_TYPE_reservation && -+ k.k->p.offset > round_up(w.inode.bi_size, block_bytes(c)) >> 9, c, -+ "extent type %u offset %llu past end of inode %llu, i_size %llu", -+ k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) { -+ bch2_trans_unlock(&trans); -+ -+ ret = bch2_inode_truncate(c, k.k->p.inode, -+ w.inode.bi_size); -+ if (ret) -+ goto err; -+ continue; -+ } -+ } -+err: -+fsck_err: -+ if (ret == -EINTR) -+ goto retry; -+ bkey_on_stack_exit(&prev, c); -+ return bch2_trans_exit(&trans) ?: ret; -+} -+ -+/* -+ * Walk dirents: verify that they all have a corresponding S_ISDIR inode, -+ * validate d_type -+ */ -+noinline_for_stack -+static int check_dirents(struct bch_fs *c) -+{ -+ struct inode_walker w = inode_walker_init(); -+ 
struct hash_check h; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ unsigned name_len; -+ char buf[200]; -+ int ret = 0; -+ -+ bch_verbose(c, "checking dirents"); -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ hash_check_init(&h); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, -+ POS(BCACHEFS_ROOT_INO, 0), 0); -+retry: -+ for_each_btree_key_continue(iter, 0, k, ret) { -+ struct bkey_s_c_dirent d; -+ struct bch_inode_unpacked target; -+ bool have_target; -+ u64 d_inum; -+ -+ ret = walk_inode(&trans, &w, k.k->p.inode); -+ if (ret) -+ break; -+ -+ if (fsck_err_on(!w.have_inode, c, -+ "dirent in nonexisting directory:\n%s", -+ (bch2_bkey_val_to_text(&PBUF(buf), c, -+ k), buf)) || -+ fsck_err_on(!S_ISDIR(w.inode.bi_mode), c, -+ "dirent in non directory inode type %u:\n%s", -+ mode_to_type(w.inode.bi_mode), -+ (bch2_bkey_val_to_text(&PBUF(buf), c, -+ k), buf))) { -+ ret = bch2_btree_delete_at(&trans, iter, 0); -+ if (ret) -+ goto err; -+ continue; -+ } -+ -+ if (w.first_this_inode && w.have_inode) -+ hash_check_set_inode(&trans, &h, &w.inode); -+ -+ ret = check_dirent_hash(&trans, &h, iter, &k); -+ if (ret > 0) { -+ ret = 0; -+ continue; -+ } -+ if (ret) -+ goto fsck_err; -+ -+ if (ret) -+ goto fsck_err; -+ -+ if (k.k->type != KEY_TYPE_dirent) -+ continue; -+ -+ d = bkey_s_c_to_dirent(k); -+ d_inum = le64_to_cpu(d.v->d_inum); -+ -+ name_len = bch2_dirent_name_bytes(d); -+ -+ if (fsck_err_on(!name_len, c, "empty dirent") || -+ fsck_err_on(name_len == 1 && -+ !memcmp(d.v->d_name, ".", 1), c, -+ ". dirent") || -+ fsck_err_on(name_len == 2 && -+ !memcmp(d.v->d_name, "..", 2), c, -+ ".. dirent") || -+ fsck_err_on(name_len == 2 && -+ !memcmp(d.v->d_name, "..", 2), c, -+ ".. 
dirent") || -+ fsck_err_on(memchr(d.v->d_name, '/', name_len), c, -+ "dirent name has invalid chars")) { -+ ret = remove_dirent(&trans, d); -+ if (ret) -+ goto err; -+ continue; -+ } -+ -+ if (fsck_err_on(d_inum == d.k->p.inode, c, -+ "dirent points to own directory:\n%s", -+ (bch2_bkey_val_to_text(&PBUF(buf), c, -+ k), buf))) { -+ ret = remove_dirent(&trans, d); -+ if (ret) -+ goto err; -+ continue; -+ } -+ -+ ret = bch2_inode_find_by_inum_trans(&trans, d_inum, &target); -+ if (ret && ret != -ENOENT) -+ break; -+ -+ have_target = !ret; -+ ret = 0; -+ -+ if (fsck_err_on(!have_target, c, -+ "dirent points to missing inode:\n%s", -+ (bch2_bkey_val_to_text(&PBUF(buf), c, -+ k), buf))) { -+ ret = remove_dirent(&trans, d); -+ if (ret) -+ goto err; -+ continue; -+ } -+ -+ if (fsck_err_on(have_target && -+ d.v->d_type != -+ mode_to_type(target.bi_mode), c, -+ "incorrect d_type: should be %u:\n%s", -+ mode_to_type(target.bi_mode), -+ (bch2_bkey_val_to_text(&PBUF(buf), c, -+ k), buf))) { -+ struct bkey_i_dirent *n; -+ -+ n = kmalloc(bkey_bytes(d.k), GFP_KERNEL); -+ if (!n) { -+ ret = -ENOMEM; -+ goto err; -+ } -+ -+ bkey_reassemble(&n->k_i, d.s_c); -+ n->v.d_type = mode_to_type(target.bi_mode); -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW, -+ (bch2_trans_update(&trans, iter, &n->k_i, 0), 0)); -+ kfree(n); -+ if (ret) -+ goto err; -+ -+ } -+ } -+ -+ hash_stop_chain(&trans, &h); -+err: -+fsck_err: -+ if (ret == -EINTR) -+ goto retry; -+ -+ return bch2_trans_exit(&trans) ?: ret; -+} -+ -+/* -+ * Walk xattrs: verify that they all have a corresponding inode -+ */ -+noinline_for_stack -+static int check_xattrs(struct bch_fs *c) -+{ -+ struct inode_walker w = inode_walker_init(); -+ struct hash_check h; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ bch_verbose(c, "checking xattrs"); -+ -+ hash_check_init(&h); -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ iter = 
bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, -+ POS(BCACHEFS_ROOT_INO, 0), 0); -+retry: -+ for_each_btree_key_continue(iter, 0, k, ret) { -+ ret = walk_inode(&trans, &w, k.k->p.inode); -+ if (ret) -+ break; -+ -+ if (fsck_err_on(!w.have_inode, c, -+ "xattr for missing inode %llu", -+ k.k->p.inode)) { -+ ret = bch2_btree_delete_at(&trans, iter, 0); -+ if (ret) -+ goto err; -+ continue; -+ } -+ -+ if (w.first_this_inode && w.have_inode) -+ hash_check_set_inode(&trans, &h, &w.inode); -+ -+ ret = hash_check_key(&trans, bch2_xattr_hash_desc, -+ &h, iter, k); -+ if (ret) -+ goto fsck_err; -+ } -+err: -+fsck_err: -+ if (ret == -EINTR) -+ goto retry; -+ return bch2_trans_exit(&trans) ?: ret; -+} -+ -+/* Get root directory, create if it doesn't exist: */ -+static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode) -+{ -+ struct bkey_inode_buf packed; -+ int ret; -+ -+ bch_verbose(c, "checking root directory"); -+ -+ ret = bch2_inode_find_by_inum(c, BCACHEFS_ROOT_INO, root_inode); -+ if (ret && ret != -ENOENT) -+ return ret; -+ -+ if (fsck_err_on(ret, c, "root directory missing")) -+ goto create_root; -+ -+ if (fsck_err_on(!S_ISDIR(root_inode->bi_mode), c, -+ "root inode not a directory")) -+ goto create_root; -+ -+ return 0; -+fsck_err: -+ return ret; -+create_root: -+ bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|0755, -+ 0, NULL); -+ root_inode->bi_inum = BCACHEFS_ROOT_INO; -+ -+ bch2_inode_pack(&packed, root_inode); -+ -+ return bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, -+ NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW); -+} -+ -+/* Get lost+found, create if it doesn't exist: */ -+static int check_lostfound(struct bch_fs *c, -+ struct bch_inode_unpacked *root_inode, -+ struct bch_inode_unpacked *lostfound_inode) -+{ -+ struct qstr lostfound = QSTR("lost+found"); -+ struct bch_hash_info root_hash_info = -+ bch2_hash_info_init(c, root_inode); -+ u64 inum; -+ int ret; -+ -+ bch_verbose(c, "checking lost+found"); -+ -+ inum = 
bch2_dirent_lookup(c, BCACHEFS_ROOT_INO, &root_hash_info, -+ &lostfound); -+ if (!inum) { -+ bch_notice(c, "creating lost+found"); -+ goto create_lostfound; -+ } -+ -+ ret = bch2_inode_find_by_inum(c, inum, lostfound_inode); -+ if (ret && ret != -ENOENT) -+ return ret; -+ -+ if (fsck_err_on(ret, c, "lost+found missing")) -+ goto create_lostfound; -+ -+ if (fsck_err_on(!S_ISDIR(lostfound_inode->bi_mode), c, -+ "lost+found inode not a directory")) -+ goto create_lostfound; -+ -+ return 0; -+fsck_err: -+ return ret; -+create_lostfound: -+ bch2_inode_init_early(c, lostfound_inode); -+ -+ ret = bch2_trans_do(c, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW, -+ bch2_create_trans(&trans, -+ BCACHEFS_ROOT_INO, root_inode, -+ lostfound_inode, &lostfound, -+ 0, 0, S_IFDIR|0700, 0, NULL, NULL)); -+ if (ret) -+ bch_err(c, "error creating lost+found: %i", ret); -+ -+ return ret; -+} -+ -+struct inode_bitmap { -+ unsigned long *bits; -+ size_t size; -+}; -+ -+static inline bool inode_bitmap_test(struct inode_bitmap *b, size_t nr) -+{ -+ return nr < b->size ? 
test_bit(nr, b->bits) : false; -+} -+ -+static inline int inode_bitmap_set(struct inode_bitmap *b, size_t nr) -+{ -+ if (nr >= b->size) { -+ size_t new_size = max_t(size_t, max_t(size_t, -+ PAGE_SIZE * 8, -+ b->size * 2), -+ nr + 1); -+ void *n; -+ -+ new_size = roundup_pow_of_two(new_size); -+ n = krealloc(b->bits, new_size / 8, GFP_KERNEL|__GFP_ZERO); -+ if (!n) { -+ return -ENOMEM; -+ } -+ -+ b->bits = n; -+ b->size = new_size; -+ } -+ -+ __set_bit(nr, b->bits); -+ return 0; -+} -+ -+struct pathbuf { -+ size_t nr; -+ size_t size; -+ -+ struct pathbuf_entry { -+ u64 inum; -+ u64 offset; -+ } *entries; -+}; -+ -+static int path_down(struct pathbuf *p, u64 inum) -+{ -+ if (p->nr == p->size) { -+ size_t new_size = max_t(size_t, 256UL, p->size * 2); -+ void *n = krealloc(p->entries, -+ new_size * sizeof(p->entries[0]), -+ GFP_KERNEL); -+ if (!n) -+ return -ENOMEM; -+ -+ p->entries = n; -+ p->size = new_size; -+ }; -+ -+ p->entries[p->nr++] = (struct pathbuf_entry) { -+ .inum = inum, -+ .offset = 0, -+ }; -+ return 0; -+} -+ -+noinline_for_stack -+static int check_directory_structure(struct bch_fs *c, -+ struct bch_inode_unpacked *lostfound_inode) -+{ -+ struct inode_bitmap dirs_done = { NULL, 0 }; -+ struct pathbuf path = { 0, 0, NULL }; -+ struct pathbuf_entry *e; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct bkey_s_c_dirent dirent; -+ bool had_unreachable; -+ u64 d_inum; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ bch_verbose(c, "checking directory structure"); -+ -+ /* DFS: */ -+restart_dfs: -+ had_unreachable = false; -+ -+ ret = inode_bitmap_set(&dirs_done, BCACHEFS_ROOT_INO); -+ if (ret) { -+ bch_err(c, "memory allocation failure in inode_bitmap_set()"); -+ goto err; -+ } -+ -+ ret = path_down(&path, BCACHEFS_ROOT_INO); -+ if (ret) -+ goto err; -+ -+ while (path.nr) { -+next: -+ e = &path.entries[path.nr - 1]; -+ -+ if (e->offset == U64_MAX) -+ goto up; -+ -+ for_each_btree_key(&trans, 
iter, BTREE_ID_DIRENTS, -+ POS(e->inum, e->offset + 1), 0, k, ret) { -+ if (k.k->p.inode != e->inum) -+ break; -+ -+ e->offset = k.k->p.offset; -+ -+ if (k.k->type != KEY_TYPE_dirent) -+ continue; -+ -+ dirent = bkey_s_c_to_dirent(k); -+ -+ if (dirent.v->d_type != DT_DIR) -+ continue; -+ -+ d_inum = le64_to_cpu(dirent.v->d_inum); -+ -+ if (fsck_err_on(inode_bitmap_test(&dirs_done, d_inum), c, -+ "directory %llu has multiple hardlinks", -+ d_inum)) { -+ ret = remove_dirent(&trans, dirent); -+ if (ret) -+ goto err; -+ continue; -+ } -+ -+ ret = inode_bitmap_set(&dirs_done, d_inum); -+ if (ret) { -+ bch_err(c, "memory allocation failure in inode_bitmap_set()"); -+ goto err; -+ } -+ -+ ret = path_down(&path, d_inum); -+ if (ret) { -+ goto err; -+ } -+ -+ ret = bch2_trans_iter_free(&trans, iter); -+ if (ret) { -+ bch_err(c, "btree error %i in fsck", ret); -+ goto err; -+ } -+ goto next; -+ } -+ ret = bch2_trans_iter_free(&trans, iter) ?: ret; -+ if (ret) { -+ bch_err(c, "btree error %i in fsck", ret); -+ goto err; -+ } -+up: -+ path.nr--; -+ } -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS_MIN, 0); -+retry: -+ for_each_btree_key_continue(iter, 0, k, ret) { -+ if (k.k->type != KEY_TYPE_inode) -+ continue; -+ -+ if (!S_ISDIR(le16_to_cpu(bkey_s_c_to_inode(k).v->bi_mode))) -+ continue; -+ -+ ret = bch2_empty_dir_trans(&trans, k.k->p.inode); -+ if (ret == -EINTR) -+ goto retry; -+ if (!ret) -+ continue; -+ -+ if (fsck_err_on(!inode_bitmap_test(&dirs_done, k.k->p.offset), c, -+ "unreachable directory found (inum %llu)", -+ k.k->p.offset)) { -+ bch2_trans_unlock(&trans); -+ -+ ret = reattach_inode(c, lostfound_inode, k.k->p.offset); -+ if (ret) { -+ goto err; -+ } -+ -+ had_unreachable = true; -+ } -+ } -+ bch2_trans_iter_free(&trans, iter); -+ if (ret) -+ goto err; -+ -+ if (had_unreachable) { -+ bch_info(c, "reattached unreachable directories, restarting pass to check for loops"); -+ kfree(dirs_done.bits); -+ kfree(path.entries); -+ memset(&dirs_done, 0, 
sizeof(dirs_done)); -+ memset(&path, 0, sizeof(path)); -+ goto restart_dfs; -+ } -+err: -+fsck_err: -+ ret = bch2_trans_exit(&trans) ?: ret; -+ kfree(dirs_done.bits); -+ kfree(path.entries); -+ return ret; -+} -+ -+struct nlink { -+ u32 count; -+ u32 dir_count; -+}; -+ -+typedef GENRADIX(struct nlink) nlink_table; -+ -+static void inc_link(struct bch_fs *c, nlink_table *links, -+ u64 range_start, u64 *range_end, -+ u64 inum, bool dir) -+{ -+ struct nlink *link; -+ -+ if (inum < range_start || inum >= *range_end) -+ return; -+ -+ link = genradix_ptr_alloc(links, inum - range_start, GFP_KERNEL); -+ if (!link) { -+ bch_verbose(c, "allocation failed during fsck - will need another pass"); -+ *range_end = inum; -+ return; -+ } -+ -+ if (dir) -+ link->dir_count++; -+ else -+ link->count++; -+} -+ -+noinline_for_stack -+static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, -+ u64 range_start, u64 *range_end) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct bkey_s_c_dirent d; -+ u64 d_inum; -+ int ret; -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ inc_link(c, links, range_start, range_end, BCACHEFS_ROOT_INO, false); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, 0, k, ret) { -+ switch (k.k->type) { -+ case KEY_TYPE_dirent: -+ d = bkey_s_c_to_dirent(k); -+ d_inum = le64_to_cpu(d.v->d_inum); -+ -+ if (d.v->d_type == DT_DIR) -+ inc_link(c, links, range_start, range_end, -+ d.k->p.inode, true); -+ -+ inc_link(c, links, range_start, range_end, -+ d_inum, false); -+ -+ break; -+ } -+ -+ bch2_trans_cond_resched(&trans); -+ } -+ ret = bch2_trans_exit(&trans) ?: ret; -+ if (ret) -+ bch_err(c, "error in fsck: btree error %i while walking dirents", ret); -+ -+ return ret; -+} -+ -+static int check_inode_nlink(struct bch_fs *c, -+ struct bch_inode_unpacked *lostfound_inode, -+ struct bch_inode_unpacked *u, -+ struct nlink *link, -+ bool *do_update) -+{ -+ u32 i_nlink = bch2_inode_nlink_get(u); 
-+ u32 real_i_nlink = -+ link->count * nlink_bias(u->bi_mode) + -+ link->dir_count; -+ int ret = 0; -+ -+ /* -+ * These should have been caught/fixed by earlier passes, we don't -+ * repair them here: -+ */ -+ if (S_ISDIR(u->bi_mode) && link->count > 1) { -+ need_fsck_err(c, "directory %llu with multiple hardlinks: %u", -+ u->bi_inum, link->count); -+ return 0; -+ } -+ -+ if (S_ISDIR(u->bi_mode) && !link->count) { -+ need_fsck_err(c, "unreachable directory found (inum %llu)", -+ u->bi_inum); -+ return 0; -+ } -+ -+ if (!S_ISDIR(u->bi_mode) && link->dir_count) { -+ need_fsck_err(c, "non directory with subdirectories (inum %llu)", -+ u->bi_inum); -+ return 0; -+ } -+ -+ if (!link->count && -+ !(u->bi_flags & BCH_INODE_UNLINKED) && -+ (c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { -+ if (fsck_err(c, "unreachable inode %llu not marked as unlinked (type %u)", -+ u->bi_inum, mode_to_type(u->bi_mode)) == -+ FSCK_ERR_IGNORE) -+ return 0; -+ -+ ret = reattach_inode(c, lostfound_inode, u->bi_inum); -+ if (ret) -+ return ret; -+ -+ link->count = 1; -+ real_i_nlink = nlink_bias(u->bi_mode) + link->dir_count; -+ goto set_i_nlink; -+ } -+ -+ if (i_nlink < link->count) { -+ if (fsck_err(c, "inode %llu i_link too small (%u < %u, type %i)", -+ u->bi_inum, i_nlink, link->count, -+ mode_to_type(u->bi_mode)) == FSCK_ERR_IGNORE) -+ return 0; -+ goto set_i_nlink; -+ } -+ -+ if (i_nlink != real_i_nlink && -+ c->sb.clean) { -+ if (fsck_err(c, "filesystem marked clean, " -+ "but inode %llu has wrong i_nlink " -+ "(type %u i_nlink %u, should be %u)", -+ u->bi_inum, mode_to_type(u->bi_mode), -+ i_nlink, real_i_nlink) == FSCK_ERR_IGNORE) -+ return 0; -+ goto set_i_nlink; -+ } -+ -+ if (i_nlink != real_i_nlink && -+ (c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { -+ if (fsck_err(c, "inode %llu has wrong i_nlink " -+ "(type %u i_nlink %u, should be %u)", -+ u->bi_inum, mode_to_type(u->bi_mode), -+ i_nlink, real_i_nlink) == FSCK_ERR_IGNORE) -+ return 0; -+ goto set_i_nlink; -+ } -+ 
-+ if (real_i_nlink && i_nlink != real_i_nlink) -+ bch_verbose(c, "setting inode %llu nlink from %u to %u", -+ u->bi_inum, i_nlink, real_i_nlink); -+set_i_nlink: -+ if (i_nlink != real_i_nlink) { -+ bch2_inode_nlink_set(u, real_i_nlink); -+ *do_update = true; -+ } -+fsck_err: -+ return ret; -+} -+ -+static int check_inode(struct btree_trans *trans, -+ struct bch_inode_unpacked *lostfound_inode, -+ struct btree_iter *iter, -+ struct bkey_s_c_inode inode, -+ struct nlink *link) -+{ -+ struct bch_fs *c = trans->c; -+ struct bch_inode_unpacked u; -+ bool do_update = false; -+ int ret = 0; -+ -+ ret = bch2_inode_unpack(inode, &u); -+ -+ bch2_trans_unlock(trans); -+ -+ if (bch2_fs_inconsistent_on(ret, c, -+ "error unpacking inode %llu in fsck", -+ inode.k->p.inode)) -+ return ret; -+ -+ if (link) { -+ ret = check_inode_nlink(c, lostfound_inode, &u, link, -+ &do_update); -+ if (ret) -+ return ret; -+ } -+ -+ if (u.bi_flags & BCH_INODE_UNLINKED && -+ (!c->sb.clean || -+ fsck_err(c, "filesystem marked clean, but inode %llu unlinked", -+ u.bi_inum))) { -+ bch_verbose(c, "deleting inode %llu", u.bi_inum); -+ -+ bch2_fs_lazy_rw(c); -+ -+ ret = bch2_inode_rm(c, u.bi_inum); -+ if (ret) -+ bch_err(c, "error in fsck: error %i while deleting inode", ret); -+ return ret; -+ } -+ -+ if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY && -+ (!c->sb.clean || -+ fsck_err(c, "filesystem marked clean, but inode %llu has i_size dirty", -+ u.bi_inum))) { -+ bch_verbose(c, "truncating inode %llu", u.bi_inum); -+ -+ bch2_fs_lazy_rw(c); -+ -+ /* -+ * XXX: need to truncate partial blocks too here - or ideally -+ * just switch units to bytes and that issue goes away -+ */ -+ -+ ret = bch2_inode_truncate(c, u.bi_inum, u.bi_size); -+ if (ret) { -+ bch_err(c, "error in fsck: error %i truncating inode", ret); -+ return ret; -+ } -+ -+ /* -+ * We truncated without our normal sector accounting hook, just -+ * make sure we recalculate it: -+ */ -+ u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY; -+ -+ u.bi_flags &= 
~BCH_INODE_I_SIZE_DIRTY; -+ do_update = true; -+ } -+ -+ if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY && -+ (!c->sb.clean || -+ fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty", -+ u.bi_inum))) { -+ s64 sectors; -+ -+ bch_verbose(c, "recounting sectors for inode %llu", -+ u.bi_inum); -+ -+ sectors = bch2_count_inode_sectors(trans, u.bi_inum); -+ if (sectors < 0) { -+ bch_err(c, "error in fsck: error %i recounting inode sectors", -+ (int) sectors); -+ return sectors; -+ } -+ -+ u.bi_sectors = sectors; -+ u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY; -+ do_update = true; -+ } -+ -+ if (do_update) { -+ struct bkey_inode_buf p; -+ -+ bch2_inode_pack(&p, &u); -+ -+ ret = __bch2_trans_do(trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW, -+ (bch2_trans_update(trans, iter, &p.inode.k_i, 0), 0)); -+ if (ret) -+ bch_err(c, "error in fsck: error %i " -+ "updating inode", ret); -+ } -+fsck_err: -+ return ret; -+} -+ -+noinline_for_stack -+static int bch2_gc_walk_inodes(struct bch_fs *c, -+ struct bch_inode_unpacked *lostfound_inode, -+ nlink_table *links, -+ u64 range_start, u64 range_end) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct nlink *link, zero_links = { 0, 0 }; -+ struct genradix_iter nlinks_iter; -+ int ret = 0, ret2 = 0; -+ u64 nlinks_pos; -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, -+ POS(0, range_start), 0); -+ nlinks_iter = genradix_iter_init(links, 0); -+ -+ while ((k = bch2_btree_iter_peek(iter)).k && -+ !(ret2 = bkey_err(k))) { -+peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); -+ -+ if (!link && (!k.k || iter->pos.offset >= range_end)) -+ break; -+ -+ nlinks_pos = range_start + nlinks_iter.pos; -+ if (iter->pos.offset > nlinks_pos) { -+ /* Should have been caught by dirents pass: */ -+ need_fsck_err_on(link && link->count, c, -+ "missing inode %llu (nlink %u)", -+ nlinks_pos, link->count); -+ 
genradix_iter_advance(&nlinks_iter, links); -+ goto peek_nlinks; -+ } -+ -+ if (iter->pos.offset < nlinks_pos || !link) -+ link = &zero_links; -+ -+ if (k.k && k.k->type == KEY_TYPE_inode) { -+ ret = check_inode(&trans, lostfound_inode, iter, -+ bkey_s_c_to_inode(k), link); -+ BUG_ON(ret == -EINTR); -+ if (ret) -+ break; -+ } else { -+ /* Should have been caught by dirents pass: */ -+ need_fsck_err_on(link->count, c, -+ "missing inode %llu (nlink %u)", -+ nlinks_pos, link->count); -+ } -+ -+ if (nlinks_pos == iter->pos.offset) -+ genradix_iter_advance(&nlinks_iter, links); -+ -+ bch2_btree_iter_next(iter); -+ bch2_trans_cond_resched(&trans); -+ } -+fsck_err: -+ bch2_trans_exit(&trans); -+ -+ if (ret2) -+ bch_err(c, "error in fsck: btree error %i while walking inodes", ret2); -+ -+ return ret ?: ret2; -+} -+ -+noinline_for_stack -+static int check_inode_nlinks(struct bch_fs *c, -+ struct bch_inode_unpacked *lostfound_inode) -+{ -+ nlink_table links; -+ u64 this_iter_range_start, next_iter_range_start = 0; -+ int ret = 0; -+ -+ bch_verbose(c, "checking inode nlinks"); -+ -+ genradix_init(&links); -+ -+ do { -+ this_iter_range_start = next_iter_range_start; -+ next_iter_range_start = U64_MAX; -+ -+ ret = bch2_gc_walk_dirents(c, &links, -+ this_iter_range_start, -+ &next_iter_range_start); -+ if (ret) -+ break; -+ -+ ret = bch2_gc_walk_inodes(c, lostfound_inode, &links, -+ this_iter_range_start, -+ next_iter_range_start); -+ if (ret) -+ break; -+ -+ genradix_free(&links); -+ } while (next_iter_range_start != U64_MAX); -+ -+ genradix_free(&links); -+ -+ return ret; -+} -+ -+/* -+ * Checks for inconsistencies that shouldn't happen, unless we have a bug. 
-+ * Doesn't fix them yet, mainly because they haven't yet been observed: -+ */ -+int bch2_fsck_full(struct bch_fs *c) -+{ -+ struct bch_inode_unpacked root_inode, lostfound_inode; -+ -+ return check_extents(c) ?: -+ check_dirents(c) ?: -+ check_xattrs(c) ?: -+ check_root(c, &root_inode) ?: -+ check_lostfound(c, &root_inode, &lostfound_inode) ?: -+ check_directory_structure(c, &lostfound_inode) ?: -+ check_inode_nlinks(c, &lostfound_inode); -+} -+ -+int bch2_fsck_inode_nlink(struct bch_fs *c) -+{ -+ struct bch_inode_unpacked root_inode, lostfound_inode; -+ -+ return check_root(c, &root_inode) ?: -+ check_lostfound(c, &root_inode, &lostfound_inode) ?: -+ check_inode_nlinks(c, &lostfound_inode); -+} -+ -+int bch2_fsck_walk_inodes_only(struct bch_fs *c) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct bkey_s_c_inode inode; -+ int ret; -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN, 0, k, ret) { -+ if (k.k->type != KEY_TYPE_inode) -+ continue; -+ -+ inode = bkey_s_c_to_inode(k); -+ -+ if (inode.v->bi_flags & -+ (BCH_INODE_I_SIZE_DIRTY| -+ BCH_INODE_I_SECTORS_DIRTY| -+ BCH_INODE_UNLINKED)) { -+ ret = check_inode(&trans, NULL, iter, inode, NULL); -+ BUG_ON(ret == -EINTR); -+ if (ret) -+ break; -+ } -+ } -+ BUG_ON(ret == -EINTR); -+ -+ return bch2_trans_exit(&trans) ?: ret; -+} -diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h -new file mode 100644 -index 000000000000..9e4af02bde1e ---- /dev/null -+++ b/fs/bcachefs/fsck.h -@@ -0,0 +1,9 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_FSCK_H -+#define _BCACHEFS_FSCK_H -+ -+int bch2_fsck_full(struct bch_fs *); -+int bch2_fsck_inode_nlink(struct bch_fs *); -+int bch2_fsck_walk_inodes_only(struct bch_fs *); -+ -+#endif /* _BCACHEFS_FSCK_H */ -diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c -new file mode 100644 -index 000000000000..7d20f082ad45 ---- /dev/null -+++ b/fs/bcachefs/inode.c 
-@@ -0,0 +1,554 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "bkey_methods.h" -+#include "btree_update.h" -+#include "error.h" -+#include "extents.h" -+#include "inode.h" -+#include "str_hash.h" -+ -+#include -+ -+#include -+ -+const char * const bch2_inode_opts[] = { -+#define x(name, ...) #name, -+ BCH_INODE_OPTS() -+#undef x -+ NULL, -+}; -+ -+static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; -+static const u8 bits_table[8] = { -+ 1 * 8 - 1, -+ 2 * 8 - 2, -+ 3 * 8 - 3, -+ 4 * 8 - 4, -+ 6 * 8 - 5, -+ 8 * 8 - 6, -+ 10 * 8 - 7, -+ 13 * 8 - 8, -+}; -+ -+static int inode_encode_field(u8 *out, u8 *end, u64 hi, u64 lo) -+{ -+ __be64 in[2] = { cpu_to_be64(hi), cpu_to_be64(lo), }; -+ unsigned shift, bytes, bits = likely(!hi) -+ ? fls64(lo) -+ : fls64(hi) + 64; -+ -+ for (shift = 1; shift <= 8; shift++) -+ if (bits < bits_table[shift - 1]) -+ goto got_shift; -+ -+ BUG(); -+got_shift: -+ bytes = byte_table[shift - 1]; -+ -+ BUG_ON(out + bytes > end); -+ -+ memcpy(out, (u8 *) in + 16 - bytes, bytes); -+ *out |= (1 << 8) >> shift; -+ -+ return bytes; -+} -+ -+static int inode_decode_field(const u8 *in, const u8 *end, -+ u64 out[2], unsigned *out_bits) -+{ -+ __be64 be[2] = { 0, 0 }; -+ unsigned bytes, shift; -+ u8 *p; -+ -+ if (in >= end) -+ return -1; -+ -+ if (!*in) -+ return -1; -+ -+ /* -+ * position of highest set bit indicates number of bytes: -+ * shift = number of bits to remove in high byte: -+ */ -+ shift = 8 - __fls(*in); /* 1 <= shift <= 8 */ -+ bytes = byte_table[shift - 1]; -+ -+ if (in + bytes > end) -+ return -1; -+ -+ p = (u8 *) be + 16 - bytes; -+ memcpy(p, in, bytes); -+ *p ^= (1 << 8) >> shift; -+ -+ out[0] = be64_to_cpu(be[0]); -+ out[1] = be64_to_cpu(be[1]); -+ *out_bits = out[0] ? 
64 + fls64(out[0]) : fls64(out[1]); -+ -+ return bytes; -+} -+ -+void bch2_inode_pack(struct bkey_inode_buf *packed, -+ const struct bch_inode_unpacked *inode) -+{ -+ u8 *out = packed->inode.v.fields; -+ u8 *end = (void *) &packed[1]; -+ u8 *last_nonzero_field = out; -+ unsigned nr_fields = 0, last_nonzero_fieldnr = 0; -+ unsigned bytes; -+ -+ bkey_inode_init(&packed->inode.k_i); -+ packed->inode.k.p.offset = inode->bi_inum; -+ packed->inode.v.bi_hash_seed = inode->bi_hash_seed; -+ packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags); -+ packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode); -+ -+#define x(_name, _bits) \ -+ out += inode_encode_field(out, end, 0, inode->_name); \ -+ nr_fields++; \ -+ \ -+ if (inode->_name) { \ -+ last_nonzero_field = out; \ -+ last_nonzero_fieldnr = nr_fields; \ -+ } -+ -+ BCH_INODE_FIELDS() -+#undef x -+ -+ out = last_nonzero_field; -+ nr_fields = last_nonzero_fieldnr; -+ -+ bytes = out - (u8 *) &packed->inode.v; -+ set_bkey_val_bytes(&packed->inode.k, bytes); -+ memset_u64s_tail(&packed->inode.v, 0, bytes); -+ -+ SET_INODE_NR_FIELDS(&packed->inode.v, nr_fields); -+ -+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { -+ struct bch_inode_unpacked unpacked; -+ -+ int ret = bch2_inode_unpack(inode_i_to_s_c(&packed->inode), -+ &unpacked); -+ BUG_ON(ret); -+ BUG_ON(unpacked.bi_inum != inode->bi_inum); -+ BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed); -+ BUG_ON(unpacked.bi_mode != inode->bi_mode); -+ -+#define x(_name, _bits) BUG_ON(unpacked._name != inode->_name); -+ BCH_INODE_FIELDS() -+#undef x -+ } -+} -+ -+int bch2_inode_unpack(struct bkey_s_c_inode inode, -+ struct bch_inode_unpacked *unpacked) -+{ -+ const u8 *in = inode.v->fields; -+ const u8 *end = (void *) inode.v + bkey_val_bytes(inode.k); -+ u64 field[2]; -+ unsigned fieldnr = 0, field_bits; -+ int ret; -+ -+ unpacked->bi_inum = inode.k->p.offset; -+ unpacked->bi_hash_seed = inode.v->bi_hash_seed; -+ unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); -+ unpacked->bi_mode 
= le16_to_cpu(inode.v->bi_mode); -+ -+#define x(_name, _bits) \ -+ if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \ -+ memset(&unpacked->_name, 0, \ -+ sizeof(*unpacked) - \ -+ offsetof(struct bch_inode_unpacked, _name)); \ -+ return 0; \ -+ } \ -+ \ -+ ret = inode_decode_field(in, end, field, &field_bits); \ -+ if (ret < 0) \ -+ return ret; \ -+ \ -+ if (field_bits > sizeof(unpacked->_name) * 8) \ -+ return -1; \ -+ \ -+ unpacked->_name = field[1]; \ -+ in += ret; -+ -+ BCH_INODE_FIELDS() -+#undef x -+ -+ /* XXX: signal if there were more fields than expected? */ -+ -+ return 0; -+} -+ -+struct btree_iter *bch2_inode_peek(struct btree_trans *trans, -+ struct bch_inode_unpacked *inode, -+ u64 inum, unsigned flags) -+{ -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(0, inum), -+ BTREE_ITER_SLOTS|flags); -+ if (IS_ERR(iter)) -+ return iter; -+ -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ ret = k.k->type == KEY_TYPE_inode ? 
0 : -EIO; -+ if (ret) -+ goto err; -+ -+ ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode); -+ if (ret) -+ goto err; -+ -+ return iter; -+err: -+ bch2_trans_iter_put(trans, iter); -+ return ERR_PTR(ret); -+} -+ -+int bch2_inode_write(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bch_inode_unpacked *inode) -+{ -+ struct bkey_inode_buf *inode_p; -+ -+ inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); -+ if (IS_ERR(inode_p)) -+ return PTR_ERR(inode_p); -+ -+ bch2_inode_pack(inode_p, inode); -+ bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); -+ return 0; -+} -+ -+const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); -+ struct bch_inode_unpacked unpacked; -+ -+ if (k.k->p.inode) -+ return "nonzero k.p.inode"; -+ -+ if (bkey_val_bytes(k.k) < sizeof(struct bch_inode)) -+ return "incorrect value size"; -+ -+ if (k.k->p.offset < BLOCKDEV_INODE_MAX) -+ return "fs inode in blockdev range"; -+ -+ if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) -+ return "invalid str hash type"; -+ -+ if (bch2_inode_unpack(inode, &unpacked)) -+ return "invalid variable length fields"; -+ -+ if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) -+ return "invalid data checksum type"; -+ -+ if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) -+ return "invalid data checksum type"; -+ -+ if ((unpacked.bi_flags & BCH_INODE_UNLINKED) && -+ unpacked.bi_nlink != 0) -+ return "flagged as unlinked but bi_nlink != 0"; -+ -+ return NULL; -+} -+ -+void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); -+ struct bch_inode_unpacked unpacked; -+ -+ if (bch2_inode_unpack(inode, &unpacked)) { -+ pr_buf(out, "(unpack error)"); -+ return; -+ } -+ -+#define x(_name, _bits) \ -+ pr_buf(out, #_name ": %llu ", (u64) unpacked._name); -+ BCH_INODE_FIELDS() -+#undef x -+} -+ -+const char 
*bch2_inode_generation_invalid(const struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ if (k.k->p.inode) -+ return "nonzero k.p.inode"; -+ -+ if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation)) -+ return "incorrect value size"; -+ -+ return NULL; -+} -+ -+void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_inode_generation gen = bkey_s_c_to_inode_generation(k); -+ -+ pr_buf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation)); -+} -+ -+void bch2_inode_init_early(struct bch_fs *c, -+ struct bch_inode_unpacked *inode_u) -+{ -+ enum bch_str_hash_type str_hash = -+ bch2_str_hash_opt_to_type(c, c->opts.str_hash); -+ -+ memset(inode_u, 0, sizeof(*inode_u)); -+ -+ /* ick */ -+ inode_u->bi_flags |= str_hash << INODE_STR_HASH_OFFSET; -+ get_random_bytes(&inode_u->bi_hash_seed, -+ sizeof(inode_u->bi_hash_seed)); -+} -+ -+void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now, -+ uid_t uid, gid_t gid, umode_t mode, dev_t rdev, -+ struct bch_inode_unpacked *parent) -+{ -+ inode_u->bi_mode = mode; -+ inode_u->bi_uid = uid; -+ inode_u->bi_gid = gid; -+ inode_u->bi_dev = rdev; -+ inode_u->bi_atime = now; -+ inode_u->bi_mtime = now; -+ inode_u->bi_ctime = now; -+ inode_u->bi_otime = now; -+ -+ if (parent && parent->bi_mode & S_ISGID) { -+ inode_u->bi_gid = parent->bi_gid; -+ if (S_ISDIR(mode)) -+ inode_u->bi_mode |= S_ISGID; -+ } -+ -+ if (parent) { -+#define x(_name, ...) 
inode_u->bi_##_name = parent->bi_##_name; -+ BCH_INODE_OPTS() -+#undef x -+ } -+} -+ -+void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, -+ uid_t uid, gid_t gid, umode_t mode, dev_t rdev, -+ struct bch_inode_unpacked *parent) -+{ -+ bch2_inode_init_early(c, inode_u); -+ bch2_inode_init_late(inode_u, bch2_current_time(c), -+ uid, gid, mode, rdev, parent); -+} -+ -+static inline u32 bkey_generation(struct bkey_s_c k) -+{ -+ switch (k.k->type) { -+ case KEY_TYPE_inode: -+ BUG(); -+ case KEY_TYPE_inode_generation: -+ return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation); -+ default: -+ return 0; -+ } -+} -+ -+int bch2_inode_create(struct btree_trans *trans, -+ struct bch_inode_unpacked *inode_u, -+ u64 min, u64 max, u64 *hint) -+{ -+ struct bkey_inode_buf *inode_p; -+ struct btree_iter *iter = NULL; -+ struct bkey_s_c k; -+ u64 start; -+ int ret; -+ -+ if (!max) -+ max = ULLONG_MAX; -+ -+ if (trans->c->opts.inodes_32bit) -+ max = min_t(u64, max, U32_MAX); -+ -+ start = READ_ONCE(*hint); -+ -+ if (start >= max || start < min) -+ start = min; -+ -+ inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); -+ if (IS_ERR(inode_p)) -+ return PTR_ERR(inode_p); -+again: -+ for_each_btree_key(trans, iter, BTREE_ID_INODES, POS(0, start), -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { -+ if (bkey_cmp(iter->pos, POS(0, max)) > 0) -+ break; -+ -+ if (k.k->type != KEY_TYPE_inode) -+ goto found_slot; -+ } -+ -+ bch2_trans_iter_put(trans, iter); -+ -+ if (ret) -+ return ret; -+ -+ if (start != min) { -+ /* Retry from start */ -+ start = min; -+ goto again; -+ } -+ -+ return -ENOSPC; -+found_slot: -+ *hint = k.k->p.offset; -+ inode_u->bi_inum = k.k->p.offset; -+ inode_u->bi_generation = bkey_generation(k); -+ -+ bch2_inode_pack(inode_p, inode_u); -+ bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); -+ bch2_trans_iter_put(trans, iter); -+ return 0; -+} -+ -+int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) -+{ -+ struct btree_trans trans; -+ 
struct btree_iter *iter; -+ struct bkey_i_inode_generation delete; -+ struct bpos start = POS(inode_nr, 0); -+ struct bpos end = POS(inode_nr + 1, 0); -+ int ret; -+ -+ /* -+ * If this was a directory, there shouldn't be any real dirents left - -+ * but there could be whiteouts (from hash collisions) that we should -+ * delete: -+ * -+ * XXX: the dirent could ideally would delete whiteouts when they're no -+ * longer needed -+ */ -+ ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS, -+ start, end, NULL) ?: -+ bch2_btree_delete_range(c, BTREE_ID_XATTRS, -+ start, end, NULL) ?: -+ bch2_btree_delete_range(c, BTREE_ID_DIRENTS, -+ start, end, NULL); -+ if (ret) -+ return ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr), -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); -+ do { -+ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); -+ u32 bi_generation = 0; -+ -+ ret = bkey_err(k); -+ if (ret) -+ break; -+ -+ bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, c, -+ "inode %llu not found when deleting", -+ inode_nr); -+ -+ switch (k.k->type) { -+ case KEY_TYPE_inode: { -+ struct bch_inode_unpacked inode_u; -+ -+ if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u)) -+ bi_generation = inode_u.bi_generation + 1; -+ break; -+ } -+ case KEY_TYPE_inode_generation: { -+ struct bkey_s_c_inode_generation g = -+ bkey_s_c_to_inode_generation(k); -+ bi_generation = le32_to_cpu(g.v->bi_generation); -+ break; -+ } -+ } -+ -+ if (!bi_generation) { -+ bkey_init(&delete.k); -+ delete.k.p.offset = inode_nr; -+ } else { -+ bkey_inode_generation_init(&delete.k_i); -+ delete.k.p.offset = inode_nr; -+ delete.v.bi_generation = cpu_to_le32(bi_generation); -+ } -+ -+ bch2_trans_update(&trans, iter, &delete.k_i, 0); -+ -+ ret = bch2_trans_commit(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL); -+ } while (ret == -EINTR); -+ -+ bch2_trans_exit(&trans); -+ return ret; -+} -+ -+int bch2_inode_find_by_inum_trans(struct btree_trans *trans, 
u64 inode_nr, -+ struct bch_inode_unpacked *inode) -+{ -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, -+ POS(0, inode_nr), BTREE_ITER_SLOTS); -+ if (IS_ERR(iter)) -+ return PTR_ERR(iter); -+ -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ ret = k.k->type == KEY_TYPE_inode -+ ? bch2_inode_unpack(bkey_s_c_to_inode(k), inode) -+ : -ENOENT; -+err: -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, -+ struct bch_inode_unpacked *inode) -+{ -+ return bch2_trans_do(c, NULL, NULL, 0, -+ bch2_inode_find_by_inum_trans(&trans, inode_nr, inode)); -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_inode_pack_test(void) -+{ -+ struct bch_inode_unpacked *u, test_inodes[] = { -+ { -+ .bi_atime = U64_MAX, -+ .bi_ctime = U64_MAX, -+ .bi_mtime = U64_MAX, -+ .bi_otime = U64_MAX, -+ .bi_size = U64_MAX, -+ .bi_sectors = U64_MAX, -+ .bi_uid = U32_MAX, -+ .bi_gid = U32_MAX, -+ .bi_nlink = U32_MAX, -+ .bi_generation = U32_MAX, -+ .bi_dev = U32_MAX, -+ }, -+ }; -+ -+ for (u = test_inodes; -+ u < test_inodes + ARRAY_SIZE(test_inodes); -+ u++) { -+ struct bkey_inode_buf p; -+ -+ bch2_inode_pack(&p, u); -+ } -+} -+#endif -diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h -new file mode 100644 -index 000000000000..bb759a46dc41 ---- /dev/null -+++ b/fs/bcachefs/inode.h -@@ -0,0 +1,177 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_INODE_H -+#define _BCACHEFS_INODE_H -+ -+#include "opts.h" -+ -+extern const char * const bch2_inode_opts[]; -+ -+const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c); -+void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+ -+#define bch2_bkey_ops_inode (struct bkey_ops) { \ -+ .key_invalid = bch2_inode_invalid, \ -+ .val_to_text = bch2_inode_to_text, \ -+} -+ -+const char *bch2_inode_generation_invalid(const struct 
bch_fs *, -+ struct bkey_s_c); -+void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, -+ struct bkey_s_c); -+ -+#define bch2_bkey_ops_inode_generation (struct bkey_ops) { \ -+ .key_invalid = bch2_inode_generation_invalid, \ -+ .val_to_text = bch2_inode_generation_to_text, \ -+} -+ -+struct bch_inode_unpacked { -+ u64 bi_inum; -+ __le64 bi_hash_seed; -+ u32 bi_flags; -+ u16 bi_mode; -+ -+#define x(_name, _bits) u##_bits _name; -+ BCH_INODE_FIELDS() -+#undef x -+}; -+ -+struct bkey_inode_buf { -+ struct bkey_i_inode inode; -+ -+#define x(_name, _bits) + 8 + _bits / 8 -+ u8 _pad[0 + BCH_INODE_FIELDS()]; -+#undef x -+} __attribute__((packed, aligned(8))); -+ -+void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *); -+int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *); -+ -+struct btree_iter *bch2_inode_peek(struct btree_trans *, -+ struct bch_inode_unpacked *, u64, unsigned); -+int bch2_inode_write(struct btree_trans *, struct btree_iter *, -+ struct bch_inode_unpacked *); -+ -+void bch2_inode_init_early(struct bch_fs *, -+ struct bch_inode_unpacked *); -+void bch2_inode_init_late(struct bch_inode_unpacked *, u64, -+ uid_t, gid_t, umode_t, dev_t, -+ struct bch_inode_unpacked *); -+void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, -+ uid_t, gid_t, umode_t, dev_t, -+ struct bch_inode_unpacked *); -+ -+int bch2_inode_create(struct btree_trans *, -+ struct bch_inode_unpacked *, -+ u64, u64, u64 *); -+ -+int bch2_inode_rm(struct bch_fs *, u64); -+ -+int bch2_inode_find_by_inum_trans(struct btree_trans *, u64, -+ struct bch_inode_unpacked *); -+int bch2_inode_find_by_inum(struct bch_fs *, u64, struct bch_inode_unpacked *); -+ -+static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode) -+{ -+ struct bch_io_opts ret = { 0 }; -+ -+#define x(_name, _bits) \ -+ if (inode->bi_##_name) \ -+ opt_set(ret, _name, inode->bi_##_name - 1); -+ BCH_INODE_OPTS() -+#undef x -+ 
return ret; -+} -+ -+static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode, -+ enum inode_opt_id id, u64 v) -+{ -+ switch (id) { -+#define x(_name, ...) \ -+ case Inode_opt_##_name: \ -+ inode->bi_##_name = v; \ -+ break; -+ BCH_INODE_OPTS() -+#undef x -+ default: -+ BUG(); -+ } -+} -+ -+static inline u64 bch2_inode_opt_get(struct bch_inode_unpacked *inode, -+ enum inode_opt_id id) -+{ -+ switch (id) { -+#define x(_name, ...) \ -+ case Inode_opt_##_name: \ -+ return inode->bi_##_name; -+ BCH_INODE_OPTS() -+#undef x -+ default: -+ BUG(); -+ } -+} -+ -+static inline struct bch_io_opts -+io_opts(struct bch_fs *c, struct bch_inode_unpacked *inode) -+{ -+ struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts); -+ -+ bch2_io_opts_apply(&opts, bch2_inode_opts_get(inode)); -+ return opts; -+} -+ -+static inline u8 mode_to_type(umode_t mode) -+{ -+ return (mode >> 12) & 15; -+} -+ -+/* i_nlink: */ -+ -+static inline unsigned nlink_bias(umode_t mode) -+{ -+ return S_ISDIR(mode) ? 2 : 1; -+} -+ -+static inline void bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) -+{ -+ if (bi->bi_flags & BCH_INODE_UNLINKED) -+ bi->bi_flags &= ~BCH_INODE_UNLINKED; -+ else -+ bi->bi_nlink++; -+} -+ -+static inline void bch2_inode_nlink_dec(struct bch_inode_unpacked *bi) -+{ -+ BUG_ON(bi->bi_flags & BCH_INODE_UNLINKED); -+ if (bi->bi_nlink) -+ bi->bi_nlink--; -+ else -+ bi->bi_flags |= BCH_INODE_UNLINKED; -+} -+ -+static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi) -+{ -+ return bi->bi_flags & BCH_INODE_UNLINKED -+ ? 
0 -+ : bi->bi_nlink + nlink_bias(bi->bi_mode); -+} -+ -+static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, -+ unsigned nlink) -+{ -+ if (nlink) { -+ bi->bi_nlink = nlink - nlink_bias(bi->bi_mode); -+ bi->bi_flags &= ~BCH_INODE_UNLINKED; -+ } else { -+ bi->bi_nlink = 0; -+ bi->bi_flags |= BCH_INODE_UNLINKED; -+ } -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_inode_pack_test(void); -+#else -+static inline void bch2_inode_pack_test(void) {} -+#endif -+ -+#endif /* _BCACHEFS_INODE_H */ -diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c -new file mode 100644 -index 000000000000..0a4b4eed465c ---- /dev/null -+++ b/fs/bcachefs/io.c -@@ -0,0 +1,2389 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Some low level IO code, and hacks for various block layer limitations -+ * -+ * Copyright 2010, 2011 Kent Overstreet -+ * Copyright 2012 Google, Inc. -+ */ -+ -+#include "bcachefs.h" -+#include "alloc_background.h" -+#include "alloc_foreground.h" -+#include "bkey_on_stack.h" -+#include "bset.h" -+#include "btree_update.h" -+#include "buckets.h" -+#include "checksum.h" -+#include "compress.h" -+#include "clock.h" -+#include "debug.h" -+#include "disk_groups.h" -+#include "ec.h" -+#include "error.h" -+#include "extent_update.h" -+#include "inode.h" -+#include "io.h" -+#include "journal.h" -+#include "keylist.h" -+#include "move.h" -+#include "rebalance.h" -+#include "super.h" -+#include "super-io.h" -+ -+#include -+#include -+#include -+ -+#include -+ -+const char *bch2_blk_status_to_str(blk_status_t status) -+{ -+ if (status == BLK_STS_REMOVED) -+ return "device removed"; -+ return blk_status_to_str(status); -+} -+ -+static bool bch2_target_congested(struct bch_fs *c, u16 target) -+{ -+ const struct bch_devs_mask *devs; -+ unsigned d, nr = 0, total = 0; -+ u64 now = local_clock(), last; -+ s64 congested; -+ struct bch_dev *ca; -+ -+ if (!target) -+ return false; -+ -+ rcu_read_lock(); -+ devs = bch2_target_to_mask(c, target) ?: -+ 
&c->rw_devs[BCH_DATA_user]; -+ -+ for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) { -+ ca = rcu_dereference(c->devs[d]); -+ if (!ca) -+ continue; -+ -+ congested = atomic_read(&ca->congested); -+ last = READ_ONCE(ca->congested_last); -+ if (time_after64(now, last)) -+ congested -= (now - last) >> 12; -+ -+ total += max(congested, 0LL); -+ nr++; -+ } -+ rcu_read_unlock(); -+ -+ return bch2_rand_range(nr * CONGESTED_MAX) < total; -+} -+ -+static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency, -+ u64 now, int rw) -+{ -+ u64 latency_capable = -+ ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m; -+ /* ideally we'd be taking into account the device's variance here: */ -+ u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3); -+ s64 latency_over = io_latency - latency_threshold; -+ -+ if (latency_threshold && latency_over > 0) { -+ /* -+ * bump up congested by approximately latency_over * 4 / -+ * latency_threshold - we don't need much accuracy here so don't -+ * bother with the divide: -+ */ -+ if (atomic_read(&ca->congested) < CONGESTED_MAX) -+ atomic_add(latency_over >> -+ max_t(int, ilog2(latency_threshold) - 2, 0), -+ &ca->congested); -+ -+ ca->congested_last = now; -+ } else if (atomic_read(&ca->congested) > 0) { -+ atomic_dec(&ca->congested); -+ } -+} -+ -+void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) -+{ -+ atomic64_t *latency = &ca->cur_latency[rw]; -+ u64 now = local_clock(); -+ u64 io_latency = time_after64(now, submit_time) -+ ? 
now - submit_time -+ : 0; -+ u64 old, new, v = atomic64_read(latency); -+ -+ do { -+ old = v; -+ -+ /* -+ * If the io latency was reasonably close to the current -+ * latency, skip doing the update and atomic operation - most of -+ * the time: -+ */ -+ if (abs((int) (old - io_latency)) < (old >> 1) && -+ now & ~(~0 << 5)) -+ break; -+ -+ new = ewma_add(old, io_latency, 5); -+ } while ((v = atomic64_cmpxchg(latency, old, new)) != old); -+ -+ bch2_congested_acct(ca, io_latency, now, rw); -+ -+ __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now); -+} -+ -+/* Allocate, free from mempool: */ -+ -+void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) -+{ -+ struct bvec_iter_all iter; -+ struct bio_vec *bv; -+ -+ bio_for_each_segment_all(bv, bio, iter) -+ if (bv->bv_page != ZERO_PAGE(0)) -+ mempool_free(bv->bv_page, &c->bio_bounce_pages); -+ bio->bi_vcnt = 0; -+} -+ -+static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool) -+{ -+ struct page *page; -+ -+ if (likely(!*using_mempool)) { -+ page = alloc_page(GFP_NOIO); -+ if (unlikely(!page)) { -+ mutex_lock(&c->bio_bounce_pages_lock); -+ *using_mempool = true; -+ goto pool_alloc; -+ -+ } -+ } else { -+pool_alloc: -+ page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO); -+ } -+ -+ return page; -+} -+ -+void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, -+ size_t size) -+{ -+ bool using_mempool = false; -+ -+ while (size) { -+ struct page *page = __bio_alloc_page_pool(c, &using_mempool); -+ unsigned len = min(PAGE_SIZE, size); -+ -+ BUG_ON(!bio_add_page(bio, page, len, 0)); -+ size -= len; -+ } -+ -+ if (using_mempool) -+ mutex_unlock(&c->bio_bounce_pages_lock); -+} -+ -+/* Extent update path: */ -+ -+static int sum_sector_overwrites(struct btree_trans *trans, -+ struct btree_iter *extent_iter, -+ struct bkey_i *new, -+ bool may_allocate, -+ bool *maybe_extending, -+ s64 *delta) -+{ -+ struct btree_iter *iter; -+ struct bkey_s_c old; -+ int ret = 0; -+ -+ 
*maybe_extending = true; -+ *delta = 0; -+ -+ iter = bch2_trans_copy_iter(trans, extent_iter); -+ if (IS_ERR(iter)) -+ return PTR_ERR(iter); -+ -+ for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) { -+ if (!may_allocate && -+ bch2_bkey_nr_ptrs_fully_allocated(old) < -+ bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new))) { -+ ret = -ENOSPC; -+ break; -+ } -+ -+ *delta += (min(new->k.p.offset, -+ old.k->p.offset) - -+ max(bkey_start_offset(&new->k), -+ bkey_start_offset(old.k))) * -+ (bkey_extent_is_allocation(&new->k) - -+ bkey_extent_is_allocation(old.k)); -+ -+ if (bkey_cmp(old.k->p, new->k.p) >= 0) { -+ /* -+ * Check if there's already data above where we're -+ * going to be writing to - this means we're definitely -+ * not extending the file: -+ * -+ * Note that it's not sufficient to check if there's -+ * data up to the sector offset we're going to be -+ * writing to, because i_size could be up to one block -+ * less: -+ */ -+ if (!bkey_cmp(old.k->p, new->k.p)) -+ old = bch2_btree_iter_next(iter); -+ -+ if (old.k && !bkey_err(old) && -+ old.k->p.inode == extent_iter->pos.inode && -+ bkey_extent_is_data(old.k)) -+ *maybe_extending = false; -+ -+ break; -+ } -+ } -+ -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+int bch2_extent_update(struct btree_trans *trans, -+ struct btree_iter *iter, -+ struct bkey_i *k, -+ struct disk_reservation *disk_res, -+ u64 *journal_seq, -+ u64 new_i_size, -+ s64 *i_sectors_delta) -+{ -+ /* this must live until after bch2_trans_commit(): */ -+ struct bkey_inode_buf inode_p; -+ bool extending = false; -+ s64 delta = 0; -+ int ret; -+ -+ ret = bch2_extent_trim_atomic(k, iter); -+ if (ret) -+ return ret; -+ -+ ret = sum_sector_overwrites(trans, iter, k, -+ disk_res && disk_res->sectors != 0, -+ &extending, &delta); -+ if (ret) -+ return ret; -+ -+ new_i_size = extending -+ ? 
min(k->k.p.offset << 9, new_i_size) -+ : 0; -+ -+ if (delta || new_i_size) { -+ struct btree_iter *inode_iter; -+ struct bch_inode_unpacked inode_u; -+ -+ inode_iter = bch2_inode_peek(trans, &inode_u, -+ k->k.p.inode, BTREE_ITER_INTENT); -+ if (IS_ERR(inode_iter)) -+ return PTR_ERR(inode_iter); -+ -+ /* -+ * XXX: -+ * writeback can race a bit with truncate, because truncate -+ * first updates the inode then truncates the pagecache. This is -+ * ugly, but lets us preserve the invariant that the in memory -+ * i_size is always >= the on disk i_size. -+ * -+ BUG_ON(new_i_size > inode_u.bi_size && -+ (inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY)); -+ */ -+ BUG_ON(new_i_size > inode_u.bi_size && !extending); -+ -+ if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && -+ new_i_size > inode_u.bi_size) -+ inode_u.bi_size = new_i_size; -+ else -+ new_i_size = 0; -+ -+ inode_u.bi_sectors += delta; -+ -+ if (delta || new_i_size) { -+ bch2_inode_pack(&inode_p, &inode_u); -+ bch2_trans_update(trans, inode_iter, -+ &inode_p.inode.k_i, 0); -+ } -+ -+ bch2_trans_iter_put(trans, inode_iter); -+ } -+ -+ bch2_trans_update(trans, iter, k, 0); -+ -+ ret = bch2_trans_commit(trans, disk_res, journal_seq, -+ BTREE_INSERT_NOCHECK_RW| -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_USE_RESERVE); -+ if (!ret && i_sectors_delta) -+ *i_sectors_delta += delta; -+ -+ return ret; -+} -+ -+int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, -+ struct bpos end, u64 *journal_seq, -+ s64 *i_sectors_delta) -+{ -+ struct bch_fs *c = trans->c; -+ unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); -+ struct bkey_s_c k; -+ int ret = 0, ret2 = 0; -+ -+ while ((k = bch2_btree_iter_peek(iter)).k && -+ bkey_cmp(iter->pos, end) < 0) { -+ struct disk_reservation disk_res = -+ bch2_disk_reservation_init(c, 0); -+ struct bkey_i delete; -+ -+ bch2_trans_begin(trans); -+ -+ ret = bkey_err(k); -+ if (ret) -+ goto btree_err; -+ -+ bkey_init(&delete.k); -+ delete.k.p = iter->pos; -+ -+ /* create the 
biggest key we can */ -+ bch2_key_resize(&delete.k, max_sectors); -+ bch2_cut_back(end, &delete); -+ -+ ret = bch2_extent_update(trans, iter, &delete, -+ &disk_res, journal_seq, -+ 0, i_sectors_delta); -+ bch2_disk_reservation_put(c, &disk_res); -+btree_err: -+ if (ret == -EINTR) { -+ ret2 = ret; -+ ret = 0; -+ } -+ if (ret) -+ break; -+ } -+ -+ if (bkey_cmp(iter->pos, end) > 0) { -+ bch2_btree_iter_set_pos(iter, end); -+ ret = bch2_btree_iter_traverse(iter); -+ } -+ -+ return ret ?: ret2; -+} -+ -+int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end, -+ u64 *journal_seq, s64 *i_sectors_delta) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, -+ POS(inum, start), -+ BTREE_ITER_INTENT); -+ -+ ret = bch2_fpunch_at(&trans, iter, POS(inum, end), -+ journal_seq, i_sectors_delta); -+ bch2_trans_exit(&trans); -+ -+ if (ret == -EINTR) -+ ret = 0; -+ -+ return ret; -+} -+ -+int bch2_write_index_default(struct bch_write_op *op) -+{ -+ struct bch_fs *c = op->c; -+ struct bkey_on_stack sk; -+ struct keylist *keys = &op->insert_keys; -+ struct bkey_i *k = bch2_keylist_front(keys); -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ int ret; -+ -+ bkey_on_stack_init(&sk); -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, -+ bkey_start_pos(&k->k), -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); -+ -+ do { -+ bch2_trans_begin(&trans); -+ -+ k = bch2_keylist_front(keys); -+ -+ bkey_on_stack_realloc(&sk, c, k->k.u64s); -+ bkey_copy(sk.k, k); -+ bch2_cut_front(iter->pos, sk.k); -+ -+ ret = bch2_extent_update(&trans, iter, sk.k, -+ &op->res, op_journal_seq(op), -+ op->new_i_size, &op->i_sectors_delta); -+ if (ret == -EINTR) -+ continue; -+ if (ret) -+ break; -+ -+ if (bkey_cmp(iter->pos, k->k.p) >= 0) -+ bch2_keylist_pop_front(keys); -+ } while (!bch2_keylist_empty(keys)); -+ 
-+ bch2_trans_exit(&trans); -+ bkey_on_stack_exit(&sk, c); -+ -+ return ret; -+} -+ -+/* Writes */ -+ -+void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, -+ enum bch_data_type type, -+ const struct bkey_i *k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); -+ const struct bch_extent_ptr *ptr; -+ struct bch_write_bio *n; -+ struct bch_dev *ca; -+ -+ BUG_ON(c->opts.nochanges); -+ -+ bkey_for_each_ptr(ptrs, ptr) { -+ BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX || -+ !c->devs[ptr->dev]); -+ -+ ca = bch_dev_bkey_exists(c, ptr->dev); -+ -+ if (to_entry(ptr + 1) < ptrs.end) { -+ n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO, -+ &ca->replica_set)); -+ -+ n->bio.bi_end_io = wbio->bio.bi_end_io; -+ n->bio.bi_private = wbio->bio.bi_private; -+ n->parent = wbio; -+ n->split = true; -+ n->bounce = false; -+ n->put_bio = true; -+ n->bio.bi_opf = wbio->bio.bi_opf; -+ bio_inc_remaining(&wbio->bio); -+ } else { -+ n = wbio; -+ n->split = false; -+ } -+ -+ n->c = c; -+ n->dev = ptr->dev; -+ n->have_ioref = bch2_dev_get_ioref(ca, -+ type == BCH_DATA_btree ? 
READ : WRITE); -+ n->submit_time = local_clock(); -+ n->bio.bi_iter.bi_sector = ptr->offset; -+ -+ if (!journal_flushes_device(ca)) -+ n->bio.bi_opf |= REQ_FUA; -+ -+ if (likely(n->have_ioref)) { -+ this_cpu_add(ca->io_done->sectors[WRITE][type], -+ bio_sectors(&n->bio)); -+ -+ bio_set_dev(&n->bio, ca->disk_sb.bdev); -+ submit_bio(&n->bio); -+ } else { -+ n->bio.bi_status = BLK_STS_REMOVED; -+ bio_endio(&n->bio); -+ } -+ } -+} -+ -+static void __bch2_write(struct closure *); -+ -+static void bch2_write_done(struct closure *cl) -+{ -+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); -+ struct bch_fs *c = op->c; -+ -+ if (!op->error && (op->flags & BCH_WRITE_FLUSH)) -+ op->error = bch2_journal_error(&c->journal); -+ -+ bch2_disk_reservation_put(c, &op->res); -+ percpu_ref_put(&c->writes); -+ bch2_keylist_free(&op->insert_keys, op->inline_keys); -+ -+ bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); -+ -+ if (!(op->flags & BCH_WRITE_FROM_INTERNAL)) -+ up(&c->io_in_flight); -+ -+ if (op->end_io) { -+ EBUG_ON(cl->parent); -+ closure_debug_destroy(cl); -+ op->end_io(op); -+ } else { -+ closure_return(cl); -+ } -+} -+ -+/** -+ * bch_write_index - after a write, update index to point to new data -+ */ -+static void __bch2_write_index(struct bch_write_op *op) -+{ -+ struct bch_fs *c = op->c; -+ struct keylist *keys = &op->insert_keys; -+ struct bch_extent_ptr *ptr; -+ struct bkey_i *src, *dst = keys->keys, *n, *k; -+ unsigned dev; -+ int ret; -+ -+ for (src = keys->keys; src != keys->top; src = n) { -+ n = bkey_next(src); -+ -+ if (bkey_extent_is_direct_data(&src->k)) { -+ bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr, -+ test_bit(ptr->dev, op->failed.d)); -+ -+ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) { -+ ret = -EIO; -+ goto err; -+ } -+ } -+ -+ if (dst != src) -+ memmove_u64s_down(dst, src, src->u64s); -+ dst = bkey_next(dst); -+ } -+ -+ keys->top = dst; -+ -+ /* -+ * probably not the ideal place to hook this in, but I don't -+ * 
particularly want to plumb io_opts all the way through the btree -+ * update stack right now -+ */ -+ for_each_keylist_key(keys, k) { -+ bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts); -+ -+ if (bch2_bkey_is_incompressible(bkey_i_to_s_c(k))) -+ bch2_check_set_feature(op->c, BCH_FEATURE_incompressible); -+ -+ } -+ -+ if (!bch2_keylist_empty(keys)) { -+ u64 sectors_start = keylist_sectors(keys); -+ int ret = op->index_update_fn(op); -+ -+ BUG_ON(ret == -EINTR); -+ BUG_ON(keylist_sectors(keys) && !ret); -+ -+ op->written += sectors_start - keylist_sectors(keys); -+ -+ if (ret) { -+ __bcache_io_error(c, "btree IO error %i", ret); -+ op->error = ret; -+ } -+ } -+out: -+ /* If some a bucket wasn't written, we can't erasure code it: */ -+ for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX) -+ bch2_open_bucket_write_error(c, &op->open_buckets, dev); -+ -+ bch2_open_buckets_put(c, &op->open_buckets); -+ return; -+err: -+ keys->top = keys->keys; -+ op->error = ret; -+ goto out; -+} -+ -+static void bch2_write_index(struct closure *cl) -+{ -+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); -+ struct bch_fs *c = op->c; -+ -+ __bch2_write_index(op); -+ -+ if (!(op->flags & BCH_WRITE_DONE)) { -+ continue_at(cl, __bch2_write, index_update_wq(op)); -+ } else if (!op->error && (op->flags & BCH_WRITE_FLUSH)) { -+ bch2_journal_flush_seq_async(&c->journal, -+ *op_journal_seq(op), -+ cl); -+ continue_at(cl, bch2_write_done, index_update_wq(op)); -+ } else { -+ continue_at_nobarrier(cl, bch2_write_done, NULL); -+ } -+} -+ -+static void bch2_write_endio(struct bio *bio) -+{ -+ struct closure *cl = bio->bi_private; -+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); -+ struct bch_write_bio *wbio = to_wbio(bio); -+ struct bch_write_bio *parent = wbio->split ? 
wbio->parent : NULL; -+ struct bch_fs *c = wbio->c; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); -+ -+ if (bch2_dev_io_err_on(bio->bi_status, ca, "data write: %s", -+ bch2_blk_status_to_str(bio->bi_status))) -+ set_bit(wbio->dev, op->failed.d); -+ -+ if (wbio->have_ioref) { -+ bch2_latency_acct(ca, wbio->submit_time, WRITE); -+ percpu_ref_put(&ca->io_ref); -+ } -+ -+ if (wbio->bounce) -+ bch2_bio_free_pages_pool(c, bio); -+ -+ if (wbio->put_bio) -+ bio_put(bio); -+ -+ if (parent) -+ bio_endio(&parent->bio); -+ else if (!(op->flags & BCH_WRITE_SKIP_CLOSURE_PUT)) -+ closure_put(cl); -+ else -+ continue_at_nobarrier(cl, bch2_write_index, index_update_wq(op)); -+} -+ -+static void init_append_extent(struct bch_write_op *op, -+ struct write_point *wp, -+ struct bversion version, -+ struct bch_extent_crc_unpacked crc) -+{ -+ struct bch_fs *c = op->c; -+ struct bkey_i_extent *e; -+ struct open_bucket *ob; -+ unsigned i; -+ -+ BUG_ON(crc.compressed_size > wp->sectors_free); -+ wp->sectors_free -= crc.compressed_size; -+ op->pos.offset += crc.uncompressed_size; -+ -+ e = bkey_extent_init(op->insert_keys.top); -+ e->k.p = op->pos; -+ e->k.size = crc.uncompressed_size; -+ e->k.version = version; -+ -+ if (crc.csum_type || -+ crc.compression_type || -+ crc.nonce) -+ bch2_extent_crc_append(&e->k_i, crc); -+ -+ open_bucket_for_each(c, &wp->ptrs, ob, i) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); -+ union bch_extent_entry *end = -+ bkey_val_end(bkey_i_to_s(&e->k_i)); -+ -+ end->ptr = ob->ptr; -+ end->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; -+ end->ptr.cached = !ca->mi.durability || -+ (op->flags & BCH_WRITE_CACHED) != 0; -+ end->ptr.offset += ca->mi.bucket_size - ob->sectors_free; -+ -+ e->k.u64s++; -+ -+ BUG_ON(crc.compressed_size > ob->sectors_free); -+ ob->sectors_free -= crc.compressed_size; -+ } -+ -+ bch2_keylist_push(&op->insert_keys); -+} -+ -+static struct bio *bch2_write_bio_alloc(struct bch_fs *c, -+ struct write_point *wp, -+ struct bio 
*src, -+ bool *page_alloc_failed, -+ void *buf) -+{ -+ struct bch_write_bio *wbio; -+ struct bio *bio; -+ unsigned output_available = -+ min(wp->sectors_free << 9, src->bi_iter.bi_size); -+ unsigned pages = DIV_ROUND_UP(output_available + -+ (buf -+ ? ((unsigned long) buf & (PAGE_SIZE - 1)) -+ : 0), PAGE_SIZE); -+ -+ bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write); -+ wbio = wbio_init(bio); -+ wbio->put_bio = true; -+ /* copy WRITE_SYNC flag */ -+ wbio->bio.bi_opf = src->bi_opf; -+ -+ if (buf) { -+ bch2_bio_map(bio, buf, output_available); -+ return bio; -+ } -+ -+ wbio->bounce = true; -+ -+ /* -+ * We can't use mempool for more than c->sb.encoded_extent_max -+ * worth of pages, but we'd like to allocate more if we can: -+ */ -+ bch2_bio_alloc_pages_pool(c, bio, -+ min_t(unsigned, output_available, -+ c->sb.encoded_extent_max << 9)); -+ -+ if (bio->bi_iter.bi_size < output_available) -+ *page_alloc_failed = -+ bch2_bio_alloc_pages(bio, -+ output_available - -+ bio->bi_iter.bi_size, -+ GFP_NOFS) != 0; -+ -+ return bio; -+} -+ -+static int bch2_write_rechecksum(struct bch_fs *c, -+ struct bch_write_op *op, -+ unsigned new_csum_type) -+{ -+ struct bio *bio = &op->wbio.bio; -+ struct bch_extent_crc_unpacked new_crc; -+ int ret; -+ -+ /* bch2_rechecksum_bio() can't encrypt or decrypt data: */ -+ -+ if (bch2_csum_type_is_encryption(op->crc.csum_type) != -+ bch2_csum_type_is_encryption(new_csum_type)) -+ new_csum_type = op->crc.csum_type; -+ -+ ret = bch2_rechecksum_bio(c, bio, op->version, op->crc, -+ NULL, &new_crc, -+ op->crc.offset, op->crc.live_size, -+ new_csum_type); -+ if (ret) -+ return ret; -+ -+ bio_advance(bio, op->crc.offset << 9); -+ bio->bi_iter.bi_size = op->crc.live_size << 9; -+ op->crc = new_crc; -+ return 0; -+} -+ -+static int bch2_write_decrypt(struct bch_write_op *op) -+{ -+ struct bch_fs *c = op->c; -+ struct nonce nonce = extent_nonce(op->version, op->crc); -+ struct bch_csum csum; -+ -+ if (!bch2_csum_type_is_encryption(op->crc.csum_type)) 
-+ return 0; -+ -+ /* -+ * If we need to decrypt data in the write path, we'll no longer be able -+ * to verify the existing checksum (poly1305 mac, in this case) after -+ * it's decrypted - this is the last point we'll be able to reverify the -+ * checksum: -+ */ -+ csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); -+ if (bch2_crc_cmp(op->crc.csum, csum)) -+ return -EIO; -+ -+ bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); -+ op->crc.csum_type = 0; -+ op->crc.csum = (struct bch_csum) { 0, 0 }; -+ return 0; -+} -+ -+static enum prep_encoded_ret { -+ PREP_ENCODED_OK, -+ PREP_ENCODED_ERR, -+ PREP_ENCODED_CHECKSUM_ERR, -+ PREP_ENCODED_DO_WRITE, -+} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp) -+{ -+ struct bch_fs *c = op->c; -+ struct bio *bio = &op->wbio.bio; -+ -+ if (!(op->flags & BCH_WRITE_DATA_ENCODED)) -+ return PREP_ENCODED_OK; -+ -+ BUG_ON(bio_sectors(bio) != op->crc.compressed_size); -+ -+ /* Can we just write the entire extent as is? 
*/ -+ if (op->crc.uncompressed_size == op->crc.live_size && -+ op->crc.compressed_size <= wp->sectors_free && -+ (op->crc.compression_type == op->compression_type || -+ op->incompressible)) { -+ if (!crc_is_compressed(op->crc) && -+ op->csum_type != op->crc.csum_type && -+ bch2_write_rechecksum(c, op, op->csum_type)) -+ return PREP_ENCODED_CHECKSUM_ERR; -+ -+ return PREP_ENCODED_DO_WRITE; -+ } -+ -+ /* -+ * If the data is compressed and we couldn't write the entire extent as -+ * is, we have to decompress it: -+ */ -+ if (crc_is_compressed(op->crc)) { -+ struct bch_csum csum; -+ -+ if (bch2_write_decrypt(op)) -+ return PREP_ENCODED_CHECKSUM_ERR; -+ -+ /* Last point we can still verify checksum: */ -+ csum = bch2_checksum_bio(c, op->crc.csum_type, -+ extent_nonce(op->version, op->crc), -+ bio); -+ if (bch2_crc_cmp(op->crc.csum, csum)) -+ return PREP_ENCODED_CHECKSUM_ERR; -+ -+ if (bch2_bio_uncompress_inplace(c, bio, &op->crc)) -+ return PREP_ENCODED_ERR; -+ } -+ -+ /* -+ * No longer have compressed data after this point - data might be -+ * encrypted: -+ */ -+ -+ /* -+ * If the data is checksummed and we're only writing a subset, -+ * rechecksum and adjust bio to point to currently live data: -+ */ -+ if ((op->crc.live_size != op->crc.uncompressed_size || -+ op->crc.csum_type != op->csum_type) && -+ bch2_write_rechecksum(c, op, op->csum_type)) -+ return PREP_ENCODED_CHECKSUM_ERR; -+ -+ /* -+ * If we want to compress the data, it has to be decrypted: -+ */ -+ if ((op->compression_type || -+ bch2_csum_type_is_encryption(op->crc.csum_type) != -+ bch2_csum_type_is_encryption(op->csum_type)) && -+ bch2_write_decrypt(op)) -+ return PREP_ENCODED_CHECKSUM_ERR; -+ -+ return PREP_ENCODED_OK; -+} -+ -+static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, -+ struct bio **_dst) -+{ -+ struct bch_fs *c = op->c; -+ struct bio *src = &op->wbio.bio, *dst = src; -+ struct bvec_iter saved_iter; -+ void *ec_buf; -+ struct bpos ec_pos = op->pos; -+ unsigned 
total_output = 0, total_input = 0; -+ bool bounce = false; -+ bool page_alloc_failed = false; -+ int ret, more = 0; -+ -+ BUG_ON(!bio_sectors(src)); -+ -+ ec_buf = bch2_writepoint_ec_buf(c, wp); -+ -+ switch (bch2_write_prep_encoded_data(op, wp)) { -+ case PREP_ENCODED_OK: -+ break; -+ case PREP_ENCODED_ERR: -+ ret = -EIO; -+ goto err; -+ case PREP_ENCODED_CHECKSUM_ERR: -+ BUG(); -+ goto csum_err; -+ case PREP_ENCODED_DO_WRITE: -+ /* XXX look for bug here */ -+ if (ec_buf) { -+ dst = bch2_write_bio_alloc(c, wp, src, -+ &page_alloc_failed, -+ ec_buf); -+ bio_copy_data(dst, src); -+ bounce = true; -+ } -+ init_append_extent(op, wp, op->version, op->crc); -+ goto do_write; -+ } -+ -+ if (ec_buf || -+ op->compression_type || -+ (op->csum_type && -+ !(op->flags & BCH_WRITE_PAGES_STABLE)) || -+ (bch2_csum_type_is_encryption(op->csum_type) && -+ !(op->flags & BCH_WRITE_PAGES_OWNED))) { -+ dst = bch2_write_bio_alloc(c, wp, src, -+ &page_alloc_failed, -+ ec_buf); -+ bounce = true; -+ } -+ -+ saved_iter = dst->bi_iter; -+ -+ do { -+ struct bch_extent_crc_unpacked crc = -+ (struct bch_extent_crc_unpacked) { 0 }; -+ struct bversion version = op->version; -+ size_t dst_len, src_len; -+ -+ if (page_alloc_failed && -+ bio_sectors(dst) < wp->sectors_free && -+ bio_sectors(dst) < c->sb.encoded_extent_max) -+ break; -+ -+ BUG_ON(op->compression_type && -+ (op->flags & BCH_WRITE_DATA_ENCODED) && -+ bch2_csum_type_is_encryption(op->crc.csum_type)); -+ BUG_ON(op->compression_type && !bounce); -+ -+ crc.compression_type = op->incompressible -+ ? BCH_COMPRESSION_TYPE_incompressible -+ : op->compression_type -+ ? 
bch2_bio_compress(c, dst, &dst_len, src, &src_len, -+ op->compression_type) -+ : 0; -+ if (!crc_is_compressed(crc)) { -+ dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); -+ dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9); -+ -+ if (op->csum_type) -+ dst_len = min_t(unsigned, dst_len, -+ c->sb.encoded_extent_max << 9); -+ -+ if (bounce) { -+ swap(dst->bi_iter.bi_size, dst_len); -+ bio_copy_data(dst, src); -+ swap(dst->bi_iter.bi_size, dst_len); -+ } -+ -+ src_len = dst_len; -+ } -+ -+ BUG_ON(!src_len || !dst_len); -+ -+ if (bch2_csum_type_is_encryption(op->csum_type)) { -+ if (bversion_zero(version)) { -+ version.lo = atomic64_inc_return(&c->key_version); -+ } else { -+ crc.nonce = op->nonce; -+ op->nonce += src_len >> 9; -+ } -+ } -+ -+ if ((op->flags & BCH_WRITE_DATA_ENCODED) && -+ !crc_is_compressed(crc) && -+ bch2_csum_type_is_encryption(op->crc.csum_type) == -+ bch2_csum_type_is_encryption(op->csum_type)) { -+ /* -+ * Note: when we're using rechecksum(), we need to be -+ * checksumming @src because it has all the data our -+ * existing checksum covers - if we bounced (because we -+ * were trying to compress), @dst will only have the -+ * part of the data the new checksum will cover. -+ * -+ * But normally we want to be checksumming post bounce, -+ * because part of the reason for bouncing is so the -+ * data can't be modified (by userspace) while it's in -+ * flight. 
-+ */ -+ if (bch2_rechecksum_bio(c, src, version, op->crc, -+ &crc, &op->crc, -+ src_len >> 9, -+ bio_sectors(src) - (src_len >> 9), -+ op->csum_type)) -+ goto csum_err; -+ } else { -+ if ((op->flags & BCH_WRITE_DATA_ENCODED) && -+ bch2_rechecksum_bio(c, src, version, op->crc, -+ NULL, &op->crc, -+ src_len >> 9, -+ bio_sectors(src) - (src_len >> 9), -+ op->crc.csum_type)) -+ goto csum_err; -+ -+ crc.compressed_size = dst_len >> 9; -+ crc.uncompressed_size = src_len >> 9; -+ crc.live_size = src_len >> 9; -+ -+ swap(dst->bi_iter.bi_size, dst_len); -+ bch2_encrypt_bio(c, op->csum_type, -+ extent_nonce(version, crc), dst); -+ crc.csum = bch2_checksum_bio(c, op->csum_type, -+ extent_nonce(version, crc), dst); -+ crc.csum_type = op->csum_type; -+ swap(dst->bi_iter.bi_size, dst_len); -+ } -+ -+ init_append_extent(op, wp, version, crc); -+ -+ if (dst != src) -+ bio_advance(dst, dst_len); -+ bio_advance(src, src_len); -+ total_output += dst_len; -+ total_input += src_len; -+ } while (dst->bi_iter.bi_size && -+ src->bi_iter.bi_size && -+ wp->sectors_free && -+ !bch2_keylist_realloc(&op->insert_keys, -+ op->inline_keys, -+ ARRAY_SIZE(op->inline_keys), -+ BKEY_EXTENT_U64s_MAX)); -+ -+ more = src->bi_iter.bi_size != 0; -+ -+ dst->bi_iter = saved_iter; -+ -+ if (dst == src && more) { -+ BUG_ON(total_output != total_input); -+ -+ dst = bio_split(src, total_input >> 9, -+ GFP_NOIO, &c->bio_write); -+ wbio_init(dst)->put_bio = true; -+ /* copy WRITE_SYNC flag */ -+ dst->bi_opf = src->bi_opf; -+ } -+ -+ dst->bi_iter.bi_size = total_output; -+do_write: -+ /* might have done a realloc... 
*/ -+ bch2_ec_add_backpointer(c, wp, ec_pos, total_input >> 9); -+ -+ *_dst = dst; -+ return more; -+csum_err: -+ bch_err(c, "error verifying existing checksum while " -+ "rewriting existing data (memory corruption?)"); -+ ret = -EIO; -+err: -+ if (to_wbio(dst)->bounce) -+ bch2_bio_free_pages_pool(c, dst); -+ if (to_wbio(dst)->put_bio) -+ bio_put(dst); -+ -+ return ret; -+} -+ -+static void __bch2_write(struct closure *cl) -+{ -+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); -+ struct bch_fs *c = op->c; -+ struct write_point *wp; -+ struct bio *bio; -+ bool skip_put = true; -+ unsigned nofs_flags; -+ int ret; -+ -+ nofs_flags = memalloc_nofs_save(); -+again: -+ memset(&op->failed, 0, sizeof(op->failed)); -+ -+ do { -+ struct bkey_i *key_to_write; -+ unsigned key_to_write_offset = op->insert_keys.top_p - -+ op->insert_keys.keys_p; -+ -+ /* +1 for possible cache device: */ -+ if (op->open_buckets.nr + op->nr_replicas + 1 > -+ ARRAY_SIZE(op->open_buckets.v)) -+ goto flush_io; -+ -+ if (bch2_keylist_realloc(&op->insert_keys, -+ op->inline_keys, -+ ARRAY_SIZE(op->inline_keys), -+ BKEY_EXTENT_U64s_MAX)) -+ goto flush_io; -+ -+ if ((op->flags & BCH_WRITE_FROM_INTERNAL) && -+ percpu_ref_is_dying(&c->writes)) { -+ ret = -EROFS; -+ goto err; -+ } -+ -+ /* -+ * The copygc thread is now global, which means it's no longer -+ * freeing up space on specific disks, which means that -+ * allocations for specific disks may hang arbitrarily long: -+ */ -+ wp = bch2_alloc_sectors_start(c, -+ op->target, -+ op->opts.erasure_code, -+ op->write_point, -+ &op->devs_have, -+ op->nr_replicas, -+ op->nr_replicas_required, -+ op->alloc_reserve, -+ op->flags, -+ (op->flags & (BCH_WRITE_ALLOC_NOWAIT| -+ BCH_WRITE_ONLY_SPECIFIED_DEVS)) ? 
NULL : cl); -+ EBUG_ON(!wp); -+ -+ if (unlikely(IS_ERR(wp))) { -+ if (unlikely(PTR_ERR(wp) != -EAGAIN)) { -+ ret = PTR_ERR(wp); -+ goto err; -+ } -+ -+ goto flush_io; -+ } -+ -+ /* -+ * It's possible for the allocator to fail, put us on the -+ * freelist waitlist, and then succeed in one of various retry -+ * paths: if that happens, we need to disable the skip_put -+ * optimization because otherwise there won't necessarily be a -+ * barrier before we free the bch_write_op: -+ */ -+ if (atomic_read(&cl->remaining) & CLOSURE_WAITING) -+ skip_put = false; -+ -+ bch2_open_bucket_get(c, wp, &op->open_buckets); -+ ret = bch2_write_extent(op, wp, &bio); -+ bch2_alloc_sectors_done(c, wp); -+ -+ if (ret < 0) -+ goto err; -+ -+ if (ret) { -+ skip_put = false; -+ } else { -+ /* -+ * for the skip_put optimization this has to be set -+ * before we submit the bio: -+ */ -+ op->flags |= BCH_WRITE_DONE; -+ } -+ -+ bio->bi_end_io = bch2_write_endio; -+ bio->bi_private = &op->cl; -+ bio->bi_opf |= REQ_OP_WRITE; -+ -+ if (!skip_put) -+ closure_get(bio->bi_private); -+ else -+ op->flags |= BCH_WRITE_SKIP_CLOSURE_PUT; -+ -+ key_to_write = (void *) (op->insert_keys.keys_p + -+ key_to_write_offset); -+ -+ bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, -+ key_to_write); -+ } while (ret); -+ -+ if (!skip_put) -+ continue_at(cl, bch2_write_index, index_update_wq(op)); -+out: -+ memalloc_nofs_restore(nofs_flags); -+ return; -+err: -+ op->error = ret; -+ op->flags |= BCH_WRITE_DONE; -+ -+ continue_at(cl, bch2_write_index, index_update_wq(op)); -+ goto out; -+flush_io: -+ /* -+ * If the write can't all be submitted at once, we generally want to -+ * block synchronously as that signals backpressure to the caller. 
-+ * -+ * However, if we're running out of a workqueue, we can't block here -+ * because we'll be blocking other work items from completing: -+ */ -+ if (current->flags & PF_WQ_WORKER) { -+ continue_at(cl, bch2_write_index, index_update_wq(op)); -+ goto out; -+ } -+ -+ closure_sync(cl); -+ -+ if (!bch2_keylist_empty(&op->insert_keys)) { -+ __bch2_write_index(op); -+ -+ if (op->error) { -+ op->flags |= BCH_WRITE_DONE; -+ continue_at_nobarrier(cl, bch2_write_done, NULL); -+ goto out; -+ } -+ } -+ -+ goto again; -+} -+ -+static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) -+{ -+ struct closure *cl = &op->cl; -+ struct bio *bio = &op->wbio.bio; -+ struct bvec_iter iter; -+ struct bkey_i_inline_data *id; -+ unsigned sectors; -+ int ret; -+ -+ bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); -+ -+ ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys, -+ ARRAY_SIZE(op->inline_keys), -+ BKEY_U64s + DIV_ROUND_UP(data_len, 8)); -+ if (ret) { -+ op->error = ret; -+ goto err; -+ } -+ -+ sectors = bio_sectors(bio); -+ op->pos.offset += sectors; -+ -+ id = bkey_inline_data_init(op->insert_keys.top); -+ id->k.p = op->pos; -+ id->k.version = op->version; -+ id->k.size = sectors; -+ -+ iter = bio->bi_iter; -+ iter.bi_size = data_len; -+ memcpy_from_bio(id->v.data, bio, iter); -+ -+ while (data_len & 7) -+ id->v.data[data_len++] = '\0'; -+ set_bkey_val_bytes(&id->k, data_len); -+ bch2_keylist_push(&op->insert_keys); -+ -+ op->flags |= BCH_WRITE_WROTE_DATA_INLINE; -+ op->flags |= BCH_WRITE_DONE; -+ -+ continue_at_nobarrier(cl, bch2_write_index, NULL); -+ return; -+err: -+ bch2_write_done(&op->cl); -+} -+ -+/** -+ * bch_write - handle a write to a cache device or flash only volume -+ * -+ * This is the starting point for any data to end up in a cache device; it could -+ * be from a normal write, or a writeback write, or a write to a flash only -+ * volume - it's also used by the moving garbage collector to compact data in -+ * mostly empty 
buckets. -+ * -+ * It first writes the data to the cache, creating a list of keys to be inserted -+ * (if the data won't fit in a single open bucket, there will be multiple keys); -+ * after the data is written it calls bch_journal, and after the keys have been -+ * added to the next journal write they're inserted into the btree. -+ * -+ * If op->discard is true, instead of inserting the data it invalidates the -+ * region of the cache represented by op->bio and op->inode. -+ */ -+void bch2_write(struct closure *cl) -+{ -+ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); -+ struct bio *bio = &op->wbio.bio; -+ struct bch_fs *c = op->c; -+ unsigned data_len; -+ -+ BUG_ON(!op->nr_replicas); -+ BUG_ON(!op->write_point.v); -+ BUG_ON(!bkey_cmp(op->pos, POS_MAX)); -+ -+ op->start_time = local_clock(); -+ bch2_keylist_init(&op->insert_keys, op->inline_keys); -+ wbio_init(bio)->put_bio = false; -+ -+ if (bio_sectors(bio) & (c->opts.block_size - 1)) { -+ __bcache_io_error(c, "misaligned write"); -+ op->error = -EIO; -+ goto err; -+ } -+ -+ if (c->opts.nochanges || -+ !percpu_ref_tryget(&c->writes)) { -+ if (!(op->flags & BCH_WRITE_FROM_INTERNAL)) -+ __bcache_io_error(c, "read only"); -+ op->error = -EROFS; -+ goto err; -+ } -+ -+ /* -+ * Can't ratelimit copygc - we'd deadlock: -+ */ -+ if (!(op->flags & BCH_WRITE_FROM_INTERNAL)) -+ down(&c->io_in_flight); -+ -+ bch2_increment_clock(c, bio_sectors(bio), WRITE); -+ -+ data_len = min_t(u64, bio->bi_iter.bi_size, -+ op->new_i_size - (op->pos.offset << 9)); -+ -+ if (c->opts.inline_data && -+ data_len <= min(block_bytes(c) / 2, 1024U)) { -+ bch2_write_data_inline(op, data_len); -+ return; -+ } -+ -+ continue_at_nobarrier(cl, __bch2_write, NULL); -+ return; -+err: -+ bch2_disk_reservation_put(c, &op->res); -+ -+ if (op->end_io) { -+ EBUG_ON(cl->parent); -+ closure_debug_destroy(cl); -+ op->end_io(op); -+ } else { -+ closure_return(cl); -+ } -+} -+ -+/* Cache promotion on read */ -+ -+struct promote_op { -+ 
struct closure cl; -+ struct rcu_head rcu; -+ u64 start_time; -+ -+ struct rhash_head hash; -+ struct bpos pos; -+ -+ struct migrate_write write; -+ struct bio_vec bi_inline_vecs[0]; /* must be last */ -+}; -+ -+static const struct rhashtable_params bch_promote_params = { -+ .head_offset = offsetof(struct promote_op, hash), -+ .key_offset = offsetof(struct promote_op, pos), -+ .key_len = sizeof(struct bpos), -+}; -+ -+static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k, -+ struct bpos pos, -+ struct bch_io_opts opts, -+ unsigned flags) -+{ -+ if (!(flags & BCH_READ_MAY_PROMOTE)) -+ return false; -+ -+ if (!opts.promote_target) -+ return false; -+ -+ if (bch2_bkey_has_target(c, k, opts.promote_target)) -+ return false; -+ -+ if (bch2_target_congested(c, opts.promote_target)) { -+ /* XXX trace this */ -+ return false; -+ } -+ -+ if (rhashtable_lookup_fast(&c->promote_table, &pos, -+ bch_promote_params)) -+ return false; -+ -+ return true; -+} -+ -+static void promote_free(struct bch_fs *c, struct promote_op *op) -+{ -+ int ret; -+ -+ ret = rhashtable_remove_fast(&c->promote_table, &op->hash, -+ bch_promote_params); -+ BUG_ON(ret); -+ percpu_ref_put(&c->writes); -+ kfree_rcu(op, rcu); -+} -+ -+static void promote_done(struct closure *cl) -+{ -+ struct promote_op *op = -+ container_of(cl, struct promote_op, cl); -+ struct bch_fs *c = op->write.op.c; -+ -+ bch2_time_stats_update(&c->times[BCH_TIME_data_promote], -+ op->start_time); -+ -+ bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio); -+ promote_free(c, op); -+} -+ -+static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) -+{ -+ struct bch_fs *c = rbio->c; -+ struct closure *cl = &op->cl; -+ struct bio *bio = &op->write.op.wbio.bio; -+ -+ trace_promote(&rbio->bio); -+ -+ /* we now own pages: */ -+ BUG_ON(!rbio->bounce); -+ BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs); -+ -+ memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, -+ sizeof(struct bio_vec) * rbio->bio.bi_vcnt); -+ 
swap(bio->bi_vcnt, rbio->bio.bi_vcnt); -+ -+ bch2_migrate_read_done(&op->write, rbio); -+ -+ closure_init(cl, NULL); -+ closure_call(&op->write.op.cl, bch2_write, c->wq, cl); -+ closure_return_with_destructor(cl, promote_done); -+} -+ -+static struct promote_op *__promote_alloc(struct bch_fs *c, -+ enum btree_id btree_id, -+ struct bkey_s_c k, -+ struct bpos pos, -+ struct extent_ptr_decoded *pick, -+ struct bch_io_opts opts, -+ unsigned sectors, -+ struct bch_read_bio **rbio) -+{ -+ struct promote_op *op = NULL; -+ struct bio *bio; -+ unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); -+ int ret; -+ -+ if (!percpu_ref_tryget(&c->writes)) -+ return NULL; -+ -+ op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO); -+ if (!op) -+ goto err; -+ -+ op->start_time = local_clock(); -+ op->pos = pos; -+ -+ /* -+ * We don't use the mempool here because extents that aren't -+ * checksummed or compressed can be too big for the mempool: -+ */ -+ *rbio = kzalloc(sizeof(struct bch_read_bio) + -+ sizeof(struct bio_vec) * pages, -+ GFP_NOIO); -+ if (!*rbio) -+ goto err; -+ -+ rbio_init(&(*rbio)->bio, opts); -+ bio_init(&(*rbio)->bio, (*rbio)->bio.bi_inline_vecs, pages); -+ -+ if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, -+ GFP_NOIO)) -+ goto err; -+ -+ (*rbio)->bounce = true; -+ (*rbio)->split = true; -+ (*rbio)->kmalloc = true; -+ -+ if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, -+ bch_promote_params)) -+ goto err; -+ -+ bio = &op->write.op.wbio.bio; -+ bio_init(bio, bio->bi_inline_vecs, pages); -+ -+ ret = bch2_migrate_write_init(c, &op->write, -+ writepoint_hashed((unsigned long) current), -+ opts, -+ DATA_PROMOTE, -+ (struct data_opts) { -+ .target = opts.promote_target -+ }, -+ btree_id, k); -+ BUG_ON(ret); -+ -+ return op; -+err: -+ if (*rbio) -+ bio_free_pages(&(*rbio)->bio); -+ kfree(*rbio); -+ *rbio = NULL; -+ kfree(op); -+ percpu_ref_put(&c->writes); -+ return NULL; -+} -+ -+noinline -+static struct promote_op 
*promote_alloc(struct bch_fs *c, -+ struct bvec_iter iter, -+ struct bkey_s_c k, -+ struct extent_ptr_decoded *pick, -+ struct bch_io_opts opts, -+ unsigned flags, -+ struct bch_read_bio **rbio, -+ bool *bounce, -+ bool *read_full) -+{ -+ bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents); -+ /* data might have to be decompressed in the write path: */ -+ unsigned sectors = promote_full -+ ? max(pick->crc.compressed_size, pick->crc.live_size) -+ : bvec_iter_sectors(iter); -+ struct bpos pos = promote_full -+ ? bkey_start_pos(k.k) -+ : POS(k.k->p.inode, iter.bi_sector); -+ struct promote_op *promote; -+ -+ if (!should_promote(c, k, pos, opts, flags)) -+ return NULL; -+ -+ promote = __promote_alloc(c, -+ k.k->type == KEY_TYPE_reflink_v -+ ? BTREE_ID_REFLINK -+ : BTREE_ID_EXTENTS, -+ k, pos, pick, opts, sectors, rbio); -+ if (!promote) -+ return NULL; -+ -+ *bounce = true; -+ *read_full = promote_full; -+ return promote; -+} -+ -+/* Read */ -+ -+#define READ_RETRY_AVOID 1 -+#define READ_RETRY 2 -+#define READ_ERR 3 -+ -+enum rbio_context { -+ RBIO_CONTEXT_NULL, -+ RBIO_CONTEXT_HIGHPRI, -+ RBIO_CONTEXT_UNBOUND, -+}; -+ -+static inline struct bch_read_bio * -+bch2_rbio_parent(struct bch_read_bio *rbio) -+{ -+ return rbio->split ? 
rbio->parent : rbio; -+} -+ -+__always_inline -+static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn, -+ enum rbio_context context, -+ struct workqueue_struct *wq) -+{ -+ if (context <= rbio->context) { -+ fn(&rbio->work); -+ } else { -+ rbio->work.func = fn; -+ rbio->context = context; -+ queue_work(wq, &rbio->work); -+ } -+} -+ -+static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) -+{ -+ BUG_ON(rbio->bounce && !rbio->split); -+ -+ if (rbio->promote) -+ promote_free(rbio->c, rbio->promote); -+ rbio->promote = NULL; -+ -+ if (rbio->bounce) -+ bch2_bio_free_pages_pool(rbio->c, &rbio->bio); -+ -+ if (rbio->split) { -+ struct bch_read_bio *parent = rbio->parent; -+ -+ if (rbio->kmalloc) -+ kfree(rbio); -+ else -+ bio_put(&rbio->bio); -+ -+ rbio = parent; -+ } -+ -+ return rbio; -+} -+ -+/* -+ * Only called on a top level bch_read_bio to complete an entire read request, -+ * not a split: -+ */ -+static void bch2_rbio_done(struct bch_read_bio *rbio) -+{ -+ if (rbio->start_time) -+ bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read], -+ rbio->start_time); -+ bio_endio(&rbio->bio); -+} -+ -+static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, -+ struct bvec_iter bvec_iter, u64 inode, -+ struct bch_io_failures *failed, -+ unsigned flags) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_on_stack sk; -+ struct bkey_s_c k; -+ int ret; -+ -+ flags &= ~BCH_READ_LAST_FRAGMENT; -+ flags |= BCH_READ_MUST_CLONE; -+ -+ bkey_on_stack_init(&sk); -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, -+ rbio->pos, BTREE_ITER_SLOTS); -+retry: -+ rbio->bio.bi_status = 0; -+ -+ k = bch2_btree_iter_peek_slot(iter); -+ if (bkey_err(k)) -+ goto err; -+ -+ bkey_on_stack_reassemble(&sk, c, k); -+ k = bkey_i_to_s_c(sk.k); -+ bch2_trans_unlock(&trans); -+ -+ if (!bch2_bkey_matches_ptr(c, k, -+ rbio->pick.ptr, -+ rbio->pos.offset - -+ 
rbio->pick.crc.offset)) { -+ /* extent we wanted to read no longer exists: */ -+ rbio->hole = true; -+ goto out; -+ } -+ -+ ret = __bch2_read_extent(&trans, rbio, bvec_iter, k, 0, failed, flags); -+ if (ret == READ_RETRY) -+ goto retry; -+ if (ret) -+ goto err; -+out: -+ bch2_rbio_done(rbio); -+ bch2_trans_exit(&trans); -+ bkey_on_stack_exit(&sk, c); -+ return; -+err: -+ rbio->bio.bi_status = BLK_STS_IOERR; -+ goto out; -+} -+ -+static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio, -+ struct bvec_iter bvec_iter, u64 inode, -+ struct bch_io_failures *failed, unsigned flags) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_on_stack sk; -+ struct bkey_s_c k; -+ int ret; -+ -+ flags &= ~BCH_READ_LAST_FRAGMENT; -+ flags |= BCH_READ_MUST_CLONE; -+ -+ bkey_on_stack_init(&sk); -+ bch2_trans_init(&trans, c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, -+ POS(inode, bvec_iter.bi_sector), -+ BTREE_ITER_SLOTS, k, ret) { -+ unsigned bytes, sectors, offset_into_extent; -+ -+ bkey_on_stack_reassemble(&sk, c, k); -+ k = bkey_i_to_s_c(sk.k); -+ -+ offset_into_extent = iter->pos.offset - -+ bkey_start_offset(k.k); -+ sectors = k.k->size - offset_into_extent; -+ -+ ret = bch2_read_indirect_extent(&trans, -+ &offset_into_extent, &sk); -+ if (ret) -+ break; -+ -+ sectors = min(sectors, k.k->size - offset_into_extent); -+ -+ bch2_trans_unlock(&trans); -+ -+ bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; -+ swap(bvec_iter.bi_size, bytes); -+ -+ ret = __bch2_read_extent(&trans, rbio, bvec_iter, k, -+ offset_into_extent, failed, flags); -+ switch (ret) { -+ case READ_RETRY: -+ goto retry; -+ case READ_ERR: -+ goto err; -+ }; -+ -+ if (bytes == bvec_iter.bi_size) -+ goto out; -+ -+ swap(bvec_iter.bi_size, bytes); -+ bio_advance_iter(&rbio->bio, &bvec_iter, bytes); -+ } -+ -+ if (ret == -EINTR) -+ goto retry; -+ /* -+ * If we get here, it better have been because there was an 
error -+ * reading a btree node -+ */ -+ BUG_ON(!ret); -+ __bcache_io_error(c, "btree IO error: %i", ret); -+err: -+ rbio->bio.bi_status = BLK_STS_IOERR; -+out: -+ bch2_trans_exit(&trans); -+ bkey_on_stack_exit(&sk, c); -+ bch2_rbio_done(rbio); -+} -+ -+static void bch2_rbio_retry(struct work_struct *work) -+{ -+ struct bch_read_bio *rbio = -+ container_of(work, struct bch_read_bio, work); -+ struct bch_fs *c = rbio->c; -+ struct bvec_iter iter = rbio->bvec_iter; -+ unsigned flags = rbio->flags; -+ u64 inode = rbio->pos.inode; -+ struct bch_io_failures failed = { .nr = 0 }; -+ -+ trace_read_retry(&rbio->bio); -+ -+ if (rbio->retry == READ_RETRY_AVOID) -+ bch2_mark_io_failure(&failed, &rbio->pick); -+ -+ rbio->bio.bi_status = 0; -+ -+ rbio = bch2_rbio_free(rbio); -+ -+ flags |= BCH_READ_IN_RETRY; -+ flags &= ~BCH_READ_MAY_PROMOTE; -+ -+ if (flags & BCH_READ_NODECODE) -+ bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags); -+ else -+ bch2_read_retry(c, rbio, iter, inode, &failed, flags); -+} -+ -+static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, -+ blk_status_t error) -+{ -+ rbio->retry = retry; -+ -+ if (rbio->flags & BCH_READ_IN_RETRY) -+ return; -+ -+ if (retry == READ_ERR) { -+ rbio = bch2_rbio_free(rbio); -+ -+ rbio->bio.bi_status = error; -+ bch2_rbio_done(rbio); -+ } else { -+ bch2_rbio_punt(rbio, bch2_rbio_retry, -+ RBIO_CONTEXT_UNBOUND, system_unbound_wq); -+ } -+} -+ -+static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, -+ struct bch_read_bio *rbio) -+{ -+ struct bch_fs *c = rbio->c; -+ u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset; -+ struct bch_extent_crc_unpacked new_crc; -+ struct btree_iter *iter = NULL; -+ struct bkey_i *new; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ if (crc_is_compressed(rbio->pick.crc)) -+ return 0; -+ -+ iter = bch2_trans_get_iter(trans, BTREE_ID_EXTENTS, rbio->pos, -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); -+ if ((ret = PTR_ERR_OR_ZERO(iter))) -+ goto out; -+ -+ k = 
bch2_btree_iter_peek_slot(iter); -+ if ((ret = bkey_err(k))) -+ goto out; -+ -+ /* -+ * going to be temporarily appending another checksum entry: -+ */ -+ new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + -+ BKEY_EXTENT_U64s_MAX * 8); -+ if ((ret = PTR_ERR_OR_ZERO(new))) -+ goto out; -+ -+ bkey_reassemble(new, k); -+ k = bkey_i_to_s_c(new); -+ -+ if (bversion_cmp(k.k->version, rbio->version) || -+ !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) -+ goto out; -+ -+ /* Extent was merged? */ -+ if (bkey_start_offset(k.k) < data_offset || -+ k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size) -+ goto out; -+ -+ if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version, -+ rbio->pick.crc, NULL, &new_crc, -+ bkey_start_offset(k.k) - data_offset, k.k->size, -+ rbio->pick.crc.csum_type)) { -+ bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); -+ ret = 0; -+ goto out; -+ } -+ -+ if (!bch2_bkey_narrow_crcs(new, new_crc)) -+ goto out; -+ -+ bch2_trans_update(trans, iter, new, 0); -+out: -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) -+{ -+ bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL, -+ __bch2_rbio_narrow_crcs(&trans, rbio)); -+} -+ -+/* Inner part that may run in process context */ -+static void __bch2_read_endio(struct work_struct *work) -+{ -+ struct bch_read_bio *rbio = -+ container_of(work, struct bch_read_bio, work); -+ struct bch_fs *c = rbio->c; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); -+ struct bio *src = &rbio->bio; -+ struct bio *dst = &bch2_rbio_parent(rbio)->bio; -+ struct bvec_iter dst_iter = rbio->bvec_iter; -+ struct bch_extent_crc_unpacked crc = rbio->pick.crc; -+ struct nonce nonce = extent_nonce(rbio->version, crc); -+ struct bch_csum csum; -+ -+ /* Reset iterator for checksumming and copying bounced data: */ -+ if (rbio->bounce) { -+ src->bi_iter.bi_size = crc.compressed_size 
<< 9; -+ src->bi_iter.bi_idx = 0; -+ src->bi_iter.bi_bvec_done = 0; -+ } else { -+ src->bi_iter = rbio->bvec_iter; -+ } -+ -+ csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); -+ if (bch2_crc_cmp(csum, rbio->pick.crc.csum)) -+ goto csum_err; -+ -+ if (unlikely(rbio->narrow_crcs)) -+ bch2_rbio_narrow_crcs(rbio); -+ -+ if (rbio->flags & BCH_READ_NODECODE) -+ goto nodecode; -+ -+ /* Adjust crc to point to subset of data we want: */ -+ crc.offset += rbio->offset_into_extent; -+ crc.live_size = bvec_iter_sectors(rbio->bvec_iter); -+ -+ if (crc_is_compressed(crc)) { -+ bch2_encrypt_bio(c, crc.csum_type, nonce, src); -+ if (bch2_bio_uncompress(c, src, dst, dst_iter, crc)) -+ goto decompression_err; -+ } else { -+ /* don't need to decrypt the entire bio: */ -+ nonce = nonce_add(nonce, crc.offset << 9); -+ bio_advance(src, crc.offset << 9); -+ -+ BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); -+ src->bi_iter.bi_size = dst_iter.bi_size; -+ -+ bch2_encrypt_bio(c, crc.csum_type, nonce, src); -+ -+ if (rbio->bounce) { -+ struct bvec_iter src_iter = src->bi_iter; -+ bio_copy_data_iter(dst, &dst_iter, src, &src_iter); -+ } -+ } -+ -+ if (rbio->promote) { -+ /* -+ * Re encrypt data we decrypted, so it's consistent with -+ * rbio->crc: -+ */ -+ bch2_encrypt_bio(c, crc.csum_type, nonce, src); -+ promote_start(rbio->promote, rbio); -+ rbio->promote = NULL; -+ } -+nodecode: -+ if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) { -+ rbio = bch2_rbio_free(rbio); -+ bch2_rbio_done(rbio); -+ } -+ return; -+csum_err: -+ /* -+ * Checksum error: if the bio wasn't bounced, we may have been -+ * reading into buffers owned by userspace (that userspace can -+ * scribble over) - retry the read, bouncing it this time: -+ */ -+ if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { -+ rbio->flags |= BCH_READ_MUST_BOUNCE; -+ bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); -+ return; -+ } -+ -+ bch2_dev_io_error(ca, -+ "data checksum error, inode %llu offset %llu: expected %0llx:%0llx 
got %0llx:%0llx (type %u)", -+ rbio->pos.inode, (u64) rbio->bvec_iter.bi_sector, -+ rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, -+ csum.hi, csum.lo, crc.csum_type); -+ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); -+ return; -+decompression_err: -+ __bcache_io_error(c, "decompression error, inode %llu offset %llu", -+ rbio->pos.inode, -+ (u64) rbio->bvec_iter.bi_sector); -+ bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); -+ return; -+} -+ -+static void bch2_read_endio(struct bio *bio) -+{ -+ struct bch_read_bio *rbio = -+ container_of(bio, struct bch_read_bio, bio); -+ struct bch_fs *c = rbio->c; -+ struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); -+ struct workqueue_struct *wq = NULL; -+ enum rbio_context context = RBIO_CONTEXT_NULL; -+ -+ if (rbio->have_ioref) { -+ bch2_latency_acct(ca, rbio->submit_time, READ); -+ percpu_ref_put(&ca->io_ref); -+ } -+ -+ if (!rbio->split) -+ rbio->bio.bi_end_io = rbio->end_io; -+ -+ if (bch2_dev_io_err_on(bio->bi_status, ca, "data read; %s", -+ bch2_blk_status_to_str(bio->bi_status))) { -+ bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); -+ return; -+ } -+ -+ if (rbio->pick.ptr.cached && -+ (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || -+ ptr_stale(ca, &rbio->pick.ptr))) { -+ atomic_long_inc(&c->read_realloc_races); -+ -+ if (rbio->flags & BCH_READ_RETRY_IF_STALE) -+ bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); -+ else -+ bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN); -+ return; -+ } -+ -+ if (rbio->narrow_crcs || -+ crc_is_compressed(rbio->pick.crc) || -+ bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) -+ context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq; -+ else if (rbio->pick.crc.csum_type) -+ context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq; -+ -+ bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); -+} -+ -+int __bch2_read_indirect_extent(struct btree_trans *trans, -+ unsigned *offset_into_extent, -+ struct bkey_on_stack *orig_k) -+{ -+ struct 
btree_iter *iter; -+ struct bkey_s_c k; -+ u64 reflink_offset; -+ int ret; -+ -+ reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) + -+ *offset_into_extent; -+ -+ iter = bch2_trans_get_iter(trans, BTREE_ID_REFLINK, -+ POS(0, reflink_offset), -+ BTREE_ITER_SLOTS); -+ ret = PTR_ERR_OR_ZERO(iter); -+ if (ret) -+ return ret; -+ -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ if (k.k->type != KEY_TYPE_reflink_v) { -+ __bcache_io_error(trans->c, -+ "pointer to nonexistent indirect extent"); -+ ret = -EIO; -+ goto err; -+ } -+ -+ *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); -+ bkey_on_stack_reassemble(orig_k, trans->c, k); -+err: -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, -+ struct bvec_iter iter, struct bkey_s_c k, -+ unsigned offset_into_extent, -+ struct bch_io_failures *failed, unsigned flags) -+{ -+ struct bch_fs *c = trans->c; -+ struct extent_ptr_decoded pick; -+ struct bch_read_bio *rbio = NULL; -+ struct bch_dev *ca; -+ struct promote_op *promote = NULL; -+ bool bounce = false, read_full = false, narrow_crcs = false; -+ struct bpos pos = bkey_start_pos(k.k); -+ int pick_ret; -+ -+ if (k.k->type == KEY_TYPE_inline_data) { -+ struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k); -+ unsigned bytes = min_t(unsigned, iter.bi_size, -+ bkey_val_bytes(d.k)); -+ -+ swap(iter.bi_size, bytes); -+ memcpy_to_bio(&orig->bio, iter, d.v->data); -+ swap(iter.bi_size, bytes); -+ bio_advance_iter(&orig->bio, &iter, bytes); -+ zero_fill_bio_iter(&orig->bio, iter); -+ goto out_read_done; -+ } -+ -+ pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); -+ -+ /* hole or reservation - just zero fill: */ -+ if (!pick_ret) -+ goto hole; -+ -+ if (pick_ret < 0) { -+ __bcache_io_error(c, "no device to read from"); -+ goto err; -+ } -+ -+ if (pick_ret > 0) -+ ca = bch_dev_bkey_exists(c, pick.ptr.dev); -+ 
-+ if (flags & BCH_READ_NODECODE) { -+ /* -+ * can happen if we retry, and the extent we were going to read -+ * has been merged in the meantime: -+ */ -+ if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) -+ goto hole; -+ -+ iter.bi_size = pick.crc.compressed_size << 9; -+ goto get_bio; -+ } -+ -+ if (!(flags & BCH_READ_LAST_FRAGMENT) || -+ bio_flagged(&orig->bio, BIO_CHAIN)) -+ flags |= BCH_READ_MUST_CLONE; -+ -+ narrow_crcs = !(flags & BCH_READ_IN_RETRY) && -+ bch2_can_narrow_extent_crcs(k, pick.crc); -+ -+ if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) -+ flags |= BCH_READ_MUST_BOUNCE; -+ -+ EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); -+ -+ if (crc_is_compressed(pick.crc) || -+ (pick.crc.csum_type != BCH_CSUM_NONE && -+ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || -+ (bch2_csum_type_is_encryption(pick.crc.csum_type) && -+ (flags & BCH_READ_USER_MAPPED)) || -+ (flags & BCH_READ_MUST_BOUNCE)))) { -+ read_full = true; -+ bounce = true; -+ } -+ -+ if (orig->opts.promote_target) -+ promote = promote_alloc(c, iter, k, &pick, orig->opts, flags, -+ &rbio, &bounce, &read_full); -+ -+ if (!read_full) { -+ EBUG_ON(crc_is_compressed(pick.crc)); -+ EBUG_ON(pick.crc.csum_type && -+ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || -+ bvec_iter_sectors(iter) != pick.crc.live_size || -+ pick.crc.offset || -+ offset_into_extent)); -+ -+ pos.offset += offset_into_extent; -+ pick.ptr.offset += pick.crc.offset + -+ offset_into_extent; -+ offset_into_extent = 0; -+ pick.crc.compressed_size = bvec_iter_sectors(iter); -+ pick.crc.uncompressed_size = bvec_iter_sectors(iter); -+ pick.crc.offset = 0; -+ pick.crc.live_size = bvec_iter_sectors(iter); -+ offset_into_extent = 0; -+ } -+get_bio: -+ if (rbio) { -+ /* -+ * promote already allocated bounce rbio: -+ * promote needs to allocate a bio big enough for uncompressing -+ * data in the write path, but we're not going to use it all -+ * here: -+ */ -+ 
EBUG_ON(rbio->bio.bi_iter.bi_size < -+ pick.crc.compressed_size << 9); -+ rbio->bio.bi_iter.bi_size = -+ pick.crc.compressed_size << 9; -+ } else if (bounce) { -+ unsigned sectors = pick.crc.compressed_size; -+ -+ rbio = rbio_init(bio_alloc_bioset(GFP_NOIO, -+ DIV_ROUND_UP(sectors, PAGE_SECTORS), -+ &c->bio_read_split), -+ orig->opts); -+ -+ bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); -+ rbio->bounce = true; -+ rbio->split = true; -+ } else if (flags & BCH_READ_MUST_CLONE) { -+ /* -+ * Have to clone if there were any splits, due to error -+ * reporting issues (if a split errored, and retrying didn't -+ * work, when it reports the error to its parent (us) we don't -+ * know if the error was from our bio, and we should retry, or -+ * from the whole bio, in which case we don't want to retry and -+ * lose the error) -+ */ -+ rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO, -+ &c->bio_read_split), -+ orig->opts); -+ rbio->bio.bi_iter = iter; -+ rbio->split = true; -+ } else { -+ rbio = orig; -+ rbio->bio.bi_iter = iter; -+ EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); -+ } -+ -+ EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); -+ -+ rbio->c = c; -+ rbio->submit_time = local_clock(); -+ if (rbio->split) -+ rbio->parent = orig; -+ else -+ rbio->end_io = orig->bio.bi_end_io; -+ rbio->bvec_iter = iter; -+ rbio->offset_into_extent= offset_into_extent; -+ rbio->flags = flags; -+ rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ); -+ rbio->narrow_crcs = narrow_crcs; -+ rbio->hole = 0; -+ rbio->retry = 0; -+ rbio->context = 0; -+ /* XXX: only initialize this if needed */ -+ rbio->devs_have = bch2_bkey_devs(k); -+ rbio->pick = pick; -+ rbio->pos = pos; -+ rbio->version = k.k->version; -+ rbio->promote = promote; -+ INIT_WORK(&rbio->work, NULL); -+ -+ rbio->bio.bi_opf = orig->bio.bi_opf; -+ rbio->bio.bi_iter.bi_sector = pick.ptr.offset; -+ rbio->bio.bi_end_io = bch2_read_endio; -+ -+ if (rbio->bounce) -+ trace_read_bounce(&rbio->bio); -+ 
-+ bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); -+ -+ if (pick.ptr.cached) -+ bch2_bucket_io_time_reset(trans, pick.ptr.dev, -+ PTR_BUCKET_NR(ca, &pick.ptr), READ); -+ -+ if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { -+ bio_inc_remaining(&orig->bio); -+ trace_read_split(&orig->bio); -+ } -+ -+ if (!rbio->pick.idx) { -+ if (!rbio->have_ioref) { -+ __bcache_io_error(c, "no device to read from"); -+ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); -+ goto out; -+ } -+ -+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user], -+ bio_sectors(&rbio->bio)); -+ bio_set_dev(&rbio->bio, ca->disk_sb.bdev); -+ -+ if (likely(!(flags & BCH_READ_IN_RETRY))) -+ submit_bio(&rbio->bio); -+ else -+ submit_bio_wait(&rbio->bio); -+ } else { -+ /* Attempting reconstruct read: */ -+ if (bch2_ec_read_extent(c, rbio)) { -+ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); -+ goto out; -+ } -+ -+ if (likely(!(flags & BCH_READ_IN_RETRY))) -+ bio_endio(&rbio->bio); -+ } -+out: -+ if (likely(!(flags & BCH_READ_IN_RETRY))) { -+ return 0; -+ } else { -+ int ret; -+ -+ rbio->context = RBIO_CONTEXT_UNBOUND; -+ bch2_read_endio(&rbio->bio); -+ -+ ret = rbio->retry; -+ rbio = bch2_rbio_free(rbio); -+ -+ if (ret == READ_RETRY_AVOID) { -+ bch2_mark_io_failure(failed, &pick); -+ ret = READ_RETRY; -+ } -+ -+ return ret; -+ } -+ -+err: -+ if (flags & BCH_READ_IN_RETRY) -+ return READ_ERR; -+ -+ orig->bio.bi_status = BLK_STS_IOERR; -+ goto out_read_done; -+ -+hole: -+ /* -+ * won't normally happen in the BCH_READ_NODECODE -+ * (bch2_move_extent()) path, but if we retry and the extent we wanted -+ * to read no longer exists we have to signal that: -+ */ -+ if (flags & BCH_READ_NODECODE) -+ orig->hole = true; -+ -+ zero_fill_bio_iter(&orig->bio, iter); -+out_read_done: -+ if (flags & BCH_READ_LAST_FRAGMENT) -+ bch2_rbio_done(orig); -+ return 0; -+} -+ -+void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) -+{ -+ struct btree_trans trans; -+ struct 
btree_iter *iter; -+ struct bkey_on_stack sk; -+ struct bkey_s_c k; -+ unsigned flags = BCH_READ_RETRY_IF_STALE| -+ BCH_READ_MAY_PROMOTE| -+ BCH_READ_USER_MAPPED; -+ int ret; -+ -+ BUG_ON(rbio->_state); -+ BUG_ON(flags & BCH_READ_NODECODE); -+ BUG_ON(flags & BCH_READ_IN_RETRY); -+ -+ rbio->c = c; -+ rbio->start_time = local_clock(); -+ -+ bkey_on_stack_init(&sk); -+ bch2_trans_init(&trans, c, 0, 0); -+retry: -+ bch2_trans_begin(&trans); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, -+ POS(inode, rbio->bio.bi_iter.bi_sector), -+ BTREE_ITER_SLOTS); -+ while (1) { -+ unsigned bytes, sectors, offset_into_extent; -+ -+ bch2_btree_iter_set_pos(iter, -+ POS(inode, rbio->bio.bi_iter.bi_sector)); -+ -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ offset_into_extent = iter->pos.offset - -+ bkey_start_offset(k.k); -+ sectors = k.k->size - offset_into_extent; -+ -+ bkey_on_stack_reassemble(&sk, c, k); -+ k = bkey_i_to_s_c(sk.k); -+ -+ ret = bch2_read_indirect_extent(&trans, -+ &offset_into_extent, &sk); -+ if (ret) -+ goto err; -+ -+ /* -+ * With indirect extents, the amount of data to read is the min -+ * of the original extent and the indirect extent: -+ */ -+ sectors = min(sectors, k.k->size - offset_into_extent); -+ -+ /* -+ * Unlock the iterator while the btree node's lock is still in -+ * cache, before doing the IO: -+ */ -+ bch2_trans_unlock(&trans); -+ -+ bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; -+ swap(rbio->bio.bi_iter.bi_size, bytes); -+ -+ if (rbio->bio.bi_iter.bi_size == bytes) -+ flags |= BCH_READ_LAST_FRAGMENT; -+ -+ bch2_read_extent(&trans, rbio, k, offset_into_extent, flags); -+ -+ if (flags & BCH_READ_LAST_FRAGMENT) -+ break; -+ -+ swap(rbio->bio.bi_iter.bi_size, bytes); -+ bio_advance(&rbio->bio, bytes); -+ } -+out: -+ bch2_trans_exit(&trans); -+ bkey_on_stack_exit(&sk, c); -+ return; -+err: -+ if (ret == -EINTR) -+ goto retry; -+ -+ bcache_io_error(c, &rbio->bio, "btree IO error: %i", 
ret); -+ bch2_rbio_done(rbio); -+ goto out; -+} -+ -+void bch2_fs_io_exit(struct bch_fs *c) -+{ -+ if (c->promote_table.tbl) -+ rhashtable_destroy(&c->promote_table); -+ mempool_exit(&c->bio_bounce_pages); -+ bioset_exit(&c->bio_write); -+ bioset_exit(&c->bio_read_split); -+ bioset_exit(&c->bio_read); -+} -+ -+int bch2_fs_io_init(struct bch_fs *c) -+{ -+ if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), -+ BIOSET_NEED_BVECS) || -+ bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), -+ BIOSET_NEED_BVECS) || -+ bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), -+ BIOSET_NEED_BVECS) || -+ mempool_init_page_pool(&c->bio_bounce_pages, -+ max_t(unsigned, -+ c->opts.btree_node_size, -+ c->sb.encoded_extent_max) / -+ PAGE_SECTORS, 0) || -+ rhashtable_init(&c->promote_table, &bch_promote_params)) -+ return -ENOMEM; -+ -+ return 0; -+} -diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h -new file mode 100644 -index 000000000000..e6aac594f3e6 ---- /dev/null -+++ b/fs/bcachefs/io.h -@@ -0,0 +1,169 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_IO_H -+#define _BCACHEFS_IO_H -+ -+#include "checksum.h" -+#include "bkey_on_stack.h" -+#include "io_types.h" -+ -+#define to_wbio(_bio) \ -+ container_of((_bio), struct bch_write_bio, bio) -+ -+#define to_rbio(_bio) \ -+ container_of((_bio), struct bch_read_bio, bio) -+ -+void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); -+void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); -+ -+void bch2_latency_acct(struct bch_dev *, u64, int); -+ -+void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, -+ enum bch_data_type, const struct bkey_i *); -+ -+#define BLK_STS_REMOVED ((__force blk_status_t)128) -+ -+const char *bch2_blk_status_to_str(blk_status_t); -+ -+enum bch_write_flags { -+ BCH_WRITE_ALLOC_NOWAIT = (1 << 0), -+ BCH_WRITE_CACHED = (1 << 1), -+ BCH_WRITE_FLUSH = (1 << 2), -+ BCH_WRITE_DATA_ENCODED = (1 << 3), -+ 
BCH_WRITE_PAGES_STABLE = (1 << 4), -+ BCH_WRITE_PAGES_OWNED = (1 << 5), -+ BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6), -+ BCH_WRITE_WROTE_DATA_INLINE = (1 << 7), -+ BCH_WRITE_FROM_INTERNAL = (1 << 8), -+ -+ /* Internal: */ -+ BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 9), -+ BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 10), -+ BCH_WRITE_DONE = (1 << 11), -+}; -+ -+static inline u64 *op_journal_seq(struct bch_write_op *op) -+{ -+ return (op->flags & BCH_WRITE_JOURNAL_SEQ_PTR) -+ ? op->journal_seq_p : &op->journal_seq; -+} -+ -+static inline void op_journal_seq_set(struct bch_write_op *op, u64 *journal_seq) -+{ -+ op->journal_seq_p = journal_seq; -+ op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR; -+} -+ -+static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) -+{ -+ return op->alloc_reserve == RESERVE_MOVINGGC -+ ? op->c->copygc_wq -+ : op->c->wq; -+} -+ -+int bch2_extent_update(struct btree_trans *, struct btree_iter *, -+ struct bkey_i *, struct disk_reservation *, -+ u64 *, u64, s64 *); -+int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, -+ struct bpos, u64 *, s64 *); -+int bch2_fpunch(struct bch_fs *c, u64, u64, u64, u64 *, s64 *); -+ -+int bch2_write_index_default(struct bch_write_op *); -+ -+static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, -+ struct bch_io_opts opts) -+{ -+ op->c = c; -+ op->end_io = NULL; -+ op->flags = 0; -+ op->written = 0; -+ op->error = 0; -+ op->csum_type = bch2_data_checksum_type(c, opts.data_checksum); -+ op->compression_type = bch2_compression_opt_to_type[opts.compression]; -+ op->nr_replicas = 0; -+ op->nr_replicas_required = c->opts.data_replicas_required; -+ op->alloc_reserve = RESERVE_NONE; -+ op->incompressible = 0; -+ op->open_buckets.nr = 0; -+ op->devs_have.nr = 0; -+ op->target = 0; -+ op->opts = opts; -+ op->pos = POS_MAX; -+ op->version = ZERO_VERSION; -+ op->write_point = (struct write_point_specifier) { 0 }; -+ op->res = (struct disk_reservation) { 0 }; -+ op->journal_seq = 0; -+ 
op->new_i_size = U64_MAX; -+ op->i_sectors_delta = 0; -+ op->index_update_fn = bch2_write_index_default; -+} -+ -+void bch2_write(struct closure *); -+ -+static inline struct bch_write_bio *wbio_init(struct bio *bio) -+{ -+ struct bch_write_bio *wbio = to_wbio(bio); -+ -+ memset(wbio, 0, offsetof(struct bch_write_bio, bio)); -+ return wbio; -+} -+ -+struct bch_devs_mask; -+struct cache_promote_op; -+struct extent_ptr_decoded; -+ -+int __bch2_read_indirect_extent(struct btree_trans *, unsigned *, -+ struct bkey_on_stack *); -+ -+static inline int bch2_read_indirect_extent(struct btree_trans *trans, -+ unsigned *offset_into_extent, -+ struct bkey_on_stack *k) -+{ -+ return k->k->k.type == KEY_TYPE_reflink_p -+ ? __bch2_read_indirect_extent(trans, offset_into_extent, k) -+ : 0; -+} -+ -+enum bch_read_flags { -+ BCH_READ_RETRY_IF_STALE = 1 << 0, -+ BCH_READ_MAY_PROMOTE = 1 << 1, -+ BCH_READ_USER_MAPPED = 1 << 2, -+ BCH_READ_NODECODE = 1 << 3, -+ BCH_READ_LAST_FRAGMENT = 1 << 4, -+ -+ /* internal: */ -+ BCH_READ_MUST_BOUNCE = 1 << 5, -+ BCH_READ_MUST_CLONE = 1 << 6, -+ BCH_READ_IN_RETRY = 1 << 7, -+}; -+ -+int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *, -+ struct bvec_iter, struct bkey_s_c, unsigned, -+ struct bch_io_failures *, unsigned); -+ -+static inline void bch2_read_extent(struct btree_trans *trans, -+ struct bch_read_bio *rbio, -+ struct bkey_s_c k, -+ unsigned offset_into_extent, -+ unsigned flags) -+{ -+ __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, k, -+ offset_into_extent, NULL, flags); -+} -+ -+void bch2_read(struct bch_fs *, struct bch_read_bio *, u64); -+ -+static inline struct bch_read_bio *rbio_init(struct bio *bio, -+ struct bch_io_opts opts) -+{ -+ struct bch_read_bio *rbio = to_rbio(bio); -+ -+ rbio->_state = 0; -+ rbio->promote = NULL; -+ rbio->opts = opts; -+ return rbio; -+} -+ -+void bch2_fs_io_exit(struct bch_fs *); -+int bch2_fs_io_init(struct bch_fs *); -+ -+#endif /* _BCACHEFS_IO_H */ -diff --git 
a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h -new file mode 100644 -index 000000000000..b23727d212b9 ---- /dev/null -+++ b/fs/bcachefs/io_types.h -@@ -0,0 +1,148 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_IO_TYPES_H -+#define _BCACHEFS_IO_TYPES_H -+ -+#include "alloc_types.h" -+#include "btree_types.h" -+#include "buckets_types.h" -+#include "extents_types.h" -+#include "keylist_types.h" -+#include "opts.h" -+#include "super_types.h" -+ -+#include -+#include -+ -+struct bch_read_bio { -+ struct bch_fs *c; -+ u64 start_time; -+ u64 submit_time; -+ -+ /* -+ * Reads will often have to be split, and if the extent being read from -+ * was checksummed or compressed we'll also have to allocate bounce -+ * buffers and copy the data back into the original bio. -+ * -+ * If we didn't have to split, we have to save and restore the original -+ * bi_end_io - @split below indicates which: -+ */ -+ union { -+ struct bch_read_bio *parent; -+ bio_end_io_t *end_io; -+ }; -+ -+ /* -+ * Saved copy of bio->bi_iter, from submission time - allows us to -+ * resubmit on IO error, and also to copy data back to the original bio -+ * when we're bouncing: -+ */ -+ struct bvec_iter bvec_iter; -+ -+ unsigned offset_into_extent; -+ -+ u16 flags; -+ union { -+ struct { -+ u16 bounce:1, -+ split:1, -+ kmalloc:1, -+ have_ioref:1, -+ narrow_crcs:1, -+ hole:1, -+ retry:2, -+ context:2; -+ }; -+ u16 _state; -+ }; -+ -+ struct bch_devs_list devs_have; -+ -+ struct extent_ptr_decoded pick; -+ /* start pos of data we read (may not be pos of data we want) */ -+ struct bpos pos; -+ struct bversion version; -+ -+ struct promote_op *promote; -+ -+ struct bch_io_opts opts; -+ -+ struct work_struct work; -+ -+ struct bio bio; -+}; -+ -+struct bch_write_bio { -+ struct bch_fs *c; -+ struct bch_write_bio *parent; -+ -+ u64 submit_time; -+ -+ struct bch_devs_list failed; -+ u8 dev; -+ -+ unsigned split:1, -+ bounce:1, -+ put_bio:1, -+ have_ioref:1, -+ used_mempool:1; -+ -+ struct bio 
bio; -+}; -+ -+struct bch_write_op { -+ struct closure cl; -+ struct bch_fs *c; -+ void (*end_io)(struct bch_write_op *); -+ u64 start_time; -+ -+ unsigned written; /* sectors */ -+ u16 flags; -+ s16 error; /* dio write path expects it to hold -ERESTARTSYS... */ -+ -+ unsigned csum_type:4; -+ unsigned compression_type:4; -+ unsigned nr_replicas:4; -+ unsigned nr_replicas_required:4; -+ unsigned alloc_reserve:3; -+ unsigned incompressible:1; -+ -+ struct bch_devs_list devs_have; -+ u16 target; -+ u16 nonce; -+ struct bch_io_opts opts; -+ -+ struct bpos pos; -+ struct bversion version; -+ -+ /* For BCH_WRITE_DATA_ENCODED: */ -+ struct bch_extent_crc_unpacked crc; -+ -+ struct write_point_specifier write_point; -+ -+ struct disk_reservation res; -+ -+ struct open_buckets open_buckets; -+ -+ /* -+ * If caller wants to flush but hasn't passed us a journal_seq ptr, we -+ * still need to stash the journal_seq somewhere: -+ */ -+ union { -+ u64 *journal_seq_p; -+ u64 journal_seq; -+ }; -+ u64 new_i_size; -+ s64 i_sectors_delta; -+ -+ int (*index_update_fn)(struct bch_write_op *); -+ -+ struct bch_devs_mask failed; -+ -+ struct keylist insert_keys; -+ u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2]; -+ -+ /* Must be last: */ -+ struct bch_write_bio wbio; -+}; -+ -+#endif /* _BCACHEFS_IO_TYPES_H */ -diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c -new file mode 100644 -index 000000000000..b8b719902c63 ---- /dev/null -+++ b/fs/bcachefs/journal.c -@@ -0,0 +1,1263 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * bcachefs journalling code, for btree insertions -+ * -+ * Copyright 2012 Google, Inc. 
-+ */ -+ -+#include "bcachefs.h" -+#include "alloc_foreground.h" -+#include "bkey_methods.h" -+#include "btree_gc.h" -+#include "buckets.h" -+#include "journal.h" -+#include "journal_io.h" -+#include "journal_reclaim.h" -+#include "journal_seq_blacklist.h" -+#include "super-io.h" -+ -+#include -+ -+static inline struct journal_buf *journal_seq_to_buf(struct journal *, u64); -+ -+static bool __journal_entry_is_open(union journal_res_state state) -+{ -+ return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; -+} -+ -+static bool journal_entry_is_open(struct journal *j) -+{ -+ return __journal_entry_is_open(j->reservations); -+} -+ -+static void journal_pin_new_entry(struct journal *j, int count) -+{ -+ struct journal_entry_pin_list *p; -+ -+ /* -+ * The fifo_push() needs to happen at the same time as j->seq is -+ * incremented for journal_last_seq() to be calculated correctly -+ */ -+ atomic64_inc(&j->seq); -+ p = fifo_push_ref(&j->pin); -+ -+ INIT_LIST_HEAD(&p->list); -+ INIT_LIST_HEAD(&p->flushed); -+ atomic_set(&p->count, count); -+ p->devs.nr = 0; -+} -+ -+static void bch2_journal_buf_init(struct journal *j) -+{ -+ struct journal_buf *buf = journal_cur_buf(j); -+ -+ memset(buf->has_inode, 0, sizeof(buf->has_inode)); -+ -+ memset(buf->data, 0, sizeof(*buf->data)); -+ buf->data->seq = cpu_to_le64(journal_cur_seq(j)); -+ buf->data->u64s = 0; -+} -+ -+void bch2_journal_halt(struct journal *j) -+{ -+ union journal_res_state old, new; -+ u64 v = atomic64_read(&j->reservations.counter); -+ -+ do { -+ old.v = new.v = v; -+ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) -+ return; -+ -+ new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL; -+ } while ((v = atomic64_cmpxchg(&j->reservations.counter, -+ old.v, new.v)) != old.v); -+ -+ journal_wake(j); -+ closure_wake_up(&journal_cur_buf(j)->wait); -+} -+ -+/* journal entry close/open: */ -+ -+void __bch2_journal_buf_put(struct journal *j, bool need_write_just_set) -+{ -+ if (!need_write_just_set && -+ 
test_bit(JOURNAL_NEED_WRITE, &j->flags)) -+ bch2_time_stats_update(j->delay_time, -+ j->need_write_time); -+ -+ clear_bit(JOURNAL_NEED_WRITE, &j->flags); -+ -+ closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL); -+} -+ -+/* -+ * Returns true if journal entry is now closed: -+ */ -+static bool __journal_entry_close(struct journal *j) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct journal_buf *buf = journal_cur_buf(j); -+ union journal_res_state old, new; -+ u64 v = atomic64_read(&j->reservations.counter); -+ bool set_need_write = false; -+ unsigned sectors; -+ -+ lockdep_assert_held(&j->lock); -+ -+ do { -+ old.v = new.v = v; -+ if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL) -+ return true; -+ -+ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) { -+ /* this entry will never be written: */ -+ closure_wake_up(&buf->wait); -+ return true; -+ } -+ -+ if (!test_bit(JOURNAL_NEED_WRITE, &j->flags)) { -+ set_bit(JOURNAL_NEED_WRITE, &j->flags); -+ j->need_write_time = local_clock(); -+ set_need_write = true; -+ } -+ -+ if (new.prev_buf_unwritten) -+ return false; -+ -+ new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL; -+ new.idx++; -+ new.prev_buf_unwritten = 1; -+ -+ BUG_ON(journal_state_count(new, new.idx)); -+ } while ((v = atomic64_cmpxchg(&j->reservations.counter, -+ old.v, new.v)) != old.v); -+ -+ buf->data->u64s = cpu_to_le32(old.cur_entry_offset); -+ -+ sectors = vstruct_blocks_plus(buf->data, c->block_bits, -+ buf->u64s_reserved) << c->block_bits; -+ BUG_ON(sectors > buf->sectors); -+ buf->sectors = sectors; -+ -+ bkey_extent_init(&buf->key); -+ -+ /* -+ * We have to set last_seq here, _before_ opening a new journal entry: -+ * -+ * A threads may replace an old pin with a new pin on their current -+ * journal reservation - the expectation being that the journal will -+ * contain either what the old pin protected or what the new pin -+ * protects. 
-+ * -+ * After the old pin is dropped journal_last_seq() won't include the old -+ * pin, so we can only write the updated last_seq on the entry that -+ * contains whatever the new pin protects. -+ * -+ * Restated, we can _not_ update last_seq for a given entry if there -+ * could be a newer entry open with reservations/pins that have been -+ * taken against it. -+ * -+ * Hence, we want update/set last_seq on the current journal entry right -+ * before we open a new one: -+ */ -+ buf->data->last_seq = cpu_to_le64(journal_last_seq(j)); -+ -+ if (journal_entry_empty(buf->data)) -+ clear_bit(JOURNAL_NOT_EMPTY, &j->flags); -+ else -+ set_bit(JOURNAL_NOT_EMPTY, &j->flags); -+ -+ journal_pin_new_entry(j, 1); -+ -+ bch2_journal_buf_init(j); -+ -+ cancel_delayed_work(&j->write_work); -+ -+ bch2_journal_space_available(j); -+ -+ bch2_journal_buf_put(j, old.idx, set_need_write); -+ return true; -+} -+ -+static bool journal_entry_close(struct journal *j) -+{ -+ bool ret; -+ -+ spin_lock(&j->lock); -+ ret = __journal_entry_close(j); -+ spin_unlock(&j->lock); -+ -+ return ret; -+} -+ -+/* -+ * should _only_ called from journal_res_get() - when we actually want a -+ * journal reservation - journal entry is open means journal is dirty: -+ * -+ * returns: -+ * 0: success -+ * -ENOSPC: journal currently full, must invoke reclaim -+ * -EAGAIN: journal blocked, must wait -+ * -EROFS: insufficient rw devices or journal error -+ */ -+static int journal_entry_open(struct journal *j) -+{ -+ struct journal_buf *buf = journal_cur_buf(j); -+ union journal_res_state old, new; -+ int u64s; -+ u64 v; -+ -+ lockdep_assert_held(&j->lock); -+ BUG_ON(journal_entry_is_open(j)); -+ -+ if (j->blocked) -+ return -EAGAIN; -+ -+ if (j->cur_entry_error) -+ return j->cur_entry_error; -+ -+ BUG_ON(!j->cur_entry_sectors); -+ -+ buf->u64s_reserved = j->entry_u64s_reserved; -+ buf->disk_sectors = j->cur_entry_sectors; -+ buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9); -+ -+ u64s = (int) 
(buf->sectors << 9) / sizeof(u64) - -+ journal_entry_overhead(j); -+ u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); -+ -+ if (u64s <= le32_to_cpu(buf->data->u64s)) -+ return -ENOSPC; -+ -+ /* -+ * Must be set before marking the journal entry as open: -+ */ -+ j->cur_entry_u64s = u64s; -+ -+ v = atomic64_read(&j->reservations.counter); -+ do { -+ old.v = new.v = v; -+ -+ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) -+ return -EROFS; -+ -+ /* Handle any already added entries */ -+ new.cur_entry_offset = le32_to_cpu(buf->data->u64s); -+ -+ EBUG_ON(journal_state_count(new, new.idx)); -+ journal_state_inc(&new); -+ } while ((v = atomic64_cmpxchg(&j->reservations.counter, -+ old.v, new.v)) != old.v); -+ -+ if (j->res_get_blocked_start) -+ bch2_time_stats_update(j->blocked_time, -+ j->res_get_blocked_start); -+ j->res_get_blocked_start = 0; -+ -+ mod_delayed_work(system_freezable_wq, -+ &j->write_work, -+ msecs_to_jiffies(j->write_delay_ms)); -+ journal_wake(j); -+ return 0; -+} -+ -+static bool journal_quiesced(struct journal *j) -+{ -+ union journal_res_state state = READ_ONCE(j->reservations); -+ bool ret = !state.prev_buf_unwritten && !__journal_entry_is_open(state); -+ -+ if (!ret) -+ journal_entry_close(j); -+ return ret; -+} -+ -+static void journal_quiesce(struct journal *j) -+{ -+ wait_event(j->wait, journal_quiesced(j)); -+} -+ -+static void journal_write_work(struct work_struct *work) -+{ -+ struct journal *j = container_of(work, struct journal, write_work.work); -+ -+ journal_entry_close(j); -+} -+ -+/* -+ * Given an inode number, if that inode number has data in the journal that -+ * hasn't yet been flushed, return the journal sequence number that needs to be -+ * flushed: -+ */ -+u64 bch2_inode_journal_seq(struct journal *j, u64 inode) -+{ -+ size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8)); -+ u64 seq = 0; -+ -+ if (!test_bit(h, j->buf[0].has_inode) && -+ !test_bit(h, j->buf[1].has_inode)) -+ return 0; -+ -+ 
spin_lock(&j->lock); -+ if (test_bit(h, journal_cur_buf(j)->has_inode)) -+ seq = journal_cur_seq(j); -+ else if (test_bit(h, journal_prev_buf(j)->has_inode)) -+ seq = journal_cur_seq(j) - 1; -+ spin_unlock(&j->lock); -+ -+ return seq; -+} -+ -+void bch2_journal_set_has_inum(struct journal *j, u64 inode, u64 seq) -+{ -+ size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8)); -+ struct journal_buf *buf; -+ -+ spin_lock(&j->lock); -+ -+ if ((buf = journal_seq_to_buf(j, seq))) -+ set_bit(h, buf->has_inode); -+ -+ spin_unlock(&j->lock); -+} -+ -+static int __journal_res_get(struct journal *j, struct journal_res *res, -+ unsigned flags) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct journal_buf *buf; -+ bool can_discard; -+ int ret; -+retry: -+ if (journal_res_get_fast(j, res, flags)) -+ return 0; -+ -+ if (bch2_journal_error(j)) -+ return -EROFS; -+ -+ spin_lock(&j->lock); -+ -+ /* -+ * Recheck after taking the lock, so we don't race with another thread -+ * that just did journal_entry_open() and call journal_entry_close() -+ * unnecessarily -+ */ -+ if (journal_res_get_fast(j, res, flags)) { -+ spin_unlock(&j->lock); -+ return 0; -+ } -+ -+ if (!(flags & JOURNAL_RES_GET_RESERVED) && -+ !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { -+ /* -+ * Don't want to close current journal entry, just need to -+ * invoke reclaim: -+ */ -+ ret = -ENOSPC; -+ goto unlock; -+ } -+ -+ /* -+ * If we couldn't get a reservation because the current buf filled up, -+ * and we had room for a bigger entry on disk, signal that we want to -+ * realloc the journal bufs: -+ */ -+ buf = journal_cur_buf(j); -+ if (journal_entry_is_open(j) && -+ buf->buf_size >> 9 < buf->disk_sectors && -+ buf->buf_size < JOURNAL_ENTRY_SIZE_MAX) -+ j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); -+ -+ if (journal_entry_is_open(j) && -+ !__journal_entry_close(j)) { -+ /* -+ * We failed to get a reservation on the current open journal -+ * entry because it's 
full, and we can't close it because -+ * there's still a previous one in flight: -+ */ -+ trace_journal_entry_full(c); -+ ret = -EAGAIN; -+ } else { -+ ret = journal_entry_open(j); -+ } -+unlock: -+ if ((ret == -EAGAIN || ret == -ENOSPC) && -+ !j->res_get_blocked_start) -+ j->res_get_blocked_start = local_clock() ?: 1; -+ -+ can_discard = j->can_discard; -+ spin_unlock(&j->lock); -+ -+ if (!ret) -+ goto retry; -+ -+ if (ret == -ENOSPC) { -+ WARN_ONCE(!can_discard && (flags & JOURNAL_RES_GET_RESERVED), -+ "JOURNAL_RES_GET_RESERVED set but journal full"); -+ -+ /* -+ * Journal is full - can't rely on reclaim from work item due to -+ * freezing: -+ */ -+ trace_journal_full(c); -+ -+ if (!(flags & JOURNAL_RES_GET_NONBLOCK)) { -+ if (can_discard) { -+ bch2_journal_do_discards(j); -+ goto retry; -+ } -+ -+ if (mutex_trylock(&j->reclaim_lock)) { -+ bch2_journal_reclaim(j); -+ mutex_unlock(&j->reclaim_lock); -+ } -+ } -+ -+ ret = -EAGAIN; -+ } -+ -+ return ret; -+} -+ -+/* -+ * Essentially the entry function to the journaling code. When bcachefs is doing -+ * a btree insert, it calls this function to get the current journal write. -+ * Journal write is the structure used set up journal writes. The calling -+ * function will then add its keys to the structure, queuing them for the next -+ * write. -+ * -+ * To ensure forward progress, the current task must not be holding any -+ * btree node write locks. 
-+ */ -+int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, -+ unsigned flags) -+{ -+ int ret; -+ -+ closure_wait_event(&j->async_wait, -+ (ret = __journal_res_get(j, res, flags)) != -EAGAIN || -+ (flags & JOURNAL_RES_GET_NONBLOCK)); -+ return ret; -+} -+ -+/* journal_preres: */ -+ -+static bool journal_preres_available(struct journal *j, -+ struct journal_preres *res, -+ unsigned new_u64s, -+ unsigned flags) -+{ -+ bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags); -+ -+ if (!ret) -+ bch2_journal_reclaim_work(&j->reclaim_work.work); -+ -+ return ret; -+} -+ -+int __bch2_journal_preres_get(struct journal *j, -+ struct journal_preres *res, -+ unsigned new_u64s, -+ unsigned flags) -+{ -+ int ret; -+ -+ closure_wait_event(&j->preres_wait, -+ (ret = bch2_journal_error(j)) || -+ journal_preres_available(j, res, new_u64s, flags)); -+ return ret; -+} -+ -+/* journal_entry_res: */ -+ -+void bch2_journal_entry_res_resize(struct journal *j, -+ struct journal_entry_res *res, -+ unsigned new_u64s) -+{ -+ union journal_res_state state; -+ int d = new_u64s - res->u64s; -+ -+ spin_lock(&j->lock); -+ -+ j->entry_u64s_reserved += d; -+ if (d <= 0) -+ goto out; -+ -+ j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d); -+ smp_mb(); -+ state = READ_ONCE(j->reservations); -+ -+ if (state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL && -+ state.cur_entry_offset > j->cur_entry_u64s) { -+ j->cur_entry_u64s += d; -+ /* -+ * Not enough room in current journal entry, have to flush it: -+ */ -+ __journal_entry_close(j); -+ } else { -+ journal_cur_buf(j)->u64s_reserved += d; -+ } -+out: -+ spin_unlock(&j->lock); -+ res->u64s += d; -+} -+ -+/* journal flushing: */ -+ -+u64 bch2_journal_last_unwritten_seq(struct journal *j) -+{ -+ u64 seq; -+ -+ spin_lock(&j->lock); -+ seq = journal_cur_seq(j); -+ if (j->reservations.prev_buf_unwritten) -+ seq--; -+ spin_unlock(&j->lock); -+ -+ return seq; -+} -+ -+/** -+ * bch2_journal_open_seq_async - try to open a 
new journal entry if @seq isn't -+ * open yet, or wait if we cannot -+ * -+ * used by the btree interior update machinery, when it needs to write a new -+ * btree root - every journal entry contains the roots of all the btrees, so it -+ * doesn't need to bother with getting a journal reservation -+ */ -+int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *cl) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ int ret; -+ -+ spin_lock(&j->lock); -+ -+ /* -+ * Can't try to open more than one sequence number ahead: -+ */ -+ BUG_ON(journal_cur_seq(j) < seq && !journal_entry_is_open(j)); -+ -+ if (journal_cur_seq(j) > seq || -+ journal_entry_is_open(j)) { -+ spin_unlock(&j->lock); -+ return 0; -+ } -+ -+ if (journal_cur_seq(j) < seq && -+ !__journal_entry_close(j)) { -+ /* haven't finished writing out the previous one: */ -+ trace_journal_entry_full(c); -+ ret = -EAGAIN; -+ } else { -+ BUG_ON(journal_cur_seq(j) != seq); -+ -+ ret = journal_entry_open(j); -+ } -+ -+ if ((ret == -EAGAIN || ret == -ENOSPC) && -+ !j->res_get_blocked_start) -+ j->res_get_blocked_start = local_clock() ?: 1; -+ -+ if (ret == -EAGAIN || ret == -ENOSPC) -+ closure_wait(&j->async_wait, cl); -+ -+ spin_unlock(&j->lock); -+ -+ if (ret == -ENOSPC) { -+ trace_journal_full(c); -+ bch2_journal_reclaim_work(&j->reclaim_work.work); -+ ret = -EAGAIN; -+ } -+ -+ return ret; -+} -+ -+static int journal_seq_error(struct journal *j, u64 seq) -+{ -+ union journal_res_state state = READ_ONCE(j->reservations); -+ -+ if (seq == journal_cur_seq(j)) -+ return bch2_journal_error(j); -+ -+ if (seq + 1 == journal_cur_seq(j) && -+ !state.prev_buf_unwritten && -+ seq > j->seq_ondisk) -+ return -EIO; -+ -+ return 0; -+} -+ -+static inline struct journal_buf * -+journal_seq_to_buf(struct journal *j, u64 seq) -+{ -+ /* seq should be for a journal entry that has been opened: */ -+ BUG_ON(seq > journal_cur_seq(j)); -+ BUG_ON(seq == journal_cur_seq(j) && -+ 
j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL); -+ -+ if (seq == journal_cur_seq(j)) -+ return journal_cur_buf(j); -+ if (seq + 1 == journal_cur_seq(j) && -+ j->reservations.prev_buf_unwritten) -+ return journal_prev_buf(j); -+ return NULL; -+} -+ -+/** -+ * bch2_journal_wait_on_seq - wait for a journal entry to be written -+ * -+ * does _not_ cause @seq to be written immediately - if there is no other -+ * activity to cause the relevant journal entry to be filled up or flushed it -+ * can wait for an arbitrary amount of time (up to @j->write_delay_ms, which is -+ * configurable). -+ */ -+void bch2_journal_wait_on_seq(struct journal *j, u64 seq, -+ struct closure *parent) -+{ -+ struct journal_buf *buf; -+ -+ spin_lock(&j->lock); -+ -+ if ((buf = journal_seq_to_buf(j, seq))) { -+ if (!closure_wait(&buf->wait, parent)) -+ BUG(); -+ -+ if (seq == journal_cur_seq(j)) { -+ smp_mb(); -+ if (bch2_journal_error(j)) -+ closure_wake_up(&buf->wait); -+ } -+ } -+ -+ spin_unlock(&j->lock); -+} -+ -+/** -+ * bch2_journal_flush_seq_async - wait for a journal entry to be written -+ * -+ * like bch2_journal_wait_on_seq, except that it triggers a write immediately if -+ * necessary -+ */ -+void bch2_journal_flush_seq_async(struct journal *j, u64 seq, -+ struct closure *parent) -+{ -+ struct journal_buf *buf; -+ -+ spin_lock(&j->lock); -+ -+ if (parent && -+ (buf = journal_seq_to_buf(j, seq))) -+ if (!closure_wait(&buf->wait, parent)) -+ BUG(); -+ -+ if (seq == journal_cur_seq(j)) -+ __journal_entry_close(j); -+ spin_unlock(&j->lock); -+} -+ -+static int journal_seq_flushed(struct journal *j, u64 seq) -+{ -+ int ret; -+ -+ spin_lock(&j->lock); -+ ret = seq <= j->seq_ondisk ? 
1 : journal_seq_error(j, seq); -+ -+ if (seq == journal_cur_seq(j)) -+ __journal_entry_close(j); -+ spin_unlock(&j->lock); -+ -+ return ret; -+} -+ -+int bch2_journal_flush_seq(struct journal *j, u64 seq) -+{ -+ u64 start_time = local_clock(); -+ int ret, ret2; -+ -+ ret = wait_event_killable(j->wait, (ret2 = journal_seq_flushed(j, seq))); -+ -+ bch2_time_stats_update(j->flush_seq_time, start_time); -+ -+ return ret ?: ret2 < 0 ? ret2 : 0; -+} -+ -+/** -+ * bch2_journal_meta_async - force a journal entry to be written -+ */ -+void bch2_journal_meta_async(struct journal *j, struct closure *parent) -+{ -+ struct journal_res res; -+ -+ memset(&res, 0, sizeof(res)); -+ -+ bch2_journal_res_get(j, &res, jset_u64s(0), 0); -+ bch2_journal_res_put(j, &res); -+ -+ bch2_journal_flush_seq_async(j, res.seq, parent); -+} -+ -+int bch2_journal_meta(struct journal *j) -+{ -+ struct journal_res res; -+ int ret; -+ -+ memset(&res, 0, sizeof(res)); -+ -+ ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); -+ if (ret) -+ return ret; -+ -+ bch2_journal_res_put(j, &res); -+ -+ return bch2_journal_flush_seq(j, res.seq); -+} -+ -+/* -+ * bch2_journal_flush_async - if there is an open journal entry, or a journal -+ * still being written, write it and wait for the write to complete -+ */ -+void bch2_journal_flush_async(struct journal *j, struct closure *parent) -+{ -+ u64 seq, journal_seq; -+ -+ spin_lock(&j->lock); -+ journal_seq = journal_cur_seq(j); -+ -+ if (journal_entry_is_open(j)) { -+ seq = journal_seq; -+ } else if (journal_seq) { -+ seq = journal_seq - 1; -+ } else { -+ spin_unlock(&j->lock); -+ return; -+ } -+ spin_unlock(&j->lock); -+ -+ bch2_journal_flush_seq_async(j, seq, parent); -+} -+ -+int bch2_journal_flush(struct journal *j) -+{ -+ u64 seq, journal_seq; -+ -+ spin_lock(&j->lock); -+ journal_seq = journal_cur_seq(j); -+ -+ if (journal_entry_is_open(j)) { -+ seq = journal_seq; -+ } else if (journal_seq) { -+ seq = journal_seq - 1; -+ } else { -+ spin_unlock(&j->lock); -+ 
return 0; -+ } -+ spin_unlock(&j->lock); -+ -+ return bch2_journal_flush_seq(j, seq); -+} -+ -+/* block/unlock the journal: */ -+ -+void bch2_journal_unblock(struct journal *j) -+{ -+ spin_lock(&j->lock); -+ j->blocked--; -+ spin_unlock(&j->lock); -+ -+ journal_wake(j); -+} -+ -+void bch2_journal_block(struct journal *j) -+{ -+ spin_lock(&j->lock); -+ j->blocked++; -+ spin_unlock(&j->lock); -+ -+ journal_quiesce(j); -+} -+ -+/* allocate journal on a device: */ -+ -+static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, -+ bool new_fs, struct closure *cl) -+{ -+ struct bch_fs *c = ca->fs; -+ struct journal_device *ja = &ca->journal; -+ struct bch_sb_field_journal *journal_buckets; -+ u64 *new_bucket_seq = NULL, *new_buckets = NULL; -+ int ret = 0; -+ -+ /* don't handle reducing nr of buckets yet: */ -+ if (nr <= ja->nr) -+ return 0; -+ -+ ret = -ENOMEM; -+ new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL); -+ new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL); -+ if (!new_buckets || !new_bucket_seq) -+ goto err; -+ -+ journal_buckets = bch2_sb_resize_journal(&ca->disk_sb, -+ nr + sizeof(*journal_buckets) / sizeof(u64)); -+ if (!journal_buckets) -+ goto err; -+ -+ /* -+ * We may be called from the device add path, before the new device has -+ * actually been added to the running filesystem: -+ */ -+ if (c) -+ spin_lock(&c->journal.lock); -+ -+ memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); -+ memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64)); -+ swap(new_buckets, ja->buckets); -+ swap(new_bucket_seq, ja->bucket_seq); -+ -+ if (c) -+ spin_unlock(&c->journal.lock); -+ -+ while (ja->nr < nr) { -+ struct open_bucket *ob = NULL; -+ unsigned pos; -+ long bucket; -+ -+ if (new_fs) { -+ bucket = bch2_bucket_alloc_new_fs(ca); -+ if (bucket < 0) { -+ ret = -ENOSPC; -+ goto err; -+ } -+ } else { -+ ob = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, -+ false, cl); -+ if (IS_ERR(ob)) { -+ ret = cl ? 
-EAGAIN : -ENOSPC; -+ goto err; -+ } -+ -+ bucket = sector_to_bucket(ca, ob->ptr.offset); -+ } -+ -+ if (c) { -+ percpu_down_read(&c->mark_lock); -+ spin_lock(&c->journal.lock); -+ } -+ -+ pos = ja->nr ? (ja->cur_idx + 1) % ja->nr : 0; -+ __array_insert_item(ja->buckets, ja->nr, pos); -+ __array_insert_item(ja->bucket_seq, ja->nr, pos); -+ __array_insert_item(journal_buckets->buckets, ja->nr, pos); -+ ja->nr++; -+ -+ ja->buckets[pos] = bucket; -+ ja->bucket_seq[pos] = 0; -+ journal_buckets->buckets[pos] = cpu_to_le64(bucket); -+ -+ if (pos <= ja->discard_idx) -+ ja->discard_idx = (ja->discard_idx + 1) % ja->nr; -+ if (pos <= ja->dirty_idx_ondisk) -+ ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; -+ if (pos <= ja->dirty_idx) -+ ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; -+ if (pos <= ja->cur_idx) -+ ja->cur_idx = (ja->cur_idx + 1) % ja->nr; -+ -+ bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_journal, -+ ca->mi.bucket_size, -+ gc_phase(GC_PHASE_SB), -+ 0); -+ -+ if (c) { -+ spin_unlock(&c->journal.lock); -+ percpu_up_read(&c->mark_lock); -+ } -+ -+ if (!new_fs) -+ bch2_open_bucket_put(c, ob); -+ } -+ -+ ret = 0; -+err: -+ kfree(new_bucket_seq); -+ kfree(new_buckets); -+ -+ return ret; -+} -+ -+/* -+ * Allocate more journal space at runtime - not currently making use if it, but -+ * the code works: -+ */ -+int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, -+ unsigned nr) -+{ -+ struct journal_device *ja = &ca->journal; -+ struct closure cl; -+ unsigned current_nr; -+ int ret; -+ -+ closure_init_stack(&cl); -+ -+ do { -+ struct disk_reservation disk_res = { 0, 0 }; -+ -+ closure_sync(&cl); -+ -+ mutex_lock(&c->sb_lock); -+ current_nr = ja->nr; -+ -+ /* -+ * note: journal buckets aren't really counted as _sectors_ used yet, so -+ * we don't need the disk reservation to avoid the BUG_ON() in buckets.c -+ * when space used goes up without a reservation - but we do need the -+ * reservation to ensure we'll actually be able to 
allocate: -+ */ -+ -+ if (bch2_disk_reservation_get(c, &disk_res, -+ bucket_to_sector(ca, nr - ja->nr), 1, 0)) { -+ mutex_unlock(&c->sb_lock); -+ return -ENOSPC; -+ } -+ -+ ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl); -+ -+ bch2_disk_reservation_put(c, &disk_res); -+ -+ if (ja->nr != current_nr) -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ } while (ret == -EAGAIN); -+ -+ return ret; -+} -+ -+int bch2_dev_journal_alloc(struct bch_dev *ca) -+{ -+ unsigned nr; -+ -+ if (dynamic_fault("bcachefs:add:journal_alloc")) -+ return -ENOMEM; -+ -+ /* -+ * clamp journal size to 1024 buckets or 512MB (in sectors), whichever -+ * is smaller: -+ */ -+ nr = clamp_t(unsigned, ca->mi.nbuckets >> 8, -+ BCH_JOURNAL_BUCKETS_MIN, -+ min(1 << 10, -+ (1 << 20) / ca->mi.bucket_size)); -+ -+ return __bch2_set_nr_journal_buckets(ca, nr, true, NULL); -+} -+ -+/* startup/shutdown: */ -+ -+static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) -+{ -+ union journal_res_state state; -+ struct journal_buf *w; -+ bool ret; -+ -+ spin_lock(&j->lock); -+ state = READ_ONCE(j->reservations); -+ w = j->buf + !state.idx; -+ -+ ret = state.prev_buf_unwritten && -+ bch2_bkey_has_device(bkey_i_to_s_c(&w->key), dev_idx); -+ spin_unlock(&j->lock); -+ -+ return ret; -+} -+ -+void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca) -+{ -+ wait_event(j->wait, !bch2_journal_writing_to_device(j, ca->dev_idx)); -+} -+ -+void bch2_fs_journal_stop(struct journal *j) -+{ -+ bch2_journal_flush_all_pins(j); -+ -+ wait_event(j->wait, journal_entry_close(j)); -+ -+ /* do we need to write another journal entry? 
*/ -+ if (test_bit(JOURNAL_NOT_EMPTY, &j->flags)) -+ bch2_journal_meta(j); -+ -+ journal_quiesce(j); -+ -+ BUG_ON(!bch2_journal_error(j) && -+ test_bit(JOURNAL_NOT_EMPTY, &j->flags)); -+ -+ cancel_delayed_work_sync(&j->write_work); -+ cancel_delayed_work_sync(&j->reclaim_work); -+} -+ -+int bch2_fs_journal_start(struct journal *j, u64 cur_seq, -+ struct list_head *journal_entries) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct journal_entry_pin_list *p; -+ struct journal_replay *i; -+ u64 last_seq = cur_seq, nr, seq; -+ -+ if (!list_empty(journal_entries)) -+ last_seq = le64_to_cpu(list_last_entry(journal_entries, -+ struct journal_replay, list)->j.last_seq); -+ -+ nr = cur_seq - last_seq; -+ -+ if (nr + 1 > j->pin.size) { -+ free_fifo(&j->pin); -+ init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL); -+ if (!j->pin.data) { -+ bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); -+ return -ENOMEM; -+ } -+ } -+ -+ j->replay_journal_seq = last_seq; -+ j->replay_journal_seq_end = cur_seq; -+ j->last_seq_ondisk = last_seq; -+ j->pin.front = last_seq; -+ j->pin.back = cur_seq; -+ atomic64_set(&j->seq, cur_seq - 1); -+ -+ fifo_for_each_entry_ptr(p, &j->pin, seq) { -+ INIT_LIST_HEAD(&p->list); -+ INIT_LIST_HEAD(&p->flushed); -+ atomic_set(&p->count, 1); -+ p->devs.nr = 0; -+ } -+ -+ list_for_each_entry(i, journal_entries, list) { -+ seq = le64_to_cpu(i->j.seq); -+ BUG_ON(seq >= cur_seq); -+ -+ if (seq < last_seq) -+ continue; -+ -+ journal_seq_pin(j, seq)->devs = i->devs; -+ } -+ -+ spin_lock(&j->lock); -+ -+ set_bit(JOURNAL_STARTED, &j->flags); -+ -+ journal_pin_new_entry(j, 1); -+ bch2_journal_buf_init(j); -+ -+ c->last_bucket_seq_cleanup = journal_cur_seq(j); -+ -+ bch2_journal_space_available(j); -+ spin_unlock(&j->lock); -+ -+ return 0; -+} -+ -+/* init/exit: */ -+ -+void bch2_dev_journal_exit(struct bch_dev *ca) -+{ -+ kfree(ca->journal.bio); -+ kfree(ca->journal.buckets); -+ kfree(ca->journal.bucket_seq); -+ -+ 
ca->journal.bio = NULL; -+ ca->journal.buckets = NULL; -+ ca->journal.bucket_seq = NULL; -+} -+ -+int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) -+{ -+ struct journal_device *ja = &ca->journal; -+ struct bch_sb_field_journal *journal_buckets = -+ bch2_sb_get_journal(sb); -+ unsigned i; -+ -+ ja->nr = bch2_nr_journal_buckets(journal_buckets); -+ -+ ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); -+ if (!ja->bucket_seq) -+ return -ENOMEM; -+ -+ ca->journal.bio = bio_kmalloc(GFP_KERNEL, -+ DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE)); -+ if (!ca->journal.bio) -+ return -ENOMEM; -+ -+ ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); -+ if (!ja->buckets) -+ return -ENOMEM; -+ -+ for (i = 0; i < ja->nr; i++) -+ ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]); -+ -+ return 0; -+} -+ -+void bch2_fs_journal_exit(struct journal *j) -+{ -+ kvpfree(j->buf[1].data, j->buf[1].buf_size); -+ kvpfree(j->buf[0].data, j->buf[0].buf_size); -+ free_fifo(&j->pin); -+} -+ -+int bch2_fs_journal_init(struct journal *j) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ static struct lock_class_key res_key; -+ int ret = 0; -+ -+ pr_verbose_init(c->opts, ""); -+ -+ spin_lock_init(&j->lock); -+ spin_lock_init(&j->err_lock); -+ init_waitqueue_head(&j->wait); -+ INIT_DELAYED_WORK(&j->write_work, journal_write_work); -+ INIT_DELAYED_WORK(&j->reclaim_work, bch2_journal_reclaim_work); -+ init_waitqueue_head(&j->pin_flush_wait); -+ mutex_init(&j->reclaim_lock); -+ mutex_init(&j->discard_lock); -+ -+ lockdep_init_map(&j->res_map, "journal res", &res_key, 0); -+ -+ j->buf[0].buf_size = JOURNAL_ENTRY_SIZE_MIN; -+ j->buf[1].buf_size = JOURNAL_ENTRY_SIZE_MIN; -+ j->write_delay_ms = 1000; -+ j->reclaim_delay_ms = 100; -+ -+ /* Btree roots: */ -+ j->entry_u64s_reserved += -+ BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX); -+ -+ atomic64_set(&j->reservations.counter, -+ ((union journal_res_state) -+ { .cur_entry_offset = 
JOURNAL_ENTRY_CLOSED_VAL }).v); -+ -+ if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || -+ !(j->buf[0].data = kvpmalloc(j->buf[0].buf_size, GFP_KERNEL)) || -+ !(j->buf[1].data = kvpmalloc(j->buf[1].buf_size, GFP_KERNEL))) { -+ ret = -ENOMEM; -+ goto out; -+ } -+ -+ j->pin.front = j->pin.back = 1; -+out: -+ pr_verbose_init(c->opts, "ret %i", ret); -+ return ret; -+} -+ -+/* debug: */ -+ -+void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ union journal_res_state s; -+ struct bch_dev *ca; -+ unsigned iter; -+ -+ rcu_read_lock(); -+ spin_lock(&j->lock); -+ s = READ_ONCE(j->reservations); -+ -+ pr_buf(out, -+ "active journal entries:\t%llu\n" -+ "seq:\t\t\t%llu\n" -+ "last_seq:\t\t%llu\n" -+ "last_seq_ondisk:\t%llu\n" -+ "prereserved:\t\t%u/%u\n" -+ "current entry sectors:\t%u\n" -+ "current entry:\t\t", -+ fifo_used(&j->pin), -+ journal_cur_seq(j), -+ journal_last_seq(j), -+ j->last_seq_ondisk, -+ j->prereserved.reserved, -+ j->prereserved.remaining, -+ j->cur_entry_sectors); -+ -+ switch (s.cur_entry_offset) { -+ case JOURNAL_ENTRY_ERROR_VAL: -+ pr_buf(out, "error\n"); -+ break; -+ case JOURNAL_ENTRY_CLOSED_VAL: -+ pr_buf(out, "closed\n"); -+ break; -+ default: -+ pr_buf(out, "%u/%u\n", -+ s.cur_entry_offset, -+ j->cur_entry_u64s); -+ break; -+ } -+ -+ pr_buf(out, -+ "current entry refs:\t%u\n" -+ "prev entry unwritten:\t", -+ journal_state_count(s, s.idx)); -+ -+ if (s.prev_buf_unwritten) -+ pr_buf(out, "yes, ref %u sectors %u\n", -+ journal_state_count(s, !s.idx), -+ journal_prev_buf(j)->sectors); -+ else -+ pr_buf(out, "no\n"); -+ -+ pr_buf(out, -+ "need write:\t\t%i\n" -+ "replay done:\t\t%i\n", -+ test_bit(JOURNAL_NEED_WRITE, &j->flags), -+ test_bit(JOURNAL_REPLAY_DONE, &j->flags)); -+ -+ for_each_member_device_rcu(ca, c, iter, -+ &c->rw_devs[BCH_DATA_journal]) { -+ struct journal_device *ja = &ca->journal; -+ -+ if (!ja->nr) -+ continue; -+ -+ pr_buf(out, -+ "dev 
%u:\n" -+ "\tnr\t\t%u\n" -+ "\tavailable\t%u:%u\n" -+ "\tdiscard_idx\t\t%u\n" -+ "\tdirty_idx_ondisk\t%u (seq %llu)\n" -+ "\tdirty_idx\t\t%u (seq %llu)\n" -+ "\tcur_idx\t\t%u (seq %llu)\n", -+ iter, ja->nr, -+ bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), -+ ja->sectors_free, -+ ja->discard_idx, -+ ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk], -+ ja->dirty_idx, ja->bucket_seq[ja->dirty_idx], -+ ja->cur_idx, ja->bucket_seq[ja->cur_idx]); -+ } -+ -+ spin_unlock(&j->lock); -+ rcu_read_unlock(); -+} -+ -+void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) -+{ -+ struct journal_entry_pin_list *pin_list; -+ struct journal_entry_pin *pin; -+ u64 i; -+ -+ spin_lock(&j->lock); -+ fifo_for_each_entry_ptr(pin_list, &j->pin, i) { -+ pr_buf(out, "%llu: count %u\n", -+ i, atomic_read(&pin_list->count)); -+ -+ list_for_each_entry(pin, &pin_list->list, list) -+ pr_buf(out, "\t%px %ps\n", -+ pin, pin->flush); -+ -+ if (!list_empty(&pin_list->flushed)) -+ pr_buf(out, "flushed:\n"); -+ -+ list_for_each_entry(pin, &pin_list->flushed, list) -+ pr_buf(out, "\t%px %ps\n", -+ pin, pin->flush); -+ } -+ spin_unlock(&j->lock); -+} -diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h -new file mode 100644 -index 000000000000..f60bc964ee1f ---- /dev/null -+++ b/fs/bcachefs/journal.h -@@ -0,0 +1,520 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_JOURNAL_H -+#define _BCACHEFS_JOURNAL_H -+ -+/* -+ * THE JOURNAL: -+ * -+ * The primary purpose of the journal is to log updates (insertions) to the -+ * b-tree, to avoid having to do synchronous updates to the b-tree on disk. -+ * -+ * Without the journal, the b-tree is always internally consistent on -+ * disk - and in fact, in the earliest incarnations bcache didn't have a journal -+ * but did handle unclean shutdowns by doing all index updates synchronously -+ * (with coalescing). 
-+ * -+ * Updates to interior nodes still happen synchronously and without the journal -+ * (for simplicity) - this may change eventually but updates to interior nodes -+ * are rare enough it's not a huge priority. -+ * -+ * This means the journal is relatively separate from the b-tree; it consists of -+ * just a list of keys and journal replay consists of just redoing those -+ * insertions in same order that they appear in the journal. -+ * -+ * PERSISTENCE: -+ * -+ * For synchronous updates (where we're waiting on the index update to hit -+ * disk), the journal entry will be written out immediately (or as soon as -+ * possible, if the write for the previous journal entry was still in flight). -+ * -+ * Synchronous updates are specified by passing a closure (@flush_cl) to -+ * bch2_btree_insert() or bch_btree_insert_node(), which then pass that parameter -+ * down to the journalling code. That closure will will wait on the journal -+ * write to complete (via closure_wait()). -+ * -+ * If the index update wasn't synchronous, the journal entry will be -+ * written out after 10 ms have elapsed, by default (the delay_ms field -+ * in struct journal). -+ * -+ * JOURNAL ENTRIES: -+ * -+ * A journal entry is variable size (struct jset), it's got a fixed length -+ * header and then a variable number of struct jset_entry entries. -+ * -+ * Journal entries are identified by monotonically increasing 64 bit sequence -+ * numbers - jset->seq; other places in the code refer to this sequence number. -+ * -+ * A jset_entry entry contains one or more bkeys (which is what gets inserted -+ * into the b-tree). We need a container to indicate which b-tree the key is -+ * for; also, the roots of the various b-trees are stored in jset_entry entries -+ * (one for each b-tree) - this lets us add new b-tree types without changing -+ * the on disk format. 
-+ * -+ * We also keep some things in the journal header that are logically part of the -+ * superblock - all the things that are frequently updated. This is for future -+ * bcache on raw flash support; the superblock (which will become another -+ * journal) can't be moved or wear leveled, so it contains just enough -+ * information to find the main journal, and the superblock only has to be -+ * rewritten when we want to move/wear level the main journal. -+ * -+ * JOURNAL LAYOUT ON DISK: -+ * -+ * The journal is written to a ringbuffer of buckets (which is kept in the -+ * superblock); the individual buckets are not necessarily contiguous on disk -+ * which means that journal entries are not allowed to span buckets, but also -+ * that we can resize the journal at runtime if desired (unimplemented). -+ * -+ * The journal buckets exist in the same pool as all the other buckets that are -+ * managed by the allocator and garbage collection - garbage collection marks -+ * the journal buckets as metadata buckets. -+ * -+ * OPEN/DIRTY JOURNAL ENTRIES: -+ * -+ * Open/dirty journal entries are journal entries that contain b-tree updates -+ * that have not yet been written out to the b-tree on disk. We have to track -+ * which journal entries are dirty, and we also have to avoid wrapping around -+ * the journal and overwriting old but still dirty journal entries with new -+ * journal entries. -+ * -+ * On disk, this is represented with the "last_seq" field of struct jset; -+ * last_seq is the first sequence number that journal replay has to replay. -+ * -+ * To avoid overwriting dirty journal entries on disk, we keep a mapping (in -+ * journal_device->seq) of for each journal bucket, the highest sequence number -+ * any journal entry it contains. Then, by comparing that against last_seq we -+ * can determine whether that journal bucket contains dirty journal entries or -+ * not. 
-+ * -+ * To track which journal entries are dirty, we maintain a fifo of refcounts -+ * (where each entry corresponds to a specific sequence number) - when a ref -+ * goes to 0, that journal entry is no longer dirty. -+ * -+ * Journalling of index updates is done at the same time as the b-tree itself is -+ * being modified (see btree_insert_key()); when we add the key to the journal -+ * the pending b-tree write takes a ref on the journal entry the key was added -+ * to. If a pending b-tree write would need to take refs on multiple dirty -+ * journal entries, it only keeps the ref on the oldest one (since a newer -+ * journal entry will still be replayed if an older entry was dirty). -+ * -+ * JOURNAL FILLING UP: -+ * -+ * There are two ways the journal could fill up; either we could run out of -+ * space to write to, or we could have too many open journal entries and run out -+ * of room in the fifo of refcounts. Since those refcounts are decremented -+ * without any locking we can't safely resize that fifo, so we handle it the -+ * same way. -+ * -+ * If the journal fills up, we start flushing dirty btree nodes until we can -+ * allocate space for a journal write again - preferentially flushing btree -+ * nodes that are pinning the oldest journal entries first. 
-+ */ -+ -+#include -+ -+#include "journal_types.h" -+ -+struct bch_fs; -+ -+static inline void journal_wake(struct journal *j) -+{ -+ wake_up(&j->wait); -+ closure_wake_up(&j->async_wait); -+ closure_wake_up(&j->preres_wait); -+} -+ -+static inline struct journal_buf *journal_cur_buf(struct journal *j) -+{ -+ return j->buf + j->reservations.idx; -+} -+ -+static inline struct journal_buf *journal_prev_buf(struct journal *j) -+{ -+ return j->buf + !j->reservations.idx; -+} -+ -+/* Sequence number of oldest dirty journal entry */ -+ -+static inline u64 journal_last_seq(struct journal *j) -+{ -+ return j->pin.front; -+} -+ -+static inline u64 journal_cur_seq(struct journal *j) -+{ -+ BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); -+ -+ return j->pin.back - 1; -+} -+ -+u64 bch2_inode_journal_seq(struct journal *, u64); -+void bch2_journal_set_has_inum(struct journal *, u64, u64); -+ -+static inline int journal_state_count(union journal_res_state s, int idx) -+{ -+ return idx == 0 ? s.buf0_count : s.buf1_count; -+} -+ -+static inline void journal_state_inc(union journal_res_state *s) -+{ -+ s->buf0_count += s->idx == 0; -+ s->buf1_count += s->idx == 1; -+} -+ -+static inline void bch2_journal_set_has_inode(struct journal *j, -+ struct journal_res *res, -+ u64 inum) -+{ -+ struct journal_buf *buf = &j->buf[res->idx]; -+ unsigned long bit = hash_64(inum, ilog2(sizeof(buf->has_inode) * 8)); -+ -+ /* avoid atomic op if possible */ -+ if (unlikely(!test_bit(bit, buf->has_inode))) -+ set_bit(bit, buf->has_inode); -+} -+ -+/* -+ * Amount of space that will be taken up by some keys in the journal (i.e. 
-+ * including the jset header) -+ */ -+static inline unsigned jset_u64s(unsigned u64s) -+{ -+ return u64s + sizeof(struct jset_entry) / sizeof(u64); -+} -+ -+static inline int journal_entry_overhead(struct journal *j) -+{ -+ return sizeof(struct jset) / sizeof(u64) + j->entry_u64s_reserved; -+} -+ -+static inline struct jset_entry * -+bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s) -+{ -+ struct jset *jset = buf->data; -+ struct jset_entry *entry = vstruct_idx(jset, le32_to_cpu(jset->u64s)); -+ -+ memset(entry, 0, sizeof(*entry)); -+ entry->u64s = cpu_to_le16(u64s); -+ -+ le32_add_cpu(&jset->u64s, jset_u64s(u64s)); -+ -+ return entry; -+} -+ -+static inline struct jset_entry * -+journal_res_entry(struct journal *j, struct journal_res *res) -+{ -+ return vstruct_idx(j->buf[res->idx].data, res->offset); -+} -+ -+static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type, -+ enum btree_id id, unsigned level, -+ const void *data, unsigned u64s) -+{ -+ memset(entry, 0, sizeof(*entry)); -+ entry->u64s = cpu_to_le16(u64s); -+ entry->type = type; -+ entry->btree_id = id; -+ entry->level = level; -+ memcpy_u64s_small(entry->_data, data, u64s); -+ -+ return jset_u64s(u64s); -+} -+ -+static inline void bch2_journal_add_entry(struct journal *j, struct journal_res *res, -+ unsigned type, enum btree_id id, -+ unsigned level, -+ const void *data, unsigned u64s) -+{ -+ unsigned actual = journal_entry_set(journal_res_entry(j, res), -+ type, id, level, data, u64s); -+ -+ EBUG_ON(!res->ref); -+ EBUG_ON(actual > res->u64s); -+ -+ res->offset += actual; -+ res->u64s -= actual; -+} -+ -+static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res, -+ enum btree_id id, const struct bkey_i *k) -+{ -+ bch2_journal_add_entry(j, res, BCH_JSET_ENTRY_btree_keys, -+ id, 0, k, k->k.u64s); -+} -+ -+static inline bool journal_entry_empty(struct jset *j) -+{ -+ struct jset_entry *i; -+ -+ if (j->seq != j->last_seq) -+ return 
false; -+ -+ vstruct_for_each(j, i) -+ if (i->type == BCH_JSET_ENTRY_btree_keys && i->u64s) -+ return false; -+ return true; -+} -+ -+void __bch2_journal_buf_put(struct journal *, bool); -+ -+static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, -+ bool need_write_just_set) -+{ -+ union journal_res_state s; -+ -+ s.v = atomic64_sub_return(((union journal_res_state) { -+ .buf0_count = idx == 0, -+ .buf1_count = idx == 1, -+ }).v, &j->reservations.counter); -+ if (!journal_state_count(s, idx)) { -+ EBUG_ON(s.idx == idx || !s.prev_buf_unwritten); -+ __bch2_journal_buf_put(j, need_write_just_set); -+ } -+} -+ -+/* -+ * This function releases the journal write structure so other threads can -+ * then proceed to add their keys as well. -+ */ -+static inline void bch2_journal_res_put(struct journal *j, -+ struct journal_res *res) -+{ -+ if (!res->ref) -+ return; -+ -+ lock_release(&j->res_map, _THIS_IP_); -+ -+ while (res->u64s) -+ bch2_journal_add_entry(j, res, -+ BCH_JSET_ENTRY_btree_keys, -+ 0, 0, NULL, 0); -+ -+ bch2_journal_buf_put(j, res->idx, false); -+ -+ res->ref = 0; -+} -+ -+int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, -+ unsigned); -+ -+#define JOURNAL_RES_GET_NONBLOCK (1 << 0) -+#define JOURNAL_RES_GET_CHECK (1 << 1) -+#define JOURNAL_RES_GET_RESERVED (1 << 2) -+#define JOURNAL_RES_GET_RECLAIM (1 << 3) -+ -+static inline int journal_res_get_fast(struct journal *j, -+ struct journal_res *res, -+ unsigned flags) -+{ -+ union journal_res_state old, new; -+ u64 v = atomic64_read(&j->reservations.counter); -+ -+ do { -+ old.v = new.v = v; -+ -+ /* -+ * Check if there is still room in the current journal -+ * entry: -+ */ -+ if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s) -+ return 0; -+ -+ EBUG_ON(!journal_state_count(new, new.idx)); -+ -+ if (!(flags & JOURNAL_RES_GET_RESERVED) && -+ !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) -+ return 0; -+ -+ if (flags & JOURNAL_RES_GET_CHECK) -+ return 1; -+ -+ 
new.cur_entry_offset += res->u64s; -+ journal_state_inc(&new); -+ } while ((v = atomic64_cmpxchg(&j->reservations.counter, -+ old.v, new.v)) != old.v); -+ -+ res->ref = true; -+ res->idx = old.idx; -+ res->offset = old.cur_entry_offset; -+ res->seq = le64_to_cpu(j->buf[old.idx].data->seq); -+ return 1; -+} -+ -+static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res, -+ unsigned u64s, unsigned flags) -+{ -+ int ret; -+ -+ EBUG_ON(res->ref); -+ EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); -+ -+ res->u64s = u64s; -+ -+ if (journal_res_get_fast(j, res, flags)) -+ goto out; -+ -+ ret = bch2_journal_res_get_slowpath(j, res, flags); -+ if (ret) -+ return ret; -+out: -+ if (!(flags & JOURNAL_RES_GET_CHECK)) { -+ lock_acquire_shared(&j->res_map, 0, -+ (flags & JOURNAL_RES_GET_NONBLOCK) != 0, -+ NULL, _THIS_IP_); -+ EBUG_ON(!res->ref); -+ } -+ return 0; -+} -+ -+/* journal_preres: */ -+ -+static inline bool journal_check_may_get_unreserved(struct journal *j) -+{ -+ union journal_preres_state s = READ_ONCE(j->prereserved); -+ bool ret = s.reserved <= s.remaining && -+ fifo_free(&j->pin) > 8; -+ -+ lockdep_assert_held(&j->lock); -+ -+ if (ret != test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { -+ if (ret) { -+ set_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags); -+ journal_wake(j); -+ } else { -+ clear_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags); -+ } -+ } -+ return ret; -+} -+ -+static inline void bch2_journal_preres_put(struct journal *j, -+ struct journal_preres *res) -+{ -+ union journal_preres_state s = { .reserved = res->u64s }; -+ -+ if (!res->u64s) -+ return; -+ -+ s.v = atomic64_sub_return(s.v, &j->prereserved.counter); -+ res->u64s = 0; -+ closure_wake_up(&j->preres_wait); -+ -+ if (s.reserved <= s.remaining && -+ !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { -+ spin_lock(&j->lock); -+ journal_check_may_get_unreserved(j); -+ spin_unlock(&j->lock); -+ } -+} -+ -+int __bch2_journal_preres_get(struct journal *, -+ struct journal_preres *, 
unsigned, unsigned); -+ -+static inline int bch2_journal_preres_get_fast(struct journal *j, -+ struct journal_preres *res, -+ unsigned new_u64s, -+ unsigned flags) -+{ -+ int d = new_u64s - res->u64s; -+ union journal_preres_state old, new; -+ u64 v = atomic64_read(&j->prereserved.counter); -+ -+ do { -+ old.v = new.v = v; -+ -+ new.reserved += d; -+ -+ /* -+ * If we're being called from the journal reclaim path, we have -+ * to unconditionally give out the pre-reservation, there's -+ * nothing else sensible we can do - otherwise we'd recurse back -+ * into the reclaim path and deadlock: -+ */ -+ -+ if (!(flags & JOURNAL_RES_GET_RECLAIM) && -+ new.reserved > new.remaining) -+ return 0; -+ } while ((v = atomic64_cmpxchg(&j->prereserved.counter, -+ old.v, new.v)) != old.v); -+ -+ res->u64s += d; -+ return 1; -+} -+ -+static inline int bch2_journal_preres_get(struct journal *j, -+ struct journal_preres *res, -+ unsigned new_u64s, -+ unsigned flags) -+{ -+ if (new_u64s <= res->u64s) -+ return 0; -+ -+ if (bch2_journal_preres_get_fast(j, res, new_u64s, flags)) -+ return 0; -+ -+ if (flags & JOURNAL_RES_GET_NONBLOCK) -+ return -EAGAIN; -+ -+ return __bch2_journal_preres_get(j, res, new_u64s, flags); -+} -+ -+/* journal_entry_res: */ -+ -+void bch2_journal_entry_res_resize(struct journal *, -+ struct journal_entry_res *, -+ unsigned); -+ -+u64 bch2_journal_last_unwritten_seq(struct journal *); -+int bch2_journal_open_seq_async(struct journal *, u64, struct closure *); -+ -+void bch2_journal_wait_on_seq(struct journal *, u64, struct closure *); -+void bch2_journal_flush_seq_async(struct journal *, u64, struct closure *); -+void bch2_journal_flush_async(struct journal *, struct closure *); -+void bch2_journal_meta_async(struct journal *, struct closure *); -+ -+int bch2_journal_flush_seq(struct journal *, u64); -+int bch2_journal_flush(struct journal *); -+int bch2_journal_meta(struct journal *); -+ -+void bch2_journal_halt(struct journal *); -+ -+static inline int 
bch2_journal_error(struct journal *j) -+{ -+ return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL -+ ? -EIO : 0; -+} -+ -+struct bch_dev; -+ -+static inline bool journal_flushes_device(struct bch_dev *ca) -+{ -+ return true; -+} -+ -+static inline void bch2_journal_set_replay_done(struct journal *j) -+{ -+ BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); -+ set_bit(JOURNAL_REPLAY_DONE, &j->flags); -+} -+ -+void bch2_journal_unblock(struct journal *); -+void bch2_journal_block(struct journal *); -+ -+void bch2_journal_debug_to_text(struct printbuf *, struct journal *); -+void bch2_journal_pins_to_text(struct printbuf *, struct journal *); -+ -+int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, -+ unsigned nr); -+int bch2_dev_journal_alloc(struct bch_dev *); -+ -+void bch2_dev_journal_stop(struct journal *, struct bch_dev *); -+ -+void bch2_fs_journal_stop(struct journal *); -+int bch2_fs_journal_start(struct journal *, u64, struct list_head *); -+ -+void bch2_dev_journal_exit(struct bch_dev *); -+int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *); -+void bch2_fs_journal_exit(struct journal *); -+int bch2_fs_journal_init(struct journal *); -+ -+#endif /* _BCACHEFS_JOURNAL_H */ -diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c -new file mode 100644 -index 000000000000..bd0e6b371701 ---- /dev/null -+++ b/fs/bcachefs/journal_io.c -@@ -0,0 +1,1183 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "alloc_foreground.h" -+#include "btree_io.h" -+#include "btree_update_interior.h" -+#include "buckets.h" -+#include "checksum.h" -+#include "error.h" -+#include "io.h" -+#include "journal.h" -+#include "journal_io.h" -+#include "journal_reclaim.h" -+#include "replicas.h" -+ -+#include -+ -+struct journal_list { -+ struct closure cl; -+ struct mutex lock; -+ struct list_head *head; -+ int ret; -+}; -+ -+#define JOURNAL_ENTRY_ADD_OK 0 -+#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5 -+ -+/* -+ * Given a 
journal entry we just read, add it to the list of journal entries to -+ * be replayed: -+ */ -+static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, -+ struct journal_list *jlist, struct jset *j, -+ bool bad) -+{ -+ struct journal_replay *i, *pos; -+ struct bch_devs_list devs = { .nr = 0 }; -+ struct list_head *where; -+ size_t bytes = vstruct_bytes(j); -+ __le64 last_seq; -+ int ret; -+ -+ last_seq = !list_empty(jlist->head) -+ ? list_last_entry(jlist->head, struct journal_replay, -+ list)->j.last_seq -+ : 0; -+ -+ if (!c->opts.read_entire_journal) { -+ /* Is this entry older than the range we need? */ -+ if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) { -+ ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE; -+ goto out; -+ } -+ -+ /* Drop entries we don't need anymore */ -+ list_for_each_entry_safe(i, pos, jlist->head, list) { -+ if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq)) -+ break; -+ list_del(&i->list); -+ kvpfree(i, offsetof(struct journal_replay, j) + -+ vstruct_bytes(&i->j)); -+ } -+ } -+ -+ list_for_each_entry_reverse(i, jlist->head, list) { -+ if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) { -+ where = &i->list; -+ goto add; -+ } -+ } -+ -+ where = jlist->head; -+add: -+ i = where->next != jlist->head -+ ? container_of(where->next, struct journal_replay, list) -+ : NULL; -+ -+ /* -+ * Duplicate journal entries? 
If so we want the one that didn't have a -+ * checksum error: -+ */ -+ if (i && le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) { -+ if (i->bad) { -+ devs = i->devs; -+ list_del(&i->list); -+ kvpfree(i, offsetof(struct journal_replay, j) + -+ vstruct_bytes(&i->j)); -+ } else if (bad) { -+ goto found; -+ } else { -+ fsck_err_on(bytes != vstruct_bytes(&i->j) || -+ memcmp(j, &i->j, bytes), c, -+ "found duplicate but non identical journal entries (seq %llu)", -+ le64_to_cpu(j->seq)); -+ goto found; -+ } -+ -+ } -+ -+ i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); -+ if (!i) { -+ ret = -ENOMEM; -+ goto out; -+ } -+ -+ list_add(&i->list, where); -+ i->devs = devs; -+ i->bad = bad; -+ memcpy(&i->j, j, bytes); -+found: -+ if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx)) -+ bch2_dev_list_add_dev(&i->devs, ca->dev_idx); -+ else -+ fsck_err_on(1, c, "duplicate journal entries on same device"); -+ ret = JOURNAL_ENTRY_ADD_OK; -+out: -+fsck_err: -+ return ret; -+} -+ -+static struct nonce journal_nonce(const struct jset *jset) -+{ -+ return (struct nonce) {{ -+ [0] = 0, -+ [1] = ((__le32 *) &jset->seq)[0], -+ [2] = ((__le32 *) &jset->seq)[1], -+ [3] = BCH_NONCE_JOURNAL, -+ }}; -+} -+ -+/* this fills in a range with empty jset_entries: */ -+static void journal_entry_null_range(void *start, void *end) -+{ -+ struct jset_entry *entry; -+ -+ for (entry = start; entry != end; entry = vstruct_next(entry)) -+ memset(entry, 0, sizeof(*entry)); -+} -+ -+#define JOURNAL_ENTRY_REREAD 5 -+#define JOURNAL_ENTRY_NONE 6 -+#define JOURNAL_ENTRY_BAD 7 -+ -+#define journal_entry_err(c, msg, ...) 
\ -+({ \ -+ switch (write) { \ -+ case READ: \ -+ mustfix_fsck_err(c, msg, ##__VA_ARGS__); \ -+ break; \ -+ case WRITE: \ -+ bch_err(c, "corrupt metadata before write:\n" \ -+ msg, ##__VA_ARGS__); \ -+ if (bch2_fs_inconsistent(c)) { \ -+ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ -+ goto fsck_err; \ -+ } \ -+ break; \ -+ } \ -+ true; \ -+}) -+ -+#define journal_entry_err_on(cond, c, msg, ...) \ -+ ((cond) ? journal_entry_err(c, msg, ##__VA_ARGS__) : false) -+ -+static int journal_validate_key(struct bch_fs *c, struct jset *jset, -+ struct jset_entry *entry, -+ unsigned level, enum btree_id btree_id, -+ struct bkey_i *k, -+ const char *type, int write) -+{ -+ void *next = vstruct_next(entry); -+ const char *invalid; -+ unsigned version = le32_to_cpu(jset->version); -+ int ret = 0; -+ -+ if (journal_entry_err_on(!k->k.u64s, c, -+ "invalid %s in journal: k->u64s 0", type)) { -+ entry->u64s = cpu_to_le16((u64 *) k - entry->_data); -+ journal_entry_null_range(vstruct_next(entry), next); -+ return 0; -+ } -+ -+ if (journal_entry_err_on((void *) bkey_next(k) > -+ (void *) vstruct_next(entry), c, -+ "invalid %s in journal: extends past end of journal entry", -+ type)) { -+ entry->u64s = cpu_to_le16((u64 *) k - entry->_data); -+ journal_entry_null_range(vstruct_next(entry), next); -+ return 0; -+ } -+ -+ if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c, -+ "invalid %s in journal: bad format %u", -+ type, k->k.format)) { -+ le16_add_cpu(&entry->u64s, -k->k.u64s); -+ memmove(k, bkey_next(k), next - (void *) bkey_next(k)); -+ journal_entry_null_range(vstruct_next(entry), next); -+ return 0; -+ } -+ -+ if (!write) -+ bch2_bkey_compat(level, btree_id, version, -+ JSET_BIG_ENDIAN(jset), write, -+ NULL, bkey_to_packed(k)); -+ -+ invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k), -+ __btree_node_type(level, btree_id)); -+ if (invalid) { -+ char buf[160]; -+ -+ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k)); -+ mustfix_fsck_err(c, "invalid %s in journal: %s\n%s", -+ type, 
invalid, buf); -+ -+ le16_add_cpu(&entry->u64s, -k->k.u64s); -+ memmove(k, bkey_next(k), next - (void *) bkey_next(k)); -+ journal_entry_null_range(vstruct_next(entry), next); -+ return 0; -+ } -+ -+ if (write) -+ bch2_bkey_compat(level, btree_id, version, -+ JSET_BIG_ENDIAN(jset), write, -+ NULL, bkey_to_packed(k)); -+fsck_err: -+ return ret; -+} -+ -+static int journal_entry_validate_btree_keys(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ int write) -+{ -+ struct bkey_i *k; -+ -+ vstruct_for_each(entry, k) { -+ int ret = journal_validate_key(c, jset, entry, -+ entry->level, -+ entry->btree_id, -+ k, "key", write); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+static int journal_entry_validate_btree_root(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ int write) -+{ -+ struct bkey_i *k = entry->start; -+ int ret = 0; -+ -+ if (journal_entry_err_on(!entry->u64s || -+ le16_to_cpu(entry->u64s) != k->k.u64s, c, -+ "invalid btree root journal entry: wrong number of keys")) { -+ void *next = vstruct_next(entry); -+ /* -+ * we don't want to null out this jset_entry, -+ * just the contents, so that later we can tell -+ * we were _supposed_ to have a btree root -+ */ -+ entry->u64s = 0; -+ journal_entry_null_range(vstruct_next(entry), next); -+ return 0; -+ } -+ -+ return journal_validate_key(c, jset, entry, 1, entry->btree_id, k, -+ "btree root", write); -+fsck_err: -+ return ret; -+} -+ -+static int journal_entry_validate_prio_ptrs(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ int write) -+{ -+ /* obsolete, don't care: */ -+ return 0; -+} -+ -+static int journal_entry_validate_blacklist(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ int write) -+{ -+ int ret = 0; -+ -+ if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, c, -+ "invalid journal seq blacklist entry: bad size")) { -+ journal_entry_null_range(entry, vstruct_next(entry)); -+ } -+fsck_err: -+ 
return ret; -+} -+ -+static int journal_entry_validate_blacklist_v2(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ int write) -+{ -+ struct jset_entry_blacklist_v2 *bl_entry; -+ int ret = 0; -+ -+ if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, c, -+ "invalid journal seq blacklist entry: bad size")) { -+ journal_entry_null_range(entry, vstruct_next(entry)); -+ goto out; -+ } -+ -+ bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); -+ -+ if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > -+ le64_to_cpu(bl_entry->end), c, -+ "invalid journal seq blacklist entry: start > end")) { -+ journal_entry_null_range(entry, vstruct_next(entry)); -+ } -+out: -+fsck_err: -+ return ret; -+} -+ -+static int journal_entry_validate_usage(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ int write) -+{ -+ struct jset_entry_usage *u = -+ container_of(entry, struct jset_entry_usage, entry); -+ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); -+ int ret = 0; -+ -+ if (journal_entry_err_on(bytes < sizeof(*u), -+ c, -+ "invalid journal entry usage: bad size")) { -+ journal_entry_null_range(entry, vstruct_next(entry)); -+ return ret; -+ } -+ -+fsck_err: -+ return ret; -+} -+ -+static int journal_entry_validate_data_usage(struct bch_fs *c, -+ struct jset *jset, -+ struct jset_entry *entry, -+ int write) -+{ -+ struct jset_entry_data_usage *u = -+ container_of(entry, struct jset_entry_data_usage, entry); -+ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); -+ int ret = 0; -+ -+ if (journal_entry_err_on(bytes < sizeof(*u) || -+ bytes < sizeof(*u) + u->r.nr_devs, -+ c, -+ "invalid journal entry usage: bad size")) { -+ journal_entry_null_range(entry, vstruct_next(entry)); -+ return ret; -+ } -+ -+fsck_err: -+ return ret; -+} -+ -+struct jset_entry_ops { -+ int (*validate)(struct bch_fs *, struct jset *, -+ struct jset_entry *, int); -+}; -+ -+static const struct jset_entry_ops 
bch2_jset_entry_ops[] = { -+#define x(f, nr) \ -+ [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \ -+ .validate = journal_entry_validate_##f, \ -+ }, -+ BCH_JSET_ENTRY_TYPES() -+#undef x -+}; -+ -+static int journal_entry_validate(struct bch_fs *c, struct jset *jset, -+ struct jset_entry *entry, int write) -+{ -+ return entry->type < BCH_JSET_ENTRY_NR -+ ? bch2_jset_entry_ops[entry->type].validate(c, jset, -+ entry, write) -+ : 0; -+} -+ -+static int jset_validate_entries(struct bch_fs *c, struct jset *jset, -+ int write) -+{ -+ struct jset_entry *entry; -+ int ret = 0; -+ -+ vstruct_for_each(jset, entry) { -+ if (journal_entry_err_on(vstruct_next(entry) > -+ vstruct_last(jset), c, -+ "journal entry extends past end of jset")) { -+ jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); -+ break; -+ } -+ -+ ret = journal_entry_validate(c, jset, entry, write); -+ if (ret) -+ break; -+ } -+fsck_err: -+ return ret; -+} -+ -+static int jset_validate(struct bch_fs *c, -+ struct bch_dev *ca, -+ struct jset *jset, u64 sector, -+ unsigned bucket_sectors_left, -+ unsigned sectors_read, -+ int write) -+{ -+ size_t bytes = vstruct_bytes(jset); -+ struct bch_csum csum; -+ unsigned version; -+ int ret = 0; -+ -+ if (le64_to_cpu(jset->magic) != jset_magic(c)) -+ return JOURNAL_ENTRY_NONE; -+ -+ version = le32_to_cpu(jset->version); -+ if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD && -+ version < bcachefs_metadata_version_min) || -+ version >= bcachefs_metadata_version_max, c, -+ "%s sector %llu seq %llu: unknown journal entry version %u", -+ ca->name, sector, le64_to_cpu(jset->seq), -+ version)) { -+ /* XXX: note we might have missing journal entries */ -+ return JOURNAL_ENTRY_BAD; -+ } -+ -+ if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c, -+ "%s sector %llu seq %llu: journal entry too big (%zu bytes)", -+ ca->name, sector, le64_to_cpu(jset->seq), bytes)) { -+ /* XXX: note we might have missing journal entries */ -+ return JOURNAL_ENTRY_BAD; -+ } -+ -+ 
if (bytes > sectors_read << 9) -+ return JOURNAL_ENTRY_REREAD; -+ -+ if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c, -+ "%s sector %llu seq %llu: journal entry with unknown csum type %llu", -+ ca->name, sector, le64_to_cpu(jset->seq), -+ JSET_CSUM_TYPE(jset))) -+ return JOURNAL_ENTRY_BAD; -+ -+ csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); -+ if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c, -+ "%s sector %llu seq %llu: journal checksum bad", -+ ca->name, sector, le64_to_cpu(jset->seq))) { -+ /* XXX: retry IO, when we start retrying checksum errors */ -+ /* XXX: note we might have missing journal entries */ -+ return JOURNAL_ENTRY_BAD; -+ } -+ -+ bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), -+ jset->encrypted_start, -+ vstruct_end(jset) - (void *) jset->encrypted_start); -+ -+ if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c, -+ "invalid journal entry: last_seq > seq")) { -+ jset->last_seq = jset->seq; -+ return JOURNAL_ENTRY_BAD; -+ } -+ -+ return 0; -+fsck_err: -+ return ret; -+} -+ -+struct journal_read_buf { -+ void *data; -+ size_t size; -+}; -+ -+static int journal_read_buf_realloc(struct journal_read_buf *b, -+ size_t new_size) -+{ -+ void *n; -+ -+ /* the bios are sized for this many pages, max: */ -+ if (new_size > JOURNAL_ENTRY_SIZE_MAX) -+ return -ENOMEM; -+ -+ new_size = roundup_pow_of_two(new_size); -+ n = kvpmalloc(new_size, GFP_KERNEL); -+ if (!n) -+ return -ENOMEM; -+ -+ kvpfree(b->data, b->size); -+ b->data = n; -+ b->size = new_size; -+ return 0; -+} -+ -+static int journal_read_bucket(struct bch_dev *ca, -+ struct journal_read_buf *buf, -+ struct journal_list *jlist, -+ unsigned bucket) -+{ -+ struct bch_fs *c = ca->fs; -+ struct journal_device *ja = &ca->journal; -+ struct jset *j = NULL; -+ unsigned sectors, sectors_read = 0; -+ u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), -+ end = offset + ca->mi.bucket_size; -+ bool saw_bad = 
false; -+ int ret = 0; -+ -+ pr_debug("reading %u", bucket); -+ -+ while (offset < end) { -+ if (!sectors_read) { -+ struct bio *bio; -+reread: -+ sectors_read = min_t(unsigned, -+ end - offset, buf->size >> 9); -+ -+ bio = bio_kmalloc(GFP_KERNEL, -+ buf_pages(buf->data, -+ sectors_read << 9)); -+ bio_set_dev(bio, ca->disk_sb.bdev); -+ bio->bi_iter.bi_sector = offset; -+ bio_set_op_attrs(bio, REQ_OP_READ, 0); -+ bch2_bio_map(bio, buf->data, sectors_read << 9); -+ -+ ret = submit_bio_wait(bio); -+ bio_put(bio); -+ -+ if (bch2_dev_io_err_on(ret, ca, -+ "journal read from sector %llu", -+ offset) || -+ bch2_meta_read_fault("journal")) -+ return -EIO; -+ -+ j = buf->data; -+ } -+ -+ ret = jset_validate(c, ca, j, offset, -+ end - offset, sectors_read, -+ READ); -+ switch (ret) { -+ case BCH_FSCK_OK: -+ sectors = vstruct_sectors(j, c->block_bits); -+ break; -+ case JOURNAL_ENTRY_REREAD: -+ if (vstruct_bytes(j) > buf->size) { -+ ret = journal_read_buf_realloc(buf, -+ vstruct_bytes(j)); -+ if (ret) -+ return ret; -+ } -+ goto reread; -+ case JOURNAL_ENTRY_NONE: -+ if (!saw_bad) -+ return 0; -+ sectors = c->opts.block_size; -+ goto next_block; -+ case JOURNAL_ENTRY_BAD: -+ saw_bad = true; -+ /* -+ * On checksum error we don't really trust the size -+ * field of the journal entry we read, so try reading -+ * again at next block boundary: -+ */ -+ sectors = c->opts.block_size; -+ break; -+ default: -+ return ret; -+ } -+ -+ /* -+ * This happens sometimes if we don't have discards on - -+ * when we've partially overwritten a bucket with new -+ * journal entries. 
We don't need the rest of the -+ * bucket: -+ */ -+ if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) -+ return 0; -+ -+ ja->bucket_seq[bucket] = le64_to_cpu(j->seq); -+ -+ mutex_lock(&jlist->lock); -+ ret = journal_entry_add(c, ca, jlist, j, ret != 0); -+ mutex_unlock(&jlist->lock); -+ -+ switch (ret) { -+ case JOURNAL_ENTRY_ADD_OK: -+ break; -+ case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: -+ break; -+ default: -+ return ret; -+ } -+next_block: -+ pr_debug("next"); -+ offset += sectors; -+ sectors_read -= sectors; -+ j = ((void *) j) + (sectors << 9); -+ } -+ -+ return 0; -+} -+ -+static void bch2_journal_read_device(struct closure *cl) -+{ -+ struct journal_device *ja = -+ container_of(cl, struct journal_device, read); -+ struct bch_dev *ca = container_of(ja, struct bch_dev, journal); -+ struct journal_list *jlist = -+ container_of(cl->parent, struct journal_list, cl); -+ struct journal_read_buf buf = { NULL, 0 }; -+ u64 min_seq = U64_MAX; -+ unsigned i; -+ int ret; -+ -+ if (!ja->nr) -+ goto out; -+ -+ ret = journal_read_buf_realloc(&buf, PAGE_SIZE); -+ if (ret) -+ goto err; -+ -+ pr_debug("%u journal buckets", ja->nr); -+ -+ for (i = 0; i < ja->nr; i++) { -+ ret = journal_read_bucket(ca, &buf, jlist, i); -+ if (ret) -+ goto err; -+ } -+ -+ /* Find the journal bucket with the highest sequence number: */ -+ for (i = 0; i < ja->nr; i++) { -+ if (ja->bucket_seq[i] > ja->bucket_seq[ja->cur_idx]) -+ ja->cur_idx = i; -+ -+ min_seq = min(ja->bucket_seq[i], min_seq); -+ } -+ -+ /* -+ * If there's duplicate journal entries in multiple buckets (which -+ * definitely isn't supposed to happen, but...) 
- make sure to start -+ * cur_idx at the last of those buckets, so we don't deadlock trying to -+ * allocate -+ */ -+ while (ja->bucket_seq[ja->cur_idx] > min_seq && -+ ja->bucket_seq[ja->cur_idx] > -+ ja->bucket_seq[(ja->cur_idx + 1) % ja->nr]) -+ ja->cur_idx = (ja->cur_idx + 1) % ja->nr; -+ -+ ja->sectors_free = 0; -+ -+ /* -+ * Set dirty_idx to indicate the entire journal is full and needs to be -+ * reclaimed - journal reclaim will immediately reclaim whatever isn't -+ * pinned when it first runs: -+ */ -+ ja->discard_idx = ja->dirty_idx_ondisk = -+ ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; -+out: -+ kvpfree(buf.data, buf.size); -+ percpu_ref_put(&ca->io_ref); -+ closure_return(cl); -+ return; -+err: -+ mutex_lock(&jlist->lock); -+ jlist->ret = ret; -+ mutex_unlock(&jlist->lock); -+ goto out; -+} -+ -+int bch2_journal_read(struct bch_fs *c, struct list_head *list) -+{ -+ struct journal_list jlist; -+ struct journal_replay *i; -+ struct bch_dev *ca; -+ unsigned iter; -+ size_t keys = 0, entries = 0; -+ bool degraded = false; -+ int ret = 0; -+ -+ closure_init_stack(&jlist.cl); -+ mutex_init(&jlist.lock); -+ jlist.head = list; -+ jlist.ret = 0; -+ -+ for_each_member_device(ca, c, iter) { -+ if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && -+ !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) -+ continue; -+ -+ if ((ca->mi.state == BCH_MEMBER_STATE_RW || -+ ca->mi.state == BCH_MEMBER_STATE_RO) && -+ percpu_ref_tryget(&ca->io_ref)) -+ closure_call(&ca->journal.read, -+ bch2_journal_read_device, -+ system_unbound_wq, -+ &jlist.cl); -+ else -+ degraded = true; -+ } -+ -+ closure_sync(&jlist.cl); -+ -+ if (jlist.ret) -+ return jlist.ret; -+ -+ list_for_each_entry(i, list, list) { -+ struct jset_entry *entry; -+ struct bkey_i *k, *_n; -+ struct bch_replicas_padded replicas; -+ char buf[80]; -+ -+ ret = jset_validate_entries(c, &i->j, READ); -+ if (ret) -+ goto fsck_err; -+ -+ /* -+ * If we're mounting in degraded mode - if we didn't read all -+ * the devices 
- this is wrong: -+ */ -+ -+ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, i->devs); -+ -+ if (!degraded && -+ (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || -+ fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c, -+ "superblock not marked as containing replicas %s", -+ (bch2_replicas_entry_to_text(&PBUF(buf), -+ &replicas.e), buf)))) { -+ ret = bch2_mark_replicas(c, &replicas.e); -+ if (ret) -+ return ret; -+ } -+ -+ for_each_jset_key(k, _n, entry, &i->j) -+ keys++; -+ entries++; -+ } -+ -+ if (!list_empty(list)) { -+ i = list_last_entry(list, struct journal_replay, list); -+ -+ bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu", -+ keys, entries, le64_to_cpu(i->j.seq)); -+ } -+fsck_err: -+ return ret; -+} -+ -+/* journal write: */ -+ -+static void __journal_write_alloc(struct journal *j, -+ struct journal_buf *w, -+ struct dev_alloc_list *devs_sorted, -+ unsigned sectors, -+ unsigned *replicas, -+ unsigned replicas_want) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct journal_device *ja; -+ struct bch_dev *ca; -+ unsigned i; -+ -+ if (*replicas >= replicas_want) -+ return; -+ -+ for (i = 0; i < devs_sorted->nr; i++) { -+ ca = rcu_dereference(c->devs[devs_sorted->devs[i]]); -+ if (!ca) -+ continue; -+ -+ ja = &ca->journal; -+ -+ /* -+ * Check that we can use this device, and aren't already using -+ * it: -+ */ -+ if (!ca->mi.durability || -+ ca->mi.state != BCH_MEMBER_STATE_RW || -+ !ja->nr || -+ bch2_bkey_has_device(bkey_i_to_s_c(&w->key), -+ ca->dev_idx) || -+ sectors > ja->sectors_free) -+ continue; -+ -+ bch2_dev_stripe_increment(ca, &j->wp.stripe); -+ -+ bch2_bkey_append_ptr(&w->key, -+ (struct bch_extent_ptr) { -+ .offset = bucket_to_sector(ca, -+ ja->buckets[ja->cur_idx]) + -+ ca->mi.bucket_size - -+ ja->sectors_free, -+ .dev = ca->dev_idx, -+ }); -+ -+ ja->sectors_free -= sectors; -+ ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); -+ -+ *replicas += ca->mi.durability; -+ -+ if (*replicas >= 
replicas_want) -+ break; -+ } -+} -+ -+/** -+ * journal_next_bucket - move on to the next journal bucket if possible -+ */ -+static int journal_write_alloc(struct journal *j, struct journal_buf *w, -+ unsigned sectors) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct journal_device *ja; -+ struct bch_dev *ca; -+ struct dev_alloc_list devs_sorted; -+ unsigned i, replicas = 0, replicas_want = -+ READ_ONCE(c->opts.metadata_replicas); -+ -+ rcu_read_lock(); -+ -+ devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, -+ &c->rw_devs[BCH_DATA_journal]); -+ -+ __journal_write_alloc(j, w, &devs_sorted, -+ sectors, &replicas, replicas_want); -+ -+ if (replicas >= replicas_want) -+ goto done; -+ -+ for (i = 0; i < devs_sorted.nr; i++) { -+ ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); -+ if (!ca) -+ continue; -+ -+ ja = &ca->journal; -+ -+ if (sectors > ja->sectors_free && -+ sectors <= ca->mi.bucket_size && -+ bch2_journal_dev_buckets_available(j, ja, -+ journal_space_discarded)) { -+ ja->cur_idx = (ja->cur_idx + 1) % ja->nr; -+ ja->sectors_free = ca->mi.bucket_size; -+ -+ /* -+ * ja->bucket_seq[ja->cur_idx] must always have -+ * something sensible: -+ */ -+ ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); -+ } -+ } -+ -+ __journal_write_alloc(j, w, &devs_sorted, -+ sectors, &replicas, replicas_want); -+done: -+ rcu_read_unlock(); -+ -+ return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS; -+} -+ -+static void journal_write_compact(struct jset *jset) -+{ -+ struct jset_entry *i, *next, *prev = NULL; -+ -+ /* -+ * Simple compaction, dropping empty jset_entries (from journal -+ * reservations that weren't fully used) and merging jset_entries that -+ * can be. 
-+ * -+ * If we wanted to be really fancy here, we could sort all the keys in -+ * the jset and drop keys that were overwritten - probably not worth it: -+ */ -+ vstruct_for_each_safe(jset, i, next) { -+ unsigned u64s = le16_to_cpu(i->u64s); -+ -+ /* Empty entry: */ -+ if (!u64s) -+ continue; -+ -+ /* Can we merge with previous entry? */ -+ if (prev && -+ i->btree_id == prev->btree_id && -+ i->level == prev->level && -+ i->type == prev->type && -+ i->type == BCH_JSET_ENTRY_btree_keys && -+ le16_to_cpu(prev->u64s) + u64s <= U16_MAX) { -+ memmove_u64s_down(vstruct_next(prev), -+ i->_data, -+ u64s); -+ le16_add_cpu(&prev->u64s, u64s); -+ continue; -+ } -+ -+ /* Couldn't merge, move i into new position (after prev): */ -+ prev = prev ? vstruct_next(prev) : jset->start; -+ if (i != prev) -+ memmove_u64s_down(prev, i, jset_u64s(u64s)); -+ } -+ -+ prev = prev ? vstruct_next(prev) : jset->start; -+ jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); -+} -+ -+static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) -+{ -+ /* we aren't holding j->lock: */ -+ unsigned new_size = READ_ONCE(j->buf_size_want); -+ void *new_buf; -+ -+ if (buf->buf_size >= new_size) -+ return; -+ -+ new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN); -+ if (!new_buf) -+ return; -+ -+ memcpy(new_buf, buf->data, buf->buf_size); -+ kvpfree(buf->data, buf->buf_size); -+ buf->data = new_buf; -+ buf->buf_size = new_size; -+} -+ -+static void journal_write_done(struct closure *cl) -+{ -+ struct journal *j = container_of(cl, struct journal, io); -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct journal_buf *w = journal_prev_buf(j); -+ struct bch_devs_list devs = -+ bch2_bkey_devs(bkey_i_to_s_c(&w->key)); -+ struct bch_replicas_padded replicas; -+ u64 seq = le64_to_cpu(w->data->seq); -+ u64 last_seq = le64_to_cpu(w->data->last_seq); -+ -+ bch2_time_stats_update(j->write_time, j->write_start_time); -+ -+ if (!devs.nr) { -+ bch_err(c, "unable to write journal to 
sufficient devices"); -+ goto err; -+ } -+ -+ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs); -+ -+ if (bch2_mark_replicas(c, &replicas.e)) -+ goto err; -+ -+ spin_lock(&j->lock); -+ if (seq >= j->pin.front) -+ journal_seq_pin(j, seq)->devs = devs; -+ -+ j->seq_ondisk = seq; -+ j->last_seq_ondisk = last_seq; -+ bch2_journal_space_available(j); -+ -+ /* -+ * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard -+ * more buckets: -+ * -+ * Must come before signaling write completion, for -+ * bch2_fs_journal_stop(): -+ */ -+ mod_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0); -+out: -+ /* also must come before signalling write completion: */ -+ closure_debug_destroy(cl); -+ -+ BUG_ON(!j->reservations.prev_buf_unwritten); -+ atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v, -+ &j->reservations.counter); -+ -+ closure_wake_up(&w->wait); -+ journal_wake(j); -+ -+ if (test_bit(JOURNAL_NEED_WRITE, &j->flags)) -+ mod_delayed_work(system_freezable_wq, &j->write_work, 0); -+ spin_unlock(&j->lock); -+ return; -+err: -+ bch2_fatal_error(c); -+ spin_lock(&j->lock); -+ goto out; -+} -+ -+static void journal_write_endio(struct bio *bio) -+{ -+ struct bch_dev *ca = bio->bi_private; -+ struct journal *j = &ca->fs->journal; -+ -+ if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write: %s", -+ bch2_blk_status_to_str(bio->bi_status)) || -+ bch2_meta_write_fault("journal")) { -+ struct journal_buf *w = journal_prev_buf(j); -+ unsigned long flags; -+ -+ spin_lock_irqsave(&j->err_lock, flags); -+ bch2_bkey_drop_device(bkey_i_to_s(&w->key), ca->dev_idx); -+ spin_unlock_irqrestore(&j->err_lock, flags); -+ } -+ -+ closure_put(&j->io); -+ percpu_ref_put(&ca->io_ref); -+} -+ -+void bch2_journal_write(struct closure *cl) -+{ -+ struct journal *j = container_of(cl, struct journal, io); -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct bch_dev *ca; -+ struct journal_buf *w = journal_prev_buf(j); -+ struct 
jset_entry *start, *end; -+ struct jset *jset; -+ struct bio *bio; -+ struct bch_extent_ptr *ptr; -+ bool validate_before_checksum = false; -+ unsigned i, sectors, bytes, u64s; -+ int ret; -+ -+ bch2_journal_pin_put(j, le64_to_cpu(w->data->seq)); -+ -+ journal_buf_realloc(j, w); -+ jset = w->data; -+ -+ j->write_start_time = local_clock(); -+ -+ /* -+ * New btree roots are set by journalling them; when the journal entry -+ * gets written we have to propagate them to c->btree_roots -+ * -+ * But, every journal entry we write has to contain all the btree roots -+ * (at least for now); so after we copy btree roots to c->btree_roots we -+ * have to get any missing btree roots and add them to this journal -+ * entry: -+ */ -+ -+ bch2_journal_entries_to_btree_roots(c, jset); -+ -+ start = end = vstruct_last(jset); -+ -+ end = bch2_btree_roots_to_journal_entries(c, jset->start, end); -+ -+ end = bch2_journal_super_entries_add_common(c, end, -+ le64_to_cpu(jset->seq)); -+ u64s = (u64 *) end - (u64 *) start; -+ BUG_ON(u64s > j->entry_u64s_reserved); -+ -+ le32_add_cpu(&jset->u64s, u64s); -+ BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors); -+ -+ journal_write_compact(jset); -+ -+ jset->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); -+ jset->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); -+ jset->magic = cpu_to_le64(jset_magic(c)); -+ -+ jset->version = c->sb.version < bcachefs_metadata_version_new_versioning -+ ? 
cpu_to_le32(BCH_JSET_VERSION_OLD) -+ : cpu_to_le32(c->sb.version); -+ -+ SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); -+ SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); -+ -+ if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) -+ validate_before_checksum = true; -+ -+ if (le32_to_cpu(jset->version) < bcachefs_metadata_version_max) -+ validate_before_checksum = true; -+ -+ if (validate_before_checksum && -+ jset_validate_entries(c, jset, WRITE)) -+ goto err; -+ -+ bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), -+ jset->encrypted_start, -+ vstruct_end(jset) - (void *) jset->encrypted_start); -+ -+ jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), -+ journal_nonce(jset), jset); -+ -+ if (!validate_before_checksum && -+ jset_validate_entries(c, jset, WRITE)) -+ goto err; -+ -+ sectors = vstruct_sectors(jset, c->block_bits); -+ BUG_ON(sectors > w->sectors); -+ -+ bytes = vstruct_bytes(jset); -+ memset((void *) jset + bytes, 0, (sectors << 9) - bytes); -+ -+retry_alloc: -+ spin_lock(&j->lock); -+ ret = journal_write_alloc(j, w, sectors); -+ -+ if (ret && j->can_discard) { -+ spin_unlock(&j->lock); -+ bch2_journal_do_discards(j); -+ goto retry_alloc; -+ } -+ -+ /* -+ * write is allocated, no longer need to account for it in -+ * bch2_journal_space_available(): -+ */ -+ w->sectors = 0; -+ -+ /* -+ * journal entry has been compacted and allocated, recalculate space -+ * available: -+ */ -+ bch2_journal_space_available(j); -+ spin_unlock(&j->lock); -+ -+ if (ret) { -+ bch_err(c, "Unable to allocate journal write"); -+ bch2_fatal_error(c); -+ continue_at(cl, journal_write_done, system_highpri_wq); -+ return; -+ } -+ -+ /* -+ * XXX: we really should just disable the entire journal in nochanges -+ * mode -+ */ -+ if (c->opts.nochanges) -+ goto no_io; -+ -+ extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { -+ ca = bch_dev_bkey_exists(c, ptr->dev); -+ if (!percpu_ref_tryget(&ca->io_ref)) { -+ /* XXX: fix this */ -+ bch_err(c, "missing device for journal 
write\n"); -+ continue; -+ } -+ -+ this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], -+ sectors); -+ -+ bio = ca->journal.bio; -+ bio_reset(bio); -+ bio_set_dev(bio, ca->disk_sb.bdev); -+ bio->bi_iter.bi_sector = ptr->offset; -+ bio->bi_end_io = journal_write_endio; -+ bio->bi_private = ca; -+ bio_set_op_attrs(bio, REQ_OP_WRITE, -+ REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA); -+ bch2_bio_map(bio, jset, sectors << 9); -+ -+ trace_journal_write(bio); -+ closure_bio_submit(bio, cl); -+ -+ ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq); -+ } -+ -+ for_each_rw_member(ca, c, i) -+ if (journal_flushes_device(ca) && -+ !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) { -+ percpu_ref_get(&ca->io_ref); -+ -+ bio = ca->journal.bio; -+ bio_reset(bio); -+ bio_set_dev(bio, ca->disk_sb.bdev); -+ bio->bi_opf = REQ_OP_FLUSH; -+ bio->bi_end_io = journal_write_endio; -+ bio->bi_private = ca; -+ closure_bio_submit(bio, cl); -+ } -+ -+no_io: -+ bch2_bucket_seq_cleanup(c); -+ -+ continue_at(cl, journal_write_done, system_highpri_wq); -+ return; -+err: -+ bch2_inconsistent_error(c); -+ continue_at(cl, journal_write_done, system_highpri_wq); -+} -diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h -new file mode 100644 -index 000000000000..6958ee0f8cf2 ---- /dev/null -+++ b/fs/bcachefs/journal_io.h -@@ -0,0 +1,44 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_JOURNAL_IO_H -+#define _BCACHEFS_JOURNAL_IO_H -+ -+/* -+ * Only used for holding the journal entries we read in btree_journal_read() -+ * during cache_registration -+ */ -+struct journal_replay { -+ struct list_head list; -+ struct bch_devs_list devs; -+ /* checksum error, but we may want to try using it anyways: */ -+ bool bad; -+ /* must be last: */ -+ struct jset j; -+}; -+ -+static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, -+ struct jset_entry *entry, unsigned type) -+{ -+ while (entry < vstruct_last(jset)) { -+ if (entry->type == type) -+ return 
entry; -+ -+ entry = vstruct_next(entry); -+ } -+ -+ return NULL; -+} -+ -+#define for_each_jset_entry_type(entry, jset, type) \ -+ for (entry = (jset)->start; \ -+ (entry = __jset_entry_type_next(jset, entry, type)); \ -+ entry = vstruct_next(entry)) -+ -+#define for_each_jset_key(k, _n, entry, jset) \ -+ for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \ -+ vstruct_for_each_safe(entry, k, _n) -+ -+int bch2_journal_read(struct bch_fs *, struct list_head *); -+ -+void bch2_journal_write(struct closure *); -+ -+#endif /* _BCACHEFS_JOURNAL_IO_H */ -diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c -new file mode 100644 -index 000000000000..57591983eebd ---- /dev/null -+++ b/fs/bcachefs/journal_reclaim.c -@@ -0,0 +1,644 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "journal.h" -+#include "journal_io.h" -+#include "journal_reclaim.h" -+#include "replicas.h" -+#include "super.h" -+ -+/* Free space calculations: */ -+ -+static unsigned journal_space_from(struct journal_device *ja, -+ enum journal_space_from from) -+{ -+ switch (from) { -+ case journal_space_discarded: -+ return ja->discard_idx; -+ case journal_space_clean_ondisk: -+ return ja->dirty_idx_ondisk; -+ case journal_space_clean: -+ return ja->dirty_idx; -+ default: -+ BUG(); -+ } -+} -+ -+unsigned bch2_journal_dev_buckets_available(struct journal *j, -+ struct journal_device *ja, -+ enum journal_space_from from) -+{ -+ unsigned available = (journal_space_from(ja, from) - -+ ja->cur_idx - 1 + ja->nr) % ja->nr; -+ -+ /* -+ * Don't use the last bucket unless writing the new last_seq -+ * will make another bucket available: -+ */ -+ if (available && ja->dirty_idx_ondisk == ja->dirty_idx) -+ --available; -+ -+ return available; -+} -+ -+static void journal_set_remaining(struct journal *j, unsigned u64s_remaining) -+{ -+ union journal_preres_state old, new; -+ u64 v = atomic64_read(&j->prereserved.counter); -+ -+ do { -+ old.v = new.v = v; -+ 
new.remaining = u64s_remaining; -+ } while ((v = atomic64_cmpxchg(&j->prereserved.counter, -+ old.v, new.v)) != old.v); -+} -+ -+static struct journal_space { -+ unsigned next_entry; -+ unsigned remaining; -+} __journal_space_available(struct journal *j, unsigned nr_devs_want, -+ enum journal_space_from from) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct bch_dev *ca; -+ unsigned sectors_next_entry = UINT_MAX; -+ unsigned sectors_total = UINT_MAX; -+ unsigned i, nr_devs = 0; -+ unsigned unwritten_sectors = j->reservations.prev_buf_unwritten -+ ? journal_prev_buf(j)->sectors -+ : 0; -+ -+ rcu_read_lock(); -+ for_each_member_device_rcu(ca, c, i, -+ &c->rw_devs[BCH_DATA_journal]) { -+ struct journal_device *ja = &ca->journal; -+ unsigned buckets_this_device, sectors_this_device; -+ -+ if (!ja->nr) -+ continue; -+ -+ buckets_this_device = bch2_journal_dev_buckets_available(j, ja, from); -+ sectors_this_device = ja->sectors_free; -+ -+ /* -+ * We that we don't allocate the space for a journal entry -+ * until we write it out - thus, account for it here: -+ */ -+ if (unwritten_sectors >= sectors_this_device) { -+ if (!buckets_this_device) -+ continue; -+ -+ buckets_this_device--; -+ sectors_this_device = ca->mi.bucket_size; -+ } -+ -+ sectors_this_device -= unwritten_sectors; -+ -+ if (sectors_this_device < ca->mi.bucket_size && -+ buckets_this_device) { -+ buckets_this_device--; -+ sectors_this_device = ca->mi.bucket_size; -+ } -+ -+ if (!sectors_this_device) -+ continue; -+ -+ sectors_next_entry = min(sectors_next_entry, -+ sectors_this_device); -+ -+ sectors_total = min(sectors_total, -+ buckets_this_device * ca->mi.bucket_size + -+ sectors_this_device); -+ -+ nr_devs++; -+ } -+ rcu_read_unlock(); -+ -+ if (nr_devs < nr_devs_want) -+ return (struct journal_space) { 0, 0 }; -+ -+ return (struct journal_space) { -+ .next_entry = sectors_next_entry, -+ .remaining = max_t(int, 0, sectors_total - sectors_next_entry), -+ }; -+} -+ -+void 
bch2_journal_space_available(struct journal *j) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct bch_dev *ca; -+ struct journal_space discarded, clean_ondisk, clean; -+ unsigned overhead, u64s_remaining = 0; -+ unsigned max_entry_size = min(j->buf[0].buf_size >> 9, -+ j->buf[1].buf_size >> 9); -+ unsigned i, nr_online = 0, nr_devs_want; -+ bool can_discard = false; -+ int ret = 0; -+ -+ lockdep_assert_held(&j->lock); -+ -+ rcu_read_lock(); -+ for_each_member_device_rcu(ca, c, i, -+ &c->rw_devs[BCH_DATA_journal]) { -+ struct journal_device *ja = &ca->journal; -+ -+ if (!ja->nr) -+ continue; -+ -+ while (ja->dirty_idx != ja->cur_idx && -+ ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j)) -+ ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; -+ -+ while (ja->dirty_idx_ondisk != ja->dirty_idx && -+ ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk) -+ ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; -+ -+ if (ja->discard_idx != ja->dirty_idx_ondisk) -+ can_discard = true; -+ -+ max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size); -+ nr_online++; -+ } -+ rcu_read_unlock(); -+ -+ j->can_discard = can_discard; -+ -+ if (nr_online < c->opts.metadata_replicas_required) { -+ ret = -EROFS; -+ goto out; -+ } -+ -+ if (!fifo_free(&j->pin)) { -+ ret = -ENOSPC; -+ goto out; -+ } -+ -+ nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas); -+ -+ discarded = __journal_space_available(j, nr_devs_want, journal_space_discarded); -+ clean_ondisk = __journal_space_available(j, nr_devs_want, journal_space_clean_ondisk); -+ clean = __journal_space_available(j, nr_devs_want, journal_space_clean); -+ -+ if (!discarded.next_entry) -+ ret = -ENOSPC; -+ -+ overhead = DIV_ROUND_UP(clean.remaining, max_entry_size) * -+ journal_entry_overhead(j); -+ u64s_remaining = clean.remaining << 6; -+ u64s_remaining = max_t(int, 0, u64s_remaining - overhead); -+ u64s_remaining /= 4; -+out: -+ j->cur_entry_sectors = !ret ? 
discarded.next_entry : 0; -+ j->cur_entry_error = ret; -+ journal_set_remaining(j, u64s_remaining); -+ journal_check_may_get_unreserved(j); -+ -+ if (!ret) -+ journal_wake(j); -+} -+ -+/* Discards - last part of journal reclaim: */ -+ -+static bool should_discard_bucket(struct journal *j, struct journal_device *ja) -+{ -+ bool ret; -+ -+ spin_lock(&j->lock); -+ ret = ja->discard_idx != ja->dirty_idx_ondisk; -+ spin_unlock(&j->lock); -+ -+ return ret; -+} -+ -+/* -+ * Advance ja->discard_idx as long as it points to buckets that are no longer -+ * dirty, issuing discards if necessary: -+ */ -+void bch2_journal_do_discards(struct journal *j) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct bch_dev *ca; -+ unsigned iter; -+ -+ mutex_lock(&j->discard_lock); -+ -+ for_each_rw_member(ca, c, iter) { -+ struct journal_device *ja = &ca->journal; -+ -+ while (should_discard_bucket(j, ja)) { -+ if (ca->mi.discard && -+ blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) -+ blkdev_issue_discard(ca->disk_sb.bdev, -+ bucket_to_sector(ca, -+ ja->buckets[ja->discard_idx]), -+ ca->mi.bucket_size, GFP_NOIO, 0); -+ -+ spin_lock(&j->lock); -+ ja->discard_idx = (ja->discard_idx + 1) % ja->nr; -+ -+ bch2_journal_space_available(j); -+ spin_unlock(&j->lock); -+ } -+ } -+ -+ mutex_unlock(&j->discard_lock); -+} -+ -+/* -+ * Journal entry pinning - machinery for holding a reference on a given journal -+ * entry, holding it open to ensure it gets replayed during recovery: -+ */ -+ -+static void bch2_journal_reclaim_fast(struct journal *j) -+{ -+ struct journal_entry_pin_list temp; -+ bool popped = false; -+ -+ lockdep_assert_held(&j->lock); -+ -+ /* -+ * Unpin journal entries whose reference counts reached zero, meaning -+ * all btree nodes got written out -+ */ -+ while (!fifo_empty(&j->pin) && -+ !atomic_read(&fifo_peek_front(&j->pin).count)) { -+ BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list)); -+ BUG_ON(!fifo_pop(&j->pin, temp)); -+ popped = true; -+ } -+ 
-+ if (popped) -+ bch2_journal_space_available(j); -+} -+ -+void bch2_journal_pin_put(struct journal *j, u64 seq) -+{ -+ struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); -+ -+ if (atomic_dec_and_test(&pin_list->count)) { -+ spin_lock(&j->lock); -+ bch2_journal_reclaim_fast(j); -+ spin_unlock(&j->lock); -+ } -+} -+ -+static inline void __journal_pin_drop(struct journal *j, -+ struct journal_entry_pin *pin) -+{ -+ struct journal_entry_pin_list *pin_list; -+ -+ if (!journal_pin_active(pin)) -+ return; -+ -+ pin_list = journal_seq_pin(j, pin->seq); -+ pin->seq = 0; -+ list_del_init(&pin->list); -+ -+ /* -+ * Unpinning a journal entry make make journal_next_bucket() succeed, if -+ * writing a new last_seq will now make another bucket available: -+ */ -+ if (atomic_dec_and_test(&pin_list->count) && -+ pin_list == &fifo_peek_front(&j->pin)) -+ bch2_journal_reclaim_fast(j); -+ else if (fifo_used(&j->pin) == 1 && -+ atomic_read(&pin_list->count) == 1) -+ journal_wake(j); -+} -+ -+void bch2_journal_pin_drop(struct journal *j, -+ struct journal_entry_pin *pin) -+{ -+ spin_lock(&j->lock); -+ __journal_pin_drop(j, pin); -+ spin_unlock(&j->lock); -+} -+ -+static void bch2_journal_pin_add_locked(struct journal *j, u64 seq, -+ struct journal_entry_pin *pin, -+ journal_pin_flush_fn flush_fn) -+{ -+ struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); -+ -+ __journal_pin_drop(j, pin); -+ -+ BUG_ON(!atomic_read(&pin_list->count) && seq == journal_last_seq(j)); -+ -+ atomic_inc(&pin_list->count); -+ pin->seq = seq; -+ pin->flush = flush_fn; -+ -+ list_add(&pin->list, flush_fn ? 
&pin_list->list : &pin_list->flushed); -+} -+ -+void __bch2_journal_pin_add(struct journal *j, u64 seq, -+ struct journal_entry_pin *pin, -+ journal_pin_flush_fn flush_fn) -+{ -+ spin_lock(&j->lock); -+ bch2_journal_pin_add_locked(j, seq, pin, flush_fn); -+ spin_unlock(&j->lock); -+ -+ /* -+ * If the journal is currently full, we might want to call flush_fn -+ * immediately: -+ */ -+ journal_wake(j); -+} -+ -+void bch2_journal_pin_update(struct journal *j, u64 seq, -+ struct journal_entry_pin *pin, -+ journal_pin_flush_fn flush_fn) -+{ -+ if (journal_pin_active(pin) && pin->seq < seq) -+ return; -+ -+ spin_lock(&j->lock); -+ -+ if (pin->seq != seq) { -+ bch2_journal_pin_add_locked(j, seq, pin, flush_fn); -+ } else { -+ struct journal_entry_pin_list *pin_list = -+ journal_seq_pin(j, seq); -+ -+ /* -+ * If the pin is already pinning the right sequence number, it -+ * still might've already been flushed: -+ */ -+ list_move(&pin->list, &pin_list->list); -+ } -+ -+ spin_unlock(&j->lock); -+ -+ /* -+ * If the journal is currently full, we might want to call flush_fn -+ * immediately: -+ */ -+ journal_wake(j); -+} -+ -+void bch2_journal_pin_copy(struct journal *j, -+ struct journal_entry_pin *dst, -+ struct journal_entry_pin *src, -+ journal_pin_flush_fn flush_fn) -+{ -+ spin_lock(&j->lock); -+ -+ if (journal_pin_active(src) && -+ (!journal_pin_active(dst) || src->seq < dst->seq)) -+ bch2_journal_pin_add_locked(j, src->seq, dst, flush_fn); -+ -+ spin_unlock(&j->lock); -+} -+ -+/** -+ * bch2_journal_pin_flush: ensure journal pin callback is no longer running -+ */ -+void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin) -+{ -+ BUG_ON(journal_pin_active(pin)); -+ -+ wait_event(j->pin_flush_wait, j->flush_in_progress != pin); -+} -+ -+/* -+ * Journal reclaim: flush references to open journal entries to reclaim space in -+ * the journal -+ * -+ * May be done by the journal code in the background as needed to free up space -+ * for more journal entries, 
or as part of doing a clean shutdown, or to migrate -+ * data off of a specific device: -+ */ -+ -+static struct journal_entry_pin * -+journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq) -+{ -+ struct journal_entry_pin_list *pin_list; -+ struct journal_entry_pin *ret = NULL; -+ -+ if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)) -+ return NULL; -+ -+ spin_lock(&j->lock); -+ -+ fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) -+ if (*seq > max_seq || -+ (ret = list_first_entry_or_null(&pin_list->list, -+ struct journal_entry_pin, list))) -+ break; -+ -+ if (ret) { -+ list_move(&ret->list, &pin_list->flushed); -+ BUG_ON(j->flush_in_progress); -+ j->flush_in_progress = ret; -+ j->last_flushed = jiffies; -+ } -+ -+ spin_unlock(&j->lock); -+ -+ return ret; -+} -+ -+/* returns true if we did work */ -+static bool journal_flush_pins(struct journal *j, u64 seq_to_flush, -+ unsigned min_nr) -+{ -+ struct journal_entry_pin *pin; -+ bool ret = false; -+ u64 seq; -+ -+ lockdep_assert_held(&j->reclaim_lock); -+ -+ while ((pin = journal_get_next_pin(j, min_nr -+ ? U64_MAX : seq_to_flush, &seq))) { -+ if (min_nr) -+ min_nr--; -+ -+ pin->flush(j, pin, seq); -+ -+ BUG_ON(j->flush_in_progress != pin); -+ j->flush_in_progress = NULL; -+ wake_up(&j->pin_flush_wait); -+ ret = true; -+ } -+ -+ return ret; -+} -+ -+/** -+ * bch2_journal_reclaim - free up journal buckets -+ * -+ * Background journal reclaim writes out btree nodes. It should be run -+ * early enough so that we never completely run out of journal buckets. 
-+ * -+ * High watermarks for triggering background reclaim: -+ * - FIFO has fewer than 512 entries left -+ * - fewer than 25% journal buckets free -+ * -+ * Background reclaim runs until low watermarks are reached: -+ * - FIFO has more than 1024 entries left -+ * - more than 50% journal buckets free -+ * -+ * As long as a reclaim can complete in the time it takes to fill up -+ * 512 journal entries or 25% of all journal buckets, then -+ * journal_next_bucket() should not stall. -+ */ -+void bch2_journal_reclaim(struct journal *j) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct bch_dev *ca; -+ unsigned iter, min_nr = 0; -+ u64 seq_to_flush = 0; -+ -+ lockdep_assert_held(&j->reclaim_lock); -+ -+ bch2_journal_do_discards(j); -+ -+ spin_lock(&j->lock); -+ -+ for_each_rw_member(ca, c, iter) { -+ struct journal_device *ja = &ca->journal; -+ unsigned nr_buckets, bucket_to_flush; -+ -+ if (!ja->nr) -+ continue; -+ -+ /* Try to keep the journal at most half full: */ -+ nr_buckets = ja->nr / 2; -+ -+ /* And include pre-reservations: */ -+ nr_buckets += DIV_ROUND_UP(j->prereserved.reserved, -+ (ca->mi.bucket_size << 6) - -+ journal_entry_overhead(j)); -+ -+ nr_buckets = min(nr_buckets, ja->nr); -+ -+ bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr; -+ seq_to_flush = max(seq_to_flush, -+ ja->bucket_seq[bucket_to_flush]); -+ } -+ -+ /* Also flush if the pin fifo is more than half full */ -+ seq_to_flush = max_t(s64, seq_to_flush, -+ (s64) journal_cur_seq(j) - -+ (j->pin.size >> 1)); -+ spin_unlock(&j->lock); -+ -+ /* -+ * If it's been longer than j->reclaim_delay_ms since we last flushed, -+ * make sure to flush at least one journal pin: -+ */ -+ if (time_after(jiffies, j->last_flushed + -+ msecs_to_jiffies(j->reclaim_delay_ms))) -+ min_nr = 1; -+ -+ if (j->prereserved.reserved * 2 > j->prereserved.remaining) { -+ seq_to_flush = max(seq_to_flush, journal_last_seq(j)); -+ min_nr = 1; -+ } -+ -+ journal_flush_pins(j, seq_to_flush, min_nr); -+ 
-+ if (!bch2_journal_error(j)) -+ queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, -+ msecs_to_jiffies(j->reclaim_delay_ms)); -+} -+ -+void bch2_journal_reclaim_work(struct work_struct *work) -+{ -+ struct journal *j = container_of(to_delayed_work(work), -+ struct journal, reclaim_work); -+ -+ mutex_lock(&j->reclaim_lock); -+ bch2_journal_reclaim(j); -+ mutex_unlock(&j->reclaim_lock); -+} -+ -+static int journal_flush_done(struct journal *j, u64 seq_to_flush, -+ bool *did_work) -+{ -+ int ret; -+ -+ ret = bch2_journal_error(j); -+ if (ret) -+ return ret; -+ -+ mutex_lock(&j->reclaim_lock); -+ -+ *did_work = journal_flush_pins(j, seq_to_flush, 0); -+ -+ spin_lock(&j->lock); -+ /* -+ * If journal replay hasn't completed, the unreplayed journal entries -+ * hold refs on their corresponding sequence numbers -+ */ -+ ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) || -+ journal_last_seq(j) > seq_to_flush || -+ (fifo_used(&j->pin) == 1 && -+ atomic_read(&fifo_peek_front(&j->pin).count) == 1); -+ -+ spin_unlock(&j->lock); -+ mutex_unlock(&j->reclaim_lock); -+ -+ return ret; -+} -+ -+bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) -+{ -+ bool did_work = false; -+ -+ if (!test_bit(JOURNAL_STARTED, &j->flags)) -+ return false; -+ -+ closure_wait_event(&j->async_wait, -+ journal_flush_done(j, seq_to_flush, &did_work)); -+ -+ return did_work; -+} -+ -+int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) -+{ -+ struct bch_fs *c = container_of(j, struct bch_fs, journal); -+ struct journal_entry_pin_list *p; -+ u64 iter, seq = 0; -+ int ret = 0; -+ -+ spin_lock(&j->lock); -+ fifo_for_each_entry_ptr(p, &j->pin, iter) -+ if (dev_idx >= 0 -+ ? 
bch2_dev_list_has_dev(p->devs, dev_idx) -+ : p->devs.nr < c->opts.metadata_replicas) -+ seq = iter; -+ spin_unlock(&j->lock); -+ -+ bch2_journal_flush_pins(j, seq); -+ -+ ret = bch2_journal_error(j); -+ if (ret) -+ return ret; -+ -+ mutex_lock(&c->replicas_gc_lock); -+ bch2_replicas_gc_start(c, 1 << BCH_DATA_journal); -+ -+ seq = 0; -+ -+ spin_lock(&j->lock); -+ while (!ret && seq < j->pin.back) { -+ struct bch_replicas_padded replicas; -+ -+ seq = max(seq, journal_last_seq(j)); -+ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, -+ journal_seq_pin(j, seq)->devs); -+ seq++; -+ -+ spin_unlock(&j->lock); -+ ret = bch2_mark_replicas(c, &replicas.e); -+ spin_lock(&j->lock); -+ } -+ spin_unlock(&j->lock); -+ -+ ret = bch2_replicas_gc_end(c, ret); -+ mutex_unlock(&c->replicas_gc_lock); -+ -+ return ret; -+} -diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h -new file mode 100644 -index 000000000000..8128907a7623 ---- /dev/null -+++ b/fs/bcachefs/journal_reclaim.h -@@ -0,0 +1,69 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_JOURNAL_RECLAIM_H -+#define _BCACHEFS_JOURNAL_RECLAIM_H -+ -+#define JOURNAL_PIN (32 * 1024) -+ -+enum journal_space_from { -+ journal_space_discarded, -+ journal_space_clean_ondisk, -+ journal_space_clean, -+}; -+ -+unsigned bch2_journal_dev_buckets_available(struct journal *, -+ struct journal_device *, -+ enum journal_space_from); -+void bch2_journal_space_available(struct journal *); -+ -+static inline bool journal_pin_active(struct journal_entry_pin *pin) -+{ -+ return pin->seq != 0; -+} -+ -+static inline struct journal_entry_pin_list * -+journal_seq_pin(struct journal *j, u64 seq) -+{ -+ EBUG_ON(seq < j->pin.front || seq >= j->pin.back); -+ -+ return &j->pin.data[seq & j->pin.mask]; -+} -+ -+void bch2_journal_pin_put(struct journal *, u64); -+void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); -+ -+void __bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin 
*, -+ journal_pin_flush_fn); -+ -+static inline void bch2_journal_pin_add(struct journal *j, u64 seq, -+ struct journal_entry_pin *pin, -+ journal_pin_flush_fn flush_fn) -+{ -+ if (unlikely(!journal_pin_active(pin) || pin->seq > seq)) -+ __bch2_journal_pin_add(j, seq, pin, flush_fn); -+} -+ -+void bch2_journal_pin_update(struct journal *, u64, -+ struct journal_entry_pin *, -+ journal_pin_flush_fn); -+ -+void bch2_journal_pin_copy(struct journal *, -+ struct journal_entry_pin *, -+ struct journal_entry_pin *, -+ journal_pin_flush_fn); -+ -+void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *); -+ -+void bch2_journal_do_discards(struct journal *); -+void bch2_journal_reclaim(struct journal *); -+void bch2_journal_reclaim_work(struct work_struct *); -+ -+bool bch2_journal_flush_pins(struct journal *, u64); -+ -+static inline bool bch2_journal_flush_all_pins(struct journal *j) -+{ -+ return bch2_journal_flush_pins(j, U64_MAX); -+} -+ -+int bch2_journal_flush_device_pins(struct journal *, int); -+ -+#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */ -diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c -new file mode 100644 -index 000000000000..d0f1bbf8f6a7 ---- /dev/null -+++ b/fs/bcachefs/journal_seq_blacklist.c -@@ -0,0 +1,309 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "btree_iter.h" -+#include "eytzinger.h" -+#include "journal_seq_blacklist.h" -+#include "super-io.h" -+ -+/* -+ * journal_seq_blacklist machinery: -+ * -+ * To guarantee order of btree updates after a crash, we need to detect when a -+ * btree node entry (bset) is newer than the newest journal entry that was -+ * successfully written, and ignore it - effectively ignoring any btree updates -+ * that didn't make it into the journal. 
-+ * -+ * If we didn't do this, we might have two btree nodes, a and b, both with -+ * updates that weren't written to the journal yet: if b was updated after a, -+ * but b was flushed and not a - oops; on recovery we'll find that the updates -+ * to b happened, but not the updates to a that happened before it. -+ * -+ * Ignoring bsets that are newer than the newest journal entry is always safe, -+ * because everything they contain will also have been journalled - and must -+ * still be present in the journal on disk until a journal entry has been -+ * written _after_ that bset was written. -+ * -+ * To accomplish this, bsets record the newest journal sequence number they -+ * contain updates for; then, on startup, the btree code queries the journal -+ * code to ask "Is this sequence number newer than the newest journal entry? If -+ * so, ignore it." -+ * -+ * When this happens, we must blacklist that journal sequence number: the -+ * journal must not write any entries with that sequence number, and it must -+ * record that it was blacklisted so that a) on recovery we don't think we have -+ * missing journal entries and b) so that the btree code continues to ignore -+ * that bset, until that btree node is rewritten. 
-+ */ -+ -+static unsigned sb_blacklist_u64s(unsigned nr) -+{ -+ struct bch_sb_field_journal_seq_blacklist *bl; -+ -+ return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64); -+} -+ -+static struct bch_sb_field_journal_seq_blacklist * -+blacklist_entry_try_merge(struct bch_fs *c, -+ struct bch_sb_field_journal_seq_blacklist *bl, -+ unsigned i) -+{ -+ unsigned nr = blacklist_nr_entries(bl); -+ -+ if (le64_to_cpu(bl->start[i].end) >= -+ le64_to_cpu(bl->start[i + 1].start)) { -+ bl->start[i].end = bl->start[i + 1].end; -+ --nr; -+ memmove(&bl->start[i], -+ &bl->start[i + 1], -+ sizeof(bl->start[0]) * (nr - i)); -+ -+ bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, -+ sb_blacklist_u64s(nr)); -+ BUG_ON(!bl); -+ } -+ -+ return bl; -+} -+ -+int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) -+{ -+ struct bch_sb_field_journal_seq_blacklist *bl; -+ unsigned i, nr; -+ int ret = 0; -+ -+ mutex_lock(&c->sb_lock); -+ bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); -+ nr = blacklist_nr_entries(bl); -+ -+ if (bl) { -+ for (i = 0; i < nr; i++) { -+ struct journal_seq_blacklist_entry *e = -+ bl->start + i; -+ -+ if (start == le64_to_cpu(e->start) && -+ end == le64_to_cpu(e->end)) -+ goto out; -+ -+ if (start <= le64_to_cpu(e->start) && -+ end >= le64_to_cpu(e->end)) { -+ e->start = cpu_to_le64(start); -+ e->end = cpu_to_le64(end); -+ -+ if (i + 1 < nr) -+ bl = blacklist_entry_try_merge(c, -+ bl, i); -+ if (i) -+ bl = blacklist_entry_try_merge(c, -+ bl, i - 1); -+ goto out_write_sb; -+ } -+ } -+ } -+ -+ bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, -+ sb_blacklist_u64s(nr + 1)); -+ if (!bl) { -+ ret = -ENOMEM; -+ goto out; -+ } -+ -+ bl->start[nr].start = cpu_to_le64(start); -+ bl->start[nr].end = cpu_to_le64(end); -+out_write_sb: -+ c->disk_sb.sb->features[0] |= -+ 1ULL << BCH_FEATURE_journal_seq_blacklist_v3; -+ -+ ret = bch2_write_super(c); -+out: -+ mutex_unlock(&c->sb_lock); -+ -+ return ret; -+} -+ -+static int 
journal_seq_blacklist_table_cmp(const void *_l, -+ const void *_r, size_t size) -+{ -+ const struct journal_seq_blacklist_table_entry *l = _l; -+ const struct journal_seq_blacklist_table_entry *r = _r; -+ -+ return cmp_int(l->start, r->start); -+} -+ -+bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq, -+ bool dirty) -+{ -+ struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; -+ struct journal_seq_blacklist_table_entry search = { .start = seq }; -+ int idx; -+ -+ if (!t) -+ return false; -+ -+ idx = eytzinger0_find_le(t->entries, t->nr, -+ sizeof(t->entries[0]), -+ journal_seq_blacklist_table_cmp, -+ &search); -+ if (idx < 0) -+ return false; -+ -+ BUG_ON(t->entries[idx].start > seq); -+ -+ if (seq >= t->entries[idx].end) -+ return false; -+ -+ if (dirty) -+ t->entries[idx].dirty = true; -+ return true; -+} -+ -+int bch2_blacklist_table_initialize(struct bch_fs *c) -+{ -+ struct bch_sb_field_journal_seq_blacklist *bl = -+ bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); -+ struct journal_seq_blacklist_table *t; -+ unsigned i, nr = blacklist_nr_entries(bl); -+ -+ BUG_ON(c->journal_seq_blacklist_table); -+ -+ if (!bl) -+ return 0; -+ -+ t = kzalloc(sizeof(*t) + sizeof(t->entries[0]) * nr, -+ GFP_KERNEL); -+ if (!t) -+ return -ENOMEM; -+ -+ t->nr = nr; -+ -+ for (i = 0; i < nr; i++) { -+ t->entries[i].start = le64_to_cpu(bl->start[i].start); -+ t->entries[i].end = le64_to_cpu(bl->start[i].end); -+ } -+ -+ eytzinger0_sort(t->entries, -+ t->nr, -+ sizeof(t->entries[0]), -+ journal_seq_blacklist_table_cmp, -+ NULL); -+ -+ c->journal_seq_blacklist_table = t; -+ return 0; -+} -+ -+static const char * -+bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_journal_seq_blacklist *bl = -+ field_to_type(f, journal_seq_blacklist); -+ struct journal_seq_blacklist_entry *i; -+ unsigned nr = blacklist_nr_entries(bl); -+ -+ for (i = bl->start; i < bl->start + nr; i++) { -+ if 
(le64_to_cpu(i->start) >= -+ le64_to_cpu(i->end)) -+ return "entry start >= end"; -+ -+ if (i + 1 < bl->start + nr && -+ le64_to_cpu(i[0].end) > -+ le64_to_cpu(i[1].start)) -+ return "entries out of order"; -+ } -+ -+ return NULL; -+} -+ -+static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out, -+ struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_journal_seq_blacklist *bl = -+ field_to_type(f, journal_seq_blacklist); -+ struct journal_seq_blacklist_entry *i; -+ unsigned nr = blacklist_nr_entries(bl); -+ -+ for (i = bl->start; i < bl->start + nr; i++) { -+ if (i != bl->start) -+ pr_buf(out, " "); -+ -+ pr_buf(out, "%llu-%llu", -+ le64_to_cpu(i->start), -+ le64_to_cpu(i->end)); -+ } -+} -+ -+const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = { -+ .validate = bch2_sb_journal_seq_blacklist_validate, -+ .to_text = bch2_sb_journal_seq_blacklist_to_text -+}; -+ -+void bch2_blacklist_entries_gc(struct work_struct *work) -+{ -+ struct bch_fs *c = container_of(work, struct bch_fs, -+ journal_seq_blacklist_gc_work); -+ struct journal_seq_blacklist_table *t; -+ struct bch_sb_field_journal_seq_blacklist *bl; -+ struct journal_seq_blacklist_entry *src, *dst; -+ struct btree_trans trans; -+ unsigned i, nr, new_nr; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for (i = 0; i < BTREE_ID_NR; i++) { -+ struct btree_iter *iter; -+ struct btree *b; -+ -+ for_each_btree_node(&trans, iter, i, POS_MIN, -+ BTREE_ITER_PREFETCH, b) -+ if (test_bit(BCH_FS_STOPPING, &c->flags)) { -+ bch2_trans_exit(&trans); -+ return; -+ } -+ bch2_trans_iter_free(&trans, iter); -+ } -+ -+ ret = bch2_trans_exit(&trans); -+ if (ret) -+ return; -+ -+ mutex_lock(&c->sb_lock); -+ bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); -+ if (!bl) -+ goto out; -+ -+ nr = blacklist_nr_entries(bl); -+ dst = bl->start; -+ -+ t = c->journal_seq_blacklist_table; -+ BUG_ON(nr != t->nr); -+ -+ for (src = bl->start, i = eytzinger0_first(t->nr); -+ src < 
bl->start + nr; -+ src++, i = eytzinger0_next(i, nr)) { -+ BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); -+ BUG_ON(t->entries[i].end != le64_to_cpu(src->end)); -+ -+ if (t->entries[i].dirty) -+ *dst++ = *src; -+ } -+ -+ new_nr = dst - bl->start; -+ -+ bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr); -+ -+ if (new_nr != nr) { -+ bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, -+ new_nr ? sb_blacklist_u64s(new_nr) : 0); -+ BUG_ON(new_nr && !bl); -+ -+ if (!new_nr) -+ c->disk_sb.sb->features[0] &= -+ ~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3); -+ -+ bch2_write_super(c); -+ } -+out: -+ mutex_unlock(&c->sb_lock); -+} -diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h -new file mode 100644 -index 000000000000..afb886ec8e25 ---- /dev/null -+++ b/fs/bcachefs/journal_seq_blacklist.h -@@ -0,0 +1,22 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H -+#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H -+ -+static inline unsigned -+blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl) -+{ -+ return bl -+ ? 
((vstruct_end(&bl->field) - (void *) &bl->start[0]) / -+ sizeof(struct journal_seq_blacklist_entry)) -+ : 0; -+} -+ -+bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool); -+int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64); -+int bch2_blacklist_table_initialize(struct bch_fs *); -+ -+extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist; -+ -+void bch2_blacklist_entries_gc(struct work_struct *); -+ -+#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */ -diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h -new file mode 100644 -index 000000000000..154b51b891d3 ---- /dev/null -+++ b/fs/bcachefs/journal_types.h -@@ -0,0 +1,277 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_JOURNAL_TYPES_H -+#define _BCACHEFS_JOURNAL_TYPES_H -+ -+#include -+#include -+ -+#include "alloc_types.h" -+#include "super_types.h" -+#include "fifo.h" -+ -+struct journal_res; -+ -+/* -+ * We put two of these in struct journal; we used them for writes to the -+ * journal that are being staged or in flight. -+ */ -+struct journal_buf { -+ struct jset *data; -+ -+ BKEY_PADDED(key); -+ -+ struct closure_waitlist wait; -+ -+ unsigned buf_size; /* size in bytes of @data */ -+ unsigned sectors; /* maximum size for current entry */ -+ unsigned disk_sectors; /* maximum size entry could have been, if -+ buf_size was bigger */ -+ unsigned u64s_reserved; -+ /* bloom filter: */ -+ unsigned long has_inode[1024 / sizeof(unsigned long)]; -+}; -+ -+/* -+ * Something that makes a journal entry dirty - i.e. 
a btree node that has to be -+ * flushed: -+ */ -+ -+struct journal_entry_pin_list { -+ struct list_head list; -+ struct list_head flushed; -+ atomic_t count; -+ struct bch_devs_list devs; -+}; -+ -+struct journal; -+struct journal_entry_pin; -+typedef void (*journal_pin_flush_fn)(struct journal *j, -+ struct journal_entry_pin *, u64); -+ -+struct journal_entry_pin { -+ struct list_head list; -+ journal_pin_flush_fn flush; -+ u64 seq; -+}; -+ -+struct journal_res { -+ bool ref; -+ u8 idx; -+ u16 u64s; -+ u32 offset; -+ u64 seq; -+}; -+ -+/* -+ * For reserving space in the journal prior to getting a reservation on a -+ * particular journal entry: -+ */ -+struct journal_preres { -+ unsigned u64s; -+}; -+ -+union journal_res_state { -+ struct { -+ atomic64_t counter; -+ }; -+ -+ struct { -+ u64 v; -+ }; -+ -+ struct { -+ u64 cur_entry_offset:20, -+ idx:1, -+ prev_buf_unwritten:1, -+ buf0_count:21, -+ buf1_count:21; -+ }; -+}; -+ -+union journal_preres_state { -+ struct { -+ atomic64_t counter; -+ }; -+ -+ struct { -+ u64 v; -+ }; -+ -+ struct { -+ u32 reserved; -+ u32 remaining; -+ }; -+}; -+ -+/* bytes: */ -+#define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */ -+#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */ -+ -+/* -+ * We stash some journal state as sentinal values in cur_entry_offset: -+ * note - cur_entry_offset is in units of u64s -+ */ -+#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1) -+ -+#define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1) -+#define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX) -+ -+/* -+ * JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP, -+ * either because something's waiting on the write to complete or because it's -+ * been dirty too long and the timer's expired. 
-+ */ -+ -+enum { -+ JOURNAL_REPLAY_DONE, -+ JOURNAL_STARTED, -+ JOURNAL_RECLAIM_STARTED, -+ JOURNAL_NEED_WRITE, -+ JOURNAL_NOT_EMPTY, -+ JOURNAL_MAY_GET_UNRESERVED, -+}; -+ -+/* Embedded in struct bch_fs */ -+struct journal { -+ /* Fastpath stuff up front: */ -+ -+ unsigned long flags; -+ -+ union journal_res_state reservations; -+ -+ /* Max size of current journal entry */ -+ unsigned cur_entry_u64s; -+ unsigned cur_entry_sectors; -+ -+ /* -+ * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if -+ * insufficient devices: -+ */ -+ int cur_entry_error; -+ -+ union journal_preres_state prereserved; -+ -+ /* Reserved space in journal entry to be used just prior to write */ -+ unsigned entry_u64s_reserved; -+ -+ unsigned buf_size_want; -+ -+ /* -+ * Two journal entries -- one is currently open for new entries, the -+ * other is possibly being written out. -+ */ -+ struct journal_buf buf[2]; -+ -+ spinlock_t lock; -+ -+ /* if nonzero, we may not open a new journal entry: */ -+ unsigned blocked; -+ -+ /* Used when waiting because the journal was full */ -+ wait_queue_head_t wait; -+ struct closure_waitlist async_wait; -+ struct closure_waitlist preres_wait; -+ -+ struct closure io; -+ struct delayed_work write_work; -+ -+ /* Sequence number of most recent journal entry (last entry in @pin) */ -+ atomic64_t seq; -+ -+ /* seq, last_seq from the most recent journal entry successfully written */ -+ u64 seq_ondisk; -+ u64 last_seq_ondisk; -+ -+ /* -+ * FIFO of journal entries whose btree updates have not yet been -+ * written out. -+ * -+ * Each entry is a reference count. The position in the FIFO is the -+ * entry's sequence number relative to @seq. -+ * -+ * The journal entry itself holds a reference count, put when the -+ * journal entry is written out. Each btree node modified by the journal -+ * entry also holds a reference count, put when the btree node is -+ * written. -+ * -+ * When a reference count reaches zero, the journal entry is no longer -+ * needed. 
When all journal entries in the oldest journal bucket are no -+ * longer needed, the bucket can be discarded and reused. -+ */ -+ struct { -+ u64 front, back, size, mask; -+ struct journal_entry_pin_list *data; -+ } pin; -+ -+ u64 replay_journal_seq; -+ u64 replay_journal_seq_end; -+ -+ struct write_point wp; -+ spinlock_t err_lock; -+ -+ struct delayed_work reclaim_work; -+ struct mutex reclaim_lock; -+ unsigned long last_flushed; -+ struct journal_entry_pin *flush_in_progress; -+ wait_queue_head_t pin_flush_wait; -+ -+ /* protects advancing ja->discard_idx: */ -+ struct mutex discard_lock; -+ bool can_discard; -+ -+ unsigned write_delay_ms; -+ unsigned reclaim_delay_ms; -+ -+ u64 res_get_blocked_start; -+ u64 need_write_time; -+ u64 write_start_time; -+ -+ struct time_stats *write_time; -+ struct time_stats *delay_time; -+ struct time_stats *blocked_time; -+ struct time_stats *flush_seq_time; -+ -+#ifdef CONFIG_DEBUG_LOCK_ALLOC -+ struct lockdep_map res_map; -+#endif -+}; -+ -+/* -+ * Embedded in struct bch_dev. First three fields refer to the array of journal -+ * buckets, in bch_sb. -+ */ -+struct journal_device { -+ /* -+ * For each journal bucket, contains the max sequence number of the -+ * journal writes it contains - so we know when a bucket can be reused. 
-+ */ -+ u64 *bucket_seq; -+ -+ unsigned sectors_free; -+ -+ /* -+ * discard_idx <= dirty_idx_ondisk <= dirty_idx <= cur_idx: -+ */ -+ unsigned discard_idx; /* Next bucket to discard */ -+ unsigned dirty_idx_ondisk; -+ unsigned dirty_idx; -+ unsigned cur_idx; /* Journal bucket we're currently writing to */ -+ unsigned nr; -+ -+ u64 *buckets; -+ -+ /* Bio for journal reads/writes to this device */ -+ struct bio *bio; -+ -+ /* for bch_journal_read_device */ -+ struct closure read; -+}; -+ -+/* -+ * journal_entry_res - reserve space in every journal entry: -+ */ -+struct journal_entry_res { -+ unsigned u64s; -+}; -+ -+#endif /* _BCACHEFS_JOURNAL_TYPES_H */ -diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c -new file mode 100644 -index 000000000000..864dfaa67b7a ---- /dev/null -+++ b/fs/bcachefs/keylist.c -@@ -0,0 +1,67 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "keylist.h" -+ -+int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s, -+ size_t nr_inline_u64s, size_t new_u64s) -+{ -+ size_t oldsize = bch2_keylist_u64s(l); -+ size_t newsize = oldsize + new_u64s; -+ u64 *old_buf = l->keys_p == inline_u64s ? 
NULL : l->keys_p; -+ u64 *new_keys; -+ -+ newsize = roundup_pow_of_two(newsize); -+ -+ if (newsize <= nr_inline_u64s || -+ (old_buf && roundup_pow_of_two(oldsize) == newsize)) -+ return 0; -+ -+ new_keys = krealloc(old_buf, sizeof(u64) * newsize, GFP_NOIO); -+ if (!new_keys) -+ return -ENOMEM; -+ -+ if (!old_buf) -+ memcpy_u64s(new_keys, inline_u64s, oldsize); -+ -+ l->keys_p = new_keys; -+ l->top_p = new_keys + oldsize; -+ -+ return 0; -+} -+ -+void bch2_keylist_add_in_order(struct keylist *l, struct bkey_i *insert) -+{ -+ struct bkey_i *where; -+ -+ for_each_keylist_key(l, where) -+ if (bkey_cmp(insert->k.p, where->k.p) < 0) -+ break; -+ -+ memmove_u64s_up((u64 *) where + insert->k.u64s, -+ where, -+ ((u64 *) l->top) - ((u64 *) where)); -+ -+ l->top_p += insert->k.u64s; -+ bkey_copy(where, insert); -+} -+ -+void bch2_keylist_pop_front(struct keylist *l) -+{ -+ l->top_p -= bch2_keylist_front(l)->k.u64s; -+ -+ memmove_u64s_down(l->keys, -+ bkey_next(l->keys), -+ bch2_keylist_u64s(l)); -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_verify_keylist_sorted(struct keylist *l) -+{ -+ struct bkey_i *k; -+ -+ for_each_keylist_key(l, k) -+ BUG_ON(bkey_next(k) != l->top && -+ bkey_cmp(k->k.p, bkey_next(k)->k.p) >= 0); -+} -+#endif -diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h -new file mode 100644 -index 000000000000..195799bb20bc ---- /dev/null -+++ b/fs/bcachefs/keylist.h -@@ -0,0 +1,76 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_KEYLIST_H -+#define _BCACHEFS_KEYLIST_H -+ -+#include "keylist_types.h" -+ -+int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t); -+void bch2_keylist_add_in_order(struct keylist *, struct bkey_i *); -+void bch2_keylist_pop_front(struct keylist *); -+ -+static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys) -+{ -+ l->top_p = l->keys_p = inline_keys; -+} -+ -+static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys) -+{ -+ if (l->keys_p != inline_keys) -+ 
kfree(l->keys_p); -+ bch2_keylist_init(l, inline_keys); -+} -+ -+static inline void bch2_keylist_push(struct keylist *l) -+{ -+ l->top = bkey_next(l->top); -+} -+ -+static inline void bch2_keylist_add(struct keylist *l, const struct bkey_i *k) -+{ -+ bkey_copy(l->top, k); -+ bch2_keylist_push(l); -+} -+ -+static inline bool bch2_keylist_empty(struct keylist *l) -+{ -+ return l->top == l->keys; -+} -+ -+static inline size_t bch2_keylist_u64s(struct keylist *l) -+{ -+ return l->top_p - l->keys_p; -+} -+ -+static inline size_t bch2_keylist_bytes(struct keylist *l) -+{ -+ return bch2_keylist_u64s(l) * sizeof(u64); -+} -+ -+static inline struct bkey_i *bch2_keylist_front(struct keylist *l) -+{ -+ return l->keys; -+} -+ -+#define for_each_keylist_key(_keylist, _k) \ -+ for (_k = (_keylist)->keys; \ -+ _k != (_keylist)->top; \ -+ _k = bkey_next(_k)) -+ -+static inline u64 keylist_sectors(struct keylist *keys) -+{ -+ struct bkey_i *k; -+ u64 ret = 0; -+ -+ for_each_keylist_key(keys, k) -+ ret += k->k.size; -+ -+ return ret; -+} -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+void bch2_verify_keylist_sorted(struct keylist *); -+#else -+static inline void bch2_verify_keylist_sorted(struct keylist *l) {} -+#endif -+ -+#endif /* _BCACHEFS_KEYLIST_H */ -diff --git a/fs/bcachefs/keylist_types.h b/fs/bcachefs/keylist_types.h -new file mode 100644 -index 000000000000..4b3ff7d8a875 ---- /dev/null -+++ b/fs/bcachefs/keylist_types.h -@@ -0,0 +1,16 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_KEYLIST_TYPES_H -+#define _BCACHEFS_KEYLIST_TYPES_H -+ -+struct keylist { -+ union { -+ struct bkey_i *keys; -+ u64 *keys_p; -+ }; -+ union { -+ struct bkey_i *top; -+ u64 *top_p; -+ }; -+}; -+ -+#endif /* _BCACHEFS_KEYLIST_TYPES_H */ -diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c -new file mode 100644 -index 000000000000..96c8690adc5b ---- /dev/null -+++ b/fs/bcachefs/migrate.c -@@ -0,0 +1,170 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Code for moving data off a 
device. -+ */ -+ -+#include "bcachefs.h" -+#include "bkey_on_stack.h" -+#include "btree_update.h" -+#include "btree_update_interior.h" -+#include "buckets.h" -+#include "extents.h" -+#include "io.h" -+#include "journal.h" -+#include "keylist.h" -+#include "migrate.h" -+#include "move.h" -+#include "replicas.h" -+#include "super-io.h" -+ -+static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, -+ unsigned dev_idx, int flags, bool metadata) -+{ -+ unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas; -+ unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST; -+ unsigned degraded = metadata ? BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED; -+ unsigned nr_good; -+ -+ bch2_bkey_drop_device(k, dev_idx); -+ -+ nr_good = bch2_bkey_durability(c, k.s_c); -+ if ((!nr_good && !(flags & lost)) || -+ (nr_good < replicas && !(flags & degraded))) -+ return -EINVAL; -+ -+ return 0; -+} -+ -+static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags, -+ enum btree_id btree_id) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct bkey_on_stack sk; -+ int ret = 0; -+ -+ bkey_on_stack_init(&sk); -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN, -+ BTREE_ITER_PREFETCH); -+ -+ while ((k = bch2_btree_iter_peek(iter)).k && -+ !(ret = bkey_err(k))) { -+ if (!bch2_bkey_has_device(k, dev_idx)) { -+ bch2_btree_iter_next(iter); -+ continue; -+ } -+ -+ bkey_on_stack_reassemble(&sk, c, k); -+ -+ ret = drop_dev_ptrs(c, bkey_i_to_s(sk.k), -+ dev_idx, flags, false); -+ if (ret) -+ break; -+ -+ /* -+ * If the new extent no longer has any pointers, bch2_extent_normalize() -+ * will do the appropriate thing with it (turning it into a -+ * KEY_TYPE_error key, or just a discard if it was a cached extent) -+ */ -+ bch2_extent_normalize(c, bkey_i_to_s(sk.k)); -+ -+ bch2_btree_iter_set_pos(iter, 
bkey_start_pos(&sk.k->k)); -+ -+ bch2_trans_update(&trans, iter, sk.k, 0); -+ -+ ret = bch2_trans_commit(&trans, NULL, NULL, -+ BTREE_INSERT_NOFAIL); -+ -+ /* -+ * don't want to leave ret == -EINTR, since if we raced and -+ * something else overwrote the key we could spuriously return -+ * -EINTR below: -+ */ -+ if (ret == -EINTR) -+ ret = 0; -+ if (ret) -+ break; -+ } -+ -+ ret = bch2_trans_exit(&trans) ?: ret; -+ bkey_on_stack_exit(&sk, c); -+ -+ BUG_ON(ret == -EINTR); -+ -+ return ret; -+} -+ -+static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) -+{ -+ return __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_EXTENTS) ?: -+ __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_REFLINK); -+} -+ -+static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct closure cl; -+ struct btree *b; -+ unsigned id; -+ int ret; -+ -+ /* don't handle this yet: */ -+ if (flags & BCH_FORCE_IF_METADATA_LOST) -+ return -EINVAL; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ closure_init_stack(&cl); -+ -+ for (id = 0; id < BTREE_ID_NR; id++) { -+ for_each_btree_node(&trans, iter, id, POS_MIN, -+ BTREE_ITER_PREFETCH, b) { -+ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; -+retry: -+ if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key), -+ dev_idx)) -+ continue; -+ -+ bkey_copy(&tmp.k, &b->key); -+ -+ ret = drop_dev_ptrs(c, bkey_i_to_s(&tmp.k), -+ dev_idx, flags, true); -+ if (ret) { -+ bch_err(c, "Cannot drop device without losing data"); -+ goto err; -+ } -+ -+ ret = bch2_btree_node_update_key(c, iter, b, &tmp.k); -+ if (ret == -EINTR) { -+ b = bch2_btree_iter_peek_node(iter); -+ goto retry; -+ } -+ if (ret) { -+ bch_err(c, "Error updating btree node key: %i", ret); -+ goto err; -+ } -+ } -+ bch2_trans_iter_free(&trans, iter); -+ } -+ -+ /* flush relevant btree updates */ -+ closure_wait_event(&c->btree_interior_update_wait, -+ 
!bch2_btree_interior_updates_nr_pending(c)); -+ -+ ret = 0; -+err: -+ ret = bch2_trans_exit(&trans) ?: ret; -+ -+ BUG_ON(ret == -EINTR); -+ -+ return ret; -+} -+ -+int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags) -+{ -+ return bch2_dev_usrdata_drop(c, dev_idx, flags) ?: -+ bch2_dev_metadata_drop(c, dev_idx, flags); -+} -diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h -new file mode 100644 -index 000000000000..027efaa0d575 ---- /dev/null -+++ b/fs/bcachefs/migrate.h -@@ -0,0 +1,7 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_MIGRATE_H -+#define _BCACHEFS_MIGRATE_H -+ -+int bch2_dev_data_drop(struct bch_fs *, unsigned, int); -+ -+#endif /* _BCACHEFS_MIGRATE_H */ -diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c -new file mode 100644 -index 000000000000..62dcac79ed06 ---- /dev/null -+++ b/fs/bcachefs/move.c -@@ -0,0 +1,826 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "alloc_foreground.h" -+#include "bkey_on_stack.h" -+#include "btree_gc.h" -+#include "btree_update.h" -+#include "btree_update_interior.h" -+#include "buckets.h" -+#include "disk_groups.h" -+#include "inode.h" -+#include "io.h" -+#include "journal_reclaim.h" -+#include "move.h" -+#include "replicas.h" -+#include "super-io.h" -+#include "keylist.h" -+ -+#include -+#include -+ -+#include -+ -+#define SECTORS_IN_FLIGHT_PER_DEVICE 2048 -+ -+struct moving_io { -+ struct list_head list; -+ struct closure cl; -+ bool read_completed; -+ -+ unsigned read_sectors; -+ unsigned write_sectors; -+ -+ struct bch_read_bio rbio; -+ -+ struct migrate_write write; -+ /* Must be last since it is variable size */ -+ struct bio_vec bi_inline_vecs[0]; -+}; -+ -+struct moving_context { -+ /* Closure for waiting on all reads and writes to complete */ -+ struct closure cl; -+ -+ struct bch_move_stats *stats; -+ -+ struct list_head reads; -+ -+ /* in flight sectors: */ -+ atomic_t read_sectors; -+ atomic_t write_sectors; -+ -+ wait_queue_head_t 
wait; -+}; -+ -+static int bch2_migrate_index_update(struct bch_write_op *op) -+{ -+ struct bch_fs *c = op->c; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct migrate_write *m = -+ container_of(op, struct migrate_write, op); -+ struct keylist *keys = &op->insert_keys; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+ -+ iter = bch2_trans_get_iter(&trans, m->btree_id, -+ bkey_start_pos(&bch2_keylist_front(keys)->k), -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); -+ -+ while (1) { -+ struct bkey_s_c k; -+ struct bkey_i *insert; -+ struct bkey_i_extent *new; -+ BKEY_PADDED(k) _new, _insert; -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ bool did_work = false; -+ int nr; -+ -+ bch2_trans_reset(&trans, 0); -+ -+ k = bch2_btree_iter_peek_slot(iter); -+ ret = bkey_err(k); -+ if (ret) { -+ if (ret == -EINTR) -+ continue; -+ break; -+ } -+ -+ new = bkey_i_to_extent(bch2_keylist_front(keys)); -+ -+ if (bversion_cmp(k.k->version, new->k.version) || -+ !bch2_bkey_matches_ptr(c, k, m->ptr, m->offset)) -+ goto nomatch; -+ -+ bkey_reassemble(&_insert.k, k); -+ insert = &_insert.k; -+ -+ bkey_copy(&_new.k, bch2_keylist_front(keys)); -+ new = bkey_i_to_extent(&_new.k); -+ bch2_cut_front(iter->pos, &new->k_i); -+ -+ bch2_cut_front(iter->pos, insert); -+ bch2_cut_back(new->k.p, insert); -+ bch2_cut_back(insert->k.p, &new->k_i); -+ -+ if (m->data_cmd == DATA_REWRITE) { -+ struct bch_extent_ptr *new_ptr, *old_ptr = (void *) -+ bch2_bkey_has_device(bkey_i_to_s_c(insert), -+ m->data_opts.rewrite_dev); -+ if (!old_ptr) -+ goto nomatch; -+ -+ if (old_ptr->cached) -+ extent_for_each_ptr(extent_i_to_s(new), new_ptr) -+ new_ptr->cached = true; -+ -+ bch2_bkey_drop_ptr(bkey_i_to_s(insert), old_ptr); -+ } -+ -+ extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) { -+ if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) { -+ /* -+ * raced with another move op? 
extent already -+ * has a pointer to the device we just wrote -+ * data to -+ */ -+ continue; -+ } -+ -+ bch2_extent_ptr_decoded_append(insert, &p); -+ did_work = true; -+ } -+ -+ if (!did_work) -+ goto nomatch; -+ -+ bch2_bkey_narrow_crcs(insert, -+ (struct bch_extent_crc_unpacked) { 0 }); -+ bch2_extent_normalize(c, bkey_i_to_s(insert)); -+ bch2_bkey_mark_replicas_cached(c, bkey_i_to_s(insert), -+ op->opts.background_target, -+ op->opts.data_replicas); -+ -+ /* -+ * If we're not fully overwriting @k, and it's compressed, we -+ * need a reservation for all the pointers in @insert -+ */ -+ nr = bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(insert)) - -+ m->nr_ptrs_reserved; -+ -+ if (insert->k.size < k.k->size && -+ bch2_bkey_sectors_compressed(k) && -+ nr > 0) { -+ ret = bch2_disk_reservation_add(c, &op->res, -+ keylist_sectors(keys) * nr, 0); -+ if (ret) -+ goto out; -+ -+ m->nr_ptrs_reserved += nr; -+ goto next; -+ } -+ -+ bch2_trans_update(&trans, iter, insert, 0); -+ -+ ret = bch2_trans_commit(&trans, &op->res, -+ op_journal_seq(op), -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_USE_RESERVE| -+ m->data_opts.btree_insert_flags); -+ if (!ret) -+ atomic_long_inc(&c->extent_migrate_done); -+ if (ret == -EINTR) -+ ret = 0; -+ if (ret) -+ break; -+next: -+ while (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) >= 0) { -+ bch2_keylist_pop_front(keys); -+ if (bch2_keylist_empty(keys)) -+ goto out; -+ } -+ continue; -+nomatch: -+ if (m->ctxt) { -+ BUG_ON(k.k->p.offset <= iter->pos.offset); -+ atomic64_inc(&m->ctxt->stats->keys_raced); -+ atomic64_add(k.k->p.offset - iter->pos.offset, -+ &m->ctxt->stats->sectors_raced); -+ } -+ atomic_long_inc(&c->extent_migrate_raced); -+ trace_move_race(&new->k); -+ bch2_btree_iter_next_slot(iter); -+ goto next; -+ } -+out: -+ bch2_trans_exit(&trans); -+ BUG_ON(ret == -EINTR); -+ return ret; -+} -+ -+void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio) -+{ -+ /* write bio must own pages: */ -+ 
BUG_ON(!m->op.wbio.bio.bi_vcnt); -+ -+ m->ptr = rbio->pick.ptr; -+ m->offset = rbio->pos.offset - rbio->pick.crc.offset; -+ m->op.devs_have = rbio->devs_have; -+ m->op.pos = rbio->pos; -+ m->op.version = rbio->version; -+ m->op.crc = rbio->pick.crc; -+ m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9; -+ -+ if (bch2_csum_type_is_encryption(m->op.crc.csum_type)) { -+ m->op.nonce = m->op.crc.nonce + m->op.crc.offset; -+ m->op.csum_type = m->op.crc.csum_type; -+ } -+ -+ if (m->data_cmd == DATA_REWRITE) -+ bch2_dev_list_drop_dev(&m->op.devs_have, m->data_opts.rewrite_dev); -+} -+ -+int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, -+ struct write_point_specifier wp, -+ struct bch_io_opts io_opts, -+ enum data_cmd data_cmd, -+ struct data_opts data_opts, -+ enum btree_id btree_id, -+ struct bkey_s_c k) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ int ret; -+ -+ m->btree_id = btree_id; -+ m->data_cmd = data_cmd; -+ m->data_opts = data_opts; -+ m->nr_ptrs_reserved = 0; -+ -+ bch2_write_op_init(&m->op, c, io_opts); -+ -+ if (!bch2_bkey_is_incompressible(k)) -+ m->op.compression_type = -+ bch2_compression_opt_to_type[io_opts.background_compression ?: -+ io_opts.compression]; -+ else -+ m->op.incompressible = true; -+ -+ m->op.target = data_opts.target, -+ m->op.write_point = wp; -+ -+ if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) { -+ m->op.alloc_reserve = RESERVE_MOVINGGC; -+ m->op.flags |= BCH_WRITE_ALLOC_NOWAIT; -+ } else { -+ /* XXX: this should probably be passed in */ -+ m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; -+ } -+ -+ m->op.flags |= BCH_WRITE_PAGES_STABLE| -+ BCH_WRITE_PAGES_OWNED| -+ BCH_WRITE_DATA_ENCODED| -+ BCH_WRITE_FROM_INTERNAL; -+ -+ m->op.nr_replicas = 1; -+ m->op.nr_replicas_required = 1; -+ m->op.index_update_fn = bch2_migrate_index_update; -+ -+ switch (data_cmd) { -+ case DATA_ADD_REPLICAS: { -+ /* -+ * 
DATA_ADD_REPLICAS is used for moving data to a different -+ * device in the background, and due to compression the new copy -+ * might take up more space than the old copy: -+ */ -+#if 0 -+ int nr = (int) io_opts.data_replicas - -+ bch2_bkey_nr_ptrs_allocated(k); -+#endif -+ int nr = (int) io_opts.data_replicas; -+ -+ if (nr > 0) { -+ m->op.nr_replicas = m->nr_ptrs_reserved = nr; -+ -+ ret = bch2_disk_reservation_get(c, &m->op.res, -+ k.k->size, m->op.nr_replicas, 0); -+ if (ret) -+ return ret; -+ } -+ break; -+ } -+ case DATA_REWRITE: { -+ unsigned compressed_sectors = 0; -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) -+ if (p.ptr.dev == data_opts.rewrite_dev && -+ !p.ptr.cached && -+ crc_is_compressed(p.crc)) -+ compressed_sectors += p.crc.compressed_size; -+ -+ if (compressed_sectors) { -+ ret = bch2_disk_reservation_add(c, &m->op.res, -+ k.k->size * m->op.nr_replicas, -+ BCH_DISK_RESERVATION_NOFAIL); -+ if (ret) -+ return ret; -+ } -+ break; -+ } -+ case DATA_PROMOTE: -+ m->op.flags |= BCH_WRITE_ALLOC_NOWAIT; -+ m->op.flags |= BCH_WRITE_CACHED; -+ break; -+ default: -+ BUG(); -+ } -+ -+ return 0; -+} -+ -+static void move_free(struct closure *cl) -+{ -+ struct moving_io *io = container_of(cl, struct moving_io, cl); -+ struct moving_context *ctxt = io->write.ctxt; -+ struct bvec_iter_all iter; -+ struct bio_vec *bv; -+ -+ bch2_disk_reservation_put(io->write.op.c, &io->write.op.res); -+ -+ bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter) -+ if (bv->bv_page) -+ __free_page(bv->bv_page); -+ -+ wake_up(&ctxt->wait); -+ -+ kfree(io); -+} -+ -+static void move_write_done(struct closure *cl) -+{ -+ struct moving_io *io = container_of(cl, struct moving_io, cl); -+ -+ atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors); -+ closure_return_with_destructor(cl, move_free); -+} -+ -+static void move_write(struct closure *cl) -+{ -+ struct moving_io *io = container_of(cl, struct moving_io, cl); -+ -+ if (unlikely(io->rbio.bio.bi_status || 
io->rbio.hole)) { -+ closure_return_with_destructor(cl, move_free); -+ return; -+ } -+ -+ bch2_migrate_read_done(&io->write, &io->rbio); -+ -+ atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); -+ closure_call(&io->write.op.cl, bch2_write, NULL, cl); -+ continue_at(cl, move_write_done, NULL); -+} -+ -+static inline struct moving_io *next_pending_write(struct moving_context *ctxt) -+{ -+ struct moving_io *io = -+ list_first_entry_or_null(&ctxt->reads, struct moving_io, list); -+ -+ return io && io->read_completed ? io : NULL; -+} -+ -+static void move_read_endio(struct bio *bio) -+{ -+ struct moving_io *io = container_of(bio, struct moving_io, rbio.bio); -+ struct moving_context *ctxt = io->write.ctxt; -+ -+ atomic_sub(io->read_sectors, &ctxt->read_sectors); -+ io->read_completed = true; -+ -+ if (next_pending_write(ctxt)) -+ wake_up(&ctxt->wait); -+ -+ closure_put(&ctxt->cl); -+} -+ -+static void do_pending_writes(struct moving_context *ctxt) -+{ -+ struct moving_io *io; -+ -+ while ((io = next_pending_write(ctxt))) { -+ list_del(&io->list); -+ closure_call(&io->cl, move_write, NULL, &ctxt->cl); -+ } -+} -+ -+#define move_ctxt_wait_event(_ctxt, _cond) \ -+do { \ -+ do_pending_writes(_ctxt); \ -+ \ -+ if (_cond) \ -+ break; \ -+ __wait_event((_ctxt)->wait, \ -+ next_pending_write(_ctxt) || (_cond)); \ -+} while (1) -+ -+static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) -+{ -+ unsigned sectors_pending = atomic_read(&ctxt->write_sectors); -+ -+ move_ctxt_wait_event(ctxt, -+ !atomic_read(&ctxt->write_sectors) || -+ atomic_read(&ctxt->write_sectors) != sectors_pending); -+} -+ -+static int bch2_move_extent(struct btree_trans *trans, -+ struct moving_context *ctxt, -+ struct write_point_specifier wp, -+ struct bch_io_opts io_opts, -+ enum btree_id btree_id, -+ struct bkey_s_c k, -+ enum data_cmd data_cmd, -+ struct data_opts data_opts) -+{ -+ struct bch_fs *c = trans->c; -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ struct moving_io 
*io; -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ unsigned sectors = k.k->size, pages; -+ int ret = -ENOMEM; -+ -+ move_ctxt_wait_event(ctxt, -+ atomic_read(&ctxt->write_sectors) < -+ SECTORS_IN_FLIGHT_PER_DEVICE); -+ -+ move_ctxt_wait_event(ctxt, -+ atomic_read(&ctxt->read_sectors) < -+ SECTORS_IN_FLIGHT_PER_DEVICE); -+ -+ /* write path might have to decompress data: */ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) -+ sectors = max_t(unsigned, sectors, p.crc.uncompressed_size); -+ -+ pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); -+ io = kzalloc(sizeof(struct moving_io) + -+ sizeof(struct bio_vec) * pages, GFP_KERNEL); -+ if (!io) -+ goto err; -+ -+ io->write.ctxt = ctxt; -+ io->read_sectors = k.k->size; -+ io->write_sectors = k.k->size; -+ -+ bio_init(&io->write.op.wbio.bio, io->bi_inline_vecs, pages); -+ bio_set_prio(&io->write.op.wbio.bio, -+ IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); -+ -+ if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9, -+ GFP_KERNEL)) -+ goto err_free; -+ -+ io->rbio.c = c; -+ io->rbio.opts = io_opts; -+ bio_init(&io->rbio.bio, io->bi_inline_vecs, pages); -+ io->rbio.bio.bi_vcnt = pages; -+ bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); -+ io->rbio.bio.bi_iter.bi_size = sectors << 9; -+ -+ bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0); -+ io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); -+ io->rbio.bio.bi_end_io = move_read_endio; -+ -+ ret = bch2_migrate_write_init(c, &io->write, wp, io_opts, -+ data_cmd, data_opts, btree_id, k); -+ if (ret) -+ goto err_free_pages; -+ -+ atomic64_inc(&ctxt->stats->keys_moved); -+ atomic64_add(k.k->size, &ctxt->stats->sectors_moved); -+ -+ trace_move_extent(k.k); -+ -+ atomic_add(io->read_sectors, &ctxt->read_sectors); -+ list_add_tail(&io->list, &ctxt->reads); -+ -+ /* -+ * dropped by move_read_endio() - guards against use after free of -+ * ctxt when doing wakeup -+ */ -+ closure_get(&ctxt->cl); -+ bch2_read_extent(trans, 
&io->rbio, k, 0, -+ BCH_READ_NODECODE| -+ BCH_READ_LAST_FRAGMENT); -+ return 0; -+err_free_pages: -+ bio_free_pages(&io->write.op.wbio.bio); -+err_free: -+ kfree(io); -+err: -+ trace_move_alloc_fail(k.k); -+ return ret; -+} -+ -+static int __bch2_move_data(struct bch_fs *c, -+ struct moving_context *ctxt, -+ struct bch_ratelimit *rate, -+ struct write_point_specifier wp, -+ struct bpos start, -+ struct bpos end, -+ move_pred_fn pred, void *arg, -+ struct bch_move_stats *stats, -+ enum btree_id btree_id) -+{ -+ bool kthread = (current->flags & PF_KTHREAD) != 0; -+ struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); -+ struct bkey_on_stack sk; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct data_opts data_opts; -+ enum data_cmd data_cmd; -+ u64 delay, cur_inum = U64_MAX; -+ int ret = 0, ret2; -+ -+ bkey_on_stack_init(&sk); -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ stats->data_type = BCH_DATA_user; -+ stats->btree_id = btree_id; -+ stats->pos = POS_MIN; -+ -+ iter = bch2_trans_get_iter(&trans, btree_id, start, -+ BTREE_ITER_PREFETCH); -+ -+ if (rate) -+ bch2_ratelimit_reset(rate); -+ -+ while (1) { -+ do { -+ delay = rate ? 
bch2_ratelimit_delay(rate) : 0; -+ -+ if (delay) { -+ bch2_trans_unlock(&trans); -+ set_current_state(TASK_INTERRUPTIBLE); -+ } -+ -+ if (kthread && (ret = kthread_should_stop())) { -+ __set_current_state(TASK_RUNNING); -+ goto out; -+ } -+ -+ if (delay) -+ schedule_timeout(delay); -+ -+ if (unlikely(freezing(current))) { -+ bch2_trans_unlock(&trans); -+ move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads)); -+ try_to_freeze(); -+ } -+ } while (delay); -+peek: -+ k = bch2_btree_iter_peek(iter); -+ -+ stats->pos = iter->pos; -+ -+ if (!k.k) -+ break; -+ ret = bkey_err(k); -+ if (ret) -+ break; -+ if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) -+ break; -+ -+ if (!bkey_extent_is_direct_data(k.k)) -+ goto next_nondata; -+ -+ if (btree_id == BTREE_ID_EXTENTS && -+ cur_inum != k.k->p.inode) { -+ struct bch_inode_unpacked inode; -+ -+ /* don't hold btree locks while looking up inode: */ -+ bch2_trans_unlock(&trans); -+ -+ io_opts = bch2_opts_to_inode_opts(c->opts); -+ if (!bch2_inode_find_by_inum(c, k.k->p.inode, &inode)) -+ bch2_io_opts_apply(&io_opts, bch2_inode_opts_get(&inode)); -+ cur_inum = k.k->p.inode; -+ goto peek; -+ } -+ -+ switch ((data_cmd = pred(c, arg, k, &io_opts, &data_opts))) { -+ case DATA_SKIP: -+ goto next; -+ case DATA_SCRUB: -+ BUG(); -+ case DATA_ADD_REPLICAS: -+ case DATA_REWRITE: -+ case DATA_PROMOTE: -+ break; -+ default: -+ BUG(); -+ } -+ -+ /* unlock before doing IO: */ -+ bkey_on_stack_reassemble(&sk, c, k); -+ k = bkey_i_to_s_c(sk.k); -+ bch2_trans_unlock(&trans); -+ -+ ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts, btree_id, k, -+ data_cmd, data_opts); -+ if (ret2) { -+ if (ret2 == -ENOMEM) { -+ /* memory allocation failure, wait for some IO to finish */ -+ bch2_move_ctxt_wait_for_io(ctxt); -+ continue; -+ } -+ -+ /* XXX signal failure */ -+ goto next; -+ } -+ -+ if (rate) -+ bch2_ratelimit_increment(rate, k.k->size); -+next: -+ atomic64_add(k.k->size * bch2_bkey_nr_ptrs_allocated(k), -+ &stats->sectors_seen); -+next_nondata: -+ 
bch2_btree_iter_next(iter); -+ bch2_trans_cond_resched(&trans); -+ } -+out: -+ ret = bch2_trans_exit(&trans) ?: ret; -+ bkey_on_stack_exit(&sk, c); -+ -+ return ret; -+} -+ -+int bch2_move_data(struct bch_fs *c, -+ struct bch_ratelimit *rate, -+ struct write_point_specifier wp, -+ struct bpos start, -+ struct bpos end, -+ move_pred_fn pred, void *arg, -+ struct bch_move_stats *stats) -+{ -+ struct moving_context ctxt = { .stats = stats }; -+ int ret; -+ -+ closure_init_stack(&ctxt.cl); -+ INIT_LIST_HEAD(&ctxt.reads); -+ init_waitqueue_head(&ctxt.wait); -+ -+ stats->data_type = BCH_DATA_user; -+ -+ ret = __bch2_move_data(c, &ctxt, rate, wp, start, end, -+ pred, arg, stats, BTREE_ID_EXTENTS) ?: -+ __bch2_move_data(c, &ctxt, rate, wp, start, end, -+ pred, arg, stats, BTREE_ID_REFLINK); -+ -+ move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads)); -+ closure_sync(&ctxt.cl); -+ -+ EBUG_ON(atomic_read(&ctxt.write_sectors)); -+ -+ trace_move_data(c, -+ atomic64_read(&stats->sectors_moved), -+ atomic64_read(&stats->keys_moved)); -+ -+ return ret; -+} -+ -+static int bch2_move_btree(struct bch_fs *c, -+ move_pred_fn pred, -+ void *arg, -+ struct bch_move_stats *stats) -+{ -+ struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct btree *b; -+ unsigned id; -+ struct data_opts data_opts; -+ enum data_cmd cmd; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ stats->data_type = BCH_DATA_btree; -+ -+ for (id = 0; id < BTREE_ID_NR; id++) { -+ stats->btree_id = id; -+ -+ for_each_btree_node(&trans, iter, id, POS_MIN, -+ BTREE_ITER_PREFETCH, b) { -+ stats->pos = iter->pos; -+ -+ switch ((cmd = pred(c, arg, -+ bkey_i_to_s_c(&b->key), -+ &io_opts, &data_opts))) { -+ case DATA_SKIP: -+ goto next; -+ case DATA_SCRUB: -+ BUG(); -+ case DATA_ADD_REPLICAS: -+ case DATA_REWRITE: -+ break; -+ default: -+ BUG(); -+ } -+ -+ ret = bch2_btree_node_rewrite(c, iter, -+ b->data->keys.seq, 0) ?: ret; -+next: -+ 
bch2_trans_cond_resched(&trans); -+ } -+ -+ ret = bch2_trans_iter_free(&trans, iter) ?: ret; -+ } -+ -+ bch2_trans_exit(&trans); -+ -+ return ret; -+} -+ -+#if 0 -+static enum data_cmd scrub_pred(struct bch_fs *c, void *arg, -+ struct bkey_s_c k, -+ struct bch_io_opts *io_opts, -+ struct data_opts *data_opts) -+{ -+ return DATA_SCRUB; -+} -+#endif -+ -+static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg, -+ struct bkey_s_c k, -+ struct bch_io_opts *io_opts, -+ struct data_opts *data_opts) -+{ -+ unsigned nr_good = bch2_bkey_durability(c, k); -+ unsigned replicas = 0; -+ -+ switch (k.k->type) { -+ case KEY_TYPE_btree_ptr: -+ replicas = c->opts.metadata_replicas; -+ break; -+ case KEY_TYPE_extent: -+ replicas = io_opts->data_replicas; -+ break; -+ } -+ -+ if (!nr_good || nr_good >= replicas) -+ return DATA_SKIP; -+ -+ data_opts->target = 0; -+ data_opts->btree_insert_flags = 0; -+ return DATA_ADD_REPLICAS; -+} -+ -+static enum data_cmd migrate_pred(struct bch_fs *c, void *arg, -+ struct bkey_s_c k, -+ struct bch_io_opts *io_opts, -+ struct data_opts *data_opts) -+{ -+ struct bch_ioctl_data *op = arg; -+ -+ if (!bch2_bkey_has_device(k, op->migrate.dev)) -+ return DATA_SKIP; -+ -+ data_opts->target = 0; -+ data_opts->btree_insert_flags = 0; -+ data_opts->rewrite_dev = op->migrate.dev; -+ return DATA_REWRITE; -+} -+ -+int bch2_data_job(struct bch_fs *c, -+ struct bch_move_stats *stats, -+ struct bch_ioctl_data op) -+{ -+ int ret = 0; -+ -+ switch (op.op) { -+ case BCH_DATA_OP_REREPLICATE: -+ stats->data_type = BCH_DATA_journal; -+ ret = bch2_journal_flush_device_pins(&c->journal, -1); -+ -+ ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret; -+ -+ closure_wait_event(&c->btree_interior_update_wait, -+ !bch2_btree_interior_updates_nr_pending(c)); -+ -+ ret = bch2_replicas_gc2(c) ?: ret; -+ -+ ret = bch2_move_data(c, NULL, -+ writepoint_hashed((unsigned long) current), -+ op.start, -+ op.end, -+ rereplicate_pred, c, stats) ?: ret; -+ ret = 
bch2_replicas_gc2(c) ?: ret; -+ break; -+ case BCH_DATA_OP_MIGRATE: -+ if (op.migrate.dev >= c->sb.nr_devices) -+ return -EINVAL; -+ -+ stats->data_type = BCH_DATA_journal; -+ ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); -+ -+ ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret; -+ ret = bch2_replicas_gc2(c) ?: ret; -+ -+ ret = bch2_move_data(c, NULL, -+ writepoint_hashed((unsigned long) current), -+ op.start, -+ op.end, -+ migrate_pred, &op, stats) ?: ret; -+ ret = bch2_replicas_gc2(c) ?: ret; -+ break; -+ default: -+ ret = -EINVAL; -+ } -+ -+ return ret; -+} -diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h -new file mode 100644 -index 000000000000..0acd1720d4f8 ---- /dev/null -+++ b/fs/bcachefs/move.h -@@ -0,0 +1,64 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_MOVE_H -+#define _BCACHEFS_MOVE_H -+ -+#include "btree_iter.h" -+#include "buckets.h" -+#include "io_types.h" -+#include "move_types.h" -+ -+struct bch_read_bio; -+struct moving_context; -+ -+enum data_cmd { -+ DATA_SKIP, -+ DATA_SCRUB, -+ DATA_ADD_REPLICAS, -+ DATA_REWRITE, -+ DATA_PROMOTE, -+}; -+ -+struct data_opts { -+ u16 target; -+ unsigned rewrite_dev; -+ int btree_insert_flags; -+}; -+ -+struct migrate_write { -+ enum btree_id btree_id; -+ enum data_cmd data_cmd; -+ struct data_opts data_opts; -+ -+ unsigned nr_ptrs_reserved; -+ -+ struct moving_context *ctxt; -+ -+ /* what we read: */ -+ struct bch_extent_ptr ptr; -+ u64 offset; -+ -+ struct bch_write_op op; -+}; -+ -+void bch2_migrate_read_done(struct migrate_write *, struct bch_read_bio *); -+int bch2_migrate_write_init(struct bch_fs *, struct migrate_write *, -+ struct write_point_specifier, -+ struct bch_io_opts, -+ enum data_cmd, struct data_opts, -+ enum btree_id, struct bkey_s_c); -+ -+typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *, -+ struct bkey_s_c, -+ struct bch_io_opts *, struct data_opts *); -+ -+int bch2_move_data(struct bch_fs *, struct bch_ratelimit *, -+ struct 
write_point_specifier, -+ struct bpos, struct bpos, -+ move_pred_fn, void *, -+ struct bch_move_stats *); -+ -+int bch2_data_job(struct bch_fs *, -+ struct bch_move_stats *, -+ struct bch_ioctl_data); -+ -+#endif /* _BCACHEFS_MOVE_H */ -diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h -new file mode 100644 -index 000000000000..fc0de165af9f ---- /dev/null -+++ b/fs/bcachefs/move_types.h -@@ -0,0 +1,17 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_MOVE_TYPES_H -+#define _BCACHEFS_MOVE_TYPES_H -+ -+struct bch_move_stats { -+ enum bch_data_type data_type; -+ enum btree_id btree_id; -+ struct bpos pos; -+ -+ atomic64_t keys_moved; -+ atomic64_t keys_raced; -+ atomic64_t sectors_moved; -+ atomic64_t sectors_seen; -+ atomic64_t sectors_raced; -+}; -+ -+#endif /* _BCACHEFS_MOVE_TYPES_H */ -diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c -new file mode 100644 -index 000000000000..de0a7974ec9f ---- /dev/null -+++ b/fs/bcachefs/movinggc.c -@@ -0,0 +1,359 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Moving/copying garbage collector -+ * -+ * Copyright 2012 Google, Inc. 
-+ */ -+ -+#include "bcachefs.h" -+#include "alloc_foreground.h" -+#include "btree_iter.h" -+#include "btree_update.h" -+#include "buckets.h" -+#include "clock.h" -+#include "disk_groups.h" -+#include "error.h" -+#include "extents.h" -+#include "eytzinger.h" -+#include "io.h" -+#include "keylist.h" -+#include "move.h" -+#include "movinggc.h" -+#include "super-io.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* -+ * We can't use the entire copygc reserve in one iteration of copygc: we may -+ * need the buckets we're freeing up to go back into the copygc reserve to make -+ * forward progress, but if the copygc reserve is full they'll be available for -+ * any allocation - and it's possible that in a given iteration, we free up most -+ * of the buckets we're going to free before we allocate most of the buckets -+ * we're going to allocate. -+ * -+ * If we only use half of the reserve per iteration, then in steady state we'll -+ * always have room in the reserve for the buckets we're going to need in the -+ * next iteration: -+ */ -+#define COPYGC_BUCKETS_PER_ITER(ca) \ -+ ((ca)->free[RESERVE_MOVINGGC].size / 2) -+ -+static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) -+{ -+ const struct copygc_heap_entry *l = _l; -+ const struct copygc_heap_entry *r = _r; -+ -+ return cmp_int(l->dev, r->dev) ?: -+ cmp_int(l->offset, r->offset); -+} -+ -+static int __copygc_pred(struct bch_fs *c, struct bkey_s_c k) -+{ -+ copygc_heap *h = &c->copygc_heap; -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const struct bch_extent_ptr *ptr; -+ -+ bkey_for_each_ptr(ptrs, ptr) { -+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); -+ struct copygc_heap_entry search = { -+ .dev = ptr->dev, -+ .offset = ptr->offset -+ }; -+ -+ ssize_t i = eytzinger0_find_le(h->data, h->used, -+ sizeof(h->data[0]), -+ bucket_offset_cmp, &search); -+#if 0 -+ /* eytzinger search verify code: */ -+ ssize_t j = -1, k; -+ -+ for (k = 0; k < 
h->used; k++) -+ if (h->data[k].offset <= ptr->offset && -+ (j < 0 || h->data[k].offset > h->data[j].offset)) -+ j = k; -+ -+ BUG_ON(i != j); -+#endif -+ if (i >= 0 && -+ ptr->offset < h->data[i].offset + ca->mi.bucket_size && -+ ptr->gen == h->data[i].gen) -+ return ptr->dev; -+ } -+ -+ return -1; -+} -+ -+static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, -+ struct bkey_s_c k, -+ struct bch_io_opts *io_opts, -+ struct data_opts *data_opts) -+{ -+ int dev_idx = __copygc_pred(c, k); -+ if (dev_idx < 0) -+ return DATA_SKIP; -+ -+ data_opts->target = io_opts->background_target; -+ data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE; -+ data_opts->rewrite_dev = dev_idx; -+ return DATA_REWRITE; -+} -+ -+static bool have_copygc_reserve(struct bch_dev *ca) -+{ -+ bool ret; -+ -+ spin_lock(&ca->fs->freelist_lock); -+ ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) || -+ ca->allocator_state != ALLOCATOR_RUNNING; -+ spin_unlock(&ca->fs->freelist_lock); -+ -+ return ret; -+} -+ -+static inline int fragmentation_cmp(copygc_heap *heap, -+ struct copygc_heap_entry l, -+ struct copygc_heap_entry r) -+{ -+ return cmp_int(l.fragmentation, r.fragmentation); -+} -+ -+static int bch2_copygc(struct bch_fs *c) -+{ -+ copygc_heap *h = &c->copygc_heap; -+ struct copygc_heap_entry e, *i; -+ struct bucket_array *buckets; -+ struct bch_move_stats move_stats; -+ u64 sectors_to_move = 0, sectors_not_moved = 0; -+ u64 sectors_reserved = 0; -+ u64 buckets_to_move, buckets_not_moved = 0; -+ struct bch_dev *ca; -+ unsigned dev_idx; -+ size_t b, heap_size = 0; -+ int ret; -+ -+ memset(&move_stats, 0, sizeof(move_stats)); -+ /* -+ * Find buckets with lowest sector counts, skipping completely -+ * empty buckets, by building a maxheap sorted by sector count, -+ * and repeatedly replacing the maximum element until all -+ * buckets have been visited. 
-+ */ -+ h->used = 0; -+ -+ for_each_rw_member(ca, c, dev_idx) -+ heap_size += ca->mi.nbuckets >> 7; -+ -+ if (h->size < heap_size) { -+ free_heap(&c->copygc_heap); -+ if (!init_heap(&c->copygc_heap, heap_size, GFP_KERNEL)) { -+ bch_err(c, "error allocating copygc heap"); -+ return 0; -+ } -+ } -+ -+ for_each_rw_member(ca, c, dev_idx) { -+ closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca)); -+ -+ spin_lock(&ca->fs->freelist_lock); -+ sectors_reserved += fifo_used(&ca->free[RESERVE_MOVINGGC]) * ca->mi.bucket_size; -+ spin_unlock(&ca->fs->freelist_lock); -+ -+ down_read(&ca->bucket_lock); -+ buckets = bucket_array(ca); -+ -+ for (b = buckets->first_bucket; b < buckets->nbuckets; b++) { -+ struct bucket_mark m = READ_ONCE(buckets->b[b].mark); -+ struct copygc_heap_entry e; -+ -+ if (m.owned_by_allocator || -+ m.data_type != BCH_DATA_user || -+ !bucket_sectors_used(m) || -+ bucket_sectors_used(m) >= ca->mi.bucket_size) -+ continue; -+ -+ e = (struct copygc_heap_entry) { -+ .dev = dev_idx, -+ .gen = m.gen, -+ .fragmentation = bucket_sectors_used(m) * (1U << 15) -+ / ca->mi.bucket_size, -+ .sectors = bucket_sectors_used(m), -+ .offset = bucket_to_sector(ca, b), -+ }; -+ heap_add_or_replace(h, e, -fragmentation_cmp, NULL); -+ } -+ up_read(&ca->bucket_lock); -+ } -+ -+ if (!sectors_reserved) { -+ bch2_fs_fatal_error(c, "stuck, ran out of copygc reserve!"); -+ return -1; -+ } -+ -+ for (i = h->data; i < h->data + h->used; i++) -+ sectors_to_move += i->sectors; -+ -+ while (sectors_to_move > sectors_reserved) { -+ BUG_ON(!heap_pop(h, e, -fragmentation_cmp, NULL)); -+ sectors_to_move -= e.sectors; -+ } -+ -+ buckets_to_move = h->used; -+ -+ if (!buckets_to_move) -+ return 0; -+ -+ eytzinger0_sort(h->data, h->used, -+ sizeof(h->data[0]), -+ bucket_offset_cmp, NULL); -+ -+ ret = bch2_move_data(c, &c->copygc_pd.rate, -+ writepoint_ptr(&c->copygc_write_point), -+ POS_MIN, POS_MAX, -+ copygc_pred, NULL, -+ &move_stats); -+ -+ for_each_rw_member(ca, c, dev_idx) { -+ 
down_read(&ca->bucket_lock); -+ buckets = bucket_array(ca); -+ for (i = h->data; i < h->data + h->used; i++) { -+ struct bucket_mark m; -+ size_t b; -+ -+ if (i->dev != dev_idx) -+ continue; -+ -+ b = sector_to_bucket(ca, i->offset); -+ m = READ_ONCE(buckets->b[b].mark); -+ -+ if (i->gen == m.gen && -+ bucket_sectors_used(m)) { -+ sectors_not_moved += bucket_sectors_used(m); -+ buckets_not_moved++; -+ } -+ } -+ up_read(&ca->bucket_lock); -+ } -+ -+ if (sectors_not_moved && !ret) -+ bch_warn_ratelimited(c, -+ "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved (move stats: moved %llu sectors, raced %llu keys, %llu sectors)", -+ sectors_not_moved, sectors_to_move, -+ buckets_not_moved, buckets_to_move, -+ atomic64_read(&move_stats.sectors_moved), -+ atomic64_read(&move_stats.keys_raced), -+ atomic64_read(&move_stats.sectors_raced)); -+ -+ trace_copygc(c, -+ atomic64_read(&move_stats.sectors_moved), sectors_not_moved, -+ buckets_to_move, buckets_not_moved); -+ return 0; -+} -+ -+/* -+ * Copygc runs when the amount of fragmented data is above some arbitrary -+ * threshold: -+ * -+ * The threshold at the limit - when the device is full - is the amount of space -+ * we reserved in bch2_recalc_capacity; we can't have more than that amount of -+ * disk space stranded due to fragmentation and store everything we have -+ * promised to store. -+ * -+ * But we don't want to be running copygc unnecessarily when the device still -+ * has plenty of free space - rather, we want copygc to smoothly run every so -+ * often and continually reduce the amount of fragmented space as the device -+ * fills up. So, we increase the threshold by half the current free space. 
-+ */ -+unsigned long bch2_copygc_wait_amount(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ unsigned dev_idx; -+ u64 fragmented_allowed = c->copygc_threshold; -+ u64 fragmented = 0; -+ -+ for_each_rw_member(ca, c, dev_idx) { -+ struct bch_dev_usage usage = bch2_dev_usage_read(ca); -+ -+ fragmented_allowed += ((__dev_buckets_available(ca, usage) * -+ ca->mi.bucket_size) >> 1); -+ fragmented += usage.sectors_fragmented; -+ } -+ -+ return max_t(s64, 0, fragmented_allowed - fragmented); -+} -+ -+static int bch2_copygc_thread(void *arg) -+{ -+ struct bch_fs *c = arg; -+ struct io_clock *clock = &c->io_clock[WRITE]; -+ unsigned long last, wait; -+ -+ set_freezable(); -+ -+ while (!kthread_should_stop()) { -+ if (kthread_wait_freezable(c->copy_gc_enabled)) -+ break; -+ -+ last = atomic_long_read(&clock->now); -+ wait = bch2_copygc_wait_amount(c); -+ -+ if (wait > clock->max_slop) { -+ bch2_kthread_io_clock_wait(clock, last + wait, -+ MAX_SCHEDULE_TIMEOUT); -+ continue; -+ } -+ -+ if (bch2_copygc(c)) -+ break; -+ } -+ -+ return 0; -+} -+ -+void bch2_copygc_stop(struct bch_fs *c) -+{ -+ c->copygc_pd.rate.rate = UINT_MAX; -+ bch2_ratelimit_reset(&c->copygc_pd.rate); -+ -+ if (c->copygc_thread) { -+ kthread_stop(c->copygc_thread); -+ put_task_struct(c->copygc_thread); -+ } -+ c->copygc_thread = NULL; -+} -+ -+int bch2_copygc_start(struct bch_fs *c) -+{ -+ struct task_struct *t; -+ -+ if (c->copygc_thread) -+ return 0; -+ -+ if (c->opts.nochanges) -+ return 0; -+ -+ if (bch2_fs_init_fault("copygc_start")) -+ return -ENOMEM; -+ -+ t = kthread_create(bch2_copygc_thread, c, "bch_copygc"); -+ if (IS_ERR(t)) -+ return PTR_ERR(t); -+ -+ get_task_struct(t); -+ -+ c->copygc_thread = t; -+ wake_up_process(c->copygc_thread); -+ -+ return 0; -+} -+ -+void bch2_fs_copygc_init(struct bch_fs *c) -+{ -+ bch2_pd_controller_init(&c->copygc_pd); -+ c->copygc_pd.d_term = 0; -+} -diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h -new file mode 100644 -index 000000000000..922738247d03 
---- /dev/null -+++ b/fs/bcachefs/movinggc.h -@@ -0,0 +1,9 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_MOVINGGC_H -+#define _BCACHEFS_MOVINGGC_H -+ -+void bch2_copygc_stop(struct bch_fs *); -+int bch2_copygc_start(struct bch_fs *); -+void bch2_fs_copygc_init(struct bch_fs *); -+ -+#endif /* _BCACHEFS_MOVINGGC_H */ -diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c -new file mode 100644 -index 000000000000..afe25cd26c06 ---- /dev/null -+++ b/fs/bcachefs/opts.c -@@ -0,0 +1,437 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include -+ -+#include "bcachefs.h" -+#include "compress.h" -+#include "disk_groups.h" -+#include "opts.h" -+#include "super-io.h" -+#include "util.h" -+ -+const char * const bch2_error_actions[] = { -+ "continue", -+ "remount-ro", -+ "panic", -+ NULL -+}; -+ -+const char * const bch2_sb_features[] = { -+#define x(f, n) #f, -+ BCH_SB_FEATURES() -+#undef x -+ NULL -+}; -+ -+const char * const bch2_csum_opts[] = { -+ "none", -+ "crc32c", -+ "crc64", -+ NULL -+}; -+ -+const char * const bch2_compression_opts[] = { -+#define x(t, n) #t, -+ BCH_COMPRESSION_OPTS() -+#undef x -+ NULL -+}; -+ -+const char * const bch2_str_hash_types[] = { -+ "crc32c", -+ "crc64", -+ "siphash", -+ NULL -+}; -+ -+const char * const bch2_data_types[] = { -+#define x(t, n) #t, -+ BCH_DATA_TYPES() -+#undef x -+ NULL -+}; -+ -+const char * const bch2_cache_replacement_policies[] = { -+ "lru", -+ "fifo", -+ "random", -+ NULL -+}; -+ -+/* Default is -1; we skip past it for struct cached_dev's cache mode */ -+const char * const bch2_cache_modes[] = { -+ "default", -+ "writethrough", -+ "writeback", -+ "writearound", -+ "none", -+ NULL -+}; -+ -+const char * const bch2_dev_state[] = { -+ "readwrite", -+ "readonly", -+ "failed", -+ "spare", -+ NULL -+}; -+ -+void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) -+{ -+#define x(_name, ...) 
\ -+ if (opt_defined(src, _name)) \ -+ opt_set(*dst, _name, src._name); -+ -+ BCH_OPTS() -+#undef x -+} -+ -+bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id) -+{ -+ switch (id) { -+#define x(_name, ...) \ -+ case Opt_##_name: \ -+ return opt_defined(*opts, _name); -+ BCH_OPTS() -+#undef x -+ default: -+ BUG(); -+ } -+} -+ -+u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id) -+{ -+ switch (id) { -+#define x(_name, ...) \ -+ case Opt_##_name: \ -+ return opts->_name; -+ BCH_OPTS() -+#undef x -+ default: -+ BUG(); -+ } -+} -+ -+void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v) -+{ -+ switch (id) { -+#define x(_name, ...) \ -+ case Opt_##_name: \ -+ opt_set(*opts, _name, v); \ -+ break; -+ BCH_OPTS() -+#undef x -+ default: -+ BUG(); -+ } -+} -+ -+/* -+ * Initial options from superblock - here we don't want any options undefined, -+ * any options the superblock doesn't specify are set to 0: -+ */ -+struct bch_opts bch2_opts_from_sb(struct bch_sb *sb) -+{ -+ struct bch_opts opts = bch2_opts_empty(); -+ -+#define x(_name, _bits, _mode, _type, _sb_opt, ...) \ -+ if (_sb_opt != NO_SB_OPT) \ -+ opt_set(opts, _name, _sb_opt(sb)); -+ BCH_OPTS() -+#undef x -+ -+ return opts; -+} -+ -+const struct bch_option bch2_opt_table[] = { -+#define OPT_BOOL() .type = BCH_OPT_BOOL -+#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, .min = _min, .max = _max -+#define OPT_SECTORS(_min, _max) .type = BCH_OPT_SECTORS, .min = _min, .max = _max -+#define OPT_STR(_choices) .type = BCH_OPT_STR, .choices = _choices -+#define OPT_FN(_fn) .type = BCH_OPT_FN, \ -+ .parse = _fn##_parse, \ -+ .to_text = _fn##_to_text -+ -+#define x(_name, _bits, _mode, _type, _sb_opt, _default, _hint, _help) \ -+ [Opt_##_name] = { \ -+ .attr = { \ -+ .name = #_name, \ -+ .mode = (_mode) & OPT_RUNTIME ? 
0644 : 0444, \ -+ }, \ -+ .mode = _mode, \ -+ .hint = _hint, \ -+ .help = _help, \ -+ .set_sb = SET_##_sb_opt, \ -+ _type \ -+ }, -+ -+ BCH_OPTS() -+#undef x -+}; -+ -+int bch2_opt_lookup(const char *name) -+{ -+ const struct bch_option *i; -+ -+ for (i = bch2_opt_table; -+ i < bch2_opt_table + ARRAY_SIZE(bch2_opt_table); -+ i++) -+ if (!strcmp(name, i->attr.name)) -+ return i - bch2_opt_table; -+ -+ return -1; -+} -+ -+struct synonym { -+ const char *s1, *s2; -+}; -+ -+static const struct synonym bch_opt_synonyms[] = { -+ { "quota", "usrquota" }, -+}; -+ -+static int bch2_mount_opt_lookup(const char *name) -+{ -+ const struct synonym *i; -+ -+ for (i = bch_opt_synonyms; -+ i < bch_opt_synonyms + ARRAY_SIZE(bch_opt_synonyms); -+ i++) -+ if (!strcmp(name, i->s1)) -+ name = i->s2; -+ -+ return bch2_opt_lookup(name); -+} -+ -+int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt, -+ const char *val, u64 *res) -+{ -+ ssize_t ret; -+ -+ switch (opt->type) { -+ case BCH_OPT_BOOL: -+ ret = kstrtou64(val, 10, res); -+ if (ret < 0) -+ return ret; -+ -+ if (*res > 1) -+ return -ERANGE; -+ break; -+ case BCH_OPT_UINT: -+ ret = kstrtou64(val, 10, res); -+ if (ret < 0) -+ return ret; -+ -+ if (*res < opt->min || *res >= opt->max) -+ return -ERANGE; -+ break; -+ case BCH_OPT_SECTORS: -+ ret = bch2_strtou64_h(val, res); -+ if (ret < 0) -+ return ret; -+ -+ if (*res & 511) -+ return -EINVAL; -+ -+ *res >>= 9; -+ -+ if (*res < opt->min || *res >= opt->max) -+ return -ERANGE; -+ break; -+ case BCH_OPT_STR: -+ ret = match_string(opt->choices, -1, val); -+ if (ret < 0) -+ return ret; -+ -+ *res = ret; -+ break; -+ case BCH_OPT_FN: -+ if (!c) -+ return -EINVAL; -+ -+ return opt->parse(c, val, res); -+ } -+ -+ return 0; -+} -+ -+void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c, -+ const struct bch_option *opt, u64 v, -+ unsigned flags) -+{ -+ if (flags & OPT_SHOW_MOUNT_STYLE) { -+ if (opt->type == BCH_OPT_BOOL) { -+ pr_buf(out, "%s%s", -+ v ? 
"" : "no", -+ opt->attr.name); -+ return; -+ } -+ -+ pr_buf(out, "%s=", opt->attr.name); -+ } -+ -+ switch (opt->type) { -+ case BCH_OPT_BOOL: -+ case BCH_OPT_UINT: -+ pr_buf(out, "%lli", v); -+ break; -+ case BCH_OPT_SECTORS: -+ bch2_hprint(out, v); -+ break; -+ case BCH_OPT_STR: -+ if (flags & OPT_SHOW_FULL_LIST) -+ bch2_string_opt_to_text(out, opt->choices, v); -+ else -+ pr_buf(out, opt->choices[v]); -+ break; -+ case BCH_OPT_FN: -+ opt->to_text(out, c, v); -+ break; -+ default: -+ BUG(); -+ } -+} -+ -+int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v) -+{ -+ int ret = 0; -+ -+ switch (id) { -+ case Opt_compression: -+ case Opt_background_compression: -+ ret = bch2_check_set_has_compressed_data(c, v); -+ break; -+ case Opt_erasure_code: -+ if (v) -+ bch2_check_set_feature(c, BCH_FEATURE_ec); -+ break; -+ } -+ -+ return ret; -+} -+ -+int bch2_opts_check_may_set(struct bch_fs *c) -+{ -+ unsigned i; -+ int ret; -+ -+ for (i = 0; i < bch2_opts_nr; i++) { -+ ret = bch2_opt_check_may_set(c, i, -+ bch2_opt_get_by_id(&c->opts, i)); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+int bch2_parse_mount_opts(struct bch_opts *opts, char *options) -+{ -+ char *opt, *name, *val; -+ int ret, id; -+ u64 v; -+ -+ while ((opt = strsep(&options, ",")) != NULL) { -+ name = strsep(&opt, "="); -+ val = opt; -+ -+ if (val) { -+ id = bch2_mount_opt_lookup(name); -+ if (id < 0) -+ goto bad_opt; -+ -+ ret = bch2_opt_parse(NULL, &bch2_opt_table[id], val, &v); -+ if (ret < 0) -+ goto bad_val; -+ } else { -+ id = bch2_mount_opt_lookup(name); -+ v = 1; -+ -+ if (id < 0 && -+ !strncmp("no", name, 2)) { -+ id = bch2_mount_opt_lookup(name + 2); -+ v = 0; -+ } -+ -+ if (id < 0) -+ goto bad_opt; -+ -+ if (bch2_opt_table[id].type != BCH_OPT_BOOL) -+ goto no_val; -+ } -+ -+ if (!(bch2_opt_table[id].mode & OPT_MOUNT)) -+ goto bad_opt; -+ -+ if (id == Opt_acl && -+ !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL)) -+ goto bad_opt; -+ -+ if ((id == Opt_usrquota || -+ id == Opt_grpquota) && -+ 
!IS_ENABLED(CONFIG_BCACHEFS_QUOTA)) -+ goto bad_opt; -+ -+ bch2_opt_set_by_id(opts, id, v); -+ } -+ -+ return 0; -+bad_opt: -+ pr_err("Bad mount option %s", name); -+ return -1; -+bad_val: -+ pr_err("Invalid value %s for mount option %s", val, name); -+ return -1; -+no_val: -+ pr_err("Mount option %s requires a value", name); -+ return -1; -+} -+ -+/* io opts: */ -+ -+struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src) -+{ -+ struct bch_io_opts ret = { 0 }; -+#define x(_name, _bits) \ -+ if (opt_defined(src, _name)) \ -+ opt_set(ret, _name, src._name); -+ BCH_INODE_OPTS() -+#undef x -+ return ret; -+} -+ -+struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts src) -+{ -+ struct bch_opts ret = { 0 }; -+#define x(_name, _bits) \ -+ if (opt_defined(src, _name)) \ -+ opt_set(ret, _name, src._name); -+ BCH_INODE_OPTS() -+#undef x -+ return ret; -+} -+ -+void bch2_io_opts_apply(struct bch_io_opts *dst, struct bch_io_opts src) -+{ -+#define x(_name, _bits) \ -+ if (opt_defined(src, _name)) \ -+ opt_set(*dst, _name, src._name); -+ BCH_INODE_OPTS() -+#undef x -+} -+ -+bool bch2_opt_is_inode_opt(enum bch_opt_id id) -+{ -+ static const enum bch_opt_id inode_opt_list[] = { -+#define x(_name, _bits) Opt_##_name, -+ BCH_INODE_OPTS() -+#undef x -+ }; -+ unsigned i; -+ -+ for (i = 0; i < ARRAY_SIZE(inode_opt_list); i++) -+ if (inode_opt_list[i] == id) -+ return true; -+ -+ return false; -+} -diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h -new file mode 100644 -index 000000000000..014c608ca0c6 ---- /dev/null -+++ b/fs/bcachefs/opts.h -@@ -0,0 +1,440 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_OPTS_H -+#define _BCACHEFS_OPTS_H -+ -+#include -+#include -+#include -+#include -+#include "bcachefs_format.h" -+ -+extern const char * const bch2_error_actions[]; -+extern const char * const bch2_sb_features[]; -+extern const char * const bch2_csum_opts[]; -+extern const char * const bch2_compression_opts[]; -+extern const char * const 
bch2_str_hash_types[]; -+extern const char * const bch2_data_types[]; -+extern const char * const bch2_cache_replacement_policies[]; -+extern const char * const bch2_cache_modes[]; -+extern const char * const bch2_dev_state[]; -+ -+/* -+ * Mount options; we also store defaults in the superblock. -+ * -+ * Also exposed via sysfs: if an option is writeable, and it's also stored in -+ * the superblock, changing it via sysfs (currently? might change this) also -+ * updates the superblock. -+ * -+ * We store options as signed integers, where -1 means undefined. This means we -+ * can pass the mount options to bch2_fs_alloc() as a whole struct, and then only -+ * apply the options from that struct that are defined. -+ */ -+ -+/* dummy option, for options that aren't stored in the superblock */ -+LE64_BITMASK(NO_SB_OPT, struct bch_sb, flags[0], 0, 0); -+ -+/* When can be set: */ -+enum opt_mode { -+ OPT_FORMAT = (1 << 0), -+ OPT_MOUNT = (1 << 1), -+ OPT_RUNTIME = (1 << 2), -+ OPT_INODE = (1 << 3), -+ OPT_DEVICE = (1 << 4), -+}; -+ -+enum opt_type { -+ BCH_OPT_BOOL, -+ BCH_OPT_UINT, -+ BCH_OPT_SECTORS, -+ BCH_OPT_STR, -+ BCH_OPT_FN, -+}; -+ -+/** -+ * x(name, shortopt, type, in mem type, mode, sb_opt) -+ * -+ * @name - name of mount option, sysfs attribute, and struct bch_opts -+ * member -+ * -+ * @mode - when opt may be set -+ * -+ * @sb_option - name of corresponding superblock option -+ * -+ * @type - one of OPT_BOOL, OPT_UINT, OPT_STR -+ */ -+ -+/* -+ * XXX: add fields for -+ * - default value -+ * - helptext -+ */ -+ -+#ifdef __KERNEL__ -+#define RATELIMIT_ERRORS true -+#else -+#define RATELIMIT_ERRORS false -+#endif -+ -+#define BCH_OPTS() \ -+ x(block_size, u16, \ -+ OPT_FORMAT, \ -+ OPT_SECTORS(1, 128), \ -+ BCH_SB_BLOCK_SIZE, 8, \ -+ "size", NULL) \ -+ x(btree_node_size, u16, \ -+ OPT_FORMAT, \ -+ OPT_SECTORS(1, 512), \ -+ BCH_SB_BTREE_NODE_SIZE, 512, \ -+ "size", "Btree node size, default 256k") \ -+ x(errors, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ 
OPT_STR(bch2_error_actions), \ -+ BCH_SB_ERROR_ACTION, BCH_ON_ERROR_RO, \ -+ NULL, "Action to take on filesystem error") \ -+ x(metadata_replicas, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_UINT(1, BCH_REPLICAS_MAX), \ -+ BCH_SB_META_REPLICAS_WANT, 1, \ -+ "#", "Number of metadata replicas") \ -+ x(data_replicas, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ -+ OPT_UINT(1, BCH_REPLICAS_MAX), \ -+ BCH_SB_DATA_REPLICAS_WANT, 1, \ -+ "#", "Number of data replicas") \ -+ x(metadata_replicas_required, u8, \ -+ OPT_FORMAT|OPT_MOUNT, \ -+ OPT_UINT(1, BCH_REPLICAS_MAX), \ -+ BCH_SB_META_REPLICAS_REQ, 1, \ -+ "#", NULL) \ -+ x(data_replicas_required, u8, \ -+ OPT_FORMAT|OPT_MOUNT, \ -+ OPT_UINT(1, BCH_REPLICAS_MAX), \ -+ BCH_SB_DATA_REPLICAS_REQ, 1, \ -+ "#", NULL) \ -+ x(metadata_checksum, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_STR(bch2_csum_opts), \ -+ BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_CRC32C, \ -+ NULL, NULL) \ -+ x(data_checksum, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ -+ OPT_STR(bch2_csum_opts), \ -+ BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_CRC32C, \ -+ NULL, NULL) \ -+ x(compression, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ -+ OPT_STR(bch2_compression_opts), \ -+ BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_none, \ -+ NULL, NULL) \ -+ x(background_compression, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ -+ OPT_STR(bch2_compression_opts), \ -+ BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none, \ -+ NULL, NULL) \ -+ x(str_hash, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_STR(bch2_str_hash_types), \ -+ BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_SIPHASH, \ -+ NULL, "Hash function for directory entries and xattrs")\ -+ x(foreground_target, u16, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ -+ OPT_FN(bch2_opt_target), \ -+ BCH_SB_FOREGROUND_TARGET, 0, \ -+ "(target)", "Device or disk group for foreground writes") \ -+ x(background_target, u16, \ -+ 
OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ -+ OPT_FN(bch2_opt_target), \ -+ BCH_SB_BACKGROUND_TARGET, 0, \ -+ "(target)", "Device or disk group to move data to in the background")\ -+ x(promote_target, u16, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ -+ OPT_FN(bch2_opt_target), \ -+ BCH_SB_PROMOTE_TARGET, 0, \ -+ "(target)", "Device or disk group to promote data to on read")\ -+ x(erasure_code, u16, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ -+ OPT_BOOL(), \ -+ BCH_SB_ERASURE_CODE, false, \ -+ NULL, "Enable erasure coding (DO NOT USE YET)") \ -+ x(inodes_32bit, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_BOOL(), \ -+ BCH_SB_INODE_32BIT, false, \ -+ NULL, "Constrain inode numbers to 32 bits") \ -+ x(gc_reserve_percent, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_UINT(5, 21), \ -+ BCH_SB_GC_RESERVE, 8, \ -+ "%", "Percentage of disk space to reserve for copygc")\ -+ x(gc_reserve_bytes, u64, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_SECTORS(0, U64_MAX), \ -+ BCH_SB_GC_RESERVE_BYTES, 0, \ -+ "%", "Amount of disk space to reserve for copygc\n" \ -+ "Takes precedence over gc_reserve_percent if set")\ -+ x(root_reserve_percent, u8, \ -+ OPT_FORMAT|OPT_MOUNT, \ -+ OPT_UINT(0, 100), \ -+ BCH_SB_ROOT_RESERVE, 0, \ -+ "%", "Percentage of disk space to reserve for superuser")\ -+ x(wide_macs, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_BOOL(), \ -+ BCH_SB_128_BIT_MACS, false, \ -+ NULL, "Store full 128 bits of cryptographic MACs, instead of 80")\ -+ x(inline_data, u8, \ -+ OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Enable inline data extents") \ -+ x(acl, u8, \ -+ OPT_FORMAT|OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ BCH_SB_POSIX_ACL, true, \ -+ NULL, "Enable POSIX acls") \ -+ x(usrquota, u8, \ -+ OPT_FORMAT|OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ BCH_SB_USRQUOTA, false, \ -+ NULL, "Enable user quotas") \ -+ x(grpquota, u8, \ -+ OPT_FORMAT|OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ BCH_SB_GRPQUOTA, false, \ -+ NULL, "Enable 
group quotas") \ -+ x(prjquota, u8, \ -+ OPT_FORMAT|OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ BCH_SB_PRJQUOTA, false, \ -+ NULL, "Enable project quotas") \ -+ x(reflink, u8, \ -+ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_BOOL(), \ -+ BCH_SB_REFLINK, true, \ -+ NULL, "Enable reflink support") \ -+ x(degraded, u8, \ -+ OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Allow mounting in degraded mode") \ -+ x(discard, u8, \ -+ OPT_MOUNT|OPT_DEVICE, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Enable discard/TRIM support") \ -+ x(verbose, u8, \ -+ OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Extra debugging information during mount/recovery")\ -+ x(journal_flush_disabled, u8, \ -+ OPT_MOUNT|OPT_RUNTIME, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Disable journal flush on sync/fsync\n" \ -+ "If enabled, writes can be lost, but only since the\n"\ -+ "last journal write (default 1 second)") \ -+ x(fsck, u8, \ -+ OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Run fsck on mount") \ -+ x(fix_errors, u8, \ -+ OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Fix errors during fsck without asking") \ -+ x(ratelimit_errors, u8, \ -+ OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, RATELIMIT_ERRORS, \ -+ NULL, "Ratelimit error messages during fsck") \ -+ x(nochanges, u8, \ -+ OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Super read only mode - no writes at all will be issued,\n"\ -+ "even if we have to replay the journal") \ -+ x(norecovery, u8, \ -+ OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Don't replay the journal") \ -+ x(rebuild_replicas, u8, \ -+ OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Rebuild the superblock replicas section") \ -+ x(keep_journal, u8, \ -+ OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Don't free journal entries/keys after startup")\ -+ x(read_entire_journal, u8, \ -+ 0, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, 
"Read all journal entries, not just dirty ones")\ -+ x(noexcl, u8, \ -+ OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Don't open device in exclusive mode") \ -+ x(sb, u64, \ -+ OPT_MOUNT, \ -+ OPT_UINT(0, S64_MAX), \ -+ NO_SB_OPT, BCH_SB_SECTOR, \ -+ "offset", "Sector offset of superblock") \ -+ x(read_only, u8, \ -+ 0, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, NULL) \ -+ x(nostart, u8, \ -+ 0, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Don\'t start filesystem, only open devices") \ -+ x(reconstruct_alloc, u8, \ -+ OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Reconstruct alloc btree") \ -+ x(version_upgrade, u8, \ -+ OPT_MOUNT, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, "Set superblock to latest version,\n" \ -+ "allowing any new features to be used") \ -+ x(project, u8, \ -+ OPT_INODE, \ -+ OPT_BOOL(), \ -+ NO_SB_OPT, false, \ -+ NULL, NULL) \ -+ x(fs_size, u64, \ -+ OPT_DEVICE, \ -+ OPT_SECTORS(0, S64_MAX), \ -+ NO_SB_OPT, 0, \ -+ "size", "Size of filesystem on device") \ -+ x(bucket, u32, \ -+ OPT_DEVICE, \ -+ OPT_SECTORS(0, S64_MAX), \ -+ NO_SB_OPT, 0, \ -+ "size", "Size of filesystem on device") \ -+ x(durability, u8, \ -+ OPT_DEVICE, \ -+ OPT_UINT(0, BCH_REPLICAS_MAX), \ -+ NO_SB_OPT, 1, \ -+ "n", "Data written to this device will be considered\n"\ -+ "to have already been replicated n times") -+ -+struct bch_opts { -+#define x(_name, _bits, ...) unsigned _name##_defined:1; -+ BCH_OPTS() -+#undef x -+ -+#define x(_name, _bits, ...) _bits _name; -+ BCH_OPTS() -+#undef x -+}; -+ -+static const struct bch_opts bch2_opts_default = { -+#define x(_name, _bits, _mode, _type, _sb_opt, _default, ...) \ -+ ._name##_defined = true, \ -+ ._name = _default, \ -+ -+ BCH_OPTS() -+#undef x -+}; -+ -+#define opt_defined(_opts, _name) ((_opts)._name##_defined) -+ -+#define opt_get(_opts, _name) \ -+ (opt_defined(_opts, _name) ? 
(_opts)._name : bch2_opts_default._name) -+ -+#define opt_set(_opts, _name, _v) \ -+do { \ -+ (_opts)._name##_defined = true; \ -+ (_opts)._name = _v; \ -+} while (0) -+ -+static inline struct bch_opts bch2_opts_empty(void) -+{ -+ return (struct bch_opts) { 0 }; -+} -+ -+void bch2_opts_apply(struct bch_opts *, struct bch_opts); -+ -+enum bch_opt_id { -+#define x(_name, ...) Opt_##_name, -+ BCH_OPTS() -+#undef x -+ bch2_opts_nr -+}; -+ -+struct bch_fs; -+struct printbuf; -+ -+struct bch_option { -+ struct attribute attr; -+ void (*set_sb)(struct bch_sb *, u64); -+ enum opt_mode mode; -+ enum opt_type type; -+ -+ union { -+ struct { -+ u64 min, max; -+ }; -+ struct { -+ const char * const *choices; -+ }; -+ struct { -+ int (*parse)(struct bch_fs *, const char *, u64 *); -+ void (*to_text)(struct printbuf *, struct bch_fs *, u64); -+ }; -+ }; -+ -+ const char *hint; -+ const char *help; -+ -+}; -+ -+extern const struct bch_option bch2_opt_table[]; -+ -+bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id); -+u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id); -+void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64); -+ -+struct bch_opts bch2_opts_from_sb(struct bch_sb *); -+ -+int bch2_opt_lookup(const char *); -+int bch2_opt_parse(struct bch_fs *, const struct bch_option *, const char *, u64 *); -+ -+#define OPT_SHOW_FULL_LIST (1 << 0) -+#define OPT_SHOW_MOUNT_STYLE (1 << 1) -+ -+void bch2_opt_to_text(struct printbuf *, struct bch_fs *, -+ const struct bch_option *, u64, unsigned); -+ -+int bch2_opt_check_may_set(struct bch_fs *, int, u64); -+int bch2_opts_check_may_set(struct bch_fs *); -+int bch2_parse_mount_opts(struct bch_opts *, char *); -+ -+/* inode opts: */ -+ -+struct bch_io_opts { -+#define x(_name, _bits) unsigned _name##_defined:1; -+ BCH_INODE_OPTS() -+#undef x -+ -+#define x(_name, _bits) u##_bits _name; -+ BCH_INODE_OPTS() -+#undef x -+}; -+ -+struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); -+struct 
bch_opts bch2_inode_opts_to_opts(struct bch_io_opts); -+void bch2_io_opts_apply(struct bch_io_opts *, struct bch_io_opts); -+bool bch2_opt_is_inode_opt(enum bch_opt_id); -+ -+#endif /* _BCACHEFS_OPTS_H */ -diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c -new file mode 100644 -index 000000000000..d3032a46e7f3 ---- /dev/null -+++ b/fs/bcachefs/quota.c -@@ -0,0 +1,783 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "btree_update.h" -+#include "inode.h" -+#include "quota.h" -+#include "super-io.h" -+ -+static const char *bch2_sb_validate_quota(struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_quota *q = field_to_type(f, quota); -+ -+ if (vstruct_bytes(&q->field) != sizeof(*q)) -+ return "invalid field quota: wrong size"; -+ -+ return NULL; -+} -+ -+const struct bch_sb_field_ops bch_sb_field_ops_quota = { -+ .validate = bch2_sb_validate_quota, -+}; -+ -+const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k) -+{ -+ if (k.k->p.inode >= QTYP_NR) -+ return "invalid quota type"; -+ -+ if (bkey_val_bytes(k.k) != sizeof(struct bch_quota)) -+ return "incorrect value size"; -+ -+ return NULL; -+} -+ -+static const char * const bch2_quota_counters[] = { -+ "space", -+ "inodes", -+}; -+ -+void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_quota dq = bkey_s_c_to_quota(k); -+ unsigned i; -+ -+ for (i = 0; i < Q_COUNTERS; i++) -+ pr_buf(out, "%s hardlimit %llu softlimit %llu", -+ bch2_quota_counters[i], -+ le64_to_cpu(dq.v->c[i].hardlimit), -+ le64_to_cpu(dq.v->c[i].softlimit)); -+} -+ -+#ifdef CONFIG_BCACHEFS_QUOTA -+ -+#include -+#include -+#include -+ -+static inline unsigned __next_qtype(unsigned i, unsigned qtypes) -+{ -+ qtypes >>= i; -+ return qtypes ? 
i + __ffs(qtypes) : QTYP_NR; -+} -+ -+#define for_each_set_qtype(_c, _i, _q, _qtypes) \ -+ for (_i = 0; \ -+ (_i = __next_qtype(_i, _qtypes), \ -+ _q = &(_c)->quotas[_i], \ -+ _i < QTYP_NR); \ -+ _i++) -+ -+static bool ignore_hardlimit(struct bch_memquota_type *q) -+{ -+ if (capable(CAP_SYS_RESOURCE)) -+ return true; -+#if 0 -+ struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type]; -+ -+ return capable(CAP_SYS_RESOURCE) && -+ (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD || -+ !(info->dqi_flags & DQF_ROOT_SQUASH)); -+#endif -+ return false; -+} -+ -+enum quota_msg { -+ SOFTWARN, /* Softlimit reached */ -+ SOFTLONGWARN, /* Grace time expired */ -+ HARDWARN, /* Hardlimit reached */ -+ -+ HARDBELOW, /* Usage got below inode hardlimit */ -+ SOFTBELOW, /* Usage got below inode softlimit */ -+}; -+ -+static int quota_nl[][Q_COUNTERS] = { -+ [HARDWARN][Q_SPC] = QUOTA_NL_BHARDWARN, -+ [SOFTLONGWARN][Q_SPC] = QUOTA_NL_BSOFTLONGWARN, -+ [SOFTWARN][Q_SPC] = QUOTA_NL_BSOFTWARN, -+ [HARDBELOW][Q_SPC] = QUOTA_NL_BHARDBELOW, -+ [SOFTBELOW][Q_SPC] = QUOTA_NL_BSOFTBELOW, -+ -+ [HARDWARN][Q_INO] = QUOTA_NL_IHARDWARN, -+ [SOFTLONGWARN][Q_INO] = QUOTA_NL_ISOFTLONGWARN, -+ [SOFTWARN][Q_INO] = QUOTA_NL_ISOFTWARN, -+ [HARDBELOW][Q_INO] = QUOTA_NL_IHARDBELOW, -+ [SOFTBELOW][Q_INO] = QUOTA_NL_ISOFTBELOW, -+}; -+ -+struct quota_msgs { -+ u8 nr; -+ struct { -+ u8 qtype; -+ u8 msg; -+ } m[QTYP_NR * Q_COUNTERS]; -+}; -+ -+static void prepare_msg(unsigned qtype, -+ enum quota_counters counter, -+ struct quota_msgs *msgs, -+ enum quota_msg msg_type) -+{ -+ BUG_ON(msgs->nr >= ARRAY_SIZE(msgs->m)); -+ -+ msgs->m[msgs->nr].qtype = qtype; -+ msgs->m[msgs->nr].msg = quota_nl[msg_type][counter]; -+ msgs->nr++; -+} -+ -+static void prepare_warning(struct memquota_counter *qc, -+ unsigned qtype, -+ enum quota_counters counter, -+ struct quota_msgs *msgs, -+ enum quota_msg msg_type) -+{ -+ if (qc->warning_issued & (1 << msg_type)) -+ return; -+ -+ prepare_msg(qtype, counter, msgs, 
msg_type); -+} -+ -+static void flush_warnings(struct bch_qid qid, -+ struct super_block *sb, -+ struct quota_msgs *msgs) -+{ -+ unsigned i; -+ -+ for (i = 0; i < msgs->nr; i++) -+ quota_send_warning(make_kqid(&init_user_ns, msgs->m[i].qtype, qid.q[i]), -+ sb->s_dev, msgs->m[i].msg); -+} -+ -+static int bch2_quota_check_limit(struct bch_fs *c, -+ unsigned qtype, -+ struct bch_memquota *mq, -+ struct quota_msgs *msgs, -+ enum quota_counters counter, -+ s64 v, -+ enum quota_acct_mode mode) -+{ -+ struct bch_memquota_type *q = &c->quotas[qtype]; -+ struct memquota_counter *qc = &mq->c[counter]; -+ u64 n = qc->v + v; -+ -+ BUG_ON((s64) n < 0); -+ -+ if (mode == KEY_TYPE_QUOTA_NOCHECK) -+ return 0; -+ -+ if (v <= 0) { -+ if (n < qc->hardlimit && -+ (qc->warning_issued & (1 << HARDWARN))) { -+ qc->warning_issued &= ~(1 << HARDWARN); -+ prepare_msg(qtype, counter, msgs, HARDBELOW); -+ } -+ -+ if (n < qc->softlimit && -+ (qc->warning_issued & (1 << SOFTWARN))) { -+ qc->warning_issued &= ~(1 << SOFTWARN); -+ prepare_msg(qtype, counter, msgs, SOFTBELOW); -+ } -+ -+ qc->warning_issued = 0; -+ return 0; -+ } -+ -+ if (qc->hardlimit && -+ qc->hardlimit < n && -+ !ignore_hardlimit(q)) { -+ if (mode == KEY_TYPE_QUOTA_PREALLOC) -+ return -EDQUOT; -+ -+ prepare_warning(qc, qtype, counter, msgs, HARDWARN); -+ } -+ -+ if (qc->softlimit && -+ qc->softlimit < n && -+ qc->timer && -+ ktime_get_real_seconds() >= qc->timer && -+ !ignore_hardlimit(q)) { -+ if (mode == KEY_TYPE_QUOTA_PREALLOC) -+ return -EDQUOT; -+ -+ prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN); -+ } -+ -+ if (qc->softlimit && -+ qc->softlimit < n && -+ qc->timer == 0) { -+ if (mode == KEY_TYPE_QUOTA_PREALLOC) -+ return -EDQUOT; -+ -+ prepare_warning(qc, qtype, counter, msgs, SOFTWARN); -+ -+ /* XXX is this the right one? 
*/ -+ qc->timer = ktime_get_real_seconds() + -+ q->limits[counter].warnlimit; -+ } -+ -+ return 0; -+} -+ -+int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid, -+ enum quota_counters counter, s64 v, -+ enum quota_acct_mode mode) -+{ -+ unsigned qtypes = enabled_qtypes(c); -+ struct bch_memquota_type *q; -+ struct bch_memquota *mq[QTYP_NR]; -+ struct quota_msgs msgs; -+ unsigned i; -+ int ret = 0; -+ -+ memset(&msgs, 0, sizeof(msgs)); -+ -+ for_each_set_qtype(c, i, q, qtypes) -+ mutex_lock_nested(&q->lock, i); -+ -+ for_each_set_qtype(c, i, q, qtypes) { -+ mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_NOFS); -+ if (!mq[i]) { -+ ret = -ENOMEM; -+ goto err; -+ } -+ -+ ret = bch2_quota_check_limit(c, i, mq[i], &msgs, counter, v, mode); -+ if (ret) -+ goto err; -+ } -+ -+ for_each_set_qtype(c, i, q, qtypes) -+ mq[i]->c[counter].v += v; -+err: -+ for_each_set_qtype(c, i, q, qtypes) -+ mutex_unlock(&q->lock); -+ -+ flush_warnings(qid, c->vfs_sb, &msgs); -+ -+ return ret; -+} -+ -+static void __bch2_quota_transfer(struct bch_memquota *src_q, -+ struct bch_memquota *dst_q, -+ enum quota_counters counter, s64 v) -+{ -+ BUG_ON(v > src_q->c[counter].v); -+ BUG_ON(v + dst_q->c[counter].v < v); -+ -+ src_q->c[counter].v -= v; -+ dst_q->c[counter].v += v; -+} -+ -+int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, -+ struct bch_qid dst, -+ struct bch_qid src, u64 space, -+ enum quota_acct_mode mode) -+{ -+ struct bch_memquota_type *q; -+ struct bch_memquota *src_q[3], *dst_q[3]; -+ struct quota_msgs msgs; -+ unsigned i; -+ int ret = 0; -+ -+ qtypes &= enabled_qtypes(c); -+ -+ memset(&msgs, 0, sizeof(msgs)); -+ -+ for_each_set_qtype(c, i, q, qtypes) -+ mutex_lock_nested(&q->lock, i); -+ -+ for_each_set_qtype(c, i, q, qtypes) { -+ src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_NOFS); -+ dst_q[i] = genradix_ptr_alloc(&q->table, dst.q[i], GFP_NOFS); -+ -+ if (!src_q[i] || !dst_q[i]) { -+ ret = -ENOMEM; -+ goto err; -+ } -+ -+ ret = 
bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_SPC, -+ dst_q[i]->c[Q_SPC].v + space, -+ mode); -+ if (ret) -+ goto err; -+ -+ ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_INO, -+ dst_q[i]->c[Q_INO].v + 1, -+ mode); -+ if (ret) -+ goto err; -+ } -+ -+ for_each_set_qtype(c, i, q, qtypes) { -+ __bch2_quota_transfer(src_q[i], dst_q[i], Q_SPC, space); -+ __bch2_quota_transfer(src_q[i], dst_q[i], Q_INO, 1); -+ } -+ -+err: -+ for_each_set_qtype(c, i, q, qtypes) -+ mutex_unlock(&q->lock); -+ -+ flush_warnings(dst, c->vfs_sb, &msgs); -+ -+ return ret; -+} -+ -+static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bkey_s_c_quota dq; -+ struct bch_memquota_type *q; -+ struct bch_memquota *mq; -+ unsigned i; -+ -+ BUG_ON(k.k->p.inode >= QTYP_NR); -+ -+ switch (k.k->type) { -+ case KEY_TYPE_quota: -+ dq = bkey_s_c_to_quota(k); -+ q = &c->quotas[k.k->p.inode]; -+ -+ mutex_lock(&q->lock); -+ mq = genradix_ptr_alloc(&q->table, k.k->p.offset, GFP_KERNEL); -+ if (!mq) { -+ mutex_unlock(&q->lock); -+ return -ENOMEM; -+ } -+ -+ for (i = 0; i < Q_COUNTERS; i++) { -+ mq->c[i].hardlimit = le64_to_cpu(dq.v->c[i].hardlimit); -+ mq->c[i].softlimit = le64_to_cpu(dq.v->c[i].softlimit); -+ } -+ -+ mutex_unlock(&q->lock); -+ } -+ -+ return 0; -+} -+ -+static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_QUOTAS, POS(type, 0), -+ BTREE_ITER_PREFETCH, k, ret) { -+ if (k.k->p.inode != type) -+ break; -+ -+ ret = __bch2_quota_set(c, k); -+ if (ret) -+ break; -+ } -+ -+ return bch2_trans_exit(&trans) ?: ret; -+} -+ -+void bch2_fs_quota_exit(struct bch_fs *c) -+{ -+ unsigned i; -+ -+ for (i = 0; i < ARRAY_SIZE(c->quotas); i++) -+ genradix_free(&c->quotas[i].table); -+} -+ -+void bch2_fs_quota_init(struct bch_fs *c) -+{ -+ unsigned i; -+ -+ for (i = 0; i < 
ARRAY_SIZE(c->quotas); i++) -+ mutex_init(&c->quotas[i].lock); -+} -+ -+static void bch2_sb_quota_read(struct bch_fs *c) -+{ -+ struct bch_sb_field_quota *sb_quota; -+ unsigned i, j; -+ -+ sb_quota = bch2_sb_get_quota(c->disk_sb.sb); -+ if (!sb_quota) -+ return; -+ -+ for (i = 0; i < QTYP_NR; i++) { -+ struct bch_memquota_type *q = &c->quotas[i]; -+ -+ for (j = 0; j < Q_COUNTERS; j++) { -+ q->limits[j].timelimit = -+ le32_to_cpu(sb_quota->q[i].c[j].timelimit); -+ q->limits[j].warnlimit = -+ le32_to_cpu(sb_quota->q[i].c[j].warnlimit); -+ } -+ } -+} -+ -+int bch2_fs_quota_read(struct bch_fs *c) -+{ -+ unsigned i, qtypes = enabled_qtypes(c); -+ struct bch_memquota_type *q; -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bch_inode_unpacked u; -+ struct bkey_s_c k; -+ int ret; -+ -+ mutex_lock(&c->sb_lock); -+ bch2_sb_quota_read(c); -+ mutex_unlock(&c->sb_lock); -+ -+ for_each_set_qtype(c, i, q, qtypes) { -+ ret = bch2_quota_init_type(c, i); -+ if (ret) -+ return ret; -+ } -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN, -+ BTREE_ITER_PREFETCH, k, ret) { -+ switch (k.k->type) { -+ case KEY_TYPE_inode: -+ ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u); -+ if (ret) -+ return ret; -+ -+ bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors, -+ KEY_TYPE_QUOTA_NOCHECK); -+ bch2_quota_acct(c, bch_qid(&u), Q_INO, 1, -+ KEY_TYPE_QUOTA_NOCHECK); -+ } -+ } -+ return bch2_trans_exit(&trans) ?: ret; -+} -+ -+/* Enable/disable/delete quotas for an entire filesystem: */ -+ -+static int bch2_quota_enable(struct super_block *sb, unsigned uflags) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ -+ if (sb->s_flags & SB_RDONLY) -+ return -EROFS; -+ -+ /* Accounting must be enabled at mount time: */ -+ if (uflags & (FS_QUOTA_UDQ_ACCT|FS_QUOTA_GDQ_ACCT|FS_QUOTA_PDQ_ACCT)) -+ return -EINVAL; -+ -+ /* Can't enable enforcement without accounting: */ -+ if ((uflags & FS_QUOTA_UDQ_ENFD) && !c->opts.usrquota) -+ return 
-EINVAL; -+ -+ if ((uflags & FS_QUOTA_GDQ_ENFD) && !c->opts.grpquota) -+ return -EINVAL; -+ -+ if (uflags & FS_QUOTA_PDQ_ENFD && !c->opts.prjquota) -+ return -EINVAL; -+ -+ mutex_lock(&c->sb_lock); -+ if (uflags & FS_QUOTA_UDQ_ENFD) -+ SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true); -+ -+ if (uflags & FS_QUOTA_GDQ_ENFD) -+ SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, true); -+ -+ if (uflags & FS_QUOTA_PDQ_ENFD) -+ SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true); -+ -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ return 0; -+} -+ -+static int bch2_quota_disable(struct super_block *sb, unsigned uflags) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ -+ if (sb->s_flags & SB_RDONLY) -+ return -EROFS; -+ -+ mutex_lock(&c->sb_lock); -+ if (uflags & FS_QUOTA_UDQ_ENFD) -+ SET_BCH_SB_USRQUOTA(c->disk_sb.sb, false); -+ -+ if (uflags & FS_QUOTA_GDQ_ENFD) -+ SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, false); -+ -+ if (uflags & FS_QUOTA_PDQ_ENFD) -+ SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, false); -+ -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ return 0; -+} -+ -+static int bch2_quota_remove(struct super_block *sb, unsigned uflags) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ int ret; -+ -+ if (sb->s_flags & SB_RDONLY) -+ return -EROFS; -+ -+ if (uflags & FS_USER_QUOTA) { -+ if (c->opts.usrquota) -+ return -EINVAL; -+ -+ ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, -+ POS(QTYP_USR, 0), -+ POS(QTYP_USR + 1, 0), -+ NULL); -+ if (ret) -+ return ret; -+ } -+ -+ if (uflags & FS_GROUP_QUOTA) { -+ if (c->opts.grpquota) -+ return -EINVAL; -+ -+ ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, -+ POS(QTYP_GRP, 0), -+ POS(QTYP_GRP + 1, 0), -+ NULL); -+ if (ret) -+ return ret; -+ } -+ -+ if (uflags & FS_PROJ_QUOTA) { -+ if (c->opts.prjquota) -+ return -EINVAL; -+ -+ ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, -+ POS(QTYP_PRJ, 0), -+ POS(QTYP_PRJ + 1, 0), -+ NULL); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+/* -+ * Return quota status information, such as enforcements, quota 
file inode -+ * numbers etc. -+ */ -+static int bch2_quota_get_state(struct super_block *sb, struct qc_state *state) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ unsigned qtypes = enabled_qtypes(c); -+ unsigned i; -+ -+ memset(state, 0, sizeof(*state)); -+ -+ for (i = 0; i < QTYP_NR; i++) { -+ state->s_state[i].flags |= QCI_SYSFILE; -+ -+ if (!(qtypes & (1 << i))) -+ continue; -+ -+ state->s_state[i].flags |= QCI_ACCT_ENABLED; -+ -+ state->s_state[i].spc_timelimit = c->quotas[i].limits[Q_SPC].timelimit; -+ state->s_state[i].spc_warnlimit = c->quotas[i].limits[Q_SPC].warnlimit; -+ -+ state->s_state[i].ino_timelimit = c->quotas[i].limits[Q_INO].timelimit; -+ state->s_state[i].ino_warnlimit = c->quotas[i].limits[Q_INO].warnlimit; -+ } -+ -+ return 0; -+} -+ -+/* -+ * Adjust quota timers & warnings -+ */ -+static int bch2_quota_set_info(struct super_block *sb, int type, -+ struct qc_info *info) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ struct bch_sb_field_quota *sb_quota; -+ struct bch_memquota_type *q; -+ -+ if (sb->s_flags & SB_RDONLY) -+ return -EROFS; -+ -+ if (type >= QTYP_NR) -+ return -EINVAL; -+ -+ if (!((1 << type) & enabled_qtypes(c))) -+ return -ESRCH; -+ -+ if (info->i_fieldmask & -+ ~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS)) -+ return -EINVAL; -+ -+ q = &c->quotas[type]; -+ -+ mutex_lock(&c->sb_lock); -+ sb_quota = bch2_sb_get_quota(c->disk_sb.sb); -+ if (!sb_quota) { -+ sb_quota = bch2_sb_resize_quota(&c->disk_sb, -+ sizeof(*sb_quota) / sizeof(u64)); -+ if (!sb_quota) -+ return -ENOSPC; -+ } -+ -+ if (info->i_fieldmask & QC_SPC_TIMER) -+ sb_quota->q[type].c[Q_SPC].timelimit = -+ cpu_to_le32(info->i_spc_timelimit); -+ -+ if (info->i_fieldmask & QC_SPC_WARNS) -+ sb_quota->q[type].c[Q_SPC].warnlimit = -+ cpu_to_le32(info->i_spc_warnlimit); -+ -+ if (info->i_fieldmask & QC_INO_TIMER) -+ sb_quota->q[type].c[Q_INO].timelimit = -+ cpu_to_le32(info->i_ino_timelimit); -+ -+ if (info->i_fieldmask & QC_INO_WARNS) -+ sb_quota->q[type].c[Q_INO].warnlimit 
= -+ cpu_to_le32(info->i_ino_warnlimit); -+ -+ bch2_sb_quota_read(c); -+ -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ return 0; -+} -+ -+/* Get/set individual quotas: */ -+ -+static void __bch2_quota_get(struct qc_dqblk *dst, struct bch_memquota *src) -+{ -+ dst->d_space = src->c[Q_SPC].v << 9; -+ dst->d_spc_hardlimit = src->c[Q_SPC].hardlimit << 9; -+ dst->d_spc_softlimit = src->c[Q_SPC].softlimit << 9; -+ dst->d_spc_timer = src->c[Q_SPC].timer; -+ dst->d_spc_warns = src->c[Q_SPC].warns; -+ -+ dst->d_ino_count = src->c[Q_INO].v; -+ dst->d_ino_hardlimit = src->c[Q_INO].hardlimit; -+ dst->d_ino_softlimit = src->c[Q_INO].softlimit; -+ dst->d_ino_timer = src->c[Q_INO].timer; -+ dst->d_ino_warns = src->c[Q_INO].warns; -+} -+ -+static int bch2_get_quota(struct super_block *sb, struct kqid kqid, -+ struct qc_dqblk *qdq) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ struct bch_memquota_type *q = &c->quotas[kqid.type]; -+ qid_t qid = from_kqid(&init_user_ns, kqid); -+ struct bch_memquota *mq; -+ -+ memset(qdq, 0, sizeof(*qdq)); -+ -+ mutex_lock(&q->lock); -+ mq = genradix_ptr(&q->table, qid); -+ if (mq) -+ __bch2_quota_get(qdq, mq); -+ mutex_unlock(&q->lock); -+ -+ return 0; -+} -+ -+static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid, -+ struct qc_dqblk *qdq) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ struct bch_memquota_type *q = &c->quotas[kqid->type]; -+ qid_t qid = from_kqid(&init_user_ns, *kqid); -+ struct genradix_iter iter; -+ struct bch_memquota *mq; -+ int ret = 0; -+ -+ mutex_lock(&q->lock); -+ -+ genradix_for_each_from(&q->table, iter, mq, qid) -+ if (memcmp(mq, page_address(ZERO_PAGE(0)), sizeof(*mq))) { -+ __bch2_quota_get(qdq, mq); -+ *kqid = make_kqid(current_user_ns(), kqid->type, iter.pos); -+ goto found; -+ } -+ -+ ret = -ENOENT; -+found: -+ mutex_unlock(&q->lock); -+ return ret; -+} -+ -+static int bch2_set_quota_trans(struct btree_trans *trans, -+ struct bkey_i_quota *new_quota, -+ struct qc_dqblk *qdq) -+{ -+ struct 
btree_iter *iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ iter = bch2_trans_get_iter(trans, BTREE_ID_QUOTAS, new_quota->k.p, -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); -+ k = bch2_btree_iter_peek_slot(iter); -+ -+ ret = bkey_err(k); -+ if (unlikely(ret)) -+ return ret; -+ -+ if (k.k->type == KEY_TYPE_quota) -+ new_quota->v = *bkey_s_c_to_quota(k).v; -+ -+ if (qdq->d_fieldmask & QC_SPC_SOFT) -+ new_quota->v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit >> 9); -+ if (qdq->d_fieldmask & QC_SPC_HARD) -+ new_quota->v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit >> 9); -+ -+ if (qdq->d_fieldmask & QC_INO_SOFT) -+ new_quota->v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit); -+ if (qdq->d_fieldmask & QC_INO_HARD) -+ new_quota->v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit); -+ -+ return bch2_trans_update(trans, iter, &new_quota->k_i, 0); -+} -+ -+static int bch2_set_quota(struct super_block *sb, struct kqid qid, -+ struct qc_dqblk *qdq) -+{ -+ struct bch_fs *c = sb->s_fs_info; -+ struct btree_trans trans; -+ struct bkey_i_quota new_quota; -+ int ret; -+ -+ if (sb->s_flags & SB_RDONLY) -+ return -EROFS; -+ -+ bkey_quota_init(&new_quota.k_i); -+ new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid)); -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOUNLOCK, -+ bch2_set_quota_trans(&trans, &new_quota, qdq)) ?: -+ __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i)); -+ -+ bch2_trans_exit(&trans); -+ -+ return ret; -+} -+ -+const struct quotactl_ops bch2_quotactl_operations = { -+ .quota_enable = bch2_quota_enable, -+ .quota_disable = bch2_quota_disable, -+ .rm_xquota = bch2_quota_remove, -+ -+ .get_state = bch2_quota_get_state, -+ .set_info = bch2_quota_set_info, -+ -+ .get_dqblk = bch2_get_quota, -+ .get_nextdqblk = bch2_get_next_quota, -+ .set_dqblk = bch2_set_quota, -+}; -+ -+#endif /* CONFIG_BCACHEFS_QUOTA */ -diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h -new file mode 100644 
-index 000000000000..51e4f9713ef0 ---- /dev/null -+++ b/fs/bcachefs/quota.h -@@ -0,0 +1,71 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_QUOTA_H -+#define _BCACHEFS_QUOTA_H -+ -+#include "inode.h" -+#include "quota_types.h" -+ -+extern const struct bch_sb_field_ops bch_sb_field_ops_quota; -+ -+const char *bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c); -+void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+ -+#define bch2_bkey_ops_quota (struct bkey_ops) { \ -+ .key_invalid = bch2_quota_invalid, \ -+ .val_to_text = bch2_quota_to_text, \ -+} -+ -+static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u) -+{ -+ return (struct bch_qid) { -+ .q[QTYP_USR] = u->bi_uid, -+ .q[QTYP_GRP] = u->bi_gid, -+ .q[QTYP_PRJ] = u->bi_project ? u->bi_project - 1 : 0, -+ }; -+} -+ -+static inline unsigned enabled_qtypes(struct bch_fs *c) -+{ -+ return ((c->opts.usrquota << QTYP_USR)| -+ (c->opts.grpquota << QTYP_GRP)| -+ (c->opts.prjquota << QTYP_PRJ)); -+} -+ -+#ifdef CONFIG_BCACHEFS_QUOTA -+ -+int bch2_quota_acct(struct bch_fs *, struct bch_qid, enum quota_counters, -+ s64, enum quota_acct_mode); -+ -+int bch2_quota_transfer(struct bch_fs *, unsigned, struct bch_qid, -+ struct bch_qid, u64, enum quota_acct_mode); -+ -+void bch2_fs_quota_exit(struct bch_fs *); -+void bch2_fs_quota_init(struct bch_fs *); -+int bch2_fs_quota_read(struct bch_fs *); -+ -+extern const struct quotactl_ops bch2_quotactl_operations; -+ -+#else -+ -+static inline int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid, -+ enum quota_counters counter, s64 v, -+ enum quota_acct_mode mode) -+{ -+ return 0; -+} -+ -+static inline int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, -+ struct bch_qid dst, -+ struct bch_qid src, u64 space, -+ enum quota_acct_mode mode) -+{ -+ return 0; -+} -+ -+static inline void bch2_fs_quota_exit(struct bch_fs *c) {} -+static inline void bch2_fs_quota_init(struct bch_fs *c) {} -+static inline int 
bch2_fs_quota_read(struct bch_fs *c) { return 0; } -+ -+#endif -+ -+#endif /* _BCACHEFS_QUOTA_H */ -diff --git a/fs/bcachefs/quota_types.h b/fs/bcachefs/quota_types.h -new file mode 100644 -index 000000000000..6a136083d389 ---- /dev/null -+++ b/fs/bcachefs/quota_types.h -@@ -0,0 +1,43 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_QUOTA_TYPES_H -+#define _BCACHEFS_QUOTA_TYPES_H -+ -+#include -+ -+struct bch_qid { -+ u32 q[QTYP_NR]; -+}; -+ -+enum quota_acct_mode { -+ KEY_TYPE_QUOTA_PREALLOC, -+ KEY_TYPE_QUOTA_WARN, -+ KEY_TYPE_QUOTA_NOCHECK, -+}; -+ -+struct memquota_counter { -+ u64 v; -+ u64 hardlimit; -+ u64 softlimit; -+ s64 timer; -+ int warns; -+ int warning_issued; -+}; -+ -+struct bch_memquota { -+ struct memquota_counter c[Q_COUNTERS]; -+}; -+ -+typedef GENRADIX(struct bch_memquota) bch_memquota_table; -+ -+struct quota_limit { -+ u32 timelimit; -+ u32 warnlimit; -+}; -+ -+struct bch_memquota_type { -+ struct quota_limit limits[Q_COUNTERS]; -+ bch_memquota_table table; -+ struct mutex lock; -+}; -+ -+#endif /* _BCACHEFS_QUOTA_TYPES_H */ -diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c -new file mode 100644 -index 000000000000..56a1f761271f ---- /dev/null -+++ b/fs/bcachefs/rebalance.c -@@ -0,0 +1,331 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "alloc_foreground.h" -+#include "btree_iter.h" -+#include "buckets.h" -+#include "clock.h" -+#include "disk_groups.h" -+#include "extents.h" -+#include "io.h" -+#include "move.h" -+#include "rebalance.h" -+#include "super-io.h" -+ -+#include -+#include -+#include -+#include -+ -+/* -+ * Check if an extent should be moved: -+ * returns -1 if it should not be moved, or -+ * device of pointer that should be moved, if known, or INT_MAX if unknown -+ */ -+static int __bch2_rebalance_pred(struct bch_fs *c, -+ struct bkey_s_c k, -+ struct bch_io_opts *io_opts) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ 
struct extent_ptr_decoded p; -+ -+ if (io_opts->background_compression && -+ !bch2_bkey_is_incompressible(k)) -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) -+ if (!p.ptr.cached && -+ p.crc.compression_type != -+ bch2_compression_opt_to_type[io_opts->background_compression]) -+ return p.ptr.dev; -+ -+ if (io_opts->background_target) -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) -+ if (!p.ptr.cached && -+ !bch2_dev_in_target(c, p.ptr.dev, io_opts->background_target)) -+ return p.ptr.dev; -+ -+ return -1; -+} -+ -+void bch2_rebalance_add_key(struct bch_fs *c, -+ struct bkey_s_c k, -+ struct bch_io_opts *io_opts) -+{ -+ atomic64_t *counter; -+ int dev; -+ -+ dev = __bch2_rebalance_pred(c, k, io_opts); -+ if (dev < 0) -+ return; -+ -+ counter = dev < INT_MAX -+ ? &bch_dev_bkey_exists(c, dev)->rebalance_work -+ : &c->rebalance.work_unknown_dev; -+ -+ if (atomic64_add_return(k.k->size, counter) == k.k->size) -+ rebalance_wakeup(c); -+} -+ -+static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg, -+ struct bkey_s_c k, -+ struct bch_io_opts *io_opts, -+ struct data_opts *data_opts) -+{ -+ if (__bch2_rebalance_pred(c, k, io_opts) >= 0) { -+ data_opts->target = io_opts->background_target; -+ data_opts->btree_insert_flags = 0; -+ return DATA_ADD_REPLICAS; -+ } else { -+ return DATA_SKIP; -+ } -+} -+ -+void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors) -+{ -+ if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) == -+ sectors) -+ rebalance_wakeup(c); -+} -+ -+struct rebalance_work { -+ int dev_most_full_idx; -+ unsigned dev_most_full_percent; -+ u64 dev_most_full_work; -+ u64 dev_most_full_capacity; -+ u64 total_work; -+}; -+ -+static void rebalance_work_accumulate(struct rebalance_work *w, -+ u64 dev_work, u64 unknown_dev, u64 capacity, int idx) -+{ -+ unsigned percent_full; -+ u64 work = dev_work + unknown_dev; -+ -+ if (work < dev_work || work < unknown_dev) -+ work = U64_MAX; -+ work = min(work, capacity); -+ -+ percent_full = 
div64_u64(work * 100, capacity); -+ -+ if (percent_full >= w->dev_most_full_percent) { -+ w->dev_most_full_idx = idx; -+ w->dev_most_full_percent = percent_full; -+ w->dev_most_full_work = work; -+ w->dev_most_full_capacity = capacity; -+ } -+ -+ if (w->total_work + dev_work >= w->total_work && -+ w->total_work + dev_work >= dev_work) -+ w->total_work += dev_work; -+} -+ -+static struct rebalance_work rebalance_work(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ struct rebalance_work ret = { .dev_most_full_idx = -1 }; -+ u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev); -+ unsigned i; -+ -+ for_each_online_member(ca, c, i) -+ rebalance_work_accumulate(&ret, -+ atomic64_read(&ca->rebalance_work), -+ unknown_dev, -+ bucket_to_sector(ca, ca->mi.nbuckets - -+ ca->mi.first_bucket), -+ i); -+ -+ rebalance_work_accumulate(&ret, -+ unknown_dev, 0, c->capacity, -1); -+ -+ return ret; -+} -+ -+static void rebalance_work_reset(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ unsigned i; -+ -+ for_each_online_member(ca, c, i) -+ atomic64_set(&ca->rebalance_work, 0); -+ -+ atomic64_set(&c->rebalance.work_unknown_dev, 0); -+} -+ -+static unsigned long curr_cputime(void) -+{ -+ u64 utime, stime; -+ -+ task_cputime_adjusted(current, &utime, &stime); -+ return nsecs_to_jiffies(utime + stime); -+} -+ -+static int bch2_rebalance_thread(void *arg) -+{ -+ struct bch_fs *c = arg; -+ struct bch_fs_rebalance *r = &c->rebalance; -+ struct io_clock *clock = &c->io_clock[WRITE]; -+ struct rebalance_work w, p; -+ unsigned long start, prev_start; -+ unsigned long prev_run_time, prev_run_cputime; -+ unsigned long cputime, prev_cputime; -+ unsigned long io_start; -+ long throttle; -+ -+ set_freezable(); -+ -+ io_start = atomic_long_read(&clock->now); -+ p = rebalance_work(c); -+ prev_start = jiffies; -+ prev_cputime = curr_cputime(); -+ -+ while (!kthread_wait_freezable(r->enabled)) { -+ cond_resched(); -+ -+ start = jiffies; -+ cputime = curr_cputime(); -+ -+ prev_run_time = 
start - prev_start; -+ prev_run_cputime = cputime - prev_cputime; -+ -+ w = rebalance_work(c); -+ BUG_ON(!w.dev_most_full_capacity); -+ -+ if (!w.total_work) { -+ r->state = REBALANCE_WAITING; -+ kthread_wait_freezable(rebalance_work(c).total_work); -+ continue; -+ } -+ -+ /* -+ * If there isn't much work to do, throttle cpu usage: -+ */ -+ throttle = prev_run_cputime * 100 / -+ max(1U, w.dev_most_full_percent) - -+ prev_run_time; -+ -+ if (w.dev_most_full_percent < 20 && throttle > 0) { -+ r->throttled_until_iotime = io_start + -+ div_u64(w.dev_most_full_capacity * -+ (20 - w.dev_most_full_percent), -+ 50); -+ -+ if (atomic_long_read(&clock->now) + clock->max_slop < -+ r->throttled_until_iotime) { -+ r->throttled_until_cputime = start + throttle; -+ r->state = REBALANCE_THROTTLED; -+ -+ bch2_kthread_io_clock_wait(clock, -+ r->throttled_until_iotime, -+ throttle); -+ continue; -+ } -+ } -+ -+ /* minimum 1 mb/sec: */ -+ r->pd.rate.rate = -+ max_t(u64, 1 << 11, -+ r->pd.rate.rate * -+ max(p.dev_most_full_percent, 1U) / -+ max(w.dev_most_full_percent, 1U)); -+ -+ io_start = atomic_long_read(&clock->now); -+ p = w; -+ prev_start = start; -+ prev_cputime = cputime; -+ -+ r->state = REBALANCE_RUNNING; -+ memset(&r->move_stats, 0, sizeof(r->move_stats)); -+ rebalance_work_reset(c); -+ -+ bch2_move_data(c, -+ /* ratelimiting disabled for now */ -+ NULL, /* &r->pd.rate, */ -+ writepoint_ptr(&c->rebalance_write_point), -+ POS_MIN, POS_MAX, -+ rebalance_pred, NULL, -+ &r->move_stats); -+ } -+ -+ return 0; -+} -+ -+void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+ struct bch_fs_rebalance *r = &c->rebalance; -+ struct rebalance_work w = rebalance_work(c); -+ char h1[21], h2[21]; -+ -+ bch2_hprint(&PBUF(h1), w.dev_most_full_work << 9); -+ bch2_hprint(&PBUF(h2), w.dev_most_full_capacity << 9); -+ pr_buf(out, "fullest_dev (%i):\t%s/%s\n", -+ w.dev_most_full_idx, h1, h2); -+ -+ bch2_hprint(&PBUF(h1), w.total_work << 9); -+ bch2_hprint(&PBUF(h2), 
c->capacity << 9); -+ pr_buf(out, "total work:\t\t%s/%s\n", h1, h2); -+ -+ pr_buf(out, "rate:\t\t\t%u\n", r->pd.rate.rate); -+ -+ switch (r->state) { -+ case REBALANCE_WAITING: -+ pr_buf(out, "waiting\n"); -+ break; -+ case REBALANCE_THROTTLED: -+ bch2_hprint(&PBUF(h1), -+ (r->throttled_until_iotime - -+ atomic_long_read(&c->io_clock[WRITE].now)) << 9); -+ pr_buf(out, "throttled for %lu sec or %s io\n", -+ (r->throttled_until_cputime - jiffies) / HZ, -+ h1); -+ break; -+ case REBALANCE_RUNNING: -+ pr_buf(out, "running\n"); -+ pr_buf(out, "pos %llu:%llu\n", -+ r->move_stats.pos.inode, -+ r->move_stats.pos.offset); -+ break; -+ } -+} -+ -+void bch2_rebalance_stop(struct bch_fs *c) -+{ -+ struct task_struct *p; -+ -+ c->rebalance.pd.rate.rate = UINT_MAX; -+ bch2_ratelimit_reset(&c->rebalance.pd.rate); -+ -+ p = rcu_dereference_protected(c->rebalance.thread, 1); -+ c->rebalance.thread = NULL; -+ -+ if (p) { -+ /* for sychronizing with rebalance_wakeup() */ -+ synchronize_rcu(); -+ -+ kthread_stop(p); -+ put_task_struct(p); -+ } -+} -+ -+int bch2_rebalance_start(struct bch_fs *c) -+{ -+ struct task_struct *p; -+ -+ if (c->opts.nochanges) -+ return 0; -+ -+ p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance"); -+ if (IS_ERR(p)) -+ return PTR_ERR(p); -+ -+ get_task_struct(p); -+ rcu_assign_pointer(c->rebalance.thread, p); -+ wake_up_process(p); -+ return 0; -+} -+ -+void bch2_fs_rebalance_init(struct bch_fs *c) -+{ -+ bch2_pd_controller_init(&c->rebalance.pd); -+ -+ atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX); -+} -diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h -new file mode 100644 -index 000000000000..7ade0bb81cce ---- /dev/null -+++ b/fs/bcachefs/rebalance.h -@@ -0,0 +1,28 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_REBALANCE_H -+#define _BCACHEFS_REBALANCE_H -+ -+#include "rebalance_types.h" -+ -+static inline void rebalance_wakeup(struct bch_fs *c) -+{ -+ struct task_struct *p; -+ -+ rcu_read_lock(); -+ p = 
rcu_dereference(c->rebalance.thread); -+ if (p) -+ wake_up_process(p); -+ rcu_read_unlock(); -+} -+ -+void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c, -+ struct bch_io_opts *); -+void bch2_rebalance_add_work(struct bch_fs *, u64); -+ -+void bch2_rebalance_work_to_text(struct printbuf *, struct bch_fs *); -+ -+void bch2_rebalance_stop(struct bch_fs *); -+int bch2_rebalance_start(struct bch_fs *); -+void bch2_fs_rebalance_init(struct bch_fs *); -+ -+#endif /* _BCACHEFS_REBALANCE_H */ -diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h -new file mode 100644 -index 000000000000..192c6be20ced ---- /dev/null -+++ b/fs/bcachefs/rebalance_types.h -@@ -0,0 +1,27 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_REBALANCE_TYPES_H -+#define _BCACHEFS_REBALANCE_TYPES_H -+ -+#include "move_types.h" -+ -+enum rebalance_state { -+ REBALANCE_WAITING, -+ REBALANCE_THROTTLED, -+ REBALANCE_RUNNING, -+}; -+ -+struct bch_fs_rebalance { -+ struct task_struct __rcu *thread; -+ struct bch_pd_controller pd; -+ -+ atomic64_t work_unknown_dev; -+ -+ enum rebalance_state state; -+ unsigned long throttled_until_iotime; -+ unsigned long throttled_until_cputime; -+ struct bch_move_stats move_stats; -+ -+ unsigned enabled:1; -+}; -+ -+#endif /* _BCACHEFS_REBALANCE_TYPES_H */ -diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c -new file mode 100644 -index 000000000000..d70fa968db50 ---- /dev/null -+++ b/fs/bcachefs/recovery.c -@@ -0,0 +1,1350 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "alloc_background.h" -+#include "btree_gc.h" -+#include "btree_update.h" -+#include "btree_update_interior.h" -+#include "btree_io.h" -+#include "buckets.h" -+#include "dirent.h" -+#include "ec.h" -+#include "error.h" -+#include "fs-common.h" -+#include "fsck.h" -+#include "journal_io.h" -+#include "journal_reclaim.h" -+#include "journal_seq_blacklist.h" -+#include "quota.h" -+#include "recovery.h" -+#include "replicas.h" 
-+#include "super-io.h" -+ -+#include -+#include -+ -+#define QSTR(n) { { { .len = strlen(n) } }, .name = n } -+ -+/* iterate over keys read from the journal: */ -+ -+static struct journal_key *journal_key_search(struct journal_keys *journal_keys, -+ enum btree_id id, unsigned level, -+ struct bpos pos) -+{ -+ size_t l = 0, r = journal_keys->nr, m; -+ -+ while (l < r) { -+ m = l + ((r - l) >> 1); -+ if ((cmp_int(id, journal_keys->d[m].btree_id) ?: -+ cmp_int(level, journal_keys->d[m].level) ?: -+ bkey_cmp(pos, journal_keys->d[m].k->k.p)) > 0) -+ l = m + 1; -+ else -+ r = m; -+ } -+ -+ BUG_ON(l < journal_keys->nr && -+ (cmp_int(id, journal_keys->d[l].btree_id) ?: -+ cmp_int(level, journal_keys->d[l].level) ?: -+ bkey_cmp(pos, journal_keys->d[l].k->k.p)) > 0); -+ -+ BUG_ON(l && -+ (cmp_int(id, journal_keys->d[l - 1].btree_id) ?: -+ cmp_int(level, journal_keys->d[l - 1].level) ?: -+ bkey_cmp(pos, journal_keys->d[l - 1].k->k.p)) <= 0); -+ -+ return l < journal_keys->nr ? journal_keys->d + l : NULL; -+} -+ -+static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter) -+{ -+ if (iter->k && -+ iter->k < iter->keys->d + iter->keys->nr && -+ iter->k->btree_id == iter->btree_id && -+ iter->k->level == iter->level) -+ return iter->k->k; -+ -+ iter->k = NULL; -+ return NULL; -+} -+ -+static void bch2_journal_iter_advance(struct journal_iter *iter) -+{ -+ if (iter->k) -+ iter->k++; -+} -+ -+static void bch2_journal_iter_init(struct journal_iter *iter, -+ struct journal_keys *journal_keys, -+ enum btree_id id, unsigned level, -+ struct bpos pos) -+{ -+ iter->btree_id = id; -+ iter->level = level; -+ iter->keys = journal_keys; -+ iter->k = journal_key_search(journal_keys, id, level, pos); -+} -+ -+static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) -+{ -+ return iter->btree -+ ? 
bch2_btree_iter_peek(iter->btree) -+ : bch2_btree_node_iter_peek_unpack(&iter->node_iter, -+ iter->b, &iter->unpacked); -+} -+ -+static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter) -+{ -+ if (iter->btree) -+ bch2_btree_iter_next(iter->btree); -+ else -+ bch2_btree_node_iter_advance(&iter->node_iter, iter->b); -+} -+ -+void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) -+{ -+ switch (iter->last) { -+ case none: -+ break; -+ case btree: -+ bch2_journal_iter_advance_btree(iter); -+ break; -+ case journal: -+ bch2_journal_iter_advance(&iter->journal); -+ break; -+ } -+ -+ iter->last = none; -+} -+ -+struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) -+{ -+ struct bkey_s_c ret; -+ -+ while (1) { -+ struct bkey_s_c btree_k = -+ bch2_journal_iter_peek_btree(iter); -+ struct bkey_s_c journal_k = -+ bkey_i_to_s_c(bch2_journal_iter_peek(&iter->journal)); -+ -+ if (btree_k.k && journal_k.k) { -+ int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p); -+ -+ if (!cmp) -+ bch2_journal_iter_advance_btree(iter); -+ -+ iter->last = cmp < 0 ? btree : journal; -+ } else if (btree_k.k) { -+ iter->last = btree; -+ } else if (journal_k.k) { -+ iter->last = journal; -+ } else { -+ iter->last = none; -+ return bkey_s_c_null; -+ } -+ -+ ret = iter->last == journal ? 
journal_k : btree_k; -+ -+ if (iter->b && -+ bkey_cmp(ret.k->p, iter->b->data->max_key) > 0) { -+ iter->journal.k = NULL; -+ iter->last = none; -+ return bkey_s_c_null; -+ } -+ -+ if (!bkey_deleted(ret.k)) -+ break; -+ -+ bch2_btree_and_journal_iter_advance(iter); -+ } -+ -+ return ret; -+} -+ -+struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *iter) -+{ -+ bch2_btree_and_journal_iter_advance(iter); -+ -+ return bch2_btree_and_journal_iter_peek(iter); -+} -+ -+void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *iter, -+ struct btree_trans *trans, -+ struct journal_keys *journal_keys, -+ enum btree_id id, struct bpos pos) -+{ -+ memset(iter, 0, sizeof(*iter)); -+ -+ iter->btree = bch2_trans_get_iter(trans, id, pos, 0); -+ bch2_journal_iter_init(&iter->journal, journal_keys, id, 0, pos); -+} -+ -+void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, -+ struct journal_keys *journal_keys, -+ struct btree *b) -+{ -+ memset(iter, 0, sizeof(*iter)); -+ -+ iter->b = b; -+ bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b); -+ bch2_journal_iter_init(&iter->journal, journal_keys, -+ b->c.btree_id, b->c.level, b->data->min_key); -+} -+ -+/* Walk btree, overlaying keys from the journal: */ -+ -+static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b, -+ struct journal_keys *journal_keys, -+ enum btree_id btree_id, -+ btree_walk_node_fn node_fn, -+ btree_walk_key_fn key_fn) -+{ -+ struct btree_and_journal_iter iter; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b); -+ -+ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { -+ ret = key_fn(c, btree_id, b->c.level, k); -+ if (ret) -+ break; -+ -+ if (b->c.level) { -+ struct btree *child; -+ BKEY_PADDED(k) tmp; -+ -+ bkey_reassemble(&tmp.k, k); -+ k = bkey_i_to_s_c(&tmp.k); -+ -+ bch2_btree_and_journal_iter_advance(&iter); -+ -+ if (b->c.level > 0) { 
-+ child = bch2_btree_node_get_noiter(c, &tmp.k, -+ b->c.btree_id, b->c.level - 1); -+ ret = PTR_ERR_OR_ZERO(child); -+ if (ret) -+ break; -+ -+ ret = (node_fn ? node_fn(c, b) : 0) ?: -+ bch2_btree_and_journal_walk_recurse(c, child, -+ journal_keys, btree_id, node_fn, key_fn); -+ six_unlock_read(&child->c.lock); -+ -+ if (ret) -+ break; -+ } -+ } else { -+ bch2_btree_and_journal_iter_advance(&iter); -+ } -+ } -+ -+ return ret; -+} -+ -+int bch2_btree_and_journal_walk(struct bch_fs *c, struct journal_keys *journal_keys, -+ enum btree_id btree_id, -+ btree_walk_node_fn node_fn, -+ btree_walk_key_fn key_fn) -+{ -+ struct btree *b = c->btree_roots[btree_id].b; -+ int ret = 0; -+ -+ if (btree_node_fake(b)) -+ return 0; -+ -+ six_lock_read(&b->c.lock, NULL, NULL); -+ ret = (node_fn ? node_fn(c, b) : 0) ?: -+ bch2_btree_and_journal_walk_recurse(c, b, journal_keys, btree_id, -+ node_fn, key_fn) ?: -+ key_fn(c, btree_id, b->c.level + 1, bkey_i_to_s_c(&b->key)); -+ six_unlock_read(&b->c.lock); -+ -+ return ret; -+} -+ -+/* sort and dedup all keys in the journal: */ -+ -+void bch2_journal_entries_free(struct list_head *list) -+{ -+ -+ while (!list_empty(list)) { -+ struct journal_replay *i = -+ list_first_entry(list, struct journal_replay, list); -+ list_del(&i->list); -+ kvpfree(i, offsetof(struct journal_replay, j) + -+ vstruct_bytes(&i->j)); -+ } -+} -+ -+/* -+ * When keys compare equal, oldest compares first: -+ */ -+static int journal_sort_key_cmp(const void *_l, const void *_r) -+{ -+ const struct journal_key *l = _l; -+ const struct journal_key *r = _r; -+ -+ return cmp_int(l->btree_id, r->btree_id) ?: -+ cmp_int(l->level, r->level) ?: -+ bkey_cmp(l->k->k.p, r->k->k.p) ?: -+ cmp_int(l->journal_seq, r->journal_seq) ?: -+ cmp_int(l->journal_offset, r->journal_offset); -+} -+ -+void bch2_journal_keys_free(struct journal_keys *keys) -+{ -+ kvfree(keys->d); -+ keys->d = NULL; -+ keys->nr = 0; -+} -+ -+static struct journal_keys journal_keys_sort(struct list_head 
*journal_entries) -+{ -+ struct journal_replay *p; -+ struct jset_entry *entry; -+ struct bkey_i *k, *_n; -+ struct journal_keys keys = { NULL }; -+ struct journal_key *src, *dst; -+ size_t nr_keys = 0; -+ -+ if (list_empty(journal_entries)) -+ return keys; -+ -+ keys.journal_seq_base = -+ le64_to_cpu(list_last_entry(journal_entries, -+ struct journal_replay, list)->j.last_seq); -+ -+ list_for_each_entry(p, journal_entries, list) { -+ if (le64_to_cpu(p->j.seq) < keys.journal_seq_base) -+ continue; -+ -+ for_each_jset_key(k, _n, entry, &p->j) -+ nr_keys++; -+ } -+ -+ -+ keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL); -+ if (!keys.d) -+ goto err; -+ -+ list_for_each_entry(p, journal_entries, list) { -+ if (le64_to_cpu(p->j.seq) < keys.journal_seq_base) -+ continue; -+ -+ for_each_jset_key(k, _n, entry, &p->j) -+ keys.d[keys.nr++] = (struct journal_key) { -+ .btree_id = entry->btree_id, -+ .level = entry->level, -+ .k = k, -+ .journal_seq = le64_to_cpu(p->j.seq) - -+ keys.journal_seq_base, -+ .journal_offset = k->_data - p->j._data, -+ }; -+ } -+ -+ sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_key_cmp, NULL); -+ -+ src = dst = keys.d; -+ while (src < keys.d + keys.nr) { -+ while (src + 1 < keys.d + keys.nr && -+ src[0].btree_id == src[1].btree_id && -+ src[0].level == src[1].level && -+ !bkey_cmp(src[0].k->k.p, src[1].k->k.p)) -+ src++; -+ -+ *dst++ = *src++; -+ } -+ -+ keys.nr = dst - keys.d; -+err: -+ return keys; -+} -+ -+/* journal replay: */ -+ -+static void replay_now_at(struct journal *j, u64 seq) -+{ -+ BUG_ON(seq < j->replay_journal_seq); -+ BUG_ON(seq > j->replay_journal_seq_end); -+ -+ while (j->replay_journal_seq < seq) -+ bch2_journal_pin_put(j, j->replay_journal_seq++); -+} -+ -+static int bch2_extent_replay_key(struct bch_fs *c, enum btree_id btree_id, -+ struct bkey_i *k) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter, *split_iter; -+ /* -+ * We might cause compressed extents to be split, so we need to pass in -+ * a 
disk_reservation: -+ */ -+ struct disk_reservation disk_res = -+ bch2_disk_reservation_init(c, 0); -+ struct bkey_i *split; -+ struct bpos atomic_end; -+ /* -+ * Some extents aren't equivalent - w.r.t. what the triggers do -+ * - if they're split: -+ */ -+ bool remark_if_split = bch2_bkey_sectors_compressed(bkey_i_to_s_c(k)) || -+ k->k.type == KEY_TYPE_reflink_p; -+ bool remark = false; -+ int ret; -+ -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); -+retry: -+ bch2_trans_begin(&trans); -+ -+ iter = bch2_trans_get_iter(&trans, btree_id, -+ bkey_start_pos(&k->k), -+ BTREE_ITER_INTENT); -+ -+ do { -+ ret = bch2_btree_iter_traverse(iter); -+ if (ret) -+ goto err; -+ -+ atomic_end = bpos_min(k->k.p, iter->l[0].b->key.k.p); -+ -+ split = bch2_trans_kmalloc(&trans, bkey_bytes(&k->k)); -+ ret = PTR_ERR_OR_ZERO(split); -+ if (ret) -+ goto err; -+ -+ if (!remark && -+ remark_if_split && -+ bkey_cmp(atomic_end, k->k.p) < 0) { -+ ret = bch2_disk_reservation_add(c, &disk_res, -+ k->k.size * -+ bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(k)), -+ BCH_DISK_RESERVATION_NOFAIL); -+ BUG_ON(ret); -+ -+ remark = true; -+ } -+ -+ bkey_copy(split, k); -+ bch2_cut_front(iter->pos, split); -+ bch2_cut_back(atomic_end, split); -+ -+ split_iter = bch2_trans_copy_iter(&trans, iter); -+ ret = PTR_ERR_OR_ZERO(split_iter); -+ if (ret) -+ goto err; -+ -+ /* -+ * It's important that we don't go through the -+ * extent_handle_overwrites() and extent_update_to_keys() path -+ * here: journal replay is supposed to treat extents like -+ * regular keys -+ */ -+ __bch2_btree_iter_set_pos(split_iter, split->k.p, false); -+ bch2_trans_update(&trans, split_iter, split, -+ BTREE_TRIGGER_NORUN); -+ -+ bch2_btree_iter_set_pos(iter, split->k.p); -+ -+ if (remark) { -+ ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(split), -+ 0, split->k.size, -+ BTREE_TRIGGER_INSERT); -+ if (ret) -+ goto err; -+ } -+ } while (bkey_cmp(iter->pos, k->k.p) < 0); -+ -+ if (remark) { -+ ret = bch2_trans_mark_key(&trans, 
bkey_i_to_s_c(k), -+ 0, -((s64) k->k.size), -+ BTREE_TRIGGER_OVERWRITE); -+ if (ret) -+ goto err; -+ } -+ -+ ret = bch2_trans_commit(&trans, &disk_res, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW| -+ BTREE_INSERT_JOURNAL_REPLAY); -+err: -+ if (ret == -EINTR) -+ goto retry; -+ -+ bch2_disk_reservation_put(c, &disk_res); -+ -+ return bch2_trans_exit(&trans) ?: ret; -+} -+ -+static int __bch2_journal_replay_key(struct btree_trans *trans, -+ enum btree_id id, unsigned level, -+ struct bkey_i *k) -+{ -+ struct btree_iter *iter; -+ int ret; -+ -+ iter = bch2_trans_get_node_iter(trans, id, k->k.p, -+ BTREE_MAX_DEPTH, level, -+ BTREE_ITER_INTENT); -+ if (IS_ERR(iter)) -+ return PTR_ERR(iter); -+ -+ /* -+ * iter->flags & BTREE_ITER_IS_EXTENTS triggers the update path to run -+ * extent_handle_overwrites() and extent_update_to_keys() - but we don't -+ * want that here, journal replay is supposed to treat extents like -+ * regular keys: -+ */ -+ __bch2_btree_iter_set_pos(iter, k->k.p, false); -+ -+ ret = bch2_btree_iter_traverse(iter) ?: -+ bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id id, -+ unsigned level, struct bkey_i *k) -+{ -+ return bch2_trans_do(c, NULL, NULL, -+ BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_LAZY_RW| -+ BTREE_INSERT_JOURNAL_REPLAY, -+ __bch2_journal_replay_key(&trans, id, level, k)); -+} -+ -+static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k) -+{ -+ struct btree_iter *iter; -+ int ret; -+ -+ iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, k->k.p, -+ BTREE_ITER_CACHED| -+ BTREE_ITER_CACHED_NOFILL| -+ BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(iter) ?: -+ bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+static int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k) -+{ -+ return bch2_trans_do(c, NULL, NULL, -+ 
BTREE_INSERT_NOFAIL| -+ BTREE_INSERT_USE_RESERVE| -+ BTREE_INSERT_LAZY_RW| -+ BTREE_INSERT_JOURNAL_REPLAY, -+ __bch2_alloc_replay_key(&trans, k)); -+} -+ -+static int journal_sort_seq_cmp(const void *_l, const void *_r) -+{ -+ const struct journal_key *l = _l; -+ const struct journal_key *r = _r; -+ -+ return cmp_int(r->level, l->level) ?: -+ cmp_int(l->journal_seq, r->journal_seq) ?: -+ cmp_int(l->btree_id, r->btree_id) ?: -+ bkey_cmp(l->k->k.p, r->k->k.p); -+} -+ -+static int bch2_journal_replay(struct bch_fs *c, -+ struct journal_keys keys) -+{ -+ struct journal *j = &c->journal; -+ struct journal_key *i; -+ u64 seq; -+ int ret; -+ -+ sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL); -+ -+ if (keys.nr) -+ replay_now_at(j, keys.journal_seq_base); -+ -+ seq = j->replay_journal_seq; -+ -+ /* -+ * First replay updates to the alloc btree - these will only update the -+ * btree key cache: -+ */ -+ for_each_journal_key(keys, i) { -+ cond_resched(); -+ -+ if (!i->level && i->btree_id == BTREE_ID_ALLOC) { -+ j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; -+ ret = bch2_alloc_replay_key(c, i->k); -+ if (ret) -+ goto err; -+ } -+ } -+ -+ /* -+ * Next replay updates to interior btree nodes: -+ */ -+ for_each_journal_key(keys, i) { -+ cond_resched(); -+ -+ if (i->level) { -+ j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; -+ ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k); -+ if (ret) -+ goto err; -+ } -+ } -+ -+ /* -+ * Now that the btree is in a consistent state, we can start journal -+ * reclaim (which will be flushing entries from the btree key cache back -+ * to the btree: -+ */ -+ set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); -+ set_bit(JOURNAL_RECLAIM_STARTED, &j->flags); -+ -+ j->replay_journal_seq = seq; -+ -+ /* -+ * Now replay leaf node updates: -+ */ -+ for_each_journal_key(keys, i) { -+ cond_resched(); -+ -+ if (i->level || i->btree_id == BTREE_ID_ALLOC) -+ continue; -+ -+ 
replay_now_at(j, keys.journal_seq_base + i->journal_seq); -+ -+ ret = i->k->k.size -+ ? bch2_extent_replay_key(c, i->btree_id, i->k) -+ : bch2_journal_replay_key(c, i->btree_id, i->level, i->k); -+ if (ret) -+ goto err; -+ } -+ -+ replay_now_at(j, j->replay_journal_seq_end); -+ j->replay_journal_seq = 0; -+ -+ bch2_journal_set_replay_done(j); -+ bch2_journal_flush_all_pins(j); -+ return bch2_journal_error(j); -+err: -+ bch_err(c, "journal replay: error %d while replaying key", ret); -+ return ret; -+} -+ -+static bool journal_empty(struct list_head *journal) -+{ -+ return list_empty(journal) || -+ journal_entry_empty(&list_last_entry(journal, -+ struct journal_replay, list)->j); -+} -+ -+static int -+verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c, -+ struct list_head *journal) -+{ -+ struct journal_replay *i = -+ list_last_entry(journal, struct journal_replay, list); -+ u64 start_seq = le64_to_cpu(i->j.last_seq); -+ u64 end_seq = le64_to_cpu(i->j.seq); -+ u64 seq = start_seq; -+ int ret = 0; -+ -+ list_for_each_entry(i, journal, list) { -+ if (le64_to_cpu(i->j.seq) < start_seq) -+ continue; -+ -+ fsck_err_on(seq != le64_to_cpu(i->j.seq), c, -+ "journal entries %llu-%llu missing! 
(replaying %llu-%llu)", -+ seq, le64_to_cpu(i->j.seq) - 1, -+ start_seq, end_seq); -+ -+ seq = le64_to_cpu(i->j.seq); -+ -+ fsck_err_on(bch2_journal_seq_is_blacklisted(c, seq, false), c, -+ "found blacklisted journal entry %llu", seq); -+ -+ do { -+ seq++; -+ } while (bch2_journal_seq_is_blacklisted(c, seq, false)); -+ } -+fsck_err: -+ return ret; -+} -+ -+/* journal replay early: */ -+ -+static int journal_replay_entry_early(struct bch_fs *c, -+ struct jset_entry *entry) -+{ -+ int ret = 0; -+ -+ switch (entry->type) { -+ case BCH_JSET_ENTRY_btree_root: { -+ struct btree_root *r; -+ -+ if (entry->btree_id >= BTREE_ID_NR) { -+ bch_err(c, "filesystem has unknown btree type %u", -+ entry->btree_id); -+ return -EINVAL; -+ } -+ -+ r = &c->btree_roots[entry->btree_id]; -+ -+ if (entry->u64s) { -+ r->level = entry->level; -+ bkey_copy(&r->key, &entry->start[0]); -+ r->error = 0; -+ } else { -+ r->error = -EIO; -+ } -+ r->alive = true; -+ break; -+ } -+ case BCH_JSET_ENTRY_usage: { -+ struct jset_entry_usage *u = -+ container_of(entry, struct jset_entry_usage, entry); -+ -+ switch (entry->btree_id) { -+ case FS_USAGE_RESERVED: -+ if (entry->level < BCH_REPLICAS_MAX) -+ c->usage_base->persistent_reserved[entry->level] = -+ le64_to_cpu(u->v); -+ break; -+ case FS_USAGE_INODES: -+ c->usage_base->nr_inodes = le64_to_cpu(u->v); -+ break; -+ case FS_USAGE_KEY_VERSION: -+ atomic64_set(&c->key_version, -+ le64_to_cpu(u->v)); -+ break; -+ } -+ -+ break; -+ } -+ case BCH_JSET_ENTRY_data_usage: { -+ struct jset_entry_data_usage *u = -+ container_of(entry, struct jset_entry_data_usage, entry); -+ ret = bch2_replicas_set_usage(c, &u->r, -+ le64_to_cpu(u->v)); -+ break; -+ } -+ case BCH_JSET_ENTRY_blacklist: { -+ struct jset_entry_blacklist *bl_entry = -+ container_of(entry, struct jset_entry_blacklist, entry); -+ -+ ret = bch2_journal_seq_blacklist_add(c, -+ le64_to_cpu(bl_entry->seq), -+ le64_to_cpu(bl_entry->seq) + 1); -+ break; -+ } -+ case BCH_JSET_ENTRY_blacklist_v2: { -+ struct 
jset_entry_blacklist_v2 *bl_entry = -+ container_of(entry, struct jset_entry_blacklist_v2, entry); -+ -+ ret = bch2_journal_seq_blacklist_add(c, -+ le64_to_cpu(bl_entry->start), -+ le64_to_cpu(bl_entry->end) + 1); -+ break; -+ } -+ } -+ -+ return ret; -+} -+ -+static int journal_replay_early(struct bch_fs *c, -+ struct bch_sb_field_clean *clean, -+ struct list_head *journal) -+{ -+ struct jset_entry *entry; -+ int ret; -+ -+ if (clean) { -+ c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock); -+ c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock); -+ -+ for (entry = clean->start; -+ entry != vstruct_end(&clean->field); -+ entry = vstruct_next(entry)) { -+ ret = journal_replay_entry_early(c, entry); -+ if (ret) -+ return ret; -+ } -+ } else { -+ struct journal_replay *i = -+ list_last_entry(journal, struct journal_replay, list); -+ -+ c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock); -+ c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock); -+ -+ list_for_each_entry(i, journal, list) -+ vstruct_for_each(&i->j, entry) { -+ ret = journal_replay_entry_early(c, entry); -+ if (ret) -+ return ret; -+ } -+ } -+ -+ bch2_fs_usage_initialize(c); -+ -+ return 0; -+} -+ -+/* sb clean section: */ -+ -+static struct bkey_i *btree_root_find(struct bch_fs *c, -+ struct bch_sb_field_clean *clean, -+ struct jset *j, -+ enum btree_id id, unsigned *level) -+{ -+ struct bkey_i *k; -+ struct jset_entry *entry, *start, *end; -+ -+ if (clean) { -+ start = clean->start; -+ end = vstruct_end(&clean->field); -+ } else { -+ start = j->start; -+ end = vstruct_last(j); -+ } -+ -+ for (entry = start; entry < end; entry = vstruct_next(entry)) -+ if (entry->type == BCH_JSET_ENTRY_btree_root && -+ entry->btree_id == id) -+ goto found; -+ -+ return NULL; -+found: -+ if (!entry->u64s) -+ return ERR_PTR(-EINVAL); -+ -+ k = entry->start; -+ *level = entry->level; -+ return k; -+} -+ -+static int verify_superblock_clean(struct bch_fs *c, -+ struct bch_sb_field_clean 
**cleanp, -+ struct jset *j) -+{ -+ unsigned i; -+ struct bch_sb_field_clean *clean = *cleanp; -+ int ret = 0; -+ -+ if (!c->sb.clean || !j) -+ return 0; -+ -+ if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, -+ "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", -+ le64_to_cpu(clean->journal_seq), -+ le64_to_cpu(j->seq))) { -+ kfree(clean); -+ *cleanp = NULL; -+ return 0; -+ } -+ -+ mustfix_fsck_err_on(j->read_clock != clean->read_clock, c, -+ "superblock read clock %u doesn't match journal %u after clean shutdown", -+ clean->read_clock, j->read_clock); -+ mustfix_fsck_err_on(j->write_clock != clean->write_clock, c, -+ "superblock write clock %u doesn't match journal %u after clean shutdown", -+ clean->write_clock, j->write_clock); -+ -+ for (i = 0; i < BTREE_ID_NR; i++) { -+ char buf1[200], buf2[200]; -+ struct bkey_i *k1, *k2; -+ unsigned l1 = 0, l2 = 0; -+ -+ k1 = btree_root_find(c, clean, NULL, i, &l1); -+ k2 = btree_root_find(c, NULL, j, i, &l2); -+ -+ if (!k1 && !k2) -+ continue; -+ -+ mustfix_fsck_err_on(!k1 || !k2 || -+ IS_ERR(k1) || -+ IS_ERR(k2) || -+ k1->k.u64s != k2->k.u64s || -+ memcmp(k1, k2, bkey_bytes(k1)) || -+ l1 != l2, c, -+ "superblock btree root %u doesn't match journal after clean shutdown\n" -+ "sb: l=%u %s\n" -+ "journal: l=%u %s\n", i, -+ l1, (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(k1)), buf1), -+ l2, (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(k2)), buf2)); -+ } -+fsck_err: -+ return ret; -+} -+ -+static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c) -+{ -+ struct bch_sb_field_clean *clean, *sb_clean; -+ int ret; -+ -+ mutex_lock(&c->sb_lock); -+ sb_clean = bch2_sb_get_clean(c->disk_sb.sb); -+ -+ if (fsck_err_on(!sb_clean, c, -+ "superblock marked clean but clean section not present")) { -+ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); -+ c->sb.clean = false; -+ mutex_unlock(&c->sb_lock); -+ return NULL; -+ } -+ -+ clean = kmemdup(sb_clean, 
vstruct_bytes(&sb_clean->field), -+ GFP_KERNEL); -+ if (!clean) { -+ mutex_unlock(&c->sb_lock); -+ return ERR_PTR(-ENOMEM); -+ } -+ -+ if (le16_to_cpu(c->disk_sb.sb->version) < -+ bcachefs_metadata_version_bkey_renumber) -+ bch2_sb_clean_renumber(clean, READ); -+ -+ mutex_unlock(&c->sb_lock); -+ -+ return clean; -+fsck_err: -+ mutex_unlock(&c->sb_lock); -+ return ERR_PTR(ret); -+} -+ -+static int read_btree_roots(struct bch_fs *c) -+{ -+ unsigned i; -+ int ret = 0; -+ -+ for (i = 0; i < BTREE_ID_NR; i++) { -+ struct btree_root *r = &c->btree_roots[i]; -+ -+ if (!r->alive) -+ continue; -+ -+ if (i == BTREE_ID_ALLOC && -+ c->opts.reconstruct_alloc) { -+ c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); -+ continue; -+ } -+ -+ -+ if (r->error) { -+ __fsck_err(c, i == BTREE_ID_ALLOC -+ ? FSCK_CAN_IGNORE : 0, -+ "invalid btree root %s", -+ bch2_btree_ids[i]); -+ if (i == BTREE_ID_ALLOC) -+ c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); -+ } -+ -+ ret = bch2_btree_root_read(c, i, &r->key, r->level); -+ if (ret) { -+ __fsck_err(c, i == BTREE_ID_ALLOC -+ ? 
FSCK_CAN_IGNORE : 0, -+ "error reading btree root %s", -+ bch2_btree_ids[i]); -+ if (i == BTREE_ID_ALLOC) -+ c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); -+ } -+ } -+ -+ for (i = 0; i < BTREE_ID_NR; i++) -+ if (!c->btree_roots[i].b) -+ bch2_btree_root_alloc(c, i); -+fsck_err: -+ return ret; -+} -+ -+int bch2_fs_recovery(struct bch_fs *c) -+{ -+ const char *err = "cannot allocate memory"; -+ struct bch_sb_field_clean *clean = NULL; -+ u64 journal_seq; -+ bool write_sb = false, need_write_alloc = false; -+ int ret; -+ -+ if (c->sb.clean) -+ clean = read_superblock_clean(c); -+ ret = PTR_ERR_OR_ZERO(clean); -+ if (ret) -+ goto err; -+ -+ if (c->sb.clean) -+ bch_info(c, "recovering from clean shutdown, journal seq %llu", -+ le64_to_cpu(clean->journal_seq)); -+ -+ if (!c->replicas.entries || -+ c->opts.rebuild_replicas) { -+ bch_info(c, "building replicas info"); -+ set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); -+ } -+ -+ if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) { -+ struct jset *j; -+ -+ ret = bch2_journal_read(c, &c->journal_entries); -+ if (ret) -+ goto err; -+ -+ if (mustfix_fsck_err_on(c->sb.clean && !journal_empty(&c->journal_entries), c, -+ "filesystem marked clean but journal not empty")) { -+ c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); -+ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); -+ c->sb.clean = false; -+ } -+ -+ if (!c->sb.clean && list_empty(&c->journal_entries)) { -+ bch_err(c, "no journal entries found"); -+ ret = BCH_FSCK_REPAIR_IMPOSSIBLE; -+ goto err; -+ } -+ -+ c->journal_keys = journal_keys_sort(&c->journal_entries); -+ if (!c->journal_keys.d) { -+ ret = -ENOMEM; -+ goto err; -+ } -+ -+ j = &list_last_entry(&c->journal_entries, -+ struct journal_replay, list)->j; -+ -+ ret = verify_superblock_clean(c, &clean, j); -+ if (ret) -+ goto err; -+ -+ journal_seq = le64_to_cpu(j->seq) + 1; -+ } else { -+ journal_seq = le64_to_cpu(clean->journal_seq) + 1; -+ } -+ -+ if (!c->sb.clean && -+ !(c->sb.features & (1ULL << 
BCH_FEATURE_extents_above_btree_updates))) { -+ bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix"); -+ ret = -EINVAL; -+ goto err; -+ } -+ -+ ret = journal_replay_early(c, clean, &c->journal_entries); -+ if (ret) -+ goto err; -+ -+ if (!c->sb.clean) { -+ ret = bch2_journal_seq_blacklist_add(c, -+ journal_seq, -+ journal_seq + 4); -+ if (ret) { -+ bch_err(c, "error creating new journal seq blacklist entry"); -+ goto err; -+ } -+ -+ journal_seq += 4; -+ -+ /* -+ * The superblock needs to be written before we do any btree -+ * node writes: it will be in the read_write() path -+ */ -+ } -+ -+ ret = bch2_blacklist_table_initialize(c); -+ -+ if (!list_empty(&c->journal_entries)) { -+ ret = verify_journal_entries_not_blacklisted_or_missing(c, -+ &c->journal_entries); -+ if (ret) -+ goto err; -+ } -+ -+ ret = bch2_fs_journal_start(&c->journal, journal_seq, -+ &c->journal_entries); -+ if (ret) -+ goto err; -+ -+ ret = read_btree_roots(c); -+ if (ret) -+ goto err; -+ -+ bch_verbose(c, "starting alloc read"); -+ err = "error reading allocation information"; -+ ret = bch2_alloc_read(c, &c->journal_keys); -+ if (ret) -+ goto err; -+ bch_verbose(c, "alloc read done"); -+ -+ bch_verbose(c, "starting stripes_read"); -+ err = "error reading stripes"; -+ ret = bch2_stripes_read(c, &c->journal_keys); -+ if (ret) -+ goto err; -+ bch_verbose(c, "stripes_read done"); -+ -+ set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); -+ -+ if ((c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) && -+ !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA))) { -+ /* -+ * interior btree node updates aren't consistent with the -+ * journal; after an unclean shutdown we have to walk all -+ * pointers to metadata: -+ */ -+ bch_info(c, "starting metadata mark and sweep"); -+ err = "error in mark and sweep"; -+ ret = bch2_gc(c, &c->journal_keys, true, true); -+ if (ret < 0) -+ goto err; -+ if (ret) -+ need_write_alloc = true; -+ bch_verbose(c, "mark and 
sweep done"); -+ } -+ -+ if (c->opts.fsck || -+ !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) || -+ test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { -+ bch_info(c, "starting mark and sweep"); -+ err = "error in mark and sweep"; -+ ret = bch2_gc(c, &c->journal_keys, true, false); -+ if (ret < 0) -+ goto err; -+ if (ret) -+ need_write_alloc = true; -+ bch_verbose(c, "mark and sweep done"); -+ } -+ -+ clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); -+ set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); -+ -+ /* -+ * Skip past versions that might have possibly been used (as nonces), -+ * but hadn't had their pointers written: -+ */ -+ if (c->sb.encryption_type && !c->sb.clean) -+ atomic64_add(1 << 16, &c->key_version); -+ -+ if (c->opts.norecovery) -+ goto out; -+ -+ bch_verbose(c, "starting journal replay"); -+ err = "journal replay failed"; -+ ret = bch2_journal_replay(c, c->journal_keys); -+ if (ret) -+ goto err; -+ bch_verbose(c, "journal replay done"); -+ -+ if (need_write_alloc && !c->opts.nochanges) { -+ /* -+ * note that even when filesystem was clean there might be work -+ * to do here, if we ran gc (because of fsck) which recalculated -+ * oldest_gen: -+ */ -+ bch_verbose(c, "writing allocation info"); -+ err = "error writing out alloc info"; -+ ret = bch2_stripes_write(c, BTREE_INSERT_LAZY_RW) ?: -+ bch2_alloc_write(c, BTREE_INSERT_LAZY_RW); -+ if (ret) { -+ bch_err(c, "error writing alloc info"); -+ goto err; -+ } -+ bch_verbose(c, "alloc write done"); -+ -+ set_bit(BCH_FS_ALLOC_WRITTEN, &c->flags); -+ } -+ -+ if (!c->sb.clean) { -+ if (!(c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { -+ bch_info(c, "checking inode link counts"); -+ err = "error in recovery"; -+ ret = bch2_fsck_inode_nlink(c); -+ if (ret) -+ goto err; -+ bch_verbose(c, "check inodes done"); -+ -+ } else { -+ bch_verbose(c, "checking for deleted inodes"); -+ err = "error in recovery"; -+ ret = bch2_fsck_walk_inodes_only(c); -+ if (ret) -+ goto err; -+ bch_verbose(c, "check inodes 
done"); -+ } -+ } -+ -+ if (c->opts.fsck) { -+ bch_info(c, "starting fsck"); -+ err = "error in fsck"; -+ ret = bch2_fsck_full(c); -+ if (ret) -+ goto err; -+ bch_verbose(c, "fsck done"); -+ } -+ -+ if (enabled_qtypes(c)) { -+ bch_verbose(c, "reading quotas"); -+ ret = bch2_fs_quota_read(c); -+ if (ret) -+ goto err; -+ bch_verbose(c, "quotas done"); -+ } -+ -+ mutex_lock(&c->sb_lock); -+ if (c->opts.version_upgrade) { -+ if (c->sb.version < bcachefs_metadata_version_new_versioning) -+ c->disk_sb.sb->version_min = -+ le16_to_cpu(bcachefs_metadata_version_min); -+ c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current); -+ c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; -+ write_sb = true; -+ } -+ -+ if (!test_bit(BCH_FS_ERROR, &c->flags)) { -+ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; -+ write_sb = true; -+ } -+ -+ if (c->opts.fsck && -+ !test_bit(BCH_FS_ERROR, &c->flags)) { -+ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; -+ SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0); -+ write_sb = true; -+ } -+ -+ if (write_sb) -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ if (c->journal_seq_blacklist_table && -+ c->journal_seq_blacklist_table->nr > 128) -+ queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work); -+out: -+ ret = 0; -+err: -+fsck_err: -+ set_bit(BCH_FS_FSCK_DONE, &c->flags); -+ bch2_flush_fsck_errs(c); -+ -+ if (!c->opts.keep_journal) { -+ bch2_journal_keys_free(&c->journal_keys); -+ bch2_journal_entries_free(&c->journal_entries); -+ } -+ kfree(clean); -+ if (ret) -+ bch_err(c, "Error in recovery: %s (%i)", err, ret); -+ else -+ bch_verbose(c, "ret %i", ret); -+ return ret; -+} -+ -+int bch2_fs_initialize(struct bch_fs *c) -+{ -+ struct bch_inode_unpacked root_inode, lostfound_inode; -+ struct bkey_inode_buf packed_inode; -+ struct qstr lostfound = QSTR("lost+found"); -+ const char *err = "cannot allocate memory"; -+ struct bch_dev *ca; -+ LIST_HEAD(journal); -+ unsigned i; -+ int ret; -+ 
-+ bch_notice(c, "initializing new filesystem"); -+ -+ mutex_lock(&c->sb_lock); -+ for_each_online_member(ca, c, i) -+ bch2_mark_dev_superblock(c, ca, 0); -+ mutex_unlock(&c->sb_lock); -+ -+ mutex_lock(&c->sb_lock); -+ c->disk_sb.sb->version = c->disk_sb.sb->version_min = -+ le16_to_cpu(bcachefs_metadata_version_current); -+ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; -+ c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; -+ -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); -+ set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); -+ -+ for (i = 0; i < BTREE_ID_NR; i++) -+ bch2_btree_root_alloc(c, i); -+ -+ set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); -+ set_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags); -+ -+ err = "unable to allocate journal buckets"; -+ for_each_online_member(ca, c, i) { -+ ret = bch2_dev_journal_alloc(ca); -+ if (ret) { -+ percpu_ref_put(&ca->io_ref); -+ goto err; -+ } -+ } -+ -+ /* -+ * journal_res_get() will crash if called before this has -+ * set up the journal.pin FIFO and journal.cur pointer: -+ */ -+ bch2_fs_journal_start(&c->journal, 1, &journal); -+ bch2_journal_set_replay_done(&c->journal); -+ -+ err = "error going read-write"; -+ ret = bch2_fs_read_write_early(c); -+ if (ret) -+ goto err; -+ -+ /* -+ * Write out the superblock and journal buckets, now that we can do -+ * btree updates -+ */ -+ err = "error writing alloc info"; -+ ret = bch2_alloc_write(c, 0); -+ if (ret) -+ goto err; -+ -+ bch2_inode_init(c, &root_inode, 0, 0, -+ S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); -+ root_inode.bi_inum = BCACHEFS_ROOT_INO; -+ bch2_inode_pack(&packed_inode, &root_inode); -+ -+ err = "error creating root directory"; -+ ret = bch2_btree_insert(c, BTREE_ID_INODES, -+ &packed_inode.inode.k_i, -+ NULL, NULL, 0); -+ if (ret) -+ goto err; -+ -+ bch2_inode_init_early(c, &lostfound_inode); -+ -+ err = "error creating lost+found"; -+ ret = bch2_trans_do(c, NULL, NULL, 0, -+ 
bch2_create_trans(&trans, BCACHEFS_ROOT_INO, -+ &root_inode, &lostfound_inode, -+ &lostfound, -+ 0, 0, S_IFDIR|0700, 0, -+ NULL, NULL)); -+ if (ret) -+ goto err; -+ -+ if (enabled_qtypes(c)) { -+ ret = bch2_fs_quota_read(c); -+ if (ret) -+ goto err; -+ } -+ -+ err = "error writing first journal entry"; -+ ret = bch2_journal_meta(&c->journal); -+ if (ret) -+ goto err; -+ -+ mutex_lock(&c->sb_lock); -+ SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); -+ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); -+ -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ return 0; -+err: -+ pr_err("Error initializing new filesystem: %s (%i)", err, ret); -+ return ret; -+} -diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h -new file mode 100644 -index 000000000000..a66827c9addf ---- /dev/null -+++ b/fs/bcachefs/recovery.h -@@ -0,0 +1,60 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_RECOVERY_H -+#define _BCACHEFS_RECOVERY_H -+ -+#define for_each_journal_key(keys, i) \ -+ for (i = (keys).d; i < (keys).d + (keys).nr; (i)++) -+ -+struct journal_iter { -+ enum btree_id btree_id; -+ unsigned level; -+ struct journal_keys *keys; -+ struct journal_key *k; -+}; -+ -+/* -+ * Iterate over keys in the btree, with keys from the journal overlaid on top: -+ */ -+ -+struct btree_and_journal_iter { -+ struct btree_iter *btree; -+ -+ struct btree *b; -+ struct btree_node_iter node_iter; -+ struct bkey unpacked; -+ -+ struct journal_iter journal; -+ -+ enum last_key_returned { -+ none, -+ btree, -+ journal, -+ } last; -+}; -+ -+void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); -+struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); -+struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *); -+ -+void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *, -+ struct btree_trans *, -+ struct journal_keys *, -+ enum btree_id, struct bpos); -+void 
bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, -+ struct journal_keys *, -+ struct btree *); -+ -+typedef int (*btree_walk_node_fn)(struct bch_fs *c, struct btree *b); -+typedef int (*btree_walk_key_fn)(struct bch_fs *c, enum btree_id id, -+ unsigned level, struct bkey_s_c k); -+ -+int bch2_btree_and_journal_walk(struct bch_fs *, struct journal_keys *, enum btree_id, -+ btree_walk_node_fn, btree_walk_key_fn); -+ -+void bch2_journal_keys_free(struct journal_keys *); -+void bch2_journal_entries_free(struct list_head *); -+ -+int bch2_fs_recovery(struct bch_fs *); -+int bch2_fs_initialize(struct bch_fs *); -+ -+#endif /* _BCACHEFS_RECOVERY_H */ -diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c -new file mode 100644 -index 000000000000..3c473f1380a6 ---- /dev/null -+++ b/fs/bcachefs/reflink.c -@@ -0,0 +1,303 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "bkey_on_stack.h" -+#include "btree_update.h" -+#include "extents.h" -+#include "inode.h" -+#include "io.h" -+#include "reflink.h" -+ -+#include -+ -+/* reflink pointers */ -+ -+const char *bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); -+ -+ if (bkey_val_bytes(p.k) != sizeof(*p.v)) -+ return "incorrect value size"; -+ -+ return NULL; -+} -+ -+void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); -+ -+ pr_buf(out, "idx %llu", le64_to_cpu(p.v->idx)); -+} -+ -+enum merge_result bch2_reflink_p_merge(struct bch_fs *c, -+ struct bkey_s _l, struct bkey_s _r) -+{ -+ struct bkey_s_reflink_p l = bkey_s_to_reflink_p(_l); -+ struct bkey_s_reflink_p r = bkey_s_to_reflink_p(_r); -+ -+ if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx)) -+ return BCH_MERGE_NOMERGE; -+ -+ if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { -+ bch2_key_resize(l.k, KEY_SIZE_MAX); -+ 
bch2_cut_front_s(l.k->p, _r); -+ return BCH_MERGE_PARTIAL; -+ } -+ -+ bch2_key_resize(l.k, l.k->size + r.k->size); -+ -+ return BCH_MERGE_MERGE; -+} -+ -+/* indirect extents */ -+ -+const char *bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k) -+{ -+ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); -+ -+ if (bkey_val_bytes(r.k) < sizeof(*r.v)) -+ return "incorrect value size"; -+ -+ return bch2_bkey_ptrs_invalid(c, k); -+} -+ -+void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); -+ -+ pr_buf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount)); -+ -+ bch2_bkey_ptrs_to_text(out, c, k); -+} -+ -+static int bch2_make_extent_indirect(struct btree_trans *trans, -+ struct btree_iter *extent_iter, -+ struct bkey_i_extent *e) -+{ -+ struct bch_fs *c = trans->c; -+ struct btree_iter *reflink_iter; -+ struct bkey_s_c k; -+ struct bkey_i_reflink_v *r_v; -+ struct bkey_i_reflink_p *r_p; -+ int ret; -+ -+ for_each_btree_key(trans, reflink_iter, BTREE_ID_REFLINK, -+ POS(0, c->reflink_hint), -+ BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) { -+ if (reflink_iter->pos.inode) { -+ bch2_btree_iter_set_pos(reflink_iter, POS_MIN); -+ continue; -+ } -+ -+ if (bkey_deleted(k.k) && e->k.size <= k.k->size) -+ break; -+ } -+ -+ if (ret) -+ goto err; -+ -+ /* rewind iter to start of hole, if necessary: */ -+ bch2_btree_iter_set_pos(reflink_iter, bkey_start_pos(k.k)); -+ -+ r_v = bch2_trans_kmalloc(trans, sizeof(*r_v) + bkey_val_bytes(&e->k)); -+ ret = PTR_ERR_OR_ZERO(r_v); -+ if (ret) -+ goto err; -+ -+ bkey_reflink_v_init(&r_v->k_i); -+ r_v->k.p = reflink_iter->pos; -+ bch2_key_resize(&r_v->k, e->k.size); -+ r_v->k.version = e->k.version; -+ -+ set_bkey_val_u64s(&r_v->k, bkey_val_u64s(&r_v->k) + -+ bkey_val_u64s(&e->k)); -+ r_v->v.refcount = 0; -+ memcpy(r_v->v.start, e->v.start, bkey_val_bytes(&e->k)); -+ -+ bch2_trans_update(trans, reflink_iter, &r_v->k_i, 0); -+ -+ r_p 
= bch2_trans_kmalloc(trans, sizeof(*r_p)); -+ if (IS_ERR(r_p)) -+ return PTR_ERR(r_p); -+ -+ e->k.type = KEY_TYPE_reflink_p; -+ r_p = bkey_i_to_reflink_p(&e->k_i); -+ set_bkey_val_bytes(&r_p->k, sizeof(r_p->v)); -+ r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); -+ -+ bch2_trans_update(trans, extent_iter, &r_p->k_i, 0); -+err: -+ if (!IS_ERR(reflink_iter)) -+ c->reflink_hint = reflink_iter->pos.offset; -+ bch2_trans_iter_put(trans, reflink_iter); -+ -+ return ret; -+} -+ -+static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) -+{ -+ struct bkey_s_c k = bch2_btree_iter_peek(iter); -+ int ret; -+ -+ for_each_btree_key_continue(iter, 0, k, ret) { -+ if (bkey_cmp(iter->pos, end) >= 0) -+ return bkey_s_c_null; -+ -+ if (k.k->type == KEY_TYPE_extent || -+ k.k->type == KEY_TYPE_reflink_p) -+ break; -+ } -+ -+ return k; -+} -+ -+s64 bch2_remap_range(struct bch_fs *c, -+ struct bpos dst_start, struct bpos src_start, -+ u64 remap_sectors, u64 *journal_seq, -+ u64 new_i_size, s64 *i_sectors_delta) -+{ -+ struct btree_trans trans; -+ struct btree_iter *dst_iter, *src_iter; -+ struct bkey_s_c src_k; -+ BKEY_PADDED(k) new_dst; -+ struct bkey_on_stack new_src; -+ struct bpos dst_end = dst_start, src_end = src_start; -+ struct bpos dst_want, src_want; -+ u64 src_done, dst_done; -+ int ret = 0, ret2 = 0; -+ -+ if (!c->opts.reflink) -+ return -EOPNOTSUPP; -+ -+ if (!percpu_ref_tryget(&c->writes)) -+ return -EROFS; -+ -+ bch2_check_set_feature(c, BCH_FEATURE_reflink); -+ -+ dst_end.offset += remap_sectors; -+ src_end.offset += remap_sectors; -+ -+ bkey_on_stack_init(&new_src); -+ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); -+ -+ src_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start, -+ BTREE_ITER_INTENT); -+ dst_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, dst_start, -+ BTREE_ITER_INTENT); -+ -+ while (1) { -+ bch2_trans_begin(&trans); -+ -+ trans.mem_top = 0; -+ -+ if (fatal_signal_pending(current)) { -+ ret = -EINTR; -+ goto 
err; -+ } -+ -+ src_k = get_next_src(src_iter, src_end); -+ ret = bkey_err(src_k); -+ if (ret) -+ goto btree_err; -+ -+ src_done = bpos_min(src_iter->pos, src_end).offset - -+ src_start.offset; -+ dst_want = POS(dst_start.inode, dst_start.offset + src_done); -+ -+ if (bkey_cmp(dst_iter->pos, dst_want) < 0) { -+ ret = bch2_fpunch_at(&trans, dst_iter, dst_want, -+ journal_seq, i_sectors_delta); -+ if (ret) -+ goto btree_err; -+ continue; -+ } -+ -+ BUG_ON(bkey_cmp(dst_iter->pos, dst_want)); -+ -+ if (!bkey_cmp(dst_iter->pos, dst_end)) -+ break; -+ -+ if (src_k.k->type == KEY_TYPE_extent) { -+ bkey_on_stack_reassemble(&new_src, c, src_k); -+ src_k = bkey_i_to_s_c(new_src.k); -+ -+ bch2_cut_front(src_iter->pos, new_src.k); -+ bch2_cut_back(src_end, new_src.k); -+ -+ ret = bch2_make_extent_indirect(&trans, src_iter, -+ bkey_i_to_extent(new_src.k)); -+ if (ret) -+ goto btree_err; -+ -+ BUG_ON(src_k.k->type != KEY_TYPE_reflink_p); -+ } -+ -+ if (src_k.k->type == KEY_TYPE_reflink_p) { -+ struct bkey_s_c_reflink_p src_p = -+ bkey_s_c_to_reflink_p(src_k); -+ struct bkey_i_reflink_p *dst_p = -+ bkey_reflink_p_init(&new_dst.k); -+ -+ u64 offset = le64_to_cpu(src_p.v->idx) + -+ (src_iter->pos.offset - -+ bkey_start_offset(src_k.k)); -+ -+ dst_p->v.idx = cpu_to_le64(offset); -+ } else { -+ BUG(); -+ } -+ -+ new_dst.k.k.p = dst_iter->pos; -+ bch2_key_resize(&new_dst.k.k, -+ min(src_k.k->p.offset - src_iter->pos.offset, -+ dst_end.offset - dst_iter->pos.offset)); -+ -+ ret = bch2_extent_update(&trans, dst_iter, &new_dst.k, -+ NULL, journal_seq, -+ new_i_size, i_sectors_delta); -+ if (ret) -+ goto btree_err; -+ -+ dst_done = dst_iter->pos.offset - dst_start.offset; -+ src_want = POS(src_start.inode, src_start.offset + dst_done); -+ bch2_btree_iter_set_pos(src_iter, src_want); -+btree_err: -+ if (ret == -EINTR) -+ ret = 0; -+ if (ret) -+ goto err; -+ } -+ -+ BUG_ON(bkey_cmp(dst_iter->pos, dst_end)); -+err: -+ BUG_ON(bkey_cmp(dst_iter->pos, dst_end) > 0); -+ -+ dst_done = 
dst_iter->pos.offset - dst_start.offset; -+ new_i_size = min(dst_iter->pos.offset << 9, new_i_size); -+ -+ bch2_trans_begin(&trans); -+ -+ do { -+ struct bch_inode_unpacked inode_u; -+ struct btree_iter *inode_iter; -+ -+ inode_iter = bch2_inode_peek(&trans, &inode_u, -+ dst_start.inode, BTREE_ITER_INTENT); -+ ret2 = PTR_ERR_OR_ZERO(inode_iter); -+ -+ if (!ret2 && -+ inode_u.bi_size < new_i_size) { -+ inode_u.bi_size = new_i_size; -+ ret2 = bch2_inode_write(&trans, inode_iter, &inode_u) ?: -+ bch2_trans_commit(&trans, NULL, journal_seq, 0); -+ } -+ } while (ret2 == -EINTR); -+ -+ ret = bch2_trans_exit(&trans) ?: ret; -+ bkey_on_stack_exit(&new_src, c); -+ -+ percpu_ref_put(&c->writes); -+ -+ return dst_done ?: ret ?: ret2; -+} -diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h -new file mode 100644 -index 000000000000..5445c1cf0797 ---- /dev/null -+++ b/fs/bcachefs/reflink.h -@@ -0,0 +1,31 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_REFLINK_H -+#define _BCACHEFS_REFLINK_H -+ -+const char *bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c); -+void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, -+ struct bkey_s_c); -+enum merge_result bch2_reflink_p_merge(struct bch_fs *, -+ struct bkey_s, struct bkey_s); -+ -+#define bch2_bkey_ops_reflink_p (struct bkey_ops) { \ -+ .key_invalid = bch2_reflink_p_invalid, \ -+ .val_to_text = bch2_reflink_p_to_text, \ -+ .key_merge = bch2_reflink_p_merge, \ -+} -+ -+const char *bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c); -+void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, -+ struct bkey_s_c); -+ -+ -+#define bch2_bkey_ops_reflink_v (struct bkey_ops) { \ -+ .key_invalid = bch2_reflink_v_invalid, \ -+ .val_to_text = bch2_reflink_v_to_text, \ -+ .swab = bch2_ptr_swab, \ -+} -+ -+s64 bch2_remap_range(struct bch_fs *, struct bpos, struct bpos, -+ u64, u64 *, u64, s64 *); -+ -+#endif /* _BCACHEFS_REFLINK_H */ -diff --git a/fs/bcachefs/replicas.c 
b/fs/bcachefs/replicas.c -new file mode 100644 -index 000000000000..6b6506c68609 ---- /dev/null -+++ b/fs/bcachefs/replicas.c -@@ -0,0 +1,1059 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "buckets.h" -+#include "journal.h" -+#include "replicas.h" -+#include "super-io.h" -+ -+static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, -+ struct bch_replicas_cpu *); -+ -+/* Replicas tracking - in memory: */ -+ -+static inline int u8_cmp(u8 l, u8 r) -+{ -+ return cmp_int(l, r); -+} -+ -+static void verify_replicas_entry(struct bch_replicas_entry *e) -+{ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ unsigned i; -+ -+ BUG_ON(e->data_type >= BCH_DATA_NR); -+ BUG_ON(!e->nr_devs); -+ BUG_ON(e->nr_required > 1 && -+ e->nr_required >= e->nr_devs); -+ -+ for (i = 0; i + 1 < e->nr_devs; i++) -+ BUG_ON(e->devs[i] >= e->devs[i + 1]); -+#endif -+} -+ -+static void replicas_entry_sort(struct bch_replicas_entry *e) -+{ -+ bubble_sort(e->devs, e->nr_devs, u8_cmp); -+} -+ -+static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) -+{ -+ eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL); -+} -+ -+void bch2_replicas_entry_to_text(struct printbuf *out, -+ struct bch_replicas_entry *e) -+{ -+ unsigned i; -+ -+ pr_buf(out, "%s: %u/%u [", -+ bch2_data_types[e->data_type], -+ e->nr_required, -+ e->nr_devs); -+ -+ for (i = 0; i < e->nr_devs; i++) -+ pr_buf(out, i ? 
" %u" : "%u", e->devs[i]); -+ pr_buf(out, "]"); -+} -+ -+void bch2_cpu_replicas_to_text(struct printbuf *out, -+ struct bch_replicas_cpu *r) -+{ -+ struct bch_replicas_entry *e; -+ bool first = true; -+ -+ for_each_cpu_replicas_entry(r, e) { -+ if (!first) -+ pr_buf(out, " "); -+ first = false; -+ -+ bch2_replicas_entry_to_text(out, e); -+ } -+} -+ -+static void extent_to_replicas(struct bkey_s_c k, -+ struct bch_replicas_entry *r) -+{ -+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ -+ r->nr_required = 1; -+ -+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { -+ if (p.ptr.cached) -+ continue; -+ -+ if (!p.has_ec) -+ r->devs[r->nr_devs++] = p.ptr.dev; -+ else -+ r->nr_required = 0; -+ } -+} -+ -+static void stripe_to_replicas(struct bkey_s_c k, -+ struct bch_replicas_entry *r) -+{ -+ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); -+ const struct bch_extent_ptr *ptr; -+ -+ r->nr_required = s.v->nr_blocks - s.v->nr_redundant; -+ -+ for (ptr = s.v->ptrs; -+ ptr < s.v->ptrs + s.v->nr_blocks; -+ ptr++) -+ r->devs[r->nr_devs++] = ptr->dev; -+} -+ -+void bch2_bkey_to_replicas(struct bch_replicas_entry *e, -+ struct bkey_s_c k) -+{ -+ e->nr_devs = 0; -+ -+ switch (k.k->type) { -+ case KEY_TYPE_btree_ptr: -+ case KEY_TYPE_btree_ptr_v2: -+ e->data_type = BCH_DATA_btree; -+ extent_to_replicas(k, e); -+ break; -+ case KEY_TYPE_extent: -+ case KEY_TYPE_reflink_v: -+ e->data_type = BCH_DATA_user; -+ extent_to_replicas(k, e); -+ break; -+ case KEY_TYPE_stripe: -+ e->data_type = BCH_DATA_user; -+ stripe_to_replicas(k, e); -+ break; -+ } -+ -+ replicas_entry_sort(e); -+} -+ -+void bch2_devlist_to_replicas(struct bch_replicas_entry *e, -+ enum bch_data_type data_type, -+ struct bch_devs_list devs) -+{ -+ unsigned i; -+ -+ BUG_ON(!data_type || -+ data_type == BCH_DATA_sb || -+ data_type >= BCH_DATA_NR); -+ -+ e->data_type = data_type; -+ e->nr_devs = 0; -+ e->nr_required = 1; -+ -+ for (i = 0; i < devs.nr; 
i++) -+ e->devs[e->nr_devs++] = devs.devs[i]; -+ -+ replicas_entry_sort(e); -+} -+ -+static struct bch_replicas_cpu -+cpu_replicas_add_entry(struct bch_replicas_cpu *old, -+ struct bch_replicas_entry *new_entry) -+{ -+ unsigned i; -+ struct bch_replicas_cpu new = { -+ .nr = old->nr + 1, -+ .entry_size = max_t(unsigned, old->entry_size, -+ replicas_entry_bytes(new_entry)), -+ }; -+ -+ BUG_ON(!new_entry->data_type); -+ verify_replicas_entry(new_entry); -+ -+ new.entries = kcalloc(new.nr, new.entry_size, GFP_NOIO); -+ if (!new.entries) -+ return new; -+ -+ for (i = 0; i < old->nr; i++) -+ memcpy(cpu_replicas_entry(&new, i), -+ cpu_replicas_entry(old, i), -+ old->entry_size); -+ -+ memcpy(cpu_replicas_entry(&new, old->nr), -+ new_entry, -+ replicas_entry_bytes(new_entry)); -+ -+ bch2_cpu_replicas_sort(&new); -+ return new; -+} -+ -+static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, -+ struct bch_replicas_entry *search) -+{ -+ int idx, entry_size = replicas_entry_bytes(search); -+ -+ if (unlikely(entry_size > r->entry_size)) -+ return -1; -+ -+ verify_replicas_entry(search); -+ -+#define entry_cmp(_l, _r, size) memcmp(_l, _r, entry_size) -+ idx = eytzinger0_find(r->entries, r->nr, r->entry_size, -+ entry_cmp, search); -+#undef entry_cmp -+ -+ return idx < r->nr ? 
idx : -1; -+} -+ -+int bch2_replicas_entry_idx(struct bch_fs *c, -+ struct bch_replicas_entry *search) -+{ -+ replicas_entry_sort(search); -+ -+ return __replicas_entry_idx(&c->replicas, search); -+} -+ -+static bool __replicas_has_entry(struct bch_replicas_cpu *r, -+ struct bch_replicas_entry *search) -+{ -+ return __replicas_entry_idx(r, search) >= 0; -+} -+ -+bool bch2_replicas_marked(struct bch_fs *c, -+ struct bch_replicas_entry *search) -+{ -+ bool marked; -+ -+ if (!search->nr_devs) -+ return true; -+ -+ verify_replicas_entry(search); -+ -+ percpu_down_read(&c->mark_lock); -+ marked = __replicas_has_entry(&c->replicas, search) && -+ (likely((!c->replicas_gc.entries)) || -+ __replicas_has_entry(&c->replicas_gc, search)); -+ percpu_up_read(&c->mark_lock); -+ -+ return marked; -+} -+ -+static void __replicas_table_update(struct bch_fs_usage *dst, -+ struct bch_replicas_cpu *dst_r, -+ struct bch_fs_usage *src, -+ struct bch_replicas_cpu *src_r) -+{ -+ int src_idx, dst_idx; -+ -+ *dst = *src; -+ -+ for (src_idx = 0; src_idx < src_r->nr; src_idx++) { -+ if (!src->replicas[src_idx]) -+ continue; -+ -+ dst_idx = __replicas_entry_idx(dst_r, -+ cpu_replicas_entry(src_r, src_idx)); -+ BUG_ON(dst_idx < 0); -+ -+ dst->replicas[dst_idx] = src->replicas[src_idx]; -+ } -+} -+ -+static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p, -+ struct bch_replicas_cpu *dst_r, -+ struct bch_fs_usage __percpu *src_p, -+ struct bch_replicas_cpu *src_r) -+{ -+ unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr; -+ struct bch_fs_usage *dst, *src = (void *) -+ bch2_acc_percpu_u64s((void *) src_p, src_nr); -+ -+ preempt_disable(); -+ dst = this_cpu_ptr(dst_p); -+ preempt_enable(); -+ -+ __replicas_table_update(dst, dst_r, src, src_r); -+} -+ -+/* -+ * Resize filesystem accounting: -+ */ -+static int replicas_table_update(struct bch_fs *c, -+ struct bch_replicas_cpu *new_r) -+{ -+ struct bch_fs_usage __percpu *new_usage[2] = { NULL, NULL }; -+ 
struct bch_fs_usage *new_scratch = NULL; -+ struct bch_fs_usage __percpu *new_gc = NULL; -+ struct bch_fs_usage *new_base = NULL; -+ unsigned bytes = sizeof(struct bch_fs_usage) + -+ sizeof(u64) * new_r->nr; -+ int ret = -ENOMEM; -+ -+ if (!(new_base = kzalloc(bytes, GFP_NOIO)) || -+ !(new_usage[0] = __alloc_percpu_gfp(bytes, sizeof(u64), -+ GFP_NOIO)) || -+ !(new_usage[1] = __alloc_percpu_gfp(bytes, sizeof(u64), -+ GFP_NOIO)) || -+ !(new_scratch = kmalloc(bytes, GFP_NOIO)) || -+ (c->usage_gc && -+ !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO)))) { -+ bch_err(c, "error updating replicas table: memory allocation failure"); -+ goto err; -+ } -+ -+ if (c->usage_base) -+ __replicas_table_update(new_base, new_r, -+ c->usage_base, &c->replicas); -+ if (c->usage[0]) -+ __replicas_table_update_pcpu(new_usage[0], new_r, -+ c->usage[0], &c->replicas); -+ if (c->usage[1]) -+ __replicas_table_update_pcpu(new_usage[1], new_r, -+ c->usage[1], &c->replicas); -+ if (c->usage_gc) -+ __replicas_table_update_pcpu(new_gc, new_r, -+ c->usage_gc, &c->replicas); -+ -+ swap(c->usage_base, new_base); -+ swap(c->usage[0], new_usage[0]); -+ swap(c->usage[1], new_usage[1]); -+ swap(c->usage_scratch, new_scratch); -+ swap(c->usage_gc, new_gc); -+ swap(c->replicas, *new_r); -+ ret = 0; -+err: -+ free_percpu(new_gc); -+ kfree(new_scratch); -+ free_percpu(new_usage[1]); -+ free_percpu(new_usage[0]); -+ kfree(new_base); -+ return ret; -+} -+ -+static unsigned reserve_journal_replicas(struct bch_fs *c, -+ struct bch_replicas_cpu *r) -+{ -+ struct bch_replicas_entry *e; -+ unsigned journal_res_u64s = 0; -+ -+ /* nr_inodes: */ -+ journal_res_u64s += -+ DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)); -+ -+ /* key_version: */ -+ journal_res_u64s += -+ DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)); -+ -+ /* persistent_reserved: */ -+ journal_res_u64s += -+ DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)) * -+ BCH_REPLICAS_MAX; -+ -+ 
for_each_cpu_replicas_entry(r, e) -+ journal_res_u64s += -+ DIV_ROUND_UP(sizeof(struct jset_entry_data_usage) + -+ e->nr_devs, sizeof(u64)); -+ return journal_res_u64s; -+} -+ -+noinline -+static int bch2_mark_replicas_slowpath(struct bch_fs *c, -+ struct bch_replicas_entry *new_entry) -+{ -+ struct bch_replicas_cpu new_r, new_gc; -+ int ret = 0; -+ -+ verify_replicas_entry(new_entry); -+ -+ memset(&new_r, 0, sizeof(new_r)); -+ memset(&new_gc, 0, sizeof(new_gc)); -+ -+ mutex_lock(&c->sb_lock); -+ -+ if (c->replicas_gc.entries && -+ !__replicas_has_entry(&c->replicas_gc, new_entry)) { -+ new_gc = cpu_replicas_add_entry(&c->replicas_gc, new_entry); -+ if (!new_gc.entries) -+ goto err; -+ } -+ -+ if (!__replicas_has_entry(&c->replicas, new_entry)) { -+ new_r = cpu_replicas_add_entry(&c->replicas, new_entry); -+ if (!new_r.entries) -+ goto err; -+ -+ ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r); -+ if (ret) -+ goto err; -+ -+ bch2_journal_entry_res_resize(&c->journal, -+ &c->replicas_journal_res, -+ reserve_journal_replicas(c, &new_r)); -+ } -+ -+ if (!new_r.entries && -+ !new_gc.entries) -+ goto out; -+ -+ /* allocations done, now commit: */ -+ -+ if (new_r.entries) -+ bch2_write_super(c); -+ -+ /* don't update in memory replicas until changes are persistent */ -+ percpu_down_write(&c->mark_lock); -+ if (new_r.entries) -+ ret = replicas_table_update(c, &new_r); -+ if (new_gc.entries) -+ swap(new_gc, c->replicas_gc); -+ percpu_up_write(&c->mark_lock); -+out: -+ mutex_unlock(&c->sb_lock); -+ -+ kfree(new_r.entries); -+ kfree(new_gc.entries); -+ -+ return ret; -+err: -+ bch_err(c, "error adding replicas entry: memory allocation failure"); -+ ret = -ENOMEM; -+ goto out; -+} -+ -+static int __bch2_mark_replicas(struct bch_fs *c, -+ struct bch_replicas_entry *r, -+ bool check) -+{ -+ return likely(bch2_replicas_marked(c, r)) ? 0 -+ : check ? 
-1 -+ : bch2_mark_replicas_slowpath(c, r); -+} -+ -+int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r) -+{ -+ return __bch2_mark_replicas(c, r, false); -+} -+ -+static int __bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k, -+ bool check) -+{ -+ struct bch_replicas_padded search; -+ struct bch_devs_list cached = bch2_bkey_cached_devs(k); -+ unsigned i; -+ int ret; -+ -+ for (i = 0; i < cached.nr; i++) { -+ bch2_replicas_entry_cached(&search.e, cached.devs[i]); -+ -+ ret = __bch2_mark_replicas(c, &search.e, check); -+ if (ret) -+ return ret; -+ } -+ -+ bch2_bkey_to_replicas(&search.e, k); -+ -+ return __bch2_mark_replicas(c, &search.e, check); -+} -+ -+bool bch2_bkey_replicas_marked(struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ return __bch2_mark_bkey_replicas(c, k, true) == 0; -+} -+ -+int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) -+{ -+ return __bch2_mark_bkey_replicas(c, k, false); -+} -+ -+int bch2_replicas_gc_end(struct bch_fs *c, int ret) -+{ -+ unsigned i; -+ -+ lockdep_assert_held(&c->replicas_gc_lock); -+ -+ mutex_lock(&c->sb_lock); -+ percpu_down_write(&c->mark_lock); -+ -+ /* -+ * this is kind of crappy; the replicas gc mechanism needs to be ripped -+ * out -+ */ -+ -+ for (i = 0; i < c->replicas.nr; i++) { -+ struct bch_replicas_entry *e = -+ cpu_replicas_entry(&c->replicas, i); -+ struct bch_replicas_cpu n; -+ -+ if (!__replicas_has_entry(&c->replicas_gc, e) && -+ (c->usage_base->replicas[i] || -+ percpu_u64_get(&c->usage[0]->replicas[i]) || -+ percpu_u64_get(&c->usage[1]->replicas[i]))) { -+ n = cpu_replicas_add_entry(&c->replicas_gc, e); -+ if (!n.entries) { -+ ret = -ENOSPC; -+ goto err; -+ } -+ -+ swap(n, c->replicas_gc); -+ kfree(n.entries); -+ } -+ } -+ -+ if (bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc)) { -+ ret = -ENOSPC; -+ goto err; -+ } -+ -+ ret = replicas_table_update(c, &c->replicas_gc); -+err: -+ kfree(c->replicas_gc.entries); -+ c->replicas_gc.entries = NULL; -+ -+ 
percpu_up_write(&c->mark_lock); -+ -+ if (!ret) -+ bch2_write_super(c); -+ -+ mutex_unlock(&c->sb_lock); -+ -+ return ret; -+} -+ -+int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) -+{ -+ struct bch_replicas_entry *e; -+ unsigned i = 0; -+ -+ lockdep_assert_held(&c->replicas_gc_lock); -+ -+ mutex_lock(&c->sb_lock); -+ BUG_ON(c->replicas_gc.entries); -+ -+ c->replicas_gc.nr = 0; -+ c->replicas_gc.entry_size = 0; -+ -+ for_each_cpu_replicas_entry(&c->replicas, e) -+ if (!((1 << e->data_type) & typemask)) { -+ c->replicas_gc.nr++; -+ c->replicas_gc.entry_size = -+ max_t(unsigned, c->replicas_gc.entry_size, -+ replicas_entry_bytes(e)); -+ } -+ -+ c->replicas_gc.entries = kcalloc(c->replicas_gc.nr, -+ c->replicas_gc.entry_size, -+ GFP_NOIO); -+ if (!c->replicas_gc.entries) { -+ mutex_unlock(&c->sb_lock); -+ bch_err(c, "error allocating c->replicas_gc"); -+ return -ENOMEM; -+ } -+ -+ for_each_cpu_replicas_entry(&c->replicas, e) -+ if (!((1 << e->data_type) & typemask)) -+ memcpy(cpu_replicas_entry(&c->replicas_gc, i++), -+ e, c->replicas_gc.entry_size); -+ -+ bch2_cpu_replicas_sort(&c->replicas_gc); -+ mutex_unlock(&c->sb_lock); -+ -+ return 0; -+} -+ -+int bch2_replicas_gc2(struct bch_fs *c) -+{ -+ struct bch_replicas_cpu new = { 0 }; -+ unsigned i, nr; -+ int ret = 0; -+ -+ bch2_journal_meta(&c->journal); -+retry: -+ nr = READ_ONCE(c->replicas.nr); -+ new.entry_size = READ_ONCE(c->replicas.entry_size); -+ new.entries = kcalloc(nr, new.entry_size, GFP_KERNEL); -+ if (!new.entries) { -+ bch_err(c, "error allocating c->replicas_gc"); -+ return -ENOMEM; -+ } -+ -+ mutex_lock(&c->sb_lock); -+ percpu_down_write(&c->mark_lock); -+ -+ if (nr != c->replicas.nr || -+ new.entry_size != c->replicas.entry_size) { -+ percpu_up_write(&c->mark_lock); -+ mutex_unlock(&c->sb_lock); -+ kfree(new.entries); -+ goto retry; -+ } -+ -+ for (i = 0; i < c->replicas.nr; i++) { -+ struct bch_replicas_entry *e = -+ cpu_replicas_entry(&c->replicas, i); -+ -+ if (e->data_type == 
BCH_DATA_journal || -+ c->usage_base->replicas[i] || -+ percpu_u64_get(&c->usage[0]->replicas[i]) || -+ percpu_u64_get(&c->usage[1]->replicas[i])) -+ memcpy(cpu_replicas_entry(&new, new.nr++), -+ e, new.entry_size); -+ } -+ -+ bch2_cpu_replicas_sort(&new); -+ -+ if (bch2_cpu_replicas_to_sb_replicas(c, &new)) { -+ ret = -ENOSPC; -+ goto err; -+ } -+ -+ ret = replicas_table_update(c, &new); -+err: -+ kfree(new.entries); -+ -+ percpu_up_write(&c->mark_lock); -+ -+ if (!ret) -+ bch2_write_super(c); -+ -+ mutex_unlock(&c->sb_lock); -+ -+ return ret; -+} -+ -+int bch2_replicas_set_usage(struct bch_fs *c, -+ struct bch_replicas_entry *r, -+ u64 sectors) -+{ -+ int ret, idx = bch2_replicas_entry_idx(c, r); -+ -+ if (idx < 0) { -+ struct bch_replicas_cpu n; -+ -+ n = cpu_replicas_add_entry(&c->replicas, r); -+ if (!n.entries) -+ return -ENOMEM; -+ -+ ret = replicas_table_update(c, &n); -+ if (ret) -+ return ret; -+ -+ kfree(n.entries); -+ -+ idx = bch2_replicas_entry_idx(c, r); -+ BUG_ON(ret < 0); -+ } -+ -+ c->usage_base->replicas[idx] = sectors; -+ -+ return 0; -+} -+ -+/* Replicas tracking - superblock: */ -+ -+static int -+__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, -+ struct bch_replicas_cpu *cpu_r) -+{ -+ struct bch_replicas_entry *e, *dst; -+ unsigned nr = 0, entry_size = 0, idx = 0; -+ -+ for_each_replicas_entry(sb_r, e) { -+ entry_size = max_t(unsigned, entry_size, -+ replicas_entry_bytes(e)); -+ nr++; -+ } -+ -+ cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO); -+ if (!cpu_r->entries) -+ return -ENOMEM; -+ -+ cpu_r->nr = nr; -+ cpu_r->entry_size = entry_size; -+ -+ for_each_replicas_entry(sb_r, e) { -+ dst = cpu_replicas_entry(cpu_r, idx++); -+ memcpy(dst, e, replicas_entry_bytes(e)); -+ replicas_entry_sort(dst); -+ } -+ -+ return 0; -+} -+ -+static int -+__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, -+ struct bch_replicas_cpu *cpu_r) -+{ -+ struct bch_replicas_entry_v0 *e; -+ unsigned nr = 0, entry_size 
= 0, idx = 0; -+ -+ for_each_replicas_entry(sb_r, e) { -+ entry_size = max_t(unsigned, entry_size, -+ replicas_entry_bytes(e)); -+ nr++; -+ } -+ -+ entry_size += sizeof(struct bch_replicas_entry) - -+ sizeof(struct bch_replicas_entry_v0); -+ -+ cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO); -+ if (!cpu_r->entries) -+ return -ENOMEM; -+ -+ cpu_r->nr = nr; -+ cpu_r->entry_size = entry_size; -+ -+ for_each_replicas_entry(sb_r, e) { -+ struct bch_replicas_entry *dst = -+ cpu_replicas_entry(cpu_r, idx++); -+ -+ dst->data_type = e->data_type; -+ dst->nr_devs = e->nr_devs; -+ dst->nr_required = 1; -+ memcpy(dst->devs, e->devs, e->nr_devs); -+ replicas_entry_sort(dst); -+ } -+ -+ return 0; -+} -+ -+int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) -+{ -+ struct bch_sb_field_replicas *sb_v1; -+ struct bch_sb_field_replicas_v0 *sb_v0; -+ struct bch_replicas_cpu new_r = { 0, 0, NULL }; -+ int ret = 0; -+ -+ if ((sb_v1 = bch2_sb_get_replicas(c->disk_sb.sb))) -+ ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r); -+ else if ((sb_v0 = bch2_sb_get_replicas_v0(c->disk_sb.sb))) -+ ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r); -+ -+ if (ret) -+ return -ENOMEM; -+ -+ bch2_cpu_replicas_sort(&new_r); -+ -+ percpu_down_write(&c->mark_lock); -+ -+ ret = replicas_table_update(c, &new_r); -+ percpu_up_write(&c->mark_lock); -+ -+ kfree(new_r.entries); -+ -+ return 0; -+} -+ -+static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c, -+ struct bch_replicas_cpu *r) -+{ -+ struct bch_sb_field_replicas_v0 *sb_r; -+ struct bch_replicas_entry_v0 *dst; -+ struct bch_replicas_entry *src; -+ size_t bytes; -+ -+ bytes = sizeof(struct bch_sb_field_replicas); -+ -+ for_each_cpu_replicas_entry(r, src) -+ bytes += replicas_entry_bytes(src) - 1; -+ -+ sb_r = bch2_sb_resize_replicas_v0(&c->disk_sb, -+ DIV_ROUND_UP(bytes, sizeof(u64))); -+ if (!sb_r) -+ return -ENOSPC; -+ -+ bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas); -+ sb_r = 
bch2_sb_get_replicas_v0(c->disk_sb.sb); -+ -+ memset(&sb_r->entries, 0, -+ vstruct_end(&sb_r->field) - -+ (void *) &sb_r->entries); -+ -+ dst = sb_r->entries; -+ for_each_cpu_replicas_entry(r, src) { -+ dst->data_type = src->data_type; -+ dst->nr_devs = src->nr_devs; -+ memcpy(dst->devs, src->devs, src->nr_devs); -+ -+ dst = replicas_entry_next(dst); -+ -+ BUG_ON((void *) dst > vstruct_end(&sb_r->field)); -+ } -+ -+ return 0; -+} -+ -+static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, -+ struct bch_replicas_cpu *r) -+{ -+ struct bch_sb_field_replicas *sb_r; -+ struct bch_replicas_entry *dst, *src; -+ bool need_v1 = false; -+ size_t bytes; -+ -+ bytes = sizeof(struct bch_sb_field_replicas); -+ -+ for_each_cpu_replicas_entry(r, src) { -+ bytes += replicas_entry_bytes(src); -+ if (src->nr_required != 1) -+ need_v1 = true; -+ } -+ -+ if (!need_v1) -+ return bch2_cpu_replicas_to_sb_replicas_v0(c, r); -+ -+ sb_r = bch2_sb_resize_replicas(&c->disk_sb, -+ DIV_ROUND_UP(bytes, sizeof(u64))); -+ if (!sb_r) -+ return -ENOSPC; -+ -+ bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0); -+ sb_r = bch2_sb_get_replicas(c->disk_sb.sb); -+ -+ memset(&sb_r->entries, 0, -+ vstruct_end(&sb_r->field) - -+ (void *) &sb_r->entries); -+ -+ dst = sb_r->entries; -+ for_each_cpu_replicas_entry(r, src) { -+ memcpy(dst, src, replicas_entry_bytes(src)); -+ -+ dst = replicas_entry_next(dst); -+ -+ BUG_ON((void *) dst > vstruct_end(&sb_r->field)); -+ } -+ -+ return 0; -+} -+ -+static const char *check_dup_replicas_entries(struct bch_replicas_cpu *cpu_r) -+{ -+ unsigned i; -+ -+ sort_cmp_size(cpu_r->entries, -+ cpu_r->nr, -+ cpu_r->entry_size, -+ memcmp, NULL); -+ -+ for (i = 0; i + 1 < cpu_r->nr; i++) { -+ struct bch_replicas_entry *l = -+ cpu_replicas_entry(cpu_r, i); -+ struct bch_replicas_entry *r = -+ cpu_replicas_entry(cpu_r, i + 1); -+ -+ BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0); -+ -+ if (!memcmp(l, r, cpu_r->entry_size)) -+ return "duplicate replicas entry"; -+ } -+ -+ 
return NULL; -+} -+ -+static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f) -+{ -+ struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); -+ struct bch_sb_field_members *mi = bch2_sb_get_members(sb); -+ struct bch_replicas_cpu cpu_r = { .entries = NULL }; -+ struct bch_replicas_entry *e; -+ const char *err; -+ unsigned i; -+ -+ for_each_replicas_entry(sb_r, e) { -+ err = "invalid replicas entry: invalid data type"; -+ if (e->data_type >= BCH_DATA_NR) -+ goto err; -+ -+ err = "invalid replicas entry: no devices"; -+ if (!e->nr_devs) -+ goto err; -+ -+ err = "invalid replicas entry: bad nr_required"; -+ if (e->nr_required > 1 && -+ e->nr_required >= e->nr_devs) -+ goto err; -+ -+ err = "invalid replicas entry: invalid device"; -+ for (i = 0; i < e->nr_devs; i++) -+ if (!bch2_dev_exists(sb, mi, e->devs[i])) -+ goto err; -+ } -+ -+ err = "cannot allocate memory"; -+ if (__bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r)) -+ goto err; -+ -+ err = check_dup_replicas_entries(&cpu_r); -+err: -+ kfree(cpu_r.entries); -+ return err; -+} -+ -+static void bch2_sb_replicas_to_text(struct printbuf *out, -+ struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_replicas *r = field_to_type(f, replicas); -+ struct bch_replicas_entry *e; -+ bool first = true; -+ -+ for_each_replicas_entry(r, e) { -+ if (!first) -+ pr_buf(out, " "); -+ first = false; -+ -+ bch2_replicas_entry_to_text(out, e); -+ } -+} -+ -+const struct bch_sb_field_ops bch_sb_field_ops_replicas = { -+ .validate = bch2_sb_validate_replicas, -+ .to_text = bch2_sb_replicas_to_text, -+}; -+ -+static const char *bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb_field *f) -+{ -+ struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); -+ struct bch_sb_field_members *mi = bch2_sb_get_members(sb); -+ struct bch_replicas_cpu cpu_r = { .entries = NULL }; -+ struct bch_replicas_entry_v0 *e; -+ const char *err; -+ unsigned i; -+ -+ 
for_each_replicas_entry_v0(sb_r, e) { -+ err = "invalid replicas entry: invalid data type"; -+ if (e->data_type >= BCH_DATA_NR) -+ goto err; -+ -+ err = "invalid replicas entry: no devices"; -+ if (!e->nr_devs) -+ goto err; -+ -+ err = "invalid replicas entry: invalid device"; -+ for (i = 0; i < e->nr_devs; i++) -+ if (!bch2_dev_exists(sb, mi, e->devs[i])) -+ goto err; -+ } -+ -+ err = "cannot allocate memory"; -+ if (__bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r)) -+ goto err; -+ -+ err = check_dup_replicas_entries(&cpu_r); -+err: -+ kfree(cpu_r.entries); -+ return err; -+} -+ -+const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { -+ .validate = bch2_sb_validate_replicas_v0, -+}; -+ -+/* Query replicas: */ -+ -+struct replicas_status __bch2_replicas_status(struct bch_fs *c, -+ struct bch_devs_mask online_devs) -+{ -+ struct bch_sb_field_members *mi; -+ struct bch_replicas_entry *e; -+ unsigned i, nr_online, nr_offline; -+ struct replicas_status ret; -+ -+ memset(&ret, 0, sizeof(ret)); -+ -+ for (i = 0; i < ARRAY_SIZE(ret.replicas); i++) -+ ret.replicas[i].redundancy = INT_MAX; -+ -+ mi = bch2_sb_get_members(c->disk_sb.sb); -+ -+ percpu_down_read(&c->mark_lock); -+ -+ for_each_cpu_replicas_entry(&c->replicas, e) { -+ if (e->data_type >= ARRAY_SIZE(ret.replicas)) -+ panic("e %p data_type %u\n", e, e->data_type); -+ -+ nr_online = nr_offline = 0; -+ -+ for (i = 0; i < e->nr_devs; i++) { -+ BUG_ON(!bch2_dev_exists(c->disk_sb.sb, mi, -+ e->devs[i])); -+ -+ if (test_bit(e->devs[i], online_devs.d)) -+ nr_online++; -+ else -+ nr_offline++; -+ } -+ -+ ret.replicas[e->data_type].redundancy = -+ min(ret.replicas[e->data_type].redundancy, -+ (int) nr_online - (int) e->nr_required); -+ -+ ret.replicas[e->data_type].nr_offline = -+ max(ret.replicas[e->data_type].nr_offline, -+ nr_offline); -+ } -+ -+ percpu_up_read(&c->mark_lock); -+ -+ for (i = 0; i < ARRAY_SIZE(ret.replicas); i++) -+ if (ret.replicas[i].redundancy == INT_MAX) -+ ret.replicas[i].redundancy = 0; 
-+ -+ return ret; -+} -+ -+struct replicas_status bch2_replicas_status(struct bch_fs *c) -+{ -+ return __bch2_replicas_status(c, bch2_online_devs(c)); -+} -+ -+static bool have_enough_devs(struct replicas_status s, -+ enum bch_data_type type, -+ bool force_if_degraded, -+ bool force_if_lost) -+{ -+ return (!s.replicas[type].nr_offline || force_if_degraded) && -+ (s.replicas[type].redundancy >= 0 || force_if_lost); -+} -+ -+bool bch2_have_enough_devs(struct replicas_status s, unsigned flags) -+{ -+ return (have_enough_devs(s, BCH_DATA_journal, -+ flags & BCH_FORCE_IF_METADATA_DEGRADED, -+ flags & BCH_FORCE_IF_METADATA_LOST) && -+ have_enough_devs(s, BCH_DATA_btree, -+ flags & BCH_FORCE_IF_METADATA_DEGRADED, -+ flags & BCH_FORCE_IF_METADATA_LOST) && -+ have_enough_devs(s, BCH_DATA_user, -+ flags & BCH_FORCE_IF_DATA_DEGRADED, -+ flags & BCH_FORCE_IF_DATA_LOST)); -+} -+ -+int bch2_replicas_online(struct bch_fs *c, bool meta) -+{ -+ struct replicas_status s = bch2_replicas_status(c); -+ -+ return (meta -+ ? 
min(s.replicas[BCH_DATA_journal].redundancy, -+ s.replicas[BCH_DATA_btree].redundancy) -+ : s.replicas[BCH_DATA_user].redundancy) + 1; -+} -+ -+unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct bch_replicas_entry *e; -+ unsigned i, ret = 0; -+ -+ percpu_down_read(&c->mark_lock); -+ -+ for_each_cpu_replicas_entry(&c->replicas, e) -+ for (i = 0; i < e->nr_devs; i++) -+ if (e->devs[i] == ca->dev_idx) -+ ret |= 1 << e->data_type; -+ -+ percpu_up_read(&c->mark_lock); -+ -+ return ret; -+} -+ -+int bch2_fs_replicas_init(struct bch_fs *c) -+{ -+ c->journal.entry_u64s_reserved += -+ reserve_journal_replicas(c, &c->replicas); -+ -+ return replicas_table_update(c, &c->replicas); -+} -diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h -new file mode 100644 -index 000000000000..8b95164fbb56 ---- /dev/null -+++ b/fs/bcachefs/replicas.h -@@ -0,0 +1,91 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_REPLICAS_H -+#define _BCACHEFS_REPLICAS_H -+ -+#include "eytzinger.h" -+#include "replicas_types.h" -+ -+void bch2_replicas_entry_to_text(struct printbuf *, -+ struct bch_replicas_entry *); -+void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *); -+ -+static inline struct bch_replicas_entry * -+cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i) -+{ -+ return (void *) r->entries + r->entry_size * i; -+} -+ -+int bch2_replicas_entry_idx(struct bch_fs *, -+ struct bch_replicas_entry *); -+ -+void bch2_devlist_to_replicas(struct bch_replicas_entry *, -+ enum bch_data_type, -+ struct bch_devs_list); -+bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry *); -+int bch2_mark_replicas(struct bch_fs *, -+ struct bch_replicas_entry *); -+ -+void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c); -+bool bch2_bkey_replicas_marked(struct bch_fs *, struct bkey_s_c); -+int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c); -+ -+static inline void 
bch2_replicas_entry_cached(struct bch_replicas_entry *e, -+ unsigned dev) -+{ -+ e->data_type = BCH_DATA_cached; -+ e->nr_devs = 1; -+ e->nr_required = 1; -+ e->devs[0] = dev; -+} -+ -+struct replicas_status { -+ struct { -+ int redundancy; -+ unsigned nr_offline; -+ } replicas[BCH_DATA_NR]; -+}; -+ -+struct replicas_status __bch2_replicas_status(struct bch_fs *, -+ struct bch_devs_mask); -+struct replicas_status bch2_replicas_status(struct bch_fs *); -+bool bch2_have_enough_devs(struct replicas_status, unsigned); -+ -+int bch2_replicas_online(struct bch_fs *, bool); -+unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); -+ -+int bch2_replicas_gc_end(struct bch_fs *, int); -+int bch2_replicas_gc_start(struct bch_fs *, unsigned); -+int bch2_replicas_gc2(struct bch_fs *); -+ -+int bch2_replicas_set_usage(struct bch_fs *, -+ struct bch_replicas_entry *, -+ u64); -+ -+#define for_each_cpu_replicas_entry(_r, _i) \ -+ for (_i = (_r)->entries; \ -+ (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\ -+ _i = (void *) (_i) + (_r)->entry_size) -+ -+/* iterate over superblock replicas - used by userspace tools: */ -+ -+#define replicas_entry_next(_i) \ -+ ((typeof(_i)) ((void *) (_i) + replicas_entry_bytes(_i))) -+ -+#define for_each_replicas_entry(_r, _i) \ -+ for (_i = (_r)->entries; \ -+ (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ -+ (_i) = replicas_entry_next(_i)) -+ -+#define for_each_replicas_entry_v0(_r, _i) \ -+ for (_i = (_r)->entries; \ -+ (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ -+ (_i) = replicas_entry_next(_i)) -+ -+int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *); -+ -+extern const struct bch_sb_field_ops bch_sb_field_ops_replicas; -+extern const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0; -+ -+int bch2_fs_replicas_init(struct bch_fs *); -+ -+#endif /* _BCACHEFS_REPLICAS_H */ -diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h -new file mode 100644 -index 
000000000000..0535b1d3760e ---- /dev/null -+++ b/fs/bcachefs/replicas_types.h -@@ -0,0 +1,10 @@ -+#ifndef _BCACHEFS_REPLICAS_TYPES_H -+#define _BCACHEFS_REPLICAS_TYPES_H -+ -+struct bch_replicas_cpu { -+ unsigned nr; -+ unsigned entry_size; -+ struct bch_replicas_entry *entries; -+}; -+ -+#endif /* _BCACHEFS_REPLICAS_TYPES_H */ -diff --git a/fs/bcachefs/siphash.c b/fs/bcachefs/siphash.c -new file mode 100644 -index 000000000000..c062edb3fbc2 ---- /dev/null -+++ b/fs/bcachefs/siphash.c -@@ -0,0 +1,173 @@ -+// SPDX-License-Identifier: BSD-3-Clause -+/* $OpenBSD: siphash.c,v 1.3 2015/02/20 11:51:03 tedu Exp $ */ -+ -+/*- -+ * Copyright (c) 2013 Andre Oppermann -+ * All rights reserved. -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * 1. Redistributions of source code must retain the above copyright -+ * notice, this list of conditions and the following disclaimer. -+ * 2. Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in the -+ * documentation and/or other materials provided with the distribution. -+ * 3. The name of the author may not be used to endorse or promote -+ * products derived from this software without specific prior written -+ * permission. -+ * -+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND -+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE -+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -+ * SUCH DAMAGE. -+ */ -+ -+/* -+ * SipHash is a family of PRFs SipHash-c-d where the integer parameters c and d -+ * are the number of compression rounds and the number of finalization rounds. -+ * A compression round is identical to a finalization round and this round -+ * function is called SipRound. Given a 128-bit key k and a (possibly empty) -+ * byte string m, SipHash-c-d returns a 64-bit value SipHash-c-d(k; m). -+ * -+ * Implemented from the paper "SipHash: a fast short-input PRF", 2012.09.18, -+ * by Jean-Philippe Aumasson and Daniel J. 
Bernstein, -+ * Permanent Document ID b9a943a805fbfc6fde808af9fc0ecdfa -+ * https://131002.net/siphash/siphash.pdf -+ * https://131002.net/siphash/ -+ */ -+ -+#include -+#include -+#include -+#include -+ -+#include "siphash.h" -+ -+static void SipHash_Rounds(SIPHASH_CTX *ctx, int rounds) -+{ -+ while (rounds--) { -+ ctx->v[0] += ctx->v[1]; -+ ctx->v[2] += ctx->v[3]; -+ ctx->v[1] = rol64(ctx->v[1], 13); -+ ctx->v[3] = rol64(ctx->v[3], 16); -+ -+ ctx->v[1] ^= ctx->v[0]; -+ ctx->v[3] ^= ctx->v[2]; -+ ctx->v[0] = rol64(ctx->v[0], 32); -+ -+ ctx->v[2] += ctx->v[1]; -+ ctx->v[0] += ctx->v[3]; -+ ctx->v[1] = rol64(ctx->v[1], 17); -+ ctx->v[3] = rol64(ctx->v[3], 21); -+ -+ ctx->v[1] ^= ctx->v[2]; -+ ctx->v[3] ^= ctx->v[0]; -+ ctx->v[2] = rol64(ctx->v[2], 32); -+ } -+} -+ -+static void SipHash_CRounds(SIPHASH_CTX *ctx, const void *ptr, int rounds) -+{ -+ u64 m = get_unaligned_le64(ptr); -+ -+ ctx->v[3] ^= m; -+ SipHash_Rounds(ctx, rounds); -+ ctx->v[0] ^= m; -+} -+ -+void SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key) -+{ -+ u64 k0, k1; -+ -+ k0 = le64_to_cpu(key->k0); -+ k1 = le64_to_cpu(key->k1); -+ -+ ctx->v[0] = 0x736f6d6570736575ULL ^ k0; -+ ctx->v[1] = 0x646f72616e646f6dULL ^ k1; -+ ctx->v[2] = 0x6c7967656e657261ULL ^ k0; -+ ctx->v[3] = 0x7465646279746573ULL ^ k1; -+ -+ memset(ctx->buf, 0, sizeof(ctx->buf)); -+ ctx->bytes = 0; -+} -+ -+void SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf, -+ const void *src, size_t len) -+{ -+ const u8 *ptr = src; -+ size_t left, used; -+ -+ if (len == 0) -+ return; -+ -+ used = ctx->bytes % sizeof(ctx->buf); -+ ctx->bytes += len; -+ -+ if (used > 0) { -+ left = sizeof(ctx->buf) - used; -+ -+ if (len >= left) { -+ memcpy(&ctx->buf[used], ptr, left); -+ SipHash_CRounds(ctx, ctx->buf, rc); -+ len -= left; -+ ptr += left; -+ } else { -+ memcpy(&ctx->buf[used], ptr, len); -+ return; -+ } -+ } -+ -+ while (len >= sizeof(ctx->buf)) { -+ SipHash_CRounds(ctx, ptr, rc); -+ len -= sizeof(ctx->buf); -+ ptr += sizeof(ctx->buf); -+ } -+ -+ 
if (len > 0) -+ memcpy(&ctx->buf[used], ptr, len); -+} -+ -+void SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf) -+{ -+ u64 r; -+ -+ r = SipHash_End(ctx, rc, rf); -+ -+ *((__le64 *) dst) = cpu_to_le64(r); -+} -+ -+u64 SipHash_End(SIPHASH_CTX *ctx, int rc, int rf) -+{ -+ u64 r; -+ size_t left, used; -+ -+ used = ctx->bytes % sizeof(ctx->buf); -+ left = sizeof(ctx->buf) - used; -+ memset(&ctx->buf[used], 0, left - 1); -+ ctx->buf[7] = ctx->bytes; -+ -+ SipHash_CRounds(ctx, ctx->buf, rc); -+ ctx->v[2] ^= 0xff; -+ SipHash_Rounds(ctx, rf); -+ -+ r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]); -+ memset(ctx, 0, sizeof(*ctx)); -+ return (r); -+} -+ -+u64 SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len) -+{ -+ SIPHASH_CTX ctx; -+ -+ SipHash_Init(&ctx, key); -+ SipHash_Update(&ctx, rc, rf, src, len); -+ return SipHash_End(&ctx, rc, rf); -+} -diff --git a/fs/bcachefs/siphash.h b/fs/bcachefs/siphash.h -new file mode 100644 -index 000000000000..3dfaf34a43b2 ---- /dev/null -+++ b/fs/bcachefs/siphash.h -@@ -0,0 +1,87 @@ -+/* SPDX-License-Identifier: BSD-3-Clause */ -+/* $OpenBSD: siphash.h,v 1.5 2015/02/20 11:51:03 tedu Exp $ */ -+/*- -+ * Copyright (c) 2013 Andre Oppermann -+ * All rights reserved. -+ * -+ * Redistribution and use in source and binary forms, with or without -+ * modification, are permitted provided that the following conditions -+ * are met: -+ * 1. Redistributions of source code must retain the above copyright -+ * notice, this list of conditions and the following disclaimer. -+ * 2. Redistributions in binary form must reproduce the above copyright -+ * notice, this list of conditions and the following disclaimer in the -+ * documentation and/or other materials provided with the distribution. -+ * 3. The name of the author may not be used to endorse or promote -+ * products derived from this software without specific prior written -+ * permission. 
-+ * -+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND -+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE -+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -+ * SUCH DAMAGE. -+ * -+ * $FreeBSD$ -+ */ -+ -+/* -+ * SipHash is a family of pseudorandom functions (a.k.a. keyed hash functions) -+ * optimized for speed on short messages returning a 64bit hash/digest value. -+ * -+ * The number of rounds is defined during the initialization: -+ * SipHash24_Init() for the fast and resonable strong version -+ * SipHash48_Init() for the strong version (half as fast) -+ * -+ * struct SIPHASH_CTX ctx; -+ * SipHash24_Init(&ctx); -+ * SipHash_SetKey(&ctx, "16bytes long key"); -+ * SipHash_Update(&ctx, pointer_to_string, length_of_string); -+ * SipHash_Final(output, &ctx); -+ */ -+ -+#ifndef _SIPHASH_H_ -+#define _SIPHASH_H_ -+ -+#include -+ -+#define SIPHASH_BLOCK_LENGTH 8 -+#define SIPHASH_KEY_LENGTH 16 -+#define SIPHASH_DIGEST_LENGTH 8 -+ -+typedef struct _SIPHASH_CTX { -+ u64 v[4]; -+ u8 buf[SIPHASH_BLOCK_LENGTH]; -+ u32 bytes; -+} SIPHASH_CTX; -+ -+typedef struct { -+ __le64 k0; -+ __le64 k1; -+} SIPHASH_KEY; -+ -+void SipHash_Init(SIPHASH_CTX *, const SIPHASH_KEY *); -+void SipHash_Update(SIPHASH_CTX *, int, int, const void *, size_t); -+u64 SipHash_End(SIPHASH_CTX *, int, int); -+void SipHash_Final(void *, SIPHASH_CTX *, int, int); -+u64 SipHash(const SIPHASH_KEY 
*, int, int, const void *, size_t); -+ -+#define SipHash24_Init(_c, _k) SipHash_Init((_c), (_k)) -+#define SipHash24_Update(_c, _p, _l) SipHash_Update((_c), 2, 4, (_p), (_l)) -+#define SipHash24_End(_d) SipHash_End((_d), 2, 4) -+#define SipHash24_Final(_d, _c) SipHash_Final((_d), (_c), 2, 4) -+#define SipHash24(_k, _p, _l) SipHash((_k), 2, 4, (_p), (_l)) -+ -+#define SipHash48_Init(_c, _k) SipHash_Init((_c), (_k)) -+#define SipHash48_Update(_c, _p, _l) SipHash_Update((_c), 4, 8, (_p), (_l)) -+#define SipHash48_End(_d) SipHash_End((_d), 4, 8) -+#define SipHash48_Final(_d, _c) SipHash_Final((_d), (_c), 4, 8) -+#define SipHash48(_k, _p, _l) SipHash((_k), 4, 8, (_p), (_l)) -+ -+#endif /* _SIPHASH_H_ */ -diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h -new file mode 100644 -index 000000000000..dea9b7252b88 ---- /dev/null -+++ b/fs/bcachefs/str_hash.h -@@ -0,0 +1,336 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_STR_HASH_H -+#define _BCACHEFS_STR_HASH_H -+ -+#include "btree_iter.h" -+#include "btree_update.h" -+#include "checksum.h" -+#include "error.h" -+#include "inode.h" -+#include "siphash.h" -+#include "super.h" -+ -+#include -+#include -+#include -+ -+static inline enum bch_str_hash_type -+bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) -+{ -+ switch (opt) { -+ case BCH_STR_HASH_OPT_CRC32C: -+ return BCH_STR_HASH_CRC32C; -+ case BCH_STR_HASH_OPT_CRC64: -+ return BCH_STR_HASH_CRC64; -+ case BCH_STR_HASH_OPT_SIPHASH: -+ return c->sb.features & (1ULL << BCH_FEATURE_new_siphash) -+ ? 
BCH_STR_HASH_SIPHASH -+ : BCH_STR_HASH_SIPHASH_OLD; -+ default: -+ BUG(); -+ } -+} -+ -+struct bch_hash_info { -+ u8 type; -+ union { -+ __le64 crc_key; -+ SIPHASH_KEY siphash_key; -+ }; -+}; -+ -+static inline struct bch_hash_info -+bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) -+{ -+ /* XXX ick */ -+ struct bch_hash_info info = { -+ .type = (bi->bi_flags >> INODE_STR_HASH_OFFSET) & -+ ~(~0U << INODE_STR_HASH_BITS), -+ .crc_key = bi->bi_hash_seed, -+ }; -+ -+ if (unlikely(info.type == BCH_STR_HASH_SIPHASH_OLD)) { -+ SHASH_DESC_ON_STACK(desc, c->sha256); -+ u8 digest[SHA256_DIGEST_SIZE]; -+ -+ desc->tfm = c->sha256; -+ -+ crypto_shash_digest(desc, (void *) &bi->bi_hash_seed, -+ sizeof(bi->bi_hash_seed), digest); -+ memcpy(&info.siphash_key, digest, sizeof(info.siphash_key)); -+ } -+ -+ return info; -+} -+ -+struct bch_str_hash_ctx { -+ union { -+ u32 crc32c; -+ u64 crc64; -+ SIPHASH_CTX siphash; -+ }; -+}; -+ -+static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx, -+ const struct bch_hash_info *info) -+{ -+ switch (info->type) { -+ case BCH_STR_HASH_CRC32C: -+ ctx->crc32c = crc32c(~0, &info->crc_key, sizeof(info->crc_key)); -+ break; -+ case BCH_STR_HASH_CRC64: -+ ctx->crc64 = crc64_be(~0, &info->crc_key, sizeof(info->crc_key)); -+ break; -+ case BCH_STR_HASH_SIPHASH_OLD: -+ case BCH_STR_HASH_SIPHASH: -+ SipHash24_Init(&ctx->siphash, &info->siphash_key); -+ break; -+ default: -+ BUG(); -+ } -+} -+ -+static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx, -+ const struct bch_hash_info *info, -+ const void *data, size_t len) -+{ -+ switch (info->type) { -+ case BCH_STR_HASH_CRC32C: -+ ctx->crc32c = crc32c(ctx->crc32c, data, len); -+ break; -+ case BCH_STR_HASH_CRC64: -+ ctx->crc64 = crc64_be(ctx->crc64, data, len); -+ break; -+ case BCH_STR_HASH_SIPHASH_OLD: -+ case BCH_STR_HASH_SIPHASH: -+ SipHash24_Update(&ctx->siphash, data, len); -+ break; -+ default: -+ BUG(); -+ } -+} -+ -+static inline u64 
bch2_str_hash_end(struct bch_str_hash_ctx *ctx, -+ const struct bch_hash_info *info) -+{ -+ switch (info->type) { -+ case BCH_STR_HASH_CRC32C: -+ return ctx->crc32c; -+ case BCH_STR_HASH_CRC64: -+ return ctx->crc64 >> 1; -+ case BCH_STR_HASH_SIPHASH_OLD: -+ case BCH_STR_HASH_SIPHASH: -+ return SipHash24_End(&ctx->siphash) >> 1; -+ default: -+ BUG(); -+ } -+} -+ -+struct bch_hash_desc { -+ enum btree_id btree_id; -+ u8 key_type; -+ -+ u64 (*hash_key)(const struct bch_hash_info *, const void *); -+ u64 (*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c); -+ bool (*cmp_key)(struct bkey_s_c, const void *); -+ bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c); -+}; -+ -+static __always_inline struct btree_iter * -+bch2_hash_lookup(struct btree_trans *trans, -+ const struct bch_hash_desc desc, -+ const struct bch_hash_info *info, -+ u64 inode, const void *key, -+ unsigned flags) -+{ -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ for_each_btree_key(trans, iter, desc.btree_id, -+ POS(inode, desc.hash_key(info, key)), -+ BTREE_ITER_SLOTS|flags, k, ret) { -+ if (iter->pos.inode != inode) -+ break; -+ -+ if (k.k->type == desc.key_type) { -+ if (!desc.cmp_key(k, key)) -+ return iter; -+ } else if (k.k->type == KEY_TYPE_whiteout) { -+ ; -+ } else { -+ /* hole, not found */ -+ break; -+ } -+ } -+ bch2_trans_iter_put(trans, iter); -+ -+ return ERR_PTR(ret ?: -ENOENT); -+} -+ -+static __always_inline struct btree_iter * -+bch2_hash_hole(struct btree_trans *trans, -+ const struct bch_hash_desc desc, -+ const struct bch_hash_info *info, -+ u64 inode, const void *key) -+{ -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ for_each_btree_key(trans, iter, desc.btree_id, -+ POS(inode, desc.hash_key(info, key)), -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { -+ if (iter->pos.inode != inode) -+ break; -+ -+ if (k.k->type != desc.key_type) -+ return iter; -+ } -+ -+ iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; -+ bch2_trans_iter_put(trans, 
iter); -+ -+ return ERR_PTR(ret ?: -ENOSPC); -+} -+ -+static __always_inline -+int bch2_hash_needs_whiteout(struct btree_trans *trans, -+ const struct bch_hash_desc desc, -+ const struct bch_hash_info *info, -+ struct btree_iter *start) -+{ -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ iter = bch2_trans_copy_iter(trans, start); -+ if (IS_ERR(iter)) -+ return PTR_ERR(iter); -+ -+ bch2_btree_iter_next_slot(iter); -+ -+ for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k, ret) { -+ if (k.k->type != desc.key_type && -+ k.k->type != KEY_TYPE_whiteout) -+ break; -+ -+ if (k.k->type == desc.key_type && -+ desc.hash_bkey(info, k) <= start->pos.offset) { -+ iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; -+ ret = 1; -+ break; -+ } -+ } -+ -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+static __always_inline -+int bch2_hash_set(struct btree_trans *trans, -+ const struct bch_hash_desc desc, -+ const struct bch_hash_info *info, -+ u64 inode, struct bkey_i *insert, int flags) -+{ -+ struct btree_iter *iter, *slot = NULL; -+ struct bkey_s_c k; -+ bool found = false; -+ int ret; -+ -+ for_each_btree_key(trans, iter, desc.btree_id, -+ POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))), -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { -+ if (iter->pos.inode != inode) -+ break; -+ -+ if (k.k->type == desc.key_type) { -+ if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert))) -+ goto found; -+ -+ /* hash collision: */ -+ continue; -+ } -+ -+ if (!slot && -+ !(flags & BCH_HASH_SET_MUST_REPLACE)) { -+ slot = bch2_trans_copy_iter(trans, iter); -+ if (IS_ERR(slot)) -+ return PTR_ERR(slot); -+ } -+ -+ if (k.k->type != KEY_TYPE_whiteout) -+ goto not_found; -+ } -+ -+ if (!ret) -+ ret = -ENOSPC; -+out: -+ bch2_trans_iter_put(trans, slot); -+ bch2_trans_iter_put(trans, iter); -+ -+ return ret; -+found: -+ found = true; -+not_found: -+ -+ if (!found && (flags & BCH_HASH_SET_MUST_REPLACE)) { -+ ret = -ENOENT; -+ } else if (found && (flags & 
BCH_HASH_SET_MUST_CREATE)) { -+ ret = -EEXIST; -+ } else { -+ if (!found && slot) -+ swap(iter, slot); -+ -+ insert->k.p = iter->pos; -+ bch2_trans_update(trans, iter, insert, 0); -+ } -+ -+ goto out; -+} -+ -+static __always_inline -+int bch2_hash_delete_at(struct btree_trans *trans, -+ const struct bch_hash_desc desc, -+ const struct bch_hash_info *info, -+ struct btree_iter *iter) -+{ -+ struct bkey_i *delete; -+ int ret; -+ -+ ret = bch2_hash_needs_whiteout(trans, desc, info, iter); -+ if (ret < 0) -+ return ret; -+ -+ delete = bch2_trans_kmalloc(trans, sizeof(*delete)); -+ if (IS_ERR(delete)) -+ return PTR_ERR(delete); -+ -+ bkey_init(&delete->k); -+ delete->k.p = iter->pos; -+ delete->k.type = ret ? KEY_TYPE_whiteout : KEY_TYPE_deleted; -+ -+ bch2_trans_update(trans, iter, delete, 0); -+ return 0; -+} -+ -+static __always_inline -+int bch2_hash_delete(struct btree_trans *trans, -+ const struct bch_hash_desc desc, -+ const struct bch_hash_info *info, -+ u64 inode, const void *key) -+{ -+ struct btree_iter *iter; -+ int ret; -+ -+ iter = bch2_hash_lookup(trans, desc, info, inode, key, -+ BTREE_ITER_INTENT); -+ if (IS_ERR(iter)) -+ return PTR_ERR(iter); -+ -+ ret = bch2_hash_delete_at(trans, desc, info, iter); -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+#endif /* _BCACHEFS_STR_HASH_H */ -diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c -new file mode 100644 -index 000000000000..cee6cc938734 ---- /dev/null -+++ b/fs/bcachefs/super-io.c -@@ -0,0 +1,1158 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "btree_update_interior.h" -+#include "buckets.h" -+#include "checksum.h" -+#include "disk_groups.h" -+#include "ec.h" -+#include "error.h" -+#include "io.h" -+#include "journal.h" -+#include "journal_seq_blacklist.h" -+#include "replicas.h" -+#include "quota.h" -+#include "super-io.h" -+#include "super.h" -+#include "vstructs.h" -+ -+#include -+#include -+ -+const char * const bch2_sb_fields[] = { -+#define 
x(name, nr) #name, -+ BCH_SB_FIELDS() -+#undef x -+ NULL -+}; -+ -+static const char *bch2_sb_field_validate(struct bch_sb *, -+ struct bch_sb_field *); -+ -+struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb, -+ enum bch_sb_field_type type) -+{ -+ struct bch_sb_field *f; -+ -+ /* XXX: need locking around superblock to access optional fields */ -+ -+ vstruct_for_each(sb, f) -+ if (le32_to_cpu(f->type) == type) -+ return f; -+ return NULL; -+} -+ -+static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb, -+ struct bch_sb_field *f, -+ unsigned u64s) -+{ -+ unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0; -+ unsigned sb_u64s = le32_to_cpu(sb->sb->u64s) + u64s - old_u64s; -+ -+ BUG_ON(get_order(__vstruct_bytes(struct bch_sb, sb_u64s)) > -+ sb->page_order); -+ -+ if (!f && !u64s) { -+ /* nothing to do: */ -+ } else if (!f) { -+ f = vstruct_last(sb->sb); -+ memset(f, 0, sizeof(u64) * u64s); -+ f->u64s = cpu_to_le32(u64s); -+ f->type = 0; -+ } else { -+ void *src, *dst; -+ -+ src = vstruct_end(f); -+ -+ if (u64s) { -+ f->u64s = cpu_to_le32(u64s); -+ dst = vstruct_end(f); -+ } else { -+ dst = f; -+ } -+ -+ memmove(dst, src, vstruct_end(sb->sb) - src); -+ -+ if (dst > src) -+ memset(src, 0, dst - src); -+ } -+ -+ sb->sb->u64s = cpu_to_le32(sb_u64s); -+ -+ return u64s ? 
f : NULL; -+} -+ -+void bch2_sb_field_delete(struct bch_sb_handle *sb, -+ enum bch_sb_field_type type) -+{ -+ struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type); -+ -+ if (f) -+ __bch2_sb_field_resize(sb, f, 0); -+} -+ -+/* Superblock realloc/free: */ -+ -+void bch2_free_super(struct bch_sb_handle *sb) -+{ -+ if (sb->bio) -+ bio_put(sb->bio); -+ if (!IS_ERR_OR_NULL(sb->bdev)) -+ blkdev_put(sb->bdev, sb->mode); -+ -+ free_pages((unsigned long) sb->sb, sb->page_order); -+ memset(sb, 0, sizeof(*sb)); -+} -+ -+int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) -+{ -+ size_t new_bytes = __vstruct_bytes(struct bch_sb, u64s); -+ unsigned order = get_order(new_bytes); -+ struct bch_sb *new_sb; -+ struct bio *bio; -+ -+ if (sb->sb && sb->page_order >= order) -+ return 0; -+ -+ if (sb->have_layout) { -+ u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits; -+ -+ if (new_bytes > max_bytes) { -+ char buf[BDEVNAME_SIZE]; -+ -+ pr_err("%s: superblock too big: want %zu but have %llu", -+ bdevname(sb->bdev, buf), new_bytes, max_bytes); -+ return -ENOSPC; -+ } -+ } -+ -+ if (sb->page_order >= order && sb->sb) -+ return 0; -+ -+ if (dynamic_fault("bcachefs:add:super_realloc")) -+ return -ENOMEM; -+ -+ if (sb->have_bio) { -+ bio = bio_kmalloc(GFP_KERNEL, 1 << order); -+ if (!bio) -+ return -ENOMEM; -+ -+ if (sb->bio) -+ bio_put(sb->bio); -+ sb->bio = bio; -+ } -+ -+ new_sb = (void *) __get_free_pages(GFP_NOFS|__GFP_ZERO, order); -+ if (!new_sb) -+ return -ENOMEM; -+ -+ if (sb->sb) -+ memcpy(new_sb, sb->sb, PAGE_SIZE << sb->page_order); -+ -+ free_pages((unsigned long) sb->sb, sb->page_order); -+ sb->sb = new_sb; -+ -+ sb->page_order = order; -+ -+ return 0; -+} -+ -+struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb, -+ enum bch_sb_field_type type, -+ unsigned u64s) -+{ -+ struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type); -+ ssize_t old_u64s = f ? 
le32_to_cpu(f->u64s) : 0; -+ ssize_t d = -old_u64s + u64s; -+ -+ if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) -+ return NULL; -+ -+ if (sb->fs_sb) { -+ struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb); -+ struct bch_dev *ca; -+ unsigned i; -+ -+ lockdep_assert_held(&c->sb_lock); -+ -+ /* XXX: we're not checking that offline device have enough space */ -+ -+ for_each_online_member(ca, c, i) { -+ struct bch_sb_handle *sb = &ca->disk_sb; -+ -+ if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) { -+ percpu_ref_put(&ca->ref); -+ return NULL; -+ } -+ } -+ } -+ -+ f = bch2_sb_field_get(sb->sb, type); -+ f = __bch2_sb_field_resize(sb, f, u64s); -+ if (f) -+ f->type = cpu_to_le32(type); -+ return f; -+} -+ -+/* Superblock validate: */ -+ -+static inline void __bch2_sb_layout_size_assert(void) -+{ -+ BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512); -+} -+ -+static const char *validate_sb_layout(struct bch_sb_layout *layout) -+{ -+ u64 offset, prev_offset, max_sectors; -+ unsigned i; -+ -+ if (uuid_le_cmp(layout->magic, BCACHE_MAGIC)) -+ return "Not a bcachefs superblock layout"; -+ -+ if (layout->layout_type != 0) -+ return "Invalid superblock layout type"; -+ -+ if (!layout->nr_superblocks) -+ return "Invalid superblock layout: no superblocks"; -+ -+ if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) -+ return "Invalid superblock layout: too many superblocks"; -+ -+ max_sectors = 1 << layout->sb_max_size_bits; -+ -+ prev_offset = le64_to_cpu(layout->sb_offset[0]); -+ -+ for (i = 1; i < layout->nr_superblocks; i++) { -+ offset = le64_to_cpu(layout->sb_offset[i]); -+ -+ if (offset < prev_offset + max_sectors) -+ return "Invalid superblock layout: superblocks overlap"; -+ prev_offset = offset; -+ } -+ -+ return NULL; -+} -+ -+const char *bch2_sb_validate(struct bch_sb_handle *disk_sb) -+{ -+ struct bch_sb *sb = disk_sb->sb; -+ struct bch_sb_field *f; -+ struct bch_sb_field_members *mi; -+ const char *err; -+ u32 version, version_min; -+ 
u16 block_size; -+ -+ version = le16_to_cpu(sb->version); -+ version_min = version >= bcachefs_metadata_version_new_versioning -+ ? le16_to_cpu(sb->version_min) -+ : version; -+ -+ if (version >= bcachefs_metadata_version_max || -+ version_min < bcachefs_metadata_version_min) -+ return "Unsupported superblock version"; -+ -+ if (version_min > version) -+ return "Bad minimum version"; -+ -+ if (sb->features[1] || -+ (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) -+ return "Filesystem has incompatible features"; -+ -+ block_size = le16_to_cpu(sb->block_size); -+ -+ if (!is_power_of_2(block_size) || -+ block_size > PAGE_SECTORS) -+ return "Bad block size"; -+ -+ if (bch2_is_zero(sb->user_uuid.b, sizeof(uuid_le))) -+ return "Bad user UUID"; -+ -+ if (bch2_is_zero(sb->uuid.b, sizeof(uuid_le))) -+ return "Bad internal UUID"; -+ -+ if (!sb->nr_devices || -+ sb->nr_devices <= sb->dev_idx || -+ sb->nr_devices > BCH_SB_MEMBERS_MAX) -+ return "Bad number of member devices"; -+ -+ if (!BCH_SB_META_REPLICAS_WANT(sb) || -+ BCH_SB_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX) -+ return "Invalid number of metadata replicas"; -+ -+ if (!BCH_SB_META_REPLICAS_REQ(sb) || -+ BCH_SB_META_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX) -+ return "Invalid number of metadata replicas"; -+ -+ if (!BCH_SB_DATA_REPLICAS_WANT(sb) || -+ BCH_SB_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX) -+ return "Invalid number of data replicas"; -+ -+ if (!BCH_SB_DATA_REPLICAS_REQ(sb) || -+ BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX) -+ return "Invalid number of data replicas"; -+ -+ if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR) -+ return "Invalid metadata checksum type"; -+ -+ if (BCH_SB_DATA_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR) -+ return "Invalid metadata checksum type"; -+ -+ if (BCH_SB_COMPRESSION_TYPE(sb) >= BCH_COMPRESSION_OPT_NR) -+ return "Invalid compression type"; -+ -+ if (!BCH_SB_BTREE_NODE_SIZE(sb)) -+ return "Btree node size not set"; -+ -+ if (!is_power_of_2(BCH_SB_BTREE_NODE_SIZE(sb))) 
-+ return "Btree node size not a power of two"; -+ -+ if (BCH_SB_GC_RESERVE(sb) < 5) -+ return "gc reserve percentage too small"; -+ -+ if (!sb->time_precision || -+ le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) -+ return "invalid time precision"; -+ -+ /* validate layout */ -+ err = validate_sb_layout(&sb->layout); -+ if (err) -+ return err; -+ -+ vstruct_for_each(sb, f) { -+ if (!f->u64s) -+ return "Invalid superblock: invalid optional field"; -+ -+ if (vstruct_next(f) > vstruct_last(sb)) -+ return "Invalid superblock: invalid optional field"; -+ } -+ -+ /* members must be validated first: */ -+ mi = bch2_sb_get_members(sb); -+ if (!mi) -+ return "Invalid superblock: member info area missing"; -+ -+ err = bch2_sb_field_validate(sb, &mi->field); -+ if (err) -+ return err; -+ -+ vstruct_for_each(sb, f) { -+ if (le32_to_cpu(f->type) == BCH_SB_FIELD_members) -+ continue; -+ -+ err = bch2_sb_field_validate(sb, f); -+ if (err) -+ return err; -+ } -+ -+ return NULL; -+} -+ -+/* device open: */ -+ -+static void bch2_sb_update(struct bch_fs *c) -+{ -+ struct bch_sb *src = c->disk_sb.sb; -+ struct bch_sb_field_members *mi = bch2_sb_get_members(src); -+ struct bch_dev *ca; -+ unsigned i; -+ -+ lockdep_assert_held(&c->sb_lock); -+ -+ c->sb.uuid = src->uuid; -+ c->sb.user_uuid = src->user_uuid; -+ c->sb.version = le16_to_cpu(src->version); -+ c->sb.nr_devices = src->nr_devices; -+ c->sb.clean = BCH_SB_CLEAN(src); -+ c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); -+ c->sb.encoded_extent_max= 1 << BCH_SB_ENCODED_EXTENT_MAX_BITS(src); -+ c->sb.time_base_lo = le64_to_cpu(src->time_base_lo); -+ c->sb.time_base_hi = le32_to_cpu(src->time_base_hi); -+ c->sb.time_precision = le32_to_cpu(src->time_precision); -+ c->sb.features = le64_to_cpu(src->features[0]); -+ c->sb.compat = le64_to_cpu(src->compat[0]); -+ -+ for_each_member_device(ca, c, i) -+ ca->mi = bch2_mi_to_cpu(mi->members + i); -+} -+ -+/* doesn't copy member info */ -+static void __copy_super(struct bch_sb_handle 
*dst_handle, struct bch_sb *src) -+{ -+ struct bch_sb_field *src_f, *dst_f; -+ struct bch_sb *dst = dst_handle->sb; -+ unsigned i; -+ -+ dst->version = src->version; -+ dst->version_min = src->version_min; -+ dst->seq = src->seq; -+ dst->uuid = src->uuid; -+ dst->user_uuid = src->user_uuid; -+ memcpy(dst->label, src->label, sizeof(dst->label)); -+ -+ dst->block_size = src->block_size; -+ dst->nr_devices = src->nr_devices; -+ -+ dst->time_base_lo = src->time_base_lo; -+ dst->time_base_hi = src->time_base_hi; -+ dst->time_precision = src->time_precision; -+ -+ memcpy(dst->flags, src->flags, sizeof(dst->flags)); -+ memcpy(dst->features, src->features, sizeof(dst->features)); -+ memcpy(dst->compat, src->compat, sizeof(dst->compat)); -+ -+ for (i = 0; i < BCH_SB_FIELD_NR; i++) { -+ if (i == BCH_SB_FIELD_journal) -+ continue; -+ -+ src_f = bch2_sb_field_get(src, i); -+ dst_f = bch2_sb_field_get(dst, i); -+ dst_f = __bch2_sb_field_resize(dst_handle, dst_f, -+ src_f ? le32_to_cpu(src_f->u64s) : 0); -+ -+ if (src_f) -+ memcpy(dst_f, src_f, vstruct_bytes(src_f)); -+ } -+} -+ -+int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) -+{ -+ struct bch_sb_field_journal *journal_buckets = -+ bch2_sb_get_journal(src); -+ unsigned journal_u64s = journal_buckets -+ ? le32_to_cpu(journal_buckets->field.u64s) -+ : 0; -+ int ret; -+ -+ lockdep_assert_held(&c->sb_lock); -+ -+ ret = bch2_sb_realloc(&c->disk_sb, -+ le32_to_cpu(src->u64s) - journal_u64s); -+ if (ret) -+ return ret; -+ -+ __copy_super(&c->disk_sb, src); -+ -+ ret = bch2_sb_replicas_to_cpu_replicas(c); -+ if (ret) -+ return ret; -+ -+ ret = bch2_sb_disk_groups_to_cpu(c); -+ if (ret) -+ return ret; -+ -+ bch2_sb_update(c); -+ return 0; -+} -+ -+int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct bch_sb *src = c->disk_sb.sb, *dst = ca->disk_sb.sb; -+ struct bch_sb_field_journal *journal_buckets = -+ bch2_sb_get_journal(dst); -+ unsigned journal_u64s = journal_buckets -+ ? 
le32_to_cpu(journal_buckets->field.u64s) -+ : 0; -+ unsigned u64s = le32_to_cpu(src->u64s) + journal_u64s; -+ int ret; -+ -+ ret = bch2_sb_realloc(&ca->disk_sb, u64s); -+ if (ret) -+ return ret; -+ -+ __copy_super(&ca->disk_sb, src); -+ return 0; -+} -+ -+/* read superblock: */ -+ -+static const char *read_one_super(struct bch_sb_handle *sb, u64 offset) -+{ -+ struct bch_csum csum; -+ size_t bytes; -+reread: -+ bio_reset(sb->bio); -+ bio_set_dev(sb->bio, sb->bdev); -+ sb->bio->bi_iter.bi_sector = offset; -+ bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META); -+ bch2_bio_map(sb->bio, sb->sb, PAGE_SIZE << sb->page_order); -+ -+ if (submit_bio_wait(sb->bio)) -+ return "IO error"; -+ -+ if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC)) -+ return "Not a bcachefs superblock"; -+ -+ if (le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_min || -+ le16_to_cpu(sb->sb->version) >= bcachefs_metadata_version_max) -+ return "Unsupported superblock version"; -+ -+ bytes = vstruct_bytes(sb->sb); -+ -+ if (bytes > 512 << sb->sb->layout.sb_max_size_bits) -+ return "Bad superblock: too big"; -+ -+ if (get_order(bytes) > sb->page_order) { -+ if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s))) -+ return "cannot allocate memory"; -+ goto reread; -+ } -+ -+ if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) -+ return "unknown csum type"; -+ -+ /* XXX: verify MACs */ -+ csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb), -+ null_nonce(), sb->sb); -+ -+ if (bch2_crc_cmp(csum, sb->sb->csum)) -+ return "bad checksum reading superblock"; -+ -+ sb->seq = le64_to_cpu(sb->sb->seq); -+ -+ return NULL; -+} -+ -+int bch2_read_super(const char *path, struct bch_opts *opts, -+ struct bch_sb_handle *sb) -+{ -+ u64 offset = opt_get(*opts, sb); -+ struct bch_sb_layout layout; -+ const char *err; -+ __le64 *i; -+ int ret; -+ -+ pr_verbose_init(*opts, ""); -+ -+ memset(sb, 0, sizeof(*sb)); -+ sb->mode = FMODE_READ; -+ sb->have_bio = true; -+ -+ if (!opt_get(*opts, noexcl)) -+ sb->mode |= FMODE_EXCL; 
-+ -+ if (!opt_get(*opts, nochanges)) -+ sb->mode |= FMODE_WRITE; -+ -+ sb->bdev = blkdev_get_by_path(path, sb->mode, sb); -+ if (IS_ERR(sb->bdev) && -+ PTR_ERR(sb->bdev) == -EACCES && -+ opt_get(*opts, read_only)) { -+ sb->mode &= ~FMODE_WRITE; -+ -+ sb->bdev = blkdev_get_by_path(path, sb->mode, sb); -+ if (!IS_ERR(sb->bdev)) -+ opt_set(*opts, nochanges, true); -+ } -+ -+ if (IS_ERR(sb->bdev)) { -+ ret = PTR_ERR(sb->bdev); -+ goto out; -+ } -+ -+ err = "cannot allocate memory"; -+ ret = bch2_sb_realloc(sb, 0); -+ if (ret) -+ goto err; -+ -+ ret = -EFAULT; -+ err = "dynamic fault"; -+ if (bch2_fs_init_fault("read_super")) -+ goto err; -+ -+ ret = -EINVAL; -+ err = read_one_super(sb, offset); -+ if (!err) -+ goto got_super; -+ -+ if (opt_defined(*opts, sb)) -+ goto err; -+ -+ pr_err("error reading default superblock: %s", err); -+ -+ /* -+ * Error reading primary superblock - read location of backup -+ * superblocks: -+ */ -+ bio_reset(sb->bio); -+ bio_set_dev(sb->bio, sb->bdev); -+ sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR; -+ bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META); -+ /* -+ * use sb buffer to read layout, since sb buffer is page aligned but -+ * layout won't be: -+ */ -+ bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout)); -+ -+ err = "IO error"; -+ if (submit_bio_wait(sb->bio)) -+ goto err; -+ -+ memcpy(&layout, sb->sb, sizeof(layout)); -+ err = validate_sb_layout(&layout); -+ if (err) -+ goto err; -+ -+ for (i = layout.sb_offset; -+ i < layout.sb_offset + layout.nr_superblocks; i++) { -+ offset = le64_to_cpu(*i); -+ -+ if (offset == opt_get(*opts, sb)) -+ continue; -+ -+ err = read_one_super(sb, offset); -+ if (!err) -+ goto got_super; -+ } -+ -+ ret = -EINVAL; -+ goto err; -+ -+got_super: -+ err = "Superblock block size smaller than device block size"; -+ ret = -EINVAL; -+ if (le16_to_cpu(sb->sb->block_size) << 9 < -+ bdev_logical_block_size(sb->bdev)) -+ goto err; -+ -+ if (sb->mode & FMODE_WRITE) -+ 
bdev_get_queue(sb->bdev)->backing_dev_info->capabilities -+ |= BDI_CAP_STABLE_WRITES; -+ ret = 0; -+ sb->have_layout = true; -+out: -+ pr_verbose_init(*opts, "ret %i", ret); -+ return ret; -+err: -+ bch2_free_super(sb); -+ pr_err("error reading superblock: %s", err); -+ goto out; -+} -+ -+/* write superblock: */ -+ -+static void write_super_endio(struct bio *bio) -+{ -+ struct bch_dev *ca = bio->bi_private; -+ -+ /* XXX: return errors directly */ -+ -+ if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write: %s", -+ bch2_blk_status_to_str(bio->bi_status))) -+ ca->sb_write_error = 1; -+ -+ closure_put(&ca->fs->sb_write); -+ percpu_ref_put(&ca->io_ref); -+} -+ -+static void read_back_super(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct bch_sb *sb = ca->disk_sb.sb; -+ struct bio *bio = ca->disk_sb.bio; -+ -+ bio_reset(bio); -+ bio_set_dev(bio, ca->disk_sb.bdev); -+ bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]); -+ bio->bi_end_io = write_super_endio; -+ bio->bi_private = ca; -+ bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC|REQ_META); -+ bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE); -+ -+ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], -+ bio_sectors(bio)); -+ -+ percpu_ref_get(&ca->io_ref); -+ closure_bio_submit(bio, &c->sb_write); -+} -+ -+static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) -+{ -+ struct bch_sb *sb = ca->disk_sb.sb; -+ struct bio *bio = ca->disk_sb.bio; -+ -+ sb->offset = sb->layout.sb_offset[idx]; -+ -+ SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum); -+ sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb), -+ null_nonce(), sb); -+ -+ bio_reset(bio); -+ bio_set_dev(bio, ca->disk_sb.bdev); -+ bio->bi_iter.bi_sector = le64_to_cpu(sb->offset); -+ bio->bi_end_io = write_super_endio; -+ bio->bi_private = ca; -+ bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META); -+ bch2_bio_map(bio, sb, -+ roundup((size_t) vstruct_bytes(sb), -+ bdev_logical_block_size(ca->disk_sb.bdev))); -+ -+ 
this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb], -+ bio_sectors(bio)); -+ -+ percpu_ref_get(&ca->io_ref); -+ closure_bio_submit(bio, &c->sb_write); -+} -+ -+int bch2_write_super(struct bch_fs *c) -+{ -+ struct closure *cl = &c->sb_write; -+ struct bch_dev *ca; -+ unsigned i, sb = 0, nr_wrote; -+ const char *err; -+ struct bch_devs_mask sb_written; -+ bool wrote, can_mount_without_written, can_mount_with_written; -+ int ret = 0; -+ -+ lockdep_assert_held(&c->sb_lock); -+ -+ closure_init_stack(cl); -+ memset(&sb_written, 0, sizeof(sb_written)); -+ -+ le64_add_cpu(&c->disk_sb.sb->seq, 1); -+ -+ if (test_bit(BCH_FS_ERROR, &c->flags)) -+ SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1); -+ -+ for_each_online_member(ca, c, i) -+ bch2_sb_from_fs(c, ca); -+ -+ for_each_online_member(ca, c, i) { -+ err = bch2_sb_validate(&ca->disk_sb); -+ if (err) { -+ bch2_fs_inconsistent(c, "sb invalid before write: %s", err); -+ ret = -1; -+ goto out; -+ } -+ } -+ -+ if (c->opts.nochanges) -+ goto out; -+ -+ for_each_online_member(ca, c, i) { -+ __set_bit(ca->dev_idx, sb_written.d); -+ ca->sb_write_error = 0; -+ } -+ -+ for_each_online_member(ca, c, i) -+ read_back_super(c, ca); -+ closure_sync(cl); -+ -+ for_each_online_member(ca, c, i) { -+ if (!ca->sb_write_error && -+ ca->disk_sb.seq != -+ le64_to_cpu(ca->sb_read_scratch->seq)) { -+ bch2_fs_fatal_error(c, -+ "Superblock modified by another process"); -+ percpu_ref_put(&ca->io_ref); -+ ret = -EROFS; -+ goto out; -+ } -+ } -+ -+ do { -+ wrote = false; -+ for_each_online_member(ca, c, i) -+ if (!ca->sb_write_error && -+ sb < ca->disk_sb.sb->layout.nr_superblocks) { -+ write_one_super(c, ca, sb); -+ wrote = true; -+ } -+ closure_sync(cl); -+ sb++; -+ } while (wrote); -+ -+ for_each_online_member(ca, c, i) { -+ if (ca->sb_write_error) -+ __clear_bit(ca->dev_idx, sb_written.d); -+ else -+ ca->disk_sb.seq = le64_to_cpu(ca->disk_sb.sb->seq); -+ } -+ -+ nr_wrote = dev_mask_nr(&sb_written); -+ -+ can_mount_with_written = -+ 
bch2_have_enough_devs(__bch2_replicas_status(c, sb_written), -+ BCH_FORCE_IF_DEGRADED); -+ -+ for (i = 0; i < ARRAY_SIZE(sb_written.d); i++) -+ sb_written.d[i] = ~sb_written.d[i]; -+ -+ can_mount_without_written = -+ bch2_have_enough_devs(__bch2_replicas_status(c, sb_written), -+ BCH_FORCE_IF_DEGRADED); -+ -+ /* -+ * If we would be able to mount _without_ the devices we successfully -+ * wrote superblocks to, we weren't able to write to enough devices: -+ * -+ * Exception: if we can mount without the successes because we haven't -+ * written anything (new filesystem), we continue if we'd be able to -+ * mount with the devices we did successfully write to: -+ */ -+ if (bch2_fs_fatal_err_on(!nr_wrote || -+ (can_mount_without_written && -+ !can_mount_with_written), c, -+ "Unable to write superblock to sufficient devices")) -+ ret = -1; -+out: -+ /* Make new options visible after they're persistent: */ -+ bch2_sb_update(c); -+ return ret; -+} -+ -+void __bch2_check_set_feature(struct bch_fs *c, unsigned feat) -+{ -+ mutex_lock(&c->sb_lock); -+ if (!(c->sb.features & (1ULL << feat))) { -+ c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << feat); -+ -+ bch2_write_super(c); -+ } -+ mutex_unlock(&c->sb_lock); -+} -+ -+/* BCH_SB_FIELD_journal: */ -+ -+static int u64_cmp(const void *_l, const void *_r) -+{ -+ u64 l = *((const u64 *) _l), r = *((const u64 *) _r); -+ -+ return l < r ? -1 : l > r ? 
1 : 0; -+} -+ -+static const char *bch2_sb_validate_journal(struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_journal *journal = field_to_type(f, journal); -+ struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; -+ const char *err; -+ unsigned nr; -+ unsigned i; -+ u64 *b; -+ -+ journal = bch2_sb_get_journal(sb); -+ if (!journal) -+ return NULL; -+ -+ nr = bch2_nr_journal_buckets(journal); -+ if (!nr) -+ return NULL; -+ -+ b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL); -+ if (!b) -+ return "cannot allocate memory"; -+ -+ for (i = 0; i < nr; i++) -+ b[i] = le64_to_cpu(journal->buckets[i]); -+ -+ sort(b, nr, sizeof(u64), u64_cmp, NULL); -+ -+ err = "journal bucket at sector 0"; -+ if (!b[0]) -+ goto err; -+ -+ err = "journal bucket before first bucket"; -+ if (m && b[0] < le16_to_cpu(m->first_bucket)) -+ goto err; -+ -+ err = "journal bucket past end of device"; -+ if (m && b[nr - 1] >= le64_to_cpu(m->nbuckets)) -+ goto err; -+ -+ err = "duplicate journal buckets"; -+ for (i = 0; i + 1 < nr; i++) -+ if (b[i] == b[i + 1]) -+ goto err; -+ -+ err = NULL; -+err: -+ kfree(b); -+ return err; -+} -+ -+static const struct bch_sb_field_ops bch_sb_field_ops_journal = { -+ .validate = bch2_sb_validate_journal, -+}; -+ -+/* BCH_SB_FIELD_members: */ -+ -+static const char *bch2_sb_validate_members(struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_members *mi = field_to_type(f, members); -+ struct bch_member *m; -+ -+ if ((void *) (mi->members + sb->nr_devices) > -+ vstruct_end(&mi->field)) -+ return "Invalid superblock: bad member info"; -+ -+ for (m = mi->members; -+ m < mi->members + sb->nr_devices; -+ m++) { -+ if (!bch2_member_exists(m)) -+ continue; -+ -+ if (le64_to_cpu(m->nbuckets) > LONG_MAX) -+ return "Too many buckets"; -+ -+ if (le64_to_cpu(m->nbuckets) - -+ le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) -+ return "Not enough buckets"; -+ -+ if (le16_to_cpu(m->bucket_size) < -+ 
le16_to_cpu(sb->block_size)) -+ return "bucket size smaller than block size"; -+ -+ if (le16_to_cpu(m->bucket_size) < -+ BCH_SB_BTREE_NODE_SIZE(sb)) -+ return "bucket size smaller than btree node size"; -+ } -+ -+ return NULL; -+} -+ -+static const struct bch_sb_field_ops bch_sb_field_ops_members = { -+ .validate = bch2_sb_validate_members, -+}; -+ -+/* BCH_SB_FIELD_crypt: */ -+ -+static const char *bch2_sb_validate_crypt(struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); -+ -+ if (vstruct_bytes(&crypt->field) != sizeof(*crypt)) -+ return "invalid field crypt: wrong size"; -+ -+ if (BCH_CRYPT_KDF_TYPE(crypt)) -+ return "invalid field crypt: bad kdf type"; -+ -+ return NULL; -+} -+ -+static const struct bch_sb_field_ops bch_sb_field_ops_crypt = { -+ .validate = bch2_sb_validate_crypt, -+}; -+ -+/* BCH_SB_FIELD_clean: */ -+ -+void bch2_sb_clean_renumber(struct bch_sb_field_clean *clean, int write) -+{ -+ struct jset_entry *entry; -+ -+ for (entry = clean->start; -+ entry < (struct jset_entry *) vstruct_end(&clean->field); -+ entry = vstruct_next(entry)) -+ bch2_bkey_renumber(BKEY_TYPE_BTREE, bkey_to_packed(entry->start), write); -+} -+ -+int bch2_fs_mark_dirty(struct bch_fs *c) -+{ -+ int ret; -+ -+ /* -+ * Unconditionally write superblock, to verify it hasn't changed before -+ * we go rw: -+ */ -+ -+ mutex_lock(&c->sb_lock); -+ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); -+ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite; -+ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_extents_above_btree_updates; -+ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_btree_updates_journalled; -+ ret = bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ return ret; -+} -+ -+static void -+entry_init_u64s(struct jset_entry *entry, unsigned u64s) -+{ -+ memset(entry, 0, u64s * sizeof(u64)); -+ -+ /* -+ * The u64s field counts from the start of data, ignoring the shared -+ * fields. 
-+ */ -+ entry->u64s = u64s - 1; -+} -+ -+static void -+entry_init_size(struct jset_entry *entry, size_t size) -+{ -+ unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); -+ entry_init_u64s(entry, u64s); -+} -+ -+struct jset_entry * -+bch2_journal_super_entries_add_common(struct bch_fs *c, -+ struct jset_entry *entry, -+ u64 journal_seq) -+{ -+ unsigned i; -+ -+ percpu_down_write(&c->mark_lock); -+ -+ if (!journal_seq) { -+ bch2_fs_usage_acc_to_base(c, 0); -+ bch2_fs_usage_acc_to_base(c, 1); -+ } else { -+ bch2_fs_usage_acc_to_base(c, journal_seq & 1); -+ } -+ -+ { -+ struct jset_entry_usage *u = -+ container_of(entry, struct jset_entry_usage, entry); -+ -+ entry_init_size(entry, sizeof(*u)); -+ u->entry.type = BCH_JSET_ENTRY_usage; -+ u->entry.btree_id = FS_USAGE_INODES; -+ u->v = cpu_to_le64(c->usage_base->nr_inodes); -+ -+ entry = vstruct_next(entry); -+ } -+ -+ { -+ struct jset_entry_usage *u = -+ container_of(entry, struct jset_entry_usage, entry); -+ -+ entry_init_size(entry, sizeof(*u)); -+ u->entry.type = BCH_JSET_ENTRY_usage; -+ u->entry.btree_id = FS_USAGE_KEY_VERSION; -+ u->v = cpu_to_le64(atomic64_read(&c->key_version)); -+ -+ entry = vstruct_next(entry); -+ } -+ -+ for (i = 0; i < BCH_REPLICAS_MAX; i++) { -+ struct jset_entry_usage *u = -+ container_of(entry, struct jset_entry_usage, entry); -+ -+ entry_init_size(entry, sizeof(*u)); -+ u->entry.type = BCH_JSET_ENTRY_usage; -+ u->entry.btree_id = FS_USAGE_RESERVED; -+ u->entry.level = i; -+ u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); -+ -+ entry = vstruct_next(entry); -+ } -+ -+ for (i = 0; i < c->replicas.nr; i++) { -+ struct bch_replicas_entry *e = -+ cpu_replicas_entry(&c->replicas, i); -+ struct jset_entry_data_usage *u = -+ container_of(entry, struct jset_entry_data_usage, entry); -+ -+ entry_init_size(entry, sizeof(*u) + e->nr_devs); -+ u->entry.type = BCH_JSET_ENTRY_data_usage; -+ u->v = cpu_to_le64(c->usage_base->replicas[i]); -+ memcpy(&u->r, e, replicas_entry_bytes(e)); -+ -+ entry 
= vstruct_next(entry); -+ } -+ -+ percpu_up_write(&c->mark_lock); -+ -+ return entry; -+} -+ -+void bch2_fs_mark_clean(struct bch_fs *c) -+{ -+ struct bch_sb_field_clean *sb_clean; -+ struct jset_entry *entry; -+ unsigned u64s; -+ -+ mutex_lock(&c->sb_lock); -+ if (BCH_SB_CLEAN(c->disk_sb.sb)) -+ goto out; -+ -+ SET_BCH_SB_CLEAN(c->disk_sb.sb, true); -+ -+ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; -+ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA; -+ c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_extents_above_btree_updates); -+ c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_btree_updates_journalled); -+ -+ u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved; -+ -+ sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s); -+ if (!sb_clean) { -+ bch_err(c, "error resizing superblock while setting filesystem clean"); -+ goto out; -+ } -+ -+ sb_clean->flags = 0; -+ sb_clean->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); -+ sb_clean->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); -+ sb_clean->journal_seq = cpu_to_le64(journal_cur_seq(&c->journal) - 1); -+ -+ /* Trying to catch outstanding bug: */ -+ BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); -+ -+ entry = sb_clean->start; -+ entry = bch2_journal_super_entries_add_common(c, entry, 0); -+ entry = bch2_btree_roots_to_journal_entries(c, entry, entry); -+ BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); -+ -+ memset(entry, 0, -+ vstruct_end(&sb_clean->field) - (void *) entry); -+ -+ if (le16_to_cpu(c->disk_sb.sb->version) < -+ bcachefs_metadata_version_bkey_renumber) -+ bch2_sb_clean_renumber(sb_clean, WRITE); -+ -+ bch2_write_super(c); -+out: -+ mutex_unlock(&c->sb_lock); -+} -+ -+static const char *bch2_sb_validate_clean(struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ struct bch_sb_field_clean *clean = field_to_type(f, clean); -+ -+ if (vstruct_bytes(&clean->field) < sizeof(*clean)) -+ return "invalid field crypt: 
wrong size"; -+ -+ return NULL; -+} -+ -+static const struct bch_sb_field_ops bch_sb_field_ops_clean = { -+ .validate = bch2_sb_validate_clean, -+}; -+ -+static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { -+#define x(f, nr) \ -+ [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f, -+ BCH_SB_FIELDS() -+#undef x -+}; -+ -+static const char *bch2_sb_field_validate(struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ unsigned type = le32_to_cpu(f->type); -+ -+ return type < BCH_SB_FIELD_NR -+ ? bch2_sb_field_ops[type]->validate(sb, f) -+ : NULL; -+} -+ -+void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, -+ struct bch_sb_field *f) -+{ -+ unsigned type = le32_to_cpu(f->type); -+ const struct bch_sb_field_ops *ops = type < BCH_SB_FIELD_NR -+ ? bch2_sb_field_ops[type] : NULL; -+ -+ if (ops) -+ pr_buf(out, "%s", bch2_sb_fields[type]); -+ else -+ pr_buf(out, "(unknown field %u)", type); -+ -+ pr_buf(out, " (size %llu):", vstruct_bytes(f)); -+ -+ if (ops && ops->to_text) -+ bch2_sb_field_ops[type]->to_text(out, sb, f); -+} -diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h -new file mode 100644 -index 000000000000..7a068158efca ---- /dev/null -+++ b/fs/bcachefs/super-io.h -@@ -0,0 +1,137 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_SUPER_IO_H -+#define _BCACHEFS_SUPER_IO_H -+ -+#include "extents.h" -+#include "eytzinger.h" -+#include "super_types.h" -+#include "super.h" -+ -+#include -+ -+struct bch_sb_field *bch2_sb_field_get(struct bch_sb *, enum bch_sb_field_type); -+struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *, -+ enum bch_sb_field_type, unsigned); -+void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type); -+ -+#define field_to_type(_f, _name) \ -+ container_of_or_null(_f, struct bch_sb_field_##_name, field) -+ -+#define x(_name, _nr) \ -+static inline struct bch_sb_field_##_name * \ -+bch2_sb_get_##_name(struct bch_sb *sb) \ -+{ \ -+ return field_to_type(bch2_sb_field_get(sb, \ -+ 
BCH_SB_FIELD_##_name), _name); \ -+} \ -+ \ -+static inline struct bch_sb_field_##_name * \ -+bch2_sb_resize_##_name(struct bch_sb_handle *sb, unsigned u64s) \ -+{ \ -+ return field_to_type(bch2_sb_field_resize(sb, \ -+ BCH_SB_FIELD_##_name, u64s), _name); \ -+} -+ -+BCH_SB_FIELDS() -+#undef x -+ -+extern const char * const bch2_sb_fields[]; -+ -+struct bch_sb_field_ops { -+ const char * (*validate)(struct bch_sb *, struct bch_sb_field *); -+ void (*to_text)(struct printbuf *, struct bch_sb *, -+ struct bch_sb_field *); -+}; -+ -+static inline __le64 bch2_sb_magic(struct bch_fs *c) -+{ -+ __le64 ret; -+ memcpy(&ret, &c->sb.uuid, sizeof(ret)); -+ return ret; -+} -+ -+static inline __u64 jset_magic(struct bch_fs *c) -+{ -+ return __le64_to_cpu(bch2_sb_magic(c) ^ JSET_MAGIC); -+} -+ -+static inline __u64 bset_magic(struct bch_fs *c) -+{ -+ return __le64_to_cpu(bch2_sb_magic(c) ^ BSET_MAGIC); -+} -+ -+int bch2_sb_to_fs(struct bch_fs *, struct bch_sb *); -+int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *); -+ -+void bch2_free_super(struct bch_sb_handle *); -+int bch2_sb_realloc(struct bch_sb_handle *, unsigned); -+ -+const char *bch2_sb_validate(struct bch_sb_handle *); -+ -+int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); -+int bch2_write_super(struct bch_fs *); -+void __bch2_check_set_feature(struct bch_fs *, unsigned); -+ -+static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat) -+{ -+ if (!(c->sb.features & (1ULL << feat))) -+ __bch2_check_set_feature(c, feat); -+} -+ -+/* BCH_SB_FIELD_journal: */ -+ -+static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j) -+{ -+ return j -+ ? 
(__le64 *) vstruct_end(&j->field) - j->buckets -+ : 0; -+} -+ -+/* BCH_SB_FIELD_members: */ -+ -+static inline bool bch2_member_exists(struct bch_member *m) -+{ -+ return !bch2_is_zero(m->uuid.b, sizeof(uuid_le)); -+} -+ -+static inline bool bch2_dev_exists(struct bch_sb *sb, -+ struct bch_sb_field_members *mi, -+ unsigned dev) -+{ -+ return dev < sb->nr_devices && -+ bch2_member_exists(&mi->members[dev]); -+} -+ -+static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) -+{ -+ return (struct bch_member_cpu) { -+ .nbuckets = le64_to_cpu(mi->nbuckets), -+ .first_bucket = le16_to_cpu(mi->first_bucket), -+ .bucket_size = le16_to_cpu(mi->bucket_size), -+ .group = BCH_MEMBER_GROUP(mi), -+ .state = BCH_MEMBER_STATE(mi), -+ .replacement = BCH_MEMBER_REPLACEMENT(mi), -+ .discard = BCH_MEMBER_DISCARD(mi), -+ .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi), -+ .durability = BCH_MEMBER_DURABILITY(mi) -+ ? BCH_MEMBER_DURABILITY(mi) - 1 -+ : 1, -+ .valid = !bch2_is_zero(mi->uuid.b, sizeof(uuid_le)), -+ }; -+} -+ -+/* BCH_SB_FIELD_clean: */ -+ -+struct jset_entry * -+bch2_journal_super_entries_add_common(struct bch_fs *, -+ struct jset_entry *, u64); -+ -+void bch2_sb_clean_renumber(struct bch_sb_field_clean *, int); -+ -+int bch2_fs_mark_dirty(struct bch_fs *); -+void bch2_fs_mark_clean(struct bch_fs *); -+ -+void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, -+ struct bch_sb_field *); -+ -+#endif /* _BCACHEFS_SUPER_IO_H */ -diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c -new file mode 100644 -index 000000000000..015bbd9f21fd ---- /dev/null -+++ b/fs/bcachefs/super.c -@@ -0,0 +1,2037 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * bcachefs setup/teardown code, and some metadata io - read a superblock and -+ * figure out what to do with it. -+ * -+ * Copyright 2010, 2011 Kent Overstreet -+ * Copyright 2012 Google, Inc. 
-+ */ -+ -+#include "bcachefs.h" -+#include "alloc_background.h" -+#include "alloc_foreground.h" -+#include "bkey_sort.h" -+#include "btree_cache.h" -+#include "btree_gc.h" -+#include "btree_key_cache.h" -+#include "btree_update_interior.h" -+#include "btree_io.h" -+#include "chardev.h" -+#include "checksum.h" -+#include "clock.h" -+#include "compress.h" -+#include "debug.h" -+#include "disk_groups.h" -+#include "ec.h" -+#include "error.h" -+#include "fs.h" -+#include "fs-io.h" -+#include "fsck.h" -+#include "inode.h" -+#include "io.h" -+#include "journal.h" -+#include "journal_reclaim.h" -+#include "journal_seq_blacklist.h" -+#include "move.h" -+#include "migrate.h" -+#include "movinggc.h" -+#include "quota.h" -+#include "rebalance.h" -+#include "recovery.h" -+#include "replicas.h" -+#include "super.h" -+#include "super-io.h" -+#include "sysfs.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+MODULE_LICENSE("GPL"); -+MODULE_AUTHOR("Kent Overstreet "); -+ -+#define KTYPE(type) \ -+struct kobj_type type ## _ktype = { \ -+ .release = type ## _release, \ -+ .sysfs_ops = &type ## _sysfs_ops, \ -+ .default_attrs = type ## _files \ -+} -+ -+static void bch2_fs_release(struct kobject *); -+static void bch2_dev_release(struct kobject *); -+ -+static void bch2_fs_internal_release(struct kobject *k) -+{ -+} -+ -+static void bch2_fs_opts_dir_release(struct kobject *k) -+{ -+} -+ -+static void bch2_fs_time_stats_release(struct kobject *k) -+{ -+} -+ -+static KTYPE(bch2_fs); -+static KTYPE(bch2_fs_internal); -+static KTYPE(bch2_fs_opts_dir); -+static KTYPE(bch2_fs_time_stats); -+static KTYPE(bch2_dev); -+ -+static struct kset *bcachefs_kset; -+static LIST_HEAD(bch_fs_list); -+static DEFINE_MUTEX(bch_fs_list_lock); -+ -+static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait); -+ -+static void bch2_dev_free(struct bch_dev *); -+static int bch2_dev_alloc(struct bch_fs *, 
unsigned); -+static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *); -+static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *); -+ -+struct bch_fs *bch2_bdev_to_fs(struct block_device *bdev) -+{ -+ struct bch_fs *c; -+ struct bch_dev *ca; -+ unsigned i; -+ -+ mutex_lock(&bch_fs_list_lock); -+ rcu_read_lock(); -+ -+ list_for_each_entry(c, &bch_fs_list, list) -+ for_each_member_device_rcu(ca, c, i, NULL) -+ if (ca->disk_sb.bdev == bdev) { -+ closure_get(&c->cl); -+ goto found; -+ } -+ c = NULL; -+found: -+ rcu_read_unlock(); -+ mutex_unlock(&bch_fs_list_lock); -+ -+ return c; -+} -+ -+static struct bch_fs *__bch2_uuid_to_fs(uuid_le uuid) -+{ -+ struct bch_fs *c; -+ -+ lockdep_assert_held(&bch_fs_list_lock); -+ -+ list_for_each_entry(c, &bch_fs_list, list) -+ if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid_le))) -+ return c; -+ -+ return NULL; -+} -+ -+struct bch_fs *bch2_uuid_to_fs(uuid_le uuid) -+{ -+ struct bch_fs *c; -+ -+ mutex_lock(&bch_fs_list_lock); -+ c = __bch2_uuid_to_fs(uuid); -+ if (c) -+ closure_get(&c->cl); -+ mutex_unlock(&bch_fs_list_lock); -+ -+ return c; -+} -+ -+/* Filesystem RO/RW: */ -+ -+/* -+ * For startup/shutdown of RW stuff, the dependencies are: -+ * -+ * - foreground writes depend on copygc and rebalance (to free up space) -+ * -+ * - copygc and rebalance depend on mark and sweep gc (they actually probably -+ * don't because they either reserve ahead of time or don't block if -+ * allocations fail, but allocations can require mark and sweep gc to run -+ * because of generation number wraparound) -+ * -+ * - all of the above depends on the allocator threads -+ * -+ * - allocator depends on the journal (when it rewrites prios and gens) -+ */ -+ -+static void __bch2_fs_read_only(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ unsigned i, clean_passes = 0; -+ -+ bch2_rebalance_stop(c); -+ bch2_copygc_stop(c); -+ bch2_gc_thread_stop(c); -+ -+ bch2_io_timer_del(&c->io_clock[READ], &c->bucket_clock[READ].rescale); -+ 
bch2_io_timer_del(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale); -+ -+ /* -+ * Flush journal before stopping allocators, because flushing journal -+ * blacklist entries involves allocating new btree nodes: -+ */ -+ bch2_journal_flush_all_pins(&c->journal); -+ -+ /* -+ * If the allocator threads didn't all start up, the btree updates to -+ * write out alloc info aren't going to work: -+ */ -+ if (!test_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags)) -+ goto nowrote_alloc; -+ -+ bch_verbose(c, "flushing journal and stopping allocators"); -+ -+ bch2_journal_flush_all_pins(&c->journal); -+ set_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags); -+ -+ do { -+ clean_passes++; -+ -+ if (bch2_journal_flush_all_pins(&c->journal)) -+ clean_passes = 0; -+ -+ /* -+ * In flight interior btree updates will generate more journal -+ * updates and btree updates (alloc btree): -+ */ -+ if (bch2_btree_interior_updates_nr_pending(c)) { -+ closure_wait_event(&c->btree_interior_update_wait, -+ !bch2_btree_interior_updates_nr_pending(c)); -+ clean_passes = 0; -+ } -+ flush_work(&c->btree_interior_update_work); -+ -+ if (bch2_journal_flush_all_pins(&c->journal)) -+ clean_passes = 0; -+ } while (clean_passes < 2); -+ bch_verbose(c, "flushing journal and stopping allocators complete"); -+ -+ set_bit(BCH_FS_ALLOC_CLEAN, &c->flags); -+nowrote_alloc: -+ closure_wait_event(&c->btree_interior_update_wait, -+ !bch2_btree_interior_updates_nr_pending(c)); -+ flush_work(&c->btree_interior_update_work); -+ -+ for_each_member_device(ca, c, i) -+ bch2_dev_allocator_stop(ca); -+ -+ clear_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); -+ clear_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags); -+ -+ bch2_fs_journal_stop(&c->journal); -+ -+ /* -+ * the journal kicks off btree writes via reclaim - wait for in flight -+ * writes after stopping journal: -+ */ -+ if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) -+ bch2_btree_flush_all_writes(c); -+ else -+ bch2_btree_verify_flushed(c); -+ -+ /* -+ * After stopping journal: -+ */ -+ 
for_each_member_device(ca, c, i) -+ bch2_dev_allocator_remove(c, ca); -+} -+ -+static void bch2_writes_disabled(struct percpu_ref *writes) -+{ -+ struct bch_fs *c = container_of(writes, struct bch_fs, writes); -+ -+ set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); -+ wake_up(&bch_read_only_wait); -+} -+ -+void bch2_fs_read_only(struct bch_fs *c) -+{ -+ if (!test_bit(BCH_FS_RW, &c->flags)) { -+ cancel_delayed_work_sync(&c->journal.reclaim_work); -+ return; -+ } -+ -+ BUG_ON(test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); -+ -+ /* -+ * Block new foreground-end write operations from starting - any new -+ * writes will return -EROFS: -+ * -+ * (This is really blocking new _allocations_, writes to previously -+ * allocated space can still happen until stopping the allocator in -+ * bch2_dev_allocator_stop()). -+ */ -+ percpu_ref_kill(&c->writes); -+ -+ cancel_work_sync(&c->ec_stripe_delete_work); -+ cancel_delayed_work(&c->pd_controllers_update); -+ -+ /* -+ * If we're not doing an emergency shutdown, we want to wait on -+ * outstanding writes to complete so they don't see spurious errors due -+ * to shutting down the allocator: -+ * -+ * If we are doing an emergency shutdown outstanding writes may -+ * hang until we shutdown the allocator so we don't want to wait -+ * on outstanding writes before shutting everything down - but -+ * we do need to wait on them before returning and signalling -+ * that going RO is complete: -+ */ -+ wait_event(bch_read_only_wait, -+ test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) || -+ test_bit(BCH_FS_EMERGENCY_RO, &c->flags)); -+ -+ __bch2_fs_read_only(c); -+ -+ wait_event(bch_read_only_wait, -+ test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); -+ -+ clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); -+ -+ if (!bch2_journal_error(&c->journal) && -+ !test_bit(BCH_FS_ERROR, &c->flags) && -+ !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) && -+ test_bit(BCH_FS_STARTED, &c->flags) && -+ test_bit(BCH_FS_ALLOC_CLEAN, &c->flags) && -+ 
!c->opts.norecovery) { -+ bch_verbose(c, "marking filesystem clean"); -+ bch2_fs_mark_clean(c); -+ } -+ -+ clear_bit(BCH_FS_RW, &c->flags); -+} -+ -+static void bch2_fs_read_only_work(struct work_struct *work) -+{ -+ struct bch_fs *c = -+ container_of(work, struct bch_fs, read_only_work); -+ -+ down_write(&c->state_lock); -+ bch2_fs_read_only(c); -+ up_write(&c->state_lock); -+} -+ -+static void bch2_fs_read_only_async(struct bch_fs *c) -+{ -+ queue_work(system_long_wq, &c->read_only_work); -+} -+ -+bool bch2_fs_emergency_read_only(struct bch_fs *c) -+{ -+ bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags); -+ -+ bch2_journal_halt(&c->journal); -+ bch2_fs_read_only_async(c); -+ -+ wake_up(&bch_read_only_wait); -+ return ret; -+} -+ -+static int bch2_fs_read_write_late(struct bch_fs *c) -+{ -+ int ret; -+ -+ ret = bch2_gc_thread_start(c); -+ if (ret) { -+ bch_err(c, "error starting gc thread"); -+ return ret; -+ } -+ -+ ret = bch2_copygc_start(c); -+ if (ret) { -+ bch_err(c, "error starting copygc thread"); -+ return ret; -+ } -+ -+ ret = bch2_rebalance_start(c); -+ if (ret) { -+ bch_err(c, "error starting rebalance thread"); -+ return ret; -+ } -+ -+ schedule_delayed_work(&c->pd_controllers_update, 5 * HZ); -+ -+ schedule_work(&c->ec_stripe_delete_work); -+ -+ return 0; -+} -+ -+static int __bch2_fs_read_write(struct bch_fs *c, bool early) -+{ -+ struct bch_dev *ca; -+ unsigned i; -+ int ret; -+ -+ if (test_bit(BCH_FS_RW, &c->flags)) -+ return 0; -+ -+ /* -+ * nochanges is used for fsck -n mode - we have to allow going rw -+ * during recovery for that to work: -+ */ -+ if (c->opts.norecovery || -+ (c->opts.nochanges && -+ (!early || c->opts.read_only))) -+ return -EROFS; -+ -+ ret = bch2_fs_mark_dirty(c); -+ if (ret) -+ goto err; -+ -+ /* -+ * We need to write out a journal entry before we start doing btree -+ * updates, to ensure that on unclean shutdown new journal blacklist -+ * entries are created: -+ */ -+ bch2_journal_meta(&c->journal); -+ -+ 
clear_bit(BCH_FS_ALLOC_CLEAN, &c->flags); -+ -+ for_each_rw_member(ca, c, i) -+ bch2_dev_allocator_add(c, ca); -+ bch2_recalc_capacity(c); -+ -+ bch2_io_timer_add(&c->io_clock[READ], &c->bucket_clock[READ].rescale); -+ bch2_io_timer_add(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale); -+ -+ for_each_rw_member(ca, c, i) { -+ ret = bch2_dev_allocator_start(ca); -+ if (ret) { -+ bch_err(c, "error starting allocator threads"); -+ percpu_ref_put(&ca->io_ref); -+ goto err; -+ } -+ } -+ -+ set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); -+ -+ if (!early) { -+ ret = bch2_fs_read_write_late(c); -+ if (ret) -+ goto err; -+ } -+ -+ percpu_ref_reinit(&c->writes); -+ set_bit(BCH_FS_RW, &c->flags); -+ -+ queue_delayed_work(c->journal_reclaim_wq, -+ &c->journal.reclaim_work, 0); -+ return 0; -+err: -+ __bch2_fs_read_only(c); -+ return ret; -+} -+ -+int bch2_fs_read_write(struct bch_fs *c) -+{ -+ return __bch2_fs_read_write(c, false); -+} -+ -+int bch2_fs_read_write_early(struct bch_fs *c) -+{ -+ lockdep_assert_held(&c->state_lock); -+ -+ return __bch2_fs_read_write(c, true); -+} -+ -+/* Filesystem startup/shutdown: */ -+ -+static void __bch2_fs_free(struct bch_fs *c) -+{ -+ unsigned i; -+ -+ for (i = 0; i < BCH_TIME_STAT_NR; i++) -+ bch2_time_stats_exit(&c->times[i]); -+ -+ bch2_fs_quota_exit(c); -+ bch2_fs_fsio_exit(c); -+ bch2_fs_ec_exit(c); -+ bch2_fs_encryption_exit(c); -+ bch2_fs_io_exit(c); -+ bch2_fs_btree_interior_update_exit(c); -+ bch2_fs_btree_iter_exit(c); -+ bch2_fs_btree_key_cache_exit(&c->btree_key_cache); -+ bch2_fs_btree_cache_exit(c); -+ bch2_fs_journal_exit(&c->journal); -+ bch2_io_clock_exit(&c->io_clock[WRITE]); -+ bch2_io_clock_exit(&c->io_clock[READ]); -+ bch2_fs_compress_exit(c); -+ bch2_journal_keys_free(&c->journal_keys); -+ bch2_journal_entries_free(&c->journal_entries); -+ percpu_free_rwsem(&c->mark_lock); -+ kfree(c->usage_scratch); -+ free_percpu(c->usage[1]); -+ free_percpu(c->usage[0]); -+ kfree(c->usage_base); -+ free_percpu(c->pcpu); -+ 
mempool_exit(&c->large_bkey_pool); -+ mempool_exit(&c->btree_bounce_pool); -+ bioset_exit(&c->btree_bio); -+ mempool_exit(&c->fill_iter); -+ percpu_ref_exit(&c->writes); -+ kfree(c->replicas.entries); -+ kfree(c->replicas_gc.entries); -+ kfree(rcu_dereference_protected(c->disk_groups, 1)); -+ kfree(c->journal_seq_blacklist_table); -+ free_heap(&c->copygc_heap); -+ -+ if (c->journal_reclaim_wq) -+ destroy_workqueue(c->journal_reclaim_wq); -+ if (c->copygc_wq) -+ destroy_workqueue(c->copygc_wq); -+ if (c->wq) -+ destroy_workqueue(c->wq); -+ -+ free_pages((unsigned long) c->disk_sb.sb, -+ c->disk_sb.page_order); -+ kvpfree(c, sizeof(*c)); -+ module_put(THIS_MODULE); -+} -+ -+static void bch2_fs_release(struct kobject *kobj) -+{ -+ struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); -+ -+ __bch2_fs_free(c); -+} -+ -+void __bch2_fs_stop(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ unsigned i; -+ -+ bch_verbose(c, "shutting down"); -+ -+ set_bit(BCH_FS_STOPPING, &c->flags); -+ -+ cancel_work_sync(&c->journal_seq_blacklist_gc_work); -+ -+ down_write(&c->state_lock); -+ bch2_fs_read_only(c); -+ up_write(&c->state_lock); -+ -+ for_each_member_device(ca, c, i) -+ if (ca->kobj.state_in_sysfs && -+ ca->disk_sb.bdev) -+ sysfs_remove_link(&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj, -+ "bcachefs"); -+ -+ if (c->kobj.state_in_sysfs) -+ kobject_del(&c->kobj); -+ -+ bch2_fs_debug_exit(c); -+ bch2_fs_chardev_exit(c); -+ -+ kobject_put(&c->time_stats); -+ kobject_put(&c->opts_dir); -+ kobject_put(&c->internal); -+ -+ /* btree prefetch might have kicked off reads in the background: */ -+ bch2_btree_flush_all_reads(c); -+ -+ for_each_member_device(ca, c, i) -+ cancel_work_sync(&ca->io_error_work); -+ -+ cancel_work_sync(&c->btree_write_error_work); -+ cancel_delayed_work_sync(&c->pd_controllers_update); -+ cancel_work_sync(&c->read_only_work); -+ -+ for (i = 0; i < c->sb.nr_devices; i++) -+ if (c->devs[i]) -+ bch2_free_super(&c->devs[i]->disk_sb); -+} -+ -+void 
bch2_fs_free(struct bch_fs *c) -+{ -+ unsigned i; -+ -+ mutex_lock(&bch_fs_list_lock); -+ list_del(&c->list); -+ mutex_unlock(&bch_fs_list_lock); -+ -+ closure_sync(&c->cl); -+ closure_debug_destroy(&c->cl); -+ -+ for (i = 0; i < c->sb.nr_devices; i++) -+ if (c->devs[i]) -+ bch2_dev_free(rcu_dereference_protected(c->devs[i], 1)); -+ -+ bch_verbose(c, "shutdown complete"); -+ -+ kobject_put(&c->kobj); -+} -+ -+void bch2_fs_stop(struct bch_fs *c) -+{ -+ __bch2_fs_stop(c); -+ bch2_fs_free(c); -+} -+ -+static const char *bch2_fs_online(struct bch_fs *c) -+{ -+ struct bch_dev *ca; -+ const char *err = NULL; -+ unsigned i; -+ int ret; -+ -+ lockdep_assert_held(&bch_fs_list_lock); -+ -+ if (!list_empty(&c->list)) -+ return NULL; -+ -+ if (__bch2_uuid_to_fs(c->sb.uuid)) -+ return "filesystem UUID already open"; -+ -+ ret = bch2_fs_chardev_init(c); -+ if (ret) -+ return "error creating character device"; -+ -+ bch2_fs_debug_init(c); -+ -+ if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) || -+ kobject_add(&c->internal, &c->kobj, "internal") || -+ kobject_add(&c->opts_dir, &c->kobj, "options") || -+ kobject_add(&c->time_stats, &c->kobj, "time_stats") || -+ bch2_opts_create_sysfs_files(&c->opts_dir)) -+ return "error creating sysfs objects"; -+ -+ down_write(&c->state_lock); -+ -+ err = "error creating sysfs objects"; -+ __for_each_member_device(ca, c, i, NULL) -+ if (bch2_dev_sysfs_online(c, ca)) -+ goto err; -+ -+ list_add(&c->list, &bch_fs_list); -+ err = NULL; -+err: -+ up_write(&c->state_lock); -+ return err; -+} -+ -+static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) -+{ -+ struct bch_sb_field_members *mi; -+ struct bch_fs *c; -+ unsigned i, iter_size; -+ const char *err; -+ -+ pr_verbose_init(opts, ""); -+ -+ c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); -+ if (!c) -+ goto out; -+ -+ __module_get(THIS_MODULE); -+ -+ closure_init(&c->cl, NULL); -+ -+ c->kobj.kset = bcachefs_kset; -+ kobject_init(&c->kobj, &bch2_fs_ktype); 
-+ kobject_init(&c->internal, &bch2_fs_internal_ktype); -+ kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype); -+ kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype); -+ -+ c->minor = -1; -+ c->disk_sb.fs_sb = true; -+ -+ init_rwsem(&c->state_lock); -+ mutex_init(&c->sb_lock); -+ mutex_init(&c->replicas_gc_lock); -+ mutex_init(&c->btree_root_lock); -+ INIT_WORK(&c->read_only_work, bch2_fs_read_only_work); -+ -+ init_rwsem(&c->gc_lock); -+ -+ for (i = 0; i < BCH_TIME_STAT_NR; i++) -+ bch2_time_stats_init(&c->times[i]); -+ -+ bch2_fs_copygc_init(c); -+ bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); -+ bch2_fs_allocator_background_init(c); -+ bch2_fs_allocator_foreground_init(c); -+ bch2_fs_rebalance_init(c); -+ bch2_fs_quota_init(c); -+ -+ INIT_LIST_HEAD(&c->list); -+ -+ mutex_init(&c->usage_scratch_lock); -+ -+ mutex_init(&c->bio_bounce_pages_lock); -+ -+ bio_list_init(&c->btree_write_error_list); -+ spin_lock_init(&c->btree_write_error_lock); -+ INIT_WORK(&c->btree_write_error_work, bch2_btree_write_error_work); -+ -+ INIT_WORK(&c->journal_seq_blacklist_gc_work, -+ bch2_blacklist_entries_gc); -+ -+ INIT_LIST_HEAD(&c->journal_entries); -+ -+ INIT_LIST_HEAD(&c->fsck_errors); -+ mutex_init(&c->fsck_error_lock); -+ -+ INIT_LIST_HEAD(&c->ec_stripe_head_list); -+ mutex_init(&c->ec_stripe_head_lock); -+ -+ INIT_LIST_HEAD(&c->ec_stripe_new_list); -+ mutex_init(&c->ec_stripe_new_lock); -+ -+ spin_lock_init(&c->ec_stripes_heap_lock); -+ -+ seqcount_init(&c->gc_pos_lock); -+ -+ seqcount_init(&c->usage_lock); -+ -+ sema_init(&c->io_in_flight, 64); -+ -+ c->copy_gc_enabled = 1; -+ c->rebalance.enabled = 1; -+ c->promote_whole_extents = true; -+ -+ c->journal.write_time = &c->times[BCH_TIME_journal_write]; -+ c->journal.delay_time = &c->times[BCH_TIME_journal_delay]; -+ c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal]; -+ c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; -+ -+ bch2_fs_btree_cache_init_early(&c->btree_cache); -+ -+ if 
(percpu_init_rwsem(&c->mark_lock)) -+ goto err; -+ -+ mutex_lock(&c->sb_lock); -+ -+ if (bch2_sb_to_fs(c, sb)) { -+ mutex_unlock(&c->sb_lock); -+ goto err; -+ } -+ -+ mutex_unlock(&c->sb_lock); -+ -+ scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid); -+ -+ c->opts = bch2_opts_default; -+ bch2_opts_apply(&c->opts, bch2_opts_from_sb(sb)); -+ bch2_opts_apply(&c->opts, opts); -+ -+ c->block_bits = ilog2(c->opts.block_size); -+ c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c); -+ -+ if (bch2_fs_init_fault("fs_alloc")) -+ goto err; -+ -+ iter_size = sizeof(struct sort_iter) + -+ (btree_blocks(c) + 1) * 2 * -+ sizeof(struct sort_iter_set); -+ -+ if (!(c->wq = alloc_workqueue("bcachefs", -+ WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || -+ !(c->copygc_wq = alloc_workqueue("bcache_copygc", -+ WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || -+ !(c->journal_reclaim_wq = alloc_workqueue("bcache_journal", -+ WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || -+ percpu_ref_init(&c->writes, bch2_writes_disabled, -+ PERCPU_REF_INIT_DEAD, GFP_KERNEL) || -+ mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || -+ bioset_init(&c->btree_bio, 1, -+ max(offsetof(struct btree_read_bio, bio), -+ offsetof(struct btree_write_bio, wbio.bio)), -+ BIOSET_NEED_BVECS) || -+ !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || -+ mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, -+ btree_bytes(c)) || -+ mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || -+ bch2_io_clock_init(&c->io_clock[READ]) || -+ bch2_io_clock_init(&c->io_clock[WRITE]) || -+ bch2_fs_journal_init(&c->journal) || -+ bch2_fs_replicas_init(c) || -+ bch2_fs_btree_cache_init(c) || -+ bch2_fs_btree_key_cache_init(&c->btree_key_cache) || -+ bch2_fs_btree_iter_init(c) || -+ bch2_fs_btree_interior_update_init(c) || -+ bch2_fs_io_init(c) || -+ bch2_fs_encryption_init(c) || -+ bch2_fs_compress_init(c) || -+ bch2_fs_ec_init(c) || -+ bch2_fs_fsio_init(c)) -+ goto err; -+ -+ mi = 
bch2_sb_get_members(c->disk_sb.sb); -+ for (i = 0; i < c->sb.nr_devices; i++) -+ if (bch2_dev_exists(c->disk_sb.sb, mi, i) && -+ bch2_dev_alloc(c, i)) -+ goto err; -+ -+ mutex_lock(&bch_fs_list_lock); -+ err = bch2_fs_online(c); -+ mutex_unlock(&bch_fs_list_lock); -+ if (err) { -+ bch_err(c, "bch2_fs_online() error: %s", err); -+ goto err; -+ } -+out: -+ pr_verbose_init(opts, "ret %i", c ? 0 : -ENOMEM); -+ return c; -+err: -+ bch2_fs_free(c); -+ c = NULL; -+ goto out; -+} -+ -+noinline_for_stack -+static void print_mount_opts(struct bch_fs *c) -+{ -+ enum bch_opt_id i; -+ char buf[512]; -+ struct printbuf p = PBUF(buf); -+ bool first = true; -+ -+ strcpy(buf, "(null)"); -+ -+ if (c->opts.read_only) { -+ pr_buf(&p, "ro"); -+ first = false; -+ } -+ -+ for (i = 0; i < bch2_opts_nr; i++) { -+ const struct bch_option *opt = &bch2_opt_table[i]; -+ u64 v = bch2_opt_get_by_id(&c->opts, i); -+ -+ if (!(opt->mode & OPT_MOUNT)) -+ continue; -+ -+ if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) -+ continue; -+ -+ if (!first) -+ pr_buf(&p, ","); -+ first = false; -+ bch2_opt_to_text(&p, c, opt, v, OPT_SHOW_MOUNT_STYLE); -+ } -+ -+ bch_info(c, "mounted with opts: %s", buf); -+} -+ -+int bch2_fs_start(struct bch_fs *c) -+{ -+ const char *err = "cannot allocate memory"; -+ struct bch_sb_field_members *mi; -+ struct bch_dev *ca; -+ time64_t now = ktime_get_real_seconds(); -+ unsigned i; -+ int ret = -EINVAL; -+ -+ down_write(&c->state_lock); -+ -+ BUG_ON(test_bit(BCH_FS_STARTED, &c->flags)); -+ -+ mutex_lock(&c->sb_lock); -+ -+ for_each_online_member(ca, c, i) -+ bch2_sb_from_fs(c, ca); -+ -+ mi = bch2_sb_get_members(c->disk_sb.sb); -+ for_each_online_member(ca, c, i) -+ mi->members[ca->dev_idx].last_mount = cpu_to_le64(now); -+ -+ mutex_unlock(&c->sb_lock); -+ -+ for_each_rw_member(ca, c, i) -+ bch2_dev_allocator_add(c, ca); -+ bch2_recalc_capacity(c); -+ -+ ret = BCH_SB_INITIALIZED(c->disk_sb.sb) -+ ? 
bch2_fs_recovery(c) -+ : bch2_fs_initialize(c); -+ if (ret) -+ goto err; -+ -+ ret = bch2_opts_check_may_set(c); -+ if (ret) -+ goto err; -+ -+ err = "dynamic fault"; -+ ret = -EINVAL; -+ if (bch2_fs_init_fault("fs_start")) -+ goto err; -+ -+ set_bit(BCH_FS_STARTED, &c->flags); -+ -+ /* -+ * Allocator threads don't start filling copygc reserve until after we -+ * set BCH_FS_STARTED - wake them now: -+ */ -+ for_each_online_member(ca, c, i) -+ bch2_wake_allocator(ca); -+ -+ if (c->opts.read_only || c->opts.nochanges) { -+ bch2_fs_read_only(c); -+ } else { -+ err = "error going read write"; -+ ret = !test_bit(BCH_FS_RW, &c->flags) -+ ? bch2_fs_read_write(c) -+ : bch2_fs_read_write_late(c); -+ if (ret) -+ goto err; -+ } -+ -+ print_mount_opts(c); -+ ret = 0; -+out: -+ up_write(&c->state_lock); -+ return ret; -+err: -+ switch (ret) { -+ case BCH_FSCK_ERRORS_NOT_FIXED: -+ bch_err(c, "filesystem contains errors: please report this to the developers"); -+ pr_cont("mount with -o fix_errors to repair\n"); -+ err = "fsck error"; -+ break; -+ case BCH_FSCK_REPAIR_UNIMPLEMENTED: -+ bch_err(c, "filesystem contains errors: please report this to the developers"); -+ pr_cont("repair unimplemented: inform the developers so that it can be added\n"); -+ err = "fsck error"; -+ break; -+ case BCH_FSCK_REPAIR_IMPOSSIBLE: -+ bch_err(c, "filesystem contains errors, but repair impossible"); -+ err = "fsck error"; -+ break; -+ case BCH_FSCK_UNKNOWN_VERSION: -+ err = "unknown metadata version";; -+ break; -+ case -ENOMEM: -+ err = "cannot allocate memory"; -+ break; -+ case -EIO: -+ err = "IO error"; -+ break; -+ } -+ -+ if (ret >= 0) -+ ret = -EIO; -+ goto out; -+} -+ -+static const char *bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) -+{ -+ struct bch_sb_field_members *sb_mi; -+ -+ sb_mi = bch2_sb_get_members(sb); -+ if (!sb_mi) -+ return "Invalid superblock: member info area missing"; -+ -+ if (le16_to_cpu(sb->block_size) != c->opts.block_size) -+ return "mismatched block size"; -+ 
-+ if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) < -+ BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb)) -+ return "new cache bucket size is too small"; -+ -+ return NULL; -+} -+ -+static const char *bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb) -+{ -+ struct bch_sb *newest = -+ le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? fs : sb; -+ struct bch_sb_field_members *mi = bch2_sb_get_members(newest); -+ -+ if (uuid_le_cmp(fs->uuid, sb->uuid)) -+ return "device not a member of filesystem"; -+ -+ if (!bch2_dev_exists(newest, mi, sb->dev_idx)) -+ return "device has been removed"; -+ -+ if (fs->block_size != sb->block_size) -+ return "mismatched block size"; -+ -+ return NULL; -+} -+ -+/* Device startup/shutdown: */ -+ -+static void bch2_dev_release(struct kobject *kobj) -+{ -+ struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); -+ -+ kfree(ca); -+} -+ -+static void bch2_dev_free(struct bch_dev *ca) -+{ -+ cancel_work_sync(&ca->io_error_work); -+ -+ if (ca->kobj.state_in_sysfs && -+ ca->disk_sb.bdev) -+ sysfs_remove_link(&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj, -+ "bcachefs"); -+ -+ if (ca->kobj.state_in_sysfs) -+ kobject_del(&ca->kobj); -+ -+ bch2_free_super(&ca->disk_sb); -+ bch2_dev_journal_exit(ca); -+ -+ free_percpu(ca->io_done); -+ bioset_exit(&ca->replica_set); -+ bch2_dev_buckets_free(ca); -+ free_page((unsigned long) ca->sb_read_scratch); -+ -+ bch2_time_stats_exit(&ca->io_latency[WRITE]); -+ bch2_time_stats_exit(&ca->io_latency[READ]); -+ -+ percpu_ref_exit(&ca->io_ref); -+ percpu_ref_exit(&ca->ref); -+ kobject_put(&ca->kobj); -+} -+ -+static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca) -+{ -+ -+ lockdep_assert_held(&c->state_lock); -+ -+ if (percpu_ref_is_zero(&ca->io_ref)) -+ return; -+ -+ __bch2_dev_read_only(c, ca); -+ -+ reinit_completion(&ca->io_ref_completion); -+ percpu_ref_kill(&ca->io_ref); -+ wait_for_completion(&ca->io_ref_completion); -+ -+ if (ca->kobj.state_in_sysfs) { -+ struct kobject *block = -+ 
&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj; -+ -+ sysfs_remove_link(block, "bcachefs"); -+ sysfs_remove_link(&ca->kobj, "block"); -+ } -+ -+ bch2_free_super(&ca->disk_sb); -+ bch2_dev_journal_exit(ca); -+} -+ -+static void bch2_dev_ref_complete(struct percpu_ref *ref) -+{ -+ struct bch_dev *ca = container_of(ref, struct bch_dev, ref); -+ -+ complete(&ca->ref_completion); -+} -+ -+static void bch2_dev_io_ref_complete(struct percpu_ref *ref) -+{ -+ struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref); -+ -+ complete(&ca->io_ref_completion); -+} -+ -+static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca) -+{ -+ int ret; -+ -+ if (!c->kobj.state_in_sysfs) -+ return 0; -+ -+ if (!ca->kobj.state_in_sysfs) { -+ ret = kobject_add(&ca->kobj, &c->kobj, -+ "dev-%u", ca->dev_idx); -+ if (ret) -+ return ret; -+ } -+ -+ if (ca->disk_sb.bdev) { -+ struct kobject *block = -+ &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj; -+ -+ ret = sysfs_create_link(block, &ca->kobj, "bcachefs"); -+ if (ret) -+ return ret; -+ ret = sysfs_create_link(&ca->kobj, block, "block"); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, -+ struct bch_member *member) -+{ -+ struct bch_dev *ca; -+ -+ ca = kzalloc(sizeof(*ca), GFP_KERNEL); -+ if (!ca) -+ return NULL; -+ -+ kobject_init(&ca->kobj, &bch2_dev_ktype); -+ init_completion(&ca->ref_completion); -+ init_completion(&ca->io_ref_completion); -+ -+ init_rwsem(&ca->bucket_lock); -+ -+ INIT_WORK(&ca->io_error_work, bch2_io_error_work); -+ -+ bch2_time_stats_init(&ca->io_latency[READ]); -+ bch2_time_stats_init(&ca->io_latency[WRITE]); -+ -+ ca->mi = bch2_mi_to_cpu(member); -+ ca->uuid = member->uuid; -+ -+ if (opt_defined(c->opts, discard)) -+ ca->mi.discard = opt_get(c->opts, discard); -+ -+ if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, -+ 0, GFP_KERNEL) || -+ percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, -+ PERCPU_REF_INIT_DEAD, GFP_KERNEL) || -+ 
!(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) || -+ bch2_dev_buckets_alloc(c, ca) || -+ bioset_init(&ca->replica_set, 4, -+ offsetof(struct bch_write_bio, bio), 0) || -+ !(ca->io_done = alloc_percpu(*ca->io_done))) -+ goto err; -+ -+ return ca; -+err: -+ bch2_dev_free(ca); -+ return NULL; -+} -+ -+static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca, -+ unsigned dev_idx) -+{ -+ ca->dev_idx = dev_idx; -+ __set_bit(ca->dev_idx, ca->self.d); -+ scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx); -+ -+ ca->fs = c; -+ rcu_assign_pointer(c->devs[ca->dev_idx], ca); -+ -+ if (bch2_dev_sysfs_online(c, ca)) -+ pr_warn("error creating sysfs objects"); -+} -+ -+static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) -+{ -+ struct bch_member *member = -+ bch2_sb_get_members(c->disk_sb.sb)->members + dev_idx; -+ struct bch_dev *ca = NULL; -+ int ret = 0; -+ -+ pr_verbose_init(c->opts, ""); -+ -+ if (bch2_fs_init_fault("dev_alloc")) -+ goto err; -+ -+ ca = __bch2_dev_alloc(c, member); -+ if (!ca) -+ goto err; -+ -+ bch2_dev_attach(c, ca, dev_idx); -+out: -+ pr_verbose_init(c->opts, "ret %i", ret); -+ return ret; -+err: -+ if (ca) -+ bch2_dev_free(ca); -+ ret = -ENOMEM; -+ goto out; -+} -+ -+static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) -+{ -+ unsigned ret; -+ -+ if (bch2_dev_is_online(ca)) { -+ bch_err(ca, "already have device online in slot %u", -+ sb->sb->dev_idx); -+ return -EINVAL; -+ } -+ -+ if (get_capacity(sb->bdev->bd_disk) < -+ ca->mi.bucket_size * ca->mi.nbuckets) { -+ bch_err(ca, "cannot online: device too small"); -+ return -EINVAL; -+ } -+ -+ BUG_ON(!percpu_ref_is_zero(&ca->io_ref)); -+ -+ if (get_capacity(sb->bdev->bd_disk) < -+ ca->mi.bucket_size * ca->mi.nbuckets) { -+ bch_err(ca, "device too small"); -+ return -EINVAL; -+ } -+ -+ ret = bch2_dev_journal_init(ca, sb->sb); -+ if (ret) -+ return ret; -+ -+ /* Commit: */ -+ ca->disk_sb = *sb; -+ if (sb->mode & FMODE_EXCL) -+ 
ca->disk_sb.bdev->bd_holder = ca; -+ memset(sb, 0, sizeof(*sb)); -+ -+ percpu_ref_reinit(&ca->io_ref); -+ -+ return 0; -+} -+ -+static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) -+{ -+ struct bch_dev *ca; -+ int ret; -+ -+ lockdep_assert_held(&c->state_lock); -+ -+ if (le64_to_cpu(sb->sb->seq) > -+ le64_to_cpu(c->disk_sb.sb->seq)) -+ bch2_sb_to_fs(c, sb->sb); -+ -+ BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices || -+ !c->devs[sb->sb->dev_idx]); -+ -+ ca = bch_dev_locked(c, sb->sb->dev_idx); -+ -+ ret = __bch2_dev_attach_bdev(ca, sb); -+ if (ret) -+ return ret; -+ -+ if (test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags) && -+ !percpu_u64_get(&ca->usage[0]->buckets[BCH_DATA_sb])) { -+ mutex_lock(&c->sb_lock); -+ bch2_mark_dev_superblock(ca->fs, ca, 0); -+ mutex_unlock(&c->sb_lock); -+ } -+ -+ bch2_dev_sysfs_online(c, ca); -+ -+ if (c->sb.nr_devices == 1) -+ bdevname(ca->disk_sb.bdev, c->name); -+ bdevname(ca->disk_sb.bdev, ca->name); -+ -+ rebalance_wakeup(c); -+ return 0; -+} -+ -+/* Device management: */ -+ -+/* -+ * Note: this function is also used by the error paths - when a particular -+ * device sees an error, we call it to determine whether we can just set the -+ * device RO, or - if this function returns false - we'll set the whole -+ * filesystem RO: -+ * -+ * XXX: maybe we should be more explicit about whether we're changing state -+ * because we got an error or what have you? -+ */ -+bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, -+ enum bch_member_state new_state, int flags) -+{ -+ struct bch_devs_mask new_online_devs; -+ struct replicas_status s; -+ struct bch_dev *ca2; -+ int i, nr_rw = 0, required; -+ -+ lockdep_assert_held(&c->state_lock); -+ -+ switch (new_state) { -+ case BCH_MEMBER_STATE_RW: -+ return true; -+ case BCH_MEMBER_STATE_RO: -+ if (ca->mi.state != BCH_MEMBER_STATE_RW) -+ return true; -+ -+ /* do we have enough devices to write to? 
*/ -+ for_each_member_device(ca2, c, i) -+ if (ca2 != ca) -+ nr_rw += ca2->mi.state == BCH_MEMBER_STATE_RW; -+ -+ required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED) -+ ? c->opts.metadata_replicas -+ : c->opts.metadata_replicas_required, -+ !(flags & BCH_FORCE_IF_DATA_DEGRADED) -+ ? c->opts.data_replicas -+ : c->opts.data_replicas_required); -+ -+ return nr_rw >= required; -+ case BCH_MEMBER_STATE_FAILED: -+ case BCH_MEMBER_STATE_SPARE: -+ if (ca->mi.state != BCH_MEMBER_STATE_RW && -+ ca->mi.state != BCH_MEMBER_STATE_RO) -+ return true; -+ -+ /* do we have enough devices to read from? */ -+ new_online_devs = bch2_online_devs(c); -+ __clear_bit(ca->dev_idx, new_online_devs.d); -+ -+ s = __bch2_replicas_status(c, new_online_devs); -+ -+ return bch2_have_enough_devs(s, flags); -+ default: -+ BUG(); -+ } -+} -+ -+static bool bch2_fs_may_start(struct bch_fs *c) -+{ -+ struct replicas_status s; -+ struct bch_sb_field_members *mi; -+ struct bch_dev *ca; -+ unsigned i, flags = c->opts.degraded -+ ? 
BCH_FORCE_IF_DEGRADED -+ : 0; -+ -+ if (!c->opts.degraded) { -+ mutex_lock(&c->sb_lock); -+ mi = bch2_sb_get_members(c->disk_sb.sb); -+ -+ for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { -+ if (!bch2_dev_exists(c->disk_sb.sb, mi, i)) -+ continue; -+ -+ ca = bch_dev_locked(c, i); -+ -+ if (!bch2_dev_is_online(ca) && -+ (ca->mi.state == BCH_MEMBER_STATE_RW || -+ ca->mi.state == BCH_MEMBER_STATE_RO)) { -+ mutex_unlock(&c->sb_lock); -+ return false; -+ } -+ } -+ mutex_unlock(&c->sb_lock); -+ } -+ -+ s = bch2_replicas_status(c); -+ -+ return bch2_have_enough_devs(s, flags); -+} -+ -+static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) -+{ -+ /* -+ * Device going read only means the copygc reserve get smaller, so we -+ * don't want that happening while copygc is in progress: -+ */ -+ bch2_copygc_stop(c); -+ -+ /* -+ * The allocator thread itself allocates btree nodes, so stop it first: -+ */ -+ bch2_dev_allocator_stop(ca); -+ bch2_dev_allocator_remove(c, ca); -+ bch2_dev_journal_stop(&c->journal, ca); -+ -+ bch2_copygc_start(c); -+} -+ -+static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) -+{ -+ lockdep_assert_held(&c->state_lock); -+ -+ BUG_ON(ca->mi.state != BCH_MEMBER_STATE_RW); -+ -+ bch2_dev_allocator_add(c, ca); -+ bch2_recalc_capacity(c); -+ -+ if (bch2_dev_allocator_start(ca)) -+ return "error starting allocator thread"; -+ -+ return NULL; -+} -+ -+int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, -+ enum bch_member_state new_state, int flags) -+{ -+ struct bch_sb_field_members *mi; -+ int ret = 0; -+ -+ if (ca->mi.state == new_state) -+ return 0; -+ -+ if (!bch2_dev_state_allowed(c, ca, new_state, flags)) -+ return -EINVAL; -+ -+ if (new_state != BCH_MEMBER_STATE_RW) -+ __bch2_dev_read_only(c, ca); -+ -+ bch_notice(ca, "%s", bch2_dev_state[new_state]); -+ -+ mutex_lock(&c->sb_lock); -+ mi = bch2_sb_get_members(c->disk_sb.sb); -+ SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx], new_state); -+ 
bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ if (new_state == BCH_MEMBER_STATE_RW && -+ __bch2_dev_read_write(c, ca)) -+ ret = -ENOMEM; -+ -+ rebalance_wakeup(c); -+ -+ return ret; -+} -+ -+int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, -+ enum bch_member_state new_state, int flags) -+{ -+ int ret; -+ -+ down_write(&c->state_lock); -+ ret = __bch2_dev_set_state(c, ca, new_state, flags); -+ up_write(&c->state_lock); -+ -+ return ret; -+} -+ -+/* Device add/removal: */ -+ -+int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) -+{ -+ struct btree_trans trans; -+ size_t i; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for (i = 0; i < ca->mi.nbuckets; i++) { -+ ret = bch2_btree_key_cache_flush(&trans, -+ BTREE_ID_ALLOC, POS(ca->dev_idx, i)); -+ if (ret) -+ break; -+ } -+ bch2_trans_exit(&trans); -+ -+ if (ret) -+ return ret; -+ -+ return bch2_btree_delete_range(c, BTREE_ID_ALLOC, -+ POS(ca->dev_idx, 0), -+ POS(ca->dev_idx + 1, 0), -+ NULL); -+} -+ -+int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) -+{ -+ struct bch_sb_field_members *mi; -+ unsigned dev_idx = ca->dev_idx, data; -+ int ret = -EINVAL; -+ -+ down_write(&c->state_lock); -+ -+ /* -+ * We consume a reference to ca->ref, regardless of whether we succeed -+ * or fail: -+ */ -+ percpu_ref_put(&ca->ref); -+ -+ if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) { -+ bch_err(ca, "Cannot remove without losing data"); -+ goto err; -+ } -+ -+ __bch2_dev_read_only(c, ca); -+ -+ ret = bch2_dev_data_drop(c, ca->dev_idx, flags); -+ if (ret) { -+ bch_err(ca, "Remove failed: error %i dropping data", ret); -+ goto err; -+ } -+ -+ ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); -+ if (ret) { -+ bch_err(ca, "Remove failed: error %i flushing journal", ret); -+ goto err; -+ } -+ -+ ret = bch2_dev_remove_alloc(c, ca); -+ if (ret) { -+ bch_err(ca, "Remove failed, error deleting alloc info"); -+ goto err; -+ } -+ -+ /* -+ * must 
flush all existing journal entries, they might have -+ * (overwritten) keys that point to the device we're removing: -+ */ -+ bch2_journal_flush_all_pins(&c->journal); -+ /* -+ * hack to ensure bch2_replicas_gc2() clears out entries to this device -+ */ -+ bch2_journal_meta(&c->journal); -+ ret = bch2_journal_error(&c->journal); -+ if (ret) { -+ bch_err(ca, "Remove failed, journal error"); -+ goto err; -+ } -+ -+ ret = bch2_replicas_gc2(c); -+ if (ret) { -+ bch_err(ca, "Remove failed: error %i from replicas gc", ret); -+ goto err; -+ } -+ -+ data = bch2_dev_has_data(c, ca); -+ if (data) { -+ char data_has_str[100]; -+ -+ bch2_flags_to_text(&PBUF(data_has_str), -+ bch2_data_types, data); -+ bch_err(ca, "Remove failed, still has data (%s)", data_has_str); -+ ret = -EBUSY; -+ goto err; -+ } -+ -+ __bch2_dev_offline(c, ca); -+ -+ mutex_lock(&c->sb_lock); -+ rcu_assign_pointer(c->devs[ca->dev_idx], NULL); -+ mutex_unlock(&c->sb_lock); -+ -+ percpu_ref_kill(&ca->ref); -+ wait_for_completion(&ca->ref_completion); -+ -+ bch2_dev_free(ca); -+ -+ /* -+ * Free this device's slot in the bch_member array - all pointers to -+ * this device must be gone: -+ */ -+ mutex_lock(&c->sb_lock); -+ mi = bch2_sb_get_members(c->disk_sb.sb); -+ memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid)); -+ -+ bch2_write_super(c); -+ -+ mutex_unlock(&c->sb_lock); -+ up_write(&c->state_lock); -+ return 0; -+err: -+ if (ca->mi.state == BCH_MEMBER_STATE_RW && -+ !percpu_ref_is_zero(&ca->io_ref)) -+ __bch2_dev_read_write(c, ca); -+ up_write(&c->state_lock); -+ return ret; -+} -+ -+static void dev_usage_clear(struct bch_dev *ca) -+{ -+ struct bucket_array *buckets; -+ -+ percpu_memset(ca->usage[0], 0, sizeof(*ca->usage[0])); -+ -+ down_read(&ca->bucket_lock); -+ buckets = bucket_array(ca); -+ -+ memset(buckets->b, 0, sizeof(buckets->b[0]) * buckets->nbuckets); -+ up_read(&ca->bucket_lock); -+} -+ -+/* Add new device to running filesystem: */ -+int bch2_dev_add(struct bch_fs *c, 
const char *path) -+{ -+ struct bch_opts opts = bch2_opts_empty(); -+ struct bch_sb_handle sb; -+ const char *err; -+ struct bch_dev *ca = NULL; -+ struct bch_sb_field_members *mi; -+ struct bch_member dev_mi; -+ unsigned dev_idx, nr_devices, u64s; -+ int ret; -+ -+ ret = bch2_read_super(path, &opts, &sb); -+ if (ret) -+ return ret; -+ -+ err = bch2_sb_validate(&sb); -+ if (err) -+ return -EINVAL; -+ -+ dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx]; -+ -+ err = bch2_dev_may_add(sb.sb, c); -+ if (err) -+ return -EINVAL; -+ -+ ca = __bch2_dev_alloc(c, &dev_mi); -+ if (!ca) { -+ bch2_free_super(&sb); -+ return -ENOMEM; -+ } -+ -+ ret = __bch2_dev_attach_bdev(ca, &sb); -+ if (ret) { -+ bch2_dev_free(ca); -+ return ret; -+ } -+ -+ /* -+ * We want to allocate journal on the new device before adding the new -+ * device to the filesystem because allocating after we attach requires -+ * spinning up the allocator thread, and the allocator thread requires -+ * doing btree writes, which if the existing devices are RO isn't going -+ * to work -+ * -+ * So we have to mark where the superblocks are, but marking allocated -+ * data normally updates the filesystem usage too, so we have to mark, -+ * allocate the journal, reset all the marks, then remark after we -+ * attach... 
-+ */ -+ bch2_mark_dev_superblock(ca->fs, ca, 0); -+ -+ err = "journal alloc failed"; -+ ret = bch2_dev_journal_alloc(ca); -+ if (ret) -+ goto err; -+ -+ dev_usage_clear(ca); -+ -+ down_write(&c->state_lock); -+ mutex_lock(&c->sb_lock); -+ -+ err = "insufficient space in new superblock"; -+ ret = bch2_sb_from_fs(c, ca); -+ if (ret) -+ goto err_unlock; -+ -+ mi = bch2_sb_get_members(ca->disk_sb.sb); -+ -+ if (!bch2_sb_resize_members(&ca->disk_sb, -+ le32_to_cpu(mi->field.u64s) + -+ sizeof(dev_mi) / sizeof(u64))) { -+ ret = -ENOSPC; -+ goto err_unlock; -+ } -+ -+ if (dynamic_fault("bcachefs:add:no_slot")) -+ goto no_slot; -+ -+ mi = bch2_sb_get_members(c->disk_sb.sb); -+ for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) -+ if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx)) -+ goto have_slot; -+no_slot: -+ err = "no slots available in superblock"; -+ ret = -ENOSPC; -+ goto err_unlock; -+ -+have_slot: -+ nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices); -+ u64s = (sizeof(struct bch_sb_field_members) + -+ sizeof(struct bch_member) * nr_devices) / sizeof(u64); -+ -+ err = "no space in superblock for member info"; -+ ret = -ENOSPC; -+ -+ mi = bch2_sb_resize_members(&c->disk_sb, u64s); -+ if (!mi) -+ goto err_unlock; -+ -+ /* success: */ -+ -+ mi->members[dev_idx] = dev_mi; -+ mi->members[dev_idx].last_mount = cpu_to_le64(ktime_get_real_seconds()); -+ c->disk_sb.sb->nr_devices = nr_devices; -+ -+ ca->disk_sb.sb->dev_idx = dev_idx; -+ bch2_dev_attach(c, ca, dev_idx); -+ -+ bch2_mark_dev_superblock(c, ca, 0); -+ -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ err = "alloc write failed"; -+ ret = bch2_dev_alloc_write(c, ca, 0); -+ if (ret) -+ goto err; -+ -+ if (ca->mi.state == BCH_MEMBER_STATE_RW) { -+ err = __bch2_dev_read_write(c, ca); -+ if (err) -+ goto err_late; -+ } -+ -+ up_write(&c->state_lock); -+ return 0; -+ -+err_unlock: -+ mutex_unlock(&c->sb_lock); -+ up_write(&c->state_lock); -+err: -+ if (ca) -+ bch2_dev_free(ca); -+ 
bch2_free_super(&sb); -+ bch_err(c, "Unable to add device: %s", err); -+ return ret; -+err_late: -+ bch_err(c, "Error going rw after adding device: %s", err); -+ return -EINVAL; -+} -+ -+/* Hot add existing device to running filesystem: */ -+int bch2_dev_online(struct bch_fs *c, const char *path) -+{ -+ struct bch_opts opts = bch2_opts_empty(); -+ struct bch_sb_handle sb = { NULL }; -+ struct bch_sb_field_members *mi; -+ struct bch_dev *ca; -+ unsigned dev_idx; -+ const char *err; -+ int ret; -+ -+ down_write(&c->state_lock); -+ -+ ret = bch2_read_super(path, &opts, &sb); -+ if (ret) { -+ up_write(&c->state_lock); -+ return ret; -+ } -+ -+ dev_idx = sb.sb->dev_idx; -+ -+ err = bch2_dev_in_fs(c->disk_sb.sb, sb.sb); -+ if (err) -+ goto err; -+ -+ if (bch2_dev_attach_bdev(c, &sb)) { -+ err = "bch2_dev_attach_bdev() error"; -+ goto err; -+ } -+ -+ ca = bch_dev_locked(c, dev_idx); -+ if (ca->mi.state == BCH_MEMBER_STATE_RW) { -+ err = __bch2_dev_read_write(c, ca); -+ if (err) -+ goto err; -+ } -+ -+ mutex_lock(&c->sb_lock); -+ mi = bch2_sb_get_members(c->disk_sb.sb); -+ -+ mi->members[ca->dev_idx].last_mount = -+ cpu_to_le64(ktime_get_real_seconds()); -+ -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ up_write(&c->state_lock); -+ return 0; -+err: -+ up_write(&c->state_lock); -+ bch2_free_super(&sb); -+ bch_err(c, "error bringing %s online: %s", path, err); -+ return -EINVAL; -+} -+ -+int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) -+{ -+ down_write(&c->state_lock); -+ -+ if (!bch2_dev_is_online(ca)) { -+ bch_err(ca, "Already offline"); -+ up_write(&c->state_lock); -+ return 0; -+ } -+ -+ if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) { -+ bch_err(ca, "Cannot offline required disk"); -+ up_write(&c->state_lock); -+ return -EINVAL; -+ } -+ -+ __bch2_dev_offline(c, ca); -+ -+ up_write(&c->state_lock); -+ return 0; -+} -+ -+int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) -+{ -+ struct bch_member 
*mi; -+ int ret = 0; -+ -+ down_write(&c->state_lock); -+ -+ if (nbuckets < ca->mi.nbuckets) { -+ bch_err(ca, "Cannot shrink yet"); -+ ret = -EINVAL; -+ goto err; -+ } -+ -+ if (bch2_dev_is_online(ca) && -+ get_capacity(ca->disk_sb.bdev->bd_disk) < -+ ca->mi.bucket_size * nbuckets) { -+ bch_err(ca, "New size larger than device"); -+ ret = -EINVAL; -+ goto err; -+ } -+ -+ ret = bch2_dev_buckets_resize(c, ca, nbuckets); -+ if (ret) { -+ bch_err(ca, "Resize error: %i", ret); -+ goto err; -+ } -+ -+ mutex_lock(&c->sb_lock); -+ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; -+ mi->nbuckets = cpu_to_le64(nbuckets); -+ -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ -+ bch2_recalc_capacity(c); -+err: -+ up_write(&c->state_lock); -+ return ret; -+} -+ -+/* return with ref on ca->ref: */ -+struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *path) -+{ -+ struct block_device *bdev = lookup_bdev(path); -+ struct bch_dev *ca; -+ unsigned i; -+ -+ if (IS_ERR(bdev)) -+ return ERR_CAST(bdev); -+ -+ for_each_member_device(ca, c, i) -+ if (ca->disk_sb.bdev == bdev) -+ goto found; -+ -+ ca = ERR_PTR(-ENOENT); -+found: -+ bdput(bdev); -+ return ca; -+} -+ -+/* Filesystem open: */ -+ -+struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, -+ struct bch_opts opts) -+{ -+ struct bch_sb_handle *sb = NULL; -+ struct bch_fs *c = NULL; -+ struct bch_sb_field_members *mi; -+ unsigned i, best_sb = 0; -+ const char *err; -+ int ret = -ENOMEM; -+ -+ pr_verbose_init(opts, ""); -+ -+ if (!nr_devices) { -+ c = ERR_PTR(-EINVAL); -+ goto out2; -+ } -+ -+ if (!try_module_get(THIS_MODULE)) { -+ c = ERR_PTR(-ENODEV); -+ goto out2; -+ } -+ -+ sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL); -+ if (!sb) -+ goto err; -+ -+ for (i = 0; i < nr_devices; i++) { -+ ret = bch2_read_super(devices[i], &opts, &sb[i]); -+ if (ret) -+ goto err; -+ -+ err = bch2_sb_validate(&sb[i]); -+ if (err) -+ goto err_print; -+ } -+ -+ for (i = 1; i < nr_devices; i++) -+ if 
(le64_to_cpu(sb[i].sb->seq) > -+ le64_to_cpu(sb[best_sb].sb->seq)) -+ best_sb = i; -+ -+ mi = bch2_sb_get_members(sb[best_sb].sb); -+ -+ i = 0; -+ while (i < nr_devices) { -+ if (i != best_sb && -+ !bch2_dev_exists(sb[best_sb].sb, mi, sb[i].sb->dev_idx)) { -+ char buf[BDEVNAME_SIZE]; -+ pr_info("%s has been removed, skipping", -+ bdevname(sb[i].bdev, buf)); -+ bch2_free_super(&sb[i]); -+ array_remove_item(sb, nr_devices, i); -+ continue; -+ } -+ -+ err = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb); -+ if (err) -+ goto err_print; -+ i++; -+ } -+ -+ ret = -ENOMEM; -+ c = bch2_fs_alloc(sb[best_sb].sb, opts); -+ if (!c) -+ goto err; -+ -+ err = "bch2_dev_online() error"; -+ down_write(&c->state_lock); -+ for (i = 0; i < nr_devices; i++) -+ if (bch2_dev_attach_bdev(c, &sb[i])) { -+ up_write(&c->state_lock); -+ goto err_print; -+ } -+ up_write(&c->state_lock); -+ -+ err = "insufficient devices"; -+ if (!bch2_fs_may_start(c)) -+ goto err_print; -+ -+ if (!c->opts.nostart) { -+ ret = bch2_fs_start(c); -+ if (ret) -+ goto err; -+ } -+out: -+ kfree(sb); -+ module_put(THIS_MODULE); -+out2: -+ pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c)); -+ return c; -+err_print: -+ pr_err("bch_fs_open err opening %s: %s", -+ devices[0], err); -+ ret = -EINVAL; -+err: -+ if (c) -+ bch2_fs_stop(c); -+ for (i = 0; i < nr_devices; i++) -+ bch2_free_super(&sb[i]); -+ c = ERR_PTR(ret); -+ goto out; -+} -+ -+static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb, -+ struct bch_opts opts) -+{ -+ const char *err; -+ struct bch_fs *c; -+ bool allocated_fs = false; -+ int ret; -+ -+ err = bch2_sb_validate(sb); -+ if (err) -+ return err; -+ -+ mutex_lock(&bch_fs_list_lock); -+ c = __bch2_uuid_to_fs(sb->sb->uuid); -+ if (c) { -+ closure_get(&c->cl); -+ -+ err = bch2_dev_in_fs(c->disk_sb.sb, sb->sb); -+ if (err) -+ goto err; -+ } else { -+ c = bch2_fs_alloc(sb->sb, opts); -+ err = "cannot allocate memory"; -+ if (!c) -+ goto err; -+ -+ allocated_fs = true; -+ } -+ -+ err = 
"bch2_dev_online() error"; -+ -+ mutex_lock(&c->sb_lock); -+ if (bch2_dev_attach_bdev(c, sb)) { -+ mutex_unlock(&c->sb_lock); -+ goto err; -+ } -+ mutex_unlock(&c->sb_lock); -+ -+ if (!c->opts.nostart && bch2_fs_may_start(c)) { -+ err = "error starting filesystem"; -+ ret = bch2_fs_start(c); -+ if (ret) -+ goto err; -+ } -+ -+ closure_put(&c->cl); -+ mutex_unlock(&bch_fs_list_lock); -+ -+ return NULL; -+err: -+ mutex_unlock(&bch_fs_list_lock); -+ -+ if (allocated_fs) -+ bch2_fs_stop(c); -+ else if (c) -+ closure_put(&c->cl); -+ -+ return err; -+} -+ -+const char *bch2_fs_open_incremental(const char *path) -+{ -+ struct bch_sb_handle sb; -+ struct bch_opts opts = bch2_opts_empty(); -+ const char *err; -+ -+ if (bch2_read_super(path, &opts, &sb)) -+ return "error reading superblock"; -+ -+ err = __bch2_fs_open_incremental(&sb, opts); -+ bch2_free_super(&sb); -+ -+ return err; -+} -+ -+/* Global interfaces/init */ -+ -+static void bcachefs_exit(void) -+{ -+ bch2_debug_exit(); -+ bch2_vfs_exit(); -+ bch2_chardev_exit(); -+ if (bcachefs_kset) -+ kset_unregister(bcachefs_kset); -+} -+ -+static int __init bcachefs_init(void) -+{ -+ bch2_bkey_pack_test(); -+ bch2_inode_pack_test(); -+ -+ if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) || -+ bch2_chardev_init() || -+ bch2_vfs_init() || -+ bch2_debug_init()) -+ goto err; -+ -+ return 0; -+err: -+ bcachefs_exit(); -+ return -ENOMEM; -+} -+ -+#define BCH_DEBUG_PARAM(name, description) \ -+ bool bch2_##name; \ -+ module_param_named(name, bch2_##name, bool, 0644); \ -+ MODULE_PARM_DESC(name, description); -+BCH_DEBUG_PARAMS() -+#undef BCH_DEBUG_PARAM -+ -+module_exit(bcachefs_exit); -+module_init(bcachefs_init); -diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h -new file mode 100644 -index 000000000000..02c81f3555c3 ---- /dev/null -+++ b/fs/bcachefs/super.h -@@ -0,0 +1,241 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_SUPER_H -+#define _BCACHEFS_SUPER_H -+ -+#include "extents.h" -+ 
-+#include "bcachefs_ioctl.h" -+ -+#include -+ -+static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s) -+{ -+ return div_u64(s, ca->mi.bucket_size); -+} -+ -+static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b) -+{ -+ return ((sector_t) b) * ca->mi.bucket_size; -+} -+ -+static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s) -+{ -+ u32 remainder; -+ -+ div_u64_rem(s, ca->mi.bucket_size, &remainder); -+ return remainder; -+} -+ -+static inline bool bch2_dev_is_online(struct bch_dev *ca) -+{ -+ return !percpu_ref_is_zero(&ca->io_ref); -+} -+ -+static inline bool bch2_dev_is_readable(struct bch_dev *ca) -+{ -+ return bch2_dev_is_online(ca) && -+ ca->mi.state != BCH_MEMBER_STATE_FAILED; -+} -+ -+static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw) -+{ -+ if (!percpu_ref_tryget(&ca->io_ref)) -+ return false; -+ -+ if (ca->mi.state == BCH_MEMBER_STATE_RW || -+ (ca->mi.state == BCH_MEMBER_STATE_RO && rw == READ)) -+ return true; -+ -+ percpu_ref_put(&ca->io_ref); -+ return false; -+} -+ -+static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs) -+{ -+ return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX); -+} -+ -+static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs, -+ unsigned dev) -+{ -+ unsigned i; -+ -+ for (i = 0; i < devs.nr; i++) -+ if (devs.devs[i] == dev) -+ return true; -+ -+ return false; -+} -+ -+static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs, -+ unsigned dev) -+{ -+ unsigned i; -+ -+ for (i = 0; i < devs->nr; i++) -+ if (devs->devs[i] == dev) { -+ array_remove_item(devs->devs, devs->nr, i); -+ return; -+ } -+} -+ -+static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs, -+ unsigned dev) -+{ -+ BUG_ON(bch2_dev_list_has_dev(*devs, dev)); -+ BUG_ON(devs->nr >= BCH_REPLICAS_MAX); -+ devs->devs[devs->nr++] = dev; -+} -+ -+static inline struct bch_devs_list bch2_dev_list_single(unsigned dev) -+{ -+ return (struct bch_devs_list) { 
.nr = 1, .devs[0] = dev }; -+} -+ -+static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter, -+ const struct bch_devs_mask *mask) -+{ -+ struct bch_dev *ca = NULL; -+ -+ while ((*iter = mask -+ ? find_next_bit(mask->d, c->sb.nr_devices, *iter) -+ : *iter) < c->sb.nr_devices && -+ !(ca = rcu_dereference_check(c->devs[*iter], -+ lockdep_is_held(&c->state_lock)))) -+ (*iter)++; -+ -+ return ca; -+} -+ -+#define __for_each_member_device(ca, c, iter, mask) \ -+ for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++) -+ -+#define for_each_member_device_rcu(ca, c, iter, mask) \ -+ __for_each_member_device(ca, c, iter, mask) -+ -+static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter) -+{ -+ struct bch_dev *ca; -+ -+ rcu_read_lock(); -+ if ((ca = __bch2_next_dev(c, iter, NULL))) -+ percpu_ref_get(&ca->ref); -+ rcu_read_unlock(); -+ -+ return ca; -+} -+ -+/* -+ * If you break early, you must drop your ref on the current device -+ */ -+#define for_each_member_device(ca, c, iter) \ -+ for ((iter) = 0; \ -+ (ca = bch2_get_next_dev(c, &(iter))); \ -+ percpu_ref_put(&ca->ref), (iter)++) -+ -+static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, -+ unsigned *iter, -+ int state_mask) -+{ -+ struct bch_dev *ca; -+ -+ rcu_read_lock(); -+ while ((ca = __bch2_next_dev(c, iter, NULL)) && -+ (!((1 << ca->mi.state) & state_mask) || -+ !percpu_ref_tryget(&ca->io_ref))) -+ (*iter)++; -+ rcu_read_unlock(); -+ -+ return ca; -+} -+ -+#define __for_each_online_member(ca, c, iter, state_mask) \ -+ for ((iter) = 0; \ -+ (ca = bch2_get_next_online_dev(c, &(iter), state_mask)); \ -+ percpu_ref_put(&ca->io_ref), (iter)++) -+ -+#define for_each_online_member(ca, c, iter) \ -+ __for_each_online_member(ca, c, iter, ~0) -+ -+#define for_each_rw_member(ca, c, iter) \ -+ __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_RW) -+ -+#define for_each_readable_member(ca, c, iter) \ -+ __for_each_online_member(ca, c, 
iter, \ -+ (1 << BCH_MEMBER_STATE_RW)|(1 << BCH_MEMBER_STATE_RO)) -+ -+/* -+ * If a key exists that references a device, the device won't be going away and -+ * we can omit rcu_read_lock(): -+ */ -+static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx) -+{ -+ EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); -+ -+ return rcu_dereference_check(c->devs[idx], 1); -+} -+ -+static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx) -+{ -+ EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); -+ -+ return rcu_dereference_protected(c->devs[idx], -+ lockdep_is_held(&c->sb_lock) || -+ lockdep_is_held(&c->state_lock)); -+} -+ -+/* XXX kill, move to struct bch_fs */ -+static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) -+{ -+ struct bch_devs_mask devs; -+ struct bch_dev *ca; -+ unsigned i; -+ -+ memset(&devs, 0, sizeof(devs)); -+ for_each_online_member(ca, c, i) -+ __set_bit(ca->dev_idx, devs.d); -+ return devs; -+} -+ -+struct bch_fs *bch2_bdev_to_fs(struct block_device *); -+struct bch_fs *bch2_uuid_to_fs(uuid_le); -+ -+bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *, -+ enum bch_member_state, int); -+int __bch2_dev_set_state(struct bch_fs *, struct bch_dev *, -+ enum bch_member_state, int); -+int bch2_dev_set_state(struct bch_fs *, struct bch_dev *, -+ enum bch_member_state, int); -+ -+int bch2_dev_fail(struct bch_dev *, int); -+int bch2_dev_remove(struct bch_fs *, struct bch_dev *, int); -+int bch2_dev_add(struct bch_fs *, const char *); -+int bch2_dev_online(struct bch_fs *, const char *); -+int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int); -+int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64); -+struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *); -+ -+bool bch2_fs_emergency_read_only(struct bch_fs *); -+void bch2_fs_read_only(struct bch_fs *); -+ -+int bch2_fs_read_write(struct bch_fs *); -+int bch2_fs_read_write_early(struct bch_fs *); -+ -+/* -+ * Only for 
use in the recovery/fsck path: -+ */ -+static inline void bch2_fs_lazy_rw(struct bch_fs *c) -+{ -+ if (percpu_ref_is_zero(&c->writes)) -+ bch2_fs_read_write_early(c); -+} -+ -+void __bch2_fs_stop(struct bch_fs *); -+void bch2_fs_free(struct bch_fs *); -+void bch2_fs_stop(struct bch_fs *); -+ -+int bch2_fs_start(struct bch_fs *); -+struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts); -+const char *bch2_fs_open_incremental(const char *path); -+ -+#endif /* _BCACHEFS_SUPER_H */ -diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h -new file mode 100644 -index 000000000000..20406ebd6f5b ---- /dev/null -+++ b/fs/bcachefs/super_types.h -@@ -0,0 +1,51 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_SUPER_TYPES_H -+#define _BCACHEFS_SUPER_TYPES_H -+ -+struct bch_sb_handle { -+ struct bch_sb *sb; -+ struct block_device *bdev; -+ struct bio *bio; -+ unsigned page_order; -+ fmode_t mode; -+ unsigned have_layout:1; -+ unsigned have_bio:1; -+ unsigned fs_sb:1; -+ u64 seq; -+}; -+ -+struct bch_devs_mask { -+ unsigned long d[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)]; -+}; -+ -+struct bch_devs_list { -+ u8 nr; -+ u8 devs[BCH_REPLICAS_MAX + 1]; -+}; -+ -+struct bch_member_cpu { -+ u64 nbuckets; /* device size */ -+ u16 first_bucket; /* index of first bucket used */ -+ u16 bucket_size; /* sectors */ -+ u16 group; -+ u8 state; -+ u8 replacement; -+ u8 discard; -+ u8 data_allowed; -+ u8 durability; -+ u8 valid; -+}; -+ -+struct bch_disk_group_cpu { -+ bool deleted; -+ u16 parent; -+ struct bch_devs_mask devs; -+}; -+ -+struct bch_disk_groups_cpu { -+ struct rcu_head rcu; -+ unsigned nr; -+ struct bch_disk_group_cpu entries[]; -+}; -+ -+#endif /* _BCACHEFS_SUPER_TYPES_H */ -diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c -new file mode 100644 -index 000000000000..0cb29f43d99d ---- /dev/null -+++ b/fs/bcachefs/sysfs.c -@@ -0,0 +1,1074 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * bcache sysfs interfaces -+ * -+ * Copyright 2010, 2011 
Kent Overstreet -+ * Copyright 2012 Google, Inc. -+ */ -+ -+#ifndef NO_BCACHEFS_SYSFS -+ -+#include "bcachefs.h" -+#include "alloc_background.h" -+#include "sysfs.h" -+#include "btree_cache.h" -+#include "btree_io.h" -+#include "btree_iter.h" -+#include "btree_key_cache.h" -+#include "btree_update.h" -+#include "btree_update_interior.h" -+#include "btree_gc.h" -+#include "buckets.h" -+#include "clock.h" -+#include "disk_groups.h" -+#include "ec.h" -+#include "inode.h" -+#include "journal.h" -+#include "keylist.h" -+#include "move.h" -+#include "opts.h" -+#include "rebalance.h" -+#include "replicas.h" -+#include "super-io.h" -+#include "tests.h" -+ -+#include -+#include -+#include -+ -+#include "util.h" -+ -+#define SYSFS_OPS(type) \ -+struct sysfs_ops type ## _sysfs_ops = { \ -+ .show = type ## _show, \ -+ .store = type ## _store \ -+} -+ -+#define SHOW(fn) \ -+static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\ -+ char *buf) \ -+ -+#define STORE(fn) \ -+static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\ -+ const char *buf, size_t size) \ -+ -+#define __sysfs_attribute(_name, _mode) \ -+ static struct attribute sysfs_##_name = \ -+ { .name = #_name, .mode = _mode } -+ -+#define write_attribute(n) __sysfs_attribute(n, S_IWUSR) -+#define read_attribute(n) __sysfs_attribute(n, S_IRUGO) -+#define rw_attribute(n) __sysfs_attribute(n, S_IRUGO|S_IWUSR) -+ -+#define sysfs_printf(file, fmt, ...) 
\ -+do { \ -+ if (attr == &sysfs_ ## file) \ -+ return scnprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__);\ -+} while (0) -+ -+#define sysfs_print(file, var) \ -+do { \ -+ if (attr == &sysfs_ ## file) \ -+ return snprint(buf, PAGE_SIZE, var); \ -+} while (0) -+ -+#define sysfs_hprint(file, val) \ -+do { \ -+ if (attr == &sysfs_ ## file) { \ -+ bch2_hprint(&out, val); \ -+ pr_buf(&out, "\n"); \ -+ return out.pos - buf; \ -+ } \ -+} while (0) -+ -+#define var_printf(_var, fmt) sysfs_printf(_var, fmt, var(_var)) -+#define var_print(_var) sysfs_print(_var, var(_var)) -+#define var_hprint(_var) sysfs_hprint(_var, var(_var)) -+ -+#define sysfs_strtoul(file, var) \ -+do { \ -+ if (attr == &sysfs_ ## file) \ -+ return strtoul_safe(buf, var) ?: (ssize_t) size; \ -+} while (0) -+ -+#define sysfs_strtoul_clamp(file, var, min, max) \ -+do { \ -+ if (attr == &sysfs_ ## file) \ -+ return strtoul_safe_clamp(buf, var, min, max) \ -+ ?: (ssize_t) size; \ -+} while (0) -+ -+#define strtoul_or_return(cp) \ -+({ \ -+ unsigned long _v; \ -+ int _r = kstrtoul(cp, 10, &_v); \ -+ if (_r) \ -+ return _r; \ -+ _v; \ -+}) -+ -+#define strtoul_restrict_or_return(cp, min, max) \ -+({ \ -+ unsigned long __v = 0; \ -+ int _r = strtoul_safe_restrict(cp, __v, min, max); \ -+ if (_r) \ -+ return _r; \ -+ __v; \ -+}) -+ -+#define strtoi_h_or_return(cp) \ -+({ \ -+ u64 _v; \ -+ int _r = strtoi_h(cp, &_v); \ -+ if (_r) \ -+ return _r; \ -+ _v; \ -+}) -+ -+#define sysfs_hatoi(file, var) \ -+do { \ -+ if (attr == &sysfs_ ## file) \ -+ return strtoi_h(buf, &var) ?: (ssize_t) size; \ -+} while (0) -+ -+write_attribute(trigger_journal_flush); -+write_attribute(trigger_btree_coalesce); -+write_attribute(trigger_gc); -+write_attribute(prune_cache); -+rw_attribute(btree_gc_periodic); -+ -+read_attribute(uuid); -+read_attribute(minor); -+read_attribute(bucket_size); -+read_attribute(block_size); -+read_attribute(btree_node_size); -+read_attribute(first_bucket); -+read_attribute(nbuckets); 
-+read_attribute(durability); -+read_attribute(iodone); -+ -+read_attribute(io_latency_read); -+read_attribute(io_latency_write); -+read_attribute(io_latency_stats_read); -+read_attribute(io_latency_stats_write); -+read_attribute(congested); -+ -+read_attribute(bucket_quantiles_last_read); -+read_attribute(bucket_quantiles_last_write); -+read_attribute(bucket_quantiles_fragmentation); -+read_attribute(bucket_quantiles_oldest_gen); -+ -+read_attribute(reserve_stats); -+read_attribute(btree_cache_size); -+read_attribute(compression_stats); -+read_attribute(journal_debug); -+read_attribute(journal_pins); -+read_attribute(btree_updates); -+read_attribute(dirty_btree_nodes); -+read_attribute(btree_key_cache); -+read_attribute(btree_transactions); -+read_attribute(stripes_heap); -+ -+read_attribute(internal_uuid); -+ -+read_attribute(has_data); -+read_attribute(alloc_debug); -+write_attribute(wake_allocator); -+ -+read_attribute(read_realloc_races); -+read_attribute(extent_migrate_done); -+read_attribute(extent_migrate_raced); -+ -+rw_attribute(journal_write_delay_ms); -+rw_attribute(journal_reclaim_delay_ms); -+ -+rw_attribute(discard); -+rw_attribute(cache_replacement_policy); -+rw_attribute(label); -+ -+rw_attribute(copy_gc_enabled); -+sysfs_pd_controller_attribute(copy_gc); -+ -+rw_attribute(rebalance_enabled); -+sysfs_pd_controller_attribute(rebalance); -+read_attribute(rebalance_work); -+rw_attribute(promote_whole_extents); -+ -+read_attribute(new_stripes); -+ -+rw_attribute(pd_controllers_update_seconds); -+ -+read_attribute(meta_replicas_have); -+read_attribute(data_replicas_have); -+ -+read_attribute(io_timers_read); -+read_attribute(io_timers_write); -+ -+#ifdef CONFIG_BCACHEFS_TESTS -+write_attribute(perf_test); -+#endif /* CONFIG_BCACHEFS_TESTS */ -+ -+#define BCH_DEBUG_PARAM(name, description) \ -+ rw_attribute(name); -+ -+ BCH_DEBUG_PARAMS() -+#undef BCH_DEBUG_PARAM -+ -+#define x(_name) \ -+ static struct attribute sysfs_time_stat_##_name = \ -+ { .name = 
#_name, .mode = S_IRUGO }; -+ BCH_TIME_STATS() -+#undef x -+ -+static struct attribute sysfs_state_rw = { -+ .name = "state", -+ .mode = S_IRUGO -+}; -+ -+static size_t bch2_btree_cache_size(struct bch_fs *c) -+{ -+ size_t ret = 0; -+ struct btree *b; -+ -+ mutex_lock(&c->btree_cache.lock); -+ list_for_each_entry(b, &c->btree_cache.live, list) -+ ret += btree_bytes(c); -+ -+ mutex_unlock(&c->btree_cache.lock); -+ return ret; -+} -+ -+static int fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+ struct bch_fs_usage *fs_usage = bch2_fs_usage_read(c); -+ -+ if (!fs_usage) -+ return -ENOMEM; -+ -+ bch2_fs_usage_to_text(out, c, fs_usage); -+ -+ percpu_up_read(&c->mark_lock); -+ -+ kfree(fs_usage); -+ return 0; -+} -+ -+static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ u64 nr_uncompressed_extents = 0, uncompressed_sectors = 0, -+ nr_compressed_extents = 0, -+ compressed_sectors_compressed = 0, -+ compressed_sectors_uncompressed = 0; -+ int ret; -+ -+ if (!test_bit(BCH_FS_STARTED, &c->flags)) -+ return -EPERM; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, 0, k, ret) -+ if (k.k->type == KEY_TYPE_extent) { -+ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); -+ const union bch_extent_entry *entry; -+ struct extent_ptr_decoded p; -+ -+ extent_for_each_ptr_decode(e, p, entry) { -+ if (!crc_is_compressed(p.crc)) { -+ nr_uncompressed_extents++; -+ uncompressed_sectors += e.k->size; -+ } else { -+ nr_compressed_extents++; -+ compressed_sectors_compressed += -+ p.crc.compressed_size; -+ compressed_sectors_uncompressed += -+ p.crc.uncompressed_size; -+ } -+ -+ /* only looking at the first ptr */ -+ break; -+ } -+ } -+ -+ ret = bch2_trans_exit(&trans) ?: ret; -+ if (ret) -+ return ret; -+ -+ pr_buf(out, -+ "uncompressed data:\n" -+ " nr extents: %llu\n" -+ " size (bytes): %llu\n" -+ 
"compressed data:\n" -+ " nr extents: %llu\n" -+ " compressed size (bytes): %llu\n" -+ " uncompressed size (bytes): %llu\n", -+ nr_uncompressed_extents, -+ uncompressed_sectors << 9, -+ nr_compressed_extents, -+ compressed_sectors_compressed << 9, -+ compressed_sectors_uncompressed << 9); -+ return 0; -+} -+ -+SHOW(bch2_fs) -+{ -+ struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); -+ struct printbuf out = _PBUF(buf, PAGE_SIZE); -+ -+ sysfs_print(minor, c->minor); -+ sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); -+ -+ sysfs_print(journal_write_delay_ms, c->journal.write_delay_ms); -+ sysfs_print(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms); -+ -+ sysfs_print(block_size, block_bytes(c)); -+ sysfs_print(btree_node_size, btree_bytes(c)); -+ sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c)); -+ -+ sysfs_print(read_realloc_races, -+ atomic_long_read(&c->read_realloc_races)); -+ sysfs_print(extent_migrate_done, -+ atomic_long_read(&c->extent_migrate_done)); -+ sysfs_print(extent_migrate_raced, -+ atomic_long_read(&c->extent_migrate_raced)); -+ -+ sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic); -+ -+ sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); -+ -+ sysfs_print(pd_controllers_update_seconds, -+ c->pd_controllers_update_seconds); -+ -+ sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled); -+ sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */ -+ sysfs_pd_controller_show(copy_gc, &c->copygc_pd); -+ -+ if (attr == &sysfs_rebalance_work) { -+ bch2_rebalance_work_to_text(&out, c); -+ return out.pos - buf; -+ } -+ -+ sysfs_print(promote_whole_extents, c->promote_whole_extents); -+ -+ sysfs_printf(meta_replicas_have, "%i", bch2_replicas_online(c, true)); -+ sysfs_printf(data_replicas_have, "%i", bch2_replicas_online(c, false)); -+ -+ /* Debugging: */ -+ -+ if (attr == &sysfs_alloc_debug) -+ return fs_alloc_debug_to_text(&out, c) ?: out.pos - buf; -+ -+ if (attr == &sysfs_journal_debug) { -+ 
bch2_journal_debug_to_text(&out, &c->journal); -+ return out.pos - buf; -+ } -+ -+ if (attr == &sysfs_journal_pins) { -+ bch2_journal_pins_to_text(&out, &c->journal); -+ return out.pos - buf; -+ } -+ -+ if (attr == &sysfs_btree_updates) { -+ bch2_btree_updates_to_text(&out, c); -+ return out.pos - buf; -+ } -+ -+ if (attr == &sysfs_dirty_btree_nodes) { -+ bch2_dirty_btree_nodes_to_text(&out, c); -+ return out.pos - buf; -+ } -+ -+ if (attr == &sysfs_btree_key_cache) { -+ bch2_btree_key_cache_to_text(&out, &c->btree_key_cache); -+ return out.pos - buf; -+ } -+ -+ if (attr == &sysfs_btree_transactions) { -+ bch2_btree_trans_to_text(&out, c); -+ return out.pos - buf; -+ } -+ -+ if (attr == &sysfs_stripes_heap) { -+ bch2_stripes_heap_to_text(&out, c); -+ return out.pos - buf; -+ } -+ -+ if (attr == &sysfs_compression_stats) { -+ bch2_compression_stats_to_text(&out, c); -+ return out.pos - buf; -+ } -+ -+ if (attr == &sysfs_new_stripes) { -+ bch2_new_stripes_to_text(&out, c); -+ return out.pos - buf; -+ } -+ -+ if (attr == &sysfs_io_timers_read) { -+ bch2_io_timers_to_text(&out, &c->io_clock[READ]); -+ return out.pos - buf; -+ } -+ if (attr == &sysfs_io_timers_write) { -+ bch2_io_timers_to_text(&out, &c->io_clock[WRITE]); -+ return out.pos - buf; -+ } -+ -+#define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name); -+ BCH_DEBUG_PARAMS() -+#undef BCH_DEBUG_PARAM -+ -+ return 0; -+} -+ -+STORE(bch2_fs) -+{ -+ struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); -+ -+ sysfs_strtoul(journal_write_delay_ms, c->journal.write_delay_ms); -+ sysfs_strtoul(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms); -+ -+ if (attr == &sysfs_btree_gc_periodic) { -+ ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic) -+ ?: (ssize_t) size; -+ -+ wake_up_process(c->gc_thread); -+ return ret; -+ } -+ -+ if (attr == &sysfs_copy_gc_enabled) { -+ ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled) -+ ?: (ssize_t) size; -+ -+ if (c->copygc_thread) -+ 
wake_up_process(c->copygc_thread); -+ return ret; -+ } -+ -+ if (attr == &sysfs_rebalance_enabled) { -+ ssize_t ret = strtoul_safe(buf, c->rebalance.enabled) -+ ?: (ssize_t) size; -+ -+ rebalance_wakeup(c); -+ return ret; -+ } -+ -+ sysfs_strtoul(pd_controllers_update_seconds, -+ c->pd_controllers_update_seconds); -+ sysfs_pd_controller_store(rebalance, &c->rebalance.pd); -+ sysfs_pd_controller_store(copy_gc, &c->copygc_pd); -+ -+ sysfs_strtoul(promote_whole_extents, c->promote_whole_extents); -+ -+ /* Debugging: */ -+ -+#define BCH_DEBUG_PARAM(name, description) sysfs_strtoul(name, c->name); -+ BCH_DEBUG_PARAMS() -+#undef BCH_DEBUG_PARAM -+ -+ if (!test_bit(BCH_FS_STARTED, &c->flags)) -+ return -EPERM; -+ -+ /* Debugging: */ -+ -+ if (attr == &sysfs_trigger_journal_flush) -+ bch2_journal_meta_async(&c->journal, NULL); -+ -+ if (attr == &sysfs_trigger_btree_coalesce) -+ bch2_coalesce(c); -+ -+ if (attr == &sysfs_trigger_gc) { -+ /* -+ * Full gc is currently incompatible with btree key cache: -+ */ -+#if 0 -+ down_read(&c->state_lock); -+ bch2_gc(c, NULL, false, false); -+ up_read(&c->state_lock); -+#else -+ bch2_gc_gens(c); -+#endif -+ } -+ -+ if (attr == &sysfs_prune_cache) { -+ struct shrink_control sc; -+ -+ sc.gfp_mask = GFP_KERNEL; -+ sc.nr_to_scan = strtoul_or_return(buf); -+ c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc); -+ } -+ -+#ifdef CONFIG_BCACHEFS_TESTS -+ if (attr == &sysfs_perf_test) { -+ char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; -+ char *test = strsep(&p, " \t\n"); -+ char *nr_str = strsep(&p, " \t\n"); -+ char *threads_str = strsep(&p, " \t\n"); -+ unsigned threads; -+ u64 nr; -+ int ret = -EINVAL; -+ -+ if (threads_str && -+ !(ret = kstrtouint(threads_str, 10, &threads)) && -+ !(ret = bch2_strtoull_h(nr_str, &nr))) -+ bch2_btree_perf_test(c, test, nr, threads); -+ else -+ size = ret; -+ kfree(tmp); -+ } -+#endif -+ return size; -+} -+SYSFS_OPS(bch2_fs); -+ -+struct attribute *bch2_fs_files[] = { -+ &sysfs_minor, -+ 
&sysfs_block_size, -+ &sysfs_btree_node_size, -+ &sysfs_btree_cache_size, -+ -+ &sysfs_meta_replicas_have, -+ &sysfs_data_replicas_have, -+ -+ &sysfs_journal_write_delay_ms, -+ &sysfs_journal_reclaim_delay_ms, -+ -+ &sysfs_promote_whole_extents, -+ -+ &sysfs_compression_stats, -+ -+#ifdef CONFIG_BCACHEFS_TESTS -+ &sysfs_perf_test, -+#endif -+ NULL -+}; -+ -+/* internal dir - just a wrapper */ -+ -+SHOW(bch2_fs_internal) -+{ -+ struct bch_fs *c = container_of(kobj, struct bch_fs, internal); -+ return bch2_fs_show(&c->kobj, attr, buf); -+} -+ -+STORE(bch2_fs_internal) -+{ -+ struct bch_fs *c = container_of(kobj, struct bch_fs, internal); -+ return bch2_fs_store(&c->kobj, attr, buf, size); -+} -+SYSFS_OPS(bch2_fs_internal); -+ -+struct attribute *bch2_fs_internal_files[] = { -+ &sysfs_alloc_debug, -+ &sysfs_journal_debug, -+ &sysfs_journal_pins, -+ &sysfs_btree_updates, -+ &sysfs_dirty_btree_nodes, -+ &sysfs_btree_key_cache, -+ &sysfs_btree_transactions, -+ &sysfs_stripes_heap, -+ -+ &sysfs_read_realloc_races, -+ &sysfs_extent_migrate_done, -+ &sysfs_extent_migrate_raced, -+ -+ &sysfs_trigger_journal_flush, -+ &sysfs_trigger_btree_coalesce, -+ &sysfs_trigger_gc, -+ &sysfs_prune_cache, -+ -+ &sysfs_copy_gc_enabled, -+ -+ &sysfs_rebalance_enabled, -+ &sysfs_rebalance_work, -+ sysfs_pd_controller_files(rebalance), -+ sysfs_pd_controller_files(copy_gc), -+ -+ &sysfs_new_stripes, -+ -+ &sysfs_io_timers_read, -+ &sysfs_io_timers_write, -+ -+ &sysfs_internal_uuid, -+ -+#define BCH_DEBUG_PARAM(name, description) &sysfs_##name, -+ BCH_DEBUG_PARAMS() -+#undef BCH_DEBUG_PARAM -+ -+ NULL -+}; -+ -+/* options */ -+ -+SHOW(bch2_fs_opts_dir) -+{ -+ struct printbuf out = _PBUF(buf, PAGE_SIZE); -+ struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); -+ const struct bch_option *opt = container_of(attr, struct bch_option, attr); -+ int id = opt - bch2_opt_table; -+ u64 v = bch2_opt_get_by_id(&c->opts, id); -+ -+ bch2_opt_to_text(&out, c, opt, v, OPT_SHOW_FULL_LIST); -+ 
pr_buf(&out, "\n"); -+ -+ return out.pos - buf; -+} -+ -+STORE(bch2_fs_opts_dir) -+{ -+ struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); -+ const struct bch_option *opt = container_of(attr, struct bch_option, attr); -+ int ret, id = opt - bch2_opt_table; -+ char *tmp; -+ u64 v; -+ -+ tmp = kstrdup(buf, GFP_KERNEL); -+ if (!tmp) -+ return -ENOMEM; -+ -+ ret = bch2_opt_parse(c, opt, strim(tmp), &v); -+ kfree(tmp); -+ -+ if (ret < 0) -+ return ret; -+ -+ ret = bch2_opt_check_may_set(c, id, v); -+ if (ret < 0) -+ return ret; -+ -+ if (opt->set_sb != SET_NO_SB_OPT) { -+ mutex_lock(&c->sb_lock); -+ opt->set_sb(c->disk_sb.sb, v); -+ bch2_write_super(c); -+ mutex_unlock(&c->sb_lock); -+ } -+ -+ bch2_opt_set_by_id(&c->opts, id, v); -+ -+ if ((id == Opt_background_target || -+ id == Opt_background_compression) && v) { -+ bch2_rebalance_add_work(c, S64_MAX); -+ rebalance_wakeup(c); -+ } -+ -+ return size; -+} -+SYSFS_OPS(bch2_fs_opts_dir); -+ -+struct attribute *bch2_fs_opts_dir_files[] = { NULL }; -+ -+int bch2_opts_create_sysfs_files(struct kobject *kobj) -+{ -+ const struct bch_option *i; -+ int ret; -+ -+ for (i = bch2_opt_table; -+ i < bch2_opt_table + bch2_opts_nr; -+ i++) { -+ if (!(i->mode & (OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME))) -+ continue; -+ -+ ret = sysfs_create_file(kobj, &i->attr); -+ if (ret) -+ return ret; -+ } -+ -+ return 0; -+} -+ -+/* time stats */ -+ -+SHOW(bch2_fs_time_stats) -+{ -+ struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats); -+ struct printbuf out = _PBUF(buf, PAGE_SIZE); -+ -+#define x(name) \ -+ if (attr == &sysfs_time_stat_##name) { \ -+ bch2_time_stats_to_text(&out, &c->times[BCH_TIME_##name]);\ -+ return out.pos - buf; \ -+ } -+ BCH_TIME_STATS() -+#undef x -+ -+ return 0; -+} -+ -+STORE(bch2_fs_time_stats) -+{ -+ return size; -+} -+SYSFS_OPS(bch2_fs_time_stats); -+ -+struct attribute *bch2_fs_time_stats_files[] = { -+#define x(name) \ -+ &sysfs_time_stat_##name, -+ BCH_TIME_STATS() -+#undef x -+ NULL -+}; -+ 
-+typedef unsigned (bucket_map_fn)(struct bch_fs *, struct bch_dev *, -+ size_t, void *); -+ -+static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca, -+ size_t b, void *private) -+{ -+ int rw = (private ? 1 : 0); -+ -+ return bucket_last_io(c, bucket(ca, b), rw); -+} -+ -+static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca, -+ size_t b, void *private) -+{ -+ struct bucket *g = bucket(ca, b); -+ return bucket_sectors_used(g->mark); -+} -+ -+static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca, -+ size_t b, void *private) -+{ -+ return bucket_gc_gen(ca, b); -+} -+ -+static int unsigned_cmp(const void *_l, const void *_r) -+{ -+ const unsigned *l = _l; -+ const unsigned *r = _r; -+ -+ return cmp_int(*l, *r); -+} -+ -+static int quantiles_to_text(struct printbuf *out, -+ struct bch_fs *c, struct bch_dev *ca, -+ bucket_map_fn *fn, void *private) -+{ -+ size_t i, n; -+ /* Compute 31 quantiles */ -+ unsigned q[31], *p; -+ -+ down_read(&ca->bucket_lock); -+ n = ca->mi.nbuckets; -+ -+ p = vzalloc(n * sizeof(unsigned)); -+ if (!p) { -+ up_read(&ca->bucket_lock); -+ return -ENOMEM; -+ } -+ -+ for (i = ca->mi.first_bucket; i < n; i++) -+ p[i] = fn(c, ca, i, private); -+ -+ sort(p, n, sizeof(unsigned), unsigned_cmp, NULL); -+ up_read(&ca->bucket_lock); -+ -+ while (n && -+ !p[n - 1]) -+ --n; -+ -+ for (i = 0; i < ARRAY_SIZE(q); i++) -+ q[i] = p[n * (i + 1) / (ARRAY_SIZE(q) + 1)]; -+ -+ vfree(p); -+ -+ for (i = 0; i < ARRAY_SIZE(q); i++) -+ pr_buf(out, "%u ", q[i]); -+ pr_buf(out, "\n"); -+ return 0; -+} -+ -+static void reserve_stats_to_text(struct printbuf *out, struct bch_dev *ca) -+{ -+ enum alloc_reserve i; -+ -+ spin_lock(&ca->fs->freelist_lock); -+ -+ pr_buf(out, "free_inc:\t%zu\t%zu\n", -+ fifo_used(&ca->free_inc), -+ ca->free_inc.size); -+ -+ for (i = 0; i < RESERVE_NR; i++) -+ pr_buf(out, "free[%u]:\t%zu\t%zu\n", i, -+ fifo_used(&ca->free[i]), -+ ca->free[i].size); -+ -+ spin_unlock(&ca->fs->freelist_lock); 
-+} -+ -+static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) -+{ -+ struct bch_fs *c = ca->fs; -+ struct bch_dev_usage stats = bch2_dev_usage_read(ca); -+ unsigned i, nr[BCH_DATA_NR]; -+ -+ memset(nr, 0, sizeof(nr)); -+ -+ for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++) -+ nr[c->open_buckets[i].type]++; -+ -+ pr_buf(out, -+ "free_inc: %zu/%zu\n" -+ "free[RESERVE_BTREE]: %zu/%zu\n" -+ "free[RESERVE_MOVINGGC]: %zu/%zu\n" -+ "free[RESERVE_NONE]: %zu/%zu\n" -+ "buckets:\n" -+ " capacity: %llu\n" -+ " alloc: %llu\n" -+ " sb: %llu\n" -+ " journal: %llu\n" -+ " meta: %llu\n" -+ " user: %llu\n" -+ " cached: %llu\n" -+ " erasure coded: %llu\n" -+ " available: %lli\n" -+ "sectors:\n" -+ " sb: %llu\n" -+ " journal: %llu\n" -+ " meta: %llu\n" -+ " user: %llu\n" -+ " cached: %llu\n" -+ " erasure coded: %llu\n" -+ " fragmented: %llu\n" -+ " copygc threshold: %llu\n" -+ "freelist_wait: %s\n" -+ "open buckets: %u/%u (reserved %u)\n" -+ "open_buckets_wait: %s\n" -+ "open_buckets_btree: %u\n" -+ "open_buckets_user: %u\n" -+ "btree reserve cache: %u\n", -+ fifo_used(&ca->free_inc), ca->free_inc.size, -+ fifo_used(&ca->free[RESERVE_BTREE]), ca->free[RESERVE_BTREE].size, -+ fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, -+ fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size, -+ ca->mi.nbuckets - ca->mi.first_bucket, -+ stats.buckets_alloc, -+ stats.buckets[BCH_DATA_sb], -+ stats.buckets[BCH_DATA_journal], -+ stats.buckets[BCH_DATA_btree], -+ stats.buckets[BCH_DATA_user], -+ stats.buckets[BCH_DATA_cached], -+ stats.buckets_ec, -+ __dev_buckets_available(ca, stats), -+ stats.sectors[BCH_DATA_sb], -+ stats.sectors[BCH_DATA_journal], -+ stats.sectors[BCH_DATA_btree], -+ stats.sectors[BCH_DATA_user], -+ stats.sectors[BCH_DATA_cached], -+ stats.sectors_ec, -+ stats.sectors_fragmented, -+ c->copygc_threshold, -+ c->freelist_wait.list.first ? 
"waiting" : "empty", -+ c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, -+ BTREE_NODE_OPEN_BUCKET_RESERVE, -+ c->open_buckets_wait.list.first ? "waiting" : "empty", -+ nr[BCH_DATA_btree], -+ nr[BCH_DATA_user], -+ c->btree_reserve_cache_nr); -+} -+ -+static const char * const bch2_rw[] = { -+ "read", -+ "write", -+ NULL -+}; -+ -+static void dev_iodone_to_text(struct printbuf *out, struct bch_dev *ca) -+{ -+ int rw, i; -+ -+ for (rw = 0; rw < 2; rw++) { -+ pr_buf(out, "%s:\n", bch2_rw[rw]); -+ -+ for (i = 1; i < BCH_DATA_NR; i++) -+ pr_buf(out, "%-12s:%12llu\n", -+ bch2_data_types[i], -+ percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9); -+ } -+} -+ -+SHOW(bch2_dev) -+{ -+ struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); -+ struct bch_fs *c = ca->fs; -+ struct printbuf out = _PBUF(buf, PAGE_SIZE); -+ -+ sysfs_printf(uuid, "%pU\n", ca->uuid.b); -+ -+ sysfs_print(bucket_size, bucket_bytes(ca)); -+ sysfs_print(block_size, block_bytes(c)); -+ sysfs_print(first_bucket, ca->mi.first_bucket); -+ sysfs_print(nbuckets, ca->mi.nbuckets); -+ sysfs_print(durability, ca->mi.durability); -+ sysfs_print(discard, ca->mi.discard); -+ -+ if (attr == &sysfs_label) { -+ if (ca->mi.group) { -+ mutex_lock(&c->sb_lock); -+ bch2_disk_path_to_text(&out, &c->disk_sb, -+ ca->mi.group - 1); -+ mutex_unlock(&c->sb_lock); -+ } -+ -+ pr_buf(&out, "\n"); -+ return out.pos - buf; -+ } -+ -+ if (attr == &sysfs_has_data) { -+ bch2_flags_to_text(&out, bch2_data_types, -+ bch2_dev_has_data(c, ca)); -+ pr_buf(&out, "\n"); -+ return out.pos - buf; -+ } -+ -+ if (attr == &sysfs_cache_replacement_policy) { -+ bch2_string_opt_to_text(&out, -+ bch2_cache_replacement_policies, -+ ca->mi.replacement); -+ pr_buf(&out, "\n"); -+ return out.pos - buf; -+ } -+ -+ if (attr == &sysfs_state_rw) { -+ bch2_string_opt_to_text(&out, bch2_dev_state, -+ ca->mi.state); -+ pr_buf(&out, "\n"); -+ return out.pos - buf; -+ } -+ -+ if (attr == &sysfs_iodone) { -+ dev_iodone_to_text(&out, ca); -+ return out.pos - buf; 
-+ } -+ -+ sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ])); -+ sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE])); -+ -+ if (attr == &sysfs_io_latency_stats_read) { -+ bch2_time_stats_to_text(&out, &ca->io_latency[READ]); -+ return out.pos - buf; -+ } -+ if (attr == &sysfs_io_latency_stats_write) { -+ bch2_time_stats_to_text(&out, &ca->io_latency[WRITE]); -+ return out.pos - buf; -+ } -+ -+ sysfs_printf(congested, "%u%%", -+ clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) -+ * 100 / CONGESTED_MAX); -+ -+ if (attr == &sysfs_bucket_quantiles_last_read) -+ return quantiles_to_text(&out, c, ca, bucket_last_io_fn, (void *) 0) ?: out.pos - buf; -+ if (attr == &sysfs_bucket_quantiles_last_write) -+ return quantiles_to_text(&out, c, ca, bucket_last_io_fn, (void *) 1) ?: out.pos - buf; -+ if (attr == &sysfs_bucket_quantiles_fragmentation) -+ return quantiles_to_text(&out, c, ca, bucket_sectors_used_fn, NULL) ?: out.pos - buf; -+ if (attr == &sysfs_bucket_quantiles_oldest_gen) -+ return quantiles_to_text(&out, c, ca, bucket_oldest_gen_fn, NULL) ?: out.pos - buf; -+ -+ if (attr == &sysfs_reserve_stats) { -+ reserve_stats_to_text(&out, ca); -+ return out.pos - buf; -+ } -+ if (attr == &sysfs_alloc_debug) { -+ dev_alloc_debug_to_text(&out, ca); -+ return out.pos - buf; -+ } -+ -+ return 0; -+} -+ -+STORE(bch2_dev) -+{ -+ struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); -+ struct bch_fs *c = ca->fs; -+ struct bch_member *mi; -+ -+ if (attr == &sysfs_discard) { -+ bool v = strtoul_or_return(buf); -+ -+ mutex_lock(&c->sb_lock); -+ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; -+ -+ if (v != BCH_MEMBER_DISCARD(mi)) { -+ SET_BCH_MEMBER_DISCARD(mi, v); -+ bch2_write_super(c); -+ } -+ mutex_unlock(&c->sb_lock); -+ } -+ -+ if (attr == &sysfs_cache_replacement_policy) { -+ ssize_t v = __sysfs_match_string(bch2_cache_replacement_policies, -1, buf); -+ -+ if (v < 0) -+ return v; -+ -+ mutex_lock(&c->sb_lock); -+ 
mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; -+ -+ if ((unsigned) v != BCH_MEMBER_REPLACEMENT(mi)) { -+ SET_BCH_MEMBER_REPLACEMENT(mi, v); -+ bch2_write_super(c); -+ } -+ mutex_unlock(&c->sb_lock); -+ } -+ -+ if (attr == &sysfs_label) { -+ char *tmp; -+ int ret; -+ -+ tmp = kstrdup(buf, GFP_KERNEL); -+ if (!tmp) -+ return -ENOMEM; -+ -+ ret = bch2_dev_group_set(c, ca, strim(tmp)); -+ kfree(tmp); -+ if (ret) -+ return ret; -+ } -+ -+ if (attr == &sysfs_wake_allocator) -+ bch2_wake_allocator(ca); -+ -+ return size; -+} -+SYSFS_OPS(bch2_dev); -+ -+struct attribute *bch2_dev_files[] = { -+ &sysfs_uuid, -+ &sysfs_bucket_size, -+ &sysfs_block_size, -+ &sysfs_first_bucket, -+ &sysfs_nbuckets, -+ &sysfs_durability, -+ -+ /* settings: */ -+ &sysfs_discard, -+ &sysfs_cache_replacement_policy, -+ &sysfs_state_rw, -+ &sysfs_label, -+ -+ &sysfs_has_data, -+ &sysfs_iodone, -+ -+ &sysfs_io_latency_read, -+ &sysfs_io_latency_write, -+ &sysfs_io_latency_stats_read, -+ &sysfs_io_latency_stats_write, -+ &sysfs_congested, -+ -+ /* alloc info - other stats: */ -+ &sysfs_bucket_quantiles_last_read, -+ &sysfs_bucket_quantiles_last_write, -+ &sysfs_bucket_quantiles_fragmentation, -+ &sysfs_bucket_quantiles_oldest_gen, -+ -+ &sysfs_reserve_stats, -+ -+ /* debug: */ -+ &sysfs_alloc_debug, -+ &sysfs_wake_allocator, -+ NULL -+}; -+ -+#endif /* _BCACHEFS_SYSFS_H_ */ -diff --git a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h -new file mode 100644 -index 000000000000..525fd05d91f7 ---- /dev/null -+++ b/fs/bcachefs/sysfs.h -@@ -0,0 +1,44 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_SYSFS_H_ -+#define _BCACHEFS_SYSFS_H_ -+ -+#include -+ -+#ifndef NO_BCACHEFS_SYSFS -+ -+struct attribute; -+struct sysfs_ops; -+ -+extern struct attribute *bch2_fs_files[]; -+extern struct attribute *bch2_fs_internal_files[]; -+extern struct attribute *bch2_fs_opts_dir_files[]; -+extern struct attribute *bch2_fs_time_stats_files[]; -+extern struct attribute *bch2_dev_files[]; -+ 
-+extern struct sysfs_ops bch2_fs_sysfs_ops; -+extern struct sysfs_ops bch2_fs_internal_sysfs_ops; -+extern struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; -+extern struct sysfs_ops bch2_fs_time_stats_sysfs_ops; -+extern struct sysfs_ops bch2_dev_sysfs_ops; -+ -+int bch2_opts_create_sysfs_files(struct kobject *); -+ -+#else -+ -+static struct attribute *bch2_fs_files[] = {}; -+static struct attribute *bch2_fs_internal_files[] = {}; -+static struct attribute *bch2_fs_opts_dir_files[] = {}; -+static struct attribute *bch2_fs_time_stats_files[] = {}; -+static struct attribute *bch2_dev_files[] = {}; -+ -+static const struct sysfs_ops bch2_fs_sysfs_ops; -+static const struct sysfs_ops bch2_fs_internal_sysfs_ops; -+static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; -+static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops; -+static const struct sysfs_ops bch2_dev_sysfs_ops; -+ -+static inline int bch2_opts_create_sysfs_files(struct kobject *kobj) { return 0; } -+ -+#endif /* NO_BCACHEFS_SYSFS */ -+ -+#endif /* _BCACHEFS_SYSFS_H_ */ -diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c -new file mode 100644 -index 000000000000..4dcace650416 ---- /dev/null -+++ b/fs/bcachefs/tests.c -@@ -0,0 +1,725 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#ifdef CONFIG_BCACHEFS_TESTS -+ -+#include "bcachefs.h" -+#include "btree_update.h" -+#include "journal_reclaim.h" -+#include "tests.h" -+ -+#include "linux/kthread.h" -+#include "linux/random.h" -+ -+static void delete_test_keys(struct bch_fs *c) -+{ -+ int ret; -+ -+ ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS, -+ POS(0, 0), POS(0, U64_MAX), -+ NULL); -+ BUG_ON(ret); -+ -+ ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS, -+ POS(0, 0), POS(0, U64_MAX), -+ NULL); -+ BUG_ON(ret); -+} -+ -+/* unit tests */ -+ -+static void test_delete(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_i_cookie k; -+ int ret; -+ -+ bkey_cookie_init(&k.k_i); -+ -+ bch2_trans_init(&trans, c, 
0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, k.k.p, -+ BTREE_ITER_INTENT); -+ -+ ret = bch2_btree_iter_traverse(iter); -+ BUG_ON(ret); -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, -+ bch2_trans_update(&trans, iter, &k.k_i, 0)); -+ BUG_ON(ret); -+ -+ pr_info("deleting once"); -+ ret = bch2_btree_delete_at(&trans, iter, 0); -+ BUG_ON(ret); -+ -+ pr_info("deleting twice"); -+ ret = bch2_btree_delete_at(&trans, iter, 0); -+ BUG_ON(ret); -+ -+ bch2_trans_exit(&trans); -+} -+ -+static void test_delete_written(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_i_cookie k; -+ int ret; -+ -+ bkey_cookie_init(&k.k_i); -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, k.k.p, -+ BTREE_ITER_INTENT); -+ -+ ret = bch2_btree_iter_traverse(iter); -+ BUG_ON(ret); -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, -+ bch2_trans_update(&trans, iter, &k.k_i, 0)); -+ BUG_ON(ret); -+ -+ bch2_journal_flush_all_pins(&c->journal); -+ -+ ret = bch2_btree_delete_at(&trans, iter, 0); -+ BUG_ON(ret); -+ -+ bch2_trans_exit(&trans); -+} -+ -+static void test_iterate(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ u64 i; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ delete_test_keys(c); -+ -+ pr_info("inserting test keys"); -+ -+ for (i = 0; i < nr; i++) { -+ struct bkey_i_cookie k; -+ -+ bkey_cookie_init(&k.k_i); -+ k.k.p.offset = i; -+ -+ ret = bch2_btree_insert(c, BTREE_ID_XATTRS, &k.k_i, -+ NULL, NULL, 0); -+ BUG_ON(ret); -+ } -+ -+ pr_info("iterating forwards"); -+ -+ i = 0; -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, -+ POS_MIN, 0, k, ret) { -+ if (k.k->p.inode) -+ break; -+ -+ BUG_ON(k.k->p.offset != i++); -+ } -+ -+ BUG_ON(i != nr); -+ -+ pr_info("iterating backwards"); -+ -+ while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(iter)).k)) -+ BUG_ON(k.k->p.offset != --i); -+ -+ 
BUG_ON(i); -+ -+ bch2_trans_exit(&trans); -+} -+ -+static void test_iterate_extents(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ u64 i; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ delete_test_keys(c); -+ -+ pr_info("inserting test extents"); -+ -+ for (i = 0; i < nr; i += 8) { -+ struct bkey_i_cookie k; -+ -+ bkey_cookie_init(&k.k_i); -+ k.k.p.offset = i + 8; -+ k.k.size = 8; -+ -+ ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, -+ NULL, NULL, 0); -+ BUG_ON(ret); -+ } -+ -+ pr_info("iterating forwards"); -+ -+ i = 0; -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, -+ POS_MIN, 0, k, ret) { -+ BUG_ON(bkey_start_offset(k.k) != i); -+ i = k.k->p.offset; -+ } -+ -+ BUG_ON(i != nr); -+ -+ pr_info("iterating backwards"); -+ -+ while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(iter)).k)) { -+ BUG_ON(k.k->p.offset != i); -+ i = bkey_start_offset(k.k); -+ } -+ -+ BUG_ON(i); -+ -+ bch2_trans_exit(&trans); -+} -+ -+static void test_iterate_slots(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ u64 i; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ delete_test_keys(c); -+ -+ pr_info("inserting test keys"); -+ -+ for (i = 0; i < nr; i++) { -+ struct bkey_i_cookie k; -+ -+ bkey_cookie_init(&k.k_i); -+ k.k.p.offset = i * 2; -+ -+ ret = bch2_btree_insert(c, BTREE_ID_XATTRS, &k.k_i, -+ NULL, NULL, 0); -+ BUG_ON(ret); -+ } -+ -+ pr_info("iterating forwards"); -+ -+ i = 0; -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, -+ 0, k, ret) { -+ if (k.k->p.inode) -+ break; -+ -+ BUG_ON(k.k->p.offset != i); -+ i += 2; -+ } -+ bch2_trans_iter_free(&trans, iter); -+ -+ BUG_ON(i != nr * 2); -+ -+ pr_info("iterating forwards by slots"); -+ -+ i = 0; -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, -+ BTREE_ITER_SLOTS, k, ret) { -+ BUG_ON(k.k->p.offset != i); -+ BUG_ON(bkey_deleted(k.k) != (i & 1)); 
-+ -+ i++; -+ if (i == nr * 2) -+ break; -+ } -+ -+ bch2_trans_exit(&trans); -+} -+ -+static void test_iterate_slots_extents(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ u64 i; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ delete_test_keys(c); -+ -+ pr_info("inserting test keys"); -+ -+ for (i = 0; i < nr; i += 16) { -+ struct bkey_i_cookie k; -+ -+ bkey_cookie_init(&k.k_i); -+ k.k.p.offset = i + 16; -+ k.k.size = 8; -+ -+ ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, -+ NULL, NULL, 0); -+ BUG_ON(ret); -+ } -+ -+ pr_info("iterating forwards"); -+ -+ i = 0; -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, -+ 0, k, ret) { -+ BUG_ON(bkey_start_offset(k.k) != i + 8); -+ BUG_ON(k.k->size != 8); -+ i += 16; -+ } -+ bch2_trans_iter_free(&trans, iter); -+ -+ BUG_ON(i != nr); -+ -+ pr_info("iterating forwards by slots"); -+ -+ i = 0; -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, -+ BTREE_ITER_SLOTS, k, ret) { -+ BUG_ON(bkey_deleted(k.k) != !(i % 16)); -+ -+ BUG_ON(bkey_start_offset(k.k) != i); -+ BUG_ON(k.k->size != 8); -+ i = k.k->p.offset; -+ -+ if (i == nr) -+ break; -+ } -+ -+ bch2_trans_exit(&trans); -+} -+ -+/* -+ * XXX: we really want to make sure we've got a btree with depth > 0 for these -+ * tests -+ */ -+static void test_peek_end(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, POS_MIN, 0); -+ -+ k = bch2_btree_iter_peek(iter); -+ BUG_ON(k.k); -+ -+ k = bch2_btree_iter_peek(iter); -+ BUG_ON(k.k); -+ -+ bch2_trans_exit(&trans); -+} -+ -+static void test_peek_end_extents(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, 0); 
-+ -+ k = bch2_btree_iter_peek(iter); -+ BUG_ON(k.k); -+ -+ k = bch2_btree_iter_peek(iter); -+ BUG_ON(k.k); -+ -+ bch2_trans_exit(&trans); -+} -+ -+/* extent unit tests */ -+ -+u64 test_version; -+ -+static void insert_test_extent(struct bch_fs *c, -+ u64 start, u64 end) -+{ -+ struct bkey_i_cookie k; -+ int ret; -+ -+ //pr_info("inserting %llu-%llu v %llu", start, end, test_version); -+ -+ bkey_cookie_init(&k.k_i); -+ k.k_i.k.p.offset = end; -+ k.k_i.k.size = end - start; -+ k.k_i.k.version.lo = test_version++; -+ -+ ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, -+ NULL, NULL, 0); -+ BUG_ON(ret); -+} -+ -+static void __test_extent_overwrite(struct bch_fs *c, -+ u64 e1_start, u64 e1_end, -+ u64 e2_start, u64 e2_end) -+{ -+ insert_test_extent(c, e1_start, e1_end); -+ insert_test_extent(c, e2_start, e2_end); -+ -+ delete_test_keys(c); -+} -+ -+static void test_extent_overwrite_front(struct bch_fs *c, u64 nr) -+{ -+ __test_extent_overwrite(c, 0, 64, 0, 32); -+ __test_extent_overwrite(c, 8, 64, 0, 32); -+} -+ -+static void test_extent_overwrite_back(struct bch_fs *c, u64 nr) -+{ -+ __test_extent_overwrite(c, 0, 64, 32, 64); -+ __test_extent_overwrite(c, 0, 64, 32, 72); -+} -+ -+static void test_extent_overwrite_middle(struct bch_fs *c, u64 nr) -+{ -+ __test_extent_overwrite(c, 0, 64, 32, 40); -+} -+ -+static void test_extent_overwrite_all(struct bch_fs *c, u64 nr) -+{ -+ __test_extent_overwrite(c, 32, 64, 0, 64); -+ __test_extent_overwrite(c, 32, 64, 0, 128); -+ __test_extent_overwrite(c, 32, 64, 32, 64); -+ __test_extent_overwrite(c, 32, 64, 32, 128); -+} -+ -+/* perf tests */ -+ -+static u64 test_rand(void) -+{ -+ u64 v; -+#if 0 -+ v = prandom_u32(); -+#else -+ prandom_bytes(&v, sizeof(v)); -+#endif -+ return v; -+} -+ -+static void rand_insert(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct bkey_i_cookie k; -+ int ret; -+ u64 i; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for (i = 0; i < nr; i++) { -+ bkey_cookie_init(&k.k_i); -+ 
k.k.p.offset = test_rand(); -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, -+ __bch2_btree_insert(&trans, BTREE_ID_XATTRS, &k.k_i)); -+ -+ BUG_ON(ret); -+ } -+ -+ bch2_trans_exit(&trans); -+} -+ -+static void rand_lookup(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ u64 i; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for (i = 0; i < nr; i++) { -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, -+ POS(0, test_rand()), 0); -+ -+ k = bch2_btree_iter_peek(iter); -+ bch2_trans_iter_free(&trans, iter); -+ } -+ -+ bch2_trans_exit(&trans); -+} -+ -+static void rand_mixed(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int ret; -+ u64 i; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for (i = 0; i < nr; i++) { -+ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, -+ POS(0, test_rand()), 0); -+ -+ k = bch2_btree_iter_peek(iter); -+ -+ if (!(i & 3) && k.k) { -+ struct bkey_i_cookie k; -+ -+ bkey_cookie_init(&k.k_i); -+ k.k.p = iter->pos; -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, -+ bch2_trans_update(&trans, iter, &k.k_i, 0)); -+ -+ BUG_ON(ret); -+ } -+ -+ bch2_trans_iter_free(&trans, iter); -+ } -+ -+ bch2_trans_exit(&trans); -+} -+ -+static int __do_delete(struct btree_trans *trans, struct bpos pos) -+{ -+ struct btree_iter *iter; -+ struct bkey_i delete; -+ struct bkey_s_c k; -+ int ret = 0; -+ -+ iter = bch2_trans_get_iter(trans, BTREE_ID_XATTRS, pos, -+ BTREE_ITER_INTENT); -+ ret = PTR_ERR_OR_ZERO(iter); -+ if (ret) -+ goto err; -+ -+ k = bch2_btree_iter_peek(iter); -+ ret = bkey_err(k); -+ if (ret) -+ goto err; -+ -+ bkey_init(&delete.k); -+ delete.k.p = k.k->p; -+ -+ bch2_trans_update(trans, iter, &delete, 0); -+err: -+ bch2_trans_iter_put(trans, iter); -+ return ret; -+} -+ -+static void rand_delete(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ int ret; -+ u64 i; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ 
for (i = 0; i < nr; i++) { -+ struct bpos pos = POS(0, test_rand()); -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, -+ __do_delete(&trans, pos)); -+ BUG_ON(ret); -+ } -+ -+ bch2_trans_exit(&trans); -+} -+ -+static void seq_insert(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct bkey_i_cookie insert; -+ int ret; -+ u64 i = 0; -+ -+ bkey_cookie_init(&insert.k_i); -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, -+ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { -+ insert.k.p = iter->pos; -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, -+ bch2_trans_update(&trans, iter, &insert.k_i, 0)); -+ -+ BUG_ON(ret); -+ -+ if (++i == nr) -+ break; -+ } -+ bch2_trans_exit(&trans); -+} -+ -+static void seq_lookup(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, 0, k, ret) -+ ; -+ bch2_trans_exit(&trans); -+} -+ -+static void seq_overwrite(struct bch_fs *c, u64 nr) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, -+ BTREE_ITER_INTENT, k, ret) { -+ struct bkey_i_cookie u; -+ -+ bkey_reassemble(&u.k_i, k); -+ -+ ret = __bch2_trans_do(&trans, NULL, NULL, 0, -+ bch2_trans_update(&trans, iter, &u.k_i, 0)); -+ -+ BUG_ON(ret); -+ } -+ bch2_trans_exit(&trans); -+} -+ -+static void seq_delete(struct bch_fs *c, u64 nr) -+{ -+ int ret; -+ -+ ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS, -+ POS(0, 0), POS(0, U64_MAX), -+ NULL); -+ BUG_ON(ret); -+} -+ -+typedef void (*perf_test_fn)(struct bch_fs *, u64); -+ -+struct test_job { -+ struct bch_fs *c; -+ u64 nr; -+ unsigned nr_threads; -+ perf_test_fn fn; -+ -+ atomic_t ready; -+ 
wait_queue_head_t ready_wait; -+ -+ atomic_t done; -+ struct completion done_completion; -+ -+ u64 start; -+ u64 finish; -+}; -+ -+static int btree_perf_test_thread(void *data) -+{ -+ struct test_job *j = data; -+ -+ if (atomic_dec_and_test(&j->ready)) { -+ wake_up(&j->ready_wait); -+ j->start = sched_clock(); -+ } else { -+ wait_event(j->ready_wait, !atomic_read(&j->ready)); -+ } -+ -+ j->fn(j->c, j->nr / j->nr_threads); -+ -+ if (atomic_dec_and_test(&j->done)) { -+ j->finish = sched_clock(); -+ complete(&j->done_completion); -+ } -+ -+ return 0; -+} -+ -+void bch2_btree_perf_test(struct bch_fs *c, const char *testname, -+ u64 nr, unsigned nr_threads) -+{ -+ struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads }; -+ char name_buf[20], nr_buf[20], per_sec_buf[20]; -+ unsigned i; -+ u64 time; -+ -+ atomic_set(&j.ready, nr_threads); -+ init_waitqueue_head(&j.ready_wait); -+ -+ atomic_set(&j.done, nr_threads); -+ init_completion(&j.done_completion); -+ -+#define perf_test(_test) \ -+ if (!strcmp(testname, #_test)) j.fn = _test -+ -+ perf_test(rand_insert); -+ perf_test(rand_lookup); -+ perf_test(rand_mixed); -+ perf_test(rand_delete); -+ -+ perf_test(seq_insert); -+ perf_test(seq_lookup); -+ perf_test(seq_overwrite); -+ perf_test(seq_delete); -+ -+ /* a unit test, not a perf test: */ -+ perf_test(test_delete); -+ perf_test(test_delete_written); -+ perf_test(test_iterate); -+ perf_test(test_iterate_extents); -+ perf_test(test_iterate_slots); -+ perf_test(test_iterate_slots_extents); -+ perf_test(test_peek_end); -+ perf_test(test_peek_end_extents); -+ -+ perf_test(test_extent_overwrite_front); -+ perf_test(test_extent_overwrite_back); -+ perf_test(test_extent_overwrite_middle); -+ perf_test(test_extent_overwrite_all); -+ -+ if (!j.fn) { -+ pr_err("unknown test %s", testname); -+ return; -+ } -+ -+ //pr_info("running test %s:", testname); -+ -+ if (nr_threads == 1) -+ btree_perf_test_thread(&j); -+ else -+ for (i = 0; i < nr_threads; i++) -+ 
kthread_run(btree_perf_test_thread, &j, -+ "bcachefs perf test[%u]", i); -+ -+ while (wait_for_completion_interruptible(&j.done_completion)) -+ ; -+ -+ time = j.finish - j.start; -+ -+ scnprintf(name_buf, sizeof(name_buf), "%s:", testname); -+ bch2_hprint(&PBUF(nr_buf), nr); -+ bch2_hprint(&PBUF(per_sec_buf), nr * NSEC_PER_SEC / time); -+ printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n", -+ name_buf, nr_buf, nr_threads, -+ time / NSEC_PER_SEC, -+ time * nr_threads / nr, -+ per_sec_buf); -+} -+ -+#endif /* CONFIG_BCACHEFS_TESTS */ -diff --git a/fs/bcachefs/tests.h b/fs/bcachefs/tests.h -new file mode 100644 -index 000000000000..551d0764225e ---- /dev/null -+++ b/fs/bcachefs/tests.h -@@ -0,0 +1,15 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_TEST_H -+#define _BCACHEFS_TEST_H -+ -+struct bch_fs; -+ -+#ifdef CONFIG_BCACHEFS_TESTS -+ -+void bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned); -+ -+#else -+ -+#endif /* CONFIG_BCACHEFS_TESTS */ -+ -+#endif /* _BCACHEFS_TEST_H */ -diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c -new file mode 100644 -index 000000000000..59e8dfa3d245 ---- /dev/null -+++ b/fs/bcachefs/trace.c -@@ -0,0 +1,12 @@ -+// SPDX-License-Identifier: GPL-2.0 -+#include "bcachefs.h" -+#include "alloc_types.h" -+#include "buckets.h" -+#include "btree_types.h" -+#include "keylist.h" -+ -+#include -+#include "keylist.h" -+ -+#define CREATE_TRACE_POINTS -+#include -diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c -new file mode 100644 -index 000000000000..fd4044a6a08f ---- /dev/null -+++ b/fs/bcachefs/util.c -@@ -0,0 +1,907 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * random utiility code, for bcache but in theory not specific to bcache -+ * -+ * Copyright 2010, 2011 Kent Overstreet -+ * Copyright 2012 Google, Inc. 
-+ */ -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include "eytzinger.h" -+#include "util.h" -+ -+static const char si_units[] = "?kMGTPEZY"; -+ -+static int __bch2_strtoh(const char *cp, u64 *res, -+ u64 t_max, bool t_signed) -+{ -+ bool positive = *cp != '-'; -+ unsigned u; -+ u64 v = 0; -+ -+ if (*cp == '+' || *cp == '-') -+ cp++; -+ -+ if (!isdigit(*cp)) -+ return -EINVAL; -+ -+ do { -+ if (v > U64_MAX / 10) -+ return -ERANGE; -+ v *= 10; -+ if (v > U64_MAX - (*cp - '0')) -+ return -ERANGE; -+ v += *cp - '0'; -+ cp++; -+ } while (isdigit(*cp)); -+ -+ for (u = 1; u < strlen(si_units); u++) -+ if (*cp == si_units[u]) { -+ cp++; -+ goto got_unit; -+ } -+ u = 0; -+got_unit: -+ if (*cp == '\n') -+ cp++; -+ if (*cp) -+ return -EINVAL; -+ -+ if (fls64(v) + u * 10 > 64) -+ return -ERANGE; -+ -+ v <<= u * 10; -+ -+ if (positive) { -+ if (v > t_max) -+ return -ERANGE; -+ } else { -+ if (v && !t_signed) -+ return -ERANGE; -+ -+ if (v > t_max + 1) -+ return -ERANGE; -+ v = -v; -+ } -+ -+ *res = v; -+ return 0; -+} -+ -+#define STRTO_H(name, type) \ -+int bch2_ ## name ## _h(const char *cp, type *res) \ -+{ \ -+ u64 v; \ -+ int ret = __bch2_strtoh(cp, &v, ANYSINT_MAX(type), \ -+ ANYSINT_MAX(type) != ((type) ~0ULL)); \ -+ *res = v; \ -+ return ret; \ -+} -+ -+STRTO_H(strtoint, int) -+STRTO_H(strtouint, unsigned int) -+STRTO_H(strtoll, long long) -+STRTO_H(strtoull, unsigned long long) -+STRTO_H(strtou64, u64) -+ -+void bch2_hprint(struct printbuf *buf, s64 v) -+{ -+ int u, t = 0; -+ -+ for (u = 0; v >= 1024 || v <= -1024; u++) { -+ t = v & ~(~0U << 10); -+ v >>= 10; -+ } -+ -+ pr_buf(buf, "%lli", v); -+ -+ /* -+ * 103 is magic: t is in the range [-1023, 1023] and we want -+ * to turn it into [-9, 9] -+ */ -+ if (u && v < 100 && v > -100) -+ pr_buf(buf, ".%i", t / 103); -+ if (u) -+ pr_buf(buf, "%c", si_units[u]); -+} -+ -+void 
bch2_string_opt_to_text(struct printbuf *out, -+ const char * const list[], -+ size_t selected) -+{ -+ size_t i; -+ -+ for (i = 0; list[i]; i++) -+ pr_buf(out, i == selected ? "[%s] " : "%s ", list[i]); -+} -+ -+void bch2_flags_to_text(struct printbuf *out, -+ const char * const list[], u64 flags) -+{ -+ unsigned bit, nr = 0; -+ bool first = true; -+ -+ if (out->pos != out->end) -+ *out->pos = '\0'; -+ -+ while (list[nr]) -+ nr++; -+ -+ while (flags && (bit = __ffs(flags)) < nr) { -+ if (!first) -+ pr_buf(out, ","); -+ first = false; -+ pr_buf(out, "%s", list[bit]); -+ flags ^= 1 << bit; -+ } -+} -+ -+u64 bch2_read_flag_list(char *opt, const char * const list[]) -+{ -+ u64 ret = 0; -+ char *p, *s, *d = kstrndup(opt, PAGE_SIZE - 1, GFP_KERNEL); -+ -+ if (!d) -+ return -ENOMEM; -+ -+ s = strim(d); -+ -+ while ((p = strsep(&s, ","))) { -+ int flag = match_string(list, -1, p); -+ if (flag < 0) { -+ ret = -1; -+ break; -+ } -+ -+ ret |= 1 << flag; -+ } -+ -+ kfree(d); -+ -+ return ret; -+} -+ -+bool bch2_is_zero(const void *_p, size_t n) -+{ -+ const char *p = _p; -+ size_t i; -+ -+ for (i = 0; i < n; i++) -+ if (p[i]) -+ return false; -+ return true; -+} -+ -+static void bch2_quantiles_update(struct quantiles *q, u64 v) -+{ -+ unsigned i = 0; -+ -+ while (i < ARRAY_SIZE(q->entries)) { -+ struct quantile_entry *e = q->entries + i; -+ -+ if (unlikely(!e->step)) { -+ e->m = v; -+ e->step = max_t(unsigned, v / 2, 1024); -+ } else if (e->m > v) { -+ e->m = e->m >= e->step -+ ? e->m - e->step -+ : 0; -+ } else if (e->m < v) { -+ e->m = e->m + e->step > e->m -+ ? e->m + e->step -+ : U32_MAX; -+ } -+ -+ if ((e->m > v ? e->m - v : v - e->m) < e->step) -+ e->step = max_t(unsigned, e->step / 2, 1); -+ -+ if (v >= e->m) -+ break; -+ -+ i = eytzinger0_child(i, v > e->m); -+ } -+} -+ -+/* time stats: */ -+ -+static void bch2_time_stats_update_one(struct time_stats *stats, -+ u64 start, u64 end) -+{ -+ u64 duration, freq; -+ -+ duration = time_after64(end, start) -+ ? 
end - start : 0; -+ freq = time_after64(end, stats->last_event) -+ ? end - stats->last_event : 0; -+ -+ stats->count++; -+ -+ stats->average_duration = stats->average_duration -+ ? ewma_add(stats->average_duration, duration, 6) -+ : duration; -+ -+ stats->average_frequency = stats->average_frequency -+ ? ewma_add(stats->average_frequency, freq, 6) -+ : freq; -+ -+ stats->max_duration = max(stats->max_duration, duration); -+ -+ stats->last_event = end; -+ -+ bch2_quantiles_update(&stats->quantiles, duration); -+} -+ -+void __bch2_time_stats_update(struct time_stats *stats, u64 start, u64 end) -+{ -+ unsigned long flags; -+ -+ if (!stats->buffer) { -+ spin_lock_irqsave(&stats->lock, flags); -+ bch2_time_stats_update_one(stats, start, end); -+ -+ if (stats->average_frequency < 32 && -+ stats->count > 1024) -+ stats->buffer = -+ alloc_percpu_gfp(struct time_stat_buffer, -+ GFP_ATOMIC); -+ spin_unlock_irqrestore(&stats->lock, flags); -+ } else { -+ struct time_stat_buffer_entry *i; -+ struct time_stat_buffer *b; -+ -+ preempt_disable(); -+ b = this_cpu_ptr(stats->buffer); -+ -+ BUG_ON(b->nr >= ARRAY_SIZE(b->entries)); -+ b->entries[b->nr++] = (struct time_stat_buffer_entry) { -+ .start = start, -+ .end = end -+ }; -+ -+ if (b->nr == ARRAY_SIZE(b->entries)) { -+ spin_lock_irqsave(&stats->lock, flags); -+ for (i = b->entries; -+ i < b->entries + ARRAY_SIZE(b->entries); -+ i++) -+ bch2_time_stats_update_one(stats, i->start, i->end); -+ spin_unlock_irqrestore(&stats->lock, flags); -+ -+ b->nr = 0; -+ } -+ -+ preempt_enable(); -+ } -+} -+ -+static const struct time_unit { -+ const char *name; -+ u32 nsecs; -+} time_units[] = { -+ { "ns", 1 }, -+ { "us", NSEC_PER_USEC }, -+ { "ms", NSEC_PER_MSEC }, -+ { "sec", NSEC_PER_SEC }, -+}; -+ -+static const struct time_unit *pick_time_units(u64 ns) -+{ -+ const struct time_unit *u; -+ -+ for (u = time_units; -+ u + 1 < time_units + ARRAY_SIZE(time_units) && -+ ns >= u[1].nsecs << 1; -+ u++) -+ ; -+ -+ return u; -+} -+ -+static void 
pr_time_units(struct printbuf *out, u64 ns) -+{ -+ const struct time_unit *u = pick_time_units(ns); -+ -+ pr_buf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); -+} -+ -+void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats) -+{ -+ const struct time_unit *u; -+ u64 freq = READ_ONCE(stats->average_frequency); -+ u64 q, last_q = 0; -+ int i; -+ -+ pr_buf(out, "count:\t\t%llu\n", -+ stats->count); -+ pr_buf(out, "rate:\t\t%llu/sec\n", -+ freq ? div64_u64(NSEC_PER_SEC, freq) : 0); -+ -+ pr_buf(out, "frequency:\t"); -+ pr_time_units(out, freq); -+ -+ pr_buf(out, "\navg duration:\t"); -+ pr_time_units(out, stats->average_duration); -+ -+ pr_buf(out, "\nmax duration:\t"); -+ pr_time_units(out, stats->max_duration); -+ -+ i = eytzinger0_first(NR_QUANTILES); -+ u = pick_time_units(stats->quantiles.entries[i].m); -+ -+ pr_buf(out, "\nquantiles (%s):\t", u->name); -+ eytzinger0_for_each(i, NR_QUANTILES) { -+ bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; -+ -+ q = max(stats->quantiles.entries[i].m, last_q); -+ pr_buf(out, "%llu%s", -+ div_u64(q, u->nsecs), -+ is_last ? "\n" : " "); -+ last_q = q; -+ } -+} -+ -+void bch2_time_stats_exit(struct time_stats *stats) -+{ -+ free_percpu(stats->buffer); -+} -+ -+void bch2_time_stats_init(struct time_stats *stats) -+{ -+ memset(stats, 0, sizeof(*stats)); -+ spin_lock_init(&stats->lock); -+} -+ -+/* ratelimit: */ -+ -+/** -+ * bch2_ratelimit_delay() - return how long to delay until the next time to do -+ * some work -+ * -+ * @d - the struct bch_ratelimit to update -+ * -+ * Returns the amount of time to delay by, in jiffies -+ */ -+u64 bch2_ratelimit_delay(struct bch_ratelimit *d) -+{ -+ u64 now = local_clock(); -+ -+ return time_after64(d->next, now) -+ ? 
nsecs_to_jiffies(d->next - now) -+ : 0; -+} -+ -+/** -+ * bch2_ratelimit_increment() - increment @d by the amount of work done -+ * -+ * @d - the struct bch_ratelimit to update -+ * @done - the amount of work done, in arbitrary units -+ */ -+void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done) -+{ -+ u64 now = local_clock(); -+ -+ d->next += div_u64(done * NSEC_PER_SEC, d->rate); -+ -+ if (time_before64(now + NSEC_PER_SEC, d->next)) -+ d->next = now + NSEC_PER_SEC; -+ -+ if (time_after64(now - NSEC_PER_SEC * 2, d->next)) -+ d->next = now - NSEC_PER_SEC * 2; -+} -+ -+/* pd controller: */ -+ -+/* -+ * Updates pd_controller. Attempts to scale inputed values to units per second. -+ * @target: desired value -+ * @actual: current value -+ * -+ * @sign: 1 or -1; 1 if increasing the rate makes actual go up, -1 if increasing -+ * it makes actual go down. -+ */ -+void bch2_pd_controller_update(struct bch_pd_controller *pd, -+ s64 target, s64 actual, int sign) -+{ -+ s64 proportional, derivative, change; -+ -+ unsigned long seconds_since_update = (jiffies - pd->last_update) / HZ; -+ -+ if (seconds_since_update == 0) -+ return; -+ -+ pd->last_update = jiffies; -+ -+ proportional = actual - target; -+ proportional *= seconds_since_update; -+ proportional = div_s64(proportional, pd->p_term_inverse); -+ -+ derivative = actual - pd->last_actual; -+ derivative = div_s64(derivative, seconds_since_update); -+ derivative = ewma_add(pd->smoothed_derivative, derivative, -+ (pd->d_term / seconds_since_update) ?: 1); -+ derivative = derivative * pd->d_term; -+ derivative = div_s64(derivative, pd->p_term_inverse); -+ -+ change = proportional + derivative; -+ -+ /* Don't increase rate if not keeping up */ -+ if (change > 0 && -+ pd->backpressure && -+ time_after64(local_clock(), -+ pd->rate.next + NSEC_PER_MSEC)) -+ change = 0; -+ -+ change *= (sign * -1); -+ -+ pd->rate.rate = clamp_t(s64, (s64) pd->rate.rate + change, -+ 1, UINT_MAX); -+ -+ pd->last_actual = actual; -+ 
pd->last_derivative = derivative; -+ pd->last_proportional = proportional; -+ pd->last_change = change; -+ pd->last_target = target; -+} -+ -+void bch2_pd_controller_init(struct bch_pd_controller *pd) -+{ -+ pd->rate.rate = 1024; -+ pd->last_update = jiffies; -+ pd->p_term_inverse = 6000; -+ pd->d_term = 30; -+ pd->d_smooth = pd->d_term; -+ pd->backpressure = 1; -+} -+ -+size_t bch2_pd_controller_print_debug(struct bch_pd_controller *pd, char *buf) -+{ -+ /* 2^64 - 1 is 20 digits, plus null byte */ -+ char rate[21]; -+ char actual[21]; -+ char target[21]; -+ char proportional[21]; -+ char derivative[21]; -+ char change[21]; -+ s64 next_io; -+ -+ bch2_hprint(&PBUF(rate), pd->rate.rate); -+ bch2_hprint(&PBUF(actual), pd->last_actual); -+ bch2_hprint(&PBUF(target), pd->last_target); -+ bch2_hprint(&PBUF(proportional), pd->last_proportional); -+ bch2_hprint(&PBUF(derivative), pd->last_derivative); -+ bch2_hprint(&PBUF(change), pd->last_change); -+ -+ next_io = div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC); -+ -+ return sprintf(buf, -+ "rate:\t\t%s/sec\n" -+ "target:\t\t%s\n" -+ "actual:\t\t%s\n" -+ "proportional:\t%s\n" -+ "derivative:\t%s\n" -+ "change:\t\t%s/sec\n" -+ "next io:\t%llims\n", -+ rate, target, actual, proportional, -+ derivative, change, next_io); -+} -+ -+/* misc: */ -+ -+void bch2_bio_map(struct bio *bio, void *base, size_t size) -+{ -+ while (size) { -+ struct page *page = is_vmalloc_addr(base) -+ ? 
vmalloc_to_page(base) -+ : virt_to_page(base); -+ unsigned offset = offset_in_page(base); -+ unsigned len = min_t(size_t, PAGE_SIZE - offset, size); -+ -+ BUG_ON(!bio_add_page(bio, page, len, offset)); -+ size -= len; -+ base += len; -+ } -+} -+ -+int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask) -+{ -+ while (size) { -+ struct page *page = alloc_page(gfp_mask); -+ unsigned len = min(PAGE_SIZE, size); -+ -+ if (!page) -+ return -ENOMEM; -+ -+ BUG_ON(!bio_add_page(bio, page, len, 0)); -+ size -= len; -+ } -+ -+ return 0; -+} -+ -+size_t bch2_rand_range(size_t max) -+{ -+ size_t rand; -+ -+ if (!max) -+ return 0; -+ -+ do { -+ rand = get_random_long(); -+ rand &= roundup_pow_of_two(max) - 1; -+ } while (rand >= max); -+ -+ return rand; -+} -+ -+void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src) -+{ -+ struct bio_vec bv; -+ struct bvec_iter iter; -+ -+ __bio_for_each_segment(bv, dst, iter, dst_iter) { -+ void *dstp = kmap_atomic(bv.bv_page); -+ memcpy(dstp + bv.bv_offset, src, bv.bv_len); -+ kunmap_atomic(dstp); -+ -+ src += bv.bv_len; -+ } -+} -+ -+void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) -+{ -+ struct bio_vec bv; -+ struct bvec_iter iter; -+ -+ __bio_for_each_segment(bv, src, iter, src_iter) { -+ void *srcp = kmap_atomic(bv.bv_page); -+ memcpy(dst, srcp + bv.bv_offset, bv.bv_len); -+ kunmap_atomic(srcp); -+ -+ dst += bv.bv_len; -+ } -+} -+ -+void bch_scnmemcpy(struct printbuf *out, -+ const char *src, size_t len) -+{ -+ size_t n = printbuf_remaining(out); -+ -+ if (n) { -+ n = min(n - 1, len); -+ memcpy(out->pos, src, n); -+ out->pos += n; -+ *out->pos = '\0'; -+ } -+} -+ -+#include "eytzinger.h" -+ -+static int alignment_ok(const void *base, size_t align) -+{ -+ return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || -+ ((unsigned long)base & (align - 1)) == 0; -+} -+ -+static void u32_swap(void *a, void *b, size_t size) -+{ -+ u32 t = *(u32 *)a; -+ *(u32 *)a = *(u32 *)b; -+ 
*(u32 *)b = t; -+} -+ -+static void u64_swap(void *a, void *b, size_t size) -+{ -+ u64 t = *(u64 *)a; -+ *(u64 *)a = *(u64 *)b; -+ *(u64 *)b = t; -+} -+ -+static void generic_swap(void *a, void *b, size_t size) -+{ -+ char t; -+ -+ do { -+ t = *(char *)a; -+ *(char *)a++ = *(char *)b; -+ *(char *)b++ = t; -+ } while (--size > 0); -+} -+ -+static inline int do_cmp(void *base, size_t n, size_t size, -+ int (*cmp_func)(const void *, const void *, size_t), -+ size_t l, size_t r) -+{ -+ return cmp_func(base + inorder_to_eytzinger0(l, n) * size, -+ base + inorder_to_eytzinger0(r, n) * size, -+ size); -+} -+ -+static inline void do_swap(void *base, size_t n, size_t size, -+ void (*swap_func)(void *, void *, size_t), -+ size_t l, size_t r) -+{ -+ swap_func(base + inorder_to_eytzinger0(l, n) * size, -+ base + inorder_to_eytzinger0(r, n) * size, -+ size); -+} -+ -+void eytzinger0_sort(void *base, size_t n, size_t size, -+ int (*cmp_func)(const void *, const void *, size_t), -+ void (*swap_func)(void *, void *, size_t)) -+{ -+ int i, c, r; -+ -+ if (!swap_func) { -+ if (size == 4 && alignment_ok(base, 4)) -+ swap_func = u32_swap; -+ else if (size == 8 && alignment_ok(base, 8)) -+ swap_func = u64_swap; -+ else -+ swap_func = generic_swap; -+ } -+ -+ /* heapify */ -+ for (i = n / 2 - 1; i >= 0; --i) { -+ for (r = i; r * 2 + 1 < n; r = c) { -+ c = r * 2 + 1; -+ -+ if (c + 1 < n && -+ do_cmp(base, n, size, cmp_func, c, c + 1) < 0) -+ c++; -+ -+ if (do_cmp(base, n, size, cmp_func, r, c) >= 0) -+ break; -+ -+ do_swap(base, n, size, swap_func, r, c); -+ } -+ } -+ -+ /* sort */ -+ for (i = n - 1; i > 0; --i) { -+ do_swap(base, n, size, swap_func, 0, i); -+ -+ for (r = 0; r * 2 + 1 < i; r = c) { -+ c = r * 2 + 1; -+ -+ if (c + 1 < i && -+ do_cmp(base, n, size, cmp_func, c, c + 1) < 0) -+ c++; -+ -+ if (do_cmp(base, n, size, cmp_func, r, c) >= 0) -+ break; -+ -+ do_swap(base, n, size, swap_func, r, c); -+ } -+ } -+} -+ -+void sort_cmp_size(void *base, size_t num, size_t size, -+ int 
(*cmp_func)(const void *, const void *, size_t), -+ void (*swap_func)(void *, void *, size_t size)) -+{ -+ /* pre-scale counters for performance */ -+ int i = (num/2 - 1) * size, n = num * size, c, r; -+ -+ if (!swap_func) { -+ if (size == 4 && alignment_ok(base, 4)) -+ swap_func = u32_swap; -+ else if (size == 8 && alignment_ok(base, 8)) -+ swap_func = u64_swap; -+ else -+ swap_func = generic_swap; -+ } -+ -+ /* heapify */ -+ for ( ; i >= 0; i -= size) { -+ for (r = i; r * 2 + size < n; r = c) { -+ c = r * 2 + size; -+ if (c < n - size && -+ cmp_func(base + c, base + c + size, size) < 0) -+ c += size; -+ if (cmp_func(base + r, base + c, size) >= 0) -+ break; -+ swap_func(base + r, base + c, size); -+ } -+ } -+ -+ /* sort */ -+ for (i = n - size; i > 0; i -= size) { -+ swap_func(base, base + i, size); -+ for (r = 0; r * 2 + size < i; r = c) { -+ c = r * 2 + size; -+ if (c < i - size && -+ cmp_func(base + c, base + c + size, size) < 0) -+ c += size; -+ if (cmp_func(base + r, base + c, size) >= 0) -+ break; -+ swap_func(base + r, base + c, size); -+ } -+ } -+} -+ -+static void mempool_free_vp(void *element, void *pool_data) -+{ -+ size_t size = (size_t) pool_data; -+ -+ vpfree(element, size); -+} -+ -+static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data) -+{ -+ size_t size = (size_t) pool_data; -+ -+ return vpmalloc(size, gfp_mask); -+} -+ -+int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size) -+{ -+ return size < PAGE_SIZE -+ ? 
mempool_init_kmalloc_pool(pool, min_nr, size) -+ : mempool_init(pool, min_nr, mempool_alloc_vp, -+ mempool_free_vp, (void *) size); -+} -+ -+#if 0 -+void eytzinger1_test(void) -+{ -+ unsigned inorder, eytz, size; -+ -+ pr_info("1 based eytzinger test:"); -+ -+ for (size = 2; -+ size < 65536; -+ size++) { -+ unsigned extra = eytzinger1_extra(size); -+ -+ if (!(size % 4096)) -+ pr_info("tree size %u", size); -+ -+ BUG_ON(eytzinger1_prev(0, size) != eytzinger1_last(size)); -+ BUG_ON(eytzinger1_next(0, size) != eytzinger1_first(size)); -+ -+ BUG_ON(eytzinger1_prev(eytzinger1_first(size), size) != 0); -+ BUG_ON(eytzinger1_next(eytzinger1_last(size), size) != 0); -+ -+ inorder = 1; -+ eytzinger1_for_each(eytz, size) { -+ BUG_ON(__inorder_to_eytzinger1(inorder, size, extra) != eytz); -+ BUG_ON(__eytzinger1_to_inorder(eytz, size, extra) != inorder); -+ BUG_ON(eytz != eytzinger1_last(size) && -+ eytzinger1_prev(eytzinger1_next(eytz, size), size) != eytz); -+ -+ inorder++; -+ } -+ } -+} -+ -+void eytzinger0_test(void) -+{ -+ -+ unsigned inorder, eytz, size; -+ -+ pr_info("0 based eytzinger test:"); -+ -+ for (size = 1; -+ size < 65536; -+ size++) { -+ unsigned extra = eytzinger0_extra(size); -+ -+ if (!(size % 4096)) -+ pr_info("tree size %u", size); -+ -+ BUG_ON(eytzinger0_prev(-1, size) != eytzinger0_last(size)); -+ BUG_ON(eytzinger0_next(-1, size) != eytzinger0_first(size)); -+ -+ BUG_ON(eytzinger0_prev(eytzinger0_first(size), size) != -1); -+ BUG_ON(eytzinger0_next(eytzinger0_last(size), size) != -1); -+ -+ inorder = 0; -+ eytzinger0_for_each(eytz, size) { -+ BUG_ON(__inorder_to_eytzinger0(inorder, size, extra) != eytz); -+ BUG_ON(__eytzinger0_to_inorder(eytz, size, extra) != inorder); -+ BUG_ON(eytz != eytzinger0_last(size) && -+ eytzinger0_prev(eytzinger0_next(eytz, size), size) != eytz); -+ -+ inorder++; -+ } -+ } -+} -+ -+static inline int cmp_u16(const void *_l, const void *_r, size_t size) -+{ -+ const u16 *l = _l, *r = _r; -+ -+ return (*l > *r) - (*r - *l); -+} 
-+ -+static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search) -+{ -+ int i, c1 = -1, c2 = -1; -+ ssize_t r; -+ -+ r = eytzinger0_find_le(test_array, nr, -+ sizeof(test_array[0]), -+ cmp_u16, &search); -+ if (r >= 0) -+ c1 = test_array[r]; -+ -+ for (i = 0; i < nr; i++) -+ if (test_array[i] <= search && test_array[i] > c2) -+ c2 = test_array[i]; -+ -+ if (c1 != c2) { -+ eytzinger0_for_each(i, nr) -+ pr_info("[%3u] = %12u", i, test_array[i]); -+ pr_info("find_le(%2u) -> [%2zi] = %2i should be %2i", -+ i, r, c1, c2); -+ } -+} -+ -+void eytzinger0_find_test(void) -+{ -+ unsigned i, nr, allocated = 1 << 12; -+ u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL); -+ -+ for (nr = 1; nr < allocated; nr++) { -+ pr_info("testing %u elems", nr); -+ -+ get_random_bytes(test_array, nr * sizeof(test_array[0])); -+ eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL); -+ -+ /* verify array is sorted correctly: */ -+ eytzinger0_for_each(i, nr) -+ BUG_ON(i != eytzinger0_last(nr) && -+ test_array[i] > test_array[eytzinger0_next(i, nr)]); -+ -+ for (i = 0; i < U16_MAX; i += 1 << 12) -+ eytzinger0_find_test_val(test_array, nr, i); -+ -+ for (i = 0; i < nr; i++) { -+ eytzinger0_find_test_val(test_array, nr, test_array[i] - 1); -+ eytzinger0_find_test_val(test_array, nr, test_array[i]); -+ eytzinger0_find_test_val(test_array, nr, test_array[i] + 1); -+ } -+ } -+ -+ kfree(test_array); -+} -+#endif -+ -+/* -+ * Accumulate percpu counters onto one cpu's copy - only valid when access -+ * against any percpu counter is guarded against -+ */ -+u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr) -+{ -+ u64 *ret; -+ int cpu; -+ -+ preempt_disable(); -+ ret = this_cpu_ptr(p); -+ preempt_enable(); -+ -+ for_each_possible_cpu(cpu) { -+ u64 *i = per_cpu_ptr(p, cpu); -+ -+ if (i != ret) { -+ acc_u64s(ret, i, nr); -+ memset(i, 0, nr * sizeof(u64)); -+ } -+ } -+ -+ return ret; -+} -diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h 
-new file mode 100644 -index 000000000000..f48c6380684f ---- /dev/null -+++ b/fs/bcachefs/util.h -@@ -0,0 +1,761 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_UTIL_H -+#define _BCACHEFS_UTIL_H -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#define PAGE_SECTOR_SHIFT (PAGE_SHIFT - 9) -+#define PAGE_SECTORS (1UL << PAGE_SECTOR_SHIFT) -+ -+struct closure; -+ -+#ifdef CONFIG_BCACHEFS_DEBUG -+ -+#define EBUG_ON(cond) BUG_ON(cond) -+#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) -+#define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) -+#define atomic_sub_bug(i, v) BUG_ON(atomic_sub_return(i, v) < 0) -+#define atomic_add_bug(i, v) BUG_ON(atomic_add_return(i, v) < 0) -+#define atomic_long_dec_bug(v) BUG_ON(atomic_long_dec_return(v) < 0) -+#define atomic_long_sub_bug(i, v) BUG_ON(atomic_long_sub_return(i, v) < 0) -+#define atomic64_dec_bug(v) BUG_ON(atomic64_dec_return(v) < 0) -+#define atomic64_inc_bug(v, i) BUG_ON(atomic64_inc_return(v) <= i) -+#define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0) -+#define atomic64_add_bug(i, v) BUG_ON(atomic64_add_return(i, v) < 0) -+ -+#define memcpy(dst, src, len) \ -+({ \ -+ void *_dst = (dst); \ -+ const void *_src = (src); \ -+ size_t _len = (len); \ -+ \ -+ BUG_ON(!((void *) (_dst) >= (void *) (_src) + (_len) || \ -+ (void *) (_dst) + (_len) <= (void *) (_src))); \ -+ memcpy(_dst, _src, _len); \ -+}) -+ -+#else /* DEBUG */ -+ -+#define EBUG_ON(cond) -+#define atomic_dec_bug(v) atomic_dec(v) -+#define atomic_inc_bug(v, i) atomic_inc(v) -+#define atomic_sub_bug(i, v) atomic_sub(i, v) -+#define atomic_add_bug(i, v) atomic_add(i, v) -+#define atomic_long_dec_bug(v) atomic_long_dec(v) -+#define atomic_long_sub_bug(i, v) atomic_long_sub(i, v) -+#define atomic64_dec_bug(v) atomic64_dec(v) -+#define atomic64_inc_bug(v, i) atomic64_inc(v) -+#define 
atomic64_sub_bug(i, v) atomic64_sub(i, v) -+#define atomic64_add_bug(i, v) atomic64_add(i, v) -+ -+#endif -+ -+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -+#define CPU_BIG_ENDIAN 0 -+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ -+#define CPU_BIG_ENDIAN 1 -+#endif -+ -+/* type hackery */ -+ -+#define type_is_exact(_val, _type) \ -+ __builtin_types_compatible_p(typeof(_val), _type) -+ -+#define type_is(_val, _type) \ -+ (__builtin_types_compatible_p(typeof(_val), _type) || \ -+ __builtin_types_compatible_p(typeof(_val), const _type)) -+ -+/* Userspace doesn't align allocations as nicely as the kernel allocators: */ -+static inline size_t buf_pages(void *p, size_t len) -+{ -+ return DIV_ROUND_UP(len + -+ ((unsigned long) p & (PAGE_SIZE - 1)), -+ PAGE_SIZE); -+} -+ -+static inline void vpfree(void *p, size_t size) -+{ -+ if (is_vmalloc_addr(p)) -+ vfree(p); -+ else -+ free_pages((unsigned long) p, get_order(size)); -+} -+ -+static inline void *vpmalloc(size_t size, gfp_t gfp_mask) -+{ -+ return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN, -+ get_order(size)) ?: -+ __vmalloc(size, gfp_mask); -+} -+ -+static inline void kvpfree(void *p, size_t size) -+{ -+ if (size < PAGE_SIZE) -+ kfree(p); -+ else -+ vpfree(p, size); -+} -+ -+static inline void *kvpmalloc(size_t size, gfp_t gfp_mask) -+{ -+ return size < PAGE_SIZE -+ ? 
kmalloc(size, gfp_mask) -+ : vpmalloc(size, gfp_mask); -+} -+ -+int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t); -+ -+#define HEAP(type) \ -+struct { \ -+ size_t size, used; \ -+ type *data; \ -+} -+ -+#define DECLARE_HEAP(type, name) HEAP(type) name -+ -+#define init_heap(heap, _size, gfp) \ -+({ \ -+ (heap)->used = 0; \ -+ (heap)->size = (_size); \ -+ (heap)->data = kvpmalloc((heap)->size * sizeof((heap)->data[0]),\ -+ (gfp)); \ -+}) -+ -+#define free_heap(heap) \ -+do { \ -+ kvpfree((heap)->data, (heap)->size * sizeof((heap)->data[0])); \ -+ (heap)->data = NULL; \ -+} while (0) -+ -+#define heap_set_backpointer(h, i, _fn) \ -+do { \ -+ void (*fn)(typeof(h), size_t) = _fn; \ -+ if (fn) \ -+ fn(h, i); \ -+} while (0) -+ -+#define heap_swap(h, i, j, set_backpointer) \ -+do { \ -+ swap((h)->data[i], (h)->data[j]); \ -+ heap_set_backpointer(h, i, set_backpointer); \ -+ heap_set_backpointer(h, j, set_backpointer); \ -+} while (0) -+ -+#define heap_peek(h) \ -+({ \ -+ EBUG_ON(!(h)->used); \ -+ (h)->data[0]; \ -+}) -+ -+#define heap_full(h) ((h)->used == (h)->size) -+ -+#define heap_sift_down(h, i, cmp, set_backpointer) \ -+do { \ -+ size_t _c, _j = i; \ -+ \ -+ for (; _j * 2 + 1 < (h)->used; _j = _c) { \ -+ _c = _j * 2 + 1; \ -+ if (_c + 1 < (h)->used && \ -+ cmp(h, (h)->data[_c], (h)->data[_c + 1]) >= 0) \ -+ _c++; \ -+ \ -+ if (cmp(h, (h)->data[_c], (h)->data[_j]) >= 0) \ -+ break; \ -+ heap_swap(h, _c, _j, set_backpointer); \ -+ } \ -+} while (0) -+ -+#define heap_sift_up(h, i, cmp, set_backpointer) \ -+do { \ -+ while (i) { \ -+ size_t p = (i - 1) / 2; \ -+ if (cmp(h, (h)->data[i], (h)->data[p]) >= 0) \ -+ break; \ -+ heap_swap(h, i, p, set_backpointer); \ -+ i = p; \ -+ } \ -+} while (0) -+ -+#define __heap_add(h, d, cmp, set_backpointer) \ -+({ \ -+ size_t _i = (h)->used++; \ -+ (h)->data[_i] = d; \ -+ heap_set_backpointer(h, _i, set_backpointer); \ -+ \ -+ heap_sift_up(h, _i, cmp, set_backpointer); \ -+ _i; \ -+}) -+ -+#define heap_add(h, d, cmp, 
set_backpointer) \ -+({ \ -+ bool _r = !heap_full(h); \ -+ if (_r) \ -+ __heap_add(h, d, cmp, set_backpointer); \ -+ _r; \ -+}) -+ -+#define heap_add_or_replace(h, new, cmp, set_backpointer) \ -+do { \ -+ if (!heap_add(h, new, cmp, set_backpointer) && \ -+ cmp(h, new, heap_peek(h)) >= 0) { \ -+ (h)->data[0] = new; \ -+ heap_set_backpointer(h, 0, set_backpointer); \ -+ heap_sift_down(h, 0, cmp, set_backpointer); \ -+ } \ -+} while (0) -+ -+#define heap_del(h, i, cmp, set_backpointer) \ -+do { \ -+ size_t _i = (i); \ -+ \ -+ BUG_ON(_i >= (h)->used); \ -+ (h)->used--; \ -+ heap_swap(h, _i, (h)->used, set_backpointer); \ -+ heap_sift_up(h, _i, cmp, set_backpointer); \ -+ heap_sift_down(h, _i, cmp, set_backpointer); \ -+} while (0) -+ -+#define heap_pop(h, d, cmp, set_backpointer) \ -+({ \ -+ bool _r = (h)->used; \ -+ if (_r) { \ -+ (d) = (h)->data[0]; \ -+ heap_del(h, 0, cmp, set_backpointer); \ -+ } \ -+ _r; \ -+}) -+ -+#define heap_resort(heap, cmp, set_backpointer) \ -+do { \ -+ ssize_t _i; \ -+ for (_i = (ssize_t) (heap)->used / 2 - 1; _i >= 0; --_i) \ -+ heap_sift_down(heap, _i, cmp, set_backpointer); \ -+} while (0) -+ -+#define ANYSINT_MAX(t) \ -+ ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) -+ -+struct printbuf { -+ char *pos; -+ char *end; -+}; -+ -+static inline size_t printbuf_remaining(struct printbuf *buf) -+{ -+ return buf->end - buf->pos; -+} -+ -+#define _PBUF(_buf, _len) \ -+ ((struct printbuf) { \ -+ .pos = _buf, \ -+ .end = _buf + _len, \ -+ }) -+ -+#define PBUF(_buf) _PBUF(_buf, sizeof(_buf)) -+ -+#define pr_buf(_out, ...) 
\ -+do { \ -+ (_out)->pos += scnprintf((_out)->pos, printbuf_remaining(_out), \ -+ __VA_ARGS__); \ -+} while (0) -+ -+void bch_scnmemcpy(struct printbuf *, const char *, size_t); -+ -+int bch2_strtoint_h(const char *, int *); -+int bch2_strtouint_h(const char *, unsigned int *); -+int bch2_strtoll_h(const char *, long long *); -+int bch2_strtoull_h(const char *, unsigned long long *); -+int bch2_strtou64_h(const char *, u64 *); -+ -+static inline int bch2_strtol_h(const char *cp, long *res) -+{ -+#if BITS_PER_LONG == 32 -+ return bch2_strtoint_h(cp, (int *) res); -+#else -+ return bch2_strtoll_h(cp, (long long *) res); -+#endif -+} -+ -+static inline int bch2_strtoul_h(const char *cp, long *res) -+{ -+#if BITS_PER_LONG == 32 -+ return bch2_strtouint_h(cp, (unsigned int *) res); -+#else -+ return bch2_strtoull_h(cp, (unsigned long long *) res); -+#endif -+} -+ -+#define strtoi_h(cp, res) \ -+ ( type_is(*res, int) ? bch2_strtoint_h(cp, (void *) res)\ -+ : type_is(*res, long) ? bch2_strtol_h(cp, (void *) res)\ -+ : type_is(*res, long long) ? bch2_strtoll_h(cp, (void *) res)\ -+ : type_is(*res, unsigned) ? bch2_strtouint_h(cp, (void *) res)\ -+ : type_is(*res, unsigned long) ? bch2_strtoul_h(cp, (void *) res)\ -+ : type_is(*res, unsigned long long) ? bch2_strtoull_h(cp, (void *) res)\ -+ : -EINVAL) -+ -+#define strtoul_safe(cp, var) \ -+({ \ -+ unsigned long _v; \ -+ int _r = kstrtoul(cp, 10, &_v); \ -+ if (!_r) \ -+ var = _v; \ -+ _r; \ -+}) -+ -+#define strtoul_safe_clamp(cp, var, min, max) \ -+({ \ -+ unsigned long _v; \ -+ int _r = kstrtoul(cp, 10, &_v); \ -+ if (!_r) \ -+ var = clamp_t(typeof(var), _v, min, max); \ -+ _r; \ -+}) -+ -+#define strtoul_safe_restrict(cp, var, min, max) \ -+({ \ -+ unsigned long _v; \ -+ int _r = kstrtoul(cp, 10, &_v); \ -+ if (!_r && _v >= min && _v <= max) \ -+ var = _v; \ -+ else \ -+ _r = -EINVAL; \ -+ _r; \ -+}) -+ -+#define snprint(buf, size, var) \ -+ snprintf(buf, size, \ -+ type_is(var, int) ? 
"%i\n" \ -+ : type_is(var, unsigned) ? "%u\n" \ -+ : type_is(var, long) ? "%li\n" \ -+ : type_is(var, unsigned long) ? "%lu\n" \ -+ : type_is(var, s64) ? "%lli\n" \ -+ : type_is(var, u64) ? "%llu\n" \ -+ : type_is(var, char *) ? "%s\n" \ -+ : "%i\n", var) -+ -+void bch2_hprint(struct printbuf *, s64); -+ -+bool bch2_is_zero(const void *, size_t); -+ -+void bch2_string_opt_to_text(struct printbuf *, -+ const char * const [], size_t); -+ -+void bch2_flags_to_text(struct printbuf *, const char * const[], u64); -+u64 bch2_read_flag_list(char *, const char * const[]); -+ -+#define NR_QUANTILES 15 -+#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) -+#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES) -+#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES) -+ -+struct quantiles { -+ struct quantile_entry { -+ u64 m; -+ u64 step; -+ } entries[NR_QUANTILES]; -+}; -+ -+struct time_stat_buffer { -+ unsigned nr; -+ struct time_stat_buffer_entry { -+ u64 start; -+ u64 end; -+ } entries[32]; -+}; -+ -+struct time_stats { -+ spinlock_t lock; -+ u64 count; -+ /* all fields are in nanoseconds */ -+ u64 average_duration; -+ u64 average_frequency; -+ u64 max_duration; -+ u64 last_event; -+ struct quantiles quantiles; -+ -+ struct time_stat_buffer __percpu *buffer; -+}; -+ -+void __bch2_time_stats_update(struct time_stats *stats, u64, u64); -+ -+static inline void bch2_time_stats_update(struct time_stats *stats, u64 start) -+{ -+ __bch2_time_stats_update(stats, start, local_clock()); -+} -+ -+void bch2_time_stats_to_text(struct printbuf *, struct time_stats *); -+ -+void bch2_time_stats_exit(struct time_stats *); -+void bch2_time_stats_init(struct time_stats *); -+ -+#define ewma_add(ewma, val, weight) \ -+({ \ -+ typeof(ewma) _ewma = (ewma); \ -+ typeof(weight) _weight = (weight); \ -+ \ -+ (((_ewma << _weight) - _ewma) + (val)) >> _weight; \ -+}) -+ -+struct bch_ratelimit { -+ /* Next time we want to do some work, in nanoseconds */ -+ u64 next; -+ -+ /* -+ * Rate at 
which we want to do work, in units per nanosecond -+ * The units here correspond to the units passed to -+ * bch2_ratelimit_increment() -+ */ -+ unsigned rate; -+}; -+ -+static inline void bch2_ratelimit_reset(struct bch_ratelimit *d) -+{ -+ d->next = local_clock(); -+} -+ -+u64 bch2_ratelimit_delay(struct bch_ratelimit *); -+void bch2_ratelimit_increment(struct bch_ratelimit *, u64); -+ -+struct bch_pd_controller { -+ struct bch_ratelimit rate; -+ unsigned long last_update; -+ -+ s64 last_actual; -+ s64 smoothed_derivative; -+ -+ unsigned p_term_inverse; -+ unsigned d_smooth; -+ unsigned d_term; -+ -+ /* for exporting to sysfs (no effect on behavior) */ -+ s64 last_derivative; -+ s64 last_proportional; -+ s64 last_change; -+ s64 last_target; -+ -+ /* If true, the rate will not increase if bch2_ratelimit_delay() -+ * is not being called often enough. */ -+ bool backpressure; -+}; -+ -+void bch2_pd_controller_update(struct bch_pd_controller *, s64, s64, int); -+void bch2_pd_controller_init(struct bch_pd_controller *); -+size_t bch2_pd_controller_print_debug(struct bch_pd_controller *, char *); -+ -+#define sysfs_pd_controller_attribute(name) \ -+ rw_attribute(name##_rate); \ -+ rw_attribute(name##_rate_bytes); \ -+ rw_attribute(name##_rate_d_term); \ -+ rw_attribute(name##_rate_p_term_inverse); \ -+ read_attribute(name##_rate_debug) -+ -+#define sysfs_pd_controller_files(name) \ -+ &sysfs_##name##_rate, \ -+ &sysfs_##name##_rate_bytes, \ -+ &sysfs_##name##_rate_d_term, \ -+ &sysfs_##name##_rate_p_term_inverse, \ -+ &sysfs_##name##_rate_debug -+ -+#define sysfs_pd_controller_show(name, var) \ -+do { \ -+ sysfs_hprint(name##_rate, (var)->rate.rate); \ -+ sysfs_print(name##_rate_bytes, (var)->rate.rate); \ -+ sysfs_print(name##_rate_d_term, (var)->d_term); \ -+ sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \ -+ \ -+ if (attr == &sysfs_##name##_rate_debug) \ -+ return bch2_pd_controller_print_debug(var, buf); \ -+} while (0) -+ -+#define 
sysfs_pd_controller_store(name, var) \ -+do { \ -+ sysfs_strtoul_clamp(name##_rate, \ -+ (var)->rate.rate, 1, UINT_MAX); \ -+ sysfs_strtoul_clamp(name##_rate_bytes, \ -+ (var)->rate.rate, 1, UINT_MAX); \ -+ sysfs_strtoul(name##_rate_d_term, (var)->d_term); \ -+ sysfs_strtoul_clamp(name##_rate_p_term_inverse, \ -+ (var)->p_term_inverse, 1, INT_MAX); \ -+} while (0) -+ -+#define container_of_or_null(ptr, type, member) \ -+({ \ -+ typeof(ptr) _ptr = ptr; \ -+ _ptr ? container_of(_ptr, type, member) : NULL; \ -+}) -+ -+/* Does linear interpolation between powers of two */ -+static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) -+{ -+ unsigned fract = x & ~(~0 << fract_bits); -+ -+ x >>= fract_bits; -+ x = 1 << x; -+ x += (x * fract) >> fract_bits; -+ -+ return x; -+} -+ -+void bch2_bio_map(struct bio *bio, void *base, size_t); -+int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t); -+ -+static inline sector_t bdev_sectors(struct block_device *bdev) -+{ -+ return bdev->bd_inode->i_size >> 9; -+} -+ -+#define closure_bio_submit(bio, cl) \ -+do { \ -+ closure_get(cl); \ -+ submit_bio(bio); \ -+} while (0) -+ -+#define kthread_wait_freezable(cond) \ -+({ \ -+ int _ret = 0; \ -+ while (1) { \ -+ set_current_state(TASK_INTERRUPTIBLE); \ -+ if (kthread_should_stop()) { \ -+ _ret = -1; \ -+ break; \ -+ } \ -+ \ -+ if (cond) \ -+ break; \ -+ \ -+ schedule(); \ -+ try_to_freeze(); \ -+ } \ -+ set_current_state(TASK_RUNNING); \ -+ _ret; \ -+}) -+ -+size_t bch2_rand_range(size_t); -+ -+void memcpy_to_bio(struct bio *, struct bvec_iter, const void *); -+void memcpy_from_bio(void *, struct bio *, struct bvec_iter); -+ -+static inline void memcpy_u64s_small(void *dst, const void *src, -+ unsigned u64s) -+{ -+ u64 *d = dst; -+ const u64 *s = src; -+ -+ while (u64s--) -+ *d++ = *s++; -+} -+ -+static inline void __memcpy_u64s(void *dst, const void *src, -+ unsigned u64s) -+{ -+#ifdef CONFIG_X86_64 -+ long d0, d1, d2; -+ asm volatile("rep ; movsq" -+ : "=&c" (d0), "=&D" 
(d1), "=&S" (d2) -+ : "0" (u64s), "1" (dst), "2" (src) -+ : "memory"); -+#else -+ u64 *d = dst; -+ const u64 *s = src; -+ -+ while (u64s--) -+ *d++ = *s++; -+#endif -+} -+ -+static inline void memcpy_u64s(void *dst, const void *src, -+ unsigned u64s) -+{ -+ EBUG_ON(!(dst >= src + u64s * sizeof(u64) || -+ dst + u64s * sizeof(u64) <= src)); -+ -+ __memcpy_u64s(dst, src, u64s); -+} -+ -+static inline void __memmove_u64s_down(void *dst, const void *src, -+ unsigned u64s) -+{ -+ __memcpy_u64s(dst, src, u64s); -+} -+ -+static inline void memmove_u64s_down(void *dst, const void *src, -+ unsigned u64s) -+{ -+ EBUG_ON(dst > src); -+ -+ __memmove_u64s_down(dst, src, u64s); -+} -+ -+static inline void __memmove_u64s_up_small(void *_dst, const void *_src, -+ unsigned u64s) -+{ -+ u64 *dst = (u64 *) _dst + u64s; -+ u64 *src = (u64 *) _src + u64s; -+ -+ while (u64s--) -+ *--dst = *--src; -+} -+ -+static inline void memmove_u64s_up_small(void *dst, const void *src, -+ unsigned u64s) -+{ -+ EBUG_ON(dst < src); -+ -+ __memmove_u64s_up_small(dst, src, u64s); -+} -+ -+static inline void __memmove_u64s_up(void *_dst, const void *_src, -+ unsigned u64s) -+{ -+ u64 *dst = (u64 *) _dst + u64s - 1; -+ u64 *src = (u64 *) _src + u64s - 1; -+ -+#ifdef CONFIG_X86_64 -+ long d0, d1, d2; -+ asm volatile("std ;\n" -+ "rep ; movsq\n" -+ "cld ;\n" -+ : "=&c" (d0), "=&D" (d1), "=&S" (d2) -+ : "0" (u64s), "1" (dst), "2" (src) -+ : "memory"); -+#else -+ while (u64s--) -+ *dst-- = *src--; -+#endif -+} -+ -+static inline void memmove_u64s_up(void *dst, const void *src, -+ unsigned u64s) -+{ -+ EBUG_ON(dst < src); -+ -+ __memmove_u64s_up(dst, src, u64s); -+} -+ -+static inline void memmove_u64s(void *dst, const void *src, -+ unsigned u64s) -+{ -+ if (dst < src) -+ __memmove_u64s_down(dst, src, u64s); -+ else -+ __memmove_u64s_up(dst, src, u64s); -+} -+ -+/* Set the last few bytes up to a u64 boundary given an offset into a buffer. 
*/ -+static inline void memset_u64s_tail(void *s, int c, unsigned bytes) -+{ -+ unsigned rem = round_up(bytes, sizeof(u64)) - bytes; -+ -+ memset(s + bytes, c, rem); -+} -+ -+void sort_cmp_size(void *base, size_t num, size_t size, -+ int (*cmp_func)(const void *, const void *, size_t), -+ void (*swap_func)(void *, void *, size_t)); -+ -+/* just the memmove, doesn't update @_nr */ -+#define __array_insert_item(_array, _nr, _pos) \ -+ memmove(&(_array)[(_pos) + 1], \ -+ &(_array)[(_pos)], \ -+ sizeof((_array)[0]) * ((_nr) - (_pos))) -+ -+#define array_insert_item(_array, _nr, _pos, _new_item) \ -+do { \ -+ __array_insert_item(_array, _nr, _pos); \ -+ (_nr)++; \ -+ (_array)[(_pos)] = (_new_item); \ -+} while (0) -+ -+#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \ -+do { \ -+ (_nr) -= (_nr_to_remove); \ -+ memmove(&(_array)[(_pos)], \ -+ &(_array)[(_pos) + (_nr_to_remove)], \ -+ sizeof((_array)[0]) * ((_nr) - (_pos))); \ -+} while (0) -+ -+#define array_remove_item(_array, _nr, _pos) \ -+ array_remove_items(_array, _nr, _pos, 1) -+ -+#define bubble_sort(_base, _nr, _cmp) \ -+do { \ -+ ssize_t _i, _end; \ -+ bool _swapped = true; \ -+ \ -+ for (_end = (ssize_t) (_nr) - 1; _end > 0 && _swapped; --_end) {\ -+ _swapped = false; \ -+ for (_i = 0; _i < _end; _i++) \ -+ if (_cmp((_base)[_i], (_base)[_i + 1]) > 0) { \ -+ swap((_base)[_i], (_base)[_i + 1]); \ -+ _swapped = true; \ -+ } \ -+ } \ -+} while (0) -+ -+static inline u64 percpu_u64_get(u64 __percpu *src) -+{ -+ u64 ret = 0; -+ int cpu; -+ -+ for_each_possible_cpu(cpu) -+ ret += *per_cpu_ptr(src, cpu); -+ return ret; -+} -+ -+static inline void percpu_u64_set(u64 __percpu *dst, u64 src) -+{ -+ int cpu; -+ -+ for_each_possible_cpu(cpu) -+ *per_cpu_ptr(dst, cpu) = 0; -+ -+ preempt_disable(); -+ *this_cpu_ptr(dst) = src; -+ preempt_enable(); -+} -+ -+static inline void acc_u64s(u64 *acc, const u64 *src, unsigned nr) -+{ -+ unsigned i; -+ -+ for (i = 0; i < nr; i++) -+ acc[i] += src[i]; -+} -+ -+static 
inline void acc_u64s_percpu(u64 *acc, const u64 __percpu *src, -+ unsigned nr) -+{ -+ int cpu; -+ -+ for_each_possible_cpu(cpu) -+ acc_u64s(acc, per_cpu_ptr(src, cpu), nr); -+} -+ -+static inline void percpu_memset(void __percpu *p, int c, size_t bytes) -+{ -+ int cpu; -+ -+ for_each_possible_cpu(cpu) -+ memset(per_cpu_ptr(p, cpu), c, bytes); -+} -+ -+u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned); -+ -+#define cmp_int(l, r) ((l > r) - (l < r)) -+ -+#endif /* _BCACHEFS_UTIL_H */ -diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h -new file mode 100644 -index 000000000000..c099cdc0605f ---- /dev/null -+++ b/fs/bcachefs/vstructs.h -@@ -0,0 +1,63 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _VSTRUCTS_H -+#define _VSTRUCTS_H -+ -+#include "util.h" -+ -+/* -+ * NOTE: we can't differentiate between __le64 and u64 with type_is - this -+ * assumes u64 is little endian: -+ */ -+#define __vstruct_u64s(_s) \ -+({ \ -+ ( type_is((_s)->u64s, u64) ? le64_to_cpu((__force __le64) (_s)->u64s) \ -+ : type_is((_s)->u64s, u32) ? le32_to_cpu((__force __le32) (_s)->u64s) \ -+ : type_is((_s)->u64s, u16) ? 
le16_to_cpu((__force __le16) (_s)->u64s) \ -+ : ((__force u8) ((_s)->u64s))); \ -+}) -+ -+#define __vstruct_bytes(_type, _u64s) \ -+({ \ -+ BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64)); \ -+ \ -+ (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \ -+}) -+ -+#define vstruct_bytes(_s) \ -+ __vstruct_bytes(typeof(*(_s)), __vstruct_u64s(_s)) -+ -+#define __vstruct_blocks(_type, _sector_block_bits, _u64s) \ -+ (round_up(__vstruct_bytes(_type, _u64s), \ -+ 512 << (_sector_block_bits)) >> (9 + (_sector_block_bits))) -+ -+#define vstruct_blocks(_s, _sector_block_bits) \ -+ __vstruct_blocks(typeof(*(_s)), _sector_block_bits, __vstruct_u64s(_s)) -+ -+#define vstruct_blocks_plus(_s, _sector_block_bits, _u64s) \ -+ __vstruct_blocks(typeof(*(_s)), _sector_block_bits, \ -+ __vstruct_u64s(_s) + (_u64s)) -+ -+#define vstruct_sectors(_s, _sector_block_bits) \ -+ (round_up(vstruct_bytes(_s), 512 << (_sector_block_bits)) >> 9) -+ -+#define vstruct_next(_s) \ -+ ((typeof(_s)) ((_s)->_data + __vstruct_u64s(_s))) -+#define vstruct_last(_s) \ -+ ((typeof(&(_s)->start[0])) ((_s)->_data + __vstruct_u64s(_s))) -+#define vstruct_end(_s) \ -+ ((void *) ((_s)->_data + __vstruct_u64s(_s))) -+ -+#define vstruct_for_each(_s, _i) \ -+ for (_i = (_s)->start; \ -+ _i < vstruct_last(_s); \ -+ _i = vstruct_next(_i)) -+ -+#define vstruct_for_each_safe(_s, _i, _t) \ -+ for (_i = (_s)->start; \ -+ _i < vstruct_last(_s) && (_t = vstruct_next(_i), true); \ -+ _i = _t) -+ -+#define vstruct_idx(_s, _idx) \ -+ ((typeof(&(_s)->start[0])) ((_s)->_data + (_idx))) -+ -+#endif /* _VSTRUCTS_H */ -diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c -new file mode 100644 -index 000000000000..21f64cb7e402 ---- /dev/null -+++ b/fs/bcachefs/xattr.c -@@ -0,0 +1,586 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include "bcachefs.h" -+#include "bkey_methods.h" -+#include "btree_update.h" -+#include "extents.h" -+#include "fs.h" -+#include "rebalance.h" -+#include "str_hash.h" -+#include "xattr.h" -+ -+#include 
-+#include -+#include -+ -+static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned); -+ -+static u64 bch2_xattr_hash(const struct bch_hash_info *info, -+ const struct xattr_search_key *key) -+{ -+ struct bch_str_hash_ctx ctx; -+ -+ bch2_str_hash_init(&ctx, info); -+ bch2_str_hash_update(&ctx, info, &key->type, sizeof(key->type)); -+ bch2_str_hash_update(&ctx, info, key->name.name, key->name.len); -+ -+ return bch2_str_hash_end(&ctx, info); -+} -+ -+static u64 xattr_hash_key(const struct bch_hash_info *info, const void *key) -+{ -+ return bch2_xattr_hash(info, key); -+} -+ -+static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) -+{ -+ struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k); -+ -+ return bch2_xattr_hash(info, -+ &X_SEARCH(x.v->x_type, x.v->x_name, x.v->x_name_len)); -+} -+ -+static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r) -+{ -+ struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l); -+ const struct xattr_search_key *r = _r; -+ -+ return l.v->x_type != r->type || -+ l.v->x_name_len != r->name.len || -+ memcmp(l.v->x_name, r->name.name, r->name.len); -+} -+ -+static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) -+{ -+ struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l); -+ struct bkey_s_c_xattr r = bkey_s_c_to_xattr(_r); -+ -+ return l.v->x_type != r.v->x_type || -+ l.v->x_name_len != r.v->x_name_len || -+ memcmp(l.v->x_name, r.v->x_name, r.v->x_name_len); -+} -+ -+const struct bch_hash_desc bch2_xattr_hash_desc = { -+ .btree_id = BTREE_ID_XATTRS, -+ .key_type = KEY_TYPE_xattr, -+ .hash_key = xattr_hash_key, -+ .hash_bkey = xattr_hash_bkey, -+ .cmp_key = xattr_cmp_key, -+ .cmp_bkey = xattr_cmp_bkey, -+}; -+ -+const char *bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k) -+{ -+ const struct xattr_handler *handler; -+ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); -+ -+ if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr)) -+ return "value too small"; -+ -+ if (bkey_val_u64s(k.k) < -+ 
xattr_val_u64s(xattr.v->x_name_len, -+ le16_to_cpu(xattr.v->x_val_len))) -+ return "value too small"; -+ -+ if (bkey_val_u64s(k.k) > -+ xattr_val_u64s(xattr.v->x_name_len, -+ le16_to_cpu(xattr.v->x_val_len) + 4)) -+ return "value too big"; -+ -+ handler = bch2_xattr_type_to_handler(xattr.v->x_type); -+ if (!handler) -+ return "invalid type"; -+ -+ if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len)) -+ return "xattr name has invalid characters"; -+ -+ return NULL; -+} -+ -+void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, -+ struct bkey_s_c k) -+{ -+ const struct xattr_handler *handler; -+ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); -+ -+ handler = bch2_xattr_type_to_handler(xattr.v->x_type); -+ if (handler && handler->prefix) -+ pr_buf(out, "%s", handler->prefix); -+ else if (handler) -+ pr_buf(out, "(type %u)", xattr.v->x_type); -+ else -+ pr_buf(out, "(unknown type %u)", xattr.v->x_type); -+ -+ bch_scnmemcpy(out, xattr.v->x_name, -+ xattr.v->x_name_len); -+ pr_buf(out, ":"); -+ bch_scnmemcpy(out, xattr_val(xattr.v), -+ le16_to_cpu(xattr.v->x_val_len)); -+} -+ -+int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, -+ const char *name, void *buffer, size_t size, int type) -+{ -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c_xattr xattr; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, -+ &inode->ei_str_hash, inode->v.i_ino, -+ &X_SEARCH(type, name, strlen(name)), -+ 0); -+ if (IS_ERR(iter)) { -+ bch2_trans_exit(&trans); -+ BUG_ON(PTR_ERR(iter) == -EINTR); -+ -+ return PTR_ERR(iter) == -ENOENT ? 
-ENODATA : PTR_ERR(iter); -+ } -+ -+ xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); -+ ret = le16_to_cpu(xattr.v->x_val_len); -+ if (buffer) { -+ if (ret > size) -+ ret = -ERANGE; -+ else -+ memcpy(buffer, xattr_val(xattr.v), ret); -+ } -+ -+ bch2_trans_exit(&trans); -+ return ret; -+} -+ -+int bch2_xattr_set(struct btree_trans *trans, u64 inum, -+ const struct bch_hash_info *hash_info, -+ const char *name, const void *value, size_t size, -+ int type, int flags) -+{ -+ int ret; -+ -+ if (value) { -+ struct bkey_i_xattr *xattr; -+ unsigned namelen = strlen(name); -+ unsigned u64s = BKEY_U64s + -+ xattr_val_u64s(namelen, size); -+ -+ if (u64s > U8_MAX) -+ return -ERANGE; -+ -+ xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); -+ if (IS_ERR(xattr)) -+ return PTR_ERR(xattr); -+ -+ bkey_xattr_init(&xattr->k_i); -+ xattr->k.u64s = u64s; -+ xattr->v.x_type = type; -+ xattr->v.x_name_len = namelen; -+ xattr->v.x_val_len = cpu_to_le16(size); -+ memcpy(xattr->v.x_name, name, namelen); -+ memcpy(xattr_val(&xattr->v), value, size); -+ -+ ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, -+ inum, &xattr->k_i, -+ (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)| -+ (flags & XATTR_REPLACE ? BCH_HASH_SET_MUST_REPLACE : 0)); -+ } else { -+ struct xattr_search_key search = -+ X_SEARCH(type, name, strlen(name)); -+ -+ ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, -+ hash_info, inum, &search); -+ } -+ -+ if (ret == -ENOENT) -+ ret = flags & XATTR_REPLACE ? 
-ENODATA : 0; -+ -+ return ret; -+} -+ -+struct xattr_buf { -+ char *buf; -+ size_t len; -+ size_t used; -+}; -+ -+static int __bch2_xattr_emit(const char *prefix, -+ const char *name, size_t name_len, -+ struct xattr_buf *buf) -+{ -+ const size_t prefix_len = strlen(prefix); -+ const size_t total_len = prefix_len + name_len + 1; -+ -+ if (buf->buf) { -+ if (buf->used + total_len > buf->len) -+ return -ERANGE; -+ -+ memcpy(buf->buf + buf->used, prefix, prefix_len); -+ memcpy(buf->buf + buf->used + prefix_len, -+ name, name_len); -+ buf->buf[buf->used + prefix_len + name_len] = '\0'; -+ } -+ -+ buf->used += total_len; -+ return 0; -+} -+ -+static int bch2_xattr_emit(struct dentry *dentry, -+ const struct bch_xattr *xattr, -+ struct xattr_buf *buf) -+{ -+ const struct xattr_handler *handler = -+ bch2_xattr_type_to_handler(xattr->x_type); -+ -+ return handler && (!handler->list || handler->list(dentry)) -+ ? __bch2_xattr_emit(handler->prefix ?: handler->name, -+ xattr->x_name, xattr->x_name_len, buf) -+ : 0; -+} -+ -+static int bch2_xattr_list_bcachefs(struct bch_fs *c, -+ struct bch_inode_info *inode, -+ struct xattr_buf *buf, -+ bool all) -+{ -+ const char *prefix = all ? "bcachefs_effective." 
: "bcachefs."; -+ unsigned id; -+ int ret = 0; -+ u64 v; -+ -+ for (id = 0; id < Inode_opt_nr; id++) { -+ v = bch2_inode_opt_get(&inode->ei_inode, id); -+ if (!v) -+ continue; -+ -+ if (!all && -+ !(inode->ei_inode.bi_fields_set & (1 << id))) -+ continue; -+ -+ ret = __bch2_xattr_emit(prefix, bch2_inode_opts[id], -+ strlen(bch2_inode_opts[id]), buf); -+ if (ret) -+ break; -+ } -+ -+ return ret; -+} -+ -+ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) -+{ -+ struct bch_fs *c = dentry->d_sb->s_fs_info; -+ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); -+ struct btree_trans trans; -+ struct btree_iter *iter; -+ struct bkey_s_c k; -+ struct xattr_buf buf = { .buf = buffer, .len = buffer_size }; -+ u64 inum = dentry->d_inode->i_ino; -+ int ret; -+ -+ bch2_trans_init(&trans, c, 0, 0); -+ -+ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, -+ POS(inum, 0), 0, k, ret) { -+ BUG_ON(k.k->p.inode < inum); -+ -+ if (k.k->p.inode > inum) -+ break; -+ -+ if (k.k->type != KEY_TYPE_xattr) -+ continue; -+ -+ ret = bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf); -+ if (ret) -+ break; -+ } -+ ret = bch2_trans_exit(&trans) ?: ret; -+ -+ if (ret) -+ return ret; -+ -+ ret = bch2_xattr_list_bcachefs(c, inode, &buf, false); -+ if (ret) -+ return ret; -+ -+ ret = bch2_xattr_list_bcachefs(c, inode, &buf, true); -+ if (ret) -+ return ret; -+ -+ return buf.used; -+} -+ -+static int bch2_xattr_get_handler(const struct xattr_handler *handler, -+ struct dentry *dentry, struct inode *vinode, -+ const char *name, void *buffer, size_t size) -+{ -+ struct bch_inode_info *inode = to_bch_ei(vinode); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ -+ return bch2_xattr_get(c, inode, name, buffer, size, handler->flags); -+} -+ -+static int bch2_xattr_set_handler(const struct xattr_handler *handler, -+ struct dentry *dentry, struct inode *vinode, -+ const char *name, const void *value, -+ size_t size, int flags) -+{ -+ struct bch_inode_info *inode = 
to_bch_ei(vinode); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ -+ return bch2_trans_do(c, NULL, &inode->ei_journal_seq, 0, -+ bch2_xattr_set(&trans, inode->v.i_ino, -+ &inode->ei_str_hash, -+ name, value, size, -+ handler->flags, flags)); -+} -+ -+static const struct xattr_handler bch_xattr_user_handler = { -+ .prefix = XATTR_USER_PREFIX, -+ .get = bch2_xattr_get_handler, -+ .set = bch2_xattr_set_handler, -+ .flags = KEY_TYPE_XATTR_INDEX_USER, -+}; -+ -+static bool bch2_xattr_trusted_list(struct dentry *dentry) -+{ -+ return capable(CAP_SYS_ADMIN); -+} -+ -+static const struct xattr_handler bch_xattr_trusted_handler = { -+ .prefix = XATTR_TRUSTED_PREFIX, -+ .list = bch2_xattr_trusted_list, -+ .get = bch2_xattr_get_handler, -+ .set = bch2_xattr_set_handler, -+ .flags = KEY_TYPE_XATTR_INDEX_TRUSTED, -+}; -+ -+static const struct xattr_handler bch_xattr_security_handler = { -+ .prefix = XATTR_SECURITY_PREFIX, -+ .get = bch2_xattr_get_handler, -+ .set = bch2_xattr_set_handler, -+ .flags = KEY_TYPE_XATTR_INDEX_SECURITY, -+}; -+ -+#ifndef NO_BCACHEFS_FS -+ -+static int opt_to_inode_opt(int id) -+{ -+ switch (id) { -+#define x(name, ...) 
\ -+ case Opt_##name: return Inode_opt_##name; -+ BCH_INODE_OPTS() -+#undef x -+ default: -+ return -1; -+ } -+} -+ -+static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler, -+ struct dentry *dentry, struct inode *vinode, -+ const char *name, void *buffer, size_t size, -+ bool all) -+{ -+ struct bch_inode_info *inode = to_bch_ei(vinode); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ struct bch_opts opts = -+ bch2_inode_opts_to_opts(bch2_inode_opts_get(&inode->ei_inode)); -+ const struct bch_option *opt; -+ int id, inode_opt_id; -+ char buf[512]; -+ struct printbuf out = PBUF(buf); -+ unsigned val_len; -+ u64 v; -+ -+ id = bch2_opt_lookup(name); -+ if (id < 0 || !bch2_opt_is_inode_opt(id)) -+ return -EINVAL; -+ -+ inode_opt_id = opt_to_inode_opt(id); -+ if (inode_opt_id < 0) -+ return -EINVAL; -+ -+ opt = bch2_opt_table + id; -+ -+ if (!bch2_opt_defined_by_id(&opts, id)) -+ return -ENODATA; -+ -+ if (!all && -+ !(inode->ei_inode.bi_fields_set & (1 << inode_opt_id))) -+ return -ENODATA; -+ -+ v = bch2_opt_get_by_id(&opts, id); -+ bch2_opt_to_text(&out, c, opt, v, 0); -+ -+ val_len = out.pos - buf; -+ -+ if (buffer && val_len > size) -+ return -ERANGE; -+ -+ if (buffer) -+ memcpy(buffer, buf, val_len); -+ return val_len; -+} -+ -+static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler, -+ struct dentry *dentry, struct inode *vinode, -+ const char *name, void *buffer, size_t size) -+{ -+ return __bch2_xattr_bcachefs_get(handler, dentry, vinode, -+ name, buffer, size, false); -+} -+ -+struct inode_opt_set { -+ int id; -+ u64 v; -+ bool defined; -+}; -+ -+static int inode_opt_set_fn(struct bch_inode_info *inode, -+ struct bch_inode_unpacked *bi, -+ void *p) -+{ -+ struct inode_opt_set *s = p; -+ -+ if (s->defined) -+ bi->bi_fields_set |= 1U << s->id; -+ else -+ bi->bi_fields_set &= ~(1U << s->id); -+ -+ bch2_inode_opt_set(bi, s->id, s->v); -+ -+ return 0; -+} -+ -+static int bch2_xattr_bcachefs_set(const struct xattr_handler 
*handler, -+ struct dentry *dentry, struct inode *vinode, -+ const char *name, const void *value, -+ size_t size, int flags) -+{ -+ struct bch_inode_info *inode = to_bch_ei(vinode); -+ struct bch_fs *c = inode->v.i_sb->s_fs_info; -+ const struct bch_option *opt; -+ char *buf; -+ struct inode_opt_set s; -+ int opt_id, inode_opt_id, ret; -+ -+ opt_id = bch2_opt_lookup(name); -+ if (opt_id < 0) -+ return -EINVAL; -+ -+ opt = bch2_opt_table + opt_id; -+ -+ inode_opt_id = opt_to_inode_opt(opt_id); -+ if (inode_opt_id < 0) -+ return -EINVAL; -+ -+ s.id = inode_opt_id; -+ -+ if (value) { -+ u64 v = 0; -+ -+ buf = kmalloc(size + 1, GFP_KERNEL); -+ if (!buf) -+ return -ENOMEM; -+ memcpy(buf, value, size); -+ buf[size] = '\0'; -+ -+ ret = bch2_opt_parse(c, opt, buf, &v); -+ kfree(buf); -+ -+ if (ret < 0) -+ return ret; -+ -+ ret = bch2_opt_check_may_set(c, opt_id, v); -+ if (ret < 0) -+ return ret; -+ -+ s.v = v + 1; -+ s.defined = true; -+ } else { -+ if (!IS_ROOT(dentry)) { -+ struct bch_inode_info *dir = -+ to_bch_ei(d_inode(dentry->d_parent)); -+ -+ s.v = bch2_inode_opt_get(&dir->ei_inode, inode_opt_id); -+ } else { -+ s.v = 0; -+ } -+ -+ s.defined = false; -+ } -+ -+ mutex_lock(&inode->ei_update_lock); -+ if (inode_opt_id == Inode_opt_project) { -+ /* -+ * inode fields accessible via the xattr interface are stored -+ * with a +1 bias, so that 0 means unset: -+ */ -+ ret = bch2_set_projid(c, inode, s.v ? 
s.v - 1 : 0); -+ if (ret) -+ goto err; -+ } -+ -+ ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0); -+err: -+ mutex_unlock(&inode->ei_update_lock); -+ -+ if (value && -+ (opt_id == Opt_background_compression || -+ opt_id == Opt_background_target)) -+ bch2_rebalance_add_work(c, inode->v.i_blocks); -+ -+ return ret; -+} -+ -+static const struct xattr_handler bch_xattr_bcachefs_handler = { -+ .prefix = "bcachefs.", -+ .get = bch2_xattr_bcachefs_get, -+ .set = bch2_xattr_bcachefs_set, -+}; -+ -+static int bch2_xattr_bcachefs_get_effective( -+ const struct xattr_handler *handler, -+ struct dentry *dentry, struct inode *vinode, -+ const char *name, void *buffer, size_t size) -+{ -+ return __bch2_xattr_bcachefs_get(handler, dentry, vinode, -+ name, buffer, size, true); -+} -+ -+static const struct xattr_handler bch_xattr_bcachefs_effective_handler = { -+ .prefix = "bcachefs_effective.", -+ .get = bch2_xattr_bcachefs_get_effective, -+ .set = bch2_xattr_bcachefs_set, -+}; -+ -+#endif /* NO_BCACHEFS_FS */ -+ -+const struct xattr_handler *bch2_xattr_handlers[] = { -+ &bch_xattr_user_handler, -+ &posix_acl_access_xattr_handler, -+ &posix_acl_default_xattr_handler, -+ &bch_xattr_trusted_handler, -+ &bch_xattr_security_handler, -+#ifndef NO_BCACHEFS_FS -+ &bch_xattr_bcachefs_handler, -+ &bch_xattr_bcachefs_effective_handler, -+#endif -+ NULL -+}; -+ -+static const struct xattr_handler *bch_xattr_handler_map[] = { -+ [KEY_TYPE_XATTR_INDEX_USER] = &bch_xattr_user_handler, -+ [KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS] = -+ &posix_acl_access_xattr_handler, -+ [KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT] = -+ &posix_acl_default_xattr_handler, -+ [KEY_TYPE_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler, -+ [KEY_TYPE_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler, -+}; -+ -+static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned type) -+{ -+ return type < ARRAY_SIZE(bch_xattr_handler_map) -+ ? 
bch_xattr_handler_map[type] -+ : NULL; -+} -diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h -new file mode 100644 -index 000000000000..4151065ab853 ---- /dev/null -+++ b/fs/bcachefs/xattr.h -@@ -0,0 +1,49 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _BCACHEFS_XATTR_H -+#define _BCACHEFS_XATTR_H -+ -+#include "str_hash.h" -+ -+extern const struct bch_hash_desc bch2_xattr_hash_desc; -+ -+const char *bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c); -+void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); -+ -+#define bch2_bkey_ops_xattr (struct bkey_ops) { \ -+ .key_invalid = bch2_xattr_invalid, \ -+ .val_to_text = bch2_xattr_to_text, \ -+} -+ -+static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len) -+{ -+ return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name) + -+ name_len + val_len, sizeof(u64)); -+} -+ -+#define xattr_val(_xattr) \ -+ ((void *) (_xattr)->x_name + (_xattr)->x_name_len) -+ -+struct xattr_search_key { -+ u8 type; -+ struct qstr name; -+}; -+ -+#define X_SEARCH(_type, _name, _len) ((struct xattr_search_key) \ -+ { .type = _type, .name = QSTR_INIT(_name, _len) }) -+ -+struct dentry; -+struct xattr_handler; -+struct bch_hash_info; -+struct bch_inode_info; -+ -+int bch2_xattr_get(struct bch_fs *, struct bch_inode_info *, -+ const char *, void *, size_t, int); -+ -+int bch2_xattr_set(struct btree_trans *, u64, const struct bch_hash_info *, -+ const char *, const void *, size_t, int, int); -+ -+ssize_t bch2_xattr_list(struct dentry *, char *, size_t); -+ -+extern const struct xattr_handler *bch2_xattr_handlers[]; -+ -+#endif /* _BCACHEFS_XATTR_H */ -diff --git a/fs/cifs/file.c b/fs/cifs/file.c -index be46fab4c96d..a17a21181e18 100644 ---- a/fs/cifs/file.c -+++ b/fs/cifs/file.c -@@ -4296,20 +4296,12 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list, - - page = lru_to_page(page_list); - -- /* -- * Lock the page and put it in the cache. 
Since no one else -- * should have access to this page, we're safe to simply set -- * PG_locked without checking it first. -- */ -- __SetPageLocked(page); -- rc = add_to_page_cache_locked(page, mapping, -- page->index, gfp); -+ rc = add_to_page_cache(page, mapping, -+ page->index, gfp); - - /* give up if we can't stick it in the cache */ -- if (rc) { -- __ClearPageLocked(page); -+ if (rc) - return rc; -- } - - /* move first page to the tmplist */ - *offset = (loff_t)page->index << PAGE_SHIFT; -@@ -4328,12 +4320,9 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list, - if (*bytes + PAGE_SIZE > rsize) - break; - -- __SetPageLocked(page); -- rc = add_to_page_cache_locked(page, mapping, page->index, gfp); -- if (rc) { -- __ClearPageLocked(page); -+ rc = add_to_page_cache(page, mapping, page->index, gfp); -+ if (rc) - break; -- } - list_move_tail(&page->lru, tmplist); - (*bytes) += PAGE_SIZE; - expected_index++; -diff --git a/fs/dcache.c b/fs/dcache.c -index ea0485861d93..b4d6e3e86285 100644 ---- a/fs/dcache.c -+++ b/fs/dcache.c -@@ -3132,9 +3132,8 @@ void d_genocide(struct dentry *parent) - - EXPORT_SYMBOL(d_genocide); - --void d_tmpfile(struct dentry *dentry, struct inode *inode) -+void d_mark_tmpfile(struct dentry *dentry, struct inode *inode) - { -- inode_dec_link_count(inode); - BUG_ON(dentry->d_name.name != dentry->d_iname || - !hlist_unhashed(&dentry->d_u.d_alias) || - !d_unlinked(dentry)); -@@ -3144,6 +3143,13 @@ void d_tmpfile(struct dentry *dentry, struct inode *inode) - (unsigned long long)inode->i_ino); - spin_unlock(&dentry->d_lock); - spin_unlock(&dentry->d_parent->d_lock); -+} -+EXPORT_SYMBOL(d_mark_tmpfile); -+ -+void d_tmpfile(struct dentry *dentry, struct inode *inode) -+{ -+ inode_dec_link_count(inode); -+ d_mark_tmpfile(dentry, inode); - d_instantiate(dentry, inode); - } - EXPORT_SYMBOL(d_tmpfile); -diff --git a/fs/inode.c b/fs/inode.c -index 72c4c347afb7..e70ad3d2d01c 100644 ---- a/fs/inode.c -+++ b/fs/inode.c -@@ 
-1578,6 +1578,46 @@ int insert_inode_locked(struct inode *inode) - } - EXPORT_SYMBOL(insert_inode_locked); - -+struct inode *insert_inode_locked2(struct inode *inode) -+{ -+ struct super_block *sb = inode->i_sb; -+ ino_t ino = inode->i_ino; -+ struct hlist_head *head = inode_hashtable + hash(sb, ino); -+ -+ while (1) { -+ struct inode *old = NULL; -+ spin_lock(&inode_hash_lock); -+ hlist_for_each_entry(old, head, i_hash) { -+ if (old->i_ino != ino) -+ continue; -+ if (old->i_sb != sb) -+ continue; -+ spin_lock(&old->i_lock); -+ if (old->i_state & (I_FREEING|I_WILL_FREE)) { -+ spin_unlock(&old->i_lock); -+ continue; -+ } -+ break; -+ } -+ if (likely(!old)) { -+ spin_lock(&inode->i_lock); -+ inode->i_state |= I_NEW | I_CREATING; -+ hlist_add_head(&inode->i_hash, head); -+ spin_unlock(&inode->i_lock); -+ spin_unlock(&inode_hash_lock); -+ return NULL; -+ } -+ __iget(old); -+ spin_unlock(&old->i_lock); -+ spin_unlock(&inode_hash_lock); -+ wait_on_inode(old); -+ if (unlikely(!inode_unhashed(old))) -+ return old; -+ iput(old); -+ } -+} -+EXPORT_SYMBOL(insert_inode_locked2); -+ - int insert_inode_locked4(struct inode *inode, unsigned long hashval, - int (*test)(struct inode *, void *), void *data) - { -diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h -index 868e11face00..d9e3b7b0175e 100644 ---- a/include/linux/blkdev.h -+++ b/include/linux/blkdev.h -@@ -936,6 +936,7 @@ extern const char *blk_op_str(unsigned int op); - - int blk_status_to_errno(blk_status_t status); - blk_status_t errno_to_blk_status(int errno); -+const char *blk_status_to_str(blk_status_t status); - - int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin); - -diff --git a/include/linux/closure.h b/include/linux/closure.h -new file mode 100644 -index 000000000000..36b4a83f9b77 ---- /dev/null -+++ b/include/linux/closure.h -@@ -0,0 +1,399 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#ifndef _LINUX_CLOSURE_H -+#define _LINUX_CLOSURE_H -+ -+#include -+#include -+#include -+#include -+ 
-+/* -+ * Closure is perhaps the most overused and abused term in computer science, but -+ * since I've been unable to come up with anything better you're stuck with it -+ * again. -+ * -+ * What are closures? -+ * -+ * They embed a refcount. The basic idea is they count "things that are in -+ * progress" - in flight bios, some other thread that's doing something else - -+ * anything you might want to wait on. -+ * -+ * The refcount may be manipulated with closure_get() and closure_put(). -+ * closure_put() is where many of the interesting things happen, when it causes -+ * the refcount to go to 0. -+ * -+ * Closures can be used to wait on things both synchronously and asynchronously, -+ * and synchronous and asynchronous use can be mixed without restriction. To -+ * wait synchronously, use closure_sync() - you will sleep until your closure's -+ * refcount hits 1. -+ * -+ * To wait asynchronously, use -+ * continue_at(cl, next_function, workqueue); -+ * -+ * passing it, as you might expect, the function to run when nothing is pending -+ * and the workqueue to run that function out of. -+ * -+ * continue_at() also, critically, requires a 'return' immediately following the -+ * location where this macro is referenced, to return to the calling function. -+ * There's good reason for this. -+ * -+ * To use safely closures asynchronously, they must always have a refcount while -+ * they are running owned by the thread that is running them. 
Otherwise, suppose -+ * you submit some bios and wish to have a function run when they all complete: -+ * -+ * foo_endio(struct bio *bio) -+ * { -+ * closure_put(cl); -+ * } -+ * -+ * closure_init(cl); -+ * -+ * do_stuff(); -+ * closure_get(cl); -+ * bio1->bi_endio = foo_endio; -+ * bio_submit(bio1); -+ * -+ * do_more_stuff(); -+ * closure_get(cl); -+ * bio2->bi_endio = foo_endio; -+ * bio_submit(bio2); -+ * -+ * continue_at(cl, complete_some_read, system_wq); -+ * -+ * If closure's refcount started at 0, complete_some_read() could run before the -+ * second bio was submitted - which is almost always not what you want! More -+ * importantly, it wouldn't be possible to say whether the original thread or -+ * complete_some_read()'s thread owned the closure - and whatever state it was -+ * associated with! -+ * -+ * So, closure_init() initializes a closure's refcount to 1 - and when a -+ * closure_fn is run, the refcount will be reset to 1 first. -+ * -+ * Then, the rule is - if you got the refcount with closure_get(), release it -+ * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount -+ * on a closure because you called closure_init() or you were run out of a -+ * closure - _always_ use continue_at(). Doing so consistently will help -+ * eliminate an entire class of particularly pernicious races. -+ * -+ * Lastly, you might have a wait list dedicated to a specific event, and have no -+ * need for specifying the condition - you just want to wait until someone runs -+ * closure_wake_up() on the appropriate wait list. In that case, just use -+ * closure_wait(). It will return either true or false, depending on whether the -+ * closure was already on a wait list or not - a closure can only be on one wait -+ * list at a time. -+ * -+ * Parents: -+ * -+ * closure_init() takes two arguments - it takes the closure to initialize, and -+ * a (possibly null) parent. 
-+ * -+ * If parent is non null, the new closure will have a refcount for its lifetime; -+ * a closure is considered to be "finished" when its refcount hits 0 and the -+ * function to run is null. Hence -+ * -+ * continue_at(cl, NULL, NULL); -+ * -+ * returns up the (spaghetti) stack of closures, precisely like normal return -+ * returns up the C stack. continue_at() with non null fn is better thought of -+ * as doing a tail call. -+ * -+ * All this implies that a closure should typically be embedded in a particular -+ * struct (which its refcount will normally control the lifetime of), and that -+ * struct can very much be thought of as a stack frame. -+ */ -+ -+struct closure; -+struct closure_syncer; -+typedef void (closure_fn) (struct closure *); -+extern struct dentry *bcache_debug; -+ -+struct closure_waitlist { -+ struct llist_head list; -+}; -+ -+enum closure_state { -+ /* -+ * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by -+ * the thread that owns the closure, and cleared by the thread that's -+ * waking up the closure. -+ * -+ * The rest are for debugging and don't affect behaviour: -+ * -+ * CLOSURE_RUNNING: Set when a closure is running (i.e. by -+ * closure_init() and when closure_put() runs then next function), and -+ * must be cleared before remaining hits 0. Primarily to help guard -+ * against incorrect usage and accidentally transferring references. -+ * continue_at() and closure_return() clear it for you, if you're doing -+ * something unusual you can use closure_set_dead() which also helps -+ * annotate where references are being transferred. 
-+ */ -+ -+ CLOSURE_BITS_START = (1U << 26), -+ CLOSURE_DESTRUCTOR = (1U << 26), -+ CLOSURE_WAITING = (1U << 28), -+ CLOSURE_RUNNING = (1U << 30), -+}; -+ -+#define CLOSURE_GUARD_MASK \ -+ ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1) -+ -+#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) -+#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) -+ -+struct closure { -+ union { -+ struct { -+ struct workqueue_struct *wq; -+ struct closure_syncer *s; -+ struct llist_node list; -+ closure_fn *fn; -+ }; -+ struct work_struct work; -+ }; -+ -+ struct closure *parent; -+ -+ atomic_t remaining; -+ -+#ifdef CONFIG_DEBUG_CLOSURES -+#define CLOSURE_MAGIC_DEAD 0xc054dead -+#define CLOSURE_MAGIC_ALIVE 0xc054a11e -+ -+ unsigned int magic; -+ struct list_head all; -+ unsigned long ip; -+ unsigned long waiting_on; -+#endif -+}; -+ -+void closure_sub(struct closure *cl, int v); -+void closure_put(struct closure *cl); -+void __closure_wake_up(struct closure_waitlist *list); -+bool closure_wait(struct closure_waitlist *list, struct closure *cl); -+void __closure_sync(struct closure *cl); -+ -+/** -+ * closure_sync - sleep until a closure a closure has nothing left to wait on -+ * -+ * Sleeps until the refcount hits 1 - the thread that's running the closure owns -+ * the last refcount. 
-+ */ -+static inline void closure_sync(struct closure *cl) -+{ -+ if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1) -+ __closure_sync(cl); -+} -+ -+#ifdef CONFIG_DEBUG_CLOSURES -+ -+void closure_debug_create(struct closure *cl); -+void closure_debug_destroy(struct closure *cl); -+ -+#else -+ -+static inline void closure_debug_create(struct closure *cl) {} -+static inline void closure_debug_destroy(struct closure *cl) {} -+ -+#endif -+ -+static inline void closure_set_ip(struct closure *cl) -+{ -+#ifdef CONFIG_DEBUG_CLOSURES -+ cl->ip = _THIS_IP_; -+#endif -+} -+ -+static inline void closure_set_ret_ip(struct closure *cl) -+{ -+#ifdef CONFIG_DEBUG_CLOSURES -+ cl->ip = _RET_IP_; -+#endif -+} -+ -+static inline void closure_set_waiting(struct closure *cl, unsigned long f) -+{ -+#ifdef CONFIG_DEBUG_CLOSURES -+ cl->waiting_on = f; -+#endif -+} -+ -+static inline void closure_set_stopped(struct closure *cl) -+{ -+ atomic_sub(CLOSURE_RUNNING, &cl->remaining); -+} -+ -+static inline void set_closure_fn(struct closure *cl, closure_fn *fn, -+ struct workqueue_struct *wq) -+{ -+ closure_set_ip(cl); -+ cl->fn = fn; -+ cl->wq = wq; -+ /* between atomic_dec() in closure_put() */ -+ smp_mb__before_atomic(); -+} -+ -+static inline void closure_queue(struct closure *cl) -+{ -+ struct workqueue_struct *wq = cl->wq; -+ /** -+ * Changes made to closure, work_struct, or a couple of other structs -+ * may cause work.func not pointing to the right location. 
-+ */ -+ BUILD_BUG_ON(offsetof(struct closure, fn) -+ != offsetof(struct work_struct, func)); -+ -+ if (wq) { -+ INIT_WORK(&cl->work, cl->work.func); -+ BUG_ON(!queue_work(wq, &cl->work)); -+ } else -+ cl->fn(cl); -+} -+ -+/** -+ * closure_get - increment a closure's refcount -+ */ -+static inline void closure_get(struct closure *cl) -+{ -+#ifdef CONFIG_DEBUG_CLOSURES -+ BUG_ON((atomic_inc_return(&cl->remaining) & -+ CLOSURE_REMAINING_MASK) <= 1); -+#else -+ atomic_inc(&cl->remaining); -+#endif -+} -+ -+/** -+ * closure_init - Initialize a closure, setting the refcount to 1 -+ * @cl: closure to initialize -+ * @parent: parent of the new closure. cl will take a refcount on it for its -+ * lifetime; may be NULL. -+ */ -+static inline void closure_init(struct closure *cl, struct closure *parent) -+{ -+ cl->fn = NULL; -+ cl->parent = parent; -+ if (parent) -+ closure_get(parent); -+ -+ atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); -+ -+ closure_debug_create(cl); -+ closure_set_ip(cl); -+} -+ -+static inline void closure_init_stack(struct closure *cl) -+{ -+ memset(cl, 0, sizeof(struct closure)); -+ atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); -+} -+ -+/** -+ * closure_wake_up - wake up all closures on a wait list, -+ * with memory barrier -+ */ -+static inline void closure_wake_up(struct closure_waitlist *list) -+{ -+ /* Memory barrier for the wait list */ -+ smp_mb(); -+ __closure_wake_up(list); -+} -+ -+/** -+ * continue_at - jump to another function with barrier -+ * -+ * After @cl is no longer waiting on anything (i.e. all outstanding refs have -+ * been dropped with closure_put()), it will resume execution at @fn running out -+ * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly). -+ * -+ * This is because after calling continue_at() you no longer have a ref on @cl, -+ * and whatever @cl owns may be freed out from under you - a running closure fn -+ * has a ref on its own closure which continue_at() drops. 
-+ * -+ * Note you are expected to immediately return after using this macro. -+ */ -+#define continue_at(_cl, _fn, _wq) \ -+do { \ -+ set_closure_fn(_cl, _fn, _wq); \ -+ closure_sub(_cl, CLOSURE_RUNNING + 1); \ -+} while (0) -+ -+/** -+ * closure_return - finish execution of a closure -+ * -+ * This is used to indicate that @cl is finished: when all outstanding refs on -+ * @cl have been dropped @cl's ref on its parent closure (as passed to -+ * closure_init()) will be dropped, if one was specified - thus this can be -+ * thought of as returning to the parent closure. -+ */ -+#define closure_return(_cl) continue_at((_cl), NULL, NULL) -+ -+/** -+ * continue_at_nobarrier - jump to another function without barrier -+ * -+ * Causes @fn to be executed out of @cl, in @wq context (or called directly if -+ * @wq is NULL). -+ * -+ * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn, -+ * thus it's not safe to touch anything protected by @cl after a -+ * continue_at_nobarrier(). -+ */ -+#define continue_at_nobarrier(_cl, _fn, _wq) \ -+do { \ -+ set_closure_fn(_cl, _fn, _wq); \ -+ closure_queue(_cl); \ -+} while (0) -+ -+/** -+ * closure_return_with_destructor - finish execution of a closure, -+ * with destructor -+ * -+ * Works like closure_return(), except @destructor will be called when all -+ * outstanding refs on @cl have been dropped; @destructor may be used to safely -+ * free the memory occupied by @cl, and it is called with the ref on the parent -+ * closure still held - so @destructor could safely return an item to a -+ * freelist protected by @cl's parent. 
-+ */ -+#define closure_return_with_destructor(_cl, _destructor) \ -+do { \ -+ set_closure_fn(_cl, _destructor, NULL); \ -+ closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \ -+} while (0) -+ -+/** -+ * closure_call - execute @fn out of a new, uninitialized closure -+ * -+ * Typically used when running out of one closure, and we want to run @fn -+ * asynchronously out of a new closure - @parent will then wait for @cl to -+ * finish. -+ */ -+static inline void closure_call(struct closure *cl, closure_fn fn, -+ struct workqueue_struct *wq, -+ struct closure *parent) -+{ -+ closure_init(cl, parent); -+ continue_at_nobarrier(cl, fn, wq); -+} -+ -+#define __closure_wait_event(waitlist, _cond) \ -+do { \ -+ struct closure cl; \ -+ \ -+ closure_init_stack(&cl); \ -+ \ -+ while (1) { \ -+ closure_wait(waitlist, &cl); \ -+ if (_cond) \ -+ break; \ -+ closure_sync(&cl); \ -+ } \ -+ closure_wake_up(waitlist); \ -+ closure_sync(&cl); \ -+} while (0) -+ -+#define closure_wait_event(waitlist, _cond) \ -+do { \ -+ if (!(_cond)) \ -+ __closure_wait_event(waitlist, _cond); \ -+} while (0) -+ -+#endif /* _LINUX_CLOSURE_H */ -diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h -index ea7b756b1c8f..51658b72de72 100644 ---- a/include/linux/compiler_attributes.h -+++ b/include/linux/compiler_attributes.h -@@ -278,4 +278,9 @@ - */ - #define __weak __attribute__((__weak__)) - -+/* -+ * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-flatten-function-attribute -+ */ -+#define __flatten __attribute__((flatten)) -+ - #endif /* __LINUX_COMPILER_ATTRIBUTES_H */ -diff --git a/include/linux/dcache.h b/include/linux/dcache.h -index 65d975bf9390..008573618071 100644 ---- a/include/linux/dcache.h -+++ b/include/linux/dcache.h -@@ -256,6 +256,7 @@ extern struct dentry * d_make_root(struct inode *); - /* - the ramfs-type tree */ - extern void d_genocide(struct dentry *); - -+extern void d_mark_tmpfile(struct dentry *, struct 
inode *); - extern void d_tmpfile(struct dentry *, struct inode *); - - extern struct dentry *d_find_alias(struct inode *); -diff --git a/include/linux/fs.h b/include/linux/fs.h -index 7519ae003a08..305d316f01f3 100644 ---- a/include/linux/fs.h -+++ b/include/linux/fs.h -@@ -2953,6 +2953,7 @@ extern struct inode *find_inode_rcu(struct super_block *, unsigned long, - extern struct inode *find_inode_by_ino_rcu(struct super_block *, unsigned long); - extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *); - extern int insert_inode_locked(struct inode *); -+extern struct inode *insert_inode_locked2(struct inode *); - #ifdef CONFIG_DEBUG_LOCK_ALLOC - extern void lockdep_annotate_inode_mutex_key(struct inode *inode); - #else -diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h -index 434c9c34aeb6..620535006624 100644 ---- a/include/linux/pagemap.h -+++ b/include/linux/pagemap.h -@@ -689,10 +689,15 @@ static inline int fault_in_pages_readable(const char __user *uaddr, int size) - return 0; - } - --int add_to_page_cache_locked(struct page *page, struct address_space *mapping, -- pgoff_t index, gfp_t gfp_mask); -+int add_to_page_cache(struct page *page, struct address_space *mapping, -+ pgoff_t index, gfp_t gfp_mask); - int add_to_page_cache_lru(struct page *page, struct address_space *mapping, - pgoff_t index, gfp_t gfp_mask); -+int add_to_page_cache_lru_vec(struct address_space *mapping, -+ struct page **pages, -+ unsigned nr_pages, -+ pgoff_t offset, gfp_t gfp_mask); -+ - extern void delete_from_page_cache(struct page *page); - extern void __delete_from_page_cache(struct page *page, void *shadow); - int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); -@@ -710,22 +715,6 @@ void page_cache_readahead_unbounded(struct address_space *, struct file *, - pgoff_t index, unsigned long nr_to_read, - unsigned long lookahead_count); - --/* -- * Like add_to_page_cache_locked, but used to add 
newly allocated pages: -- * the page is new, so we can just run __SetPageLocked() against it. -- */ --static inline int add_to_page_cache(struct page *page, -- struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) --{ -- int error; -- -- __SetPageLocked(page); -- error = add_to_page_cache_locked(page, mapping, offset, gfp_mask); -- if (unlikely(error)) -- __ClearPageLocked(page); -- return error; --} -- - /** - * struct readahead_control - Describes a readahead request. - * -diff --git a/include/linux/sched.h b/include/linux/sched.h -index afe01e232935..793b07788062 100644 ---- a/include/linux/sched.h -+++ b/include/linux/sched.h -@@ -747,6 +747,7 @@ struct task_struct { - - struct mm_struct *mm; - struct mm_struct *active_mm; -+ struct address_space *faults_disabled_mapping; - - /* Per-thread vma caching: */ - struct vmacache vmacache; -diff --git a/include/linux/six.h b/include/linux/six.h -new file mode 100644 -index 000000000000..a16e94f482e9 ---- /dev/null -+++ b/include/linux/six.h -@@ -0,0 +1,197 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+ -+#ifndef _LINUX_SIX_H -+#define _LINUX_SIX_H -+ -+/* -+ * Shared/intent/exclusive locks: sleepable read/write locks, much like rw -+ * semaphores, except with a third intermediate state, intent. 
Basic operations -+ * are: -+ * -+ * six_lock_read(&foo->lock); -+ * six_unlock_read(&foo->lock); -+ * -+ * six_lock_intent(&foo->lock); -+ * six_unlock_intent(&foo->lock); -+ * -+ * six_lock_write(&foo->lock); -+ * six_unlock_write(&foo->lock); -+ * -+ * Intent locks block other intent locks, but do not block read locks, and you -+ * must have an intent lock held before taking a write lock, like so: -+ * -+ * six_lock_intent(&foo->lock); -+ * six_lock_write(&foo->lock); -+ * six_unlock_write(&foo->lock); -+ * six_unlock_intent(&foo->lock); -+ * -+ * Other operations: -+ * -+ * six_trylock_read() -+ * six_trylock_intent() -+ * six_trylock_write() -+ * -+ * six_lock_downgrade(): convert from intent to read -+ * six_lock_tryupgrade(): attempt to convert from read to intent -+ * -+ * Locks also embed a sequence number, which is incremented when the lock is -+ * locked or unlocked for write. The current sequence number can be grabbed -+ * while a lock is held from lock->state.seq; then, if you drop the lock you can -+ * use six_relock_(read|intent_write)(lock, seq) to attempt to retake the lock -+ * iff it hasn't been locked for write in the meantime. -+ * -+ * There are also operations that take the lock type as a parameter, where the -+ * type is one of SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write: -+ * -+ * six_lock_type(lock, type) -+ * six_unlock_type(lock, type) -+ * six_relock(lock, type, seq) -+ * six_trylock_type(lock, type) -+ * six_trylock_convert(lock, from, to) -+ * -+ * A lock may be held multiple types by the same thread (for read or intent, -+ * not write). However, the six locks code does _not_ implement the actual -+ * recursive checks itself though - rather, if your code (e.g. 
btree iterator -+ * code) knows that the current thread already has a lock held, and for the -+ * correct type, six_lock_increment() may be used to bump up the counter for -+ * that type - the only effect is that one more call to unlock will be required -+ * before the lock is unlocked. -+ */ -+ -+#include -+#include -+#include -+#include -+ -+#define SIX_LOCK_SEPARATE_LOCKFNS -+ -+union six_lock_state { -+ struct { -+ atomic64_t counter; -+ }; -+ -+ struct { -+ u64 v; -+ }; -+ -+ struct { -+ /* for waitlist_bitnr() */ -+ unsigned long l; -+ }; -+ -+ struct { -+ unsigned read_lock:28; -+ unsigned intent_lock:1; -+ unsigned waiters:3; -+ /* -+ * seq works much like in seqlocks: it's incremented every time -+ * we lock and unlock for write. -+ * -+ * If it's odd write lock is held, even unlocked. -+ * -+ * Thus readers can unlock, and then lock again later iff it -+ * hasn't been modified in the meantime. -+ */ -+ u32 seq; -+ }; -+}; -+ -+enum six_lock_type { -+ SIX_LOCK_read, -+ SIX_LOCK_intent, -+ SIX_LOCK_write, -+}; -+ -+struct six_lock { -+ union six_lock_state state; -+ unsigned intent_lock_recurse; -+ struct task_struct *owner; -+ struct optimistic_spin_queue osq; -+ -+ raw_spinlock_t wait_lock; -+ struct list_head wait_list[2]; -+#ifdef CONFIG_DEBUG_LOCK_ALLOC -+ struct lockdep_map dep_map; -+#endif -+}; -+ -+typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *); -+ -+static __always_inline void __six_lock_init(struct six_lock *lock, -+ const char *name, -+ struct lock_class_key *key) -+{ -+ atomic64_set(&lock->state.counter, 0); -+ raw_spin_lock_init(&lock->wait_lock); -+ INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_read]); -+ INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_intent]); -+#ifdef CONFIG_DEBUG_LOCK_ALLOC -+ debug_check_no_locks_freed((void *) lock, sizeof(*lock)); -+ lockdep_init_map(&lock->dep_map, name, key, 0); -+#endif -+} -+ -+#define six_lock_init(lock) \ -+do { \ -+ static struct lock_class_key __key; \ -+ \ -+ 
__six_lock_init((lock), #lock, &__key); \ -+} while (0) -+ -+#define __SIX_VAL(field, _v) (((union six_lock_state) { .field = _v }).v) -+ -+#define __SIX_LOCK(type) \ -+bool six_trylock_##type(struct six_lock *); \ -+bool six_relock_##type(struct six_lock *, u32); \ -+int six_lock_##type(struct six_lock *, six_lock_should_sleep_fn, void *);\ -+void six_unlock_##type(struct six_lock *); -+ -+__SIX_LOCK(read) -+__SIX_LOCK(intent) -+__SIX_LOCK(write) -+#undef __SIX_LOCK -+ -+#define SIX_LOCK_DISPATCH(type, fn, ...) \ -+ switch (type) { \ -+ case SIX_LOCK_read: \ -+ return fn##_read(__VA_ARGS__); \ -+ case SIX_LOCK_intent: \ -+ return fn##_intent(__VA_ARGS__); \ -+ case SIX_LOCK_write: \ -+ return fn##_write(__VA_ARGS__); \ -+ default: \ -+ BUG(); \ -+ } -+ -+static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type) -+{ -+ SIX_LOCK_DISPATCH(type, six_trylock, lock); -+} -+ -+static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type, -+ unsigned seq) -+{ -+ SIX_LOCK_DISPATCH(type, six_relock, lock, seq); -+} -+ -+static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type, -+ six_lock_should_sleep_fn should_sleep_fn, void *p) -+{ -+ SIX_LOCK_DISPATCH(type, six_lock, lock, should_sleep_fn, p); -+} -+ -+static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type) -+{ -+ SIX_LOCK_DISPATCH(type, six_unlock, lock); -+} -+ -+void six_lock_downgrade(struct six_lock *); -+bool six_lock_tryupgrade(struct six_lock *); -+bool six_trylock_convert(struct six_lock *, enum six_lock_type, -+ enum six_lock_type); -+ -+void six_lock_increment(struct six_lock *, enum six_lock_type); -+ -+void six_lock_wakeup_all(struct six_lock *); -+ -+#endif /* _LINUX_SIX_H */ -diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h -index 0221f852a7e1..f81f60d891ac 100644 ---- a/include/linux/vmalloc.h -+++ b/include/linux/vmalloc.h -@@ -106,6 +106,7 @@ extern void *vzalloc(unsigned long size); - extern 
void *vmalloc_user(unsigned long size); - extern void *vmalloc_node(unsigned long size, int node); - extern void *vzalloc_node(unsigned long size, int node); -+extern void *vmalloc_exec(unsigned long size, gfp_t gfp_mask); - extern void *vmalloc_32(unsigned long size); - extern void *vmalloc_32_user(unsigned long size); - extern void *__vmalloc(unsigned long size, gfp_t gfp_mask); -diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h -new file mode 100644 -index 000000000000..9b4e8295ed75 ---- /dev/null -+++ b/include/trace/events/bcachefs.h -@@ -0,0 +1,664 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#undef TRACE_SYSTEM -+#define TRACE_SYSTEM bcachefs -+ -+#if !defined(_TRACE_BCACHE_H) || defined(TRACE_HEADER_MULTI_READ) -+#define _TRACE_BCACHE_H -+ -+#include -+ -+DECLARE_EVENT_CLASS(bpos, -+ TP_PROTO(struct bpos *p), -+ TP_ARGS(p), -+ -+ TP_STRUCT__entry( -+ __field(u64, inode ) -+ __field(u64, offset ) -+ ), -+ -+ TP_fast_assign( -+ __entry->inode = p->inode; -+ __entry->offset = p->offset; -+ ), -+ -+ TP_printk("%llu:%llu", __entry->inode, __entry->offset) -+); -+ -+DECLARE_EVENT_CLASS(bkey, -+ TP_PROTO(const struct bkey *k), -+ TP_ARGS(k), -+ -+ TP_STRUCT__entry( -+ __field(u64, inode ) -+ __field(u64, offset ) -+ __field(u32, size ) -+ ), -+ -+ TP_fast_assign( -+ __entry->inode = k->p.inode; -+ __entry->offset = k->p.offset; -+ __entry->size = k->size; -+ ), -+ -+ TP_printk("%llu:%llu len %u", __entry->inode, -+ __entry->offset, __entry->size) -+); -+ -+DECLARE_EVENT_CLASS(bch_fs, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c), -+ -+ TP_STRUCT__entry( -+ __array(char, uuid, 16 ) -+ ), -+ -+ TP_fast_assign( -+ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); -+ ), -+ -+ TP_printk("%pU", __entry->uuid) -+); -+ -+DECLARE_EVENT_CLASS(bio, -+ TP_PROTO(struct bio *bio), -+ TP_ARGS(bio), -+ -+ TP_STRUCT__entry( -+ __field(dev_t, dev ) -+ __field(sector_t, sector ) -+ __field(unsigned int, nr_sector ) -+ __array(char, rwbs, 6 ) -+ ), -+ -+ 
TP_fast_assign( -+ __entry->dev = bio->bi_disk ? bio_dev(bio) : 0; -+ __entry->sector = bio->bi_iter.bi_sector; -+ __entry->nr_sector = bio->bi_iter.bi_size >> 9; -+ blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); -+ ), -+ -+ TP_printk("%d,%d %s %llu + %u", -+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, -+ (unsigned long long)__entry->sector, __entry->nr_sector) -+); -+ -+/* io.c: */ -+ -+DEFINE_EVENT(bio, read_split, -+ TP_PROTO(struct bio *bio), -+ TP_ARGS(bio) -+); -+ -+DEFINE_EVENT(bio, read_bounce, -+ TP_PROTO(struct bio *bio), -+ TP_ARGS(bio) -+); -+ -+DEFINE_EVENT(bio, read_retry, -+ TP_PROTO(struct bio *bio), -+ TP_ARGS(bio) -+); -+ -+DEFINE_EVENT(bio, promote, -+ TP_PROTO(struct bio *bio), -+ TP_ARGS(bio) -+); -+ -+/* Journal */ -+ -+DEFINE_EVENT(bch_fs, journal_full, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+DEFINE_EVENT(bch_fs, journal_entry_full, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+DEFINE_EVENT(bio, journal_write, -+ TP_PROTO(struct bio *bio), -+ TP_ARGS(bio) -+); -+ -+/* bset.c: */ -+ -+DEFINE_EVENT(bpos, bkey_pack_pos_fail, -+ TP_PROTO(struct bpos *p), -+ TP_ARGS(p) -+); -+ -+/* Btree */ -+ -+DECLARE_EVENT_CLASS(btree_node, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b), -+ -+ TP_STRUCT__entry( -+ __array(char, uuid, 16 ) -+ __field(u8, level ) -+ __field(u8, id ) -+ __field(u64, inode ) -+ __field(u64, offset ) -+ ), -+ -+ TP_fast_assign( -+ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); -+ __entry->level = b->c.level; -+ __entry->id = b->c.btree_id; -+ __entry->inode = b->key.k.p.inode; -+ __entry->offset = b->key.k.p.offset; -+ ), -+ -+ TP_printk("%pU %u id %u %llu:%llu", -+ __entry->uuid, __entry->level, __entry->id, -+ __entry->inode, __entry->offset) -+); -+ -+DEFINE_EVENT(btree_node, btree_read, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+TRACE_EVENT(btree_write, -+ TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors), -+ 
TP_ARGS(b, bytes, sectors), -+ -+ TP_STRUCT__entry( -+ __field(enum btree_node_type, type) -+ __field(unsigned, bytes ) -+ __field(unsigned, sectors ) -+ ), -+ -+ TP_fast_assign( -+ __entry->type = btree_node_type(b); -+ __entry->bytes = bytes; -+ __entry->sectors = sectors; -+ ), -+ -+ TP_printk("bkey type %u bytes %u sectors %u", -+ __entry->type , __entry->bytes, __entry->sectors) -+); -+ -+DEFINE_EVENT(btree_node, btree_node_alloc, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+DEFINE_EVENT(btree_node, btree_node_free, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+DEFINE_EVENT(btree_node, btree_node_reap, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+DECLARE_EVENT_CLASS(btree_node_cannibalize_lock, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c), -+ -+ TP_STRUCT__entry( -+ __array(char, uuid, 16 ) -+ ), -+ -+ TP_fast_assign( -+ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); -+ ), -+ -+ TP_printk("%pU", __entry->uuid) -+); -+ -+DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize_lock_fail, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize_lock, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+DEFINE_EVENT(bch_fs, btree_node_cannibalize_unlock, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+TRACE_EVENT(btree_reserve_get_fail, -+ TP_PROTO(struct bch_fs *c, size_t required, struct closure *cl), -+ TP_ARGS(c, required, cl), -+ -+ TP_STRUCT__entry( -+ __array(char, uuid, 16 ) -+ __field(size_t, required ) -+ __field(struct closure *, cl ) -+ ), -+ -+ TP_fast_assign( -+ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); -+ __entry->required = required; -+ __entry->cl = cl; -+ ), -+ -+ TP_printk("%pU required %zu by %p", __entry->uuid, -+ __entry->required, __entry->cl) -+); -+ 
-+TRACE_EVENT(btree_insert_key, -+ TP_PROTO(struct bch_fs *c, struct btree *b, struct bkey_i *k), -+ TP_ARGS(c, b, k), -+ -+ TP_STRUCT__entry( -+ __field(u8, id ) -+ __field(u64, inode ) -+ __field(u64, offset ) -+ __field(u32, size ) -+ ), -+ -+ TP_fast_assign( -+ __entry->id = b->c.btree_id; -+ __entry->inode = k->k.p.inode; -+ __entry->offset = k->k.p.offset; -+ __entry->size = k->k.size; -+ ), -+ -+ TP_printk("btree %u: %llu:%llu len %u", __entry->id, -+ __entry->inode, __entry->offset, __entry->size) -+); -+ -+DEFINE_EVENT(btree_node, btree_split, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+DEFINE_EVENT(btree_node, btree_compact, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+DEFINE_EVENT(btree_node, btree_merge, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+DEFINE_EVENT(btree_node, btree_set_root, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+/* Garbage collection */ -+ -+DEFINE_EVENT(btree_node, btree_gc_coalesce, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+TRACE_EVENT(btree_gc_coalesce_fail, -+ TP_PROTO(struct bch_fs *c, int reason), -+ TP_ARGS(c, reason), -+ -+ TP_STRUCT__entry( -+ __field(u8, reason ) -+ __array(char, uuid, 16 ) -+ ), -+ -+ TP_fast_assign( -+ __entry->reason = reason; -+ memcpy(__entry->uuid, c->disk_sb.sb->user_uuid.b, 16); -+ ), -+ -+ TP_printk("%pU: %u", __entry->uuid, __entry->reason) -+); -+ -+DEFINE_EVENT(btree_node, btree_gc_rewrite_node, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+DEFINE_EVENT(btree_node, btree_gc_rewrite_node_fail, -+ TP_PROTO(struct bch_fs *c, struct btree *b), -+ TP_ARGS(c, b) -+); -+ -+DEFINE_EVENT(bch_fs, gc_start, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+DEFINE_EVENT(bch_fs, gc_end, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+DEFINE_EVENT(bch_fs, gc_coalesce_start, -+ TP_PROTO(struct bch_fs *c), -+ 
TP_ARGS(c) -+); -+ -+DEFINE_EVENT(bch_fs, gc_coalesce_end, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+DEFINE_EVENT(bch_fs, gc_cannot_inc_gens, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+/* Allocator */ -+ -+TRACE_EVENT(alloc_batch, -+ TP_PROTO(struct bch_dev *ca, size_t free, size_t total), -+ TP_ARGS(ca, free, total), -+ -+ TP_STRUCT__entry( -+ __array(char, uuid, 16 ) -+ __field(size_t, free ) -+ __field(size_t, total ) -+ ), -+ -+ TP_fast_assign( -+ memcpy(__entry->uuid, ca->uuid.b, 16); -+ __entry->free = free; -+ __entry->total = total; -+ ), -+ -+ TP_printk("%pU free %zu total %zu", -+ __entry->uuid, __entry->free, __entry->total) -+); -+ -+TRACE_EVENT(invalidate, -+ TP_PROTO(struct bch_dev *ca, u64 offset, unsigned sectors), -+ TP_ARGS(ca, offset, sectors), -+ -+ TP_STRUCT__entry( -+ __field(unsigned, sectors ) -+ __field(dev_t, dev ) -+ __field(__u64, offset ) -+ ), -+ -+ TP_fast_assign( -+ __entry->dev = ca->disk_sb.bdev->bd_dev; -+ __entry->offset = offset, -+ __entry->sectors = sectors; -+ ), -+ -+ TP_printk("invalidated %u sectors at %d,%d sector=%llu", -+ __entry->sectors, MAJOR(__entry->dev), -+ MINOR(__entry->dev), __entry->offset) -+); -+ -+DEFINE_EVENT(bch_fs, rescale_prios, -+ TP_PROTO(struct bch_fs *c), -+ TP_ARGS(c) -+); -+ -+DECLARE_EVENT_CLASS(bucket_alloc, -+ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), -+ TP_ARGS(ca, reserve), -+ -+ TP_STRUCT__entry( -+ __array(char, uuid, 16) -+ __field(enum alloc_reserve, reserve ) -+ ), -+ -+ TP_fast_assign( -+ memcpy(__entry->uuid, ca->uuid.b, 16); -+ __entry->reserve = reserve; -+ ), -+ -+ TP_printk("%pU reserve %d", __entry->uuid, __entry->reserve) -+); -+ -+DEFINE_EVENT(bucket_alloc, bucket_alloc, -+ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), -+ TP_ARGS(ca, reserve) -+); -+ -+DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, -+ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), -+ TP_ARGS(ca, reserve) -+); -+ -+DEFINE_EVENT(bucket_alloc, 
open_bucket_alloc_fail, -+ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), -+ TP_ARGS(ca, reserve) -+); -+ -+/* Moving IO */ -+ -+DEFINE_EVENT(bkey, move_extent, -+ TP_PROTO(const struct bkey *k), -+ TP_ARGS(k) -+); -+ -+DEFINE_EVENT(bkey, move_alloc_fail, -+ TP_PROTO(const struct bkey *k), -+ TP_ARGS(k) -+); -+ -+DEFINE_EVENT(bkey, move_race, -+ TP_PROTO(const struct bkey *k), -+ TP_ARGS(k) -+); -+ -+TRACE_EVENT(move_data, -+ TP_PROTO(struct bch_fs *c, u64 sectors_moved, -+ u64 keys_moved), -+ TP_ARGS(c, sectors_moved, keys_moved), -+ -+ TP_STRUCT__entry( -+ __array(char, uuid, 16 ) -+ __field(u64, sectors_moved ) -+ __field(u64, keys_moved ) -+ ), -+ -+ TP_fast_assign( -+ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); -+ __entry->sectors_moved = sectors_moved; -+ __entry->keys_moved = keys_moved; -+ ), -+ -+ TP_printk("%pU sectors_moved %llu keys_moved %llu", -+ __entry->uuid, __entry->sectors_moved, __entry->keys_moved) -+); -+ -+TRACE_EVENT(copygc, -+ TP_PROTO(struct bch_fs *c, -+ u64 sectors_moved, u64 sectors_not_moved, -+ u64 buckets_moved, u64 buckets_not_moved), -+ TP_ARGS(c, -+ sectors_moved, sectors_not_moved, -+ buckets_moved, buckets_not_moved), -+ -+ TP_STRUCT__entry( -+ __array(char, uuid, 16 ) -+ __field(u64, sectors_moved ) -+ __field(u64, sectors_not_moved ) -+ __field(u64, buckets_moved ) -+ __field(u64, buckets_not_moved ) -+ ), -+ -+ TP_fast_assign( -+ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); -+ __entry->sectors_moved = sectors_moved; -+ __entry->sectors_not_moved = sectors_not_moved; -+ __entry->buckets_moved = buckets_moved; -+ __entry->buckets_not_moved = buckets_moved; -+ ), -+ -+ TP_printk("%pU sectors moved %llu remain %llu buckets moved %llu remain %llu", -+ __entry->uuid, -+ __entry->sectors_moved, __entry->sectors_not_moved, -+ __entry->buckets_moved, __entry->buckets_not_moved) -+); -+ -+TRACE_EVENT(transaction_restart_ip, -+ TP_PROTO(unsigned long caller, unsigned long ip), -+ TP_ARGS(caller, ip), -+ -+ 
TP_STRUCT__entry( -+ __field(unsigned long, caller ) -+ __field(unsigned long, ip ) -+ ), -+ -+ TP_fast_assign( -+ __entry->caller = caller; -+ __entry->ip = ip; -+ ), -+ -+ TP_printk("%pF %pF", (void *) __entry->caller, (void *) __entry->ip) -+); -+ -+DECLARE_EVENT_CLASS(transaction_restart, -+ TP_PROTO(unsigned long ip), -+ TP_ARGS(ip), -+ -+ TP_STRUCT__entry( -+ __field(unsigned long, ip ) -+ ), -+ -+ TP_fast_assign( -+ __entry->ip = ip; -+ ), -+ -+ TP_printk("%pf", (void *) __entry->ip) -+); -+ -+DEFINE_EVENT(transaction_restart, trans_restart_btree_node_reused, -+ TP_PROTO(unsigned long ip), -+ TP_ARGS(ip) -+); -+ -+DEFINE_EVENT(transaction_restart, trans_restart_would_deadlock, -+ TP_PROTO(unsigned long ip), -+ TP_ARGS(ip) -+); -+ -+TRACE_EVENT(trans_restart_iters_realloced, -+ TP_PROTO(unsigned long ip, unsigned nr), -+ TP_ARGS(ip, nr), -+ -+ TP_STRUCT__entry( -+ __field(unsigned long, ip ) -+ __field(unsigned, nr ) -+ ), -+ -+ TP_fast_assign( -+ __entry->ip = ip; -+ __entry->nr = nr; -+ ), -+ -+ TP_printk("%pf nr %u", (void *) __entry->ip, __entry->nr) -+); -+ -+TRACE_EVENT(trans_restart_mem_realloced, -+ TP_PROTO(unsigned long ip, unsigned long bytes), -+ TP_ARGS(ip, bytes), -+ -+ TP_STRUCT__entry( -+ __field(unsigned long, ip ) -+ __field(unsigned long, bytes ) -+ ), -+ -+ TP_fast_assign( -+ __entry->ip = ip; -+ __entry->bytes = bytes; -+ ), -+ -+ TP_printk("%pf bytes %lu", (void *) __entry->ip, __entry->bytes) -+); -+ -+DEFINE_EVENT(transaction_restart, trans_restart_journal_res_get, -+ TP_PROTO(unsigned long ip), -+ TP_ARGS(ip) -+); -+ -+DEFINE_EVENT(transaction_restart, trans_restart_journal_preres_get, -+ TP_PROTO(unsigned long ip), -+ TP_ARGS(ip) -+); -+ -+DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas, -+ TP_PROTO(unsigned long ip), -+ TP_ARGS(ip) -+); -+ -+DEFINE_EVENT(transaction_restart, trans_restart_fault_inject, -+ TP_PROTO(unsigned long ip), -+ TP_ARGS(ip) -+); -+ -+DEFINE_EVENT(transaction_restart, 
trans_restart_btree_node_split, -+ TP_PROTO(unsigned long ip), -+ TP_ARGS(ip) -+); -+ -+DEFINE_EVENT(transaction_restart, trans_restart_mark, -+ TP_PROTO(unsigned long ip), -+ TP_ARGS(ip) -+); -+ -+DEFINE_EVENT(transaction_restart, trans_restart_upgrade, -+ TP_PROTO(unsigned long ip), -+ TP_ARGS(ip) -+); -+ -+DEFINE_EVENT(transaction_restart, trans_restart_iter_upgrade, -+ TP_PROTO(unsigned long ip), -+ TP_ARGS(ip) -+); -+ -+DEFINE_EVENT(transaction_restart, trans_restart_traverse, -+ TP_PROTO(unsigned long ip), -+ TP_ARGS(ip) -+); -+ -+DEFINE_EVENT(transaction_restart, trans_restart_atomic, -+ TP_PROTO(unsigned long ip), -+ TP_ARGS(ip) -+); -+ -+DECLARE_EVENT_CLASS(node_lock_fail, -+ TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), -+ TP_ARGS(level, iter_seq, node, node_seq), -+ -+ TP_STRUCT__entry( -+ __field(u32, level) -+ __field(u32, iter_seq) -+ __field(u32, node) -+ __field(u32, node_seq) -+ ), -+ -+ TP_fast_assign( -+ __entry->level = level; -+ __entry->iter_seq = iter_seq; -+ __entry->node = node; -+ __entry->node_seq = node_seq; -+ ), -+ -+ TP_printk("level %u iter seq %u node %u node seq %u", -+ __entry->level, __entry->iter_seq, -+ __entry->node, __entry->node_seq) -+); -+ -+DEFINE_EVENT(node_lock_fail, node_upgrade_fail, -+ TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), -+ TP_ARGS(level, iter_seq, node, node_seq) -+); -+ -+DEFINE_EVENT(node_lock_fail, node_relock_fail, -+ TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), -+ TP_ARGS(level, iter_seq, node, node_seq) -+); -+ -+#endif /* _TRACE_BCACHE_H */ -+ -+/* This part must be outside protection */ -+#include -diff --git a/init/init_task.c b/init/init_task.c -index f6889fce64af..94706c45bb6a 100644 ---- a/init/init_task.c -+++ b/init/init_task.c -@@ -84,6 +84,7 @@ struct task_struct init_task - .nr_cpus_allowed= NR_CPUS, - .mm = NULL, - .active_mm = &init_mm, -+ .faults_disabled_mapping = NULL, - .restart_block = { - .fn = 
do_no_restart_syscall, - }, -diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks -index 3de8fd11873b..ab8aa082ce56 100644 ---- a/kernel/Kconfig.locks -+++ b/kernel/Kconfig.locks -@@ -259,3 +259,6 @@ config ARCH_HAS_MMIOWB - config MMIOWB - def_bool y if ARCH_HAS_MMIOWB - depends on SMP -+ -+config SIXLOCKS -+ bool -diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile -index 6d11cfb9b41f..4c13937e8f37 100644 ---- a/kernel/locking/Makefile -+++ b/kernel/locking/Makefile -@@ -32,3 +32,4 @@ obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o - obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o - obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o - obj-$(CONFIG_LOCK_EVENT_COUNTS) += lock_events.o -+obj-$(CONFIG_SIXLOCKS) += six.o -diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h -index b0be1560ed17..6388e42cfd68 100644 ---- a/kernel/locking/lockdep_internals.h -+++ b/kernel/locking/lockdep_internals.h -@@ -98,7 +98,7 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = - #else - #define MAX_LOCKDEP_ENTRIES 32768UL - --#define MAX_LOCKDEP_CHAINS_BITS 16 -+#define MAX_LOCKDEP_CHAINS_BITS 18 - - /* - * Stack-trace: tightly packed array of stack backtrace -@@ -116,7 +116,7 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = - - #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) - --#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) -+#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*10) - - extern struct list_head all_lock_classes; - extern struct lock_chain lock_chains[]; -diff --git a/kernel/locking/six.c b/kernel/locking/six.c -new file mode 100644 -index 000000000000..49d46ed2e18e ---- /dev/null -+++ b/kernel/locking/six.c -@@ -0,0 +1,553 @@ -+// SPDX-License-Identifier: GPL-2.0 -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#ifdef DEBUG -+#define EBUG_ON(cond) BUG_ON(cond) -+#else -+#define EBUG_ON(cond) do {} while (0) -+#endif -+ -+#define six_acquire(l, t) 
lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_) -+#define six_release(l) lock_release(l, _RET_IP_) -+ -+struct six_lock_vals { -+ /* Value we add to the lock in order to take the lock: */ -+ u64 lock_val; -+ -+ /* If the lock has this value (used as a mask), taking the lock fails: */ -+ u64 lock_fail; -+ -+ /* Value we add to the lock in order to release the lock: */ -+ u64 unlock_val; -+ -+ /* Mask that indicates lock is held for this type: */ -+ u64 held_mask; -+ -+ /* Waitlist we wakeup when releasing the lock: */ -+ enum six_lock_type unlock_wakeup; -+}; -+ -+#define __SIX_LOCK_HELD_read __SIX_VAL(read_lock, ~0) -+#define __SIX_LOCK_HELD_intent __SIX_VAL(intent_lock, ~0) -+#define __SIX_LOCK_HELD_write __SIX_VAL(seq, 1) -+ -+#define LOCK_VALS { \ -+ [SIX_LOCK_read] = { \ -+ .lock_val = __SIX_VAL(read_lock, 1), \ -+ .lock_fail = __SIX_LOCK_HELD_write, \ -+ .unlock_val = -__SIX_VAL(read_lock, 1), \ -+ .held_mask = __SIX_LOCK_HELD_read, \ -+ .unlock_wakeup = SIX_LOCK_write, \ -+ }, \ -+ [SIX_LOCK_intent] = { \ -+ .lock_val = __SIX_VAL(intent_lock, 1), \ -+ .lock_fail = __SIX_LOCK_HELD_intent, \ -+ .unlock_val = -__SIX_VAL(intent_lock, 1), \ -+ .held_mask = __SIX_LOCK_HELD_intent, \ -+ .unlock_wakeup = SIX_LOCK_intent, \ -+ }, \ -+ [SIX_LOCK_write] = { \ -+ .lock_val = __SIX_VAL(seq, 1), \ -+ .lock_fail = __SIX_LOCK_HELD_read, \ -+ .unlock_val = __SIX_VAL(seq, 1), \ -+ .held_mask = __SIX_LOCK_HELD_write, \ -+ .unlock_wakeup = SIX_LOCK_read, \ -+ }, \ -+} -+ -+static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type, -+ union six_lock_state old) -+{ -+ if (type != SIX_LOCK_intent) -+ return; -+ -+ if (!old.intent_lock) { -+ EBUG_ON(lock->owner); -+ lock->owner = current; -+ } else { -+ EBUG_ON(lock->owner != current); -+ } -+} -+ -+static __always_inline bool do_six_trylock_type(struct six_lock *lock, -+ enum six_lock_type type) -+{ -+ const struct six_lock_vals l[] = LOCK_VALS; -+ union six_lock_state old; -+ u64 v = READ_ONCE(lock->state.v); -+ 
-+ EBUG_ON(type == SIX_LOCK_write && lock->owner != current); -+ -+ do { -+ old.v = v; -+ -+ EBUG_ON(type == SIX_LOCK_write && -+ ((old.v & __SIX_LOCK_HELD_write) || -+ !(old.v & __SIX_LOCK_HELD_intent))); -+ -+ if (old.v & l[type].lock_fail) -+ return false; -+ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, -+ old.v, -+ old.v + l[type].lock_val)) != old.v); -+ -+ six_set_owner(lock, type, old); -+ return true; -+} -+ -+__always_inline __flatten -+static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type) -+{ -+ if (!do_six_trylock_type(lock, type)) -+ return false; -+ -+ if (type != SIX_LOCK_write) -+ six_acquire(&lock->dep_map, 1); -+ return true; -+} -+ -+__always_inline __flatten -+static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type, -+ unsigned seq) -+{ -+ const struct six_lock_vals l[] = LOCK_VALS; -+ union six_lock_state old; -+ u64 v = READ_ONCE(lock->state.v); -+ -+ do { -+ old.v = v; -+ -+ if (old.seq != seq || old.v & l[type].lock_fail) -+ return false; -+ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, -+ old.v, -+ old.v + l[type].lock_val)) != old.v); -+ -+ six_set_owner(lock, type, old); -+ if (type != SIX_LOCK_write) -+ six_acquire(&lock->dep_map, 1); -+ return true; -+} -+ -+struct six_lock_waiter { -+ struct list_head list; -+ struct task_struct *task; -+}; -+ -+/* This is probably up there with the more evil things I've done */ -+#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l)) -+ -+#ifdef CONFIG_LOCK_SPIN_ON_OWNER -+ -+static inline int six_can_spin_on_owner(struct six_lock *lock) -+{ -+ struct task_struct *owner; -+ int retval = 1; -+ -+ if (need_resched()) -+ return 0; -+ -+ rcu_read_lock(); -+ owner = READ_ONCE(lock->owner); -+ if (owner) -+ retval = owner->on_cpu; -+ rcu_read_unlock(); -+ /* -+ * if lock->owner is not set, the mutex owner may have just acquired -+ * it and not set the owner yet or the mutex has been released. 
-+ */ -+ return retval; -+} -+ -+static inline bool six_spin_on_owner(struct six_lock *lock, -+ struct task_struct *owner) -+{ -+ bool ret = true; -+ -+ rcu_read_lock(); -+ while (lock->owner == owner) { -+ /* -+ * Ensure we emit the owner->on_cpu, dereference _after_ -+ * checking lock->owner still matches owner. If that fails, -+ * owner might point to freed memory. If it still matches, -+ * the rcu_read_lock() ensures the memory stays valid. -+ */ -+ barrier(); -+ -+ if (!owner->on_cpu || need_resched()) { -+ ret = false; -+ break; -+ } -+ -+ cpu_relax(); -+ } -+ rcu_read_unlock(); -+ -+ return ret; -+} -+ -+static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) -+{ -+ struct task_struct *task = current; -+ -+ if (type == SIX_LOCK_write) -+ return false; -+ -+ preempt_disable(); -+ if (!six_can_spin_on_owner(lock)) -+ goto fail; -+ -+ if (!osq_lock(&lock->osq)) -+ goto fail; -+ -+ while (1) { -+ struct task_struct *owner; -+ -+ /* -+ * If there's an owner, wait for it to either -+ * release the lock or go to sleep. -+ */ -+ owner = READ_ONCE(lock->owner); -+ if (owner && !six_spin_on_owner(lock, owner)) -+ break; -+ -+ if (do_six_trylock_type(lock, type)) { -+ osq_unlock(&lock->osq); -+ preempt_enable(); -+ return true; -+ } -+ -+ /* -+ * When there's no owner, we might have preempted between the -+ * owner acquiring the lock and setting the owner field. If -+ * we're an RT task that will live-lock because we won't let -+ * the owner complete. -+ */ -+ if (!owner && (need_resched() || rt_task(task))) -+ break; -+ -+ /* -+ * The cpu_relax() call is a compiler barrier which forces -+ * everything in this loop to be re-loaded. We don't need -+ * memory barriers as we'll eventually observe the right -+ * values at the cost of a few extra spins. 
-+ */ -+ cpu_relax(); -+ } -+ -+ osq_unlock(&lock->osq); -+fail: -+ preempt_enable(); -+ -+ /* -+ * If we fell out of the spin path because of need_resched(), -+ * reschedule now, before we try-lock again. This avoids getting -+ * scheduled out right after we obtained the lock. -+ */ -+ if (need_resched()) -+ schedule(); -+ -+ return false; -+} -+ -+#else /* CONFIG_LOCK_SPIN_ON_OWNER */ -+ -+static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) -+{ -+ return false; -+} -+ -+#endif -+ -+noinline -+static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type, -+ six_lock_should_sleep_fn should_sleep_fn, void *p) -+{ -+ const struct six_lock_vals l[] = LOCK_VALS; -+ union six_lock_state old, new; -+ struct six_lock_waiter wait; -+ int ret = 0; -+ u64 v; -+ -+ ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0; -+ if (ret) -+ return ret; -+ -+ if (six_optimistic_spin(lock, type)) -+ return 0; -+ -+ lock_contended(&lock->dep_map, _RET_IP_); -+ -+ INIT_LIST_HEAD(&wait.list); -+ wait.task = current; -+ -+ while (1) { -+ set_current_state(TASK_UNINTERRUPTIBLE); -+ if (type == SIX_LOCK_write) -+ EBUG_ON(lock->owner != current); -+ else if (list_empty_careful(&wait.list)) { -+ raw_spin_lock(&lock->wait_lock); -+ list_add_tail(&wait.list, &lock->wait_list[type]); -+ raw_spin_unlock(&lock->wait_lock); -+ } -+ -+ ret = should_sleep_fn ? 
should_sleep_fn(lock, p) : 0; -+ if (ret) -+ break; -+ -+ v = READ_ONCE(lock->state.v); -+ do { -+ new.v = old.v = v; -+ -+ if (!(old.v & l[type].lock_fail)) -+ new.v += l[type].lock_val; -+ else if (!(new.waiters & (1 << type))) -+ new.waiters |= 1 << type; -+ else -+ break; /* waiting bit already set */ -+ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, -+ old.v, new.v)) != old.v); -+ -+ if (!(old.v & l[type].lock_fail)) -+ break; -+ -+ schedule(); -+ } -+ -+ if (!ret) -+ six_set_owner(lock, type, old); -+ -+ __set_current_state(TASK_RUNNING); -+ -+ if (!list_empty_careful(&wait.list)) { -+ raw_spin_lock(&lock->wait_lock); -+ list_del_init(&wait.list); -+ raw_spin_unlock(&lock->wait_lock); -+ } -+ -+ return ret; -+} -+ -+__always_inline -+static int __six_lock_type(struct six_lock *lock, enum six_lock_type type, -+ six_lock_should_sleep_fn should_sleep_fn, void *p) -+{ -+ int ret; -+ -+ if (type != SIX_LOCK_write) -+ six_acquire(&lock->dep_map, 0); -+ -+ ret = do_six_trylock_type(lock, type) ? 
0 -+ : __six_lock_type_slowpath(lock, type, should_sleep_fn, p); -+ -+ if (ret && type != SIX_LOCK_write) -+ six_release(&lock->dep_map); -+ if (!ret) -+ lock_acquired(&lock->dep_map, _RET_IP_); -+ -+ return ret; -+} -+ -+static inline void six_lock_wakeup(struct six_lock *lock, -+ union six_lock_state state, -+ unsigned waitlist_id) -+{ -+ struct list_head *wait_list = &lock->wait_list[waitlist_id]; -+ struct six_lock_waiter *w, *next; -+ -+ if (waitlist_id == SIX_LOCK_write && state.read_lock) -+ return; -+ -+ if (!(state.waiters & (1 << waitlist_id))) -+ return; -+ -+ clear_bit(waitlist_bitnr(waitlist_id), -+ (unsigned long *) &lock->state.v); -+ -+ if (waitlist_id == SIX_LOCK_write) { -+ struct task_struct *p = READ_ONCE(lock->owner); -+ -+ if (p) -+ wake_up_process(p); -+ return; -+ } -+ -+ raw_spin_lock(&lock->wait_lock); -+ -+ list_for_each_entry_safe(w, next, wait_list, list) { -+ list_del_init(&w->list); -+ -+ if (wake_up_process(w->task) && -+ waitlist_id != SIX_LOCK_read) { -+ if (!list_empty(wait_list)) -+ set_bit(waitlist_bitnr(waitlist_id), -+ (unsigned long *) &lock->state.v); -+ break; -+ } -+ } -+ -+ raw_spin_unlock(&lock->wait_lock); -+} -+ -+__always_inline __flatten -+static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type) -+{ -+ const struct six_lock_vals l[] = LOCK_VALS; -+ union six_lock_state state; -+ -+ EBUG_ON(!(lock->state.v & l[type].held_mask)); -+ EBUG_ON(type == SIX_LOCK_write && -+ !(lock->state.v & __SIX_LOCK_HELD_intent)); -+ -+ if (type != SIX_LOCK_write) -+ six_release(&lock->dep_map); -+ -+ if (type == SIX_LOCK_intent) { -+ EBUG_ON(lock->owner != current); -+ -+ if (lock->intent_lock_recurse) { -+ --lock->intent_lock_recurse; -+ return; -+ } -+ -+ lock->owner = NULL; -+ } -+ -+ state.v = atomic64_add_return_release(l[type].unlock_val, -+ &lock->state.counter); -+ six_lock_wakeup(lock, state, l[type].unlock_wakeup); -+} -+ -+#define __SIX_LOCK(type) \ -+bool six_trylock_##type(struct six_lock *lock) \ -+{ \ 
-+ return __six_trylock_type(lock, SIX_LOCK_##type); \ -+} \ -+EXPORT_SYMBOL_GPL(six_trylock_##type); \ -+ \ -+bool six_relock_##type(struct six_lock *lock, u32 seq) \ -+{ \ -+ return __six_relock_type(lock, SIX_LOCK_##type, seq); \ -+} \ -+EXPORT_SYMBOL_GPL(six_relock_##type); \ -+ \ -+int six_lock_##type(struct six_lock *lock, \ -+ six_lock_should_sleep_fn should_sleep_fn, void *p) \ -+{ \ -+ return __six_lock_type(lock, SIX_LOCK_##type, should_sleep_fn, p);\ -+} \ -+EXPORT_SYMBOL_GPL(six_lock_##type); \ -+ \ -+void six_unlock_##type(struct six_lock *lock) \ -+{ \ -+ __six_unlock_type(lock, SIX_LOCK_##type); \ -+} \ -+EXPORT_SYMBOL_GPL(six_unlock_##type); -+ -+__SIX_LOCK(read) -+__SIX_LOCK(intent) -+__SIX_LOCK(write) -+ -+#undef __SIX_LOCK -+ -+/* Convert from intent to read: */ -+void six_lock_downgrade(struct six_lock *lock) -+{ -+ six_lock_increment(lock, SIX_LOCK_read); -+ six_unlock_intent(lock); -+} -+EXPORT_SYMBOL_GPL(six_lock_downgrade); -+ -+bool six_lock_tryupgrade(struct six_lock *lock) -+{ -+ const struct six_lock_vals l[] = LOCK_VALS; -+ union six_lock_state old, new; -+ u64 v = READ_ONCE(lock->state.v); -+ -+ do { -+ new.v = old.v = v; -+ -+ EBUG_ON(!(old.v & l[SIX_LOCK_read].held_mask)); -+ -+ new.v += l[SIX_LOCK_read].unlock_val; -+ -+ if (new.v & l[SIX_LOCK_intent].lock_fail) -+ return false; -+ -+ new.v += l[SIX_LOCK_intent].lock_val; -+ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, -+ old.v, new.v)) != old.v); -+ -+ six_set_owner(lock, SIX_LOCK_intent, old); -+ six_lock_wakeup(lock, new, l[SIX_LOCK_read].unlock_wakeup); -+ -+ return true; -+} -+EXPORT_SYMBOL_GPL(six_lock_tryupgrade); -+ -+bool six_trylock_convert(struct six_lock *lock, -+ enum six_lock_type from, -+ enum six_lock_type to) -+{ -+ EBUG_ON(to == SIX_LOCK_write || from == SIX_LOCK_write); -+ -+ if (to == from) -+ return true; -+ -+ if (to == SIX_LOCK_read) { -+ six_lock_downgrade(lock); -+ return true; -+ } else { -+ return six_lock_tryupgrade(lock); -+ } -+} 
-+EXPORT_SYMBOL_GPL(six_trylock_convert); -+ -+/* -+ * Increment read/intent lock count, assuming we already have it read or intent -+ * locked: -+ */ -+void six_lock_increment(struct six_lock *lock, enum six_lock_type type) -+{ -+ const struct six_lock_vals l[] = LOCK_VALS; -+ -+ EBUG_ON(type == SIX_LOCK_write); -+ six_acquire(&lock->dep_map, 0); -+ -+ /* XXX: assert already locked, and that we don't overflow: */ -+ -+ switch (type) { -+ case SIX_LOCK_read: -+ atomic64_add(l[type].lock_val, &lock->state.counter); -+ break; -+ case SIX_LOCK_intent: -+ lock->intent_lock_recurse++; -+ break; -+ case SIX_LOCK_write: -+ BUG(); -+ break; -+ } -+} -+EXPORT_SYMBOL_GPL(six_lock_increment); -+ -+void six_lock_wakeup_all(struct six_lock *lock) -+{ -+ struct six_lock_waiter *w; -+ -+ raw_spin_lock(&lock->wait_lock); -+ -+ list_for_each_entry(w, &lock->wait_list[0], list) -+ wake_up_process(w->task); -+ list_for_each_entry(w, &lock->wait_list[1], list) -+ wake_up_process(w->task); -+ -+ raw_spin_unlock(&lock->wait_lock); -+} -+EXPORT_SYMBOL_GPL(six_lock_wakeup_all); -diff --git a/kernel/module.c b/kernel/module.c -index 1c5cff34d9f2..8f9f37b0bfaa 100644 ---- a/kernel/module.c -+++ b/kernel/module.c -@@ -2830,9 +2830,7 @@ static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug) - - void * __weak module_alloc(unsigned long size) - { -- return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, -- GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, -- NUMA_NO_NODE, __builtin_return_address(0)); -+ return vmalloc_exec(size, GFP_KERNEL); - } - - bool __weak module_init_section(const char *name) -diff --git a/lib/Kconfig b/lib/Kconfig -index b4b98a03ff98..7ec0b400c545 100644 ---- a/lib/Kconfig -+++ b/lib/Kconfig -@@ -461,6 +461,9 @@ config ASSOCIATIVE_ARRAY - - for more information. 
- -+config CLOSURES -+ bool -+ - config HAS_IOMEM - bool - depends on !NO_IOMEM -diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug -index 0c781f912f9f..efe645766784 100644 ---- a/lib/Kconfig.debug -+++ b/lib/Kconfig.debug -@@ -1517,6 +1517,15 @@ config DEBUG_CREDENTIALS - - source "kernel/rcu/Kconfig.debug" - -+config DEBUG_CLOSURES -+ bool "Debug closures (bcache async widgits)" -+ depends on CLOSURES -+ select DEBUG_FS -+ help -+ Keeps all active closures in a linked list and provides a debugfs -+ interface to list them, which makes it possible to see asynchronous -+ operations that get stuck. -+ - config DEBUG_WQ_FORCE_RR_CPU - bool "Force round-robin CPU selection for unbound work items" - depends on DEBUG_KERNEL -diff --git a/lib/Makefile b/lib/Makefile -index a4a4c6864f51..dfefe98c29ec 100644 ---- a/lib/Makefile -+++ b/lib/Makefile -@@ -234,6 +234,8 @@ obj-$(CONFIG_ATOMIC64_SELFTEST) += atomic64_test.o - - obj-$(CONFIG_CPU_RMAP) += cpu_rmap.o - -+obj-$(CONFIG_CLOSURES) += closure.o -+ - obj-$(CONFIG_DQL) += dynamic_queue_limits.o - - obj-$(CONFIG_GLOB) += glob.o -diff --git a/lib/closure.c b/lib/closure.c -new file mode 100644 -index 000000000000..3e6366c26209 ---- /dev/null -+++ b/lib/closure.c -@@ -0,0 +1,214 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* -+ * Asynchronous refcounty things -+ * -+ * Copyright 2010, 2011 Kent Overstreet -+ * Copyright 2012 Google, Inc. 
-+ */ -+ -+#include -+#include -+#include -+#include -+#include -+ -+static inline void closure_put_after_sub(struct closure *cl, int flags) -+{ -+ int r = flags & CLOSURE_REMAINING_MASK; -+ -+ BUG_ON(flags & CLOSURE_GUARD_MASK); -+ BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); -+ -+ if (!r) { -+ if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { -+ atomic_set(&cl->remaining, -+ CLOSURE_REMAINING_INITIALIZER); -+ closure_queue(cl); -+ } else { -+ struct closure *parent = cl->parent; -+ closure_fn *destructor = cl->fn; -+ -+ closure_debug_destroy(cl); -+ -+ if (destructor) -+ destructor(cl); -+ -+ if (parent) -+ closure_put(parent); -+ } -+ } -+} -+ -+/* For clearing flags with the same atomic op as a put */ -+void closure_sub(struct closure *cl, int v) -+{ -+ closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); -+} -+EXPORT_SYMBOL(closure_sub); -+ -+/* -+ * closure_put - decrement a closure's refcount -+ */ -+void closure_put(struct closure *cl) -+{ -+ closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); -+} -+EXPORT_SYMBOL(closure_put); -+ -+/* -+ * closure_wake_up - wake up all closures on a wait list, without memory barrier -+ */ -+void __closure_wake_up(struct closure_waitlist *wait_list) -+{ -+ struct llist_node *list; -+ struct closure *cl, *t; -+ struct llist_node *reverse = NULL; -+ -+ list = llist_del_all(&wait_list->list); -+ -+ /* We first reverse the list to preserve FIFO ordering and fairness */ -+ reverse = llist_reverse_order(list); -+ -+ /* Then do the wakeups */ -+ llist_for_each_entry_safe(cl, t, reverse, list) { -+ closure_set_waiting(cl, 0); -+ closure_sub(cl, CLOSURE_WAITING + 1); -+ } -+} -+EXPORT_SYMBOL(__closure_wake_up); -+ -+/** -+ * closure_wait - add a closure to a waitlist -+ * @waitlist: will own a ref on @cl, which will be released when -+ * closure_wake_up() is called on @waitlist. -+ * @cl: closure pointer. 
-+ * -+ */ -+bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) -+{ -+ if (atomic_read(&cl->remaining) & CLOSURE_WAITING) -+ return false; -+ -+ closure_set_waiting(cl, _RET_IP_); -+ atomic_add(CLOSURE_WAITING + 1, &cl->remaining); -+ llist_add(&cl->list, &waitlist->list); -+ -+ return true; -+} -+EXPORT_SYMBOL(closure_wait); -+ -+struct closure_syncer { -+ struct task_struct *task; -+ int done; -+}; -+ -+static void closure_sync_fn(struct closure *cl) -+{ -+ struct closure_syncer *s = cl->s; -+ struct task_struct *p; -+ -+ rcu_read_lock(); -+ p = READ_ONCE(s->task); -+ s->done = 1; -+ wake_up_process(p); -+ rcu_read_unlock(); -+} -+ -+void __sched __closure_sync(struct closure *cl) -+{ -+ struct closure_syncer s = { .task = current }; -+ -+ cl->s = &s; -+ continue_at(cl, closure_sync_fn, NULL); -+ -+ while (1) { -+ set_current_state(TASK_UNINTERRUPTIBLE); -+ if (s.done) -+ break; -+ schedule(); -+ } -+ -+ __set_current_state(TASK_RUNNING); -+} -+EXPORT_SYMBOL(__closure_sync); -+ -+#ifdef CONFIG_DEBUG_CLOSURES -+ -+static LIST_HEAD(closure_list); -+static DEFINE_SPINLOCK(closure_list_lock); -+ -+void closure_debug_create(struct closure *cl) -+{ -+ unsigned long flags; -+ -+ BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE); -+ cl->magic = CLOSURE_MAGIC_ALIVE; -+ -+ spin_lock_irqsave(&closure_list_lock, flags); -+ list_add(&cl->all, &closure_list); -+ spin_unlock_irqrestore(&closure_list_lock, flags); -+} -+EXPORT_SYMBOL(closure_debug_create); -+ -+void closure_debug_destroy(struct closure *cl) -+{ -+ unsigned long flags; -+ -+ BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE); -+ cl->magic = CLOSURE_MAGIC_DEAD; -+ -+ spin_lock_irqsave(&closure_list_lock, flags); -+ list_del(&cl->all); -+ spin_unlock_irqrestore(&closure_list_lock, flags); -+} -+EXPORT_SYMBOL(closure_debug_destroy); -+ -+static int debug_seq_show(struct seq_file *f, void *data) -+{ -+ struct closure *cl; -+ -+ spin_lock_irq(&closure_list_lock); -+ -+ list_for_each_entry(cl, &closure_list, all) { 
-+ int r = atomic_read(&cl->remaining); -+ -+ seq_printf(f, "%p: %pS -> %pS p %p r %i ", -+ cl, (void *) cl->ip, cl->fn, cl->parent, -+ r & CLOSURE_REMAINING_MASK); -+ -+ seq_printf(f, "%s%s\n", -+ test_bit(WORK_STRUCT_PENDING_BIT, -+ work_data_bits(&cl->work)) ? "Q" : "", -+ r & CLOSURE_RUNNING ? "R" : ""); -+ -+ if (r & CLOSURE_WAITING) -+ seq_printf(f, " W %pS\n", -+ (void *) cl->waiting_on); -+ -+ seq_puts(f, "\n"); -+ } -+ -+ spin_unlock_irq(&closure_list_lock); -+ return 0; -+} -+ -+static int debug_seq_open(struct inode *inode, struct file *file) -+{ -+ return single_open(file, debug_seq_show, NULL); -+} -+ -+static const struct file_operations debug_ops = { -+ .owner = THIS_MODULE, -+ .open = debug_seq_open, -+ .read = seq_read, -+ .release = single_release -+}; -+ -+static int __init closure_debug_init(void) -+{ -+ debugfs_create_file("closures", 0400, NULL, NULL, &debug_ops); -+ return 0; -+} -+late_initcall(closure_debug_init) -+ -+#endif -diff --git a/mm/filemap.c b/mm/filemap.c -index 99c49eeae71b..a5a07767a2eb 100644 ---- a/mm/filemap.c -+++ b/mm/filemap.c -@@ -117,6 +117,69 @@ - * ->tasklist_lock (memory_failure, collect_procs_ao) - */ - -+static int page_cache_tree_insert_vec(struct page *pages[], -+ unsigned nr_pages, -+ struct address_space *mapping, -+ pgoff_t index, -+ gfp_t gfp_mask, -+ void *shadow[]) -+{ -+ XA_STATE(xas, &mapping->i_pages, index); -+ void *old; -+ int i = 0, error = 0; -+ -+ mapping_set_update(&xas, mapping); -+ -+ if (!nr_pages) -+ return 0; -+ -+ xa_lock_irq(&mapping->i_pages); -+ -+ while (1) { -+ old = xas_load(&xas); -+ if (old && !xa_is_value(old)) { -+ error = -EEXIST; -+ break; -+ } -+ -+ xas_store(&xas, pages[i]); -+ error = xas_error(&xas); -+ -+ if (error == -ENOMEM) { -+ xa_unlock_irq(&mapping->i_pages); -+ if (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK)) -+ error = 0; -+ xa_lock_irq(&mapping->i_pages); -+ -+ if (!error) -+ continue; -+ break; -+ } -+ -+ if (error) -+ break; -+ -+ if (shadow) -+ shadow[i] = old; 
-+ if (xa_is_value(old)) -+ mapping->nrexceptional--; -+ mapping->nrpages++; -+ -+ /* hugetlb pages do not participate in page cache accounting. */ -+ if (!PageHuge(pages[i])) -+ __inc_lruvec_page_state(pages[i], NR_FILE_PAGES); -+ -+ if (++i == nr_pages) -+ break; -+ -+ xas_next(&xas); -+ } -+ -+ xa_unlock_irq(&mapping->i_pages); -+ -+ return i ?: error; -+} -+ - static void page_cache_delete(struct address_space *mapping, - struct page *page, void *shadow) - { -@@ -827,114 +890,148 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) - } - EXPORT_SYMBOL_GPL(replace_page_cache_page); - --static int __add_to_page_cache_locked(struct page *page, -- struct address_space *mapping, -- pgoff_t offset, gfp_t gfp_mask, -- void **shadowp) -+static int add_to_page_cache_vec(struct page **pages, unsigned nr_pages, -+ struct address_space *mapping, -+ pgoff_t index, gfp_t gfp_mask, -+ void *shadow[]) - { -- XA_STATE(xas, &mapping->i_pages, offset); -- int huge = PageHuge(page); -- int error; -- void *old; -+ int i, nr_added = 0, error = 0; - -- VM_BUG_ON_PAGE(!PageLocked(page), page); -- VM_BUG_ON_PAGE(PageSwapBacked(page), page); -- mapping_set_update(&xas, mapping); -+ for (i = 0; i < nr_pages; i++) { -+ struct page *page = pages[i]; - -- get_page(page); -- page->mapping = mapping; -- page->index = offset; -+ VM_BUG_ON_PAGE(PageSwapBacked(page), page); -+ VM_BUG_ON_PAGE(PageSwapCache(page), page); - -- if (!huge) { -- error = mem_cgroup_charge(page, current->mm, gfp_mask); -- if (error) -- goto error; -+ __SetPageLocked(page); -+ get_page(page); -+ page->mapping = mapping; -+ page->index = index + i; -+ -+ if (!PageHuge(page)) { -+ error = mem_cgroup_charge(page, current->mm, gfp_mask); -+ if (error) { -+ page->mapping = NULL; -+ /* Leave page->index set: truncation relies upon it */ -+ put_page(page); -+ __ClearPageLocked(page); -+ if (!i) -+ return error; -+ nr_pages = i; -+ break; -+ } -+ } - } - -- do { -- xas_lock_irq(&xas); -- old = 
xas_load(&xas); -- if (old && !xa_is_value(old)) -- xas_set_err(&xas, -EEXIST); -- xas_store(&xas, page); -- if (xas_error(&xas)) -- goto unlock; -+ error = page_cache_tree_insert_vec(pages, nr_pages, mapping, -+ index, gfp_mask, shadow); -+ if (error > 0) { -+ nr_added = error; -+ error = 0; -+ } - -- if (xa_is_value(old)) { -- mapping->nrexceptional--; -- if (shadowp) -- *shadowp = old; -- } -- mapping->nrpages++; -+ for (i = 0; i < nr_added; i++) -+ trace_mm_filemap_add_to_page_cache(pages[i]); - -- /* hugetlb pages do not participate in page cache accounting */ -- if (!huge) -- __inc_lruvec_page_state(page, NR_FILE_PAGES); --unlock: -- xas_unlock_irq(&xas); -- } while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK)); -+ for (i = nr_added; i < nr_pages; i++) { -+ struct page *page = pages[i]; - -- if (xas_error(&xas)) { -- error = xas_error(&xas); -- goto error; -+ /* Leave page->index set: truncation relies upon it */ -+ page->mapping = NULL; -+ put_page(page); -+ __ClearPageLocked(page); - } - -- trace_mm_filemap_add_to_page_cache(page); -- return 0; --error: -- page->mapping = NULL; -- /* Leave page->index set: truncation relies upon it */ -- put_page(page); -- return error; -+ return nr_added ?: error; - } --ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO); -+ALLOW_ERROR_INJECTION(__add_to_page_cache, ERRNO); - - /** -- * add_to_page_cache_locked - add a locked page to the pagecache -+ * add_to_page_cache - add a newly allocated page to the pagecache - * @page: page to add - * @mapping: the page's address_space - * @offset: page index - * @gfp_mask: page allocation mode - * -- * This function is used to add a page to the pagecache. It must be locked. -- * This function does not add the page to the LRU. The caller must do that. -+ * This function is used to add a page to the pagecache. It must be newly -+ * allocated. This function does not add the page to the LRU. The caller must -+ * do that. 
- * - * Return: %0 on success, negative error code otherwise. - */ --int add_to_page_cache_locked(struct page *page, struct address_space *mapping, -- pgoff_t offset, gfp_t gfp_mask) -+int add_to_page_cache(struct page *page, struct address_space *mapping, -+ pgoff_t offset, gfp_t gfp_mask) - { -- return __add_to_page_cache_locked(page, mapping, offset, -- gfp_mask, NULL); -+ int ret = add_to_page_cache_vec(&page, 1, mapping, offset, -+ gfp_mask, NULL); -+ if (ret < 0) -+ return ret; -+ return 0; - } --EXPORT_SYMBOL(add_to_page_cache_locked); -+EXPORT_SYMBOL(add_to_page_cache); -+ALLOW_ERROR_INJECTION(add_to_page_cache, ERRNO); - --int add_to_page_cache_lru(struct page *page, struct address_space *mapping, -- pgoff_t offset, gfp_t gfp_mask) -+int add_to_page_cache_lru_vec(struct address_space *mapping, -+ struct page **pages, -+ unsigned nr_pages, -+ pgoff_t offset, gfp_t gfp_mask) - { -- void *shadow = NULL; -- int ret; -+ void *shadow_stack[8], **shadow = shadow_stack; -+ int i, ret = 0, err = 0, nr_added; -+ -+ if (nr_pages > ARRAY_SIZE(shadow_stack)) { -+ shadow = kmalloc_array(nr_pages, sizeof(void *), gfp_mask); -+ if (!shadow) -+ goto slowpath; -+ } -+ -+ for (i = 0; i < nr_pages; i++) -+ VM_BUG_ON_PAGE(PageActive(pages[i]), pages[i]); -+ -+ ret = add_to_page_cache_vec(pages, nr_pages, mapping, -+ offset, gfp_mask, shadow); -+ nr_added = ret > 0 ? ret : 0; -+ -+ /* -+ * The page might have been evicted from cache only recently, in which -+ * case it should be activated like any other repeatedly accessed page. -+ * The exception is pages getting rewritten; evicting other data from -+ * the working set, only to cache data that will get overwritten with -+ * something else, is a waste of memory. 
-+ */ -+ for (i = 0; i < nr_added; i++) { -+ struct page *page = pages[i]; -+ void *s = shadow[i]; - -- __SetPageLocked(page); -- ret = __add_to_page_cache_locked(page, mapping, offset, -- gfp_mask, &shadow); -- if (unlikely(ret)) -- __ClearPageLocked(page); -- else { -- /* -- * The page might have been evicted from cache only -- * recently, in which case it should be activated like -- * any other repeatedly accessed page. -- * The exception is pages getting rewritten; evicting other -- * data from the working set, only to cache data that will -- * get overwritten with something else, is a waste of memory. -- */ - WARN_ON_ONCE(PageActive(page)); -- if (!(gfp_mask & __GFP_WRITE) && shadow) -- workingset_refault(page, shadow); -+ if (!(gfp_mask & __GFP_WRITE) && s) -+ workingset_refault(page, s); - lru_cache_add(page); - } -+ -+ if (shadow != shadow_stack) -+ kfree(shadow); -+ - return ret; -+slowpath: -+ for (i = 0; i < nr_pages; i++) { -+ err = add_to_page_cache_lru(pages[i], mapping, -+ offset + i, gfp_mask); -+ if (err) -+ break; -+ } -+ -+ return i ?: err; -+} -+EXPORT_SYMBOL_GPL(add_to_page_cache_lru_vec); -+ -+int add_to_page_cache_lru(struct page *page, struct address_space *mapping, -+ pgoff_t offset, gfp_t gfp_mask) -+{ -+ int ret = add_to_page_cache_lru_vec(mapping, &page, 1, offset, gfp_mask); -+ if (ret < 0) -+ return ret; -+ return 0; - } - EXPORT_SYMBOL_GPL(add_to_page_cache_lru); - -@@ -1990,6 +2087,7 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, - - return ret; - } -+EXPORT_SYMBOL(find_get_pages_range); - - /** - * find_get_pages_contig - gang contiguous pagecache lookup -@@ -2138,6 +2236,259 @@ static void shrink_readahead_size_eio(struct file_ra_state *ra) - ra->ra_pages /= 4; - } - -+static int lock_page_for_iocb(struct kiocb *iocb, struct page *page) -+{ -+ if (iocb->ki_flags & IOCB_WAITQ) -+ return lock_page_async(page, iocb->ki_waitq); -+ else if (iocb->ki_flags & IOCB_NOWAIT) -+ return trylock_page(page) ? 
0 : -EAGAIN; -+ else -+ return lock_page_killable(page); -+} -+ -+static struct page * -+generic_file_buffered_read_readpage(struct kiocb *iocb, -+ struct file *filp, -+ struct address_space *mapping, -+ struct page *page) -+{ -+ struct file_ra_state *ra = &filp->f_ra; -+ int error; -+ -+ if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) { -+ unlock_page(page); -+ put_page(page); -+ return ERR_PTR(-EAGAIN); -+ } -+ -+ /* -+ * A previous I/O error may have been due to temporary -+ * failures, eg. multipath errors. -+ * PG_error will be set again if readpage fails. -+ */ -+ ClearPageError(page); -+ /* Start the actual read. The read will unlock the page. */ -+ error = mapping->a_ops->readpage(filp, page); -+ -+ if (unlikely(error)) { -+ put_page(page); -+ return error != AOP_TRUNCATED_PAGE ? ERR_PTR(error) : NULL; -+ } -+ -+ if (!PageUptodate(page)) { -+ error = lock_page_for_iocb(iocb, page); -+ if (unlikely(error)) { -+ put_page(page); -+ return ERR_PTR(error); -+ } -+ if (!PageUptodate(page)) { -+ if (page->mapping == NULL) { -+ /* -+ * invalidate_mapping_pages got it -+ */ -+ unlock_page(page); -+ put_page(page); -+ return NULL; -+ } -+ unlock_page(page); -+ shrink_readahead_size_eio(ra); -+ put_page(page); -+ return ERR_PTR(-EIO); -+ } -+ unlock_page(page); -+ } -+ -+ return page; -+} -+ -+static struct page * -+generic_file_buffered_read_pagenotuptodate(struct kiocb *iocb, -+ struct file *filp, -+ struct iov_iter *iter, -+ struct page *page, -+ loff_t pos, loff_t count) -+{ -+ struct address_space *mapping = filp->f_mapping; -+ struct inode *inode = mapping->host; -+ int error; -+ -+ /* -+ * See comment in do_read_cache_page on why -+ * wait_on_page_locked is used to avoid unnecessarily -+ * serialisations and why it's safe. 
-+ */ -+ if (iocb->ki_flags & IOCB_WAITQ) { -+ error = wait_on_page_locked_async(page, -+ iocb->ki_waitq); -+ } else { -+ error = wait_on_page_locked_killable(page); -+ } -+ if (unlikely(error)) { -+ put_page(page); -+ return ERR_PTR(error); -+ } -+ if (PageUptodate(page)) -+ return page; -+ -+ if (inode->i_blkbits == PAGE_SHIFT || -+ !mapping->a_ops->is_partially_uptodate) -+ goto page_not_up_to_date; -+ /* pipes can't handle partially uptodate pages */ -+ if (unlikely(iov_iter_is_pipe(iter))) -+ goto page_not_up_to_date; -+ if (!trylock_page(page)) -+ goto page_not_up_to_date; -+ /* Did it get truncated before we got the lock? */ -+ if (!page->mapping) -+ goto page_not_up_to_date_locked; -+ if (!mapping->a_ops->is_partially_uptodate(page, -+ pos & ~PAGE_MASK, count)) -+ goto page_not_up_to_date_locked; -+ unlock_page(page); -+ return page; -+ -+page_not_up_to_date: -+ /* Get exclusive access to the page ... */ -+ error = lock_page_for_iocb(iocb, page); -+ if (unlikely(error)) { -+ put_page(page); -+ return ERR_PTR(error); -+ } -+ -+page_not_up_to_date_locked: -+ /* Did it get truncated before we got the lock? */ -+ if (!page->mapping) { -+ unlock_page(page); -+ put_page(page); -+ return NULL; -+ } -+ -+ /* Did somebody else fill it already? */ -+ if (PageUptodate(page)) { -+ unlock_page(page); -+ return page; -+ } -+ -+ return generic_file_buffered_read_readpage(iocb, filp, mapping, page); -+} -+ -+static struct page * -+generic_file_buffered_read_no_cached_page(struct kiocb *iocb, -+ struct iov_iter *iter) -+{ -+ struct file *filp = iocb->ki_filp; -+ struct address_space *mapping = filp->f_mapping; -+ pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; -+ struct page *page; -+ int error; -+ -+ if (iocb->ki_flags & IOCB_NOIO) -+ return ERR_PTR(-EAGAIN); -+ -+ /* -+ * Ok, it wasn't cached, so we need to create a new -+ * page.. 
-+ */ -+ page = page_cache_alloc(mapping); -+ if (!page) -+ return ERR_PTR(-ENOMEM); -+ -+ error = add_to_page_cache_lru(page, mapping, index, -+ mapping_gfp_constraint(mapping, GFP_KERNEL)); -+ if (error) { -+ put_page(page); -+ return error != -EEXIST ? ERR_PTR(error) : NULL; -+ } -+ -+ return generic_file_buffered_read_readpage(iocb, filp, mapping, page); -+} -+ -+static int generic_file_buffered_read_get_pages(struct kiocb *iocb, -+ struct iov_iter *iter, -+ struct page **pages, -+ unsigned int nr) -+{ -+ struct file *filp = iocb->ki_filp; -+ struct address_space *mapping = filp->f_mapping; -+ struct file_ra_state *ra = &filp->f_ra; -+ pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; -+ pgoff_t last_index = (iocb->ki_pos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT; -+ int i, j, nr_got, err = 0; -+ -+ nr = min_t(unsigned long, last_index - index, nr); -+find_page: -+ if (fatal_signal_pending(current)) -+ return -EINTR; -+ -+ nr_got = find_get_pages_contig(mapping, index, nr, pages); -+ if (nr_got) -+ goto got_pages; -+ -+ if (iocb->ki_flags & IOCB_NOIO) -+ return -EAGAIN; -+ -+ page_cache_sync_readahead(mapping, ra, filp, index, last_index - index); -+ -+ nr_got = find_get_pages_contig(mapping, index, nr, pages); -+ if (nr_got) -+ goto got_pages; -+ -+ pages[0] = generic_file_buffered_read_no_cached_page(iocb, iter); -+ err = PTR_ERR_OR_ZERO(pages[0]); -+ if (!IS_ERR_OR_NULL(pages[0])) -+ nr_got = 1; -+got_pages: -+ for (i = 0; i < nr_got; i++) { -+ struct page *page = pages[i]; -+ pgoff_t pg_index = index + i; -+ loff_t pg_pos = max(iocb->ki_pos, -+ (loff_t) pg_index << PAGE_SHIFT); -+ loff_t pg_count = iocb->ki_pos + iter->count - pg_pos; -+ -+ if (PageReadahead(page)) { -+ if (iocb->ki_flags & IOCB_NOIO) { -+ for (j = i; j < nr_got; j++) -+ put_page(pages[j]); -+ nr_got = i; -+ err = -EAGAIN; -+ break; -+ } -+ page_cache_async_readahead(mapping, ra, filp, page, -+ pg_index, last_index - pg_index); -+ } -+ -+ if (!PageUptodate(page)) { -+ if ((iocb->ki_flags & 
IOCB_NOWAIT) || -+ ((iocb->ki_flags & IOCB_WAITQ) && i)) { -+ for (j = i; j < nr_got; j++) -+ put_page(pages[j]); -+ nr_got = i; -+ err = -EAGAIN; -+ break; -+ } -+ -+ page = generic_file_buffered_read_pagenotuptodate(iocb, -+ filp, iter, page, pg_pos, pg_count); -+ if (IS_ERR_OR_NULL(page)) { -+ for (j = i + 1; j < nr_got; j++) -+ put_page(pages[j]); -+ nr_got = i; -+ err = PTR_ERR_OR_ZERO(page); -+ break; -+ } -+ } -+ } -+ -+ if (likely(nr_got)) -+ return nr_got; -+ if (err) -+ return err; -+ /* -+ * No pages and no error means we raced and should retry: -+ */ -+ goto find_page; -+} -+ - /** - * generic_file_buffered_read - generic file read routine - * @iocb: the iocb to read -@@ -2158,276 +2509,116 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, - struct iov_iter *iter, ssize_t written) - { - struct file *filp = iocb->ki_filp; -+ struct file_ra_state *ra = &filp->f_ra; - struct address_space *mapping = filp->f_mapping; - struct inode *inode = mapping->host; -- struct file_ra_state *ra = &filp->f_ra; -- loff_t *ppos = &iocb->ki_pos; -- pgoff_t index; -- pgoff_t last_index; -- pgoff_t prev_index; -- unsigned long offset; /* offset into pagecache page */ -- unsigned int prev_offset; -- int error = 0; -- -- if (unlikely(*ppos >= inode->i_sb->s_maxbytes)) -+ struct page *pages_onstack[PAGEVEC_SIZE], **pages = NULL; -+ unsigned int nr_pages = min_t(unsigned int, 512, -+ ((iocb->ki_pos + iter->count + PAGE_SIZE - 1) >> PAGE_SHIFT) - -+ (iocb->ki_pos >> PAGE_SHIFT)); -+ int i, pg_nr, error = 0; -+ bool writably_mapped; -+ loff_t isize, end_offset; -+ -+ if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes)) - return 0; - iov_iter_truncate(iter, inode->i_sb->s_maxbytes); - -- index = *ppos >> PAGE_SHIFT; -- prev_index = ra->prev_pos >> PAGE_SHIFT; -- prev_offset = ra->prev_pos & (PAGE_SIZE-1); -- last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT; -- offset = *ppos & ~PAGE_MASK; -+ if (nr_pages > ARRAY_SIZE(pages_onstack)) -+ pages = 
kmalloc_array(nr_pages, sizeof(void *), GFP_KERNEL); - -- for (;;) { -- struct page *page; -- pgoff_t end_index; -- loff_t isize; -- unsigned long nr, ret; -+ if (!pages) { -+ pages = pages_onstack; -+ nr_pages = min_t(unsigned int, nr_pages, ARRAY_SIZE(pages_onstack)); -+ } - -+ do { - cond_resched(); --find_page: -- if (fatal_signal_pending(current)) { -- error = -EINTR; -- goto out; -- } - -- page = find_get_page(mapping, index); -- if (!page) { -- if (iocb->ki_flags & IOCB_NOIO) -- goto would_block; -- page_cache_sync_readahead(mapping, -- ra, filp, -- index, last_index - index); -- page = find_get_page(mapping, index); -- if (unlikely(page == NULL)) -- goto no_cached_page; -- } -- if (PageReadahead(page)) { -- if (iocb->ki_flags & IOCB_NOIO) { -- put_page(page); -- goto out; -- } -- page_cache_async_readahead(mapping, -- ra, filp, page, -- index, last_index - index); -- } -- if (!PageUptodate(page)) { -- /* -- * See comment in do_read_cache_page on why -- * wait_on_page_locked is used to avoid unnecessarily -- * serialisations and why it's safe. -- */ -- if (iocb->ki_flags & IOCB_WAITQ) { -- if (written) { -- put_page(page); -- goto out; -- } -- error = wait_on_page_locked_async(page, -- iocb->ki_waitq); -- } else { -- if (iocb->ki_flags & IOCB_NOWAIT) { -- put_page(page); -- goto would_block; -- } -- error = wait_on_page_locked_killable(page); -- } -- if (unlikely(error)) -- goto readpage_error; -- if (PageUptodate(page)) -- goto page_ok; -- -- if (inode->i_blkbits == PAGE_SHIFT || -- !mapping->a_ops->is_partially_uptodate) -- goto page_not_up_to_date; -- /* pipes can't handle partially uptodate pages */ -- if (unlikely(iov_iter_is_pipe(iter))) -- goto page_not_up_to_date; -- if (!trylock_page(page)) -- goto page_not_up_to_date; -- /* Did it get truncated before we got the lock? 
*/ -- if (!page->mapping) -- goto page_not_up_to_date_locked; -- if (!mapping->a_ops->is_partially_uptodate(page, -- offset, iter->count)) -- goto page_not_up_to_date_locked; -- unlock_page(page); -+ /* -+ * We can't return -EIOCBQUEUED once we've done some work, so -+ * ensure we don't block: -+ */ -+ if ((iocb->ki_flags & IOCB_WAITQ) && written) -+ iocb->ki_flags |= IOCB_NOWAIT; -+ -+ i = 0; -+ pg_nr = generic_file_buffered_read_get_pages(iocb, iter, -+ pages, nr_pages); -+ if (pg_nr < 0) { -+ error = pg_nr; -+ break; - } --page_ok: -+ - /* -- * i_size must be checked after we know the page is Uptodate. -+ * i_size must be checked after we know the pages are Uptodate. - * - * Checking i_size after the check allows us to calculate - * the correct value for "nr", which means the zero-filled - * part of the page is not copied back to userspace (unless - * another truncate extends the file - this is desired though). - */ -- - isize = i_size_read(inode); -- end_index = (isize - 1) >> PAGE_SHIFT; -- if (unlikely(!isize || index > end_index)) { -- put_page(page); -- goto out; -- } -+ if (unlikely(iocb->ki_pos >= isize)) -+ goto put_pages; - -- /* nr is the maximum number of bytes to copy from this page */ -- nr = PAGE_SIZE; -- if (index == end_index) { -- nr = ((isize - 1) & ~PAGE_MASK) + 1; -- if (nr <= offset) { -- put_page(page); -- goto out; -- } -- } -- nr = nr - offset; -+ end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count); - -- /* If users can be writing to this page using arbitrary -- * virtual addresses, take care about potential aliasing -- * before reading the page on the kernel side. -- */ -- if (mapping_writably_mapped(mapping)) -- flush_dcache_page(page); -+ while ((iocb->ki_pos >> PAGE_SHIFT) + pg_nr > -+ (end_offset + PAGE_SIZE - 1) >> PAGE_SHIFT) -+ put_page(pages[--pg_nr]); - - /* -- * When a sequential read accesses a page several times, -- * only mark it as accessed the first time. 
-+ * Once we start copying data, we don't want to be touching any -+ * cachelines that might be contended: - */ -- if (prev_index != index || offset != prev_offset) -- mark_page_accessed(page); -- prev_index = index; -+ writably_mapped = mapping_writably_mapped(mapping); - - /* -- * Ok, we have the page, and it's up-to-date, so -- * now we can copy it to user space... -+ * When a sequential read accesses a page several times, only -+ * mark it as accessed the first time. - */ -+ if (iocb->ki_pos >> PAGE_SHIFT != -+ ra->prev_pos >> PAGE_SHIFT) -+ mark_page_accessed(pages[0]); -+ for (i = 1; i < pg_nr; i++) -+ mark_page_accessed(pages[i]); -+ -+ for (i = 0; i < pg_nr; i++) { -+ unsigned int offset = iocb->ki_pos & ~PAGE_MASK; -+ unsigned int bytes = min_t(loff_t, end_offset - iocb->ki_pos, -+ PAGE_SIZE - offset); -+ unsigned int copied; - -- ret = copy_page_to_iter(page, offset, nr, iter); -- offset += ret; -- index += offset >> PAGE_SHIFT; -- offset &= ~PAGE_MASK; -- prev_offset = offset; -- -- put_page(page); -- written += ret; -- if (!iov_iter_count(iter)) -- goto out; -- if (ret < nr) { -- error = -EFAULT; -- goto out; -- } -- continue; -- --page_not_up_to_date: -- /* Get exclusive access to the page ... */ -- if (iocb->ki_flags & IOCB_WAITQ) -- error = lock_page_async(page, iocb->ki_waitq); -- else -- error = lock_page_killable(page); -- if (unlikely(error)) -- goto readpage_error; -- --page_not_up_to_date_locked: -- /* Did it get truncated before we got the lock? */ -- if (!page->mapping) { -- unlock_page(page); -- put_page(page); -- continue; -- } -- -- /* Did somebody else fill it already? */ -- if (PageUptodate(page)) { -- unlock_page(page); -- goto page_ok; -- } -+ /* -+ * If users can be writing to this page using arbitrary -+ * virtual addresses, take care about potential aliasing -+ * before reading the page on the kernel side. 
-+ */ -+ if (writably_mapped) -+ flush_dcache_page(pages[i]); - --readpage: -- if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) { -- unlock_page(page); -- put_page(page); -- goto would_block; -- } -- /* -- * A previous I/O error may have been due to temporary -- * failures, eg. multipath errors. -- * PG_error will be set again if readpage fails. -- */ -- ClearPageError(page); -- /* Start the actual read. The read will unlock the page. */ -- error = mapping->a_ops->readpage(filp, page); -+ copied = copy_page_to_iter(pages[i], offset, bytes, iter); - -- if (unlikely(error)) { -- if (error == AOP_TRUNCATED_PAGE) { -- put_page(page); -- error = 0; -- goto find_page; -- } -- goto readpage_error; -- } -+ written += copied; -+ iocb->ki_pos += copied; -+ ra->prev_pos = iocb->ki_pos; - -- if (!PageUptodate(page)) { -- if (iocb->ki_flags & IOCB_WAITQ) -- error = lock_page_async(page, iocb->ki_waitq); -- else -- error = lock_page_killable(page); -- -- if (unlikely(error)) -- goto readpage_error; -- if (!PageUptodate(page)) { -- if (page->mapping == NULL) { -- /* -- * invalidate_mapping_pages got it -- */ -- unlock_page(page); -- put_page(page); -- goto find_page; -- } -- unlock_page(page); -- shrink_readahead_size_eio(ra); -- error = -EIO; -- goto readpage_error; -+ if (copied < bytes) { -+ error = -EFAULT; -+ break; - } -- unlock_page(page); - } -+put_pages: -+ for (i = 0; i < pg_nr; i++) -+ put_page(pages[i]); -+ } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); - -- goto page_ok; -- --readpage_error: -- /* UHHUH! A synchronous read error occurred. Report it */ -- put_page(page); -- goto out; -- --no_cached_page: -- /* -- * Ok, it wasn't cached, so we need to create a new -- * page.. 
-- */ -- page = page_cache_alloc(mapping); -- if (!page) { -- error = -ENOMEM; -- goto out; -- } -- error = add_to_page_cache_lru(page, mapping, index, -- mapping_gfp_constraint(mapping, GFP_KERNEL)); -- if (error) { -- put_page(page); -- if (error == -EEXIST) { -- error = 0; -- goto find_page; -- } -- goto out; -- } -- goto readpage; -- } -+ file_accessed(filp); - --would_block: -- error = -EAGAIN; --out: -- ra->prev_pos = prev_index; -- ra->prev_pos <<= PAGE_SHIFT; -- ra->prev_pos |= prev_offset; -+ if (pages != pages_onstack) -+ kfree(pages); - -- *ppos = ((loff_t)index << PAGE_SHIFT) + offset; -- file_accessed(filp); - return written ? written : error; - } - EXPORT_SYMBOL_GPL(generic_file_buffered_read); -diff --git a/mm/gup.c b/mm/gup.c -index e869c634cc9a..9bfb3e933deb 100644 ---- a/mm/gup.c -+++ b/mm/gup.c -@@ -1085,6 +1085,13 @@ static long __get_user_pages(struct mm_struct *mm, - } - cond_resched(); - -+ if (current->faults_disabled_mapping && -+ vma->vm_file && -+ vma->vm_file->f_mapping == current->faults_disabled_mapping) { -+ ret = -EFAULT; -+ goto out; -+ } -+ - page = follow_page_mask(vma, start, foll_flags, &ctx); - if (!page) { - ret = faultin_page(vma, start, &foll_flags, locked); -diff --git a/mm/nommu.c b/mm/nommu.c -index 75a327149af1..fe0a77d01656 100644 ---- a/mm/nommu.c -+++ b/mm/nommu.c -@@ -290,6 +290,24 @@ void *vzalloc_node(unsigned long size, int node) - } - EXPORT_SYMBOL(vzalloc_node); - -+/** -+ * vmalloc_exec - allocate virtually contiguous, executable memory -+ * @size: allocation size -+ * -+ * Kernel-internal function to allocate enough pages to cover @size -+ * the page level allocator and map them into contiguous and -+ * executable kernel virtual space. -+ * -+ * For tight control over page level allocator and protection flags -+ * use __vmalloc() instead. 
-+ */ -+ -+void *vmalloc_exec(unsigned long size, gfp_t gfp_mask) -+{ -+ return __vmalloc(size, gfp_mask); -+} -+EXPORT_SYMBOL_GPL(vmalloc_exec); -+ - /** - * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) - * @size: allocation size -diff --git a/mm/page-writeback.c b/mm/page-writeback.c -index 4e4ddd67b71e..563cc766f511 100644 ---- a/mm/page-writeback.c -+++ b/mm/page-writeback.c -@@ -2475,20 +2475,19 @@ int __set_page_dirty_nobuffers(struct page *page) - lock_page_memcg(page); - if (!TestSetPageDirty(page)) { - struct address_space *mapping = page_mapping(page); -- unsigned long flags; - - if (!mapping) { - unlock_page_memcg(page); - return 1; - } - -- xa_lock_irqsave(&mapping->i_pages, flags); -+ xa_lock_irq(&mapping->i_pages); - BUG_ON(page_mapping(page) != mapping); - WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); - account_page_dirtied(page, mapping); - __xa_set_mark(&mapping->i_pages, page_index(page), - PAGECACHE_TAG_DIRTY); -- xa_unlock_irqrestore(&mapping->i_pages, flags); -+ xa_unlock_irq(&mapping->i_pages); - unlock_page_memcg(page); - - if (mapping->host) { -diff --git a/mm/vmalloc.c b/mm/vmalloc.c -index be4724b916b3..efd7f9dd1eb8 100644 ---- a/mm/vmalloc.c -+++ b/mm/vmalloc.c -@@ -2665,6 +2665,27 @@ void *vzalloc_node(unsigned long size, int node) - } - EXPORT_SYMBOL(vzalloc_node); - -+/** -+ * vmalloc_exec - allocate virtually contiguous, executable memory -+ * @size: allocation size -+ * -+ * Kernel-internal function to allocate enough pages to cover @size -+ * the page level allocator and map them into contiguous and -+ * executable kernel virtual space. -+ * -+ * For tight control over page level allocator and protection flags -+ * use __vmalloc() instead. 
-+ * -+ * Return: pointer to the allocated memory or %NULL on error -+ */ -+void *vmalloc_exec(unsigned long size, gfp_t gfp_mask) -+{ -+ return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, -+ gfp_mask, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, -+ NUMA_NO_NODE, __builtin_return_address(0)); -+} -+EXPORT_SYMBOL_GPL(vmalloc_exec); -+ - #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) - #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) - #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) diff --git a/linux59-tkg/linux59-tkg-patches/0009-glitched-bmq.patch b/linux59-tkg/linux59-tkg-patches/0009-glitched-bmq.patch deleted file mode 100644 index e42e522..0000000 --- a/linux59-tkg/linux59-tkg-patches/0009-glitched-bmq.patch +++ /dev/null @@ -1,90 +0,0 @@ -From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 -From: Tk-Glitch -Date: Wed, 4 Jul 2018 04:30:08 +0200 -Subject: glitched - BMQ - -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 2a202a846757..1d9c7ed79b11 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -4,7 +4,7 @@ - - choice - prompt "Timer frequency" -- default HZ_250 -+ default HZ_500 - help - Allows the configuration of the timer frequency. It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -39,6 +39,13 @@ choice - on SMP and NUMA systems and exactly dividing by both PAL and - NTSC frame rates for video and multimedia work. - -+ config HZ_500 -+ bool "500 HZ" -+ help -+ 500 Hz is a balanced timer frequency. Provides fast interactivity -+ on desktops with great smoothness without increasing CPU power -+ consumption and sacrificing the battery life on laptops. 
-+ - config HZ_1000 - bool "1000 HZ" - help -@@ -52,6 +59,7 @@ config HZ - default 100 if HZ_100 - default 250 if HZ_250 - default 300 if HZ_300 -+ default 500 if HZ_500 - default 1000 if HZ_1000 - - config SCHED_HRTICK - -diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz -index 2a202a846757..1d9c7ed79b11 100644 ---- a/kernel/Kconfig.hz -+++ b/kernel/Kconfig.hz -@@ -4,7 +4,7 @@ - - choice - prompt "Timer frequency" -- default HZ_500 -+ default HZ_750 - help - Allows the configuration of the timer frequency. It is customary - to have the timer interrupt run at 1000 Hz but 100 Hz may be more -@@ -46,6 +46,13 @@ choice - on desktops with great smoothness without increasing CPU power - consumption and sacrificing the battery life on laptops. - -+ config HZ_750 -+ bool "750 HZ" -+ help -+ 750 Hz is a good timer frequency for desktops. Provides fast -+ interactivity with great smoothness without sacrificing too -+ much throughput. -+ - config HZ_1000 - bool "1000 HZ" - help -@@ -60,6 +67,7 @@ config HZ - default 250 if HZ_250 - default 300 if HZ_300 - default 500 if HZ_500 -+ default 750 if HZ_750 - default 1000 if HZ_1000 - - config SCHED_HRTICK - -diff --git a/mm/vmscan.c b/mm/vmscan.c -index 9270a4370d54..30d01e647417 100644 ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -169,7 +169,7 @@ - /* - * From 0 .. 200. Higher means more swappy. 
- */ --int vm_swappiness = 60; -+int vm_swappiness = 20; - - static void set_task_reclaim_state(struct task_struct *task, - struct reclaim_state *rs) diff --git a/linux59-tkg/linux59-tkg-patches/0009-glitched-ondemand-bmq.patch b/linux59-tkg/linux59-tkg-patches/0009-glitched-ondemand-bmq.patch deleted file mode 100644 index a926040..0000000 --- a/linux59-tkg/linux59-tkg-patches/0009-glitched-ondemand-bmq.patch +++ /dev/null @@ -1,18 +0,0 @@ -diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c -index 6b423eebfd5d..61e3271675d6 100644 ---- a/drivers/cpufreq/cpufreq_ondemand.c -+++ b/drivers/cpufreq/cpufreq_ondemand.c -@@ -21,10 +21,10 @@ - #include "cpufreq_ondemand.h" - - /* On-demand governor macros */ --#define DEF_FREQUENCY_UP_THRESHOLD (80) --#define DEF_SAMPLING_DOWN_FACTOR (1) -+#define DEF_FREQUENCY_UP_THRESHOLD (55) -+#define DEF_SAMPLING_DOWN_FACTOR (5) - #define MAX_SAMPLING_DOWN_FACTOR (100000) --#define MICRO_FREQUENCY_UP_THRESHOLD (95) -+#define MICRO_FREQUENCY_UP_THRESHOLD (63) - #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) - #define MIN_FREQUENCY_UP_THRESHOLD (1) - #define MAX_FREQUENCY_UP_THRESHOLD (100) diff --git a/linux59-tkg/linux59-tkg-patches/0009-prjc_v5.9-r0.patch b/linux59-tkg/linux59-tkg-patches/0009-prjc_v5.9-r0.patch deleted file mode 100644 index 550d29c..0000000 --- a/linux59-tkg/linux59-tkg-patches/0009-prjc_v5.9-r0.patch +++ /dev/null @@ -1,8809 +0,0 @@ -diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index a1068742a6df..b97a9697fde4 100644 ---- a/Documentation/admin-guide/kernel-parameters.txt -+++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -4611,6 +4611,12 @@ - - sbni= [NET] Granch SBNI12 leased line adapter - -+ sched_timeslice= -+ [KNL] Time slice in us for BMQ/PDS scheduler. 
-+ Format: (must be >= 1000) -+ Default: 4000 -+ See Documentation/scheduler/sched-BMQ.txt -+ - sched_debug [KNL] Enables verbose scheduler debug messages. - - schedstats= [KNL,X86] Enable or disable scheduled statistics. -diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst -index d4b32cc32bb7..14118e5168ef 100644 ---- a/Documentation/admin-guide/sysctl/kernel.rst -+++ b/Documentation/admin-guide/sysctl/kernel.rst -@@ -1515,3 +1515,13 @@ is 10 seconds. - - The softlockup threshold is (``2 * watchdog_thresh``). Setting this - tunable to zero will disable lockup detection altogether. -+ -+yield_type: -+=========== -+ -+BMQ/PDS CPU scheduler only. This determines what type of yield calls -+to sched_yield will perform. -+ -+ 0 - No yield. -+ 1 - Deboost and requeue task. (default) -+ 2 - Set run queue skip task. -diff --git a/Documentation/scheduler/sched-BMQ.txt b/Documentation/scheduler/sched-BMQ.txt -new file mode 100644 -index 000000000000..05c84eec0f31 ---- /dev/null -+++ b/Documentation/scheduler/sched-BMQ.txt -@@ -0,0 +1,110 @@ -+ BitMap queue CPU Scheduler -+ -------------------------- -+ -+CONTENT -+======== -+ -+ Background -+ Design -+ Overview -+ Task policy -+ Priority management -+ BitMap Queue -+ CPU Assignment and Migration -+ -+ -+Background -+========== -+ -+BitMap Queue CPU scheduler, referred to as BMQ from here on, is an evolution -+of previous Priority and Deadline based Skiplist multiple queue scheduler(PDS), -+and inspired by Zircon scheduler. The goal of it is to keep the scheduler code -+simple, while efficiency and scalable for interactive tasks, such as desktop, -+movie playback and gaming etc. -+ -+Design -+====== -+ -+Overview -+-------- -+ -+BMQ use per CPU run queue design, each CPU(logical) has it's own run queue, -+each CPU is responsible for scheduling the tasks that are putting into it's -+run queue. -+ -+The run queue is a set of priority queues. 
Note that these queues are fifo -+queue for non-rt tasks or priority queue for rt tasks in data structure. See -+BitMap Queue below for details. BMQ is optimized for non-rt tasks in the fact -+that most applications are non-rt tasks. No matter the queue is fifo or -+priority, In each queue is an ordered list of runnable tasks awaiting execution -+and the data structures are the same. When it is time for a new task to run, -+the scheduler simply looks the lowest numbered queueue that contains a task, -+and runs the first task from the head of that queue. And per CPU idle task is -+also in the run queue, so the scheduler can always find a task to run on from -+its run queue. -+ -+Each task will assigned the same timeslice(default 4ms) when it is picked to -+start running. Task will be reinserted at the end of the appropriate priority -+queue when it uses its whole timeslice. When the scheduler selects a new task -+from the priority queue it sets the CPU's preemption timer for the remainder of -+the previous timeslice. When that timer fires the scheduler will stop execution -+on that task, select another task and start over again. -+ -+If a task blocks waiting for a shared resource then it's taken out of its -+priority queue and is placed in a wait queue for the shared resource. When it -+is unblocked it will be reinserted in the appropriate priority queue of an -+eligible CPU. -+ -+Task policy -+----------- -+ -+BMQ supports DEADLINE, FIFO, RR, NORMAL, BATCH and IDLE task policy like the -+mainline CFS scheduler. But BMQ is heavy optimized for non-rt task, that's -+NORMAL/BATCH/IDLE policy tasks. Below is the implementation detail of each -+policy. -+ -+DEADLINE -+ It is squashed as priority 0 FIFO task. -+ -+FIFO/RR -+ All RT tasks share one single priority queue in BMQ run queue designed. The -+complexity of insert operation is O(n). BMQ is not designed for system runs -+with major rt policy tasks. 
-+ -+NORMAL/BATCH/IDLE -+ BATCH and IDLE tasks are treated as the same policy. They compete CPU with -+NORMAL policy tasks, but they just don't boost. To control the priority of -+NORMAL/BATCH/IDLE tasks, simply use nice level. -+ -+ISO -+ ISO policy is not supported in BMQ. Please use nice level -20 NORMAL policy -+task instead. -+ -+Priority management -+------------------- -+ -+RT tasks have priority from 0-99. For non-rt tasks, there are three different -+factors used to determine the effective priority of a task. The effective -+priority being what is used to determine which queue it will be in. -+ -+The first factor is simply the task’s static priority. Which is assigned from -+task's nice level, within [-20, 19] in userland's point of view and [0, 39] -+internally. -+ -+The second factor is the priority boost. This is a value bounded between -+[-MAX_PRIORITY_ADJ, MAX_PRIORITY_ADJ] used to offset the base priority, it is -+modified by the following cases: -+ -+*When a thread has used up its entire timeslice, always deboost its boost by -+increasing by one. -+*When a thread gives up cpu control(voluntary or non-voluntary) to reschedule, -+and its switch-in time(time after last switch and run) below the thredhold -+based on its priority boost, will boost its boost by decreasing by one buti is -+capped at 0 (won’t go negative). -+ -+The intent in this system is to ensure that interactive threads are serviced -+quickly. These are usually the threads that interact directly with the user -+and cause user-perceivable latency. These threads usually do little work and -+spend most of their time blocked awaiting another user event. So they get the -+priority boost from unblocking while background threads that do most of the -+processing receive the priority penalty for using their entire timeslice. 
-diff --git a/fs/proc/base.c b/fs/proc/base.c -index 617db4e0faa0..f85926764f9a 100644 ---- a/fs/proc/base.c -+++ b/fs/proc/base.c -@@ -479,7 +479,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, - seq_puts(m, "0 0 0\n"); - else - seq_printf(m, "%llu %llu %lu\n", -- (unsigned long long)task->se.sum_exec_runtime, -+ (unsigned long long)tsk_seruntime(task), - (unsigned long long)task->sched_info.run_delay, - task->sched_info.pcount); - -diff --git a/include/asm-generic/resource.h b/include/asm-generic/resource.h -index 8874f681b056..59eb72bf7d5f 100644 ---- a/include/asm-generic/resource.h -+++ b/include/asm-generic/resource.h -@@ -23,7 +23,7 @@ - [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY }, \ - [RLIMIT_SIGPENDING] = { 0, 0 }, \ - [RLIMIT_MSGQUEUE] = { MQ_BYTES_MAX, MQ_BYTES_MAX }, \ -- [RLIMIT_NICE] = { 0, 0 }, \ -+ [RLIMIT_NICE] = { 30, 30 }, \ - [RLIMIT_RTPRIO] = { 0, 0 }, \ - [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY }, \ - } -diff --git a/include/linux/sched.h b/include/linux/sched.h -index afe01e232935..8918609cb9f0 100644 ---- a/include/linux/sched.h -+++ b/include/linux/sched.h -@@ -34,6 +34,7 @@ - #include - #include - #include -+#include - - /* task_struct member predeclarations (sorted alphabetically): */ - struct audit_context; -@@ -652,12 +653,18 @@ struct task_struct { - unsigned int ptrace; - - #ifdef CONFIG_SMP -- int on_cpu; - struct __call_single_node wake_entry; -+#endif -+#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_ALT) -+ int on_cpu; -+#endif -+ -+#ifdef CONFIG_SMP - #ifdef CONFIG_THREAD_INFO_IN_TASK - /* Current CPU: */ - unsigned int cpu; - #endif -+#ifndef CONFIG_SCHED_ALT - unsigned int wakee_flips; - unsigned long wakee_flip_decay_ts; - struct task_struct *last_wakee; -@@ -671,6 +678,7 @@ struct task_struct { - */ - int recent_used_cpu; - int wake_cpu; -+#endif /* !CONFIG_SCHED_ALT */ - #endif - int on_rq; - -@@ -679,13 +687,33 @@ struct task_struct { - int normal_prio; - unsigned int 
rt_priority; - -+#ifdef CONFIG_SCHED_ALT -+ u64 last_ran; -+ s64 time_slice; -+#ifdef CONFIG_SCHED_BMQ -+ int boost_prio; -+ int bmq_idx; -+ struct list_head bmq_node; -+#endif /* CONFIG_SCHED_BMQ */ -+#ifdef CONFIG_SCHED_PDS -+ u64 deadline; -+ u64 priodl; -+ /* skip list level */ -+ int sl_level; -+ /* skip list node */ -+ struct skiplist_node sl_node; -+#endif /* CONFIG_SCHED_PDS */ -+ /* sched_clock time spent running */ -+ u64 sched_time; -+#else /* !CONFIG_SCHED_ALT */ - const struct sched_class *sched_class; - struct sched_entity se; - struct sched_rt_entity rt; -+ struct sched_dl_entity dl; -+#endif - #ifdef CONFIG_CGROUP_SCHED - struct task_group *sched_task_group; - #endif -- struct sched_dl_entity dl; - - #ifdef CONFIG_UCLAMP_TASK - /* -@@ -1332,6 +1360,15 @@ struct task_struct { - */ - }; - -+#ifdef CONFIG_SCHED_ALT -+#define tsk_seruntime(t) ((t)->sched_time) -+/* replace the uncertian rt_timeout with 0UL */ -+#define tsk_rttimeout(t) (0UL) -+#else /* CFS */ -+#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) -+#define tsk_rttimeout(t) ((t)->rt.timeout) -+#endif /* !CONFIG_SCHED_ALT */ -+ - static inline struct pid *task_pid(struct task_struct *task) - { - return task->thread_pid; -diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h -index 1aff00b65f3c..179d77c8360e 100644 ---- a/include/linux/sched/deadline.h -+++ b/include/linux/sched/deadline.h -@@ -1,5 +1,24 @@ - /* SPDX-License-Identifier: GPL-2.0 */ - -+#ifdef CONFIG_SCHED_ALT -+ -+static inline int dl_task(struct task_struct *p) -+{ -+ return 0; -+} -+ -+#ifdef CONFIG_SCHED_BMQ -+#define __tsk_deadline(p) (0UL) -+#endif -+ -+#ifdef CONFIG_SCHED_PDS -+#define __tsk_deadline(p) ((p)->priodl) -+#endif -+ -+#else -+ -+#define __tsk_deadline(p) ((p)->dl.deadline) -+ - /* - * SCHED_DEADLINE tasks has negative priorities, reflecting - * the fact that any of them has higher prio than RT and -@@ -19,6 +38,7 @@ static inline int dl_task(struct task_struct *p) - { - return 
dl_prio(p->prio); - } -+#endif /* CONFIG_SCHED_ALT */ - - static inline bool dl_time_before(u64 a, u64 b) - { -diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h -index 7d64feafc408..42730d27ceb5 100644 ---- a/include/linux/sched/prio.h -+++ b/include/linux/sched/prio.h -@@ -20,11 +20,20 @@ - */ - - #define MAX_USER_RT_PRIO 100 -+ - #define MAX_RT_PRIO MAX_USER_RT_PRIO - - #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) - #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) - -+/* +/- priority levels from the base priority */ -+#ifdef CONFIG_SCHED_BMQ -+#define MAX_PRIORITY_ADJ 7 -+#endif -+#ifdef CONFIG_SCHED_PDS -+#define MAX_PRIORITY_ADJ 0 -+#endif -+ - /* - * Convert user-nice values [ -20 ... 0 ... 19 ] - * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], -diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h -index e5af028c08b4..0a7565d0d3cf 100644 ---- a/include/linux/sched/rt.h -+++ b/include/linux/sched/rt.h -@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) - - if (policy == SCHED_FIFO || policy == SCHED_RR) - return true; -+#ifndef CONFIG_SCHED_ALT - if (policy == SCHED_DEADLINE) - return true; -+#endif - return false; - } - -diff --git a/include/linux/skip_list.h b/include/linux/skip_list.h -new file mode 100644 -index 000000000000..47ca955a451d ---- /dev/null -+++ b/include/linux/skip_list.h -@@ -0,0 +1,177 @@ -+/* -+ * Copyright (C) 2016 Alfred Chen. -+ * -+ * Code based on Con Kolivas's skip list implementation for BFS, and -+ * which is based on example originally by William Pugh. -+ * -+ * Skip Lists are a probabilistic alternative to balanced trees, as -+ * described in the June 1990 issue of CACM and were invented by -+ * William Pugh in 1987. -+ * -+ * A couple of comments about this implementation: -+ * -+ * This file only provides a infrastructure of skip list. 
-+ * -+ * skiplist_node is embedded into container data structure, to get rid -+ * the dependency of kmalloc/kfree operation in scheduler code. -+ * -+ * A customized search function should be defined using DEFINE_SKIPLIST_INSERT -+ * macro and be used for skip list insert operation. -+ * -+ * Random Level is also not defined in this file, instead, it should be -+ * customized implemented and set to node->level then pass to the customized -+ * skiplist_insert function. -+ * -+ * Levels start at zero and go up to (NUM_SKIPLIST_LEVEL -1) -+ * -+ * NUM_SKIPLIST_LEVEL in this implementation is 8 instead of origin 16, -+ * considering that there will be 256 entries to enable the top level when using -+ * random level p=0.5, and that number is more than enough for a run queue usage -+ * in a scheduler usage. And it also help to reduce the memory usage of the -+ * embedded skip list node in task_struct to about 50%. -+ * -+ * The insertion routine has been implemented so as to use the -+ * dirty hack described in the CACM paper: if a random level is -+ * generated that is more than the current maximum level, the -+ * current maximum level plus one is used instead. -+ * -+ * BFS Notes: In this implementation of skiplists, there are bidirectional -+ * next/prev pointers and the insert function returns a pointer to the actual -+ * node the value is stored. The key here is chosen by the scheduler so as to -+ * sort tasks according to the priority list requirements and is no longer used -+ * by the scheduler after insertion. The scheduler lookup, however, occurs in -+ * O(1) time because it is always the first item in the level 0 linked list. -+ * Since the task struct stores a copy of the node pointer upon skiplist_insert, -+ * it can also remove it much faster than the original implementation with the -+ * aid of prev<->next pointer manipulation and no searching. 
-+ */ -+#ifndef _LINUX_SKIP_LIST_H -+#define _LINUX_SKIP_LIST_H -+ -+#include -+ -+#define NUM_SKIPLIST_LEVEL (8) -+ -+struct skiplist_node { -+ int level; /* Levels in this node */ -+ struct skiplist_node *next[NUM_SKIPLIST_LEVEL]; -+ struct skiplist_node *prev[NUM_SKIPLIST_LEVEL]; -+}; -+ -+#define SKIPLIST_NODE_INIT(name) { 0,\ -+ {&name, &name, &name, &name,\ -+ &name, &name, &name, &name},\ -+ {&name, &name, &name, &name,\ -+ &name, &name, &name, &name},\ -+ } -+ -+static inline void INIT_SKIPLIST_NODE(struct skiplist_node *node) -+{ -+ /* only level 0 ->next matters in skiplist_empty() */ -+ WRITE_ONCE(node->next[0], node); -+} -+ -+/** -+ * FULL_INIT_SKIPLIST_NODE -- fully init a skiplist_node, expecially for header -+ * @node: the skip list node to be inited. -+ */ -+static inline void FULL_INIT_SKIPLIST_NODE(struct skiplist_node *node) -+{ -+ int i; -+ -+ node->level = 0; -+ for (i = 0; i < NUM_SKIPLIST_LEVEL; i++) { -+ WRITE_ONCE(node->next[i], node); -+ node->prev[i] = node; -+ } -+} -+ -+/** -+ * skiplist_empty - test whether a skip list is empty -+ * @head: the skip list to test. -+ */ -+static inline int skiplist_empty(const struct skiplist_node *head) -+{ -+ return READ_ONCE(head->next[0]) == head; -+} -+ -+/** -+ * skiplist_entry - get the struct for this entry -+ * @ptr: the &struct skiplist_node pointer. -+ * @type: the type of the struct this is embedded in. -+ * @member: the name of the skiplist_node within the struct. 
-+ */ -+#define skiplist_entry(ptr, type, member) \ -+ container_of(ptr, type, member) -+ -+/** -+ * DEFINE_SKIPLIST_INSERT_FUNC -- macro to define a customized skip list insert -+ * function, which takes two parameters, first one is the header node of the -+ * skip list, second one is the skip list node to be inserted -+ * @func_name: the customized skip list insert function name -+ * @search_func: the search function to be used, which takes two parameters, -+ * 1st one is the itrator of skiplist_node in the list, the 2nd is the skip list -+ * node to be inserted, the function should return true if search should be -+ * continued, otherwise return false. -+ * Returns 1 if @node is inserted as the first item of skip list at level zero, -+ * otherwise 0 -+ */ -+#define DEFINE_SKIPLIST_INSERT_FUNC(func_name, search_func)\ -+static inline int func_name(struct skiplist_node *head, struct skiplist_node *node)\ -+{\ -+ struct skiplist_node *update[NUM_SKIPLIST_LEVEL];\ -+ struct skiplist_node *p, *q;\ -+ int k = head->level;\ -+\ -+ p = head;\ -+ do {\ -+ while (q = p->next[k], q != head && search_func(q, node))\ -+ p = q;\ -+ update[k] = p;\ -+ } while (--k >= 0);\ -+\ -+ k = node->level;\ -+ if (unlikely(k > head->level)) {\ -+ node->level = k = ++head->level;\ -+ update[k] = head;\ -+ }\ -+\ -+ do {\ -+ p = update[k];\ -+ q = p->next[k];\ -+ node->next[k] = q;\ -+ p->next[k] = node;\ -+ node->prev[k] = p;\ -+ q->prev[k] = node;\ -+ } while (--k >= 0);\ -+\ -+ return (p == head);\ -+} -+ -+/** -+ * skiplist_del_init -- delete skip list node from a skip list and reset it's -+ * init state -+ * @head: the header node of the skip list to be deleted from. -+ * @node: the skip list node to be deleted, the caller need to ensure @node is -+ * in skip list which @head represent. 
-+ * Returns 1 if @node is the first item of skip level at level zero, otherwise 0 -+ */ -+static inline int -+skiplist_del_init(struct skiplist_node *head, struct skiplist_node *node) -+{ -+ int l, m = node->level; -+ -+ for (l = 0; l <= m; l++) { -+ node->prev[l]->next[l] = node->next[l]; -+ node->next[l]->prev[l] = node->prev[l]; -+ } -+ if (m == head->level && m > 0) { -+ while (head->next[m] == head && m > 0) -+ m--; -+ head->level = m; -+ } -+ INIT_SKIPLIST_NODE(node); -+ -+ return (node->prev[0] == head); -+} -+#endif /* _LINUX_SKIP_LIST_H */ -diff --git a/init/Kconfig b/init/Kconfig -index d6a0b31b13dc..2122dba5596f 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -770,9 +770,39 @@ config GENERIC_SCHED_CLOCK - - menu "Scheduler features" - -+menuconfig SCHED_ALT -+ bool "Alternative CPU Schedulers" -+ default y -+ help -+ This feature enable alternative CPU scheduler" -+ -+if SCHED_ALT -+ -+choice -+ prompt "Alternative CPU Scheduler" -+ default SCHED_BMQ -+ -+config SCHED_BMQ -+ bool "BMQ CPU scheduler" -+ help -+ The BitMap Queue CPU scheduler for excellent interactivity and -+ responsiveness on the desktop and solid scalability on normal -+ hardware and commodity servers. -+ -+config SCHED_PDS -+ bool "PDS CPU scheduler" -+ help -+ The Priority and Deadline based Skip list multiple queue CPU -+ Scheduler. -+ -+endchoice -+ -+endif -+ - config UCLAMP_TASK - bool "Enable utilization clamping for RT/FAIR tasks" - depends on CPU_FREQ_GOV_SCHEDUTIL -+ depends on !SCHED_ALT - help - This feature enables the scheduler to track the clamped utilization - of each CPU based on RUNNABLE tasks scheduled on that CPU. -@@ -858,6 +888,7 @@ config NUMA_BALANCING - depends on ARCH_SUPPORTS_NUMA_BALANCING - depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY - depends on SMP && NUMA && MIGRATION -+ depends on !SCHED_ALT - help - This option adds support for automatic NUMA aware memory/task placement. 
- The mechanism is quite primitive and is based on migrating memory when -@@ -944,7 +975,7 @@ menuconfig CGROUP_SCHED - bandwidth allocation to such task groups. It uses cgroups to group - tasks. - --if CGROUP_SCHED -+if CGROUP_SCHED && !SCHED_ALT - config FAIR_GROUP_SCHED - bool "Group scheduling for SCHED_OTHER" - depends on CGROUP_SCHED -@@ -1200,6 +1231,7 @@ config CHECKPOINT_RESTORE - - config SCHED_AUTOGROUP - bool "Automatic process group scheduling" -+ depends on !SCHED_ALT - select CGROUPS - select CGROUP_SCHED - select FAIR_GROUP_SCHED -diff --git a/init/init_task.c b/init/init_task.c -index f6889fce64af..5a23122f3d2c 100644 ---- a/init/init_task.c -+++ b/init/init_task.c -@@ -75,9 +75,15 @@ struct task_struct init_task - .stack = init_stack, - .usage = REFCOUNT_INIT(2), - .flags = PF_KTHREAD, -+#ifdef CONFIG_SCHED_ALT -+ .prio = DEFAULT_PRIO + MAX_PRIORITY_ADJ, -+ .static_prio = DEFAULT_PRIO, -+ .normal_prio = DEFAULT_PRIO + MAX_PRIORITY_ADJ, -+#else - .prio = MAX_PRIO - 20, - .static_prio = MAX_PRIO - 20, - .normal_prio = MAX_PRIO - 20, -+#endif - .policy = SCHED_NORMAL, - .cpus_ptr = &init_task.cpus_mask, - .cpus_mask = CPU_MASK_ALL, -@@ -87,6 +93,19 @@ struct task_struct init_task - .restart_block = { - .fn = do_no_restart_syscall, - }, -+#ifdef CONFIG_SCHED_ALT -+#ifdef CONFIG_SCHED_BMQ -+ .boost_prio = 0, -+ .bmq_idx = 15, -+ .bmq_node = LIST_HEAD_INIT(init_task.bmq_node), -+#endif -+#ifdef CONFIG_SCHED_PDS -+ .deadline = 0, -+ .sl_level = 0, -+ .sl_node = SKIPLIST_NODE_INIT(init_task.sl_node), -+#endif -+ .time_slice = HZ, -+#else - .se = { - .group_node = LIST_HEAD_INIT(init_task.se.group_node), - }, -@@ -94,6 +113,7 @@ struct task_struct init_task - .run_list = LIST_HEAD_INIT(init_task.rt.run_list), - .time_slice = RR_TIMESLICE, - }, -+#endif - .tasks = LIST_HEAD_INIT(init_task.tasks), - #ifdef CONFIG_SMP - .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), -diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c -index 
642415b8c3c9..7e0e1fe18035 100644 ---- a/kernel/cgroup/cpuset.c -+++ b/kernel/cgroup/cpuset.c -@@ -636,7 +636,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) - return ret; - } - --#ifdef CONFIG_SMP -+#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_ALT) - /* - * Helper routine for generate_sched_domains(). - * Do cpusets a, b have overlapping effective cpus_allowed masks? -@@ -1009,7 +1009,7 @@ static void rebuild_sched_domains_locked(void) - /* Have scheduler rebuild the domains */ - partition_and_rebuild_sched_domains(ndoms, doms, attr); - } --#else /* !CONFIG_SMP */ -+#else /* !CONFIG_SMP || CONFIG_SCHED_ALT */ - static void rebuild_sched_domains_locked(void) - { - } -diff --git a/kernel/delayacct.c b/kernel/delayacct.c -index 27725754ac99..769d773c7182 100644 ---- a/kernel/delayacct.c -+++ b/kernel/delayacct.c -@@ -106,7 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) - */ - t1 = tsk->sched_info.pcount; - t2 = tsk->sched_info.run_delay; -- t3 = tsk->se.sum_exec_runtime; -+ t3 = tsk_seruntime(tsk); - - d->cpu_count += t1; - -diff --git a/kernel/exit.c b/kernel/exit.c -index 733e80f334e7..3f3506c851fd 100644 ---- a/kernel/exit.c -+++ b/kernel/exit.c -@@ -121,7 +121,7 @@ static void __exit_signal(struct task_struct *tsk) - sig->curr_target = next_thread(tsk); - } - -- add_device_randomness((const void*) &tsk->se.sum_exec_runtime, -+ add_device_randomness((const void*) &tsk_seruntime(tsk), - sizeof(unsigned long long)); - - /* -@@ -142,7 +142,7 @@ static void __exit_signal(struct task_struct *tsk) - sig->inblock += task_io_get_inblock(tsk); - sig->oublock += task_io_get_oublock(tsk); - task_io_accounting_add(&sig->ioac, &tsk->ioac); -- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; -+ sig->sum_sched_runtime += tsk_seruntime(tsk); - sig->nr_threads--; - __unhash_process(tsk, group_dead); - write_sequnlock(&sig->stats_lock); -diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c 
-index f6310f848f34..4176ad070bc9 100644 ---- a/kernel/livepatch/transition.c -+++ b/kernel/livepatch/transition.c -@@ -306,7 +306,11 @@ static bool klp_try_switch_task(struct task_struct *task) - */ - rq = task_rq_lock(task, &flags); - -+#ifdef CONFIG_SCHED_ALT -+ if (task_running(task) && task != current) { -+#else - if (task_running(rq, task) && task != current) { -+#endif - snprintf(err_buf, STACK_ERR_BUF_SIZE, - "%s: %s:%d is running\n", __func__, task->comm, - task->pid); -diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c -index cfdd5b93264d..84c284eb544a 100644 ---- a/kernel/locking/rtmutex.c -+++ b/kernel/locking/rtmutex.c -@@ -227,15 +227,19 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, - * Only use with rt_mutex_waiter_{less,equal}() - */ - #define task_to_waiter(p) \ -- &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline } -+ &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = __tsk_deadline(p) } - - static inline int - rt_mutex_waiter_less(struct rt_mutex_waiter *left, - struct rt_mutex_waiter *right) - { -+#ifdef CONFIG_SCHED_PDS -+ return (left->deadline < right->deadline); -+#else - if (left->prio < right->prio) - return 1; - -+#ifndef CONFIG_SCHED_BMQ - /* - * If both waiters have dl_prio(), we check the deadlines of the - * associated tasks. -@@ -244,17 +248,23 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left, - */ - if (dl_prio(left->prio)) - return dl_time_before(left->deadline, right->deadline); -+#endif - - return 0; -+#endif - } - - static inline int - rt_mutex_waiter_equal(struct rt_mutex_waiter *left, - struct rt_mutex_waiter *right) - { -+#ifdef CONFIG_SCHED_PDS -+ return (left->deadline == right->deadline); -+#else - if (left->prio != right->prio) - return 0; - -+#ifndef CONFIG_SCHED_BMQ - /* - * If both waiters have dl_prio(), we check the deadlines of the - * associated tasks. 
-@@ -263,8 +273,10 @@ rt_mutex_waiter_equal(struct rt_mutex_waiter *left, - */ - if (dl_prio(left->prio)) - return left->deadline == right->deadline; -+#endif - - return 1; -+#endif - } - - static void -@@ -678,7 +690,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, - * the values of the node being removed. - */ - waiter->prio = task->prio; -- waiter->deadline = task->dl.deadline; -+ waiter->deadline = __tsk_deadline(task); - - rt_mutex_enqueue(lock, waiter); - -@@ -951,7 +963,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, - waiter->task = task; - waiter->lock = lock; - waiter->prio = task->prio; -- waiter->deadline = task->dl.deadline; -+ waiter->deadline = __tsk_deadline(task); - - /* Get the top priority waiter on the lock */ - if (rt_mutex_has_waiters(lock)) -diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile -index 5fc9c9b70862..eb6d7d87779f 100644 ---- a/kernel/sched/Makefile -+++ b/kernel/sched/Makefile -@@ -22,14 +22,20 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) - CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer - endif - --obj-y += core.o loadavg.o clock.o cputime.o --obj-y += idle.o fair.o rt.o deadline.o --obj-y += wait.o wait_bit.o swait.o completion.o -- --obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o -+ifdef CONFIG_SCHED_ALT -+obj-y += alt_core.o alt_debug.o -+else -+obj-y += core.o -+obj-y += fair.o rt.o deadline.o -+obj-$(CONFIG_SMP) += cpudeadline.o stop_task.o - obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o --obj-$(CONFIG_SCHEDSTATS) += stats.o - obj-$(CONFIG_SCHED_DEBUG) += debug.o -+endif -+obj-y += loadavg.o clock.o cputime.o -+obj-y += idle.o -+obj-y += wait.o wait_bit.o swait.o completion.o -+obj-$(CONFIG_SMP) += cpupri.o pelt.o topology.o -+obj-$(CONFIG_SCHEDSTATS) += stats.o - obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o - obj-$(CONFIG_CPU_FREQ) += cpufreq.o - obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o -diff --git a/kernel/sched/alt_core.c 
b/kernel/sched/alt_core.c -new file mode 100644 -index 000000000000..f36264fea75c ---- /dev/null -+++ b/kernel/sched/alt_core.c -@@ -0,0 +1,6360 @@ -+/* -+ * kernel/sched/alt_core.c -+ * -+ * Core alternative kernel scheduler code and related syscalls -+ * -+ * Copyright (C) 1991-2002 Linus Torvalds -+ * -+ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes -+ * a whole lot of those previous things. -+ * 2017-09-06 Priority and Deadline based Skip list multiple queue kernel -+ * scheduler by Alfred Chen. -+ * 2019-02-20 BMQ(BitMap Queue) kernel scheduler by Alfred Chen. -+ */ -+#include "sched.h" -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+ -+#include -+ -+#include "../workqueue_internal.h" -+#include "../../fs/io-wq.h" -+#include "../smpboot.h" -+ -+#include "pelt.h" -+#include "smp.h" -+ -+#define CREATE_TRACE_POINTS -+#include -+ -+#define ALT_SCHED_VERSION "v5.9-r0" -+ -+/* rt_prio(prio) defined in include/linux/sched/rt.h */ -+#define rt_task(p) rt_prio((p)->prio) -+#define rt_policy(policy) ((policy) == SCHED_FIFO || (policy) == SCHED_RR) -+#define task_has_rt_policy(p) (rt_policy((p)->policy)) -+ -+#define STOP_PRIO (MAX_RT_PRIO - 1) -+ -+/* Default time slice is 4 in ms, can be set via kernel parameter "sched_timeslice" */ -+u64 sched_timeslice_ns __read_mostly = (4 * 1000 * 1000); -+ -+static int __init sched_timeslice(char *str) -+{ -+ int timeslice_us; -+ -+ get_option(&str, &timeslice_us); -+ if (timeslice_us >= 1000) -+ sched_timeslice_ns = timeslice_us * 1000; -+ -+ return 0; -+} -+early_param("sched_timeslice", sched_timeslice); -+ -+/* Reschedule if less than this many μs left */ -+#define RESCHED_NS (100 * 1000) -+ -+/** -+ * sched_yield_type - Choose what sort of yield sched_yield will perform. -+ * 0: No yield. -+ * 1: Deboost and requeue task. 
(default) -+ * 2: Set rq skip task. -+ */ -+int sched_yield_type __read_mostly = 1; -+ -+#ifdef CONFIG_SMP -+static cpumask_t sched_rq_pending_mask ____cacheline_aligned_in_smp; -+ -+DEFINE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_CHK_LEVEL], sched_cpu_affinity_masks); -+DEFINE_PER_CPU(cpumask_t *, sched_cpu_affinity_end_mask); -+DEFINE_PER_CPU(cpumask_t *, sched_cpu_llc_mask); -+ -+#ifdef CONFIG_SCHED_SMT -+DEFINE_STATIC_KEY_FALSE(sched_smt_present); -+EXPORT_SYMBOL_GPL(sched_smt_present); -+#endif -+ -+/* -+ * Keep a unique ID per domain (we use the first CPUs number in the cpumask of -+ * the domain), this allows us to quickly tell if two cpus are in the same cache -+ * domain, see cpus_share_cache(). -+ */ -+DEFINE_PER_CPU(int, sd_llc_id); -+#endif /* CONFIG_SMP */ -+ -+static DEFINE_MUTEX(sched_hotcpu_mutex); -+ -+DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -+ -+#ifndef prepare_arch_switch -+# define prepare_arch_switch(next) do { } while (0) -+#endif -+#ifndef finish_arch_post_lock_switch -+# define finish_arch_post_lock_switch() do { } while (0) -+#endif -+ -+#define IDLE_WM (IDLE_TASK_SCHED_PRIO) -+ -+#ifdef CONFIG_SCHED_SMT -+static cpumask_t sched_sg_idle_mask ____cacheline_aligned_in_smp; -+#endif -+static cpumask_t sched_rq_watermark[SCHED_BITS] ____cacheline_aligned_in_smp; -+ -+#ifdef CONFIG_SCHED_BMQ -+#include "bmq_imp.h" -+#endif -+#ifdef CONFIG_SCHED_PDS -+#include "pds_imp.h" -+#endif -+ -+static inline void update_sched_rq_watermark(struct rq *rq) -+{ -+ unsigned long watermark = sched_queue_watermark(rq); -+ unsigned long last_wm = rq->watermark; -+ unsigned long i; -+ int cpu; -+ -+ /*printk(KERN_INFO "sched: watermark(%d) %d, last %d\n", -+ cpu_of(rq), watermark, last_wm);*/ -+ if (watermark == last_wm) -+ return; -+ -+ rq->watermark = watermark; -+ cpu = cpu_of(rq); -+ if (watermark < last_wm) { -+ for (i = watermark + 1; i <= last_wm; i++) -+ cpumask_andnot(&sched_rq_watermark[i], -+ &sched_rq_watermark[i], cpumask_of(cpu)); -+#ifdef 
CONFIG_SCHED_SMT -+ if (!static_branch_likely(&sched_smt_present)) -+ return; -+ if (IDLE_WM == last_wm) -+ cpumask_andnot(&sched_sg_idle_mask, -+ &sched_sg_idle_mask, cpu_smt_mask(cpu)); -+#endif -+ return; -+ } -+ /* last_wm < watermark */ -+ for (i = last_wm + 1; i <= watermark; i++) -+ cpumask_set_cpu(cpu, &sched_rq_watermark[i]); -+#ifdef CONFIG_SCHED_SMT -+ if (!static_branch_likely(&sched_smt_present)) -+ return; -+ if (IDLE_WM == watermark) { -+ cpumask_t tmp; -+ cpumask_and(&tmp, cpu_smt_mask(cpu), &sched_rq_watermark[IDLE_WM]); -+ if (cpumask_equal(&tmp, cpu_smt_mask(cpu))) -+ cpumask_or(&sched_sg_idle_mask, cpu_smt_mask(cpu), -+ &sched_sg_idle_mask); -+ } -+#endif -+} -+ -+static inline struct task_struct *rq_runnable_task(struct rq *rq) -+{ -+ struct task_struct *next = sched_rq_first_task(rq); -+ -+ if (unlikely(next == rq->skip)) -+ next = sched_rq_next_task(next, rq); -+ -+ return next; -+} -+ -+/* -+ * Serialization rules: -+ * -+ * Lock order: -+ * -+ * p->pi_lock -+ * rq->lock -+ * hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls) -+ * -+ * rq1->lock -+ * rq2->lock where: rq1 < rq2 -+ * -+ * Regular state: -+ * -+ * Normal scheduling state is serialized by rq->lock. __schedule() takes the -+ * local CPU's rq->lock, it optionally removes the task from the runqueue and -+ * always looks at the local rq data structures to find the most elegible task -+ * to run next. -+ * -+ * Task enqueue is also under rq->lock, possibly taken from another CPU. -+ * Wakeups from another LLC domain might use an IPI to transfer the enqueue to -+ * the local CPU to avoid bouncing the runqueue state around [ see -+ * ttwu_queue_wakelist() ] -+ * -+ * Task wakeup, specifically wakeups that involve migration, are horribly -+ * complicated to avoid having to take two rq->locks. -+ * -+ * Special state: -+ * -+ * System-calls and anything external will use task_rq_lock() which acquires -+ * both p->pi_lock and rq->lock. 
As a consequence the state they change is -+ * stable while holding either lock: -+ * -+ * - sched_setaffinity()/ -+ * set_cpus_allowed_ptr(): p->cpus_ptr, p->nr_cpus_allowed -+ * - set_user_nice(): p->se.load, p->*prio -+ * - __sched_setscheduler(): p->sched_class, p->policy, p->*prio, -+ * p->se.load, p->rt_priority, -+ * p->dl.dl_{runtime, deadline, period, flags, bw, density} -+ * - sched_setnuma(): p->numa_preferred_nid -+ * - sched_move_task()/ -+ * cpu_cgroup_fork(): p->sched_task_group -+ * - uclamp_update_active() p->uclamp* -+ * -+ * p->state <- TASK_*: -+ * -+ * is changed locklessly using set_current_state(), __set_current_state() or -+ * set_special_state(), see their respective comments, or by -+ * try_to_wake_up(). This latter uses p->pi_lock to serialize against -+ * concurrent self. -+ * -+ * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }: -+ * -+ * is set by activate_task() and cleared by deactivate_task(), under -+ * rq->lock. Non-zero indicates the task is runnable, the special -+ * ON_RQ_MIGRATING state is used for migration without holding both -+ * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock(). -+ * -+ * p->on_cpu <- { 0, 1 }: -+ * -+ * is set by prepare_task() and cleared by finish_task() such that it will be -+ * set before p is scheduled-in and cleared after p is scheduled-out, both -+ * under rq->lock. Non-zero indicates the task is running on its CPU. -+ * -+ * [ The astute reader will observe that it is possible for two tasks on one -+ * CPU to have ->on_cpu = 1 at the same time. ] -+ * -+ * task_cpu(p): is changed by set_task_cpu(), the rules are: -+ * -+ * - Don't call set_task_cpu() on a blocked task: -+ * -+ * We don't care what CPU we're not running on, this simplifies hotplug, -+ * the CPU assignment of blocked tasks isn't required to be valid. -+ * -+ * - for try_to_wake_up(), called under p->pi_lock: -+ * -+ * This allows try_to_wake_up() to only take one rq->lock, see its comment. 
-+ * -+ * - for migration called under rq->lock: -+ * [ see task_on_rq_migrating() in task_rq_lock() ] -+ * -+ * o move_queued_task() -+ * o detach_task() -+ * -+ * - for migration called under double_rq_lock(): -+ * -+ * o __migrate_swap_task() -+ * o push_rt_task() / pull_rt_task() -+ * o push_dl_task() / pull_dl_task() -+ * o dl_task_offline_migration() -+ * -+ */ -+ -+/* -+ * Context: p->pi_lock -+ */ -+static inline struct rq -+*__task_access_lock(struct task_struct *p, raw_spinlock_t **plock) -+{ -+ struct rq *rq; -+ for (;;) { -+ rq = task_rq(p); -+ if (p->on_cpu || task_on_rq_queued(p)) { -+ raw_spin_lock(&rq->lock); -+ if (likely((p->on_cpu || task_on_rq_queued(p)) -+ && rq == task_rq(p))) { -+ *plock = &rq->lock; -+ return rq; -+ } -+ raw_spin_unlock(&rq->lock); -+ } else if (task_on_rq_migrating(p)) { -+ do { -+ cpu_relax(); -+ } while (unlikely(task_on_rq_migrating(p))); -+ } else { -+ *plock = NULL; -+ return rq; -+ } -+ } -+} -+ -+static inline void -+__task_access_unlock(struct task_struct *p, raw_spinlock_t *lock) -+{ -+ if (NULL != lock) -+ raw_spin_unlock(lock); -+} -+ -+static inline struct rq -+*task_access_lock_irqsave(struct task_struct *p, raw_spinlock_t **plock, -+ unsigned long *flags) -+{ -+ struct rq *rq; -+ for (;;) { -+ rq = task_rq(p); -+ if (p->on_cpu || task_on_rq_queued(p)) { -+ raw_spin_lock_irqsave(&rq->lock, *flags); -+ if (likely((p->on_cpu || task_on_rq_queued(p)) -+ && rq == task_rq(p))) { -+ *plock = &rq->lock; -+ return rq; -+ } -+ raw_spin_unlock_irqrestore(&rq->lock, *flags); -+ } else if (task_on_rq_migrating(p)) { -+ do { -+ cpu_relax(); -+ } while (unlikely(task_on_rq_migrating(p))); -+ } else { -+ raw_spin_lock_irqsave(&p->pi_lock, *flags); -+ if (likely(!p->on_cpu && !p->on_rq && -+ rq == task_rq(p))) { -+ *plock = &p->pi_lock; -+ return rq; -+ } -+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags); -+ } -+ } -+} -+ -+static inline void -+task_access_unlock_irqrestore(struct task_struct *p, raw_spinlock_t *lock, -+ 
unsigned long *flags) -+{ -+ raw_spin_unlock_irqrestore(lock, *flags); -+} -+ -+/* -+ * __task_rq_lock - lock the rq @p resides on. -+ */ -+struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ lockdep_assert_held(&p->pi_lock); -+ -+ for (;;) { -+ rq = task_rq(p); -+ raw_spin_lock(&rq->lock); -+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) -+ return rq; -+ raw_spin_unlock(&rq->lock); -+ -+ while (unlikely(task_on_rq_migrating(p))) -+ cpu_relax(); -+ } -+} -+ -+/* -+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. -+ */ -+struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(p->pi_lock) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ for (;;) { -+ raw_spin_lock_irqsave(&p->pi_lock, rf->flags); -+ rq = task_rq(p); -+ raw_spin_lock(&rq->lock); -+ /* -+ * move_queued_task() task_rq_lock() -+ * -+ * ACQUIRE (rq->lock) -+ * [S] ->on_rq = MIGRATING [L] rq = task_rq() -+ * WMB (__set_task_cpu()) ACQUIRE (rq->lock); -+ * [S] ->cpu = new_cpu [L] task_rq() -+ * [L] ->on_rq -+ * RELEASE (rq->lock) -+ * -+ * If we observe the old CPU in task_rq_lock(), the acquire of -+ * the old rq->lock will fully serialize against the stores. -+ * -+ * If we observe the new CPU in task_rq_lock(), the address -+ * dependency headed by '[L] rq = task_rq()' and the acquire -+ * will pair with the WMB to ensure we then also see migrating. 
-+ */ -+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { -+ return rq; -+ } -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); -+ -+ while (unlikely(task_on_rq_migrating(p))) -+ cpu_relax(); -+ } -+} -+ -+static inline void -+rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) -+ __acquires(rq->lock) -+{ -+ raw_spin_lock_irqsave(&rq->lock, rf->flags); -+} -+ -+static inline void -+rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock_irqrestore(&rq->lock, rf->flags); -+} -+ -+/* -+ * RQ-clock updating methods: -+ */ -+ -+static void update_rq_clock_task(struct rq *rq, s64 delta) -+{ -+/* -+ * In theory, the compile should just see 0 here, and optimize out the call -+ * to sched_rt_avg_update. But I don't trust it... -+ */ -+ s64 __maybe_unused steal = 0, irq_delta = 0; -+ -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; -+ -+ /* -+ * Since irq_time is only updated on {soft,}irq_exit, we might run into -+ * this case when a previous update_rq_clock() happened inside a -+ * {soft,}irq region. -+ * -+ * When this happens, we stop ->clock_task and only update the -+ * prev_irq_time stamp to account for the part that fit, so that a next -+ * update will consume the rest. This ensures ->clock_task is -+ * monotonic. -+ * -+ * It does however cause some slight miss-attribution of {soft,}irq -+ * time, a more accurate solution would be to update the irq_time using -+ * the current rq->clock timestamp, except that would require using -+ * atomic ops. 
-+ */ -+ if (irq_delta > delta) -+ irq_delta = delta; -+ -+ rq->prev_irq_time += irq_delta; -+ delta -= irq_delta; -+#endif -+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING -+ if (static_key_false((&paravirt_steal_rq_enabled))) { -+ steal = paravirt_steal_clock(cpu_of(rq)); -+ steal -= rq->prev_steal_time_rq; -+ -+ if (unlikely(steal > delta)) -+ steal = delta; -+ -+ rq->prev_steal_time_rq += steal; -+ delta -= steal; -+ } -+#endif -+ -+ rq->clock_task += delta; -+ -+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ -+ if ((irq_delta + steal)) -+ update_irq_load_avg(rq, irq_delta + steal); -+#endif -+} -+ -+static inline void update_rq_clock(struct rq *rq) -+{ -+ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; -+ -+ if (unlikely(delta <= 0)) -+ return; -+ rq->clock += delta; -+ update_rq_clock_task(rq, delta); -+} -+ -+#ifdef CONFIG_NO_HZ_FULL -+/* -+ * Tick may be needed by tasks in the runqueue depending on their policy and -+ * requirements. If tick is needed, lets send the target an IPI to kick it out -+ * of nohz mode if necessary. 
-+ */ -+static inline void sched_update_tick_dependency(struct rq *rq) -+{ -+ int cpu = cpu_of(rq); -+ -+ if (!tick_nohz_full_cpu(cpu)) -+ return; -+ -+ if (rq->nr_running < 2) -+ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); -+ else -+ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); -+} -+#else /* !CONFIG_NO_HZ_FULL */ -+static inline void sched_update_tick_dependency(struct rq *rq) { } -+#endif -+ -+/* -+ * Add/Remove/Requeue task to/from the runqueue routines -+ * Context: rq->lock -+ */ -+static inline void dequeue_task(struct task_struct *p, struct rq *rq, int flags) -+{ -+ lockdep_assert_held(&rq->lock); -+ -+ /*printk(KERN_INFO "sched: dequeue(%d) %px %016llx\n", cpu_of(rq), p, p->priodl);*/ -+ WARN_ONCE(task_rq(p) != rq, "sched: dequeue task reside on cpu%d from cpu%d\n", -+ task_cpu(p), cpu_of(rq)); -+ -+ __SCHED_DEQUEUE_TASK(p, rq, flags, update_sched_rq_watermark(rq)); -+ --rq->nr_running; -+#ifdef CONFIG_SMP -+ if (1 == rq->nr_running) -+ cpumask_clear_cpu(cpu_of(rq), &sched_rq_pending_mask); -+#endif -+ -+ sched_update_tick_dependency(rq); -+} -+ -+static inline void enqueue_task(struct task_struct *p, struct rq *rq, int flags) -+{ -+ lockdep_assert_held(&rq->lock); -+ -+ /*printk(KERN_INFO "sched: enqueue(%d) %px %016llx\n", cpu_of(rq), p, p->priodl);*/ -+ WARN_ONCE(task_rq(p) != rq, "sched: enqueue task reside on cpu%d to cpu%d\n", -+ task_cpu(p), cpu_of(rq)); -+ -+ __SCHED_ENQUEUE_TASK(p, rq, flags); -+ update_sched_rq_watermark(rq); -+ ++rq->nr_running; -+#ifdef CONFIG_SMP -+ if (2 == rq->nr_running) -+ cpumask_set_cpu(cpu_of(rq), &sched_rq_pending_mask); -+#endif -+ -+ sched_update_tick_dependency(rq); -+ -+ /* -+ * If in_iowait is set, the code below may not trigger any cpufreq -+ * utilization updates, so do it here explicitly with the IOWAIT flag -+ * passed. 
-+ */ -+ if (p->in_iowait) -+ cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); -+} -+ -+static inline void requeue_task(struct task_struct *p, struct rq *rq) -+{ -+ lockdep_assert_held(&rq->lock); -+ /*printk(KERN_INFO "sched: requeue(%d) %px %016llx\n", cpu_of(rq), p, p->priodl);*/ -+ WARN_ONCE(task_rq(p) != rq, "sched: cpu[%d] requeue task reside on cpu%d\n", -+ cpu_of(rq), task_cpu(p)); -+ -+ __SCHED_REQUEUE_TASK(p, rq, update_sched_rq_watermark(rq)); -+} -+ -+/* -+ * cmpxchg based fetch_or, macro so it works for different integer types -+ */ -+#define fetch_or(ptr, mask) \ -+ ({ \ -+ typeof(ptr) _ptr = (ptr); \ -+ typeof(mask) _mask = (mask); \ -+ typeof(*_ptr) _old, _val = *_ptr; \ -+ \ -+ for (;;) { \ -+ _old = cmpxchg(_ptr, _val, _val | _mask); \ -+ if (_old == _val) \ -+ break; \ -+ _val = _old; \ -+ } \ -+ _old; \ -+}) -+ -+#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) -+/* -+ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, -+ * this avoids any races wrt polling state changes and thereby avoids -+ * spurious IPIs. -+ */ -+static bool set_nr_and_not_polling(struct task_struct *p) -+{ -+ struct thread_info *ti = task_thread_info(p); -+ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); -+} -+ -+/* -+ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. -+ * -+ * If this returns true, then the idle task promises to call -+ * sched_ttwu_pending() and reschedule soon. 
-+ */ -+static bool set_nr_if_polling(struct task_struct *p) -+{ -+ struct thread_info *ti = task_thread_info(p); -+ typeof(ti->flags) old, val = READ_ONCE(ti->flags); -+ -+ for (;;) { -+ if (!(val & _TIF_POLLING_NRFLAG)) -+ return false; -+ if (val & _TIF_NEED_RESCHED) -+ return true; -+ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); -+ if (old == val) -+ break; -+ val = old; -+ } -+ return true; -+} -+ -+#else -+static bool set_nr_and_not_polling(struct task_struct *p) -+{ -+ set_tsk_need_resched(p); -+ return true; -+} -+ -+#ifdef CONFIG_SMP -+static bool set_nr_if_polling(struct task_struct *p) -+{ -+ return false; -+} -+#endif -+#endif -+ -+static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) -+{ -+ struct wake_q_node *node = &task->wake_q; -+ -+ /* -+ * Atomically grab the task, if ->wake_q is !nil already it means -+ * its already queued (either by us or someone else) and will get the -+ * wakeup due to that. -+ * -+ * In order to ensure that a pending wakeup will observe our pending -+ * state, even in the failed case, an explicit smp_mb() must be used. -+ */ -+ smp_mb__before_atomic(); -+ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) -+ return false; -+ -+ /* -+ * The head is context local, there can be no concurrency. -+ */ -+ *head->lastp = node; -+ head->lastp = &node->next; -+ return true; -+} -+ -+/** -+ * wake_q_add() - queue a wakeup for 'later' waking. -+ * @head: the wake_q_head to add @task to -+ * @task: the task to queue for 'later' wakeup -+ * -+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the -+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come -+ * instantly. -+ * -+ * This function must be used as-if it were wake_up_process(); IOW the task -+ * must be ready to be woken at this location. 
-+ */ -+void wake_q_add(struct wake_q_head *head, struct task_struct *task) -+{ -+ if (__wake_q_add(head, task)) -+ get_task_struct(task); -+} -+ -+/** -+ * wake_q_add_safe() - safely queue a wakeup for 'later' waking. -+ * @head: the wake_q_head to add @task to -+ * @task: the task to queue for 'later' wakeup -+ * -+ * Queue a task for later wakeup, most likely by the wake_up_q() call in the -+ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come -+ * instantly. -+ * -+ * This function must be used as-if it were wake_up_process(); IOW the task -+ * must be ready to be woken at this location. -+ * -+ * This function is essentially a task-safe equivalent to wake_q_add(). Callers -+ * that already hold reference to @task can call the 'safe' version and trust -+ * wake_q to do the right thing depending whether or not the @task is already -+ * queued for wakeup. -+ */ -+void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) -+{ -+ if (!__wake_q_add(head, task)) -+ put_task_struct(task); -+} -+ -+void wake_up_q(struct wake_q_head *head) -+{ -+ struct wake_q_node *node = head->first; -+ -+ while (node != WAKE_Q_TAIL) { -+ struct task_struct *task; -+ -+ task = container_of(node, struct task_struct, wake_q); -+ BUG_ON(!task); -+ /* task can safely be re-inserted now: */ -+ node = node->next; -+ task->wake_q.next = NULL; -+ -+ /* -+ * wake_up_process() executes a full barrier, which pairs with -+ * the queueing in wake_q_add() so as not to miss wakeups. -+ */ -+ wake_up_process(task); -+ put_task_struct(task); -+ } -+} -+ -+/* -+ * resched_curr - mark rq's current task 'to be rescheduled now'. -+ * -+ * On UP this means the setting of the need_resched flag, on SMP it -+ * might also involve a cross-CPU call to trigger the scheduler on -+ * the target CPU. 
-+ */ -+void resched_curr(struct rq *rq) -+{ -+ struct task_struct *curr = rq->curr; -+ int cpu; -+ -+ lockdep_assert_held(&rq->lock); -+ -+ if (test_tsk_need_resched(curr)) -+ return; -+ -+ cpu = cpu_of(rq); -+ if (cpu == smp_processor_id()) { -+ set_tsk_need_resched(curr); -+ set_preempt_need_resched(); -+ return; -+ } -+ -+ if (set_nr_and_not_polling(curr)) -+ smp_send_reschedule(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); -+} -+ -+void resched_cpu(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ if (cpu_online(cpu) || cpu == smp_processor_id()) -+ resched_curr(cpu_rq(cpu)); -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+} -+ -+#ifdef CONFIG_SMP -+#ifdef CONFIG_NO_HZ_COMMON -+void nohz_balance_enter_idle(int cpu) {} -+ -+void select_nohz_load_balancer(int stop_tick) {} -+ -+void set_cpu_sd_state_idle(void) {} -+ -+/* -+ * In the semi idle case, use the nearest busy CPU for migrating timers -+ * from an idle CPU. This is good for power-savings. -+ * -+ * We don't do similar optimization for completely idle system, as -+ * selecting an idle CPU will add more delays to the timers than intended -+ * (as that CPU's timer base may not be uptodate wrt jiffies etc). 
-+ */ -+int get_nohz_timer_target(void) -+{ -+ int i, cpu = smp_processor_id(), default_cpu = -1; -+ struct cpumask *mask; -+ -+ if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) { -+ if (!idle_cpu(cpu)) -+ return cpu; -+ default_cpu = cpu; -+ } -+ -+ for (mask = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); -+ mask < per_cpu(sched_cpu_affinity_end_mask, cpu); mask++) -+ for_each_cpu_and(i, mask, housekeeping_cpumask(HK_FLAG_TIMER)) -+ if (!idle_cpu(i)) -+ return i; -+ -+ if (default_cpu == -1) -+ default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER); -+ cpu = default_cpu; -+ -+ return cpu; -+} -+ -+/* -+ * When add_timer_on() enqueues a timer into the timer wheel of an -+ * idle CPU then this timer might expire before the next timer event -+ * which is scheduled to wake up that CPU. In case of a completely -+ * idle system the next event might even be infinite time into the -+ * future. wake_up_idle_cpu() ensures that the CPU is woken up and -+ * leaves the inner idle loop so the newly added timer is taken into -+ * account when the CPU goes back to idle and evaluates the timer -+ * wheel for the next timer event. -+ */ -+static inline void wake_up_idle_cpu(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ if (cpu == smp_processor_id()) -+ return; -+ -+ if (set_nr_and_not_polling(rq->idle)) -+ smp_send_reschedule(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); -+} -+ -+static inline bool wake_up_full_nohz_cpu(int cpu) -+{ -+ /* -+ * We just need the target to call irq_exit() and re-evaluate -+ * the next tick. The nohz full kick at least implies that. -+ * If needed we can still optimize that later with an -+ * empty IRQ. -+ */ -+ if (cpu_is_offline(cpu)) -+ return true; /* Don't try to wake offline CPUs. 
*/ -+ if (tick_nohz_full_cpu(cpu)) { -+ if (cpu != smp_processor_id() || -+ tick_nohz_tick_stopped()) -+ tick_nohz_full_kick_cpu(cpu); -+ return true; -+ } -+ -+ return false; -+} -+ -+void wake_up_nohz_cpu(int cpu) -+{ -+ if (!wake_up_full_nohz_cpu(cpu)) -+ wake_up_idle_cpu(cpu); -+} -+ -+static void nohz_csd_func(void *info) -+{ -+ struct rq *rq = info; -+ int cpu = cpu_of(rq); -+ unsigned int flags; -+ -+ /* -+ * Release the rq::nohz_csd. -+ */ -+ flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu)); -+ WARN_ON(!(flags & NOHZ_KICK_MASK)); -+ -+ rq->idle_balance = idle_cpu(cpu); -+ if (rq->idle_balance && !need_resched()) { -+ rq->nohz_idle_balance = flags; -+ raise_softirq_irqoff(SCHED_SOFTIRQ); -+ } -+} -+ -+#endif /* CONFIG_NO_HZ_COMMON */ -+#endif /* CONFIG_SMP */ -+ -+static inline void check_preempt_curr(struct rq *rq) -+{ -+ if (sched_rq_first_task(rq) != rq->curr) -+ resched_curr(rq); -+} -+ -+static inline void -+rq_csd_init(struct rq *rq, call_single_data_t *csd, smp_call_func_t func) -+{ -+ csd->flags = 0; -+ csd->func = func; -+ csd->info = rq; -+} -+ -+#ifdef CONFIG_SCHED_HRTICK -+/* -+ * Use HR-timers to deliver accurate preemption points. -+ */ -+ -+static void hrtick_clear(struct rq *rq) -+{ -+ if (hrtimer_active(&rq->hrtick_timer)) -+ hrtimer_cancel(&rq->hrtick_timer); -+} -+ -+/* -+ * High-resolution timer tick. -+ * Runs from hardirq context with interrupts disabled. 
-+ */ -+static enum hrtimer_restart hrtick(struct hrtimer *timer) -+{ -+ struct rq *rq = container_of(timer, struct rq, hrtick_timer); -+ struct task_struct *p; -+ -+ WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); -+ -+ raw_spin_lock(&rq->lock); -+ p = rq->curr; -+ p->time_slice = 0; -+ resched_curr(rq); -+ raw_spin_unlock(&rq->lock); -+ -+ return HRTIMER_NORESTART; -+} -+ -+/* -+ * Use hrtick when: -+ * - enabled by features -+ * - hrtimer is actually high res -+ */ -+static inline int hrtick_enabled(struct rq *rq) -+{ -+ /** -+ * Alt schedule FW doesn't support sched_feat yet -+ if (!sched_feat(HRTICK)) -+ return 0; -+ */ -+ if (!cpu_active(cpu_of(rq))) -+ return 0; -+ return hrtimer_is_hres_active(&rq->hrtick_timer); -+} -+ -+#ifdef CONFIG_SMP -+ -+static void __hrtick_restart(struct rq *rq) -+{ -+ struct hrtimer *timer = &rq->hrtick_timer; -+ -+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); -+} -+ -+/* -+ * called from hardirq (IPI) context -+ */ -+static void __hrtick_start(void *arg) -+{ -+ struct rq *rq = arg; -+ -+ raw_spin_lock(&rq->lock); -+ __hrtick_restart(rq); -+ raw_spin_unlock(&rq->lock); -+} -+ -+/* -+ * Called to set the hrtick timer state. -+ * -+ * called with rq->lock held and irqs disabled -+ */ -+void hrtick_start(struct rq *rq, u64 delay) -+{ -+ struct hrtimer *timer = &rq->hrtick_timer; -+ ktime_t time; -+ s64 delta; -+ -+ /* -+ * Don't schedule slices shorter than 10000ns, that just -+ * doesn't make sense and can cause timer DoS. -+ */ -+ delta = max_t(s64, delay, 10000LL); -+ time = ktime_add_ns(timer->base->get_time(), delta); -+ -+ hrtimer_set_expires(timer, time); -+ -+ if (rq == this_rq()) -+ __hrtick_restart(rq); -+ else -+ smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); -+} -+ -+#else -+/* -+ * Called to set the hrtick timer state. 
-+ * -+ * called with rq->lock held and irqs disabled -+ */ -+void hrtick_start(struct rq *rq, u64 delay) -+{ -+ /* -+ * Don't schedule slices shorter than 10000ns, that just -+ * doesn't make sense. Rely on vruntime for fairness. -+ */ -+ delay = max_t(u64, delay, 10000LL); -+ hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), -+ HRTIMER_MODE_REL_PINNED_HARD); -+} -+#endif /* CONFIG_SMP */ -+ -+static void hrtick_rq_init(struct rq *rq) -+{ -+#ifdef CONFIG_SMP -+ rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start); -+#endif -+ -+ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); -+ rq->hrtick_timer.function = hrtick; -+} -+#else /* CONFIG_SCHED_HRTICK */ -+static inline int hrtick_enabled(struct rq *rq) -+{ -+ return 0; -+} -+ -+static inline void hrtick_clear(struct rq *rq) -+{ -+} -+ -+static inline void hrtick_rq_init(struct rq *rq) -+{ -+} -+#endif /* CONFIG_SCHED_HRTICK */ -+ -+static inline int normal_prio(struct task_struct *p) -+{ -+ if (task_has_rt_policy(p)) -+ return MAX_RT_PRIO - 1 - p->rt_priority; -+ -+ return p->static_prio + MAX_PRIORITY_ADJ; -+} -+ -+/* -+ * Calculate the current priority, i.e. the priority -+ * taken into account by the scheduler. This value might -+ * be boosted by RT tasks as it will be RT if the task got -+ * RT-boosted. If not then it returns p->normal_prio. -+ */ -+static int effective_prio(struct task_struct *p) -+{ -+ p->normal_prio = normal_prio(p); -+ /* -+ * If we are RT tasks or we were boosted to RT priority, -+ * keep the priority unchanged. Otherwise, update priority -+ * to the normal priority: -+ */ -+ if (!rt_prio(p->prio)) -+ return p->normal_prio; -+ return p->prio; -+} -+ -+/* -+ * activate_task - move a task to the runqueue. 
-+ * -+ * Context: rq->lock -+ */ -+static void activate_task(struct task_struct *p, struct rq *rq) -+{ -+ enqueue_task(p, rq, ENQUEUE_WAKEUP); -+ p->on_rq = TASK_ON_RQ_QUEUED; -+ cpufreq_update_util(rq, 0); -+} -+ -+/* -+ * deactivate_task - remove a task from the runqueue. -+ * -+ * Context: rq->lock -+ */ -+static inline void deactivate_task(struct task_struct *p, struct rq *rq) -+{ -+ dequeue_task(p, rq, DEQUEUE_SLEEP); -+ p->on_rq = 0; -+ cpufreq_update_util(rq, 0); -+} -+ -+static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) -+{ -+#ifdef CONFIG_SMP -+ /* -+ * After ->cpu is set up to a new value, task_access_lock(p, ...) can be -+ * successfully executed on another CPU. We must ensure that updates of -+ * per-task data have been completed by this moment. -+ */ -+ smp_wmb(); -+ -+#ifdef CONFIG_THREAD_INFO_IN_TASK -+ WRITE_ONCE(p->cpu, cpu); -+#else -+ WRITE_ONCE(task_thread_info(p)->cpu, cpu); -+#endif -+#endif -+} -+ -+#ifdef CONFIG_SMP -+void set_task_cpu(struct task_struct *p, unsigned int new_cpu) -+{ -+#ifdef CONFIG_SCHED_DEBUG -+ /* -+ * We should never call set_task_cpu() on a blocked task, -+ * ttwu() will sort out the placement. -+ */ -+ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && -+ !p->on_rq); -+#ifdef CONFIG_LOCKDEP -+ /* -+ * The caller should hold either p->pi_lock or rq->lock, when changing -+ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. -+ * -+ * sched_move_task() holds both and thus holding either pins the cgroup, -+ * see task_group(). -+ */ -+ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || -+ lockdep_is_held(&task_rq(p)->lock))); -+#endif -+ /* -+ * Clearly, migrating tasks to offline CPUs is a fairly daft thing. 
-+ */ -+ WARN_ON_ONCE(!cpu_online(new_cpu)); -+#endif -+ if (task_cpu(p) == new_cpu) -+ return; -+ trace_sched_migrate_task(p, new_cpu); -+ rseq_migrate(p); -+ perf_event_task_migrate(p); -+ -+ __set_task_cpu(p, new_cpu); -+} -+ -+static inline bool is_per_cpu_kthread(struct task_struct *p) -+{ -+ return ((p->flags & PF_KTHREAD) && (1 == p->nr_cpus_allowed)); -+} -+ -+/* -+ * Per-CPU kthreads are allowed to run on !active && online CPUs, see -+ * __set_cpus_allowed_ptr() and select_fallback_rq(). -+ */ -+static inline bool is_cpu_allowed(struct task_struct *p, int cpu) -+{ -+ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) -+ return false; -+ -+ if (is_per_cpu_kthread(p)) -+ return cpu_online(cpu); -+ -+ return cpu_active(cpu); -+} -+ -+/* -+ * This is how migration works: -+ * -+ * 1) we invoke migration_cpu_stop() on the target CPU using -+ * stop_one_cpu(). -+ * 2) stopper starts to run (implicitly forcing the migrated thread -+ * off the CPU) -+ * 3) it checks whether the migrated task is still in the wrong runqueue. -+ * 4) if it's in the wrong runqueue then the migration thread removes -+ * it and puts it into the right queue. -+ * 5) stopper completes and stop_one_cpu() returns and the migration -+ * is done. -+ */ -+ -+/* -+ * move_queued_task - move a queued task to new rq. -+ * -+ * Returns (locked) new rq. Old rq's lock is released. -+ */ -+static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int -+ new_cpu) -+{ -+ lockdep_assert_held(&rq->lock); -+ -+ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); -+ dequeue_task(p, rq, 0); -+ set_task_cpu(p, new_cpu); -+ raw_spin_unlock(&rq->lock); -+ -+ rq = cpu_rq(new_cpu); -+ -+ raw_spin_lock(&rq->lock); -+ BUG_ON(task_cpu(p) != new_cpu); -+ enqueue_task(p, rq, 0); -+ p->on_rq = TASK_ON_RQ_QUEUED; -+ check_preempt_curr(rq); -+ -+ return rq; -+} -+ -+struct migration_arg { -+ struct task_struct *task; -+ int dest_cpu; -+}; -+ -+/* -+ * Move (not current) task off this CPU, onto the destination CPU. 
We're doing -+ * this because either it can't run here any more (set_cpus_allowed() -+ * away from this CPU, or CPU going down), or because we're -+ * attempting to rebalance this task on exec (sched_exec). -+ * -+ * So we race with normal scheduler movements, but that's OK, as long -+ * as the task is no longer on this CPU. -+ */ -+static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int -+ dest_cpu) -+{ -+ /* Affinity changed (again). */ -+ if (!is_cpu_allowed(p, dest_cpu)) -+ return rq; -+ -+ update_rq_clock(rq); -+ return move_queued_task(rq, p, dest_cpu); -+} -+ -+/* -+ * migration_cpu_stop - this will be executed by a highprio stopper thread -+ * and performs thread migration by bumping thread off CPU then -+ * 'pushing' onto another runqueue. -+ */ -+static int migration_cpu_stop(void *data) -+{ -+ struct migration_arg *arg = data; -+ struct task_struct *p = arg->task; -+ struct rq *rq = this_rq(); -+ -+ /* -+ * The original target CPU might have gone down and we might -+ * be on another CPU but it doesn't matter. -+ */ -+ local_irq_disable(); -+ /* -+ * We need to explicitly wake pending tasks before running -+ * __migrate_task() such that we will not miss enforcing cpus_ptr -+ * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. -+ */ -+ flush_smp_call_function_from_idle(); -+ -+ raw_spin_lock(&p->pi_lock); -+ raw_spin_lock(&rq->lock); -+ /* -+ * If task_rq(p) != rq, it cannot be migrated here, because we're -+ * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because -+ * we're holding p->pi_lock. 
-+ */ -+ if (task_rq(p) == rq && task_on_rq_queued(p)) -+ rq = __migrate_task(rq, p, arg->dest_cpu); -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock(&p->pi_lock); -+ -+ local_irq_enable(); -+ return 0; -+} -+ -+static inline void -+set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ cpumask_copy(&p->cpus_mask, new_mask); -+ p->nr_cpus_allowed = cpumask_weight(new_mask); -+} -+ -+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ set_cpus_allowed_common(p, new_mask); -+} -+#endif -+ -+/** -+ * task_curr - is this task currently executing on a CPU? -+ * @p: the task in question. -+ * -+ * Return: 1 if the task is currently executing. 0 otherwise. -+ */ -+inline int task_curr(const struct task_struct *p) -+{ -+ return cpu_curr(task_cpu(p)) == p; -+} -+ -+#ifdef CONFIG_SMP -+/* -+ * wait_task_inactive - wait for a thread to unschedule. -+ * -+ * If @match_state is nonzero, it's the @p->state value just checked and -+ * not expected to change. If it changes, i.e. @p might have woken up, -+ * then return zero. When we succeed in waiting for @p to be off its CPU, -+ * we return a positive number (its total switch count). If a second call -+ * a short while later returns the same number, the caller can be sure that -+ * @p has remained unscheduled the whole time. -+ * -+ * The caller must ensure that the task *will* unschedule sometime soon, -+ * else this function might spin for a *long* time. This function can't -+ * be called with interrupts off, or it may introduce deadlock with -+ * smp_call_function() if an IPI is sent by the same process we are -+ * waiting to become inactive. 
-+ */ -+unsigned long wait_task_inactive(struct task_struct *p, long match_state) -+{ -+ unsigned long flags; -+ bool running, on_rq; -+ unsigned long ncsw; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ -+ for (;;) { -+ rq = task_rq(p); -+ -+ /* -+ * If the task is actively running on another CPU -+ * still, just relax and busy-wait without holding -+ * any locks. -+ * -+ * NOTE! Since we don't hold any locks, it's not -+ * even sure that "rq" stays as the right runqueue! -+ * But we don't care, since this will return false -+ * if the runqueue has changed and p is actually now -+ * running somewhere else! -+ */ -+ while (task_running(p) && p == rq->curr) { -+ if (match_state && unlikely(p->state != match_state)) -+ return 0; -+ cpu_relax(); -+ } -+ -+ /* -+ * Ok, time to look more closely! We need the rq -+ * lock now, to be *sure*. If we're wrong, we'll -+ * just go back and repeat. -+ */ -+ task_access_lock_irqsave(p, &lock, &flags); -+ trace_sched_wait_task(p); -+ running = task_running(p); -+ on_rq = p->on_rq; -+ ncsw = 0; -+ if (!match_state || p->state == match_state) -+ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ -+ task_access_unlock_irqrestore(p, lock, &flags); -+ -+ /* -+ * If it changed from the expected state, bail out now. -+ */ -+ if (unlikely(!ncsw)) -+ break; -+ -+ /* -+ * Was it really running after all now that we -+ * checked with the proper locks actually held? -+ * -+ * Oops. Go back and try again.. -+ */ -+ if (unlikely(running)) { -+ cpu_relax(); -+ continue; -+ } -+ -+ /* -+ * It's not enough that it's not actively running, -+ * it must be off the runqueue _entirely_, and not -+ * preempted! -+ * -+ * So if it was still runnable (but just not actively -+ * running right now), it's preempted, and we should -+ * yield - it could be a while. -+ */ -+ if (unlikely(on_rq)) { -+ ktime_t to = NSEC_PER_SEC / HZ; -+ -+ set_current_state(TASK_UNINTERRUPTIBLE); -+ schedule_hrtimeout(&to, HRTIMER_MODE_REL); -+ continue; -+ } -+ -+ /* -+ * Ahh, all good. 
It wasn't running, and it wasn't -+ * runnable, which means that it will never become -+ * running in the future either. We're all done! -+ */ -+ break; -+ } -+ -+ return ncsw; -+} -+ -+/*** -+ * kick_process - kick a running thread to enter/exit the kernel -+ * @p: the to-be-kicked thread -+ * -+ * Cause a process which is running on another CPU to enter -+ * kernel-mode, without any delay. (to get signals handled.) -+ * -+ * NOTE: this function doesn't have to take the runqueue lock, -+ * because all it wants to ensure is that the remote task enters -+ * the kernel. If the IPI races and the task has been migrated -+ * to another CPU then no harm is done and the purpose has been -+ * achieved as well. -+ */ -+void kick_process(struct task_struct *p) -+{ -+ int cpu; -+ -+ preempt_disable(); -+ cpu = task_cpu(p); -+ if ((cpu != smp_processor_id()) && task_curr(p)) -+ smp_send_reschedule(cpu); -+ preempt_enable(); -+} -+EXPORT_SYMBOL_GPL(kick_process); -+ -+/* -+ * ->cpus_ptr is protected by both rq->lock and p->pi_lock -+ * -+ * A few notes on cpu_active vs cpu_online: -+ * -+ * - cpu_active must be a subset of cpu_online -+ * -+ * - on CPU-up we allow per-CPU kthreads on the online && !active CPU, -+ * see __set_cpus_allowed_ptr(). At this point the newly online -+ * CPU isn't yet part of the sched domains, and balancing will not -+ * see it. -+ * -+ * - on cpu-down we clear cpu_active() to mask the sched domains and -+ * avoid the load balancer to place new tasks on the to be removed -+ * CPU. Existing tasks will remain running there and will be taken -+ * off. -+ * -+ * This means that fallback selection must not select !active CPUs. -+ * And can assume that any active CPU must be online. Conversely -+ * select_task_rq() below may allow selection of !active CPUs in order -+ * to satisfy the above rules. 
-+ */ -+static int select_fallback_rq(int cpu, struct task_struct *p) -+{ -+ int nid = cpu_to_node(cpu); -+ const struct cpumask *nodemask = NULL; -+ enum { cpuset, possible, fail } state = cpuset; -+ int dest_cpu; -+ -+ /* -+ * If the node that the CPU is on has been offlined, cpu_to_node() -+ * will return -1. There is no CPU on the node, and we should -+ * select the CPU on the other node. -+ */ -+ if (nid != -1) { -+ nodemask = cpumask_of_node(nid); -+ -+ /* Look for allowed, online CPU in same node. */ -+ for_each_cpu(dest_cpu, nodemask) { -+ if (!cpu_active(dest_cpu)) -+ continue; -+ if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) -+ return dest_cpu; -+ } -+ } -+ -+ for (;;) { -+ /* Any allowed, online CPU? */ -+ for_each_cpu(dest_cpu, p->cpus_ptr) { -+ if (!is_cpu_allowed(p, dest_cpu)) -+ continue; -+ goto out; -+ } -+ -+ /* No more Mr. Nice Guy. */ -+ switch (state) { -+ case cpuset: -+ if (IS_ENABLED(CONFIG_CPUSETS)) { -+ cpuset_cpus_allowed_fallback(p); -+ state = possible; -+ break; -+ } -+ fallthrough; -+ case possible: -+ do_set_cpus_allowed(p, cpu_possible_mask); -+ state = fail; -+ break; -+ -+ case fail: -+ BUG(); -+ break; -+ } -+ } -+ -+out: -+ if (state != cpuset) { -+ /* -+ * Don't tell them about moving exiting tasks or -+ * kernel threads (both mm NULL), since they never -+ * leave kernel. 
-+ */ -+ if (p->mm && printk_ratelimit()) { -+ printk_deferred("process %d (%s) no longer affine to cpu%d\n", -+ task_pid_nr(p), p->comm, cpu); -+ } -+ } -+ -+ return dest_cpu; -+} -+ -+static inline int select_task_rq(struct task_struct *p, struct rq *rq) -+{ -+ cpumask_t chk_mask, tmp; -+ -+ if (unlikely(!cpumask_and(&chk_mask, p->cpus_ptr, cpu_online_mask))) -+ return select_fallback_rq(task_cpu(p), p); -+ -+ if ( -+#ifdef CONFIG_SCHED_SMT -+ cpumask_and(&tmp, &chk_mask, &sched_sg_idle_mask) || -+#endif -+ cpumask_and(&tmp, &chk_mask, &sched_rq_watermark[IDLE_WM]) || -+ cpumask_and(&tmp, &chk_mask, -+ &sched_rq_watermark[task_sched_prio(p, rq) + 1])) -+ return best_mask_cpu(task_cpu(p), &tmp); -+ -+ return best_mask_cpu(task_cpu(p), &chk_mask); -+} -+ -+void sched_set_stop_task(int cpu, struct task_struct *stop) -+{ -+ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; -+ struct sched_param start_param = { .sched_priority = 0 }; -+ struct task_struct *old_stop = cpu_rq(cpu)->stop; -+ -+ if (stop) { -+ /* -+ * Make it appear like a SCHED_FIFO task, its something -+ * userspace knows about and won't get confused about. -+ * -+ * Also, it will make PI more or less work without too -+ * much confusion -- but then, stop work should not -+ * rely on PI working anyway. -+ */ -+ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); -+ } -+ -+ cpu_rq(cpu)->stop = stop; -+ -+ if (old_stop) { -+ /* -+ * Reset it back to a normal scheduling policy so that -+ * it can die in pieces. -+ */ -+ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); -+ } -+} -+ -+/* -+ * Change a given task's CPU affinity. Migrate the thread to a -+ * proper CPU and schedule it away if the CPU it's executing on -+ * is removed from the allowed bitmask. -+ * -+ * NOTE: the caller must have a valid reference to the task, the -+ * task must not exit() & deallocate itself prematurely. The -+ * call is not atomic; no spinlocks may be held. 
-+ */ -+static int __set_cpus_allowed_ptr(struct task_struct *p, -+ const struct cpumask *new_mask, bool check) -+{ -+ const struct cpumask *cpu_valid_mask = cpu_active_mask; -+ int dest_cpu; -+ unsigned long flags; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ int ret = 0; -+ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ rq = __task_access_lock(p, &lock); -+ -+ if (p->flags & PF_KTHREAD) { -+ /* -+ * Kernel threads are allowed on online && !active CPUs -+ */ -+ cpu_valid_mask = cpu_online_mask; -+ } -+ -+ /* -+ * Must re-check here, to close a race against __kthread_bind(), -+ * sched_setaffinity() is not guaranteed to observe the flag. -+ */ -+ if (check && (p->flags & PF_NO_SETAFFINITY)) { -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ if (cpumask_equal(&p->cpus_mask, new_mask)) -+ goto out; -+ -+ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); -+ if (dest_cpu >= nr_cpu_ids) { -+ ret = -EINVAL; -+ goto out; -+ } -+ -+ do_set_cpus_allowed(p, new_mask); -+ -+ if (p->flags & PF_KTHREAD) { -+ /* -+ * For kernel threads that do indeed end up on online && -+ * !active we want to ensure they are strict per-CPU threads. -+ */ -+ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && -+ !cpumask_intersects(new_mask, cpu_active_mask) && -+ p->nr_cpus_allowed != 1); -+ } -+ -+ /* Can the task run on the task's current CPU? If so, we're done */ -+ if (cpumask_test_cpu(task_cpu(p), new_mask)) -+ goto out; -+ -+ if (task_running(p) || p->state == TASK_WAKING) { -+ struct migration_arg arg = { p, dest_cpu }; -+ -+ /* Need help from migration thread: drop lock and wait. */ -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); -+ return 0; -+ } -+ if (task_on_rq_queued(p)) { -+ /* -+ * OK, since we're going to drop the lock immediately -+ * afterwards anyway. 
-+ */ -+ update_rq_clock(rq); -+ rq = move_queued_task(rq, p, dest_cpu); -+ lock = &rq->lock; -+ } -+ -+out: -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+ return ret; -+} -+ -+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) -+{ -+ return __set_cpus_allowed_ptr(p, new_mask, false); -+} -+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); -+ -+#else /* CONFIG_SMP */ -+ -+static inline int select_task_rq(struct task_struct *p, struct rq *rq) -+{ -+ return 0; -+} -+ -+static inline int -+__set_cpus_allowed_ptr(struct task_struct *p, -+ const struct cpumask *new_mask, bool check) -+{ -+ return set_cpus_allowed_ptr(p, new_mask); -+} -+ -+#endif /* CONFIG_SMP */ -+ -+static void -+ttwu_stat(struct task_struct *p, int cpu, int wake_flags) -+{ -+ struct rq *rq; -+ -+ if (!schedstat_enabled()) -+ return; -+ -+ rq= this_rq(); -+ -+#ifdef CONFIG_SMP -+ if (cpu == rq->cpu) -+ __schedstat_inc(rq->ttwu_local); -+ else { -+ /** Alt schedule FW ToDo: -+ * How to do ttwu_wake_remote -+ */ -+ } -+#endif /* CONFIG_SMP */ -+ -+ __schedstat_inc(rq->ttwu_count); -+} -+ -+/* -+ * Mark the task runnable and perform wakeup-preemption. -+ */ -+static inline void -+ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) -+{ -+ check_preempt_curr(rq); -+ p->state = TASK_RUNNING; -+ trace_sched_wakeup(p); -+} -+ -+static inline void -+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) -+{ -+ if (p->sched_contributes_to_load) -+ rq->nr_uninterruptible--; -+ -+ activate_task(p, rq); -+ ttwu_do_wakeup(rq, p, 0); -+} -+ -+/* -+ * Consider @p being inside a wait loop: -+ * -+ * for (;;) { -+ * set_current_state(TASK_UNINTERRUPTIBLE); -+ * -+ * if (CONDITION) -+ * break; -+ * -+ * schedule(); -+ * } -+ * __set_current_state(TASK_RUNNING); -+ * -+ * between set_current_state() and schedule(). 
In this case @p is still -+ * runnable, so all that needs doing is change p->state back to TASK_RUNNING in -+ * an atomic manner. -+ * -+ * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq -+ * then schedule() must still happen and p->state can be changed to -+ * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we -+ * need to do a full wakeup with enqueue. -+ * -+ * Returns: %true when the wakeup is done, -+ * %false otherwise. -+ */ -+static int ttwu_runnable(struct task_struct *p, int wake_flags) -+{ -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ int ret = 0; -+ -+ rq = __task_access_lock(p, &lock); -+ if (task_on_rq_queued(p)) { -+ /* check_preempt_curr() may use rq clock */ -+ update_rq_clock(rq); -+ ttwu_do_wakeup(rq, p, wake_flags); -+ ret = 1; -+ } -+ __task_access_unlock(p, lock); -+ -+ return ret; -+} -+ -+#ifdef CONFIG_SMP -+void sched_ttwu_pending(void *arg) -+{ -+ struct llist_node *llist = arg; -+ struct rq *rq = this_rq(); -+ struct task_struct *p, *t; -+ struct rq_flags rf; -+ -+ if (!llist) -+ return; -+ -+ /* -+ * rq::ttwu_pending racy indication of out-standing wakeups. -+ * Races such that false-negatives are possible, since they -+ * are shorter lived that false-positives would be. -+ */ -+ WRITE_ONCE(rq->ttwu_pending, 0); -+ -+ rq_lock_irqsave(rq, &rf); -+ update_rq_clock(rq); -+ -+ llist_for_each_entry_safe(p, t, llist, wake_entry.llist) { -+ if (WARN_ON_ONCE(p->on_cpu)) -+ smp_cond_load_acquire(&p->on_cpu, !VAL); -+ -+ if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq))) -+ set_task_cpu(p, cpu_of(rq)); -+ -+ ttwu_do_activate(rq, p, p->sched_remote_wakeup ? 
WF_MIGRATED : 0); -+ } -+ -+ rq_unlock_irqrestore(rq, &rf); -+} -+ -+void send_call_function_single_ipi(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ if (!set_nr_if_polling(rq->idle)) -+ arch_send_call_function_single_ipi(cpu); -+ else -+ trace_sched_wake_idle_without_ipi(cpu); -+} -+ -+/* -+ * Queue a task on the target CPUs wake_list and wake the CPU via IPI if -+ * necessary. The wakee CPU on receipt of the IPI will queue the task -+ * via sched_ttwu_wakeup() for activation so the wakee incurs the cost -+ * of the wakeup instead of the waker. -+ */ -+static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); -+ -+ WRITE_ONCE(rq->ttwu_pending, 1); -+ __smp_call_single_queue(cpu, &p->wake_entry.llist); -+} -+ -+static inline bool ttwu_queue_cond(int cpu, int wake_flags) -+{ -+ /* -+ * If the CPU does not share cache, then queue the task on the -+ * remote rqs wakelist to avoid accessing remote data. -+ */ -+ if (!cpus_share_cache(smp_processor_id(), cpu)) -+ return true; -+ -+ /* -+ * If the task is descheduling and the only running task on the -+ * CPU then use the wakelist to offload the task activation to -+ * the soon-to-be-idle CPU as the current CPU is likely busy. -+ * nr_running is checked to avoid unnecessary task stacking. 
-+ */ -+ if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1) -+ return true; -+ -+ return false; -+} -+ -+static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) -+{ -+ if (__is_defined(ALT_SCHED_TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) { -+ if (WARN_ON_ONCE(cpu == smp_processor_id())) -+ return false; -+ -+ sched_clock_cpu(cpu); /* Sync clocks across CPUs */ -+ __ttwu_queue_wakelist(p, cpu, wake_flags); -+ return true; -+ } -+ -+ return false; -+} -+ -+void wake_up_if_idle(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ rcu_read_lock(); -+ -+ if (!is_idle_task(rcu_dereference(rq->curr))) -+ goto out; -+ -+ if (set_nr_if_polling(rq->idle)) { -+ trace_sched_wake_idle_without_ipi(cpu); -+ } else { -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ if (is_idle_task(rq->curr)) -+ smp_send_reschedule(cpu); -+ /* Else CPU is not idle, do nothing here */ -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ } -+ -+out: -+ rcu_read_unlock(); -+} -+ -+bool cpus_share_cache(int this_cpu, int that_cpu) -+{ -+ return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); -+} -+#else /* !CONFIG_SMP */ -+ -+static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) -+{ -+ return false; -+} -+ -+#endif /* CONFIG_SMP */ -+ -+static inline void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ if (ttwu_queue_wakelist(p, cpu, wake_flags)) -+ return; -+ -+ raw_spin_lock(&rq->lock); -+ update_rq_clock(rq); -+ ttwu_do_activate(rq, p, wake_flags); -+ raw_spin_unlock(&rq->lock); -+} -+ -+/* -+ * Notes on Program-Order guarantees on SMP systems. -+ * -+ * MIGRATION -+ * -+ * The basic program-order guarantee on SMP systems is that when a task [t] -+ * migrates, all its activity on its old CPU [c0] happens-before any subsequent -+ * execution on its new CPU [c1]. 
-+ * -+ * For migration (of runnable tasks) this is provided by the following means: -+ * -+ * A) UNLOCK of the rq(c0)->lock scheduling out task t -+ * B) migration for t is required to synchronize *both* rq(c0)->lock and -+ * rq(c1)->lock (if not at the same time, then in that order). -+ * C) LOCK of the rq(c1)->lock scheduling in task -+ * -+ * Transitivity guarantees that B happens after A and C after B. -+ * Note: we only require RCpc transitivity. -+ * Note: the CPU doing B need not be c0 or c1 -+ * -+ * Example: -+ * -+ * CPU0 CPU1 CPU2 -+ * -+ * LOCK rq(0)->lock -+ * sched-out X -+ * sched-in Y -+ * UNLOCK rq(0)->lock -+ * -+ * LOCK rq(0)->lock // orders against CPU0 -+ * dequeue X -+ * UNLOCK rq(0)->lock -+ * -+ * LOCK rq(1)->lock -+ * enqueue X -+ * UNLOCK rq(1)->lock -+ * -+ * LOCK rq(1)->lock // orders against CPU2 -+ * sched-out Z -+ * sched-in X -+ * UNLOCK rq(1)->lock -+ * -+ * -+ * BLOCKING -- aka. SLEEP + WAKEUP -+ * -+ * For blocking we (obviously) need to provide the same guarantee as for -+ * migration. However the means are completely different as there is no lock -+ * chain to provide order. Instead we do: -+ * -+ * 1) smp_store_release(X->on_cpu, 0) -- finish_task() -+ * 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up() -+ * -+ * Example: -+ * -+ * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule) -+ * -+ * LOCK rq(0)->lock LOCK X->pi_lock -+ * dequeue X -+ * sched-out X -+ * smp_store_release(X->on_cpu, 0); -+ * -+ * smp_cond_load_acquire(&X->on_cpu, !VAL); -+ * X->state = WAKING -+ * set_task_cpu(X,2) -+ * -+ * LOCK rq(2)->lock -+ * enqueue X -+ * X->state = RUNNING -+ * UNLOCK rq(2)->lock -+ * -+ * LOCK rq(2)->lock // orders against CPU1 -+ * sched-out Z -+ * sched-in X -+ * UNLOCK rq(2)->lock -+ * -+ * UNLOCK X->pi_lock -+ * UNLOCK rq(0)->lock -+ * -+ * -+ * However; for wakeups there is a second guarantee we must provide, namely we -+ * must observe the state that lead to our wakeup. 
That is, not only must our -+ * task observe its own prior state, it must also observe the stores prior to -+ * its wakeup. -+ * -+ * This means that any means of doing remote wakeups must order the CPU doing -+ * the wakeup against the CPU the task is going to end up running on. This, -+ * however, is already required for the regular Program-Order guarantee above, -+ * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire). -+ * -+ */ -+ -+/** -+ * try_to_wake_up - wake up a thread -+ * @p: the thread to be awakened -+ * @state: the mask of task states that can be woken -+ * @wake_flags: wake modifier flags (WF_*) -+ * -+ * Conceptually does: -+ * -+ * If (@state & @p->state) @p->state = TASK_RUNNING. -+ * -+ * If the task was not queued/runnable, also place it back on a runqueue. -+ * -+ * This function is atomic against schedule() which would dequeue the task. -+ * -+ * It issues a full memory barrier before accessing @p->state, see the comment -+ * with set_current_state(). -+ * -+ * Uses p->pi_lock to serialize against concurrent wake-ups. -+ * -+ * Relies on p->pi_lock stabilizing: -+ * - p->sched_class -+ * - p->cpus_ptr -+ * - p->sched_task_group -+ * in order to do migration, see its use of select_task_rq()/set_task_cpu(). -+ * -+ * Tries really hard to only take one task_rq(p)->lock for performance. -+ * Takes rq->lock in: -+ * - ttwu_runnable() -- old rq, unavoidable, see comment there; -+ * - ttwu_queue() -- new rq, for enqueue of the task; -+ * - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us. -+ * -+ * As a consequence we race really badly with just about everything. See the -+ * many memory barriers and their comments for details. -+ * -+ * Return: %true if @p->state changes (an actual wakeup was done), -+ * %false otherwise. 
-+ */ -+static int try_to_wake_up(struct task_struct *p, unsigned int state, -+ int wake_flags) -+{ -+ unsigned long flags; -+ int cpu, success = 0; -+ -+ preempt_disable(); -+ if (p == current) { -+ /* -+ * We're waking current, this means 'p->on_rq' and 'task_cpu(p) -+ * == smp_processor_id()'. Together this means we can special -+ * case the whole 'p->on_rq && ttwu_runnable()' case below -+ * without taking any locks. -+ * -+ * In particular: -+ * - we rely on Program-Order guarantees for all the ordering, -+ * - we're serialized against set_special_state() by virtue of -+ * it disabling IRQs (this allows not taking ->pi_lock). -+ */ -+ if (!(p->state & state)) -+ goto out; -+ -+ success = 1; -+ trace_sched_waking(p); -+ p->state = TASK_RUNNING; -+ trace_sched_wakeup(p); -+ goto out; -+ } -+ -+ /* -+ * If we are going to wake up a thread waiting for CONDITION we -+ * need to ensure that CONDITION=1 done by the caller can not be -+ * reordered with p->state check below. This pairs with smp_store_mb() -+ * in set_current_state() that the waiting thread does. -+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ smp_mb__after_spinlock(); -+ if (!(p->state & state)) -+ goto unlock; -+ -+ trace_sched_waking(p); -+ -+ /* We're going to change ->state: */ -+ success = 1; -+ -+ /* -+ * Ensure we load p->on_rq _after_ p->state, otherwise it would -+ * be possible to, falsely, observe p->on_rq == 0 and get stuck -+ * in smp_cond_load_acquire() below. -+ * -+ * sched_ttwu_pending() try_to_wake_up() -+ * STORE p->on_rq = 1 LOAD p->state -+ * UNLOCK rq->lock -+ * -+ * __schedule() (switch to task 'p') -+ * LOCK rq->lock smp_rmb(); -+ * smp_mb__after_spinlock(); -+ * UNLOCK rq->lock -+ * -+ * [task p] -+ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq -+ * -+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in -+ * __schedule(). See the comment for smp_mb__after_spinlock(). -+ * -+ * A similar smb_rmb() lives in try_invoke_on_locked_down_task(). 
-+ */ -+ smp_rmb(); -+ if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) -+ goto unlock; -+ -+ if (p->in_iowait) { -+ delayacct_blkio_end(p); -+ atomic_dec(&task_rq(p)->nr_iowait); -+ } -+ -+#ifdef CONFIG_SMP -+ /* -+ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be -+ * possible to, falsely, observe p->on_cpu == 0. -+ * -+ * One must be running (->on_cpu == 1) in order to remove oneself -+ * from the runqueue. -+ * -+ * __schedule() (switch to task 'p') try_to_wake_up() -+ * STORE p->on_cpu = 1 LOAD p->on_rq -+ * UNLOCK rq->lock -+ * -+ * __schedule() (put 'p' to sleep) -+ * LOCK rq->lock smp_rmb(); -+ * smp_mb__after_spinlock(); -+ * STORE p->on_rq = 0 LOAD p->on_cpu -+ * -+ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in -+ * __schedule(). See the comment for smp_mb__after_spinlock(). -+ * -+ * Form a control-dep-acquire with p->on_rq == 0 above, to ensure -+ * schedule()'s deactivate_task() has 'happened' and p will no longer -+ * care about it's own p->state. See the comment in __schedule(). -+ */ -+ smp_acquire__after_ctrl_dep(); -+ -+ /* -+ * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq -+ * == 0), which means we need to do an enqueue, change p->state to -+ * TASK_WAKING such that we can unlock p->pi_lock before doing the -+ * enqueue, such as ttwu_queue_wakelist(). -+ */ -+ p->state = TASK_WAKING; -+ -+ /* -+ * If the owning (remote) CPU is still in the middle of schedule() with -+ * this task as prev, considering queueing p on the remote CPUs wake_list -+ * which potentially sends an IPI instead of spinning on p->on_cpu to -+ * let the waker make forward progress. This is safe because IRQs are -+ * disabled and the IPI will deliver after on_cpu is cleared. 
-+ * -+ * Ensure we load task_cpu(p) after p->on_cpu: -+ * -+ * set_task_cpu(p, cpu); -+ * STORE p->cpu = @cpu -+ * __schedule() (switch to task 'p') -+ * LOCK rq->lock -+ * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu) -+ * STORE p->on_cpu = 1 LOAD p->cpu -+ * -+ * to ensure we observe the correct CPU on which the task is currently -+ * scheduling. -+ */ -+ if (smp_load_acquire(&p->on_cpu) && -+ ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU)) -+ goto unlock; -+ -+ /* -+ * If the owning (remote) CPU is still in the middle of schedule() with -+ * this task as prev, wait until its done referencing the task. -+ * -+ * Pairs with the smp_store_release() in finish_task(). -+ * -+ * This ensures that tasks getting woken will be fully ordered against -+ * their previous state and preserve Program Order. -+ */ -+ smp_cond_load_acquire(&p->on_cpu, !VAL); -+ -+ sched_task_ttwu(p); -+ -+ cpu = select_task_rq(p, this_rq()); -+ -+ if (cpu != task_cpu(p)) { -+ wake_flags |= WF_MIGRATED; -+ psi_ttwu_dequeue(p); -+ set_task_cpu(p, cpu); -+ } -+#else -+ cpu = task_cpu(p); -+#endif /* CONFIG_SMP */ -+ -+ ttwu_queue(p, cpu, wake_flags); -+unlock: -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+out: -+ if (success) -+ ttwu_stat(p, task_cpu(p), wake_flags); -+ preempt_enable(); -+ -+ return success; -+} -+ -+/** -+ * try_invoke_on_locked_down_task - Invoke a function on task in fixed state -+ * @p: Process for which the function is to be invoked. -+ * @func: Function to invoke. -+ * @arg: Argument to function. -+ * -+ * If the specified task can be quickly locked into a definite state -+ * (either sleeping or on a given runqueue), arrange to keep it in that -+ * state while invoking @func(@arg). This function can use ->on_rq and -+ * task_curr() to work out what the state is, if required. Given that -+ * @func can be invoked with a runqueue lock held, it had better be quite -+ * lightweight. 
-+ * -+ * Returns: -+ * @false if the task slipped out from under the locks. -+ * @true if the task was locked onto a runqueue or is sleeping. -+ * However, @func can override this by returning @false. -+ */ -+bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg) -+{ -+ bool ret = false; -+ struct rq_flags rf; -+ struct rq *rq; -+ -+ lockdep_assert_irqs_enabled(); -+ raw_spin_lock_irq(&p->pi_lock); -+ if (p->on_rq) { -+ rq = __task_rq_lock(p, &rf); -+ if (task_rq(p) == rq) -+ ret = func(p, arg); -+ __task_rq_unlock(rq, &rf); -+ } else { -+ switch (p->state) { -+ case TASK_RUNNING: -+ case TASK_WAKING: -+ break; -+ default: -+ smp_rmb(); // See smp_rmb() comment in try_to_wake_up(). -+ if (!p->on_rq) -+ ret = func(p, arg); -+ } -+ } -+ raw_spin_unlock_irq(&p->pi_lock); -+ return ret; -+} -+ -+/** -+ * wake_up_process - Wake up a specific process -+ * @p: The process to be woken up. -+ * -+ * Attempt to wake up the nominated process and move it to the set of runnable -+ * processes. -+ * -+ * Return: 1 if the process was woken up, 0 if it was already running. -+ * -+ * This function executes a full memory barrier before accessing the task state. -+ */ -+int wake_up_process(struct task_struct *p) -+{ -+ return try_to_wake_up(p, TASK_NORMAL, 0); -+} -+EXPORT_SYMBOL(wake_up_process); -+ -+int wake_up_state(struct task_struct *p, unsigned int state) -+{ -+ return try_to_wake_up(p, state, 0); -+} -+ -+/* -+ * Perform scheduler related setup for a newly forked process p. -+ * p is forked by current. 
-+ * -+ * __sched_fork() is basic setup used by init_idle() too: -+ */ -+static inline void __sched_fork(unsigned long clone_flags, struct task_struct *p) -+{ -+ p->on_rq = 0; -+ p->on_cpu = 0; -+ p->utime = 0; -+ p->stime = 0; -+ p->sched_time = 0; -+ -+#ifdef CONFIG_PREEMPT_NOTIFIERS -+ INIT_HLIST_HEAD(&p->preempt_notifiers); -+#endif -+ -+#ifdef CONFIG_COMPACTION -+ p->capture_control = NULL; -+#endif -+#ifdef CONFIG_SMP -+ p->wake_entry.u_flags = CSD_TYPE_TTWU; -+#endif -+} -+ -+/* -+ * fork()/clone()-time setup: -+ */ -+int sched_fork(unsigned long clone_flags, struct task_struct *p) -+{ -+ unsigned long flags; -+ struct rq *rq; -+ -+ __sched_fork(clone_flags, p); -+ /* -+ * We mark the process as NEW here. This guarantees that -+ * nobody will actually run it, and a signal or other external -+ * event cannot wake it up and insert it on the runqueue either. -+ */ -+ p->state = TASK_NEW; -+ -+ /* -+ * Make sure we do not leak PI boosting priority to the child. -+ */ -+ p->prio = current->normal_prio; -+ -+ /* -+ * Revert to default priority/policy on fork if requested. -+ */ -+ if (unlikely(p->sched_reset_on_fork)) { -+ if (task_has_rt_policy(p)) { -+ p->policy = SCHED_NORMAL; -+ p->static_prio = NICE_TO_PRIO(0); -+ p->rt_priority = 0; -+ } else if (PRIO_TO_NICE(p->static_prio) < 0) -+ p->static_prio = NICE_TO_PRIO(0); -+ -+ p->prio = p->normal_prio = normal_prio(p); -+ -+ /* -+ * We don't need the reset flag anymore after the fork. It has -+ * fulfilled its duty: -+ */ -+ p->sched_reset_on_fork = 0; -+ } -+ -+ /* -+ * The child is not yet in the pid-hash so no cgroup attach races, -+ * and the cgroup is pinned to this child due to cgroup_fork() -+ * is ran before sched_fork(). -+ * -+ * Silence PROVE_RCU. -+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ /* -+ * Share the timeslice between parent and child, thus the -+ * total amount of pending timeslices in the system doesn't change, -+ * resulting in more scheduling fairness. 
-+ */ -+ rq = this_rq(); -+ raw_spin_lock(&rq->lock); -+ -+ rq->curr->time_slice /= 2; -+ p->time_slice = rq->curr->time_slice; -+#ifdef CONFIG_SCHED_HRTICK -+ hrtick_start(rq, rq->curr->time_slice); -+#endif -+ -+ if (p->time_slice < RESCHED_NS) { -+ p->time_slice = sched_timeslice_ns; -+ resched_curr(rq); -+ } -+ sched_task_fork(p, rq); -+ raw_spin_unlock(&rq->lock); -+ -+ rseq_migrate(p); -+ /* -+ * We're setting the CPU for the first time, we don't migrate, -+ * so use __set_task_cpu(). -+ */ -+ __set_task_cpu(p, cpu_of(rq)); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+#ifdef CONFIG_SCHED_INFO -+ if (unlikely(sched_info_on())) -+ memset(&p->sched_info, 0, sizeof(p->sched_info)); -+#endif -+ init_task_preempt_count(p); -+ -+ return 0; -+} -+ -+void sched_post_fork(struct task_struct *p) {} -+ -+#ifdef CONFIG_SCHEDSTATS -+ -+DEFINE_STATIC_KEY_FALSE(sched_schedstats); -+static bool __initdata __sched_schedstats = false; -+ -+static void set_schedstats(bool enabled) -+{ -+ if (enabled) -+ static_branch_enable(&sched_schedstats); -+ else -+ static_branch_disable(&sched_schedstats); -+} -+ -+void force_schedstat_enabled(void) -+{ -+ if (!schedstat_enabled()) { -+ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); -+ static_branch_enable(&sched_schedstats); -+ } -+} -+ -+static int __init setup_schedstats(char *str) -+{ -+ int ret = 0; -+ if (!str) -+ goto out; -+ -+ /* -+ * This code is called before jump labels have been set up, so we can't -+ * change the static branch directly just yet. Instead set a temporary -+ * variable so init_schedstats() can do it later. 
-+ */ -+ if (!strcmp(str, "enable")) { -+ __sched_schedstats = true; -+ ret = 1; -+ } else if (!strcmp(str, "disable")) { -+ __sched_schedstats = false; -+ ret = 1; -+ } -+out: -+ if (!ret) -+ pr_warn("Unable to parse schedstats=\n"); -+ -+ return ret; -+} -+__setup("schedstats=", setup_schedstats); -+ -+static void __init init_schedstats(void) -+{ -+ set_schedstats(__sched_schedstats); -+} -+ -+#ifdef CONFIG_PROC_SYSCTL -+int sysctl_schedstats(struct ctl_table *table, int write, -+ void __user *buffer, size_t *lenp, loff_t *ppos) -+{ -+ struct ctl_table t; -+ int err; -+ int state = static_branch_likely(&sched_schedstats); -+ -+ if (write && !capable(CAP_SYS_ADMIN)) -+ return -EPERM; -+ -+ t = *table; -+ t.data = &state; -+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); -+ if (err < 0) -+ return err; -+ if (write) -+ set_schedstats(state); -+ return err; -+} -+#endif /* CONFIG_PROC_SYSCTL */ -+#else /* !CONFIG_SCHEDSTATS */ -+static inline void init_schedstats(void) {} -+#endif /* CONFIG_SCHEDSTATS */ -+ -+/* -+ * wake_up_new_task - wake up a newly created task for the first time. -+ * -+ * This function will do some initial scheduler statistics housekeeping -+ * that must be done for every newly created context, then puts the task -+ * on the runqueue and wakes it. -+ */ -+void wake_up_new_task(struct task_struct *p) -+{ -+ unsigned long flags; -+ struct rq *rq; -+ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ -+ p->state = TASK_RUNNING; -+ -+ rq = cpu_rq(select_task_rq(p, this_rq())); -+#ifdef CONFIG_SMP -+ rseq_migrate(p); -+ /* -+ * Fork balancing, do it here and not earlier because: -+ * - cpus_ptr can change in the fork path -+ * - any previously selected CPU might disappear through hotplug -+ * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, -+ * as we're not fully set-up yet. 
-+ */ -+ __set_task_cpu(p, cpu_of(rq)); -+#endif -+ -+ raw_spin_lock(&rq->lock); -+ -+ update_rq_clock(rq); -+ activate_task(p, rq); -+ trace_sched_wakeup_new(p); -+ check_preempt_curr(rq); -+ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+} -+ -+#ifdef CONFIG_PREEMPT_NOTIFIERS -+ -+static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); -+ -+void preempt_notifier_inc(void) -+{ -+ static_branch_inc(&preempt_notifier_key); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_inc); -+ -+void preempt_notifier_dec(void) -+{ -+ static_branch_dec(&preempt_notifier_key); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_dec); -+ -+/** -+ * preempt_notifier_register - tell me when current is being preempted & rescheduled -+ * @notifier: notifier struct to register -+ */ -+void preempt_notifier_register(struct preempt_notifier *notifier) -+{ -+ if (!static_branch_unlikely(&preempt_notifier_key)) -+ WARN(1, "registering preempt_notifier while notifiers disabled\n"); -+ -+ hlist_add_head(&notifier->link, &current->preempt_notifiers); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_register); -+ -+/** -+ * preempt_notifier_unregister - no longer interested in preemption notifications -+ * @notifier: notifier struct to unregister -+ * -+ * This is *not* safe to call from within a preemption notifier.
-+ */ -+void preempt_notifier_unregister(struct preempt_notifier *notifier) -+{ -+ hlist_del(&notifier->link); -+} -+EXPORT_SYMBOL_GPL(preempt_notifier_unregister); -+ -+static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+ struct preempt_notifier *notifier; -+ -+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) -+ notifier->ops->sched_in(notifier, raw_smp_processor_id()); -+} -+ -+static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+ if (static_branch_unlikely(&preempt_notifier_key)) -+ __fire_sched_in_preempt_notifiers(curr); -+} -+ -+static void -+__fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+ struct preempt_notifier *notifier; -+ -+ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) -+ notifier->ops->sched_out(notifier, next); -+} -+ -+static __always_inline void -+fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+ if (static_branch_unlikely(&preempt_notifier_key)) -+ __fire_sched_out_preempt_notifiers(curr, next); -+} -+ -+#else /* !CONFIG_PREEMPT_NOTIFIERS */ -+ -+static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) -+{ -+} -+ -+static inline void -+fire_sched_out_preempt_notifiers(struct task_struct *curr, -+ struct task_struct *next) -+{ -+} -+ -+#endif /* CONFIG_PREEMPT_NOTIFIERS */ -+ -+static inline void prepare_task(struct task_struct *next) -+{ -+ /* -+ * Claim the task as running, we do this before switching to it -+ * such that any running task will have this set. -+ * -+ * See the ttwu() WF_ON_CPU case and its ordering comment. -+ */ -+ WRITE_ONCE(next->on_cpu, 1); -+} -+ -+static inline void finish_task(struct task_struct *prev) -+{ -+#ifdef CONFIG_SMP -+ /* -+ * This must be the very last reference to @prev from this CPU. After -+ * p->on_cpu is cleared, the task can be moved to a different CPU.
We -+ * must ensure this doesn't happen until the switch is completely -+ * finished. -+ * -+ * In particular, the load of prev->state in finish_task_switch() must -+ * happen before this. -+ * -+ * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). -+ */ -+ smp_store_release(&prev->on_cpu, 0); -+#else -+ prev->on_cpu = 0; -+#endif -+} -+ -+static inline void -+prepare_lock_switch(struct rq *rq, struct task_struct *next) -+{ -+ /* -+ * Since the runqueue lock will be released by the next -+ * task (which is an invalid locking op but in the case -+ * of the scheduler it's an obvious special-case), so we -+ * do an early lockdep release here: -+ */ -+ spin_release(&rq->lock.dep_map, _THIS_IP_); -+#ifdef CONFIG_DEBUG_SPINLOCK -+ /* this is a valid case when another task releases the spinlock */ -+ rq->lock.owner = next; -+#endif -+} -+ -+static inline void finish_lock_switch(struct rq *rq) -+{ -+ /* -+ * If we are tracking spinlock dependencies then we have to -+ * fix up the runqueue lock - which gets 'carried over' from -+ * prev into current: -+ */ -+ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); -+ raw_spin_unlock_irq(&rq->lock); -+} -+ -+/** -+ * prepare_task_switch - prepare to switch tasks -+ * @rq: the runqueue preparing to switch -+ * @next: the task we are going to switch to. -+ * -+ * This is called with the rq lock held and interrupts off. It must -+ * be paired with a subsequent finish_task_switch after the context -+ * switch. -+ * -+ * prepare_task_switch sets up locking and calls architecture specific -+ * hooks. 
-+ */ -+static inline void -+prepare_task_switch(struct rq *rq, struct task_struct *prev, -+ struct task_struct *next) -+{ -+ kcov_prepare_switch(prev); -+ sched_info_switch(rq, prev, next); -+ perf_event_task_sched_out(prev, next); -+ rseq_preempt(prev); -+ fire_sched_out_preempt_notifiers(prev, next); -+ prepare_task(next); -+ prepare_arch_switch(next); -+} -+ -+/** -+ * finish_task_switch - clean up after a task-switch -+ * @rq: runqueue associated with task-switch -+ * @prev: the thread we just switched away from. -+ * -+ * finish_task_switch must be called after the context switch, paired -+ * with a prepare_task_switch call before the context switch. -+ * finish_task_switch will reconcile locking set up by prepare_task_switch, -+ * and do any other architecture-specific cleanup actions. -+ * -+ * Note that we may have delayed dropping an mm in context_switch(). If -+ * so, we finish that here outside of the runqueue lock. (Doing it -+ * with the lock held can cause deadlocks; see schedule() for -+ * details.) -+ * -+ * The context switch have flipped the stack from under us and restored the -+ * local variables which were saved when this task called schedule() in the -+ * past. prev == current is still correct but we need to recalculate this_rq -+ * because prev may have moved to another CPU. -+ */ -+static struct rq *finish_task_switch(struct task_struct *prev) -+ __releases(rq->lock) -+{ -+ struct rq *rq = this_rq(); -+ struct mm_struct *mm = rq->prev_mm; -+ long prev_state; -+ -+ /* -+ * The previous task will have left us with a preempt_count of 2 -+ * because it left us after: -+ * -+ * schedule() -+ * preempt_disable(); // 1 -+ * __schedule() -+ * raw_spin_lock_irq(&rq->lock) // 2 -+ * -+ * Also, see FORK_PREEMPT_COUNT. 
-+ */ -+ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, -+ "corrupted preempt_count: %s/%d/0x%x\n", -+ current->comm, current->pid, preempt_count())) -+ preempt_count_set(FORK_PREEMPT_COUNT); -+ -+ rq->prev_mm = NULL; -+ -+ /* -+ * A task struct has one reference for the use as "current". -+ * If a task dies, then it sets TASK_DEAD in tsk->state and calls -+ * schedule one last time. The schedule call will never return, and -+ * the scheduled task must drop that reference. -+ * -+ * We must observe prev->state before clearing prev->on_cpu (in -+ * finish_task), otherwise a concurrent wakeup can get prev -+ * running on another CPU and we could rave with its RUNNING -> DEAD -+ * transition, resulting in a double drop. -+ */ -+ prev_state = prev->state; -+ vtime_task_switch(prev); -+ perf_event_task_sched_in(prev, current); -+ finish_task(prev); -+ finish_lock_switch(rq); -+ finish_arch_post_lock_switch(); -+ kcov_finish_switch(current); -+ -+ fire_sched_in_preempt_notifiers(current); -+ /* -+ * When switching through a kernel thread, the loop in -+ * membarrier_{private,global}_expedited() may have observed that -+ * kernel thread and not issued an IPI. It is therefore possible to -+ * schedule between user->kernel->user threads without passing though -+ * switch_mm(). Membarrier requires a barrier after storing to -+ * rq->curr, before returning to userspace, so provide them here: -+ * -+ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly -+ * provided by mmdrop(), -+ * - a sync_core for SYNC_CORE. -+ */ -+ if (mm) { -+ membarrier_mm_sync_core_before_usermode(mm); -+ mmdrop(mm); -+ } -+ if (unlikely(prev_state == TASK_DEAD)) { -+ /* -+ * Remove function-return probe instances associated with this -+ * task and put them back on the free list. -+ */ -+ kprobe_flush_task(prev); -+ -+ /* Task is done with its stack. 
*/ -+ put_task_stack(prev); -+ -+ put_task_struct_rcu_user(prev); -+ } -+ -+ tick_nohz_task_switch(); -+ return rq; -+} -+ -+/** -+ * schedule_tail - first thing a freshly forked thread must call. -+ * @prev: the thread we just switched away from. -+ */ -+asmlinkage __visible void schedule_tail(struct task_struct *prev) -+ __releases(rq->lock) -+{ -+ struct rq *rq; -+ -+ /* -+ * New tasks start with FORK_PREEMPT_COUNT, see there and -+ * finish_task_switch() for details. -+ * -+ * finish_task_switch() will drop rq->lock() and lower preempt_count -+ * and the preempt_enable() will end up enabling preemption (on -+ * PREEMPT_COUNT kernels). -+ */ -+ -+ rq = finish_task_switch(prev); -+ preempt_enable(); -+ -+ if (current->set_child_tid) -+ put_user(task_pid_vnr(current), current->set_child_tid); -+ -+ calculate_sigpending(); -+} -+ -+/* -+ * context_switch - switch to the new MM and the new thread's register state. -+ */ -+static __always_inline struct rq * -+context_switch(struct rq *rq, struct task_struct *prev, -+ struct task_struct *next) -+{ -+ prepare_task_switch(rq, prev, next); -+ -+ /* -+ * For paravirt, this is coupled with an exit in switch_to to -+ * combine the page table reload and the switch backend into -+ * one hypercall. -+ */ -+ arch_start_context_switch(prev); -+ -+ /* -+ * kernel -> kernel lazy + transfer active -+ * user -> kernel lazy + mmgrab() active -+ * -+ * kernel -> user switch + mmdrop() active -+ * user -> user switch -+ */ -+ if (!next->mm) { // to kernel -+ enter_lazy_tlb(prev->active_mm, next); -+ -+ next->active_mm = prev->active_mm; -+ if (prev->mm) // from user -+ mmgrab(prev->active_mm); -+ else -+ prev->active_mm = NULL; -+ } else { // to user -+ membarrier_switch_mm(rq, prev->active_mm, next->mm); -+ /* -+ * sys_membarrier() requires an smp_mb() between setting -+ * rq->curr / membarrier_switch_mm() and returning to userspace. 
-+ * -+ * The below provides this either through switch_mm(), or in -+ * case 'prev->active_mm == next->mm' through -+ * finish_task_switch()'s mmdrop(). -+ */ -+ switch_mm_irqs_off(prev->active_mm, next->mm, next); -+ -+ if (!prev->mm) { // from kernel -+ /* will mmdrop() in finish_task_switch(). */ -+ rq->prev_mm = prev->active_mm; -+ prev->active_mm = NULL; -+ } -+ } -+ -+ prepare_lock_switch(rq, next); -+ -+ /* Here we just switch the register state and the stack. */ -+ switch_to(prev, next, prev); -+ barrier(); -+ -+ return finish_task_switch(prev); -+} -+ -+/* -+ * nr_running, nr_uninterruptible and nr_context_switches: -+ * -+ * externally visible scheduler statistics: current number of runnable -+ * threads, total number of context switches performed since bootup. -+ */ -+unsigned long nr_running(void) -+{ -+ unsigned long i, sum = 0; -+ -+ for_each_online_cpu(i) -+ sum += cpu_rq(i)->nr_running; -+ -+ return sum; -+} -+ -+/* -+ * Check if only the current task is running on the CPU. -+ * -+ * Caution: this function does not check that the caller has disabled -+ * preemption, thus the result might have a time-of-check-to-time-of-use -+ * race. The caller is responsible to use it correctly, for example: -+ * -+ * - from a non-preemptible section (of course) -+ * -+ * - from a thread that is bound to a single CPU -+ * -+ * - in a loop with very short iterations (e.g. a polling loop) -+ */ -+bool single_task_running(void) -+{ -+ return raw_rq()->nr_running == 1; -+} -+EXPORT_SYMBOL(single_task_running); -+ -+unsigned long long nr_context_switches(void) -+{ -+ int i; -+ unsigned long long sum = 0; -+ -+ for_each_possible_cpu(i) -+ sum += cpu_rq(i)->nr_switches; -+ -+ return sum; -+} -+ -+/* -+ * Consumers of these two interfaces, like for example the cpuidle menu -+ * governor, are using nonsensical data. Preferring shallow idle state selection -+ * for a CPU that has IO-wait which might not even end up running the task when -+ * it does become runnable. 
-+ */ -+ -+unsigned long nr_iowait_cpu(int cpu) -+{ -+ return atomic_read(&cpu_rq(cpu)->nr_iowait); -+} -+ -+/* -+ * IO-wait accounting, and how its mostly bollocks (on SMP). -+ * -+ * The idea behind IO-wait account is to account the idle time that we could -+ * have spend running if it were not for IO. That is, if we were to improve the -+ * storage performance, we'd have a proportional reduction in IO-wait time. -+ * -+ * This all works nicely on UP, where, when a task blocks on IO, we account -+ * idle time as IO-wait, because if the storage were faster, it could've been -+ * running and we'd not be idle. -+ * -+ * This has been extended to SMP, by doing the same for each CPU. This however -+ * is broken. -+ * -+ * Imagine for instance the case where two tasks block on one CPU, only the one -+ * CPU will have IO-wait accounted, while the other has regular idle. Even -+ * though, if the storage were faster, both could've ran at the same time, -+ * utilising both CPUs. -+ * -+ * This means, that when looking globally, the current IO-wait accounting on -+ * SMP is a lower bound, by reason of under accounting. -+ * -+ * Worse, since the numbers are provided per CPU, they are sometimes -+ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly -+ * associated with any one particular CPU, it can wake to another CPU than it -+ * blocked on. This means the per CPU IO-wait number is meaningless. -+ * -+ * Task CPU affinities can make all that even more 'interesting'. -+ */ -+ -+unsigned long nr_iowait(void) -+{ -+ unsigned long i, sum = 0; -+ -+ for_each_possible_cpu(i) -+ sum += nr_iowait_cpu(i); -+ -+ return sum; -+} -+ -+#ifdef CONFIG_SMP -+ -+/* -+ * sched_exec - execve() is a valuable balancing opportunity, because at -+ * this point the task has the smallest effective memory and cache -+ * footprint. 
-+ */ -+void sched_exec(void) -+{ -+ struct task_struct *p = current; -+ unsigned long flags; -+ int dest_cpu; -+ struct rq *rq; -+ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ rq = this_rq(); -+ -+ if (rq != task_rq(p) || rq->nr_running < 2) -+ goto unlock; -+ -+ dest_cpu = select_task_rq(p, task_rq(p)); -+ if (dest_cpu == smp_processor_id()) -+ goto unlock; -+ -+ if (likely(cpu_active(dest_cpu))) { -+ struct migration_arg arg = { p, dest_cpu }; -+ -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); -+ return; -+ } -+unlock: -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+} -+ -+#endif -+ -+DEFINE_PER_CPU(struct kernel_stat, kstat); -+DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); -+ -+EXPORT_PER_CPU_SYMBOL(kstat); -+EXPORT_PER_CPU_SYMBOL(kernel_cpustat); -+ -+static inline void update_curr(struct rq *rq, struct task_struct *p) -+{ -+ s64 ns = rq->clock_task - p->last_ran; -+ -+ p->sched_time += ns; -+ account_group_exec_runtime(p, ns); -+ -+ p->time_slice -= ns; -+ p->last_ran = rq->clock_task; -+} -+ -+/* -+ * Return accounted runtime for the task. -+ * Return separately the current's pending runtime that have not been -+ * accounted yet. -+ */ -+unsigned long long task_sched_runtime(struct task_struct *p) -+{ -+ unsigned long flags; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ u64 ns; -+ -+#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) -+ /* -+ * 64-bit doesn't need locks to atomically read a 64-bit value. -+ * So we have a optimization chance when the task's delta_exec is 0. -+ * Reading ->on_cpu is racy, but this is ok. -+ * -+ * If we race with it leaving CPU, we'll take a lock. So we're correct. -+ * If we race with it entering CPU, unaccounted time is 0. This is -+ * indistinguishable from the read occurring a few cycles earlier. -+ * If we see ->on_cpu without ->on_rq, the task is leaving, and has -+ * been accounted, so we're correct here as well. 
-+ */ -+ if (!p->on_cpu || !task_on_rq_queued(p)) -+ return tsk_seruntime(p); -+#endif -+ -+ rq = task_access_lock_irqsave(p, &lock, &flags); -+ /* -+ * Must be ->curr _and_ ->on_rq. If dequeued, we would -+ * project cycles that may never be accounted to this -+ * thread, breaking clock_gettime(). -+ */ -+ if (p == rq->curr && task_on_rq_queued(p)) { -+ update_rq_clock(rq); -+ update_curr(rq, p); -+ } -+ ns = tsk_seruntime(p); -+ task_access_unlock_irqrestore(p, lock, &flags); -+ -+ return ns; -+} -+ -+/* This manages tasks that have run out of timeslice during a scheduler_tick */ -+static inline void scheduler_task_tick(struct rq *rq) -+{ -+ struct task_struct *p = rq->curr; -+ -+ if (is_idle_task(p)) -+ return; -+ -+ update_curr(rq, p); -+ cpufreq_update_util(rq, 0); -+ -+ /* -+ * Tasks have less than RESCHED_NS of time slice left they will be -+ * rescheduled. -+ */ -+ if (p->time_slice >= RESCHED_NS) -+ return; -+ set_tsk_need_resched(p); -+ set_preempt_need_resched(); -+} -+ -+/* -+ * This function gets called by the timer code, with HZ frequency. -+ * We call it with interrupts disabled. 
-+ */ -+void scheduler_tick(void) -+{ -+ int cpu __maybe_unused = smp_processor_id(); -+ struct rq *rq = cpu_rq(cpu); -+ -+ arch_scale_freq_tick(); -+ sched_clock_tick(); -+ -+ raw_spin_lock(&rq->lock); -+ update_rq_clock(rq); -+ -+ scheduler_task_tick(rq); -+ calc_global_load_tick(rq); -+ psi_task_tick(rq); -+ -+ rq->last_tick = rq->clock; -+ raw_spin_unlock(&rq->lock); -+ -+ perf_event_task_tick(); -+} -+ -+#ifdef CONFIG_SCHED_SMT -+static inline int active_load_balance_cpu_stop(void *data) -+{ -+ struct rq *rq = this_rq(); -+ struct task_struct *p = data; -+ cpumask_t tmp; -+ unsigned long flags; -+ -+ local_irq_save(flags); -+ -+ raw_spin_lock(&p->pi_lock); -+ raw_spin_lock(&rq->lock); -+ -+ rq->active_balance = 0; -+ /* _something_ may have changed the task, double check again */ -+ if (task_on_rq_queued(p) && task_rq(p) == rq && -+ cpumask_and(&tmp, p->cpus_ptr, &sched_sg_idle_mask)) { -+ int cpu = cpu_of(rq); -+ int dcpu = __best_mask_cpu(cpu, &tmp, -+ per_cpu(sched_cpu_llc_mask, cpu)); -+ rq = move_queued_task(rq, p, dcpu); -+ } -+ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock(&p->pi_lock); -+ -+ local_irq_restore(flags); -+ -+ return 0; -+} -+ -+/* sg_balance_trigger - trigger slibing group balance for @cpu */ -+static inline int sg_balance_trigger(const int cpu) -+{ -+ struct rq *rq= cpu_rq(cpu); -+ unsigned long flags; -+ struct task_struct *curr; -+ int res; -+ -+ if (!raw_spin_trylock_irqsave(&rq->lock, flags)) -+ return 0; -+ curr = rq->curr; -+ res = (!is_idle_task(curr)) && (1 == rq->nr_running) &&\ -+ cpumask_intersects(curr->cpus_ptr, &sched_sg_idle_mask) &&\ -+ (!rq->active_balance); -+ -+ if (res) -+ rq->active_balance = 1; -+ -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+ if (res) -+ stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, -+ curr, &rq->active_balance_work); -+ return res; -+} -+ -+/* -+ * sg_balance_check - slibing group balance check for run queue @rq -+ */ -+static inline void sg_balance_check(struct rq *rq) -+{ -+ 
cpumask_t chk; -+ int cpu; -+ -+ /* exit when no sg in idle */ -+ if (cpumask_empty(&sched_sg_idle_mask)) -+ return; -+ -+ cpu = cpu_of(rq); -+ /* -+ * Only cpu in slibing idle group will do the checking and then -+ * find potential cpus which can migrate the current running task -+ */ -+ if (cpumask_test_cpu(cpu, &sched_sg_idle_mask) && -+ cpumask_andnot(&chk, cpu_online_mask, &sched_rq_pending_mask) && -+ cpumask_andnot(&chk, &chk, &sched_rq_watermark[IDLE_WM])) { -+ int i, tried = 0; -+ -+ for_each_cpu_wrap(i, &chk, cpu) { -+ if (cpumask_subset(cpu_smt_mask(i), &chk)) { -+ if (sg_balance_trigger(i)) -+ return; -+ if (tried) -+ return; -+ tried++; -+ } -+ } -+ } -+} -+#endif /* CONFIG_SCHED_SMT */ -+ -+#ifdef CONFIG_NO_HZ_FULL -+ -+struct tick_work { -+ int cpu; -+ atomic_t state; -+ struct delayed_work work; -+}; -+/* Values for ->state, see diagram below. */ -+#define TICK_SCHED_REMOTE_OFFLINE 0 -+#define TICK_SCHED_REMOTE_OFFLINING 1 -+#define TICK_SCHED_REMOTE_RUNNING 2 -+ -+/* -+ * State diagram for ->state: -+ * -+ * -+ * TICK_SCHED_REMOTE_OFFLINE -+ * | ^ -+ * | | -+ * | | sched_tick_remote() -+ * | | -+ * | | -+ * +--TICK_SCHED_REMOTE_OFFLINING -+ * | ^ -+ * | | -+ * sched_tick_start() | | sched_tick_stop() -+ * | | -+ * V | -+ * TICK_SCHED_REMOTE_RUNNING -+ * -+ * -+ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() -+ * and sched_tick_start() are happy to leave the state in RUNNING. -+ */ -+ -+static struct tick_work __percpu *tick_work_cpu; -+ -+static void sched_tick_remote(struct work_struct *work) -+{ -+ struct delayed_work *dwork = to_delayed_work(work); -+ struct tick_work *twork = container_of(dwork, struct tick_work, work); -+ int cpu = twork->cpu; -+ struct rq *rq = cpu_rq(cpu); -+ struct task_struct *curr; -+ unsigned long flags; -+ u64 delta; -+ int os; -+ -+ /* -+ * Handle the tick only if it appears the remote CPU is running in full -+ * dynticks mode. 
The check is racy by nature, but missing a tick or -+ * having one too much is no big deal because the scheduler tick updates -+ * statistics and checks timeslices in a time-independent way, regardless -+ * of when exactly it is running. -+ */ -+ if (!tick_nohz_tick_stopped_cpu(cpu)) -+ goto out_requeue; -+ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ curr = rq->curr; -+ if (cpu_is_offline(cpu)) -+ goto out_unlock; -+ -+ update_rq_clock(rq); -+ if (!is_idle_task(curr)) { -+ /* -+ * Make sure the next tick runs within a reasonable -+ * amount of time. -+ */ -+ delta = rq_clock_task(rq) - curr->last_ran; -+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); -+ } -+ scheduler_task_tick(rq); -+ -+ calc_load_nohz_remote(rq); -+out_unlock: -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+out_requeue: -+ /* -+ * Run the remote tick once per second (1Hz). This arbitrary -+ * frequency is large enough to avoid overload but short enough -+ * to keep scheduler internal stats reasonably up to date. But -+ * first update state to reflect hotplug activity if required. 
-+ */ -+ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); -+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); -+ if (os == TICK_SCHED_REMOTE_RUNNING) -+ queue_delayed_work(system_unbound_wq, dwork, HZ); -+} -+ -+static void sched_tick_start(int cpu) -+{ -+ int os; -+ struct tick_work *twork; -+ -+ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) -+ return; -+ -+ WARN_ON_ONCE(!tick_work_cpu); -+ -+ twork = per_cpu_ptr(tick_work_cpu, cpu); -+ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); -+ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); -+ if (os == TICK_SCHED_REMOTE_OFFLINE) { -+ twork->cpu = cpu; -+ INIT_DELAYED_WORK(&twork->work, sched_tick_remote); -+ queue_delayed_work(system_unbound_wq, &twork->work, HZ); -+ } -+} -+ -+#ifdef CONFIG_HOTPLUG_CPU -+static void sched_tick_stop(int cpu) -+{ -+ struct tick_work *twork; -+ -+ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) -+ return; -+ -+ WARN_ON_ONCE(!tick_work_cpu); -+ -+ twork = per_cpu_ptr(tick_work_cpu, cpu); -+ cancel_delayed_work_sync(&twork->work); -+} -+#endif /* CONFIG_HOTPLUG_CPU */ -+ -+int __init sched_tick_offload_init(void) -+{ -+ tick_work_cpu = alloc_percpu(struct tick_work); -+ BUG_ON(!tick_work_cpu); -+ return 0; -+} -+ -+#else /* !CONFIG_NO_HZ_FULL */ -+static inline void sched_tick_start(int cpu) { } -+static inline void sched_tick_stop(int cpu) { } -+#endif -+ -+#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ -+ defined(CONFIG_PREEMPT_TRACER)) -+/* -+ * If the value passed in is equal to the current preempt count -+ * then we just disabled preemption. Start timing the latency. -+ */ -+static inline void preempt_latency_start(int val) -+{ -+ if (preempt_count() == val) { -+ unsigned long ip = get_lock_parent_ip(); -+#ifdef CONFIG_DEBUG_PREEMPT -+ current->preempt_disable_ip = ip; -+#endif -+ trace_preempt_off(CALLER_ADDR0, ip); -+ } -+} -+ -+void preempt_count_add(int val) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Underflow? 
-+ */ -+ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) -+ return; -+#endif -+ __preempt_count_add(val); -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Spinlock count overflowing soon? -+ */ -+ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= -+ PREEMPT_MASK - 10); -+#endif -+ preempt_latency_start(val); -+} -+EXPORT_SYMBOL(preempt_count_add); -+NOKPROBE_SYMBOL(preempt_count_add); -+ -+/* -+ * If the value passed in equals to the current preempt count -+ * then we just enabled preemption. Stop timing the latency. -+ */ -+static inline void preempt_latency_stop(int val) -+{ -+ if (preempt_count() == val) -+ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); -+} -+ -+void preempt_count_sub(int val) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ /* -+ * Underflow? -+ */ -+ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) -+ return; -+ /* -+ * Is the spinlock portion underflowing? -+ */ -+ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && -+ !(preempt_count() & PREEMPT_MASK))) -+ return; -+#endif -+ -+ preempt_latency_stop(val); -+ __preempt_count_sub(val); -+} -+EXPORT_SYMBOL(preempt_count_sub); -+NOKPROBE_SYMBOL(preempt_count_sub); -+ -+#else -+static inline void preempt_latency_start(int val) { } -+static inline void preempt_latency_stop(int val) { } -+#endif -+ -+static inline unsigned long get_preempt_disable_ip(struct task_struct *p) -+{ -+#ifdef CONFIG_DEBUG_PREEMPT -+ return p->preempt_disable_ip; -+#else -+ return 0; -+#endif -+} -+ -+/* -+ * Print scheduling while atomic bug: -+ */ -+static noinline void __schedule_bug(struct task_struct *prev) -+{ -+ /* Save this before calling printk(), since that will clobber it */ -+ unsigned long preempt_disable_ip = get_preempt_disable_ip(current); -+ -+ if (oops_in_progress) -+ return; -+ -+ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", -+ prev->comm, prev->pid, preempt_count()); -+ -+ debug_show_held_locks(prev); -+ print_modules(); -+ if (irqs_disabled()) -+ print_irqtrace_events(prev); -+ if 
(IS_ENABLED(CONFIG_DEBUG_PREEMPT) -+ && in_atomic_preempt_off()) { -+ pr_err("Preemption disabled at:"); -+ print_ip_sym(KERN_ERR, preempt_disable_ip); -+ } -+ if (panic_on_warn) -+ panic("scheduling while atomic\n"); -+ -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+ -+/* -+ * Various schedule()-time debugging checks and statistics: -+ */ -+static inline void schedule_debug(struct task_struct *prev, bool preempt) -+{ -+#ifdef CONFIG_SCHED_STACK_END_CHECK -+ if (task_stack_end_corrupted(prev)) -+ panic("corrupted stack end detected inside scheduler\n"); -+ -+ if (task_scs_end_corrupted(prev)) -+ panic("corrupted shadow stack detected inside scheduler\n"); -+#endif -+ -+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP -+ if (!preempt && prev->state && prev->non_block_count) { -+ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", -+ prev->comm, prev->pid, prev->non_block_count); -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+ } -+#endif -+ -+ if (unlikely(in_atomic_preempt_off())) { -+ __schedule_bug(prev); -+ preempt_count_set(PREEMPT_DISABLED); -+ } -+ rcu_sleep_check(); -+ -+ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); -+ -+ schedstat_inc(this_rq()->sched_count); -+} -+ -+/* -+ * Compile time debug macro -+ * #define ALT_SCHED_DEBUG -+ */ -+ -+#ifdef ALT_SCHED_DEBUG -+void alt_sched_debug(void) -+{ -+ printk(KERN_INFO "sched: pending: 0x%04lx, idle: 0x%04lx, sg_idle: 0x%04lx\n", -+ sched_rq_pending_mask.bits[0], -+ sched_rq_watermark[IDLE_WM].bits[0], -+ sched_sg_idle_mask.bits[0]); -+} -+#else -+inline void alt_sched_debug(void) {} -+#endif -+ -+#ifdef CONFIG_SMP -+ -+#define SCHED_RQ_NR_MIGRATION (32UL) -+/* -+ * Migrate pending tasks in @rq to @dest_cpu -+ * Will try to migrate mininal of half of @rq nr_running tasks and -+ * SCHED_RQ_NR_MIGRATION to @dest_cpu -+ */ -+static inline int -+migrate_pending_tasks(struct rq *rq, struct rq *dest_rq, const int dest_cpu) -+{ -+ struct task_struct *p, *skip = 
rq->curr; -+ int nr_migrated = 0; -+ int nr_tries = min(rq->nr_running / 2, SCHED_RQ_NR_MIGRATION); -+ -+ while (skip != rq->idle && nr_tries && -+ (p = sched_rq_next_task(skip, rq)) != rq->idle) { -+ skip = sched_rq_next_task(p, rq); -+ if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) { -+ __SCHED_DEQUEUE_TASK(p, rq, 0, ); -+ set_task_cpu(p, dest_cpu); -+ __SCHED_ENQUEUE_TASK(p, dest_rq, 0); -+ nr_migrated++; -+ } -+ nr_tries--; -+ } -+ -+ return nr_migrated; -+} -+ -+static inline int take_other_rq_tasks(struct rq *rq, int cpu) -+{ -+ struct cpumask *affinity_mask, *end_mask; -+ -+ if (unlikely(!rq->online)) -+ return 0; -+ -+ if (cpumask_empty(&sched_rq_pending_mask)) -+ return 0; -+ -+ affinity_mask = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); -+ end_mask = per_cpu(sched_cpu_affinity_end_mask, cpu); -+ do { -+ int i; -+ for_each_cpu_and(i, &sched_rq_pending_mask, affinity_mask) { -+ int nr_migrated; -+ struct rq *src_rq; -+ -+ src_rq = cpu_rq(i); -+ if (!do_raw_spin_trylock(&src_rq->lock)) -+ continue; -+ spin_acquire(&src_rq->lock.dep_map, -+ SINGLE_DEPTH_NESTING, 1, _RET_IP_); -+ -+ if ((nr_migrated = migrate_pending_tasks(src_rq, rq, cpu))) { -+ src_rq->nr_running -= nr_migrated; -+#ifdef CONFIG_SMP -+ if (src_rq->nr_running < 2) -+ cpumask_clear_cpu(i, &sched_rq_pending_mask); -+#endif -+ rq->nr_running += nr_migrated; -+#ifdef CONFIG_SMP -+ if (rq->nr_running > 1) -+ cpumask_set_cpu(cpu, &sched_rq_pending_mask); -+#endif -+ update_sched_rq_watermark(rq); -+ cpufreq_update_util(rq, 0); -+ -+ spin_release(&src_rq->lock.dep_map, _RET_IP_); -+ do_raw_spin_unlock(&src_rq->lock); -+ -+ return 1; -+ } -+ -+ spin_release(&src_rq->lock.dep_map, _RET_IP_); -+ do_raw_spin_unlock(&src_rq->lock); -+ } -+ } while (++affinity_mask < end_mask); -+ -+ return 0; -+} -+#endif -+ -+/* -+ * Timeslices below RESCHED_NS are considered as good as expired as there's no -+ * point rescheduling when there's so little time left. 
-+ */ -+static inline void check_curr(struct task_struct *p, struct rq *rq) -+{ -+ if (unlikely(rq->idle == p)) -+ return; -+ -+ update_curr(rq, p); -+ -+ if (p->time_slice < RESCHED_NS) -+ time_slice_expired(p, rq); -+} -+ -+static inline struct task_struct * -+choose_next_task(struct rq *rq, int cpu, struct task_struct *prev) -+{ -+ struct task_struct *next; -+ -+ if (unlikely(rq->skip)) { -+ next = rq_runnable_task(rq); -+ if (next == rq->idle) { -+#ifdef CONFIG_SMP -+ if (!take_other_rq_tasks(rq, cpu)) { -+#endif -+ rq->skip = NULL; -+ schedstat_inc(rq->sched_goidle); -+ return next; -+#ifdef CONFIG_SMP -+ } -+ next = rq_runnable_task(rq); -+#endif -+ } -+ rq->skip = NULL; -+#ifdef CONFIG_HIGH_RES_TIMERS -+ hrtick_start(rq, next->time_slice); -+#endif -+ return next; -+ } -+ -+ next = sched_rq_first_task(rq); -+ if (next == rq->idle) { -+#ifdef CONFIG_SMP -+ if (!take_other_rq_tasks(rq, cpu)) { -+#endif -+ schedstat_inc(rq->sched_goidle); -+ /*printk(KERN_INFO "sched: choose_next_task(%d) idle %px\n", cpu, next);*/ -+ return next; -+#ifdef CONFIG_SMP -+ } -+ next = sched_rq_first_task(rq); -+#endif -+ } -+#ifdef CONFIG_HIGH_RES_TIMERS -+ hrtick_start(rq, next->time_slice); -+#endif -+ /*printk(KERN_INFO "sched: choose_next_task(%d) next %px\n", cpu, -+ * next);*/ -+ return next; -+} -+ -+/* -+ * schedule() is the main scheduler function. -+ * -+ * The main means of driving the scheduler and thus entering this function are: -+ * -+ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. -+ * -+ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return -+ * paths. For example, see arch/x86/entry_64.S. -+ * -+ * To drive preemption between tasks, the scheduler sets the flag in timer -+ * interrupt handler scheduler_tick(). -+ * -+ * 3. Wakeups don't really cause entry into schedule(). They add a -+ * task to the run-queue and that's it. 
-+ * -+ * Now, if the new task added to the run-queue preempts the current -+ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets -+ * called on the nearest possible occasion: -+ * -+ * - If the kernel is preemptible (CONFIG_PREEMPTION=y): -+ * -+ * - in syscall or exception context, at the next outmost -+ * preempt_enable(). (this might be as soon as the wake_up()'s -+ * spin_unlock()!) -+ * -+ * - in IRQ context, return from interrupt-handler to -+ * preemptible context -+ * -+ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set) -+ * then at the next: -+ * -+ * - cond_resched() call -+ * - explicit schedule() call -+ * - return from syscall or exception to user-space -+ * - return from interrupt-handler to user-space -+ * -+ * WARNING: must be called with preemption disabled! -+ */ -+static void __sched notrace __schedule(bool preempt) -+{ -+ struct task_struct *prev, *next; -+ unsigned long *switch_count; -+ unsigned long prev_state; -+ struct rq *rq; -+ int cpu; -+ -+ cpu = smp_processor_id(); -+ rq = cpu_rq(cpu); -+ prev = rq->curr; -+ -+ schedule_debug(prev, preempt); -+ -+ /* by passing sched_feat(HRTICK) checking which Alt schedule FW doesn't support */ -+ hrtick_clear(rq); -+ -+ local_irq_disable(); -+ rcu_note_context_switch(preempt); -+ -+ /* -+ * Make sure that signal_pending_state()->signal_pending() below -+ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) -+ * done by the caller to avoid the race with signal_wake_up(): -+ * -+ * __set_current_state(@state) signal_wake_up() -+ * schedule() set_tsk_thread_flag(p, TIF_SIGPENDING) -+ * wake_up_state(p, state) -+ * LOCK rq->lock LOCK p->pi_state -+ * smp_mb__after_spinlock() smp_mb__after_spinlock() -+ * if (signal_pending_state()) if (p->state & @state) -+ * -+ * Also, the membarrier system call requires a full memory barrier -+ * after coming from user-space, before storing to rq->curr. 
-+ */ -+ raw_spin_lock(&rq->lock); -+ smp_mb__after_spinlock(); -+ -+ update_rq_clock(rq); -+ -+ switch_count = &prev->nivcsw; -+ /* -+ * We must load prev->state once (task_struct::state is volatile), such -+ * that: -+ * -+ * - we form a control dependency vs deactivate_task() below. -+ * - ptrace_{,un}freeze_traced() can change ->state underneath us. -+ */ -+ prev_state = prev->state; -+ if (!preempt && prev_state && prev_state == prev->state) { -+ if (signal_pending_state(prev_state, prev)) { -+ prev->state = TASK_RUNNING; -+ } else { -+ prev->sched_contributes_to_load = -+ (prev_state & TASK_UNINTERRUPTIBLE) && -+ !(prev_state & TASK_NOLOAD) && -+ !(prev->flags & PF_FROZEN); -+ -+ if (prev->sched_contributes_to_load) -+ rq->nr_uninterruptible++; -+ -+ /* -+ * __schedule() ttwu() -+ * prev_state = prev->state; if (p->on_rq && ...) -+ * if (prev_state) goto out; -+ * p->on_rq = 0; smp_acquire__after_ctrl_dep(); -+ * p->state = TASK_WAKING -+ * -+ * Where __schedule() and ttwu() have matching control dependencies. -+ * -+ * After this, schedule() must not care about p->state any more. -+ */ -+ sched_task_deactivate(prev, rq); -+ deactivate_task(prev, rq); -+ -+ if (prev->in_iowait) { -+ atomic_inc(&rq->nr_iowait); -+ delayacct_blkio_start(); -+ } -+ } -+ switch_count = &prev->nvcsw; -+ } -+ -+ check_curr(prev, rq); -+ -+ next = choose_next_task(rq, cpu, prev); -+ clear_tsk_need_resched(prev); -+ clear_preempt_need_resched(); -+ -+ -+ if (likely(prev != next)) { -+ next->last_ran = rq->clock_task; -+ rq->last_ts_switch = rq->clock; -+ -+ rq->nr_switches++; -+ /* -+ * RCU users of rcu_dereference(rq->curr) may not see -+ * changes to task_struct made by pick_next_task(). -+ */ -+ RCU_INIT_POINTER(rq->curr, next); -+ /* -+ * The membarrier system call requires each architecture -+ * to have a full memory barrier after updating -+ * rq->curr, before returning to user-space. 
-+ * -+ * Here are the schemes providing that barrier on the -+ * various architectures: -+ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. -+ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. -+ * - finish_lock_switch() for weakly-ordered -+ * architectures where spin_unlock is a full barrier, -+ * - switch_to() for arm64 (weakly-ordered, spin_unlock -+ * is a RELEASE barrier), -+ */ -+ ++*switch_count; -+ -+ psi_sched_switch(prev, next, !task_on_rq_queued(prev)); -+ -+ trace_sched_switch(preempt, prev, next); -+ -+ /* Also unlocks the rq: */ -+ rq = context_switch(rq, prev, next); -+ } else -+ raw_spin_unlock_irq(&rq->lock); -+ -+#ifdef CONFIG_SCHED_SMT -+ sg_balance_check(rq); -+#endif -+} -+ -+void __noreturn do_task_dead(void) -+{ -+ /* Causes final put_task_struct in finish_task_switch(): */ -+ set_special_state(TASK_DEAD); -+ -+ /* Tell freezer to ignore us: */ -+ current->flags |= PF_NOFREEZE; -+ -+ __schedule(false); -+ BUG(); -+ -+ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ -+ for (;;) -+ cpu_relax(); -+} -+ -+static inline void sched_submit_work(struct task_struct *tsk) -+{ -+ if (!tsk->state) -+ return; -+ -+ /* -+ * If a worker went to sleep, notify and ask workqueue whether -+ * it wants to wake up a task to maintain concurrency. -+ * As this function is called inside the schedule() context, -+ * we disable preemption to avoid it calling schedule() again -+ * in the possible wakeup of a kworker and because wq_worker_sleeping() -+ * requires it. -+ */ -+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { -+ preempt_disable(); -+ if (tsk->flags & PF_WQ_WORKER) -+ wq_worker_sleeping(tsk); -+ else -+ io_wq_worker_sleeping(tsk); -+ preempt_enable_no_resched(); -+ } -+ -+ if (tsk_is_pi_blocked(tsk)) -+ return; -+ -+ /* -+ * If we are going to sleep and we have plugged IO queued, -+ * make sure to submit it to avoid deadlocks. 
-+ */ -+ if (blk_needs_flush_plug(tsk)) -+ blk_schedule_flush_plug(tsk); -+} -+ -+static void sched_update_worker(struct task_struct *tsk) -+{ -+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { -+ if (tsk->flags & PF_WQ_WORKER) -+ wq_worker_running(tsk); -+ else -+ io_wq_worker_running(tsk); -+ } -+} -+ -+asmlinkage __visible void __sched schedule(void) -+{ -+ struct task_struct *tsk = current; -+ -+ sched_submit_work(tsk); -+ do { -+ preempt_disable(); -+ __schedule(false); -+ sched_preempt_enable_no_resched(); -+ } while (need_resched()); -+ sched_update_worker(tsk); -+} -+EXPORT_SYMBOL(schedule); -+ -+/* -+ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted -+ * state (have scheduled out non-voluntarily) by making sure that all -+ * tasks have either left the run queue or have gone into user space. -+ * As idle tasks do not do either, they must not ever be preempted -+ * (schedule out non-voluntarily). -+ * -+ * schedule_idle() is similar to schedule_preempt_disable() except that it -+ * never enables preemption because it does not call sched_submit_work(). -+ */ -+void __sched schedule_idle(void) -+{ -+ /* -+ * As this skips calling sched_submit_work(), which the idle task does -+ * regardless because that function is a nop when the task is in a -+ * TASK_RUNNING state, make sure this isn't used someplace that the -+ * current task can be in any other state. Note, idle is always in the -+ * TASK_RUNNING state. -+ */ -+ WARN_ON_ONCE(current->state); -+ do { -+ __schedule(false); -+ } while (need_resched()); -+} -+ -+#ifdef CONFIG_CONTEXT_TRACKING -+asmlinkage __visible void __sched schedule_user(void) -+{ -+ /* -+ * If we come here after a random call to set_need_resched(), -+ * or we have been woken up remotely but the IPI has not yet arrived, -+ * we haven't yet exited the RCU idle mode. Do it here manually until -+ * we find a better solution. -+ * -+ * NB: There are buggy callers of this function. 
Ideally we -+ * should warn if prev_state != CONTEXT_USER, but that will trigger -+ * too frequently to make sense yet. -+ */ -+ enum ctx_state prev_state = exception_enter(); -+ schedule(); -+ exception_exit(prev_state); -+} -+#endif -+ -+/** -+ * schedule_preempt_disabled - called with preemption disabled -+ * -+ * Returns with preemption disabled. Note: preempt_count must be 1 -+ */ -+void __sched schedule_preempt_disabled(void) -+{ -+ sched_preempt_enable_no_resched(); -+ schedule(); -+ preempt_disable(); -+} -+ -+static void __sched notrace preempt_schedule_common(void) -+{ -+ do { -+ /* -+ * Because the function tracer can trace preempt_count_sub() -+ * and it also uses preempt_enable/disable_notrace(), if -+ * NEED_RESCHED is set, the preempt_enable_notrace() called -+ * by the function tracer will call this function again and -+ * cause infinite recursion. -+ * -+ * Preemption must be disabled here before the function -+ * tracer can trace. Break up preempt_disable() into two -+ * calls. One to disable preemption without fear of being -+ * traced. The other to still record the preemption latency, -+ * which can also be traced by the function tracer. -+ */ -+ preempt_disable_notrace(); -+ preempt_latency_start(1); -+ __schedule(true); -+ preempt_latency_stop(1); -+ preempt_enable_no_resched_notrace(); -+ -+ /* -+ * Check again in case we missed a preemption opportunity -+ * between schedule and now. -+ */ -+ } while (need_resched()); -+} -+ -+#ifdef CONFIG_PREEMPTION -+/* -+ * This is the entry point to schedule() from in-kernel preemption -+ * off of preempt_enable. -+ */ -+asmlinkage __visible void __sched notrace preempt_schedule(void) -+{ -+ /* -+ * If there is a non-zero preempt_count or interrupts are disabled, -+ * we do not want to preempt the current task. Just return.. 
-+ */ -+ if (likely(!preemptible())) -+ return; -+ -+ preempt_schedule_common(); -+} -+NOKPROBE_SYMBOL(preempt_schedule); -+EXPORT_SYMBOL(preempt_schedule); -+ -+/** -+ * preempt_schedule_notrace - preempt_schedule called by tracing -+ * -+ * The tracing infrastructure uses preempt_enable_notrace to prevent -+ * recursion and tracing preempt enabling caused by the tracing -+ * infrastructure itself. But as tracing can happen in areas coming -+ * from userspace or just about to enter userspace, a preempt enable -+ * can occur before user_exit() is called. This will cause the scheduler -+ * to be called when the system is still in usermode. -+ * -+ * To prevent this, the preempt_enable_notrace will use this function -+ * instead of preempt_schedule() to exit user context if needed before -+ * calling the scheduler. -+ */ -+asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) -+{ -+ enum ctx_state prev_ctx; -+ -+ if (likely(!preemptible())) -+ return; -+ -+ do { -+ /* -+ * Because the function tracer can trace preempt_count_sub() -+ * and it also uses preempt_enable/disable_notrace(), if -+ * NEED_RESCHED is set, the preempt_enable_notrace() called -+ * by the function tracer will call this function again and -+ * cause infinite recursion. -+ * -+ * Preemption must be disabled here before the function -+ * tracer can trace. Break up preempt_disable() into two -+ * calls. One to disable preemption without fear of being -+ * traced. The other to still record the preemption latency, -+ * which can also be traced by the function tracer. -+ */ -+ preempt_disable_notrace(); -+ preempt_latency_start(1); -+ /* -+ * Needs preempt disabled in case user_exit() is traced -+ * and the tracer calls preempt_enable_notrace() causing -+ * an infinite recursion. 
-+ */ -+ prev_ctx = exception_enter(); -+ __schedule(true); -+ exception_exit(prev_ctx); -+ -+ preempt_latency_stop(1); -+ preempt_enable_no_resched_notrace(); -+ } while (need_resched()); -+} -+EXPORT_SYMBOL_GPL(preempt_schedule_notrace); -+ -+#endif /* CONFIG_PREEMPTION */ -+ -+/* -+ * This is the entry point to schedule() from kernel preemption -+ * off of irq context. -+ * Note, that this is called and return with irqs disabled. This will -+ * protect us against recursive calling from irq. -+ */ -+asmlinkage __visible void __sched preempt_schedule_irq(void) -+{ -+ enum ctx_state prev_state; -+ -+ /* Catch callers which need to be fixed */ -+ BUG_ON(preempt_count() || !irqs_disabled()); -+ -+ prev_state = exception_enter(); -+ -+ do { -+ preempt_disable(); -+ local_irq_enable(); -+ __schedule(true); -+ local_irq_disable(); -+ sched_preempt_enable_no_resched(); -+ } while (need_resched()); -+ -+ exception_exit(prev_state); -+} -+ -+int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, -+ void *key) -+{ -+ WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC); -+ return try_to_wake_up(curr->private, mode, wake_flags); -+} -+EXPORT_SYMBOL(default_wake_function); -+ -+static inline void check_task_changed(struct rq *rq, struct task_struct *p) -+{ -+ /* Trigger resched if task sched_prio has been modified. 
*/ -+ if (task_on_rq_queued(p) && sched_task_need_requeue(p, rq)) { -+ requeue_task(p, rq); -+ check_preempt_curr(rq); -+ } -+} -+ -+#ifdef CONFIG_RT_MUTEXES -+ -+static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) -+{ -+ if (pi_task) -+ prio = min(prio, pi_task->prio); -+ -+ return prio; -+} -+ -+static inline int rt_effective_prio(struct task_struct *p, int prio) -+{ -+ struct task_struct *pi_task = rt_mutex_get_top_task(p); -+ -+ return __rt_effective_prio(pi_task, prio); -+} -+ -+/* -+ * rt_mutex_setprio - set the current priority of a task -+ * @p: task to boost -+ * @pi_task: donor task -+ * -+ * This function changes the 'effective' priority of a task. It does -+ * not touch ->normal_prio like __setscheduler(). -+ * -+ * Used by the rt_mutex code to implement priority inheritance -+ * logic. Call site only calls if the priority of the task changed. -+ */ -+void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) -+{ -+ int prio; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ -+ /* XXX used to be waiter->prio, not waiter->task->prio */ -+ prio = __rt_effective_prio(pi_task, p->normal_prio); -+ -+ /* -+ * If nothing changed; bail early. -+ */ -+ if (p->pi_top_task == pi_task && prio == p->prio) -+ return; -+ -+ rq = __task_access_lock(p, &lock); -+ /* -+ * Set under pi_lock && rq->lock, such that the value can be used under -+ * either lock. -+ * -+ * Note that there is loads of tricky to make this pointer cache work -+ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to -+ * ensure a task is de-boosted (pi_task is set to NULL) before the -+ * task is allowed to run again (and can exit). This ensures the pointer -+ * points to a blocked task -- which guaratees the task is present. -+ */ -+ p->pi_top_task = pi_task; -+ -+ /* -+ * For FIFO/RR we only need to set prio, if that matches we're done. -+ */ -+ if (prio == p->prio) -+ goto out_unlock; -+ -+ /* -+ * Idle task boosting is a nono in general. 
There is one -+ * exception, when PREEMPT_RT and NOHZ is active: -+ * -+ * The idle task calls get_next_timer_interrupt() and holds -+ * the timer wheel base->lock on the CPU and another CPU wants -+ * to access the timer (probably to cancel it). We can safely -+ * ignore the boosting request, as the idle CPU runs this code -+ * with interrupts disabled and will complete the lock -+ * protected section without being interrupted. So there is no -+ * real need to boost. -+ */ -+ if (unlikely(p == rq->idle)) { -+ WARN_ON(p != rq->curr); -+ WARN_ON(p->pi_blocked_on); -+ goto out_unlock; -+ } -+ -+ trace_sched_pi_setprio(p, pi_task); -+ p->prio = prio; -+ update_task_priodl(p); -+ -+ check_task_changed(rq, p); -+out_unlock: -+ __task_access_unlock(p, lock); -+} -+#else -+static inline int rt_effective_prio(struct task_struct *p, int prio) -+{ -+ return prio; -+} -+#endif -+ -+void set_user_nice(struct task_struct *p, long nice) -+{ -+ unsigned long flags; -+ struct rq *rq; -+ raw_spinlock_t *lock; -+ -+ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) -+ return; -+ /* -+ * We have to be careful, if called from sys_setpriority(), -+ * the task might be in the middle of scheduling on another CPU. 
-+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ rq = __task_access_lock(p, &lock); -+ -+ p->static_prio = NICE_TO_PRIO(nice); -+ /* -+ * The RT priorities are set via sched_setscheduler(), but we still -+ * allow the 'normal' nice value to be set - but as expected -+ * it wont have any effect on scheduling until the task is -+ * not SCHED_NORMAL/SCHED_BATCH: -+ */ -+ if (task_has_rt_policy(p)) -+ goto out_unlock; -+ -+ p->prio = effective_prio(p); -+ update_task_priodl(p); -+ -+ check_task_changed(rq, p); -+out_unlock: -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+} -+EXPORT_SYMBOL(set_user_nice); -+ -+/* -+ * can_nice - check if a task can reduce its nice value -+ * @p: task -+ * @nice: nice value -+ */ -+int can_nice(const struct task_struct *p, const int nice) -+{ -+ /* Convert nice value [19,-20] to rlimit style value [1,40] */ -+ int nice_rlim = nice_to_rlimit(nice); -+ -+ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || -+ capable(CAP_SYS_NICE)); -+} -+ -+#ifdef __ARCH_WANT_SYS_NICE -+ -+/* -+ * sys_nice - change the priority of the current process. -+ * @increment: priority increment -+ * -+ * sys_setpriority is a more generic, but much slower function that -+ * does similar things. -+ */ -+SYSCALL_DEFINE1(nice, int, increment) -+{ -+ long nice, retval; -+ -+ /* -+ * Setpriority might change our priority at the same moment. -+ * We don't have to worry. Conceptually one call occurs first -+ * and we have a single winner. -+ */ -+ -+ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); -+ nice = task_nice(current) + increment; -+ -+ nice = clamp_val(nice, MIN_NICE, MAX_NICE); -+ if (increment < 0 && !can_nice(current, nice)) -+ return -EPERM; -+ -+ retval = security_task_setnice(current, nice); -+ if (retval) -+ return retval; -+ -+ set_user_nice(current, nice); -+ return 0; -+} -+ -+#endif -+ -+/** -+ * idle_cpu - is a given CPU idle currently? -+ * @cpu: the processor in question. 
-+ * -+ * Return: 1 if the CPU is currently idle. 0 otherwise. -+ */ -+int idle_cpu(int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ if (rq->curr != rq->idle) -+ return 0; -+ -+ if (rq->nr_running) -+ return 0; -+ -+#ifdef CONFIG_SMP -+ if (rq->ttwu_pending) -+ return 0; -+#endif -+ -+ return 1; -+} -+ -+/** -+ * idle_task - return the idle task for a given CPU. -+ * @cpu: the processor in question. -+ * -+ * Return: The idle task for the cpu @cpu. -+ */ -+struct task_struct *idle_task(int cpu) -+{ -+ return cpu_rq(cpu)->idle; -+} -+ -+/** -+ * find_process_by_pid - find a process with a matching PID value. -+ * @pid: the pid in question. -+ * -+ * The task of @pid, if found. %NULL otherwise. -+ */ -+static inline struct task_struct *find_process_by_pid(pid_t pid) -+{ -+ return pid ? find_task_by_vpid(pid) : current; -+} -+ -+/* -+ * sched_setparam() passes in -1 for its policy, to let the functions -+ * it calls know not to change it. -+ */ -+#define SETPARAM_POLICY -1 -+ -+static void __setscheduler_params(struct task_struct *p, -+ const struct sched_attr *attr) -+{ -+ int policy = attr->sched_policy; -+ -+ if (policy == SETPARAM_POLICY) -+ policy = p->policy; -+ -+ p->policy = policy; -+ -+ /* -+ * allow normal nice value to be set, but will not have any -+ * effect on scheduling until the task not SCHED_NORMAL/ -+ * SCHED_BATCH -+ */ -+ p->static_prio = NICE_TO_PRIO(attr->sched_nice); -+ -+ /* -+ * __sched_setscheduler() ensures attr->sched_priority == 0 when -+ * !rt_policy. Always setting this ensures that things like -+ * getparam()/getattr() don't report silly values for !rt tasks. -+ */ -+ p->rt_priority = attr->sched_priority; -+ p->normal_prio = normal_prio(p); -+} -+ -+/* Actually do priority change: must hold rq lock. 
*/ -+static void __setscheduler(struct rq *rq, struct task_struct *p, -+ const struct sched_attr *attr, bool keep_boost) -+{ -+ __setscheduler_params(p, attr); -+ -+ /* -+ * Keep a potential priority boosting if called from -+ * sched_setscheduler(). -+ */ -+ p->prio = normal_prio(p); -+ if (keep_boost) -+ p->prio = rt_effective_prio(p, p->prio); -+ update_task_priodl(p); -+} -+ -+/* -+ * check the target process has a UID that matches the current process's -+ */ -+static bool check_same_owner(struct task_struct *p) -+{ -+ const struct cred *cred = current_cred(), *pcred; -+ bool match; -+ -+ rcu_read_lock(); -+ pcred = __task_cred(p); -+ match = (uid_eq(cred->euid, pcred->euid) || -+ uid_eq(cred->euid, pcred->uid)); -+ rcu_read_unlock(); -+ return match; -+} -+ -+static int __sched_setscheduler(struct task_struct *p, -+ const struct sched_attr *attr, -+ bool user, bool pi) -+{ -+ const struct sched_attr dl_squash_attr = { -+ .size = sizeof(struct sched_attr), -+ .sched_policy = SCHED_FIFO, -+ .sched_nice = 0, -+ .sched_priority = 99, -+ }; -+ int newprio = MAX_RT_PRIO - 1 - attr->sched_priority; -+ int retval, oldpolicy = -1; -+ int policy = attr->sched_policy; -+ unsigned long flags; -+ struct rq *rq; -+ int reset_on_fork; -+ raw_spinlock_t *lock; -+ -+ /* The pi code expects interrupts enabled */ -+ BUG_ON(pi && in_interrupt()); -+ -+ /* -+ * Alt schedule FW supports SCHED_DEADLINE by squash it as prio 0 SCHED_FIFO -+ */ -+ if (unlikely(SCHED_DEADLINE == policy)) { -+ attr = &dl_squash_attr; -+ policy = attr->sched_policy; -+ newprio = MAX_RT_PRIO - 1 - attr->sched_priority; -+ } -+recheck: -+ /* Double check policy once rq lock held */ -+ if (policy < 0) { -+ reset_on_fork = p->sched_reset_on_fork; -+ policy = oldpolicy = p->policy; -+ } else { -+ reset_on_fork = !!(attr->sched_flags & SCHED_RESET_ON_FORK); -+ -+ if (policy > SCHED_IDLE) -+ return -EINVAL; -+ } -+ -+ if (attr->sched_flags & ~(SCHED_FLAG_ALL)) -+ return -EINVAL; -+ -+ /* -+ * Valid priorities 
for SCHED_FIFO and SCHED_RR are -+ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and -+ * SCHED_BATCH and SCHED_IDLE is 0. -+ */ -+ if (attr->sched_priority < 0 || -+ (p->mm && attr->sched_priority > MAX_USER_RT_PRIO - 1) || -+ (!p->mm && attr->sched_priority > MAX_RT_PRIO - 1)) -+ return -EINVAL; -+ if ((SCHED_RR == policy || SCHED_FIFO == policy) != -+ (attr->sched_priority != 0)) -+ return -EINVAL; -+ -+ /* -+ * Allow unprivileged RT tasks to decrease priority: -+ */ -+ if (user && !capable(CAP_SYS_NICE)) { -+ if (SCHED_FIFO == policy || SCHED_RR == policy) { -+ unsigned long rlim_rtprio = -+ task_rlimit(p, RLIMIT_RTPRIO); -+ -+ /* Can't set/change the rt policy */ -+ if (policy != p->policy && !rlim_rtprio) -+ return -EPERM; -+ -+ /* Can't increase priority */ -+ if (attr->sched_priority > p->rt_priority && -+ attr->sched_priority > rlim_rtprio) -+ return -EPERM; -+ } -+ -+ /* Can't change other user's priorities */ -+ if (!check_same_owner(p)) -+ return -EPERM; -+ -+ /* Normal users shall not reset the sched_reset_on_fork flag */ -+ if (p->sched_reset_on_fork && !reset_on_fork) -+ return -EPERM; -+ } -+ -+ if (user) { -+ retval = security_task_setscheduler(p); -+ if (retval) -+ return retval; -+ } -+ -+ if (pi) -+ cpuset_read_lock(); -+ -+ /* -+ * Make sure no PI-waiters arrive (or leave) while we are -+ * changing the priority of the task: -+ */ -+ raw_spin_lock_irqsave(&p->pi_lock, flags); -+ -+ /* -+ * To be able to change p->policy safely, task_access_lock() -+ * must be called. -+ * IF use task_access_lock() here: -+ * For the task p which is not running, reading rq->stop is -+ * racy but acceptable as ->stop doesn't change much. -+ * An enhancemnet can be made to read rq->stop saftly. 
-+ */ -+ rq = __task_access_lock(p, &lock); -+ -+ /* -+ * Changing the policy of the stop threads its a very bad idea -+ */ -+ if (p == rq->stop) { -+ retval = -EINVAL; -+ goto unlock; -+ } -+ -+ /* -+ * If not changing anything there's no need to proceed further: -+ */ -+ if (unlikely(policy == p->policy)) { -+ if (rt_policy(policy) && attr->sched_priority != p->rt_priority) -+ goto change; -+ if (!rt_policy(policy) && -+ NICE_TO_PRIO(attr->sched_nice) != p->static_prio) -+ goto change; -+ -+ p->sched_reset_on_fork = reset_on_fork; -+ retval = 0; -+ goto unlock; -+ } -+change: -+ -+ /* Re-check policy now with rq lock held */ -+ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { -+ policy = oldpolicy = -1; -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ if (pi) -+ cpuset_read_unlock(); -+ goto recheck; -+ } -+ -+ p->sched_reset_on_fork = reset_on_fork; -+ -+ if (pi) { -+ /* -+ * Take priority boosted tasks into account. If the new -+ * effective priority is unchanged, we just store the new -+ * normal parameters and do not touch the scheduler class and -+ * the runqueue. This will be done when the task deboost -+ * itself. 
-+ */ -+ if (rt_effective_prio(p, newprio) == p->prio) { -+ __setscheduler_params(p, attr); -+ retval = 0; -+ goto unlock; -+ } -+ } -+ -+ __setscheduler(rq, p, attr, pi); -+ -+ check_task_changed(rq, p); -+ -+ /* Avoid rq from going away on us: */ -+ preempt_disable(); -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+ if (pi) { -+ cpuset_read_unlock(); -+ rt_mutex_adjust_pi(p); -+ } -+ -+ preempt_enable(); -+ -+ return 0; -+ -+unlock: -+ __task_access_unlock(p, lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ if (pi) -+ cpuset_read_unlock(); -+ return retval; -+} -+ -+static int _sched_setscheduler(struct task_struct *p, int policy, -+ const struct sched_param *param, bool check) -+{ -+ struct sched_attr attr = { -+ .sched_policy = policy, -+ .sched_priority = param->sched_priority, -+ .sched_nice = PRIO_TO_NICE(p->static_prio), -+ }; -+ -+ /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ -+ if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { -+ attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; -+ policy &= ~SCHED_RESET_ON_FORK; -+ attr.sched_policy = policy; -+ } -+ -+ return __sched_setscheduler(p, &attr, check, true); -+} -+ -+/** -+ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. -+ * @p: the task in question. -+ * @policy: new policy. -+ * @param: structure containing the new RT priority. -+ * -+ * Use sched_set_fifo(), read its comment. -+ * -+ * Return: 0 on success. An error code otherwise. -+ * -+ * NOTE that the task may be already dead. 
-+ */ -+int sched_setscheduler(struct task_struct *p, int policy, -+ const struct sched_param *param) -+{ -+ return _sched_setscheduler(p, policy, param, true); -+} -+ -+int sched_setattr(struct task_struct *p, const struct sched_attr *attr) -+{ -+ return __sched_setscheduler(p, attr, true, true); -+} -+ -+int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) -+{ -+ return __sched_setscheduler(p, attr, false, true); -+} -+ -+/** -+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. -+ * @p: the task in question. -+ * @policy: new policy. -+ * @param: structure containing the new RT priority. -+ * -+ * Just like sched_setscheduler, only don't bother checking if the -+ * current context has permission. For example, this is needed in -+ * stop_machine(): we create temporary high priority worker threads, -+ * but our caller might not have that capability. -+ * -+ * Return: 0 on success. An error code otherwise. -+ */ -+int sched_setscheduler_nocheck(struct task_struct *p, int policy, -+ const struct sched_param *param) -+{ -+ return _sched_setscheduler(p, policy, param, false); -+} -+ -+/* -+ * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally -+ * incapable of resource management, which is the one thing an OS really should -+ * be doing. -+ * -+ * This is of course the reason it is limited to privileged users only. -+ * -+ * Worse still; it is fundamentally impossible to compose static priority -+ * workloads. You cannot take two correctly working static prio workloads -+ * and smash them together and still expect them to work. -+ * -+ * For this reason 'all' FIFO tasks the kernel creates are basically at: -+ * -+ * MAX_RT_PRIO / 2 -+ * -+ * The administrator _MUST_ configure the system, the kernel simply doesn't -+ * know enough information to make a sensible choice. 
-+ */ -+void sched_set_fifo(struct task_struct *p) -+{ -+ struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 }; -+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); -+} -+EXPORT_SYMBOL_GPL(sched_set_fifo); -+ -+/* -+ * For when you don't much care about FIFO, but want to be above SCHED_NORMAL. -+ */ -+void sched_set_fifo_low(struct task_struct *p) -+{ -+ struct sched_param sp = { .sched_priority = 1 }; -+ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); -+} -+EXPORT_SYMBOL_GPL(sched_set_fifo_low); -+ -+void sched_set_normal(struct task_struct *p, int nice) -+{ -+ struct sched_attr attr = { -+ .sched_policy = SCHED_NORMAL, -+ .sched_nice = nice, -+ }; -+ WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0); -+} -+EXPORT_SYMBOL_GPL(sched_set_normal); -+ -+static int -+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) -+{ -+ struct sched_param lparam; -+ struct task_struct *p; -+ int retval; -+ -+ if (!param || pid < 0) -+ return -EINVAL; -+ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) -+ return -EFAULT; -+ -+ rcu_read_lock(); -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (likely(p)) -+ get_task_struct(p); -+ rcu_read_unlock(); -+ -+ if (likely(p)) { -+ retval = sched_setscheduler(p, policy, &lparam); -+ put_task_struct(p); -+ } -+ -+ return retval; -+} -+ -+/* -+ * Mimics kernel/events/core.c perf_copy_attr(). 
-+ */ -+static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr) -+{ -+ u32 size; -+ int ret; -+ -+ /* Zero the full structure, so that a short copy will be nice: */ -+ memset(attr, 0, sizeof(*attr)); -+ -+ ret = get_user(size, &uattr->size); -+ if (ret) -+ return ret; -+ -+ /* ABI compatibility quirk: */ -+ if (!size) -+ size = SCHED_ATTR_SIZE_VER0; -+ -+ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) -+ goto err_size; -+ -+ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); -+ if (ret) { -+ if (ret == -E2BIG) -+ goto err_size; -+ return ret; -+ } -+ -+ /* -+ * XXX: Do we want to be lenient like existing syscalls; or do we want -+ * to be strict and return an error on out-of-bounds values? -+ */ -+ attr->sched_nice = clamp(attr->sched_nice, -20, 19); -+ -+ /* sched/core.c uses zero here but we already know ret is zero */ -+ return 0; -+ -+err_size: -+ put_user(sizeof(*attr), &uattr->size); -+ return -E2BIG; -+} -+ -+/** -+ * sys_sched_setscheduler - set/change the scheduler policy and RT priority -+ * @pid: the pid in question. -+ * @policy: new policy. -+ * -+ * Return: 0 on success. An error code otherwise. -+ * @param: structure containing the new RT priority. -+ */ -+SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) -+{ -+ if (policy < 0) -+ return -EINVAL; -+ -+ return do_sched_setscheduler(pid, policy, param); -+} -+ -+/** -+ * sys_sched_setparam - set/change the RT priority of a thread -+ * @pid: the pid in question. -+ * @param: structure containing the new RT priority. -+ * -+ * Return: 0 on success. An error code otherwise. -+ */ -+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) -+{ -+ return do_sched_setscheduler(pid, SETPARAM_POLICY, param); -+} -+ -+/** -+ * sys_sched_setattr - same as above, but with extended sched_attr -+ * @pid: the pid in question. -+ * @uattr: structure containing the extended parameters. 
-+ */ -+SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, -+ unsigned int, flags) -+{ -+ struct sched_attr attr; -+ struct task_struct *p; -+ int retval; -+ -+ if (!uattr || pid < 0 || flags) -+ return -EINVAL; -+ -+ retval = sched_copy_attr(uattr, &attr); -+ if (retval) -+ return retval; -+ -+ if ((int)attr.sched_policy < 0) -+ return -EINVAL; -+ -+ rcu_read_lock(); -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (p != NULL) -+ retval = sched_setattr(p, &attr); -+ rcu_read_unlock(); -+ -+ return retval; -+} -+ -+/** -+ * sys_sched_getscheduler - get the policy (scheduling class) of a thread -+ * @pid: the pid in question. -+ * -+ * Return: On success, the policy of the thread. Otherwise, a negative error -+ * code. -+ */ -+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) -+{ -+ struct task_struct *p; -+ int retval = -EINVAL; -+ -+ if (pid < 0) -+ goto out_nounlock; -+ -+ retval = -ESRCH; -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ if (p) { -+ retval = security_task_getscheduler(p); -+ if (!retval) -+ retval = p->policy; -+ } -+ rcu_read_unlock(); -+ -+out_nounlock: -+ return retval; -+} -+ -+/** -+ * sys_sched_getscheduler - get the RT priority of a thread -+ * @pid: the pid in question. -+ * @param: structure containing the RT priority. -+ * -+ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error -+ * code. 
-+ */ -+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) -+{ -+ struct sched_param lp = { .sched_priority = 0 }; -+ struct task_struct *p; -+ int retval = -EINVAL; -+ -+ if (!param || pid < 0) -+ goto out_nounlock; -+ -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ retval = -ESRCH; -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ if (task_has_rt_policy(p)) -+ lp.sched_priority = p->rt_priority; -+ rcu_read_unlock(); -+ -+ /* -+ * This one might sleep, we cannot do it with a spinlock held ... -+ */ -+ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; -+ -+out_nounlock: -+ return retval; -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+/* -+ * Copy the kernel size attribute structure (which might be larger -+ * than what user-space knows about) to user-space. -+ * -+ * Note that all cases are valid: user-space buffer can be larger or -+ * smaller than the kernel-space buffer. The usual case is that both -+ * have the same size. -+ */ -+static int -+sched_attr_copy_to_user(struct sched_attr __user *uattr, -+ struct sched_attr *kattr, -+ unsigned int usize) -+{ -+ unsigned int ksize = sizeof(*kattr); -+ -+ if (!access_ok(uattr, usize)) -+ return -EFAULT; -+ -+ /* -+ * sched_getattr() ABI forwards and backwards compatibility: -+ * -+ * If usize == ksize then we just copy everything to user-space and all is good. -+ * -+ * If usize < ksize then we only copy as much as user-space has space for, -+ * this keeps ABI compatibility as well. We skip the rest. -+ * -+ * If usize > ksize then user-space is using a newer version of the ABI, -+ * which part the kernel doesn't know about. Just ignore it - tooling can -+ * detect the kernel's knowledge of attributes from the attr->size value -+ * which is set to ksize in this case. 
-+ */ -+ kattr->size = min(usize, ksize); -+ -+ if (copy_to_user(uattr, kattr, kattr->size)) -+ return -EFAULT; -+ -+ return 0; -+} -+ -+/** -+ * sys_sched_getattr - similar to sched_getparam, but with sched_attr -+ * @pid: the pid in question. -+ * @uattr: structure containing the extended parameters. -+ * @usize: sizeof(attr) for fwd/bwd comp. -+ * @flags: for future extension. -+ */ -+SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, -+ unsigned int, usize, unsigned int, flags) -+{ -+ struct sched_attr kattr = { }; -+ struct task_struct *p; -+ int retval; -+ -+ if (!uattr || pid < 0 || usize > PAGE_SIZE || -+ usize < SCHED_ATTR_SIZE_VER0 || flags) -+ return -EINVAL; -+ -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ retval = -ESRCH; -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ kattr.sched_policy = p->policy; -+ if (p->sched_reset_on_fork) -+ kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; -+ if (task_has_rt_policy(p)) -+ kattr.sched_priority = p->rt_priority; -+ else -+ kattr.sched_nice = task_nice(p); -+ -+#ifdef CONFIG_UCLAMP_TASK -+ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; -+ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; -+#endif -+ -+ rcu_read_unlock(); -+ -+ return sched_attr_copy_to_user(uattr, &kattr, usize); -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) -+{ -+ cpumask_var_t cpus_allowed, new_mask; -+ struct task_struct *p; -+ int retval; -+ -+ get_online_cpus(); -+ rcu_read_lock(); -+ -+ p = find_process_by_pid(pid); -+ if (!p) { -+ rcu_read_unlock(); -+ put_online_cpus(); -+ return -ESRCH; -+ } -+ -+ /* Prevent p going away */ -+ get_task_struct(p); -+ rcu_read_unlock(); -+ -+ if (p->flags & PF_NO_SETAFFINITY) { -+ retval = -EINVAL; -+ goto out_put_task; -+ } -+ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { -+ retval = -ENOMEM; -+ 
goto out_put_task; -+ } -+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { -+ retval = -ENOMEM; -+ goto out_free_cpus_allowed; -+ } -+ retval = -EPERM; -+ if (!check_same_owner(p)) { -+ rcu_read_lock(); -+ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { -+ rcu_read_unlock(); -+ goto out_unlock; -+ } -+ rcu_read_unlock(); -+ } -+ -+ retval = security_task_setscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ cpuset_cpus_allowed(p, cpus_allowed); -+ cpumask_and(new_mask, in_mask, cpus_allowed); -+again: -+ retval = __set_cpus_allowed_ptr(p, new_mask, true); -+ -+ if (!retval) { -+ cpuset_cpus_allowed(p, cpus_allowed); -+ if (!cpumask_subset(new_mask, cpus_allowed)) { -+ /* -+ * We must have raced with a concurrent cpuset -+ * update. Just reset the cpus_allowed to the -+ * cpuset's cpus_allowed -+ */ -+ cpumask_copy(new_mask, cpus_allowed); -+ goto again; -+ } -+ } -+out_unlock: -+ free_cpumask_var(new_mask); -+out_free_cpus_allowed: -+ free_cpumask_var(cpus_allowed); -+out_put_task: -+ put_task_struct(p); -+ put_online_cpus(); -+ return retval; -+} -+ -+static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, -+ struct cpumask *new_mask) -+{ -+ if (len < cpumask_size()) -+ cpumask_clear(new_mask); -+ else if (len > cpumask_size()) -+ len = cpumask_size(); -+ -+ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; -+} -+ -+/** -+ * sys_sched_setaffinity - set the CPU affinity of a process -+ * @pid: pid of the process -+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr -+ * @user_mask_ptr: user-space pointer to the new CPU mask -+ * -+ * Return: 0 on success. An error code otherwise. 
-+ */ -+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, -+ unsigned long __user *, user_mask_ptr) -+{ -+ cpumask_var_t new_mask; -+ int retval; -+ -+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) -+ return -ENOMEM; -+ -+ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); -+ if (retval == 0) -+ retval = sched_setaffinity(pid, new_mask); -+ free_cpumask_var(new_mask); -+ return retval; -+} -+ -+long sched_getaffinity(pid_t pid, cpumask_t *mask) -+{ -+ struct task_struct *p; -+ raw_spinlock_t *lock; -+ unsigned long flags; -+ int retval; -+ -+ rcu_read_lock(); -+ -+ retval = -ESRCH; -+ p = find_process_by_pid(pid); -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ -+ task_access_lock_irqsave(p, &lock, &flags); -+ cpumask_and(mask, &p->cpus_mask, cpu_active_mask); -+ task_access_unlock_irqrestore(p, lock, &flags); -+ -+out_unlock: -+ rcu_read_unlock(); -+ -+ return retval; -+} -+ -+/** -+ * sys_sched_getaffinity - get the CPU affinity of a process -+ * @pid: pid of the process -+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr -+ * @user_mask_ptr: user-space pointer to hold the current CPU mask -+ * -+ * Return: size of CPU mask copied to user_mask_ptr on success. An -+ * error code otherwise. 
-+ */ -+SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, -+ unsigned long __user *, user_mask_ptr) -+{ -+ int ret; -+ cpumask_var_t mask; -+ -+ if ((len * BITS_PER_BYTE) < nr_cpu_ids) -+ return -EINVAL; -+ if (len & (sizeof(unsigned long)-1)) -+ return -EINVAL; -+ -+ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) -+ return -ENOMEM; -+ -+ ret = sched_getaffinity(pid, mask); -+ if (ret == 0) { -+ unsigned int retlen = min_t(size_t, len, cpumask_size()); -+ -+ if (copy_to_user(user_mask_ptr, mask, retlen)) -+ ret = -EFAULT; -+ else -+ ret = retlen; -+ } -+ free_cpumask_var(mask); -+ -+ return ret; -+} -+ -+/** -+ * sys_sched_yield - yield the current processor to other threads. -+ * -+ * This function yields the current CPU to other tasks. It does this by -+ * scheduling away the current task. If it still has the earliest deadline -+ * it will be scheduled again as the next task. -+ * -+ * Return: 0. -+ */ -+static void do_sched_yield(void) -+{ -+ struct rq *rq; -+ struct rq_flags rf; -+ -+ if (!sched_yield_type) -+ return; -+ -+ rq = this_rq_lock_irq(&rf); -+ -+ schedstat_inc(rq->yld_count); -+ -+ if (1 == sched_yield_type) { -+ if (!rt_task(current)) -+ do_sched_yield_type_1(current, rq); -+ } else if (2 == sched_yield_type) { -+ if (rq->nr_running > 1) -+ rq->skip = current; -+ } -+ -+ /* -+ * Since we are going to call schedule() anyway, there's -+ * no need to preempt or enable interrupts: -+ */ -+ preempt_disable(); -+ raw_spin_unlock(&rq->lock); -+ sched_preempt_enable_no_resched(); -+ -+ schedule(); -+} -+ -+SYSCALL_DEFINE0(sched_yield) -+{ -+ do_sched_yield(); -+ return 0; -+} -+ -+#ifndef CONFIG_PREEMPTION -+int __sched _cond_resched(void) -+{ -+ if (should_resched(0)) { -+ preempt_schedule_common(); -+ return 1; -+ } -+ rcu_all_qs(); -+ return 0; -+} -+EXPORT_SYMBOL(_cond_resched); -+#endif -+ -+/* -+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, -+ * call schedule, and on return reacquire the lock. 
-+ * -+ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level -+ * operations here to prevent schedule() from being called twice (once via -+ * spin_unlock(), once by hand). -+ */ -+int __cond_resched_lock(spinlock_t *lock) -+{ -+ int resched = should_resched(PREEMPT_LOCK_OFFSET); -+ int ret = 0; -+ -+ lockdep_assert_held(lock); -+ -+ if (spin_needbreak(lock) || resched) { -+ spin_unlock(lock); -+ if (resched) -+ preempt_schedule_common(); -+ else -+ cpu_relax(); -+ ret = 1; -+ spin_lock(lock); -+ } -+ return ret; -+} -+EXPORT_SYMBOL(__cond_resched_lock); -+ -+/** -+ * yield - yield the current processor to other threads. -+ * -+ * Do not ever use this function, there's a 99% chance you're doing it wrong. -+ * -+ * The scheduler is at all times free to pick the calling task as the most -+ * eligible task to run, if removing the yield() call from your code breaks -+ * it, its already broken. -+ * -+ * Typical broken usage is: -+ * -+ * while (!event) -+ * yield(); -+ * -+ * where one assumes that yield() will let 'the other' process run that will -+ * make event true. If the current task is a SCHED_FIFO task that will never -+ * happen. Never use yield() as a progress guarantee!! -+ * -+ * If you want to use yield() to wait for something, use wait_event(). -+ * If you want to use yield() to be 'nice' for others, use cond_resched(). -+ * If you still want to use yield(), do not! -+ */ -+void __sched yield(void) -+{ -+ set_current_state(TASK_RUNNING); -+ do_sched_yield(); -+} -+EXPORT_SYMBOL(yield); -+ -+/** -+ * yield_to - yield the current processor to another thread in -+ * your thread group, or accelerate that thread toward the -+ * processor it's on. -+ * @p: target task -+ * @preempt: whether task preemption is allowed or not -+ * -+ * It's the caller's job to ensure that the target task struct -+ * can't go away on us before we can do any checks. -+ * -+ * In Alt schedule FW, yield_to is not supported. 
-+ * -+ * Return: -+ * true (>0) if we indeed boosted the target task. -+ * false (0) if we failed to boost the target. -+ * -ESRCH if there's no task to yield to. -+ */ -+int __sched yield_to(struct task_struct *p, bool preempt) -+{ -+ return 0; -+} -+EXPORT_SYMBOL_GPL(yield_to); -+ -+int io_schedule_prepare(void) -+{ -+ int old_iowait = current->in_iowait; -+ -+ current->in_iowait = 1; -+ blk_schedule_flush_plug(current); -+ -+ return old_iowait; -+} -+ -+void io_schedule_finish(int token) -+{ -+ current->in_iowait = token; -+} -+ -+/* -+ * This task is about to go to sleep on IO. Increment rq->nr_iowait so -+ * that process accounting knows that this is a task in IO wait state. -+ * -+ * But don't do that if it is a deliberate, throttling IO wait (this task -+ * has set its backing_dev_info: the queue against which it should throttle) -+ */ -+ -+long __sched io_schedule_timeout(long timeout) -+{ -+ int token; -+ long ret; -+ -+ token = io_schedule_prepare(); -+ ret = schedule_timeout(timeout); -+ io_schedule_finish(token); -+ -+ return ret; -+} -+EXPORT_SYMBOL(io_schedule_timeout); -+ -+void __sched io_schedule(void) -+{ -+ int token; -+ -+ token = io_schedule_prepare(); -+ schedule(); -+ io_schedule_finish(token); -+} -+EXPORT_SYMBOL(io_schedule); -+ -+/** -+ * sys_sched_get_priority_max - return maximum RT priority. -+ * @policy: scheduling class. -+ * -+ * Return: On success, this syscall returns the maximum -+ * rt_priority that can be used by a given scheduling class. -+ * On failure, a negative error code is returned. -+ */ -+SYSCALL_DEFINE1(sched_get_priority_max, int, policy) -+{ -+ int ret = -EINVAL; -+ -+ switch (policy) { -+ case SCHED_FIFO: -+ case SCHED_RR: -+ ret = MAX_USER_RT_PRIO-1; -+ break; -+ case SCHED_NORMAL: -+ case SCHED_BATCH: -+ case SCHED_IDLE: -+ ret = 0; -+ break; -+ } -+ return ret; -+} -+ -+/** -+ * sys_sched_get_priority_min - return minimum RT priority. -+ * @policy: scheduling class. 
-+ * -+ * Return: On success, this syscall returns the minimum -+ * rt_priority that can be used by a given scheduling class. -+ * On failure, a negative error code is returned. -+ */ -+SYSCALL_DEFINE1(sched_get_priority_min, int, policy) -+{ -+ int ret = -EINVAL; -+ -+ switch (policy) { -+ case SCHED_FIFO: -+ case SCHED_RR: -+ ret = 1; -+ break; -+ case SCHED_NORMAL: -+ case SCHED_BATCH: -+ case SCHED_IDLE: -+ ret = 0; -+ break; -+ } -+ return ret; -+} -+ -+static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) -+{ -+ struct task_struct *p; -+ int retval; -+ -+ alt_sched_debug(); -+ -+ if (pid < 0) -+ return -EINVAL; -+ -+ retval = -ESRCH; -+ rcu_read_lock(); -+ p = find_process_by_pid(pid); -+ if (!p) -+ goto out_unlock; -+ -+ retval = security_task_getscheduler(p); -+ if (retval) -+ goto out_unlock; -+ rcu_read_unlock(); -+ -+ *t = ns_to_timespec64(sched_timeslice_ns); -+ return 0; -+ -+out_unlock: -+ rcu_read_unlock(); -+ return retval; -+} -+ -+/** -+ * sys_sched_rr_get_interval - return the default timeslice of a process. -+ * @pid: pid of the process. -+ * @interval: userspace pointer to the timeslice value. -+ * -+ * -+ * Return: On success, 0 and the timeslice is in @interval. Otherwise, -+ * an error code. 
-+ */ -+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, -+ struct __kernel_timespec __user *, interval) -+{ -+ struct timespec64 t; -+ int retval = sched_rr_get_interval(pid, &t); -+ -+ if (retval == 0) -+ retval = put_timespec64(&t, interval); -+ -+ return retval; -+} -+ -+#ifdef CONFIG_COMPAT_32BIT_TIME -+SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, -+ struct old_timespec32 __user *, interval) -+{ -+ struct timespec64 t; -+ int retval = sched_rr_get_interval(pid, &t); -+ -+ if (retval == 0) -+ retval = put_old_timespec32(&t, interval); -+ return retval; -+} -+#endif -+ -+void sched_show_task(struct task_struct *p) -+{ -+ unsigned long free = 0; -+ int ppid; -+ -+ if (!try_get_task_stack(p)) -+ return; -+ -+ pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p)); -+ -+ if (p->state == TASK_RUNNING) -+ pr_cont(" running task "); -+#ifdef CONFIG_DEBUG_STACK_USAGE -+ free = stack_not_used(p); -+#endif -+ ppid = 0; -+ rcu_read_lock(); -+ if (pid_alive(p)) -+ ppid = task_pid_nr(rcu_dereference(p->real_parent)); -+ rcu_read_unlock(); -+ pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n", -+ free, task_pid_nr(p), ppid, -+ (unsigned long)task_thread_info(p)->flags); -+ -+ print_worker_info(KERN_INFO, p); -+ show_stack(p, NULL, KERN_INFO); -+ put_task_stack(p); -+} -+EXPORT_SYMBOL_GPL(sched_show_task); -+ -+static inline bool -+state_filter_match(unsigned long state_filter, struct task_struct *p) -+{ -+ /* no filter, everything matches */ -+ if (!state_filter) -+ return true; -+ -+ /* filter, but doesn't match */ -+ if (!(p->state & state_filter)) -+ return false; -+ -+ /* -+ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows -+ * TASK_KILLABLE). 
-+ */ -+ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) -+ return false; -+ -+ return true; -+} -+ -+ -+void show_state_filter(unsigned long state_filter) -+{ -+ struct task_struct *g, *p; -+ -+ rcu_read_lock(); -+ for_each_process_thread(g, p) { -+ /* -+ * reset the NMI-timeout, listing all files on a slow -+ * console might take a lot of time: -+ * Also, reset softlockup watchdogs on all CPUs, because -+ * another CPU might be blocked waiting for us to process -+ * an IPI. -+ */ -+ touch_nmi_watchdog(); -+ touch_all_softlockup_watchdogs(); -+ if (state_filter_match(state_filter, p)) -+ sched_show_task(p); -+ } -+ -+#ifdef CONFIG_SCHED_DEBUG -+ /* TODO: Alt schedule FW should support this -+ if (!state_filter) -+ sysrq_sched_debug_show(); -+ */ -+#endif -+ rcu_read_unlock(); -+ /* -+ * Only show locks if all tasks are dumped: -+ */ -+ if (!state_filter) -+ debug_show_all_locks(); -+} -+ -+void dump_cpu_task(int cpu) -+{ -+ pr_info("Task dump for CPU %d:\n", cpu); -+ sched_show_task(cpu_curr(cpu)); -+} -+ -+/** -+ * init_idle - set up an idle thread for a given CPU -+ * @idle: task in question -+ * @cpu: CPU the idle task belongs to -+ * -+ * NOTE: this function does not set the idle thread's NEED_RESCHED -+ * flag, to make booting more robust. -+ */ -+void init_idle(struct task_struct *idle, int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ __sched_fork(0, idle); -+ -+ raw_spin_lock_irqsave(&idle->pi_lock, flags); -+ raw_spin_lock(&rq->lock); -+ update_rq_clock(rq); -+ -+ idle->last_ran = rq->clock_task; -+ idle->state = TASK_RUNNING; -+ idle->flags |= PF_IDLE; -+ sched_queue_init_idle(rq, idle); -+ -+ scs_task_reset(idle); -+ kasan_unpoison_task_stack(idle); -+ -+#ifdef CONFIG_SMP -+ /* -+ * It's possible that init_idle() gets called multiple times on a task, -+ * in that case do_set_cpus_allowed() will not do the right thing. -+ * -+ * And since this is boot we can forgo the serialisation. 
-+ */ -+ set_cpus_allowed_common(idle, cpumask_of(cpu)); -+#endif -+ -+ /* Silence PROVE_RCU */ -+ rcu_read_lock(); -+ __set_task_cpu(idle, cpu); -+ rcu_read_unlock(); -+ -+ rq->idle = idle; -+ rcu_assign_pointer(rq->curr, idle); -+ idle->on_cpu = 1; -+ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&idle->pi_lock, flags); -+ -+ /* Set the preempt count _outside_ the spinlocks! */ -+ init_idle_preempt_count(idle, cpu); -+ -+ ftrace_graph_init_idle_task(idle, cpu); -+ vtime_init_idle(idle, cpu); -+#ifdef CONFIG_SMP -+ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); -+#endif -+} -+ -+#ifdef CONFIG_SMP -+ -+int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, -+ const struct cpumask __maybe_unused *trial) -+{ -+ return 1; -+} -+ -+int task_can_attach(struct task_struct *p, -+ const struct cpumask *cs_cpus_allowed) -+{ -+ int ret = 0; -+ -+ /* -+ * Kthreads which disallow setaffinity shouldn't be moved -+ * to a new cpuset; we don't want to change their CPU -+ * affinity and isolating such threads by their set of -+ * allowed nodes is unnecessary. Thus, cpusets are not -+ * applicable for such threads. This prevents checking for -+ * success of set_cpus_allowed_ptr() on all attached tasks -+ * before cpus_mask may be changed. -+ */ -+ if (p->flags & PF_NO_SETAFFINITY) -+ ret = -EINVAL; -+ -+ return ret; -+} -+ -+bool sched_smp_initialized __read_mostly; -+ -+#ifdef CONFIG_HOTPLUG_CPU -+/* -+ * Ensures that the idle task is using init_mm right before its CPU goes -+ * offline. -+ */ -+void idle_task_exit(void) -+{ -+ struct mm_struct *mm = current->active_mm; -+ -+ BUG_ON(current != this_rq()->idle); -+ -+ if (mm != &init_mm) { -+ switch_mm(mm, &init_mm, current); -+ finish_arch_post_lock_switch(); -+ } -+ -+ /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ -+} -+ -+/* -+ * Migrate all tasks from the rq, sleeping tasks will be migrated by -+ * try_to_wake_up()->select_task_rq(). 
-+ * -+ * Called with rq->lock held even though we'er in stop_machine() and -+ * there's no concurrency possible, we hold the required locks anyway -+ * because of lock validation efforts. -+ */ -+static void migrate_tasks(struct rq *dead_rq) -+{ -+ struct rq *rq = dead_rq; -+ struct task_struct *p, *stop = rq->stop; -+ int count = 0; -+ -+ /* -+ * Fudge the rq selection such that the below task selection loop -+ * doesn't get stuck on the currently eligible stop task. -+ * -+ * We're currently inside stop_machine() and the rq is either stuck -+ * in the stop_machine_cpu_stop() loop, or we're executing this code, -+ * either way we should never end up calling schedule() until we're -+ * done here. -+ */ -+ rq->stop = NULL; -+ -+ p = sched_rq_first_task(rq); -+ while (p != rq->idle) { -+ int dest_cpu; -+ -+ /* skip the running task */ -+ if (task_running(p) || 1 == p->nr_cpus_allowed) { -+ p = sched_rq_next_task(p, rq); -+ continue; -+ } -+ -+ /* -+ * Rules for changing task_struct::cpus_allowed are holding -+ * both pi_lock and rq->lock, such that holding either -+ * stabilizes the mask. -+ * -+ * Drop rq->lock is not quite as disastrous as it usually is -+ * because !cpu_active at this point, which means load-balance -+ * will not interfere. Also, stop-machine. -+ */ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_lock(&p->pi_lock); -+ raw_spin_lock(&rq->lock); -+ -+ /* -+ * Since we're inside stop-machine, _nothing_ should have -+ * changed the task, WARN if weird stuff happened, because in -+ * that case the above rq->lock drop is a fail too. -+ */ -+ if (WARN_ON(task_rq(p) != rq || !task_on_rq_queued(p))) { -+ raw_spin_unlock(&p->pi_lock); -+ p = sched_rq_next_task(p, rq); -+ continue; -+ } -+ -+ count++; -+ /* Find suitable destination for @next, with force if needed. 
*/ -+ dest_cpu = select_fallback_rq(dead_rq->cpu, p); -+ rq = __migrate_task(rq, p, dest_cpu); -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock(&p->pi_lock); -+ -+ rq = dead_rq; -+ raw_spin_lock(&rq->lock); -+ /* Check queued task all over from the header again */ -+ p = sched_rq_first_task(rq); -+ } -+ -+ rq->stop = stop; -+} -+ -+static void set_rq_offline(struct rq *rq) -+{ -+ if (rq->online) -+ rq->online = false; -+} -+#endif /* CONFIG_HOTPLUG_CPU */ -+ -+static void set_rq_online(struct rq *rq) -+{ -+ if (!rq->online) -+ rq->online = true; -+} -+ -+/* -+ * used to mark begin/end of suspend/resume: -+ */ -+static int num_cpus_frozen; -+ -+/* -+ * Update cpusets according to cpu_active mask. If cpusets are -+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper -+ * around partition_sched_domains(). -+ * -+ * If we come here as part of a suspend/resume, don't touch cpusets because we -+ * want to restore it back to its original state upon resume anyway. -+ */ -+static void cpuset_cpu_active(void) -+{ -+ if (cpuhp_tasks_frozen) { -+ /* -+ * num_cpus_frozen tracks how many CPUs are involved in suspend -+ * resume sequence. As long as this is not the last online -+ * operation in the resume sequence, just build a single sched -+ * domain, ignoring cpusets. -+ */ -+ partition_sched_domains(1, NULL, NULL); -+ if (--num_cpus_frozen) -+ return; -+ /* -+ * This is the last CPU online operation. So fall through and -+ * restore the original sched domains by considering the -+ * cpuset configurations. 
-+ */ -+ cpuset_force_rebuild(); -+ } -+ -+ cpuset_update_active_cpus(); -+} -+ -+static int cpuset_cpu_inactive(unsigned int cpu) -+{ -+ if (!cpuhp_tasks_frozen) { -+ cpuset_update_active_cpus(); -+ } else { -+ num_cpus_frozen++; -+ partition_sched_domains(1, NULL, NULL); -+ } -+ return 0; -+} -+ -+int sched_cpu_activate(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+#ifdef CONFIG_SCHED_SMT -+ /* -+ * When going up, increment the number of cores with SMT present. -+ */ -+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) -+ static_branch_inc_cpuslocked(&sched_smt_present); -+#endif -+ set_cpu_active(cpu, true); -+ -+ if (sched_smp_initialized) -+ cpuset_cpu_active(); -+ -+ /* -+ * Put the rq online, if not already. This happens: -+ * -+ * 1) In the early boot process, because we build the real domains -+ * after all cpus have been brought up. -+ * -+ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the -+ * domains. -+ */ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ set_rq_online(rq); -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+ return 0; -+} -+ -+int sched_cpu_deactivate(unsigned int cpu) -+{ -+ int ret; -+ -+ set_cpu_active(cpu, false); -+ /* -+ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU -+ * users of this state to go away such that all new such users will -+ * observe it. -+ * -+ * Do sync before park smpboot threads to take care the rcu boost case. -+ */ -+ synchronize_rcu(); -+ -+#ifdef CONFIG_SCHED_SMT -+ /* -+ * When going down, decrement the number of cores with SMT present. 
-+ */ -+ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) { -+ static_branch_dec_cpuslocked(&sched_smt_present); -+ if (!static_branch_likely(&sched_smt_present)) -+ cpumask_clear(&sched_sg_idle_mask); -+ } -+#endif -+ -+ if (!sched_smp_initialized) -+ return 0; -+ -+ ret = cpuset_cpu_inactive(cpu); -+ if (ret) { -+ set_cpu_active(cpu, true); -+ return ret; -+ } -+ return 0; -+} -+ -+static void sched_rq_cpu_starting(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ -+ rq->calc_load_update = calc_load_update; -+} -+ -+int sched_cpu_starting(unsigned int cpu) -+{ -+ sched_rq_cpu_starting(cpu); -+ sched_tick_start(cpu); -+ return 0; -+} -+ -+#ifdef CONFIG_HOTPLUG_CPU -+int sched_cpu_dying(unsigned int cpu) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ /* Handle pending wakeups and then migrate everything off */ -+ sched_tick_stop(cpu); -+ -+ raw_spin_lock_irqsave(&rq->lock, flags); -+ set_rq_offline(rq); -+ migrate_tasks(rq); -+ raw_spin_unlock_irqrestore(&rq->lock, flags); -+ -+ hrtick_clear(rq); -+ return 0; -+} -+#endif -+ -+#ifdef CONFIG_SMP -+static void sched_init_topology_cpumask_early(void) -+{ -+ int cpu, level; -+ cpumask_t *tmp; -+ -+ for_each_possible_cpu(cpu) { -+ for (level = 0; level < NR_CPU_AFFINITY_CHK_LEVEL; level++) { -+ tmp = &(per_cpu(sched_cpu_affinity_masks, cpu)[level]); -+ cpumask_copy(tmp, cpu_possible_mask); -+ cpumask_clear_cpu(cpu, tmp); -+ } -+ per_cpu(sched_cpu_llc_mask, cpu) = -+ &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); -+ per_cpu(sched_cpu_affinity_end_mask, cpu) = -+ &(per_cpu(sched_cpu_affinity_masks, cpu)[1]); -+ /*per_cpu(sd_llc_id, cpu) = cpu;*/ -+ } -+} -+ -+#define TOPOLOGY_CPUMASK(name, mask, last) \ -+ if (cpumask_and(chk, chk, mask)) \ -+ printk(KERN_INFO "sched: cpu#%02d affinity mask: 0x%08lx - "#name,\ -+ cpu, (chk++)->bits[0]); \ -+ if (!last) \ -+ cpumask_complement(chk, mask) -+ -+static void sched_init_topology_cpumask(void) -+{ -+ int cpu; -+ cpumask_t *chk; -+ -+ for_each_online_cpu(cpu) 
{ -+ /* take chance to reset time slice for idle tasks */ -+ cpu_rq(cpu)->idle->time_slice = sched_timeslice_ns; -+ -+ chk = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); -+ -+ cpumask_complement(chk, cpumask_of(cpu)); -+#ifdef CONFIG_SCHED_SMT -+ TOPOLOGY_CPUMASK(smt, topology_sibling_cpumask(cpu), false); -+#endif -+ per_cpu(sd_llc_id, cpu) = cpumask_first(cpu_coregroup_mask(cpu)); -+ per_cpu(sched_cpu_llc_mask, cpu) = chk; -+ TOPOLOGY_CPUMASK(coregroup, cpu_coregroup_mask(cpu), false); -+ -+ TOPOLOGY_CPUMASK(core, topology_core_cpumask(cpu), false); -+ -+ TOPOLOGY_CPUMASK(others, cpu_online_mask, true); -+ -+ per_cpu(sched_cpu_affinity_end_mask, cpu) = chk; -+ printk(KERN_INFO "sched: cpu#%02d llc_id = %d, llc_mask idx = %d\n", -+ cpu, per_cpu(sd_llc_id, cpu), -+ (int) (per_cpu(sched_cpu_llc_mask, cpu) - -+ &(per_cpu(sched_cpu_affinity_masks, cpu)[0]))); -+ } -+} -+#endif -+ -+void __init sched_init_smp(void) -+{ -+ /* Move init over to a non-isolated CPU */ -+ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) -+ BUG(); -+ -+ sched_init_topology_cpumask(); -+ -+ sched_smp_initialized = true; -+} -+#else -+void __init sched_init_smp(void) -+{ -+ cpu_rq(0)->idle->time_slice = sched_timeslice_ns; -+} -+#endif /* CONFIG_SMP */ -+ -+int in_sched_functions(unsigned long addr) -+{ -+ return in_lock_functions(addr) || -+ (addr >= (unsigned long)__sched_text_start -+ && addr < (unsigned long)__sched_text_end); -+} -+ -+#ifdef CONFIG_CGROUP_SCHED -+/* task group related information */ -+struct task_group { -+ struct cgroup_subsys_state css; -+ -+ struct rcu_head rcu; -+ struct list_head list; -+ -+ struct task_group *parent; -+ struct list_head siblings; -+ struct list_head children; -+}; -+ -+/* -+ * Default task group. -+ * Every task in system belongs to this group at bootup. 
-+ */ -+struct task_group root_task_group; -+LIST_HEAD(task_groups); -+ -+/* Cacheline aligned slab cache for task_group */ -+static struct kmem_cache *task_group_cache __read_mostly; -+#endif /* CONFIG_CGROUP_SCHED */ -+ -+void __init sched_init(void) -+{ -+ int i; -+ struct rq *rq; -+ -+ printk(KERN_INFO ALT_SCHED_VERSION_MSG); -+ -+ wait_bit_init(); -+ -+#ifdef CONFIG_SMP -+ for (i = 0; i < SCHED_BITS; i++) -+ cpumask_copy(&sched_rq_watermark[i], cpu_present_mask); -+#endif -+ -+#ifdef CONFIG_CGROUP_SCHED -+ task_group_cache = KMEM_CACHE(task_group, 0); -+ -+ list_add(&root_task_group.list, &task_groups); -+ INIT_LIST_HEAD(&root_task_group.children); -+ INIT_LIST_HEAD(&root_task_group.siblings); -+#endif /* CONFIG_CGROUP_SCHED */ -+ for_each_possible_cpu(i) { -+ rq = cpu_rq(i); -+ -+ sched_queue_init(rq); -+ rq->watermark = IDLE_WM; -+ rq->skip = NULL; -+ -+ raw_spin_lock_init(&rq->lock); -+ rq->nr_running = rq->nr_uninterruptible = 0; -+ rq->calc_load_active = 0; -+ rq->calc_load_update = jiffies + LOAD_FREQ; -+#ifdef CONFIG_SMP -+ rq->online = false; -+ rq->cpu = i; -+ -+#ifdef CONFIG_SCHED_SMT -+ rq->active_balance = 0; -+#endif -+ -+#ifdef CONFIG_NO_HZ_COMMON -+ rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func); -+#endif -+#endif /* CONFIG_SMP */ -+ rq->nr_switches = 0; -+ -+ hrtick_rq_init(rq); -+ atomic_set(&rq->nr_iowait, 0); -+ } -+#ifdef CONFIG_SMP -+ /* Set rq->online for cpu 0 */ -+ cpu_rq(0)->online = true; -+#endif -+ /* -+ * The boot idle thread does lazy MMU switching as well: -+ */ -+ mmgrab(&init_mm); -+ enter_lazy_tlb(&init_mm, current); -+ -+ /* -+ * Make us the idle thread. Technically, schedule() should not be -+ * called from this thread, however somewhere below it might be, -+ * but because we are the idle thread, we just pick up running again -+ * when this runqueue becomes "idle". 
-+ */ -+ init_idle(current, smp_processor_id()); -+ -+ calc_load_update = jiffies + LOAD_FREQ; -+ -+#ifdef CONFIG_SMP -+ idle_thread_set_boot_cpu(); -+ -+ sched_init_topology_cpumask_early(); -+#endif /* SMP */ -+ -+ init_schedstats(); -+ -+ psi_init(); -+} -+ -+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP -+static inline int preempt_count_equals(int preempt_offset) -+{ -+ int nested = preempt_count() + rcu_preempt_depth(); -+ -+ return (nested == preempt_offset); -+} -+ -+void __might_sleep(const char *file, int line, int preempt_offset) -+{ -+ /* -+ * Blocking primitives will set (and therefore destroy) current->state, -+ * since we will exit with TASK_RUNNING make sure we enter with it, -+ * otherwise we will destroy state. -+ */ -+ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, -+ "do not call blocking ops when !TASK_RUNNING; " -+ "state=%lx set at [<%p>] %pS\n", -+ current->state, -+ (void *)current->task_state_change, -+ (void *)current->task_state_change); -+ -+ ___might_sleep(file, line, preempt_offset); -+} -+EXPORT_SYMBOL(__might_sleep); -+ -+void ___might_sleep(const char *file, int line, int preempt_offset) -+{ -+ /* Ratelimiting timestamp: */ -+ static unsigned long prev_jiffy; -+ -+ unsigned long preempt_disable_ip; -+ -+ /* WARN_ON_ONCE() by default, no rate limit required: */ -+ rcu_sleep_check(); -+ -+ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && -+ !is_idle_task(current) && !current->non_block_count) || -+ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || -+ oops_in_progress) -+ return; -+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) -+ return; -+ prev_jiffy = jiffies; -+ -+ /* Save this before calling printk(), since that will clobber it: */ -+ preempt_disable_ip = get_preempt_disable_ip(current); -+ -+ printk(KERN_ERR -+ "BUG: sleeping function called from invalid context at %s:%d\n", -+ file, line); -+ printk(KERN_ERR -+ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, 
name: %s\n", -+ in_atomic(), irqs_disabled(), current->non_block_count, -+ current->pid, current->comm); -+ -+ if (task_stack_end_corrupted(current)) -+ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); -+ -+ debug_show_held_locks(current); -+ if (irqs_disabled()) -+ print_irqtrace_events(current); -+#ifdef CONFIG_DEBUG_PREEMPT -+ if (!preempt_count_equals(preempt_offset)) { -+ pr_err("Preemption disabled at:"); -+ print_ip_sym(KERN_ERR, preempt_disable_ip); -+ } -+#endif -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+EXPORT_SYMBOL(___might_sleep); -+ -+void __cant_sleep(const char *file, int line, int preempt_offset) -+{ -+ static unsigned long prev_jiffy; -+ -+ if (irqs_disabled()) -+ return; -+ -+ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) -+ return; -+ -+ if (preempt_count() > preempt_offset) -+ return; -+ -+ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) -+ return; -+ prev_jiffy = jiffies; -+ -+ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); -+ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", -+ in_atomic(), irqs_disabled(), -+ current->pid, current->comm); -+ -+ debug_show_held_locks(current); -+ dump_stack(); -+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); -+} -+EXPORT_SYMBOL_GPL(__cant_sleep); -+#endif -+ -+#ifdef CONFIG_MAGIC_SYSRQ -+void normalize_rt_tasks(void) -+{ -+ struct task_struct *g, *p; -+ struct sched_attr attr = { -+ .sched_policy = SCHED_NORMAL, -+ }; -+ -+ read_lock(&tasklist_lock); -+ for_each_process_thread(g, p) { -+ /* -+ * Only normalize user tasks: -+ */ -+ if (p->flags & PF_KTHREAD) -+ continue; -+ -+ if (!rt_task(p)) { -+ /* -+ * Renice negative nice level userspace -+ * tasks back to 0: -+ */ -+ if (task_nice(p) < 0) -+ set_user_nice(p, 0); -+ continue; -+ } -+ -+ __sched_setscheduler(p, &attr, false, false); -+ } -+ read_unlock(&tasklist_lock); -+} -+#endif /* CONFIG_MAGIC_SYSRQ */ -+ -+#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) -+/* 
-+ * These functions are only useful for the IA64 MCA handling, or kdb. -+ * -+ * They can only be called when the whole system has been -+ * stopped - every CPU needs to be quiescent, and no scheduling -+ * activity can take place. Using them for anything else would -+ * be a serious bug, and as a result, they aren't even visible -+ * under any other configuration. -+ */ -+ -+/** -+ * curr_task - return the current task for a given CPU. -+ * @cpu: the processor in question. -+ * -+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! -+ * -+ * Return: The current task for @cpu. -+ */ -+struct task_struct *curr_task(int cpu) -+{ -+ return cpu_curr(cpu); -+} -+ -+#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ -+ -+#ifdef CONFIG_IA64 -+/** -+ * ia64_set_curr_task - set the current task for a given CPU. -+ * @cpu: the processor in question. -+ * @p: the task pointer to set. -+ * -+ * Description: This function must only be used when non-maskable interrupts -+ * are serviced on a separate stack. It allows the architecture to switch the -+ * notion of the current task on a CPU in a non-blocking manner. This function -+ * must be called with all CPU's synchronised, and interrupts disabled, the -+ * and caller must save the original value of the current task (see -+ * curr_task() above) and restore that value before reenabling interrupts and -+ * re-starting the system. -+ * -+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 
-+ */ -+void ia64_set_curr_task(int cpu, struct task_struct *p) -+{ -+ cpu_curr(cpu) = p; -+} -+ -+#endif -+ -+#ifdef CONFIG_CGROUP_SCHED -+static void sched_free_group(struct task_group *tg) -+{ -+ kmem_cache_free(task_group_cache, tg); -+} -+ -+/* allocate runqueue etc for a new task group */ -+struct task_group *sched_create_group(struct task_group *parent) -+{ -+ struct task_group *tg; -+ -+ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); -+ if (!tg) -+ return ERR_PTR(-ENOMEM); -+ -+ return tg; -+} -+ -+void sched_online_group(struct task_group *tg, struct task_group *parent) -+{ -+} -+ -+/* rcu callback to free various structures associated with a task group */ -+static void sched_free_group_rcu(struct rcu_head *rhp) -+{ -+ /* Now it should be safe to free those cfs_rqs */ -+ sched_free_group(container_of(rhp, struct task_group, rcu)); -+} -+ -+void sched_destroy_group(struct task_group *tg) -+{ -+ /* Wait for possible concurrent references to cfs_rqs complete */ -+ call_rcu(&tg->rcu, sched_free_group_rcu); -+} -+ -+void sched_offline_group(struct task_group *tg) -+{ -+} -+ -+static inline struct task_group *css_tg(struct cgroup_subsys_state *css) -+{ -+ return css ? 
container_of(css, struct task_group, css) : NULL; -+} -+ -+static struct cgroup_subsys_state * -+cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) -+{ -+ struct task_group *parent = css_tg(parent_css); -+ struct task_group *tg; -+ -+ if (!parent) { -+ /* This is early initialization for the top cgroup */ -+ return &root_task_group.css; -+ } -+ -+ tg = sched_create_group(parent); -+ if (IS_ERR(tg)) -+ return ERR_PTR(-ENOMEM); -+ return &tg->css; -+} -+ -+/* Expose task group only after completing cgroup initialization */ -+static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ struct task_group *parent = css_tg(css->parent); -+ -+ if (parent) -+ sched_online_group(tg, parent); -+ return 0; -+} -+ -+static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ -+ sched_offline_group(tg); -+} -+ -+static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ -+ /* -+ * Relies on the RCU grace period between css_released() and this. 
-+ */ -+ sched_free_group(tg); -+} -+ -+static void cpu_cgroup_fork(struct task_struct *task) -+{ -+} -+ -+static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) -+{ -+ return 0; -+} -+ -+static void cpu_cgroup_attach(struct cgroup_taskset *tset) -+{ -+} -+ -+static struct cftype cpu_legacy_files[] = { -+ { } /* Terminate */ -+}; -+ -+ -+static struct cftype cpu_files[] = { -+ { } /* terminate */ -+}; -+ -+static int cpu_extra_stat_show(struct seq_file *sf, -+ struct cgroup_subsys_state *css) -+{ -+ return 0; -+} -+ -+struct cgroup_subsys cpu_cgrp_subsys = { -+ .css_alloc = cpu_cgroup_css_alloc, -+ .css_online = cpu_cgroup_css_online, -+ .css_released = cpu_cgroup_css_released, -+ .css_free = cpu_cgroup_css_free, -+ .css_extra_stat_show = cpu_extra_stat_show, -+ .fork = cpu_cgroup_fork, -+ .can_attach = cpu_cgroup_can_attach, -+ .attach = cpu_cgroup_attach, -+ .legacy_cftypes = cpu_files, -+ .legacy_cftypes = cpu_legacy_files, -+ .dfl_cftypes = cpu_files, -+ .early_init = true, -+ .threaded = true, -+}; -+#endif /* CONFIG_CGROUP_SCHED */ -+ -+#undef CREATE_TRACE_POINTS -diff --git a/kernel/sched/alt_debug.c b/kernel/sched/alt_debug.c -new file mode 100644 -index 000000000000..1212a031700e ---- /dev/null -+++ b/kernel/sched/alt_debug.c -@@ -0,0 +1,31 @@ -+/* -+ * kernel/sched/alt_debug.c -+ * -+ * Print the alt scheduler debugging details -+ * -+ * Author: Alfred Chen -+ * Date : 2020 -+ */ -+#include "sched.h" -+ -+/* -+ * This allows printing both to /proc/sched_debug and -+ * to the console -+ */ -+#define SEQ_printf(m, x...) 
\ -+ do { \ -+ if (m) \ -+ seq_printf(m, x); \ -+ else \ -+ pr_cont(x); \ -+ } while (0) -+ -+void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, -+ struct seq_file *m) -+{ -+ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), -+ get_nr_threads(p)); -+} -+ -+void proc_sched_set_task(struct task_struct *p) -+{} -diff --git a/kernel/sched/alt_sched.h b/kernel/sched/alt_sched.h -new file mode 100644 -index 000000000000..99be2c51c88d ---- /dev/null -+++ b/kernel/sched/alt_sched.h -@@ -0,0 +1,555 @@ -+#ifndef ALT_SCHED_H -+#define ALT_SCHED_H -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+ -+#ifdef CONFIG_PARAVIRT -+# include -+#endif -+ -+#include "cpupri.h" -+ -+#ifdef CONFIG_SCHED_BMQ -+#include "bmq.h" -+#endif -+#ifdef CONFIG_SCHED_PDS -+#include "pds.h" -+#endif -+ -+/* task_struct::on_rq states: */ -+#define TASK_ON_RQ_QUEUED 1 -+#define TASK_ON_RQ_MIGRATING 2 -+ -+static inline int task_on_rq_queued(struct task_struct *p) -+{ -+ return p->on_rq == TASK_ON_RQ_QUEUED; -+} -+ -+static inline int task_on_rq_migrating(struct task_struct *p) -+{ -+ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; -+} -+ -+/* -+ * wake flags -+ */ -+#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ -+#define WF_FORK 0x02 /* child wakeup after fork */ -+#define WF_MIGRATED 0x04 /* internal use, task got migrated */ -+#define WF_ON_CPU 0x08 /* Wakee is on_rq */ -+ -+/* -+ * This is the main, per-CPU runqueue data structure. -+ * This data should only be modified by the local cpu. 
-+ */ -+struct rq { -+ /* runqueue lock: */ -+ raw_spinlock_t lock; -+ -+ struct task_struct __rcu *curr; -+ struct task_struct *idle, *stop, *skip; -+ struct mm_struct *prev_mm; -+ -+#ifdef CONFIG_SCHED_BMQ -+ struct bmq queue; -+#endif -+#ifdef CONFIG_SCHED_PDS -+ struct skiplist_node sl_header; -+#endif -+ unsigned long watermark; -+ -+ /* switch count */ -+ u64 nr_switches; -+ -+ atomic_t nr_iowait; -+ -+#ifdef CONFIG_MEMBARRIER -+ int membarrier_state; -+#endif -+ -+#ifdef CONFIG_SMP -+ int cpu; /* cpu of this runqueue */ -+ bool online; -+ -+ unsigned int ttwu_pending; -+ unsigned char nohz_idle_balance; -+ unsigned char idle_balance; -+ -+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ -+ struct sched_avg avg_irq; -+#endif -+ -+#ifdef CONFIG_SCHED_SMT -+ int active_balance; -+ struct cpu_stop_work active_balance_work; -+#endif -+#endif /* CONFIG_SMP */ -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+ u64 prev_irq_time; -+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ -+#ifdef CONFIG_PARAVIRT -+ u64 prev_steal_time; -+#endif /* CONFIG_PARAVIRT */ -+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING -+ u64 prev_steal_time_rq; -+#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ -+ -+ /* calc_load related fields */ -+ unsigned long calc_load_update; -+ long calc_load_active; -+ -+ u64 clock, last_tick; -+ u64 last_ts_switch; -+ u64 clock_task; -+ -+ unsigned long nr_running; -+ unsigned long nr_uninterruptible; -+ -+#ifdef CONFIG_SCHED_HRTICK -+#ifdef CONFIG_SMP -+ call_single_data_t hrtick_csd; -+#endif -+ struct hrtimer hrtick_timer; -+#endif -+ -+#ifdef CONFIG_SCHEDSTATS -+ -+ /* latency stats */ -+ struct sched_info rq_sched_info; -+ unsigned long long rq_cpu_time; -+ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? 
*/ -+ -+ /* sys_sched_yield() stats */ -+ unsigned int yld_count; -+ -+ /* schedule() stats */ -+ unsigned int sched_switch; -+ unsigned int sched_count; -+ unsigned int sched_goidle; -+ -+ /* try_to_wake_up() stats */ -+ unsigned int ttwu_count; -+ unsigned int ttwu_local; -+#endif /* CONFIG_SCHEDSTATS */ -+ -+#ifdef CONFIG_CPU_IDLE -+ /* Must be inspected within a rcu lock section */ -+ struct cpuidle_state *idle_state; -+#endif -+ -+#ifdef CONFIG_NO_HZ_COMMON -+#ifdef CONFIG_SMP -+ call_single_data_t nohz_csd; -+#endif -+ atomic_t nohz_flags; -+#endif /* CONFIG_NO_HZ_COMMON */ -+}; -+ -+extern unsigned long calc_load_update; -+extern atomic_long_t calc_load_tasks; -+ -+extern void calc_global_load_tick(struct rq *this_rq); -+extern long calc_load_fold_active(struct rq *this_rq, long adjust); -+ -+DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -+#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) -+#define this_rq() this_cpu_ptr(&runqueues) -+#define task_rq(p) cpu_rq(task_cpu(p)) -+#define cpu_curr(cpu) (cpu_rq(cpu)->curr) -+#define raw_rq() raw_cpu_ptr(&runqueues) -+ -+#ifdef CONFIG_SMP -+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) -+void register_sched_domain_sysctl(void); -+void unregister_sched_domain_sysctl(void); -+#else -+static inline void register_sched_domain_sysctl(void) -+{ -+} -+static inline void unregister_sched_domain_sysctl(void) -+{ -+} -+#endif -+ -+extern bool sched_smp_initialized; -+ -+enum { -+ BASE_CPU_AFFINITY_CHK_LEVEL = 1, -+#ifdef CONFIG_SCHED_SMT -+ SMT_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, -+#endif -+#ifdef CONFIG_SCHED_MC -+ MC_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, -+#endif -+ NR_CPU_AFFINITY_CHK_LEVEL -+}; -+ -+DECLARE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_CHK_LEVEL], sched_cpu_affinity_masks); -+ -+static inline int __best_mask_cpu(int cpu, const cpumask_t *cpumask, -+ const cpumask_t *mask) -+{ -+ while ((cpu = cpumask_any_and(cpumask, mask)) >= nr_cpu_ids) -+ mask++; -+ return cpu; -+} -+ -+static inline int 
best_mask_cpu(int cpu, const cpumask_t *cpumask) -+{ -+ return cpumask_test_cpu(cpu, cpumask)? cpu : -+ __best_mask_cpu(cpu, cpumask, &(per_cpu(sched_cpu_affinity_masks, cpu)[0])); -+} -+ -+extern void flush_smp_call_function_from_idle(void); -+ -+#else /* !CONFIG_SMP */ -+static inline void flush_smp_call_function_from_idle(void) { } -+#endif -+ -+#ifndef arch_scale_freq_tick -+static __always_inline -+void arch_scale_freq_tick(void) -+{ -+} -+#endif -+ -+#ifndef arch_scale_freq_capacity -+static __always_inline -+unsigned long arch_scale_freq_capacity(int cpu) -+{ -+ return SCHED_CAPACITY_SCALE; -+} -+#endif -+ -+static inline u64 __rq_clock_broken(struct rq *rq) -+{ -+ return READ_ONCE(rq->clock); -+} -+ -+static inline u64 rq_clock(struct rq *rq) -+{ -+ /* -+ * Relax lockdep_assert_held() checking as in VRQ, call to -+ * sched_info_xxxx() may not held rq->lock -+ * lockdep_assert_held(&rq->lock); -+ */ -+ return rq->clock; -+} -+ -+static inline u64 rq_clock_task(struct rq *rq) -+{ -+ /* -+ * Relax lockdep_assert_held() checking as in VRQ, call to -+ * sched_info_xxxx() may not held rq->lock -+ * lockdep_assert_held(&rq->lock); -+ */ -+ return rq->clock_task; -+} -+ -+/* -+ * {de,en}queue flags: -+ * -+ * DEQUEUE_SLEEP - task is no longer runnable -+ * ENQUEUE_WAKEUP - task just became runnable -+ * -+ */ -+ -+#define DEQUEUE_SLEEP 0x01 -+ -+#define ENQUEUE_WAKEUP 0x01 -+ -+ -+/* -+ * Below are scheduler API which using in other kernel code -+ * It use the dummy rq_flags -+ * ToDo : BMQ need to support these APIs for compatibility with mainline -+ * scheduler code. 
-+ */ -+struct rq_flags { -+ unsigned long flags; -+}; -+ -+struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(rq->lock); -+ -+struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) -+ __acquires(p->pi_lock) -+ __acquires(rq->lock); -+ -+static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock(&rq->lock); -+} -+ -+static inline void -+task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) -+ __releases(rq->lock) -+ __releases(p->pi_lock) -+{ -+ raw_spin_unlock(&rq->lock); -+ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); -+} -+ -+static inline void -+rq_unlock_irq(struct rq *rq, struct rq_flags *rf) -+ __releases(rq->lock) -+{ -+ raw_spin_unlock_irq(&rq->lock); -+} -+ -+static inline struct rq * -+this_rq_lock_irq(struct rq_flags *rf) -+ __acquires(rq->lock) -+{ -+ struct rq *rq; -+ -+ local_irq_disable(); -+ rq = this_rq(); -+ raw_spin_lock(&rq->lock); -+ -+ return rq; -+} -+ -+static inline int task_current(struct rq *rq, struct task_struct *p) -+{ -+ return rq->curr == p; -+} -+ -+static inline bool task_running(struct task_struct *p) -+{ -+ return p->on_cpu; -+} -+ -+extern struct static_key_false sched_schedstats; -+ -+#ifdef CONFIG_CPU_IDLE -+static inline void idle_set_state(struct rq *rq, -+ struct cpuidle_state *idle_state) -+{ -+ rq->idle_state = idle_state; -+} -+ -+static inline struct cpuidle_state *idle_get_state(struct rq *rq) -+{ -+ WARN_ON(!rcu_read_lock_held()); -+ return rq->idle_state; -+} -+#else -+static inline void idle_set_state(struct rq *rq, -+ struct cpuidle_state *idle_state) -+{ -+} -+ -+static inline struct cpuidle_state *idle_get_state(struct rq *rq) -+{ -+ return NULL; -+} -+#endif -+ -+static inline int cpu_of(const struct rq *rq) -+{ -+#ifdef CONFIG_SMP -+ return rq->cpu; -+#else -+ return 0; -+#endif -+} -+ -+#include "stats.h" -+ -+#ifdef CONFIG_NO_HZ_COMMON -+#define NOHZ_BALANCE_KICK_BIT 0 
-+#define NOHZ_STATS_KICK_BIT 1 -+ -+#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT) -+#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT) -+ -+#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK) -+ -+#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) -+ -+/* TODO: needed? -+extern void nohz_balance_exit_idle(struct rq *rq); -+#else -+static inline void nohz_balance_exit_idle(struct rq *rq) { } -+*/ -+#endif -+ -+#ifdef CONFIG_IRQ_TIME_ACCOUNTING -+struct irqtime { -+ u64 total; -+ u64 tick_delta; -+ u64 irq_start_time; -+ struct u64_stats_sync sync; -+}; -+ -+DECLARE_PER_CPU(struct irqtime, cpu_irqtime); -+ -+/* -+ * Returns the irqtime minus the softirq time computed by ksoftirqd. -+ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime -+ * and never move forward. -+ */ -+static inline u64 irq_time_read(int cpu) -+{ -+ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); -+ unsigned int seq; -+ u64 total; -+ -+ do { -+ seq = __u64_stats_fetch_begin(&irqtime->sync); -+ total = irqtime->total; -+ } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); -+ -+ return total; -+} -+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ -+ -+#ifdef CONFIG_CPU_FREQ -+DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); -+ -+/** -+ * cpufreq_update_util - Take a note about CPU utilization changes. -+ * @rq: Runqueue to carry out the update for. -+ * @flags: Update reason flags. -+ * -+ * This function is called by the scheduler on the CPU whose utilization is -+ * being updated. -+ * -+ * It can only be called from RCU-sched read-side critical sections. -+ * -+ * The way cpufreq is currently arranged requires it to evaluate the CPU -+ * performance state (frequency/voltage) on a regular basis to prevent it from -+ * being stuck in a completely inadequate performance level for too long. 
-+ * That is not guaranteed to happen if the updates are only triggered from CFS -+ * and DL, though, because they may not be coming in if only RT tasks are -+ * active all the time (or there are RT tasks only). -+ * -+ * As a workaround for that issue, this function is called periodically by the -+ * RT sched class to trigger extra cpufreq updates to prevent it from stalling, -+ * but that really is a band-aid. Going forward it should be replaced with -+ * solutions targeted more specifically at RT tasks. -+ */ -+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) -+{ -+ struct update_util_data *data; -+ -+ data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); -+ if (data) -+ data->func(data, rq_clock(rq), flags); -+} -+#else -+static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} -+#endif /* CONFIG_CPU_FREQ */ -+ -+#ifdef CONFIG_NO_HZ_FULL -+extern int __init sched_tick_offload_init(void); -+#else -+static inline int sched_tick_offload_init(void) { return 0; } -+#endif -+ -+#ifdef arch_scale_freq_capacity -+#ifndef arch_scale_freq_invariant -+#define arch_scale_freq_invariant() (true) -+#endif -+#else /* arch_scale_freq_capacity */ -+#define arch_scale_freq_invariant() (false) -+#endif -+ -+extern void schedule_idle(void); -+ -+/* -+ * !! For sched_setattr_nocheck() (kernel) only !! -+ * -+ * This is actually gross. :( -+ * -+ * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE -+ * tasks, but still be able to sleep. We need this on platforms that cannot -+ * atomically change clock frequency. Remove once fast switching will be -+ * available on such platforms. -+ * -+ * SUGOV stands for SchedUtil GOVernor. 
-+ */ -+#define SCHED_FLAG_SUGOV 0x10000000 -+ -+#ifdef CONFIG_MEMBARRIER -+/* -+ * The scheduler provides memory barriers required by membarrier between: -+ * - prior user-space memory accesses and store to rq->membarrier_state, -+ * - store to rq->membarrier_state and following user-space memory accesses. -+ * In the same way it provides those guarantees around store to rq->curr. -+ */ -+static inline void membarrier_switch_mm(struct rq *rq, -+ struct mm_struct *prev_mm, -+ struct mm_struct *next_mm) -+{ -+ int membarrier_state; -+ -+ if (prev_mm == next_mm) -+ return; -+ -+ membarrier_state = atomic_read(&next_mm->membarrier_state); -+ if (READ_ONCE(rq->membarrier_state) == membarrier_state) -+ return; -+ -+ WRITE_ONCE(rq->membarrier_state, membarrier_state); -+} -+#else -+static inline void membarrier_switch_mm(struct rq *rq, -+ struct mm_struct *prev_mm, -+ struct mm_struct *next_mm) -+{ -+} -+#endif -+ -+#ifdef CONFIG_NUMA -+extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); -+#else -+static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) -+{ -+ return nr_cpu_ids; -+} -+#endif -+ -+void swake_up_all_locked(struct swait_queue_head *q); -+void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); -+ -+#endif /* ALT_SCHED_H */ -diff --git a/kernel/sched/bmq.h b/kernel/sched/bmq.h -new file mode 100644 -index 000000000000..aff0bb30a884 ---- /dev/null -+++ b/kernel/sched/bmq.h -@@ -0,0 +1,20 @@ -+#ifndef BMQ_H -+#define BMQ_H -+ -+/* bits: -+ * RT(0-99), (Low prio adj range, nice width, high prio adj range) / 2, cpu idle task */ -+#define SCHED_BITS (MAX_RT_PRIO + NICE_WIDTH / 2 + MAX_PRIORITY_ADJ + 1) -+#define IDLE_TASK_SCHED_PRIO (SCHED_BITS - 1) -+ -+struct bmq { -+ DECLARE_BITMAP(bitmap, SCHED_BITS); -+ struct list_head heads[SCHED_BITS]; -+}; -+ -+ -+static inline int task_running_nice(struct task_struct *p) -+{ -+ return (p->prio + p->boost_prio > DEFAULT_PRIO + MAX_PRIORITY_ADJ); -+} -+ 
-+#endif -diff --git a/kernel/sched/bmq_imp.h b/kernel/sched/bmq_imp.h -new file mode 100644 -index 000000000000..ad9a7c448da7 ---- /dev/null -+++ b/kernel/sched/bmq_imp.h -@@ -0,0 +1,185 @@ -+#define ALT_SCHED_VERSION_MSG "sched/bmq: BMQ CPU Scheduler "ALT_SCHED_VERSION" by Alfred Chen.\n" -+ -+/* -+ * BMQ only routines -+ */ -+#define rq_switch_time(rq) ((rq)->clock - (rq)->last_ts_switch) -+#define boost_threshold(p) (sched_timeslice_ns >>\ -+ (15 - MAX_PRIORITY_ADJ - (p)->boost_prio)) -+ -+static inline void boost_task(struct task_struct *p) -+{ -+ int limit; -+ -+ switch (p->policy) { -+ case SCHED_NORMAL: -+ limit = -MAX_PRIORITY_ADJ; -+ break; -+ case SCHED_BATCH: -+ case SCHED_IDLE: -+ limit = 0; -+ break; -+ default: -+ return; -+ } -+ -+ if (p->boost_prio > limit) -+ p->boost_prio--; -+} -+ -+static inline void deboost_task(struct task_struct *p) -+{ -+ if (p->boost_prio < MAX_PRIORITY_ADJ) -+ p->boost_prio++; -+} -+ -+/* -+ * Common interfaces -+ */ -+static inline int task_sched_prio(struct task_struct *p, struct rq *rq) -+{ -+ return (p->prio < MAX_RT_PRIO)? 
p->prio : MAX_RT_PRIO / 2 + (p->prio + p->boost_prio) / 2; -+} -+ -+static inline void requeue_task(struct task_struct *p, struct rq *rq); -+ -+static inline void time_slice_expired(struct task_struct *p, struct rq *rq) -+{ -+ p->time_slice = sched_timeslice_ns; -+ -+ if (SCHED_FIFO != p->policy && task_on_rq_queued(p)) { -+ if (SCHED_RR != p->policy) -+ deboost_task(p); -+ requeue_task(p, rq); -+ } -+} -+ -+static inline void update_task_priodl(struct task_struct *p) {} -+ -+static inline unsigned long sched_queue_watermark(struct rq *rq) -+{ -+ return find_first_bit(rq->queue.bitmap, SCHED_BITS); -+} -+ -+static inline void sched_queue_init(struct rq *rq) -+{ -+ struct bmq *q = &rq->queue; -+ int i; -+ -+ bitmap_zero(q->bitmap, SCHED_BITS); -+ for(i = 0; i < SCHED_BITS; i++) -+ INIT_LIST_HEAD(&q->heads[i]); -+} -+ -+static inline void sched_queue_init_idle(struct rq *rq, struct task_struct *idle) -+{ -+ struct bmq *q = &rq->queue; -+ -+ idle->bmq_idx = IDLE_TASK_SCHED_PRIO; -+ INIT_LIST_HEAD(&q->heads[idle->bmq_idx]); -+ list_add(&idle->bmq_node, &q->heads[idle->bmq_idx]); -+ set_bit(idle->bmq_idx, q->bitmap); -+} -+ -+/* -+ * This routine used in bmq scheduler only which assume the idle task in the bmq -+ */ -+static inline struct task_struct *sched_rq_first_task(struct rq *rq) -+{ -+ unsigned long idx = find_first_bit(rq->queue.bitmap, SCHED_BITS); -+ const struct list_head *head = &rq->queue.heads[idx]; -+ -+ return list_first_entry(head, struct task_struct, bmq_node); -+} -+ -+static inline struct task_struct * -+sched_rq_next_task(struct task_struct *p, struct rq *rq) -+{ -+ unsigned long idx = p->bmq_idx; -+ struct list_head *head = &rq->queue.heads[idx]; -+ -+ if (list_is_last(&p->bmq_node, head)) { -+ idx = find_next_bit(rq->queue.bitmap, SCHED_BITS, idx + 1); -+ head = &rq->queue.heads[idx]; -+ -+ return list_first_entry(head, struct task_struct, bmq_node); -+ } -+ -+ return list_next_entry(p, bmq_node); -+} -+ -+#define __SCHED_DEQUEUE_TASK(p, rq, 
flags, func) \ -+ psi_dequeue(p, flags & DEQUEUE_SLEEP); \ -+ sched_info_dequeued(rq, p); \ -+ \ -+ list_del(&p->bmq_node); \ -+ if (list_empty(&rq->queue.heads[p->bmq_idx])) { \ -+ clear_bit(p->bmq_idx, rq->queue.bitmap);\ -+ func; \ -+ } -+ -+#define __SCHED_ENQUEUE_TASK(p, rq, flags) \ -+ sched_info_queued(rq, p); \ -+ psi_enqueue(p, flags); \ -+ \ -+ p->bmq_idx = task_sched_prio(p, rq); \ -+ list_add_tail(&p->bmq_node, &rq->queue.heads[p->bmq_idx]); \ -+ set_bit(p->bmq_idx, rq->queue.bitmap) -+ -+#define __SCHED_REQUEUE_TASK(p, rq, func) \ -+{ \ -+ int idx = task_sched_prio(p, rq); \ -+\ -+ list_del(&p->bmq_node); \ -+ list_add_tail(&p->bmq_node, &rq->queue.heads[idx]); \ -+ if (idx != p->bmq_idx) { \ -+ if (list_empty(&rq->queue.heads[p->bmq_idx])) \ -+ clear_bit(p->bmq_idx, rq->queue.bitmap); \ -+ p->bmq_idx = idx; \ -+ set_bit(p->bmq_idx, rq->queue.bitmap); \ -+ func; \ -+ } \ -+} -+ -+static inline bool sched_task_need_requeue(struct task_struct *p, struct rq *rq) -+{ -+ return (task_sched_prio(p, rq) != p->bmq_idx); -+} -+ -+static void sched_task_fork(struct task_struct *p, struct rq *rq) -+{ -+ p->boost_prio = (p->boost_prio < 0) ? -+ p->boost_prio + MAX_PRIORITY_ADJ : MAX_PRIORITY_ADJ; -+} -+ -+/** -+ * task_prio - return the priority value of a given task. -+ * @p: the task in question. -+ * -+ * Return: The priority value as seen by users in /proc. -+ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes -+ * from 0(SCHED_ISO) up to 82 (nice +19 SCHED_IDLE). 
-+ */ -+int task_prio(const struct task_struct *p) -+{ -+ if (p->prio < MAX_RT_PRIO) -+ return (p->prio - MAX_RT_PRIO); -+ return (p->prio - MAX_RT_PRIO + p->boost_prio); -+} -+ -+static void do_sched_yield_type_1(struct task_struct *p, struct rq *rq) -+{ -+ p->boost_prio = MAX_PRIORITY_ADJ; -+} -+ -+static void sched_task_ttwu(struct task_struct *p) -+{ -+ if(this_rq()->clock_task - p->last_ran > sched_timeslice_ns) -+ boost_task(p); -+} -+ -+static void sched_task_deactivate(struct task_struct *p, struct rq *rq) -+{ -+ if (rq_switch_time(rq) < boost_threshold(p)) -+ boost_task(p); -+} -diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c -index e39008242cf4..5963716fe391 100644 ---- a/kernel/sched/cpufreq_schedutil.c -+++ b/kernel/sched/cpufreq_schedutil.c -@@ -183,6 +183,7 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, - return cpufreq_driver_resolve_freq(policy, freq); - } - -+#ifndef CONFIG_SCHED_ALT - /* - * This function computes an effective utilization for the given CPU, to be - * used for frequency selection given the linear relation: f = u * f_max. -@@ -300,6 +301,13 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) - - return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL); - } -+#else /* CONFIG_SCHED_ALT */ -+static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) -+{ -+ sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu); -+ return sg_cpu->max; -+} -+#endif - - /** - * sugov_iowait_reset() - Reset the IO boost status of a CPU. 
-@@ -443,7 +451,9 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } - */ - static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) - { -+#ifndef CONFIG_SCHED_ALT - if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) -+#endif - sg_policy->limits_changed = true; - } - -@@ -686,6 +696,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) - } - - ret = sched_setattr_nocheck(thread, &attr); -+ - if (ret) { - kthread_stop(thread); - pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__); -@@ -912,6 +923,7 @@ struct cpufreq_governor *cpufreq_default_governor(void) - cpufreq_governor_init(schedutil_gov); - - #ifdef CONFIG_ENERGY_MODEL -+#ifndef CONFIG_SCHED_ALT - extern bool sched_energy_update; - extern struct mutex sched_energy_mutex; - -@@ -942,4 +954,10 @@ void sched_cpufreq_governor_change(struct cpufreq_policy *policy, - } - - } -+#else /* CONFIG_SCHED_ALT */ -+void sched_cpufreq_governor_change(struct cpufreq_policy *policy, -+ struct cpufreq_governor *old_gov) -+{ -+} -+#endif - #endif -diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c -index 5a55d2300452..66a0ab7165f0 100644 ---- a/kernel/sched/cputime.c -+++ b/kernel/sched/cputime.c -@@ -122,7 +122,7 @@ void account_user_time(struct task_struct *p, u64 cputime) - p->utime += cputime; - account_group_user_time(p, cputime); - -- index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; -+ index = task_running_nice(p) ? CPUTIME_NICE : CPUTIME_USER; - - /* Add user time to cpustat. */ - task_group_account_field(p, index, cputime); -@@ -146,7 +146,7 @@ void account_guest_time(struct task_struct *p, u64 cputime) - p->gtime += cputime; - - /* Add guest time to cpustat. 
*/ -- if (task_nice(p) > 0) { -+ if (task_running_nice(p)) { - cpustat[CPUTIME_NICE] += cputime; - cpustat[CPUTIME_GUEST_NICE] += cputime; - } else { -@@ -269,7 +269,7 @@ static inline u64 account_other_time(u64 max) - #ifdef CONFIG_64BIT - static inline u64 read_sum_exec_runtime(struct task_struct *t) - { -- return t->se.sum_exec_runtime; -+ return tsk_seruntime(t); - } - #else - static u64 read_sum_exec_runtime(struct task_struct *t) -@@ -279,7 +279,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t) - struct rq *rq; - - rq = task_rq_lock(t, &rf); -- ns = t->se.sum_exec_runtime; -+ ns = tsk_seruntime(t); - task_rq_unlock(rq, t, &rf); - - return ns; -@@ -614,7 +614,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, - void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) - { - struct task_cputime cputime = { -- .sum_exec_runtime = p->se.sum_exec_runtime, -+ .sum_exec_runtime = tsk_seruntime(p), - }; - - task_cputime(p, &cputime.utime, &cputime.stime); -diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c -index f324dc36fc43..a6b566bda65b 100644 ---- a/kernel/sched/idle.c -+++ b/kernel/sched/idle.c -@@ -369,6 +369,7 @@ void cpu_startup_entry(enum cpuhp_state state) - do_idle(); - } - -+#ifndef CONFIG_SCHED_ALT - /* - * idle-task scheduling class. 
- */ -@@ -482,3 +483,4 @@ const struct sched_class idle_sched_class - .switched_to = switched_to_idle, - .update_curr = update_curr_idle, - }; -+#endif -diff --git a/kernel/sched/pds.h b/kernel/sched/pds.h -new file mode 100644 -index 000000000000..7fdeace7e8a5 ---- /dev/null -+++ b/kernel/sched/pds.h -@@ -0,0 +1,14 @@ -+#ifndef PDS_H -+#define PDS_H -+ -+/* bits: -+ * RT(0-99), (Low prio adj range, nice width, high prio adj range) / 2, cpu idle task */ -+#define SCHED_BITS (MAX_RT_PRIO + 20 + 1) -+#define IDLE_TASK_SCHED_PRIO (SCHED_BITS - 1) -+ -+static inline int task_running_nice(struct task_struct *p) -+{ -+ return (p->prio > DEFAULT_PRIO); -+} -+ -+#endif -diff --git a/kernel/sched/pds_imp.h b/kernel/sched/pds_imp.h -new file mode 100644 -index 000000000000..6baee5e961b9 ---- /dev/null -+++ b/kernel/sched/pds_imp.h -@@ -0,0 +1,257 @@ -+#define ALT_SCHED_VERSION_MSG "sched/pds: PDS CPU Scheduler "ALT_SCHED_VERSION" by Alfred Chen.\n" -+ -+static const u64 user_prio2deadline[NICE_WIDTH] = { -+/* -20 */ 4194304, 4613734, 5075107, 5582617, 6140878, -+/* -15 */ 6754965, 7430461, 8173507, 8990857, 9889942, -+/* -10 */ 10878936, 11966829, 13163511, 14479862, 15927848, -+/* -5 */ 17520632, 19272695, 21199964, 23319960, 25651956, -+/* 0 */ 28217151, 31038866, 34142752, 37557027, 41312729, -+/* 5 */ 45444001, 49988401, 54987241, 60485965, 66534561, -+/* 10 */ 73188017, 80506818, 88557499, 97413248, 107154572, -+/* 15 */ 117870029, 129657031, 142622734, 156885007, 172573507 -+}; -+ -+static const unsigned char dl_level_map[] = { -+/* 0 4 8 12 */ -+ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, -+/* 16 20 24 28 */ -+ 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, -+/* 32 36 40 44 */ -+ 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, -+/* 48 52 56 60 */ -+ 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, -+/* 64 68 72 76 */ -+ 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 7, 6, 5, 4, 3, 2, -+/* 80 84 88 92 */ -+ 1, 
0 -+}; -+ -+static inline int -+task_sched_prio(const struct task_struct *p, const struct rq *rq) -+{ -+ size_t delta; -+ -+ if (p == rq->idle) -+ return IDLE_TASK_SCHED_PRIO; -+ -+ if (p->prio < MAX_RT_PRIO) -+ return p->prio; -+ -+ delta = (rq->clock + user_prio2deadline[39] - p->deadline) >> 21; -+ delta = min((size_t)delta, ARRAY_SIZE(dl_level_map) - 1); -+ -+ return MAX_RT_PRIO + dl_level_map[delta]; -+} -+ -+static inline void update_task_priodl(struct task_struct *p) -+{ -+ p->priodl = (((u64) (p->prio))<<56) | ((p->deadline)>>8); -+} -+ -+static inline void requeue_task(struct task_struct *p, struct rq *rq); -+ -+static inline void time_slice_expired(struct task_struct *p, struct rq *rq) -+{ -+ /*printk(KERN_INFO "sched: time_slice_expired(%d) - %px\n", cpu_of(rq), p);*/ -+ p->time_slice = sched_timeslice_ns; -+ -+ if (p->prio >= MAX_RT_PRIO) -+ p->deadline = rq->clock + user_prio2deadline[TASK_USER_PRIO(p)]; -+ update_task_priodl(p); -+ -+ if (SCHED_FIFO != p->policy && task_on_rq_queued(p)) -+ requeue_task(p, rq); -+} -+ -+/* -+ * pds_skiplist_task_search -- search function used in PDS run queue skip list -+ * node insert operation. -+ * @it: iterator pointer to the node in the skip list -+ * @node: pointer to the skiplist_node to be inserted -+ * -+ * Returns true if key of @it is less or equal to key value of @node, otherwise -+ * false. 
-+ */ -+static inline bool -+pds_skiplist_task_search(struct skiplist_node *it, struct skiplist_node *node) -+{ -+ return (skiplist_entry(it, struct task_struct, sl_node)->priodl <= -+ skiplist_entry(node, struct task_struct, sl_node)->priodl); -+} -+ -+/* -+ * Define the skip list insert function for PDS -+ */ -+DEFINE_SKIPLIST_INSERT_FUNC(pds_skiplist_insert, pds_skiplist_task_search); -+ -+/* -+ * Init the queue structure in rq -+ */ -+static inline void sched_queue_init(struct rq *rq) -+{ -+ FULL_INIT_SKIPLIST_NODE(&rq->sl_header); -+} -+ -+/* -+ * Init idle task and put into queue structure of rq -+ * IMPORTANT: may be called multiple times for a single cpu -+ */ -+static inline void sched_queue_init_idle(struct rq *rq, struct task_struct *idle) -+{ -+ /*printk(KERN_INFO "sched: init(%d) - %px\n", cpu_of(rq), idle);*/ -+ int default_prio = idle->prio; -+ -+ idle->prio = MAX_PRIO; -+ idle->deadline = 0ULL; -+ update_task_priodl(idle); -+ -+ FULL_INIT_SKIPLIST_NODE(&rq->sl_header); -+ -+ idle->sl_node.level = idle->sl_level; -+ pds_skiplist_insert(&rq->sl_header, &idle->sl_node); -+ -+ idle->prio = default_prio; -+} -+ -+/* -+ * This routine assume that the idle task always in queue -+ */ -+static inline struct task_struct *sched_rq_first_task(struct rq *rq) -+{ -+ struct skiplist_node *node = rq->sl_header.next[0]; -+ -+ BUG_ON(node == &rq->sl_header); -+ return skiplist_entry(node, struct task_struct, sl_node); -+} -+ -+static inline struct task_struct * -+sched_rq_next_task(struct task_struct *p, struct rq *rq) -+{ -+ struct skiplist_node *next = p->sl_node.next[0]; -+ -+ BUG_ON(next == &rq->sl_header); -+ return skiplist_entry(next, struct task_struct, sl_node); -+} -+ -+static inline unsigned long sched_queue_watermark(struct rq *rq) -+{ -+ return task_sched_prio(sched_rq_first_task(rq), rq); -+} -+ -+#define __SCHED_DEQUEUE_TASK(p, rq, flags, func) \ -+ psi_dequeue(p, flags & DEQUEUE_SLEEP); \ -+ sched_info_dequeued(rq, p); \ -+ \ -+ if 
(skiplist_del_init(&rq->sl_header, &p->sl_node)) { \ -+ func; \ -+ } -+ -+#define __SCHED_ENQUEUE_TASK(p, rq, flags) \ -+ sched_info_queued(rq, p); \ -+ psi_enqueue(p, flags); \ -+ \ -+ p->sl_node.level = p->sl_level; \ -+ pds_skiplist_insert(&rq->sl_header, &p->sl_node) -+ -+/* -+ * Requeue a task @p to @rq -+ */ -+#define __SCHED_REQUEUE_TASK(p, rq, func) \ -+{\ -+ bool b_first = skiplist_del_init(&rq->sl_header, &p->sl_node); \ -+\ -+ p->sl_node.level = p->sl_level; \ -+ if (pds_skiplist_insert(&rq->sl_header, &p->sl_node) || b_first) { \ -+ func; \ -+ } \ -+} -+ -+static inline bool sched_task_need_requeue(struct task_struct *p, struct rq *rq) -+{ -+ struct skiplist_node *node = p->sl_node.prev[0]; -+ -+ if (node != &rq->sl_header) { -+ struct task_struct *t = skiplist_entry(node, struct task_struct, sl_node); -+ -+ if (t->priodl > p->priodl) -+ return true; -+ } -+ -+ node = p->sl_node.next[0]; -+ if (node != &rq->sl_header) { -+ struct task_struct *t = skiplist_entry(node, struct task_struct, sl_node); -+ -+ if (t->priodl < p->priodl) -+ return true; -+ } -+ -+ return false; -+} -+ -+/* -+ * pds_skiplist_random_level -- Returns a pseudo-random level number for skip -+ * list node which is used in PDS run queue. -+ * -+ * In current implementation, based on testing, the first 8 bits in microseconds -+ * of niffies are suitable for random level population. -+ * find_first_bit() is used to satisfy p = 0.5 between each levels, and there -+ * should be platform hardware supported instruction(known as ctz/clz) to speed -+ * up this function. -+ * The skiplist level for a task is populated when task is created and doesn't -+ * change in task's life time. When task is being inserted into run queue, this -+ * skiplist level is set to task's sl_node->level, the skiplist insert function -+ * may change it based on current level of the skip lsit. 
-+ */ -+static inline int pds_skiplist_random_level(const struct task_struct *p) -+{ -+ long unsigned int randseed; -+ -+ /* -+ * 1. Some architectures don't have better than microsecond resolution -+ * so mask out ~microseconds as a factor of the random seed for skiplist -+ * insertion. -+ * 2. Use address of task structure pointer as another factor of the -+ * random seed for task burst forking scenario. -+ */ -+ randseed = (task_rq(p)->clock ^ (long unsigned int)p) >> 10; -+ -+ return find_first_bit(&randseed, NUM_SKIPLIST_LEVEL - 1); -+} -+ -+static void sched_task_fork(struct task_struct *p, struct rq *rq) -+{ -+ p->sl_level = pds_skiplist_random_level(p); -+ if (p->prio >= MAX_RT_PRIO) -+ p->deadline = rq->clock + user_prio2deadline[TASK_USER_PRIO(p)]; -+ update_task_priodl(p); -+} -+ -+/** -+ * task_prio - return the priority value of a given task. -+ * @p: the task in question. -+ * -+ * Return: The priority value as seen by users in /proc. -+ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes -+ * from 0(SCHED_ISO) up to 82 (nice +19 SCHED_IDLE). 
-+ */ -+int task_prio(const struct task_struct *p) -+{ -+ int ret; -+ -+ if (p->prio < MAX_RT_PRIO) -+ return (p->prio - MAX_RT_PRIO); -+ -+ preempt_disable(); -+ ret = task_sched_prio(p, this_rq()) - MAX_RT_PRIO; -+ preempt_enable(); -+ -+ return ret; -+} -+ -+static void do_sched_yield_type_1(struct task_struct *p, struct rq *rq) -+{ -+ time_slice_expired(p, rq); -+} -+ -+static void sched_task_ttwu(struct task_struct *p) {} -+static void sched_task_deactivate(struct task_struct *p, struct rq *rq) {} -diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c -index 2c613e1cff3a..0103b2a7201d 100644 ---- a/kernel/sched/pelt.c -+++ b/kernel/sched/pelt.c -@@ -270,6 +270,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load) - WRITE_ONCE(sa->util_avg, sa->util_sum / divider); - } - -+#ifndef CONFIG_SCHED_ALT - /* - * sched_entity: - * -@@ -387,8 +388,9 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) - - return 0; - } -+#endif - --#ifdef CONFIG_SCHED_THERMAL_PRESSURE -+#if defined(CONFIG_SCHED_THERMAL_PRESSURE) && !defined(CONFIG_SCHED_ALT) - /* - * thermal: - * -diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h -index 795e43e02afc..856163dac896 100644 ---- a/kernel/sched/pelt.h -+++ b/kernel/sched/pelt.h -@@ -1,13 +1,15 @@ - #ifdef CONFIG_SMP - #include "sched-pelt.h" - -+#ifndef CONFIG_SCHED_ALT - int __update_load_avg_blocked_se(u64 now, struct sched_entity *se); - int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se); - int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); - int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); - int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); -+#endif - --#ifdef CONFIG_SCHED_THERMAL_PRESSURE -+#if defined(CONFIG_SCHED_THERMAL_PRESSURE) && !defined(CONFIG_SCHED_ALT) - int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity); - - static inline u64 thermal_load_avg(struct rq *rq) -@@ -42,6 +44,7 @@ static inline u32 
get_pelt_divider(struct sched_avg *avg) - return LOAD_AVG_MAX - 1024 + avg->period_contrib; - } - -+#ifndef CONFIG_SCHED_ALT - /* - * When a task is dequeued, its estimated utilization should not be update if - * its util_avg has not been updated at least once. -@@ -162,9 +165,11 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) - return rq_clock_pelt(rq_of(cfs_rq)); - } - #endif -+#endif /* CONFIG_SCHED_ALT */ - - #else - -+#ifndef CONFIG_SCHED_ALT - static inline int - update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) - { -@@ -182,6 +187,7 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running) - { - return 0; - } -+#endif - - static inline int - update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index 28709f6b0975..6bc68bacbac8 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -2,6 +2,10 @@ - /* - * Scheduler internal types and methods: - */ -+#ifdef CONFIG_SCHED_ALT -+#include "alt_sched.h" -+#else -+ - #include - - #include -@@ -2626,3 +2630,9 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) - - void swake_up_all_locked(struct swait_queue_head *q); - void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); -+ -+static inline int task_running_nice(struct task_struct *p) -+{ -+ return (task_nice(p) > 0); -+} -+#endif /* !CONFIG_SCHED_ALT */ -diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c -index 750fb3c67eed..108422ebc7bf 100644 ---- a/kernel/sched/stats.c -+++ b/kernel/sched/stats.c -@@ -22,8 +22,10 @@ static int show_schedstat(struct seq_file *seq, void *v) - } else { - struct rq *rq; - #ifdef CONFIG_SMP -+#ifndef CONFIG_SCHED_ALT - struct sched_domain *sd; - int dcount = 0; -+#endif - #endif - cpu = (unsigned long)(v - 2); - rq = cpu_rq(cpu); -@@ -40,6 +42,7 @@ static int show_schedstat(struct seq_file *seq, void *v) - seq_printf(seq, "\n"); - - #ifdef CONFIG_SMP -+#ifndef CONFIG_SCHED_ALT - /* 
domain-specific stats */ - rcu_read_lock(); - for_each_domain(cpu, sd) { -@@ -68,6 +71,7 @@ static int show_schedstat(struct seq_file *seq, void *v) - sd->ttwu_move_balance); - } - rcu_read_unlock(); -+#endif - #endif - } - return 0; -diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c -index 1bd7e3af904f..cc946a9bd550 100644 ---- a/kernel/sched/topology.c -+++ b/kernel/sched/topology.c -@@ -4,6 +4,7 @@ - */ - #include "sched.h" - -+#ifndef CONFIG_SCHED_ALT - DEFINE_MUTEX(sched_domains_mutex); - - /* Protected by sched_domains_mutex: */ -@@ -1180,8 +1181,10 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) - */ - - static int default_relax_domain_level = -1; -+#endif /* CONFIG_SCHED_ALT */ - int sched_domain_level_max; - -+#ifndef CONFIG_SCHED_ALT - static int __init setup_relax_domain_level(char *str) - { - if (kstrtoint(str, 0, &default_relax_domain_level)) -@@ -1413,6 +1416,7 @@ sd_init(struct sched_domain_topology_level *tl, - - return sd; - } -+#endif /* CONFIG_SCHED_ALT */ - - /* - * Topology list, bottom-up. 
-@@ -1442,6 +1446,7 @@ void set_sched_topology(struct sched_domain_topology_level *tl) - sched_domain_topology = tl; - } - -+#ifndef CONFIG_SCHED_ALT - #ifdef CONFIG_NUMA - - static const struct cpumask *sd_numa_mask(int cpu) -@@ -2316,3 +2321,17 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], - partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); - mutex_unlock(&sched_domains_mutex); - } -+#else /* CONFIG_SCHED_ALT */ -+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], -+ struct sched_domain_attr *dattr_new) -+{} -+ -+#ifdef CONFIG_NUMA -+int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; -+ -+int sched_numa_find_closest(const struct cpumask *cpus, int cpu) -+{ -+ return best_mask_cpu(cpu, cpus); -+} -+#endif /* CONFIG_NUMA */ -+#endif -diff --git a/kernel/sysctl.c b/kernel/sysctl.c -index afad085960b8..e91b4cb3042b 100644 ---- a/kernel/sysctl.c -+++ b/kernel/sysctl.c -@@ -120,6 +120,10 @@ static unsigned long long_max = LONG_MAX; - static int one_hundred = 100; - static int two_hundred = 200; - static int one_thousand = 1000; -+#ifdef CONFIG_SCHED_ALT -+static int __maybe_unused zero = 0; -+extern int sched_yield_type; -+#endif - #ifdef CONFIG_PRINTK - static int ten_thousand = 10000; - #endif -@@ -184,7 +188,7 @@ static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT; - int sysctl_legacy_va_layout; - #endif - --#ifdef CONFIG_SCHED_DEBUG -+#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_ALT) - static int min_sched_granularity_ns = 100000; /* 100 usecs */ - static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ - static int min_wakeup_granularity_ns; /* 0 usecs */ -@@ -1652,6 +1656,7 @@ int proc_do_static_key(struct ctl_table *table, int write, - } - - static struct ctl_table kern_table[] = { -+#ifndef CONFIG_SCHED_ALT - { - .procname = "sched_child_runs_first", - .data = &sysctl_sched_child_runs_first, -@@ -1854,6 +1859,7 @@ static struct ctl_table 
kern_table[] = { - .extra2 = SYSCTL_ONE, - }, - #endif -+#endif /* !CONFIG_SCHED_ALT */ - #ifdef CONFIG_PROVE_LOCKING - { - .procname = "prove_locking", -@@ -2430,6 +2436,17 @@ static struct ctl_table kern_table[] = { - .proc_handler = proc_dointvec, - }, - #endif -+#ifdef CONFIG_SCHED_ALT -+ { -+ .procname = "yield_type", -+ .data = &sched_yield_type, -+ .maxlen = sizeof (int), -+ .mode = 0644, -+ .proc_handler = &proc_dointvec_minmax, -+ .extra1 = &zero, -+ .extra2 = &two, -+ }, -+#endif - #if defined(CONFIG_S390) && defined(CONFIG_SMP) - { - .procname = "spin_retry", -diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c -index 95b6a708b040..81f2ee62c807 100644 ---- a/kernel/time/hrtimer.c -+++ b/kernel/time/hrtimer.c -@@ -1927,8 +1927,10 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, - int ret = 0; - u64 slack; - -+#ifndef CONFIG_SCHED_ALT - slack = current->timer_slack_ns; - if (dl_task(current) || rt_task(current)) -+#endif - slack = 0; - - hrtimer_init_sleeper_on_stack(&t, clockid, mode); -diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c -index a71758e34e45..d20c347df861 100644 ---- a/kernel/time/posix-cpu-timers.c -+++ b/kernel/time/posix-cpu-timers.c -@@ -216,7 +216,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples) - u64 stime, utime; - - task_cputime(p, &utime, &stime); -- store_samples(samples, stime, utime, p->se.sum_exec_runtime); -+ store_samples(samples, stime, utime, tsk_seruntime(p)); - } - - static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, -@@ -801,6 +801,7 @@ static void collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples, - } - } - -+#ifndef CONFIG_SCHED_ALT - static inline void check_dl_overrun(struct task_struct *tsk) - { - if (tsk->dl.dl_overrun) { -@@ -808,6 +809,7 @@ static inline void check_dl_overrun(struct task_struct *tsk) - __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); - } - } -+#endif - - static bool 
check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard) - { -@@ -835,8 +837,10 @@ static void check_thread_timers(struct task_struct *tsk, - u64 samples[CPUCLOCK_MAX]; - unsigned long soft; - -+#ifndef CONFIG_SCHED_ALT - if (dl_task(tsk)) - check_dl_overrun(tsk); -+#endif - - if (expiry_cache_is_inactive(pct)) - return; -@@ -850,7 +854,7 @@ static void check_thread_timers(struct task_struct *tsk, - soft = task_rlimit(tsk, RLIMIT_RTTIME); - if (soft != RLIM_INFINITY) { - /* Task RT timeout is accounted in jiffies. RTTIME is usec */ -- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); -+ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ); - unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); - - /* At the hard limit, send SIGKILL. No further action. */ -@@ -1086,8 +1090,10 @@ static inline bool fastpath_timer_check(struct task_struct *tsk) - return true; - } - -+#ifndef CONFIG_SCHED_ALT - if (dl_task(tsk) && tsk->dl.dl_overrun) - return true; -+#endif - - return false; - } -diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c -index b5e3496cf803..65f60c77bc50 100644 ---- a/kernel/trace/trace_selftest.c -+++ b/kernel/trace/trace_selftest.c -@@ -1048,10 +1048,15 @@ static int trace_wakeup_test_thread(void *data) - { - /* Make this a -deadline thread */ - static const struct sched_attr attr = { -+#ifdef CONFIG_SCHED_ALT -+ /* No deadline on BMQ/PDS, use RR */ -+ .sched_policy = SCHED_RR, -+#else - .sched_policy = SCHED_DEADLINE, - .sched_runtime = 100000ULL, - .sched_deadline = 10000000ULL, - .sched_period = 10000000ULL -+#endif - }; - struct wakeup_test_data *x = data; - -diff --git a/kernel/sched/alt_core.c b/kernel/sched/alt_core.c -index f36264fea75c6ca7c34eaa259c0bff829cbf6ac0..d43ca62fd00fe442bda9b4ad548fae432a7436de 100644 ---- a/kernel/sched/alt_core.c -+++ b/kernel/sched/alt_core.c -@@ -11,6 +11,10 @@ - * scheduler by Alfred Chen. - * 2019-02-20 BMQ(BitMap Queue) kernel scheduler by Alfred Chen. 
- */ -+#define CREATE_TRACE_POINTS -+#include -+#undef CREATE_TRACE_POINTS -+ - #include "sched.h" - - #include -@@ -42,8 +46,11 @@ - #include "pelt.h" - #include "smp.h" - --#define CREATE_TRACE_POINTS --#include -+/* -+ * Export tracepoints that act as a bare tracehook (ie: have no trace event -+ * associated with them) to allow external modules to probe them. -+ */ -+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp); - - #define ALT_SCHED_VERSION "v5.9-r0" - -diff --git a/kernel/sched/alt_sched.h b/kernel/sched/alt_sched.h -index 99be2c51c88d0406cced20b36d7230da12930a5c..03f8b8b1aa27eeb15989af25b4050c767da12aad 100644 ---- a/kernel/sched/alt_sched.h -+++ b/kernel/sched/alt_sched.h -@@ -46,6 +46,8 @@ - - #include "cpupri.h" - -+#include -+ - #ifdef CONFIG_SCHED_BMQ - #include "bmq.h" - #endif -@@ -496,6 +498,8 @@ static inline int sched_tick_offload_init(void) { return 0; } - - extern void schedule_idle(void); - -+#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) -+ - /* - * !! For sched_setattr_nocheck() (kernel) only !! 
- * diff --git a/linux59-tkg/linux59-tkg-patches/0011-ZFS-fix.patch b/linux59-tkg/linux59-tkg-patches/0011-ZFS-fix.patch deleted file mode 100644 index af71d04..0000000 --- a/linux59-tkg/linux59-tkg-patches/0011-ZFS-fix.patch +++ /dev/null @@ -1,43 +0,0 @@ -From 1e010beda2896bdf3082fb37a3e49f8ce20e04d8 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= -Date: Thu, 2 May 2019 05:28:08 +0100 -Subject: [PATCH] x86/fpu: Export kernel_fpu_{begin,end}() with - EXPORT_SYMBOL_GPL -MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -We need these symbols in zfs as the fpu implementation breaks userspace: - -https://github.com/zfsonlinux/zfs/issues/9346 -Signed-off-by: Jörg Thalheim ---- - arch/x86/kernel/fpu/core.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c -index 12c70840980e..352538b3bb5d 100644 ---- a/arch/x86/kernel/fpu/core.c -+++ b/arch/x86/kernel/fpu/core.c -@@ -102,7 +102,7 @@ void kernel_fpu_begin(void) - } - __cpu_invalidate_fpregs_state(); - } --EXPORT_SYMBOL_GPL(kernel_fpu_begin); -+EXPORT_SYMBOL(kernel_fpu_begin); - - void kernel_fpu_end(void) - { -@@ -111,7 +111,7 @@ void kernel_fpu_end(void) - this_cpu_write(in_kernel_fpu, false); - preempt_enable(); - } --EXPORT_SYMBOL_GPL(kernel_fpu_end); -+EXPORT_SYMBOL(kernel_fpu_end); - - /* - * Save the FPU state (mark it for reload if necessary): --- -2.23.0 - - diff --git a/linux59-tkg/linux59-tkg-patches/0012-misc-additions.patch b/linux59-tkg/linux59-tkg-patches/0012-misc-additions.patch deleted file mode 100644 index a4efaef..0000000 --- a/linux59-tkg/linux59-tkg-patches/0012-misc-additions.patch +++ /dev/null @@ -1,54 +0,0 @@ -diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig -index 0840d27381ea..73aba9a31064 100644 ---- a/drivers/tty/Kconfig -+++ b/drivers/tty/Kconfig -@@ -75,6 +75,19 @@ config VT_CONSOLE_SLEEP - def_bool y - depends on VT_CONSOLE && PM_SLEEP - 
-+config NR_TTY_DEVICES -+ int "Maximum tty device number" -+ depends on VT -+ range 12 63 -+ default 63 -+ help -+ This option is used to change the number of tty devices in /dev. -+ The default value is 63. The lowest number you can set is 12, -+ 63 is also the upper limit so we don't overrun the serial -+ consoles. -+ -+ If unsure, say 63. -+ - config HW_CONSOLE - bool - depends on VT && !UML -diff --git a/include/uapi/linux/vt.h b/include/uapi/linux/vt.h -index e9d39c48520a..3bceead8da40 100644 ---- a/include/uapi/linux/vt.h -+++ b/include/uapi/linux/vt.h -@@ -3,12 +3,25 @@ - #define _UAPI_LINUX_VT_H - - -+/* -+ * We will make this definition solely for the purpose of making packages -+ * such as splashutils build, because they can not understand that -+ * NR_TTY_DEVICES is defined in the kernel configuration. -+ */ -+#ifndef CONFIG_NR_TTY_DEVICES -+#define CONFIG_NR_TTY_DEVICES 63 -+#endif -+ - /* - * These constants are also useful for user-level apps (e.g., VC - * resizing). - */ - #define MIN_NR_CONSOLES 1 /* must be at least 1 */ --#define MAX_NR_CONSOLES 63 /* serial lines start at 64 */ -+/* -+ * NR_TTY_DEVICES: -+ * Value MUST be at least 12 and must never be higher then 63 -+ */ -+#define MAX_NR_CONSOLES CONFIG_NR_TTY_DEVICES /* serial lines start above this */ - /* Note: the ioctl VT_GETSTATE does not work for - consoles 16 and higher (since it returns a short) */ \ No newline at end of file